Diffstat (limited to 'sys/contrib/openzfs/module')
-rw-r--r--sys/contrib/openzfs/module/.gitignore26
-rw-r--r--sys/contrib/openzfs/module/Kbuild.in47
-rw-r--r--sys/contrib/openzfs/module/Makefile.bsd368
-rw-r--r--sys/contrib/openzfs/module/Makefile.in135
-rw-r--r--sys/contrib/openzfs/module/avl/Makefile.in10
-rw-r--r--sys/contrib/openzfs/module/avl/avl.c1093
-rw-r--r--sys/contrib/openzfs/module/icp/Makefile.in96
-rw-r--r--sys/contrib/openzfs/module/icp/algs/aes/aes_impl.c443
-rw-r--r--sys/contrib/openzfs/module/icp/algs/aes/aes_impl_aesni.c124
-rw-r--r--sys/contrib/openzfs/module/icp/algs/aes/aes_impl_generic.c1242
-rw-r--r--sys/contrib/openzfs/module/icp/algs/aes/aes_impl_x86-64.c63
-rw-r--r--sys/contrib/openzfs/module/icp/algs/aes/aes_modes.c135
-rw-r--r--sys/contrib/openzfs/module/icp/algs/edonr/edonr.c746
-rw-r--r--sys/contrib/openzfs/module/icp/algs/edonr/edonr_byteorder.h216
-rw-r--r--sys/contrib/openzfs/module/icp/algs/modes/cbc.c273
-rw-r--r--sys/contrib/openzfs/module/icp/algs/modes/ccm.c907
-rw-r--r--sys/contrib/openzfs/module/icp/algs/modes/ctr.c228
-rw-r--r--sys/contrib/openzfs/module/icp/algs/modes/ecb.c128
-rw-r--r--sys/contrib/openzfs/module/icp/algs/modes/gcm.c1587
-rw-r--r--sys/contrib/openzfs/module/icp/algs/modes/gcm_generic.c83
-rw-r--r--sys/contrib/openzfs/module/icp/algs/modes/gcm_pclmulqdq.c64
-rw-r--r--sys/contrib/openzfs/module/icp/algs/modes/modes.c165
-rw-r--r--sys/contrib/openzfs/module/icp/algs/sha1/sha1.c835
-rw-r--r--sys/contrib/openzfs/module/icp/algs/sha2/sha2.c956
-rw-r--r--sys/contrib/openzfs/module/icp/algs/skein/THIRDPARTYLICENSE3
-rw-r--r--sys/contrib/openzfs/module/icp/algs/skein/THIRDPARTYLICENSE.descrip1
-rw-r--r--sys/contrib/openzfs/module/icp/algs/skein/skein.c911
-rw-r--r--sys/contrib/openzfs/module/icp/algs/skein/skein_block.c790
-rw-r--r--sys/contrib/openzfs/module/icp/algs/skein/skein_impl.h292
-rw-r--r--sys/contrib/openzfs/module/icp/algs/skein/skein_iv.c185
-rw-r--r--sys/contrib/openzfs/module/icp/algs/skein/skein_port.h116
-rw-r--r--sys/contrib/openzfs/module/icp/api/kcf_cipher.c930
-rw-r--r--sys/contrib/openzfs/module/icp/api/kcf_ctxops.c151
-rw-r--r--sys/contrib/openzfs/module/icp/api/kcf_digest.c491
-rw-r--r--sys/contrib/openzfs/module/icp/api/kcf_mac.c645
-rw-r--r--sys/contrib/openzfs/module/icp/api/kcf_miscapi.c127
-rw-r--r--sys/contrib/openzfs/module/icp/asm-x86_64/aes/THIRDPARTYLICENSE.gladman23
-rw-r--r--sys/contrib/openzfs/module/icp/asm-x86_64/aes/THIRDPARTYLICENSE.gladman.descrip1
-rw-r--r--sys/contrib/openzfs/module/icp/asm-x86_64/aes/THIRDPARTYLICENSE.openssl127
-rw-r--r--sys/contrib/openzfs/module/icp/asm-x86_64/aes/THIRDPARTYLICENSE.openssl.descrip1
-rw-r--r--sys/contrib/openzfs/module/icp/asm-x86_64/aes/aes_aesni.S748
-rw-r--r--sys/contrib/openzfs/module/icp/asm-x86_64/aes/aes_amd64.S906
-rw-r--r--sys/contrib/openzfs/module/icp/asm-x86_64/aes/aeskey.c580
-rw-r--r--sys/contrib/openzfs/module/icp/asm-x86_64/aes/aesopt.h770
-rw-r--r--sys/contrib/openzfs/module/icp/asm-x86_64/aes/aestab.h165
-rw-r--r--sys/contrib/openzfs/module/icp/asm-x86_64/aes/aestab2.h594
-rw-r--r--sys/contrib/openzfs/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.cryptogams36
-rw-r--r--sys/contrib/openzfs/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.cryptogams.descrip1
-rw-r--r--sys/contrib/openzfs/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.openssl177
-rw-r--r--sys/contrib/openzfs/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.openssl.descrip1
-rw-r--r--sys/contrib/openzfs/module/icp/asm-x86_64/modes/aesni-gcm-x86_64.S1261
-rw-r--r--sys/contrib/openzfs/module/icp/asm-x86_64/modes/gcm_pclmulqdq.S254
-rw-r--r--sys/contrib/openzfs/module/icp/asm-x86_64/modes/ghash-x86_64.S714
-rw-r--r--sys/contrib/openzfs/module/icp/asm-x86_64/sha1/sha1-x86_64.S1353
-rw-r--r--sys/contrib/openzfs/module/icp/asm-x86_64/sha2/sha256_impl.S2063
-rw-r--r--sys/contrib/openzfs/module/icp/asm-x86_64/sha2/sha512_impl.S2088
-rw-r--r--sys/contrib/openzfs/module/icp/core/kcf_callprov.c1567
-rw-r--r--sys/contrib/openzfs/module/icp/core/kcf_mech_tabs.c791
-rw-r--r--sys/contrib/openzfs/module/icp/core/kcf_prov_lib.c227
-rw-r--r--sys/contrib/openzfs/module/icp/core/kcf_prov_tabs.c645
-rw-r--r--sys/contrib/openzfs/module/icp/core/kcf_sched.c1780
-rw-r--r--sys/contrib/openzfs/module/icp/illumos-crypto.c158
-rw-r--r--sys/contrib/openzfs/module/icp/include/aes/aes_impl.h227
-rw-r--r--sys/contrib/openzfs/module/icp/include/modes/gcm_impl.h75
-rw-r--r--sys/contrib/openzfs/module/icp/include/modes/modes.h411
-rw-r--r--sys/contrib/openzfs/module/icp/include/sha1/sha1.h61
-rw-r--r--sys/contrib/openzfs/module/icp/include/sha1/sha1_consts.h65
-rw-r--r--sys/contrib/openzfs/module/icp/include/sha1/sha1_impl.h73
-rw-r--r--sys/contrib/openzfs/module/icp/include/sha2/sha2_consts.h219
-rw-r--r--sys/contrib/openzfs/module/icp/include/sha2/sha2_impl.h64
-rw-r--r--sys/contrib/openzfs/module/icp/include/sys/asm_linkage.h46
-rw-r--r--sys/contrib/openzfs/module/icp/include/sys/bitmap.h183
-rw-r--r--sys/contrib/openzfs/module/icp/include/sys/crypto/elfsign.h137
-rw-r--r--sys/contrib/openzfs/module/icp/include/sys/crypto/impl.h1363
-rw-r--r--sys/contrib/openzfs/module/icp/include/sys/crypto/ioctl.h1480
-rw-r--r--sys/contrib/openzfs/module/icp/include/sys/crypto/ioctladmin.h136
-rw-r--r--sys/contrib/openzfs/module/icp/include/sys/crypto/ops_impl.h630
-rw-r--r--sys/contrib/openzfs/module/icp/include/sys/crypto/sched_impl.h531
-rw-r--r--sys/contrib/openzfs/module/icp/include/sys/crypto/spi.h726
-rw-r--r--sys/contrib/openzfs/module/icp/include/sys/ia32/asm_linkage.h307
-rw-r--r--sys/contrib/openzfs/module/icp/include/sys/ia32/stack.h160
-rw-r--r--sys/contrib/openzfs/module/icp/include/sys/ia32/trap.h107
-rw-r--r--sys/contrib/openzfs/module/icp/include/sys/modctl.h477
-rw-r--r--sys/contrib/openzfs/module/icp/include/sys/modhash.h147
-rw-r--r--sys/contrib/openzfs/module/icp/include/sys/modhash_impl.h108
-rw-r--r--sys/contrib/openzfs/module/icp/include/sys/stack.h36
-rw-r--r--sys/contrib/openzfs/module/icp/include/sys/trap.h36
-rw-r--r--sys/contrib/openzfs/module/icp/io/aes.c1457
-rw-r--r--sys/contrib/openzfs/module/icp/io/edonr_mod.c63
-rw-r--r--sys/contrib/openzfs/module/icp/io/sha1_mod.c1230
-rw-r--r--sys/contrib/openzfs/module/icp/io/sha2_mod.c1399
-rw-r--r--sys/contrib/openzfs/module/icp/io/skein_mod.c729
-rw-r--r--sys/contrib/openzfs/module/icp/os/modconf.c173
-rw-r--r--sys/contrib/openzfs/module/icp/os/modhash.c927
-rw-r--r--sys/contrib/openzfs/module/icp/spi/kcf_spi.c925
-rw-r--r--sys/contrib/openzfs/module/lua/Makefile.in39
-rw-r--r--sys/contrib/openzfs/module/lua/README.zfs80
-rw-r--r--sys/contrib/openzfs/module/lua/lapi.c1345
-rw-r--r--sys/contrib/openzfs/module/lua/lapi.h26
-rw-r--r--sys/contrib/openzfs/module/lua/lauxlib.c800
-rw-r--r--sys/contrib/openzfs/module/lua/lbaselib.c296
-rw-r--r--sys/contrib/openzfs/module/lua/lcode.c884
-rw-r--r--sys/contrib/openzfs/module/lua/lcode.h85
-rw-r--r--sys/contrib/openzfs/module/lua/lcompat.c101
-rw-r--r--sys/contrib/openzfs/module/lua/lcorolib.c159
-rw-r--r--sys/contrib/openzfs/module/lua/lctype.c52
-rw-r--r--sys/contrib/openzfs/module/lua/lctype.h94
-rw-r--r--sys/contrib/openzfs/module/lua/ldebug.c608
-rw-r--r--sys/contrib/openzfs/module/lua/ldebug.h36
-rw-r--r--sys/contrib/openzfs/module/lua/ldo.c749
-rw-r--r--sys/contrib/openzfs/module/lua/ldo.h47
-rw-r--r--sys/contrib/openzfs/module/lua/lfunc.c160
-rw-r--r--sys/contrib/openzfs/module/lua/lfunc.h35
-rw-r--r--sys/contrib/openzfs/module/lua/lgc.c1218
-rw-r--r--sys/contrib/openzfs/module/lua/lgc.h159
-rw-r--r--sys/contrib/openzfs/module/lua/llex.c531
-rw-r--r--sys/contrib/openzfs/module/lua/llex.h83
-rw-r--r--sys/contrib/openzfs/module/lua/llimits.h314
-rw-r--r--sys/contrib/openzfs/module/lua/lmem.c98
-rw-r--r--sys/contrib/openzfs/module/lua/lmem.h56
-rw-r--r--sys/contrib/openzfs/module/lua/lobject.c282
-rw-r--r--sys/contrib/openzfs/module/lua/lobject.h605
-rw-r--r--sys/contrib/openzfs/module/lua/lopcodes.c108
-rw-r--r--sys/contrib/openzfs/module/lua/lopcodes.h290
-rw-r--r--sys/contrib/openzfs/module/lua/lparser.c1643
-rw-r--r--sys/contrib/openzfs/module/lua/lparser.h121
-rw-r--r--sys/contrib/openzfs/module/lua/lstate.c320
-rw-r--r--sys/contrib/openzfs/module/lua/lstate.h230
-rw-r--r--sys/contrib/openzfs/module/lua/lstring.c186
-rw-r--r--sys/contrib/openzfs/module/lua/lstring.h48
-rw-r--r--sys/contrib/openzfs/module/lua/lstrlib.c1040
-rw-r--r--sys/contrib/openzfs/module/lua/ltable.c592
-rw-r--r--sys/contrib/openzfs/module/lua/ltable.h47
-rw-r--r--sys/contrib/openzfs/module/lua/ltablib.c289
-rw-r--r--sys/contrib/openzfs/module/lua/ltm.c76
-rw-r--r--sys/contrib/openzfs/module/lua/ltm.h59
-rw-r--r--sys/contrib/openzfs/module/lua/lvm.c932
-rw-r--r--sys/contrib/openzfs/module/lua/lvm.h46
-rw-r--r--sys/contrib/openzfs/module/lua/lzio.c74
-rw-r--r--sys/contrib/openzfs/module/lua/lzio.h67
-rw-r--r--sys/contrib/openzfs/module/lua/setjmp/setjmp.S19
-rw-r--r--sys/contrib/openzfs/module/lua/setjmp/setjmp_aarch64.S86
-rw-r--r--sys/contrib/openzfs/module/lua/setjmp/setjmp_arm.S84
-rw-r--r--sys/contrib/openzfs/module/lua/setjmp/setjmp_i386.S69
-rw-r--r--sys/contrib/openzfs/module/lua/setjmp/setjmp_mips.S105
-rw-r--r--sys/contrib/openzfs/module/lua/setjmp/setjmp_ppc.S165
-rw-r--r--sys/contrib/openzfs/module/lua/setjmp/setjmp_rv64g.S91
-rw-r--r--sys/contrib/openzfs/module/lua/setjmp/setjmp_s390x.S64
-rw-r--r--sys/contrib/openzfs/module/lua/setjmp/setjmp_sparc64.S105
-rw-r--r--sys/contrib/openzfs/module/lua/setjmp/setjmp_x86_64.S77
-rw-r--r--sys/contrib/openzfs/module/nvpair/Makefile.in13
-rw-r--r--sys/contrib/openzfs/module/nvpair/fnvpair.c660
-rw-r--r--sys/contrib/openzfs/module/nvpair/nvpair.c3738
-rw-r--r--sys/contrib/openzfs/module/nvpair/nvpair_alloc_fixed.c115
-rw-r--r--sys/contrib/openzfs/module/nvpair/nvpair_alloc_spl.c96
-rw-r--r--sys/contrib/openzfs/module/os/freebsd/spl/acl_common.c1709
-rw-r--r--sys/contrib/openzfs/module/os/freebsd/spl/callb.c373
-rw-r--r--sys/contrib/openzfs/module/os/freebsd/spl/list.c244
-rw-r--r--sys/contrib/openzfs/module/os/freebsd/spl/sha224.h96
-rw-r--r--sys/contrib/openzfs/module/os/freebsd/spl/sha256.h99
-rw-r--r--sys/contrib/openzfs/module/os/freebsd/spl/sha256c.c378
-rw-r--r--sys/contrib/openzfs/module/os/freebsd/spl/sha384.h96
-rw-r--r--sys/contrib/openzfs/module/os/freebsd/spl/sha512.h101
-rw-r--r--sys/contrib/openzfs/module/os/freebsd/spl/sha512c.c508
-rw-r--r--sys/contrib/openzfs/module/os/freebsd/spl/sha512t.h143
-rw-r--r--sys/contrib/openzfs/module/os/freebsd/spl/spl_acl.c222
-rw-r--r--sys/contrib/openzfs/module/os/freebsd/spl/spl_atomic.c123
-rw-r--r--sys/contrib/openzfs/module/os/freebsd/spl/spl_cmn_err.c77
-rw-r--r--sys/contrib/openzfs/module/os/freebsd/spl/spl_dtrace.c38
-rw-r--r--sys/contrib/openzfs/module/os/freebsd/spl/spl_kmem.c352
-rw-r--r--sys/contrib/openzfs/module/os/freebsd/spl/spl_kstat.c575
-rw-r--r--sys/contrib/openzfs/module/os/freebsd/spl/spl_misc.c113
-rw-r--r--sys/contrib/openzfs/module/os/freebsd/spl/spl_policy.c438
-rw-r--r--sys/contrib/openzfs/module/os/freebsd/spl/spl_procfs_list.c161
-rw-r--r--sys/contrib/openzfs/module/os/freebsd/spl/spl_string.c107
-rw-r--r--sys/contrib/openzfs/module/os/freebsd/spl/spl_sunddi.c75
-rw-r--r--sys/contrib/openzfs/module/os/freebsd/spl/spl_sysevent.c262
-rw-r--r--sys/contrib/openzfs/module/os/freebsd/spl/spl_taskq.c444
-rw-r--r--sys/contrib/openzfs/module/os/freebsd/spl/spl_uio.c100
-rw-r--r--sys/contrib/openzfs/module/os/freebsd/spl/spl_vfs.c287
-rw-r--r--sys/contrib/openzfs/module/os/freebsd/spl/spl_vm.c75
-rw-r--r--sys/contrib/openzfs/module/os/freebsd/spl/spl_zlib.c242
-rw-r--r--sys/contrib/openzfs/module/os/freebsd/spl/spl_zone.c260
-rw-r--r--sys/contrib/openzfs/module/os/freebsd/zfs/abd_os.c487
-rw-r--r--sys/contrib/openzfs/module/os/freebsd/zfs/arc_os.c255
-rw-r--r--sys/contrib/openzfs/module/os/freebsd/zfs/crypto_os.c611
-rw-r--r--sys/contrib/openzfs/module/os/freebsd/zfs/dmu_os.c349
-rw-r--r--sys/contrib/openzfs/module/os/freebsd/zfs/hkdf.c102
-rw-r--r--sys/contrib/openzfs/module/os/freebsd/zfs/kmod_core.c375
-rw-r--r--sys/contrib/openzfs/module/os/freebsd/zfs/spa_os.c281
-rw-r--r--sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c700
-rw-r--r--sys/contrib/openzfs/module/os/freebsd/zfs/vdev_file.c354
-rw-r--r--sys/contrib/openzfs/module/os/freebsd/zfs/vdev_geom.c1214
-rw-r--r--sys/contrib/openzfs/module/os/freebsd/zfs/vdev_label_os.c74
-rw-r--r--sys/contrib/openzfs/module/os/freebsd/zfs/zfs_acl.c2700
-rw-r--r--sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ctldir.c1360
-rw-r--r--sys/contrib/openzfs/module/os/freebsd/zfs/zfs_debug.c251
-rw-r--r--sys/contrib/openzfs/module/os/freebsd/zfs/zfs_dir.c968
-rw-r--r--sys/contrib/openzfs/module/os/freebsd/zfs/zfs_file_os.c308
-rw-r--r--sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ioctl_compat.c363
-rw-r--r--sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ioctl_os.c161
-rw-r--r--sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vfsops.c2301
-rw-r--r--sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c5888
-rw-r--r--sys/contrib/openzfs/module/os/freebsd/zfs/zfs_znode.c2067
-rw-r--r--sys/contrib/openzfs/module/os/freebsd/zfs/zio_crypt.c1839
-rw-r--r--sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c1525
-rw-r--r--sys/contrib/openzfs/module/os/linux/spl/Makefile.in17
-rw-r--r--sys/contrib/openzfs/module/os/linux/spl/README.md16
-rw-r--r--sys/contrib/openzfs/module/os/linux/spl/THIRDPARTYLICENSE.gplv2339
-rw-r--r--sys/contrib/openzfs/module/os/linux/spl/THIRDPARTYLICENSE.gplv2.descrip1
-rw-r--r--sys/contrib/openzfs/module/os/linux/spl/spl-atomic.c35
-rw-r--r--sys/contrib/openzfs/module/os/linux/spl/spl-condvar.c509
-rw-r--r--sys/contrib/openzfs/module/os/linux/spl/spl-cred.c195
-rw-r--r--sys/contrib/openzfs/module/os/linux/spl/spl-err.c123
-rw-r--r--sys/contrib/openzfs/module/os/linux/spl/spl-generic.c841
-rw-r--r--sys/contrib/openzfs/module/os/linux/spl/spl-kmem-cache.c1468
-rw-r--r--sys/contrib/openzfs/module/os/linux/spl/spl-kmem.c617
-rw-r--r--sys/contrib/openzfs/module/os/linux/spl/spl-kstat.c781
-rw-r--r--sys/contrib/openzfs/module/os/linux/spl/spl-proc.c790
-rw-r--r--sys/contrib/openzfs/module/os/linux/spl/spl-procfs-list.c284
-rw-r--r--sys/contrib/openzfs/module/os/linux/spl/spl-taskq.c1428
-rw-r--r--sys/contrib/openzfs/module/os/linux/spl/spl-thread.c160
-rw-r--r--sys/contrib/openzfs/module/os/linux/spl/spl-trace.c33
-rw-r--r--sys/contrib/openzfs/module/os/linux/spl/spl-tsd.c719
-rw-r--r--sys/contrib/openzfs/module/os/linux/spl/spl-vmem.c90
-rw-r--r--sys/contrib/openzfs/module/os/linux/spl/spl-xdr.c512
-rw-r--r--sys/contrib/openzfs/module/os/linux/spl/spl-zlib.c217
-rw-r--r--sys/contrib/openzfs/module/os/linux/zfs/Makefile.in37
-rw-r--r--sys/contrib/openzfs/module/os/linux/zfs/abd_os.c1073
-rw-r--r--sys/contrib/openzfs/module/os/linux/zfs/arc_os.c530
-rw-r--r--sys/contrib/openzfs/module/os/linux/zfs/mmp_os.c41
-rw-r--r--sys/contrib/openzfs/module/os/linux/zfs/policy.c375
-rw-r--r--sys/contrib/openzfs/module/os/linux/zfs/qat.c105
-rw-r--r--sys/contrib/openzfs/module/os/linux/zfs/qat_compress.c569
-rw-r--r--sys/contrib/openzfs/module/os/linux/zfs/qat_crypt.c630
-rw-r--r--sys/contrib/openzfs/module/os/linux/zfs/spa_misc_os.c110
-rw-r--r--sys/contrib/openzfs/module/os/linux/zfs/trace.c55
-rw-r--r--sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c919
-rw-r--r--sys/contrib/openzfs/module/os/linux/zfs/vdev_file.c382
-rw-r--r--sys/contrib/openzfs/module/os/linux/zfs/zfs_acl.c2932
-rw-r--r--sys/contrib/openzfs/module/os/linux/zfs/zfs_ctldir.c1260
-rw-r--r--sys/contrib/openzfs/module/os/linux/zfs/zfs_debug.c255
-rw-r--r--sys/contrib/openzfs/module/os/linux/zfs/zfs_dir.c1225
-rw-r--r--sys/contrib/openzfs/module/os/linux/zfs/zfs_file_os.c440
-rw-r--r--sys/contrib/openzfs/module/os/linux/zfs/zfs_ioctl_os.c329
-rw-r--r--sys/contrib/openzfs/module/os/linux/zfs/zfs_sysfs.c662
-rw-r--r--sys/contrib/openzfs/module/os/linux/zfs/zfs_uio.c333
-rw-r--r--sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c2176
-rw-r--r--sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c4010
-rw-r--r--sys/contrib/openzfs/module/os/linux/zfs/zfs_znode.c2244
-rw-r--r--sys/contrib/openzfs/module/os/linux/zfs/zio_crypt.c2049
-rw-r--r--sys/contrib/openzfs/module/os/linux/zfs/zpl_ctldir.c552
-rw-r--r--sys/contrib/openzfs/module/os/linux/zfs/zpl_export.c154
-rw-r--r--sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c1069
-rw-r--r--sys/contrib/openzfs/module/os/linux/zfs/zpl_inode.c745
-rw-r--r--sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c365
-rw-r--r--sys/contrib/openzfs/module/os/linux/zfs/zpl_xattr.c1486
-rw-r--r--sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c1098
-rw-r--r--sys/contrib/openzfs/module/spl/Makefile.in13
-rw-r--r--sys/contrib/openzfs/module/unicode/Makefile.in11
-rw-r--r--sys/contrib/openzfs/module/unicode/u8_textprep.c2151
-rw-r--r--sys/contrib/openzfs/module/unicode/uconv.c863
-rw-r--r--sys/contrib/openzfs/module/zcommon/Makefile.in28
-rw-r--r--sys/contrib/openzfs/module/zcommon/cityhash.c67
-rw-r--r--sys/contrib/openzfs/module/zcommon/zfeature_common.c609
-rw-r--r--sys/contrib/openzfs/module/zcommon/zfs_comutil.c263
-rw-r--r--sys/contrib/openzfs/module/zcommon/zfs_deleg.c249
-rw-r--r--sys/contrib/openzfs/module/zcommon/zfs_fletcher.c991
-rw-r--r--sys/contrib/openzfs/module/zcommon/zfs_fletcher_aarch64_neon.c215
-rw-r--r--sys/contrib/openzfs/module/zcommon/zfs_fletcher_avx512.c225
-rw-r--r--sys/contrib/openzfs/module/zcommon/zfs_fletcher_intel.c173
-rw-r--r--sys/contrib/openzfs/module/zcommon/zfs_fletcher_sse.c232
-rw-r--r--sys/contrib/openzfs/module/zcommon/zfs_fletcher_superscalar.c163
-rw-r--r--sys/contrib/openzfs/module/zcommon/zfs_fletcher_superscalar4.c229
-rw-r--r--sys/contrib/openzfs/module/zcommon/zfs_namecheck.c473
-rw-r--r--sys/contrib/openzfs/module/zcommon/zfs_prop.c1052
-rw-r--r--sys/contrib/openzfs/module/zcommon/zpool_prop.c279
-rw-r--r--sys/contrib/openzfs/module/zcommon/zprop_common.c480
-rw-r--r--sys/contrib/openzfs/module/zfs/Makefile.in157
-rw-r--r--sys/contrib/openzfs/module/zfs/THIRDPARTYLICENSE.cityhash19
-rw-r--r--sys/contrib/openzfs/module/zfs/THIRDPARTYLICENSE.cityhash.descrip1
-rw-r--r--sys/contrib/openzfs/module/zfs/abd.c1212
-rw-r--r--sys/contrib/openzfs/module/zfs/aggsum.c240
-rw-r--r--sys/contrib/openzfs/module/zfs/arc.c10768
-rw-r--r--sys/contrib/openzfs/module/zfs/blkptr.c153
-rw-r--r--sys/contrib/openzfs/module/zfs/bplist.c91
-rw-r--r--sys/contrib/openzfs/module/zfs/bpobj.c943
-rw-r--r--sys/contrib/openzfs/module/zfs/bptree.c303
-rw-r--r--sys/contrib/openzfs/module/zfs/bqueue.c155
-rw-r--r--sys/contrib/openzfs/module/zfs/btree.c2124
-rw-r--r--sys/contrib/openzfs/module/zfs/dataset_kstats.c215
-rw-r--r--sys/contrib/openzfs/module/zfs/dbuf.c4958
-rw-r--r--sys/contrib/openzfs/module/zfs/dbuf_stats.c232
-rw-r--r--sys/contrib/openzfs/module/zfs/ddt.c1187
-rw-r--r--sys/contrib/openzfs/module/zfs/ddt_zap.c168
-rw-r--r--sys/contrib/openzfs/module/zfs/dmu.c2333
-rw-r--r--sys/contrib/openzfs/module/zfs/dmu_diff.c240
-rw-r--r--sys/contrib/openzfs/module/zfs/dmu_object.c523
-rw-r--r--sys/contrib/openzfs/module/zfs/dmu_objset.c3044
-rw-r--r--sys/contrib/openzfs/module/zfs/dmu_recv.c3390
-rw-r--r--sys/contrib/openzfs/module/zfs/dmu_redact.c1199
-rw-r--r--sys/contrib/openzfs/module/zfs/dmu_send.c3094
-rw-r--r--sys/contrib/openzfs/module/zfs/dmu_traverse.c788
-rw-r--r--sys/contrib/openzfs/module/zfs/dmu_tx.c1417
-rw-r--r--sys/contrib/openzfs/module/zfs/dmu_zfetch.c471
-rw-r--r--sys/contrib/openzfs/module/zfs/dnode.c2583
-rw-r--r--sys/contrib/openzfs/module/zfs/dnode_sync.c858
-rw-r--r--sys/contrib/openzfs/module/zfs/dsl_bookmark.c1734
-rw-r--r--sys/contrib/openzfs/module/zfs/dsl_crypt.c2863
-rw-r--r--sys/contrib/openzfs/module/zfs/dsl_dataset.c5014
-rw-r--r--sys/contrib/openzfs/module/zfs/dsl_deadlist.c1012
-rw-r--r--sys/contrib/openzfs/module/zfs/dsl_deleg.c774
-rw-r--r--sys/contrib/openzfs/module/zfs/dsl_destroy.c1281
-rw-r--r--sys/contrib/openzfs/module/zfs/dsl_dir.c2403
-rw-r--r--sys/contrib/openzfs/module/zfs/dsl_pool.c1417
-rw-r--r--sys/contrib/openzfs/module/zfs/dsl_prop.c1287
-rw-r--r--sys/contrib/openzfs/module/zfs/dsl_scan.c4422
-rw-r--r--sys/contrib/openzfs/module/zfs/dsl_synctask.c257
-rw-r--r--sys/contrib/openzfs/module/zfs/dsl_userhold.c691
-rw-r--r--sys/contrib/openzfs/module/zfs/edonr_zfs.c115
-rw-r--r--sys/contrib/openzfs/module/zfs/fm.c1686
-rw-r--r--sys/contrib/openzfs/module/zfs/gzip.c106
-rw-r--r--sys/contrib/openzfs/module/zfs/hkdf.c171
-rw-r--r--sys/contrib/openzfs/module/zfs/lz4.c1084
-rw-r--r--sys/contrib/openzfs/module/zfs/lzjb.c132
-rw-r--r--sys/contrib/openzfs/module/zfs/metaslab.c6287
-rw-r--r--sys/contrib/openzfs/module/zfs/mmp.c741
-rw-r--r--sys/contrib/openzfs/module/zfs/multilist.c434
-rw-r--r--sys/contrib/openzfs/module/zfs/objlist.c84
-rw-r--r--sys/contrib/openzfs/module/zfs/pathname.c88
-rw-r--r--sys/contrib/openzfs/module/zfs/range_tree.c922
-rw-r--r--sys/contrib/openzfs/module/zfs/refcount.c327
-rw-r--r--sys/contrib/openzfs/module/zfs/rrwlock.c396
-rw-r--r--sys/contrib/openzfs/module/zfs/sa.c2257
-rw-r--r--sys/contrib/openzfs/module/zfs/sha256.c105
-rw-r--r--sys/contrib/openzfs/module/zfs/skein_zfs.c102
-rw-r--r--sys/contrib/openzfs/module/zfs/spa.c9885
-rw-r--r--sys/contrib/openzfs/module/zfs/spa_boot.c50
-rw-r--r--sys/contrib/openzfs/module/zfs/spa_checkpoint.c636
-rw-r--r--sys/contrib/openzfs/module/zfs/spa_config.c623
-rw-r--r--sys/contrib/openzfs/module/zfs/spa_errlog.c416
-rw-r--r--sys/contrib/openzfs/module/zfs/spa_history.c634
-rw-r--r--sys/contrib/openzfs/module/zfs/spa_log_spacemap.c1322
-rw-r--r--sys/contrib/openzfs/module/zfs/spa_misc.c2953
-rw-r--r--sys/contrib/openzfs/module/zfs/spa_stats.c1029
-rw-r--r--sys/contrib/openzfs/module/zfs/space_map.c1105
-rw-r--r--sys/contrib/openzfs/module/zfs/space_reftree.c152
-rw-r--r--sys/contrib/openzfs/module/zfs/txg.c1076
-rw-r--r--sys/contrib/openzfs/module/zfs/uberblock.c74
-rw-r--r--sys/contrib/openzfs/module/zfs/unique.c112
-rw-r--r--sys/contrib/openzfs/module/zfs/vdev.c5420
-rw-r--r--sys/contrib/openzfs/module/zfs/vdev_cache.c437
-rw-r--r--sys/contrib/openzfs/module/zfs/vdev_draid.c2976
-rw-r--r--sys/contrib/openzfs/module/zfs/vdev_draid_rand.c40
-rw-r--r--sys/contrib/openzfs/module/zfs/vdev_indirect.c1911
-rw-r--r--sys/contrib/openzfs/module/zfs/vdev_indirect_births.c226
-rw-r--r--sys/contrib/openzfs/module/zfs/vdev_indirect_mapping.c616
-rw-r--r--sys/contrib/openzfs/module/zfs/vdev_initialize.c766
-rw-r--r--sys/contrib/openzfs/module/zfs/vdev_label.c1992
-rw-r--r--sys/contrib/openzfs/module/zfs/vdev_mirror.c972
-rw-r--r--sys/contrib/openzfs/module/zfs/vdev_missing.c131
-rw-r--r--sys/contrib/openzfs/module/zfs/vdev_queue.c1164
-rw-r--r--sys/contrib/openzfs/module/zfs/vdev_raidz.c2747
-rw-r--r--sys/contrib/openzfs/module/zfs/vdev_raidz_math.c666
-rw-r--r--sys/contrib/openzfs/module/zfs/vdev_raidz_math_aarch64_neon.c2279
-rw-r--r--sys/contrib/openzfs/module/zfs/vdev_raidz_math_aarch64_neon_common.h684
-rw-r--r--sys/contrib/openzfs/module/zfs/vdev_raidz_math_aarch64_neonx2.c232
-rw-r--r--sys/contrib/openzfs/module/zfs/vdev_raidz_math_avx2.c413
-rw-r--r--sys/contrib/openzfs/module/zfs/vdev_raidz_math_avx512bw.c413
-rw-r--r--sys/contrib/openzfs/module/zfs/vdev_raidz_math_avx512f.c494
-rw-r--r--sys/contrib/openzfs/module/zfs/vdev_raidz_math_impl.h1502
-rw-r--r--sys/contrib/openzfs/module/zfs/vdev_raidz_math_powerpc_altivec.c4337
-rw-r--r--sys/contrib/openzfs/module/zfs/vdev_raidz_math_powerpc_altivec_common.h690
-rw-r--r--sys/contrib/openzfs/module/zfs/vdev_raidz_math_scalar.c337
-rw-r--r--sys/contrib/openzfs/module/zfs/vdev_raidz_math_sse2.c631
-rw-r--r--sys/contrib/openzfs/module/zfs/vdev_raidz_math_ssse3.c2477
-rw-r--r--sys/contrib/openzfs/module/zfs/vdev_rebuild.c1147
-rw-r--r--sys/contrib/openzfs/module/zfs/vdev_removal.c2390
-rw-r--r--sys/contrib/openzfs/module/zfs/vdev_root.c167
-rw-r--r--sys/contrib/openzfs/module/zfs/vdev_trim.c1719
-rw-r--r--sys/contrib/openzfs/module/zfs/zap.c1384
-rw-r--r--sys/contrib/openzfs/module/zfs/zap_leaf.c849
-rw-r--r--sys/contrib/openzfs/module/zfs/zap_micro.c1697
-rw-r--r--sys/contrib/openzfs/module/zfs/zcp.c1451
-rw-r--r--sys/contrib/openzfs/module/zfs/zcp_get.c813
-rw-r--r--sys/contrib/openzfs/module/zfs/zcp_global.c89
-rw-r--r--sys/contrib/openzfs/module/zfs/zcp_iter.c751
-rw-r--r--sys/contrib/openzfs/module/zfs/zcp_set.c100
-rw-r--r--sys/contrib/openzfs/module/zfs/zcp_synctask.c544
-rw-r--r--sys/contrib/openzfs/module/zfs/zfeature.c526
-rw-r--r--sys/contrib/openzfs/module/zfs/zfs_byteswap.c211
-rw-r--r--sys/contrib/openzfs/module/zfs/zfs_fm.c1416
-rw-r--r--sys/contrib/openzfs/module/zfs/zfs_fuid.c815
-rw-r--r--sys/contrib/openzfs/module/zfs/zfs_ioctl.c7688
-rw-r--r--sys/contrib/openzfs/module/zfs/zfs_log.c781
-rw-r--r--sys/contrib/openzfs/module/zfs/zfs_onexit.c173
-rw-r--r--sys/contrib/openzfs/module/zfs/zfs_quota.c476
-rw-r--r--sys/contrib/openzfs/module/zfs/zfs_ratelimit.c99
-rw-r--r--sys/contrib/openzfs/module/zfs/zfs_replay.c997
-rw-r--r--sys/contrib/openzfs/module/zfs/zfs_rlock.c691
-rw-r--r--sys/contrib/openzfs/module/zfs/zfs_sa.c446
-rw-r--r--sys/contrib/openzfs/module/zfs/zfs_vnops.c897
-rw-r--r--sys/contrib/openzfs/module/zfs/zil.c3695
-rw-r--r--sys/contrib/openzfs/module/zfs/zio.c5039
-rw-r--r--sys/contrib/openzfs/module/zfs/zio_checksum.c570
-rw-r--r--sys/contrib/openzfs/module/zfs/zio_compress.c220
-rw-r--r--sys/contrib/openzfs/module/zfs/zio_inject.c972
-rw-r--r--sys/contrib/openzfs/module/zfs/zle.c91
-rw-r--r--sys/contrib/openzfs/module/zfs/zrlock.c188
-rw-r--r--sys/contrib/openzfs/module/zfs/zthr.c536
-rw-r--r--sys/contrib/openzfs/module/zfs/zvol.c1739
-rw-r--r--sys/contrib/openzfs/module/zstd/Makefile.in38
-rw-r--r--sys/contrib/openzfs/module/zstd/README.md65
-rw-r--r--sys/contrib/openzfs/module/zstd/include/aarch64_compat.h37
-rw-r--r--sys/contrib/openzfs/module/zstd/include/limits.h63
-rw-r--r--sys/contrib/openzfs/module/zstd/include/stddef.h62
-rw-r--r--sys/contrib/openzfs/module/zstd/include/stdint.h62
-rw-r--r--sys/contrib/openzfs/module/zstd/include/stdio.h54
-rw-r--r--sys/contrib/openzfs/module/zstd/include/stdlib.h58
-rw-r--r--sys/contrib/openzfs/module/zstd/include/string.h62
-rw-r--r--sys/contrib/openzfs/module/zstd/include/zstd_compat_wrapper.h460
-rw-r--r--sys/contrib/openzfs/module/zstd/lib/zstd.c27826
-rw-r--r--sys/contrib/openzfs/module/zstd/lib/zstd.h2115
-rw-r--r--sys/contrib/openzfs/module/zstd/lib/zstd_errors.h94
-rw-r--r--sys/contrib/openzfs/module/zstd/zfs_zstd.c780
-rw-r--r--sys/contrib/openzfs/module/zstd/zstd-in.c68
426 files changed, 354219 insertions, 0 deletions
diff --git a/sys/contrib/openzfs/module/.gitignore b/sys/contrib/openzfs/module/.gitignore
new file mode 100644
index 000000000000..7a4bd3673e77
--- /dev/null
+++ b/sys/contrib/openzfs/module/.gitignore
@@ -0,0 +1,26 @@
+*.ko
+*.ko.unsigned
+*.ko.out
+*.ko.out.sig
+*.ko.debug
+*.ko.full
+*.dwo
+.*.cmd
+.*.d
+*.mod
+
+/Kbuild
+/.cache.mk
+/.tmp_versions
+/Module.markers
+/Module.symvers
+/vnode_if*
+/bus_if.h
+/device_if.h
+/opt_global.h
+
+/export_syms
+/machine
+/x86
+
+!Makefile.in
diff --git a/sys/contrib/openzfs/module/Kbuild.in b/sys/contrib/openzfs/module/Kbuild.in
new file mode 100644
index 000000000000..1507965c5750
--- /dev/null
+++ b/sys/contrib/openzfs/module/Kbuild.in
@@ -0,0 +1,47 @@
+# When integrated in to a monolithic kernel the spl module must appear
+# first. This ensures its module initialization function is run before
+# any of the other module initialization functions which depend on it.
+ZFS_MODULES += spl/
+ZFS_MODULES += avl/
+ZFS_MODULES += icp/
+ZFS_MODULES += lua/
+ZFS_MODULES += nvpair/
+ZFS_MODULES += unicode/
+ZFS_MODULES += zcommon/
+ZFS_MODULES += zfs/
+ZFS_MODULES += zstd/
+
+# The rest is only relevant when run by kbuild
+ifneq ($(KERNELRELEASE),)
+
+obj-$(CONFIG_ZFS) := $(ZFS_MODULES)
+
+ZFS_MODULE_CFLAGS += -std=gnu99 -Wno-declaration-after-statement
+ZFS_MODULE_CFLAGS += -Wmissing-prototypes
+ZFS_MODULE_CFLAGS += @KERNEL_DEBUG_CFLAGS@ @NO_FORMAT_ZERO_LENGTH@
+
+ifneq ($(KBUILD_EXTMOD),)
+zfs_include = @abs_top_srcdir@/include
+ZFS_MODULE_CFLAGS += -include @abs_top_builddir@/zfs_config.h
+ZFS_MODULE_CFLAGS += -I@abs_top_builddir@/include
+else
+zfs_include = $(srctree)/include/zfs
+ZFS_MODULE_CFLAGS += -include $(zfs_include)/zfs_config.h
+endif
+
+ZFS_MODULE_CFLAGS += -I$(zfs_include)/os/linux/kernel
+ZFS_MODULE_CFLAGS += -I$(zfs_include)/os/linux/spl
+ZFS_MODULE_CFLAGS += -I$(zfs_include)/os/linux/zfs
+ZFS_MODULE_CFLAGS += -I$(zfs_include)
+ZFS_MODULE_CPPFLAGS += -D_KERNEL
+ZFS_MODULE_CPPFLAGS += @KERNEL_DEBUG_CPPFLAGS@
+
+ifneq ($(KBUILD_EXTMOD),)
+@CONFIG_QAT_TRUE@ZFS_MODULE_CFLAGS += -I@QAT_SRC@/include
+@CONFIG_QAT_TRUE@KBUILD_EXTRA_SYMBOLS += @QAT_SYMBOLS@
+endif
+
+subdir-asflags-y := $(ZFS_MODULE_CFLAGS) $(ZFS_MODULE_CPPFLAGS)
+subdir-ccflags-y := $(ZFS_MODULE_CFLAGS) $(ZFS_MODULE_CPPFLAGS)
+
+endif
diff --git a/sys/contrib/openzfs/module/Makefile.bsd b/sys/contrib/openzfs/module/Makefile.bsd
new file mode 100644
index 000000000000..e7cddcc5bb5e
--- /dev/null
+++ b/sys/contrib/openzfs/module/Makefile.bsd
@@ -0,0 +1,368 @@
+.if !defined(WITH_CTF)
+WITH_CTF=1
+.endif
+
+.include <bsd.sys.mk>
+
+SRCDIR=${.CURDIR}
+INCDIR=${.CURDIR:H}/include
+
+KMOD= openzfs
+
+.PATH: ${SRCDIR}/avl \
+ ${SRCDIR}/lua \
+ ${SRCDIR}/nvpair \
+ ${SRCDIR}/os/freebsd/spl \
+ ${SRCDIR}/os/freebsd/zfs \
+ ${SRCDIR}/unicode \
+ ${SRCDIR}/zcommon \
+ ${SRCDIR}/zfs \
+ ${SRCDIR}/zstd \
+ ${SRCDIR}/zstd/lib
+
+
+
+CFLAGS+= -I${.OBJDIR:H}/include
+CFLAGS+= -I${INCDIR}
+CFLAGS+= -I${INCDIR}/os/freebsd
+CFLAGS+= -I${INCDIR}/os/freebsd/spl
+CFLAGS+= -I${INCDIR}/os/freebsd/zfs
+CFLAGS+= -I${SRCDIR}/zstd/include
+CFLAGS+= -include ${INCDIR}/os/freebsd/spl/sys/ccompile.h
+
+CFLAGS+= -D__KERNEL__ -DFREEBSD_NAMECACHE -DBUILDING_ZFS -D__BSD_VISIBLE=1 \
+ -DHAVE_UIO_ZEROCOPY -DWITHOUT_NETDUMP -D__KERNEL -D_SYS_CONDVAR_H_ \
+ -D_SYS_VMEM_H_ -DKDTRACE_HOOKS -DSMP -DHAVE_KSID -DCOMPAT_FREEBSD11
+
+.if ${MACHINE_ARCH} == "amd64"
+CFLAGS+= -DHAVE_AVX2 -DHAVE_AVX -D__x86_64 -DHAVE_SSE2 -DHAVE_AVX512F -DHAVE_SSSE3
+.endif
+
+.if defined(WITH_DEBUG) && ${WITH_DEBUG} == "true"
+CFLAGS+= -DZFS_DEBUG -g
+.if defined(WITH_INVARIANTS) && ${WITH_INVARIANTS} == "true"
+ CFLAGS+= -DINVARIANTS -DWITNESS -DOPENSOLARIS_WITNESS
+.endif
+.if defined(WITH_O0) && ${WITH_O0} == "true"
+ CFLAGS+= -O0
+.endif
+.else
+CFLAGS += -DNDEBUG
+.endif
+
+.if defined(WITH_VFS_DEBUG) && ${WITH_VFS_DEBUG} == "true"
+# kernel must also be built with this option for this to work
+CFLAGS+= -DDEBUG_VFS_LOCKS
+.endif
+
+.if defined(WITH_GCOV) && ${WITH_GCOV} == "true"
+CFLAGS+= -fprofile-arcs -ftest-coverage
+.endif
+
+DEBUG_FLAGS=-g
+
+.if ${MACHINE_ARCH} == "i386" || ${MACHINE_ARCH} == "powerpc" || \
+ ${MACHINE_ARCH} == "arm"
+CFLAGS+= -DBITS_PER_LONG=32
+.else
+CFLAGS+= -DBITS_PER_LONG=64
+.endif
+
+SRCS= vnode_if.h device_if.h bus_if.h
+
+# avl
+SRCS+= avl.c
+
+#lua
+SRCS+= lapi.c \
+ lauxlib.c \
+ lbaselib.c \
+ lcode.c \
+ lcompat.c \
+ lcorolib.c \
+ lctype.c \
+ ldebug.c \
+ ldo.c \
+ lfunc.c \
+ lgc.c \
+ llex.c \
+ lmem.c \
+ lobject.c \
+ lopcodes.c \
+ lparser.c \
+ lstate.c \
+ lstring.c \
+ lstrlib.c \
+ ltable.c \
+ ltablib.c \
+ ltm.c \
+ lvm.c \
+ lzio.c
+
+#nvpair
+SRCS+= nvpair.c \
+ fnvpair.c \
+ nvpair_alloc_spl.c \
+ nvpair_alloc_fixed.c
+
+#os/freebsd/spl
+SRCS+= acl_common.c \
+ callb.c \
+ list.c \
+ sha256c.c \
+ sha512c.c \
+ spl_acl.c \
+ spl_cmn_err.c \
+ spl_dtrace.c \
+ spl_kmem.c \
+ spl_kstat.c \
+ spl_misc.c \
+ spl_policy.c \
+ spl_procfs_list.c \
+ spl_string.c \
+ spl_sunddi.c \
+ spl_sysevent.c \
+ spl_taskq.c \
+ spl_uio.c \
+ spl_vfs.c \
+ spl_vm.c \
+ spl_zlib.c \
+ spl_zone.c
+
+
+.if ${MACHINE_ARCH} == "i386" || ${MACHINE_ARCH} == "powerpc" || \
+ ${MACHINE_ARCH} == "arm"
+SRCS+= spl_atomic.c
+.endif
+
+#os/freebsd/zfs
+SRCS+= abd_os.c \
+ arc_os.c \
+ crypto_os.c \
+ dmu_os.c \
+ hkdf.c \
+ kmod_core.c \
+ spa_os.c \
+ sysctl_os.c \
+ vdev_file.c \
+ vdev_geom.c \
+ vdev_label_os.c \
+ zfs_acl.c \
+ zfs_ctldir.c \
+ zfs_debug.c \
+ zfs_dir.c \
+ zfs_ioctl_compat.c \
+ zfs_ioctl_os.c \
+ zfs_vfsops.c \
+ zfs_vnops_os.c \
+ zfs_znode.c \
+ zio_crypt.c \
+ zvol_os.c
+
+#unicode
+SRCS+= uconv.c \
+ u8_textprep.c
+
+#zcommon
+SRCS+= zfeature_common.c \
+ zfs_comutil.c \
+ zfs_deleg.c \
+ zfs_fletcher.c \
+ zfs_fletcher_avx512.c \
+ zfs_fletcher_intel.c \
+ zfs_fletcher_sse.c \
+ zfs_fletcher_superscalar.c \
+ zfs_fletcher_superscalar4.c \
+ zfs_namecheck.c \
+ zfs_prop.c \
+ zpool_prop.c \
+ zprop_common.c
+
+#zfs
+SRCS+= abd.c \
+ aggsum.c \
+ arc.c \
+ blkptr.c \
+ bplist.c \
+ bpobj.c \
+ btree.c \
+ cityhash.c \
+ dbuf.c \
+ dbuf_stats.c \
+ bptree.c \
+ bqueue.c \
+ dataset_kstats.c \
+ ddt.c \
+ ddt_zap.c \
+ dmu.c \
+ dmu_diff.c \
+ dmu_object.c \
+ dmu_objset.c \
+ dmu_recv.c \
+ dmu_redact.c \
+ dmu_send.c \
+ dmu_traverse.c \
+ dmu_tx.c \
+ dmu_zfetch.c \
+ dnode.c \
+ dnode_sync.c \
+ dsl_dataset.c \
+ dsl_deadlist.c \
+ dsl_deleg.c \
+ dsl_bookmark.c \
+ dsl_dir.c \
+ dsl_crypt.c \
+ dsl_destroy.c \
+ dsl_pool.c \
+ dsl_prop.c \
+ dsl_scan.c \
+ dsl_synctask.c \
+ dsl_userhold.c \
+ fm.c \
+ gzip.c \
+ lzjb.c \
+ lz4.c \
+ metaslab.c \
+ mmp.c \
+ multilist.c \
+ objlist.c \
+ pathname.c \
+ range_tree.c \
+ refcount.c \
+ rrwlock.c \
+ sa.c \
+ sha256.c \
+ skein_zfs.c \
+ spa.c \
+ spa_boot.c \
+ spa_checkpoint.c \
+ spa_config.c \
+ spa_errlog.c \
+ spa_history.c \
+ spa_log_spacemap.c \
+ spa_misc.c \
+ spa_stats.c \
+ space_map.c \
+ space_reftree.c \
+ txg.c \
+ uberblock.c \
+ unique.c \
+ vdev.c \
+ vdev_cache.c \
+ vdev_draid.c \
+ vdev_draid_rand.c \
+ vdev_indirect.c \
+ vdev_indirect_births.c \
+ vdev_indirect_mapping.c \
+ vdev_initialize.c \
+ vdev_label.c \
+ vdev_mirror.c \
+ vdev_missing.c \
+ vdev_queue.c \
+ vdev_raidz.c \
+ vdev_raidz_math.c \
+ vdev_raidz_math_scalar.c \
+ vdev_rebuild.c \
+ vdev_raidz_math_avx2.c \
+ vdev_raidz_math_avx512bw.c \
+ vdev_raidz_math_avx512f.c \
+ vdev_raidz_math_sse2.c \
+ vdev_raidz_math_ssse3.c \
+ vdev_removal.c \
+ vdev_root.c \
+ vdev_trim.c \
+ zap.c \
+ zap_leaf.c \
+ zap_micro.c \
+ zcp.c \
+ zcp_get.c \
+ zcp_global.c \
+ zcp_iter.c \
+ zcp_set.c \
+ zcp_synctask.c \
+ zfeature.c \
+ zfs_byteswap.c \
+ zfs_file_os.c \
+ zfs_fm.c \
+ zfs_fuid.c \
+ zfs_ioctl.c \
+ zfs_log.c \
+ zfs_onexit.c \
+ zfs_quota.c \
+ zfs_ratelimit.c \
+ zfs_replay.c \
+ zfs_rlock.c \
+ zfs_sa.c \
+ zfs_vnops.c \
+ zil.c \
+ zio.c \
+ zio_checksum.c \
+ zio_compress.c \
+ zio_inject.c \
+ zle.c \
+ zrlock.c \
+ zthr.c \
+ zvol.c
+
+#zstd
+SRCS+= zfs_zstd.c \
+ zstd.c
+
+beforeinstall:
+.if ${MK_DEBUG_FILES} != "no"
+ mtree -eu \
+ -f /etc/mtree/BSD.debug.dist \
+ -p ${DESTDIR}/usr/lib
+.endif
+
+.include <bsd.kmod.mk>
+
+
+CFLAGS.gcc+= -Wno-pointer-to-int-cast
+
+CFLAGS.lapi.c= -Wno-cast-qual
+CFLAGS.lcompat.c= -Wno-cast-qual
+CFLAGS.lobject.c= -Wno-cast-qual
+CFLAGS.ltable.c= -Wno-cast-qual
+CFLAGS.lvm.c= -Wno-cast-qual
+CFLAGS.nvpair.c= -DHAVE_RPC_TYPES -Wno-cast-qual
+CFLAGS.spl_string.c= -Wno-cast-qual
+CFLAGS.spl_vm.c= -Wno-cast-qual
+CFLAGS.spl_zlib.c= -Wno-cast-qual
+CFLAGS.abd.c= -Wno-cast-qual
+CFLAGS.zfs_log.c= -Wno-cast-qual
+CFLAGS.zfs_vnops_os.c= -Wno-pointer-arith
+CFLAGS.u8_textprep.c= -Wno-cast-qual
+CFLAGS.zfs_fletcher.c= -Wno-cast-qual -Wno-pointer-arith
+CFLAGS.zfs_fletcher_intel.c= -Wno-cast-qual -Wno-pointer-arith
+CFLAGS.zfs_fletcher_sse.c= -Wno-cast-qual -Wno-pointer-arith
+CFLAGS.zfs_fletcher_avx512.c= -Wno-cast-qual -Wno-pointer-arith
+CFLAGS.zprop_common.c= -Wno-cast-qual
+CFLAGS.ddt.c= -Wno-cast-qual
+CFLAGS.dmu.c= -Wno-cast-qual
+CFLAGS.dmu_traverse.c= -Wno-cast-qual
+CFLAGS.dsl_dir.c= -Wno-cast-qual
+CFLAGS.dsl_deadlist.c= -Wno-cast-qual
+CFLAGS.dsl_prop.c= -Wno-cast-qual
+CFLAGS.fm.c= -Wno-cast-qual
+CFLAGS.lz4.c= -Wno-cast-qual
+CFLAGS.spa.c= -Wno-cast-qual
+CFLAGS.spa_misc.c= -Wno-cast-qual
+CFLAGS.sysctl_os.c= -include ../zfs_config.h
+CFLAGS.vdev_draid.c= -Wno-cast-qual
+CFLAGS.vdev_raidz.c= -Wno-cast-qual
+CFLAGS.vdev_raidz_math.c= -Wno-cast-qual
+CFLAGS.vdev_raidz_math_scalar.c= -Wno-cast-qual
+CFLAGS.vdev_raidz_math_avx2.c= -Wno-cast-qual -Wno-duplicate-decl-specifier
+CFLAGS.vdev_raidz_math_avx512f.c= -Wno-cast-qual -Wno-duplicate-decl-specifier
+CFLAGS.vdev_raidz_math_sse2.c= -Wno-cast-qual -Wno-duplicate-decl-specifier
+CFLAGS.zap_leaf.c= -Wno-cast-qual
+CFLAGS.zap_micro.c= -Wno-cast-qual
+CFLAGS.zcp.c= -Wno-cast-qual
+CFLAGS.zfs_fm.c= -Wno-cast-qual
+CFLAGS.zfs_ioctl.c= -Wno-cast-qual
+CFLAGS.zil.c= -Wno-cast-qual
+CFLAGS.zio.c= -Wno-cast-qual
+CFLAGS.zrlock.c= -Wno-cast-qual
+CFLAGS.zfs_zstd.c= -Wno-cast-qual -Wno-pointer-arith
+CFLAGS.zstd.c= -fno-tree-vectorize -U__BMI__
diff --git a/sys/contrib/openzfs/module/Makefile.in b/sys/contrib/openzfs/module/Makefile.in
new file mode 100644
index 000000000000..69caf48570e9
--- /dev/null
+++ b/sys/contrib/openzfs/module/Makefile.in
@@ -0,0 +1,135 @@
+include Kbuild
+
+INSTALL_MOD_DIR ?= extra
+
+SUBDIR_TARGETS = icp lua zstd
+
+all: modules
+distclean maintainer-clean: clean
+install: modules_install
+uninstall: modules_uninstall
+check:
+
+.PHONY: all distclean maintainer-clean install uninstall check distdir \
+ modules modules-Linux modules-FreeBSD modules-unknown \
+ clean clean-Linux clean-FreeBSD \
+ modules_install modules_install-Linux modules_install-FreeBSD \
+ modules_uninstall modules_uninstall-Linux modules_uninstall-FreeBSD \
+ cppcheck cppcheck-Linux cppcheck-FreeBSD
+
+# Filter out options that FreeBSD make doesn't understand
+getflags = ( \
+set -- \
+ $(filter-out --%,$(firstword $(MFLAGS))) \
+ $(filter -I%,$(MFLAGS)) \
+ $(filter -j%,$(MFLAGS)); \
+fmakeflags=""; \
+while getopts :deiI:j:knqrstw flag; do \
+ case $$flag in \
+ \?) :;; \
+ :) if [ $$OPTARG = "j" ]; then \
+ ncpus=$$(sysctl -n kern.smp.cpus 2>/dev/null || :); \
+ if [ -n "$$ncpus" ]; then fmakeflags="$$fmakeflags -j$$ncpus"; fi; \
+ fi;; \
+ d) fmakeflags="$$fmakeflags -dA";; \
+ *) fmakeflags="$$fmakeflags -$$flag$$OPTARG";; \
+ esac; \
+done; \
+echo $$fmakeflags \
+)
+FMAKEFLAGS = -C @abs_srcdir@ -f Makefile.bsd $(shell $(getflags))
+
+ifneq (@abs_srcdir@,@abs_builddir@)
+FMAKEFLAGS += MAKEOBJDIR=@abs_builddir@
+endif
+FMAKE = env -u MAKEFLAGS make $(FMAKEFLAGS)
+
+modules-Linux:
+ list='$(SUBDIR_TARGETS)'; for targetdir in $$list; do \
+ $(MAKE) -C $$targetdir; \
+ done
+ $(MAKE) -C @LINUX_OBJ@ M=`pwd` @KERNEL_MAKE@ CONFIG_ZFS=m modules
+
+modules-FreeBSD:
+ +$(FMAKE)
+
+modules-unknown:
+ @true
+
+modules: modules-@ac_system@
+
+clean-Linux:
+ @# Only cleanup the kernel build directories when CONFIG_KERNEL
+ @# is defined. This indicates that kernel modules should be built.
+@CONFIG_KERNEL_TRUE@ $(MAKE) -C @LINUX_OBJ@ M=`pwd` @KERNEL_MAKE@ clean
+
+ if [ -f @LINUX_SYMBOLS@ ]; then $(RM) @LINUX_SYMBOLS@; fi
+ if [ -f Module.markers ]; then $(RM) Module.markers; fi
+
+ find . -name '*.ur-safe' -type f -print | xargs $(RM)
+
+clean-FreeBSD:
+ +$(FMAKE) clean
+
+clean: clean-@ac_system@
+
+modules_install-Linux:
+ @# Install the kernel modules
+ $(MAKE) -C @LINUX_OBJ@ M=`pwd` modules_install \
+ INSTALL_MOD_PATH=$(DESTDIR)$(INSTALL_MOD_PATH) \
+ INSTALL_MOD_DIR=$(INSTALL_MOD_DIR) \
+ KERNELRELEASE=@LINUX_VERSION@
+ @# Remove extraneous build products when packaging
+ kmoddir=$(DESTDIR)$(INSTALL_MOD_PATH)/lib/modules/@LINUX_VERSION@; \
+ if [ -n "$(DESTDIR)" ]; then \
+ find $$kmoddir -name 'modules.*' | xargs $(RM); \
+ fi
+ sysmap=$(DESTDIR)$(INSTALL_MOD_PATH)/boot/System.map-@LINUX_VERSION@; \
+ if [ -f $$sysmap ]; then \
+ depmod -ae -F $$sysmap @LINUX_VERSION@; \
+ fi
+
+modules_install-FreeBSD:
+ @# Install the kernel modules
+ +$(FMAKE) install
+
+modules_install: modules_install-@ac_system@
+
+modules_uninstall-Linux:
+ @# Uninstall the kernel modules
+ kmoddir=$(DESTDIR)$(INSTALL_MOD_PATH)/lib/modules/@LINUX_VERSION@; \
+ for objdir in $(ZFS_MODULES); do \
+ $(RM) -R $$kmoddir/$(INSTALL_MOD_DIR)/$$objdir; \
+ done
+
+modules_uninstall-FreeBSD:
+ @false
+
+modules_uninstall: modules_uninstall-@ac_system@
+
+cppcheck-Linux:
+ @CPPCHECK@ -j@CPU_COUNT@ --std=c99 --quiet --force --error-exitcode=2 \
+ --inline-suppr --suppress=noValidConfiguration \
+ --enable=warning,information -D_KERNEL \
+ --include=@LINUX_OBJ@/include/generated/autoconf.h \
+ --include=@top_srcdir@/zfs_config.h \
+ --config-exclude=@LINUX_OBJ@/include \
+ -I @LINUX_OBJ@/include \
+ -I @top_srcdir@/include/os/linux/kernel \
+ -I @top_srcdir@/include/os/linux/spl \
+ -I @top_srcdir@/include/os/linux/zfs \
+ -I @top_srcdir@/include \
+ avl icp lua nvpair spl unicode zcommon zfs zstd os/linux
+
+cppcheck-FreeBSD:
+ @true
+
+cppcheck: cppcheck-@ac_system@
+
+distdir:
+ (cd @srcdir@ && find $(ZFS_MODULES) os -name '*.[chS]') | \
+ while read path; do \
+ mkdir -p $$distdir/$${path%/*}; \
+ cp @srcdir@/$$path $$distdir/$$path; \
+ done; \
+ cp @srcdir@/Makefile.bsd $$distdir/Makefile.bsd
diff --git a/sys/contrib/openzfs/module/avl/Makefile.in b/sys/contrib/openzfs/module/avl/Makefile.in
new file mode 100644
index 000000000000..991d5f95b8c0
--- /dev/null
+++ b/sys/contrib/openzfs/module/avl/Makefile.in
@@ -0,0 +1,10 @@
+ifneq ($(KBUILD_EXTMOD),)
+src = @abs_srcdir@
+obj = @abs_builddir@
+endif
+
+MODULE := zavl
+
+obj-$(CONFIG_ZFS) := $(MODULE).o
+
+$(MODULE)-objs += avl.o
diff --git a/sys/contrib/openzfs/module/avl/avl.c b/sys/contrib/openzfs/module/avl/avl.c
new file mode 100644
index 000000000000..d0473d883b3d
--- /dev/null
+++ b/sys/contrib/openzfs/module/avl/avl.c
@@ -0,0 +1,1093 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2015 by Delphix. All rights reserved.
+ */
+
+/*
+ * AVL - generic AVL tree implementation for kernel use
+ *
+ * A complete description of AVL trees can be found in many CS textbooks.
+ *
+ * Here is a very brief overview. An AVL tree is a binary search tree that is
+ * almost perfectly balanced. By "almost" perfectly balanced, we mean that at
+ * any given node, the left and right subtrees are allowed to differ in height
+ * by at most 1 level.
+ *
+ * This relaxation from a perfectly balanced binary tree allows doing
+ * insertion and deletion relatively efficiently. Searching the tree is
+ * still a fast operation, roughly O(log(N)).
+ *
+ * The key to insertion and deletion is a set of tree manipulations called
+ * rotations, which bring unbalanced subtrees back into the semi-balanced state.
+ *
+ * This implementation of AVL trees has the following peculiarities:
+ *
+ * - The AVL specific data structures are physically embedded as fields
+ * in the "using" data structures. To maintain generality the code
+ * must constantly translate between "avl_node_t *" and containing
+ * data structure "void *"s by adding/subtracting the avl_offset.
+ *
+ * - Since the AVL data is always embedded in other structures, there is
+ * no locking or memory allocation in the AVL routines. This must be
+ * provided for by the enclosing data structure's semantics. Typically,
+ * avl_insert()/_add()/_remove()/avl_insert_here() require some kind of
+ * exclusive write lock. Other operations require a read lock.
+ *
+ * - The implementation uses iteration instead of explicit recursion,
+ * since it is intended to run on limited size kernel stacks. Since
+ * there is no recursion stack present to move "up" in the tree,
+ * there is an explicit "parent" link in the avl_node_t.
+ *
+ * - The left/right children pointers of a node are in an array.
+ * In the code, variables (instead of constants) are used to represent
+ * left and right indices. The implementation is written as if it only
+ * dealt with left handed manipulations. By changing the value assigned
+ * to "left", the code also works for right handed trees. The
+ * following variables/terms are frequently used:
+ *
+ * int left; // 0 when dealing with left children,
+ * // 1 for dealing with right children
+ *
+ * int left_heavy; // -1 when left subtree is taller at some node,
+ * // +1 when right subtree is taller
+ *
+ * int right; // will be the opposite of left (0 or 1)
+ * int right_heavy;// will be the opposite of left_heavy (-1 or 1)
+ *
+ * int direction; // 0 for "<" (ie. left child); 1 for ">" (right)
+ *
+ * Though it is a little more confusing to read the code, the approach
+ * allows using half as much code (and hence cache footprint) for tree
+ * manipulations and eliminates many conditional branches.
+ *
+ * - The avl_index_t is an opaque "cookie" used to find nodes at or
+ * adjacent to where a new value would be inserted in the tree. The value
+ * is a modified "avl_node_t *". The bottom bit (normally 0 for a
+ * pointer) is set to indicate whether the new node has a value greater
+ * than the value of the indicated "avl_node_t *".
+ *
+ * Note - in addition to userland (e.g. libavl and libutil) and the kernel
+ * (e.g. genunix), avl.c is compiled into ld.so and kmdb's genunix module,
+ * which each have their own compilation environments and subsequent
+ * requirements. Each of these environments must be considered when adding
+ * dependencies from avl.c.
+ *
+ * Link to Illumos.org for more information on avl function:
+ * [1] https://illumos.org/man/9f/avl
+ */
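[Editorial note, not part of the patch: the embedded-node design described in the comment above is easiest to see from the consumer side. The sketch below is illustration only; the my_entry_t type and its names are hypothetical, and it assumes the usual avl_create() declaration from sys/avl.h and an available offsetof(), in addition to the functions defined in this file.]

/*
 * A consumer embeds avl_node_t in its own structure and passes its byte
 * offset to avl_create(), so the routines in this file can translate
 * between "avl_node_t *" and the containing "void *".
 */
typedef struct my_entry {
	uint64_t	me_key;
	avl_node_t	me_avl;		/* embedded AVL linkage */
} my_entry_t;

/*
 * The comparator must return only -1, 0, or +1 (see the ASSERTs in
 * avl_find() below).
 */
static int
my_entry_compare(const void *a, const void *b)
{
	const my_entry_t *l = a;
	const my_entry_t *r = b;

	if (l->me_key < r->me_key)
		return (-1);
	if (l->me_key > r->me_key)
		return (1);
	return (0);
}

static void
my_tree_init(avl_tree_t *tree)
{
	avl_create(tree, my_entry_compare, sizeof (my_entry_t),
	    offsetof(my_entry_t, me_avl));
}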
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/debug.h>
+#include <sys/avl.h>
+#include <sys/cmn_err.h>
+#include <sys/mod.h>
+
+/*
+ * Small arrays to translate between balance (or diff) values and child indices.
+ *
+ * Code that deals with binary tree data structures will randomly use
+ * left and right children when examining a tree. C "if()" statements
+ * which evaluate randomly suffer from very poor hardware branch prediction.
+ * In this code we avoid some of the branch mispredictions by using the
+ * following translation arrays. They replace random branches with an
+ * additional memory reference. Since the translation arrays are both very
+ * small the data should remain efficiently in cache.
+ */
+static const int avl_child2balance[2] = {-1, 1};
+static const int avl_balance2child[] = {0, 0, 1};
+
+
+/*
+ * Walk from one node to the previous valued node (ie. an infix walk
+ * towards the left). At any given node we do one of 2 things:
+ *
+ * - If there is a left child, go to it, then to its rightmost descendant.
+ *
+ * - otherwise we return through parent nodes until we've come from a right
+ * child.
+ *
+ * Return Value:
+ * NULL - if at the end of the nodes
+ * otherwise next node
+ */
+void *
+avl_walk(avl_tree_t *tree, void *oldnode, int left)
+{
+ size_t off = tree->avl_offset;
+ avl_node_t *node = AVL_DATA2NODE(oldnode, off);
+ int right = 1 - left;
+ int was_child;
+
+
+ /*
+ * nowhere to walk to if tree is empty
+ */
+ if (node == NULL)
+ return (NULL);
+
+ /*
+ * Visit the previous valued node. There are two possibilities:
+ *
+ * If this node has a left child, go down one left, then all
+ * the way right.
+ */
+ if (node->avl_child[left] != NULL) {
+ for (node = node->avl_child[left];
+ node->avl_child[right] != NULL;
+ node = node->avl_child[right])
+ ;
+ /*
+ * Otherwise, return through left children as far as we can.
+ */
+ } else {
+ for (;;) {
+ was_child = AVL_XCHILD(node);
+ node = AVL_XPARENT(node);
+ if (node == NULL)
+ return (NULL);
+ if (was_child == right)
+ break;
+ }
+ }
+
+ return (AVL_NODE2DATA(node, off));
+}
+
+/*
+ * Return the lowest valued node in a tree or NULL.
+ * (leftmost child from root of tree)
+ */
+void *
+avl_first(avl_tree_t *tree)
+{
+ avl_node_t *node;
+ avl_node_t *prev = NULL;
+ size_t off = tree->avl_offset;
+
+ for (node = tree->avl_root; node != NULL; node = node->avl_child[0])
+ prev = node;
+
+ if (prev != NULL)
+ return (AVL_NODE2DATA(prev, off));
+ return (NULL);
+}
+
+/*
+ * Return the highest valued node in a tree or NULL.
+ * (rightmost child from root of tree)
+ */
+void *
+avl_last(avl_tree_t *tree)
+{
+ avl_node_t *node;
+ avl_node_t *prev = NULL;
+ size_t off = tree->avl_offset;
+
+ for (node = tree->avl_root; node != NULL; node = node->avl_child[1])
+ prev = node;
+
+ if (prev != NULL)
+ return (AVL_NODE2DATA(prev, off));
+ return (NULL);
+}
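[Editorial sketch, not part of the patch, reusing the hypothetical my_entry_t from the earlier sketch: avl_first() combined with avl_walk(..., AVL_AFTER) gives the ascending in-order traversal; the AVL_NEXT() convenience macro in sys/avl.h is conventionally a thin wrapper around exactly this avl_walk() call.]

/* Visit every element in ascending key order and count them. */
static uint64_t
my_tree_count(avl_tree_t *tree)
{
	uint64_t n = 0;
	my_entry_t *e;

	for (e = avl_first(tree); e != NULL;
	    e = avl_walk(tree, e, AVL_AFTER))
		n++;
	return (n);
}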
+
+/*
+ * Access the node immediately before or after an insertion point.
+ *
+ * "avl_index_t" is a (avl_node_t *) with the bottom bit indicating a child
+ *
+ * Return value:
+ * NULL: no node in the given direction
+ * "void *" of the found tree node
+ */
+void *
+avl_nearest(avl_tree_t *tree, avl_index_t where, int direction)
+{
+ int child = AVL_INDEX2CHILD(where);
+ avl_node_t *node = AVL_INDEX2NODE(where);
+ void *data;
+ size_t off = tree->avl_offset;
+
+ if (node == NULL) {
+ ASSERT(tree->avl_root == NULL);
+ return (NULL);
+ }
+ data = AVL_NODE2DATA(node, off);
+ if (child != direction)
+ return (data);
+
+ return (avl_walk(tree, data, direction));
+}
+
+
+/*
+ * Search for the node which contains "value". The algorithm is a
+ * simple binary tree search.
+ *
+ * return value:
+ * NULL: the value is not in the AVL tree
+ * *where (if not NULL) is set to indicate the insertion point
+ * "void *" of the found tree node
+ */
+void *
+avl_find(avl_tree_t *tree, const void *value, avl_index_t *where)
+{
+ avl_node_t *node;
+ avl_node_t *prev = NULL;
+ int child = 0;
+ int diff;
+ size_t off = tree->avl_offset;
+
+ for (node = tree->avl_root; node != NULL;
+ node = node->avl_child[child]) {
+
+ prev = node;
+
+ diff = tree->avl_compar(value, AVL_NODE2DATA(node, off));
+ ASSERT(-1 <= diff && diff <= 1);
+ if (diff == 0) {
+#ifdef ZFS_DEBUG
+ if (where != NULL)
+ *where = 0;
+#endif
+ return (AVL_NODE2DATA(node, off));
+ }
+ child = avl_balance2child[1 + diff];
+
+ }
+
+ if (where != NULL)
+ *where = AVL_MKINDEX(prev, child);
+
+ return (NULL);
+}
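[Editorial sketch, not part of the patch: the "where" cookie that avl_find() fills in on a miss is what makes the common lookup-or-insert pattern a single descent of the tree. Names reuse the hypothetical my_entry_t from the earlier sketches.]

/*
 * Return the existing entry with the same key, or insert new_ent at the
 * position remembered by avl_find() and return it.
 */
static my_entry_t *
my_tree_find_or_add(avl_tree_t *tree, my_entry_t *new_ent)
{
	avl_index_t where;
	my_entry_t *found;

	found = avl_find(tree, new_ent, &where);
	if (found != NULL)
		return (found);
	avl_insert(tree, new_ent, where);
	return (new_ent);
}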
+
+
+/*
+ * Perform a rotation to restore balance at the subtree given by depth.
+ *
+ * This routine is used by both insertion and deletion. The return value
+ * indicates:
+ * 0 : subtree did not change height
+ * !0 : subtree was reduced in height
+ *
+ * The code is written as if handling left rotations, right rotations are
+ * symmetric and handled by swapping values of variables right/left[_heavy]
+ *
+ * On input balance is the "new" balance at "node". This value is either
+ * -2 or +2.
+ */
+static int
+avl_rotation(avl_tree_t *tree, avl_node_t *node, int balance)
+{
+ int left = !(balance < 0); /* when balance = -2, left will be 0 */
+ int right = 1 - left;
+ int left_heavy = balance >> 1;
+ int right_heavy = -left_heavy;
+ avl_node_t *parent = AVL_XPARENT(node);
+ avl_node_t *child = node->avl_child[left];
+ avl_node_t *cright;
+ avl_node_t *gchild;
+ avl_node_t *gright;
+ avl_node_t *gleft;
+ int which_child = AVL_XCHILD(node);
+ int child_bal = AVL_XBALANCE(child);
+
+ /* BEGIN CSTYLED */
+ /*
+ * case 1 : node is overly left heavy, the left child is balanced or
+ * also left heavy. This requires the following rotation.
+ *
+ * (node bal:-2)
+ * / \
+ * / \
+ * (child bal:0 or -1)
+ * / \
+ * / \
+ * cright
+ *
+ * becomes:
+ *
+ * (child bal:1 or 0)
+ * / \
+ * / \
+ * (node bal:-1 or 0)
+ * / \
+ * / \
+ * cright
+ *
+ * we detect this situation by noting that child's balance is not
+ * right_heavy.
+ */
+ /* END CSTYLED */
+ if (child_bal != right_heavy) {
+
+ /*
+ * compute new balance of nodes
+ *
+ * If child used to be left heavy (now balanced) we reduced
+ * the height of this sub-tree -- used in "return...;" below
+ */
+ child_bal += right_heavy; /* adjust towards right */
+
+ /*
+ * move "cright" to be node's left child
+ */
+ cright = child->avl_child[right];
+ node->avl_child[left] = cright;
+ if (cright != NULL) {
+ AVL_SETPARENT(cright, node);
+ AVL_SETCHILD(cright, left);
+ }
+
+ /*
+ * move node to be child's right child
+ */
+ child->avl_child[right] = node;
+ AVL_SETBALANCE(node, -child_bal);
+ AVL_SETCHILD(node, right);
+ AVL_SETPARENT(node, child);
+
+ /*
+ * update the pointer into this subtree
+ */
+ AVL_SETBALANCE(child, child_bal);
+ AVL_SETCHILD(child, which_child);
+ AVL_SETPARENT(child, parent);
+ if (parent != NULL)
+ parent->avl_child[which_child] = child;
+ else
+ tree->avl_root = child;
+
+ return (child_bal == 0);
+ }
+
+ /* BEGIN CSTYLED */
+ /*
+ * case 2 : When node is left heavy, but child is right heavy we use
+ * a different rotation.
+ *
+ * (node b:-2)
+ * / \
+ * / \
+ * / \
+ * (child b:+1)
+ * / \
+ * / \
+ * (gchild b: != 0)
+ * / \
+ * / \
+ * gleft gright
+ *
+ * becomes:
+ *
+ * (gchild b:0)
+ * / \
+ * / \
+ * / \
+ * (child b:?) (node b:?)
+ * / \ / \
+ * / \ / \
+ * gleft gright
+ *
+ * computing the new balances is more complicated. As an example:
+ * if gchild was right_heavy, then child is now left heavy
+ * else it is balanced
+ */
+ /* END CSTYLED */
+ gchild = child->avl_child[right];
+ gleft = gchild->avl_child[left];
+ gright = gchild->avl_child[right];
+
+ /*
+ * move gright to left child of node and
+ *
+ * move gleft to right child of node
+ */
+ node->avl_child[left] = gright;
+ if (gright != NULL) {
+ AVL_SETPARENT(gright, node);
+ AVL_SETCHILD(gright, left);
+ }
+
+ child->avl_child[right] = gleft;
+ if (gleft != NULL) {
+ AVL_SETPARENT(gleft, child);
+ AVL_SETCHILD(gleft, right);
+ }
+
+ /*
+ * move child to left child of gchild and
+ *
+ * move node to right child of gchild and
+ *
+ * fixup parent of all this to point to gchild
+ */
+ balance = AVL_XBALANCE(gchild);
+ gchild->avl_child[left] = child;
+ AVL_SETBALANCE(child, (balance == right_heavy ? left_heavy : 0));
+ AVL_SETPARENT(child, gchild);
+ AVL_SETCHILD(child, left);
+
+ gchild->avl_child[right] = node;
+ AVL_SETBALANCE(node, (balance == left_heavy ? right_heavy : 0));
+ AVL_SETPARENT(node, gchild);
+ AVL_SETCHILD(node, right);
+
+ AVL_SETBALANCE(gchild, 0);
+ AVL_SETPARENT(gchild, parent);
+ AVL_SETCHILD(gchild, which_child);
+ if (parent != NULL)
+ parent->avl_child[which_child] = gchild;
+ else
+ tree->avl_root = gchild;
+
+ return (1); /* the new tree is always shorter */
+}
+
+
+/*
+ * Insert a new node into an AVL tree at the specified (from avl_find()) place.
+ *
+ * Newly inserted nodes are always leaf nodes in the tree, since avl_find()
+ * searches out to the leaf positions. The avl_index_t indicates the node
+ * which will be the parent of the new node.
+ *
+ * After the node is inserted, a single rotation further up the tree may
+ * be necessary to maintain an acceptable AVL balance.
+ */
+void
+avl_insert(avl_tree_t *tree, void *new_data, avl_index_t where)
+{
+ avl_node_t *node;
+ avl_node_t *parent = AVL_INDEX2NODE(where);
+ int old_balance;
+ int new_balance;
+ int which_child = AVL_INDEX2CHILD(where);
+ size_t off = tree->avl_offset;
+
+#ifdef _LP64
+ ASSERT(((uintptr_t)new_data & 0x7) == 0);
+#endif
+
+ node = AVL_DATA2NODE(new_data, off);
+
+ /*
+ * First, add the node to the tree at the indicated position.
+ */
+ ++tree->avl_numnodes;
+
+ node->avl_child[0] = NULL;
+ node->avl_child[1] = NULL;
+
+ AVL_SETCHILD(node, which_child);
+ AVL_SETBALANCE(node, 0);
+ AVL_SETPARENT(node, parent);
+ if (parent != NULL) {
+ ASSERT(parent->avl_child[which_child] == NULL);
+ parent->avl_child[which_child] = node;
+ } else {
+ ASSERT(tree->avl_root == NULL);
+ tree->avl_root = node;
+ }
+ /*
+ * Now, back up the tree modifying the balance of all nodes above the
+ * insertion point. If we get to a highly unbalanced ancestor, we
+ * need to do a rotation. If we back out of the tree we are done.
+ * If we brought any subtree into perfect balance (0), we are also done.
+ */
+ for (;;) {
+ node = parent;
+ if (node == NULL)
+ return;
+
+ /*
+ * Compute the new balance
+ */
+ old_balance = AVL_XBALANCE(node);
+ new_balance = old_balance + avl_child2balance[which_child];
+
+ /*
+ * If we introduced equal balance, then we are done immediately
+ */
+ if (new_balance == 0) {
+ AVL_SETBALANCE(node, 0);
+ return;
+ }
+
+ /*
+ * If both old and new are not zero we went
+ * from -1 to -2 balance, do a rotation.
+ */
+ if (old_balance != 0)
+ break;
+
+ AVL_SETBALANCE(node, new_balance);
+ parent = AVL_XPARENT(node);
+ which_child = AVL_XCHILD(node);
+ }
+
+ /*
+ * perform a rotation to fix the tree and return
+ */
+ (void) avl_rotation(tree, node, new_balance);
+}
+
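+/*
+ * Illustrative sketch (editorial example, not part of the original code):
+ * the avl_find()/avl_insert() pairing described above. "my_data_t" and
+ * "tree" are hypothetical names used only for this example.
+ *
+ *	my_data_t *new = ...;
+ *	avl_index_t where;
+ *
+ *	if (avl_find(tree, new, &where) == NULL)
+ *		avl_insert(tree, new, where);
+ *
+ * avl_find() fills in "where" with the leaf position at which an equal
+ * element would belong, which is exactly what avl_insert() expects.
+ */
+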
+/*
+ * Insert "new_data" in "tree" in the given "direction" either after or
+ * before (AVL_AFTER, AVL_BEFORE) the data "here".
+ *
+ * Insertions can only be done at empty leaf points in the tree, therefore
+ * if the given child of the node is already present we move to either
+ * the AVL_PREV or AVL_NEXT and reverse the insertion direction. Since
+ * every other node in the tree is a leaf, this always works.
+ *
+ * To help developers using this interface, we assert that the new node
+ * is correctly ordered at every step of the way in DEBUG kernels.
+ */
+void
+avl_insert_here(
+ avl_tree_t *tree,
+ void *new_data,
+ void *here,
+ int direction)
+{
+ avl_node_t *node;
+ int child = direction; /* rely on AVL_BEFORE == 0, AVL_AFTER == 1 */
+#ifdef ZFS_DEBUG
+ int diff;
+#endif
+
+ ASSERT(tree != NULL);
+ ASSERT(new_data != NULL);
+ ASSERT(here != NULL);
+ ASSERT(direction == AVL_BEFORE || direction == AVL_AFTER);
+
+ /*
+ * If corresponding child of node is not NULL, go to the neighboring
+ * node and reverse the insertion direction.
+ */
+ node = AVL_DATA2NODE(here, tree->avl_offset);
+
+#ifdef ZFS_DEBUG
+ diff = tree->avl_compar(new_data, here);
+ ASSERT(-1 <= diff && diff <= 1);
+ ASSERT(diff != 0);
+ ASSERT(diff > 0 ? child == 1 : child == 0);
+#endif
+
+ if (node->avl_child[child] != NULL) {
+ node = node->avl_child[child];
+ child = 1 - child;
+ while (node->avl_child[child] != NULL) {
+#ifdef ZFS_DEBUG
+ diff = tree->avl_compar(new_data,
+ AVL_NODE2DATA(node, tree->avl_offset));
+ ASSERT(-1 <= diff && diff <= 1);
+ ASSERT(diff != 0);
+ ASSERT(diff > 0 ? child == 1 : child == 0);
+#endif
+ node = node->avl_child[child];
+ }
+#ifdef ZFS_DEBUG
+ diff = tree->avl_compar(new_data,
+ AVL_NODE2DATA(node, tree->avl_offset));
+ ASSERT(-1 <= diff && diff <= 1);
+ ASSERT(diff != 0);
+ ASSERT(diff > 0 ? child == 1 : child == 0);
+#endif
+ }
+ ASSERT(node->avl_child[child] == NULL);
+
+ avl_insert(tree, new_data, AVL_MKINDEX(node, child));
+}
+
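+/*
+ * Illustrative sketch (editorial example): inserting next to an element
+ * whose position is already known, avoiding a second avl_find() lookup.
+ * "prev" and "new" are hypothetical elements already known to sort
+ * adjacently, with "new" immediately after "prev":
+ *
+ *	avl_insert_here(tree, new, prev, AVL_AFTER);
+ */
+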
+/*
+ * Add a new node to an AVL tree. Strictly enforce that no duplicates can
+ * be added to the tree, using a VERIFY() that remains enabled even in
+ * non-DEBUG builds.
+ */
+void
+avl_add(avl_tree_t *tree, void *new_node)
+{
+ avl_index_t where = 0;
+
+ VERIFY(avl_find(tree, new_node, &where) == NULL);
+
+ avl_insert(tree, new_node, where);
+}
+
+/*
+ * Delete a node from the AVL tree. Deletion is similar to insertion, but
+ * with 2 complications.
+ *
+ * First, we may be deleting an interior node. Consider the following subtree:
+ *
+ * d c c
+ * / \ / \ / \
+ * b e b e b e
+ * / \ / \ /
+ * a c a a
+ *
+ * When we are deleting node (d), we find and bring up an adjacent valued leaf
+ * node, say (c), to take the interior node's place. In the code this is
+ * handled by temporarily swapping (d) and (c) in the tree and then using
+ * common code to delete (d) from the leaf position.
+ *
+ * Secondly, an interior deletion from a deep tree may require more than one
+ * rotation to fix the balance. This is handled by moving up the tree through
+ * parents and applying rotations as needed. The return value from
+ * avl_rotation() is used to detect when a subtree did not change overall
+ * height due to a rotation.
+ */
+void
+avl_remove(avl_tree_t *tree, void *data)
+{
+ avl_node_t *delete;
+ avl_node_t *parent;
+ avl_node_t *node;
+ avl_node_t tmp;
+ int old_balance;
+ int new_balance;
+ int left;
+ int right;
+ int which_child;
+ size_t off = tree->avl_offset;
+
+ delete = AVL_DATA2NODE(data, off);
+
+ /*
+ * Deletion is easiest with a node that has at most 1 child.
+ * We swap a node with 2 children with a sequentially valued
+ * neighbor node. That node will have at most 1 child. Note this
+ * has no effect on the ordering of the remaining nodes.
+ *
+ * As an optimization, we choose the greater neighbor if the tree
+ * is right heavy, otherwise the left neighbor. This reduces the
+ * number of rotations needed.
+ */
+ if (delete->avl_child[0] != NULL && delete->avl_child[1] != NULL) {
+
+ /*
+ * choose node to swap from whichever side is taller
+ */
+ old_balance = AVL_XBALANCE(delete);
+ left = avl_balance2child[old_balance + 1];
+ right = 1 - left;
+
+ /*
+ * get to the previous value'd node
+ * (down 1 left, as far as possible right)
+ */
+ for (node = delete->avl_child[left];
+ node->avl_child[right] != NULL;
+ node = node->avl_child[right])
+ ;
+
+ /*
+ * create a temp placeholder for 'node'
+ * move 'node' to delete's spot in the tree
+ */
+ tmp = *node;
+
+ *node = *delete;
+ if (node->avl_child[left] == node)
+ node->avl_child[left] = &tmp;
+
+ parent = AVL_XPARENT(node);
+ if (parent != NULL)
+ parent->avl_child[AVL_XCHILD(node)] = node;
+ else
+ tree->avl_root = node;
+ AVL_SETPARENT(node->avl_child[left], node);
+ AVL_SETPARENT(node->avl_child[right], node);
+
+ /*
+ * Put tmp where node used to be (just temporary).
+ * It always has a parent and at most 1 child.
+ */
+ delete = &tmp;
+ parent = AVL_XPARENT(delete);
+ parent->avl_child[AVL_XCHILD(delete)] = delete;
+ which_child = (delete->avl_child[1] != 0);
+ if (delete->avl_child[which_child] != NULL)
+ AVL_SETPARENT(delete->avl_child[which_child], delete);
+ }
+
+
+ /*
+ * Here we know "delete" is at least partially a leaf node. It can
+ * be easily removed from the tree.
+ */
+ ASSERT(tree->avl_numnodes > 0);
+ --tree->avl_numnodes;
+ parent = AVL_XPARENT(delete);
+ which_child = AVL_XCHILD(delete);
+ if (delete->avl_child[0] != NULL)
+ node = delete->avl_child[0];
+ else
+ node = delete->avl_child[1];
+
+ /*
+ * Connect parent directly to node (leaving out delete).
+ */
+ if (node != NULL) {
+ AVL_SETPARENT(node, parent);
+ AVL_SETCHILD(node, which_child);
+ }
+ if (parent == NULL) {
+ tree->avl_root = node;
+ return;
+ }
+ parent->avl_child[which_child] = node;
+
+
+ /*
+ * Since the subtree is now shorter, begin adjusting parent balances
+ * and performing any needed rotations.
+ */
+ do {
+
+ /*
+ * Move up the tree and adjust the balance
+ *
+ * Capture the parent and which_child values for the next
+ * iteration before any rotations occur.
+ */
+ node = parent;
+ old_balance = AVL_XBALANCE(node);
+ new_balance = old_balance - avl_child2balance[which_child];
+ parent = AVL_XPARENT(node);
+ which_child = AVL_XCHILD(node);
+
+ /*
+ * If a node was in perfect balance but isn't anymore then
+ * we can stop, since the height didn't change above this point
+ * due to a deletion.
+ */
+ if (old_balance == 0) {
+ AVL_SETBALANCE(node, new_balance);
+ break;
+ }
+
+ /*
+ * If the new balance is zero, we don't need to rotate.
+ * Otherwise we need a rotation to fix the balance; if the
+ * rotation doesn't change the height of the sub-tree, we
+ * have finished adjusting.
+ */
+ if (new_balance == 0)
+ AVL_SETBALANCE(node, new_balance);
+ else if (!avl_rotation(tree, node, new_balance))
+ break;
+ } while (parent != NULL);
+}
+
+#define AVL_REINSERT(tree, obj) \
+ avl_remove((tree), (obj)); \
+ avl_add((tree), (obj))
+
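+/*
+ * avl_update_lt(), avl_update_gt() and avl_update() are helpers for
+ * elements whose sort key may have changed in place. Each compares the
+ * element with its in-order neighbor(s); if it is now out of order it is
+ * removed and re-added (AVL_REINSERT) and B_TRUE is returned, otherwise
+ * B_FALSE. avl_update_lt() assumes the key can only have become smaller,
+ * avl_update_gt() only larger, and avl_update() handles both directions.
+ */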
+boolean_t
+avl_update_lt(avl_tree_t *t, void *obj)
+{
+ void *neighbor;
+
+ ASSERT(((neighbor = AVL_NEXT(t, obj)) == NULL) ||
+ (t->avl_compar(obj, neighbor) <= 0));
+
+ neighbor = AVL_PREV(t, obj);
+ if ((neighbor != NULL) && (t->avl_compar(obj, neighbor) < 0)) {
+ AVL_REINSERT(t, obj);
+ return (B_TRUE);
+ }
+
+ return (B_FALSE);
+}
+
+boolean_t
+avl_update_gt(avl_tree_t *t, void *obj)
+{
+ void *neighbor;
+
+ ASSERT(((neighbor = AVL_PREV(t, obj)) == NULL) ||
+ (t->avl_compar(obj, neighbor) >= 0));
+
+ neighbor = AVL_NEXT(t, obj);
+ if ((neighbor != NULL) && (t->avl_compar(obj, neighbor) > 0)) {
+ AVL_REINSERT(t, obj);
+ return (B_TRUE);
+ }
+
+ return (B_FALSE);
+}
+
+boolean_t
+avl_update(avl_tree_t *t, void *obj)
+{
+ void *neighbor;
+
+ neighbor = AVL_PREV(t, obj);
+ if ((neighbor != NULL) && (t->avl_compar(obj, neighbor) < 0)) {
+ AVL_REINSERT(t, obj);
+ return (B_TRUE);
+ }
+
+ neighbor = AVL_NEXT(t, obj);
+ if ((neighbor != NULL) && (t->avl_compar(obj, neighbor) > 0)) {
+ AVL_REINSERT(t, obj);
+ return (B_TRUE);
+ }
+
+ return (B_FALSE);
+}
+
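+/*
+ * Swap the contents (root pointer and node count) of two trees. The
+ * trees must be compatible: same comparator, node offset, and size.
+ */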
+void
+avl_swap(avl_tree_t *tree1, avl_tree_t *tree2)
+{
+ avl_node_t *temp_node;
+ ulong_t temp_numnodes;
+
+ ASSERT3P(tree1->avl_compar, ==, tree2->avl_compar);
+ ASSERT3U(tree1->avl_offset, ==, tree2->avl_offset);
+ ASSERT3U(tree1->avl_size, ==, tree2->avl_size);
+
+ temp_node = tree1->avl_root;
+ temp_numnodes = tree1->avl_numnodes;
+ tree1->avl_root = tree2->avl_root;
+ tree1->avl_numnodes = tree2->avl_numnodes;
+ tree2->avl_root = temp_node;
+ tree2->avl_numnodes = temp_numnodes;
+}
+
+/*
+ * initialize a new AVL tree
+ */
+void
+avl_create(avl_tree_t *tree, int (*compar) (const void *, const void *),
+ size_t size, size_t offset)
+{
+ ASSERT(tree);
+ ASSERT(compar);
+ ASSERT(size > 0);
+ ASSERT(size >= offset + sizeof (avl_node_t));
+#ifdef _LP64
+ ASSERT((offset & 0x7) == 0);
+#endif
+
+ tree->avl_compar = compar;
+ tree->avl_root = NULL;
+ tree->avl_numnodes = 0;
+ tree->avl_size = size;
+ tree->avl_offset = offset;
+}
+
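+/*
+ * Illustrative sketch (editorial example): creating a tree of hypothetical
+ * "my_data_t" elements, each embedding an avl_node_t, keyed by "md_key".
+ * The offset of the embedded node is what the "offset" argument records.
+ *
+ *	typedef struct my_data {
+ *		uint64_t	md_key;
+ *		avl_node_t	md_node;
+ *	} my_data_t;
+ *
+ *	static int
+ *	my_compar(const void *a, const void *b)
+ *	{
+ *		const my_data_t *l = a, *r = b;
+ *		if (l->md_key < r->md_key)
+ *			return (-1);
+ *		return (l->md_key > r->md_key);
+ *	}
+ *
+ *	avl_tree_t tree;
+ *	avl_create(&tree, my_compar, sizeof (my_data_t),
+ *	    offsetof(my_data_t, md_node));
+ */
+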
+/*
+ * Delete a tree.
+ */
+/* ARGSUSED */
+void
+avl_destroy(avl_tree_t *tree)
+{
+ ASSERT(tree);
+ ASSERT(tree->avl_numnodes == 0);
+ ASSERT(tree->avl_root == NULL);
+}
+
+
+/*
+ * Return the number of nodes in an AVL tree.
+ */
+ulong_t
+avl_numnodes(avl_tree_t *tree)
+{
+ ASSERT(tree);
+ return (tree->avl_numnodes);
+}
+
+boolean_t
+avl_is_empty(avl_tree_t *tree)
+{
+ ASSERT(tree);
+ return (tree->avl_numnodes == 0);
+}
+
+#define CHILDBIT (1L)
+
+/*
+ * Post-order tree walk used to visit all tree nodes and destroy the tree
+ * in post order. This is used for removing all the nodes from a tree without
+ * paying any cost for rebalancing it.
+ *
+ * example:
+ *
+ * void *cookie = NULL;
+ * my_data_t *node;
+ *
+ * while ((node = avl_destroy_nodes(tree, &cookie)) != NULL)
+ * free(node);
+ * avl_destroy(tree);
+ *
+ * The cookie is really a pointer to the avl_node_t of the current node's
+ * parent, with its low bit recording which child was visited last.
+ *
+ * On input, a cookie value of CHILDBIT indicates the tree is done.
+ */
+void *
+avl_destroy_nodes(avl_tree_t *tree, void **cookie)
+{
+ avl_node_t *node;
+ avl_node_t *parent;
+ int child;
+ void *first;
+ size_t off = tree->avl_offset;
+
+ /*
+ * Initial calls go to the first node or its right descendant.
+ */
+ if (*cookie == NULL) {
+ first = avl_first(tree);
+
+ /*
+ * deal with an empty tree
+ */
+ if (first == NULL) {
+ *cookie = (void *)CHILDBIT;
+ return (NULL);
+ }
+
+ node = AVL_DATA2NODE(first, off);
+ parent = AVL_XPARENT(node);
+ goto check_right_side;
+ }
+
+ /*
+ * If there is no parent to return to we are done.
+ */
+ parent = (avl_node_t *)((uintptr_t)(*cookie) & ~CHILDBIT);
+ if (parent == NULL) {
+ if (tree->avl_root != NULL) {
+ ASSERT(tree->avl_numnodes == 1);
+ tree->avl_root = NULL;
+ tree->avl_numnodes = 0;
+ }
+ return (NULL);
+ }
+
+ /*
+ * Remove the child pointer we just visited from the parent and tree.
+ */
+ child = (uintptr_t)(*cookie) & CHILDBIT;
+ parent->avl_child[child] = NULL;
+ ASSERT(tree->avl_numnodes > 1);
+ --tree->avl_numnodes;
+
+ /*
+ * If we just did a right child or there isn't one, go up to parent.
+ */
+ if (child == 1 || parent->avl_child[1] == NULL) {
+ node = parent;
+ parent = AVL_XPARENT(parent);
+ goto done;
+ }
+
+ /*
+ * Do the parent's right child, then its leftmost descendant.
+ */
+ node = parent->avl_child[1];
+ while (node->avl_child[0] != NULL) {
+ parent = node;
+ node = node->avl_child[0];
+ }
+
+ /*
+ * If here, we moved to a left child. It may have one
+ * child on the right (when balance == +1).
+ */
+check_right_side:
+ if (node->avl_child[1] != NULL) {
+ ASSERT(AVL_XBALANCE(node) == 1);
+ parent = node;
+ node = node->avl_child[1];
+ ASSERT(node->avl_child[0] == NULL &&
+ node->avl_child[1] == NULL);
+ } else {
+ ASSERT(AVL_XBALANCE(node) <= 0);
+ }
+
+done:
+ if (parent == NULL) {
+ *cookie = (void *)CHILDBIT;
+ ASSERT(node == tree->avl_root);
+ } else {
+ *cookie = (void *)((uintptr_t)parent | AVL_XCHILD(node));
+ }
+
+ return (AVL_NODE2DATA(node, off));
+}
+
+#if defined(_KERNEL)
+
+static int __init
+avl_init(void)
+{
+ return (0);
+}
+
+static void __exit
+avl_fini(void)
+{
+}
+
+module_init(avl_init);
+module_exit(avl_fini);
+#endif
+
+ZFS_MODULE_DESCRIPTION("Generic AVL tree implementation");
+ZFS_MODULE_AUTHOR(ZFS_META_AUTHOR);
+ZFS_MODULE_LICENSE(ZFS_META_LICENSE);
+ZFS_MODULE_VERSION(ZFS_META_VERSION "-" ZFS_META_RELEASE);
+
+EXPORT_SYMBOL(avl_create);
+EXPORT_SYMBOL(avl_find);
+EXPORT_SYMBOL(avl_insert);
+EXPORT_SYMBOL(avl_insert_here);
+EXPORT_SYMBOL(avl_walk);
+EXPORT_SYMBOL(avl_first);
+EXPORT_SYMBOL(avl_last);
+EXPORT_SYMBOL(avl_nearest);
+EXPORT_SYMBOL(avl_add);
+EXPORT_SYMBOL(avl_swap);
+EXPORT_SYMBOL(avl_is_empty);
+EXPORT_SYMBOL(avl_remove);
+EXPORT_SYMBOL(avl_numnodes);
+EXPORT_SYMBOL(avl_destroy_nodes);
+EXPORT_SYMBOL(avl_destroy);
+EXPORT_SYMBOL(avl_update_lt);
+EXPORT_SYMBOL(avl_update_gt);
+EXPORT_SYMBOL(avl_update);
diff --git a/sys/contrib/openzfs/module/icp/Makefile.in b/sys/contrib/openzfs/module/icp/Makefile.in
new file mode 100644
index 000000000000..7a01b2f08b8e
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/Makefile.in
@@ -0,0 +1,96 @@
+ifneq ($(KBUILD_EXTMOD),)
+src = @abs_srcdir@
+obj = @abs_builddir@
+icp_include = $(src)/include
+else
+icp_include = $(srctree)/$(src)/include
+endif
+
+MODULE := icp
+
+obj-$(CONFIG_ZFS) := $(MODULE).o
+
+asflags-y := -I$(icp_include)
+ccflags-y := -I$(icp_include)
+
+$(MODULE)-objs += illumos-crypto.o
+$(MODULE)-objs += api/kcf_cipher.o
+$(MODULE)-objs += api/kcf_digest.o
+$(MODULE)-objs += api/kcf_mac.o
+$(MODULE)-objs += api/kcf_miscapi.o
+$(MODULE)-objs += api/kcf_ctxops.o
+$(MODULE)-objs += core/kcf_callprov.o
+$(MODULE)-objs += core/kcf_prov_tabs.o
+$(MODULE)-objs += core/kcf_sched.o
+$(MODULE)-objs += core/kcf_mech_tabs.o
+$(MODULE)-objs += core/kcf_prov_lib.o
+$(MODULE)-objs += spi/kcf_spi.o
+$(MODULE)-objs += io/aes.o
+$(MODULE)-objs += io/edonr_mod.o
+$(MODULE)-objs += io/sha1_mod.o
+$(MODULE)-objs += io/sha2_mod.o
+$(MODULE)-objs += io/skein_mod.o
+$(MODULE)-objs += os/modhash.o
+$(MODULE)-objs += os/modconf.o
+$(MODULE)-objs += algs/modes/cbc.o
+$(MODULE)-objs += algs/modes/ccm.o
+$(MODULE)-objs += algs/modes/ctr.o
+$(MODULE)-objs += algs/modes/ecb.o
+$(MODULE)-objs += algs/modes/gcm_generic.o
+$(MODULE)-objs += algs/modes/gcm.o
+$(MODULE)-objs += algs/modes/modes.o
+$(MODULE)-objs += algs/aes/aes_impl_generic.o
+$(MODULE)-objs += algs/aes/aes_impl.o
+$(MODULE)-objs += algs/aes/aes_modes.o
+$(MODULE)-objs += algs/edonr/edonr.o
+$(MODULE)-objs += algs/sha1/sha1.o
+$(MODULE)-objs += algs/sha2/sha2.o
+$(MODULE)-objs += algs/skein/skein.o
+$(MODULE)-objs += algs/skein/skein_block.o
+$(MODULE)-objs += algs/skein/skein_iv.o
+
+$(MODULE)-$(CONFIG_X86_64) += asm-x86_64/aes/aeskey.o
+$(MODULE)-$(CONFIG_X86_64) += asm-x86_64/aes/aes_amd64.o
+$(MODULE)-$(CONFIG_X86_64) += asm-x86_64/aes/aes_aesni.o
+$(MODULE)-$(CONFIG_X86_64) += asm-x86_64/modes/gcm_pclmulqdq.o
+$(MODULE)-$(CONFIG_X86_64) += asm-x86_64/modes/aesni-gcm-x86_64.o
+$(MODULE)-$(CONFIG_X86_64) += asm-x86_64/modes/ghash-x86_64.o
+$(MODULE)-$(CONFIG_X86_64) += asm-x86_64/sha1/sha1-x86_64.o
+$(MODULE)-$(CONFIG_X86_64) += asm-x86_64/sha2/sha256_impl.o
+$(MODULE)-$(CONFIG_X86_64) += asm-x86_64/sha2/sha512_impl.o
+
+$(MODULE)-$(CONFIG_X86) += algs/modes/gcm_pclmulqdq.o
+$(MODULE)-$(CONFIG_X86) += algs/aes/aes_impl_aesni.o
+$(MODULE)-$(CONFIG_X86) += algs/aes/aes_impl_x86-64.o
+
+# Suppress objtool "can't find jump dest instruction at" warnings. They
+# are caused by the constants which are defined in the text section of the
+# assembly file using .byte instructions (e.g. bswap_mask). The objtool
+# utility tries to interpret them as opcodes and obviously fails doing so.
+OBJECT_FILES_NON_STANDARD_aesni-gcm-x86_64.o := y
+OBJECT_FILES_NON_STANDARD_ghash-x86_64.o := y
+
+ICP_DIRS = \
+ api \
+ core \
+ spi \
+ io \
+ os \
+ algs \
+ algs/aes \
+ algs/edonr \
+ algs/modes \
+ algs/sha1 \
+ algs/sha2 \
+ algs/skein \
+ asm-x86_64 \
+ asm-x86_64/aes \
+ asm-x86_64/modes \
+ asm-x86_64/sha1 \
+ asm-x86_64/sha2 \
+ asm-i386 \
+ asm-generic
+
+all:
+ mkdir -p $(ICP_DIRS)
diff --git a/sys/contrib/openzfs/module/icp/algs/aes/aes_impl.c b/sys/contrib/openzfs/module/icp/algs/aes/aes_impl.c
new file mode 100644
index 000000000000..037be0db60d7
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/algs/aes/aes_impl.c
@@ -0,0 +1,443 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/crypto/icp.h>
+#include <sys/crypto/spi.h>
+#include <sys/simd.h>
+#include <modes/modes.h>
+#include <aes/aes_impl.h>
+
+/*
+ * Initialize AES encryption and decryption key schedules.
+ *
+ * Parameters:
+ * cipherKey User key
+ * keyBits AES key size (128, 192, or 256 bits)
+ * keysched AES key schedule to be initialized, of type aes_key_t.
+ * Allocated by aes_alloc_keysched().
+ */
+void
+aes_init_keysched(const uint8_t *cipherKey, uint_t keyBits, void *keysched)
+{
+ const aes_impl_ops_t *ops = aes_impl_get_ops();
+ aes_key_t *newbie = keysched;
+ uint_t keysize, i, j;
+ union {
+ uint64_t ka64[4];
+ uint32_t ka32[8];
+ } keyarr;
+
+ switch (keyBits) {
+ case 128:
+ newbie->nr = 10;
+ break;
+
+ case 192:
+ newbie->nr = 12;
+ break;
+
+ case 256:
+ newbie->nr = 14;
+ break;
+
+ default:
+ /* should never get here */
+ return;
+ }
+ keysize = CRYPTO_BITS2BYTES(keyBits);
+
+ /*
+ * The generic C implementation requires a byteswap on little endian
+ * machines; the accelerated implementations for various architectures
+ * may not.
+ */
+ if (!ops->needs_byteswap) {
+ /* no byteswap needed */
+ if (IS_P2ALIGNED(cipherKey, sizeof (uint64_t))) {
+ for (i = 0, j = 0; j < keysize; i++, j += 8) {
+ /* LINTED: pointer alignment */
+ keyarr.ka64[i] = *((uint64_t *)&cipherKey[j]);
+ }
+ } else {
+ bcopy(cipherKey, keyarr.ka32, keysize);
+ }
+ } else {
+ /* byte swap */
+ for (i = 0, j = 0; j < keysize; i++, j += 4) {
+ keyarr.ka32[i] =
+ htonl(*(uint32_t *)(void *)&cipherKey[j]);
+ }
+ }
+
+ ops->generate(newbie, keyarr.ka32, keyBits);
+ newbie->ops = ops;
+
+ /*
+ * Note: if there are systems that need the AES_64BIT_KS type in the
+ * future, move setting key schedule type to individual implementations
+ */
+ newbie->type = AES_32BIT_KS;
+}
+
+
+/*
+ * Encrypt one block using AES.
+ * Align if needed and (for x86 32-bit only) byte-swap.
+ *
+ * Parameters:
+ * ks Key schedule, of type aes_key_t
+ * pt Input block (plain text)
+ * ct Output block (crypto text). Can overlap with pt
+ */
+int
+aes_encrypt_block(const void *ks, const uint8_t *pt, uint8_t *ct)
+{
+ aes_key_t *ksch = (aes_key_t *)ks;
+ const aes_impl_ops_t *ops = ksch->ops;
+
+ if (IS_P2ALIGNED2(pt, ct, sizeof (uint32_t)) && !ops->needs_byteswap) {
+ /* LINTED: pointer alignment */
+ ops->encrypt(&ksch->encr_ks.ks32[0], ksch->nr,
+ /* LINTED: pointer alignment */
+ (uint32_t *)pt, (uint32_t *)ct);
+ } else {
+ uint32_t buffer[AES_BLOCK_LEN / sizeof (uint32_t)];
+
+ /* Copy input block into buffer */
+ if (ops->needs_byteswap) {
+ buffer[0] = htonl(*(uint32_t *)(void *)&pt[0]);
+ buffer[1] = htonl(*(uint32_t *)(void *)&pt[4]);
+ buffer[2] = htonl(*(uint32_t *)(void *)&pt[8]);
+ buffer[3] = htonl(*(uint32_t *)(void *)&pt[12]);
+ } else
+ bcopy(pt, &buffer, AES_BLOCK_LEN);
+
+ ops->encrypt(&ksch->encr_ks.ks32[0], ksch->nr, buffer, buffer);
+
+ /* Copy result from buffer to output block */
+ if (ops->needs_byteswap) {
+ *(uint32_t *)(void *)&ct[0] = htonl(buffer[0]);
+ *(uint32_t *)(void *)&ct[4] = htonl(buffer[1]);
+ *(uint32_t *)(void *)&ct[8] = htonl(buffer[2]);
+ *(uint32_t *)(void *)&ct[12] = htonl(buffer[3]);
+ } else
+ bcopy(&buffer, ct, AES_BLOCK_LEN);
+ }
+ return (CRYPTO_SUCCESS);
+}
+
+
+/*
+ * Decrypt one block using AES.
+ * Align and byte-swap if needed.
+ *
+ * Parameters:
+ * ks Key schedule, of type aes_key_t
+ * ct Input block (crypto text)
+ * pt Output block (plain text). Can overlap with ct
+ */
+int
+aes_decrypt_block(const void *ks, const uint8_t *ct, uint8_t *pt)
+{
+ aes_key_t *ksch = (aes_key_t *)ks;
+ const aes_impl_ops_t *ops = ksch->ops;
+
+ if (IS_P2ALIGNED2(ct, pt, sizeof (uint32_t)) && !ops->needs_byteswap) {
+ /* LINTED: pointer alignment */
+ ops->decrypt(&ksch->decr_ks.ks32[0], ksch->nr,
+ /* LINTED: pointer alignment */
+ (uint32_t *)ct, (uint32_t *)pt);
+ } else {
+ uint32_t buffer[AES_BLOCK_LEN / sizeof (uint32_t)];
+
+ /* Copy input block into buffer */
+ if (ops->needs_byteswap) {
+ buffer[0] = htonl(*(uint32_t *)(void *)&ct[0]);
+ buffer[1] = htonl(*(uint32_t *)(void *)&ct[4]);
+ buffer[2] = htonl(*(uint32_t *)(void *)&ct[8]);
+ buffer[3] = htonl(*(uint32_t *)(void *)&ct[12]);
+ } else
+ bcopy(ct, &buffer, AES_BLOCK_LEN);
+
+ ops->decrypt(&ksch->decr_ks.ks32[0], ksch->nr, buffer, buffer);
+
+ /* Copy result from buffer to output block */
+ if (ops->needs_byteswap) {
+ *(uint32_t *)(void *)&pt[0] = htonl(buffer[0]);
+ *(uint32_t *)(void *)&pt[4] = htonl(buffer[1]);
+ *(uint32_t *)(void *)&pt[8] = htonl(buffer[2]);
+ *(uint32_t *)(void *)&pt[12] = htonl(buffer[3]);
+ } else
+ bcopy(&buffer, pt, AES_BLOCK_LEN);
+ }
+ return (CRYPTO_SUCCESS);
+}
+
+
+/*
+ * Allocate key schedule for AES.
+ *
+ * Return the pointer and set size to the number of bytes allocated.
+ * Memory allocated must be freed by the caller when done.
+ *
+ * Parameters:
+ * size Size of key schedule allocated, in bytes
+ * kmflag Flag passed to kmem_alloc(9F); ignored in userland.
+ */
+/* ARGSUSED */
+void *
+aes_alloc_keysched(size_t *size, int kmflag)
+{
+ aes_key_t *keysched;
+
+ keysched = (aes_key_t *)kmem_alloc(sizeof (aes_key_t), kmflag);
+ if (keysched != NULL) {
+ *size = sizeof (aes_key_t);
+ return (keysched);
+ }
+ return (NULL);
+}
+
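+/*
+ * Illustrative sketch (editorial example) of the single-block interface
+ * above; error handling and key zeroing are omitted. KM_SLEEP is the
+ * usual kernel allocation flag and is ignored in userland builds.
+ *
+ *	size_t ks_size;
+ *	void *ks = aes_alloc_keysched(&ks_size, KM_SLEEP);
+ *	uint8_t key[16] = { 0 };
+ *	uint8_t pt[AES_BLOCK_LEN] = { 0 }, ct[AES_BLOCK_LEN];
+ *
+ *	aes_init_keysched(key, 128, ks);
+ *	(void) aes_encrypt_block(ks, pt, ct);
+ *	(void) aes_decrypt_block(ks, ct, pt);
+ *	kmem_free(ks, ks_size);
+ */
+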
+/* AES implementation that contains the fastest methods */
+static aes_impl_ops_t aes_fastest_impl = {
+ .name = "fastest"
+};
+
+/* All compiled in implementations */
+const aes_impl_ops_t *aes_all_impl[] = {
+ &aes_generic_impl,
+#if defined(__x86_64)
+ &aes_x86_64_impl,
+#endif
+#if defined(__x86_64) && defined(HAVE_AES)
+ &aes_aesni_impl,
+#endif
+};
+
+/* Indicate that aes_impl_init() has completed */
+static boolean_t aes_impl_initialized = B_FALSE;
+
+/* Select aes implementation */
+#define IMPL_FASTEST (UINT32_MAX)
+#define IMPL_CYCLE (UINT32_MAX-1)
+
+#define AES_IMPL_READ(i) (*(volatile uint32_t *) &(i))
+
+static uint32_t icp_aes_impl = IMPL_FASTEST;
+static uint32_t user_sel_impl = IMPL_FASTEST;
+
+/* Hold all supported implementations */
+static size_t aes_supp_impl_cnt = 0;
+static aes_impl_ops_t *aes_supp_impl[ARRAY_SIZE(aes_all_impl)];
+
+/*
+ * Returns the AES operations for encrypt/decrypt/key setup. When a
+ * SIMD implementation is not allowed in the current context, fall
+ * back to the generic implementation.
+ */
+const aes_impl_ops_t *
+aes_impl_get_ops(void)
+{
+ if (!kfpu_allowed())
+ return (&aes_generic_impl);
+
+ const aes_impl_ops_t *ops = NULL;
+ const uint32_t impl = AES_IMPL_READ(icp_aes_impl);
+
+ switch (impl) {
+ case IMPL_FASTEST:
+ ASSERT(aes_impl_initialized);
+ ops = &aes_fastest_impl;
+ break;
+ case IMPL_CYCLE:
+ /* Cycle through supported implementations */
+ ASSERT(aes_impl_initialized);
+ ASSERT3U(aes_supp_impl_cnt, >, 0);
+ static size_t cycle_impl_idx = 0;
+ size_t idx = (++cycle_impl_idx) % aes_supp_impl_cnt;
+ ops = aes_supp_impl[idx];
+ break;
+ default:
+ ASSERT3U(impl, <, aes_supp_impl_cnt);
+ ASSERT3U(aes_supp_impl_cnt, >, 0);
+ if (impl < ARRAY_SIZE(aes_all_impl))
+ ops = aes_supp_impl[impl];
+ break;
+ }
+
+ ASSERT3P(ops, !=, NULL);
+
+ return (ops);
+}
+
+/*
+ * Initialize all supported implementations.
+ */
+void
+aes_impl_init(void)
+{
+ aes_impl_ops_t *curr_impl;
+ int i, c;
+
+ /* Move supported implementations into aes_supp_impl */
+ for (i = 0, c = 0; i < ARRAY_SIZE(aes_all_impl); i++) {
+ curr_impl = (aes_impl_ops_t *)aes_all_impl[i];
+
+ if (curr_impl->is_supported())
+ aes_supp_impl[c++] = (aes_impl_ops_t *)curr_impl;
+ }
+ aes_supp_impl_cnt = c;
+
+ /*
+ * Set the fastest implementation given the assumption that the
+ * hardware accelerated version is the fastest.
+ */
+#if defined(__x86_64)
+#if defined(HAVE_AES)
+ if (aes_aesni_impl.is_supported()) {
+ memcpy(&aes_fastest_impl, &aes_aesni_impl,
+ sizeof (aes_fastest_impl));
+ } else
+#endif
+ {
+ memcpy(&aes_fastest_impl, &aes_x86_64_impl,
+ sizeof (aes_fastest_impl));
+ }
+#else
+ memcpy(&aes_fastest_impl, &aes_generic_impl,
+ sizeof (aes_fastest_impl));
+#endif
+
+ strlcpy(aes_fastest_impl.name, "fastest", AES_IMPL_NAME_MAX);
+
+ /* Finish initialization */
+ atomic_swap_32(&icp_aes_impl, user_sel_impl);
+ aes_impl_initialized = B_TRUE;
+}
+
+static const struct {
+ char *name;
+ uint32_t sel;
+} aes_impl_opts[] = {
+ { "cycle", IMPL_CYCLE },
+ { "fastest", IMPL_FASTEST },
+};
+
+/*
+ * Set the desired aes implementation.
+ *
+ * If we are called before init(), the user preference is saved in
+ * user_sel_impl and applied in the later init() call. This happens when
+ * the module parameter is specified on module load. Otherwise, update
+ * icp_aes_impl directly.
+ *
+ * @val Name of the aes implementation to use
+ */
+int
+aes_impl_set(const char *val)
+{
+ int err = -EINVAL;
+ char req_name[AES_IMPL_NAME_MAX];
+ uint32_t impl = AES_IMPL_READ(user_sel_impl);
+ size_t i;
+
+ /* sanitize input */
+ i = strnlen(val, AES_IMPL_NAME_MAX);
+ if (i == 0 || i >= AES_IMPL_NAME_MAX)
+ return (err);
+
+ strlcpy(req_name, val, AES_IMPL_NAME_MAX);
+ while (i > 0 && isspace(req_name[i-1]))
+ i--;
+ req_name[i] = '\0';
+
+ /* Check mandatory options */
+ for (i = 0; i < ARRAY_SIZE(aes_impl_opts); i++) {
+ if (strcmp(req_name, aes_impl_opts[i].name) == 0) {
+ impl = aes_impl_opts[i].sel;
+ err = 0;
+ break;
+ }
+ }
+
+ /* check all supported impl if init() was already called */
+ if (err != 0 && aes_impl_initialized) {
+ /* check all supported implementations */
+ for (i = 0; i < aes_supp_impl_cnt; i++) {
+ if (strcmp(req_name, aes_supp_impl[i]->name) == 0) {
+ impl = i;
+ err = 0;
+ break;
+ }
+ }
+ }
+
+ if (err == 0) {
+ if (aes_impl_initialized)
+ atomic_swap_32(&icp_aes_impl, impl);
+ else
+ atomic_swap_32(&user_sel_impl, impl);
+ }
+
+ return (err);
+}
+
+#if defined(_KERNEL) && defined(__linux__)
+
+static int
+icp_aes_impl_set(const char *val, zfs_kernel_param_t *kp)
+{
+ return (aes_impl_set(val));
+}
+
+static int
+icp_aes_impl_get(char *buffer, zfs_kernel_param_t *kp)
+{
+ int i, cnt = 0;
+ char *fmt;
+ const uint32_t impl = AES_IMPL_READ(icp_aes_impl);
+
+ ASSERT(aes_impl_initialized);
+
+ /* list mandatory options */
+ for (i = 0; i < ARRAY_SIZE(aes_impl_opts); i++) {
+ fmt = (impl == aes_impl_opts[i].sel) ? "[%s] " : "%s ";
+ cnt += sprintf(buffer + cnt, fmt, aes_impl_opts[i].name);
+ }
+
+ /* list all supported implementations */
+ for (i = 0; i < aes_supp_impl_cnt; i++) {
+ fmt = (i == impl) ? "[%s] " : "%s ";
+ cnt += sprintf(buffer + cnt, fmt, aes_supp_impl[i]->name);
+ }
+
+ return (cnt);
+}
+
+module_param_call(icp_aes_impl, icp_aes_impl_set, icp_aes_impl_get,
+ NULL, 0644);
+MODULE_PARM_DESC(icp_aes_impl, "Select aes implementation.");
+#endif
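+
+/*
+ * Editorial note: on Linux the parameter registered above is normally
+ * exposed as /sys/module/icp/parameters/icp_aes_impl, so the active
+ * implementation can be inspected and changed at runtime, e.g.:
+ *
+ *	cat /sys/module/icp/parameters/icp_aes_impl
+ *	echo fastest > /sys/module/icp/parameters/icp_aes_impl
+ */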
diff --git a/sys/contrib/openzfs/module/icp/algs/aes/aes_impl_aesni.c b/sys/contrib/openzfs/module/icp/algs/aes/aes_impl_aesni.c
new file mode 100644
index 000000000000..4b5eefd71b17
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/algs/aes/aes_impl_aesni.c
@@ -0,0 +1,124 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#if defined(__x86_64) && defined(HAVE_AES)
+
+#include <sys/simd.h>
+#include <sys/types.h>
+
+/* These functions are used to execute AES-NI instructions: */
+extern int rijndael_key_setup_enc_intel(uint32_t rk[],
+ const uint32_t cipherKey[], uint64_t keyBits);
+extern int rijndael_key_setup_dec_intel(uint32_t rk[],
+ const uint32_t cipherKey[], uint64_t keyBits);
+extern void aes_encrypt_intel(const uint32_t rk[], int Nr,
+ const uint32_t pt[4], uint32_t ct[4]);
+extern void aes_decrypt_intel(const uint32_t rk[], int Nr,
+ const uint32_t ct[4], uint32_t pt[4]);
+
+
+#include <aes/aes_impl.h>
+
+/*
+ * Expand the 32-bit AES cipher key array into the encryption and decryption
+ * key schedules.
+ *
+ * Parameters:
+ * key AES key schedule to be initialized
+ * keyarr32 User key
+ * keyBits AES key size (128, 192, or 256 bits)
+ */
+static void
+aes_aesni_generate(aes_key_t *key, const uint32_t *keyarr32, int keybits)
+{
+ kfpu_begin();
+ key->nr = rijndael_key_setup_enc_intel(&(key->encr_ks.ks32[0]),
+ keyarr32, keybits);
+ key->nr = rijndael_key_setup_dec_intel(&(key->decr_ks.ks32[0]),
+ keyarr32, keybits);
+ kfpu_end();
+}
+
+/*
+ * Encrypt one block of data. The block is assumed to be an array
+ * of four uint32_t values, so copying for alignment (and byte-order
+ * reversal on little endian systems) might be necessary on the
+ * input and output byte streams.
+ * The size of the key schedule depends on the number of rounds
+ * (which can be computed from the size of the key), i.e. 4*(Nr + 1).
+ *
+ * Parameters:
+ * rk Key schedule, of aes_ks_t (60 32-bit integers)
+ * Nr Number of rounds
+ * pt Input block (plain text)
+ * ct Output block (crypto text). Can overlap with pt
+ */
+static void
+aes_aesni_encrypt(const uint32_t rk[], int Nr, const uint32_t pt[4],
+ uint32_t ct[4])
+{
+ kfpu_begin();
+ aes_encrypt_intel(rk, Nr, pt, ct);
+ kfpu_end();
+}
+
+/*
+ * Decrypt one block of data. The block is assumed to be an array
+ * of four uint32_t values, so copying for alignment (and byte-order
+ * reversal on little endian systems) might be necessary on the
+ * input and output byte streams.
+ * The size of the key schedule depends on the number of rounds
+ * (which can be computed from the size of the key), i.e. 4*(Nr + 1).
+ *
+ * Parameters:
+ * rk Key schedule, of aes_ks_t (60 32-bit integers)
+ * Nr Number of rounds
+ * ct Input block (crypto text)
+ * pt Output block (plain text). Can overlap with ct
+ */
+static void
+aes_aesni_decrypt(const uint32_t rk[], int Nr, const uint32_t ct[4],
+ uint32_t pt[4])
+{
+ kfpu_begin();
+ aes_decrypt_intel(rk, Nr, ct, pt);
+ kfpu_end();
+}
+
+static boolean_t
+aes_aesni_will_work(void)
+{
+ return (kfpu_allowed() && zfs_aes_available());
+}
+
+const aes_impl_ops_t aes_aesni_impl = {
+ .generate = &aes_aesni_generate,
+ .encrypt = &aes_aesni_encrypt,
+ .decrypt = &aes_aesni_decrypt,
+ .is_supported = &aes_aesni_will_work,
+ .needs_byteswap = B_FALSE,
+ .name = "aesni"
+};
+
+#endif /* defined(__x86_64) && defined(HAVE_AES) */
diff --git a/sys/contrib/openzfs/module/icp/algs/aes/aes_impl_generic.c b/sys/contrib/openzfs/module/icp/algs/aes/aes_impl_generic.c
new file mode 100644
index 000000000000..427c096c6ab3
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/algs/aes/aes_impl_generic.c
@@ -0,0 +1,1242 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#include <aes/aes_impl.h>
+
+/*
+ * This file is derived from the file rijndael-alg-fst.c taken from the
+ * "optimized C code v3.0" on the "rijndael home page"
+ * http://www.iaik.tu-graz.ac.at/research/krypto/AES/old/~rijmen/rijndael/
+ * pointed by the NIST web-site http://csrc.nist.gov/archive/aes/
+ *
+ * The following note is from the original file:
+ */
+
+/*
+ * rijndael-alg-fst.c
+ *
+ * @version 3.0 (December 2000)
+ *
+ * Optimised ANSI C code for the Rijndael cipher (now AES)
+ *
+ * @author Vincent Rijmen <vincent.rijmen@esat.kuleuven.ac.be>
+ * @author Antoon Bosselaers <antoon.bosselaers@esat.kuleuven.ac.be>
+ * @author Paulo Barreto <paulo.barreto@terra.com.br>
+ *
+ * This code is hereby placed in the public domain.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS
+ * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+ * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+ * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * Constant tables
+ */
+
+/*
+ * Te0[x] = S [x].[02, 01, 01, 03];
+ * Te1[x] = S [x].[03, 02, 01, 01];
+ * Te2[x] = S [x].[01, 03, 02, 01];
+ * Te3[x] = S [x].[01, 01, 03, 02];
+ * Te4[x] = S [x].[01, 01, 01, 01];
+ *
+ * Td0[x] = Si[x].[0e, 09, 0d, 0b];
+ * Td1[x] = Si[x].[0b, 0e, 09, 0d];
+ * Td2[x] = Si[x].[0d, 0b, 0e, 09];
+ * Td3[x] = Si[x].[09, 0d, 0b, 0e];
+ * Td4[x] = Si[x].[01, 01, 01, 01];
+ */
+
+/* Encrypt Sbox constants (for the substitute bytes operation) */
+
+static const uint32_t Te0[256] =
+{
+ 0xc66363a5U, 0xf87c7c84U, 0xee777799U, 0xf67b7b8dU,
+ 0xfff2f20dU, 0xd66b6bbdU, 0xde6f6fb1U, 0x91c5c554U,
+ 0x60303050U, 0x02010103U, 0xce6767a9U, 0x562b2b7dU,
+ 0xe7fefe19U, 0xb5d7d762U, 0x4dababe6U, 0xec76769aU,
+ 0x8fcaca45U, 0x1f82829dU, 0x89c9c940U, 0xfa7d7d87U,
+ 0xeffafa15U, 0xb25959ebU, 0x8e4747c9U, 0xfbf0f00bU,
+ 0x41adadecU, 0xb3d4d467U, 0x5fa2a2fdU, 0x45afafeaU,
+ 0x239c9cbfU, 0x53a4a4f7U, 0xe4727296U, 0x9bc0c05bU,
+ 0x75b7b7c2U, 0xe1fdfd1cU, 0x3d9393aeU, 0x4c26266aU,
+ 0x6c36365aU, 0x7e3f3f41U, 0xf5f7f702U, 0x83cccc4fU,
+ 0x6834345cU, 0x51a5a5f4U, 0xd1e5e534U, 0xf9f1f108U,
+ 0xe2717193U, 0xabd8d873U, 0x62313153U, 0x2a15153fU,
+ 0x0804040cU, 0x95c7c752U, 0x46232365U, 0x9dc3c35eU,
+ 0x30181828U, 0x379696a1U, 0x0a05050fU, 0x2f9a9ab5U,
+ 0x0e070709U, 0x24121236U, 0x1b80809bU, 0xdfe2e23dU,
+ 0xcdebeb26U, 0x4e272769U, 0x7fb2b2cdU, 0xea75759fU,
+ 0x1209091bU, 0x1d83839eU, 0x582c2c74U, 0x341a1a2eU,
+ 0x361b1b2dU, 0xdc6e6eb2U, 0xb45a5aeeU, 0x5ba0a0fbU,
+ 0xa45252f6U, 0x763b3b4dU, 0xb7d6d661U, 0x7db3b3ceU,
+ 0x5229297bU, 0xdde3e33eU, 0x5e2f2f71U, 0x13848497U,
+ 0xa65353f5U, 0xb9d1d168U, 0x00000000U, 0xc1eded2cU,
+ 0x40202060U, 0xe3fcfc1fU, 0x79b1b1c8U, 0xb65b5bedU,
+ 0xd46a6abeU, 0x8dcbcb46U, 0x67bebed9U, 0x7239394bU,
+ 0x944a4adeU, 0x984c4cd4U, 0xb05858e8U, 0x85cfcf4aU,
+ 0xbbd0d06bU, 0xc5efef2aU, 0x4faaaae5U, 0xedfbfb16U,
+ 0x864343c5U, 0x9a4d4dd7U, 0x66333355U, 0x11858594U,
+ 0x8a4545cfU, 0xe9f9f910U, 0x04020206U, 0xfe7f7f81U,
+ 0xa05050f0U, 0x783c3c44U, 0x259f9fbaU, 0x4ba8a8e3U,
+ 0xa25151f3U, 0x5da3a3feU, 0x804040c0U, 0x058f8f8aU,
+ 0x3f9292adU, 0x219d9dbcU, 0x70383848U, 0xf1f5f504U,
+ 0x63bcbcdfU, 0x77b6b6c1U, 0xafdada75U, 0x42212163U,
+ 0x20101030U, 0xe5ffff1aU, 0xfdf3f30eU, 0xbfd2d26dU,
+ 0x81cdcd4cU, 0x180c0c14U, 0x26131335U, 0xc3ecec2fU,
+ 0xbe5f5fe1U, 0x359797a2U, 0x884444ccU, 0x2e171739U,
+ 0x93c4c457U, 0x55a7a7f2U, 0xfc7e7e82U, 0x7a3d3d47U,
+ 0xc86464acU, 0xba5d5de7U, 0x3219192bU, 0xe6737395U,
+ 0xc06060a0U, 0x19818198U, 0x9e4f4fd1U, 0xa3dcdc7fU,
+ 0x44222266U, 0x542a2a7eU, 0x3b9090abU, 0x0b888883U,
+ 0x8c4646caU, 0xc7eeee29U, 0x6bb8b8d3U, 0x2814143cU,
+ 0xa7dede79U, 0xbc5e5ee2U, 0x160b0b1dU, 0xaddbdb76U,
+ 0xdbe0e03bU, 0x64323256U, 0x743a3a4eU, 0x140a0a1eU,
+ 0x924949dbU, 0x0c06060aU, 0x4824246cU, 0xb85c5ce4U,
+ 0x9fc2c25dU, 0xbdd3d36eU, 0x43acacefU, 0xc46262a6U,
+ 0x399191a8U, 0x319595a4U, 0xd3e4e437U, 0xf279798bU,
+ 0xd5e7e732U, 0x8bc8c843U, 0x6e373759U, 0xda6d6db7U,
+ 0x018d8d8cU, 0xb1d5d564U, 0x9c4e4ed2U, 0x49a9a9e0U,
+ 0xd86c6cb4U, 0xac5656faU, 0xf3f4f407U, 0xcfeaea25U,
+ 0xca6565afU, 0xf47a7a8eU, 0x47aeaee9U, 0x10080818U,
+ 0x6fbabad5U, 0xf0787888U, 0x4a25256fU, 0x5c2e2e72U,
+ 0x381c1c24U, 0x57a6a6f1U, 0x73b4b4c7U, 0x97c6c651U,
+ 0xcbe8e823U, 0xa1dddd7cU, 0xe874749cU, 0x3e1f1f21U,
+ 0x964b4bddU, 0x61bdbddcU, 0x0d8b8b86U, 0x0f8a8a85U,
+ 0xe0707090U, 0x7c3e3e42U, 0x71b5b5c4U, 0xcc6666aaU,
+ 0x904848d8U, 0x06030305U, 0xf7f6f601U, 0x1c0e0e12U,
+ 0xc26161a3U, 0x6a35355fU, 0xae5757f9U, 0x69b9b9d0U,
+ 0x17868691U, 0x99c1c158U, 0x3a1d1d27U, 0x279e9eb9U,
+ 0xd9e1e138U, 0xebf8f813U, 0x2b9898b3U, 0x22111133U,
+ 0xd26969bbU, 0xa9d9d970U, 0x078e8e89U, 0x339494a7U,
+ 0x2d9b9bb6U, 0x3c1e1e22U, 0x15878792U, 0xc9e9e920U,
+ 0x87cece49U, 0xaa5555ffU, 0x50282878U, 0xa5dfdf7aU,
+ 0x038c8c8fU, 0x59a1a1f8U, 0x09898980U, 0x1a0d0d17U,
+ 0x65bfbfdaU, 0xd7e6e631U, 0x844242c6U, 0xd06868b8U,
+ 0x824141c3U, 0x299999b0U, 0x5a2d2d77U, 0x1e0f0f11U,
+ 0x7bb0b0cbU, 0xa85454fcU, 0x6dbbbbd6U, 0x2c16163aU
+};
+
+
+static const uint32_t Te1[256] =
+{
+ 0xa5c66363U, 0x84f87c7cU, 0x99ee7777U, 0x8df67b7bU,
+ 0x0dfff2f2U, 0xbdd66b6bU, 0xb1de6f6fU, 0x5491c5c5U,
+ 0x50603030U, 0x03020101U, 0xa9ce6767U, 0x7d562b2bU,
+ 0x19e7fefeU, 0x62b5d7d7U, 0xe64dababU, 0x9aec7676U,
+ 0x458fcacaU, 0x9d1f8282U, 0x4089c9c9U, 0x87fa7d7dU,
+ 0x15effafaU, 0xebb25959U, 0xc98e4747U, 0x0bfbf0f0U,
+ 0xec41adadU, 0x67b3d4d4U, 0xfd5fa2a2U, 0xea45afafU,
+ 0xbf239c9cU, 0xf753a4a4U, 0x96e47272U, 0x5b9bc0c0U,
+ 0xc275b7b7U, 0x1ce1fdfdU, 0xae3d9393U, 0x6a4c2626U,
+ 0x5a6c3636U, 0x417e3f3fU, 0x02f5f7f7U, 0x4f83ccccU,
+ 0x5c683434U, 0xf451a5a5U, 0x34d1e5e5U, 0x08f9f1f1U,
+ 0x93e27171U, 0x73abd8d8U, 0x53623131U, 0x3f2a1515U,
+ 0x0c080404U, 0x5295c7c7U, 0x65462323U, 0x5e9dc3c3U,
+ 0x28301818U, 0xa1379696U, 0x0f0a0505U, 0xb52f9a9aU,
+ 0x090e0707U, 0x36241212U, 0x9b1b8080U, 0x3ddfe2e2U,
+ 0x26cdebebU, 0x694e2727U, 0xcd7fb2b2U, 0x9fea7575U,
+ 0x1b120909U, 0x9e1d8383U, 0x74582c2cU, 0x2e341a1aU,
+ 0x2d361b1bU, 0xb2dc6e6eU, 0xeeb45a5aU, 0xfb5ba0a0U,
+ 0xf6a45252U, 0x4d763b3bU, 0x61b7d6d6U, 0xce7db3b3U,
+ 0x7b522929U, 0x3edde3e3U, 0x715e2f2fU, 0x97138484U,
+ 0xf5a65353U, 0x68b9d1d1U, 0x00000000U, 0x2cc1ededU,
+ 0x60402020U, 0x1fe3fcfcU, 0xc879b1b1U, 0xedb65b5bU,
+ 0xbed46a6aU, 0x468dcbcbU, 0xd967bebeU, 0x4b723939U,
+ 0xde944a4aU, 0xd4984c4cU, 0xe8b05858U, 0x4a85cfcfU,
+ 0x6bbbd0d0U, 0x2ac5efefU, 0xe54faaaaU, 0x16edfbfbU,
+ 0xc5864343U, 0xd79a4d4dU, 0x55663333U, 0x94118585U,
+ 0xcf8a4545U, 0x10e9f9f9U, 0x06040202U, 0x81fe7f7fU,
+ 0xf0a05050U, 0x44783c3cU, 0xba259f9fU, 0xe34ba8a8U,
+ 0xf3a25151U, 0xfe5da3a3U, 0xc0804040U, 0x8a058f8fU,
+ 0xad3f9292U, 0xbc219d9dU, 0x48703838U, 0x04f1f5f5U,
+ 0xdf63bcbcU, 0xc177b6b6U, 0x75afdadaU, 0x63422121U,
+ 0x30201010U, 0x1ae5ffffU, 0x0efdf3f3U, 0x6dbfd2d2U,
+ 0x4c81cdcdU, 0x14180c0cU, 0x35261313U, 0x2fc3ececU,
+ 0xe1be5f5fU, 0xa2359797U, 0xcc884444U, 0x392e1717U,
+ 0x5793c4c4U, 0xf255a7a7U, 0x82fc7e7eU, 0x477a3d3dU,
+ 0xacc86464U, 0xe7ba5d5dU, 0x2b321919U, 0x95e67373U,
+ 0xa0c06060U, 0x98198181U, 0xd19e4f4fU, 0x7fa3dcdcU,
+ 0x66442222U, 0x7e542a2aU, 0xab3b9090U, 0x830b8888U,
+ 0xca8c4646U, 0x29c7eeeeU, 0xd36bb8b8U, 0x3c281414U,
+ 0x79a7dedeU, 0xe2bc5e5eU, 0x1d160b0bU, 0x76addbdbU,
+ 0x3bdbe0e0U, 0x56643232U, 0x4e743a3aU, 0x1e140a0aU,
+ 0xdb924949U, 0x0a0c0606U, 0x6c482424U, 0xe4b85c5cU,
+ 0x5d9fc2c2U, 0x6ebdd3d3U, 0xef43acacU, 0xa6c46262U,
+ 0xa8399191U, 0xa4319595U, 0x37d3e4e4U, 0x8bf27979U,
+ 0x32d5e7e7U, 0x438bc8c8U, 0x596e3737U, 0xb7da6d6dU,
+ 0x8c018d8dU, 0x64b1d5d5U, 0xd29c4e4eU, 0xe049a9a9U,
+ 0xb4d86c6cU, 0xfaac5656U, 0x07f3f4f4U, 0x25cfeaeaU,
+ 0xafca6565U, 0x8ef47a7aU, 0xe947aeaeU, 0x18100808U,
+ 0xd56fbabaU, 0x88f07878U, 0x6f4a2525U, 0x725c2e2eU,
+ 0x24381c1cU, 0xf157a6a6U, 0xc773b4b4U, 0x5197c6c6U,
+ 0x23cbe8e8U, 0x7ca1ddddU, 0x9ce87474U, 0x213e1f1fU,
+ 0xdd964b4bU, 0xdc61bdbdU, 0x860d8b8bU, 0x850f8a8aU,
+ 0x90e07070U, 0x427c3e3eU, 0xc471b5b5U, 0xaacc6666U,
+ 0xd8904848U, 0x05060303U, 0x01f7f6f6U, 0x121c0e0eU,
+ 0xa3c26161U, 0x5f6a3535U, 0xf9ae5757U, 0xd069b9b9U,
+ 0x91178686U, 0x5899c1c1U, 0x273a1d1dU, 0xb9279e9eU,
+ 0x38d9e1e1U, 0x13ebf8f8U, 0xb32b9898U, 0x33221111U,
+ 0xbbd26969U, 0x70a9d9d9U, 0x89078e8eU, 0xa7339494U,
+ 0xb62d9b9bU, 0x223c1e1eU, 0x92158787U, 0x20c9e9e9U,
+ 0x4987ceceU, 0xffaa5555U, 0x78502828U, 0x7aa5dfdfU,
+ 0x8f038c8cU, 0xf859a1a1U, 0x80098989U, 0x171a0d0dU,
+ 0xda65bfbfU, 0x31d7e6e6U, 0xc6844242U, 0xb8d06868U,
+ 0xc3824141U, 0xb0299999U, 0x775a2d2dU, 0x111e0f0fU,
+ 0xcb7bb0b0U, 0xfca85454U, 0xd66dbbbbU, 0x3a2c1616U
+};
+
+
+static const uint32_t Te2[256] =
+{
+ 0x63a5c663U, 0x7c84f87cU, 0x7799ee77U, 0x7b8df67bU,
+ 0xf20dfff2U, 0x6bbdd66bU, 0x6fb1de6fU, 0xc55491c5U,
+ 0x30506030U, 0x01030201U, 0x67a9ce67U, 0x2b7d562bU,
+ 0xfe19e7feU, 0xd762b5d7U, 0xabe64dabU, 0x769aec76U,
+ 0xca458fcaU, 0x829d1f82U, 0xc94089c9U, 0x7d87fa7dU,
+ 0xfa15effaU, 0x59ebb259U, 0x47c98e47U, 0xf00bfbf0U,
+ 0xadec41adU, 0xd467b3d4U, 0xa2fd5fa2U, 0xafea45afU,
+ 0x9cbf239cU, 0xa4f753a4U, 0x7296e472U, 0xc05b9bc0U,
+ 0xb7c275b7U, 0xfd1ce1fdU, 0x93ae3d93U, 0x266a4c26U,
+ 0x365a6c36U, 0x3f417e3fU, 0xf702f5f7U, 0xcc4f83ccU,
+ 0x345c6834U, 0xa5f451a5U, 0xe534d1e5U, 0xf108f9f1U,
+ 0x7193e271U, 0xd873abd8U, 0x31536231U, 0x153f2a15U,
+ 0x040c0804U, 0xc75295c7U, 0x23654623U, 0xc35e9dc3U,
+ 0x18283018U, 0x96a13796U, 0x050f0a05U, 0x9ab52f9aU,
+ 0x07090e07U, 0x12362412U, 0x809b1b80U, 0xe23ddfe2U,
+ 0xeb26cdebU, 0x27694e27U, 0xb2cd7fb2U, 0x759fea75U,
+ 0x091b1209U, 0x839e1d83U, 0x2c74582cU, 0x1a2e341aU,
+ 0x1b2d361bU, 0x6eb2dc6eU, 0x5aeeb45aU, 0xa0fb5ba0U,
+ 0x52f6a452U, 0x3b4d763bU, 0xd661b7d6U, 0xb3ce7db3U,
+ 0x297b5229U, 0xe33edde3U, 0x2f715e2fU, 0x84971384U,
+ 0x53f5a653U, 0xd168b9d1U, 0x00000000U, 0xed2cc1edU,
+ 0x20604020U, 0xfc1fe3fcU, 0xb1c879b1U, 0x5bedb65bU,
+ 0x6abed46aU, 0xcb468dcbU, 0xbed967beU, 0x394b7239U,
+ 0x4ade944aU, 0x4cd4984cU, 0x58e8b058U, 0xcf4a85cfU,
+ 0xd06bbbd0U, 0xef2ac5efU, 0xaae54faaU, 0xfb16edfbU,
+ 0x43c58643U, 0x4dd79a4dU, 0x33556633U, 0x85941185U,
+ 0x45cf8a45U, 0xf910e9f9U, 0x02060402U, 0x7f81fe7fU,
+ 0x50f0a050U, 0x3c44783cU, 0x9fba259fU, 0xa8e34ba8U,
+ 0x51f3a251U, 0xa3fe5da3U, 0x40c08040U, 0x8f8a058fU,
+ 0x92ad3f92U, 0x9dbc219dU, 0x38487038U, 0xf504f1f5U,
+ 0xbcdf63bcU, 0xb6c177b6U, 0xda75afdaU, 0x21634221U,
+ 0x10302010U, 0xff1ae5ffU, 0xf30efdf3U, 0xd26dbfd2U,
+ 0xcd4c81cdU, 0x0c14180cU, 0x13352613U, 0xec2fc3ecU,
+ 0x5fe1be5fU, 0x97a23597U, 0x44cc8844U, 0x17392e17U,
+ 0xc45793c4U, 0xa7f255a7U, 0x7e82fc7eU, 0x3d477a3dU,
+ 0x64acc864U, 0x5de7ba5dU, 0x192b3219U, 0x7395e673U,
+ 0x60a0c060U, 0x81981981U, 0x4fd19e4fU, 0xdc7fa3dcU,
+ 0x22664422U, 0x2a7e542aU, 0x90ab3b90U, 0x88830b88U,
+ 0x46ca8c46U, 0xee29c7eeU, 0xb8d36bb8U, 0x143c2814U,
+ 0xde79a7deU, 0x5ee2bc5eU, 0x0b1d160bU, 0xdb76addbU,
+ 0xe03bdbe0U, 0x32566432U, 0x3a4e743aU, 0x0a1e140aU,
+ 0x49db9249U, 0x060a0c06U, 0x246c4824U, 0x5ce4b85cU,
+ 0xc25d9fc2U, 0xd36ebdd3U, 0xacef43acU, 0x62a6c462U,
+ 0x91a83991U, 0x95a43195U, 0xe437d3e4U, 0x798bf279U,
+ 0xe732d5e7U, 0xc8438bc8U, 0x37596e37U, 0x6db7da6dU,
+ 0x8d8c018dU, 0xd564b1d5U, 0x4ed29c4eU, 0xa9e049a9U,
+ 0x6cb4d86cU, 0x56faac56U, 0xf407f3f4U, 0xea25cfeaU,
+ 0x65afca65U, 0x7a8ef47aU, 0xaee947aeU, 0x08181008U,
+ 0xbad56fbaU, 0x7888f078U, 0x256f4a25U, 0x2e725c2eU,
+ 0x1c24381cU, 0xa6f157a6U, 0xb4c773b4U, 0xc65197c6U,
+ 0xe823cbe8U, 0xdd7ca1ddU, 0x749ce874U, 0x1f213e1fU,
+ 0x4bdd964bU, 0xbddc61bdU, 0x8b860d8bU, 0x8a850f8aU,
+ 0x7090e070U, 0x3e427c3eU, 0xb5c471b5U, 0x66aacc66U,
+ 0x48d89048U, 0x03050603U, 0xf601f7f6U, 0x0e121c0eU,
+ 0x61a3c261U, 0x355f6a35U, 0x57f9ae57U, 0xb9d069b9U,
+ 0x86911786U, 0xc15899c1U, 0x1d273a1dU, 0x9eb9279eU,
+ 0xe138d9e1U, 0xf813ebf8U, 0x98b32b98U, 0x11332211U,
+ 0x69bbd269U, 0xd970a9d9U, 0x8e89078eU, 0x94a73394U,
+ 0x9bb62d9bU, 0x1e223c1eU, 0x87921587U, 0xe920c9e9U,
+ 0xce4987ceU, 0x55ffaa55U, 0x28785028U, 0xdf7aa5dfU,
+ 0x8c8f038cU, 0xa1f859a1U, 0x89800989U, 0x0d171a0dU,
+ 0xbfda65bfU, 0xe631d7e6U, 0x42c68442U, 0x68b8d068U,
+ 0x41c38241U, 0x99b02999U, 0x2d775a2dU, 0x0f111e0fU,
+ 0xb0cb7bb0U, 0x54fca854U, 0xbbd66dbbU, 0x163a2c16U
+};
+
+
+static const uint32_t Te3[256] =
+{
+ 0x6363a5c6U, 0x7c7c84f8U, 0x777799eeU, 0x7b7b8df6U,
+ 0xf2f20dffU, 0x6b6bbdd6U, 0x6f6fb1deU, 0xc5c55491U,
+ 0x30305060U, 0x01010302U, 0x6767a9ceU, 0x2b2b7d56U,
+ 0xfefe19e7U, 0xd7d762b5U, 0xababe64dU, 0x76769aecU,
+ 0xcaca458fU, 0x82829d1fU, 0xc9c94089U, 0x7d7d87faU,
+ 0xfafa15efU, 0x5959ebb2U, 0x4747c98eU, 0xf0f00bfbU,
+ 0xadadec41U, 0xd4d467b3U, 0xa2a2fd5fU, 0xafafea45U,
+ 0x9c9cbf23U, 0xa4a4f753U, 0x727296e4U, 0xc0c05b9bU,
+ 0xb7b7c275U, 0xfdfd1ce1U, 0x9393ae3dU, 0x26266a4cU,
+ 0x36365a6cU, 0x3f3f417eU, 0xf7f702f5U, 0xcccc4f83U,
+ 0x34345c68U, 0xa5a5f451U, 0xe5e534d1U, 0xf1f108f9U,
+ 0x717193e2U, 0xd8d873abU, 0x31315362U, 0x15153f2aU,
+ 0x04040c08U, 0xc7c75295U, 0x23236546U, 0xc3c35e9dU,
+ 0x18182830U, 0x9696a137U, 0x05050f0aU, 0x9a9ab52fU,
+ 0x0707090eU, 0x12123624U, 0x80809b1bU, 0xe2e23ddfU,
+ 0xebeb26cdU, 0x2727694eU, 0xb2b2cd7fU, 0x75759feaU,
+ 0x09091b12U, 0x83839e1dU, 0x2c2c7458U, 0x1a1a2e34U,
+ 0x1b1b2d36U, 0x6e6eb2dcU, 0x5a5aeeb4U, 0xa0a0fb5bU,
+ 0x5252f6a4U, 0x3b3b4d76U, 0xd6d661b7U, 0xb3b3ce7dU,
+ 0x29297b52U, 0xe3e33eddU, 0x2f2f715eU, 0x84849713U,
+ 0x5353f5a6U, 0xd1d168b9U, 0x00000000U, 0xeded2cc1U,
+ 0x20206040U, 0xfcfc1fe3U, 0xb1b1c879U, 0x5b5bedb6U,
+ 0x6a6abed4U, 0xcbcb468dU, 0xbebed967U, 0x39394b72U,
+ 0x4a4ade94U, 0x4c4cd498U, 0x5858e8b0U, 0xcfcf4a85U,
+ 0xd0d06bbbU, 0xefef2ac5U, 0xaaaae54fU, 0xfbfb16edU,
+ 0x4343c586U, 0x4d4dd79aU, 0x33335566U, 0x85859411U,
+ 0x4545cf8aU, 0xf9f910e9U, 0x02020604U, 0x7f7f81feU,
+ 0x5050f0a0U, 0x3c3c4478U, 0x9f9fba25U, 0xa8a8e34bU,
+ 0x5151f3a2U, 0xa3a3fe5dU, 0x4040c080U, 0x8f8f8a05U,
+ 0x9292ad3fU, 0x9d9dbc21U, 0x38384870U, 0xf5f504f1U,
+ 0xbcbcdf63U, 0xb6b6c177U, 0xdada75afU, 0x21216342U,
+ 0x10103020U, 0xffff1ae5U, 0xf3f30efdU, 0xd2d26dbfU,
+ 0xcdcd4c81U, 0x0c0c1418U, 0x13133526U, 0xecec2fc3U,
+ 0x5f5fe1beU, 0x9797a235U, 0x4444cc88U, 0x1717392eU,
+ 0xc4c45793U, 0xa7a7f255U, 0x7e7e82fcU, 0x3d3d477aU,
+ 0x6464acc8U, 0x5d5de7baU, 0x19192b32U, 0x737395e6U,
+ 0x6060a0c0U, 0x81819819U, 0x4f4fd19eU, 0xdcdc7fa3U,
+ 0x22226644U, 0x2a2a7e54U, 0x9090ab3bU, 0x8888830bU,
+ 0x4646ca8cU, 0xeeee29c7U, 0xb8b8d36bU, 0x14143c28U,
+ 0xdede79a7U, 0x5e5ee2bcU, 0x0b0b1d16U, 0xdbdb76adU,
+ 0xe0e03bdbU, 0x32325664U, 0x3a3a4e74U, 0x0a0a1e14U,
+ 0x4949db92U, 0x06060a0cU, 0x24246c48U, 0x5c5ce4b8U,
+ 0xc2c25d9fU, 0xd3d36ebdU, 0xacacef43U, 0x6262a6c4U,
+ 0x9191a839U, 0x9595a431U, 0xe4e437d3U, 0x79798bf2U,
+ 0xe7e732d5U, 0xc8c8438bU, 0x3737596eU, 0x6d6db7daU,
+ 0x8d8d8c01U, 0xd5d564b1U, 0x4e4ed29cU, 0xa9a9e049U,
+ 0x6c6cb4d8U, 0x5656faacU, 0xf4f407f3U, 0xeaea25cfU,
+ 0x6565afcaU, 0x7a7a8ef4U, 0xaeaee947U, 0x08081810U,
+ 0xbabad56fU, 0x787888f0U, 0x25256f4aU, 0x2e2e725cU,
+ 0x1c1c2438U, 0xa6a6f157U, 0xb4b4c773U, 0xc6c65197U,
+ 0xe8e823cbU, 0xdddd7ca1U, 0x74749ce8U, 0x1f1f213eU,
+ 0x4b4bdd96U, 0xbdbddc61U, 0x8b8b860dU, 0x8a8a850fU,
+ 0x707090e0U, 0x3e3e427cU, 0xb5b5c471U, 0x6666aaccU,
+ 0x4848d890U, 0x03030506U, 0xf6f601f7U, 0x0e0e121cU,
+ 0x6161a3c2U, 0x35355f6aU, 0x5757f9aeU, 0xb9b9d069U,
+ 0x86869117U, 0xc1c15899U, 0x1d1d273aU, 0x9e9eb927U,
+ 0xe1e138d9U, 0xf8f813ebU, 0x9898b32bU, 0x11113322U,
+ 0x6969bbd2U, 0xd9d970a9U, 0x8e8e8907U, 0x9494a733U,
+ 0x9b9bb62dU, 0x1e1e223cU, 0x87879215U, 0xe9e920c9U,
+ 0xcece4987U, 0x5555ffaaU, 0x28287850U, 0xdfdf7aa5U,
+ 0x8c8c8f03U, 0xa1a1f859U, 0x89898009U, 0x0d0d171aU,
+ 0xbfbfda65U, 0xe6e631d7U, 0x4242c684U, 0x6868b8d0U,
+ 0x4141c382U, 0x9999b029U, 0x2d2d775aU, 0x0f0f111eU,
+ 0xb0b0cb7bU, 0x5454fca8U, 0xbbbbd66dU, 0x16163a2cU
+};
+
+static const uint32_t Te4[256] =
+{
+ 0x63636363U, 0x7c7c7c7cU, 0x77777777U, 0x7b7b7b7bU,
+ 0xf2f2f2f2U, 0x6b6b6b6bU, 0x6f6f6f6fU, 0xc5c5c5c5U,
+ 0x30303030U, 0x01010101U, 0x67676767U, 0x2b2b2b2bU,
+ 0xfefefefeU, 0xd7d7d7d7U, 0xababababU, 0x76767676U,
+ 0xcacacacaU, 0x82828282U, 0xc9c9c9c9U, 0x7d7d7d7dU,
+ 0xfafafafaU, 0x59595959U, 0x47474747U, 0xf0f0f0f0U,
+ 0xadadadadU, 0xd4d4d4d4U, 0xa2a2a2a2U, 0xafafafafU,
+ 0x9c9c9c9cU, 0xa4a4a4a4U, 0x72727272U, 0xc0c0c0c0U,
+ 0xb7b7b7b7U, 0xfdfdfdfdU, 0x93939393U, 0x26262626U,
+ 0x36363636U, 0x3f3f3f3fU, 0xf7f7f7f7U, 0xccccccccU,
+ 0x34343434U, 0xa5a5a5a5U, 0xe5e5e5e5U, 0xf1f1f1f1U,
+ 0x71717171U, 0xd8d8d8d8U, 0x31313131U, 0x15151515U,
+ 0x04040404U, 0xc7c7c7c7U, 0x23232323U, 0xc3c3c3c3U,
+ 0x18181818U, 0x96969696U, 0x05050505U, 0x9a9a9a9aU,
+ 0x07070707U, 0x12121212U, 0x80808080U, 0xe2e2e2e2U,
+ 0xebebebebU, 0x27272727U, 0xb2b2b2b2U, 0x75757575U,
+ 0x09090909U, 0x83838383U, 0x2c2c2c2cU, 0x1a1a1a1aU,
+ 0x1b1b1b1bU, 0x6e6e6e6eU, 0x5a5a5a5aU, 0xa0a0a0a0U,
+ 0x52525252U, 0x3b3b3b3bU, 0xd6d6d6d6U, 0xb3b3b3b3U,
+ 0x29292929U, 0xe3e3e3e3U, 0x2f2f2f2fU, 0x84848484U,
+ 0x53535353U, 0xd1d1d1d1U, 0x00000000U, 0xededededU,
+ 0x20202020U, 0xfcfcfcfcU, 0xb1b1b1b1U, 0x5b5b5b5bU,
+ 0x6a6a6a6aU, 0xcbcbcbcbU, 0xbebebebeU, 0x39393939U,
+ 0x4a4a4a4aU, 0x4c4c4c4cU, 0x58585858U, 0xcfcfcfcfU,
+ 0xd0d0d0d0U, 0xefefefefU, 0xaaaaaaaaU, 0xfbfbfbfbU,
+ 0x43434343U, 0x4d4d4d4dU, 0x33333333U, 0x85858585U,
+ 0x45454545U, 0xf9f9f9f9U, 0x02020202U, 0x7f7f7f7fU,
+ 0x50505050U, 0x3c3c3c3cU, 0x9f9f9f9fU, 0xa8a8a8a8U,
+ 0x51515151U, 0xa3a3a3a3U, 0x40404040U, 0x8f8f8f8fU,
+ 0x92929292U, 0x9d9d9d9dU, 0x38383838U, 0xf5f5f5f5U,
+ 0xbcbcbcbcU, 0xb6b6b6b6U, 0xdadadadaU, 0x21212121U,
+ 0x10101010U, 0xffffffffU, 0xf3f3f3f3U, 0xd2d2d2d2U,
+ 0xcdcdcdcdU, 0x0c0c0c0cU, 0x13131313U, 0xececececU,
+ 0x5f5f5f5fU, 0x97979797U, 0x44444444U, 0x17171717U,
+ 0xc4c4c4c4U, 0xa7a7a7a7U, 0x7e7e7e7eU, 0x3d3d3d3dU,
+ 0x64646464U, 0x5d5d5d5dU, 0x19191919U, 0x73737373U,
+ 0x60606060U, 0x81818181U, 0x4f4f4f4fU, 0xdcdcdcdcU,
+ 0x22222222U, 0x2a2a2a2aU, 0x90909090U, 0x88888888U,
+ 0x46464646U, 0xeeeeeeeeU, 0xb8b8b8b8U, 0x14141414U,
+ 0xdedededeU, 0x5e5e5e5eU, 0x0b0b0b0bU, 0xdbdbdbdbU,
+ 0xe0e0e0e0U, 0x32323232U, 0x3a3a3a3aU, 0x0a0a0a0aU,
+ 0x49494949U, 0x06060606U, 0x24242424U, 0x5c5c5c5cU,
+ 0xc2c2c2c2U, 0xd3d3d3d3U, 0xacacacacU, 0x62626262U,
+ 0x91919191U, 0x95959595U, 0xe4e4e4e4U, 0x79797979U,
+ 0xe7e7e7e7U, 0xc8c8c8c8U, 0x37373737U, 0x6d6d6d6dU,
+ 0x8d8d8d8dU, 0xd5d5d5d5U, 0x4e4e4e4eU, 0xa9a9a9a9U,
+ 0x6c6c6c6cU, 0x56565656U, 0xf4f4f4f4U, 0xeaeaeaeaU,
+ 0x65656565U, 0x7a7a7a7aU, 0xaeaeaeaeU, 0x08080808U,
+ 0xbabababaU, 0x78787878U, 0x25252525U, 0x2e2e2e2eU,
+ 0x1c1c1c1cU, 0xa6a6a6a6U, 0xb4b4b4b4U, 0xc6c6c6c6U,
+ 0xe8e8e8e8U, 0xddddddddU, 0x74747474U, 0x1f1f1f1fU,
+ 0x4b4b4b4bU, 0xbdbdbdbdU, 0x8b8b8b8bU, 0x8a8a8a8aU,
+ 0x70707070U, 0x3e3e3e3eU, 0xb5b5b5b5U, 0x66666666U,
+ 0x48484848U, 0x03030303U, 0xf6f6f6f6U, 0x0e0e0e0eU,
+ 0x61616161U, 0x35353535U, 0x57575757U, 0xb9b9b9b9U,
+ 0x86868686U, 0xc1c1c1c1U, 0x1d1d1d1dU, 0x9e9e9e9eU,
+ 0xe1e1e1e1U, 0xf8f8f8f8U, 0x98989898U, 0x11111111U,
+ 0x69696969U, 0xd9d9d9d9U, 0x8e8e8e8eU, 0x94949494U,
+ 0x9b9b9b9bU, 0x1e1e1e1eU, 0x87878787U, 0xe9e9e9e9U,
+ 0xcecececeU, 0x55555555U, 0x28282828U, 0xdfdfdfdfU,
+ 0x8c8c8c8cU, 0xa1a1a1a1U, 0x89898989U, 0x0d0d0d0dU,
+ 0xbfbfbfbfU, 0xe6e6e6e6U, 0x42424242U, 0x68686868U,
+ 0x41414141U, 0x99999999U, 0x2d2d2d2dU, 0x0f0f0f0fU,
+ 0xb0b0b0b0U, 0x54545454U, 0xbbbbbbbbU, 0x16161616U
+};
+
+/* Decrypt Sbox constants (for the substitute bytes operation) */
+
+static const uint32_t Td0[256] =
+{
+ 0x51f4a750U, 0x7e416553U, 0x1a17a4c3U, 0x3a275e96U,
+ 0x3bab6bcbU, 0x1f9d45f1U, 0xacfa58abU, 0x4be30393U,
+ 0x2030fa55U, 0xad766df6U, 0x88cc7691U, 0xf5024c25U,
+ 0x4fe5d7fcU, 0xc52acbd7U, 0x26354480U, 0xb562a38fU,
+ 0xdeb15a49U, 0x25ba1b67U, 0x45ea0e98U, 0x5dfec0e1U,
+ 0xc32f7502U, 0x814cf012U, 0x8d4697a3U, 0x6bd3f9c6U,
+ 0x038f5fe7U, 0x15929c95U, 0xbf6d7aebU, 0x955259daU,
+ 0xd4be832dU, 0x587421d3U, 0x49e06929U, 0x8ec9c844U,
+ 0x75c2896aU, 0xf48e7978U, 0x99583e6bU, 0x27b971ddU,
+ 0xbee14fb6U, 0xf088ad17U, 0xc920ac66U, 0x7dce3ab4U,
+ 0x63df4a18U, 0xe51a3182U, 0x97513360U, 0x62537f45U,
+ 0xb16477e0U, 0xbb6bae84U, 0xfe81a01cU, 0xf9082b94U,
+ 0x70486858U, 0x8f45fd19U, 0x94de6c87U, 0x527bf8b7U,
+ 0xab73d323U, 0x724b02e2U, 0xe31f8f57U, 0x6655ab2aU,
+ 0xb2eb2807U, 0x2fb5c203U, 0x86c57b9aU, 0xd33708a5U,
+ 0x302887f2U, 0x23bfa5b2U, 0x02036abaU, 0xed16825cU,
+ 0x8acf1c2bU, 0xa779b492U, 0xf307f2f0U, 0x4e69e2a1U,
+ 0x65daf4cdU, 0x0605bed5U, 0xd134621fU, 0xc4a6fe8aU,
+ 0x342e539dU, 0xa2f355a0U, 0x058ae132U, 0xa4f6eb75U,
+ 0x0b83ec39U, 0x4060efaaU, 0x5e719f06U, 0xbd6e1051U,
+ 0x3e218af9U, 0x96dd063dU, 0xdd3e05aeU, 0x4de6bd46U,
+ 0x91548db5U, 0x71c45d05U, 0x0406d46fU, 0x605015ffU,
+ 0x1998fb24U, 0xd6bde997U, 0x894043ccU, 0x67d99e77U,
+ 0xb0e842bdU, 0x07898b88U, 0xe7195b38U, 0x79c8eedbU,
+ 0xa17c0a47U, 0x7c420fe9U, 0xf8841ec9U, 0x00000000U,
+ 0x09808683U, 0x322bed48U, 0x1e1170acU, 0x6c5a724eU,
+ 0xfd0efffbU, 0x0f853856U, 0x3daed51eU, 0x362d3927U,
+ 0x0a0fd964U, 0x685ca621U, 0x9b5b54d1U, 0x24362e3aU,
+ 0x0c0a67b1U, 0x9357e70fU, 0xb4ee96d2U, 0x1b9b919eU,
+ 0x80c0c54fU, 0x61dc20a2U, 0x5a774b69U, 0x1c121a16U,
+ 0xe293ba0aU, 0xc0a02ae5U, 0x3c22e043U, 0x121b171dU,
+ 0x0e090d0bU, 0xf28bc7adU, 0x2db6a8b9U, 0x141ea9c8U,
+ 0x57f11985U, 0xaf75074cU, 0xee99ddbbU, 0xa37f60fdU,
+ 0xf701269fU, 0x5c72f5bcU, 0x44663bc5U, 0x5bfb7e34U,
+ 0x8b432976U, 0xcb23c6dcU, 0xb6edfc68U, 0xb8e4f163U,
+ 0xd731dccaU, 0x42638510U, 0x13972240U, 0x84c61120U,
+ 0x854a247dU, 0xd2bb3df8U, 0xaef93211U, 0xc729a16dU,
+ 0x1d9e2f4bU, 0xdcb230f3U, 0x0d8652ecU, 0x77c1e3d0U,
+ 0x2bb3166cU, 0xa970b999U, 0x119448faU, 0x47e96422U,
+ 0xa8fc8cc4U, 0xa0f03f1aU, 0x567d2cd8U, 0x223390efU,
+ 0x87494ec7U, 0xd938d1c1U, 0x8ccaa2feU, 0x98d40b36U,
+ 0xa6f581cfU, 0xa57ade28U, 0xdab78e26U, 0x3fadbfa4U,
+ 0x2c3a9de4U, 0x5078920dU, 0x6a5fcc9bU, 0x547e4662U,
+ 0xf68d13c2U, 0x90d8b8e8U, 0x2e39f75eU, 0x82c3aff5U,
+ 0x9f5d80beU, 0x69d0937cU, 0x6fd52da9U, 0xcf2512b3U,
+ 0xc8ac993bU, 0x10187da7U, 0xe89c636eU, 0xdb3bbb7bU,
+ 0xcd267809U, 0x6e5918f4U, 0xec9ab701U, 0x834f9aa8U,
+ 0xe6956e65U, 0xaaffe67eU, 0x21bccf08U, 0xef15e8e6U,
+ 0xbae79bd9U, 0x4a6f36ceU, 0xea9f09d4U, 0x29b07cd6U,
+ 0x31a4b2afU, 0x2a3f2331U, 0xc6a59430U, 0x35a266c0U,
+ 0x744ebc37U, 0xfc82caa6U, 0xe090d0b0U, 0x33a7d815U,
+ 0xf104984aU, 0x41ecdaf7U, 0x7fcd500eU, 0x1791f62fU,
+ 0x764dd68dU, 0x43efb04dU, 0xccaa4d54U, 0xe49604dfU,
+ 0x9ed1b5e3U, 0x4c6a881bU, 0xc12c1fb8U, 0x4665517fU,
+ 0x9d5eea04U, 0x018c355dU, 0xfa877473U, 0xfb0b412eU,
+ 0xb3671d5aU, 0x92dbd252U, 0xe9105633U, 0x6dd64713U,
+ 0x9ad7618cU, 0x37a10c7aU, 0x59f8148eU, 0xeb133c89U,
+ 0xcea927eeU, 0xb761c935U, 0xe11ce5edU, 0x7a47b13cU,
+ 0x9cd2df59U, 0x55f2733fU, 0x1814ce79U, 0x73c737bfU,
+ 0x53f7cdeaU, 0x5ffdaa5bU, 0xdf3d6f14U, 0x7844db86U,
+ 0xcaaff381U, 0xb968c43eU, 0x3824342cU, 0xc2a3405fU,
+ 0x161dc372U, 0xbce2250cU, 0x283c498bU, 0xff0d9541U,
+ 0x39a80171U, 0x080cb3deU, 0xd8b4e49cU, 0x6456c190U,
+ 0x7bcb8461U, 0xd532b670U, 0x486c5c74U, 0xd0b85742U
+};
+
+static const uint32_t Td1[256] =
+{
+ 0x5051f4a7U, 0x537e4165U, 0xc31a17a4U, 0x963a275eU,
+ 0xcb3bab6bU, 0xf11f9d45U, 0xabacfa58U, 0x934be303U,
+ 0x552030faU, 0xf6ad766dU, 0x9188cc76U, 0x25f5024cU,
+ 0xfc4fe5d7U, 0xd7c52acbU, 0x80263544U, 0x8fb562a3U,
+ 0x49deb15aU, 0x6725ba1bU, 0x9845ea0eU, 0xe15dfec0U,
+ 0x02c32f75U, 0x12814cf0U, 0xa38d4697U, 0xc66bd3f9U,
+ 0xe7038f5fU, 0x9515929cU, 0xebbf6d7aU, 0xda955259U,
+ 0x2dd4be83U, 0xd3587421U, 0x2949e069U, 0x448ec9c8U,
+ 0x6a75c289U, 0x78f48e79U, 0x6b99583eU, 0xdd27b971U,
+ 0xb6bee14fU, 0x17f088adU, 0x66c920acU, 0xb47dce3aU,
+ 0x1863df4aU, 0x82e51a31U, 0x60975133U, 0x4562537fU,
+ 0xe0b16477U, 0x84bb6baeU, 0x1cfe81a0U, 0x94f9082bU,
+ 0x58704868U, 0x198f45fdU, 0x8794de6cU, 0xb7527bf8U,
+ 0x23ab73d3U, 0xe2724b02U, 0x57e31f8fU, 0x2a6655abU,
+ 0x07b2eb28U, 0x032fb5c2U, 0x9a86c57bU, 0xa5d33708U,
+ 0xf2302887U, 0xb223bfa5U, 0xba02036aU, 0x5ced1682U,
+ 0x2b8acf1cU, 0x92a779b4U, 0xf0f307f2U, 0xa14e69e2U,
+ 0xcd65daf4U, 0xd50605beU, 0x1fd13462U, 0x8ac4a6feU,
+ 0x9d342e53U, 0xa0a2f355U, 0x32058ae1U, 0x75a4f6ebU,
+ 0x390b83ecU, 0xaa4060efU, 0x065e719fU, 0x51bd6e10U,
+ 0xf93e218aU, 0x3d96dd06U, 0xaedd3e05U, 0x464de6bdU,
+ 0xb591548dU, 0x0571c45dU, 0x6f0406d4U, 0xff605015U,
+ 0x241998fbU, 0x97d6bde9U, 0xcc894043U, 0x7767d99eU,
+ 0xbdb0e842U, 0x8807898bU, 0x38e7195bU, 0xdb79c8eeU,
+ 0x47a17c0aU, 0xe97c420fU, 0xc9f8841eU, 0x00000000U,
+ 0x83098086U, 0x48322bedU, 0xac1e1170U, 0x4e6c5a72U,
+ 0xfbfd0effU, 0x560f8538U, 0x1e3daed5U, 0x27362d39U,
+ 0x640a0fd9U, 0x21685ca6U, 0xd19b5b54U, 0x3a24362eU,
+ 0xb10c0a67U, 0x0f9357e7U, 0xd2b4ee96U, 0x9e1b9b91U,
+ 0x4f80c0c5U, 0xa261dc20U, 0x695a774bU, 0x161c121aU,
+ 0x0ae293baU, 0xe5c0a02aU, 0x433c22e0U, 0x1d121b17U,
+ 0x0b0e090dU, 0xadf28bc7U, 0xb92db6a8U, 0xc8141ea9U,
+ 0x8557f119U, 0x4caf7507U, 0xbbee99ddU, 0xfda37f60U,
+ 0x9ff70126U, 0xbc5c72f5U, 0xc544663bU, 0x345bfb7eU,
+ 0x768b4329U, 0xdccb23c6U, 0x68b6edfcU, 0x63b8e4f1U,
+ 0xcad731dcU, 0x10426385U, 0x40139722U, 0x2084c611U,
+ 0x7d854a24U, 0xf8d2bb3dU, 0x11aef932U, 0x6dc729a1U,
+ 0x4b1d9e2fU, 0xf3dcb230U, 0xec0d8652U, 0xd077c1e3U,
+ 0x6c2bb316U, 0x99a970b9U, 0xfa119448U, 0x2247e964U,
+ 0xc4a8fc8cU, 0x1aa0f03fU, 0xd8567d2cU, 0xef223390U,
+ 0xc787494eU, 0xc1d938d1U, 0xfe8ccaa2U, 0x3698d40bU,
+ 0xcfa6f581U, 0x28a57adeU, 0x26dab78eU, 0xa43fadbfU,
+ 0xe42c3a9dU, 0x0d507892U, 0x9b6a5fccU, 0x62547e46U,
+ 0xc2f68d13U, 0xe890d8b8U, 0x5e2e39f7U, 0xf582c3afU,
+ 0xbe9f5d80U, 0x7c69d093U, 0xa96fd52dU, 0xb3cf2512U,
+ 0x3bc8ac99U, 0xa710187dU, 0x6ee89c63U, 0x7bdb3bbbU,
+ 0x09cd2678U, 0xf46e5918U, 0x01ec9ab7U, 0xa8834f9aU,
+ 0x65e6956eU, 0x7eaaffe6U, 0x0821bccfU, 0xe6ef15e8U,
+ 0xd9bae79bU, 0xce4a6f36U, 0xd4ea9f09U, 0xd629b07cU,
+ 0xaf31a4b2U, 0x312a3f23U, 0x30c6a594U, 0xc035a266U,
+ 0x37744ebcU, 0xa6fc82caU, 0xb0e090d0U, 0x1533a7d8U,
+ 0x4af10498U, 0xf741ecdaU, 0x0e7fcd50U, 0x2f1791f6U,
+ 0x8d764dd6U, 0x4d43efb0U, 0x54ccaa4dU, 0xdfe49604U,
+ 0xe39ed1b5U, 0x1b4c6a88U, 0xb8c12c1fU, 0x7f466551U,
+ 0x049d5eeaU, 0x5d018c35U, 0x73fa8774U, 0x2efb0b41U,
+ 0x5ab3671dU, 0x5292dbd2U, 0x33e91056U, 0x136dd647U,
+ 0x8c9ad761U, 0x7a37a10cU, 0x8e59f814U, 0x89eb133cU,
+ 0xeecea927U, 0x35b761c9U, 0xede11ce5U, 0x3c7a47b1U,
+ 0x599cd2dfU, 0x3f55f273U, 0x791814ceU, 0xbf73c737U,
+ 0xea53f7cdU, 0x5b5ffdaaU, 0x14df3d6fU, 0x867844dbU,
+ 0x81caaff3U, 0x3eb968c4U, 0x2c382434U, 0x5fc2a340U,
+ 0x72161dc3U, 0x0cbce225U, 0x8b283c49U, 0x41ff0d95U,
+ 0x7139a801U, 0xde080cb3U, 0x9cd8b4e4U, 0x906456c1U,
+ 0x617bcb84U, 0x70d532b6U, 0x74486c5cU, 0x42d0b857U
+};
+
+static const uint32_t Td2[256] =
+{
+ 0xa75051f4U, 0x65537e41U, 0xa4c31a17U, 0x5e963a27U,
+ 0x6bcb3babU, 0x45f11f9dU, 0x58abacfaU, 0x03934be3U,
+ 0xfa552030U, 0x6df6ad76U, 0x769188ccU, 0x4c25f502U,
+ 0xd7fc4fe5U, 0xcbd7c52aU, 0x44802635U, 0xa38fb562U,
+ 0x5a49deb1U, 0x1b6725baU, 0x0e9845eaU, 0xc0e15dfeU,
+ 0x7502c32fU, 0xf012814cU, 0x97a38d46U, 0xf9c66bd3U,
+ 0x5fe7038fU, 0x9c951592U, 0x7aebbf6dU, 0x59da9552U,
+ 0x832dd4beU, 0x21d35874U, 0x692949e0U, 0xc8448ec9U,
+ 0x896a75c2U, 0x7978f48eU, 0x3e6b9958U, 0x71dd27b9U,
+ 0x4fb6bee1U, 0xad17f088U, 0xac66c920U, 0x3ab47dceU,
+ 0x4a1863dfU, 0x3182e51aU, 0x33609751U, 0x7f456253U,
+ 0x77e0b164U, 0xae84bb6bU, 0xa01cfe81U, 0x2b94f908U,
+ 0x68587048U, 0xfd198f45U, 0x6c8794deU, 0xf8b7527bU,
+ 0xd323ab73U, 0x02e2724bU, 0x8f57e31fU, 0xab2a6655U,
+ 0x2807b2ebU, 0xc2032fb5U, 0x7b9a86c5U, 0x08a5d337U,
+ 0x87f23028U, 0xa5b223bfU, 0x6aba0203U, 0x825ced16U,
+ 0x1c2b8acfU, 0xb492a779U, 0xf2f0f307U, 0xe2a14e69U,
+ 0xf4cd65daU, 0xbed50605U, 0x621fd134U, 0xfe8ac4a6U,
+ 0x539d342eU, 0x55a0a2f3U, 0xe132058aU, 0xeb75a4f6U,
+ 0xec390b83U, 0xefaa4060U, 0x9f065e71U, 0x1051bd6eU,
+ 0x8af93e21U, 0x063d96ddU, 0x05aedd3eU, 0xbd464de6U,
+ 0x8db59154U, 0x5d0571c4U, 0xd46f0406U, 0x15ff6050U,
+ 0xfb241998U, 0xe997d6bdU, 0x43cc8940U, 0x9e7767d9U,
+ 0x42bdb0e8U, 0x8b880789U, 0x5b38e719U, 0xeedb79c8U,
+ 0x0a47a17cU, 0x0fe97c42U, 0x1ec9f884U, 0x00000000U,
+ 0x86830980U, 0xed48322bU, 0x70ac1e11U, 0x724e6c5aU,
+ 0xfffbfd0eU, 0x38560f85U, 0xd51e3daeU, 0x3927362dU,
+ 0xd9640a0fU, 0xa621685cU, 0x54d19b5bU, 0x2e3a2436U,
+ 0x67b10c0aU, 0xe70f9357U, 0x96d2b4eeU, 0x919e1b9bU,
+ 0xc54f80c0U, 0x20a261dcU, 0x4b695a77U, 0x1a161c12U,
+ 0xba0ae293U, 0x2ae5c0a0U, 0xe0433c22U, 0x171d121bU,
+ 0x0d0b0e09U, 0xc7adf28bU, 0xa8b92db6U, 0xa9c8141eU,
+ 0x198557f1U, 0x074caf75U, 0xddbbee99U, 0x60fda37fU,
+ 0x269ff701U, 0xf5bc5c72U, 0x3bc54466U, 0x7e345bfbU,
+ 0x29768b43U, 0xc6dccb23U, 0xfc68b6edU, 0xf163b8e4U,
+ 0xdccad731U, 0x85104263U, 0x22401397U, 0x112084c6U,
+ 0x247d854aU, 0x3df8d2bbU, 0x3211aef9U, 0xa16dc729U,
+ 0x2f4b1d9eU, 0x30f3dcb2U, 0x52ec0d86U, 0xe3d077c1U,
+ 0x166c2bb3U, 0xb999a970U, 0x48fa1194U, 0x642247e9U,
+ 0x8cc4a8fcU, 0x3f1aa0f0U, 0x2cd8567dU, 0x90ef2233U,
+ 0x4ec78749U, 0xd1c1d938U, 0xa2fe8ccaU, 0x0b3698d4U,
+ 0x81cfa6f5U, 0xde28a57aU, 0x8e26dab7U, 0xbfa43fadU,
+ 0x9de42c3aU, 0x920d5078U, 0xcc9b6a5fU, 0x4662547eU,
+ 0x13c2f68dU, 0xb8e890d8U, 0xf75e2e39U, 0xaff582c3U,
+ 0x80be9f5dU, 0x937c69d0U, 0x2da96fd5U, 0x12b3cf25U,
+ 0x993bc8acU, 0x7da71018U, 0x636ee89cU, 0xbb7bdb3bU,
+ 0x7809cd26U, 0x18f46e59U, 0xb701ec9aU, 0x9aa8834fU,
+ 0x6e65e695U, 0xe67eaaffU, 0xcf0821bcU, 0xe8e6ef15U,
+ 0x9bd9bae7U, 0x36ce4a6fU, 0x09d4ea9fU, 0x7cd629b0U,
+ 0xb2af31a4U, 0x23312a3fU, 0x9430c6a5U, 0x66c035a2U,
+ 0xbc37744eU, 0xcaa6fc82U, 0xd0b0e090U, 0xd81533a7U,
+ 0x984af104U, 0xdaf741ecU, 0x500e7fcdU, 0xf62f1791U,
+ 0xd68d764dU, 0xb04d43efU, 0x4d54ccaaU, 0x04dfe496U,
+ 0xb5e39ed1U, 0x881b4c6aU, 0x1fb8c12cU, 0x517f4665U,
+ 0xea049d5eU, 0x355d018cU, 0x7473fa87U, 0x412efb0bU,
+ 0x1d5ab367U, 0xd25292dbU, 0x5633e910U, 0x47136dd6U,
+ 0x618c9ad7U, 0x0c7a37a1U, 0x148e59f8U, 0x3c89eb13U,
+ 0x27eecea9U, 0xc935b761U, 0xe5ede11cU, 0xb13c7a47U,
+ 0xdf599cd2U, 0x733f55f2U, 0xce791814U, 0x37bf73c7U,
+ 0xcdea53f7U, 0xaa5b5ffdU, 0x6f14df3dU, 0xdb867844U,
+ 0xf381caafU, 0xc43eb968U, 0x342c3824U, 0x405fc2a3U,
+ 0xc372161dU, 0x250cbce2U, 0x498b283cU, 0x9541ff0dU,
+ 0x017139a8U, 0xb3de080cU, 0xe49cd8b4U, 0xc1906456U,
+ 0x84617bcbU, 0xb670d532U, 0x5c74486cU, 0x5742d0b8U
+};
+
+static const uint32_t Td3[256] =
+{
+ 0xf4a75051U, 0x4165537eU, 0x17a4c31aU, 0x275e963aU,
+ 0xab6bcb3bU, 0x9d45f11fU, 0xfa58abacU, 0xe303934bU,
+ 0x30fa5520U, 0x766df6adU, 0xcc769188U, 0x024c25f5U,
+ 0xe5d7fc4fU, 0x2acbd7c5U, 0x35448026U, 0x62a38fb5U,
+ 0xb15a49deU, 0xba1b6725U, 0xea0e9845U, 0xfec0e15dU,
+ 0x2f7502c3U, 0x4cf01281U, 0x4697a38dU, 0xd3f9c66bU,
+ 0x8f5fe703U, 0x929c9515U, 0x6d7aebbfU, 0x5259da95U,
+ 0xbe832dd4U, 0x7421d358U, 0xe0692949U, 0xc9c8448eU,
+ 0xc2896a75U, 0x8e7978f4U, 0x583e6b99U, 0xb971dd27U,
+ 0xe14fb6beU, 0x88ad17f0U, 0x20ac66c9U, 0xce3ab47dU,
+ 0xdf4a1863U, 0x1a3182e5U, 0x51336097U, 0x537f4562U,
+ 0x6477e0b1U, 0x6bae84bbU, 0x81a01cfeU, 0x082b94f9U,
+ 0x48685870U, 0x45fd198fU, 0xde6c8794U, 0x7bf8b752U,
+ 0x73d323abU, 0x4b02e272U, 0x1f8f57e3U, 0x55ab2a66U,
+ 0xeb2807b2U, 0xb5c2032fU, 0xc57b9a86U, 0x3708a5d3U,
+ 0x2887f230U, 0xbfa5b223U, 0x036aba02U, 0x16825cedU,
+ 0xcf1c2b8aU, 0x79b492a7U, 0x07f2f0f3U, 0x69e2a14eU,
+ 0xdaf4cd65U, 0x05bed506U, 0x34621fd1U, 0xa6fe8ac4U,
+ 0x2e539d34U, 0xf355a0a2U, 0x8ae13205U, 0xf6eb75a4U,
+ 0x83ec390bU, 0x60efaa40U, 0x719f065eU, 0x6e1051bdU,
+ 0x218af93eU, 0xdd063d96U, 0x3e05aeddU, 0xe6bd464dU,
+ 0x548db591U, 0xc45d0571U, 0x06d46f04U, 0x5015ff60U,
+ 0x98fb2419U, 0xbde997d6U, 0x4043cc89U, 0xd99e7767U,
+ 0xe842bdb0U, 0x898b8807U, 0x195b38e7U, 0xc8eedb79U,
+ 0x7c0a47a1U, 0x420fe97cU, 0x841ec9f8U, 0x00000000U,
+ 0x80868309U, 0x2bed4832U, 0x1170ac1eU, 0x5a724e6cU,
+ 0x0efffbfdU, 0x8538560fU, 0xaed51e3dU, 0x2d392736U,
+ 0x0fd9640aU, 0x5ca62168U, 0x5b54d19bU, 0x362e3a24U,
+ 0x0a67b10cU, 0x57e70f93U, 0xee96d2b4U, 0x9b919e1bU,
+ 0xc0c54f80U, 0xdc20a261U, 0x774b695aU, 0x121a161cU,
+ 0x93ba0ae2U, 0xa02ae5c0U, 0x22e0433cU, 0x1b171d12U,
+ 0x090d0b0eU, 0x8bc7adf2U, 0xb6a8b92dU, 0x1ea9c814U,
+ 0xf1198557U, 0x75074cafU, 0x99ddbbeeU, 0x7f60fda3U,
+ 0x01269ff7U, 0x72f5bc5cU, 0x663bc544U, 0xfb7e345bU,
+ 0x4329768bU, 0x23c6dccbU, 0xedfc68b6U, 0xe4f163b8U,
+ 0x31dccad7U, 0x63851042U, 0x97224013U, 0xc6112084U,
+ 0x4a247d85U, 0xbb3df8d2U, 0xf93211aeU, 0x29a16dc7U,
+ 0x9e2f4b1dU, 0xb230f3dcU, 0x8652ec0dU, 0xc1e3d077U,
+ 0xb3166c2bU, 0x70b999a9U, 0x9448fa11U, 0xe9642247U,
+ 0xfc8cc4a8U, 0xf03f1aa0U, 0x7d2cd856U, 0x3390ef22U,
+ 0x494ec787U, 0x38d1c1d9U, 0xcaa2fe8cU, 0xd40b3698U,
+ 0xf581cfa6U, 0x7ade28a5U, 0xb78e26daU, 0xadbfa43fU,
+ 0x3a9de42cU, 0x78920d50U, 0x5fcc9b6aU, 0x7e466254U,
+ 0x8d13c2f6U, 0xd8b8e890U, 0x39f75e2eU, 0xc3aff582U,
+ 0x5d80be9fU, 0xd0937c69U, 0xd52da96fU, 0x2512b3cfU,
+ 0xac993bc8U, 0x187da710U, 0x9c636ee8U, 0x3bbb7bdbU,
+ 0x267809cdU, 0x5918f46eU, 0x9ab701ecU, 0x4f9aa883U,
+ 0x956e65e6U, 0xffe67eaaU, 0xbccf0821U, 0x15e8e6efU,
+ 0xe79bd9baU, 0x6f36ce4aU, 0x9f09d4eaU, 0xb07cd629U,
+ 0xa4b2af31U, 0x3f23312aU, 0xa59430c6U, 0xa266c035U,
+ 0x4ebc3774U, 0x82caa6fcU, 0x90d0b0e0U, 0xa7d81533U,
+ 0x04984af1U, 0xecdaf741U, 0xcd500e7fU, 0x91f62f17U,
+ 0x4dd68d76U, 0xefb04d43U, 0xaa4d54ccU, 0x9604dfe4U,
+ 0xd1b5e39eU, 0x6a881b4cU, 0x2c1fb8c1U, 0x65517f46U,
+ 0x5eea049dU, 0x8c355d01U, 0x877473faU, 0x0b412efbU,
+ 0x671d5ab3U, 0xdbd25292U, 0x105633e9U, 0xd647136dU,
+ 0xd7618c9aU, 0xa10c7a37U, 0xf8148e59U, 0x133c89ebU,
+ 0xa927eeceU, 0x61c935b7U, 0x1ce5ede1U, 0x47b13c7aU,
+ 0xd2df599cU, 0xf2733f55U, 0x14ce7918U, 0xc737bf73U,
+ 0xf7cdea53U, 0xfdaa5b5fU, 0x3d6f14dfU, 0x44db8678U,
+ 0xaff381caU, 0x68c43eb9U, 0x24342c38U, 0xa3405fc2U,
+ 0x1dc37216U, 0xe2250cbcU, 0x3c498b28U, 0x0d9541ffU,
+ 0xa8017139U, 0x0cb3de08U, 0xb4e49cd8U, 0x56c19064U,
+ 0xcb84617bU, 0x32b670d5U, 0x6c5c7448U, 0xb85742d0U
+};
+
+static const uint32_t Td4[256] =
+{
+ 0x52525252U, 0x09090909U, 0x6a6a6a6aU, 0xd5d5d5d5U,
+ 0x30303030U, 0x36363636U, 0xa5a5a5a5U, 0x38383838U,
+ 0xbfbfbfbfU, 0x40404040U, 0xa3a3a3a3U, 0x9e9e9e9eU,
+ 0x81818181U, 0xf3f3f3f3U, 0xd7d7d7d7U, 0xfbfbfbfbU,
+ 0x7c7c7c7cU, 0xe3e3e3e3U, 0x39393939U, 0x82828282U,
+ 0x9b9b9b9bU, 0x2f2f2f2fU, 0xffffffffU, 0x87878787U,
+ 0x34343434U, 0x8e8e8e8eU, 0x43434343U, 0x44444444U,
+ 0xc4c4c4c4U, 0xdedededeU, 0xe9e9e9e9U, 0xcbcbcbcbU,
+ 0x54545454U, 0x7b7b7b7bU, 0x94949494U, 0x32323232U,
+ 0xa6a6a6a6U, 0xc2c2c2c2U, 0x23232323U, 0x3d3d3d3dU,
+ 0xeeeeeeeeU, 0x4c4c4c4cU, 0x95959595U, 0x0b0b0b0bU,
+ 0x42424242U, 0xfafafafaU, 0xc3c3c3c3U, 0x4e4e4e4eU,
+ 0x08080808U, 0x2e2e2e2eU, 0xa1a1a1a1U, 0x66666666U,
+ 0x28282828U, 0xd9d9d9d9U, 0x24242424U, 0xb2b2b2b2U,
+ 0x76767676U, 0x5b5b5b5bU, 0xa2a2a2a2U, 0x49494949U,
+ 0x6d6d6d6dU, 0x8b8b8b8bU, 0xd1d1d1d1U, 0x25252525U,
+ 0x72727272U, 0xf8f8f8f8U, 0xf6f6f6f6U, 0x64646464U,
+ 0x86868686U, 0x68686868U, 0x98989898U, 0x16161616U,
+ 0xd4d4d4d4U, 0xa4a4a4a4U, 0x5c5c5c5cU, 0xccccccccU,
+ 0x5d5d5d5dU, 0x65656565U, 0xb6b6b6b6U, 0x92929292U,
+ 0x6c6c6c6cU, 0x70707070U, 0x48484848U, 0x50505050U,
+ 0xfdfdfdfdU, 0xededededU, 0xb9b9b9b9U, 0xdadadadaU,
+ 0x5e5e5e5eU, 0x15151515U, 0x46464646U, 0x57575757U,
+ 0xa7a7a7a7U, 0x8d8d8d8dU, 0x9d9d9d9dU, 0x84848484U,
+ 0x90909090U, 0xd8d8d8d8U, 0xababababU, 0x00000000U,
+ 0x8c8c8c8cU, 0xbcbcbcbcU, 0xd3d3d3d3U, 0x0a0a0a0aU,
+ 0xf7f7f7f7U, 0xe4e4e4e4U, 0x58585858U, 0x05050505U,
+ 0xb8b8b8b8U, 0xb3b3b3b3U, 0x45454545U, 0x06060606U,
+ 0xd0d0d0d0U, 0x2c2c2c2cU, 0x1e1e1e1eU, 0x8f8f8f8fU,
+ 0xcacacacaU, 0x3f3f3f3fU, 0x0f0f0f0fU, 0x02020202U,
+ 0xc1c1c1c1U, 0xafafafafU, 0xbdbdbdbdU, 0x03030303U,
+ 0x01010101U, 0x13131313U, 0x8a8a8a8aU, 0x6b6b6b6bU,
+ 0x3a3a3a3aU, 0x91919191U, 0x11111111U, 0x41414141U,
+ 0x4f4f4f4fU, 0x67676767U, 0xdcdcdcdcU, 0xeaeaeaeaU,
+ 0x97979797U, 0xf2f2f2f2U, 0xcfcfcfcfU, 0xcecececeU,
+ 0xf0f0f0f0U, 0xb4b4b4b4U, 0xe6e6e6e6U, 0x73737373U,
+ 0x96969696U, 0xacacacacU, 0x74747474U, 0x22222222U,
+ 0xe7e7e7e7U, 0xadadadadU, 0x35353535U, 0x85858585U,
+ 0xe2e2e2e2U, 0xf9f9f9f9U, 0x37373737U, 0xe8e8e8e8U,
+ 0x1c1c1c1cU, 0x75757575U, 0xdfdfdfdfU, 0x6e6e6e6eU,
+ 0x47474747U, 0xf1f1f1f1U, 0x1a1a1a1aU, 0x71717171U,
+ 0x1d1d1d1dU, 0x29292929U, 0xc5c5c5c5U, 0x89898989U,
+ 0x6f6f6f6fU, 0xb7b7b7b7U, 0x62626262U, 0x0e0e0e0eU,
+ 0xaaaaaaaaU, 0x18181818U, 0xbebebebeU, 0x1b1b1b1bU,
+ 0xfcfcfcfcU, 0x56565656U, 0x3e3e3e3eU, 0x4b4b4b4bU,
+ 0xc6c6c6c6U, 0xd2d2d2d2U, 0x79797979U, 0x20202020U,
+ 0x9a9a9a9aU, 0xdbdbdbdbU, 0xc0c0c0c0U, 0xfefefefeU,
+ 0x78787878U, 0xcdcdcdcdU, 0x5a5a5a5aU, 0xf4f4f4f4U,
+ 0x1f1f1f1fU, 0xddddddddU, 0xa8a8a8a8U, 0x33333333U,
+ 0x88888888U, 0x07070707U, 0xc7c7c7c7U, 0x31313131U,
+ 0xb1b1b1b1U, 0x12121212U, 0x10101010U, 0x59595959U,
+ 0x27272727U, 0x80808080U, 0xececececU, 0x5f5f5f5fU,
+ 0x60606060U, 0x51515151U, 0x7f7f7f7fU, 0xa9a9a9a9U,
+ 0x19191919U, 0xb5b5b5b5U, 0x4a4a4a4aU, 0x0d0d0d0dU,
+ 0x2d2d2d2dU, 0xe5e5e5e5U, 0x7a7a7a7aU, 0x9f9f9f9fU,
+ 0x93939393U, 0xc9c9c9c9U, 0x9c9c9c9cU, 0xefefefefU,
+ 0xa0a0a0a0U, 0xe0e0e0e0U, 0x3b3b3b3bU, 0x4d4d4d4dU,
+ 0xaeaeaeaeU, 0x2a2a2a2aU, 0xf5f5f5f5U, 0xb0b0b0b0U,
+ 0xc8c8c8c8U, 0xebebebebU, 0xbbbbbbbbU, 0x3c3c3c3cU,
+ 0x83838383U, 0x53535353U, 0x99999999U, 0x61616161U,
+ 0x17171717U, 0x2b2b2b2bU, 0x04040404U, 0x7e7e7e7eU,
+ 0xbabababaU, 0x77777777U, 0xd6d6d6d6U, 0x26262626U,
+ 0xe1e1e1e1U, 0x69696969U, 0x14141414U, 0x63636363U,
+ 0x55555555U, 0x21212121U, 0x0c0c0c0cU, 0x7d7d7d7dU
+};
+
+/* Rcon is Round Constant; used for encryption key expansion */
+static const uint32_t rcon[RC_LENGTH] =
+{
+ /* for 128-bit blocks, Rijndael never uses more than 10 rcon values */
+ 0x01000000, 0x02000000, 0x04000000, 0x08000000,
+ 0x10000000, 0x20000000, 0x40000000, 0x80000000,
+ 0x1B000000, 0x36000000
+};
+
+
+/*
+ * Expand the cipher key into the encryption key schedule.
+ *
+ * Return the number of rounds for the given cipher key size.
+ * The size of the key schedule depends on the number of rounds
+ * (which can be computed from the size of the key), i.e. 4*(Nr + 1).
+ *
+ * Parameters:
+ * rk AES key schedule 32-bit array to be initialized
+ * cipherKey User key
+ * keyBits AES key size (128, 192, or 256 bits)
+ */
+static int
+rijndael_key_setup_enc(uint32_t rk[], const uint32_t cipherKey[],
+ int keyBits)
+{
+ int i = 0;
+ uint32_t temp;
+
+ rk[0] = cipherKey[0];
+ rk[1] = cipherKey[1];
+ rk[2] = cipherKey[2];
+ rk[3] = cipherKey[3];
+
+ if (keyBits == 128) {
+ for (;;) {
+ temp = rk[3];
+ rk[4] = rk[0] ^
+ (Te4[(temp >> 16) & 0xff] & 0xff000000) ^
+ (Te4[(temp >> 8) & 0xff] & 0x00ff0000) ^
+ (Te4[temp & 0xff] & 0x0000ff00) ^
+ (Te4[temp >> 24] & 0x000000ff) ^
+ rcon[i];
+ rk[5] = rk[1] ^ rk[4];
+ rk[6] = rk[2] ^ rk[5];
+ rk[7] = rk[3] ^ rk[6];
+
+ if (++i == 10) {
+ return (10);
+ }
+ rk += 4;
+ }
+ }
+
+ rk[4] = cipherKey[4];
+ rk[5] = cipherKey[5];
+
+ if (keyBits == 192) {
+ for (;;) {
+ temp = rk[5];
+ rk[6] = rk[0] ^
+ (Te4[(temp >> 16) & 0xff] & 0xff000000) ^
+ (Te4[(temp >> 8) & 0xff] & 0x00ff0000) ^
+ (Te4[temp & 0xff] & 0x0000ff00) ^
+ (Te4[temp >> 24] & 0x000000ff) ^
+ rcon[i];
+ rk[7] = rk[1] ^ rk[6];
+ rk[8] = rk[2] ^ rk[7];
+ rk[9] = rk[3] ^ rk[8];
+
+ if (++i == 8) {
+ return (12);
+ }
+
+ rk[10] = rk[4] ^ rk[9];
+ rk[11] = rk[5] ^ rk[10];
+ rk += 6;
+ }
+ }
+
+ rk[6] = cipherKey[6];
+ rk[7] = cipherKey[7];
+
+ if (keyBits == 256) {
+ for (;;) {
+ temp = rk[7];
+ rk[8] = rk[0] ^
+ (Te4[(temp >> 16) & 0xff] & 0xff000000) ^
+ (Te4[(temp >> 8) & 0xff] & 0x00ff0000) ^
+ (Te4[temp & 0xff] & 0x0000ff00) ^
+ (Te4[temp >> 24] & 0x000000ff) ^
+ rcon[i];
+ rk[9] = rk[1] ^ rk[8];
+ rk[10] = rk[2] ^ rk[9];
+ rk[11] = rk[3] ^ rk[10];
+
+ if (++i == 7) {
+ return (14);
+ }
+ temp = rk[11];
+ rk[12] = rk[4] ^
+ (Te4[temp >> 24] & 0xff000000) ^
+ (Te4[(temp >> 16) & 0xff] & 0x00ff0000) ^
+ (Te4[(temp >> 8) & 0xff] & 0x0000ff00) ^
+ (Te4[temp & 0xff] & 0x000000ff);
+ rk[13] = rk[5] ^ rk[12];
+ rk[14] = rk[6] ^ rk[13];
+ rk[15] = rk[7] ^ rk[14];
+
+ rk += 8;
+ }
+ }
+
+ return (0);
+}
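+
+/*
+ * Sizing note (illustrative): the rk[] array must provide room for
+ * 4 * (Nr + 1) words, i.e. 44 words for a 128-bit key (Nr = 10),
+ * 52 for a 192-bit key (Nr = 12) and 60 for a 256-bit key (Nr = 14).
+ * A hypothetical caller holding a 256-bit key as eight 32-bit words
+ * (key32) would therefore use:
+ *
+ * uint32_t ks[60];
+ * int nr = rijndael_key_setup_enc(ks, key32, 256);
+ *
+ * which leaves nr set to 14.
+ */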
+
+/*
+ * Expand the cipher key into the decryption key schedule.
+ * Return the number of rounds for the given cipher key size.
+ * The size of the key schedule depends on the number of rounds
+ * (which can be computed from the size of the key), i.e. 4*(Nr + 1).
+ *
+ * Parameters:
+ * rk AES key schedule 32-bit array to be initialized
+ * cipherKey User key
+ * keyBits AES key size (128, 192, or 256 bits)
+ */
+static int
+rijndael_key_setup_dec(uint32_t rk[], const uint32_t cipherKey[], int keyBits)
+{
+ int Nr, i, j;
+ uint32_t temp;
+
+ /* expand the cipher key: */
+ Nr = rijndael_key_setup_enc(rk, cipherKey, keyBits);
+
+ /* invert the order of the round keys: */
+ for (i = 0, j = 4 * Nr; i < j; i += 4, j -= 4) {
+ temp = rk[i];
+ rk[i] = rk[j];
+ rk[j] = temp;
+ temp = rk[i + 1];
+ rk[i + 1] = rk[j + 1];
+ rk[j + 1] = temp;
+ temp = rk[i + 2];
+ rk[i + 2] = rk[j + 2];
+ rk[j + 2] = temp;
+ temp = rk[i + 3];
+ rk[i + 3] = rk[j + 3];
+ rk[j + 3] = temp;
+ }
+
+ /*
+ * apply the inverse MixColumn transform to all
+ * round keys but the first and the last:
+ */
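+ /*
+ * Te4[] holds the forward S-box replicated in every byte, so
+ * (Te4[x] & 0xff) is simply S-box(x). Feeding that into Td0..Td3,
+ * which combine the inverse S-box with InvMixColumns, cancels the
+ * two S-box lookups, leaving just InvMixColumns applied to each
+ * byte of the round-key word.
+ */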
+ for (i = 1; i < Nr; i++) {
+ rk += 4;
+ rk[0] = Td0[Te4[rk[0] >> 24] & 0xff] ^
+ Td1[Te4[(rk[0] >> 16) & 0xff] & 0xff] ^
+ Td2[Te4[(rk[0] >> 8) & 0xff] & 0xff] ^
+ Td3[Te4[rk[0] & 0xff] & 0xff];
+ rk[1] = Td0[Te4[rk[1] >> 24] & 0xff] ^
+ Td1[Te4[(rk[1] >> 16) & 0xff] & 0xff] ^
+ Td2[Te4[(rk[1] >> 8) & 0xff] & 0xff] ^
+ Td3[Te4[rk[1] & 0xff] & 0xff];
+ rk[2] = Td0[Te4[rk[2] >> 24] & 0xff] ^
+ Td1[Te4[(rk[2] >> 16) & 0xff] & 0xff] ^
+ Td2[Te4[(rk[2] >> 8) & 0xff] & 0xff] ^
+ Td3[Te4[rk[2] & 0xff] & 0xff];
+ rk[3] = Td0[Te4[rk[3] >> 24] & 0xff] ^
+ Td1[Te4[(rk[3] >> 16) & 0xff] & 0xff] ^
+ Td2[Te4[(rk[3] >> 8) & 0xff] & 0xff] ^
+ Td3[Te4[rk[3] & 0xff] & 0xff];
+ }
+
+ return (Nr);
+}
+
+/*
+ * Expand the 32-bit AES cipher key array into the encryption and decryption
+ * key schedules.
+ *
+ * Parameters:
+ * key AES key schedule to be initialized
+ * keyarr32 User key
+ * keyBits AES key size (128, 192, or 256 bits)
+ */
+static void
+aes_generic_generate(aes_key_t *key, const uint32_t *keyarr32, int keybits)
+{
+ key->nr = rijndael_key_setup_enc(&(key->encr_ks.ks32[0]), keyarr32,
+ keybits);
+ key->nr = rijndael_key_setup_dec(&(key->decr_ks.ks32[0]), keyarr32,
+ keybits);
+}
+
+/*
+ * Encrypt one block of data. The block is assumed to be an array
+ * of four uint32_t values, so a copy for alignment (and byte-order
+ * reversal on little-endian systems) may be necessary on the
+ * input and output byte streams.
+ * The size of the key schedule depends on the number of rounds
+ * (which can be computed from the size of the key), i.e. 4*(Nr + 1).
+ *
+ * Parameters:
+ * rk Key schedule, of aes_ks_t (60 32-bit integers)
+ * Nr Number of rounds
+ * pt Input block (plain text)
+ * ct Output block (crypto text). Can overlap with pt
+ */
+static void
+aes_generic_encrypt(const uint32_t rk[], int Nr, const uint32_t pt[4],
+ uint32_t ct[4])
+{
+ uint32_t s0, s1, s2, s3, t0, t1, t2, t3;
+ int r;
+
+ /*
+ * map byte array block to cipher state
+ * and add initial round key:
+ */
+
+ s0 = pt[0] ^ rk[0];
+ s1 = pt[1] ^ rk[1];
+ s2 = pt[2] ^ rk[2];
+ s3 = pt[3] ^ rk[3];
+
+ /*
+ * Nr - 1 full rounds:
+ */
+
+ r = Nr >> 1;
+
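+ /*
+ * Each pass through the loop below performs two rounds (s -> t,
+ * then t -> s), which is why the counter starts at Nr / 2 and the
+ * loop breaks out after the first half of its final iteration;
+ * the last round is applied separately below.
+ */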
+ for (;;) {
+ t0 = Te0[s0 >> 24] ^
+ Te1[(s1 >> 16) & 0xff] ^
+ Te2[(s2 >> 8) & 0xff] ^
+ Te3[s3 & 0xff] ^
+ rk[4];
+
+ t1 = Te0[s1 >> 24] ^
+ Te1[(s2 >> 16) & 0xff] ^
+ Te2[(s3 >> 8) & 0xff] ^
+ Te3[s0 & 0xff] ^
+ rk[5];
+
+ t2 = Te0[s2 >> 24] ^
+ Te1[(s3 >> 16) & 0xff] ^
+ Te2[(s0 >> 8) & 0xff] ^
+ Te3[s1 & 0xff] ^
+ rk[6];
+
+ t3 = Te0[s3 >> 24] ^
+ Te1[(s0 >> 16) & 0xff] ^
+ Te2[(s1 >> 8) & 0xff] ^
+ Te3[s2 & 0xff] ^
+ rk[7];
+
+ rk += 8;
+
+ if (--r == 0) {
+ break;
+ }
+
+ s0 = Te0[t0 >> 24] ^
+ Te1[(t1 >> 16) & 0xff] ^
+ Te2[(t2 >> 8) & 0xff] ^
+ Te3[t3 & 0xff] ^
+ rk[0];
+
+ s1 = Te0[t1 >> 24] ^
+ Te1[(t2 >> 16) & 0xff] ^
+ Te2[(t3 >> 8) & 0xff] ^
+ Te3[t0 & 0xff] ^
+ rk[1];
+
+ s2 = Te0[t2 >> 24] ^
+ Te1[(t3 >> 16) & 0xff] ^
+ Te2[(t0 >> 8) & 0xff] ^
+ Te3[t1 & 0xff] ^
+ rk[2];
+
+ s3 = Te0[t3 >> 24] ^
+ Te1[(t0 >> 16) & 0xff] ^
+ Te2[(t1 >> 8) & 0xff] ^
+ Te3[t2 & 0xff] ^
+ rk[3];
+ }
+
+ /*
+ * apply last round and
+ * map cipher state to byte array block:
+ */
+
+ s0 = (Te4[(t0 >> 24)] & 0xff000000) ^
+ (Te4[(t1 >> 16) & 0xff] & 0x00ff0000) ^
+ (Te4[(t2 >> 8) & 0xff] & 0x0000ff00) ^
+ (Te4[t3 & 0xff] & 0x000000ff) ^
+ rk[0];
+ ct[0] = s0;
+
+ s1 = (Te4[(t1 >> 24)] & 0xff000000) ^
+ (Te4[(t2 >> 16) & 0xff] & 0x00ff0000) ^
+ (Te4[(t3 >> 8) & 0xff] & 0x0000ff00) ^
+ (Te4[t0 & 0xff] & 0x000000ff) ^
+ rk[1];
+ ct[1] = s1;
+
+ s2 = (Te4[(t2 >> 24)] & 0xff000000) ^
+ (Te4[(t3 >> 16) & 0xff] & 0x00ff0000) ^
+ (Te4[(t0 >> 8) & 0xff] & 0x0000ff00) ^
+ (Te4[t1 & 0xff] & 0x000000ff) ^
+ rk[2];
+ ct[2] = s2;
+
+ s3 = (Te4[(t3 >> 24)] & 0xff000000) ^
+ (Te4[(t0 >> 16) & 0xff] & 0x00ff0000) ^
+ (Te4[(t1 >> 8) & 0xff] & 0x0000ff00) ^
+ (Te4[t2 & 0xff] & 0x000000ff) ^
+ rk[3];
+ ct[3] = s3;
+}
+
+
+/*
+ * Decrypt one block of data. The block is assumed to be an array
+ * of four uint32_t values, so a copy for alignment (and byte-order
+ * reversal on little-endian systems) may be necessary on the
+ * input and output byte streams.
+ * The size of the key schedule depends on the number of rounds
+ * (which can be computed from the size of the key), i.e. 4*(Nr + 1).
+ *
+ * Parameters:
+ * rk Key schedule, of aes_ks_t (60 32-bit integers)
+ * Nr Number of rounds
+ * ct Input block (crypto text)
+ * pt Output block (plain text). Can overlap with ct
+ */
+static void
+aes_generic_decrypt(const uint32_t rk[], int Nr, const uint32_t ct[4],
+ uint32_t pt[4])
+{
+ uint32_t s0, s1, s2, s3, t0, t1, t2, t3;
+ int r;
+
+ /*
+ * map byte array block to cipher state
+ * and add initial round key:
+ */
+ s0 = ct[0] ^ rk[0];
+ s1 = ct[1] ^ rk[1];
+ s2 = ct[2] ^ rk[2];
+ s3 = ct[3] ^ rk[3];
+
+ /*
+ * Nr - 1 full rounds:
+ */
+
+ r = Nr >> 1;
+
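+ /*
+ * As in aes_generic_encrypt(), each pass through the loop performs
+ * two rounds (s -> t, then t -> s), so the counter starts at
+ * Nr / 2 and the final round is applied separately below.
+ */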
+ for (;;) {
+ t0 = Td0[s0 >> 24] ^
+ Td1[(s3 >> 16) & 0xff] ^
+ Td2[(s2 >> 8) & 0xff] ^
+ Td3[s1 & 0xff] ^
+ rk[4];
+
+ t1 = Td0[s1 >> 24] ^
+ Td1[(s0 >> 16) & 0xff] ^
+ Td2[(s3 >> 8) & 0xff] ^
+ Td3[s2 & 0xff] ^
+ rk[5];
+
+ t2 = Td0[s2 >> 24] ^
+ Td1[(s1 >> 16) & 0xff] ^
+ Td2[(s0 >> 8) & 0xff] ^
+ Td3[s3 & 0xff] ^
+ rk[6];
+
+ t3 = Td0[s3 >> 24] ^
+ Td1[(s2 >> 16) & 0xff] ^
+ Td2[(s1 >> 8) & 0xff] ^
+ Td3[s0 & 0xff] ^
+ rk[7];
+
+ rk += 8;
+
+ if (--r == 0) {
+ break;
+ }
+
+ s0 = Td0[t0 >> 24] ^
+ Td1[(t3 >> 16) & 0xff] ^
+ Td2[(t2 >> 8) & 0xff] ^
+ Td3[t1 & 0xff] ^
+ rk[0];
+
+ s1 = Td0[t1 >> 24] ^
+ Td1[(t0 >> 16) & 0xff] ^
+ Td2[(t3 >> 8) & 0xff] ^
+ Td3[t2 & 0xff] ^
+ rk[1];
+
+ s2 = Td0[t2 >> 24] ^
+ Td1[(t1 >> 16) & 0xff] ^
+ Td2[(t0 >> 8) & 0xff] ^
+ Td3[t3 & 0xff] ^
+ rk[2];
+
+ s3 = Td0[t3 >> 24] ^
+ Td1[(t2 >> 16) & 0xff] ^
+ Td2[(t1 >> 8) & 0xff] ^
+ Td3[t0 & 0xff] ^
+ rk[3];
+ }
+
+ /*
+ * apply last round and
+ * map cipher state to byte array block:
+ */
+
+ s0 = (Td4[t0 >> 24] & 0xff000000) ^
+ (Td4[(t3 >> 16) & 0xff] & 0x00ff0000) ^
+ (Td4[(t2 >> 8) & 0xff] & 0x0000ff00) ^
+ (Td4[t1 & 0xff] & 0x000000ff) ^
+ rk[0];
+ pt[0] = s0;
+
+ s1 = (Td4[t1 >> 24] & 0xff000000) ^
+ (Td4[(t0 >> 16) & 0xff] & 0x00ff0000) ^
+ (Td4[(t3 >> 8) & 0xff] & 0x0000ff00) ^
+ (Td4[t2 & 0xff] & 0x000000ff) ^
+ rk[1];
+ pt[1] = s1;
+
+ s2 = (Td4[t2 >> 24] & 0xff000000) ^
+ (Td4[(t1 >> 16) & 0xff] & 0x00ff0000) ^
+ (Td4[(t0 >> 8) & 0xff] & 0x0000ff00) ^
+ (Td4[t3 & 0xff] & 0x000000ff) ^
+ rk[2];
+ pt[2] = s2;
+
+ s3 = (Td4[t3 >> 24] & 0xff000000) ^
+ (Td4[(t2 >> 16) & 0xff] & 0x00ff0000) ^
+ (Td4[(t1 >> 8) & 0xff] & 0x0000ff00) ^
+ (Td4[t0 & 0xff] & 0x000000ff) ^
+ rk[3];
+ pt[3] = s3;
+}
+
+static boolean_t
+aes_generic_will_work(void)
+{
+ return (B_TRUE);
+}
+
+/*
+ * On _ZFS_LITTLE_ENDIAN machines, every 4 bytes of the key must be
+ * reversed before use (hence needs_byteswap below); on _ZFS_BIG_ENDIAN
+ * the key is copied without reversing bytes.
+ *
+ * SPARCv8/v9 uses a key schedule array with 64-bit elements.
+ * X86/AMD64 uses a key schedule array with 32-bit elements.
+ */
+const aes_impl_ops_t aes_generic_impl = {
+ .generate = &aes_generic_generate,
+ .encrypt = &aes_generic_encrypt,
+ .decrypt = &aes_generic_decrypt,
+ .is_supported = &aes_generic_will_work,
+#if defined(_ZFS_LITTLE_ENDIAN)
+ .needs_byteswap = B_TRUE,
+#else
+ .needs_byteswap = B_FALSE,
+#endif
+ .name = "generic"
+};
diff --git a/sys/contrib/openzfs/module/icp/algs/aes/aes_impl_x86-64.c b/sys/contrib/openzfs/module/icp/algs/aes/aes_impl_x86-64.c
new file mode 100644
index 000000000000..19f8fd5012cf
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/algs/aes/aes_impl_x86-64.c
@@ -0,0 +1,63 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#if defined(__x86_64)
+
+#include <sys/simd.h>
+#include <aes/aes_impl.h>
+
+/*
+ * Expand the 32-bit AES cipher key array into the encryption and decryption
+ * key schedules.
+ *
+ * Parameters:
+ * key AES key schedule to be initialized
+ * keyarr32 User key
+ * keyBits AES key size (128, 192, or 256 bits)
+ */
+static void
+aes_x86_64_generate(aes_key_t *key, const uint32_t *keyarr32, int keybits)
+{
+ key->nr = rijndael_key_setup_enc_amd64(&(key->encr_ks.ks32[0]),
+ keyarr32, keybits);
+ key->nr = rijndael_key_setup_dec_amd64(&(key->decr_ks.ks32[0]),
+ keyarr32, keybits);
+}
+
+static boolean_t
+aes_x86_64_will_work(void)
+{
+ return (B_TRUE);
+}
+
+const aes_impl_ops_t aes_x86_64_impl = {
+ .generate = &aes_x86_64_generate,
+ .encrypt = &aes_encrypt_amd64,
+ .decrypt = &aes_decrypt_amd64,
+ .is_supported = &aes_x86_64_will_work,
+ .needs_byteswap = B_FALSE,
+ .name = "x86_64"
+};
+
+#endif /* defined(__x86_64) */
diff --git a/sys/contrib/openzfs/module/icp/algs/aes/aes_modes.c b/sys/contrib/openzfs/module/icp/algs/aes/aes_modes.c
new file mode 100644
index 000000000000..9e4b498fffcb
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/algs/aes/aes_modes.c
@@ -0,0 +1,135 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/zfs_context.h>
+#include <modes/modes.h>
+#include <aes/aes_impl.h>
+
+/* Copy a 16-byte AES block from "in" to "out" */
+void
+aes_copy_block(uint8_t *in, uint8_t *out)
+{
+ if (IS_P2ALIGNED2(in, out, sizeof (uint32_t))) {
+ /* LINTED: pointer alignment */
+ *(uint32_t *)&out[0] = *(uint32_t *)&in[0];
+ /* LINTED: pointer alignment */
+ *(uint32_t *)&out[4] = *(uint32_t *)&in[4];
+ /* LINTED: pointer alignment */
+ *(uint32_t *)&out[8] = *(uint32_t *)&in[8];
+ /* LINTED: pointer alignment */
+ *(uint32_t *)&out[12] = *(uint32_t *)&in[12];
+ } else {
+ AES_COPY_BLOCK(in, out);
+ }
+}
+
+
+/* XOR a 16-byte AES block of data into dst */
+void
+aes_xor_block(uint8_t *data, uint8_t *dst)
+{
+ if (IS_P2ALIGNED2(dst, data, sizeof (uint32_t))) {
+ /* LINTED: pointer alignment */
+ *(uint32_t *)&dst[0] ^= *(uint32_t *)&data[0];
+ /* LINTED: pointer alignment */
+ *(uint32_t *)&dst[4] ^= *(uint32_t *)&data[4];
+ /* LINTED: pointer alignment */
+ *(uint32_t *)&dst[8] ^= *(uint32_t *)&data[8];
+ /* LINTED: pointer alignment */
+ *(uint32_t *)&dst[12] ^= *(uint32_t *)&data[12];
+ } else {
+ AES_XOR_BLOCK(data, dst);
+ }
+}
+
+
+/*
+ * Encrypt multiple blocks of data according to mode.
+ */
+int
+aes_encrypt_contiguous_blocks(void *ctx, char *data, size_t length,
+ crypto_data_t *out)
+{
+ aes_ctx_t *aes_ctx = ctx;
+ int rv;
+
+ if (aes_ctx->ac_flags & CTR_MODE) {
+ rv = ctr_mode_contiguous_blocks(ctx, data, length, out,
+ AES_BLOCK_LEN, aes_encrypt_block, aes_xor_block);
+ } else if (aes_ctx->ac_flags & CCM_MODE) {
+ rv = ccm_mode_encrypt_contiguous_blocks(ctx, data, length,
+ out, AES_BLOCK_LEN, aes_encrypt_block, aes_copy_block,
+ aes_xor_block);
+ } else if (aes_ctx->ac_flags & (GCM_MODE|GMAC_MODE)) {
+ rv = gcm_mode_encrypt_contiguous_blocks(ctx, data, length,
+ out, AES_BLOCK_LEN, aes_encrypt_block, aes_copy_block,
+ aes_xor_block);
+ } else if (aes_ctx->ac_flags & CBC_MODE) {
+ rv = cbc_encrypt_contiguous_blocks(ctx,
+ data, length, out, AES_BLOCK_LEN, aes_encrypt_block,
+ aes_copy_block, aes_xor_block);
+ } else {
+ rv = ecb_cipher_contiguous_blocks(ctx, data, length, out,
+ AES_BLOCK_LEN, aes_encrypt_block);
+ }
+ return (rv);
+}
+
+
+/*
+ * Decrypt multiple blocks of data according to mode.
+ */
+int
+aes_decrypt_contiguous_blocks(void *ctx, char *data, size_t length,
+ crypto_data_t *out)
+{
+ aes_ctx_t *aes_ctx = ctx;
+ int rv;
+
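+ /*
+ * Note that the counter-based modes (CTR, CCM, GCM/GMAC) pass
+ * aes_encrypt_block even on the decrypt path: those modes only
+ * ever run the forward cipher to generate the keystream.
+ */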
+ if (aes_ctx->ac_flags & CTR_MODE) {
+ rv = ctr_mode_contiguous_blocks(ctx, data, length, out,
+ AES_BLOCK_LEN, aes_encrypt_block, aes_xor_block);
+ if (rv == CRYPTO_DATA_LEN_RANGE)
+ rv = CRYPTO_ENCRYPTED_DATA_LEN_RANGE;
+ } else if (aes_ctx->ac_flags & CCM_MODE) {
+ rv = ccm_mode_decrypt_contiguous_blocks(ctx, data, length,
+ out, AES_BLOCK_LEN, aes_encrypt_block, aes_copy_block,
+ aes_xor_block);
+ } else if (aes_ctx->ac_flags & (GCM_MODE|GMAC_MODE)) {
+ rv = gcm_mode_decrypt_contiguous_blocks(ctx, data, length,
+ out, AES_BLOCK_LEN, aes_encrypt_block, aes_copy_block,
+ aes_xor_block);
+ } else if (aes_ctx->ac_flags & CBC_MODE) {
+ rv = cbc_decrypt_contiguous_blocks(ctx, data, length, out,
+ AES_BLOCK_LEN, aes_decrypt_block, aes_copy_block,
+ aes_xor_block);
+ } else {
+ rv = ecb_cipher_contiguous_blocks(ctx, data, length, out,
+ AES_BLOCK_LEN, aes_decrypt_block);
+ if (rv == CRYPTO_DATA_LEN_RANGE)
+ rv = CRYPTO_ENCRYPTED_DATA_LEN_RANGE;
+ }
+ return (rv);
+}
diff --git a/sys/contrib/openzfs/module/icp/algs/edonr/edonr.c b/sys/contrib/openzfs/module/icp/algs/edonr/edonr.c
new file mode 100644
index 000000000000..7c677095f1ef
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/algs/edonr/edonr.c
@@ -0,0 +1,746 @@
+/*
+ * IDI,NTNU
+ *
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://opensource.org/licenses/CDDL-1.0.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ *
+ * Copyright (C) 2009, 2010, Jorn Amundsen <jorn.amundsen@ntnu.no>
+ * Tweaked Edon-R implementation for SUPERCOP, based on NIST API.
+ *
+ * $Id: edonr.c 517 2013-02-17 20:34:39Z joern $
+ */
+/*
+ * Portions copyright (c) 2013, Saso Kiselkov, All rights reserved
+ */
+
+#include <sys/strings.h>
+#include <sys/edonr.h>
+#include <sys/debug.h>
+
+/* big endian support, provides no-ops if run on little endian hosts */
+#include "edonr_byteorder.h"
+
+#define hashState224(x) ((x)->pipe->p256)
+#define hashState256(x) ((x)->pipe->p256)
+#define hashState384(x) ((x)->pipe->p512)
+#define hashState512(x) ((x)->pipe->p512)
+
+/* shift and rotate shortcuts */
+#define shl(x, n) ((x) << n)
+#define shr(x, n) ((x) >> n)
+
+#define rotl32(x, n) (((x) << (n)) | ((x) >> (32 - (n))))
+#define rotr32(x, n) (((x) >> (n)) | ((x) << (32 - (n))))
+
+#define rotl64(x, n) (((x) << (n)) | ((x) >> (64 - (n))))
+#define rotr64(x, n) (((x) >> (n)) | ((x) << (64 - (n))))
+
+#if !defined(__C99_RESTRICT)
+#define restrict /* restrict */
+#endif
+
+#define EDONR_VALID_HASHBITLEN(x) \
+ ((x) == 512 || (x) == 384 || (x) == 256 || (x) == 224)
+
+/* EdonR224 initial double chaining pipe */
+static const uint32_t i224p2[16] = {
+ 0x00010203ul, 0x04050607ul, 0x08090a0bul, 0x0c0d0e0ful,
+ 0x10111213ul, 0x14151617ul, 0x18191a1bul, 0x1c1d1e1ful,
+ 0x20212223ul, 0x24252627ul, 0x28292a2bul, 0x2c2d2e2ful,
+ 0x30313233ul, 0x34353637ul, 0x38393a3bul, 0x3c3d3e3ful,
+};
+
+/* EdonR256 initial double chaining pipe */
+static const uint32_t i256p2[16] = {
+ 0x40414243ul, 0x44454647ul, 0x48494a4bul, 0x4c4d4e4ful,
+ 0x50515253ul, 0x54555657ul, 0x58595a5bul, 0x5c5d5e5ful,
+ 0x60616263ul, 0x64656667ul, 0x68696a6bul, 0x6c6d6e6ful,
+ 0x70717273ul, 0x74757677ul, 0x78797a7bul, 0x7c7d7e7ful,
+};
+
+/* EdonR384 initial double chaining pipe */
+static const uint64_t i384p2[16] = {
+ 0x0001020304050607ull, 0x08090a0b0c0d0e0full,
+ 0x1011121314151617ull, 0x18191a1b1c1d1e1full,
+ 0x2021222324252627ull, 0x28292a2b2c2d2e2full,
+ 0x3031323334353637ull, 0x38393a3b3c3d3e3full,
+ 0x4041424344454647ull, 0x48494a4b4c4d4e4full,
+ 0x5051525354555657ull, 0x58595a5b5c5d5e5full,
+ 0x6061626364656667ull, 0x68696a6b6c6d6e6full,
+ 0x7071727374757677ull, 0x78797a7b7c7d7e7full
+};
+
+/* EdonR512 initial double chaining pipe */
+static const uint64_t i512p2[16] = {
+ 0x8081828384858687ull, 0x88898a8b8c8d8e8full,
+ 0x9091929394959697ull, 0x98999a9b9c9d9e9full,
+ 0xa0a1a2a3a4a5a6a7ull, 0xa8a9aaabacadaeafull,
+ 0xb0b1b2b3b4b5b6b7ull, 0xb8b9babbbcbdbebfull,
+ 0xc0c1c2c3c4c5c6c7ull, 0xc8c9cacbcccdcecfull,
+ 0xd0d1d2d3d4d5d6d7ull, 0xd8d9dadbdcdddedfull,
+ 0xe0e1e2e3e4e5e6e7ull, 0xe8e9eaebecedeeefull,
+ 0xf0f1f2f3f4f5f6f7ull, 0xf8f9fafbfcfdfeffull
+};
+
+/*
+ * First Latin Square
+ * 0 7 1 3 2 4 6 5
+ * 4 1 7 6 3 0 5 2
+ * 7 0 4 2 5 3 1 6
+ * 1 4 0 5 6 2 7 3
+ * 2 3 6 7 1 5 0 4
+ * 5 2 3 1 7 6 4 0
+ * 3 6 5 0 4 7 2 1
+ * 6 5 2 4 0 1 3 7
+ */
+#define LS1_256(c, x0, x1, x2, x3, x4, x5, x6, x7) \
+{ \
+ uint32_t x04, x17, x23, x56, x07, x26; \
+ x04 = x0+x4, x17 = x1+x7, x07 = x04+x17; \
+ s0 = c + x07 + x2; \
+ s1 = rotl32(x07 + x3, 4); \
+ s2 = rotl32(x07 + x6, 8); \
+ x23 = x2 + x3; \
+ s5 = rotl32(x04 + x23 + x5, 22); \
+ x56 = x5 + x6; \
+ s6 = rotl32(x17 + x56 + x0, 24); \
+ x26 = x23+x56; \
+ s3 = rotl32(x26 + x7, 13); \
+ s4 = rotl32(x26 + x1, 17); \
+ s7 = rotl32(x26 + x4, 29); \
+}
+
+#define LS1_512(c, x0, x1, x2, x3, x4, x5, x6, x7) \
+{ \
+ uint64_t x04, x17, x23, x56, x07, x26; \
+ x04 = x0+x4, x17 = x1+x7, x07 = x04+x17; \
+ s0 = c + x07 + x2; \
+ s1 = rotl64(x07 + x3, 5); \
+ s2 = rotl64(x07 + x6, 15); \
+ x23 = x2 + x3; \
+ s5 = rotl64(x04 + x23 + x5, 40); \
+ x56 = x5 + x6; \
+ s6 = rotl64(x17 + x56 + x0, 50); \
+ x26 = x23+x56; \
+ s3 = rotl64(x26 + x7, 22); \
+ s4 = rotl64(x26 + x1, 31); \
+ s7 = rotl64(x26 + x4, 59); \
+}
+
+/*
+ * Second Orthogonal Latin Square
+ * 0 4 2 3 1 6 5 7
+ * 7 6 3 2 5 4 1 0
+ * 5 3 1 6 0 2 7 4
+ * 1 0 5 4 3 7 2 6
+ * 2 1 0 7 4 5 6 3
+ * 3 5 7 0 6 1 4 2
+ * 4 7 6 1 2 0 3 5
+ * 6 2 4 5 7 3 0 1
+ */
+#define LS2_256(c, y0, y1, y2, y3, y4, y5, y6, y7) \
+{ \
+ uint32_t y01, y25, y34, y67, y04, y05, y27, y37; \
+ y01 = y0+y1, y25 = y2+y5, y05 = y01+y25; \
+ t0 = ~c + y05 + y7; \
+ t2 = rotl32(y05 + y3, 9); \
+ y34 = y3+y4, y04 = y01+y34; \
+ t1 = rotl32(y04 + y6, 5); \
+ t4 = rotl32(y04 + y5, 15); \
+ y67 = y6+y7, y37 = y34+y67; \
+ t3 = rotl32(y37 + y2, 11); \
+ t7 = rotl32(y37 + y0, 27); \
+ y27 = y25+y67; \
+ t5 = rotl32(y27 + y4, 20); \
+ t6 = rotl32(y27 + y1, 25); \
+}
+
+#define LS2_512(c, y0, y1, y2, y3, y4, y5, y6, y7) \
+{ \
+ uint64_t y01, y25, y34, y67, y04, y05, y27, y37; \
+ y01 = y0+y1, y25 = y2+y5, y05 = y01+y25; \
+ t0 = ~c + y05 + y7; \
+ t2 = rotl64(y05 + y3, 19); \
+ y34 = y3+y4, y04 = y01+y34; \
+ t1 = rotl64(y04 + y6, 10); \
+ t4 = rotl64(y04 + y5, 36); \
+ y67 = y6+y7, y37 = y34+y67; \
+ t3 = rotl64(y37 + y2, 29); \
+ t7 = rotl64(y37 + y0, 55); \
+ y27 = y25+y67; \
+ t5 = rotl64(y27 + y4, 44); \
+ t6 = rotl64(y27 + y1, 48); \
+}
+
+#define quasi_exform256(r0, r1, r2, r3, r4, r5, r6, r7) \
+{ \
+ uint32_t s04, s17, s23, s56, t01, t25, t34, t67; \
+ s04 = s0 ^ s4, t01 = t0 ^ t1; \
+ r0 = (s04 ^ s1) + (t01 ^ t5); \
+ t67 = t6 ^ t7; \
+ r1 = (s04 ^ s7) + (t2 ^ t67); \
+ s23 = s2 ^ s3; \
+ r7 = (s23 ^ s5) + (t4 ^ t67); \
+ t34 = t3 ^ t4; \
+ r3 = (s23 ^ s4) + (t0 ^ t34); \
+ s56 = s5 ^ s6; \
+ r5 = (s3 ^ s56) + (t34 ^ t6); \
+ t25 = t2 ^ t5; \
+ r6 = (s2 ^ s56) + (t25 ^ t7); \
+ s17 = s1 ^ s7; \
+ r4 = (s0 ^ s17) + (t1 ^ t25); \
+ r2 = (s17 ^ s6) + (t01 ^ t3); \
+}
+
+#define quasi_exform512(r0, r1, r2, r3, r4, r5, r6, r7) \
+{ \
+ uint64_t s04, s17, s23, s56, t01, t25, t34, t67; \
+ s04 = s0 ^ s4, t01 = t0 ^ t1; \
+ r0 = (s04 ^ s1) + (t01 ^ t5); \
+ t67 = t6 ^ t7; \
+ r1 = (s04 ^ s7) + (t2 ^ t67); \
+ s23 = s2 ^ s3; \
+ r7 = (s23 ^ s5) + (t4 ^ t67); \
+ t34 = t3 ^ t4; \
+ r3 = (s23 ^ s4) + (t0 ^ t34); \
+ s56 = s5 ^ s6; \
+ r5 = (s3 ^ s56) + (t34 ^ t6); \
+ t25 = t2 ^ t5; \
+ r6 = (s2 ^ s56) + (t25 ^ t7); \
+ s17 = s1 ^ s7; \
+ r4 = (s0 ^ s17) + (t1 ^ t25); \
+ r2 = (s17 ^ s6) + (t01 ^ t3); \
+}
+
+static size_t
+Q256(size_t bitlen, const uint32_t *data, uint32_t *restrict p)
+{
+ size_t bl;
+
+ for (bl = bitlen; bl >= EdonR256_BLOCK_BITSIZE;
+ bl -= EdonR256_BLOCK_BITSIZE, data += 16) {
+ uint32_t s0, s1, s2, s3, s4, s5, s6, s7, t0, t1, t2, t3, t4,
+ t5, t6, t7;
+ uint32_t p0, p1, p2, p3, p4, p5, p6, p7, q0, q1, q2, q3, q4,
+ q5, q6, q7;
+ const uint32_t defix = 0xaaaaaaaa;
+#if defined(MACHINE_IS_BIG_ENDIAN)
+ uint32_t swp0, swp1, swp2, swp3, swp4, swp5, swp6, swp7, swp8,
+ swp9, swp10, swp11, swp12, swp13, swp14, swp15;
+#define d(j) swp ## j
+#define s32(j) ld_swap32((uint32_t *)data + j, swp ## j)
+#else
+#define d(j) data[j]
+#endif
+
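+ /*
+ * In the rows below, d(j) refers to message word j of the
+ * current block, byte-swapped into native order first on
+ * big-endian hosts.
+ */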
+ /* First row of quasigroup e-transformations */
+#if defined(MACHINE_IS_BIG_ENDIAN)
+ s32(8);
+ s32(9);
+ s32(10);
+ s32(11);
+ s32(12);
+ s32(13);
+ s32(14);
+ s32(15);
+#endif
+ LS1_256(defix, d(15), d(14), d(13), d(12), d(11), d(10), d(9),
+ d(8));
+#if defined(MACHINE_IS_BIG_ENDIAN)
+ s32(0);
+ s32(1);
+ s32(2);
+ s32(3);
+ s32(4);
+ s32(5);
+ s32(6);
+ s32(7);
+#undef s32
+#endif
+ LS2_256(defix, d(0), d(1), d(2), d(3), d(4), d(5), d(6), d(7));
+ quasi_exform256(p0, p1, p2, p3, p4, p5, p6, p7);
+
+ LS1_256(defix, p0, p1, p2, p3, p4, p5, p6, p7);
+ LS2_256(defix, d(8), d(9), d(10), d(11), d(12), d(13), d(14),
+ d(15));
+ quasi_exform256(q0, q1, q2, q3, q4, q5, q6, q7);
+
+ /* Second row of quasigroup e-transformations */
+ LS1_256(defix, p[8], p[9], p[10], p[11], p[12], p[13], p[14],
+ p[15]);
+ LS2_256(defix, p0, p1, p2, p3, p4, p5, p6, p7);
+ quasi_exform256(p0, p1, p2, p3, p4, p5, p6, p7);
+
+ LS1_256(defix, p0, p1, p2, p3, p4, p5, p6, p7);
+ LS2_256(defix, q0, q1, q2, q3, q4, q5, q6, q7);
+ quasi_exform256(q0, q1, q2, q3, q4, q5, q6, q7);
+
+ /* Third row of quasigroup e-transformations */
+ LS1_256(defix, p0, p1, p2, p3, p4, p5, p6, p7);
+ LS2_256(defix, p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7]);
+ quasi_exform256(p0, p1, p2, p3, p4, p5, p6, p7);
+
+ LS1_256(defix, q0, q1, q2, q3, q4, q5, q6, q7);
+ LS2_256(defix, p0, p1, p2, p3, p4, p5, p6, p7);
+ quasi_exform256(q0, q1, q2, q3, q4, q5, q6, q7);
+
+ /* Fourth row of quasigroup e-transformations */
+ LS1_256(defix, d(7), d(6), d(5), d(4), d(3), d(2), d(1), d(0));
+ LS2_256(defix, p0, p1, p2, p3, p4, p5, p6, p7);
+ quasi_exform256(p0, p1, p2, p3, p4, p5, p6, p7);
+
+ LS1_256(defix, p0, p1, p2, p3, p4, p5, p6, p7);
+ LS2_256(defix, q0, q1, q2, q3, q4, q5, q6, q7);
+ quasi_exform256(q0, q1, q2, q3, q4, q5, q6, q7);
+
+ /* Edon-R tweak on the original SHA-3 Edon-R submission. */
+ p[0] ^= d(8) ^ p0;
+ p[1] ^= d(9) ^ p1;
+ p[2] ^= d(10) ^ p2;
+ p[3] ^= d(11) ^ p3;
+ p[4] ^= d(12) ^ p4;
+ p[5] ^= d(13) ^ p5;
+ p[6] ^= d(14) ^ p6;
+ p[7] ^= d(15) ^ p7;
+ p[8] ^= d(0) ^ q0;
+ p[9] ^= d(1) ^ q1;
+ p[10] ^= d(2) ^ q2;
+ p[11] ^= d(3) ^ q3;
+ p[12] ^= d(4) ^ q4;
+ p[13] ^= d(5) ^ q5;
+ p[14] ^= d(6) ^ q6;
+ p[15] ^= d(7) ^ q7;
+ }
+
+#undef d
+ return (bitlen - bl);
+}
+
+/*
+ * Why is this #pragma here?
+ *
+ * Checksum functions like this one can exceed the stack frame size check
+ * Linux imposes on 32-bit platforms (-Wframe-larger-than=1024). We can
+ * safely ignore that diagnostic because in ZoL the function is only
+ * called from a worker thread that won't be using much stack. The only
+ * function that goes over the 1k limit is Q512(), and it only exceeds
+ * that limit by a hair (1248 bytes on ARM32).
+ */
+#include <sys/isa_defs.h> /* for _ILP32 */
+#ifdef _ILP32 /* We're 32-bit, assume small stack frames */
+#pragma GCC diagnostic ignored "-Wframe-larger-than="
+#endif
+
+#if defined(__IBMC__) && defined(_AIX) && defined(__64BIT__)
+static inline size_t
+#else
+static size_t
+#endif
+Q512(size_t bitlen, const uint64_t *data, uint64_t *restrict p)
+{
+ size_t bl;
+
+ for (bl = bitlen; bl >= EdonR512_BLOCK_BITSIZE;
+ bl -= EdonR512_BLOCK_BITSIZE, data += 16) {
+ uint64_t s0, s1, s2, s3, s4, s5, s6, s7, t0, t1, t2, t3, t4,
+ t5, t6, t7;
+ uint64_t p0, p1, p2, p3, p4, p5, p6, p7, q0, q1, q2, q3, q4,
+ q5, q6, q7;
+ const uint64_t defix = 0xaaaaaaaaaaaaaaaaull;
+#if defined(MACHINE_IS_BIG_ENDIAN)
+ uint64_t swp0, swp1, swp2, swp3, swp4, swp5, swp6, swp7, swp8,
+ swp9, swp10, swp11, swp12, swp13, swp14, swp15;
+#define d(j) swp##j
+#define s64(j) ld_swap64((uint64_t *)data+j, swp##j)
+#else
+#define d(j) data[j]
+#endif
+
+ /* First row of quasigroup e-transformations */
+#if defined(MACHINE_IS_BIG_ENDIAN)
+ s64(8);
+ s64(9);
+ s64(10);
+ s64(11);
+ s64(12);
+ s64(13);
+ s64(14);
+ s64(15);
+#endif
+ LS1_512(defix, d(15), d(14), d(13), d(12), d(11), d(10), d(9),
+ d(8));
+#if defined(MACHINE_IS_BIG_ENDIAN)
+ s64(0);
+ s64(1);
+ s64(2);
+ s64(3);
+ s64(4);
+ s64(5);
+ s64(6);
+ s64(7);
+#undef s64
+#endif
+ LS2_512(defix, d(0), d(1), d(2), d(3), d(4), d(5), d(6), d(7));
+ quasi_exform512(p0, p1, p2, p3, p4, p5, p6, p7);
+
+ LS1_512(defix, p0, p1, p2, p3, p4, p5, p6, p7);
+ LS2_512(defix, d(8), d(9), d(10), d(11), d(12), d(13), d(14),
+ d(15));
+ quasi_exform512(q0, q1, q2, q3, q4, q5, q6, q7);
+
+ /* Second row of quasigroup e-transformations */
+ LS1_512(defix, p[8], p[9], p[10], p[11], p[12], p[13], p[14],
+ p[15]);
+ LS2_512(defix, p0, p1, p2, p3, p4, p5, p6, p7);
+ quasi_exform512(p0, p1, p2, p3, p4, p5, p6, p7);
+
+ LS1_512(defix, p0, p1, p2, p3, p4, p5, p6, p7);
+ LS2_512(defix, q0, q1, q2, q3, q4, q5, q6, q7);
+ quasi_exform512(q0, q1, q2, q3, q4, q5, q6, q7);
+
+ /* Third row of quasigroup e-transformations */
+ LS1_512(defix, p0, p1, p2, p3, p4, p5, p6, p7);
+ LS2_512(defix, p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7]);
+ quasi_exform512(p0, p1, p2, p3, p4, p5, p6, p7);
+
+ LS1_512(defix, q0, q1, q2, q3, q4, q5, q6, q7);
+ LS2_512(defix, p0, p1, p2, p3, p4, p5, p6, p7);
+ quasi_exform512(q0, q1, q2, q3, q4, q5, q6, q7);
+
+ /* Fourth row of quasigroup e-transformations */
+ LS1_512(defix, d(7), d(6), d(5), d(4), d(3), d(2), d(1), d(0));
+ LS2_512(defix, p0, p1, p2, p3, p4, p5, p6, p7);
+ quasi_exform512(p0, p1, p2, p3, p4, p5, p6, p7);
+
+ LS1_512(defix, p0, p1, p2, p3, p4, p5, p6, p7);
+ LS2_512(defix, q0, q1, q2, q3, q4, q5, q6, q7);
+ quasi_exform512(q0, q1, q2, q3, q4, q5, q6, q7);
+
+ /* Edon-R tweak on the original SHA-3 Edon-R submission. */
+ p[0] ^= d(8) ^ p0;
+ p[1] ^= d(9) ^ p1;
+ p[2] ^= d(10) ^ p2;
+ p[3] ^= d(11) ^ p3;
+ p[4] ^= d(12) ^ p4;
+ p[5] ^= d(13) ^ p5;
+ p[6] ^= d(14) ^ p6;
+ p[7] ^= d(15) ^ p7;
+ p[8] ^= d(0) ^ q0;
+ p[9] ^= d(1) ^ q1;
+ p[10] ^= d(2) ^ q2;
+ p[11] ^= d(3) ^ q3;
+ p[12] ^= d(4) ^ q4;
+ p[13] ^= d(5) ^ q5;
+ p[14] ^= d(6) ^ q6;
+ p[15] ^= d(7) ^ q7;
+ }
+
+#undef d
+ return (bitlen - bl);
+}
+
+void
+EdonRInit(EdonRState *state, size_t hashbitlen)
+{
+ ASSERT(EDONR_VALID_HASHBITLEN(hashbitlen));
+ switch (hashbitlen) {
+ case 224:
+ state->hashbitlen = 224;
+ state->bits_processed = 0;
+ state->unprocessed_bits = 0;
+ bcopy(i224p2, hashState224(state)->DoublePipe,
+ 16 * sizeof (uint32_t));
+ break;
+
+ case 256:
+ state->hashbitlen = 256;
+ state->bits_processed = 0;
+ state->unprocessed_bits = 0;
+ bcopy(i256p2, hashState256(state)->DoublePipe,
+ 16 * sizeof (uint32_t));
+ break;
+
+ case 384:
+ state->hashbitlen = 384;
+ state->bits_processed = 0;
+ state->unprocessed_bits = 0;
+ bcopy(i384p2, hashState384(state)->DoublePipe,
+ 16 * sizeof (uint64_t));
+ break;
+
+ case 512:
+ state->hashbitlen = 512;
+ state->bits_processed = 0;
+ state->unprocessed_bits = 0;
+ bcopy(i512p2, hashState224(state)->DoublePipe,
+ 16 * sizeof (uint64_t));
+ break;
+ }
+}
+
+
+void
+EdonRUpdate(EdonRState *state, const uint8_t *data, size_t databitlen)
+{
+ uint32_t *data32;
+ uint64_t *data64;
+
+ size_t bits_processed;
+
+ ASSERT(EDONR_VALID_HASHBITLEN(state->hashbitlen));
+ switch (state->hashbitlen) {
+ case 224:
+ case 256:
+ if (state->unprocessed_bits > 0) {
+ /* LastBytes = databitlen / 8 */
+ int LastBytes = (int)databitlen >> 3;
+
+ ASSERT(state->unprocessed_bits + databitlen <=
+ EdonR256_BLOCK_SIZE * 8);
+
+ bcopy(data, hashState256(state)->LastPart
+ + (state->unprocessed_bits >> 3), LastBytes);
+ state->unprocessed_bits += (int)databitlen;
+ databitlen = state->unprocessed_bits;
+ /* LINTED E_BAD_PTR_CAST_ALIGN */
+ data32 = (uint32_t *)hashState256(state)->LastPart;
+ } else
+ /* LINTED E_BAD_PTR_CAST_ALIGN */
+ data32 = (uint32_t *)data;
+
+ bits_processed = Q256(databitlen, data32,
+ hashState256(state)->DoublePipe);
+ state->bits_processed += bits_processed;
+ databitlen -= bits_processed;
+ state->unprocessed_bits = (int)databitlen;
+ if (databitlen > 0) {
+ /* LastBytes = Ceil(databitlen / 8) */
+ int LastBytes =
+ ((~(((-(int)databitlen) >> 3) & 0x01ff)) +
+ 1) & 0x01ff;
+
+ data32 += bits_processed >> 5; /* advance in 32-bit words */
+ bcopy(data32, hashState256(state)->LastPart, LastBytes);
+ }
+ break;
+
+ case 384:
+ case 512:
+ if (state->unprocessed_bits > 0) {
+ /* LastBytes = databitlen / 8 */
+ int LastBytes = (int)databitlen >> 3;
+
+ ASSERT(state->unprocessed_bits + databitlen <=
+ EdonR512_BLOCK_SIZE * 8);
+
+ bcopy(data, hashState512(state)->LastPart
+ + (state->unprocessed_bits >> 3), LastBytes);
+ state->unprocessed_bits += (int)databitlen;
+ databitlen = state->unprocessed_bits;
+ /* LINTED E_BAD_PTR_CAST_ALIGN */
+ data64 = (uint64_t *)hashState512(state)->LastPart;
+ } else
+ /* LINTED E_BAD_PTR_CAST_ALIGN */
+ data64 = (uint64_t *)data;
+
+ bits_processed = Q512(databitlen, data64,
+ hashState512(state)->DoublePipe);
+ state->bits_processed += bits_processed;
+ databitlen -= bits_processed;
+ state->unprocessed_bits = (int)databitlen;
+ if (databitlen > 0) {
+ /* LastBytes = Ceil(databitlen / 8) */
+ int LastBytes =
+ ((~(((-(int)databitlen) >> 3) & 0x03ff)) +
+ 1) & 0x03ff;
+
+ data64 += bits_processed >> 6; /* advance in 64-bit words */
+ bcopy(data64, hashState512(state)->LastPart, LastBytes);
+ }
+ break;
+ }
+}
+
+void
+EdonRFinal(EdonRState *state, uint8_t *hashval)
+{
+ uint32_t *data32;
+ uint64_t *data64, num_bits;
+
+ size_t databitlen;
+ int LastByte, PadOnePosition;
+
+ num_bits = state->bits_processed + state->unprocessed_bits;
+ ASSERT(EDONR_VALID_HASHBITLEN(state->hashbitlen));
+ switch (state->hashbitlen) {
+ case 224:
+ case 256:
+ LastByte = (int)state->unprocessed_bits >> 3;
+ PadOnePosition = 7 - (state->unprocessed_bits & 0x07);
+ hashState256(state)->LastPart[LastByte] =
+ (hashState256(state)->LastPart[LastByte]
+ & (0xff << (PadOnePosition + 1))) ^
+ (0x01 << PadOnePosition);
+ /* LINTED E_BAD_PTR_CAST_ALIGN */
+ data64 = (uint64_t *)hashState256(state)->LastPart;
+
+ if (state->unprocessed_bits < 448) {
+ (void) memset((hashState256(state)->LastPart) +
+ LastByte + 1, 0x00,
+ EdonR256_BLOCK_SIZE - LastByte - 9);
+ databitlen = EdonR256_BLOCK_SIZE * 8;
+#if defined(MACHINE_IS_BIG_ENDIAN)
+ st_swap64(num_bits, data64 + 7);
+#else
+ data64[7] = num_bits;
+#endif
+ } else {
+ (void) memset((hashState256(state)->LastPart) +
+ LastByte + 1, 0x00,
+ EdonR256_BLOCK_SIZE * 2 - LastByte - 9);
+ databitlen = EdonR256_BLOCK_SIZE * 16;
+#if defined(MACHINE_IS_BIG_ENDIAN)
+ st_swap64(num_bits, data64 + 15);
+#else
+ data64[15] = num_bits;
+#endif
+ }
+
+ /* LINTED E_BAD_PTR_CAST_ALIGN */
+ data32 = (uint32_t *)hashState256(state)->LastPart;
+ state->bits_processed += Q256(databitlen, data32,
+ hashState256(state)->DoublePipe);
+ break;
+
+ case 384:
+ case 512:
+ LastByte = (int)state->unprocessed_bits >> 3;
+ PadOnePosition = 7 - (state->unprocessed_bits & 0x07);
+ hashState512(state)->LastPart[LastByte] =
+ (hashState512(state)->LastPart[LastByte]
+ & (0xff << (PadOnePosition + 1))) ^
+ (0x01 << PadOnePosition);
+ /* LINTED E_BAD_PTR_CAST_ALIGN */
+ data64 = (uint64_t *)hashState512(state)->LastPart;
+
+ if (state->unprocessed_bits < 960) {
+ (void) memset((hashState512(state)->LastPart) +
+ LastByte + 1, 0x00,
+ EdonR512_BLOCK_SIZE - LastByte - 9);
+ databitlen = EdonR512_BLOCK_SIZE * 8;
+#if defined(MACHINE_IS_BIG_ENDIAN)
+ st_swap64(num_bits, data64 + 15);
+#else
+ data64[15] = num_bits;
+#endif
+ } else {
+ (void) memset((hashState512(state)->LastPart) +
+ LastByte + 1, 0x00,
+ EdonR512_BLOCK_SIZE * 2 - LastByte - 9);
+ databitlen = EdonR512_BLOCK_SIZE * 16;
+#if defined(MACHINE_IS_BIG_ENDIAN)
+ st_swap64(num_bits, data64 + 31);
+#else
+ data64[31] = num_bits;
+#endif
+ }
+
+ state->bits_processed += Q512(databitlen, data64,
+ hashState512(state)->DoublePipe);
+ break;
+ }
+
+ switch (state->hashbitlen) {
+ case 224: {
+#if defined(MACHINE_IS_BIG_ENDIAN)
+ uint32_t *d32 = (uint32_t *)hashval;
+ uint32_t *s32 = hashState224(state)->DoublePipe + 9;
+ int j;
+
+ for (j = 0; j < EdonR224_DIGEST_SIZE >> 2; j++)
+ st_swap32(s32[j], d32 + j);
+#else
+ bcopy(hashState256(state)->DoublePipe + 9, hashval,
+ EdonR224_DIGEST_SIZE);
+#endif
+ break;
+ }
+ case 256: {
+#if defined(MACHINE_IS_BIG_ENDIAN)
+ uint32_t *d32 = (uint32_t *)hashval;
+ uint32_t *s32 = hashState224(state)->DoublePipe + 8;
+ int j;
+
+ for (j = 0; j < EdonR256_DIGEST_SIZE >> 2; j++)
+ st_swap32(s32[j], d32 + j);
+#else
+ bcopy(hashState256(state)->DoublePipe + 8, hashval,
+ EdonR256_DIGEST_SIZE);
+#endif
+ break;
+ }
+ case 384: {
+#if defined(MACHINE_IS_BIG_ENDIAN)
+ uint64_t *d64 = (uint64_t *)hashval;
+ uint64_t *s64 = hashState384(state)->DoublePipe + 10;
+ int j;
+
+ for (j = 0; j < EdonR384_DIGEST_SIZE >> 3; j++)
+ st_swap64(s64[j], d64 + j);
+#else
+ bcopy(hashState384(state)->DoublePipe + 10, hashval,
+ EdonR384_DIGEST_SIZE);
+#endif
+ break;
+ }
+ case 512: {
+#if defined(MACHINE_IS_BIG_ENDIAN)
+ uint64_t *d64 = (uint64_t *)hashval;
+ uint64_t *s64 = hashState512(state)->DoublePipe + 8;
+ int j;
+
+ for (j = 0; j < EdonR512_DIGEST_SIZE >> 3; j++)
+ st_swap64(s64[j], d64 + j);
+#else
+ bcopy(hashState512(state)->DoublePipe + 8, hashval,
+ EdonR512_DIGEST_SIZE);
+#endif
+ break;
+ }
+ }
+}
+
+
+void
+EdonRHash(size_t hashbitlen, const uint8_t *data, size_t databitlen,
+ uint8_t *hashval)
+{
+ EdonRState state;
+
+ EdonRInit(&state, hashbitlen);
+ EdonRUpdate(&state, data, databitlen);
+ EdonRFinal(&state, hashval);
+}
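+
+/*
+ * Illustrative one-shot use (note that databitlen is a length in bits,
+ * not bytes); buf and buflen stand for the caller's message buffer and
+ * its byte length:
+ *
+ * uint8_t digest[EdonR512_DIGEST_SIZE];
+ * EdonRHash(512, buf, buflen * 8, digest);
+ */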
+
+#ifdef _KERNEL
+EXPORT_SYMBOL(EdonRInit);
+EXPORT_SYMBOL(EdonRUpdate);
+EXPORT_SYMBOL(EdonRHash);
+EXPORT_SYMBOL(EdonRFinal);
+#endif
diff --git a/sys/contrib/openzfs/module/icp/algs/edonr/edonr_byteorder.h b/sys/contrib/openzfs/module/icp/algs/edonr/edonr_byteorder.h
new file mode 100644
index 000000000000..2b5d48287f26
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/algs/edonr/edonr_byteorder.h
@@ -0,0 +1,216 @@
+/*
+ * IDI,NTNU
+ *
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://opensource.org/licenses/CDDL-1.0.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ *
+ * Copyright (C) 2009, 2010, Jorn Amundsen <jorn.amundsen@ntnu.no>
+ *
+ * C header file to determine the compile machine's byte order. Take care
+ * when cross compiling.
+ *
+ * $Id: byteorder.h 517 2013-02-17 20:34:39Z joern $
+ */
+/*
+ * Portions copyright (c) 2013, Saso Kiselkov, All rights reserved
+ */
+
+#ifndef _CRYPTO_EDONR_BYTEORDER_H
+#define _CRYPTO_EDONR_BYTEORDER_H
+
+#include <sys/sysmacros.h>
+#include <sys/param.h>
+
+#if defined(__BYTE_ORDER)
+#if (__BYTE_ORDER == __BIG_ENDIAN)
+#define MACHINE_IS_BIG_ENDIAN
+#elif (__BYTE_ORDER == __LITTLE_ENDIAN)
+#define MACHINE_IS_LITTLE_ENDIAN
+#endif
+#elif defined(BYTE_ORDER)
+#if (BYTE_ORDER == BIG_ENDIAN)
+#define MACHINE_IS_BIG_ENDIAN
+#elif (BYTE_ORDER == LITTLE_ENDIAN)
+#define MACHINE_IS_LITTLE_ENDIAN
+#endif
+#endif /* __BYTE_ORDER || BYTE_ORDER */
+
+#if !defined(MACHINE_IS_BIG_ENDIAN) && !defined(MACHINE_IS_LITTLE_ENDIAN)
+#if defined(_ZFS_BIG_ENDIAN) || defined(_MIPSEB)
+#define MACHINE_IS_BIG_ENDIAN
+#endif
+#if defined(_ZFS_LITTLE_ENDIAN) || defined(_MIPSEL)
+#define MACHINE_IS_LITTLE_ENDIAN
+#endif
+#endif /* !MACHINE_IS_BIG_ENDIAN && !MACHINE_IS_LITTLE_ENDIAN */
+
+#if !defined(MACHINE_IS_BIG_ENDIAN) && !defined(MACHINE_IS_LITTLE_ENDIAN)
+#error unknown machine byte sex
+#endif
+
+#define BYTEORDER_INCLUDED
+
+#if defined(MACHINE_IS_BIG_ENDIAN)
+/*
+ * Byte swapping macros for big endian architectures and compilers,
+ * add as appropriate for other architectures and/or compilers.
+ *
+ * ld_swap64(src, dst) : uint64_t dst = byte-reversed *(src)
+ * st_swap64(src, dst) : *(dst) = byte-reversed uint64_t src
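+ *
+ * Q256(), for example, loads message word j on big-endian hosts with
+ * ld_swap32((uint32_t *)data + j, swp), leaving swp holding the
+ * byte-reversed (native-order) value.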
+ */
+
+#if defined(__PPC__) || defined(_ARCH_PPC)
+
+#if defined(__64BIT__)
+#if defined(_ARCH_PWR7)
+#define aix_ld_swap64(s64, d64)\
+ __asm__("ldbrx %0,0,%1" : "=r"(d64) : "r"(s64))
+#define aix_st_swap64(s64, d64)\
+ __asm__ volatile("stdbrx %1,0,%0" : : "r"(d64), "r"(s64))
+#else
+#define aix_ld_swap64(s64, d64) \
+{ \
+ uint64_t *s4 = 0, h; /* initialize to zero for gcc warning */ \
+ \
+ __asm__("addi %0,%3,4;lwbrx %1,0,%3;lwbrx %2,0,%0;rldimi %1,%2,32,0"\
+ : "+r"(s4), "=r"(d64), "=r"(h) : "b"(s64)); \
+}
+
+#define aix_st_swap64(s64, d64) \
+{ \
+ uint64_t *s4 = 0, h; /* initialize to zero for gcc warning */ \
+ h = (s64) >> 32; \
+ __asm__ volatile("addi %0,%3,4;stwbrx %1,0,%3;stwbrx %2,0,%0" \
+ : "+r"(s4) : "r"(s64), "r"(h), "b"(d64)); \
+}
+#endif /* 64BIT && PWR7 */
+#else
+#define aix_ld_swap64(s64, d64) \
+{ \
+ uint32_t *s4 = 0, h, l; /* initialize to zero for gcc warning */\
+ __asm__("addi %0,%3,4;lwbrx %1,0,%3;lwbrx %2,0,%0" \
+ : "+r"(s4), "=r"(l), "=r"(h) : "b"(s64)); \
+ d64 = ((uint64_t)h<<32) | l; \
+}
+
+#define aix_st_swap64(s64, d64) \
+{ \
+ uint32_t *s4 = 0, h, l; /* initialize to zero for gcc warning */\
+ l = (s64) & 0xfffffffful, h = (s64) >> 32; \
+ __asm__ volatile("addi %0,%3,4;stwbrx %1,0,%3;stwbrx %2,0,%0" \
+ : "+r"(s4) : "r"(l), "r"(h), "b"(d64)); \
+}
+#endif /* __64BIT__ */
+#define aix_ld_swap32(s32, d32)\
+ __asm__("lwbrx %0,0,%1" : "=r"(d32) : "r"(s32))
+#define aix_st_swap32(s32, d32)\
+ __asm__ volatile("stwbrx %1,0,%0" : : "r"(d32), "r"(s32))
+#define ld_swap32(s, d) aix_ld_swap32(s, d)
+#define st_swap32(s, d) aix_st_swap32(s, d)
+#define ld_swap64(s, d) aix_ld_swap64(s, d)
+#define st_swap64(s, d) aix_st_swap64(s, d)
+#endif /* __PPC__ || _ARCH_PPC */
+
+#if defined(__sparc)
+#if !defined(__arch64__) && !defined(__sparcv8) && defined(__sparcv9)
+#define __arch64__
+#endif
+#if defined(__GNUC__) || (defined(__SUNPRO_C) && __SUNPRO_C > 0x590)
+/* need Sun Studio C 5.10 and above for GNU inline assembly */
+#if defined(__arch64__)
+#define sparc_ld_swap64(s64, d64) \
+ __asm__("ldxa [%1]0x88,%0" : "=r"(d64) : "r"(s64))
+#define sparc_st_swap64(s64, d64) \
+ __asm__ volatile("stxa %0,[%1]0x88" : : "r"(s64), "r"(d64))
+#define st_swap64(s, d) sparc_st_swap64(s, d)
+#else
+#define sparc_ld_swap64(s64, d64) \
+{ \
+ uint32_t *s4, h, l; \
+ __asm__("add %3,4,%0\n\tlda [%3]0x88,%1\n\tlda [%0]0x88,%2" \
+ : "+r"(s4), "=r"(l), "=r"(h) : "r"(s64)); \
+ d64 = ((uint64_t)h<<32) | l; \
+}
+#define sparc_st_swap64(s64, d64) \
+{ \
+ uint32_t *s4, h, l; \
+ l = (s64) & 0xfffffffful, h = (s64) >> 32; \
+ __asm__ volatile("add %3,4,%0\n\tsta %1,[%3]0x88\n\tsta %2,[%0]0x88"\
+ : "+r"(s4) : "r"(l), "r"(h), "r"(d64)); \
+}
+#endif /* sparc64 */
+#define sparc_ld_swap32(s32, d32)\
+ __asm__("lda [%1]0x88,%0" : "=r"(d32) : "r"(s32))
+#define sparc_st_swap32(s32, d32)\
+ __asm__ volatile("sta %0,[%1]0x88" : : "r"(s32), "r"(d32))
+#define ld_swap32(s, d) sparc_ld_swap32(s, d)
+#define st_swap32(s, d) sparc_st_swap32(s, d)
+#define ld_swap64(s, d) sparc_ld_swap64(s, d)
+#define st_swap64(s, d) sparc_st_swap64(s, d)
+#endif /* GCC || Sun Studio C > 5.9 */
+#endif /* sparc */
+
+/* GCC fallback */
+#if ((__GNUC__ >= 4) || defined(__PGIC__)) && !defined(ld_swap32)
+#define ld_swap32(s, d) (d = __builtin_bswap32(*(s)))
+#define st_swap32(s, d) (*(d) = __builtin_bswap32(s))
+#endif /* GCC4/PGIC && !swap32 */
+#if ((__GNUC__ >= 4) || defined(__PGIC__)) && !defined(ld_swap64)
+#define ld_swap64(s, d) (d = __builtin_bswap64(*(s)))
+#define st_swap64(s, d) (*(d) = __builtin_bswap64(s))
+#endif /* GCC4/PGIC && !swap64 */
+
+/* generic fallback */
+#if !defined(ld_swap32)
+#define ld_swap32(s, d) \
+ (d = (*(s) >> 24) | (*(s) >> 8 & 0xff00) | \
+ (*(s) << 8 & 0xff0000) | (*(s) << 24))
+#define st_swap32(s, d) \
+ (*(d) = ((s) >> 24) | ((s) >> 8 & 0xff00) | \
+ ((s) << 8 & 0xff0000) | ((s) << 24))
+#endif
+#if !defined(ld_swap64)
+#define ld_swap64(s, d) \
+ (d = (*(s) >> 56) | (*(s) >> 40 & 0xff00) | \
+ (*(s) >> 24 & 0xff0000) | (*(s) >> 8 & 0xff000000) | \
+ (*(s) & 0xff000000) << 8 | (*(s) & 0xff0000) << 24 | \
+ (*(s) & 0xff00) << 40 | *(s) << 56)
+#define st_swap64(s, d) \
+ (*(d) = ((s) >> 56) | ((s) >> 40 & 0xff00) | \
+ ((s) >> 24 & 0xff0000) | ((s) >> 8 & 0xff000000) | \
+ ((s) & 0xff000000) << 8 | ((s) & 0xff0000) << 24 | \
+ ((s) & 0xff00) << 40 | (s) << 56)
+#endif
+
+#endif /* MACHINE_IS_BIG_ENDIAN */
+
+
+#if defined(MACHINE_IS_LITTLE_ENDIAN)
+/* replace swaps with simple assignments on little endian systems */
+#undef ld_swap32
+#undef st_swap32
+#define ld_swap32(s, d) (d = *(s))
+#define st_swap32(s, d) (*(d) = s)
+#undef ld_swap64
+#undef st_swap64
+#define ld_swap64(s, d) (d = *(s))
+#define st_swap64(s, d) (*(d) = s)
+#endif /* MACHINE_IS_LITTLE_ENDIAN */
+
+#endif /* _CRYPTO_EDONR_BYTEORDER_H */
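
The header above reduces to two primitives per word size: a byte-swapping load
(ld_swap32/ld_swap64) and a byte-swapping store (st_swap32/st_swap64), which
degenerate to plain assignments on little-endian machines. The following sketch
is editorial and not part of the patch; it illustrates what the 64-bit load
path does via the GCC/Clang __builtin_bswap64 fallback, and the printed value
assumes a little-endian host.

/*
 * Editorial sketch (not part of the patch): the effect of ld_swap64() on a
 * little-endian host, using the __builtin_bswap64 fallback from the header.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Load a 64-bit word and byte-swap it, as the generic fallback does. */
static uint64_t
ld_swap64_sketch(const uint64_t *src)
{
	return (__builtin_bswap64(*src));
}

int
main(void)
{
	/* The bytes 01 02 ... 08 as they sit in memory. */
	const uint8_t raw[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };
	uint64_t word, swapped;

	memcpy(&word, raw, sizeof (word));
	swapped = ld_swap64_sketch(&word);

	/* On a little-endian host this prints 0102030405060708. */
	printf("%016llx\n", (unsigned long long)swapped);
	return (0);
}
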
diff --git a/sys/contrib/openzfs/module/icp/algs/modes/cbc.c b/sys/contrib/openzfs/module/icp/algs/modes/cbc.c
new file mode 100644
index 000000000000..85864f56dead
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/algs/modes/cbc.c
@@ -0,0 +1,273 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/zfs_context.h>
+#include <modes/modes.h>
+#include <sys/crypto/common.h>
+#include <sys/crypto/impl.h>
+
+/*
+ * Algorithm independent CBC functions.
+ */
+int
+cbc_encrypt_contiguous_blocks(cbc_ctx_t *ctx, char *data, size_t length,
+ crypto_data_t *out, size_t block_size,
+ int (*encrypt)(const void *, const uint8_t *, uint8_t *),
+ void (*copy_block)(uint8_t *, uint8_t *),
+ void (*xor_block)(uint8_t *, uint8_t *))
+{
+ size_t remainder = length;
+ size_t need = 0;
+ uint8_t *datap = (uint8_t *)data;
+ uint8_t *blockp;
+ uint8_t *lastp;
+ void *iov_or_mp;
+ offset_t offset;
+ uint8_t *out_data_1;
+ uint8_t *out_data_2;
+ size_t out_data_1_len;
+
+ if (length + ctx->cbc_remainder_len < block_size) {
+ /* accumulate bytes here and return */
+ bcopy(datap,
+ (uint8_t *)ctx->cbc_remainder + ctx->cbc_remainder_len,
+ length);
+ ctx->cbc_remainder_len += length;
+ ctx->cbc_copy_to = datap;
+ return (CRYPTO_SUCCESS);
+ }
+
+ lastp = (uint8_t *)ctx->cbc_iv;
+ crypto_init_ptrs(out, &iov_or_mp, &offset);
+
+ do {
+ /* Unprocessed data from last call. */
+ if (ctx->cbc_remainder_len > 0) {
+ need = block_size - ctx->cbc_remainder_len;
+
+ if (need > remainder)
+ return (CRYPTO_DATA_LEN_RANGE);
+
+ bcopy(datap, &((uint8_t *)ctx->cbc_remainder)
+ [ctx->cbc_remainder_len], need);
+
+ blockp = (uint8_t *)ctx->cbc_remainder;
+ } else {
+ blockp = datap;
+ }
+
+ /*
+ * XOR the previous cipher block or IV with the
+ * current clear block.
+ */
+ xor_block(blockp, lastp);
+ encrypt(ctx->cbc_keysched, lastp, lastp);
+ crypto_get_ptrs(out, &iov_or_mp, &offset, &out_data_1,
+ &out_data_1_len, &out_data_2, block_size);
+
+ /* copy block to where it belongs */
+ if (out_data_1_len == block_size) {
+ copy_block(lastp, out_data_1);
+ } else {
+ bcopy(lastp, out_data_1, out_data_1_len);
+ if (out_data_2 != NULL) {
+ bcopy(lastp + out_data_1_len,
+ out_data_2,
+ block_size - out_data_1_len);
+ }
+ }
+ /* update offset */
+ out->cd_offset += block_size;
+
+ /* Update pointer to next block of data to be processed. */
+ if (ctx->cbc_remainder_len != 0) {
+ datap += need;
+ ctx->cbc_remainder_len = 0;
+ } else {
+ datap += block_size;
+ }
+
+ remainder = (size_t)&data[length] - (size_t)datap;
+
+ /* Incomplete last block. */
+ if (remainder > 0 && remainder < block_size) {
+ bcopy(datap, ctx->cbc_remainder, remainder);
+ ctx->cbc_remainder_len = remainder;
+ ctx->cbc_copy_to = datap;
+ goto out;
+ }
+ ctx->cbc_copy_to = NULL;
+
+ } while (remainder > 0);
+
+out:
+ /*
+ * Save the last encrypted block in the context.
+ */
+ if (ctx->cbc_lastp != NULL) {
+ copy_block((uint8_t *)ctx->cbc_lastp, (uint8_t *)ctx->cbc_iv);
+ ctx->cbc_lastp = (uint8_t *)ctx->cbc_iv;
+ }
+
+ return (CRYPTO_SUCCESS);
+}
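
cbc_encrypt_contiguous_blocks() above carries two pieces of state between
calls: a partial block in cbc_remainder and the previous ciphertext block in
cbc_iv, which gets XORed into the next plaintext block. Below is a minimal
editorial sketch of that chaining rule, C[i] = E(K, P[i] xor C[i-1]) with
C[-1] = IV; it is not part of the patch, and the block cipher is replaced by a
throwaway byte rotation, so only the chaining structure is meaningful.

/*
 * Editorial sketch (not part of the patch): the CBC chaining rule
 * C[i] = E(K, P[i] xor C[i-1]), with C[-1] = IV.  The "cipher" here is a
 * toy byte rotation purely so the chaining is visible; the real code is
 * handed encrypt()/xor_block()/copy_block() callbacks for the actual cipher.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define	BLK	16

/* Toy stand-in for encrypt(ctx->cbc_keysched, in, out). */
static void
toy_encrypt(const uint8_t in[BLK], uint8_t out[BLK])
{
	for (int i = 0; i < BLK; i++)
		out[i] = (uint8_t)((in[i] << 1) | (in[i] >> 7));
}

static void
cbc_encrypt_sketch(const uint8_t *pt, size_t nblk, const uint8_t iv[BLK],
    uint8_t *ct)
{
	uint8_t prev[BLK], tmp[BLK];

	memcpy(prev, iv, BLK);			/* lastp starts at the IV */
	for (size_t b = 0; b < nblk; b++) {
		for (int i = 0; i < BLK; i++)	/* xor_block() */
			tmp[i] = pt[b * BLK + i] ^ prev[i];
		toy_encrypt(tmp, &ct[b * BLK]);	/* encrypt() */
		memcpy(prev, &ct[b * BLK], BLK); /* carried in cbc_iv */
	}
}

int
main(void)
{
	uint8_t iv[BLK] = { 0 };
	uint8_t pt[2 * BLK] = "thirty-two bytes of plaintext!!";
	uint8_t ct[2 * BLK];

	cbc_encrypt_sketch(pt, 2, iv, ct);
	for (size_t i = 0; i < sizeof (ct); i++)
		printf("%02x", ct[i]);
	printf("\n");
	return (0);
}
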
+
+#define OTHER(a, ctx) \
+ (((a) == (ctx)->cbc_lastblock) ? (ctx)->cbc_iv : (ctx)->cbc_lastblock)
+
+/* ARGSUSED */
+int
+cbc_decrypt_contiguous_blocks(cbc_ctx_t *ctx, char *data, size_t length,
+ crypto_data_t *out, size_t block_size,
+ int (*decrypt)(const void *, const uint8_t *, uint8_t *),
+ void (*copy_block)(uint8_t *, uint8_t *),
+ void (*xor_block)(uint8_t *, uint8_t *))
+{
+ size_t remainder = length;
+ size_t need = 0;
+ uint8_t *datap = (uint8_t *)data;
+ uint8_t *blockp;
+ uint8_t *lastp;
+ void *iov_or_mp;
+ offset_t offset;
+ uint8_t *out_data_1;
+ uint8_t *out_data_2;
+ size_t out_data_1_len;
+
+ if (length + ctx->cbc_remainder_len < block_size) {
+ /* accumulate bytes here and return */
+ bcopy(datap,
+ (uint8_t *)ctx->cbc_remainder + ctx->cbc_remainder_len,
+ length);
+ ctx->cbc_remainder_len += length;
+ ctx->cbc_copy_to = datap;
+ return (CRYPTO_SUCCESS);
+ }
+
+ lastp = ctx->cbc_lastp;
+ crypto_init_ptrs(out, &iov_or_mp, &offset);
+
+ do {
+ /* Unprocessed data from last call. */
+ if (ctx->cbc_remainder_len > 0) {
+ need = block_size - ctx->cbc_remainder_len;
+
+ if (need > remainder)
+ return (CRYPTO_ENCRYPTED_DATA_LEN_RANGE);
+
+ bcopy(datap, &((uint8_t *)ctx->cbc_remainder)
+ [ctx->cbc_remainder_len], need);
+
+ blockp = (uint8_t *)ctx->cbc_remainder;
+ } else {
+ blockp = datap;
+ }
+
+ /* LINTED: pointer alignment */
+ copy_block(blockp, (uint8_t *)OTHER((uint64_t *)lastp, ctx));
+
+ decrypt(ctx->cbc_keysched, blockp,
+ (uint8_t *)ctx->cbc_remainder);
+ blockp = (uint8_t *)ctx->cbc_remainder;
+
+ /*
+ * XOR the previous cipher block or IV with the
+ * currently decrypted block.
+ */
+ xor_block(lastp, blockp);
+
+ /* LINTED: pointer alignment */
+ lastp = (uint8_t *)OTHER((uint64_t *)lastp, ctx);
+
+ crypto_get_ptrs(out, &iov_or_mp, &offset, &out_data_1,
+ &out_data_1_len, &out_data_2, block_size);
+
+ bcopy(blockp, out_data_1, out_data_1_len);
+ if (out_data_2 != NULL) {
+ bcopy(blockp + out_data_1_len, out_data_2,
+ block_size - out_data_1_len);
+ }
+
+ /* update offset */
+ out->cd_offset += block_size;
+
+ /* Update pointer to next block of data to be processed. */
+ if (ctx->cbc_remainder_len != 0) {
+ datap += need;
+ ctx->cbc_remainder_len = 0;
+ } else {
+ datap += block_size;
+ }
+
+ remainder = (size_t)&data[length] - (size_t)datap;
+
+ /* Incomplete last block. */
+ if (remainder > 0 && remainder < block_size) {
+ bcopy(datap, ctx->cbc_remainder, remainder);
+ ctx->cbc_remainder_len = remainder;
+ ctx->cbc_lastp = lastp;
+ ctx->cbc_copy_to = datap;
+ return (CRYPTO_SUCCESS);
+ }
+ ctx->cbc_copy_to = NULL;
+
+ } while (remainder > 0);
+
+ ctx->cbc_lastp = lastp;
+ return (CRYPTO_SUCCESS);
+}
+
+int
+cbc_init_ctx(cbc_ctx_t *cbc_ctx, char *param, size_t param_len,
+ size_t block_size, void (*copy_block)(uint8_t *, uint64_t *))
+{
+ /*
+ * Copy IV into context.
+ *
+ * If cm_param == NULL then the IV comes from the
+ * cd_miscdata field in the crypto_data structure.
+ */
+ if (param != NULL) {
+ ASSERT(param_len == block_size);
+ copy_block((uchar_t *)param, cbc_ctx->cbc_iv);
+ }
+
+ cbc_ctx->cbc_lastp = (uint8_t *)&cbc_ctx->cbc_iv[0];
+ cbc_ctx->cbc_flags |= CBC_MODE;
+ return (CRYPTO_SUCCESS);
+}
+
+/* ARGSUSED */
+void *
+cbc_alloc_ctx(int kmflag)
+{
+ cbc_ctx_t *cbc_ctx;
+
+ if ((cbc_ctx = kmem_zalloc(sizeof (cbc_ctx_t), kmflag)) == NULL)
+ return (NULL);
+
+ cbc_ctx->cbc_flags = CBC_MODE;
+ return (cbc_ctx);
+}
diff --git a/sys/contrib/openzfs/module/icp/algs/modes/ccm.c b/sys/contrib/openzfs/module/icp/algs/modes/ccm.c
new file mode 100644
index 000000000000..5d6507c49db1
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/algs/modes/ccm.c
@@ -0,0 +1,907 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/zfs_context.h>
+#include <modes/modes.h>
+#include <sys/crypto/common.h>
+#include <sys/crypto/impl.h>
+
+#ifdef HAVE_EFFICIENT_UNALIGNED_ACCESS
+#include <sys/byteorder.h>
+#define UNALIGNED_POINTERS_PERMITTED
+#endif
+
+/*
+ * Encrypt multiple blocks of data in CCM mode. Decryption for CCM mode
+ * is done in another function.
+ */
+int
+ccm_mode_encrypt_contiguous_blocks(ccm_ctx_t *ctx, char *data, size_t length,
+ crypto_data_t *out, size_t block_size,
+ int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
+ void (*copy_block)(uint8_t *, uint8_t *),
+ void (*xor_block)(uint8_t *, uint8_t *))
+{
+ size_t remainder = length;
+ size_t need = 0;
+ uint8_t *datap = (uint8_t *)data;
+ uint8_t *blockp;
+ uint8_t *lastp;
+ void *iov_or_mp;
+ offset_t offset;
+ uint8_t *out_data_1;
+ uint8_t *out_data_2;
+ size_t out_data_1_len;
+ uint64_t counter;
+ uint8_t *mac_buf;
+
+ if (length + ctx->ccm_remainder_len < block_size) {
+ /* accumulate bytes here and return */
+ bcopy(datap,
+ (uint8_t *)ctx->ccm_remainder + ctx->ccm_remainder_len,
+ length);
+ ctx->ccm_remainder_len += length;
+ ctx->ccm_copy_to = datap;
+ return (CRYPTO_SUCCESS);
+ }
+
+ lastp = (uint8_t *)ctx->ccm_cb;
+ crypto_init_ptrs(out, &iov_or_mp, &offset);
+
+ mac_buf = (uint8_t *)ctx->ccm_mac_buf;
+
+ do {
+ /* Unprocessed data from last call. */
+ if (ctx->ccm_remainder_len > 0) {
+ need = block_size - ctx->ccm_remainder_len;
+
+ if (need > remainder)
+ return (CRYPTO_DATA_LEN_RANGE);
+
+ bcopy(datap, &((uint8_t *)ctx->ccm_remainder)
+ [ctx->ccm_remainder_len], need);
+
+ blockp = (uint8_t *)ctx->ccm_remainder;
+ } else {
+ blockp = datap;
+ }
+
+ /*
+ * do CBC MAC
+ *
+		 * XOR the previous cipher block with the current clear block.
+		 * mac_buf always contains the previous cipher block.
+ */
+ xor_block(blockp, mac_buf);
+ encrypt_block(ctx->ccm_keysched, mac_buf, mac_buf);
+
+ /* ccm_cb is the counter block */
+ encrypt_block(ctx->ccm_keysched, (uint8_t *)ctx->ccm_cb,
+ (uint8_t *)ctx->ccm_tmp);
+
+ lastp = (uint8_t *)ctx->ccm_tmp;
+
+ /*
+ * Increment counter. Counter bits are confined
+ * to the bottom 64 bits of the counter block.
+ */
+#ifdef _ZFS_LITTLE_ENDIAN
+ counter = ntohll(ctx->ccm_cb[1] & ctx->ccm_counter_mask);
+ counter = htonll(counter + 1);
+#else
+ counter = ctx->ccm_cb[1] & ctx->ccm_counter_mask;
+ counter++;
+#endif /* _ZFS_LITTLE_ENDIAN */
+ counter &= ctx->ccm_counter_mask;
+ ctx->ccm_cb[1] =
+ (ctx->ccm_cb[1] & ~(ctx->ccm_counter_mask)) | counter;
+
+ /*
+ * XOR encrypted counter block with the current clear block.
+ */
+ xor_block(blockp, lastp);
+
+ ctx->ccm_processed_data_len += block_size;
+
+ crypto_get_ptrs(out, &iov_or_mp, &offset, &out_data_1,
+ &out_data_1_len, &out_data_2, block_size);
+
+ /* copy block to where it belongs */
+ if (out_data_1_len == block_size) {
+ copy_block(lastp, out_data_1);
+ } else {
+ bcopy(lastp, out_data_1, out_data_1_len);
+ if (out_data_2 != NULL) {
+ bcopy(lastp + out_data_1_len,
+ out_data_2,
+ block_size - out_data_1_len);
+ }
+ }
+ /* update offset */
+ out->cd_offset += block_size;
+
+ /* Update pointer to next block of data to be processed. */
+ if (ctx->ccm_remainder_len != 0) {
+ datap += need;
+ ctx->ccm_remainder_len = 0;
+ } else {
+ datap += block_size;
+ }
+
+ remainder = (size_t)&data[length] - (size_t)datap;
+
+ /* Incomplete last block. */
+ if (remainder > 0 && remainder < block_size) {
+ bcopy(datap, ctx->ccm_remainder, remainder);
+ ctx->ccm_remainder_len = remainder;
+ ctx->ccm_copy_to = datap;
+ goto out;
+ }
+ ctx->ccm_copy_to = NULL;
+
+ } while (remainder > 0);
+
+out:
+ return (CRYPTO_SUCCESS);
+}
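
The counter handling above keeps the counter block (ccm_cb) in big-endian wire
order and restricts the increment to the bytes selected by ccm_counter_mask:
the masked field is swapped to host order, incremented, swapped back, and
re-masked before being spliced into the block. The sketch below is editorial
and not part of the patch; it assumes a little-endian host and uses
__builtin_bswap64 in place of ntohll()/htonll().

/*
 * Editorial sketch (not part of the patch): the masked, big-endian counter
 * increment used above.  cb1 stands for the second 64-bit word of the
 * counter block and mask covers only the bytes that hold the counter.
 */
#include <stdint.h>
#include <stdio.h>

static uint64_t
ccm_increment_sketch(uint64_t cb1, uint64_t mask)
{
	/* Pull the counter out of the block, into host byte order. */
	uint64_t counter = __builtin_bswap64(cb1 & mask);

	/* Increment, convert back, and confine it to the counter bytes. */
	counter = __builtin_bswap64(counter + 1) & mask;

	/* Splice the new counter back into the rest of the block. */
	return ((cb1 & ~mask) | counter);
}

int
main(void)
{
	/* A 2-byte counter field (q = 2): mask covers the last two bytes. */
	uint64_t mask = __builtin_bswap64(0xffffULL);
	uint64_t cb1 = __builtin_bswap64(0xffULL);	/* counter value 255 */

	cb1 = ccm_increment_sketch(cb1, mask);
	/* Prints 0100: the counter stepped from 0x00ff to 0x0100. */
	printf("%04llx\n",
	    (unsigned long long)(__builtin_bswap64(cb1 & mask) & 0xffff));
	return (0);
}
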
+
+void
+calculate_ccm_mac(ccm_ctx_t *ctx, uint8_t *ccm_mac,
+ int (*encrypt_block)(const void *, const uint8_t *, uint8_t *))
+{
+ uint64_t counter;
+ uint8_t *counterp, *mac_buf;
+ int i;
+
+ mac_buf = (uint8_t *)ctx->ccm_mac_buf;
+
+ /* first counter block start with index 0 */
+ counter = 0;
+ ctx->ccm_cb[1] = (ctx->ccm_cb[1] & ~(ctx->ccm_counter_mask)) | counter;
+
+ counterp = (uint8_t *)ctx->ccm_tmp;
+ encrypt_block(ctx->ccm_keysched, (uint8_t *)ctx->ccm_cb, counterp);
+
+ /* calculate XOR of MAC with first counter block */
+ for (i = 0; i < ctx->ccm_mac_len; i++) {
+ ccm_mac[i] = mac_buf[i] ^ counterp[i];
+ }
+}
+
+/* ARGSUSED */
+int
+ccm_encrypt_final(ccm_ctx_t *ctx, crypto_data_t *out, size_t block_size,
+ int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
+ void (*xor_block)(uint8_t *, uint8_t *))
+{
+ uint8_t *lastp, *mac_buf, *ccm_mac_p, *macp = NULL;
+ void *iov_or_mp;
+ offset_t offset;
+ uint8_t *out_data_1;
+ uint8_t *out_data_2;
+ size_t out_data_1_len;
+ int i;
+
+ if (out->cd_length < (ctx->ccm_remainder_len + ctx->ccm_mac_len)) {
+ return (CRYPTO_DATA_LEN_RANGE);
+ }
+
+ /*
+ * When we get here, the number of bytes of payload processed
+ * plus whatever data remains, if any,
+ * should be the same as the number of bytes that's being
+ * passed in the argument during init time.
+ */
+ if ((ctx->ccm_processed_data_len + ctx->ccm_remainder_len)
+ != (ctx->ccm_data_len)) {
+ return (CRYPTO_DATA_LEN_RANGE);
+ }
+
+ mac_buf = (uint8_t *)ctx->ccm_mac_buf;
+
+ if (ctx->ccm_remainder_len > 0) {
+
+ /* ccm_mac_input_buf is not used for encryption */
+ macp = (uint8_t *)ctx->ccm_mac_input_buf;
+ bzero(macp, block_size);
+
+ /* copy remainder to temporary buffer */
+ bcopy(ctx->ccm_remainder, macp, ctx->ccm_remainder_len);
+
+ /* calculate the CBC MAC */
+ xor_block(macp, mac_buf);
+ encrypt_block(ctx->ccm_keysched, mac_buf, mac_buf);
+
+ /* calculate the counter mode */
+ lastp = (uint8_t *)ctx->ccm_tmp;
+ encrypt_block(ctx->ccm_keysched, (uint8_t *)ctx->ccm_cb, lastp);
+
+ /* XOR with counter block */
+ for (i = 0; i < ctx->ccm_remainder_len; i++) {
+ macp[i] ^= lastp[i];
+ }
+ ctx->ccm_processed_data_len += ctx->ccm_remainder_len;
+ }
+
+ /* Calculate the CCM MAC */
+ ccm_mac_p = (uint8_t *)ctx->ccm_tmp;
+ calculate_ccm_mac(ctx, ccm_mac_p, encrypt_block);
+
+ crypto_init_ptrs(out, &iov_or_mp, &offset);
+ crypto_get_ptrs(out, &iov_or_mp, &offset, &out_data_1,
+ &out_data_1_len, &out_data_2,
+ ctx->ccm_remainder_len + ctx->ccm_mac_len);
+
+ if (ctx->ccm_remainder_len > 0) {
+
+ /* copy temporary block to where it belongs */
+ if (out_data_2 == NULL) {
+ /* everything will fit in out_data_1 */
+ bcopy(macp, out_data_1, ctx->ccm_remainder_len);
+ bcopy(ccm_mac_p, out_data_1 + ctx->ccm_remainder_len,
+ ctx->ccm_mac_len);
+ } else {
+
+ if (out_data_1_len < ctx->ccm_remainder_len) {
+
+ size_t data_2_len_used;
+
+ bcopy(macp, out_data_1, out_data_1_len);
+
+ data_2_len_used = ctx->ccm_remainder_len
+ - out_data_1_len;
+
+ bcopy((uint8_t *)macp + out_data_1_len,
+ out_data_2, data_2_len_used);
+ bcopy(ccm_mac_p, out_data_2 + data_2_len_used,
+ ctx->ccm_mac_len);
+ } else {
+ bcopy(macp, out_data_1, out_data_1_len);
+ if (out_data_1_len == ctx->ccm_remainder_len) {
+ /* mac will be in out_data_2 */
+ bcopy(ccm_mac_p, out_data_2,
+ ctx->ccm_mac_len);
+ } else {
+ size_t len_not_used = out_data_1_len -
+ ctx->ccm_remainder_len;
+					/*
+					 * part of the mac will be in
+					 * out_data_1, the rest of the mac
+					 * will be in out_data_2
+					 */
+ bcopy(ccm_mac_p,
+ out_data_1 + ctx->ccm_remainder_len,
+ len_not_used);
+ bcopy(ccm_mac_p + len_not_used,
+ out_data_2,
+ ctx->ccm_mac_len - len_not_used);
+
+ }
+ }
+ }
+ } else {
+ /* copy block to where it belongs */
+ bcopy(ccm_mac_p, out_data_1, out_data_1_len);
+ if (out_data_2 != NULL) {
+ bcopy(ccm_mac_p + out_data_1_len, out_data_2,
+ block_size - out_data_1_len);
+ }
+ }
+ out->cd_offset += ctx->ccm_remainder_len + ctx->ccm_mac_len;
+ ctx->ccm_remainder_len = 0;
+ return (CRYPTO_SUCCESS);
+}
+
+/*
+ * This will only deal with decrypting the last block of the input that
+ * might not be a multiple of block length.
+ */
+static void
+ccm_decrypt_incomplete_block(ccm_ctx_t *ctx,
+ int (*encrypt_block)(const void *, const uint8_t *, uint8_t *))
+{
+ uint8_t *datap, *outp, *counterp;
+ int i;
+
+ datap = (uint8_t *)ctx->ccm_remainder;
+ outp = &((ctx->ccm_pt_buf)[ctx->ccm_processed_data_len]);
+
+ counterp = (uint8_t *)ctx->ccm_tmp;
+ encrypt_block(ctx->ccm_keysched, (uint8_t *)ctx->ccm_cb, counterp);
+
+ /* XOR with counter block */
+ for (i = 0; i < ctx->ccm_remainder_len; i++) {
+ outp[i] = datap[i] ^ counterp[i];
+ }
+}
+
+/*
+ * This will decrypt the ciphertext. However, the plaintext won't be
+ * returned to the caller. It will be returned when decrypt_final() is
+ * called, provided the MAC matches.
+ */
+/* ARGSUSED */
+int
+ccm_mode_decrypt_contiguous_blocks(ccm_ctx_t *ctx, char *data, size_t length,
+ crypto_data_t *out, size_t block_size,
+ int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
+ void (*copy_block)(uint8_t *, uint8_t *),
+ void (*xor_block)(uint8_t *, uint8_t *))
+{
+ size_t remainder = length;
+ size_t need = 0;
+ uint8_t *datap = (uint8_t *)data;
+ uint8_t *blockp;
+ uint8_t *cbp;
+ uint64_t counter;
+ size_t pt_len, total_decrypted_len, mac_len, pm_len, pd_len;
+ uint8_t *resultp;
+
+
+ pm_len = ctx->ccm_processed_mac_len;
+
+ if (pm_len > 0) {
+ uint8_t *tmp;
+ /*
+		 * all ciphertext has been processed, we are just waiting
+		 * for the rest of the MAC value
+ */
+ if ((pm_len + length) > ctx->ccm_mac_len) {
+ return (CRYPTO_ENCRYPTED_DATA_LEN_RANGE);
+ }
+ tmp = (uint8_t *)ctx->ccm_mac_input_buf;
+
+ bcopy(datap, tmp + pm_len, length);
+
+ ctx->ccm_processed_mac_len += length;
+ return (CRYPTO_SUCCESS);
+ }
+
+ /*
+ * If we decrypt the given data, what total amount of data would
+ * have been decrypted?
+ */
+ pd_len = ctx->ccm_processed_data_len;
+ total_decrypted_len = pd_len + length + ctx->ccm_remainder_len;
+
+ if (total_decrypted_len >
+ (ctx->ccm_data_len + ctx->ccm_mac_len)) {
+ return (CRYPTO_ENCRYPTED_DATA_LEN_RANGE);
+ }
+
+ pt_len = ctx->ccm_data_len;
+
+ if (total_decrypted_len > pt_len) {
+ /*
+		 * Part of the input is the MAC; isolate it so it can be
+		 * dealt with later.  The left-over data counted by
+		 * ccm_remainder_len from the previous call cannot be part
+		 * of the MAC; otherwise it would already have been
+		 * consumed by that call.
+ */
+ size_t pt_part = pt_len - pd_len - ctx->ccm_remainder_len;
+
+ mac_len = length - pt_part;
+
+ ctx->ccm_processed_mac_len = mac_len;
+ bcopy(data + pt_part, ctx->ccm_mac_input_buf, mac_len);
+
+ if (pt_part + ctx->ccm_remainder_len < block_size) {
+ /*
+			 * since this is the last of the ciphertext,
+			 * just decrypt it here
+ */
+ bcopy(datap, &((uint8_t *)ctx->ccm_remainder)
+ [ctx->ccm_remainder_len], pt_part);
+ ctx->ccm_remainder_len += pt_part;
+ ccm_decrypt_incomplete_block(ctx, encrypt_block);
+ ctx->ccm_processed_data_len += ctx->ccm_remainder_len;
+ ctx->ccm_remainder_len = 0;
+ return (CRYPTO_SUCCESS);
+ } else {
+ /* let rest of the code handle this */
+ length = pt_part;
+ }
+ } else if (length + ctx->ccm_remainder_len < block_size) {
+ /* accumulate bytes here and return */
+ bcopy(datap,
+ (uint8_t *)ctx->ccm_remainder + ctx->ccm_remainder_len,
+ length);
+ ctx->ccm_remainder_len += length;
+ ctx->ccm_copy_to = datap;
+ return (CRYPTO_SUCCESS);
+ }
+
+ do {
+ /* Unprocessed data from last call. */
+ if (ctx->ccm_remainder_len > 0) {
+ need = block_size - ctx->ccm_remainder_len;
+
+ if (need > remainder)
+ return (CRYPTO_ENCRYPTED_DATA_LEN_RANGE);
+
+ bcopy(datap, &((uint8_t *)ctx->ccm_remainder)
+ [ctx->ccm_remainder_len], need);
+
+ blockp = (uint8_t *)ctx->ccm_remainder;
+ } else {
+ blockp = datap;
+ }
+
+ /* Calculate the counter mode, ccm_cb is the counter block */
+ cbp = (uint8_t *)ctx->ccm_tmp;
+ encrypt_block(ctx->ccm_keysched, (uint8_t *)ctx->ccm_cb, cbp);
+
+ /*
+ * Increment counter.
+ * Counter bits are confined to the bottom 64 bits
+ */
+#ifdef _ZFS_LITTLE_ENDIAN
+ counter = ntohll(ctx->ccm_cb[1] & ctx->ccm_counter_mask);
+ counter = htonll(counter + 1);
+#else
+ counter = ctx->ccm_cb[1] & ctx->ccm_counter_mask;
+ counter++;
+#endif /* _ZFS_LITTLE_ENDIAN */
+ counter &= ctx->ccm_counter_mask;
+ ctx->ccm_cb[1] =
+ (ctx->ccm_cb[1] & ~(ctx->ccm_counter_mask)) | counter;
+
+ /* XOR with the ciphertext */
+ xor_block(blockp, cbp);
+
+ /* Copy the plaintext to the "holding buffer" */
+ resultp = (uint8_t *)ctx->ccm_pt_buf +
+ ctx->ccm_processed_data_len;
+ copy_block(cbp, resultp);
+
+ ctx->ccm_processed_data_len += block_size;
+
+ ctx->ccm_lastp = blockp;
+
+ /* Update pointer to next block of data to be processed. */
+ if (ctx->ccm_remainder_len != 0) {
+ datap += need;
+ ctx->ccm_remainder_len = 0;
+ } else {
+ datap += block_size;
+ }
+
+ remainder = (size_t)&data[length] - (size_t)datap;
+
+ /* Incomplete last block */
+ if (remainder > 0 && remainder < block_size) {
+ bcopy(datap, ctx->ccm_remainder, remainder);
+ ctx->ccm_remainder_len = remainder;
+ ctx->ccm_copy_to = datap;
+ if (ctx->ccm_processed_mac_len > 0) {
+ /*
+				 * not expecting any more ciphertext, just
+				 * compute plaintext for the remaining input
+ */
+ ccm_decrypt_incomplete_block(ctx,
+ encrypt_block);
+ ctx->ccm_processed_data_len += remainder;
+ ctx->ccm_remainder_len = 0;
+ }
+ goto out;
+ }
+ ctx->ccm_copy_to = NULL;
+
+ } while (remainder > 0);
+
+out:
+ return (CRYPTO_SUCCESS);
+}
+
+int
+ccm_decrypt_final(ccm_ctx_t *ctx, crypto_data_t *out, size_t block_size,
+ int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
+ void (*copy_block)(uint8_t *, uint8_t *),
+ void (*xor_block)(uint8_t *, uint8_t *))
+{
+ size_t mac_remain, pt_len;
+ uint8_t *pt, *mac_buf, *macp, *ccm_mac_p;
+ int rv;
+
+ pt_len = ctx->ccm_data_len;
+
+ /* Make sure output buffer can fit all of the plaintext */
+ if (out->cd_length < pt_len) {
+ return (CRYPTO_DATA_LEN_RANGE);
+ }
+
+ pt = ctx->ccm_pt_buf;
+ mac_remain = ctx->ccm_processed_data_len;
+ mac_buf = (uint8_t *)ctx->ccm_mac_buf;
+
+ macp = (uint8_t *)ctx->ccm_tmp;
+
+ while (mac_remain > 0) {
+
+ if (mac_remain < block_size) {
+ bzero(macp, block_size);
+ bcopy(pt, macp, mac_remain);
+ mac_remain = 0;
+ } else {
+ copy_block(pt, macp);
+ mac_remain -= block_size;
+ pt += block_size;
+ }
+
+ /* calculate the CBC MAC */
+ xor_block(macp, mac_buf);
+ encrypt_block(ctx->ccm_keysched, mac_buf, mac_buf);
+ }
+
+ /* Calculate the CCM MAC */
+ ccm_mac_p = (uint8_t *)ctx->ccm_tmp;
+ calculate_ccm_mac((ccm_ctx_t *)ctx, ccm_mac_p, encrypt_block);
+
+ /* compare the input CCM MAC value with what we calculated */
+ if (bcmp(ctx->ccm_mac_input_buf, ccm_mac_p, ctx->ccm_mac_len)) {
+ /* They don't match */
+ return (CRYPTO_INVALID_MAC);
+ } else {
+ rv = crypto_put_output_data(ctx->ccm_pt_buf, out, pt_len);
+ if (rv != CRYPTO_SUCCESS)
+ return (rv);
+ out->cd_offset += pt_len;
+ }
+ return (CRYPTO_SUCCESS);
+}
+
+static int
+ccm_validate_args(CK_AES_CCM_PARAMS *ccm_param, boolean_t is_encrypt_init)
+{
+ size_t macSize, nonceSize;
+ uint8_t q;
+ uint64_t maxValue;
+
+ /*
+ * Check the length of the MAC. The only valid
+ * lengths for the MAC are: 4, 6, 8, 10, 12, 14, 16
+ */
+ macSize = ccm_param->ulMACSize;
+ if ((macSize < 4) || (macSize > 16) || ((macSize % 2) != 0)) {
+ return (CRYPTO_MECHANISM_PARAM_INVALID);
+ }
+
+ /* Check the nonce length. Valid values are 7, 8, 9, 10, 11, 12, 13 */
+ nonceSize = ccm_param->ulNonceSize;
+ if ((nonceSize < 7) || (nonceSize > 13)) {
+ return (CRYPTO_MECHANISM_PARAM_INVALID);
+ }
+
+ /* q is the length of the field storing the length, in bytes */
+ q = (uint8_t)((15 - nonceSize) & 0xFF);
+
+
+ /*
+	 * For decryption, make sure the size of the ciphertext is at least
+	 * as large as the MAC length
+ */
+ if ((!is_encrypt_init) && (ccm_param->ulDataSize < macSize)) {
+ return (CRYPTO_MECHANISM_PARAM_INVALID);
+ }
+
+ /*
+ * Check to make sure the length of the payload is within the
+ * range of values allowed by q
+ */
+ if (q < 8) {
+ maxValue = (1ULL << (q * 8)) - 1;
+ } else {
+ maxValue = ULONG_MAX;
+ }
+
+ if (ccm_param->ulDataSize > maxValue) {
+ return (CRYPTO_MECHANISM_PARAM_INVALID);
+ }
+ return (CRYPTO_SUCCESS);
+}
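
ccm_validate_args() couples the nonce size to q, the width of the
payload-length field: together they must fill the 15 non-flag bytes of the
first block, so a longer nonce leaves less room for the length and caps the
payload at 2^(8q) - 1 bytes. The editorial sketch below (not part of the
patch) just tabulates that trade-off for the valid nonce sizes; it uses
UINT64_MAX where the code above uses ULONG_MAX, which is the same value on
LP64.

/*
 * Editorial sketch (not part of the patch): with a 16-byte block, nonce and
 * length field share 15 bytes, so q = 15 - nonce_len and the payload must
 * fit in q bytes.
 */
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	for (unsigned nonce_len = 7; nonce_len <= 13; nonce_len++) {
		unsigned q = 15 - nonce_len;
		uint64_t max;

		if (q < 8)
			max = (1ULL << (q * 8)) - 1;
		else
			max = UINT64_MAX;	/* ULONG_MAX on LP64 */

		printf("nonce %2u bytes -> q = %u, max payload %llu bytes\n",
		    nonce_len, q, (unsigned long long)max);
	}
	return (0);
}
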
+
+/*
+ * Format the first block used in CBC-MAC (B0) and the initial counter
+ * block based on formatting functions and counter generation functions
+ * specified in RFC 3610 and NIST publication 800-38C, appendix A
+ *
+ * b0 is the first block used in CBC-MAC
+ * cb0 is the first counter block
+ *
+ * It's assumed that the arguments b0 and cb0 are preallocated AES blocks
+ *
+ */
+static void
+ccm_format_initial_blocks(uchar_t *nonce, ulong_t nonceSize,
+ ulong_t authDataSize, uint8_t *b0, ccm_ctx_t *aes_ctx)
+{
+ uint64_t payloadSize;
+ uint8_t t, q, have_adata = 0;
+ size_t limit;
+ int i, j, k;
+ uint64_t mask = 0;
+ uint8_t *cb;
+
+ q = (uint8_t)((15 - nonceSize) & 0xFF);
+ t = (uint8_t)((aes_ctx->ccm_mac_len) & 0xFF);
+
+ /* Construct the first octet of b0 */
+ if (authDataSize > 0) {
+ have_adata = 1;
+ }
+ b0[0] = (have_adata << 6) | (((t - 2) / 2) << 3) | (q - 1);
+
+ /* copy the nonce value into b0 */
+ bcopy(nonce, &(b0[1]), nonceSize);
+
+ /* store the length of the payload into b0 */
+ bzero(&(b0[1+nonceSize]), q);
+
+ payloadSize = aes_ctx->ccm_data_len;
+ limit = 8 < q ? 8 : q;
+
+ for (i = 0, j = 0, k = 15; i < limit; i++, j += 8, k--) {
+ b0[k] = (uint8_t)((payloadSize >> j) & 0xFF);
+ }
+
+ /* format the counter block */
+
+ cb = (uint8_t *)aes_ctx->ccm_cb;
+
+ cb[0] = 0x07 & (q-1); /* first byte */
+
+ /* copy the nonce value into the counter block */
+ bcopy(nonce, &(cb[1]), nonceSize);
+
+ bzero(&(cb[1+nonceSize]), q);
+
+ /* Create the mask for the counter field based on the size of nonce */
+ q <<= 3;
+ while (q-- > 0) {
+ mask |= (1ULL << q);
+ }
+
+#ifdef _ZFS_LITTLE_ENDIAN
+ mask = htonll(mask);
+#endif
+ aes_ctx->ccm_counter_mask = mask;
+
+ /*
+	 * During the calculation we start with counter block 1, so set
+	 * it up right here.
+	 * Setting just the last byte to 1 is enough, because even with
+	 * the biggest nonce of 13 bytes the last byte of the counter
+	 * block is still part of the counter value.
+ */
+ cb[15] = 0x01;
+}
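
The first octet of b0 packs three fields, exactly as the expression above
shows: bit 6 flags the presence of associated data, bits 3-5 encode the MAC
length as (t - 2) / 2, and bits 0-2 hold q - 1, while the counter block's
first octet carries only q - 1. A small editorial sketch (not part of the
patch):

/*
 * Editorial sketch (not part of the patch): the flags octets built above,
 * following NIST SP 800-38C appendix A.
 */
#include <stdint.h>
#include <stdio.h>

static uint8_t
ccm_b0_flags(int have_adata, unsigned t, unsigned q)
{
	return ((uint8_t)((have_adata ? 1 : 0) << 6 |
	    ((t - 2) / 2) << 3 | (q - 1)));
}

int
main(void)
{
	/* 13-byte nonce (q = 2), 16-byte MAC, with associated data. */
	unsigned q = 15 - 13, t = 16;

	/*
	 * Prints "b0 flags = 79, counter flags = 01":
	 * 0x40 (Adata) | 0x38 ((16 - 2) / 2 << 3) | 0x01 (q - 1).
	 */
	printf("b0 flags = %02x, counter flags = %02x\n",
	    (unsigned)ccm_b0_flags(1, t, q), 0x07 & (q - 1));
	return (0);
}
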
+
+/*
+ * Encode the length of the associated data as
+ * specified in RFC 3610 and NIST publication 800-38C, appendix A
+ */
+static void
+encode_adata_len(ulong_t auth_data_len, uint8_t *encoded, size_t *encoded_len)
+{
+#ifdef UNALIGNED_POINTERS_PERMITTED
+ uint32_t *lencoded_ptr;
+#ifdef _LP64
+ uint64_t *llencoded_ptr;
+#endif
+#endif /* UNALIGNED_POINTERS_PERMITTED */
+
+ if (auth_data_len < ((1ULL<<16) - (1ULL<<8))) {
+ /* 0 < a < (2^16-2^8) */
+ *encoded_len = 2;
+ encoded[0] = (auth_data_len & 0xff00) >> 8;
+ encoded[1] = auth_data_len & 0xff;
+
+ } else if ((auth_data_len >= ((1ULL<<16) - (1ULL<<8))) &&
+ (auth_data_len < (1ULL << 31))) {
+ /* (2^16-2^8) <= a < 2^32 */
+ *encoded_len = 6;
+ encoded[0] = 0xff;
+ encoded[1] = 0xfe;
+#ifdef UNALIGNED_POINTERS_PERMITTED
+ lencoded_ptr = (uint32_t *)&encoded[2];
+ *lencoded_ptr = htonl(auth_data_len);
+#else
+ encoded[2] = (auth_data_len & 0xff000000) >> 24;
+ encoded[3] = (auth_data_len & 0xff0000) >> 16;
+ encoded[4] = (auth_data_len & 0xff00) >> 8;
+ encoded[5] = auth_data_len & 0xff;
+#endif /* UNALIGNED_POINTERS_PERMITTED */
+
+#ifdef _LP64
+ } else {
+ /* 2^32 <= a < 2^64 */
+ *encoded_len = 10;
+ encoded[0] = 0xff;
+ encoded[1] = 0xff;
+#ifdef UNALIGNED_POINTERS_PERMITTED
+ llencoded_ptr = (uint64_t *)&encoded[2];
+ *llencoded_ptr = htonl(auth_data_len);
+#else
+ encoded[2] = (auth_data_len & 0xff00000000000000) >> 56;
+ encoded[3] = (auth_data_len & 0xff000000000000) >> 48;
+ encoded[4] = (auth_data_len & 0xff0000000000) >> 40;
+ encoded[5] = (auth_data_len & 0xff00000000) >> 32;
+ encoded[6] = (auth_data_len & 0xff000000) >> 24;
+ encoded[7] = (auth_data_len & 0xff0000) >> 16;
+ encoded[8] = (auth_data_len & 0xff00) >> 8;
+ encoded[9] = auth_data_len & 0xff;
+#endif /* UNALIGNED_POINTERS_PERMITTED */
+#endif /* _LP64 */
+ }
+}
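
The three branches above implement the associated-data length encoding of
RFC 3610 section 2.2: lengths below 2^16 - 2^8 take two bytes, larger 32-bit
lengths are prefixed with 0xff 0xfe, and (on LP64) 64-bit lengths are prefixed
with 0xff 0xff. The editorial sketch below (not part of the patch) writes the
same encoding out byte by byte, so it needs no unaligned stores; note that it
uses the RFC bound of 2^32 for the middle branch, whereas the function above
gates that branch at (1ULL << 31) even though its comment cites 2^32.

/*
 * Editorial sketch (not part of the patch): RFC 3610 associated-data length
 * encoding, emitted byte by byte.  Returns the number of bytes produced.
 */
#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

static size_t
encode_adata_len_sketch(uint64_t a, uint8_t out[10])
{
	if (a < 0xff00ULL) {			/* 0 < a < 2^16 - 2^8 */
		out[0] = (uint8_t)(a >> 8);
		out[1] = (uint8_t)a;
		return (2);
	}
	if (a < (1ULL << 32)) {			/* up to 2^32 - 1 */
		out[0] = 0xff;
		out[1] = 0xfe;
		for (int i = 0; i < 4; i++)
			out[2 + i] = (uint8_t)(a >> (8 * (3 - i)));
		return (6);
	}
	out[0] = 0xff;				/* 2^32 and above */
	out[1] = 0xff;
	for (int i = 0; i < 8; i++)
		out[2 + i] = (uint8_t)(a >> (8 * (7 - i)));
	return (10);
}

int
main(void)
{
	uint8_t buf[10];
	size_t n = encode_adata_len_sketch(70000, buf);

	/* 70000 = 0x11170: prints "ff fe 00 01 11 70". */
	for (size_t i = 0; i < n; i++)
		printf("%02x ", buf[i]);
	printf("\n");
	return (0);
}
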
+
+static int
+ccm_init(ccm_ctx_t *ctx, unsigned char *nonce, size_t nonce_len,
+ unsigned char *auth_data, size_t auth_data_len, size_t block_size,
+ int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
+ void (*xor_block)(uint8_t *, uint8_t *))
+{
+ uint8_t *mac_buf, *datap, *ivp, *authp;
+ size_t remainder, processed;
+ uint8_t encoded_a[10]; /* max encoded auth data length is 10 octets */
+ size_t encoded_a_len = 0;
+
+ mac_buf = (uint8_t *)&(ctx->ccm_mac_buf);
+
+ /*
+ * Format the 1st block for CBC-MAC and construct the
+ * 1st counter block.
+ *
+ * aes_ctx->ccm_iv is used for storing the counter block
+ * mac_buf will store b0 at this time.
+ */
+ ccm_format_initial_blocks(nonce, nonce_len,
+ auth_data_len, mac_buf, ctx);
+
+ /* The IV for CBC MAC for AES CCM mode is always zero */
+ ivp = (uint8_t *)ctx->ccm_tmp;
+ bzero(ivp, block_size);
+
+ xor_block(ivp, mac_buf);
+
+	/* CBC-MAC step over b0 (which embeds the nonce) */
+ encrypt_block(ctx->ccm_keysched, mac_buf, mac_buf);
+
+ /* take care of the associated data, if any */
+ if (auth_data_len == 0) {
+ return (CRYPTO_SUCCESS);
+ }
+
+ encode_adata_len(auth_data_len, encoded_a, &encoded_a_len);
+
+ remainder = auth_data_len;
+
+ /* 1st block: it contains encoded associated data, and some data */
+ authp = (uint8_t *)ctx->ccm_tmp;
+ bzero(authp, block_size);
+ bcopy(encoded_a, authp, encoded_a_len);
+ processed = block_size - encoded_a_len;
+ if (processed > auth_data_len) {
+ /* in case auth_data is very small */
+ processed = auth_data_len;
+ }
+ bcopy(auth_data, authp+encoded_a_len, processed);
+ /* xor with previous buffer */
+ xor_block(authp, mac_buf);
+ encrypt_block(ctx->ccm_keysched, mac_buf, mac_buf);
+ remainder -= processed;
+ if (remainder == 0) {
+ /* a small amount of associated data, it's all done now */
+ return (CRYPTO_SUCCESS);
+ }
+
+ do {
+ if (remainder < block_size) {
+ /*
+			 * There isn't a full block of data, pad the rest
+			 * of the buffer with zeros
+ */
+ bzero(authp, block_size);
+ bcopy(&(auth_data[processed]), authp, remainder);
+ datap = (uint8_t *)authp;
+ remainder = 0;
+ } else {
+ datap = (uint8_t *)(&(auth_data[processed]));
+ processed += block_size;
+ remainder -= block_size;
+ }
+
+ xor_block(datap, mac_buf);
+ encrypt_block(ctx->ccm_keysched, mac_buf, mac_buf);
+
+ } while (remainder > 0);
+
+ return (CRYPTO_SUCCESS);
+}
+
+/*
+ * The following function should be called at encrypt or decrypt init time
+ * for AES CCM mode.
+ */
+int
+ccm_init_ctx(ccm_ctx_t *ccm_ctx, char *param, int kmflag,
+ boolean_t is_encrypt_init, size_t block_size,
+ int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
+ void (*xor_block)(uint8_t *, uint8_t *))
+{
+ int rv;
+ CK_AES_CCM_PARAMS *ccm_param;
+
+ if (param != NULL) {
+ ccm_param = (CK_AES_CCM_PARAMS *)param;
+
+ if ((rv = ccm_validate_args(ccm_param,
+ is_encrypt_init)) != 0) {
+ return (rv);
+ }
+
+ ccm_ctx->ccm_mac_len = ccm_param->ulMACSize;
+ if (is_encrypt_init) {
+ ccm_ctx->ccm_data_len = ccm_param->ulDataSize;
+ } else {
+ ccm_ctx->ccm_data_len =
+ ccm_param->ulDataSize - ccm_ctx->ccm_mac_len;
+ ccm_ctx->ccm_processed_mac_len = 0;
+ }
+ ccm_ctx->ccm_processed_data_len = 0;
+
+ ccm_ctx->ccm_flags |= CCM_MODE;
+ } else {
+ return (CRYPTO_MECHANISM_PARAM_INVALID);
+ }
+
+ if (ccm_init(ccm_ctx, ccm_param->nonce, ccm_param->ulNonceSize,
+ ccm_param->authData, ccm_param->ulAuthDataSize, block_size,
+ encrypt_block, xor_block) != 0) {
+ return (CRYPTO_MECHANISM_PARAM_INVALID);
+ }
+ if (!is_encrypt_init) {
+ /* allocate buffer for storing decrypted plaintext */
+ ccm_ctx->ccm_pt_buf = vmem_alloc(ccm_ctx->ccm_data_len,
+ kmflag);
+ if (ccm_ctx->ccm_pt_buf == NULL) {
+ rv = CRYPTO_HOST_MEMORY;
+ }
+ }
+ return (rv);
+}
+
+void *
+ccm_alloc_ctx(int kmflag)
+{
+ ccm_ctx_t *ccm_ctx;
+
+ if ((ccm_ctx = kmem_zalloc(sizeof (ccm_ctx_t), kmflag)) == NULL)
+ return (NULL);
+
+ ccm_ctx->ccm_flags = CCM_MODE;
+ return (ccm_ctx);
+}
diff --git a/sys/contrib/openzfs/module/icp/algs/modes/ctr.c b/sys/contrib/openzfs/module/icp/algs/modes/ctr.c
new file mode 100644
index 000000000000..0188bdd395ff
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/algs/modes/ctr.c
@@ -0,0 +1,228 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/zfs_context.h>
+#include <modes/modes.h>
+#include <sys/crypto/common.h>
+#include <sys/crypto/impl.h>
+#include <sys/byteorder.h>
+
+/*
+ * Encrypt and decrypt multiple blocks of data in counter mode.
+ */
+int
+ctr_mode_contiguous_blocks(ctr_ctx_t *ctx, char *data, size_t length,
+ crypto_data_t *out, size_t block_size,
+ int (*cipher)(const void *ks, const uint8_t *pt, uint8_t *ct),
+ void (*xor_block)(uint8_t *, uint8_t *))
+{
+ size_t remainder = length;
+ size_t need = 0;
+ uint8_t *datap = (uint8_t *)data;
+ uint8_t *blockp;
+ uint8_t *lastp;
+ void *iov_or_mp;
+ offset_t offset;
+ uint8_t *out_data_1;
+ uint8_t *out_data_2;
+ size_t out_data_1_len;
+ uint64_t lower_counter, upper_counter;
+
+ if (length + ctx->ctr_remainder_len < block_size) {
+ /* accumulate bytes here and return */
+ bcopy(datap,
+ (uint8_t *)ctx->ctr_remainder + ctx->ctr_remainder_len,
+ length);
+ ctx->ctr_remainder_len += length;
+ ctx->ctr_copy_to = datap;
+ return (CRYPTO_SUCCESS);
+ }
+
+ lastp = (uint8_t *)ctx->ctr_cb;
+ crypto_init_ptrs(out, &iov_or_mp, &offset);
+
+ do {
+ /* Unprocessed data from last call. */
+ if (ctx->ctr_remainder_len > 0) {
+ need = block_size - ctx->ctr_remainder_len;
+
+ if (need > remainder)
+ return (CRYPTO_DATA_LEN_RANGE);
+
+ bcopy(datap, &((uint8_t *)ctx->ctr_remainder)
+ [ctx->ctr_remainder_len], need);
+
+ blockp = (uint8_t *)ctx->ctr_remainder;
+ } else {
+ blockp = datap;
+ }
+
+ /* ctr_cb is the counter block */
+ cipher(ctx->ctr_keysched, (uint8_t *)ctx->ctr_cb,
+ (uint8_t *)ctx->ctr_tmp);
+
+ lastp = (uint8_t *)ctx->ctr_tmp;
+
+ /*
+ * Increment Counter.
+ */
+ lower_counter = ntohll(ctx->ctr_cb[1] & ctx->ctr_lower_mask);
+ lower_counter = htonll(lower_counter + 1);
+ lower_counter &= ctx->ctr_lower_mask;
+ ctx->ctr_cb[1] = (ctx->ctr_cb[1] & ~(ctx->ctr_lower_mask)) |
+ lower_counter;
+
+ /* wrap around */
+ if (lower_counter == 0) {
+ upper_counter =
+ ntohll(ctx->ctr_cb[0] & ctx->ctr_upper_mask);
+ upper_counter = htonll(upper_counter + 1);
+ upper_counter &= ctx->ctr_upper_mask;
+ ctx->ctr_cb[0] =
+ (ctx->ctr_cb[0] & ~(ctx->ctr_upper_mask)) |
+ upper_counter;
+ }
+
+ /*
+ * XOR encrypted counter block with the current clear block.
+ */
+ xor_block(blockp, lastp);
+
+ crypto_get_ptrs(out, &iov_or_mp, &offset, &out_data_1,
+ &out_data_1_len, &out_data_2, block_size);
+
+ /* copy block to where it belongs */
+ bcopy(lastp, out_data_1, out_data_1_len);
+ if (out_data_2 != NULL) {
+ bcopy(lastp + out_data_1_len, out_data_2,
+ block_size - out_data_1_len);
+ }
+ /* update offset */
+ out->cd_offset += block_size;
+
+ /* Update pointer to next block of data to be processed. */
+ if (ctx->ctr_remainder_len != 0) {
+ datap += need;
+ ctx->ctr_remainder_len = 0;
+ } else {
+ datap += block_size;
+ }
+
+ remainder = (size_t)&data[length] - (size_t)datap;
+
+ /* Incomplete last block. */
+ if (remainder > 0 && remainder < block_size) {
+ bcopy(datap, ctx->ctr_remainder, remainder);
+ ctx->ctr_remainder_len = remainder;
+ ctx->ctr_copy_to = datap;
+ goto out;
+ }
+ ctx->ctr_copy_to = NULL;
+
+ } while (remainder > 0);
+
+out:
+ return (CRYPTO_SUCCESS);
+}
+
+int
+ctr_mode_final(ctr_ctx_t *ctx, crypto_data_t *out,
+ int (*encrypt_block)(const void *, const uint8_t *, uint8_t *))
+{
+ uint8_t *lastp;
+ void *iov_or_mp;
+ offset_t offset;
+ uint8_t *out_data_1;
+ uint8_t *out_data_2;
+ size_t out_data_1_len;
+ uint8_t *p;
+ int i;
+
+ if (out->cd_length < ctx->ctr_remainder_len)
+ return (CRYPTO_DATA_LEN_RANGE);
+
+ encrypt_block(ctx->ctr_keysched, (uint8_t *)ctx->ctr_cb,
+ (uint8_t *)ctx->ctr_tmp);
+
+ lastp = (uint8_t *)ctx->ctr_tmp;
+ p = (uint8_t *)ctx->ctr_remainder;
+ for (i = 0; i < ctx->ctr_remainder_len; i++) {
+ p[i] ^= lastp[i];
+ }
+
+ crypto_init_ptrs(out, &iov_or_mp, &offset);
+ crypto_get_ptrs(out, &iov_or_mp, &offset, &out_data_1,
+ &out_data_1_len, &out_data_2, ctx->ctr_remainder_len);
+
+ bcopy(p, out_data_1, out_data_1_len);
+ if (out_data_2 != NULL) {
+ bcopy((uint8_t *)p + out_data_1_len,
+ out_data_2, ctx->ctr_remainder_len - out_data_1_len);
+ }
+ out->cd_offset += ctx->ctr_remainder_len;
+ ctx->ctr_remainder_len = 0;
+ return (CRYPTO_SUCCESS);
+}
+
+int
+ctr_init_ctx(ctr_ctx_t *ctr_ctx, ulong_t count, uint8_t *cb,
+ void (*copy_block)(uint8_t *, uint8_t *))
+{
+ uint64_t upper_mask = 0;
+ uint64_t lower_mask = 0;
+
+ if (count == 0 || count > 128) {
+ return (CRYPTO_MECHANISM_PARAM_INVALID);
+ }
+ /* upper 64 bits of the mask */
+ if (count >= 64) {
+ count -= 64;
+ upper_mask = (count == 64) ? UINT64_MAX : (1ULL << count) - 1;
+ lower_mask = UINT64_MAX;
+ } else {
+ /* now the lower 63 bits */
+ lower_mask = (1ULL << count) - 1;
+ }
+ ctr_ctx->ctr_lower_mask = htonll(lower_mask);
+ ctr_ctx->ctr_upper_mask = htonll(upper_mask);
+
+ copy_block(cb, (uchar_t *)ctr_ctx->ctr_cb);
+ ctr_ctx->ctr_lastp = (uint8_t *)&ctr_ctx->ctr_cb[0];
+ ctr_ctx->ctr_flags |= CTR_MODE;
+ return (CRYPTO_SUCCESS);
+}
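
ctr_init_ctx() splits the requested number of counter bits across the two
64-bit halves of the counter block and stores the resulting masks
byte-swapped; ctr_mode_contiguous_blocks() then increments the low half under
ctr_lower_mask and, when it wraps to zero, carries into the high half under
ctr_upper_mask. The editorial sketch below (not part of the patch) shows the
mask construction in host byte order.

/*
 * Editorial sketch (not part of the patch): how ctr_init_ctx() derives the
 * two 64-bit masks from the number of counter bits; the real code stores
 * them byte-swapped with htonll().
 */
#include <stdint.h>
#include <stdio.h>

static void
ctr_masks_sketch(unsigned count, uint64_t *upper, uint64_t *lower)
{
	*upper = 0;
	*lower = 0;
	if (count >= 64) {
		unsigned upper_bits = count - 64;

		/* Counter spills into the upper word of the block. */
		*upper = (upper_bits == 64) ?
		    UINT64_MAX : (1ULL << upper_bits) - 1;
		*lower = UINT64_MAX;
	} else {
		/* Counter fits entirely in the lower word. */
		*lower = (1ULL << count) - 1;
	}
}

int
main(void)
{
	unsigned counts[] = { 32, 64, 96, 128 };

	for (unsigned i = 0; i < 4; i++) {
		uint64_t up, lo;

		ctr_masks_sketch(counts[i], &up, &lo);
		printf("count %3u: upper %016llx lower %016llx\n",
		    counts[i], (unsigned long long)up, (unsigned long long)lo);
	}
	return (0);
}
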
+
+/* ARGSUSED */
+void *
+ctr_alloc_ctx(int kmflag)
+{
+ ctr_ctx_t *ctr_ctx;
+
+ if ((ctr_ctx = kmem_zalloc(sizeof (ctr_ctx_t), kmflag)) == NULL)
+ return (NULL);
+
+ ctr_ctx->ctr_flags = CTR_MODE;
+ return (ctr_ctx);
+}
diff --git a/sys/contrib/openzfs/module/icp/algs/modes/ecb.c b/sys/contrib/openzfs/module/icp/algs/modes/ecb.c
new file mode 100644
index 000000000000..025f5825cf04
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/algs/modes/ecb.c
@@ -0,0 +1,128 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/zfs_context.h>
+#include <modes/modes.h>
+#include <sys/crypto/common.h>
+#include <sys/crypto/impl.h>
+
+/*
+ * Algorithm independent ECB functions.
+ */
+int
+ecb_cipher_contiguous_blocks(ecb_ctx_t *ctx, char *data, size_t length,
+ crypto_data_t *out, size_t block_size,
+ int (*cipher)(const void *ks, const uint8_t *pt, uint8_t *ct))
+{
+ size_t remainder = length;
+ size_t need = 0;
+ uint8_t *datap = (uint8_t *)data;
+ uint8_t *blockp;
+ uint8_t *lastp;
+ void *iov_or_mp;
+ offset_t offset;
+ uint8_t *out_data_1;
+ uint8_t *out_data_2;
+ size_t out_data_1_len;
+
+ if (length + ctx->ecb_remainder_len < block_size) {
+ /* accumulate bytes here and return */
+ bcopy(datap,
+ (uint8_t *)ctx->ecb_remainder + ctx->ecb_remainder_len,
+ length);
+ ctx->ecb_remainder_len += length;
+ ctx->ecb_copy_to = datap;
+ return (CRYPTO_SUCCESS);
+ }
+
+ lastp = (uint8_t *)ctx->ecb_iv;
+ crypto_init_ptrs(out, &iov_or_mp, &offset);
+
+ do {
+ /* Unprocessed data from last call. */
+ if (ctx->ecb_remainder_len > 0) {
+ need = block_size - ctx->ecb_remainder_len;
+
+ if (need > remainder)
+ return (CRYPTO_DATA_LEN_RANGE);
+
+ bcopy(datap, &((uint8_t *)ctx->ecb_remainder)
+ [ctx->ecb_remainder_len], need);
+
+ blockp = (uint8_t *)ctx->ecb_remainder;
+ } else {
+ blockp = datap;
+ }
+
+ cipher(ctx->ecb_keysched, blockp, lastp);
+ crypto_get_ptrs(out, &iov_or_mp, &offset, &out_data_1,
+ &out_data_1_len, &out_data_2, block_size);
+
+ /* copy block to where it belongs */
+ bcopy(lastp, out_data_1, out_data_1_len);
+ if (out_data_2 != NULL) {
+ bcopy(lastp + out_data_1_len, out_data_2,
+ block_size - out_data_1_len);
+ }
+ /* update offset */
+ out->cd_offset += block_size;
+
+ /* Update pointer to next block of data to be processed. */
+ if (ctx->ecb_remainder_len != 0) {
+ datap += need;
+ ctx->ecb_remainder_len = 0;
+ } else {
+ datap += block_size;
+ }
+
+ remainder = (size_t)&data[length] - (size_t)datap;
+
+ /* Incomplete last block. */
+ if (remainder > 0 && remainder < block_size) {
+ bcopy(datap, ctx->ecb_remainder, remainder);
+ ctx->ecb_remainder_len = remainder;
+ ctx->ecb_copy_to = datap;
+ goto out;
+ }
+ ctx->ecb_copy_to = NULL;
+
+ } while (remainder > 0);
+
+out:
+ return (CRYPTO_SUCCESS);
+}
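
ECB is the degenerate case of the loop structure shared by these files: there
is no chaining at all, so each block is handed straight to the cipher and the
only state kept across calls is the partial-block remainder. The editorial
sketch below (not part of the patch) uses a toy stand-in cipher to show the
resulting property that identical plaintext blocks produce identical
ciphertext blocks.

/*
 * Editorial sketch (not part of the patch): ECB encrypts every block
 * independently.  The cipher is a toy placeholder, not a real cipher.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define	BLK	16

static void
toy_cipher(const uint8_t in[BLK], uint8_t out[BLK])
{
	for (int i = 0; i < BLK; i++)
		out[i] = in[i] ^ 0xa5;	/* placeholder transformation */
}

static void
ecb_sketch(const uint8_t *pt, size_t nblk, uint8_t *ct)
{
	for (size_t b = 0; b < nblk; b++)
		toy_cipher(&pt[b * BLK], &ct[b * BLK]);
}

int
main(void)
{
	uint8_t pt[2 * BLK], ct[2 * BLK];

	memset(pt, 0x11, sizeof (pt));
	ecb_sketch(pt, 2, ct);
	/* Identical plaintext blocks give identical ciphertext blocks. */
	printf("blocks equal: %d\n", memcmp(ct, ct + BLK, BLK) == 0);
	return (0);
}
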
+
+/* ARGSUSED */
+void *
+ecb_alloc_ctx(int kmflag)
+{
+ ecb_ctx_t *ecb_ctx;
+
+ if ((ecb_ctx = kmem_zalloc(sizeof (ecb_ctx_t), kmflag)) == NULL)
+ return (NULL);
+
+ ecb_ctx->ecb_flags = ECB_MODE;
+ return (ecb_ctx);
+}
diff --git a/sys/contrib/openzfs/module/icp/algs/modes/gcm.c b/sys/contrib/openzfs/module/icp/algs/modes/gcm.c
new file mode 100644
index 000000000000..23686c59e8ce
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/algs/modes/gcm.c
@@ -0,0 +1,1587 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <modes/modes.h>
+#include <sys/crypto/common.h>
+#include <sys/crypto/icp.h>
+#include <sys/crypto/impl.h>
+#include <sys/byteorder.h>
+#include <sys/simd.h>
+#include <modes/gcm_impl.h>
+#ifdef CAN_USE_GCM_ASM
+#include <aes/aes_impl.h>
+#endif
+
+#define GHASH(c, d, t, o) \
+ xor_block((uint8_t *)(d), (uint8_t *)(c)->gcm_ghash); \
+ (o)->mul((uint64_t *)(void *)(c)->gcm_ghash, (c)->gcm_H, \
+ (uint64_t *)(void *)(t));
+
+/* Select GCM implementation */
+#define IMPL_FASTEST (UINT32_MAX)
+#define IMPL_CYCLE (UINT32_MAX-1)
+#ifdef CAN_USE_GCM_ASM
+#define IMPL_AVX (UINT32_MAX-2)
+#endif
+#define GCM_IMPL_READ(i) (*(volatile uint32_t *) &(i))
+static uint32_t icp_gcm_impl = IMPL_FASTEST;
+static uint32_t user_sel_impl = IMPL_FASTEST;
+
+#ifdef CAN_USE_GCM_ASM
+/* Does the architecture we run on support the MOVBE instruction? */
+boolean_t gcm_avx_can_use_movbe = B_FALSE;
+/*
+ * Whether to use the optimized openssl gcm and ghash implementations.
+ * Set to true if module parameter icp_gcm_impl == "avx".
+ */
+static boolean_t gcm_use_avx = B_FALSE;
+#define GCM_IMPL_USE_AVX (*(volatile boolean_t *)&gcm_use_avx)
+
+extern boolean_t atomic_toggle_boolean_nv(volatile boolean_t *);
+
+static inline boolean_t gcm_avx_will_work(void);
+static inline void gcm_set_avx(boolean_t);
+static inline boolean_t gcm_toggle_avx(void);
+static inline size_t gcm_simd_get_htab_size(boolean_t);
+
+static int gcm_mode_encrypt_contiguous_blocks_avx(gcm_ctx_t *, char *, size_t,
+ crypto_data_t *, size_t);
+
+static int gcm_encrypt_final_avx(gcm_ctx_t *, crypto_data_t *, size_t);
+static int gcm_decrypt_final_avx(gcm_ctx_t *, crypto_data_t *, size_t);
+static int gcm_init_avx(gcm_ctx_t *, unsigned char *, size_t, unsigned char *,
+ size_t, size_t);
+#endif /* ifdef CAN_USE_GCM_ASM */
+
+/*
+ * Encrypt multiple blocks of data in GCM mode. Decryption for GCM mode
+ * is done in another function.
+ */
+int
+gcm_mode_encrypt_contiguous_blocks(gcm_ctx_t *ctx, char *data, size_t length,
+ crypto_data_t *out, size_t block_size,
+ int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
+ void (*copy_block)(uint8_t *, uint8_t *),
+ void (*xor_block)(uint8_t *, uint8_t *))
+{
+#ifdef CAN_USE_GCM_ASM
+ if (ctx->gcm_use_avx == B_TRUE)
+ return (gcm_mode_encrypt_contiguous_blocks_avx(
+ ctx, data, length, out, block_size));
+#endif
+
+ const gcm_impl_ops_t *gops;
+ size_t remainder = length;
+ size_t need = 0;
+ uint8_t *datap = (uint8_t *)data;
+ uint8_t *blockp;
+ uint8_t *lastp;
+ void *iov_or_mp;
+ offset_t offset;
+ uint8_t *out_data_1;
+ uint8_t *out_data_2;
+ size_t out_data_1_len;
+ uint64_t counter;
+ uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
+
+ if (length + ctx->gcm_remainder_len < block_size) {
+ /* accumulate bytes here and return */
+ bcopy(datap,
+ (uint8_t *)ctx->gcm_remainder + ctx->gcm_remainder_len,
+ length);
+ ctx->gcm_remainder_len += length;
+ if (ctx->gcm_copy_to == NULL) {
+ ctx->gcm_copy_to = datap;
+ }
+ return (CRYPTO_SUCCESS);
+ }
+
+ lastp = (uint8_t *)ctx->gcm_cb;
+ crypto_init_ptrs(out, &iov_or_mp, &offset);
+
+ gops = gcm_impl_get_ops();
+ do {
+ /* Unprocessed data from last call. */
+ if (ctx->gcm_remainder_len > 0) {
+ need = block_size - ctx->gcm_remainder_len;
+
+ if (need > remainder)
+ return (CRYPTO_DATA_LEN_RANGE);
+
+ bcopy(datap, &((uint8_t *)ctx->gcm_remainder)
+ [ctx->gcm_remainder_len], need);
+
+ blockp = (uint8_t *)ctx->gcm_remainder;
+ } else {
+ blockp = datap;
+ }
+
+ /*
+ * Increment counter. Counter bits are confined
+ * to the bottom 32 bits of the counter block.
+ */
+ counter = ntohll(ctx->gcm_cb[1] & counter_mask);
+ counter = htonll(counter + 1);
+ counter &= counter_mask;
+ ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;
+
+ encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_cb,
+ (uint8_t *)ctx->gcm_tmp);
+ xor_block(blockp, (uint8_t *)ctx->gcm_tmp);
+
+ lastp = (uint8_t *)ctx->gcm_tmp;
+
+ ctx->gcm_processed_data_len += block_size;
+
+ crypto_get_ptrs(out, &iov_or_mp, &offset, &out_data_1,
+ &out_data_1_len, &out_data_2, block_size);
+
+ /* copy block to where it belongs */
+ if (out_data_1_len == block_size) {
+ copy_block(lastp, out_data_1);
+ } else {
+ bcopy(lastp, out_data_1, out_data_1_len);
+ if (out_data_2 != NULL) {
+ bcopy(lastp + out_data_1_len,
+ out_data_2,
+ block_size - out_data_1_len);
+ }
+ }
+ /* update offset */
+ out->cd_offset += block_size;
+
+ /* add ciphertext to the hash */
+ GHASH(ctx, ctx->gcm_tmp, ctx->gcm_ghash, gops);
+
+ /* Update pointer to next block of data to be processed. */
+ if (ctx->gcm_remainder_len != 0) {
+ datap += need;
+ ctx->gcm_remainder_len = 0;
+ } else {
+ datap += block_size;
+ }
+
+ remainder = (size_t)&data[length] - (size_t)datap;
+
+ /* Incomplete last block. */
+ if (remainder > 0 && remainder < block_size) {
+ bcopy(datap, ctx->gcm_remainder, remainder);
+ ctx->gcm_remainder_len = remainder;
+ ctx->gcm_copy_to = datap;
+ goto out;
+ }
+ ctx->gcm_copy_to = NULL;
+
+ } while (remainder > 0);
+out:
+ return (CRYPTO_SUCCESS);
+}
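
Unlike CCM's variable-width counter, GCM's counter is always a 32-bit field:
counter_mask selects only the last four bytes of the counter block, so the
increment wraps modulo 2^32 without touching the rest of the block (the inc32
function of NIST SP 800-38D). The editorial sketch below (not part of the
patch) assumes a little-endian host and uses __builtin_bswap64 in place of
ntohll()/htonll().

/*
 * Editorial sketch (not part of the patch): the inc32() step the loop above
 * performs on gcm_cb[1].  Only the low 32 bits of the big-endian counter
 * block are incremented; the rest of the block is left untouched.
 */
#include <stdint.h>
#include <stdio.h>

static uint64_t
gcm_inc32_sketch(uint64_t cb1)
{
	const uint64_t mask = __builtin_bswap64(0x00000000ffffffffULL);
	uint64_t counter = __builtin_bswap64(cb1 & mask);

	counter = __builtin_bswap64(counter + 1) & mask;
	return ((cb1 & ~mask) | counter);
}

int
main(void)
{
	/* Big-endian counter value 0xffffffff in the low 32 bits. */
	uint64_t cb1 = __builtin_bswap64(0x00000000ffffffffULL);

	cb1 = gcm_inc32_sketch(cb1);
	/* Prints 0000000000000000: the 32-bit counter wrapped to zero. */
	printf("%016llx\n", (unsigned long long)__builtin_bswap64(cb1));
	return (0);
}
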
+
+/* ARGSUSED */
+int
+gcm_encrypt_final(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size,
+ int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
+ void (*copy_block)(uint8_t *, uint8_t *),
+ void (*xor_block)(uint8_t *, uint8_t *))
+{
+#ifdef CAN_USE_GCM_ASM
+ if (ctx->gcm_use_avx == B_TRUE)
+ return (gcm_encrypt_final_avx(ctx, out, block_size));
+#endif
+
+ const gcm_impl_ops_t *gops;
+ uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
+ uint8_t *ghash, *macp = NULL;
+ int i, rv;
+
+ if (out->cd_length <
+ (ctx->gcm_remainder_len + ctx->gcm_tag_len)) {
+ return (CRYPTO_DATA_LEN_RANGE);
+ }
+
+ gops = gcm_impl_get_ops();
+ ghash = (uint8_t *)ctx->gcm_ghash;
+
+ if (ctx->gcm_remainder_len > 0) {
+ uint64_t counter;
+ uint8_t *tmpp = (uint8_t *)ctx->gcm_tmp;
+
+ /*
+ * Here is where we deal with data that is not a
+ * multiple of the block size.
+ */
+
+ /*
+ * Increment counter.
+ */
+ counter = ntohll(ctx->gcm_cb[1] & counter_mask);
+ counter = htonll(counter + 1);
+ counter &= counter_mask;
+ ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;
+
+ encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_cb,
+ (uint8_t *)ctx->gcm_tmp);
+
+ macp = (uint8_t *)ctx->gcm_remainder;
+ bzero(macp + ctx->gcm_remainder_len,
+ block_size - ctx->gcm_remainder_len);
+
+ /* XOR with counter block */
+ for (i = 0; i < ctx->gcm_remainder_len; i++) {
+ macp[i] ^= tmpp[i];
+ }
+
+ /* add ciphertext to the hash */
+ GHASH(ctx, macp, ghash, gops);
+
+ ctx->gcm_processed_data_len += ctx->gcm_remainder_len;
+ }
+
+ ctx->gcm_len_a_len_c[1] =
+ htonll(CRYPTO_BYTES2BITS(ctx->gcm_processed_data_len));
+ GHASH(ctx, ctx->gcm_len_a_len_c, ghash, gops);
+ encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_J0,
+ (uint8_t *)ctx->gcm_J0);
+ xor_block((uint8_t *)ctx->gcm_J0, ghash);
+
+ if (ctx->gcm_remainder_len > 0) {
+ rv = crypto_put_output_data(macp, out, ctx->gcm_remainder_len);
+ if (rv != CRYPTO_SUCCESS)
+ return (rv);
+ }
+ out->cd_offset += ctx->gcm_remainder_len;
+ ctx->gcm_remainder_len = 0;
+ rv = crypto_put_output_data(ghash, out, ctx->gcm_tag_len);
+ if (rv != CRYPTO_SUCCESS)
+ return (rv);
+ out->cd_offset += ctx->gcm_tag_len;
+
+ return (CRYPTO_SUCCESS);
+}
+
+/*
+ * This will only deal with decrypting the last block of the input that
+ * might not be a multiple of block length.
+ */
+static void
+gcm_decrypt_incomplete_block(gcm_ctx_t *ctx, size_t block_size, size_t index,
+ int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
+ void (*xor_block)(uint8_t *, uint8_t *))
+{
+ uint8_t *datap, *outp, *counterp;
+ uint64_t counter;
+ uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
+ int i;
+
+ /*
+ * Increment counter.
+ * Counter bits are confined to the bottom 32 bits
+ */
+ counter = ntohll(ctx->gcm_cb[1] & counter_mask);
+ counter = htonll(counter + 1);
+ counter &= counter_mask;
+ ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;
+
+ datap = (uint8_t *)ctx->gcm_remainder;
+ outp = &((ctx->gcm_pt_buf)[index]);
+ counterp = (uint8_t *)ctx->gcm_tmp;
+
+ /* authentication tag */
+ bzero((uint8_t *)ctx->gcm_tmp, block_size);
+ bcopy(datap, (uint8_t *)ctx->gcm_tmp, ctx->gcm_remainder_len);
+
+ /* add ciphertext to the hash */
+ GHASH(ctx, ctx->gcm_tmp, ctx->gcm_ghash, gcm_impl_get_ops());
+
+ /* decrypt remaining ciphertext */
+ encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_cb, counterp);
+
+ /* XOR with counter block */
+ for (i = 0; i < ctx->gcm_remainder_len; i++) {
+ outp[i] = datap[i] ^ counterp[i];
+ }
+}
+
+/* ARGSUSED */
+int
+gcm_mode_decrypt_contiguous_blocks(gcm_ctx_t *ctx, char *data, size_t length,
+ crypto_data_t *out, size_t block_size,
+ int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
+ void (*copy_block)(uint8_t *, uint8_t *),
+ void (*xor_block)(uint8_t *, uint8_t *))
+{
+ size_t new_len;
+ uint8_t *new;
+
+ /*
+ * Copy contiguous ciphertext input blocks to plaintext buffer.
+	 * Ciphertext will be decrypted in gcm_decrypt_final().
+ */
+ if (length > 0) {
+ new_len = ctx->gcm_pt_buf_len + length;
+ new = vmem_alloc(new_len, ctx->gcm_kmflag);
+ if (new == NULL) {
+ vmem_free(ctx->gcm_pt_buf, ctx->gcm_pt_buf_len);
+ ctx->gcm_pt_buf = NULL;
+ return (CRYPTO_HOST_MEMORY);
+ }
+ bcopy(ctx->gcm_pt_buf, new, ctx->gcm_pt_buf_len);
+ vmem_free(ctx->gcm_pt_buf, ctx->gcm_pt_buf_len);
+ ctx->gcm_pt_buf = new;
+ ctx->gcm_pt_buf_len = new_len;
+ bcopy(data, &ctx->gcm_pt_buf[ctx->gcm_processed_data_len],
+ length);
+ ctx->gcm_processed_data_len += length;
+ }
+
+ ctx->gcm_remainder_len = 0;
+ return (CRYPTO_SUCCESS);
+}
+
+int
+gcm_decrypt_final(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size,
+ int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
+ void (*xor_block)(uint8_t *, uint8_t *))
+{
+#ifdef CAN_USE_GCM_ASM
+ if (ctx->gcm_use_avx == B_TRUE)
+ return (gcm_decrypt_final_avx(ctx, out, block_size));
+#endif
+
+ const gcm_impl_ops_t *gops;
+ size_t pt_len;
+ size_t remainder;
+ uint8_t *ghash;
+ uint8_t *blockp;
+ uint8_t *cbp;
+ uint64_t counter;
+ uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
+ int processed = 0, rv;
+
+ ASSERT(ctx->gcm_processed_data_len == ctx->gcm_pt_buf_len);
+
+ gops = gcm_impl_get_ops();
+ pt_len = ctx->gcm_processed_data_len - ctx->gcm_tag_len;
+ ghash = (uint8_t *)ctx->gcm_ghash;
+ blockp = ctx->gcm_pt_buf;
+ remainder = pt_len;
+ while (remainder > 0) {
+ /* Incomplete last block */
+ if (remainder < block_size) {
+ bcopy(blockp, ctx->gcm_remainder, remainder);
+ ctx->gcm_remainder_len = remainder;
+ /*
+			 * Not expecting any more ciphertext; just
+			 * compute the plaintext for the remaining input.
+ */
+ gcm_decrypt_incomplete_block(ctx, block_size,
+ processed, encrypt_block, xor_block);
+ ctx->gcm_remainder_len = 0;
+ goto out;
+ }
+ /* add ciphertext to the hash */
+ GHASH(ctx, blockp, ghash, gops);
+
+ /*
+ * Increment counter.
+ * Counter bits are confined to the bottom 32 bits
+ */
+ counter = ntohll(ctx->gcm_cb[1] & counter_mask);
+ counter = htonll(counter + 1);
+ counter &= counter_mask;
+ ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;
+
+ cbp = (uint8_t *)ctx->gcm_tmp;
+ encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_cb, cbp);
+
+ /* XOR with ciphertext */
+ xor_block(cbp, blockp);
+
+ processed += block_size;
+ blockp += block_size;
+ remainder -= block_size;
+ }
+out:
+ ctx->gcm_len_a_len_c[1] = htonll(CRYPTO_BYTES2BITS(pt_len));
+ GHASH(ctx, ctx->gcm_len_a_len_c, ghash, gops);
+ encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_J0,
+ (uint8_t *)ctx->gcm_J0);
+ xor_block((uint8_t *)ctx->gcm_J0, ghash);
+
+ /* compare the input authentication tag with what we calculated */
+ if (bcmp(&ctx->gcm_pt_buf[pt_len], ghash, ctx->gcm_tag_len)) {
+ /* They don't match */
+ return (CRYPTO_INVALID_MAC);
+ } else {
+ rv = crypto_put_output_data(ctx->gcm_pt_buf, out, pt_len);
+ if (rv != CRYPTO_SUCCESS)
+ return (rv);
+ out->cd_offset += pt_len;
+ }
+ return (CRYPTO_SUCCESS);
+}
+
+static int
+gcm_validate_args(CK_AES_GCM_PARAMS *gcm_param)
+{
+ size_t tag_len;
+
+ /*
+ * Check the length of the authentication tag (in bits).
+ */
+ tag_len = gcm_param->ulTagBits;
+ switch (tag_len) {
+ case 32:
+ case 64:
+ case 96:
+ case 104:
+ case 112:
+ case 120:
+ case 128:
+ break;
+ default:
+ return (CRYPTO_MECHANISM_PARAM_INVALID);
+ }
+
+ if (gcm_param->ulIvLen == 0)
+ return (CRYPTO_MECHANISM_PARAM_INVALID);
+
+ return (CRYPTO_SUCCESS);
+}
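+
+/*
+ * A minimal sketch (compiled out, for illustration only) of how a caller
+ * might populate CK_AES_GCM_PARAMS before the validation above is reached.
+ * It assumes the PKCS#11-style field types match those referenced in this
+ * file.
+ */
+#if 0
+static int
+example_gcm_params(CK_AES_GCM_PARAMS *p, uchar_t *iv, ulong_t iv_len,
+    uchar_t *aad, ulong_t aad_len)
+{
+	p->pIv = iv;
+	p->ulIvLen = iv_len;		/* must be non-zero */
+	p->pAAD = aad;
+	p->ulAADLen = aad_len;
+	p->ulTagBits = 128;		/* one of 32, 64, 96, 104, ..., 128 */
+	return (gcm_validate_args(p));
+}
+#endif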
+
+static void
+gcm_format_initial_blocks(uchar_t *iv, ulong_t iv_len,
+ gcm_ctx_t *ctx, size_t block_size,
+ void (*copy_block)(uint8_t *, uint8_t *),
+ void (*xor_block)(uint8_t *, uint8_t *))
+{
+ const gcm_impl_ops_t *gops;
+ uint8_t *cb;
+ ulong_t remainder = iv_len;
+ ulong_t processed = 0;
+ uint8_t *datap, *ghash;
+ uint64_t len_a_len_c[2];
+
+ gops = gcm_impl_get_ops();
+ ghash = (uint8_t *)ctx->gcm_ghash;
+ cb = (uint8_t *)ctx->gcm_cb;
+ if (iv_len == 12) {
+ bcopy(iv, cb, 12);
+ cb[12] = 0;
+ cb[13] = 0;
+ cb[14] = 0;
+ cb[15] = 1;
+ /* J0 will be used again in the final */
+ copy_block(cb, (uint8_t *)ctx->gcm_J0);
+ } else {
+ /* GHASH the IV */
+ do {
+ if (remainder < block_size) {
+ bzero(cb, block_size);
+ bcopy(&(iv[processed]), cb, remainder);
+ datap = (uint8_t *)cb;
+ remainder = 0;
+ } else {
+ datap = (uint8_t *)(&(iv[processed]));
+ processed += block_size;
+ remainder -= block_size;
+ }
+ GHASH(ctx, datap, ghash, gops);
+ } while (remainder > 0);
+
+ len_a_len_c[0] = 0;
+ len_a_len_c[1] = htonll(CRYPTO_BYTES2BITS(iv_len));
+ GHASH(ctx, len_a_len_c, ctx->gcm_J0, gops);
+
+ /* J0 will be used again in the final */
+ copy_block((uint8_t *)ctx->gcm_J0, (uint8_t *)cb);
+ }
+}
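+
+/*
+ * For reference, the construction above follows NIST SP 800-38D: with a
+ * 96-bit IV the pre-counter block is simply J0 = IV || 0^31 || 1; otherwise
+ * J0 = GHASH(IV zero padded to a block boundary || 0^64 || [len(IV)]64).
+ * A standalone sketch of the 96-bit case (compiled out, illustrative only):
+ */
+#if 0
+static void
+example_j0_from_96bit_iv(const uint8_t iv[12], uint8_t j0[16])
+{
+	bcopy(iv, j0, 12);	/* the first 96 bits are the IV itself */
+	j0[12] = 0;
+	j0[13] = 0;
+	j0[14] = 0;
+	j0[15] = 1;		/* the 32-bit block counter starts at one */
+}
+#endif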
+
+static int
+gcm_init(gcm_ctx_t *ctx, unsigned char *iv, size_t iv_len,
+ unsigned char *auth_data, size_t auth_data_len, size_t block_size,
+ int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
+ void (*copy_block)(uint8_t *, uint8_t *),
+ void (*xor_block)(uint8_t *, uint8_t *))
+{
+ const gcm_impl_ops_t *gops;
+ uint8_t *ghash, *datap, *authp;
+ size_t remainder, processed;
+
+ /* encrypt zero block to get subkey H */
+ bzero(ctx->gcm_H, sizeof (ctx->gcm_H));
+ encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_H,
+ (uint8_t *)ctx->gcm_H);
+
+ gcm_format_initial_blocks(iv, iv_len, ctx, block_size,
+ copy_block, xor_block);
+
+ gops = gcm_impl_get_ops();
+ authp = (uint8_t *)ctx->gcm_tmp;
+ ghash = (uint8_t *)ctx->gcm_ghash;
+ bzero(authp, block_size);
+ bzero(ghash, block_size);
+
+ processed = 0;
+ remainder = auth_data_len;
+ do {
+ if (remainder < block_size) {
+ /*
+ * There's not a block full of data, pad rest of
+ * buffer with zero
+ */
+ bzero(authp, block_size);
+ bcopy(&(auth_data[processed]), authp, remainder);
+ datap = (uint8_t *)authp;
+ remainder = 0;
+ } else {
+ datap = (uint8_t *)(&(auth_data[processed]));
+ processed += block_size;
+ remainder -= block_size;
+ }
+
+ /* add auth data to the hash */
+ GHASH(ctx, datap, ghash, gops);
+
+ } while (remainder > 0);
+
+ return (CRYPTO_SUCCESS);
+}
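+
+/*
+ * For reference: with the hash subkey H = E_K(0^128) computed above, each
+ * GHASH() invocation performs one step of
+ *
+ *	Y_0 = 0,	Y_i = (Y_{i-1} xor X_i) * H	in GF(2^128),
+ *
+ * accumulating the result in ctx->gcm_ghash -- here over the AAD blocks,
+ * later over the ciphertext blocks.
+ */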
+
+/*
+ * The following function is called at encrypt or decrypt init time
+ * for AES GCM mode.
+ *
+ * Init the GCM context struct. Handle the cycle and avx implementations here.
+ */
+int
+gcm_init_ctx(gcm_ctx_t *gcm_ctx, char *param, size_t block_size,
+ int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
+ void (*copy_block)(uint8_t *, uint8_t *),
+ void (*xor_block)(uint8_t *, uint8_t *))
+{
+ int rv;
+ CK_AES_GCM_PARAMS *gcm_param;
+
+ if (param != NULL) {
+ gcm_param = (CK_AES_GCM_PARAMS *)(void *)param;
+
+ if ((rv = gcm_validate_args(gcm_param)) != 0) {
+ return (rv);
+ }
+
+ gcm_ctx->gcm_tag_len = gcm_param->ulTagBits;
+ gcm_ctx->gcm_tag_len >>= 3;
+ gcm_ctx->gcm_processed_data_len = 0;
+
+ /* these values are in bits */
+ gcm_ctx->gcm_len_a_len_c[0]
+ = htonll(CRYPTO_BYTES2BITS(gcm_param->ulAADLen));
+
+ rv = CRYPTO_SUCCESS;
+ gcm_ctx->gcm_flags |= GCM_MODE;
+ } else {
+ return (CRYPTO_MECHANISM_PARAM_INVALID);
+ }
+
+#ifdef CAN_USE_GCM_ASM
+ if (GCM_IMPL_READ(icp_gcm_impl) != IMPL_CYCLE) {
+ gcm_ctx->gcm_use_avx = GCM_IMPL_USE_AVX;
+ } else {
+ /*
+ * Handle the "cycle" implementation by creating avx and
+ * non-avx contexts alternately.
+ */
+ gcm_ctx->gcm_use_avx = gcm_toggle_avx();
+ /*
+ * We don't handle byte swapped key schedules in the avx
+ * code path.
+ */
+ aes_key_t *ks = (aes_key_t *)gcm_ctx->gcm_keysched;
+ if (ks->ops->needs_byteswap == B_TRUE) {
+ gcm_ctx->gcm_use_avx = B_FALSE;
+ }
+ /* Use the MOVBE and the BSWAP variants alternately. */
+ if (gcm_ctx->gcm_use_avx == B_TRUE &&
+ zfs_movbe_available() == B_TRUE) {
+ (void) atomic_toggle_boolean_nv(
+ (volatile boolean_t *)&gcm_avx_can_use_movbe);
+ }
+ }
+ /* Allocate Htab memory as needed. */
+ if (gcm_ctx->gcm_use_avx == B_TRUE) {
+ size_t htab_len = gcm_simd_get_htab_size(gcm_ctx->gcm_use_avx);
+
+ if (htab_len == 0) {
+ return (CRYPTO_MECHANISM_PARAM_INVALID);
+ }
+ gcm_ctx->gcm_htab_len = htab_len;
+ gcm_ctx->gcm_Htable =
+ (uint64_t *)kmem_alloc(htab_len, gcm_ctx->gcm_kmflag);
+
+ if (gcm_ctx->gcm_Htable == NULL) {
+ return (CRYPTO_HOST_MEMORY);
+ }
+ }
+	/* Avx and non-avx context initialization differ from here on. */
+ if (gcm_ctx->gcm_use_avx == B_FALSE) {
+#endif /* ifdef CAN_USE_GCM_ASM */
+ if (gcm_init(gcm_ctx, gcm_param->pIv, gcm_param->ulIvLen,
+ gcm_param->pAAD, gcm_param->ulAADLen, block_size,
+ encrypt_block, copy_block, xor_block) != 0) {
+ rv = CRYPTO_MECHANISM_PARAM_INVALID;
+ }
+#ifdef CAN_USE_GCM_ASM
+ } else {
+ if (gcm_init_avx(gcm_ctx, gcm_param->pIv, gcm_param->ulIvLen,
+ gcm_param->pAAD, gcm_param->ulAADLen, block_size) != 0) {
+ rv = CRYPTO_MECHANISM_PARAM_INVALID;
+ }
+ }
+#endif /* ifdef CAN_USE_GCM_ASM */
+
+ return (rv);
+}
+
+int
+gmac_init_ctx(gcm_ctx_t *gcm_ctx, char *param, size_t block_size,
+ int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
+ void (*copy_block)(uint8_t *, uint8_t *),
+ void (*xor_block)(uint8_t *, uint8_t *))
+{
+ int rv;
+ CK_AES_GMAC_PARAMS *gmac_param;
+
+ if (param != NULL) {
+ gmac_param = (CK_AES_GMAC_PARAMS *)(void *)param;
+
+ gcm_ctx->gcm_tag_len = CRYPTO_BITS2BYTES(AES_GMAC_TAG_BITS);
+ gcm_ctx->gcm_processed_data_len = 0;
+
+ /* these values are in bits */
+ gcm_ctx->gcm_len_a_len_c[0]
+ = htonll(CRYPTO_BYTES2BITS(gmac_param->ulAADLen));
+
+ rv = CRYPTO_SUCCESS;
+ gcm_ctx->gcm_flags |= GMAC_MODE;
+ } else {
+ return (CRYPTO_MECHANISM_PARAM_INVALID);
+ }
+
+#ifdef CAN_USE_GCM_ASM
+ /*
+ * Handle the "cycle" implementation by creating avx and non avx
+ * contexts alternately.
+ */
+ if (GCM_IMPL_READ(icp_gcm_impl) != IMPL_CYCLE) {
+ gcm_ctx->gcm_use_avx = GCM_IMPL_USE_AVX;
+ } else {
+ gcm_ctx->gcm_use_avx = gcm_toggle_avx();
+ }
+ /* We don't handle byte swapped key schedules in the avx code path. */
+ aes_key_t *ks = (aes_key_t *)gcm_ctx->gcm_keysched;
+ if (ks->ops->needs_byteswap == B_TRUE) {
+ gcm_ctx->gcm_use_avx = B_FALSE;
+ }
+ /* Allocate Htab memory as needed. */
+ if (gcm_ctx->gcm_use_avx == B_TRUE) {
+ size_t htab_len = gcm_simd_get_htab_size(gcm_ctx->gcm_use_avx);
+
+ if (htab_len == 0) {
+ return (CRYPTO_MECHANISM_PARAM_INVALID);
+ }
+ gcm_ctx->gcm_htab_len = htab_len;
+ gcm_ctx->gcm_Htable =
+ (uint64_t *)kmem_alloc(htab_len, gcm_ctx->gcm_kmflag);
+
+ if (gcm_ctx->gcm_Htable == NULL) {
+ return (CRYPTO_HOST_MEMORY);
+ }
+ }
+
+	/* Avx and non-avx context initialization differ from here on. */
+ if (gcm_ctx->gcm_use_avx == B_FALSE) {
+#endif /* ifdef CAN_USE_GCM_ASM */
+ if (gcm_init(gcm_ctx, gmac_param->pIv, AES_GMAC_IV_LEN,
+ gmac_param->pAAD, gmac_param->ulAADLen, block_size,
+ encrypt_block, copy_block, xor_block) != 0) {
+ rv = CRYPTO_MECHANISM_PARAM_INVALID;
+ }
+#ifdef CAN_USE_GCM_ASM
+ } else {
+ if (gcm_init_avx(gcm_ctx, gmac_param->pIv, AES_GMAC_IV_LEN,
+ gmac_param->pAAD, gmac_param->ulAADLen, block_size) != 0) {
+ rv = CRYPTO_MECHANISM_PARAM_INVALID;
+ }
+ }
+#endif /* ifdef CAN_USE_GCM_ASM */
+
+ return (rv);
+}
+
+void *
+gcm_alloc_ctx(int kmflag)
+{
+ gcm_ctx_t *gcm_ctx;
+
+ if ((gcm_ctx = kmem_zalloc(sizeof (gcm_ctx_t), kmflag)) == NULL)
+ return (NULL);
+
+ gcm_ctx->gcm_flags = GCM_MODE;
+ return (gcm_ctx);
+}
+
+void *
+gmac_alloc_ctx(int kmflag)
+{
+ gcm_ctx_t *gcm_ctx;
+
+ if ((gcm_ctx = kmem_zalloc(sizeof (gcm_ctx_t), kmflag)) == NULL)
+ return (NULL);
+
+ gcm_ctx->gcm_flags = GMAC_MODE;
+ return (gcm_ctx);
+}
+
+void
+gcm_set_kmflag(gcm_ctx_t *ctx, int kmflag)
+{
+ ctx->gcm_kmflag = kmflag;
+}
+
+/* GCM implementation that contains the fastest methods */
+static gcm_impl_ops_t gcm_fastest_impl = {
+ .name = "fastest"
+};
+
+/* All compiled in implementations */
+const gcm_impl_ops_t *gcm_all_impl[] = {
+ &gcm_generic_impl,
+#if defined(__x86_64) && defined(HAVE_PCLMULQDQ)
+ &gcm_pclmulqdq_impl,
+#endif
+};
+
+/* Indicates that gcm_impl_init() has completed */
+static boolean_t gcm_impl_initialized = B_FALSE;
+
+/* Hold all supported implementations */
+static size_t gcm_supp_impl_cnt = 0;
+static gcm_impl_ops_t *gcm_supp_impl[ARRAY_SIZE(gcm_all_impl)];
+
+/*
+ * Returns the GCM operations for encrypt/decrypt/key setup.  When a
+ * SIMD implementation is not allowed in the current context, fall back
+ * to the generic implementation.
+ */
+const gcm_impl_ops_t *
+gcm_impl_get_ops()
+{
+ if (!kfpu_allowed())
+ return (&gcm_generic_impl);
+
+ const gcm_impl_ops_t *ops = NULL;
+ const uint32_t impl = GCM_IMPL_READ(icp_gcm_impl);
+
+ switch (impl) {
+ case IMPL_FASTEST:
+ ASSERT(gcm_impl_initialized);
+ ops = &gcm_fastest_impl;
+ break;
+ case IMPL_CYCLE:
+ /* Cycle through supported implementations */
+ ASSERT(gcm_impl_initialized);
+ ASSERT3U(gcm_supp_impl_cnt, >, 0);
+ static size_t cycle_impl_idx = 0;
+ size_t idx = (++cycle_impl_idx) % gcm_supp_impl_cnt;
+ ops = gcm_supp_impl[idx];
+ break;
+#ifdef CAN_USE_GCM_ASM
+ case IMPL_AVX:
+ /*
+ * Make sure that we return a valid implementation while
+		 * switching to the avx implementation, since there may
+		 * still be unfinished non-avx contexts around.
+ */
+ ops = &gcm_generic_impl;
+ break;
+#endif
+ default:
+ ASSERT3U(impl, <, gcm_supp_impl_cnt);
+ ASSERT3U(gcm_supp_impl_cnt, >, 0);
+ if (impl < ARRAY_SIZE(gcm_all_impl))
+ ops = gcm_supp_impl[impl];
+ break;
+ }
+
+ ASSERT3P(ops, !=, NULL);
+
+ return (ops);
+}
+
+/*
+ * Initialize all supported implementations.
+ */
+void
+gcm_impl_init(void)
+{
+ gcm_impl_ops_t *curr_impl;
+ int i, c;
+
+	/* Move supported implementations into gcm_supp_impl */
+ for (i = 0, c = 0; i < ARRAY_SIZE(gcm_all_impl); i++) {
+ curr_impl = (gcm_impl_ops_t *)gcm_all_impl[i];
+
+ if (curr_impl->is_supported())
+ gcm_supp_impl[c++] = (gcm_impl_ops_t *)curr_impl;
+ }
+ gcm_supp_impl_cnt = c;
+
+ /*
+ * Set the fastest implementation given the assumption that the
+ * hardware accelerated version is the fastest.
+ */
+#if defined(__x86_64) && defined(HAVE_PCLMULQDQ)
+ if (gcm_pclmulqdq_impl.is_supported()) {
+ memcpy(&gcm_fastest_impl, &gcm_pclmulqdq_impl,
+ sizeof (gcm_fastest_impl));
+ } else
+#endif
+ {
+ memcpy(&gcm_fastest_impl, &gcm_generic_impl,
+ sizeof (gcm_fastest_impl));
+ }
+
+ strlcpy(gcm_fastest_impl.name, "fastest", GCM_IMPL_NAME_MAX);
+
+#ifdef CAN_USE_GCM_ASM
+ /*
+ * Use the avx implementation if it's available and the implementation
+ * hasn't changed from its default value of fastest on module load.
+ */
+ if (gcm_avx_will_work()) {
+#ifdef HAVE_MOVBE
+ if (zfs_movbe_available() == B_TRUE) {
+ atomic_swap_32(&gcm_avx_can_use_movbe, B_TRUE);
+ }
+#endif
+ if (GCM_IMPL_READ(user_sel_impl) == IMPL_FASTEST) {
+ gcm_set_avx(B_TRUE);
+ }
+ }
+#endif
+ /* Finish initialization */
+ atomic_swap_32(&icp_gcm_impl, user_sel_impl);
+ gcm_impl_initialized = B_TRUE;
+}
+
+static const struct {
+ char *name;
+ uint32_t sel;
+} gcm_impl_opts[] = {
+ { "cycle", IMPL_CYCLE },
+ { "fastest", IMPL_FASTEST },
+#ifdef CAN_USE_GCM_ASM
+ { "avx", IMPL_AVX },
+#endif
+};
+
+/*
+ * Set the desired gcm implementation.
+ *
+ * If we are called before init(), the user preference is saved in
+ * user_sel_impl and applied in a later init() call.  This happens when the
+ * module parameter is specified on module load.  Otherwise, icp_gcm_impl is
+ * updated directly.
+ *
+ * @val Name of the gcm implementation to use
+ */
+int
+gcm_impl_set(const char *val)
+{
+ int err = -EINVAL;
+ char req_name[GCM_IMPL_NAME_MAX];
+ uint32_t impl = GCM_IMPL_READ(user_sel_impl);
+ size_t i;
+
+ /* sanitize input */
+ i = strnlen(val, GCM_IMPL_NAME_MAX);
+ if (i == 0 || i >= GCM_IMPL_NAME_MAX)
+ return (err);
+
+ strlcpy(req_name, val, GCM_IMPL_NAME_MAX);
+ while (i > 0 && isspace(req_name[i-1]))
+ i--;
+ req_name[i] = '\0';
+
+ /* Check mandatory options */
+ for (i = 0; i < ARRAY_SIZE(gcm_impl_opts); i++) {
+#ifdef CAN_USE_GCM_ASM
+ /* Ignore avx implementation if it won't work. */
+ if (gcm_impl_opts[i].sel == IMPL_AVX && !gcm_avx_will_work()) {
+ continue;
+ }
+#endif
+ if (strcmp(req_name, gcm_impl_opts[i].name) == 0) {
+ impl = gcm_impl_opts[i].sel;
+ err = 0;
+ break;
+ }
+ }
+
+ /* check all supported impl if init() was already called */
+ if (err != 0 && gcm_impl_initialized) {
+ /* check all supported implementations */
+ for (i = 0; i < gcm_supp_impl_cnt; i++) {
+ if (strcmp(req_name, gcm_supp_impl[i]->name) == 0) {
+ impl = i;
+ err = 0;
+ break;
+ }
+ }
+ }
+#ifdef CAN_USE_GCM_ASM
+ /*
+ * Use the avx implementation if available and the requested one is
+ * avx or fastest.
+ */
+ if (gcm_avx_will_work() == B_TRUE &&
+ (impl == IMPL_AVX || impl == IMPL_FASTEST)) {
+ gcm_set_avx(B_TRUE);
+ } else {
+ gcm_set_avx(B_FALSE);
+ }
+#endif
+
+ if (err == 0) {
+ if (gcm_impl_initialized)
+ atomic_swap_32(&icp_gcm_impl, impl);
+ else
+ atomic_swap_32(&user_sel_impl, impl);
+ }
+
+ return (err);
+}
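+
+/*
+ * Usage sketch: on Linux this setter is reached through the icp_gcm_impl
+ * module parameter registered below, e.g. "modprobe icp icp_gcm_impl=avx"
+ * at load time or, assuming the module is loaded as icp.ko and the usual
+ * sysfs layout for module parameters, by writing one of the accepted names
+ * ("cycle", "fastest", "avx", or a compiled-in implementation such as
+ * "generic") to /sys/module/icp/parameters/icp_gcm_impl at runtime.
+ */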
+
+#if defined(_KERNEL) && defined(__linux__)
+
+static int
+icp_gcm_impl_set(const char *val, zfs_kernel_param_t *kp)
+{
+ return (gcm_impl_set(val));
+}
+
+static int
+icp_gcm_impl_get(char *buffer, zfs_kernel_param_t *kp)
+{
+ int i, cnt = 0;
+ char *fmt;
+ const uint32_t impl = GCM_IMPL_READ(icp_gcm_impl);
+
+ ASSERT(gcm_impl_initialized);
+
+ /* list mandatory options */
+ for (i = 0; i < ARRAY_SIZE(gcm_impl_opts); i++) {
+#ifdef CAN_USE_GCM_ASM
+ /* Ignore avx implementation if it won't work. */
+ if (gcm_impl_opts[i].sel == IMPL_AVX && !gcm_avx_will_work()) {
+ continue;
+ }
+#endif
+ fmt = (impl == gcm_impl_opts[i].sel) ? "[%s] " : "%s ";
+ cnt += sprintf(buffer + cnt, fmt, gcm_impl_opts[i].name);
+ }
+
+ /* list all supported implementations */
+ for (i = 0; i < gcm_supp_impl_cnt; i++) {
+ fmt = (i == impl) ? "[%s] " : "%s ";
+ cnt += sprintf(buffer + cnt, fmt, gcm_supp_impl[i]->name);
+ }
+
+ return (cnt);
+}
+
+module_param_call(icp_gcm_impl, icp_gcm_impl_set, icp_gcm_impl_get,
+ NULL, 0644);
+MODULE_PARM_DESC(icp_gcm_impl, "Select gcm implementation.");
+#endif /* defined(_KERNEL) && defined(__linux__) */
+
+#ifdef CAN_USE_GCM_ASM
+#define GCM_BLOCK_LEN 16
+/*
+ * The openssl asm routines are 6x aggregated and need that many bytes
+ * at minimum.
+ */
+#define GCM_AVX_MIN_DECRYPT_BYTES (GCM_BLOCK_LEN * 6)
+#define GCM_AVX_MIN_ENCRYPT_BYTES (GCM_BLOCK_LEN * 6 * 3)
+/*
+ * Ensure the chunk size is reasonable, since we are allocating a buffer of
+ * up to GCM_AVX_MAX_CHUNK_SIZE bytes and disabling preemption and interrupts.
+ */
+#define GCM_AVX_MAX_CHUNK_SIZE \
+ (((128*1024)/GCM_AVX_MIN_DECRYPT_BYTES) * GCM_AVX_MIN_DECRYPT_BYTES)
+
+/* Get the chunk size module parameter. */
+#define GCM_CHUNK_SIZE_READ *(volatile uint32_t *) &gcm_avx_chunk_size
+
+/* Clear the FPU registers since they hold sensitive internal state. */
+#define clear_fpu_regs() clear_fpu_regs_avx()
+#define GHASH_AVX(ctx, in, len) \
+ gcm_ghash_avx((ctx)->gcm_ghash, (const uint64_t *)(ctx)->gcm_Htable, \
+ in, len)
+
+#define gcm_incr_counter_block(ctx) gcm_incr_counter_block_by(ctx, 1)
+
+/*
+ * Module parameter: number of bytes to process at once while owning the FPU.
+ * It is rounded down to the nearest GCM_AVX_MIN_DECRYPT_BYTES boundary and
+ * guaranteed to be greater than or equal to GCM_AVX_MIN_DECRYPT_BYTES.
+ */
+static uint32_t gcm_avx_chunk_size =
+ ((32 * 1024) / GCM_AVX_MIN_DECRYPT_BYTES) * GCM_AVX_MIN_DECRYPT_BYTES;
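+
+/*
+ * Worked example of the rounding above: GCM_AVX_MIN_DECRYPT_BYTES is
+ * 16 * 6 = 96, so the default chunk size is (32768 / 96) * 96 = 32736
+ * bytes and GCM_AVX_MAX_CHUNK_SIZE is (131072 / 96) * 96 = 131040 bytes.
+ */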
+
+extern void clear_fpu_regs_avx(void);
+extern void gcm_xor_avx(const uint8_t *src, uint8_t *dst);
+extern void aes_encrypt_intel(const uint32_t rk[], int nr,
+ const uint32_t pt[4], uint32_t ct[4]);
+
+extern void gcm_init_htab_avx(uint64_t *Htable, const uint64_t H[2]);
+extern void gcm_ghash_avx(uint64_t ghash[2], const uint64_t *Htable,
+ const uint8_t *in, size_t len);
+
+extern size_t aesni_gcm_encrypt(const uint8_t *, uint8_t *, size_t,
+ const void *, uint64_t *, uint64_t *);
+
+extern size_t aesni_gcm_decrypt(const uint8_t *, uint8_t *, size_t,
+ const void *, uint64_t *, uint64_t *);
+
+static inline boolean_t
+gcm_avx_will_work(void)
+{
+ /* Avx should imply aes-ni and pclmulqdq, but make sure anyhow. */
+ return (kfpu_allowed() &&
+ zfs_avx_available() && zfs_aes_available() &&
+ zfs_pclmulqdq_available());
+}
+
+static inline void
+gcm_set_avx(boolean_t val)
+{
+ if (gcm_avx_will_work() == B_TRUE) {
+ atomic_swap_32(&gcm_use_avx, val);
+ }
+}
+
+static inline boolean_t
+gcm_toggle_avx(void)
+{
+ if (gcm_avx_will_work() == B_TRUE) {
+ return (atomic_toggle_boolean_nv(&GCM_IMPL_USE_AVX));
+ } else {
+ return (B_FALSE);
+ }
+}
+
+static inline size_t
+gcm_simd_get_htab_size(boolean_t simd_mode)
+{
+ switch (simd_mode) {
+ case B_TRUE:
+ return (2 * 6 * 2 * sizeof (uint64_t));
+
+ default:
+ return (0);
+ }
+}
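+
+/*
+ * For reference: 2 * 6 * 2 * sizeof (uint64_t) works out to 192 bytes,
+ * i.e. twelve 16-byte GHASH table entries, presumably sized for the 6x
+ * aggregated routines mentioned above.
+ */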
+
+/*
+ * Clear sensitive data in the context.
+ *
+ * ctx->gcm_remainder may contain a plaintext remainder.  ctx->gcm_H and
+ * ctx->gcm_Htable contain the hash subkey, which protects authentication.
+ *
+ * Although it is extremely unlikely that ctx->gcm_J0 and ctx->gcm_tmp could
+ * be used for a known plaintext attack, they consist of the IV and the first
+ * and last counter block respectively.  Whether they should be cleared is
+ * debatable.
+ */
+static inline void
+gcm_clear_ctx(gcm_ctx_t *ctx)
+{
+ bzero(ctx->gcm_remainder, sizeof (ctx->gcm_remainder));
+ bzero(ctx->gcm_H, sizeof (ctx->gcm_H));
+ bzero(ctx->gcm_J0, sizeof (ctx->gcm_J0));
+ bzero(ctx->gcm_tmp, sizeof (ctx->gcm_tmp));
+}
+
+/* Increment the GCM counter block by n. */
+static inline void
+gcm_incr_counter_block_by(gcm_ctx_t *ctx, int n)
+{
+ uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
+ uint64_t counter = ntohll(ctx->gcm_cb[1] & counter_mask);
+
+ counter = htonll(counter + n);
+ counter &= counter_mask;
+ ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter;
+}
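+
+/*
+ * Worked example: with the counter block stored big-endian and n = 1,
+ *
+ *	... 00 00 00 01  ->  ... 00 00 00 02
+ *	... ff ff ff ff  ->  ... 00 00 00 00
+ *
+ * i.e. only the low-order 32 bits change; the carry out of bit 32 is
+ * discarded by the mask and the upper 96 bits of the block are preserved.
+ */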
+
+/*
+ * Encrypt multiple blocks of data in GCM mode.
+ * This is done in gcm_avx_chunk_size chunks, utilizing AVX assembler routines
+ * if possible. While processing a chunk the FPU is "locked".
+ */
+static int
+gcm_mode_encrypt_contiguous_blocks_avx(gcm_ctx_t *ctx, char *data,
+ size_t length, crypto_data_t *out, size_t block_size)
+{
+ size_t bleft = length;
+ size_t need = 0;
+ size_t done = 0;
+ uint8_t *datap = (uint8_t *)data;
+ size_t chunk_size = (size_t)GCM_CHUNK_SIZE_READ;
+ const aes_key_t *key = ((aes_key_t *)ctx->gcm_keysched);
+ uint64_t *ghash = ctx->gcm_ghash;
+ uint64_t *cb = ctx->gcm_cb;
+ uint8_t *ct_buf = NULL;
+ uint8_t *tmp = (uint8_t *)ctx->gcm_tmp;
+ int rv = CRYPTO_SUCCESS;
+
+ ASSERT(block_size == GCM_BLOCK_LEN);
+ /*
+ * If the last call left an incomplete block, try to fill
+ * it first.
+ */
+ if (ctx->gcm_remainder_len > 0) {
+ need = block_size - ctx->gcm_remainder_len;
+ if (length < need) {
+ /* Accumulate bytes here and return. */
+ bcopy(datap, (uint8_t *)ctx->gcm_remainder +
+ ctx->gcm_remainder_len, length);
+
+ ctx->gcm_remainder_len += length;
+ if (ctx->gcm_copy_to == NULL) {
+ ctx->gcm_copy_to = datap;
+ }
+ return (CRYPTO_SUCCESS);
+ } else {
+ /* Complete incomplete block. */
+ bcopy(datap, (uint8_t *)ctx->gcm_remainder +
+ ctx->gcm_remainder_len, need);
+
+ ctx->gcm_copy_to = NULL;
+ }
+ }
+
+ /* Allocate a buffer to encrypt to if there is enough input. */
+ if (bleft >= GCM_AVX_MIN_ENCRYPT_BYTES) {
+ ct_buf = vmem_alloc(chunk_size, ctx->gcm_kmflag);
+ if (ct_buf == NULL) {
+ return (CRYPTO_HOST_MEMORY);
+ }
+ }
+
+ /* If we completed an incomplete block, encrypt and write it out. */
+ if (ctx->gcm_remainder_len > 0) {
+ kfpu_begin();
+ aes_encrypt_intel(key->encr_ks.ks32, key->nr,
+ (const uint32_t *)cb, (uint32_t *)tmp);
+
+ gcm_xor_avx((const uint8_t *) ctx->gcm_remainder, tmp);
+ GHASH_AVX(ctx, tmp, block_size);
+ clear_fpu_regs();
+ kfpu_end();
+ rv = crypto_put_output_data(tmp, out, block_size);
+ out->cd_offset += block_size;
+ gcm_incr_counter_block(ctx);
+ ctx->gcm_processed_data_len += block_size;
+ bleft -= need;
+ datap += need;
+ ctx->gcm_remainder_len = 0;
+ }
+
+ /* Do the bulk encryption in chunk_size blocks. */
+ for (; bleft >= chunk_size; bleft -= chunk_size) {
+ kfpu_begin();
+ done = aesni_gcm_encrypt(
+ datap, ct_buf, chunk_size, key, cb, ghash);
+
+ clear_fpu_regs();
+ kfpu_end();
+ if (done != chunk_size) {
+ rv = CRYPTO_FAILED;
+ goto out_nofpu;
+ }
+ rv = crypto_put_output_data(ct_buf, out, chunk_size);
+ if (rv != CRYPTO_SUCCESS) {
+ goto out_nofpu;
+ }
+ out->cd_offset += chunk_size;
+ datap += chunk_size;
+ ctx->gcm_processed_data_len += chunk_size;
+ }
+ /* Check if we are already done. */
+ if (bleft == 0) {
+ goto out_nofpu;
+ }
+ /* Bulk encrypt the remaining data. */
+ kfpu_begin();
+ if (bleft >= GCM_AVX_MIN_ENCRYPT_BYTES) {
+ done = aesni_gcm_encrypt(datap, ct_buf, bleft, key, cb, ghash);
+ if (done == 0) {
+ rv = CRYPTO_FAILED;
+ goto out;
+ }
+ rv = crypto_put_output_data(ct_buf, out, done);
+ if (rv != CRYPTO_SUCCESS) {
+ goto out;
+ }
+ out->cd_offset += done;
+ ctx->gcm_processed_data_len += done;
+ datap += done;
+ bleft -= done;
+
+ }
+ /* Less than GCM_AVX_MIN_ENCRYPT_BYTES remain, operate on blocks. */
+ while (bleft > 0) {
+ if (bleft < block_size) {
+ bcopy(datap, ctx->gcm_remainder, bleft);
+ ctx->gcm_remainder_len = bleft;
+ ctx->gcm_copy_to = datap;
+ goto out;
+ }
+ /* Encrypt, hash and write out. */
+ aes_encrypt_intel(key->encr_ks.ks32, key->nr,
+ (const uint32_t *)cb, (uint32_t *)tmp);
+
+ gcm_xor_avx(datap, tmp);
+ GHASH_AVX(ctx, tmp, block_size);
+ rv = crypto_put_output_data(tmp, out, block_size);
+ if (rv != CRYPTO_SUCCESS) {
+ goto out;
+ }
+ out->cd_offset += block_size;
+ gcm_incr_counter_block(ctx);
+ ctx->gcm_processed_data_len += block_size;
+ datap += block_size;
+ bleft -= block_size;
+ }
+out:
+ clear_fpu_regs();
+ kfpu_end();
+out_nofpu:
+ if (ct_buf != NULL) {
+ vmem_free(ct_buf, chunk_size);
+ }
+ return (rv);
+}
+
+/*
+ * Finalize the encryption: zero fill, encrypt, hash and write out any
+ * incomplete last block, encrypt the ICB, then calculate the tag and write
+ * it out.
+ */
+static int
+gcm_encrypt_final_avx(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size)
+{
+ uint8_t *ghash = (uint8_t *)ctx->gcm_ghash;
+ uint32_t *J0 = (uint32_t *)ctx->gcm_J0;
+ uint8_t *remainder = (uint8_t *)ctx->gcm_remainder;
+ size_t rem_len = ctx->gcm_remainder_len;
+ const void *keysched = ((aes_key_t *)ctx->gcm_keysched)->encr_ks.ks32;
+ int aes_rounds = ((aes_key_t *)keysched)->nr;
+ int rv;
+
+ ASSERT(block_size == GCM_BLOCK_LEN);
+
+ if (out->cd_length < (rem_len + ctx->gcm_tag_len)) {
+ return (CRYPTO_DATA_LEN_RANGE);
+ }
+
+ kfpu_begin();
+ /* Pad last incomplete block with zeros, encrypt and hash. */
+ if (rem_len > 0) {
+ uint8_t *tmp = (uint8_t *)ctx->gcm_tmp;
+ const uint32_t *cb = (uint32_t *)ctx->gcm_cb;
+
+ aes_encrypt_intel(keysched, aes_rounds, cb, (uint32_t *)tmp);
+ bzero(remainder + rem_len, block_size - rem_len);
+ for (int i = 0; i < rem_len; i++) {
+ remainder[i] ^= tmp[i];
+ }
+ GHASH_AVX(ctx, remainder, block_size);
+ ctx->gcm_processed_data_len += rem_len;
+ /* No need to increment counter_block, it's the last block. */
+ }
+ /* Finish tag. */
+ ctx->gcm_len_a_len_c[1] =
+ htonll(CRYPTO_BYTES2BITS(ctx->gcm_processed_data_len));
+ GHASH_AVX(ctx, (const uint8_t *)ctx->gcm_len_a_len_c, block_size);
+ aes_encrypt_intel(keysched, aes_rounds, J0, J0);
+
+ gcm_xor_avx((uint8_t *)J0, ghash);
+ clear_fpu_regs();
+ kfpu_end();
+
+ /* Output remainder. */
+ if (rem_len > 0) {
+ rv = crypto_put_output_data(remainder, out, rem_len);
+ if (rv != CRYPTO_SUCCESS)
+ return (rv);
+ }
+ out->cd_offset += rem_len;
+ ctx->gcm_remainder_len = 0;
+ rv = crypto_put_output_data(ghash, out, ctx->gcm_tag_len);
+ if (rv != CRYPTO_SUCCESS)
+ return (rv);
+
+ out->cd_offset += ctx->gcm_tag_len;
+ /* Clear sensitive data in the context before returning. */
+ gcm_clear_ctx(ctx);
+ return (CRYPTO_SUCCESS);
+}
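+
+/*
+ * For reference, the tag written out above is, in the notation of NIST
+ * SP 800-38D,
+ *
+ *	S = GHASH_H(A || 0^v || C || 0^u || [len(A)]64 || [len(C)]64)
+ *	T = MSB_t(E_K(J0) xor S)
+ *
+ * where gcm_ghash accumulates S, gcm_J0 holds the pre-counter block and
+ * t is the requested tag length.
+ */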
+
+/*
+ * Finalize decryption: so far we have only accumulated the ciphertext, so
+ * now we decrypt it here in place.
+ */
+static int
+gcm_decrypt_final_avx(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size)
+{
+ ASSERT3U(ctx->gcm_processed_data_len, ==, ctx->gcm_pt_buf_len);
+ ASSERT3U(block_size, ==, 16);
+
+ size_t chunk_size = (size_t)GCM_CHUNK_SIZE_READ;
+ size_t pt_len = ctx->gcm_processed_data_len - ctx->gcm_tag_len;
+ uint8_t *datap = ctx->gcm_pt_buf;
+ const aes_key_t *key = ((aes_key_t *)ctx->gcm_keysched);
+ uint32_t *cb = (uint32_t *)ctx->gcm_cb;
+ uint64_t *ghash = ctx->gcm_ghash;
+ uint32_t *tmp = (uint32_t *)ctx->gcm_tmp;
+ int rv = CRYPTO_SUCCESS;
+ size_t bleft, done;
+
+ /*
+ * Decrypt in chunks of gcm_avx_chunk_size, which is asserted to be
+	 * greater than or equal to GCM_AVX_MIN_ENCRYPT_BYTES, and a multiple of
+ * GCM_AVX_MIN_DECRYPT_BYTES.
+ */
+ for (bleft = pt_len; bleft >= chunk_size; bleft -= chunk_size) {
+ kfpu_begin();
+ done = aesni_gcm_decrypt(datap, datap, chunk_size,
+ (const void *)key, ctx->gcm_cb, ghash);
+ clear_fpu_regs();
+ kfpu_end();
+ if (done != chunk_size) {
+ return (CRYPTO_FAILED);
+ }
+ datap += done;
+ }
+	/* Decrypt the remainder, less than the chunk size, in one go. */
+ kfpu_begin();
+ if (bleft >= GCM_AVX_MIN_DECRYPT_BYTES) {
+ done = aesni_gcm_decrypt(datap, datap, bleft,
+ (const void *)key, ctx->gcm_cb, ghash);
+ if (done == 0) {
+ clear_fpu_regs();
+ kfpu_end();
+ return (CRYPTO_FAILED);
+ }
+ datap += done;
+ bleft -= done;
+ }
+ ASSERT(bleft < GCM_AVX_MIN_DECRYPT_BYTES);
+
+ /*
+	 * Now fewer than GCM_AVX_MIN_DECRYPT_BYTES bytes remain;
+ * decrypt them block by block.
+ */
+ while (bleft > 0) {
+ /* Incomplete last block. */
+ if (bleft < block_size) {
+ uint8_t *lastb = (uint8_t *)ctx->gcm_remainder;
+
+ bzero(lastb, block_size);
+ bcopy(datap, lastb, bleft);
+ /* The GCM processing. */
+ GHASH_AVX(ctx, lastb, block_size);
+ aes_encrypt_intel(key->encr_ks.ks32, key->nr, cb, tmp);
+ for (size_t i = 0; i < bleft; i++) {
+ datap[i] = lastb[i] ^ ((uint8_t *)tmp)[i];
+ }
+ break;
+ }
+ /* The GCM processing. */
+ GHASH_AVX(ctx, datap, block_size);
+ aes_encrypt_intel(key->encr_ks.ks32, key->nr, cb, tmp);
+ gcm_xor_avx((uint8_t *)tmp, datap);
+ gcm_incr_counter_block(ctx);
+
+ datap += block_size;
+ bleft -= block_size;
+ }
+ if (rv != CRYPTO_SUCCESS) {
+ clear_fpu_regs();
+ kfpu_end();
+ return (rv);
+ }
+ /* Decryption done, finish the tag. */
+ ctx->gcm_len_a_len_c[1] = htonll(CRYPTO_BYTES2BITS(pt_len));
+ GHASH_AVX(ctx, (uint8_t *)ctx->gcm_len_a_len_c, block_size);
+ aes_encrypt_intel(key->encr_ks.ks32, key->nr, (uint32_t *)ctx->gcm_J0,
+ (uint32_t *)ctx->gcm_J0);
+
+ gcm_xor_avx((uint8_t *)ctx->gcm_J0, (uint8_t *)ghash);
+
+ /* We are done with the FPU, restore its state. */
+ clear_fpu_regs();
+ kfpu_end();
+
+ /* Compare the input authentication tag with what we calculated. */
+ if (bcmp(&ctx->gcm_pt_buf[pt_len], ghash, ctx->gcm_tag_len)) {
+ /* They don't match. */
+ return (CRYPTO_INVALID_MAC);
+ }
+ rv = crypto_put_output_data(ctx->gcm_pt_buf, out, pt_len);
+ if (rv != CRYPTO_SUCCESS) {
+ return (rv);
+ }
+ out->cd_offset += pt_len;
+ gcm_clear_ctx(ctx);
+ return (CRYPTO_SUCCESS);
+}
+
+/*
+ * Initialize the GCM params H, Htable and the counter block.  Save the
+ * initial counter block.
+ */
+static int
+gcm_init_avx(gcm_ctx_t *ctx, unsigned char *iv, size_t iv_len,
+ unsigned char *auth_data, size_t auth_data_len, size_t block_size)
+{
+ uint8_t *cb = (uint8_t *)ctx->gcm_cb;
+ uint64_t *H = ctx->gcm_H;
+ const void *keysched = ((aes_key_t *)ctx->gcm_keysched)->encr_ks.ks32;
+ int aes_rounds = ((aes_key_t *)ctx->gcm_keysched)->nr;
+ uint8_t *datap = auth_data;
+ size_t chunk_size = (size_t)GCM_CHUNK_SIZE_READ;
+ size_t bleft;
+
+ ASSERT(block_size == GCM_BLOCK_LEN);
+
+ /* Init H (encrypt zero block) and create the initial counter block. */
+ bzero(ctx->gcm_ghash, sizeof (ctx->gcm_ghash));
+ bzero(H, sizeof (ctx->gcm_H));
+ kfpu_begin();
+ aes_encrypt_intel(keysched, aes_rounds,
+ (const uint32_t *)H, (uint32_t *)H);
+
+ gcm_init_htab_avx(ctx->gcm_Htable, H);
+
+ if (iv_len == 12) {
+ bcopy(iv, cb, 12);
+ cb[12] = 0;
+ cb[13] = 0;
+ cb[14] = 0;
+ cb[15] = 1;
+ /* We need the ICB later. */
+ bcopy(cb, ctx->gcm_J0, sizeof (ctx->gcm_J0));
+ } else {
+ /*
+		 * Most consumers use 12-byte IVs, so it's OK to use the
+		 * original routines for other IV sizes; just avoid nesting
+		 * kfpu_begin() calls.
+ */
+ clear_fpu_regs();
+ kfpu_end();
+ gcm_format_initial_blocks(iv, iv_len, ctx, block_size,
+ aes_copy_block, aes_xor_block);
+ kfpu_begin();
+ }
+
+	/* OpenSSL post-increments the counter; adjust for that. */
+ gcm_incr_counter_block(ctx);
+
+ /* Ghash AAD in chunk_size blocks. */
+ for (bleft = auth_data_len; bleft >= chunk_size; bleft -= chunk_size) {
+ GHASH_AVX(ctx, datap, chunk_size);
+ datap += chunk_size;
+ clear_fpu_regs();
+ kfpu_end();
+ kfpu_begin();
+ }
+ /* Ghash the remainder and handle possible incomplete GCM block. */
+ if (bleft > 0) {
+ size_t incomp = bleft % block_size;
+
+ bleft -= incomp;
+ if (bleft > 0) {
+ GHASH_AVX(ctx, datap, bleft);
+ datap += bleft;
+ }
+ if (incomp > 0) {
+ /* Zero pad and hash incomplete last block. */
+ uint8_t *authp = (uint8_t *)ctx->gcm_tmp;
+
+ bzero(authp, block_size);
+ bcopy(datap, authp, incomp);
+ GHASH_AVX(ctx, authp, block_size);
+ }
+ }
+ clear_fpu_regs();
+ kfpu_end();
+ return (CRYPTO_SUCCESS);
+}
+
+#if defined(_KERNEL)
+static int
+icp_gcm_avx_set_chunk_size(const char *buf, zfs_kernel_param_t *kp)
+{
+ unsigned long val;
+ char val_rounded[16];
+ int error = 0;
+
+ error = kstrtoul(buf, 0, &val);
+ if (error)
+ return (error);
+
+ val = (val / GCM_AVX_MIN_DECRYPT_BYTES) * GCM_AVX_MIN_DECRYPT_BYTES;
+
+ if (val < GCM_AVX_MIN_ENCRYPT_BYTES || val > GCM_AVX_MAX_CHUNK_SIZE)
+ return (-EINVAL);
+
+ snprintf(val_rounded, 16, "%u", (uint32_t)val);
+ error = param_set_uint(val_rounded, kp);
+ return (error);
+}
+
+module_param_call(icp_gcm_avx_chunk_size, icp_gcm_avx_set_chunk_size,
+ param_get_uint, &gcm_avx_chunk_size, 0644);
+
+MODULE_PARM_DESC(icp_gcm_avx_chunk_size,
+ "How many bytes to process while owning the FPU");
+
+#endif /* defined(_KERNEL) */
+#endif /* ifdef CAN_USE_GCM_ASM */
diff --git a/sys/contrib/openzfs/module/icp/algs/modes/gcm_generic.c b/sys/contrib/openzfs/module/icp/algs/modes/gcm_generic.c
new file mode 100644
index 000000000000..16b57998a92f
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/algs/modes/gcm_generic.c
@@ -0,0 +1,83 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#include <modes/gcm_impl.h>
+
+struct aes_block {
+ uint64_t a;
+ uint64_t b;
+};
+
+/*
+ * Perform a carry-less multiplication (that is, use XOR instead of the
+ * multiply operator) on *x_in and *y and place the result in *res.
+ *
+ * Byte swap the input (*x_in and *y) and the output (*res).
+ *
+ * Note: x_in, y, and res all point to 16-byte numbers (an array of two
+ * 64-bit integers).
+ */
+static void
+gcm_generic_mul(uint64_t *x_in, uint64_t *y, uint64_t *res)
+{
+ static const uint64_t R = 0xe100000000000000ULL;
+ struct aes_block z = {0, 0};
+ struct aes_block v;
+ uint64_t x;
+ int i, j;
+
+ v.a = ntohll(y[0]);
+ v.b = ntohll(y[1]);
+
+ for (j = 0; j < 2; j++) {
+ x = ntohll(x_in[j]);
+ for (i = 0; i < 64; i++, x <<= 1) {
+ if (x & 0x8000000000000000ULL) {
+ z.a ^= v.a;
+ z.b ^= v.b;
+ }
+ if (v.b & 1ULL) {
+ v.b = (v.a << 63)|(v.b >> 1);
+ v.a = (v.a >> 1) ^ R;
+ } else {
+ v.b = (v.a << 63)|(v.b >> 1);
+ v.a = v.a >> 1;
+ }
+ }
+ }
+ res[0] = htonll(z.a);
+ res[1] = htonll(z.b);
+}
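+
+/*
+ * Note: the constant R above encodes the GCM reduction polynomial
+ * x^128 + x^7 + x^2 + x + 1 in the reflected bit order GHASH uses, so each
+ * loop iteration is one shift-and-conditionally-XOR step of a bitwise
+ * multiplication in GF(2^128) followed by reduction.
+ */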
+
+static boolean_t
+gcm_generic_will_work(void)
+{
+ return (B_TRUE);
+}
+
+const gcm_impl_ops_t gcm_generic_impl = {
+ .mul = &gcm_generic_mul,
+ .is_supported = &gcm_generic_will_work,
+ .name = "generic"
+};
diff --git a/sys/contrib/openzfs/module/icp/algs/modes/gcm_pclmulqdq.c b/sys/contrib/openzfs/module/icp/algs/modes/gcm_pclmulqdq.c
new file mode 100644
index 000000000000..05920115ce86
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/algs/modes/gcm_pclmulqdq.c
@@ -0,0 +1,64 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#if defined(__x86_64) && defined(HAVE_PCLMULQDQ)
+
+#include <sys/types.h>
+#include <sys/simd.h>
+
+/* These functions are used to execute pclmulqdq based assembly methods */
+extern void gcm_mul_pclmulqdq(uint64_t *, uint64_t *, uint64_t *);
+
+#include <modes/gcm_impl.h>
+
+/*
+ * Perform a carry-less multiplication (that is, use XOR instead of the
+ * multiply operator) on *x_in and *y and place the result in *res.
+ *
+ * Byte swap the input (*x_in and *y) and the output (*res).
+ *
+ * Note: x_in, y, and res all point to 16-byte numbers (an array of two
+ * 64-bit integers).
+ */
+static void
+gcm_pclmulqdq_mul(uint64_t *x_in, uint64_t *y, uint64_t *res)
+{
+ kfpu_begin();
+ gcm_mul_pclmulqdq(x_in, y, res);
+ kfpu_end();
+}
+
+static boolean_t
+gcm_pclmulqdq_will_work(void)
+{
+ return (kfpu_allowed() && zfs_pclmulqdq_available());
+}
+
+const gcm_impl_ops_t gcm_pclmulqdq_impl = {
+ .mul = &gcm_pclmulqdq_mul,
+ .is_supported = &gcm_pclmulqdq_will_work,
+ .name = "pclmulqdq"
+};
+
+#endif /* defined(__x86_64) && defined(HAVE_PCLMULQDQ) */
diff --git a/sys/contrib/openzfs/module/icp/algs/modes/modes.c b/sys/contrib/openzfs/module/icp/algs/modes/modes.c
new file mode 100644
index 000000000000..59743c7d6829
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/algs/modes/modes.c
@@ -0,0 +1,165 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/zfs_context.h>
+#include <modes/modes.h>
+#include <sys/crypto/common.h>
+#include <sys/crypto/impl.h>
+
+/*
+ * Initialize by setting iov_or_mp to point to the current iovec or mp,
+ * and by setting current_offset to an offset within the current iovec or mp.
+ */
+void
+crypto_init_ptrs(crypto_data_t *out, void **iov_or_mp, offset_t *current_offset)
+{
+ offset_t offset;
+
+ switch (out->cd_format) {
+ case CRYPTO_DATA_RAW:
+ *current_offset = out->cd_offset;
+ break;
+
+ case CRYPTO_DATA_UIO: {
+ zfs_uio_t *uiop = out->cd_uio;
+ uint_t vec_idx;
+
+ offset = out->cd_offset;
+ offset = zfs_uio_index_at_offset(uiop, offset, &vec_idx);
+
+ *current_offset = offset;
+ *iov_or_mp = (void *)(uintptr_t)vec_idx;
+ break;
+ }
+ } /* end switch */
+}
+
+/*
+ * Get pointers for where in the output to copy a block of encrypted or
+ * decrypted data. The iov_or_mp argument stores a pointer to the current
+ * iovec or mp, and offset stores an offset into the current iovec or mp.
+ */
+void
+crypto_get_ptrs(crypto_data_t *out, void **iov_or_mp, offset_t *current_offset,
+ uint8_t **out_data_1, size_t *out_data_1_len, uint8_t **out_data_2,
+ size_t amt)
+{
+ offset_t offset;
+
+ switch (out->cd_format) {
+ case CRYPTO_DATA_RAW: {
+ iovec_t *iov;
+
+ offset = *current_offset;
+ iov = &out->cd_raw;
+ if ((offset + amt) <= iov->iov_len) {
+ /* one block fits */
+ *out_data_1 = (uint8_t *)iov->iov_base + offset;
+ *out_data_1_len = amt;
+ *out_data_2 = NULL;
+ *current_offset = offset + amt;
+ }
+ break;
+ }
+
+ case CRYPTO_DATA_UIO: {
+ zfs_uio_t *uio = out->cd_uio;
+ offset_t offset;
+ uint_t vec_idx;
+ uint8_t *p;
+ uint64_t iov_len;
+ void *iov_base;
+
+ offset = *current_offset;
+ vec_idx = (uintptr_t)(*iov_or_mp);
+ zfs_uio_iov_at_index(uio, vec_idx, &iov_base, &iov_len);
+ p = (uint8_t *)iov_base + offset;
+ *out_data_1 = p;
+
+ if (offset + amt <= iov_len) {
+ /* can fit one block into this iov */
+ *out_data_1_len = amt;
+ *out_data_2 = NULL;
+ *current_offset = offset + amt;
+ } else {
+ /* one block spans two iovecs */
+ *out_data_1_len = iov_len - offset;
+ if (vec_idx == zfs_uio_iovcnt(uio))
+ return;
+ vec_idx++;
+ zfs_uio_iov_at_index(uio, vec_idx, &iov_base, &iov_len);
+ *out_data_2 = (uint8_t *)iov_base;
+ *current_offset = amt - *out_data_1_len;
+ }
+ *iov_or_mp = (void *)(uintptr_t)vec_idx;
+ break;
+ }
+ } /* end switch */
+}
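+
+/*
+ * Worked example: with amt = 16 and only 10 bytes left in the current
+ * iovec, *out_data_1 gets those 10 bytes, *out_data_2 points at the start
+ * of the next iovec for the remaining 6, and *current_offset becomes 6 so
+ * the next call continues from there.
+ */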
+
+void
+crypto_free_mode_ctx(void *ctx)
+{
+ common_ctx_t *common_ctx = (common_ctx_t *)ctx;
+
+ switch (common_ctx->cc_flags &
+ (ECB_MODE|CBC_MODE|CTR_MODE|CCM_MODE|GCM_MODE|GMAC_MODE)) {
+ case ECB_MODE:
+ kmem_free(common_ctx, sizeof (ecb_ctx_t));
+ break;
+
+ case CBC_MODE:
+ kmem_free(common_ctx, sizeof (cbc_ctx_t));
+ break;
+
+ case CTR_MODE:
+ kmem_free(common_ctx, sizeof (ctr_ctx_t));
+ break;
+
+ case CCM_MODE:
+ if (((ccm_ctx_t *)ctx)->ccm_pt_buf != NULL)
+ vmem_free(((ccm_ctx_t *)ctx)->ccm_pt_buf,
+ ((ccm_ctx_t *)ctx)->ccm_data_len);
+
+ kmem_free(ctx, sizeof (ccm_ctx_t));
+ break;
+
+ case GCM_MODE:
+ case GMAC_MODE:
+ if (((gcm_ctx_t *)ctx)->gcm_pt_buf != NULL)
+ vmem_free(((gcm_ctx_t *)ctx)->gcm_pt_buf,
+ ((gcm_ctx_t *)ctx)->gcm_pt_buf_len);
+
+#ifdef CAN_USE_GCM_ASM
+ if (((gcm_ctx_t *)ctx)->gcm_Htable != NULL) {
+ gcm_ctx_t *gcm_ctx = (gcm_ctx_t *)ctx;
+ bzero(gcm_ctx->gcm_Htable, gcm_ctx->gcm_htab_len);
+ kmem_free(gcm_ctx->gcm_Htable, gcm_ctx->gcm_htab_len);
+ }
+#endif
+
+ kmem_free(ctx, sizeof (gcm_ctx_t));
+ }
+}
diff --git a/sys/contrib/openzfs/module/icp/algs/sha1/sha1.c b/sys/contrib/openzfs/module/icp/algs/sha1/sha1.c
new file mode 100644
index 000000000000..da34222c8fc3
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/algs/sha1/sha1.c
@@ -0,0 +1,835 @@
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * The basic framework for this code came from the reference
+ * implementation for MD5. That implementation is Copyright (C)
+ * 1991-2, RSA Data Security, Inc. Created 1991. All rights reserved.
+ *
+ * License to copy and use this software is granted provided that it
+ * is identified as the "RSA Data Security, Inc. MD5 Message-Digest
+ * Algorithm" in all material mentioning or referencing this software
+ * or this function.
+ *
+ * License is also granted to make and use derivative works provided
+ * that such works are identified as "derived from the RSA Data
+ * Security, Inc. MD5 Message-Digest Algorithm" in all material
+ * mentioning or referencing the derived work.
+ *
+ * RSA Data Security, Inc. makes no representations concerning either
+ * the merchantability of this software or the suitability of this
+ * software for any particular purpose. It is provided "as is"
+ * without express or implied warranty of any kind.
+ *
+ * These notices must be retained in any copies of any part of this
+ * documentation and/or software.
+ *
+ * NOTE: Cleaned-up and optimized version of SHA1, based on the FIPS 180-1
+ * standard, available at http://www.itl.nist.gov/fipspubs/fip180-1.htm
+ * Not as fast as one would like -- further optimizations are encouraged
+ * and appreciated.
+ */
+
+#include <sys/zfs_context.h>
+#include <sha1/sha1.h>
+#include <sha1/sha1_consts.h>
+
+#ifdef _LITTLE_ENDIAN
+#include <sys/byteorder.h>
+#define HAVE_HTONL
+#endif
+
+#define _RESTRICT_KYWD
+
+static void Encode(uint8_t *, const uint32_t *, size_t);
+
+#if defined(__sparc)
+
+#define SHA1_TRANSFORM(ctx, in) \
+ SHA1Transform((ctx)->state[0], (ctx)->state[1], (ctx)->state[2], \
+ (ctx)->state[3], (ctx)->state[4], (ctx), (in))
+
+static void SHA1Transform(uint32_t, uint32_t, uint32_t, uint32_t, uint32_t,
+ SHA1_CTX *, const uint8_t *);
+
+#elif defined(__amd64)
+
+#define SHA1_TRANSFORM(ctx, in) sha1_block_data_order((ctx), (in), 1)
+#define SHA1_TRANSFORM_BLOCKS(ctx, in, num) sha1_block_data_order((ctx), \
+ (in), (num))
+
+void sha1_block_data_order(SHA1_CTX *ctx, const void *inpp, size_t num_blocks);
+
+#else
+
+#define SHA1_TRANSFORM(ctx, in) SHA1Transform((ctx), (in))
+
+static void SHA1Transform(SHA1_CTX *, const uint8_t *);
+
+#endif
+
+
+static uint8_t PADDING[64] = { 0x80, /* all zeros */ };
+
+/*
+ * F, G, and H are the basic SHA1 functions.
+ */
+#define F(b, c, d) (((b) & (c)) | ((~b) & (d)))
+#define G(b, c, d) ((b) ^ (c) ^ (d))
+#define H(b, c, d) (((b) & (c)) | (((b)|(c)) & (d)))
+
+/*
+ * SHA1Init()
+ *
+ * purpose: initializes the sha1 context and begins an sha1 digest operation
+ * input: SHA1_CTX *	: the context to initialize.
+ * output: void
+ */
+
+void
+SHA1Init(SHA1_CTX *ctx)
+{
+ ctx->count[0] = ctx->count[1] = 0;
+
+ /*
+ * load magic initialization constants. Tell lint
+ * that these constants are unsigned by using U.
+ */
+
+ ctx->state[0] = 0x67452301U;
+ ctx->state[1] = 0xefcdab89U;
+ ctx->state[2] = 0x98badcfeU;
+ ctx->state[3] = 0x10325476U;
+ ctx->state[4] = 0xc3d2e1f0U;
+}
+
+void
+SHA1Update(SHA1_CTX *ctx, const void *inptr, size_t input_len)
+{
+ uint32_t i, buf_index, buf_len;
+ const uint8_t *input = inptr;
+#if defined(__amd64)
+ uint32_t block_count;
+#endif /* __amd64 */
+
+ /* check for noop */
+ if (input_len == 0)
+ return;
+
+ /* compute number of bytes mod 64 */
+ buf_index = (ctx->count[1] >> 3) & 0x3F;
+
+ /* update number of bits */
+ if ((ctx->count[1] += (input_len << 3)) < (input_len << 3))
+ ctx->count[0]++;
+
+ ctx->count[0] += (input_len >> 29);
+
+ buf_len = 64 - buf_index;
+
+ /* transform as many times as possible */
+ i = 0;
+ if (input_len >= buf_len) {
+
+ /*
+ * general optimization:
+ *
+ * only do initial bcopy() and SHA1Transform() if
+ * buf_index != 0. if buf_index == 0, we're just
+ * wasting our time doing the bcopy() since there
+ * wasn't any data left over from a previous call to
+ * SHA1Update().
+ */
+
+ if (buf_index) {
+ bcopy(input, &ctx->buf_un.buf8[buf_index], buf_len);
+ SHA1_TRANSFORM(ctx, ctx->buf_un.buf8);
+ i = buf_len;
+ }
+
+#if !defined(__amd64)
+ for (; i + 63 < input_len; i += 64)
+ SHA1_TRANSFORM(ctx, &input[i]);
+#else
+ block_count = (input_len - i) >> 6;
+ if (block_count > 0) {
+ SHA1_TRANSFORM_BLOCKS(ctx, &input[i], block_count);
+ i += block_count << 6;
+ }
+#endif /* !__amd64 */
+
+ /*
+ * general optimization:
+ *
+ * if i and input_len are the same, return now instead
+ * of calling bcopy(), since the bcopy() in this case
+ * will be an expensive nop.
+ */
+
+ if (input_len == i)
+ return;
+
+ buf_index = 0;
+ }
+
+ /* buffer remaining input */
+ bcopy(&input[i], &ctx->buf_un.buf8[buf_index], input_len - i);
+}
+
+/*
+ * SHA1Final()
+ *
+ * purpose: ends an sha1 digest operation, finalizing the message digest and
+ * zeroing the context.
+ * input: uchar_t * : A buffer to store the digest.
+ * : The function actually uses void* because many
+ * : callers pass things other than uchar_t here.
+ * SHA1_CTX * : the context to finalize, save, and zero
+ * output: void
+ */
+
+void
+SHA1Final(void *digest, SHA1_CTX *ctx)
+{
+ uint8_t bitcount_be[sizeof (ctx->count)];
+ uint32_t index = (ctx->count[1] >> 3) & 0x3f;
+
+ /* store bit count, big endian */
+ Encode(bitcount_be, ctx->count, sizeof (bitcount_be));
+
+ /* pad out to 56 mod 64 */
+ SHA1Update(ctx, PADDING, ((index < 56) ? 56 : 120) - index);
+
+ /* append length (before padding) */
+ SHA1Update(ctx, bitcount_be, sizeof (bitcount_be));
+
+ /* store state in digest */
+ Encode(digest, ctx->state, sizeof (ctx->state));
+
+ /* zeroize sensitive information */
+ bzero(ctx, sizeof (*ctx));
+}
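+
+/*
+ * Worked example of the padding above: the 0x80 byte, zero fill and 8-byte
+ * bit count always round the message up to a multiple of 64 bytes.  With 0
+ * buffered bytes, 56 - 0 = 56 pad bytes plus the length fill exactly one
+ * block; with 60 buffered bytes, 120 - 60 = 60 pad bytes push the length
+ * into a second block.
+ */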
+
+
+#if !defined(__amd64)
+
+typedef uint32_t sha1word;
+
+/*
+ * sparc optimization:
+ *
+ * on the sparc, we can load big endian 32-bit data easily. note that
+ * special care must be taken to ensure the address is 32-bit aligned.
+ * in the interest of speed, we don't check to make sure, since
+ * careful programming can guarantee this for us.
+ */
+
+#if defined(_ZFS_BIG_ENDIAN)
+#define LOAD_BIG_32(addr) (*(uint32_t *)(addr))
+
+#elif defined(HAVE_HTONL)
+#define LOAD_BIG_32(addr) htonl(*((uint32_t *)(addr)))
+
+#else
+#define LOAD_BIG_32(addr) BE_32(*((uint32_t *)(addr)))
+#endif /* _BIG_ENDIAN */
+
+/*
+ * SHA1Transform()
+ */
+#if defined(W_ARRAY)
+#define W(n) w[n]
+#else /* !defined(W_ARRAY) */
+#define W(n) w_ ## n
+#endif /* !defined(W_ARRAY) */
+
+/*
+ * ROTATE_LEFT rotates x left n bits.
+ */
+
+#if defined(__GNUC__) && defined(_LP64)
+static __inline__ uint64_t
+ROTATE_LEFT(uint64_t value, uint32_t n)
+{
+ uint32_t t32;
+
+ t32 = (uint32_t)value;
+ return ((t32 << n) | (t32 >> (32 - n)));
+}
+
+#else
+
+#define ROTATE_LEFT(x, n) \
+ (((x) << (n)) | ((x) >> ((sizeof (x) * NBBY)-(n))))
+
+#endif
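+
+/*
+ * Worked example: ROTATE_LEFT(0x80000001, 1) == 0x00000003 -- the high bit
+ * shifted out on the left reappears as the low bit, giving the 32-bit
+ * circular rotation SHA-1 requires.
+ */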
+
+#if defined(__sparc)
+
+
+/*
+ * sparc register window optimization:
+ *
+ * `a', `b', `c', `d', and `e' are passed into SHA1Transform
+ * explicitly since it increases the number of registers available to
+ * the compiler. under this scheme, these variables can be held in
+ * %i0 - %i4, which leaves more local and out registers available.
+ *
+ * purpose: sha1 transformation -- updates the digest based on `block'
+ * input: uint32_t : bytes 1 - 4 of the digest
+ * uint32_t : bytes 5 - 8 of the digest
+ * uint32_t : bytes 9 - 12 of the digest
+ *		uint32_t : bytes 13 - 16 of the digest
+ *		uint32_t : bytes 17 - 20 of the digest
+ * SHA1_CTX * : the context to update
+ * uint8_t [64]: the block to use to update the digest
+ * output: void
+ */
+
+
+void
+SHA1Transform(uint32_t a, uint32_t b, uint32_t c, uint32_t d, uint32_t e,
+ SHA1_CTX *ctx, const uint8_t blk[64])
+{
+ /*
+ * sparc optimization:
+ *
+ * while it is somewhat counter-intuitive, on sparc, it is
+ * more efficient to place all the constants used in this
+ * function in an array and load the values out of the array
+ * than to manually load the constants. this is because
+ * setting a register to a 32-bit value takes two ops in most
+ * cases: a `sethi' and an `or', but loading a 32-bit value
+ * from memory only takes one `ld' (or `lduw' on v9). while
+ * this increases memory usage, the compiler can find enough
+	 * other things to do while waiting, so that the pipeline does
+ * not stall. additionally, it is likely that many of these
+ * constants are cached so that later accesses do not even go
+ * out to the bus.
+ *
+ * this array is declared `static' to keep the compiler from
+ * having to bcopy() this array onto the stack frame of
+ * SHA1Transform() each time it is called -- which is
+ * unacceptably expensive.
+ *
+ * the `const' is to ensure that callers are good citizens and
+ * do not try to munge the array. since these routines are
+ * going to be called from inside multithreaded kernelland,
+ * this is a good safety check. -- `sha1_consts' will end up in
+ * .rodata.
+ *
+ * unfortunately, loading from an array in this manner hurts
+ * performance under Intel. So, there is a macro,
+ * SHA1_CONST(), used in SHA1Transform(), that either expands to
+ * a reference to this array, or to the actual constant,
+ * depending on what platform this code is compiled for.
+ */
+
+
+ static const uint32_t sha1_consts[] = {
+ SHA1_CONST_0, SHA1_CONST_1, SHA1_CONST_2, SHA1_CONST_3
+ };
+
+
+ /*
+ * general optimization:
+ *
+ * use individual integers instead of using an array. this is a
+ * win, although the amount it wins by seems to vary quite a bit.
+ */
+
+
+ uint32_t w_0, w_1, w_2, w_3, w_4, w_5, w_6, w_7;
+ uint32_t w_8, w_9, w_10, w_11, w_12, w_13, w_14, w_15;
+
+
+ /*
+ * sparc optimization:
+ *
+ * if `block' is already aligned on a 4-byte boundary, use
+ * LOAD_BIG_32() directly. otherwise, bcopy() into a
+ * buffer that *is* aligned on a 4-byte boundary and then do
+ * the LOAD_BIG_32() on that buffer. benchmarks have shown
+ * that using the bcopy() is better than loading the bytes
+ * individually and doing the endian-swap by hand.
+ *
+	 * even though it's quite tempting to just do:
+ *
+ * blk = bcopy(ctx->buf_un.buf32, blk, sizeof (ctx->buf_un.buf32));
+ *
+ * and only have one set of LOAD_BIG_32()'s, the compiler
+ * *does not* like that, so please resist the urge.
+ */
+
+
+ if ((uintptr_t)blk & 0x3) { /* not 4-byte aligned? */
+ bcopy(blk, ctx->buf_un.buf32, sizeof (ctx->buf_un.buf32));
+ w_15 = LOAD_BIG_32(ctx->buf_un.buf32 + 15);
+ w_14 = LOAD_BIG_32(ctx->buf_un.buf32 + 14);
+ w_13 = LOAD_BIG_32(ctx->buf_un.buf32 + 13);
+ w_12 = LOAD_BIG_32(ctx->buf_un.buf32 + 12);
+ w_11 = LOAD_BIG_32(ctx->buf_un.buf32 + 11);
+ w_10 = LOAD_BIG_32(ctx->buf_un.buf32 + 10);
+ w_9 = LOAD_BIG_32(ctx->buf_un.buf32 + 9);
+ w_8 = LOAD_BIG_32(ctx->buf_un.buf32 + 8);
+ w_7 = LOAD_BIG_32(ctx->buf_un.buf32 + 7);
+ w_6 = LOAD_BIG_32(ctx->buf_un.buf32 + 6);
+ w_5 = LOAD_BIG_32(ctx->buf_un.buf32 + 5);
+ w_4 = LOAD_BIG_32(ctx->buf_un.buf32 + 4);
+ w_3 = LOAD_BIG_32(ctx->buf_un.buf32 + 3);
+ w_2 = LOAD_BIG_32(ctx->buf_un.buf32 + 2);
+ w_1 = LOAD_BIG_32(ctx->buf_un.buf32 + 1);
+ w_0 = LOAD_BIG_32(ctx->buf_un.buf32 + 0);
+ } else {
+ /* LINTED E_BAD_PTR_CAST_ALIGN */
+ w_15 = LOAD_BIG_32(blk + 60);
+ /* LINTED E_BAD_PTR_CAST_ALIGN */
+ w_14 = LOAD_BIG_32(blk + 56);
+ /* LINTED E_BAD_PTR_CAST_ALIGN */
+ w_13 = LOAD_BIG_32(blk + 52);
+ /* LINTED E_BAD_PTR_CAST_ALIGN */
+ w_12 = LOAD_BIG_32(blk + 48);
+ /* LINTED E_BAD_PTR_CAST_ALIGN */
+ w_11 = LOAD_BIG_32(blk + 44);
+ /* LINTED E_BAD_PTR_CAST_ALIGN */
+ w_10 = LOAD_BIG_32(blk + 40);
+ /* LINTED E_BAD_PTR_CAST_ALIGN */
+ w_9 = LOAD_BIG_32(blk + 36);
+ /* LINTED E_BAD_PTR_CAST_ALIGN */
+ w_8 = LOAD_BIG_32(blk + 32);
+ /* LINTED E_BAD_PTR_CAST_ALIGN */
+ w_7 = LOAD_BIG_32(blk + 28);
+ /* LINTED E_BAD_PTR_CAST_ALIGN */
+ w_6 = LOAD_BIG_32(blk + 24);
+ /* LINTED E_BAD_PTR_CAST_ALIGN */
+ w_5 = LOAD_BIG_32(blk + 20);
+ /* LINTED E_BAD_PTR_CAST_ALIGN */
+ w_4 = LOAD_BIG_32(blk + 16);
+ /* LINTED E_BAD_PTR_CAST_ALIGN */
+ w_3 = LOAD_BIG_32(blk + 12);
+ /* LINTED E_BAD_PTR_CAST_ALIGN */
+ w_2 = LOAD_BIG_32(blk + 8);
+ /* LINTED E_BAD_PTR_CAST_ALIGN */
+ w_1 = LOAD_BIG_32(blk + 4);
+ /* LINTED E_BAD_PTR_CAST_ALIGN */
+ w_0 = LOAD_BIG_32(blk + 0);
+ }
+#else /* !defined(__sparc) */
+
+void /* CSTYLED */
+SHA1Transform(SHA1_CTX *ctx, const uint8_t blk[64])
+{
+ /* CSTYLED */
+ sha1word a = ctx->state[0];
+ sha1word b = ctx->state[1];
+ sha1word c = ctx->state[2];
+ sha1word d = ctx->state[3];
+ sha1word e = ctx->state[4];
+
+#if defined(W_ARRAY)
+ sha1word w[16];
+#else /* !defined(W_ARRAY) */
+ sha1word w_0, w_1, w_2, w_3, w_4, w_5, w_6, w_7;
+ sha1word w_8, w_9, w_10, w_11, w_12, w_13, w_14, w_15;
+#endif /* !defined(W_ARRAY) */
+
+ W(0) = LOAD_BIG_32((void *)(blk + 0));
+ W(1) = LOAD_BIG_32((void *)(blk + 4));
+ W(2) = LOAD_BIG_32((void *)(blk + 8));
+ W(3) = LOAD_BIG_32((void *)(blk + 12));
+ W(4) = LOAD_BIG_32((void *)(blk + 16));
+ W(5) = LOAD_BIG_32((void *)(blk + 20));
+ W(6) = LOAD_BIG_32((void *)(blk + 24));
+ W(7) = LOAD_BIG_32((void *)(blk + 28));
+ W(8) = LOAD_BIG_32((void *)(blk + 32));
+ W(9) = LOAD_BIG_32((void *)(blk + 36));
+ W(10) = LOAD_BIG_32((void *)(blk + 40));
+ W(11) = LOAD_BIG_32((void *)(blk + 44));
+ W(12) = LOAD_BIG_32((void *)(blk + 48));
+ W(13) = LOAD_BIG_32((void *)(blk + 52));
+ W(14) = LOAD_BIG_32((void *)(blk + 56));
+ W(15) = LOAD_BIG_32((void *)(blk + 60));
+
+#endif /* !defined(__sparc) */
+
+ /*
+ * general optimization:
+ *
+ * even though this approach is described in the standard as
+ * being slower algorithmically, it is 30-40% faster than the
+ * "faster" version under SPARC, because this version has more
+ * of the constraints specified at compile-time and uses fewer
+ * variables (and therefore has better register utilization)
+ * than its "speedier" brother. (i've tried both, trust me)
+ *
+ * for either method given in the spec, there is an "assignment"
+ * phase where the following takes place:
+ *
+ * tmp = (main_computation);
+ * e = d; d = c; c = rotate_left(b, 30); b = a; a = tmp;
+ *
+ * we can make the algorithm go faster by not doing this work,
+ * but just pretending that `d' is now `e', etc. this works
+ * really well and obviates the need for a temporary variable.
+ * however, we still explicitly perform the rotate action,
+ * since it is cheaper on SPARC to do it once than to have to
+ * do it over and over again.
+ */
+
+ /* round 1 */
+ e = ROTATE_LEFT(a, 5) + F(b, c, d) + e + W(0) + SHA1_CONST(0); /* 0 */
+ b = ROTATE_LEFT(b, 30);
+
+ d = ROTATE_LEFT(e, 5) + F(a, b, c) + d + W(1) + SHA1_CONST(0); /* 1 */
+ a = ROTATE_LEFT(a, 30);
+
+ c = ROTATE_LEFT(d, 5) + F(e, a, b) + c + W(2) + SHA1_CONST(0); /* 2 */
+ e = ROTATE_LEFT(e, 30);
+
+ b = ROTATE_LEFT(c, 5) + F(d, e, a) + b + W(3) + SHA1_CONST(0); /* 3 */
+ d = ROTATE_LEFT(d, 30);
+
+ a = ROTATE_LEFT(b, 5) + F(c, d, e) + a + W(4) + SHA1_CONST(0); /* 4 */
+ c = ROTATE_LEFT(c, 30);
+
+ e = ROTATE_LEFT(a, 5) + F(b, c, d) + e + W(5) + SHA1_CONST(0); /* 5 */
+ b = ROTATE_LEFT(b, 30);
+
+ d = ROTATE_LEFT(e, 5) + F(a, b, c) + d + W(6) + SHA1_CONST(0); /* 6 */
+ a = ROTATE_LEFT(a, 30);
+
+ c = ROTATE_LEFT(d, 5) + F(e, a, b) + c + W(7) + SHA1_CONST(0); /* 7 */
+ e = ROTATE_LEFT(e, 30);
+
+ b = ROTATE_LEFT(c, 5) + F(d, e, a) + b + W(8) + SHA1_CONST(0); /* 8 */
+ d = ROTATE_LEFT(d, 30);
+
+ a = ROTATE_LEFT(b, 5) + F(c, d, e) + a + W(9) + SHA1_CONST(0); /* 9 */
+ c = ROTATE_LEFT(c, 30);
+
+ e = ROTATE_LEFT(a, 5) + F(b, c, d) + e + W(10) + SHA1_CONST(0); /* 10 */
+ b = ROTATE_LEFT(b, 30);
+
+ d = ROTATE_LEFT(e, 5) + F(a, b, c) + d + W(11) + SHA1_CONST(0); /* 11 */
+ a = ROTATE_LEFT(a, 30);
+
+ c = ROTATE_LEFT(d, 5) + F(e, a, b) + c + W(12) + SHA1_CONST(0); /* 12 */
+ e = ROTATE_LEFT(e, 30);
+
+ b = ROTATE_LEFT(c, 5) + F(d, e, a) + b + W(13) + SHA1_CONST(0); /* 13 */
+ d = ROTATE_LEFT(d, 30);
+
+ a = ROTATE_LEFT(b, 5) + F(c, d, e) + a + W(14) + SHA1_CONST(0); /* 14 */
+ c = ROTATE_LEFT(c, 30);
+
+ e = ROTATE_LEFT(a, 5) + F(b, c, d) + e + W(15) + SHA1_CONST(0); /* 15 */
+ b = ROTATE_LEFT(b, 30);
+
+ W(0) = ROTATE_LEFT((W(13) ^ W(8) ^ W(2) ^ W(0)), 1); /* 16 */
+ d = ROTATE_LEFT(e, 5) + F(a, b, c) + d + W(0) + SHA1_CONST(0);
+ a = ROTATE_LEFT(a, 30);
+
+ W(1) = ROTATE_LEFT((W(14) ^ W(9) ^ W(3) ^ W(1)), 1); /* 17 */
+ c = ROTATE_LEFT(d, 5) + F(e, a, b) + c + W(1) + SHA1_CONST(0);
+ e = ROTATE_LEFT(e, 30);
+
+ W(2) = ROTATE_LEFT((W(15) ^ W(10) ^ W(4) ^ W(2)), 1); /* 18 */
+ b = ROTATE_LEFT(c, 5) + F(d, e, a) + b + W(2) + SHA1_CONST(0);
+ d = ROTATE_LEFT(d, 30);
+
+ W(3) = ROTATE_LEFT((W(0) ^ W(11) ^ W(5) ^ W(3)), 1); /* 19 */
+ a = ROTATE_LEFT(b, 5) + F(c, d, e) + a + W(3) + SHA1_CONST(0);
+ c = ROTATE_LEFT(c, 30);
+
+ /* round 2 */
+ W(4) = ROTATE_LEFT((W(1) ^ W(12) ^ W(6) ^ W(4)), 1); /* 20 */
+ e = ROTATE_LEFT(a, 5) + G(b, c, d) + e + W(4) + SHA1_CONST(1);
+ b = ROTATE_LEFT(b, 30);
+
+ W(5) = ROTATE_LEFT((W(2) ^ W(13) ^ W(7) ^ W(5)), 1); /* 21 */
+ d = ROTATE_LEFT(e, 5) + G(a, b, c) + d + W(5) + SHA1_CONST(1);
+ a = ROTATE_LEFT(a, 30);
+
+ W(6) = ROTATE_LEFT((W(3) ^ W(14) ^ W(8) ^ W(6)), 1); /* 22 */
+ c = ROTATE_LEFT(d, 5) + G(e, a, b) + c + W(6) + SHA1_CONST(1);
+ e = ROTATE_LEFT(e, 30);
+
+ W(7) = ROTATE_LEFT((W(4) ^ W(15) ^ W(9) ^ W(7)), 1); /* 23 */
+ b = ROTATE_LEFT(c, 5) + G(d, e, a) + b + W(7) + SHA1_CONST(1);
+ d = ROTATE_LEFT(d, 30);
+
+ W(8) = ROTATE_LEFT((W(5) ^ W(0) ^ W(10) ^ W(8)), 1); /* 24 */
+ a = ROTATE_LEFT(b, 5) + G(c, d, e) + a + W(8) + SHA1_CONST(1);
+ c = ROTATE_LEFT(c, 30);
+
+ W(9) = ROTATE_LEFT((W(6) ^ W(1) ^ W(11) ^ W(9)), 1); /* 25 */
+ e = ROTATE_LEFT(a, 5) + G(b, c, d) + e + W(9) + SHA1_CONST(1);
+ b = ROTATE_LEFT(b, 30);
+
+ W(10) = ROTATE_LEFT((W(7) ^ W(2) ^ W(12) ^ W(10)), 1); /* 26 */
+ d = ROTATE_LEFT(e, 5) + G(a, b, c) + d + W(10) + SHA1_CONST(1);
+ a = ROTATE_LEFT(a, 30);
+
+ W(11) = ROTATE_LEFT((W(8) ^ W(3) ^ W(13) ^ W(11)), 1); /* 27 */
+ c = ROTATE_LEFT(d, 5) + G(e, a, b) + c + W(11) + SHA1_CONST(1);
+ e = ROTATE_LEFT(e, 30);
+
+ W(12) = ROTATE_LEFT((W(9) ^ W(4) ^ W(14) ^ W(12)), 1); /* 28 */
+ b = ROTATE_LEFT(c, 5) + G(d, e, a) + b + W(12) + SHA1_CONST(1);
+ d = ROTATE_LEFT(d, 30);
+
+ W(13) = ROTATE_LEFT((W(10) ^ W(5) ^ W(15) ^ W(13)), 1); /* 29 */
+ a = ROTATE_LEFT(b, 5) + G(c, d, e) + a + W(13) + SHA1_CONST(1);
+ c = ROTATE_LEFT(c, 30);
+
+ W(14) = ROTATE_LEFT((W(11) ^ W(6) ^ W(0) ^ W(14)), 1); /* 30 */
+ e = ROTATE_LEFT(a, 5) + G(b, c, d) + e + W(14) + SHA1_CONST(1);
+ b = ROTATE_LEFT(b, 30);
+
+ W(15) = ROTATE_LEFT((W(12) ^ W(7) ^ W(1) ^ W(15)), 1); /* 31 */
+ d = ROTATE_LEFT(e, 5) + G(a, b, c) + d + W(15) + SHA1_CONST(1);
+ a = ROTATE_LEFT(a, 30);
+
+ W(0) = ROTATE_LEFT((W(13) ^ W(8) ^ W(2) ^ W(0)), 1); /* 32 */
+ c = ROTATE_LEFT(d, 5) + G(e, a, b) + c + W(0) + SHA1_CONST(1);
+ e = ROTATE_LEFT(e, 30);
+
+ W(1) = ROTATE_LEFT((W(14) ^ W(9) ^ W(3) ^ W(1)), 1); /* 33 */
+ b = ROTATE_LEFT(c, 5) + G(d, e, a) + b + W(1) + SHA1_CONST(1);
+ d = ROTATE_LEFT(d, 30);
+
+ W(2) = ROTATE_LEFT((W(15) ^ W(10) ^ W(4) ^ W(2)), 1); /* 34 */
+ a = ROTATE_LEFT(b, 5) + G(c, d, e) + a + W(2) + SHA1_CONST(1);
+ c = ROTATE_LEFT(c, 30);
+
+ W(3) = ROTATE_LEFT((W(0) ^ W(11) ^ W(5) ^ W(3)), 1); /* 35 */
+ e = ROTATE_LEFT(a, 5) + G(b, c, d) + e + W(3) + SHA1_CONST(1);
+ b = ROTATE_LEFT(b, 30);
+
+ W(4) = ROTATE_LEFT((W(1) ^ W(12) ^ W(6) ^ W(4)), 1); /* 36 */
+ d = ROTATE_LEFT(e, 5) + G(a, b, c) + d + W(4) + SHA1_CONST(1);
+ a = ROTATE_LEFT(a, 30);
+
+ W(5) = ROTATE_LEFT((W(2) ^ W(13) ^ W(7) ^ W(5)), 1); /* 37 */
+ c = ROTATE_LEFT(d, 5) + G(e, a, b) + c + W(5) + SHA1_CONST(1);
+ e = ROTATE_LEFT(e, 30);
+
+ W(6) = ROTATE_LEFT((W(3) ^ W(14) ^ W(8) ^ W(6)), 1); /* 38 */
+ b = ROTATE_LEFT(c, 5) + G(d, e, a) + b + W(6) + SHA1_CONST(1);
+ d = ROTATE_LEFT(d, 30);
+
+ W(7) = ROTATE_LEFT((W(4) ^ W(15) ^ W(9) ^ W(7)), 1); /* 39 */
+ a = ROTATE_LEFT(b, 5) + G(c, d, e) + a + W(7) + SHA1_CONST(1);
+ c = ROTATE_LEFT(c, 30);
+
+ /* round 3 */
+ W(8) = ROTATE_LEFT((W(5) ^ W(0) ^ W(10) ^ W(8)), 1); /* 40 */
+ e = ROTATE_LEFT(a, 5) + H(b, c, d) + e + W(8) + SHA1_CONST(2);
+ b = ROTATE_LEFT(b, 30);
+
+ W(9) = ROTATE_LEFT((W(6) ^ W(1) ^ W(11) ^ W(9)), 1); /* 41 */
+ d = ROTATE_LEFT(e, 5) + H(a, b, c) + d + W(9) + SHA1_CONST(2);
+ a = ROTATE_LEFT(a, 30);
+
+ W(10) = ROTATE_LEFT((W(7) ^ W(2) ^ W(12) ^ W(10)), 1); /* 42 */
+ c = ROTATE_LEFT(d, 5) + H(e, a, b) + c + W(10) + SHA1_CONST(2);
+ e = ROTATE_LEFT(e, 30);
+
+ W(11) = ROTATE_LEFT((W(8) ^ W(3) ^ W(13) ^ W(11)), 1); /* 43 */
+ b = ROTATE_LEFT(c, 5) + H(d, e, a) + b + W(11) + SHA1_CONST(2);
+ d = ROTATE_LEFT(d, 30);
+
+ W(12) = ROTATE_LEFT((W(9) ^ W(4) ^ W(14) ^ W(12)), 1); /* 44 */
+ a = ROTATE_LEFT(b, 5) + H(c, d, e) + a + W(12) + SHA1_CONST(2);
+ c = ROTATE_LEFT(c, 30);
+
+ W(13) = ROTATE_LEFT((W(10) ^ W(5) ^ W(15) ^ W(13)), 1); /* 45 */
+ e = ROTATE_LEFT(a, 5) + H(b, c, d) + e + W(13) + SHA1_CONST(2);
+ b = ROTATE_LEFT(b, 30);
+
+ W(14) = ROTATE_LEFT((W(11) ^ W(6) ^ W(0) ^ W(14)), 1); /* 46 */
+ d = ROTATE_LEFT(e, 5) + H(a, b, c) + d + W(14) + SHA1_CONST(2);
+ a = ROTATE_LEFT(a, 30);
+
+ W(15) = ROTATE_LEFT((W(12) ^ W(7) ^ W(1) ^ W(15)), 1); /* 47 */
+ c = ROTATE_LEFT(d, 5) + H(e, a, b) + c + W(15) + SHA1_CONST(2);
+ e = ROTATE_LEFT(e, 30);
+
+ W(0) = ROTATE_LEFT((W(13) ^ W(8) ^ W(2) ^ W(0)), 1); /* 48 */
+ b = ROTATE_LEFT(c, 5) + H(d, e, a) + b + W(0) + SHA1_CONST(2);
+ d = ROTATE_LEFT(d, 30);
+
+ W(1) = ROTATE_LEFT((W(14) ^ W(9) ^ W(3) ^ W(1)), 1); /* 49 */
+ a = ROTATE_LEFT(b, 5) + H(c, d, e) + a + W(1) + SHA1_CONST(2);
+ c = ROTATE_LEFT(c, 30);
+
+ W(2) = ROTATE_LEFT((W(15) ^ W(10) ^ W(4) ^ W(2)), 1); /* 50 */
+ e = ROTATE_LEFT(a, 5) + H(b, c, d) + e + W(2) + SHA1_CONST(2);
+ b = ROTATE_LEFT(b, 30);
+
+ W(3) = ROTATE_LEFT((W(0) ^ W(11) ^ W(5) ^ W(3)), 1); /* 51 */
+ d = ROTATE_LEFT(e, 5) + H(a, b, c) + d + W(3) + SHA1_CONST(2);
+ a = ROTATE_LEFT(a, 30);
+
+ W(4) = ROTATE_LEFT((W(1) ^ W(12) ^ W(6) ^ W(4)), 1); /* 52 */
+ c = ROTATE_LEFT(d, 5) + H(e, a, b) + c + W(4) + SHA1_CONST(2);
+ e = ROTATE_LEFT(e, 30);
+
+ W(5) = ROTATE_LEFT((W(2) ^ W(13) ^ W(7) ^ W(5)), 1); /* 53 */
+ b = ROTATE_LEFT(c, 5) + H(d, e, a) + b + W(5) + SHA1_CONST(2);
+ d = ROTATE_LEFT(d, 30);
+
+ W(6) = ROTATE_LEFT((W(3) ^ W(14) ^ W(8) ^ W(6)), 1); /* 54 */
+ a = ROTATE_LEFT(b, 5) + H(c, d, e) + a + W(6) + SHA1_CONST(2);
+ c = ROTATE_LEFT(c, 30);
+
+ W(7) = ROTATE_LEFT((W(4) ^ W(15) ^ W(9) ^ W(7)), 1); /* 55 */
+ e = ROTATE_LEFT(a, 5) + H(b, c, d) + e + W(7) + SHA1_CONST(2);
+ b = ROTATE_LEFT(b, 30);
+
+ W(8) = ROTATE_LEFT((W(5) ^ W(0) ^ W(10) ^ W(8)), 1); /* 56 */
+ d = ROTATE_LEFT(e, 5) + H(a, b, c) + d + W(8) + SHA1_CONST(2);
+ a = ROTATE_LEFT(a, 30);
+
+ W(9) = ROTATE_LEFT((W(6) ^ W(1) ^ W(11) ^ W(9)), 1); /* 57 */
+ c = ROTATE_LEFT(d, 5) + H(e, a, b) + c + W(9) + SHA1_CONST(2);
+ e = ROTATE_LEFT(e, 30);
+
+ W(10) = ROTATE_LEFT((W(7) ^ W(2) ^ W(12) ^ W(10)), 1); /* 58 */
+ b = ROTATE_LEFT(c, 5) + H(d, e, a) + b + W(10) + SHA1_CONST(2);
+ d = ROTATE_LEFT(d, 30);
+
+ W(11) = ROTATE_LEFT((W(8) ^ W(3) ^ W(13) ^ W(11)), 1); /* 59 */
+ a = ROTATE_LEFT(b, 5) + H(c, d, e) + a + W(11) + SHA1_CONST(2);
+ c = ROTATE_LEFT(c, 30);
+
+ /* round 4 */
+ W(12) = ROTATE_LEFT((W(9) ^ W(4) ^ W(14) ^ W(12)), 1); /* 60 */
+ e = ROTATE_LEFT(a, 5) + G(b, c, d) + e + W(12) + SHA1_CONST(3);
+ b = ROTATE_LEFT(b, 30);
+
+ W(13) = ROTATE_LEFT((W(10) ^ W(5) ^ W(15) ^ W(13)), 1); /* 61 */
+ d = ROTATE_LEFT(e, 5) + G(a, b, c) + d + W(13) + SHA1_CONST(3);
+ a = ROTATE_LEFT(a, 30);
+
+ W(14) = ROTATE_LEFT((W(11) ^ W(6) ^ W(0) ^ W(14)), 1); /* 62 */
+ c = ROTATE_LEFT(d, 5) + G(e, a, b) + c + W(14) + SHA1_CONST(3);
+ e = ROTATE_LEFT(e, 30);
+
+ W(15) = ROTATE_LEFT((W(12) ^ W(7) ^ W(1) ^ W(15)), 1); /* 63 */
+ b = ROTATE_LEFT(c, 5) + G(d, e, a) + b + W(15) + SHA1_CONST(3);
+ d = ROTATE_LEFT(d, 30);
+
+ W(0) = ROTATE_LEFT((W(13) ^ W(8) ^ W(2) ^ W(0)), 1); /* 64 */
+ a = ROTATE_LEFT(b, 5) + G(c, d, e) + a + W(0) + SHA1_CONST(3);
+ c = ROTATE_LEFT(c, 30);
+
+ W(1) = ROTATE_LEFT((W(14) ^ W(9) ^ W(3) ^ W(1)), 1); /* 65 */
+ e = ROTATE_LEFT(a, 5) + G(b, c, d) + e + W(1) + SHA1_CONST(3);
+ b = ROTATE_LEFT(b, 30);
+
+ W(2) = ROTATE_LEFT((W(15) ^ W(10) ^ W(4) ^ W(2)), 1); /* 66 */
+ d = ROTATE_LEFT(e, 5) + G(a, b, c) + d + W(2) + SHA1_CONST(3);
+ a = ROTATE_LEFT(a, 30);
+
+ W(3) = ROTATE_LEFT((W(0) ^ W(11) ^ W(5) ^ W(3)), 1); /* 67 */
+ c = ROTATE_LEFT(d, 5) + G(e, a, b) + c + W(3) + SHA1_CONST(3);
+ e = ROTATE_LEFT(e, 30);
+
+ W(4) = ROTATE_LEFT((W(1) ^ W(12) ^ W(6) ^ W(4)), 1); /* 68 */
+ b = ROTATE_LEFT(c, 5) + G(d, e, a) + b + W(4) + SHA1_CONST(3);
+ d = ROTATE_LEFT(d, 30);
+
+ W(5) = ROTATE_LEFT((W(2) ^ W(13) ^ W(7) ^ W(5)), 1); /* 69 */
+ a = ROTATE_LEFT(b, 5) + G(c, d, e) + a + W(5) + SHA1_CONST(3);
+ c = ROTATE_LEFT(c, 30);
+
+ W(6) = ROTATE_LEFT((W(3) ^ W(14) ^ W(8) ^ W(6)), 1); /* 70 */
+ e = ROTATE_LEFT(a, 5) + G(b, c, d) + e + W(6) + SHA1_CONST(3);
+ b = ROTATE_LEFT(b, 30);
+
+ W(7) = ROTATE_LEFT((W(4) ^ W(15) ^ W(9) ^ W(7)), 1); /* 71 */
+ d = ROTATE_LEFT(e, 5) + G(a, b, c) + d + W(7) + SHA1_CONST(3);
+ a = ROTATE_LEFT(a, 30);
+
+ W(8) = ROTATE_LEFT((W(5) ^ W(0) ^ W(10) ^ W(8)), 1); /* 72 */
+ c = ROTATE_LEFT(d, 5) + G(e, a, b) + c + W(8) + SHA1_CONST(3);
+ e = ROTATE_LEFT(e, 30);
+
+ W(9) = ROTATE_LEFT((W(6) ^ W(1) ^ W(11) ^ W(9)), 1); /* 73 */
+ b = ROTATE_LEFT(c, 5) + G(d, e, a) + b + W(9) + SHA1_CONST(3);
+ d = ROTATE_LEFT(d, 30);
+
+ W(10) = ROTATE_LEFT((W(7) ^ W(2) ^ W(12) ^ W(10)), 1); /* 74 */
+ a = ROTATE_LEFT(b, 5) + G(c, d, e) + a + W(10) + SHA1_CONST(3);
+ c = ROTATE_LEFT(c, 30);
+
+ W(11) = ROTATE_LEFT((W(8) ^ W(3) ^ W(13) ^ W(11)), 1); /* 75 */
+ e = ROTATE_LEFT(a, 5) + G(b, c, d) + e + W(11) + SHA1_CONST(3);
+ b = ROTATE_LEFT(b, 30);
+
+ W(12) = ROTATE_LEFT((W(9) ^ W(4) ^ W(14) ^ W(12)), 1); /* 76 */
+ d = ROTATE_LEFT(e, 5) + G(a, b, c) + d + W(12) + SHA1_CONST(3);
+ a = ROTATE_LEFT(a, 30);
+
+ W(13) = ROTATE_LEFT((W(10) ^ W(5) ^ W(15) ^ W(13)), 1); /* 77 */
+ c = ROTATE_LEFT(d, 5) + G(e, a, b) + c + W(13) + SHA1_CONST(3);
+ e = ROTATE_LEFT(e, 30);
+
+ W(14) = ROTATE_LEFT((W(11) ^ W(6) ^ W(0) ^ W(14)), 1); /* 78 */
+ b = ROTATE_LEFT(c, 5) + G(d, e, a) + b + W(14) + SHA1_CONST(3);
+ d = ROTATE_LEFT(d, 30);
+
+ W(15) = ROTATE_LEFT((W(12) ^ W(7) ^ W(1) ^ W(15)), 1); /* 79 */
+
+ ctx->state[0] += ROTATE_LEFT(b, 5) + G(c, d, e) + a + W(15) +
+ SHA1_CONST(3);
+ ctx->state[1] += b;
+ ctx->state[2] += ROTATE_LEFT(c, 30);
+ ctx->state[3] += d;
+ ctx->state[4] += e;
+
+ /* zeroize sensitive information */
+ W(0) = W(1) = W(2) = W(3) = W(4) = W(5) = W(6) = W(7) = W(8) = 0;
+ W(9) = W(10) = W(11) = W(12) = W(13) = W(14) = W(15) = 0;
+}
+#endif /* !__amd64 */
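
The renaming trick described in the block comment inside SHA1Transform() above is easy to check in isolation. Below is a minimal standalone sketch (not part of this module) with a stand-in round function: after two rounds, the textbook register shuffle and the unrolled, renamed form hold the same values, merely in permuted variables.

/*
 * Illustration only: MIX() is an arbitrary round-shaped function, not SHA-1.
 */
#include <assert.h>
#include <stdint.h>

#define ROTL32(x, n)    (((x) << (n)) | ((x) >> (32 - (n))))
#define MIX(a, b, c, d, e, w)   (ROTL32(a, 5) + ((b) ^ (c) ^ (d)) + (e) + (w))

int
main(void)
{
        uint32_t a = 1, b = 2, c = 3, d = 4, e = 5;     /* textbook copy */
        uint32_t A = 1, B = 2, C = 3, D = 4, E = 5;     /* unrolled copy */
        uint32_t w[2] = { 0x1111, 0x2222 };
        int i;

        /* textbook form: compute tmp, then shift every register down */
        for (i = 0; i < 2; i++) {
                uint32_t tmp = MIX(a, b, c, d, e, w[i]);
                e = d; d = c; c = ROTL32(b, 30); b = a; a = tmp;
        }

        /* unrolled form: no shuffling, the variables simply change roles */
        E = MIX(A, B, C, D, E, w[0]); B = ROTL32(B, 30);        /* round 0 */
        D = MIX(E, A, B, C, D, w[1]); A = ROTL32(A, 30);        /* round 1 */

        /* same state, held in permuted variables */
        assert(a == D && b == E && c == A && d == B && e == C);
        return (0);
}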
+
+
+/*
+ * Encode()
+ *
+ * purpose: to convert a list of numbers from host byte order to big endian
+ * input: uint8_t * : place to store the converted big endian numbers
+ * uint32_t * : place to get numbers to convert from
+ * size_t : the length of the input in bytes
+ * output: void
+ */
+
+static void
+Encode(uint8_t *_RESTRICT_KYWD output, const uint32_t *_RESTRICT_KYWD input,
+ size_t len)
+{
+ size_t i, j;
+
+#if defined(__sparc)
+ if (IS_P2ALIGNED(output, sizeof (uint32_t))) {
+ for (i = 0, j = 0; j < len; i++, j += 4) {
+ /* LINTED E_BAD_PTR_CAST_ALIGN */
+ *((uint32_t *)(output + j)) = input[i];
+ }
+ } else {
+#endif /* little endian -- will work on big endian, but slowly */
+
+ for (i = 0, j = 0; j < len; i++, j += 4) {
+ output[j] = (input[i] >> 24) & 0xff;
+ output[j + 1] = (input[i] >> 16) & 0xff;
+ output[j + 2] = (input[i] >> 8) & 0xff;
+ output[j + 3] = input[i] & 0xff;
+ }
+#if defined(__sparc)
+ }
+#endif
+}
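
As a reference point, the byte layout Encode() emits can be pinned down with a tiny standalone check (re-implemented here so it compiles on its own): the most significant byte of each word always lands first in the output, whatever the host byte order.

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

static void
encode_be32(uint8_t *out, const uint32_t *in, size_t len)
{
        size_t i, j;

        for (i = 0, j = 0; j < len; i++, j += 4) {
                out[j] = (in[i] >> 24) & 0xff;
                out[j + 1] = (in[i] >> 16) & 0xff;
                out[j + 2] = (in[i] >> 8) & 0xff;
                out[j + 3] = in[i] & 0xff;
        }
}

int
main(void)
{
        uint32_t words[2] = { 0x01020304, 0xa0b0c0d0 };
        uint8_t out[8];

        encode_be32(out, words, sizeof (out));
        assert(out[0] == 0x01 && out[3] == 0x04);       /* MSB first */
        assert(out[4] == 0xa0 && out[7] == 0xd0);
        return (0);
}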
diff --git a/sys/contrib/openzfs/module/icp/algs/sha2/sha2.c b/sys/contrib/openzfs/module/icp/algs/sha2/sha2.c
new file mode 100644
index 000000000000..75f6a3c1af4b
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/algs/sha2/sha2.c
@@ -0,0 +1,956 @@
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * Copyright 2013 Saso Kiselkov. All rights reserved.
+ */
+
+/*
+ * The basic framework for this code came from the reference
+ * implementation for MD5. That implementation is Copyright (C)
+ * 1991-2, RSA Data Security, Inc. Created 1991. All rights reserved.
+ *
+ * License to copy and use this software is granted provided that it
+ * is identified as the "RSA Data Security, Inc. MD5 Message-Digest
+ * Algorithm" in all material mentioning or referencing this software
+ * or this function.
+ *
+ * License is also granted to make and use derivative works provided
+ * that such works are identified as "derived from the RSA Data
+ * Security, Inc. MD5 Message-Digest Algorithm" in all material
+ * mentioning or referencing the derived work.
+ *
+ * RSA Data Security, Inc. makes no representations concerning either
+ * the merchantability of this software or the suitability of this
+ * software for any particular purpose. It is provided "as is"
+ * without express or implied warranty of any kind.
+ *
+ * These notices must be retained in any copies of any part of this
+ * documentation and/or software.
+ *
+ * NOTE: Cleaned-up and optimized, version of SHA2, based on the FIPS 180-2
+ * standard, available at
+ * http://csrc.nist.gov/publications/fips/fips180-2/fips180-2.pdf
+ * Not as fast as one would like -- further optimizations are encouraged
+ * and appreciated.
+ */
+
+#include <sys/zfs_context.h>
+#define _SHA2_IMPL
+#include <sys/sha2.h>
+#include <sha2/sha2_consts.h>
+
+#define _RESTRICT_KYWD
+
+#ifdef _ZFS_LITTLE_ENDIAN
+#include <sys/byteorder.h>
+#define HAVE_HTONL
+#endif
+#include <sys/isa_defs.h> /* for _ILP32 */
+
+static void Encode(uint8_t *, uint32_t *, size_t);
+static void Encode64(uint8_t *, uint64_t *, size_t);
+
+/* userspace only supports the generic version */
+#if defined(__amd64) && defined(_KERNEL)
+#define SHA512Transform(ctx, in) SHA512TransformBlocks((ctx), (in), 1)
+#define SHA256Transform(ctx, in) SHA256TransformBlocks((ctx), (in), 1)
+
+void SHA512TransformBlocks(SHA2_CTX *ctx, const void *in, size_t num);
+void SHA256TransformBlocks(SHA2_CTX *ctx, const void *in, size_t num);
+
+#else
+static void SHA256Transform(SHA2_CTX *, const uint8_t *);
+static void SHA512Transform(SHA2_CTX *, const uint8_t *);
+#endif /* __amd64 && _KERNEL */
+
+static uint8_t PADDING[128] = { 0x80, /* all zeros */ };
+
+/*
+ * The low-level checksum routines use a lot of stack space. On systems where
+ * small stacks are enforced (like 32-bit kernel builds), insert compiler memory
+ * barriers to reduce stack frame size. This can reduce the SHA512Transform()
+ * stack frame usage from 3k to <1k on ARM32, for example.
+ */
+#if defined(_ILP32) || defined(__powerpc) /* small stack */
+#define SMALL_STACK_MEMORY_BARRIER asm volatile("": : :"memory");
+#else
+#define SMALL_STACK_MEMORY_BARRIER
+#endif
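
For readers unfamiliar with the construct, the barrier above is a compiler-only fence written in gcc/clang extended-asm form; a minimal standalone sketch of the same idea follows. It emits no instruction, but the "memory" clobber stops the optimizer from caching memory-backed values across it, which (per the comment above) is what keeps the live state, and hence the stack frame, of the unrolled transform small on 32-bit builds.

/* Sketch only; assumes a gcc/clang-compatible compiler. */
#include <stddef.h>
#include <stdint.h>

#define COMPILER_BARRIER()      asm volatile("" : : : "memory")

uint64_t
sum64(const uint64_t *w, size_t n)
{
        uint64_t acc = 0;
        size_t i;

        for (i = 0; i < n; i++) {
                acc += w[i];
                /* no code emitted; memory may not be cached across this point */
                COMPILER_BARRIER();
        }
        return (acc);
}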
+
+/* Ch and Maj are the basic SHA2 functions. */
+#define Ch(b, c, d) (((b) & (c)) ^ ((~b) & (d)))
+#define Maj(b, c, d) (((b) & (c)) ^ ((b) & (d)) ^ ((c) & (d)))
+
+/* Rotates x right n bits. */
+#define ROTR(x, n) \
+ (((x) >> (n)) | ((x) << ((sizeof (x) * NBBY)-(n))))
+
+/* Shift x right n bits */
+#define SHR(x, n) ((x) >> (n))
+
+/* SHA256 Functions */
+#define BIGSIGMA0_256(x) (ROTR((x), 2) ^ ROTR((x), 13) ^ ROTR((x), 22))
+#define BIGSIGMA1_256(x) (ROTR((x), 6) ^ ROTR((x), 11) ^ ROTR((x), 25))
+#define SIGMA0_256(x) (ROTR((x), 7) ^ ROTR((x), 18) ^ SHR((x), 3))
+#define SIGMA1_256(x) (ROTR((x), 17) ^ ROTR((x), 19) ^ SHR((x), 10))
+
+#define SHA256ROUND(a, b, c, d, e, f, g, h, i, w) \
+ T1 = h + BIGSIGMA1_256(e) + Ch(e, f, g) + SHA256_CONST(i) + w; \
+ d += T1; \
+ T2 = BIGSIGMA0_256(a) + Maj(a, b, c); \
+ h = T1 + T2
+
+/* SHA384/512 Functions */
+#define BIGSIGMA0(x) (ROTR((x), 28) ^ ROTR((x), 34) ^ ROTR((x), 39))
+#define BIGSIGMA1(x) (ROTR((x), 14) ^ ROTR((x), 18) ^ ROTR((x), 41))
+#define SIGMA0(x) (ROTR((x), 1) ^ ROTR((x), 8) ^ SHR((x), 7))
+#define SIGMA1(x) (ROTR((x), 19) ^ ROTR((x), 61) ^ SHR((x), 6))
+#define SHA512ROUND(a, b, c, d, e, f, g, h, i, w) \
+ T1 = h + BIGSIGMA1(e) + Ch(e, f, g) + SHA512_CONST(i) + w; \
+ d += T1; \
+ T2 = BIGSIGMA0(a) + Maj(a, b, c); \
+ h = T1 + T2; \
+ SMALL_STACK_MEMORY_BARRIER;
+
+/*
+ * sparc optimization:
+ *
+ * on the sparc, we can load big endian 32-bit data easily. note that
+ * special care must be taken to ensure the address is 32-bit aligned.
+ * in the interest of speed, we don't check to make sure, since
+ * careful programming can guarantee this for us.
+ */
+
+#if defined(_ZFS_BIG_ENDIAN)
+#define LOAD_BIG_32(addr) (*(uint32_t *)(addr))
+#define LOAD_BIG_64(addr) (*(uint64_t *)(addr))
+
+#elif defined(HAVE_HTONL)
+#define LOAD_BIG_32(addr) htonl(*((uint32_t *)(addr)))
+#define LOAD_BIG_64(addr) htonll(*((uint64_t *)(addr)))
+
+#else
+/* little endian -- will work on big endian, but slowly */
+#define LOAD_BIG_32(addr) \
+ (((addr)[0] << 24) | ((addr)[1] << 16) | ((addr)[2] << 8) | (addr)[3])
+#define LOAD_BIG_64(addr) \
+ (((uint64_t)(addr)[0] << 56) | ((uint64_t)(addr)[1] << 48) | \
+ ((uint64_t)(addr)[2] << 40) | ((uint64_t)(addr)[3] << 32) | \
+ ((uint64_t)(addr)[4] << 24) | ((uint64_t)(addr)[5] << 16) | \
+ ((uint64_t)(addr)[6] << 8) | (uint64_t)(addr)[7])
+#endif /* _ZFS_BIG_ENDIAN */
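
A standalone spot-check of the fallback byte-assembly form (explicit casts added here, which the sketch needs for strict portability): it always reads the most significant byte first, i.e. it is a true big-endian load on any host.

#include <assert.h>
#include <stdint.h>

static uint32_t
load_be32(const uint8_t *p)
{
        return (((uint32_t)p[0] << 24) | ((uint32_t)p[1] << 16) |
            ((uint32_t)p[2] << 8) | (uint32_t)p[3]);
}

int
main(void)
{
        const uint8_t buf[4] = { 0xde, 0xad, 0xbe, 0xef };

        assert(load_be32(buf) == 0xdeadbeefU);
        return (0);
}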
+
+
+#if !defined(__amd64) || !defined(_KERNEL)
+/* SHA256 Transform */
+
+static void
+SHA256Transform(SHA2_CTX *ctx, const uint8_t *blk)
+{
+ uint32_t a = ctx->state.s32[0];
+ uint32_t b = ctx->state.s32[1];
+ uint32_t c = ctx->state.s32[2];
+ uint32_t d = ctx->state.s32[3];
+ uint32_t e = ctx->state.s32[4];
+ uint32_t f = ctx->state.s32[5];
+ uint32_t g = ctx->state.s32[6];
+ uint32_t h = ctx->state.s32[7];
+
+ uint32_t w0, w1, w2, w3, w4, w5, w6, w7;
+ uint32_t w8, w9, w10, w11, w12, w13, w14, w15;
+ uint32_t T1, T2;
+
+#if defined(__sparc)
+ static const uint32_t sha256_consts[] = {
+ SHA256_CONST_0, SHA256_CONST_1, SHA256_CONST_2,
+ SHA256_CONST_3, SHA256_CONST_4, SHA256_CONST_5,
+ SHA256_CONST_6, SHA256_CONST_7, SHA256_CONST_8,
+ SHA256_CONST_9, SHA256_CONST_10, SHA256_CONST_11,
+ SHA256_CONST_12, SHA256_CONST_13, SHA256_CONST_14,
+ SHA256_CONST_15, SHA256_CONST_16, SHA256_CONST_17,
+ SHA256_CONST_18, SHA256_CONST_19, SHA256_CONST_20,
+ SHA256_CONST_21, SHA256_CONST_22, SHA256_CONST_23,
+ SHA256_CONST_24, SHA256_CONST_25, SHA256_CONST_26,
+ SHA256_CONST_27, SHA256_CONST_28, SHA256_CONST_29,
+ SHA256_CONST_30, SHA256_CONST_31, SHA256_CONST_32,
+ SHA256_CONST_33, SHA256_CONST_34, SHA256_CONST_35,
+ SHA256_CONST_36, SHA256_CONST_37, SHA256_CONST_38,
+ SHA256_CONST_39, SHA256_CONST_40, SHA256_CONST_41,
+ SHA256_CONST_42, SHA256_CONST_43, SHA256_CONST_44,
+ SHA256_CONST_45, SHA256_CONST_46, SHA256_CONST_47,
+ SHA256_CONST_48, SHA256_CONST_49, SHA256_CONST_50,
+ SHA256_CONST_51, SHA256_CONST_52, SHA256_CONST_53,
+ SHA256_CONST_54, SHA256_CONST_55, SHA256_CONST_56,
+ SHA256_CONST_57, SHA256_CONST_58, SHA256_CONST_59,
+ SHA256_CONST_60, SHA256_CONST_61, SHA256_CONST_62,
+ SHA256_CONST_63
+ };
+#endif /* __sparc */
+
+ if ((uintptr_t)blk & 0x3) { /* not 4-byte aligned? */
+ bcopy(blk, ctx->buf_un.buf32, sizeof (ctx->buf_un.buf32));
+ blk = (uint8_t *)ctx->buf_un.buf32;
+ }
+
+ /* LINTED E_BAD_PTR_CAST_ALIGN */
+ w0 = LOAD_BIG_32(blk + 4 * 0);
+ SHA256ROUND(a, b, c, d, e, f, g, h, 0, w0);
+ /* LINTED E_BAD_PTR_CAST_ALIGN */
+ w1 = LOAD_BIG_32(blk + 4 * 1);
+ SHA256ROUND(h, a, b, c, d, e, f, g, 1, w1);
+ /* LINTED E_BAD_PTR_CAST_ALIGN */
+ w2 = LOAD_BIG_32(blk + 4 * 2);
+ SHA256ROUND(g, h, a, b, c, d, e, f, 2, w2);
+ /* LINTED E_BAD_PTR_CAST_ALIGN */
+ w3 = LOAD_BIG_32(blk + 4 * 3);
+ SHA256ROUND(f, g, h, a, b, c, d, e, 3, w3);
+ /* LINTED E_BAD_PTR_CAST_ALIGN */
+ w4 = LOAD_BIG_32(blk + 4 * 4);
+ SHA256ROUND(e, f, g, h, a, b, c, d, 4, w4);
+ /* LINTED E_BAD_PTR_CAST_ALIGN */
+ w5 = LOAD_BIG_32(blk + 4 * 5);
+ SHA256ROUND(d, e, f, g, h, a, b, c, 5, w5);
+ /* LINTED E_BAD_PTR_CAST_ALIGN */
+ w6 = LOAD_BIG_32(blk + 4 * 6);
+ SHA256ROUND(c, d, e, f, g, h, a, b, 6, w6);
+ /* LINTED E_BAD_PTR_CAST_ALIGN */
+ w7 = LOAD_BIG_32(blk + 4 * 7);
+ SHA256ROUND(b, c, d, e, f, g, h, a, 7, w7);
+ /* LINTED E_BAD_PTR_CAST_ALIGN */
+ w8 = LOAD_BIG_32(blk + 4 * 8);
+ SHA256ROUND(a, b, c, d, e, f, g, h, 8, w8);
+ /* LINTED E_BAD_PTR_CAST_ALIGN */
+ w9 = LOAD_BIG_32(blk + 4 * 9);
+ SHA256ROUND(h, a, b, c, d, e, f, g, 9, w9);
+ /* LINTED E_BAD_PTR_CAST_ALIGN */
+ w10 = LOAD_BIG_32(blk + 4 * 10);
+ SHA256ROUND(g, h, a, b, c, d, e, f, 10, w10);
+ /* LINTED E_BAD_PTR_CAST_ALIGN */
+ w11 = LOAD_BIG_32(blk + 4 * 11);
+ SHA256ROUND(f, g, h, a, b, c, d, e, 11, w11);
+ /* LINTED E_BAD_PTR_CAST_ALIGN */
+ w12 = LOAD_BIG_32(blk + 4 * 12);
+ SHA256ROUND(e, f, g, h, a, b, c, d, 12, w12);
+ /* LINTED E_BAD_PTR_CAST_ALIGN */
+ w13 = LOAD_BIG_32(blk + 4 * 13);
+ SHA256ROUND(d, e, f, g, h, a, b, c, 13, w13);
+ /* LINTED E_BAD_PTR_CAST_ALIGN */
+ w14 = LOAD_BIG_32(blk + 4 * 14);
+ SHA256ROUND(c, d, e, f, g, h, a, b, 14, w14);
+ /* LINTED E_BAD_PTR_CAST_ALIGN */
+ w15 = LOAD_BIG_32(blk + 4 * 15);
+ SHA256ROUND(b, c, d, e, f, g, h, a, 15, w15);
+
+ w0 = SIGMA1_256(w14) + w9 + SIGMA0_256(w1) + w0;
+ SHA256ROUND(a, b, c, d, e, f, g, h, 16, w0);
+ w1 = SIGMA1_256(w15) + w10 + SIGMA0_256(w2) + w1;
+ SHA256ROUND(h, a, b, c, d, e, f, g, 17, w1);
+ w2 = SIGMA1_256(w0) + w11 + SIGMA0_256(w3) + w2;
+ SHA256ROUND(g, h, a, b, c, d, e, f, 18, w2);
+ w3 = SIGMA1_256(w1) + w12 + SIGMA0_256(w4) + w3;
+ SHA256ROUND(f, g, h, a, b, c, d, e, 19, w3);
+ w4 = SIGMA1_256(w2) + w13 + SIGMA0_256(w5) + w4;
+ SHA256ROUND(e, f, g, h, a, b, c, d, 20, w4);
+ w5 = SIGMA1_256(w3) + w14 + SIGMA0_256(w6) + w5;
+ SHA256ROUND(d, e, f, g, h, a, b, c, 21, w5);
+ w6 = SIGMA1_256(w4) + w15 + SIGMA0_256(w7) + w6;
+ SHA256ROUND(c, d, e, f, g, h, a, b, 22, w6);
+ w7 = SIGMA1_256(w5) + w0 + SIGMA0_256(w8) + w7;
+ SHA256ROUND(b, c, d, e, f, g, h, a, 23, w7);
+ w8 = SIGMA1_256(w6) + w1 + SIGMA0_256(w9) + w8;
+ SHA256ROUND(a, b, c, d, e, f, g, h, 24, w8);
+ w9 = SIGMA1_256(w7) + w2 + SIGMA0_256(w10) + w9;
+ SHA256ROUND(h, a, b, c, d, e, f, g, 25, w9);
+ w10 = SIGMA1_256(w8) + w3 + SIGMA0_256(w11) + w10;
+ SHA256ROUND(g, h, a, b, c, d, e, f, 26, w10);
+ w11 = SIGMA1_256(w9) + w4 + SIGMA0_256(w12) + w11;
+ SHA256ROUND(f, g, h, a, b, c, d, e, 27, w11);
+ w12 = SIGMA1_256(w10) + w5 + SIGMA0_256(w13) + w12;
+ SHA256ROUND(e, f, g, h, a, b, c, d, 28, w12);
+ w13 = SIGMA1_256(w11) + w6 + SIGMA0_256(w14) + w13;
+ SHA256ROUND(d, e, f, g, h, a, b, c, 29, w13);
+ w14 = SIGMA1_256(w12) + w7 + SIGMA0_256(w15) + w14;
+ SHA256ROUND(c, d, e, f, g, h, a, b, 30, w14);
+ w15 = SIGMA1_256(w13) + w8 + SIGMA0_256(w0) + w15;
+ SHA256ROUND(b, c, d, e, f, g, h, a, 31, w15);
+
+ w0 = SIGMA1_256(w14) + w9 + SIGMA0_256(w1) + w0;
+ SHA256ROUND(a, b, c, d, e, f, g, h, 32, w0);
+ w1 = SIGMA1_256(w15) + w10 + SIGMA0_256(w2) + w1;
+ SHA256ROUND(h, a, b, c, d, e, f, g, 33, w1);
+ w2 = SIGMA1_256(w0) + w11 + SIGMA0_256(w3) + w2;
+ SHA256ROUND(g, h, a, b, c, d, e, f, 34, w2);
+ w3 = SIGMA1_256(w1) + w12 + SIGMA0_256(w4) + w3;
+ SHA256ROUND(f, g, h, a, b, c, d, e, 35, w3);
+ w4 = SIGMA1_256(w2) + w13 + SIGMA0_256(w5) + w4;
+ SHA256ROUND(e, f, g, h, a, b, c, d, 36, w4);
+ w5 = SIGMA1_256(w3) + w14 + SIGMA0_256(w6) + w5;
+ SHA256ROUND(d, e, f, g, h, a, b, c, 37, w5);
+ w6 = SIGMA1_256(w4) + w15 + SIGMA0_256(w7) + w6;
+ SHA256ROUND(c, d, e, f, g, h, a, b, 38, w6);
+ w7 = SIGMA1_256(w5) + w0 + SIGMA0_256(w8) + w7;
+ SHA256ROUND(b, c, d, e, f, g, h, a, 39, w7);
+ w8 = SIGMA1_256(w6) + w1 + SIGMA0_256(w9) + w8;
+ SHA256ROUND(a, b, c, d, e, f, g, h, 40, w8);
+ w9 = SIGMA1_256(w7) + w2 + SIGMA0_256(w10) + w9;
+ SHA256ROUND(h, a, b, c, d, e, f, g, 41, w9);
+ w10 = SIGMA1_256(w8) + w3 + SIGMA0_256(w11) + w10;
+ SHA256ROUND(g, h, a, b, c, d, e, f, 42, w10);
+ w11 = SIGMA1_256(w9) + w4 + SIGMA0_256(w12) + w11;
+ SHA256ROUND(f, g, h, a, b, c, d, e, 43, w11);
+ w12 = SIGMA1_256(w10) + w5 + SIGMA0_256(w13) + w12;
+ SHA256ROUND(e, f, g, h, a, b, c, d, 44, w12);
+ w13 = SIGMA1_256(w11) + w6 + SIGMA0_256(w14) + w13;
+ SHA256ROUND(d, e, f, g, h, a, b, c, 45, w13);
+ w14 = SIGMA1_256(w12) + w7 + SIGMA0_256(w15) + w14;
+ SHA256ROUND(c, d, e, f, g, h, a, b, 46, w14);
+ w15 = SIGMA1_256(w13) + w8 + SIGMA0_256(w0) + w15;
+ SHA256ROUND(b, c, d, e, f, g, h, a, 47, w15);
+
+ w0 = SIGMA1_256(w14) + w9 + SIGMA0_256(w1) + w0;
+ SHA256ROUND(a, b, c, d, e, f, g, h, 48, w0);
+ w1 = SIGMA1_256(w15) + w10 + SIGMA0_256(w2) + w1;
+ SHA256ROUND(h, a, b, c, d, e, f, g, 49, w1);
+ w2 = SIGMA1_256(w0) + w11 + SIGMA0_256(w3) + w2;
+ SHA256ROUND(g, h, a, b, c, d, e, f, 50, w2);
+ w3 = SIGMA1_256(w1) + w12 + SIGMA0_256(w4) + w3;
+ SHA256ROUND(f, g, h, a, b, c, d, e, 51, w3);
+ w4 = SIGMA1_256(w2) + w13 + SIGMA0_256(w5) + w4;
+ SHA256ROUND(e, f, g, h, a, b, c, d, 52, w4);
+ w5 = SIGMA1_256(w3) + w14 + SIGMA0_256(w6) + w5;
+ SHA256ROUND(d, e, f, g, h, a, b, c, 53, w5);
+ w6 = SIGMA1_256(w4) + w15 + SIGMA0_256(w7) + w6;
+ SHA256ROUND(c, d, e, f, g, h, a, b, 54, w6);
+ w7 = SIGMA1_256(w5) + w0 + SIGMA0_256(w8) + w7;
+ SHA256ROUND(b, c, d, e, f, g, h, a, 55, w7);
+ w8 = SIGMA1_256(w6) + w1 + SIGMA0_256(w9) + w8;
+ SHA256ROUND(a, b, c, d, e, f, g, h, 56, w8);
+ w9 = SIGMA1_256(w7) + w2 + SIGMA0_256(w10) + w9;
+ SHA256ROUND(h, a, b, c, d, e, f, g, 57, w9);
+ w10 = SIGMA1_256(w8) + w3 + SIGMA0_256(w11) + w10;
+ SHA256ROUND(g, h, a, b, c, d, e, f, 58, w10);
+ w11 = SIGMA1_256(w9) + w4 + SIGMA0_256(w12) + w11;
+ SHA256ROUND(f, g, h, a, b, c, d, e, 59, w11);
+ w12 = SIGMA1_256(w10) + w5 + SIGMA0_256(w13) + w12;
+ SHA256ROUND(e, f, g, h, a, b, c, d, 60, w12);
+ w13 = SIGMA1_256(w11) + w6 + SIGMA0_256(w14) + w13;
+ SHA256ROUND(d, e, f, g, h, a, b, c, 61, w13);
+ w14 = SIGMA1_256(w12) + w7 + SIGMA0_256(w15) + w14;
+ SHA256ROUND(c, d, e, f, g, h, a, b, 62, w14);
+ w15 = SIGMA1_256(w13) + w8 + SIGMA0_256(w0) + w15;
+ SHA256ROUND(b, c, d, e, f, g, h, a, 63, w15);
+
+ ctx->state.s32[0] += a;
+ ctx->state.s32[1] += b;
+ ctx->state.s32[2] += c;
+ ctx->state.s32[3] += d;
+ ctx->state.s32[4] += e;
+ ctx->state.s32[5] += f;
+ ctx->state.s32[6] += g;
+ ctx->state.s32[7] += h;
+}
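
A quick standalone spot-check of the rotate/shift primitives driving the rounds above, with the macros re-declared locally so the sketch is self-contained; both values are easy to verify by hand.

#include <assert.h>
#include <stdint.h>

#define ROTR32(x, n)    (((x) >> (n)) | ((x) << (32 - (n))))
#define SHR32(x, n)     ((x) >> (n))
#define SIG0_256(x)     (ROTR32((x), 7) ^ ROTR32((x), 18) ^ SHR32((x), 3))

int
main(void)
{
        /* rotating the top bit right by one lands it in bit 30 */
        assert(ROTR32((uint32_t)0x80000000U, 1) == 0x40000000U);
        /* sigma0(1): bit 0 rotates to bits 25 and 14, and is shifted away */
        assert(SIG0_256((uint32_t)1U) == 0x02004000U);
        return (0);
}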
+
+
+/* SHA384 and SHA512 Transform */
+
+static void
+SHA512Transform(SHA2_CTX *ctx, const uint8_t *blk)
+{
+
+ uint64_t a = ctx->state.s64[0];
+ uint64_t b = ctx->state.s64[1];
+ uint64_t c = ctx->state.s64[2];
+ uint64_t d = ctx->state.s64[3];
+ uint64_t e = ctx->state.s64[4];
+ uint64_t f = ctx->state.s64[5];
+ uint64_t g = ctx->state.s64[6];
+ uint64_t h = ctx->state.s64[7];
+
+ uint64_t w0, w1, w2, w3, w4, w5, w6, w7;
+ uint64_t w8, w9, w10, w11, w12, w13, w14, w15;
+ uint64_t T1, T2;
+
+#if defined(__sparc)
+ static const uint64_t sha512_consts[] = {
+ SHA512_CONST_0, SHA512_CONST_1, SHA512_CONST_2,
+ SHA512_CONST_3, SHA512_CONST_4, SHA512_CONST_5,
+ SHA512_CONST_6, SHA512_CONST_7, SHA512_CONST_8,
+ SHA512_CONST_9, SHA512_CONST_10, SHA512_CONST_11,
+ SHA512_CONST_12, SHA512_CONST_13, SHA512_CONST_14,
+ SHA512_CONST_15, SHA512_CONST_16, SHA512_CONST_17,
+ SHA512_CONST_18, SHA512_CONST_19, SHA512_CONST_20,
+ SHA512_CONST_21, SHA512_CONST_22, SHA512_CONST_23,
+ SHA512_CONST_24, SHA512_CONST_25, SHA512_CONST_26,
+ SHA512_CONST_27, SHA512_CONST_28, SHA512_CONST_29,
+ SHA512_CONST_30, SHA512_CONST_31, SHA512_CONST_32,
+ SHA512_CONST_33, SHA512_CONST_34, SHA512_CONST_35,
+ SHA512_CONST_36, SHA512_CONST_37, SHA512_CONST_38,
+ SHA512_CONST_39, SHA512_CONST_40, SHA512_CONST_41,
+ SHA512_CONST_42, SHA512_CONST_43, SHA512_CONST_44,
+ SHA512_CONST_45, SHA512_CONST_46, SHA512_CONST_47,
+ SHA512_CONST_48, SHA512_CONST_49, SHA512_CONST_50,
+ SHA512_CONST_51, SHA512_CONST_52, SHA512_CONST_53,
+ SHA512_CONST_54, SHA512_CONST_55, SHA512_CONST_56,
+ SHA512_CONST_57, SHA512_CONST_58, SHA512_CONST_59,
+ SHA512_CONST_60, SHA512_CONST_61, SHA512_CONST_62,
+ SHA512_CONST_63, SHA512_CONST_64, SHA512_CONST_65,
+ SHA512_CONST_66, SHA512_CONST_67, SHA512_CONST_68,
+ SHA512_CONST_69, SHA512_CONST_70, SHA512_CONST_71,
+ SHA512_CONST_72, SHA512_CONST_73, SHA512_CONST_74,
+ SHA512_CONST_75, SHA512_CONST_76, SHA512_CONST_77,
+ SHA512_CONST_78, SHA512_CONST_79
+ };
+#endif /* __sparc */
+
+
+ if ((uintptr_t)blk & 0x7) { /* not 8-byte aligned? */
+ bcopy(blk, ctx->buf_un.buf64, sizeof (ctx->buf_un.buf64));
+ blk = (uint8_t *)ctx->buf_un.buf64;
+ }
+
+ /* LINTED E_BAD_PTR_CAST_ALIGN */
+ w0 = LOAD_BIG_64(blk + 8 * 0);
+ SHA512ROUND(a, b, c, d, e, f, g, h, 0, w0);
+ /* LINTED E_BAD_PTR_CAST_ALIGN */
+ w1 = LOAD_BIG_64(blk + 8 * 1);
+ SHA512ROUND(h, a, b, c, d, e, f, g, 1, w1);
+ /* LINTED E_BAD_PTR_CAST_ALIGN */
+ w2 = LOAD_BIG_64(blk + 8 * 2);
+ SHA512ROUND(g, h, a, b, c, d, e, f, 2, w2);
+ /* LINTED E_BAD_PTR_CAST_ALIGN */
+ w3 = LOAD_BIG_64(blk + 8 * 3);
+ SHA512ROUND(f, g, h, a, b, c, d, e, 3, w3);
+ /* LINTED E_BAD_PTR_CAST_ALIGN */
+ w4 = LOAD_BIG_64(blk + 8 * 4);
+ SHA512ROUND(e, f, g, h, a, b, c, d, 4, w4);
+ /* LINTED E_BAD_PTR_CAST_ALIGN */
+ w5 = LOAD_BIG_64(blk + 8 * 5);
+ SHA512ROUND(d, e, f, g, h, a, b, c, 5, w5);
+ /* LINTED E_BAD_PTR_CAST_ALIGN */
+ w6 = LOAD_BIG_64(blk + 8 * 6);
+ SHA512ROUND(c, d, e, f, g, h, a, b, 6, w6);
+ /* LINTED E_BAD_PTR_CAST_ALIGN */
+ w7 = LOAD_BIG_64(blk + 8 * 7);
+ SHA512ROUND(b, c, d, e, f, g, h, a, 7, w7);
+ /* LINTED E_BAD_PTR_CAST_ALIGN */
+ w8 = LOAD_BIG_64(blk + 8 * 8);
+ SHA512ROUND(a, b, c, d, e, f, g, h, 8, w8);
+ /* LINTED E_BAD_PTR_CAST_ALIGN */
+ w9 = LOAD_BIG_64(blk + 8 * 9);
+ SHA512ROUND(h, a, b, c, d, e, f, g, 9, w9);
+ /* LINTED E_BAD_PTR_CAST_ALIGN */
+ w10 = LOAD_BIG_64(blk + 8 * 10);
+ SHA512ROUND(g, h, a, b, c, d, e, f, 10, w10);
+ /* LINTED E_BAD_PTR_CAST_ALIGN */
+ w11 = LOAD_BIG_64(blk + 8 * 11);
+ SHA512ROUND(f, g, h, a, b, c, d, e, 11, w11);
+ /* LINTED E_BAD_PTR_CAST_ALIGN */
+ w12 = LOAD_BIG_64(blk + 8 * 12);
+ SHA512ROUND(e, f, g, h, a, b, c, d, 12, w12);
+ /* LINTED E_BAD_PTR_CAST_ALIGN */
+ w13 = LOAD_BIG_64(blk + 8 * 13);
+ SHA512ROUND(d, e, f, g, h, a, b, c, 13, w13);
+ /* LINTED E_BAD_PTR_CAST_ALIGN */
+ w14 = LOAD_BIG_64(blk + 8 * 14);
+ SHA512ROUND(c, d, e, f, g, h, a, b, 14, w14);
+ /* LINTED E_BAD_PTR_CAST_ALIGN */
+ w15 = LOAD_BIG_64(blk + 8 * 15);
+ SHA512ROUND(b, c, d, e, f, g, h, a, 15, w15);
+
+ w0 = SIGMA1(w14) + w9 + SIGMA0(w1) + w0;
+ SHA512ROUND(a, b, c, d, e, f, g, h, 16, w0);
+ w1 = SIGMA1(w15) + w10 + SIGMA0(w2) + w1;
+ SHA512ROUND(h, a, b, c, d, e, f, g, 17, w1);
+ w2 = SIGMA1(w0) + w11 + SIGMA0(w3) + w2;
+ SHA512ROUND(g, h, a, b, c, d, e, f, 18, w2);
+ w3 = SIGMA1(w1) + w12 + SIGMA0(w4) + w3;
+ SHA512ROUND(f, g, h, a, b, c, d, e, 19, w3);
+ w4 = SIGMA1(w2) + w13 + SIGMA0(w5) + w4;
+ SHA512ROUND(e, f, g, h, a, b, c, d, 20, w4);
+ w5 = SIGMA1(w3) + w14 + SIGMA0(w6) + w5;
+ SHA512ROUND(d, e, f, g, h, a, b, c, 21, w5);
+ w6 = SIGMA1(w4) + w15 + SIGMA0(w7) + w6;
+ SHA512ROUND(c, d, e, f, g, h, a, b, 22, w6);
+ w7 = SIGMA1(w5) + w0 + SIGMA0(w8) + w7;
+ SHA512ROUND(b, c, d, e, f, g, h, a, 23, w7);
+ w8 = SIGMA1(w6) + w1 + SIGMA0(w9) + w8;
+ SHA512ROUND(a, b, c, d, e, f, g, h, 24, w8);
+ w9 = SIGMA1(w7) + w2 + SIGMA0(w10) + w9;
+ SHA512ROUND(h, a, b, c, d, e, f, g, 25, w9);
+ w10 = SIGMA1(w8) + w3 + SIGMA0(w11) + w10;
+ SHA512ROUND(g, h, a, b, c, d, e, f, 26, w10);
+ w11 = SIGMA1(w9) + w4 + SIGMA0(w12) + w11;
+ SHA512ROUND(f, g, h, a, b, c, d, e, 27, w11);
+ w12 = SIGMA1(w10) + w5 + SIGMA0(w13) + w12;
+ SHA512ROUND(e, f, g, h, a, b, c, d, 28, w12);
+ w13 = SIGMA1(w11) + w6 + SIGMA0(w14) + w13;
+ SHA512ROUND(d, e, f, g, h, a, b, c, 29, w13);
+ w14 = SIGMA1(w12) + w7 + SIGMA0(w15) + w14;
+ SHA512ROUND(c, d, e, f, g, h, a, b, 30, w14);
+ w15 = SIGMA1(w13) + w8 + SIGMA0(w0) + w15;
+ SHA512ROUND(b, c, d, e, f, g, h, a, 31, w15);
+
+ w0 = SIGMA1(w14) + w9 + SIGMA0(w1) + w0;
+ SHA512ROUND(a, b, c, d, e, f, g, h, 32, w0);
+ w1 = SIGMA1(w15) + w10 + SIGMA0(w2) + w1;
+ SHA512ROUND(h, a, b, c, d, e, f, g, 33, w1);
+ w2 = SIGMA1(w0) + w11 + SIGMA0(w3) + w2;
+ SHA512ROUND(g, h, a, b, c, d, e, f, 34, w2);
+ w3 = SIGMA1(w1) + w12 + SIGMA0(w4) + w3;
+ SHA512ROUND(f, g, h, a, b, c, d, e, 35, w3);
+ w4 = SIGMA1(w2) + w13 + SIGMA0(w5) + w4;
+ SHA512ROUND(e, f, g, h, a, b, c, d, 36, w4);
+ w5 = SIGMA1(w3) + w14 + SIGMA0(w6) + w5;
+ SHA512ROUND(d, e, f, g, h, a, b, c, 37, w5);
+ w6 = SIGMA1(w4) + w15 + SIGMA0(w7) + w6;
+ SHA512ROUND(c, d, e, f, g, h, a, b, 38, w6);
+ w7 = SIGMA1(w5) + w0 + SIGMA0(w8) + w7;
+ SHA512ROUND(b, c, d, e, f, g, h, a, 39, w7);
+ w8 = SIGMA1(w6) + w1 + SIGMA0(w9) + w8;
+ SHA512ROUND(a, b, c, d, e, f, g, h, 40, w8);
+ w9 = SIGMA1(w7) + w2 + SIGMA0(w10) + w9;
+ SHA512ROUND(h, a, b, c, d, e, f, g, 41, w9);
+ w10 = SIGMA1(w8) + w3 + SIGMA0(w11) + w10;
+ SHA512ROUND(g, h, a, b, c, d, e, f, 42, w10);
+ w11 = SIGMA1(w9) + w4 + SIGMA0(w12) + w11;
+ SHA512ROUND(f, g, h, a, b, c, d, e, 43, w11);
+ w12 = SIGMA1(w10) + w5 + SIGMA0(w13) + w12;
+ SHA512ROUND(e, f, g, h, a, b, c, d, 44, w12);
+ w13 = SIGMA1(w11) + w6 + SIGMA0(w14) + w13;
+ SHA512ROUND(d, e, f, g, h, a, b, c, 45, w13);
+ w14 = SIGMA1(w12) + w7 + SIGMA0(w15) + w14;
+ SHA512ROUND(c, d, e, f, g, h, a, b, 46, w14);
+ w15 = SIGMA1(w13) + w8 + SIGMA0(w0) + w15;
+ SHA512ROUND(b, c, d, e, f, g, h, a, 47, w15);
+
+ w0 = SIGMA1(w14) + w9 + SIGMA0(w1) + w0;
+ SHA512ROUND(a, b, c, d, e, f, g, h, 48, w0);
+ w1 = SIGMA1(w15) + w10 + SIGMA0(w2) + w1;
+ SHA512ROUND(h, a, b, c, d, e, f, g, 49, w1);
+ w2 = SIGMA1(w0) + w11 + SIGMA0(w3) + w2;
+ SHA512ROUND(g, h, a, b, c, d, e, f, 50, w2);
+ w3 = SIGMA1(w1) + w12 + SIGMA0(w4) + w3;
+ SHA512ROUND(f, g, h, a, b, c, d, e, 51, w3);
+ w4 = SIGMA1(w2) + w13 + SIGMA0(w5) + w4;
+ SHA512ROUND(e, f, g, h, a, b, c, d, 52, w4);
+ w5 = SIGMA1(w3) + w14 + SIGMA0(w6) + w5;
+ SHA512ROUND(d, e, f, g, h, a, b, c, 53, w5);
+ w6 = SIGMA1(w4) + w15 + SIGMA0(w7) + w6;
+ SHA512ROUND(c, d, e, f, g, h, a, b, 54, w6);
+ w7 = SIGMA1(w5) + w0 + SIGMA0(w8) + w7;
+ SHA512ROUND(b, c, d, e, f, g, h, a, 55, w7);
+ w8 = SIGMA1(w6) + w1 + SIGMA0(w9) + w8;
+ SHA512ROUND(a, b, c, d, e, f, g, h, 56, w8);
+ w9 = SIGMA1(w7) + w2 + SIGMA0(w10) + w9;
+ SHA512ROUND(h, a, b, c, d, e, f, g, 57, w9);
+ w10 = SIGMA1(w8) + w3 + SIGMA0(w11) + w10;
+ SHA512ROUND(g, h, a, b, c, d, e, f, 58, w10);
+ w11 = SIGMA1(w9) + w4 + SIGMA0(w12) + w11;
+ SHA512ROUND(f, g, h, a, b, c, d, e, 59, w11);
+ w12 = SIGMA1(w10) + w5 + SIGMA0(w13) + w12;
+ SHA512ROUND(e, f, g, h, a, b, c, d, 60, w12);
+ w13 = SIGMA1(w11) + w6 + SIGMA0(w14) + w13;
+ SHA512ROUND(d, e, f, g, h, a, b, c, 61, w13);
+ w14 = SIGMA1(w12) + w7 + SIGMA0(w15) + w14;
+ SHA512ROUND(c, d, e, f, g, h, a, b, 62, w14);
+ w15 = SIGMA1(w13) + w8 + SIGMA0(w0) + w15;
+ SHA512ROUND(b, c, d, e, f, g, h, a, 63, w15);
+
+ w0 = SIGMA1(w14) + w9 + SIGMA0(w1) + w0;
+ SHA512ROUND(a, b, c, d, e, f, g, h, 64, w0);
+ w1 = SIGMA1(w15) + w10 + SIGMA0(w2) + w1;
+ SHA512ROUND(h, a, b, c, d, e, f, g, 65, w1);
+ w2 = SIGMA1(w0) + w11 + SIGMA0(w3) + w2;
+ SHA512ROUND(g, h, a, b, c, d, e, f, 66, w2);
+ w3 = SIGMA1(w1) + w12 + SIGMA0(w4) + w3;
+ SHA512ROUND(f, g, h, a, b, c, d, e, 67, w3);
+ w4 = SIGMA1(w2) + w13 + SIGMA0(w5) + w4;
+ SHA512ROUND(e, f, g, h, a, b, c, d, 68, w4);
+ w5 = SIGMA1(w3) + w14 + SIGMA0(w6) + w5;
+ SHA512ROUND(d, e, f, g, h, a, b, c, 69, w5);
+ w6 = SIGMA1(w4) + w15 + SIGMA0(w7) + w6;
+ SHA512ROUND(c, d, e, f, g, h, a, b, 70, w6);
+ w7 = SIGMA1(w5) + w0 + SIGMA0(w8) + w7;
+ SHA512ROUND(b, c, d, e, f, g, h, a, 71, w7);
+ w8 = SIGMA1(w6) + w1 + SIGMA0(w9) + w8;
+ SHA512ROUND(a, b, c, d, e, f, g, h, 72, w8);
+ w9 = SIGMA1(w7) + w2 + SIGMA0(w10) + w9;
+ SHA512ROUND(h, a, b, c, d, e, f, g, 73, w9);
+ w10 = SIGMA1(w8) + w3 + SIGMA0(w11) + w10;
+ SHA512ROUND(g, h, a, b, c, d, e, f, 74, w10);
+ w11 = SIGMA1(w9) + w4 + SIGMA0(w12) + w11;
+ SHA512ROUND(f, g, h, a, b, c, d, e, 75, w11);
+ w12 = SIGMA1(w10) + w5 + SIGMA0(w13) + w12;
+ SHA512ROUND(e, f, g, h, a, b, c, d, 76, w12);
+ w13 = SIGMA1(w11) + w6 + SIGMA0(w14) + w13;
+ SHA512ROUND(d, e, f, g, h, a, b, c, 77, w13);
+ w14 = SIGMA1(w12) + w7 + SIGMA0(w15) + w14;
+ SHA512ROUND(c, d, e, f, g, h, a, b, 78, w14);
+ w15 = SIGMA1(w13) + w8 + SIGMA0(w0) + w15;
+ SHA512ROUND(b, c, d, e, f, g, h, a, 79, w15);
+
+ ctx->state.s64[0] += a;
+ ctx->state.s64[1] += b;
+ ctx->state.s64[2] += c;
+ ctx->state.s64[3] += d;
+ ctx->state.s64[4] += e;
+ ctx->state.s64[5] += f;
+ ctx->state.s64[6] += g;
+ ctx->state.s64[7] += h;
+
+}
+#endif /* !__amd64 || !_KERNEL */
+
+
+/*
+ * Encode()
+ *
+ * purpose: to convert a list of numbers from host byte order to big endian
+ * input: uint8_t * : place to store the converted big endian numbers
+ * uint32_t * : place to get numbers to convert from
+ * size_t : the length of the input in bytes
+ * output: void
+ */
+
+static void
+Encode(uint8_t *_RESTRICT_KYWD output, uint32_t *_RESTRICT_KYWD input,
+ size_t len)
+{
+ size_t i, j;
+
+#if defined(__sparc)
+ if (IS_P2ALIGNED(output, sizeof (uint32_t))) {
+ for (i = 0, j = 0; j < len; i++, j += 4) {
+ /* LINTED E_BAD_PTR_CAST_ALIGN */
+ *((uint32_t *)(output + j)) = input[i];
+ }
+ } else {
+#endif /* little endian -- will work on big endian, but slowly */
+ for (i = 0, j = 0; j < len; i++, j += 4) {
+ output[j] = (input[i] >> 24) & 0xff;
+ output[j + 1] = (input[i] >> 16) & 0xff;
+ output[j + 2] = (input[i] >> 8) & 0xff;
+ output[j + 3] = input[i] & 0xff;
+ }
+#if defined(__sparc)
+ }
+#endif
+}
+
+static void
+Encode64(uint8_t *_RESTRICT_KYWD output, uint64_t *_RESTRICT_KYWD input,
+ size_t len)
+{
+ size_t i, j;
+
+#if defined(__sparc)
+ if (IS_P2ALIGNED(output, sizeof (uint64_t))) {
+ for (i = 0, j = 0; j < len; i++, j += 8) {
+ /* LINTED E_BAD_PTR_CAST_ALIGN */
+ *((uint64_t *)(output + j)) = input[i];
+ }
+ } else {
+#endif /* little endian -- will work on big endian, but slowly */
+ for (i = 0, j = 0; j < len; i++, j += 8) {
+
+ output[j] = (input[i] >> 56) & 0xff;
+ output[j + 1] = (input[i] >> 48) & 0xff;
+ output[j + 2] = (input[i] >> 40) & 0xff;
+ output[j + 3] = (input[i] >> 32) & 0xff;
+ output[j + 4] = (input[i] >> 24) & 0xff;
+ output[j + 5] = (input[i] >> 16) & 0xff;
+ output[j + 6] = (input[i] >> 8) & 0xff;
+ output[j + 7] = input[i] & 0xff;
+ }
+#if defined(__sparc)
+ }
+#endif
+}
+
+
+void
+SHA2Init(uint64_t mech, SHA2_CTX *ctx)
+{
+
+ switch (mech) {
+ case SHA256_MECH_INFO_TYPE:
+ case SHA256_HMAC_MECH_INFO_TYPE:
+ case SHA256_HMAC_GEN_MECH_INFO_TYPE:
+ ctx->state.s32[0] = 0x6a09e667U;
+ ctx->state.s32[1] = 0xbb67ae85U;
+ ctx->state.s32[2] = 0x3c6ef372U;
+ ctx->state.s32[3] = 0xa54ff53aU;
+ ctx->state.s32[4] = 0x510e527fU;
+ ctx->state.s32[5] = 0x9b05688cU;
+ ctx->state.s32[6] = 0x1f83d9abU;
+ ctx->state.s32[7] = 0x5be0cd19U;
+ break;
+ case SHA384_MECH_INFO_TYPE:
+ case SHA384_HMAC_MECH_INFO_TYPE:
+ case SHA384_HMAC_GEN_MECH_INFO_TYPE:
+ ctx->state.s64[0] = 0xcbbb9d5dc1059ed8ULL;
+ ctx->state.s64[1] = 0x629a292a367cd507ULL;
+ ctx->state.s64[2] = 0x9159015a3070dd17ULL;
+ ctx->state.s64[3] = 0x152fecd8f70e5939ULL;
+ ctx->state.s64[4] = 0x67332667ffc00b31ULL;
+ ctx->state.s64[5] = 0x8eb44a8768581511ULL;
+ ctx->state.s64[6] = 0xdb0c2e0d64f98fa7ULL;
+ ctx->state.s64[7] = 0x47b5481dbefa4fa4ULL;
+ break;
+ case SHA512_MECH_INFO_TYPE:
+ case SHA512_HMAC_MECH_INFO_TYPE:
+ case SHA512_HMAC_GEN_MECH_INFO_TYPE:
+ ctx->state.s64[0] = 0x6a09e667f3bcc908ULL;
+ ctx->state.s64[1] = 0xbb67ae8584caa73bULL;
+ ctx->state.s64[2] = 0x3c6ef372fe94f82bULL;
+ ctx->state.s64[3] = 0xa54ff53a5f1d36f1ULL;
+ ctx->state.s64[4] = 0x510e527fade682d1ULL;
+ ctx->state.s64[5] = 0x9b05688c2b3e6c1fULL;
+ ctx->state.s64[6] = 0x1f83d9abfb41bd6bULL;
+ ctx->state.s64[7] = 0x5be0cd19137e2179ULL;
+ break;
+ case SHA512_224_MECH_INFO_TYPE:
+ ctx->state.s64[0] = 0x8C3D37C819544DA2ULL;
+ ctx->state.s64[1] = 0x73E1996689DCD4D6ULL;
+ ctx->state.s64[2] = 0x1DFAB7AE32FF9C82ULL;
+ ctx->state.s64[3] = 0x679DD514582F9FCFULL;
+ ctx->state.s64[4] = 0x0F6D2B697BD44DA8ULL;
+ ctx->state.s64[5] = 0x77E36F7304C48942ULL;
+ ctx->state.s64[6] = 0x3F9D85A86A1D36C8ULL;
+ ctx->state.s64[7] = 0x1112E6AD91D692A1ULL;
+ break;
+ case SHA512_256_MECH_INFO_TYPE:
+ ctx->state.s64[0] = 0x22312194FC2BF72CULL;
+ ctx->state.s64[1] = 0x9F555FA3C84C64C2ULL;
+ ctx->state.s64[2] = 0x2393B86B6F53B151ULL;
+ ctx->state.s64[3] = 0x963877195940EABDULL;
+ ctx->state.s64[4] = 0x96283EE2A88EFFE3ULL;
+ ctx->state.s64[5] = 0xBE5E1E2553863992ULL;
+ ctx->state.s64[6] = 0x2B0199FC2C85B8AAULL;
+ ctx->state.s64[7] = 0x0EB72DDC81C52CA2ULL;
+ break;
+#ifdef _KERNEL
+ default:
+ cmn_err(CE_PANIC,
+ "sha2_init: failed to find a supported algorithm: 0x%x",
+ (uint32_t)mech);
+
+#endif /* _KERNEL */
+ }
+
+ ctx->algotype = (uint32_t)mech;
+ ctx->count.c64[0] = ctx->count.c64[1] = 0;
+}
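
Together with SHA2Update() and SHA2Final() further down, this gives the usual three-call digest interface. A minimal userland-style sketch follows; it assumes the SHA256 mechanism constant and SHA2_CTX type from sys/sha2.h (the same constant the SHA256Init() wrapper below passes), and the expected bytes are the leading bytes of the FIPS 180-4 "abc" test vector.

#include <assert.h>
#include <stdint.h>
#include <string.h>
#include <sys/sha2.h>

int
main(void)
{
        SHA2_CTX ctx;
        uint8_t digest[32];     /* SHA-256 produces 32 bytes */
        const uint8_t expected[4] = { 0xba, 0x78, 0x16, 0xbf };

        SHA2Init(SHA256, &ctx);
        SHA2Update(&ctx, "abc", 3);
        SHA2Final(digest, &ctx);
        assert(memcmp(digest, expected, sizeof (expected)) == 0);
        return (0);
}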
+
+#ifndef _KERNEL
+
+// #pragma inline(SHA256Init, SHA384Init, SHA512Init)
+void
+SHA256Init(SHA256_CTX *ctx)
+{
+ SHA2Init(SHA256, ctx);
+}
+
+void
+SHA384Init(SHA384_CTX *ctx)
+{
+ SHA2Init(SHA384, ctx);
+}
+
+void
+SHA512Init(SHA512_CTX *ctx)
+{
+ SHA2Init(SHA512, ctx);
+}
+
+#endif /* !_KERNEL */
+
+/*
+ * SHA2Update()
+ *
+ * purpose: continues an sha2 digest operation, using the message block
+ * to update the context.
+ * input: SHA2_CTX * : the context to update
+ * void * : the message block
+ * size_t : the length of the message block, in bytes
+ * output: void
+ */
+
+void
+SHA2Update(SHA2_CTX *ctx, const void *inptr, size_t input_len)
+{
+ uint32_t i, buf_index, buf_len, buf_limit;
+ const uint8_t *input = inptr;
+ uint32_t algotype = ctx->algotype;
+
+ /* check for noop */
+ if (input_len == 0)
+ return;
+
+ if (algotype <= SHA256_HMAC_GEN_MECH_INFO_TYPE) {
+ buf_limit = 64;
+
+ /* compute number of bytes mod 64 */
+ buf_index = (ctx->count.c32[1] >> 3) & 0x3F;
+
+ /* update number of bits */
+ if ((ctx->count.c32[1] += (input_len << 3)) < (input_len << 3))
+ ctx->count.c32[0]++;
+
+ ctx->count.c32[0] += (input_len >> 29);
+
+ } else {
+ buf_limit = 128;
+
+ /* compute number of bytes mod 128 */
+ buf_index = (ctx->count.c64[1] >> 3) & 0x7F;
+
+ /* update number of bits */
+ if ((ctx->count.c64[1] += (input_len << 3)) < (input_len << 3))
+ ctx->count.c64[0]++;
+
+ ctx->count.c64[0] += (input_len >> 29);
+ }
+
+ buf_len = buf_limit - buf_index;
+
+ /* transform as many times as possible */
+ i = 0;
+ if (input_len >= buf_len) {
+
+ /*
+ * general optimization:
+ *
+ * only do initial bcopy() and SHA2Transform() if
+ * buf_index != 0. if buf_index == 0, we're just
+ * wasting our time doing the bcopy() since there
+ * wasn't any data left over from a previous call to
+ * SHA2Update().
+ */
+ if (buf_index) {
+ bcopy(input, &ctx->buf_un.buf8[buf_index], buf_len);
+ if (algotype <= SHA256_HMAC_GEN_MECH_INFO_TYPE)
+ SHA256Transform(ctx, ctx->buf_un.buf8);
+ else
+ SHA512Transform(ctx, ctx->buf_un.buf8);
+
+ i = buf_len;
+ }
+
+#if !defined(__amd64) || !defined(_KERNEL)
+ if (algotype <= SHA256_HMAC_GEN_MECH_INFO_TYPE) {
+ for (; i + buf_limit - 1 < input_len; i += buf_limit) {
+ SHA256Transform(ctx, &input[i]);
+ }
+ } else {
+ for (; i + buf_limit - 1 < input_len; i += buf_limit) {
+ SHA512Transform(ctx, &input[i]);
+ }
+ }
+
+#else
+ uint32_t block_count;
+ if (algotype <= SHA256_HMAC_GEN_MECH_INFO_TYPE) {
+ block_count = (input_len - i) >> 6;
+ if (block_count > 0) {
+ SHA256TransformBlocks(ctx, &input[i],
+ block_count);
+ i += block_count << 6;
+ }
+ } else {
+ block_count = (input_len - i) >> 7;
+ if (block_count > 0) {
+ SHA512TransformBlocks(ctx, &input[i],
+ block_count);
+ i += block_count << 7;
+ }
+ }
+#endif /* !__amd64 || !_KERNEL */
+
+ /*
+ * general optimization:
+ *
+ * if i and input_len are the same, return now instead
+ * of calling bcopy(), since the bcopy() in this case
+ * will be an expensive noop.
+ */
+
+ if (input_len == i)
+ return;
+
+ buf_index = 0;
+ }
+
+ /* buffer remaining input */
+ bcopy(&input[i], &ctx->buf_un.buf8[buf_index], input_len - i);
+}
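
The buffering above is what makes chunked updates equivalent to a single call over the whole message; a small sketch (same header and constant assumptions as the earlier example) that exercises the property:

#include <assert.h>
#include <stdint.h>
#include <string.h>
#include <sys/sha2.h>

int
main(void)
{
        const char msg[] = "The quick brown fox jumps over the lazy dog";
        SHA2_CTX one, split;
        uint8_t d1[32], d2[32];

        SHA2Init(SHA256, &one);
        SHA2Update(&one, msg, strlen(msg));
        SHA2Final(d1, &one);

        SHA2Init(SHA256, &split);
        SHA2Update(&split, msg, 10);                    /* first chunk */
        SHA2Update(&split, msg + 10, strlen(msg) - 10); /* remainder */
        SHA2Final(d2, &split);

        assert(memcmp(d1, d2, sizeof (d1)) == 0);
        return (0);
}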
+
+
+/*
+ * SHA2Final()
+ *
+ * purpose: ends an sha2 digest operation, finalizing the message digest and
+ * zeroing the context.
+ * input: uchar_t * : a buffer to store the digest
+ * : The function actually uses void* because many
+ * : callers pass things other than uchar_t here.
+ * SHA2_CTX * : the context to finalize, save, and zero
+ * output: void
+ */
+
+void
+SHA2Final(void *digest, SHA2_CTX *ctx)
+{
+ uint8_t bitcount_be[sizeof (ctx->count.c32)];
+ uint8_t bitcount_be64[sizeof (ctx->count.c64)];
+ uint32_t index;
+ uint32_t algotype = ctx->algotype;
+
+ if (algotype <= SHA256_HMAC_GEN_MECH_INFO_TYPE) {
+ index = (ctx->count.c32[1] >> 3) & 0x3f;
+ Encode(bitcount_be, ctx->count.c32, sizeof (bitcount_be));
+ SHA2Update(ctx, PADDING, ((index < 56) ? 56 : 120) - index);
+ SHA2Update(ctx, bitcount_be, sizeof (bitcount_be));
+ Encode(digest, ctx->state.s32, sizeof (ctx->state.s32));
+ } else {
+ index = (ctx->count.c64[1] >> 3) & 0x7f;
+ Encode64(bitcount_be64, ctx->count.c64,
+ sizeof (bitcount_be64));
+ SHA2Update(ctx, PADDING, ((index < 112) ? 112 : 240) - index);
+ SHA2Update(ctx, bitcount_be64, sizeof (bitcount_be64));
+ if (algotype <= SHA384_HMAC_GEN_MECH_INFO_TYPE) {
+ ctx->state.s64[6] = ctx->state.s64[7] = 0;
+ Encode64(digest, ctx->state.s64,
+ sizeof (uint64_t) * 6);
+ } else if (algotype == SHA512_224_MECH_INFO_TYPE) {
+ uint8_t last[sizeof (uint64_t)];
+ /*
+ * Since SHA-512/224 doesn't align well to 64-bit
+ * boundaries, we must do the encoding in three steps:
+ * 1) encode the three 64-bit words that fit neatly
+ * 2) encode the last 64-bit word to a temp buffer
+ * 3) chop out the lower 32-bits from the temp buffer
+ * and append them to the digest
+ */
+ Encode64(digest, ctx->state.s64, sizeof (uint64_t) * 3);
+ Encode64(last, &ctx->state.s64[3], sizeof (uint64_t));
+ bcopy(last, (uint8_t *)digest + 24, 4);
+ } else if (algotype == SHA512_256_MECH_INFO_TYPE) {
+ Encode64(digest, ctx->state.s64, sizeof (uint64_t) * 4);
+ } else {
+ Encode64(digest, ctx->state.s64,
+ sizeof (ctx->state.s64));
+ }
+ }
+
+ /* zeroize sensitive information */
+ bzero(ctx, sizeof (*ctx));
+}
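
The padding lengths chosen above always leave exactly enough room for the encoded bit count and finish on a block boundary; a short standalone check of that arithmetic for both block sizes:

#include <assert.h>
#include <stdint.h>

int
main(void)
{
        uint32_t index, pad;

        for (index = 0; index < 64; index++) {
                pad = ((index < 56) ? 56 : 120) - index;
                assert((index + pad + 8) % 64 == 0);    /* SHA-256 block */
        }
        for (index = 0; index < 128; index++) {
                pad = ((index < 112) ? 112 : 240) - index;
                assert((index + pad + 16) % 128 == 0);  /* SHA-512 block */
        }
        return (0);
}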
+
+#ifdef _KERNEL
+EXPORT_SYMBOL(SHA2Init);
+EXPORT_SYMBOL(SHA2Update);
+EXPORT_SYMBOL(SHA2Final);
+#endif
diff --git a/sys/contrib/openzfs/module/icp/algs/skein/THIRDPARTYLICENSE b/sys/contrib/openzfs/module/icp/algs/skein/THIRDPARTYLICENSE
new file mode 100644
index 000000000000..b7434fd17872
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/algs/skein/THIRDPARTYLICENSE
@@ -0,0 +1,3 @@
+Implementation of the Skein hash function.
+Source code author: Doug Whiting, 2008.
+This algorithm and source code is released to the public domain.
diff --git a/sys/contrib/openzfs/module/icp/algs/skein/THIRDPARTYLICENSE.descrip b/sys/contrib/openzfs/module/icp/algs/skein/THIRDPARTYLICENSE.descrip
new file mode 100644
index 000000000000..0ae89cfdf5ce
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/algs/skein/THIRDPARTYLICENSE.descrip
@@ -0,0 +1 @@
+LICENSE TERMS OF SKEIN HASH ALGORITHM IMPLEMENTATION
diff --git a/sys/contrib/openzfs/module/icp/algs/skein/skein.c b/sys/contrib/openzfs/module/icp/algs/skein/skein.c
new file mode 100644
index 000000000000..83fe84260307
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/algs/skein/skein.c
@@ -0,0 +1,911 @@
+/*
+ * Implementation of the Skein hash function.
+ * Source code author: Doug Whiting, 2008.
+ * This algorithm and source code is released to the public domain.
+ */
+/* Copyright 2013 Doug Whiting. This code is released to the public domain. */
+
+#include <sys/sysmacros.h>
+#include <sys/types.h>
+#include <sys/skein.h> /* get the Skein API definitions */
+#include "skein_impl.h" /* get internal definitions */
+
+/* 256-bit Skein */
+/* init the context for a straight hashing operation */
+int
+Skein_256_Init(Skein_256_Ctxt_t *ctx, size_t hashBitLen)
+{
+ union {
+ uint8_t b[SKEIN_256_STATE_BYTES];
+ uint64_t w[SKEIN_256_STATE_WORDS];
+ } cfg; /* config block */
+
+ Skein_Assert(hashBitLen > 0, SKEIN_BAD_HASHLEN);
+ ctx->h.hashBitLen = hashBitLen; /* output hash bit count */
+
+ switch (hashBitLen) { /* use pre-computed values, where available */
+#ifndef SKEIN_NO_PRECOMP
+ case 256:
+ bcopy(SKEIN_256_IV_256, ctx->X, sizeof (ctx->X));
+ break;
+ case 224:
+ bcopy(SKEIN_256_IV_224, ctx->X, sizeof (ctx->X));
+ break;
+ case 160:
+ bcopy(SKEIN_256_IV_160, ctx->X, sizeof (ctx->X));
+ break;
+ case 128:
+ bcopy(SKEIN_256_IV_128, ctx->X, sizeof (ctx->X));
+ break;
+#endif
+ default:
+ /* here if there is no precomputed IV value available */
+ /*
+ * build/process the config block, type == CONFIG (could be
+ * precomputed)
+ */
+ /* set tweaks: T0=0; T1=CFG | FINAL */
+ Skein_Start_New_Type(ctx, CFG_FINAL);
+
+ /* set the schema, version */
+ cfg.w[0] = Skein_Swap64(SKEIN_SCHEMA_VER);
+ /* hash result length in bits */
+ cfg.w[1] = Skein_Swap64(hashBitLen);
+ cfg.w[2] = Skein_Swap64(SKEIN_CFG_TREE_INFO_SEQUENTIAL);
+ /* zero pad config block */
+ bzero(&cfg.w[3], sizeof (cfg) - 3 * sizeof (cfg.w[0]));
+
+ /* compute the initial chaining values from config block */
+ /* zero the chaining variables */
+ bzero(ctx->X, sizeof (ctx->X));
+ Skein_256_Process_Block(ctx, cfg.b, 1, SKEIN_CFG_STR_LEN);
+ break;
+ }
+ /*
+ * The chaining vars ctx->X are now initialized for the given
+ * hashBitLen.
+ * Set up to process the data message portion of the hash (default)
+ */
+ Skein_Start_New_Type(ctx, MSG); /* T0=0, T1= MSG type */
+
+ return (SKEIN_SUCCESS);
+}
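
A minimal usage sketch for the straight-hash path, assuming the Skein_256_* entry points and types are visible via sys/skein.h (as included at the top of this file); the output length in bytes is hashBitLen / 8, here 32.

#include <stdint.h>
#include <sys/skein.h>

int
main(void)
{
        Skein_256_Ctxt_t ctx;
        uint8_t digest[32];     /* 256-bit output */

        (void) Skein_256_Init(&ctx, 256);
        (void) Skein_256_Update(&ctx, (const uint8_t *)"abc", 3);
        (void) Skein_256_Final(&ctx, digest);
        return (0);
}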
+
+/* init the context for a MAC and/or tree hash operation */
+/*
+ * [identical to Skein_256_Init() when keyBytes == 0 &&
+ * treeInfo == SKEIN_CFG_TREE_INFO_SEQUENTIAL]
+ */
+int
+Skein_256_InitExt(Skein_256_Ctxt_t *ctx, size_t hashBitLen, uint64_t treeInfo,
+ const uint8_t *key, size_t keyBytes)
+{
+ union {
+ uint8_t b[SKEIN_256_STATE_BYTES];
+ uint64_t w[SKEIN_256_STATE_WORDS];
+ } cfg; /* config block */
+
+ Skein_Assert(hashBitLen > 0, SKEIN_BAD_HASHLEN);
+ Skein_Assert(keyBytes == 0 || key != NULL, SKEIN_FAIL);
+
+ /* compute the initial chaining values ctx->X[], based on key */
+ if (keyBytes == 0) { /* is there a key? */
+ /* no key: use all zeroes as key for config block */
+ bzero(ctx->X, sizeof (ctx->X));
+ } else { /* here to pre-process a key */
+
+ Skein_assert(sizeof (cfg.b) >= sizeof (ctx->X));
+ /* do a mini-Init right here */
+ /* set output hash bit count = state size */
+ ctx->h.hashBitLen = 8 * sizeof (ctx->X);
+ /* set tweaks: T0 = 0; T1 = KEY type */
+ Skein_Start_New_Type(ctx, KEY);
+ /* zero the initial chaining variables */
+ bzero(ctx->X, sizeof (ctx->X));
+ /* hash the key */
+ (void) Skein_256_Update(ctx, key, keyBytes);
+ /* put result into cfg.b[] */
+ (void) Skein_256_Final_Pad(ctx, cfg.b);
+ /* copy over into ctx->X[] */
+ bcopy(cfg.b, ctx->X, sizeof (cfg.b));
+#if SKEIN_NEED_SWAP
+ {
+ uint_t i;
+ /* convert key bytes to context words */
+ for (i = 0; i < SKEIN_256_STATE_WORDS; i++)
+ ctx->X[i] = Skein_Swap64(ctx->X[i]);
+ }
+#endif
+ }
+ /*
+ * build/process the config block, type == CONFIG (could be
+ * precomputed for each key)
+ */
+ ctx->h.hashBitLen = hashBitLen; /* output hash bit count */
+ Skein_Start_New_Type(ctx, CFG_FINAL);
+
+ bzero(&cfg.w, sizeof (cfg.w)); /* pre-pad cfg.w[] with zeroes */
+ cfg.w[0] = Skein_Swap64(SKEIN_SCHEMA_VER);
+ cfg.w[1] = Skein_Swap64(hashBitLen); /* hash result length in bits */
+ /* tree hash config info (or SKEIN_CFG_TREE_INFO_SEQUENTIAL) */
+ cfg.w[2] = Skein_Swap64(treeInfo);
+
+ Skein_Show_Key(256, &ctx->h, key, keyBytes);
+
+ /* compute the initial chaining values from config block */
+ Skein_256_Process_Block(ctx, cfg.b, 1, SKEIN_CFG_STR_LEN);
+
+ /* The chaining vars ctx->X are now initialized */
+ /* Set up to process the data message portion of the hash (default) */
+ ctx->h.bCnt = 0; /* buffer b[] starts out empty */
+ Skein_Start_New_Type(ctx, MSG);
+
+ return (SKEIN_SUCCESS);
+}
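
The keyed path is driven the same way, with the key folded into the chaining state by the pre-processing step above. A sketch under the assumption that SKEIN_CFG_TREE_INFO_SEQUENTIAL is exported by the Skein headers (it is referenced in the comments above); the all-zero key is a placeholder for illustration only.

#include <stdint.h>
#include <sys/skein.h>

int
main(void)
{
        const uint8_t key[16] = { 0 };  /* placeholder key, illustration only */
        Skein_256_Ctxt_t ctx;
        uint8_t mac[32];

        /* keyed (MAC-style) init: sequential tree info, 256-bit output */
        (void) Skein_256_InitExt(&ctx, 256, SKEIN_CFG_TREE_INFO_SEQUENTIAL,
            key, sizeof (key));
        (void) Skein_256_Update(&ctx, (const uint8_t *)"msg", 3);
        (void) Skein_256_Final(&ctx, mac);
        return (0);
}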
+
+/* process the input bytes */
+int
+Skein_256_Update(Skein_256_Ctxt_t *ctx, const uint8_t *msg, size_t msgByteCnt)
+{
+ size_t n;
+
+ /* catch uninitialized context */
+ Skein_Assert(ctx->h.bCnt <= SKEIN_256_BLOCK_BYTES, SKEIN_FAIL);
+
+ /* process full blocks, if any */
+ if (msgByteCnt + ctx->h.bCnt > SKEIN_256_BLOCK_BYTES) {
+ /* finish up any buffered message data */
+ if (ctx->h.bCnt) {
+ /* # bytes free in buffer b[] */
+ n = SKEIN_256_BLOCK_BYTES - ctx->h.bCnt;
+ if (n) {
+ /* check on our logic here */
+ Skein_assert(n < msgByteCnt);
+ bcopy(msg, &ctx->b[ctx->h.bCnt], n);
+ msgByteCnt -= n;
+ msg += n;
+ ctx->h.bCnt += n;
+ }
+ Skein_assert(ctx->h.bCnt == SKEIN_256_BLOCK_BYTES);
+ Skein_256_Process_Block(ctx, ctx->b, 1,
+ SKEIN_256_BLOCK_BYTES);
+ ctx->h.bCnt = 0;
+ }
+ /*
+ * now process any remaining full blocks, directly from input
+ * message data
+ */
+ if (msgByteCnt > SKEIN_256_BLOCK_BYTES) {
+ /* number of full blocks to process */
+ n = (msgByteCnt - 1) / SKEIN_256_BLOCK_BYTES;
+ Skein_256_Process_Block(ctx, msg, n,
+ SKEIN_256_BLOCK_BYTES);
+ msgByteCnt -= n * SKEIN_256_BLOCK_BYTES;
+ msg += n * SKEIN_256_BLOCK_BYTES;
+ }
+ Skein_assert(ctx->h.bCnt == 0);
+ }
+
+ /* copy any remaining source message data bytes into b[] */
+ if (msgByteCnt) {
+ Skein_assert(msgByteCnt + ctx->h.bCnt <= SKEIN_256_BLOCK_BYTES);
+ bcopy(msg, &ctx->b[ctx->h.bCnt], msgByteCnt);
+ ctx->h.bCnt += msgByteCnt;
+ }
+
+ return (SKEIN_SUCCESS);
+}
+
+/* finalize the hash computation and output the result */
+int
+Skein_256_Final(Skein_256_Ctxt_t *ctx, uint8_t *hashVal)
+{
+ size_t i, n, byteCnt;
+ uint64_t X[SKEIN_256_STATE_WORDS];
+
+ /* catch uninitialized context */
+ Skein_Assert(ctx->h.bCnt <= SKEIN_256_BLOCK_BYTES, SKEIN_FAIL);
+
+ ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL; /* tag as the final block */
+ /* zero pad b[] if necessary */
+ if (ctx->h.bCnt < SKEIN_256_BLOCK_BYTES)
+ bzero(&ctx->b[ctx->h.bCnt],
+ SKEIN_256_BLOCK_BYTES - ctx->h.bCnt);
+
+ /* process the final block */
+ Skein_256_Process_Block(ctx, ctx->b, 1, ctx->h.bCnt);
+
+ /* now output the result */
+ /* total number of output bytes */
+ byteCnt = (ctx->h.hashBitLen + 7) >> 3;
+
+ /* run Threefish in "counter mode" to generate output */
+ /* zero out b[], so it can hold the counter */
+ bzero(ctx->b, sizeof (ctx->b));
+ /* keep a local copy of counter mode "key" */
+ bcopy(ctx->X, X, sizeof (X));
+ for (i = 0; i * SKEIN_256_BLOCK_BYTES < byteCnt; i++) {
+ /* build the counter block */
+ uint64_t tmp = Skein_Swap64((uint64_t)i);
+ bcopy(&tmp, ctx->b, sizeof (tmp));
+ Skein_Start_New_Type(ctx, OUT_FINAL);
+ /* run "counter mode" */
+ Skein_256_Process_Block(ctx, ctx->b, 1, sizeof (uint64_t));
+ /* number of output bytes left to go */
+ n = byteCnt - i * SKEIN_256_BLOCK_BYTES;
+ if (n >= SKEIN_256_BLOCK_BYTES)
+ n = SKEIN_256_BLOCK_BYTES;
+ Skein_Put64_LSB_First(hashVal + i * SKEIN_256_BLOCK_BYTES,
+ ctx->X, n); /* "output" the ctr mode bytes */
+ Skein_Show_Final(256, &ctx->h, n,
+ hashVal + i * SKEIN_256_BLOCK_BYTES);
+ /* restore the counter mode key for next time */
+ bcopy(X, ctx->X, sizeof (X));
+ }
+ return (SKEIN_SUCCESS);
+}
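
The output loop above runs Threefish in counter mode, producing one 32-byte state per counter block and truncating the last block. A standalone check of the block/byte accounting for an odd output size (1000 bits), mirroring the loop bounds used above:

#include <assert.h>
#include <stddef.h>

int
main(void)
{
        const size_t block = 32;                /* SKEIN_256_BLOCK_BYTES */
        size_t hashBitLen = 1000;
        size_t byteCnt = (hashBitLen + 7) >> 3; /* 125 */
        size_t i, n, blocks = 0, produced = 0;

        for (i = 0; i * block < byteCnt; i++) {
                n = byteCnt - i * block;
                if (n >= block)
                        n = block;
                produced += n;
                blocks++;
        }
        assert(blocks == 4 && produced == byteCnt);
        return (0);
}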
+
+/* 512-bit Skein */
+
+/* init the context for a straight hashing operation */
+int
+Skein_512_Init(Skein_512_Ctxt_t *ctx, size_t hashBitLen)
+{
+ union {
+ uint8_t b[SKEIN_512_STATE_BYTES];
+ uint64_t w[SKEIN_512_STATE_WORDS];
+ } cfg; /* config block */
+
+ Skein_Assert(hashBitLen > 0, SKEIN_BAD_HASHLEN);
+ ctx->h.hashBitLen = hashBitLen; /* output hash bit count */
+
+ switch (hashBitLen) { /* use pre-computed values, where available */
+#ifndef SKEIN_NO_PRECOMP
+ case 512:
+ bcopy(SKEIN_512_IV_512, ctx->X, sizeof (ctx->X));
+ break;
+ case 384:
+ bcopy(SKEIN_512_IV_384, ctx->X, sizeof (ctx->X));
+ break;
+ case 256:
+ bcopy(SKEIN_512_IV_256, ctx->X, sizeof (ctx->X));
+ break;
+ case 224:
+ bcopy(SKEIN_512_IV_224, ctx->X, sizeof (ctx->X));
+ break;
+#endif
+ default:
+ /*
+ * here if there is no precomputed IV value available
+ * build/process the config block, type == CONFIG (could be
+ * precomputed)
+ */
+ /* set tweaks: T0=0; T1=CFG | FINAL */
+ Skein_Start_New_Type(ctx, CFG_FINAL);
+
+ /* set the schema, version */
+ cfg.w[0] = Skein_Swap64(SKEIN_SCHEMA_VER);
+ /* hash result length in bits */
+ cfg.w[1] = Skein_Swap64(hashBitLen);
+ cfg.w[2] = Skein_Swap64(SKEIN_CFG_TREE_INFO_SEQUENTIAL);
+ /* zero pad config block */
+ bzero(&cfg.w[3], sizeof (cfg) - 3 * sizeof (cfg.w[0]));
+
+ /* compute the initial chaining values from config block */
+ /* zero the chaining variables */
+ bzero(ctx->X, sizeof (ctx->X));
+ Skein_512_Process_Block(ctx, cfg.b, 1, SKEIN_CFG_STR_LEN);
+ break;
+ }
+
+ /*
+ * The chaining vars ctx->X are now initialized for the given
+ * hashBitLen. Set up to process the data message portion of the
+ * hash (default)
+ */
+ Skein_Start_New_Type(ctx, MSG); /* T0=0, T1= MSG type */
+
+ return (SKEIN_SUCCESS);
+}
+
+/* init the context for a MAC and/or tree hash operation */
+/*
+ * [identical to Skein_512_Init() when keyBytes == 0 &&
+ * treeInfo == SKEIN_CFG_TREE_INFO_SEQUENTIAL]
+ */
+int
+Skein_512_InitExt(Skein_512_Ctxt_t *ctx, size_t hashBitLen, uint64_t treeInfo,
+ const uint8_t *key, size_t keyBytes)
+{
+ union {
+ uint8_t b[SKEIN_512_STATE_BYTES];
+ uint64_t w[SKEIN_512_STATE_WORDS];
+ } cfg; /* config block */
+
+ Skein_Assert(hashBitLen > 0, SKEIN_BAD_HASHLEN);
+ Skein_Assert(keyBytes == 0 || key != NULL, SKEIN_FAIL);
+
+ /* compute the initial chaining values ctx->X[], based on key */
+ if (keyBytes == 0) { /* is there a key? */
+ /* no key: use all zeroes as key for config block */
+ bzero(ctx->X, sizeof (ctx->X));
+ } else { /* here to pre-process a key */
+
+ Skein_assert(sizeof (cfg.b) >= sizeof (ctx->X));
+ /* do a mini-Init right here */
+ /* set output hash bit count = state size */
+ ctx->h.hashBitLen = 8 * sizeof (ctx->X);
+ /* set tweaks: T0 = 0; T1 = KEY type */
+ Skein_Start_New_Type(ctx, KEY);
+ /* zero the initial chaining variables */
+ bzero(ctx->X, sizeof (ctx->X));
+ (void) Skein_512_Update(ctx, key, keyBytes); /* hash the key */
+ /* put result into cfg.b[] */
+ (void) Skein_512_Final_Pad(ctx, cfg.b);
+ /* copy over into ctx->X[] */
+ bcopy(cfg.b, ctx->X, sizeof (cfg.b));
+#if SKEIN_NEED_SWAP
+ {
+ uint_t i;
+ /* convert key bytes to context words */
+ for (i = 0; i < SKEIN_512_STATE_WORDS; i++)
+ ctx->X[i] = Skein_Swap64(ctx->X[i]);
+ }
+#endif
+ }
+ /*
+ * build/process the config block, type == CONFIG (could be
+ * precomputed for each key)
+ */
+ ctx->h.hashBitLen = hashBitLen; /* output hash bit count */
+ Skein_Start_New_Type(ctx, CFG_FINAL);
+
+ bzero(&cfg.w, sizeof (cfg.w)); /* pre-pad cfg.w[] with zeroes */
+ cfg.w[0] = Skein_Swap64(SKEIN_SCHEMA_VER);
+ cfg.w[1] = Skein_Swap64(hashBitLen); /* hash result length in bits */
+ /* tree hash config info (or SKEIN_CFG_TREE_INFO_SEQUENTIAL) */
+ cfg.w[2] = Skein_Swap64(treeInfo);
+
+ Skein_Show_Key(512, &ctx->h, key, keyBytes);
+
+ /* compute the initial chaining values from config block */
+ Skein_512_Process_Block(ctx, cfg.b, 1, SKEIN_CFG_STR_LEN);
+
+ /* The chaining vars ctx->X are now initialized */
+ /* Set up to process the data message portion of the hash (default) */
+ ctx->h.bCnt = 0; /* buffer b[] starts out empty */
+ Skein_Start_New_Type(ctx, MSG);
+
+ return (SKEIN_SUCCESS);
+}
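+
+/*
+ * Editor's sketch (not part of the upstream source): keyed (MAC) use of
+ * the extended init above.  "key", "keyLen", "msg" and "msgLen" are
+ * hypothetical caller variables; error handling is omitted.
+ *
+ *	Skein_512_Ctxt_t ctx;
+ *	uint8_t mac[64];	(64 bytes = 512-bit output)
+ *
+ *	(void) Skein_512_InitExt(&ctx, 512,
+ *	    SKEIN_CFG_TREE_INFO_SEQUENTIAL, key, keyLen);
+ *	(void) Skein_512_Update(&ctx, msg, msgLen);
+ *	(void) Skein_512_Final(&ctx, mac);
+ */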
+
+/* process the input bytes */
+int
+Skein_512_Update(Skein_512_Ctxt_t *ctx, const uint8_t *msg, size_t msgByteCnt)
+{
+ size_t n;
+
+ /* catch uninitialized context */
+ Skein_Assert(ctx->h.bCnt <= SKEIN_512_BLOCK_BYTES, SKEIN_FAIL);
+
+ /* process full blocks, if any */
+ if (msgByteCnt + ctx->h.bCnt > SKEIN_512_BLOCK_BYTES) {
+ /* finish up any buffered message data */
+ if (ctx->h.bCnt) {
+ /* # bytes free in buffer b[] */
+ n = SKEIN_512_BLOCK_BYTES - ctx->h.bCnt;
+ if (n) {
+ /* check on our logic here */
+ Skein_assert(n < msgByteCnt);
+ bcopy(msg, &ctx->b[ctx->h.bCnt], n);
+ msgByteCnt -= n;
+ msg += n;
+ ctx->h.bCnt += n;
+ }
+ Skein_assert(ctx->h.bCnt == SKEIN_512_BLOCK_BYTES);
+ Skein_512_Process_Block(ctx, ctx->b, 1,
+ SKEIN_512_BLOCK_BYTES);
+ ctx->h.bCnt = 0;
+ }
+ /*
+ * now process any remaining full blocks, directly from input
+ * message data
+ */
+ if (msgByteCnt > SKEIN_512_BLOCK_BYTES) {
+ /* number of full blocks to process */
+ n = (msgByteCnt - 1) / SKEIN_512_BLOCK_BYTES;
+ Skein_512_Process_Block(ctx, msg, n,
+ SKEIN_512_BLOCK_BYTES);
+ msgByteCnt -= n * SKEIN_512_BLOCK_BYTES;
+ msg += n * SKEIN_512_BLOCK_BYTES;
+ }
+ Skein_assert(ctx->h.bCnt == 0);
+ }
+
+ /* copy any remaining source message data bytes into b[] */
+ if (msgByteCnt) {
+ Skein_assert(msgByteCnt + ctx->h.bCnt <= SKEIN_512_BLOCK_BYTES);
+ bcopy(msg, &ctx->b[ctx->h.bCnt], msgByteCnt);
+ ctx->h.bCnt += msgByteCnt;
+ }
+
+ return (SKEIN_SUCCESS);
+}
+
+/* finalize the hash computation and output the result */
+int
+Skein_512_Final(Skein_512_Ctxt_t *ctx, uint8_t *hashVal)
+{
+ size_t i, n, byteCnt;
+ uint64_t X[SKEIN_512_STATE_WORDS];
+
+ /* catch uninitialized context */
+ Skein_Assert(ctx->h.bCnt <= SKEIN_512_BLOCK_BYTES, SKEIN_FAIL);
+
+ ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL; /* tag as the final block */
+ /* zero pad b[] if necessary */
+ if (ctx->h.bCnt < SKEIN_512_BLOCK_BYTES)
+ bzero(&ctx->b[ctx->h.bCnt],
+ SKEIN_512_BLOCK_BYTES - ctx->h.bCnt);
+
+ /* process the final block */
+ Skein_512_Process_Block(ctx, ctx->b, 1, ctx->h.bCnt);
+
+ /* now output the result */
+ /* total number of output bytes */
+ byteCnt = (ctx->h.hashBitLen + 7) >> 3;
+
+ /* run Threefish in "counter mode" to generate output */
+ /* zero out b[], so it can hold the counter */
+ bzero(ctx->b, sizeof (ctx->b));
+ /* keep a local copy of counter mode "key" */
+ bcopy(ctx->X, X, sizeof (X));
+ for (i = 0; i * SKEIN_512_BLOCK_BYTES < byteCnt; i++) {
+ /* build the counter block */
+ uint64_t tmp = Skein_Swap64((uint64_t)i);
+ bcopy(&tmp, ctx->b, sizeof (tmp));
+ Skein_Start_New_Type(ctx, OUT_FINAL);
+ /* run "counter mode" */
+ Skein_512_Process_Block(ctx, ctx->b, 1, sizeof (uint64_t));
+ /* number of output bytes left to go */
+ n = byteCnt - i * SKEIN_512_BLOCK_BYTES;
+ if (n >= SKEIN_512_BLOCK_BYTES)
+ n = SKEIN_512_BLOCK_BYTES;
+ Skein_Put64_LSB_First(hashVal + i * SKEIN_512_BLOCK_BYTES,
+ ctx->X, n); /* "output" the ctr mode bytes */
+ Skein_Show_Final(512, &ctx->h, n,
+ hashVal + i * SKEIN_512_BLOCK_BYTES);
+ /* restore the counter mode key for next time */
+ bcopy(X, ctx->X, sizeof (X));
+ }
+ return (SKEIN_SUCCESS);
+}
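+
+/*
+ * Editor's sketch (illustrative only, not part of the upstream source):
+ * plain, unkeyed use of the Skein-512 Init/Update/Final API implemented
+ * above.  "buf" and "len" are hypothetical caller variables; error
+ * handling is omitted.
+ *
+ *	Skein_512_Ctxt_t ctx;
+ *	uint8_t digest[64];
+ *
+ *	(void) Skein_512_Init(&ctx, 512);
+ *	(void) Skein_512_Update(&ctx, buf, len);
+ *	(void) Skein_512_Final(&ctx, digest);
+ */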
+
+/* 1024-bit Skein */
+
+/* init the context for a straight hashing operation */
+int
+Skein1024_Init(Skein1024_Ctxt_t *ctx, size_t hashBitLen)
+{
+ union {
+ uint8_t b[SKEIN1024_STATE_BYTES];
+ uint64_t w[SKEIN1024_STATE_WORDS];
+ } cfg; /* config block */
+
+ Skein_Assert(hashBitLen > 0, SKEIN_BAD_HASHLEN);
+ ctx->h.hashBitLen = hashBitLen; /* output hash bit count */
+
+ switch (hashBitLen) { /* use pre-computed values, where available */
+#ifndef SKEIN_NO_PRECOMP
+ case 512:
+ bcopy(SKEIN1024_IV_512, ctx->X, sizeof (ctx->X));
+ break;
+ case 384:
+ bcopy(SKEIN1024_IV_384, ctx->X, sizeof (ctx->X));
+ break;
+ case 1024:
+ bcopy(SKEIN1024_IV_1024, ctx->X, sizeof (ctx->X));
+ break;
+#endif
+ default:
+ /* here if there is no precomputed IV value available */
+ /*
+ * build/process the config block, type == CONFIG (could be
+ * precomputed)
+ */
+ /* set tweaks: T0=0; T1=CFG | FINAL */
+ Skein_Start_New_Type(ctx, CFG_FINAL);
+
+ /* set the schema, version */
+ cfg.w[0] = Skein_Swap64(SKEIN_SCHEMA_VER);
+ /* hash result length in bits */
+ cfg.w[1] = Skein_Swap64(hashBitLen);
+ cfg.w[2] = Skein_Swap64(SKEIN_CFG_TREE_INFO_SEQUENTIAL);
+ /* zero pad config block */
+ bzero(&cfg.w[3], sizeof (cfg) - 3 * sizeof (cfg.w[0]));
+
+ /* compute the initial chaining values from config block */
+ /* zero the chaining variables */
+ bzero(ctx->X, sizeof (ctx->X));
+ Skein1024_Process_Block(ctx, cfg.b, 1, SKEIN_CFG_STR_LEN);
+ break;
+ }
+
+ /*
+ * The chaining vars ctx->X are now initialized for the given
+ * hashBitLen. Set up to process the data message portion of the hash
+ * (default)
+ */
+ Skein_Start_New_Type(ctx, MSG); /* T0=0, T1= MSG type */
+
+ return (SKEIN_SUCCESS);
+}
+
+/* init the context for a MAC and/or tree hash operation */
+/*
+ * [identical to Skein1024_Init() when keyBytes == 0 &&
+ * treeInfo == SKEIN_CFG_TREE_INFO_SEQUENTIAL]
+ */
+int
+Skein1024_InitExt(Skein1024_Ctxt_t *ctx, size_t hashBitLen, uint64_t treeInfo,
+ const uint8_t *key, size_t keyBytes)
+{
+ union {
+ uint8_t b[SKEIN1024_STATE_BYTES];
+ uint64_t w[SKEIN1024_STATE_WORDS];
+ } cfg; /* config block */
+
+ Skein_Assert(hashBitLen > 0, SKEIN_BAD_HASHLEN);
+ Skein_Assert(keyBytes == 0 || key != NULL, SKEIN_FAIL);
+
+ /* compute the initial chaining values ctx->X[], based on key */
+ if (keyBytes == 0) { /* is there a key? */
+ /* no key: use all zeroes as key for config block */
+ bzero(ctx->X, sizeof (ctx->X));
+ } else { /* here to pre-process a key */
+ Skein_assert(sizeof (cfg.b) >= sizeof (ctx->X));
+ /* do a mini-Init right here */
+ /* set output hash bit count = state size */
+ ctx->h.hashBitLen = 8 * sizeof (ctx->X);
+ /* set tweaks: T0 = 0; T1 = KEY type */
+ Skein_Start_New_Type(ctx, KEY);
+ /* zero the initial chaining variables */
+ bzero(ctx->X, sizeof (ctx->X));
+ (void) Skein1024_Update(ctx, key, keyBytes); /* hash the key */
+ /* put result into cfg.b[] */
+ (void) Skein1024_Final_Pad(ctx, cfg.b);
+ /* copy over into ctx->X[] */
+ bcopy(cfg.b, ctx->X, sizeof (cfg.b));
+#if SKEIN_NEED_SWAP
+ {
+ uint_t i;
+ /* convert key bytes to context words */
+ for (i = 0; i < SKEIN1024_STATE_WORDS; i++)
+ ctx->X[i] = Skein_Swap64(ctx->X[i]);
+ }
+#endif
+ }
+ /*
+ * build/process the config block, type == CONFIG (could be
+ * precomputed for each key)
+ */
+ ctx->h.hashBitLen = hashBitLen; /* output hash bit count */
+ Skein_Start_New_Type(ctx, CFG_FINAL);
+
+ bzero(&cfg.w, sizeof (cfg.w)); /* pre-pad cfg.w[] with zeroes */
+ cfg.w[0] = Skein_Swap64(SKEIN_SCHEMA_VER);
+ /* hash result length in bits */
+ cfg.w[1] = Skein_Swap64(hashBitLen);
+ /* tree hash config info (or SKEIN_CFG_TREE_INFO_SEQUENTIAL) */
+ cfg.w[2] = Skein_Swap64(treeInfo);
+
+ Skein_Show_Key(1024, &ctx->h, key, keyBytes);
+
+ /* compute the initial chaining values from config block */
+ Skein1024_Process_Block(ctx, cfg.b, 1, SKEIN_CFG_STR_LEN);
+
+ /* The chaining vars ctx->X are now initialized */
+ /* Set up to process the data message portion of the hash (default) */
+ ctx->h.bCnt = 0; /* buffer b[] starts out empty */
+ Skein_Start_New_Type(ctx, MSG);
+
+ return (SKEIN_SUCCESS);
+}
+
+/* process the input bytes */
+int
+Skein1024_Update(Skein1024_Ctxt_t *ctx, const uint8_t *msg, size_t msgByteCnt)
+{
+ size_t n;
+
+ /* catch uninitialized context */
+ Skein_Assert(ctx->h.bCnt <= SKEIN1024_BLOCK_BYTES, SKEIN_FAIL);
+
+ /* process full blocks, if any */
+ if (msgByteCnt + ctx->h.bCnt > SKEIN1024_BLOCK_BYTES) {
+ /* finish up any buffered message data */
+ if (ctx->h.bCnt) {
+ /* # bytes free in buffer b[] */
+ n = SKEIN1024_BLOCK_BYTES - ctx->h.bCnt;
+ if (n) {
+ /* check on our logic here */
+ Skein_assert(n < msgByteCnt);
+ bcopy(msg, &ctx->b[ctx->h.bCnt], n);
+ msgByteCnt -= n;
+ msg += n;
+ ctx->h.bCnt += n;
+ }
+ Skein_assert(ctx->h.bCnt == SKEIN1024_BLOCK_BYTES);
+ Skein1024_Process_Block(ctx, ctx->b, 1,
+ SKEIN1024_BLOCK_BYTES);
+ ctx->h.bCnt = 0;
+ }
+ /*
+ * now process any remaining full blocks, directly from
+ * input message data
+ */
+ if (msgByteCnt > SKEIN1024_BLOCK_BYTES) {
+ /* number of full blocks to process */
+ n = (msgByteCnt - 1) / SKEIN1024_BLOCK_BYTES;
+ Skein1024_Process_Block(ctx, msg, n,
+ SKEIN1024_BLOCK_BYTES);
+ msgByteCnt -= n * SKEIN1024_BLOCK_BYTES;
+ msg += n * SKEIN1024_BLOCK_BYTES;
+ }
+ Skein_assert(ctx->h.bCnt == 0);
+ }
+
+ /* copy any remaining source message data bytes into b[] */
+ if (msgByteCnt) {
+ Skein_assert(msgByteCnt + ctx->h.bCnt <= SKEIN1024_BLOCK_BYTES);
+ bcopy(msg, &ctx->b[ctx->h.bCnt], msgByteCnt);
+ ctx->h.bCnt += msgByteCnt;
+ }
+
+ return (SKEIN_SUCCESS);
+}
+
+/* finalize the hash computation and output the result */
+int
+Skein1024_Final(Skein1024_Ctxt_t *ctx, uint8_t *hashVal)
+{
+ size_t i, n, byteCnt;
+ uint64_t X[SKEIN1024_STATE_WORDS];
+
+ /* catch uninitialized context */
+ Skein_Assert(ctx->h.bCnt <= SKEIN1024_BLOCK_BYTES, SKEIN_FAIL);
+
+ ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL; /* tag as the final block */
+ /* zero pad b[] if necessary */
+ if (ctx->h.bCnt < SKEIN1024_BLOCK_BYTES)
+ bzero(&ctx->b[ctx->h.bCnt],
+ SKEIN1024_BLOCK_BYTES - ctx->h.bCnt);
+
+ /* process the final block */
+ Skein1024_Process_Block(ctx, ctx->b, 1, ctx->h.bCnt);
+
+ /* now output the result */
+ /* total number of output bytes */
+ byteCnt = (ctx->h.hashBitLen + 7) >> 3;
+
+ /* run Threefish in "counter mode" to generate output */
+ /* zero out b[], so it can hold the counter */
+ bzero(ctx->b, sizeof (ctx->b));
+ /* keep a local copy of counter mode "key" */
+ bcopy(ctx->X, X, sizeof (X));
+ for (i = 0; i * SKEIN1024_BLOCK_BYTES < byteCnt; i++) {
+ /* build the counter block */
+ uint64_t tmp = Skein_Swap64((uint64_t)i);
+ bcopy(&tmp, ctx->b, sizeof (tmp));
+ Skein_Start_New_Type(ctx, OUT_FINAL);
+ /* run "counter mode" */
+ Skein1024_Process_Block(ctx, ctx->b, 1, sizeof (uint64_t));
+ /* number of output bytes left to go */
+ n = byteCnt - i * SKEIN1024_BLOCK_BYTES;
+ if (n >= SKEIN1024_BLOCK_BYTES)
+ n = SKEIN1024_BLOCK_BYTES;
+ Skein_Put64_LSB_First(hashVal + i * SKEIN1024_BLOCK_BYTES,
+ ctx->X, n); /* "output" the ctr mode bytes */
+ Skein_Show_Final(1024, &ctx->h, n,
+ hashVal + i * SKEIN1024_BLOCK_BYTES);
+ /* restore the counter mode key for next time */
+ bcopy(X, ctx->X, sizeof (X));
+ }
+ return (SKEIN_SUCCESS);
+}
+
+/* Functions to support MAC/tree hashing */
+/* (this code is identical for Optimized and Reference versions) */
+
+/* finalize the hash computation and output the block, no OUTPUT stage */
+int
+Skein_256_Final_Pad(Skein_256_Ctxt_t *ctx, uint8_t *hashVal)
+{
+ /* catch uninitialized context */
+ Skein_Assert(ctx->h.bCnt <= SKEIN_256_BLOCK_BYTES, SKEIN_FAIL);
+
+ ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL; /* tag as the final block */
+ /* zero pad b[] if necessary */
+ if (ctx->h.bCnt < SKEIN_256_BLOCK_BYTES)
+ bzero(&ctx->b[ctx->h.bCnt],
+ SKEIN_256_BLOCK_BYTES - ctx->h.bCnt);
+ /* process the final block */
+ Skein_256_Process_Block(ctx, ctx->b, 1, ctx->h.bCnt);
+
+ /* "output" the state bytes */
+ Skein_Put64_LSB_First(hashVal, ctx->X, SKEIN_256_BLOCK_BYTES);
+
+ return (SKEIN_SUCCESS);
+}
+
+/* finalize the hash computation and output the block, no OUTPUT stage */
+int
+Skein_512_Final_Pad(Skein_512_Ctxt_t *ctx, uint8_t *hashVal)
+{
+ /* catch uninitialized context */
+ Skein_Assert(ctx->h.bCnt <= SKEIN_512_BLOCK_BYTES, SKEIN_FAIL);
+
+ ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL; /* tag as the final block */
+ /* zero pad b[] if necessary */
+ if (ctx->h.bCnt < SKEIN_512_BLOCK_BYTES)
+ bzero(&ctx->b[ctx->h.bCnt],
+ SKEIN_512_BLOCK_BYTES - ctx->h.bCnt);
+ /* process the final block */
+ Skein_512_Process_Block(ctx, ctx->b, 1, ctx->h.bCnt);
+
+ /* "output" the state bytes */
+ Skein_Put64_LSB_First(hashVal, ctx->X, SKEIN_512_BLOCK_BYTES);
+
+ return (SKEIN_SUCCESS);
+}
+
+/* finalize the hash computation and output the block, no OUTPUT stage */
+int
+Skein1024_Final_Pad(Skein1024_Ctxt_t *ctx, uint8_t *hashVal)
+{
+ /* catch uninitialized context */
+ Skein_Assert(ctx->h.bCnt <= SKEIN1024_BLOCK_BYTES, SKEIN_FAIL);
+
+ /* tag as the final block */
+ ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL;
+ /* zero pad b[] if necessary */
+ if (ctx->h.bCnt < SKEIN1024_BLOCK_BYTES)
+ bzero(&ctx->b[ctx->h.bCnt],
+ SKEIN1024_BLOCK_BYTES - ctx->h.bCnt);
+ /* process the final block */
+ Skein1024_Process_Block(ctx, ctx->b, 1, ctx->h.bCnt);
+
+ /* "output" the state bytes */
+ Skein_Put64_LSB_First(hashVal, ctx->X, SKEIN1024_BLOCK_BYTES);
+
+ return (SKEIN_SUCCESS);
+}
+
+#if SKEIN_TREE_HASH
+/* just do the OUTPUT stage */
+int
+Skein_256_Output(Skein_256_Ctxt_t *ctx, uint8_t *hashVal)
+{
+ size_t i, n, byteCnt;
+ uint64_t X[SKEIN_256_STATE_WORDS];
+
+ /* catch uninitialized context */
+ Skein_Assert(ctx->h.bCnt <= SKEIN_256_BLOCK_BYTES, SKEIN_FAIL);
+
+ /* now output the result */
+ /* total number of output bytes */
+ byteCnt = (ctx->h.hashBitLen + 7) >> 3;
+
+ /* run Threefish in "counter mode" to generate output */
+ /* zero out b[], so it can hold the counter */
+ bzero(ctx->b, sizeof (ctx->b));
+ /* keep a local copy of counter mode "key" */
+ bcopy(ctx->X, X, sizeof (X));
+ for (i = 0; i * SKEIN_256_BLOCK_BYTES < byteCnt; i++) {
+ /* build the counter block */
+ uint64_t tmp = Skein_Swap64((uint64_t)i);
+ bcopy(&tmp, ctx->b, sizeof (tmp));
+ Skein_Start_New_Type(ctx, OUT_FINAL);
+ /* run "counter mode" */
+ Skein_256_Process_Block(ctx, ctx->b, 1, sizeof (uint64_t));
+ /* number of output bytes left to go */
+ n = byteCnt - i * SKEIN_256_BLOCK_BYTES;
+ if (n >= SKEIN_256_BLOCK_BYTES)
+ n = SKEIN_256_BLOCK_BYTES;
+ Skein_Put64_LSB_First(hashVal + i * SKEIN_256_BLOCK_BYTES,
+ ctx->X, n); /* "output" the ctr mode bytes */
+ Skein_Show_Final(256, &ctx->h, n,
+ hashVal + i * SKEIN_256_BLOCK_BYTES);
+ /* restore the counter mode key for next time */
+ bcopy(X, ctx->X, sizeof (X));
+ }
+ return (SKEIN_SUCCESS);
+}
+
+/* just do the OUTPUT stage */
+int
+Skein_512_Output(Skein_512_Ctxt_t *ctx, uint8_t *hashVal)
+{
+ size_t i, n, byteCnt;
+ uint64_t X[SKEIN_512_STATE_WORDS];
+
+ /* catch uninitialized context */
+ Skein_Assert(ctx->h.bCnt <= SKEIN_512_BLOCK_BYTES, SKEIN_FAIL);
+
+ /* now output the result */
+ /* total number of output bytes */
+ byteCnt = (ctx->h.hashBitLen + 7) >> 3;
+
+ /* run Threefish in "counter mode" to generate output */
+ /* zero out b[], so it can hold the counter */
+ bzero(ctx->b, sizeof (ctx->b));
+ /* keep a local copy of counter mode "key" */
+ bcopy(ctx->X, X, sizeof (X));
+ for (i = 0; i * SKEIN_512_BLOCK_BYTES < byteCnt; i++) {
+ /* build the counter block */
+ uint64_t tmp = Skein_Swap64((uint64_t)i);
+ bcopy(&tmp, ctx->b, sizeof (tmp));
+ Skein_Start_New_Type(ctx, OUT_FINAL);
+ /* run "counter mode" */
+ Skein_512_Process_Block(ctx, ctx->b, 1, sizeof (uint64_t));
+ /* number of output bytes left to go */
+ n = byteCnt - i * SKEIN_512_BLOCK_BYTES;
+ if (n >= SKEIN_512_BLOCK_BYTES)
+ n = SKEIN_512_BLOCK_BYTES;
+ Skein_Put64_LSB_First(hashVal + i * SKEIN_512_BLOCK_BYTES,
+ ctx->X, n); /* "output" the ctr mode bytes */
+ Skein_Show_Final(512, &ctx->h, n,
+ hashVal + i * SKEIN_512_BLOCK_BYTES);
+ /* restore the counter mode key for next time */
+ bcopy(X, ctx->X, sizeof (X));
+ }
+ return (SKEIN_SUCCESS);
+}
+
+/* just do the OUTPUT stage */
+int
+Skein1024_Output(Skein1024_Ctxt_t *ctx, uint8_t *hashVal)
+{
+ size_t i, n, byteCnt;
+ uint64_t X[SKEIN1024_STATE_WORDS];
+
+ /* catch uninitialized context */
+ Skein_Assert(ctx->h.bCnt <= SKEIN1024_BLOCK_BYTES, SKEIN_FAIL);
+
+ /* now output the result */
+ /* total number of output bytes */
+ byteCnt = (ctx->h.hashBitLen + 7) >> 3;
+
+ /* run Threefish in "counter mode" to generate output */
+ /* zero out b[], so it can hold the counter */
+ bzero(ctx->b, sizeof (ctx->b));
+ /* keep a local copy of counter mode "key" */
+ bcopy(ctx->X, X, sizeof (X));
+ for (i = 0; i * SKEIN1024_BLOCK_BYTES < byteCnt; i++) {
+ /* build the counter block */
+ uint64_t tmp = Skein_Swap64((uint64_t)i);
+ bcopy(&tmp, ctx->b, sizeof (tmp));
+ Skein_Start_New_Type(ctx, OUT_FINAL);
+ /* run "counter mode" */
+ Skein1024_Process_Block(ctx, ctx->b, 1, sizeof (uint64_t));
+ /* number of output bytes left to go */
+ n = byteCnt - i * SKEIN1024_BLOCK_BYTES;
+ if (n >= SKEIN1024_BLOCK_BYTES)
+ n = SKEIN1024_BLOCK_BYTES;
+ Skein_Put64_LSB_First(hashVal + i * SKEIN1024_BLOCK_BYTES,
+ ctx->X, n); /* "output" the ctr mode bytes */
+ Skein_Show_Final(1024, &ctx->h, n,
+ hashVal + i * SKEIN1024_BLOCK_BYTES);
+ /* restore the counter mode key for next time */
+ bcopy(X, ctx->X, sizeof (X));
+ }
+ return (SKEIN_SUCCESS);
+}
+#endif
+
+#ifdef _KERNEL
+EXPORT_SYMBOL(Skein_512_Init);
+EXPORT_SYMBOL(Skein_512_InitExt);
+EXPORT_SYMBOL(Skein_512_Update);
+EXPORT_SYMBOL(Skein_512_Final);
+#endif
diff --git a/sys/contrib/openzfs/module/icp/algs/skein/skein_block.c b/sys/contrib/openzfs/module/icp/algs/skein/skein_block.c
new file mode 100644
index 000000000000..7ba165a48511
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/algs/skein/skein_block.c
@@ -0,0 +1,790 @@
+/*
+ * Implementation of the Skein block functions.
+ * Source code author: Doug Whiting, 2008.
+ * This algorithm and source code is released to the public domain.
+ * Compile-time switches:
+ * SKEIN_USE_ASM -- set bits (256/512/1024) to select which
+ * versions use ASM code for block processing
+ * [default: use C for all block sizes]
+ */
+/* Copyright 2013 Doug Whiting. This code is released to the public domain. */
+
+#include <sys/skein.h>
+#include "skein_impl.h"
+#include <sys/isa_defs.h> /* for _ILP32 */
+
+#ifndef SKEIN_USE_ASM
+#define SKEIN_USE_ASM (0) /* default is all C code (no ASM) */
+#endif
+
+#ifndef SKEIN_LOOP
+/*
+ * The low-level checksum routines use a lot of stack space. On systems where
+ * small stack frames are enforced (like 32-bit kernel builds), do not unroll
+ * checksum calculations to save stack space.
+ *
+ * Even with no loops unrolled, we still can exceed the 1k stack frame limit
+ * in Skein1024_Process_Block() (it hits 1272 bytes on ARM32). We can
+ * safely ignore it though, since the checksum functions will be called
+ * from a worker thread that won't be using much stack. That's why we have
+ * the #pragma here to ignore the warning.
+ */
+#if defined(_ILP32) || defined(__powerpc) /* Assume small stack */
+#pragma GCC diagnostic ignored "-Wframe-larger-than="
+/*
+ * We're running on 32-bit; don't unroll loops, to save stack frame space.
+ *
+ * Due to the way the SKEIN_LOOP calculations are done in
+ * Skein_*_Process_Block(), a value of 111 disables loop unrolling
+ * in any of those functions.
+ */
+#define SKEIN_LOOP 111
+#else
+/* We're compiling with large stacks */
+#define SKEIN_LOOP 001 /* default: unroll 256 and 512, but not 1024 */
+#endif
+#endif
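+
+/*
+ * Editor's note (illustrative, not from the original source): SKEIN_LOOP
+ * is read one decimal digit per block size -- hundreds for Skein-256,
+ * tens for Skein-512, ones for Skein-1024.  A digit of 0 means "fully
+ * unroll"; any other digit N means "loop, doing N 8-round groups per
+ * pass".  The two values above therefore decode as:
+ *
+ *	SKEIN_LOOP 111:  256 -> 1, 512 -> 1, 1024 -> 1  (all looped)
+ *	SKEIN_LOOP 001:  256 -> 0, 512 -> 0, 1024 -> 1  (256/512 unrolled)
+ *
+ * (The leading zeros make 001 an octal literal, but its value, 1,
+ * decodes identically.)
+ */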
+
+/* some useful definitions for code here */
+#define BLK_BITS (WCNT*64)
+#define KW_TWK_BASE (0)
+#define KW_KEY_BASE (3)
+#define ks (kw + KW_KEY_BASE)
+#define ts (kw + KW_TWK_BASE)
+
+/* no debugging in Illumos version */
+#define DebugSaveTweak(ctx)
+
+/* Skein_256 */
+#if !(SKEIN_USE_ASM & 256)
+void
+Skein_256_Process_Block(Skein_256_Ctxt_t *ctx, const uint8_t *blkPtr,
+ size_t blkCnt, size_t byteCntAdd)
+{
+ enum {
+ WCNT = SKEIN_256_STATE_WORDS
+ };
+#undef RCNT
+#define RCNT (SKEIN_256_ROUNDS_TOTAL / 8)
+
+#ifdef SKEIN_LOOP /* configure how much to unroll the loop */
+#define SKEIN_UNROLL_256 (((SKEIN_LOOP) / 100) % 10)
+#else
+#define SKEIN_UNROLL_256 (0)
+#endif
+
+#if SKEIN_UNROLL_256
+#if (RCNT % SKEIN_UNROLL_256)
+#error "Invalid SKEIN_UNROLL_256" /* sanity check on unroll count */
+#endif
+ size_t r;
+ /* key schedule words : chaining vars + tweak + "rotation" */
+ uint64_t kw[WCNT + 4 + RCNT * 2];
+#else
+ uint64_t kw[WCNT + 4]; /* key schedule words : chaining vars + tweak */
+#endif
+ /* local copy of context vars, for speed */
+ uint64_t X0, X1, X2, X3;
+ uint64_t w[WCNT]; /* local copy of input block */
+#ifdef SKEIN_DEBUG
+ /* use for debugging (help compiler put Xn in registers) */
+ const uint64_t *Xptr[4];
+ Xptr[0] = &X0;
+ Xptr[1] = &X1;
+ Xptr[2] = &X2;
+ Xptr[3] = &X3;
+#endif
+ Skein_assert(blkCnt != 0); /* never call with blkCnt == 0! */
+ ts[0] = ctx->h.T[0];
+ ts[1] = ctx->h.T[1];
+ do {
+ /*
+ * this implementation only supports 2**64 input bytes
+ * (no carry out here)
+ */
+ ts[0] += byteCntAdd; /* update processed length */
+
+ /* precompute the key schedule for this block */
+ ks[0] = ctx->X[0];
+ ks[1] = ctx->X[1];
+ ks[2] = ctx->X[2];
+ ks[3] = ctx->X[3];
+ ks[4] = ks[0] ^ ks[1] ^ ks[2] ^ ks[3] ^ SKEIN_KS_PARITY;
+
+ ts[2] = ts[0] ^ ts[1];
+
+ /* get input block in little-endian format */
+ Skein_Get64_LSB_First(w, blkPtr, WCNT);
+ DebugSaveTweak(ctx);
+ Skein_Show_Block(BLK_BITS, &ctx->h, ctx->X, blkPtr, w, ks, ts);
+
+ X0 = w[0] + ks[0]; /* do the first full key injection */
+ X1 = w[1] + ks[1] + ts[0];
+ X2 = w[2] + ks[2] + ts[1];
+ X3 = w[3] + ks[3];
+
+ Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INITIAL,
+ Xptr); /* show starting state values */
+
+ blkPtr += SKEIN_256_BLOCK_BYTES;
+
+ /* run the rounds */
+
+#define Round256(p0, p1, p2, p3, ROT, rNum) \
+ X##p0 += X##p1; X##p1 = RotL_64(X##p1, ROT##_0); X##p1 ^= X##p0; \
+ X##p2 += X##p3; X##p3 = RotL_64(X##p3, ROT##_1); X##p3 ^= X##p2;
+
+#if SKEIN_UNROLL_256 == 0
+#define R256(p0, p1, p2, p3, ROT, rNum) /* fully unrolled */ \
+ Round256(p0, p1, p2, p3, ROT, rNum) \
+ Skein_Show_R_Ptr(BLK_BITS, &ctx->h, rNum, Xptr);
+
+#define I256(R) \
+ X0 += ks[((R) + 1) % 5]; /* inject the key schedule value */ \
+ X1 += ks[((R) + 2) % 5] + ts[((R) + 1) % 3]; \
+ X2 += ks[((R) + 3) % 5] + ts[((R) + 2) % 3]; \
+ X3 += ks[((R) + 4) % 5] + (R) + 1; \
+ Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr);
+#else /* looping version */
+#define R256(p0, p1, p2, p3, ROT, rNum) \
+ Round256(p0, p1, p2, p3, ROT, rNum) \
+ Skein_Show_R_Ptr(BLK_BITS, &ctx->h, 4 * (r - 1) + rNum, Xptr);
+
+#define I256(R) \
+ X0 += ks[r + (R) + 0]; /* inject the key schedule value */ \
+ X1 += ks[r + (R) + 1] + ts[r + (R) + 0]; \
+ X2 += ks[r + (R) + 2] + ts[r + (R) + 1]; \
+ X3 += ks[r + (R) + 3] + r + (R); \
+ ks[r + (R) + 4] = ks[r + (R) - 1]; /* rotate key schedule */ \
+ ts[r + (R) + 2] = ts[r + (R) - 1]; \
+ Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr);
+
+ /* loop through it */
+ for (r = 1; r < 2 * RCNT; r += 2 * SKEIN_UNROLL_256)
+#endif
+ {
+#define R256_8_rounds(R) \
+ R256(0, 1, 2, 3, R_256_0, 8 * (R) + 1); \
+ R256(0, 3, 2, 1, R_256_1, 8 * (R) + 2); \
+ R256(0, 1, 2, 3, R_256_2, 8 * (R) + 3); \
+ R256(0, 3, 2, 1, R_256_3, 8 * (R) + 4); \
+ I256(2 * (R)); \
+ R256(0, 1, 2, 3, R_256_4, 8 * (R) + 5); \
+ R256(0, 3, 2, 1, R_256_5, 8 * (R) + 6); \
+ R256(0, 1, 2, 3, R_256_6, 8 * (R) + 7); \
+ R256(0, 3, 2, 1, R_256_7, 8 * (R) + 8); \
+ I256(2 * (R) + 1);
+
+ R256_8_rounds(0);
+
+#define R256_Unroll_R(NN) \
+ ((SKEIN_UNROLL_256 == 0 && SKEIN_256_ROUNDS_TOTAL / 8 > (NN)) || \
+ (SKEIN_UNROLL_256 > (NN)))
+
+#if R256_Unroll_R(1)
+ R256_8_rounds(1);
+#endif
+#if R256_Unroll_R(2)
+ R256_8_rounds(2);
+#endif
+#if R256_Unroll_R(3)
+ R256_8_rounds(3);
+#endif
+#if R256_Unroll_R(4)
+ R256_8_rounds(4);
+#endif
+#if R256_Unroll_R(5)
+ R256_8_rounds(5);
+#endif
+#if R256_Unroll_R(6)
+ R256_8_rounds(6);
+#endif
+#if R256_Unroll_R(7)
+ R256_8_rounds(7);
+#endif
+#if R256_Unroll_R(8)
+ R256_8_rounds(8);
+#endif
+#if R256_Unroll_R(9)
+ R256_8_rounds(9);
+#endif
+#if R256_Unroll_R(10)
+ R256_8_rounds(10);
+#endif
+#if R256_Unroll_R(11)
+ R256_8_rounds(11);
+#endif
+#if R256_Unroll_R(12)
+ R256_8_rounds(12);
+#endif
+#if R256_Unroll_R(13)
+ R256_8_rounds(13);
+#endif
+#if R256_Unroll_R(14)
+ R256_8_rounds(14);
+#endif
+#if (SKEIN_UNROLL_256 > 14)
+#error "need more unrolling in Skein_256_Process_Block"
+#endif
+ }
+ /*
+ * do the final "feedforward" xor, update context chaining vars
+ */
+ ctx->X[0] = X0 ^ w[0];
+ ctx->X[1] = X1 ^ w[1];
+ ctx->X[2] = X2 ^ w[2];
+ ctx->X[3] = X3 ^ w[3];
+
+ Skein_Show_Round(BLK_BITS, &ctx->h, SKEIN_RND_FEED_FWD, ctx->X);
+
+ ts[1] &= ~SKEIN_T1_FLAG_FIRST;
+ } while (--blkCnt);
+ ctx->h.T[0] = ts[0];
+ ctx->h.T[1] = ts[1];
+}
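+
+/*
+ * Editor's note (illustrative, not part of the upstream file): each
+ * Round256() above performs one Threefish-256 round, i.e. two MIX
+ * operations; the alternating argument orders in R256_8_rounds()
+ * supply the word permutation.  A single MIX on a word pair (a, b)
+ * with rotation constant R is simply
+ *
+ *	a = a + b;  b = RotL_64(b, R) ^ a;
+ *
+ * which is what each X##p0/X##p1 statement pair expands to.
+ */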
+
+#if defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF)
+size_t
+Skein_256_Process_Block_CodeSize(void)
+{
+ return ((uint8_t *)Skein_256_Process_Block_CodeSize) -
+ ((uint8_t *)Skein_256_Process_Block);
+}
+
+uint_t
+Skein_256_Unroll_Cnt(void)
+{
+ return (SKEIN_UNROLL_256);
+}
+#endif
+#endif
+
+/* Skein_512 */
+#if !(SKEIN_USE_ASM & 512)
+void
+Skein_512_Process_Block(Skein_512_Ctxt_t *ctx, const uint8_t *blkPtr,
+ size_t blkCnt, size_t byteCntAdd)
+{
+ enum {
+ WCNT = SKEIN_512_STATE_WORDS
+ };
+#undef RCNT
+#define RCNT (SKEIN_512_ROUNDS_TOTAL / 8)
+
+#ifdef SKEIN_LOOP /* configure how much to unroll the loop */
+#define SKEIN_UNROLL_512 (((SKEIN_LOOP) / 10) % 10)
+#else
+#define SKEIN_UNROLL_512 (0)
+#endif
+
+#if SKEIN_UNROLL_512
+#if (RCNT % SKEIN_UNROLL_512)
+#error "Invalid SKEIN_UNROLL_512" /* sanity check on unroll count */
+#endif
+ size_t r;
+ /* key schedule words : chaining vars + tweak + "rotation" */
+ uint64_t kw[WCNT + 4 + RCNT * 2];
+#else
+ uint64_t kw[WCNT + 4]; /* key schedule words : chaining vars + tweak */
+#endif
+ /* local copy of vars, for speed */
+ uint64_t X0, X1, X2, X3, X4, X5, X6, X7;
+ uint64_t w[WCNT]; /* local copy of input block */
+#ifdef SKEIN_DEBUG
+ /* use for debugging (help compiler put Xn in registers) */
+ const uint64_t *Xptr[8];
+ Xptr[0] = &X0;
+ Xptr[1] = &X1;
+ Xptr[2] = &X2;
+ Xptr[3] = &X3;
+ Xptr[4] = &X4;
+ Xptr[5] = &X5;
+ Xptr[6] = &X6;
+ Xptr[7] = &X7;
+#endif
+
+ Skein_assert(blkCnt != 0); /* never call with blkCnt == 0! */
+ ts[0] = ctx->h.T[0];
+ ts[1] = ctx->h.T[1];
+ do {
+ /*
+ * this implementation only supports 2**64 input bytes
+ * (no carry out here)
+ */
+ ts[0] += byteCntAdd; /* update processed length */
+
+ /* precompute the key schedule for this block */
+ ks[0] = ctx->X[0];
+ ks[1] = ctx->X[1];
+ ks[2] = ctx->X[2];
+ ks[3] = ctx->X[3];
+ ks[4] = ctx->X[4];
+ ks[5] = ctx->X[5];
+ ks[6] = ctx->X[6];
+ ks[7] = ctx->X[7];
+ ks[8] = ks[0] ^ ks[1] ^ ks[2] ^ ks[3] ^
+ ks[4] ^ ks[5] ^ ks[6] ^ ks[7] ^ SKEIN_KS_PARITY;
+
+ ts[2] = ts[0] ^ ts[1];
+
+ /* get input block in little-endian format */
+ Skein_Get64_LSB_First(w, blkPtr, WCNT);
+ DebugSaveTweak(ctx);
+ Skein_Show_Block(BLK_BITS, &ctx->h, ctx->X, blkPtr, w, ks, ts);
+
+ X0 = w[0] + ks[0]; /* do the first full key injection */
+ X1 = w[1] + ks[1];
+ X2 = w[2] + ks[2];
+ X3 = w[3] + ks[3];
+ X4 = w[4] + ks[4];
+ X5 = w[5] + ks[5] + ts[0];
+ X6 = w[6] + ks[6] + ts[1];
+ X7 = w[7] + ks[7];
+
+ blkPtr += SKEIN_512_BLOCK_BYTES;
+
+ Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INITIAL,
+ Xptr);
+ /* run the rounds */
+#define Round512(p0, p1, p2, p3, p4, p5, p6, p7, ROT, rNum) \
+ X##p0 += X##p1; X##p1 = RotL_64(X##p1, ROT##_0); X##p1 ^= X##p0;\
+ X##p2 += X##p3; X##p3 = RotL_64(X##p3, ROT##_1); X##p3 ^= X##p2;\
+ X##p4 += X##p5; X##p5 = RotL_64(X##p5, ROT##_2); X##p5 ^= X##p4;\
+ X##p6 += X##p7; X##p7 = RotL_64(X##p7, ROT##_3); X##p7 ^= X##p6;
+
+#if SKEIN_UNROLL_512 == 0
+#define R512(p0, p1, p2, p3, p4, p5, p6, p7, ROT, rNum) /* unrolled */ \
+ Round512(p0, p1, p2, p3, p4, p5, p6, p7, ROT, rNum) \
+ Skein_Show_R_Ptr(BLK_BITS, &ctx->h, rNum, Xptr);
+
+#define I512(R) \
+ X0 += ks[((R) + 1) % 9]; /* inject the key schedule value */\
+ X1 += ks[((R) + 2) % 9]; \
+ X2 += ks[((R) + 3) % 9]; \
+ X3 += ks[((R) + 4) % 9]; \
+ X4 += ks[((R) + 5) % 9]; \
+ X5 += ks[((R) + 6) % 9] + ts[((R) + 1) % 3]; \
+ X6 += ks[((R) + 7) % 9] + ts[((R) + 2) % 3]; \
+ X7 += ks[((R) + 8) % 9] + (R) + 1; \
+ Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr);
+#else /* looping version */
+#define R512(p0, p1, p2, p3, p4, p5, p6, p7, ROT, rNum) \
+ Round512(p0, p1, p2, p3, p4, p5, p6, p7, ROT, rNum) \
+ Skein_Show_R_Ptr(BLK_BITS, &ctx->h, 4 * (r - 1) + rNum, Xptr);
+
+#define I512(R) \
+ X0 += ks[r + (R) + 0]; /* inject the key schedule value */ \
+ X1 += ks[r + (R) + 1]; \
+ X2 += ks[r + (R) + 2]; \
+ X3 += ks[r + (R) + 3]; \
+ X4 += ks[r + (R) + 4]; \
+ X5 += ks[r + (R) + 5] + ts[r + (R) + 0]; \
+ X6 += ks[r + (R) + 6] + ts[r + (R) + 1]; \
+ X7 += ks[r + (R) + 7] + r + (R); \
+ ks[r + (R) + 8] = ks[r + (R) - 1]; /* rotate key schedule */ \
+ ts[r + (R) + 2] = ts[r + (R) - 1]; \
+ Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr);
+
+ /* loop through it */
+ for (r = 1; r < 2 * RCNT; r += 2 * SKEIN_UNROLL_512)
+#endif /* end of looped code definitions */
+ {
+#define R512_8_rounds(R) /* do 8 full rounds */ \
+ R512(0, 1, 2, 3, 4, 5, 6, 7, R_512_0, 8 * (R) + 1); \
+ R512(2, 1, 4, 7, 6, 5, 0, 3, R_512_1, 8 * (R) + 2); \
+ R512(4, 1, 6, 3, 0, 5, 2, 7, R_512_2, 8 * (R) + 3); \
+ R512(6, 1, 0, 7, 2, 5, 4, 3, R_512_3, 8 * (R) + 4); \
+ I512(2 * (R)); \
+ R512(0, 1, 2, 3, 4, 5, 6, 7, R_512_4, 8 * (R) + 5); \
+ R512(2, 1, 4, 7, 6, 5, 0, 3, R_512_5, 8 * (R) + 6); \
+ R512(4, 1, 6, 3, 0, 5, 2, 7, R_512_6, 8 * (R) + 7); \
+ R512(6, 1, 0, 7, 2, 5, 4, 3, R_512_7, 8 * (R) + 8); \
+ I512(2*(R) + 1); /* and key injection */
+
+ R512_8_rounds(0);
+
+#define R512_Unroll_R(NN) \
+ ((SKEIN_UNROLL_512 == 0 && SKEIN_512_ROUNDS_TOTAL / 8 > (NN)) || \
+ (SKEIN_UNROLL_512 > (NN)))
+
+#if R512_Unroll_R(1)
+ R512_8_rounds(1);
+#endif
+#if R512_Unroll_R(2)
+ R512_8_rounds(2);
+#endif
+#if R512_Unroll_R(3)
+ R512_8_rounds(3);
+#endif
+#if R512_Unroll_R(4)
+ R512_8_rounds(4);
+#endif
+#if R512_Unroll_R(5)
+ R512_8_rounds(5);
+#endif
+#if R512_Unroll_R(6)
+ R512_8_rounds(6);
+#endif
+#if R512_Unroll_R(7)
+ R512_8_rounds(7);
+#endif
+#if R512_Unroll_R(8)
+ R512_8_rounds(8);
+#endif
+#if R512_Unroll_R(9)
+ R512_8_rounds(9);
+#endif
+#if R512_Unroll_R(10)
+ R512_8_rounds(10);
+#endif
+#if R512_Unroll_R(11)
+ R512_8_rounds(11);
+#endif
+#if R512_Unroll_R(12)
+ R512_8_rounds(12);
+#endif
+#if R512_Unroll_R(13)
+ R512_8_rounds(13);
+#endif
+#if R512_Unroll_R(14)
+ R512_8_rounds(14);
+#endif
+#if (SKEIN_UNROLL_512 > 14)
+#error "need more unrolling in Skein_512_Process_Block"
+#endif
+ }
+
+ /*
+ * do the final "feedforward" xor, update context chaining vars
+ */
+ ctx->X[0] = X0 ^ w[0];
+ ctx->X[1] = X1 ^ w[1];
+ ctx->X[2] = X2 ^ w[2];
+ ctx->X[3] = X3 ^ w[3];
+ ctx->X[4] = X4 ^ w[4];
+ ctx->X[5] = X5 ^ w[5];
+ ctx->X[6] = X6 ^ w[6];
+ ctx->X[7] = X7 ^ w[7];
+ Skein_Show_Round(BLK_BITS, &ctx->h, SKEIN_RND_FEED_FWD, ctx->X);
+
+ ts[1] &= ~SKEIN_T1_FLAG_FIRST;
+ } while (--blkCnt);
+ ctx->h.T[0] = ts[0];
+ ctx->h.T[1] = ts[1];
+}
+
+#if defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF)
+size_t
+Skein_512_Process_Block_CodeSize(void)
+{
+ return ((uint8_t *)Skein_512_Process_Block_CodeSize) -
+ ((uint8_t *)Skein_512_Process_Block);
+}
+
+uint_t
+Skein_512_Unroll_Cnt(void)
+{
+ return (SKEIN_UNROLL_512);
+}
+#endif
+#endif
+
+/* Skein1024 */
+#if !(SKEIN_USE_ASM & 1024)
+void
+Skein1024_Process_Block(Skein1024_Ctxt_t *ctx, const uint8_t *blkPtr,
+ size_t blkCnt, size_t byteCntAdd)
+{
+ /* do it in C, always looping (unrolled is bigger AND slower!) */
+ enum {
+ WCNT = SKEIN1024_STATE_WORDS
+ };
+#undef RCNT
+#define RCNT (SKEIN1024_ROUNDS_TOTAL/8)
+
+#ifdef SKEIN_LOOP /* configure how much to unroll the loop */
+#define SKEIN_UNROLL_1024 ((SKEIN_LOOP)%10)
+#else
+#define SKEIN_UNROLL_1024 (0)
+#endif
+
+#if (SKEIN_UNROLL_1024 != 0)
+#if (RCNT % SKEIN_UNROLL_1024)
+#error "Invalid SKEIN_UNROLL_1024" /* sanity check on unroll count */
+#endif
+ size_t r;
+ /* key schedule words : chaining vars + tweak + "rotation" */
+ uint64_t kw[WCNT + 4 + RCNT * 2];
+#else
+ uint64_t kw[WCNT + 4]; /* key schedule words : chaining vars + tweak */
+#endif
+
+ /* local copy of vars, for speed */
+ uint64_t X00, X01, X02, X03, X04, X05, X06, X07, X08, X09, X10, X11,
+ X12, X13, X14, X15;
+ uint64_t w[WCNT]; /* local copy of input block */
+#ifdef SKEIN_DEBUG
+ /* use for debugging (help compiler put Xn in registers) */
+ const uint64_t *Xptr[16];
+ Xptr[0] = &X00;
+ Xptr[1] = &X01;
+ Xptr[2] = &X02;
+ Xptr[3] = &X03;
+ Xptr[4] = &X04;
+ Xptr[5] = &X05;
+ Xptr[6] = &X06;
+ Xptr[7] = &X07;
+ Xptr[8] = &X08;
+ Xptr[9] = &X09;
+ Xptr[10] = &X10;
+ Xptr[11] = &X11;
+ Xptr[12] = &X12;
+ Xptr[13] = &X13;
+ Xptr[14] = &X14;
+ Xptr[15] = &X15;
+#endif
+
+ Skein_assert(blkCnt != 0); /* never call with blkCnt == 0! */
+ ts[0] = ctx->h.T[0];
+ ts[1] = ctx->h.T[1];
+ do {
+ /*
+ * this implementation only supports 2**64 input bytes
+ * (no carry out here)
+ */
+ ts[0] += byteCntAdd; /* update processed length */
+
+ /* precompute the key schedule for this block */
+ ks[0] = ctx->X[0];
+ ks[1] = ctx->X[1];
+ ks[2] = ctx->X[2];
+ ks[3] = ctx->X[3];
+ ks[4] = ctx->X[4];
+ ks[5] = ctx->X[5];
+ ks[6] = ctx->X[6];
+ ks[7] = ctx->X[7];
+ ks[8] = ctx->X[8];
+ ks[9] = ctx->X[9];
+ ks[10] = ctx->X[10];
+ ks[11] = ctx->X[11];
+ ks[12] = ctx->X[12];
+ ks[13] = ctx->X[13];
+ ks[14] = ctx->X[14];
+ ks[15] = ctx->X[15];
+ ks[16] = ks[0] ^ ks[1] ^ ks[2] ^ ks[3] ^
+ ks[4] ^ ks[5] ^ ks[6] ^ ks[7] ^
+ ks[8] ^ ks[9] ^ ks[10] ^ ks[11] ^
+ ks[12] ^ ks[13] ^ ks[14] ^ ks[15] ^ SKEIN_KS_PARITY;
+
+ ts[2] = ts[0] ^ ts[1];
+
+ /* get input block in little-endian format */
+ Skein_Get64_LSB_First(w, blkPtr, WCNT);
+ DebugSaveTweak(ctx);
+ Skein_Show_Block(BLK_BITS, &ctx->h, ctx->X, blkPtr, w, ks, ts);
+
+ X00 = w[0] + ks[0]; /* do the first full key injection */
+ X01 = w[1] + ks[1];
+ X02 = w[2] + ks[2];
+ X03 = w[3] + ks[3];
+ X04 = w[4] + ks[4];
+ X05 = w[5] + ks[5];
+ X06 = w[6] + ks[6];
+ X07 = w[7] + ks[7];
+ X08 = w[8] + ks[8];
+ X09 = w[9] + ks[9];
+ X10 = w[10] + ks[10];
+ X11 = w[11] + ks[11];
+ X12 = w[12] + ks[12];
+ X13 = w[13] + ks[13] + ts[0];
+ X14 = w[14] + ks[14] + ts[1];
+ X15 = w[15] + ks[15];
+
+ Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INITIAL,
+ Xptr);
+
+#define Round1024(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pA, pB, pC, \
+ pD, pE, pF, ROT, rNum) \
+ X##p0 += X##p1; X##p1 = RotL_64(X##p1, ROT##_0); X##p1 ^= X##p0;\
+ X##p2 += X##p3; X##p3 = RotL_64(X##p3, ROT##_1); X##p3 ^= X##p2;\
+ X##p4 += X##p5; X##p5 = RotL_64(X##p5, ROT##_2); X##p5 ^= X##p4;\
+ X##p6 += X##p7; X##p7 = RotL_64(X##p7, ROT##_3); X##p7 ^= X##p6;\
+ X##p8 += X##p9; X##p9 = RotL_64(X##p9, ROT##_4); X##p9 ^= X##p8;\
+ X##pA += X##pB; X##pB = RotL_64(X##pB, ROT##_5); X##pB ^= X##pA;\
+ X##pC += X##pD; X##pD = RotL_64(X##pD, ROT##_6); X##pD ^= X##pC;\
+ X##pE += X##pF; X##pF = RotL_64(X##pF, ROT##_7); X##pF ^= X##pE;
+
+#if SKEIN_UNROLL_1024 == 0
+#define R1024(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pA, pB, pC, pD, \
+ pE, pF, ROT, rn) \
+ Round1024(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pA, pB, pC, \
+ pD, pE, pF, ROT, rn) \
+ Skein_Show_R_Ptr(BLK_BITS, &ctx->h, rn, Xptr);
+
+#define I1024(R) \
+ X00 += ks[((R) + 1) % 17]; /* inject the key schedule value */\
+ X01 += ks[((R) + 2) % 17]; \
+ X02 += ks[((R) + 3) % 17]; \
+ X03 += ks[((R) + 4) % 17]; \
+ X04 += ks[((R) + 5) % 17]; \
+ X05 += ks[((R) + 6) % 17]; \
+ X06 += ks[((R) + 7) % 17]; \
+ X07 += ks[((R) + 8) % 17]; \
+ X08 += ks[((R) + 9) % 17]; \
+ X09 += ks[((R) + 10) % 17]; \
+ X10 += ks[((R) + 11) % 17]; \
+ X11 += ks[((R) + 12) % 17]; \
+ X12 += ks[((R) + 13) % 17]; \
+ X13 += ks[((R) + 14) % 17] + ts[((R) + 1) % 3]; \
+ X14 += ks[((R) + 15) % 17] + ts[((R) + 2) % 3]; \
+ X15 += ks[((R) + 16) % 17] + (R) + 1; \
+ Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr);
+#else /* looping version */
+#define R1024(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pA, pB, pC, pD, \
+ pE, pF, ROT, rn) \
+ Round1024(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pA, pB, pC, \
+ pD, pE, pF, ROT, rn) \
+ Skein_Show_R_Ptr(BLK_BITS, &ctx->h, 4 * (r - 1) + rn, Xptr);
+
+#define I1024(R) \
+ X00 += ks[r + (R) + 0]; /* inject the key schedule value */ \
+ X01 += ks[r + (R) + 1]; \
+ X02 += ks[r + (R) + 2]; \
+ X03 += ks[r + (R) + 3]; \
+ X04 += ks[r + (R) + 4]; \
+ X05 += ks[r + (R) + 5]; \
+ X06 += ks[r + (R) + 6]; \
+ X07 += ks[r + (R) + 7]; \
+ X08 += ks[r + (R) + 8]; \
+ X09 += ks[r + (R) + 9]; \
+ X10 += ks[r + (R) + 10]; \
+ X11 += ks[r + (R) + 11]; \
+ X12 += ks[r + (R) + 12]; \
+ X13 += ks[r + (R) + 13] + ts[r + (R) + 0]; \
+ X14 += ks[r + (R) + 14] + ts[r + (R) + 1]; \
+ X15 += ks[r + (R) + 15] + r + (R); \
+ ks[r + (R) + 16] = ks[r + (R) - 1]; /* rotate key schedule */\
+ ts[r + (R) + 2] = ts[r + (R) - 1]; \
+ Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr);
+
+ /* loop through it */
+ for (r = 1; r <= 2 * RCNT; r += 2 * SKEIN_UNROLL_1024)
+#endif
+ {
+#define R1024_8_rounds(R) /* do 8 full rounds */ \
+ R1024(00, 01, 02, 03, 04, 05, 06, 07, 08, 09, 10, 11, 12, 13, \
+ 14, 15, R1024_0, 8 * (R) + 1); \
+ R1024(00, 09, 02, 13, 06, 11, 04, 15, 10, 07, 12, 03, 14, 05, \
+ 08, 01, R1024_1, 8 * (R) + 2); \
+ R1024(00, 07, 02, 05, 04, 03, 06, 01, 12, 15, 14, 13, 08, 11, \
+ 10, 09, R1024_2, 8 * (R) + 3); \
+ R1024(00, 15, 02, 11, 06, 13, 04, 09, 14, 01, 08, 05, 10, 03, \
+ 12, 07, R1024_3, 8 * (R) + 4); \
+ I1024(2 * (R)); \
+ R1024(00, 01, 02, 03, 04, 05, 06, 07, 08, 09, 10, 11, 12, 13, \
+ 14, 15, R1024_4, 8 * (R) + 5); \
+ R1024(00, 09, 02, 13, 06, 11, 04, 15, 10, 07, 12, 03, 14, 05, \
+ 08, 01, R1024_5, 8 * (R) + 6); \
+ R1024(00, 07, 02, 05, 04, 03, 06, 01, 12, 15, 14, 13, 08, 11, \
+ 10, 09, R1024_6, 8 * (R) + 7); \
+ R1024(00, 15, 02, 11, 06, 13, 04, 09, 14, 01, 08, 05, 10, 03, \
+ 12, 07, R1024_7, 8 * (R) + 8); \
+ I1024(2 * (R) + 1);
+
+ R1024_8_rounds(0);
+
+#define R1024_Unroll_R(NN) \
+ ((SKEIN_UNROLL_1024 == 0 && SKEIN1024_ROUNDS_TOTAL/8 > (NN)) || \
+ (SKEIN_UNROLL_1024 > (NN)))
+
+#if R1024_Unroll_R(1)
+ R1024_8_rounds(1);
+#endif
+#if R1024_Unroll_R(2)
+ R1024_8_rounds(2);
+#endif
+#if R1024_Unroll_R(3)
+ R1024_8_rounds(3);
+#endif
+#if R1024_Unroll_R(4)
+ R1024_8_rounds(4);
+#endif
+#if R1024_Unroll_R(5)
+ R1024_8_rounds(5);
+#endif
+#if R1024_Unroll_R(6)
+ R1024_8_rounds(6);
+#endif
+#if R1024_Unroll_R(7)
+ R1024_8_rounds(7);
+#endif
+#if R1024_Unroll_R(8)
+ R1024_8_rounds(8);
+#endif
+#if R1024_Unroll_R(9)
+ R1024_8_rounds(9);
+#endif
+#if R1024_Unroll_R(10)
+ R1024_8_rounds(10);
+#endif
+#if R1024_Unroll_R(11)
+ R1024_8_rounds(11);
+#endif
+#if R1024_Unroll_R(12)
+ R1024_8_rounds(12);
+#endif
+#if R1024_Unroll_R(13)
+ R1024_8_rounds(13);
+#endif
+#if R1024_Unroll_R(14)
+ R1024_8_rounds(14);
+#endif
+#if (SKEIN_UNROLL_1024 > 14)
+#error "need more unrolling in Skein_1024_Process_Block"
+#endif
+ }
+ /*
+ * do the final "feedforward" xor, update context chaining vars
+ */
+
+ ctx->X[0] = X00 ^ w[0];
+ ctx->X[1] = X01 ^ w[1];
+ ctx->X[2] = X02 ^ w[2];
+ ctx->X[3] = X03 ^ w[3];
+ ctx->X[4] = X04 ^ w[4];
+ ctx->X[5] = X05 ^ w[5];
+ ctx->X[6] = X06 ^ w[6];
+ ctx->X[7] = X07 ^ w[7];
+ ctx->X[8] = X08 ^ w[8];
+ ctx->X[9] = X09 ^ w[9];
+ ctx->X[10] = X10 ^ w[10];
+ ctx->X[11] = X11 ^ w[11];
+ ctx->X[12] = X12 ^ w[12];
+ ctx->X[13] = X13 ^ w[13];
+ ctx->X[14] = X14 ^ w[14];
+ ctx->X[15] = X15 ^ w[15];
+
+ Skein_Show_Round(BLK_BITS, &ctx->h, SKEIN_RND_FEED_FWD, ctx->X);
+
+ ts[1] &= ~SKEIN_T1_FLAG_FIRST;
+ blkPtr += SKEIN1024_BLOCK_BYTES;
+ } while (--blkCnt);
+ ctx->h.T[0] = ts[0];
+ ctx->h.T[1] = ts[1];
+}
+
+#if defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF)
+size_t
+Skein1024_Process_Block_CodeSize(void)
+{
+ return ((uint8_t *)Skein1024_Process_Block_CodeSize) -
+ ((uint8_t *)Skein1024_Process_Block);
+}
+
+uint_t
+Skein1024_Unroll_Cnt(void)
+{
+ return (SKEIN_UNROLL_1024);
+}
+#endif
+#endif
diff --git a/sys/contrib/openzfs/module/icp/algs/skein/skein_impl.h b/sys/contrib/openzfs/module/icp/algs/skein/skein_impl.h
new file mode 100644
index 000000000000..205a517d69db
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/algs/skein/skein_impl.h
@@ -0,0 +1,292 @@
+/*
+ * Internal definitions for Skein hashing.
+ * Source code author: Doug Whiting, 2008.
+ * This algorithm and source code is released to the public domain.
+ *
+ * The following compile-time switches may be defined to control some
+ * tradeoffs between speed, code size, error checking, and security.
+ *
+ * The "default" note explains what happens when the switch is not defined.
+ *
+ * SKEIN_DEBUG -- make callouts from inside Skein code
+ * to examine/display intermediate values.
+ * [default: no callouts (no overhead)]
+ *
+ * SKEIN_ERR_CHECK -- how error checking is handled inside Skein
+ * code. If not defined, most error checking
+ * is disabled (for performance). Otherwise,
+ * the switch value is interpreted as:
+ * 0: use assert() to flag errors
+ * 1: return SKEIN_FAIL to flag errors
+ */
+/* Copyright 2013 Doug Whiting. This code is released to the public domain. */
+
+#ifndef _SKEIN_IMPL_H_
+#define _SKEIN_IMPL_H_
+
+#include <sys/skein.h>
+#include <sys/strings.h>
+#include <sys/note.h>
+#include "skein_impl.h"
+#include "skein_port.h"
+
+/*
+ * "Internal" Skein definitions
+ * -- not needed for sequential hashing API, but will be
+ * helpful for other uses of Skein (e.g., tree hash mode).
+ * -- included here so that they can be shared between
+ * reference and optimized code.
+ */
+
+/* tweak word T[1]: bit field starting positions */
+/* offset 64 because it's the second word */
+#define SKEIN_T1_BIT(BIT) ((BIT) - 64)
+
+/* bits 112..118: level in hash tree */
+#define SKEIN_T1_POS_TREE_LVL SKEIN_T1_BIT(112)
+/* bit 119: partial final input byte */
+#define SKEIN_T1_POS_BIT_PAD SKEIN_T1_BIT(119)
+/* bits 120..125: type field */
+#define SKEIN_T1_POS_BLK_TYPE SKEIN_T1_BIT(120)
+/* bit 126: first block flag */
+#define SKEIN_T1_POS_FIRST SKEIN_T1_BIT(126)
+/* bit 127: final block flag */
+#define SKEIN_T1_POS_FINAL SKEIN_T1_BIT(127)
+
+/* tweak word T[1]: flag bit definition(s) */
+#define SKEIN_T1_FLAG_FIRST (((uint64_t)1) << SKEIN_T1_POS_FIRST)
+#define SKEIN_T1_FLAG_FINAL (((uint64_t)1) << SKEIN_T1_POS_FINAL)
+#define SKEIN_T1_FLAG_BIT_PAD (((uint64_t)1) << SKEIN_T1_POS_BIT_PAD)
+
+/* tweak word T[1]: tree level bit field mask */
+#define SKEIN_T1_TREE_LVL_MASK (((uint64_t)0x7F) << SKEIN_T1_POS_TREE_LVL)
+#define SKEIN_T1_TREE_LEVEL(n) (((uint64_t)(n)) << SKEIN_T1_POS_TREE_LVL)
+
+/* tweak word T[1]: block type field */
+#define SKEIN_BLK_TYPE_KEY (0) /* key, for MAC and KDF */
+#define SKEIN_BLK_TYPE_CFG (4) /* configuration block */
+#define SKEIN_BLK_TYPE_PERS (8) /* personalization string */
+#define SKEIN_BLK_TYPE_PK (12) /* public key (for signature hashing) */
+#define SKEIN_BLK_TYPE_KDF (16) /* key identifier for KDF */
+#define SKEIN_BLK_TYPE_NONCE (20) /* nonce for PRNG */
+#define SKEIN_BLK_TYPE_MSG (48) /* message processing */
+#define SKEIN_BLK_TYPE_OUT (63) /* output stage */
+#define SKEIN_BLK_TYPE_MASK (63) /* bit field mask */
+
+#define SKEIN_T1_BLK_TYPE(T) \
+ (((uint64_t)(SKEIN_BLK_TYPE_##T)) << SKEIN_T1_POS_BLK_TYPE)
+/* key, for MAC and KDF */
+#define SKEIN_T1_BLK_TYPE_KEY SKEIN_T1_BLK_TYPE(KEY)
+/* configuration block */
+#define SKEIN_T1_BLK_TYPE_CFG SKEIN_T1_BLK_TYPE(CFG)
+/* personalization string */
+#define SKEIN_T1_BLK_TYPE_PERS SKEIN_T1_BLK_TYPE(PERS)
+/* public key (for digital signature hashing) */
+#define SKEIN_T1_BLK_TYPE_PK SKEIN_T1_BLK_TYPE(PK)
+/* key identifier for KDF */
+#define SKEIN_T1_BLK_TYPE_KDF SKEIN_T1_BLK_TYPE(KDF)
+/* nonce for PRNG */
+#define SKEIN_T1_BLK_TYPE_NONCE SKEIN_T1_BLK_TYPE(NONCE)
+/* message processing */
+#define SKEIN_T1_BLK_TYPE_MSG SKEIN_T1_BLK_TYPE(MSG)
+/* output stage */
+#define SKEIN_T1_BLK_TYPE_OUT SKEIN_T1_BLK_TYPE(OUT)
+/* field bit mask */
+#define SKEIN_T1_BLK_TYPE_MASK SKEIN_T1_BLK_TYPE(MASK)
+
+#define SKEIN_T1_BLK_TYPE_CFG_FINAL \
+ (SKEIN_T1_BLK_TYPE_CFG | SKEIN_T1_FLAG_FINAL)
+#define SKEIN_T1_BLK_TYPE_OUT_FINAL \
+ (SKEIN_T1_BLK_TYPE_OUT | SKEIN_T1_FLAG_FINAL)
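+
+/*
+ * Worked example (editor's note, not in the original header): for the
+ * final CONFIG block the tweak word T[1] is assembled as
+ *
+ *	SKEIN_T1_BLK_TYPE_CFG_FINAL
+ *	    = ((uint64_t)SKEIN_BLK_TYPE_CFG << SKEIN_T1_POS_BLK_TYPE)
+ *	    | SKEIN_T1_FLAG_FINAL
+ *	    = (4ULL << 56) | (1ULL << 63)
+ *
+ * i.e. block type CFG in bits 120..125 plus the FINAL flag in bit 127
+ * (bit positions are relative to the 128-bit tweak, hence the -64 in
+ * SKEIN_T1_BIT() above).
+ */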
+
+#define SKEIN_VERSION (1)
+
+#ifndef SKEIN_ID_STRING_LE /* allow compile-time personalization */
+#define SKEIN_ID_STRING_LE (0x33414853) /* "SHA3" (little-endian) */
+#endif
+
+#define SKEIN_MK_64(hi32, lo32) ((lo32) + (((uint64_t)(hi32)) << 32))
+#define SKEIN_SCHEMA_VER SKEIN_MK_64(SKEIN_VERSION, SKEIN_ID_STRING_LE)
+#define SKEIN_KS_PARITY SKEIN_MK_64(0x1BD11BDA, 0xA9FC1A22)
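+
+/*
+ * Editor's note (illustrative): SKEIN_SCHEMA_VER evaluates to
+ * 0x0000000133414853 -- version 1 in the upper half and the ASCII bytes
+ * "SHA3" (0x53 0x48 0x41 0x33, little-endian) in the lower half; this is
+ * the schema identifier written into cfg.w[0] by the Init routines.
+ */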
+
+#define SKEIN_CFG_STR_LEN (4*8)
+
+/* bit field definitions in config block treeInfo word */
+#define SKEIN_CFG_TREE_LEAF_SIZE_POS (0)
+#define SKEIN_CFG_TREE_NODE_SIZE_POS (8)
+#define SKEIN_CFG_TREE_MAX_LEVEL_POS (16)
+
+#define SKEIN_CFG_TREE_LEAF_SIZE_MSK \
+ (((uint64_t)0xFF) << SKEIN_CFG_TREE_LEAF_SIZE_POS)
+#define SKEIN_CFG_TREE_NODE_SIZE_MSK \
+ (((uint64_t)0xFF) << SKEIN_CFG_TREE_NODE_SIZE_POS)
+#define SKEIN_CFG_TREE_MAX_LEVEL_MSK \
+ (((uint64_t)0xFF) << SKEIN_CFG_TREE_MAX_LEVEL_POS)
+
+#define SKEIN_CFG_TREE_INFO(leaf, node, maxLvl) \
+ ((((uint64_t)(leaf)) << SKEIN_CFG_TREE_LEAF_SIZE_POS) | \
+ (((uint64_t)(node)) << SKEIN_CFG_TREE_NODE_SIZE_POS) | \
+ (((uint64_t)(maxLvl)) << SKEIN_CFG_TREE_MAX_LEVEL_POS))
+
+/* use as treeInfo in InitExt() call for sequential processing */
+#define SKEIN_CFG_TREE_INFO_SEQUENTIAL SKEIN_CFG_TREE_INFO(0, 0, 0)
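+
+/*
+ * Worked example (editor's note): SKEIN_CFG_TREE_INFO() just packs three
+ * byte-wide fields, so e.g.
+ *
+ *	SKEIN_CFG_TREE_INFO(1, 2, 3) == 0x0000000000030201ULL
+ *
+ * while the sequential (non-tree) value above is simply 0.
+ */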
+
+/*
+ * Skein macros for getting/setting tweak words, etc.
+ * These are useful for partial input bytes, hash tree init/update, etc.
+ */
+#define Skein_Get_Tweak(ctxPtr, TWK_NUM) ((ctxPtr)->h.T[TWK_NUM])
+#define Skein_Set_Tweak(ctxPtr, TWK_NUM, tVal) \
+ do { \
+ (ctxPtr)->h.T[TWK_NUM] = (tVal); \
+ _NOTE(CONSTCOND) \
+ } while (0)
+
+#define Skein_Get_T0(ctxPtr) Skein_Get_Tweak(ctxPtr, 0)
+#define Skein_Get_T1(ctxPtr) Skein_Get_Tweak(ctxPtr, 1)
+#define Skein_Set_T0(ctxPtr, T0) Skein_Set_Tweak(ctxPtr, 0, T0)
+#define Skein_Set_T1(ctxPtr, T1) Skein_Set_Tweak(ctxPtr, 1, T1)
+
+/* set both tweak words at once */
+#define Skein_Set_T0_T1(ctxPtr, T0, T1) \
+ do { \
+ Skein_Set_T0(ctxPtr, (T0)); \
+ Skein_Set_T1(ctxPtr, (T1)); \
+ _NOTE(CONSTCOND) \
+ } while (0)
+
+#define Skein_Set_Type(ctxPtr, BLK_TYPE) \
+ Skein_Set_T1(ctxPtr, SKEIN_T1_BLK_TYPE_##BLK_TYPE)
+
+/*
+ * set up for starting with a new type: h.T[0]=0; h.T[1] = NEW_TYPE; h.bCnt=0;
+ */
+#define Skein_Start_New_Type(ctxPtr, BLK_TYPE) \
+ do { \
+ Skein_Set_T0_T1(ctxPtr, 0, SKEIN_T1_FLAG_FIRST | \
+ SKEIN_T1_BLK_TYPE_ ## BLK_TYPE); \
+ (ctxPtr)->h.bCnt = 0; \
+ _NOTE(CONSTCOND) \
+ } while (0)
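+
+/*
+ * Expansion example (editor's note, not in the original header):
+ * Skein_Start_New_Type(ctx, MSG) boils down to
+ *
+ *	ctx->h.T[0] = 0;
+ *	ctx->h.T[1] = SKEIN_T1_FLAG_FIRST | SKEIN_T1_BLK_TYPE_MSG;
+ *	ctx->h.bCnt = 0;
+ *
+ * which is how skein.c switches the tweak over to message processing
+ * once the CONFIG (or KEY) block has been absorbed.
+ */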
+
+#define Skein_Clear_First_Flag(hdr) \
+ do { \
+ (hdr).T[1] &= ~SKEIN_T1_FLAG_FIRST; \
+ _NOTE(CONSTCOND) \
+ } while (0)
+#define Skein_Set_Bit_Pad_Flag(hdr) \
+ do { \
+ (hdr).T[1] |= SKEIN_T1_FLAG_BIT_PAD; \
+ _NOTE(CONSTCOND) \
+ } while (0)
+
+#define Skein_Set_Tree_Level(hdr, height) \
+ do { \
+ (hdr).T[1] |= SKEIN_T1_TREE_LEVEL(height); \
+ _NOTE(CONSTCOND) \
+ } while (0)
+
+/*
+ * "Internal" Skein definitions for debugging and error checking
+ * Note: in Illumos we always disable debugging features.
+ */
+#define Skein_Show_Block(bits, ctx, X, blkPtr, wPtr, ksEvenPtr, ksOddPtr)
+#define Skein_Show_Round(bits, ctx, r, X)
+#define Skein_Show_R_Ptr(bits, ctx, r, X_ptr)
+#define Skein_Show_Final(bits, ctx, cnt, outPtr)
+#define Skein_Show_Key(bits, ctx, key, keyBytes)
+
+/* run-time checks (e.g., bad params, uninitialized context)? */
+#ifndef SKEIN_ERR_CHECK
+/* default: ignore all Asserts, for performance */
+#define Skein_Assert(x, retCode)
+#define Skein_assert(x)
+#elif defined(SKEIN_ASSERT)
+#include <sys/debug.h>
+#define Skein_Assert(x, retCode) ASSERT(x)
+#define Skein_assert(x) ASSERT(x)
+#else
+#include <sys/debug.h>
+/* caller error */
+#define Skein_Assert(x, retCode) \
+ do { \
+ if (!(x)) \
+ return (retCode); \
+ _NOTE(CONSTCOND) \
+ } while (0)
+/* internal error */
+#define Skein_assert(x) ASSERT(x)
+#endif
+
+/*
+ * Skein block function constants (shared across Ref and Opt code)
+ */
+enum {
+ /* Skein_256 round rotation constants */
+ R_256_0_0 = 14, R_256_0_1 = 16,
+ R_256_1_0 = 52, R_256_1_1 = 57,
+ R_256_2_0 = 23, R_256_2_1 = 40,
+ R_256_3_0 = 5, R_256_3_1 = 37,
+ R_256_4_0 = 25, R_256_4_1 = 33,
+ R_256_5_0 = 46, R_256_5_1 = 12,
+ R_256_6_0 = 58, R_256_6_1 = 22,
+ R_256_7_0 = 32, R_256_7_1 = 32,
+
+ /* Skein_512 round rotation constants */
+ R_512_0_0 = 46, R_512_0_1 = 36, R_512_0_2 = 19, R_512_0_3 = 37,
+ R_512_1_0 = 33, R_512_1_1 = 27, R_512_1_2 = 14, R_512_1_3 = 42,
+ R_512_2_0 = 17, R_512_2_1 = 49, R_512_2_2 = 36, R_512_2_3 = 39,
+ R_512_3_0 = 44, R_512_3_1 = 9, R_512_3_2 = 54, R_512_3_3 = 56,
+ R_512_4_0 = 39, R_512_4_1 = 30, R_512_4_2 = 34, R_512_4_3 = 24,
+ R_512_5_0 = 13, R_512_5_1 = 50, R_512_5_2 = 10, R_512_5_3 = 17,
+ R_512_6_0 = 25, R_512_6_1 = 29, R_512_6_2 = 39, R_512_6_3 = 43,
+ R_512_7_0 = 8, R_512_7_1 = 35, R_512_7_2 = 56, R_512_7_3 = 22,
+
+ /* Skein1024 round rotation constants */
+ R1024_0_0 = 24, R1024_0_1 = 13, R1024_0_2 = 8, R1024_0_3 = 47,
+ R1024_0_4 = 8, R1024_0_5 = 17, R1024_0_6 = 22, R1024_0_7 = 37,
+ R1024_1_0 = 38, R1024_1_1 = 19, R1024_1_2 = 10, R1024_1_3 = 55,
+ R1024_1_4 = 49, R1024_1_5 = 18, R1024_1_6 = 23, R1024_1_7 = 52,
+ R1024_2_0 = 33, R1024_2_1 = 4, R1024_2_2 = 51, R1024_2_3 = 13,
+ R1024_2_4 = 34, R1024_2_5 = 41, R1024_2_6 = 59, R1024_2_7 = 17,
+ R1024_3_0 = 5, R1024_3_1 = 20, R1024_3_2 = 48, R1024_3_3 = 41,
+ R1024_3_4 = 47, R1024_3_5 = 28, R1024_3_6 = 16, R1024_3_7 = 25,
+ R1024_4_0 = 41, R1024_4_1 = 9, R1024_4_2 = 37, R1024_4_3 = 31,
+ R1024_4_4 = 12, R1024_4_5 = 47, R1024_4_6 = 44, R1024_4_7 = 30,
+ R1024_5_0 = 16, R1024_5_1 = 34, R1024_5_2 = 56, R1024_5_3 = 51,
+ R1024_5_4 = 4, R1024_5_5 = 53, R1024_5_6 = 42, R1024_5_7 = 41,
+ R1024_6_0 = 31, R1024_6_1 = 44, R1024_6_2 = 47, R1024_6_3 = 46,
+ R1024_6_4 = 19, R1024_6_5 = 42, R1024_6_6 = 44, R1024_6_7 = 25,
+ R1024_7_0 = 9, R1024_7_1 = 48, R1024_7_2 = 35, R1024_7_3 = 52,
+ R1024_7_4 = 23, R1024_7_5 = 31, R1024_7_6 = 37, R1024_7_7 = 20
+};
+
+/* number of rounds for the different block sizes */
+#define SKEIN_256_ROUNDS_TOTAL (72)
+#define SKEIN_512_ROUNDS_TOTAL (72)
+#define SKEIN1024_ROUNDS_TOTAL (80)
+
+extern const uint64_t SKEIN_256_IV_128[];
+extern const uint64_t SKEIN_256_IV_160[];
+extern const uint64_t SKEIN_256_IV_224[];
+extern const uint64_t SKEIN_256_IV_256[];
+extern const uint64_t SKEIN_512_IV_128[];
+extern const uint64_t SKEIN_512_IV_160[];
+extern const uint64_t SKEIN_512_IV_224[];
+extern const uint64_t SKEIN_512_IV_256[];
+extern const uint64_t SKEIN_512_IV_384[];
+extern const uint64_t SKEIN_512_IV_512[];
+extern const uint64_t SKEIN1024_IV_384[];
+extern const uint64_t SKEIN1024_IV_512[];
+extern const uint64_t SKEIN1024_IV_1024[];
+
+/* Functions to process blkCnt (nonzero) full block(s) of data. */
+void Skein_256_Process_Block(Skein_256_Ctxt_t *ctx, const uint8_t *blkPtr,
+ size_t blkCnt, size_t byteCntAdd);
+void Skein_512_Process_Block(Skein_512_Ctxt_t *ctx, const uint8_t *blkPtr,
+ size_t blkCnt, size_t byteCntAdd);
+void Skein1024_Process_Block(Skein1024_Ctxt_t *ctx, const uint8_t *blkPtr,
+ size_t blkCnt, size_t byteCntAdd);
+
+#endif /* _SKEIN_IMPL_H_ */
diff --git a/sys/contrib/openzfs/module/icp/algs/skein/skein_iv.c b/sys/contrib/openzfs/module/icp/algs/skein/skein_iv.c
new file mode 100644
index 000000000000..140d38f76547
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/algs/skein/skein_iv.c
@@ -0,0 +1,185 @@
+/*
+ * Pre-computed Skein IVs
+ *
+ * NOTE: these values are not "magic" constants, but
+ * are generated using the Threefish block function.
+ * They are pre-computed here only for speed; i.e., to
+ * avoid the need for a Threefish call during Init().
+ *
+ * The IV for any fixed hash length may be pre-computed.
+ * Only the most common values are included here.
+ */
+/* Copyright 2013 Doug Whiting. This code is released to the public domain. */
+/*
+ * Illumos implementation note: these constants are for Skein v1.3 as per:
+ * http://www.skein-hash.info/sites/default/files/skein1.3.pdf
+ */
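+
+/*
+ * Editor's sketch (not part of the source): any entry below can be
+ * regenerated the way the "default" branch of the *_Init() routines in
+ * skein.c does it -- start from an all-zero state and push the CONFIG
+ * block through the block function.  For SKEIN_256_IV_256, with "ctx"
+ * and "cfg" declared as in Skein_256_Init():
+ *
+ *	ctx->h.hashBitLen = 256;
+ *	Skein_Start_New_Type(ctx, CFG_FINAL);
+ *	cfg.w[0] = Skein_Swap64(SKEIN_SCHEMA_VER);
+ *	cfg.w[1] = Skein_Swap64(256);
+ *	cfg.w[2] = Skein_Swap64(SKEIN_CFG_TREE_INFO_SEQUENTIAL);
+ *	bzero(&cfg.w[3], sizeof (cfg) - 3 * sizeof (cfg.w[0]));
+ *	bzero(ctx->X, sizeof (ctx->X));
+ *	Skein_256_Process_Block(ctx, cfg.b, 1, SKEIN_CFG_STR_LEN);
+ *
+ * after which ctx->X[] holds the four words of SKEIN_256_IV_256.
+ */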
+
+#include <sys/skein.h> /* get Skein macros and types */
+#include "skein_impl.h" /* get internal definitions */
+
+#define MK_64 SKEIN_MK_64
+
+/* blkSize = 256 bits. hashSize = 128 bits */
+const uint64_t SKEIN_256_IV_128[] = {
+ MK_64(0xE1111906, 0x964D7260),
+ MK_64(0x883DAAA7, 0x7C8D811C),
+ MK_64(0x10080DF4, 0x91960F7A),
+ MK_64(0xCCF7DDE5, 0xB45BC1C2)
+};
+
+/* blkSize = 256 bits. hashSize = 160 bits */
+const uint64_t SKEIN_256_IV_160[] = {
+ MK_64(0x14202314, 0x72825E98),
+ MK_64(0x2AC4E9A2, 0x5A77E590),
+ MK_64(0xD47A5856, 0x8838D63E),
+ MK_64(0x2DD2E496, 0x8586AB7D)
+};
+
+/* blkSize = 256 bits. hashSize = 224 bits */
+const uint64_t SKEIN_256_IV_224[] = {
+ MK_64(0xC6098A8C, 0x9AE5EA0B),
+ MK_64(0x876D5686, 0x08C5191C),
+ MK_64(0x99CB88D7, 0xD7F53884),
+ MK_64(0x384BDDB1, 0xAEDDB5DE)
+};
+
+/* blkSize = 256 bits. hashSize = 256 bits */
+const uint64_t SKEIN_256_IV_256[] = {
+ MK_64(0xFC9DA860, 0xD048B449),
+ MK_64(0x2FCA6647, 0x9FA7D833),
+ MK_64(0xB33BC389, 0x6656840F),
+ MK_64(0x6A54E920, 0xFDE8DA69)
+};
+
+/* blkSize = 512 bits. hashSize = 128 bits */
+const uint64_t SKEIN_512_IV_128[] = {
+ MK_64(0xA8BC7BF3, 0x6FBF9F52),
+ MK_64(0x1E9872CE, 0xBD1AF0AA),
+ MK_64(0x309B1790, 0xB32190D3),
+ MK_64(0xBCFBB854, 0x3F94805C),
+ MK_64(0x0DA61BCD, 0x6E31B11B),
+ MK_64(0x1A18EBEA, 0xD46A32E3),
+ MK_64(0xA2CC5B18, 0xCE84AA82),
+ MK_64(0x6982AB28, 0x9D46982D)
+};
+
+/* blkSize = 512 bits. hashSize = 160 bits */
+const uint64_t SKEIN_512_IV_160[] = {
+ MK_64(0x28B81A2A, 0xE013BD91),
+ MK_64(0xC2F11668, 0xB5BDF78F),
+ MK_64(0x1760D8F3, 0xF6A56F12),
+ MK_64(0x4FB74758, 0x8239904F),
+ MK_64(0x21EDE07F, 0x7EAF5056),
+ MK_64(0xD908922E, 0x63ED70B8),
+ MK_64(0xB8EC76FF, 0xECCB52FA),
+ MK_64(0x01A47BB8, 0xA3F27A6E)
+};
+
+/* blkSize = 512 bits. hashSize = 224 bits */
+const uint64_t SKEIN_512_IV_224[] = {
+ MK_64(0xCCD06162, 0x48677224),
+ MK_64(0xCBA65CF3, 0xA92339EF),
+ MK_64(0x8CCD69D6, 0x52FF4B64),
+ MK_64(0x398AED7B, 0x3AB890B4),
+ MK_64(0x0F59D1B1, 0x457D2BD0),
+ MK_64(0x6776FE65, 0x75D4EB3D),
+ MK_64(0x99FBC70E, 0x997413E9),
+ MK_64(0x9E2CFCCF, 0xE1C41EF7)
+};
+
+/* blkSize = 512 bits. hashSize = 256 bits */
+const uint64_t SKEIN_512_IV_256[] = {
+ MK_64(0xCCD044A1, 0x2FDB3E13),
+ MK_64(0xE8359030, 0x1A79A9EB),
+ MK_64(0x55AEA061, 0x4F816E6F),
+ MK_64(0x2A2767A4, 0xAE9B94DB),
+ MK_64(0xEC06025E, 0x74DD7683),
+ MK_64(0xE7A436CD, 0xC4746251),
+ MK_64(0xC36FBAF9, 0x393AD185),
+ MK_64(0x3EEDBA18, 0x33EDFC13)
+};
+
+/* blkSize = 512 bits. hashSize = 384 bits */
+const uint64_t SKEIN_512_IV_384[] = {
+ MK_64(0xA3F6C6BF, 0x3A75EF5F),
+ MK_64(0xB0FEF9CC, 0xFD84FAA4),
+ MK_64(0x9D77DD66, 0x3D770CFE),
+ MK_64(0xD798CBF3, 0xB468FDDA),
+ MK_64(0x1BC4A666, 0x8A0E4465),
+ MK_64(0x7ED7D434, 0xE5807407),
+ MK_64(0x548FC1AC, 0xD4EC44D6),
+ MK_64(0x266E1754, 0x6AA18FF8)
+};
+
+/* blkSize = 512 bits. hashSize = 512 bits */
+const uint64_t SKEIN_512_IV_512[] = {
+ MK_64(0x4903ADFF, 0x749C51CE),
+ MK_64(0x0D95DE39, 0x9746DF03),
+ MK_64(0x8FD19341, 0x27C79BCE),
+ MK_64(0x9A255629, 0xFF352CB1),
+ MK_64(0x5DB62599, 0xDF6CA7B0),
+ MK_64(0xEABE394C, 0xA9D5C3F4),
+ MK_64(0x991112C7, 0x1A75B523),
+ MK_64(0xAE18A40B, 0x660FCC33)
+};
+
+/* blkSize = 1024 bits. hashSize = 384 bits */
+const uint64_t SKEIN1024_IV_384[] = {
+ MK_64(0x5102B6B8, 0xC1894A35),
+ MK_64(0xFEEBC9E3, 0xFE8AF11A),
+ MK_64(0x0C807F06, 0xE32BED71),
+ MK_64(0x60C13A52, 0xB41A91F6),
+ MK_64(0x9716D35D, 0xD4917C38),
+ MK_64(0xE780DF12, 0x6FD31D3A),
+ MK_64(0x797846B6, 0xC898303A),
+ MK_64(0xB172C2A8, 0xB3572A3B),
+ MK_64(0xC9BC8203, 0xA6104A6C),
+ MK_64(0x65909338, 0xD75624F4),
+ MK_64(0x94BCC568, 0x4B3F81A0),
+ MK_64(0x3EBBF51E, 0x10ECFD46),
+ MK_64(0x2DF50F0B, 0xEEB08542),
+ MK_64(0x3B5A6530, 0x0DBC6516),
+ MK_64(0x484B9CD2, 0x167BBCE1),
+ MK_64(0x2D136947, 0xD4CBAFEA)
+};
+
+/* blkSize = 1024 bits. hashSize = 512 bits */
+const uint64_t SKEIN1024_IV_512[] = {
+ MK_64(0xCAEC0E5D, 0x7C1B1B18),
+ MK_64(0xA01B0E04, 0x5F03E802),
+ MK_64(0x33840451, 0xED912885),
+ MK_64(0x374AFB04, 0xEAEC2E1C),
+ MK_64(0xDF25A0E2, 0x813581F7),
+ MK_64(0xE4004093, 0x8B12F9D2),
+ MK_64(0xA662D539, 0xC2ED39B6),
+ MK_64(0xFA8B85CF, 0x45D8C75A),
+ MK_64(0x8316ED8E, 0x29EDE796),
+ MK_64(0x053289C0, 0x2E9F91B8),
+ MK_64(0xC3F8EF1D, 0x6D518B73),
+ MK_64(0xBDCEC3C4, 0xD5EF332E),
+ MK_64(0x549A7E52, 0x22974487),
+ MK_64(0x67070872, 0x5B749816),
+ MK_64(0xB9CD28FB, 0xF0581BD1),
+ MK_64(0x0E2940B8, 0x15804974)
+};
+
+/* blkSize = 1024 bits. hashSize = 1024 bits */
+const uint64_t SKEIN1024_IV_1024[] = {
+ MK_64(0xD593DA07, 0x41E72355),
+ MK_64(0x15B5E511, 0xAC73E00C),
+ MK_64(0x5180E5AE, 0xBAF2C4F0),
+ MK_64(0x03BD41D3, 0xFCBCAFAF),
+ MK_64(0x1CAEC6FD, 0x1983A898),
+ MK_64(0x6E510B8B, 0xCDD0589F),
+ MK_64(0x77E2BDFD, 0xC6394ADA),
+ MK_64(0xC11E1DB5, 0x24DCB0A3),
+ MK_64(0xD6D14AF9, 0xC6329AB5),
+ MK_64(0x6A9B0BFC, 0x6EB67E0D),
+ MK_64(0x9243C60D, 0xCCFF1332),
+ MK_64(0x1A1F1DDE, 0x743F02D4),
+ MK_64(0x0996753C, 0x10ED0BB8),
+ MK_64(0x6572DD22, 0xF2B4969A),
+ MK_64(0x61FD3062, 0xD00A579A),
+ MK_64(0x1DE0536E, 0x8682E539)
+};
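The tables above are consumed by the Skein Init() routines: for a common output size the code can simply copy the matching precomputed IV into the chaining state instead of running Threefish over the configuration block. A minimal sketch of that dispatch for the 512-bit block size follows (illustrative only; the helper name pick_skein512_iv() is hypothetical, and the real selection logic lives in skein.c, outside this hunk):

/*
 * Hypothetical helper: return the precomputed 512-bit-block IV for a
 * supported output size, or NULL when the IV must be computed instead.
 */
static const uint64_t *
pick_skein512_iv(size_t hash_bit_len)
{
	switch (hash_bit_len) {
	case 128:
		return (SKEIN_512_IV_128);
	case 160:
		return (SKEIN_512_IV_160);
	case 224:
		return (SKEIN_512_IV_224);
	case 256:
		return (SKEIN_512_IV_256);
	case 384:
		return (SKEIN_512_IV_384);
	case 512:
		return (SKEIN_512_IV_512);
	default:
		return (NULL);	/* uncommon size: compute via Threefish */
	}
}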
diff --git a/sys/contrib/openzfs/module/icp/algs/skein/skein_port.h b/sys/contrib/openzfs/module/icp/algs/skein/skein_port.h
new file mode 100644
index 000000000000..ce4353082552
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/algs/skein/skein_port.h
@@ -0,0 +1,116 @@
+/*
+ * Platform-specific definitions for Skein hash function.
+ *
+ * Source code author: Doug Whiting, 2008.
+ *
+ * This algorithm and source code is released to the public domain.
+ *
+ * Many thanks to Brian Gladman for his portable header files.
+ *
+ * To port Skein to an "unsupported" platform, change the definitions
+ * in this file appropriately.
+ */
+/* Copyright 2013 Doug Whiting. This code is released to the public domain. */
+
+#ifndef _SKEIN_PORT_H_
+#define _SKEIN_PORT_H_
+
+#include <sys/types.h> /* get integer type definitions */
+
+#ifndef RotL_64
+#define RotL_64(x, N) (((x) << (N)) | ((x) >> (64 - (N))))
+#endif
+
+/*
+ * Skein is "natively" little-endian (unlike SHA-xxx), for optimal
+ * performance on x86 CPUs. The Skein code requires the following
+ * definitions for dealing with endianness:
+ *
+ * SKEIN_NEED_SWAP: 0 for little-endian, 1 for big-endian
+ * Skein_Put64_LSB_First
+ * Skein_Get64_LSB_First
+ * Skein_Swap64
+ *
+ * If SKEIN_NEED_SWAP is defined at compile time, it is used here
+ * along with the portable versions of Put64/Get64/Swap64, which
+ * are slow in general.
+ *
+ * Otherwise, an "auto-detect" of endianness is attempted below.
+ * If the default handling doesn't work well, the user may insert
+ * platform-specific code instead (e.g., for big-endian CPUs).
+ *
+ */
+#ifndef SKEIN_NEED_SWAP /* compile-time "override" for endianness? */
+
+#include <sys/isa_defs.h> /* get endianness selection */
+
+#if defined(_ZFS_BIG_ENDIAN)
+/* here for big-endian CPUs */
+#define SKEIN_NEED_SWAP (1)
+#else
+/* here for x86 and x86-64 CPUs (and other detected little-endian CPUs) */
+#define SKEIN_NEED_SWAP (0)
+#define Skein_Put64_LSB_First(dst08, src64, bCnt) bcopy(src64, dst08, bCnt)
+#define Skein_Get64_LSB_First(dst64, src08, wCnt) \
+ bcopy(src08, dst64, 8 * (wCnt))
+#endif
+
+#endif /* ifndef SKEIN_NEED_SWAP */
+
+/*
+ * Provide any definitions still needed.
+ */
+#ifndef Skein_Swap64 /* swap for big-endian, nop for little-endian */
+#if SKEIN_NEED_SWAP
+#define Skein_Swap64(w64) \
+ (((((uint64_t)(w64)) & 0xFF) << 56) | \
+ (((((uint64_t)(w64)) >> 8) & 0xFF) << 48) | \
+ (((((uint64_t)(w64)) >> 16) & 0xFF) << 40) | \
+ (((((uint64_t)(w64)) >> 24) & 0xFF) << 32) | \
+ (((((uint64_t)(w64)) >> 32) & 0xFF) << 24) | \
+ (((((uint64_t)(w64)) >> 40) & 0xFF) << 16) | \
+ (((((uint64_t)(w64)) >> 48) & 0xFF) << 8) | \
+ (((((uint64_t)(w64)) >> 56) & 0xFF)))
+#else
+#define Skein_Swap64(w64) (w64)
+#endif
+#endif /* ifndef Skein_Swap64 */
+
+#ifndef Skein_Put64_LSB_First
+static inline void
+Skein_Put64_LSB_First(uint8_t *dst, const uint64_t *src, size_t bCnt)
+{
+ /*
+ * this version is fully portable (big-endian or little-endian),
+ * but slow
+ */
+ size_t n;
+
+ for (n = 0; n < bCnt; n++)
+ dst[n] = (uint8_t)(src[n >> 3] >> (8 * (n & 7)));
+}
+#endif /* ifndef Skein_Put64_LSB_First */
+
+#ifndef Skein_Get64_LSB_First
+static inline void
+Skein_Get64_LSB_First(uint64_t *dst, const uint8_t *src, size_t wCnt)
+{
+ /*
+ * this version is fully portable (big-endian or little-endian),
+ * but slow
+ */
+ size_t n;
+
+ for (n = 0; n < 8 * wCnt; n += 8)
+ dst[n / 8] = (((uint64_t)src[n])) +
+ (((uint64_t)src[n + 1]) << 8) +
+ (((uint64_t)src[n + 2]) << 16) +
+ (((uint64_t)src[n + 3]) << 24) +
+ (((uint64_t)src[n + 4]) << 32) +
+ (((uint64_t)src[n + 5]) << 40) +
+ (((uint64_t)src[n + 6]) << 48) +
+ (((uint64_t)src[n + 7]) << 56);
+}
+#endif /* ifndef Skein_Get64_LSB_First */
+
+#endif /* _SKEIN_PORT_H_ */
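To see what the portable fallbacks above do, the standalone userland sketch below (not part of the patch) duplicates their byte-ordering logic — the get side is written as a compact, equivalent loop — and verifies that a put/get round trip is lossless. The helper names are local to this example.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Local copy of Skein_Put64_LSB_First(): emit words as little-endian bytes. */
static void
put64_lsb_first(uint8_t *dst, const uint64_t *src, size_t bCnt)
{
	size_t n;

	for (n = 0; n < bCnt; n++)
		dst[n] = (uint8_t)(src[n >> 3] >> (8 * (n & 7)));
}

/* Compact equivalent of Skein_Get64_LSB_First(): rebuild words from bytes. */
static void
get64_lsb_first(uint64_t *dst, const uint8_t *src, size_t wCnt)
{
	size_t w, i;

	for (w = 0; w < wCnt; w++) {
		uint64_t v = 0;

		for (i = 0; i < 8; i++)
			v |= (uint64_t)src[8 * w + i] << (8 * i);
		dst[w] = v;
	}
}

int
main(void)
{
	uint64_t in[2] = { 0x0123456789ABCDEFULL, 0xFEDCBA9876543210ULL };
	uint8_t bytes[16];
	uint64_t out[2];

	put64_lsb_first(bytes, in, sizeof (bytes));
	get64_lsb_first(out, bytes, 2);
	(void) printf("round trip %s\n",
	    memcmp(in, out, sizeof (in)) == 0 ? "ok" : "FAILED");
	return (0);
}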
diff --git a/sys/contrib/openzfs/module/icp/api/kcf_cipher.c b/sys/contrib/openzfs/module/icp/api/kcf_cipher.c
new file mode 100644
index 000000000000..d6aa48147edb
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/api/kcf_cipher.c
@@ -0,0 +1,930 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/crypto/common.h>
+#include <sys/crypto/impl.h>
+#include <sys/crypto/api.h>
+#include <sys/crypto/spi.h>
+#include <sys/crypto/sched_impl.h>
+
+/*
+ * Encryption and decryption routines.
+ */
+
+/*
+ * The following are the possible returned values common to all the routines
+ * below. The applicability of some of these return values depends on the
+ * presence of the arguments.
+ *
+ * CRYPTO_SUCCESS: The operation completed successfully.
+ * CRYPTO_QUEUED: A request was submitted successfully. The callback
+ * routine will be called when the operation is done.
+ * CRYPTO_INVALID_MECH_NUMBER, CRYPTO_INVALID_MECH_PARAM, or
+ * CRYPTO_INVALID_MECH for problems with the 'mech'.
+ * CRYPTO_INVALID_DATA for bogus 'data'
+ * CRYPTO_HOST_MEMORY for failure to allocate memory to handle this work.
+ * CRYPTO_INVALID_CONTEXT: Not a valid context.
+ * CRYPTO_BUSY: Cannot process the request now. Schedule a
+ * crypto_bufcall(), or try later.
+ * CRYPTO_NOT_SUPPORTED and CRYPTO_MECH_NOT_SUPPORTED: No provider is
+ * capable of a function or a mechanism.
+ * CRYPTO_INVALID_KEY: bogus 'key' argument.
+ * CRYPTO_INVALID_PLAINTEXT: bogus 'plaintext' argument.
+ * CRYPTO_INVALID_CIPHERTEXT: bogus 'ciphertext' argument.
+ */
+
+/*
+ * crypto_cipher_init_prov()
+ *
+ * Arguments:
+ *
+ * pd: provider descriptor
+ * sid: session id
+ * mech: crypto_mechanism_t pointer.
+ * mech_type is a valid value previously returned by
+ * crypto_mech2id();
+ * When the mech's parameter is not NULL, its definition depends
+ * on the standard definition of the mechanism.
+ * key: pointer to a crypto_key_t structure.
+ * tmpl: a crypto_ctx_template_t, opaque template of a context of an
+ * encryption or decryption with the 'mech' using 'key'.
+ * 'tmpl' is created by a previous call to
+ * crypto_create_ctx_template().
+ * ctxp: Pointer to a crypto_context_t.
+ * func: CRYPTO_FG_ENCRYPT or CRYPTO_FG_DECRYPT.
+ * cr: crypto_call_req_t calling conditions and call back info.
+ *
+ * Description:
+ * This is a common function invoked internally by both
+ * crypto_encrypt_init() and crypto_decrypt_init().
+ * Asynchronously submits a request for, or synchronously performs the
+ * initialization of an encryption or a decryption operation.
+ * When possible and applicable, will internally use the pre-expanded key
+ * schedule from the context template, tmpl.
+ * When complete and successful, 'ctxp' will contain a crypto_context_t
+ * valid for later calls to encrypt_update() and encrypt_final(), or
+ * decrypt_update() and decrypt_final().
+ * The caller should hold a reference on the specified provider
+ * descriptor before calling this function.
+ *
+ * Context:
+ * Process or interrupt, according to the semantics dictated by the 'cr'.
+ *
+ * Returns:
+ * See comment in the beginning of the file.
+ */
+static int
+crypto_cipher_init_prov(crypto_provider_t provider, crypto_session_id_t sid,
+ crypto_mechanism_t *mech, crypto_key_t *key,
+ crypto_spi_ctx_template_t tmpl, crypto_context_t *ctxp,
+ crypto_call_req_t *crq, crypto_func_group_t func)
+{
+ int error;
+ crypto_ctx_t *ctx;
+ kcf_req_params_t params;
+ kcf_provider_desc_t *pd = provider;
+ kcf_provider_desc_t *real_provider = pd;
+
+ ASSERT(KCF_PROV_REFHELD(pd));
+
+ if (pd->pd_prov_type == CRYPTO_LOGICAL_PROVIDER) {
+ if (func == CRYPTO_FG_ENCRYPT) {
+ error = kcf_get_hardware_provider(mech->cm_type,
+ CRYPTO_MECH_INVALID, CHECK_RESTRICT(crq), pd,
+ &real_provider, CRYPTO_FG_ENCRYPT);
+ } else {
+ error = kcf_get_hardware_provider(mech->cm_type,
+ CRYPTO_MECH_INVALID, CHECK_RESTRICT(crq), pd,
+ &real_provider, CRYPTO_FG_DECRYPT);
+ }
+
+ if (error != CRYPTO_SUCCESS)
+ return (error);
+ }
+
+ /* Allocate and initialize the canonical context */
+ if ((ctx = kcf_new_ctx(crq, real_provider, sid)) == NULL) {
+ if (pd->pd_prov_type == CRYPTO_LOGICAL_PROVIDER)
+ KCF_PROV_REFRELE(real_provider);
+ return (CRYPTO_HOST_MEMORY);
+ }
+
+ /* The fast path for SW providers. */
+ if (CHECK_FASTPATH(crq, pd)) {
+ crypto_mechanism_t lmech;
+
+ lmech = *mech;
+ KCF_SET_PROVIDER_MECHNUM(mech->cm_type, real_provider, &lmech);
+
+ if (func == CRYPTO_FG_ENCRYPT)
+ error = KCF_PROV_ENCRYPT_INIT(real_provider, ctx,
+ &lmech, key, tmpl, KCF_SWFP_RHNDL(crq));
+ else {
+ ASSERT(func == CRYPTO_FG_DECRYPT);
+
+ error = KCF_PROV_DECRYPT_INIT(real_provider, ctx,
+ &lmech, key, tmpl, KCF_SWFP_RHNDL(crq));
+ }
+ KCF_PROV_INCRSTATS(pd, error);
+
+ goto done;
+ }
+
+ /* Check if context sharing is possible */
+ if (pd->pd_prov_type == CRYPTO_HW_PROVIDER &&
+ key->ck_format == CRYPTO_KEY_RAW &&
+ KCF_CAN_SHARE_OPSTATE(pd, mech->cm_type)) {
+ kcf_context_t *tctxp = (kcf_context_t *)ctx;
+ kcf_provider_desc_t *tpd = NULL;
+ crypto_mech_info_t *sinfo;
+
+ if ((kcf_get_sw_prov(mech->cm_type, &tpd, &tctxp->kc_mech,
+ B_FALSE) == CRYPTO_SUCCESS)) {
+ int tlen;
+
+ sinfo = &(KCF_TO_PROV_MECHINFO(tpd, mech->cm_type));
+ /*
+ * key->ck_length from the consumer is always in bits.
+ * We convert it to be in the same unit registered by
+ * the provider in order to do a comparison.
+ */
+ if (sinfo->cm_mech_flags & CRYPTO_KEYSIZE_UNIT_IN_BYTES)
+ tlen = key->ck_length >> 3;
+ else
+ tlen = key->ck_length;
+ /*
+ * Check if the software provider can support context
+ * sharing and support this key length.
+ */
+ if ((sinfo->cm_mech_flags & CRYPTO_CAN_SHARE_OPSTATE) &&
+ (tlen >= sinfo->cm_min_key_length) &&
+ (tlen <= sinfo->cm_max_key_length)) {
+ ctx->cc_flags = CRYPTO_INIT_OPSTATE;
+ tctxp->kc_sw_prov_desc = tpd;
+ } else
+ KCF_PROV_REFRELE(tpd);
+ }
+ }
+
+ if (func == CRYPTO_FG_ENCRYPT) {
+ KCF_WRAP_ENCRYPT_OPS_PARAMS(&params, KCF_OP_INIT, sid,
+ mech, key, NULL, NULL, tmpl);
+ } else {
+ ASSERT(func == CRYPTO_FG_DECRYPT);
+ KCF_WRAP_DECRYPT_OPS_PARAMS(&params, KCF_OP_INIT, sid,
+ mech, key, NULL, NULL, tmpl);
+ }
+
+ error = kcf_submit_request(real_provider, ctx, crq, &params,
+ B_FALSE);
+
+ if (pd->pd_prov_type == CRYPTO_LOGICAL_PROVIDER)
+ KCF_PROV_REFRELE(real_provider);
+
+done:
+ if ((error == CRYPTO_SUCCESS) || (error == CRYPTO_QUEUED))
+ *ctxp = (crypto_context_t)ctx;
+ else {
+ /* Release the hold done in kcf_new_ctx(). */
+ KCF_CONTEXT_REFRELE((kcf_context_t *)ctx->cc_framework_private);
+ }
+
+ return (error);
+}
+
+/*
+ * Same as crypto_cipher_init_prov(), but relies on the scheduler to pick
+ * an appropriate provider. See crypto_cipher_init_prov() comments for more
+ * details.
+ */
+static int
+crypto_cipher_init(crypto_mechanism_t *mech, crypto_key_t *key,
+ crypto_ctx_template_t tmpl, crypto_context_t *ctxp,
+ crypto_call_req_t *crq, crypto_func_group_t func)
+{
+ int error;
+ kcf_mech_entry_t *me;
+ kcf_provider_desc_t *pd;
+ kcf_ctx_template_t *ctx_tmpl;
+ crypto_spi_ctx_template_t spi_ctx_tmpl = NULL;
+ kcf_prov_tried_t *list = NULL;
+
+retry:
+ /* pd is returned held */
+ if ((pd = kcf_get_mech_provider(mech->cm_type, &me, &error,
+ list, func, CHECK_RESTRICT(crq), 0)) == NULL) {
+ if (list != NULL)
+ kcf_free_triedlist(list);
+ return (error);
+ }
+
+ /*
+	 * For SW providers, check the validity of the context template.
+	 * It is very rare that the generation number mismatches, so it
+	 * is acceptable to fail here and let the consumer recover by
+	 * freeing this tmpl and creating a new one for the key and the
+	 * new SW provider.
+ */
+ if ((pd->pd_prov_type == CRYPTO_SW_PROVIDER) &&
+ ((ctx_tmpl = (kcf_ctx_template_t *)tmpl) != NULL)) {
+ if (ctx_tmpl->ct_generation != me->me_gen_swprov) {
+ if (list != NULL)
+ kcf_free_triedlist(list);
+ KCF_PROV_REFRELE(pd);
+ return (CRYPTO_OLD_CTX_TEMPLATE);
+ } else {
+ spi_ctx_tmpl = ctx_tmpl->ct_prov_tmpl;
+ }
+ }
+
+ error = crypto_cipher_init_prov(pd, pd->pd_sid, mech, key,
+ spi_ctx_tmpl, ctxp, crq, func);
+ if (error != CRYPTO_SUCCESS && error != CRYPTO_QUEUED &&
+ IS_RECOVERABLE(error)) {
+ /* Add pd to the linked list of providers tried. */
+ if (kcf_insert_triedlist(&list, pd, KCF_KMFLAG(crq)) != NULL)
+ goto retry;
+ }
+
+ if (list != NULL)
+ kcf_free_triedlist(list);
+
+ KCF_PROV_REFRELE(pd);
+ return (error);
+}
+
+/*
+ * crypto_encrypt_prov()
+ *
+ * Arguments:
+ * pd: provider descriptor
+ * sid: session id
+ * mech: crypto_mechanism_t pointer.
+ * mech_type is a valid value previously returned by
+ * crypto_mech2id();
+ * When the mech's parameter is not NULL, its definition depends
+ * on the standard definition of the mechanism.
+ * key: pointer to a crypto_key_t structure.
+ * plaintext: The message to be encrypted
+ * ciphertext: Storage for the encrypted message. The length needed
+ * depends on the mechanism, and the plaintext's size.
+ * tmpl: a crypto_ctx_template_t, opaque template of a context of an
+ * encryption with the 'mech' using 'key'. 'tmpl' is created by
+ * a previous call to crypto_create_ctx_template().
+ * cr: crypto_call_req_t calling conditions and call back info.
+ *
+ * Description:
+ * Asynchronously submits a request for, or synchronously performs a
+ * single-part encryption of 'plaintext' with the mechanism 'mech', using
+ * the key 'key'.
+ * When complete and successful, 'ciphertext' will contain the encrypted
+ * message.
+ *
+ * Context:
+ * Process or interrupt, according to the semantics dictated by the 'cr'.
+ *
+ * Returns:
+ * See comment in the beginning of the file.
+ */
+int
+crypto_encrypt_prov(crypto_provider_t provider, crypto_session_id_t sid,
+ crypto_mechanism_t *mech, crypto_data_t *plaintext, crypto_key_t *key,
+ crypto_ctx_template_t tmpl, crypto_data_t *ciphertext,
+ crypto_call_req_t *crq)
+{
+ kcf_req_params_t params;
+ kcf_provider_desc_t *pd = provider;
+ kcf_provider_desc_t *real_provider = pd;
+ int error;
+
+ ASSERT(KCF_PROV_REFHELD(pd));
+
+ if (pd->pd_prov_type == CRYPTO_LOGICAL_PROVIDER) {
+ error = kcf_get_hardware_provider(mech->cm_type,
+ CRYPTO_MECH_INVALID, CHECK_RESTRICT(crq), pd,
+ &real_provider, CRYPTO_FG_ENCRYPT_ATOMIC);
+
+ if (error != CRYPTO_SUCCESS)
+ return (error);
+ }
+
+ KCF_WRAP_ENCRYPT_OPS_PARAMS(&params, KCF_OP_ATOMIC, sid, mech, key,
+ plaintext, ciphertext, tmpl);
+
+ error = kcf_submit_request(real_provider, NULL, crq, &params, B_FALSE);
+ if (pd->pd_prov_type == CRYPTO_LOGICAL_PROVIDER)
+ KCF_PROV_REFRELE(real_provider);
+
+ return (error);
+}
+
+/*
+ * Same as crypto_encrypt_prov(), but relies on the scheduler to pick
+ * a provider. See crypto_encrypt_prov() for more details.
+ */
+int
+crypto_encrypt(crypto_mechanism_t *mech, crypto_data_t *plaintext,
+ crypto_key_t *key, crypto_ctx_template_t tmpl, crypto_data_t *ciphertext,
+ crypto_call_req_t *crq)
+{
+ int error;
+ kcf_mech_entry_t *me;
+ kcf_req_params_t params;
+ kcf_provider_desc_t *pd;
+ kcf_ctx_template_t *ctx_tmpl;
+ crypto_spi_ctx_template_t spi_ctx_tmpl = NULL;
+ kcf_prov_tried_t *list = NULL;
+
+retry:
+ /* pd is returned held */
+ if ((pd = kcf_get_mech_provider(mech->cm_type, &me, &error,
+ list, CRYPTO_FG_ENCRYPT_ATOMIC, CHECK_RESTRICT(crq),
+ plaintext->cd_length)) == NULL) {
+ if (list != NULL)
+ kcf_free_triedlist(list);
+ return (error);
+ }
+
+ /*
+	 * For SW providers, check the validity of the context template.
+	 * It is very rare that the generation number mismatches, so it
+	 * is acceptable to fail here and let the consumer recover by
+	 * freeing this tmpl and creating a new one for the key and the
+	 * new SW provider.
+ */
+ if ((pd->pd_prov_type == CRYPTO_SW_PROVIDER) &&
+ ((ctx_tmpl = (kcf_ctx_template_t *)tmpl) != NULL)) {
+ if (ctx_tmpl->ct_generation != me->me_gen_swprov) {
+ if (list != NULL)
+ kcf_free_triedlist(list);
+ KCF_PROV_REFRELE(pd);
+ return (CRYPTO_OLD_CTX_TEMPLATE);
+ } else {
+ spi_ctx_tmpl = ctx_tmpl->ct_prov_tmpl;
+ }
+ }
+
+ /* The fast path for SW providers. */
+ if (CHECK_FASTPATH(crq, pd)) {
+ crypto_mechanism_t lmech;
+
+ lmech = *mech;
+ KCF_SET_PROVIDER_MECHNUM(mech->cm_type, pd, &lmech);
+
+ error = KCF_PROV_ENCRYPT_ATOMIC(pd, pd->pd_sid, &lmech, key,
+ plaintext, ciphertext, spi_ctx_tmpl, KCF_SWFP_RHNDL(crq));
+ KCF_PROV_INCRSTATS(pd, error);
+ } else {
+ KCF_WRAP_ENCRYPT_OPS_PARAMS(&params, KCF_OP_ATOMIC, pd->pd_sid,
+ mech, key, plaintext, ciphertext, spi_ctx_tmpl);
+ error = kcf_submit_request(pd, NULL, crq, &params, B_FALSE);
+ }
+
+ if (error != CRYPTO_SUCCESS && error != CRYPTO_QUEUED &&
+ IS_RECOVERABLE(error)) {
+ /* Add pd to the linked list of providers tried. */
+ if (kcf_insert_triedlist(&list, pd, KCF_KMFLAG(crq)) != NULL)
+ goto retry;
+ }
+
+ if (list != NULL)
+ kcf_free_triedlist(list);
+
+ KCF_PROV_REFRELE(pd);
+ return (error);
+}
+
+/*
+ * crypto_encrypt_init_prov()
+ *
+ * Calls crypto_cipher_init_prov() to initialize an encryption operation.
+ */
+int
+crypto_encrypt_init_prov(crypto_provider_t pd, crypto_session_id_t sid,
+ crypto_mechanism_t *mech, crypto_key_t *key,
+ crypto_ctx_template_t tmpl, crypto_context_t *ctxp,
+ crypto_call_req_t *crq)
+{
+ return (crypto_cipher_init_prov(pd, sid, mech, key, tmpl, ctxp, crq,
+ CRYPTO_FG_ENCRYPT));
+}
+
+/*
+ * crypto_encrypt_init()
+ *
+ * Calls crypto_cipher_init() to initialize an encryption operation
+ */
+int
+crypto_encrypt_init(crypto_mechanism_t *mech, crypto_key_t *key,
+ crypto_ctx_template_t tmpl, crypto_context_t *ctxp,
+ crypto_call_req_t *crq)
+{
+ return (crypto_cipher_init(mech, key, tmpl, ctxp, crq,
+ CRYPTO_FG_ENCRYPT));
+}
+
+/*
+ * crypto_encrypt_update()
+ *
+ * Arguments:
+ * context: A crypto_context_t initialized by encrypt_init().
+ * plaintext: The message part to be encrypted
+ * ciphertext: Storage for the encrypted message part.
+ * cr: crypto_call_req_t calling conditions and call back info.
+ *
+ * Description:
+ * Asynchronously submits a request for, or synchronously performs a
+ * part of an encryption operation.
+ *
+ * Context:
+ * Process or interrupt, according to the semantics dictated by the 'cr'.
+ *
+ * Returns:
+ * See comment in the beginning of the file.
+ */
+int
+crypto_encrypt_update(crypto_context_t context, crypto_data_t *plaintext,
+ crypto_data_t *ciphertext, crypto_call_req_t *cr)
+{
+ crypto_ctx_t *ctx = (crypto_ctx_t *)context;
+ kcf_context_t *kcf_ctx;
+ kcf_provider_desc_t *pd;
+ int error;
+ kcf_req_params_t params;
+
+ if ((ctx == NULL) ||
+ ((kcf_ctx = (kcf_context_t *)ctx->cc_framework_private) == NULL) ||
+ ((pd = kcf_ctx->kc_prov_desc) == NULL)) {
+ return (CRYPTO_INVALID_CONTEXT);
+ }
+
+ ASSERT(pd->pd_prov_type != CRYPTO_LOGICAL_PROVIDER);
+
+ /* The fast path for SW providers. */
+ if (CHECK_FASTPATH(cr, pd)) {
+ error = KCF_PROV_ENCRYPT_UPDATE(pd, ctx, plaintext,
+ ciphertext, NULL);
+ KCF_PROV_INCRSTATS(pd, error);
+ return (error);
+ }
+
+ /* Check if we should use a software provider for small jobs */
+ if ((ctx->cc_flags & CRYPTO_USE_OPSTATE) && cr == NULL) {
+ if (plaintext->cd_length < kcf_ctx->kc_mech->me_threshold &&
+ kcf_ctx->kc_sw_prov_desc != NULL &&
+ KCF_IS_PROV_USABLE(kcf_ctx->kc_sw_prov_desc)) {
+ pd = kcf_ctx->kc_sw_prov_desc;
+ }
+ }
+
+ KCF_WRAP_ENCRYPT_OPS_PARAMS(&params, KCF_OP_UPDATE,
+ ctx->cc_session, NULL, NULL, plaintext, ciphertext, NULL);
+ error = kcf_submit_request(pd, ctx, cr, &params, B_FALSE);
+
+ return (error);
+}
+
+/*
+ * crypto_encrypt_final()
+ *
+ * Arguments:
+ * context: A crypto_context_t initialized by encrypt_init().
+ * ciphertext: Storage for the last part of encrypted message
+ * cr: crypto_call_req_t calling conditions and call back info.
+ *
+ * Description:
+ * Asynchronously submits a request for, or synchronously performs the
+ * final part of an encryption operation.
+ *
+ * Context:
+ * Process or interrupt, according to the semantics dictated by the 'cr'.
+ *
+ * Returns:
+ * See comment in the beginning of the file.
+ */
+int
+crypto_encrypt_final(crypto_context_t context, crypto_data_t *ciphertext,
+ crypto_call_req_t *cr)
+{
+ crypto_ctx_t *ctx = (crypto_ctx_t *)context;
+ kcf_context_t *kcf_ctx;
+ kcf_provider_desc_t *pd;
+ int error;
+ kcf_req_params_t params;
+
+ if ((ctx == NULL) ||
+ ((kcf_ctx = (kcf_context_t *)ctx->cc_framework_private) == NULL) ||
+ ((pd = kcf_ctx->kc_prov_desc) == NULL)) {
+ return (CRYPTO_INVALID_CONTEXT);
+ }
+
+ ASSERT(pd->pd_prov_type != CRYPTO_LOGICAL_PROVIDER);
+
+ /* The fast path for SW providers. */
+ if (CHECK_FASTPATH(cr, pd)) {
+ error = KCF_PROV_ENCRYPT_FINAL(pd, ctx, ciphertext, NULL);
+ KCF_PROV_INCRSTATS(pd, error);
+ } else {
+ KCF_WRAP_ENCRYPT_OPS_PARAMS(&params, KCF_OP_FINAL,
+ ctx->cc_session, NULL, NULL, NULL, ciphertext, NULL);
+ error = kcf_submit_request(pd, ctx, cr, &params, B_FALSE);
+ }
+
+ /* Release the hold done in kcf_new_ctx() during init step. */
+ KCF_CONTEXT_COND_RELEASE(error, kcf_ctx);
+ return (error);
+}
+
+/*
+ * crypto_decrypt_prov()
+ *
+ * Arguments:
+ * pd: provider descriptor
+ * sid: session id
+ * mech: crypto_mechanism_t pointer.
+ * mech_type is a valid value previously returned by
+ * crypto_mech2id();
+ * When the mech's parameter is not NULL, its definition depends
+ * on the standard definition of the mechanism.
+ * key: pointer to a crypto_key_t structure.
+ *	ciphertext: The message to be decrypted
+ *	plaintext: Storage for the decrypted message. The length needed
+ *		depends on the mechanism, and the ciphertext's size.
+ *	tmpl: a crypto_ctx_template_t, opaque template of a context of a
+ *		decryption with the 'mech' using 'key'. 'tmpl' is created by
+ * a previous call to crypto_create_ctx_template().
+ * cr: crypto_call_req_t calling conditions and call back info.
+ *
+ * Description:
+ * Asynchronously submits a request for, or synchronously performs a
+ * single-part decryption of 'ciphertext' with the mechanism 'mech', using
+ * the key 'key'.
+ * When complete and successful, 'plaintext' will contain the decrypted
+ * message.
+ *
+ * Context:
+ * Process or interrupt, according to the semantics dictated by the 'cr'.
+ *
+ * Returns:
+ * See comment in the beginning of the file.
+ */
+int
+crypto_decrypt_prov(crypto_provider_t provider, crypto_session_id_t sid,
+ crypto_mechanism_t *mech, crypto_data_t *ciphertext, crypto_key_t *key,
+ crypto_ctx_template_t tmpl, crypto_data_t *plaintext,
+ crypto_call_req_t *crq)
+{
+ kcf_req_params_t params;
+ kcf_provider_desc_t *pd = provider;
+ kcf_provider_desc_t *real_provider = pd;
+ int rv;
+
+ ASSERT(KCF_PROV_REFHELD(pd));
+
+ if (pd->pd_prov_type == CRYPTO_LOGICAL_PROVIDER) {
+ rv = kcf_get_hardware_provider(mech->cm_type,
+ CRYPTO_MECH_INVALID, CHECK_RESTRICT(crq), pd,
+ &real_provider, CRYPTO_FG_DECRYPT_ATOMIC);
+
+ if (rv != CRYPTO_SUCCESS)
+ return (rv);
+ }
+
+ KCF_WRAP_DECRYPT_OPS_PARAMS(&params, KCF_OP_ATOMIC, sid, mech, key,
+ ciphertext, plaintext, tmpl);
+
+ rv = kcf_submit_request(real_provider, NULL, crq, &params, B_FALSE);
+ if (pd->pd_prov_type == CRYPTO_LOGICAL_PROVIDER)
+ KCF_PROV_REFRELE(real_provider);
+
+ return (rv);
+}
+
+/*
+ * Same as crypto_decrypt_prov(), but relies on the KCF scheduler to
+ * choose a provider. See crypto_decrypt_prov() comments for more
+ * information.
+ */
+int
+crypto_decrypt(crypto_mechanism_t *mech, crypto_data_t *ciphertext,
+ crypto_key_t *key, crypto_ctx_template_t tmpl, crypto_data_t *plaintext,
+ crypto_call_req_t *crq)
+{
+ int error;
+ kcf_mech_entry_t *me;
+ kcf_req_params_t params;
+ kcf_provider_desc_t *pd;
+ kcf_ctx_template_t *ctx_tmpl;
+ crypto_spi_ctx_template_t spi_ctx_tmpl = NULL;
+ kcf_prov_tried_t *list = NULL;
+
+retry:
+ /* pd is returned held */
+ if ((pd = kcf_get_mech_provider(mech->cm_type, &me, &error,
+ list, CRYPTO_FG_DECRYPT_ATOMIC, CHECK_RESTRICT(crq),
+ ciphertext->cd_length)) == NULL) {
+ if (list != NULL)
+ kcf_free_triedlist(list);
+ return (error);
+ }
+
+ /*
+	 * For SW providers, check the validity of the context template.
+	 * It is very rare that the generation number mismatches, so it
+	 * is acceptable to fail here and let the consumer recover by
+	 * freeing this tmpl and creating a new one for the key and the
+	 * new SW provider.
+ */
+ if ((pd->pd_prov_type == CRYPTO_SW_PROVIDER) &&
+ ((ctx_tmpl = (kcf_ctx_template_t *)tmpl) != NULL)) {
+ if (ctx_tmpl->ct_generation != me->me_gen_swprov) {
+ if (list != NULL)
+ kcf_free_triedlist(list);
+ KCF_PROV_REFRELE(pd);
+ return (CRYPTO_OLD_CTX_TEMPLATE);
+ } else {
+ spi_ctx_tmpl = ctx_tmpl->ct_prov_tmpl;
+ }
+ }
+
+ /* The fast path for SW providers. */
+ if (CHECK_FASTPATH(crq, pd)) {
+ crypto_mechanism_t lmech;
+
+ lmech = *mech;
+ KCF_SET_PROVIDER_MECHNUM(mech->cm_type, pd, &lmech);
+
+ error = KCF_PROV_DECRYPT_ATOMIC(pd, pd->pd_sid, &lmech, key,
+ ciphertext, plaintext, spi_ctx_tmpl, KCF_SWFP_RHNDL(crq));
+ KCF_PROV_INCRSTATS(pd, error);
+ } else {
+ KCF_WRAP_DECRYPT_OPS_PARAMS(&params, KCF_OP_ATOMIC, pd->pd_sid,
+ mech, key, ciphertext, plaintext, spi_ctx_tmpl);
+ error = kcf_submit_request(pd, NULL, crq, &params, B_FALSE);
+ }
+
+ if (error != CRYPTO_SUCCESS && error != CRYPTO_QUEUED &&
+ IS_RECOVERABLE(error)) {
+ /* Add pd to the linked list of providers tried. */
+ if (kcf_insert_triedlist(&list, pd, KCF_KMFLAG(crq)) != NULL)
+ goto retry;
+ }
+
+ if (list != NULL)
+ kcf_free_triedlist(list);
+
+ KCF_PROV_REFRELE(pd);
+ return (error);
+}
+
+/*
+ * crypto_decrypt_init_prov()
+ *
+ * Calls crypto_cipher_init_prov() to initialize a decryption operation
+ */
+int
+crypto_decrypt_init_prov(crypto_provider_t pd, crypto_session_id_t sid,
+ crypto_mechanism_t *mech, crypto_key_t *key,
+ crypto_ctx_template_t tmpl, crypto_context_t *ctxp,
+ crypto_call_req_t *crq)
+{
+ return (crypto_cipher_init_prov(pd, sid, mech, key, tmpl, ctxp, crq,
+ CRYPTO_FG_DECRYPT));
+}
+
+/*
+ * crypto_decrypt_init()
+ *
+ * Calls crypto_cipher_init() to initialize a decryption operation
+ */
+int
+crypto_decrypt_init(crypto_mechanism_t *mech, crypto_key_t *key,
+ crypto_ctx_template_t tmpl, crypto_context_t *ctxp,
+ crypto_call_req_t *crq)
+{
+ return (crypto_cipher_init(mech, key, tmpl, ctxp, crq,
+ CRYPTO_FG_DECRYPT));
+}
+
+/*
+ * crypto_decrypt_update()
+ *
+ * Arguments:
+ * context: A crypto_context_t initialized by decrypt_init().
+ * ciphertext: The message part to be decrypted
+ * plaintext: Storage for the decrypted message part.
+ * cr: crypto_call_req_t calling conditions and call back info.
+ *
+ * Description:
+ * Asynchronously submits a request for, or synchronously performs a
+ * part of a decryption operation.
+ *
+ * Context:
+ * Process or interrupt, according to the semantics dictated by the 'cr'.
+ *
+ * Returns:
+ * See comment in the beginning of the file.
+ */
+int
+crypto_decrypt_update(crypto_context_t context, crypto_data_t *ciphertext,
+ crypto_data_t *plaintext, crypto_call_req_t *cr)
+{
+ crypto_ctx_t *ctx = (crypto_ctx_t *)context;
+ kcf_context_t *kcf_ctx;
+ kcf_provider_desc_t *pd;
+ int error;
+ kcf_req_params_t params;
+
+ if ((ctx == NULL) ||
+ ((kcf_ctx = (kcf_context_t *)ctx->cc_framework_private) == NULL) ||
+ ((pd = kcf_ctx->kc_prov_desc) == NULL)) {
+ return (CRYPTO_INVALID_CONTEXT);
+ }
+
+ ASSERT(pd->pd_prov_type != CRYPTO_LOGICAL_PROVIDER);
+
+ /* The fast path for SW providers. */
+ if (CHECK_FASTPATH(cr, pd)) {
+ error = KCF_PROV_DECRYPT_UPDATE(pd, ctx, ciphertext,
+ plaintext, NULL);
+ KCF_PROV_INCRSTATS(pd, error);
+ return (error);
+ }
+
+ /* Check if we should use a software provider for small jobs */
+ if ((ctx->cc_flags & CRYPTO_USE_OPSTATE) && cr == NULL) {
+ if (ciphertext->cd_length < kcf_ctx->kc_mech->me_threshold &&
+ kcf_ctx->kc_sw_prov_desc != NULL &&
+ KCF_IS_PROV_USABLE(kcf_ctx->kc_sw_prov_desc)) {
+ pd = kcf_ctx->kc_sw_prov_desc;
+ }
+ }
+
+ KCF_WRAP_DECRYPT_OPS_PARAMS(&params, KCF_OP_UPDATE,
+ ctx->cc_session, NULL, NULL, ciphertext, plaintext, NULL);
+ error = kcf_submit_request(pd, ctx, cr, &params, B_FALSE);
+
+ return (error);
+}
+
+/*
+ * crypto_decrypt_final()
+ *
+ * Arguments:
+ * context: A crypto_context_t initialized by decrypt_init().
+ * plaintext: Storage for the last part of the decrypted message
+ * cr: crypto_call_req_t calling conditions and call back info.
+ *
+ * Description:
+ * Asynchronously submits a request for, or synchronously performs the
+ * final part of a decryption operation.
+ *
+ * Context:
+ * Process or interrupt, according to the semantics dictated by the 'cr'.
+ *
+ * Returns:
+ * See comment in the beginning of the file.
+ */
+int
+crypto_decrypt_final(crypto_context_t context, crypto_data_t *plaintext,
+ crypto_call_req_t *cr)
+{
+ crypto_ctx_t *ctx = (crypto_ctx_t *)context;
+ kcf_context_t *kcf_ctx;
+ kcf_provider_desc_t *pd;
+ int error;
+ kcf_req_params_t params;
+
+ if ((ctx == NULL) ||
+ ((kcf_ctx = (kcf_context_t *)ctx->cc_framework_private) == NULL) ||
+ ((pd = kcf_ctx->kc_prov_desc) == NULL)) {
+ return (CRYPTO_INVALID_CONTEXT);
+ }
+
+ ASSERT(pd->pd_prov_type != CRYPTO_LOGICAL_PROVIDER);
+
+ /* The fast path for SW providers. */
+ if (CHECK_FASTPATH(cr, pd)) {
+ error = KCF_PROV_DECRYPT_FINAL(pd, ctx, plaintext,
+ NULL);
+ KCF_PROV_INCRSTATS(pd, error);
+ } else {
+ KCF_WRAP_DECRYPT_OPS_PARAMS(&params, KCF_OP_FINAL,
+ ctx->cc_session, NULL, NULL, NULL, plaintext, NULL);
+ error = kcf_submit_request(pd, ctx, cr, &params, B_FALSE);
+ }
+
+ /* Release the hold done in kcf_new_ctx() during init step. */
+ KCF_CONTEXT_COND_RELEASE(error, kcf_ctx);
+ return (error);
+}
+
+/*
+ * See comments for crypto_encrypt_update().
+ */
+int
+crypto_encrypt_single(crypto_context_t context, crypto_data_t *plaintext,
+ crypto_data_t *ciphertext, crypto_call_req_t *cr)
+{
+ crypto_ctx_t *ctx = (crypto_ctx_t *)context;
+ kcf_context_t *kcf_ctx;
+ kcf_provider_desc_t *pd;
+ int error;
+ kcf_req_params_t params;
+
+ if ((ctx == NULL) ||
+ ((kcf_ctx = (kcf_context_t *)ctx->cc_framework_private) == NULL) ||
+ ((pd = kcf_ctx->kc_prov_desc) == NULL)) {
+ return (CRYPTO_INVALID_CONTEXT);
+ }
+
+ /* The fast path for SW providers. */
+ if (CHECK_FASTPATH(cr, pd)) {
+ error = KCF_PROV_ENCRYPT(pd, ctx, plaintext,
+ ciphertext, NULL);
+ KCF_PROV_INCRSTATS(pd, error);
+ } else {
+ KCF_WRAP_ENCRYPT_OPS_PARAMS(&params, KCF_OP_SINGLE, pd->pd_sid,
+ NULL, NULL, plaintext, ciphertext, NULL);
+ error = kcf_submit_request(pd, ctx, cr, &params, B_FALSE);
+ }
+
+ /* Release the hold done in kcf_new_ctx() during init step. */
+ KCF_CONTEXT_COND_RELEASE(error, kcf_ctx);
+ return (error);
+}
+
+/*
+ * See comments for crypto_decrypt_update().
+ */
+int
+crypto_decrypt_single(crypto_context_t context, crypto_data_t *ciphertext,
+ crypto_data_t *plaintext, crypto_call_req_t *cr)
+{
+ crypto_ctx_t *ctx = (crypto_ctx_t *)context;
+ kcf_context_t *kcf_ctx;
+ kcf_provider_desc_t *pd;
+ int error;
+ kcf_req_params_t params;
+
+ if ((ctx == NULL) ||
+ ((kcf_ctx = (kcf_context_t *)ctx->cc_framework_private) == NULL) ||
+ ((pd = kcf_ctx->kc_prov_desc) == NULL)) {
+ return (CRYPTO_INVALID_CONTEXT);
+ }
+
+ /* The fast path for SW providers. */
+ if (CHECK_FASTPATH(cr, pd)) {
+ error = KCF_PROV_DECRYPT(pd, ctx, ciphertext,
+ plaintext, NULL);
+ KCF_PROV_INCRSTATS(pd, error);
+ } else {
+ KCF_WRAP_DECRYPT_OPS_PARAMS(&params, KCF_OP_SINGLE, pd->pd_sid,
+ NULL, NULL, ciphertext, plaintext, NULL);
+ error = kcf_submit_request(pd, ctx, cr, &params, B_FALSE);
+ }
+
+ /* Release the hold done in kcf_new_ctx() during init step. */
+ KCF_CONTEXT_COND_RELEASE(error, kcf_ctx);
+ return (error);
+}
+
+#if defined(_KERNEL)
+EXPORT_SYMBOL(crypto_encrypt_prov);
+EXPORT_SYMBOL(crypto_encrypt);
+EXPORT_SYMBOL(crypto_encrypt_init_prov);
+EXPORT_SYMBOL(crypto_encrypt_init);
+EXPORT_SYMBOL(crypto_encrypt_update);
+EXPORT_SYMBOL(crypto_encrypt_final);
+EXPORT_SYMBOL(crypto_decrypt_prov);
+EXPORT_SYMBOL(crypto_decrypt);
+EXPORT_SYMBOL(crypto_decrypt_init_prov);
+EXPORT_SYMBOL(crypto_decrypt_init);
+EXPORT_SYMBOL(crypto_decrypt_update);
+EXPORT_SYMBOL(crypto_decrypt_final);
+EXPORT_SYMBOL(crypto_encrypt_single);
+EXPORT_SYMBOL(crypto_decrypt_single);
+#endif
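To tie the multi-part entry points above together, here is a minimal caller-side sketch (illustrative, not part of the patch). It assumes the crypto_mechanism_t, crypto_key_t, and crypto_data_t descriptors have already been set up, passes a NULL crypto_call_req_t on the assumption that this selects the synchronous path, and omits error-path context teardown.

static int
encrypt_two_parts(crypto_mechanism_t *mech, crypto_key_t *key,
    crypto_ctx_template_t tmpl, crypto_data_t *pt1, crypto_data_t *ct1,
    crypto_data_t *pt2, crypto_data_t *ct2, crypto_data_t *ct_final)
{
	crypto_context_t ctx;
	int rv;

	/* NULL call req: assumed synchronous operation. */
	rv = crypto_encrypt_init(mech, key, tmpl, &ctx, NULL);
	if (rv != CRYPTO_SUCCESS)
		return (rv);

	rv = crypto_encrypt_update(ctx, pt1, ct1, NULL);
	if (rv == CRYPTO_SUCCESS)
		rv = crypto_encrypt_update(ctx, pt2, ct2, NULL);
	if (rv != CRYPTO_SUCCESS) {
		/* Error-path context teardown omitted in this sketch. */
		return (rv);
	}

	/* The final call also drops the context hold taken at init. */
	return (crypto_encrypt_final(ctx, ct_final, NULL));
}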
diff --git a/sys/contrib/openzfs/module/icp/api/kcf_ctxops.c b/sys/contrib/openzfs/module/icp/api/kcf_ctxops.c
new file mode 100644
index 000000000000..21b0977d3634
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/api/kcf_ctxops.c
@@ -0,0 +1,151 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/crypto/common.h>
+#include <sys/crypto/impl.h>
+#include <sys/crypto/api.h>
+#include <sys/crypto/spi.h>
+#include <sys/crypto/sched_impl.h>
+
+/*
+ * Crypto contexts manipulation routines
+ */
+
+/*
+ * crypto_create_ctx_template()
+ *
+ * Arguments:
+ *
+ * mech: crypto_mechanism_t pointer.
+ * mech_type is a valid value previously returned by
+ * crypto_mech2id();
+ * When the mech's parameter is not NULL, its definition depends
+ * on the standard definition of the mechanism.
+ * key: pointer to a crypto_key_t structure.
+ *	ptmpl: storage for the opaque crypto_ctx_template_t, allocated and
+ * initialized by the software provider this routine is
+ * dispatched to.
+ * kmflag: KM_SLEEP/KM_NOSLEEP mem. alloc. flag.
+ *
+ * Description:
+ * Redirects the call to the software provider of the specified
+ * mechanism. That provider will allocate and pre-compute/pre-expand
+ * the context template, reusable by later calls to crypto_xxx_init().
+ * The size and address of that provider context template are stored
+ * in an internal structure, kcf_ctx_template_t. The address of that
+ * structure is given back to the caller in *ptmpl.
+ *
+ * Context:
+ * Process or interrupt.
+ *
+ * Returns:
+ * CRYPTO_SUCCESS when the context template is successfully created.
+ * CRYPTO_HOST_MEMORY: mem alloc failure
+ * CRYPTO_ARGUMENTS_BAD: NULL storage for the ctx template.
+ *	CRYPTO_MECHANISM_INVALID: invalid mechanism 'mech'.
+ */
+int
+crypto_create_ctx_template(crypto_mechanism_t *mech, crypto_key_t *key,
+ crypto_ctx_template_t *ptmpl, int kmflag)
+{
+ int error;
+ kcf_mech_entry_t *me;
+ kcf_provider_desc_t *pd;
+ kcf_ctx_template_t *ctx_tmpl;
+ crypto_mechanism_t prov_mech;
+
+	/* Basic argument validation */
+
+ if (ptmpl == NULL)
+ return (CRYPTO_ARGUMENTS_BAD);
+
+ if (mech == NULL)
+ return (CRYPTO_MECHANISM_INVALID);
+
+ error = kcf_get_sw_prov(mech->cm_type, &pd, &me, B_TRUE);
+ if (error != CRYPTO_SUCCESS)
+ return (error);
+
+ if ((ctx_tmpl = (kcf_ctx_template_t *)kmem_alloc(
+ sizeof (kcf_ctx_template_t), kmflag)) == NULL) {
+ KCF_PROV_REFRELE(pd);
+ return (CRYPTO_HOST_MEMORY);
+ }
+
+ /* Pass a mechtype that the provider understands */
+ prov_mech.cm_type = KCF_TO_PROV_MECHNUM(pd, mech->cm_type);
+ prov_mech.cm_param = mech->cm_param;
+ prov_mech.cm_param_len = mech->cm_param_len;
+
+ error = KCF_PROV_CREATE_CTX_TEMPLATE(pd, &prov_mech, key,
+ &(ctx_tmpl->ct_prov_tmpl), &(ctx_tmpl->ct_size), KCF_RHNDL(kmflag));
+
+ if (error == CRYPTO_SUCCESS) {
+ ctx_tmpl->ct_generation = me->me_gen_swprov;
+ *ptmpl = ctx_tmpl;
+ } else {
+ kmem_free(ctx_tmpl, sizeof (kcf_ctx_template_t));
+ }
+ KCF_PROV_REFRELE(pd);
+
+ return (error);
+}
+
+/*
+ * crypto_destroy_ctx_template()
+ *
+ * Arguments:
+ *
+ * tmpl: an opaque crypto_ctx_template_t previously created by
+ * crypto_create_ctx_template()
+ *
+ * Description:
+ * Frees the embedded crypto_spi_ctx_template_t, then the
+ * kcf_ctx_template_t.
+ *
+ * Context:
+ * Process or interrupt.
+ *
+ */
+void
+crypto_destroy_ctx_template(crypto_ctx_template_t tmpl)
+{
+ kcf_ctx_template_t *ctx_tmpl = (kcf_ctx_template_t *)tmpl;
+
+ if (ctx_tmpl == NULL)
+ return;
+
+ ASSERT(ctx_tmpl->ct_prov_tmpl != NULL);
+
+ bzero(ctx_tmpl->ct_prov_tmpl, ctx_tmpl->ct_size);
+ kmem_free(ctx_tmpl->ct_prov_tmpl, ctx_tmpl->ct_size);
+ kmem_free(ctx_tmpl, sizeof (kcf_ctx_template_t));
+}
+
+#if defined(_KERNEL)
+EXPORT_SYMBOL(crypto_create_ctx_template);
+EXPORT_SYMBOL(crypto_destroy_ctx_template);
+#endif
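A minimal sketch of the template lifecycle (illustrative, not part of the patch): pre-expand the key schedule once, reuse the template across several atomic crypto_encrypt() calls from kcf_cipher.c above, then free it. Synchronous (NULL call req) use is assumed, and the CRYPTO_OLD_CTX_TEMPLATE recovery described in kcf_cipher.c is noted but not implemented.

static int
encrypt_many(crypto_mechanism_t *mech, crypto_key_t *key,
    crypto_data_t **pt, crypto_data_t **ct, int n)
{
	crypto_ctx_template_t tmpl = NULL;
	int i, rv;

	/* Pre-expand the key schedule once; KM_SLEEP may block for memory. */
	rv = crypto_create_ctx_template(mech, key, &tmpl, KM_SLEEP);
	if (rv != CRYPTO_SUCCESS)
		return (rv);

	for (i = 0; i < n && rv == CRYPTO_SUCCESS; i++) {
		/*
		 * A production consumer would also handle
		 * CRYPTO_OLD_CTX_TEMPLATE by destroying and
		 * recreating the template.
		 */
		rv = crypto_encrypt(mech, pt[i], key, tmpl, ct[i], NULL);
	}

	crypto_destroy_ctx_template(tmpl);
	return (rv);
}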
diff --git a/sys/contrib/openzfs/module/icp/api/kcf_digest.c b/sys/contrib/openzfs/module/icp/api/kcf_digest.c
new file mode 100644
index 000000000000..aa68d69bc162
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/api/kcf_digest.c
@@ -0,0 +1,491 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/crypto/common.h>
+#include <sys/crypto/impl.h>
+#include <sys/crypto/api.h>
+#include <sys/crypto/spi.h>
+#include <sys/crypto/sched_impl.h>
+
+/*
+ * Message digest routines
+ */
+
+/*
+ * The following are the possible returned values common to all the routines
+ * below. The applicability of some of these return values depends on the
+ * presence of the arguments.
+ *
+ * CRYPTO_SUCCESS: The operation completed successfully.
+ * CRYPTO_QUEUED: A request was submitted successfully. The callback
+ * routine will be called when the operation is done.
+ * CRYPTO_MECHANISM_INVALID or CRYPTO_INVALID_MECH_PARAM
+ * for problems with the 'mech'.
+ * CRYPTO_INVALID_DATA for bogus 'data'
+ * CRYPTO_HOST_MEMORY for failure to allocate memory to handle this work.
+ * CRYPTO_INVALID_CONTEXT: Not a valid context.
+ * CRYPTO_BUSY: Cannot process the request now. Schedule a
+ * crypto_bufcall(), or try later.
+ * CRYPTO_NOT_SUPPORTED and CRYPTO_MECH_NOT_SUPPORTED:
+ * No provider is capable of a function or a mechanism.
+ */
+
+
+/*
+ * crypto_digest_prov()
+ *
+ * Arguments:
+ * pd: pointer to the descriptor of the provider to use for this
+ * operation.
+ * sid: provider session id.
+ * mech: crypto_mechanism_t pointer.
+ * mech_type is a valid value previously returned by
+ * crypto_mech2id();
+ * When the mech's parameter is not NULL, its definition depends
+ * on the standard definition of the mechanism.
+ * data: The message to be digested.
+ * digest: Storage for the digest. The length needed depends on the
+ * mechanism.
+ * cr: crypto_call_req_t calling conditions and call back info.
+ *
+ * Description:
+ * Asynchronously submits a request for, or synchronously performs the
+ * digesting operation of 'data' on the specified
+ * provider with the specified session.
+ * When complete and successful, 'digest' will contain the digest value.
+ * The caller should hold a reference on the specified provider
+ * descriptor before calling this function.
+ *
+ * Context:
+ * Process or interrupt, according to the semantics dictated by the 'cr'.
+ *
+ * Returns:
+ * See comment in the beginning of the file.
+ */
+int
+crypto_digest_prov(crypto_provider_t provider, crypto_session_id_t sid,
+ crypto_mechanism_t *mech, crypto_data_t *data, crypto_data_t *digest,
+ crypto_call_req_t *crq)
+{
+ kcf_req_params_t params;
+ kcf_provider_desc_t *pd = provider;
+ kcf_provider_desc_t *real_provider = pd;
+ int rv;
+
+ ASSERT(KCF_PROV_REFHELD(pd));
+
+ if (pd->pd_prov_type == CRYPTO_LOGICAL_PROVIDER) {
+ rv = kcf_get_hardware_provider(mech->cm_type,
+ CRYPTO_MECH_INVALID, CHECK_RESTRICT(crq),
+ pd, &real_provider, CRYPTO_FG_DIGEST_ATOMIC);
+
+ if (rv != CRYPTO_SUCCESS)
+ return (rv);
+ }
+ KCF_WRAP_DIGEST_OPS_PARAMS(&params, KCF_OP_ATOMIC, sid, mech, NULL,
+ data, digest);
+
+ /* no crypto context to carry between multiple parts. */
+ rv = kcf_submit_request(real_provider, NULL, crq, &params, B_FALSE);
+ if (pd->pd_prov_type == CRYPTO_LOGICAL_PROVIDER)
+ KCF_PROV_REFRELE(real_provider);
+
+ return (rv);
+}
+
+
+/*
+ * Same as crypto_digest_prov(), but relies on the KCF scheduler to
+ * choose a provider. See crypto_digest_prov() comments for more information.
+ */
+int
+crypto_digest(crypto_mechanism_t *mech, crypto_data_t *data,
+ crypto_data_t *digest, crypto_call_req_t *crq)
+{
+ int error;
+ kcf_provider_desc_t *pd;
+ kcf_req_params_t params;
+ kcf_prov_tried_t *list = NULL;
+
+retry:
+ /* The pd is returned held */
+ if ((pd = kcf_get_mech_provider(mech->cm_type, NULL, &error, list,
+ CRYPTO_FG_DIGEST_ATOMIC, CHECK_RESTRICT(crq),
+ data->cd_length)) == NULL) {
+ if (list != NULL)
+ kcf_free_triedlist(list);
+ return (error);
+ }
+
+ /* The fast path for SW providers. */
+ if (CHECK_FASTPATH(crq, pd)) {
+ crypto_mechanism_t lmech;
+
+ lmech = *mech;
+ KCF_SET_PROVIDER_MECHNUM(mech->cm_type, pd, &lmech);
+ error = KCF_PROV_DIGEST_ATOMIC(pd, pd->pd_sid, &lmech, data,
+ digest, KCF_SWFP_RHNDL(crq));
+ KCF_PROV_INCRSTATS(pd, error);
+ } else {
+ if (pd->pd_prov_type == CRYPTO_HW_PROVIDER &&
+ (pd->pd_flags & CRYPTO_HASH_NO_UPDATE) &&
+ (data->cd_length > pd->pd_hash_limit)) {
+ error = CRYPTO_BUFFER_TOO_BIG;
+ } else {
+ KCF_WRAP_DIGEST_OPS_PARAMS(&params, KCF_OP_ATOMIC,
+ pd->pd_sid, mech, NULL, data, digest);
+
+ /* no crypto context to carry between multiple parts. */
+ error = kcf_submit_request(pd, NULL, crq, &params,
+ B_FALSE);
+ }
+ }
+
+ if (error != CRYPTO_SUCCESS && error != CRYPTO_QUEUED &&
+ IS_RECOVERABLE(error)) {
+ /* Add pd to the linked list of providers tried. */
+ if (kcf_insert_triedlist(&list, pd, KCF_KMFLAG(crq)) != NULL)
+ goto retry;
+ }
+
+ if (list != NULL)
+ kcf_free_triedlist(list);
+
+ KCF_PROV_REFRELE(pd);
+ return (error);
+}
+
+/*
+ * crypto_digest_init_prov()
+ *
+ * pd: pointer to the descriptor of the provider to use for this
+ * operation.
+ * sid: provider session id.
+ * mech: crypto_mechanism_t pointer.
+ * mech_type is a valid value previously returned by
+ * crypto_mech2id();
+ * When the mech's parameter is not NULL, its definition depends
+ * on the standard definition of the mechanism.
+ * ctxp: Pointer to a crypto_context_t.
+ * cr: crypto_call_req_t calling conditions and call back info.
+ *
+ * Description:
+ * Asynchronously submits a request for, or synchronously performs the
+ * initialization of a message digest operation on the specified
+ * provider with the specified session.
+ * When complete and successful, 'ctxp' will contain a crypto_context_t
+ * valid for later calls to digest_update() and digest_final().
+ * The caller should hold a reference on the specified provider
+ * descriptor before calling this function.
+ */
+int
+crypto_digest_init_prov(crypto_provider_t provider, crypto_session_id_t sid,
+ crypto_mechanism_t *mech, crypto_context_t *ctxp, crypto_call_req_t *crq)
+{
+ int error;
+ crypto_ctx_t *ctx;
+ kcf_req_params_t params;
+ kcf_provider_desc_t *pd = provider;
+ kcf_provider_desc_t *real_provider = pd;
+
+ ASSERT(KCF_PROV_REFHELD(pd));
+
+ if (pd->pd_prov_type == CRYPTO_LOGICAL_PROVIDER) {
+ error = kcf_get_hardware_provider(mech->cm_type,
+ CRYPTO_MECH_INVALID, CHECK_RESTRICT(crq), pd,
+ &real_provider, CRYPTO_FG_DIGEST);
+
+ if (error != CRYPTO_SUCCESS)
+ return (error);
+ }
+
+ /* Allocate and initialize the canonical context */
+ if ((ctx = kcf_new_ctx(crq, real_provider, sid)) == NULL) {
+ if (pd->pd_prov_type == CRYPTO_LOGICAL_PROVIDER)
+ KCF_PROV_REFRELE(real_provider);
+ return (CRYPTO_HOST_MEMORY);
+ }
+
+ /* The fast path for SW providers. */
+ if (CHECK_FASTPATH(crq, pd)) {
+ crypto_mechanism_t lmech;
+
+ lmech = *mech;
+ KCF_SET_PROVIDER_MECHNUM(mech->cm_type, real_provider, &lmech);
+ error = KCF_PROV_DIGEST_INIT(real_provider, ctx, &lmech,
+ KCF_SWFP_RHNDL(crq));
+ KCF_PROV_INCRSTATS(pd, error);
+ } else {
+ KCF_WRAP_DIGEST_OPS_PARAMS(&params, KCF_OP_INIT, sid,
+ mech, NULL, NULL, NULL);
+ error = kcf_submit_request(real_provider, ctx, crq, &params,
+ B_FALSE);
+ }
+
+ if (pd->pd_prov_type == CRYPTO_LOGICAL_PROVIDER)
+ KCF_PROV_REFRELE(real_provider);
+
+ if ((error == CRYPTO_SUCCESS) || (error == CRYPTO_QUEUED))
+ *ctxp = (crypto_context_t)ctx;
+ else {
+ /* Release the hold done in kcf_new_ctx(). */
+ KCF_CONTEXT_REFRELE((kcf_context_t *)ctx->cc_framework_private);
+ }
+
+ return (error);
+}
+
+/*
+ * Same as crypto_digest_init_prov(), but relies on the KCF scheduler
+ * to choose a provider. See crypto_digest_init_prov() comments for
+ * more information.
+ */
+int
+crypto_digest_init(crypto_mechanism_t *mech, crypto_context_t *ctxp,
+ crypto_call_req_t *crq)
+{
+ int error;
+ kcf_provider_desc_t *pd;
+ kcf_prov_tried_t *list = NULL;
+
+retry:
+ /* The pd is returned held */
+ if ((pd = kcf_get_mech_provider(mech->cm_type, NULL, &error,
+ list, CRYPTO_FG_DIGEST, CHECK_RESTRICT(crq), 0)) == NULL) {
+ if (list != NULL)
+ kcf_free_triedlist(list);
+ return (error);
+ }
+
+ if (pd->pd_prov_type == CRYPTO_HW_PROVIDER &&
+ (pd->pd_flags & CRYPTO_HASH_NO_UPDATE)) {
+ /*
+ * The hardware provider has limited digest support.
+		 * So, we fall back early here to using a software provider.
+ *
+ * XXX - need to enhance to do the fallback later in
+ * crypto_digest_update() if the size of accumulated input data
+ * exceeds the maximum size digestable by hardware provider.
+ */
+ error = CRYPTO_BUFFER_TOO_BIG;
+ } else {
+ error = crypto_digest_init_prov(pd, pd->pd_sid,
+ mech, ctxp, crq);
+ }
+
+ if (error != CRYPTO_SUCCESS && error != CRYPTO_QUEUED &&
+ IS_RECOVERABLE(error)) {
+ /* Add pd to the linked list of providers tried. */
+ if (kcf_insert_triedlist(&list, pd, KCF_KMFLAG(crq)) != NULL)
+ goto retry;
+ }
+
+ if (list != NULL)
+ kcf_free_triedlist(list);
+ KCF_PROV_REFRELE(pd);
+ return (error);
+}
+
+/*
+ * crypto_digest_update()
+ *
+ * Arguments:
+ * context: A crypto_context_t initialized by digest_init().
+ * data: The part of message to be digested.
+ * cr: crypto_call_req_t calling conditions and call back info.
+ *
+ * Description:
+ * Asynchronously submits a request for, or synchronously performs a
+ * part of a message digest operation.
+ *
+ * Context:
+ * Process or interrupt, according to the semantics dictated by the 'cr'.
+ *
+ * Returns:
+ * See comment in the beginning of the file.
+ */
+int
+crypto_digest_update(crypto_context_t context, crypto_data_t *data,
+ crypto_call_req_t *cr)
+{
+ crypto_ctx_t *ctx = (crypto_ctx_t *)context;
+ kcf_context_t *kcf_ctx;
+ kcf_provider_desc_t *pd;
+ int error;
+ kcf_req_params_t params;
+
+ if ((ctx == NULL) ||
+ ((kcf_ctx = (kcf_context_t *)ctx->cc_framework_private) == NULL) ||
+ ((pd = kcf_ctx->kc_prov_desc) == NULL)) {
+ return (CRYPTO_INVALID_CONTEXT);
+ }
+
+ ASSERT(pd->pd_prov_type != CRYPTO_LOGICAL_PROVIDER);
+
+ /* The fast path for SW providers. */
+ if (CHECK_FASTPATH(cr, pd)) {
+ error = KCF_PROV_DIGEST_UPDATE(pd, ctx, data, NULL);
+ KCF_PROV_INCRSTATS(pd, error);
+ } else {
+ KCF_WRAP_DIGEST_OPS_PARAMS(&params, KCF_OP_UPDATE,
+ ctx->cc_session, NULL, NULL, data, NULL);
+ error = kcf_submit_request(pd, ctx, cr, &params, B_FALSE);
+ }
+
+ return (error);
+}
+
+/*
+ * crypto_digest_final()
+ *
+ * Arguments:
+ * context: A crypto_context_t initialized by digest_init().
+ * digest: The storage for the digest.
+ * cr: crypto_call_req_t calling conditions and call back info.
+ *
+ * Description:
+ * Asynchronously submits a request for, or synchronously performs the
+ * final part of a message digest operation.
+ *
+ * Context:
+ * Process or interrupt, according to the semantics dictated by the 'cr'.
+ *
+ * Returns:
+ * See comment in the beginning of the file.
+ */
+int
+crypto_digest_final(crypto_context_t context, crypto_data_t *digest,
+ crypto_call_req_t *cr)
+{
+ crypto_ctx_t *ctx = (crypto_ctx_t *)context;
+ kcf_context_t *kcf_ctx;
+ kcf_provider_desc_t *pd;
+ int error;
+ kcf_req_params_t params;
+
+ if ((ctx == NULL) ||
+ ((kcf_ctx = (kcf_context_t *)ctx->cc_framework_private) == NULL) ||
+ ((pd = kcf_ctx->kc_prov_desc) == NULL)) {
+ return (CRYPTO_INVALID_CONTEXT);
+ }
+
+ ASSERT(pd->pd_prov_type != CRYPTO_LOGICAL_PROVIDER);
+
+ /* The fast path for SW providers. */
+ if (CHECK_FASTPATH(cr, pd)) {
+ error = KCF_PROV_DIGEST_FINAL(pd, ctx, digest, NULL);
+ KCF_PROV_INCRSTATS(pd, error);
+ } else {
+ KCF_WRAP_DIGEST_OPS_PARAMS(&params, KCF_OP_FINAL,
+ ctx->cc_session, NULL, NULL, NULL, digest);
+ error = kcf_submit_request(pd, ctx, cr, &params, B_FALSE);
+ }
+
+ /* Release the hold done in kcf_new_ctx() during init step. */
+ KCF_CONTEXT_COND_RELEASE(error, kcf_ctx);
+ return (error);
+}
+
+/*
+ * Performs a digest update on the specified key. Note that there is
+ * no k-API crypto_digest_key() equivalent of this function.
+ */
+int
+crypto_digest_key_prov(crypto_context_t context, crypto_key_t *key,
+ crypto_call_req_t *cr)
+{
+ crypto_ctx_t *ctx = (crypto_ctx_t *)context;
+ kcf_context_t *kcf_ctx;
+ kcf_provider_desc_t *pd;
+ int error;
+ kcf_req_params_t params;
+
+ if ((ctx == NULL) ||
+ ((kcf_ctx = (kcf_context_t *)ctx->cc_framework_private) == NULL) ||
+ ((pd = kcf_ctx->kc_prov_desc) == NULL)) {
+ return (CRYPTO_INVALID_CONTEXT);
+ }
+
+ ASSERT(pd->pd_prov_type != CRYPTO_LOGICAL_PROVIDER);
+
+ /* The fast path for SW providers. */
+ if (CHECK_FASTPATH(cr, pd)) {
+ error = KCF_PROV_DIGEST_KEY(pd, ctx, key, NULL);
+ KCF_PROV_INCRSTATS(pd, error);
+ } else {
+ KCF_WRAP_DIGEST_OPS_PARAMS(&params, KCF_OP_DIGEST_KEY,
+ ctx->cc_session, NULL, key, NULL, NULL);
+ error = kcf_submit_request(pd, ctx, cr, &params, B_FALSE);
+ }
+
+ return (error);
+}
+
+/*
+ * See comments for crypto_digest_update() and crypto_digest_final().
+ */
+int
+crypto_digest_single(crypto_context_t context, crypto_data_t *data,
+ crypto_data_t *digest, crypto_call_req_t *cr)
+{
+ crypto_ctx_t *ctx = (crypto_ctx_t *)context;
+ kcf_context_t *kcf_ctx;
+ kcf_provider_desc_t *pd;
+ int error;
+ kcf_req_params_t params;
+
+ if ((ctx == NULL) ||
+ ((kcf_ctx = (kcf_context_t *)ctx->cc_framework_private) == NULL) ||
+ ((pd = kcf_ctx->kc_prov_desc) == NULL)) {
+ return (CRYPTO_INVALID_CONTEXT);
+ }
+
+ /* The fast path for SW providers. */
+ if (CHECK_FASTPATH(cr, pd)) {
+ error = KCF_PROV_DIGEST(pd, ctx, data, digest, NULL);
+ KCF_PROV_INCRSTATS(pd, error);
+ } else {
+ KCF_WRAP_DIGEST_OPS_PARAMS(&params, KCF_OP_SINGLE, pd->pd_sid,
+ NULL, NULL, data, digest);
+ error = kcf_submit_request(pd, ctx, cr, &params, B_FALSE);
+ }
+
+ /* Release the hold done in kcf_new_ctx() during init step. */
+ KCF_CONTEXT_COND_RELEASE(error, kcf_ctx);
+ return (error);
+}
+
+#if defined(_KERNEL)
+EXPORT_SYMBOL(crypto_digest_prov);
+EXPORT_SYMBOL(crypto_digest);
+EXPORT_SYMBOL(crypto_digest_init_prov);
+EXPORT_SYMBOL(crypto_digest_init);
+EXPORT_SYMBOL(crypto_digest_update);
+EXPORT_SYMBOL(crypto_digest_final);
+EXPORT_SYMBOL(crypto_digest_key_prov);
+EXPORT_SYMBOL(crypto_digest_single);
+#endif
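A matching multi-part digest sketch (illustrative, not part of the patch), again assuming pre-filled crypto_data_t descriptors, synchronous (NULL call req) operation, and no error-path context teardown:

static int
digest_two_parts(crypto_mechanism_t *mech, crypto_data_t *part1,
    crypto_data_t *part2, crypto_data_t *digest)
{
	crypto_context_t ctx;
	int rv;

	rv = crypto_digest_init(mech, &ctx, NULL);
	if (rv != CRYPTO_SUCCESS)
		return (rv);

	rv = crypto_digest_update(ctx, part1, NULL);
	if (rv == CRYPTO_SUCCESS)
		rv = crypto_digest_update(ctx, part2, NULL);
	if (rv != CRYPTO_SUCCESS) {
		/* Error-path context teardown omitted in this sketch. */
		return (rv);
	}

	/* The final call also drops the context hold taken at init. */
	return (crypto_digest_final(ctx, digest, NULL));
}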
diff --git a/sys/contrib/openzfs/module/icp/api/kcf_mac.c b/sys/contrib/openzfs/module/icp/api/kcf_mac.c
new file mode 100644
index 000000000000..a7722d8f914c
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/api/kcf_mac.c
@@ -0,0 +1,645 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/crypto/common.h>
+#include <sys/crypto/impl.h>
+#include <sys/crypto/api.h>
+#include <sys/crypto/spi.h>
+#include <sys/crypto/sched_impl.h>
+
+/*
+ * Message authentication codes routines.
+ */
+
+/*
+ * The following are the possible returned values common to all the routines
+ * below. The applicability of some of these return values depends on the
+ * presence of the arguments.
+ *
+ * CRYPTO_SUCCESS: The operation completed successfully.
+ * CRYPTO_QUEUED: A request was submitted successfully. The callback
+ * routine will be called when the operation is done.
+ * CRYPTO_INVALID_MECH_NUMBER, CRYPTO_INVALID_MECH_PARAM, or
+ * CRYPTO_INVALID_MECH for problems with the 'mech'.
+ * CRYPTO_INVALID_DATA for bogus 'data'
+ * CRYPTO_HOST_MEMORY for failure to allocate memory to handle this work.
+ * CRYPTO_INVALID_CONTEXT: Not a valid context.
+ * CRYPTO_BUSY: Cannot process the request now. Schedule a
+ * crypto_bufcall(), or try later.
+ * CRYPTO_NOT_SUPPORTED and CRYPTO_MECH_NOT_SUPPORTED: No provider is
+ * capable of a function or a mechanism.
+ * CRYPTO_INVALID_KEY: bogus 'key' argument.
+ * CRYPTO_INVALID_MAC: bogus 'mac' argument.
+ */
+
+/*
+ * crypto_mac_prov()
+ *
+ * Arguments:
+ * mech: crypto_mechanism_t pointer.
+ * mech_type is a valid value previously returned by
+ * crypto_mech2id();
+ * When the mech's parameter is not NULL, its definition depends
+ * on the standard definition of the mechanism.
+ * key: pointer to a crypto_key_t structure.
+ * data: The message to compute the MAC for.
+ * mac: Storage for the MAC. The length needed depends on the mechanism.
+ * tmpl: a crypto_ctx_template_t, opaque template of a context of a
+ * MAC with the 'mech' using 'key'. 'tmpl' is created by
+ * a previous call to crypto_create_ctx_template().
+ * cr: crypto_call_req_t calling conditions and call back info.
+ *
+ * Description:
+ * Asynchronously submits a request for, or synchronously performs a
+ * single-part message authentication of 'data' with the mechanism
+ * 'mech', using the key 'key', on the specified provider with
+ * the specified session id.
+ * When complete and successful, 'mac' will contain the message
+ * authentication code.
+ *
+ * Context:
+ * Process or interrupt, according to the semantics dictated by the 'crq'.
+ *
+ * Returns:
+ * See comment in the beginning of the file.
+ */
+int
+crypto_mac_prov(crypto_provider_t provider, crypto_session_id_t sid,
+ crypto_mechanism_t *mech, crypto_data_t *data, crypto_key_t *key,
+ crypto_ctx_template_t tmpl, crypto_data_t *mac, crypto_call_req_t *crq)
+{
+ kcf_req_params_t params;
+ kcf_provider_desc_t *pd = provider;
+ kcf_provider_desc_t *real_provider = pd;
+ int rv;
+
+ ASSERT(KCF_PROV_REFHELD(pd));
+
+ if (pd->pd_prov_type == CRYPTO_LOGICAL_PROVIDER) {
+ rv = kcf_get_hardware_provider(mech->cm_type,
+ CRYPTO_MECH_INVALID, CHECK_RESTRICT(crq), pd,
+ &real_provider, CRYPTO_FG_MAC_ATOMIC);
+
+ if (rv != CRYPTO_SUCCESS)
+ return (rv);
+ }
+
+ KCF_WRAP_MAC_OPS_PARAMS(&params, KCF_OP_ATOMIC, sid, mech, key,
+ data, mac, tmpl);
+ rv = kcf_submit_request(real_provider, NULL, crq, &params, B_FALSE);
+ if (pd->pd_prov_type == CRYPTO_LOGICAL_PROVIDER)
+ KCF_PROV_REFRELE(real_provider);
+
+ return (rv);
+}
+
+/*
+ * Same as crypto_mac_prov(), but relies on the KCF scheduler to choose
+ * a provider. See crypto_mac_prov() comments for more information.
+ */
+int
+crypto_mac(crypto_mechanism_t *mech, crypto_data_t *data,
+ crypto_key_t *key, crypto_ctx_template_t tmpl, crypto_data_t *mac,
+ crypto_call_req_t *crq)
+{
+ int error;
+ kcf_mech_entry_t *me;
+ kcf_req_params_t params;
+ kcf_provider_desc_t *pd;
+ kcf_ctx_template_t *ctx_tmpl;
+ crypto_spi_ctx_template_t spi_ctx_tmpl = NULL;
+ kcf_prov_tried_t *list = NULL;
+
+retry:
+ /* The pd is returned held */
+ if ((pd = kcf_get_mech_provider(mech->cm_type, &me, &error,
+ list, CRYPTO_FG_MAC_ATOMIC, CHECK_RESTRICT(crq),
+ data->cd_length)) == NULL) {
+ if (list != NULL)
+ kcf_free_triedlist(list);
+ return (error);
+ }
+
+	/*
+	 * For SW providers, check the validity of the context template.
+	 * It is very rare that the generation number mismatches, so it
+	 * is acceptable to fail here and let the consumer recover by
+	 * freeing this tmpl and creating a new one for the key and the
+	 * new SW provider.
+	 */
+ if ((pd->pd_prov_type == CRYPTO_SW_PROVIDER) &&
+ ((ctx_tmpl = (kcf_ctx_template_t *)tmpl) != NULL)) {
+ if (ctx_tmpl->ct_generation != me->me_gen_swprov) {
+ if (list != NULL)
+ kcf_free_triedlist(list);
+ KCF_PROV_REFRELE(pd);
+ return (CRYPTO_OLD_CTX_TEMPLATE);
+ } else {
+ spi_ctx_tmpl = ctx_tmpl->ct_prov_tmpl;
+ }
+ }
+
+ /* The fast path for SW providers. */
+ if (CHECK_FASTPATH(crq, pd)) {
+ crypto_mechanism_t lmech;
+
+ lmech = *mech;
+ KCF_SET_PROVIDER_MECHNUM(mech->cm_type, pd, &lmech);
+
+ error = KCF_PROV_MAC_ATOMIC(pd, pd->pd_sid, &lmech, key, data,
+ mac, spi_ctx_tmpl, KCF_SWFP_RHNDL(crq));
+ KCF_PROV_INCRSTATS(pd, error);
+ } else {
+ if (pd->pd_prov_type == CRYPTO_HW_PROVIDER &&
+ (pd->pd_flags & CRYPTO_HASH_NO_UPDATE) &&
+ (data->cd_length > pd->pd_hash_limit)) {
+ /*
+ * XXX - We need a check to see if this is indeed
+ * a HMAC. So far, all kernel clients use
+ * this interface only for HMAC. So, this is fine
+ * for now.
+ */
+ error = CRYPTO_BUFFER_TOO_BIG;
+ } else {
+ KCF_WRAP_MAC_OPS_PARAMS(&params, KCF_OP_ATOMIC,
+ pd->pd_sid, mech, key, data, mac, spi_ctx_tmpl);
+
+ error = kcf_submit_request(pd, NULL, crq, &params,
+ KCF_ISDUALREQ(crq));
+ }
+ }
+
+ if (error != CRYPTO_SUCCESS && error != CRYPTO_QUEUED &&
+ IS_RECOVERABLE(error)) {
+ /* Add pd to the linked list of providers tried. */
+ if (kcf_insert_triedlist(&list, pd, KCF_KMFLAG(crq)) != NULL)
+ goto retry;
+ }
+
+ if (list != NULL)
+ kcf_free_triedlist(list);
+
+ KCF_PROV_REFRELE(pd);
+ return (error);
+}
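+
+/*
+ * Minimal usage sketch, for illustration only: a synchronous, single-part
+ * HMAC-SHA256 computation over a raw buffer via crypto_mac(). The function
+ * name and the KCF_USAGE_EXAMPLE guard are hypothetical and nothing below
+ * is built by default; the crypto_mechanism_t, crypto_key_t and
+ * crypto_data_t field usage follows the conventions of this framework, and
+ * the headers already included by this file provide the declarations.
+ */
+#ifdef KCF_USAGE_EXAMPLE
+static int
+example_hmac_sha256(const uint8_t *key_buf, size_t key_len,
+    const uint8_t *msg, size_t msg_len, uint8_t *mac_buf, size_t mac_len)
+{
+	crypto_mechanism_t mech;
+	crypto_key_t key;
+	crypto_data_t in, out;
+
+	/* Resolve the mechanism name to a mechanism type. */
+	mech.cm_type = crypto_mech2id(SUN_CKM_SHA256_HMAC);
+	if (mech.cm_type == CRYPTO_MECH_INVALID)
+		return (CRYPTO_MECHANISM_INVALID);
+	mech.cm_param = NULL;
+	mech.cm_param_len = 0;
+
+	/* Raw key; ck_length is expressed in bits. */
+	key.ck_format = CRYPTO_KEY_RAW;
+	key.ck_data = (void *)key_buf;
+	key.ck_length = CRYPTO_BYTES2BITS(key_len);
+
+	/* Input message. */
+	in.cd_format = CRYPTO_DATA_RAW;
+	in.cd_offset = 0;
+	in.cd_length = msg_len;
+	in.cd_raw.iov_base = (char *)msg;
+	in.cd_raw.iov_len = msg_len;
+
+	/* Output MAC; must be large enough for the full digest length. */
+	out.cd_format = CRYPTO_DATA_RAW;
+	out.cd_offset = 0;
+	out.cd_length = mac_len;
+	out.cd_raw.iov_base = (char *)mac_buf;
+	out.cd_raw.iov_len = mac_len;
+
+	/* NULL template and NULL call_req select the synchronous path. */
+	return (crypto_mac(&mech, &in, &key, NULL, &out, NULL));
+}
+#endif /* KCF_USAGE_EXAMPLE */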
+
+/*
+ * Single part operation to compute the MAC corresponding to the specified
+ * 'data' and to verify that it matches the MAC specified by 'mac'.
+ * The other arguments are the same as the function crypto_mac_prov().
+ */
+int
+crypto_mac_verify_prov(crypto_provider_t provider, crypto_session_id_t sid,
+ crypto_mechanism_t *mech, crypto_data_t *data, crypto_key_t *key,
+ crypto_ctx_template_t tmpl, crypto_data_t *mac, crypto_call_req_t *crq)
+{
+ kcf_req_params_t params;
+ kcf_provider_desc_t *pd = provider;
+ kcf_provider_desc_t *real_provider = pd;
+ int rv;
+
+ ASSERT(KCF_PROV_REFHELD(pd));
+
+ if (pd->pd_prov_type == CRYPTO_LOGICAL_PROVIDER) {
+ rv = kcf_get_hardware_provider(mech->cm_type,
+ CRYPTO_MECH_INVALID, CHECK_RESTRICT(crq), pd,
+ &real_provider, CRYPTO_FG_MAC_ATOMIC);
+
+ if (rv != CRYPTO_SUCCESS)
+ return (rv);
+ }
+
+ KCF_WRAP_MAC_OPS_PARAMS(&params, KCF_OP_MAC_VERIFY_ATOMIC, sid, mech,
+ key, data, mac, tmpl);
+ rv = kcf_submit_request(real_provider, NULL, crq, &params, B_FALSE);
+ if (pd->pd_prov_type == CRYPTO_LOGICAL_PROVIDER)
+ KCF_PROV_REFRELE(real_provider);
+
+ return (rv);
+}
+
+/*
+ * Same as crypto_mac_verify_prov(), but relies on the KCF scheduler to choose
+ * a provider. See crypto_mac_verify_prov() comments for more information.
+ */
+int
+crypto_mac_verify(crypto_mechanism_t *mech, crypto_data_t *data,
+ crypto_key_t *key, crypto_ctx_template_t tmpl, crypto_data_t *mac,
+ crypto_call_req_t *crq)
+{
+ int error;
+ kcf_mech_entry_t *me;
+ kcf_req_params_t params;
+ kcf_provider_desc_t *pd;
+ kcf_ctx_template_t *ctx_tmpl;
+ crypto_spi_ctx_template_t spi_ctx_tmpl = NULL;
+ kcf_prov_tried_t *list = NULL;
+
+retry:
+ /* The pd is returned held */
+ if ((pd = kcf_get_mech_provider(mech->cm_type, &me, &error,
+ list, CRYPTO_FG_MAC_ATOMIC, CHECK_RESTRICT(crq),
+ data->cd_length)) == NULL) {
+ if (list != NULL)
+ kcf_free_triedlist(list);
+ return (error);
+ }
+
+	/*
+	 * For SW providers, check the validity of the context template.
+	 * It is very rare that the generation number mismatches, so it
+	 * is acceptable to fail here and let the consumer recover by
+	 * freeing this tmpl and creating a new one for the key and the
+	 * new SW provider.
+	 */
+ if ((pd->pd_prov_type == CRYPTO_SW_PROVIDER) &&
+ ((ctx_tmpl = (kcf_ctx_template_t *)tmpl) != NULL)) {
+ if (ctx_tmpl->ct_generation != me->me_gen_swprov) {
+ if (list != NULL)
+ kcf_free_triedlist(list);
+ KCF_PROV_REFRELE(pd);
+ return (CRYPTO_OLD_CTX_TEMPLATE);
+ } else {
+ spi_ctx_tmpl = ctx_tmpl->ct_prov_tmpl;
+ }
+ }
+
+ /* The fast path for SW providers. */
+ if (CHECK_FASTPATH(crq, pd)) {
+ crypto_mechanism_t lmech;
+
+ lmech = *mech;
+ KCF_SET_PROVIDER_MECHNUM(mech->cm_type, pd, &lmech);
+
+ error = KCF_PROV_MAC_VERIFY_ATOMIC(pd, pd->pd_sid, &lmech, key,
+ data, mac, spi_ctx_tmpl, KCF_SWFP_RHNDL(crq));
+ KCF_PROV_INCRSTATS(pd, error);
+ } else {
+ if (pd->pd_prov_type == CRYPTO_HW_PROVIDER &&
+ (pd->pd_flags & CRYPTO_HASH_NO_UPDATE) &&
+ (data->cd_length > pd->pd_hash_limit)) {
+ /* see comments in crypto_mac() */
+ error = CRYPTO_BUFFER_TOO_BIG;
+ } else {
+ KCF_WRAP_MAC_OPS_PARAMS(&params,
+ KCF_OP_MAC_VERIFY_ATOMIC, pd->pd_sid, mech,
+ key, data, mac, spi_ctx_tmpl);
+
+ error = kcf_submit_request(pd, NULL, crq, &params,
+ KCF_ISDUALREQ(crq));
+ }
+ }
+
+ if (error != CRYPTO_SUCCESS && error != CRYPTO_QUEUED &&
+ IS_RECOVERABLE(error)) {
+ /* Add pd to the linked list of providers tried. */
+ if (kcf_insert_triedlist(&list, pd, KCF_KMFLAG(crq)) != NULL)
+ goto retry;
+ }
+
+ if (list != NULL)
+ kcf_free_triedlist(list);
+
+ KCF_PROV_REFRELE(pd);
+ return (error);
+}
+
+/*
+ * crypto_mac_init_prov()
+ *
+ * Arguments:
+ * pd: pointer to the descriptor of the provider to use for this
+ * operation.
+ * sid: provider session id.
+ * mech: crypto_mechanism_t pointer.
+ * mech_type is a valid value previously returned by
+ * crypto_mech2id();
+ * When the mech's parameter is not NULL, its definition depends
+ * on the standard definition of the mechanism.
+ * key: pointer to a crypto_key_t structure.
+ * tmpl: a crypto_ctx_template_t, opaque template of a context of a
+ * MAC with the 'mech' using 'key'. 'tmpl' is created by
+ * a previous call to crypto_create_ctx_template().
+ * ctxp: Pointer to a crypto_context_t.
+ * cr: crypto_call_req_t calling conditions and call back info.
+ *
+ * Description:
+ * Asynchronously submits a request for, or synchronously performs the
+ * initialization of a MAC operation on the specified provider with
+ * the specified session.
+ * When possible and applicable, will internally use the pre-computed MAC
+ * context from the context template, tmpl.
+ * When complete and successful, 'ctxp' will contain a crypto_context_t
+ * valid for later calls to mac_update() and mac_final().
+ * The caller should hold a reference on the specified provider
+ * descriptor before calling this function.
+ *
+ * Context:
+ * Process or interrupt, according to the semantics dictated by the 'cr'.
+ *
+ * Returns:
+ * See comment in the beginning of the file.
+ */
+int
+crypto_mac_init_prov(crypto_provider_t provider, crypto_session_id_t sid,
+ crypto_mechanism_t *mech, crypto_key_t *key, crypto_spi_ctx_template_t tmpl,
+ crypto_context_t *ctxp, crypto_call_req_t *crq)
+{
+ int rv;
+ crypto_ctx_t *ctx;
+ kcf_req_params_t params;
+ kcf_provider_desc_t *pd = provider;
+ kcf_provider_desc_t *real_provider = pd;
+
+ ASSERT(KCF_PROV_REFHELD(pd));
+
+ if (pd->pd_prov_type == CRYPTO_LOGICAL_PROVIDER) {
+ rv = kcf_get_hardware_provider(mech->cm_type,
+ CRYPTO_MECH_INVALID, CHECK_RESTRICT(crq), pd,
+ &real_provider, CRYPTO_FG_MAC);
+
+ if (rv != CRYPTO_SUCCESS)
+ return (rv);
+ }
+
+ /* Allocate and initialize the canonical context */
+ if ((ctx = kcf_new_ctx(crq, real_provider, sid)) == NULL) {
+ if (pd->pd_prov_type == CRYPTO_LOGICAL_PROVIDER)
+ KCF_PROV_REFRELE(real_provider);
+ return (CRYPTO_HOST_MEMORY);
+ }
+
+ /* The fast path for SW providers. */
+ if (CHECK_FASTPATH(crq, pd)) {
+ crypto_mechanism_t lmech;
+
+ lmech = *mech;
+ KCF_SET_PROVIDER_MECHNUM(mech->cm_type, real_provider, &lmech);
+ rv = KCF_PROV_MAC_INIT(real_provider, ctx, &lmech, key, tmpl,
+ KCF_SWFP_RHNDL(crq));
+ KCF_PROV_INCRSTATS(pd, rv);
+ } else {
+ KCF_WRAP_MAC_OPS_PARAMS(&params, KCF_OP_INIT, sid, mech, key,
+ NULL, NULL, tmpl);
+ rv = kcf_submit_request(real_provider, ctx, crq, &params,
+ B_FALSE);
+ }
+
+ if (pd->pd_prov_type == CRYPTO_LOGICAL_PROVIDER)
+ KCF_PROV_REFRELE(real_provider);
+
+ if ((rv == CRYPTO_SUCCESS) || (rv == CRYPTO_QUEUED))
+ *ctxp = (crypto_context_t)ctx;
+ else {
+ /* Release the hold done in kcf_new_ctx(). */
+ KCF_CONTEXT_REFRELE((kcf_context_t *)ctx->cc_framework_private);
+ }
+
+ return (rv);
+}
+
+/*
+ * Same as crypto_mac_init_prov(), but relies on the KCF scheduler to
+ * choose a provider. See crypto_mac_init_prov() comments for more
+ * information.
+ */
+int
+crypto_mac_init(crypto_mechanism_t *mech, crypto_key_t *key,
+ crypto_ctx_template_t tmpl, crypto_context_t *ctxp,
+ crypto_call_req_t *crq)
+{
+ int error;
+ kcf_mech_entry_t *me;
+ kcf_provider_desc_t *pd;
+ kcf_ctx_template_t *ctx_tmpl;
+ crypto_spi_ctx_template_t spi_ctx_tmpl = NULL;
+ kcf_prov_tried_t *list = NULL;
+
+retry:
+ /* The pd is returned held */
+ if ((pd = kcf_get_mech_provider(mech->cm_type, &me, &error,
+ list, CRYPTO_FG_MAC, CHECK_RESTRICT(crq), 0)) == NULL) {
+ if (list != NULL)
+ kcf_free_triedlist(list);
+ return (error);
+ }
+
+	/*
+	 * For SW providers, check the validity of the context template.
+	 * It is very rare that the generation number mismatches, so it
+	 * is acceptable to fail here and let the consumer recover by
+	 * freeing this tmpl and creating a new one for the key and the
+	 * new SW provider.
+	 */
+
+ if ((pd->pd_prov_type == CRYPTO_SW_PROVIDER) &&
+ ((ctx_tmpl = (kcf_ctx_template_t *)tmpl) != NULL)) {
+ if (ctx_tmpl->ct_generation != me->me_gen_swprov) {
+ if (list != NULL)
+ kcf_free_triedlist(list);
+ KCF_PROV_REFRELE(pd);
+ return (CRYPTO_OLD_CTX_TEMPLATE);
+ } else {
+ spi_ctx_tmpl = ctx_tmpl->ct_prov_tmpl;
+ }
+ }
+
+ if (pd->pd_prov_type == CRYPTO_HW_PROVIDER &&
+ (pd->pd_flags & CRYPTO_HASH_NO_UPDATE)) {
+ /*
+ * The hardware provider has limited HMAC support.
+ * So, we fallback early here to using a software provider.
+ *
+ * XXX - need to enhance to do the fallback later in
+ * crypto_mac_update() if the size of accumulated input data
+ * exceeds the maximum size digestable by hardware provider.
+ */
+ error = CRYPTO_BUFFER_TOO_BIG;
+ } else {
+ error = crypto_mac_init_prov(pd, pd->pd_sid, mech, key,
+ spi_ctx_tmpl, ctxp, crq);
+ }
+ if (error != CRYPTO_SUCCESS && error != CRYPTO_QUEUED &&
+ IS_RECOVERABLE(error)) {
+ /* Add pd to the linked list of providers tried. */
+ if (kcf_insert_triedlist(&list, pd, KCF_KMFLAG(crq)) != NULL)
+ goto retry;
+ }
+
+ if (list != NULL)
+ kcf_free_triedlist(list);
+
+ KCF_PROV_REFRELE(pd);
+ return (error);
+}
+
+/*
+ * crypto_mac_update()
+ *
+ * Arguments:
+ * context: A crypto_context_t initialized by mac_init().
+ * data: The message part to be MAC'ed
+ * cr: crypto_call_req_t calling conditions and call back info.
+ *
+ * Description:
+ * Asynchronously submits a request for, or synchronously performs a
+ * part of a MAC operation.
+ *
+ * Context:
+ * Process or interrupt, according to the semantics dictated by the 'cr'.
+ *
+ * Returns:
+ * See comment in the beginning of the file.
+ */
+int
+crypto_mac_update(crypto_context_t context, crypto_data_t *data,
+ crypto_call_req_t *cr)
+{
+ crypto_ctx_t *ctx = (crypto_ctx_t *)context;
+ kcf_context_t *kcf_ctx;
+ kcf_provider_desc_t *pd;
+ kcf_req_params_t params;
+ int rv;
+
+ if ((ctx == NULL) ||
+ ((kcf_ctx = (kcf_context_t *)ctx->cc_framework_private) == NULL) ||
+ ((pd = kcf_ctx->kc_prov_desc) == NULL)) {
+ return (CRYPTO_INVALID_CONTEXT);
+ }
+
+ ASSERT(pd->pd_prov_type != CRYPTO_LOGICAL_PROVIDER);
+
+ /* The fast path for SW providers. */
+ if (CHECK_FASTPATH(cr, pd)) {
+ rv = KCF_PROV_MAC_UPDATE(pd, ctx, data, NULL);
+ KCF_PROV_INCRSTATS(pd, rv);
+ } else {
+ KCF_WRAP_MAC_OPS_PARAMS(&params, KCF_OP_UPDATE,
+ ctx->cc_session, NULL, NULL, data, NULL, NULL);
+ rv = kcf_submit_request(pd, ctx, cr, &params, B_FALSE);
+ }
+
+ return (rv);
+}
+
+/*
+ * crypto_mac_final()
+ *
+ * Arguments:
+ * context: A crypto_context_t initialized by mac_init().
+ * mac: Storage for the message authentication code.
+ * cr: crypto_call_req_t calling conditions and call back info.
+ *
+ * Description:
+ * Asynchronously submits a request for, or synchronously performs a
+ * part of a message authentication operation.
+ *
+ * Context:
+ * Process or interrupt, according to the semantics dictated by the 'cr'.
+ *
+ * Returns:
+ * See comment in the beginning of the file.
+ */
+int
+crypto_mac_final(crypto_context_t context, crypto_data_t *mac,
+ crypto_call_req_t *cr)
+{
+ crypto_ctx_t *ctx = (crypto_ctx_t *)context;
+ kcf_context_t *kcf_ctx;
+ kcf_provider_desc_t *pd;
+ kcf_req_params_t params;
+ int rv;
+
+ if ((ctx == NULL) ||
+ ((kcf_ctx = (kcf_context_t *)ctx->cc_framework_private) == NULL) ||
+ ((pd = kcf_ctx->kc_prov_desc) == NULL)) {
+ return (CRYPTO_INVALID_CONTEXT);
+ }
+
+ ASSERT(pd->pd_prov_type != CRYPTO_LOGICAL_PROVIDER);
+
+ /* The fast path for SW providers. */
+ if (CHECK_FASTPATH(cr, pd)) {
+ rv = KCF_PROV_MAC_FINAL(pd, ctx, mac, NULL);
+ KCF_PROV_INCRSTATS(pd, rv);
+ } else {
+ KCF_WRAP_MAC_OPS_PARAMS(&params, KCF_OP_FINAL,
+ ctx->cc_session, NULL, NULL, NULL, mac, NULL);
+ rv = kcf_submit_request(pd, ctx, cr, &params, B_FALSE);
+ }
+
+ /* Release the hold done in kcf_new_ctx() during init step. */
+ KCF_CONTEXT_COND_RELEASE(rv, kcf_ctx);
+ return (rv);
+}
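+
+/*
+ * Minimal usage sketch, for illustration only: the multi-part flow
+ * crypto_mac_init() / crypto_mac_update() / crypto_mac_final() for a
+ * message that arrives in two pieces. The function name and the
+ * KCF_USAGE_EXAMPLE guard are hypothetical (not built by default); 'mech',
+ * 'key' and the crypto_data_t wrappers are assumed to be set up as in the
+ * single-part sketch after crypto_mac(). A NULL crypto_call_req_t makes
+ * each step synchronous.
+ */
+#ifdef KCF_USAGE_EXAMPLE
+static int
+example_mac_multipart(crypto_mechanism_t *mech, crypto_key_t *key,
+    crypto_data_t *part1, crypto_data_t *part2, crypto_data_t *mac)
+{
+	crypto_context_t ctx;
+	int rv;
+
+	rv = crypto_mac_init(mech, key, NULL, &ctx, NULL);
+	if (rv != CRYPTO_SUCCESS)
+		return (rv);
+
+	rv = crypto_mac_update(ctx, part1, NULL);
+	if (rv == CRYPTO_SUCCESS)
+		rv = crypto_mac_update(ctx, part2, NULL);
+	if (rv != CRYPTO_SUCCESS)
+		return (rv);	/* context cleanup on error is omitted here */
+
+	/* crypto_mac_final() releases the context hold taken at init. */
+	return (crypto_mac_final(ctx, mac, NULL));
+}
+#endif /* KCF_USAGE_EXAMPLE */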
+
+/*
+ * See comments for crypto_mac_update() and crypto_mac_final().
+ */
+int
+crypto_mac_single(crypto_context_t context, crypto_data_t *data,
+ crypto_data_t *mac, crypto_call_req_t *cr)
+{
+ crypto_ctx_t *ctx = (crypto_ctx_t *)context;
+ kcf_context_t *kcf_ctx;
+ kcf_provider_desc_t *pd;
+ int error;
+ kcf_req_params_t params;
+
+
+ if ((ctx == NULL) ||
+ ((kcf_ctx = (kcf_context_t *)ctx->cc_framework_private) == NULL) ||
+ ((pd = kcf_ctx->kc_prov_desc) == NULL)) {
+ return (CRYPTO_INVALID_CONTEXT);
+ }
+
+
+ /* The fast path for SW providers. */
+ if (CHECK_FASTPATH(cr, pd)) {
+ error = KCF_PROV_MAC(pd, ctx, data, mac, NULL);
+ KCF_PROV_INCRSTATS(pd, error);
+ } else {
+ KCF_WRAP_MAC_OPS_PARAMS(&params, KCF_OP_SINGLE, pd->pd_sid,
+ NULL, NULL, data, mac, NULL);
+ error = kcf_submit_request(pd, ctx, cr, &params, B_FALSE);
+ }
+
+ /* Release the hold done in kcf_new_ctx() during init step. */
+ KCF_CONTEXT_COND_RELEASE(error, kcf_ctx);
+ return (error);
+}
+
+#if defined(_KERNEL)
+EXPORT_SYMBOL(crypto_mac_prov);
+EXPORT_SYMBOL(crypto_mac);
+EXPORT_SYMBOL(crypto_mac_verify_prov);
+EXPORT_SYMBOL(crypto_mac_verify);
+EXPORT_SYMBOL(crypto_mac_init_prov);
+EXPORT_SYMBOL(crypto_mac_init);
+EXPORT_SYMBOL(crypto_mac_update);
+EXPORT_SYMBOL(crypto_mac_final);
+EXPORT_SYMBOL(crypto_mac_single);
+#endif
diff --git a/sys/contrib/openzfs/module/icp/api/kcf_miscapi.c b/sys/contrib/openzfs/module/icp/api/kcf_miscapi.c
new file mode 100644
index 000000000000..c0f415b264a7
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/api/kcf_miscapi.c
@@ -0,0 +1,127 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/crypto/common.h>
+#include <sys/crypto/api.h>
+#include <sys/crypto/impl.h>
+#include <sys/crypto/sched_impl.h>
+
+/*
+ * All event subscribers are put on a list. ntfy_list_lock
+ * protects changes to this list.
+ *
+ * The following locking order is maintained in the code: the
+ * global ntfy_list_lock is taken first, followed by the individual
+ * lock in a kcf_ntfy_elem structure (kn_lock).
+ */
+kmutex_t ntfy_list_lock;
+kcondvar_t ntfy_list_cv; /* cv the service thread waits on */
+static kcf_ntfy_elem_t *ntfy_list_head;
+
+/*
+ * crypto_mech2id()
+ *
+ * Arguments:
+ * . mechname: A null-terminated string identifying the mechanism name.
+ *
+ * Description:
+ * Walks the mechanism tables, looking for an entry that matches the
+ * mechname. Once it finds it, it builds the 64-bit mech_type and returns
+ * it. If there are no hardware or software providers for the mechanism,
+ * but there is an unloaded software provider, this routine will attempt
+ * to load it.
+ *
+ * Context:
+ * Process or interrupt context.
+ *
+ * Returns:
+ * The unique mechanism identified by 'mechname', if found.
+ * CRYPTO_MECH_INVALID otherwise.
+ */
+crypto_mech_type_t
+crypto_mech2id(char *mechname)
+{
+ return (crypto_mech2id_common(mechname, B_TRUE));
+}
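+
+/*
+ * Minimal usage sketch, for illustration only (the function name and the
+ * KCF_USAGE_EXAMPLE guard are hypothetical and not built by default):
+ * resolving a mechanism name string, typically done once and cached by the
+ * consumer. The SUN_CKM_* name strings come from sys/crypto/common.h.
+ */
+#ifdef KCF_USAGE_EXAMPLE
+static crypto_mech_type_t
+example_lookup_sha512_hmac(void)
+{
+	crypto_mech_type_t t = crypto_mech2id(SUN_CKM_SHA512_HMAC);
+
+	/* CRYPTO_MECH_INVALID means no provider supports this mechanism. */
+	return (t);
+}
+#endif /* KCF_USAGE_EXAMPLE */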
+
+/*
+ * We walk the notification list and do the callbacks.
+ */
+void
+kcf_walk_ntfylist(uint32_t event, void *event_arg)
+{
+ kcf_ntfy_elem_t *nep;
+ int nelem = 0;
+
+ mutex_enter(&ntfy_list_lock);
+
+ /*
+ * Count how many clients are on the notification list. We need
+	 * this count to ensure that clients which joined the list after we
+	 * started this walk are not wrongly notified.
+ */
+ for (nep = ntfy_list_head; nep != NULL; nep = nep->kn_next)
+ nelem++;
+
+ for (nep = ntfy_list_head; (nep != NULL && nelem); nep = nep->kn_next) {
+ nelem--;
+
+ /*
+ * Check if this client is interested in the
+ * event.
+ */
+ if (!(nep->kn_event_mask & event))
+ continue;
+
+ mutex_enter(&nep->kn_lock);
+ nep->kn_state = NTFY_RUNNING;
+ mutex_exit(&nep->kn_lock);
+ mutex_exit(&ntfy_list_lock);
+
+ /*
+ * We invoke the callback routine with no locks held. Another
+ * client could have joined the list meanwhile. This is fine
+ * as we maintain nelem as stated above. The NULL check in the
+ * for loop guards against shrinkage. Also, any callers of
+ * crypto_unnotify_events() at this point cv_wait till kn_state
+ * changes to NTFY_WAITING. Hence, nep is assured to be valid.
+ */
+ (*nep->kn_func)(event, event_arg);
+
+ mutex_enter(&nep->kn_lock);
+ nep->kn_state = NTFY_WAITING;
+ cv_broadcast(&nep->kn_cv);
+ mutex_exit(&nep->kn_lock);
+
+ mutex_enter(&ntfy_list_lock);
+ }
+
+ mutex_exit(&ntfy_list_lock);
+}
+
+#if defined(_KERNEL)
+EXPORT_SYMBOL(crypto_mech2id);
+#endif
diff --git a/sys/contrib/openzfs/module/icp/asm-x86_64/aes/THIRDPARTYLICENSE.gladman b/sys/contrib/openzfs/module/icp/asm-x86_64/aes/THIRDPARTYLICENSE.gladman
new file mode 100644
index 000000000000..48fea7bb333e
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/asm-x86_64/aes/THIRDPARTYLICENSE.gladman
@@ -0,0 +1,23 @@
+ ---------------------------------------------------------------------------
+ Copyright (c) 1998-2007, Brian Gladman, Worcester, UK. All rights reserved.
+
+ LICENSE TERMS
+
+ The free distribution and use of this software is allowed (with or without
+ changes) provided that:
+
+ 1. source code distributions include the above copyright notice, this
+ list of conditions and the following disclaimer;
+
+ 2. binary distributions include the above copyright notice, this list
+ of conditions and the following disclaimer in their documentation;
+
+ 3. the name of the copyright holder is not used to endorse products
+ built using this software without specific written permission.
+
+ DISCLAIMER
+
+ This software is provided 'as is' with no explicit or implied warranties
+ in respect of its properties, including, but not limited to, correctness
+ and/or fitness for purpose.
+ ---------------------------------------------------------------------------
diff --git a/sys/contrib/openzfs/module/icp/asm-x86_64/aes/THIRDPARTYLICENSE.gladman.descrip b/sys/contrib/openzfs/module/icp/asm-x86_64/aes/THIRDPARTYLICENSE.gladman.descrip
new file mode 100644
index 000000000000..5f822cf27586
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/asm-x86_64/aes/THIRDPARTYLICENSE.gladman.descrip
@@ -0,0 +1 @@
+PORTIONS OF AES FUNCTIONALITY
diff --git a/sys/contrib/openzfs/module/icp/asm-x86_64/aes/THIRDPARTYLICENSE.openssl b/sys/contrib/openzfs/module/icp/asm-x86_64/aes/THIRDPARTYLICENSE.openssl
new file mode 100644
index 000000000000..92c9e196a318
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/asm-x86_64/aes/THIRDPARTYLICENSE.openssl
@@ -0,0 +1,127 @@
+
+ LICENSE ISSUES
+ ==============
+
+ The OpenSSL toolkit stays under a dual license, i.e. both the conditions of
+ the OpenSSL License and the original SSLeay license apply to the toolkit.
+ See below for the actual license texts. Actually both licenses are BSD-style
+ Open Source licenses. In case of any license issues related to OpenSSL
+ please contact openssl-core@openssl.org.
+
+ OpenSSL License
+ ---------------
+
+/* ====================================================================
+ * Copyright (c) 1998-2008 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * openssl-core@openssl.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This product includes cryptographic software written by Eric Young
+ * (eay@cryptsoft.com). This product includes software written by Tim
+ * Hudson (tjh@cryptsoft.com).
+ *
+ */
+
+ Original SSLeay License
+ -----------------------
+
+/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
+ * All rights reserved.
+ *
+ * This package is an SSL implementation written
+ * by Eric Young (eay@cryptsoft.com).
+ * The implementation was written so as to conform with Netscapes SSL.
+ *
+ * This library is free for commercial and non-commercial use as long as
+ * the following conditions are aheared to. The following conditions
+ * apply to all code found in this distribution, be it the RC4, RSA,
+ * lhash, DES, etc., code; not just the SSL code. The SSL documentation
+ * included with this distribution is covered by the same copyright terms
+ * except that the holder is Tim Hudson (tjh@cryptsoft.com).
+ *
+ * Copyright remains Eric Young's, and as such any Copyright notices in
+ * the code are not to be removed.
+ * If this package is used in a product, Eric Young should be given attribution
+ * as the author of the parts of the library used.
+ * This can be in the form of a textual message at program startup or
+ * in documentation (online or textual) provided with the package.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * "This product includes cryptographic software written by
+ * Eric Young (eay@cryptsoft.com)"
+ * The word 'cryptographic' can be left out if the routines from the library
+ * being used are not cryptographic related :-).
+ * 4. If you include any Windows specific code (or a derivative thereof) from
+ * the apps directory (application code) you must include an acknowledgement:
+ * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * The licence and distribution terms for any publically available version or
+ * derivative of this code cannot be changed. i.e. this code cannot simply be
+ * copied and put under another distribution licence
+ * [including the GNU Public Licence.]
+ */
+
diff --git a/sys/contrib/openzfs/module/icp/asm-x86_64/aes/THIRDPARTYLICENSE.openssl.descrip b/sys/contrib/openzfs/module/icp/asm-x86_64/aes/THIRDPARTYLICENSE.openssl.descrip
new file mode 100644
index 000000000000..5f822cf27586
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/asm-x86_64/aes/THIRDPARTYLICENSE.openssl.descrip
@@ -0,0 +1 @@
+PORTIONS OF AES FUNCTIONALITY
diff --git a/sys/contrib/openzfs/module/icp/asm-x86_64/aes/aes_aesni.S b/sys/contrib/openzfs/module/icp/asm-x86_64/aes/aes_aesni.S
new file mode 100644
index 000000000000..4a80c62097ae
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/asm-x86_64/aes/aes_aesni.S
@@ -0,0 +1,748 @@
+/*
+ * ====================================================================
+ * Written by Intel Corporation for the OpenSSL project to add support
+ * for Intel AES-NI instructions. Rights for redistribution and usage
+ * in source and binary forms are granted according to the OpenSSL
+ * license.
+ *
+ * Author: Huang Ying <ying.huang at intel dot com>
+ * Vinodh Gopal <vinodh.gopal at intel dot com>
+ * Kahraman Akdemir
+ *
+ * Intel AES-NI is a new set of Single Instruction Multiple Data (SIMD)
+ * instructions that are going to be introduced in the next generation
+ * of Intel processor, as of 2009. These instructions enable fast and
+ * secure data encryption and decryption, using the Advanced Encryption
+ * Standard (AES), defined by FIPS Publication number 197. The
+ * architecture introduces six instructions that offer full hardware
+ * support for AES. Four of them support high performance data
+ * encryption and decryption, and the other two instructions support
+ * the AES key expansion procedure.
+ * ====================================================================
+ */
+
+/*
+ * ====================================================================
+ * Copyright (c) 1998-2008 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * openssl-core@openssl.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ */
+
+/*
+ * ====================================================================
+ * OpenSolaris OS modifications
+ *
+ * This source originates as files aes-intel.S and eng_aesni_asm.pl, in
+ * patches sent Dec. 9, 2008 and Dec. 24, 2008, respectively, by
+ * Huang Ying of Intel to the openssl-dev mailing list under the subject
+ * of "Add support to Intel AES-NI instruction set for x86_64 platform".
+ *
+ * This OpenSolaris version has these major changes from the original source:
+ *
+ * 1. Added OpenSolaris ENTRY_NP/SET_SIZE macros from
+ * /usr/include/sys/asm_linkage.h, lint(1B) guards, and dummy C function
+ * definitions for lint.
+ *
+ * 2. Formatted code, added comments, and added #includes and #defines.
+ *
+ * 3. If bit CR0.TS is set, clear and set the TS bit, after and before
+ * calling kpreempt_disable() and kpreempt_enable().
+ * If the TS bit is not set, save and restore %xmm registers at the beginning
+ * and end of function calls (%xmm* registers are not saved and restored
+ * during kernel thread preemption).
+ *
+ * 4. Renamed functions, reordered parameters, and changed return value
+ * to match OpenSolaris:
+ *
+ * OpenSSL interface:
+ * int intel_AES_set_encrypt_key(const unsigned char *userKey,
+ * const int bits, AES_KEY *key);
+ * int intel_AES_set_decrypt_key(const unsigned char *userKey,
+ * const int bits, AES_KEY *key);
+ * Return values for above are non-zero on error, 0 on success.
+ *
+ * void intel_AES_encrypt(const unsigned char *in, unsigned char *out,
+ * const AES_KEY *key);
+ * void intel_AES_decrypt(const unsigned char *in, unsigned char *out,
+ * const AES_KEY *key);
+ * typedef struct aes_key_st {
+ * unsigned int rd_key[4 *(AES_MAXNR + 1)];
+ * int rounds;
+ * unsigned int pad[3];
+ * } AES_KEY;
+ * Note: AES_LONG is undefined (that is, Intel uses 32-bit key schedules
+ * (ks32) instead of 64-bit (ks64)).
+ * Number of rounds (aka round count) is at offset 240 of AES_KEY.
+ *
+ * OpenSolaris OS interface (#ifdefs removed for readability):
+ * int rijndael_key_setup_dec_intel(uint32_t rk[],
+ * const uint32_t cipherKey[], uint64_t keyBits);
+ * int rijndael_key_setup_enc_intel(uint32_t rk[],
+ * const uint32_t cipherKey[], uint64_t keyBits);
+ * Return values for above are 0 on error, number of rounds on success.
+ *
+ * void aes_encrypt_intel(const aes_ks_t *ks, int Nr,
+ * const uint32_t pt[4], uint32_t ct[4]);
+ * void aes_decrypt_intel(const aes_ks_t *ks, int Nr,
+ * const uint32_t pt[4], uint32_t ct[4]);
+ * typedef union {uint64_t ks64[(MAX_AES_NR + 1) * 4];
+ * uint32_t ks32[(MAX_AES_NR + 1) * 4]; } aes_ks_t;
+ *
+ * typedef union {
+ * uint32_t ks32[((MAX_AES_NR) + 1) * (MAX_AES_NB)];
+ * } aes_ks_t;
+ * typedef struct aes_key {
+ * aes_ks_t encr_ks, decr_ks;
+ * long double align128;
+ * int flags, nr, type;
+ * } aes_key_t;
+ *
+ * Note: ks is the AES key schedule, Nr is number of rounds, pt is plain text,
+ * ct is crypto text, and MAX_AES_NR is 14.
+ * For the x86 64-bit architecture, OpenSolaris OS uses ks32 instead of ks64.
+ *
+ * Note2: aes_ks_t must be aligned on a 0 mod 128 byte boundary.
+ *
+ * ====================================================================
+ */
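+
+/*
+ * Minimal call sequence, for illustration only, of the OpenSolaris OS
+ * interface described above; variable names are hypothetical. Per the
+ * notes above, kernel callers must disable preemption around these calls,
+ * and the key schedule storage must be 128-byte aligned (alignment is not
+ * shown in this sketch):
+ *
+ *	const uint32_t cipherKey[8] = { 0 };	// 256-bit key material
+ *	uint32_t rk[4 * (14 + 1)];		// encryption key schedule
+ *	uint32_t pt[4] = { 0 }, ct[4];		// one 16-byte block
+ *	int nr;					// number of rounds
+ *
+ *	kpreempt_disable();
+ *	nr = rijndael_key_setup_enc_intel(rk, cipherKey, 256);
+ *	if (nr != 0)				// 0 indicates an error
+ *		aes_encrypt_intel(rk, nr, pt, ct);
+ *	kpreempt_enable();
+ */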
+
+
+#if defined(lint) || defined(__lint)
+
+#include <sys/types.h>
+
+/* ARGSUSED */
+void
+aes_encrypt_intel(const uint32_t rk[], int Nr, const uint32_t pt[4],
+ uint32_t ct[4]) {
+}
+/* ARGSUSED */
+void
+aes_decrypt_intel(const uint32_t rk[], int Nr, const uint32_t ct[4],
+ uint32_t pt[4]) {
+}
+/* ARGSUSED */
+int
+rijndael_key_setup_enc_intel(uint32_t rk[], const uint32_t cipherKey[],
+ uint64_t keyBits) {
+ return (0);
+}
+/* ARGSUSED */
+int
+rijndael_key_setup_dec_intel(uint32_t rk[], const uint32_t cipherKey[],
+ uint64_t keyBits) {
+ return (0);
+}
+
+
+#elif defined(HAVE_AES) /* guard by instruction set */
+
+#define _ASM
+#include <sys/asm_linkage.h>
+
+/*
+ * _key_expansion_128(), _key_expansion_192a(), _key_expansion_192b(),
+ * _key_expansion_256a(), _key_expansion_256b()
+ *
+ * Helper functions called by rijndael_key_setup_enc_intel().
+ * Also used indirectly by rijndael_key_setup_dec_intel().
+ *
+ * Input:
+ * %xmm0 User-provided cipher key
+ * %xmm1 Round constant
+ * Output:
+ * (%rcx) AES key
+ */
+
+ENTRY_NP2(_key_expansion_128, _key_expansion_256a)
+_key_expansion_128_local:
+_key_expansion_256a_local:
+ pshufd $0b11111111, %xmm1, %xmm1
+ shufps $0b00010000, %xmm0, %xmm4
+ pxor %xmm4, %xmm0
+ shufps $0b10001100, %xmm0, %xmm4
+ pxor %xmm4, %xmm0
+ pxor %xmm1, %xmm0
+ movups %xmm0, (%rcx)
+ add $0x10, %rcx
+ ret
+ nop
+SET_SIZE(_key_expansion_128)
+SET_SIZE(_key_expansion_256a)
+
+
+ENTRY_NP(_key_expansion_192a)
+_key_expansion_192a_local:
+ pshufd $0b01010101, %xmm1, %xmm1
+ shufps $0b00010000, %xmm0, %xmm4
+ pxor %xmm4, %xmm0
+ shufps $0b10001100, %xmm0, %xmm4
+ pxor %xmm4, %xmm0
+ pxor %xmm1, %xmm0
+
+ movups %xmm2, %xmm5
+ movups %xmm2, %xmm6
+ pslldq $4, %xmm5
+ pshufd $0b11111111, %xmm0, %xmm3
+ pxor %xmm3, %xmm2
+ pxor %xmm5, %xmm2
+
+ movups %xmm0, %xmm1
+ shufps $0b01000100, %xmm0, %xmm6
+ movups %xmm6, (%rcx)
+ shufps $0b01001110, %xmm2, %xmm1
+ movups %xmm1, 0x10(%rcx)
+ add $0x20, %rcx
+ ret
+SET_SIZE(_key_expansion_192a)
+
+
+ENTRY_NP(_key_expansion_192b)
+_key_expansion_192b_local:
+ pshufd $0b01010101, %xmm1, %xmm1
+ shufps $0b00010000, %xmm0, %xmm4
+ pxor %xmm4, %xmm0
+ shufps $0b10001100, %xmm0, %xmm4
+ pxor %xmm4, %xmm0
+ pxor %xmm1, %xmm0
+
+ movups %xmm2, %xmm5
+ pslldq $4, %xmm5
+ pshufd $0b11111111, %xmm0, %xmm3
+ pxor %xmm3, %xmm2
+ pxor %xmm5, %xmm2
+
+ movups %xmm0, (%rcx)
+ add $0x10, %rcx
+ ret
+SET_SIZE(_key_expansion_192b)
+
+
+ENTRY_NP(_key_expansion_256b)
+_key_expansion_256b_local:
+ pshufd $0b10101010, %xmm1, %xmm1
+ shufps $0b00010000, %xmm2, %xmm4
+ pxor %xmm4, %xmm2
+ shufps $0b10001100, %xmm2, %xmm4
+ pxor %xmm4, %xmm2
+ pxor %xmm1, %xmm2
+ movups %xmm2, (%rcx)
+ add $0x10, %rcx
+ ret
+SET_SIZE(_key_expansion_256b)
+
+
+/*
+ * rijndael_key_setup_enc_intel()
+ * Expand the cipher key into the encryption key schedule.
+ *
+ * For kernel code, caller is responsible for ensuring kpreempt_disable()
+ * has been called. This is because %xmm registers are not saved/restored.
+ * Clear and set the CR0.TS bit on entry and exit, respectively, if TS is set
+ * on entry. Otherwise, if TS is not set, save and restore %xmm registers
+ * on the stack.
+ *
+ * OpenSolaris interface:
+ * int rijndael_key_setup_enc_intel(uint32_t rk[], const uint32_t cipherKey[],
+ * uint64_t keyBits);
+ * Return value is 0 on error, number of rounds on success.
+ *
+ * Original Intel OpenSSL interface:
+ * int intel_AES_set_encrypt_key(const unsigned char *userKey,
+ * const int bits, AES_KEY *key);
+ * Return value is non-zero on error, 0 on success.
+ */
+
+#ifdef OPENSSL_INTERFACE
+#define rijndael_key_setup_enc_intel intel_AES_set_encrypt_key
+#define rijndael_key_setup_dec_intel intel_AES_set_decrypt_key
+
+#define USERCIPHERKEY rdi /* P1, 64 bits */
+#define KEYSIZE32 esi /* P2, 32 bits */
+#define KEYSIZE64 rsi /* P2, 64 bits */
+#define AESKEY rdx /* P3, 64 bits */
+
+#else /* OpenSolaris Interface */
+#define AESKEY rdi /* P1, 64 bits */
+#define USERCIPHERKEY rsi /* P2, 64 bits */
+#define KEYSIZE32 edx /* P3, 32 bits */
+#define KEYSIZE64 rdx /* P3, 64 bits */
+#endif /* OPENSSL_INTERFACE */
+
+#define ROUNDS32 KEYSIZE32 /* temp */
+#define ROUNDS64 KEYSIZE64 /* temp */
+#define ENDAESKEY USERCIPHERKEY /* temp */
+
+ENTRY_NP(rijndael_key_setup_enc_intel)
+rijndael_key_setup_enc_intel_local:
+ FRAME_BEGIN
+ // NULL pointer sanity check
+ test %USERCIPHERKEY, %USERCIPHERKEY
+ jz .Lenc_key_invalid_param
+ test %AESKEY, %AESKEY
+ jz .Lenc_key_invalid_param
+
+ movups (%USERCIPHERKEY), %xmm0 // user key (first 16 bytes)
+ movups %xmm0, (%AESKEY)
+ lea 0x10(%AESKEY), %rcx // key addr
+ pxor %xmm4, %xmm4 // xmm4 is assumed 0 in _key_expansion_x
+
+ cmp $256, %KEYSIZE32
+ jnz .Lenc_key192
+
+ // AES 256: 14 rounds in encryption key schedule
+#ifdef OPENSSL_INTERFACE
+ mov $14, %ROUNDS32
+ movl %ROUNDS32, 240(%AESKEY) // key.rounds = 14
+#endif /* OPENSSL_INTERFACE */
+
+ movups 0x10(%USERCIPHERKEY), %xmm2 // other user key (2nd 16 bytes)
+ movups %xmm2, (%rcx)
+ add $0x10, %rcx
+
+ aeskeygenassist $0x1, %xmm2, %xmm1 // expand the key
+ call _key_expansion_256a_local
+ aeskeygenassist $0x1, %xmm0, %xmm1
+ call _key_expansion_256b_local
+ aeskeygenassist $0x2, %xmm2, %xmm1 // expand the key
+ call _key_expansion_256a_local
+ aeskeygenassist $0x2, %xmm0, %xmm1
+ call _key_expansion_256b_local
+ aeskeygenassist $0x4, %xmm2, %xmm1 // expand the key
+ call _key_expansion_256a_local
+ aeskeygenassist $0x4, %xmm0, %xmm1
+ call _key_expansion_256b_local
+ aeskeygenassist $0x8, %xmm2, %xmm1 // expand the key
+ call _key_expansion_256a_local
+ aeskeygenassist $0x8, %xmm0, %xmm1
+ call _key_expansion_256b_local
+ aeskeygenassist $0x10, %xmm2, %xmm1 // expand the key
+ call _key_expansion_256a_local
+ aeskeygenassist $0x10, %xmm0, %xmm1
+ call _key_expansion_256b_local
+ aeskeygenassist $0x20, %xmm2, %xmm1 // expand the key
+ call _key_expansion_256a_local
+ aeskeygenassist $0x20, %xmm0, %xmm1
+ call _key_expansion_256b_local
+ aeskeygenassist $0x40, %xmm2, %xmm1 // expand the key
+ call _key_expansion_256a_local
+
+#ifdef OPENSSL_INTERFACE
+ xor %rax, %rax // return 0 (OK)
+#else /* OpenSolaris Interface */
+ mov $14, %rax // return # rounds = 14
+#endif
+ FRAME_END
+ ret
+
+.align 4
+.Lenc_key192:
+ cmp $192, %KEYSIZE32
+ jnz .Lenc_key128
+
+ // AES 192: 12 rounds in encryption key schedule
+#ifdef OPENSSL_INTERFACE
+ mov $12, %ROUNDS32
+ movl %ROUNDS32, 240(%AESKEY) // key.rounds = 12
+#endif /* OPENSSL_INTERFACE */
+
+ movq 0x10(%USERCIPHERKEY), %xmm2 // other user key
+ aeskeygenassist $0x1, %xmm2, %xmm1 // expand the key
+ call _key_expansion_192a_local
+ aeskeygenassist $0x2, %xmm2, %xmm1 // expand the key
+ call _key_expansion_192b_local
+ aeskeygenassist $0x4, %xmm2, %xmm1 // expand the key
+ call _key_expansion_192a_local
+ aeskeygenassist $0x8, %xmm2, %xmm1 // expand the key
+ call _key_expansion_192b_local
+ aeskeygenassist $0x10, %xmm2, %xmm1 // expand the key
+ call _key_expansion_192a_local
+ aeskeygenassist $0x20, %xmm2, %xmm1 // expand the key
+ call _key_expansion_192b_local
+ aeskeygenassist $0x40, %xmm2, %xmm1 // expand the key
+ call _key_expansion_192a_local
+ aeskeygenassist $0x80, %xmm2, %xmm1 // expand the key
+ call _key_expansion_192b_local
+
+#ifdef OPENSSL_INTERFACE
+ xor %rax, %rax // return 0 (OK)
+#else /* OpenSolaris Interface */
+ mov $12, %rax // return # rounds = 12
+#endif
+ FRAME_END
+ ret
+
+.align 4
+.Lenc_key128:
+ cmp $128, %KEYSIZE32
+ jnz .Lenc_key_invalid_key_bits
+
+ // AES 128: 10 rounds in encryption key schedule
+#ifdef OPENSSL_INTERFACE
+ mov $10, %ROUNDS32
+ movl %ROUNDS32, 240(%AESKEY) // key.rounds = 10
+#endif /* OPENSSL_INTERFACE */
+
+ aeskeygenassist $0x1, %xmm0, %xmm1 // expand the key
+ call _key_expansion_128_local
+ aeskeygenassist $0x2, %xmm0, %xmm1 // expand the key
+ call _key_expansion_128_local
+ aeskeygenassist $0x4, %xmm0, %xmm1 // expand the key
+ call _key_expansion_128_local
+ aeskeygenassist $0x8, %xmm0, %xmm1 // expand the key
+ call _key_expansion_128_local
+ aeskeygenassist $0x10, %xmm0, %xmm1 // expand the key
+ call _key_expansion_128_local
+ aeskeygenassist $0x20, %xmm0, %xmm1 // expand the key
+ call _key_expansion_128_local
+ aeskeygenassist $0x40, %xmm0, %xmm1 // expand the key
+ call _key_expansion_128_local
+ aeskeygenassist $0x80, %xmm0, %xmm1 // expand the key
+ call _key_expansion_128_local
+ aeskeygenassist $0x1b, %xmm0, %xmm1 // expand the key
+ call _key_expansion_128_local
+ aeskeygenassist $0x36, %xmm0, %xmm1 // expand the key
+ call _key_expansion_128_local
+
+#ifdef OPENSSL_INTERFACE
+ xor %rax, %rax // return 0 (OK)
+#else /* OpenSolaris Interface */
+ mov $10, %rax // return # rounds = 10
+#endif
+ FRAME_END
+ ret
+
+.Lenc_key_invalid_param:
+#ifdef OPENSSL_INTERFACE
+ mov $-1, %rax // user key or AES key pointer is NULL
+ FRAME_END
+ ret
+#else
+ /* FALLTHROUGH */
+#endif /* OPENSSL_INTERFACE */
+
+.Lenc_key_invalid_key_bits:
+#ifdef OPENSSL_INTERFACE
+ mov $-2, %rax // keysize is invalid
+#else /* OpenSolaris Interface */
+ xor %rax, %rax // a key pointer is NULL or invalid keysize
+#endif /* OPENSSL_INTERFACE */
+ FRAME_END
+ ret
+ SET_SIZE(rijndael_key_setup_enc_intel)
+
+
+/*
+ * rijndael_key_setup_dec_intel()
+ * Expand the cipher key into the decryption key schedule.
+ *
+ * For kernel code, caller is responsible for ensuring kpreempt_disable()
+ * has been called. This is because %xmm registers are not saved/restored.
+ * Clear and set the CR0.TS bit on entry and exit, respectively, if TS is set
+ * on entry. Otherwise, if TS is not set, save and restore %xmm registers
+ * on the stack.
+ *
+ * OpenSolaris interface:
+ * int rijndael_key_setup_dec_intel(uint32_t rk[], const uint32_t cipherKey[],
+ * uint64_t keyBits);
+ * Return value is 0 on error, number of rounds on success.
+ * P1->P2, P2->P3, P3->P1
+ *
+ * Original Intel OpenSSL interface:
+ * int intel_AES_set_decrypt_key(const unsigned char *userKey,
+ * const int bits, AES_KEY *key);
+ * Return value is non-zero on error, 0 on success.
+ */
+
+ENTRY_NP(rijndael_key_setup_dec_intel)
+FRAME_BEGIN
+ // Generate round keys used for encryption
+ call rijndael_key_setup_enc_intel_local
+ test %rax, %rax
+#ifdef OPENSSL_INTERFACE
+ jnz .Ldec_key_exit // Failed if returned non-0
+#else /* OpenSolaris Interface */
+ jz .Ldec_key_exit // Failed if returned 0
+#endif /* OPENSSL_INTERFACE */
+
+ /*
+ * Convert round keys used for encryption
+ * to a form usable for decryption
+ */
+#ifndef OPENSSL_INTERFACE /* OpenSolaris Interface */
+ mov %rax, %ROUNDS64 // set # rounds (10, 12, or 14)
+ // (already set for OpenSSL)
+#endif
+
+ lea 0x10(%AESKEY), %rcx // key addr
+ shl $4, %ROUNDS32
+ add %AESKEY, %ROUNDS64
+ mov %ROUNDS64, %ENDAESKEY
+
+.align 4
+.Ldec_key_reorder_loop:
+ movups (%AESKEY), %xmm0
+ movups (%ROUNDS64), %xmm1
+ movups %xmm0, (%ROUNDS64)
+ movups %xmm1, (%AESKEY)
+ lea 0x10(%AESKEY), %AESKEY
+ lea -0x10(%ROUNDS64), %ROUNDS64
+ cmp %AESKEY, %ROUNDS64
+ ja .Ldec_key_reorder_loop
+
+.align 4
+.Ldec_key_inv_loop:
+ movups (%rcx), %xmm0
+ // Convert an encryption round key to a form usable for decryption
+ // with the "AES Inverse Mix Columns" instruction
+ aesimc %xmm0, %xmm1
+ movups %xmm1, (%rcx)
+ lea 0x10(%rcx), %rcx
+ cmp %ENDAESKEY, %rcx
+ jnz .Ldec_key_inv_loop
+
+.Ldec_key_exit:
+ // OpenSolaris: rax = # rounds (10, 12, or 14) or 0 for error
+ // OpenSSL: rax = 0 for OK, or non-zero for error
+ FRAME_END
+ ret
+ SET_SIZE(rijndael_key_setup_dec_intel)
+
+
+/*
+ * aes_encrypt_intel()
+ * Encrypt a single block (in and out can overlap).
+ *
+ * For kernel code, caller is responsible for ensuring kpreempt_disable()
+ * has been called. This is because %xmm registers are not saved/restored.
+ * Clear and set the CR0.TS bit on entry and exit, respectively, if TS is set
+ * on entry. Otherwise, if TS is not set, save and restore %xmm registers
+ * on the stack.
+ *
+ * Temporary register usage:
+ * %xmm0 State
+ * %xmm1 Key
+ *
+ * Original OpenSolaris Interface:
+ * void aes_encrypt_intel(const aes_ks_t *ks, int Nr,
+ * const uint32_t pt[4], uint32_t ct[4])
+ *
+ * Original Intel OpenSSL Interface:
+ * void intel_AES_encrypt(const unsigned char *in, unsigned char *out,
+ * const AES_KEY *key)
+ */
+
+#ifdef OPENSSL_INTERFACE
+#define aes_encrypt_intel intel_AES_encrypt
+#define aes_decrypt_intel intel_AES_decrypt
+
+#define INP rdi /* P1, 64 bits */
+#define OUTP rsi /* P2, 64 bits */
+#define KEYP rdx /* P3, 64 bits */
+
+/* No NROUNDS parameter--offset 240 from KEYP saved in %ecx: */
+#define NROUNDS32 ecx /* temporary, 32 bits */
+#define NROUNDS cl /* temporary, 8 bits */
+
+#else /* OpenSolaris Interface */
+#define KEYP rdi /* P1, 64 bits */
+#define NROUNDS esi /* P2, 32 bits */
+#define INP rdx /* P3, 64 bits */
+#define OUTP rcx /* P4, 64 bits */
+#endif /* OPENSSL_INTERFACE */
+
+#define STATE xmm0 /* temporary, 128 bits */
+#define KEY xmm1 /* temporary, 128 bits */
+
+
+ENTRY_NP(aes_encrypt_intel)
+
+ movups (%INP), %STATE // input
+ movups (%KEYP), %KEY // key
+#ifdef OPENSSL_INTERFACE
+ mov 240(%KEYP), %NROUNDS32 // round count
+#else /* OpenSolaris Interface */
+ /* Round count is already present as P2 in %rsi/%esi */
+#endif /* OPENSSL_INTERFACE */
+
+ pxor %KEY, %STATE // round 0
+ lea 0x30(%KEYP), %KEYP
+ cmp $12, %NROUNDS
+ jb .Lenc128
+ lea 0x20(%KEYP), %KEYP
+ je .Lenc192
+
+ // AES 256
+ lea 0x20(%KEYP), %KEYP
+ movups -0x60(%KEYP), %KEY
+ aesenc %KEY, %STATE
+ movups -0x50(%KEYP), %KEY
+ aesenc %KEY, %STATE
+
+.align 4
+.Lenc192:
+ // AES 192 and 256
+ movups -0x40(%KEYP), %KEY
+ aesenc %KEY, %STATE
+ movups -0x30(%KEYP), %KEY
+ aesenc %KEY, %STATE
+
+.align 4
+.Lenc128:
+ // AES 128, 192, and 256
+ movups -0x20(%KEYP), %KEY
+ aesenc %KEY, %STATE
+ movups -0x10(%KEYP), %KEY
+ aesenc %KEY, %STATE
+ movups (%KEYP), %KEY
+ aesenc %KEY, %STATE
+ movups 0x10(%KEYP), %KEY
+ aesenc %KEY, %STATE
+ movups 0x20(%KEYP), %KEY
+ aesenc %KEY, %STATE
+ movups 0x30(%KEYP), %KEY
+ aesenc %KEY, %STATE
+ movups 0x40(%KEYP), %KEY
+ aesenc %KEY, %STATE
+ movups 0x50(%KEYP), %KEY
+ aesenc %KEY, %STATE
+ movups 0x60(%KEYP), %KEY
+ aesenc %KEY, %STATE
+ movups 0x70(%KEYP), %KEY
+ aesenclast %KEY, %STATE // last round
+ movups %STATE, (%OUTP) // output
+
+ ret
+ SET_SIZE(aes_encrypt_intel)
+
+
+/*
+ * aes_decrypt_intel()
+ * Decrypt a single block (in and out can overlap).
+ *
+ * For kernel code, caller is responsible for ensuring kpreempt_disable()
+ * has been called. This is because %xmm registers are not saved/restored.
+ * Clear and set the CR0.TS bit on entry and exit, respectively, if TS is set
+ * on entry. Otherwise, if TS is not set, save and restore %xmm registers
+ * on the stack.
+ *
+ * Temporary register usage:
+ * %xmm0 State
+ * %xmm1 Key
+ *
+ * Original OpenSolaris Interface:
+ * void aes_decrypt_intel(const aes_ks_t *ks, int Nr,
+ * const uint32_t pt[4], uint32_t ct[4])
+ *
+ * Original Intel OpenSSL Interface:
+ * void intel_AES_decrypt(const unsigned char *in, unsigned char *out,
+ * const AES_KEY *key);
+ */
+ENTRY_NP(aes_decrypt_intel)
+
+ movups (%INP), %STATE // input
+ movups (%KEYP), %KEY // key
+#ifdef OPENSSL_INTERFACE
+ mov 240(%KEYP), %NROUNDS32 // round count
+#else /* OpenSolaris Interface */
+ /* Round count is already present as P2 in %rsi/%esi */
+#endif /* OPENSSL_INTERFACE */
+
+ pxor %KEY, %STATE // round 0
+ lea 0x30(%KEYP), %KEYP
+ cmp $12, %NROUNDS
+ jb .Ldec128
+ lea 0x20(%KEYP), %KEYP
+ je .Ldec192
+
+ // AES 256
+ lea 0x20(%KEYP), %KEYP
+ movups -0x60(%KEYP), %KEY
+ aesdec %KEY, %STATE
+ movups -0x50(%KEYP), %KEY
+ aesdec %KEY, %STATE
+
+.align 4
+.Ldec192:
+ // AES 192 and 256
+ movups -0x40(%KEYP), %KEY
+ aesdec %KEY, %STATE
+ movups -0x30(%KEYP), %KEY
+ aesdec %KEY, %STATE
+
+.align 4
+.Ldec128:
+ // AES 128, 192, and 256
+ movups -0x20(%KEYP), %KEY
+ aesdec %KEY, %STATE
+ movups -0x10(%KEYP), %KEY
+ aesdec %KEY, %STATE
+ movups (%KEYP), %KEY
+ aesdec %KEY, %STATE
+ movups 0x10(%KEYP), %KEY
+ aesdec %KEY, %STATE
+ movups 0x20(%KEYP), %KEY
+ aesdec %KEY, %STATE
+ movups 0x30(%KEYP), %KEY
+ aesdec %KEY, %STATE
+ movups 0x40(%KEYP), %KEY
+ aesdec %KEY, %STATE
+ movups 0x50(%KEYP), %KEY
+ aesdec %KEY, %STATE
+ movups 0x60(%KEYP), %KEY
+ aesdec %KEY, %STATE
+ movups 0x70(%KEYP), %KEY
+ aesdeclast %KEY, %STATE // last round
+ movups %STATE, (%OUTP) // output
+
+ ret
+ SET_SIZE(aes_decrypt_intel)
+
+#endif /* lint || __lint */
+
+#ifdef __ELF__
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/sys/contrib/openzfs/module/icp/asm-x86_64/aes/aes_amd64.S b/sys/contrib/openzfs/module/icp/asm-x86_64/aes/aes_amd64.S
new file mode 100644
index 000000000000..9db3a3179230
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/asm-x86_64/aes/aes_amd64.S
@@ -0,0 +1,906 @@
+/*
+ * ---------------------------------------------------------------------------
+ * Copyright (c) 1998-2007, Brian Gladman, Worcester, UK. All rights reserved.
+ *
+ * LICENSE TERMS
+ *
+ * The free distribution and use of this software is allowed (with or without
+ * changes) provided that:
+ *
+ * 1. source code distributions include the above copyright notice, this
+ * list of conditions and the following disclaimer;
+ *
+ * 2. binary distributions include the above copyright notice, this list
+ * of conditions and the following disclaimer in their documentation;
+ *
+ * 3. the name of the copyright holder is not used to endorse products
+ * built using this software without specific written permission.
+ *
+ * DISCLAIMER
+ *
+ * This software is provided 'as is' with no explicit or implied warranties
+ * in respect of its properties, including, but not limited to, correctness
+ * and/or fitness for purpose.
+ * ---------------------------------------------------------------------------
+ * Issue 20/12/2007
+ *
+ * I am grateful to Dag Arne Osvik for many discussions of the techniques that
+ * can be used to optimise AES assembler code on AMD64/EM64T architectures.
+ * Some of the techniques used in this implementation are the result of
+ * suggestions made by him for which I am most grateful.
+ *
+ * An AES implementation for AMD64 processors using the YASM assembler. This
+ * implementation provides only encryption, decryption and hence requires key
+ * scheduling support in C. It uses 8k bytes of tables but its encryption and
+ * decryption performance is very close to that obtained using large tables.
+ * It can use either MS Windows or Gnu/Linux/OpenSolaris OS calling conventions,
+ * which are as follows:
+ * ms windows gnu/linux/opensolaris os
+ *
+ * in_blk rcx rdi
+ * out_blk rdx rsi
+ * context (cx) r8 rdx
+ *
+ * preserved rsi - + rbx, rbp, rsp, r12, r13, r14 & r15
+ * registers rdi - on both
+ *
+ * destroyed - rsi + rax, rcx, rdx, r8, r9, r10 & r11
+ * registers - rdi on both
+ *
+ * The convention used here is that for gnu/linux/opensolaris os.
+ *
+ * This code provides the standard AES block size (128 bits, 16 bytes) and the
+ * three standard AES key sizes (128, 192 and 256 bits). It has the same call
+ * interface as my C implementation. It uses the Microsoft C AMD64 calling
+ * conventions in which the three parameters are placed in rcx, rdx and r8
+ * respectively. The rbx, rsi, rdi, rbp and r12..r15 registers are preserved.
+ *
+ * OpenSolaris Note:
+ * Modified to use GNU/Linux/Solaris calling conventions.
+ * That is parameters are placed in rdi, rsi, rdx, and rcx, respectively.
+ *
+ * AES_RETURN aes_encrypt(const unsigned char in_blk[],
+ * unsigned char out_blk[], const aes_encrypt_ctx cx[1]);
+ *
+ * AES_RETURN aes_decrypt(const unsigned char in_blk[],
+ * unsigned char out_blk[], const aes_decrypt_ctx cx[1]);
+ *
+ * AES_RETURN aes_encrypt_key<NNN>(const unsigned char key[],
+ * const aes_encrypt_ctx cx[1]);
+ *
+ * AES_RETURN aes_decrypt_key<NNN>(const unsigned char key[],
+ * const aes_decrypt_ctx cx[1])/
+ *
+ * AES_RETURN aes_encrypt_key(const unsigned char key[],
+ * unsigned int len, const aes_decrypt_ctx cx[1])/
+ *
+ * AES_RETURN aes_decrypt_key(const unsigned char key[],
+ * unsigned int len, const aes_decrypt_ctx cx[1])/
+ *
+ * where <NNN> is 128, 192 or 256. In the last two calls the length can be in
+ * either bits or bytes.
+ *
+ * Comment in/out the following lines to obtain the desired subroutines. These
+ * selections MUST match those in the C header file aesopt.h
+ */
+#define AES_REV_DKS /* define if key decryption schedule is reversed */
+
+#define LAST_ROUND_TABLES /* define for the faster version using extra tables */
+
+/*
+ * The encryption key schedule has the following in memory layout where N is the
+ * number of rounds (10, 12 or 14):
+ *
+ * lo: | input key (round 0) | / each round is four 32-bit words
+ * | encryption round 1 |
+ * | encryption round 2 |
+ * ....
+ * | encryption round N-1 |
+ * hi: | encryption round N |
+ *
+ * The decryption key schedule is normally set up so that it has the same
+ * layout as above by actually reversing the order of the encryption key
+ * schedule in memory (this happens when AES_REV_DKS is set):
+ *
+ * lo: | decryption round 0 | = | encryption round N |
+ * | decryption round 1 | = INV_MIX_COL[ | encryption round N-1 | ]
+ * | decryption round 2 | = INV_MIX_COL[ | encryption round N-2 | ]
+ * .... ....
+ * | decryption round N-1 | = INV_MIX_COL[ | encryption round 1 | ]
+ * hi: | decryption round N | = | input key (round 0) |
+ *
+ * with rounds except the first and last modified using inv_mix_column().
+ * But if AES_REV_DKS is NOT set the order of keys is left as it is for
+ * encryption so that it has to be accessed in reverse when used for
+ * decryption (although the inverse mix column modifications are done):
+ *
+ * lo: | decryption round 0 | = | input key (round 0) |
+ * | decryption round 1 | = INV_MIX_COL[ | encryption round 1 | ]
+ * | decryption round 2 | = INV_MIX_COL[ | encryption round 2 | ]
+ * .... ....
+ * | decryption round N-1 | = INV_MIX_COL[ | encryption round N-1 | ]
+ * hi: | decryption round N | = | encryption round N |
+ *
+ * This layout is faster when the assembler key scheduling provided here
+ * is used.
+ *
+ * End of user defines
+ */
+
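As an illustration of the ordering part of the AES_REV_DKS layout described above (the inverse mix column step aside), reversing a forward-order schedule amounts to swapping whole 16-byte round keys end for end. A minimal C sketch with a hypothetical helper, not part of this file; the real reversal is folded into the key expansion in aeskey.c via its v() macro:

    #include <stdint.h>
    #include <string.h>

    /* Swap whole 4-word round keys: round 0 <-> round nr, 1 <-> nr-1, ... */
    static void
    reverse_round_keys(uint32_t rk[], int nr)
    {
            for (int lo = 0, hi = nr; lo < hi; lo++, hi--) {
                    uint32_t tmp[4];

                    memcpy(tmp, &rk[4 * lo], sizeof (tmp));
                    memcpy(&rk[4 * lo], &rk[4 * hi], sizeof (tmp));
                    memcpy(&rk[4 * hi], tmp, sizeof (tmp));
            }
    }
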
+/*
+ * ---------------------------------------------------------------------------
+ * OpenSolaris OS modifications
+ *
+ * This source originates from Brian Gladman file aes_amd64.asm
+ * in http://fp.gladman.plus.com/AES/aes-src-04-03-08.zip
+ * with these changes:
+ *
+ * 1. Removed MS Windows-specific code within DLL_EXPORT, _SEH_, and
+ * !__GNUC__ ifdefs. Also removed ENCRYPTION, DECRYPTION,
+ * AES_128, AES_192, AES_256, AES_VAR ifdefs.
+ *
+ * 2. Translate yasm/nasm %define and .macro definitions to cpp(1) #define
+ *
+ * 3. Translate yasm/nasm %ifdef/%ifndef to cpp(1) #ifdef
+ *
+ * 4. Translate Intel/yasm/nasm syntax to ATT/OpenSolaris as(1) syntax
+ * (operands reversed, literals prefixed with "$", registers prefixed with "%",
+ * "[register+offset]" addressing changed to "offset(register)",
+ * parentheses in constant expressions "()" changed to square brackets "[]",
+ * "." removed from local (numeric) labels, and other changes).
+ * Examples:
+ * Intel/yasm/nasm Syntax ATT/OpenSolaris Syntax
+ * mov rax,(4*20h) mov $[4*0x20],%rax
+ * mov rax,[ebx+20h] mov 0x20(%ebx),%rax
+ * lea rax,[ebx+ecx] lea (%ebx,%ecx),%rax
+ * sub rax,[ebx+ecx*4-20h] sub -0x20(%ebx,%ecx,4),%rax
+ *
+ * 5. Added OpenSolaris ENTRY_NP/SET_SIZE macros from
+ * /usr/include/sys/asm_linkage.h, lint(1B) guards, and dummy C function
+ * definitions for lint.
+ *
+ * 6. Renamed functions and reordered parameters to match OpenSolaris:
+ * Original Gladman interface:
+ * int aes_encrypt(const unsigned char *in,
+ * unsigned char *out, const aes_encrypt_ctx cx[1])/
+ * int aes_decrypt(const unsigned char *in,
+ *	unsigned char *out, const aes_decrypt_ctx cx[1])/
+ * Note: aes_encrypt_ctx contains ks, a 60 element array of uint32_t,
+ * and a union type, inf., containing inf.l, a uint32_t and
+ *	inf.b, a 4-element array of uint32_t. Only b[0] in the array (aka "l") is
+ *	used and contains the number of rounds multiplied by 16, where the number
+ *	of rounds is 10, 12, or 14.
+ *
+ * OpenSolaris OS interface:
+ * void aes_encrypt_amd64(const aes_ks_t *ks, int Nr,
+ * const uint32_t pt[4], uint32_t ct[4])/
+ * void aes_decrypt_amd64(const aes_ks_t *ks, int Nr,
+ * const uint32_t pt[4], uint32_t ct[4])/
+ * typedef union {uint64_t ks64[(MAX_AES_NR + 1) * 4]/
+ * uint32_t ks32[(MAX_AES_NR + 1) * 4]/ } aes_ks_t/
+ * Note: ks is the AES key schedule, Nr is number of rounds, pt is plain text,
+ * ct is crypto text, and MAX_AES_NR is 14.
+ * For the x86 64-bit architecture, OpenSolaris OS uses ks32 instead of ks64.
+ */
+
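For reference, a minimal caller of the OpenSolaris-style interface documented above could look like the following C sketch (assumptions: the prototypes match the lint stubs below, MAX_AES_NR is 14, and the schedule comes from rijndael_key_setup_enc_amd64() in aeskey.c):

    #include <stdint.h>

    #define MAX_AES_NR      14

    typedef union {
            uint64_t ks64[(MAX_AES_NR + 1) * 4];
            uint32_t ks32[(MAX_AES_NR + 1) * 4];
    } aes_ks_t;

    void aes_encrypt_amd64(const uint32_t rk[], int Nr, const uint32_t pt[4],
        uint32_t ct[4]);
    int rijndael_key_setup_enc_amd64(uint32_t rk[], const uint32_t cipherKey[],
        int keyBits);

    static void
    encrypt_block_128(const uint32_t key[4], const uint32_t pt[4], uint32_t ct[4])
    {
            aes_ks_t ks;
            int nr;

            /* Expand the 128-bit key; the return value is the round count. */
            nr = rijndael_key_setup_enc_amd64(ks.ks32, key, 128);
            aes_encrypt_amd64(ks.ks32, nr, pt, ct);
    }
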
+#if defined(lint) || defined(__lint)
+
+#include <sys/types.h>
+/* ARGSUSED */
+void
+aes_encrypt_amd64(const uint32_t rk[], int Nr, const uint32_t pt[4],
+ uint32_t ct[4]) {
+}
+/* ARGSUSED */
+void
+aes_decrypt_amd64(const uint32_t rk[], int Nr, const uint32_t ct[4],
+ uint32_t pt[4]) {
+}
+
+
+#else
+
+#define _ASM
+#include <sys/asm_linkage.h>
+
+#define KS_LENGTH 60
+
+#define raxd eax
+#define rdxd edx
+#define rcxd ecx
+#define rbxd ebx
+#define rsid esi
+#define rdid edi
+
+#define raxb al
+#define rdxb dl
+#define rcxb cl
+#define rbxb bl
+#define rsib sil
+#define rdib dil
+
+// finite field multiplies by {02}, {04} and {08}
+
+#define f2(x) [[x<<1]^[[[x>>7]&1]*0x11b]]
+#define f4(x) [[x<<2]^[[[x>>6]&1]*0x11b]^[[[x>>6]&2]*0x11b]]
+#define f8(x) [[x<<3]^[[[x>>5]&1]*0x11b]^[[[x>>5]&2]*0x11b]^[[[x>>5]&4]*0x11b]]
+
+// finite field multiplies required in table generation
+
+#define f3(x) [[f2(x)] ^ [x]]
+#define f9(x) [[f8(x)] ^ [x]]
+#define fb(x) [[f8(x)] ^ [f2(x)] ^ [x]]
+#define fd(x) [[f8(x)] ^ [f4(x)] ^ [x]]
+#define fe(x) [[f8(x)] ^ [f4(x)] ^ [f2(x)]]
+
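The bracketed constant expressions above are GF(2^8) products reduced modulo the AES polynomial x^8 + x^4 + x^3 + x + 1 (0x11b). An equivalent C sketch of the {02} and {03} multiples, for clarity only (the tables below are generated from the macro forms at assembly time):

    #include <stdint.h>

    static uint8_t
    gf_xtime(uint8_t x)             /* f2(x): multiply by {02} */
    {
            return ((uint8_t)((x << 1) ^ (((x >> 7) & 1) * 0x1b)));
    }

    static uint8_t
    gf_mul3(uint8_t x)              /* f3(x) = f2(x) ^ x: multiply by {03} */
    {
            return ((uint8_t)(gf_xtime(x) ^ x));
    }
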
+// macros for expanding S-box data
+
+#define u8(x) [f2(x)], [x], [x], [f3(x)], [f2(x)], [x], [x], [f3(x)]
+#define v8(x) [fe(x)], [f9(x)], [fd(x)], [fb(x)], [fe(x)], [f9(x)], [fd(x)], [x]
+#define w8(x) [x], 0, 0, 0, [x], 0, 0, 0
+
+#define enc_vals(x) \
+ .byte x(0x63),x(0x7c),x(0x77),x(0x7b),x(0xf2),x(0x6b),x(0x6f),x(0xc5); \
+ .byte x(0x30),x(0x01),x(0x67),x(0x2b),x(0xfe),x(0xd7),x(0xab),x(0x76); \
+ .byte x(0xca),x(0x82),x(0xc9),x(0x7d),x(0xfa),x(0x59),x(0x47),x(0xf0); \
+ .byte x(0xad),x(0xd4),x(0xa2),x(0xaf),x(0x9c),x(0xa4),x(0x72),x(0xc0); \
+ .byte x(0xb7),x(0xfd),x(0x93),x(0x26),x(0x36),x(0x3f),x(0xf7),x(0xcc); \
+ .byte x(0x34),x(0xa5),x(0xe5),x(0xf1),x(0x71),x(0xd8),x(0x31),x(0x15); \
+ .byte x(0x04),x(0xc7),x(0x23),x(0xc3),x(0x18),x(0x96),x(0x05),x(0x9a); \
+ .byte x(0x07),x(0x12),x(0x80),x(0xe2),x(0xeb),x(0x27),x(0xb2),x(0x75); \
+ .byte x(0x09),x(0x83),x(0x2c),x(0x1a),x(0x1b),x(0x6e),x(0x5a),x(0xa0); \
+ .byte x(0x52),x(0x3b),x(0xd6),x(0xb3),x(0x29),x(0xe3),x(0x2f),x(0x84); \
+ .byte x(0x53),x(0xd1),x(0x00),x(0xed),x(0x20),x(0xfc),x(0xb1),x(0x5b); \
+ .byte x(0x6a),x(0xcb),x(0xbe),x(0x39),x(0x4a),x(0x4c),x(0x58),x(0xcf); \
+ .byte x(0xd0),x(0xef),x(0xaa),x(0xfb),x(0x43),x(0x4d),x(0x33),x(0x85); \
+ .byte x(0x45),x(0xf9),x(0x02),x(0x7f),x(0x50),x(0x3c),x(0x9f),x(0xa8); \
+ .byte x(0x51),x(0xa3),x(0x40),x(0x8f),x(0x92),x(0x9d),x(0x38),x(0xf5); \
+ .byte x(0xbc),x(0xb6),x(0xda),x(0x21),x(0x10),x(0xff),x(0xf3),x(0xd2); \
+ .byte x(0xcd),x(0x0c),x(0x13),x(0xec),x(0x5f),x(0x97),x(0x44),x(0x17); \
+ .byte x(0xc4),x(0xa7),x(0x7e),x(0x3d),x(0x64),x(0x5d),x(0x19),x(0x73); \
+ .byte x(0x60),x(0x81),x(0x4f),x(0xdc),x(0x22),x(0x2a),x(0x90),x(0x88); \
+ .byte x(0x46),x(0xee),x(0xb8),x(0x14),x(0xde),x(0x5e),x(0x0b),x(0xdb); \
+ .byte x(0xe0),x(0x32),x(0x3a),x(0x0a),x(0x49),x(0x06),x(0x24),x(0x5c); \
+ .byte x(0xc2),x(0xd3),x(0xac),x(0x62),x(0x91),x(0x95),x(0xe4),x(0x79); \
+ .byte x(0xe7),x(0xc8),x(0x37),x(0x6d),x(0x8d),x(0xd5),x(0x4e),x(0xa9); \
+ .byte x(0x6c),x(0x56),x(0xf4),x(0xea),x(0x65),x(0x7a),x(0xae),x(0x08); \
+ .byte x(0xba),x(0x78),x(0x25),x(0x2e),x(0x1c),x(0xa6),x(0xb4),x(0xc6); \
+ .byte x(0xe8),x(0xdd),x(0x74),x(0x1f),x(0x4b),x(0xbd),x(0x8b),x(0x8a); \
+ .byte x(0x70),x(0x3e),x(0xb5),x(0x66),x(0x48),x(0x03),x(0xf6),x(0x0e); \
+ .byte x(0x61),x(0x35),x(0x57),x(0xb9),x(0x86),x(0xc1),x(0x1d),x(0x9e); \
+ .byte x(0xe1),x(0xf8),x(0x98),x(0x11),x(0x69),x(0xd9),x(0x8e),x(0x94); \
+ .byte x(0x9b),x(0x1e),x(0x87),x(0xe9),x(0xce),x(0x55),x(0x28),x(0xdf); \
+ .byte x(0x8c),x(0xa1),x(0x89),x(0x0d),x(0xbf),x(0xe6),x(0x42),x(0x68); \
+ .byte x(0x41),x(0x99),x(0x2d),x(0x0f),x(0xb0),x(0x54),x(0xbb),x(0x16)
+
+#define dec_vals(x) \
+ .byte x(0x52),x(0x09),x(0x6a),x(0xd5),x(0x30),x(0x36),x(0xa5),x(0x38); \
+ .byte x(0xbf),x(0x40),x(0xa3),x(0x9e),x(0x81),x(0xf3),x(0xd7),x(0xfb); \
+ .byte x(0x7c),x(0xe3),x(0x39),x(0x82),x(0x9b),x(0x2f),x(0xff),x(0x87); \
+ .byte x(0x34),x(0x8e),x(0x43),x(0x44),x(0xc4),x(0xde),x(0xe9),x(0xcb); \
+ .byte x(0x54),x(0x7b),x(0x94),x(0x32),x(0xa6),x(0xc2),x(0x23),x(0x3d); \
+ .byte x(0xee),x(0x4c),x(0x95),x(0x0b),x(0x42),x(0xfa),x(0xc3),x(0x4e); \
+ .byte x(0x08),x(0x2e),x(0xa1),x(0x66),x(0x28),x(0xd9),x(0x24),x(0xb2); \
+ .byte x(0x76),x(0x5b),x(0xa2),x(0x49),x(0x6d),x(0x8b),x(0xd1),x(0x25); \
+ .byte x(0x72),x(0xf8),x(0xf6),x(0x64),x(0x86),x(0x68),x(0x98),x(0x16); \
+ .byte x(0xd4),x(0xa4),x(0x5c),x(0xcc),x(0x5d),x(0x65),x(0xb6),x(0x92); \
+ .byte x(0x6c),x(0x70),x(0x48),x(0x50),x(0xfd),x(0xed),x(0xb9),x(0xda); \
+ .byte x(0x5e),x(0x15),x(0x46),x(0x57),x(0xa7),x(0x8d),x(0x9d),x(0x84); \
+ .byte x(0x90),x(0xd8),x(0xab),x(0x00),x(0x8c),x(0xbc),x(0xd3),x(0x0a); \
+ .byte x(0xf7),x(0xe4),x(0x58),x(0x05),x(0xb8),x(0xb3),x(0x45),x(0x06); \
+ .byte x(0xd0),x(0x2c),x(0x1e),x(0x8f),x(0xca),x(0x3f),x(0x0f),x(0x02); \
+ .byte x(0xc1),x(0xaf),x(0xbd),x(0x03),x(0x01),x(0x13),x(0x8a),x(0x6b); \
+ .byte x(0x3a),x(0x91),x(0x11),x(0x41),x(0x4f),x(0x67),x(0xdc),x(0xea); \
+ .byte x(0x97),x(0xf2),x(0xcf),x(0xce),x(0xf0),x(0xb4),x(0xe6),x(0x73); \
+ .byte x(0x96),x(0xac),x(0x74),x(0x22),x(0xe7),x(0xad),x(0x35),x(0x85); \
+ .byte x(0xe2),x(0xf9),x(0x37),x(0xe8),x(0x1c),x(0x75),x(0xdf),x(0x6e); \
+ .byte x(0x47),x(0xf1),x(0x1a),x(0x71),x(0x1d),x(0x29),x(0xc5),x(0x89); \
+ .byte x(0x6f),x(0xb7),x(0x62),x(0x0e),x(0xaa),x(0x18),x(0xbe),x(0x1b); \
+ .byte x(0xfc),x(0x56),x(0x3e),x(0x4b),x(0xc6),x(0xd2),x(0x79),x(0x20); \
+ .byte x(0x9a),x(0xdb),x(0xc0),x(0xfe),x(0x78),x(0xcd),x(0x5a),x(0xf4); \
+ .byte x(0x1f),x(0xdd),x(0xa8),x(0x33),x(0x88),x(0x07),x(0xc7),x(0x31); \
+ .byte x(0xb1),x(0x12),x(0x10),x(0x59),x(0x27),x(0x80),x(0xec),x(0x5f); \
+ .byte x(0x60),x(0x51),x(0x7f),x(0xa9),x(0x19),x(0xb5),x(0x4a),x(0x0d); \
+ .byte x(0x2d),x(0xe5),x(0x7a),x(0x9f),x(0x93),x(0xc9),x(0x9c),x(0xef); \
+ .byte x(0xa0),x(0xe0),x(0x3b),x(0x4d),x(0xae),x(0x2a),x(0xf5),x(0xb0); \
+ .byte x(0xc8),x(0xeb),x(0xbb),x(0x3c),x(0x83),x(0x53),x(0x99),x(0x61); \
+ .byte x(0x17),x(0x2b),x(0x04),x(0x7e),x(0xba),x(0x77),x(0xd6),x(0x26); \
+ .byte x(0xe1),x(0x69),x(0x14),x(0x63),x(0x55),x(0x21),x(0x0c),x(0x7d)
+
+#define tptr %rbp /* table pointer */
+#define kptr %r8 /* key schedule pointer */
+#define fofs 128 /* adjust offset in key schedule to keep |disp| < 128 */
+#define fk_ref(x, y) -16*x+fofs+4*y(kptr)
+
+#ifdef AES_REV_DKS
+#define rofs 128
+#define ik_ref(x, y) -16*x+rofs+4*y(kptr)
+
+#else
+#define rofs -128
+#define ik_ref(x, y) 16*x+rofs+4*y(kptr)
+#endif /* AES_REV_DKS */
+
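The fofs/rofs bias is easier to see with numbers. aes_encrypt_amd64() below does "sub $fofs, kptr" and then advances kptr by 16 * Nr, so fk_ref(round, col) resolves to byte offset 16 * (Nr - round) + 4 * col from the start of the schedule, and the descending round arguments walk forward through the round keys. A small self-check sketch (hypothetical helper, Nr = 10 case):

    #include <assert.h>

    /* Byte offset of fk_ref(round, col), relative to the key schedule base. */
    static long
    fk_byte_offset(int nr, int round, int col)
    {
            long kptr = 16L * nr - 128;     /* after "sub $fofs" and the lea */

            return (kptr + (-16L * round + 128 + 4 * col));
    }

    int
    main(void)
    {
            /* Nr = 10: rounds 9..1, then 0, address round keys 1..9, then 10. */
            assert(fk_byte_offset(10, 9, 0) == 16);
            assert(fk_byte_offset(10, 1, 3) == 156);
            assert(fk_byte_offset(10, 0, 0) == 160);
            return (0);
    }
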
+#define tab_0(x) (tptr,x,8)
+#define tab_1(x) 3(tptr,x,8)
+#define tab_2(x) 2(tptr,x,8)
+#define tab_3(x) 1(tptr,x,8)
+#define tab_f(x) 1(tptr,x,8)
+#define tab_i(x) 7(tptr,x,8)
+
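Each table entry is 8 bytes wide, so the scaled-index forms above pick different views of the same entry: tab_0..tab_3 are 32-bit loads at offsets 0, 3, 2 and 1 (byte-rotated copies of one packed column), while tab_f and tab_i are used with byte loads to fetch the plain S-box byte (offset 1 in u8) or inverse S-box byte (offset 7 in v8). A little-endian C sketch of the 32-bit case, for illustration only:

    #include <stdint.h>
    #include <string.h>

    /* Load the column for S-box index i, rotated the way tab_0..tab_3 read it. */
    static uint32_t
    tab_load(const uint8_t *tab, uint32_t i, int which)     /* which = 0..3 */
    {
            static const int off[4] = { 0, 3, 2, 1 };       /* tab_0..tab_3 */
            uint32_t w;

            memcpy(&w, tab + 8 * i + off[which], sizeof (w));
            return (w);
    }
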
+#define ff_rnd(p1, p2, p3, p4, round) /* normal forward round */ \
+ mov fk_ref(round,0), p1; \
+ mov fk_ref(round,1), p2; \
+ mov fk_ref(round,2), p3; \
+ mov fk_ref(round,3), p4; \
+ \
+ movzx %al, %esi; \
+ movzx %ah, %edi; \
+ shr $16, %eax; \
+ xor tab_0(%rsi), p1; \
+ xor tab_1(%rdi), p4; \
+ movzx %al, %esi; \
+ movzx %ah, %edi; \
+ xor tab_2(%rsi), p3; \
+ xor tab_3(%rdi), p2; \
+ \
+ movzx %bl, %esi; \
+ movzx %bh, %edi; \
+ shr $16, %ebx; \
+ xor tab_0(%rsi), p2; \
+ xor tab_1(%rdi), p1; \
+ movzx %bl, %esi; \
+ movzx %bh, %edi; \
+ xor tab_2(%rsi), p4; \
+ xor tab_3(%rdi), p3; \
+ \
+ movzx %cl, %esi; \
+ movzx %ch, %edi; \
+ shr $16, %ecx; \
+ xor tab_0(%rsi), p3; \
+ xor tab_1(%rdi), p2; \
+ movzx %cl, %esi; \
+ movzx %ch, %edi; \
+ xor tab_2(%rsi), p1; \
+ xor tab_3(%rdi), p4; \
+ \
+ movzx %dl, %esi; \
+ movzx %dh, %edi; \
+ shr $16, %edx; \
+ xor tab_0(%rsi), p4; \
+ xor tab_1(%rdi), p3; \
+ movzx %dl, %esi; \
+ movzx %dh, %edi; \
+ xor tab_2(%rsi), p2; \
+ xor tab_3(%rdi), p1; \
+ \
+ mov p1, %eax; \
+ mov p2, %ebx; \
+ mov p3, %ecx; \
+ mov p4, %edx
+
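A rough C rendering of one ff_rnd() expansion may help when following the register shuffle below (sketch only: s[] is the state in eax/ebx/ecx/edx order, rk[] the four round key words, and T0..T3 stand for the byte-rotated table views that tab_0..tab_3 read):

    #include <stdint.h>

    #define BYTE(w, n)      (((w) >> (8 * (n))) & 0xff)

    static void
    forward_round(uint32_t s[4], const uint32_t rk[4], const uint32_t T0[256],
        const uint32_t T1[256], const uint32_t T2[256], const uint32_t T3[256])
    {
            uint32_t t[4];
            int i;

            for (i = 0; i < 4; i++) {
                    /* Each output column takes one byte from every state word. */
                    t[i] = rk[i] ^
                        T0[BYTE(s[i], 0)] ^
                        T1[BYTE(s[(i + 1) % 4], 1)] ^
                        T2[BYTE(s[(i + 2) % 4], 2)] ^
                        T3[BYTE(s[(i + 3) % 4], 3)];
            }
            for (i = 0; i < 4; i++)
                    s[i] = t[i];
    }
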
+#ifdef LAST_ROUND_TABLES
+
+#define fl_rnd(p1, p2, p3, p4, round) /* last forward round */ \
+ add $2048, tptr; \
+ mov fk_ref(round,0), p1; \
+ mov fk_ref(round,1), p2; \
+ mov fk_ref(round,2), p3; \
+ mov fk_ref(round,3), p4; \
+ \
+ movzx %al, %esi; \
+ movzx %ah, %edi; \
+ shr $16, %eax; \
+ xor tab_0(%rsi), p1; \
+ xor tab_1(%rdi), p4; \
+ movzx %al, %esi; \
+ movzx %ah, %edi; \
+ xor tab_2(%rsi), p3; \
+ xor tab_3(%rdi), p2; \
+ \
+ movzx %bl, %esi; \
+ movzx %bh, %edi; \
+ shr $16, %ebx; \
+ xor tab_0(%rsi), p2; \
+ xor tab_1(%rdi), p1; \
+ movzx %bl, %esi; \
+ movzx %bh, %edi; \
+ xor tab_2(%rsi), p4; \
+ xor tab_3(%rdi), p3; \
+ \
+ movzx %cl, %esi; \
+ movzx %ch, %edi; \
+ shr $16, %ecx; \
+ xor tab_0(%rsi), p3; \
+ xor tab_1(%rdi), p2; \
+ movzx %cl, %esi; \
+ movzx %ch, %edi; \
+ xor tab_2(%rsi), p1; \
+ xor tab_3(%rdi), p4; \
+ \
+ movzx %dl, %esi; \
+ movzx %dh, %edi; \
+ shr $16, %edx; \
+ xor tab_0(%rsi), p4; \
+ xor tab_1(%rdi), p3; \
+ movzx %dl, %esi; \
+ movzx %dh, %edi; \
+ xor tab_2(%rsi), p2; \
+ xor tab_3(%rdi), p1
+
+#else
+
+#define fl_rnd(p1, p2, p3, p4, round) /* last forward round */ \
+ mov fk_ref(round,0), p1; \
+ mov fk_ref(round,1), p2; \
+ mov fk_ref(round,2), p3; \
+ mov fk_ref(round,3), p4; \
+ \
+ movzx %al, %esi; \
+ movzx %ah, %edi; \
+ shr $16, %eax; \
+ movzx tab_f(%rsi), %esi; \
+ movzx tab_f(%rdi), %edi; \
+ xor %esi, p1; \
+ rol $8, %edi; \
+ xor %edi, p4; \
+ movzx %al, %esi; \
+ movzx %ah, %edi; \
+ movzx tab_f(%rsi), %esi; \
+ movzx tab_f(%rdi), %edi; \
+ rol $16, %esi; \
+ rol $24, %edi; \
+ xor %esi, p3; \
+ xor %edi, p2; \
+ \
+ movzx %bl, %esi; \
+ movzx %bh, %edi; \
+ shr $16, %ebx; \
+ movzx tab_f(%rsi), %esi; \
+ movzx tab_f(%rdi), %edi; \
+ xor %esi, p2; \
+ rol $8, %edi; \
+ xor %edi, p1; \
+ movzx %bl, %esi; \
+ movzx %bh, %edi; \
+ movzx tab_f(%rsi), %esi; \
+ movzx tab_f(%rdi), %edi; \
+ rol $16, %esi; \
+ rol $24, %edi; \
+ xor %esi, p4; \
+ xor %edi, p3; \
+ \
+ movzx %cl, %esi; \
+ movzx %ch, %edi; \
+ movzx tab_f(%rsi), %esi; \
+ movzx tab_f(%rdi), %edi; \
+ shr $16, %ecx; \
+ xor %esi, p3; \
+ rol $8, %edi; \
+ xor %edi, p2; \
+ movzx %cl, %esi; \
+ movzx %ch, %edi; \
+ movzx tab_f(%rsi), %esi; \
+ movzx tab_f(%rdi), %edi; \
+ rol $16, %esi; \
+ rol $24, %edi; \
+ xor %esi, p1; \
+ xor %edi, p4; \
+ \
+ movzx %dl, %esi; \
+ movzx %dh, %edi; \
+ movzx tab_f(%rsi), %esi; \
+ movzx tab_f(%rdi), %edi; \
+ shr $16, %edx; \
+ xor %esi, p4; \
+ rol $8, %edi; \
+ xor %edi, p3; \
+ movzx %dl, %esi; \
+ movzx %dh, %edi; \
+ movzx tab_f(%rsi), %esi; \
+ movzx tab_f(%rdi), %edi; \
+ rol $16, %esi; \
+ rol $24, %edi; \
+ xor %esi, p2; \
+ xor %edi, p1
+
+#endif /* LAST_ROUND_TABLES */
+
+#define ii_rnd(p1, p2, p3, p4, round) /* normal inverse round */ \
+ mov ik_ref(round,0), p1; \
+ mov ik_ref(round,1), p2; \
+ mov ik_ref(round,2), p3; \
+ mov ik_ref(round,3), p4; \
+ \
+ movzx %al, %esi; \
+ movzx %ah, %edi; \
+ shr $16, %eax; \
+ xor tab_0(%rsi), p1; \
+ xor tab_1(%rdi), p2; \
+ movzx %al, %esi; \
+ movzx %ah, %edi; \
+ xor tab_2(%rsi), p3; \
+ xor tab_3(%rdi), p4; \
+ \
+ movzx %bl, %esi; \
+ movzx %bh, %edi; \
+ shr $16, %ebx; \
+ xor tab_0(%rsi), p2; \
+ xor tab_1(%rdi), p3; \
+ movzx %bl, %esi; \
+ movzx %bh, %edi; \
+ xor tab_2(%rsi), p4; \
+ xor tab_3(%rdi), p1; \
+ \
+ movzx %cl, %esi; \
+ movzx %ch, %edi; \
+ shr $16, %ecx; \
+ xor tab_0(%rsi), p3; \
+ xor tab_1(%rdi), p4; \
+ movzx %cl, %esi; \
+ movzx %ch, %edi; \
+ xor tab_2(%rsi), p1; \
+ xor tab_3(%rdi), p2; \
+ \
+ movzx %dl, %esi; \
+ movzx %dh, %edi; \
+ shr $16, %edx; \
+ xor tab_0(%rsi), p4; \
+ xor tab_1(%rdi), p1; \
+ movzx %dl, %esi; \
+ movzx %dh, %edi; \
+ xor tab_2(%rsi), p2; \
+ xor tab_3(%rdi), p3; \
+ \
+ mov p1, %eax; \
+ mov p2, %ebx; \
+ mov p3, %ecx; \
+ mov p4, %edx
+
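ii_rnd() below is the mirror image of the forward round sketched earlier; the construction is the same, but the byte diagonal runs in the opposite direction because InvShiftRows rotates the other way. In the same C terms (sketch only, with T0i..T3i the inverse-table views):

    #include <stdint.h>

    #define BYTE(w, n)      (((w) >> (8 * (n))) & 0xff)

    static void
    inverse_round(uint32_t s[4], const uint32_t rk[4], const uint32_t T0i[256],
        const uint32_t T1i[256], const uint32_t T2i[256], const uint32_t T3i[256])
    {
            uint32_t t[4];
            int i;

            for (i = 0; i < 4; i++) {
                    /* Same T-table idea; the source-word indices are reversed. */
                    t[i] = rk[i] ^
                        T0i[BYTE(s[i], 0)] ^
                        T1i[BYTE(s[(i + 3) % 4], 1)] ^
                        T2i[BYTE(s[(i + 2) % 4], 2)] ^
                        T3i[BYTE(s[(i + 1) % 4], 3)];
            }
            for (i = 0; i < 4; i++)
                    s[i] = t[i];
    }
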
+#ifdef LAST_ROUND_TABLES
+
+#define il_rnd(p1, p2, p3, p4, round) /* last inverse round */ \
+ add $2048, tptr; \
+ mov ik_ref(round,0), p1; \
+ mov ik_ref(round,1), p2; \
+ mov ik_ref(round,2), p3; \
+ mov ik_ref(round,3), p4; \
+ \
+ movzx %al, %esi; \
+ movzx %ah, %edi; \
+ shr $16, %eax; \
+ xor tab_0(%rsi), p1; \
+ xor tab_1(%rdi), p2; \
+ movzx %al, %esi; \
+ movzx %ah, %edi; \
+ xor tab_2(%rsi), p3; \
+ xor tab_3(%rdi), p4; \
+ \
+ movzx %bl, %esi; \
+ movzx %bh, %edi; \
+ shr $16, %ebx; \
+ xor tab_0(%rsi), p2; \
+ xor tab_1(%rdi), p3; \
+ movzx %bl, %esi; \
+ movzx %bh, %edi; \
+ xor tab_2(%rsi), p4; \
+ xor tab_3(%rdi), p1; \
+ \
+ movzx %cl, %esi; \
+ movzx %ch, %edi; \
+ shr $16, %ecx; \
+ xor tab_0(%rsi), p3; \
+ xor tab_1(%rdi), p4; \
+ movzx %cl, %esi; \
+ movzx %ch, %edi; \
+ xor tab_2(%rsi), p1; \
+ xor tab_3(%rdi), p2; \
+ \
+ movzx %dl, %esi; \
+ movzx %dh, %edi; \
+ shr $16, %edx; \
+ xor tab_0(%rsi), p4; \
+ xor tab_1(%rdi), p1; \
+ movzx %dl, %esi; \
+ movzx %dh, %edi; \
+ xor tab_2(%rsi), p2; \
+ xor tab_3(%rdi), p3
+
+#else
+
+#define il_rnd(p1, p2, p3, p4, round) /* last inverse round */ \
+ mov ik_ref(round,0), p1; \
+ mov ik_ref(round,1), p2; \
+ mov ik_ref(round,2), p3; \
+ mov ik_ref(round,3), p4; \
+ \
+ movzx %al, %esi; \
+ movzx %ah, %edi; \
+ movzx tab_i(%rsi), %esi; \
+ movzx tab_i(%rdi), %edi; \
+ shr $16, %eax; \
+ xor %esi, p1; \
+ rol $8, %edi; \
+ xor %edi, p2; \
+ movzx %al, %esi; \
+ movzx %ah, %edi; \
+ movzx tab_i(%rsi), %esi; \
+ movzx tab_i(%rdi), %edi; \
+ rol $16, %esi; \
+ rol $24, %edi; \
+ xor %esi, p3; \
+ xor %edi, p4; \
+ \
+ movzx %bl, %esi; \
+ movzx %bh, %edi; \
+ movzx tab_i(%rsi), %esi; \
+ movzx tab_i(%rdi), %edi; \
+ shr $16, %ebx; \
+ xor %esi, p2; \
+ rol $8, %edi; \
+ xor %edi, p3; \
+ movzx %bl, %esi; \
+ movzx %bh, %edi; \
+ movzx tab_i(%rsi), %esi; \
+ movzx tab_i(%rdi), %edi; \
+ rol $16, %esi; \
+ rol $24, %edi; \
+ xor %esi, p4; \
+ xor %edi, p1; \
+ \
+ movzx %cl, %esi; \
+ movzx %ch, %edi; \
+ movzx tab_i(%rsi), %esi; \
+ movzx tab_i(%rdi), %edi; \
+ shr $16, %ecx; \
+ xor %esi, p3; \
+ rol $8, %edi; \
+ xor %edi, p4; \
+ movzx %cl, %esi; \
+ movzx %ch, %edi; \
+ movzx tab_i(%rsi), %esi; \
+ movzx tab_i(%rdi), %edi; \
+ rol $16, %esi; \
+ rol $24, %edi; \
+ xor %esi, p1; \
+ xor %edi, p2; \
+ \
+ movzx %dl, %esi; \
+ movzx %dh, %edi; \
+ movzx tab_i(%rsi), %esi; \
+ movzx tab_i(%rdi), %edi; \
+ shr $16, %edx; \
+ xor %esi, p4; \
+ rol $8, %edi; \
+ xor %edi, p1; \
+ movzx %dl, %esi; \
+ movzx %dh, %edi; \
+ movzx tab_i(%rsi), %esi; \
+ movzx tab_i(%rdi), %edi; \
+ rol $16, %esi; \
+ rol $24, %edi; \
+ xor %esi, p2; \
+ xor %edi, p3
+
+#endif /* LAST_ROUND_TABLES */
+
+/*
+ * OpenSolaris OS:
+ * void aes_encrypt_amd64(const aes_ks_t *ks, int Nr,
+ * const uint32_t pt[4], uint32_t ct[4])/
+ *
+ * Original interface:
+ * int aes_encrypt(const unsigned char *in,
+ * unsigned char *out, const aes_encrypt_ctx cx[1])/
+ */
+.data
+.align 64
+enc_tab:
+ enc_vals(u8)
+#ifdef LAST_ROUND_TABLES
+ // Last Round Tables:
+ enc_vals(w8)
+#endif
+
+
+ENTRY_NP(aes_encrypt_amd64)
+#ifdef GLADMAN_INTERFACE
+ // Original interface
+ sub $[4*8], %rsp // gnu/linux/opensolaris binary interface
+ mov %rsi, (%rsp) // output pointer (P2)
+ mov %rdx, %r8 // context (P3)
+
+ mov %rbx, 1*8(%rsp) // P1: input pointer in rdi
+ mov %rbp, 2*8(%rsp) // P2: output pointer in (rsp)
+ mov %r12, 3*8(%rsp) // P3: context in r8
+ movzx 4*KS_LENGTH(kptr), %esi // Get byte key length * 16
+
+#else
+ // OpenSolaris OS interface
+ sub $[4*8], %rsp // Make room on stack to save registers
+ mov %rcx, (%rsp) // Save output pointer (P4) on stack
+ mov %rdi, %r8 // context (P1)
+ mov %rdx, %rdi // P3: save input pointer
+ shl $4, %esi // P2: esi byte key length * 16
+
+ mov %rbx, 1*8(%rsp) // Save registers
+ mov %rbp, 2*8(%rsp)
+ mov %r12, 3*8(%rsp)
+ // P1: context in r8
+ // P2: byte key length * 16 in esi
+ // P3: input pointer in rdi
+ // P4: output pointer in (rsp)
+#endif /* GLADMAN_INTERFACE */
+
+ lea enc_tab(%rip), tptr
+ sub $fofs, kptr
+
+ // Load input block into registers
+ mov (%rdi), %eax
+ mov 1*4(%rdi), %ebx
+ mov 2*4(%rdi), %ecx
+ mov 3*4(%rdi), %edx
+
+ xor fofs(kptr), %eax
+ xor fofs+4(kptr), %ebx
+ xor fofs+8(kptr), %ecx
+ xor fofs+12(kptr), %edx
+
+ lea (kptr,%rsi), kptr
+ // Jump based on byte key length * 16:
+ cmp $[10*16], %esi
+ je 3f
+ cmp $[12*16], %esi
+ je 2f
+ cmp $[14*16], %esi
+ je 1f
+ mov $-1, %rax // error
+ jmp 4f
+
+ // Perform normal forward rounds
+1: ff_rnd(%r9d, %r10d, %r11d, %r12d, 13)
+ ff_rnd(%r9d, %r10d, %r11d, %r12d, 12)
+2: ff_rnd(%r9d, %r10d, %r11d, %r12d, 11)
+ ff_rnd(%r9d, %r10d, %r11d, %r12d, 10)
+3: ff_rnd(%r9d, %r10d, %r11d, %r12d, 9)
+ ff_rnd(%r9d, %r10d, %r11d, %r12d, 8)
+ ff_rnd(%r9d, %r10d, %r11d, %r12d, 7)
+ ff_rnd(%r9d, %r10d, %r11d, %r12d, 6)
+ ff_rnd(%r9d, %r10d, %r11d, %r12d, 5)
+ ff_rnd(%r9d, %r10d, %r11d, %r12d, 4)
+ ff_rnd(%r9d, %r10d, %r11d, %r12d, 3)
+ ff_rnd(%r9d, %r10d, %r11d, %r12d, 2)
+ ff_rnd(%r9d, %r10d, %r11d, %r12d, 1)
+ fl_rnd(%r9d, %r10d, %r11d, %r12d, 0)
+
+ // Copy results
+ mov (%rsp), %rbx
+ mov %r9d, (%rbx)
+ mov %r10d, 4(%rbx)
+ mov %r11d, 8(%rbx)
+ mov %r12d, 12(%rbx)
+ xor %rax, %rax
+4: // Restore registers
+ mov 1*8(%rsp), %rbx
+ mov 2*8(%rsp), %rbp
+ mov 3*8(%rsp), %r12
+ add $[4*8], %rsp
+ ret
+
+ SET_SIZE(aes_encrypt_amd64)
+
+/*
+ * OpenSolaris OS:
+ * void aes_decrypt_amd64(const aes_ks_t *ks, int Nr,
+ * const uint32_t pt[4], uint32_t ct[4])/
+ *
+ * Original interface:
+ * int aes_decrypt(const unsigned char *in,
+ *	unsigned char *out, const aes_decrypt_ctx cx[1])/
+ */
+.data
+.align 64
+dec_tab:
+ dec_vals(v8)
+#ifdef LAST_ROUND_TABLES
+ // Last Round Tables:
+ dec_vals(w8)
+#endif
+
+
+ENTRY_NP(aes_decrypt_amd64)
+#ifdef GLADMAN_INTERFACE
+ // Original interface
+ sub $[4*8], %rsp // gnu/linux/opensolaris binary interface
+ mov %rsi, (%rsp) // output pointer (P2)
+ mov %rdx, %r8 // context (P3)
+
+ mov %rbx, 1*8(%rsp) // P1: input pointer in rdi
+ mov %rbp, 2*8(%rsp) // P2: output pointer in (rsp)
+ mov %r12, 3*8(%rsp) // P3: context in r8
+ movzx 4*KS_LENGTH(kptr), %esi // Get byte key length * 16
+
+#else
+ // OpenSolaris OS interface
+ sub $[4*8], %rsp // Make room on stack to save registers
+ mov %rcx, (%rsp) // Save output pointer (P4) on stack
+ mov %rdi, %r8 // context (P1)
+ mov %rdx, %rdi // P3: save input pointer
+ shl $4, %esi // P2: esi byte key length * 16
+
+ mov %rbx, 1*8(%rsp) // Save registers
+ mov %rbp, 2*8(%rsp)
+ mov %r12, 3*8(%rsp)
+ // P1: context in r8
+ // P2: byte key length * 16 in esi
+ // P3: input pointer in rdi
+ // P4: output pointer in (rsp)
+#endif /* GLADMAN_INTERFACE */
+
+ lea dec_tab(%rip), tptr
+ sub $rofs, kptr
+
+ // Load input block into registers
+ mov (%rdi), %eax
+ mov 1*4(%rdi), %ebx
+ mov 2*4(%rdi), %ecx
+ mov 3*4(%rdi), %edx
+
+#ifdef AES_REV_DKS
+ mov kptr, %rdi
+ lea (kptr,%rsi), kptr
+#else
+ lea (kptr,%rsi), %rdi
+#endif
+
+ xor rofs(%rdi), %eax
+ xor rofs+4(%rdi), %ebx
+ xor rofs+8(%rdi), %ecx
+ xor rofs+12(%rdi), %edx
+
+ // Jump based on byte key length * 16:
+ cmp $[10*16], %esi
+ je 3f
+ cmp $[12*16], %esi
+ je 2f
+ cmp $[14*16], %esi
+ je 1f
+ mov $-1, %rax // error
+ jmp 4f
+
+ // Perform normal inverse rounds
+1: ii_rnd(%r9d, %r10d, %r11d, %r12d, 13)
+ ii_rnd(%r9d, %r10d, %r11d, %r12d, 12)
+2: ii_rnd(%r9d, %r10d, %r11d, %r12d, 11)
+ ii_rnd(%r9d, %r10d, %r11d, %r12d, 10)
+3: ii_rnd(%r9d, %r10d, %r11d, %r12d, 9)
+ ii_rnd(%r9d, %r10d, %r11d, %r12d, 8)
+ ii_rnd(%r9d, %r10d, %r11d, %r12d, 7)
+ ii_rnd(%r9d, %r10d, %r11d, %r12d, 6)
+ ii_rnd(%r9d, %r10d, %r11d, %r12d, 5)
+ ii_rnd(%r9d, %r10d, %r11d, %r12d, 4)
+ ii_rnd(%r9d, %r10d, %r11d, %r12d, 3)
+ ii_rnd(%r9d, %r10d, %r11d, %r12d, 2)
+ ii_rnd(%r9d, %r10d, %r11d, %r12d, 1)
+ il_rnd(%r9d, %r10d, %r11d, %r12d, 0)
+
+ // Copy results
+ mov (%rsp), %rbx
+ mov %r9d, (%rbx)
+ mov %r10d, 4(%rbx)
+ mov %r11d, 8(%rbx)
+ mov %r12d, 12(%rbx)
+ xor %rax, %rax
+4: // Restore registers
+ mov 1*8(%rsp), %rbx
+ mov 2*8(%rsp), %rbp
+ mov 3*8(%rsp), %r12
+ add $[4*8], %rsp
+ ret
+
+ SET_SIZE(aes_decrypt_amd64)
+#endif /* lint || __lint */
+
+#ifdef __ELF__
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/sys/contrib/openzfs/module/icp/asm-x86_64/aes/aeskey.c b/sys/contrib/openzfs/module/icp/asm-x86_64/aes/aeskey.c
new file mode 100644
index 000000000000..c3d1f2990874
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/asm-x86_64/aes/aeskey.c
@@ -0,0 +1,580 @@
+/*
+ * ---------------------------------------------------------------------------
+ * Copyright (c) 1998-2007, Brian Gladman, Worcester, UK. All rights reserved.
+ *
+ * LICENSE TERMS
+ *
+ * The free distribution and use of this software is allowed (with or without
+ * changes) provided that:
+ *
+ * 1. source code distributions include the above copyright notice, this
+ * list of conditions and the following disclaimer;
+ *
+ * 2. binary distributions include the above copyright notice, this list
+ * of conditions and the following disclaimer in their documentation;
+ *
+ * 3. the name of the copyright holder is not used to endorse products
+ * built using this software without specific written permission.
+ *
+ * DISCLAIMER
+ *
+ * This software is provided 'as is' with no explicit or implied warranties
+ * in respect of its properties, including, but not limited to, correctness
+ * and/or fitness for purpose.
+ * ---------------------------------------------------------------------------
+ * Issue Date: 20/12/2007
+ */
+
+#include <aes/aes_impl.h>
+#include "aesopt.h"
+#include "aestab.h"
+#include "aestab2.h"
+
+/*
+ * Initialise the key schedule from the user supplied key. The key
+ * length can be specified in bytes, with legal values of 16, 24
+ * and 32, or in bits, with legal values of 128, 192 and 256. These
+ * values correspond with Nk values of 4, 6 and 8 respectively.
+ *
+ * The following macros implement a single cycle in the key
+ * schedule generation process. The number of cycles needed
+ * for each cx->n_col and nk value is:
+ *
+ * nk = 4 5 6 7 8
+ * ------------------------------
+ * cx->n_col = 4 10 9 8 7 7
+ * cx->n_col = 5 14 11 10 9 9
+ * cx->n_col = 6 19 15 12 11 11
+ * cx->n_col = 7 21 19 16 13 14
+ * cx->n_col = 8 29 23 19 17 14
+ */
+
+/*
+ * OpenSolaris changes
+ * 1. Added header files aes_impl.h and aestab2.h
+ * 2. Changed uint_8t and uint_32t to uint8_t and uint32_t
+ * 3. Remove code under ifdef USE_VIA_ACE_IF_PRESENT (always undefined)
+ * 4. Removed always-defined ifdefs FUNCS_IN_C, ENC_KEYING_IN_C,
+ * AES_128, AES_192, AES_256, AES_VAR defines
+ * 5. Changed aes_encrypt_key* aes_decrypt_key* functions to "static void"
+ * 6. Changed N_COLS to MAX_AES_NB
+ * 7. Replaced functions aes_encrypt_key and aes_decrypt_key with
+ * OpenSolaris-compatible functions rijndael_key_setup_enc_amd64 and
+ * rijndael_key_setup_dec_amd64
+ * 8. cstyled code and removed lint warnings
+ */
+
+#if defined(REDUCE_CODE_SIZE)
+#define ls_box ls_sub
+ uint32_t ls_sub(const uint32_t t, const uint32_t n);
+#define inv_mcol im_sub
+ uint32_t im_sub(const uint32_t x);
+#ifdef ENC_KS_UNROLL
+#undef ENC_KS_UNROLL
+#endif
+#ifdef DEC_KS_UNROLL
+#undef DEC_KS_UNROLL
+#endif
+#endif /* REDUCE_CODE_SIZE */
+
+
+#define ke4(k, i) \
+{ k[4 * (i) + 4] = ss[0] ^= ls_box(ss[3], 3) ^ t_use(r, c)[i]; \
+ k[4 * (i) + 5] = ss[1] ^= ss[0]; \
+ k[4 * (i) + 6] = ss[2] ^= ss[1]; \
+ k[4 * (i) + 7] = ss[3] ^= ss[2]; \
+}
+
+static void
+aes_encrypt_key128(const unsigned char *key, uint32_t rk[])
+{
+ uint32_t ss[4];
+
+ rk[0] = ss[0] = word_in(key, 0);
+ rk[1] = ss[1] = word_in(key, 1);
+ rk[2] = ss[2] = word_in(key, 2);
+ rk[3] = ss[3] = word_in(key, 3);
+
+#ifdef ENC_KS_UNROLL
+ ke4(rk, 0); ke4(rk, 1);
+ ke4(rk, 2); ke4(rk, 3);
+ ke4(rk, 4); ke4(rk, 5);
+ ke4(rk, 6); ke4(rk, 7);
+ ke4(rk, 8);
+#else
+ {
+ uint32_t i;
+ for (i = 0; i < 9; ++i)
+ ke4(rk, i);
+ }
+#endif /* ENC_KS_UNROLL */
+ ke4(rk, 9);
+}
+
+
+#define kef6(k, i) \
+{ k[6 * (i) + 6] = ss[0] ^= ls_box(ss[5], 3) ^ t_use(r, c)[i]; \
+ k[6 * (i) + 7] = ss[1] ^= ss[0]; \
+ k[6 * (i) + 8] = ss[2] ^= ss[1]; \
+ k[6 * (i) + 9] = ss[3] ^= ss[2]; \
+}
+
+#define ke6(k, i) \
+{ kef6(k, i); \
+ k[6 * (i) + 10] = ss[4] ^= ss[3]; \
+ k[6 * (i) + 11] = ss[5] ^= ss[4]; \
+}
+
+static void
+aes_encrypt_key192(const unsigned char *key, uint32_t rk[])
+{
+ uint32_t ss[6];
+
+ rk[0] = ss[0] = word_in(key, 0);
+ rk[1] = ss[1] = word_in(key, 1);
+ rk[2] = ss[2] = word_in(key, 2);
+ rk[3] = ss[3] = word_in(key, 3);
+ rk[4] = ss[4] = word_in(key, 4);
+ rk[5] = ss[5] = word_in(key, 5);
+
+#ifdef ENC_KS_UNROLL
+ ke6(rk, 0); ke6(rk, 1);
+ ke6(rk, 2); ke6(rk, 3);
+ ke6(rk, 4); ke6(rk, 5);
+ ke6(rk, 6);
+#else
+ {
+ uint32_t i;
+ for (i = 0; i < 7; ++i)
+ ke6(rk, i);
+ }
+#endif /* ENC_KS_UNROLL */
+ kef6(rk, 7);
+}
+
+
+
+#define kef8(k, i) \
+{ k[8 * (i) + 8] = ss[0] ^= ls_box(ss[7], 3) ^ t_use(r, c)[i]; \
+ k[8 * (i) + 9] = ss[1] ^= ss[0]; \
+ k[8 * (i) + 10] = ss[2] ^= ss[1]; \
+ k[8 * (i) + 11] = ss[3] ^= ss[2]; \
+}
+
+#define ke8(k, i) \
+{ kef8(k, i); \
+ k[8 * (i) + 12] = ss[4] ^= ls_box(ss[3], 0); \
+ k[8 * (i) + 13] = ss[5] ^= ss[4]; \
+ k[8 * (i) + 14] = ss[6] ^= ss[5]; \
+ k[8 * (i) + 15] = ss[7] ^= ss[6]; \
+}
+
+static void
+aes_encrypt_key256(const unsigned char *key, uint32_t rk[])
+{
+ uint32_t ss[8];
+
+ rk[0] = ss[0] = word_in(key, 0);
+ rk[1] = ss[1] = word_in(key, 1);
+ rk[2] = ss[2] = word_in(key, 2);
+ rk[3] = ss[3] = word_in(key, 3);
+ rk[4] = ss[4] = word_in(key, 4);
+ rk[5] = ss[5] = word_in(key, 5);
+ rk[6] = ss[6] = word_in(key, 6);
+ rk[7] = ss[7] = word_in(key, 7);
+
+#ifdef ENC_KS_UNROLL
+ ke8(rk, 0); ke8(rk, 1);
+ ke8(rk, 2); ke8(rk, 3);
+ ke8(rk, 4); ke8(rk, 5);
+#else
+ {
+ uint32_t i;
+ for (i = 0; i < 6; ++i)
+ ke8(rk, i);
+ }
+#endif /* ENC_KS_UNROLL */
+ kef8(rk, 6);
+}
+
+
+/*
+ * Expand the cipher key into the encryption key schedule.
+ *
+ * Return the number of rounds for the given cipher key size.
+ * The size of the key schedule depends on the number of rounds
+ * (which can be computed from the size of the key), i.e. 4 * (Nr + 1).
+ *
+ * Parameters:
+ * rk AES key schedule 32-bit array to be initialized
+ * cipherKey User key
+ * keyBits AES key size (128, 192, or 256 bits)
+ */
+int
+rijndael_key_setup_enc_amd64(uint32_t rk[], const uint32_t cipherKey[],
+ int keyBits)
+{
+ switch (keyBits) {
+ case 128:
+ aes_encrypt_key128((unsigned char *)&cipherKey[0], rk);
+ return (10);
+ case 192:
+ aes_encrypt_key192((unsigned char *)&cipherKey[0], rk);
+ return (12);
+ case 256:
+ aes_encrypt_key256((unsigned char *)&cipherKey[0], rk);
+ return (14);
+ default: /* should never get here */
+ break;
+ }
+
+ return (0);
+}
+
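A minimal usage sketch for the routine above (assumptions: rk[] is sized for the worst case of 4 * (14 + 1) = 60 words, matching KS_LENGTH in aes_amd64.S, and the key bytes are passed through in memory order, which is what word_in() consumes on this little-endian build):

    #include <stdint.h>
    #include <string.h>

    int rijndael_key_setup_enc_amd64(uint32_t rk[], const uint32_t cipherKey[],
        int keyBits);

    static int
    expand_enc_key(uint32_t rk[60], const uint8_t *key, int keyBits)
    {
            uint32_t kw[8];         /* up to 8 words for a 256-bit key */

            memcpy(kw, key, keyBits / 8);
            return (rijndael_key_setup_enc_amd64(rk, kw, keyBits)); /* Nr, or 0 */
    }
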
+
+/* this is used to store the decryption round keys */
+/* in forward or reverse order */
+
+#ifdef AES_REV_DKS
+#define v(n, i) ((n) - (i) + 2 * ((i) & 3))
+#else
+#define v(n, i) (i)
+#endif
+
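With AES_REV_DKS defined, v(n, i) mirrors whole 4-word round keys while preserving the word order inside each one. A tiny self-check using a local copy of the macro (illustration only):

    #include <assert.h>

    #define v_rev(n, i)     ((n) - (i) + 2 * ((i) & 3))

    int
    main(void)
    {
            /* 128-bit key, n = 40: round 0 words are stored last ... */
            assert(v_rev(40, 0) == 40 && v_rev(40, 3) == 43);
            /* ... and the final round's words are stored first. */
            assert(v_rev(40, 40) == 0 && v_rev(40, 43) == 3);
            return (0);
    }
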
+#if DEC_ROUND == NO_TABLES
+#define ff(x) (x)
+#else
+#define ff(x) inv_mcol(x)
+#if defined(dec_imvars)
+#define d_vars dec_imvars
+#endif
+#endif /* DEC_ROUND == NO_TABLES */
+
+
+#define k4e(k, i) \
+{ k[v(40, (4 * (i)) + 4)] = ss[0] ^= ls_box(ss[3], 3) ^ t_use(r, c)[i]; \
+ k[v(40, (4 * (i)) + 5)] = ss[1] ^= ss[0]; \
+ k[v(40, (4 * (i)) + 6)] = ss[2] ^= ss[1]; \
+ k[v(40, (4 * (i)) + 7)] = ss[3] ^= ss[2]; \
+}
+
+#if 1
+
+#define kdf4(k, i) \
+{ ss[0] = ss[0] ^ ss[2] ^ ss[1] ^ ss[3]; \
+ ss[1] = ss[1] ^ ss[3]; \
+ ss[2] = ss[2] ^ ss[3]; \
+ ss[4] = ls_box(ss[(i + 3) % 4], 3) ^ t_use(r, c)[i]; \
+ ss[i % 4] ^= ss[4]; \
+ ss[4] ^= k[v(40, (4 * (i)))]; k[v(40, (4 * (i)) + 4)] = ff(ss[4]); \
+ ss[4] ^= k[v(40, (4 * (i)) + 1)]; k[v(40, (4 * (i)) + 5)] = ff(ss[4]); \
+ ss[4] ^= k[v(40, (4 * (i)) + 2)]; k[v(40, (4 * (i)) + 6)] = ff(ss[4]); \
+ ss[4] ^= k[v(40, (4 * (i)) + 3)]; k[v(40, (4 * (i)) + 7)] = ff(ss[4]); \
+}
+
+#define kd4(k, i) \
+{ ss[4] = ls_box(ss[(i + 3) % 4], 3) ^ t_use(r, c)[i]; \
+ ss[i % 4] ^= ss[4]; ss[4] = ff(ss[4]); \
+ k[v(40, (4 * (i)) + 4)] = ss[4] ^= k[v(40, (4 * (i)))]; \
+ k[v(40, (4 * (i)) + 5)] = ss[4] ^= k[v(40, (4 * (i)) + 1)]; \
+ k[v(40, (4 * (i)) + 6)] = ss[4] ^= k[v(40, (4 * (i)) + 2)]; \
+ k[v(40, (4 * (i)) + 7)] = ss[4] ^= k[v(40, (4 * (i)) + 3)]; \
+}
+
+#define kdl4(k, i) \
+{ ss[4] = ls_box(ss[(i + 3) % 4], 3) ^ t_use(r, c)[i]; \
+ ss[i % 4] ^= ss[4]; \
+ k[v(40, (4 * (i)) + 4)] = (ss[0] ^= ss[1]) ^ ss[2] ^ ss[3]; \
+ k[v(40, (4 * (i)) + 5)] = ss[1] ^ ss[3]; \
+ k[v(40, (4 * (i)) + 6)] = ss[0]; \
+ k[v(40, (4 * (i)) + 7)] = ss[1]; \
+}
+
+#else
+
+#define kdf4(k, i) \
+{ ss[0] ^= ls_box(ss[3], 3) ^ t_use(r, c)[i]; \
+ k[v(40, (4 * (i)) + 4)] = ff(ss[0]); \
+ ss[1] ^= ss[0]; k[v(40, (4 * (i)) + 5)] = ff(ss[1]); \
+ ss[2] ^= ss[1]; k[v(40, (4 * (i)) + 6)] = ff(ss[2]); \
+ ss[3] ^= ss[2]; k[v(40, (4 * (i)) + 7)] = ff(ss[3]); \
+}
+
+#define kd4(k, i) \
+{ ss[4] = ls_box(ss[3], 3) ^ t_use(r, c)[i]; \
+ ss[0] ^= ss[4]; \
+ ss[4] = ff(ss[4]); \
+ k[v(40, (4 * (i)) + 4)] = ss[4] ^= k[v(40, (4 * (i)))]; \
+ ss[1] ^= ss[0]; \
+ k[v(40, (4 * (i)) + 5)] = ss[4] ^= k[v(40, (4 * (i)) + 1)]; \
+ ss[2] ^= ss[1]; \
+ k[v(40, (4 * (i)) + 6)] = ss[4] ^= k[v(40, (4 * (i)) + 2)]; \
+ ss[3] ^= ss[2]; \
+ k[v(40, (4 * (i)) + 7)] = ss[4] ^= k[v(40, (4 * (i)) + 3)]; \
+}
+
+#define kdl4(k, i) \
+{ ss[0] ^= ls_box(ss[3], 3) ^ t_use(r, c)[i]; \
+ k[v(40, (4 * (i)) + 4)] = ss[0]; \
+ ss[1] ^= ss[0]; k[v(40, (4 * (i)) + 5)] = ss[1]; \
+ ss[2] ^= ss[1]; k[v(40, (4 * (i)) + 6)] = ss[2]; \
+ ss[3] ^= ss[2]; k[v(40, (4 * (i)) + 7)] = ss[3]; \
+}
+
+#endif
+
+static void
+aes_decrypt_key128(const unsigned char *key, uint32_t rk[])
+{
+ uint32_t ss[5];
+#if defined(d_vars)
+ d_vars;
+#endif
+ rk[v(40, (0))] = ss[0] = word_in(key, 0);
+ rk[v(40, (1))] = ss[1] = word_in(key, 1);
+ rk[v(40, (2))] = ss[2] = word_in(key, 2);
+ rk[v(40, (3))] = ss[3] = word_in(key, 3);
+
+#ifdef DEC_KS_UNROLL
+ kdf4(rk, 0); kd4(rk, 1);
+ kd4(rk, 2); kd4(rk, 3);
+ kd4(rk, 4); kd4(rk, 5);
+ kd4(rk, 6); kd4(rk, 7);
+ kd4(rk, 8); kdl4(rk, 9);
+#else
+ {
+ uint32_t i;
+ for (i = 0; i < 10; ++i)
+ k4e(rk, i);
+#if !(DEC_ROUND == NO_TABLES)
+ for (i = MAX_AES_NB; i < 10 * MAX_AES_NB; ++i)
+ rk[i] = inv_mcol(rk[i]);
+#endif
+ }
+#endif /* DEC_KS_UNROLL */
+}
+
+
+
+#define k6ef(k, i) \
+{ k[v(48, (6 * (i)) + 6)] = ss[0] ^= ls_box(ss[5], 3) ^ t_use(r, c)[i]; \
+ k[v(48, (6 * (i)) + 7)] = ss[1] ^= ss[0]; \
+ k[v(48, (6 * (i)) + 8)] = ss[2] ^= ss[1]; \
+ k[v(48, (6 * (i)) + 9)] = ss[3] ^= ss[2]; \
+}
+
+#define k6e(k, i) \
+{ k6ef(k, i); \
+ k[v(48, (6 * (i)) + 10)] = ss[4] ^= ss[3]; \
+ k[v(48, (6 * (i)) + 11)] = ss[5] ^= ss[4]; \
+}
+
+#define kdf6(k, i) \
+{ ss[0] ^= ls_box(ss[5], 3) ^ t_use(r, c)[i]; \
+ k[v(48, (6 * (i)) + 6)] = ff(ss[0]); \
+ ss[1] ^= ss[0]; k[v(48, (6 * (i)) + 7)] = ff(ss[1]); \
+ ss[2] ^= ss[1]; k[v(48, (6 * (i)) + 8)] = ff(ss[2]); \
+ ss[3] ^= ss[2]; k[v(48, (6 * (i)) + 9)] = ff(ss[3]); \
+ ss[4] ^= ss[3]; k[v(48, (6 * (i)) + 10)] = ff(ss[4]); \
+ ss[5] ^= ss[4]; k[v(48, (6 * (i)) + 11)] = ff(ss[5]); \
+}
+
+#define kd6(k, i) \
+{ ss[6] = ls_box(ss[5], 3) ^ t_use(r, c)[i]; \
+ ss[0] ^= ss[6]; ss[6] = ff(ss[6]); \
+ k[v(48, (6 * (i)) + 6)] = ss[6] ^= k[v(48, (6 * (i)))]; \
+ ss[1] ^= ss[0]; \
+ k[v(48, (6 * (i)) + 7)] = ss[6] ^= k[v(48, (6 * (i)) + 1)]; \
+ ss[2] ^= ss[1]; \
+ k[v(48, (6 * (i)) + 8)] = ss[6] ^= k[v(48, (6 * (i)) + 2)]; \
+ ss[3] ^= ss[2]; \
+ k[v(48, (6 * (i)) + 9)] = ss[6] ^= k[v(48, (6 * (i)) + 3)]; \
+ ss[4] ^= ss[3]; \
+ k[v(48, (6 * (i)) + 10)] = ss[6] ^= k[v(48, (6 * (i)) + 4)]; \
+ ss[5] ^= ss[4]; \
+ k[v(48, (6 * (i)) + 11)] = ss[6] ^= k[v(48, (6 * (i)) + 5)]; \
+}
+
+#define kdl6(k, i) \
+{ ss[0] ^= ls_box(ss[5], 3) ^ t_use(r, c)[i]; \
+ k[v(48, (6 * (i)) + 6)] = ss[0]; \
+ ss[1] ^= ss[0]; k[v(48, (6 * (i)) + 7)] = ss[1]; \
+ ss[2] ^= ss[1]; k[v(48, (6 * (i)) + 8)] = ss[2]; \
+ ss[3] ^= ss[2]; k[v(48, (6 * (i)) + 9)] = ss[3]; \
+}
+
+static void
+aes_decrypt_key192(const unsigned char *key, uint32_t rk[])
+{
+ uint32_t ss[7];
+#if defined(d_vars)
+ d_vars;
+#endif
+ rk[v(48, (0))] = ss[0] = word_in(key, 0);
+ rk[v(48, (1))] = ss[1] = word_in(key, 1);
+ rk[v(48, (2))] = ss[2] = word_in(key, 2);
+ rk[v(48, (3))] = ss[3] = word_in(key, 3);
+
+#ifdef DEC_KS_UNROLL
+ ss[4] = word_in(key, 4);
+ rk[v(48, (4))] = ff(ss[4]);
+ ss[5] = word_in(key, 5);
+ rk[v(48, (5))] = ff(ss[5]);
+ kdf6(rk, 0); kd6(rk, 1);
+ kd6(rk, 2); kd6(rk, 3);
+ kd6(rk, 4); kd6(rk, 5);
+ kd6(rk, 6); kdl6(rk, 7);
+#else
+ rk[v(48, (4))] = ss[4] = word_in(key, 4);
+ rk[v(48, (5))] = ss[5] = word_in(key, 5);
+ {
+ uint32_t i;
+
+ for (i = 0; i < 7; ++i)
+ k6e(rk, i);
+ k6ef(rk, 7);
+#if !(DEC_ROUND == NO_TABLES)
+ for (i = MAX_AES_NB; i < 12 * MAX_AES_NB; ++i)
+ rk[i] = inv_mcol(rk[i]);
+#endif
+ }
+#endif
+}
+
+
+
+#define k8ef(k, i) \
+{ k[v(56, (8 * (i)) + 8)] = ss[0] ^= ls_box(ss[7], 3) ^ t_use(r, c)[i]; \
+ k[v(56, (8 * (i)) + 9)] = ss[1] ^= ss[0]; \
+ k[v(56, (8 * (i)) + 10)] = ss[2] ^= ss[1]; \
+ k[v(56, (8 * (i)) + 11)] = ss[3] ^= ss[2]; \
+}
+
+#define k8e(k, i) \
+{ k8ef(k, i); \
+ k[v(56, (8 * (i)) + 12)] = ss[4] ^= ls_box(ss[3], 0); \
+ k[v(56, (8 * (i)) + 13)] = ss[5] ^= ss[4]; \
+ k[v(56, (8 * (i)) + 14)] = ss[6] ^= ss[5]; \
+ k[v(56, (8 * (i)) + 15)] = ss[7] ^= ss[6]; \
+}
+
+#define kdf8(k, i) \
+{ ss[0] ^= ls_box(ss[7], 3) ^ t_use(r, c)[i]; \
+ k[v(56, (8 * (i)) + 8)] = ff(ss[0]); \
+ ss[1] ^= ss[0]; k[v(56, (8 * (i)) + 9)] = ff(ss[1]); \
+ ss[2] ^= ss[1]; k[v(56, (8 * (i)) + 10)] = ff(ss[2]); \
+ ss[3] ^= ss[2]; k[v(56, (8 * (i)) + 11)] = ff(ss[3]); \
+ ss[4] ^= ls_box(ss[3], 0); k[v(56, (8 * (i)) + 12)] = ff(ss[4]); \
+ ss[5] ^= ss[4]; k[v(56, (8 * (i)) + 13)] = ff(ss[5]); \
+ ss[6] ^= ss[5]; k[v(56, (8 * (i)) + 14)] = ff(ss[6]); \
+ ss[7] ^= ss[6]; k[v(56, (8 * (i)) + 15)] = ff(ss[7]); \
+}
+
+#define kd8(k, i) \
+{ ss[8] = ls_box(ss[7], 3) ^ t_use(r, c)[i]; \
+ ss[0] ^= ss[8]; \
+ ss[8] = ff(ss[8]); \
+ k[v(56, (8 * (i)) + 8)] = ss[8] ^= k[v(56, (8 * (i)))]; \
+ ss[1] ^= ss[0]; \
+ k[v(56, (8 * (i)) + 9)] = ss[8] ^= k[v(56, (8 * (i)) + 1)]; \
+ ss[2] ^= ss[1]; \
+ k[v(56, (8 * (i)) + 10)] = ss[8] ^= k[v(56, (8 * (i)) + 2)]; \
+ ss[3] ^= ss[2]; \
+ k[v(56, (8 * (i)) + 11)] = ss[8] ^= k[v(56, (8 * (i)) + 3)]; \
+ ss[8] = ls_box(ss[3], 0); \
+ ss[4] ^= ss[8]; \
+ ss[8] = ff(ss[8]); \
+ k[v(56, (8 * (i)) + 12)] = ss[8] ^= k[v(56, (8 * (i)) + 4)]; \
+ ss[5] ^= ss[4]; \
+ k[v(56, (8 * (i)) + 13)] = ss[8] ^= k[v(56, (8 * (i)) + 5)]; \
+ ss[6] ^= ss[5]; \
+ k[v(56, (8 * (i)) + 14)] = ss[8] ^= k[v(56, (8 * (i)) + 6)]; \
+ ss[7] ^= ss[6]; \
+ k[v(56, (8 * (i)) + 15)] = ss[8] ^= k[v(56, (8 * (i)) + 7)]; \
+}
+
+#define kdl8(k, i) \
+{ ss[0] ^= ls_box(ss[7], 3) ^ t_use(r, c)[i]; \
+ k[v(56, (8 * (i)) + 8)] = ss[0]; \
+ ss[1] ^= ss[0]; k[v(56, (8 * (i)) + 9)] = ss[1]; \
+ ss[2] ^= ss[1]; k[v(56, (8 * (i)) + 10)] = ss[2]; \
+ ss[3] ^= ss[2]; k[v(56, (8 * (i)) + 11)] = ss[3]; \
+}
+
+static void
+aes_decrypt_key256(const unsigned char *key, uint32_t rk[])
+{
+ uint32_t ss[9];
+#if defined(d_vars)
+ d_vars;
+#endif
+ rk[v(56, (0))] = ss[0] = word_in(key, 0);
+ rk[v(56, (1))] = ss[1] = word_in(key, 1);
+ rk[v(56, (2))] = ss[2] = word_in(key, 2);
+ rk[v(56, (3))] = ss[3] = word_in(key, 3);
+
+#ifdef DEC_KS_UNROLL
+ ss[4] = word_in(key, 4);
+ rk[v(56, (4))] = ff(ss[4]);
+ ss[5] = word_in(key, 5);
+ rk[v(56, (5))] = ff(ss[5]);
+ ss[6] = word_in(key, 6);
+ rk[v(56, (6))] = ff(ss[6]);
+ ss[7] = word_in(key, 7);
+ rk[v(56, (7))] = ff(ss[7]);
+ kdf8(rk, 0); kd8(rk, 1);
+ kd8(rk, 2); kd8(rk, 3);
+ kd8(rk, 4); kd8(rk, 5);
+ kdl8(rk, 6);
+#else
+ rk[v(56, (4))] = ss[4] = word_in(key, 4);
+ rk[v(56, (5))] = ss[5] = word_in(key, 5);
+ rk[v(56, (6))] = ss[6] = word_in(key, 6);
+ rk[v(56, (7))] = ss[7] = word_in(key, 7);
+ {
+ uint32_t i;
+
+ for (i = 0; i < 6; ++i)
+ k8e(rk, i);
+ k8ef(rk, 6);
+#if !(DEC_ROUND == NO_TABLES)
+ for (i = MAX_AES_NB; i < 14 * MAX_AES_NB; ++i)
+ rk[i] = inv_mcol(rk[i]);
+#endif
+ }
+#endif /* DEC_KS_UNROLL */
+}
+
+
+/*
+ * Expand the cipher key into the decryption key schedule.
+ *
+ * Return the number of rounds for the given cipher key size.
+ * The size of the key schedule depends on the number of rounds
+ * (which can be computed from the size of the key), i.e. 4 * (Nr + 1).
+ *
+ * Parameters:
+ * rk AES key schedule 32-bit array to be initialized
+ * cipherKey User key
+ * keyBits AES key size (128, 192, or 256 bits)
+ */
+int
+rijndael_key_setup_dec_amd64(uint32_t rk[], const uint32_t cipherKey[],
+ int keyBits)
+{
+ switch (keyBits) {
+ case 128:
+ aes_decrypt_key128((unsigned char *)&cipherKey[0], rk);
+ return (10);
+ case 192:
+ aes_decrypt_key192((unsigned char *)&cipherKey[0], rk);
+ return (12);
+ case 256:
+ aes_decrypt_key256((unsigned char *)&cipherKey[0], rk);
+ return (14);
+ default: /* should never get here */
+ break;
+ }
+
+ return (0);
+}
diff --git a/sys/contrib/openzfs/module/icp/asm-x86_64/aes/aesopt.h b/sys/contrib/openzfs/module/icp/asm-x86_64/aes/aesopt.h
new file mode 100644
index 000000000000..472111f96e59
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/asm-x86_64/aes/aesopt.h
@@ -0,0 +1,770 @@
+/*
+ * ---------------------------------------------------------------------------
+ * Copyright (c) 1998-2007, Brian Gladman, Worcester, UK. All rights reserved.
+ *
+ * LICENSE TERMS
+ *
+ * The free distribution and use of this software is allowed (with or without
+ * changes) provided that:
+ *
+ * 1. source code distributions include the above copyright notice, this
+ * list of conditions and the following disclaimer;
+ *
+ * 2. binary distributions include the above copyright notice, this list
+ * of conditions and the following disclaimer in their documentation;
+ *
+ * 3. the name of the copyright holder is not used to endorse products
+ * built using this software without specific written permission.
+ *
+ * DISCLAIMER
+ *
+ * This software is provided 'as is' with no explicit or implied warranties
+ * in respect of its properties, including, but not limited to, correctness
+ * and/or fitness for purpose.
+ * ---------------------------------------------------------------------------
+ * Issue Date: 20/12/2007
+ *
+ * This file contains the compilation options for AES (Rijndael) and code
+ * that is common across encryption, key scheduling and table generation.
+ *
+ * OPERATION
+ *
+ * These source code files implement the AES algorithm Rijndael designed by
+ * Joan Daemen and Vincent Rijmen. This version is designed for the standard
+ * block size of 16 bytes and for key sizes of 128, 192 and 256 bits (16, 24
+ * and 32 bytes).
+ *
+ * This version is designed for flexibility and speed using operations on
+ * 32-bit words rather than operations on bytes. It can be compiled with
+ * either big or little endian internal byte order but is faster when the
+ * native byte order for the processor is used.
+ *
+ * THE CIPHER INTERFACE
+ *
+ * The cipher interface is implemented as an array of bytes in which lower
+ * AES bit sequence indexes map to higher numeric significance within bytes.
+ */
+
+/*
+ * OpenSolaris changes
+ * 1. Added __cplusplus and _AESTAB_H header guards
+ * 2. Added header files sys/types.h and aes_impl.h
+ * 3. Added defines for AES_ENCRYPT, AES_DECRYPT, AES_REV_DKS, and ASM_AMD64_C
+ * 4. Moved defines for IS_BIG_ENDIAN, IS_LITTLE_ENDIAN, PLATFORM_BYTE_ORDER
+ * from brg_endian.h
+ * 5. Undefined VIA_ACE_POSSIBLE and ASSUME_VIA_ACE_PRESENT
+ * 6. Changed uint_8t and uint_32t to uint8_t and uint32_t
+ * 7. Defined aes_sw32 as htonl() for byte swapping
+ * 8. Cstyled and hdrchk code
+ *
+ */
+
+#ifndef _AESOPT_H
+#define _AESOPT_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <sys/zfs_context.h>
+#include <aes/aes_impl.h>
+
+/* SUPPORT FEATURES */
+#define AES_ENCRYPT /* if support for encryption is needed */
+#define AES_DECRYPT /* if support for decryption is needed */
+
+/* PLATFORM-SPECIFIC FEATURES */
+#define IS_BIG_ENDIAN 4321 /* byte 0 is most significant (mc68k) */
+#define IS_LITTLE_ENDIAN 1234 /* byte 0 is least significant (i386) */
+#define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN
+#define AES_REV_DKS /* define to reverse decryption key schedule */
+
+
+/*
+ * CONFIGURATION - THE USE OF DEFINES
+ * Later in this section there are a number of defines that control the
+ * operation of the code. In each section, the purpose of each define is
+ * explained so that the relevant form can be included or excluded by
+ * setting either 1's or 0's respectively on the branches of the related
+ * #if clauses. The following local defines should not be changed.
+ */
+
+#define ENCRYPTION_IN_C 1
+#define DECRYPTION_IN_C 2
+#define ENC_KEYING_IN_C 4
+#define DEC_KEYING_IN_C 8
+
+#define NO_TABLES 0
+#define ONE_TABLE 1
+#define FOUR_TABLES 4
+#define NONE 0
+#define PARTIAL 1
+#define FULL 2
+
+/* --- START OF USER CONFIGURED OPTIONS --- */
+
+/*
+ * 1. BYTE ORDER WITHIN 32 BIT WORDS
+ *
+ * The fundamental data processing units in Rijndael are 8-bit bytes. The
+ * input, output and key input are all enumerated arrays of bytes in which
+ * bytes are numbered starting at zero and increasing to one less than the
+ * number of bytes in the array in question. This enumeration is only used
+ * for naming bytes and does not imply any adjacency or order relationship
+ * from one byte to another. When these inputs and outputs are considered
+ * as bit sequences, bits 8*n to 8*n+7 of the bit sequence are mapped to
+ * byte[n] with bit 8n+i in the sequence mapped to bit 7-i within the byte.
+ * In this implementation bits are numbered from 0 to 7 starting at the
+ * numerically least significant end of each byte. Bit n represents 2^n.
+ *
+ * However, Rijndael can be implemented more efficiently using 32-bit
+ * words by packing bytes into words so that bytes 4*n to 4*n+3 are placed
+ * into word[n]. While in principle these bytes can be assembled into words
+ * in any positions, this implementation only supports the two formats in
+ * which bytes in adjacent positions within words also have adjacent byte
+ * numbers. This order is called big-endian if the lowest numbered bytes
+ * in words have the highest numeric significance and little-endian if the
+ * opposite applies.
+ *
+ * This code can work in either order irrespective of the order used by the
+ * machine on which it runs. Normally the internal byte order will be set
+ * to the order of the processor on which the code is to be run but this
+ * define can be used to reverse this in special situations
+ *
+ * WARNING: Assembler code versions rely on PLATFORM_BYTE_ORDER being set.
+ * This define will hence be redefined later (in section 4) if necessary
+ */
+
+#if 1
+#define ALGORITHM_BYTE_ORDER PLATFORM_BYTE_ORDER
+#elif 0
+#define ALGORITHM_BYTE_ORDER IS_LITTLE_ENDIAN
+#elif 0
+#define ALGORITHM_BYTE_ORDER IS_BIG_ENDIAN
+#else
+#error The algorithm byte order is not defined
+#endif
+
+/* 2. VIA ACE SUPPORT */
+
+#if defined(__GNUC__) && defined(__i386__) || \
+ defined(_WIN32) && defined(_M_IX86) && \
+ !(defined(_WIN64) || defined(_WIN32_WCE) || \
+ defined(_MSC_VER) && (_MSC_VER <= 800))
+#define VIA_ACE_POSSIBLE
+#endif
+
+/*
+ * Define this option if support for the VIA ACE is required. This uses
+ * inline assembler instructions and is only implemented for the Microsoft,
+ * Intel and GCC compilers. If VIA ACE is known to be present, then defining
+ * ASSUME_VIA_ACE_PRESENT will remove the ordinary encryption/decryption
+ * code. If USE_VIA_ACE_IF_PRESENT is defined then VIA ACE will be used if
+ * it is detected (both present and enabled) but the normal AES code will
+ * also be present.
+ *
+ * When VIA ACE is to be used, all AES encryption contexts MUST be 16 byte
+ * aligned; other input/output buffers do not need to be 16 byte aligned
+ * but there are very large performance gains if this can be arranged.
+ * VIA ACE also requires the decryption key schedule to be in reverse
+ * order (which later checks below ensure).
+ */
+
+/* VIA ACE is not used here for OpenSolaris: */
+#undef VIA_ACE_POSSIBLE
+#undef ASSUME_VIA_ACE_PRESENT
+
+#if 0 && defined(VIA_ACE_POSSIBLE) && !defined(USE_VIA_ACE_IF_PRESENT)
+#define USE_VIA_ACE_IF_PRESENT
+#endif
+
+#if 0 && defined(VIA_ACE_POSSIBLE) && !defined(ASSUME_VIA_ACE_PRESENT)
+#define ASSUME_VIA_ACE_PRESENT
+#endif
+
+
+/*
+ * 3. ASSEMBLER SUPPORT
+ *
+ * This define (which can be on the command line) enables the use of the
+ * assembler code routines for encryption, decryption and key scheduling
+ * as follows:
+ *
+ * ASM_X86_V1C uses the assembler (aes_x86_v1.asm) with large tables for
+ *     encryption and decryption but with key scheduling in C
+ * ASM_X86_V2 uses assembler (aes_x86_v2.asm) with compressed tables for
+ * encryption, decryption and key scheduling
+ * ASM_X86_V2C uses assembler (aes_x86_v2.asm) with compressed tables for
+ *     encryption and decryption but with key scheduling in C
+ * ASM_AMD64_C uses assembler (aes_amd64.asm) with compressed tables for
+ *     encryption and decryption but with key scheduling in C
+ *
+ * Change one 'if 0' below to 'if 1' to select the version or define
+ * as a compilation option.
+ */
+
+#if 0 && !defined(ASM_X86_V1C)
+#define ASM_X86_V1C
+#elif 0 && !defined(ASM_X86_V2)
+#define ASM_X86_V2
+#elif 0 && !defined(ASM_X86_V2C)
+#define ASM_X86_V2C
+#elif 1 && !defined(ASM_AMD64_C)
+#define ASM_AMD64_C
+#endif
+
+#if (defined(ASM_X86_V1C) || defined(ASM_X86_V2) || defined(ASM_X86_V2C)) && \
+ !defined(_M_IX86) || defined(ASM_AMD64_C) && !defined(_M_X64) && \
+ !defined(__amd64)
+#error Assembler code is only available for x86 and AMD64 systems
+#endif
+
+/*
+ * 4. FAST INPUT/OUTPUT OPERATIONS.
+ *
+ * On some machines it is possible to improve speed by transferring the
+ * bytes in the input and output arrays to and from the internal 32-bit
+ * variables by addressing these arrays as if they are arrays of 32-bit
+ * words. On some machines this will always be possible but there may
+ * be a large performance penalty if the byte arrays are not aligned on
+ * the normal word boundaries. On other machines this technique will
+ * lead to memory access errors when such 32-bit word accesses are not
+ * properly aligned. The option SAFE_IO avoids such problems but will
+ * often be slower on those machines that support misaligned access
+ * (especially so if care is taken to align the input and output byte
+ * arrays on 32-bit word boundaries). If SAFE_IO is not defined it is
+ * assumed that access to byte arrays as if they are arrays of 32-bit
+ * words will not cause problems when such accesses are misaligned.
+ */
+#if 1 && !defined(_MSC_VER)
+#define SAFE_IO
+#endif
+
+/*
+ * 5. LOOP UNROLLING
+ *
+ * The code for encryption and decryption cycles through a number of rounds
+ * that can be implemented either in a loop or by expanding the code into a
+ * long sequence of instructions, the latter producing a larger program but
+ * one that will often be much faster. The latter is called loop unrolling.
+ * There are also potential speed advantages in expanding two iterations in
+ * a loop with half the number of iterations, which is called partial loop
+ * unrolling. The following options allow partial or full loop unrolling
+ * to be set independently for encryption and decryption
+ */
+#if 1
+#define ENC_UNROLL FULL
+#elif 0
+#define ENC_UNROLL PARTIAL
+#else
+#define ENC_UNROLL NONE
+#endif
+
+#if 1
+#define DEC_UNROLL FULL
+#elif 0
+#define DEC_UNROLL PARTIAL
+#else
+#define DEC_UNROLL NONE
+#endif
+
+#if 1
+#define ENC_KS_UNROLL
+#endif
+
+#if 1
+#define DEC_KS_UNROLL
+#endif
+
+/*
+ * 6. FAST FINITE FIELD OPERATIONS
+ *
+ * If this section is included, tables are used to provide faster finite
+ * field arithmetic. This has no effect if FIXED_TABLES is defined.
+ */
+#if 1
+#define FF_TABLES
+#endif
+
+/*
+ * 7. INTERNAL STATE VARIABLE FORMAT
+ *
+ * The internal state of Rijndael is stored in a number of local 32-bit
+ * word variables which can be defined either as an array or as individual
+ * named variables. Include this section if you want to store these local
+ * variables in arrays. Otherwise individual local variables will be used.
+ */
+#if 1
+#define ARRAYS
+#endif
+
+/*
+ * 8. FIXED OR DYNAMIC TABLES
+ *
+ * When this section is included the tables used by the code are compiled
+ * statically into the binary file. Otherwise the subroutine aes_init()
+ * must be called to compute them before the code is first used.
+ */
+#if 1 && !(defined(_MSC_VER) && (_MSC_VER <= 800))
+#define FIXED_TABLES
+#endif
+
+/*
+ * 9. MASKING OR CASTING FROM LONGER VALUES TO BYTES
+ *
+ * In some systems it is better to mask longer values to extract bytes
+ * rather than using a cast. This option allows this choice.
+ */
+#if 0
+#define to_byte(x) ((uint8_t)(x))
+#else
+#define to_byte(x) ((x) & 0xff)
+#endif
+
+/*
+ * 10. TABLE ALIGNMENT
+ *
+ * On some systems speed will be improved by aligning the AES large lookup
+ * tables on particular boundaries. This define should be set to a power of
+ * two giving the desired alignment. It can be left undefined if alignment
+ * is not needed. This option is specific to the Microsoft VC++ compiler -
+ * it seems to sometimes cause trouble for the VC++ version 6 compiler.
+ */
+
+#if 1 && defined(_MSC_VER) && (_MSC_VER >= 1300)
+#define TABLE_ALIGN 32
+#endif
+
+/*
+ * 11. REDUCE CODE AND TABLE SIZE
+ *
+ * This replaces some expanded macros with function calls if ASM_X86_V2 or
+ * ASM_X86_V2C is defined
+ */
+
+#if 1 && (defined(ASM_X86_V2) || defined(ASM_X86_V2C))
+#define REDUCE_CODE_SIZE
+#endif
+
+/*
+ * 12. TABLE OPTIONS
+ *
+ * This cipher proceeds by repeating a number of cycles known as rounds,
+ * which are implemented by a round function that can optionally be sped
+ * up using tables.  The basic tables are 256 32-bit words, with either
+ * one or four tables being required for each round function depending on
+ * how much speed is required.  The encryption and decryption round functions
+ * are different, and the last encryption and decryption round functions are
+ * different again, making four different round functions in all.
+ *
+ * This means that:
+ * 1. Normal encryption and decryption rounds can each use either 0, 1
+ * or 4 tables and table spaces of 0, 1024 or 4096 bytes each.
+ * 2. The last encryption and decryption rounds can also use either 0, 1
+ * or 4 tables and table spaces of 0, 1024 or 4096 bytes each.
+ *
+ * Include or exclude the appropriate definitions below to set the number
+ * of tables used by this implementation.
+ */
+
+#if 1 /* set tables for the normal encryption round */
+#define ENC_ROUND FOUR_TABLES
+#elif 0
+#define ENC_ROUND ONE_TABLE
+#else
+#define ENC_ROUND NO_TABLES
+#endif
+
+#if 1 /* set tables for the last encryption round */
+#define LAST_ENC_ROUND FOUR_TABLES
+#elif 0
+#define LAST_ENC_ROUND ONE_TABLE
+#else
+#define LAST_ENC_ROUND NO_TABLES
+#endif
+
+#if 1 /* set tables for the normal decryption round */
+#define DEC_ROUND FOUR_TABLES
+#elif 0
+#define DEC_ROUND ONE_TABLE
+#else
+#define DEC_ROUND NO_TABLES
+#endif
+
+#if 1 /* set tables for the last decryption round */
+#define LAST_DEC_ROUND FOUR_TABLES
+#elif 0
+#define LAST_DEC_ROUND ONE_TABLE
+#else
+#define LAST_DEC_ROUND NO_TABLES
+#endif
+
+/*
+ * The decryption key schedule can be speeded up with tables in the same
+ * way that the round functions can. Include or exclude the following
+ * defines to set this requirement.
+ */
+#if 1
+#define KEY_SCHED FOUR_TABLES
+#elif 0
+#define KEY_SCHED ONE_TABLE
+#else
+#define KEY_SCHED NO_TABLES
+#endif
+
+/* ---- END OF USER CONFIGURED OPTIONS ---- */
+
+/* VIA ACE support is only available for VC++ and GCC */
+
+#if !defined(_MSC_VER) && !defined(__GNUC__)
+#if defined(ASSUME_VIA_ACE_PRESENT)
+#undef ASSUME_VIA_ACE_PRESENT
+#endif
+#if defined(USE_VIA_ACE_IF_PRESENT)
+#undef USE_VIA_ACE_IF_PRESENT
+#endif
+#endif
+
+#if defined(ASSUME_VIA_ACE_PRESENT) && !defined(USE_VIA_ACE_IF_PRESENT)
+#define USE_VIA_ACE_IF_PRESENT
+#endif
+
+#if defined(USE_VIA_ACE_IF_PRESENT) && !defined(AES_REV_DKS)
+#define AES_REV_DKS
+#endif
+
+/* Assembler support requires the use of platform byte order */
+
+#if (defined(ASM_X86_V1C) || defined(ASM_X86_V2C) || defined(ASM_AMD64_C)) && \
+ (ALGORITHM_BYTE_ORDER != PLATFORM_BYTE_ORDER)
+#undef ALGORITHM_BYTE_ORDER
+#define ALGORITHM_BYTE_ORDER PLATFORM_BYTE_ORDER
+#endif
+
+/*
+ * In this implementation the columns of the state array are each held in
+ * 32-bit words. The state array can be held in various ways: in an array
+ * of words, in a number of individual word variables or in a number of
+ * processor registers. The following define maps a variable name x and
+ * a column number c to the way the state array variable is to be held.
+ * The first define below maps the state into an array x[c] whereas the
+ * second form maps the state into a number of individual variables x0,
+ * x1, etc. Another form could map individual state columns to machine
+ * register names.
+ */
+
+#if defined(ARRAYS)
+#define s(x, c) x[c]
+#else
+#define s(x, c) x##c
+#endif
+
+/*
+ * This implementation provides subroutines for encryption, decryption
+ * and for setting the three key lengths (separately) for encryption
+ * and decryption. Since not all functions are needed, masks are set
+ * up here to determine which will be implemented in C.
+ */
+
+#if !defined(AES_ENCRYPT)
+#define EFUNCS_IN_C 0
+#elif defined(ASSUME_VIA_ACE_PRESENT) || defined(ASM_X86_V1C) || \
+ defined(ASM_X86_V2C) || defined(ASM_AMD64_C)
+#define EFUNCS_IN_C ENC_KEYING_IN_C
+#elif !defined(ASM_X86_V2)
+#define EFUNCS_IN_C (ENCRYPTION_IN_C | ENC_KEYING_IN_C)
+#else
+#define EFUNCS_IN_C 0
+#endif
+
+#if !defined(AES_DECRYPT)
+#define DFUNCS_IN_C 0
+#elif defined(ASSUME_VIA_ACE_PRESENT) || defined(ASM_X86_V1C) || \
+ defined(ASM_X86_V2C) || defined(ASM_AMD64_C)
+#define DFUNCS_IN_C DEC_KEYING_IN_C
+#elif !defined(ASM_X86_V2)
+#define DFUNCS_IN_C (DECRYPTION_IN_C | DEC_KEYING_IN_C)
+#else
+#define DFUNCS_IN_C 0
+#endif
+
+#define FUNCS_IN_C (EFUNCS_IN_C | DFUNCS_IN_C)
+
+/* END OF CONFIGURATION OPTIONS */
+
+/* Disable or report errors on some combinations of options */
+
+#if ENC_ROUND == NO_TABLES && LAST_ENC_ROUND != NO_TABLES
+#undef LAST_ENC_ROUND
+#define LAST_ENC_ROUND NO_TABLES
+#elif ENC_ROUND == ONE_TABLE && LAST_ENC_ROUND == FOUR_TABLES
+#undef LAST_ENC_ROUND
+#define LAST_ENC_ROUND ONE_TABLE
+#endif
+
+#if ENC_ROUND == NO_TABLES && ENC_UNROLL != NONE
+#undef ENC_UNROLL
+#define ENC_UNROLL NONE
+#endif
+
+#if DEC_ROUND == NO_TABLES && LAST_DEC_ROUND != NO_TABLES
+#undef LAST_DEC_ROUND
+#define LAST_DEC_ROUND NO_TABLES
+#elif DEC_ROUND == ONE_TABLE && LAST_DEC_ROUND == FOUR_TABLES
+#undef LAST_DEC_ROUND
+#define LAST_DEC_ROUND ONE_TABLE
+#endif
+
+#if DEC_ROUND == NO_TABLES && DEC_UNROLL != NONE
+#undef DEC_UNROLL
+#define DEC_UNROLL NONE
+#endif
+
+#if (ALGORITHM_BYTE_ORDER == IS_LITTLE_ENDIAN)
+#define aes_sw32 htonl
+#elif defined(bswap32)
+#define aes_sw32 bswap32
+#elif defined(bswap_32)
+#define aes_sw32 bswap_32
+#else
+#define brot(x, n) (((uint32_t)(x) << (n)) | ((uint32_t)(x) >> (32 - (n))))
+#define aes_sw32(x) ((brot((x), 8) & 0x00ff00ff) | (brot((x), 24) & 0xff00ff00))
+#endif
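
For reference, the rotate-based fallback above is simply a 32-bit byte swap built from two rotations and two masks. The following minimal, standalone C sketch (illustrative only, not part of the patch) copies those two macros and checks the behaviour:

#include <assert.h>
#include <stdint.h>

/* Same fallback definitions as above: brot() rotates left by n bits. */
#define brot(x, n) (((uint32_t)(x) << (n)) | ((uint32_t)(x) >> (32 - (n))))
#define aes_sw32(x) ((brot((x), 8) & 0x00ff00ff) | (brot((x), 24) & 0xff00ff00))

int
main(void)
{
        uint32_t v = 0x11223344;

        assert(aes_sw32(v) == 0x44332211);      /* bytes are reversed */
        assert(aes_sw32(aes_sw32(v)) == v);     /* swapping twice is a no-op */
        return (0);
}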
+
+
+/*
+ * upr(x, n): rotates bytes within words by n positions, moving bytes to
+ * higher index positions with wrap around into low positions
+ * ups(x, n): moves bytes by n positions to higher index positions in
+ * words but without wrap around
+ * bval(x, n): extracts a byte from a word
+ *
+ * WARNING: The definitions given here are intended only for use with
+ * unsigned variables and with shift counts that are compile
+ * time constants
+ */
+
+#if (ALGORITHM_BYTE_ORDER == IS_LITTLE_ENDIAN)
+#define upr(x, n) (((uint32_t)(x) << (8 * (n))) | \
+ ((uint32_t)(x) >> (32 - 8 * (n))))
+#define ups(x, n) ((uint32_t)(x) << (8 * (n)))
+#define bval(x, n) to_byte((x) >> (8 * (n)))
+#define bytes2word(b0, b1, b2, b3) \
+ (((uint32_t)(b3) << 24) | ((uint32_t)(b2) << 16) | \
+ ((uint32_t)(b1) << 8) | (b0))
+#endif
+
+#if (ALGORITHM_BYTE_ORDER == IS_BIG_ENDIAN)
+#define upr(x, n) (((uint32_t)(x) >> (8 * (n))) | \
+ ((uint32_t)(x) << (32 - 8 * (n))))
+#define ups(x, n) ((uint32_t)(x) >> (8 * (n)))
+#define bval(x, n) to_byte((x) >> (24 - 8 * (n)))
+#define bytes2word(b0, b1, b2, b3) \
+ (((uint32_t)(b0) << 24) | ((uint32_t)(b1) << 16) | \
+ ((uint32_t)(b2) << 8) | (b3))
+#endif
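
In the little-endian variants above, upr() is a rotation of whole bytes towards higher positions and bval() extracts byte n. A small standalone sketch (illustrative only, not part of the patch; to_byte() is assumed to be the usual cast to uint8_t defined earlier in aesopt.h) shows the intended behaviour:

#include <assert.h>
#include <stdint.h>

#define to_byte(x) ((uint8_t)(x))               /* assumed helper */
#define upr(x, n) (((uint32_t)(x) << (8 * (n))) | \
        ((uint32_t)(x) >> (32 - 8 * (n))))
#define bval(x, n) to_byte((x) >> (8 * (n)))
#define bytes2word(b0, b1, b2, b3) \
        (((uint32_t)(b3) << 24) | ((uint32_t)(b2) << 16) | \
        ((uint32_t)(b1) << 8) | (b0))

int
main(void)
{
        uint32_t w = bytes2word(0x44, 0x33, 0x22, 0x11);  /* 0x11223344 */

        assert(bval(w, 0) == 0x44);             /* byte 0 is the low byte */
        assert(bval(w, 3) == 0x11);             /* byte 3 is the high byte */
        assert(upr(w, 1) == 0x22334411);        /* bytes rotated up by one */
        return (0);
}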
+
+#if defined(SAFE_IO)
+#define word_in(x, c) bytes2word(((const uint8_t *)(x) + 4 * c)[0], \
+ ((const uint8_t *)(x) + 4 * c)[1], \
+ ((const uint8_t *)(x) + 4 * c)[2], \
+ ((const uint8_t *)(x) + 4 * c)[3])
+#define word_out(x, c, v) { ((uint8_t *)(x) + 4 * c)[0] = bval(v, 0); \
+ ((uint8_t *)(x) + 4 * c)[1] = bval(v, 1); \
+ ((uint8_t *)(x) + 4 * c)[2] = bval(v, 2); \
+ ((uint8_t *)(x) + 4 * c)[3] = bval(v, 3); }
+#elif (ALGORITHM_BYTE_ORDER == PLATFORM_BYTE_ORDER)
+#define word_in(x, c) (*((uint32_t *)(x) + (c)))
+#define word_out(x, c, v) (*((uint32_t *)(x) + (c)) = (v))
+#else
+#define word_in(x, c) aes_sw32(*((uint32_t *)(x) + (c)))
+#define word_out(x, c, v) (*((uint32_t *)(x) + (c)) = aes_sw32(v))
+#endif
+
+/* the finite field modular polynomial and elements */
+
+#define WPOLY 0x011b
+#define BPOLY 0x1b
+
+/* multiply four bytes in GF(2^8) by 'x' {02} in parallel */
+
+#define m1 0x80808080
+#define m2 0x7f7f7f7f
+#define gf_mulx(x) ((((x) & m2) << 1) ^ ((((x) & m1) >> 7) * BPOLY))
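
gf_mulx() doubles each of the four bytes packed in a 32-bit word in GF(2^8) modulo the AES polynomial (the per-byte "xtime" operation); the m1/m2 masks keep the carry of one byte lane from spilling into the next. A standalone sketch (illustrative only, not part of the patch; the xtime() reference helper is introduced here just for comparison) checks one word against a per-byte reference:

#include <assert.h>
#include <stdint.h>

#define BPOLY 0x1b
#define m1 0x80808080
#define m2 0x7f7f7f7f
#define gf_mulx(x) ((((x) & m2) << 1) ^ ((((x) & m1) >> 7) * BPOLY))

/* Reference: multiply one byte by {02} in GF(2^8) mod x^8+x^4+x^3+x+1. */
static uint8_t
xtime(uint8_t b)
{
        return ((uint8_t)((b << 1) ^ ((b & 0x80) ? BPOLY : 0)));
}

int
main(void)
{
        uint32_t x = 0x80402001;        /* byte lanes 0x01, 0x20, 0x40, 0x80 */
        uint32_t y = gf_mulx(x);

        /* Each lane is doubled independently: 0x02, 0x40, 0x80, 0x1b. */
        assert(y == 0x1b804002);
        assert((uint8_t)y == xtime(0x01));
        assert((uint8_t)(y >> 24) == xtime(0x80));
        return (0);
}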
+
+/*
+ * The following defines provide alternative definitions of gf_mulx that might
+ * give improved performance if a fast 32-bit multiply is not available. Note
+ * that a temporary variable u needs to be defined where gf_mulx is used.
+ *
+ * #define gf_mulx(x) (u = (x) & m1, u |= (u >> 1), ((x) & m2) << 1) ^ \
+ * ((u >> 3) | (u >> 6))
+ * #define m4 (0x01010101 * BPOLY)
+ * #define gf_mulx(x) (u = (x) & m1, ((x) & m2) << 1) ^ ((u - (u >> 7)) \
+ * & m4)
+ */
+
+/* Work out which tables are needed for the different options */
+
+#if defined(ASM_X86_V1C)
+#if defined(ENC_ROUND)
+#undef ENC_ROUND
+#endif
+#define ENC_ROUND FOUR_TABLES
+#if defined(LAST_ENC_ROUND)
+#undef LAST_ENC_ROUND
+#endif
+#define LAST_ENC_ROUND FOUR_TABLES
+#if defined(DEC_ROUND)
+#undef DEC_ROUND
+#endif
+#define DEC_ROUND FOUR_TABLES
+#if defined(LAST_DEC_ROUND)
+#undef LAST_DEC_ROUND
+#endif
+#define LAST_DEC_ROUND FOUR_TABLES
+#if defined(KEY_SCHED)
+#undef KEY_SCHED
+#define KEY_SCHED FOUR_TABLES
+#endif
+#endif
+
+#if (FUNCS_IN_C & ENCRYPTION_IN_C) || defined(ASM_X86_V1C)
+#if ENC_ROUND == ONE_TABLE
+#define FT1_SET
+#elif ENC_ROUND == FOUR_TABLES
+#define FT4_SET
+#else
+#define SBX_SET
+#endif
+#if LAST_ENC_ROUND == ONE_TABLE
+#define FL1_SET
+#elif LAST_ENC_ROUND == FOUR_TABLES
+#define FL4_SET
+#elif !defined(SBX_SET)
+#define SBX_SET
+#endif
+#endif
+
+#if (FUNCS_IN_C & DECRYPTION_IN_C) || defined(ASM_X86_V1C)
+#if DEC_ROUND == ONE_TABLE
+#define IT1_SET
+#elif DEC_ROUND == FOUR_TABLES
+#define IT4_SET
+#else
+#define ISB_SET
+#endif
+#if LAST_DEC_ROUND == ONE_TABLE
+#define IL1_SET
+#elif LAST_DEC_ROUND == FOUR_TABLES
+#define IL4_SET
+#elif !defined(ISB_SET)
+#define ISB_SET
+#endif
+#endif
+
+
+#if !(defined(REDUCE_CODE_SIZE) && (defined(ASM_X86_V2) || \
+ defined(ASM_X86_V2C)))
+#if ((FUNCS_IN_C & ENC_KEYING_IN_C) || (FUNCS_IN_C & DEC_KEYING_IN_C))
+#if KEY_SCHED == ONE_TABLE
+#if !defined(FL1_SET) && !defined(FL4_SET)
+#define LS1_SET
+#endif
+#elif KEY_SCHED == FOUR_TABLES
+#if !defined(FL4_SET)
+#define LS4_SET
+#endif
+#elif !defined(SBX_SET)
+#define SBX_SET
+#endif
+#endif
+#if (FUNCS_IN_C & DEC_KEYING_IN_C)
+#if KEY_SCHED == ONE_TABLE
+#define IM1_SET
+#elif KEY_SCHED == FOUR_TABLES
+#define IM4_SET
+#elif !defined(SBX_SET)
+#define SBX_SET
+#endif
+#endif
+#endif
+
+/* generic definitions of Rijndael macros that use tables */
+
+#define no_table(x, box, vf, rf, c) bytes2word(\
+ box[bval(vf(x, 0, c), rf(0, c))], \
+ box[bval(vf(x, 1, c), rf(1, c))], \
+ box[bval(vf(x, 2, c), rf(2, c))], \
+ box[bval(vf(x, 3, c), rf(3, c))])
+
+#define one_table(x, op, tab, vf, rf, c) \
+ (tab[bval(vf(x, 0, c), rf(0, c))] \
+ ^ op(tab[bval(vf(x, 1, c), rf(1, c))], 1) \
+ ^ op(tab[bval(vf(x, 2, c), rf(2, c))], 2) \
+ ^ op(tab[bval(vf(x, 3, c), rf(3, c))], 3))
+
+#define four_tables(x, tab, vf, rf, c) \
+ (tab[0][bval(vf(x, 0, c), rf(0, c))] \
+ ^ tab[1][bval(vf(x, 1, c), rf(1, c))] \
+ ^ tab[2][bval(vf(x, 2, c), rf(2, c))] \
+ ^ tab[3][bval(vf(x, 3, c), rf(3, c))])
+
+#define vf1(x, r, c) (x)
+#define rf1(r, c) (r)
+#define rf2(r, c) ((8+r-c)&3)
+
+/*
+ * Perform forward and inverse column mix operation on four bytes in long word
+ * x in parallel. NOTE: x must be a simple variable, NOT an expression in
+ * these macros.
+ */
+
+#if !(defined(REDUCE_CODE_SIZE) && (defined(ASM_X86_V2) || \
+ defined(ASM_X86_V2C)))
+
+#if defined(FM4_SET) /* not currently used */
+#define fwd_mcol(x) four_tables(x, t_use(f, m), vf1, rf1, 0)
+#elif defined(FM1_SET) /* not currently used */
+#define fwd_mcol(x) one_table(x, upr, t_use(f, m), vf1, rf1, 0)
+#else
+#define dec_fmvars uint32_t g2
+#define fwd_mcol(x) (g2 = gf_mulx(x), g2 ^ upr((x) ^ g2, 3) ^ \
+ upr((x), 2) ^ upr((x), 1))
+#endif
+
+#if defined(IM4_SET)
+#define inv_mcol(x) four_tables(x, t_use(i, m), vf1, rf1, 0)
+#elif defined(IM1_SET)
+#define inv_mcol(x) one_table(x, upr, t_use(i, m), vf1, rf1, 0)
+#else
+#define dec_imvars uint32_t g2, g4, g9
+#define inv_mcol(x) (g2 = gf_mulx(x), g4 = gf_mulx(g2), g9 = \
+ (x) ^ gf_mulx(g4), g4 ^= g9, \
+ (x) ^ g2 ^ g4 ^ upr(g2 ^ g9, 3) ^ \
+ upr(g4, 2) ^ upr(g9, 1))
+#endif
+
+#if defined(FL4_SET)
+#define ls_box(x, c) four_tables(x, t_use(f, l), vf1, rf2, c)
+#elif defined(LS4_SET)
+#define ls_box(x, c) four_tables(x, t_use(l, s), vf1, rf2, c)
+#elif defined(FL1_SET)
+#define ls_box(x, c) one_table(x, upr, t_use(f, l), vf1, rf2, c)
+#elif defined(LS1_SET)
+#define ls_box(x, c) one_table(x, upr, t_use(l, s), vf1, rf2, c)
+#else
+#define ls_box(x, c) no_table(x, t_use(s, box), vf1, rf2, c)
+#endif
+
+#endif
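
When no mixing tables are configured, the fwd_mcol()/inv_mcol() fallbacks above compute the AES MixColumns/InvMixColumns transform on one packed column using gf_mulx() and byte rotations. The following standalone sketch (illustrative only, not part of the patch) assumes the little-endian upr() variant and checks the widely used MixColumns test column db 13 53 45 -> 8e 4d a1 bc:

#include <assert.h>
#include <stdint.h>

#define BPOLY 0x1b
#define m1 0x80808080
#define m2 0x7f7f7f7f
#define gf_mulx(x) ((((x) & m2) << 1) ^ ((((x) & m1) >> 7) * BPOLY))
#define upr(x, n) (((uint32_t)(x) << (8 * (n))) | \
        ((uint32_t)(x) >> (32 - 8 * (n))))
#define fwd_mcol(x) (g2 = gf_mulx(x), g2 ^ upr((x) ^ g2, 3) ^ \
        upr((x), 2) ^ upr((x), 1))

int
main(void)
{
        uint32_t g2;                    /* temporary required by fwd_mcol() */
        uint32_t col = 0x455313db;      /* bytes db 13 53 45, low byte first */

        assert(fwd_mcol(col) == 0xbca14d8e);    /* bytes 8e 4d a1 bc */
        return (0);
}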
+
+#if defined(ASM_X86_V1C) && defined(AES_DECRYPT) && !defined(ISB_SET)
+#define ISB_SET
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _AESOPT_H */
diff --git a/sys/contrib/openzfs/module/icp/asm-x86_64/aes/aestab.h b/sys/contrib/openzfs/module/icp/asm-x86_64/aes/aestab.h
new file mode 100644
index 000000000000..33cdb6c6f9fe
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/asm-x86_64/aes/aestab.h
@@ -0,0 +1,165 @@
+/*
+ * ---------------------------------------------------------------------------
+ * Copyright (c) 1998-2007, Brian Gladman, Worcester, UK. All rights reserved.
+ *
+ * LICENSE TERMS
+ *
+ * The free distribution and use of this software is allowed (with or without
+ * changes) provided that:
+ *
+ * 1. source code distributions include the above copyright notice, this
+ * list of conditions and the following disclaimer;
+ *
+ * 2. binary distributions include the above copyright notice, this list
+ * of conditions and the following disclaimer in their documentation;
+ *
+ * 3. the name of the copyright holder is not used to endorse products
+ * built using this software without specific written permission.
+ *
+ * DISCLAIMER
+ *
+ * This software is provided 'as is' with no explicit or implied warranties
+ * in respect of its properties, including, but not limited to, correctness
+ * and/or fitness for purpose.
+ * ---------------------------------------------------------------------------
+ * Issue Date: 20/12/2007
+ *
+ * This file contains the code for declaring the tables needed to implement
+ * AES. The file aesopt.h is assumed to be included before this header file.
+ * If there are no global variables, the definitions here can be used to put
+ * the AES tables in a structure so that a pointer can then be added to the
+ * AES context to pass them to the AES routines that need them. If this
+ * facility is used, the calling program has to ensure that this pointer is
+ * managed appropriately. In particular, the value of the t_dec(in, it) item
+ * in the table structure must be set to zero in order to ensure that the
+ * tables are initialised. In practice the three code sequences in aeskey.c
+ * that control the calls to aes_init() and the aes_init() routine itself will
+ * have to be changed for a specific implementation. If global variables are
+ * available it will generally be preferable to use them with the precomputed
+ * FIXED_TABLES option that uses static global tables.
+ *
+ * The following defines can be used to control the way the tables
+ * are defined, initialised and used in embedded environments that
+ * require special features for these purposes:
+ *
+ * the 't_dec' construction is used to declare fixed table arrays
+ * the 't_set' construction is used to set fixed table values
+ * the 't_use' construction is used to access fixed table values
+ *
+ * 256 byte tables:
+ *
+ * t_xxx(s, box) => forward S box
+ * t_xxx(i, box) => inverse S box
+ *
+ * 256 32-bit word OR 4 x 256 32-bit word tables:
+ *
+ * t_xxx(f, n) => forward normal round
+ * t_xxx(f, l) => forward last round
+ * t_xxx(i, n) => inverse normal round
+ * t_xxx(i, l) => inverse last round
+ * t_xxx(l, s) => key schedule table
+ * t_xxx(i, m) => key schedule table
+ *
+ * Other variables and tables:
+ *
+ * t_xxx(r, c) => the rcon table
+ */
+
+/*
+ * OpenSolaris OS modifications
+ *
+ * 1. Added __cplusplus and _AESTAB_H header guards
+ * 2. Added header file sys/types.h
+ * 3. Removed code defined for _MSC_VER
+ * 4. Changed all variables to "static const"
+ * 5. Changed uint_8t and uint_32t to uint8_t and uint32_t
+ * 6. Cstyled and hdrchk code
+ */
+
+#ifndef _AESTAB_H
+#define _AESTAB_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <sys/types.h>
+
+#define t_dec(m, n) t_##m##n
+#define t_set(m, n) t_##m##n
+#define t_use(m, n) t_##m##n
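
The three constructions are plain token-pasting macros, so the same generated identifier is used whether a table is being declared, initialised or accessed (for example, t_dec(f, n) and t_use(f, n) both expand to t_fn). A tiny standalone illustration (not part of the patch; the four-entry table here is a toy stand-in for the real 256-entry S box):

#include <assert.h>
#include <stdint.h>

#define t_dec(m, n) t_##m##n
#define t_use(m, n) t_##m##n

/* t_dec(s, box) expands to the identifier t_sbox. */
static const uint8_t t_dec(s, box)[4] = { 0x63, 0x7c, 0x77, 0x7b };

int
main(void)
{
        /* t_use(s, box) names the same array declared above. */
        assert(t_use(s, box)[0] == 0x63);
        return (0);
}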
+
+#if defined(DO_TABLES) && defined(FIXED_TABLES)
+#define d_1(t, n, b, e) static const t n[256] = b(e)
+#define d_4(t, n, b, e, f, g, h) static const t n[4][256] = \
+ {b(e), b(f), b(g), b(h)}
+static const uint32_t t_dec(r, c)[RC_LENGTH] = rc_data(w0);
+#else
+#define d_1(t, n, b, e) static const t n[256]
+#define d_4(t, n, b, e, f, g, h) static const t n[4][256]
+static const uint32_t t_dec(r, c)[RC_LENGTH];
+#endif
+
+#if defined(SBX_SET)
+ d_1(uint8_t, t_dec(s, box), sb_data, h0);
+#endif
+#if defined(ISB_SET)
+ d_1(uint8_t, t_dec(i, box), isb_data, h0);
+#endif
+
+#if defined(FT1_SET)
+ d_1(uint32_t, t_dec(f, n), sb_data, u0);
+#endif
+#if defined(FT4_SET)
+ d_4(uint32_t, t_dec(f, n), sb_data, u0, u1, u2, u3);
+#endif
+
+#if defined(FL1_SET)
+ d_1(uint32_t, t_dec(f, l), sb_data, w0);
+#endif
+#if defined(FL4_SET)
+ d_4(uint32_t, t_dec(f, l), sb_data, w0, w1, w2, w3);
+#endif
+
+#if defined(IT1_SET)
+ d_1(uint32_t, t_dec(i, n), isb_data, v0);
+#endif
+#if defined(IT4_SET)
+ d_4(uint32_t, t_dec(i, n), isb_data, v0, v1, v2, v3);
+#endif
+
+#if defined(IL1_SET)
+ d_1(uint32_t, t_dec(i, l), isb_data, w0);
+#endif
+#if defined(IL4_SET)
+ d_4(uint32_t, t_dec(i, l), isb_data, w0, w1, w2, w3);
+#endif
+
+#if defined(LS1_SET)
+#if defined(FL1_SET)
+#undef LS1_SET
+#else
+ d_1(uint32_t, t_dec(l, s), sb_data, w0);
+#endif
+#endif
+
+#if defined(LS4_SET)
+#if defined(FL4_SET)
+#undef LS4_SET
+#else
+ d_4(uint32_t, t_dec(l, s), sb_data, w0, w1, w2, w3);
+#endif
+#endif
+
+#if defined(IM1_SET)
+ d_1(uint32_t, t_dec(i, m), mm_data, v0);
+#endif
+#if defined(IM4_SET)
+ d_4(uint32_t, t_dec(i, m), mm_data, v0, v1, v2, v3);
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _AESTAB_H */
diff --git a/sys/contrib/openzfs/module/icp/asm-x86_64/aes/aestab2.h b/sys/contrib/openzfs/module/icp/asm-x86_64/aes/aestab2.h
new file mode 100644
index 000000000000..eb13f72b10d8
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/asm-x86_64/aes/aestab2.h
@@ -0,0 +1,594 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _AESTAB2_H
+#define _AESTAB2_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * To create this file for OpenSolaris:
+ * 1. Compile and run tablegen.c, from aes-src-04-03-08.zip,
+ * after defining ASM_AMD64_C
+ * 2. mv aestab2.c aestab2.h
+ * 3. Add __cplusplus and _AESTAB2_H header guards
+ * 4. Add #include <aes_impl.h>
+ * 5. Change "uint_32t" to "uint32_t"
+ * 6. Change all variables to "static const"
+ * 7. Cstyle and hdrchk this file
+ */
+
+#include <aes/aes_impl.h>
+
+static const uint32_t t_rc[RC_LENGTH] =
+{
+ 0x00000001, 0x00000002, 0x00000004, 0x00000008,
+ 0x00000010, 0x00000020, 0x00000040, 0x00000080,
+ 0x0000001b, 0x00000036
+};
+
+static const uint32_t t_ls[4][256] =
+{
+ {
+ 0x00000063, 0x0000007c, 0x00000077, 0x0000007b,
+ 0x000000f2, 0x0000006b, 0x0000006f, 0x000000c5,
+ 0x00000030, 0x00000001, 0x00000067, 0x0000002b,
+ 0x000000fe, 0x000000d7, 0x000000ab, 0x00000076,
+ 0x000000ca, 0x00000082, 0x000000c9, 0x0000007d,
+ 0x000000fa, 0x00000059, 0x00000047, 0x000000f0,
+ 0x000000ad, 0x000000d4, 0x000000a2, 0x000000af,
+ 0x0000009c, 0x000000a4, 0x00000072, 0x000000c0,
+ 0x000000b7, 0x000000fd, 0x00000093, 0x00000026,
+ 0x00000036, 0x0000003f, 0x000000f7, 0x000000cc,
+ 0x00000034, 0x000000a5, 0x000000e5, 0x000000f1,
+ 0x00000071, 0x000000d8, 0x00000031, 0x00000015,
+ 0x00000004, 0x000000c7, 0x00000023, 0x000000c3,
+ 0x00000018, 0x00000096, 0x00000005, 0x0000009a,
+ 0x00000007, 0x00000012, 0x00000080, 0x000000e2,
+ 0x000000eb, 0x00000027, 0x000000b2, 0x00000075,
+ 0x00000009, 0x00000083, 0x0000002c, 0x0000001a,
+ 0x0000001b, 0x0000006e, 0x0000005a, 0x000000a0,
+ 0x00000052, 0x0000003b, 0x000000d6, 0x000000b3,
+ 0x00000029, 0x000000e3, 0x0000002f, 0x00000084,
+ 0x00000053, 0x000000d1, 0x00000000, 0x000000ed,
+ 0x00000020, 0x000000fc, 0x000000b1, 0x0000005b,
+ 0x0000006a, 0x000000cb, 0x000000be, 0x00000039,
+ 0x0000004a, 0x0000004c, 0x00000058, 0x000000cf,
+ 0x000000d0, 0x000000ef, 0x000000aa, 0x000000fb,
+ 0x00000043, 0x0000004d, 0x00000033, 0x00000085,
+ 0x00000045, 0x000000f9, 0x00000002, 0x0000007f,
+ 0x00000050, 0x0000003c, 0x0000009f, 0x000000a8,
+ 0x00000051, 0x000000a3, 0x00000040, 0x0000008f,
+ 0x00000092, 0x0000009d, 0x00000038, 0x000000f5,
+ 0x000000bc, 0x000000b6, 0x000000da, 0x00000021,
+ 0x00000010, 0x000000ff, 0x000000f3, 0x000000d2,
+ 0x000000cd, 0x0000000c, 0x00000013, 0x000000ec,
+ 0x0000005f, 0x00000097, 0x00000044, 0x00000017,
+ 0x000000c4, 0x000000a7, 0x0000007e, 0x0000003d,
+ 0x00000064, 0x0000005d, 0x00000019, 0x00000073,
+ 0x00000060, 0x00000081, 0x0000004f, 0x000000dc,
+ 0x00000022, 0x0000002a, 0x00000090, 0x00000088,
+ 0x00000046, 0x000000ee, 0x000000b8, 0x00000014,
+ 0x000000de, 0x0000005e, 0x0000000b, 0x000000db,
+ 0x000000e0, 0x00000032, 0x0000003a, 0x0000000a,
+ 0x00000049, 0x00000006, 0x00000024, 0x0000005c,
+ 0x000000c2, 0x000000d3, 0x000000ac, 0x00000062,
+ 0x00000091, 0x00000095, 0x000000e4, 0x00000079,
+ 0x000000e7, 0x000000c8, 0x00000037, 0x0000006d,
+ 0x0000008d, 0x000000d5, 0x0000004e, 0x000000a9,
+ 0x0000006c, 0x00000056, 0x000000f4, 0x000000ea,
+ 0x00000065, 0x0000007a, 0x000000ae, 0x00000008,
+ 0x000000ba, 0x00000078, 0x00000025, 0x0000002e,
+ 0x0000001c, 0x000000a6, 0x000000b4, 0x000000c6,
+ 0x000000e8, 0x000000dd, 0x00000074, 0x0000001f,
+ 0x0000004b, 0x000000bd, 0x0000008b, 0x0000008a,
+ 0x00000070, 0x0000003e, 0x000000b5, 0x00000066,
+ 0x00000048, 0x00000003, 0x000000f6, 0x0000000e,
+ 0x00000061, 0x00000035, 0x00000057, 0x000000b9,
+ 0x00000086, 0x000000c1, 0x0000001d, 0x0000009e,
+ 0x000000e1, 0x000000f8, 0x00000098, 0x00000011,
+ 0x00000069, 0x000000d9, 0x0000008e, 0x00000094,
+ 0x0000009b, 0x0000001e, 0x00000087, 0x000000e9,
+ 0x000000ce, 0x00000055, 0x00000028, 0x000000df,
+ 0x0000008c, 0x000000a1, 0x00000089, 0x0000000d,
+ 0x000000bf, 0x000000e6, 0x00000042, 0x00000068,
+ 0x00000041, 0x00000099, 0x0000002d, 0x0000000f,
+ 0x000000b0, 0x00000054, 0x000000bb, 0x00000016
+ },
+ {
+ 0x00006300, 0x00007c00, 0x00007700, 0x00007b00,
+ 0x0000f200, 0x00006b00, 0x00006f00, 0x0000c500,
+ 0x00003000, 0x00000100, 0x00006700, 0x00002b00,
+ 0x0000fe00, 0x0000d700, 0x0000ab00, 0x00007600,
+ 0x0000ca00, 0x00008200, 0x0000c900, 0x00007d00,
+ 0x0000fa00, 0x00005900, 0x00004700, 0x0000f000,
+ 0x0000ad00, 0x0000d400, 0x0000a200, 0x0000af00,
+ 0x00009c00, 0x0000a400, 0x00007200, 0x0000c000,
+ 0x0000b700, 0x0000fd00, 0x00009300, 0x00002600,
+ 0x00003600, 0x00003f00, 0x0000f700, 0x0000cc00,
+ 0x00003400, 0x0000a500, 0x0000e500, 0x0000f100,
+ 0x00007100, 0x0000d800, 0x00003100, 0x00001500,
+ 0x00000400, 0x0000c700, 0x00002300, 0x0000c300,
+ 0x00001800, 0x00009600, 0x00000500, 0x00009a00,
+ 0x00000700, 0x00001200, 0x00008000, 0x0000e200,
+ 0x0000eb00, 0x00002700, 0x0000b200, 0x00007500,
+ 0x00000900, 0x00008300, 0x00002c00, 0x00001a00,
+ 0x00001b00, 0x00006e00, 0x00005a00, 0x0000a000,
+ 0x00005200, 0x00003b00, 0x0000d600, 0x0000b300,
+ 0x00002900, 0x0000e300, 0x00002f00, 0x00008400,
+ 0x00005300, 0x0000d100, 0x00000000, 0x0000ed00,
+ 0x00002000, 0x0000fc00, 0x0000b100, 0x00005b00,
+ 0x00006a00, 0x0000cb00, 0x0000be00, 0x00003900,
+ 0x00004a00, 0x00004c00, 0x00005800, 0x0000cf00,
+ 0x0000d000, 0x0000ef00, 0x0000aa00, 0x0000fb00,
+ 0x00004300, 0x00004d00, 0x00003300, 0x00008500,
+ 0x00004500, 0x0000f900, 0x00000200, 0x00007f00,
+ 0x00005000, 0x00003c00, 0x00009f00, 0x0000a800,
+ 0x00005100, 0x0000a300, 0x00004000, 0x00008f00,
+ 0x00009200, 0x00009d00, 0x00003800, 0x0000f500,
+ 0x0000bc00, 0x0000b600, 0x0000da00, 0x00002100,
+ 0x00001000, 0x0000ff00, 0x0000f300, 0x0000d200,
+ 0x0000cd00, 0x00000c00, 0x00001300, 0x0000ec00,
+ 0x00005f00, 0x00009700, 0x00004400, 0x00001700,
+ 0x0000c400, 0x0000a700, 0x00007e00, 0x00003d00,
+ 0x00006400, 0x00005d00, 0x00001900, 0x00007300,
+ 0x00006000, 0x00008100, 0x00004f00, 0x0000dc00,
+ 0x00002200, 0x00002a00, 0x00009000, 0x00008800,
+ 0x00004600, 0x0000ee00, 0x0000b800, 0x00001400,
+ 0x0000de00, 0x00005e00, 0x00000b00, 0x0000db00,
+ 0x0000e000, 0x00003200, 0x00003a00, 0x00000a00,
+ 0x00004900, 0x00000600, 0x00002400, 0x00005c00,
+ 0x0000c200, 0x0000d300, 0x0000ac00, 0x00006200,
+ 0x00009100, 0x00009500, 0x0000e400, 0x00007900,
+ 0x0000e700, 0x0000c800, 0x00003700, 0x00006d00,
+ 0x00008d00, 0x0000d500, 0x00004e00, 0x0000a900,
+ 0x00006c00, 0x00005600, 0x0000f400, 0x0000ea00,
+ 0x00006500, 0x00007a00, 0x0000ae00, 0x00000800,
+ 0x0000ba00, 0x00007800, 0x00002500, 0x00002e00,
+ 0x00001c00, 0x0000a600, 0x0000b400, 0x0000c600,
+ 0x0000e800, 0x0000dd00, 0x00007400, 0x00001f00,
+ 0x00004b00, 0x0000bd00, 0x00008b00, 0x00008a00,
+ 0x00007000, 0x00003e00, 0x0000b500, 0x00006600,
+ 0x00004800, 0x00000300, 0x0000f600, 0x00000e00,
+ 0x00006100, 0x00003500, 0x00005700, 0x0000b900,
+ 0x00008600, 0x0000c100, 0x00001d00, 0x00009e00,
+ 0x0000e100, 0x0000f800, 0x00009800, 0x00001100,
+ 0x00006900, 0x0000d900, 0x00008e00, 0x00009400,
+ 0x00009b00, 0x00001e00, 0x00008700, 0x0000e900,
+ 0x0000ce00, 0x00005500, 0x00002800, 0x0000df00,
+ 0x00008c00, 0x0000a100, 0x00008900, 0x00000d00,
+ 0x0000bf00, 0x0000e600, 0x00004200, 0x00006800,
+ 0x00004100, 0x00009900, 0x00002d00, 0x00000f00,
+ 0x0000b000, 0x00005400, 0x0000bb00, 0x00001600
+ },
+ {
+ 0x00630000, 0x007c0000, 0x00770000, 0x007b0000,
+ 0x00f20000, 0x006b0000, 0x006f0000, 0x00c50000,
+ 0x00300000, 0x00010000, 0x00670000, 0x002b0000,
+ 0x00fe0000, 0x00d70000, 0x00ab0000, 0x00760000,
+ 0x00ca0000, 0x00820000, 0x00c90000, 0x007d0000,
+ 0x00fa0000, 0x00590000, 0x00470000, 0x00f00000,
+ 0x00ad0000, 0x00d40000, 0x00a20000, 0x00af0000,
+ 0x009c0000, 0x00a40000, 0x00720000, 0x00c00000,
+ 0x00b70000, 0x00fd0000, 0x00930000, 0x00260000,
+ 0x00360000, 0x003f0000, 0x00f70000, 0x00cc0000,
+ 0x00340000, 0x00a50000, 0x00e50000, 0x00f10000,
+ 0x00710000, 0x00d80000, 0x00310000, 0x00150000,
+ 0x00040000, 0x00c70000, 0x00230000, 0x00c30000,
+ 0x00180000, 0x00960000, 0x00050000, 0x009a0000,
+ 0x00070000, 0x00120000, 0x00800000, 0x00e20000,
+ 0x00eb0000, 0x00270000, 0x00b20000, 0x00750000,
+ 0x00090000, 0x00830000, 0x002c0000, 0x001a0000,
+ 0x001b0000, 0x006e0000, 0x005a0000, 0x00a00000,
+ 0x00520000, 0x003b0000, 0x00d60000, 0x00b30000,
+ 0x00290000, 0x00e30000, 0x002f0000, 0x00840000,
+ 0x00530000, 0x00d10000, 0x00000000, 0x00ed0000,
+ 0x00200000, 0x00fc0000, 0x00b10000, 0x005b0000,
+ 0x006a0000, 0x00cb0000, 0x00be0000, 0x00390000,
+ 0x004a0000, 0x004c0000, 0x00580000, 0x00cf0000,
+ 0x00d00000, 0x00ef0000, 0x00aa0000, 0x00fb0000,
+ 0x00430000, 0x004d0000, 0x00330000, 0x00850000,
+ 0x00450000, 0x00f90000, 0x00020000, 0x007f0000,
+ 0x00500000, 0x003c0000, 0x009f0000, 0x00a80000,
+ 0x00510000, 0x00a30000, 0x00400000, 0x008f0000,
+ 0x00920000, 0x009d0000, 0x00380000, 0x00f50000,
+ 0x00bc0000, 0x00b60000, 0x00da0000, 0x00210000,
+ 0x00100000, 0x00ff0000, 0x00f30000, 0x00d20000,
+ 0x00cd0000, 0x000c0000, 0x00130000, 0x00ec0000,
+ 0x005f0000, 0x00970000, 0x00440000, 0x00170000,
+ 0x00c40000, 0x00a70000, 0x007e0000, 0x003d0000,
+ 0x00640000, 0x005d0000, 0x00190000, 0x00730000,
+ 0x00600000, 0x00810000, 0x004f0000, 0x00dc0000,
+ 0x00220000, 0x002a0000, 0x00900000, 0x00880000,
+ 0x00460000, 0x00ee0000, 0x00b80000, 0x00140000,
+ 0x00de0000, 0x005e0000, 0x000b0000, 0x00db0000,
+ 0x00e00000, 0x00320000, 0x003a0000, 0x000a0000,
+ 0x00490000, 0x00060000, 0x00240000, 0x005c0000,
+ 0x00c20000, 0x00d30000, 0x00ac0000, 0x00620000,
+ 0x00910000, 0x00950000, 0x00e40000, 0x00790000,
+ 0x00e70000, 0x00c80000, 0x00370000, 0x006d0000,
+ 0x008d0000, 0x00d50000, 0x004e0000, 0x00a90000,
+ 0x006c0000, 0x00560000, 0x00f40000, 0x00ea0000,
+ 0x00650000, 0x007a0000, 0x00ae0000, 0x00080000,
+ 0x00ba0000, 0x00780000, 0x00250000, 0x002e0000,
+ 0x001c0000, 0x00a60000, 0x00b40000, 0x00c60000,
+ 0x00e80000, 0x00dd0000, 0x00740000, 0x001f0000,
+ 0x004b0000, 0x00bd0000, 0x008b0000, 0x008a0000,
+ 0x00700000, 0x003e0000, 0x00b50000, 0x00660000,
+ 0x00480000, 0x00030000, 0x00f60000, 0x000e0000,
+ 0x00610000, 0x00350000, 0x00570000, 0x00b90000,
+ 0x00860000, 0x00c10000, 0x001d0000, 0x009e0000,
+ 0x00e10000, 0x00f80000, 0x00980000, 0x00110000,
+ 0x00690000, 0x00d90000, 0x008e0000, 0x00940000,
+ 0x009b0000, 0x001e0000, 0x00870000, 0x00e90000,
+ 0x00ce0000, 0x00550000, 0x00280000, 0x00df0000,
+ 0x008c0000, 0x00a10000, 0x00890000, 0x000d0000,
+ 0x00bf0000, 0x00e60000, 0x00420000, 0x00680000,
+ 0x00410000, 0x00990000, 0x002d0000, 0x000f0000,
+ 0x00b00000, 0x00540000, 0x00bb0000, 0x00160000
+ },
+ {
+ 0x63000000, 0x7c000000, 0x77000000, 0x7b000000,
+ 0xf2000000, 0x6b000000, 0x6f000000, 0xc5000000,
+ 0x30000000, 0x01000000, 0x67000000, 0x2b000000,
+ 0xfe000000, 0xd7000000, 0xab000000, 0x76000000,
+ 0xca000000, 0x82000000, 0xc9000000, 0x7d000000,
+ 0xfa000000, 0x59000000, 0x47000000, 0xf0000000,
+ 0xad000000, 0xd4000000, 0xa2000000, 0xaf000000,
+ 0x9c000000, 0xa4000000, 0x72000000, 0xc0000000,
+ 0xb7000000, 0xfd000000, 0x93000000, 0x26000000,
+ 0x36000000, 0x3f000000, 0xf7000000, 0xcc000000,
+ 0x34000000, 0xa5000000, 0xe5000000, 0xf1000000,
+ 0x71000000, 0xd8000000, 0x31000000, 0x15000000,
+ 0x04000000, 0xc7000000, 0x23000000, 0xc3000000,
+ 0x18000000, 0x96000000, 0x05000000, 0x9a000000,
+ 0x07000000, 0x12000000, 0x80000000, 0xe2000000,
+ 0xeb000000, 0x27000000, 0xb2000000, 0x75000000,
+ 0x09000000, 0x83000000, 0x2c000000, 0x1a000000,
+ 0x1b000000, 0x6e000000, 0x5a000000, 0xa0000000,
+ 0x52000000, 0x3b000000, 0xd6000000, 0xb3000000,
+ 0x29000000, 0xe3000000, 0x2f000000, 0x84000000,
+ 0x53000000, 0xd1000000, 0x00000000, 0xed000000,
+ 0x20000000, 0xfc000000, 0xb1000000, 0x5b000000,
+ 0x6a000000, 0xcb000000, 0xbe000000, 0x39000000,
+ 0x4a000000, 0x4c000000, 0x58000000, 0xcf000000,
+ 0xd0000000, 0xef000000, 0xaa000000, 0xfb000000,
+ 0x43000000, 0x4d000000, 0x33000000, 0x85000000,
+ 0x45000000, 0xf9000000, 0x02000000, 0x7f000000,
+ 0x50000000, 0x3c000000, 0x9f000000, 0xa8000000,
+ 0x51000000, 0xa3000000, 0x40000000, 0x8f000000,
+ 0x92000000, 0x9d000000, 0x38000000, 0xf5000000,
+ 0xbc000000, 0xb6000000, 0xda000000, 0x21000000,
+ 0x10000000, 0xff000000, 0xf3000000, 0xd2000000,
+ 0xcd000000, 0x0c000000, 0x13000000, 0xec000000,
+ 0x5f000000, 0x97000000, 0x44000000, 0x17000000,
+ 0xc4000000, 0xa7000000, 0x7e000000, 0x3d000000,
+ 0x64000000, 0x5d000000, 0x19000000, 0x73000000,
+ 0x60000000, 0x81000000, 0x4f000000, 0xdc000000,
+ 0x22000000, 0x2a000000, 0x90000000, 0x88000000,
+ 0x46000000, 0xee000000, 0xb8000000, 0x14000000,
+ 0xde000000, 0x5e000000, 0x0b000000, 0xdb000000,
+ 0xe0000000, 0x32000000, 0x3a000000, 0x0a000000,
+ 0x49000000, 0x06000000, 0x24000000, 0x5c000000,
+ 0xc2000000, 0xd3000000, 0xac000000, 0x62000000,
+ 0x91000000, 0x95000000, 0xe4000000, 0x79000000,
+ 0xe7000000, 0xc8000000, 0x37000000, 0x6d000000,
+ 0x8d000000, 0xd5000000, 0x4e000000, 0xa9000000,
+ 0x6c000000, 0x56000000, 0xf4000000, 0xea000000,
+ 0x65000000, 0x7a000000, 0xae000000, 0x08000000,
+ 0xba000000, 0x78000000, 0x25000000, 0x2e000000,
+ 0x1c000000, 0xa6000000, 0xb4000000, 0xc6000000,
+ 0xe8000000, 0xdd000000, 0x74000000, 0x1f000000,
+ 0x4b000000, 0xbd000000, 0x8b000000, 0x8a000000,
+ 0x70000000, 0x3e000000, 0xb5000000, 0x66000000,
+ 0x48000000, 0x03000000, 0xf6000000, 0x0e000000,
+ 0x61000000, 0x35000000, 0x57000000, 0xb9000000,
+ 0x86000000, 0xc1000000, 0x1d000000, 0x9e000000,
+ 0xe1000000, 0xf8000000, 0x98000000, 0x11000000,
+ 0x69000000, 0xd9000000, 0x8e000000, 0x94000000,
+ 0x9b000000, 0x1e000000, 0x87000000, 0xe9000000,
+ 0xce000000, 0x55000000, 0x28000000, 0xdf000000,
+ 0x8c000000, 0xa1000000, 0x89000000, 0x0d000000,
+ 0xbf000000, 0xe6000000, 0x42000000, 0x68000000,
+ 0x41000000, 0x99000000, 0x2d000000, 0x0f000000,
+ 0xb0000000, 0x54000000, 0xbb000000, 0x16000000
+ }
+};
+
+static const uint32_t t_im[4][256] =
+{
+ {
+ 0x00000000, 0x0b0d090e, 0x161a121c, 0x1d171b12,
+ 0x2c342438, 0x27392d36, 0x3a2e3624, 0x31233f2a,
+ 0x58684870, 0x5365417e, 0x4e725a6c, 0x457f5362,
+ 0x745c6c48, 0x7f516546, 0x62467e54, 0x694b775a,
+ 0xb0d090e0, 0xbbdd99ee, 0xa6ca82fc, 0xadc78bf2,
+ 0x9ce4b4d8, 0x97e9bdd6, 0x8afea6c4, 0x81f3afca,
+ 0xe8b8d890, 0xe3b5d19e, 0xfea2ca8c, 0xf5afc382,
+ 0xc48cfca8, 0xcf81f5a6, 0xd296eeb4, 0xd99be7ba,
+ 0x7bbb3bdb, 0x70b632d5, 0x6da129c7, 0x66ac20c9,
+ 0x578f1fe3, 0x5c8216ed, 0x41950dff, 0x4a9804f1,
+ 0x23d373ab, 0x28de7aa5, 0x35c961b7, 0x3ec468b9,
+ 0x0fe75793, 0x04ea5e9d, 0x19fd458f, 0x12f04c81,
+ 0xcb6bab3b, 0xc066a235, 0xdd71b927, 0xd67cb029,
+ 0xe75f8f03, 0xec52860d, 0xf1459d1f, 0xfa489411,
+ 0x9303e34b, 0x980eea45, 0x8519f157, 0x8e14f859,
+ 0xbf37c773, 0xb43ace7d, 0xa92dd56f, 0xa220dc61,
+ 0xf66d76ad, 0xfd607fa3, 0xe07764b1, 0xeb7a6dbf,
+ 0xda595295, 0xd1545b9b, 0xcc434089, 0xc74e4987,
+ 0xae053edd, 0xa50837d3, 0xb81f2cc1, 0xb31225cf,
+ 0x82311ae5, 0x893c13eb, 0x942b08f9, 0x9f2601f7,
+ 0x46bde64d, 0x4db0ef43, 0x50a7f451, 0x5baafd5f,
+ 0x6a89c275, 0x6184cb7b, 0x7c93d069, 0x779ed967,
+ 0x1ed5ae3d, 0x15d8a733, 0x08cfbc21, 0x03c2b52f,
+ 0x32e18a05, 0x39ec830b, 0x24fb9819, 0x2ff69117,
+ 0x8dd64d76, 0x86db4478, 0x9bcc5f6a, 0x90c15664,
+ 0xa1e2694e, 0xaaef6040, 0xb7f87b52, 0xbcf5725c,
+ 0xd5be0506, 0xdeb30c08, 0xc3a4171a, 0xc8a91e14,
+ 0xf98a213e, 0xf2872830, 0xef903322, 0xe49d3a2c,
+ 0x3d06dd96, 0x360bd498, 0x2b1ccf8a, 0x2011c684,
+ 0x1132f9ae, 0x1a3ff0a0, 0x0728ebb2, 0x0c25e2bc,
+ 0x656e95e6, 0x6e639ce8, 0x737487fa, 0x78798ef4,
+ 0x495ab1de, 0x4257b8d0, 0x5f40a3c2, 0x544daacc,
+ 0xf7daec41, 0xfcd7e54f, 0xe1c0fe5d, 0xeacdf753,
+ 0xdbeec879, 0xd0e3c177, 0xcdf4da65, 0xc6f9d36b,
+ 0xafb2a431, 0xa4bfad3f, 0xb9a8b62d, 0xb2a5bf23,
+ 0x83868009, 0x888b8907, 0x959c9215, 0x9e919b1b,
+ 0x470a7ca1, 0x4c0775af, 0x51106ebd, 0x5a1d67b3,
+ 0x6b3e5899, 0x60335197, 0x7d244a85, 0x7629438b,
+ 0x1f6234d1, 0x146f3ddf, 0x097826cd, 0x02752fc3,
+ 0x335610e9, 0x385b19e7, 0x254c02f5, 0x2e410bfb,
+ 0x8c61d79a, 0x876cde94, 0x9a7bc586, 0x9176cc88,
+ 0xa055f3a2, 0xab58faac, 0xb64fe1be, 0xbd42e8b0,
+ 0xd4099fea, 0xdf0496e4, 0xc2138df6, 0xc91e84f8,
+ 0xf83dbbd2, 0xf330b2dc, 0xee27a9ce, 0xe52aa0c0,
+ 0x3cb1477a, 0x37bc4e74, 0x2aab5566, 0x21a65c68,
+ 0x10856342, 0x1b886a4c, 0x069f715e, 0x0d927850,
+ 0x64d90f0a, 0x6fd40604, 0x72c31d16, 0x79ce1418,
+ 0x48ed2b32, 0x43e0223c, 0x5ef7392e, 0x55fa3020,
+ 0x01b79aec, 0x0aba93e2, 0x17ad88f0, 0x1ca081fe,
+ 0x2d83bed4, 0x268eb7da, 0x3b99acc8, 0x3094a5c6,
+ 0x59dfd29c, 0x52d2db92, 0x4fc5c080, 0x44c8c98e,
+ 0x75ebf6a4, 0x7ee6ffaa, 0x63f1e4b8, 0x68fcedb6,
+ 0xb1670a0c, 0xba6a0302, 0xa77d1810, 0xac70111e,
+ 0x9d532e34, 0x965e273a, 0x8b493c28, 0x80443526,
+ 0xe90f427c, 0xe2024b72, 0xff155060, 0xf418596e,
+ 0xc53b6644, 0xce366f4a, 0xd3217458, 0xd82c7d56,
+ 0x7a0ca137, 0x7101a839, 0x6c16b32b, 0x671bba25,
+ 0x5638850f, 0x5d358c01, 0x40229713, 0x4b2f9e1d,
+ 0x2264e947, 0x2969e049, 0x347efb5b, 0x3f73f255,
+ 0x0e50cd7f, 0x055dc471, 0x184adf63, 0x1347d66d,
+ 0xcadc31d7, 0xc1d138d9, 0xdcc623cb, 0xd7cb2ac5,
+ 0xe6e815ef, 0xede51ce1, 0xf0f207f3, 0xfbff0efd,
+ 0x92b479a7, 0x99b970a9, 0x84ae6bbb, 0x8fa362b5,
+ 0xbe805d9f, 0xb58d5491, 0xa89a4f83, 0xa397468d
+ },
+ {
+ 0x00000000, 0x0d090e0b, 0x1a121c16, 0x171b121d,
+ 0x3424382c, 0x392d3627, 0x2e36243a, 0x233f2a31,
+ 0x68487058, 0x65417e53, 0x725a6c4e, 0x7f536245,
+ 0x5c6c4874, 0x5165467f, 0x467e5462, 0x4b775a69,
+ 0xd090e0b0, 0xdd99eebb, 0xca82fca6, 0xc78bf2ad,
+ 0xe4b4d89c, 0xe9bdd697, 0xfea6c48a, 0xf3afca81,
+ 0xb8d890e8, 0xb5d19ee3, 0xa2ca8cfe, 0xafc382f5,
+ 0x8cfca8c4, 0x81f5a6cf, 0x96eeb4d2, 0x9be7bad9,
+ 0xbb3bdb7b, 0xb632d570, 0xa129c76d, 0xac20c966,
+ 0x8f1fe357, 0x8216ed5c, 0x950dff41, 0x9804f14a,
+ 0xd373ab23, 0xde7aa528, 0xc961b735, 0xc468b93e,
+ 0xe757930f, 0xea5e9d04, 0xfd458f19, 0xf04c8112,
+ 0x6bab3bcb, 0x66a235c0, 0x71b927dd, 0x7cb029d6,
+ 0x5f8f03e7, 0x52860dec, 0x459d1ff1, 0x489411fa,
+ 0x03e34b93, 0x0eea4598, 0x19f15785, 0x14f8598e,
+ 0x37c773bf, 0x3ace7db4, 0x2dd56fa9, 0x20dc61a2,
+ 0x6d76adf6, 0x607fa3fd, 0x7764b1e0, 0x7a6dbfeb,
+ 0x595295da, 0x545b9bd1, 0x434089cc, 0x4e4987c7,
+ 0x053eddae, 0x0837d3a5, 0x1f2cc1b8, 0x1225cfb3,
+ 0x311ae582, 0x3c13eb89, 0x2b08f994, 0x2601f79f,
+ 0xbde64d46, 0xb0ef434d, 0xa7f45150, 0xaafd5f5b,
+ 0x89c2756a, 0x84cb7b61, 0x93d0697c, 0x9ed96777,
+ 0xd5ae3d1e, 0xd8a73315, 0xcfbc2108, 0xc2b52f03,
+ 0xe18a0532, 0xec830b39, 0xfb981924, 0xf691172f,
+ 0xd64d768d, 0xdb447886, 0xcc5f6a9b, 0xc1566490,
+ 0xe2694ea1, 0xef6040aa, 0xf87b52b7, 0xf5725cbc,
+ 0xbe0506d5, 0xb30c08de, 0xa4171ac3, 0xa91e14c8,
+ 0x8a213ef9, 0x872830f2, 0x903322ef, 0x9d3a2ce4,
+ 0x06dd963d, 0x0bd49836, 0x1ccf8a2b, 0x11c68420,
+ 0x32f9ae11, 0x3ff0a01a, 0x28ebb207, 0x25e2bc0c,
+ 0x6e95e665, 0x639ce86e, 0x7487fa73, 0x798ef478,
+ 0x5ab1de49, 0x57b8d042, 0x40a3c25f, 0x4daacc54,
+ 0xdaec41f7, 0xd7e54ffc, 0xc0fe5de1, 0xcdf753ea,
+ 0xeec879db, 0xe3c177d0, 0xf4da65cd, 0xf9d36bc6,
+ 0xb2a431af, 0xbfad3fa4, 0xa8b62db9, 0xa5bf23b2,
+ 0x86800983, 0x8b890788, 0x9c921595, 0x919b1b9e,
+ 0x0a7ca147, 0x0775af4c, 0x106ebd51, 0x1d67b35a,
+ 0x3e58996b, 0x33519760, 0x244a857d, 0x29438b76,
+ 0x6234d11f, 0x6f3ddf14, 0x7826cd09, 0x752fc302,
+ 0x5610e933, 0x5b19e738, 0x4c02f525, 0x410bfb2e,
+ 0x61d79a8c, 0x6cde9487, 0x7bc5869a, 0x76cc8891,
+ 0x55f3a2a0, 0x58faacab, 0x4fe1beb6, 0x42e8b0bd,
+ 0x099fead4, 0x0496e4df, 0x138df6c2, 0x1e84f8c9,
+ 0x3dbbd2f8, 0x30b2dcf3, 0x27a9ceee, 0x2aa0c0e5,
+ 0xb1477a3c, 0xbc4e7437, 0xab55662a, 0xa65c6821,
+ 0x85634210, 0x886a4c1b, 0x9f715e06, 0x9278500d,
+ 0xd90f0a64, 0xd406046f, 0xc31d1672, 0xce141879,
+ 0xed2b3248, 0xe0223c43, 0xf7392e5e, 0xfa302055,
+ 0xb79aec01, 0xba93e20a, 0xad88f017, 0xa081fe1c,
+ 0x83bed42d, 0x8eb7da26, 0x99acc83b, 0x94a5c630,
+ 0xdfd29c59, 0xd2db9252, 0xc5c0804f, 0xc8c98e44,
+ 0xebf6a475, 0xe6ffaa7e, 0xf1e4b863, 0xfcedb668,
+ 0x670a0cb1, 0x6a0302ba, 0x7d1810a7, 0x70111eac,
+ 0x532e349d, 0x5e273a96, 0x493c288b, 0x44352680,
+ 0x0f427ce9, 0x024b72e2, 0x155060ff, 0x18596ef4,
+ 0x3b6644c5, 0x366f4ace, 0x217458d3, 0x2c7d56d8,
+ 0x0ca1377a, 0x01a83971, 0x16b32b6c, 0x1bba2567,
+ 0x38850f56, 0x358c015d, 0x22971340, 0x2f9e1d4b,
+ 0x64e94722, 0x69e04929, 0x7efb5b34, 0x73f2553f,
+ 0x50cd7f0e, 0x5dc47105, 0x4adf6318, 0x47d66d13,
+ 0xdc31d7ca, 0xd138d9c1, 0xc623cbdc, 0xcb2ac5d7,
+ 0xe815efe6, 0xe51ce1ed, 0xf207f3f0, 0xff0efdfb,
+ 0xb479a792, 0xb970a999, 0xae6bbb84, 0xa362b58f,
+ 0x805d9fbe, 0x8d5491b5, 0x9a4f83a8, 0x97468da3
+ },
+ {
+ 0x00000000, 0x090e0b0d, 0x121c161a, 0x1b121d17,
+ 0x24382c34, 0x2d362739, 0x36243a2e, 0x3f2a3123,
+ 0x48705868, 0x417e5365, 0x5a6c4e72, 0x5362457f,
+ 0x6c48745c, 0x65467f51, 0x7e546246, 0x775a694b,
+ 0x90e0b0d0, 0x99eebbdd, 0x82fca6ca, 0x8bf2adc7,
+ 0xb4d89ce4, 0xbdd697e9, 0xa6c48afe, 0xafca81f3,
+ 0xd890e8b8, 0xd19ee3b5, 0xca8cfea2, 0xc382f5af,
+ 0xfca8c48c, 0xf5a6cf81, 0xeeb4d296, 0xe7bad99b,
+ 0x3bdb7bbb, 0x32d570b6, 0x29c76da1, 0x20c966ac,
+ 0x1fe3578f, 0x16ed5c82, 0x0dff4195, 0x04f14a98,
+ 0x73ab23d3, 0x7aa528de, 0x61b735c9, 0x68b93ec4,
+ 0x57930fe7, 0x5e9d04ea, 0x458f19fd, 0x4c8112f0,
+ 0xab3bcb6b, 0xa235c066, 0xb927dd71, 0xb029d67c,
+ 0x8f03e75f, 0x860dec52, 0x9d1ff145, 0x9411fa48,
+ 0xe34b9303, 0xea45980e, 0xf1578519, 0xf8598e14,
+ 0xc773bf37, 0xce7db43a, 0xd56fa92d, 0xdc61a220,
+ 0x76adf66d, 0x7fa3fd60, 0x64b1e077, 0x6dbfeb7a,
+ 0x5295da59, 0x5b9bd154, 0x4089cc43, 0x4987c74e,
+ 0x3eddae05, 0x37d3a508, 0x2cc1b81f, 0x25cfb312,
+ 0x1ae58231, 0x13eb893c, 0x08f9942b, 0x01f79f26,
+ 0xe64d46bd, 0xef434db0, 0xf45150a7, 0xfd5f5baa,
+ 0xc2756a89, 0xcb7b6184, 0xd0697c93, 0xd967779e,
+ 0xae3d1ed5, 0xa73315d8, 0xbc2108cf, 0xb52f03c2,
+ 0x8a0532e1, 0x830b39ec, 0x981924fb, 0x91172ff6,
+ 0x4d768dd6, 0x447886db, 0x5f6a9bcc, 0x566490c1,
+ 0x694ea1e2, 0x6040aaef, 0x7b52b7f8, 0x725cbcf5,
+ 0x0506d5be, 0x0c08deb3, 0x171ac3a4, 0x1e14c8a9,
+ 0x213ef98a, 0x2830f287, 0x3322ef90, 0x3a2ce49d,
+ 0xdd963d06, 0xd498360b, 0xcf8a2b1c, 0xc6842011,
+ 0xf9ae1132, 0xf0a01a3f, 0xebb20728, 0xe2bc0c25,
+ 0x95e6656e, 0x9ce86e63, 0x87fa7374, 0x8ef47879,
+ 0xb1de495a, 0xb8d04257, 0xa3c25f40, 0xaacc544d,
+ 0xec41f7da, 0xe54ffcd7, 0xfe5de1c0, 0xf753eacd,
+ 0xc879dbee, 0xc177d0e3, 0xda65cdf4, 0xd36bc6f9,
+ 0xa431afb2, 0xad3fa4bf, 0xb62db9a8, 0xbf23b2a5,
+ 0x80098386, 0x8907888b, 0x9215959c, 0x9b1b9e91,
+ 0x7ca1470a, 0x75af4c07, 0x6ebd5110, 0x67b35a1d,
+ 0x58996b3e, 0x51976033, 0x4a857d24, 0x438b7629,
+ 0x34d11f62, 0x3ddf146f, 0x26cd0978, 0x2fc30275,
+ 0x10e93356, 0x19e7385b, 0x02f5254c, 0x0bfb2e41,
+ 0xd79a8c61, 0xde94876c, 0xc5869a7b, 0xcc889176,
+ 0xf3a2a055, 0xfaacab58, 0xe1beb64f, 0xe8b0bd42,
+ 0x9fead409, 0x96e4df04, 0x8df6c213, 0x84f8c91e,
+ 0xbbd2f83d, 0xb2dcf330, 0xa9ceee27, 0xa0c0e52a,
+ 0x477a3cb1, 0x4e7437bc, 0x55662aab, 0x5c6821a6,
+ 0x63421085, 0x6a4c1b88, 0x715e069f, 0x78500d92,
+ 0x0f0a64d9, 0x06046fd4, 0x1d1672c3, 0x141879ce,
+ 0x2b3248ed, 0x223c43e0, 0x392e5ef7, 0x302055fa,
+ 0x9aec01b7, 0x93e20aba, 0x88f017ad, 0x81fe1ca0,
+ 0xbed42d83, 0xb7da268e, 0xacc83b99, 0xa5c63094,
+ 0xd29c59df, 0xdb9252d2, 0xc0804fc5, 0xc98e44c8,
+ 0xf6a475eb, 0xffaa7ee6, 0xe4b863f1, 0xedb668fc,
+ 0x0a0cb167, 0x0302ba6a, 0x1810a77d, 0x111eac70,
+ 0x2e349d53, 0x273a965e, 0x3c288b49, 0x35268044,
+ 0x427ce90f, 0x4b72e202, 0x5060ff15, 0x596ef418,
+ 0x6644c53b, 0x6f4ace36, 0x7458d321, 0x7d56d82c,
+ 0xa1377a0c, 0xa8397101, 0xb32b6c16, 0xba25671b,
+ 0x850f5638, 0x8c015d35, 0x97134022, 0x9e1d4b2f,
+ 0xe9472264, 0xe0492969, 0xfb5b347e, 0xf2553f73,
+ 0xcd7f0e50, 0xc471055d, 0xdf63184a, 0xd66d1347,
+ 0x31d7cadc, 0x38d9c1d1, 0x23cbdcc6, 0x2ac5d7cb,
+ 0x15efe6e8, 0x1ce1ede5, 0x07f3f0f2, 0x0efdfbff,
+ 0x79a792b4, 0x70a999b9, 0x6bbb84ae, 0x62b58fa3,
+ 0x5d9fbe80, 0x5491b58d, 0x4f83a89a, 0x468da397
+ },
+ {
+ 0x00000000, 0x0e0b0d09, 0x1c161a12, 0x121d171b,
+ 0x382c3424, 0x3627392d, 0x243a2e36, 0x2a31233f,
+ 0x70586848, 0x7e536541, 0x6c4e725a, 0x62457f53,
+ 0x48745c6c, 0x467f5165, 0x5462467e, 0x5a694b77,
+ 0xe0b0d090, 0xeebbdd99, 0xfca6ca82, 0xf2adc78b,
+ 0xd89ce4b4, 0xd697e9bd, 0xc48afea6, 0xca81f3af,
+ 0x90e8b8d8, 0x9ee3b5d1, 0x8cfea2ca, 0x82f5afc3,
+ 0xa8c48cfc, 0xa6cf81f5, 0xb4d296ee, 0xbad99be7,
+ 0xdb7bbb3b, 0xd570b632, 0xc76da129, 0xc966ac20,
+ 0xe3578f1f, 0xed5c8216, 0xff41950d, 0xf14a9804,
+ 0xab23d373, 0xa528de7a, 0xb735c961, 0xb93ec468,
+ 0x930fe757, 0x9d04ea5e, 0x8f19fd45, 0x8112f04c,
+ 0x3bcb6bab, 0x35c066a2, 0x27dd71b9, 0x29d67cb0,
+ 0x03e75f8f, 0x0dec5286, 0x1ff1459d, 0x11fa4894,
+ 0x4b9303e3, 0x45980eea, 0x578519f1, 0x598e14f8,
+ 0x73bf37c7, 0x7db43ace, 0x6fa92dd5, 0x61a220dc,
+ 0xadf66d76, 0xa3fd607f, 0xb1e07764, 0xbfeb7a6d,
+ 0x95da5952, 0x9bd1545b, 0x89cc4340, 0x87c74e49,
+ 0xddae053e, 0xd3a50837, 0xc1b81f2c, 0xcfb31225,
+ 0xe582311a, 0xeb893c13, 0xf9942b08, 0xf79f2601,
+ 0x4d46bde6, 0x434db0ef, 0x5150a7f4, 0x5f5baafd,
+ 0x756a89c2, 0x7b6184cb, 0x697c93d0, 0x67779ed9,
+ 0x3d1ed5ae, 0x3315d8a7, 0x2108cfbc, 0x2f03c2b5,
+ 0x0532e18a, 0x0b39ec83, 0x1924fb98, 0x172ff691,
+ 0x768dd64d, 0x7886db44, 0x6a9bcc5f, 0x6490c156,
+ 0x4ea1e269, 0x40aaef60, 0x52b7f87b, 0x5cbcf572,
+ 0x06d5be05, 0x08deb30c, 0x1ac3a417, 0x14c8a91e,
+ 0x3ef98a21, 0x30f28728, 0x22ef9033, 0x2ce49d3a,
+ 0x963d06dd, 0x98360bd4, 0x8a2b1ccf, 0x842011c6,
+ 0xae1132f9, 0xa01a3ff0, 0xb20728eb, 0xbc0c25e2,
+ 0xe6656e95, 0xe86e639c, 0xfa737487, 0xf478798e,
+ 0xde495ab1, 0xd04257b8, 0xc25f40a3, 0xcc544daa,
+ 0x41f7daec, 0x4ffcd7e5, 0x5de1c0fe, 0x53eacdf7,
+ 0x79dbeec8, 0x77d0e3c1, 0x65cdf4da, 0x6bc6f9d3,
+ 0x31afb2a4, 0x3fa4bfad, 0x2db9a8b6, 0x23b2a5bf,
+ 0x09838680, 0x07888b89, 0x15959c92, 0x1b9e919b,
+ 0xa1470a7c, 0xaf4c0775, 0xbd51106e, 0xb35a1d67,
+ 0x996b3e58, 0x97603351, 0x857d244a, 0x8b762943,
+ 0xd11f6234, 0xdf146f3d, 0xcd097826, 0xc302752f,
+ 0xe9335610, 0xe7385b19, 0xf5254c02, 0xfb2e410b,
+ 0x9a8c61d7, 0x94876cde, 0x869a7bc5, 0x889176cc,
+ 0xa2a055f3, 0xacab58fa, 0xbeb64fe1, 0xb0bd42e8,
+ 0xead4099f, 0xe4df0496, 0xf6c2138d, 0xf8c91e84,
+ 0xd2f83dbb, 0xdcf330b2, 0xceee27a9, 0xc0e52aa0,
+ 0x7a3cb147, 0x7437bc4e, 0x662aab55, 0x6821a65c,
+ 0x42108563, 0x4c1b886a, 0x5e069f71, 0x500d9278,
+ 0x0a64d90f, 0x046fd406, 0x1672c31d, 0x1879ce14,
+ 0x3248ed2b, 0x3c43e022, 0x2e5ef739, 0x2055fa30,
+ 0xec01b79a, 0xe20aba93, 0xf017ad88, 0xfe1ca081,
+ 0xd42d83be, 0xda268eb7, 0xc83b99ac, 0xc63094a5,
+ 0x9c59dfd2, 0x9252d2db, 0x804fc5c0, 0x8e44c8c9,
+ 0xa475ebf6, 0xaa7ee6ff, 0xb863f1e4, 0xb668fced,
+ 0x0cb1670a, 0x02ba6a03, 0x10a77d18, 0x1eac7011,
+ 0x349d532e, 0x3a965e27, 0x288b493c, 0x26804435,
+ 0x7ce90f42, 0x72e2024b, 0x60ff1550, 0x6ef41859,
+ 0x44c53b66, 0x4ace366f, 0x58d32174, 0x56d82c7d,
+ 0x377a0ca1, 0x397101a8, 0x2b6c16b3, 0x25671bba,
+ 0x0f563885, 0x015d358c, 0x13402297, 0x1d4b2f9e,
+ 0x472264e9, 0x492969e0, 0x5b347efb, 0x553f73f2,
+ 0x7f0e50cd, 0x71055dc4, 0x63184adf, 0x6d1347d6,
+ 0xd7cadc31, 0xd9c1d138, 0xcbdcc623, 0xc5d7cb2a,
+ 0xefe6e815, 0xe1ede51c, 0xf3f0f207, 0xfdfbff0e,
+ 0xa792b479, 0xa999b970, 0xbb84ae6b, 0xb58fa362,
+ 0x9fbe805d, 0x91b58d54, 0x83a89a4f, 0x8da39746
+ }
+};
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _AESTAB2_H */
diff --git a/sys/contrib/openzfs/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.cryptogams b/sys/contrib/openzfs/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.cryptogams
new file mode 100644
index 000000000000..0de1883dc81b
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.cryptogams
@@ -0,0 +1,36 @@
+Copyright (c) 2006-2017, CRYPTOGAMS by <appro@openssl.org>
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+ * Redistributions of source code must retain copyright notices,
+ this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above
+ copyright notice, this list of conditions and the following
+ disclaimer in the documentation and/or other materials
+ provided with the distribution.
+
+ * Neither the name of the CRYPTOGAMS nor the names of its
+ copyright holder and contributors may be used to endorse or
+ promote products derived from this software without specific
+ prior written permission.
+
+ALTERNATIVELY, provided that this notice is retained in full, this
+product may be distributed under the terms of the GNU General Public
+License (GPL), in which case the provisions of the GPL apply INSTEAD OF
+those given above.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/sys/contrib/openzfs/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.cryptogams.descrip b/sys/contrib/openzfs/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.cryptogams.descrip
new file mode 100644
index 000000000000..6184759c8b74
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.cryptogams.descrip
@@ -0,0 +1 @@
+PORTIONS OF GCM and GHASH FUNCTIONALITY
diff --git a/sys/contrib/openzfs/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.openssl b/sys/contrib/openzfs/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.openssl
new file mode 100644
index 000000000000..49cc83d2ee29
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.openssl
@@ -0,0 +1,177 @@
+
+ Apache License
+ Version 2.0, January 2004
+ https://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
diff --git a/sys/contrib/openzfs/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.openssl.descrip b/sys/contrib/openzfs/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.openssl.descrip
new file mode 100644
index 000000000000..6184759c8b74
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.openssl.descrip
@@ -0,0 +1 @@
+PORTIONS OF GCM and GHASH FUNCTIONALITY
diff --git a/sys/contrib/openzfs/module/icp/asm-x86_64/modes/aesni-gcm-x86_64.S b/sys/contrib/openzfs/module/icp/asm-x86_64/modes/aesni-gcm-x86_64.S
new file mode 100644
index 000000000000..dc71ae2c1c89
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/asm-x86_64/modes/aesni-gcm-x86_64.S
@@ -0,0 +1,1261 @@
+# Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the Apache License 2.0 (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+#
+# AES-NI-CTR+GHASH stitch.
+#
+# February 2013
+#
+# The OpenSSL GCM implementation is organized in such a way that its
+# performance is rather close to the sum of its streamed components, in
+# this context parallelized AES-NI CTR and modulo-scheduled
+# PCLMULQDQ-enabled GHASH. Unfortunately, as no stitch implementation
+# was observed to perform significantly better than the sum of the
+# components on contemporary CPUs, the effort was deemed impossible to
+# justify. This module is based on a combination of Intel submissions,
+# [1] and [2], with a MOVBE twist suggested by Ilya Albrekht and Max
+# Locktyukhin of Intel Corp., who verified that it reduces shuffle
+# pressure with a notable relative improvement, achieving 1.0 cycle per
+# byte processed with 128-bit key on Haswell processor, 0.74 - on
+# Broadwell, 0.63 - on Skylake... [Mentioned results are raw profiled
+# measurements for favourable packet size, one divisible by 96.
+# Applications using the EVP interface will observe a few percent
+# worse performance.]
+#
+# Knights Landing processes 1 byte in 1.25 cycles (measured with EVP).
+#
+# [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
+# [2] http://www.intel.com/content/dam/www/public/us/en/documents/software-support/enabling-high-performance-gcm.pdf
+
+# Generated once from
+# https://github.com/openssl/openssl/blob/5ffc3324/crypto/modes/asm/aesni-gcm-x86_64.pl
+# and modified for ICP. Modifications are kept to a bare minimum to ease later
+# upstream merges.
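The header above describes the stitched AES-NI CTR + PCLMULQDQ GHASH loop only in
performance terms. As a rough map of what one iteration computes, here is a
sequential C sketch; aes_encrypt_block() and gf128_mul() are assumed helper
primitives, not functions provided by this file, and the real code below
interleaves the vaesenc and vpclmulqdq instruction streams, uses precomputed
powers of H so each 96-byte group needs a single reduction, and handles the
32-bit big-endian counter wrap that the sketch ignores.

    #include <stdint.h>

    /*
     * Assumed primitives (not ICP APIs): one AES block encryption and one
     * GF(2^128) multiply in GCM's bit-reflected convention.
     */
    void aes_encrypt_block(const void *key_sched, const uint8_t in[16],
        uint8_t out[16]);
    void gf128_mul(const uint8_t x[16], const uint8_t h[16], uint8_t out[16]);

    static void
    xor16(uint8_t *dst, const uint8_t *a, const uint8_t *b)
    {
            int i;

            for (i = 0; i < 16; i++)
                    dst[i] = a[i] ^ b[i];
    }

    /*
     * One 96-byte group of GCM encryption, written sequentially: six CTR
     * keystream blocks, six XORs, six GHASH folds over the ciphertext.
     */
    static void
    gcm_stitch_6x_sketch(const void *key_sched, uint8_t ctr[16],
        const uint8_t h[16], uint8_t ghash[16], const uint8_t *in, uint8_t *out)
    {
            uint8_t ks[16], t[16];
            int i;

            for (i = 0; i < 6; i++) {
                    aes_encrypt_block(key_sched, ctr, ks); /* E_K(counter) */
                    ctr[15]++;                             /* toy counter bump */
                    xor16(out + 16 * i, in + 16 * i, ks);  /* CTR encryption */
                    xor16(t, ghash, out + 16 * i);         /* absorb ciphertext */
                    gf128_mul(t, h, ghash);                /* multiply by H */
            }
    }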
+
+#if defined(__x86_64__) && defined(HAVE_AVX) && \
+ defined(HAVE_AES) && defined(HAVE_PCLMULQDQ)
+
+.extern gcm_avx_can_use_movbe
+
+.text
+
+#ifdef HAVE_MOVBE
+.type _aesni_ctr32_ghash_6x,@function
+.align 32
+_aesni_ctr32_ghash_6x:
+.cfi_startproc
+ vmovdqu 32(%r11),%xmm2
+ subq $6,%rdx
+ vpxor %xmm4,%xmm4,%xmm4
+ vmovdqu 0-128(%rcx),%xmm15
+ vpaddb %xmm2,%xmm1,%xmm10
+ vpaddb %xmm2,%xmm10,%xmm11
+ vpaddb %xmm2,%xmm11,%xmm12
+ vpaddb %xmm2,%xmm12,%xmm13
+ vpaddb %xmm2,%xmm13,%xmm14
+ vpxor %xmm15,%xmm1,%xmm9
+ vmovdqu %xmm4,16+8(%rsp)
+ jmp .Loop6x
+
+.align 32
+.Loop6x:
+ addl $100663296,%ebx
+ jc .Lhandle_ctr32
+ vmovdqu 0-32(%r9),%xmm3
+ vpaddb %xmm2,%xmm14,%xmm1
+ vpxor %xmm15,%xmm10,%xmm10
+ vpxor %xmm15,%xmm11,%xmm11
+
+.Lresume_ctr32:
+ vmovdqu %xmm1,(%r8)
+ vpclmulqdq $0x10,%xmm3,%xmm7,%xmm5
+ vpxor %xmm15,%xmm12,%xmm12
+ vmovups 16-128(%rcx),%xmm2
+ vpclmulqdq $0x01,%xmm3,%xmm7,%xmm6
+ xorq %r12,%r12
+ cmpq %r14,%r15
+
+ vaesenc %xmm2,%xmm9,%xmm9
+ vmovdqu 48+8(%rsp),%xmm0
+ vpxor %xmm15,%xmm13,%xmm13
+ vpclmulqdq $0x00,%xmm3,%xmm7,%xmm1
+ vaesenc %xmm2,%xmm10,%xmm10
+ vpxor %xmm15,%xmm14,%xmm14
+ setnc %r12b
+ vpclmulqdq $0x11,%xmm3,%xmm7,%xmm7
+ vaesenc %xmm2,%xmm11,%xmm11
+ vmovdqu 16-32(%r9),%xmm3
+ negq %r12
+ vaesenc %xmm2,%xmm12,%xmm12
+ vpxor %xmm5,%xmm6,%xmm6
+ vpclmulqdq $0x00,%xmm3,%xmm0,%xmm5
+ vpxor %xmm4,%xmm8,%xmm8
+ vaesenc %xmm2,%xmm13,%xmm13
+ vpxor %xmm5,%xmm1,%xmm4
+ andq $0x60,%r12
+ vmovups 32-128(%rcx),%xmm15
+ vpclmulqdq $0x10,%xmm3,%xmm0,%xmm1
+ vaesenc %xmm2,%xmm14,%xmm14
+
+ vpclmulqdq $0x01,%xmm3,%xmm0,%xmm2
+ leaq (%r14,%r12,1),%r14
+ vaesenc %xmm15,%xmm9,%xmm9
+ vpxor 16+8(%rsp),%xmm8,%xmm8
+ vpclmulqdq $0x11,%xmm3,%xmm0,%xmm3
+ vmovdqu 64+8(%rsp),%xmm0
+ vaesenc %xmm15,%xmm10,%xmm10
+ movbeq 88(%r14),%r13
+ vaesenc %xmm15,%xmm11,%xmm11
+ movbeq 80(%r14),%r12
+ vaesenc %xmm15,%xmm12,%xmm12
+ movq %r13,32+8(%rsp)
+ vaesenc %xmm15,%xmm13,%xmm13
+ movq %r12,40+8(%rsp)
+ vmovdqu 48-32(%r9),%xmm5
+ vaesenc %xmm15,%xmm14,%xmm14
+
+ vmovups 48-128(%rcx),%xmm15
+ vpxor %xmm1,%xmm6,%xmm6
+ vpclmulqdq $0x00,%xmm5,%xmm0,%xmm1
+ vaesenc %xmm15,%xmm9,%xmm9
+ vpxor %xmm2,%xmm6,%xmm6
+ vpclmulqdq $0x10,%xmm5,%xmm0,%xmm2
+ vaesenc %xmm15,%xmm10,%xmm10
+ vpxor %xmm3,%xmm7,%xmm7
+ vpclmulqdq $0x01,%xmm5,%xmm0,%xmm3
+ vaesenc %xmm15,%xmm11,%xmm11
+ vpclmulqdq $0x11,%xmm5,%xmm0,%xmm5
+ vmovdqu 80+8(%rsp),%xmm0
+ vaesenc %xmm15,%xmm12,%xmm12
+ vaesenc %xmm15,%xmm13,%xmm13
+ vpxor %xmm1,%xmm4,%xmm4
+ vmovdqu 64-32(%r9),%xmm1
+ vaesenc %xmm15,%xmm14,%xmm14
+
+ vmovups 64-128(%rcx),%xmm15
+ vpxor %xmm2,%xmm6,%xmm6
+ vpclmulqdq $0x00,%xmm1,%xmm0,%xmm2
+ vaesenc %xmm15,%xmm9,%xmm9
+ vpxor %xmm3,%xmm6,%xmm6
+ vpclmulqdq $0x10,%xmm1,%xmm0,%xmm3
+ vaesenc %xmm15,%xmm10,%xmm10
+ movbeq 72(%r14),%r13
+ vpxor %xmm5,%xmm7,%xmm7
+ vpclmulqdq $0x01,%xmm1,%xmm0,%xmm5
+ vaesenc %xmm15,%xmm11,%xmm11
+ movbeq 64(%r14),%r12
+ vpclmulqdq $0x11,%xmm1,%xmm0,%xmm1
+ vmovdqu 96+8(%rsp),%xmm0
+ vaesenc %xmm15,%xmm12,%xmm12
+ movq %r13,48+8(%rsp)
+ vaesenc %xmm15,%xmm13,%xmm13
+ movq %r12,56+8(%rsp)
+ vpxor %xmm2,%xmm4,%xmm4
+ vmovdqu 96-32(%r9),%xmm2
+ vaesenc %xmm15,%xmm14,%xmm14
+
+ vmovups 80-128(%rcx),%xmm15
+ vpxor %xmm3,%xmm6,%xmm6
+ vpclmulqdq $0x00,%xmm2,%xmm0,%xmm3
+ vaesenc %xmm15,%xmm9,%xmm9
+ vpxor %xmm5,%xmm6,%xmm6
+ vpclmulqdq $0x10,%xmm2,%xmm0,%xmm5
+ vaesenc %xmm15,%xmm10,%xmm10
+ movbeq 56(%r14),%r13
+ vpxor %xmm1,%xmm7,%xmm7
+ vpclmulqdq $0x01,%xmm2,%xmm0,%xmm1
+ vpxor 112+8(%rsp),%xmm8,%xmm8
+ vaesenc %xmm15,%xmm11,%xmm11
+ movbeq 48(%r14),%r12
+ vpclmulqdq $0x11,%xmm2,%xmm0,%xmm2
+ vaesenc %xmm15,%xmm12,%xmm12
+ movq %r13,64+8(%rsp)
+ vaesenc %xmm15,%xmm13,%xmm13
+ movq %r12,72+8(%rsp)
+ vpxor %xmm3,%xmm4,%xmm4
+ vmovdqu 112-32(%r9),%xmm3
+ vaesenc %xmm15,%xmm14,%xmm14
+
+ vmovups 96-128(%rcx),%xmm15
+ vpxor %xmm5,%xmm6,%xmm6
+ vpclmulqdq $0x10,%xmm3,%xmm8,%xmm5
+ vaesenc %xmm15,%xmm9,%xmm9
+ vpxor %xmm1,%xmm6,%xmm6
+ vpclmulqdq $0x01,%xmm3,%xmm8,%xmm1
+ vaesenc %xmm15,%xmm10,%xmm10
+ movbeq 40(%r14),%r13
+ vpxor %xmm2,%xmm7,%xmm7
+ vpclmulqdq $0x00,%xmm3,%xmm8,%xmm2
+ vaesenc %xmm15,%xmm11,%xmm11
+ movbeq 32(%r14),%r12
+ vpclmulqdq $0x11,%xmm3,%xmm8,%xmm8
+ vaesenc %xmm15,%xmm12,%xmm12
+ movq %r13,80+8(%rsp)
+ vaesenc %xmm15,%xmm13,%xmm13
+ movq %r12,88+8(%rsp)
+ vpxor %xmm5,%xmm6,%xmm6
+ vaesenc %xmm15,%xmm14,%xmm14
+ vpxor %xmm1,%xmm6,%xmm6
+
+ vmovups 112-128(%rcx),%xmm15
+ vpslldq $8,%xmm6,%xmm5
+ vpxor %xmm2,%xmm4,%xmm4
+ vmovdqu 16(%r11),%xmm3
+
+ vaesenc %xmm15,%xmm9,%xmm9
+ vpxor %xmm8,%xmm7,%xmm7
+ vaesenc %xmm15,%xmm10,%xmm10
+ vpxor %xmm5,%xmm4,%xmm4
+ movbeq 24(%r14),%r13
+ vaesenc %xmm15,%xmm11,%xmm11
+ movbeq 16(%r14),%r12
+ vpalignr $8,%xmm4,%xmm4,%xmm0
+ vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4
+ movq %r13,96+8(%rsp)
+ vaesenc %xmm15,%xmm12,%xmm12
+ movq %r12,104+8(%rsp)
+ vaesenc %xmm15,%xmm13,%xmm13
+ vmovups 128-128(%rcx),%xmm1
+ vaesenc %xmm15,%xmm14,%xmm14
+
+ vaesenc %xmm1,%xmm9,%xmm9
+ vmovups 144-128(%rcx),%xmm15
+ vaesenc %xmm1,%xmm10,%xmm10
+ vpsrldq $8,%xmm6,%xmm6
+ vaesenc %xmm1,%xmm11,%xmm11
+ vpxor %xmm6,%xmm7,%xmm7
+ vaesenc %xmm1,%xmm12,%xmm12
+ vpxor %xmm0,%xmm4,%xmm4
+ movbeq 8(%r14),%r13
+ vaesenc %xmm1,%xmm13,%xmm13
+ movbeq 0(%r14),%r12
+ vaesenc %xmm1,%xmm14,%xmm14
+ vmovups 160-128(%rcx),%xmm1
+ cmpl $12,%ebp // ICP uses 10,12,14 not 9,11,13 for rounds.
+ jb .Lenc_tail
+
+ vaesenc %xmm15,%xmm9,%xmm9
+ vaesenc %xmm15,%xmm10,%xmm10
+ vaesenc %xmm15,%xmm11,%xmm11
+ vaesenc %xmm15,%xmm12,%xmm12
+ vaesenc %xmm15,%xmm13,%xmm13
+ vaesenc %xmm15,%xmm14,%xmm14
+
+ vaesenc %xmm1,%xmm9,%xmm9
+ vaesenc %xmm1,%xmm10,%xmm10
+ vaesenc %xmm1,%xmm11,%xmm11
+ vaesenc %xmm1,%xmm12,%xmm12
+ vaesenc %xmm1,%xmm13,%xmm13
+ vmovups 176-128(%rcx),%xmm15
+ vaesenc %xmm1,%xmm14,%xmm14
+ vmovups 192-128(%rcx),%xmm1
+ cmpl $14,%ebp // ICP does not zero key schedule.
+ jb .Lenc_tail
+
+ vaesenc %xmm15,%xmm9,%xmm9
+ vaesenc %xmm15,%xmm10,%xmm10
+ vaesenc %xmm15,%xmm11,%xmm11
+ vaesenc %xmm15,%xmm12,%xmm12
+ vaesenc %xmm15,%xmm13,%xmm13
+ vaesenc %xmm15,%xmm14,%xmm14
+
+ vaesenc %xmm1,%xmm9,%xmm9
+ vaesenc %xmm1,%xmm10,%xmm10
+ vaesenc %xmm1,%xmm11,%xmm11
+ vaesenc %xmm1,%xmm12,%xmm12
+ vaesenc %xmm1,%xmm13,%xmm13
+ vmovups 208-128(%rcx),%xmm15
+ vaesenc %xmm1,%xmm14,%xmm14
+ vmovups 224-128(%rcx),%xmm1
+ jmp .Lenc_tail
+
+.align 32
+.Lhandle_ctr32:
+ vmovdqu (%r11),%xmm0
+ vpshufb %xmm0,%xmm1,%xmm6
+ vmovdqu 48(%r11),%xmm5
+ vpaddd 64(%r11),%xmm6,%xmm10
+ vpaddd %xmm5,%xmm6,%xmm11
+ vmovdqu 0-32(%r9),%xmm3
+ vpaddd %xmm5,%xmm10,%xmm12
+ vpshufb %xmm0,%xmm10,%xmm10
+ vpaddd %xmm5,%xmm11,%xmm13
+ vpshufb %xmm0,%xmm11,%xmm11
+ vpxor %xmm15,%xmm10,%xmm10
+ vpaddd %xmm5,%xmm12,%xmm14
+ vpshufb %xmm0,%xmm12,%xmm12
+ vpxor %xmm15,%xmm11,%xmm11
+ vpaddd %xmm5,%xmm13,%xmm1
+ vpshufb %xmm0,%xmm13,%xmm13
+ vpshufb %xmm0,%xmm14,%xmm14
+ vpshufb %xmm0,%xmm1,%xmm1
+ jmp .Lresume_ctr32
+
+.align 32
+.Lenc_tail:
+ vaesenc %xmm15,%xmm9,%xmm9
+ vmovdqu %xmm7,16+8(%rsp)
+ vpalignr $8,%xmm4,%xmm4,%xmm8
+ vaesenc %xmm15,%xmm10,%xmm10
+ vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4
+ vpxor 0(%rdi),%xmm1,%xmm2
+ vaesenc %xmm15,%xmm11,%xmm11
+ vpxor 16(%rdi),%xmm1,%xmm0
+ vaesenc %xmm15,%xmm12,%xmm12
+ vpxor 32(%rdi),%xmm1,%xmm5
+ vaesenc %xmm15,%xmm13,%xmm13
+ vpxor 48(%rdi),%xmm1,%xmm6
+ vaesenc %xmm15,%xmm14,%xmm14
+ vpxor 64(%rdi),%xmm1,%xmm7
+ vpxor 80(%rdi),%xmm1,%xmm3
+ vmovdqu (%r8),%xmm1
+
+ vaesenclast %xmm2,%xmm9,%xmm9
+ vmovdqu 32(%r11),%xmm2
+ vaesenclast %xmm0,%xmm10,%xmm10
+ vpaddb %xmm2,%xmm1,%xmm0
+ movq %r13,112+8(%rsp)
+ leaq 96(%rdi),%rdi
+ vaesenclast %xmm5,%xmm11,%xmm11
+ vpaddb %xmm2,%xmm0,%xmm5
+ movq %r12,120+8(%rsp)
+ leaq 96(%rsi),%rsi
+ vmovdqu 0-128(%rcx),%xmm15
+ vaesenclast %xmm6,%xmm12,%xmm12
+ vpaddb %xmm2,%xmm5,%xmm6
+ vaesenclast %xmm7,%xmm13,%xmm13
+ vpaddb %xmm2,%xmm6,%xmm7
+ vaesenclast %xmm3,%xmm14,%xmm14
+ vpaddb %xmm2,%xmm7,%xmm3
+
+ addq $0x60,%r10
+ subq $0x6,%rdx
+ jc .L6x_done
+
+ vmovups %xmm9,-96(%rsi)
+ vpxor %xmm15,%xmm1,%xmm9
+ vmovups %xmm10,-80(%rsi)
+ vmovdqa %xmm0,%xmm10
+ vmovups %xmm11,-64(%rsi)
+ vmovdqa %xmm5,%xmm11
+ vmovups %xmm12,-48(%rsi)
+ vmovdqa %xmm6,%xmm12
+ vmovups %xmm13,-32(%rsi)
+ vmovdqa %xmm7,%xmm13
+ vmovups %xmm14,-16(%rsi)
+ vmovdqa %xmm3,%xmm14
+ vmovdqu 32+8(%rsp),%xmm7
+ jmp .Loop6x
+
+.L6x_done:
+ vpxor 16+8(%rsp),%xmm8,%xmm8
+ vpxor %xmm4,%xmm8,%xmm8
+
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size _aesni_ctr32_ghash_6x,.-_aesni_ctr32_ghash_6x
+#endif /* ifdef HAVE_MOVBE */
+
+.type _aesni_ctr32_ghash_no_movbe_6x,@function
+.align 32
+_aesni_ctr32_ghash_no_movbe_6x:
+.cfi_startproc
+ vmovdqu 32(%r11),%xmm2
+ subq $6,%rdx
+ vpxor %xmm4,%xmm4,%xmm4
+ vmovdqu 0-128(%rcx),%xmm15
+ vpaddb %xmm2,%xmm1,%xmm10
+ vpaddb %xmm2,%xmm10,%xmm11
+ vpaddb %xmm2,%xmm11,%xmm12
+ vpaddb %xmm2,%xmm12,%xmm13
+ vpaddb %xmm2,%xmm13,%xmm14
+ vpxor %xmm15,%xmm1,%xmm9
+ vmovdqu %xmm4,16+8(%rsp)
+ jmp .Loop6x_nmb
+
+.align 32
+.Loop6x_nmb:
+ addl $100663296,%ebx
+ jc .Lhandle_ctr32_nmb
+ vmovdqu 0-32(%r9),%xmm3
+ vpaddb %xmm2,%xmm14,%xmm1
+ vpxor %xmm15,%xmm10,%xmm10
+ vpxor %xmm15,%xmm11,%xmm11
+
+.Lresume_ctr32_nmb:
+ vmovdqu %xmm1,(%r8)
+ vpclmulqdq $0x10,%xmm3,%xmm7,%xmm5
+ vpxor %xmm15,%xmm12,%xmm12
+ vmovups 16-128(%rcx),%xmm2
+ vpclmulqdq $0x01,%xmm3,%xmm7,%xmm6
+ xorq %r12,%r12
+ cmpq %r14,%r15
+
+ vaesenc %xmm2,%xmm9,%xmm9
+ vmovdqu 48+8(%rsp),%xmm0
+ vpxor %xmm15,%xmm13,%xmm13
+ vpclmulqdq $0x00,%xmm3,%xmm7,%xmm1
+ vaesenc %xmm2,%xmm10,%xmm10
+ vpxor %xmm15,%xmm14,%xmm14
+ setnc %r12b
+ vpclmulqdq $0x11,%xmm3,%xmm7,%xmm7
+ vaesenc %xmm2,%xmm11,%xmm11
+ vmovdqu 16-32(%r9),%xmm3
+ negq %r12
+ vaesenc %xmm2,%xmm12,%xmm12
+ vpxor %xmm5,%xmm6,%xmm6
+ vpclmulqdq $0x00,%xmm3,%xmm0,%xmm5
+ vpxor %xmm4,%xmm8,%xmm8
+ vaesenc %xmm2,%xmm13,%xmm13
+ vpxor %xmm5,%xmm1,%xmm4
+ andq $0x60,%r12
+ vmovups 32-128(%rcx),%xmm15
+ vpclmulqdq $0x10,%xmm3,%xmm0,%xmm1
+ vaesenc %xmm2,%xmm14,%xmm14
+
+ vpclmulqdq $0x01,%xmm3,%xmm0,%xmm2
+ leaq (%r14,%r12,1),%r14
+ vaesenc %xmm15,%xmm9,%xmm9
+ vpxor 16+8(%rsp),%xmm8,%xmm8
+ vpclmulqdq $0x11,%xmm3,%xmm0,%xmm3
+ vmovdqu 64+8(%rsp),%xmm0
+ vaesenc %xmm15,%xmm10,%xmm10
+ movq 88(%r14),%r13
+ bswapq %r13
+ vaesenc %xmm15,%xmm11,%xmm11
+ movq 80(%r14),%r12
+ bswapq %r12
+ vaesenc %xmm15,%xmm12,%xmm12
+ movq %r13,32+8(%rsp)
+ vaesenc %xmm15,%xmm13,%xmm13
+ movq %r12,40+8(%rsp)
+ vmovdqu 48-32(%r9),%xmm5
+ vaesenc %xmm15,%xmm14,%xmm14
+
+ vmovups 48-128(%rcx),%xmm15
+ vpxor %xmm1,%xmm6,%xmm6
+ vpclmulqdq $0x00,%xmm5,%xmm0,%xmm1
+ vaesenc %xmm15,%xmm9,%xmm9
+ vpxor %xmm2,%xmm6,%xmm6
+ vpclmulqdq $0x10,%xmm5,%xmm0,%xmm2
+ vaesenc %xmm15,%xmm10,%xmm10
+ vpxor %xmm3,%xmm7,%xmm7
+ vpclmulqdq $0x01,%xmm5,%xmm0,%xmm3
+ vaesenc %xmm15,%xmm11,%xmm11
+ vpclmulqdq $0x11,%xmm5,%xmm0,%xmm5
+ vmovdqu 80+8(%rsp),%xmm0
+ vaesenc %xmm15,%xmm12,%xmm12
+ vaesenc %xmm15,%xmm13,%xmm13
+ vpxor %xmm1,%xmm4,%xmm4
+ vmovdqu 64-32(%r9),%xmm1
+ vaesenc %xmm15,%xmm14,%xmm14
+
+ vmovups 64-128(%rcx),%xmm15
+ vpxor %xmm2,%xmm6,%xmm6
+ vpclmulqdq $0x00,%xmm1,%xmm0,%xmm2
+ vaesenc %xmm15,%xmm9,%xmm9
+ vpxor %xmm3,%xmm6,%xmm6
+ vpclmulqdq $0x10,%xmm1,%xmm0,%xmm3
+ vaesenc %xmm15,%xmm10,%xmm10
+ movq 72(%r14),%r13
+ bswapq %r13
+ vpxor %xmm5,%xmm7,%xmm7
+ vpclmulqdq $0x01,%xmm1,%xmm0,%xmm5
+ vaesenc %xmm15,%xmm11,%xmm11
+ movq 64(%r14),%r12
+ bswapq %r12
+ vpclmulqdq $0x11,%xmm1,%xmm0,%xmm1
+ vmovdqu 96+8(%rsp),%xmm0
+ vaesenc %xmm15,%xmm12,%xmm12
+ movq %r13,48+8(%rsp)
+ vaesenc %xmm15,%xmm13,%xmm13
+ movq %r12,56+8(%rsp)
+ vpxor %xmm2,%xmm4,%xmm4
+ vmovdqu 96-32(%r9),%xmm2
+ vaesenc %xmm15,%xmm14,%xmm14
+
+ vmovups 80-128(%rcx),%xmm15
+ vpxor %xmm3,%xmm6,%xmm6
+ vpclmulqdq $0x00,%xmm2,%xmm0,%xmm3
+ vaesenc %xmm15,%xmm9,%xmm9
+ vpxor %xmm5,%xmm6,%xmm6
+ vpclmulqdq $0x10,%xmm2,%xmm0,%xmm5
+ vaesenc %xmm15,%xmm10,%xmm10
+ movq 56(%r14),%r13
+ bswapq %r13
+ vpxor %xmm1,%xmm7,%xmm7
+ vpclmulqdq $0x01,%xmm2,%xmm0,%xmm1
+ vpxor 112+8(%rsp),%xmm8,%xmm8
+ vaesenc %xmm15,%xmm11,%xmm11
+ movq 48(%r14),%r12
+ bswapq %r12
+ vpclmulqdq $0x11,%xmm2,%xmm0,%xmm2
+ vaesenc %xmm15,%xmm12,%xmm12
+ movq %r13,64+8(%rsp)
+ vaesenc %xmm15,%xmm13,%xmm13
+ movq %r12,72+8(%rsp)
+ vpxor %xmm3,%xmm4,%xmm4
+ vmovdqu 112-32(%r9),%xmm3
+ vaesenc %xmm15,%xmm14,%xmm14
+
+ vmovups 96-128(%rcx),%xmm15
+ vpxor %xmm5,%xmm6,%xmm6
+ vpclmulqdq $0x10,%xmm3,%xmm8,%xmm5
+ vaesenc %xmm15,%xmm9,%xmm9
+ vpxor %xmm1,%xmm6,%xmm6
+ vpclmulqdq $0x01,%xmm3,%xmm8,%xmm1
+ vaesenc %xmm15,%xmm10,%xmm10
+ movq 40(%r14),%r13
+ bswapq %r13
+ vpxor %xmm2,%xmm7,%xmm7
+ vpclmulqdq $0x00,%xmm3,%xmm8,%xmm2
+ vaesenc %xmm15,%xmm11,%xmm11
+ movq 32(%r14),%r12
+ bswapq %r12
+ vpclmulqdq $0x11,%xmm3,%xmm8,%xmm8
+ vaesenc %xmm15,%xmm12,%xmm12
+ movq %r13,80+8(%rsp)
+ vaesenc %xmm15,%xmm13,%xmm13
+ movq %r12,88+8(%rsp)
+ vpxor %xmm5,%xmm6,%xmm6
+ vaesenc %xmm15,%xmm14,%xmm14
+ vpxor %xmm1,%xmm6,%xmm6
+
+ vmovups 112-128(%rcx),%xmm15
+ vpslldq $8,%xmm6,%xmm5
+ vpxor %xmm2,%xmm4,%xmm4
+ vmovdqu 16(%r11),%xmm3
+
+ vaesenc %xmm15,%xmm9,%xmm9
+ vpxor %xmm8,%xmm7,%xmm7
+ vaesenc %xmm15,%xmm10,%xmm10
+ vpxor %xmm5,%xmm4,%xmm4
+ movq 24(%r14),%r13
+ bswapq %r13
+ vaesenc %xmm15,%xmm11,%xmm11
+ movq 16(%r14),%r12
+ bswapq %r12
+ vpalignr $8,%xmm4,%xmm4,%xmm0
+ vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4
+ movq %r13,96+8(%rsp)
+ vaesenc %xmm15,%xmm12,%xmm12
+ movq %r12,104+8(%rsp)
+ vaesenc %xmm15,%xmm13,%xmm13
+ vmovups 128-128(%rcx),%xmm1
+ vaesenc %xmm15,%xmm14,%xmm14
+
+ vaesenc %xmm1,%xmm9,%xmm9
+ vmovups 144-128(%rcx),%xmm15
+ vaesenc %xmm1,%xmm10,%xmm10
+ vpsrldq $8,%xmm6,%xmm6
+ vaesenc %xmm1,%xmm11,%xmm11
+ vpxor %xmm6,%xmm7,%xmm7
+ vaesenc %xmm1,%xmm12,%xmm12
+ vpxor %xmm0,%xmm4,%xmm4
+ movq 8(%r14),%r13
+ bswapq %r13
+ vaesenc %xmm1,%xmm13,%xmm13
+ movq 0(%r14),%r12
+ bswapq %r12
+ vaesenc %xmm1,%xmm14,%xmm14
+ vmovups 160-128(%rcx),%xmm1
+ cmpl $12,%ebp // ICP uses 10,12,14 not 9,11,13 for rounds.
+ jb .Lenc_tail_nmb
+
+ vaesenc %xmm15,%xmm9,%xmm9
+ vaesenc %xmm15,%xmm10,%xmm10
+ vaesenc %xmm15,%xmm11,%xmm11
+ vaesenc %xmm15,%xmm12,%xmm12
+ vaesenc %xmm15,%xmm13,%xmm13
+ vaesenc %xmm15,%xmm14,%xmm14
+
+ vaesenc %xmm1,%xmm9,%xmm9
+ vaesenc %xmm1,%xmm10,%xmm10
+ vaesenc %xmm1,%xmm11,%xmm11
+ vaesenc %xmm1,%xmm12,%xmm12
+ vaesenc %xmm1,%xmm13,%xmm13
+ vmovups 176-128(%rcx),%xmm15
+ vaesenc %xmm1,%xmm14,%xmm14
+ vmovups 192-128(%rcx),%xmm1
+ cmpl $14,%ebp // ICP does not zero key schedule.
+ jb .Lenc_tail_nmb
+
+ vaesenc %xmm15,%xmm9,%xmm9
+ vaesenc %xmm15,%xmm10,%xmm10
+ vaesenc %xmm15,%xmm11,%xmm11
+ vaesenc %xmm15,%xmm12,%xmm12
+ vaesenc %xmm15,%xmm13,%xmm13
+ vaesenc %xmm15,%xmm14,%xmm14
+
+ vaesenc %xmm1,%xmm9,%xmm9
+ vaesenc %xmm1,%xmm10,%xmm10
+ vaesenc %xmm1,%xmm11,%xmm11
+ vaesenc %xmm1,%xmm12,%xmm12
+ vaesenc %xmm1,%xmm13,%xmm13
+ vmovups 208-128(%rcx),%xmm15
+ vaesenc %xmm1,%xmm14,%xmm14
+ vmovups 224-128(%rcx),%xmm1
+ jmp .Lenc_tail_nmb
+
+.align 32
+.Lhandle_ctr32_nmb:
+ vmovdqu (%r11),%xmm0
+ vpshufb %xmm0,%xmm1,%xmm6
+ vmovdqu 48(%r11),%xmm5
+ vpaddd 64(%r11),%xmm6,%xmm10
+ vpaddd %xmm5,%xmm6,%xmm11
+ vmovdqu 0-32(%r9),%xmm3
+ vpaddd %xmm5,%xmm10,%xmm12
+ vpshufb %xmm0,%xmm10,%xmm10
+ vpaddd %xmm5,%xmm11,%xmm13
+ vpshufb %xmm0,%xmm11,%xmm11
+ vpxor %xmm15,%xmm10,%xmm10
+ vpaddd %xmm5,%xmm12,%xmm14
+ vpshufb %xmm0,%xmm12,%xmm12
+ vpxor %xmm15,%xmm11,%xmm11
+ vpaddd %xmm5,%xmm13,%xmm1
+ vpshufb %xmm0,%xmm13,%xmm13
+ vpshufb %xmm0,%xmm14,%xmm14
+ vpshufb %xmm0,%xmm1,%xmm1
+ jmp .Lresume_ctr32_nmb
+
+.align 32
+.Lenc_tail_nmb:
+ vaesenc %xmm15,%xmm9,%xmm9
+ vmovdqu %xmm7,16+8(%rsp)
+ vpalignr $8,%xmm4,%xmm4,%xmm8
+ vaesenc %xmm15,%xmm10,%xmm10
+ vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4
+ vpxor 0(%rdi),%xmm1,%xmm2
+ vaesenc %xmm15,%xmm11,%xmm11
+ vpxor 16(%rdi),%xmm1,%xmm0
+ vaesenc %xmm15,%xmm12,%xmm12
+ vpxor 32(%rdi),%xmm1,%xmm5
+ vaesenc %xmm15,%xmm13,%xmm13
+ vpxor 48(%rdi),%xmm1,%xmm6
+ vaesenc %xmm15,%xmm14,%xmm14
+ vpxor 64(%rdi),%xmm1,%xmm7
+ vpxor 80(%rdi),%xmm1,%xmm3
+ vmovdqu (%r8),%xmm1
+
+ vaesenclast %xmm2,%xmm9,%xmm9
+ vmovdqu 32(%r11),%xmm2
+ vaesenclast %xmm0,%xmm10,%xmm10
+ vpaddb %xmm2,%xmm1,%xmm0
+ movq %r13,112+8(%rsp)
+ leaq 96(%rdi),%rdi
+ vaesenclast %xmm5,%xmm11,%xmm11
+ vpaddb %xmm2,%xmm0,%xmm5
+ movq %r12,120+8(%rsp)
+ leaq 96(%rsi),%rsi
+ vmovdqu 0-128(%rcx),%xmm15
+ vaesenclast %xmm6,%xmm12,%xmm12
+ vpaddb %xmm2,%xmm5,%xmm6
+ vaesenclast %xmm7,%xmm13,%xmm13
+ vpaddb %xmm2,%xmm6,%xmm7
+ vaesenclast %xmm3,%xmm14,%xmm14
+ vpaddb %xmm2,%xmm7,%xmm3
+
+ addq $0x60,%r10
+ subq $0x6,%rdx
+ jc .L6x_done_nmb
+
+ vmovups %xmm9,-96(%rsi)
+ vpxor %xmm15,%xmm1,%xmm9
+ vmovups %xmm10,-80(%rsi)
+ vmovdqa %xmm0,%xmm10
+ vmovups %xmm11,-64(%rsi)
+ vmovdqa %xmm5,%xmm11
+ vmovups %xmm12,-48(%rsi)
+ vmovdqa %xmm6,%xmm12
+ vmovups %xmm13,-32(%rsi)
+ vmovdqa %xmm7,%xmm13
+ vmovups %xmm14,-16(%rsi)
+ vmovdqa %xmm3,%xmm14
+ vmovdqu 32+8(%rsp),%xmm7
+ jmp .Loop6x_nmb
+
+.L6x_done_nmb:
+ vpxor 16+8(%rsp),%xmm8,%xmm8
+ vpxor %xmm4,%xmm8,%xmm8
+
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size _aesni_ctr32_ghash_no_movbe_6x,.-_aesni_ctr32_ghash_no_movbe_6x
+
+.globl aesni_gcm_decrypt
+.type aesni_gcm_decrypt,@function
+.align 32
+aesni_gcm_decrypt:
+.cfi_startproc
+ xorq %r10,%r10
+ cmpq $0x60,%rdx
+ jb .Lgcm_dec_abort
+
+ leaq (%rsp),%rax
+.cfi_def_cfa_register %rax
+ pushq %rbx
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_offset %r15,-56
+ pushq %r9
+.cfi_offset %r9,-64
+ vzeroupper
+
+ vmovdqu (%r8),%xmm1
+ addq $-128,%rsp
+ movl 12(%r8),%ebx
+ leaq .Lbswap_mask(%rip),%r11
+ leaq -128(%rcx),%r14
+ movq $0xf80,%r15
+ vmovdqu (%r9),%xmm8
+ andq $-128,%rsp
+ vmovdqu (%r11),%xmm0
+ leaq 128(%rcx),%rcx
+ movq 32(%r9),%r9
+ leaq 32(%r9),%r9
+ movl 504-128(%rcx),%ebp // ICP has a larger offset for rounds.
+ vpshufb %xmm0,%xmm8,%xmm8
+
+ andq %r15,%r14
+ andq %rsp,%r15
+ subq %r14,%r15
+ jc .Ldec_no_key_aliasing
+ cmpq $768,%r15
+ jnc .Ldec_no_key_aliasing
+ subq %r15,%rsp
+.Ldec_no_key_aliasing:
+
+ vmovdqu 80(%rdi),%xmm7
+ leaq (%rdi),%r14
+ vmovdqu 64(%rdi),%xmm4
+ leaq -192(%rdi,%rdx,1),%r15
+ vmovdqu 48(%rdi),%xmm5
+ shrq $4,%rdx
+ xorq %r10,%r10
+ vmovdqu 32(%rdi),%xmm6
+ vpshufb %xmm0,%xmm7,%xmm7
+ vmovdqu 16(%rdi),%xmm2
+ vpshufb %xmm0,%xmm4,%xmm4
+ vmovdqu (%rdi),%xmm3
+ vpshufb %xmm0,%xmm5,%xmm5
+ vmovdqu %xmm4,48(%rsp)
+ vpshufb %xmm0,%xmm6,%xmm6
+ vmovdqu %xmm5,64(%rsp)
+ vpshufb %xmm0,%xmm2,%xmm2
+ vmovdqu %xmm6,80(%rsp)
+ vpshufb %xmm0,%xmm3,%xmm3
+ vmovdqu %xmm2,96(%rsp)
+ vmovdqu %xmm3,112(%rsp)
+
+#ifdef HAVE_MOVBE
+#ifdef _KERNEL
+ testl $1,gcm_avx_can_use_movbe(%rip)
+#else
+ testl $1,gcm_avx_can_use_movbe@GOTPCREL(%rip)
+#endif
+ jz 1f
+ call _aesni_ctr32_ghash_6x
+ jmp 2f
+1:
+#endif
+ call _aesni_ctr32_ghash_no_movbe_6x
+2:
+ vmovups %xmm9,-96(%rsi)
+ vmovups %xmm10,-80(%rsi)
+ vmovups %xmm11,-64(%rsi)
+ vmovups %xmm12,-48(%rsi)
+ vmovups %xmm13,-32(%rsi)
+ vmovups %xmm14,-16(%rsi)
+
+ vpshufb (%r11),%xmm8,%xmm8
+ movq -56(%rax),%r9
+.cfi_restore %r9
+ vmovdqu %xmm8,(%r9)
+
+ vzeroupper
+ movq -48(%rax),%r15
+.cfi_restore %r15
+ movq -40(%rax),%r14
+.cfi_restore %r14
+ movq -32(%rax),%r13
+.cfi_restore %r13
+ movq -24(%rax),%r12
+.cfi_restore %r12
+ movq -16(%rax),%rbp
+.cfi_restore %rbp
+ movq -8(%rax),%rbx
+.cfi_restore %rbx
+ leaq (%rax),%rsp
+.cfi_def_cfa_register %rsp
+.Lgcm_dec_abort:
+ movq %r10,%rax
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size aesni_gcm_decrypt,.-aesni_gcm_decrypt
+.type _aesni_ctr32_6x,@function
+.align 32
+_aesni_ctr32_6x:
+.cfi_startproc
+ vmovdqu 0-128(%rcx),%xmm4
+ vmovdqu 32(%r11),%xmm2
+ leaq -2(%rbp),%r13 // ICP uses 10,12,14 not 9,11,13 for rounds.
+ vmovups 16-128(%rcx),%xmm15
+ leaq 32-128(%rcx),%r12
+ vpxor %xmm4,%xmm1,%xmm9
+ addl $100663296,%ebx
+ jc .Lhandle_ctr32_2
+ vpaddb %xmm2,%xmm1,%xmm10
+ vpaddb %xmm2,%xmm10,%xmm11
+ vpxor %xmm4,%xmm10,%xmm10
+ vpaddb %xmm2,%xmm11,%xmm12
+ vpxor %xmm4,%xmm11,%xmm11
+ vpaddb %xmm2,%xmm12,%xmm13
+ vpxor %xmm4,%xmm12,%xmm12
+ vpaddb %xmm2,%xmm13,%xmm14
+ vpxor %xmm4,%xmm13,%xmm13
+ vpaddb %xmm2,%xmm14,%xmm1
+ vpxor %xmm4,%xmm14,%xmm14
+ jmp .Loop_ctr32
+
+.align 16
+.Loop_ctr32:
+ vaesenc %xmm15,%xmm9,%xmm9
+ vaesenc %xmm15,%xmm10,%xmm10
+ vaesenc %xmm15,%xmm11,%xmm11
+ vaesenc %xmm15,%xmm12,%xmm12
+ vaesenc %xmm15,%xmm13,%xmm13
+ vaesenc %xmm15,%xmm14,%xmm14
+ vmovups (%r12),%xmm15
+ leaq 16(%r12),%r12
+ decl %r13d
+ jnz .Loop_ctr32
+
+ vmovdqu (%r12),%xmm3
+ vaesenc %xmm15,%xmm9,%xmm9
+ vpxor 0(%rdi),%xmm3,%xmm4
+ vaesenc %xmm15,%xmm10,%xmm10
+ vpxor 16(%rdi),%xmm3,%xmm5
+ vaesenc %xmm15,%xmm11,%xmm11
+ vpxor 32(%rdi),%xmm3,%xmm6
+ vaesenc %xmm15,%xmm12,%xmm12
+ vpxor 48(%rdi),%xmm3,%xmm8
+ vaesenc %xmm15,%xmm13,%xmm13
+ vpxor 64(%rdi),%xmm3,%xmm2
+ vaesenc %xmm15,%xmm14,%xmm14
+ vpxor 80(%rdi),%xmm3,%xmm3
+ leaq 96(%rdi),%rdi
+
+ vaesenclast %xmm4,%xmm9,%xmm9
+ vaesenclast %xmm5,%xmm10,%xmm10
+ vaesenclast %xmm6,%xmm11,%xmm11
+ vaesenclast %xmm8,%xmm12,%xmm12
+ vaesenclast %xmm2,%xmm13,%xmm13
+ vaesenclast %xmm3,%xmm14,%xmm14
+ vmovups %xmm9,0(%rsi)
+ vmovups %xmm10,16(%rsi)
+ vmovups %xmm11,32(%rsi)
+ vmovups %xmm12,48(%rsi)
+ vmovups %xmm13,64(%rsi)
+ vmovups %xmm14,80(%rsi)
+ leaq 96(%rsi),%rsi
+
+ .byte 0xf3,0xc3
+.align 32
+.Lhandle_ctr32_2:
+ vpshufb %xmm0,%xmm1,%xmm6
+ vmovdqu 48(%r11),%xmm5
+ vpaddd 64(%r11),%xmm6,%xmm10
+ vpaddd %xmm5,%xmm6,%xmm11
+ vpaddd %xmm5,%xmm10,%xmm12
+ vpshufb %xmm0,%xmm10,%xmm10
+ vpaddd %xmm5,%xmm11,%xmm13
+ vpshufb %xmm0,%xmm11,%xmm11
+ vpxor %xmm4,%xmm10,%xmm10
+ vpaddd %xmm5,%xmm12,%xmm14
+ vpshufb %xmm0,%xmm12,%xmm12
+ vpxor %xmm4,%xmm11,%xmm11
+ vpaddd %xmm5,%xmm13,%xmm1
+ vpshufb %xmm0,%xmm13,%xmm13
+ vpxor %xmm4,%xmm12,%xmm12
+ vpshufb %xmm0,%xmm14,%xmm14
+ vpxor %xmm4,%xmm13,%xmm13
+ vpshufb %xmm0,%xmm1,%xmm1
+ vpxor %xmm4,%xmm14,%xmm14
+ jmp .Loop_ctr32
+.cfi_endproc
+.size _aesni_ctr32_6x,.-_aesni_ctr32_6x
+
+.globl aesni_gcm_encrypt
+.type aesni_gcm_encrypt,@function
+.align 32
+aesni_gcm_encrypt:
+.cfi_startproc
+ xorq %r10,%r10
+ cmpq $288,%rdx
+ jb .Lgcm_enc_abort
+
+ leaq (%rsp),%rax
+.cfi_def_cfa_register %rax
+ pushq %rbx
+.cfi_offset %rbx,-16
+ pushq %rbp
+.cfi_offset %rbp,-24
+ pushq %r12
+.cfi_offset %r12,-32
+ pushq %r13
+.cfi_offset %r13,-40
+ pushq %r14
+.cfi_offset %r14,-48
+ pushq %r15
+.cfi_offset %r15,-56
+ pushq %r9
+.cfi_offset %r9,-64
+ vzeroupper
+
+ vmovdqu (%r8),%xmm1
+ addq $-128,%rsp
+ movl 12(%r8),%ebx
+ leaq .Lbswap_mask(%rip),%r11
+ leaq -128(%rcx),%r14
+ movq $0xf80,%r15
+ leaq 128(%rcx),%rcx
+ vmovdqu (%r11),%xmm0
+ andq $-128,%rsp
+ movl 504-128(%rcx),%ebp // ICP has a larger offset for rounds.
+
+ andq %r15,%r14
+ andq %rsp,%r15
+ subq %r14,%r15
+ jc .Lenc_no_key_aliasing
+ cmpq $768,%r15
+ jnc .Lenc_no_key_aliasing
+ subq %r15,%rsp
+.Lenc_no_key_aliasing:
+
+ leaq (%rsi),%r14
+ leaq -192(%rsi,%rdx,1),%r15
+ shrq $4,%rdx
+
+ call _aesni_ctr32_6x
+ vpshufb %xmm0,%xmm9,%xmm8
+ vpshufb %xmm0,%xmm10,%xmm2
+ vmovdqu %xmm8,112(%rsp)
+ vpshufb %xmm0,%xmm11,%xmm4
+ vmovdqu %xmm2,96(%rsp)
+ vpshufb %xmm0,%xmm12,%xmm5
+ vmovdqu %xmm4,80(%rsp)
+ vpshufb %xmm0,%xmm13,%xmm6
+ vmovdqu %xmm5,64(%rsp)
+ vpshufb %xmm0,%xmm14,%xmm7
+ vmovdqu %xmm6,48(%rsp)
+
+ call _aesni_ctr32_6x
+
+ vmovdqu (%r9),%xmm8
+ movq 32(%r9),%r9
+ leaq 32(%r9),%r9
+ subq $12,%rdx
+ movq $192,%r10
+ vpshufb %xmm0,%xmm8,%xmm8
+
+#ifdef HAVE_MOVBE
+#ifdef _KERNEL
+ testl $1,gcm_avx_can_use_movbe(%rip)
+#else
+ testl $1,gcm_avx_can_use_movbe@GOTPCREL(%rip)
+#endif
+ jz 1f
+ call _aesni_ctr32_ghash_6x
+ jmp 2f
+1:
+#endif
+ call _aesni_ctr32_ghash_no_movbe_6x
+2:
+ vmovdqu 32(%rsp),%xmm7
+ vmovdqu (%r11),%xmm0
+ vmovdqu 0-32(%r9),%xmm3
+ vpunpckhqdq %xmm7,%xmm7,%xmm1
+ vmovdqu 32-32(%r9),%xmm15
+ vmovups %xmm9,-96(%rsi)
+ vpshufb %xmm0,%xmm9,%xmm9
+ vpxor %xmm7,%xmm1,%xmm1
+ vmovups %xmm10,-80(%rsi)
+ vpshufb %xmm0,%xmm10,%xmm10
+ vmovups %xmm11,-64(%rsi)
+ vpshufb %xmm0,%xmm11,%xmm11
+ vmovups %xmm12,-48(%rsi)
+ vpshufb %xmm0,%xmm12,%xmm12
+ vmovups %xmm13,-32(%rsi)
+ vpshufb %xmm0,%xmm13,%xmm13
+ vmovups %xmm14,-16(%rsi)
+ vpshufb %xmm0,%xmm14,%xmm14
+ vmovdqu %xmm9,16(%rsp)
+ vmovdqu 48(%rsp),%xmm6
+ vmovdqu 16-32(%r9),%xmm0
+ vpunpckhqdq %xmm6,%xmm6,%xmm2
+ vpclmulqdq $0x00,%xmm3,%xmm7,%xmm5
+ vpxor %xmm6,%xmm2,%xmm2
+ vpclmulqdq $0x11,%xmm3,%xmm7,%xmm7
+ vpclmulqdq $0x00,%xmm15,%xmm1,%xmm1
+
+ vmovdqu 64(%rsp),%xmm9
+ vpclmulqdq $0x00,%xmm0,%xmm6,%xmm4
+ vmovdqu 48-32(%r9),%xmm3
+ vpxor %xmm5,%xmm4,%xmm4
+ vpunpckhqdq %xmm9,%xmm9,%xmm5
+ vpclmulqdq $0x11,%xmm0,%xmm6,%xmm6
+ vpxor %xmm9,%xmm5,%xmm5
+ vpxor %xmm7,%xmm6,%xmm6
+ vpclmulqdq $0x10,%xmm15,%xmm2,%xmm2
+ vmovdqu 80-32(%r9),%xmm15
+ vpxor %xmm1,%xmm2,%xmm2
+
+ vmovdqu 80(%rsp),%xmm1
+ vpclmulqdq $0x00,%xmm3,%xmm9,%xmm7
+ vmovdqu 64-32(%r9),%xmm0
+ vpxor %xmm4,%xmm7,%xmm7
+ vpunpckhqdq %xmm1,%xmm1,%xmm4
+ vpclmulqdq $0x11,%xmm3,%xmm9,%xmm9
+ vpxor %xmm1,%xmm4,%xmm4
+ vpxor %xmm6,%xmm9,%xmm9
+ vpclmulqdq $0x00,%xmm15,%xmm5,%xmm5
+ vpxor %xmm2,%xmm5,%xmm5
+
+ vmovdqu 96(%rsp),%xmm2
+ vpclmulqdq $0x00,%xmm0,%xmm1,%xmm6
+ vmovdqu 96-32(%r9),%xmm3
+ vpxor %xmm7,%xmm6,%xmm6
+ vpunpckhqdq %xmm2,%xmm2,%xmm7
+ vpclmulqdq $0x11,%xmm0,%xmm1,%xmm1
+ vpxor %xmm2,%xmm7,%xmm7
+ vpxor %xmm9,%xmm1,%xmm1
+ vpclmulqdq $0x10,%xmm15,%xmm4,%xmm4
+ vmovdqu 128-32(%r9),%xmm15
+ vpxor %xmm5,%xmm4,%xmm4
+
+ vpxor 112(%rsp),%xmm8,%xmm8
+ vpclmulqdq $0x00,%xmm3,%xmm2,%xmm5
+ vmovdqu 112-32(%r9),%xmm0
+ vpunpckhqdq %xmm8,%xmm8,%xmm9
+ vpxor %xmm6,%xmm5,%xmm5
+ vpclmulqdq $0x11,%xmm3,%xmm2,%xmm2
+ vpxor %xmm8,%xmm9,%xmm9
+ vpxor %xmm1,%xmm2,%xmm2
+ vpclmulqdq $0x00,%xmm15,%xmm7,%xmm7
+ vpxor %xmm4,%xmm7,%xmm4
+
+ vpclmulqdq $0x00,%xmm0,%xmm8,%xmm6
+ vmovdqu 0-32(%r9),%xmm3
+ vpunpckhqdq %xmm14,%xmm14,%xmm1
+ vpclmulqdq $0x11,%xmm0,%xmm8,%xmm8
+ vpxor %xmm14,%xmm1,%xmm1
+ vpxor %xmm5,%xmm6,%xmm5
+ vpclmulqdq $0x10,%xmm15,%xmm9,%xmm9
+ vmovdqu 32-32(%r9),%xmm15
+ vpxor %xmm2,%xmm8,%xmm7
+ vpxor %xmm4,%xmm9,%xmm6
+
+ vmovdqu 16-32(%r9),%xmm0
+ vpxor %xmm5,%xmm7,%xmm9
+ vpclmulqdq $0x00,%xmm3,%xmm14,%xmm4
+ vpxor %xmm9,%xmm6,%xmm6
+ vpunpckhqdq %xmm13,%xmm13,%xmm2
+ vpclmulqdq $0x11,%xmm3,%xmm14,%xmm14
+ vpxor %xmm13,%xmm2,%xmm2
+ vpslldq $8,%xmm6,%xmm9
+ vpclmulqdq $0x00,%xmm15,%xmm1,%xmm1
+ vpxor %xmm9,%xmm5,%xmm8
+ vpsrldq $8,%xmm6,%xmm6
+ vpxor %xmm6,%xmm7,%xmm7
+
+ vpclmulqdq $0x00,%xmm0,%xmm13,%xmm5
+ vmovdqu 48-32(%r9),%xmm3
+ vpxor %xmm4,%xmm5,%xmm5
+ vpunpckhqdq %xmm12,%xmm12,%xmm9
+ vpclmulqdq $0x11,%xmm0,%xmm13,%xmm13
+ vpxor %xmm12,%xmm9,%xmm9
+ vpxor %xmm14,%xmm13,%xmm13
+ vpalignr $8,%xmm8,%xmm8,%xmm14
+ vpclmulqdq $0x10,%xmm15,%xmm2,%xmm2
+ vmovdqu 80-32(%r9),%xmm15
+ vpxor %xmm1,%xmm2,%xmm2
+
+ vpclmulqdq $0x00,%xmm3,%xmm12,%xmm4
+ vmovdqu 64-32(%r9),%xmm0
+ vpxor %xmm5,%xmm4,%xmm4
+ vpunpckhqdq %xmm11,%xmm11,%xmm1
+ vpclmulqdq $0x11,%xmm3,%xmm12,%xmm12
+ vpxor %xmm11,%xmm1,%xmm1
+ vpxor %xmm13,%xmm12,%xmm12
+ vxorps 16(%rsp),%xmm7,%xmm7
+ vpclmulqdq $0x00,%xmm15,%xmm9,%xmm9
+ vpxor %xmm2,%xmm9,%xmm9
+
+ vpclmulqdq $0x10,16(%r11),%xmm8,%xmm8
+ vxorps %xmm14,%xmm8,%xmm8
+
+ vpclmulqdq $0x00,%xmm0,%xmm11,%xmm5
+ vmovdqu 96-32(%r9),%xmm3
+ vpxor %xmm4,%xmm5,%xmm5
+ vpunpckhqdq %xmm10,%xmm10,%xmm2
+ vpclmulqdq $0x11,%xmm0,%xmm11,%xmm11
+ vpxor %xmm10,%xmm2,%xmm2
+ vpalignr $8,%xmm8,%xmm8,%xmm14
+ vpxor %xmm12,%xmm11,%xmm11
+ vpclmulqdq $0x10,%xmm15,%xmm1,%xmm1
+ vmovdqu 128-32(%r9),%xmm15
+ vpxor %xmm9,%xmm1,%xmm1
+
+ vxorps %xmm7,%xmm14,%xmm14
+ vpclmulqdq $0x10,16(%r11),%xmm8,%xmm8
+ vxorps %xmm14,%xmm8,%xmm8
+
+ vpclmulqdq $0x00,%xmm3,%xmm10,%xmm4
+ vmovdqu 112-32(%r9),%xmm0
+ vpxor %xmm5,%xmm4,%xmm4
+ vpunpckhqdq %xmm8,%xmm8,%xmm9
+ vpclmulqdq $0x11,%xmm3,%xmm10,%xmm10
+ vpxor %xmm8,%xmm9,%xmm9
+ vpxor %xmm11,%xmm10,%xmm10
+ vpclmulqdq $0x00,%xmm15,%xmm2,%xmm2
+ vpxor %xmm1,%xmm2,%xmm2
+
+ vpclmulqdq $0x00,%xmm0,%xmm8,%xmm5
+ vpclmulqdq $0x11,%xmm0,%xmm8,%xmm7
+ vpxor %xmm4,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm15,%xmm9,%xmm6
+ vpxor %xmm10,%xmm7,%xmm7
+ vpxor %xmm2,%xmm6,%xmm6
+
+ vpxor %xmm5,%xmm7,%xmm4
+ vpxor %xmm4,%xmm6,%xmm6
+ vpslldq $8,%xmm6,%xmm1
+ vmovdqu 16(%r11),%xmm3
+ vpsrldq $8,%xmm6,%xmm6
+ vpxor %xmm1,%xmm5,%xmm8
+ vpxor %xmm6,%xmm7,%xmm7
+
+ vpalignr $8,%xmm8,%xmm8,%xmm2
+ vpclmulqdq $0x10,%xmm3,%xmm8,%xmm8
+ vpxor %xmm2,%xmm8,%xmm8
+
+ vpalignr $8,%xmm8,%xmm8,%xmm2
+ vpclmulqdq $0x10,%xmm3,%xmm8,%xmm8
+ vpxor %xmm7,%xmm2,%xmm2
+ vpxor %xmm2,%xmm8,%xmm8
+ vpshufb (%r11),%xmm8,%xmm8
+ movq -56(%rax),%r9
+.cfi_restore %r9
+ vmovdqu %xmm8,(%r9)
+
+ vzeroupper
+ movq -48(%rax),%r15
+.cfi_restore %r15
+ movq -40(%rax),%r14
+.cfi_restore %r14
+ movq -32(%rax),%r13
+.cfi_restore %r13
+ movq -24(%rax),%r12
+.cfi_restore %r12
+ movq -16(%rax),%rbp
+.cfi_restore %rbp
+ movq -8(%rax),%rbx
+.cfi_restore %rbx
+ leaq (%rax),%rsp
+.cfi_def_cfa_register %rsp
+.Lgcm_enc_abort:
+ movq %r10,%rax
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size aesni_gcm_encrypt,.-aesni_gcm_encrypt
+
+/* Some utility routines */
+
+/*
+ * clear all fpu registers
+ * void clear_fpu_regs_avx(void);
+ */
+.globl clear_fpu_regs_avx
+.type clear_fpu_regs_avx,@function
+.align 32
+clear_fpu_regs_avx:
+ vzeroall
+ ret
+.size clear_fpu_regs_avx,.-clear_fpu_regs_avx
+
+/*
+ * void gcm_xor_avx(const uint8_t *src, uint8_t *dst);
+ *
+ * XORs one pair of unaligned 128-bit blocks from `src' and `dst' and
+ * stores the result at `dst'. The XOR is performed using FPU registers,
+ * so make sure FPU state is saved when running this in the kernel.
+ */
+.globl gcm_xor_avx
+.type gcm_xor_avx,@function
+.align 32
+gcm_xor_avx:
+ movdqu (%rdi), %xmm0
+ movdqu (%rsi), %xmm1
+ pxor %xmm1, %xmm0
+ movdqu %xmm0, (%rsi)
+ ret
+.size gcm_xor_avx,.-gcm_xor_avx
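For reference, the operation gcm_xor_avx() performs is just a 16-byte XOR; a
plain C sketch of the same computation follows. The assembly version exists so
the XOR happens through %xmm registers alongside the rest of the AVX GCM path,
which is why the comment above insists on saved FPU state in the kernel.

    #include <stdint.h>

    /* C equivalent of gcm_xor_avx(): dst[0..15] ^= src[0..15], unaligned-safe. */
    static void
    gcm_xor_sketch(const uint8_t *src, uint8_t *dst)
    {
            int i;

            for (i = 0; i < 16; i++)
                    dst[i] ^= src[i];
    }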
+
+/*
+ * Toggle a boolean_t value atomically and return the new value.
+ * boolean_t atomic_toggle_boolean_nv(volatile boolean_t *);
+ */
+.globl atomic_toggle_boolean_nv
+.type atomic_toggle_boolean_nv,@function
+.align 32
+atomic_toggle_boolean_nv:
+ xorl %eax, %eax
+ lock
+ xorl $1, (%rdi)
+ jz 1f
+ movl $1, %eax
+1:
+ ret
+.size atomic_toggle_boolean_nv,.-atomic_toggle_boolean_nv
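The semantics of atomic_toggle_boolean_nv() above (flip the value between 0 and
1 under a lock prefix and return the new value, derived from the zero flag) can
be sketched with the GCC/Clang atomic builtins; the boolean stand-in type below
is an assumption for illustration, not how ICP declares boolean_t.

    #include <stdint.h>

    typedef uint32_t boolean_sketch_t;      /* stand-in for ICP's boolean_t */

    /* Atomically flip *p between 0 and 1 and return the new value. */
    static boolean_sketch_t
    atomic_toggle_boolean_nv_sketch(volatile boolean_sketch_t *p)
    {
            return (__atomic_xor_fetch(p, 1, __ATOMIC_SEQ_CST));
    }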
+
+.align 64
+.Lbswap_mask:
+.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+.Lpoly:
+.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
+.Lone_msb:
+.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
+.Ltwo_lsb:
+.byte 2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+.Lone_lsb:
+.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+.byte 65,69,83,45,78,73,32,71,67,77,32,109,111,100,117,108,101,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align 64
+
+/* Mark the stack non-executable. */
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
+
+#endif /* defined(__x86_64__) && defined(HAVE_AVX) && defined(HAVE_AES) ... */
diff --git a/sys/contrib/openzfs/module/icp/asm-x86_64/modes/gcm_pclmulqdq.S b/sys/contrib/openzfs/module/icp/asm-x86_64/modes/gcm_pclmulqdq.S
new file mode 100644
index 000000000000..59edc4c8d56c
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/asm-x86_64/modes/gcm_pclmulqdq.S
@@ -0,0 +1,254 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2009 Intel Corporation
+ * All Rights Reserved.
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * Accelerated GHASH implementation with Intel PCLMULQDQ-NI
+ * instructions. This file contains an accelerated
+ * Galois Field Multiplication implementation.
+ *
+ * PCLMULQDQ is used to accelerate the most time-consuming part of GHASH,
+ * carry-less multiplication. More information about PCLMULQDQ can be
+ * found at:
+ * http://software.intel.com/en-us/articles/
+ * carry-less-multiplication-and-its-usage-for-computing-the-gcm-mode/
+ *
+ */
+
+/*
+ * ====================================================================
+ * OpenSolaris OS modifications
+ *
+ * This source originates as file galois_hash_asm.c from
+ * Intel Corporation dated September 21, 2009.
+ *
+ * This OpenSolaris version has these major changes from the original source:
+ *
+ * 1. Added OpenSolaris ENTRY_NP/SET_SIZE macros from
+ * /usr/include/sys/asm_linkage.h, lint(1B) guards, and a dummy C function
+ * definition for lint.
+ *
+ * 2. Formatted code, added comments, and added #includes and #defines.
+ *
+ * 3. If bit CR0.TS is set, clear and set the TS bit, after and before
+ * calling kpreempt_disable() and kpreempt_enable().
+ * If the TS bit is not set, save and restore %xmm registers at the beginning
+ * and end of function calls (%xmm* registers are not saved and restored
+ * during kernel thread preemption).
+ *
+ * 4. Removed code to perform hashing. This is already done with C macro
+ * GHASH in gcm.c. For better performance, this removed code should be
+ * reintegrated in the future to replace the C GHASH macro.
+ *
+ * 5. Added code to byte swap 16-byte input and output.
+ *
+ * 6. Folded in comments from the original C source with embedded assembly
+ * (SB_w_shift_xor.c)
+ *
+ * 7. Renamed function and reordered parameters to match OpenSolaris:
+ * Intel interface:
+ * void galois_hash_asm(unsigned char *hk, unsigned char *s,
+ * unsigned char *d, int length)
+ * OpenSolaris OS interface:
+ * void gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res);
+ * ====================================================================
+ */
+
+
+#if defined(lint) || defined(__lint) /* lint */
+
+#include <sys/types.h>
+
+/* ARGSUSED */
+void
+gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res) {
+}
+
+#elif defined(HAVE_PCLMULQDQ) /* guard by instruction set */
+
+#define _ASM
+#include <sys/asm_linkage.h>
+
+/*
+ * Use this mask to byte-swap a 16-byte integer with the pshufb instruction
+ */
+
+// static uint8_t byte_swap16_mask[] = {
+// 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
+.data
+.align XMM_ALIGN
+.Lbyte_swap16_mask:
+ .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
+
+
+/*
+ * void gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res);
+ *
+ * Perform a carry-less multiplication (that is, combine the partial
+ * products with XOR instead of addition) of P1 and P2 and place the
+ * result in P3.
+ *
+ * Byte swap the input and the output.
+ *
+ * Note: x_in, y, and res all point to a block of 16-byte numbers
+ * (an array of two 64-bit integers).
+ *
+ * Note2: For kernel code, caller is responsible for ensuring
+ * kpreempt_disable() has been called. This is because %xmm registers are
+ * not saved/restored. Clear and set the CR0.TS bit on entry and exit,
+ * respectively, if TS is set on entry. Otherwise, if TS is not set,
+ * save and restore %xmm registers on the stack.
+ *
+ * Note3: Original Intel definition:
+ * void galois_hash_asm(unsigned char *hk, unsigned char *s,
+ * unsigned char *d, int length)
+ *
+ * Note4: Register/parameter mapping:
+ * Intel:
+ * Parameter 1: %rcx (copied to %xmm0) hk or x_in
+ * Parameter 2: %rdx (copied to %xmm1) s or y
+ * Parameter 3: %rdi (result) d or res
+ * OpenSolaris:
+ * Parameter 1: %rdi (copied to %xmm0) x_in
+ * Parameter 2: %rsi (copied to %xmm1) y
+ * Parameter 3: %rdx (result) res
+ */
+
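A hypothetical caller, illustrating Note2 and Note4 above: the caller XORs the
data block into the state itself (the C GHASH macro in gcm.c, per change 4 in
the OpenSolaris notes) and guards the %xmm usage. kfpu_begin()/kfpu_end() are
stand-in names for whatever FPU-save/preemption-disable mechanism the platform
provides, not identifiers taken from this patch.

    #include <stdint.h>

    extern void gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res);
    /* Stand-ins for the platform's FPU guard, named only for illustration. */
    extern void kfpu_begin(void);
    extern void kfpu_end(void);

    /* One GHASH step: x = (x ^ block) * H, with x, block, h as 2x uint64_t. */
    static void
    ghash_step_sketch(uint64_t x[2], const uint64_t block[2], uint64_t h[2])
    {
            x[0] ^= block[0];
            x[1] ^= block[1];
            kfpu_begin();                   /* %xmm registers clobbered below */
            gcm_mul_pclmulqdq(x, h, x);     /* result may alias x_in */
            kfpu_end();
    }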
+ENTRY_NP(gcm_mul_pclmulqdq)
+ //
+ // Copy Parameters
+ //
+ movdqu (%rdi), %xmm0 // P1
+ movdqu (%rsi), %xmm1 // P2
+
+ //
+ // Byte swap 16-byte input
+ //
+ lea .Lbyte_swap16_mask(%rip), %rax
+ movups (%rax), %xmm10
+ pshufb %xmm10, %xmm0
+ pshufb %xmm10, %xmm1
+
+
+ //
+ // Multiply with the hash key
+ //
+ movdqu %xmm0, %xmm3
+ pclmulqdq $0, %xmm1, %xmm3 // xmm3 holds a0*b0
+
+ movdqu %xmm0, %xmm4
+ pclmulqdq $16, %xmm1, %xmm4 // xmm4 holds a0*b1
+
+ movdqu %xmm0, %xmm5
+ pclmulqdq $1, %xmm1, %xmm5 // xmm5 holds a1*b0
+ movdqu %xmm0, %xmm6
+ pclmulqdq $17, %xmm1, %xmm6 // xmm6 holds a1*b1
+
+ pxor %xmm5, %xmm4 // xmm4 holds a0*b1 + a1*b0
+
+ movdqu %xmm4, %xmm5 // move the contents of xmm4 to xmm5
+ psrldq $8, %xmm4 // shift xmm4 by 64 bits to the right
+ pslldq $8, %xmm5 // shift xmm5 by 64 bits to the left
+ pxor %xmm5, %xmm3
+ pxor %xmm4, %xmm6 // Register pair <xmm6:xmm3> holds the result
+ // of the carry-less multiplication of
+ // xmm0 by xmm1.
+
+ // We shift the result of the multiplication by one bit position
+ // to the left to cope for the fact that the bits are reversed.
+ movdqu %xmm3, %xmm7
+ movdqu %xmm6, %xmm8
+ pslld $1, %xmm3
+ pslld $1, %xmm6
+ psrld $31, %xmm7
+ psrld $31, %xmm8
+ movdqu %xmm7, %xmm9
+ pslldq $4, %xmm8
+ pslldq $4, %xmm7
+ psrldq $12, %xmm9
+ por %xmm7, %xmm3
+ por %xmm8, %xmm6
+ por %xmm9, %xmm6
+
+ //
+ // First phase of the reduction
+ //
+ // Move xmm3 into xmm7, xmm8, xmm9 in order to perform the shifts
+ // independently.
+ movdqu %xmm3, %xmm7
+ movdqu %xmm3, %xmm8
+ movdqu %xmm3, %xmm9
+ pslld $31, %xmm7 // packed left shift << 31
+ pslld $30, %xmm8 // packed left shift << 30
+ pslld $25, %xmm9 // packed left shift << 25
+ pxor %xmm8, %xmm7 // xor the shifted versions
+ pxor %xmm9, %xmm7
+ movdqu %xmm7, %xmm8
+ pslldq $12, %xmm7
+ psrldq $4, %xmm8
+ pxor %xmm7, %xmm3 // first phase of the reduction complete
+
+ //
+ // Second phase of the reduction
+ //
+ // Make 3 copies of xmm3 in xmm2, xmm4, xmm5 for doing these
+ // shift operations.
+ movdqu %xmm3, %xmm2
+ movdqu %xmm3, %xmm4
+ movdqu %xmm3, %xmm5
+ psrld $1, %xmm2 // packed right shift >> 1
+ psrld $2, %xmm4 // packed right shift >> 2
+ psrld $7, %xmm5 // packed right shift >> 7
+ pxor %xmm4, %xmm2 // xor the shifted versions
+ pxor %xmm5, %xmm2
+ pxor %xmm8, %xmm2
+ pxor %xmm2, %xmm3
+ pxor %xmm3, %xmm6 // the result is in xmm6
+
+ //
+ // Byte swap 16-byte result
+ //
+ pshufb %xmm10, %xmm6 // %xmm10 has the swap mask
+
+ //
+ // Store the result
+ //
+ movdqu %xmm6, (%rdx) // P3
+
+
+ //
+ // Return
+ //
+ ret
+ SET_SIZE(gcm_mul_pclmulqdq)
+
+#endif /* lint || __lint */
+
+#ifdef __ELF__
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/sys/contrib/openzfs/module/icp/asm-x86_64/modes/ghash-x86_64.S b/sys/contrib/openzfs/module/icp/asm-x86_64/modes/ghash-x86_64.S
new file mode 100644
index 000000000000..90cc36b43a78
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/asm-x86_64/modes/ghash-x86_64.S
@@ -0,0 +1,714 @@
+# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the Apache License 2.0 (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# March, June 2010
+#
+# The module implements "4-bit" GCM GHASH function and underlying
+# single multiplication operation in GF(2^128). "4-bit" means that
+# it uses 256 bytes per-key table [+128 bytes shared table]. GHASH
+# function features so called "528B" variant utilizing additional
+# 256+16 bytes of per-key storage [+512 bytes shared table].
+# Performance results are for this streamed GHASH subroutine and are
+# expressed in cycles per processed byte, less is better:
+#
+# gcc 3.4.x(*) assembler
+#
+# P4 28.6 14.0 +100%
+# Opteron 19.3 7.7 +150%
+# Core2 17.8 8.1(**) +120%
+# Atom 31.6 16.8 +88%
+# VIA Nano 21.8 10.1 +115%
+#
+# (*) comparison is not completely fair, because C results are
+# for vanilla "256B" implementation, while assembler results
+# are for "528B";-)
+# (**) it's mystery [to me] why Core2 result is not same as for
+# Opteron;
+
+# May 2010
+#
+# Add PCLMULQDQ version performing at 2.02 cycles per processed byte.
+# See ghash-x86.pl for background information and details about coding
+# techniques.
+#
+# Special thanks to David Woodhouse for providing access to a
+# Westmere-based system on behalf of Intel Open Source Technology Centre.
+
+# December 2012
+#
+# Overhaul: aggregate Karatsuba post-processing, improve ILP in
+# reduction_alg9, increase reduction aggregate factor to 4x. As for
+# the latter, ghash-x86.pl discusses that it makes lesser sense to
+# increase aggregate factor. Then why increase here? Critical path
+# consists of 3 independent pclmulqdq instructions, Karatsuba post-
+# processing and reduction. "On top" of this we lay down aggregated
+# multiplication operations, triplets of independent pclmulqdq's. As
+# issue rate for pclmulqdq is limited, it makes lesser sense to
+# aggregate more multiplications than it takes to perform remaining
+# non-multiplication operations. 2x is near-optimal coefficient for
+# contemporary Intel CPUs (therefore modest improvement coefficient),
+# but not for Bulldozer. Latter is because logical SIMD operations
+# are twice as slow in comparison to Intel, so that critical path is
+# longer. A CPU with higher pclmulqdq issue rate would also benefit
+# from higher aggregate factor...
+#
+# Westmere 1.78(+13%)
+# Sandy Bridge 1.80(+8%)
+# Ivy Bridge 1.80(+7%)
+# Haswell 0.55(+93%) (if system doesn't support AVX)
+# Broadwell 0.45(+110%)(if system doesn't support AVX)
+# Skylake 0.44(+110%)(if system doesn't support AVX)
+# Bulldozer 1.49(+27%)
+# Silvermont 2.88(+13%)
+# Knights L 2.12(-) (if system doesn't support AVX)
+# Goldmont 1.08(+24%)
+
+# March 2013
+#
+# ... 8x aggregate factor AVX code path is using reduction algorithm
+# suggested by Shay Gueron[1]. Even though contemporary AVX-capable
+# CPUs such as Sandy and Ivy Bridge can execute it, the code performs
+# sub-optimally in comparison to above mentioned version. But thanks
+# to Ilya Albrekht and Max Locktyukhin of Intel Corp. we knew that
+# it performs in 0.41 cycles per byte on Haswell processor, in
+# 0.29 on Broadwell, and in 0.36 on Skylake.
+#
+# Knights Landing achieves 1.09 cpb.
+#
+# [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
+
+# Generated once from
+# https://github.com/openssl/openssl/blob/5ffc3324/crypto/modes/asm/ghash-x86_64.pl
+# and modified for ICP. Modifications are kept to a bare minimum to ease later
+# upstream merges.
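The CLMUL and AVX routines in this file all compute the same primitive:
multiplication in GF(2^128) with GCM's bit-reflected convention, folded as
Y_i = (Y_{i-1} xor X_i) * H. A bit-at-a-time reference for that multiply,
useful only as a correctness oracle against the accelerated paths, is sketched
below; it follows NIST SP 800-38D and shares nothing with how the assembly
actually works.

    #include <stdint.h>
    #include <string.h>

    /*
     * Bit-at-a-time reference for the GF(2^128) multiply used by GHASH
     * (NIST SP 800-38D).  GHASH itself is Y_i = (Y_{i-1} ^ X_i) * H, Y_0 = 0.
     */
    static void
    gf128_mul_ref(const uint8_t x[16], const uint8_t h[16], uint8_t out[16])
    {
            uint8_t z[16] = { 0 };          /* accumulator Z */
            uint8_t v[16];                  /* V starts as H */
            int i, j, k, lsb;

            (void) memcpy(v, h, 16);
            for (i = 0; i < 16; i++) {
                    for (j = 7; j >= 0; j--) {
                            if ((x[i] >> j) & 1) {          /* bit x_(8i+7-j) */
                                    for (k = 0; k < 16; k++)
                                            z[k] ^= v[k];
                            }
                            lsb = v[15] & 1;                /* bit V_127 */
                            for (k = 15; k > 0; k--)        /* V >>= 1 */
                                    v[k] = (v[k] >> 1) | (v[k - 1] << 7);
                            v[0] >>= 1;
                            if (lsb)        /* reduce by x^128+x^7+x^2+x+1 */
                                    v[0] ^= 0xE1;
                    }
            }
            (void) memcpy(out, z, 16);
    }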
+
+#if defined(__x86_64__) && defined(HAVE_AVX) && \
+ defined(HAVE_AES) && defined(HAVE_PCLMULQDQ)
+
+.text
+
+.globl gcm_gmult_clmul
+.type gcm_gmult_clmul,@function
+.align 16
+gcm_gmult_clmul:
+.cfi_startproc
+.L_gmult_clmul:
+ movdqu (%rdi),%xmm0
+ movdqa .Lbswap_mask(%rip),%xmm5
+ movdqu (%rsi),%xmm2
+ movdqu 32(%rsi),%xmm4
+.byte 102,15,56,0,197
+ movdqa %xmm0,%xmm1
+ pshufd $78,%xmm0,%xmm3
+ pxor %xmm0,%xmm3
+.byte 102,15,58,68,194,0
+.byte 102,15,58,68,202,17
+.byte 102,15,58,68,220,0
+ pxor %xmm0,%xmm3
+ pxor %xmm1,%xmm3
+
+ movdqa %xmm3,%xmm4
+ psrldq $8,%xmm3
+ pslldq $8,%xmm4
+ pxor %xmm3,%xmm1
+ pxor %xmm4,%xmm0
+
+ movdqa %xmm0,%xmm4
+ movdqa %xmm0,%xmm3
+ psllq $5,%xmm0
+ pxor %xmm0,%xmm3
+ psllq $1,%xmm0
+ pxor %xmm3,%xmm0
+ psllq $57,%xmm0
+ movdqa %xmm0,%xmm3
+ pslldq $8,%xmm0
+ psrldq $8,%xmm3
+ pxor %xmm4,%xmm0
+ pxor %xmm3,%xmm1
+
+
+ movdqa %xmm0,%xmm4
+ psrlq $1,%xmm0
+ pxor %xmm4,%xmm1
+ pxor %xmm0,%xmm4
+ psrlq $5,%xmm0
+ pxor %xmm4,%xmm0
+ psrlq $1,%xmm0
+ pxor %xmm1,%xmm0
+.byte 102,15,56,0,197
+ movdqu %xmm0,(%rdi)
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size gcm_gmult_clmul,.-gcm_gmult_clmul
+
+.globl gcm_init_htab_avx
+.type gcm_init_htab_avx,@function
+.align 32
+gcm_init_htab_avx:
+.cfi_startproc
+ vzeroupper
+
+ vmovdqu (%rsi),%xmm2
+ // KCF/ICP stores H in network byte order with the hi qword first
+ // so we need to swap all bytes, not the 2 qwords.
+ vmovdqu .Lbswap_mask(%rip),%xmm4
+ vpshufb %xmm4,%xmm2,%xmm2
+
+
+ vpshufd $255,%xmm2,%xmm4
+ vpsrlq $63,%xmm2,%xmm3
+ vpsllq $1,%xmm2,%xmm2
+ vpxor %xmm5,%xmm5,%xmm5
+ vpcmpgtd %xmm4,%xmm5,%xmm5
+ vpslldq $8,%xmm3,%xmm3
+ vpor %xmm3,%xmm2,%xmm2
+
+
+ vpand .L0x1c2_polynomial(%rip),%xmm5,%xmm5
+ vpxor %xmm5,%xmm2,%xmm2
+
+ vpunpckhqdq %xmm2,%xmm2,%xmm6
+ vmovdqa %xmm2,%xmm0
+ vpxor %xmm2,%xmm6,%xmm6
+ movq $4,%r10
+ jmp .Linit_start_avx
+.align 32
+.Linit_loop_avx:
+ vpalignr $8,%xmm3,%xmm4,%xmm5
+ vmovdqu %xmm5,-16(%rdi)
+ vpunpckhqdq %xmm0,%xmm0,%xmm3
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x11,%xmm2,%xmm0,%xmm1
+ vpclmulqdq $0x00,%xmm2,%xmm0,%xmm0
+ vpclmulqdq $0x00,%xmm6,%xmm3,%xmm3
+ vpxor %xmm0,%xmm1,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+
+ vpslldq $8,%xmm3,%xmm4
+ vpsrldq $8,%xmm3,%xmm3
+ vpxor %xmm4,%xmm0,%xmm0
+ vpxor %xmm3,%xmm1,%xmm1
+ vpsllq $57,%xmm0,%xmm3
+ vpsllq $62,%xmm0,%xmm4
+ vpxor %xmm3,%xmm4,%xmm4
+ vpsllq $63,%xmm0,%xmm3
+ vpxor %xmm3,%xmm4,%xmm4
+ vpslldq $8,%xmm4,%xmm3
+ vpsrldq $8,%xmm4,%xmm4
+ vpxor %xmm3,%xmm0,%xmm0
+ vpxor %xmm4,%xmm1,%xmm1
+
+ vpsrlq $1,%xmm0,%xmm4
+ vpxor %xmm0,%xmm1,%xmm1
+ vpxor %xmm4,%xmm0,%xmm0
+ vpsrlq $5,%xmm4,%xmm4
+ vpxor %xmm4,%xmm0,%xmm0
+ vpsrlq $1,%xmm0,%xmm0
+ vpxor %xmm1,%xmm0,%xmm0
+.Linit_start_avx:
+ vmovdqa %xmm0,%xmm5
+ vpunpckhqdq %xmm0,%xmm0,%xmm3
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x11,%xmm2,%xmm0,%xmm1
+ vpclmulqdq $0x00,%xmm2,%xmm0,%xmm0
+ vpclmulqdq $0x00,%xmm6,%xmm3,%xmm3
+ vpxor %xmm0,%xmm1,%xmm4
+ vpxor %xmm4,%xmm3,%xmm3
+
+ vpslldq $8,%xmm3,%xmm4
+ vpsrldq $8,%xmm3,%xmm3
+ vpxor %xmm4,%xmm0,%xmm0
+ vpxor %xmm3,%xmm1,%xmm1
+ vpsllq $57,%xmm0,%xmm3
+ vpsllq $62,%xmm0,%xmm4
+ vpxor %xmm3,%xmm4,%xmm4
+ vpsllq $63,%xmm0,%xmm3
+ vpxor %xmm3,%xmm4,%xmm4
+ vpslldq $8,%xmm4,%xmm3
+ vpsrldq $8,%xmm4,%xmm4
+ vpxor %xmm3,%xmm0,%xmm0
+ vpxor %xmm4,%xmm1,%xmm1
+
+ vpsrlq $1,%xmm0,%xmm4
+ vpxor %xmm0,%xmm1,%xmm1
+ vpxor %xmm4,%xmm0,%xmm0
+ vpsrlq $5,%xmm4,%xmm4
+ vpxor %xmm4,%xmm0,%xmm0
+ vpsrlq $1,%xmm0,%xmm0
+ vpxor %xmm1,%xmm0,%xmm0
+ vpshufd $78,%xmm5,%xmm3
+ vpshufd $78,%xmm0,%xmm4
+ vpxor %xmm5,%xmm3,%xmm3
+ vmovdqu %xmm5,0(%rdi)
+ vpxor %xmm0,%xmm4,%xmm4
+ vmovdqu %xmm0,16(%rdi)
+ leaq 48(%rdi),%rdi
+ subq $1,%r10
+ jnz .Linit_loop_avx
+
+ vpalignr $8,%xmm4,%xmm3,%xmm5
+ vmovdqu %xmm5,-16(%rdi)
+
+ vzeroupper
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size gcm_init_htab_avx,.-gcm_init_htab_avx
+
+.globl gcm_gmult_avx
+.type gcm_gmult_avx,@function
+.align 32
+gcm_gmult_avx:
+.cfi_startproc
+ jmp .L_gmult_clmul
+.cfi_endproc
+.size gcm_gmult_avx,.-gcm_gmult_avx
+.globl gcm_ghash_avx
+.type gcm_ghash_avx,@function
+.align 32
+gcm_ghash_avx:
+.cfi_startproc
+ vzeroupper
+
+ vmovdqu (%rdi),%xmm10
+ leaq .L0x1c2_polynomial(%rip),%r10
+ leaq 64(%rsi),%rsi
+ vmovdqu .Lbswap_mask(%rip),%xmm13
+ vpshufb %xmm13,%xmm10,%xmm10
+ cmpq $0x80,%rcx
+ jb .Lshort_avx
+ subq $0x80,%rcx
+
+ vmovdqu 112(%rdx),%xmm14
+ vmovdqu 0-64(%rsi),%xmm6
+ vpshufb %xmm13,%xmm14,%xmm14
+ vmovdqu 32-64(%rsi),%xmm7
+
+ vpunpckhqdq %xmm14,%xmm14,%xmm9
+ vmovdqu 96(%rdx),%xmm15
+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+ vpxor %xmm14,%xmm9,%xmm9
+ vpshufb %xmm13,%xmm15,%xmm15
+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+ vmovdqu 16-64(%rsi),%xmm6
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vmovdqu 80(%rdx),%xmm14
+ vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
+ vpxor %xmm15,%xmm8,%xmm8
+
+ vpshufb %xmm13,%xmm14,%xmm14
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
+ vpunpckhqdq %xmm14,%xmm14,%xmm9
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
+ vmovdqu 48-64(%rsi),%xmm6
+ vpxor %xmm14,%xmm9,%xmm9
+ vmovdqu 64(%rdx),%xmm15
+ vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
+ vmovdqu 80-64(%rsi),%xmm7
+
+ vpshufb %xmm13,%xmm15,%xmm15
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+ vpxor %xmm1,%xmm4,%xmm4
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+ vmovdqu 64-64(%rsi),%xmm6
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
+ vpxor %xmm15,%xmm8,%xmm8
+
+ vmovdqu 48(%rdx),%xmm14
+ vpxor %xmm3,%xmm0,%xmm0
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
+ vpxor %xmm4,%xmm1,%xmm1
+ vpshufb %xmm13,%xmm14,%xmm14
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
+ vmovdqu 96-64(%rsi),%xmm6
+ vpxor %xmm5,%xmm2,%xmm2
+ vpunpckhqdq %xmm14,%xmm14,%xmm9
+ vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
+ vmovdqu 128-64(%rsi),%xmm7
+ vpxor %xmm14,%xmm9,%xmm9
+
+ vmovdqu 32(%rdx),%xmm15
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+ vpxor %xmm1,%xmm4,%xmm4
+ vpshufb %xmm13,%xmm15,%xmm15
+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+ vmovdqu 112-64(%rsi),%xmm6
+ vpxor %xmm2,%xmm5,%xmm5
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
+ vpxor %xmm15,%xmm8,%xmm8
+
+ vmovdqu 16(%rdx),%xmm14
+ vpxor %xmm3,%xmm0,%xmm0
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
+ vpxor %xmm4,%xmm1,%xmm1
+ vpshufb %xmm13,%xmm14,%xmm14
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
+ vmovdqu 144-64(%rsi),%xmm6
+ vpxor %xmm5,%xmm2,%xmm2
+ vpunpckhqdq %xmm14,%xmm14,%xmm9
+ vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
+ vmovdqu 176-64(%rsi),%xmm7
+ vpxor %xmm14,%xmm9,%xmm9
+
+ vmovdqu (%rdx),%xmm15
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+ vpxor %xmm1,%xmm4,%xmm4
+ vpshufb %xmm13,%xmm15,%xmm15
+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+ vmovdqu 160-64(%rsi),%xmm6
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x10,%xmm7,%xmm9,%xmm2
+
+ leaq 128(%rdx),%rdx
+ cmpq $0x80,%rcx
+ jb .Ltail_avx
+
+ vpxor %xmm10,%xmm15,%xmm15
+ subq $0x80,%rcx
+ jmp .Loop8x_avx
+
+.align 32
+.Loop8x_avx:
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vmovdqu 112(%rdx),%xmm14
+ vpxor %xmm0,%xmm3,%xmm3
+ vpxor %xmm15,%xmm8,%xmm8
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm10
+ vpshufb %xmm13,%xmm14,%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm11
+ vmovdqu 0-64(%rsi),%xmm6
+ vpunpckhqdq %xmm14,%xmm14,%xmm9
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm12
+ vmovdqu 32-64(%rsi),%xmm7
+ vpxor %xmm14,%xmm9,%xmm9
+
+ vmovdqu 96(%rdx),%xmm15
+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+ vpxor %xmm3,%xmm10,%xmm10
+ vpshufb %xmm13,%xmm15,%xmm15
+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+ vxorps %xmm4,%xmm11,%xmm11
+ vmovdqu 16-64(%rsi),%xmm6
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
+ vpxor %xmm5,%xmm12,%xmm12
+ vxorps %xmm15,%xmm8,%xmm8
+
+ vmovdqu 80(%rdx),%xmm14
+ vpxor %xmm10,%xmm12,%xmm12
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
+ vpxor %xmm11,%xmm12,%xmm12
+ vpslldq $8,%xmm12,%xmm9
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
+ vpsrldq $8,%xmm12,%xmm12
+ vpxor %xmm9,%xmm10,%xmm10
+ vmovdqu 48-64(%rsi),%xmm6
+ vpshufb %xmm13,%xmm14,%xmm14
+ vxorps %xmm12,%xmm11,%xmm11
+ vpxor %xmm1,%xmm4,%xmm4
+ vpunpckhqdq %xmm14,%xmm14,%xmm9
+ vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
+ vmovdqu 80-64(%rsi),%xmm7
+ vpxor %xmm14,%xmm9,%xmm9
+ vpxor %xmm2,%xmm5,%xmm5
+
+ vmovdqu 64(%rdx),%xmm15
+ vpalignr $8,%xmm10,%xmm10,%xmm12
+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+ vpshufb %xmm13,%xmm15,%xmm15
+ vpxor %xmm3,%xmm0,%xmm0
+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+ vmovdqu 64-64(%rsi),%xmm6
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm4,%xmm1,%xmm1
+ vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
+ vxorps %xmm15,%xmm8,%xmm8
+ vpxor %xmm5,%xmm2,%xmm2
+
+ vmovdqu 48(%rdx),%xmm14
+ vpclmulqdq $0x10,(%r10),%xmm10,%xmm10
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
+ vpshufb %xmm13,%xmm14,%xmm14
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
+ vmovdqu 96-64(%rsi),%xmm6
+ vpunpckhqdq %xmm14,%xmm14,%xmm9
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
+ vmovdqu 128-64(%rsi),%xmm7
+ vpxor %xmm14,%xmm9,%xmm9
+ vpxor %xmm2,%xmm5,%xmm5
+
+ vmovdqu 32(%rdx),%xmm15
+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+ vpshufb %xmm13,%xmm15,%xmm15
+ vpxor %xmm3,%xmm0,%xmm0
+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+ vmovdqu 112-64(%rsi),%xmm6
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm4,%xmm1,%xmm1
+ vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
+ vpxor %xmm15,%xmm8,%xmm8
+ vpxor %xmm5,%xmm2,%xmm2
+ vxorps %xmm12,%xmm10,%xmm10
+
+ vmovdqu 16(%rdx),%xmm14
+ vpalignr $8,%xmm10,%xmm10,%xmm12
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
+ vpshufb %xmm13,%xmm14,%xmm14
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
+ vmovdqu 144-64(%rsi),%xmm6
+ vpclmulqdq $0x10,(%r10),%xmm10,%xmm10
+ vxorps %xmm11,%xmm12,%xmm12
+ vpunpckhqdq %xmm14,%xmm14,%xmm9
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
+ vmovdqu 176-64(%rsi),%xmm7
+ vpxor %xmm14,%xmm9,%xmm9
+ vpxor %xmm2,%xmm5,%xmm5
+
+ vmovdqu (%rdx),%xmm15
+ vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+ vpshufb %xmm13,%xmm15,%xmm15
+ vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+ vmovdqu 160-64(%rsi),%xmm6
+ vpxor %xmm12,%xmm15,%xmm15
+ vpclmulqdq $0x10,%xmm7,%xmm9,%xmm2
+ vpxor %xmm10,%xmm15,%xmm15
+
+ leaq 128(%rdx),%rdx
+ subq $0x80,%rcx
+ jnc .Loop8x_avx
+
+ addq $0x80,%rcx
+ jmp .Ltail_no_xor_avx
+
+.align 32
+.Lshort_avx:
+ vmovdqu -16(%rdx,%rcx,1),%xmm14
+ leaq (%rdx,%rcx,1),%rdx
+ vmovdqu 0-64(%rsi),%xmm6
+ vmovdqu 32-64(%rsi),%xmm7
+ vpshufb %xmm13,%xmm14,%xmm15
+
+ vmovdqa %xmm0,%xmm3
+ vmovdqa %xmm1,%xmm4
+ vmovdqa %xmm2,%xmm5
+ subq $0x10,%rcx
+ jz .Ltail_avx
+
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
+ vpxor %xmm15,%xmm8,%xmm8
+ vmovdqu -32(%rdx),%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
+ vmovdqu 16-64(%rsi),%xmm6
+ vpshufb %xmm13,%xmm14,%xmm15
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
+ vpsrldq $8,%xmm7,%xmm7
+ subq $0x10,%rcx
+ jz .Ltail_avx
+
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
+ vpxor %xmm15,%xmm8,%xmm8
+ vmovdqu -48(%rdx),%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
+ vmovdqu 48-64(%rsi),%xmm6
+ vpshufb %xmm13,%xmm14,%xmm15
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
+ vmovdqu 80-64(%rsi),%xmm7
+ subq $0x10,%rcx
+ jz .Ltail_avx
+
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
+ vpxor %xmm15,%xmm8,%xmm8
+ vmovdqu -64(%rdx),%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
+ vmovdqu 64-64(%rsi),%xmm6
+ vpshufb %xmm13,%xmm14,%xmm15
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
+ vpsrldq $8,%xmm7,%xmm7
+ subq $0x10,%rcx
+ jz .Ltail_avx
+
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
+ vpxor %xmm15,%xmm8,%xmm8
+ vmovdqu -80(%rdx),%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
+ vmovdqu 96-64(%rsi),%xmm6
+ vpshufb %xmm13,%xmm14,%xmm15
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
+ vmovdqu 128-64(%rsi),%xmm7
+ subq $0x10,%rcx
+ jz .Ltail_avx
+
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
+ vpxor %xmm15,%xmm8,%xmm8
+ vmovdqu -96(%rdx),%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
+ vmovdqu 112-64(%rsi),%xmm6
+ vpshufb %xmm13,%xmm14,%xmm15
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
+ vpsrldq $8,%xmm7,%xmm7
+ subq $0x10,%rcx
+ jz .Ltail_avx
+
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
+ vpxor %xmm15,%xmm8,%xmm8
+ vmovdqu -112(%rdx),%xmm14
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
+ vmovdqu 144-64(%rsi),%xmm6
+ vpshufb %xmm13,%xmm14,%xmm15
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
+ vmovq 184-64(%rsi),%xmm7
+ subq $0x10,%rcx
+ jmp .Ltail_avx
+
+.align 32
+.Ltail_avx:
+ vpxor %xmm10,%xmm15,%xmm15
+.Ltail_no_xor_avx:
+ vpunpckhqdq %xmm15,%xmm15,%xmm8
+ vpxor %xmm0,%xmm3,%xmm3
+ vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
+ vpxor %xmm15,%xmm8,%xmm8
+ vpxor %xmm1,%xmm4,%xmm4
+ vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
+ vpxor %xmm2,%xmm5,%xmm5
+ vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
+
+ vmovdqu (%r10),%xmm12
+
+ vpxor %xmm0,%xmm3,%xmm10
+ vpxor %xmm1,%xmm4,%xmm11
+ vpxor %xmm2,%xmm5,%xmm5
+
+ vpxor %xmm10,%xmm5,%xmm5
+ vpxor %xmm11,%xmm5,%xmm5
+ vpslldq $8,%xmm5,%xmm9
+ vpsrldq $8,%xmm5,%xmm5
+ vpxor %xmm9,%xmm10,%xmm10
+ vpxor %xmm5,%xmm11,%xmm11
+
+ vpclmulqdq $0x10,%xmm12,%xmm10,%xmm9
+ vpalignr $8,%xmm10,%xmm10,%xmm10
+ vpxor %xmm9,%xmm10,%xmm10
+
+ vpclmulqdq $0x10,%xmm12,%xmm10,%xmm9
+ vpalignr $8,%xmm10,%xmm10,%xmm10
+ vpxor %xmm11,%xmm10,%xmm10
+ vpxor %xmm9,%xmm10,%xmm10
+
+ cmpq $0,%rcx
+ jne .Lshort_avx
+
+ vpshufb %xmm13,%xmm10,%xmm10
+ vmovdqu %xmm10,(%rdi)
+ vzeroupper
+ .byte 0xf3,0xc3
+.cfi_endproc
+.size gcm_ghash_avx,.-gcm_ghash_avx
+.align 64
+.Lbswap_mask:
+.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+.L0x1c2_polynomial:
+.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
+.L7_mask:
+.long 7,0,7,0
+.L7_mask_poly:
+.long 7,0,450,0
+.align 64
+.type .Lrem_4bit,@object
+.Lrem_4bit:
+.long 0,0,0,471859200,0,943718400,0,610271232
+.long 0,1887436800,0,1822425088,0,1220542464,0,1423966208
+.long 0,3774873600,0,4246732800,0,3644850176,0,3311403008
+.long 0,2441084928,0,2376073216,0,2847932416,0,3051356160
+.type .Lrem_8bit,@object
+.Lrem_8bit:
+.value 0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E
+.value 0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E
+.value 0x1C20,0x1DE2,0x1FA4,0x1E66,0x1B28,0x1AEA,0x18AC,0x196E
+.value 0x1230,0x13F2,0x11B4,0x1076,0x1538,0x14FA,0x16BC,0x177E
+.value 0x3840,0x3982,0x3BC4,0x3A06,0x3F48,0x3E8A,0x3CCC,0x3D0E
+.value 0x3650,0x3792,0x35D4,0x3416,0x3158,0x309A,0x32DC,0x331E
+.value 0x2460,0x25A2,0x27E4,0x2626,0x2368,0x22AA,0x20EC,0x212E
+.value 0x2A70,0x2BB2,0x29F4,0x2836,0x2D78,0x2CBA,0x2EFC,0x2F3E
+.value 0x7080,0x7142,0x7304,0x72C6,0x7788,0x764A,0x740C,0x75CE
+.value 0x7E90,0x7F52,0x7D14,0x7CD6,0x7998,0x785A,0x7A1C,0x7BDE
+.value 0x6CA0,0x6D62,0x6F24,0x6EE6,0x6BA8,0x6A6A,0x682C,0x69EE
+.value 0x62B0,0x6372,0x6134,0x60F6,0x65B8,0x647A,0x663C,0x67FE
+.value 0x48C0,0x4902,0x4B44,0x4A86,0x4FC8,0x4E0A,0x4C4C,0x4D8E
+.value 0x46D0,0x4712,0x4554,0x4496,0x41D8,0x401A,0x425C,0x439E
+.value 0x54E0,0x5522,0x5764,0x56A6,0x53E8,0x522A,0x506C,0x51AE
+.value 0x5AF0,0x5B32,0x5974,0x58B6,0x5DF8,0x5C3A,0x5E7C,0x5FBE
+.value 0xE100,0xE0C2,0xE284,0xE346,0xE608,0xE7CA,0xE58C,0xE44E
+.value 0xEF10,0xEED2,0xEC94,0xED56,0xE818,0xE9DA,0xEB9C,0xEA5E
+.value 0xFD20,0xFCE2,0xFEA4,0xFF66,0xFA28,0xFBEA,0xF9AC,0xF86E
+.value 0xF330,0xF2F2,0xF0B4,0xF176,0xF438,0xF5FA,0xF7BC,0xF67E
+.value 0xD940,0xD882,0xDAC4,0xDB06,0xDE48,0xDF8A,0xDDCC,0xDC0E
+.value 0xD750,0xD692,0xD4D4,0xD516,0xD058,0xD19A,0xD3DC,0xD21E
+.value 0xC560,0xC4A2,0xC6E4,0xC726,0xC268,0xC3AA,0xC1EC,0xC02E
+.value 0xCB70,0xCAB2,0xC8F4,0xC936,0xCC78,0xCDBA,0xCFFC,0xCE3E
+.value 0x9180,0x9042,0x9204,0x93C6,0x9688,0x974A,0x950C,0x94CE
+.value 0x9F90,0x9E52,0x9C14,0x9DD6,0x9898,0x995A,0x9B1C,0x9ADE
+.value 0x8DA0,0x8C62,0x8E24,0x8FE6,0x8AA8,0x8B6A,0x892C,0x88EE
+.value 0x83B0,0x8272,0x8034,0x81F6,0x84B8,0x857A,0x873C,0x86FE
+.value 0xA9C0,0xA802,0xAA44,0xAB86,0xAEC8,0xAF0A,0xAD4C,0xAC8E
+.value 0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E
+.value 0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE
+.value 0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE
+
+.byte 71,72,65,83,72,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align 64
+
+/* Mark the stack non-executable. */
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
+
+#endif /* defined(__x86_64__) && defined(HAVE_AVX) && defined(HAVE_AES) ... */
diff --git a/sys/contrib/openzfs/module/icp/asm-x86_64/sha1/sha1-x86_64.S b/sys/contrib/openzfs/module/icp/asm-x86_64/sha1/sha1-x86_64.S
new file mode 100644
index 000000000000..cb923784a730
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/asm-x86_64/sha1/sha1-x86_64.S
@@ -0,0 +1,1353 @@
+/*
+ * !/usr/bin/env perl
+ *
+ * ====================================================================
+ * Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+ * project. The module is, however, dual licensed under OpenSSL and
+ * CRYPTOGAMS licenses depending on where you obtain it. For further
+ * details see http://www.openssl.org/~appro/cryptogams/.
+ * ====================================================================
+ *
+ * sha1_block procedure for x86_64.
+ *
+ * It was brought to my attention that on EM64T, compiler-generated code
+ * was far behind the 32-bit assembler implementation. This is unlike
+ * Opteron, where compiler-generated code was only 15% behind the 32-bit
+ * assembler, which originally made it hard to motivate the effort.
+ * There was a suggestion to mechanically translate the 32-bit code, but
+ * I dismissed it, reasoning that x86_64 offers enough register bank
+ * capacity to fully utilize SHA-1 parallelism. Therefore this fresh
+ * implementation:-) However! While the 64-bit code does perform better
+ * on Opteron, I failed to beat the 32-bit assembler on the EM64T core.
+ * Well, x86_64 does offer a larger *addressable* register bank, but the
+ * out-of-order core reaches for even more registers through dynamic
+ * aliasing, and the EM64T core must have managed to run-time optimize
+ * even 32-bit code just as well as the 64-bit code. Performance
+ * improvement is summarized in the
+ * following table:
+ *
+ *              gcc 3.4     32-bit asm    cycles/byte
+ *   Opteron     +45%         +20%           6.8
+ *   Xeon P4     +65%          +0%           9.9
+ *   Core2       +60%         +10%           7.0
+ *
+ *
+ * OpenSolaris OS modifications
+ *
+ * Sun elects to use this software under the BSD license.
+ *
+ * This source originates from OpenSSL file sha1-x86_64.pl at
+ * ftp://ftp.openssl.org/snapshot/openssl-0.9.8-stable-SNAP-20080131.tar.gz
+ * (presumably for future OpenSSL release 0.9.8h), with these changes:
+ *
+ * 1. Added perl "use strict" and declared variables.
+ *
+ * 2. Added OpenSolaris ENTRY_NP/SET_SIZE macros from
+ * /usr/include/sys/asm_linkage.h, .ident keywords, and lint(1B) guards.
+ *
+ * 3. Removed x86_64-xlate.pl script (not needed for as(1) or gas(1)
+ * assemblers).
+ *
+ */
+
+/*
+ * This file was generated by a perl script (sha1-x86_64.pl). The comments from
+ * the original file have been pasted above.
+ */
+
+#if defined(lint) || defined(__lint)
+#include <sys/stdint.h>
+#include <sys/sha1.h>
+
+
+/* ARGSUSED */
+void
+sha1_block_data_order(SHA1_CTX *ctx, const void *inpp, size_t blocks)
+{
+}
+
+#else
+#define _ASM
+#include <sys/asm_linkage.h>
+ENTRY_NP(sha1_block_data_order)
+ push %rbx
+ push %rbp
+ push %r12
+ mov %rsp,%rax
+ mov %rdi,%r8 # reassigned argument
+ sub $72,%rsp
+ mov %rsi,%r9 # reassigned argument
+ and $-64,%rsp
+ mov %rdx,%r10 # reassigned argument
+ mov %rax,64(%rsp)
+
+ mov 0(%r8),%edx
+ mov 4(%r8),%esi
+ mov 8(%r8),%edi
+ mov 12(%r8),%ebp
+ mov 16(%r8),%r11d
+.align 4
+.Lloop:
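+	# Each iteration of .Lloop compresses one 64-byte block: 80 fully
+	# unrolled rounds, using the round constants 0x5a827999, 0x6ed9eba1,
+	# 0x8f1bbcdc (-0x70e44324) and 0xca62c1d6 (-0x359d3e2a) for rounds
+	# 0-19, 20-39, 40-59 and 60-79 respectively.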
+ mov 0(%r9),%eax
+ bswap %eax
+ mov %eax,0(%rsp)
+ lea 0x5a827999(%eax,%r11d),%r12d
+ mov %edi,%ebx
+ mov 4(%r9),%eax
+ mov %edx,%r11d
+ xor %ebp,%ebx
+ bswap %eax
+ rol $5,%r11d
+ and %esi,%ebx
+ mov %eax,4(%rsp)
+ add %r11d,%r12d
+ xor %ebp,%ebx
+ rol $30,%esi
+ add %ebx,%r12d
+ lea 0x5a827999(%eax,%ebp),%r11d
+ mov %esi,%ebx
+ mov 8(%r9),%eax
+ mov %r12d,%ebp
+ xor %edi,%ebx
+ bswap %eax
+ rol $5,%ebp
+ and %edx,%ebx
+ mov %eax,8(%rsp)
+ add %ebp,%r11d
+ xor %edi,%ebx
+ rol $30,%edx
+ add %ebx,%r11d
+ lea 0x5a827999(%eax,%edi),%ebp
+ mov %edx,%ebx
+ mov 12(%r9),%eax
+ mov %r11d,%edi
+ xor %esi,%ebx
+ bswap %eax
+ rol $5,%edi
+ and %r12d,%ebx
+ mov %eax,12(%rsp)
+ add %edi,%ebp
+ xor %esi,%ebx
+ rol $30,%r12d
+ add %ebx,%ebp
+ lea 0x5a827999(%eax,%esi),%edi
+ mov %r12d,%ebx
+ mov 16(%r9),%eax
+ mov %ebp,%esi
+ xor %edx,%ebx
+ bswap %eax
+ rol $5,%esi
+ and %r11d,%ebx
+ mov %eax,16(%rsp)
+ add %esi,%edi
+ xor %edx,%ebx
+ rol $30,%r11d
+ add %ebx,%edi
+ lea 0x5a827999(%eax,%edx),%esi
+ mov %r11d,%ebx
+ mov 20(%r9),%eax
+ mov %edi,%edx
+ xor %r12d,%ebx
+ bswap %eax
+ rol $5,%edx
+ and %ebp,%ebx
+ mov %eax,20(%rsp)
+ add %edx,%esi
+ xor %r12d,%ebx
+ rol $30,%ebp
+ add %ebx,%esi
+ lea 0x5a827999(%eax,%r12d),%edx
+ mov %ebp,%ebx
+ mov 24(%r9),%eax
+ mov %esi,%r12d
+ xor %r11d,%ebx
+ bswap %eax
+ rol $5,%r12d
+ and %edi,%ebx
+ mov %eax,24(%rsp)
+ add %r12d,%edx
+ xor %r11d,%ebx
+ rol $30,%edi
+ add %ebx,%edx
+ lea 0x5a827999(%eax,%r11d),%r12d
+ mov %edi,%ebx
+ mov 28(%r9),%eax
+ mov %edx,%r11d
+ xor %ebp,%ebx
+ bswap %eax
+ rol $5,%r11d
+ and %esi,%ebx
+ mov %eax,28(%rsp)
+ add %r11d,%r12d
+ xor %ebp,%ebx
+ rol $30,%esi
+ add %ebx,%r12d
+ lea 0x5a827999(%eax,%ebp),%r11d
+ mov %esi,%ebx
+ mov 32(%r9),%eax
+ mov %r12d,%ebp
+ xor %edi,%ebx
+ bswap %eax
+ rol $5,%ebp
+ and %edx,%ebx
+ mov %eax,32(%rsp)
+ add %ebp,%r11d
+ xor %edi,%ebx
+ rol $30,%edx
+ add %ebx,%r11d
+ lea 0x5a827999(%eax,%edi),%ebp
+ mov %edx,%ebx
+ mov 36(%r9),%eax
+ mov %r11d,%edi
+ xor %esi,%ebx
+ bswap %eax
+ rol $5,%edi
+ and %r12d,%ebx
+ mov %eax,36(%rsp)
+ add %edi,%ebp
+ xor %esi,%ebx
+ rol $30,%r12d
+ add %ebx,%ebp
+ lea 0x5a827999(%eax,%esi),%edi
+ mov %r12d,%ebx
+ mov 40(%r9),%eax
+ mov %ebp,%esi
+ xor %edx,%ebx
+ bswap %eax
+ rol $5,%esi
+ and %r11d,%ebx
+ mov %eax,40(%rsp)
+ add %esi,%edi
+ xor %edx,%ebx
+ rol $30,%r11d
+ add %ebx,%edi
+ lea 0x5a827999(%eax,%edx),%esi
+ mov %r11d,%ebx
+ mov 44(%r9),%eax
+ mov %edi,%edx
+ xor %r12d,%ebx
+ bswap %eax
+ rol $5,%edx
+ and %ebp,%ebx
+ mov %eax,44(%rsp)
+ add %edx,%esi
+ xor %r12d,%ebx
+ rol $30,%ebp
+ add %ebx,%esi
+ lea 0x5a827999(%eax,%r12d),%edx
+ mov %ebp,%ebx
+ mov 48(%r9),%eax
+ mov %esi,%r12d
+ xor %r11d,%ebx
+ bswap %eax
+ rol $5,%r12d
+ and %edi,%ebx
+ mov %eax,48(%rsp)
+ add %r12d,%edx
+ xor %r11d,%ebx
+ rol $30,%edi
+ add %ebx,%edx
+ lea 0x5a827999(%eax,%r11d),%r12d
+ mov %edi,%ebx
+ mov 52(%r9),%eax
+ mov %edx,%r11d
+ xor %ebp,%ebx
+ bswap %eax
+ rol $5,%r11d
+ and %esi,%ebx
+ mov %eax,52(%rsp)
+ add %r11d,%r12d
+ xor %ebp,%ebx
+ rol $30,%esi
+ add %ebx,%r12d
+ lea 0x5a827999(%eax,%ebp),%r11d
+ mov %esi,%ebx
+ mov 56(%r9),%eax
+ mov %r12d,%ebp
+ xor %edi,%ebx
+ bswap %eax
+ rol $5,%ebp
+ and %edx,%ebx
+ mov %eax,56(%rsp)
+ add %ebp,%r11d
+ xor %edi,%ebx
+ rol $30,%edx
+ add %ebx,%r11d
+ lea 0x5a827999(%eax,%edi),%ebp
+ mov %edx,%ebx
+ mov 60(%r9),%eax
+ mov %r11d,%edi
+ xor %esi,%ebx
+ bswap %eax
+ rol $5,%edi
+ and %r12d,%ebx
+ mov %eax,60(%rsp)
+ add %edi,%ebp
+ xor %esi,%ebx
+ rol $30,%r12d
+ add %ebx,%ebp
+ lea 0x5a827999(%eax,%esi),%edi
+ mov 0(%rsp),%eax
+ mov %r12d,%ebx
+ mov %ebp,%esi
+ xor 8(%rsp),%eax
+ xor %edx,%ebx
+ rol $5,%esi
+ xor 32(%rsp),%eax
+ and %r11d,%ebx
+ add %esi,%edi
+ xor 52(%rsp),%eax
+ xor %edx,%ebx
+ rol $30,%r11d
+ add %ebx,%edi
+ rol $1,%eax
+ mov %eax,0(%rsp)
+ lea 0x5a827999(%eax,%edx),%esi
+ mov 4(%rsp),%eax
+ mov %r11d,%ebx
+ mov %edi,%edx
+ xor 12(%rsp),%eax
+ xor %r12d,%ebx
+ rol $5,%edx
+ xor 36(%rsp),%eax
+ and %ebp,%ebx
+ add %edx,%esi
+ xor 56(%rsp),%eax
+ xor %r12d,%ebx
+ rol $30,%ebp
+ add %ebx,%esi
+ rol $1,%eax
+ mov %eax,4(%rsp)
+ lea 0x5a827999(%eax,%r12d),%edx
+ mov 8(%rsp),%eax
+ mov %ebp,%ebx
+ mov %esi,%r12d
+ xor 16(%rsp),%eax
+ xor %r11d,%ebx
+ rol $5,%r12d
+ xor 40(%rsp),%eax
+ and %edi,%ebx
+ add %r12d,%edx
+ xor 60(%rsp),%eax
+ xor %r11d,%ebx
+ rol $30,%edi
+ add %ebx,%edx
+ rol $1,%eax
+ mov %eax,8(%rsp)
+ lea 0x5a827999(%eax,%r11d),%r12d
+ mov 12(%rsp),%eax
+ mov %edi,%ebx
+ mov %edx,%r11d
+ xor 20(%rsp),%eax
+ xor %ebp,%ebx
+ rol $5,%r11d
+ xor 44(%rsp),%eax
+ and %esi,%ebx
+ add %r11d,%r12d
+ xor 0(%rsp),%eax
+ xor %ebp,%ebx
+ rol $30,%esi
+ add %ebx,%r12d
+ rol $1,%eax
+ mov %eax,12(%rsp)
+ lea 0x5a827999(%eax,%ebp),%r11d
+ mov 16(%rsp),%eax
+ mov %esi,%ebx
+ mov %r12d,%ebp
+ xor 24(%rsp),%eax
+ xor %edi,%ebx
+ rol $5,%ebp
+ xor 48(%rsp),%eax
+ and %edx,%ebx
+ add %ebp,%r11d
+ xor 4(%rsp),%eax
+ xor %edi,%ebx
+ rol $30,%edx
+ add %ebx,%r11d
+ rol $1,%eax
+ mov %eax,16(%rsp)
+ lea 0x6ed9eba1(%eax,%edi),%ebp
+ mov 20(%rsp),%eax
+ mov %edx,%ebx
+ mov %r11d,%edi
+ xor 28(%rsp),%eax
+ xor %r12d,%ebx
+ rol $5,%edi
+ xor 52(%rsp),%eax
+ xor %esi,%ebx
+ add %edi,%ebp
+ xor 8(%rsp),%eax
+ rol $30,%r12d
+ add %ebx,%ebp
+ rol $1,%eax
+ mov %eax,20(%rsp)
+ lea 0x6ed9eba1(%eax,%esi),%edi
+ mov 24(%rsp),%eax
+ mov %r12d,%ebx
+ mov %ebp,%esi
+ xor 32(%rsp),%eax
+ xor %r11d,%ebx
+ rol $5,%esi
+ xor 56(%rsp),%eax
+ xor %edx,%ebx
+ add %esi,%edi
+ xor 12(%rsp),%eax
+ rol $30,%r11d
+ add %ebx,%edi
+ rol $1,%eax
+ mov %eax,24(%rsp)
+ lea 0x6ed9eba1(%eax,%edx),%esi
+ mov 28(%rsp),%eax
+ mov %r11d,%ebx
+ mov %edi,%edx
+ xor 36(%rsp),%eax
+ xor %ebp,%ebx
+ rol $5,%edx
+ xor 60(%rsp),%eax
+ xor %r12d,%ebx
+ add %edx,%esi
+ xor 16(%rsp),%eax
+ rol $30,%ebp
+ add %ebx,%esi
+ rol $1,%eax
+ mov %eax,28(%rsp)
+ lea 0x6ed9eba1(%eax,%r12d),%edx
+ mov 32(%rsp),%eax
+ mov %ebp,%ebx
+ mov %esi,%r12d
+ xor 40(%rsp),%eax
+ xor %edi,%ebx
+ rol $5,%r12d
+ xor 0(%rsp),%eax
+ xor %r11d,%ebx
+ add %r12d,%edx
+ xor 20(%rsp),%eax
+ rol $30,%edi
+ add %ebx,%edx
+ rol $1,%eax
+ mov %eax,32(%rsp)
+ lea 0x6ed9eba1(%eax,%r11d),%r12d
+ mov 36(%rsp),%eax
+ mov %edi,%ebx
+ mov %edx,%r11d
+ xor 44(%rsp),%eax
+ xor %esi,%ebx
+ rol $5,%r11d
+ xor 4(%rsp),%eax
+ xor %ebp,%ebx
+ add %r11d,%r12d
+ xor 24(%rsp),%eax
+ rol $30,%esi
+ add %ebx,%r12d
+ rol $1,%eax
+ mov %eax,36(%rsp)
+ lea 0x6ed9eba1(%eax,%ebp),%r11d
+ mov 40(%rsp),%eax
+ mov %esi,%ebx
+ mov %r12d,%ebp
+ xor 48(%rsp),%eax
+ xor %edx,%ebx
+ rol $5,%ebp
+ xor 8(%rsp),%eax
+ xor %edi,%ebx
+ add %ebp,%r11d
+ xor 28(%rsp),%eax
+ rol $30,%edx
+ add %ebx,%r11d
+ rol $1,%eax
+ mov %eax,40(%rsp)
+ lea 0x6ed9eba1(%eax,%edi),%ebp
+ mov 44(%rsp),%eax
+ mov %edx,%ebx
+ mov %r11d,%edi
+ xor 52(%rsp),%eax
+ xor %r12d,%ebx
+ rol $5,%edi
+ xor 12(%rsp),%eax
+ xor %esi,%ebx
+ add %edi,%ebp
+ xor 32(%rsp),%eax
+ rol $30,%r12d
+ add %ebx,%ebp
+ rol $1,%eax
+ mov %eax,44(%rsp)
+ lea 0x6ed9eba1(%eax,%esi),%edi
+ mov 48(%rsp),%eax
+ mov %r12d,%ebx
+ mov %ebp,%esi
+ xor 56(%rsp),%eax
+ xor %r11d,%ebx
+ rol $5,%esi
+ xor 16(%rsp),%eax
+ xor %edx,%ebx
+ add %esi,%edi
+ xor 36(%rsp),%eax
+ rol $30,%r11d
+ add %ebx,%edi
+ rol $1,%eax
+ mov %eax,48(%rsp)
+ lea 0x6ed9eba1(%eax,%edx),%esi
+ mov 52(%rsp),%eax
+ mov %r11d,%ebx
+ mov %edi,%edx
+ xor 60(%rsp),%eax
+ xor %ebp,%ebx
+ rol $5,%edx
+ xor 20(%rsp),%eax
+ xor %r12d,%ebx
+ add %edx,%esi
+ xor 40(%rsp),%eax
+ rol $30,%ebp
+ add %ebx,%esi
+ rol $1,%eax
+ mov %eax,52(%rsp)
+ lea 0x6ed9eba1(%eax,%r12d),%edx
+ mov 56(%rsp),%eax
+ mov %ebp,%ebx
+ mov %esi,%r12d
+ xor 0(%rsp),%eax
+ xor %edi,%ebx
+ rol $5,%r12d
+ xor 24(%rsp),%eax
+ xor %r11d,%ebx
+ add %r12d,%edx
+ xor 44(%rsp),%eax
+ rol $30,%edi
+ add %ebx,%edx
+ rol $1,%eax
+ mov %eax,56(%rsp)
+ lea 0x6ed9eba1(%eax,%r11d),%r12d
+ mov 60(%rsp),%eax
+ mov %edi,%ebx
+ mov %edx,%r11d
+ xor 4(%rsp),%eax
+ xor %esi,%ebx
+ rol $5,%r11d
+ xor 28(%rsp),%eax
+ xor %ebp,%ebx
+ add %r11d,%r12d
+ xor 48(%rsp),%eax
+ rol $30,%esi
+ add %ebx,%r12d
+ rol $1,%eax
+ mov %eax,60(%rsp)
+ lea 0x6ed9eba1(%eax,%ebp),%r11d
+ mov 0(%rsp),%eax
+ mov %esi,%ebx
+ mov %r12d,%ebp
+ xor 8(%rsp),%eax
+ xor %edx,%ebx
+ rol $5,%ebp
+ xor 32(%rsp),%eax
+ xor %edi,%ebx
+ add %ebp,%r11d
+ xor 52(%rsp),%eax
+ rol $30,%edx
+ add %ebx,%r11d
+ rol $1,%eax
+ mov %eax,0(%rsp)
+ lea 0x6ed9eba1(%eax,%edi),%ebp
+ mov 4(%rsp),%eax
+ mov %edx,%ebx
+ mov %r11d,%edi
+ xor 12(%rsp),%eax
+ xor %r12d,%ebx
+ rol $5,%edi
+ xor 36(%rsp),%eax
+ xor %esi,%ebx
+ add %edi,%ebp
+ xor 56(%rsp),%eax
+ rol $30,%r12d
+ add %ebx,%ebp
+ rol $1,%eax
+ mov %eax,4(%rsp)
+ lea 0x6ed9eba1(%eax,%esi),%edi
+ mov 8(%rsp),%eax
+ mov %r12d,%ebx
+ mov %ebp,%esi
+ xor 16(%rsp),%eax
+ xor %r11d,%ebx
+ rol $5,%esi
+ xor 40(%rsp),%eax
+ xor %edx,%ebx
+ add %esi,%edi
+ xor 60(%rsp),%eax
+ rol $30,%r11d
+ add %ebx,%edi
+ rol $1,%eax
+ mov %eax,8(%rsp)
+ lea 0x6ed9eba1(%eax,%edx),%esi
+ mov 12(%rsp),%eax
+ mov %r11d,%ebx
+ mov %edi,%edx
+ xor 20(%rsp),%eax
+ xor %ebp,%ebx
+ rol $5,%edx
+ xor 44(%rsp),%eax
+ xor %r12d,%ebx
+ add %edx,%esi
+ xor 0(%rsp),%eax
+ rol $30,%ebp
+ add %ebx,%esi
+ rol $1,%eax
+ mov %eax,12(%rsp)
+ lea 0x6ed9eba1(%eax,%r12d),%edx
+ mov 16(%rsp),%eax
+ mov %ebp,%ebx
+ mov %esi,%r12d
+ xor 24(%rsp),%eax
+ xor %edi,%ebx
+ rol $5,%r12d
+ xor 48(%rsp),%eax
+ xor %r11d,%ebx
+ add %r12d,%edx
+ xor 4(%rsp),%eax
+ rol $30,%edi
+ add %ebx,%edx
+ rol $1,%eax
+ mov %eax,16(%rsp)
+ lea 0x6ed9eba1(%eax,%r11d),%r12d
+ mov 20(%rsp),%eax
+ mov %edi,%ebx
+ mov %edx,%r11d
+ xor 28(%rsp),%eax
+ xor %esi,%ebx
+ rol $5,%r11d
+ xor 52(%rsp),%eax
+ xor %ebp,%ebx
+ add %r11d,%r12d
+ xor 8(%rsp),%eax
+ rol $30,%esi
+ add %ebx,%r12d
+ rol $1,%eax
+ mov %eax,20(%rsp)
+ lea 0x6ed9eba1(%eax,%ebp),%r11d
+ mov 24(%rsp),%eax
+ mov %esi,%ebx
+ mov %r12d,%ebp
+ xor 32(%rsp),%eax
+ xor %edx,%ebx
+ rol $5,%ebp
+ xor 56(%rsp),%eax
+ xor %edi,%ebx
+ add %ebp,%r11d
+ xor 12(%rsp),%eax
+ rol $30,%edx
+ add %ebx,%r11d
+ rol $1,%eax
+ mov %eax,24(%rsp)
+ lea 0x6ed9eba1(%eax,%edi),%ebp
+ mov 28(%rsp),%eax
+ mov %edx,%ebx
+ mov %r11d,%edi
+ xor 36(%rsp),%eax
+ xor %r12d,%ebx
+ rol $5,%edi
+ xor 60(%rsp),%eax
+ xor %esi,%ebx
+ add %edi,%ebp
+ xor 16(%rsp),%eax
+ rol $30,%r12d
+ add %ebx,%ebp
+ rol $1,%eax
+ mov %eax,28(%rsp)
+ lea 0x6ed9eba1(%eax,%esi),%edi
+ mov 32(%rsp),%eax
+ mov %r12d,%ebx
+ mov %ebp,%esi
+ xor 40(%rsp),%eax
+ xor %r11d,%ebx
+ rol $5,%esi
+ xor 0(%rsp),%eax
+ xor %edx,%ebx
+ add %esi,%edi
+ xor 20(%rsp),%eax
+ rol $30,%r11d
+ add %ebx,%edi
+ rol $1,%eax
+ mov %eax,32(%rsp)
+ lea -0x70e44324(%eax,%edx),%esi
+ mov 36(%rsp),%eax
+ mov %ebp,%ebx
+ mov %ebp,%ecx
+ xor 44(%rsp),%eax
+ mov %edi,%edx
+ and %r11d,%ebx
+ xor 4(%rsp),%eax
+ or %r11d,%ecx
+ rol $5,%edx
+ xor 24(%rsp),%eax
+ and %r12d,%ecx
+ add %edx,%esi
+ rol $1,%eax
+ or %ecx,%ebx
+ rol $30,%ebp
+ mov %eax,36(%rsp)
+ add %ebx,%esi
+ lea -0x70e44324(%eax,%r12d),%edx
+ mov 40(%rsp),%eax
+ mov %edi,%ebx
+ mov %edi,%ecx
+ xor 48(%rsp),%eax
+ mov %esi,%r12d
+ and %ebp,%ebx
+ xor 8(%rsp),%eax
+ or %ebp,%ecx
+ rol $5,%r12d
+ xor 28(%rsp),%eax
+ and %r11d,%ecx
+ add %r12d,%edx
+ rol $1,%eax
+ or %ecx,%ebx
+ rol $30,%edi
+ mov %eax,40(%rsp)
+ add %ebx,%edx
+ lea -0x70e44324(%eax,%r11d),%r12d
+ mov 44(%rsp),%eax
+ mov %esi,%ebx
+ mov %esi,%ecx
+ xor 52(%rsp),%eax
+ mov %edx,%r11d
+ and %edi,%ebx
+ xor 12(%rsp),%eax
+ or %edi,%ecx
+ rol $5,%r11d
+ xor 32(%rsp),%eax
+ and %ebp,%ecx
+ add %r11d,%r12d
+ rol $1,%eax
+ or %ecx,%ebx
+ rol $30,%esi
+ mov %eax,44(%rsp)
+ add %ebx,%r12d
+ lea -0x70e44324(%eax,%ebp),%r11d
+ mov 48(%rsp),%eax
+ mov %edx,%ebx
+ mov %edx,%ecx
+ xor 56(%rsp),%eax
+ mov %r12d,%ebp
+ and %esi,%ebx
+ xor 16(%rsp),%eax
+ or %esi,%ecx
+ rol $5,%ebp
+ xor 36(%rsp),%eax
+ and %edi,%ecx
+ add %ebp,%r11d
+ rol $1,%eax
+ or %ecx,%ebx
+ rol $30,%edx
+ mov %eax,48(%rsp)
+ add %ebx,%r11d
+ lea -0x70e44324(%eax,%edi),%ebp
+ mov 52(%rsp),%eax
+ mov %r12d,%ebx
+ mov %r12d,%ecx
+ xor 60(%rsp),%eax
+ mov %r11d,%edi
+ and %edx,%ebx
+ xor 20(%rsp),%eax
+ or %edx,%ecx
+ rol $5,%edi
+ xor 40(%rsp),%eax
+ and %esi,%ecx
+ add %edi,%ebp
+ rol $1,%eax
+ or %ecx,%ebx
+ rol $30,%r12d
+ mov %eax,52(%rsp)
+ add %ebx,%ebp
+ lea -0x70e44324(%eax,%esi),%edi
+ mov 56(%rsp),%eax
+ mov %r11d,%ebx
+ mov %r11d,%ecx
+ xor 0(%rsp),%eax
+ mov %ebp,%esi
+ and %r12d,%ebx
+ xor 24(%rsp),%eax
+ or %r12d,%ecx
+ rol $5,%esi
+ xor 44(%rsp),%eax
+ and %edx,%ecx
+ add %esi,%edi
+ rol $1,%eax
+ or %ecx,%ebx
+ rol $30,%r11d
+ mov %eax,56(%rsp)
+ add %ebx,%edi
+ lea -0x70e44324(%eax,%edx),%esi
+ mov 60(%rsp),%eax
+ mov %ebp,%ebx
+ mov %ebp,%ecx
+ xor 4(%rsp),%eax
+ mov %edi,%edx
+ and %r11d,%ebx
+ xor 28(%rsp),%eax
+ or %r11d,%ecx
+ rol $5,%edx
+ xor 48(%rsp),%eax
+ and %r12d,%ecx
+ add %edx,%esi
+ rol $1,%eax
+ or %ecx,%ebx
+ rol $30,%ebp
+ mov %eax,60(%rsp)
+ add %ebx,%esi
+ lea -0x70e44324(%eax,%r12d),%edx
+ mov 0(%rsp),%eax
+ mov %edi,%ebx
+ mov %edi,%ecx
+ xor 8(%rsp),%eax
+ mov %esi,%r12d
+ and %ebp,%ebx
+ xor 32(%rsp),%eax
+ or %ebp,%ecx
+ rol $5,%r12d
+ xor 52(%rsp),%eax
+ and %r11d,%ecx
+ add %r12d,%edx
+ rol $1,%eax
+ or %ecx,%ebx
+ rol $30,%edi
+ mov %eax,0(%rsp)
+ add %ebx,%edx
+ lea -0x70e44324(%eax,%r11d),%r12d
+ mov 4(%rsp),%eax
+ mov %esi,%ebx
+ mov %esi,%ecx
+ xor 12(%rsp),%eax
+ mov %edx,%r11d
+ and %edi,%ebx
+ xor 36(%rsp),%eax
+ or %edi,%ecx
+ rol $5,%r11d
+ xor 56(%rsp),%eax
+ and %ebp,%ecx
+ add %r11d,%r12d
+ rol $1,%eax
+ or %ecx,%ebx
+ rol $30,%esi
+ mov %eax,4(%rsp)
+ add %ebx,%r12d
+ lea -0x70e44324(%eax,%ebp),%r11d
+ mov 8(%rsp),%eax
+ mov %edx,%ebx
+ mov %edx,%ecx
+ xor 16(%rsp),%eax
+ mov %r12d,%ebp
+ and %esi,%ebx
+ xor 40(%rsp),%eax
+ or %esi,%ecx
+ rol $5,%ebp
+ xor 60(%rsp),%eax
+ and %edi,%ecx
+ add %ebp,%r11d
+ rol $1,%eax
+ or %ecx,%ebx
+ rol $30,%edx
+ mov %eax,8(%rsp)
+ add %ebx,%r11d
+ lea -0x70e44324(%eax,%edi),%ebp
+ mov 12(%rsp),%eax
+ mov %r12d,%ebx
+ mov %r12d,%ecx
+ xor 20(%rsp),%eax
+ mov %r11d,%edi
+ and %edx,%ebx
+ xor 44(%rsp),%eax
+ or %edx,%ecx
+ rol $5,%edi
+ xor 0(%rsp),%eax
+ and %esi,%ecx
+ add %edi,%ebp
+ rol $1,%eax
+ or %ecx,%ebx
+ rol $30,%r12d
+ mov %eax,12(%rsp)
+ add %ebx,%ebp
+ lea -0x70e44324(%eax,%esi),%edi
+ mov 16(%rsp),%eax
+ mov %r11d,%ebx
+ mov %r11d,%ecx
+ xor 24(%rsp),%eax
+ mov %ebp,%esi
+ and %r12d,%ebx
+ xor 48(%rsp),%eax
+ or %r12d,%ecx
+ rol $5,%esi
+ xor 4(%rsp),%eax
+ and %edx,%ecx
+ add %esi,%edi
+ rol $1,%eax
+ or %ecx,%ebx
+ rol $30,%r11d
+ mov %eax,16(%rsp)
+ add %ebx,%edi
+ lea -0x70e44324(%eax,%edx),%esi
+ mov 20(%rsp),%eax
+ mov %ebp,%ebx
+ mov %ebp,%ecx
+ xor 28(%rsp),%eax
+ mov %edi,%edx
+ and %r11d,%ebx
+ xor 52(%rsp),%eax
+ or %r11d,%ecx
+ rol $5,%edx
+ xor 8(%rsp),%eax
+ and %r12d,%ecx
+ add %edx,%esi
+ rol $1,%eax
+ or %ecx,%ebx
+ rol $30,%ebp
+ mov %eax,20(%rsp)
+ add %ebx,%esi
+ lea -0x70e44324(%eax,%r12d),%edx
+ mov 24(%rsp),%eax
+ mov %edi,%ebx
+ mov %edi,%ecx
+ xor 32(%rsp),%eax
+ mov %esi,%r12d
+ and %ebp,%ebx
+ xor 56(%rsp),%eax
+ or %ebp,%ecx
+ rol $5,%r12d
+ xor 12(%rsp),%eax
+ and %r11d,%ecx
+ add %r12d,%edx
+ rol $1,%eax
+ or %ecx,%ebx
+ rol $30,%edi
+ mov %eax,24(%rsp)
+ add %ebx,%edx
+ lea -0x70e44324(%eax,%r11d),%r12d
+ mov 28(%rsp),%eax
+ mov %esi,%ebx
+ mov %esi,%ecx
+ xor 36(%rsp),%eax
+ mov %edx,%r11d
+ and %edi,%ebx
+ xor 60(%rsp),%eax
+ or %edi,%ecx
+ rol $5,%r11d
+ xor 16(%rsp),%eax
+ and %ebp,%ecx
+ add %r11d,%r12d
+ rol $1,%eax
+ or %ecx,%ebx
+ rol $30,%esi
+ mov %eax,28(%rsp)
+ add %ebx,%r12d
+ lea -0x70e44324(%eax,%ebp),%r11d
+ mov 32(%rsp),%eax
+ mov %edx,%ebx
+ mov %edx,%ecx
+ xor 40(%rsp),%eax
+ mov %r12d,%ebp
+ and %esi,%ebx
+ xor 0(%rsp),%eax
+ or %esi,%ecx
+ rol $5,%ebp
+ xor 20(%rsp),%eax
+ and %edi,%ecx
+ add %ebp,%r11d
+ rol $1,%eax
+ or %ecx,%ebx
+ rol $30,%edx
+ mov %eax,32(%rsp)
+ add %ebx,%r11d
+ lea -0x70e44324(%eax,%edi),%ebp
+ mov 36(%rsp),%eax
+ mov %r12d,%ebx
+ mov %r12d,%ecx
+ xor 44(%rsp),%eax
+ mov %r11d,%edi
+ and %edx,%ebx
+ xor 4(%rsp),%eax
+ or %edx,%ecx
+ rol $5,%edi
+ xor 24(%rsp),%eax
+ and %esi,%ecx
+ add %edi,%ebp
+ rol $1,%eax
+ or %ecx,%ebx
+ rol $30,%r12d
+ mov %eax,36(%rsp)
+ add %ebx,%ebp
+ lea -0x70e44324(%eax,%esi),%edi
+ mov 40(%rsp),%eax
+ mov %r11d,%ebx
+ mov %r11d,%ecx
+ xor 48(%rsp),%eax
+ mov %ebp,%esi
+ and %r12d,%ebx
+ xor 8(%rsp),%eax
+ or %r12d,%ecx
+ rol $5,%esi
+ xor 28(%rsp),%eax
+ and %edx,%ecx
+ add %esi,%edi
+ rol $1,%eax
+ or %ecx,%ebx
+ rol $30,%r11d
+ mov %eax,40(%rsp)
+ add %ebx,%edi
+ lea -0x70e44324(%eax,%edx),%esi
+ mov 44(%rsp),%eax
+ mov %ebp,%ebx
+ mov %ebp,%ecx
+ xor 52(%rsp),%eax
+ mov %edi,%edx
+ and %r11d,%ebx
+ xor 12(%rsp),%eax
+ or %r11d,%ecx
+ rol $5,%edx
+ xor 32(%rsp),%eax
+ and %r12d,%ecx
+ add %edx,%esi
+ rol $1,%eax
+ or %ecx,%ebx
+ rol $30,%ebp
+ mov %eax,44(%rsp)
+ add %ebx,%esi
+ lea -0x70e44324(%eax,%r12d),%edx
+ mov 48(%rsp),%eax
+ mov %edi,%ebx
+ mov %edi,%ecx
+ xor 56(%rsp),%eax
+ mov %esi,%r12d
+ and %ebp,%ebx
+ xor 16(%rsp),%eax
+ or %ebp,%ecx
+ rol $5,%r12d
+ xor 36(%rsp),%eax
+ and %r11d,%ecx
+ add %r12d,%edx
+ rol $1,%eax
+ or %ecx,%ebx
+ rol $30,%edi
+ mov %eax,48(%rsp)
+ add %ebx,%edx
+ lea -0x359d3e2a(%eax,%r11d),%r12d
+ mov 52(%rsp),%eax
+ mov %edi,%ebx
+ mov %edx,%r11d
+ xor 60(%rsp),%eax
+ xor %esi,%ebx
+ rol $5,%r11d
+ xor 20(%rsp),%eax
+ xor %ebp,%ebx
+ add %r11d,%r12d
+ xor 40(%rsp),%eax
+ rol $30,%esi
+ add %ebx,%r12d
+ rol $1,%eax
+ mov %eax,52(%rsp)
+ lea -0x359d3e2a(%eax,%ebp),%r11d
+ mov 56(%rsp),%eax
+ mov %esi,%ebx
+ mov %r12d,%ebp
+ xor 0(%rsp),%eax
+ xor %edx,%ebx
+ rol $5,%ebp
+ xor 24(%rsp),%eax
+ xor %edi,%ebx
+ add %ebp,%r11d
+ xor 44(%rsp),%eax
+ rol $30,%edx
+ add %ebx,%r11d
+ rol $1,%eax
+ mov %eax,56(%rsp)
+ lea -0x359d3e2a(%eax,%edi),%ebp
+ mov 60(%rsp),%eax
+ mov %edx,%ebx
+ mov %r11d,%edi
+ xor 4(%rsp),%eax
+ xor %r12d,%ebx
+ rol $5,%edi
+ xor 28(%rsp),%eax
+ xor %esi,%ebx
+ add %edi,%ebp
+ xor 48(%rsp),%eax
+ rol $30,%r12d
+ add %ebx,%ebp
+ rol $1,%eax
+ mov %eax,60(%rsp)
+ lea -0x359d3e2a(%eax,%esi),%edi
+ mov 0(%rsp),%eax
+ mov %r12d,%ebx
+ mov %ebp,%esi
+ xor 8(%rsp),%eax
+ xor %r11d,%ebx
+ rol $5,%esi
+ xor 32(%rsp),%eax
+ xor %edx,%ebx
+ add %esi,%edi
+ xor 52(%rsp),%eax
+ rol $30,%r11d
+ add %ebx,%edi
+ rol $1,%eax
+ mov %eax,0(%rsp)
+ lea -0x359d3e2a(%eax,%edx),%esi
+ mov 4(%rsp),%eax
+ mov %r11d,%ebx
+ mov %edi,%edx
+ xor 12(%rsp),%eax
+ xor %ebp,%ebx
+ rol $5,%edx
+ xor 36(%rsp),%eax
+ xor %r12d,%ebx
+ add %edx,%esi
+ xor 56(%rsp),%eax
+ rol $30,%ebp
+ add %ebx,%esi
+ rol $1,%eax
+ mov %eax,4(%rsp)
+ lea -0x359d3e2a(%eax,%r12d),%edx
+ mov 8(%rsp),%eax
+ mov %ebp,%ebx
+ mov %esi,%r12d
+ xor 16(%rsp),%eax
+ xor %edi,%ebx
+ rol $5,%r12d
+ xor 40(%rsp),%eax
+ xor %r11d,%ebx
+ add %r12d,%edx
+ xor 60(%rsp),%eax
+ rol $30,%edi
+ add %ebx,%edx
+ rol $1,%eax
+ mov %eax,8(%rsp)
+ lea -0x359d3e2a(%eax,%r11d),%r12d
+ mov 12(%rsp),%eax
+ mov %edi,%ebx
+ mov %edx,%r11d
+ xor 20(%rsp),%eax
+ xor %esi,%ebx
+ rol $5,%r11d
+ xor 44(%rsp),%eax
+ xor %ebp,%ebx
+ add %r11d,%r12d
+ xor 0(%rsp),%eax
+ rol $30,%esi
+ add %ebx,%r12d
+ rol $1,%eax
+ mov %eax,12(%rsp)
+ lea -0x359d3e2a(%eax,%ebp),%r11d
+ mov 16(%rsp),%eax
+ mov %esi,%ebx
+ mov %r12d,%ebp
+ xor 24(%rsp),%eax
+ xor %edx,%ebx
+ rol $5,%ebp
+ xor 48(%rsp),%eax
+ xor %edi,%ebx
+ add %ebp,%r11d
+ xor 4(%rsp),%eax
+ rol $30,%edx
+ add %ebx,%r11d
+ rol $1,%eax
+ mov %eax,16(%rsp)
+ lea -0x359d3e2a(%eax,%edi),%ebp
+ mov 20(%rsp),%eax
+ mov %edx,%ebx
+ mov %r11d,%edi
+ xor 28(%rsp),%eax
+ xor %r12d,%ebx
+ rol $5,%edi
+ xor 52(%rsp),%eax
+ xor %esi,%ebx
+ add %edi,%ebp
+ xor 8(%rsp),%eax
+ rol $30,%r12d
+ add %ebx,%ebp
+ rol $1,%eax
+ mov %eax,20(%rsp)
+ lea -0x359d3e2a(%eax,%esi),%edi
+ mov 24(%rsp),%eax
+ mov %r12d,%ebx
+ mov %ebp,%esi
+ xor 32(%rsp),%eax
+ xor %r11d,%ebx
+ rol $5,%esi
+ xor 56(%rsp),%eax
+ xor %edx,%ebx
+ add %esi,%edi
+ xor 12(%rsp),%eax
+ rol $30,%r11d
+ add %ebx,%edi
+ rol $1,%eax
+ mov %eax,24(%rsp)
+ lea -0x359d3e2a(%eax,%edx),%esi
+ mov 28(%rsp),%eax
+ mov %r11d,%ebx
+ mov %edi,%edx
+ xor 36(%rsp),%eax
+ xor %ebp,%ebx
+ rol $5,%edx
+ xor 60(%rsp),%eax
+ xor %r12d,%ebx
+ add %edx,%esi
+ xor 16(%rsp),%eax
+ rol $30,%ebp
+ add %ebx,%esi
+ rol $1,%eax
+ mov %eax,28(%rsp)
+ lea -0x359d3e2a(%eax,%r12d),%edx
+ mov 32(%rsp),%eax
+ mov %ebp,%ebx
+ mov %esi,%r12d
+ xor 40(%rsp),%eax
+ xor %edi,%ebx
+ rol $5,%r12d
+ xor 0(%rsp),%eax
+ xor %r11d,%ebx
+ add %r12d,%edx
+ xor 20(%rsp),%eax
+ rol $30,%edi
+ add %ebx,%edx
+ rol $1,%eax
+ mov %eax,32(%rsp)
+ lea -0x359d3e2a(%eax,%r11d),%r12d
+ mov 36(%rsp),%eax
+ mov %edi,%ebx
+ mov %edx,%r11d
+ xor 44(%rsp),%eax
+ xor %esi,%ebx
+ rol $5,%r11d
+ xor 4(%rsp),%eax
+ xor %ebp,%ebx
+ add %r11d,%r12d
+ xor 24(%rsp),%eax
+ rol $30,%esi
+ add %ebx,%r12d
+ rol $1,%eax
+ mov %eax,36(%rsp)
+ lea -0x359d3e2a(%eax,%ebp),%r11d
+ mov 40(%rsp),%eax
+ mov %esi,%ebx
+ mov %r12d,%ebp
+ xor 48(%rsp),%eax
+ xor %edx,%ebx
+ rol $5,%ebp
+ xor 8(%rsp),%eax
+ xor %edi,%ebx
+ add %ebp,%r11d
+ xor 28(%rsp),%eax
+ rol $30,%edx
+ add %ebx,%r11d
+ rol $1,%eax
+ mov %eax,40(%rsp)
+ lea -0x359d3e2a(%eax,%edi),%ebp
+ mov 44(%rsp),%eax
+ mov %edx,%ebx
+ mov %r11d,%edi
+ xor 52(%rsp),%eax
+ xor %r12d,%ebx
+ rol $5,%edi
+ xor 12(%rsp),%eax
+ xor %esi,%ebx
+ add %edi,%ebp
+ xor 32(%rsp),%eax
+ rol $30,%r12d
+ add %ebx,%ebp
+ rol $1,%eax
+ mov %eax,44(%rsp)
+ lea -0x359d3e2a(%eax,%esi),%edi
+ mov 48(%rsp),%eax
+ mov %r12d,%ebx
+ mov %ebp,%esi
+ xor 56(%rsp),%eax
+ xor %r11d,%ebx
+ rol $5,%esi
+ xor 16(%rsp),%eax
+ xor %edx,%ebx
+ add %esi,%edi
+ xor 36(%rsp),%eax
+ rol $30,%r11d
+ add %ebx,%edi
+ rol $1,%eax
+ mov %eax,48(%rsp)
+ lea -0x359d3e2a(%eax,%edx),%esi
+ mov 52(%rsp),%eax
+ mov %r11d,%ebx
+ mov %edi,%edx
+ xor 60(%rsp),%eax
+ xor %ebp,%ebx
+ rol $5,%edx
+ xor 20(%rsp),%eax
+ xor %r12d,%ebx
+ add %edx,%esi
+ xor 40(%rsp),%eax
+ rol $30,%ebp
+ add %ebx,%esi
+ rol $1,%eax
+ lea -0x359d3e2a(%eax,%r12d),%edx
+ mov 56(%rsp),%eax
+ mov %ebp,%ebx
+ mov %esi,%r12d
+ xor 0(%rsp),%eax
+ xor %edi,%ebx
+ rol $5,%r12d
+ xor 24(%rsp),%eax
+ xor %r11d,%ebx
+ add %r12d,%edx
+ xor 44(%rsp),%eax
+ rol $30,%edi
+ add %ebx,%edx
+ rol $1,%eax
+ lea -0x359d3e2a(%eax,%r11d),%r12d
+ mov 60(%rsp),%eax
+ mov %edi,%ebx
+ mov %edx,%r11d
+ xor 4(%rsp),%eax
+ xor %esi,%ebx
+ rol $5,%r11d
+ xor 28(%rsp),%eax
+ xor %ebp,%ebx
+ add %r11d,%r12d
+ xor 48(%rsp),%eax
+ rol $30,%esi
+ add %ebx,%r12d
+ rol $1,%eax
+ lea -0x359d3e2a(%eax,%ebp),%r11d
+ mov %esi,%ebx
+ mov %r12d,%ebp
+ xor %edx,%ebx
+ rol $5,%ebp
+ xor %edi,%ebx
+ add %ebp,%r11d
+ rol $30,%edx
+ add %ebx,%r11d
+ // Update and save state information in SHA-1 context
+ add 0(%r8),%r11d
+ add 4(%r8),%r12d
+ add 8(%r8),%edx
+ add 12(%r8),%esi
+ add 16(%r8),%edi
+ mov %r11d,0(%r8)
+ mov %r12d,4(%r8)
+ mov %edx,8(%r8)
+ mov %esi,12(%r8)
+ mov %edi,16(%r8)
+
+ xchg %r11d,%edx # mov %r11d,%edx
+ xchg %r12d,%esi # mov %r12d,%esi
+ xchg %r11d,%edi # mov %edx,%edi
+ xchg %r12d,%ebp # mov %esi,%ebp
+ # mov %edi,%r11d
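+	# The xchg sequence above moves the updated state a..e back into
+	# %edx, %esi, %edi, %ebp, %r11d, the register assignment expected at
+	# the top of .Lloop.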
+ lea 64(%r9),%r9
+ sub $1,%r10
+ jnz .Lloop
+ mov 64(%rsp),%rsp
+ pop %r12
+ pop %rbp
+ pop %rbx
+ ret
+SET_SIZE(sha1_block_data_order)
+
+.data
+.asciz "SHA1 block transform for x86_64, CRYPTOGAMS by <appro@openssl.org>"
+
+#endif /* lint || __lint */
+
+#ifdef __ELF__
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/sys/contrib/openzfs/module/icp/asm-x86_64/sha2/sha256_impl.S b/sys/contrib/openzfs/module/icp/asm-x86_64/sha2/sha256_impl.S
new file mode 100644
index 000000000000..766b75355f0b
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/asm-x86_64/sha2/sha256_impl.S
@@ -0,0 +1,2063 @@
+/*
+ * ====================================================================
+ * Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+ * project. Rights for redistribution and usage in source and binary
+ * forms are granted according to the OpenSSL license.
+ * ====================================================================
+ *
+ * sha256/512_block procedure for x86_64.
+ *
+ * 40% improvement over compiler-generated code on Opteron. On EM64T
+ * sha256 was observed to run >80% faster and sha512 >40% faster. No magical
+ * tricks, just straight implementation... I really wonder why gcc
+ * [being armed with inline assembler] fails to generate as fast code.
+ * The only thing which is cool about this module is that it's the very
+ * same instruction sequence used for both SHA-256 and SHA-512. In the
+ * former case the instructions operate on 32-bit operands, while in the
+ * latter on 64-bit ones. All I had to do was get one flavor right;
+ * the other one passed the test right away:-)
+ *
+ * sha256_block runs in ~1005 cycles on Opteron, which gives you
+ * asymptotic performance of 64*1000/1005=63.7MBps times CPU clock
+ * frequency in GHz. sha512_block runs in ~1275 cycles, which results
+ * in 128*1000/1275=100MBps per GHz. Is there room for improvement?
+ * Well, if you compare it to the IA-64 implementation, which maintains
+ * X[16] in the register bank[!], tends to 4 instructions per CPU clock
+ * cycle and runs in 1003 cycles, 1275 is a very good result for the
+ * 3-way issue Opteron pipeline with X[16] maintained in memory. So *if*
+ * there is a way to improve it, *then* the only way would be to try to
+ * offload X[16] updates to SSE unit, but that would require "deeper"
+ * loop unroll, which in turn would naturally cause size blow-up, not
+ * to mention increased complexity! And once again, only *if* it's
+ * actually possible to noticeably improve overall ILP, instruction
+ * level parallelism, on a given CPU implementation in this case.
+ *
+ * Special note on Intel EM64T. While the Opteron CPU exhibits a perfect
+ * performance ratio of 1.5 between the 64- and 32-bit flavors [see above],
+ * [currently available] EM64T CPUs apparently are far from it. On the
+ * contrary, the 64-bit version, sha512_block, is ~30% *slower* than the
+ * 32-bit sha256_block:-( This is presumably because 64-bit shifts/rotates
+ * apparently are not atomic instructions, but are implemented in microcode.
+ */
+
+/*
+ * OpenSolaris OS modifications
+ *
+ * Sun elects to use this software under the BSD license.
+ *
+ * This source originates from OpenSSL file sha512-x86_64.pl at
+ * ftp://ftp.openssl.org/snapshot/openssl-0.9.8-stable-SNAP-20080131.tar.gz
+ * (presumably for future OpenSSL release 0.9.8h), with these changes:
+ *
+ * 1. Added perl "use strict" and declared variables.
+ *
+ * 2. Added OpenSolaris ENTRY_NP/SET_SIZE macros from
+ * /usr/include/sys/asm_linkage.h, .ident keywords, and lint(1B) guards.
+ *
+ * 3. Removed x86_64-xlate.pl script (not needed for as(1) or gas(1)
+ * assemblers). Replaced the .picmeup macro with assembler code.
+ *
+ * 4. Added 8 to $ctx, as OpenSolaris OS has an extra 4-byte field, "algotype",
+ * at the beginning of SHA2_CTX (the next field is 8-byte aligned).
+ */
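+
+/*
+ * Illustrative sketch (not taken from the original sources): item 4 above
+ * implies a SHA2_CTX layout along these lines; the authoritative definition
+ * lives in the OpenSolaris sha2.h and may differ in detail:
+ *
+ *	typedef struct {
+ *		uint32_t algotype;	// 4-byte field at offset 0
+ *		union {
+ *			uint32_t s32[8];	// SHA-256 state
+ *			uint64_t s64[8];	// SHA-512 state
+ *		} state;			// 8-byte aligned, i.e. at offset 8
+ *		// bit count and input buffer fields follow
+ *	} SHA2_CTX;
+ *
+ * which is why the code below does "add $8,%rdi": it steps over algotype
+ * (plus alignment padding) so that %rdi points at the hash state words.
+ */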
+
+/*
+ * This file was generated by a perl script (sha512-x86_64.pl) that was
+ * used to generate sha256 and sha512 variants from the same code base.
+ * The comments from the original file have been pasted above.
+ */
+
+#if defined(lint) || defined(__lint)
+#include <sys/stdint.h>
+#include <sha2/sha2.h>
+
+/* ARGSUSED */
+void
+SHA256TransformBlocks(SHA2_CTX *ctx, const void *in, size_t num)
+{
+}
+
+
+#else
+#define _ASM
+#include <sys/asm_linkage.h>
+
+ENTRY_NP(SHA256TransformBlocks)
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ mov %rsp,%rbp # copy %rsp
+ shl $4,%rdx # num*16
+ sub $16*4+4*8,%rsp
+ lea (%rsi,%rdx,4),%rdx # inp+num*16*4
+ and $-64,%rsp # align stack frame
+ add $8,%rdi # Skip OpenSolaris field, "algotype"
+ mov %rdi,16*4+0*8(%rsp) # save ctx, 1st arg
+ mov %rsi,16*4+1*8(%rsp) # save inp, 2nd arg
+ mov %rdx,16*4+2*8(%rsp) # save end pointer, "3rd" arg
+ mov %rbp,16*4+3*8(%rsp) # save copy of %rsp
+
+ #.picmeup %rbp
+ # The .picmeup pseudo-directive, from perlasm/x86_64_xlate.pl, puts
+ # the address of the "next" instruction into the target register
+ # (%rbp). This generates these 2 instructions:
+ lea .Llea(%rip),%rbp
+ #nop # .picmeup generates a nop for mod 8 alignment--not needed here
+
+.Llea:
+ lea K256-.(%rbp),%rbp
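+	# %rbp now holds the address of the K256 round-constant table; the
+	# rounds below fetch K[round] via (%rbp,%rdi,4).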
+
+ mov 4*0(%rdi),%eax
+ mov 4*1(%rdi),%ebx
+ mov 4*2(%rdi),%ecx
+ mov 4*3(%rdi),%edx
+ mov 4*4(%rdi),%r8d
+ mov 4*5(%rdi),%r9d
+ mov 4*6(%rdi),%r10d
+ mov 4*7(%rdi),%r11d
+ jmp .Lloop
+
+.align 16
+.Lloop:
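+	# Each iteration of .Lloop compresses one 64-byte block. %rdi counts
+	# the round number and indexes K256 via (%rbp,%rdi,4); rounds 0-15
+	# below load the big-endian message words directly from (%rsi).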
+ xor %rdi,%rdi
+ mov 4*0(%rsi),%r12d
+ bswap %r12d
+ mov %r8d,%r13d
+ mov %r8d,%r14d
+ mov %r9d,%r15d
+
+ ror $6,%r13d
+ ror $11,%r14d
+ xor %r10d,%r15d # f^g
+
+ xor %r14d,%r13d
+ ror $14,%r14d
+ and %r8d,%r15d # (f^g)&e
+ mov %r12d,0(%rsp)
+
+ xor %r14d,%r13d # Sigma1(e)
+ xor %r10d,%r15d # Ch(e,f,g)=((f^g)&e)^g
+ add %r11d,%r12d # T1+=h
+
+ mov %eax,%r11d
+ add %r13d,%r12d # T1+=Sigma1(e)
+
+ add %r15d,%r12d # T1+=Ch(e,f,g)
+ mov %eax,%r13d
+ mov %eax,%r14d
+
+ ror $2,%r11d
+ ror $13,%r13d
+ mov %eax,%r15d
+ add (%rbp,%rdi,4),%r12d # T1+=K[round]
+
+ xor %r13d,%r11d
+ ror $9,%r13d
+ or %ecx,%r14d # a|c
+
+ xor %r13d,%r11d # h=Sigma0(a)
+ and %ecx,%r15d # a&c
+ add %r12d,%edx # d+=T1
+
+ and %ebx,%r14d # (a|c)&b
+ add %r12d,%r11d # h+=T1
+
+ or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14d,%r11d # h+=Maj(a,b,c)
+ mov 4*1(%rsi),%r12d
+ bswap %r12d
+ mov %edx,%r13d
+ mov %edx,%r14d
+ mov %r8d,%r15d
+
+ ror $6,%r13d
+ ror $11,%r14d
+ xor %r9d,%r15d # f^g
+
+ xor %r14d,%r13d
+ ror $14,%r14d
+ and %edx,%r15d # (f^g)&e
+ mov %r12d,4(%rsp)
+
+ xor %r14d,%r13d # Sigma1(e)
+ xor %r9d,%r15d # Ch(e,f,g)=((f^g)&e)^g
+ add %r10d,%r12d # T1+=h
+
+ mov %r11d,%r10d
+ add %r13d,%r12d # T1+=Sigma1(e)
+
+ add %r15d,%r12d # T1+=Ch(e,f,g)
+ mov %r11d,%r13d
+ mov %r11d,%r14d
+
+ ror $2,%r10d
+ ror $13,%r13d
+ mov %r11d,%r15d
+ add (%rbp,%rdi,4),%r12d # T1+=K[round]
+
+ xor %r13d,%r10d
+ ror $9,%r13d
+ or %ebx,%r14d # a|c
+
+ xor %r13d,%r10d # h=Sigma0(a)
+ and %ebx,%r15d # a&c
+ add %r12d,%ecx # d+=T1
+
+ and %eax,%r14d # (a|c)&b
+ add %r12d,%r10d # h+=T1
+
+ or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14d,%r10d # h+=Maj(a,b,c)
+ mov 4*2(%rsi),%r12d
+ bswap %r12d
+ mov %ecx,%r13d
+ mov %ecx,%r14d
+ mov %edx,%r15d
+
+ ror $6,%r13d
+ ror $11,%r14d
+ xor %r8d,%r15d # f^g
+
+ xor %r14d,%r13d
+ ror $14,%r14d
+ and %ecx,%r15d # (f^g)&e
+ mov %r12d,8(%rsp)
+
+ xor %r14d,%r13d # Sigma1(e)
+ xor %r8d,%r15d # Ch(e,f,g)=((f^g)&e)^g
+ add %r9d,%r12d # T1+=h
+
+ mov %r10d,%r9d
+ add %r13d,%r12d # T1+=Sigma1(e)
+
+ add %r15d,%r12d # T1+=Ch(e,f,g)
+ mov %r10d,%r13d
+ mov %r10d,%r14d
+
+ ror $2,%r9d
+ ror $13,%r13d
+ mov %r10d,%r15d
+ add (%rbp,%rdi,4),%r12d # T1+=K[round]
+
+ xor %r13d,%r9d
+ ror $9,%r13d
+ or %eax,%r14d # a|c
+
+ xor %r13d,%r9d # h=Sigma0(a)
+ and %eax,%r15d # a&c
+ add %r12d,%ebx # d+=T1
+
+ and %r11d,%r14d # (a|c)&b
+ add %r12d,%r9d # h+=T1
+
+ or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14d,%r9d # h+=Maj(a,b,c)
+ mov 4*3(%rsi),%r12d
+ bswap %r12d
+ mov %ebx,%r13d
+ mov %ebx,%r14d
+ mov %ecx,%r15d
+
+ ror $6,%r13d
+ ror $11,%r14d
+ xor %edx,%r15d # f^g
+
+ xor %r14d,%r13d
+ ror $14,%r14d
+ and %ebx,%r15d # (f^g)&e
+ mov %r12d,12(%rsp)
+
+ xor %r14d,%r13d # Sigma1(e)
+ xor %edx,%r15d # Ch(e,f,g)=((f^g)&e)^g
+ add %r8d,%r12d # T1+=h
+
+ mov %r9d,%r8d
+ add %r13d,%r12d # T1+=Sigma1(e)
+
+ add %r15d,%r12d # T1+=Ch(e,f,g)
+ mov %r9d,%r13d
+ mov %r9d,%r14d
+
+ ror $2,%r8d
+ ror $13,%r13d
+ mov %r9d,%r15d
+ add (%rbp,%rdi,4),%r12d # T1+=K[round]
+
+ xor %r13d,%r8d
+ ror $9,%r13d
+ or %r11d,%r14d # a|c
+
+ xor %r13d,%r8d # h=Sigma0(a)
+ and %r11d,%r15d # a&c
+ add %r12d,%eax # d+=T1
+
+ and %r10d,%r14d # (a|c)&b
+ add %r12d,%r8d # h+=T1
+
+ or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14d,%r8d # h+=Maj(a,b,c)
+ mov 4*4(%rsi),%r12d
+ bswap %r12d
+ mov %eax,%r13d
+ mov %eax,%r14d
+ mov %ebx,%r15d
+
+ ror $6,%r13d
+ ror $11,%r14d
+ xor %ecx,%r15d # f^g
+
+ xor %r14d,%r13d
+ ror $14,%r14d
+ and %eax,%r15d # (f^g)&e
+ mov %r12d,16(%rsp)
+
+ xor %r14d,%r13d # Sigma1(e)
+ xor %ecx,%r15d # Ch(e,f,g)=((f^g)&e)^g
+ add %edx,%r12d # T1+=h
+
+ mov %r8d,%edx
+ add %r13d,%r12d # T1+=Sigma1(e)
+
+ add %r15d,%r12d # T1+=Ch(e,f,g)
+ mov %r8d,%r13d
+ mov %r8d,%r14d
+
+ ror $2,%edx
+ ror $13,%r13d
+ mov %r8d,%r15d
+ add (%rbp,%rdi,4),%r12d # T1+=K[round]
+
+ xor %r13d,%edx
+ ror $9,%r13d
+ or %r10d,%r14d # a|c
+
+ xor %r13d,%edx # h=Sigma0(a)
+ and %r10d,%r15d # a&c
+ add %r12d,%r11d # d+=T1
+
+ and %r9d,%r14d # (a|c)&b
+ add %r12d,%edx # h+=T1
+
+ or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14d,%edx # h+=Maj(a,b,c)
+ mov 4*5(%rsi),%r12d
+ bswap %r12d
+ mov %r11d,%r13d
+ mov %r11d,%r14d
+ mov %eax,%r15d
+
+ ror $6,%r13d
+ ror $11,%r14d
+ xor %ebx,%r15d # f^g
+
+ xor %r14d,%r13d
+ ror $14,%r14d
+ and %r11d,%r15d # (f^g)&e
+ mov %r12d,20(%rsp)
+
+ xor %r14d,%r13d # Sigma1(e)
+ xor %ebx,%r15d # Ch(e,f,g)=((f^g)&e)^g
+ add %ecx,%r12d # T1+=h
+
+ mov %edx,%ecx
+ add %r13d,%r12d # T1+=Sigma1(e)
+
+ add %r15d,%r12d # T1+=Ch(e,f,g)
+ mov %edx,%r13d
+ mov %edx,%r14d
+
+ ror $2,%ecx
+ ror $13,%r13d
+ mov %edx,%r15d
+ add (%rbp,%rdi,4),%r12d # T1+=K[round]
+
+ xor %r13d,%ecx
+ ror $9,%r13d
+ or %r9d,%r14d # a|c
+
+ xor %r13d,%ecx # h=Sigma0(a)
+ and %r9d,%r15d # a&c
+ add %r12d,%r10d # d+=T1
+
+ and %r8d,%r14d # (a|c)&b
+ add %r12d,%ecx # h+=T1
+
+ or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14d,%ecx # h+=Maj(a,b,c)
+ mov 4*6(%rsi),%r12d
+ bswap %r12d
+ mov %r10d,%r13d
+ mov %r10d,%r14d
+ mov %r11d,%r15d
+
+ ror $6,%r13d
+ ror $11,%r14d
+ xor %eax,%r15d # f^g
+
+ xor %r14d,%r13d
+ ror $14,%r14d
+ and %r10d,%r15d # (f^g)&e
+ mov %r12d,24(%rsp)
+
+ xor %r14d,%r13d # Sigma1(e)
+ xor %eax,%r15d # Ch(e,f,g)=((f^g)&e)^g
+ add %ebx,%r12d # T1+=h
+
+ mov %ecx,%ebx
+ add %r13d,%r12d # T1+=Sigma1(e)
+
+ add %r15d,%r12d # T1+=Ch(e,f,g)
+ mov %ecx,%r13d
+ mov %ecx,%r14d
+
+ ror $2,%ebx
+ ror $13,%r13d
+ mov %ecx,%r15d
+ add (%rbp,%rdi,4),%r12d # T1+=K[round]
+
+ xor %r13d,%ebx
+ ror $9,%r13d
+ or %r8d,%r14d # a|c
+
+ xor %r13d,%ebx # h=Sigma0(a)
+ and %r8d,%r15d # a&c
+ add %r12d,%r9d # d+=T1
+
+ and %edx,%r14d # (a|c)&b
+ add %r12d,%ebx # h+=T1
+
+ or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14d,%ebx # h+=Maj(a,b,c)
+ mov 4*7(%rsi),%r12d
+ bswap %r12d
+ mov %r9d,%r13d
+ mov %r9d,%r14d
+ mov %r10d,%r15d
+
+ ror $6,%r13d
+ ror $11,%r14d
+ xor %r11d,%r15d # f^g
+
+ xor %r14d,%r13d
+ ror $14,%r14d
+ and %r9d,%r15d # (f^g)&e
+ mov %r12d,28(%rsp)
+
+ xor %r14d,%r13d # Sigma1(e)
+ xor %r11d,%r15d # Ch(e,f,g)=((f^g)&e)^g
+ add %eax,%r12d # T1+=h
+
+ mov %ebx,%eax
+ add %r13d,%r12d # T1+=Sigma1(e)
+
+ add %r15d,%r12d # T1+=Ch(e,f,g)
+ mov %ebx,%r13d
+ mov %ebx,%r14d
+
+ ror $2,%eax
+ ror $13,%r13d
+ mov %ebx,%r15d
+ add (%rbp,%rdi,4),%r12d # T1+=K[round]
+
+ xor %r13d,%eax
+ ror $9,%r13d
+ or %edx,%r14d # a|c
+
+ xor %r13d,%eax # h=Sigma0(a)
+ and %edx,%r15d # a&c
+ add %r12d,%r8d # d+=T1
+
+ and %ecx,%r14d # (a|c)&b
+ add %r12d,%eax # h+=T1
+
+ or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14d,%eax # h+=Maj(a,b,c)
+ mov 4*8(%rsi),%r12d
+ bswap %r12d
+ mov %r8d,%r13d
+ mov %r8d,%r14d
+ mov %r9d,%r15d
+
+ ror $6,%r13d
+ ror $11,%r14d
+ xor %r10d,%r15d # f^g
+
+ xor %r14d,%r13d
+ ror $14,%r14d
+ and %r8d,%r15d # (f^g)&e
+ mov %r12d,32(%rsp)
+
+ xor %r14d,%r13d # Sigma1(e)
+ xor %r10d,%r15d # Ch(e,f,g)=((f^g)&e)^g
+ add %r11d,%r12d # T1+=h
+
+ mov %eax,%r11d
+ add %r13d,%r12d # T1+=Sigma1(e)
+
+ add %r15d,%r12d # T1+=Ch(e,f,g)
+ mov %eax,%r13d
+ mov %eax,%r14d
+
+ ror $2,%r11d
+ ror $13,%r13d
+ mov %eax,%r15d
+ add (%rbp,%rdi,4),%r12d # T1+=K[round]
+
+ xor %r13d,%r11d
+ ror $9,%r13d
+ or %ecx,%r14d # a|c
+
+ xor %r13d,%r11d # h=Sigma0(a)
+ and %ecx,%r15d # a&c
+ add %r12d,%edx # d+=T1
+
+ and %ebx,%r14d # (a|c)&b
+ add %r12d,%r11d # h+=T1
+
+ or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14d,%r11d # h+=Maj(a,b,c)
+ mov 4*9(%rsi),%r12d
+ bswap %r12d
+ mov %edx,%r13d
+ mov %edx,%r14d
+ mov %r8d,%r15d
+
+ ror $6,%r13d
+ ror $11,%r14d
+ xor %r9d,%r15d # f^g
+
+ xor %r14d,%r13d
+ ror $14,%r14d
+ and %edx,%r15d # (f^g)&e
+ mov %r12d,36(%rsp)
+
+ xor %r14d,%r13d # Sigma1(e)
+ xor %r9d,%r15d # Ch(e,f,g)=((f^g)&e)^g
+ add %r10d,%r12d # T1+=h
+
+ mov %r11d,%r10d
+ add %r13d,%r12d # T1+=Sigma1(e)
+
+ add %r15d,%r12d # T1+=Ch(e,f,g)
+ mov %r11d,%r13d
+ mov %r11d,%r14d
+
+ ror $2,%r10d
+ ror $13,%r13d
+ mov %r11d,%r15d
+ add (%rbp,%rdi,4),%r12d # T1+=K[round]
+
+ xor %r13d,%r10d
+ ror $9,%r13d
+ or %ebx,%r14d # a|c
+
+ xor %r13d,%r10d # h=Sigma0(a)
+ and %ebx,%r15d # a&c
+ add %r12d,%ecx # d+=T1
+
+ and %eax,%r14d # (a|c)&b
+ add %r12d,%r10d # h+=T1
+
+ or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14d,%r10d # h+=Maj(a,b,c)
+ mov 4*10(%rsi),%r12d
+ bswap %r12d
+ mov %ecx,%r13d
+ mov %ecx,%r14d
+ mov %edx,%r15d
+
+ ror $6,%r13d
+ ror $11,%r14d
+ xor %r8d,%r15d # f^g
+
+ xor %r14d,%r13d
+ ror $14,%r14d
+ and %ecx,%r15d # (f^g)&e
+ mov %r12d,40(%rsp)
+
+ xor %r14d,%r13d # Sigma1(e)
+ xor %r8d,%r15d # Ch(e,f,g)=((f^g)&e)^g
+ add %r9d,%r12d # T1+=h
+
+ mov %r10d,%r9d
+ add %r13d,%r12d # T1+=Sigma1(e)
+
+ add %r15d,%r12d # T1+=Ch(e,f,g)
+ mov %r10d,%r13d
+ mov %r10d,%r14d
+
+ ror $2,%r9d
+ ror $13,%r13d
+ mov %r10d,%r15d
+ add (%rbp,%rdi,4),%r12d # T1+=K[round]
+
+ xor %r13d,%r9d
+ ror $9,%r13d
+ or %eax,%r14d # a|c
+
+ xor %r13d,%r9d # h=Sigma0(a)
+ and %eax,%r15d # a&c
+ add %r12d,%ebx # d+=T1
+
+ and %r11d,%r14d # (a|c)&b
+ add %r12d,%r9d # h+=T1
+
+ or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14d,%r9d # h+=Maj(a,b,c)
+ mov 4*11(%rsi),%r12d
+ bswap %r12d
+ mov %ebx,%r13d
+ mov %ebx,%r14d
+ mov %ecx,%r15d
+
+ ror $6,%r13d
+ ror $11,%r14d
+ xor %edx,%r15d # f^g
+
+ xor %r14d,%r13d
+ ror $14,%r14d
+ and %ebx,%r15d # (f^g)&e
+ mov %r12d,44(%rsp)
+
+ xor %r14d,%r13d # Sigma1(e)
+ xor %edx,%r15d # Ch(e,f,g)=((f^g)&e)^g
+ add %r8d,%r12d # T1+=h
+
+ mov %r9d,%r8d
+ add %r13d,%r12d # T1+=Sigma1(e)
+
+ add %r15d,%r12d # T1+=Ch(e,f,g)
+ mov %r9d,%r13d
+ mov %r9d,%r14d
+
+ ror $2,%r8d
+ ror $13,%r13d
+ mov %r9d,%r15d
+ add (%rbp,%rdi,4),%r12d # T1+=K[round]
+
+ xor %r13d,%r8d
+ ror $9,%r13d
+ or %r11d,%r14d # a|c
+
+ xor %r13d,%r8d # h=Sigma0(a)
+ and %r11d,%r15d # a&c
+ add %r12d,%eax # d+=T1
+
+ and %r10d,%r14d # (a|c)&b
+ add %r12d,%r8d # h+=T1
+
+ or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14d,%r8d # h+=Maj(a,b,c)
+ mov 4*12(%rsi),%r12d
+ bswap %r12d
+ mov %eax,%r13d
+ mov %eax,%r14d
+ mov %ebx,%r15d
+
+ ror $6,%r13d
+ ror $11,%r14d
+ xor %ecx,%r15d # f^g
+
+ xor %r14d,%r13d
+ ror $14,%r14d
+ and %eax,%r15d # (f^g)&e
+ mov %r12d,48(%rsp)
+
+ xor %r14d,%r13d # Sigma1(e)
+ xor %ecx,%r15d # Ch(e,f,g)=((f^g)&e)^g
+ add %edx,%r12d # T1+=h
+
+ mov %r8d,%edx
+ add %r13d,%r12d # T1+=Sigma1(e)
+
+ add %r15d,%r12d # T1+=Ch(e,f,g)
+ mov %r8d,%r13d
+ mov %r8d,%r14d
+
+ ror $2,%edx
+ ror $13,%r13d
+ mov %r8d,%r15d
+ add (%rbp,%rdi,4),%r12d # T1+=K[round]
+
+ xor %r13d,%edx
+ ror $9,%r13d
+ or %r10d,%r14d # a|c
+
+ xor %r13d,%edx # h=Sigma0(a)
+ and %r10d,%r15d # a&c
+ add %r12d,%r11d # d+=T1
+
+ and %r9d,%r14d # (a|c)&b
+ add %r12d,%edx # h+=T1
+
+ or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14d,%edx # h+=Maj(a,b,c)
+ mov 4*13(%rsi),%r12d
+ bswap %r12d
+ mov %r11d,%r13d
+ mov %r11d,%r14d
+ mov %eax,%r15d
+
+ ror $6,%r13d
+ ror $11,%r14d
+ xor %ebx,%r15d # f^g
+
+ xor %r14d,%r13d
+ ror $14,%r14d
+ and %r11d,%r15d # (f^g)&e
+ mov %r12d,52(%rsp)
+
+ xor %r14d,%r13d # Sigma1(e)
+ xor %ebx,%r15d # Ch(e,f,g)=((f^g)&e)^g
+ add %ecx,%r12d # T1+=h
+
+ mov %edx,%ecx
+ add %r13d,%r12d # T1+=Sigma1(e)
+
+ add %r15d,%r12d # T1+=Ch(e,f,g)
+ mov %edx,%r13d
+ mov %edx,%r14d
+
+ ror $2,%ecx
+ ror $13,%r13d
+ mov %edx,%r15d
+ add (%rbp,%rdi,4),%r12d # T1+=K[round]
+
+ xor %r13d,%ecx
+ ror $9,%r13d
+ or %r9d,%r14d # a|c
+
+ xor %r13d,%ecx # h=Sigma0(a)
+ and %r9d,%r15d # a&c
+ add %r12d,%r10d # d+=T1
+
+ and %r8d,%r14d # (a|c)&b
+ add %r12d,%ecx # h+=T1
+
+ or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14d,%ecx # h+=Maj(a,b,c)
+ mov 4*14(%rsi),%r12d
+ bswap %r12d
+ mov %r10d,%r13d
+ mov %r10d,%r14d
+ mov %r11d,%r15d
+
+ ror $6,%r13d
+ ror $11,%r14d
+ xor %eax,%r15d # f^g
+
+ xor %r14d,%r13d
+ ror $14,%r14d
+ and %r10d,%r15d # (f^g)&e
+ mov %r12d,56(%rsp)
+
+ xor %r14d,%r13d # Sigma1(e)
+ xor %eax,%r15d # Ch(e,f,g)=((f^g)&e)^g
+ add %ebx,%r12d # T1+=h
+
+ mov %ecx,%ebx
+ add %r13d,%r12d # T1+=Sigma1(e)
+
+ add %r15d,%r12d # T1+=Ch(e,f,g)
+ mov %ecx,%r13d
+ mov %ecx,%r14d
+
+ ror $2,%ebx
+ ror $13,%r13d
+ mov %ecx,%r15d
+ add (%rbp,%rdi,4),%r12d # T1+=K[round]
+
+ xor %r13d,%ebx
+ ror $9,%r13d
+ or %r8d,%r14d # a|c
+
+ xor %r13d,%ebx # h=Sigma0(a)
+ and %r8d,%r15d # a&c
+ add %r12d,%r9d # d+=T1
+
+ and %edx,%r14d # (a|c)&b
+ add %r12d,%ebx # h+=T1
+
+ or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14d,%ebx # h+=Maj(a,b,c)
+ mov 4*15(%rsi),%r12d
+ bswap %r12d
+ mov %r9d,%r13d
+ mov %r9d,%r14d
+ mov %r10d,%r15d
+
+ ror $6,%r13d
+ ror $11,%r14d
+ xor %r11d,%r15d # f^g
+
+ xor %r14d,%r13d
+ ror $14,%r14d
+ and %r9d,%r15d # (f^g)&e
+ mov %r12d,60(%rsp)
+
+ xor %r14d,%r13d # Sigma1(e)
+ xor %r11d,%r15d # Ch(e,f,g)=((f^g)&e)^g
+ add %eax,%r12d # T1+=h
+
+ mov %ebx,%eax
+ add %r13d,%r12d # T1+=Sigma1(e)
+
+ add %r15d,%r12d # T1+=Ch(e,f,g)
+ mov %ebx,%r13d
+ mov %ebx,%r14d
+
+ ror $2,%eax
+ ror $13,%r13d
+ mov %ebx,%r15d
+ add (%rbp,%rdi,4),%r12d # T1+=K[round]
+
+ xor %r13d,%eax
+ ror $9,%r13d
+ or %edx,%r14d # a|c
+
+ xor %r13d,%eax # h=Sigma0(a)
+ and %edx,%r15d # a&c
+ add %r12d,%r8d # d+=T1
+
+ and %ecx,%r14d # (a|c)&b
+ add %r12d,%eax # h+=T1
+
+ or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14d,%eax # h+=Maj(a,b,c)
+ jmp .Lrounds_16_xx
+.align 16
+.Lrounds_16_xx:
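+	# Rounds 16-63: each message word W[i] is computed on the fly as
+	# sigma1(W[i-2]) + W[i-7] + sigma0(W[i-15]) + W[i-16] before the same
+	# round function as in rounds 0-15 is applied.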
+ mov 4(%rsp),%r13d
+ mov 56(%rsp),%r12d
+
+ mov %r13d,%r15d
+
+ shr $3,%r13d
+ ror $7,%r15d
+
+ xor %r15d,%r13d
+ ror $11,%r15d
+
+ xor %r15d,%r13d # sigma0(X[(i+1)&0xf])
+ mov %r12d,%r14d
+
+ shr $10,%r12d
+ ror $17,%r14d
+
+ xor %r14d,%r12d
+ ror $2,%r14d
+
+ xor %r14d,%r12d # sigma1(X[(i+14)&0xf])
+
+ add %r13d,%r12d
+
+ add 36(%rsp),%r12d
+
+ add 0(%rsp),%r12d
+ mov %r8d,%r13d
+ mov %r8d,%r14d
+ mov %r9d,%r15d
+
+ ror $6,%r13d
+ ror $11,%r14d
+ xor %r10d,%r15d # f^g
+
+ xor %r14d,%r13d
+ ror $14,%r14d
+ and %r8d,%r15d # (f^g)&e
+ mov %r12d,0(%rsp)
+
+ xor %r14d,%r13d # Sigma1(e)
+ xor %r10d,%r15d # Ch(e,f,g)=((f^g)&e)^g
+ add %r11d,%r12d # T1+=h
+
+ mov %eax,%r11d
+ add %r13d,%r12d # T1+=Sigma1(e)
+
+ add %r15d,%r12d # T1+=Ch(e,f,g)
+ mov %eax,%r13d
+ mov %eax,%r14d
+
+ ror $2,%r11d
+ ror $13,%r13d
+ mov %eax,%r15d
+ add (%rbp,%rdi,4),%r12d # T1+=K[round]
+
+ xor %r13d,%r11d
+ ror $9,%r13d
+ or %ecx,%r14d # a|c
+
+ xor %r13d,%r11d # h=Sigma0(a)
+ and %ecx,%r15d # a&c
+ add %r12d,%edx # d+=T1
+
+ and %ebx,%r14d # (a|c)&b
+ add %r12d,%r11d # h+=T1
+
+ or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14d,%r11d # h+=Maj(a,b,c)
+ mov 8(%rsp),%r13d
+ mov 60(%rsp),%r12d
+
+ mov %r13d,%r15d
+
+ shr $3,%r13d
+ ror $7,%r15d
+
+ xor %r15d,%r13d
+ ror $11,%r15d
+
+ xor %r15d,%r13d # sigma0(X[(i+1)&0xf])
+ mov %r12d,%r14d
+
+ shr $10,%r12d
+ ror $17,%r14d
+
+ xor %r14d,%r12d
+ ror $2,%r14d
+
+ xor %r14d,%r12d # sigma1(X[(i+14)&0xf])
+
+ add %r13d,%r12d
+
+ add 40(%rsp),%r12d
+
+ add 4(%rsp),%r12d
+ mov %edx,%r13d
+ mov %edx,%r14d
+ mov %r8d,%r15d
+
+ ror $6,%r13d
+ ror $11,%r14d
+ xor %r9d,%r15d # f^g
+
+ xor %r14d,%r13d
+ ror $14,%r14d
+ and %edx,%r15d # (f^g)&e
+ mov %r12d,4(%rsp)
+
+ xor %r14d,%r13d # Sigma1(e)
+ xor %r9d,%r15d # Ch(e,f,g)=((f^g)&e)^g
+ add %r10d,%r12d # T1+=h
+
+ mov %r11d,%r10d
+ add %r13d,%r12d # T1+=Sigma1(e)
+
+ add %r15d,%r12d # T1+=Ch(e,f,g)
+ mov %r11d,%r13d
+ mov %r11d,%r14d
+
+ ror $2,%r10d
+ ror $13,%r13d
+ mov %r11d,%r15d
+ add (%rbp,%rdi,4),%r12d # T1+=K[round]
+
+ xor %r13d,%r10d
+ ror $9,%r13d
+ or %ebx,%r14d # a|c
+
+ xor %r13d,%r10d # h=Sigma0(a)
+ and %ebx,%r15d # a&c
+ add %r12d,%ecx # d+=T1
+
+ and %eax,%r14d # (a|c)&b
+ add %r12d,%r10d # h+=T1
+
+ or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14d,%r10d # h+=Maj(a,b,c)
+ mov 12(%rsp),%r13d
+ mov 0(%rsp),%r12d
+
+ mov %r13d,%r15d
+
+ shr $3,%r13d
+ ror $7,%r15d
+
+ xor %r15d,%r13d
+ ror $11,%r15d
+
+ xor %r15d,%r13d # sigma0(X[(i+1)&0xf])
+ mov %r12d,%r14d
+
+ shr $10,%r12d
+ ror $17,%r14d
+
+ xor %r14d,%r12d
+ ror $2,%r14d
+
+ xor %r14d,%r12d # sigma1(X[(i+14)&0xf])
+
+ add %r13d,%r12d
+
+ add 44(%rsp),%r12d
+
+ add 8(%rsp),%r12d
+ mov %ecx,%r13d
+ mov %ecx,%r14d
+ mov %edx,%r15d
+
+ ror $6,%r13d
+ ror $11,%r14d
+ xor %r8d,%r15d # f^g
+
+ xor %r14d,%r13d
+ ror $14,%r14d
+ and %ecx,%r15d # (f^g)&e
+ mov %r12d,8(%rsp)
+
+ xor %r14d,%r13d # Sigma1(e)
+ xor %r8d,%r15d # Ch(e,f,g)=((f^g)&e)^g
+ add %r9d,%r12d # T1+=h
+
+ mov %r10d,%r9d
+ add %r13d,%r12d # T1+=Sigma1(e)
+
+ add %r15d,%r12d # T1+=Ch(e,f,g)
+ mov %r10d,%r13d
+ mov %r10d,%r14d
+
+ ror $2,%r9d
+ ror $13,%r13d
+ mov %r10d,%r15d
+ add (%rbp,%rdi,4),%r12d # T1+=K[round]
+
+ xor %r13d,%r9d
+ ror $9,%r13d
+ or %eax,%r14d # a|c
+
+ xor %r13d,%r9d # h=Sigma0(a)
+ and %eax,%r15d # a&c
+ add %r12d,%ebx # d+=T1
+
+ and %r11d,%r14d # (a|c)&b
+ add %r12d,%r9d # h+=T1
+
+ or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14d,%r9d # h+=Maj(a,b,c)
+ mov 16(%rsp),%r13d
+ mov 4(%rsp),%r12d
+
+ mov %r13d,%r15d
+
+ shr $3,%r13d
+ ror $7,%r15d
+
+ xor %r15d,%r13d
+ ror $11,%r15d
+
+ xor %r15d,%r13d # sigma0(X[(i+1)&0xf])
+ mov %r12d,%r14d
+
+ shr $10,%r12d
+ ror $17,%r14d
+
+ xor %r14d,%r12d
+ ror $2,%r14d
+
+ xor %r14d,%r12d # sigma1(X[(i+14)&0xf])
+
+ add %r13d,%r12d
+
+ add 48(%rsp),%r12d
+
+ add 12(%rsp),%r12d
+ mov %ebx,%r13d
+ mov %ebx,%r14d
+ mov %ecx,%r15d
+
+ ror $6,%r13d
+ ror $11,%r14d
+ xor %edx,%r15d # f^g
+
+ xor %r14d,%r13d
+ ror $14,%r14d
+ and %ebx,%r15d # (f^g)&e
+ mov %r12d,12(%rsp)
+
+ xor %r14d,%r13d # Sigma1(e)
+ xor %edx,%r15d # Ch(e,f,g)=((f^g)&e)^g
+ add %r8d,%r12d # T1+=h
+
+ mov %r9d,%r8d
+ add %r13d,%r12d # T1+=Sigma1(e)
+
+ add %r15d,%r12d # T1+=Ch(e,f,g)
+ mov %r9d,%r13d
+ mov %r9d,%r14d
+
+ ror $2,%r8d
+ ror $13,%r13d
+ mov %r9d,%r15d
+ add (%rbp,%rdi,4),%r12d # T1+=K[round]
+
+ xor %r13d,%r8d
+ ror $9,%r13d
+ or %r11d,%r14d # a|c
+
+ xor %r13d,%r8d # h=Sigma0(a)
+ and %r11d,%r15d # a&c
+ add %r12d,%eax # d+=T1
+
+ and %r10d,%r14d # (a|c)&b
+ add %r12d,%r8d # h+=T1
+
+ or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14d,%r8d # h+=Maj(a,b,c)
+ mov 20(%rsp),%r13d
+ mov 8(%rsp),%r12d
+
+ mov %r13d,%r15d
+
+ shr $3,%r13d
+ ror $7,%r15d
+
+ xor %r15d,%r13d
+ ror $11,%r15d
+
+ xor %r15d,%r13d # sigma0(X[(i+1)&0xf])
+ mov %r12d,%r14d
+
+ shr $10,%r12d
+ ror $17,%r14d
+
+ xor %r14d,%r12d
+ ror $2,%r14d
+
+ xor %r14d,%r12d # sigma1(X[(i+14)&0xf])
+
+ add %r13d,%r12d
+
+ add 52(%rsp),%r12d
+
+ add 16(%rsp),%r12d
+ mov %eax,%r13d
+ mov %eax,%r14d
+ mov %ebx,%r15d
+
+ ror $6,%r13d
+ ror $11,%r14d
+ xor %ecx,%r15d # f^g
+
+ xor %r14d,%r13d
+ ror $14,%r14d
+ and %eax,%r15d # (f^g)&e
+ mov %r12d,16(%rsp)
+
+ xor %r14d,%r13d # Sigma1(e)
+ xor %ecx,%r15d # Ch(e,f,g)=((f^g)&e)^g
+ add %edx,%r12d # T1+=h
+
+ mov %r8d,%edx
+ add %r13d,%r12d # T1+=Sigma1(e)
+
+ add %r15d,%r12d # T1+=Ch(e,f,g)
+ mov %r8d,%r13d
+ mov %r8d,%r14d
+
+ ror $2,%edx
+ ror $13,%r13d
+ mov %r8d,%r15d
+ add (%rbp,%rdi,4),%r12d # T1+=K[round]
+
+ xor %r13d,%edx
+ ror $9,%r13d
+ or %r10d,%r14d # a|c
+
+ xor %r13d,%edx # h=Sigma0(a)
+ and %r10d,%r15d # a&c
+ add %r12d,%r11d # d+=T1
+
+ and %r9d,%r14d # (a|c)&b
+ add %r12d,%edx # h+=T1
+
+ or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14d,%edx # h+=Maj(a,b,c)
+ mov 24(%rsp),%r13d
+ mov 12(%rsp),%r12d
+
+ mov %r13d,%r15d
+
+ shr $3,%r13d
+ ror $7,%r15d
+
+ xor %r15d,%r13d
+ ror $11,%r15d
+
+ xor %r15d,%r13d # sigma0(X[(i+1)&0xf])
+ mov %r12d,%r14d
+
+ shr $10,%r12d
+ ror $17,%r14d
+
+ xor %r14d,%r12d
+ ror $2,%r14d
+
+ xor %r14d,%r12d # sigma1(X[(i+14)&0xf])
+
+ add %r13d,%r12d
+
+ add 56(%rsp),%r12d
+
+ add 20(%rsp),%r12d
+ mov %r11d,%r13d
+ mov %r11d,%r14d
+ mov %eax,%r15d
+
+ ror $6,%r13d
+ ror $11,%r14d
+ xor %ebx,%r15d # f^g
+
+ xor %r14d,%r13d
+ ror $14,%r14d
+ and %r11d,%r15d # (f^g)&e
+ mov %r12d,20(%rsp)
+
+ xor %r14d,%r13d # Sigma1(e)
+ xor %ebx,%r15d # Ch(e,f,g)=((f^g)&e)^g
+ add %ecx,%r12d # T1+=h
+
+ mov %edx,%ecx
+ add %r13d,%r12d # T1+=Sigma1(e)
+
+ add %r15d,%r12d # T1+=Ch(e,f,g)
+ mov %edx,%r13d
+ mov %edx,%r14d
+
+ ror $2,%ecx
+ ror $13,%r13d
+ mov %edx,%r15d
+ add (%rbp,%rdi,4),%r12d # T1+=K[round]
+
+ xor %r13d,%ecx
+ ror $9,%r13d
+ or %r9d,%r14d # a|c
+
+ xor %r13d,%ecx # h=Sigma0(a)
+ and %r9d,%r15d # a&c
+ add %r12d,%r10d # d+=T1
+
+ and %r8d,%r14d # (a|c)&b
+ add %r12d,%ecx # h+=T1
+
+ or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14d,%ecx # h+=Maj(a,b,c)
+ mov 28(%rsp),%r13d
+ mov 16(%rsp),%r12d
+
+ mov %r13d,%r15d
+
+ shr $3,%r13d
+ ror $7,%r15d
+
+ xor %r15d,%r13d
+ ror $11,%r15d
+
+ xor %r15d,%r13d # sigma0(X[(i+1)&0xf])
+ mov %r12d,%r14d
+
+ shr $10,%r12d
+ ror $17,%r14d
+
+ xor %r14d,%r12d
+ ror $2,%r14d
+
+ xor %r14d,%r12d # sigma1(X[(i+14)&0xf])
+
+ add %r13d,%r12d
+
+ add 60(%rsp),%r12d
+
+ add 24(%rsp),%r12d
+ mov %r10d,%r13d
+ mov %r10d,%r14d
+ mov %r11d,%r15d
+
+ ror $6,%r13d
+ ror $11,%r14d
+ xor %eax,%r15d # f^g
+
+ xor %r14d,%r13d
+ ror $14,%r14d
+ and %r10d,%r15d # (f^g)&e
+ mov %r12d,24(%rsp)
+
+ xor %r14d,%r13d # Sigma1(e)
+ xor %eax,%r15d # Ch(e,f,g)=((f^g)&e)^g
+ add %ebx,%r12d # T1+=h
+
+ mov %ecx,%ebx
+ add %r13d,%r12d # T1+=Sigma1(e)
+
+ add %r15d,%r12d # T1+=Ch(e,f,g)
+ mov %ecx,%r13d
+ mov %ecx,%r14d
+
+ ror $2,%ebx
+ ror $13,%r13d
+ mov %ecx,%r15d
+ add (%rbp,%rdi,4),%r12d # T1+=K[round]
+
+ xor %r13d,%ebx
+ ror $9,%r13d
+ or %r8d,%r14d # a|c
+
+ xor %r13d,%ebx # h=Sigma0(a)
+ and %r8d,%r15d # a&c
+ add %r12d,%r9d # d+=T1
+
+ and %edx,%r14d # (a|c)&b
+ add %r12d,%ebx # h+=T1
+
+ or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14d,%ebx # h+=Maj(a,b,c)
+ mov 32(%rsp),%r13d
+ mov 20(%rsp),%r12d
+
+ mov %r13d,%r15d
+
+ shr $3,%r13d
+ ror $7,%r15d
+
+ xor %r15d,%r13d
+ ror $11,%r15d
+
+ xor %r15d,%r13d # sigma0(X[(i+1)&0xf])
+ mov %r12d,%r14d
+
+ shr $10,%r12d
+ ror $17,%r14d
+
+ xor %r14d,%r12d
+ ror $2,%r14d
+
+ xor %r14d,%r12d # sigma1(X[(i+14)&0xf])
+
+ add %r13d,%r12d
+
+ add 0(%rsp),%r12d
+
+ add 28(%rsp),%r12d
+ mov %r9d,%r13d
+ mov %r9d,%r14d
+ mov %r10d,%r15d
+
+ ror $6,%r13d
+ ror $11,%r14d
+ xor %r11d,%r15d # f^g
+
+ xor %r14d,%r13d
+ ror $14,%r14d
+ and %r9d,%r15d # (f^g)&e
+ mov %r12d,28(%rsp)
+
+ xor %r14d,%r13d # Sigma1(e)
+ xor %r11d,%r15d # Ch(e,f,g)=((f^g)&e)^g
+ add %eax,%r12d # T1+=h
+
+ mov %ebx,%eax
+ add %r13d,%r12d # T1+=Sigma1(e)
+
+ add %r15d,%r12d # T1+=Ch(e,f,g)
+ mov %ebx,%r13d
+ mov %ebx,%r14d
+
+ ror $2,%eax
+ ror $13,%r13d
+ mov %ebx,%r15d
+ add (%rbp,%rdi,4),%r12d # T1+=K[round]
+
+ xor %r13d,%eax
+ ror $9,%r13d
+ or %edx,%r14d # a|c
+
+ xor %r13d,%eax # h=Sigma0(a)
+ and %edx,%r15d # a&c
+ add %r12d,%r8d # d+=T1
+
+ and %ecx,%r14d # (a|c)&b
+ add %r12d,%eax # h+=T1
+
+ or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14d,%eax # h+=Maj(a,b,c)
+ mov 36(%rsp),%r13d
+ mov 24(%rsp),%r12d
+
+ mov %r13d,%r15d
+
+ shr $3,%r13d
+ ror $7,%r15d
+
+ xor %r15d,%r13d
+ ror $11,%r15d
+
+ xor %r15d,%r13d # sigma0(X[(i+1)&0xf])
+ mov %r12d,%r14d
+
+ shr $10,%r12d
+ ror $17,%r14d
+
+ xor %r14d,%r12d
+ ror $2,%r14d
+
+ xor %r14d,%r12d # sigma1(X[(i+14)&0xf])
+
+ add %r13d,%r12d
+
+ add 4(%rsp),%r12d
+
+ add 32(%rsp),%r12d
+ mov %r8d,%r13d
+ mov %r8d,%r14d
+ mov %r9d,%r15d
+
+ ror $6,%r13d
+ ror $11,%r14d
+ xor %r10d,%r15d # f^g
+
+ xor %r14d,%r13d
+ ror $14,%r14d
+ and %r8d,%r15d # (f^g)&e
+ mov %r12d,32(%rsp)
+
+ xor %r14d,%r13d # Sigma1(e)
+ xor %r10d,%r15d # Ch(e,f,g)=((f^g)&e)^g
+ add %r11d,%r12d # T1+=h
+
+ mov %eax,%r11d
+ add %r13d,%r12d # T1+=Sigma1(e)
+
+ add %r15d,%r12d # T1+=Ch(e,f,g)
+ mov %eax,%r13d
+ mov %eax,%r14d
+
+ ror $2,%r11d
+ ror $13,%r13d
+ mov %eax,%r15d
+ add (%rbp,%rdi,4),%r12d # T1+=K[round]
+
+ xor %r13d,%r11d
+ ror $9,%r13d
+ or %ecx,%r14d # a|c
+
+ xor %r13d,%r11d # h=Sigma0(a)
+ and %ecx,%r15d # a&c
+ add %r12d,%edx # d+=T1
+
+ and %ebx,%r14d # (a|c)&b
+ add %r12d,%r11d # h+=T1
+
+ or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14d,%r11d # h+=Maj(a,b,c)
+ mov 40(%rsp),%r13d
+ mov 28(%rsp),%r12d
+
+ mov %r13d,%r15d
+
+ shr $3,%r13d
+ ror $7,%r15d
+
+ xor %r15d,%r13d
+ ror $11,%r15d
+
+ xor %r15d,%r13d # sigma0(X[(i+1)&0xf])
+ mov %r12d,%r14d
+
+ shr $10,%r12d
+ ror $17,%r14d
+
+ xor %r14d,%r12d
+ ror $2,%r14d
+
+ xor %r14d,%r12d # sigma1(X[(i+14)&0xf])
+
+ add %r13d,%r12d
+
+ add 8(%rsp),%r12d
+
+ add 36(%rsp),%r12d
+ mov %edx,%r13d
+ mov %edx,%r14d
+ mov %r8d,%r15d
+
+ ror $6,%r13d
+ ror $11,%r14d
+ xor %r9d,%r15d # f^g
+
+ xor %r14d,%r13d
+ ror $14,%r14d
+ and %edx,%r15d # (f^g)&e
+ mov %r12d,36(%rsp)
+
+ xor %r14d,%r13d # Sigma1(e)
+ xor %r9d,%r15d # Ch(e,f,g)=((f^g)&e)^g
+ add %r10d,%r12d # T1+=h
+
+ mov %r11d,%r10d
+ add %r13d,%r12d # T1+=Sigma1(e)
+
+ add %r15d,%r12d # T1+=Ch(e,f,g)
+ mov %r11d,%r13d
+ mov %r11d,%r14d
+
+ ror $2,%r10d
+ ror $13,%r13d
+ mov %r11d,%r15d
+ add (%rbp,%rdi,4),%r12d # T1+=K[round]
+
+ xor %r13d,%r10d
+ ror $9,%r13d
+ or %ebx,%r14d # a|c
+
+ xor %r13d,%r10d # h=Sigma0(a)
+ and %ebx,%r15d # a&c
+ add %r12d,%ecx # d+=T1
+
+ and %eax,%r14d # (a|c)&b
+ add %r12d,%r10d # h+=T1
+
+ or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14d,%r10d # h+=Maj(a,b,c)
+ mov 44(%rsp),%r13d
+ mov 32(%rsp),%r12d
+
+ mov %r13d,%r15d
+
+ shr $3,%r13d
+ ror $7,%r15d
+
+ xor %r15d,%r13d
+ ror $11,%r15d
+
+ xor %r15d,%r13d # sigma0(X[(i+1)&0xf])
+ mov %r12d,%r14d
+
+ shr $10,%r12d
+ ror $17,%r14d
+
+ xor %r14d,%r12d
+ ror $2,%r14d
+
+ xor %r14d,%r12d # sigma1(X[(i+14)&0xf])
+
+ add %r13d,%r12d
+
+ add 12(%rsp),%r12d
+
+ add 40(%rsp),%r12d
+ mov %ecx,%r13d
+ mov %ecx,%r14d
+ mov %edx,%r15d
+
+ ror $6,%r13d
+ ror $11,%r14d
+ xor %r8d,%r15d # f^g
+
+ xor %r14d,%r13d
+ ror $14,%r14d
+ and %ecx,%r15d # (f^g)&e
+ mov %r12d,40(%rsp)
+
+ xor %r14d,%r13d # Sigma1(e)
+ xor %r8d,%r15d # Ch(e,f,g)=((f^g)&e)^g
+ add %r9d,%r12d # T1+=h
+
+ mov %r10d,%r9d
+ add %r13d,%r12d # T1+=Sigma1(e)
+
+ add %r15d,%r12d # T1+=Ch(e,f,g)
+ mov %r10d,%r13d
+ mov %r10d,%r14d
+
+ ror $2,%r9d
+ ror $13,%r13d
+ mov %r10d,%r15d
+ add (%rbp,%rdi,4),%r12d # T1+=K[round]
+
+ xor %r13d,%r9d
+ ror $9,%r13d
+ or %eax,%r14d # a|c
+
+ xor %r13d,%r9d # h=Sigma0(a)
+ and %eax,%r15d # a&c
+ add %r12d,%ebx # d+=T1
+
+ and %r11d,%r14d # (a|c)&b
+ add %r12d,%r9d # h+=T1
+
+ or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14d,%r9d # h+=Maj(a,b,c)
+ mov 48(%rsp),%r13d
+ mov 36(%rsp),%r12d
+
+ mov %r13d,%r15d
+
+ shr $3,%r13d
+ ror $7,%r15d
+
+ xor %r15d,%r13d
+ ror $11,%r15d
+
+ xor %r15d,%r13d # sigma0(X[(i+1)&0xf])
+ mov %r12d,%r14d
+
+ shr $10,%r12d
+ ror $17,%r14d
+
+ xor %r14d,%r12d
+ ror $2,%r14d
+
+ xor %r14d,%r12d # sigma1(X[(i+14)&0xf])
+
+ add %r13d,%r12d
+
+ add 16(%rsp),%r12d
+
+ add 44(%rsp),%r12d
+ mov %ebx,%r13d
+ mov %ebx,%r14d
+ mov %ecx,%r15d
+
+ ror $6,%r13d
+ ror $11,%r14d
+ xor %edx,%r15d # f^g
+
+ xor %r14d,%r13d
+ ror $14,%r14d
+ and %ebx,%r15d # (f^g)&e
+ mov %r12d,44(%rsp)
+
+ xor %r14d,%r13d # Sigma1(e)
+ xor %edx,%r15d # Ch(e,f,g)=((f^g)&e)^g
+ add %r8d,%r12d # T1+=h
+
+ mov %r9d,%r8d
+ add %r13d,%r12d # T1+=Sigma1(e)
+
+ add %r15d,%r12d # T1+=Ch(e,f,g)
+ mov %r9d,%r13d
+ mov %r9d,%r14d
+
+ ror $2,%r8d
+ ror $13,%r13d
+ mov %r9d,%r15d
+ add (%rbp,%rdi,4),%r12d # T1+=K[round]
+
+ xor %r13d,%r8d
+ ror $9,%r13d
+ or %r11d,%r14d # a|c
+
+ xor %r13d,%r8d # h=Sigma0(a)
+ and %r11d,%r15d # a&c
+ add %r12d,%eax # d+=T1
+
+ and %r10d,%r14d # (a|c)&b
+ add %r12d,%r8d # h+=T1
+
+ or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14d,%r8d # h+=Maj(a,b,c)
+ mov 52(%rsp),%r13d
+ mov 40(%rsp),%r12d
+
+ mov %r13d,%r15d
+
+ shr $3,%r13d
+ ror $7,%r15d
+
+ xor %r15d,%r13d
+ ror $11,%r15d
+
+ xor %r15d,%r13d # sigma0(X[(i+1)&0xf])
+ mov %r12d,%r14d
+
+ shr $10,%r12d
+ ror $17,%r14d
+
+ xor %r14d,%r12d
+ ror $2,%r14d
+
+ xor %r14d,%r12d # sigma1(X[(i+14)&0xf])
+
+ add %r13d,%r12d
+
+ add 20(%rsp),%r12d
+
+ add 48(%rsp),%r12d
+ mov %eax,%r13d
+ mov %eax,%r14d
+ mov %ebx,%r15d
+
+ ror $6,%r13d
+ ror $11,%r14d
+ xor %ecx,%r15d # f^g
+
+ xor %r14d,%r13d
+ ror $14,%r14d
+ and %eax,%r15d # (f^g)&e
+ mov %r12d,48(%rsp)
+
+ xor %r14d,%r13d # Sigma1(e)
+ xor %ecx,%r15d # Ch(e,f,g)=((f^g)&e)^g
+ add %edx,%r12d # T1+=h
+
+ mov %r8d,%edx
+ add %r13d,%r12d # T1+=Sigma1(e)
+
+ add %r15d,%r12d # T1+=Ch(e,f,g)
+ mov %r8d,%r13d
+ mov %r8d,%r14d
+
+ ror $2,%edx
+ ror $13,%r13d
+ mov %r8d,%r15d
+ add (%rbp,%rdi,4),%r12d # T1+=K[round]
+
+ xor %r13d,%edx
+ ror $9,%r13d
+ or %r10d,%r14d # a|c
+
+ xor %r13d,%edx # h=Sigma0(a)
+ and %r10d,%r15d # a&c
+ add %r12d,%r11d # d+=T1
+
+ and %r9d,%r14d # (a|c)&b
+ add %r12d,%edx # h+=T1
+
+ or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14d,%edx # h+=Maj(a,b,c)
+ mov 56(%rsp),%r13d
+ mov 44(%rsp),%r12d
+
+ mov %r13d,%r15d
+
+ shr $3,%r13d
+ ror $7,%r15d
+
+ xor %r15d,%r13d
+ ror $11,%r15d
+
+ xor %r15d,%r13d # sigma0(X[(i+1)&0xf])
+ mov %r12d,%r14d
+
+ shr $10,%r12d
+ ror $17,%r14d
+
+ xor %r14d,%r12d
+ ror $2,%r14d
+
+ xor %r14d,%r12d # sigma1(X[(i+14)&0xf])
+
+ add %r13d,%r12d
+
+ add 24(%rsp),%r12d
+
+ add 52(%rsp),%r12d
+ mov %r11d,%r13d
+ mov %r11d,%r14d
+ mov %eax,%r15d
+
+ ror $6,%r13d
+ ror $11,%r14d
+ xor %ebx,%r15d # f^g
+
+ xor %r14d,%r13d
+ ror $14,%r14d
+ and %r11d,%r15d # (f^g)&e
+ mov %r12d,52(%rsp)
+
+ xor %r14d,%r13d # Sigma1(e)
+ xor %ebx,%r15d # Ch(e,f,g)=((f^g)&e)^g
+ add %ecx,%r12d # T1+=h
+
+ mov %edx,%ecx
+ add %r13d,%r12d # T1+=Sigma1(e)
+
+ add %r15d,%r12d # T1+=Ch(e,f,g)
+ mov %edx,%r13d
+ mov %edx,%r14d
+
+ ror $2,%ecx
+ ror $13,%r13d
+ mov %edx,%r15d
+ add (%rbp,%rdi,4),%r12d # T1+=K[round]
+
+ xor %r13d,%ecx
+ ror $9,%r13d
+ or %r9d,%r14d # a|c
+
+ xor %r13d,%ecx # h=Sigma0(a)
+ and %r9d,%r15d # a&c
+ add %r12d,%r10d # d+=T1
+
+ and %r8d,%r14d # (a|c)&b
+ add %r12d,%ecx # h+=T1
+
+ or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14d,%ecx # h+=Maj(a,b,c)
+ mov 60(%rsp),%r13d
+ mov 48(%rsp),%r12d
+
+ mov %r13d,%r15d
+
+ shr $3,%r13d
+ ror $7,%r15d
+
+ xor %r15d,%r13d
+ ror $11,%r15d
+
+ xor %r15d,%r13d # sigma0(X[(i+1)&0xf])
+ mov %r12d,%r14d
+
+ shr $10,%r12d
+ ror $17,%r14d
+
+ xor %r14d,%r12d
+ ror $2,%r14d
+
+ xor %r14d,%r12d # sigma1(X[(i+14)&0xf])
+
+ add %r13d,%r12d
+
+ add 28(%rsp),%r12d
+
+ add 56(%rsp),%r12d
+ mov %r10d,%r13d
+ mov %r10d,%r14d
+ mov %r11d,%r15d
+
+ ror $6,%r13d
+ ror $11,%r14d
+ xor %eax,%r15d # f^g
+
+ xor %r14d,%r13d
+ ror $14,%r14d
+ and %r10d,%r15d # (f^g)&e
+ mov %r12d,56(%rsp)
+
+ xor %r14d,%r13d # Sigma1(e)
+ xor %eax,%r15d # Ch(e,f,g)=((f^g)&e)^g
+ add %ebx,%r12d # T1+=h
+
+ mov %ecx,%ebx
+ add %r13d,%r12d # T1+=Sigma1(e)
+
+ add %r15d,%r12d # T1+=Ch(e,f,g)
+ mov %ecx,%r13d
+ mov %ecx,%r14d
+
+ ror $2,%ebx
+ ror $13,%r13d
+ mov %ecx,%r15d
+ add (%rbp,%rdi,4),%r12d # T1+=K[round]
+
+ xor %r13d,%ebx
+ ror $9,%r13d
+ or %r8d,%r14d # a|c
+
+ xor %r13d,%ebx # h=Sigma0(a)
+ and %r8d,%r15d # a&c
+ add %r12d,%r9d # d+=T1
+
+ and %edx,%r14d # (a|c)&b
+ add %r12d,%ebx # h+=T1
+
+ or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14d,%ebx # h+=Maj(a,b,c)
+ mov 0(%rsp),%r13d
+ mov 52(%rsp),%r12d
+
+ mov %r13d,%r15d
+
+ shr $3,%r13d
+ ror $7,%r15d
+
+ xor %r15d,%r13d
+ ror $11,%r15d
+
+ xor %r15d,%r13d # sigma0(X[(i+1)&0xf])
+ mov %r12d,%r14d
+
+ shr $10,%r12d
+ ror $17,%r14d
+
+ xor %r14d,%r12d
+ ror $2,%r14d
+
+ xor %r14d,%r12d # sigma1(X[(i+14)&0xf])
+
+ add %r13d,%r12d
+
+ add 32(%rsp),%r12d
+
+ add 60(%rsp),%r12d
+ mov %r9d,%r13d
+ mov %r9d,%r14d
+ mov %r10d,%r15d
+
+ ror $6,%r13d
+ ror $11,%r14d
+ xor %r11d,%r15d # f^g
+
+ xor %r14d,%r13d
+ ror $14,%r14d
+ and %r9d,%r15d # (f^g)&e
+ mov %r12d,60(%rsp)
+
+ xor %r14d,%r13d # Sigma1(e)
+ xor %r11d,%r15d # Ch(e,f,g)=((f^g)&e)^g
+ add %eax,%r12d # T1+=h
+
+ mov %ebx,%eax
+ add %r13d,%r12d # T1+=Sigma1(e)
+
+ add %r15d,%r12d # T1+=Ch(e,f,g)
+ mov %ebx,%r13d
+ mov %ebx,%r14d
+
+ ror $2,%eax
+ ror $13,%r13d
+ mov %ebx,%r15d
+ add (%rbp,%rdi,4),%r12d # T1+=K[round]
+
+ xor %r13d,%eax
+ ror $9,%r13d
+ or %edx,%r14d # a|c
+
+ xor %r13d,%eax # h=Sigma0(a)
+ and %edx,%r15d # a&c
+ add %r12d,%r8d # d+=T1
+
+ and %ecx,%r14d # (a|c)&b
+ add %r12d,%eax # h+=T1
+
+ or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14d,%eax # h+=Maj(a,b,c)
+ cmp $64,%rdi
+ jb .Lrounds_16_xx
+
+ mov 16*4+0*8(%rsp),%rdi
+ lea 16*4(%rsi),%rsi
+
+ add 4*0(%rdi),%eax
+ add 4*1(%rdi),%ebx
+ add 4*2(%rdi),%ecx
+ add 4*3(%rdi),%edx
+ add 4*4(%rdi),%r8d
+ add 4*5(%rdi),%r9d
+ add 4*6(%rdi),%r10d
+ add 4*7(%rdi),%r11d
+
+ cmp 16*4+2*8(%rsp),%rsi
+
+ mov %eax,4*0(%rdi)
+ mov %ebx,4*1(%rdi)
+ mov %ecx,4*2(%rdi)
+ mov %edx,4*3(%rdi)
+ mov %r8d,4*4(%rdi)
+ mov %r9d,4*5(%rdi)
+ mov %r10d,4*6(%rdi)
+ mov %r11d,4*7(%rdi)
+ jb .Lloop
+
+ mov 16*4+3*8(%rsp),%rsp
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+
+ ret
+SET_SIZE(SHA256TransformBlocks)
+
+.data
+.align 64
+.type K256,@object
+K256:
+ .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
+ .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
+ .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
+ .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
+ .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
+ .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
+ .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
+ .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
+ .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
+ .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
+ .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
+ .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
+ .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
+ .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
+ .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
+ .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
+#endif /* !lint && !__lint */
+
+#ifdef __ELF__
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/sys/contrib/openzfs/module/icp/asm-x86_64/sha2/sha512_impl.S b/sys/contrib/openzfs/module/icp/asm-x86_64/sha2/sha512_impl.S
new file mode 100644
index 000000000000..6e37618761b2
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/asm-x86_64/sha2/sha512_impl.S
@@ -0,0 +1,2088 @@
+/*
+ * ====================================================================
+ * Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+ * project. Rights for redistribution and usage in source and binary
+ * forms are granted according to the OpenSSL license.
+ * ====================================================================
+ *
+ * sha256/512_block procedure for x86_64.
+ *
+ * 40% improvement over compiler-generated code on Opteron. On EM64T
+ * sha256 was observed to run >80% faster and sha512 >40% faster. No
+ * magical tricks, just a straight implementation... I really wonder why
+ * gcc [even armed with inline assembler] fails to generate code this fast.
+ * The notable thing about this module is that the very same instruction
+ * sequence is used for both SHA-256 and SHA-512. In the former case the
+ * instructions operate on 32-bit operands, while in the latter they
+ * operate on 64-bit ones. All I had to do was get one flavor right; the
+ * other one passed the test right away :-)
+ *
+ * sha256_block runs in ~1005 cycles on Opteron, which gives you
+ * asymptotic performance of 64*1000/1005=63.7MBps times CPU clock
+ * frequency in GHz. sha512_block runs in ~1275 cycles, which results
+ * in 128*1000/1275=100MBps per GHz. Is there room for improvement?
+ * Well, if you compare it to IA-64 implementation, which maintains
+ * X[16] in register bank[!], tends to 4 instructions per CPU clock
+ * cycle and runs in 1003 cycles, 1275 is very good result for 3-way
+ * issue Opteron pipeline and X[16] maintained in memory. So that *if*
+ * there is a way to improve it, *then* the only way would be to try to
+ * offload X[16] updates to SSE unit, but that would require "deeper"
+ * loop unroll, which in turn would naturally cause size blow-up, not
+ * to mention increased complexity! And once again, only *if* it's
+ * actually possible to noticeably improve overall ILP, instruction
+ * level parallelism, on a given CPU implementation in this case.
+ *
+ * Special note on Intel EM64T. While Opteron CPU exhibits perfect
+ * performance ratio of 1.5 between 64- and 32-bit flavors [see above],
+ * [currently available] EM64T CPUs apparently are far from it. On the
+ * contrary, 64-bit version, sha512_block, is ~30% *slower* than 32-bit
+ * sha256_block:-( This is presumably because 64-bit shifts/rotates
+ * apparently are not atomic instructions, but implemented in microcode.
+ */
+
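+/*
+ * For readers of the unrolled rounds below, a minimal illustrative C sketch
+ * of the helper functions that the ror/shr/xor/and sequences compute (the
+ * helper names and the ROTR64 macro are ad hoc and not part of the
+ * generated assembly; the rotation counts match the instructions used in
+ * this file):
+ *
+ *	#include <stdint.h>
+ *	#define ROTR64(x, n)	(((x) >> (n)) | ((x) << (64 - (n))))
+ *
+ *	static inline uint64_t Ch(uint64_t e, uint64_t f, uint64_t g)
+ *	{ return (((f ^ g) & e) ^ g); }
+ *	static inline uint64_t Maj(uint64_t a, uint64_t b, uint64_t c)
+ *	{ return (((a | c) & b) | (a & c)); }
+ *	static inline uint64_t BigSigma0(uint64_t a)	// ror 28,34,39
+ *	{ return (ROTR64(a, 28) ^ ROTR64(a, 34) ^ ROTR64(a, 39)); }
+ *	static inline uint64_t BigSigma1(uint64_t e)	// ror 14,18,41
+ *	{ return (ROTR64(e, 14) ^ ROTR64(e, 18) ^ ROTR64(e, 41)); }
+ *	static inline uint64_t SmallSigma0(uint64_t x)	// ror 1,8; shr 7
+ *	{ return (ROTR64(x, 1) ^ ROTR64(x, 8) ^ (x >> 7)); }
+ *	static inline uint64_t SmallSigma1(uint64_t x)	// ror 19,61; shr 6
+ *	{ return (ROTR64(x, 19) ^ ROTR64(x, 61) ^ (x >> 6)); }
+ *
+ * Each round then computes T1 = h + BigSigma1(e) + Ch(e,f,g) + K[i] + W[i],
+ * adds T1 into d, and forms the new working value BigSigma0(a) +
+ * Maj(a,b,c) + T1, exactly as the per-instruction comments ("T1+=h",
+ * "h=Sigma0(a)", "d+=T1", ...) annotate below. The sha256 flavor generated
+ * from the same script is identical in structure, just with 32-bit
+ * operands and rotation counts (2,13,22) for Sigma0, (6,11,25) for Sigma1,
+ * and (7,18,>>3) and (17,19,>>10) for the small sigmas.
+ */
+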
+/*
+ * OpenSolaris OS modifications
+ *
+ * Sun elects to use this software under the BSD license.
+ *
+ * This source originates from OpenSSL file sha512-x86_64.pl at
+ * ftp://ftp.openssl.org/snapshot/openssl-0.9.8-stable-SNAP-20080131.tar.gz
+ * (presumably for future OpenSSL release 0.9.8h), with these changes:
+ *
+ * 1. Added perl "use strict" and declared variables.
+ *
+ * 2. Added OpenSolaris ENTRY_NP/SET_SIZE macros from
+ * /usr/include/sys/asm_linkage.h, .ident keywords, and lint(1B) guards.
+ *
+ * 3. Removed x86_64-xlate.pl script (not needed for as(1) or gas(1)
+ * assemblers). Replaced the .picmeup macro with assembler code.
+ *
+ * 4. Added 8 to $ctx, as OpenSolaris OS has an extra 4-byte field, "algotype",
+ *    at the beginning of SHA2_CTX (the next field is 8-byte aligned); see
+ *    the layout sketch below.
+ */
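+
+/*
+ * Illustrative sketch of the layout assumption behind change 4 above. The
+ * member names other than "algotype" are placeholders; the real definition
+ * lives in sha2/sha2.h:
+ *
+ *	typedef struct {
+ *		uint32_t algotype;	// offset 0: OpenSolaris-only field
+ *					// offset 4: implicit alignment padding
+ *		uint64_t state[8];	// offset 8: working state a..h
+ *		// ... remaining context fields ...
+ *	} SHA2_CTX;
+ *
+ * Hence the "add $8,%rdi" in the prologue below, which advances the ctx
+ * pointer past "algotype" so that the hash state words start at offset 0
+ * from the adjusted pointer.
+ */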
+
+/*
+ * This file was generated by a perl script (sha512-x86_64.pl) that was
+ * used to generate the sha256 and sha512 variants from the same code base.
+ * The comments from the original file have been pasted above.
+ */
+
+
+#if defined(lint) || defined(__lint)
+#include <sys/stdint.h>
+#include <sha2/sha2.h>
+
+/* ARGSUSED */
+void
+SHA512TransformBlocks(SHA2_CTX *ctx, const void *in, size_t num)
+{
+}
+
+
+#else
+#define _ASM
+#include <sys/asm_linkage.h>
+
+ENTRY_NP(SHA512TransformBlocks)
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ mov %rsp,%rbp # copy %rsp
+ shl $4,%rdx # num*16
+ sub $16*8+4*8,%rsp
+ lea (%rsi,%rdx,8),%rdx # inp+num*16*8
+ and $-64,%rsp # align stack frame
+ add $8,%rdi # Skip OpenSolaris field, "algotype"
+ mov %rdi,16*8+0*8(%rsp) # save ctx, 1st arg
+ mov %rsi,16*8+1*8(%rsp) # save inp, 2nd arg
+ mov %rdx,16*8+2*8(%rsp) # save end pointer, "3rd" arg
+ mov %rbp,16*8+3*8(%rsp) # save copy of %rsp
+
+ #.picmeup %rbp
+ # The .picmeup pseudo-directive, from perlasm/x86_64_xlate.pl, puts
+ # the address of the "next" instruction into the target register
+ # (%rbp). This generates these 2 instructions:
+ lea .Llea(%rip),%rbp
+ #nop # .picmeup generates a nop for mod 8 alignment--not needed here
+
+.Llea:
+ lea K512-.(%rbp),%rbp
+
+ mov 8*0(%rdi),%rax
+ mov 8*1(%rdi),%rbx
+ mov 8*2(%rdi),%rcx
+ mov 8*3(%rdi),%rdx
+ mov 8*4(%rdi),%r8
+ mov 8*5(%rdi),%r9
+ mov 8*6(%rdi),%r10
+ mov 8*7(%rdi),%r11
+ jmp .Lloop
+
+.align 16
+.Lloop:
+ xor %rdi,%rdi
+ mov 8*0(%rsi),%r12
+ bswap %r12
+ mov %r8,%r13
+ mov %r8,%r14
+ mov %r9,%r15
+
+ ror $14,%r13
+ ror $18,%r14
+ xor %r10,%r15 # f^g
+
+ xor %r14,%r13
+ ror $23,%r14
+ and %r8,%r15 # (f^g)&e
+ mov %r12,0(%rsp)
+
+ xor %r14,%r13 # Sigma1(e)
+ xor %r10,%r15 # Ch(e,f,g)=((f^g)&e)^g
+ add %r11,%r12 # T1+=h
+
+ mov %rax,%r11
+ add %r13,%r12 # T1+=Sigma1(e)
+
+ add %r15,%r12 # T1+=Ch(e,f,g)
+ mov %rax,%r13
+ mov %rax,%r14
+
+ ror $28,%r11
+ ror $34,%r13
+ mov %rax,%r15
+ add (%rbp,%rdi,8),%r12 # T1+=K[round]
+
+ xor %r13,%r11
+ ror $5,%r13
+ or %rcx,%r14 # a|c
+
+ xor %r13,%r11 # h=Sigma0(a)
+ and %rcx,%r15 # a&c
+ add %r12,%rdx # d+=T1
+
+ and %rbx,%r14 # (a|c)&b
+ add %r12,%r11 # h+=T1
+
+ or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14,%r11 # h+=Maj(a,b,c)
+ mov 8*1(%rsi),%r12
+ bswap %r12
+ mov %rdx,%r13
+ mov %rdx,%r14
+ mov %r8,%r15
+
+ ror $14,%r13
+ ror $18,%r14
+ xor %r9,%r15 # f^g
+
+ xor %r14,%r13
+ ror $23,%r14
+ and %rdx,%r15 # (f^g)&e
+ mov %r12,8(%rsp)
+
+ xor %r14,%r13 # Sigma1(e)
+ xor %r9,%r15 # Ch(e,f,g)=((f^g)&e)^g
+ add %r10,%r12 # T1+=h
+
+ mov %r11,%r10
+ add %r13,%r12 # T1+=Sigma1(e)
+
+ add %r15,%r12 # T1+=Ch(e,f,g)
+ mov %r11,%r13
+ mov %r11,%r14
+
+ ror $28,%r10
+ ror $34,%r13
+ mov %r11,%r15
+ add (%rbp,%rdi,8),%r12 # T1+=K[round]
+
+ xor %r13,%r10
+ ror $5,%r13
+ or %rbx,%r14 # a|c
+
+ xor %r13,%r10 # h=Sigma0(a)
+ and %rbx,%r15 # a&c
+ add %r12,%rcx # d+=T1
+
+ and %rax,%r14 # (a|c)&b
+ add %r12,%r10 # h+=T1
+
+ or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14,%r10 # h+=Maj(a,b,c)
+ mov 8*2(%rsi),%r12
+ bswap %r12
+ mov %rcx,%r13
+ mov %rcx,%r14
+ mov %rdx,%r15
+
+ ror $14,%r13
+ ror $18,%r14
+ xor %r8,%r15 # f^g
+
+ xor %r14,%r13
+ ror $23,%r14
+ and %rcx,%r15 # (f^g)&e
+ mov %r12,16(%rsp)
+
+ xor %r14,%r13 # Sigma1(e)
+ xor %r8,%r15 # Ch(e,f,g)=((f^g)&e)^g
+ add %r9,%r12 # T1+=h
+
+ mov %r10,%r9
+ add %r13,%r12 # T1+=Sigma1(e)
+
+ add %r15,%r12 # T1+=Ch(e,f,g)
+ mov %r10,%r13
+ mov %r10,%r14
+
+ ror $28,%r9
+ ror $34,%r13
+ mov %r10,%r15
+ add (%rbp,%rdi,8),%r12 # T1+=K[round]
+
+ xor %r13,%r9
+ ror $5,%r13
+ or %rax,%r14 # a|c
+
+ xor %r13,%r9 # h=Sigma0(a)
+ and %rax,%r15 # a&c
+ add %r12,%rbx # d+=T1
+
+ and %r11,%r14 # (a|c)&b
+ add %r12,%r9 # h+=T1
+
+ or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14,%r9 # h+=Maj(a,b,c)
+ mov 8*3(%rsi),%r12
+ bswap %r12
+ mov %rbx,%r13
+ mov %rbx,%r14
+ mov %rcx,%r15
+
+ ror $14,%r13
+ ror $18,%r14
+ xor %rdx,%r15 # f^g
+
+ xor %r14,%r13
+ ror $23,%r14
+ and %rbx,%r15 # (f^g)&e
+ mov %r12,24(%rsp)
+
+ xor %r14,%r13 # Sigma1(e)
+ xor %rdx,%r15 # Ch(e,f,g)=((f^g)&e)^g
+ add %r8,%r12 # T1+=h
+
+ mov %r9,%r8
+ add %r13,%r12 # T1+=Sigma1(e)
+
+ add %r15,%r12 # T1+=Ch(e,f,g)
+ mov %r9,%r13
+ mov %r9,%r14
+
+ ror $28,%r8
+ ror $34,%r13
+ mov %r9,%r15
+ add (%rbp,%rdi,8),%r12 # T1+=K[round]
+
+ xor %r13,%r8
+ ror $5,%r13
+ or %r11,%r14 # a|c
+
+ xor %r13,%r8 # h=Sigma0(a)
+ and %r11,%r15 # a&c
+ add %r12,%rax # d+=T1
+
+ and %r10,%r14 # (a|c)&b
+ add %r12,%r8 # h+=T1
+
+ or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14,%r8 # h+=Maj(a,b,c)
+ mov 8*4(%rsi),%r12
+ bswap %r12
+ mov %rax,%r13
+ mov %rax,%r14
+ mov %rbx,%r15
+
+ ror $14,%r13
+ ror $18,%r14
+ xor %rcx,%r15 # f^g
+
+ xor %r14,%r13
+ ror $23,%r14
+ and %rax,%r15 # (f^g)&e
+ mov %r12,32(%rsp)
+
+ xor %r14,%r13 # Sigma1(e)
+ xor %rcx,%r15 # Ch(e,f,g)=((f^g)&e)^g
+ add %rdx,%r12 # T1+=h
+
+ mov %r8,%rdx
+ add %r13,%r12 # T1+=Sigma1(e)
+
+ add %r15,%r12 # T1+=Ch(e,f,g)
+ mov %r8,%r13
+ mov %r8,%r14
+
+ ror $28,%rdx
+ ror $34,%r13
+ mov %r8,%r15
+ add (%rbp,%rdi,8),%r12 # T1+=K[round]
+
+ xor %r13,%rdx
+ ror $5,%r13
+ or %r10,%r14 # a|c
+
+ xor %r13,%rdx # h=Sigma0(a)
+ and %r10,%r15 # a&c
+ add %r12,%r11 # d+=T1
+
+ and %r9,%r14 # (a|c)&b
+ add %r12,%rdx # h+=T1
+
+ or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14,%rdx # h+=Maj(a,b,c)
+ mov 8*5(%rsi),%r12
+ bswap %r12
+ mov %r11,%r13
+ mov %r11,%r14
+ mov %rax,%r15
+
+ ror $14,%r13
+ ror $18,%r14
+ xor %rbx,%r15 # f^g
+
+ xor %r14,%r13
+ ror $23,%r14
+ and %r11,%r15 # (f^g)&e
+ mov %r12,40(%rsp)
+
+ xor %r14,%r13 # Sigma1(e)
+ xor %rbx,%r15 # Ch(e,f,g)=((f^g)&e)^g
+ add %rcx,%r12 # T1+=h
+
+ mov %rdx,%rcx
+ add %r13,%r12 # T1+=Sigma1(e)
+
+ add %r15,%r12 # T1+=Ch(e,f,g)
+ mov %rdx,%r13
+ mov %rdx,%r14
+
+ ror $28,%rcx
+ ror $34,%r13
+ mov %rdx,%r15
+ add (%rbp,%rdi,8),%r12 # T1+=K[round]
+
+ xor %r13,%rcx
+ ror $5,%r13
+ or %r9,%r14 # a|c
+
+ xor %r13,%rcx # h=Sigma0(a)
+ and %r9,%r15 # a&c
+ add %r12,%r10 # d+=T1
+
+ and %r8,%r14 # (a|c)&b
+ add %r12,%rcx # h+=T1
+
+ or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14,%rcx # h+=Maj(a,b,c)
+ mov 8*6(%rsi),%r12
+ bswap %r12
+ mov %r10,%r13
+ mov %r10,%r14
+ mov %r11,%r15
+
+ ror $14,%r13
+ ror $18,%r14
+ xor %rax,%r15 # f^g
+
+ xor %r14,%r13
+ ror $23,%r14
+ and %r10,%r15 # (f^g)&e
+ mov %r12,48(%rsp)
+
+ xor %r14,%r13 # Sigma1(e)
+ xor %rax,%r15 # Ch(e,f,g)=((f^g)&e)^g
+ add %rbx,%r12 # T1+=h
+
+ mov %rcx,%rbx
+ add %r13,%r12 # T1+=Sigma1(e)
+
+ add %r15,%r12 # T1+=Ch(e,f,g)
+ mov %rcx,%r13
+ mov %rcx,%r14
+
+ ror $28,%rbx
+ ror $34,%r13
+ mov %rcx,%r15
+ add (%rbp,%rdi,8),%r12 # T1+=K[round]
+
+ xor %r13,%rbx
+ ror $5,%r13
+ or %r8,%r14 # a|c
+
+ xor %r13,%rbx # h=Sigma0(a)
+ and %r8,%r15 # a&c
+ add %r12,%r9 # d+=T1
+
+ and %rdx,%r14 # (a|c)&b
+ add %r12,%rbx # h+=T1
+
+ or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14,%rbx # h+=Maj(a,b,c)
+ mov 8*7(%rsi),%r12
+ bswap %r12
+ mov %r9,%r13
+ mov %r9,%r14
+ mov %r10,%r15
+
+ ror $14,%r13
+ ror $18,%r14
+ xor %r11,%r15 # f^g
+
+ xor %r14,%r13
+ ror $23,%r14
+ and %r9,%r15 # (f^g)&e
+ mov %r12,56(%rsp)
+
+ xor %r14,%r13 # Sigma1(e)
+ xor %r11,%r15 # Ch(e,f,g)=((f^g)&e)^g
+ add %rax,%r12 # T1+=h
+
+ mov %rbx,%rax
+ add %r13,%r12 # T1+=Sigma1(e)
+
+ add %r15,%r12 # T1+=Ch(e,f,g)
+ mov %rbx,%r13
+ mov %rbx,%r14
+
+ ror $28,%rax
+ ror $34,%r13
+ mov %rbx,%r15
+ add (%rbp,%rdi,8),%r12 # T1+=K[round]
+
+ xor %r13,%rax
+ ror $5,%r13
+ or %rdx,%r14 # a|c
+
+ xor %r13,%rax # h=Sigma0(a)
+ and %rdx,%r15 # a&c
+ add %r12,%r8 # d+=T1
+
+ and %rcx,%r14 # (a|c)&b
+ add %r12,%rax # h+=T1
+
+ or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14,%rax # h+=Maj(a,b,c)
+ mov 8*8(%rsi),%r12
+ bswap %r12
+ mov %r8,%r13
+ mov %r8,%r14
+ mov %r9,%r15
+
+ ror $14,%r13
+ ror $18,%r14
+ xor %r10,%r15 # f^g
+
+ xor %r14,%r13
+ ror $23,%r14
+ and %r8,%r15 # (f^g)&e
+ mov %r12,64(%rsp)
+
+ xor %r14,%r13 # Sigma1(e)
+ xor %r10,%r15 # Ch(e,f,g)=((f^g)&e)^g
+ add %r11,%r12 # T1+=h
+
+ mov %rax,%r11
+ add %r13,%r12 # T1+=Sigma1(e)
+
+ add %r15,%r12 # T1+=Ch(e,f,g)
+ mov %rax,%r13
+ mov %rax,%r14
+
+ ror $28,%r11
+ ror $34,%r13
+ mov %rax,%r15
+ add (%rbp,%rdi,8),%r12 # T1+=K[round]
+
+ xor %r13,%r11
+ ror $5,%r13
+ or %rcx,%r14 # a|c
+
+ xor %r13,%r11 # h=Sigma0(a)
+ and %rcx,%r15 # a&c
+ add %r12,%rdx # d+=T1
+
+ and %rbx,%r14 # (a|c)&b
+ add %r12,%r11 # h+=T1
+
+ or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14,%r11 # h+=Maj(a,b,c)
+ mov 8*9(%rsi),%r12
+ bswap %r12
+ mov %rdx,%r13
+ mov %rdx,%r14
+ mov %r8,%r15
+
+ ror $14,%r13
+ ror $18,%r14
+ xor %r9,%r15 # f^g
+
+ xor %r14,%r13
+ ror $23,%r14
+ and %rdx,%r15 # (f^g)&e
+ mov %r12,72(%rsp)
+
+ xor %r14,%r13 # Sigma1(e)
+ xor %r9,%r15 # Ch(e,f,g)=((f^g)&e)^g
+ add %r10,%r12 # T1+=h
+
+ mov %r11,%r10
+ add %r13,%r12 # T1+=Sigma1(e)
+
+ add %r15,%r12 # T1+=Ch(e,f,g)
+ mov %r11,%r13
+ mov %r11,%r14
+
+ ror $28,%r10
+ ror $34,%r13
+ mov %r11,%r15
+ add (%rbp,%rdi,8),%r12 # T1+=K[round]
+
+ xor %r13,%r10
+ ror $5,%r13
+ or %rbx,%r14 # a|c
+
+ xor %r13,%r10 # h=Sigma0(a)
+ and %rbx,%r15 # a&c
+ add %r12,%rcx # d+=T1
+
+ and %rax,%r14 # (a|c)&b
+ add %r12,%r10 # h+=T1
+
+ or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14,%r10 # h+=Maj(a,b,c)
+ mov 8*10(%rsi),%r12
+ bswap %r12
+ mov %rcx,%r13
+ mov %rcx,%r14
+ mov %rdx,%r15
+
+ ror $14,%r13
+ ror $18,%r14
+ xor %r8,%r15 # f^g
+
+ xor %r14,%r13
+ ror $23,%r14
+ and %rcx,%r15 # (f^g)&e
+ mov %r12,80(%rsp)
+
+ xor %r14,%r13 # Sigma1(e)
+ xor %r8,%r15 # Ch(e,f,g)=((f^g)&e)^g
+ add %r9,%r12 # T1+=h
+
+ mov %r10,%r9
+ add %r13,%r12 # T1+=Sigma1(e)
+
+ add %r15,%r12 # T1+=Ch(e,f,g)
+ mov %r10,%r13
+ mov %r10,%r14
+
+ ror $28,%r9
+ ror $34,%r13
+ mov %r10,%r15
+ add (%rbp,%rdi,8),%r12 # T1+=K[round]
+
+ xor %r13,%r9
+ ror $5,%r13
+ or %rax,%r14 # a|c
+
+ xor %r13,%r9 # h=Sigma0(a)
+ and %rax,%r15 # a&c
+ add %r12,%rbx # d+=T1
+
+ and %r11,%r14 # (a|c)&b
+ add %r12,%r9 # h+=T1
+
+ or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14,%r9 # h+=Maj(a,b,c)
+ mov 8*11(%rsi),%r12
+ bswap %r12
+ mov %rbx,%r13
+ mov %rbx,%r14
+ mov %rcx,%r15
+
+ ror $14,%r13
+ ror $18,%r14
+ xor %rdx,%r15 # f^g
+
+ xor %r14,%r13
+ ror $23,%r14
+ and %rbx,%r15 # (f^g)&e
+ mov %r12,88(%rsp)
+
+ xor %r14,%r13 # Sigma1(e)
+ xor %rdx,%r15 # Ch(e,f,g)=((f^g)&e)^g
+ add %r8,%r12 # T1+=h
+
+ mov %r9,%r8
+ add %r13,%r12 # T1+=Sigma1(e)
+
+ add %r15,%r12 # T1+=Ch(e,f,g)
+ mov %r9,%r13
+ mov %r9,%r14
+
+ ror $28,%r8
+ ror $34,%r13
+ mov %r9,%r15
+ add (%rbp,%rdi,8),%r12 # T1+=K[round]
+
+ xor %r13,%r8
+ ror $5,%r13
+ or %r11,%r14 # a|c
+
+ xor %r13,%r8 # h=Sigma0(a)
+ and %r11,%r15 # a&c
+ add %r12,%rax # d+=T1
+
+ and %r10,%r14 # (a|c)&b
+ add %r12,%r8 # h+=T1
+
+ or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14,%r8 # h+=Maj(a,b,c)
+ mov 8*12(%rsi),%r12
+ bswap %r12
+ mov %rax,%r13
+ mov %rax,%r14
+ mov %rbx,%r15
+
+ ror $14,%r13
+ ror $18,%r14
+ xor %rcx,%r15 # f^g
+
+ xor %r14,%r13
+ ror $23,%r14
+ and %rax,%r15 # (f^g)&e
+ mov %r12,96(%rsp)
+
+ xor %r14,%r13 # Sigma1(e)
+ xor %rcx,%r15 # Ch(e,f,g)=((f^g)&e)^g
+ add %rdx,%r12 # T1+=h
+
+ mov %r8,%rdx
+ add %r13,%r12 # T1+=Sigma1(e)
+
+ add %r15,%r12 # T1+=Ch(e,f,g)
+ mov %r8,%r13
+ mov %r8,%r14
+
+ ror $28,%rdx
+ ror $34,%r13
+ mov %r8,%r15
+ add (%rbp,%rdi,8),%r12 # T1+=K[round]
+
+ xor %r13,%rdx
+ ror $5,%r13
+ or %r10,%r14 # a|c
+
+ xor %r13,%rdx # h=Sigma0(a)
+ and %r10,%r15 # a&c
+ add %r12,%r11 # d+=T1
+
+ and %r9,%r14 # (a|c)&b
+ add %r12,%rdx # h+=T1
+
+ or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14,%rdx # h+=Maj(a,b,c)
+ mov 8*13(%rsi),%r12
+ bswap %r12
+ mov %r11,%r13
+ mov %r11,%r14
+ mov %rax,%r15
+
+ ror $14,%r13
+ ror $18,%r14
+ xor %rbx,%r15 # f^g
+
+ xor %r14,%r13
+ ror $23,%r14
+ and %r11,%r15 # (f^g)&e
+ mov %r12,104(%rsp)
+
+ xor %r14,%r13 # Sigma1(e)
+ xor %rbx,%r15 # Ch(e,f,g)=((f^g)&e)^g
+ add %rcx,%r12 # T1+=h
+
+ mov %rdx,%rcx
+ add %r13,%r12 # T1+=Sigma1(e)
+
+ add %r15,%r12 # T1+=Ch(e,f,g)
+ mov %rdx,%r13
+ mov %rdx,%r14
+
+ ror $28,%rcx
+ ror $34,%r13
+ mov %rdx,%r15
+ add (%rbp,%rdi,8),%r12 # T1+=K[round]
+
+ xor %r13,%rcx
+ ror $5,%r13
+ or %r9,%r14 # a|c
+
+ xor %r13,%rcx # h=Sigma0(a)
+ and %r9,%r15 # a&c
+ add %r12,%r10 # d+=T1
+
+ and %r8,%r14 # (a|c)&b
+ add %r12,%rcx # h+=T1
+
+ or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14,%rcx # h+=Maj(a,b,c)
+ mov 8*14(%rsi),%r12
+ bswap %r12
+ mov %r10,%r13
+ mov %r10,%r14
+ mov %r11,%r15
+
+ ror $14,%r13
+ ror $18,%r14
+ xor %rax,%r15 # f^g
+
+ xor %r14,%r13
+ ror $23,%r14
+ and %r10,%r15 # (f^g)&e
+ mov %r12,112(%rsp)
+
+ xor %r14,%r13 # Sigma1(e)
+ xor %rax,%r15 # Ch(e,f,g)=((f^g)&e)^g
+ add %rbx,%r12 # T1+=h
+
+ mov %rcx,%rbx
+ add %r13,%r12 # T1+=Sigma1(e)
+
+ add %r15,%r12 # T1+=Ch(e,f,g)
+ mov %rcx,%r13
+ mov %rcx,%r14
+
+ ror $28,%rbx
+ ror $34,%r13
+ mov %rcx,%r15
+ add (%rbp,%rdi,8),%r12 # T1+=K[round]
+
+ xor %r13,%rbx
+ ror $5,%r13
+ or %r8,%r14 # a|c
+
+ xor %r13,%rbx # h=Sigma0(a)
+ and %r8,%r15 # a&c
+ add %r12,%r9 # d+=T1
+
+ and %rdx,%r14 # (a|c)&b
+ add %r12,%rbx # h+=T1
+
+ or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14,%rbx # h+=Maj(a,b,c)
+ mov 8*15(%rsi),%r12
+ bswap %r12
+ mov %r9,%r13
+ mov %r9,%r14
+ mov %r10,%r15
+
+ ror $14,%r13
+ ror $18,%r14
+ xor %r11,%r15 # f^g
+
+ xor %r14,%r13
+ ror $23,%r14
+ and %r9,%r15 # (f^g)&e
+ mov %r12,120(%rsp)
+
+ xor %r14,%r13 # Sigma1(e)
+ xor %r11,%r15 # Ch(e,f,g)=((f^g)&e)^g
+ add %rax,%r12 # T1+=h
+
+ mov %rbx,%rax
+ add %r13,%r12 # T1+=Sigma1(e)
+
+ add %r15,%r12 # T1+=Ch(e,f,g)
+ mov %rbx,%r13
+ mov %rbx,%r14
+
+ ror $28,%rax
+ ror $34,%r13
+ mov %rbx,%r15
+ add (%rbp,%rdi,8),%r12 # T1+=K[round]
+
+ xor %r13,%rax
+ ror $5,%r13
+ or %rdx,%r14 # a|c
+
+ xor %r13,%rax # h=Sigma0(a)
+ and %rdx,%r15 # a&c
+ add %r12,%r8 # d+=T1
+
+ and %rcx,%r14 # (a|c)&b
+ add %r12,%rax # h+=T1
+
+ or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14,%rax # h+=Maj(a,b,c)
+ jmp .Lrounds_16_xx
+.align 16
+.Lrounds_16_xx:
+ mov 8(%rsp),%r13
+ mov 112(%rsp),%r12
+
+ mov %r13,%r15
+
+ shr $7,%r13
+ ror $1,%r15
+
+ xor %r15,%r13
+ ror $7,%r15
+
+ xor %r15,%r13 # sigma0(X[(i+1)&0xf])
+ mov %r12,%r14
+
+ shr $6,%r12
+ ror $19,%r14
+
+ xor %r14,%r12
+ ror $42,%r14
+
+ xor %r14,%r12 # sigma1(X[(i+14)&0xf])
+
+ add %r13,%r12
+
+ add 72(%rsp),%r12
+
+ add 0(%rsp),%r12
+ mov %r8,%r13
+ mov %r8,%r14
+ mov %r9,%r15
+
+ ror $14,%r13
+ ror $18,%r14
+ xor %r10,%r15 # f^g
+
+ xor %r14,%r13
+ ror $23,%r14
+ and %r8,%r15 # (f^g)&e
+ mov %r12,0(%rsp)
+
+ xor %r14,%r13 # Sigma1(e)
+ xor %r10,%r15 # Ch(e,f,g)=((f^g)&e)^g
+ add %r11,%r12 # T1+=h
+
+ mov %rax,%r11
+ add %r13,%r12 # T1+=Sigma1(e)
+
+ add %r15,%r12 # T1+=Ch(e,f,g)
+ mov %rax,%r13
+ mov %rax,%r14
+
+ ror $28,%r11
+ ror $34,%r13
+ mov %rax,%r15
+ add (%rbp,%rdi,8),%r12 # T1+=K[round]
+
+ xor %r13,%r11
+ ror $5,%r13
+ or %rcx,%r14 # a|c
+
+ xor %r13,%r11 # h=Sigma0(a)
+ and %rcx,%r15 # a&c
+ add %r12,%rdx # d+=T1
+
+ and %rbx,%r14 # (a|c)&b
+ add %r12,%r11 # h+=T1
+
+ or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14,%r11 # h+=Maj(a,b,c)
+ mov 16(%rsp),%r13
+ mov 120(%rsp),%r12
+
+ mov %r13,%r15
+
+ shr $7,%r13
+ ror $1,%r15
+
+ xor %r15,%r13
+ ror $7,%r15
+
+ xor %r15,%r13 # sigma0(X[(i+1)&0xf])
+ mov %r12,%r14
+
+ shr $6,%r12
+ ror $19,%r14
+
+ xor %r14,%r12
+ ror $42,%r14
+
+ xor %r14,%r12 # sigma1(X[(i+14)&0xf])
+
+ add %r13,%r12
+
+ add 80(%rsp),%r12
+
+ add 8(%rsp),%r12
+ mov %rdx,%r13
+ mov %rdx,%r14
+ mov %r8,%r15
+
+ ror $14,%r13
+ ror $18,%r14
+ xor %r9,%r15 # f^g
+
+ xor %r14,%r13
+ ror $23,%r14
+ and %rdx,%r15 # (f^g)&e
+ mov %r12,8(%rsp)
+
+ xor %r14,%r13 # Sigma1(e)
+ xor %r9,%r15 # Ch(e,f,g)=((f^g)&e)^g
+ add %r10,%r12 # T1+=h
+
+ mov %r11,%r10
+ add %r13,%r12 # T1+=Sigma1(e)
+
+ add %r15,%r12 # T1+=Ch(e,f,g)
+ mov %r11,%r13
+ mov %r11,%r14
+
+ ror $28,%r10
+ ror $34,%r13
+ mov %r11,%r15
+ add (%rbp,%rdi,8),%r12 # T1+=K[round]
+
+ xor %r13,%r10
+ ror $5,%r13
+ or %rbx,%r14 # a|c
+
+ xor %r13,%r10 # h=Sigma0(a)
+ and %rbx,%r15 # a&c
+ add %r12,%rcx # d+=T1
+
+ and %rax,%r14 # (a|c)&b
+ add %r12,%r10 # h+=T1
+
+ or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14,%r10 # h+=Maj(a,b,c)
+ mov 24(%rsp),%r13
+ mov 0(%rsp),%r12
+
+ mov %r13,%r15
+
+ shr $7,%r13
+ ror $1,%r15
+
+ xor %r15,%r13
+ ror $7,%r15
+
+ xor %r15,%r13 # sigma0(X[(i+1)&0xf])
+ mov %r12,%r14
+
+ shr $6,%r12
+ ror $19,%r14
+
+ xor %r14,%r12
+ ror $42,%r14
+
+ xor %r14,%r12 # sigma1(X[(i+14)&0xf])
+
+ add %r13,%r12
+
+ add 88(%rsp),%r12
+
+ add 16(%rsp),%r12
+ mov %rcx,%r13
+ mov %rcx,%r14
+ mov %rdx,%r15
+
+ ror $14,%r13
+ ror $18,%r14
+ xor %r8,%r15 # f^g
+
+ xor %r14,%r13
+ ror $23,%r14
+ and %rcx,%r15 # (f^g)&e
+ mov %r12,16(%rsp)
+
+ xor %r14,%r13 # Sigma1(e)
+ xor %r8,%r15 # Ch(e,f,g)=((f^g)&e)^g
+ add %r9,%r12 # T1+=h
+
+ mov %r10,%r9
+ add %r13,%r12 # T1+=Sigma1(e)
+
+ add %r15,%r12 # T1+=Ch(e,f,g)
+ mov %r10,%r13
+ mov %r10,%r14
+
+ ror $28,%r9
+ ror $34,%r13
+ mov %r10,%r15
+ add (%rbp,%rdi,8),%r12 # T1+=K[round]
+
+ xor %r13,%r9
+ ror $5,%r13
+ or %rax,%r14 # a|c
+
+ xor %r13,%r9 # h=Sigma0(a)
+ and %rax,%r15 # a&c
+ add %r12,%rbx # d+=T1
+
+ and %r11,%r14 # (a|c)&b
+ add %r12,%r9 # h+=T1
+
+ or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14,%r9 # h+=Maj(a,b,c)
+ mov 32(%rsp),%r13
+ mov 8(%rsp),%r12
+
+ mov %r13,%r15
+
+ shr $7,%r13
+ ror $1,%r15
+
+ xor %r15,%r13
+ ror $7,%r15
+
+ xor %r15,%r13 # sigma0(X[(i+1)&0xf])
+ mov %r12,%r14
+
+ shr $6,%r12
+ ror $19,%r14
+
+ xor %r14,%r12
+ ror $42,%r14
+
+ xor %r14,%r12 # sigma1(X[(i+14)&0xf])
+
+ add %r13,%r12
+
+ add 96(%rsp),%r12
+
+ add 24(%rsp),%r12
+ mov %rbx,%r13
+ mov %rbx,%r14
+ mov %rcx,%r15
+
+ ror $14,%r13
+ ror $18,%r14
+ xor %rdx,%r15 # f^g
+
+ xor %r14,%r13
+ ror $23,%r14
+ and %rbx,%r15 # (f^g)&e
+ mov %r12,24(%rsp)
+
+ xor %r14,%r13 # Sigma1(e)
+ xor %rdx,%r15 # Ch(e,f,g)=((f^g)&e)^g
+ add %r8,%r12 # T1+=h
+
+ mov %r9,%r8
+ add %r13,%r12 # T1+=Sigma1(e)
+
+ add %r15,%r12 # T1+=Ch(e,f,g)
+ mov %r9,%r13
+ mov %r9,%r14
+
+ ror $28,%r8
+ ror $34,%r13
+ mov %r9,%r15
+ add (%rbp,%rdi,8),%r12 # T1+=K[round]
+
+ xor %r13,%r8
+ ror $5,%r13
+ or %r11,%r14 # a|c
+
+ xor %r13,%r8 # h=Sigma0(a)
+ and %r11,%r15 # a&c
+ add %r12,%rax # d+=T1
+
+ and %r10,%r14 # (a|c)&b
+ add %r12,%r8 # h+=T1
+
+ or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14,%r8 # h+=Maj(a,b,c)
+ mov 40(%rsp),%r13
+ mov 16(%rsp),%r12
+
+ mov %r13,%r15
+
+ shr $7,%r13
+ ror $1,%r15
+
+ xor %r15,%r13
+ ror $7,%r15
+
+ xor %r15,%r13 # sigma0(X[(i+1)&0xf])
+ mov %r12,%r14
+
+ shr $6,%r12
+ ror $19,%r14
+
+ xor %r14,%r12
+ ror $42,%r14
+
+ xor %r14,%r12 # sigma1(X[(i+14)&0xf])
+
+ add %r13,%r12
+
+ add 104(%rsp),%r12
+
+ add 32(%rsp),%r12
+ mov %rax,%r13
+ mov %rax,%r14
+ mov %rbx,%r15
+
+ ror $14,%r13
+ ror $18,%r14
+ xor %rcx,%r15 # f^g
+
+ xor %r14,%r13
+ ror $23,%r14
+ and %rax,%r15 # (f^g)&e
+ mov %r12,32(%rsp)
+
+ xor %r14,%r13 # Sigma1(e)
+ xor %rcx,%r15 # Ch(e,f,g)=((f^g)&e)^g
+ add %rdx,%r12 # T1+=h
+
+ mov %r8,%rdx
+ add %r13,%r12 # T1+=Sigma1(e)
+
+ add %r15,%r12 # T1+=Ch(e,f,g)
+ mov %r8,%r13
+ mov %r8,%r14
+
+ ror $28,%rdx
+ ror $34,%r13
+ mov %r8,%r15
+ add (%rbp,%rdi,8),%r12 # T1+=K[round]
+
+ xor %r13,%rdx
+ ror $5,%r13
+ or %r10,%r14 # a|c
+
+ xor %r13,%rdx # h=Sigma0(a)
+ and %r10,%r15 # a&c
+ add %r12,%r11 # d+=T1
+
+ and %r9,%r14 # (a|c)&b
+ add %r12,%rdx # h+=T1
+
+ or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14,%rdx # h+=Maj(a,b,c)
+ mov 48(%rsp),%r13
+ mov 24(%rsp),%r12
+
+ mov %r13,%r15
+
+ shr $7,%r13
+ ror $1,%r15
+
+ xor %r15,%r13
+ ror $7,%r15
+
+ xor %r15,%r13 # sigma0(X[(i+1)&0xf])
+ mov %r12,%r14
+
+ shr $6,%r12
+ ror $19,%r14
+
+ xor %r14,%r12
+ ror $42,%r14
+
+ xor %r14,%r12 # sigma1(X[(i+14)&0xf])
+
+ add %r13,%r12
+
+ add 112(%rsp),%r12
+
+ add 40(%rsp),%r12
+ mov %r11,%r13
+ mov %r11,%r14
+ mov %rax,%r15
+
+ ror $14,%r13
+ ror $18,%r14
+ xor %rbx,%r15 # f^g
+
+ xor %r14,%r13
+ ror $23,%r14
+ and %r11,%r15 # (f^g)&e
+ mov %r12,40(%rsp)
+
+ xor %r14,%r13 # Sigma1(e)
+ xor %rbx,%r15 # Ch(e,f,g)=((f^g)&e)^g
+ add %rcx,%r12 # T1+=h
+
+ mov %rdx,%rcx
+ add %r13,%r12 # T1+=Sigma1(e)
+
+ add %r15,%r12 # T1+=Ch(e,f,g)
+ mov %rdx,%r13
+ mov %rdx,%r14
+
+ ror $28,%rcx
+ ror $34,%r13
+ mov %rdx,%r15
+ add (%rbp,%rdi,8),%r12 # T1+=K[round]
+
+ xor %r13,%rcx
+ ror $5,%r13
+ or %r9,%r14 # a|c
+
+ xor %r13,%rcx # h=Sigma0(a)
+ and %r9,%r15 # a&c
+ add %r12,%r10 # d+=T1
+
+ and %r8,%r14 # (a|c)&b
+ add %r12,%rcx # h+=T1
+
+ or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14,%rcx # h+=Maj(a,b,c)
+ mov 56(%rsp),%r13
+ mov 32(%rsp),%r12
+
+ mov %r13,%r15
+
+ shr $7,%r13
+ ror $1,%r15
+
+ xor %r15,%r13
+ ror $7,%r15
+
+ xor %r15,%r13 # sigma0(X[(i+1)&0xf])
+ mov %r12,%r14
+
+ shr $6,%r12
+ ror $19,%r14
+
+ xor %r14,%r12
+ ror $42,%r14
+
+ xor %r14,%r12 # sigma1(X[(i+14)&0xf])
+
+ add %r13,%r12
+
+ add 120(%rsp),%r12
+
+ add 48(%rsp),%r12
+ mov %r10,%r13
+ mov %r10,%r14
+ mov %r11,%r15
+
+ ror $14,%r13
+ ror $18,%r14
+ xor %rax,%r15 # f^g
+
+ xor %r14,%r13
+ ror $23,%r14
+ and %r10,%r15 # (f^g)&e
+ mov %r12,48(%rsp)
+
+ xor %r14,%r13 # Sigma1(e)
+ xor %rax,%r15 # Ch(e,f,g)=((f^g)&e)^g
+ add %rbx,%r12 # T1+=h
+
+ mov %rcx,%rbx
+ add %r13,%r12 # T1+=Sigma1(e)
+
+ add %r15,%r12 # T1+=Ch(e,f,g)
+ mov %rcx,%r13
+ mov %rcx,%r14
+
+ ror $28,%rbx
+ ror $34,%r13
+ mov %rcx,%r15
+ add (%rbp,%rdi,8),%r12 # T1+=K[round]
+
+ xor %r13,%rbx
+ ror $5,%r13
+ or %r8,%r14 # a|c
+
+ xor %r13,%rbx # h=Sigma0(a)
+ and %r8,%r15 # a&c
+ add %r12,%r9 # d+=T1
+
+ and %rdx,%r14 # (a|c)&b
+ add %r12,%rbx # h+=T1
+
+ or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14,%rbx # h+=Maj(a,b,c)
+ mov 64(%rsp),%r13
+ mov 40(%rsp),%r12
+
+ mov %r13,%r15
+
+ shr $7,%r13
+ ror $1,%r15
+
+ xor %r15,%r13
+ ror $7,%r15
+
+ xor %r15,%r13 # sigma0(X[(i+1)&0xf])
+ mov %r12,%r14
+
+ shr $6,%r12
+ ror $19,%r14
+
+ xor %r14,%r12
+ ror $42,%r14
+
+ xor %r14,%r12 # sigma1(X[(i+14)&0xf])
+
+ add %r13,%r12
+
+ add 0(%rsp),%r12
+
+ add 56(%rsp),%r12
+ mov %r9,%r13
+ mov %r9,%r14
+ mov %r10,%r15
+
+ ror $14,%r13
+ ror $18,%r14
+ xor %r11,%r15 # f^g
+
+ xor %r14,%r13
+ ror $23,%r14
+ and %r9,%r15 # (f^g)&e
+ mov %r12,56(%rsp)
+
+ xor %r14,%r13 # Sigma1(e)
+ xor %r11,%r15 # Ch(e,f,g)=((f^g)&e)^g
+ add %rax,%r12 # T1+=h
+
+ mov %rbx,%rax
+ add %r13,%r12 # T1+=Sigma1(e)
+
+ add %r15,%r12 # T1+=Ch(e,f,g)
+ mov %rbx,%r13
+ mov %rbx,%r14
+
+ ror $28,%rax
+ ror $34,%r13
+ mov %rbx,%r15
+ add (%rbp,%rdi,8),%r12 # T1+=K[round]
+
+ xor %r13,%rax
+ ror $5,%r13
+ or %rdx,%r14 # a|c
+
+ xor %r13,%rax # h=Sigma0(a)
+ and %rdx,%r15 # a&c
+ add %r12,%r8 # d+=T1
+
+ and %rcx,%r14 # (a|c)&b
+ add %r12,%rax # h+=T1
+
+ or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14,%rax # h+=Maj(a,b,c)
+ mov 72(%rsp),%r13
+ mov 48(%rsp),%r12
+
+ mov %r13,%r15
+
+ shr $7,%r13
+ ror $1,%r15
+
+ xor %r15,%r13
+ ror $7,%r15
+
+ xor %r15,%r13 # sigma0(X[(i+1)&0xf])
+ mov %r12,%r14
+
+ shr $6,%r12
+ ror $19,%r14
+
+ xor %r14,%r12
+ ror $42,%r14
+
+ xor %r14,%r12 # sigma1(X[(i+14)&0xf])
+
+ add %r13,%r12
+
+ add 8(%rsp),%r12
+
+ add 64(%rsp),%r12
+ mov %r8,%r13
+ mov %r8,%r14
+ mov %r9,%r15
+
+ ror $14,%r13
+ ror $18,%r14
+ xor %r10,%r15 # f^g
+
+ xor %r14,%r13
+ ror $23,%r14
+ and %r8,%r15 # (f^g)&e
+ mov %r12,64(%rsp)
+
+ xor %r14,%r13 # Sigma1(e)
+ xor %r10,%r15 # Ch(e,f,g)=((f^g)&e)^g
+ add %r11,%r12 # T1+=h
+
+ mov %rax,%r11
+ add %r13,%r12 # T1+=Sigma1(e)
+
+ add %r15,%r12 # T1+=Ch(e,f,g)
+ mov %rax,%r13
+ mov %rax,%r14
+
+ ror $28,%r11
+ ror $34,%r13
+ mov %rax,%r15
+ add (%rbp,%rdi,8),%r12 # T1+=K[round]
+
+ xor %r13,%r11
+ ror $5,%r13
+ or %rcx,%r14 # a|c
+
+ xor %r13,%r11 # h=Sigma0(a)
+ and %rcx,%r15 # a&c
+ add %r12,%rdx # d+=T1
+
+ and %rbx,%r14 # (a|c)&b
+ add %r12,%r11 # h+=T1
+
+ or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14,%r11 # h+=Maj(a,b,c)
+ mov 80(%rsp),%r13
+ mov 56(%rsp),%r12
+
+ mov %r13,%r15
+
+ shr $7,%r13
+ ror $1,%r15
+
+ xor %r15,%r13
+ ror $7,%r15
+
+ xor %r15,%r13 # sigma0(X[(i+1)&0xf])
+ mov %r12,%r14
+
+ shr $6,%r12
+ ror $19,%r14
+
+ xor %r14,%r12
+ ror $42,%r14
+
+ xor %r14,%r12 # sigma1(X[(i+14)&0xf])
+
+ add %r13,%r12
+
+ add 16(%rsp),%r12
+
+ add 72(%rsp),%r12
+ mov %rdx,%r13
+ mov %rdx,%r14
+ mov %r8,%r15
+
+ ror $14,%r13
+ ror $18,%r14
+ xor %r9,%r15 # f^g
+
+ xor %r14,%r13
+ ror $23,%r14
+ and %rdx,%r15 # (f^g)&e
+ mov %r12,72(%rsp)
+
+ xor %r14,%r13 # Sigma1(e)
+ xor %r9,%r15 # Ch(e,f,g)=((f^g)&e)^g
+ add %r10,%r12 # T1+=h
+
+ mov %r11,%r10
+ add %r13,%r12 # T1+=Sigma1(e)
+
+ add %r15,%r12 # T1+=Ch(e,f,g)
+ mov %r11,%r13
+ mov %r11,%r14
+
+ ror $28,%r10
+ ror $34,%r13
+ mov %r11,%r15
+ add (%rbp,%rdi,8),%r12 # T1+=K[round]
+
+ xor %r13,%r10
+ ror $5,%r13
+ or %rbx,%r14 # a|c
+
+ xor %r13,%r10 # h=Sigma0(a)
+ and %rbx,%r15 # a&c
+ add %r12,%rcx # d+=T1
+
+ and %rax,%r14 # (a|c)&b
+ add %r12,%r10 # h+=T1
+
+ or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14,%r10 # h+=Maj(a,b,c)
+ mov 88(%rsp),%r13
+ mov 64(%rsp),%r12
+
+ mov %r13,%r15
+
+ shr $7,%r13
+ ror $1,%r15
+
+ xor %r15,%r13
+ ror $7,%r15
+
+ xor %r15,%r13 # sigma0(X[(i+1)&0xf])
+ mov %r12,%r14
+
+ shr $6,%r12
+ ror $19,%r14
+
+ xor %r14,%r12
+ ror $42,%r14
+
+ xor %r14,%r12 # sigma1(X[(i+14)&0xf])
+
+ add %r13,%r12
+
+ add 24(%rsp),%r12
+
+ add 80(%rsp),%r12
+ mov %rcx,%r13
+ mov %rcx,%r14
+ mov %rdx,%r15
+
+ ror $14,%r13
+ ror $18,%r14
+ xor %r8,%r15 # f^g
+
+ xor %r14,%r13
+ ror $23,%r14
+ and %rcx,%r15 # (f^g)&e
+ mov %r12,80(%rsp)
+
+ xor %r14,%r13 # Sigma1(e)
+ xor %r8,%r15 # Ch(e,f,g)=((f^g)&e)^g
+ add %r9,%r12 # T1+=h
+
+ mov %r10,%r9
+ add %r13,%r12 # T1+=Sigma1(e)
+
+ add %r15,%r12 # T1+=Ch(e,f,g)
+ mov %r10,%r13
+ mov %r10,%r14
+
+ ror $28,%r9
+ ror $34,%r13
+ mov %r10,%r15
+ add (%rbp,%rdi,8),%r12 # T1+=K[round]
+
+ xor %r13,%r9
+ ror $5,%r13
+ or %rax,%r14 # a|c
+
+ xor %r13,%r9 # h=Sigma0(a)
+ and %rax,%r15 # a&c
+ add %r12,%rbx # d+=T1
+
+ and %r11,%r14 # (a|c)&b
+ add %r12,%r9 # h+=T1
+
+ or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14,%r9 # h+=Maj(a,b,c)
+ mov 96(%rsp),%r13
+ mov 72(%rsp),%r12
+
+ mov %r13,%r15
+
+ shr $7,%r13
+ ror $1,%r15
+
+ xor %r15,%r13
+ ror $7,%r15
+
+ xor %r15,%r13 # sigma0(X[(i+1)&0xf])
+ mov %r12,%r14
+
+ shr $6,%r12
+ ror $19,%r14
+
+ xor %r14,%r12
+ ror $42,%r14
+
+ xor %r14,%r12 # sigma1(X[(i+14)&0xf])
+
+ add %r13,%r12
+
+ add 32(%rsp),%r12
+
+ add 88(%rsp),%r12
+ mov %rbx,%r13
+ mov %rbx,%r14
+ mov %rcx,%r15
+
+ ror $14,%r13
+ ror $18,%r14
+ xor %rdx,%r15 # f^g
+
+ xor %r14,%r13
+ ror $23,%r14
+ and %rbx,%r15 # (f^g)&e
+ mov %r12,88(%rsp)
+
+ xor %r14,%r13 # Sigma1(e)
+ xor %rdx,%r15 # Ch(e,f,g)=((f^g)&e)^g
+ add %r8,%r12 # T1+=h
+
+ mov %r9,%r8
+ add %r13,%r12 # T1+=Sigma1(e)
+
+ add %r15,%r12 # T1+=Ch(e,f,g)
+ mov %r9,%r13
+ mov %r9,%r14
+
+ ror $28,%r8
+ ror $34,%r13
+ mov %r9,%r15
+ add (%rbp,%rdi,8),%r12 # T1+=K[round]
+
+ xor %r13,%r8
+ ror $5,%r13
+ or %r11,%r14 # a|c
+
+ xor %r13,%r8 # h=Sigma0(a)
+ and %r11,%r15 # a&c
+ add %r12,%rax # d+=T1
+
+ and %r10,%r14 # (a|c)&b
+ add %r12,%r8 # h+=T1
+
+ or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14,%r8 # h+=Maj(a,b,c)
+ mov 104(%rsp),%r13
+ mov 80(%rsp),%r12
+
+ mov %r13,%r15
+
+ shr $7,%r13
+ ror $1,%r15
+
+ xor %r15,%r13
+ ror $7,%r15
+
+ xor %r15,%r13 # sigma0(X[(i+1)&0xf])
+ mov %r12,%r14
+
+ shr $6,%r12
+ ror $19,%r14
+
+ xor %r14,%r12
+ ror $42,%r14
+
+ xor %r14,%r12 # sigma1(X[(i+14)&0xf])
+
+ add %r13,%r12
+
+ add 40(%rsp),%r12
+
+ add 96(%rsp),%r12
+ mov %rax,%r13
+ mov %rax,%r14
+ mov %rbx,%r15
+
+ ror $14,%r13
+ ror $18,%r14
+ xor %rcx,%r15 # f^g
+
+ xor %r14,%r13
+ ror $23,%r14
+ and %rax,%r15 # (f^g)&e
+ mov %r12,96(%rsp)
+
+ xor %r14,%r13 # Sigma1(e)
+ xor %rcx,%r15 # Ch(e,f,g)=((f^g)&e)^g
+ add %rdx,%r12 # T1+=h
+
+ mov %r8,%rdx
+ add %r13,%r12 # T1+=Sigma1(e)
+
+ add %r15,%r12 # T1+=Ch(e,f,g)
+ mov %r8,%r13
+ mov %r8,%r14
+
+ ror $28,%rdx
+ ror $34,%r13
+ mov %r8,%r15
+ add (%rbp,%rdi,8),%r12 # T1+=K[round]
+
+ xor %r13,%rdx
+ ror $5,%r13
+ or %r10,%r14 # a|c
+
+ xor %r13,%rdx # h=Sigma0(a)
+ and %r10,%r15 # a&c
+ add %r12,%r11 # d+=T1
+
+ and %r9,%r14 # (a|c)&b
+ add %r12,%rdx # h+=T1
+
+ or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14,%rdx # h+=Maj(a,b,c)
+ mov 112(%rsp),%r13
+ mov 88(%rsp),%r12
+
+ mov %r13,%r15
+
+ shr $7,%r13
+ ror $1,%r15
+
+ xor %r15,%r13
+ ror $7,%r15
+
+ xor %r15,%r13 # sigma0(X[(i+1)&0xf])
+ mov %r12,%r14
+
+ shr $6,%r12
+ ror $19,%r14
+
+ xor %r14,%r12
+ ror $42,%r14
+
+ xor %r14,%r12 # sigma1(X[(i+14)&0xf])
+
+ add %r13,%r12
+
+ add 48(%rsp),%r12
+
+ add 104(%rsp),%r12
+ mov %r11,%r13
+ mov %r11,%r14
+ mov %rax,%r15
+
+ ror $14,%r13
+ ror $18,%r14
+ xor %rbx,%r15 # f^g
+
+ xor %r14,%r13
+ ror $23,%r14
+ and %r11,%r15 # (f^g)&e
+ mov %r12,104(%rsp)
+
+ xor %r14,%r13 # Sigma1(e)
+ xor %rbx,%r15 # Ch(e,f,g)=((f^g)&e)^g
+ add %rcx,%r12 # T1+=h
+
+ mov %rdx,%rcx
+ add %r13,%r12 # T1+=Sigma1(e)
+
+ add %r15,%r12 # T1+=Ch(e,f,g)
+ mov %rdx,%r13
+ mov %rdx,%r14
+
+ ror $28,%rcx
+ ror $34,%r13
+ mov %rdx,%r15
+ add (%rbp,%rdi,8),%r12 # T1+=K[round]
+
+ xor %r13,%rcx
+ ror $5,%r13
+ or %r9,%r14 # a|c
+
+ xor %r13,%rcx # h=Sigma0(a)
+ and %r9,%r15 # a&c
+ add %r12,%r10 # d+=T1
+
+ and %r8,%r14 # (a|c)&b
+ add %r12,%rcx # h+=T1
+
+ or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14,%rcx # h+=Maj(a,b,c)
+ mov 120(%rsp),%r13
+ mov 96(%rsp),%r12
+
+ mov %r13,%r15
+
+ shr $7,%r13
+ ror $1,%r15
+
+ xor %r15,%r13
+ ror $7,%r15
+
+ xor %r15,%r13 # sigma0(X[(i+1)&0xf])
+ mov %r12,%r14
+
+ shr $6,%r12
+ ror $19,%r14
+
+ xor %r14,%r12
+ ror $42,%r14
+
+ xor %r14,%r12 # sigma1(X[(i+14)&0xf])
+
+ add %r13,%r12
+
+ add 56(%rsp),%r12
+
+ add 112(%rsp),%r12
+ mov %r10,%r13
+ mov %r10,%r14
+ mov %r11,%r15
+
+ ror $14,%r13
+ ror $18,%r14
+ xor %rax,%r15 # f^g
+
+ xor %r14,%r13
+ ror $23,%r14
+ and %r10,%r15 # (f^g)&e
+ mov %r12,112(%rsp)
+
+ xor %r14,%r13 # Sigma1(e)
+ xor %rax,%r15 # Ch(e,f,g)=((f^g)&e)^g
+ add %rbx,%r12 # T1+=h
+
+ mov %rcx,%rbx
+ add %r13,%r12 # T1+=Sigma1(e)
+
+ add %r15,%r12 # T1+=Ch(e,f,g)
+ mov %rcx,%r13
+ mov %rcx,%r14
+
+ ror $28,%rbx
+ ror $34,%r13
+ mov %rcx,%r15
+ add (%rbp,%rdi,8),%r12 # T1+=K[round]
+
+ xor %r13,%rbx
+ ror $5,%r13
+ or %r8,%r14 # a|c
+
+ xor %r13,%rbx # h=Sigma0(a)
+ and %r8,%r15 # a&c
+ add %r12,%r9 # d+=T1
+
+ and %rdx,%r14 # (a|c)&b
+ add %r12,%rbx # h+=T1
+
+ or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14,%rbx # h+=Maj(a,b,c)
+ mov 0(%rsp),%r13
+ mov 104(%rsp),%r12
+
+ mov %r13,%r15
+
+ shr $7,%r13
+ ror $1,%r15
+
+ xor %r15,%r13
+ ror $7,%r15
+
+ xor %r15,%r13 # sigma0(X[(i+1)&0xf])
+ mov %r12,%r14
+
+ shr $6,%r12
+ ror $19,%r14
+
+ xor %r14,%r12
+ ror $42,%r14
+
+ xor %r14,%r12 # sigma1(X[(i+14)&0xf])
+
+ add %r13,%r12
+
+ add 64(%rsp),%r12
+
+ add 120(%rsp),%r12
+ mov %r9,%r13
+ mov %r9,%r14
+ mov %r10,%r15
+
+ ror $14,%r13
+ ror $18,%r14
+ xor %r11,%r15 # f^g
+
+ xor %r14,%r13
+ ror $23,%r14
+ and %r9,%r15 # (f^g)&e
+ mov %r12,120(%rsp)
+
+ xor %r14,%r13 # Sigma1(e)
+ xor %r11,%r15 # Ch(e,f,g)=((f^g)&e)^g
+ add %rax,%r12 # T1+=h
+
+ mov %rbx,%rax
+ add %r13,%r12 # T1+=Sigma1(e)
+
+ add %r15,%r12 # T1+=Ch(e,f,g)
+ mov %rbx,%r13
+ mov %rbx,%r14
+
+ ror $28,%rax
+ ror $34,%r13
+ mov %rbx,%r15
+ add (%rbp,%rdi,8),%r12 # T1+=K[round]
+
+ xor %r13,%rax
+ ror $5,%r13
+ or %rdx,%r14 # a|c
+
+ xor %r13,%rax # h=Sigma0(a)
+ and %rdx,%r15 # a&c
+ add %r12,%r8 # d+=T1
+
+ and %rcx,%r14 # (a|c)&b
+ add %r12,%rax # h+=T1
+
+ or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c)
+ lea 1(%rdi),%rdi # round++
+
+ add %r14,%rax # h+=Maj(a,b,c)
+ cmp $80,%rdi
+ jb .Lrounds_16_xx
+
+ mov 16*8+0*8(%rsp),%rdi
+ lea 16*8(%rsi),%rsi
+
+ add 8*0(%rdi),%rax
+ add 8*1(%rdi),%rbx
+ add 8*2(%rdi),%rcx
+ add 8*3(%rdi),%rdx
+ add 8*4(%rdi),%r8
+ add 8*5(%rdi),%r9
+ add 8*6(%rdi),%r10
+ add 8*7(%rdi),%r11
+
+ cmp 16*8+2*8(%rsp),%rsi
+
+ mov %rax,8*0(%rdi)
+ mov %rbx,8*1(%rdi)
+ mov %rcx,8*2(%rdi)
+ mov %rdx,8*3(%rdi)
+ mov %r8,8*4(%rdi)
+ mov %r9,8*5(%rdi)
+ mov %r10,8*6(%rdi)
+ mov %r11,8*7(%rdi)
+ jb .Lloop
+
+ mov 16*8+3*8(%rsp),%rsp
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+
+ ret
+SET_SIZE(SHA512TransformBlocks)
+
+.data
+.align 64
+.type K512,@object
+K512:
+ .quad 0x428a2f98d728ae22,0x7137449123ef65cd
+ .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
+ .quad 0x3956c25bf348b538,0x59f111f1b605d019
+ .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
+ .quad 0xd807aa98a3030242,0x12835b0145706fbe
+ .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
+ .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
+ .quad 0x9bdc06a725c71235,0xc19bf174cf692694
+ .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
+ .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
+ .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
+ .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
+ .quad 0x983e5152ee66dfab,0xa831c66d2db43210
+ .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
+ .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
+ .quad 0x06ca6351e003826f,0x142929670a0e6e70
+ .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
+ .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
+ .quad 0x650a73548baf63de,0x766a0abb3c77b2a8
+ .quad 0x81c2c92e47edaee6,0x92722c851482353b
+ .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
+ .quad 0xc24b8b70d0f89791,0xc76c51a30654be30
+ .quad 0xd192e819d6ef5218,0xd69906245565a910
+ .quad 0xf40e35855771202a,0x106aa07032bbd1b8
+ .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
+ .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
+ .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
+ .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
+ .quad 0x748f82ee5defb2fc,0x78a5636f43172f60
+ .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
+ .quad 0x90befffa23631e28,0xa4506cebde82bde9
+ .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
+ .quad 0xca273eceea26619c,0xd186b8c721c0c207
+ .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
+ .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
+ .quad 0x113f9804bef90dae,0x1b710b35131c471b
+ .quad 0x28db77f523047d84,0x32caab7b40c72493
+ .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
+ .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
+ .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
+#endif /* !lint && !__lint */
+
+#ifdef __ELF__
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/sys/contrib/openzfs/module/icp/core/kcf_callprov.c b/sys/contrib/openzfs/module/icp/core/kcf_callprov.c
new file mode 100644
index 000000000000..fd2f7e1aac3d
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/core/kcf_callprov.c
@@ -0,0 +1,1567 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/crypto/common.h>
+#include <sys/crypto/impl.h>
+#include <sys/crypto/sched_impl.h>
+
+static int kcf_emulate_dual(kcf_provider_desc_t *, crypto_ctx_t *,
+ kcf_req_params_t *);
+
+void
+kcf_free_triedlist(kcf_prov_tried_t *list)
+{
+ kcf_prov_tried_t *l;
+
+ while ((l = list) != NULL) {
+ list = list->pt_next;
+ KCF_PROV_REFRELE(l->pt_pd);
+ kmem_free(l, sizeof (kcf_prov_tried_t));
+ }
+}
+
+kcf_prov_tried_t *
+kcf_insert_triedlist(kcf_prov_tried_t **list, kcf_provider_desc_t *pd,
+ int kmflag)
+{
+ kcf_prov_tried_t *l;
+
+ l = kmem_alloc(sizeof (kcf_prov_tried_t), kmflag);
+ if (l == NULL)
+ return (NULL);
+
+ l->pt_pd = pd;
+ l->pt_next = *list;
+ *list = l;
+
+ return (l);
+}
+
+static boolean_t
+is_in_triedlist(kcf_provider_desc_t *pd, kcf_prov_tried_t *triedl)
+{
+ while (triedl != NULL) {
+ if (triedl->pt_pd == pd)
+ return (B_TRUE);
+ triedl = triedl->pt_next;
+	}
+
+ return (B_FALSE);
+}
+
+/*
+ * Search a mech entry's hardware provider list for the specified
+ * provider. Return true if found.
+ */
+static boolean_t
+is_valid_provider_for_mech(kcf_provider_desc_t *pd, kcf_mech_entry_t *me,
+ crypto_func_group_t fg)
+{
+ kcf_prov_mech_desc_t *prov_chain;
+
+ prov_chain = me->me_hw_prov_chain;
+ if (prov_chain != NULL) {
+ ASSERT(me->me_num_hwprov > 0);
+ for (; prov_chain != NULL; prov_chain = prov_chain->pm_next) {
+ if (prov_chain->pm_prov_desc == pd &&
+ IS_FG_SUPPORTED(prov_chain, fg)) {
+ return (B_TRUE);
+ }
+ }
+ }
+ return (B_FALSE);
+}
+
+/*
+ * This routine, given a logical provider, returns the least loaded real
+ * provider belonging to that logical provider. The provider must be able
+ * to do the specified mechanism, i.e. the mechanism must not have been
+ * disabled for it. In addition, just in case providers are not entirely
+ * equivalent, the caller passes the required function group (fg) so that
+ * the provider's support for it can be verified. On success a held
+ * provider is returned through *new; if no usable provider can be found,
+ * *new is set to NULL and an error code is returned.
+ */
+int
+kcf_get_hardware_provider(crypto_mech_type_t mech_type_1,
+ crypto_mech_type_t mech_type_2, boolean_t call_restrict,
+ kcf_provider_desc_t *old, kcf_provider_desc_t **new, crypto_func_group_t fg)
+{
+ kcf_provider_desc_t *provider, *real_pd = old;
+ kcf_provider_desc_t *gpd = NULL; /* good provider */
+ kcf_provider_desc_t *bpd = NULL; /* busy provider */
+ kcf_provider_list_t *p;
+ kcf_ops_class_t class;
+ kcf_mech_entry_t *me;
+ kcf_mech_entry_tab_t *me_tab;
+ int index, len, gqlen = INT_MAX, rv = CRYPTO_SUCCESS;
+
+ /* get the mech entry for the specified mechanism */
+ class = KCF_MECH2CLASS(mech_type_1);
+ if ((class < KCF_FIRST_OPSCLASS) || (class > KCF_LAST_OPSCLASS)) {
+ return (CRYPTO_MECHANISM_INVALID);
+ }
+
+ me_tab = &kcf_mech_tabs_tab[class];
+ index = KCF_MECH2INDEX(mech_type_1);
+ if ((index < 0) || (index >= me_tab->met_size)) {
+ return (CRYPTO_MECHANISM_INVALID);
+ }
+
+ me = &((me_tab->met_tab)[index]);
+ mutex_enter(&me->me_mutex);
+
+ /*
+ * We assume the provider descriptor will not go away because
+ * it is being held somewhere, i.e. its reference count has been
+ * incremented. In the case of the crypto module, the provider
+ * descriptor is held by the session structure.
+ */
+ if (old->pd_prov_type == CRYPTO_LOGICAL_PROVIDER) {
+ if (old->pd_provider_list == NULL) {
+ real_pd = NULL;
+ rv = CRYPTO_DEVICE_ERROR;
+ goto out;
+ }
+ /*
+ * Find the least loaded real provider. KCF_PROV_LOAD gives
+ * the load (number of pending requests) of the provider.
+ */
+ mutex_enter(&old->pd_lock);
+ p = old->pd_provider_list;
+ while (p != NULL) {
+ provider = p->pl_provider;
+
+ ASSERT(provider->pd_prov_type !=
+ CRYPTO_LOGICAL_PROVIDER);
+
+ if (call_restrict &&
+ (provider->pd_flags & KCF_PROV_RESTRICTED)) {
+ p = p->pl_next;
+ continue;
+ }
+
+ if (!is_valid_provider_for_mech(provider, me, fg)) {
+ p = p->pl_next;
+ continue;
+ }
+
+ /* provider does second mech */
+ if (mech_type_2 != CRYPTO_MECH_INVALID) {
+ int i;
+
+ i = KCF_TO_PROV_MECH_INDX(provider,
+ mech_type_2);
+ if (i == KCF_INVALID_INDX) {
+ p = p->pl_next;
+ continue;
+ }
+ }
+
+ if (provider->pd_state != KCF_PROV_READY) {
+ /* choose BUSY if no READY providers */
+ if (provider->pd_state == KCF_PROV_BUSY)
+ bpd = provider;
+ p = p->pl_next;
+ continue;
+ }
+
+ len = KCF_PROV_LOAD(provider);
+ if (len < gqlen) {
+ gqlen = len;
+ gpd = provider;
+ }
+
+ p = p->pl_next;
+ }
+
+ if (gpd != NULL) {
+ real_pd = gpd;
+ KCF_PROV_REFHOLD(real_pd);
+ } else if (bpd != NULL) {
+ real_pd = bpd;
+ KCF_PROV_REFHOLD(real_pd);
+ } else {
+ /* can't find provider */
+ real_pd = NULL;
+ rv = CRYPTO_MECHANISM_INVALID;
+ }
+ mutex_exit(&old->pd_lock);
+
+ } else {
+ if (!KCF_IS_PROV_USABLE(old) ||
+ (call_restrict && (old->pd_flags & KCF_PROV_RESTRICTED))) {
+ real_pd = NULL;
+ rv = CRYPTO_DEVICE_ERROR;
+ goto out;
+ }
+
+ if (!is_valid_provider_for_mech(old, me, fg)) {
+ real_pd = NULL;
+ rv = CRYPTO_MECHANISM_INVALID;
+ goto out;
+ }
+
+ KCF_PROV_REFHOLD(real_pd);
+ }
+out:
+ mutex_exit(&me->me_mutex);
+ *new = real_pd;
+ return (rv);
+}
+
+/*
+ * Return the best provider for the specified mechanism. The provider
+ * is held and it is the caller's responsibility to release it when done.
+ * The fg input argument is used as a search criterion to pick a provider.
+ * A provider has to support this function group to be picked.
+ *
+ * Find the least loaded provider in the list of providers. We do a linear
+ * search to find one. This is fine as we assume there are only a small
+ * number of providers in this list. If this assumption ever changes,
+ * we should revisit this.
+ *
+ * call_restrict indicates whether the caller must not be allowed to
+ * use restricted providers.
+ */
+kcf_provider_desc_t *
+kcf_get_mech_provider(crypto_mech_type_t mech_type, kcf_mech_entry_t **mepp,
+ int *error, kcf_prov_tried_t *triedl, crypto_func_group_t fg,
+ boolean_t call_restrict, size_t data_size)
+{
+ kcf_provider_desc_t *pd = NULL, *gpd = NULL;
+ kcf_prov_mech_desc_t *prov_chain, *mdesc;
+ int len, gqlen = INT_MAX;
+ kcf_ops_class_t class;
+ int index;
+ kcf_mech_entry_t *me;
+ kcf_mech_entry_tab_t *me_tab;
+
+ class = KCF_MECH2CLASS(mech_type);
+ if ((class < KCF_FIRST_OPSCLASS) || (class > KCF_LAST_OPSCLASS)) {
+ *error = CRYPTO_MECHANISM_INVALID;
+ return (NULL);
+ }
+
+ me_tab = &kcf_mech_tabs_tab[class];
+ index = KCF_MECH2INDEX(mech_type);
+ if ((index < 0) || (index >= me_tab->met_size)) {
+ *error = CRYPTO_MECHANISM_INVALID;
+ return (NULL);
+ }
+
+ me = &((me_tab->met_tab)[index]);
+ if (mepp != NULL)
+ *mepp = me;
+
+ mutex_enter(&me->me_mutex);
+
+ prov_chain = me->me_hw_prov_chain;
+
+ /*
+	 * We check the threshold for using a hardware provider for
+ * this amount of data. If there is no software provider available
+ * for the mechanism, then the threshold is ignored.
+ */
+ if ((prov_chain != NULL) &&
+ ((data_size == 0) || (me->me_threshold == 0) ||
+ (data_size >= me->me_threshold) ||
+ ((mdesc = me->me_sw_prov) == NULL) ||
+ (!IS_FG_SUPPORTED(mdesc, fg)) ||
+ (!KCF_IS_PROV_USABLE(mdesc->pm_prov_desc)))) {
+ ASSERT(me->me_num_hwprov > 0);
+ /* there is at least one provider */
+
+ /*
+ * Find the least loaded real provider. KCF_PROV_LOAD gives
+ * the load (number of pending requests) of the provider.
+ */
+ while (prov_chain != NULL) {
+ pd = prov_chain->pm_prov_desc;
+
+ if (!IS_FG_SUPPORTED(prov_chain, fg) ||
+ !KCF_IS_PROV_USABLE(pd) ||
+ IS_PROVIDER_TRIED(pd, triedl) ||
+ (call_restrict &&
+ (pd->pd_flags & KCF_PROV_RESTRICTED))) {
+ prov_chain = prov_chain->pm_next;
+ continue;
+ }
+
+ if ((len = KCF_PROV_LOAD(pd)) < gqlen) {
+ gqlen = len;
+ gpd = pd;
+ }
+
+ prov_chain = prov_chain->pm_next;
+ }
+
+ pd = gpd;
+ }
+
+ /* No HW provider for this mech, is there a SW provider? */
+ if (pd == NULL && (mdesc = me->me_sw_prov) != NULL) {
+ pd = mdesc->pm_prov_desc;
+ if (!IS_FG_SUPPORTED(mdesc, fg) ||
+ !KCF_IS_PROV_USABLE(pd) ||
+ IS_PROVIDER_TRIED(pd, triedl) ||
+ (call_restrict && (pd->pd_flags & KCF_PROV_RESTRICTED)))
+ pd = NULL;
+ }
+
+ if (pd == NULL) {
+ /*
+ * We do not want to report CRYPTO_MECH_NOT_SUPPORTED, when
+ * we are in the "fallback to the next provider" case. Rather
+ * we preserve the error, so that the client gets the right
+ * error code.
+ */
+ if (triedl == NULL)
+ *error = CRYPTO_MECH_NOT_SUPPORTED;
+ } else
+ KCF_PROV_REFHOLD(pd);
+
+ mutex_exit(&me->me_mutex);
+ return (pd);
+}
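+
+/*
+ * Illustrative sketch only (not part of the imported source): a typical
+ * consumer lookup and the matching release. The surrounding context is
+ * hypothetical; only the calls shown appear in this framework.
+ *
+ *	int error;
+ *	kcf_provider_desc_t *pd;
+ *
+ *	pd = kcf_get_mech_provider(mech->cm_type, NULL, &error, NULL,
+ *	    CRYPTO_FG_DIGEST_ATOMIC, B_FALSE, data->cd_length);
+ *	if (pd == NULL)
+ *		return (error);
+ *
+ *	(submit the request to pd, e.g. via kcf_submit_request())
+ *
+ *	KCF_PROV_REFRELE(pd);
+ */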
+
+/*
+ * Very similar to kcf_get_mech_provider(). Finds the best provider capable of
+ * a dual operation with both me1 and me2.
+ * When no dual-ops capable provider is available, returns the best provider
+ * for me1 only and sets *prov_mt2 to CRYPTO_MECH_INVALID.
+ * We assume that even a slower HW provider capable of the dual operation is
+ * still faster than using the two fastest providers for the individual ops
+ * separately.
+ */
+kcf_provider_desc_t *
+kcf_get_dual_provider(crypto_mechanism_t *mech1, crypto_mechanism_t *mech2,
+ kcf_mech_entry_t **mepp, crypto_mech_type_t *prov_mt1,
+ crypto_mech_type_t *prov_mt2, int *error, kcf_prov_tried_t *triedl,
+ crypto_func_group_t fg1, crypto_func_group_t fg2, boolean_t call_restrict,
+ size_t data_size)
+{
+ kcf_provider_desc_t *pd = NULL, *pdm1 = NULL, *pdm1m2 = NULL;
+ kcf_prov_mech_desc_t *prov_chain, *mdesc;
+ int len, gqlen = INT_MAX, dgqlen = INT_MAX;
+ crypto_mech_info_list_t *mil;
+ crypto_mech_type_t m2id = mech2->cm_type;
+ kcf_mech_entry_t *me;
+
+	/* when mech1 is a valid mechanism, me will be its mech_entry */
+ if (kcf_get_mech_entry(mech1->cm_type, &me) != KCF_SUCCESS) {
+ *error = CRYPTO_MECHANISM_INVALID;
+ return (NULL);
+ }
+
+ *prov_mt2 = CRYPTO_MECH_INVALID;
+
+ if (mepp != NULL)
+ *mepp = me;
+ mutex_enter(&me->me_mutex);
+
+ prov_chain = me->me_hw_prov_chain;
+ /*
+ * We check the threshold for using a hardware provider for
+ * this amount of data. If there is no software provider available
+ * for the first mechanism, then the threshold is ignored.
+ */
+ if ((prov_chain != NULL) &&
+ ((data_size == 0) || (me->me_threshold == 0) ||
+ (data_size >= me->me_threshold) ||
+ ((mdesc = me->me_sw_prov) == NULL) ||
+ (!IS_FG_SUPPORTED(mdesc, fg1)) ||
+ (!KCF_IS_PROV_USABLE(mdesc->pm_prov_desc)))) {
+ /* there is at least one provider */
+ ASSERT(me->me_num_hwprov > 0);
+
+ /*
+ * Find the least loaded provider capable of the combo
+ * me1 + me2, and save a pointer to the least loaded
+ * provider capable of me1 only.
+ */
+ while (prov_chain != NULL) {
+ pd = prov_chain->pm_prov_desc;
+ len = KCF_PROV_LOAD(pd);
+
+ if (!IS_FG_SUPPORTED(prov_chain, fg1) ||
+ !KCF_IS_PROV_USABLE(pd) ||
+ IS_PROVIDER_TRIED(pd, triedl) ||
+ (call_restrict &&
+ (pd->pd_flags & KCF_PROV_RESTRICTED))) {
+ prov_chain = prov_chain->pm_next;
+ continue;
+ }
+
+ /* Save the best provider capable of m1 */
+ if (len < gqlen) {
+ *prov_mt1 =
+ prov_chain->pm_mech_info.cm_mech_number;
+ gqlen = len;
+ pdm1 = pd;
+ }
+
+ /* See if pd can do me2 too */
+ for (mil = prov_chain->pm_mi_list;
+ mil != NULL; mil = mil->ml_next) {
+ if ((mil->ml_mech_info.cm_func_group_mask &
+ fg2) == 0)
+ continue;
+
+ if ((mil->ml_kcf_mechid == m2id) &&
+ (len < dgqlen)) {
+ /* Bingo! */
+ dgqlen = len;
+ pdm1m2 = pd;
+ *prov_mt2 =
+ mil->ml_mech_info.cm_mech_number;
+ *prov_mt1 = prov_chain->
+ pm_mech_info.cm_mech_number;
+ break;
+ }
+ }
+
+ prov_chain = prov_chain->pm_next;
+ }
+
+ pd = (pdm1m2 != NULL) ? pdm1m2 : pdm1;
+ }
+
+ /* no HW provider for this mech, is there a SW provider? */
+ if (pd == NULL && (mdesc = me->me_sw_prov) != NULL) {
+ pd = mdesc->pm_prov_desc;
+ if (!IS_FG_SUPPORTED(mdesc, fg1) ||
+ !KCF_IS_PROV_USABLE(pd) ||
+ IS_PROVIDER_TRIED(pd, triedl) ||
+ (call_restrict && (pd->pd_flags & KCF_PROV_RESTRICTED)))
+ pd = NULL;
+ else {
+ /* See if pd can do me2 too */
+ for (mil = me->me_sw_prov->pm_mi_list;
+ mil != NULL; mil = mil->ml_next) {
+ if ((mil->ml_mech_info.cm_func_group_mask &
+ fg2) == 0)
+ continue;
+
+ if (mil->ml_kcf_mechid == m2id) {
+ /* Bingo! */
+ *prov_mt2 =
+ mil->ml_mech_info.cm_mech_number;
+ break;
+ }
+ }
+ *prov_mt1 = me->me_sw_prov->pm_mech_info.cm_mech_number;
+ }
+ }
+
+ if (pd == NULL)
+ *error = CRYPTO_MECH_NOT_SUPPORTED;
+ else
+ KCF_PROV_REFHOLD(pd);
+
+ mutex_exit(&me->me_mutex);
+ return (pd);
+}
+
+/*
+ * Do the actual work of calling the provider routines.
+ *
+ * pd - Provider structure
+ * ctx - Context for this operation
+ * params - Parameters for this operation
+ * rhndl - Request handle to use for notification
+ *
+ * The return values are the same as those of the respective SPI.
+ */
+int
+common_submit_request(kcf_provider_desc_t *pd, crypto_ctx_t *ctx,
+ kcf_req_params_t *params, crypto_req_handle_t rhndl)
+{
+ int err = CRYPTO_ARGUMENTS_BAD;
+ kcf_op_type_t optype;
+
+ optype = params->rp_optype;
+
+ switch (params->rp_opgrp) {
+ case KCF_OG_DIGEST: {
+ kcf_digest_ops_params_t *dops = &params->rp_u.digest_params;
+
+ switch (optype) {
+ case KCF_OP_INIT:
+ /*
+ * We should do this only here and not in KCF_WRAP_*
+ * macros. This is because we may want to try other
+			 * providers in order to recover from a failure.
+ */
+ KCF_SET_PROVIDER_MECHNUM(dops->do_framework_mechtype,
+ pd, &dops->do_mech);
+
+ err = KCF_PROV_DIGEST_INIT(pd, ctx, &dops->do_mech,
+ rhndl);
+ break;
+
+ case KCF_OP_SINGLE:
+ err = KCF_PROV_DIGEST(pd, ctx, dops->do_data,
+ dops->do_digest, rhndl);
+ break;
+
+ case KCF_OP_UPDATE:
+ err = KCF_PROV_DIGEST_UPDATE(pd, ctx,
+ dops->do_data, rhndl);
+ break;
+
+ case KCF_OP_FINAL:
+ err = KCF_PROV_DIGEST_FINAL(pd, ctx,
+ dops->do_digest, rhndl);
+ break;
+
+ case KCF_OP_ATOMIC:
+ ASSERT(ctx == NULL);
+ KCF_SET_PROVIDER_MECHNUM(dops->do_framework_mechtype,
+ pd, &dops->do_mech);
+ err = KCF_PROV_DIGEST_ATOMIC(pd, dops->do_sid,
+ &dops->do_mech, dops->do_data, dops->do_digest,
+ rhndl);
+ break;
+
+ case KCF_OP_DIGEST_KEY:
+ err = KCF_PROV_DIGEST_KEY(pd, ctx, dops->do_digest_key,
+ rhndl);
+ break;
+
+ default:
+ break;
+ }
+ break;
+ }
+
+ case KCF_OG_MAC: {
+ kcf_mac_ops_params_t *mops = &params->rp_u.mac_params;
+
+ switch (optype) {
+ case KCF_OP_INIT:
+ KCF_SET_PROVIDER_MECHNUM(mops->mo_framework_mechtype,
+ pd, &mops->mo_mech);
+
+ err = KCF_PROV_MAC_INIT(pd, ctx, &mops->mo_mech,
+ mops->mo_key, mops->mo_templ, rhndl);
+ break;
+
+ case KCF_OP_SINGLE:
+ err = KCF_PROV_MAC(pd, ctx, mops->mo_data,
+ mops->mo_mac, rhndl);
+ break;
+
+ case KCF_OP_UPDATE:
+ err = KCF_PROV_MAC_UPDATE(pd, ctx, mops->mo_data,
+ rhndl);
+ break;
+
+ case KCF_OP_FINAL:
+ err = KCF_PROV_MAC_FINAL(pd, ctx, mops->mo_mac, rhndl);
+ break;
+
+ case KCF_OP_ATOMIC:
+ ASSERT(ctx == NULL);
+ KCF_SET_PROVIDER_MECHNUM(mops->mo_framework_mechtype,
+ pd, &mops->mo_mech);
+
+ err = KCF_PROV_MAC_ATOMIC(pd, mops->mo_sid,
+ &mops->mo_mech, mops->mo_key, mops->mo_data,
+ mops->mo_mac, mops->mo_templ, rhndl);
+ break;
+
+ case KCF_OP_MAC_VERIFY_ATOMIC:
+ ASSERT(ctx == NULL);
+ KCF_SET_PROVIDER_MECHNUM(mops->mo_framework_mechtype,
+ pd, &mops->mo_mech);
+
+ err = KCF_PROV_MAC_VERIFY_ATOMIC(pd, mops->mo_sid,
+ &mops->mo_mech, mops->mo_key, mops->mo_data,
+ mops->mo_mac, mops->mo_templ, rhndl);
+ break;
+
+ default:
+ break;
+ }
+ break;
+ }
+
+ case KCF_OG_ENCRYPT: {
+ kcf_encrypt_ops_params_t *eops = &params->rp_u.encrypt_params;
+
+ switch (optype) {
+ case KCF_OP_INIT:
+ KCF_SET_PROVIDER_MECHNUM(eops->eo_framework_mechtype,
+ pd, &eops->eo_mech);
+
+ err = KCF_PROV_ENCRYPT_INIT(pd, ctx, &eops->eo_mech,
+ eops->eo_key, eops->eo_templ, rhndl);
+ break;
+
+ case KCF_OP_SINGLE:
+ err = KCF_PROV_ENCRYPT(pd, ctx, eops->eo_plaintext,
+ eops->eo_ciphertext, rhndl);
+ break;
+
+ case KCF_OP_UPDATE:
+ err = KCF_PROV_ENCRYPT_UPDATE(pd, ctx,
+ eops->eo_plaintext, eops->eo_ciphertext, rhndl);
+ break;
+
+ case KCF_OP_FINAL:
+ err = KCF_PROV_ENCRYPT_FINAL(pd, ctx,
+ eops->eo_ciphertext, rhndl);
+ break;
+
+ case KCF_OP_ATOMIC:
+ ASSERT(ctx == NULL);
+ KCF_SET_PROVIDER_MECHNUM(eops->eo_framework_mechtype,
+ pd, &eops->eo_mech);
+
+ err = KCF_PROV_ENCRYPT_ATOMIC(pd, eops->eo_sid,
+ &eops->eo_mech, eops->eo_key, eops->eo_plaintext,
+ eops->eo_ciphertext, eops->eo_templ, rhndl);
+ break;
+
+ default:
+ break;
+ }
+ break;
+ }
+
+ case KCF_OG_DECRYPT: {
+ kcf_decrypt_ops_params_t *dcrops = &params->rp_u.decrypt_params;
+
+ switch (optype) {
+ case KCF_OP_INIT:
+ KCF_SET_PROVIDER_MECHNUM(dcrops->dop_framework_mechtype,
+ pd, &dcrops->dop_mech);
+
+ err = KCF_PROV_DECRYPT_INIT(pd, ctx, &dcrops->dop_mech,
+ dcrops->dop_key, dcrops->dop_templ, rhndl);
+ break;
+
+ case KCF_OP_SINGLE:
+ err = KCF_PROV_DECRYPT(pd, ctx, dcrops->dop_ciphertext,
+ dcrops->dop_plaintext, rhndl);
+ break;
+
+ case KCF_OP_UPDATE:
+ err = KCF_PROV_DECRYPT_UPDATE(pd, ctx,
+ dcrops->dop_ciphertext, dcrops->dop_plaintext,
+ rhndl);
+ break;
+
+ case KCF_OP_FINAL:
+ err = KCF_PROV_DECRYPT_FINAL(pd, ctx,
+ dcrops->dop_plaintext, rhndl);
+ break;
+
+ case KCF_OP_ATOMIC:
+ ASSERT(ctx == NULL);
+ KCF_SET_PROVIDER_MECHNUM(dcrops->dop_framework_mechtype,
+ pd, &dcrops->dop_mech);
+
+ err = KCF_PROV_DECRYPT_ATOMIC(pd, dcrops->dop_sid,
+ &dcrops->dop_mech, dcrops->dop_key,
+ dcrops->dop_ciphertext, dcrops->dop_plaintext,
+ dcrops->dop_templ, rhndl);
+ break;
+
+ default:
+ break;
+ }
+ break;
+ }
+
+ case KCF_OG_SIGN: {
+ kcf_sign_ops_params_t *sops = &params->rp_u.sign_params;
+
+ switch (optype) {
+ case KCF_OP_INIT:
+ KCF_SET_PROVIDER_MECHNUM(sops->so_framework_mechtype,
+ pd, &sops->so_mech);
+
+ err = KCF_PROV_SIGN_INIT(pd, ctx, &sops->so_mech,
+ sops->so_key, sops->so_templ, rhndl);
+ break;
+
+ case KCF_OP_SIGN_RECOVER_INIT:
+ KCF_SET_PROVIDER_MECHNUM(sops->so_framework_mechtype,
+ pd, &sops->so_mech);
+
+ err = KCF_PROV_SIGN_RECOVER_INIT(pd, ctx,
+ &sops->so_mech, sops->so_key, sops->so_templ,
+ rhndl);
+ break;
+
+ case KCF_OP_SINGLE:
+ err = KCF_PROV_SIGN(pd, ctx, sops->so_data,
+ sops->so_signature, rhndl);
+ break;
+
+ case KCF_OP_SIGN_RECOVER:
+ err = KCF_PROV_SIGN_RECOVER(pd, ctx,
+ sops->so_data, sops->so_signature, rhndl);
+ break;
+
+ case KCF_OP_UPDATE:
+ err = KCF_PROV_SIGN_UPDATE(pd, ctx, sops->so_data,
+ rhndl);
+ break;
+
+ case KCF_OP_FINAL:
+ err = KCF_PROV_SIGN_FINAL(pd, ctx, sops->so_signature,
+ rhndl);
+ break;
+
+ case KCF_OP_ATOMIC:
+ ASSERT(ctx == NULL);
+ KCF_SET_PROVIDER_MECHNUM(sops->so_framework_mechtype,
+ pd, &sops->so_mech);
+
+ err = KCF_PROV_SIGN_ATOMIC(pd, sops->so_sid,
+ &sops->so_mech, sops->so_key, sops->so_data,
+ sops->so_templ, sops->so_signature, rhndl);
+ break;
+
+ case KCF_OP_SIGN_RECOVER_ATOMIC:
+ ASSERT(ctx == NULL);
+ KCF_SET_PROVIDER_MECHNUM(sops->so_framework_mechtype,
+ pd, &sops->so_mech);
+
+ err = KCF_PROV_SIGN_RECOVER_ATOMIC(pd, sops->so_sid,
+ &sops->so_mech, sops->so_key, sops->so_data,
+ sops->so_templ, sops->so_signature, rhndl);
+ break;
+
+ default:
+ break;
+ }
+ break;
+ }
+
+ case KCF_OG_VERIFY: {
+ kcf_verify_ops_params_t *vops = &params->rp_u.verify_params;
+
+ switch (optype) {
+ case KCF_OP_INIT:
+ KCF_SET_PROVIDER_MECHNUM(vops->vo_framework_mechtype,
+ pd, &vops->vo_mech);
+
+ err = KCF_PROV_VERIFY_INIT(pd, ctx, &vops->vo_mech,
+ vops->vo_key, vops->vo_templ, rhndl);
+ break;
+
+ case KCF_OP_VERIFY_RECOVER_INIT:
+ KCF_SET_PROVIDER_MECHNUM(vops->vo_framework_mechtype,
+ pd, &vops->vo_mech);
+
+ err = KCF_PROV_VERIFY_RECOVER_INIT(pd, ctx,
+ &vops->vo_mech, vops->vo_key, vops->vo_templ,
+ rhndl);
+ break;
+
+ case KCF_OP_SINGLE:
+ err = KCF_PROV_VERIFY(pd, ctx, vops->vo_data,
+ vops->vo_signature, rhndl);
+ break;
+
+ case KCF_OP_VERIFY_RECOVER:
+ err = KCF_PROV_VERIFY_RECOVER(pd, ctx,
+ vops->vo_signature, vops->vo_data, rhndl);
+ break;
+
+ case KCF_OP_UPDATE:
+ err = KCF_PROV_VERIFY_UPDATE(pd, ctx, vops->vo_data,
+ rhndl);
+ break;
+
+ case KCF_OP_FINAL:
+ err = KCF_PROV_VERIFY_FINAL(pd, ctx, vops->vo_signature,
+ rhndl);
+ break;
+
+ case KCF_OP_ATOMIC:
+ ASSERT(ctx == NULL);
+ KCF_SET_PROVIDER_MECHNUM(vops->vo_framework_mechtype,
+ pd, &vops->vo_mech);
+
+ err = KCF_PROV_VERIFY_ATOMIC(pd, vops->vo_sid,
+ &vops->vo_mech, vops->vo_key, vops->vo_data,
+ vops->vo_templ, vops->vo_signature, rhndl);
+ break;
+
+ case KCF_OP_VERIFY_RECOVER_ATOMIC:
+ ASSERT(ctx == NULL);
+ KCF_SET_PROVIDER_MECHNUM(vops->vo_framework_mechtype,
+ pd, &vops->vo_mech);
+
+ err = KCF_PROV_VERIFY_RECOVER_ATOMIC(pd, vops->vo_sid,
+ &vops->vo_mech, vops->vo_key, vops->vo_signature,
+ vops->vo_templ, vops->vo_data, rhndl);
+ break;
+
+ default:
+ break;
+ }
+ break;
+ }
+
+ case KCF_OG_ENCRYPT_MAC: {
+ kcf_encrypt_mac_ops_params_t *eops =
+ &params->rp_u.encrypt_mac_params;
+ kcf_context_t *kcf_secondctx;
+
+ switch (optype) {
+ case KCF_OP_INIT:
+ kcf_secondctx = ((kcf_context_t *)
+ (ctx->cc_framework_private))->kc_secondctx;
+
+ if (kcf_secondctx != NULL) {
+ err = kcf_emulate_dual(pd, ctx, params);
+ break;
+ }
+ KCF_SET_PROVIDER_MECHNUM(
+ eops->em_framework_encr_mechtype,
+ pd, &eops->em_encr_mech);
+
+ KCF_SET_PROVIDER_MECHNUM(
+ eops->em_framework_mac_mechtype,
+ pd, &eops->em_mac_mech);
+
+ err = KCF_PROV_ENCRYPT_MAC_INIT(pd, ctx,
+ &eops->em_encr_mech, eops->em_encr_key,
+ &eops->em_mac_mech, eops->em_mac_key,
+ eops->em_encr_templ, eops->em_mac_templ,
+ rhndl);
+
+ break;
+
+ case KCF_OP_SINGLE:
+ err = KCF_PROV_ENCRYPT_MAC(pd, ctx,
+ eops->em_plaintext, eops->em_ciphertext,
+ eops->em_mac, rhndl);
+ break;
+
+ case KCF_OP_UPDATE:
+ kcf_secondctx = ((kcf_context_t *)
+ (ctx->cc_framework_private))->kc_secondctx;
+ if (kcf_secondctx != NULL) {
+ err = kcf_emulate_dual(pd, ctx, params);
+ break;
+ }
+ err = KCF_PROV_ENCRYPT_MAC_UPDATE(pd, ctx,
+ eops->em_plaintext, eops->em_ciphertext, rhndl);
+ break;
+
+ case KCF_OP_FINAL:
+ kcf_secondctx = ((kcf_context_t *)
+ (ctx->cc_framework_private))->kc_secondctx;
+ if (kcf_secondctx != NULL) {
+ err = kcf_emulate_dual(pd, ctx, params);
+ break;
+ }
+ err = KCF_PROV_ENCRYPT_MAC_FINAL(pd, ctx,
+ eops->em_ciphertext, eops->em_mac, rhndl);
+ break;
+
+ case KCF_OP_ATOMIC:
+ ASSERT(ctx == NULL);
+
+ KCF_SET_PROVIDER_MECHNUM(
+ eops->em_framework_encr_mechtype,
+ pd, &eops->em_encr_mech);
+
+ KCF_SET_PROVIDER_MECHNUM(
+ eops->em_framework_mac_mechtype,
+ pd, &eops->em_mac_mech);
+
+ err = KCF_PROV_ENCRYPT_MAC_ATOMIC(pd, eops->em_sid,
+ &eops->em_encr_mech, eops->em_encr_key,
+ &eops->em_mac_mech, eops->em_mac_key,
+ eops->em_plaintext, eops->em_ciphertext,
+ eops->em_mac,
+ eops->em_encr_templ, eops->em_mac_templ,
+ rhndl);
+
+ break;
+
+ default:
+ break;
+ }
+ break;
+ }
+
+ case KCF_OG_MAC_DECRYPT: {
+ kcf_mac_decrypt_ops_params_t *dops =
+ &params->rp_u.mac_decrypt_params;
+ kcf_context_t *kcf_secondctx;
+
+ switch (optype) {
+ case KCF_OP_INIT:
+ kcf_secondctx = ((kcf_context_t *)
+ (ctx->cc_framework_private))->kc_secondctx;
+
+ if (kcf_secondctx != NULL) {
+ err = kcf_emulate_dual(pd, ctx, params);
+ break;
+ }
+ KCF_SET_PROVIDER_MECHNUM(
+ dops->md_framework_mac_mechtype,
+ pd, &dops->md_mac_mech);
+
+ KCF_SET_PROVIDER_MECHNUM(
+ dops->md_framework_decr_mechtype,
+ pd, &dops->md_decr_mech);
+
+ err = KCF_PROV_MAC_DECRYPT_INIT(pd, ctx,
+ &dops->md_mac_mech, dops->md_mac_key,
+ &dops->md_decr_mech, dops->md_decr_key,
+ dops->md_mac_templ, dops->md_decr_templ,
+ rhndl);
+
+ break;
+
+ case KCF_OP_SINGLE:
+ err = KCF_PROV_MAC_DECRYPT(pd, ctx,
+ dops->md_ciphertext, dops->md_mac,
+ dops->md_plaintext, rhndl);
+ break;
+
+ case KCF_OP_UPDATE:
+ kcf_secondctx = ((kcf_context_t *)
+ (ctx->cc_framework_private))->kc_secondctx;
+ if (kcf_secondctx != NULL) {
+ err = kcf_emulate_dual(pd, ctx, params);
+ break;
+ }
+ err = KCF_PROV_MAC_DECRYPT_UPDATE(pd, ctx,
+ dops->md_ciphertext, dops->md_plaintext, rhndl);
+ break;
+
+ case KCF_OP_FINAL:
+ kcf_secondctx = ((kcf_context_t *)
+ (ctx->cc_framework_private))->kc_secondctx;
+ if (kcf_secondctx != NULL) {
+ err = kcf_emulate_dual(pd, ctx, params);
+ break;
+ }
+ err = KCF_PROV_MAC_DECRYPT_FINAL(pd, ctx,
+ dops->md_mac, dops->md_plaintext, rhndl);
+ break;
+
+ case KCF_OP_ATOMIC:
+ ASSERT(ctx == NULL);
+
+ KCF_SET_PROVIDER_MECHNUM(
+ dops->md_framework_mac_mechtype,
+ pd, &dops->md_mac_mech);
+
+ KCF_SET_PROVIDER_MECHNUM(
+ dops->md_framework_decr_mechtype,
+ pd, &dops->md_decr_mech);
+
+ err = KCF_PROV_MAC_DECRYPT_ATOMIC(pd, dops->md_sid,
+ &dops->md_mac_mech, dops->md_mac_key,
+ &dops->md_decr_mech, dops->md_decr_key,
+ dops->md_ciphertext, dops->md_mac,
+ dops->md_plaintext,
+ dops->md_mac_templ, dops->md_decr_templ,
+ rhndl);
+
+ break;
+
+ case KCF_OP_MAC_VERIFY_DECRYPT_ATOMIC:
+ ASSERT(ctx == NULL);
+
+ KCF_SET_PROVIDER_MECHNUM(
+ dops->md_framework_mac_mechtype,
+ pd, &dops->md_mac_mech);
+
+ KCF_SET_PROVIDER_MECHNUM(
+ dops->md_framework_decr_mechtype,
+ pd, &dops->md_decr_mech);
+
+ err = KCF_PROV_MAC_VERIFY_DECRYPT_ATOMIC(pd,
+ dops->md_sid, &dops->md_mac_mech, dops->md_mac_key,
+ &dops->md_decr_mech, dops->md_decr_key,
+ dops->md_ciphertext, dops->md_mac,
+ dops->md_plaintext,
+ dops->md_mac_templ, dops->md_decr_templ,
+ rhndl);
+
+ break;
+
+ default:
+ break;
+ }
+ break;
+ }
+
+ case KCF_OG_KEY: {
+ kcf_key_ops_params_t *kops = &params->rp_u.key_params;
+
+ ASSERT(ctx == NULL);
+ KCF_SET_PROVIDER_MECHNUM(kops->ko_framework_mechtype, pd,
+ &kops->ko_mech);
+
+ switch (optype) {
+ case KCF_OP_KEY_GENERATE:
+ err = KCF_PROV_KEY_GENERATE(pd, kops->ko_sid,
+ &kops->ko_mech,
+ kops->ko_key_template, kops->ko_key_attribute_count,
+ kops->ko_key_object_id_ptr, rhndl);
+ break;
+
+ case KCF_OP_KEY_GENERATE_PAIR:
+ err = KCF_PROV_KEY_GENERATE_PAIR(pd, kops->ko_sid,
+ &kops->ko_mech,
+ kops->ko_key_template, kops->ko_key_attribute_count,
+ kops->ko_private_key_template,
+ kops->ko_private_key_attribute_count,
+ kops->ko_key_object_id_ptr,
+ kops->ko_private_key_object_id_ptr, rhndl);
+ break;
+
+ case KCF_OP_KEY_WRAP:
+ err = KCF_PROV_KEY_WRAP(pd, kops->ko_sid,
+ &kops->ko_mech,
+ kops->ko_key, kops->ko_key_object_id_ptr,
+ kops->ko_wrapped_key, kops->ko_wrapped_key_len_ptr,
+ rhndl);
+ break;
+
+ case KCF_OP_KEY_UNWRAP:
+ err = KCF_PROV_KEY_UNWRAP(pd, kops->ko_sid,
+ &kops->ko_mech,
+ kops->ko_key, kops->ko_wrapped_key,
+ kops->ko_wrapped_key_len_ptr,
+ kops->ko_key_template, kops->ko_key_attribute_count,
+ kops->ko_key_object_id_ptr, rhndl);
+ break;
+
+ case KCF_OP_KEY_DERIVE:
+ err = KCF_PROV_KEY_DERIVE(pd, kops->ko_sid,
+ &kops->ko_mech,
+ kops->ko_key, kops->ko_key_template,
+ kops->ko_key_attribute_count,
+ kops->ko_key_object_id_ptr, rhndl);
+ break;
+
+ default:
+ break;
+ }
+ break;
+ }
+
+ case KCF_OG_RANDOM: {
+ kcf_random_number_ops_params_t *rops =
+ &params->rp_u.random_number_params;
+
+ ASSERT(ctx == NULL);
+
+ switch (optype) {
+ case KCF_OP_RANDOM_SEED:
+ err = KCF_PROV_SEED_RANDOM(pd, rops->rn_sid,
+ rops->rn_buf, rops->rn_buflen, rops->rn_entropy_est,
+ rops->rn_flags, rhndl);
+ break;
+
+ case KCF_OP_RANDOM_GENERATE:
+ err = KCF_PROV_GENERATE_RANDOM(pd, rops->rn_sid,
+ rops->rn_buf, rops->rn_buflen, rhndl);
+ break;
+
+ default:
+ break;
+ }
+ break;
+ }
+
+ case KCF_OG_SESSION: {
+ kcf_session_ops_params_t *sops = &params->rp_u.session_params;
+
+ ASSERT(ctx == NULL);
+ switch (optype) {
+ case KCF_OP_SESSION_OPEN:
+ /*
+ * so_pd may be a logical provider, in which case
+ * we need to check whether it has been removed.
+ */
+ if (KCF_IS_PROV_REMOVED(sops->so_pd)) {
+ err = CRYPTO_DEVICE_ERROR;
+ break;
+ }
+ err = KCF_PROV_SESSION_OPEN(pd, sops->so_sid_ptr,
+ rhndl, sops->so_pd);
+ break;
+
+ case KCF_OP_SESSION_CLOSE:
+ /*
+ * so_pd may be a logical provider, in which case
+ * we need to check whether it has been removed.
+ */
+ if (KCF_IS_PROV_REMOVED(sops->so_pd)) {
+ err = CRYPTO_DEVICE_ERROR;
+ break;
+ }
+ err = KCF_PROV_SESSION_CLOSE(pd, sops->so_sid,
+ rhndl, sops->so_pd);
+ break;
+
+ case KCF_OP_SESSION_LOGIN:
+ err = KCF_PROV_SESSION_LOGIN(pd, sops->so_sid,
+ sops->so_user_type, sops->so_pin,
+ sops->so_pin_len, rhndl);
+ break;
+
+ case KCF_OP_SESSION_LOGOUT:
+ err = KCF_PROV_SESSION_LOGOUT(pd, sops->so_sid, rhndl);
+ break;
+
+ default:
+ break;
+ }
+ break;
+ }
+
+ case KCF_OG_OBJECT: {
+ kcf_object_ops_params_t *jops = &params->rp_u.object_params;
+
+ ASSERT(ctx == NULL);
+ switch (optype) {
+ case KCF_OP_OBJECT_CREATE:
+ err = KCF_PROV_OBJECT_CREATE(pd, jops->oo_sid,
+ jops->oo_template, jops->oo_attribute_count,
+ jops->oo_object_id_ptr, rhndl);
+ break;
+
+ case KCF_OP_OBJECT_COPY:
+ err = KCF_PROV_OBJECT_COPY(pd, jops->oo_sid,
+ jops->oo_object_id,
+ jops->oo_template, jops->oo_attribute_count,
+ jops->oo_object_id_ptr, rhndl);
+ break;
+
+ case KCF_OP_OBJECT_DESTROY:
+ err = KCF_PROV_OBJECT_DESTROY(pd, jops->oo_sid,
+ jops->oo_object_id, rhndl);
+ break;
+
+ case KCF_OP_OBJECT_GET_SIZE:
+ err = KCF_PROV_OBJECT_GET_SIZE(pd, jops->oo_sid,
+ jops->oo_object_id, jops->oo_object_size, rhndl);
+ break;
+
+ case KCF_OP_OBJECT_GET_ATTRIBUTE_VALUE:
+ err = KCF_PROV_OBJECT_GET_ATTRIBUTE_VALUE(pd,
+ jops->oo_sid, jops->oo_object_id,
+ jops->oo_template, jops->oo_attribute_count, rhndl);
+ break;
+
+ case KCF_OP_OBJECT_SET_ATTRIBUTE_VALUE:
+ err = KCF_PROV_OBJECT_SET_ATTRIBUTE_VALUE(pd,
+ jops->oo_sid, jops->oo_object_id,
+ jops->oo_template, jops->oo_attribute_count, rhndl);
+ break;
+
+ case KCF_OP_OBJECT_FIND_INIT:
+ err = KCF_PROV_OBJECT_FIND_INIT(pd, jops->oo_sid,
+ jops->oo_template, jops->oo_attribute_count,
+ jops->oo_find_init_pp_ptr, rhndl);
+ break;
+
+ case KCF_OP_OBJECT_FIND:
+ err = KCF_PROV_OBJECT_FIND(pd, jops->oo_find_pp,
+ jops->oo_object_id_ptr, jops->oo_max_object_count,
+ jops->oo_object_count_ptr, rhndl);
+ break;
+
+ case KCF_OP_OBJECT_FIND_FINAL:
+ err = KCF_PROV_OBJECT_FIND_FINAL(pd, jops->oo_find_pp,
+ rhndl);
+ break;
+
+ default:
+ break;
+ }
+ break;
+ }
+
+ case KCF_OG_PROVMGMT: {
+ kcf_provmgmt_ops_params_t *pops = &params->rp_u.provmgmt_params;
+
+ ASSERT(ctx == NULL);
+ switch (optype) {
+ case KCF_OP_MGMT_EXTINFO:
+ /*
+ * po_pd may be a logical provider, in which case
+ * we need to check whether it has been removed.
+ */
+ if (KCF_IS_PROV_REMOVED(pops->po_pd)) {
+ err = CRYPTO_DEVICE_ERROR;
+ break;
+ }
+ err = KCF_PROV_EXT_INFO(pd, pops->po_ext_info, rhndl,
+ pops->po_pd);
+ break;
+
+ case KCF_OP_MGMT_INITTOKEN:
+ err = KCF_PROV_INIT_TOKEN(pd, pops->po_pin,
+ pops->po_pin_len, pops->po_label, rhndl);
+ break;
+
+ case KCF_OP_MGMT_INITPIN:
+ err = KCF_PROV_INIT_PIN(pd, pops->po_sid, pops->po_pin,
+ pops->po_pin_len, rhndl);
+ break;
+
+ case KCF_OP_MGMT_SETPIN:
+ err = KCF_PROV_SET_PIN(pd, pops->po_sid,
+ pops->po_old_pin, pops->po_old_pin_len,
+ pops->po_pin, pops->po_pin_len, rhndl);
+ break;
+
+ default:
+ break;
+ }
+ break;
+ }
+
+ case KCF_OG_NOSTORE_KEY: {
+ kcf_key_ops_params_t *kops = &params->rp_u.key_params;
+
+ ASSERT(ctx == NULL);
+ KCF_SET_PROVIDER_MECHNUM(kops->ko_framework_mechtype, pd,
+ &kops->ko_mech);
+
+ switch (optype) {
+ case KCF_OP_KEY_GENERATE:
+ err = KCF_PROV_NOSTORE_KEY_GENERATE(pd, kops->ko_sid,
+ &kops->ko_mech, kops->ko_key_template,
+ kops->ko_key_attribute_count,
+ kops->ko_out_template1,
+ kops->ko_out_attribute_count1, rhndl);
+ break;
+
+ case KCF_OP_KEY_GENERATE_PAIR:
+ err = KCF_PROV_NOSTORE_KEY_GENERATE_PAIR(pd,
+ kops->ko_sid, &kops->ko_mech,
+ kops->ko_key_template, kops->ko_key_attribute_count,
+ kops->ko_private_key_template,
+ kops->ko_private_key_attribute_count,
+ kops->ko_out_template1,
+ kops->ko_out_attribute_count1,
+ kops->ko_out_template2,
+ kops->ko_out_attribute_count2,
+ rhndl);
+ break;
+
+ case KCF_OP_KEY_DERIVE:
+ err = KCF_PROV_NOSTORE_KEY_DERIVE(pd, kops->ko_sid,
+ &kops->ko_mech, kops->ko_key,
+ kops->ko_key_template,
+ kops->ko_key_attribute_count,
+ kops->ko_out_template1,
+ kops->ko_out_attribute_count1, rhndl);
+ break;
+
+ default:
+ break;
+ }
+ break;
+ }
+ default:
+ break;
+ } /* end of switch(params->rp_opgrp) */
+
+ KCF_PROV_INCRSTATS(pd, err);
+ return (err);
+}
+
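+/*
+ * Illustrative sketch only (not part of the imported source): how a request
+ * reaches the dispatcher above. A caller wraps its arguments with one of the
+ * KCF_WRAP_*_OPS_PARAMS macros and hands them to kcf_submit_request(); for a
+ * KCF_OP_ATOMIC encrypt, the KCF_OG_ENCRYPT arm above then invokes
+ * KCF_PROV_ENCRYPT_ATOMIC(). The variables below are placeholders.
+ *
+ *	kcf_req_params_t params;
+ *
+ *	KCF_WRAP_ENCRYPT_OPS_PARAMS(&params, KCF_OP_ATOMIC, pd->pd_sid,
+ *	    &mech, key, plaintext, ciphertext, templ);
+ *	error = kcf_submit_request(pd, NULL, NULL, &params, B_FALSE);
+ */
+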
+
+/*
+ * Emulate a multipart dual operation with two single-operation steps.
+ * This routine is always called in the context of a worker thread
+ * running kcf_svc_do_run().
+ * The single steps are submitted synchronously (blocking).
+ * When this routine returns, kcf_svc_do_run() will call kcf_aop_done()
+ * so the originating consumer's callback gets invoked. kcf_aop_done()
+ * takes care of freeing the operation context. So, this routine does
+ * not free the operation context.
+ *
+ * The provider descriptor is assumed to be held by the caller.
+ */
+static int
+kcf_emulate_dual(kcf_provider_desc_t *pd, crypto_ctx_t *ctx,
+ kcf_req_params_t *params)
+{
+ int err = CRYPTO_ARGUMENTS_BAD;
+ kcf_op_type_t optype;
+ size_t save_len;
+ off_t save_offset;
+
+ optype = params->rp_optype;
+
+ switch (params->rp_opgrp) {
+ case KCF_OG_ENCRYPT_MAC: {
+ kcf_encrypt_mac_ops_params_t *cmops =
+ &params->rp_u.encrypt_mac_params;
+ kcf_context_t *encr_kcf_ctx;
+ crypto_ctx_t *mac_ctx;
+ kcf_req_params_t encr_params;
+
+ encr_kcf_ctx = (kcf_context_t *)(ctx->cc_framework_private);
+
+ switch (optype) {
+ case KCF_OP_INIT: {
+ encr_kcf_ctx->kc_secondctx = NULL;
+
+ KCF_WRAP_ENCRYPT_OPS_PARAMS(&encr_params, KCF_OP_INIT,
+ pd->pd_sid, &cmops->em_encr_mech,
+ cmops->em_encr_key, NULL, NULL,
+ cmops->em_encr_templ);
+
+ err = kcf_submit_request(pd, ctx, NULL, &encr_params,
+ B_FALSE);
+
+ /* It can't be CRYPTO_QUEUED */
+ if (err != CRYPTO_SUCCESS) {
+ break;
+ }
+
+ err = crypto_mac_init(&cmops->em_mac_mech,
+ cmops->em_mac_key, cmops->em_mac_templ,
+ (crypto_context_t *)&mac_ctx, NULL);
+
+ if (err == CRYPTO_SUCCESS) {
+ encr_kcf_ctx->kc_secondctx = (kcf_context_t *)
+ mac_ctx->cc_framework_private;
+ KCF_CONTEXT_REFHOLD((kcf_context_t *)
+ mac_ctx->cc_framework_private);
+ }
+
+ break;
+
+ }
+ case KCF_OP_UPDATE: {
+ crypto_dual_data_t *ct = cmops->em_ciphertext;
+ crypto_data_t *pt = cmops->em_plaintext;
+ kcf_context_t *mac_kcf_ctx = encr_kcf_ctx->kc_secondctx;
+ crypto_ctx_t *mac_ctx = &mac_kcf_ctx->kc_glbl_ctx;
+
+ KCF_WRAP_ENCRYPT_OPS_PARAMS(&encr_params, KCF_OP_UPDATE,
+ pd->pd_sid, NULL, NULL, pt, (crypto_data_t *)ct,
+ NULL);
+
+ err = kcf_submit_request(pd, ctx, NULL, &encr_params,
+ B_FALSE);
+
+ /* It can't be CRYPTO_QUEUED */
+ if (err != CRYPTO_SUCCESS) {
+ break;
+ }
+
+ save_offset = ct->dd_offset1;
+ save_len = ct->dd_len1;
+ if (ct->dd_len2 == 0) {
+ /*
+ * The previous encrypt step was an
+ * accumulation only and didn't produce any
+ * partial output
+ */
+ if (ct->dd_len1 == 0)
+ break;
+
+ } else {
+ ct->dd_offset1 = ct->dd_offset2;
+ ct->dd_len1 = ct->dd_len2;
+ }
+ err = crypto_mac_update((crypto_context_t)mac_ctx,
+ (crypto_data_t *)ct, NULL);
+
+ ct->dd_offset1 = save_offset;
+ ct->dd_len1 = save_len;
+
+ break;
+ }
+ case KCF_OP_FINAL: {
+ crypto_dual_data_t *ct = cmops->em_ciphertext;
+ crypto_data_t *mac = cmops->em_mac;
+ kcf_context_t *mac_kcf_ctx = encr_kcf_ctx->kc_secondctx;
+ crypto_ctx_t *mac_ctx = &mac_kcf_ctx->kc_glbl_ctx;
+ crypto_context_t mac_context = mac_ctx;
+
+ KCF_WRAP_ENCRYPT_OPS_PARAMS(&encr_params, KCF_OP_FINAL,
+ pd->pd_sid, NULL, NULL, NULL, (crypto_data_t *)ct,
+ NULL);
+
+ err = kcf_submit_request(pd, ctx, NULL, &encr_params,
+ B_FALSE);
+
+ /* It can't be CRYPTO_QUEUED */
+ if (err != CRYPTO_SUCCESS) {
+ crypto_cancel_ctx(mac_context);
+ break;
+ }
+
+ if (ct->dd_len2 > 0) {
+ save_offset = ct->dd_offset1;
+ save_len = ct->dd_len1;
+ ct->dd_offset1 = ct->dd_offset2;
+ ct->dd_len1 = ct->dd_len2;
+
+ err = crypto_mac_update(mac_context,
+ (crypto_data_t *)ct, NULL);
+
+ ct->dd_offset1 = save_offset;
+ ct->dd_len1 = save_len;
+
+ if (err != CRYPTO_SUCCESS) {
+ crypto_cancel_ctx(mac_context);
+ return (err);
+ }
+ }
+
+ /* and finally, collect the MAC */
+ err = crypto_mac_final(mac_context, mac, NULL);
+ break;
+ }
+
+ default:
+ break;
+ }
+ KCF_PROV_INCRSTATS(pd, err);
+ break;
+ }
+ case KCF_OG_MAC_DECRYPT: {
+ kcf_mac_decrypt_ops_params_t *mdops =
+ &params->rp_u.mac_decrypt_params;
+ kcf_context_t *decr_kcf_ctx;
+ crypto_ctx_t *mac_ctx;
+ kcf_req_params_t decr_params;
+
+ decr_kcf_ctx = (kcf_context_t *)(ctx->cc_framework_private);
+
+ switch (optype) {
+ case KCF_OP_INIT: {
+ decr_kcf_ctx->kc_secondctx = NULL;
+
+ err = crypto_mac_init(&mdops->md_mac_mech,
+ mdops->md_mac_key, mdops->md_mac_templ,
+ (crypto_context_t *)&mac_ctx, NULL);
+
+ /* It can't be CRYPTO_QUEUED */
+ if (err != CRYPTO_SUCCESS) {
+ break;
+ }
+
+ KCF_WRAP_DECRYPT_OPS_PARAMS(&decr_params, KCF_OP_INIT,
+ pd->pd_sid, &mdops->md_decr_mech,
+ mdops->md_decr_key, NULL, NULL,
+ mdops->md_decr_templ);
+
+ err = kcf_submit_request(pd, ctx, NULL, &decr_params,
+ B_FALSE);
+
+ /* It can't be CRYPTO_QUEUED */
+ if (err != CRYPTO_SUCCESS) {
+ crypto_cancel_ctx((crypto_context_t)mac_ctx);
+ break;
+ }
+
+ decr_kcf_ctx->kc_secondctx = (kcf_context_t *)
+ mac_ctx->cc_framework_private;
+ KCF_CONTEXT_REFHOLD((kcf_context_t *)
+ mac_ctx->cc_framework_private);
+
+			break;
+		}
+ case KCF_OP_UPDATE: {
+ crypto_dual_data_t *ct = mdops->md_ciphertext;
+ crypto_data_t *pt = mdops->md_plaintext;
+ kcf_context_t *mac_kcf_ctx = decr_kcf_ctx->kc_secondctx;
+ crypto_ctx_t *mac_ctx = &mac_kcf_ctx->kc_glbl_ctx;
+
+ err = crypto_mac_update((crypto_context_t)mac_ctx,
+ (crypto_data_t *)ct, NULL);
+
+ if (err != CRYPTO_SUCCESS)
+ break;
+
+ save_offset = ct->dd_offset1;
+ save_len = ct->dd_len1;
+
+ /* zero ct->dd_len2 means decrypt everything */
+ if (ct->dd_len2 > 0) {
+ ct->dd_offset1 = ct->dd_offset2;
+ ct->dd_len1 = ct->dd_len2;
+ }
+
+ err = crypto_decrypt_update((crypto_context_t)ctx,
+ (crypto_data_t *)ct, pt, NULL);
+
+ ct->dd_offset1 = save_offset;
+ ct->dd_len1 = save_len;
+
+ break;
+ }
+ case KCF_OP_FINAL: {
+ crypto_data_t *pt = mdops->md_plaintext;
+ crypto_data_t *mac = mdops->md_mac;
+ kcf_context_t *mac_kcf_ctx = decr_kcf_ctx->kc_secondctx;
+ crypto_ctx_t *mac_ctx = &mac_kcf_ctx->kc_glbl_ctx;
+
+ err = crypto_mac_final((crypto_context_t)mac_ctx,
+ mac, NULL);
+
+ if (err != CRYPTO_SUCCESS) {
+ crypto_cancel_ctx(ctx);
+ break;
+ }
+
+ /* Get the last chunk of plaintext */
+ KCF_CONTEXT_REFHOLD(decr_kcf_ctx);
+ err = crypto_decrypt_final((crypto_context_t)ctx, pt,
+ NULL);
+
+ break;
+ }
+		default:
+			break;
+		}
+ break;
+ }
+ default:
+
+ break;
+ } /* end of switch(params->rp_opgrp) */
+
+ return (err);
+}
diff --git a/sys/contrib/openzfs/module/icp/core/kcf_mech_tabs.c b/sys/contrib/openzfs/module/icp/core/kcf_mech_tabs.c
new file mode 100644
index 000000000000..2642b317d698
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/core/kcf_mech_tabs.c
@@ -0,0 +1,791 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/crypto/common.h>
+#include <sys/crypto/api.h>
+#include <sys/crypto/impl.h>
+#include <sys/modhash.h>
+
+/* Cryptographic mechanisms tables and their access functions */
+
+/*
+ * Internal numbers assigned to mechanisms are coded as follows:
+ *
+ * +----------------+----------------+
+ * | mech. class | mech. index |
+ * <--- 32-bits --->+<--- 32-bits --->
+ *
+ * the mech_class identifies the table the mechanism belongs to.
+ * mech_index is the index for that mechanism in the table.
+ * A mechanism belongs to exactly 1 table.
+ * The tables are:
+ * . digest_mechs_tab[] for the msg digest mechs.
+ * . cipher_mechs_tab[] for encrypt/decrypt and wrap/unwrap mechs.
+ * . mac_mechs_tab[] for MAC mechs.
+ * . sign_mechs_tab[] for sign & verify mechs.
+ * . keyops_mechs_tab[] for key/key pair generation, and key derivation.
+ * . misc_mechs_tab[] for mechs that don't belong to any of the above.
+ *
+ * There are no holes in the tables.
+ */
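+
+/*
+ * Illustrative sketch only (not part of the imported source): encoding and
+ * decoding macros consistent with the layout described above. The actual
+ * definitions live in sys/crypto/impl.h and may differ in detail.
+ *
+ *	#define	KCF_MECHID(class, index)				\
+ *		(((crypto_mech_type_t)(class) << 32) | (uint32_t)(index))
+ *	#define	KCF_MECH2CLASS(mech_type)				\
+ *		((kcf_ops_class_t)((mech_type) >> 32))
+ *	#define	KCF_MECH2INDEX(mech_type)				\
+ *		((int)((mech_type) & 0xFFFFFFFF))
+ *
+ * For example, KCF_MECHID(KCF_CIPHER_CLASS, 6) names the entry
+ * kcf_cipher_mechs_tab[6], which kcf_init_mech_tabs() below initializes
+ * to SUN_CKM_AES_CBC.
+ */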
+
+/*
+ * Locking conventions:
+ * --------------------
+ * A global mutex, kcf_mech_tabs_lock, serializes writes to the
+ * mechanism table via kcf_create_mech_entry().
+ *
+ * A mutex is associated with every entry of the tables.
+ * The mutex is acquired whenever the entry is accessed for
+ * 1) retrieving the mech_id (comparing the mech name)
+ * 2) finding a provider for an xxx_init() or atomic operation.
+ * 3) altering the mech entry to add or remove a provider.
+ *
+ * In 2), after a provider is chosen, its prov_desc is held and the
+ * entry's mutex must be dropped. The provider's working function (SPI) is
+ * called outside the mech_entry's mutex.
+ *
+ * The number of providers for a particular mechanism is not expected to be
+ * large enough to justify the cost of using rwlocks, so the per-mechanism
+ * entry mutex won't be very *hot*.
+ *
+ * When both kcf_mech_tabs_lock and a mech_entry mutex need to be held,
+ * kcf_mech_tabs_lock must always be acquired first.
+ *
+ */
+
+ /* Mechanisms tables */
+
+
+/* RFE 4687834 Will deal with the extensibility of these tables later */
+
+kcf_mech_entry_t kcf_digest_mechs_tab[KCF_MAXDIGEST];
+kcf_mech_entry_t kcf_cipher_mechs_tab[KCF_MAXCIPHER];
+kcf_mech_entry_t kcf_mac_mechs_tab[KCF_MAXMAC];
+kcf_mech_entry_t kcf_sign_mechs_tab[KCF_MAXSIGN];
+kcf_mech_entry_t kcf_keyops_mechs_tab[KCF_MAXKEYOPS];
+kcf_mech_entry_t kcf_misc_mechs_tab[KCF_MAXMISC];
+
+kcf_mech_entry_tab_t kcf_mech_tabs_tab[KCF_LAST_OPSCLASS + 1] = {
+ {0, NULL}, /* No class zero */
+ {KCF_MAXDIGEST, kcf_digest_mechs_tab},
+ {KCF_MAXCIPHER, kcf_cipher_mechs_tab},
+ {KCF_MAXMAC, kcf_mac_mechs_tab},
+ {KCF_MAXSIGN, kcf_sign_mechs_tab},
+ {KCF_MAXKEYOPS, kcf_keyops_mechs_tab},
+ {KCF_MAXMISC, kcf_misc_mechs_tab}
+};
+
+/*
+ * Per-algorithm internal thresholds for the minimum input size before
+ * offloading to a hardware provider.
+ * Dispatching a crypto operation to a hardware provider entails paying the
+ * cost of an additional context switch. Measurements with the Sun Accelerator
+ * 4000 show that jobs of 512 bytes or smaller are better handled in software.
+ * There is room for refinement here.
+ *
+ */
+int kcf_md5_threshold = 512;
+int kcf_sha1_threshold = 512;
+int kcf_des_threshold = 512;
+int kcf_des3_threshold = 512;
+int kcf_aes_threshold = 512;
+int kcf_bf_threshold = 512;
+int kcf_rc4_threshold = 512;
+
+kmutex_t kcf_mech_tabs_lock;
+static uint32_t kcf_gen_swprov = 0;
+
+int kcf_mech_hash_size = 256;
+mod_hash_t *kcf_mech_hash; /* mech name to id hash */
+
+static crypto_mech_type_t
+kcf_mech_hash_find(char *mechname)
+{
+ mod_hash_val_t hv;
+ crypto_mech_type_t mt;
+
+ mt = CRYPTO_MECH_INVALID;
+ if (mod_hash_find(kcf_mech_hash, (mod_hash_key_t)mechname, &hv) == 0) {
+ mt = *(crypto_mech_type_t *)hv;
+ ASSERT(mt != CRYPTO_MECH_INVALID);
+ }
+
+ return (mt);
+}
+
+void
+kcf_destroy_mech_tabs(void)
+{
+ int i, max;
+ kcf_ops_class_t class;
+ kcf_mech_entry_t *me_tab;
+
+ if (kcf_mech_hash)
+ mod_hash_destroy_hash(kcf_mech_hash);
+
+ mutex_destroy(&kcf_mech_tabs_lock);
+
+ for (class = KCF_FIRST_OPSCLASS; class <= KCF_LAST_OPSCLASS; class++) {
+ max = kcf_mech_tabs_tab[class].met_size;
+ me_tab = kcf_mech_tabs_tab[class].met_tab;
+ for (i = 0; i < max; i++)
+ mutex_destroy(&(me_tab[i].me_mutex));
+ }
+}
+
+/*
+ * kcf_init_mech_tabs()
+ *
+ * Called by the misc/kcf's _init() routine to initialize the tables
+ * of mech_entry's.
+ */
+void
+kcf_init_mech_tabs(void)
+{
+ int i, max;
+ kcf_ops_class_t class;
+ kcf_mech_entry_t *me_tab;
+
+ /* Initializes the mutex locks. */
+
+ mutex_init(&kcf_mech_tabs_lock, NULL, MUTEX_DEFAULT, NULL);
+
+ /* Then the pre-defined mechanism entries */
+
+ /* Two digests */
+ (void) strncpy(kcf_digest_mechs_tab[0].me_name, SUN_CKM_MD5,
+ CRYPTO_MAX_MECH_NAME);
+ kcf_digest_mechs_tab[0].me_threshold = kcf_md5_threshold;
+
+ (void) strncpy(kcf_digest_mechs_tab[1].me_name, SUN_CKM_SHA1,
+ CRYPTO_MAX_MECH_NAME);
+ kcf_digest_mechs_tab[1].me_threshold = kcf_sha1_threshold;
+
+ /* The symmetric ciphers in various modes */
+ (void) strncpy(kcf_cipher_mechs_tab[0].me_name, SUN_CKM_DES_CBC,
+ CRYPTO_MAX_MECH_NAME);
+ kcf_cipher_mechs_tab[0].me_threshold = kcf_des_threshold;
+
+ (void) strncpy(kcf_cipher_mechs_tab[1].me_name, SUN_CKM_DES3_CBC,
+ CRYPTO_MAX_MECH_NAME);
+ kcf_cipher_mechs_tab[1].me_threshold = kcf_des3_threshold;
+
+ (void) strncpy(kcf_cipher_mechs_tab[2].me_name, SUN_CKM_DES_ECB,
+ CRYPTO_MAX_MECH_NAME);
+ kcf_cipher_mechs_tab[2].me_threshold = kcf_des_threshold;
+
+ (void) strncpy(kcf_cipher_mechs_tab[3].me_name, SUN_CKM_DES3_ECB,
+ CRYPTO_MAX_MECH_NAME);
+ kcf_cipher_mechs_tab[3].me_threshold = kcf_des3_threshold;
+
+ (void) strncpy(kcf_cipher_mechs_tab[4].me_name, SUN_CKM_BLOWFISH_CBC,
+ CRYPTO_MAX_MECH_NAME);
+ kcf_cipher_mechs_tab[4].me_threshold = kcf_bf_threshold;
+
+ (void) strncpy(kcf_cipher_mechs_tab[5].me_name, SUN_CKM_BLOWFISH_ECB,
+ CRYPTO_MAX_MECH_NAME);
+ kcf_cipher_mechs_tab[5].me_threshold = kcf_bf_threshold;
+
+ (void) strncpy(kcf_cipher_mechs_tab[6].me_name, SUN_CKM_AES_CBC,
+ CRYPTO_MAX_MECH_NAME);
+ kcf_cipher_mechs_tab[6].me_threshold = kcf_aes_threshold;
+
+ (void) strncpy(kcf_cipher_mechs_tab[7].me_name, SUN_CKM_AES_ECB,
+ CRYPTO_MAX_MECH_NAME);
+ kcf_cipher_mechs_tab[7].me_threshold = kcf_aes_threshold;
+
+ (void) strncpy(kcf_cipher_mechs_tab[8].me_name, SUN_CKM_RC4,
+ CRYPTO_MAX_MECH_NAME);
+ kcf_cipher_mechs_tab[8].me_threshold = kcf_rc4_threshold;
+
+
+ /* 4 HMACs */
+ (void) strncpy(kcf_mac_mechs_tab[0].me_name, SUN_CKM_MD5_HMAC,
+ CRYPTO_MAX_MECH_NAME);
+ kcf_mac_mechs_tab[0].me_threshold = kcf_md5_threshold;
+
+ (void) strncpy(kcf_mac_mechs_tab[1].me_name, SUN_CKM_MD5_HMAC_GENERAL,
+ CRYPTO_MAX_MECH_NAME);
+ kcf_mac_mechs_tab[1].me_threshold = kcf_md5_threshold;
+
+ (void) strncpy(kcf_mac_mechs_tab[2].me_name, SUN_CKM_SHA1_HMAC,
+ CRYPTO_MAX_MECH_NAME);
+ kcf_mac_mechs_tab[2].me_threshold = kcf_sha1_threshold;
+
+ (void) strncpy(kcf_mac_mechs_tab[3].me_name, SUN_CKM_SHA1_HMAC_GENERAL,
+ CRYPTO_MAX_MECH_NAME);
+ kcf_mac_mechs_tab[3].me_threshold = kcf_sha1_threshold;
+
+
+ /* 1 random number generation pseudo mechanism */
+ (void) strncpy(kcf_misc_mechs_tab[0].me_name, SUN_RANDOM,
+ CRYPTO_MAX_MECH_NAME);
+
+ kcf_mech_hash = mod_hash_create_strhash_nodtr("kcf mech2id hash",
+ kcf_mech_hash_size, mod_hash_null_valdtor);
+
+ for (class = KCF_FIRST_OPSCLASS; class <= KCF_LAST_OPSCLASS; class++) {
+ max = kcf_mech_tabs_tab[class].met_size;
+ me_tab = kcf_mech_tabs_tab[class].met_tab;
+ for (i = 0; i < max; i++) {
+ mutex_init(&(me_tab[i].me_mutex), NULL,
+ MUTEX_DEFAULT, NULL);
+ if (me_tab[i].me_name[0] != 0) {
+ me_tab[i].me_mechid = KCF_MECHID(class, i);
+ (void) mod_hash_insert(kcf_mech_hash,
+ (mod_hash_key_t)me_tab[i].me_name,
+ (mod_hash_val_t)&(me_tab[i].me_mechid));
+ }
+ }
+ }
+}
+
+/*
+ * kcf_create_mech_entry()
+ *
+ * Arguments:
+ * . The class of mechanism.
+ * . the name of the new mechanism.
+ *
+ * Description:
+ * Creates a new mech_entry for a mechanism not yet known to the
+ * framework.
+ * This routine is called by kcf_add_mech_provider, which is
+ * in turn invoked for each mechanism supported by a provider.
+ * The 'class' argument depends on the crypto_func_group_t bitmask
+ * in the registering provider's mech_info struct for this mechanism.
+ * When there is ambiguity in the mapping between the crypto_func_group_t
+ * and a class (dual ops, ...), KCF_MISC_CLASS should be used.
+ *
+ * Context:
+ * User context only.
+ *
+ * Returns:
+ * KCF_INVALID_MECH_CLASS or KCF_INVALID_MECH_NAME if the class or
+ * the mechname is bogus.
+ * KCF_MECH_TAB_FULL when there is no room left in the mech. tabs.
+ * KCF_SUCCESS otherwise.
+ */
+static int
+kcf_create_mech_entry(kcf_ops_class_t class, char *mechname)
+{
+ crypto_mech_type_t mt;
+ kcf_mech_entry_t *me_tab;
+ int i = 0, size;
+
+ if ((class < KCF_FIRST_OPSCLASS) || (class > KCF_LAST_OPSCLASS))
+ return (KCF_INVALID_MECH_CLASS);
+
+ if ((mechname == NULL) || (mechname[0] == 0))
+ return (KCF_INVALID_MECH_NAME);
+ /*
+ * First check if the mechanism is already in one of the tables.
+ * The mech_entry could be in another class.
+ */
+ mutex_enter(&kcf_mech_tabs_lock);
+ mt = kcf_mech_hash_find(mechname);
+ if (mt != CRYPTO_MECH_INVALID) {
+		/* Nothing to do, regardless of the suggested class. */
+ mutex_exit(&kcf_mech_tabs_lock);
+ return (KCF_SUCCESS);
+ }
+ /* Now take the next unused mech entry in the class's tab */
+ me_tab = kcf_mech_tabs_tab[class].met_tab;
+ size = kcf_mech_tabs_tab[class].met_size;
+
+ while (i < size) {
+ mutex_enter(&(me_tab[i].me_mutex));
+ if (me_tab[i].me_name[0] == 0) {
+ /* Found an empty spot */
+ (void) strlcpy(me_tab[i].me_name, mechname,
+ CRYPTO_MAX_MECH_NAME);
+ me_tab[i].me_name[CRYPTO_MAX_MECH_NAME-1] = '\0';
+ me_tab[i].me_mechid = KCF_MECHID(class, i);
+ /*
+ * No a-priori information about the new mechanism, so
+ * the threshold is set to zero.
+ */
+ me_tab[i].me_threshold = 0;
+
+ mutex_exit(&(me_tab[i].me_mutex));
+ /* Add the new mechanism to the hash table */
+ (void) mod_hash_insert(kcf_mech_hash,
+ (mod_hash_key_t)me_tab[i].me_name,
+ (mod_hash_val_t)&(me_tab[i].me_mechid));
+ break;
+ }
+ mutex_exit(&(me_tab[i].me_mutex));
+ i++;
+ }
+
+ mutex_exit(&kcf_mech_tabs_lock);
+
+ if (i == size) {
+ return (KCF_MECH_TAB_FULL);
+ }
+
+ return (KCF_SUCCESS);
+}
+
+/*
+ * kcf_add_mech_provider()
+ *
+ * Arguments:
+ * . An index into the provider's mechanism array
+ * . A pointer to the provider descriptor
+ * . Storage for a pointer to the kcf_prov_mech_desc_t that was added.
+ *
+ * Description:
+ * Adds a new provider of a mechanism to the mechanism's mech_entry
+ * chain.
+ *
+ * Context:
+ * User context only.
+ *
+ * Returns:
+ * KCF_SUCCESS on success.
+ * KCF_MECH_TAB_FULL or another KCF_* error code otherwise.
+ */
+int
+kcf_add_mech_provider(short mech_indx,
+ kcf_provider_desc_t *prov_desc, kcf_prov_mech_desc_t **pmdpp)
+{
+ int error;
+ kcf_mech_entry_t *mech_entry = NULL;
+ crypto_mech_info_t *mech_info;
+ crypto_mech_type_t kcf_mech_type, mt;
+ kcf_prov_mech_desc_t *prov_mech, *prov_mech2;
+ crypto_func_group_t simple_fg_mask, dual_fg_mask;
+ crypto_mech_info_t *dmi;
+ crypto_mech_info_list_t *mil, *mil2;
+ kcf_mech_entry_t *me;
+ int i;
+
+ ASSERT(prov_desc->pd_prov_type != CRYPTO_LOGICAL_PROVIDER);
+
+ mech_info = &prov_desc->pd_mechanisms[mech_indx];
+
+ /*
+ * A mechanism belongs to exactly one mechanism table.
+ * Find the class corresponding to the function group flag of
+ * the mechanism.
+ */
+ kcf_mech_type = kcf_mech_hash_find(mech_info->cm_mech_name);
+ if (kcf_mech_type == CRYPTO_MECH_INVALID) {
+ crypto_func_group_t fg = mech_info->cm_func_group_mask;
+ kcf_ops_class_t class;
+
+ if (fg & CRYPTO_FG_DIGEST || fg & CRYPTO_FG_DIGEST_ATOMIC)
+ class = KCF_DIGEST_CLASS;
+ else if (fg & CRYPTO_FG_ENCRYPT || fg & CRYPTO_FG_DECRYPT ||
+ fg & CRYPTO_FG_ENCRYPT_ATOMIC ||
+ fg & CRYPTO_FG_DECRYPT_ATOMIC)
+ class = KCF_CIPHER_CLASS;
+ else if (fg & CRYPTO_FG_MAC || fg & CRYPTO_FG_MAC_ATOMIC)
+ class = KCF_MAC_CLASS;
+ else if (fg & CRYPTO_FG_SIGN || fg & CRYPTO_FG_VERIFY ||
+ fg & CRYPTO_FG_SIGN_ATOMIC ||
+ fg & CRYPTO_FG_VERIFY_ATOMIC ||
+ fg & CRYPTO_FG_SIGN_RECOVER ||
+ fg & CRYPTO_FG_VERIFY_RECOVER)
+ class = KCF_SIGN_CLASS;
+ else if (fg & CRYPTO_FG_GENERATE ||
+ fg & CRYPTO_FG_GENERATE_KEY_PAIR ||
+ fg & CRYPTO_FG_WRAP || fg & CRYPTO_FG_UNWRAP ||
+ fg & CRYPTO_FG_DERIVE)
+ class = KCF_KEYOPS_CLASS;
+ else
+ class = KCF_MISC_CLASS;
+
+ /*
+ * Attempt to create a new mech_entry for the specified
+ * mechanism. kcf_create_mech_entry() can handle the case
+ * where such an entry already exists.
+ */
+ if ((error = kcf_create_mech_entry(class,
+ mech_info->cm_mech_name)) != KCF_SUCCESS) {
+ return (error);
+ }
+ /* get the KCF mech type that was assigned to the mechanism */
+ kcf_mech_type = kcf_mech_hash_find(mech_info->cm_mech_name);
+ ASSERT(kcf_mech_type != CRYPTO_MECH_INVALID);
+ }
+
+ error = kcf_get_mech_entry(kcf_mech_type, &mech_entry);
+ ASSERT(error == KCF_SUCCESS);
+
+ /* allocate and initialize new kcf_prov_mech_desc */
+ prov_mech = kmem_zalloc(sizeof (kcf_prov_mech_desc_t), KM_SLEEP);
+ bcopy(mech_info, &prov_mech->pm_mech_info, sizeof (crypto_mech_info_t));
+ prov_mech->pm_prov_desc = prov_desc;
+ prov_desc->pd_mech_indx[KCF_MECH2CLASS(kcf_mech_type)]
+ [KCF_MECH2INDEX(kcf_mech_type)] = mech_indx;
+
+ KCF_PROV_REFHOLD(prov_desc);
+ KCF_PROV_IREFHOLD(prov_desc);
+
+ dual_fg_mask = mech_info->cm_func_group_mask & CRYPTO_FG_DUAL_MASK;
+
+ if (dual_fg_mask == ((crypto_func_group_t)0))
+ goto add_entry;
+
+ simple_fg_mask = (mech_info->cm_func_group_mask &
+ CRYPTO_FG_SIMPLEOP_MASK) | CRYPTO_FG_RANDOM;
+
+ for (i = 0; i < prov_desc->pd_mech_list_count; i++) {
+ dmi = &prov_desc->pd_mechanisms[i];
+
+ /* skip self */
+ if (dmi->cm_mech_number == mech_info->cm_mech_number)
+ continue;
+
+ /* skip if not a dual operation mechanism */
+ if (!(dmi->cm_func_group_mask & dual_fg_mask) ||
+ (dmi->cm_func_group_mask & simple_fg_mask))
+ continue;
+
+ mt = kcf_mech_hash_find(dmi->cm_mech_name);
+ if (mt == CRYPTO_MECH_INVALID)
+ continue;
+
+ if (kcf_get_mech_entry(mt, &me) != KCF_SUCCESS)
+ continue;
+
+ mil = kmem_zalloc(sizeof (*mil), KM_SLEEP);
+ mil2 = kmem_zalloc(sizeof (*mil2), KM_SLEEP);
+
+ /*
+ * Ignore hard-coded entries in the mech table
+ * if the provider hasn't registered.
+		 * if no provider has registered for them.
+ mutex_enter(&me->me_mutex);
+ if (me->me_hw_prov_chain == NULL && me->me_sw_prov == NULL) {
+ mutex_exit(&me->me_mutex);
+ kmem_free(mil, sizeof (*mil));
+ kmem_free(mil2, sizeof (*mil2));
+ continue;
+ }
+
+ /*
+ * Add other dual mechanisms that have registered
+ * with the framework to this mechanism's
+ * cross-reference list.
+ */
+ mil->ml_mech_info = *dmi; /* struct assignment */
+ mil->ml_kcf_mechid = mt;
+
+ /* add to head of list */
+ mil->ml_next = prov_mech->pm_mi_list;
+ prov_mech->pm_mi_list = mil;
+
+ if (prov_desc->pd_prov_type == CRYPTO_HW_PROVIDER)
+ prov_mech2 = me->me_hw_prov_chain;
+ else
+ prov_mech2 = me->me_sw_prov;
+
+ if (prov_mech2 == NULL) {
+ kmem_free(mil2, sizeof (*mil2));
+ mutex_exit(&me->me_mutex);
+ continue;
+ }
+
+ /*
+ * Update all other cross-reference lists by
+ * adding this new mechanism.
+ */
+ while (prov_mech2 != NULL) {
+ if (prov_mech2->pm_prov_desc == prov_desc) {
+ /* struct assignment */
+ mil2->ml_mech_info = *mech_info;
+ mil2->ml_kcf_mechid = kcf_mech_type;
+
+ /* add to head of list */
+ mil2->ml_next = prov_mech2->pm_mi_list;
+ prov_mech2->pm_mi_list = mil2;
+ break;
+ }
+ prov_mech2 = prov_mech2->pm_next;
+ }
+ if (prov_mech2 == NULL)
+ kmem_free(mil2, sizeof (*mil2));
+
+ mutex_exit(&me->me_mutex);
+ }
+
+add_entry:
+ /*
+ * Add new kcf_prov_mech_desc at the front of HW providers
+ * chain.
+ */
+ switch (prov_desc->pd_prov_type) {
+
+ case CRYPTO_HW_PROVIDER:
+ mutex_enter(&mech_entry->me_mutex);
+ prov_mech->pm_me = mech_entry;
+ prov_mech->pm_next = mech_entry->me_hw_prov_chain;
+ mech_entry->me_hw_prov_chain = prov_mech;
+ mech_entry->me_num_hwprov++;
+ mutex_exit(&mech_entry->me_mutex);
+ break;
+
+ case CRYPTO_SW_PROVIDER:
+ mutex_enter(&mech_entry->me_mutex);
+ if (mech_entry->me_sw_prov != NULL) {
+ /*
+ * There is already a SW provider for this mechanism.
+ * Since we allow only one SW provider per mechanism,
+ * report this condition.
+ */
+ cmn_err(CE_WARN, "The cryptographic software provider "
+ "\"%s\" will not be used for %s. The provider "
+ "\"%s\" will be used for this mechanism "
+ "instead.", prov_desc->pd_description,
+ mech_info->cm_mech_name,
+ mech_entry->me_sw_prov->pm_prov_desc->
+ pd_description);
+ KCF_PROV_REFRELE(prov_desc);
+ kmem_free(prov_mech, sizeof (kcf_prov_mech_desc_t));
+ prov_mech = NULL;
+ } else {
+ /*
+ * Set the provider as the software provider for
+ * this mechanism.
+ */
+ mech_entry->me_sw_prov = prov_mech;
+
+ /* We'll wrap around after 4 billion registrations! */
+ mech_entry->me_gen_swprov = kcf_gen_swprov++;
+ }
+ mutex_exit(&mech_entry->me_mutex);
+ break;
+ default:
+ break;
+ }
+
+ *pmdpp = prov_mech;
+
+ return (KCF_SUCCESS);
+}
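+
+/*
+ * Illustrative sketch only (not part of the imported source): the
+ * registration path that drives kcf_add_mech_provider(). As noted above
+ * kcf_create_mech_entry(), it is invoked once per mechanism supported by a
+ * registering provider; the exact loop below is an assumption.
+ *
+ *	kcf_prov_mech_desc_t *pmd;
+ *	int i;
+ *
+ *	for (i = 0; i < prov_desc->pd_mech_list_count; i++) {
+ *		if (kcf_add_mech_provider(i, prov_desc, &pmd) != KCF_SUCCESS)
+ *			break;
+ *	}
+ *	(on provider removal, each mechanism is torn down again with
+ *	kcf_remove_mech_provider())
+ */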
+
+/*
+ * kcf_remove_mech_provider()
+ *
+ * Arguments:
+ * . mech_name: the name of the mechanism.
+ * . prov_desc: The provider descriptor
+ *
+ * Description:
+ * Removes a provider from chain of provider descriptors.
+ * The provider is made unavailable to kernel consumers for the specified
+ * mechanism.
+ *
+ * Context:
+ * User context only.
+ */
+void
+kcf_remove_mech_provider(char *mech_name, kcf_provider_desc_t *prov_desc)
+{
+ crypto_mech_type_t mech_type;
+ kcf_prov_mech_desc_t *prov_mech = NULL, *prov_chain;
+ kcf_prov_mech_desc_t **prev_entry_next;
+ kcf_mech_entry_t *mech_entry;
+ crypto_mech_info_list_t *mil, *mil2, *next, **prev_next;
+
+ ASSERT(prov_desc->pd_prov_type != CRYPTO_LOGICAL_PROVIDER);
+
+ /* get the KCF mech type that was assigned to the mechanism */
+ if ((mech_type = kcf_mech_hash_find(mech_name)) ==
+ CRYPTO_MECH_INVALID) {
+ /*
+ * Provider was not allowed for this mech due to policy or
+ * configuration.
+ */
+ return;
+ }
+
+ /* get a ptr to the mech_entry that was created */
+ if (kcf_get_mech_entry(mech_type, &mech_entry) != KCF_SUCCESS) {
+ /*
+ * Provider was not allowed for this mech due to policy or
+ * configuration.
+ */
+ return;
+ }
+
+ mutex_enter(&mech_entry->me_mutex);
+
+ switch (prov_desc->pd_prov_type) {
+
+ case CRYPTO_HW_PROVIDER:
+ /* find the provider in the mech_entry chain */
+ prev_entry_next = &mech_entry->me_hw_prov_chain;
+ prov_mech = mech_entry->me_hw_prov_chain;
+ while (prov_mech != NULL &&
+ prov_mech->pm_prov_desc != prov_desc) {
+ prev_entry_next = &prov_mech->pm_next;
+ prov_mech = prov_mech->pm_next;
+ }
+
+ if (prov_mech == NULL) {
+ /* entry not found, simply return */
+ mutex_exit(&mech_entry->me_mutex);
+ return;
+ }
+
+ /* remove provider entry from mech_entry chain */
+ *prev_entry_next = prov_mech->pm_next;
+ ASSERT(mech_entry->me_num_hwprov > 0);
+ mech_entry->me_num_hwprov--;
+ break;
+
+ case CRYPTO_SW_PROVIDER:
+ if (mech_entry->me_sw_prov == NULL ||
+ mech_entry->me_sw_prov->pm_prov_desc != prov_desc) {
+ /* not the software provider for this mechanism */
+ mutex_exit(&mech_entry->me_mutex);
+ return;
+ }
+ prov_mech = mech_entry->me_sw_prov;
+ mech_entry->me_sw_prov = NULL;
+ break;
+ default:
+ /* unexpected crypto_provider_type_t */
+ mutex_exit(&mech_entry->me_mutex);
+ return;
+ }
+
+ mutex_exit(&mech_entry->me_mutex);
+
+ /* Free the dual ops cross-reference lists */
+ mil = prov_mech->pm_mi_list;
+ while (mil != NULL) {
+ next = mil->ml_next;
+ if (kcf_get_mech_entry(mil->ml_kcf_mechid,
+ &mech_entry) != KCF_SUCCESS) {
+ mil = next;
+ continue;
+ }
+
+ mutex_enter(&mech_entry->me_mutex);
+ if (prov_desc->pd_prov_type == CRYPTO_HW_PROVIDER)
+ prov_chain = mech_entry->me_hw_prov_chain;
+ else
+ prov_chain = mech_entry->me_sw_prov;
+
+ while (prov_chain != NULL) {
+ if (prov_chain->pm_prov_desc == prov_desc) {
+ prev_next = &prov_chain->pm_mi_list;
+ mil2 = prov_chain->pm_mi_list;
+ while (mil2 != NULL &&
+ mil2->ml_kcf_mechid != mech_type) {
+ prev_next = &mil2->ml_next;
+ mil2 = mil2->ml_next;
+ }
+ if (mil2 != NULL) {
+ *prev_next = mil2->ml_next;
+ kmem_free(mil2, sizeof (*mil2));
+ }
+ break;
+ }
+ prov_chain = prov_chain->pm_next;
+ }
+
+ mutex_exit(&mech_entry->me_mutex);
+ kmem_free(mil, sizeof (crypto_mech_info_list_t));
+ mil = next;
+ }
+
+ /* free entry */
+ KCF_PROV_REFRELE(prov_mech->pm_prov_desc);
+ KCF_PROV_IREFRELE(prov_mech->pm_prov_desc);
+ kmem_free(prov_mech, sizeof (kcf_prov_mech_desc_t));
+}
+
+/*
+ * kcf_get_mech_entry()
+ *
+ * Arguments:
+ * . The framework mechanism type
+ * . Storage for the mechanism entry
+ *
+ * Description:
+ * Retrieves the mechanism entry for the mech.
+ *
+ * Context:
+ * User and interrupt contexts.
+ *
+ * Returns:
+ * KCF_MECHANISM_XXX appropriate error code.
+ * KCF_SUCCESS otherwise.
+ */
+int
+kcf_get_mech_entry(crypto_mech_type_t mech_type, kcf_mech_entry_t **mep)
+{
+ kcf_ops_class_t class;
+ int index;
+ kcf_mech_entry_tab_t *me_tab;
+
+ ASSERT(mep != NULL);
+
+ class = KCF_MECH2CLASS(mech_type);
+
+ if ((class < KCF_FIRST_OPSCLASS) || (class > KCF_LAST_OPSCLASS)) {
+ /* the caller won't need to know it's an invalid class */
+ return (KCF_INVALID_MECH_NUMBER);
+ }
+
+ me_tab = &kcf_mech_tabs_tab[class];
+ index = KCF_MECH2INDEX(mech_type);
+
+ if ((index < 0) || (index >= me_tab->met_size)) {
+ return (KCF_INVALID_MECH_NUMBER);
+ }
+
+ *mep = &((me_tab->met_tab)[index]);
+
+ return (KCF_SUCCESS);
+}
+
+/* CURRENTLY UNSUPPORTED: attempting to load the module if it isn't found */
+/*
+ * Lookup the hash table for an entry that matches the mechname.
+ * If there are no hardware or software providers for the mechanism,
+ * but there is an unloaded software provider, this routine will attempt
+ * to load it.
+ *
+ * If the MOD_NOAUTOUNLOAD flag is not set, a software provider is
+ * in constant danger of being unloaded. For consumers that call
+ * crypto_mech2id() only once, the provider will not be reloaded
+ * if it becomes unloaded. If a provider gets loaded elsewhere
+ * without the MOD_NOAUTOUNLOAD flag being set, we set it now.
+ */
+crypto_mech_type_t
+crypto_mech2id_common(char *mechname, boolean_t load_module)
+{
+ crypto_mech_type_t mt = kcf_mech_hash_find(mechname);
+ return (mt);
+}
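+
+/*
+ * Illustrative sketch only (not part of the imported source): resolving a
+ * mechanism name to its framework id. In this port the load_module argument
+ * is ignored and the call reduces to the hash lookup above.
+ *
+ *	crypto_mech_type_t mt;
+ *
+ *	mt = crypto_mech2id_common(SUN_CKM_AES_CBC, B_TRUE);
+ *	if (mt == CRYPTO_MECH_INVALID)
+ *		(the mechanism is not known to the framework)
+ */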
diff --git a/sys/contrib/openzfs/module/icp/core/kcf_prov_lib.c b/sys/contrib/openzfs/module/icp/core/kcf_prov_lib.c
new file mode 100644
index 000000000000..1b115d976232
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/core/kcf_prov_lib.c
@@ -0,0 +1,227 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/zfs_context.h>
+#include <modes/modes.h>
+#include <sys/crypto/common.h>
+#include <sys/crypto/impl.h>
+
+/*
+ * Utility routine to apply the command, 'cmd', to the data in the
+ * uio structure, copying to or from the flat buffer 'buf'. Only the
+ * copy and compare commands are handled here; the digest commands
+ * return CRYPTO_ARGUMENTS_BAD, and the digest_ctx and update
+ * arguments are unused.
+ */
+int
+crypto_uio_data(crypto_data_t *data, uchar_t *buf, int len, cmd_type_t cmd,
+ void *digest_ctx, void (*update)(void))
+{
+ zfs_uio_t *uiop = data->cd_uio;
+ off_t offset = data->cd_offset;
+ size_t length = len;
+ uint_t vec_idx;
+ size_t cur_len;
+ uchar_t *datap;
+
+ ASSERT(data->cd_format == CRYPTO_DATA_UIO);
+ if (zfs_uio_segflg(uiop) != UIO_SYSSPACE) {
+ return (CRYPTO_ARGUMENTS_BAD);
+ }
+
+ /*
+ * Jump to the first iovec containing data to be
+ * processed.
+ */
+ offset = zfs_uio_index_at_offset(uiop, offset, &vec_idx);
+
+ if (vec_idx == zfs_uio_iovcnt(uiop) && length > 0) {
+ /*
+ * The caller specified an offset that is larger than
+ * the total size of the buffers it provided.
+ */
+ return (CRYPTO_DATA_LEN_RANGE);
+ }
+
+ while (vec_idx < zfs_uio_iovcnt(uiop) && length > 0) {
+ cur_len = MIN(zfs_uio_iovlen(uiop, vec_idx) -
+ offset, length);
+
+ datap = (uchar_t *)(zfs_uio_iovbase(uiop, vec_idx) + offset);
+ switch (cmd) {
+ case COPY_FROM_DATA:
+ bcopy(datap, buf, cur_len);
+ buf += cur_len;
+ break;
+ case COPY_TO_DATA:
+ bcopy(buf, datap, cur_len);
+ buf += cur_len;
+ break;
+ case COMPARE_TO_DATA:
+ if (bcmp(datap, buf, cur_len))
+ return (CRYPTO_SIGNATURE_INVALID);
+ buf += cur_len;
+ break;
+ case MD5_DIGEST_DATA:
+ case SHA1_DIGEST_DATA:
+ case SHA2_DIGEST_DATA:
+ case GHASH_DATA:
+ return (CRYPTO_ARGUMENTS_BAD);
+ }
+
+ length -= cur_len;
+ vec_idx++;
+ offset = 0;
+ }
+
+ if (vec_idx == zfs_uio_iovcnt(uiop) && length > 0) {
+ /*
+ * The end of the specified iovecs was reached but
+ * the requested length could not be processed.
+ */
+ switch (cmd) {
+ case COPY_TO_DATA:
+ data->cd_length = len;
+ return (CRYPTO_BUFFER_TOO_SMALL);
+ default:
+ return (CRYPTO_DATA_LEN_RANGE);
+ }
+ }
+
+ return (CRYPTO_SUCCESS);
+}
+
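+/*
+ * Copy 'len' bytes from 'buf' into the output crypto_data, honoring its
+ * format (raw iovec or uio). When the destination is too short, cd_length
+ * is set to the required length and CRYPTO_BUFFER_TOO_SMALL is returned.
+ */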
+int
+crypto_put_output_data(uchar_t *buf, crypto_data_t *output, int len)
+{
+ switch (output->cd_format) {
+ case CRYPTO_DATA_RAW:
+ if (output->cd_raw.iov_len < len) {
+ output->cd_length = len;
+ return (CRYPTO_BUFFER_TOO_SMALL);
+ }
+ bcopy(buf, (uchar_t *)(output->cd_raw.iov_base +
+ output->cd_offset), len);
+ break;
+
+ case CRYPTO_DATA_UIO:
+ return (crypto_uio_data(output, buf, len,
+ COPY_TO_DATA, NULL, NULL));
+ default:
+ return (CRYPTO_ARGUMENTS_BAD);
+ }
+
+ return (CRYPTO_SUCCESS);
+}
+
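+/*
+ * Apply 'cipher' to a raw (single iovec) crypto_data input. When
+ * cd_miscdata is present it is first copied into the context IV via
+ * 'copy_block'. Inputs whose iovec is shorter than cd_length are
+ * rejected.
+ */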
+int
+crypto_update_iov(void *ctx, crypto_data_t *input, crypto_data_t *output,
+ int (*cipher)(void *, caddr_t, size_t, crypto_data_t *),
+ void (*copy_block)(uint8_t *, uint64_t *))
+{
+ common_ctx_t *common_ctx = ctx;
+ int rv;
+
+ ASSERT(input != output);
+ if (input->cd_miscdata != NULL) {
+ copy_block((uint8_t *)input->cd_miscdata,
+ &common_ctx->cc_iv[0]);
+ }
+
+ if (input->cd_raw.iov_len < input->cd_length)
+ return (CRYPTO_ARGUMENTS_BAD);
+
+ rv = (cipher)(ctx, input->cd_raw.iov_base + input->cd_offset,
+ input->cd_length, output);
+
+ return (rv);
+}
+
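+/*
+ * Same as crypto_update_iov(), but for a uio input: walk the iovecs and
+ * feed each segment to 'cipher' in turn. Only UIO_SYSSPACE uios are
+ * accepted.
+ */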
+int
+crypto_update_uio(void *ctx, crypto_data_t *input, crypto_data_t *output,
+ int (*cipher)(void *, caddr_t, size_t, crypto_data_t *),
+ void (*copy_block)(uint8_t *, uint64_t *))
+{
+ common_ctx_t *common_ctx = ctx;
+ zfs_uio_t *uiop = input->cd_uio;
+ off_t offset = input->cd_offset;
+ size_t length = input->cd_length;
+ uint_t vec_idx;
+ size_t cur_len;
+
+ ASSERT(input != output);
+ if (input->cd_miscdata != NULL) {
+ copy_block((uint8_t *)input->cd_miscdata,
+ &common_ctx->cc_iv[0]);
+ }
+
+ if (zfs_uio_segflg(input->cd_uio) != UIO_SYSSPACE) {
+ return (CRYPTO_ARGUMENTS_BAD);
+ }
+
+ /*
+ * Jump to the first iovec containing data to be
+ * processed.
+ */
+ offset = zfs_uio_index_at_offset(uiop, offset, &vec_idx);
+ if (vec_idx == zfs_uio_iovcnt(uiop) && length > 0) {
+ /*
+ * The caller specified an offset that is larger than the
+ * total size of the buffers it provided.
+ */
+ return (CRYPTO_DATA_LEN_RANGE);
+ }
+
+ /*
+ * Now process the iovecs.
+ */
+ while (vec_idx < zfs_uio_iovcnt(uiop) && length > 0) {
+ cur_len = MIN(zfs_uio_iovlen(uiop, vec_idx) -
+ offset, length);
+
+ int rv = (cipher)(ctx, zfs_uio_iovbase(uiop, vec_idx) + offset,
+ cur_len, output);
+
+ if (rv != CRYPTO_SUCCESS) {
+ return (rv);
+ }
+ length -= cur_len;
+ vec_idx++;
+ offset = 0;
+ }
+
+ if (vec_idx == zfs_uio_iovcnt(uiop) && length > 0) {
+ /*
+ * The end of the specified iovecs was reached but the
+ * requested length could not be processed, i.e. the caller
+ * asked to process more data than it provided.
+ */
+
+ return (CRYPTO_DATA_LEN_RANGE);
+ }
+
+ return (CRYPTO_SUCCESS);
+}
diff --git a/sys/contrib/openzfs/module/icp/core/kcf_prov_tabs.c b/sys/contrib/openzfs/module/icp/core/kcf_prov_tabs.c
new file mode 100644
index 000000000000..94e6937bcd76
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/core/kcf_prov_tabs.c
@@ -0,0 +1,645 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * This file is part of the core Kernel Cryptographic Framework.
+ * It implements the management of tables of Providers. Entries are
+ * added and removed when cryptographic providers register with
+ * and unregister from the framework, respectively. The KCF scheduler
+ * and ioctl pseudo driver use these tables to obtain the list
+ * of available providers.
+ *
+ * The provider table is indexed by crypto_provider_id_t. Each
+ * element of the table contains a pointer to a provider descriptor,
+ * or NULL if the entry is free.
+ *
+ * This file also implements helper functions to allocate and free
+ * provider descriptors.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/crypto/common.h>
+#include <sys/crypto/impl.h>
+#include <sys/crypto/sched_impl.h>
+#include <sys/crypto/spi.h>
+
+#define KCF_MAX_PROVIDERS 512 /* max number of providers */
+
+/*
+ * Prov_tab is an array of providers which is updated when
+ * a crypto provider registers with kcf. The provider calls the
+ * SPI routine, crypto_register_provider(), which in turn calls
+ * kcf_prov_tab_add_provider().
+ *
+ * A provider unregisters by calling crypto_unregister_provider()
+ * which triggers the removal of the prov_tab entry.
+ * It also calls kcf_remove_mech_provider().
+ *
+ * prov_tab entries are not updated from kcf.conf or by cryptoadm(1M).
+ */
+static kcf_provider_desc_t **prov_tab = NULL;
+static kmutex_t prov_tab_mutex; /* ensure exclusive access to the table */
+static uint_t prov_tab_num = 0; /* number of providers in table */
+static uint_t prov_tab_max = KCF_MAX_PROVIDERS;
+
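+/*
+ * Counterpart to kcf_prov_tab_init(): destroy the table mutex and free
+ * the providers table if it was allocated.
+ */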
+void
+kcf_prov_tab_destroy(void)
+{
+ mutex_destroy(&prov_tab_mutex);
+
+ if (prov_tab)
+ kmem_free(prov_tab, prov_tab_max *
+ sizeof (kcf_provider_desc_t *));
+}
+
+/*
+ * Initialize a mutex and the KCF providers table, prov_tab.
+ * The providers table is dynamically allocated with prov_tab_max entries.
+ * Called from kcf module _init().
+ */
+void
+kcf_prov_tab_init(void)
+{
+ mutex_init(&prov_tab_mutex, NULL, MUTEX_DEFAULT, NULL);
+
+ prov_tab = kmem_zalloc(prov_tab_max * sizeof (kcf_provider_desc_t *),
+ KM_SLEEP);
+}
+
+/*
+ * Add a provider to the provider table. If no free entry can be found
+ * for the new provider, returns CRYPTO_HOST_MEMORY. Otherwise, add
+ * the provider to the table, initialize the pd_prov_id field
+ * of the specified provider descriptor to the index in that table,
+ * and return CRYPTO_SUCCESS. Note that a REFHOLD is done on the
+ * provider while it is pointed to by a table entry.
+ */
+int
+kcf_prov_tab_add_provider(kcf_provider_desc_t *prov_desc)
+{
+ uint_t i;
+
+ ASSERT(prov_tab != NULL);
+
+ mutex_enter(&prov_tab_mutex);
+
+ /* find free slot in providers table */
+ for (i = 1; i < KCF_MAX_PROVIDERS && prov_tab[i] != NULL; i++)
+ ;
+ if (i == KCF_MAX_PROVIDERS) {
+ /* ran out of provider entries */
+ mutex_exit(&prov_tab_mutex);
+ cmn_err(CE_WARN, "out of providers entries");
+ return (CRYPTO_HOST_MEMORY);
+ }
+
+ /* initialize entry */
+ prov_tab[i] = prov_desc;
+ KCF_PROV_REFHOLD(prov_desc);
+ KCF_PROV_IREFHOLD(prov_desc);
+ prov_tab_num++;
+
+ mutex_exit(&prov_tab_mutex);
+
+ /* update provider descriptor */
+ prov_desc->pd_prov_id = i;
+
+ /*
+ * The KCF-private provider handle is defined as the internal
+ * provider id.
+ */
+ prov_desc->pd_kcf_prov_handle =
+ (crypto_kcf_provider_handle_t)prov_desc->pd_prov_id;
+
+ return (CRYPTO_SUCCESS);
+}
+
+/*
+ * Remove the provider specified by its id. A REFRELE is done on the
+ * corresponding provider descriptor before this function returns.
+ * Returns CRYPTO_INVALID_PROVIDER_ID if the provider id is not valid.
+ */
+int
+kcf_prov_tab_rem_provider(crypto_provider_id_t prov_id)
+{
+ kcf_provider_desc_t *prov_desc;
+
+ ASSERT(prov_tab != NULL);
+ ASSERT(prov_tab_num >= 0);
+
+ /*
+ * Validate provider id, since it can be specified by a 3rd-party
+ * provider.
+ */
+
+ mutex_enter(&prov_tab_mutex);
+ if (prov_id >= KCF_MAX_PROVIDERS ||
+ ((prov_desc = prov_tab[prov_id]) == NULL)) {
+ mutex_exit(&prov_tab_mutex);
+ return (CRYPTO_INVALID_PROVIDER_ID);
+ }
+ mutex_exit(&prov_tab_mutex);
+
+ /*
+ * The provider id must remain valid until the associated provider
+ * descriptor is freed. For this reason, we simply release our
+ * reference to the descriptor here. When the reference count
+ * reaches zero, kcf_free_provider_desc() will be invoked and
+ * the associated entry in the providers table will be released
+ * at that time.
+ */
+
+ KCF_PROV_REFRELE(prov_desc);
+ KCF_PROV_IREFRELE(prov_desc);
+
+ return (CRYPTO_SUCCESS);
+}
+
+/*
+ * Returns the provider descriptor corresponding to the specified
+ * provider id. A REFHOLD is done on the descriptor before it is
+ * returned to the caller. It is the responsibility of the caller
+ * to do a REFRELE once it is done with the provider descriptor.
+ */
+kcf_provider_desc_t *
+kcf_prov_tab_lookup(crypto_provider_id_t prov_id)
+{
+ kcf_provider_desc_t *prov_desc;
+
+ mutex_enter(&prov_tab_mutex);
+
+ prov_desc = prov_tab[prov_id];
+
+ if (prov_desc == NULL) {
+ mutex_exit(&prov_tab_mutex);
+ return (NULL);
+ }
+
+ KCF_PROV_REFHOLD(prov_desc);
+
+ mutex_exit(&prov_tab_mutex);
+
+ return (prov_desc);
+}
+
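+/*
+ * Allocate storage for the SPI v1 ops groups supplied by the provider.
+ * Only the groups that are non-NULL in 'src' get an allocation in 'dst'.
+ * mech_list_count is bumped when random_ops is present (see the
+ * SUN_RANDOM comment below).
+ */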
+static void
+allocate_ops_v1(crypto_ops_t *src, crypto_ops_t *dst, uint_t *mech_list_count)
+{
+ if (src->co_control_ops != NULL)
+ dst->co_control_ops = kmem_alloc(sizeof (crypto_control_ops_t),
+ KM_SLEEP);
+
+ if (src->co_digest_ops != NULL)
+ dst->co_digest_ops = kmem_alloc(sizeof (crypto_digest_ops_t),
+ KM_SLEEP);
+
+ if (src->co_cipher_ops != NULL)
+ dst->co_cipher_ops = kmem_alloc(sizeof (crypto_cipher_ops_t),
+ KM_SLEEP);
+
+ if (src->co_mac_ops != NULL)
+ dst->co_mac_ops = kmem_alloc(sizeof (crypto_mac_ops_t),
+ KM_SLEEP);
+
+ if (src->co_sign_ops != NULL)
+ dst->co_sign_ops = kmem_alloc(sizeof (crypto_sign_ops_t),
+ KM_SLEEP);
+
+ if (src->co_verify_ops != NULL)
+ dst->co_verify_ops = kmem_alloc(sizeof (crypto_verify_ops_t),
+ KM_SLEEP);
+
+ if (src->co_dual_ops != NULL)
+ dst->co_dual_ops = kmem_alloc(sizeof (crypto_dual_ops_t),
+ KM_SLEEP);
+
+ if (src->co_dual_cipher_mac_ops != NULL)
+ dst->co_dual_cipher_mac_ops = kmem_alloc(
+ sizeof (crypto_dual_cipher_mac_ops_t), KM_SLEEP);
+
+ if (src->co_random_ops != NULL) {
+ dst->co_random_ops = kmem_alloc(
+ sizeof (crypto_random_number_ops_t), KM_SLEEP);
+
+ /*
+ * Allocate storage to store the array of supported mechanisms
+ * specified by provider. We allocate extra mechanism storage
+ * if the provider has random_ops since we keep an internal
+ * mechanism, SUN_RANDOM, in this case.
+ */
+ (*mech_list_count)++;
+ }
+
+ if (src->co_session_ops != NULL)
+ dst->co_session_ops = kmem_alloc(sizeof (crypto_session_ops_t),
+ KM_SLEEP);
+
+ if (src->co_object_ops != NULL)
+ dst->co_object_ops = kmem_alloc(sizeof (crypto_object_ops_t),
+ KM_SLEEP);
+
+ if (src->co_key_ops != NULL)
+ dst->co_key_ops = kmem_alloc(sizeof (crypto_key_ops_t),
+ KM_SLEEP);
+
+ if (src->co_provider_ops != NULL)
+ dst->co_provider_ops = kmem_alloc(
+ sizeof (crypto_provider_management_ops_t), KM_SLEEP);
+
+ if (src->co_ctx_ops != NULL)
+ dst->co_ctx_ops = kmem_alloc(sizeof (crypto_ctx_ops_t),
+ KM_SLEEP);
+}
+
+static void
+allocate_ops_v2(crypto_ops_t *src, crypto_ops_t *dst)
+{
+ if (src->co_mech_ops != NULL)
+ dst->co_mech_ops = kmem_alloc(sizeof (crypto_mech_ops_t),
+ KM_SLEEP);
+}
+
+static void
+allocate_ops_v3(crypto_ops_t *src, crypto_ops_t *dst)
+{
+ if (src->co_nostore_key_ops != NULL)
+ dst->co_nostore_key_ops =
+ kmem_alloc(sizeof (crypto_nostore_key_ops_t), KM_SLEEP);
+}
+
+/*
+ * Allocate a provider descriptor. mech_list_count specifies the
+ * number of mechanisms supported by the provider, and is used
+ * to allocate storage for the mechanism table.
+ * This function may sleep while allocating memory, which is OK
+ * since it is invoked from user context during provider registration.
+ */
+kcf_provider_desc_t *
+kcf_alloc_provider_desc(crypto_provider_info_t *info)
+{
+ int i, j;
+ kcf_provider_desc_t *desc;
+ uint_t mech_list_count = info->pi_mech_list_count;
+ crypto_ops_t *src_ops = info->pi_ops_vector;
+
+ desc = kmem_zalloc(sizeof (kcf_provider_desc_t), KM_SLEEP);
+
+ /*
+ * pd_description serves two purposes:
+ * - Appears as a blank-padded PKCS#11 style string that will be
+ * returned to applications in CK_SLOT_INFO.slotDescription.
+ * This means that we should not have a null character in the
+ * first CRYPTO_PROVIDER_DESCR_MAX_LEN bytes.
+ * - Appears as a null-terminated string that can be used by
+ * other kcf routines.
+ *
+ * So, we allocate enough room for one extra null terminator
+ * which keeps everyone happy.
+ */
+ desc->pd_description = kmem_alloc(CRYPTO_PROVIDER_DESCR_MAX_LEN + 1,
+ KM_SLEEP);
+ (void) memset(desc->pd_description, ' ',
+ CRYPTO_PROVIDER_DESCR_MAX_LEN);
+ desc->pd_description[CRYPTO_PROVIDER_DESCR_MAX_LEN] = '\0';
+
+ /*
+ * Since the framework does not require the ops vector specified
+ * by a provider during registration to be persistent, KCF
+ * allocates its own storage and copies the ops vectors into it.
+ */
+ desc->pd_ops_vector = kmem_zalloc(sizeof (crypto_ops_t), KM_SLEEP);
+
+ if (info->pi_provider_type != CRYPTO_LOGICAL_PROVIDER) {
+ allocate_ops_v1(src_ops, desc->pd_ops_vector, &mech_list_count);
+ if (info->pi_interface_version >= CRYPTO_SPI_VERSION_2)
+ allocate_ops_v2(src_ops, desc->pd_ops_vector);
+ if (info->pi_interface_version == CRYPTO_SPI_VERSION_3)
+ allocate_ops_v3(src_ops, desc->pd_ops_vector);
+ }
+
+ desc->pd_mech_list_count = mech_list_count;
+ desc->pd_mechanisms = kmem_zalloc(sizeof (crypto_mech_info_t) *
+ mech_list_count, KM_SLEEP);
+ for (i = 0; i < KCF_OPS_CLASSSIZE; i++)
+ for (j = 0; j < KCF_MAXMECHTAB; j++)
+ desc->pd_mech_indx[i][j] = KCF_INVALID_INDX;
+
+ desc->pd_prov_id = KCF_PROVID_INVALID;
+ desc->pd_state = KCF_PROV_ALLOCATED;
+
+ mutex_init(&desc->pd_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&desc->pd_resume_cv, NULL, CV_DEFAULT, NULL);
+ cv_init(&desc->pd_remove_cv, NULL, CV_DEFAULT, NULL);
+
+ return (desc);
+}
+
+/*
+ * Called by KCF_PROV_REFRELE when a provider's reference count drops
+ * to zero. We free the descriptor when the last reference is released.
+ * However, for software providers, we do not free it when there is an
+ * unregister thread waiting. We signal that thread in this case and
+ * that thread is responsible for freeing the descriptor.
+ */
+void
+kcf_provider_zero_refcnt(kcf_provider_desc_t *desc)
+{
+ mutex_enter(&desc->pd_lock);
+ switch (desc->pd_prov_type) {
+ case CRYPTO_SW_PROVIDER:
+ if (desc->pd_state == KCF_PROV_REMOVED ||
+ desc->pd_state == KCF_PROV_DISABLED) {
+ desc->pd_state = KCF_PROV_FREED;
+ cv_broadcast(&desc->pd_remove_cv);
+ mutex_exit(&desc->pd_lock);
+ break;
+ }
+ /* FALLTHRU */
+
+ case CRYPTO_HW_PROVIDER:
+ case CRYPTO_LOGICAL_PROVIDER:
+ mutex_exit(&desc->pd_lock);
+ kcf_free_provider_desc(desc);
+ }
+}
+
+/*
+ * Free a provider descriptor.
+ */
+void
+kcf_free_provider_desc(kcf_provider_desc_t *desc)
+{
+ if (desc == NULL)
+ return;
+
+ mutex_enter(&prov_tab_mutex);
+ if (desc->pd_prov_id != KCF_PROVID_INVALID) {
+ /* release the associated providers table entry */
+ ASSERT(prov_tab[desc->pd_prov_id] != NULL);
+ prov_tab[desc->pd_prov_id] = NULL;
+ prov_tab_num--;
+ }
+ mutex_exit(&prov_tab_mutex);
+
+ /* free the kernel memory associated with the provider descriptor */
+
+ if (desc->pd_description != NULL)
+ kmem_free(desc->pd_description,
+ CRYPTO_PROVIDER_DESCR_MAX_LEN + 1);
+
+ if (desc->pd_ops_vector != NULL) {
+
+ if (desc->pd_ops_vector->co_control_ops != NULL)
+ kmem_free(desc->pd_ops_vector->co_control_ops,
+ sizeof (crypto_control_ops_t));
+
+ if (desc->pd_ops_vector->co_digest_ops != NULL)
+ kmem_free(desc->pd_ops_vector->co_digest_ops,
+ sizeof (crypto_digest_ops_t));
+
+ if (desc->pd_ops_vector->co_cipher_ops != NULL)
+ kmem_free(desc->pd_ops_vector->co_cipher_ops,
+ sizeof (crypto_cipher_ops_t));
+
+ if (desc->pd_ops_vector->co_mac_ops != NULL)
+ kmem_free(desc->pd_ops_vector->co_mac_ops,
+ sizeof (crypto_mac_ops_t));
+
+ if (desc->pd_ops_vector->co_sign_ops != NULL)
+ kmem_free(desc->pd_ops_vector->co_sign_ops,
+ sizeof (crypto_sign_ops_t));
+
+ if (desc->pd_ops_vector->co_verify_ops != NULL)
+ kmem_free(desc->pd_ops_vector->co_verify_ops,
+ sizeof (crypto_verify_ops_t));
+
+ if (desc->pd_ops_vector->co_dual_ops != NULL)
+ kmem_free(desc->pd_ops_vector->co_dual_ops,
+ sizeof (crypto_dual_ops_t));
+
+ if (desc->pd_ops_vector->co_dual_cipher_mac_ops != NULL)
+ kmem_free(desc->pd_ops_vector->co_dual_cipher_mac_ops,
+ sizeof (crypto_dual_cipher_mac_ops_t));
+
+ if (desc->pd_ops_vector->co_random_ops != NULL)
+ kmem_free(desc->pd_ops_vector->co_random_ops,
+ sizeof (crypto_random_number_ops_t));
+
+ if (desc->pd_ops_vector->co_session_ops != NULL)
+ kmem_free(desc->pd_ops_vector->co_session_ops,
+ sizeof (crypto_session_ops_t));
+
+ if (desc->pd_ops_vector->co_object_ops != NULL)
+ kmem_free(desc->pd_ops_vector->co_object_ops,
+ sizeof (crypto_object_ops_t));
+
+ if (desc->pd_ops_vector->co_key_ops != NULL)
+ kmem_free(desc->pd_ops_vector->co_key_ops,
+ sizeof (crypto_key_ops_t));
+
+ if (desc->pd_ops_vector->co_provider_ops != NULL)
+ kmem_free(desc->pd_ops_vector->co_provider_ops,
+ sizeof (crypto_provider_management_ops_t));
+
+ if (desc->pd_ops_vector->co_ctx_ops != NULL)
+ kmem_free(desc->pd_ops_vector->co_ctx_ops,
+ sizeof (crypto_ctx_ops_t));
+
+ if (desc->pd_ops_vector->co_mech_ops != NULL)
+ kmem_free(desc->pd_ops_vector->co_mech_ops,
+ sizeof (crypto_mech_ops_t));
+
+ if (desc->pd_ops_vector->co_nostore_key_ops != NULL)
+ kmem_free(desc->pd_ops_vector->co_nostore_key_ops,
+ sizeof (crypto_nostore_key_ops_t));
+
+ kmem_free(desc->pd_ops_vector, sizeof (crypto_ops_t));
+ }
+
+ if (desc->pd_mechanisms != NULL)
+ /* free the memory associated with the mechanism info's */
+ kmem_free(desc->pd_mechanisms, sizeof (crypto_mech_info_t) *
+ desc->pd_mech_list_count);
+
+ if (desc->pd_sched_info.ks_taskq != NULL)
+ taskq_destroy(desc->pd_sched_info.ks_taskq);
+
+ mutex_destroy(&desc->pd_lock);
+ cv_destroy(&desc->pd_resume_cv);
+ cv_destroy(&desc->pd_remove_cv);
+
+ kmem_free(desc, sizeof (kcf_provider_desc_t));
+}
+
+/*
+ * Returns an array of hardware and logical provider descriptors,
+ * a.k.a. the PKCS#11 slot list. A REFHOLD is done on each descriptor
+ * before the array is returned. The entire table can be freed by
+ * calling kcf_free_provider_tab().
+ */
+int
+kcf_get_slot_list(uint_t *count, kcf_provider_desc_t ***array,
+ boolean_t unverified)
+{
+ kcf_provider_desc_t *prov_desc;
+ kcf_provider_desc_t **p = NULL;
+ char *last;
+ uint_t cnt = 0;
+ uint_t i, j;
+ int rval = CRYPTO_SUCCESS;
+ size_t n, final_size;
+
+ /* count the providers */
+ mutex_enter(&prov_tab_mutex);
+ for (i = 0; i < KCF_MAX_PROVIDERS; i++) {
+ if ((prov_desc = prov_tab[i]) != NULL &&
+ ((prov_desc->pd_prov_type == CRYPTO_HW_PROVIDER &&
+ (prov_desc->pd_flags & CRYPTO_HIDE_PROVIDER) == 0) ||
+ prov_desc->pd_prov_type == CRYPTO_LOGICAL_PROVIDER)) {
+ if (KCF_IS_PROV_USABLE(prov_desc) ||
+ (unverified && KCF_IS_PROV_UNVERIFIED(prov_desc))) {
+ cnt++;
+ }
+ }
+ }
+ mutex_exit(&prov_tab_mutex);
+
+ if (cnt == 0)
+ goto out;
+
+ n = cnt * sizeof (kcf_provider_desc_t *);
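+/*
+ * The table is re-scanned below without prov_tab_mutex held across the
+ * allocation, so more providers may have registered since the count was
+ * taken. If the array fills up, double it and retry; any excess is
+ * trimmed to final_size afterwards.
+ */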
+again:
+ p = kmem_zalloc(n, KM_SLEEP);
+
+ /* pointer to last entry in the array */
+ last = (char *)&p[cnt-1];
+
+ mutex_enter(&prov_tab_mutex);
+ /* fill the slot list */
+ for (i = 0, j = 0; i < KCF_MAX_PROVIDERS; i++) {
+ if ((prov_desc = prov_tab[i]) != NULL &&
+ ((prov_desc->pd_prov_type == CRYPTO_HW_PROVIDER &&
+ (prov_desc->pd_flags & CRYPTO_HIDE_PROVIDER) == 0) ||
+ prov_desc->pd_prov_type == CRYPTO_LOGICAL_PROVIDER)) {
+ if (KCF_IS_PROV_USABLE(prov_desc) ||
+ (unverified && KCF_IS_PROV_UNVERIFIED(prov_desc))) {
+ if ((char *)&p[j] > last) {
+ mutex_exit(&prov_tab_mutex);
+ kcf_free_provider_tab(cnt, p);
+ n = n << 1;
+ cnt = cnt << 1;
+ goto again;
+ }
+ p[j++] = prov_desc;
+ KCF_PROV_REFHOLD(prov_desc);
+ }
+ }
+ }
+ mutex_exit(&prov_tab_mutex);
+
+ final_size = j * sizeof (kcf_provider_desc_t *);
+ cnt = j;
+ ASSERT(final_size <= n);
+
+ /* check if buffer we allocated is too large */
+ if (final_size < n) {
+ char *final_buffer = NULL;
+
+ if (final_size > 0) {
+ final_buffer = kmem_alloc(final_size, KM_SLEEP);
+ bcopy(p, final_buffer, final_size);
+ }
+ kmem_free(p, n);
+ p = (kcf_provider_desc_t **)final_buffer;
+ }
+out:
+ *count = cnt;
+ *array = p;
+ return (rval);
+}
+
+/*
+ * Free an array of provider descriptors. A REFRELE
+ * is done on each descriptor before the table is freed.
+ */
+void
+kcf_free_provider_tab(uint_t count, kcf_provider_desc_t **array)
+{
+ kcf_provider_desc_t *prov_desc;
+ int i;
+
+ for (i = 0; i < count; i++) {
+ if ((prov_desc = array[i]) != NULL) {
+ KCF_PROV_REFRELE(prov_desc);
+ }
+ }
+ kmem_free(array, count * sizeof (kcf_provider_desc_t *));
+}
+
+/*
+ * Returns in the location pointed to by pd a pointer to the descriptor
+ * for the software provider for the specified mechanism.
+ * The provider descriptor is returned held and it is the caller's
+ * responsibility to release it when done. The mechanism entry
+ * is returned if the optional argument mep is non-NULL.
+ *
+ * Returns one of the CRYPTO_* error codes on failure, and
+ * CRYPTO_SUCCESS on success.
+ */
+int
+kcf_get_sw_prov(crypto_mech_type_t mech_type, kcf_provider_desc_t **pd,
+ kcf_mech_entry_t **mep, boolean_t log_warn)
+{
+ kcf_mech_entry_t *me;
+
+ /* get the mechanism entry for this mechanism */
+ if (kcf_get_mech_entry(mech_type, &me) != KCF_SUCCESS)
+ return (CRYPTO_MECHANISM_INVALID);
+
+ /*
+ * Get the software provider for this mechanism.
+ * Lock the mech_entry until we grab the 'pd'.
+ */
+ mutex_enter(&me->me_mutex);
+
+ if (me->me_sw_prov == NULL ||
+ (*pd = me->me_sw_prov->pm_prov_desc) == NULL) {
+ /* no SW provider for this mechanism */
+ if (log_warn)
+ cmn_err(CE_WARN, "no SW provider for \"%s\"\n",
+ me->me_name);
+ mutex_exit(&me->me_mutex);
+ return (CRYPTO_MECH_NOT_SUPPORTED);
+ }
+
+ KCF_PROV_REFHOLD(*pd);
+ mutex_exit(&me->me_mutex);
+
+ if (mep != NULL)
+ *mep = me;
+
+ return (CRYPTO_SUCCESS);
+}
diff --git a/sys/contrib/openzfs/module/icp/core/kcf_sched.c b/sys/contrib/openzfs/module/icp/core/kcf_sched.c
new file mode 100644
index 000000000000..81fd15f8ea26
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/core/kcf_sched.c
@@ -0,0 +1,1780 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * This file contains the core framework routines for the
+ * kernel cryptographic framework. These routines sit at the
+ * middle layer, between the kernel API/ioctls and the SPI.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/crypto/common.h>
+#include <sys/crypto/impl.h>
+#include <sys/crypto/sched_impl.h>
+#include <sys/crypto/api.h>
+
+kcf_global_swq_t *gswq; /* Global software queue */
+
+/* Thread pool related variables */
+static kcf_pool_t *kcfpool; /* Thread pool of kcfd LWPs */
+int kcf_maxthreads = 2;
+int kcf_minthreads = 1;
+int kcf_thr_multiple = 2; /* Boot-time tunable for experimentation */
+static ulong_t kcf_idlethr_timeout;
+#define KCF_DEFAULT_THRTIMEOUT 60000000 /* 60 seconds */
+
+/* kmem caches used by the scheduler */
+static kmem_cache_t *kcf_sreq_cache;
+static kmem_cache_t *kcf_areq_cache;
+static kmem_cache_t *kcf_context_cache;
+
+/* Global request ID table */
+static kcf_reqid_table_t *kcf_reqid_table[REQID_TABLES];
+
+/* KCF stats. Not protected. */
+static kcf_stats_t kcf_ksdata = {
+ { "total threads in pool", KSTAT_DATA_UINT32},
+ { "idle threads in pool", KSTAT_DATA_UINT32},
+ { "min threads in pool", KSTAT_DATA_UINT32},
+ { "max threads in pool", KSTAT_DATA_UINT32},
+ { "requests in gswq", KSTAT_DATA_UINT32},
+ { "max requests in gswq", KSTAT_DATA_UINT32},
+ { "threads for HW taskq", KSTAT_DATA_UINT32},
+ { "minalloc for HW taskq", KSTAT_DATA_UINT32},
+ { "maxalloc for HW taskq", KSTAT_DATA_UINT32}
+};
+
+static kstat_t *kcf_misc_kstat = NULL;
+ulong_t kcf_swprov_hndl = 0;
+
+static kcf_areq_node_t *kcf_areqnode_alloc(kcf_provider_desc_t *,
+ kcf_context_t *, crypto_call_req_t *, kcf_req_params_t *, boolean_t);
+static int kcf_disp_sw_request(kcf_areq_node_t *);
+static void process_req_hwp(void *);
+static int kcf_enqueue(kcf_areq_node_t *);
+static void kcfpool_alloc(void);
+static void kcf_reqid_delete(kcf_areq_node_t *areq);
+static crypto_req_id_t kcf_reqid_insert(kcf_areq_node_t *areq);
+static int kcf_misc_kstat_update(kstat_t *ksp, int rw);
+
+/*
+ * Create a new context. The allocation may sleep only in the
+ * synchronous case (crq == NULL); otherwise KM_NOSLEEP is used
+ * and NULL may be returned.
+ */
+crypto_ctx_t *
+kcf_new_ctx(crypto_call_req_t *crq, kcf_provider_desc_t *pd,
+ crypto_session_id_t sid)
+{
+ crypto_ctx_t *ctx;
+ kcf_context_t *kcf_ctx;
+
+ kcf_ctx = kmem_cache_alloc(kcf_context_cache,
+ (crq == NULL) ? KM_SLEEP : KM_NOSLEEP);
+ if (kcf_ctx == NULL)
+ return (NULL);
+
+ /* initialize the context for the consumer */
+ kcf_ctx->kc_refcnt = 1;
+ kcf_ctx->kc_req_chain_first = NULL;
+ kcf_ctx->kc_req_chain_last = NULL;
+ kcf_ctx->kc_secondctx = NULL;
+ KCF_PROV_REFHOLD(pd);
+ kcf_ctx->kc_prov_desc = pd;
+ kcf_ctx->kc_sw_prov_desc = NULL;
+ kcf_ctx->kc_mech = NULL;
+
+ ctx = &kcf_ctx->kc_glbl_ctx;
+ ctx->cc_provider = pd->pd_prov_handle;
+ ctx->cc_session = sid;
+ ctx->cc_provider_private = NULL;
+ ctx->cc_framework_private = (void *)kcf_ctx;
+ ctx->cc_flags = 0;
+ ctx->cc_opstate = NULL;
+
+ return (ctx);
+}
+
+/*
+ * Allocate a new async request node.
+ *
+ * ictx - Framework private context pointer
+ * crq - Has the callback function and argument. Must be non-NULL.
+ * req - The parameters to pass to the SPI
+ */
+static kcf_areq_node_t *
+kcf_areqnode_alloc(kcf_provider_desc_t *pd, kcf_context_t *ictx,
+ crypto_call_req_t *crq, kcf_req_params_t *req, boolean_t isdual)
+{
+ kcf_areq_node_t *arptr, *areq;
+
+ ASSERT(crq != NULL);
+ arptr = kmem_cache_alloc(kcf_areq_cache, KM_NOSLEEP);
+ if (arptr == NULL)
+ return (NULL);
+
+ arptr->an_state = REQ_ALLOCATED;
+ arptr->an_reqarg = *crq;
+ arptr->an_params = *req;
+ arptr->an_context = ictx;
+ arptr->an_isdual = isdual;
+
+ arptr->an_next = arptr->an_prev = NULL;
+ KCF_PROV_REFHOLD(pd);
+ arptr->an_provider = pd;
+ arptr->an_tried_plist = NULL;
+ arptr->an_refcnt = 1;
+ arptr->an_idnext = arptr->an_idprev = NULL;
+
+ /*
+ * Requests for context-less operations do not use the
+ * an_is_my_turn and an_ctxchain_next fields.
+ */
+ if (ictx == NULL)
+ return (arptr);
+
+ KCF_CONTEXT_REFHOLD(ictx);
+ /*
+ * Chain this request to the context.
+ */
+ mutex_enter(&ictx->kc_in_use_lock);
+ arptr->an_ctxchain_next = NULL;
+ if ((areq = ictx->kc_req_chain_last) == NULL) {
+ arptr->an_is_my_turn = B_TRUE;
+ ictx->kc_req_chain_last =
+ ictx->kc_req_chain_first = arptr;
+ } else {
+ ASSERT(ictx->kc_req_chain_first != NULL);
+ arptr->an_is_my_turn = B_FALSE;
+ /* Insert the new request to the end of the chain. */
+ areq->an_ctxchain_next = arptr;
+ ictx->kc_req_chain_last = arptr;
+ }
+ mutex_exit(&ictx->kc_in_use_lock);
+
+ return (arptr);
+}
+
+/*
+ * Queue the request node and do one of the following:
+ * - If there is an idle thread signal it to run.
+ * - If there is no idle thread and max running threads is not
+ * reached, signal the creator thread for more threads.
+ *
+ * If the two conditions above are not met, we don't need to do
+ * anything. The request will be picked up by one of the
+ * worker threads when it becomes available.
+ */
+static int
+kcf_disp_sw_request(kcf_areq_node_t *areq)
+{
+ int err;
+ int cnt = 0;
+
+ if ((err = kcf_enqueue(areq)) != 0)
+ return (err);
+
+ if (kcfpool->kp_idlethreads > 0) {
+ /* Signal an idle thread to run */
+ mutex_enter(&gswq->gs_lock);
+ cv_signal(&gswq->gs_cv);
+ mutex_exit(&gswq->gs_lock);
+
+ return (CRYPTO_QUEUED);
+ }
+
+ /*
+ * We keep the number of running threads at kcf_minthreads
+ * to reduce gs_lock contention.
+ */
+ cnt = kcf_minthreads -
+ (kcfpool->kp_threads - kcfpool->kp_blockedthreads);
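+ /*
+ * i.e. the number of extra threads needed to bring the count of
+ * runnable (non-blocked) threads up to kcf_minthreads.
+ */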
+ if (cnt > 0) {
+ /*
+ * The following ensures the number of threads in pool
+ * does not exceed kcf_maxthreads.
+ */
+ cnt = MIN(cnt, kcf_maxthreads - (int)kcfpool->kp_threads);
+ if (cnt > 0) {
+ /* Signal the creator thread for more threads */
+ mutex_enter(&kcfpool->kp_user_lock);
+ if (!kcfpool->kp_signal_create_thread) {
+ kcfpool->kp_signal_create_thread = B_TRUE;
+ kcfpool->kp_nthrs = cnt;
+ cv_signal(&kcfpool->kp_user_cv);
+ }
+ mutex_exit(&kcfpool->kp_user_lock);
+ }
+ }
+
+ return (CRYPTO_QUEUED);
+}
+
+/*
+ * This routine is called by the taskq associated with
+ * each hardware provider. We notify the kernel consumer
+ * via the callback routine in case of CRYPTO_SUCCESS or
+ * a failure.
+ *
+ * A request can be of type kcf_areq_node_t or of type
+ * kcf_sreq_node_t.
+ */
+static void
+process_req_hwp(void *ireq)
+{
+ int error = 0;
+ crypto_ctx_t *ctx;
+ kcf_call_type_t ctype;
+ kcf_provider_desc_t *pd;
+ kcf_areq_node_t *areq = (kcf_areq_node_t *)ireq;
+ kcf_sreq_node_t *sreq = (kcf_sreq_node_t *)ireq;
+
+ pd = ((ctype = GET_REQ_TYPE(ireq)) == CRYPTO_SYNCH) ?
+ sreq->sn_provider : areq->an_provider;
+
+ /*
+ * Wait if flow control is in effect for the provider. A
+ * CRYPTO_PROVIDER_READY or CRYPTO_PROVIDER_FAILED
+ * notification will signal us. We also get signaled if
+ * the provider is unregistering.
+ */
+ if (pd->pd_state == KCF_PROV_BUSY) {
+ mutex_enter(&pd->pd_lock);
+ while (pd->pd_state == KCF_PROV_BUSY)
+ cv_wait(&pd->pd_resume_cv, &pd->pd_lock);
+ mutex_exit(&pd->pd_lock);
+ }
+
+ /*
+ * Bump the internal reference count while the request is being
+ * processed. This is how we know when it's safe to unregister
+ * a provider. This step must precede the pd_state check below.
+ */
+ KCF_PROV_IREFHOLD(pd);
+
+ /*
+ * Fail the request if the provider has failed. We return a
+ * recoverable error and the notified clients attempt any
+ * recovery. For async clients this is done in kcf_aop_done()
+ * and for sync clients it is done in the k-api routines.
+ */
+ if (pd->pd_state >= KCF_PROV_FAILED) {
+ error = CRYPTO_DEVICE_ERROR;
+ goto bail;
+ }
+
+ if (ctype == CRYPTO_SYNCH) {
+ mutex_enter(&sreq->sn_lock);
+ sreq->sn_state = REQ_INPROGRESS;
+ mutex_exit(&sreq->sn_lock);
+
+ ctx = sreq->sn_context ? &sreq->sn_context->kc_glbl_ctx : NULL;
+ error = common_submit_request(sreq->sn_provider, ctx,
+ sreq->sn_params, sreq);
+ } else {
+ kcf_context_t *ictx;
+ ASSERT(ctype == CRYPTO_ASYNCH);
+
+ /*
+ * We are in the per-hardware provider thread context and
+ * hence can sleep. Note that the caller would have done
+ * a taskq_dispatch(..., TQ_NOSLEEP) and would have returned.
+ */
+ ctx = (ictx = areq->an_context) ? &ictx->kc_glbl_ctx : NULL;
+
+ mutex_enter(&areq->an_lock);
+ /*
+ * We need to maintain ordering for multi-part requests.
+ * an_is_my_turn is set to B_TRUE initially for a request
+ * when it is enqueued and there are no other requests
+ * for that context. It is set later from kcf_aop_done() when
+ * the request before us in the chain of requests for the
+ * context completes. We get signaled at that point.
+ */
+ if (ictx != NULL) {
+ ASSERT(ictx->kc_prov_desc == areq->an_provider);
+
+ while (areq->an_is_my_turn == B_FALSE) {
+ cv_wait(&areq->an_turn_cv, &areq->an_lock);
+ }
+ }
+ areq->an_state = REQ_INPROGRESS;
+ mutex_exit(&areq->an_lock);
+
+ error = common_submit_request(areq->an_provider, ctx,
+ &areq->an_params, areq);
+ }
+
+bail:
+ if (error == CRYPTO_QUEUED) {
+ /*
+ * The request is queued by the provider and we should
+ * get a crypto_op_notification() from the provider later.
+ * We notify the consumer at that time.
+ */
+ return;
+ } else { /* CRYPTO_SUCCESS or other failure */
+ KCF_PROV_IREFRELE(pd);
+ if (ctype == CRYPTO_SYNCH)
+ kcf_sop_done(sreq, error);
+ else
+ kcf_aop_done(areq, error);
+ }
+}
+
+/*
+ * This routine checks if a request can be retried on another
+ * provider. If true, mech1 is initialized to point to the mechanism
+ * structure. mech2 is also initialized in case of a dual operation. fg
+ * is initialized to the correct crypto_func_group_t bit flag. They are
+ * initialized by this routine, so that the caller can pass them to a
+ * kcf_get_mech_provider() or kcf_get_dual_provider() with no further change.
+ *
+ * We check that the request is for an init or atomic routine and that
+ * it is for one of the operation groups used from the k-api.
+ */
+static boolean_t
+can_resubmit(kcf_areq_node_t *areq, crypto_mechanism_t **mech1,
+ crypto_mechanism_t **mech2, crypto_func_group_t *fg)
+{
+ kcf_req_params_t *params;
+ kcf_op_type_t optype;
+
+ params = &areq->an_params;
+ optype = params->rp_optype;
+
+ if (!(IS_INIT_OP(optype) || IS_ATOMIC_OP(optype)))
+ return (B_FALSE);
+
+ switch (params->rp_opgrp) {
+ case KCF_OG_DIGEST: {
+ kcf_digest_ops_params_t *dops = &params->rp_u.digest_params;
+
+ dops->do_mech.cm_type = dops->do_framework_mechtype;
+ *mech1 = &dops->do_mech;
+ *fg = (optype == KCF_OP_INIT) ? CRYPTO_FG_DIGEST :
+ CRYPTO_FG_DIGEST_ATOMIC;
+ break;
+ }
+
+ case KCF_OG_MAC: {
+ kcf_mac_ops_params_t *mops = &params->rp_u.mac_params;
+
+ mops->mo_mech.cm_type = mops->mo_framework_mechtype;
+ *mech1 = &mops->mo_mech;
+ *fg = (optype == KCF_OP_INIT) ? CRYPTO_FG_MAC :
+ CRYPTO_FG_MAC_ATOMIC;
+ break;
+ }
+
+ case KCF_OG_SIGN: {
+ kcf_sign_ops_params_t *sops = &params->rp_u.sign_params;
+
+ sops->so_mech.cm_type = sops->so_framework_mechtype;
+ *mech1 = &sops->so_mech;
+ switch (optype) {
+ case KCF_OP_INIT:
+ *fg = CRYPTO_FG_SIGN;
+ break;
+ case KCF_OP_ATOMIC:
+ *fg = CRYPTO_FG_SIGN_ATOMIC;
+ break;
+ default:
+ ASSERT(optype == KCF_OP_SIGN_RECOVER_ATOMIC);
+ *fg = CRYPTO_FG_SIGN_RECOVER_ATOMIC;
+ }
+ break;
+ }
+
+ case KCF_OG_VERIFY: {
+ kcf_verify_ops_params_t *vops = &params->rp_u.verify_params;
+
+ vops->vo_mech.cm_type = vops->vo_framework_mechtype;
+ *mech1 = &vops->vo_mech;
+ switch (optype) {
+ case KCF_OP_INIT:
+ *fg = CRYPTO_FG_VERIFY;
+ break;
+ case KCF_OP_ATOMIC:
+ *fg = CRYPTO_FG_VERIFY_ATOMIC;
+ break;
+ default:
+ ASSERT(optype == KCF_OP_VERIFY_RECOVER_ATOMIC);
+ *fg = CRYPTO_FG_VERIFY_RECOVER_ATOMIC;
+ }
+ break;
+ }
+
+ case KCF_OG_ENCRYPT: {
+ kcf_encrypt_ops_params_t *eops = &params->rp_u.encrypt_params;
+
+ eops->eo_mech.cm_type = eops->eo_framework_mechtype;
+ *mech1 = &eops->eo_mech;
+ *fg = (optype == KCF_OP_INIT) ? CRYPTO_FG_ENCRYPT :
+ CRYPTO_FG_ENCRYPT_ATOMIC;
+ break;
+ }
+
+ case KCF_OG_DECRYPT: {
+ kcf_decrypt_ops_params_t *dcrops = &params->rp_u.decrypt_params;
+
+ dcrops->dop_mech.cm_type = dcrops->dop_framework_mechtype;
+ *mech1 = &dcrops->dop_mech;
+ *fg = (optype == KCF_OP_INIT) ? CRYPTO_FG_DECRYPT :
+ CRYPTO_FG_DECRYPT_ATOMIC;
+ break;
+ }
+
+ case KCF_OG_ENCRYPT_MAC: {
+ kcf_encrypt_mac_ops_params_t *eops =
+ &params->rp_u.encrypt_mac_params;
+
+ eops->em_encr_mech.cm_type = eops->em_framework_encr_mechtype;
+ *mech1 = &eops->em_encr_mech;
+ eops->em_mac_mech.cm_type = eops->em_framework_mac_mechtype;
+ *mech2 = &eops->em_mac_mech;
+ *fg = (optype == KCF_OP_INIT) ? CRYPTO_FG_ENCRYPT_MAC :
+ CRYPTO_FG_ENCRYPT_MAC_ATOMIC;
+ break;
+ }
+
+ case KCF_OG_MAC_DECRYPT: {
+ kcf_mac_decrypt_ops_params_t *dops =
+ &params->rp_u.mac_decrypt_params;
+
+ dops->md_mac_mech.cm_type = dops->md_framework_mac_mechtype;
+ *mech1 = &dops->md_mac_mech;
+ dops->md_decr_mech.cm_type = dops->md_framework_decr_mechtype;
+ *mech2 = &dops->md_decr_mech;
+ *fg = (optype == KCF_OP_INIT) ? CRYPTO_FG_MAC_DECRYPT :
+ CRYPTO_FG_MAC_DECRYPT_ATOMIC;
+ break;
+ }
+
+ default:
+ return (B_FALSE);
+ }
+
+ return (B_TRUE);
+}
+
+/*
+ * This routine is called when a request to a provider has failed
+ * with a recoverable error. This routine tries to find another provider
+ * and dispatches the request to the new provider, if one is available.
+ * We reuse the request structure.
+ *
+ * A return value of NULL from kcf_get_mech_provider() indicates
+ * we have tried the last provider.
+ */
+static int
+kcf_resubmit_request(kcf_areq_node_t *areq)
+{
+ int error = CRYPTO_FAILED;
+ kcf_context_t *ictx;
+ kcf_provider_desc_t *old_pd;
+ kcf_provider_desc_t *new_pd;
+ crypto_mechanism_t *mech1 = NULL, *mech2 = NULL;
+ crypto_mech_type_t prov_mt1, prov_mt2;
+ crypto_func_group_t fg = 0;
+
+ if (!can_resubmit(areq, &mech1, &mech2, &fg))
+ return (error);
+
+ old_pd = areq->an_provider;
+ /*
+ * Add old_pd to the list of providers already tried. We release
+ * the hold on old_pd (from the earlier kcf_get_mech_provider()) in
+ * kcf_free_triedlist().
+ */
+ if (kcf_insert_triedlist(&areq->an_tried_plist, old_pd,
+ KM_NOSLEEP) == NULL)
+ return (error);
+
+ if (mech1 && !mech2) {
+ new_pd = kcf_get_mech_provider(mech1->cm_type, NULL, &error,
+ areq->an_tried_plist, fg,
+ (areq->an_reqarg.cr_flag & CRYPTO_RESTRICTED), 0);
+ } else {
+ ASSERT(mech1 != NULL && mech2 != NULL);
+
+ new_pd = kcf_get_dual_provider(mech1, mech2, NULL, &prov_mt1,
+ &prov_mt2, &error, areq->an_tried_plist, fg, fg,
+ (areq->an_reqarg.cr_flag & CRYPTO_RESTRICTED), 0);
+ }
+
+ if (new_pd == NULL)
+ return (error);
+
+ /*
+ * We reuse the old context by resetting provider specific
+ * fields in it.
+ */
+ if ((ictx = areq->an_context) != NULL) {
+ crypto_ctx_t *ctx;
+
+ ASSERT(old_pd == ictx->kc_prov_desc);
+ KCF_PROV_REFRELE(ictx->kc_prov_desc);
+ KCF_PROV_REFHOLD(new_pd);
+ ictx->kc_prov_desc = new_pd;
+
+ ctx = &ictx->kc_glbl_ctx;
+ ctx->cc_provider = new_pd->pd_prov_handle;
+ ctx->cc_session = new_pd->pd_sid;
+ ctx->cc_provider_private = NULL;
+ }
+
+ /* We reuse areq by resetting the provider and context fields. */
+ KCF_PROV_REFRELE(old_pd);
+ KCF_PROV_REFHOLD(new_pd);
+ areq->an_provider = new_pd;
+ mutex_enter(&areq->an_lock);
+ areq->an_state = REQ_WAITING;
+ mutex_exit(&areq->an_lock);
+
+ switch (new_pd->pd_prov_type) {
+ case CRYPTO_SW_PROVIDER:
+ error = kcf_disp_sw_request(areq);
+ break;
+
+ case CRYPTO_HW_PROVIDER: {
+ taskq_t *taskq = new_pd->pd_sched_info.ks_taskq;
+
+ if (taskq_dispatch(taskq, process_req_hwp, areq, TQ_NOSLEEP) ==
+ TASKQID_INVALID) {
+ error = CRYPTO_HOST_MEMORY;
+ } else {
+ error = CRYPTO_QUEUED;
+ }
+
+ break;
+ }
+
+ default:
+ break;
+ }
+
+ return (error);
+}
+
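+/*
+ * Returns non-zero when the taskq has no pending entries. The in-kernel
+ * check compares dispatch ids; the userspace variant checks the task
+ * list and active count instead.
+ */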
+static inline int
+EMPTY_TASKQ(taskq_t *tq)
+{
+#ifdef _KERNEL
+ return (tq->tq_lowest_id == tq->tq_next_id);
+#else
+ return (tq->tq_task.tqent_next == &tq->tq_task || tq->tq_active == 0);
+#endif
+}
+
+/*
+ * Routine called by both ioctl and k-api. The consumer should
+ * bundle the parameters into a kcf_req_params_t structure. A bunch
+ * of macros are available in ops_impl.h for this bundling. They are:
+ *
+ * KCF_WRAP_DIGEST_OPS_PARAMS()
+ * KCF_WRAP_MAC_OPS_PARAMS()
+ * KCF_WRAP_ENCRYPT_OPS_PARAMS()
+ * KCF_WRAP_DECRYPT_OPS_PARAMS() ... etc.
+ *
+ * It is the caller's responsibility to free the ctx argument when
+ * appropriate. See the KCF_CONTEXT_COND_RELEASE macro for details.
+ */
+int
+kcf_submit_request(kcf_provider_desc_t *pd, crypto_ctx_t *ctx,
+ crypto_call_req_t *crq, kcf_req_params_t *params, boolean_t cont)
+{
+ int error = CRYPTO_SUCCESS;
+ kcf_areq_node_t *areq;
+ kcf_sreq_node_t *sreq;
+ kcf_context_t *kcf_ctx;
+ taskq_t *taskq = pd->pd_sched_info.ks_taskq;
+
+ kcf_ctx = ctx ? (kcf_context_t *)ctx->cc_framework_private : NULL;
+
+ /* Synchronous cases */
+ if (crq == NULL) {
+ switch (pd->pd_prov_type) {
+ case CRYPTO_SW_PROVIDER:
+ error = common_submit_request(pd, ctx, params,
+ KCF_RHNDL(KM_SLEEP));
+ break;
+
+ case CRYPTO_HW_PROVIDER:
+ /*
+ * Special case for CRYPTO_SYNCHRONOUS providers that
+ * never return a CRYPTO_QUEUED error. We skip any
+ * request allocation and call the SPI directly.
+ */
+ if ((pd->pd_flags & CRYPTO_SYNCHRONOUS) &&
+ EMPTY_TASKQ(taskq)) {
+ KCF_PROV_IREFHOLD(pd);
+ if (pd->pd_state == KCF_PROV_READY) {
+ error = common_submit_request(pd, ctx,
+ params, KCF_RHNDL(KM_SLEEP));
+ KCF_PROV_IREFRELE(pd);
+ ASSERT(error != CRYPTO_QUEUED);
+ break;
+ }
+ KCF_PROV_IREFRELE(pd);
+ }
+
+ sreq = kmem_cache_alloc(kcf_sreq_cache, KM_SLEEP);
+ sreq->sn_state = REQ_ALLOCATED;
+ sreq->sn_rv = CRYPTO_FAILED;
+ sreq->sn_params = params;
+
+ /*
+ * Note that we do not need to hold the context
+ * for synchronous case as the context will never
+ * become invalid underneath us. We do not need to hold
+ * the provider here either as the caller has a hold.
+ */
+ sreq->sn_context = kcf_ctx;
+ ASSERT(KCF_PROV_REFHELD(pd));
+ sreq->sn_provider = pd;
+
+ ASSERT(taskq != NULL);
+ /*
+ * Call the SPI directly if the taskq is empty and the
+ * provider is not busy, else dispatch to the taskq.
+ * Calling directly is fine as this is the synchronous
+ * case. This is unlike the asynchronous case where we
+ * must always dispatch to the taskq.
+ */
+ if (EMPTY_TASKQ(taskq) &&
+ pd->pd_state == KCF_PROV_READY) {
+ process_req_hwp(sreq);
+ } else {
+ /*
+ * We cannot tell from taskq_dispatch() return
+ * value if we exceeded maxalloc. Hence the
+ * check here. Since we are allowed to wait in
+ * the synchronous case, we wait for the taskq
+ * to become empty.
+ */
+ if (taskq->tq_nalloc >= crypto_taskq_maxalloc) {
+ taskq_wait(taskq);
+ }
+
+ (void) taskq_dispatch(taskq, process_req_hwp,
+ sreq, TQ_SLEEP);
+ }
+
+ /*
+ * Wait for the notification to arrive,
+ * if the operation is not done yet.
+ * Bug# 4722589 will make the wait a cv_wait_sig().
+ */
+ mutex_enter(&sreq->sn_lock);
+ while (sreq->sn_state < REQ_DONE)
+ cv_wait(&sreq->sn_cv, &sreq->sn_lock);
+ mutex_exit(&sreq->sn_lock);
+
+ error = sreq->sn_rv;
+ kmem_cache_free(kcf_sreq_cache, sreq);
+
+ break;
+
+ default:
+ error = CRYPTO_FAILED;
+ break;
+ }
+
+ } else { /* Asynchronous cases */
+ switch (pd->pd_prov_type) {
+ case CRYPTO_SW_PROVIDER:
+ if (!(crq->cr_flag & CRYPTO_ALWAYS_QUEUE)) {
+ /*
+ * This case has less overhead since there is
+ * no switching of context.
+ */
+ error = common_submit_request(pd, ctx, params,
+ KCF_RHNDL(KM_NOSLEEP));
+ } else {
+ /*
+ * CRYPTO_ALWAYS_QUEUE is set. We need to
+ * queue the request and return.
+ */
+ areq = kcf_areqnode_alloc(pd, kcf_ctx, crq,
+ params, cont);
+ if (areq == NULL)
+ error = CRYPTO_HOST_MEMORY;
+ else {
+ if (!(crq->cr_flag
+ & CRYPTO_SKIP_REQID)) {
+ /*
+ * Set the request handle. This handle
+ * is used for any crypto_cancel_req(9f)
+ * calls from the consumer. We have to
+ * do this before dispatching the
+ * request.
+ */
+ crq->cr_reqid = kcf_reqid_insert(areq);
+ }
+
+ error = kcf_disp_sw_request(areq);
+ /*
+ * There is an error processing this
+ * request. Remove the handle and
+ * release the request structure.
+ */
+ if (error != CRYPTO_QUEUED) {
+ if (!(crq->cr_flag
+ & CRYPTO_SKIP_REQID))
+ kcf_reqid_delete(areq);
+ KCF_AREQ_REFRELE(areq);
+ }
+ }
+ }
+ break;
+
+ case CRYPTO_HW_PROVIDER:
+ /*
+ * We need to queue the request and return.
+ */
+ areq = kcf_areqnode_alloc(pd, kcf_ctx, crq, params,
+ cont);
+ if (areq == NULL) {
+ error = CRYPTO_HOST_MEMORY;
+ goto done;
+ }
+
+ ASSERT(taskq != NULL);
+ /*
+ * We cannot tell from taskq_dispatch() return
+ * value if we exceeded maxalloc. Hence the check
+ * here.
+ */
+ if (taskq->tq_nalloc >= crypto_taskq_maxalloc) {
+ error = CRYPTO_BUSY;
+ KCF_AREQ_REFRELE(areq);
+ goto done;
+ }
+
+ if (!(crq->cr_flag & CRYPTO_SKIP_REQID)) {
+ /*
+ * Set the request handle. This handle is used
+ * for any crypto_cancel_req(9f) calls from the
+ * consumer. We have to do this before dispatching
+ * the request.
+ */
+ crq->cr_reqid = kcf_reqid_insert(areq);
+ }
+
+ if (taskq_dispatch(taskq,
+ process_req_hwp, areq, TQ_NOSLEEP) ==
+ TASKQID_INVALID) {
+ error = CRYPTO_HOST_MEMORY;
+ if (!(crq->cr_flag & CRYPTO_SKIP_REQID))
+ kcf_reqid_delete(areq);
+ KCF_AREQ_REFRELE(areq);
+ } else {
+ error = CRYPTO_QUEUED;
+ }
+ break;
+
+ default:
+ error = CRYPTO_FAILED;
+ break;
+ }
+ }
+
+done:
+ return (error);
+}
+
+/*
+ * We're done with this framework context, so free it. Note that freeing
+ * framework context (kcf_context) frees the global context (crypto_ctx).
+ *
+ * The provider is responsible for freeing provider private context after a
+ * final or single operation and resetting the cc_provider_private field
+ * to NULL. It should do this before it notifies the framework of the
+ * completion. We still need to call KCF_PROV_FREE_CONTEXT to handle cases
+ * like crypto_cancel_ctx(9f).
+ */
+void
+kcf_free_context(kcf_context_t *kcf_ctx)
+{
+ kcf_provider_desc_t *pd = kcf_ctx->kc_prov_desc;
+ crypto_ctx_t *gctx = &kcf_ctx->kc_glbl_ctx;
+ kcf_context_t *kcf_secondctx = kcf_ctx->kc_secondctx;
+
+ /* Release the second context, if any */
+
+ if (kcf_secondctx != NULL)
+ KCF_CONTEXT_REFRELE(kcf_secondctx);
+
+ if (gctx->cc_provider_private != NULL) {
+ mutex_enter(&pd->pd_lock);
+ if (!KCF_IS_PROV_REMOVED(pd)) {
+ /*
+ * Increment the provider's internal refcnt so it
+ * doesn't unregister from the framework while
+ * we're calling the entry point.
+ */
+ KCF_PROV_IREFHOLD(pd);
+ mutex_exit(&pd->pd_lock);
+ (void) KCF_PROV_FREE_CONTEXT(pd, gctx);
+ KCF_PROV_IREFRELE(pd);
+ } else {
+ mutex_exit(&pd->pd_lock);
+ }
+ }
+
+ /* kcf_ctx->kc_prov_desc has a hold on pd */
+ KCF_PROV_REFRELE(kcf_ctx->kc_prov_desc);
+
+ /* check if this context is shared with a software provider */
+ if ((gctx->cc_flags & CRYPTO_INIT_OPSTATE) &&
+ kcf_ctx->kc_sw_prov_desc != NULL) {
+ KCF_PROV_REFRELE(kcf_ctx->kc_sw_prov_desc);
+ }
+
+ kmem_cache_free(kcf_context_cache, kcf_ctx);
+}
+
+/*
+ * Free the request after releasing all the holds.
+ */
+void
+kcf_free_req(kcf_areq_node_t *areq)
+{
+ KCF_PROV_REFRELE(areq->an_provider);
+ if (areq->an_context != NULL)
+ KCF_CONTEXT_REFRELE(areq->an_context);
+
+ if (areq->an_tried_plist != NULL)
+ kcf_free_triedlist(areq->an_tried_plist);
+ kmem_cache_free(kcf_areq_cache, areq);
+}
+
+/*
+ * Utility routine to remove a request from the chain of requests
+ * hanging off a context.
+ */
+static void
+kcf_removereq_in_ctxchain(kcf_context_t *ictx, kcf_areq_node_t *areq)
+{
+ kcf_areq_node_t *cur, *prev;
+
+ /*
+ * Get context lock, search for areq in the chain and remove it.
+ */
+ ASSERT(ictx != NULL);
+ mutex_enter(&ictx->kc_in_use_lock);
+ prev = cur = ictx->kc_req_chain_first;
+
+ while (cur != NULL) {
+ if (cur == areq) {
+ if (prev == cur) {
+ if ((ictx->kc_req_chain_first =
+ cur->an_ctxchain_next) == NULL)
+ ictx->kc_req_chain_last = NULL;
+ } else {
+ if (cur == ictx->kc_req_chain_last)
+ ictx->kc_req_chain_last = prev;
+ prev->an_ctxchain_next = cur->an_ctxchain_next;
+ }
+
+ break;
+ }
+ prev = cur;
+ cur = cur->an_ctxchain_next;
+ }
+ mutex_exit(&ictx->kc_in_use_lock);
+}
+
+/*
+ * Remove the specified node from the global software queue.
+ *
+ * The caller must hold the queue lock and request lock (an_lock).
+ */
+static void
+kcf_remove_node(kcf_areq_node_t *node)
+{
+ kcf_areq_node_t *nextp = node->an_next;
+ kcf_areq_node_t *prevp = node->an_prev;
+
+ if (nextp != NULL)
+ nextp->an_prev = prevp;
+ else
+ gswq->gs_last = prevp;
+
+ if (prevp != NULL)
+ prevp->an_next = nextp;
+ else
+ gswq->gs_first = nextp;
+
+ node->an_state = REQ_CANCELED;
+}
+
+/*
+ * Add the request node to the end of the global software queue.
+ *
+ * The caller should not hold the queue lock. Returns 0 if the
+ * request is successfully queued. Returns CRYPTO_BUSY if the limit
+ * on the number of jobs is exceeded.
+ */
+static int
+kcf_enqueue(kcf_areq_node_t *node)
+{
+ kcf_areq_node_t *tnode;
+
+ mutex_enter(&gswq->gs_lock);
+
+ if (gswq->gs_njobs >= gswq->gs_maxjobs) {
+ mutex_exit(&gswq->gs_lock);
+ return (CRYPTO_BUSY);
+ }
+
+ if (gswq->gs_last == NULL) {
+ gswq->gs_first = gswq->gs_last = node;
+ } else {
+ ASSERT(gswq->gs_last->an_next == NULL);
+ tnode = gswq->gs_last;
+ tnode->an_next = node;
+ gswq->gs_last = node;
+ node->an_prev = tnode;
+ }
+
+ gswq->gs_njobs++;
+
+ /* an_lock not needed here as we hold gs_lock */
+ node->an_state = REQ_WAITING;
+
+ mutex_exit(&gswq->gs_lock);
+
+ return (0);
+}
+
+/*
+ * kmem_cache_alloc constructor for sync request structure.
+ */
+/* ARGSUSED */
+static int
+kcf_sreq_cache_constructor(void *buf, void *cdrarg, int kmflags)
+{
+ kcf_sreq_node_t *sreq = (kcf_sreq_node_t *)buf;
+
+ sreq->sn_type = CRYPTO_SYNCH;
+ cv_init(&sreq->sn_cv, NULL, CV_DEFAULT, NULL);
+ mutex_init(&sreq->sn_lock, NULL, MUTEX_DEFAULT, NULL);
+
+ return (0);
+}
+
+/* ARGSUSED */
+static void
+kcf_sreq_cache_destructor(void *buf, void *cdrarg)
+{
+ kcf_sreq_node_t *sreq = (kcf_sreq_node_t *)buf;
+
+ mutex_destroy(&sreq->sn_lock);
+ cv_destroy(&sreq->sn_cv);
+}
+
+/*
+ * kmem_cache_alloc constructor for async request structure.
+ */
+/* ARGSUSED */
+static int
+kcf_areq_cache_constructor(void *buf, void *cdrarg, int kmflags)
+{
+ kcf_areq_node_t *areq = (kcf_areq_node_t *)buf;
+
+ areq->an_type = CRYPTO_ASYNCH;
+ areq->an_refcnt = 0;
+ mutex_init(&areq->an_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&areq->an_done, NULL, CV_DEFAULT, NULL);
+ cv_init(&areq->an_turn_cv, NULL, CV_DEFAULT, NULL);
+
+ return (0);
+}
+
+/* ARGSUSED */
+static void
+kcf_areq_cache_destructor(void *buf, void *cdrarg)
+{
+ kcf_areq_node_t *areq = (kcf_areq_node_t *)buf;
+
+ ASSERT(areq->an_refcnt == 0);
+ mutex_destroy(&areq->an_lock);
+ cv_destroy(&areq->an_done);
+ cv_destroy(&areq->an_turn_cv);
+}
+
+/*
+ * kmem_cache_alloc constructor for kcf_context structure.
+ */
+/* ARGSUSED */
+static int
+kcf_context_cache_constructor(void *buf, void *cdrarg, int kmflags)
+{
+ kcf_context_t *kctx = (kcf_context_t *)buf;
+
+ kctx->kc_refcnt = 0;
+ mutex_init(&kctx->kc_in_use_lock, NULL, MUTEX_DEFAULT, NULL);
+
+ return (0);
+}
+
+/* ARGSUSED */
+static void
+kcf_context_cache_destructor(void *buf, void *cdrarg)
+{
+ kcf_context_t *kctx = (kcf_context_t *)buf;
+
+ ASSERT(kctx->kc_refcnt == 0);
+ mutex_destroy(&kctx->kc_in_use_lock);
+}
+
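+/*
+ * Tear down everything kcf_sched_init() sets up: the kstat, the thread
+ * pool, the request-id tables, the global software queue, the kmem
+ * caches, and the notification list lock and CV.
+ */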
+void
+kcf_sched_destroy(void)
+{
+ int i;
+
+ if (kcf_misc_kstat)
+ kstat_delete(kcf_misc_kstat);
+
+ if (kcfpool) {
+ mutex_destroy(&kcfpool->kp_thread_lock);
+ cv_destroy(&kcfpool->kp_nothr_cv);
+ mutex_destroy(&kcfpool->kp_user_lock);
+ cv_destroy(&kcfpool->kp_user_cv);
+
+ kmem_free(kcfpool, sizeof (kcf_pool_t));
+ }
+
+ for (i = 0; i < REQID_TABLES; i++) {
+ if (kcf_reqid_table[i]) {
+ mutex_destroy(&(kcf_reqid_table[i]->rt_lock));
+ kmem_free(kcf_reqid_table[i],
+ sizeof (kcf_reqid_table_t));
+ }
+ }
+
+ if (gswq) {
+ mutex_destroy(&gswq->gs_lock);
+ cv_destroy(&gswq->gs_cv);
+ kmem_free(gswq, sizeof (kcf_global_swq_t));
+ }
+
+ if (kcf_context_cache)
+ kmem_cache_destroy(kcf_context_cache);
+ if (kcf_areq_cache)
+ kmem_cache_destroy(kcf_areq_cache);
+ if (kcf_sreq_cache)
+ kmem_cache_destroy(kcf_sreq_cache);
+
+ mutex_destroy(&ntfy_list_lock);
+ cv_destroy(&ntfy_list_cv);
+}
+
+/*
+ * Creates and initializes all the structures needed by the framework.
+ */
+void
+kcf_sched_init(void)
+{
+ int i;
+ kcf_reqid_table_t *rt;
+
+ /*
+ * Create all the kmem caches needed by the framework. We set the
+ * align argument to 64 to get a slab aligned to a 64-byte boundary
+ * and to make the objects (cache_chunksize) a 64-byte multiple.
+ * This helps avoid false sharing, as this is the size of a
+ * CPU cache line.
+ */
+ kcf_sreq_cache = kmem_cache_create("kcf_sreq_cache",
+ sizeof (struct kcf_sreq_node), 64, kcf_sreq_cache_constructor,
+ kcf_sreq_cache_destructor, NULL, NULL, NULL, 0);
+
+ kcf_areq_cache = kmem_cache_create("kcf_areq_cache",
+ sizeof (struct kcf_areq_node), 64, kcf_areq_cache_constructor,
+ kcf_areq_cache_destructor, NULL, NULL, NULL, 0);
+
+ kcf_context_cache = kmem_cache_create("kcf_context_cache",
+ sizeof (struct kcf_context), 64, kcf_context_cache_constructor,
+ kcf_context_cache_destructor, NULL, NULL, NULL, 0);
+
+ gswq = kmem_alloc(sizeof (kcf_global_swq_t), KM_SLEEP);
+
+ mutex_init(&gswq->gs_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&gswq->gs_cv, NULL, CV_DEFAULT, NULL);
+ gswq->gs_njobs = 0;
+ gswq->gs_maxjobs = kcf_maxthreads * crypto_taskq_maxalloc;
+ gswq->gs_first = gswq->gs_last = NULL;
+
+ /* Initialize the global reqid table */
+ for (i = 0; i < REQID_TABLES; i++) {
+ rt = kmem_zalloc(sizeof (kcf_reqid_table_t), KM_SLEEP);
+ kcf_reqid_table[i] = rt;
+ mutex_init(&rt->rt_lock, NULL, MUTEX_DEFAULT, NULL);
+ rt->rt_curid = i;
+ }
+
+ /* Allocate and initialize the thread pool */
+ kcfpool_alloc();
+
+ /* Initialize the event notification list variables */
+ mutex_init(&ntfy_list_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&ntfy_list_cv, NULL, CV_DEFAULT, NULL);
+
+ /* Create the kcf kstat */
+ kcf_misc_kstat = kstat_create("kcf", 0, "framework_stats", "crypto",
+ KSTAT_TYPE_NAMED, sizeof (kcf_stats_t) / sizeof (kstat_named_t),
+ KSTAT_FLAG_VIRTUAL);
+
+ if (kcf_misc_kstat != NULL) {
+ kcf_misc_kstat->ks_data = &kcf_ksdata;
+ kcf_misc_kstat->ks_update = kcf_misc_kstat_update;
+ kstat_install(kcf_misc_kstat);
+ }
+}
+
+/*
+ * Signal the waiting sync client.
+ */
+void
+kcf_sop_done(kcf_sreq_node_t *sreq, int error)
+{
+ mutex_enter(&sreq->sn_lock);
+ sreq->sn_state = REQ_DONE;
+ sreq->sn_rv = error;
+ cv_signal(&sreq->sn_cv);
+ mutex_exit(&sreq->sn_lock);
+}
+
+/*
+ * Callback the async client with the operation status.
+ * We free the async request node and possibly the context.
+ * We also handle any chain of requests hanging off of
+ * the context.
+ */
+void
+kcf_aop_done(kcf_areq_node_t *areq, int error)
+{
+ kcf_op_type_t optype;
+ boolean_t skip_notify = B_FALSE;
+ kcf_context_t *ictx;
+ kcf_areq_node_t *nextreq;
+
+ /*
+ * Handle recoverable errors. This has to be done first
+ * before doing anything else in this routine so that
+ * we do not change the state of the request.
+ */
+ if (error != CRYPTO_SUCCESS && IS_RECOVERABLE(error)) {
+ /*
+ * We try another provider, if one is available. Else
+ * we continue with the failure notification to the
+ * client.
+ */
+ if (kcf_resubmit_request(areq) == CRYPTO_QUEUED)
+ return;
+ }
+
+ mutex_enter(&areq->an_lock);
+ areq->an_state = REQ_DONE;
+ mutex_exit(&areq->an_lock);
+
+ optype = (&areq->an_params)->rp_optype;
+ if ((ictx = areq->an_context) != NULL) {
+ /*
+ * A request, after it is removed from the request
+ * queue, still stays on a chain of requests hanging
+ * off its context structure. It needs to be removed
+ * from this chain at this point.
+ */
+ mutex_enter(&ictx->kc_in_use_lock);
+ nextreq = areq->an_ctxchain_next;
+ if (nextreq != NULL) {
+ mutex_enter(&nextreq->an_lock);
+ nextreq->an_is_my_turn = B_TRUE;
+ cv_signal(&nextreq->an_turn_cv);
+ mutex_exit(&nextreq->an_lock);
+ }
+
+ ictx->kc_req_chain_first = nextreq;
+ if (nextreq == NULL)
+ ictx->kc_req_chain_last = NULL;
+ mutex_exit(&ictx->kc_in_use_lock);
+
+ if (IS_SINGLE_OP(optype) || IS_FINAL_OP(optype)) {
+ ASSERT(nextreq == NULL);
+ KCF_CONTEXT_REFRELE(ictx);
+ } else if (error != CRYPTO_SUCCESS && IS_INIT_OP(optype)) {
+ /*
+ * NOTE - We do not release the context in case of update
+ * operations. We require the consumer to free it explicitly,
+ * in case it wants to abandon an update operation. This is done
+ * as there may be mechanisms in ECB mode that can continue
+ * even if an operation on a block fails.
+ */
+ KCF_CONTEXT_REFRELE(ictx);
+ }
+ }
+
+ /* Deal with the internal continuation to this request first */
+
+ if (areq->an_isdual) {
+ kcf_dual_req_t *next_arg;
+ next_arg = (kcf_dual_req_t *)areq->an_reqarg.cr_callback_arg;
+ next_arg->kr_areq = areq;
+ KCF_AREQ_REFHOLD(areq);
+ areq->an_isdual = B_FALSE;
+
+ NOTIFY_CLIENT(areq, error);
+ return;
+ }
+
+	/*
+	 * If the CRYPTO_NOTIFY_OPDONE flag is set, we always notify
+	 * the client. If this flag is clear, we skip the notification
+	 * provided there are no errors. We check this flag only for
+	 * init or update operations. It is ignored for single, final or
+	 * atomic operations.
+	 */
+ skip_notify = (IS_UPDATE_OP(optype) || IS_INIT_OP(optype)) &&
+ (!(areq->an_reqarg.cr_flag & CRYPTO_NOTIFY_OPDONE)) &&
+ (error == CRYPTO_SUCCESS);
+
+ if (!skip_notify) {
+ NOTIFY_CLIENT(areq, error);
+ }
+
+ if (!(areq->an_reqarg.cr_flag & CRYPTO_SKIP_REQID))
+ kcf_reqid_delete(areq);
+
+ KCF_AREQ_REFRELE(areq);
+}
+
+/*
+ * Allocate the thread pool and initialize all the fields.
+ */
+static void
+kcfpool_alloc()
+{
+ kcfpool = kmem_alloc(sizeof (kcf_pool_t), KM_SLEEP);
+
+ kcfpool->kp_threads = kcfpool->kp_idlethreads = 0;
+ kcfpool->kp_blockedthreads = 0;
+ kcfpool->kp_signal_create_thread = B_FALSE;
+ kcfpool->kp_nthrs = 0;
+ kcfpool->kp_user_waiting = B_FALSE;
+
+ mutex_init(&kcfpool->kp_thread_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&kcfpool->kp_nothr_cv, NULL, CV_DEFAULT, NULL);
+
+ mutex_init(&kcfpool->kp_user_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&kcfpool->kp_user_cv, NULL, CV_DEFAULT, NULL);
+
+ kcf_idlethr_timeout = KCF_DEFAULT_THRTIMEOUT;
+}
+
+/*
+ * Insert the async request in the hash table after assigning it
+ * an ID. Returns the ID.
+ *
+ * The caller can later pass the ID as an argument to
+ * crypto_cancel_req() to cancel the request.
+ */
+static crypto_req_id_t
+kcf_reqid_insert(kcf_areq_node_t *areq)
+{
+ int indx;
+ crypto_req_id_t id;
+ kcf_areq_node_t *headp;
+ kcf_reqid_table_t *rt;
+
+ rt = kcf_reqid_table[CPU_SEQID_UNSTABLE & REQID_TABLE_MASK];
+
+ mutex_enter(&rt->rt_lock);
+
+ rt->rt_curid = id =
+ (rt->rt_curid - REQID_COUNTER_LOW) | REQID_COUNTER_HIGH;
+ SET_REQID(areq, id);
+ indx = REQID_HASH(id);
+ headp = areq->an_idnext = rt->rt_idhash[indx];
+ areq->an_idprev = NULL;
+ if (headp != NULL)
+ headp->an_idprev = areq;
+
+ rt->rt_idhash[indx] = areq;
+ mutex_exit(&rt->rt_lock);
+
+ return (id);
+}
+
+/*
+ * Delete the async request from the hash table.
+ */
+static void
+kcf_reqid_delete(kcf_areq_node_t *areq)
+{
+ int indx;
+ kcf_areq_node_t *nextp, *prevp;
+ crypto_req_id_t id = GET_REQID(areq);
+ kcf_reqid_table_t *rt;
+
+ rt = kcf_reqid_table[id & REQID_TABLE_MASK];
+ indx = REQID_HASH(id);
+
+ mutex_enter(&rt->rt_lock);
+
+ nextp = areq->an_idnext;
+ prevp = areq->an_idprev;
+ if (nextp != NULL)
+ nextp->an_idprev = prevp;
+ if (prevp != NULL)
+ prevp->an_idnext = nextp;
+ else
+ rt->rt_idhash[indx] = nextp;
+
+ SET_REQID(areq, 0);
+ cv_broadcast(&areq->an_done);
+
+ mutex_exit(&rt->rt_lock);
+}
+
+/*
+ * Cancel a single asynchronous request.
+ *
+ * We guarantee that no problems will result from calling
+ * crypto_cancel_req() for a request which is either running or
+ * has already completed. We remove the request from any queues
+ * if possible, and we wait for request completion if the
+ * request has already been dispatched to a provider.
+ *
+ * Calling context:
+ * Can be called from user context only.
+ *
+ * NOTE: We acquire the following locks in this routine (in order):
+ * - rt_lock (kcf_reqid_table_t)
+ * - gswq->gs_lock
+ * - areq->an_lock
+ * - ictx->kc_in_use_lock (from kcf_removereq_in_ctxchain())
+ *
+ * This locking order MUST be maintained everywhere else in the code.
+ */
+void
+crypto_cancel_req(crypto_req_id_t id)
+{
+ int indx;
+ kcf_areq_node_t *areq;
+ kcf_provider_desc_t *pd;
+ kcf_context_t *ictx;
+ kcf_reqid_table_t *rt;
+
+ rt = kcf_reqid_table[id & REQID_TABLE_MASK];
+ indx = REQID_HASH(id);
+
+ mutex_enter(&rt->rt_lock);
+ for (areq = rt->rt_idhash[indx]; areq; areq = areq->an_idnext) {
+ if (GET_REQID(areq) == id) {
+ /*
+ * We found the request. It is either still waiting
+ * in the framework queues or running at the provider.
+ */
+ pd = areq->an_provider;
+ ASSERT(pd != NULL);
+
+ switch (pd->pd_prov_type) {
+ case CRYPTO_SW_PROVIDER:
+ mutex_enter(&gswq->gs_lock);
+ mutex_enter(&areq->an_lock);
+
+ /* This request can be safely canceled. */
+ if (areq->an_state <= REQ_WAITING) {
+ /* Remove from gswq, global software queue. */
+ kcf_remove_node(areq);
+ if ((ictx = areq->an_context) != NULL)
+ kcf_removereq_in_ctxchain(ictx, areq);
+
+ mutex_exit(&areq->an_lock);
+ mutex_exit(&gswq->gs_lock);
+ mutex_exit(&rt->rt_lock);
+
+ /* Remove areq from hash table and free it. */
+ kcf_reqid_delete(areq);
+ KCF_AREQ_REFRELE(areq);
+ return;
+ }
+
+ mutex_exit(&areq->an_lock);
+ mutex_exit(&gswq->gs_lock);
+ break;
+
+ case CRYPTO_HW_PROVIDER:
+ /*
+ * There is no interface to remove an entry
+ * once it is on the taskq. So, we do not do
+ * anything for a hardware provider.
+ */
+ break;
+ default:
+ break;
+ }
+
+ /*
+ * The request is running. Wait for the request completion
+ * to notify us.
+ */
+ KCF_AREQ_REFHOLD(areq);
+ while (GET_REQID(areq) == id)
+ cv_wait(&areq->an_done, &rt->rt_lock);
+ KCF_AREQ_REFRELE(areq);
+ break;
+ }
+ }
+
+ mutex_exit(&rt->rt_lock);
+}
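+
+/*
+ * For illustration (a sketch; how the ID was obtained is not shown here):
+ * a client that kept the request ID assigned to an asynchronous
+ * submission can cancel it later:
+ *
+ *	crypto_req_id_t reqid;
+ *
+ *	... submit the request and save the ID assigned to it ...
+ *	crypto_cancel_req(reqid);
+ */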
+
+/*
+ * Cancel all asynchronous requests associated with the
+ * passed in crypto context and free it.
+ *
+ * A client SHOULD NOT call this routine after calling a crypto_*_final
+ * routine. This routine is called only during intermediate operations.
+ * The client should not use the crypto context after this function returns
+ * since we destroy it.
+ *
+ * Calling context:
+ * Can be called from user context only.
+ */
+void
+crypto_cancel_ctx(crypto_context_t ctx)
+{
+ kcf_context_t *ictx;
+ kcf_areq_node_t *areq;
+
+ if (ctx == NULL)
+ return;
+
+ ictx = (kcf_context_t *)((crypto_ctx_t *)ctx)->cc_framework_private;
+
+ mutex_enter(&ictx->kc_in_use_lock);
+
+ /* Walk the chain and cancel each request */
+ while ((areq = ictx->kc_req_chain_first) != NULL) {
+ /*
+ * We have to drop the lock here as we may have
+ * to wait for request completion. We hold the
+ * request before dropping the lock though, so that it
+ * won't be freed underneath us.
+ */
+ KCF_AREQ_REFHOLD(areq);
+ mutex_exit(&ictx->kc_in_use_lock);
+
+ crypto_cancel_req(GET_REQID(areq));
+ KCF_AREQ_REFRELE(areq);
+
+ mutex_enter(&ictx->kc_in_use_lock);
+ }
+
+ mutex_exit(&ictx->kc_in_use_lock);
+ KCF_CONTEXT_REFRELE(ictx);
+}
+
+/*
+ * Update kstats.
+ */
+static int
+kcf_misc_kstat_update(kstat_t *ksp, int rw)
+{
+ uint_t tcnt;
+ kcf_stats_t *ks_data;
+
+ if (rw == KSTAT_WRITE)
+ return (EACCES);
+
+ ks_data = ksp->ks_data;
+
+ ks_data->ks_thrs_in_pool.value.ui32 = kcfpool->kp_threads;
+ /*
+ * The failover thread is counted in kp_idlethreads in
+ * some corner cases. This is done to avoid doing more checks
+ * when submitting a request. We account for those cases below.
+ */
+ if ((tcnt = kcfpool->kp_idlethreads) == (kcfpool->kp_threads + 1))
+ tcnt--;
+ ks_data->ks_idle_thrs.value.ui32 = tcnt;
+ ks_data->ks_minthrs.value.ui32 = kcf_minthreads;
+ ks_data->ks_maxthrs.value.ui32 = kcf_maxthreads;
+ ks_data->ks_swq_njobs.value.ui32 = gswq->gs_njobs;
+ ks_data->ks_swq_maxjobs.value.ui32 = gswq->gs_maxjobs;
+ ks_data->ks_taskq_threads.value.ui32 = crypto_taskq_threads;
+ ks_data->ks_taskq_minalloc.value.ui32 = crypto_taskq_minalloc;
+ ks_data->ks_taskq_maxalloc.value.ui32 = crypto_taskq_maxalloc;
+
+ return (0);
+}
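+
+/*
+ * For illustration (the path below is an assumption about how named
+ * kstats are exposed on Linux, not something defined in this file):
+ *
+ *	cat /proc/spl/kstat/kcf/framework_stats
+ */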
+
+/*
+ * Allocate and initialize a kcf_dual_req, used for saving the arguments of
+ * a dual operation or an atomic operation that has to be internally
+ * simulated with multiple single steps.
+ * crq determines the memory allocation flags.
+ */
+
+kcf_dual_req_t *
+kcf_alloc_req(crypto_call_req_t *crq)
+{
+ kcf_dual_req_t *kcr;
+
+ kcr = kmem_alloc(sizeof (kcf_dual_req_t), KCF_KMFLAG(crq));
+
+ if (kcr == NULL)
+ return (NULL);
+
+ /* Copy the whole crypto_call_req struct, as it isn't persistent */
+ if (crq != NULL)
+ kcr->kr_callreq = *crq;
+ else
+ bzero(&(kcr->kr_callreq), sizeof (crypto_call_req_t));
+ kcr->kr_areq = NULL;
+ kcr->kr_saveoffset = 0;
+ kcr->kr_savelen = 0;
+
+ return (kcr);
+}
+
+/*
+ * Callback routine for the second part of a simulated dual operation.
+ * Schedules the next step.
+ *
+ * This routine can be called from interrupt context.
+ */
+void
+kcf_next_req(void *next_req_arg, int status)
+{
+ kcf_dual_req_t *next_req = (kcf_dual_req_t *)next_req_arg;
+ kcf_req_params_t *params = &(next_req->kr_params);
+ kcf_areq_node_t *areq = next_req->kr_areq;
+ int error = status;
+ kcf_provider_desc_t *pd = NULL;
+ crypto_dual_data_t *ct = NULL;
+
+ /* Stop the processing if an error occurred at this step */
+ if (error != CRYPTO_SUCCESS) {
+out:
+ areq->an_reqarg = next_req->kr_callreq;
+ KCF_AREQ_REFRELE(areq);
+ kmem_free(next_req, sizeof (kcf_dual_req_t));
+ areq->an_isdual = B_FALSE;
+ kcf_aop_done(areq, error);
+ return;
+ }
+
+ switch (params->rp_opgrp) {
+ case KCF_OG_MAC: {
+
+ /*
+ * The next req is submitted with the same reqid as the
+ * first part. The consumer only got back that reqid, and
+ * should still be able to cancel the operation during its
+ * second step.
+ */
+ kcf_mac_ops_params_t *mops = &(params->rp_u.mac_params);
+ crypto_ctx_template_t mac_tmpl;
+ kcf_mech_entry_t *me;
+
+ ct = (crypto_dual_data_t *)mops->mo_data;
+ mac_tmpl = (crypto_ctx_template_t)mops->mo_templ;
+
+ /* No expected recoverable failures, so no retry list */
+ pd = kcf_get_mech_provider(mops->mo_framework_mechtype,
+ &me, &error, NULL, CRYPTO_FG_MAC_ATOMIC,
+ (areq->an_reqarg.cr_flag & CRYPTO_RESTRICTED), ct->dd_len2);
+
+ if (pd == NULL) {
+ error = CRYPTO_MECH_NOT_SUPPORTED;
+ goto out;
+ }
+ /* Validate the MAC context template here */
+ if ((pd->pd_prov_type == CRYPTO_SW_PROVIDER) &&
+ (mac_tmpl != NULL)) {
+ kcf_ctx_template_t *ctx_mac_tmpl;
+
+ ctx_mac_tmpl = (kcf_ctx_template_t *)mac_tmpl;
+
+ if (ctx_mac_tmpl->ct_generation != me->me_gen_swprov) {
+ KCF_PROV_REFRELE(pd);
+ error = CRYPTO_OLD_CTX_TEMPLATE;
+ goto out;
+ }
+ mops->mo_templ = ctx_mac_tmpl->ct_prov_tmpl;
+ }
+
+ break;
+ }
+ case KCF_OG_DECRYPT: {
+ kcf_decrypt_ops_params_t *dcrops =
+ &(params->rp_u.decrypt_params);
+
+ ct = (crypto_dual_data_t *)dcrops->dop_ciphertext;
+ /* No expected recoverable failures, so no retry list */
+ pd = kcf_get_mech_provider(dcrops->dop_framework_mechtype,
+ NULL, &error, NULL, CRYPTO_FG_DECRYPT_ATOMIC,
+ (areq->an_reqarg.cr_flag & CRYPTO_RESTRICTED), ct->dd_len1);
+
+ if (pd == NULL) {
+ error = CRYPTO_MECH_NOT_SUPPORTED;
+ goto out;
+ }
+ break;
+ }
+ default:
+ break;
+ }
+
+ /* The second step uses len2 and offset2 of the dual_data */
+ next_req->kr_saveoffset = ct->dd_offset1;
+ next_req->kr_savelen = ct->dd_len1;
+ ct->dd_offset1 = ct->dd_offset2;
+ ct->dd_len1 = ct->dd_len2;
+
+ /* preserve if the caller is restricted */
+ if (areq->an_reqarg.cr_flag & CRYPTO_RESTRICTED) {
+ areq->an_reqarg.cr_flag = CRYPTO_RESTRICTED;
+ } else {
+ areq->an_reqarg.cr_flag = 0;
+ }
+
+ areq->an_reqarg.cr_callback_func = kcf_last_req;
+ areq->an_reqarg.cr_callback_arg = next_req;
+ areq->an_isdual = B_TRUE;
+
+	/*
+	 * We would like to call kcf_submit_request() here, but
+	 * that is not possible because that routine allocates a new
+	 * kcf_areq_node_t request structure, while we need to
+	 * reuse the existing request structure.
+	 */
+ switch (pd->pd_prov_type) {
+ case CRYPTO_SW_PROVIDER:
+ error = common_submit_request(pd, NULL, params,
+ KCF_RHNDL(KM_NOSLEEP));
+ break;
+
+ case CRYPTO_HW_PROVIDER: {
+ kcf_provider_desc_t *old_pd;
+ taskq_t *taskq = pd->pd_sched_info.ks_taskq;
+
+ /*
+ * Set the params for the second step in the
+ * dual-ops.
+ */
+ areq->an_params = *params;
+ old_pd = areq->an_provider;
+ KCF_PROV_REFRELE(old_pd);
+ KCF_PROV_REFHOLD(pd);
+ areq->an_provider = pd;
+
+ /*
+ * Note that we have to do a taskq_dispatch()
+ * here as we may be in interrupt context.
+ */
+ if (taskq_dispatch(taskq, process_req_hwp, areq,
+ TQ_NOSLEEP) == (taskqid_t)0) {
+ error = CRYPTO_HOST_MEMORY;
+ } else {
+ error = CRYPTO_QUEUED;
+ }
+ break;
+ }
+ default:
+ break;
+ }
+
+ /*
+ * We have to release the holds on the request and the provider
+ * in all cases.
+ */
+ KCF_AREQ_REFRELE(areq);
+ KCF_PROV_REFRELE(pd);
+
+ if (error != CRYPTO_QUEUED) {
+ /* restore, clean up, and invoke the client's callback */
+
+ ct->dd_offset1 = next_req->kr_saveoffset;
+ ct->dd_len1 = next_req->kr_savelen;
+ areq->an_reqarg = next_req->kr_callreq;
+ kmem_free(next_req, sizeof (kcf_dual_req_t));
+ areq->an_isdual = B_FALSE;
+ kcf_aop_done(areq, error);
+ }
+}
+
+/*
+ * Last part of an emulated dual operation.
+ * Restores the saved dual_data offset and length, cleans up, and
+ * invokes the client's callback.
+ */
+void
+kcf_last_req(void *last_req_arg, int status)
+{
+ kcf_dual_req_t *last_req = (kcf_dual_req_t *)last_req_arg;
+
+ kcf_req_params_t *params = &(last_req->kr_params);
+ kcf_areq_node_t *areq = last_req->kr_areq;
+ crypto_dual_data_t *ct = NULL;
+
+ switch (params->rp_opgrp) {
+ case KCF_OG_MAC: {
+ kcf_mac_ops_params_t *mops = &(params->rp_u.mac_params);
+
+ ct = (crypto_dual_data_t *)mops->mo_data;
+ break;
+ }
+ case KCF_OG_DECRYPT: {
+ kcf_decrypt_ops_params_t *dcrops =
+ &(params->rp_u.decrypt_params);
+
+ ct = (crypto_dual_data_t *)dcrops->dop_ciphertext;
+ break;
+ }
+ default: {
+ panic("invalid kcf_op_group_t %d", (int)params->rp_opgrp);
+ return;
+ }
+ }
+ ct->dd_offset1 = last_req->kr_saveoffset;
+ ct->dd_len1 = last_req->kr_savelen;
+
+ /* The submitter used kcf_last_req as its callback */
+
+ if (areq == NULL) {
+ crypto_call_req_t *cr = &last_req->kr_callreq;
+
+ (*(cr->cr_callback_func))(cr->cr_callback_arg, status);
+ kmem_free(last_req, sizeof (kcf_dual_req_t));
+ return;
+ }
+ areq->an_reqarg = last_req->kr_callreq;
+ KCF_AREQ_REFRELE(areq);
+ kmem_free(last_req, sizeof (kcf_dual_req_t));
+ areq->an_isdual = B_FALSE;
+ kcf_aop_done(areq, status);
+}
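+
+/*
+ * For illustration (a summary sketch of the dual-operation code above):
+ *
+ *	kcf_alloc_req()	saves the caller's crypto_call_req_t in a
+ *			kcf_dual_req_t before the first step is submitted
+ *	kcf_next_req()	runs as the first step's callback, picks a provider
+ *			and dispatches the second step with kcf_last_req()
+ *			as its callback
+ *	kcf_last_req()	restores dd_offset1/dd_len1 and completes the
+ *			request via kcf_aop_done() or the client's callback
+ */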
diff --git a/sys/contrib/openzfs/module/icp/illumos-crypto.c b/sys/contrib/openzfs/module/icp/illumos-crypto.c
new file mode 100644
index 000000000000..3c5ef4393940
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/illumos-crypto.c
@@ -0,0 +1,158 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2017, Datto, Inc. All rights reserved.
+ */
+
+#ifdef _KERNEL
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#else
+#define __exit
+#define __init
+#endif
+
+#include <sys/crypto/common.h>
+#include <sys/crypto/api.h>
+#include <sys/crypto/impl.h>
+#include <sys/crypto/sched_impl.h>
+#include <sys/modhash_impl.h>
+#include <sys/crypto/icp.h>
+
+/*
+ * Changes made to the original Illumos Crypto Layer for the ICP:
+ *
+ * Several changes were needed to allow the Illumos Crypto Layer
+ * to work in the Linux kernel. Almost all of the changes fall into
+ * one of the following categories:
+ *
+ * 1) Moving the syntax to C90: This was mostly a matter of
+ * changing func() definitions to func(void). In a few cases,
+ * initializations of structs with unions needed to have brackets
+ * added.
+ *
+ * 2) Changes to allow userspace compilation: The ICP is meant to be
+ * compiled and used in both userspace and kernel space (for ztest and
+ * libzfs), so the _KERNEL macros did not make sense anymore. For the
+ * same reason, many header includes were also changed to use
+ * sys/zfs_context.h
+ *
+ * 3) Moving to a statically compiled architecture: At some point in
+ * the future it may make sense to have encryption algorithms that are
+ * loadable into the ICP at runtime via separate kernel modules.
+ * However, considering that this code will probably not see much use
+ * outside of zfs, and that zfs encryption only requires the aes and
+ * sha256 algorithms, it seemed like more trouble than it was worth to
+ * port Illumos's kernel module structure over to a Linux kernel module.
+ * In addition, the Illumos code related to keeping track of kernel modules
+ * is very much tied to the Illumos OS and proved difficult to port to
+ * Linux. Therefore, the structure of the ICP was simplified to work
+ * statically and several pieces of code responsible for keeping track
+ * of Illumos kernel modules were removed and simplified. All module
+ * initialization and destruction is now called in this file during
+ * Linux kernel module loading and unloading.
+ *
+ * 4) Adding destructors: The Illumos Crypto Layer is built into
+ * the Illumos kernel and is not meant to be unloaded. Some destructors
+ * were added to allow the ICP to be unloaded without leaking
+ * structures.
+ *
+ * 5) Removing CRYPTO_DATA_MBLK related structures and code:
+ * crypto_data_t can have 3 formats, CRYPTO_DATA_RAW, CRYPTO_DATA_UIO,
+ * and CRYPTO_DATA_MBLK. ZFS only requires the first 2 formats, as the
+ * last one is related to streamed data. To simplify the port, code
+ * related to this format was removed.
+ *
+ * 6) Changes for architecture specific code: Some changes were needed
+ * to make architecture specific assembly compile. The biggest change
+ * here was to functions related to detecting CPU capabilities for amd64.
+ * The Illumos Crypto Layer called into the Illumos kernel's API
+ * to discover these. They have been converted to instead use the
+ * 'cpuid' instruction as per the Intel spec. In addition, references to
+ * the sun4u and sparc architectures have been removed so that these
+ * will use the generic implementation.
+ *
+ * 7) Removing sha384 and sha512 code: The sha code was actually very
+ * easy to port. However, the generic sha384 and sha512 code exceeds
+ * the stack size on arm and powerpc architectures. In an effort
+ * to remove warnings, this code was removed.
+ *
+ * 8) Change large allocations from kmem_alloc() to vmem_alloc(): In
+ * testing the ICP with the ZFS encryption code, a few allocations were
+ * found that could potentially be very large. These caused the SPL to
+ * throw warnings and so they were changed to use vmem_alloc().
+ *
+ * 9) Makefiles: Makefiles were added that would work with the existing
+ * ZFS Makefiles.
+ */
+
+void __exit
+icp_fini(void)
+{
+ skein_mod_fini();
+ sha2_mod_fini();
+ sha1_mod_fini();
+ edonr_mod_fini();
+ aes_mod_fini();
+ kcf_sched_destroy();
+ kcf_prov_tab_destroy();
+ kcf_destroy_mech_tabs();
+ mod_hash_fini();
+}
+
+/* roughly equivalent to kcf.c: _init() */
+int __init
+icp_init(void)
+{
+ /* initialize the mod hash module */
+ mod_hash_init();
+
+ /* initialize the mechanisms tables supported out-of-the-box */
+ kcf_init_mech_tabs();
+
+ /* initialize the providers tables */
+ kcf_prov_tab_init();
+
+ /*
+ * Initialize scheduling structures. Note that this does NOT
+ * start any threads since it might not be safe to do so.
+ */
+ kcf_sched_init();
+
+ /* initialize algorithms */
+ aes_mod_init();
+ edonr_mod_init();
+ sha1_mod_init();
+ sha2_mod_init();
+ skein_mod_init();
+
+ return (0);
+}
+
+#if defined(_KERNEL)
+module_exit(icp_fini);
+module_init(icp_init);
+MODULE_AUTHOR(ZFS_META_AUTHOR);
+MODULE_LICENSE(ZFS_META_LICENSE);
+MODULE_VERSION(ZFS_META_VERSION "-" ZFS_META_RELEASE);
+#endif
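+
+/*
+ * For illustration (a sketch): outside the kernel the module_init()/
+ * module_exit() hooks above are compiled out, so a userspace consumer
+ * such as ztest is assumed to bracket its use of the ICP explicitly:
+ *
+ *	icp_init();
+ *	... use the crypto_* framework APIs ...
+ *	icp_fini();
+ */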
diff --git a/sys/contrib/openzfs/module/icp/include/aes/aes_impl.h b/sys/contrib/openzfs/module/icp/include/aes/aes_impl.h
new file mode 100644
index 000000000000..41dccaa3848a
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/include/aes/aes_impl.h
@@ -0,0 +1,227 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _AES_IMPL_H
+#define _AES_IMPL_H
+
+/*
+ * Common definitions used by AES.
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <sys/zfs_context.h>
+#include <sys/crypto/common.h>
+
+/* Similar to sysmacros.h IS_P2ALIGNED, but checks two pointers: */
+#define IS_P2ALIGNED2(v, w, a) \
+ ((((uintptr_t)(v) | (uintptr_t)(w)) & ((uintptr_t)(a) - 1)) == 0)
+
+#define AES_BLOCK_LEN 16 /* bytes */
+/* Round constant length, in number of 32-bit elements: */
+#define RC_LENGTH (5 * ((AES_BLOCK_LEN) / 4 - 2))
+
+#define AES_COPY_BLOCK(src, dst) \
+ (dst)[0] = (src)[0]; \
+ (dst)[1] = (src)[1]; \
+ (dst)[2] = (src)[2]; \
+ (dst)[3] = (src)[3]; \
+ (dst)[4] = (src)[4]; \
+ (dst)[5] = (src)[5]; \
+ (dst)[6] = (src)[6]; \
+ (dst)[7] = (src)[7]; \
+ (dst)[8] = (src)[8]; \
+ (dst)[9] = (src)[9]; \
+ (dst)[10] = (src)[10]; \
+ (dst)[11] = (src)[11]; \
+ (dst)[12] = (src)[12]; \
+ (dst)[13] = (src)[13]; \
+ (dst)[14] = (src)[14]; \
+ (dst)[15] = (src)[15]
+
+#define AES_XOR_BLOCK(src, dst) \
+ (dst)[0] ^= (src)[0]; \
+ (dst)[1] ^= (src)[1]; \
+ (dst)[2] ^= (src)[2]; \
+ (dst)[3] ^= (src)[3]; \
+ (dst)[4] ^= (src)[4]; \
+ (dst)[5] ^= (src)[5]; \
+ (dst)[6] ^= (src)[6]; \
+ (dst)[7] ^= (src)[7]; \
+ (dst)[8] ^= (src)[8]; \
+ (dst)[9] ^= (src)[9]; \
+ (dst)[10] ^= (src)[10]; \
+ (dst)[11] ^= (src)[11]; \
+ (dst)[12] ^= (src)[12]; \
+ (dst)[13] ^= (src)[13]; \
+ (dst)[14] ^= (src)[14]; \
+ (dst)[15] ^= (src)[15]
+
+/* AES key size definitions */
+#define AES_MINBITS 128
+#define AES_MINBYTES ((AES_MINBITS) >> 3)
+#define AES_MAXBITS 256
+#define AES_MAXBYTES ((AES_MAXBITS) >> 3)
+
+#define AES_MIN_KEY_BYTES ((AES_MINBITS) >> 3)
+#define AES_MAX_KEY_BYTES ((AES_MAXBITS) >> 3)
+#define AES_192_KEY_BYTES 24
+#define AES_IV_LEN 16
+
+/* AES key schedule may be implemented with 32- or 64-bit elements: */
+#define AES_32BIT_KS 32
+#define AES_64BIT_KS 64
+
+#define MAX_AES_NR 14 /* Maximum number of rounds */
+#define MAX_AES_NB 4 /* Number of columns comprising a state */
+
+typedef union {
+#ifdef sun4u
+ uint64_t ks64[((MAX_AES_NR) + 1) * (MAX_AES_NB)];
+#endif
+ uint32_t ks32[((MAX_AES_NR) + 1) * (MAX_AES_NB)];
+} aes_ks_t;
+
+typedef struct aes_impl_ops aes_impl_ops_t;
+
+/*
+ * The absolute offsets of the encr_ks (0) and nr (504) fields are hard
+ * coded in aesni-gcm-x86_64.S, so please don't change them (or adjust
+ * that file accordingly).
+ */
+typedef struct aes_key aes_key_t;
+struct aes_key {
+ aes_ks_t encr_ks; /* encryption key schedule */
+ aes_ks_t decr_ks; /* decryption key schedule */
+#ifdef __amd64
+ long double align128; /* Align fields above for Intel AES-NI */
+#endif /* __amd64 */
+ const aes_impl_ops_t *ops; /* ops associated with this schedule */
+ int nr; /* number of rounds (10, 12, or 14) */
+ int type; /* key schedule size (32 or 64 bits) */
+};
+
+/*
+ * Core AES functions.
+ * ks and keysched are pointers to aes_key_t.
+ * They are declared void* as they are intended to be opaque types.
+ * Use function aes_alloc_keysched() to allocate memory for ks and keysched.
+ */
+extern void *aes_alloc_keysched(size_t *size, int kmflag);
+extern void aes_init_keysched(const uint8_t *cipherKey, uint_t keyBits,
+ void *keysched);
+extern int aes_encrypt_block(const void *ks, const uint8_t *pt, uint8_t *ct);
+extern int aes_decrypt_block(const void *ks, const uint8_t *ct, uint8_t *pt);
+
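+/*
+ * For illustration (a sketch; key, plaintext and ciphertext are
+ * hypothetical caller-supplied buffers): encrypting a single block with
+ * the core functions above, assuming a 256-bit raw key:
+ *
+ *	size_t size;
+ *	void *ks = aes_alloc_keysched(&size, KM_SLEEP);
+ *
+ *	if (ks != NULL) {
+ *		aes_init_keysched(key, 256, ks);
+ *		(void) aes_encrypt_block(ks, plaintext, ciphertext);
+ *		kmem_free(ks, size);
+ *	}
+ */
+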
+/*
+ * AES mode functions.
+ * The first 2 functions operate on 16-byte AES blocks.
+ */
+extern void aes_copy_block(uint8_t *in, uint8_t *out);
+extern void aes_xor_block(uint8_t *data, uint8_t *dst);
+
+/* Note: ctx is a pointer to aes_ctx_t defined in modes.h */
+extern int aes_encrypt_contiguous_blocks(void *ctx, char *data, size_t length,
+ crypto_data_t *out);
+extern int aes_decrypt_contiguous_blocks(void *ctx, char *data, size_t length,
+ crypto_data_t *out);
+
+/*
+ * The following definitions and declarations are only used by AES FIPS POST
+ */
+#ifdef _AES_IMPL
+
+typedef enum aes_mech_type {
+ AES_ECB_MECH_INFO_TYPE, /* SUN_CKM_AES_ECB */
+ AES_CBC_MECH_INFO_TYPE, /* SUN_CKM_AES_CBC */
+ AES_CBC_PAD_MECH_INFO_TYPE, /* SUN_CKM_AES_CBC_PAD */
+ AES_CTR_MECH_INFO_TYPE, /* SUN_CKM_AES_CTR */
+ AES_CCM_MECH_INFO_TYPE, /* SUN_CKM_AES_CCM */
+ AES_GCM_MECH_INFO_TYPE, /* SUN_CKM_AES_GCM */
+ AES_GMAC_MECH_INFO_TYPE /* SUN_CKM_AES_GMAC */
+} aes_mech_type_t;
+
+#endif /* _AES_IMPL */
+
+/*
+ * Methods used to define AES implementation
+ *
+ * @aes_generate_f	Key generation
+ * @aes_encrypt_f	Function encrypts one block
+ * @aes_decrypt_f	Function decrypts one block
+ * @aes_will_work_f	Function tests whether method will function
+ */
+typedef void (*aes_generate_f)(aes_key_t *, const uint32_t *, int);
+typedef void (*aes_encrypt_f)(const uint32_t[], int,
+ const uint32_t[4], uint32_t[4]);
+typedef void (*aes_decrypt_f)(const uint32_t[], int,
+ const uint32_t[4], uint32_t[4]);
+typedef boolean_t (*aes_will_work_f)(void);
+
+#define AES_IMPL_NAME_MAX (16)
+
+struct aes_impl_ops {
+ aes_generate_f generate;
+ aes_encrypt_f encrypt;
+ aes_decrypt_f decrypt;
+ aes_will_work_f is_supported;
+ boolean_t needs_byteswap;
+ char name[AES_IMPL_NAME_MAX];
+};
+
+extern const aes_impl_ops_t aes_generic_impl;
+#if defined(__x86_64)
+extern const aes_impl_ops_t aes_x86_64_impl;
+
+/* These functions are used to execute amd64 instructions for AMD or Intel: */
+extern int rijndael_key_setup_enc_amd64(uint32_t rk[],
+ const uint32_t cipherKey[], int keyBits);
+extern int rijndael_key_setup_dec_amd64(uint32_t rk[],
+ const uint32_t cipherKey[], int keyBits);
+extern void aes_encrypt_amd64(const uint32_t rk[], int Nr,
+ const uint32_t pt[4], uint32_t ct[4]);
+extern void aes_decrypt_amd64(const uint32_t rk[], int Nr,
+ const uint32_t ct[4], uint32_t pt[4]);
+#endif
+#if defined(__x86_64) && defined(HAVE_AES)
+extern const aes_impl_ops_t aes_aesni_impl;
+#endif
+
+/*
+ * Initializes fastest implementation
+ */
+void aes_impl_init(void);
+
+/*
+ * Returns optimal allowed AES implementation
+ */
+const struct aes_impl_ops *aes_impl_get_ops(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _AES_IMPL_H */
diff --git a/sys/contrib/openzfs/module/icp/include/modes/gcm_impl.h b/sys/contrib/openzfs/module/icp/include/modes/gcm_impl.h
new file mode 100644
index 000000000000..28c8f63a7d46
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/include/modes/gcm_impl.h
@@ -0,0 +1,75 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef _GCM_IMPL_H
+#define _GCM_IMPL_H
+
+/*
+ * GCM function dispatcher.
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <sys/zfs_context.h>
+#include <sys/crypto/common.h>
+
+/*
+ * Methods used to define GCM implementation
+ *
+ * @gcm_mul_f Perform carry-less multiplication
+ * @gcm_will_work_f Function tests whether implementation will function
+ */
+typedef void (*gcm_mul_f)(uint64_t *, uint64_t *, uint64_t *);
+typedef boolean_t (*gcm_will_work_f)(void);
+
+#define GCM_IMPL_NAME_MAX (16)
+
+typedef struct gcm_impl_ops {
+ gcm_mul_f mul;
+ gcm_will_work_f is_supported;
+ char name[GCM_IMPL_NAME_MAX];
+} gcm_impl_ops_t;
+
+extern const gcm_impl_ops_t gcm_generic_impl;
+#if defined(__x86_64) && defined(HAVE_PCLMULQDQ)
+extern const gcm_impl_ops_t gcm_pclmulqdq_impl;
+#endif
+
+/*
+ * Initializes fastest implementation
+ */
+void gcm_impl_init(void);
+
+/*
+ * Returns optimal allowed GCM implementation
+ */
+const struct gcm_impl_ops *gcm_impl_get_ops(void);
+
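+/*
+ * For illustration (a sketch; variable names are hypothetical): the
+ * dispatcher above is used by selecting an implementation once and then
+ * calling through its ops vector:
+ *
+ *	const gcm_impl_ops_t *ops = gcm_impl_get_ops();
+ *	uint64_t ghash[2], H[2];
+ *
+ *	ops->mul(ghash, H, ghash);
+ */
+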
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _GCM_IMPL_H */
diff --git a/sys/contrib/openzfs/module/icp/include/modes/modes.h b/sys/contrib/openzfs/module/icp/include/modes/modes.h
new file mode 100644
index 000000000000..ab71197542eb
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/include/modes/modes.h
@@ -0,0 +1,411 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _COMMON_CRYPTO_MODES_H
+#define _COMMON_CRYPTO_MODES_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <sys/zfs_context.h>
+#include <sys/crypto/common.h>
+#include <sys/crypto/impl.h>
+
+/*
+ * Does the build chain support all instructions needed for the GCM assembler
+ * routines? AVX support should imply AES-NI and PCLMULQDQ, but make sure
+ * anyhow.
+ */
+#if defined(__x86_64__) && defined(HAVE_AVX) && \
+ defined(HAVE_AES) && defined(HAVE_PCLMULQDQ)
+#define CAN_USE_GCM_ASM
+extern boolean_t gcm_avx_can_use_movbe;
+#endif
+
+#define ECB_MODE 0x00000002
+#define CBC_MODE 0x00000004
+#define CTR_MODE 0x00000008
+#define CCM_MODE 0x00000010
+#define GCM_MODE 0x00000020
+#define GMAC_MODE 0x00000040
+
+/*
+ * cc_keysched: Pointer to key schedule.
+ *
+ * cc_keysched_len: Length of the key schedule.
+ *
+ * cc_remainder: This is for residual data, i.e. data that can't
+ * be processed because there are too few bytes.
+ * Must wait until more data arrives.
+ *
+ * cc_remainder_len: Number of bytes in cc_remainder.
+ *
+ * cc_iv: Scratch buffer that sometimes contains the IV.
+ *
+ * cc_lastp: Pointer to previous block of ciphertext.
+ *
+ * cc_copy_to: Pointer to where encrypted residual data needs
+ * to be copied.
+ *
+ * cc_flags: PROVIDER_OWNS_KEY_SCHEDULE
+ * When a context is freed, it is necessary
+ * to know whether the key schedule was allocated
+ * by the caller, or internally, e.g. an init routine.
+ * If allocated by the latter, then it needs to be freed.
+ *
+ *			ECB_MODE, CBC_MODE, CTR_MODE, CCM_MODE,
+ *			GCM_MODE, or GMAC_MODE
+ */
+struct common_ctx {
+ void *cc_keysched;
+ size_t cc_keysched_len;
+ uint64_t cc_iv[2];
+ uint64_t cc_remainder[2];
+ size_t cc_remainder_len;
+ uint8_t *cc_lastp;
+ uint8_t *cc_copy_to;
+ uint32_t cc_flags;
+};
+
+typedef struct common_ctx common_ctx_t;
+
+typedef struct ecb_ctx {
+ struct common_ctx ecb_common;
+ uint64_t ecb_lastblock[2];
+} ecb_ctx_t;
+
+#define ecb_keysched ecb_common.cc_keysched
+#define ecb_keysched_len ecb_common.cc_keysched_len
+#define ecb_iv ecb_common.cc_iv
+#define ecb_remainder ecb_common.cc_remainder
+#define ecb_remainder_len ecb_common.cc_remainder_len
+#define ecb_lastp ecb_common.cc_lastp
+#define ecb_copy_to ecb_common.cc_copy_to
+#define ecb_flags ecb_common.cc_flags
+
+typedef struct cbc_ctx {
+ struct common_ctx cbc_common;
+ uint64_t cbc_lastblock[2];
+} cbc_ctx_t;
+
+#define cbc_keysched cbc_common.cc_keysched
+#define cbc_keysched_len cbc_common.cc_keysched_len
+#define cbc_iv cbc_common.cc_iv
+#define cbc_remainder cbc_common.cc_remainder
+#define cbc_remainder_len cbc_common.cc_remainder_len
+#define cbc_lastp cbc_common.cc_lastp
+#define cbc_copy_to cbc_common.cc_copy_to
+#define cbc_flags cbc_common.cc_flags
+
+/*
+ * ctr_lower_mask Bit-mask for lower 8 bytes of counter block.
+ * ctr_upper_mask Bit-mask for upper 8 bytes of counter block.
+ */
+typedef struct ctr_ctx {
+ struct common_ctx ctr_common;
+ uint64_t ctr_lower_mask;
+ uint64_t ctr_upper_mask;
+ uint32_t ctr_tmp[4];
+} ctr_ctx_t;
+
+/*
+ * ctr_cb Counter block.
+ */
+#define ctr_keysched ctr_common.cc_keysched
+#define ctr_keysched_len ctr_common.cc_keysched_len
+#define ctr_cb ctr_common.cc_iv
+#define ctr_remainder ctr_common.cc_remainder
+#define ctr_remainder_len ctr_common.cc_remainder_len
+#define ctr_lastp ctr_common.cc_lastp
+#define ctr_copy_to ctr_common.cc_copy_to
+#define ctr_flags ctr_common.cc_flags
+
+/*
+ *
+ * ccm_mac_len: Stores length of the MAC in CCM mode.
+ * ccm_mac_buf: Stores the intermediate value for MAC in CCM encrypt.
+ * In CCM decrypt, stores the input MAC value.
+ * ccm_data_len: Length of the plaintext for CCM mode encrypt, or
+ * length of the ciphertext for CCM mode decrypt.
+ * ccm_processed_data_len:
+ * Length of processed plaintext in CCM mode encrypt,
+ * or length of processed ciphertext for CCM mode decrypt.
+ * ccm_processed_mac_len:
+ * Length of MAC data accumulated in CCM mode decrypt.
+ *
+ * ccm_pt_buf: Only used in CCM mode decrypt. It stores the
+ * decrypted plaintext to be returned when
+ * MAC verification succeeds in decrypt_final.
+ * Memory for this should be allocated in the AES module.
+ *
+ */
+typedef struct ccm_ctx {
+ struct common_ctx ccm_common;
+ uint32_t ccm_tmp[4];
+ size_t ccm_mac_len;
+ uint64_t ccm_mac_buf[2];
+ size_t ccm_data_len;
+ size_t ccm_processed_data_len;
+ size_t ccm_processed_mac_len;
+ uint8_t *ccm_pt_buf;
+ uint64_t ccm_mac_input_buf[2];
+ uint64_t ccm_counter_mask;
+} ccm_ctx_t;
+
+#define ccm_keysched ccm_common.cc_keysched
+#define ccm_keysched_len ccm_common.cc_keysched_len
+#define ccm_cb ccm_common.cc_iv
+#define ccm_remainder ccm_common.cc_remainder
+#define ccm_remainder_len ccm_common.cc_remainder_len
+#define ccm_lastp ccm_common.cc_lastp
+#define ccm_copy_to ccm_common.cc_copy_to
+#define ccm_flags ccm_common.cc_flags
+
+/*
+ * gcm_tag_len: Length of authentication tag.
+ *
+ * gcm_ghash: Stores output from the GHASH function.
+ *
+ * gcm_processed_data_len:
+ * Length of processed plaintext (encrypt) or
+ * length of processed ciphertext (decrypt).
+ *
+ * gcm_pt_buf: Stores the decrypted plaintext returned by
+ * decrypt_final when the computed authentication
+ * tag matches the user supplied tag.
+ *
+ * gcm_pt_buf_len: Length of the plaintext buffer.
+ *
+ * gcm_H: Subkey.
+ *
+ * gcm_Htable: Pre-computed and pre-shifted H, H^2, ... H^6 for the
+ * Karatsuba Algorithm in host byte order.
+ *
+ * gcm_J0: Pre-counter block generated from the IV.
+ *
+ * gcm_len_a_len_c: 64-bit representations of the bit lengths of
+ * AAD and ciphertext.
+ *
+ * gcm_kmflag: Current value of kmflag. Used for allocating
+ * the plaintext buffer during decryption and a
+ * gcm_avx_chunk_size'd buffer for avx enabled encryption.
+ */
+typedef struct gcm_ctx {
+ struct common_ctx gcm_common;
+ size_t gcm_tag_len;
+ size_t gcm_processed_data_len;
+ size_t gcm_pt_buf_len;
+ uint32_t gcm_tmp[4];
+ /*
+ * The offset of gcm_Htable relative to gcm_ghash, (32), is hard coded
+ * in aesni-gcm-x86_64.S, so please don't change (or adjust there).
+ */
+ uint64_t gcm_ghash[2];
+ uint64_t gcm_H[2];
+#ifdef CAN_USE_GCM_ASM
+ uint64_t *gcm_Htable;
+ size_t gcm_htab_len;
+#endif
+ uint64_t gcm_J0[2];
+ uint64_t gcm_len_a_len_c[2];
+ uint8_t *gcm_pt_buf;
+ int gcm_kmflag;
+#ifdef CAN_USE_GCM_ASM
+ boolean_t gcm_use_avx;
+#endif
+} gcm_ctx_t;
+
+#define gcm_keysched gcm_common.cc_keysched
+#define gcm_keysched_len gcm_common.cc_keysched_len
+#define gcm_cb gcm_common.cc_iv
+#define gcm_remainder gcm_common.cc_remainder
+#define gcm_remainder_len gcm_common.cc_remainder_len
+#define gcm_lastp gcm_common.cc_lastp
+#define gcm_copy_to gcm_common.cc_copy_to
+#define gcm_flags gcm_common.cc_flags
+
+#define AES_GMAC_IV_LEN 12
+#define AES_GMAC_TAG_BITS 128
+
+typedef struct aes_ctx {
+ union {
+ ecb_ctx_t acu_ecb;
+ cbc_ctx_t acu_cbc;
+ ctr_ctx_t acu_ctr;
+ ccm_ctx_t acu_ccm;
+ gcm_ctx_t acu_gcm;
+ } acu;
+} aes_ctx_t;
+
+#define ac_flags acu.acu_ecb.ecb_common.cc_flags
+#define ac_remainder_len acu.acu_ecb.ecb_common.cc_remainder_len
+#define ac_keysched acu.acu_ecb.ecb_common.cc_keysched
+#define ac_keysched_len acu.acu_ecb.ecb_common.cc_keysched_len
+#define ac_iv acu.acu_ecb.ecb_common.cc_iv
+#define ac_lastp acu.acu_ecb.ecb_common.cc_lastp
+#define ac_pt_buf acu.acu_ccm.ccm_pt_buf
+#define ac_mac_len acu.acu_ccm.ccm_mac_len
+#define ac_data_len acu.acu_ccm.ccm_data_len
+#define ac_processed_mac_len acu.acu_ccm.ccm_processed_mac_len
+#define ac_processed_data_len acu.acu_ccm.ccm_processed_data_len
+#define ac_tag_len acu.acu_gcm.gcm_tag_len
+
+typedef struct blowfish_ctx {
+ union {
+ ecb_ctx_t bcu_ecb;
+ cbc_ctx_t bcu_cbc;
+ } bcu;
+} blowfish_ctx_t;
+
+#define bc_flags bcu.bcu_ecb.ecb_common.cc_flags
+#define bc_remainder_len bcu.bcu_ecb.ecb_common.cc_remainder_len
+#define bc_keysched bcu.bcu_ecb.ecb_common.cc_keysched
+#define bc_keysched_len bcu.bcu_ecb.ecb_common.cc_keysched_len
+#define bc_iv bcu.bcu_ecb.ecb_common.cc_iv
+#define bc_lastp bcu.bcu_ecb.ecb_common.cc_lastp
+
+typedef struct des_ctx {
+ union {
+ ecb_ctx_t dcu_ecb;
+ cbc_ctx_t dcu_cbc;
+ } dcu;
+} des_ctx_t;
+
+#define dc_flags dcu.dcu_ecb.ecb_common.cc_flags
+#define dc_remainder_len dcu.dcu_ecb.ecb_common.cc_remainder_len
+#define dc_keysched dcu.dcu_ecb.ecb_common.cc_keysched
+#define dc_keysched_len dcu.dcu_ecb.ecb_common.cc_keysched_len
+#define dc_iv dcu.dcu_ecb.ecb_common.cc_iv
+#define dc_lastp dcu.dcu_ecb.ecb_common.cc_lastp
+
+extern int ecb_cipher_contiguous_blocks(ecb_ctx_t *, char *, size_t,
+ crypto_data_t *, size_t, int (*cipher)(const void *, const uint8_t *,
+ uint8_t *));
+
+extern int cbc_encrypt_contiguous_blocks(cbc_ctx_t *, char *, size_t,
+ crypto_data_t *, size_t,
+ int (*encrypt)(const void *, const uint8_t *, uint8_t *),
+ void (*copy_block)(uint8_t *, uint8_t *),
+ void (*xor_block)(uint8_t *, uint8_t *));
+
+extern int cbc_decrypt_contiguous_blocks(cbc_ctx_t *, char *, size_t,
+ crypto_data_t *, size_t,
+ int (*decrypt)(const void *, const uint8_t *, uint8_t *),
+ void (*copy_block)(uint8_t *, uint8_t *),
+ void (*xor_block)(uint8_t *, uint8_t *));
+
+extern int ctr_mode_contiguous_blocks(ctr_ctx_t *, char *, size_t,
+ crypto_data_t *, size_t,
+ int (*cipher)(const void *, const uint8_t *, uint8_t *),
+ void (*xor_block)(uint8_t *, uint8_t *));
+
+extern int ccm_mode_encrypt_contiguous_blocks(ccm_ctx_t *, char *, size_t,
+ crypto_data_t *, size_t,
+ int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
+ void (*copy_block)(uint8_t *, uint8_t *),
+ void (*xor_block)(uint8_t *, uint8_t *));
+
+extern int ccm_mode_decrypt_contiguous_blocks(ccm_ctx_t *, char *, size_t,
+ crypto_data_t *, size_t,
+ int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
+ void (*copy_block)(uint8_t *, uint8_t *),
+ void (*xor_block)(uint8_t *, uint8_t *));
+
+extern int gcm_mode_encrypt_contiguous_blocks(gcm_ctx_t *, char *, size_t,
+ crypto_data_t *, size_t,
+ int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
+ void (*copy_block)(uint8_t *, uint8_t *),
+ void (*xor_block)(uint8_t *, uint8_t *));
+
+extern int gcm_mode_decrypt_contiguous_blocks(gcm_ctx_t *, char *, size_t,
+ crypto_data_t *, size_t,
+ int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
+ void (*copy_block)(uint8_t *, uint8_t *),
+ void (*xor_block)(uint8_t *, uint8_t *));
+
+int ccm_encrypt_final(ccm_ctx_t *, crypto_data_t *, size_t,
+ int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
+ void (*xor_block)(uint8_t *, uint8_t *));
+
+int gcm_encrypt_final(gcm_ctx_t *, crypto_data_t *, size_t,
+ int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
+ void (*copy_block)(uint8_t *, uint8_t *),
+ void (*xor_block)(uint8_t *, uint8_t *));
+
+extern int ccm_decrypt_final(ccm_ctx_t *, crypto_data_t *, size_t,
+ int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
+ void (*copy_block)(uint8_t *, uint8_t *),
+ void (*xor_block)(uint8_t *, uint8_t *));
+
+extern int gcm_decrypt_final(gcm_ctx_t *, crypto_data_t *, size_t,
+ int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
+ void (*xor_block)(uint8_t *, uint8_t *));
+
+extern int ctr_mode_final(ctr_ctx_t *, crypto_data_t *,
+ int (*encrypt_block)(const void *, const uint8_t *, uint8_t *));
+
+extern int cbc_init_ctx(cbc_ctx_t *, char *, size_t, size_t,
+ void (*copy_block)(uint8_t *, uint64_t *));
+
+extern int ctr_init_ctx(ctr_ctx_t *, ulong_t, uint8_t *,
+ void (*copy_block)(uint8_t *, uint8_t *));
+
+extern int ccm_init_ctx(ccm_ctx_t *, char *, int, boolean_t, size_t,
+ int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
+ void (*xor_block)(uint8_t *, uint8_t *));
+
+extern int gcm_init_ctx(gcm_ctx_t *, char *, size_t,
+ int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
+ void (*copy_block)(uint8_t *, uint8_t *),
+ void (*xor_block)(uint8_t *, uint8_t *));
+
+extern int gmac_init_ctx(gcm_ctx_t *, char *, size_t,
+ int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
+ void (*copy_block)(uint8_t *, uint8_t *),
+ void (*xor_block)(uint8_t *, uint8_t *));
+
+extern void calculate_ccm_mac(ccm_ctx_t *, uint8_t *,
+ int (*encrypt_block)(const void *, const uint8_t *, uint8_t *));
+
+extern void gcm_mul(uint64_t *, uint64_t *, uint64_t *);
+
+extern void crypto_init_ptrs(crypto_data_t *, void **, offset_t *);
+extern void crypto_get_ptrs(crypto_data_t *, void **, offset_t *,
+ uint8_t **, size_t *, uint8_t **, size_t);
+
+extern void *ecb_alloc_ctx(int);
+extern void *cbc_alloc_ctx(int);
+extern void *ctr_alloc_ctx(int);
+extern void *ccm_alloc_ctx(int);
+extern void *gcm_alloc_ctx(int);
+extern void *gmac_alloc_ctx(int);
+extern void crypto_free_mode_ctx(void *);
+extern void gcm_set_kmflag(gcm_ctx_t *, int);
+
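+/*
+ * For illustration (a sketch): the *_alloc_ctx() routines above pair with
+ * crypto_free_mode_ctx(), e.g. for ECB:
+ *
+ *	ecb_ctx_t *ctx = ecb_alloc_ctx(KM_SLEEP);
+ *
+ *	if (ctx != NULL) {
+ *		... initialize and use the context ...
+ *		crypto_free_mode_ctx(ctx);
+ *	}
+ */
+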
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _COMMON_CRYPTO_MODES_H */
diff --git a/sys/contrib/openzfs/module/icp/include/sha1/sha1.h b/sys/contrib/openzfs/module/icp/include/sha1/sha1.h
new file mode 100644
index 000000000000..251b64fcaeee
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/include/sha1/sha1.h
@@ -0,0 +1,61 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_SHA1_H
+#define _SYS_SHA1_H
+
+#include <sys/types.h> /* for uint_* */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * NOTE: n2rng (Niagara2 RNG driver) accesses the state field of
+ * SHA1_CTX directly. NEVER change this structure without verifying
+ * compatibility with n2rng. The important thing is that the state
+ * must be in a field declared as uint32_t state[5].
+ */
+/* SHA-1 context. */
+typedef struct {
+ uint32_t state[5]; /* state (ABCDE) */
+ uint32_t count[2]; /* number of bits, modulo 2^64 (msb first) */
+ union {
+ uint8_t buf8[64]; /* undigested input */
+ uint32_t buf32[16]; /* realigned input */
+ } buf_un;
+} SHA1_CTX;
+
+#define SHA1_DIGEST_LENGTH 20
+
+void SHA1Init(SHA1_CTX *);
+void SHA1Update(SHA1_CTX *, const void *, size_t);
+void SHA1Final(void *, SHA1_CTX *);
+
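+/*
+ * For illustration (a sketch; buf and buflen are hypothetical): one-shot
+ * digest computation with the functions declared above:
+ *
+ *	SHA1_CTX ctx;
+ *	uint8_t digest[SHA1_DIGEST_LENGTH];
+ *
+ *	SHA1Init(&ctx);
+ *	SHA1Update(&ctx, buf, buflen);
+ *	SHA1Final(digest, &ctx);
+ */
+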
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_SHA1_H */
diff --git a/sys/contrib/openzfs/module/icp/include/sha1/sha1_consts.h b/sys/contrib/openzfs/module/icp/include/sha1/sha1_consts.h
new file mode 100644
index 000000000000..848d25ef050f
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/include/sha1/sha1_consts.h
@@ -0,0 +1,65 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 1998, by Sun Microsystems, Inc.
+ * All rights reserved.
+ */
+
+#ifndef _SYS_SHA1_CONSTS_H
+#define _SYS_SHA1_CONSTS_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * as explained in sha1.c, loading 32-bit constants on a sparc is expensive
+ * since it involves both a `sethi' and an `or'. thus, we instead use `ld'
+ * to load the constants from an array called `sha1_consts'. however, on
+ * intel (and perhaps other processors), it is cheaper to load the constant
+ * directly. thus, the c code in SHA1Transform() uses the macro SHA1_CONST()
+ * which either expands to a constant or an array reference, depending on
+ * the architecture the code is being compiled for.
+ */
+
+#include <sys/types.h> /* uint32_t */
+
+extern const uint32_t sha1_consts[];
+
+#if defined(__sparc)
+#define SHA1_CONST(x) (sha1_consts[x])
+#else
+#define SHA1_CONST(x) (SHA1_CONST_ ## x)
+#endif
+
+/* constants, as provided in FIPS 180-1 */
+
+#define SHA1_CONST_0 0x5a827999U
+#define SHA1_CONST_1 0x6ed9eba1U
+#define SHA1_CONST_2 0x8f1bbcdcU
+#define SHA1_CONST_3 0xca62c1d6U
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_SHA1_CONSTS_H */
diff --git a/sys/contrib/openzfs/module/icp/include/sha1/sha1_impl.h b/sys/contrib/openzfs/module/icp/include/sha1/sha1_impl.h
new file mode 100644
index 000000000000..1c1f8728f9b5
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/include/sha1/sha1_impl.h
@@ -0,0 +1,73 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SHA1_IMPL_H
+#define _SHA1_IMPL_H
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define SHA1_HASH_SIZE 20 /* SHA_1 digest length in bytes */
+#define SHA1_DIGEST_LENGTH 20 /* SHA1 digest length in bytes */
+#define SHA1_HMAC_BLOCK_SIZE 64 /* SHA1-HMAC block size */
+#define SHA1_HMAC_MIN_KEY_LEN 1 /* SHA1-HMAC min key length in bytes */
+#define SHA1_HMAC_MAX_KEY_LEN INT_MAX /* SHA1-HMAC max key length in bytes */
+#define SHA1_HMAC_INTS_PER_BLOCK (SHA1_HMAC_BLOCK_SIZE/sizeof (uint32_t))
+
+/*
+ * CSPI information (entry points, provider info, etc.)
+ */
+typedef enum sha1_mech_type {
+ SHA1_MECH_INFO_TYPE, /* SUN_CKM_SHA1 */
+ SHA1_HMAC_MECH_INFO_TYPE, /* SUN_CKM_SHA1_HMAC */
+ SHA1_HMAC_GEN_MECH_INFO_TYPE /* SUN_CKM_SHA1_HMAC_GENERAL */
+} sha1_mech_type_t;
+
+/*
+ * Context for SHA1 mechanism.
+ */
+typedef struct sha1_ctx {
+ sha1_mech_type_t sc_mech_type; /* type of context */
+ SHA1_CTX sc_sha1_ctx; /* SHA1 context */
+} sha1_ctx_t;
+
+/*
+ * Context for SHA1-HMAC and SHA1-HMAC-GENERAL mechanisms.
+ */
+typedef struct sha1_hmac_ctx {
+ sha1_mech_type_t hc_mech_type; /* type of context */
+ uint32_t hc_digest_len; /* digest len in bytes */
+ SHA1_CTX hc_icontext; /* inner SHA1 context */
+ SHA1_CTX hc_ocontext; /* outer SHA1 context */
+} sha1_hmac_ctx_t;
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SHA1_IMPL_H */
diff --git a/sys/contrib/openzfs/module/icp/include/sha2/sha2_consts.h b/sys/contrib/openzfs/module/icp/include/sha2/sha2_consts.h
new file mode 100644
index 000000000000..3a6645508fe9
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/include/sha2/sha2_consts.h
@@ -0,0 +1,219 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_SHA2_CONSTS_H
+#define _SYS_SHA2_CONSTS_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Loading 32-bit constants on a sparc is expensive since it involves both
+ * a `sethi' and an `or'. thus, we instead use `ld' to load the constants
+ * from an array called `sha2_consts'. however, on intel (and perhaps other
+ * processors), it is cheaper to load the constant directly. thus, the c
+ * code in SHA transform functions uses the macro SHA2_CONST() which either
+ * expands to a constant or an array reference, depending on
+ * the architecture the code is being compiled for.
+ *
+ * SHA512 constants are used for SHA384
+ */
+
+#include <sys/types.h> /* uint32_t */
+
+extern const uint32_t sha256_consts[];
+extern const uint64_t sha512_consts[];
+
+#if defined(__sparc)
+#define SHA256_CONST(x) (sha256_consts[x])
+#define SHA512_CONST(x) (sha512_consts[x])
+#else
+#define SHA256_CONST(x) (SHA256_CONST_ ## x)
+#define SHA512_CONST(x) (SHA512_CONST_ ## x)
+#endif
+
+/* constants, as provided in FIPS 180-2 */
+
+#define SHA256_CONST_0 0x428a2f98U
+#define SHA256_CONST_1 0x71374491U
+#define SHA256_CONST_2 0xb5c0fbcfU
+#define SHA256_CONST_3 0xe9b5dba5U
+#define SHA256_CONST_4 0x3956c25bU
+#define SHA256_CONST_5 0x59f111f1U
+#define SHA256_CONST_6 0x923f82a4U
+#define SHA256_CONST_7 0xab1c5ed5U
+
+#define SHA256_CONST_8 0xd807aa98U
+#define SHA256_CONST_9 0x12835b01U
+#define SHA256_CONST_10 0x243185beU
+#define SHA256_CONST_11 0x550c7dc3U
+#define SHA256_CONST_12 0x72be5d74U
+#define SHA256_CONST_13 0x80deb1feU
+#define SHA256_CONST_14 0x9bdc06a7U
+#define SHA256_CONST_15 0xc19bf174U
+
+#define SHA256_CONST_16 0xe49b69c1U
+#define SHA256_CONST_17 0xefbe4786U
+#define SHA256_CONST_18 0x0fc19dc6U
+#define SHA256_CONST_19 0x240ca1ccU
+#define SHA256_CONST_20 0x2de92c6fU
+#define SHA256_CONST_21 0x4a7484aaU
+#define SHA256_CONST_22 0x5cb0a9dcU
+#define SHA256_CONST_23 0x76f988daU
+
+#define SHA256_CONST_24 0x983e5152U
+#define SHA256_CONST_25 0xa831c66dU
+#define SHA256_CONST_26 0xb00327c8U
+#define SHA256_CONST_27 0xbf597fc7U
+#define SHA256_CONST_28 0xc6e00bf3U
+#define SHA256_CONST_29 0xd5a79147U
+#define SHA256_CONST_30 0x06ca6351U
+#define SHA256_CONST_31 0x14292967U
+
+#define SHA256_CONST_32 0x27b70a85U
+#define SHA256_CONST_33 0x2e1b2138U
+#define SHA256_CONST_34 0x4d2c6dfcU
+#define SHA256_CONST_35 0x53380d13U
+#define SHA256_CONST_36 0x650a7354U
+#define SHA256_CONST_37 0x766a0abbU
+#define SHA256_CONST_38 0x81c2c92eU
+#define SHA256_CONST_39 0x92722c85U
+
+#define SHA256_CONST_40 0xa2bfe8a1U
+#define SHA256_CONST_41 0xa81a664bU
+#define SHA256_CONST_42 0xc24b8b70U
+#define SHA256_CONST_43 0xc76c51a3U
+#define SHA256_CONST_44 0xd192e819U
+#define SHA256_CONST_45 0xd6990624U
+#define SHA256_CONST_46 0xf40e3585U
+#define SHA256_CONST_47 0x106aa070U
+
+#define SHA256_CONST_48 0x19a4c116U
+#define SHA256_CONST_49 0x1e376c08U
+#define SHA256_CONST_50 0x2748774cU
+#define SHA256_CONST_51 0x34b0bcb5U
+#define SHA256_CONST_52 0x391c0cb3U
+#define SHA256_CONST_53 0x4ed8aa4aU
+#define SHA256_CONST_54 0x5b9cca4fU
+#define SHA256_CONST_55 0x682e6ff3U
+
+#define SHA256_CONST_56 0x748f82eeU
+#define SHA256_CONST_57 0x78a5636fU
+#define SHA256_CONST_58 0x84c87814U
+#define SHA256_CONST_59 0x8cc70208U
+#define SHA256_CONST_60 0x90befffaU
+#define SHA256_CONST_61 0xa4506cebU
+#define SHA256_CONST_62 0xbef9a3f7U
+#define SHA256_CONST_63 0xc67178f2U
+
+#define SHA512_CONST_0 0x428a2f98d728ae22ULL
+#define SHA512_CONST_1 0x7137449123ef65cdULL
+#define SHA512_CONST_2 0xb5c0fbcfec4d3b2fULL
+#define SHA512_CONST_3 0xe9b5dba58189dbbcULL
+#define SHA512_CONST_4 0x3956c25bf348b538ULL
+#define SHA512_CONST_5 0x59f111f1b605d019ULL
+#define SHA512_CONST_6 0x923f82a4af194f9bULL
+#define SHA512_CONST_7 0xab1c5ed5da6d8118ULL
+#define SHA512_CONST_8 0xd807aa98a3030242ULL
+#define SHA512_CONST_9 0x12835b0145706fbeULL
+#define SHA512_CONST_10 0x243185be4ee4b28cULL
+#define SHA512_CONST_11 0x550c7dc3d5ffb4e2ULL
+#define SHA512_CONST_12 0x72be5d74f27b896fULL
+#define SHA512_CONST_13 0x80deb1fe3b1696b1ULL
+#define SHA512_CONST_14 0x9bdc06a725c71235ULL
+#define SHA512_CONST_15 0xc19bf174cf692694ULL
+#define SHA512_CONST_16 0xe49b69c19ef14ad2ULL
+#define SHA512_CONST_17 0xefbe4786384f25e3ULL
+#define SHA512_CONST_18 0x0fc19dc68b8cd5b5ULL
+#define SHA512_CONST_19 0x240ca1cc77ac9c65ULL
+#define SHA512_CONST_20 0x2de92c6f592b0275ULL
+#define SHA512_CONST_21 0x4a7484aa6ea6e483ULL
+#define SHA512_CONST_22 0x5cb0a9dcbd41fbd4ULL
+#define SHA512_CONST_23 0x76f988da831153b5ULL
+#define SHA512_CONST_24 0x983e5152ee66dfabULL
+#define SHA512_CONST_25 0xa831c66d2db43210ULL
+#define SHA512_CONST_26 0xb00327c898fb213fULL
+#define SHA512_CONST_27 0xbf597fc7beef0ee4ULL
+#define SHA512_CONST_28 0xc6e00bf33da88fc2ULL
+#define SHA512_CONST_29 0xd5a79147930aa725ULL
+#define SHA512_CONST_30 0x06ca6351e003826fULL
+#define SHA512_CONST_31 0x142929670a0e6e70ULL
+#define SHA512_CONST_32 0x27b70a8546d22ffcULL
+#define SHA512_CONST_33 0x2e1b21385c26c926ULL
+#define SHA512_CONST_34 0x4d2c6dfc5ac42aedULL
+#define SHA512_CONST_35 0x53380d139d95b3dfULL
+#define SHA512_CONST_36 0x650a73548baf63deULL
+#define SHA512_CONST_37 0x766a0abb3c77b2a8ULL
+#define SHA512_CONST_38 0x81c2c92e47edaee6ULL
+#define SHA512_CONST_39 0x92722c851482353bULL
+#define SHA512_CONST_40 0xa2bfe8a14cf10364ULL
+#define SHA512_CONST_41 0xa81a664bbc423001ULL
+#define SHA512_CONST_42 0xc24b8b70d0f89791ULL
+#define SHA512_CONST_43 0xc76c51a30654be30ULL
+#define SHA512_CONST_44 0xd192e819d6ef5218ULL
+#define SHA512_CONST_45 0xd69906245565a910ULL
+#define SHA512_CONST_46 0xf40e35855771202aULL
+#define SHA512_CONST_47 0x106aa07032bbd1b8ULL
+#define SHA512_CONST_48 0x19a4c116b8d2d0c8ULL
+#define SHA512_CONST_49 0x1e376c085141ab53ULL
+#define SHA512_CONST_50 0x2748774cdf8eeb99ULL
+#define SHA512_CONST_51 0x34b0bcb5e19b48a8ULL
+#define SHA512_CONST_52 0x391c0cb3c5c95a63ULL
+#define SHA512_CONST_53 0x4ed8aa4ae3418acbULL
+#define SHA512_CONST_54 0x5b9cca4f7763e373ULL
+#define SHA512_CONST_55 0x682e6ff3d6b2b8a3ULL
+#define SHA512_CONST_56 0x748f82ee5defb2fcULL
+#define SHA512_CONST_57 0x78a5636f43172f60ULL
+#define SHA512_CONST_58 0x84c87814a1f0ab72ULL
+#define SHA512_CONST_59 0x8cc702081a6439ecULL
+#define SHA512_CONST_60 0x90befffa23631e28ULL
+#define SHA512_CONST_61 0xa4506cebde82bde9ULL
+#define SHA512_CONST_62 0xbef9a3f7b2c67915ULL
+#define SHA512_CONST_63 0xc67178f2e372532bULL
+#define SHA512_CONST_64 0xca273eceea26619cULL
+#define SHA512_CONST_65 0xd186b8c721c0c207ULL
+#define SHA512_CONST_66 0xeada7dd6cde0eb1eULL
+#define SHA512_CONST_67 0xf57d4f7fee6ed178ULL
+#define SHA512_CONST_68 0x06f067aa72176fbaULL
+#define SHA512_CONST_69 0x0a637dc5a2c898a6ULL
+#define SHA512_CONST_70 0x113f9804bef90daeULL
+#define SHA512_CONST_71 0x1b710b35131c471bULL
+#define SHA512_CONST_72 0x28db77f523047d84ULL
+#define SHA512_CONST_73 0x32caab7b40c72493ULL
+#define SHA512_CONST_74 0x3c9ebe0a15c9bebcULL
+#define SHA512_CONST_75 0x431d67c49c100d4cULL
+#define SHA512_CONST_76 0x4cc5d4becb3e42b6ULL
+#define SHA512_CONST_77 0x597f299cfc657e2aULL
+#define SHA512_CONST_78 0x5fcb6fab3ad6faecULL
+#define SHA512_CONST_79 0x6c44198c4a475817ULL
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_SHA2_CONSTS_H */
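
A minimal sketch of how the SHA256_CONST() indirection above is typically consumed by an unrolled round; ROTR, SIGMA1, CH and SHA256_T1 are illustrative names, not part of this header. Because the non-sparc variant token-pastes its argument, the round number must be a literal at expansion time.

#define ROTR(x, n)	(((x) >> (n)) | ((x) << (32 - (n))))
#define SIGMA1(x)	(ROTR(x, 6) ^ ROTR(x, 11) ^ ROTR(x, 25))
#define CH(x, y, z)	(((x) & (y)) ^ (~(x) & (z)))

/* T1 term of FIPS 180-2 round t: h + Sigma1(e) + Ch(e, f, g) + K[t] + W[t] */
#define SHA256_T1(e, f, g, h, t, W) \
	((h) + SIGMA1(e) + CH(e, f, g) + SHA256_CONST(t) + (W)[t])

/* e.g. SHA256_T1(e, f, g, h, 0, w) picks up SHA256_CONST_0 == 0x428a2f98U */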
diff --git a/sys/contrib/openzfs/module/icp/include/sha2/sha2_impl.h b/sys/contrib/openzfs/module/icp/include/sha2/sha2_impl.h
new file mode 100644
index 000000000000..b9768d344e95
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/include/sha2/sha2_impl.h
@@ -0,0 +1,64 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SHA2_IMPL_H
+#define _SHA2_IMPL_H
+
+#include <sys/sha2.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef enum {
+ SHA1_TYPE,
+ SHA256_TYPE,
+ SHA384_TYPE,
+ SHA512_TYPE
+} sha2_mech_t;
+
+/*
+ * Context for SHA2 mechanism.
+ */
+typedef struct sha2_ctx {
+ sha2_mech_type_t sc_mech_type; /* type of context */
+ SHA2_CTX sc_sha2_ctx; /* SHA2 context */
+} sha2_ctx_t;
+
+/*
+ * Context for SHA2 HMAC and HMAC GENERAL mechanisms.
+ */
+typedef struct sha2_hmac_ctx {
+ sha2_mech_type_t hc_mech_type; /* type of context */
+ uint32_t hc_digest_len; /* digest len in bytes */
+ SHA2_CTX hc_icontext; /* inner SHA2 context */
+ SHA2_CTX hc_ocontext; /* outer SHA2 context */
+} sha2_hmac_ctx_t;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SHA2_IMPL_H */
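
A minimal sketch of why sha2_hmac_ctx_t carries two contexts, assuming the SHA2Init()/SHA2Update() entry points declared in sys/sha2.h and a key already reduced to the 64-byte SHA-256 block size (longer keys are digested first); the function name is illustrative, not part of this header.

static void
sha2_hmac_init_sketch(sha2_hmac_ctx_t *ctx, const uint8_t key[64])
{
	uint8_t ipad[64], opad[64];
	int i;

	for (i = 0; i < 64; i++) {
		ipad[i] = key[i] ^ 0x36;	/* inner pad */
		opad[i] = key[i] ^ 0x5c;	/* outer pad */
	}

	/* hc_icontext absorbs ipad || message; hc_ocontext absorbs opad */
	SHA2Init(SHA256_MECH_INFO_TYPE, &ctx->hc_icontext);
	SHA2Update(&ctx->hc_icontext, ipad, sizeof (ipad));

	SHA2Init(SHA256_MECH_INFO_TYPE, &ctx->hc_ocontext);
	SHA2Update(&ctx->hc_ocontext, opad, sizeof (opad));

	/* later: the inner digest is finalized, then fed to hc_ocontext */
}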
diff --git a/sys/contrib/openzfs/module/icp/include/sys/asm_linkage.h b/sys/contrib/openzfs/module/icp/include/sys/asm_linkage.h
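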
new file mode 100644
index 000000000000..49a494b46e0b
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/include/sys/asm_linkage.h
@@ -0,0 +1,46 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_ASM_LINKAGE_H
+#define _SYS_ASM_LINKAGE_H
+
+#if defined(__i386) || defined(__amd64)
+
+#include <sys/ia32/asm_linkage.h> /* XX64 x86/sys/asm_linkage.h */
+
+#endif
+
+#if defined(_KERNEL) && defined(HAVE_KERNEL_OBJTOOL)
+
+#include <asm/frame.h>
+
+#else /* userspace */
+#define FRAME_BEGIN
+#define FRAME_END
+#endif
+
+
+#endif /* _SYS_ASM_LINKAGE_H */
diff --git a/sys/contrib/openzfs/module/icp/include/sys/bitmap.h b/sys/contrib/openzfs/module/icp/include/sys/bitmap.h
new file mode 100644
index 000000000000..4e86ee70ed9e
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/include/sys/bitmap.h
@@ -0,0 +1,183 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
+
+
+#ifndef _SYS_BITMAP_H
+#define _SYS_BITMAP_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if defined(__GNUC__) && defined(_ASM_INLINES) && \
+ (defined(__i386) || defined(__amd64))
+#include <asm/bitmap.h>
+#endif
+
+/*
+ * Operations on bitmaps of arbitrary size
+ * A bitmap is a vector of 1 or more ulong_t's.
+ * The user of the package is responsible for range checks and keeping
+ * track of sizes.
+ */
+
+#ifdef _LP64
+#define BT_ULSHIFT 6 /* log base 2 of BT_NBIPUL, to extract word index */
+#define BT_ULSHIFT32 5 /* log base 2 of BT_NBIPUL32, to extract word index */
+#else
+#define BT_ULSHIFT 5 /* log base 2 of BT_NBIPUL, to extract word index */
+#endif
+
+#define BT_NBIPUL (1 << BT_ULSHIFT) /* n bits per ulong_t */
+#define BT_ULMASK (BT_NBIPUL - 1) /* to extract bit index */
+
+#ifdef _LP64
+#define BT_NBIPUL32 (1 << BT_ULSHIFT32) /* n bits per ulong_t */
+#define BT_ULMASK32 (BT_NBIPUL32 - 1) /* to extract bit index */
+#define BT_ULMAXMASK 0xffffffffffffffff /* used by bt_getlowbit */
+#else
+#define BT_ULMAXMASK 0xffffffff
+#endif
+
+/*
+ * bitmap is a ulong_t *, bitindex an index_t
+ *
+ * The macros BT_WIM and BT_BIW are internal; there is no need
+ * for users of this package to use them.
+ */
+
+/*
+ * word in map
+ */
+#define BT_WIM(bitmap, bitindex) \
+ ((bitmap)[(bitindex) >> BT_ULSHIFT])
+/*
+ * bit in word
+ */
+#define BT_BIW(bitindex) \
+ (1UL << ((bitindex) & BT_ULMASK))
+
+#ifdef _LP64
+#define BT_WIM32(bitmap, bitindex) \
+ ((bitmap)[(bitindex) >> BT_ULSHIFT32])
+
+#define BT_BIW32(bitindex) \
+ (1UL << ((bitindex) & BT_ULMASK32))
+#endif
+
+/*
+ * These are public macros
+ *
+ * BT_BITOUL == n bits to n ulong_t's
+ */
+#define BT_BITOUL(nbits) \
+ (((nbits) + BT_NBIPUL - 1l) / BT_NBIPUL)
+#define BT_SIZEOFMAP(nbits) \
+ (BT_BITOUL(nbits) * sizeof (ulong_t))
+#define BT_TEST(bitmap, bitindex) \
+ ((BT_WIM((bitmap), (bitindex)) & BT_BIW(bitindex)) ? 1 : 0)
+#define BT_SET(bitmap, bitindex) \
+ { BT_WIM((bitmap), (bitindex)) |= BT_BIW(bitindex); }
+#define BT_CLEAR(bitmap, bitindex) \
+ { BT_WIM((bitmap), (bitindex)) &= ~BT_BIW(bitindex); }
+
+#ifdef _LP64
+#define BT_BITOUL32(nbits) \
+ (((nbits) + BT_NBIPUL32 - 1l) / BT_NBIPUL32)
+#define BT_SIZEOFMAP32(nbits) \
+ (BT_BITOUL32(nbits) * sizeof (uint_t))
+#define BT_TEST32(bitmap, bitindex) \
+ ((BT_WIM32((bitmap), (bitindex)) & BT_BIW32(bitindex)) ? 1 : 0)
+#define BT_SET32(bitmap, bitindex) \
+ { BT_WIM32((bitmap), (bitindex)) |= BT_BIW32(bitindex); }
+#define BT_CLEAR32(bitmap, bitindex) \
+ { BT_WIM32((bitmap), (bitindex)) &= ~BT_BIW32(bitindex); }
+#endif /* _LP64 */
+
+
+/*
+ * BIT_ONLYONESET is a private macro not designed for bitmaps of
+ * arbitrary size. u must be an unsigned integer/long. It returns
+ * true if one and only one bit is set in u.
+ */
+#define BIT_ONLYONESET(u) \
+ ((((u) == 0) ? 0 : ((u) & ((u) - 1)) == 0))
+
+#ifndef _ASM
+
+/*
+ * return next available bit index from map with specified number of bits
+ */
+extern index_t bt_availbit(ulong_t *bitmap, size_t nbits);
+/*
+ * find the highest order bit that is on, and is within or below
+ * the word specified by wx
+ */
+extern int bt_gethighbit(ulong_t *mapp, int wx);
+extern int bt_range(ulong_t *bitmap, size_t *pos1, size_t *pos2,
+ size_t end_pos);
+extern int bt_getlowbit(ulong_t *bitmap, size_t start, size_t stop);
+extern void bt_copy(ulong_t *, ulong_t *, ulong_t);
+
+/*
+ * find the parity
+ */
+extern int odd_parity(ulong_t);
+
+/*
+ * Atomically set/clear bits
+ * Atomic exclusive operations will set "result" to "-1"
+ * if the bit is already set/cleared. "result" will be set
+ * to 0 otherwise.
+ */
+#define BT_ATOMIC_SET(bitmap, bitindex) \
+ { atomic_or_ulong(&(BT_WIM(bitmap, bitindex)), BT_BIW(bitindex)); }
+#define BT_ATOMIC_CLEAR(bitmap, bitindex) \
+ { atomic_and_ulong(&(BT_WIM(bitmap, bitindex)), ~BT_BIW(bitindex)); }
+
+#define BT_ATOMIC_SET_EXCL(bitmap, bitindex, result) \
+ { result = atomic_set_long_excl(&(BT_WIM(bitmap, bitindex)), \
+ (bitindex) % BT_NBIPUL); }
+#define BT_ATOMIC_CLEAR_EXCL(bitmap, bitindex, result) \
+ { result = atomic_clear_long_excl(&(BT_WIM(bitmap, bitindex)), \
+ (bitindex) % BT_NBIPUL); }
+
+/*
+ * Extracts bits between index h (high, inclusive) and l (low, inclusive) from
+ * u, which must be an unsigned integer.
+ */
+#define BITX(u, h, l) (((u) >> (l)) & ((1LU << ((h) - (l) + 1LU)) - 1LU))
+
+#endif /* _ASM */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_BITMAP_H */
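
A small usage sketch of the public macros above, assuming only this header and sys/types.h; the function name is illustrative.

static int
bitmap_usage_sketch(void)
{
	ulong_t map[BT_BITOUL(128)] = { 0 };	/* a 128-bit map */
	int ok = 1;

	BT_SET(map, 70);			/* word 1, bit 6 on LP64 */
	ok &= (BT_TEST(map, 70) == 1);
	BT_CLEAR(map, 70);
	ok &= (BT_TEST(map, 70) == 0);

	ok &= (BITX(0xa5U, 7, 4) == 0xa);	/* bits 7..4 of 1010 0101 */
	ok &= BIT_ONLYONESET(0x40U);		/* exactly one bit set */
	ok &= !BIT_ONLYONESET(0x41U);		/* two bits set */

	return (ok);
}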
diff --git a/sys/contrib/openzfs/module/icp/include/sys/crypto/elfsign.h b/sys/contrib/openzfs/module/icp/include/sys/crypto/elfsign.h
new file mode 100644
index 000000000000..5432f0c8d607
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/include/sys/crypto/elfsign.h
@@ -0,0 +1,137 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_CRYPTO_ELFSIGN_H
+#define _SYS_CRYPTO_ELFSIGN_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Consolidation Private Interface for elfsign/libpkcs11/kcfd
+ */
+
+#include <sys/zfs_context.h>
+
+/*
+ * Project Private structures and types used for communication between kcfd
+ * and KCF over the door.
+ */
+
+typedef enum ELFsign_status_e {
+ ELFSIGN_UNKNOWN,
+ ELFSIGN_SUCCESS,
+ ELFSIGN_FAILED,
+ ELFSIGN_NOTSIGNED,
+ ELFSIGN_INVALID_CERTPATH,
+ ELFSIGN_INVALID_ELFOBJ,
+ ELFSIGN_RESTRICTED
+} ELFsign_status_t;
+
+#define KCF_KCFD_VERSION1 1
+#define SIG_MAX_LENGTH 1024
+
+#define ELF_SIGNATURE_SECTION ".SUNW_signature"
+
+typedef struct kcf_door_arg_s {
+ short da_version;
+ boolean_t da_iskernel;
+
+ union {
+ char filename[MAXPATHLEN]; /* For request */
+
+ struct kcf_door_result_s { /* For response */
+ ELFsign_status_t status;
+ uint32_t siglen;
+ uchar_t signature[1];
+ } result;
+ } da_u;
+} kcf_door_arg_t;
+
+typedef uint32_t filesig_vers_t;
+
+/*
+ * File Signature Structure
+ * Applicable to ELF and other file formats
+ */
+struct filesignatures {
+ uint32_t filesig_cnt; /* count of signatures */
+ uint32_t filesig_pad; /* unused */
+ union {
+ char filesig_data[1];
+ struct filesig { /* one of these for each signature */
+ uint32_t filesig_size;
+ filesig_vers_t filesig_version;
+ union {
+ struct filesig_version1 {
+ uint32_t filesig_v1_dnsize;
+ uint32_t filesig_v1_sigsize;
+ uint32_t filesig_v1_oidsize;
+ char filesig_v1_data[1];
+ } filesig_v1;
+ struct filesig_version3 {
+ uint64_t filesig_v3_time;
+ uint32_t filesig_v3_dnsize;
+ uint32_t filesig_v3_sigsize;
+ uint32_t filesig_v3_oidsize;
+ char filesig_v3_data[1];
+ } filesig_v3;
+ } _u2;
+ } filesig_sig;
+ uint64_t filesig_align;
+ } _u1;
+};
+#define filesig_sig _u1.filesig_sig
+
+#define filesig_v1_dnsize _u2.filesig_v1.filesig_v1_dnsize
+#define filesig_v1_sigsize _u2.filesig_v1.filesig_v1_sigsize
+#define filesig_v1_oidsize _u2.filesig_v1.filesig_v1_oidsize
+#define filesig_v1_data _u2.filesig_v1.filesig_v1_data
+
+#define filesig_v3_time _u2.filesig_v3.filesig_v3_time
+#define filesig_v3_dnsize _u2.filesig_v3.filesig_v3_dnsize
+#define filesig_v3_sigsize _u2.filesig_v3.filesig_v3_sigsize
+#define filesig_v3_oidsize _u2.filesig_v3.filesig_v3_oidsize
+#define filesig_v3_data _u2.filesig_v3.filesig_v3_data
+
+#define filesig_ALIGN(s) (((s) + sizeof (uint64_t) - 1) & \
+ (-sizeof (uint64_t)))
+#define filesig_next(ptr) (struct filesig *)((void *)((char *)(ptr) + \
+ filesig_ALIGN((ptr)->filesig_size)))
+
+#define FILESIG_UNKNOWN 0 /* unrecognized version */
+#define FILESIG_VERSION1 1 /* version1, all but sig section */
+#define FILESIG_VERSION2 2 /* version1 format, SHF_ALLOC only */
+#define FILESIG_VERSION3 3 /* version3, all but sig section */
+#define FILESIG_VERSION4 4 /* version3 format, SHF_ALLOC only */
+
+#define _PATH_KCFD_DOOR "/etc/svc/volatile/kcfd_door"
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_CRYPTO_ELFSIGN_H */
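
A sketch of how the variable-length records in struct filesignatures are typically walked with filesig_next(), assuming a buffer already validated by the caller; the function name is illustrative.

static void
filesig_walk_sketch(struct filesignatures *fssp)
{
	struct filesig *fsp = &fssp->filesig_sig;	/* first record */
	uint32_t i;

	for (i = 0; i < fssp->filesig_cnt; i++) {
		switch (fsp->filesig_version) {
		case FILESIG_VERSION1:
		case FILESIG_VERSION2:
			/* filesig_v1_data holds DN, signature and OID */
			break;
		case FILESIG_VERSION3:
		case FILESIG_VERSION4:
			/* filesig_v3 additionally carries filesig_v3_time */
			break;
		default:
			return;			/* FILESIG_UNKNOWN */
		}
		fsp = filesig_next(fsp);	/* skip the aligned record */
	}
}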
diff --git a/sys/contrib/openzfs/module/icp/include/sys/crypto/impl.h b/sys/contrib/openzfs/module/icp/include/sys/crypto/impl.h
new file mode 100644
index 000000000000..0f37f3f63532
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/include/sys/crypto/impl.h
@@ -0,0 +1,1363 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_CRYPTO_IMPL_H
+#define _SYS_CRYPTO_IMPL_H
+
+/*
+ * Kernel Cryptographic Framework private implementation definitions.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/crypto/common.h>
+#include <sys/crypto/api.h>
+#include <sys/crypto/spi.h>
+#include <sys/crypto/ioctl.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define KCF_MODULE "kcf"
+
+/*
+ * Prefix convention: structures internal to the kernel cryptographic
+ * framework start with 'kcf_'. Exposed structures start with 'crypto_'.
+ */
+
+/* Provider stats. Not protected. */
+typedef struct kcf_prov_stats {
+ kstat_named_t ps_ops_total;
+ kstat_named_t ps_ops_passed;
+ kstat_named_t ps_ops_failed;
+ kstat_named_t ps_ops_busy_rval;
+} kcf_prov_stats_t;
+
+/* Various kcf stats. Not protected. */
+typedef struct kcf_stats {
+ kstat_named_t ks_thrs_in_pool;
+ kstat_named_t ks_idle_thrs;
+ kstat_named_t ks_minthrs;
+ kstat_named_t ks_maxthrs;
+ kstat_named_t ks_swq_njobs;
+ kstat_named_t ks_swq_maxjobs;
+ kstat_named_t ks_taskq_threads;
+ kstat_named_t ks_taskq_minalloc;
+ kstat_named_t ks_taskq_maxalloc;
+} kcf_stats_t;
+
+/*
+ * Keep all the information needed by the scheduler from
+ * this provider.
+ */
+typedef struct kcf_sched_info {
+ /* The number of operations dispatched. */
+ uint64_t ks_ndispatches;
+
+ /* The number of operations that failed. */
+ uint64_t ks_nfails;
+
+ /* The number of operations that returned CRYPTO_BUSY. */
+ uint64_t ks_nbusy_rval;
+
+ /* taskq used to dispatch crypto requests */
+ taskq_t *ks_taskq;
+} kcf_sched_info_t;
+
+/*
+ * pd_irefcnt approximates the number of inflight requests to the
+ * provider. Though we increment this counter during registration for
+ * other purposes, that base value is mostly the same across all providers.
+ * So, it is a good measure of the load on a provider when it is not
+ * in a busy state. Once a provider notifies it is busy, requests
+ * back up in the taskq. So, we use tq_nalloc in that case, which gives
+ * the number of task entries in the task queue. Note that we do not
+ * acquire any locks here as it is not critical to get the exact number
+ * and the lock contention may be too costly for this code path.
+ */
+#define KCF_PROV_LOAD(pd) ((pd)->pd_state != KCF_PROV_BUSY ? \
+ (pd)->pd_irefcnt : (pd)->pd_sched_info.ks_taskq->tq_nalloc)
+
+#define KCF_PROV_INCRSTATS(pd, error) { \
+ (pd)->pd_sched_info.ks_ndispatches++; \
+ if (error == CRYPTO_BUSY) \
+ (pd)->pd_sched_info.ks_nbusy_rval++; \
+ else if (error != CRYPTO_SUCCESS && error != CRYPTO_QUEUED) \
+ (pd)->pd_sched_info.ks_nfails++; \
+}
+
+
+/*
+ * The following two macros should be
+ * #define KCF_OPS_CLASSSIZE (KCF_LAST_OPSCLASS - KCF_FIRST_OPSCLASS + 2)
+ * #define KCF_MAXMECHTAB KCF_MAXCIPHER
+ *
+ * However, doing that would involve reorganizing the header file a bit.
+ * When impl.h is broken up (bug# 4703218), this will be done. For now,
+ * we hardcode these values.
+ */
+#define KCF_OPS_CLASSSIZE 8
+#define KCF_MAXMECHTAB 32
+
+/*
+ * Valid values for the state of a provider. The order of
+ * the elements is important.
+ *
+ * Routines which get a provider or the list of providers
+ * should pick only those that are either in KCF_PROV_READY state
+ * or in KCF_PROV_BUSY state.
+ */
+typedef enum {
+ KCF_PROV_ALLOCATED = 1,
+ KCF_PROV_UNVERIFIED,
+ KCF_PROV_VERIFICATION_FAILED,
+ /*
+ * state < KCF_PROV_READY means the provider can not
+ * be used at all.
+ */
+ KCF_PROV_READY,
+ KCF_PROV_BUSY,
+ /*
+ * state > KCF_PROV_BUSY means the provider can not
+ * be used for new requests.
+ */
+ KCF_PROV_FAILED,
+ /*
+ * Threads setting the following two states should do so only
+ * if the current state < KCF_PROV_DISABLED.
+ */
+ KCF_PROV_DISABLED,
+ KCF_PROV_REMOVED,
+ KCF_PROV_FREED
+} kcf_prov_state_t;
+
+#define KCF_IS_PROV_UNVERIFIED(pd) ((pd)->pd_state == KCF_PROV_UNVERIFIED)
+#define KCF_IS_PROV_USABLE(pd) ((pd)->pd_state == KCF_PROV_READY || \
+ (pd)->pd_state == KCF_PROV_BUSY)
+#define KCF_IS_PROV_REMOVED(pd) ((pd)->pd_state >= KCF_PROV_REMOVED)
+
+/* Internal flags valid for pd_flags field */
+#define KCF_PROV_RESTRICTED 0x40000000
+#define KCF_LPROV_MEMBER 0x80000000 /* is member of a logical provider */
+
+/*
+ * A provider descriptor structure. There is one such structure per
+ * provider. It is allocated and initialized at registration time and
+ * freed when the provider unregisters.
+ *
+ * pd_prov_type: Provider type, hardware or software
+ * pd_sid: Session ID of the provider used by kernel clients.
+ * This is valid only for session-oriented providers.
+ * pd_refcnt: Reference counter to this provider descriptor
+ * pd_irefcnt: References held by the framework internal structs
+ * pd_lock: lock protects pd_state and pd_provider_list
+ * pd_state: State value of the provider
+ * pd_provider_list: Used to cross-reference logical providers and their
+ * members. Not used for software providers.
+ * pd_resume_cv: cv to wait for state to change from KCF_PROV_BUSY
+ * pd_prov_handle: Provider handle specified by provider
+ * pd_ops_vector: The ops vector specified by Provider
+ * pd_mech_indx: Lookup table which maps a core framework mechanism
+ * number to an index in pd_mechanisms array
+ * pd_mechanisms: Array of mechanisms supported by the provider, specified
+ * by the provider during registration
+ * pd_sched_info: Scheduling information associated with the provider
+ * pd_mech_list_count: The number of entries in pi_mechanisms, specified
+ * by the provider during registration
+ * pd_name: Device name or module name
+ * pd_instance: Device instance
+ * pd_module_id: Module ID returned by modload
+ * pd_mctlp: Pointer to modctl structure for this provider
+ * pd_remove_cv: cv to wait on while the provider queue drains
+ * pd_description: Provider description string
+ * pd_flags: bitwise OR of pi_flags from crypto_provider_info_t
+ * and other internal flags defined above.
+ * pd_hash_limit: Maximum data size that hash mechanisms of this provider
+ * can support.
+ * pd_kcf_prov_handle: KCF-private handle assigned by KCF
+ * pd_prov_id: Identification # assigned by KCF to provider
+ * pd_kstat: kstat associated with the provider
+ * pd_ks_data: kstat data
+ */
+typedef struct kcf_provider_desc {
+ crypto_provider_type_t pd_prov_type;
+ crypto_session_id_t pd_sid;
+ uint_t pd_refcnt;
+ uint_t pd_irefcnt;
+ kmutex_t pd_lock;
+ kcf_prov_state_t pd_state;
+ struct kcf_provider_list *pd_provider_list;
+ kcondvar_t pd_resume_cv;
+ crypto_provider_handle_t pd_prov_handle;
+ crypto_ops_t *pd_ops_vector;
+ ushort_t pd_mech_indx[KCF_OPS_CLASSSIZE]\
+ [KCF_MAXMECHTAB];
+ crypto_mech_info_t *pd_mechanisms;
+ kcf_sched_info_t pd_sched_info;
+ uint_t pd_mech_list_count;
+ // char *pd_name;
+ // uint_t pd_instance;
+ // int pd_module_id;
+ // struct modctl *pd_mctlp;
+ kcondvar_t pd_remove_cv;
+ char *pd_description;
+ uint_t pd_flags;
+ uint_t pd_hash_limit;
+ crypto_kcf_provider_handle_t pd_kcf_prov_handle;
+ crypto_provider_id_t pd_prov_id;
+ kstat_t *pd_kstat;
+ kcf_prov_stats_t pd_ks_data;
+} kcf_provider_desc_t;
+
+/* useful for making a list of providers */
+typedef struct kcf_provider_list {
+ struct kcf_provider_list *pl_next;
+ struct kcf_provider_desc *pl_provider;
+} kcf_provider_list_t;
+
+/* atomic operations in linux implicitly form a memory barrier */
+#define membar_exit()
+
+/*
+ * If a component has a reference to a kcf_provider_desc_t,
+ * it REFHOLD()s. A new provider descriptor which is referenced only
+ * by the providers table has a reference counter of one.
+ */
+#define KCF_PROV_REFHOLD(desc) { \
+ atomic_add_32(&(desc)->pd_refcnt, 1); \
+ ASSERT((desc)->pd_refcnt != 0); \
+}
+
+#define KCF_PROV_IREFHOLD(desc) { \
+ atomic_add_32(&(desc)->pd_irefcnt, 1); \
+ ASSERT((desc)->pd_irefcnt != 0); \
+}
+
+#define KCF_PROV_IREFRELE(desc) { \
+ ASSERT((desc)->pd_irefcnt != 0); \
+ membar_exit(); \
+ if (atomic_add_32_nv(&(desc)->pd_irefcnt, -1) == 0) { \
+ cv_broadcast(&(desc)->pd_remove_cv); \
+ } \
+}
+
+#define KCF_PROV_REFHELD(desc) ((desc)->pd_refcnt >= 1)
+
+#define KCF_PROV_REFRELE(desc) { \
+ ASSERT((desc)->pd_refcnt != 0); \
+ membar_exit(); \
+ if (atomic_add_32_nv(&(desc)->pd_refcnt, -1) == 0) { \
+ kcf_provider_zero_refcnt((desc)); \
+ } \
+}
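
A sketch of the hold/check/release discipline the comment above describes, assuming a descriptor obtained from the providers table; the function name is illustrative.

static boolean_t
provider_hold_sketch(kcf_provider_desc_t *pd)
{
	KCF_PROV_REFHOLD(pd);			/* pin the descriptor */

	if (!KCF_IS_PROV_USABLE(pd)) {		/* READY or BUSY only */
		KCF_PROV_REFRELE(pd);		/* last release may call */
		return (B_FALSE);		/* kcf_provider_zero_refcnt() */
	}

	/* ... dispatch a request to pd here ... */

	KCF_PROV_REFRELE(pd);
	return (B_TRUE);
}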
+
+
+/* list of crypto_mech_info_t valid as the second mech in a dual operation */
+
+typedef struct crypto_mech_info_list {
+ struct crypto_mech_info_list *ml_next;
+ crypto_mech_type_t ml_kcf_mechid; /* KCF's id */
+ crypto_mech_info_t ml_mech_info;
+} crypto_mech_info_list_t;
+
+/*
+ * An element in a mechanism provider descriptors chain.
+ * The kcf_prov_mech_desc_t is duplicated in every chain the provider belongs
+ * to. This trades a small amount of memory for not having to spin on a
+ * mutex to reach the common provider field.
+ */
+
+typedef struct kcf_prov_mech_desc {
+ struct kcf_mech_entry *pm_me; /* Back to the head */
+ struct kcf_prov_mech_desc *pm_next; /* Next in the chain */
+ crypto_mech_info_t pm_mech_info; /* Provider mech info */
+ crypto_mech_info_list_t *pm_mi_list; /* list for duals */
+ kcf_provider_desc_t *pm_prov_desc; /* Common desc. */
+} kcf_prov_mech_desc_t;
+
+/* and the notation shortcuts ... */
+#define pm_provider_type pm_prov_desc.pd_provider_type
+#define pm_provider_handle pm_prov_desc.pd_provider_handle
+#define pm_ops_vector pm_prov_desc.pd_ops_vector
+
+/*
+ * A mechanism entry in an xxx_mech_tab[]. me_pad was deemed
+ * to be unnecessary and removed.
+ */
+typedef struct kcf_mech_entry {
+ crypto_mech_name_t me_name; /* mechanism name */
+ crypto_mech_type_t me_mechid; /* Internal id for mechanism */
+ kmutex_t me_mutex; /* access protection */
+ kcf_prov_mech_desc_t *me_hw_prov_chain; /* list of HW providers */
+ kcf_prov_mech_desc_t *me_sw_prov; /* SW provider */
+ /*
+ * Number of HW providers in the chain. There is only one
+ * SW provider. So, we need only a count of HW providers.
+ */
+ int me_num_hwprov;
+ /*
+ * When a SW provider is present, this is the generation number that
+ * ensures no objects from old SW providers are used in the new one
+ */
+ uint32_t me_gen_swprov;
+ /*
+ * threshold for using hardware providers for this mech
+ */
+ size_t me_threshold;
+} kcf_mech_entry_t;
+
+/*
+ * A policy descriptor structure. It is allocated and initialized
+ * when administrative ioctls load disabled mechanisms.
+ *
+ * pd_prov_type: Provider type, hardware or software
+ * pd_name: Device name or module name.
+ * pd_instance: Device instance.
+ * pd_refcnt: Reference counter for this policy descriptor
+ * pd_mutex: Protects array and count of disabled mechanisms.
+ * pd_disabled_count: Count of disabled mechanisms.
+ * pd_disabled_mechs: Array of disabled mechanisms.
+ */
+typedef struct kcf_policy_desc {
+ crypto_provider_type_t pd_prov_type;
+ char *pd_name;
+ uint_t pd_instance;
+ uint_t pd_refcnt;
+ kmutex_t pd_mutex;
+ uint_t pd_disabled_count;
+ crypto_mech_name_t *pd_disabled_mechs;
+} kcf_policy_desc_t;
+
+/*
+ * If a component has a reference to a kcf_policy_desc_t,
+ * it REFHOLD()s. A new policy descriptor which is referenced only
+ * by the policy table has a reference count of one.
+ */
+#define KCF_POLICY_REFHOLD(desc) { \
+ atomic_add_32(&(desc)->pd_refcnt, 1); \
+ ASSERT((desc)->pd_refcnt != 0); \
+}
+
+/*
+ * Releases a reference to a policy descriptor. When the last
+ * reference is released, the descriptor is freed.
+ */
+#define KCF_POLICY_REFRELE(desc) { \
+ ASSERT((desc)->pd_refcnt != 0); \
+ membar_exit(); \
+ if (atomic_add_32_nv(&(desc)->pd_refcnt, -1) == 0) \
+ kcf_policy_free_desc(desc); \
+}
+
+/*
+ * This entry stores the name of a software module and its
+ * mechanisms. The mechanisms are 'hints' that are used to
+ * trigger loading of the module.
+ */
+typedef struct kcf_soft_conf_entry {
+ struct kcf_soft_conf_entry *ce_next;
+ char *ce_name;
+ crypto_mech_name_t *ce_mechs;
+ uint_t ce_count;
+} kcf_soft_conf_entry_t;
+
+extern kmutex_t soft_config_mutex;
+extern kcf_soft_conf_entry_t *soft_config_list;
+
+/*
+ * Global tables. The sizes are from the predefined PKCS#11 v2.20 mechanisms,
+ * with a margin of a few extra empty entries
+ */
+
+#define KCF_MAXDIGEST 16 /* Digests */
+#define KCF_MAXCIPHER 64 /* Ciphers */
+#define KCF_MAXMAC 40 /* Message authentication codes */
+#define KCF_MAXSIGN 24 /* Sign/Verify */
+#define KCF_MAXKEYOPS 116 /* Key generation and derivation */
+#define KCF_MAXMISC 16 /* Others ... */
+
+#define KCF_MAXMECHS KCF_MAXDIGEST + KCF_MAXCIPHER + KCF_MAXMAC + \
+ KCF_MAXSIGN + KCF_MAXKEYOPS + \
+ KCF_MAXMISC
+
+extern kcf_mech_entry_t kcf_digest_mechs_tab[];
+extern kcf_mech_entry_t kcf_cipher_mechs_tab[];
+extern kcf_mech_entry_t kcf_mac_mechs_tab[];
+extern kcf_mech_entry_t kcf_sign_mechs_tab[];
+extern kcf_mech_entry_t kcf_keyops_mechs_tab[];
+extern kcf_mech_entry_t kcf_misc_mechs_tab[];
+
+extern kmutex_t kcf_mech_tabs_lock;
+
+typedef enum {
+ KCF_DIGEST_CLASS = 1,
+ KCF_CIPHER_CLASS,
+ KCF_MAC_CLASS,
+ KCF_SIGN_CLASS,
+ KCF_KEYOPS_CLASS,
+ KCF_MISC_CLASS
+} kcf_ops_class_t;
+
+#define KCF_FIRST_OPSCLASS KCF_DIGEST_CLASS
+#define KCF_LAST_OPSCLASS KCF_MISC_CLASS
+
+/* The table of all the kcf_xxx_mech_tab[]s, indexed by kcf_ops_class */
+
+typedef struct kcf_mech_entry_tab {
+ int met_size; /* Size of the met_tab[] */
+ kcf_mech_entry_t *met_tab; /* the table */
+} kcf_mech_entry_tab_t;
+
+extern kcf_mech_entry_tab_t kcf_mech_tabs_tab[];
+
+#define KCF_MECHID(class, index) \
+ (((crypto_mech_type_t)(class) << 32) | (crypto_mech_type_t)(index))
+
+#define KCF_MECH2CLASS(mech_type) ((kcf_ops_class_t)((mech_type) >> 32))
+
+#define KCF_MECH2INDEX(mech_type) ((int)(mech_type))
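
A mechanism type therefore packs the ops class into the upper 32 bits and the table index into the lower 32 bits, so the three macros above round-trip; a minimal sketch (the function name is illustrative):

static void
mechid_roundtrip_sketch(void)
{
	crypto_mech_type_t mt = KCF_MECHID(KCF_CIPHER_CLASS, 5);

	ASSERT(KCF_MECH2CLASS(mt) == KCF_CIPHER_CLASS);
	ASSERT(KCF_MECH2INDEX(mt) == 5);
}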
+
+#define KCF_TO_PROV_MECH_INDX(pd, mech_type) \
+ ((pd)->pd_mech_indx[KCF_MECH2CLASS(mech_type)] \
+ [KCF_MECH2INDEX(mech_type)])
+
+#define KCF_TO_PROV_MECHINFO(pd, mech_type) \
+ ((pd)->pd_mechanisms[KCF_TO_PROV_MECH_INDX(pd, mech_type)])
+
+#define KCF_TO_PROV_MECHNUM(pd, mech_type) \
+ (KCF_TO_PROV_MECHINFO(pd, mech_type).cm_mech_number)
+
+#define KCF_CAN_SHARE_OPSTATE(pd, mech_type) \
+ ((KCF_TO_PROV_MECHINFO(pd, mech_type).cm_mech_flags) & \
+ CRYPTO_CAN_SHARE_OPSTATE)
+
+/* ps_refcnt is protected by cm_lock in the crypto_minor structure */
+typedef struct crypto_provider_session {
+ struct crypto_provider_session *ps_next;
+ crypto_session_id_t ps_session;
+ kcf_provider_desc_t *ps_provider;
+ kcf_provider_desc_t *ps_real_provider;
+ uint_t ps_refcnt;
+} crypto_provider_session_t;
+
+typedef struct crypto_session_data {
+ kmutex_t sd_lock;
+ kcondvar_t sd_cv;
+ uint32_t sd_flags;
+ int sd_pre_approved_amount;
+ crypto_ctx_t *sd_digest_ctx;
+ crypto_ctx_t *sd_encr_ctx;
+ crypto_ctx_t *sd_decr_ctx;
+ crypto_ctx_t *sd_sign_ctx;
+ crypto_ctx_t *sd_verify_ctx;
+ crypto_ctx_t *sd_sign_recover_ctx;
+ crypto_ctx_t *sd_verify_recover_ctx;
+ kcf_provider_desc_t *sd_provider;
+ void *sd_find_init_cookie;
+ crypto_provider_session_t *sd_provider_session;
+} crypto_session_data_t;
+
+#define CRYPTO_SESSION_IN_USE 0x00000001
+#define CRYPTO_SESSION_IS_BUSY 0x00000002
+#define CRYPTO_SESSION_IS_CLOSED 0x00000004
+
+#define KCF_MAX_PIN_LEN 1024
+
+/*
+ * Per-minor info.
+ *
+ * cm_lock protects everything in this structure except for cm_refcnt.
+ */
+typedef struct crypto_minor {
+ uint_t cm_refcnt;
+ kmutex_t cm_lock;
+ kcondvar_t cm_cv;
+ crypto_session_data_t **cm_session_table;
+ uint_t cm_session_table_count;
+ kcf_provider_desc_t **cm_provider_array;
+ uint_t cm_provider_count;
+ crypto_provider_session_t *cm_provider_session;
+} crypto_minor_t;
+
+/*
+ * Return codes for internal functions
+ */
+#define KCF_SUCCESS 0x0 /* Successful call */
+#define KCF_INVALID_MECH_NUMBER 0x1 /* invalid mechanism number */
+#define KCF_INVALID_MECH_NAME 0x2 /* invalid mechanism name */
+#define KCF_INVALID_MECH_CLASS 0x3 /* invalid mechanism class */
+#define KCF_MECH_TAB_FULL 0x4 /* Need more room in the mech tabs. */
+#define KCF_INVALID_INDX ((ushort_t)-1)
+
+/*
+ * kCF internal mechanism and function group for tracking RNG providers.
+ */
+#define SUN_RANDOM "random"
+#define CRYPTO_FG_RANDOM 0x80000000 /* generate_random() */
+
+/*
+ * Wrappers for ops vectors. In the wrapper definitions below, the pd
+ * argument always corresponds to a pointer to a provider descriptor
+ * of type kcf_prov_desc_t.
+ */
+
+#define KCF_PROV_CONTROL_OPS(pd) ((pd)->pd_ops_vector->co_control_ops)
+#define KCF_PROV_CTX_OPS(pd) ((pd)->pd_ops_vector->co_ctx_ops)
+#define KCF_PROV_DIGEST_OPS(pd) ((pd)->pd_ops_vector->co_digest_ops)
+#define KCF_PROV_CIPHER_OPS(pd) ((pd)->pd_ops_vector->co_cipher_ops)
+#define KCF_PROV_MAC_OPS(pd) ((pd)->pd_ops_vector->co_mac_ops)
+#define KCF_PROV_SIGN_OPS(pd) ((pd)->pd_ops_vector->co_sign_ops)
+#define KCF_PROV_VERIFY_OPS(pd) ((pd)->pd_ops_vector->co_verify_ops)
+#define KCF_PROV_DUAL_OPS(pd) ((pd)->pd_ops_vector->co_dual_ops)
+#define KCF_PROV_DUAL_CIPHER_MAC_OPS(pd) \
+ ((pd)->pd_ops_vector->co_dual_cipher_mac_ops)
+#define KCF_PROV_RANDOM_OPS(pd) ((pd)->pd_ops_vector->co_random_ops)
+#define KCF_PROV_SESSION_OPS(pd) ((pd)->pd_ops_vector->co_session_ops)
+#define KCF_PROV_OBJECT_OPS(pd) ((pd)->pd_ops_vector->co_object_ops)
+#define KCF_PROV_KEY_OPS(pd) ((pd)->pd_ops_vector->co_key_ops)
+#define KCF_PROV_PROVIDER_OPS(pd) ((pd)->pd_ops_vector->co_provider_ops)
+#define KCF_PROV_MECH_OPS(pd) ((pd)->pd_ops_vector->co_mech_ops)
+#define KCF_PROV_NOSTORE_KEY_OPS(pd) \
+ ((pd)->pd_ops_vector->co_nostore_key_ops)
+
+/*
+ * Wrappers for crypto_control_ops(9S) entry points.
+ */
+
+#define KCF_PROV_STATUS(pd, status) ( \
+ (KCF_PROV_CONTROL_OPS(pd) && \
+ KCF_PROV_CONTROL_OPS(pd)->provider_status) ? \
+ KCF_PROV_CONTROL_OPS(pd)->provider_status( \
+ (pd)->pd_prov_handle, status) : \
+ CRYPTO_NOT_SUPPORTED)
+
+/*
+ * Wrappers for crypto_ctx_ops(9S) entry points.
+ */
+
+#define KCF_PROV_CREATE_CTX_TEMPLATE(pd, mech, key, template, size, req) ( \
+ (KCF_PROV_CTX_OPS(pd) && KCF_PROV_CTX_OPS(pd)->create_ctx_template) ? \
+ KCF_PROV_CTX_OPS(pd)->create_ctx_template( \
+ (pd)->pd_prov_handle, mech, key, template, size, req) : \
+ CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_FREE_CONTEXT(pd, ctx) ( \
+ (KCF_PROV_CTX_OPS(pd) && KCF_PROV_CTX_OPS(pd)->free_context) ? \
+ KCF_PROV_CTX_OPS(pd)->free_context(ctx) : CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_COPYIN_MECH(pd, umech, kmech, errorp, mode) ( \
+ (KCF_PROV_MECH_OPS(pd) && KCF_PROV_MECH_OPS(pd)->copyin_mechanism) ? \
+ KCF_PROV_MECH_OPS(pd)->copyin_mechanism( \
+ (pd)->pd_prov_handle, umech, kmech, errorp, mode) : \
+ CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_COPYOUT_MECH(pd, kmech, umech, errorp, mode) ( \
+ (KCF_PROV_MECH_OPS(pd) && KCF_PROV_MECH_OPS(pd)->copyout_mechanism) ? \
+ KCF_PROV_MECH_OPS(pd)->copyout_mechanism( \
+ (pd)->pd_prov_handle, kmech, umech, errorp, mode) : \
+ CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_FREE_MECH(pd, prov_mech) ( \
+ (KCF_PROV_MECH_OPS(pd) && KCF_PROV_MECH_OPS(pd)->free_mechanism) ? \
+ KCF_PROV_MECH_OPS(pd)->free_mechanism( \
+ (pd)->pd_prov_handle, prov_mech) : CRYPTO_NOT_SUPPORTED)
+
+/*
+ * Wrappers for crypto_digest_ops(9S) entry points.
+ */
+
+#define KCF_PROV_DIGEST_INIT(pd, ctx, mech, req) ( \
+ (KCF_PROV_DIGEST_OPS(pd) && KCF_PROV_DIGEST_OPS(pd)->digest_init) ? \
+ KCF_PROV_DIGEST_OPS(pd)->digest_init(ctx, mech, req) : \
+ CRYPTO_NOT_SUPPORTED)
+
+/*
+ * The _ (underscore) in _digest is needed to avoid replacing the
+ * function digest().
+ */
+#define KCF_PROV_DIGEST(pd, ctx, data, _digest, req) ( \
+ (KCF_PROV_DIGEST_OPS(pd) && KCF_PROV_DIGEST_OPS(pd)->digest) ? \
+ KCF_PROV_DIGEST_OPS(pd)->digest(ctx, data, _digest, req) : \
+ CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_DIGEST_UPDATE(pd, ctx, data, req) ( \
+ (KCF_PROV_DIGEST_OPS(pd) && KCF_PROV_DIGEST_OPS(pd)->digest_update) ? \
+ KCF_PROV_DIGEST_OPS(pd)->digest_update(ctx, data, req) : \
+ CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_DIGEST_KEY(pd, ctx, key, req) ( \
+ (KCF_PROV_DIGEST_OPS(pd) && KCF_PROV_DIGEST_OPS(pd)->digest_key) ? \
+ KCF_PROV_DIGEST_OPS(pd)->digest_key(ctx, key, req) : \
+ CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_DIGEST_FINAL(pd, ctx, digest, req) ( \
+ (KCF_PROV_DIGEST_OPS(pd) && KCF_PROV_DIGEST_OPS(pd)->digest_final) ? \
+ KCF_PROV_DIGEST_OPS(pd)->digest_final(ctx, digest, req) : \
+ CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_DIGEST_ATOMIC(pd, session, mech, data, digest, req) ( \
+ (KCF_PROV_DIGEST_OPS(pd) && KCF_PROV_DIGEST_OPS(pd)->digest_atomic) ? \
+ KCF_PROV_DIGEST_OPS(pd)->digest_atomic( \
+ (pd)->pd_prov_handle, session, mech, data, digest, req) : \
+ CRYPTO_NOT_SUPPORTED)
+
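
A sketch of how framework code typically invokes one of the digest wrappers above, assuming the crypto_mechanism_t and crypto_req_handle_t types from sys/crypto/common.h; a provider without the ops vector or entry point simply yields CRYPTO_NOT_SUPPORTED. The function name is illustrative, not an existing framework routine.

static int
digest_init_dispatch_sketch(kcf_provider_desc_t *pd, crypto_ctx_t *ctx,
    crypto_mechanism_t *mech, crypto_req_handle_t req)
{
	int rv = KCF_PROV_DIGEST_INIT(pd, ctx, mech, req);

	KCF_PROV_INCRSTATS(pd, rv);	/* bump ks_ndispatches and friends */
	return (rv);			/* CRYPTO_NOT_SUPPORTED if the */
					/* provider lacks digest_init */
}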
+/*
+ * Wrappers for crypto_cipher_ops(9S) entry points.
+ */
+
+#define KCF_PROV_ENCRYPT_INIT(pd, ctx, mech, key, template, req) ( \
+ (KCF_PROV_CIPHER_OPS(pd) && KCF_PROV_CIPHER_OPS(pd)->encrypt_init) ? \
+ KCF_PROV_CIPHER_OPS(pd)->encrypt_init(ctx, mech, key, template, \
+ req) : \
+ CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_ENCRYPT(pd, ctx, plaintext, ciphertext, req) ( \
+ (KCF_PROV_CIPHER_OPS(pd) && KCF_PROV_CIPHER_OPS(pd)->encrypt) ? \
+ KCF_PROV_CIPHER_OPS(pd)->encrypt(ctx, plaintext, ciphertext, req) : \
+ CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_ENCRYPT_UPDATE(pd, ctx, plaintext, ciphertext, req) ( \
+ (KCF_PROV_CIPHER_OPS(pd) && KCF_PROV_CIPHER_OPS(pd)->encrypt_update) ? \
+ KCF_PROV_CIPHER_OPS(pd)->encrypt_update(ctx, plaintext, \
+ ciphertext, req) : \
+ CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_ENCRYPT_FINAL(pd, ctx, ciphertext, req) ( \
+ (KCF_PROV_CIPHER_OPS(pd) && KCF_PROV_CIPHER_OPS(pd)->encrypt_final) ? \
+ KCF_PROV_CIPHER_OPS(pd)->encrypt_final(ctx, ciphertext, req) : \
+ CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_ENCRYPT_ATOMIC(pd, session, mech, key, plaintext, ciphertext, \
+ template, req) ( \
+ (KCF_PROV_CIPHER_OPS(pd) && KCF_PROV_CIPHER_OPS(pd)->encrypt_atomic) ? \
+ KCF_PROV_CIPHER_OPS(pd)->encrypt_atomic( \
+ (pd)->pd_prov_handle, session, mech, key, plaintext, ciphertext, \
+ template, req) : \
+ CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_DECRYPT_INIT(pd, ctx, mech, key, template, req) ( \
+ (KCF_PROV_CIPHER_OPS(pd) && KCF_PROV_CIPHER_OPS(pd)->decrypt_init) ? \
+ KCF_PROV_CIPHER_OPS(pd)->decrypt_init(ctx, mech, key, template, \
+ req) : \
+ CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_DECRYPT(pd, ctx, ciphertext, plaintext, req) ( \
+ (KCF_PROV_CIPHER_OPS(pd) && KCF_PROV_CIPHER_OPS(pd)->decrypt) ? \
+ KCF_PROV_CIPHER_OPS(pd)->decrypt(ctx, ciphertext, plaintext, req) : \
+ CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_DECRYPT_UPDATE(pd, ctx, ciphertext, plaintext, req) ( \
+ (KCF_PROV_CIPHER_OPS(pd) && KCF_PROV_CIPHER_OPS(pd)->decrypt_update) ? \
+ KCF_PROV_CIPHER_OPS(pd)->decrypt_update(ctx, ciphertext, \
+ plaintext, req) : \
+ CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_DECRYPT_FINAL(pd, ctx, plaintext, req) ( \
+ (KCF_PROV_CIPHER_OPS(pd) && KCF_PROV_CIPHER_OPS(pd)->decrypt_final) ? \
+ KCF_PROV_CIPHER_OPS(pd)->decrypt_final(ctx, plaintext, req) : \
+ CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_DECRYPT_ATOMIC(pd, session, mech, key, ciphertext, plaintext, \
+ template, req) ( \
+ (KCF_PROV_CIPHER_OPS(pd) && KCF_PROV_CIPHER_OPS(pd)->decrypt_atomic) ? \
+ KCF_PROV_CIPHER_OPS(pd)->decrypt_atomic( \
+ (pd)->pd_prov_handle, session, mech, key, ciphertext, plaintext, \
+ template, req) : \
+ CRYPTO_NOT_SUPPORTED)
+
+/*
+ * Wrappers for crypto_mac_ops(9S) entry points.
+ */
+
+#define KCF_PROV_MAC_INIT(pd, ctx, mech, key, template, req) ( \
+ (KCF_PROV_MAC_OPS(pd) && KCF_PROV_MAC_OPS(pd)->mac_init) ? \
+ KCF_PROV_MAC_OPS(pd)->mac_init(ctx, mech, key, template, req) \
+ : CRYPTO_NOT_SUPPORTED)
+
+/*
+ * The _ (underscore) in _mac is needed to avoid replacing the
+ * function mac().
+ */
+#define KCF_PROV_MAC(pd, ctx, data, _mac, req) ( \
+ (KCF_PROV_MAC_OPS(pd) && KCF_PROV_MAC_OPS(pd)->mac) ? \
+ KCF_PROV_MAC_OPS(pd)->mac(ctx, data, _mac, req) : \
+ CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_MAC_UPDATE(pd, ctx, data, req) ( \
+ (KCF_PROV_MAC_OPS(pd) && KCF_PROV_MAC_OPS(pd)->mac_update) ? \
+ KCF_PROV_MAC_OPS(pd)->mac_update(ctx, data, req) : \
+ CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_MAC_FINAL(pd, ctx, mac, req) ( \
+ (KCF_PROV_MAC_OPS(pd) && KCF_PROV_MAC_OPS(pd)->mac_final) ? \
+ KCF_PROV_MAC_OPS(pd)->mac_final(ctx, mac, req) : \
+ CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_MAC_ATOMIC(pd, session, mech, key, data, mac, template, \
+ req) ( \
+ (KCF_PROV_MAC_OPS(pd) && KCF_PROV_MAC_OPS(pd)->mac_atomic) ? \
+ KCF_PROV_MAC_OPS(pd)->mac_atomic( \
+ (pd)->pd_prov_handle, session, mech, key, data, mac, template, \
+ req) : \
+ CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_MAC_VERIFY_ATOMIC(pd, session, mech, key, data, mac, \
+ template, req) ( \
+ (KCF_PROV_MAC_OPS(pd) && KCF_PROV_MAC_OPS(pd)->mac_verify_atomic) ? \
+ KCF_PROV_MAC_OPS(pd)->mac_verify_atomic( \
+ (pd)->pd_prov_handle, session, mech, key, data, mac, template, \
+ req) : \
+ CRYPTO_NOT_SUPPORTED)
+
+/*
+ * Wrappers for crypto_sign_ops(9S) entry points.
+ */
+
+#define KCF_PROV_SIGN_INIT(pd, ctx, mech, key, template, req) ( \
+ (KCF_PROV_SIGN_OPS(pd) && KCF_PROV_SIGN_OPS(pd)->sign_init) ? \
+ KCF_PROV_SIGN_OPS(pd)->sign_init( \
+ ctx, mech, key, template, req) : CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_SIGN(pd, ctx, data, sig, req) ( \
+ (KCF_PROV_SIGN_OPS(pd) && KCF_PROV_SIGN_OPS(pd)->sign) ? \
+ KCF_PROV_SIGN_OPS(pd)->sign(ctx, data, sig, req) : \
+ CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_SIGN_UPDATE(pd, ctx, data, req) ( \
+ (KCF_PROV_SIGN_OPS(pd) && KCF_PROV_SIGN_OPS(pd)->sign_update) ? \
+ KCF_PROV_SIGN_OPS(pd)->sign_update(ctx, data, req) : \
+ CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_SIGN_FINAL(pd, ctx, sig, req) ( \
+ (KCF_PROV_SIGN_OPS(pd) && KCF_PROV_SIGN_OPS(pd)->sign_final) ? \
+ KCF_PROV_SIGN_OPS(pd)->sign_final(ctx, sig, req) : \
+ CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_SIGN_ATOMIC(pd, session, mech, key, data, template, \
+ sig, req) ( \
+ (KCF_PROV_SIGN_OPS(pd) && KCF_PROV_SIGN_OPS(pd)->sign_atomic) ? \
+ KCF_PROV_SIGN_OPS(pd)->sign_atomic( \
+ (pd)->pd_prov_handle, session, mech, key, data, sig, template, \
+ req) : CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_SIGN_RECOVER_INIT(pd, ctx, mech, key, template, \
+ req) ( \
+ (KCF_PROV_SIGN_OPS(pd) && KCF_PROV_SIGN_OPS(pd)->sign_recover_init) ? \
+ KCF_PROV_SIGN_OPS(pd)->sign_recover_init(ctx, mech, key, template, \
+ req) : CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_SIGN_RECOVER(pd, ctx, data, sig, req) ( \
+ (KCF_PROV_SIGN_OPS(pd) && KCF_PROV_SIGN_OPS(pd)->sign_recover) ? \
+ KCF_PROV_SIGN_OPS(pd)->sign_recover(ctx, data, sig, req) : \
+ CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_SIGN_RECOVER_ATOMIC(pd, session, mech, key, data, template, \
+ sig, req) ( \
+ (KCF_PROV_SIGN_OPS(pd) && \
+ KCF_PROV_SIGN_OPS(pd)->sign_recover_atomic) ? \
+ KCF_PROV_SIGN_OPS(pd)->sign_recover_atomic( \
+ (pd)->pd_prov_handle, session, mech, key, data, sig, template, \
+ req) : CRYPTO_NOT_SUPPORTED)
+
+/*
+ * Wrappers for crypto_verify_ops(9S) entry points.
+ */
+
+#define KCF_PROV_VERIFY_INIT(pd, ctx, mech, key, template, req) ( \
+ (KCF_PROV_VERIFY_OPS(pd) && KCF_PROV_VERIFY_OPS(pd)->verify_init) ? \
+ KCF_PROV_VERIFY_OPS(pd)->verify_init(ctx, mech, key, template, \
+ req) : CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_VERIFY(pd, ctx, data, sig, req) ( \
+ (KCF_PROV_VERIFY_OPS(pd) && KCF_PROV_VERIFY_OPS(pd)->do_verify) ? \
+ KCF_PROV_VERIFY_OPS(pd)->do_verify(ctx, data, sig, req) : \
+ CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_VERIFY_UPDATE(pd, ctx, data, req) ( \
+ (KCF_PROV_VERIFY_OPS(pd) && KCF_PROV_VERIFY_OPS(pd)->verify_update) ? \
+ KCF_PROV_VERIFY_OPS(pd)->verify_update(ctx, data, req) : \
+ CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_VERIFY_FINAL(pd, ctx, sig, req) ( \
+ (KCF_PROV_VERIFY_OPS(pd) && KCF_PROV_VERIFY_OPS(pd)->verify_final) ? \
+ KCF_PROV_VERIFY_OPS(pd)->verify_final(ctx, sig, req) : \
+ CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_VERIFY_ATOMIC(pd, session, mech, key, data, template, sig, \
+ req) ( \
+ (KCF_PROV_VERIFY_OPS(pd) && KCF_PROV_VERIFY_OPS(pd)->verify_atomic) ? \
+ KCF_PROV_VERIFY_OPS(pd)->verify_atomic( \
+ (pd)->pd_prov_handle, session, mech, key, data, sig, template, \
+ req) : CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_VERIFY_RECOVER_INIT(pd, ctx, mech, key, template, \
+ req) ( \
+ (KCF_PROV_VERIFY_OPS(pd) && \
+ KCF_PROV_VERIFY_OPS(pd)->verify_recover_init) ? \
+ KCF_PROV_VERIFY_OPS(pd)->verify_recover_init(ctx, mech, key, \
+ template, req) : CRYPTO_NOT_SUPPORTED)
+
+/* verify_recover() CSPI routine has different argument order than verify() */
+#define KCF_PROV_VERIFY_RECOVER(pd, ctx, sig, data, req) ( \
+ (KCF_PROV_VERIFY_OPS(pd) && KCF_PROV_VERIFY_OPS(pd)->verify_recover) ? \
+ KCF_PROV_VERIFY_OPS(pd)->verify_recover(ctx, sig, data, req) : \
+ CRYPTO_NOT_SUPPORTED)
+
+/*
+ * verify_recover_atomic() CSPI routine has different argument order
+ * than verify_atomic().
+ */
+#define KCF_PROV_VERIFY_RECOVER_ATOMIC(pd, session, mech, key, sig, \
+ template, data, req) ( \
+ (KCF_PROV_VERIFY_OPS(pd) && \
+ KCF_PROV_VERIFY_OPS(pd)->verify_recover_atomic) ? \
+ KCF_PROV_VERIFY_OPS(pd)->verify_recover_atomic( \
+ (pd)->pd_prov_handle, session, mech, key, sig, data, template, \
+ req) : CRYPTO_NOT_SUPPORTED)
+
+/*
+ * Wrappers for crypto_dual_ops(9S) entry points.
+ */
+
+#define KCF_PROV_DIGEST_ENCRYPT_UPDATE(digest_ctx, encrypt_ctx, plaintext, \
+ ciphertext, req) ( \
+ (KCF_PROV_DUAL_OPS(pd) && \
+ KCF_PROV_DUAL_OPS(pd)->digest_encrypt_update) ? \
+ KCF_PROV_DUAL_OPS(pd)->digest_encrypt_update( \
+ digest_ctx, encrypt_ctx, plaintext, ciphertext, req) : \
+ CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_DECRYPT_DIGEST_UPDATE(decrypt_ctx, digest_ctx, ciphertext, \
+ plaintext, req) ( \
+ (KCF_PROV_DUAL_OPS(pd) && \
+ KCF_PROV_DUAL_OPS(pd)->decrypt_digest_update) ? \
+ KCF_PROV_DUAL_OPS(pd)->decrypt_digest_update( \
+ decrypt_ctx, digest_ctx, ciphertext, plaintext, req) : \
+ CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_SIGN_ENCRYPT_UPDATE(sign_ctx, encrypt_ctx, plaintext, \
+ ciphertext, req) ( \
+ (KCF_PROV_DUAL_OPS(pd) && \
+ KCF_PROV_DUAL_OPS(pd)->sign_encrypt_update) ? \
+ KCF_PROV_DUAL_OPS(pd)->sign_encrypt_update( \
+ sign_ctx, encrypt_ctx, plaintext, ciphertext, req) : \
+ CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_DECRYPT_VERIFY_UPDATE(decrypt_ctx, verify_ctx, ciphertext, \
+ plaintext, req) ( \
+ (KCF_PROV_DUAL_OPS(pd) && \
+ KCF_PROV_DUAL_OPS(pd)->decrypt_verify_update) ? \
+ KCF_PROV_DUAL_OPS(pd)->decrypt_verify_update( \
+ decrypt_ctx, verify_ctx, ciphertext, plaintext, req) : \
+ CRYPTO_NOT_SUPPORTED)
+
+/*
+ * Wrappers for crypto_dual_cipher_mac_ops(9S) entry points.
+ */
+
+#define KCF_PROV_ENCRYPT_MAC_INIT(pd, ctx, encr_mech, encr_key, mac_mech, \
+ mac_key, encr_ctx_template, mac_ctx_template, req) ( \
+ (KCF_PROV_DUAL_CIPHER_MAC_OPS(pd) && \
+ KCF_PROV_DUAL_CIPHER_MAC_OPS(pd)->encrypt_mac_init) ? \
+ KCF_PROV_DUAL_CIPHER_MAC_OPS(pd)->encrypt_mac_init( \
+ ctx, encr_mech, encr_key, mac_mech, mac_key, encr_ctx_template, \
+ mac_ctx_template, req) : \
+ CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_ENCRYPT_MAC(pd, ctx, plaintext, ciphertext, mac, req) ( \
+ (KCF_PROV_DUAL_CIPHER_MAC_OPS(pd) && \
+ KCF_PROV_DUAL_CIPHER_MAC_OPS(pd)->encrypt_mac) ? \
+ KCF_PROV_DUAL_CIPHER_MAC_OPS(pd)->encrypt_mac( \
+ ctx, plaintext, ciphertext, mac, req) : \
+ CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_ENCRYPT_MAC_UPDATE(pd, ctx, plaintext, ciphertext, req) ( \
+ (KCF_PROV_DUAL_CIPHER_MAC_OPS(pd) && \
+ KCF_PROV_DUAL_CIPHER_MAC_OPS(pd)->encrypt_mac_update) ? \
+ KCF_PROV_DUAL_CIPHER_MAC_OPS(pd)->encrypt_mac_update( \
+ ctx, plaintext, ciphertext, req) : \
+ CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_ENCRYPT_MAC_FINAL(pd, ctx, ciphertext, mac, req) ( \
+ (KCF_PROV_DUAL_CIPHER_MAC_OPS(pd) && \
+ KCF_PROV_DUAL_CIPHER_MAC_OPS(pd)->encrypt_mac_final) ? \
+ KCF_PROV_DUAL_CIPHER_MAC_OPS(pd)->encrypt_mac_final( \
+ ctx, ciphertext, mac, req) : \
+ CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_ENCRYPT_MAC_ATOMIC(pd, session, encr_mech, encr_key, \
+ mac_mech, mac_key, plaintext, ciphertext, mac, \
+ encr_ctx_template, mac_ctx_template, req) ( \
+ (KCF_PROV_DUAL_CIPHER_MAC_OPS(pd) && \
+ KCF_PROV_DUAL_CIPHER_MAC_OPS(pd)->encrypt_mac_atomic) ? \
+ KCF_PROV_DUAL_CIPHER_MAC_OPS(pd)->encrypt_mac_atomic( \
+ (pd)->pd_prov_handle, session, encr_mech, encr_key, \
+ mac_mech, mac_key, plaintext, ciphertext, mac, \
+ encr_ctx_template, mac_ctx_template, req) : \
+ CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_MAC_DECRYPT_INIT(pd, ctx, mac_mech, mac_key, decr_mech, \
+ decr_key, mac_ctx_template, decr_ctx_template, req) ( \
+ (KCF_PROV_DUAL_CIPHER_MAC_OPS(pd) && \
+ KCF_PROV_DUAL_CIPHER_MAC_OPS(pd)->mac_decrypt_init) ? \
+ KCF_PROV_DUAL_CIPHER_MAC_OPS(pd)->mac_decrypt_init( \
+ ctx, mac_mech, mac_key, decr_mech, decr_key, mac_ctx_template, \
+ decr_ctx_template, req) : \
+ CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_MAC_DECRYPT(pd, ctx, ciphertext, mac, plaintext, req) ( \
+ (KCF_PROV_DUAL_CIPHER_MAC_OPS(pd) && \
+ KCF_PROV_DUAL_CIPHER_MAC_OPS(pd)->mac_decrypt) ? \
+ KCF_PROV_DUAL_CIPHER_MAC_OPS(pd)->mac_decrypt( \
+ ctx, ciphertext, mac, plaintext, req) : \
+ CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_MAC_DECRYPT_UPDATE(pd, ctx, ciphertext, plaintext, req) ( \
+ (KCF_PROV_DUAL_CIPHER_MAC_OPS(pd) && \
+ KCF_PROV_DUAL_CIPHER_MAC_OPS(pd)->mac_decrypt_update) ? \
+ KCF_PROV_DUAL_CIPHER_MAC_OPS(pd)->mac_decrypt_update( \
+ ctx, ciphertext, plaintext, req) : \
+ CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_MAC_DECRYPT_FINAL(pd, ctx, mac, plaintext, req) ( \
+ (KCF_PROV_DUAL_CIPHER_MAC_OPS(pd) && \
+ KCF_PROV_DUAL_CIPHER_MAC_OPS(pd)->mac_decrypt_final) ? \
+ KCF_PROV_DUAL_CIPHER_MAC_OPS(pd)->mac_decrypt_final( \
+ ctx, mac, plaintext, req) : \
+ CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_MAC_DECRYPT_ATOMIC(pd, session, mac_mech, mac_key, \
+ decr_mech, decr_key, ciphertext, mac, plaintext, \
+ mac_ctx_template, decr_ctx_template, req) ( \
+ (KCF_PROV_DUAL_CIPHER_MAC_OPS(pd) && \
+ KCF_PROV_DUAL_CIPHER_MAC_OPS(pd)->mac_decrypt_atomic) ? \
+ KCF_PROV_DUAL_CIPHER_MAC_OPS(pd)->mac_decrypt_atomic( \
+ (pd)->pd_prov_handle, session, mac_mech, mac_key, \
+ decr_mech, decr_key, ciphertext, mac, plaintext, \
+ mac_ctx_template, decr_ctx_template, req) : \
+ CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_MAC_VERIFY_DECRYPT_ATOMIC(pd, session, mac_mech, mac_key, \
+ decr_mech, decr_key, ciphertext, mac, plaintext, \
+ mac_ctx_template, decr_ctx_template, req) ( \
+ (KCF_PROV_DUAL_CIPHER_MAC_OPS(pd) && \
+ KCF_PROV_DUAL_CIPHER_MAC_OPS(pd)->mac_verify_decrypt_atomic \
+ != NULL) ? \
+ KCF_PROV_DUAL_CIPHER_MAC_OPS(pd)->mac_verify_decrypt_atomic( \
+ (pd)->pd_prov_handle, session, mac_mech, mac_key, \
+ decr_mech, decr_key, ciphertext, mac, plaintext, \
+ mac_ctx_template, decr_ctx_template, req) : \
+ CRYPTO_NOT_SUPPORTED)
+
+/*
+ * Wrappers for crypto_random_number_ops(9S) entry points.
+ */
+
+#define KCF_PROV_SEED_RANDOM(pd, session, buf, len, est, flags, req) ( \
+ (KCF_PROV_RANDOM_OPS(pd) && KCF_PROV_RANDOM_OPS(pd)->seed_random) ? \
+ KCF_PROV_RANDOM_OPS(pd)->seed_random((pd)->pd_prov_handle, \
+ session, buf, len, est, flags, req) : CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_GENERATE_RANDOM(pd, session, buf, len, req) ( \
+ (KCF_PROV_RANDOM_OPS(pd) && \
+ KCF_PROV_RANDOM_OPS(pd)->generate_random) ? \
+ KCF_PROV_RANDOM_OPS(pd)->generate_random((pd)->pd_prov_handle, \
+ session, buf, len, req) : CRYPTO_NOT_SUPPORTED)
+
+/*
+ * Wrappers for crypto_session_ops(9S) entry points.
+ *
+ * ops_pd is the provider descriptor that supplies the ops_vector.
+ * pd is the descriptor that supplies the provider handle.
+ * Only session open/close needs two handles.
+ */
+
+#define KCF_PROV_SESSION_OPEN(ops_pd, session, req, pd) ( \
+ (KCF_PROV_SESSION_OPS(ops_pd) && \
+ KCF_PROV_SESSION_OPS(ops_pd)->session_open) ? \
+ KCF_PROV_SESSION_OPS(ops_pd)->session_open((pd)->pd_prov_handle, \
+ session, req) : CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_SESSION_CLOSE(ops_pd, session, req, pd) ( \
+ (KCF_PROV_SESSION_OPS(ops_pd) && \
+ KCF_PROV_SESSION_OPS(ops_pd)->session_close) ? \
+ KCF_PROV_SESSION_OPS(ops_pd)->session_close((pd)->pd_prov_handle, \
+ session, req) : CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_SESSION_LOGIN(pd, session, user_type, pin, len, req) ( \
+ (KCF_PROV_SESSION_OPS(pd) && \
+ KCF_PROV_SESSION_OPS(pd)->session_login) ? \
+ KCF_PROV_SESSION_OPS(pd)->session_login((pd)->pd_prov_handle, \
+ session, user_type, pin, len, req) : CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_SESSION_LOGOUT(pd, session, req) ( \
+ (KCF_PROV_SESSION_OPS(pd) && \
+ KCF_PROV_SESSION_OPS(pd)->session_logout) ? \
+ KCF_PROV_SESSION_OPS(pd)->session_logout((pd)->pd_prov_handle, \
+ session, req) : CRYPTO_NOT_SUPPORTED)
+
+/*
+ * Wrappers for crypto_object_ops(9S) entry points.
+ */
+
+#define KCF_PROV_OBJECT_CREATE(pd, session, template, count, object, req) ( \
+ (KCF_PROV_OBJECT_OPS(pd) && KCF_PROV_OBJECT_OPS(pd)->object_create) ? \
+ KCF_PROV_OBJECT_OPS(pd)->object_create((pd)->pd_prov_handle, \
+ session, template, count, object, req) : CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_OBJECT_COPY(pd, session, object, template, count, \
+ new_object, req) ( \
+ (KCF_PROV_OBJECT_OPS(pd) && KCF_PROV_OBJECT_OPS(pd)->object_copy) ? \
+ KCF_PROV_OBJECT_OPS(pd)->object_copy((pd)->pd_prov_handle, \
+ session, object, template, count, new_object, req) : \
+ CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_OBJECT_DESTROY(pd, session, object, req) ( \
+ (KCF_PROV_OBJECT_OPS(pd) && KCF_PROV_OBJECT_OPS(pd)->object_destroy) ? \
+ KCF_PROV_OBJECT_OPS(pd)->object_destroy((pd)->pd_prov_handle, \
+ session, object, req) : CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_OBJECT_GET_SIZE(pd, session, object, size, req) ( \
+ (KCF_PROV_OBJECT_OPS(pd) && \
+ KCF_PROV_OBJECT_OPS(pd)->object_get_size) ? \
+ KCF_PROV_OBJECT_OPS(pd)->object_get_size((pd)->pd_prov_handle, \
+ session, object, size, req) : CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_OBJECT_GET_ATTRIBUTE_VALUE(pd, session, object, template, \
+ count, req) ( \
+ (KCF_PROV_OBJECT_OPS(pd) && \
+ KCF_PROV_OBJECT_OPS(pd)->object_get_attribute_value) ? \
+ KCF_PROV_OBJECT_OPS(pd)->object_get_attribute_value( \
+ (pd)->pd_prov_handle, session, object, template, count, req) : \
+ CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_OBJECT_SET_ATTRIBUTE_VALUE(pd, session, object, template, \
+ count, req) ( \
+ (KCF_PROV_OBJECT_OPS(pd) && \
+ KCF_PROV_OBJECT_OPS(pd)->object_set_attribute_value) ? \
+ KCF_PROV_OBJECT_OPS(pd)->object_set_attribute_value( \
+ (pd)->pd_prov_handle, session, object, template, count, req) : \
+ CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_OBJECT_FIND_INIT(pd, session, template, count, ppriv, \
+ req) ( \
+ (KCF_PROV_OBJECT_OPS(pd) && \
+ KCF_PROV_OBJECT_OPS(pd)->object_find_init) ? \
+ KCF_PROV_OBJECT_OPS(pd)->object_find_init((pd)->pd_prov_handle, \
+ session, template, count, ppriv, req) : CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_OBJECT_FIND(pd, ppriv, objects, max_objects, object_count, \
+ req) ( \
+ (KCF_PROV_OBJECT_OPS(pd) && KCF_PROV_OBJECT_OPS(pd)->object_find) ? \
+ KCF_PROV_OBJECT_OPS(pd)->object_find( \
+ (pd)->pd_prov_handle, ppriv, objects, max_objects, object_count, \
+ req) : CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_OBJECT_FIND_FINAL(pd, ppriv, req) ( \
+ (KCF_PROV_OBJECT_OPS(pd) && \
+ KCF_PROV_OBJECT_OPS(pd)->object_find_final) ? \
+ KCF_PROV_OBJECT_OPS(pd)->object_find_final( \
+ (pd)->pd_prov_handle, ppriv, req) : CRYPTO_NOT_SUPPORTED)
+
+/*
+ * Wrappers for crypto_key_ops(9S) entry points.
+ */
+
+#define KCF_PROV_KEY_GENERATE(pd, session, mech, template, count, object, \
+ req) ( \
+ (KCF_PROV_KEY_OPS(pd) && KCF_PROV_KEY_OPS(pd)->key_generate) ? \
+ KCF_PROV_KEY_OPS(pd)->key_generate((pd)->pd_prov_handle, \
+ session, mech, template, count, object, req) : \
+ CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_KEY_GENERATE_PAIR(pd, session, mech, pub_template, \
+ pub_count, priv_template, priv_count, pub_key, priv_key, req) ( \
+ (KCF_PROV_KEY_OPS(pd) && KCF_PROV_KEY_OPS(pd)->key_generate_pair) ? \
+ KCF_PROV_KEY_OPS(pd)->key_generate_pair((pd)->pd_prov_handle, \
+ session, mech, pub_template, pub_count, priv_template, \
+ priv_count, pub_key, priv_key, req) : \
+ CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_KEY_WRAP(pd, session, mech, wrapping_key, key, wrapped_key, \
+ wrapped_key_len, req) ( \
+ (KCF_PROV_KEY_OPS(pd) && KCF_PROV_KEY_OPS(pd)->key_wrap) ? \
+ KCF_PROV_KEY_OPS(pd)->key_wrap((pd)->pd_prov_handle, \
+ session, mech, wrapping_key, key, wrapped_key, wrapped_key_len, \
+ req) : \
+ CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_KEY_UNWRAP(pd, session, mech, unwrapping_key, wrapped_key, \
+ wrapped_key_len, template, count, key, req) ( \
+ (KCF_PROV_KEY_OPS(pd) && KCF_PROV_KEY_OPS(pd)->key_unwrap) ? \
+ KCF_PROV_KEY_OPS(pd)->key_unwrap((pd)->pd_prov_handle, \
+ session, mech, unwrapping_key, wrapped_key, wrapped_key_len, \
+ template, count, key, req) : \
+ CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_KEY_DERIVE(pd, session, mech, base_key, template, count, \
+ key, req) ( \
+ (KCF_PROV_KEY_OPS(pd) && KCF_PROV_KEY_OPS(pd)->key_derive) ? \
+ KCF_PROV_KEY_OPS(pd)->key_derive((pd)->pd_prov_handle, \
+ session, mech, base_key, template, count, key, req) : \
+ CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_KEY_CHECK(pd, mech, key) ( \
+ (KCF_PROV_KEY_OPS(pd) && KCF_PROV_KEY_OPS(pd)->key_check) ? \
+ KCF_PROV_KEY_OPS(pd)->key_check((pd)->pd_prov_handle, mech, key) : \
+ CRYPTO_NOT_SUPPORTED)
+
+/*
+ * Wrappers for crypto_provider_management_ops(9S) entry points.
+ *
+ * ops_pd is the provider descriptor that supplies the ops_vector.
+ * pd is the descriptor that supplies the provider handle.
+ * Only ext_info needs two handles.
+ */
+
+#define KCF_PROV_EXT_INFO(ops_pd, provext_info, req, pd) ( \
+ (KCF_PROV_PROVIDER_OPS(ops_pd) && \
+ KCF_PROV_PROVIDER_OPS(ops_pd)->ext_info) ? \
+ KCF_PROV_PROVIDER_OPS(ops_pd)->ext_info((pd)->pd_prov_handle, \
+ provext_info, req) : CRYPTO_NOT_SUPPORTED)
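A minimal illustrative sketch of the two-descriptor pattern described in the comment above, assuming hypothetical provider descriptors ops_pd (supplying the ops vector) and real_pd (supplying the provider handle); crypto_provider_ext_info_t is assumed to be the SPI structure that ext_info fills in.

        crypto_provider_ext_info_t ext;  /* SPI ext-info structure (assumed, spi.h) */
        int rv;

        rv = KCF_PROV_EXT_INFO(ops_pd, &ext, NULL, real_pd);
        if (rv == CRYPTO_NOT_SUPPORTED) {
                /* ops_pd exposes no provider_ops or no ext_info entry point */
        }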
+
+#define KCF_PROV_INIT_TOKEN(pd, pin, pin_len, label, req) ( \
+ (KCF_PROV_PROVIDER_OPS(pd) && KCF_PROV_PROVIDER_OPS(pd)->init_token) ? \
+ KCF_PROV_PROVIDER_OPS(pd)->init_token((pd)->pd_prov_handle, \
+ pin, pin_len, label, req) : CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_INIT_PIN(pd, session, pin, pin_len, req) ( \
+ (KCF_PROV_PROVIDER_OPS(pd) && KCF_PROV_PROVIDER_OPS(pd)->init_pin) ? \
+ KCF_PROV_PROVIDER_OPS(pd)->init_pin((pd)->pd_prov_handle, \
+ session, pin, pin_len, req) : CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_SET_PIN(pd, session, old_pin, old_len, new_pin, new_len, \
+ req) ( \
+ (KCF_PROV_PROVIDER_OPS(pd) && KCF_PROV_PROVIDER_OPS(pd)->set_pin) ? \
+ KCF_PROV_PROVIDER_OPS(pd)->set_pin((pd)->pd_prov_handle, \
+ session, old_pin, old_len, new_pin, new_len, req) : \
+ CRYPTO_NOT_SUPPORTED)
+
+/*
+ * Wrappers for crypto_nostore_key_ops(9S) entry points.
+ */
+
+#define KCF_PROV_NOSTORE_KEY_GENERATE(pd, session, mech, template, count, \
+ out_template, out_count, req) ( \
+ (KCF_PROV_NOSTORE_KEY_OPS(pd) && \
+ KCF_PROV_NOSTORE_KEY_OPS(pd)->nostore_key_generate) ? \
+ KCF_PROV_NOSTORE_KEY_OPS(pd)->nostore_key_generate( \
+ (pd)->pd_prov_handle, session, mech, template, count, \
+ out_template, out_count, req) : CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_NOSTORE_KEY_GENERATE_PAIR(pd, session, mech, pub_template, \
+ pub_count, priv_template, priv_count, out_pub_template, \
+ out_pub_count, out_priv_template, out_priv_count, req) ( \
+ (KCF_PROV_NOSTORE_KEY_OPS(pd) && \
+ KCF_PROV_NOSTORE_KEY_OPS(pd)->nostore_key_generate_pair) ? \
+ KCF_PROV_NOSTORE_KEY_OPS(pd)->nostore_key_generate_pair( \
+ (pd)->pd_prov_handle, session, mech, pub_template, pub_count, \
+ priv_template, priv_count, out_pub_template, out_pub_count, \
+ out_priv_template, out_priv_count, req) : CRYPTO_NOT_SUPPORTED)
+
+#define KCF_PROV_NOSTORE_KEY_DERIVE(pd, session, mech, base_key, template, \
+ count, out_template, out_count, req) ( \
+ (KCF_PROV_NOSTORE_KEY_OPS(pd) && \
+ KCF_PROV_NOSTORE_KEY_OPS(pd)->nostore_key_derive) ? \
+ KCF_PROV_NOSTORE_KEY_OPS(pd)->nostore_key_derive( \
+ (pd)->pd_prov_handle, session, mech, base_key, template, count, \
+ out_template, out_count, req) : CRYPTO_NOT_SUPPORTED)
+
+/*
+ * The following routines are exported by the kcf module (/kernel/misc/kcf)
+ * to the crypto and cryptoadmin modules.
+ */
+
+/* Single-part digest/mac/cipher entry points that take a crypto context */
+extern int crypto_digest_single(crypto_context_t, crypto_data_t *,
+ crypto_data_t *, crypto_call_req_t *);
+
+extern int crypto_mac_single(crypto_context_t, crypto_data_t *,
+ crypto_data_t *, crypto_call_req_t *);
+
+extern int crypto_encrypt_single(crypto_context_t, crypto_data_t *,
+ crypto_data_t *, crypto_call_req_t *);
+
+extern int crypto_decrypt_single(crypto_context_t, crypto_data_t *,
+ crypto_data_t *, crypto_call_req_t *);
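A hedged sketch of driving one of the single-part entry points declared above: the context is assumed to come from crypto_digest_init() (declared in sys/crypto/api.h, not here), and the raw cd_raw/iovec layout of crypto_data_t is assumed from sys/crypto/common.h.

        static int
        digest_one_buffer(crypto_context_t ctx, void *buf, size_t len,
            uchar_t *md, size_t mdlen)
        {
                crypto_data_t in, out;

                in.cd_format = CRYPTO_DATA_RAW;
                in.cd_offset = 0;
                in.cd_length = len;
                in.cd_raw.iov_base = (char *)buf;
                in.cd_raw.iov_len = len;

                out.cd_format = CRYPTO_DATA_RAW;
                out.cd_offset = 0;
                out.cd_length = mdlen;
                out.cd_raw.iov_base = (char *)md;
                out.cd_raw.iov_len = mdlen;

                /*
                 * NULL call_req requests synchronous completion; the
                 * single-part call is assumed to finish the context.
                 */
                return (crypto_digest_single(ctx, &in, &out, NULL));
        }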
+
+
+/* Other private digest/mac/cipher entry points not exported through k-API */
+extern int crypto_digest_key_prov(crypto_context_t, crypto_key_t *,
+ crypto_call_req_t *);
+
+/* Private sign entry points exported by KCF */
+extern int crypto_sign_single(crypto_context_t, crypto_data_t *,
+ crypto_data_t *, crypto_call_req_t *);
+
+extern int crypto_sign_recover_single(crypto_context_t, crypto_data_t *,
+ crypto_data_t *, crypto_call_req_t *);
+
+/* Private verify entry points exported by KCF */
+extern int crypto_verify_single(crypto_context_t, crypto_data_t *,
+ crypto_data_t *, crypto_call_req_t *);
+
+extern int crypto_verify_recover_single(crypto_context_t, crypto_data_t *,
+ crypto_data_t *, crypto_call_req_t *);
+
+/* Private dual operations entry points exported by KCF */
+extern int crypto_digest_encrypt_update(crypto_context_t, crypto_context_t,
+ crypto_data_t *, crypto_data_t *, crypto_call_req_t *);
+extern int crypto_decrypt_digest_update(crypto_context_t, crypto_context_t,
+ crypto_data_t *, crypto_data_t *, crypto_call_req_t *);
+extern int crypto_sign_encrypt_update(crypto_context_t, crypto_context_t,
+ crypto_data_t *, crypto_data_t *, crypto_call_req_t *);
+extern int crypto_decrypt_verify_update(crypto_context_t, crypto_context_t,
+ crypto_data_t *, crypto_data_t *, crypto_call_req_t *);
+
+/* Random Number Generation */
+int crypto_seed_random(crypto_provider_handle_t provider, uchar_t *buf,
+ size_t len, crypto_call_req_t *req);
+int crypto_generate_random(crypto_provider_handle_t provider, uchar_t *buf,
+ size_t len, crypto_call_req_t *req);
+
+/* Provider Management */
+int crypto_get_provider_info(crypto_provider_id_t id,
+ crypto_provider_info_t **info, crypto_call_req_t *req);
+int crypto_get_provider_mechanisms(crypto_minor_t *, crypto_provider_id_t id,
+ uint_t *count, crypto_mech_name_t **list);
+int crypto_init_token(crypto_provider_handle_t provider, char *pin,
+ size_t pin_len, char *label, crypto_call_req_t *);
+int crypto_init_pin(crypto_provider_handle_t provider, char *pin,
+ size_t pin_len, crypto_call_req_t *req);
+int crypto_set_pin(crypto_provider_handle_t provider, char *old_pin,
+ size_t old_len, char *new_pin, size_t new_len, crypto_call_req_t *req);
+void crypto_free_provider_list(crypto_provider_entry_t *list, uint_t count);
+void crypto_free_provider_info(crypto_provider_info_t *info);
+
+/* Administrative */
+int crypto_get_dev_list(uint_t *count, crypto_dev_list_entry_t **list);
+int crypto_get_soft_list(uint_t *count, char **list, size_t *len);
+int crypto_get_dev_info(char *name, uint_t instance, uint_t *count,
+ crypto_mech_name_t **list);
+int crypto_get_soft_info(caddr_t name, uint_t *count,
+ crypto_mech_name_t **list);
+int crypto_load_dev_disabled(char *name, uint_t instance, uint_t count,
+ crypto_mech_name_t *list);
+int crypto_load_soft_disabled(caddr_t name, uint_t count,
+ crypto_mech_name_t *list);
+int crypto_unload_soft_module(caddr_t path);
+int crypto_load_soft_config(caddr_t name, uint_t count,
+ crypto_mech_name_t *list);
+int crypto_load_door(uint_t did);
+void crypto_free_mech_list(crypto_mech_name_t *list, uint_t count);
+void crypto_free_dev_list(crypto_dev_list_entry_t *list, uint_t count);
+
+/* Miscellaneous */
+int crypto_get_mechanism_number(caddr_t name, crypto_mech_type_t *number);
+int crypto_get_function_list(crypto_provider_id_t id,
+ crypto_function_list_t **list, int kmflag);
+void crypto_free_function_list(crypto_function_list_t *list);
+int crypto_build_permitted_mech_names(kcf_provider_desc_t *,
+ crypto_mech_name_t **, uint_t *, int);
+extern void kcf_destroy_mech_tabs(void);
+extern void kcf_init_mech_tabs(void);
+extern int kcf_add_mech_provider(short, kcf_provider_desc_t *,
+ kcf_prov_mech_desc_t **);
+extern void kcf_remove_mech_provider(char *, kcf_provider_desc_t *);
+extern int kcf_get_mech_entry(crypto_mech_type_t, kcf_mech_entry_t **);
+extern kcf_provider_desc_t *kcf_alloc_provider_desc(crypto_provider_info_t *);
+extern void kcf_provider_zero_refcnt(kcf_provider_desc_t *);
+extern void kcf_free_provider_desc(kcf_provider_desc_t *);
+extern void kcf_soft_config_init(void);
+extern int get_sw_provider_for_mech(crypto_mech_name_t, char **);
+extern crypto_mech_type_t crypto_mech2id_common(char *, boolean_t);
+extern void undo_register_provider(kcf_provider_desc_t *, boolean_t);
+extern void redo_register_provider(kcf_provider_desc_t *);
+extern void kcf_rnd_init(void);
+extern boolean_t kcf_rngprov_check(void);
+extern int kcf_rnd_get_pseudo_bytes(uint8_t *, size_t);
+extern int kcf_rnd_get_bytes(uint8_t *, size_t, boolean_t, boolean_t);
+extern int random_add_pseudo_entropy(uint8_t *, size_t, uint_t);
+extern void kcf_rnd_schedule_timeout(boolean_t);
+extern int crypto_uio_data(crypto_data_t *, uchar_t *, int, cmd_type_t,
+ void *, void (*update)(void));
+extern int crypto_mblk_data(crypto_data_t *, uchar_t *, int, cmd_type_t,
+ void *, void (*update)(void));
+extern int crypto_put_output_data(uchar_t *, crypto_data_t *, int);
+extern int crypto_get_input_data(crypto_data_t *, uchar_t **, uchar_t *);
+extern int crypto_copy_key_to_ctx(crypto_key_t *, crypto_key_t **, size_t *,
+ int kmflag);
+extern int crypto_digest_data(crypto_data_t *, void *, uchar_t *,
+ void (*update)(void), void (*final)(void), uchar_t);
+extern int crypto_update_iov(void *, crypto_data_t *, crypto_data_t *,
+ int (*cipher)(void *, caddr_t, size_t, crypto_data_t *),
+ void (*copy_block)(uint8_t *, uint64_t *));
+extern int crypto_update_uio(void *, crypto_data_t *, crypto_data_t *,
+ int (*cipher)(void *, caddr_t, size_t, crypto_data_t *),
+ void (*copy_block)(uint8_t *, uint64_t *));
+extern int crypto_update_mp(void *, crypto_data_t *, crypto_data_t *,
+ int (*cipher)(void *, caddr_t, size_t, crypto_data_t *),
+ void (*copy_block)(uint8_t *, uint64_t *));
+extern int crypto_get_key_attr(crypto_key_t *, crypto_attr_type_t, uchar_t **,
+ ssize_t *);
+
+/* Access to the provider's table */
+extern void kcf_prov_tab_destroy(void);
+extern void kcf_prov_tab_init(void);
+extern int kcf_prov_tab_add_provider(kcf_provider_desc_t *);
+extern int kcf_prov_tab_rem_provider(crypto_provider_id_t);
+extern kcf_provider_desc_t *kcf_prov_tab_lookup_by_name(char *);
+extern kcf_provider_desc_t *kcf_prov_tab_lookup_by_dev(char *, uint_t);
+extern int kcf_get_hw_prov_tab(uint_t *, kcf_provider_desc_t ***, int,
+ char *, uint_t, boolean_t);
+extern int kcf_get_slot_list(uint_t *, kcf_provider_desc_t ***, boolean_t);
+extern void kcf_free_provider_tab(uint_t, kcf_provider_desc_t **);
+extern kcf_provider_desc_t *kcf_prov_tab_lookup(crypto_provider_id_t);
+extern int kcf_get_sw_prov(crypto_mech_type_t, kcf_provider_desc_t **,
+ kcf_mech_entry_t **, boolean_t);
+
+/* Access to the policy table */
+extern boolean_t is_mech_disabled(kcf_provider_desc_t *, crypto_mech_name_t);
+extern boolean_t is_mech_disabled_byname(crypto_provider_type_t, char *,
+ uint_t, crypto_mech_name_t);
+extern void kcf_policy_tab_init(void);
+extern void kcf_policy_free_desc(kcf_policy_desc_t *);
+extern void kcf_policy_remove_by_name(char *, uint_t *, crypto_mech_name_t **);
+extern void kcf_policy_remove_by_dev(char *, uint_t, uint_t *,
+ crypto_mech_name_t **);
+extern kcf_policy_desc_t *kcf_policy_lookup_by_name(char *);
+extern kcf_policy_desc_t *kcf_policy_lookup_by_dev(char *, uint_t);
+extern int kcf_policy_load_soft_disabled(char *, uint_t, crypto_mech_name_t *,
+ uint_t *, crypto_mech_name_t **);
+extern int kcf_policy_load_dev_disabled(char *, uint_t, uint_t,
+ crypto_mech_name_t *, uint_t *, crypto_mech_name_t **);
+extern boolean_t in_soft_config_list(char *);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_CRYPTO_IMPL_H */
diff --git a/sys/contrib/openzfs/module/icp/include/sys/crypto/ioctl.h b/sys/contrib/openzfs/module/icp/include/sys/crypto/ioctl.h
new file mode 100644
index 000000000000..6e371e343945
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/include/sys/crypto/ioctl.h
@@ -0,0 +1,1480 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_CRYPTO_IOCTL_H
+#define _SYS_CRYPTO_IOCTL_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <sys/zfs_context.h>
+#include <sys/crypto/api.h>
+#include <sys/crypto/spi.h>
+#include <sys/crypto/common.h>
+
+#define CRYPTO_MAX_ATTRIBUTE_COUNT 128
+
+#define CRYPTO_IOFLAGS_RW_SESSION 0x00000001
+
+#define CRYPTO(x) (('y' << 8) | (x))
+
+#define MAX_NUM_THRESHOLD 7
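A quick worked example of the command encoding above, using a value defined later in this header: since 'y' is 0x79,

        CRYPTO(20) == ('y' << 8) | 20 == (0x79 << 8) | 0x14 == 0x7914

so CRYPTO_GET_FUNCTION_LIST, defined below as CRYPTO(20), is ioctl command number 0x7914.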
+
+/* the PKCS11 Mechanisms */
+#define CKM_RC4 0x00000111
+#define CKM_DES3_ECB 0x00000132
+#define CKM_DES3_CBC 0x00000133
+#define CKM_MD5 0x00000210
+#define CKM_SHA_1 0x00000220
+#define CKM_AES_ECB 0x00001081
+#define CKM_AES_CBC 0x00001082
+
+/*
+ * General Purpose Ioctls
+ */
+
+typedef struct fl_mechs_threshold {
+ int mech_type;
+ uint32_t mech_threshold;
+} fl_mechs_threshold_t;
+
+typedef struct crypto_function_list {
+ boolean_t fl_digest_init;
+ boolean_t fl_digest;
+ boolean_t fl_digest_update;
+ boolean_t fl_digest_key;
+ boolean_t fl_digest_final;
+
+ boolean_t fl_encrypt_init;
+ boolean_t fl_encrypt;
+ boolean_t fl_encrypt_update;
+ boolean_t fl_encrypt_final;
+
+ boolean_t fl_decrypt_init;
+ boolean_t fl_decrypt;
+ boolean_t fl_decrypt_update;
+ boolean_t fl_decrypt_final;
+
+ boolean_t fl_mac_init;
+ boolean_t fl_mac;
+ boolean_t fl_mac_update;
+ boolean_t fl_mac_final;
+
+ boolean_t fl_sign_init;
+ boolean_t fl_sign;
+ boolean_t fl_sign_update;
+ boolean_t fl_sign_final;
+ boolean_t fl_sign_recover_init;
+ boolean_t fl_sign_recover;
+
+ boolean_t fl_verify_init;
+ boolean_t fl_verify;
+ boolean_t fl_verify_update;
+ boolean_t fl_verify_final;
+ boolean_t fl_verify_recover_init;
+ boolean_t fl_verify_recover;
+
+ boolean_t fl_digest_encrypt_update;
+ boolean_t fl_decrypt_digest_update;
+ boolean_t fl_sign_encrypt_update;
+ boolean_t fl_decrypt_verify_update;
+
+ boolean_t fl_seed_random;
+ boolean_t fl_generate_random;
+
+ boolean_t fl_session_open;
+ boolean_t fl_session_close;
+ boolean_t fl_session_login;
+ boolean_t fl_session_logout;
+
+ boolean_t fl_object_create;
+ boolean_t fl_object_copy;
+ boolean_t fl_object_destroy;
+ boolean_t fl_object_get_size;
+ boolean_t fl_object_get_attribute_value;
+ boolean_t fl_object_set_attribute_value;
+ boolean_t fl_object_find_init;
+ boolean_t fl_object_find;
+ boolean_t fl_object_find_final;
+
+ boolean_t fl_key_generate;
+ boolean_t fl_key_generate_pair;
+ boolean_t fl_key_wrap;
+ boolean_t fl_key_unwrap;
+ boolean_t fl_key_derive;
+
+ boolean_t fl_init_token;
+ boolean_t fl_init_pin;
+ boolean_t fl_set_pin;
+
+ boolean_t prov_is_limited;
+ uint32_t prov_hash_threshold;
+ uint32_t prov_hash_limit;
+
+ int total_threshold_count;
+ fl_mechs_threshold_t fl_threshold[MAX_NUM_THRESHOLD];
+} crypto_function_list_t;
+
+typedef struct crypto_get_function_list {
+ uint_t fl_return_value;
+ crypto_provider_id_t fl_provider_id;
+ crypto_function_list_t fl_list;
+} crypto_get_function_list_t;
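A hedged user-space sketch of driving this request structure through the CRYPTO_GET_FUNCTION_LIST command defined below; the "/dev/crypto" device path is an assumption for illustration, and CRYPTO_SUCCESS comes from sys/crypto/common.h.

        #include <fcntl.h>
        #include <unistd.h>
        #include <sys/ioctl.h>
        #include <sys/crypto/ioctl.h>

        static int
        provider_has_digest(crypto_provider_id_t id)
        {
                crypto_get_function_list_t fl = { 0 };
                int fd, ok = 0;

                fd = open("/dev/crypto", O_RDWR);  /* device path assumed */
                if (fd < 0)
                        return (0);
                fl.fl_provider_id = id;
                if (ioctl(fd, CRYPTO_GET_FUNCTION_LIST, &fl) == 0 &&
                    fl.fl_return_value == CRYPTO_SUCCESS)
                        ok = (fl.fl_list.fl_digest == B_TRUE);
                (void) close(fd);
                return (ok);
        }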
+
+typedef struct crypto_get_mechanism_number {
+ uint_t pn_return_value;
+ caddr_t pn_mechanism_string;
+ size_t pn_mechanism_len;
+ crypto_mech_type_t pn_internal_number;
+} crypto_get_mechanism_number_t;
+
+#ifdef _KERNEL
+#ifdef _SYSCALL32
+
+#if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4
+#pragma pack(4)
+#endif
+
+typedef struct crypto_get_mechanism_number32 {
+ uint32_t pn_return_value;
+ caddr32_t pn_mechanism_string;
+ size32_t pn_mechanism_len;
+ crypto_mech_type_t pn_internal_number;
+} crypto_get_mechanism_number32_t;
+
+#if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4
+#pragma pack()
+#endif
+
+#endif /* _SYSCALL32 */
+#endif /* _KERNEL */
+
+#define CRYPTO_GET_FUNCTION_LIST CRYPTO(20)
+#define CRYPTO_GET_MECHANISM_NUMBER CRYPTO(21)
+
+/*
+ * Session Ioctls
+ */
+
+typedef uint32_t crypto_flags_t;
+
+typedef struct crypto_open_session {
+ uint_t os_return_value;
+ crypto_session_id_t os_session;
+ crypto_flags_t os_flags;
+ crypto_provider_id_t os_provider_id;
+} crypto_open_session_t;
+
+typedef struct crypto_close_session {
+ uint_t cs_return_value;
+ crypto_session_id_t cs_session;
+} crypto_close_session_t;
+
+typedef struct crypto_close_all_sessions {
+ uint_t as_return_value;
+ crypto_provider_id_t as_provider_id;
+} crypto_close_all_sessions_t;
+
+#define CRYPTO_OPEN_SESSION CRYPTO(30)
+#define CRYPTO_CLOSE_SESSION CRYPTO(31)
+#define CRYPTO_CLOSE_ALL_SESSIONS CRYPTO(32)
+
+/*
+ * Login Ioctls
+ */
+typedef struct crypto_login {
+ uint_t co_return_value;
+ crypto_session_id_t co_session;
+ uint_t co_user_type;
+ uint_t co_pin_len;
+ caddr_t co_pin;
+} crypto_login_t;
+
+typedef struct crypto_logout {
+ uint_t cl_return_value;
+ crypto_session_id_t cl_session;
+} crypto_logout_t;
+
+#ifdef _KERNEL
+#ifdef _SYSCALL32
+
+typedef struct crypto_login32 {
+ uint32_t co_return_value;
+ crypto_session_id_t co_session;
+ uint32_t co_user_type;
+ uint32_t co_pin_len;
+ caddr32_t co_pin;
+} crypto_login32_t;
+
+typedef struct crypto_logout32 {
+ uint32_t cl_return_value;
+ crypto_session_id_t cl_session;
+} crypto_logout32_t;
+
+#endif /* _SYSCALL32 */
+#endif /* _KERNEL */
+
+#define CRYPTO_LOGIN CRYPTO(40)
+#define CRYPTO_LOGOUT CRYPTO(41)
+
+/*
+ * Cryptographic Ioctls
+ */
+typedef struct crypto_encrypt {
+ uint_t ce_return_value;
+ crypto_session_id_t ce_session;
+ size_t ce_datalen;
+ caddr_t ce_databuf;
+ size_t ce_encrlen;
+ caddr_t ce_encrbuf;
+ uint_t ce_flags;
+} crypto_encrypt_t;
+
+typedef struct crypto_encrypt_init {
+ uint_t ei_return_value;
+ crypto_session_id_t ei_session;
+ crypto_mechanism_t ei_mech;
+ crypto_key_t ei_key;
+} crypto_encrypt_init_t;
+
+typedef struct crypto_encrypt_update {
+ uint_t eu_return_value;
+ crypto_session_id_t eu_session;
+ size_t eu_datalen;
+ caddr_t eu_databuf;
+ size_t eu_encrlen;
+ caddr_t eu_encrbuf;
+} crypto_encrypt_update_t;
+
+typedef struct crypto_encrypt_final {
+ uint_t ef_return_value;
+ crypto_session_id_t ef_session;
+ size_t ef_encrlen;
+ caddr_t ef_encrbuf;
+} crypto_encrypt_final_t;
+
+typedef struct crypto_decrypt {
+ uint_t cd_return_value;
+ crypto_session_id_t cd_session;
+ size_t cd_encrlen;
+ caddr_t cd_encrbuf;
+ size_t cd_datalen;
+ caddr_t cd_databuf;
+ uint_t cd_flags;
+} crypto_decrypt_t;
+
+typedef struct crypto_decrypt_init {
+ uint_t di_return_value;
+ crypto_session_id_t di_session;
+ crypto_mechanism_t di_mech;
+ crypto_key_t di_key;
+} crypto_decrypt_init_t;
+
+typedef struct crypto_decrypt_update {
+ uint_t du_return_value;
+ crypto_session_id_t du_session;
+ size_t du_encrlen;
+ caddr_t du_encrbuf;
+ size_t du_datalen;
+ caddr_t du_databuf;
+} crypto_decrypt_update_t;
+
+typedef struct crypto_decrypt_final {
+ uint_t df_return_value;
+ crypto_session_id_t df_session;
+ size_t df_datalen;
+ caddr_t df_databuf;
+} crypto_decrypt_final_t;
+
+typedef struct crypto_digest {
+ uint_t cd_return_value;
+ crypto_session_id_t cd_session;
+ size_t cd_datalen;
+ caddr_t cd_databuf;
+ size_t cd_digestlen;
+ caddr_t cd_digestbuf;
+} crypto_digest_t;
+
+typedef struct crypto_digest_init {
+ uint_t di_return_value;
+ crypto_session_id_t di_session;
+ crypto_mechanism_t di_mech;
+} crypto_digest_init_t;
+
+typedef struct crypto_digest_update {
+ uint_t du_return_value;
+ crypto_session_id_t du_session;
+ size_t du_datalen;
+ caddr_t du_databuf;
+} crypto_digest_update_t;
+
+typedef struct crypto_digest_key {
+ uint_t dk_return_value;
+ crypto_session_id_t dk_session;
+ crypto_key_t dk_key;
+} crypto_digest_key_t;
+
+typedef struct crypto_digest_final {
+ uint_t df_return_value;
+ crypto_session_id_t df_session;
+ size_t df_digestlen;
+ caddr_t df_digestbuf;
+} crypto_digest_final_t;
+
+typedef struct crypto_mac {
+ uint_t cm_return_value;
+ crypto_session_id_t cm_session;
+ size_t cm_datalen;
+ caddr_t cm_databuf;
+ size_t cm_maclen;
+ caddr_t cm_macbuf;
+} crypto_mac_t;
+
+typedef struct crypto_mac_init {
+ uint_t mi_return_value;
+ crypto_session_id_t mi_session;
+ crypto_mechanism_t mi_mech;
+ crypto_key_t mi_key;
+} crypto_mac_init_t;
+
+typedef struct crypto_mac_update {
+ uint_t mu_return_value;
+ crypto_session_id_t mu_session;
+ size_t mu_datalen;
+ caddr_t mu_databuf;
+} crypto_mac_update_t;
+
+typedef struct crypto_mac_final {
+ uint_t mf_return_value;
+ crypto_session_id_t mf_session;
+ size_t mf_maclen;
+ caddr_t mf_macbuf;
+} crypto_mac_final_t;
+
+typedef struct crypto_sign {
+ uint_t cs_return_value;
+ crypto_session_id_t cs_session;
+ size_t cs_datalen;
+ caddr_t cs_databuf;
+ size_t cs_signlen;
+ caddr_t cs_signbuf;
+} crypto_sign_t;
+
+typedef struct crypto_sign_init {
+ uint_t si_return_value;
+ crypto_session_id_t si_session;
+ crypto_mechanism_t si_mech;
+ crypto_key_t si_key;
+} crypto_sign_init_t;
+
+typedef struct crypto_sign_update {
+ uint_t su_return_value;
+ crypto_session_id_t su_session;
+ size_t su_datalen;
+ caddr_t su_databuf;
+} crypto_sign_update_t;
+
+typedef struct crypto_sign_final {
+ uint_t sf_return_value;
+ crypto_session_id_t sf_session;
+ size_t sf_signlen;
+ caddr_t sf_signbuf;
+} crypto_sign_final_t;
+
+typedef struct crypto_sign_recover_init {
+ uint_t ri_return_value;
+ crypto_session_id_t ri_session;
+ crypto_mechanism_t ri_mech;
+ crypto_key_t ri_key;
+} crypto_sign_recover_init_t;
+
+typedef struct crypto_sign_recover {
+ uint_t sr_return_value;
+ crypto_session_id_t sr_session;
+ size_t sr_datalen;
+ caddr_t sr_databuf;
+ size_t sr_signlen;
+ caddr_t sr_signbuf;
+} crypto_sign_recover_t;
+
+typedef struct crypto_verify {
+ uint_t cv_return_value;
+ crypto_session_id_t cv_session;
+ size_t cv_datalen;
+ caddr_t cv_databuf;
+ size_t cv_signlen;
+ caddr_t cv_signbuf;
+} crypto_verify_t;
+
+typedef struct crypto_verify_init {
+ uint_t vi_return_value;
+ crypto_session_id_t vi_session;
+ crypto_mechanism_t vi_mech;
+ crypto_key_t vi_key;
+} crypto_verify_init_t;
+
+typedef struct crypto_verify_update {
+ uint_t vu_return_value;
+ crypto_session_id_t vu_session;
+ size_t vu_datalen;
+ caddr_t vu_databuf;
+} crypto_verify_update_t;
+
+typedef struct crypto_verify_final {
+ uint_t vf_return_value;
+ crypto_session_id_t vf_session;
+ size_t vf_signlen;
+ caddr_t vf_signbuf;
+} crypto_verify_final_t;
+
+typedef struct crypto_verify_recover_init {
+ uint_t ri_return_value;
+ crypto_session_id_t ri_session;
+ crypto_mechanism_t ri_mech;
+ crypto_key_t ri_key;
+} crypto_verify_recover_init_t;
+
+typedef struct crypto_verify_recover {
+ uint_t vr_return_value;
+ crypto_session_id_t vr_session;
+ size_t vr_signlen;
+ caddr_t vr_signbuf;
+ size_t vr_datalen;
+ caddr_t vr_databuf;
+} crypto_verify_recover_t;
+
+typedef struct crypto_digest_encrypt_update {
+ uint_t eu_return_value;
+ crypto_session_id_t eu_session;
+ size_t eu_datalen;
+ caddr_t eu_databuf;
+ size_t eu_encrlen;
+ caddr_t eu_encrbuf;
+} crypto_digest_encrypt_update_t;
+
+typedef struct crypto_decrypt_digest_update {
+ uint_t du_return_value;
+ crypto_session_id_t du_session;
+ size_t du_encrlen;
+ caddr_t du_encrbuf;
+ size_t du_datalen;
+ caddr_t du_databuf;
+} crypto_decrypt_digest_update_t;
+
+typedef struct crypto_sign_encrypt_update {
+ uint_t eu_return_value;
+ crypto_session_id_t eu_session;
+ size_t eu_datalen;
+ caddr_t eu_databuf;
+ size_t eu_encrlen;
+ caddr_t eu_encrbuf;
+} crypto_sign_encrypt_update_t;
+
+typedef struct crypto_decrypt_verify_update {
+ uint_t vu_return_value;
+ crypto_session_id_t vu_session;
+ size_t vu_encrlen;
+ caddr_t vu_encrbuf;
+ size_t vu_datalen;
+ caddr_t vu_databuf;
+} crypto_decrypt_verify_update_t;
+
+#ifdef _KERNEL
+#ifdef _SYSCALL32
+
+typedef struct crypto_encrypt32 {
+ uint32_t ce_return_value;
+ crypto_session_id_t ce_session;
+ size32_t ce_datalen;
+ caddr32_t ce_databuf;
+ size32_t ce_encrlen;
+ caddr32_t ce_encrbuf;
+ uint32_t ce_flags;
+} crypto_encrypt32_t;
+
+#if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4
+#pragma pack(4)
+#endif
+
+typedef struct crypto_encrypt_init32 {
+ uint32_t ei_return_value;
+ crypto_session_id_t ei_session;
+ crypto_mechanism32_t ei_mech;
+ crypto_key32_t ei_key;
+} crypto_encrypt_init32_t;
+
+#if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4
+#pragma pack()
+#endif
+
+typedef struct crypto_encrypt_update32 {
+ uint32_t eu_return_value;
+ crypto_session_id_t eu_session;
+ size32_t eu_datalen;
+ caddr32_t eu_databuf;
+ size32_t eu_encrlen;
+ caddr32_t eu_encrbuf;
+} crypto_encrypt_update32_t;
+
+typedef struct crypto_encrypt_final32 {
+ uint32_t ef_return_value;
+ crypto_session_id_t ef_session;
+ size32_t ef_encrlen;
+ caddr32_t ef_encrbuf;
+} crypto_encrypt_final32_t;
+
+typedef struct crypto_decrypt32 {
+ uint32_t cd_return_value;
+ crypto_session_id_t cd_session;
+ size32_t cd_encrlen;
+ caddr32_t cd_encrbuf;
+ size32_t cd_datalen;
+ caddr32_t cd_databuf;
+ uint32_t cd_flags;
+} crypto_decrypt32_t;
+
+#if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4
+#pragma pack(4)
+#endif
+
+typedef struct crypto_decrypt_init32 {
+ uint32_t di_return_value;
+ crypto_session_id_t di_session;
+ crypto_mechanism32_t di_mech;
+ crypto_key32_t di_key;
+} crypto_decrypt_init32_t;
+
+#if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4
+#pragma pack()
+#endif
+
+typedef struct crypto_decrypt_update32 {
+ uint32_t du_return_value;
+ crypto_session_id_t du_session;
+ size32_t du_encrlen;
+ caddr32_t du_encrbuf;
+ size32_t du_datalen;
+ caddr32_t du_databuf;
+} crypto_decrypt_update32_t;
+
+typedef struct crypto_decrypt_final32 {
+ uint32_t df_return_value;
+ crypto_session_id_t df_session;
+ size32_t df_datalen;
+ caddr32_t df_databuf;
+} crypto_decrypt_final32_t;
+
+typedef struct crypto_digest32 {
+ uint32_t cd_return_value;
+ crypto_session_id_t cd_session;
+ size32_t cd_datalen;
+ caddr32_t cd_databuf;
+ size32_t cd_digestlen;
+ caddr32_t cd_digestbuf;
+} crypto_digest32_t;
+
+typedef struct crypto_digest_init32 {
+ uint32_t di_return_value;
+ crypto_session_id_t di_session;
+ crypto_mechanism32_t di_mech;
+} crypto_digest_init32_t;
+
+typedef struct crypto_digest_update32 {
+ uint32_t du_return_value;
+ crypto_session_id_t du_session;
+ size32_t du_datalen;
+ caddr32_t du_databuf;
+} crypto_digest_update32_t;
+
+typedef struct crypto_digest_key32 {
+ uint32_t dk_return_value;
+ crypto_session_id_t dk_session;
+ crypto_key32_t dk_key;
+} crypto_digest_key32_t;
+
+typedef struct crypto_digest_final32 {
+ uint32_t df_return_value;
+ crypto_session_id_t df_session;
+ size32_t df_digestlen;
+ caddr32_t df_digestbuf;
+} crypto_digest_final32_t;
+
+typedef struct crypto_mac32 {
+ uint32_t cm_return_value;
+ crypto_session_id_t cm_session;
+ size32_t cm_datalen;
+ caddr32_t cm_databuf;
+ size32_t cm_maclen;
+ caddr32_t cm_macbuf;
+} crypto_mac32_t;
+
+#if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4
+#pragma pack(4)
+#endif
+
+typedef struct crypto_mac_init32 {
+ uint32_t mi_return_value;
+ crypto_session_id_t mi_session;
+ crypto_mechanism32_t mi_mech;
+ crypto_key32_t mi_key;
+} crypto_mac_init32_t;
+
+#if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4
+#pragma pack()
+#endif
+
+typedef struct crypto_mac_update32 {
+ uint32_t mu_return_value;
+ crypto_session_id_t mu_session;
+ size32_t mu_datalen;
+ caddr32_t mu_databuf;
+} crypto_mac_update32_t;
+
+typedef struct crypto_mac_final32 {
+ uint32_t mf_return_value;
+ crypto_session_id_t mf_session;
+ size32_t mf_maclen;
+ caddr32_t mf_macbuf;
+} crypto_mac_final32_t;
+
+typedef struct crypto_sign32 {
+ uint32_t cs_return_value;
+ crypto_session_id_t cs_session;
+ size32_t cs_datalen;
+ caddr32_t cs_databuf;
+ size32_t cs_signlen;
+ caddr32_t cs_signbuf;
+} crypto_sign32_t;
+
+#if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4
+#pragma pack(4)
+#endif
+
+typedef struct crypto_sign_init32 {
+ uint32_t si_return_value;
+ crypto_session_id_t si_session;
+ crypto_mechanism32_t si_mech;
+ crypto_key32_t si_key;
+} crypto_sign_init32_t;
+
+#if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4
+#pragma pack()
+#endif
+
+typedef struct crypto_sign_update32 {
+ uint32_t su_return_value;
+ crypto_session_id_t su_session;
+ size32_t su_datalen;
+ caddr32_t su_databuf;
+} crypto_sign_update32_t;
+
+typedef struct crypto_sign_final32 {
+ uint32_t sf_return_value;
+ crypto_session_id_t sf_session;
+ size32_t sf_signlen;
+ caddr32_t sf_signbuf;
+} crypto_sign_final32_t;
+
+#if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4
+#pragma pack(4)
+#endif
+
+typedef struct crypto_sign_recover_init32 {
+ uint32_t ri_return_value;
+ crypto_session_id_t ri_session;
+ crypto_mechanism32_t ri_mech;
+ crypto_key32_t ri_key;
+} crypto_sign_recover_init32_t;
+
+#if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4
+#pragma pack()
+#endif
+
+typedef struct crypto_sign_recover32 {
+ uint32_t sr_return_value;
+ crypto_session_id_t sr_session;
+ size32_t sr_datalen;
+ caddr32_t sr_databuf;
+ size32_t sr_signlen;
+ caddr32_t sr_signbuf;
+} crypto_sign_recover32_t;
+
+typedef struct crypto_verify32 {
+ uint32_t cv_return_value;
+ crypto_session_id_t cv_session;
+ size32_t cv_datalen;
+ caddr32_t cv_databuf;
+ size32_t cv_signlen;
+ caddr32_t cv_signbuf;
+} crypto_verify32_t;
+
+#if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4
+#pragma pack(4)
+#endif
+
+typedef struct crypto_verify_init32 {
+ uint32_t vi_return_value;
+ crypto_session_id_t vi_session;
+ crypto_mechanism32_t vi_mech;
+ crypto_key32_t vi_key;
+} crypto_verify_init32_t;
+
+#if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4
+#pragma pack()
+#endif
+
+typedef struct crypto_verify_update32 {
+ uint32_t vu_return_value;
+ crypto_session_id_t vu_session;
+ size32_t vu_datalen;
+ caddr32_t vu_databuf;
+} crypto_verify_update32_t;
+
+typedef struct crypto_verify_final32 {
+ uint32_t vf_return_value;
+ crypto_session_id_t vf_session;
+ size32_t vf_signlen;
+ caddr32_t vf_signbuf;
+} crypto_verify_final32_t;
+
+#if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4
+#pragma pack(4)
+#endif
+
+typedef struct crypto_verify_recover_init32 {
+ uint32_t ri_return_value;
+ crypto_session_id_t ri_session;
+ crypto_mechanism32_t ri_mech;
+ crypto_key32_t ri_key;
+} crypto_verify_recover_init32_t;
+
+#if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4
+#pragma pack()
+#endif
+
+typedef struct crypto_verify_recover32 {
+ uint32_t vr_return_value;
+ crypto_session_id_t vr_session;
+ size32_t vr_signlen;
+ caddr32_t vr_signbuf;
+ size32_t vr_datalen;
+ caddr32_t vr_databuf;
+} crypto_verify_recover32_t;
+
+typedef struct crypto_digest_encrypt_update32 {
+ uint32_t eu_return_value;
+ crypto_session_id_t eu_session;
+ size32_t eu_datalen;
+ caddr32_t eu_databuf;
+ size32_t eu_encrlen;
+ caddr32_t eu_encrbuf;
+} crypto_digest_encrypt_update32_t;
+
+typedef struct crypto_decrypt_digest_update32 {
+ uint32_t du_return_value;
+ crypto_session_id_t du_session;
+ size32_t du_encrlen;
+ caddr32_t du_encrbuf;
+ size32_t du_datalen;
+ caddr32_t du_databuf;
+} crypto_decrypt_digest_update32_t;
+
+typedef struct crypto_sign_encrypt_update32 {
+ uint32_t eu_return_value;
+ crypto_session_id_t eu_session;
+ size32_t eu_datalen;
+ caddr32_t eu_databuf;
+ size32_t eu_encrlen;
+ caddr32_t eu_encrbuf;
+} crypto_sign_encrypt_update32_t;
+
+typedef struct crypto_decrypt_verify_update32 {
+ uint32_t vu_return_value;
+ crypto_session_id_t vu_session;
+ size32_t vu_encrlen;
+ caddr32_t vu_encrbuf;
+ size32_t vu_datalen;
+ caddr32_t vu_databuf;
+} crypto_decrypt_verify_update32_t;
+
+#endif /* _SYSCALL32 */
+#endif /* _KERNEL */
+
+#define CRYPTO_ENCRYPT CRYPTO(50)
+#define CRYPTO_ENCRYPT_INIT CRYPTO(51)
+#define CRYPTO_ENCRYPT_UPDATE CRYPTO(52)
+#define CRYPTO_ENCRYPT_FINAL CRYPTO(53)
+#define CRYPTO_DECRYPT CRYPTO(54)
+#define CRYPTO_DECRYPT_INIT CRYPTO(55)
+#define CRYPTO_DECRYPT_UPDATE CRYPTO(56)
+#define CRYPTO_DECRYPT_FINAL CRYPTO(57)
+
+#define CRYPTO_DIGEST CRYPTO(58)
+#define CRYPTO_DIGEST_INIT CRYPTO(59)
+#define CRYPTO_DIGEST_UPDATE CRYPTO(60)
+#define CRYPTO_DIGEST_KEY CRYPTO(61)
+#define CRYPTO_DIGEST_FINAL CRYPTO(62)
+#define CRYPTO_MAC CRYPTO(63)
+#define CRYPTO_MAC_INIT CRYPTO(64)
+#define CRYPTO_MAC_UPDATE CRYPTO(65)
+#define CRYPTO_MAC_FINAL CRYPTO(66)
+
+#define CRYPTO_SIGN CRYPTO(67)
+#define CRYPTO_SIGN_INIT CRYPTO(68)
+#define CRYPTO_SIGN_UPDATE CRYPTO(69)
+#define CRYPTO_SIGN_FINAL CRYPTO(70)
+#define CRYPTO_SIGN_RECOVER_INIT CRYPTO(71)
+#define CRYPTO_SIGN_RECOVER CRYPTO(72)
+#define CRYPTO_VERIFY CRYPTO(73)
+#define CRYPTO_VERIFY_INIT CRYPTO(74)
+#define CRYPTO_VERIFY_UPDATE CRYPTO(75)
+#define CRYPTO_VERIFY_FINAL CRYPTO(76)
+#define CRYPTO_VERIFY_RECOVER_INIT CRYPTO(77)
+#define CRYPTO_VERIFY_RECOVER CRYPTO(78)
+
+#define CRYPTO_DIGEST_ENCRYPT_UPDATE CRYPTO(79)
+#define CRYPTO_DECRYPT_DIGEST_UPDATE CRYPTO(80)
+#define CRYPTO_SIGN_ENCRYPT_UPDATE CRYPTO(81)
+#define CRYPTO_DECRYPT_VERIFY_UPDATE CRYPTO(82)
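A hedged sketch of the multi-part encrypt flow over the commands above. The descriptor fd, session id, mechanism, key, and the pt/ct buffers with their lengths are assumed to come from earlier setup (CRYPTO_OPEN_SESSION and friends), error handling is elided, and the convention that the kernel writes status into *_return_value and updates the output length fields is assumed.

        crypto_encrypt_init_t ei = { 0 };
        crypto_encrypt_update_t eu = { 0 };
        crypto_encrypt_final_t ef = { 0 };

        ei.ei_session = session;        /* from CRYPTO_OPEN_SESSION (assumed) */
        ei.ei_mech = mech;              /* crypto_mechanism_t, assumed set up */
        ei.ei_key = key;                /* crypto_key_t, assumed set up */
        (void) ioctl(fd, CRYPTO_ENCRYPT_INIT, &ei);

        eu.eu_session = session;        /* one chunk shown; repeat per chunk */
        eu.eu_datalen = ptlen;
        eu.eu_databuf = pt;
        eu.eu_encrlen = ctlen;
        eu.eu_encrbuf = ct;
        (void) ioctl(fd, CRYPTO_ENCRYPT_UPDATE, &eu);

        ef.ef_session = session;
        ef.ef_encrlen = ctlen;          /* remaining space, illustrative */
        ef.ef_encrbuf = ct;
        (void) ioctl(fd, CRYPTO_ENCRYPT_FINAL, &ef);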
+
+/*
+ * Random Number Ioctls
+ */
+typedef struct crypto_seed_random {
+ uint_t sr_return_value;
+ crypto_session_id_t sr_session;
+ size_t sr_seedlen;
+ caddr_t sr_seedbuf;
+} crypto_seed_random_t;
+
+typedef struct crypto_generate_random {
+ uint_t gr_return_value;
+ crypto_session_id_t gr_session;
+ caddr_t gr_buf;
+ size_t gr_buflen;
+} crypto_generate_random_t;
+
+#ifdef _KERNEL
+#ifdef _SYSCALL32
+
+typedef struct crypto_seed_random32 {
+ uint32_t sr_return_value;
+ crypto_session_id_t sr_session;
+ size32_t sr_seedlen;
+ caddr32_t sr_seedbuf;
+} crypto_seed_random32_t;
+
+typedef struct crypto_generate_random32 {
+ uint32_t gr_return_value;
+ crypto_session_id_t gr_session;
+ caddr32_t gr_buf;
+ size32_t gr_buflen;
+} crypto_generate_random32_t;
+
+#endif /* _SYSCALL32 */
+#endif /* _KERNEL */
+
+#define CRYPTO_SEED_RANDOM CRYPTO(90)
+#define CRYPTO_GENERATE_RANDOM CRYPTO(91)
+
+/*
+ * Object Management Ioctls
+ */
+typedef struct crypto_object_create {
+ uint_t oc_return_value;
+ crypto_session_id_t oc_session;
+ crypto_object_id_t oc_handle;
+ uint_t oc_count;
+ caddr_t oc_attributes;
+} crypto_object_create_t;
+
+typedef struct crypto_object_copy {
+ uint_t oc_return_value;
+ crypto_session_id_t oc_session;
+ crypto_object_id_t oc_handle;
+ crypto_object_id_t oc_new_handle;
+ uint_t oc_count;
+ caddr_t oc_new_attributes;
+} crypto_object_copy_t;
+
+typedef struct crypto_object_destroy {
+ uint_t od_return_value;
+ crypto_session_id_t od_session;
+ crypto_object_id_t od_handle;
+} crypto_object_destroy_t;
+
+typedef struct crypto_object_get_attribute_value {
+ uint_t og_return_value;
+ crypto_session_id_t og_session;
+ crypto_object_id_t og_handle;
+ uint_t og_count;
+ caddr_t og_attributes;
+} crypto_object_get_attribute_value_t;
+
+typedef struct crypto_object_get_size {
+ uint_t gs_return_value;
+ crypto_session_id_t gs_session;
+ crypto_object_id_t gs_handle;
+ size_t gs_size;
+} crypto_object_get_size_t;
+
+typedef struct crypto_object_set_attribute_value {
+ uint_t sa_return_value;
+ crypto_session_id_t sa_session;
+ crypto_object_id_t sa_handle;
+ uint_t sa_count;
+ caddr_t sa_attributes;
+} crypto_object_set_attribute_value_t;
+
+typedef struct crypto_object_find_init {
+ uint_t fi_return_value;
+ crypto_session_id_t fi_session;
+ uint_t fi_count;
+ caddr_t fi_attributes;
+} crypto_object_find_init_t;
+
+typedef struct crypto_object_find_update {
+ uint_t fu_return_value;
+ crypto_session_id_t fu_session;
+ uint_t fu_max_count;
+ uint_t fu_count;
+ caddr_t fu_handles;
+} crypto_object_find_update_t;
+
+typedef struct crypto_object_find_final {
+ uint_t ff_return_value;
+ crypto_session_id_t ff_session;
+} crypto_object_find_final_t;
+
+#ifdef _KERNEL
+#ifdef _SYSCALL32
+
+typedef struct crypto_object_create32 {
+ uint32_t oc_return_value;
+ crypto_session_id_t oc_session;
+ crypto_object_id_t oc_handle;
+ uint32_t oc_count;
+ caddr32_t oc_attributes;
+} crypto_object_create32_t;
+
+typedef struct crypto_object_copy32 {
+ uint32_t oc_return_value;
+ crypto_session_id_t oc_session;
+ crypto_object_id_t oc_handle;
+ crypto_object_id_t oc_new_handle;
+ uint32_t oc_count;
+ caddr32_t oc_new_attributes;
+} crypto_object_copy32_t;
+
+typedef struct crypto_object_destroy32 {
+ uint32_t od_return_value;
+ crypto_session_id_t od_session;
+ crypto_object_id_t od_handle;
+} crypto_object_destroy32_t;
+
+typedef struct crypto_object_get_attribute_value32 {
+ uint32_t og_return_value;
+ crypto_session_id_t og_session;
+ crypto_object_id_t og_handle;
+ uint32_t og_count;
+ caddr32_t og_attributes;
+} crypto_object_get_attribute_value32_t;
+
+typedef struct crypto_object_get_size32 {
+ uint32_t gs_return_value;
+ crypto_session_id_t gs_session;
+ crypto_object_id_t gs_handle;
+ size32_t gs_size;
+} crypto_object_get_size32_t;
+
+typedef struct crypto_object_set_attribute_value32 {
+ uint32_t sa_return_value;
+ crypto_session_id_t sa_session;
+ crypto_object_id_t sa_handle;
+ uint32_t sa_count;
+ caddr32_t sa_attributes;
+} crypto_object_set_attribute_value32_t;
+
+typedef struct crypto_object_find_init32 {
+ uint32_t fi_return_value;
+ crypto_session_id_t fi_session;
+ uint32_t fi_count;
+ caddr32_t fi_attributes;
+} crypto_object_find_init32_t;
+
+typedef struct crypto_object_find_update32 {
+ uint32_t fu_return_value;
+ crypto_session_id_t fu_session;
+ uint32_t fu_max_count;
+ uint32_t fu_count;
+ caddr32_t fu_handles;
+} crypto_object_find_update32_t;
+
+typedef struct crypto_object_find_final32 {
+ uint32_t ff_return_value;
+ crypto_session_id_t ff_session;
+} crypto_object_find_final32_t;
+
+#endif /* _SYSCALL32 */
+#endif /* _KERNEL */
+
+#define CRYPTO_OBJECT_CREATE CRYPTO(100)
+#define CRYPTO_OBJECT_COPY CRYPTO(101)
+#define CRYPTO_OBJECT_DESTROY CRYPTO(102)
+#define CRYPTO_OBJECT_GET_ATTRIBUTE_VALUE CRYPTO(103)
+#define CRYPTO_OBJECT_GET_SIZE CRYPTO(104)
+#define CRYPTO_OBJECT_SET_ATTRIBUTE_VALUE CRYPTO(105)
+#define CRYPTO_OBJECT_FIND_INIT CRYPTO(106)
+#define CRYPTO_OBJECT_FIND_UPDATE CRYPTO(107)
+#define CRYPTO_OBJECT_FIND_FINAL CRYPTO(108)
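A minimal sketch of the object search flow over the three FIND commands above, assuming an open /dev/crypto descriptor fd and a session id from earlier steps; the empty template semantics and buffer sizing are illustrative assumptions.

        crypto_object_find_init_t fi = { 0 };
        crypto_object_find_update_t fu = { 0 };
        crypto_object_find_final_t ff = { 0 };
        crypto_object_id_t handles[16];         /* illustrative capacity */

        fi.fi_session = session;
        fi.fi_count = 0;                /* empty template: match all (assumed) */
        fi.fi_attributes = NULL;
        (void) ioctl(fd, CRYPTO_OBJECT_FIND_INIT, &fi);

        fu.fu_session = session;
        fu.fu_max_count = 16;
        fu.fu_handles = (caddr_t)handles;
        (void) ioctl(fd, CRYPTO_OBJECT_FIND_UPDATE, &fu);
        /* fu.fu_count now holds the number of handles returned (assumed) */

        ff.ff_session = session;
        (void) ioctl(fd, CRYPTO_OBJECT_FIND_FINAL, &ff);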
+
+/*
+ * Key Generation Ioctls
+ */
+typedef struct crypto_object_generate_key {
+ uint_t gk_return_value;
+ crypto_session_id_t gk_session;
+ crypto_object_id_t gk_handle;
+ crypto_mechanism_t gk_mechanism;
+ uint_t gk_count;
+ caddr_t gk_attributes;
+} crypto_object_generate_key_t;
+
+typedef struct crypto_object_generate_key_pair {
+ uint_t kp_return_value;
+ crypto_session_id_t kp_session;
+ crypto_object_id_t kp_public_handle;
+ crypto_object_id_t kp_private_handle;
+ uint_t kp_public_count;
+ uint_t kp_private_count;
+ caddr_t kp_public_attributes;
+ caddr_t kp_private_attributes;
+ crypto_mechanism_t kp_mechanism;
+} crypto_object_generate_key_pair_t;
+
+typedef struct crypto_object_wrap_key {
+ uint_t wk_return_value;
+ crypto_session_id_t wk_session;
+ crypto_mechanism_t wk_mechanism;
+ crypto_key_t wk_wrapping_key;
+ crypto_object_id_t wk_object_handle;
+ size_t wk_wrapped_key_len;
+ caddr_t wk_wrapped_key;
+} crypto_object_wrap_key_t;
+
+typedef struct crypto_object_unwrap_key {
+ uint_t uk_return_value;
+ crypto_session_id_t uk_session;
+ crypto_mechanism_t uk_mechanism;
+ crypto_key_t uk_unwrapping_key;
+ crypto_object_id_t uk_object_handle;
+ size_t uk_wrapped_key_len;
+ caddr_t uk_wrapped_key;
+ uint_t uk_count;
+ caddr_t uk_attributes;
+} crypto_object_unwrap_key_t;
+
+typedef struct crypto_derive_key {
+ uint_t dk_return_value;
+ crypto_session_id_t dk_session;
+ crypto_mechanism_t dk_mechanism;
+ crypto_key_t dk_base_key;
+ crypto_object_id_t dk_object_handle;
+ uint_t dk_count;
+ caddr_t dk_attributes;
+} crypto_derive_key_t;
+
+#ifdef _KERNEL
+#ifdef _SYSCALL32
+
+#if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4
+#pragma pack(4)
+#endif
+
+typedef struct crypto_object_generate_key32 {
+ uint32_t gk_return_value;
+ crypto_session_id_t gk_session;
+ crypto_object_id_t gk_handle;
+ crypto_mechanism32_t gk_mechanism;
+ uint32_t gk_count;
+ caddr32_t gk_attributes;
+} crypto_object_generate_key32_t;
+
+#if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4
+#pragma pack()
+#endif
+
+typedef struct crypto_object_generate_key_pair32 {
+ uint32_t kp_return_value;
+ crypto_session_id_t kp_session;
+ crypto_object_id_t kp_public_handle;
+ crypto_object_id_t kp_private_handle;
+ uint32_t kp_public_count;
+ uint32_t kp_private_count;
+ caddr32_t kp_public_attributes;
+ caddr32_t kp_private_attributes;
+ crypto_mechanism32_t kp_mechanism;
+} crypto_object_generate_key_pair32_t;
+
+typedef struct crypto_object_wrap_key32 {
+ uint32_t wk_return_value;
+ crypto_session_id_t wk_session;
+ crypto_mechanism32_t wk_mechanism;
+ crypto_key32_t wk_wrapping_key;
+ crypto_object_id_t wk_object_handle;
+ size32_t wk_wrapped_key_len;
+ caddr32_t wk_wrapped_key;
+} crypto_object_wrap_key32_t;
+
+typedef struct crypto_object_unwrap_key32 {
+ uint32_t uk_return_value;
+ crypto_session_id_t uk_session;
+ crypto_mechanism32_t uk_mechanism;
+ crypto_key32_t uk_unwrapping_key;
+ crypto_object_id_t uk_object_handle;
+ size32_t uk_wrapped_key_len;
+ caddr32_t uk_wrapped_key;
+ uint32_t uk_count;
+ caddr32_t uk_attributes;
+} crypto_object_unwrap_key32_t;
+
+typedef struct crypto_derive_key32 {
+ uint32_t dk_return_value;
+ crypto_session_id_t dk_session;
+ crypto_mechanism32_t dk_mechanism;
+ crypto_key32_t dk_base_key;
+ crypto_object_id_t dk_object_handle;
+ uint32_t dk_count;
+ caddr32_t dk_attributes;
+} crypto_derive_key32_t;
+
+#endif /* _SYSCALL32 */
+#endif /* _KERNEL */
+
+#define CRYPTO_GENERATE_KEY CRYPTO(110)
+#define CRYPTO_GENERATE_KEY_PAIR CRYPTO(111)
+#define CRYPTO_WRAP_KEY CRYPTO(112)
+#define CRYPTO_UNWRAP_KEY CRYPTO(113)
+#define CRYPTO_DERIVE_KEY CRYPTO(114)
+
+/*
+ * Provider Management Ioctls
+ */
+
+typedef struct crypto_get_provider_list {
+ uint_t pl_return_value;
+ uint_t pl_count;
+ crypto_provider_entry_t pl_list[1];
+} crypto_get_provider_list_t;
+
+typedef struct crypto_provider_data {
+ uchar_t pd_prov_desc[CRYPTO_PROVIDER_DESCR_MAX_LEN];
+ uchar_t pd_label[CRYPTO_EXT_SIZE_LABEL];
+ uchar_t pd_manufacturerID[CRYPTO_EXT_SIZE_MANUF];
+ uchar_t pd_model[CRYPTO_EXT_SIZE_MODEL];
+ uchar_t pd_serial_number[CRYPTO_EXT_SIZE_SERIAL];
+ ulong_t pd_flags;
+ ulong_t pd_max_session_count;
+ ulong_t pd_session_count;
+ ulong_t pd_max_rw_session_count;
+ ulong_t pd_rw_session_count;
+ ulong_t pd_max_pin_len;
+ ulong_t pd_min_pin_len;
+ ulong_t pd_total_public_memory;
+ ulong_t pd_free_public_memory;
+ ulong_t pd_total_private_memory;
+ ulong_t pd_free_private_memory;
+ crypto_version_t pd_hardware_version;
+ crypto_version_t pd_firmware_version;
+ uchar_t pd_time[CRYPTO_EXT_SIZE_TIME];
+} crypto_provider_data_t;
+
+typedef struct crypto_get_provider_info {
+ uint_t gi_return_value;
+ crypto_provider_id_t gi_provider_id;
+ crypto_provider_data_t gi_provider_data;
+} crypto_get_provider_info_t;
+
+typedef struct crypto_get_provider_mechanisms {
+ uint_t pm_return_value;
+ crypto_provider_id_t pm_provider_id;
+ uint_t pm_count;
+ crypto_mech_name_t pm_list[1];
+} crypto_get_provider_mechanisms_t;
+
+typedef struct crypto_get_provider_mechanism_info {
+ uint_t mi_return_value;
+ crypto_provider_id_t mi_provider_id;
+ crypto_mech_name_t mi_mechanism_name;
+ uint32_t mi_min_key_size;
+ uint32_t mi_max_key_size;
+ uint32_t mi_flags;
+} crypto_get_provider_mechanism_info_t;
+
+typedef struct crypto_init_token {
+ uint_t it_return_value;
+ crypto_provider_id_t it_provider_id;
+ caddr_t it_pin;
+ size_t it_pin_len;
+ caddr_t it_label;
+} crypto_init_token_t;
+
+typedef struct crypto_init_pin {
+ uint_t ip_return_value;
+ crypto_session_id_t ip_session;
+ caddr_t ip_pin;
+ size_t ip_pin_len;
+} crypto_init_pin_t;
+
+typedef struct crypto_set_pin {
+ uint_t sp_return_value;
+ crypto_session_id_t sp_session;
+ caddr_t sp_old_pin;
+ size_t sp_old_len;
+ caddr_t sp_new_pin;
+ size_t sp_new_len;
+} crypto_set_pin_t;
+
+#ifdef _KERNEL
+#ifdef _SYSCALL32
+
+typedef struct crypto_get_provider_list32 {
+ uint32_t pl_return_value;
+ uint32_t pl_count;
+ crypto_provider_entry_t pl_list[1];
+} crypto_get_provider_list32_t;
+
+typedef struct crypto_version32 {
+ uchar_t cv_major;
+ uchar_t cv_minor;
+} crypto_version32_t;
+
+typedef struct crypto_provider_data32 {
+ uchar_t pd_prov_desc[CRYPTO_PROVIDER_DESCR_MAX_LEN];
+ uchar_t pd_label[CRYPTO_EXT_SIZE_LABEL];
+ uchar_t pd_manufacturerID[CRYPTO_EXT_SIZE_MANUF];
+ uchar_t pd_model[CRYPTO_EXT_SIZE_MODEL];
+ uchar_t pd_serial_number[CRYPTO_EXT_SIZE_SERIAL];
+ uint32_t pd_flags;
+ uint32_t pd_max_session_count;
+ uint32_t pd_session_count;
+ uint32_t pd_max_rw_session_count;
+ uint32_t pd_rw_session_count;
+ uint32_t pd_max_pin_len;
+ uint32_t pd_min_pin_len;
+ uint32_t pd_total_public_memory;
+ uint32_t pd_free_public_memory;
+ uint32_t pd_total_private_memory;
+ uint32_t pd_free_private_memory;
+ crypto_version32_t pd_hardware_version;
+ crypto_version32_t pd_firmware_version;
+ uchar_t pd_time[CRYPTO_EXT_SIZE_TIME];
+} crypto_provider_data32_t;
+
+typedef struct crypto_get_provider_info32 {
+ uint32_t gi_return_value;
+ crypto_provider_id_t gi_provider_id;
+ crypto_provider_data32_t gi_provider_data;
+} crypto_get_provider_info32_t;
+
+typedef struct crypto_get_provider_mechanisms32 {
+ uint32_t pm_return_value;
+ crypto_provider_id_t pm_provider_id;
+ uint32_t pm_count;
+ crypto_mech_name_t pm_list[1];
+} crypto_get_provider_mechanisms32_t;
+
+typedef struct crypto_init_token32 {
+ uint32_t it_return_value;
+ crypto_provider_id_t it_provider_id;
+ caddr32_t it_pin;
+ size32_t it_pin_len;
+ caddr32_t it_label;
+} crypto_init_token32_t;
+
+typedef struct crypto_init_pin32 {
+ uint32_t ip_return_value;
+ crypto_session_id_t ip_session;
+ caddr32_t ip_pin;
+ size32_t ip_pin_len;
+} crypto_init_pin32_t;
+
+typedef struct crypto_set_pin32 {
+ uint32_t sp_return_value;
+ crypto_session_id_t sp_session;
+ caddr32_t sp_old_pin;
+ size32_t sp_old_len;
+ caddr32_t sp_new_pin;
+ size32_t sp_new_len;
+} crypto_set_pin32_t;
+
+#endif /* _SYSCALL32 */
+#endif /* _KERNEL */
+
+#define CRYPTO_GET_PROVIDER_LIST CRYPTO(120)
+#define CRYPTO_GET_PROVIDER_INFO CRYPTO(121)
+#define CRYPTO_GET_PROVIDER_MECHANISMS CRYPTO(122)
+#define CRYPTO_GET_PROVIDER_MECHANISM_INFO CRYPTO(123)
+#define CRYPTO_INIT_TOKEN CRYPTO(124)
+#define CRYPTO_INIT_PIN CRYPTO(125)
+#define CRYPTO_SET_PIN CRYPTO(126)
+
+/*
+ * No (Key) Store Key Generation Ioctls
+ */
+typedef struct crypto_nostore_generate_key {
+ uint_t ngk_return_value;
+ crypto_session_id_t ngk_session;
+ crypto_mechanism_t ngk_mechanism;
+ uint_t ngk_in_count;
+ uint_t ngk_out_count;
+ caddr_t ngk_in_attributes;
+ caddr_t ngk_out_attributes;
+} crypto_nostore_generate_key_t;
+
+typedef struct crypto_nostore_generate_key_pair {
+ uint_t nkp_return_value;
+ crypto_session_id_t nkp_session;
+ uint_t nkp_in_public_count;
+ uint_t nkp_in_private_count;
+ uint_t nkp_out_public_count;
+ uint_t nkp_out_private_count;
+ caddr_t nkp_in_public_attributes;
+ caddr_t nkp_in_private_attributes;
+ caddr_t nkp_out_public_attributes;
+ caddr_t nkp_out_private_attributes;
+ crypto_mechanism_t nkp_mechanism;
+} crypto_nostore_generate_key_pair_t;
+
+typedef struct crypto_nostore_derive_key {
+ uint_t ndk_return_value;
+ crypto_session_id_t ndk_session;
+ crypto_mechanism_t ndk_mechanism;
+ crypto_key_t ndk_base_key;
+ uint_t ndk_in_count;
+ uint_t ndk_out_count;
+ caddr_t ndk_in_attributes;
+ caddr_t ndk_out_attributes;
+} crypto_nostore_derive_key_t;
+
+#ifdef _KERNEL
+#ifdef _SYSCALL32
+
+typedef struct crypto_nostore_generate_key32 {
+ uint32_t ngk_return_value;
+ crypto_session_id_t ngk_session;
+ crypto_mechanism32_t ngk_mechanism;
+ uint32_t ngk_in_count;
+ uint32_t ngk_out_count;
+ caddr32_t ngk_in_attributes;
+ caddr32_t ngk_out_attributes;
+} crypto_nostore_generate_key32_t;
+
+typedef struct crypto_nostore_generate_key_pair32 {
+ uint32_t nkp_return_value;
+ crypto_session_id_t nkp_session;
+ uint32_t nkp_in_public_count;
+ uint32_t nkp_in_private_count;
+ uint32_t nkp_out_public_count;
+ uint32_t nkp_out_private_count;
+ caddr32_t nkp_in_public_attributes;
+ caddr32_t nkp_in_private_attributes;
+ caddr32_t nkp_out_public_attributes;
+ caddr32_t nkp_out_private_attributes;
+ crypto_mechanism32_t nkp_mechanism;
+} crypto_nostore_generate_key_pair32_t;
+
+#if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4
+#pragma pack(4)
+#endif
+
+typedef struct crypto_nostore_derive_key32 {
+ uint32_t ndk_return_value;
+ crypto_session_id_t ndk_session;
+ crypto_mechanism32_t ndk_mechanism;
+ crypto_key32_t ndk_base_key;
+ uint32_t ndk_in_count;
+ uint32_t ndk_out_count;
+ caddr32_t ndk_in_attributes;
+ caddr32_t ndk_out_attributes;
+} crypto_nostore_derive_key32_t;
+
+#if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4
+#pragma pack()
+#endif
+
+#endif /* _SYSCALL32 */
+#endif /* _KERNEL */
+
+#define CRYPTO_NOSTORE_GENERATE_KEY CRYPTO(127)
+#define CRYPTO_NOSTORE_GENERATE_KEY_PAIR CRYPTO(128)
+#define CRYPTO_NOSTORE_DERIVE_KEY CRYPTO(129)
+
+/*
+ * Mechanism Ioctls
+ */
+
+typedef struct crypto_get_mechanism_list {
+ uint_t ml_return_value;
+ uint_t ml_count;
+ crypto_mech_name_t ml_list[1];
+} crypto_get_mechanism_list_t;
+
+typedef struct crypto_get_all_mechanism_info {
+ uint_t mi_return_value;
+ crypto_mech_name_t mi_mechanism_name;
+ uint_t mi_count;
+ crypto_mechanism_info_t mi_list[1];
+} crypto_get_all_mechanism_info_t;
+
+#ifdef _KERNEL
+#ifdef _SYSCALL32
+
+typedef struct crypto_get_mechanism_list32 {
+ uint32_t ml_return_value;
+ uint32_t ml_count;
+ crypto_mech_name_t ml_list[1];
+} crypto_get_mechanism_list32_t;
+
+typedef struct crypto_get_all_mechanism_info32 {
+ uint32_t mi_return_value;
+ crypto_mech_name_t mi_mechanism_name;
+ uint32_t mi_count;
+ crypto_mechanism_info32_t mi_list[1];
+} crypto_get_all_mechanism_info32_t;
+
+#endif /* _SYSCALL32 */
+#endif /* _KERNEL */
+
+#define CRYPTO_GET_MECHANISM_LIST CRYPTO(140)
+#define CRYPTO_GET_ALL_MECHANISM_INFO CRYPTO(141)
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_CRYPTO_IOCTL_H */
diff --git a/sys/contrib/openzfs/module/icp/include/sys/crypto/ioctladmin.h b/sys/contrib/openzfs/module/icp/include/sys/crypto/ioctladmin.h
new file mode 100644
index 000000000000..24babd7755cc
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/include/sys/crypto/ioctladmin.h
@@ -0,0 +1,136 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_CRYPTO_IOCTLADMIN_H
+#define _SYS_CRYPTO_IOCTLADMIN_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <sys/zfs_context.h>
+#include <sys/crypto/common.h>
+
+#define ADMIN_IOCTL_DEVICE "/dev/cryptoadm"
+
+#define CRYPTOADMIN(x) (('y' << 8) | (x))
+
+/*
+ * Administrative IOCTLs
+ */
+
+typedef struct crypto_get_dev_list {
+ uint_t dl_return_value;
+ uint_t dl_dev_count;
+ crypto_dev_list_entry_t dl_devs[1];
+} crypto_get_dev_list_t;
+
+typedef struct crypto_get_soft_list {
+ uint_t sl_return_value;
+ uint_t sl_soft_count;
+ size_t sl_soft_len;
+ caddr_t sl_soft_names;
+} crypto_get_soft_list_t;
+
+typedef struct crypto_get_dev_info {
+ uint_t di_return_value;
+ char di_dev_name[MAXNAMELEN];
+ uint_t di_dev_instance;
+ uint_t di_count;
+ crypto_mech_name_t di_list[1];
+} crypto_get_dev_info_t;
+
+typedef struct crypto_get_soft_info {
+ uint_t si_return_value;
+ char si_name[MAXNAMELEN];
+ uint_t si_count;
+ crypto_mech_name_t si_list[1];
+} crypto_get_soft_info_t;
+
+typedef struct crypto_load_dev_disabled {
+ uint_t dd_return_value;
+ char dd_dev_name[MAXNAMELEN];
+ uint_t dd_dev_instance;
+ uint_t dd_count;
+ crypto_mech_name_t dd_list[1];
+} crypto_load_dev_disabled_t;
+
+typedef struct crypto_load_soft_disabled {
+ uint_t sd_return_value;
+ char sd_name[MAXNAMELEN];
+ uint_t sd_count;
+ crypto_mech_name_t sd_list[1];
+} crypto_load_soft_disabled_t;
+
+typedef struct crypto_unload_soft_module {
+ uint_t sm_return_value;
+ char sm_name[MAXNAMELEN];
+} crypto_unload_soft_module_t;
+
+typedef struct crypto_load_soft_config {
+ uint_t sc_return_value;
+ char sc_name[MAXNAMELEN];
+ uint_t sc_count;
+ crypto_mech_name_t sc_list[1];
+} crypto_load_soft_config_t;
+
+typedef struct crypto_load_door {
+ uint_t ld_return_value;
+ uint_t ld_did;
+} crypto_load_door_t;
+
+#ifdef _KERNEL
+#ifdef _SYSCALL32
+
+typedef struct crypto_get_soft_list32 {
+ uint32_t sl_return_value;
+ uint32_t sl_soft_count;
+ size32_t sl_soft_len;
+ caddr32_t sl_soft_names;
+} crypto_get_soft_list32_t;
+
+#endif /* _SYSCALL32 */
+#endif /* _KERNEL */
+
+#define CRYPTO_GET_VERSION CRYPTOADMIN(1)
+#define CRYPTO_GET_DEV_LIST CRYPTOADMIN(2)
+#define CRYPTO_GET_SOFT_LIST CRYPTOADMIN(3)
+#define CRYPTO_GET_DEV_INFO CRYPTOADMIN(4)
+#define CRYPTO_GET_SOFT_INFO CRYPTOADMIN(5)
+#define CRYPTO_LOAD_DEV_DISABLED CRYPTOADMIN(8)
+#define CRYPTO_LOAD_SOFT_DISABLED CRYPTOADMIN(9)
+#define CRYPTO_UNLOAD_SOFT_MODULE CRYPTOADMIN(10)
+#define CRYPTO_LOAD_SOFT_CONFIG CRYPTOADMIN(11)
+#define CRYPTO_POOL_CREATE CRYPTOADMIN(12)
+#define CRYPTO_POOL_WAIT CRYPTOADMIN(13)
+#define CRYPTO_POOL_RUN CRYPTOADMIN(14)
+#define CRYPTO_LOAD_DOOR CRYPTOADMIN(15)
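A hedged sketch of querying the software provider list through ADMIN_IOCTL_DEVICE with CRYPTO_GET_SOFT_LIST from above; the buffer size is illustrative, and the packing of sl_soft_names as consecutive NUL-terminated names is an assumption.

        #include <stdio.h>
        #include <string.h>
        #include <fcntl.h>
        #include <unistd.h>
        #include <sys/ioctl.h>
        #include <sys/crypto/ioctladmin.h>

        static void
        print_soft_providers(void)
        {
                crypto_get_soft_list_t sl = { 0 };
                char names[4096];               /* illustrative buffer size */
                char *p;
                uint_t i;
                int fd;

                fd = open(ADMIN_IOCTL_DEVICE, O_RDWR);
                if (fd < 0)
                        return;
                sl.sl_soft_names = names;
                sl.sl_soft_len = sizeof (names);
                if (ioctl(fd, CRYPTO_GET_SOFT_LIST, &sl) == 0 &&
                    sl.sl_return_value == CRYPTO_SUCCESS) {
                        for (p = names, i = 0; i < sl.sl_soft_count; i++) {
                                (void) printf("%s\n", p);
                                p += strlen(p) + 1;  /* NUL-separated (assumed) */
                        }
                }
                (void) close(fd);
        }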
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_CRYPTO_IOCTLADMIN_H */
diff --git a/sys/contrib/openzfs/module/icp/include/sys/crypto/ops_impl.h b/sys/contrib/openzfs/module/icp/include/sys/crypto/ops_impl.h
new file mode 100644
index 000000000000..230d74b063fc
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/include/sys/crypto/ops_impl.h
@@ -0,0 +1,630 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_CRYPTO_OPS_IMPL_H
+#define _SYS_CRYPTO_OPS_IMPL_H
+
+/*
+ * Scheduler internal structures.
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <sys/zfs_context.h>
+#include <sys/crypto/api.h>
+#include <sys/crypto/spi.h>
+#include <sys/crypto/impl.h>
+#include <sys/crypto/common.h>
+
+/*
+ * The parameters needed for each function group are batched
+ * in one structure. This is much simpler than having a
+ * separate structure for each function.
+ *
+ * In some cases, a field is generically named to keep the
+ * structure small. The comments indicate these cases.
+ */
+typedef struct kcf_digest_ops_params {
+ crypto_session_id_t do_sid;
+ crypto_mech_type_t do_framework_mechtype;
+ crypto_mechanism_t do_mech;
+ crypto_data_t *do_data;
+ crypto_data_t *do_digest;
+ crypto_key_t *do_digest_key; /* Argument for digest_key() */
+} kcf_digest_ops_params_t;
+
+typedef struct kcf_mac_ops_params {
+ crypto_session_id_t mo_sid;
+ crypto_mech_type_t mo_framework_mechtype;
+ crypto_mechanism_t mo_mech;
+ crypto_key_t *mo_key;
+ crypto_data_t *mo_data;
+ crypto_data_t *mo_mac;
+ crypto_spi_ctx_template_t mo_templ;
+} kcf_mac_ops_params_t;
+
+typedef struct kcf_encrypt_ops_params {
+ crypto_session_id_t eo_sid;
+ crypto_mech_type_t eo_framework_mechtype;
+ crypto_mechanism_t eo_mech;
+ crypto_key_t *eo_key;
+ crypto_data_t *eo_plaintext;
+ crypto_data_t *eo_ciphertext;
+ crypto_spi_ctx_template_t eo_templ;
+} kcf_encrypt_ops_params_t;
+
+typedef struct kcf_decrypt_ops_params {
+ crypto_session_id_t dop_sid;
+ crypto_mech_type_t dop_framework_mechtype;
+ crypto_mechanism_t dop_mech;
+ crypto_key_t *dop_key;
+ crypto_data_t *dop_ciphertext;
+ crypto_data_t *dop_plaintext;
+ crypto_spi_ctx_template_t dop_templ;
+} kcf_decrypt_ops_params_t;
+
+typedef struct kcf_sign_ops_params {
+ crypto_session_id_t so_sid;
+ crypto_mech_type_t so_framework_mechtype;
+ crypto_mechanism_t so_mech;
+ crypto_key_t *so_key;
+ crypto_data_t *so_data;
+ crypto_data_t *so_signature;
+ crypto_spi_ctx_template_t so_templ;
+} kcf_sign_ops_params_t;
+
+typedef struct kcf_verify_ops_params {
+ crypto_session_id_t vo_sid;
+ crypto_mech_type_t vo_framework_mechtype;
+ crypto_mechanism_t vo_mech;
+ crypto_key_t *vo_key;
+ crypto_data_t *vo_data;
+ crypto_data_t *vo_signature;
+ crypto_spi_ctx_template_t vo_templ;
+} kcf_verify_ops_params_t;
+
+typedef struct kcf_encrypt_mac_ops_params {
+ crypto_session_id_t em_sid;
+ crypto_mech_type_t em_framework_encr_mechtype;
+ crypto_mechanism_t em_encr_mech;
+ crypto_key_t *em_encr_key;
+ crypto_mech_type_t em_framework_mac_mechtype;
+ crypto_mechanism_t em_mac_mech;
+ crypto_key_t *em_mac_key;
+ crypto_data_t *em_plaintext;
+ crypto_dual_data_t *em_ciphertext;
+ crypto_data_t *em_mac;
+ crypto_spi_ctx_template_t em_encr_templ;
+ crypto_spi_ctx_template_t em_mac_templ;
+} kcf_encrypt_mac_ops_params_t;
+
+typedef struct kcf_mac_decrypt_ops_params {
+ crypto_session_id_t md_sid;
+ crypto_mech_type_t md_framework_mac_mechtype;
+ crypto_mechanism_t md_mac_mech;
+ crypto_key_t *md_mac_key;
+ crypto_mech_type_t md_framework_decr_mechtype;
+ crypto_mechanism_t md_decr_mech;
+ crypto_key_t *md_decr_key;
+ crypto_dual_data_t *md_ciphertext;
+ crypto_data_t *md_mac;
+ crypto_data_t *md_plaintext;
+ crypto_spi_ctx_template_t md_mac_templ;
+ crypto_spi_ctx_template_t md_decr_templ;
+} kcf_mac_decrypt_ops_params_t;
+
+typedef struct kcf_random_number_ops_params {
+ crypto_session_id_t rn_sid;
+ uchar_t *rn_buf;
+ size_t rn_buflen;
+ uint_t rn_entropy_est;
+ uint32_t rn_flags;
+} kcf_random_number_ops_params_t;
+
+/*
+ * so_pd is useful when the provider descriptor (pd) supplying the
+ * provider handle is different from the pd supplying the ops vector.
+ * This is the case for session open/close where so_pd can be the pd
+ * of a logical provider. The pd supplying the ops vector is passed
+ * as an argument to kcf_submit_request().
+ */
+typedef struct kcf_session_ops_params {
+ crypto_session_id_t *so_sid_ptr;
+ crypto_session_id_t so_sid;
+ crypto_user_type_t so_user_type;
+ char *so_pin;
+ size_t so_pin_len;
+ kcf_provider_desc_t *so_pd;
+} kcf_session_ops_params_t;
+
+typedef struct kcf_object_ops_params {
+ crypto_session_id_t oo_sid;
+ crypto_object_id_t oo_object_id;
+ crypto_object_attribute_t *oo_template;
+ uint_t oo_attribute_count;
+ crypto_object_id_t *oo_object_id_ptr;
+ size_t *oo_object_size;
+ void **oo_find_init_pp_ptr;
+ void *oo_find_pp;
+ uint_t oo_max_object_count;
+ uint_t *oo_object_count_ptr;
+} kcf_object_ops_params_t;
+
+/*
+ * ko_key is used to encode the wrapping key in key_wrap() and the
+ * unwrapping key in key_unwrap(). ko_key_template and
+ * ko_key_attribute_count are used to encode the public template
+ * and the public template attribute count in key_generate_pair().
+ * kops->ko_key_object_id_ptr is used to encode the public key
+ * in key_generate_pair().
+ */
+typedef struct kcf_key_ops_params {
+ crypto_session_id_t ko_sid;
+ crypto_mech_type_t ko_framework_mechtype;
+ crypto_mechanism_t ko_mech;
+ crypto_object_attribute_t *ko_key_template;
+ uint_t ko_key_attribute_count;
+ crypto_object_id_t *ko_key_object_id_ptr;
+ crypto_object_attribute_t *ko_private_key_template;
+ uint_t ko_private_key_attribute_count;
+ crypto_object_id_t *ko_private_key_object_id_ptr;
+ crypto_key_t *ko_key;
+ uchar_t *ko_wrapped_key;
+ size_t *ko_wrapped_key_len_ptr;
+ crypto_object_attribute_t *ko_out_template1;
+ crypto_object_attribute_t *ko_out_template2;
+ uint_t ko_out_attribute_count1;
+ uint_t ko_out_attribute_count2;
+} kcf_key_ops_params_t;
+
+/*
+ * po_pin and po_pin_len are used to encode new_pin and new_pin_len
+ * when wrapping set_pin() function parameters.
+ *
+ * po_pd is useful when the provider descriptor (pd) supplying the
+ * provider handle is different from the pd supplying the ops vector.
+ * This is true for the ext_info provider entry point where po_pd
+ * can be the pd of a logical provider. The pd supplying the ops vector
+ * is passed as an argument to kcf_submit_request().
+ */
+typedef struct kcf_provmgmt_ops_params {
+ crypto_session_id_t po_sid;
+ char *po_pin;
+ size_t po_pin_len;
+ char *po_old_pin;
+ size_t po_old_pin_len;
+ char *po_label;
+ crypto_provider_ext_info_t *po_ext_info;
+ kcf_provider_desc_t *po_pd;
+} kcf_provmgmt_ops_params_t;
+
+/*
+ * The operation type within a function group.
+ */
+typedef enum kcf_op_type {
+ /* common ops for all mechanisms */
+ KCF_OP_INIT = 1,
+	KCF_OP_SINGLE,	/* in the PKCS#11 sense; INIT is already done */
+ KCF_OP_UPDATE,
+ KCF_OP_FINAL,
+ KCF_OP_ATOMIC,
+
+ /* digest_key op */
+ KCF_OP_DIGEST_KEY,
+
+ /* mac specific op */
+ KCF_OP_MAC_VERIFY_ATOMIC,
+
+ /* mac/cipher specific op */
+ KCF_OP_MAC_VERIFY_DECRYPT_ATOMIC,
+
+ /* sign_recover ops */
+ KCF_OP_SIGN_RECOVER_INIT,
+ KCF_OP_SIGN_RECOVER,
+ KCF_OP_SIGN_RECOVER_ATOMIC,
+
+ /* verify_recover ops */
+ KCF_OP_VERIFY_RECOVER_INIT,
+ KCF_OP_VERIFY_RECOVER,
+ KCF_OP_VERIFY_RECOVER_ATOMIC,
+
+ /* random number ops */
+ KCF_OP_RANDOM_SEED,
+ KCF_OP_RANDOM_GENERATE,
+
+ /* session management ops */
+ KCF_OP_SESSION_OPEN,
+ KCF_OP_SESSION_CLOSE,
+ KCF_OP_SESSION_LOGIN,
+ KCF_OP_SESSION_LOGOUT,
+
+ /* object management ops */
+ KCF_OP_OBJECT_CREATE,
+ KCF_OP_OBJECT_COPY,
+ KCF_OP_OBJECT_DESTROY,
+ KCF_OP_OBJECT_GET_SIZE,
+ KCF_OP_OBJECT_GET_ATTRIBUTE_VALUE,
+ KCF_OP_OBJECT_SET_ATTRIBUTE_VALUE,
+ KCF_OP_OBJECT_FIND_INIT,
+ KCF_OP_OBJECT_FIND,
+ KCF_OP_OBJECT_FIND_FINAL,
+
+ /* key management ops */
+ KCF_OP_KEY_GENERATE,
+ KCF_OP_KEY_GENERATE_PAIR,
+ KCF_OP_KEY_WRAP,
+ KCF_OP_KEY_UNWRAP,
+ KCF_OP_KEY_DERIVE,
+ KCF_OP_KEY_CHECK,
+
+ /* provider management ops */
+ KCF_OP_MGMT_EXTINFO,
+ KCF_OP_MGMT_INITTOKEN,
+ KCF_OP_MGMT_INITPIN,
+ KCF_OP_MGMT_SETPIN
+} kcf_op_type_t;
+
+/*
+ * The operation groups that need wrapping of parameters. This is somewhat
+ * similar to the function group type in spi.h except that this also includes
+ * all the functions that don't have a mechanism.
+ *
+ * The wrapper macros should never take these enum values as an argument.
+ * Rather, they are assigned in the macro itself since they are known
+ * from the macro name.
+ */
+typedef enum kcf_op_group {
+ KCF_OG_DIGEST = 1,
+ KCF_OG_MAC,
+ KCF_OG_ENCRYPT,
+ KCF_OG_DECRYPT,
+ KCF_OG_SIGN,
+ KCF_OG_VERIFY,
+ KCF_OG_ENCRYPT_MAC,
+ KCF_OG_MAC_DECRYPT,
+ KCF_OG_RANDOM,
+ KCF_OG_SESSION,
+ KCF_OG_OBJECT,
+ KCF_OG_KEY,
+ KCF_OG_PROVMGMT,
+ KCF_OG_NOSTORE_KEY
+} kcf_op_group_t;
+
+/*
+ * The kcf_op_type_t enum values used here should be only for those
+ * operations for which there is a k-api routine in sys/crypto/api.h.
+ */
+#define IS_INIT_OP(ftype) ((ftype) == KCF_OP_INIT)
+#define IS_SINGLE_OP(ftype) ((ftype) == KCF_OP_SINGLE)
+#define IS_UPDATE_OP(ftype) ((ftype) == KCF_OP_UPDATE)
+#define IS_FINAL_OP(ftype) ((ftype) == KCF_OP_FINAL)
+#define IS_ATOMIC_OP(ftype) ( \
+ (ftype) == KCF_OP_ATOMIC || (ftype) == KCF_OP_MAC_VERIFY_ATOMIC || \
+ (ftype) == KCF_OP_MAC_VERIFY_DECRYPT_ATOMIC || \
+ (ftype) == KCF_OP_SIGN_RECOVER_ATOMIC || \
+ (ftype) == KCF_OP_VERIFY_RECOVER_ATOMIC)
+
+/*
+ * Keep the parameters associated with a request around.
+ * We need to pass them to the SPI.
+ */
+typedef struct kcf_req_params {
+ kcf_op_group_t rp_opgrp;
+ kcf_op_type_t rp_optype;
+
+ union {
+ kcf_digest_ops_params_t digest_params;
+ kcf_mac_ops_params_t mac_params;
+ kcf_encrypt_ops_params_t encrypt_params;
+ kcf_decrypt_ops_params_t decrypt_params;
+ kcf_sign_ops_params_t sign_params;
+ kcf_verify_ops_params_t verify_params;
+ kcf_encrypt_mac_ops_params_t encrypt_mac_params;
+ kcf_mac_decrypt_ops_params_t mac_decrypt_params;
+ kcf_random_number_ops_params_t random_number_params;
+ kcf_session_ops_params_t session_params;
+ kcf_object_ops_params_t object_params;
+ kcf_key_ops_params_t key_params;
+ kcf_provmgmt_ops_params_t provmgmt_params;
+ } rp_u;
+} kcf_req_params_t;
+
+
+/*
+ * The ioctl/k-api code should bundle the parameters into a kcf_req_params_t
+ * structure before calling a scheduler routine. The following macros are
+ * available for that purpose.
+ *
+ * For the most part, the macro arguments closely correspond to the
+ * function parameters. In some cases, we use generic names. The comments
+ * for the structure should indicate these cases.
+ */
+#define KCF_WRAP_DIGEST_OPS_PARAMS(req, ftype, _sid, _mech, _key, \
+ _data, _digest) { \
+ kcf_digest_ops_params_t *dops = &(req)->rp_u.digest_params; \
+ crypto_mechanism_t *mechp = _mech; \
+ \
+ (req)->rp_opgrp = KCF_OG_DIGEST; \
+ (req)->rp_optype = ftype; \
+ dops->do_sid = _sid; \
+ if (mechp != NULL) { \
+ dops->do_mech = *mechp; \
+ dops->do_framework_mechtype = mechp->cm_type; \
+ } \
+ dops->do_digest_key = _key; \
+ dops->do_data = _data; \
+ dops->do_digest = _digest; \
+}
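+
+/*
+ * Illustrative sketch (not part of the original interface): a k-api
+ * routine such as a one-shot digest would typically bundle its arguments
+ * with the macro above and hand them to the scheduler; the provider
+ * descriptor (pd), session id (sid) and call request (crq) are assumed
+ * to come from the usual lookup helpers.
+ *
+ *	kcf_req_params_t params;
+ *
+ *	KCF_WRAP_DIGEST_OPS_PARAMS(&params, KCF_OP_ATOMIC, sid, &mech,
+ *	    NULL, data, digest);
+ *	error = kcf_submit_request(pd, NULL, crq, &params, B_FALSE);
+ */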
+
+#define KCF_WRAP_MAC_OPS_PARAMS(req, ftype, _sid, _mech, _key, \
+ _data, _mac, _templ) { \
+ kcf_mac_ops_params_t *mops = &(req)->rp_u.mac_params; \
+ crypto_mechanism_t *mechp = _mech; \
+ \
+ (req)->rp_opgrp = KCF_OG_MAC; \
+ (req)->rp_optype = ftype; \
+ mops->mo_sid = _sid; \
+ if (mechp != NULL) { \
+ mops->mo_mech = *mechp; \
+ mops->mo_framework_mechtype = mechp->cm_type; \
+ } \
+ mops->mo_key = _key; \
+ mops->mo_data = _data; \
+ mops->mo_mac = _mac; \
+ mops->mo_templ = _templ; \
+}
+
+#define KCF_WRAP_ENCRYPT_OPS_PARAMS(req, ftype, _sid, _mech, _key, \
+ _plaintext, _ciphertext, _templ) { \
+ kcf_encrypt_ops_params_t *cops = &(req)->rp_u.encrypt_params; \
+ crypto_mechanism_t *mechp = _mech; \
+ \
+ (req)->rp_opgrp = KCF_OG_ENCRYPT; \
+ (req)->rp_optype = ftype; \
+ cops->eo_sid = _sid; \
+ if (mechp != NULL) { \
+ cops->eo_mech = *mechp; \
+ cops->eo_framework_mechtype = mechp->cm_type; \
+ } \
+ cops->eo_key = _key; \
+ cops->eo_plaintext = _plaintext; \
+ cops->eo_ciphertext = _ciphertext; \
+ cops->eo_templ = _templ; \
+}
+
+#define KCF_WRAP_DECRYPT_OPS_PARAMS(req, ftype, _sid, _mech, _key, \
+ _ciphertext, _plaintext, _templ) { \
+ kcf_decrypt_ops_params_t *cops = &(req)->rp_u.decrypt_params; \
+ crypto_mechanism_t *mechp = _mech; \
+ \
+ (req)->rp_opgrp = KCF_OG_DECRYPT; \
+ (req)->rp_optype = ftype; \
+ cops->dop_sid = _sid; \
+ if (mechp != NULL) { \
+ cops->dop_mech = *mechp; \
+ cops->dop_framework_mechtype = mechp->cm_type; \
+ } \
+ cops->dop_key = _key; \
+ cops->dop_ciphertext = _ciphertext; \
+ cops->dop_plaintext = _plaintext; \
+ cops->dop_templ = _templ; \
+}
+
+#define KCF_WRAP_SIGN_OPS_PARAMS(req, ftype, _sid, _mech, _key, \
+ _data, _signature, _templ) { \
+ kcf_sign_ops_params_t *sops = &(req)->rp_u.sign_params; \
+ crypto_mechanism_t *mechp = _mech; \
+ \
+ (req)->rp_opgrp = KCF_OG_SIGN; \
+ (req)->rp_optype = ftype; \
+ sops->so_sid = _sid; \
+ if (mechp != NULL) { \
+ sops->so_mech = *mechp; \
+ sops->so_framework_mechtype = mechp->cm_type; \
+ } \
+ sops->so_key = _key; \
+ sops->so_data = _data; \
+ sops->so_signature = _signature; \
+ sops->so_templ = _templ; \
+}
+
+#define KCF_WRAP_VERIFY_OPS_PARAMS(req, ftype, _sid, _mech, _key, \
+ _data, _signature, _templ) { \
+ kcf_verify_ops_params_t *vops = &(req)->rp_u.verify_params; \
+ crypto_mechanism_t *mechp = _mech; \
+ \
+ (req)->rp_opgrp = KCF_OG_VERIFY; \
+ (req)->rp_optype = ftype; \
+ vops->vo_sid = _sid; \
+ if (mechp != NULL) { \
+ vops->vo_mech = *mechp; \
+ vops->vo_framework_mechtype = mechp->cm_type; \
+ } \
+ vops->vo_key = _key; \
+ vops->vo_data = _data; \
+ vops->vo_signature = _signature; \
+ vops->vo_templ = _templ; \
+}
+
+#define KCF_WRAP_ENCRYPT_MAC_OPS_PARAMS(req, ftype, _sid, _encr_key, \
+ _mac_key, _plaintext, _ciphertext, _mac, _encr_templ, _mac_templ) { \
+ kcf_encrypt_mac_ops_params_t *cmops = &(req)->rp_u.encrypt_mac_params; \
+ \
+ (req)->rp_opgrp = KCF_OG_ENCRYPT_MAC; \
+ (req)->rp_optype = ftype; \
+ cmops->em_sid = _sid; \
+ cmops->em_encr_key = _encr_key; \
+ cmops->em_mac_key = _mac_key; \
+ cmops->em_plaintext = _plaintext; \
+ cmops->em_ciphertext = _ciphertext; \
+ cmops->em_mac = _mac; \
+ cmops->em_encr_templ = _encr_templ; \
+ cmops->em_mac_templ = _mac_templ; \
+}
+
+#define KCF_WRAP_MAC_DECRYPT_OPS_PARAMS(req, ftype, _sid, _mac_key, \
+ _decr_key, _ciphertext, _mac, _plaintext, _mac_templ, _decr_templ) { \
+ kcf_mac_decrypt_ops_params_t *cmops = &(req)->rp_u.mac_decrypt_params; \
+ \
+ (req)->rp_opgrp = KCF_OG_MAC_DECRYPT; \
+ (req)->rp_optype = ftype; \
+ cmops->md_sid = _sid; \
+ cmops->md_mac_key = _mac_key; \
+ cmops->md_decr_key = _decr_key; \
+ cmops->md_ciphertext = _ciphertext; \
+ cmops->md_mac = _mac; \
+ cmops->md_plaintext = _plaintext; \
+ cmops->md_mac_templ = _mac_templ; \
+ cmops->md_decr_templ = _decr_templ; \
+}
+
+#define KCF_WRAP_RANDOM_OPS_PARAMS(req, ftype, _sid, _buf, _buflen, \
+ _est, _flags) { \
+ kcf_random_number_ops_params_t *rops = \
+ &(req)->rp_u.random_number_params; \
+ \
+ (req)->rp_opgrp = KCF_OG_RANDOM; \
+ (req)->rp_optype = ftype; \
+ rops->rn_sid = _sid; \
+ rops->rn_buf = _buf; \
+ rops->rn_buflen = _buflen; \
+ rops->rn_entropy_est = _est; \
+ rops->rn_flags = _flags; \
+}
+
+#define KCF_WRAP_SESSION_OPS_PARAMS(req, ftype, _sid_ptr, _sid, \
+ _user_type, _pin, _pin_len, _pd) { \
+ kcf_session_ops_params_t *sops = &(req)->rp_u.session_params; \
+ \
+ (req)->rp_opgrp = KCF_OG_SESSION; \
+ (req)->rp_optype = ftype; \
+ sops->so_sid_ptr = _sid_ptr; \
+ sops->so_sid = _sid; \
+ sops->so_user_type = _user_type; \
+ sops->so_pin = _pin; \
+ sops->so_pin_len = _pin_len; \
+ sops->so_pd = _pd; \
+}
+
+#define KCF_WRAP_OBJECT_OPS_PARAMS(req, ftype, _sid, _object_id, \
+ _template, _attribute_count, _object_id_ptr, _object_size, \
+ _find_init_pp_ptr, _find_pp, _max_object_count, _object_count_ptr) { \
+ kcf_object_ops_params_t *jops = &(req)->rp_u.object_params; \
+ \
+ (req)->rp_opgrp = KCF_OG_OBJECT; \
+ (req)->rp_optype = ftype; \
+ jops->oo_sid = _sid; \
+ jops->oo_object_id = _object_id; \
+ jops->oo_template = _template; \
+ jops->oo_attribute_count = _attribute_count; \
+ jops->oo_object_id_ptr = _object_id_ptr; \
+ jops->oo_object_size = _object_size; \
+ jops->oo_find_init_pp_ptr = _find_init_pp_ptr; \
+ jops->oo_find_pp = _find_pp; \
+ jops->oo_max_object_count = _max_object_count; \
+ jops->oo_object_count_ptr = _object_count_ptr; \
+}
+
+#define KCF_WRAP_KEY_OPS_PARAMS(req, ftype, _sid, _mech, _key_template, \
+ _key_attribute_count, _key_object_id_ptr, _private_key_template, \
+ _private_key_attribute_count, _private_key_object_id_ptr, \
+ _key, _wrapped_key, _wrapped_key_len_ptr) { \
+ kcf_key_ops_params_t *kops = &(req)->rp_u.key_params; \
+ crypto_mechanism_t *mechp = _mech; \
+ \
+ (req)->rp_opgrp = KCF_OG_KEY; \
+ (req)->rp_optype = ftype; \
+ kops->ko_sid = _sid; \
+ if (mechp != NULL) { \
+ kops->ko_mech = *mechp; \
+ kops->ko_framework_mechtype = mechp->cm_type; \
+ } \
+ kops->ko_key_template = _key_template; \
+ kops->ko_key_attribute_count = _key_attribute_count; \
+ kops->ko_key_object_id_ptr = _key_object_id_ptr; \
+ kops->ko_private_key_template = _private_key_template; \
+ kops->ko_private_key_attribute_count = _private_key_attribute_count; \
+ kops->ko_private_key_object_id_ptr = _private_key_object_id_ptr; \
+ kops->ko_key = _key; \
+ kops->ko_wrapped_key = _wrapped_key; \
+ kops->ko_wrapped_key_len_ptr = _wrapped_key_len_ptr; \
+}
+
+#define KCF_WRAP_PROVMGMT_OPS_PARAMS(req, ftype, _sid, _old_pin, \
+ _old_pin_len, _pin, _pin_len, _label, _ext_info, _pd) { \
+ kcf_provmgmt_ops_params_t *pops = &(req)->rp_u.provmgmt_params; \
+ \
+ (req)->rp_opgrp = KCF_OG_PROVMGMT; \
+ (req)->rp_optype = ftype; \
+ pops->po_sid = _sid; \
+ pops->po_pin = _pin; \
+ pops->po_pin_len = _pin_len; \
+ pops->po_old_pin = _old_pin; \
+ pops->po_old_pin_len = _old_pin_len; \
+ pops->po_label = _label; \
+ pops->po_ext_info = _ext_info; \
+ pops->po_pd = _pd; \
+}
+
+#define KCF_WRAP_NOSTORE_KEY_OPS_PARAMS(req, ftype, _sid, _mech, \
+ _key_template, _key_attribute_count, _private_key_template, \
+ _private_key_attribute_count, _key, _out_template1, \
+ _out_attribute_count1, _out_template2, _out_attribute_count2) { \
+ kcf_key_ops_params_t *kops = &(req)->rp_u.key_params; \
+ crypto_mechanism_t *mechp = _mech; \
+ \
+ (req)->rp_opgrp = KCF_OG_NOSTORE_KEY; \
+ (req)->rp_optype = ftype; \
+ kops->ko_sid = _sid; \
+ if (mechp != NULL) { \
+ kops->ko_mech = *mechp; \
+ kops->ko_framework_mechtype = mechp->cm_type; \
+ } \
+ kops->ko_key_template = _key_template; \
+ kops->ko_key_attribute_count = _key_attribute_count; \
+ kops->ko_key_object_id_ptr = NULL; \
+ kops->ko_private_key_template = _private_key_template; \
+ kops->ko_private_key_attribute_count = _private_key_attribute_count; \
+ kops->ko_private_key_object_id_ptr = NULL; \
+ kops->ko_key = _key; \
+ kops->ko_wrapped_key = NULL; \
+ kops->ko_wrapped_key_len_ptr = 0; \
+ kops->ko_out_template1 = _out_template1; \
+ kops->ko_out_template2 = _out_template2; \
+ kops->ko_out_attribute_count1 = _out_attribute_count1; \
+ kops->ko_out_attribute_count2 = _out_attribute_count2; \
+}
+
+#define KCF_SET_PROVIDER_MECHNUM(fmtype, pd, mechp) \
+ (mechp)->cm_type = \
+ KCF_TO_PROV_MECHNUM(pd, fmtype);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_CRYPTO_OPS_IMPL_H */
diff --git a/sys/contrib/openzfs/module/icp/include/sys/crypto/sched_impl.h b/sys/contrib/openzfs/module/icp/include/sys/crypto/sched_impl.h
new file mode 100644
index 000000000000..85ea0ba1d092
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/include/sys/crypto/sched_impl.h
@@ -0,0 +1,531 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_CRYPTO_SCHED_IMPL_H
+#define _SYS_CRYPTO_SCHED_IMPL_H
+
+/*
+ * Scheduler internal structures.
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <sys/zfs_context.h>
+#include <sys/crypto/api.h>
+#include <sys/crypto/spi.h>
+#include <sys/crypto/impl.h>
+#include <sys/crypto/common.h>
+#include <sys/crypto/ops_impl.h>
+
+typedef void (kcf_func_t)(void *, int);
+
+typedef enum kcf_req_status {
+ REQ_ALLOCATED = 1,
+ REQ_WAITING, /* At the framework level */
+ REQ_INPROGRESS, /* At the provider level */
+ REQ_DONE,
+ REQ_CANCELED
+} kcf_req_status_t;
+
+typedef enum kcf_call_type {
+ CRYPTO_SYNCH = 1,
+ CRYPTO_ASYNCH
+} kcf_call_type_t;
+
+#define CHECK_RESTRICT(crq) (crq != NULL && \
+ ((crq)->cr_flag & CRYPTO_RESTRICTED))
+
+#define CHECK_RESTRICT_FALSE B_FALSE
+
+#define	CHECK_FASTPATH(crq, pd)	(((crq) == NULL || \
+	!((crq)->cr_flag & CRYPTO_ALWAYS_QUEUE)) && \
+	(pd)->pd_prov_type == CRYPTO_SW_PROVIDER)
+
+#define KCF_KMFLAG(crq) (((crq) == NULL) ? KM_SLEEP : KM_NOSLEEP)
+
+/*
+ * The framework keeps an internal handle to use in the adaptive
+ * asynchronous case. This is the case when a client has the
+ * CRYPTO_ALWAYS_QUEUE bit clear and a software provider is used for
+ * the request. The request is completed in the context of the calling
+ * thread and kernel memory must be allocated with KM_NOSLEEP.
+ *
+ * The framework passes a pointer to the handle in crypto_req_handle_t
+ * argument when it calls the SPI of the software provider. The macros
+ * KCF_RHNDL() and KCF_SWFP_RHNDL() are used to do this.
+ *
+ * When a provider asks the framework for the kmflag value via
+ * crypto_kmflag(9S), we use the REQHNDL2_KMFLAG() macro.
+ */
+extern ulong_t kcf_swprov_hndl;
+#define KCF_RHNDL(kmflag) (((kmflag) == KM_SLEEP) ? NULL : &kcf_swprov_hndl)
+#define KCF_SWFP_RHNDL(crq) (((crq) == NULL) ? NULL : &kcf_swprov_hndl)
+#define REQHNDL2_KMFLAG(rhndl) \
+ ((rhndl == &kcf_swprov_hndl) ? KM_NOSLEEP : KM_SLEEP)
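+
+/*
+ * Illustrative sketch (assumption, not upstream code): a software
+ * provider entry point that allocates memory would recover the kmem
+ * flag from the request handle it was given; statesize is a
+ * hypothetical size.
+ *
+ *	int kmflag = crypto_kmflag(req);
+ *	void *state = kmem_alloc(statesize, kmflag);
+ *
+ *	if (state == NULL)
+ *		return (CRYPTO_HOST_MEMORY);
+ */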
+
+/* Internal call_req flags. They start after the public ones in api.h */
+
+#define CRYPTO_SETDUAL 0x00001000 /* Set the 'cont' boolean before */
+ /* submitting the request */
+#define KCF_ISDUALREQ(crq) \
+ (((crq) == NULL) ? B_FALSE : (crq->cr_flag & CRYPTO_SETDUAL))
+
+typedef struct kcf_prov_tried {
+ kcf_provider_desc_t *pt_pd;
+ struct kcf_prov_tried *pt_next;
+} kcf_prov_tried_t;
+
+#define IS_FG_SUPPORTED(mdesc, fg) \
+ (((mdesc)->pm_mech_info.cm_func_group_mask & (fg)) != 0)
+
+#define IS_PROVIDER_TRIED(pd, tlist) \
+ (tlist != NULL && is_in_triedlist(pd, tlist))
+
+#define IS_RECOVERABLE(error) \
+ (error == CRYPTO_BUFFER_TOO_BIG || \
+ error == CRYPTO_BUSY || \
+ error == CRYPTO_DEVICE_ERROR || \
+ error == CRYPTO_DEVICE_MEMORY || \
+ error == CRYPTO_KEY_SIZE_RANGE || \
+ error == CRYPTO_NO_PERMISSION)
+
+#define KCF_ATOMIC_INCR(x) atomic_add_32(&(x), 1)
+#define KCF_ATOMIC_DECR(x) atomic_add_32(&(x), -1)
+
+/*
+ * Node structure for synchronous requests.
+ */
+typedef struct kcf_sreq_node {
+ /* Should always be the first field in this structure */
+ kcf_call_type_t sn_type;
+ /*
+	 * sn_cv and sn_lock are used to wait for the
+ * operation to complete. sn_lock also protects
+ * the sn_state field.
+ */
+ kcondvar_t sn_cv;
+ kmutex_t sn_lock;
+ kcf_req_status_t sn_state;
+
+ /*
+ * Return value from the operation. This will be
+ * one of the CRYPTO_* errors defined in common.h.
+ */
+ int sn_rv;
+
+ /*
+ * parameters to call the SPI with. This can be
+ * a pointer as we know the caller context/stack stays.
+ */
+ struct kcf_req_params *sn_params;
+
+ /* Internal context for this request */
+ struct kcf_context *sn_context;
+
+ /* Provider handling this request */
+ kcf_provider_desc_t *sn_provider;
+} kcf_sreq_node_t;
+
+/*
+ * Node structure for asynchronous requests. A node can be on
+ * a chain of requests hanging off the internal context
+ * structure and can be in the global software provider queue.
+ */
+typedef struct kcf_areq_node {
+ /* Should always be the first field in this structure */
+ kcf_call_type_t an_type;
+
+ /* an_lock protects the field an_state */
+ kmutex_t an_lock;
+ kcf_req_status_t an_state;
+ crypto_call_req_t an_reqarg;
+
+ /*
+ * parameters to call the SPI with. We need to
+ * save the params since the caller stack can go away.
+ */
+ struct kcf_req_params an_params;
+
+ /*
+ * The next two fields should be NULL for operations that
+ * don't need a context.
+ */
+ /* Internal context for this request */
+ struct kcf_context *an_context;
+
+ /* next in chain of requests for context */
+ struct kcf_areq_node *an_ctxchain_next;
+
+ kcondvar_t an_turn_cv;
+ boolean_t an_is_my_turn;
+ boolean_t an_isdual; /* for internal reuse */
+
+ /*
+ * Next and previous nodes in the global software
+ * queue. These fields are NULL for a hardware
+ * provider since we use a taskq there.
+ */
+ struct kcf_areq_node *an_next;
+ struct kcf_areq_node *an_prev;
+
+ /* Provider handling this request */
+ kcf_provider_desc_t *an_provider;
+ kcf_prov_tried_t *an_tried_plist;
+
+ struct kcf_areq_node *an_idnext; /* Next in ID hash */
+ struct kcf_areq_node *an_idprev; /* Prev in ID hash */
+ kcondvar_t an_done; /* Signal request completion */
+ uint_t an_refcnt;
+} kcf_areq_node_t;
+
+#define KCF_AREQ_REFHOLD(areq) { \
+ atomic_add_32(&(areq)->an_refcnt, 1); \
+ ASSERT((areq)->an_refcnt != 0); \
+}
+
+#define KCF_AREQ_REFRELE(areq) { \
+ ASSERT((areq)->an_refcnt != 0); \
+ membar_exit(); \
+ if (atomic_add_32_nv(&(areq)->an_refcnt, -1) == 0) \
+ kcf_free_req(areq); \
+}
+
+#define GET_REQ_TYPE(arg) *((kcf_call_type_t *)(arg))
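+
+/*
+ * Illustrative note (assumption): because sn_type and an_type are the
+ * first members of their respective nodes, a completion path that only
+ * has an opaque request pointer can dispatch on the call type, e.g.:
+ *
+ *	if (GET_REQ_TYPE(arg) == CRYPTO_SYNCH)
+ *		kcf_sop_done((kcf_sreq_node_t *)arg, error);
+ *	else
+ *		kcf_aop_done((kcf_areq_node_t *)arg, error);
+ */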
+
+#define NOTIFY_CLIENT(areq, err) (*(areq)->an_reqarg.cr_callback_func)(\
+ (areq)->an_reqarg.cr_callback_arg, err);
+
+/* For internally generated call requests for dual operations */
+typedef struct kcf_call_req {
+ crypto_call_req_t kr_callreq; /* external client call req */
+ kcf_req_params_t kr_params; /* Params saved for next call */
+ kcf_areq_node_t *kr_areq; /* Use this areq */
+ off_t kr_saveoffset;
+ size_t kr_savelen;
+} kcf_dual_req_t;
+
+/*
+ * The following are somewhat similar to the macros in callo.h, which
+ * implement callout tables.
+ *
+ * The lower four bits of the ID encode the table to index into. The
+ * REQID_COUNTER_HIGH bit is used to avoid any check for wrap-around when
+ * generating an ID. We assume that no request stays outstanding longer
+ * than it takes to submit 2^(8 * sizeof (long) - 5) requests after it.
+ * This ensures there won't be any ID collision.
+ */
+#define REQID_COUNTER_HIGH (1UL << (8 * sizeof (long) - 1))
+#define REQID_COUNTER_SHIFT 4
+#define REQID_COUNTER_LOW (1 << REQID_COUNTER_SHIFT)
+#define REQID_TABLES 16
+#define REQID_TABLE_MASK (REQID_TABLES - 1)
+
+#define REQID_BUCKETS 512
+#define REQID_BUCKET_MASK (REQID_BUCKETS - 1)
+#define REQID_HASH(id) (((id) >> REQID_COUNTER_SHIFT) & REQID_BUCKET_MASK)
+
+#define GET_REQID(areq) (areq)->an_reqarg.cr_reqid
+#define SET_REQID(areq, val) GET_REQID(areq) = val
+
+/*
+ * Hash table for async requests.
+ */
+typedef struct kcf_reqid_table {
+ kmutex_t rt_lock;
+ crypto_req_id_t rt_curid;
+ kcf_areq_node_t *rt_idhash[REQID_BUCKETS];
+} kcf_reqid_table_t;
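+
+/*
+ * Illustrative sketch (assumption): given an ID generated with the
+ * scheme above, the owning table and hash bucket are recovered as
+ * follows; kcf_reqid_table[] stands in for the scheduler's array of
+ * per-table pointers.
+ *
+ *	kcf_reqid_table_t *rt = kcf_reqid_table[id & REQID_TABLE_MASK];
+ *	kcf_areq_node_t *areq = rt->rt_idhash[REQID_HASH(id)];
+ */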
+
+/*
+ * Global software provider queue structure. Requests to be
+ * handled by a SW provider and have the ALWAYS_QUEUE flag set
+ * get queued here.
+ */
+typedef struct kcf_global_swq {
+ /*
+ * gs_cv and gs_lock are used to wait for new requests.
+ * gs_lock protects the changes to the queue.
+ */
+ kcondvar_t gs_cv;
+ kmutex_t gs_lock;
+ uint_t gs_njobs;
+ uint_t gs_maxjobs;
+ kcf_areq_node_t *gs_first;
+ kcf_areq_node_t *gs_last;
+} kcf_global_swq_t;
+
+
+/*
+ * Internal representation of a canonical context. The crypto_ctx_t
+ * structure is embedded so that only one memory allocation is needed.
+ * The SPI's ((crypto_ctx_t *)ctx)->cc_framework_private maps to this
+ * structure.
+ */
+typedef struct kcf_context {
+ crypto_ctx_t kc_glbl_ctx;
+ uint_t kc_refcnt;
+ kmutex_t kc_in_use_lock;
+ /*
+ * kc_req_chain_first and kc_req_chain_last are used to chain
+ * multiple async requests using the same context. They should be
+ * NULL for sync requests.
+ */
+ kcf_areq_node_t *kc_req_chain_first;
+ kcf_areq_node_t *kc_req_chain_last;
+ kcf_provider_desc_t *kc_prov_desc; /* Prov. descriptor */
+ kcf_provider_desc_t *kc_sw_prov_desc; /* Prov. descriptor */
+ kcf_mech_entry_t *kc_mech;
+ struct kcf_context *kc_secondctx; /* for dual contexts */
+} kcf_context_t;
+
+/*
+ * Bump up the reference count on the framework private context. A
+ * global context or a request that references this structure should
+ * do a hold.
+ */
+#define KCF_CONTEXT_REFHOLD(ictx) { \
+ atomic_add_32(&(ictx)->kc_refcnt, 1); \
+ ASSERT((ictx)->kc_refcnt != 0); \
+}
+
+/*
+ * Decrement the reference count on the framework private context.
+ * When the last reference is released, the framework private
+ * context structure is freed along with the global context.
+ */
+#define KCF_CONTEXT_REFRELE(ictx) { \
+ ASSERT((ictx)->kc_refcnt != 0); \
+ membar_exit(); \
+ if (atomic_add_32_nv(&(ictx)->kc_refcnt, -1) == 0) \
+ kcf_free_context(ictx); \
+}
+
+/*
+ * Check if we can release the context now. In the case of CRYPTO_QUEUED,
+ * we do not release it, since we can do so only after the provider has
+ * notified us. In the case of CRYPTO_BUSY, the client can retry the
+ * request using the context, so we do not release the context.
+ *
+ * This macro should be called only from the final routine in
+ * an init/update/final sequence. We do not release the context in case
+ * of update operations. We require the consumer to free it
+ * explicitly, in case it wants to abandon the operation. This is done
+ * as there may be mechanisms in ECB mode that can continue even if
+ * an operation on a block fails.
+ */
+#define KCF_CONTEXT_COND_RELEASE(rv, kcf_ctx) { \
+ if (KCF_CONTEXT_DONE(rv)) \
+ KCF_CONTEXT_REFRELE(kcf_ctx); \
+}
+
+/*
+ * This macro determines whether we're done with a context.
+ */
+#define KCF_CONTEXT_DONE(rv) \
+ ((rv) != CRYPTO_QUEUED && (rv) != CRYPTO_BUSY && \
+ (rv) != CRYPTO_BUFFER_TOO_SMALL)
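+
+/*
+ * Illustrative sketch (assumption): a k-api *_final() routine would
+ * typically apply the conditional release right after submitting the
+ * request, so the context survives CRYPTO_QUEUED, CRYPTO_BUSY and
+ * CRYPTO_BUFFER_TOO_SMALL returns:
+ *
+ *	error = kcf_submit_request(pd, ctx, crq, &params, B_FALSE);
+ *	KCF_CONTEXT_COND_RELEASE(error, kcf_ctx);
+ *	return (error);
+ */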
+
+/*
+ * A crypto_ctx_template_t is internally a pointer to this struct
+ */
+typedef struct kcf_ctx_template {
+ crypto_kcf_provider_handle_t ct_prov_handle; /* provider handle */
+ uint_t ct_generation; /* generation # */
+ size_t ct_size; /* for freeing */
+ crypto_spi_ctx_template_t ct_prov_tmpl; /* context template */
+ /* from the SW prov */
+} kcf_ctx_template_t;
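+
+/*
+ * Illustrative sketch (assumption): when a caller hands the opaque
+ * crypto_ctx_template_t back to the framework, it is simply cast to
+ * this structure and the provider's private portion is extracted:
+ *
+ *	kcf_ctx_template_t *ctx_tmpl = (kcf_ctx_template_t *)tmpl;
+ *	crypto_spi_ctx_template_t spi_tmpl =
+ *	    (ctx_tmpl == NULL) ? NULL : ctx_tmpl->ct_prov_tmpl;
+ */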
+
+/*
+ * Structure for pool of threads working on global software queue.
+ */
+typedef struct kcf_pool {
+ uint32_t kp_threads; /* Number of threads in pool */
+ uint32_t kp_idlethreads; /* Idle threads in pool */
+ uint32_t kp_blockedthreads; /* Blocked threads in pool */
+
+ /*
+ * cv & lock to monitor the condition when no threads
+ * are around. In this case the failover thread kicks in.
+ */
+ kcondvar_t kp_nothr_cv;
+ kmutex_t kp_thread_lock;
+
+ /* Userspace thread creator variables. */
+ boolean_t kp_signal_create_thread; /* Create requested flag */
+ int kp_nthrs; /* # of threads to create */
+ boolean_t kp_user_waiting; /* Thread waiting for work */
+
+ /*
+ * cv & lock for the condition where more threads need to be
+ * created. kp_user_lock also protects the three fields above.
+ */
+ kcondvar_t kp_user_cv; /* Creator cond. variable */
+ kmutex_t kp_user_lock; /* Creator lock */
+} kcf_pool_t;
+
+
+/*
+ * State of a crypto bufcall element.
+ */
+typedef enum cbuf_state {
+ CBUF_FREE = 1,
+ CBUF_WAITING,
+ CBUF_RUNNING
+} cbuf_state_t;
+
+/*
+ * Structure of a crypto bufcall element.
+ */
+typedef struct kcf_cbuf_elem {
+ /*
+ * lock and cv to wait for CBUF_RUNNING to be done
+ * kc_lock also protects kc_state.
+ */
+ kmutex_t kc_lock;
+ kcondvar_t kc_cv;
+ cbuf_state_t kc_state;
+
+ struct kcf_cbuf_elem *kc_next;
+ struct kcf_cbuf_elem *kc_prev;
+
+ void (*kc_func)(void *arg);
+ void *kc_arg;
+} kcf_cbuf_elem_t;
+
+/*
+ * State of a notify element.
+ */
+typedef enum ntfy_elem_state {
+ NTFY_WAITING = 1,
+ NTFY_RUNNING
+} ntfy_elem_state_t;
+
+/*
+ * Structure of a notify list element.
+ */
+typedef struct kcf_ntfy_elem {
+ /*
+ * lock and cv to wait for NTFY_RUNNING to be done.
+ * kn_lock also protects kn_state.
+ */
+ kmutex_t kn_lock;
+ kcondvar_t kn_cv;
+ ntfy_elem_state_t kn_state;
+
+ struct kcf_ntfy_elem *kn_next;
+ struct kcf_ntfy_elem *kn_prev;
+
+ crypto_notify_callback_t kn_func;
+ uint32_t kn_event_mask;
+} kcf_ntfy_elem_t;
+
+
+/*
+ * The following values are based on the assumption that it would
+ * take around eight CPUs to load a hardware provider (this is true for
+ * at least one product) and that a kernel client may come from different
+ * low-priority interrupt levels. We keep CRYPTO_TASKQ_MIN cached taskq
+ * entries. The CRYPTO_TASKQ_MAX number is based on a throughput of
+ * 1 GB/s using 512-byte buffers. These are just reasonable estimates and
+ * might need to change in the future.
+ */
+#define CRYPTO_TASKQ_THREADS 8
+#define CRYPTO_TASKQ_MIN 64
+#define	CRYPTO_TASKQ_MAX	(2 * 1024 * 1024)
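+
+/*
+ * For reference (assumption about the arithmetic behind the figure):
+ * 1 GB/s of 512-byte buffers is 2^30 / 2^9 = 2 * 1024 * 1024 requests
+ * per second, which is where the CRYPTO_TASKQ_MAX value above comes from.
+ */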
+
+extern int crypto_taskq_threads;
+extern int crypto_taskq_minalloc;
+extern int crypto_taskq_maxalloc;
+extern kcf_global_swq_t *gswq;
+extern int kcf_maxthreads;
+extern int kcf_minthreads;
+
+/*
+ * All pending crypto bufcalls are put on a list. cbuf_list_lock
+ * protects changes to this list.
+ */
+extern kmutex_t cbuf_list_lock;
+extern kcondvar_t cbuf_list_cv;
+
+/*
+ * All event subscribers are put on a list. kcf_notify_list_lock
+ * protects changes to this list.
+ */
+extern kmutex_t ntfy_list_lock;
+extern kcondvar_t ntfy_list_cv;
+
+boolean_t kcf_get_next_logical_provider_member(kcf_provider_desc_t *,
+ kcf_provider_desc_t *, kcf_provider_desc_t **);
+extern int kcf_get_hardware_provider(crypto_mech_type_t, crypto_mech_type_t,
+ boolean_t, kcf_provider_desc_t *, kcf_provider_desc_t **,
+ crypto_func_group_t);
+extern int kcf_get_hardware_provider_nomech(offset_t, offset_t,
+ boolean_t, kcf_provider_desc_t *, kcf_provider_desc_t **);
+extern void kcf_free_triedlist(kcf_prov_tried_t *);
+extern kcf_prov_tried_t *kcf_insert_triedlist(kcf_prov_tried_t **,
+ kcf_provider_desc_t *, int);
+extern kcf_provider_desc_t *kcf_get_mech_provider(crypto_mech_type_t,
+ kcf_mech_entry_t **, int *, kcf_prov_tried_t *, crypto_func_group_t,
+ boolean_t, size_t);
+extern kcf_provider_desc_t *kcf_get_dual_provider(crypto_mechanism_t *,
+ crypto_mechanism_t *, kcf_mech_entry_t **, crypto_mech_type_t *,
+ crypto_mech_type_t *, int *, kcf_prov_tried_t *,
+ crypto_func_group_t, crypto_func_group_t, boolean_t, size_t);
+extern crypto_ctx_t *kcf_new_ctx(crypto_call_req_t *, kcf_provider_desc_t *,
+ crypto_session_id_t);
+extern int kcf_submit_request(kcf_provider_desc_t *, crypto_ctx_t *,
+ crypto_call_req_t *, kcf_req_params_t *, boolean_t);
+extern void kcf_sched_destroy(void);
+extern void kcf_sched_init(void);
+extern void kcf_sched_start(void);
+extern void kcf_sop_done(kcf_sreq_node_t *, int);
+extern void kcf_aop_done(kcf_areq_node_t *, int);
+extern int common_submit_request(kcf_provider_desc_t *,
+ crypto_ctx_t *, kcf_req_params_t *, crypto_req_handle_t);
+extern void kcf_free_context(kcf_context_t *);
+
+extern int kcf_svc_wait(int *);
+extern int kcf_svc_do_run(void);
+extern int kcf_need_signature_verification(kcf_provider_desc_t *);
+extern void kcf_verify_signature(void *);
+extern struct modctl *kcf_get_modctl(crypto_provider_info_t *);
+extern void verify_unverified_providers(void);
+extern void kcf_free_req(kcf_areq_node_t *areq);
+extern void crypto_bufcall_service(void);
+
+extern void kcf_walk_ntfylist(uint32_t, void *);
+extern void kcf_do_notify(kcf_provider_desc_t *, boolean_t);
+
+extern kcf_dual_req_t *kcf_alloc_req(crypto_call_req_t *);
+extern void kcf_next_req(void *, int);
+extern void kcf_last_req(void *, int);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_CRYPTO_SCHED_IMPL_H */
diff --git a/sys/contrib/openzfs/module/icp/include/sys/crypto/spi.h b/sys/contrib/openzfs/module/icp/include/sys/crypto/spi.h
new file mode 100644
index 000000000000..2c62b5706651
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/include/sys/crypto/spi.h
@@ -0,0 +1,726 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_CRYPTO_SPI_H
+#define _SYS_CRYPTO_SPI_H
+
+/*
+ * CSPI: Cryptographic Service Provider Interface.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/crypto/common.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef CONSTIFY_PLUGIN
+#define __no_const __attribute__((no_const))
+#else
+#define __no_const
+#endif /* CONSTIFY_PLUGIN */
+
+#define CRYPTO_SPI_VERSION_1 1
+#define CRYPTO_SPI_VERSION_2 2
+#define CRYPTO_SPI_VERSION_3 3
+
+/*
+ * Provider-private handle. This handle is specified by a provider
+ * when it registers by means of the pi_provider_handle field of
+ * the crypto_provider_info structure, and passed to the provider
+ * when its entry points are invoked.
+ */
+typedef void *crypto_provider_handle_t;
+
+/*
+ * Context templates can be used by software providers to pre-process
+ * keying material, such as key schedules. They are allocated by
+ * a software provider's create_ctx_template(9E) entry point, and passed
+ * as an argument to initialization and atomic provider entry points.
+ */
+typedef void *crypto_spi_ctx_template_t;
+
+/*
+ * Request handles are used by the kernel to identify an asynchronous
+ * request being processed by a provider. A handle is passed by the kernel
+ * to a hardware provider when submitting a request, and must be
+ * specified by the provider when calling crypto_op_notification(9F).
+ */
+typedef void *crypto_req_handle_t;
+
+/* Values for cc_flags field */
+#define CRYPTO_INIT_OPSTATE 0x00000001 /* allocate and init cc_opstate */
+#define CRYPTO_USE_OPSTATE 0x00000002 /* .. start using it as context */
+
+/*
+ * The context structure is passed from the kernel to a provider.
+ * It contains the information needed to process a multi-part or
+ * single part operation. The context structure is not used
+ * by atomic operations.
+ *
+ * Parameters needed to perform a cryptographic operation, such
+ * as keys, mechanisms, input and output buffers, are passed
+ * as separate arguments to Provider routines.
+ */
+typedef struct crypto_ctx {
+ crypto_provider_handle_t cc_provider;
+ crypto_session_id_t cc_session;
+ void *cc_provider_private; /* owned by provider */
+ void *cc_framework_private; /* owned by framework */
+ uint32_t cc_flags; /* flags */
+ void *cc_opstate; /* state */
+} crypto_ctx_t;
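+
+/*
+ * Illustrative sketch (assumption): a software provider usually stores
+ * its per-operation state behind cc_provider_private in its _init()
+ * entry point and retrieves it in _update()/_final(), e.g.:
+ *
+ *	ctx->cc_provider_private = state;	(in encrypt_init())
+ *	state = ctx->cc_provider_private;	(in encrypt_update())
+ */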
+
+/*
+ * Extended provider information.
+ */
+
+/*
+ * Valid values for the ei_flags field of the extended info structure.
+ * They match the RSA Security, Inc. PKCS#11 tokenInfo flags.
+ */
+#define CRYPTO_EXTF_RNG 0x00000001
+#define CRYPTO_EXTF_WRITE_PROTECTED 0x00000002
+#define CRYPTO_EXTF_LOGIN_REQUIRED 0x00000004
+#define CRYPTO_EXTF_USER_PIN_INITIALIZED 0x00000008
+#define CRYPTO_EXTF_CLOCK_ON_TOKEN 0x00000040
+#define CRYPTO_EXTF_PROTECTED_AUTHENTICATION_PATH 0x00000100
+#define CRYPTO_EXTF_DUAL_CRYPTO_OPERATIONS 0x00000200
+#define CRYPTO_EXTF_TOKEN_INITIALIZED 0x00000400
+#define CRYPTO_EXTF_USER_PIN_COUNT_LOW 0x00010000
+#define CRYPTO_EXTF_USER_PIN_FINAL_TRY 0x00020000
+#define CRYPTO_EXTF_USER_PIN_LOCKED 0x00040000
+#define CRYPTO_EXTF_USER_PIN_TO_BE_CHANGED 0x00080000
+#define CRYPTO_EXTF_SO_PIN_COUNT_LOW 0x00100000
+#define CRYPTO_EXTF_SO_PIN_FINAL_TRY 0x00200000
+#define CRYPTO_EXTF_SO_PIN_LOCKED 0x00400000
+#define CRYPTO_EXTF_SO_PIN_TO_BE_CHANGED 0x00800000
+
+/*
+ * The crypto_control_ops structure contains pointers to control
+ * operations for cryptographic providers. It is passed through
+ * the crypto_ops(9S) structure when providers register with the
+ * kernel using crypto_register_provider(9F).
+ */
+typedef struct crypto_control_ops {
+ void (*provider_status)(crypto_provider_handle_t, uint_t *);
+} __no_const crypto_control_ops_t;
+
+/*
+ * The crypto_ctx_ops structure contains pointers to context and
+ * context-template management operations for cryptographic providers. It is
+ * passed through the crypto_ops(9S) structure when providers register
+ * with the kernel using crypto_register_provider(9F).
+ */
+typedef struct crypto_ctx_ops {
+ int (*create_ctx_template)(crypto_provider_handle_t,
+ crypto_mechanism_t *, crypto_key_t *,
+ crypto_spi_ctx_template_t *, size_t *, crypto_req_handle_t);
+ int (*free_context)(crypto_ctx_t *);
+} __no_const crypto_ctx_ops_t;
+
+/*
+ * The crypto_digest_ops structure contains pointers to digest
+ * operations for cryptographic providers. It is passed through
+ * the crypto_ops(9S) structure when providers register with the
+ * kernel using crypto_register_provider(9F).
+ */
+typedef struct crypto_digest_ops {
+ int (*digest_init)(crypto_ctx_t *, crypto_mechanism_t *,
+ crypto_req_handle_t);
+ int (*digest)(crypto_ctx_t *, crypto_data_t *, crypto_data_t *,
+ crypto_req_handle_t);
+ int (*digest_update)(crypto_ctx_t *, crypto_data_t *,
+ crypto_req_handle_t);
+ int (*digest_key)(crypto_ctx_t *, crypto_key_t *, crypto_req_handle_t);
+ int (*digest_final)(crypto_ctx_t *, crypto_data_t *,
+ crypto_req_handle_t);
+ int (*digest_atomic)(crypto_provider_handle_t, crypto_session_id_t,
+ crypto_mechanism_t *, crypto_data_t *,
+ crypto_data_t *, crypto_req_handle_t);
+} __no_const crypto_digest_ops_t;
+
+/*
+ * The crypto_cipher_ops structure contains pointers to encryption
+ * and decryption operations for cryptographic providers. It is
+ * passed through the crypto_ops(9S) structure when providers register
+ * with the kernel using crypto_register_provider(9F).
+ */
+typedef struct crypto_cipher_ops {
+ int (*encrypt_init)(crypto_ctx_t *,
+ crypto_mechanism_t *, crypto_key_t *,
+ crypto_spi_ctx_template_t, crypto_req_handle_t);
+ int (*encrypt)(crypto_ctx_t *,
+ crypto_data_t *, crypto_data_t *, crypto_req_handle_t);
+ int (*encrypt_update)(crypto_ctx_t *,
+ crypto_data_t *, crypto_data_t *, crypto_req_handle_t);
+ int (*encrypt_final)(crypto_ctx_t *,
+ crypto_data_t *, crypto_req_handle_t);
+ int (*encrypt_atomic)(crypto_provider_handle_t, crypto_session_id_t,
+ crypto_mechanism_t *, crypto_key_t *, crypto_data_t *,
+ crypto_data_t *, crypto_spi_ctx_template_t, crypto_req_handle_t);
+
+ int (*decrypt_init)(crypto_ctx_t *,
+ crypto_mechanism_t *, crypto_key_t *,
+ crypto_spi_ctx_template_t, crypto_req_handle_t);
+ int (*decrypt)(crypto_ctx_t *,
+ crypto_data_t *, crypto_data_t *, crypto_req_handle_t);
+ int (*decrypt_update)(crypto_ctx_t *,
+ crypto_data_t *, crypto_data_t *, crypto_req_handle_t);
+ int (*decrypt_final)(crypto_ctx_t *,
+ crypto_data_t *, crypto_req_handle_t);
+ int (*decrypt_atomic)(crypto_provider_handle_t, crypto_session_id_t,
+ crypto_mechanism_t *, crypto_key_t *, crypto_data_t *,
+ crypto_data_t *, crypto_spi_ctx_template_t, crypto_req_handle_t);
+} __no_const crypto_cipher_ops_t;
+
+/*
+ * The crypto_mac_ops structure contains pointers to MAC
+ * operations for cryptographic providers. It is passed through
+ * the crypto_ops(9S) structure when providers register with the
+ * kernel using crypto_register_provider(9F).
+ */
+typedef struct crypto_mac_ops {
+ int (*mac_init)(crypto_ctx_t *,
+ crypto_mechanism_t *, crypto_key_t *,
+ crypto_spi_ctx_template_t, crypto_req_handle_t);
+ int (*mac)(crypto_ctx_t *,
+ crypto_data_t *, crypto_data_t *, crypto_req_handle_t);
+ int (*mac_update)(crypto_ctx_t *,
+ crypto_data_t *, crypto_req_handle_t);
+ int (*mac_final)(crypto_ctx_t *,
+ crypto_data_t *, crypto_req_handle_t);
+ int (*mac_atomic)(crypto_provider_handle_t, crypto_session_id_t,
+ crypto_mechanism_t *, crypto_key_t *, crypto_data_t *,
+ crypto_data_t *, crypto_spi_ctx_template_t,
+ crypto_req_handle_t);
+ int (*mac_verify_atomic)(crypto_provider_handle_t, crypto_session_id_t,
+ crypto_mechanism_t *, crypto_key_t *, crypto_data_t *,
+ crypto_data_t *, crypto_spi_ctx_template_t,
+ crypto_req_handle_t);
+} __no_const crypto_mac_ops_t;
+
+/*
+ * The crypto_sign_ops structure contains pointers to signing
+ * operations for cryptographic providers. It is passed through
+ * the crypto_ops(9S) structure when providers register with the
+ * kernel using crypto_register_provider(9F).
+ */
+typedef struct crypto_sign_ops {
+ int (*sign_init)(crypto_ctx_t *,
+ crypto_mechanism_t *, crypto_key_t *, crypto_spi_ctx_template_t,
+ crypto_req_handle_t);
+ int (*sign)(crypto_ctx_t *,
+ crypto_data_t *, crypto_data_t *, crypto_req_handle_t);
+ int (*sign_update)(crypto_ctx_t *,
+ crypto_data_t *, crypto_req_handle_t);
+ int (*sign_final)(crypto_ctx_t *,
+ crypto_data_t *, crypto_req_handle_t);
+ int (*sign_atomic)(crypto_provider_handle_t, crypto_session_id_t,
+ crypto_mechanism_t *, crypto_key_t *, crypto_data_t *,
+ crypto_data_t *, crypto_spi_ctx_template_t,
+ crypto_req_handle_t);
+ int (*sign_recover_init)(crypto_ctx_t *, crypto_mechanism_t *,
+ crypto_key_t *, crypto_spi_ctx_template_t,
+ crypto_req_handle_t);
+ int (*sign_recover)(crypto_ctx_t *,
+ crypto_data_t *, crypto_data_t *, crypto_req_handle_t);
+ int (*sign_recover_atomic)(crypto_provider_handle_t,
+ crypto_session_id_t, crypto_mechanism_t *, crypto_key_t *,
+ crypto_data_t *, crypto_data_t *, crypto_spi_ctx_template_t,
+ crypto_req_handle_t);
+} __no_const crypto_sign_ops_t;
+
+/*
+ * The crypto_verify_ops structure contains pointers to verify
+ * operations for cryptographic providers. It is passed through
+ * the crypto_ops(9S) structure when providers register with the
+ * kernel using crypto_register_provider(9F).
+ */
+typedef struct crypto_verify_ops {
+ int (*verify_init)(crypto_ctx_t *,
+ crypto_mechanism_t *, crypto_key_t *, crypto_spi_ctx_template_t,
+ crypto_req_handle_t);
+ int (*do_verify)(crypto_ctx_t *,
+ crypto_data_t *, crypto_data_t *, crypto_req_handle_t);
+ int (*verify_update)(crypto_ctx_t *,
+ crypto_data_t *, crypto_req_handle_t);
+ int (*verify_final)(crypto_ctx_t *,
+ crypto_data_t *, crypto_req_handle_t);
+ int (*verify_atomic)(crypto_provider_handle_t, crypto_session_id_t,
+ crypto_mechanism_t *, crypto_key_t *, crypto_data_t *,
+ crypto_data_t *, crypto_spi_ctx_template_t,
+ crypto_req_handle_t);
+ int (*verify_recover_init)(crypto_ctx_t *, crypto_mechanism_t *,
+ crypto_key_t *, crypto_spi_ctx_template_t,
+ crypto_req_handle_t);
+ int (*verify_recover)(crypto_ctx_t *,
+ crypto_data_t *, crypto_data_t *, crypto_req_handle_t);
+ int (*verify_recover_atomic)(crypto_provider_handle_t,
+ crypto_session_id_t, crypto_mechanism_t *, crypto_key_t *,
+ crypto_data_t *, crypto_data_t *, crypto_spi_ctx_template_t,
+ crypto_req_handle_t);
+} __no_const crypto_verify_ops_t;
+
+/*
+ * The crypto_dual_ops structure contains pointers to dual
+ * cipher and sign/verify operations for cryptographic providers.
+ * It is passed through the crypto_ops(9S) structure when
+ * providers register with the kernel using
+ * crypto_register_provider(9F).
+ */
+typedef struct crypto_dual_ops {
+ int (*digest_encrypt_update)(
+ crypto_ctx_t *, crypto_ctx_t *, crypto_data_t *,
+ crypto_data_t *, crypto_req_handle_t);
+ int (*decrypt_digest_update)(
+ crypto_ctx_t *, crypto_ctx_t *, crypto_data_t *,
+ crypto_data_t *, crypto_req_handle_t);
+ int (*sign_encrypt_update)(
+ crypto_ctx_t *, crypto_ctx_t *, crypto_data_t *,
+ crypto_data_t *, crypto_req_handle_t);
+ int (*decrypt_verify_update)(
+ crypto_ctx_t *, crypto_ctx_t *, crypto_data_t *,
+ crypto_data_t *, crypto_req_handle_t);
+} __no_const crypto_dual_ops_t;
+
+/*
+ * The crypto_dual_cipher_mac_ops structure contains pointers to dual
+ * cipher and MAC operations for cryptographic providers.
+ * It is passed through the crypto_ops(9S) structure when
+ * providers register with the kernel using
+ * crypto_register_provider(9F).
+ */
+typedef struct crypto_dual_cipher_mac_ops {
+ int (*encrypt_mac_init)(crypto_ctx_t *,
+ crypto_mechanism_t *, crypto_key_t *, crypto_mechanism_t *,
+ crypto_key_t *, crypto_spi_ctx_template_t,
+ crypto_spi_ctx_template_t, crypto_req_handle_t);
+ int (*encrypt_mac)(crypto_ctx_t *,
+ crypto_data_t *, crypto_dual_data_t *, crypto_data_t *,
+ crypto_req_handle_t);
+ int (*encrypt_mac_update)(crypto_ctx_t *,
+ crypto_data_t *, crypto_dual_data_t *, crypto_req_handle_t);
+ int (*encrypt_mac_final)(crypto_ctx_t *,
+ crypto_dual_data_t *, crypto_data_t *, crypto_req_handle_t);
+ int (*encrypt_mac_atomic)(crypto_provider_handle_t, crypto_session_id_t,
+ crypto_mechanism_t *, crypto_key_t *, crypto_mechanism_t *,
+ crypto_key_t *, crypto_data_t *, crypto_dual_data_t *,
+ crypto_data_t *, crypto_spi_ctx_template_t,
+ crypto_spi_ctx_template_t, crypto_req_handle_t);
+
+ int (*mac_decrypt_init)(crypto_ctx_t *,
+ crypto_mechanism_t *, crypto_key_t *, crypto_mechanism_t *,
+ crypto_key_t *, crypto_spi_ctx_template_t,
+ crypto_spi_ctx_template_t, crypto_req_handle_t);
+ int (*mac_decrypt)(crypto_ctx_t *,
+ crypto_dual_data_t *, crypto_data_t *, crypto_data_t *,
+ crypto_req_handle_t);
+ int (*mac_decrypt_update)(crypto_ctx_t *,
+ crypto_dual_data_t *, crypto_data_t *, crypto_req_handle_t);
+ int (*mac_decrypt_final)(crypto_ctx_t *,
+ crypto_data_t *, crypto_data_t *, crypto_req_handle_t);
+ int (*mac_decrypt_atomic)(crypto_provider_handle_t,
+ crypto_session_id_t, crypto_mechanism_t *, crypto_key_t *,
+ crypto_mechanism_t *, crypto_key_t *, crypto_dual_data_t *,
+ crypto_data_t *, crypto_data_t *, crypto_spi_ctx_template_t,
+ crypto_spi_ctx_template_t, crypto_req_handle_t);
+ int (*mac_verify_decrypt_atomic)(crypto_provider_handle_t,
+ crypto_session_id_t, crypto_mechanism_t *, crypto_key_t *,
+ crypto_mechanism_t *, crypto_key_t *, crypto_dual_data_t *,
+ crypto_data_t *, crypto_data_t *, crypto_spi_ctx_template_t,
+ crypto_spi_ctx_template_t, crypto_req_handle_t);
+} __no_const crypto_dual_cipher_mac_ops_t;
+
+/*
+ * The crypto_random_number_ops structure contains pointers to random
+ * number operations for cryptographic providers. It is passed through
+ * the crypto_ops(9S) structure when providers register with the
+ * kernel using crypto_register_provider(9F).
+ */
+typedef struct crypto_random_number_ops {
+ int (*seed_random)(crypto_provider_handle_t, crypto_session_id_t,
+ uchar_t *, size_t, uint_t, uint32_t, crypto_req_handle_t);
+ int (*generate_random)(crypto_provider_handle_t, crypto_session_id_t,
+ uchar_t *, size_t, crypto_req_handle_t);
+} __no_const crypto_random_number_ops_t;
+
+/*
+ * Flag values for seed_random.
+ */
+#define CRYPTO_SEED_NOW 0x00000001
+
+/*
+ * The crypto_session_ops structure contains pointers to session
+ * operations for cryptographic providers. It is passed through
+ * the crypto_ops(9S) structure when providers register with the
+ * kernel using crypto_register_provider(9F).
+ */
+typedef struct crypto_session_ops {
+ int (*session_open)(crypto_provider_handle_t, crypto_session_id_t *,
+ crypto_req_handle_t);
+ int (*session_close)(crypto_provider_handle_t, crypto_session_id_t,
+ crypto_req_handle_t);
+ int (*session_login)(crypto_provider_handle_t, crypto_session_id_t,
+ crypto_user_type_t, char *, size_t, crypto_req_handle_t);
+ int (*session_logout)(crypto_provider_handle_t, crypto_session_id_t,
+ crypto_req_handle_t);
+} __no_const crypto_session_ops_t;
+
+/*
+ * The crypto_object_ops structure contains pointers to object
+ * operations for cryptographic providers. It is passed through
+ * the crypto_ops(9S) structure when providers register with the
+ * kernel using crypto_register_provider(9F).
+ */
+typedef struct crypto_object_ops {
+ int (*object_create)(crypto_provider_handle_t, crypto_session_id_t,
+ crypto_object_attribute_t *, uint_t, crypto_object_id_t *,
+ crypto_req_handle_t);
+ int (*object_copy)(crypto_provider_handle_t, crypto_session_id_t,
+ crypto_object_id_t, crypto_object_attribute_t *, uint_t,
+ crypto_object_id_t *, crypto_req_handle_t);
+ int (*object_destroy)(crypto_provider_handle_t, crypto_session_id_t,
+ crypto_object_id_t, crypto_req_handle_t);
+ int (*object_get_size)(crypto_provider_handle_t, crypto_session_id_t,
+ crypto_object_id_t, size_t *, crypto_req_handle_t);
+ int (*object_get_attribute_value)(crypto_provider_handle_t,
+ crypto_session_id_t, crypto_object_id_t,
+ crypto_object_attribute_t *, uint_t, crypto_req_handle_t);
+ int (*object_set_attribute_value)(crypto_provider_handle_t,
+ crypto_session_id_t, crypto_object_id_t,
+ crypto_object_attribute_t *, uint_t, crypto_req_handle_t);
+ int (*object_find_init)(crypto_provider_handle_t, crypto_session_id_t,
+ crypto_object_attribute_t *, uint_t, void **,
+ crypto_req_handle_t);
+ int (*object_find)(crypto_provider_handle_t, void *,
+ crypto_object_id_t *, uint_t, uint_t *, crypto_req_handle_t);
+ int (*object_find_final)(crypto_provider_handle_t, void *,
+ crypto_req_handle_t);
+} __no_const crypto_object_ops_t;
+
+/*
+ * The crypto_key_ops structure contains pointers to key
+ * operations for cryptographic providers. It is passed through
+ * the crypto_ops(9S) structure when providers register with the
+ * kernel using crypto_register_provider(9F).
+ */
+typedef struct crypto_key_ops {
+ int (*key_generate)(crypto_provider_handle_t, crypto_session_id_t,
+ crypto_mechanism_t *, crypto_object_attribute_t *, uint_t,
+ crypto_object_id_t *, crypto_req_handle_t);
+ int (*key_generate_pair)(crypto_provider_handle_t, crypto_session_id_t,
+ crypto_mechanism_t *, crypto_object_attribute_t *, uint_t,
+ crypto_object_attribute_t *, uint_t, crypto_object_id_t *,
+ crypto_object_id_t *, crypto_req_handle_t);
+ int (*key_wrap)(crypto_provider_handle_t, crypto_session_id_t,
+ crypto_mechanism_t *, crypto_key_t *, crypto_object_id_t *,
+ uchar_t *, size_t *, crypto_req_handle_t);
+ int (*key_unwrap)(crypto_provider_handle_t, crypto_session_id_t,
+ crypto_mechanism_t *, crypto_key_t *, uchar_t *, size_t *,
+ crypto_object_attribute_t *, uint_t,
+ crypto_object_id_t *, crypto_req_handle_t);
+ int (*key_derive)(crypto_provider_handle_t, crypto_session_id_t,
+ crypto_mechanism_t *, crypto_key_t *, crypto_object_attribute_t *,
+ uint_t, crypto_object_id_t *, crypto_req_handle_t);
+ int (*key_check)(crypto_provider_handle_t, crypto_mechanism_t *,
+ crypto_key_t *);
+} __no_const crypto_key_ops_t;
+
+/*
+ * The crypto_provider_management_ops structure contains pointers
+ * to management operations for cryptographic providers. It is passed
+ * through the crypto_ops(9S) structure when providers register with the
+ * kernel using crypto_register_provider(9F).
+ */
+typedef struct crypto_provider_management_ops {
+ int (*ext_info)(crypto_provider_handle_t,
+ crypto_provider_ext_info_t *, crypto_req_handle_t);
+ int (*init_token)(crypto_provider_handle_t, char *, size_t,
+ char *, crypto_req_handle_t);
+ int (*init_pin)(crypto_provider_handle_t, crypto_session_id_t,
+ char *, size_t, crypto_req_handle_t);
+ int (*set_pin)(crypto_provider_handle_t, crypto_session_id_t,
+ char *, size_t, char *, size_t, crypto_req_handle_t);
+} __no_const crypto_provider_management_ops_t;
+
+typedef struct crypto_mech_ops {
+ int (*copyin_mechanism)(crypto_provider_handle_t,
+ crypto_mechanism_t *, crypto_mechanism_t *, int *, int);
+ int (*copyout_mechanism)(crypto_provider_handle_t,
+ crypto_mechanism_t *, crypto_mechanism_t *, int *, int);
+ int (*free_mechanism)(crypto_provider_handle_t, crypto_mechanism_t *);
+} __no_const crypto_mech_ops_t;
+
+typedef struct crypto_nostore_key_ops {
+ int (*nostore_key_generate)(crypto_provider_handle_t,
+ crypto_session_id_t, crypto_mechanism_t *,
+ crypto_object_attribute_t *, uint_t, crypto_object_attribute_t *,
+ uint_t, crypto_req_handle_t);
+ int (*nostore_key_generate_pair)(crypto_provider_handle_t,
+ crypto_session_id_t, crypto_mechanism_t *,
+ crypto_object_attribute_t *, uint_t, crypto_object_attribute_t *,
+ uint_t, crypto_object_attribute_t *, uint_t,
+ crypto_object_attribute_t *, uint_t, crypto_req_handle_t);
+ int (*nostore_key_derive)(crypto_provider_handle_t, crypto_session_id_t,
+ crypto_mechanism_t *, crypto_key_t *, crypto_object_attribute_t *,
+ uint_t, crypto_object_attribute_t *, uint_t, crypto_req_handle_t);
+} __no_const crypto_nostore_key_ops_t;
+
+/*
+ * The crypto_ops(9S) structure contains pointers to the operations
+ * structures that hold the entry points implemented by cryptographic
+ * providers.
+ * It is specified as part of the crypto_provider_info(9S)
+ * supplied by a provider when it registers with the kernel
+ * by calling crypto_register_provider(9F).
+ */
+typedef struct crypto_ops_v1 {
+ crypto_control_ops_t *co_control_ops;
+ crypto_digest_ops_t *co_digest_ops;
+ crypto_cipher_ops_t *co_cipher_ops;
+ crypto_mac_ops_t *co_mac_ops;
+ crypto_sign_ops_t *co_sign_ops;
+ crypto_verify_ops_t *co_verify_ops;
+ crypto_dual_ops_t *co_dual_ops;
+ crypto_dual_cipher_mac_ops_t *co_dual_cipher_mac_ops;
+ crypto_random_number_ops_t *co_random_ops;
+ crypto_session_ops_t *co_session_ops;
+ crypto_object_ops_t *co_object_ops;
+ crypto_key_ops_t *co_key_ops;
+ crypto_provider_management_ops_t *co_provider_ops;
+ crypto_ctx_ops_t *co_ctx_ops;
+} crypto_ops_v1_t;
+
+typedef struct crypto_ops_v2 {
+ crypto_ops_v1_t v1_ops;
+ crypto_mech_ops_t *co_mech_ops;
+} crypto_ops_v2_t;
+
+typedef struct crypto_ops_v3 {
+ crypto_ops_v2_t v2_ops;
+ crypto_nostore_key_ops_t *co_nostore_key_ops;
+} crypto_ops_v3_t;
+
+typedef struct crypto_ops {
+ union {
+ crypto_ops_v3_t cou_v3;
+ crypto_ops_v2_t cou_v2;
+ crypto_ops_v1_t cou_v1;
+ } cou;
+} crypto_ops_t;
+
+#define co_control_ops cou.cou_v1.co_control_ops
+#define co_digest_ops cou.cou_v1.co_digest_ops
+#define co_cipher_ops cou.cou_v1.co_cipher_ops
+#define co_mac_ops cou.cou_v1.co_mac_ops
+#define co_sign_ops cou.cou_v1.co_sign_ops
+#define co_verify_ops cou.cou_v1.co_verify_ops
+#define co_dual_ops cou.cou_v1.co_dual_ops
+#define co_dual_cipher_mac_ops cou.cou_v1.co_dual_cipher_mac_ops
+#define co_random_ops cou.cou_v1.co_random_ops
+#define co_session_ops cou.cou_v1.co_session_ops
+#define co_object_ops cou.cou_v1.co_object_ops
+#define co_key_ops cou.cou_v1.co_key_ops
+#define co_provider_ops cou.cou_v1.co_provider_ops
+#define co_ctx_ops cou.cou_v1.co_ctx_ops
+#define co_mech_ops cou.cou_v2.co_mech_ops
+#define co_nostore_key_ops cou.cou_v3.co_nostore_key_ops
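+
+/*
+ * Illustrative sketch (editorial addition, not part of the original header):
+ * because every crypto_ops_vN begins with the previous version, the co_*
+ * accessor macros above let callers reach any ops table through a plain
+ * crypto_ops_t pointer, regardless of which version the provider filled in.
+ * The helper name below is hypothetical.
+ */
+static inline int
+example_ops_has_cipher(const crypto_ops_t *ops)
+{
+ /* co_cipher_ops expands to cou.cou_v1.co_cipher_ops */
+ return (ops != NULL && ops->co_cipher_ops != NULL);
+}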
+
+/*
+ * The mechanism info structure crypto_mech_info_t contains a function group
+ * bit mask cm_func_group_mask. This field, of type crypto_func_group_t,
+ * specifies the provider entry points that can be used with a particular
+ * mechanism. The function group mask is a combination of the following values.
+ */
+
+typedef uint32_t crypto_func_group_t;
+
+
+#define CRYPTO_FG_ENCRYPT 0x00000001 /* encrypt_init() */
+#define CRYPTO_FG_DECRYPT 0x00000002 /* decrypt_init() */
+#define CRYPTO_FG_DIGEST 0x00000004 /* digest_init() */
+#define CRYPTO_FG_SIGN 0x00000008 /* sign_init() */
+#define CRYPTO_FG_SIGN_RECOVER 0x00000010 /* sign_recover_init() */
+#define CRYPTO_FG_VERIFY 0x00000020 /* verify_init() */
+#define CRYPTO_FG_VERIFY_RECOVER 0x00000040 /* verify_recover_init() */
+#define CRYPTO_FG_GENERATE 0x00000080 /* key_generate() */
+#define CRYPTO_FG_GENERATE_KEY_PAIR 0x00000100 /* key_generate_pair() */
+#define CRYPTO_FG_WRAP 0x00000200 /* key_wrap() */
+#define CRYPTO_FG_UNWRAP 0x00000400 /* key_unwrap() */
+#define CRYPTO_FG_DERIVE 0x00000800 /* key_derive() */
+#define CRYPTO_FG_MAC 0x00001000 /* mac_init() */
+#define CRYPTO_FG_ENCRYPT_MAC 0x00002000 /* encrypt_mac_init() */
+#define CRYPTO_FG_MAC_DECRYPT 0x00004000 /* decrypt_mac_init() */
+#define CRYPTO_FG_ENCRYPT_ATOMIC 0x00008000 /* encrypt_atomic() */
+#define CRYPTO_FG_DECRYPT_ATOMIC 0x00010000 /* decrypt_atomic() */
+#define CRYPTO_FG_MAC_ATOMIC 0x00020000 /* mac_atomic() */
+#define CRYPTO_FG_DIGEST_ATOMIC 0x00040000 /* digest_atomic() */
+#define CRYPTO_FG_SIGN_ATOMIC 0x00080000 /* sign_atomic() */
+#define CRYPTO_FG_SIGN_RECOVER_ATOMIC 0x00100000 /* sign_recover_atomic() */
+#define CRYPTO_FG_VERIFY_ATOMIC 0x00200000 /* verify_atomic() */
+#define CRYPTO_FG_VERIFY_RECOVER_ATOMIC 0x00400000 /* verify_recover_atomic() */
+#define CRYPTO_FG_ENCRYPT_MAC_ATOMIC 0x00800000 /* encrypt_mac_atomic() */
+#define CRYPTO_FG_MAC_DECRYPT_ATOMIC 0x01000000 /* mac_decrypt_atomic() */
+#define CRYPTO_FG_RESERVED 0x80000000
+
+/*
+ * Maximum length of the pi_provider_description field of the
+ * crypto_provider_info structure.
+ */
+#define CRYPTO_PROVIDER_DESCR_MAX_LEN 64
+
+
+/* Bit mask for all the simple operations */
+#define CRYPTO_FG_SIMPLEOP_MASK (CRYPTO_FG_ENCRYPT | CRYPTO_FG_DECRYPT | \
+ CRYPTO_FG_DIGEST | CRYPTO_FG_SIGN | CRYPTO_FG_VERIFY | CRYPTO_FG_MAC | \
+ CRYPTO_FG_ENCRYPT_ATOMIC | CRYPTO_FG_DECRYPT_ATOMIC | \
+ CRYPTO_FG_MAC_ATOMIC | CRYPTO_FG_DIGEST_ATOMIC | CRYPTO_FG_SIGN_ATOMIC | \
+ CRYPTO_FG_VERIFY_ATOMIC)
+
+/* Bit mask for all the dual operations */
+#define CRYPTO_FG_MAC_CIPHER_MASK (CRYPTO_FG_ENCRYPT_MAC | \
+ CRYPTO_FG_MAC_DECRYPT | CRYPTO_FG_ENCRYPT_MAC_ATOMIC | \
+ CRYPTO_FG_MAC_DECRYPT_ATOMIC)
+
+/* Add other combos to CRYPTO_FG_DUAL_MASK */
+#define CRYPTO_FG_DUAL_MASK CRYPTO_FG_MAC_CIPHER_MASK
+
+/*
+ * The crypto_mech_info structure specifies one of the mechanisms
+ * supported by a cryptographic provider. The pi_mechanisms field of
+ * the crypto_provider_info structure contains a pointer to an array
+ * of crypto_mech_info's.
+ */
+typedef struct crypto_mech_info {
+ crypto_mech_name_t cm_mech_name;
+ crypto_mech_type_t cm_mech_number;
+ crypto_func_group_t cm_func_group_mask;
+ ssize_t cm_min_key_length;
+ ssize_t cm_max_key_length;
+ uint32_t cm_mech_flags;
+} crypto_mech_info_t;
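+
+/*
+ * Illustrative sketch (editorial addition): a provider describes each
+ * mechanism with one crypto_mech_info_t entry whose cm_func_group_mask ORs
+ * together the CRYPTO_FG_* groups it implements. The mechanism name, number
+ * and key limits below are hypothetical placeholders.
+ */
+static const crypto_mech_info_t example_mech_info = {
+ "CKM_EXAMPLE_CBC", /* cm_mech_name */
+ 0, /* cm_mech_number, chosen by the provider */
+ CRYPTO_FG_ENCRYPT | CRYPTO_FG_DECRYPT |
+ CRYPTO_FG_ENCRYPT_ATOMIC | CRYPTO_FG_DECRYPT_ATOMIC,
+ 16, 32, /* cm_min/cm_max_key_length (bytes in this example) */
+ 0 /* cm_mech_flags */
+};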
+
+/* Alias the old name to the new name for compatibility. */
+#define cm_keysize_unit cm_mech_flags
+
+/*
+ * The following is used by a provider that sets
+ * CRYPTO_HASH_NO_UPDATE. It needs to specify the maximum
+ * input data size it can digest in this field.
+ */
+#define cm_max_input_length cm_max_key_length
+
+/*
+ * crypto_kcf_provider_handle_t is a handle allocated by the kernel.
+ * It is returned after the provider registers with
+ * crypto_register_provider(), and must be specified by the provider
+ * when calling crypto_unregister_provider() and
+ * crypto_provider_notification().
+ */
+typedef uint_t crypto_kcf_provider_handle_t;
+
+/*
+ * Provider information. Passed as argument to crypto_register_provider(9F).
+ * Describes the provider and its capabilities. Multiple providers can
+ * register for the same device instance. In this case, the same
+ * pi_provider_dev must be specified with a different pi_provider_handle.
+ */
+typedef struct crypto_provider_info_v1 {
+ uint_t pi_interface_version;
+ char *pi_provider_description;
+ crypto_provider_type_t pi_provider_type;
+ crypto_provider_handle_t pi_provider_handle;
+ crypto_ops_t *pi_ops_vector;
+ uint_t pi_mech_list_count;
+ crypto_mech_info_t *pi_mechanisms;
+ uint_t pi_logical_provider_count;
+ crypto_kcf_provider_handle_t *pi_logical_providers;
+} crypto_provider_info_v1_t;
+
+typedef struct crypto_provider_info_v2 {
+ crypto_provider_info_v1_t v1_info;
+ uint_t pi_flags;
+} crypto_provider_info_v2_t;
+
+typedef struct crypto_provider_info {
+ union {
+ crypto_provider_info_v2_t piu_v2;
+ crypto_provider_info_v1_t piu_v1;
+ } piu;
+} crypto_provider_info_t;
+
+#define pi_interface_version piu.piu_v1.pi_interface_version
+#define pi_provider_description piu.piu_v1.pi_provider_description
+#define pi_provider_type piu.piu_v1.pi_provider_type
+#define pi_provider_handle piu.piu_v1.pi_provider_handle
+#define pi_ops_vector piu.piu_v1.pi_ops_vector
+#define pi_mech_list_count piu.piu_v1.pi_mech_list_count
+#define pi_mechanisms piu.piu_v1.pi_mechanisms
+#define pi_logical_provider_count piu.piu_v1.pi_logical_provider_count
+#define pi_logical_providers piu.piu_v1.pi_logical_providers
+#define pi_flags piu.piu_v2.pi_flags
+
+/* hidden providers can only be accessed via a logical provider */
+#define CRYPTO_HIDE_PROVIDER 0x00000001
+/*
+ * provider cannot do multi-part digests (updates) and has a limit
+ * on the maximum input data size that it can digest.
+ */
+#define CRYPTO_HASH_NO_UPDATE 0x00000002
+
+/* provider can handle the request without returning a CRYPTO_QUEUED */
+#define CRYPTO_SYNCHRONOUS 0x00000004
+
+#define CRYPTO_PIFLAGS_RESERVED2 0x40000000
+#define CRYPTO_PIFLAGS_RESERVED1 0x80000000
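+
+/*
+ * Illustrative sketch (editorial addition): a provider that cannot do
+ * multi-part digests would advertise CRYPTO_HASH_NO_UPDATE through the v2
+ * pi_flags field and report its input limit through the cm_max_input_length
+ * alias defined earlier. The function name and 64K limit are hypothetical.
+ */
+static inline void
+example_limit_digest_input(crypto_provider_info_t *pi, crypto_mech_info_t *mi)
+{
+ pi->pi_flags |= CRYPTO_HASH_NO_UPDATE;
+ mi->cm_max_input_length = 64 * 1024; /* arbitrary example limit */
+}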
+
+/*
+ * Provider status passed by a provider to crypto_provider_notification(9F)
+ * and returned by the provider_status(9E) entry point.
+ */
+#define CRYPTO_PROVIDER_READY 0
+#define CRYPTO_PROVIDER_BUSY 1
+#define CRYPTO_PROVIDER_FAILED 2
+
+/*
+ * Functions exported by Solaris to cryptographic providers. Providers
+ * call these functions to register and unregister, notify the kernel
+ * of state changes, and notify the kernel when an asynchronous request
+ * has completed.
+ */
+extern int crypto_register_provider(crypto_provider_info_t *,
+ crypto_kcf_provider_handle_t *);
+extern int crypto_unregister_provider(crypto_kcf_provider_handle_t);
+extern void crypto_provider_notification(crypto_kcf_provider_handle_t, uint_t);
+extern void crypto_op_notification(crypto_req_handle_t, int);
+extern int crypto_kmflag(crypto_req_handle_t);
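+
+/*
+ * Illustrative sketch (editorial addition): after a successful registration
+ * a provider keeps the handle returned by crypto_register_provider() and
+ * uses it for later notifications, for example to tell the framework that it
+ * is temporarily unable to accept requests. The helper names are hypothetical.
+ */
+static inline void
+example_provider_pause(crypto_kcf_provider_handle_t h)
+{
+ crypto_provider_notification(h, CRYPTO_PROVIDER_BUSY);
+}
+
+static inline void
+example_provider_resume(crypto_kcf_provider_handle_t h)
+{
+ crypto_provider_notification(h, CRYPTO_PROVIDER_READY);
+}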
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_CRYPTO_SPI_H */
diff --git a/sys/contrib/openzfs/module/icp/include/sys/ia32/asm_linkage.h b/sys/contrib/openzfs/module/icp/include/sys/ia32/asm_linkage.h
new file mode 100644
index 000000000000..f2dae7093b94
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/include/sys/ia32/asm_linkage.h
@@ -0,0 +1,307 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _IA32_SYS_ASM_LINKAGE_H
+#define _IA32_SYS_ASM_LINKAGE_H
+
+#include <sys/stack.h>
+#include <sys/trap.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef _ASM /* The remainder of this file is only for assembly files */
+
+/*
+ * make annoying differences in assembler syntax go away
+ */
+
+/*
+ * D16 and A16 are used to insert instructions prefixes; the
+ * macros help the assembler code be slightly more portable.
+ */
+#if !defined(__GNUC_AS__)
+/*
+ * /usr/ccs/bin/as prefixes are parsed as separate instructions
+ */
+#define D16 data16;
+#define A16 addr16;
+
+/*
+ * (There are some weird constructs in constant expressions)
+ */
+#define _CONST(const) [const]
+#define _BITNOT(const) -1!_CONST(const)
+#define _MUL(a, b) _CONST(a \* b)
+
+#else
+/*
+ * Why not use the 'data16' and 'addr16' prefixes .. well, the
+ * assembler doesn't quite believe in real mode, and thus argues with
+ * us about what we're trying to do.
+ */
+#define D16 .byte 0x66;
+#define A16 .byte 0x67;
+
+#define _CONST(const) (const)
+#define _BITNOT(const) ~_CONST(const)
+#define _MUL(a, b) _CONST(a * b)
+
+#endif
+
+/*
+ * C pointers are different sizes between i386 and amd64.
+ * These constants can be used to compute offsets into pointer arrays.
+ */
+#if defined(__amd64)
+#define CLONGSHIFT 3
+#define CLONGSIZE 8
+#define CLONGMASK 7
+#elif defined(__i386)
+#define CLONGSHIFT 2
+#define CLONGSIZE 4
+#define CLONGMASK 3
+#endif
+
+/*
+ * Since we know we're either ILP32 or LP64 ..
+ */
+#define CPTRSHIFT CLONGSHIFT
+#define CPTRSIZE CLONGSIZE
+#define CPTRMASK CLONGMASK
+
+#if CPTRSIZE != (1 << CPTRSHIFT) || CLONGSIZE != (1 << CLONGSHIFT)
+#error "inconsistent shift constants"
+#endif
+
+#if CPTRMASK != (CPTRSIZE - 1) || CLONGMASK != (CLONGSIZE - 1)
+#error "inconsistent mask constants"
+#endif
+
+#define ASM_ENTRY_ALIGN 16
+
+/*
+ * SSE register alignment and save areas
+ */
+
+#define XMM_SIZE 16
+#define XMM_ALIGN 16
+
+#if defined(__amd64)
+
+#define SAVE_XMM_PROLOG(sreg, nreg) \
+ subq $_CONST(_MUL(XMM_SIZE, nreg)), %rsp; \
+ movq %rsp, sreg
+
+#define RSTOR_XMM_EPILOG(sreg, nreg) \
+ addq $_CONST(_MUL(XMM_SIZE, nreg)), %rsp
+
+#elif defined(__i386)
+
+#define SAVE_XMM_PROLOG(sreg, nreg) \
+ subl $_CONST(_MUL(XMM_SIZE, nreg) + XMM_ALIGN), %esp; \
+ movl %esp, sreg; \
+ addl $XMM_ALIGN, sreg; \
+ andl $_BITNOT(XMM_ALIGN-1), sreg
+
+#define RSTOR_XMM_EPILOG(sreg, nreg) \
+ addl $_CONST(_MUL(XMM_SIZE, nreg) + XMM_ALIGN), %esp;
+
+#endif /* __i386 */
+
+/*
+ * Profiling causes MCOUNT and RTMCOUNT to be defined according to the type
+ * of profiling in use.
+ */
+#ifdef GPROF
+
+#define MCOUNT(x) \
+ pushl %ebp; \
+ movl %esp, %ebp; \
+ call _mcount; \
+ popl %ebp
+
+#endif /* GPROF */
+
+#ifdef PROF
+
+#define MCOUNT(x) \
+/* CSTYLED */ \
+ .lcomm .L_/**/x/**/1, 4, 4; \
+ pushl %ebp; \
+ movl %esp, %ebp; \
+/* CSTYLED */ \
+ movl $.L_/**/x/**/1, %edx; \
+ call _mcount; \
+ popl %ebp
+
+#endif /* PROF */
+
+/*
+ * if we are not profiling, MCOUNT should be defined to nothing
+ */
+#if !defined(PROF) && !defined(GPROF)
+#define MCOUNT(x)
+#endif /* !defined(PROF) && !defined(GPROF) */
+
+#define RTMCOUNT(x) MCOUNT(x)
+
+/*
+ * Macro to define weak symbol aliases. These are similar to the ANSI-C
+ * #pragma weak _name = name
+ * except that a compiler can determine the type, while the assembler must be
+ * told. Hence, the second parameter must be the type of the symbol
+ * (i.e.: function, ...)
+ */
+#define ANSI_PRAGMA_WEAK(sym, stype) \
+/* CSTYLED */ \
+ .weak _/**/sym; \
+/* CSTYLED */ \
+ .type _/**/sym, @stype; \
+/* CSTYLED */ \
+_/**/sym = sym
+
+/*
+ * Like ANSI_PRAGMA_WEAK(), but for unrelated names, as in:
+ * #pragma weak sym1 = sym2
+ */
+#define ANSI_PRAGMA_WEAK2(sym1, sym2, stype) \
+ .weak sym1; \
+ .type sym1, @stype; \
+sym1 = sym2
+
+/*
+ * ENTRY provides the standard procedure entry code and an easy way to
+ * insert the calls to mcount for profiling. ENTRY_NP is identical, but
+ * never calls mcount.
+ */
+#define ENTRY(x) \
+ .text; \
+ .align ASM_ENTRY_ALIGN; \
+ .globl x; \
+ .type x, @function; \
+x: MCOUNT(x)
+
+#define ENTRY_NP(x) \
+ .text; \
+ .align ASM_ENTRY_ALIGN; \
+ .globl x; \
+ .type x, @function; \
+x:
+
+#define RTENTRY(x) \
+ .text; \
+ .align ASM_ENTRY_ALIGN; \
+ .globl x; \
+ .type x, @function; \
+x: RTMCOUNT(x)
+
+/*
+ * ENTRY2 is identical to ENTRY but provides two labels for the entry point.
+ */
+#define ENTRY2(x, y) \
+ .text; \
+ .align ASM_ENTRY_ALIGN; \
+ .globl x, y; \
+ .type x, @function; \
+ .type y, @function; \
+/* CSTYLED */ \
+x: ; \
+y: MCOUNT(x)
+
+#define ENTRY_NP2(x, y) \
+ .text; \
+ .align ASM_ENTRY_ALIGN; \
+ .globl x, y; \
+ .type x, @function; \
+ .type y, @function; \
+/* CSTYLED */ \
+x: ; \
+y:
+
+
+/*
+ * ALTENTRY provides for additional entry points.
+ */
+#define ALTENTRY(x) \
+ .globl x; \
+ .type x, @function; \
+x:
+
+/*
+ * DGDEF and DGDEF2 provide global data declarations.
+ *
+ * DGDEF provides a word aligned word of storage.
+ *
+ * DGDEF2 allocates "sz" bytes of storage with **NO** alignment. This
+ * implies this macro is best used for byte arrays.
+ *
+ * DGDEF3 allocates "sz" bytes of storage with "algn" alignment.
+ */
+#define DGDEF2(name, sz) \
+ .data; \
+ .globl name; \
+ .type name, @object; \
+ .size name, sz; \
+name:
+
+#define DGDEF3(name, sz, algn) \
+ .data; \
+ .align algn; \
+ .globl name; \
+ .type name, @object; \
+ .size name, sz; \
+name:
+
+#define DGDEF(name) DGDEF3(name, 4, 4)
+
+/*
+ * SET_SIZE trails a function and sets its size in the ELF symbol table.
+ */
+#define SET_SIZE(x) \
+ .size x, [.-x]
+
+/*
+ * NWORD provides native word value.
+ */
+#if defined(__amd64)
+
+/*CSTYLED*/
+#define NWORD quad
+
+#elif defined(__i386)
+
+#define NWORD long
+
+#endif /* __i386 */
+
+#endif /* _ASM */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _IA32_SYS_ASM_LINKAGE_H */
diff --git a/sys/contrib/openzfs/module/icp/include/sys/ia32/stack.h b/sys/contrib/openzfs/module/icp/include/sys/ia32/stack.h
new file mode 100644
index 000000000000..9e7c089e1182
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/include/sys/ia32/stack.h
@@ -0,0 +1,160 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _IA32_SYS_STACK_H
+#define _IA32_SYS_STACK_H
+
+#if !defined(_ASM)
+
+#include <sys/types.h>
+
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * In the x86 world, a stack frame looks like this:
+ *
+ * |--------------------------|
+ * 4n+8(%ebp) ->| argument word n |
+ * | ... | (Previous frame)
+ * 8(%ebp) ->| argument word 0 |
+ * |--------------------------|--------------------
+ * 4(%ebp) ->| return address |
+ * |--------------------------|
+ * 0(%ebp) ->| previous %ebp (optional) |
+ * |--------------------------|
+ * -4(%ebp) ->| unspecified | (Current frame)
+ * | ... |
+ * 0(%esp) ->| variable size |
+ * |--------------------------|
+ */
+
+/*
+ * Stack alignment macros.
+ */
+
+#define STACK_ALIGN32 4
+#define STACK_ENTRY_ALIGN32 4
+#define STACK_BIAS32 0
+#define SA32(x) (((x)+(STACK_ALIGN32-1)) & ~(STACK_ALIGN32-1))
+#define STACK_RESERVE32 0
+#define MINFRAME32 0
+
+#if defined(__amd64)
+
+/*
+ * In the amd64 world, a stack frame looks like this:
+ *
+ * |--------------------------|
+ * 8n+16(%rbp)->| argument word n |
+ * | ... | (Previous frame)
+ * 16(%rbp) ->| argument word 0 |
+ * |--------------------------|--------------------
+ * 8(%rbp) ->| return address |
+ * |--------------------------|
+ * 0(%rbp) ->| previous %rbp |
+ * |--------------------------|
+ * -8(%rbp) ->| unspecified | (Current frame)
+ * | ... |
+ * 0(%rsp) ->| variable size |
+ * |--------------------------|
+ * -128(%rsp) ->| reserved for function |
+ * |--------------------------|
+ *
+ * The end of the input argument area must be aligned on a 16-byte
+ * boundary; i.e. (%rsp - 8) % 16 == 0 at function entry.
+ *
+ * The 128-byte area beyond %rsp is considered to be reserved for
+ * functions and is NOT modified by signal handlers. It can be used
+ * to store temporary data that is not needed across function calls.
+ */
+
+/*
+ * Stack alignment macros.
+ */
+
+#define STACK_ALIGN64 16
+#define STACK_ENTRY_ALIGN64 8
+#define STACK_BIAS64 0
+#define SA64(x) (((x)+(STACK_ALIGN64-1)) & ~(STACK_ALIGN64-1))
+#define STACK_RESERVE64 128
+#define MINFRAME64 0
+
+#define STACK_ALIGN STACK_ALIGN64
+#define STACK_ENTRY_ALIGN STACK_ENTRY_ALIGN64
+#define STACK_BIAS STACK_BIAS64
+#define SA(x) SA64(x)
+#define STACK_RESERVE STACK_RESERVE64
+#define MINFRAME MINFRAME64
+
+#elif defined(__i386)
+
+#define STACK_ALIGN STACK_ALIGN32
+#define STACK_ENTRY_ALIGN STACK_ENTRY_ALIGN32
+#define STACK_BIAS STACK_BIAS32
+#define SA(x) SA32(x)
+#define STACK_RESERVE STACK_RESERVE32
+#define MINFRAME MINFRAME32
+
+#endif /* __i386 */
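+
+#if !defined(_ASM)
+/*
+ * Illustrative sketch (editorial addition): SA() rounds a byte count up to
+ * the platform stack alignment, e.g. when sizing a local save area. The
+ * helper name is hypothetical.
+ */
+static inline size_t
+example_stack_round(size_t nbytes)
+{
+ return (SA(nbytes)); /* multiple of 16 on amd64, of 4 on i386 */
+}
+#endif /* !_ASM */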
+
+#if defined(_KERNEL) && !defined(_ASM)
+
+#if defined(ZFS_DEBUG)
+#if STACK_ALIGN == 4
+#define ASSERT_STACK_ALIGNED() \
+ { \
+ uint32_t __tmp; \
+ ASSERT((((uintptr_t)&__tmp) & (STACK_ALIGN - 1)) == 0); \
+ }
+#elif (STACK_ALIGN == 16) && (_LONG_DOUBLE_ALIGNMENT == 16)
+#define ASSERT_STACK_ALIGNED() \
+ { \
+ long double __tmp; \
+ ASSERT((((uintptr_t)&__tmp) & (STACK_ALIGN - 1)) == 0); \
+ }
+#endif
+#else /* DEBUG */
+#define ASSERT_STACK_ALIGNED()
+#endif /* DEBUG */
+
+struct regs;
+
+void traceregs(struct regs *);
+void traceback(caddr_t);
+
+#endif /* defined(_KERNEL) && !defined(_ASM) */
+
+#define STACK_GROWTH_DOWN /* stacks grow from high to low addresses */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _IA32_SYS_STACK_H */
diff --git a/sys/contrib/openzfs/module/icp/include/sys/ia32/trap.h b/sys/contrib/openzfs/module/icp/include/sys/ia32/trap.h
new file mode 100644
index 000000000000..55b94969b80b
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/include/sys/ia32/trap.h
@@ -0,0 +1,107 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/* Copyright (c) 1990, 1991 UNIX System Laboratories, Inc. */
+/* Copyright (c) 1984, 1986, 1987, 1988, 1989, 1990 AT&T */
+/* All Rights Reserved */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _IA32_SYS_TRAP_H
+#define _IA32_SYS_TRAP_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Trap type values
+ */
+
+#define T_ZERODIV 0x0 /* #de divide by 0 error */
+#define T_SGLSTP 0x1 /* #db single step */
+#define T_NMIFLT 0x2 /* NMI */
+#define T_BPTFLT 0x3 /* #bp breakpoint fault, INT3 insn */
+#define T_OVFLW 0x4 /* #of INTO overflow fault */
+#define T_BOUNDFLT 0x5 /* #br BOUND insn fault */
+#define T_ILLINST 0x6 /* #ud invalid opcode fault */
+#define T_NOEXTFLT 0x7 /* #nm device not available: x87 */
+#define T_DBLFLT 0x8 /* #df double fault */
+#define T_EXTOVRFLT 0x9 /* [not generated: 386 only] */
+#define T_TSSFLT 0xa /* #ts invalid TSS fault */
+#define T_SEGFLT 0xb /* #np segment not present fault */
+#define T_STKFLT 0xc /* #ss stack fault */
+#define T_GPFLT 0xd /* #gp general protection fault */
+#define T_PGFLT 0xe /* #pf page fault */
+#define T_EXTERRFLT 0x10 /* #mf x87 FPU error fault */
+#define T_ALIGNMENT 0x11 /* #ac alignment check error */
+#define T_MCE 0x12 /* #mc machine check exception */
+#define T_SIMDFPE 0x13 /* #xm SSE/SSE2 exception */
+#define T_DBGENTR 0x14 /* debugger entry */
+#define T_ENDPERR 0x21 /* emulated extension error flt */
+#define T_ENOEXTFLT 0x20 /* emulated ext not present */
+#define T_FASTTRAP 0xd2 /* fast system call */
+#define T_SYSCALLINT 0x91 /* general system call */
+#define T_DTRACE_RET 0x7f /* DTrace pid return */
+#define T_INT80 0x80 /* int80 handler for linux emulation */
+#define T_SOFTINT 0x50fd /* pseudo softint trap type */
+
+/*
+ * Pseudo traps.
+ */
+#define T_INTERRUPT 0x100
+#define T_FAULT 0x200
+#define T_AST 0x400
+#define T_SYSCALL 0x180
+
+
+/*
+ * Values of the error code pushed on the stack by a page fault
+ */
+
+#define PF_ERR_MASK 0x01 /* Mask for error bit */
+#define PF_ERR_PAGE 0x00 /* page not present */
+#define PF_ERR_PROT 0x01 /* protection error */
+#define PF_ERR_WRITE 0x02 /* fault caused by write (else read) */
+#define PF_ERR_USER 0x04 /* processor was in user mode */
+ /* (else supervisor) */
+#define PF_ERR_EXEC 0x10 /* attempt to execute a No eXec page (AMD) */
+
+/*
+ * Definitions for fast system call subfunctions
+ */
+#define T_FNULL 0 /* Null trap for testing */
+#define T_FGETFP 1 /* Get emulated FP context */
+#define T_FSETFP 2 /* Set emulated FP context */
+#define T_GETHRTIME 3 /* Get high resolution time */
+#define T_GETHRVTIME 4 /* Get high resolution virtual time */
+#define T_GETHRESTIME 5 /* Get high resolution time */
+#define T_GETLGRP 6 /* Get home lgrpid */
+
+#define T_LASTFAST 6 /* Last valid subfunction */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _IA32_SYS_TRAP_H */
diff --git a/sys/contrib/openzfs/module/icp/include/sys/modctl.h b/sys/contrib/openzfs/module/icp/include/sys/modctl.h
new file mode 100644
index 000000000000..6c26ad618c93
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/include/sys/modctl.h
@@ -0,0 +1,477 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_MODCTL_H
+#define _SYS_MODCTL_H
+
+/*
+ * loadable module support.
+ */
+
+#include <sys/zfs_context.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct modlmisc;
+struct modlinkage;
+
+/*
+ * The following structure defines the operations used by modctl
+ * to load and unload modules. Each supported loadable module type
+ * requires a set of mod_ops.
+ */
+struct mod_ops {
+ int (*modm_install)(struct modlmisc *, struct modlinkage *);
+ int (*modm_remove)(struct modlmisc *, struct modlinkage *);
+ int (*modm_info)(void *, struct modlinkage *, int *);
+};
+
+/*
+ * The defined set of mod_ops structures for each loadable module type.
+ * Defined in modctl.c.
+ */
+extern struct mod_ops mod_brandops;
+#if defined(__i386) || defined(__amd64)
+extern struct mod_ops mod_cpuops;
+#endif
+extern struct mod_ops mod_cryptoops;
+extern struct mod_ops mod_driverops;
+extern struct mod_ops mod_execops;
+extern struct mod_ops mod_fsops;
+extern struct mod_ops mod_miscops;
+extern struct mod_ops mod_schedops;
+extern struct mod_ops mod_strmodops;
+extern struct mod_ops mod_syscallops;
+extern struct mod_ops mod_sockmodops;
+#ifdef _SYSCALL32_IMPL
+extern struct mod_ops mod_syscallops32;
+#endif
+extern struct mod_ops mod_dacfops;
+extern struct mod_ops mod_ippops;
+extern struct mod_ops mod_pcbeops;
+extern struct mod_ops mod_devfsops;
+extern struct mod_ops mod_kiconvops;
+
+/*
+ * Definitions for the module specific linkage structures.
+ * The first two fields are the same in all of the structures.
+ * The linkinfo is for informational purposes only and is returned by
+ * modctl with the MODINFO cmd.
+ */
+
+/* For cryptographic providers */
+struct modlcrypto {
+ struct mod_ops *crypto_modops;
+ char *crypto_linkinfo;
+};
+
+/* For misc */
+struct modlmisc {
+ struct mod_ops *misc_modops;
+ char *misc_linkinfo;
+};
+
+/*
+ * Revision number of loadable modules support. This is the value
+ * that must be used in the modlinkage structure.
+ */
+#define MODREV_1 1
+
+/*
+ * The modlinkage structure is the structure that the module writer
+ * provides to the routines to install, remove, and stat a module.
+ * The ml_linkage element is an array of pointers to linkage structures.
+ * For most modules there is only one linkage structure. We allocate
+ * enough space for 3 linkage structures which happens to be the most
+ * we have in any sun supplied module. For those modules with more
+ * than 3 linkage structures (which is very unlikely), a modlinkage
+ * structure must be kmem_alloc'd in the module wrapper to be big enough
+ * for all of the linkage structures.
+ */
+struct modlinkage {
+ int ml_rev; /* rev of loadable modules system */
+#ifdef _LP64
+ void *ml_linkage[7]; /* more space in 64-bit OS */
+#else
+ void *ml_linkage[4]; /* NULL terminated list of */
+ /* linkage structures */
+#endif
+};
+
+/*
+ * Commands supported by the modctl system call.
+ */
+#define MODLOAD 0
+#define MODUNLOAD 1
+#define MODINFO 2
+#define MODRESERVED 3
+#define MODSETMINIROOT 4
+#define MODADDMAJBIND 5
+#define MODGETPATH 6
+#define MODREADSYSBIND 7
+#define MODGETMAJBIND 8
+#define MODGETNAME 9
+#define MODSIZEOF_DEVID 10
+#define MODGETDEVID 11
+#define MODSIZEOF_MINORNAME 12
+#define MODGETMINORNAME 13
+#define MODGETPATHLEN 14
+#define MODEVENTS 15
+#define MODGETFBNAME 16
+#define MODREREADDACF 17
+#define MODLOADDRVCONF 18
+#define MODUNLOADDRVCONF 19
+#define MODREMMAJBIND 20
+#define MODDEVT2INSTANCE 21
+#define MODGETDEVFSPATH_LEN 22
+#define MODGETDEVFSPATH 23
+#define MODDEVID2PATHS 24
+#define MODSETDEVPOLICY 26
+#define MODGETDEVPOLICY 27
+#define MODALLOCPRIV 28
+#define MODGETDEVPOLICYBYNAME 29
+#define MODLOADMINORPERM 31
+#define MODADDMINORPERM 32
+#define MODREMMINORPERM 33
+#define MODREMDRVCLEANUP 34
+#define MODDEVEXISTS 35
+#define MODDEVREADDIR 36
+#define MODDEVNAME 37
+#define MODGETDEVFSPATH_MI_LEN 38
+#define MODGETDEVFSPATH_MI 39
+#define MODRETIRE 40
+#define MODUNRETIRE 41
+#define MODISRETIRED 42
+#define MODDEVEMPTYDIR 43
+#define MODREMDRVALIAS 44
+
+/*
+ * sub cmds for MODEVENTS
+ */
+#define MODEVENTS_FLUSH 0
+#define MODEVENTS_FLUSH_DUMP 1
+#define MODEVENTS_SET_DOOR_UPCALL_FILENAME 2
+#define MODEVENTS_GETDATA 3
+#define MODEVENTS_FREEDATA 4
+#define MODEVENTS_POST_EVENT 5
+#define MODEVENTS_REGISTER_EVENT 6
+
+/*
+ * devname subcmds for MODDEVNAME
+ */
+#define MODDEVNAME_LOOKUPDOOR 0
+#define MODDEVNAME_DEVFSADMNODE 1
+#define MODDEVNAME_NSMAPS 2
+#define MODDEVNAME_PROFILE 3
+#define MODDEVNAME_RECONFIG 4
+#define MODDEVNAME_SYSAVAIL 5
+
+
+/*
+ * Data structure passed to modconfig command in kernel to build devfs tree
+ */
+
+struct aliases {
+ struct aliases *a_next;
+ char *a_name;
+ int a_len;
+};
+
+#define MAXMODCONFNAME 256
+
+struct modconfig {
+ char drvname[MAXMODCONFNAME];
+ char drvclass[MAXMODCONFNAME];
+ int major;
+ int flags;
+ int num_aliases;
+ struct aliases *ap;
+};
+
+#if defined(_SYSCALL32)
+
+struct aliases32 {
+ caddr32_t a_next;
+ caddr32_t a_name;
+ int32_t a_len;
+};
+
+struct modconfig32 {
+ char drvname[MAXMODCONFNAME];
+ char drvclass[MAXMODCONFNAME];
+ int32_t major;
+ int32_t flags;
+ int32_t num_aliases;
+ caddr32_t ap;
+};
+
+#endif /* _SYSCALL32 */
+
+/* flags for modconfig */
+#define MOD_UNBIND_OVERRIDE 0x01 /* fail unbind if in use */
+
+/*
+ * Max module path length
+ */
+#define MOD_MAXPATH 256
+
+/*
+ * Default search path for modules, in ADDITION to the directory
+ * holding the kernel components we booted from.
+ *
+ * Most often, this will be "/platform/{platform}/kernel /kernel /usr/kernel",
+ * but we don't wire it down here.
+ */
+#define MOD_DEFPATH "/kernel /usr/kernel"
+
+/*
+ * Default file name extension for autoloading modules.
+ */
+#define MOD_DEFEXT ""
+
+/*
+ * Parameters for modinfo
+ */
+#define MODMAXNAMELEN 32 /* max module name length */
+#define MODMAXLINKINFOLEN 32 /* max link info length */
+
+/*
+ * Module specific information.
+ */
+struct modspecific_info {
+ char msi_linkinfo[MODMAXLINKINFOLEN]; /* name in linkage struct */
+ int msi_p0; /* module specific information */
+};
+
+/*
+ * Structure returned by modctl with MODINFO command.
+ */
+#define MODMAXLINK 10 /* max linkages modinfo can handle */
+
+struct modinfo {
+ int mi_info; /* Flags for info wanted */
+ int mi_state; /* Flags for module state */
+ int mi_id; /* id of this loaded module */
+ int mi_nextid; /* id of next module or -1 */
+ caddr_t mi_base; /* virtual addr of text */
+ size_t mi_size; /* size of module in bytes */
+ int mi_rev; /* loadable modules rev */
+ int mi_loadcnt; /* # of times loaded */
+ char mi_name[MODMAXNAMELEN]; /* name of module */
+ struct modspecific_info mi_msinfo[MODMAXLINK];
+ /* mod specific info */
+};
+
+
+#if defined(_SYSCALL32)
+
+#define MODMAXNAMELEN32 32 /* max module name length */
+#define MODMAXLINKINFOLEN32 32 /* max link info length */
+#define MODMAXLINK32 10 /* max linkages modinfo can handle */
+
+struct modspecific_info32 {
+ char msi_linkinfo[MODMAXLINKINFOLEN32]; /* name in linkage struct */
+ int32_t msi_p0; /* module specific information */
+};
+
+struct modinfo32 {
+ int32_t mi_info; /* Flags for info wanted */
+ int32_t mi_state; /* Flags for module state */
+ int32_t mi_id; /* id of this loaded module */
+ int32_t mi_nextid; /* id of next module or -1 */
+ caddr32_t mi_base; /* virtual addr of text */
+ uint32_t mi_size; /* size of module in bytes */
+ int32_t mi_rev; /* loadable modules rev */
+ int32_t mi_loadcnt; /* # of times loaded */
+ char mi_name[MODMAXNAMELEN32]; /* name of module */
+ struct modspecific_info32 mi_msinfo[MODMAXLINK32];
+ /* mod specific info */
+};
+
+#endif /* _SYSCALL32 */
+
+/* Values for mi_info flags */
+#define MI_INFO_ONE 1
+#define MI_INFO_ALL 2
+#define MI_INFO_CNT 4
+#define MI_INFO_LINKAGE 8 /* used internally to extract modlinkage */
+/*
+ * MI_INFO_NOBASE indicates caller does not need mi_base. Failure to use this
+ * flag may lead 32-bit apps to receive an EOVERFLOW error from modctl(MODINFO)
+ * when used with a 64-bit kernel.
+ */
+#define MI_INFO_NOBASE 16
+
+/* Values for mi_state */
+#define MI_LOADED 1
+#define MI_INSTALLED 2
+
+/*
+ * Macros to vector to the appropriate module specific routine.
+ */
+#define MODL_INSTALL(MODL, MODLP) \
+ (*(MODL)->misc_modops->modm_install)(MODL, MODLP)
+#define MODL_REMOVE(MODL, MODLP) \
+ (*(MODL)->misc_modops->modm_remove)(MODL, MODLP)
+#define MODL_INFO(MODL, MODLP, P0) \
+ (*(MODL)->misc_modops->modm_info)(MODL, MODLP, P0)
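+
+/*
+ * Illustrative sketch (editorial addition): the framework walks ml_linkage[]
+ * and dispatches through the macros above; installing every linkage
+ * structure of a module looks roughly like the loop below (locking and
+ * rollback omitted, function name hypothetical).
+ */
+static inline int
+example_install_linkages(struct modlinkage *mlp)
+{
+ int i, err = 0;
+
+ for (i = 0; err == 0 && mlp->ml_linkage[i] != NULL; i++)
+ err = MODL_INSTALL((struct modlmisc *)mlp->ml_linkage[i], mlp);
+ return (err);
+}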
+
+/*
+ * Definitions for stubs
+ */
+struct mod_stub_info {
+ uintptr_t mods_func_adr;
+ struct mod_modinfo *mods_modinfo;
+ uintptr_t mods_stub_adr;
+ int (*mods_errfcn)(void);
+ int mods_flag; /* flags defined below */
+};
+
+/*
+ * Definitions for mods_flag.
+ */
+#define MODS_WEAK 0x01 /* weak stub (not loaded if called) */
+#define MODS_NOUNLOAD 0x02 /* module not unloadable (no _fini()) */
+#define MODS_INSTALLED 0x10 /* module installed */
+
+struct mod_modinfo {
+ char *modm_module_name;
+ struct modctl *mp;
+ struct mod_stub_info modm_stubs[1];
+};
+
+struct modctl_list {
+ struct modctl_list *modl_next;
+ struct modctl *modl_modp;
+};
+
+/*
+ * Structure to manage a loadable module.
+ * Note: the module (mod_mp) structure's "text" and "text_size" information
+ * are replicated in the modctl structure so that mod_containing_pc()
+ * doesn't have to grab any locks (modctls are persistent; modules are not.)
+ */
+typedef struct modctl {
+ struct modctl *mod_next; /* &modules based list */
+ struct modctl *mod_prev;
+ int mod_id;
+ void *mod_mp;
+ kthread_t *mod_inprogress_thread;
+ struct mod_modinfo *mod_modinfo;
+ struct modlinkage *mod_linkage;
+ char *mod_filename;
+ char *mod_modname;
+
+ char mod_busy; /* inprogress_thread has locked */
+ char mod_want; /* someone waiting for unlock */
+ char mod_prim; /* primary module */
+
+ int mod_ref; /* ref count - from dependent or stub */
+
+ char mod_loaded; /* module in memory */
+ char mod_installed; /* post _init pre _fini */
+ char mod_loadflags;
+ char mod_delay_unload; /* deferred unload */
+
+ struct modctl_list *mod_requisites; /* mods this one depends on. */
+ void *____unused; /* NOTE: reuse (same size) is OK, */
+ /* deletion causes mdb.vs.core issues */
+ int mod_loadcnt; /* number of times mod was loaded */
+ int mod_nenabled; /* # of enabled DTrace probes in mod */
+ char *mod_text;
+ size_t mod_text_size;
+
+ int mod_gencount; /* # times loaded/unloaded */
+ struct modctl *mod_requisite_loading; /* mod circular dependency */
+} modctl_t;
+
+/*
+ * mod_loadflags
+ */
+
+#define MOD_NOAUTOUNLOAD 0x1 /* Auto mod-unloader skips this mod */
+#define MOD_NONOTIFY 0x2 /* No krtld notifications on (un)load */
+#define MOD_NOUNLOAD 0x4 /* Assume EBUSY for all _fini's */
+
+#define MOD_BIND_HASHSIZE 64
+#define MOD_BIND_HASHMASK (MOD_BIND_HASHSIZE-1)
+
+typedef int modid_t;
+
+/*
+ * global function and data declarations
+ */
+extern kmutex_t mod_lock;
+
+extern char *systemfile;
+extern char **syscallnames;
+extern int moddebug;
+
+/*
+ * This is the head of a doubly linked list; only the next and prev
+ * pointers are used.
+ */
+extern modctl_t modules;
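+
+/*
+ * Illustrative sketch (editorial addition): the list is circular, so a walk
+ * starts at &modules and follows mod_next until it wraps around; the caller
+ * is assumed to hold mod_lock. The function name is hypothetical.
+ */
+static inline void
+example_walk_modules(void (*cb)(modctl_t *))
+{
+ modctl_t *mp = &modules;
+
+ do {
+ cb(mp);
+ mp = mp->mod_next;
+ } while (mp != &modules);
+}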
+
+/*
+ * Only the following are part of the DDI/DKI
+ */
+extern int mod_install(struct modlinkage *);
+extern int mod_remove(struct modlinkage *);
+extern int mod_info(struct modlinkage *, struct modinfo *);
+
+/*
+ * bit definitions for moddebug.
+ */
+#define MODDEBUG_LOADMSG 0x80000000 /* print "[un]loading..." msg */
+#define MODDEBUG_ERRMSG 0x40000000 /* print detailed error msgs */
+#define MODDEBUG_LOADMSG2 0x20000000 /* print 2nd level msgs */
+#define MODDEBUG_RETIRE 0x10000000 /* print retire msgs */
+#define MODDEBUG_BINDING 0x00040000 /* driver/alias binding */
+#define MODDEBUG_FINI_EBUSY 0x00020000 /* pretend fini returns EBUSY */
+#define MODDEBUG_NOAUL_IPP 0x00010000 /* no Autounloading ipp mods */
+#define MODDEBUG_NOAUL_DACF 0x00008000 /* no Autounloading dacf mods */
+#define MODDEBUG_KEEPTEXT 0x00004000 /* keep text after unloading */
+#define MODDEBUG_NOAUL_DRV 0x00001000 /* no Autounloading Drivers */
+#define MODDEBUG_NOAUL_EXEC 0x00000800 /* no Autounloading Execs */
+#define MODDEBUG_NOAUL_FS 0x00000400 /* no Autounloading File sys */
+#define MODDEBUG_NOAUL_MISC 0x00000200 /* no Autounloading misc */
+#define MODDEBUG_NOAUL_SCHED 0x00000100 /* no Autounloading scheds */
+#define MODDEBUG_NOAUL_STR 0x00000080 /* no Autounloading streams */
+#define MODDEBUG_NOAUL_SYS 0x00000040 /* no Autounloading syscalls */
+#define MODDEBUG_NOCTF 0x00000020 /* do not load CTF debug data */
+#define MODDEBUG_NOAUTOUNLOAD 0x00000010 /* no autounloading at all */
+#define MODDEBUG_DDI_MOD 0x00000008 /* ddi_mod{open,sym,close} */
+#define MODDEBUG_MP_MATCH 0x00000004 /* dev_minorperm */
+#define MODDEBUG_MINORPERM 0x00000002 /* minor perm modctls */
+#define MODDEBUG_USERDEBUG 0x00000001 /* bpt after init_module() */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_MODCTL_H */
diff --git a/sys/contrib/openzfs/module/icp/include/sys/modhash.h b/sys/contrib/openzfs/module/icp/include/sys/modhash.h
new file mode 100644
index 000000000000..06b52ff02604
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/include/sys/modhash.h
@@ -0,0 +1,147 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_MODHASH_H
+#define _SYS_MODHASH_H
+
+/*
+ * Generic hash implementation for the kernel.
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <sys/zfs_context.h>
+
+/*
+ * Opaque data types for storing keys and values
+ */
+typedef void *mod_hash_val_t;
+typedef void *mod_hash_key_t;
+
+/*
+ * Opaque data type for reservation
+ */
+typedef void *mod_hash_hndl_t;
+
+/*
+ * Opaque type for hash itself.
+ */
+struct mod_hash;
+typedef struct mod_hash mod_hash_t;
+
+/*
+ * String hash table
+ */
+mod_hash_t *mod_hash_create_strhash_nodtr(char *, size_t,
+ void (*)(mod_hash_val_t));
+mod_hash_t *mod_hash_create_strhash(char *, size_t, void (*)(mod_hash_val_t));
+void mod_hash_destroy_strhash(mod_hash_t *);
+int mod_hash_strkey_cmp(mod_hash_key_t, mod_hash_key_t);
+void mod_hash_strkey_dtor(mod_hash_key_t);
+void mod_hash_strval_dtor(mod_hash_val_t);
+uint_t mod_hash_bystr(void *, mod_hash_key_t);
+
+/*
+ * Pointer hash table
+ */
+mod_hash_t *mod_hash_create_ptrhash(char *, size_t, void (*)(mod_hash_val_t),
+ size_t);
+void mod_hash_destroy_ptrhash(mod_hash_t *);
+int mod_hash_ptrkey_cmp(mod_hash_key_t, mod_hash_key_t);
+uint_t mod_hash_byptr(void *, mod_hash_key_t);
+
+/*
+ * ID hash table
+ */
+mod_hash_t *mod_hash_create_idhash(char *, size_t, void (*)(mod_hash_val_t));
+void mod_hash_destroy_idhash(mod_hash_t *);
+int mod_hash_idkey_cmp(mod_hash_key_t, mod_hash_key_t);
+uint_t mod_hash_byid(void *, mod_hash_key_t);
+uint_t mod_hash_iddata_gen(size_t);
+
+/*
+ * Hash management functions
+ */
+mod_hash_t *mod_hash_create_extended(char *, size_t, void (*)(mod_hash_key_t),
+ void (*)(mod_hash_val_t), uint_t (*)(void *, mod_hash_key_t), void *,
+ int (*)(mod_hash_key_t, mod_hash_key_t), int);
+
+void mod_hash_destroy_hash(mod_hash_t *);
+void mod_hash_clear(mod_hash_t *);
+
+/*
+ * Null key and value destructors
+ */
+void mod_hash_null_keydtor(mod_hash_key_t);
+void mod_hash_null_valdtor(mod_hash_val_t);
+
+/*
+ * Basic hash operations
+ */
+
+/*
+ * Error codes for insert, remove, find, destroy.
+ */
+#define MH_ERR_NOMEM -1
+#define MH_ERR_DUPLICATE -2
+#define MH_ERR_NOTFOUND -3
+
+/*
+ * Return codes for hash walkers
+ */
+#define MH_WALK_CONTINUE 0
+#define MH_WALK_TERMINATE 1
+
+/*
+ * Basic hash operations
+ */
+int mod_hash_insert(mod_hash_t *, mod_hash_key_t, mod_hash_val_t);
+int mod_hash_replace(mod_hash_t *, mod_hash_key_t, mod_hash_val_t);
+int mod_hash_remove(mod_hash_t *, mod_hash_key_t, mod_hash_val_t *);
+int mod_hash_destroy(mod_hash_t *, mod_hash_key_t);
+int mod_hash_find(mod_hash_t *, mod_hash_key_t, mod_hash_val_t *);
+int mod_hash_find_cb(mod_hash_t *, mod_hash_key_t, mod_hash_val_t *,
+ void (*)(mod_hash_key_t, mod_hash_val_t));
+int mod_hash_find_cb_rval(mod_hash_t *, mod_hash_key_t, mod_hash_val_t *,
+ int (*)(mod_hash_key_t, mod_hash_val_t), int *);
+void mod_hash_walk(mod_hash_t *,
+ uint_t (*)(mod_hash_key_t, mod_hash_val_t *, void *), void *);
+
+/*
+ * Reserving hash operations
+ */
+int mod_hash_reserve(mod_hash_t *, mod_hash_hndl_t *);
+int mod_hash_reserve_nosleep(mod_hash_t *, mod_hash_hndl_t *);
+void mod_hash_cancel(mod_hash_t *, mod_hash_hndl_t *);
+int mod_hash_insert_reserve(mod_hash_t *, mod_hash_key_t, mod_hash_val_t,
+ mod_hash_hndl_t);
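+
+/*
+ * Illustrative sketch (editorial addition): typical use of the string-hash
+ * flavour, with the no-key-destructor variant so statically allocated keys
+ * are safe. The table name, size, key and value are arbitrary examples and
+ * error handling is omitted.
+ */
+static inline int
+example_modhash_use(void)
+{
+ static char name[] = "example hash";
+ static char key[] = "example-key";
+ static char val[] = "example-value";
+ mod_hash_t *h;
+ mod_hash_val_t found;
+ int err;
+
+ h = mod_hash_create_strhash_nodtr(name, 64, mod_hash_null_valdtor);
+ err = mod_hash_insert(h, (mod_hash_key_t)key, (mod_hash_val_t)val);
+ if (err == 0)
+ err = mod_hash_find(h, (mod_hash_key_t)key, &found);
+ mod_hash_destroy_strhash(h);
+ return (err);
+}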
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_MODHASH_H */
diff --git a/sys/contrib/openzfs/module/icp/include/sys/modhash_impl.h b/sys/contrib/openzfs/module/icp/include/sys/modhash_impl.h
new file mode 100644
index 000000000000..3130773aa196
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/include/sys/modhash_impl.h
@@ -0,0 +1,108 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_MODHASH_IMPL_H
+#define _SYS_MODHASH_IMPL_H
+
+/*
+ * Internal details for the kernel's generic hash implementation.
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <sys/zfs_context.h>
+#include <sys/modhash.h>
+
+struct mod_hash_entry {
+ mod_hash_key_t mhe_key; /* stored hash key */
+ mod_hash_val_t mhe_val; /* stored hash value */
+ struct mod_hash_entry *mhe_next; /* next item in chain */
+};
+
+struct mod_hash_stat {
+ ulong_t mhs_hit; /* tried a 'find' and it succeeded */
+ ulong_t mhs_miss; /* tried a 'find' but it failed */
+ ulong_t mhs_coll; /* times an insert failed because of a duplicate key */
+ ulong_t mhs_nelems; /* total number of stored key/value pairs */
+ ulong_t mhs_nomem; /* number of times kmem_alloc failed */
+};
+
+struct mod_hash {
+ krwlock_t mh_contents; /* lock protecting contents */
+ char *mh_name; /* hash name */
+ int mh_sleep; /* kmem_alloc flag */
+ size_t mh_nchains; /* # of elements in mh_entries */
+
+ /* key and val destructor */
+ void (*mh_kdtor)(mod_hash_key_t);
+ void (*mh_vdtor)(mod_hash_val_t);
+
+ /* key comparator */
+ int (*mh_keycmp)(mod_hash_key_t, mod_hash_key_t);
+
+ /* hash algorithm, and algorithm-private data */
+ uint_t (*mh_hashalg)(void *, mod_hash_key_t);
+ void *mh_hashalg_data;
+
+ struct mod_hash *mh_next; /* next hash in list */
+
+ struct mod_hash_stat mh_stat;
+
+ struct mod_hash_entry *mh_entries[1];
+};
+
+/*
+ * MH_SIZE()
+ * Compute the size of a mod_hash_t, in bytes, given the number of
+ * elements it contains.
+ */
+#define MH_SIZE(n) \
+ (sizeof (mod_hash_t) + ((n) - 1) * (sizeof (struct mod_hash_entry *)))
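+
+/*
+ * Illustrative sketch (editorial addition): MH_SIZE() gives the allocation
+ * size for a table with n chains; e.g. the implementation would allocate a
+ * 64-chain table with roughly kmem_zalloc(MH_SIZE(64), KM_SLEEP). The helper
+ * below is hypothetical.
+ */
+static inline size_t
+example_modhash_alloc_size(size_t nchains)
+{
+ /* header (which embeds one chain pointer) plus nchains - 1 more */
+ return (MH_SIZE(nchains));
+}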
+
+/*
+ * Module initialization and teardown; each is called once.
+ */
+void mod_hash_fini(void);
+void mod_hash_init(void);
+
+/*
+ * Internal routines. Use directly with care.
+ */
+uint_t i_mod_hash(mod_hash_t *, mod_hash_key_t);
+int i_mod_hash_insert_nosync(mod_hash_t *, mod_hash_key_t, mod_hash_val_t,
+ mod_hash_hndl_t);
+int i_mod_hash_remove_nosync(mod_hash_t *, mod_hash_key_t, mod_hash_val_t *);
+int i_mod_hash_find_nosync(mod_hash_t *, mod_hash_key_t, mod_hash_val_t *);
+void i_mod_hash_walk_nosync(mod_hash_t *, uint_t (*)(mod_hash_key_t,
+ mod_hash_val_t *, void *), void *);
+void i_mod_hash_clear_nosync(mod_hash_t *hash);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_MODHASH_IMPL_H */
diff --git a/sys/contrib/openzfs/module/icp/include/sys/stack.h b/sys/contrib/openzfs/module/icp/include/sys/stack.h
new file mode 100644
index 000000000000..64fecf409b5c
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/include/sys/stack.h
@@ -0,0 +1,36 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_STACK_H
+#define _SYS_STACK_H
+
+#if defined(__i386) || defined(__amd64)
+
+#include <sys/ia32/stack.h> /* XX64 x86/sys/stack.h */
+
+#endif
+
+#endif /* _SYS_STACK_H */
diff --git a/sys/contrib/openzfs/module/icp/include/sys/trap.h b/sys/contrib/openzfs/module/icp/include/sys/trap.h
new file mode 100644
index 000000000000..7f9fd375805f
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/include/sys/trap.h
@@ -0,0 +1,36 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_TRAP_H
+#define _SYS_TRAP_H
+
+#if defined(__i386) || defined(__amd64)
+
+#include <sys/ia32/trap.h> /* XX64 x86/sys/trap.h */
+
+#endif
+
+#endif /* _SYS_TRAP_H */
diff --git a/sys/contrib/openzfs/module/icp/io/aes.c b/sys/contrib/openzfs/module/icp/io/aes.c
new file mode 100644
index 000000000000..e540af4473f7
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/io/aes.c
@@ -0,0 +1,1457 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+/*
+ * AES provider for the Kernel Cryptographic Framework (KCF)
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/crypto/common.h>
+#include <sys/crypto/impl.h>
+#include <sys/crypto/spi.h>
+#include <sys/crypto/icp.h>
+#include <modes/modes.h>
+#include <sys/modctl.h>
+#define _AES_IMPL
+#include <aes/aes_impl.h>
+#include <modes/gcm_impl.h>
+
+#define CRYPTO_PROVIDER_NAME "aes"
+
+extern struct mod_ops mod_cryptoops;
+
+/*
+ * Module linkage information for the kernel.
+ */
+static struct modlcrypto modlcrypto = {
+ &mod_cryptoops,
+ "AES Kernel SW Provider"
+};
+
+static struct modlinkage modlinkage = {
+ MODREV_1, { (void *)&modlcrypto, NULL }
+};
+
+/*
+ * Mechanism info structure passed to KCF during registration.
+ */
+static crypto_mech_info_t aes_mech_info_tab[] = {
+ /* AES_ECB */
+ {SUN_CKM_AES_ECB, AES_ECB_MECH_INFO_TYPE,
+ CRYPTO_FG_ENCRYPT | CRYPTO_FG_ENCRYPT_ATOMIC |
+ CRYPTO_FG_DECRYPT | CRYPTO_FG_DECRYPT_ATOMIC,
+ AES_MIN_KEY_BYTES, AES_MAX_KEY_BYTES, CRYPTO_KEYSIZE_UNIT_IN_BYTES},
+ /* AES_CBC */
+ {SUN_CKM_AES_CBC, AES_CBC_MECH_INFO_TYPE,
+ CRYPTO_FG_ENCRYPT | CRYPTO_FG_ENCRYPT_ATOMIC |
+ CRYPTO_FG_DECRYPT | CRYPTO_FG_DECRYPT_ATOMIC,
+ AES_MIN_KEY_BYTES, AES_MAX_KEY_BYTES, CRYPTO_KEYSIZE_UNIT_IN_BYTES},
+ /* AES_CTR */
+ {SUN_CKM_AES_CTR, AES_CTR_MECH_INFO_TYPE,
+ CRYPTO_FG_ENCRYPT | CRYPTO_FG_ENCRYPT_ATOMIC |
+ CRYPTO_FG_DECRYPT | CRYPTO_FG_DECRYPT_ATOMIC,
+ AES_MIN_KEY_BYTES, AES_MAX_KEY_BYTES, CRYPTO_KEYSIZE_UNIT_IN_BYTES},
+ /* AES_CCM */
+ {SUN_CKM_AES_CCM, AES_CCM_MECH_INFO_TYPE,
+ CRYPTO_FG_ENCRYPT | CRYPTO_FG_ENCRYPT_ATOMIC |
+ CRYPTO_FG_DECRYPT | CRYPTO_FG_DECRYPT_ATOMIC,
+ AES_MIN_KEY_BYTES, AES_MAX_KEY_BYTES, CRYPTO_KEYSIZE_UNIT_IN_BYTES},
+ /* AES_GCM */
+ {SUN_CKM_AES_GCM, AES_GCM_MECH_INFO_TYPE,
+ CRYPTO_FG_ENCRYPT | CRYPTO_FG_ENCRYPT_ATOMIC |
+ CRYPTO_FG_DECRYPT | CRYPTO_FG_DECRYPT_ATOMIC,
+ AES_MIN_KEY_BYTES, AES_MAX_KEY_BYTES, CRYPTO_KEYSIZE_UNIT_IN_BYTES},
+ /* AES_GMAC */
+ {SUN_CKM_AES_GMAC, AES_GMAC_MECH_INFO_TYPE,
+ CRYPTO_FG_ENCRYPT | CRYPTO_FG_ENCRYPT_ATOMIC |
+ CRYPTO_FG_DECRYPT | CRYPTO_FG_DECRYPT_ATOMIC |
+ CRYPTO_FG_MAC | CRYPTO_FG_MAC_ATOMIC |
+ CRYPTO_FG_SIGN | CRYPTO_FG_SIGN_ATOMIC |
+ CRYPTO_FG_VERIFY | CRYPTO_FG_VERIFY_ATOMIC,
+ AES_MIN_KEY_BYTES, AES_MAX_KEY_BYTES, CRYPTO_KEYSIZE_UNIT_IN_BYTES}
+};
+
+static void aes_provider_status(crypto_provider_handle_t, uint_t *);
+
+static crypto_control_ops_t aes_control_ops = {
+ aes_provider_status
+};
+
+static int aes_encrypt_init(crypto_ctx_t *, crypto_mechanism_t *,
+ crypto_key_t *, crypto_spi_ctx_template_t, crypto_req_handle_t);
+static int aes_decrypt_init(crypto_ctx_t *, crypto_mechanism_t *,
+ crypto_key_t *, crypto_spi_ctx_template_t, crypto_req_handle_t);
+static int aes_common_init(crypto_ctx_t *, crypto_mechanism_t *,
+ crypto_key_t *, crypto_spi_ctx_template_t, crypto_req_handle_t, boolean_t);
+static int aes_common_init_ctx(aes_ctx_t *, crypto_spi_ctx_template_t *,
+ crypto_mechanism_t *, crypto_key_t *, int, boolean_t);
+static int aes_encrypt_final(crypto_ctx_t *, crypto_data_t *,
+ crypto_req_handle_t);
+static int aes_decrypt_final(crypto_ctx_t *, crypto_data_t *,
+ crypto_req_handle_t);
+
+static int aes_encrypt(crypto_ctx_t *, crypto_data_t *, crypto_data_t *,
+ crypto_req_handle_t);
+static int aes_encrypt_update(crypto_ctx_t *, crypto_data_t *,
+ crypto_data_t *, crypto_req_handle_t);
+static int aes_encrypt_atomic(crypto_provider_handle_t, crypto_session_id_t,
+ crypto_mechanism_t *, crypto_key_t *, crypto_data_t *,
+ crypto_data_t *, crypto_spi_ctx_template_t, crypto_req_handle_t);
+
+static int aes_decrypt(crypto_ctx_t *, crypto_data_t *, crypto_data_t *,
+ crypto_req_handle_t);
+static int aes_decrypt_update(crypto_ctx_t *, crypto_data_t *,
+ crypto_data_t *, crypto_req_handle_t);
+static int aes_decrypt_atomic(crypto_provider_handle_t, crypto_session_id_t,
+ crypto_mechanism_t *, crypto_key_t *, crypto_data_t *,
+ crypto_data_t *, crypto_spi_ctx_template_t, crypto_req_handle_t);
+
+static crypto_cipher_ops_t aes_cipher_ops = {
+ .encrypt_init = aes_encrypt_init,
+ .encrypt = aes_encrypt,
+ .encrypt_update = aes_encrypt_update,
+ .encrypt_final = aes_encrypt_final,
+ .encrypt_atomic = aes_encrypt_atomic,
+ .decrypt_init = aes_decrypt_init,
+ .decrypt = aes_decrypt,
+ .decrypt_update = aes_decrypt_update,
+ .decrypt_final = aes_decrypt_final,
+ .decrypt_atomic = aes_decrypt_atomic
+};
+
+static int aes_mac_atomic(crypto_provider_handle_t, crypto_session_id_t,
+ crypto_mechanism_t *, crypto_key_t *, crypto_data_t *, crypto_data_t *,
+ crypto_spi_ctx_template_t, crypto_req_handle_t);
+static int aes_mac_verify_atomic(crypto_provider_handle_t, crypto_session_id_t,
+ crypto_mechanism_t *, crypto_key_t *, crypto_data_t *, crypto_data_t *,
+ crypto_spi_ctx_template_t, crypto_req_handle_t);
+
+static crypto_mac_ops_t aes_mac_ops = {
+ .mac_init = NULL,
+ .mac = NULL,
+ .mac_update = NULL,
+ .mac_final = NULL,
+ .mac_atomic = aes_mac_atomic,
+ .mac_verify_atomic = aes_mac_verify_atomic
+};
+
+static int aes_create_ctx_template(crypto_provider_handle_t,
+ crypto_mechanism_t *, crypto_key_t *, crypto_spi_ctx_template_t *,
+ size_t *, crypto_req_handle_t);
+static int aes_free_context(crypto_ctx_t *);
+
+static crypto_ctx_ops_t aes_ctx_ops = {
+ .create_ctx_template = aes_create_ctx_template,
+ .free_context = aes_free_context
+};
+
+static crypto_ops_t aes_crypto_ops = {{{{{
+ &aes_control_ops,
+ NULL,
+ &aes_cipher_ops,
+ &aes_mac_ops,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ &aes_ctx_ops
+}}}}};
+
+static crypto_provider_info_t aes_prov_info = {{{{
+ CRYPTO_SPI_VERSION_1,
+ "AES Software Provider",
+ CRYPTO_SW_PROVIDER,
+ NULL,
+ &aes_crypto_ops,
+ sizeof (aes_mech_info_tab)/sizeof (crypto_mech_info_t),
+ aes_mech_info_tab
+}}}};
+
+static crypto_kcf_provider_handle_t aes_prov_handle = 0;
+static crypto_data_t null_crypto_data = { CRYPTO_DATA_RAW };
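+
+/*
+ * A minimal sketch of how a kernel consumer might drive one of the AES
+ * mechanisms registered above through KCF, assuming the consumer API
+ * declarations (crypto_mech2id() and crypto_encrypt()) from
+ * <sys/crypto/api.h>.  The function and variable names are illustrative
+ * only; the key must be passed by value (ck_format == CRYPTO_KEY_RAW)
+ * and gcmp points at a filled-in CK_AES_GCM_PARAMS (IV, AAD, tag bits).
+ *
+ *	static int
+ *	example_aes_gcm_encrypt(crypto_key_t *key, CK_AES_GCM_PARAMS *gcmp,
+ *	    crypto_data_t *plain, crypto_data_t *cipher)
+ *	{
+ *		crypto_mechanism_t mech;
+ *
+ *		mech.cm_type = crypto_mech2id(SUN_CKM_AES_GCM);
+ *		mech.cm_param = (char *)gcmp;
+ *		mech.cm_param_len = sizeof (CK_AES_GCM_PARAMS);
+ *
+ *		return (crypto_encrypt(&mech, plain, key, NULL, cipher, NULL));
+ *	}
+ */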
+
+int
+aes_mod_init(void)
+{
+ int ret;
+
+ /* Determine the fastest available implementation. */
+ aes_impl_init();
+ gcm_impl_init();
+
+ if ((ret = mod_install(&modlinkage)) != 0)
+ return (ret);
+
+ /* Register with KCF. If the registration fails, remove the module. */
+ if (crypto_register_provider(&aes_prov_info, &aes_prov_handle)) {
+ (void) mod_remove(&modlinkage);
+ return (EACCES);
+ }
+
+ return (0);
+}
+
+int
+aes_mod_fini(void)
+{
+ /* Unregister from KCF if module is registered */
+ if (aes_prov_handle != 0) {
+ if (crypto_unregister_provider(aes_prov_handle))
+ return (EBUSY);
+
+ aes_prov_handle = 0;
+ }
+
+ return (mod_remove(&modlinkage));
+}
+
+static int
+aes_check_mech_param(crypto_mechanism_t *mechanism, aes_ctx_t **ctx, int kmflag)
+{
+ void *p = NULL;
+ boolean_t param_required = B_TRUE;
+ size_t param_len;
+ void *(*alloc_fun)(int);
+ int rv = CRYPTO_SUCCESS;
+
+ switch (mechanism->cm_type) {
+ case AES_ECB_MECH_INFO_TYPE:
+ param_required = B_FALSE;
+ alloc_fun = ecb_alloc_ctx;
+ break;
+ case AES_CBC_MECH_INFO_TYPE:
+ param_len = AES_BLOCK_LEN;
+ alloc_fun = cbc_alloc_ctx;
+ break;
+ case AES_CTR_MECH_INFO_TYPE:
+ param_len = sizeof (CK_AES_CTR_PARAMS);
+ alloc_fun = ctr_alloc_ctx;
+ break;
+ case AES_CCM_MECH_INFO_TYPE:
+ param_len = sizeof (CK_AES_CCM_PARAMS);
+ alloc_fun = ccm_alloc_ctx;
+ break;
+ case AES_GCM_MECH_INFO_TYPE:
+ param_len = sizeof (CK_AES_GCM_PARAMS);
+ alloc_fun = gcm_alloc_ctx;
+ break;
+ case AES_GMAC_MECH_INFO_TYPE:
+ param_len = sizeof (CK_AES_GMAC_PARAMS);
+ alloc_fun = gmac_alloc_ctx;
+ break;
+ default:
+ rv = CRYPTO_MECHANISM_INVALID;
+ return (rv);
+ }
+ if (param_required && mechanism->cm_param != NULL &&
+ mechanism->cm_param_len != param_len) {
+ rv = CRYPTO_MECHANISM_PARAM_INVALID;
+ }
+ if (ctx != NULL) {
+ p = (alloc_fun)(kmflag);
+ *ctx = p;
+ }
+ return (rv);
+}
+
+/*
+ * Initialize key schedules for AES
+ */
+static int
+init_keysched(crypto_key_t *key, void *newbie)
+{
+ /*
+ * Only keys by value are supported by this module.
+ */
+ switch (key->ck_format) {
+ case CRYPTO_KEY_RAW:
+ if (key->ck_length < AES_MINBITS ||
+ key->ck_length > AES_MAXBITS) {
+ return (CRYPTO_KEY_SIZE_RANGE);
+ }
+
+ /* key length must be either 128, 192, or 256 */
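+ /* (all are multiples of 64, so a valid length has its low 6 bits clear) */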
+ if ((key->ck_length & 63) != 0)
+ return (CRYPTO_KEY_SIZE_RANGE);
+ break;
+ default:
+ return (CRYPTO_KEY_TYPE_INCONSISTENT);
+ }
+
+ aes_init_keysched(key->ck_data, key->ck_length, newbie);
+ return (CRYPTO_SUCCESS);
+}
+
+/*
+ * KCF software provider control entry points.
+ */
+/* ARGSUSED */
+static void
+aes_provider_status(crypto_provider_handle_t provider, uint_t *status)
+{
+ *status = CRYPTO_PROVIDER_READY;
+}
+
+static int
+aes_encrypt_init(crypto_ctx_t *ctx, crypto_mechanism_t *mechanism,
+ crypto_key_t *key, crypto_spi_ctx_template_t template,
+ crypto_req_handle_t req)
+{
+ return (aes_common_init(ctx, mechanism, key, template, req, B_TRUE));
+}
+
+static int
+aes_decrypt_init(crypto_ctx_t *ctx, crypto_mechanism_t *mechanism,
+ crypto_key_t *key, crypto_spi_ctx_template_t template,
+ crypto_req_handle_t req)
+{
+ return (aes_common_init(ctx, mechanism, key, template, req, B_FALSE));
+}
+
+
+
+/*
+ * KCF software provider encrypt entry points.
+ */
+static int
+aes_common_init(crypto_ctx_t *ctx, crypto_mechanism_t *mechanism,
+ crypto_key_t *key, crypto_spi_ctx_template_t template,
+ crypto_req_handle_t req, boolean_t is_encrypt_init)
+{
+ aes_ctx_t *aes_ctx;
+ int rv;
+ int kmflag;
+
+ /*
+ * Only keys by value are supported by this module.
+ */
+ if (key->ck_format != CRYPTO_KEY_RAW) {
+ return (CRYPTO_KEY_TYPE_INCONSISTENT);
+ }
+
+ kmflag = crypto_kmflag(req);
+ if ((rv = aes_check_mech_param(mechanism, &aes_ctx, kmflag))
+ != CRYPTO_SUCCESS)
+ return (rv);
+
+ rv = aes_common_init_ctx(aes_ctx, template, mechanism, key, kmflag,
+ is_encrypt_init);
+ if (rv != CRYPTO_SUCCESS) {
+ crypto_free_mode_ctx(aes_ctx);
+ return (rv);
+ }
+
+ ctx->cc_provider_private = aes_ctx;
+
+ return (CRYPTO_SUCCESS);
+}
+
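+/*
+ * Copy a 16-byte AES block into a 64-bit destination, using two 64-bit
+ * loads when the source is suitably aligned and a byte-wise copy
+ * otherwise.  This is the block-copy callback handed to the common mode
+ * code (crypto_update_iov()/crypto_update_uio() and cbc_init_ctx()).
+ */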
+static void
+aes_copy_block64(uint8_t *in, uint64_t *out)
+{
+ if (IS_P2ALIGNED(in, sizeof (uint64_t))) {
+ /* LINTED: pointer alignment */
+ out[0] = *(uint64_t *)&in[0];
+ /* LINTED: pointer alignment */
+ out[1] = *(uint64_t *)&in[8];
+ } else {
+ uint8_t *iv8 = (uint8_t *)&out[0];
+
+ AES_COPY_BLOCK(in, iv8);
+ }
+}
+
+
+static int
+aes_encrypt(crypto_ctx_t *ctx, crypto_data_t *plaintext,
+ crypto_data_t *ciphertext, crypto_req_handle_t req)
+{
+ int ret = CRYPTO_FAILED;
+
+ aes_ctx_t *aes_ctx;
+ size_t saved_length, saved_offset, length_needed;
+
+ ASSERT(ctx->cc_provider_private != NULL);
+ aes_ctx = ctx->cc_provider_private;
+
+ /*
+ * For block ciphers, plaintext must be a multiple of AES block size.
+ * This test is only valid for ciphers whose blocksize is a power of 2.
+ */
+ if (((aes_ctx->ac_flags & (CTR_MODE|CCM_MODE|GCM_MODE|GMAC_MODE))
+ == 0) && (plaintext->cd_length & (AES_BLOCK_LEN - 1)) != 0)
+ return (CRYPTO_DATA_LEN_RANGE);
+
+ ASSERT(ciphertext != NULL);
+
+ /*
+ * If the output buffer is too small, just return the length needed
+ * to store the output; do not destroy the context in that case.
+ */
+ switch (aes_ctx->ac_flags & (CCM_MODE|GCM_MODE|GMAC_MODE)) {
+ case CCM_MODE:
+ length_needed = plaintext->cd_length + aes_ctx->ac_mac_len;
+ break;
+ case GCM_MODE:
+ length_needed = plaintext->cd_length + aes_ctx->ac_tag_len;
+ break;
+ case GMAC_MODE:
+ if (plaintext->cd_length != 0)
+ return (CRYPTO_ARGUMENTS_BAD);
+
+ length_needed = aes_ctx->ac_tag_len;
+ break;
+ default:
+ length_needed = plaintext->cd_length;
+ }
+
+ if (ciphertext->cd_length < length_needed) {
+ ciphertext->cd_length = length_needed;
+ return (CRYPTO_BUFFER_TOO_SMALL);
+ }
+
+ saved_length = ciphertext->cd_length;
+ saved_offset = ciphertext->cd_offset;
+
+ /*
+ * Do an update on the specified input data.
+ */
+ ret = aes_encrypt_update(ctx, plaintext, ciphertext, req);
+ if (ret != CRYPTO_SUCCESS) {
+ return (ret);
+ }
+
+ /*
+ * For CCM mode, ccm_encrypt_final() will take care of any leftover
+ * unprocessed data and compute the MAC.
+ */
+ if (aes_ctx->ac_flags & CCM_MODE) {
+ /*
+ * ccm_encrypt_final() computes the MAC and appends it to the
+ * existing ciphertext, so the leftover length value must be
+ * adjusted accordingly.
+ */
+
+ /* the order of the following two lines must not be reversed */
+ ciphertext->cd_offset = ciphertext->cd_length;
+ ciphertext->cd_length = saved_length - ciphertext->cd_length;
+ ret = ccm_encrypt_final((ccm_ctx_t *)aes_ctx, ciphertext,
+ AES_BLOCK_LEN, aes_encrypt_block, aes_xor_block);
+ if (ret != CRYPTO_SUCCESS) {
+ return (ret);
+ }
+
+ if (plaintext != ciphertext) {
+ ciphertext->cd_length =
+ ciphertext->cd_offset - saved_offset;
+ }
+ ciphertext->cd_offset = saved_offset;
+ } else if (aes_ctx->ac_flags & (GCM_MODE|GMAC_MODE)) {
+ /*
+ * gcm_encrypt_final() computes the MAC and appends it to the
+ * existing ciphertext, so the leftover length value must be
+ * adjusted accordingly.
+ */
+
+ /* the order of the following two lines must not be reversed */
+ ciphertext->cd_offset = ciphertext->cd_length;
+ ciphertext->cd_length = saved_length - ciphertext->cd_length;
+ ret = gcm_encrypt_final((gcm_ctx_t *)aes_ctx, ciphertext,
+ AES_BLOCK_LEN, aes_encrypt_block, aes_copy_block,
+ aes_xor_block);
+ if (ret != CRYPTO_SUCCESS) {
+ return (ret);
+ }
+
+ if (plaintext != ciphertext) {
+ ciphertext->cd_length =
+ ciphertext->cd_offset - saved_offset;
+ }
+ ciphertext->cd_offset = saved_offset;
+ }
+
+ ASSERT(aes_ctx->ac_remainder_len == 0);
+ (void) aes_free_context(ctx);
+
+ return (ret);
+}
+
+
+static int
+aes_decrypt(crypto_ctx_t *ctx, crypto_data_t *ciphertext,
+ crypto_data_t *plaintext, crypto_req_handle_t req)
+{
+ int ret = CRYPTO_FAILED;
+
+ aes_ctx_t *aes_ctx;
+ off_t saved_offset;
+ size_t saved_length, length_needed;
+
+ ASSERT(ctx->cc_provider_private != NULL);
+ aes_ctx = ctx->cc_provider_private;
+
+ /*
+ * For block ciphers, the ciphertext length must be a multiple of the
+ * AES block size. This test is only valid for ciphers whose block
+ * size is a power of 2.
+ */
+ if (((aes_ctx->ac_flags & (CTR_MODE|CCM_MODE|GCM_MODE|GMAC_MODE))
+ == 0) && (ciphertext->cd_length & (AES_BLOCK_LEN - 1)) != 0) {
+ return (CRYPTO_ENCRYPTED_DATA_LEN_RANGE);
+ }
+
+ ASSERT(plaintext != NULL);
+
+ /*
+ * Return the length needed to store the output, and do not destroy
+ * the context when the plaintext buffer is too small.
+ *
+ * CCM:  plaintext is MAC length shorter than the ciphertext
+ * GCM:  plaintext is tag length shorter than the ciphertext
+ * GMAC: plaintext length must be zero
+ */
+ switch (aes_ctx->ac_flags & (CCM_MODE|GCM_MODE|GMAC_MODE)) {
+ case CCM_MODE:
+ length_needed = aes_ctx->ac_processed_data_len;
+ break;
+ case GCM_MODE:
+ length_needed = ciphertext->cd_length - aes_ctx->ac_tag_len;
+ break;
+ case GMAC_MODE:
+ if (plaintext->cd_length != 0)
+ return (CRYPTO_ARGUMENTS_BAD);
+
+ length_needed = 0;
+ break;
+ default:
+ length_needed = ciphertext->cd_length;
+ }
+
+ if (plaintext->cd_length < length_needed) {
+ plaintext->cd_length = length_needed;
+ return (CRYPTO_BUFFER_TOO_SMALL);
+ }
+
+ saved_offset = plaintext->cd_offset;
+ saved_length = plaintext->cd_length;
+
+ /*
+ * Do an update on the specified input data.
+ */
+ ret = aes_decrypt_update(ctx, ciphertext, plaintext, req);
+ if (ret != CRYPTO_SUCCESS) {
+ goto cleanup;
+ }
+
+ if (aes_ctx->ac_flags & CCM_MODE) {
+ ASSERT(aes_ctx->ac_processed_data_len == aes_ctx->ac_data_len);
+ ASSERT(aes_ctx->ac_processed_mac_len == aes_ctx->ac_mac_len);
+
+ /* the order of the following two lines must not be reversed */
+ plaintext->cd_offset = plaintext->cd_length;
+ plaintext->cd_length = saved_length - plaintext->cd_length;
+
+ ret = ccm_decrypt_final((ccm_ctx_t *)aes_ctx, plaintext,
+ AES_BLOCK_LEN, aes_encrypt_block, aes_copy_block,
+ aes_xor_block);
+ if (ret == CRYPTO_SUCCESS) {
+ if (plaintext != ciphertext) {
+ plaintext->cd_length =
+ plaintext->cd_offset - saved_offset;
+ }
+ } else {
+ plaintext->cd_length = saved_length;
+ }
+
+ plaintext->cd_offset = saved_offset;
+ } else if (aes_ctx->ac_flags & (GCM_MODE|GMAC_MODE)) {
+ /* the order of the following two lines must not be reversed */
+ plaintext->cd_offset = plaintext->cd_length;
+ plaintext->cd_length = saved_length - plaintext->cd_length;
+
+ ret = gcm_decrypt_final((gcm_ctx_t *)aes_ctx, plaintext,
+ AES_BLOCK_LEN, aes_encrypt_block, aes_xor_block);
+ if (ret == CRYPTO_SUCCESS) {
+ if (plaintext != ciphertext) {
+ plaintext->cd_length =
+ plaintext->cd_offset - saved_offset;
+ }
+ } else {
+ plaintext->cd_length = saved_length;
+ }
+
+ plaintext->cd_offset = saved_offset;
+ }
+
+ ASSERT(aes_ctx->ac_remainder_len == 0);
+
+cleanup:
+ (void) aes_free_context(ctx);
+
+ return (ret);
+}
+
+
+/* ARGSUSED */
+static int
+aes_encrypt_update(crypto_ctx_t *ctx, crypto_data_t *plaintext,
+ crypto_data_t *ciphertext, crypto_req_handle_t req)
+{
+ off_t saved_offset;
+ size_t saved_length, out_len;
+ int ret = CRYPTO_SUCCESS;
+ aes_ctx_t *aes_ctx;
+
+ ASSERT(ctx->cc_provider_private != NULL);
+ aes_ctx = ctx->cc_provider_private;
+
+ ASSERT(ciphertext != NULL);
+
+ /* compute number of bytes that will hold the ciphertext */
+ out_len = aes_ctx->ac_remainder_len;
+ out_len += plaintext->cd_length;
+ out_len &= ~(AES_BLOCK_LEN - 1);
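+ /*
+ * out_len is rounded down to whole blocks; any partial block is
+ * carried in the context as remainder until a later update or final.
+ */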
+
+ /* return length needed to store the output */
+ if (ciphertext->cd_length < out_len) {
+ ciphertext->cd_length = out_len;
+ return (CRYPTO_BUFFER_TOO_SMALL);
+ }
+
+ saved_offset = ciphertext->cd_offset;
+ saved_length = ciphertext->cd_length;
+
+ /*
+ * Do the AES update on the specified input data.
+ */
+ switch (plaintext->cd_format) {
+ case CRYPTO_DATA_RAW:
+ ret = crypto_update_iov(ctx->cc_provider_private,
+ plaintext, ciphertext, aes_encrypt_contiguous_blocks,
+ aes_copy_block64);
+ break;
+ case CRYPTO_DATA_UIO:
+ ret = crypto_update_uio(ctx->cc_provider_private,
+ plaintext, ciphertext, aes_encrypt_contiguous_blocks,
+ aes_copy_block64);
+ break;
+ default:
+ ret = CRYPTO_ARGUMENTS_BAD;
+ }
+
+ /*
+ * Since AES counter mode is a stream cipher, we call
+ * ctr_mode_final() to pick up any remaining bytes.
+ * It is an internal function that does not destroy
+ * the context like *normal* final routines.
+ */
+ if ((aes_ctx->ac_flags & CTR_MODE) && (aes_ctx->ac_remainder_len > 0)) {
+ ret = ctr_mode_final((ctr_ctx_t *)aes_ctx,
+ ciphertext, aes_encrypt_block);
+ }
+
+ if (ret == CRYPTO_SUCCESS) {
+ if (plaintext != ciphertext)
+ ciphertext->cd_length =
+ ciphertext->cd_offset - saved_offset;
+ } else {
+ ciphertext->cd_length = saved_length;
+ }
+ ciphertext->cd_offset = saved_offset;
+
+ return (ret);
+}
+
+
+static int
+aes_decrypt_update(crypto_ctx_t *ctx, crypto_data_t *ciphertext,
+ crypto_data_t *plaintext, crypto_req_handle_t req)
+{
+ off_t saved_offset;
+ size_t saved_length, out_len;
+ int ret = CRYPTO_SUCCESS;
+ aes_ctx_t *aes_ctx;
+
+ ASSERT(ctx->cc_provider_private != NULL);
+ aes_ctx = ctx->cc_provider_private;
+
+ ASSERT(plaintext != NULL);
+
+ /*
+ * Compute number of bytes that will hold the plaintext.
+ * This is not necessary for CCM, GCM, and GMAC since these
+ * mechanisms never return plaintext for update operations.
+ */
+ if ((aes_ctx->ac_flags & (CCM_MODE|GCM_MODE|GMAC_MODE)) == 0) {
+ out_len = aes_ctx->ac_remainder_len;
+ out_len += ciphertext->cd_length;
+ out_len &= ~(AES_BLOCK_LEN - 1);
+
+ /* return length needed to store the output */
+ if (plaintext->cd_length < out_len) {
+ plaintext->cd_length = out_len;
+ return (CRYPTO_BUFFER_TOO_SMALL);
+ }
+ }
+
+ saved_offset = plaintext->cd_offset;
+ saved_length = plaintext->cd_length;
+
+ if (aes_ctx->ac_flags & (GCM_MODE|GMAC_MODE))
+ gcm_set_kmflag((gcm_ctx_t *)aes_ctx, crypto_kmflag(req));
+
+ /*
+ * Do the AES update on the specified input data.
+ */
+ switch (ciphertext->cd_format) {
+ case CRYPTO_DATA_RAW:
+ ret = crypto_update_iov(ctx->cc_provider_private,
+ ciphertext, plaintext, aes_decrypt_contiguous_blocks,
+ aes_copy_block64);
+ break;
+ case CRYPTO_DATA_UIO:
+ ret = crypto_update_uio(ctx->cc_provider_private,
+ ciphertext, plaintext, aes_decrypt_contiguous_blocks,
+ aes_copy_block64);
+ break;
+ default:
+ ret = CRYPTO_ARGUMENTS_BAD;
+ }
+
+ /*
+ * Since AES counter mode is a stream cipher, we call
+ * ctr_mode_final() to pick up any remaining bytes.
+ * It is an internal function that does not destroy
+ * the context like *normal* final routines.
+ */
+ if ((aes_ctx->ac_flags & CTR_MODE) && (aes_ctx->ac_remainder_len > 0)) {
+ ret = ctr_mode_final((ctr_ctx_t *)aes_ctx, plaintext,
+ aes_encrypt_block);
+ if (ret == CRYPTO_DATA_LEN_RANGE)
+ ret = CRYPTO_ENCRYPTED_DATA_LEN_RANGE;
+ }
+
+ if (ret == CRYPTO_SUCCESS) {
+ if (ciphertext != plaintext)
+ plaintext->cd_length =
+ plaintext->cd_offset - saved_offset;
+ } else {
+ plaintext->cd_length = saved_length;
+ }
+ plaintext->cd_offset = saved_offset;
+
+
+ return (ret);
+}
+
+/* ARGSUSED */
+static int
+aes_encrypt_final(crypto_ctx_t *ctx, crypto_data_t *data,
+ crypto_req_handle_t req)
+{
+ aes_ctx_t *aes_ctx;
+ int ret;
+
+ ASSERT(ctx->cc_provider_private != NULL);
+ aes_ctx = ctx->cc_provider_private;
+
+ if (data->cd_format != CRYPTO_DATA_RAW &&
+ data->cd_format != CRYPTO_DATA_UIO) {
+ return (CRYPTO_ARGUMENTS_BAD);
+ }
+
+ if (aes_ctx->ac_flags & CTR_MODE) {
+ if (aes_ctx->ac_remainder_len > 0) {
+ ret = ctr_mode_final((ctr_ctx_t *)aes_ctx, data,
+ aes_encrypt_block);
+ if (ret != CRYPTO_SUCCESS)
+ return (ret);
+ }
+ } else if (aes_ctx->ac_flags & CCM_MODE) {
+ ret = ccm_encrypt_final((ccm_ctx_t *)aes_ctx, data,
+ AES_BLOCK_LEN, aes_encrypt_block, aes_xor_block);
+ if (ret != CRYPTO_SUCCESS) {
+ return (ret);
+ }
+ } else if (aes_ctx->ac_flags & (GCM_MODE|GMAC_MODE)) {
+ size_t saved_offset = data->cd_offset;
+
+ ret = gcm_encrypt_final((gcm_ctx_t *)aes_ctx, data,
+ AES_BLOCK_LEN, aes_encrypt_block, aes_copy_block,
+ aes_xor_block);
+ if (ret != CRYPTO_SUCCESS) {
+ return (ret);
+ }
+ data->cd_length = data->cd_offset - saved_offset;
+ data->cd_offset = saved_offset;
+ } else {
+ /*
+ * There must be no unprocessed plaintext; plaintext is left
+ * unprocessed when the length of the last data is not a
+ * multiple of the AES block length.
+ */
+ if (aes_ctx->ac_remainder_len > 0) {
+ return (CRYPTO_DATA_LEN_RANGE);
+ }
+ data->cd_length = 0;
+ }
+
+ (void) aes_free_context(ctx);
+
+ return (CRYPTO_SUCCESS);
+}
+
+/* ARGSUSED */
+static int
+aes_decrypt_final(crypto_ctx_t *ctx, crypto_data_t *data,
+ crypto_req_handle_t req)
+{
+ aes_ctx_t *aes_ctx;
+ int ret;
+ off_t saved_offset;
+ size_t saved_length;
+
+ ASSERT(ctx->cc_provider_private != NULL);
+ aes_ctx = ctx->cc_provider_private;
+
+ if (data->cd_format != CRYPTO_DATA_RAW &&
+ data->cd_format != CRYPTO_DATA_UIO) {
+ return (CRYPTO_ARGUMENTS_BAD);
+ }
+
+ /*
+ * There must be no unprocessed ciphertext; ciphertext is left
+ * unprocessed when the length of the last ciphertext is not a
+ * multiple of the AES block length.
+ */
+ if (aes_ctx->ac_remainder_len > 0) {
+ if ((aes_ctx->ac_flags & CTR_MODE) == 0)
+ return (CRYPTO_ENCRYPTED_DATA_LEN_RANGE);
+ else {
+ ret = ctr_mode_final((ctr_ctx_t *)aes_ctx, data,
+ aes_encrypt_block);
+ if (ret == CRYPTO_DATA_LEN_RANGE)
+ ret = CRYPTO_ENCRYPTED_DATA_LEN_RANGE;
+ if (ret != CRYPTO_SUCCESS)
+ return (ret);
+ }
+ }
+
+ if (aes_ctx->ac_flags & CCM_MODE) {
+ /*
+ * This is where all the plaintext is returned; make sure
+ * the plaintext buffer is big enough.
+ */
+ size_t pt_len = aes_ctx->ac_data_len;
+ if (data->cd_length < pt_len) {
+ data->cd_length = pt_len;
+ return (CRYPTO_BUFFER_TOO_SMALL);
+ }
+
+ ASSERT(aes_ctx->ac_processed_data_len == pt_len);
+ ASSERT(aes_ctx->ac_processed_mac_len == aes_ctx->ac_mac_len);
+ saved_offset = data->cd_offset;
+ saved_length = data->cd_length;
+ ret = ccm_decrypt_final((ccm_ctx_t *)aes_ctx, data,
+ AES_BLOCK_LEN, aes_encrypt_block, aes_copy_block,
+ aes_xor_block);
+ if (ret == CRYPTO_SUCCESS) {
+ data->cd_length = data->cd_offset - saved_offset;
+ } else {
+ data->cd_length = saved_length;
+ }
+
+ data->cd_offset = saved_offset;
+ if (ret != CRYPTO_SUCCESS) {
+ return (ret);
+ }
+ } else if (aes_ctx->ac_flags & (GCM_MODE|GMAC_MODE)) {
+ /*
+ * This is where all the plaintext is returned; make sure
+ * the plaintext buffer is big enough.
+ */
+ gcm_ctx_t *ctx = (gcm_ctx_t *)aes_ctx;
+ size_t pt_len = ctx->gcm_processed_data_len - ctx->gcm_tag_len;
+
+ if (data->cd_length < pt_len) {
+ data->cd_length = pt_len;
+ return (CRYPTO_BUFFER_TOO_SMALL);
+ }
+
+ saved_offset = data->cd_offset;
+ saved_length = data->cd_length;
+ ret = gcm_decrypt_final((gcm_ctx_t *)aes_ctx, data,
+ AES_BLOCK_LEN, aes_encrypt_block, aes_xor_block);
+ if (ret == CRYPTO_SUCCESS) {
+ data->cd_length = data->cd_offset - saved_offset;
+ } else {
+ data->cd_length = saved_length;
+ }
+
+ data->cd_offset = saved_offset;
+ if (ret != CRYPTO_SUCCESS) {
+ return (ret);
+ }
+ }
+
+
+ if ((aes_ctx->ac_flags & (CTR_MODE|CCM_MODE|GCM_MODE|GMAC_MODE)) == 0) {
+ data->cd_length = 0;
+ }
+
+ (void) aes_free_context(ctx);
+
+ return (CRYPTO_SUCCESS);
+}
+
+/* ARGSUSED */
+static int
+aes_encrypt_atomic(crypto_provider_handle_t provider,
+ crypto_session_id_t session_id, crypto_mechanism_t *mechanism,
+ crypto_key_t *key, crypto_data_t *plaintext, crypto_data_t *ciphertext,
+ crypto_spi_ctx_template_t template, crypto_req_handle_t req)
+{
+ aes_ctx_t aes_ctx; /* on the stack */
+ off_t saved_offset;
+ size_t saved_length;
+ size_t length_needed;
+ int ret;
+
+ ASSERT(ciphertext != NULL);
+
+ /*
+ * CTR, CCM, GCM, and GMAC modes do not require that plaintext
+ * be a multiple of AES block size.
+ */
+ switch (mechanism->cm_type) {
+ case AES_CTR_MECH_INFO_TYPE:
+ case AES_CCM_MECH_INFO_TYPE:
+ case AES_GCM_MECH_INFO_TYPE:
+ case AES_GMAC_MECH_INFO_TYPE:
+ break;
+ default:
+ if ((plaintext->cd_length & (AES_BLOCK_LEN - 1)) != 0)
+ return (CRYPTO_DATA_LEN_RANGE);
+ }
+
+ if ((ret = aes_check_mech_param(mechanism, NULL, 0)) != CRYPTO_SUCCESS)
+ return (ret);
+
+ bzero(&aes_ctx, sizeof (aes_ctx_t));
+
+ ret = aes_common_init_ctx(&aes_ctx, template, mechanism, key,
+ crypto_kmflag(req), B_TRUE);
+ if (ret != CRYPTO_SUCCESS)
+ return (ret);
+
+ switch (mechanism->cm_type) {
+ case AES_CCM_MECH_INFO_TYPE:
+ length_needed = plaintext->cd_length + aes_ctx.ac_mac_len;
+ break;
+ case AES_GMAC_MECH_INFO_TYPE:
+ if (plaintext->cd_length != 0)
+ return (CRYPTO_ARGUMENTS_BAD);
+ /* FALLTHRU */
+ case AES_GCM_MECH_INFO_TYPE:
+ length_needed = plaintext->cd_length + aes_ctx.ac_tag_len;
+ break;
+ default:
+ length_needed = plaintext->cd_length;
+ }
+
+ /* return size of buffer needed to store output */
+ if (ciphertext->cd_length < length_needed) {
+ ciphertext->cd_length = length_needed;
+ ret = CRYPTO_BUFFER_TOO_SMALL;
+ goto out;
+ }
+
+ saved_offset = ciphertext->cd_offset;
+ saved_length = ciphertext->cd_length;
+
+ /*
+ * Do an update on the specified input data.
+ */
+ switch (plaintext->cd_format) {
+ case CRYPTO_DATA_RAW:
+ ret = crypto_update_iov(&aes_ctx, plaintext, ciphertext,
+ aes_encrypt_contiguous_blocks, aes_copy_block64);
+ break;
+ case CRYPTO_DATA_UIO:
+ ret = crypto_update_uio(&aes_ctx, plaintext, ciphertext,
+ aes_encrypt_contiguous_blocks, aes_copy_block64);
+ break;
+ default:
+ ret = CRYPTO_ARGUMENTS_BAD;
+ }
+
+ if (ret == CRYPTO_SUCCESS) {
+ if (mechanism->cm_type == AES_CCM_MECH_INFO_TYPE) {
+ ret = ccm_encrypt_final((ccm_ctx_t *)&aes_ctx,
+ ciphertext, AES_BLOCK_LEN, aes_encrypt_block,
+ aes_xor_block);
+ if (ret != CRYPTO_SUCCESS)
+ goto out;
+ ASSERT(aes_ctx.ac_remainder_len == 0);
+ } else if (mechanism->cm_type == AES_GCM_MECH_INFO_TYPE ||
+ mechanism->cm_type == AES_GMAC_MECH_INFO_TYPE) {
+ ret = gcm_encrypt_final((gcm_ctx_t *)&aes_ctx,
+ ciphertext, AES_BLOCK_LEN, aes_encrypt_block,
+ aes_copy_block, aes_xor_block);
+ if (ret != CRYPTO_SUCCESS)
+ goto out;
+ ASSERT(aes_ctx.ac_remainder_len == 0);
+ } else if (mechanism->cm_type == AES_CTR_MECH_INFO_TYPE) {
+ if (aes_ctx.ac_remainder_len > 0) {
+ ret = ctr_mode_final((ctr_ctx_t *)&aes_ctx,
+ ciphertext, aes_encrypt_block);
+ if (ret != CRYPTO_SUCCESS)
+ goto out;
+ }
+ } else {
+ ASSERT(aes_ctx.ac_remainder_len == 0);
+ }
+
+ if (plaintext != ciphertext) {
+ ciphertext->cd_length =
+ ciphertext->cd_offset - saved_offset;
+ }
+ } else {
+ ciphertext->cd_length = saved_length;
+ }
+ ciphertext->cd_offset = saved_offset;
+
+out:
+ if (aes_ctx.ac_flags & PROVIDER_OWNS_KEY_SCHEDULE) {
+ bzero(aes_ctx.ac_keysched, aes_ctx.ac_keysched_len);
+ kmem_free(aes_ctx.ac_keysched, aes_ctx.ac_keysched_len);
+ }
+#ifdef CAN_USE_GCM_ASM
+ if (aes_ctx.ac_flags & (GCM_MODE|GMAC_MODE) &&
+ ((gcm_ctx_t *)&aes_ctx)->gcm_Htable != NULL) {
+
+ gcm_ctx_t *ctx = (gcm_ctx_t *)&aes_ctx;
+
+ bzero(ctx->gcm_Htable, ctx->gcm_htab_len);
+ kmem_free(ctx->gcm_Htable, ctx->gcm_htab_len);
+ }
+#endif
+
+ return (ret);
+}
+
+/* ARGSUSED */
+static int
+aes_decrypt_atomic(crypto_provider_handle_t provider,
+ crypto_session_id_t session_id, crypto_mechanism_t *mechanism,
+ crypto_key_t *key, crypto_data_t *ciphertext, crypto_data_t *plaintext,
+ crypto_spi_ctx_template_t template, crypto_req_handle_t req)
+{
+ aes_ctx_t aes_ctx; /* on the stack */
+ off_t saved_offset;
+ size_t saved_length;
+ size_t length_needed;
+ int ret;
+
+ ASSERT(plaintext != NULL);
+
+ /*
+ * CCM, GCM, CTR, and GMAC modes do not require that ciphertext
+ * be a multiple of AES block size.
+ */
+ switch (mechanism->cm_type) {
+ case AES_CTR_MECH_INFO_TYPE:
+ case AES_CCM_MECH_INFO_TYPE:
+ case AES_GCM_MECH_INFO_TYPE:
+ case AES_GMAC_MECH_INFO_TYPE:
+ break;
+ default:
+ if ((ciphertext->cd_length & (AES_BLOCK_LEN - 1)) != 0)
+ return (CRYPTO_ENCRYPTED_DATA_LEN_RANGE);
+ }
+
+ if ((ret = aes_check_mech_param(mechanism, NULL, 0)) != CRYPTO_SUCCESS)
+ return (ret);
+
+ bzero(&aes_ctx, sizeof (aes_ctx_t));
+
+ ret = aes_common_init_ctx(&aes_ctx, template, mechanism, key,
+ crypto_kmflag(req), B_FALSE);
+ if (ret != CRYPTO_SUCCESS)
+ return (ret);
+
+ switch (mechanism->cm_type) {
+ case AES_CCM_MECH_INFO_TYPE:
+ length_needed = aes_ctx.ac_data_len;
+ break;
+ case AES_GCM_MECH_INFO_TYPE:
+ length_needed = ciphertext->cd_length - aes_ctx.ac_tag_len;
+ break;
+ case AES_GMAC_MECH_INFO_TYPE:
+ if (plaintext->cd_length != 0)
+ return (CRYPTO_ARGUMENTS_BAD);
+ length_needed = 0;
+ break;
+ default:
+ length_needed = ciphertext->cd_length;
+ }
+
+ /* return size of buffer needed to store output */
+ if (plaintext->cd_length < length_needed) {
+ plaintext->cd_length = length_needed;
+ ret = CRYPTO_BUFFER_TOO_SMALL;
+ goto out;
+ }
+
+ saved_offset = plaintext->cd_offset;
+ saved_length = plaintext->cd_length;
+
+ if (mechanism->cm_type == AES_GCM_MECH_INFO_TYPE ||
+ mechanism->cm_type == AES_GMAC_MECH_INFO_TYPE)
+ gcm_set_kmflag((gcm_ctx_t *)&aes_ctx, crypto_kmflag(req));
+
+ /*
+ * Do an update on the specified input data.
+ */
+ switch (ciphertext->cd_format) {
+ case CRYPTO_DATA_RAW:
+ ret = crypto_update_iov(&aes_ctx, ciphertext, plaintext,
+ aes_decrypt_contiguous_blocks, aes_copy_block64);
+ break;
+ case CRYPTO_DATA_UIO:
+ ret = crypto_update_uio(&aes_ctx, ciphertext, plaintext,
+ aes_decrypt_contiguous_blocks, aes_copy_block64);
+ break;
+ default:
+ ret = CRYPTO_ARGUMENTS_BAD;
+ }
+
+ if (ret == CRYPTO_SUCCESS) {
+ if (mechanism->cm_type == AES_CCM_MECH_INFO_TYPE) {
+ ASSERT(aes_ctx.ac_processed_data_len
+ == aes_ctx.ac_data_len);
+ ASSERT(aes_ctx.ac_processed_mac_len
+ == aes_ctx.ac_mac_len);
+ ret = ccm_decrypt_final((ccm_ctx_t *)&aes_ctx,
+ plaintext, AES_BLOCK_LEN, aes_encrypt_block,
+ aes_copy_block, aes_xor_block);
+ ASSERT(aes_ctx.ac_remainder_len == 0);
+ if ((ret == CRYPTO_SUCCESS) &&
+ (ciphertext != plaintext)) {
+ plaintext->cd_length =
+ plaintext->cd_offset - saved_offset;
+ } else {
+ plaintext->cd_length = saved_length;
+ }
+ } else if (mechanism->cm_type == AES_GCM_MECH_INFO_TYPE ||
+ mechanism->cm_type == AES_GMAC_MECH_INFO_TYPE) {
+ ret = gcm_decrypt_final((gcm_ctx_t *)&aes_ctx,
+ plaintext, AES_BLOCK_LEN, aes_encrypt_block,
+ aes_xor_block);
+ ASSERT(aes_ctx.ac_remainder_len == 0);
+ if ((ret == CRYPTO_SUCCESS) &&
+ (ciphertext != plaintext)) {
+ plaintext->cd_length =
+ plaintext->cd_offset - saved_offset;
+ } else {
+ plaintext->cd_length = saved_length;
+ }
+ } else if (mechanism->cm_type != AES_CTR_MECH_INFO_TYPE) {
+ ASSERT(aes_ctx.ac_remainder_len == 0);
+ if (ciphertext != plaintext)
+ plaintext->cd_length =
+ plaintext->cd_offset - saved_offset;
+ } else {
+ if (aes_ctx.ac_remainder_len > 0) {
+ ret = ctr_mode_final((ctr_ctx_t *)&aes_ctx,
+ plaintext, aes_encrypt_block);
+ if (ret == CRYPTO_DATA_LEN_RANGE)
+ ret = CRYPTO_ENCRYPTED_DATA_LEN_RANGE;
+ if (ret != CRYPTO_SUCCESS)
+ goto out;
+ }
+ if (ciphertext != plaintext)
+ plaintext->cd_length =
+ plaintext->cd_offset - saved_offset;
+ }
+ } else {
+ plaintext->cd_length = saved_length;
+ }
+ plaintext->cd_offset = saved_offset;
+
+out:
+ if (aes_ctx.ac_flags & PROVIDER_OWNS_KEY_SCHEDULE) {
+ bzero(aes_ctx.ac_keysched, aes_ctx.ac_keysched_len);
+ kmem_free(aes_ctx.ac_keysched, aes_ctx.ac_keysched_len);
+ }
+
+ if (aes_ctx.ac_flags & CCM_MODE) {
+ if (aes_ctx.ac_pt_buf != NULL) {
+ vmem_free(aes_ctx.ac_pt_buf, aes_ctx.ac_data_len);
+ }
+ } else if (aes_ctx.ac_flags & (GCM_MODE|GMAC_MODE)) {
+ if (((gcm_ctx_t *)&aes_ctx)->gcm_pt_buf != NULL) {
+ vmem_free(((gcm_ctx_t *)&aes_ctx)->gcm_pt_buf,
+ ((gcm_ctx_t *)&aes_ctx)->gcm_pt_buf_len);
+ }
+#ifdef CAN_USE_GCM_ASM
+ if (((gcm_ctx_t *)&aes_ctx)->gcm_Htable != NULL) {
+ gcm_ctx_t *ctx = (gcm_ctx_t *)&aes_ctx;
+
+ bzero(ctx->gcm_Htable, ctx->gcm_htab_len);
+ kmem_free(ctx->gcm_Htable, ctx->gcm_htab_len);
+ }
+#endif
+ }
+
+ return (ret);
+}
+
+/*
+ * KCF software provider context template entry points.
+ */
+/* ARGSUSED */
+static int
+aes_create_ctx_template(crypto_provider_handle_t provider,
+ crypto_mechanism_t *mechanism, crypto_key_t *key,
+ crypto_spi_ctx_template_t *tmpl, size_t *tmpl_size, crypto_req_handle_t req)
+{
+ void *keysched;
+ size_t size;
+ int rv;
+
+ if (mechanism->cm_type != AES_ECB_MECH_INFO_TYPE &&
+ mechanism->cm_type != AES_CBC_MECH_INFO_TYPE &&
+ mechanism->cm_type != AES_CTR_MECH_INFO_TYPE &&
+ mechanism->cm_type != AES_CCM_MECH_INFO_TYPE &&
+ mechanism->cm_type != AES_GCM_MECH_INFO_TYPE &&
+ mechanism->cm_type != AES_GMAC_MECH_INFO_TYPE)
+ return (CRYPTO_MECHANISM_INVALID);
+
+ if ((keysched = aes_alloc_keysched(&size,
+ crypto_kmflag(req))) == NULL) {
+ return (CRYPTO_HOST_MEMORY);
+ }
+
+ /*
+ * Initialize key schedule. Key length information is stored
+ * in the key.
+ */
+ if ((rv = init_keysched(key, keysched)) != CRYPTO_SUCCESS) {
+ bzero(keysched, size);
+ kmem_free(keysched, size);
+ return (rv);
+ }
+
+ *tmpl = keysched;
+ *tmpl_size = size;
+
+ return (CRYPTO_SUCCESS);
+}
+
+
+static int
+aes_free_context(crypto_ctx_t *ctx)
+{
+ aes_ctx_t *aes_ctx = ctx->cc_provider_private;
+
+ if (aes_ctx != NULL) {
+ if (aes_ctx->ac_flags & PROVIDER_OWNS_KEY_SCHEDULE) {
+ ASSERT(aes_ctx->ac_keysched_len != 0);
+ bzero(aes_ctx->ac_keysched, aes_ctx->ac_keysched_len);
+ kmem_free(aes_ctx->ac_keysched,
+ aes_ctx->ac_keysched_len);
+ }
+ crypto_free_mode_ctx(aes_ctx);
+ ctx->cc_provider_private = NULL;
+ }
+
+ return (CRYPTO_SUCCESS);
+}
+
+
+static int
+aes_common_init_ctx(aes_ctx_t *aes_ctx, crypto_spi_ctx_template_t *template,
+ crypto_mechanism_t *mechanism, crypto_key_t *key, int kmflag,
+ boolean_t is_encrypt_init)
+{
+ int rv = CRYPTO_SUCCESS;
+ void *keysched;
+ size_t size = 0;
+
+ if (template == NULL) {
+ if ((keysched = aes_alloc_keysched(&size, kmflag)) == NULL)
+ return (CRYPTO_HOST_MEMORY);
+ /*
+ * Initialize key schedule.
+ * Key length is stored in the key.
+ */
+ if ((rv = init_keysched(key, keysched)) != CRYPTO_SUCCESS) {
+ kmem_free(keysched, size);
+ return (rv);
+ }
+
+ aes_ctx->ac_flags |= PROVIDER_OWNS_KEY_SCHEDULE;
+ aes_ctx->ac_keysched_len = size;
+ } else {
+ keysched = template;
+ }
+ aes_ctx->ac_keysched = keysched;
+
+ switch (mechanism->cm_type) {
+ case AES_CBC_MECH_INFO_TYPE:
+ rv = cbc_init_ctx((cbc_ctx_t *)aes_ctx, mechanism->cm_param,
+ mechanism->cm_param_len, AES_BLOCK_LEN, aes_copy_block64);
+ break;
+ case AES_CTR_MECH_INFO_TYPE: {
+ CK_AES_CTR_PARAMS *pp;
+
+ if (mechanism->cm_param == NULL ||
+ mechanism->cm_param_len != sizeof (CK_AES_CTR_PARAMS)) {
+ return (CRYPTO_MECHANISM_PARAM_INVALID);
+ }
+ pp = (CK_AES_CTR_PARAMS *)(void *)mechanism->cm_param;
+ rv = ctr_init_ctx((ctr_ctx_t *)aes_ctx, pp->ulCounterBits,
+ pp->cb, aes_copy_block);
+ break;
+ }
+ case AES_CCM_MECH_INFO_TYPE:
+ if (mechanism->cm_param == NULL ||
+ mechanism->cm_param_len != sizeof (CK_AES_CCM_PARAMS)) {
+ return (CRYPTO_MECHANISM_PARAM_INVALID);
+ }
+ rv = ccm_init_ctx((ccm_ctx_t *)aes_ctx, mechanism->cm_param,
+ kmflag, is_encrypt_init, AES_BLOCK_LEN, aes_encrypt_block,
+ aes_xor_block);
+ break;
+ case AES_GCM_MECH_INFO_TYPE:
+ if (mechanism->cm_param == NULL ||
+ mechanism->cm_param_len != sizeof (CK_AES_GCM_PARAMS)) {
+ return (CRYPTO_MECHANISM_PARAM_INVALID);
+ }
+ rv = gcm_init_ctx((gcm_ctx_t *)aes_ctx, mechanism->cm_param,
+ AES_BLOCK_LEN, aes_encrypt_block, aes_copy_block,
+ aes_xor_block);
+ break;
+ case AES_GMAC_MECH_INFO_TYPE:
+ if (mechanism->cm_param == NULL ||
+ mechanism->cm_param_len != sizeof (CK_AES_GMAC_PARAMS)) {
+ return (CRYPTO_MECHANISM_PARAM_INVALID);
+ }
+ rv = gmac_init_ctx((gcm_ctx_t *)aes_ctx, mechanism->cm_param,
+ AES_BLOCK_LEN, aes_encrypt_block, aes_copy_block,
+ aes_xor_block);
+ break;
+ case AES_ECB_MECH_INFO_TYPE:
+ aes_ctx->ac_flags |= ECB_MODE;
+ }
+
+ if (rv != CRYPTO_SUCCESS) {
+ if (aes_ctx->ac_flags & PROVIDER_OWNS_KEY_SCHEDULE) {
+ bzero(keysched, size);
+ kmem_free(keysched, size);
+ }
+ }
+
+ return (rv);
+}
+
+static int
+process_gmac_mech(crypto_mechanism_t *mech, crypto_data_t *data,
+ CK_AES_GCM_PARAMS *gcm_params)
+{
+ /* LINTED: pointer alignment */
+ CK_AES_GMAC_PARAMS *params = (CK_AES_GMAC_PARAMS *)mech->cm_param;
+
+ if (mech->cm_type != AES_GMAC_MECH_INFO_TYPE)
+ return (CRYPTO_MECHANISM_INVALID);
+
+ if (mech->cm_param_len != sizeof (CK_AES_GMAC_PARAMS))
+ return (CRYPTO_MECHANISM_PARAM_INVALID);
+
+ if (params->pIv == NULL)
+ return (CRYPTO_MECHANISM_PARAM_INVALID);
+
+ gcm_params->pIv = params->pIv;
+ gcm_params->ulIvLen = AES_GMAC_IV_LEN;
+ gcm_params->ulTagBits = AES_GMAC_TAG_BITS;
+
+ if (data == NULL)
+ return (CRYPTO_SUCCESS);
+
+ if (data->cd_format != CRYPTO_DATA_RAW)
+ return (CRYPTO_ARGUMENTS_BAD);
+
+ gcm_params->pAAD = (uchar_t *)data->cd_raw.iov_base;
+ gcm_params->ulAADLen = data->cd_length;
+ return (CRYPTO_SUCCESS);
+}
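+
+/*
+ * GMAC is implemented as a thin wrapper around GCM: the data to be
+ * authenticated is passed to GCM as AAD, the plaintext is empty
+ * (null_crypto_data), and the IV length and tag size are fixed by
+ * AES_GMAC_IV_LEN and AES_GMAC_TAG_BITS.  The atomic MAC entry points
+ * below rebuild a CK_AES_GCM_PARAMS from the GMAC parameters and call
+ * the GCM atomic encrypt/decrypt routines.
+ */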
+
+static int
+aes_mac_atomic(crypto_provider_handle_t provider,
+ crypto_session_id_t session_id, crypto_mechanism_t *mechanism,
+ crypto_key_t *key, crypto_data_t *data, crypto_data_t *mac,
+ crypto_spi_ctx_template_t template, crypto_req_handle_t req)
+{
+ CK_AES_GCM_PARAMS gcm_params;
+ crypto_mechanism_t gcm_mech;
+ int rv;
+
+ if ((rv = process_gmac_mech(mechanism, data, &gcm_params))
+ != CRYPTO_SUCCESS)
+ return (rv);
+
+ gcm_mech.cm_type = AES_GCM_MECH_INFO_TYPE;
+ gcm_mech.cm_param_len = sizeof (CK_AES_GCM_PARAMS);
+ gcm_mech.cm_param = (char *)&gcm_params;
+
+ return (aes_encrypt_atomic(provider, session_id, &gcm_mech,
+ key, &null_crypto_data, mac, template, req));
+}
+
+static int
+aes_mac_verify_atomic(crypto_provider_handle_t provider,
+ crypto_session_id_t session_id, crypto_mechanism_t *mechanism,
+ crypto_key_t *key, crypto_data_t *data, crypto_data_t *mac,
+ crypto_spi_ctx_template_t template, crypto_req_handle_t req)
+{
+ CK_AES_GCM_PARAMS gcm_params;
+ crypto_mechanism_t gcm_mech;
+ int rv;
+
+ if ((rv = process_gmac_mech(mechanism, data, &gcm_params))
+ != CRYPTO_SUCCESS)
+ return (rv);
+
+ gcm_mech.cm_type = AES_GCM_MECH_INFO_TYPE;
+ gcm_mech.cm_param_len = sizeof (CK_AES_GCM_PARAMS);
+ gcm_mech.cm_param = (char *)&gcm_params;
+
+ return (aes_decrypt_atomic(provider, session_id, &gcm_mech,
+ key, mac, &null_crypto_data, template, req));
+}
diff --git a/sys/contrib/openzfs/module/icp/io/edonr_mod.c b/sys/contrib/openzfs/module/icp/io/edonr_mod.c
new file mode 100644
index 000000000000..a806af610629
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/io/edonr_mod.c
@@ -0,0 +1,63 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://opensource.org/licenses/CDDL-1.0.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2013 Saso Kiselkov. All rights reserved.
+ */
+
+#include <sys/modctl.h>
+#include <sys/crypto/common.h>
+#include <sys/crypto/icp.h>
+#include <sys/crypto/spi.h>
+#include <sys/sysmacros.h>
+#include <sys/edonr.h>
+
+/*
+ * Unlike sha2 or skein, we won't expose edonr via the Kernel Cryptographic
+ * Framework (KCF), because Edon-R is *NOT* suitable for general-purpose
+ * cryptographic use. Users of Edon-R must interface directly to this module.
+ */
+
+static struct modlmisc modlmisc = {
+ &mod_cryptoops,
+ "Edon-R Message-Digest Algorithm"
+};
+
+static struct modlinkage modlinkage = {
+ MODREV_1, {&modlmisc, NULL}
+};
+
+int
+edonr_mod_init(void)
+{
+ int error;
+
+ if ((error = mod_install(&modlinkage)) != 0)
+ return (error);
+
+ return (0);
+}
+
+int
+edonr_mod_fini(void)
+{
+ return (mod_remove(&modlinkage));
+}
diff --git a/sys/contrib/openzfs/module/icp/io/sha1_mod.c b/sys/contrib/openzfs/module/icp/io/sha1_mod.c
new file mode 100644
index 000000000000..6dcee6b2ecf2
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/io/sha1_mod.c
@@ -0,0 +1,1230 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/modctl.h>
+#include <sys/crypto/common.h>
+#include <sys/crypto/icp.h>
+#include <sys/crypto/spi.h>
+
+#include <sha1/sha1.h>
+#include <sha1/sha1_impl.h>
+
+/*
+ * The sha1 module is created with two modlinkages:
+ * - a modlmisc that allows consumers to directly call the entry points
+ * SHA1Init, SHA1Update, and SHA1Final.
+ * - a modlcrypto that allows the module to register with the Kernel
+ * Cryptographic Framework (KCF) as a software provider for the SHA1
+ * mechanisms.
+ */
+
+static struct modlcrypto modlcrypto = {
+ &mod_cryptoops,
+ "SHA1 Kernel SW Provider 1.1"
+};
+
+static struct modlinkage modlinkage = {
+ MODREV_1, { &modlcrypto, NULL }
+};
+
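+/*
+ * A minimal sketch of the direct (non-KCF) use described above, hashing a
+ * flat kernel buffer through the SHA1Init/SHA1Update/SHA1Final entry
+ * points.  The buf, buflen and digest names are illustrative only.
+ *
+ *	SHA1_CTX ctx;
+ *	uint8_t digest[SHA1_DIGEST_LENGTH];
+ *
+ *	SHA1Init(&ctx);
+ *	SHA1Update(&ctx, buf, buflen);
+ *	SHA1Final(digest, &ctx);
+ */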
+
+/*
+ * Macros to access the SHA1 or SHA1-HMAC contexts from a context passed
+ * by KCF to one of the entry points.
+ */
+
+#define PROV_SHA1_CTX(ctx) ((sha1_ctx_t *)(ctx)->cc_provider_private)
+#define PROV_SHA1_HMAC_CTX(ctx) ((sha1_hmac_ctx_t *)(ctx)->cc_provider_private)
+
+/*
+ * Extract the digest length passed as a mechanism parameter; cm_param may
+ * not be aligned for a direct ulong_t load, hence the bcopy() fallback.
+ */
+#define PROV_SHA1_GET_DIGEST_LEN(m, len) { \
+ if (IS_P2ALIGNED((m)->cm_param, sizeof (ulong_t))) \
+ (len) = (uint32_t)*((ulong_t *)(void *)mechanism->cm_param); \
+ else { \
+ ulong_t tmp_ulong; \
+ bcopy((m)->cm_param, &tmp_ulong, sizeof (ulong_t)); \
+ (len) = (uint32_t)tmp_ulong; \
+ } \
+}
+
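+/*
+ * Used when an HMAC key is longer than the 64-byte SHA1 block size:
+ * RFC 2104 replaces such a key with its digest before the ipad/opad step.
+ */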
+#define PROV_SHA1_DIGEST_KEY(ctx, key, len, digest) { \
+ SHA1Init(ctx); \
+ SHA1Update(ctx, key, len); \
+ SHA1Final(digest, ctx); \
+}
+
+/*
+ * Mechanism info structure passed to KCF during registration.
+ */
+static crypto_mech_info_t sha1_mech_info_tab[] = {
+ /* SHA1 */
+ {SUN_CKM_SHA1, SHA1_MECH_INFO_TYPE,
+ CRYPTO_FG_DIGEST | CRYPTO_FG_DIGEST_ATOMIC,
+ 0, 0, CRYPTO_KEYSIZE_UNIT_IN_BITS},
+ /* SHA1-HMAC */
+ {SUN_CKM_SHA1_HMAC, SHA1_HMAC_MECH_INFO_TYPE,
+ CRYPTO_FG_MAC | CRYPTO_FG_MAC_ATOMIC,
+ SHA1_HMAC_MIN_KEY_LEN, SHA1_HMAC_MAX_KEY_LEN,
+ CRYPTO_KEYSIZE_UNIT_IN_BYTES},
+ /* SHA1-HMAC GENERAL */
+ {SUN_CKM_SHA1_HMAC_GENERAL, SHA1_HMAC_GEN_MECH_INFO_TYPE,
+ CRYPTO_FG_MAC | CRYPTO_FG_MAC_ATOMIC,
+ SHA1_HMAC_MIN_KEY_LEN, SHA1_HMAC_MAX_KEY_LEN,
+ CRYPTO_KEYSIZE_UNIT_IN_BYTES}
+};
+
+static void sha1_provider_status(crypto_provider_handle_t, uint_t *);
+
+static crypto_control_ops_t sha1_control_ops = {
+ sha1_provider_status
+};
+
+static int sha1_digest_init(crypto_ctx_t *, crypto_mechanism_t *,
+ crypto_req_handle_t);
+static int sha1_digest(crypto_ctx_t *, crypto_data_t *, crypto_data_t *,
+ crypto_req_handle_t);
+static int sha1_digest_update(crypto_ctx_t *, crypto_data_t *,
+ crypto_req_handle_t);
+static int sha1_digest_final(crypto_ctx_t *, crypto_data_t *,
+ crypto_req_handle_t);
+static int sha1_digest_atomic(crypto_provider_handle_t, crypto_session_id_t,
+ crypto_mechanism_t *, crypto_data_t *, crypto_data_t *,
+ crypto_req_handle_t);
+
+static crypto_digest_ops_t sha1_digest_ops = {
+ .digest_init = sha1_digest_init,
+ .digest = sha1_digest,
+ .digest_update = sha1_digest_update,
+ .digest_key = NULL,
+ .digest_final = sha1_digest_final,
+ .digest_atomic = sha1_digest_atomic
+};
+
+static int sha1_mac_init(crypto_ctx_t *, crypto_mechanism_t *, crypto_key_t *,
+ crypto_spi_ctx_template_t, crypto_req_handle_t);
+static int sha1_mac_update(crypto_ctx_t *, crypto_data_t *,
+ crypto_req_handle_t);
+static int sha1_mac_final(crypto_ctx_t *, crypto_data_t *, crypto_req_handle_t);
+static int sha1_mac_atomic(crypto_provider_handle_t, crypto_session_id_t,
+ crypto_mechanism_t *, crypto_key_t *, crypto_data_t *, crypto_data_t *,
+ crypto_spi_ctx_template_t, crypto_req_handle_t);
+static int sha1_mac_verify_atomic(crypto_provider_handle_t, crypto_session_id_t,
+ crypto_mechanism_t *, crypto_key_t *, crypto_data_t *, crypto_data_t *,
+ crypto_spi_ctx_template_t, crypto_req_handle_t);
+
+static crypto_mac_ops_t sha1_mac_ops = {
+ .mac_init = sha1_mac_init,
+ .mac = NULL,
+ .mac_update = sha1_mac_update,
+ .mac_final = sha1_mac_final,
+ .mac_atomic = sha1_mac_atomic,
+ .mac_verify_atomic = sha1_mac_verify_atomic
+};
+
+static int sha1_create_ctx_template(crypto_provider_handle_t,
+ crypto_mechanism_t *, crypto_key_t *, crypto_spi_ctx_template_t *,
+ size_t *, crypto_req_handle_t);
+static int sha1_free_context(crypto_ctx_t *);
+
+static crypto_ctx_ops_t sha1_ctx_ops = {
+ .create_ctx_template = sha1_create_ctx_template,
+ .free_context = sha1_free_context
+};
+
+static crypto_ops_t sha1_crypto_ops = {{{{{
+ &sha1_control_ops,
+ &sha1_digest_ops,
+ NULL,
+ &sha1_mac_ops,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ &sha1_ctx_ops,
+}}}}};
+
+static crypto_provider_info_t sha1_prov_info = {{{{
+ CRYPTO_SPI_VERSION_1,
+ "SHA1 Software Provider",
+ CRYPTO_SW_PROVIDER,
+ NULL,
+ &sha1_crypto_ops,
+ sizeof (sha1_mech_info_tab)/sizeof (crypto_mech_info_t),
+ sha1_mech_info_tab
+}}}};
+
+static crypto_kcf_provider_handle_t sha1_prov_handle = 0;
+
+int
+sha1_mod_init(void)
+{
+ int ret;
+
+ if ((ret = mod_install(&modlinkage)) != 0)
+ return (ret);
+
+ /*
+ * Register with KCF. If the registration fails, log an
+ * error but do not uninstall the module, since the functionality
+ * provided by misc/sha1 should still be available.
+ */
+ if ((ret = crypto_register_provider(&sha1_prov_info,
+ &sha1_prov_handle)) != CRYPTO_SUCCESS)
+ cmn_err(CE_WARN, "sha1 _init: "
+ "crypto_register_provider() failed (0x%x)", ret);
+
+ return (0);
+}
+
+int
+sha1_mod_fini(void)
+{
+ int ret;
+
+ if (sha1_prov_handle != 0) {
+ if ((ret = crypto_unregister_provider(sha1_prov_handle)) !=
+ CRYPTO_SUCCESS) {
+ cmn_err(CE_WARN,
+ "sha1 _fini: crypto_unregister_provider() "
+ "failed (0x%x)", ret);
+ return (EBUSY);
+ }
+ sha1_prov_handle = 0;
+ }
+
+ return (mod_remove(&modlinkage));
+}
+
+/*
+ * KCF software provider control entry points.
+ */
+/* ARGSUSED */
+static void
+sha1_provider_status(crypto_provider_handle_t provider, uint_t *status)
+{
+ *status = CRYPTO_PROVIDER_READY;
+}
+
+/*
+ * KCF software provider digest entry points.
+ */
+
+static int
+sha1_digest_init(crypto_ctx_t *ctx, crypto_mechanism_t *mechanism,
+ crypto_req_handle_t req)
+{
+ if (mechanism->cm_type != SHA1_MECH_INFO_TYPE)
+ return (CRYPTO_MECHANISM_INVALID);
+
+ /*
+ * Allocate and initialize SHA1 context.
+ */
+ ctx->cc_provider_private = kmem_alloc(sizeof (sha1_ctx_t),
+ crypto_kmflag(req));
+ if (ctx->cc_provider_private == NULL)
+ return (CRYPTO_HOST_MEMORY);
+
+ PROV_SHA1_CTX(ctx)->sc_mech_type = SHA1_MECH_INFO_TYPE;
+ SHA1Init(&PROV_SHA1_CTX(ctx)->sc_sha1_ctx);
+
+ return (CRYPTO_SUCCESS);
+}
+
+/*
+ * Helper SHA1 digest update function for uio data.
+ */
+static int
+sha1_digest_update_uio(SHA1_CTX *sha1_ctx, crypto_data_t *data)
+{
+ off_t offset = data->cd_offset;
+ size_t length = data->cd_length;
+ uint_t vec_idx = 0;
+ size_t cur_len;
+
+ /* only kernel buffers are supported */
+ if (zfs_uio_segflg(data->cd_uio) != UIO_SYSSPACE)
+ return (CRYPTO_ARGUMENTS_BAD);
+
+ /*
+ * Jump to the first iovec containing data to be
+ * digested.
+ */
+ offset = zfs_uio_index_at_offset(data->cd_uio, offset, &vec_idx);
+ if (vec_idx == zfs_uio_iovcnt(data->cd_uio)) {
+ /*
+ * The caller specified an offset that is larger than the
+ * total size of the buffers it provided.
+ */
+ return (CRYPTO_DATA_LEN_RANGE);
+ }
+
+ /*
+ * Now do the digesting on the iovecs.
+ */
+ while (vec_idx < zfs_uio_iovcnt(data->cd_uio) && length > 0) {
+ cur_len = MIN(zfs_uio_iovlen(data->cd_uio, vec_idx) -
+ offset, length);
+
+ SHA1Update(sha1_ctx,
+ (uint8_t *)zfs_uio_iovbase(data->cd_uio, vec_idx) + offset,
+ cur_len);
+
+ length -= cur_len;
+ vec_idx++;
+ offset = 0;
+ }
+
+ if (vec_idx == zfs_uio_iovcnt(data->cd_uio) && length > 0) {
+ /*
+ * The end of the specified iovecs was reached but the
+ * requested length could not be processed; i.e., the caller
+ * requested to digest more data than it provided.
+ */
+ return (CRYPTO_DATA_LEN_RANGE);
+ }
+
+ return (CRYPTO_SUCCESS);
+}
+
+/*
+ * Helper SHA1 digest final function for uio data.
+ * digest_len is the length of the desired digest. If digest_len
+ * is smaller than the default SHA1 digest length, the caller
+ * must pass a scratch buffer, digest_scratch, which must
+ * be at least SHA1_DIGEST_LENGTH bytes.
+ */
+static int
+sha1_digest_final_uio(SHA1_CTX *sha1_ctx, crypto_data_t *digest,
+ ulong_t digest_len, uchar_t *digest_scratch)
+{
+ off_t offset = digest->cd_offset;
+ uint_t vec_idx = 0;
+
+ /* only kernel buffers are supported */
+ if (zfs_uio_segflg(digest->cd_uio) != UIO_SYSSPACE)
+ return (CRYPTO_ARGUMENTS_BAD);
+
+ /*
+ * Jump to the first iovec containing ptr to the digest to
+ * be returned.
+ */
+ offset = zfs_uio_index_at_offset(digest->cd_uio, offset, &vec_idx);
+ if (vec_idx == zfs_uio_iovcnt(digest->cd_uio)) {
+ /*
+ * The caller specified an offset that is
+ * larger than the total size of the buffers
+ * it provided.
+ */
+ return (CRYPTO_DATA_LEN_RANGE);
+ }
+
+ if (offset + digest_len <=
+ zfs_uio_iovlen(digest->cd_uio, vec_idx)) {
+ /*
+ * The computed SHA1 digest will fit in the current
+ * iovec.
+ */
+ if (digest_len != SHA1_DIGEST_LENGTH) {
+ /*
+ * The caller requested a short digest. Digest
+ * into a scratch buffer and return to
+ * the user only what was requested.
+ */
+ SHA1Final(digest_scratch, sha1_ctx);
+ bcopy(digest_scratch, (uchar_t *)
+ zfs_uio_iovbase(digest->cd_uio, vec_idx) + offset,
+ digest_len);
+ } else {
+ SHA1Final((uchar_t *)zfs_uio_iovbase(digest->
+ cd_uio, vec_idx) + offset,
+ sha1_ctx);
+ }
+ } else {
+ /*
+ * The computed digest will cross one or more iovecs.
+ * This is bad for performance, but we need to support it.
+ * Allocate a small scratch buffer on the stack and
+ * copy the digest piecemeal into the specified digest iovecs.
+ */
+ uchar_t digest_tmp[SHA1_DIGEST_LENGTH];
+ off_t scratch_offset = 0;
+ size_t length = digest_len;
+ size_t cur_len;
+
+ SHA1Final(digest_tmp, sha1_ctx);
+
+ while (vec_idx < zfs_uio_iovcnt(digest->cd_uio) && length > 0) {
+ cur_len = MIN(zfs_uio_iovlen(digest->cd_uio, vec_idx) -
+ offset, length);
+ bcopy(digest_tmp + scratch_offset,
+ zfs_uio_iovbase(digest->cd_uio, vec_idx) + offset,
+ cur_len);
+
+ length -= cur_len;
+ vec_idx++;
+ scratch_offset += cur_len;
+ offset = 0;
+ }
+
+ if (vec_idx == zfs_uio_iovcnt(digest->cd_uio) && length > 0) {
+ /*
+ * The end of the specified iovecs was reached but the
+ * requested length could not be processed; i.e., the
+ * caller requested to digest more data than it provided.
+ */
+ return (CRYPTO_DATA_LEN_RANGE);
+ }
+ }
+
+ return (CRYPTO_SUCCESS);
+}
+
+/* ARGSUSED */
+static int
+sha1_digest(crypto_ctx_t *ctx, crypto_data_t *data, crypto_data_t *digest,
+ crypto_req_handle_t req)
+{
+ int ret = CRYPTO_SUCCESS;
+
+ ASSERT(ctx->cc_provider_private != NULL);
+
+ /*
+ * If the output buffer is too small, just return the length needed
+ * to store the output; do not destroy the context in that case.
+ */
+ if ((digest->cd_length == 0) ||
+ (digest->cd_length < SHA1_DIGEST_LENGTH)) {
+ digest->cd_length = SHA1_DIGEST_LENGTH;
+ return (CRYPTO_BUFFER_TOO_SMALL);
+ }
+
+ /*
+ * Do the SHA1 update on the specified input data.
+ */
+ switch (data->cd_format) {
+ case CRYPTO_DATA_RAW:
+ SHA1Update(&PROV_SHA1_CTX(ctx)->sc_sha1_ctx,
+ (uint8_t *)data->cd_raw.iov_base + data->cd_offset,
+ data->cd_length);
+ break;
+ case CRYPTO_DATA_UIO:
+ ret = sha1_digest_update_uio(&PROV_SHA1_CTX(ctx)->sc_sha1_ctx,
+ data);
+ break;
+ default:
+ ret = CRYPTO_ARGUMENTS_BAD;
+ }
+
+ if (ret != CRYPTO_SUCCESS) {
+ /* the update failed, free context and bail */
+ kmem_free(ctx->cc_provider_private, sizeof (sha1_ctx_t));
+ ctx->cc_provider_private = NULL;
+ digest->cd_length = 0;
+ return (ret);
+ }
+
+ /*
+ * Do a SHA1 final; this must be done separately since the digest
+ * type can be different from the input data type.
+ */
+ switch (digest->cd_format) {
+ case CRYPTO_DATA_RAW:
+ SHA1Final((unsigned char *)digest->cd_raw.iov_base +
+ digest->cd_offset, &PROV_SHA1_CTX(ctx)->sc_sha1_ctx);
+ break;
+ case CRYPTO_DATA_UIO:
+ ret = sha1_digest_final_uio(&PROV_SHA1_CTX(ctx)->sc_sha1_ctx,
+ digest, SHA1_DIGEST_LENGTH, NULL);
+ break;
+ default:
+ ret = CRYPTO_ARGUMENTS_BAD;
+ }
+
+ /* all done, free context and return */
+
+ if (ret == CRYPTO_SUCCESS) {
+ digest->cd_length = SHA1_DIGEST_LENGTH;
+ } else {
+ digest->cd_length = 0;
+ }
+
+ kmem_free(ctx->cc_provider_private, sizeof (sha1_ctx_t));
+ ctx->cc_provider_private = NULL;
+ return (ret);
+}
+
+/* ARGSUSED */
+static int
+sha1_digest_update(crypto_ctx_t *ctx, crypto_data_t *data,
+ crypto_req_handle_t req)
+{
+ int ret = CRYPTO_SUCCESS;
+
+ ASSERT(ctx->cc_provider_private != NULL);
+
+ /*
+ * Do the SHA1 update on the specified input data.
+ */
+ switch (data->cd_format) {
+ case CRYPTO_DATA_RAW:
+ SHA1Update(&PROV_SHA1_CTX(ctx)->sc_sha1_ctx,
+ (uint8_t *)data->cd_raw.iov_base + data->cd_offset,
+ data->cd_length);
+ break;
+ case CRYPTO_DATA_UIO:
+ ret = sha1_digest_update_uio(&PROV_SHA1_CTX(ctx)->sc_sha1_ctx,
+ data);
+ break;
+ default:
+ ret = CRYPTO_ARGUMENTS_BAD;
+ }
+
+ return (ret);
+}
+
+/* ARGSUSED */
+static int
+sha1_digest_final(crypto_ctx_t *ctx, crypto_data_t *digest,
+ crypto_req_handle_t req)
+{
+ int ret = CRYPTO_SUCCESS;
+
+ ASSERT(ctx->cc_provider_private != NULL);
+
+ /*
+ * If the output buffer is too small, just return the length needed
+ * to store the output; do not destroy the context in that case.
+ */
+ if ((digest->cd_length == 0) ||
+ (digest->cd_length < SHA1_DIGEST_LENGTH)) {
+ digest->cd_length = SHA1_DIGEST_LENGTH;
+ return (CRYPTO_BUFFER_TOO_SMALL);
+ }
+
+ /*
+ * Do a SHA1 final.
+ */
+ switch (digest->cd_format) {
+ case CRYPTO_DATA_RAW:
+ SHA1Final((unsigned char *)digest->cd_raw.iov_base +
+ digest->cd_offset, &PROV_SHA1_CTX(ctx)->sc_sha1_ctx);
+ break;
+ case CRYPTO_DATA_UIO:
+ ret = sha1_digest_final_uio(&PROV_SHA1_CTX(ctx)->sc_sha1_ctx,
+ digest, SHA1_DIGEST_LENGTH, NULL);
+ break;
+ default:
+ ret = CRYPTO_ARGUMENTS_BAD;
+ }
+
+ /* all done, free context and return */
+
+ if (ret == CRYPTO_SUCCESS) {
+ digest->cd_length = SHA1_DIGEST_LENGTH;
+ } else {
+ digest->cd_length = 0;
+ }
+
+ kmem_free(ctx->cc_provider_private, sizeof (sha1_ctx_t));
+ ctx->cc_provider_private = NULL;
+
+ return (ret);
+}
+
+/* ARGSUSED */
+static int
+sha1_digest_atomic(crypto_provider_handle_t provider,
+ crypto_session_id_t session_id, crypto_mechanism_t *mechanism,
+ crypto_data_t *data, crypto_data_t *digest,
+ crypto_req_handle_t req)
+{
+ int ret = CRYPTO_SUCCESS;
+ SHA1_CTX sha1_ctx;
+
+ if (mechanism->cm_type != SHA1_MECH_INFO_TYPE)
+ return (CRYPTO_MECHANISM_INVALID);
+
+ /*
+ * Do the SHA1 init.
+ */
+ SHA1Init(&sha1_ctx);
+
+ /*
+ * Do the SHA1 update on the specified input data.
+ */
+ switch (data->cd_format) {
+ case CRYPTO_DATA_RAW:
+ SHA1Update(&sha1_ctx,
+ (uint8_t *)data->cd_raw.iov_base + data->cd_offset,
+ data->cd_length);
+ break;
+ case CRYPTO_DATA_UIO:
+ ret = sha1_digest_update_uio(&sha1_ctx, data);
+ break;
+ default:
+ ret = CRYPTO_ARGUMENTS_BAD;
+ }
+
+ if (ret != CRYPTO_SUCCESS) {
+ /* the update failed, bail */
+ digest->cd_length = 0;
+ return (ret);
+ }
+
+ /*
+ * Do a SHA1 final; this must be done separately since the digest
+ * type can be different from the input data type.
+ */
+ switch (digest->cd_format) {
+ case CRYPTO_DATA_RAW:
+ SHA1Final((unsigned char *)digest->cd_raw.iov_base +
+ digest->cd_offset, &sha1_ctx);
+ break;
+ case CRYPTO_DATA_UIO:
+ ret = sha1_digest_final_uio(&sha1_ctx, digest,
+ SHA1_DIGEST_LENGTH, NULL);
+ break;
+ default:
+ ret = CRYPTO_ARGUMENTS_BAD;
+ }
+
+ if (ret == CRYPTO_SUCCESS) {
+ digest->cd_length = SHA1_DIGEST_LENGTH;
+ } else {
+ digest->cd_length = 0;
+ }
+
+ return (ret);
+}
+
+/*
+ * KCF software provider mac entry points.
+ *
+ * SHA1 HMAC is: SHA1(key XOR opad, SHA1(key XOR ipad, text))
+ *
+ * Init:
+ * The initialization routine initializes what we denote
+ * as the inner and outer contexts by doing
+ * - for inner context: SHA1(key XOR ipad)
+ * - for outer context: SHA1(key XOR opad)
+ *
+ * Update:
+ * Each subsequent SHA1 HMAC update will result in an
+ * update of the inner context with the specified data.
+ *
+ * Final:
+ * The SHA1 HMAC final will do a SHA1 final operation on the
+ * inner context, and the resulting digest will be used
+ * as the data for an update on the outer context. Last
+ * but not least, a SHA1 final on the outer context will
+ * be performed to obtain the SHA1 HMAC digest to return
+ * to the user.
+ */
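+
+/*
+ * In formula form (RFC 2104), with K0 the key zero-padded (or first
+ * digested, if longer than 64 bytes) to the 64-byte SHA1 block size:
+ *
+ *	HMAC(K, text) = SHA1((K0 ^ opad) || SHA1((K0 ^ ipad) || text))
+ *
+ * where ipad is the byte 0x36 repeated and opad is the byte 0x5c repeated.
+ */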
+
+/*
+ * Initialize a SHA1-HMAC context.
+ */
+static void
+sha1_mac_init_ctx(sha1_hmac_ctx_t *ctx, void *keyval, uint_t length_in_bytes)
+{
+ uint32_t ipad[SHA1_HMAC_INTS_PER_BLOCK];
+ uint32_t opad[SHA1_HMAC_INTS_PER_BLOCK];
+ uint_t i;
+
+ bzero(ipad, SHA1_HMAC_BLOCK_SIZE);
+ bzero(opad, SHA1_HMAC_BLOCK_SIZE);
+
+ bcopy(keyval, ipad, length_in_bytes);
+ bcopy(keyval, opad, length_in_bytes);
+
+ /* XOR key with ipad (0x36) and opad (0x5c) */
+ for (i = 0; i < SHA1_HMAC_INTS_PER_BLOCK; i++) {
+ ipad[i] ^= 0x36363636;
+ opad[i] ^= 0x5c5c5c5c;
+ }
+
+ /* perform SHA1 on ipad */
+ SHA1Init(&ctx->hc_icontext);
+ SHA1Update(&ctx->hc_icontext, (uint8_t *)ipad, SHA1_HMAC_BLOCK_SIZE);
+
+ /* perform SHA1 on opad */
+ SHA1Init(&ctx->hc_ocontext);
+ SHA1Update(&ctx->hc_ocontext, (uint8_t *)opad, SHA1_HMAC_BLOCK_SIZE);
+}
+
+/*
+ * Initialize a SHA1-HMAC operation, setting up the inner and outer
+ * contexts from the key (or from a precomputed context template).
+ */
+static int
+sha1_mac_init(crypto_ctx_t *ctx, crypto_mechanism_t *mechanism,
+ crypto_key_t *key, crypto_spi_ctx_template_t ctx_template,
+ crypto_req_handle_t req)
+{
+ int ret = CRYPTO_SUCCESS;
+ uint_t keylen_in_bytes = CRYPTO_BITS2BYTES(key->ck_length);
+
+ if (mechanism->cm_type != SHA1_HMAC_MECH_INFO_TYPE &&
+ mechanism->cm_type != SHA1_HMAC_GEN_MECH_INFO_TYPE)
+ return (CRYPTO_MECHANISM_INVALID);
+
+ /* Add support for key by attributes (RFE 4706552) */
+ if (key->ck_format != CRYPTO_KEY_RAW)
+ return (CRYPTO_ARGUMENTS_BAD);
+
+ ctx->cc_provider_private = kmem_alloc(sizeof (sha1_hmac_ctx_t),
+ crypto_kmflag(req));
+ if (ctx->cc_provider_private == NULL)
+ return (CRYPTO_HOST_MEMORY);
+
+ if (ctx_template != NULL) {
+ /* reuse context template */
+ bcopy(ctx_template, PROV_SHA1_HMAC_CTX(ctx),
+ sizeof (sha1_hmac_ctx_t));
+ } else {
+ /* no context template, compute context */
+ if (keylen_in_bytes > SHA1_HMAC_BLOCK_SIZE) {
+ uchar_t digested_key[SHA1_DIGEST_LENGTH];
+ sha1_hmac_ctx_t *hmac_ctx = ctx->cc_provider_private;
+
+ /*
+ * Hash the passed-in key to get a smaller key.
+ * The inner context is used since it hasn't been
+ * initialized yet.
+ */
+ PROV_SHA1_DIGEST_KEY(&hmac_ctx->hc_icontext,
+ key->ck_data, keylen_in_bytes, digested_key);
+ sha1_mac_init_ctx(PROV_SHA1_HMAC_CTX(ctx),
+ digested_key, SHA1_DIGEST_LENGTH);
+ } else {
+ sha1_mac_init_ctx(PROV_SHA1_HMAC_CTX(ctx),
+ key->ck_data, keylen_in_bytes);
+ }
+ }
+
+ /*
+ * Get the mechanism parameters, if applicable.
+ */
+ PROV_SHA1_HMAC_CTX(ctx)->hc_mech_type = mechanism->cm_type;
+ if (mechanism->cm_type == SHA1_HMAC_GEN_MECH_INFO_TYPE) {
+ if (mechanism->cm_param == NULL ||
+ mechanism->cm_param_len != sizeof (ulong_t))
+ ret = CRYPTO_MECHANISM_PARAM_INVALID;
+ PROV_SHA1_GET_DIGEST_LEN(mechanism,
+ PROV_SHA1_HMAC_CTX(ctx)->hc_digest_len);
+ if (PROV_SHA1_HMAC_CTX(ctx)->hc_digest_len >
+ SHA1_DIGEST_LENGTH)
+ ret = CRYPTO_MECHANISM_PARAM_INVALID;
+ }
+
+ if (ret != CRYPTO_SUCCESS) {
+ bzero(ctx->cc_provider_private, sizeof (sha1_hmac_ctx_t));
+ kmem_free(ctx->cc_provider_private, sizeof (sha1_hmac_ctx_t));
+ ctx->cc_provider_private = NULL;
+ }
+
+ return (ret);
+}
+
+/* ARGSUSED */
+static int
+sha1_mac_update(crypto_ctx_t *ctx, crypto_data_t *data, crypto_req_handle_t req)
+{
+ int ret = CRYPTO_SUCCESS;
+
+ ASSERT(ctx->cc_provider_private != NULL);
+
+ /*
+ * Do a SHA1 update of the inner context using the specified
+ * data.
+ */
+ switch (data->cd_format) {
+ case CRYPTO_DATA_RAW:
+ SHA1Update(&PROV_SHA1_HMAC_CTX(ctx)->hc_icontext,
+ (uint8_t *)data->cd_raw.iov_base + data->cd_offset,
+ data->cd_length);
+ break;
+ case CRYPTO_DATA_UIO:
+ ret = sha1_digest_update_uio(
+ &PROV_SHA1_HMAC_CTX(ctx)->hc_icontext, data);
+ break;
+ default:
+ ret = CRYPTO_ARGUMENTS_BAD;
+ }
+
+ return (ret);
+}
+
+/* ARGSUSED */
+static int
+sha1_mac_final(crypto_ctx_t *ctx, crypto_data_t *mac, crypto_req_handle_t req)
+{
+ int ret = CRYPTO_SUCCESS;
+ uchar_t digest[SHA1_DIGEST_LENGTH];
+ uint32_t digest_len = SHA1_DIGEST_LENGTH;
+
+ ASSERT(ctx->cc_provider_private != NULL);
+
+ if (PROV_SHA1_HMAC_CTX(ctx)->hc_mech_type ==
+ SHA1_HMAC_GEN_MECH_INFO_TYPE)
+ digest_len = PROV_SHA1_HMAC_CTX(ctx)->hc_digest_len;
+
+ /*
+ * If the output buffer is too small (or its length is zero, i.e.
+ * the caller is only asking for the required size), just return
+ * the length needed and do not destroy the context.
+ */
+ if ((mac->cd_length == 0) || (mac->cd_length < digest_len)) {
+ mac->cd_length = digest_len;
+ return (CRYPTO_BUFFER_TOO_SMALL);
+ }
+
+ /*
+ * Do a SHA1 final on the inner context.
+ */
+ SHA1Final(digest, &PROV_SHA1_HMAC_CTX(ctx)->hc_icontext);
+
+ /*
+ * Do a SHA1 update on the outer context, feeding the inner
+ * digest as data.
+ */
+ SHA1Update(&PROV_SHA1_HMAC_CTX(ctx)->hc_ocontext, digest,
+ SHA1_DIGEST_LENGTH);
+
+ /*
+ * Do a SHA1 final on the outer context, storing the computed
+ * digest in the user's buffer.
+ */
+ switch (mac->cd_format) {
+ case CRYPTO_DATA_RAW:
+ if (digest_len != SHA1_DIGEST_LENGTH) {
+ /*
+ * The caller requested a short digest. Digest
+ * into a scratch buffer and return to
+ * the user only what was requested.
+ */
+ SHA1Final(digest,
+ &PROV_SHA1_HMAC_CTX(ctx)->hc_ocontext);
+ bcopy(digest, (unsigned char *)mac->cd_raw.iov_base +
+ mac->cd_offset, digest_len);
+ } else {
+ SHA1Final((unsigned char *)mac->cd_raw.iov_base +
+ mac->cd_offset,
+ &PROV_SHA1_HMAC_CTX(ctx)->hc_ocontext);
+ }
+ break;
+ case CRYPTO_DATA_UIO:
+ ret = sha1_digest_final_uio(
+ &PROV_SHA1_HMAC_CTX(ctx)->hc_ocontext, mac,
+ digest_len, digest);
+ break;
+ default:
+ ret = CRYPTO_ARGUMENTS_BAD;
+ }
+
+ if (ret == CRYPTO_SUCCESS) {
+ mac->cd_length = digest_len;
+ } else {
+ mac->cd_length = 0;
+ }
+
+ bzero(ctx->cc_provider_private, sizeof (sha1_hmac_ctx_t));
+ kmem_free(ctx->cc_provider_private, sizeof (sha1_hmac_ctx_t));
+ ctx->cc_provider_private = NULL;
+
+ return (ret);
+}
+
+#define SHA1_MAC_UPDATE(data, ctx, ret) { \
+ switch (data->cd_format) { \
+ case CRYPTO_DATA_RAW: \
+ SHA1Update(&(ctx).hc_icontext, \
+ (uint8_t *)data->cd_raw.iov_base + \
+ data->cd_offset, data->cd_length); \
+ break; \
+ case CRYPTO_DATA_UIO: \
+ ret = sha1_digest_update_uio(&(ctx).hc_icontext, data); \
+ break; \
+ default: \
+ ret = CRYPTO_ARGUMENTS_BAD; \
+ } \
+}
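+
+/*
+ * SHA1_MAC_UPDATE mirrors sha1_mac_update() above, but operates on a
+ * caller-supplied sha1_hmac_ctx_t (on the stack in the atomic entry
+ * points below) rather than on a kmem-allocated provider context.
+ */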
+
+/* ARGSUSED */
+static int
+sha1_mac_atomic(crypto_provider_handle_t provider,
+ crypto_session_id_t session_id, crypto_mechanism_t *mechanism,
+ crypto_key_t *key, crypto_data_t *data, crypto_data_t *mac,
+ crypto_spi_ctx_template_t ctx_template, crypto_req_handle_t req)
+{
+ int ret = CRYPTO_SUCCESS;
+ uchar_t digest[SHA1_DIGEST_LENGTH];
+ sha1_hmac_ctx_t sha1_hmac_ctx;
+ uint32_t digest_len = SHA1_DIGEST_LENGTH;
+ uint_t keylen_in_bytes = CRYPTO_BITS2BYTES(key->ck_length);
+
+ if (mechanism->cm_type != SHA1_HMAC_MECH_INFO_TYPE &&
+ mechanism->cm_type != SHA1_HMAC_GEN_MECH_INFO_TYPE)
+ return (CRYPTO_MECHANISM_INVALID);
+
+ /* Add support for key by attributes (RFE 4706552) */
+ if (key->ck_format != CRYPTO_KEY_RAW)
+ return (CRYPTO_ARGUMENTS_BAD);
+
+ if (ctx_template != NULL) {
+ /* reuse context template */
+ bcopy(ctx_template, &sha1_hmac_ctx, sizeof (sha1_hmac_ctx_t));
+ } else {
+ /* no context template, initialize context */
+ if (keylen_in_bytes > SHA1_HMAC_BLOCK_SIZE) {
+ /*
+ * Hash the passed-in key to get a smaller key.
+ * The inner context is used since it hasn't been
+ * initialized yet.
+ */
+ PROV_SHA1_DIGEST_KEY(&sha1_hmac_ctx.hc_icontext,
+ key->ck_data, keylen_in_bytes, digest);
+ sha1_mac_init_ctx(&sha1_hmac_ctx, digest,
+ SHA1_DIGEST_LENGTH);
+ } else {
+ sha1_mac_init_ctx(&sha1_hmac_ctx, key->ck_data,
+ keylen_in_bytes);
+ }
+ }
+
+ /* get the mechanism parameters, if applicable */
+ if (mechanism->cm_type == SHA1_HMAC_GEN_MECH_INFO_TYPE) {
+ if (mechanism->cm_param == NULL ||
+ mechanism->cm_param_len != sizeof (ulong_t)) {
+ ret = CRYPTO_MECHANISM_PARAM_INVALID;
+ goto bail;
+ }
+ PROV_SHA1_GET_DIGEST_LEN(mechanism, digest_len);
+ if (digest_len > SHA1_DIGEST_LENGTH) {
+ ret = CRYPTO_MECHANISM_PARAM_INVALID;
+ goto bail;
+ }
+ }
+
+ /* do a SHA1 update of the inner context using the specified data */
+ SHA1_MAC_UPDATE(data, sha1_hmac_ctx, ret);
+ if (ret != CRYPTO_SUCCESS)
+ /* the update failed, zero the context and bail */
+ goto bail;
+
+ /*
+ * Do a SHA1 final on the inner context.
+ */
+ SHA1Final(digest, &sha1_hmac_ctx.hc_icontext);
+
+ /*
+ * Do a SHA1 update on the outer context, feeding the inner
+ * digest as data.
+ */
+ SHA1Update(&sha1_hmac_ctx.hc_ocontext, digest, SHA1_DIGEST_LENGTH);
+
+ /*
+ * Do a SHA1 final on the outer context, storing the computed
+ * digest in the user's buffer.
+ */
+ switch (mac->cd_format) {
+ case CRYPTO_DATA_RAW:
+ if (digest_len != SHA1_DIGEST_LENGTH) {
+ /*
+ * The caller requested a short digest. Digest
+ * into a scratch buffer and return to
+ * the user only what was requested.
+ */
+ SHA1Final(digest, &sha1_hmac_ctx.hc_ocontext);
+ bcopy(digest, (unsigned char *)mac->cd_raw.iov_base +
+ mac->cd_offset, digest_len);
+ } else {
+ SHA1Final((unsigned char *)mac->cd_raw.iov_base +
+ mac->cd_offset, &sha1_hmac_ctx.hc_ocontext);
+ }
+ break;
+ case CRYPTO_DATA_UIO:
+ ret = sha1_digest_final_uio(&sha1_hmac_ctx.hc_ocontext, mac,
+ digest_len, digest);
+ break;
+ default:
+ ret = CRYPTO_ARGUMENTS_BAD;
+ }
+
+ if (ret == CRYPTO_SUCCESS) {
+ mac->cd_length = digest_len;
+ } else {
+ mac->cd_length = 0;
+ }
+ /* Extra paranoia: zeroize the context on the stack */
+ bzero(&sha1_hmac_ctx, sizeof (sha1_hmac_ctx_t));
+
+ return (ret);
+bail:
+ bzero(&sha1_hmac_ctx, sizeof (sha1_hmac_ctx_t));
+ mac->cd_length = 0;
+ return (ret);
+}
+
+/* ARGSUSED */
+static int
+sha1_mac_verify_atomic(crypto_provider_handle_t provider,
+ crypto_session_id_t session_id, crypto_mechanism_t *mechanism,
+ crypto_key_t *key, crypto_data_t *data, crypto_data_t *mac,
+ crypto_spi_ctx_template_t ctx_template, crypto_req_handle_t req)
+{
+ int ret = CRYPTO_SUCCESS;
+ uchar_t digest[SHA1_DIGEST_LENGTH];
+ sha1_hmac_ctx_t sha1_hmac_ctx;
+ uint32_t digest_len = SHA1_DIGEST_LENGTH;
+ uint_t keylen_in_bytes = CRYPTO_BITS2BYTES(key->ck_length);
+
+ if (mechanism->cm_type != SHA1_HMAC_MECH_INFO_TYPE &&
+ mechanism->cm_type != SHA1_HMAC_GEN_MECH_INFO_TYPE)
+ return (CRYPTO_MECHANISM_INVALID);
+
+ /* Add support for key by attributes (RFE 4706552) */
+ if (key->ck_format != CRYPTO_KEY_RAW)
+ return (CRYPTO_ARGUMENTS_BAD);
+
+ if (ctx_template != NULL) {
+ /* reuse context template */
+ bcopy(ctx_template, &sha1_hmac_ctx, sizeof (sha1_hmac_ctx_t));
+ } else {
+ /* no context template, initialize context */
+ if (keylen_in_bytes > SHA1_HMAC_BLOCK_SIZE) {
+ /*
+ * Hash the passed-in key to get a smaller key.
+ * The inner context is used since it hasn't been
+ * initialized yet.
+ */
+ PROV_SHA1_DIGEST_KEY(&sha1_hmac_ctx.hc_icontext,
+ key->ck_data, keylen_in_bytes, digest);
+ sha1_mac_init_ctx(&sha1_hmac_ctx, digest,
+ SHA1_DIGEST_LENGTH);
+ } else {
+ sha1_mac_init_ctx(&sha1_hmac_ctx, key->ck_data,
+ keylen_in_bytes);
+ }
+ }
+
+ /* get the mechanism parameters, if applicable */
+ if (mechanism->cm_type == SHA1_HMAC_GEN_MECH_INFO_TYPE) {
+ if (mechanism->cm_param == NULL ||
+ mechanism->cm_param_len != sizeof (ulong_t)) {
+ ret = CRYPTO_MECHANISM_PARAM_INVALID;
+ goto bail;
+ }
+ PROV_SHA1_GET_DIGEST_LEN(mechanism, digest_len);
+ if (digest_len > SHA1_DIGEST_LENGTH) {
+ ret = CRYPTO_MECHANISM_PARAM_INVALID;
+ goto bail;
+ }
+ }
+
+ if (mac->cd_length != digest_len) {
+ ret = CRYPTO_INVALID_MAC;
+ goto bail;
+ }
+
+ /* do a SHA1 update of the inner context using the specified data */
+ SHA1_MAC_UPDATE(data, sha1_hmac_ctx, ret);
+ if (ret != CRYPTO_SUCCESS)
+ /* the update failed, zero the context and bail */
+ goto bail;
+
+ /* do a SHA1 final on the inner context */
+ SHA1Final(digest, &sha1_hmac_ctx.hc_icontext);
+
+ /*
+ * Do a SHA1 update on the outer context, feeding the inner
+ * digest as data.
+ */
+ SHA1Update(&sha1_hmac_ctx.hc_ocontext, digest, SHA1_DIGEST_LENGTH);
+
+ /*
+ * Do a SHA1 final on the outer context, storing the computed
+ * digest in the user's buffer.
+ */
+ SHA1Final(digest, &sha1_hmac_ctx.hc_ocontext);
+
+ /*
+ * Compare the computed digest against the expected digest passed
+ * as argument.
+ */
+
+ switch (mac->cd_format) {
+
+ case CRYPTO_DATA_RAW:
+ if (bcmp(digest, (unsigned char *)mac->cd_raw.iov_base +
+ mac->cd_offset, digest_len) != 0)
+ ret = CRYPTO_INVALID_MAC;
+ break;
+
+ case CRYPTO_DATA_UIO: {
+ off_t offset = mac->cd_offset;
+ uint_t vec_idx = 0;
+ off_t scratch_offset = 0;
+ size_t length = digest_len;
+ size_t cur_len;
+
+ /* we support only kernel buffer */
+ if (zfs_uio_segflg(mac->cd_uio) != UIO_SYSSPACE)
+ return (CRYPTO_ARGUMENTS_BAD);
+
+ /* jump to the first iovec containing the expected digest */
+ offset = zfs_uio_index_at_offset(mac->cd_uio, offset, &vec_idx);
+ if (vec_idx == zfs_uio_iovcnt(mac->cd_uio)) {
+ /*
+ * The caller specified an offset that is
+ * larger than the total size of the buffers
+ * it provided.
+ */
+ ret = CRYPTO_DATA_LEN_RANGE;
+ break;
+ }
+
+ /* do the comparison of computed digest vs specified one */
+ while (vec_idx < zfs_uio_iovcnt(mac->cd_uio) && length > 0) {
+ cur_len = MIN(zfs_uio_iovlen(mac->cd_uio, vec_idx) -
+ offset, length);
+
+ if (bcmp(digest + scratch_offset,
+ zfs_uio_iovbase(mac->cd_uio, vec_idx) + offset,
+ cur_len) != 0) {
+ ret = CRYPTO_INVALID_MAC;
+ break;
+ }
+
+ length -= cur_len;
+ vec_idx++;
+ scratch_offset += cur_len;
+ offset = 0;
+ }
+ break;
+ }
+
+ default:
+ ret = CRYPTO_ARGUMENTS_BAD;
+ }
+
+ bzero(&sha1_hmac_ctx, sizeof (sha1_hmac_ctx_t));
+ return (ret);
+bail:
+ bzero(&sha1_hmac_ctx, sizeof (sha1_hmac_ctx_t));
+ mac->cd_length = 0;
+ return (ret);
+}
+
+/*
+ * KCF software provider context management entry points.
+ */
+
+/* ARGSUSED */
+static int
+sha1_create_ctx_template(crypto_provider_handle_t provider,
+ crypto_mechanism_t *mechanism, crypto_key_t *key,
+ crypto_spi_ctx_template_t *ctx_template, size_t *ctx_template_size,
+ crypto_req_handle_t req)
+{
+ sha1_hmac_ctx_t *sha1_hmac_ctx_tmpl;
+ uint_t keylen_in_bytes = CRYPTO_BITS2BYTES(key->ck_length);
+
+ if ((mechanism->cm_type != SHA1_HMAC_MECH_INFO_TYPE) &&
+ (mechanism->cm_type != SHA1_HMAC_GEN_MECH_INFO_TYPE)) {
+ return (CRYPTO_MECHANISM_INVALID);
+ }
+
+ /* Add support for key by attributes (RFE 4706552) */
+ if (key->ck_format != CRYPTO_KEY_RAW)
+ return (CRYPTO_ARGUMENTS_BAD);
+
+ /*
+ * Allocate and initialize SHA1 context.
+ */
+ sha1_hmac_ctx_tmpl = kmem_alloc(sizeof (sha1_hmac_ctx_t),
+ crypto_kmflag(req));
+ if (sha1_hmac_ctx_tmpl == NULL)
+ return (CRYPTO_HOST_MEMORY);
+
+ if (keylen_in_bytes > SHA1_HMAC_BLOCK_SIZE) {
+ uchar_t digested_key[SHA1_DIGEST_LENGTH];
+
+ /*
+ * Hash the passed-in key to get a smaller key.
+ * The inner context is used since it hasn't been
+ * initialized yet.
+ */
+ PROV_SHA1_DIGEST_KEY(&sha1_hmac_ctx_tmpl->hc_icontext,
+ key->ck_data, keylen_in_bytes, digested_key);
+ sha1_mac_init_ctx(sha1_hmac_ctx_tmpl, digested_key,
+ SHA1_DIGEST_LENGTH);
+ } else {
+ sha1_mac_init_ctx(sha1_hmac_ctx_tmpl, key->ck_data,
+ keylen_in_bytes);
+ }
+
+ sha1_hmac_ctx_tmpl->hc_mech_type = mechanism->cm_type;
+ *ctx_template = (crypto_spi_ctx_template_t)sha1_hmac_ctx_tmpl;
+ *ctx_template_size = sizeof (sha1_hmac_ctx_t);
+
+ return (CRYPTO_SUCCESS);
+}
+
+static int
+sha1_free_context(crypto_ctx_t *ctx)
+{
+ uint_t ctx_len;
+ sha1_mech_type_t mech_type;
+
+ if (ctx->cc_provider_private == NULL)
+ return (CRYPTO_SUCCESS);
+
+ /*
+ * We have to free either SHA1 or SHA1-HMAC contexts, which
+ * have different lengths.
+ */
+
+ mech_type = PROV_SHA1_CTX(ctx)->sc_mech_type;
+ if (mech_type == SHA1_MECH_INFO_TYPE)
+ ctx_len = sizeof (sha1_ctx_t);
+ else {
+ ASSERT(mech_type == SHA1_HMAC_MECH_INFO_TYPE ||
+ mech_type == SHA1_HMAC_GEN_MECH_INFO_TYPE);
+ ctx_len = sizeof (sha1_hmac_ctx_t);
+ }
+
+ bzero(ctx->cc_provider_private, ctx_len);
+ kmem_free(ctx->cc_provider_private, ctx_len);
+ ctx->cc_provider_private = NULL;
+
+ return (CRYPTO_SUCCESS);
+}
diff --git a/sys/contrib/openzfs/module/icp/io/sha2_mod.c b/sys/contrib/openzfs/module/icp/io/sha2_mod.c
new file mode 100644
index 000000000000..d690cd0bcb05
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/io/sha2_mod.c
@@ -0,0 +1,1399 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/modctl.h>
+#include <sys/crypto/common.h>
+#include <sys/crypto/spi.h>
+#include <sys/crypto/icp.h>
+#define _SHA2_IMPL
+#include <sys/sha2.h>
+#include <sha2/sha2_impl.h>
+
+/*
+ * The sha2 module is created with two modlinkages:
+ * - a modlmisc that allows consumers to directly call the entry points
+ * SHA2Init, SHA2Update, and SHA2Final.
+ * - a modlcrypto that allows the module to register with the Kernel
+ * Cryptographic Framework (KCF) as a software provider for the SHA2
+ * mechanisms.
+ */
+
+static struct modlcrypto modlcrypto = {
+ &mod_cryptoops,
+ "SHA2 Kernel SW Provider"
+};
+
+static struct modlinkage modlinkage = {
+ MODREV_1, {&modlcrypto, NULL}
+};
+
+/*
+ * Macros to access the SHA2 or SHA2-HMAC contexts from a context passed
+ * by KCF to one of the entry points.
+ */
+
+#define PROV_SHA2_CTX(ctx) ((sha2_ctx_t *)(ctx)->cc_provider_private)
+#define PROV_SHA2_HMAC_CTX(ctx) ((sha2_hmac_ctx_t *)(ctx)->cc_provider_private)
+
+/* to extract the digest length passed as mechanism parameter */
+#define PROV_SHA2_GET_DIGEST_LEN(m, len) { \
+ if (IS_P2ALIGNED((m)->cm_param, sizeof (ulong_t))) \
+ (len) = (uint32_t)*((ulong_t *)(m)->cm_param); \
+ else { \
+ ulong_t tmp_ulong; \
+ bcopy((m)->cm_param, &tmp_ulong, sizeof (ulong_t)); \
+ (len) = (uint32_t)tmp_ulong; \
+ } \
+}
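+
+/*
+ * The alignment check above matters on platforms with strict alignment
+ * requirements: cm_param is a caller-supplied byte buffer, so it is only
+ * dereferenced directly as a ulong_t when it happens to be suitably
+ * aligned; otherwise the value is extracted with bcopy().
+ */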
+
+#define PROV_SHA2_DIGEST_KEY(mech, ctx, key, len, digest) { \
+ SHA2Init(mech, ctx); \
+ SHA2Update(ctx, key, len); \
+ SHA2Final(digest, ctx); \
+}
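+
+/*
+ * PROV_SHA2_DIGEST_KEY condenses an HMAC key that is longer than the
+ * algorithm's block size down to a single digest, as prescribed by the
+ * HMAC specification (RFC 2104), before the ipad/opad expansion is done.
+ */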
+
+/*
+ * Mechanism info structure passed to KCF during registration.
+ */
+static crypto_mech_info_t sha2_mech_info_tab[] = {
+ /* SHA256 */
+ {SUN_CKM_SHA256, SHA256_MECH_INFO_TYPE,
+ CRYPTO_FG_DIGEST | CRYPTO_FG_DIGEST_ATOMIC,
+ 0, 0, CRYPTO_KEYSIZE_UNIT_IN_BITS},
+ /* SHA256-HMAC */
+ {SUN_CKM_SHA256_HMAC, SHA256_HMAC_MECH_INFO_TYPE,
+ CRYPTO_FG_MAC | CRYPTO_FG_MAC_ATOMIC,
+ SHA2_HMAC_MIN_KEY_LEN, SHA2_HMAC_MAX_KEY_LEN,
+ CRYPTO_KEYSIZE_UNIT_IN_BYTES},
+ /* SHA256-HMAC GENERAL */
+ {SUN_CKM_SHA256_HMAC_GENERAL, SHA256_HMAC_GEN_MECH_INFO_TYPE,
+ CRYPTO_FG_MAC | CRYPTO_FG_MAC_ATOMIC,
+ SHA2_HMAC_MIN_KEY_LEN, SHA2_HMAC_MAX_KEY_LEN,
+ CRYPTO_KEYSIZE_UNIT_IN_BYTES},
+ /* SHA384 */
+ {SUN_CKM_SHA384, SHA384_MECH_INFO_TYPE,
+ CRYPTO_FG_DIGEST | CRYPTO_FG_DIGEST_ATOMIC,
+ 0, 0, CRYPTO_KEYSIZE_UNIT_IN_BITS},
+ /* SHA384-HMAC */
+ {SUN_CKM_SHA384_HMAC, SHA384_HMAC_MECH_INFO_TYPE,
+ CRYPTO_FG_MAC | CRYPTO_FG_MAC_ATOMIC,
+ SHA2_HMAC_MIN_KEY_LEN, SHA2_HMAC_MAX_KEY_LEN,
+ CRYPTO_KEYSIZE_UNIT_IN_BYTES},
+ /* SHA384-HMAC GENERAL */
+ {SUN_CKM_SHA384_HMAC_GENERAL, SHA384_HMAC_GEN_MECH_INFO_TYPE,
+ CRYPTO_FG_MAC | CRYPTO_FG_MAC_ATOMIC,
+ SHA2_HMAC_MIN_KEY_LEN, SHA2_HMAC_MAX_KEY_LEN,
+ CRYPTO_KEYSIZE_UNIT_IN_BYTES},
+ /* SHA512 */
+ {SUN_CKM_SHA512, SHA512_MECH_INFO_TYPE,
+ CRYPTO_FG_DIGEST | CRYPTO_FG_DIGEST_ATOMIC,
+ 0, 0, CRYPTO_KEYSIZE_UNIT_IN_BITS},
+ /* SHA512-HMAC */
+ {SUN_CKM_SHA512_HMAC, SHA512_HMAC_MECH_INFO_TYPE,
+ CRYPTO_FG_MAC | CRYPTO_FG_MAC_ATOMIC,
+ SHA2_HMAC_MIN_KEY_LEN, SHA2_HMAC_MAX_KEY_LEN,
+ CRYPTO_KEYSIZE_UNIT_IN_BYTES},
+ /* SHA512-HMAC GENERAL */
+ {SUN_CKM_SHA512_HMAC_GENERAL, SHA512_HMAC_GEN_MECH_INFO_TYPE,
+ CRYPTO_FG_MAC | CRYPTO_FG_MAC_ATOMIC,
+ SHA2_HMAC_MIN_KEY_LEN, SHA2_HMAC_MAX_KEY_LEN,
+ CRYPTO_KEYSIZE_UNIT_IN_BYTES}
+};
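+
+/*
+ * Note that the table above follows the sha2_mech_type_t ordering (plain
+ * digest, HMAC, then HMAC_GENERAL for each of SHA256, SHA384 and SHA512).
+ * Some of the entry points below rely on that ordering; for example,
+ * cm_type % 3 == 2 identifies the *_HMAC_GENERAL mechanisms and
+ * cm_type % 3 == 0 the plain digest mechanisms.
+ */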
+
+static void sha2_provider_status(crypto_provider_handle_t, uint_t *);
+
+static crypto_control_ops_t sha2_control_ops = {
+ sha2_provider_status
+};
+
+static int sha2_digest_init(crypto_ctx_t *, crypto_mechanism_t *,
+ crypto_req_handle_t);
+static int sha2_digest(crypto_ctx_t *, crypto_data_t *, crypto_data_t *,
+ crypto_req_handle_t);
+static int sha2_digest_update(crypto_ctx_t *, crypto_data_t *,
+ crypto_req_handle_t);
+static int sha2_digest_final(crypto_ctx_t *, crypto_data_t *,
+ crypto_req_handle_t);
+static int sha2_digest_atomic(crypto_provider_handle_t, crypto_session_id_t,
+ crypto_mechanism_t *, crypto_data_t *, crypto_data_t *,
+ crypto_req_handle_t);
+
+static crypto_digest_ops_t sha2_digest_ops = {
+ .digest_init = sha2_digest_init,
+ .digest = sha2_digest,
+ .digest_update = sha2_digest_update,
+ .digest_key = NULL,
+ .digest_final = sha2_digest_final,
+ .digest_atomic = sha2_digest_atomic
+};
+
+static int sha2_mac_init(crypto_ctx_t *, crypto_mechanism_t *, crypto_key_t *,
+ crypto_spi_ctx_template_t, crypto_req_handle_t);
+static int sha2_mac_update(crypto_ctx_t *, crypto_data_t *,
+ crypto_req_handle_t);
+static int sha2_mac_final(crypto_ctx_t *, crypto_data_t *, crypto_req_handle_t);
+static int sha2_mac_atomic(crypto_provider_handle_t, crypto_session_id_t,
+ crypto_mechanism_t *, crypto_key_t *, crypto_data_t *, crypto_data_t *,
+ crypto_spi_ctx_template_t, crypto_req_handle_t);
+static int sha2_mac_verify_atomic(crypto_provider_handle_t, crypto_session_id_t,
+ crypto_mechanism_t *, crypto_key_t *, crypto_data_t *, crypto_data_t *,
+ crypto_spi_ctx_template_t, crypto_req_handle_t);
+
+static crypto_mac_ops_t sha2_mac_ops = {
+ .mac_init = sha2_mac_init,
+ .mac = NULL,
+ .mac_update = sha2_mac_update,
+ .mac_final = sha2_mac_final,
+ .mac_atomic = sha2_mac_atomic,
+ .mac_verify_atomic = sha2_mac_verify_atomic
+};
+
+static int sha2_create_ctx_template(crypto_provider_handle_t,
+ crypto_mechanism_t *, crypto_key_t *, crypto_spi_ctx_template_t *,
+ size_t *, crypto_req_handle_t);
+static int sha2_free_context(crypto_ctx_t *);
+
+static crypto_ctx_ops_t sha2_ctx_ops = {
+ .create_ctx_template = sha2_create_ctx_template,
+ .free_context = sha2_free_context
+};
+
+static crypto_ops_t sha2_crypto_ops = {{{{{
+ &sha2_control_ops,
+ &sha2_digest_ops,
+ NULL,
+ &sha2_mac_ops,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ &sha2_ctx_ops
+}}}}};
+
+static crypto_provider_info_t sha2_prov_info = {{{{
+ CRYPTO_SPI_VERSION_1,
+ "SHA2 Software Provider",
+ CRYPTO_SW_PROVIDER,
+ NULL,
+ &sha2_crypto_ops,
+ sizeof (sha2_mech_info_tab)/sizeof (crypto_mech_info_t),
+ sha2_mech_info_tab
+}}}};
+
+static crypto_kcf_provider_handle_t sha2_prov_handle = 0;
+
+int
+sha2_mod_init(void)
+{
+ int ret;
+
+ if ((ret = mod_install(&modlinkage)) != 0)
+ return (ret);
+
+ /*
+ * Register with KCF. If the registration fails, log an
+ * error but do not uninstall the module, since the functionality
+ * provided by misc/sha2 should still be available.
+ */
+ if ((ret = crypto_register_provider(&sha2_prov_info,
+ &sha2_prov_handle)) != CRYPTO_SUCCESS)
+ cmn_err(CE_WARN, "sha2 _init: "
+ "crypto_register_provider() failed (0x%x)", ret);
+
+ return (0);
+}
+
+int
+sha2_mod_fini(void)
+{
+ int ret;
+
+ if (sha2_prov_handle != 0) {
+ if ((ret = crypto_unregister_provider(sha2_prov_handle)) !=
+ CRYPTO_SUCCESS) {
+ cmn_err(CE_WARN,
+ "sha2 _fini: crypto_unregister_provider() "
+ "failed (0x%x)", ret);
+ return (EBUSY);
+ }
+ sha2_prov_handle = 0;
+ }
+
+ return (mod_remove(&modlinkage));
+}
+
+/*
+ * KCF software provider control entry points.
+ */
+/* ARGSUSED */
+static void
+sha2_provider_status(crypto_provider_handle_t provider, uint_t *status)
+{
+ *status = CRYPTO_PROVIDER_READY;
+}
+
+/*
+ * KCF software provider digest entry points.
+ */
+
+static int
+sha2_digest_init(crypto_ctx_t *ctx, crypto_mechanism_t *mechanism,
+ crypto_req_handle_t req)
+{
+
+ /*
+ * Allocate and initialize SHA2 context.
+ */
+ ctx->cc_provider_private = kmem_alloc(sizeof (sha2_ctx_t),
+ crypto_kmflag(req));
+ if (ctx->cc_provider_private == NULL)
+ return (CRYPTO_HOST_MEMORY);
+
+ PROV_SHA2_CTX(ctx)->sc_mech_type = mechanism->cm_type;
+ SHA2Init(mechanism->cm_type, &PROV_SHA2_CTX(ctx)->sc_sha2_ctx);
+
+ return (CRYPTO_SUCCESS);
+}
+
+/*
+ * Helper SHA2 digest update function for uio data.
+ */
+static int
+sha2_digest_update_uio(SHA2_CTX *sha2_ctx, crypto_data_t *data)
+{
+ off_t offset = data->cd_offset;
+ size_t length = data->cd_length;
+ uint_t vec_idx = 0;
+ size_t cur_len;
+
+ /* we support only kernel buffer */
+ if (zfs_uio_segflg(data->cd_uio) != UIO_SYSSPACE)
+ return (CRYPTO_ARGUMENTS_BAD);
+
+ /*
+ * Jump to the first iovec containing data to be
+ * digested.
+ */
+ offset = zfs_uio_index_at_offset(data->cd_uio, offset, &vec_idx);
+ if (vec_idx == zfs_uio_iovcnt(data->cd_uio)) {
+ /*
+ * The caller specified an offset that is larger than the
+ * total size of the buffers it provided.
+ */
+ return (CRYPTO_DATA_LEN_RANGE);
+ }
+
+ /*
+ * Now do the digesting on the iovecs.
+ */
+ while (vec_idx < zfs_uio_iovcnt(data->cd_uio) && length > 0) {
+ cur_len = MIN(zfs_uio_iovlen(data->cd_uio, vec_idx) -
+ offset, length);
+
+ SHA2Update(sha2_ctx, (uint8_t *)zfs_uio_iovbase(data->cd_uio,
+ vec_idx) + offset, cur_len);
+ length -= cur_len;
+ vec_idx++;
+ offset = 0;
+ }
+
+ if (vec_idx == zfs_uio_iovcnt(data->cd_uio) && length > 0) {
+ /*
+ * The end of the specified iovecs was reached but the
+ * requested length could not be processed; i.e. the caller
+ * requested to digest more data than it provided.
+ */
+ return (CRYPTO_DATA_LEN_RANGE);
+ }
+
+ return (CRYPTO_SUCCESS);
+}
+
+/*
+ * Helper SHA2 digest final function for uio data.
+ * digest_len is the length of the desired digest. If digest_len
+ * is smaller than the default SHA2 digest length, the caller
+ * must pass a scratch buffer, digest_scratch, which must be at
+ * least as large as the algorithm's full digest length.
+ */
+static int
+sha2_digest_final_uio(SHA2_CTX *sha2_ctx, crypto_data_t *digest,
+ ulong_t digest_len, uchar_t *digest_scratch)
+{
+ off_t offset = digest->cd_offset;
+ uint_t vec_idx = 0;
+
+ /* we support only kernel buffer */
+ if (zfs_uio_segflg(digest->cd_uio) != UIO_SYSSPACE)
+ return (CRYPTO_ARGUMENTS_BAD);
+
+ /*
+ * Jump to the first iovec containing ptr to the digest to
+ * be returned.
+ */
+ offset = zfs_uio_index_at_offset(digest->cd_uio, offset, &vec_idx);
+ if (vec_idx == zfs_uio_iovcnt(digest->cd_uio)) {
+ /*
+ * The caller specified an offset that is
+ * larger than the total size of the buffers
+ * it provided.
+ */
+ return (CRYPTO_DATA_LEN_RANGE);
+ }
+
+ if (offset + digest_len <=
+ zfs_uio_iovlen(digest->cd_uio, vec_idx)) {
+ /*
+ * The computed SHA2 digest will fit in the current
+ * iovec.
+ */
+ if (((sha2_ctx->algotype <= SHA256_HMAC_GEN_MECH_INFO_TYPE) &&
+ (digest_len != SHA256_DIGEST_LENGTH)) ||
+ ((sha2_ctx->algotype > SHA256_HMAC_GEN_MECH_INFO_TYPE) &&
+ (digest_len != SHA512_DIGEST_LENGTH))) {
+ /*
+ * The caller requested a short digest. Digest
+ * into a scratch buffer and return to
+ * the user only what was requested.
+ */
+ SHA2Final(digest_scratch, sha2_ctx);
+
+ bcopy(digest_scratch, (uchar_t *)
+ zfs_uio_iovbase(digest->cd_uio, vec_idx) + offset,
+ digest_len);
+ } else {
+ SHA2Final((uchar_t *)zfs_uio_iovbase(digest->
+ cd_uio, vec_idx) + offset,
+ sha2_ctx);
+
+ }
+ } else {
+ /*
+ * The computed digest will cross one or more iovecs.
+ * This is bad performance-wise but we need to support it.
+ * Allocate a small scratch buffer on the stack and copy
+ * the digest piecemeal to the specified digest iovecs.
+ */
+ uchar_t digest_tmp[SHA512_DIGEST_LENGTH];
+ off_t scratch_offset = 0;
+ size_t length = digest_len;
+ size_t cur_len;
+
+ SHA2Final(digest_tmp, sha2_ctx);
+
+ while (vec_idx < zfs_uio_iovcnt(digest->cd_uio) && length > 0) {
+ cur_len =
+ MIN(zfs_uio_iovlen(digest->cd_uio, vec_idx) -
+ offset, length);
+ bcopy(digest_tmp + scratch_offset,
+ zfs_uio_iovbase(digest->cd_uio, vec_idx) + offset,
+ cur_len);
+
+ length -= cur_len;
+ vec_idx++;
+ scratch_offset += cur_len;
+ offset = 0;
+ }
+
+ if (vec_idx == zfs_uio_iovcnt(digest->cd_uio) && length > 0) {
+ /*
+ * The end of the specified iovecs was reached but the
+ * requested length could not be processed; i.e. the
+ * caller requested to digest more data than it
+ * provided.
+ */
+ return (CRYPTO_DATA_LEN_RANGE);
+ }
+ }
+
+ return (CRYPTO_SUCCESS);
+}
+
+/* ARGSUSED */
+static int
+sha2_digest(crypto_ctx_t *ctx, crypto_data_t *data, crypto_data_t *digest,
+ crypto_req_handle_t req)
+{
+ int ret = CRYPTO_SUCCESS;
+ uint_t sha_digest_len;
+
+ ASSERT(ctx->cc_provider_private != NULL);
+
+ switch (PROV_SHA2_CTX(ctx)->sc_mech_type) {
+ case SHA256_MECH_INFO_TYPE:
+ sha_digest_len = SHA256_DIGEST_LENGTH;
+ break;
+ case SHA384_MECH_INFO_TYPE:
+ sha_digest_len = SHA384_DIGEST_LENGTH;
+ break;
+ case SHA512_MECH_INFO_TYPE:
+ sha_digest_len = SHA512_DIGEST_LENGTH;
+ break;
+ default:
+ return (CRYPTO_MECHANISM_INVALID);
+ }
+
+ /*
+ * If the output buffer is too small (or its length is zero, i.e.
+ * the caller is only asking for the required size), just return
+ * the length needed and do not destroy the context.
+ */
+ if ((digest->cd_length == 0) ||
+ (digest->cd_length < sha_digest_len)) {
+ digest->cd_length = sha_digest_len;
+ return (CRYPTO_BUFFER_TOO_SMALL);
+ }
+
+ /*
+ * Do the SHA2 update on the specified input data.
+ */
+ switch (data->cd_format) {
+ case CRYPTO_DATA_RAW:
+ SHA2Update(&PROV_SHA2_CTX(ctx)->sc_sha2_ctx,
+ (uint8_t *)data->cd_raw.iov_base + data->cd_offset,
+ data->cd_length);
+ break;
+ case CRYPTO_DATA_UIO:
+ ret = sha2_digest_update_uio(&PROV_SHA2_CTX(ctx)->sc_sha2_ctx,
+ data);
+ break;
+ default:
+ ret = CRYPTO_ARGUMENTS_BAD;
+ }
+
+ if (ret != CRYPTO_SUCCESS) {
+ /* the update failed, free context and bail */
+ kmem_free(ctx->cc_provider_private, sizeof (sha2_ctx_t));
+ ctx->cc_provider_private = NULL;
+ digest->cd_length = 0;
+ return (ret);
+ }
+
+ /*
+ * Do a SHA2 final, must be done separately since the digest
+ * type can be different than the input data type.
+ */
+ switch (digest->cd_format) {
+ case CRYPTO_DATA_RAW:
+ SHA2Final((unsigned char *)digest->cd_raw.iov_base +
+ digest->cd_offset, &PROV_SHA2_CTX(ctx)->sc_sha2_ctx);
+ break;
+ case CRYPTO_DATA_UIO:
+ ret = sha2_digest_final_uio(&PROV_SHA2_CTX(ctx)->sc_sha2_ctx,
+ digest, sha_digest_len, NULL);
+ break;
+ default:
+ ret = CRYPTO_ARGUMENTS_BAD;
+ }
+
+ /* all done, free context and return */
+
+ if (ret == CRYPTO_SUCCESS)
+ digest->cd_length = sha_digest_len;
+ else
+ digest->cd_length = 0;
+
+ kmem_free(ctx->cc_provider_private, sizeof (sha2_ctx_t));
+ ctx->cc_provider_private = NULL;
+ return (ret);
+}
+
+/* ARGSUSED */
+static int
+sha2_digest_update(crypto_ctx_t *ctx, crypto_data_t *data,
+ crypto_req_handle_t req)
+{
+ int ret = CRYPTO_SUCCESS;
+
+ ASSERT(ctx->cc_provider_private != NULL);
+
+ /*
+ * Do the SHA2 update on the specified input data.
+ */
+ switch (data->cd_format) {
+ case CRYPTO_DATA_RAW:
+ SHA2Update(&PROV_SHA2_CTX(ctx)->sc_sha2_ctx,
+ (uint8_t *)data->cd_raw.iov_base + data->cd_offset,
+ data->cd_length);
+ break;
+ case CRYPTO_DATA_UIO:
+ ret = sha2_digest_update_uio(&PROV_SHA2_CTX(ctx)->sc_sha2_ctx,
+ data);
+ break;
+ default:
+ ret = CRYPTO_ARGUMENTS_BAD;
+ }
+
+ return (ret);
+}
+
+/* ARGSUSED */
+static int
+sha2_digest_final(crypto_ctx_t *ctx, crypto_data_t *digest,
+ crypto_req_handle_t req)
+{
+ int ret = CRYPTO_SUCCESS;
+ uint_t sha_digest_len;
+
+ ASSERT(ctx->cc_provider_private != NULL);
+
+ switch (PROV_SHA2_CTX(ctx)->sc_mech_type) {
+ case SHA256_MECH_INFO_TYPE:
+ sha_digest_len = SHA256_DIGEST_LENGTH;
+ break;
+ case SHA384_MECH_INFO_TYPE:
+ sha_digest_len = SHA384_DIGEST_LENGTH;
+ break;
+ case SHA512_MECH_INFO_TYPE:
+ sha_digest_len = SHA512_DIGEST_LENGTH;
+ break;
+ default:
+ return (CRYPTO_MECHANISM_INVALID);
+ }
+
+ /*
+ * If the output buffer is too small (or its length is zero, i.e.
+ * the caller is only asking for the required size), just return
+ * the length needed and do not destroy the context.
+ */
+ if ((digest->cd_length == 0) ||
+ (digest->cd_length < sha_digest_len)) {
+ digest->cd_length = sha_digest_len;
+ return (CRYPTO_BUFFER_TOO_SMALL);
+ }
+
+ /*
+ * Do a SHA2 final.
+ */
+ switch (digest->cd_format) {
+ case CRYPTO_DATA_RAW:
+ SHA2Final((unsigned char *)digest->cd_raw.iov_base +
+ digest->cd_offset, &PROV_SHA2_CTX(ctx)->sc_sha2_ctx);
+ break;
+ case CRYPTO_DATA_UIO:
+ ret = sha2_digest_final_uio(&PROV_SHA2_CTX(ctx)->sc_sha2_ctx,
+ digest, sha_digest_len, NULL);
+ break;
+ default:
+ ret = CRYPTO_ARGUMENTS_BAD;
+ }
+
+ /* all done, free context and return */
+
+ if (ret == CRYPTO_SUCCESS)
+ digest->cd_length = sha_digest_len;
+ else
+ digest->cd_length = 0;
+
+ kmem_free(ctx->cc_provider_private, sizeof (sha2_ctx_t));
+ ctx->cc_provider_private = NULL;
+
+ return (ret);
+}
+
+/* ARGSUSED */
+static int
+sha2_digest_atomic(crypto_provider_handle_t provider,
+ crypto_session_id_t session_id, crypto_mechanism_t *mechanism,
+ crypto_data_t *data, crypto_data_t *digest,
+ crypto_req_handle_t req)
+{
+ int ret = CRYPTO_SUCCESS;
+ SHA2_CTX sha2_ctx;
+ uint32_t sha_digest_len;
+
+ /*
+ * Do the SHA2 init, then a SHA2 update on the specified input data.
+ */
+
+ SHA2Init(mechanism->cm_type, &sha2_ctx);
+
+ switch (data->cd_format) {
+ case CRYPTO_DATA_RAW:
+ SHA2Update(&sha2_ctx, (uint8_t *)data->
+ cd_raw.iov_base + data->cd_offset, data->cd_length);
+ break;
+ case CRYPTO_DATA_UIO:
+ ret = sha2_digest_update_uio(&sha2_ctx, data);
+ break;
+ default:
+ ret = CRYPTO_ARGUMENTS_BAD;
+ }
+
+ if (ret != CRYPTO_SUCCESS) {
+ /* the update failed, bail */
+ digest->cd_length = 0;
+ return (ret);
+ }
+
+ if (mechanism->cm_type <= SHA256_HMAC_GEN_MECH_INFO_TYPE)
+ sha_digest_len = SHA256_DIGEST_LENGTH;
+ else
+ sha_digest_len = SHA512_DIGEST_LENGTH;
+
+ /*
+ * Do a SHA2 final, must be done separately since the digest
+ * type can be different than the input data type.
+ */
+ switch (digest->cd_format) {
+ case CRYPTO_DATA_RAW:
+ SHA2Final((unsigned char *)digest->cd_raw.iov_base +
+ digest->cd_offset, &sha2_ctx);
+ break;
+ case CRYPTO_DATA_UIO:
+ ret = sha2_digest_final_uio(&sha2_ctx, digest,
+ sha_digest_len, NULL);
+ break;
+ default:
+ ret = CRYPTO_ARGUMENTS_BAD;
+ }
+
+ if (ret == CRYPTO_SUCCESS)
+ digest->cd_length = sha_digest_len;
+ else
+ digest->cd_length = 0;
+
+ return (ret);
+}
+
+/*
+ * KCF software provider mac entry points.
+ *
+ * SHA2 HMAC is: SHA2(key XOR opad, SHA2(key XOR ipad, text))
+ *
+ * Init:
+ * The initialization routine initializes what we denote
+ * as the inner and outer contexts by doing
+ * - for inner context: SHA2(key XOR ipad)
+ * - for outer context: SHA2(key XOR opad)
+ *
+ * Update:
+ * Each subsequent SHA2 HMAC update will result in an
+ * update of the inner context with the specified data.
+ *
+ * Final:
+ * The SHA2 HMAC final will do a SHA2 final operation on the
+ * inner context, and the resulting digest will be used
+ * as the data for an update on the outer context. Last
+ * but not least, a SHA2 final on the outer context will
+ * be performed to obtain the SHA2 HMAC digest to return
+ * to the user.
+ */
+
+/*
+ * Initialize a SHA2-HMAC context.
+ */
+static void
+sha2_mac_init_ctx(sha2_hmac_ctx_t *ctx, void *keyval, uint_t length_in_bytes)
+{
+ uint64_t ipad[SHA512_HMAC_BLOCK_SIZE / sizeof (uint64_t)];
+ uint64_t opad[SHA512_HMAC_BLOCK_SIZE / sizeof (uint64_t)];
+ int i, block_size, blocks_per_int64;
+
+ /* Determine the block size */
+ if (ctx->hc_mech_type <= SHA256_HMAC_GEN_MECH_INFO_TYPE) {
+ block_size = SHA256_HMAC_BLOCK_SIZE;
+ blocks_per_int64 = SHA256_HMAC_BLOCK_SIZE / sizeof (uint64_t);
+ } else {
+ block_size = SHA512_HMAC_BLOCK_SIZE;
+ blocks_per_int64 = SHA512_HMAC_BLOCK_SIZE / sizeof (uint64_t);
+ }
+
+ bzero(ipad, block_size);
+ bzero(opad, block_size);
+ bcopy(keyval, ipad, length_in_bytes);
+ bcopy(keyval, opad, length_in_bytes);
+
+ /* XOR key with ipad (0x36) and opad (0x5c) */
+ for (i = 0; i < blocks_per_int64; i++) {
+ ipad[i] ^= 0x3636363636363636;
+ opad[i] ^= 0x5c5c5c5c5c5c5c5c;
+ }
+
+ /* perform SHA2 on ipad */
+ SHA2Init(ctx->hc_mech_type, &ctx->hc_icontext);
+ SHA2Update(&ctx->hc_icontext, (uint8_t *)ipad, block_size);
+
+ /* perform SHA2 on opad */
+ SHA2Init(ctx->hc_mech_type, &ctx->hc_ocontext);
+ SHA2Update(&ctx->hc_ocontext, (uint8_t *)opad, block_size);
+}
+
+/*
+ * Initialize a new SHA2-HMAC operation; this is the KCF mac_init entry
+ * point for the SHA2-HMAC mechanisms.
+ */
+static int
+sha2_mac_init(crypto_ctx_t *ctx, crypto_mechanism_t *mechanism,
+ crypto_key_t *key, crypto_spi_ctx_template_t ctx_template,
+ crypto_req_handle_t req)
+{
+ int ret = CRYPTO_SUCCESS;
+ uint_t keylen_in_bytes = CRYPTO_BITS2BYTES(key->ck_length);
+ uint_t sha_digest_len, sha_hmac_block_size;
+
+ /*
+ * Set the digest length and block size to values appropriate to the
+ * mechanism
+ */
+ switch (mechanism->cm_type) {
+ case SHA256_HMAC_MECH_INFO_TYPE:
+ case SHA256_HMAC_GEN_MECH_INFO_TYPE:
+ sha_digest_len = SHA256_DIGEST_LENGTH;
+ sha_hmac_block_size = SHA256_HMAC_BLOCK_SIZE;
+ break;
+ case SHA384_HMAC_MECH_INFO_TYPE:
+ case SHA384_HMAC_GEN_MECH_INFO_TYPE:
+ case SHA512_HMAC_MECH_INFO_TYPE:
+ case SHA512_HMAC_GEN_MECH_INFO_TYPE:
+ sha_digest_len = SHA512_DIGEST_LENGTH;
+ sha_hmac_block_size = SHA512_HMAC_BLOCK_SIZE;
+ break;
+ default:
+ return (CRYPTO_MECHANISM_INVALID);
+ }
+
+ if (key->ck_format != CRYPTO_KEY_RAW)
+ return (CRYPTO_ARGUMENTS_BAD);
+
+ ctx->cc_provider_private = kmem_alloc(sizeof (sha2_hmac_ctx_t),
+ crypto_kmflag(req));
+ if (ctx->cc_provider_private == NULL)
+ return (CRYPTO_HOST_MEMORY);
+
+ PROV_SHA2_HMAC_CTX(ctx)->hc_mech_type = mechanism->cm_type;
+ if (ctx_template != NULL) {
+ /* reuse context template */
+ bcopy(ctx_template, PROV_SHA2_HMAC_CTX(ctx),
+ sizeof (sha2_hmac_ctx_t));
+ } else {
+ /* no context template, compute context */
+ if (keylen_in_bytes > sha_hmac_block_size) {
+ uchar_t digested_key[SHA512_DIGEST_LENGTH];
+ sha2_hmac_ctx_t *hmac_ctx = ctx->cc_provider_private;
+
+ /*
+ * Hash the passed-in key to get a smaller key.
+ * The inner context is used since it hasn't been
+ * initialized yet.
+ */
+ PROV_SHA2_DIGEST_KEY(mechanism->cm_type / 3,
+ &hmac_ctx->hc_icontext,
+ key->ck_data, keylen_in_bytes, digested_key);
+ sha2_mac_init_ctx(PROV_SHA2_HMAC_CTX(ctx),
+ digested_key, sha_digest_len);
+ } else {
+ sha2_mac_init_ctx(PROV_SHA2_HMAC_CTX(ctx),
+ key->ck_data, keylen_in_bytes);
+ }
+ }
+
+ /*
+ * Get the mechanism parameters, if applicable. Given the mechanism
+ * ordering, cm_type % 3 == 2 selects the *_HMAC_GENERAL mechanisms,
+ * which carry the desired digest length as a parameter.
+ */
+ if (mechanism->cm_type % 3 == 2) {
+ if (mechanism->cm_param == NULL ||
+ mechanism->cm_param_len != sizeof (ulong_t))
+ ret = CRYPTO_MECHANISM_PARAM_INVALID;
+ PROV_SHA2_GET_DIGEST_LEN(mechanism,
+ PROV_SHA2_HMAC_CTX(ctx)->hc_digest_len);
+ if (PROV_SHA2_HMAC_CTX(ctx)->hc_digest_len > sha_digest_len)
+ ret = CRYPTO_MECHANISM_PARAM_INVALID;
+ }
+
+ if (ret != CRYPTO_SUCCESS) {
+ bzero(ctx->cc_provider_private, sizeof (sha2_hmac_ctx_t));
+ kmem_free(ctx->cc_provider_private, sizeof (sha2_hmac_ctx_t));
+ ctx->cc_provider_private = NULL;
+ }
+
+ return (ret);
+}
+
+/* ARGSUSED */
+static int
+sha2_mac_update(crypto_ctx_t *ctx, crypto_data_t *data,
+ crypto_req_handle_t req)
+{
+ int ret = CRYPTO_SUCCESS;
+
+ ASSERT(ctx->cc_provider_private != NULL);
+
+ /*
+ * Do a SHA2 update of the inner context using the specified
+ * data.
+ */
+ switch (data->cd_format) {
+ case CRYPTO_DATA_RAW:
+ SHA2Update(&PROV_SHA2_HMAC_CTX(ctx)->hc_icontext,
+ (uint8_t *)data->cd_raw.iov_base + data->cd_offset,
+ data->cd_length);
+ break;
+ case CRYPTO_DATA_UIO:
+ ret = sha2_digest_update_uio(
+ &PROV_SHA2_HMAC_CTX(ctx)->hc_icontext, data);
+ break;
+ default:
+ ret = CRYPTO_ARGUMENTS_BAD;
+ }
+
+ return (ret);
+}
+
+/* ARGSUSED */
+static int
+sha2_mac_final(crypto_ctx_t *ctx, crypto_data_t *mac, crypto_req_handle_t req)
+{
+ int ret = CRYPTO_SUCCESS;
+ uchar_t digest[SHA512_DIGEST_LENGTH];
+ uint32_t digest_len, sha_digest_len;
+
+ ASSERT(ctx->cc_provider_private != NULL);
+
+ /* Set the digest lengths to values appropriate to the mechanism */
+ switch (PROV_SHA2_HMAC_CTX(ctx)->hc_mech_type) {
+ case SHA256_HMAC_MECH_INFO_TYPE:
+ sha_digest_len = digest_len = SHA256_DIGEST_LENGTH;
+ break;
+ case SHA384_HMAC_MECH_INFO_TYPE:
+ sha_digest_len = digest_len = SHA384_DIGEST_LENGTH;
+ break;
+ case SHA512_HMAC_MECH_INFO_TYPE:
+ sha_digest_len = digest_len = SHA512_DIGEST_LENGTH;
+ break;
+ case SHA256_HMAC_GEN_MECH_INFO_TYPE:
+ sha_digest_len = SHA256_DIGEST_LENGTH;
+ digest_len = PROV_SHA2_HMAC_CTX(ctx)->hc_digest_len;
+ break;
+ case SHA384_HMAC_GEN_MECH_INFO_TYPE:
+ case SHA512_HMAC_GEN_MECH_INFO_TYPE:
+ sha_digest_len = SHA512_DIGEST_LENGTH;
+ digest_len = PROV_SHA2_HMAC_CTX(ctx)->hc_digest_len;
+ break;
+ default:
+ return (CRYPTO_ARGUMENTS_BAD);
+ }
+
+ /*
+ * If the output buffer is too small (or its length is zero, i.e.
+ * the caller is only asking for the required size), just return
+ * the length needed and do not destroy the context.
+ */
+ if ((mac->cd_length == 0) || (mac->cd_length < digest_len)) {
+ mac->cd_length = digest_len;
+ return (CRYPTO_BUFFER_TOO_SMALL);
+ }
+
+ /*
+ * Do a SHA2 final on the inner context.
+ */
+ SHA2Final(digest, &PROV_SHA2_HMAC_CTX(ctx)->hc_icontext);
+
+ /*
+ * Do a SHA2 update on the outer context, feeding the inner
+ * digest as data.
+ */
+ SHA2Update(&PROV_SHA2_HMAC_CTX(ctx)->hc_ocontext, digest,
+ sha_digest_len);
+
+ /*
+ * Do a SHA2 final on the outer context, storing the computed
+ * digest in the user's buffer.
+ */
+ switch (mac->cd_format) {
+ case CRYPTO_DATA_RAW:
+ if (digest_len != sha_digest_len) {
+ /*
+ * The caller requested a short digest. Digest
+ * into a scratch buffer and return to
+ * the user only what was requested.
+ */
+ SHA2Final(digest,
+ &PROV_SHA2_HMAC_CTX(ctx)->hc_ocontext);
+ bcopy(digest, (unsigned char *)mac->cd_raw.iov_base +
+ mac->cd_offset, digest_len);
+ } else {
+ SHA2Final((unsigned char *)mac->cd_raw.iov_base +
+ mac->cd_offset,
+ &PROV_SHA2_HMAC_CTX(ctx)->hc_ocontext);
+ }
+ break;
+ case CRYPTO_DATA_UIO:
+ ret = sha2_digest_final_uio(
+ &PROV_SHA2_HMAC_CTX(ctx)->hc_ocontext, mac,
+ digest_len, digest);
+ break;
+ default:
+ ret = CRYPTO_ARGUMENTS_BAD;
+ }
+
+ if (ret == CRYPTO_SUCCESS)
+ mac->cd_length = digest_len;
+ else
+ mac->cd_length = 0;
+
+ bzero(ctx->cc_provider_private, sizeof (sha2_hmac_ctx_t));
+ kmem_free(ctx->cc_provider_private, sizeof (sha2_hmac_ctx_t));
+ ctx->cc_provider_private = NULL;
+
+ return (ret);
+}
+
+#define SHA2_MAC_UPDATE(data, ctx, ret) { \
+ switch (data->cd_format) { \
+ case CRYPTO_DATA_RAW: \
+ SHA2Update(&(ctx).hc_icontext, \
+ (uint8_t *)data->cd_raw.iov_base + \
+ data->cd_offset, data->cd_length); \
+ break; \
+ case CRYPTO_DATA_UIO: \
+ ret = sha2_digest_update_uio(&(ctx).hc_icontext, data); \
+ break; \
+ default: \
+ ret = CRYPTO_ARGUMENTS_BAD; \
+ } \
+}
+
+/* ARGSUSED */
+static int
+sha2_mac_atomic(crypto_provider_handle_t provider,
+ crypto_session_id_t session_id, crypto_mechanism_t *mechanism,
+ crypto_key_t *key, crypto_data_t *data, crypto_data_t *mac,
+ crypto_spi_ctx_template_t ctx_template, crypto_req_handle_t req)
+{
+ int ret = CRYPTO_SUCCESS;
+ uchar_t digest[SHA512_DIGEST_LENGTH];
+ sha2_hmac_ctx_t sha2_hmac_ctx;
+ uint32_t sha_digest_len, digest_len, sha_hmac_block_size;
+ uint_t keylen_in_bytes = CRYPTO_BITS2BYTES(key->ck_length);
+
+ /*
+ * Set the digest length and block size to values appropriate to the
+ * mechanism
+ */
+ switch (mechanism->cm_type) {
+ case SHA256_HMAC_MECH_INFO_TYPE:
+ case SHA256_HMAC_GEN_MECH_INFO_TYPE:
+ sha_digest_len = digest_len = SHA256_DIGEST_LENGTH;
+ sha_hmac_block_size = SHA256_HMAC_BLOCK_SIZE;
+ break;
+ case SHA384_HMAC_MECH_INFO_TYPE:
+ case SHA384_HMAC_GEN_MECH_INFO_TYPE:
+ case SHA512_HMAC_MECH_INFO_TYPE:
+ case SHA512_HMAC_GEN_MECH_INFO_TYPE:
+ sha_digest_len = digest_len = SHA512_DIGEST_LENGTH;
+ sha_hmac_block_size = SHA512_HMAC_BLOCK_SIZE;
+ break;
+ default:
+ return (CRYPTO_MECHANISM_INVALID);
+ }
+
+ /* Add support for key by attributes (RFE 4706552) */
+ if (key->ck_format != CRYPTO_KEY_RAW)
+ return (CRYPTO_ARGUMENTS_BAD);
+
+ if (ctx_template != NULL) {
+ /* reuse context template */
+ bcopy(ctx_template, &sha2_hmac_ctx, sizeof (sha2_hmac_ctx_t));
+ } else {
+ sha2_hmac_ctx.hc_mech_type = mechanism->cm_type;
+ /* no context template, initialize context */
+ if (keylen_in_bytes > sha_hmac_block_size) {
+ /*
+ * Hash the passed-in key to get a smaller key.
+ * The inner context is used since it hasn't been
+ * initialized yet.
+ */
+ PROV_SHA2_DIGEST_KEY(mechanism->cm_type / 3,
+ &sha2_hmac_ctx.hc_icontext,
+ key->ck_data, keylen_in_bytes, digest);
+ sha2_mac_init_ctx(&sha2_hmac_ctx, digest,
+ sha_digest_len);
+ } else {
+ sha2_mac_init_ctx(&sha2_hmac_ctx, key->ck_data,
+ keylen_in_bytes);
+ }
+ }
+
+ /* get the mechanism parameters, if applicable */
+ if ((mechanism->cm_type % 3) == 2) {
+ if (mechanism->cm_param == NULL ||
+ mechanism->cm_param_len != sizeof (ulong_t)) {
+ ret = CRYPTO_MECHANISM_PARAM_INVALID;
+ goto bail;
+ }
+ PROV_SHA2_GET_DIGEST_LEN(mechanism, digest_len);
+ if (digest_len > sha_digest_len) {
+ ret = CRYPTO_MECHANISM_PARAM_INVALID;
+ goto bail;
+ }
+ }
+
+ /* do a SHA2 update of the inner context using the specified data */
+ SHA2_MAC_UPDATE(data, sha2_hmac_ctx, ret);
+ if (ret != CRYPTO_SUCCESS)
+ /* the update failed, zero the context and bail */
+ goto bail;
+
+ /*
+ * Do a SHA2 final on the inner context.
+ */
+ SHA2Final(digest, &sha2_hmac_ctx.hc_icontext);
+
+ /*
+ * Do a SHA2 update on the outer context, feeding the inner
+ * digest as data.
+ *
+ * HMAC-SHA384 needs special handling as the outer hash needs only 48
+ * bytes of the inner hash value.
+ */
+ if (mechanism->cm_type == SHA384_HMAC_MECH_INFO_TYPE ||
+ mechanism->cm_type == SHA384_HMAC_GEN_MECH_INFO_TYPE)
+ SHA2Update(&sha2_hmac_ctx.hc_ocontext, digest,
+ SHA384_DIGEST_LENGTH);
+ else
+ SHA2Update(&sha2_hmac_ctx.hc_ocontext, digest, sha_digest_len);
+
+ /*
+ * Do a SHA2 final on the outer context, storing the computed
+ * digest in the user's buffer.
+ */
+ switch (mac->cd_format) {
+ case CRYPTO_DATA_RAW:
+ if (digest_len != sha_digest_len) {
+ /*
+ * The caller requested a short digest. Digest
+ * into a scratch buffer and return to
+ * the user only what was requested.
+ */
+ SHA2Final(digest, &sha2_hmac_ctx.hc_ocontext);
+ bcopy(digest, (unsigned char *)mac->cd_raw.iov_base +
+ mac->cd_offset, digest_len);
+ } else {
+ SHA2Final((unsigned char *)mac->cd_raw.iov_base +
+ mac->cd_offset, &sha2_hmac_ctx.hc_ocontext);
+ }
+ break;
+ case CRYPTO_DATA_UIO:
+ ret = sha2_digest_final_uio(&sha2_hmac_ctx.hc_ocontext, mac,
+ digest_len, digest);
+ break;
+ default:
+ ret = CRYPTO_ARGUMENTS_BAD;
+ }
+
+ if (ret == CRYPTO_SUCCESS) {
+ mac->cd_length = digest_len;
+ return (CRYPTO_SUCCESS);
+ }
+bail:
+ bzero(&sha2_hmac_ctx, sizeof (sha2_hmac_ctx_t));
+ mac->cd_length = 0;
+ return (ret);
+}
+
+/* ARGSUSED */
+static int
+sha2_mac_verify_atomic(crypto_provider_handle_t provider,
+ crypto_session_id_t session_id, crypto_mechanism_t *mechanism,
+ crypto_key_t *key, crypto_data_t *data, crypto_data_t *mac,
+ crypto_spi_ctx_template_t ctx_template, crypto_req_handle_t req)
+{
+ int ret = CRYPTO_SUCCESS;
+ uchar_t digest[SHA512_DIGEST_LENGTH];
+ sha2_hmac_ctx_t sha2_hmac_ctx;
+ uint32_t sha_digest_len, digest_len, sha_hmac_block_size;
+ uint_t keylen_in_bytes = CRYPTO_BITS2BYTES(key->ck_length);
+
+ /*
+ * Set the digest length and block size to values appropriate to the
+ * mechanism
+ */
+ switch (mechanism->cm_type) {
+ case SHA256_HMAC_MECH_INFO_TYPE:
+ case SHA256_HMAC_GEN_MECH_INFO_TYPE:
+ sha_digest_len = digest_len = SHA256_DIGEST_LENGTH;
+ sha_hmac_block_size = SHA256_HMAC_BLOCK_SIZE;
+ break;
+ case SHA384_HMAC_MECH_INFO_TYPE:
+ case SHA384_HMAC_GEN_MECH_INFO_TYPE:
+ case SHA512_HMAC_MECH_INFO_TYPE:
+ case SHA512_HMAC_GEN_MECH_INFO_TYPE:
+ sha_digest_len = digest_len = SHA512_DIGEST_LENGTH;
+ sha_hmac_block_size = SHA512_HMAC_BLOCK_SIZE;
+ break;
+ default:
+ return (CRYPTO_MECHANISM_INVALID);
+ }
+
+ /* Add support for key by attributes (RFE 4706552) */
+ if (key->ck_format != CRYPTO_KEY_RAW)
+ return (CRYPTO_ARGUMENTS_BAD);
+
+ if (ctx_template != NULL) {
+ /* reuse context template */
+ bcopy(ctx_template, &sha2_hmac_ctx, sizeof (sha2_hmac_ctx_t));
+ } else {
+ sha2_hmac_ctx.hc_mech_type = mechanism->cm_type;
+ /* no context template, initialize context */
+ if (keylen_in_bytes > sha_hmac_block_size) {
+ /*
+ * Hash the passed-in key to get a smaller key.
+ * The inner context is used since it hasn't been
+ * initialized yet.
+ */
+ PROV_SHA2_DIGEST_KEY(mechanism->cm_type / 3,
+ &sha2_hmac_ctx.hc_icontext,
+ key->ck_data, keylen_in_bytes, digest);
+ sha2_mac_init_ctx(&sha2_hmac_ctx, digest,
+ sha_digest_len);
+ } else {
+ sha2_mac_init_ctx(&sha2_hmac_ctx, key->ck_data,
+ keylen_in_bytes);
+ }
+ }
+
+ /* get the mechanism parameters, if applicable */
+ if (mechanism->cm_type % 3 == 2) {
+ if (mechanism->cm_param == NULL ||
+ mechanism->cm_param_len != sizeof (ulong_t)) {
+ ret = CRYPTO_MECHANISM_PARAM_INVALID;
+ goto bail;
+ }
+ PROV_SHA2_GET_DIGEST_LEN(mechanism, digest_len);
+ if (digest_len > sha_digest_len) {
+ ret = CRYPTO_MECHANISM_PARAM_INVALID;
+ goto bail;
+ }
+ }
+
+ if (mac->cd_length != digest_len) {
+ ret = CRYPTO_INVALID_MAC;
+ goto bail;
+ }
+
+ /* do a SHA2 update of the inner context using the specified data */
+ SHA2_MAC_UPDATE(data, sha2_hmac_ctx, ret);
+ if (ret != CRYPTO_SUCCESS)
+ /* the update failed, zero the context and bail */
+ goto bail;
+
+ /* do a SHA2 final on the inner context */
+ SHA2Final(digest, &sha2_hmac_ctx.hc_icontext);
+
+ /*
+ * Do a SHA2 update on the outer context, feeding the inner
+ * digest as data.
+ *
+ * HMAC-SHA384 needs special handling as the outer hash needs only 48
+ * bytes of the inner hash value.
+ */
+ if (mechanism->cm_type == SHA384_HMAC_MECH_INFO_TYPE ||
+ mechanism->cm_type == SHA384_HMAC_GEN_MECH_INFO_TYPE)
+ SHA2Update(&sha2_hmac_ctx.hc_ocontext, digest,
+ SHA384_DIGEST_LENGTH);
+ else
+ SHA2Update(&sha2_hmac_ctx.hc_ocontext, digest, sha_digest_len);
+
+ /*
+ * Do a SHA2 final on the outer context, storing the computed
+ * digest in the user's buffer.
+ */
+ SHA2Final(digest, &sha2_hmac_ctx.hc_ocontext);
+
+ /*
+ * Compare the computed digest against the expected digest passed
+ * as argument.
+ */
+
+ switch (mac->cd_format) {
+
+ case CRYPTO_DATA_RAW:
+ if (bcmp(digest, (unsigned char *)mac->cd_raw.iov_base +
+ mac->cd_offset, digest_len) != 0)
+ ret = CRYPTO_INVALID_MAC;
+ break;
+
+ case CRYPTO_DATA_UIO: {
+ off_t offset = mac->cd_offset;
+ uint_t vec_idx = 0;
+ off_t scratch_offset = 0;
+ size_t length = digest_len;
+ size_t cur_len;
+
+ /* we support only kernel buffer */
+ if (zfs_uio_segflg(mac->cd_uio) != UIO_SYSSPACE)
+ return (CRYPTO_ARGUMENTS_BAD);
+
+ /* jump to the first iovec containing the expected digest */
+ offset = zfs_uio_index_at_offset(mac->cd_uio, offset, &vec_idx);
+ if (vec_idx == zfs_uio_iovcnt(mac->cd_uio)) {
+ /*
+ * The caller specified an offset that is
+ * larger than the total size of the buffers
+ * it provided.
+ */
+ ret = CRYPTO_DATA_LEN_RANGE;
+ break;
+ }
+
+ /* do the comparison of computed digest vs specified one */
+ while (vec_idx < zfs_uio_iovcnt(mac->cd_uio) && length > 0) {
+ cur_len = MIN(zfs_uio_iovlen(mac->cd_uio, vec_idx) -
+ offset, length);
+
+ if (bcmp(digest + scratch_offset,
+ zfs_uio_iovbase(mac->cd_uio, vec_idx) + offset,
+ cur_len) != 0) {
+ ret = CRYPTO_INVALID_MAC;
+ break;
+ }
+
+ length -= cur_len;
+ vec_idx++;
+ scratch_offset += cur_len;
+ offset = 0;
+ }
+ break;
+ }
+
+ default:
+ ret = CRYPTO_ARGUMENTS_BAD;
+ }
+
+ return (ret);
+bail:
+ bzero(&sha2_hmac_ctx, sizeof (sha2_hmac_ctx_t));
+ mac->cd_length = 0;
+ return (ret);
+}
+
+/*
+ * KCF software provider context management entry points.
+ */
+
+/* ARGSUSED */
+static int
+sha2_create_ctx_template(crypto_provider_handle_t provider,
+ crypto_mechanism_t *mechanism, crypto_key_t *key,
+ crypto_spi_ctx_template_t *ctx_template, size_t *ctx_template_size,
+ crypto_req_handle_t req)
+{
+ sha2_hmac_ctx_t *sha2_hmac_ctx_tmpl;
+ uint_t keylen_in_bytes = CRYPTO_BITS2BYTES(key->ck_length);
+ uint32_t sha_digest_len, sha_hmac_block_size;
+
+ /*
+ * Set the digest length and block size to values appropriate to the
+ * mechanism
+ */
+ switch (mechanism->cm_type) {
+ case SHA256_HMAC_MECH_INFO_TYPE:
+ case SHA256_HMAC_GEN_MECH_INFO_TYPE:
+ sha_digest_len = SHA256_DIGEST_LENGTH;
+ sha_hmac_block_size = SHA256_HMAC_BLOCK_SIZE;
+ break;
+ case SHA384_HMAC_MECH_INFO_TYPE:
+ case SHA384_HMAC_GEN_MECH_INFO_TYPE:
+ case SHA512_HMAC_MECH_INFO_TYPE:
+ case SHA512_HMAC_GEN_MECH_INFO_TYPE:
+ sha_digest_len = SHA512_DIGEST_LENGTH;
+ sha_hmac_block_size = SHA512_HMAC_BLOCK_SIZE;
+ break;
+ default:
+ return (CRYPTO_MECHANISM_INVALID);
+ }
+
+ /* Add support for key by attributes (RFE 4706552) */
+ if (key->ck_format != CRYPTO_KEY_RAW)
+ return (CRYPTO_ARGUMENTS_BAD);
+
+ /*
+ * Allocate and initialize SHA2 context.
+ */
+ sha2_hmac_ctx_tmpl = kmem_alloc(sizeof (sha2_hmac_ctx_t),
+ crypto_kmflag(req));
+ if (sha2_hmac_ctx_tmpl == NULL)
+ return (CRYPTO_HOST_MEMORY);
+
+ sha2_hmac_ctx_tmpl->hc_mech_type = mechanism->cm_type;
+
+ if (keylen_in_bytes > sha_hmac_block_size) {
+ uchar_t digested_key[SHA512_DIGEST_LENGTH];
+
+ /*
+ * Hash the passed-in key to get a smaller key.
+ * The inner context is used since it hasn't been
+ * initialized yet.
+ */
+ PROV_SHA2_DIGEST_KEY(mechanism->cm_type / 3,
+ &sha2_hmac_ctx_tmpl->hc_icontext,
+ key->ck_data, keylen_in_bytes, digested_key);
+ sha2_mac_init_ctx(sha2_hmac_ctx_tmpl, digested_key,
+ sha_digest_len);
+ } else {
+ sha2_mac_init_ctx(sha2_hmac_ctx_tmpl, key->ck_data,
+ keylen_in_bytes);
+ }
+
+ *ctx_template = (crypto_spi_ctx_template_t)sha2_hmac_ctx_tmpl;
+ *ctx_template_size = sizeof (sha2_hmac_ctx_t);
+
+ return (CRYPTO_SUCCESS);
+}
+
+static int
+sha2_free_context(crypto_ctx_t *ctx)
+{
+ uint_t ctx_len;
+
+ if (ctx->cc_provider_private == NULL)
+ return (CRYPTO_SUCCESS);
+
+ /*
+ * We have to free either SHA2 or SHA2-HMAC contexts, which
+ * have different lengths.
+ *
+ * Note: the test below depends on the mechanism ordering; the plain
+ * digest mechanisms (SHA256, SHA384, SHA512) have cm_type % 3 == 0,
+ * while all of the HMAC variants use the larger HMAC context.
+ */
+
+ if (PROV_SHA2_CTX(ctx)->sc_mech_type % 3 == 0)
+ ctx_len = sizeof (sha2_ctx_t);
+ else
+ ctx_len = sizeof (sha2_hmac_ctx_t);
+
+ bzero(ctx->cc_provider_private, ctx_len);
+ kmem_free(ctx->cc_provider_private, ctx_len);
+ ctx->cc_provider_private = NULL;
+
+ return (CRYPTO_SUCCESS);
+}
diff --git a/sys/contrib/openzfs/module/icp/io/skein_mod.c b/sys/contrib/openzfs/module/icp/io/skein_mod.c
new file mode 100644
index 000000000000..5ee36af12bcb
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/io/skein_mod.c
@@ -0,0 +1,729 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://opensource.org/licenses/CDDL-1.0.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2013 Saso Kiselkov. All rights reserved.
+ */
+
+#include <sys/modctl.h>
+#include <sys/crypto/common.h>
+#include <sys/crypto/icp.h>
+#include <sys/crypto/spi.h>
+#include <sys/sysmacros.h>
+#define SKEIN_MODULE_IMPL
+#include <sys/skein.h>
+
+/*
+ * Like the sha2 module, we create the skein module with two modlinkages:
+ * - modlmisc to allow direct calls to Skein_* API functions.
+ * - modlcrypto to integrate well into the Kernel Crypto Framework (KCF).
+ */
+static struct modlmisc modlmisc = {
+ &mod_cryptoops,
+ "Skein Message-Digest Algorithm"
+};
+
+static struct modlcrypto modlcrypto = {
+ &mod_cryptoops,
+ "Skein Kernel SW Provider"
+};
+
+static struct modlinkage modlinkage = {
+ MODREV_1, {&modlmisc, &modlcrypto, NULL}
+};
+
+static crypto_mech_info_t skein_mech_info_tab[] = {
+ {CKM_SKEIN_256, SKEIN_256_MECH_INFO_TYPE,
+ CRYPTO_FG_DIGEST | CRYPTO_FG_DIGEST_ATOMIC,
+ 0, 0, CRYPTO_KEYSIZE_UNIT_IN_BITS},
+ {CKM_SKEIN_256_MAC, SKEIN_256_MAC_MECH_INFO_TYPE,
+ CRYPTO_FG_MAC | CRYPTO_FG_MAC_ATOMIC, 1, INT_MAX,
+ CRYPTO_KEYSIZE_UNIT_IN_BYTES},
+ {CKM_SKEIN_512, SKEIN_512_MECH_INFO_TYPE,
+ CRYPTO_FG_DIGEST | CRYPTO_FG_DIGEST_ATOMIC,
+ 0, 0, CRYPTO_KEYSIZE_UNIT_IN_BITS},
+ {CKM_SKEIN_512_MAC, SKEIN_512_MAC_MECH_INFO_TYPE,
+ CRYPTO_FG_MAC | CRYPTO_FG_MAC_ATOMIC, 1, INT_MAX,
+ CRYPTO_KEYSIZE_UNIT_IN_BYTES},
+ {CKM_SKEIN1024, SKEIN1024_MECH_INFO_TYPE,
+ CRYPTO_FG_DIGEST | CRYPTO_FG_DIGEST_ATOMIC,
+ 0, 0, CRYPTO_KEYSIZE_UNIT_IN_BITS},
+ {CKM_SKEIN1024_MAC, SKEIN1024_MAC_MECH_INFO_TYPE,
+ CRYPTO_FG_MAC | CRYPTO_FG_MAC_ATOMIC, 1, INT_MAX,
+ CRYPTO_KEYSIZE_UNIT_IN_BYTES}
+};
+
+static void skein_provider_status(crypto_provider_handle_t, uint_t *);
+
+static crypto_control_ops_t skein_control_ops = {
+ skein_provider_status
+};
+
+static int skein_digest_init(crypto_ctx_t *, crypto_mechanism_t *,
+ crypto_req_handle_t);
+static int skein_digest(crypto_ctx_t *, crypto_data_t *, crypto_data_t *,
+ crypto_req_handle_t);
+static int skein_update(crypto_ctx_t *, crypto_data_t *, crypto_req_handle_t);
+static int skein_final(crypto_ctx_t *, crypto_data_t *, crypto_req_handle_t);
+static int skein_digest_atomic(crypto_provider_handle_t, crypto_session_id_t,
+ crypto_mechanism_t *, crypto_data_t *, crypto_data_t *,
+ crypto_req_handle_t);
+
+static crypto_digest_ops_t skein_digest_ops = {
+ .digest_init = skein_digest_init,
+ .digest = skein_digest,
+ .digest_update = skein_update,
+ .digest_key = NULL,
+ .digest_final = skein_final,
+ .digest_atomic = skein_digest_atomic
+};
+
+static int skein_mac_init(crypto_ctx_t *, crypto_mechanism_t *, crypto_key_t *,
+ crypto_spi_ctx_template_t, crypto_req_handle_t);
+static int skein_mac_atomic(crypto_provider_handle_t, crypto_session_id_t,
+ crypto_mechanism_t *, crypto_key_t *, crypto_data_t *, crypto_data_t *,
+ crypto_spi_ctx_template_t, crypto_req_handle_t);
+
+static crypto_mac_ops_t skein_mac_ops = {
+ .mac_init = skein_mac_init,
+ .mac = NULL,
+ .mac_update = skein_update, /* using regular digest update is OK here */
+ .mac_final = skein_final, /* using regular digest final is OK here */
+ .mac_atomic = skein_mac_atomic,
+ .mac_verify_atomic = NULL
+};
+
+static int skein_create_ctx_template(crypto_provider_handle_t,
+ crypto_mechanism_t *, crypto_key_t *, crypto_spi_ctx_template_t *,
+ size_t *, crypto_req_handle_t);
+static int skein_free_context(crypto_ctx_t *);
+
+static crypto_ctx_ops_t skein_ctx_ops = {
+ .create_ctx_template = skein_create_ctx_template,
+ .free_context = skein_free_context
+};
+
+static crypto_ops_t skein_crypto_ops = {{{{{
+ &skein_control_ops,
+ &skein_digest_ops,
+ NULL,
+ &skein_mac_ops,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ NULL,
+ &skein_ctx_ops,
+}}}}};
+
+static crypto_provider_info_t skein_prov_info = {{{{
+ CRYPTO_SPI_VERSION_1,
+ "Skein Software Provider",
+ CRYPTO_SW_PROVIDER,
+ NULL,
+ &skein_crypto_ops,
+ sizeof (skein_mech_info_tab) / sizeof (crypto_mech_info_t),
+ skein_mech_info_tab
+}}}};
+
+static crypto_kcf_provider_handle_t skein_prov_handle = 0;
+
+typedef struct skein_ctx {
+ skein_mech_type_t sc_mech_type;
+ size_t sc_digest_bitlen;
+ /*LINTED(E_ANONYMOUS_UNION_DECL)*/
+ union {
+ Skein_256_Ctxt_t sc_256;
+ Skein_512_Ctxt_t sc_512;
+ Skein1024_Ctxt_t sc_1024;
+ };
+} skein_ctx_t;
+#define SKEIN_CTX(_ctx_) ((skein_ctx_t *)((_ctx_)->cc_provider_private))
+#define SKEIN_CTX_LVALUE(_ctx_) (_ctx_)->cc_provider_private
+#define SKEIN_OP(_skein_ctx, _op, ...) \
+ do { \
+ skein_ctx_t *sc = (_skein_ctx); \
+ switch (sc->sc_mech_type) { \
+ case SKEIN_256_MECH_INFO_TYPE: \
+ case SKEIN_256_MAC_MECH_INFO_TYPE: \
+ (void) Skein_256_ ## _op(&sc->sc_256, __VA_ARGS__);\
+ break; \
+ case SKEIN_512_MECH_INFO_TYPE: \
+ case SKEIN_512_MAC_MECH_INFO_TYPE: \
+ (void) Skein_512_ ## _op(&sc->sc_512, __VA_ARGS__);\
+ break; \
+ case SKEIN1024_MECH_INFO_TYPE: \
+ case SKEIN1024_MAC_MECH_INFO_TYPE: \
+ (void) Skein1024_ ## _op(&sc->sc_1024, __VA_ARGS__);\
+ break; \
+ } \
+ _NOTE(CONSTCOND) \
+ } while (0)
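+
+/*
+ * For illustration (descriptive only, derived from the macro above): a call
+ * such as
+ *
+ *     SKEIN_OP(sc, Update, buf, len);
+ *
+ * expands to (void) Skein_512_Update(&sc->sc_512, buf, len) when
+ * sc->sc_mech_type is SKEIN_512_MECH_INFO_TYPE (or its MAC variant), and
+ * analogously to the Skein_256/Skein1024 routines for the other mechanism
+ * types.
+ */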
+
+static int
+skein_get_digest_bitlen(const crypto_mechanism_t *mechanism, size_t *result)
+{
+ if (mechanism->cm_param != NULL) {
+ /*LINTED(E_BAD_PTR_CAST_ALIGN)*/
+ skein_param_t *param = (skein_param_t *)mechanism->cm_param;
+
+ if (mechanism->cm_param_len != sizeof (*param) ||
+ param->sp_digest_bitlen == 0) {
+ return (CRYPTO_MECHANISM_PARAM_INVALID);
+ }
+ *result = param->sp_digest_bitlen;
+ } else {
+ switch (mechanism->cm_type) {
+ case SKEIN_256_MECH_INFO_TYPE:
+ *result = 256;
+ break;
+ case SKEIN_512_MECH_INFO_TYPE:
+ *result = 512;
+ break;
+ case SKEIN1024_MECH_INFO_TYPE:
+ *result = 1024;
+ break;
+ default:
+ return (CRYPTO_MECHANISM_INVALID);
+ }
+ }
+ return (CRYPTO_SUCCESS);
+}
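+
+/*
+ * Illustrative sketch (not part of the original change; it assumes
+ * skein_param_t carries only the sp_digest_bitlen member used above): a
+ * caller requesting a non-default output length passes a skein_param_t
+ * through the mechanism's cm_param field, e.g. for a 384-bit Skein-512
+ * digest:
+ *
+ *     skein_param_t param = { .sp_digest_bitlen = 384 };
+ *     crypto_mechanism_t mech;
+ *
+ *     mech.cm_type = SKEIN_512_MECH_INFO_TYPE;
+ *     mech.cm_param = (caddr_t)&param;
+ *     mech.cm_param_len = sizeof (param);
+ *
+ * With cm_param left NULL, the defaults above apply (256, 512 or 1024 bits).
+ */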
+
+int
+skein_mod_init(void)
+{
+ int error;
+
+ if ((error = mod_install(&modlinkage)) != 0)
+ return (error);
+
+ /*
+ * Try to register with KCF - failure shouldn't unload us, since we
+ * still may want to continue providing misc/skein functionality.
+ */
+ (void) crypto_register_provider(&skein_prov_info, &skein_prov_handle);
+
+ return (0);
+}
+
+int
+skein_mod_fini(void)
+{
+ int ret;
+
+ if (skein_prov_handle != 0) {
+ if ((ret = crypto_unregister_provider(skein_prov_handle)) !=
+ CRYPTO_SUCCESS) {
+ cmn_err(CE_WARN,
+ "skein _fini: crypto_unregister_provider() "
+ "failed (0x%x)", ret);
+ return (EBUSY);
+ }
+ skein_prov_handle = 0;
+ }
+
+ return (mod_remove(&modlinkage));
+}
+
+/*
+ * KCF software provider control entry points.
+ */
+/* ARGSUSED */
+static void
+skein_provider_status(crypto_provider_handle_t provider, uint_t *status)
+{
+ *status = CRYPTO_PROVIDER_READY;
+}
+
+/*
+ * General Skein hashing helper functions.
+ */
+
+/*
+ * Performs an Update on a context with uio input data.
+ */
+static int
+skein_digest_update_uio(skein_ctx_t *ctx, const crypto_data_t *data)
+{
+ off_t offset = data->cd_offset;
+ size_t length = data->cd_length;
+ uint_t vec_idx = 0;
+ size_t cur_len;
+ zfs_uio_t *uio = data->cd_uio;
+
+ /* we support only kernel buffer */
+ if (zfs_uio_segflg(uio) != UIO_SYSSPACE)
+ return (CRYPTO_ARGUMENTS_BAD);
+
+ /*
+ * Jump to the first iovec containing data to be
+ * digested.
+ */
+ offset = zfs_uio_index_at_offset(uio, offset, &vec_idx);
+ if (vec_idx == zfs_uio_iovcnt(uio)) {
+ /*
+ * The caller specified an offset that is larger than the
+ * total size of the buffers it provided.
+ */
+ return (CRYPTO_DATA_LEN_RANGE);
+ }
+
+ /*
+ * Now do the digesting on the iovecs.
+ */
+ while (vec_idx < zfs_uio_iovcnt(uio) && length > 0) {
+ cur_len = MIN(zfs_uio_iovlen(uio, vec_idx) - offset, length);
+ SKEIN_OP(ctx, Update, (uint8_t *)zfs_uio_iovbase(uio, vec_idx)
+ + offset, cur_len);
+ length -= cur_len;
+ vec_idx++;
+ offset = 0;
+ }
+
+ if (vec_idx == zfs_uio_iovcnt(uio) && length > 0) {
+ /*
+ * The end of the specified iovecs was reached but the
+ * requested length could not be processed; i.e., the
+ * caller requested to digest more data than it provided.
+ */
+ return (CRYPTO_DATA_LEN_RANGE);
+ }
+
+ return (CRYPTO_SUCCESS);
+}
+
+/*
+ * Performs a Final on a context and writes to a uio digest output.
+ */
+static int
+skein_digest_final_uio(skein_ctx_t *ctx, crypto_data_t *digest,
+ crypto_req_handle_t req)
+{
+ off_t offset = digest->cd_offset;
+ uint_t vec_idx = 0;
+ zfs_uio_t *uio = digest->cd_uio;
+
+ /* we support only kernel buffer */
+ if (zfs_uio_segflg(uio) != UIO_SYSSPACE)
+ return (CRYPTO_ARGUMENTS_BAD);
+
+ /*
+ * Jump to the first iovec containing ptr to the digest to be returned.
+ */
+ offset = zfs_uio_index_at_offset(uio, offset, &vec_idx);
+ if (vec_idx == zfs_uio_iovcnt(uio)) {
+ /*
+ * The caller specified an offset that is larger than the
+ * total size of the buffers it provided.
+ */
+ return (CRYPTO_DATA_LEN_RANGE);
+ }
+ if (offset + CRYPTO_BITS2BYTES(ctx->sc_digest_bitlen) <=
+ zfs_uio_iovlen(uio, vec_idx)) {
+ /* The computed digest will fit in the current iovec. */
+ SKEIN_OP(ctx, Final,
+ (uchar_t *)zfs_uio_iovbase(uio, vec_idx) + offset);
+ } else {
+ uint8_t *digest_tmp;
+ off_t scratch_offset = 0;
+ size_t length = CRYPTO_BITS2BYTES(ctx->sc_digest_bitlen);
+ size_t cur_len;
+
+ digest_tmp = kmem_alloc(CRYPTO_BITS2BYTES(
+ ctx->sc_digest_bitlen), crypto_kmflag(req));
+ if (digest_tmp == NULL)
+ return (CRYPTO_HOST_MEMORY);
+ SKEIN_OP(ctx, Final, digest_tmp);
+ while (vec_idx < zfs_uio_iovcnt(uio) && length > 0) {
+ cur_len = MIN(zfs_uio_iovlen(uio, vec_idx) - offset,
+ length);
+ bcopy(digest_tmp + scratch_offset,
+ zfs_uio_iovbase(uio, vec_idx) + offset, cur_len);
+
+ length -= cur_len;
+ vec_idx++;
+ scratch_offset += cur_len;
+ offset = 0;
+ }
+ kmem_free(digest_tmp, CRYPTO_BITS2BYTES(ctx->sc_digest_bitlen));
+
+ if (vec_idx == zfs_uio_iovcnt(uio) && length > 0) {
+ /*
+ * The end of the specified iovecs was reached but the
+ * requested length could not be processed; i.e., the
+ * caller requested to digest more data than it
+ * provided.
+ */
+ return (CRYPTO_DATA_LEN_RANGE);
+ }
+ }
+
+ return (CRYPTO_SUCCESS);
+}
+
+/*
+ * KCF software provider digest entry points.
+ */
+
+/*
+ * Initializes a skein digest context to the configuration in `mechanism'.
+ * The mechanism cm_type must be one of SKEIN_*_MECH_INFO_TYPE. The cm_param
+ * field may contain a skein_param_t structure indicating the length of the
+ * digest the algorithm should produce. Otherwise the default output lengths
+ * are applied (32 bytes for Skein-256, 64 bytes for Skein-512 and 128 bytes
+ * for Skein-1024).
+ */
+static int
+skein_digest_init(crypto_ctx_t *ctx, crypto_mechanism_t *mechanism,
+ crypto_req_handle_t req)
+{
+ int error = CRYPTO_SUCCESS;
+
+ if (!VALID_SKEIN_DIGEST_MECH(mechanism->cm_type))
+ return (CRYPTO_MECHANISM_INVALID);
+
+ SKEIN_CTX_LVALUE(ctx) = kmem_alloc(sizeof (*SKEIN_CTX(ctx)),
+ crypto_kmflag(req));
+ if (SKEIN_CTX(ctx) == NULL)
+ return (CRYPTO_HOST_MEMORY);
+
+ SKEIN_CTX(ctx)->sc_mech_type = mechanism->cm_type;
+ error = skein_get_digest_bitlen(mechanism,
+ &SKEIN_CTX(ctx)->sc_digest_bitlen);
+ if (error != CRYPTO_SUCCESS)
+ goto errout;
+ SKEIN_OP(SKEIN_CTX(ctx), Init, SKEIN_CTX(ctx)->sc_digest_bitlen);
+
+ return (CRYPTO_SUCCESS);
+errout:
+ bzero(SKEIN_CTX(ctx), sizeof (*SKEIN_CTX(ctx)));
+ kmem_free(SKEIN_CTX(ctx), sizeof (*SKEIN_CTX(ctx)));
+ SKEIN_CTX_LVALUE(ctx) = NULL;
+ return (error);
+}
+
+/*
+ * Executes a skein_update and skein_digest on a pre-initialized crypto
+ * context in a single step. See the documentation to these functions to
+ * see what to pass here.
+ */
+static int
+skein_digest(crypto_ctx_t *ctx, crypto_data_t *data, crypto_data_t *digest,
+ crypto_req_handle_t req)
+{
+ int error = CRYPTO_SUCCESS;
+
+ ASSERT(SKEIN_CTX(ctx) != NULL);
+
+ if (digest->cd_length <
+ CRYPTO_BITS2BYTES(SKEIN_CTX(ctx)->sc_digest_bitlen)) {
+ digest->cd_length =
+ CRYPTO_BITS2BYTES(SKEIN_CTX(ctx)->sc_digest_bitlen);
+ return (CRYPTO_BUFFER_TOO_SMALL);
+ }
+
+ error = skein_update(ctx, data, req);
+ if (error != CRYPTO_SUCCESS) {
+ bzero(SKEIN_CTX(ctx), sizeof (*SKEIN_CTX(ctx)));
+ kmem_free(SKEIN_CTX(ctx), sizeof (*SKEIN_CTX(ctx)));
+ SKEIN_CTX_LVALUE(ctx) = NULL;
+ digest->cd_length = 0;
+ return (error);
+ }
+ error = skein_final(ctx, digest, req);
+
+ return (error);
+}
+
+/*
+ * Performs a skein Update with the input message in `data' (successive calls
+ * can push more data). This is used both for digest and MAC operation.
+ * Supported input data formats are raw and uio.
+ */
+/*ARGSUSED*/
+static int
+skein_update(crypto_ctx_t *ctx, crypto_data_t *data, crypto_req_handle_t req)
+{
+ int error = CRYPTO_SUCCESS;
+
+ ASSERT(SKEIN_CTX(ctx) != NULL);
+
+ switch (data->cd_format) {
+ case CRYPTO_DATA_RAW:
+ SKEIN_OP(SKEIN_CTX(ctx), Update,
+ (uint8_t *)data->cd_raw.iov_base + data->cd_offset,
+ data->cd_length);
+ break;
+ case CRYPTO_DATA_UIO:
+ error = skein_digest_update_uio(SKEIN_CTX(ctx), data);
+ break;
+ default:
+ error = CRYPTO_ARGUMENTS_BAD;
+ }
+
+ return (error);
+}
+
+/*
+ * Performs a skein Final, writing the output to `digest'. This is used both
+ * for digest and MAC operation.
+ * Supported output digest formats are raw and uio.
+ */
+/*ARGSUSED*/
+static int
+skein_final(crypto_ctx_t *ctx, crypto_data_t *digest, crypto_req_handle_t req)
+{
+ int error = CRYPTO_SUCCESS;
+
+ ASSERT(SKEIN_CTX(ctx) != NULL);
+
+ if (digest->cd_length <
+ CRYPTO_BITS2BYTES(SKEIN_CTX(ctx)->sc_digest_bitlen)) {
+ digest->cd_length =
+ CRYPTO_BITS2BYTES(SKEIN_CTX(ctx)->sc_digest_bitlen);
+ return (CRYPTO_BUFFER_TOO_SMALL);
+ }
+
+ switch (digest->cd_format) {
+ case CRYPTO_DATA_RAW:
+ SKEIN_OP(SKEIN_CTX(ctx), Final,
+ (uint8_t *)digest->cd_raw.iov_base + digest->cd_offset);
+ break;
+ case CRYPTO_DATA_UIO:
+ error = skein_digest_final_uio(SKEIN_CTX(ctx), digest, req);
+ break;
+ default:
+ error = CRYPTO_ARGUMENTS_BAD;
+ }
+
+ if (error == CRYPTO_SUCCESS)
+ digest->cd_length =
+ CRYPTO_BITS2BYTES(SKEIN_CTX(ctx)->sc_digest_bitlen);
+ else
+ digest->cd_length = 0;
+
+ bzero(SKEIN_CTX(ctx), sizeof (*SKEIN_CTX(ctx)));
+ kmem_free(SKEIN_CTX(ctx), sizeof (*(SKEIN_CTX(ctx))));
+ SKEIN_CTX_LVALUE(ctx) = NULL;
+
+ return (error);
+}
+
+/*
+ * Performs a full skein digest computation in a single call, configuring the
+ * algorithm according to `mechanism', reading the input to be digested from
+ * `data' and writing the output to `digest'.
+ * Supported input/output formats are raw and uio.
+ */
+/*ARGSUSED*/
+static int
+skein_digest_atomic(crypto_provider_handle_t provider,
+ crypto_session_id_t session_id, crypto_mechanism_t *mechanism,
+ crypto_data_t *data, crypto_data_t *digest, crypto_req_handle_t req)
+{
+ int error;
+ skein_ctx_t skein_ctx;
+ crypto_ctx_t ctx;
+ SKEIN_CTX_LVALUE(&ctx) = &skein_ctx;
+
+ /* Init */
+ if (!VALID_SKEIN_DIGEST_MECH(mechanism->cm_type))
+ return (CRYPTO_MECHANISM_INVALID);
+ skein_ctx.sc_mech_type = mechanism->cm_type;
+ error = skein_get_digest_bitlen(mechanism, &skein_ctx.sc_digest_bitlen);
+ if (error != CRYPTO_SUCCESS)
+ goto out;
+ SKEIN_OP(&skein_ctx, Init, skein_ctx.sc_digest_bitlen);
+
+ if ((error = skein_update(&ctx, data, req)) != CRYPTO_SUCCESS)
+ goto out;
+ if ((error = skein_final(&ctx, digest, req)) != CRYPTO_SUCCESS)
+ goto out;
+
+out:
+ if (error == CRYPTO_SUCCESS)
+ digest->cd_length =
+ CRYPTO_BITS2BYTES(skein_ctx.sc_digest_bitlen);
+ else
+ digest->cd_length = 0;
+ bzero(&skein_ctx, sizeof (skein_ctx));
+
+ return (error);
+}
+
+/*
+ * Helper function that builds a Skein MAC context from the provided
+ * mechanism and key.
+ */
+static int
+skein_mac_ctx_build(skein_ctx_t *ctx, crypto_mechanism_t *mechanism,
+ crypto_key_t *key)
+{
+ int error;
+
+ if (!VALID_SKEIN_MAC_MECH(mechanism->cm_type))
+ return (CRYPTO_MECHANISM_INVALID);
+ if (key->ck_format != CRYPTO_KEY_RAW)
+ return (CRYPTO_ARGUMENTS_BAD);
+ ctx->sc_mech_type = mechanism->cm_type;
+ error = skein_get_digest_bitlen(mechanism, &ctx->sc_digest_bitlen);
+ if (error != CRYPTO_SUCCESS)
+ return (error);
+ SKEIN_OP(ctx, InitExt, ctx->sc_digest_bitlen, 0, key->ck_data,
+ CRYPTO_BITS2BYTES(key->ck_length));
+
+ return (CRYPTO_SUCCESS);
+}
+
+/*
+ * KCF software provider mac entry points.
+ */
+/*
+ * Initializes a skein MAC context. You may pass a ctx_template, in which
+ * case the template will be reused to make initialization more efficient.
+ * Otherwise a new context will be constructed. The mechanism cm_type must
+ * be one of SKEIN_*_MAC_MECH_INFO_TYPE. Same as in skein_digest_init, you
+ * may pass a skein_param_t in cm_param to configure the length of the
+ * digest. The key must be in raw format.
+ */
+static int
+skein_mac_init(crypto_ctx_t *ctx, crypto_mechanism_t *mechanism,
+ crypto_key_t *key, crypto_spi_ctx_template_t ctx_template,
+ crypto_req_handle_t req)
+{
+ int error;
+
+ SKEIN_CTX_LVALUE(ctx) = kmem_alloc(sizeof (*SKEIN_CTX(ctx)),
+ crypto_kmflag(req));
+ if (SKEIN_CTX(ctx) == NULL)
+ return (CRYPTO_HOST_MEMORY);
+
+ if (ctx_template != NULL) {
+ bcopy(ctx_template, SKEIN_CTX(ctx),
+ sizeof (*SKEIN_CTX(ctx)));
+ } else {
+ error = skein_mac_ctx_build(SKEIN_CTX(ctx), mechanism, key);
+ if (error != CRYPTO_SUCCESS)
+ goto errout;
+ }
+
+ return (CRYPTO_SUCCESS);
+errout:
+ bzero(SKEIN_CTX(ctx), sizeof (*SKEIN_CTX(ctx)));
+ kmem_free(SKEIN_CTX(ctx), sizeof (*SKEIN_CTX(ctx)));
+ return (error);
+}
+
+/*
+ * The MAC update and final calls are reused from the regular digest code.
+ */
+
+/*ARGSUSED*/
+/*
+ * Same as skein_digest_atomic, performs an atomic Skein MAC operation in
+ * one step. All the same properties apply to the arguments of this
+ * function as to those of the partial operations above.
+ */
+static int
+skein_mac_atomic(crypto_provider_handle_t provider,
+ crypto_session_id_t session_id, crypto_mechanism_t *mechanism,
+ crypto_key_t *key, crypto_data_t *data, crypto_data_t *mac,
+ crypto_spi_ctx_template_t ctx_template, crypto_req_handle_t req)
+{
+ /* faux crypto context just for skein_digest_{update,final} */
+ int error;
+ crypto_ctx_t ctx;
+ skein_ctx_t skein_ctx;
+ SKEIN_CTX_LVALUE(&ctx) = &skein_ctx;
+
+ if (ctx_template != NULL) {
+ bcopy(ctx_template, &skein_ctx, sizeof (skein_ctx));
+ } else {
+ error = skein_mac_ctx_build(&skein_ctx, mechanism, key);
+ if (error != CRYPTO_SUCCESS)
+ goto errout;
+ }
+
+ if ((error = skein_update(&ctx, data, req)) != CRYPTO_SUCCESS)
+ goto errout;
+ if ((error = skein_final(&ctx, mac, req)) != CRYPTO_SUCCESS)
+ goto errout;
+
+ return (CRYPTO_SUCCESS);
+errout:
+ bzero(&skein_ctx, sizeof (skein_ctx));
+ return (error);
+}
+
+/*
+ * KCF software provider context management entry points.
+ */
+
+/*
+ * Constructs a context template for the Skein MAC algorithm. The same
+ * properties apply to the arguments of this function as to those of
+ * skein_mac_init.
+ */
+/*ARGSUSED*/
+static int
+skein_create_ctx_template(crypto_provider_handle_t provider,
+ crypto_mechanism_t *mechanism, crypto_key_t *key,
+ crypto_spi_ctx_template_t *ctx_template, size_t *ctx_template_size,
+ crypto_req_handle_t req)
+{
+ int error;
+ skein_ctx_t *ctx_tmpl;
+
+ ctx_tmpl = kmem_alloc(sizeof (*ctx_tmpl), crypto_kmflag(req));
+ if (ctx_tmpl == NULL)
+ return (CRYPTO_HOST_MEMORY);
+ error = skein_mac_ctx_build(ctx_tmpl, mechanism, key);
+ if (error != CRYPTO_SUCCESS)
+ goto errout;
+ *ctx_template = ctx_tmpl;
+ *ctx_template_size = sizeof (*ctx_tmpl);
+
+ return (CRYPTO_SUCCESS);
+errout:
+ bzero(ctx_tmpl, sizeof (*ctx_tmpl));
+ kmem_free(ctx_tmpl, sizeof (*ctx_tmpl));
+ return (error);
+}
+
+/*
+ * Frees a skein context in a parent crypto context.
+ */
+static int
+skein_free_context(crypto_ctx_t *ctx)
+{
+ if (SKEIN_CTX(ctx) != NULL) {
+ bzero(SKEIN_CTX(ctx), sizeof (*SKEIN_CTX(ctx)));
+ kmem_free(SKEIN_CTX(ctx), sizeof (*SKEIN_CTX(ctx)));
+ SKEIN_CTX_LVALUE(ctx) = NULL;
+ }
+
+ return (CRYPTO_SUCCESS);
+}
diff --git a/sys/contrib/openzfs/module/icp/os/modconf.c b/sys/contrib/openzfs/module/icp/os/modconf.c
new file mode 100644
index 000000000000..3743416ed951
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/os/modconf.c
@@ -0,0 +1,173 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/modctl.h>
+
+/*
+ * Null operations; used for uninitialized and "misc" modules.
+ */
+static int mod_null(struct modlmisc *, struct modlinkage *);
+static int mod_infonull(void *, struct modlinkage *, int *);
+
+/*
+ * Cryptographic Modules
+ */
+struct mod_ops mod_cryptoops = {
+ .modm_install = mod_null,
+ .modm_remove = mod_null,
+ .modm_info = mod_infonull
+};
+
+/*
+ * Null operation; return 0.
+ */
+static int
+mod_null(struct modlmisc *modl, struct modlinkage *modlp)
+{
+ return (0);
+}
+
+/*
+ * Status for User modules.
+ */
+static int
+mod_infonull(void *modl, struct modlinkage *modlp, int *p0)
+{
+ *p0 = -1; /* for modinfo display */
+ return (0);
+}
+
+/*
+ * Install a module.
+ * (This routine is in the Solaris SPARC DDI/DKI)
+ */
+int
+mod_install(struct modlinkage *modlp)
+{
+ int retval = -1; /* No linkage structures */
+ struct modlmisc **linkpp;
+ struct modlmisc **linkpp1;
+
+ if (modlp->ml_rev != MODREV_1) {
+ cmn_err(CE_WARN, "mod_install: "
+ "modlinkage structure is not MODREV_1\n");
+ return (EINVAL);
+ }
+ linkpp = (struct modlmisc **)&modlp->ml_linkage[0];
+
+ while (*linkpp != NULL) {
+ if ((retval = MODL_INSTALL(*linkpp, modlp)) != 0) {
+ linkpp1 = (struct modlmisc **)&modlp->ml_linkage[0];
+
+ while (linkpp1 != linkpp) {
+ MODL_REMOVE(*linkpp1, modlp); /* clean up */
+ linkpp1++;
+ }
+ break;
+ }
+ linkpp++;
+ }
+ return (retval);
+}
+
+static char *reins_err =
+ "Could not reinstall %s\nReboot to correct the problem";
+
+/*
+ * Remove a module. This is called by the module wrapper routine.
+ * (This routine is in the Solaris SPARC DDI/DKI)
+ */
+int
+mod_remove(struct modlinkage *modlp)
+{
+ int retval = 0;
+ struct modlmisc **linkpp, *last_linkp;
+
+ linkpp = (struct modlmisc **)&modlp->ml_linkage[0];
+
+ while (*linkpp != NULL) {
+ if ((retval = MODL_REMOVE(*linkpp, modlp)) != 0) {
+ last_linkp = *linkpp;
+ linkpp = (struct modlmisc **)&modlp->ml_linkage[0];
+ while (*linkpp != last_linkp) {
+ if (MODL_INSTALL(*linkpp, modlp) != 0) {
+ cmn_err(CE_WARN, reins_err,
+ (*linkpp)->misc_linkinfo);
+ break;
+ }
+ linkpp++;
+ }
+ break;
+ }
+ linkpp++;
+ }
+ return (retval);
+}
+
+/*
+ * Get module status.
+ * (This routine is in the Solaris SPARC DDI/DKI)
+ */
+int
+mod_info(struct modlinkage *modlp, struct modinfo *modinfop)
+{
+ int i;
+ int retval = 0;
+ struct modspecific_info *msip;
+ struct modlmisc **linkpp;
+
+ modinfop->mi_rev = modlp->ml_rev;
+
+ linkpp = (struct modlmisc **)modlp->ml_linkage;
+ msip = &modinfop->mi_msinfo[0];
+
+ for (i = 0; i < MODMAXLINK; i++) {
+ if (*linkpp == NULL) {
+ msip->msi_linkinfo[0] = '\0';
+ } else {
+ (void) strlcpy(msip->msi_linkinfo,
+ (*linkpp)->misc_linkinfo, MODMAXLINKINFOLEN);
+ retval = MODL_INFO(*linkpp, modlp, &msip->msi_p0);
+ if (retval != 0)
+ break;
+ linkpp++;
+ }
+ msip++;
+ }
+
+ if (modinfop->mi_info == MI_INFO_LINKAGE) {
+ /*
+ * Slight kludge used to extract the address of the
+ * modlinkage structure from the module (just after
+ * loading a module for the very first time)
+ */
+ modinfop->mi_base = (void *)modlp;
+ }
+
+ if (retval == 0)
+ return (1);
+ return (0);
+}
diff --git a/sys/contrib/openzfs/module/icp/os/modhash.c b/sys/contrib/openzfs/module/icp/os/modhash.c
new file mode 100644
index 000000000000..a897871001ce
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/os/modhash.c
@@ -0,0 +1,927 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * mod_hash: flexible hash table implementation.
+ *
+ * This is a reasonably fast, reasonably flexible hash table implementation
+ * which features pluggable hash algorithms to support storing arbitrary keys
+ * and values. It is designed to handle small (< 100,000 items) amounts of
+ * data. The hash uses chaining to resolve collisions, and does not feature a
+ * mechanism to grow the hash. Care must be taken to pick nchains to be large
+ * enough for the application at hand, or lots of time will be wasted searching
+ * hash chains.
+ *
+ * The client of the hash is required to supply a number of items to support
+ * the various hash functions:
+ *
+ * - Destructor functions for the key and value being hashed.
+ * A destructor is responsible for freeing an object when the hash
+ * table is no longer storing it. Since keys and values can be of
+ * arbitrary type, separate destructors for keys & values are used.
+ * These may be mod_hash_null_keydtor and mod_hash_null_valdtor if no
+ * destructor is needed for either a key or value.
+ *
+ * - A hashing algorithm which returns a uint_t representing a hash index
+ * The number returned need _not_ be between 0 and nchains. The mod_hash
+ * code will take care of doing that. The second argument (after the
+ * key) to the hashing function is a void * that represents
+ * hash_alg_data-- this is provided so that the hashing algorithm can
+ * maintain some state across calls, or keep algorithm-specific
+ * constants associated with the hash table.
+ *
+ * A pointer-hashing and a string-hashing algorithm are supplied in
+ * this file.
+ *
+ * - A key comparator (a la qsort).
+ * This is used when searching the hash chain. The key comparator
+ * determines if two keys match. It should follow the return value
+ * semantics of strcmp.
+ *
+ * string and pointer comparators are supplied in this file.
+ *
+ * mod_hash_create_strhash() and mod_hash_create_ptrhash() provide good
+ * examples of how to create a customized hash table.
+ *
+ * Basic hash operations:
+ *
+ * mod_hash_create_strhash(name, nchains, dtor),
+ * create a hash using strings as keys.
+ * NOTE: This creates a hash which automatically cleans up the string
+ * values it is given for keys.
+ *
+ * mod_hash_create_ptrhash(name, nchains, dtor, key_elem_size):
+ * create a hash using pointers as keys.
+ *
+ * mod_hash_create_extended(name, nchains, kdtor, vdtor,
+ * hash_alg, hash_alg_data,
+ * keycmp, sleep)
+ * create a customized hash table.
+ *
+ * mod_hash_destroy_hash(hash):
+ * destroy the given hash table, calling the key and value destructors
+ * on each key-value pair stored in the hash.
+ *
+ * mod_hash_insert(hash, key, val):
+ * place a key, value pair into the given hash.
+ * duplicate keys are rejected.
+ *
+ * mod_hash_insert_reserve(hash, key, val, handle):
+ * place a key, value pair into the given hash, using handle to indicate
+ * the reserved storage for the pair. (no memory allocation is needed
+ * during a mod_hash_insert_reserve.) duplicate keys are rejected.
+ *
+ * mod_hash_reserve(hash, *handle):
+ * reserve storage for a key-value pair using the memory allocation
+ * policy of 'hash', returning the storage handle in 'handle'.
+ *
+ * mod_hash_reserve_nosleep(hash, *handle): reserve storage for a key-value
+ * pair ignoring the memory allocation policy of 'hash' and always without
+ * sleep, returning the storage handle in 'handle'.
+ *
+ * mod_hash_remove(hash, key, *val):
+ * remove a key-value pair with key 'key' from 'hash', destroying the
+ * stored key, and returning the value in val.
+ *
+ * mod_hash_replace(hash, key, val)
+ * atomically remove an existing key-value pair from a hash, and replace
+ * the key and value with the ones supplied. The removed key and value
+ * (if any) are destroyed.
+ *
+ * mod_hash_destroy(hash, key):
+ * remove a key-value pair with key 'key' from 'hash', destroying both
+ * stored key and stored value.
+ *
+ * mod_hash_find(hash, key, val):
+ * find a value in the hash table corresponding to the given key.
+ *
+ * mod_hash_find_cb(hash, key, val, found_callback)
+ * find a value in the hash table corresponding to the given key.
+ * If a value is found, call specified callback passing key and val to it.
+ * The callback is called with the hash lock held.
+ * It is intended to be used in situations where the act of locating the
+ * data must also modify it - such as in reference counting schemes.
+ *
+ * mod_hash_walk(hash, callback(key, elem, arg), arg)
+ * walks all the elements in the hashtable and invokes the callback
+ * function with the key/value pair for each element. the hashtable
+ * is locked for readers so the callback function should not attempt
+ * to do any updates to the hashtable. the callback function should
+ * return MH_WALK_CONTINUE to continue walking the hashtable or
+ * MH_WALK_TERMINATE to abort the walk of the hashtable.
+ *
+ * mod_hash_clear(hash):
+ * clears the given hash table of entries, calling the key and value
+ * destructors for every element in the hash.
+ */
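+
+/*
+ * Minimal usage sketch (illustrative only; "bar" stands for any
+ * caller-owned pointer value and is not part of this interface):
+ *
+ *     mod_hash_t *h;
+ *     mod_hash_val_t val;
+ *
+ *     h = mod_hash_create_strhash_nodtr("example hash", 128,
+ *         mod_hash_null_valdtor);
+ *     (void) mod_hash_insert(h, (mod_hash_key_t)"foo", (mod_hash_val_t)bar);
+ *     if (mod_hash_find(h, (mod_hash_key_t)"foo", &val) == 0)
+ *         ... val now holds bar ...
+ *     mod_hash_destroy_strhash(h);
+ */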
+
+#include <sys/zfs_context.h>
+#include <sys/bitmap.h>
+#include <sys/modhash_impl.h>
+#include <sys/sysmacros.h>
+
+/*
+ * MH_KEY_DESTROY()
+ * Invoke the key destructor.
+ */
+#define MH_KEY_DESTROY(hash, key) ((hash->mh_kdtor)(key))
+
+/*
+ * MH_VAL_DESTROY()
+ * Invoke the value destructor.
+ */
+#define MH_VAL_DESTROY(hash, val) ((hash->mh_vdtor)(val))
+
+/*
+ * MH_KEYCMP()
+ * Call the key comparator for the given hash keys.
+ */
+#define MH_KEYCMP(hash, key1, key2) ((hash->mh_keycmp)(key1, key2))
+
+/*
+ * Cache for struct mod_hash_entry
+ */
+kmem_cache_t *mh_e_cache = NULL;
+mod_hash_t *mh_head = NULL;
+kmutex_t mh_head_lock;
+
+/*
+ * mod_hash_null_keydtor()
+ * mod_hash_null_valdtor()
+ * no-op key and value destructors.
+ */
+/*ARGSUSED*/
+void
+mod_hash_null_keydtor(mod_hash_key_t key)
+{
+}
+
+/*ARGSUSED*/
+void
+mod_hash_null_valdtor(mod_hash_val_t val)
+{
+}
+
+/*
+ * mod_hash_bystr()
+ * mod_hash_strkey_cmp()
+ * mod_hash_strkey_dtor()
+ * mod_hash_strval_dtor()
+ * Hash and key comparison routines for hashes with string keys.
+ *
+ * mod_hash_create_strhash()
+ * Create a hash using strings as keys
+ *
+ * The string hashing algorithm is from the "Dragon Book" --
+ * "Compilers: Principles, Tools & Techniques", by Aho, Sethi, Ullman
+ */
+
+/*ARGSUSED*/
+uint_t
+mod_hash_bystr(void *hash_data, mod_hash_key_t key)
+{
+ uint_t hash = 0;
+ uint_t g;
+ char *p, *k = (char *)key;
+
+ ASSERT(k);
+ for (p = k; *p != '\0'; p++) {
+ hash = (hash << 4) + *p;
+ if ((g = (hash & 0xf0000000)) != 0) {
+ hash ^= (g >> 24);
+ hash ^= g;
+ }
+ }
+ return (hash);
+}
+
+int
+mod_hash_strkey_cmp(mod_hash_key_t key1, mod_hash_key_t key2)
+{
+ return (strcmp((char *)key1, (char *)key2));
+}
+
+void
+mod_hash_strkey_dtor(mod_hash_key_t key)
+{
+ char *c = (char *)key;
+ kmem_free(c, strlen(c) + 1);
+}
+
+void
+mod_hash_strval_dtor(mod_hash_val_t val)
+{
+ char *c = (char *)val;
+ kmem_free(c, strlen(c) + 1);
+}
+
+mod_hash_t *
+mod_hash_create_strhash_nodtr(char *name, size_t nchains,
+ void (*val_dtor)(mod_hash_val_t))
+{
+ return mod_hash_create_extended(name, nchains, mod_hash_null_keydtor,
+ val_dtor, mod_hash_bystr, NULL, mod_hash_strkey_cmp, KM_SLEEP);
+}
+
+mod_hash_t *
+mod_hash_create_strhash(char *name, size_t nchains,
+ void (*val_dtor)(mod_hash_val_t))
+{
+ return mod_hash_create_extended(name, nchains, mod_hash_strkey_dtor,
+ val_dtor, mod_hash_bystr, NULL, mod_hash_strkey_cmp, KM_SLEEP);
+}
+
+void
+mod_hash_destroy_strhash(mod_hash_t *strhash)
+{
+ ASSERT(strhash);
+ mod_hash_destroy_hash(strhash);
+}
+
+
+/*
+ * mod_hash_byptr()
+ * mod_hash_ptrkey_cmp()
+ * Hash and key comparison routines for hashes with pointer keys.
+ *
+ * mod_hash_create_ptrhash()
+ * mod_hash_destroy_ptrhash()
+ * Create a hash that uses pointers as keys. This hash algorithm
+ * picks an appropriate set of middle bits in the address to hash on
+ * based on the size of the hash table and a hint about the size of
+ * the items pointed at.
+ */
+uint_t
+mod_hash_byptr(void *hash_data, mod_hash_key_t key)
+{
+ uintptr_t k = (uintptr_t)key;
+ k >>= (int)(uintptr_t)hash_data;
+
+ return ((uint_t)k);
+}
+
+int
+mod_hash_ptrkey_cmp(mod_hash_key_t key1, mod_hash_key_t key2)
+{
+ uintptr_t k1 = (uintptr_t)key1;
+ uintptr_t k2 = (uintptr_t)key2;
+ if (k1 > k2)
+ return (-1);
+ else if (k1 < k2)
+ return (1);
+ else
+ return (0);
+}
+
+mod_hash_t *
+mod_hash_create_ptrhash(char *name, size_t nchains,
+ void (*val_dtor)(mod_hash_val_t), size_t key_elem_size)
+{
+ size_t rshift;
+
+ /*
+ * We want to hash on the bits in the middle of the address word
+ * Bits far to the right in the word have little significance, and
+ * are likely to all look the same (for example, an array of
+ * 256-byte structures will have the bottom 8 bits of address
+ * words the same). So we want to right-shift each address to
+ * ignore the bottom bits.
+ *
+ * The high bits, which are also unused, will get taken out when
+ * mod_hash takes hashkey % nchains.
+ */
+ rshift = highbit64(key_elem_size);
+
+ return mod_hash_create_extended(name, nchains, mod_hash_null_keydtor,
+ val_dtor, mod_hash_byptr, (void *)rshift, mod_hash_ptrkey_cmp,
+ KM_SLEEP);
+}
+
+void
+mod_hash_destroy_ptrhash(mod_hash_t *hash)
+{
+ ASSERT(hash);
+ mod_hash_destroy_hash(hash);
+}
+
+/*
+ * mod_hash_byid()
+ * mod_hash_idkey_cmp()
+ * Hash and key comparison routines for hashes with 32-bit unsigned keys.
+ *
+ * mod_hash_create_idhash()
+ * mod_hash_destroy_idhash()
+ * mod_hash_iddata_gen()
+ * Create a hash that uses numeric keys.
+ *
+ * The hash algorithm is documented in "Introduction to Algorithms"
+ * (Cormen, Leiserson, Rivest); when the hash table is created, it
+ * attempts to find the next largest prime above the number of hash
+ * slots. The hash index is then this number times the key modulo
+ * the hash size, or (key * prime) % nchains.
+ */
+uint_t
+mod_hash_byid(void *hash_data, mod_hash_key_t key)
+{
+ uint_t kval = (uint_t)(uintptr_t)hash_data;
+ return ((uint_t)(uintptr_t)key * (uint_t)kval);
+}
+
+int
+mod_hash_idkey_cmp(mod_hash_key_t key1, mod_hash_key_t key2)
+{
+ return ((uint_t)(uintptr_t)key1 - (uint_t)(uintptr_t)key2);
+}
+
+/*
+ * Generate the next prime number greater than nchains; this value
+ * is intended to be later passed in to mod_hash_create_extended() as the
+ * hash_data.
+ */
+uint_t
+mod_hash_iddata_gen(size_t nchains)
+{
+ uint_t kval, i, prime;
+
+ /*
+ * Pick the first (odd) prime greater than nchains. Make sure kval is
+ * odd (so start with nchains +1 or +2 as appropriate).
+ */
+ kval = (nchains % 2 == 0) ? nchains + 1 : nchains + 2;
+
+ for (;;) {
+ prime = 1;
+ for (i = 3; i * i <= kval; i += 2) {
+ if (kval % i == 0)
+ prime = 0;
+ }
+ if (prime == 1)
+ break;
+ kval += 2;
+ }
+ return (kval);
+}
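+
+/*
+ * Worked example: for nchains = 100 the search starts at 101, which is
+ * prime, so 101 is returned; mod_hash_byid() then hashes a key as
+ * key * 101, and i_mod_hash() folds that product into a chain index.
+ */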
+
+mod_hash_t *
+mod_hash_create_idhash(char *name, size_t nchains,
+ void (*val_dtor)(mod_hash_val_t))
+{
+ uint_t kval = mod_hash_iddata_gen(nchains);
+
+ return (mod_hash_create_extended(name, nchains, mod_hash_null_keydtor,
+ val_dtor, mod_hash_byid, (void *)(uintptr_t)kval,
+ mod_hash_idkey_cmp, KM_SLEEP));
+}
+
+void
+mod_hash_destroy_idhash(mod_hash_t *hash)
+{
+ ASSERT(hash);
+ mod_hash_destroy_hash(hash);
+}
+
+void
+mod_hash_fini(void)
+{
+ mutex_destroy(&mh_head_lock);
+
+ if (mh_e_cache) {
+ kmem_cache_destroy(mh_e_cache);
+ mh_e_cache = NULL;
+ }
+}
+
+/*
+ * mod_hash_init()
+ * sets up globals, etc for mod_hash_*
+ */
+void
+mod_hash_init(void)
+{
+ ASSERT(mh_e_cache == NULL);
+ mh_e_cache = kmem_cache_create("mod_hash_entries",
+ sizeof (struct mod_hash_entry), 0, NULL, NULL, NULL, NULL,
+ NULL, 0);
+
+ mutex_init(&mh_head_lock, NULL, MUTEX_DEFAULT, NULL);
+}
+
+/*
+ * mod_hash_create_extended()
+ * The full-blown hash creation function.
+ *
+ * notes:
+ * nchains - how many hash slots to create. More hash slots will
+ * result in shorter hash chains, but will consume
+ * slightly more memory up front.
+ * sleep - should be KM_SLEEP or KM_NOSLEEP, to indicate whether
+ * to sleep for memory, or fail in low-memory conditions.
+ *
+ * Fails only if KM_NOSLEEP was specified, and no memory was available.
+ */
+mod_hash_t *
+mod_hash_create_extended(
+ char *hname, /* descriptive name for hash */
+ size_t nchains, /* number of hash slots */
+ void (*kdtor)(mod_hash_key_t), /* key destructor */
+ void (*vdtor)(mod_hash_val_t), /* value destructor */
+ uint_t (*hash_alg)(void *, mod_hash_key_t), /* hash algorithm */
+ void *hash_alg_data, /* pass-thru arg for hash_alg */
+ int (*keycmp)(mod_hash_key_t, mod_hash_key_t), /* key comparator */
+ int sleep) /* whether to sleep for mem */
+{
+ mod_hash_t *mod_hash;
+ size_t size;
+ ASSERT(hname && keycmp && hash_alg && vdtor && kdtor);
+
+ if ((mod_hash = kmem_zalloc(MH_SIZE(nchains), sleep)) == NULL)
+ return (NULL);
+
+ size = strlen(hname) + 1;
+ mod_hash->mh_name = kmem_alloc(size, sleep);
+ if (mod_hash->mh_name == NULL) {
+ kmem_free(mod_hash, MH_SIZE(nchains));
+ return (NULL);
+ }
+ (void) strlcpy(mod_hash->mh_name, hname, size);
+
+ rw_init(&mod_hash->mh_contents, NULL, RW_DEFAULT, NULL);
+ mod_hash->mh_sleep = sleep;
+ mod_hash->mh_nchains = nchains;
+ mod_hash->mh_kdtor = kdtor;
+ mod_hash->mh_vdtor = vdtor;
+ mod_hash->mh_hashalg = hash_alg;
+ mod_hash->mh_hashalg_data = hash_alg_data;
+ mod_hash->mh_keycmp = keycmp;
+
+ /*
+ * Link the hash up on the list of hashes
+ */
+ mutex_enter(&mh_head_lock);
+ mod_hash->mh_next = mh_head;
+ mh_head = mod_hash;
+ mutex_exit(&mh_head_lock);
+
+ return (mod_hash);
+}
+
+/*
+ * mod_hash_destroy_hash()
+ * destroy a hash table, destroying all of its stored keys and values
+ * as well.
+ */
+void
+mod_hash_destroy_hash(mod_hash_t *hash)
+{
+ mod_hash_t *mhp, *mhpp;
+
+ mutex_enter(&mh_head_lock);
+ /*
+ * Remove the hash from the hash list
+ */
+ if (hash == mh_head) { /* removing 1st list elem */
+ mh_head = mh_head->mh_next;
+ } else {
+ /*
+ * mhpp can start out NULL since we know the 1st elem isn't the
+ * droid we're looking for.
+ */
+ mhpp = NULL;
+ for (mhp = mh_head; mhp != NULL; mhp = mhp->mh_next) {
+ if (mhp == hash) {
+ mhpp->mh_next = mhp->mh_next;
+ break;
+ }
+ mhpp = mhp;
+ }
+ }
+ mutex_exit(&mh_head_lock);
+
+ /*
+ * Clean out keys and values.
+ */
+ mod_hash_clear(hash);
+
+ rw_destroy(&hash->mh_contents);
+ kmem_free(hash->mh_name, strlen(hash->mh_name) + 1);
+ kmem_free(hash, MH_SIZE(hash->mh_nchains));
+}
+
+/*
+ * i_mod_hash()
+ * Call the hashing algorithm for this hash table, with the given key.
+ */
+uint_t
+i_mod_hash(mod_hash_t *hash, mod_hash_key_t key)
+{
+ uint_t h;
+ /*
+ * Prevent div by 0 problems;
+ * Also a nice shortcut when using a hash as a list
+ */
+ if (hash->mh_nchains == 1)
+ return (0);
+
+ h = (hash->mh_hashalg)(hash->mh_hashalg_data, key);
+ return (h % (hash->mh_nchains - 1));
+}
+
+/*
+ * i_mod_hash_insert_nosync()
+ * mod_hash_insert()
+ * mod_hash_insert_reserve()
+ * insert 'val' into the hash table, using 'key' as its key. If 'key' is
+ * already a key in the hash, an error will be returned, and the key-val
+ * pair will not be inserted. i_mod_hash_insert_nosync() supports a simple
+ * handle abstraction, allowing hash entry allocation to be separated from
+ * the hash insertion. this abstraction allows simple use of the mod_hash
+ * structure in situations where mod_hash_insert() with a KM_SLEEP
+ * allocation policy would otherwise be unsafe.
+ */
+int
+i_mod_hash_insert_nosync(mod_hash_t *hash, mod_hash_key_t key,
+ mod_hash_val_t val, mod_hash_hndl_t handle)
+{
+ uint_t hashidx;
+ struct mod_hash_entry *entry;
+
+ ASSERT(hash);
+
+ /*
+ * If we've not been given reserved storage, allocate storage directly,
+ * using the hash's allocation policy.
+ */
+ if (handle == (mod_hash_hndl_t)0) {
+ entry = kmem_cache_alloc(mh_e_cache, hash->mh_sleep);
+ if (entry == NULL) {
+ hash->mh_stat.mhs_nomem++;
+ return (MH_ERR_NOMEM);
+ }
+ } else {
+ entry = (struct mod_hash_entry *)handle;
+ }
+
+ hashidx = i_mod_hash(hash, key);
+ entry->mhe_key = key;
+ entry->mhe_val = val;
+ entry->mhe_next = hash->mh_entries[hashidx];
+
+ hash->mh_entries[hashidx] = entry;
+ hash->mh_stat.mhs_nelems++;
+
+ return (0);
+}
+
+int
+mod_hash_insert(mod_hash_t *hash, mod_hash_key_t key, mod_hash_val_t val)
+{
+ int res;
+ mod_hash_val_t v;
+
+ rw_enter(&hash->mh_contents, RW_WRITER);
+
+ /*
+ * Disallow duplicate keys in the hash
+ */
+ if (i_mod_hash_find_nosync(hash, key, &v) == 0) {
+ rw_exit(&hash->mh_contents);
+ hash->mh_stat.mhs_coll++;
+ return (MH_ERR_DUPLICATE);
+ }
+
+ res = i_mod_hash_insert_nosync(hash, key, val, (mod_hash_hndl_t)0);
+ rw_exit(&hash->mh_contents);
+
+ return (res);
+}
+
+int
+mod_hash_insert_reserve(mod_hash_t *hash, mod_hash_key_t key,
+ mod_hash_val_t val, mod_hash_hndl_t handle)
+{
+ int res;
+ mod_hash_val_t v;
+
+ rw_enter(&hash->mh_contents, RW_WRITER);
+
+ /*
+ * Disallow duplicate keys in the hash
+ */
+ if (i_mod_hash_find_nosync(hash, key, &v) == 0) {
+ rw_exit(&hash->mh_contents);
+ hash->mh_stat.mhs_coll++;
+ return (MH_ERR_DUPLICATE);
+ }
+ res = i_mod_hash_insert_nosync(hash, key, val, handle);
+ rw_exit(&hash->mh_contents);
+
+ return (res);
+}
+
+/*
+ * mod_hash_reserve()
+ * mod_hash_reserve_nosleep()
+ * mod_hash_cancel()
+ * Make or cancel a mod_hash_entry_t reservation. Reservations are used in
+ * mod_hash_insert_reserve() above.
+ */
+int
+mod_hash_reserve(mod_hash_t *hash, mod_hash_hndl_t *handlep)
+{
+ *handlep = kmem_cache_alloc(mh_e_cache, hash->mh_sleep);
+ if (*handlep == NULL) {
+ hash->mh_stat.mhs_nomem++;
+ return (MH_ERR_NOMEM);
+ }
+
+ return (0);
+}
+
+int
+mod_hash_reserve_nosleep(mod_hash_t *hash, mod_hash_hndl_t *handlep)
+{
+ *handlep = kmem_cache_alloc(mh_e_cache, KM_NOSLEEP);
+ if (*handlep == NULL) {
+ hash->mh_stat.mhs_nomem++;
+ return (MH_ERR_NOMEM);
+ }
+
+ return (0);
+}
+
+/*ARGSUSED*/
+void
+mod_hash_cancel(mod_hash_t *hash, mod_hash_hndl_t *handlep)
+{
+ kmem_cache_free(mh_e_cache, *handlep);
+ *handlep = (mod_hash_hndl_t)0;
+}
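+
+/*
+ * Reservation usage sketch (illustrative): pre-allocate an entry while it is
+ * still safe to allocate, then insert later from a context where allocation
+ * would be unsafe, cancelling the reservation if the insert is not performed
+ * or is rejected as a duplicate:
+ *
+ *     mod_hash_hndl_t hndl;
+ *
+ *     if (mod_hash_reserve_nosleep(h, &hndl) != 0)
+ *         ... no memory, bail out ...
+ *     ...
+ *     if (mod_hash_insert_reserve(h, key, val, hndl) != 0)
+ *         mod_hash_cancel(h, &hndl);
+ */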
+
+/*
+ * i_mod_hash_remove_nosync()
+ * mod_hash_remove()
+ * Remove an element from the hash table.
+ */
+int
+i_mod_hash_remove_nosync(mod_hash_t *hash, mod_hash_key_t key,
+ mod_hash_val_t *val)
+{
+ int hashidx;
+ struct mod_hash_entry *e, *ep;
+
+ hashidx = i_mod_hash(hash, key);
+ ep = NULL; /* e's parent */
+
+ for (e = hash->mh_entries[hashidx]; e != NULL; e = e->mhe_next) {
+ if (MH_KEYCMP(hash, e->mhe_key, key) == 0)
+ break;
+ ep = e;
+ }
+
+ if (e == NULL) { /* not found */
+ return (MH_ERR_NOTFOUND);
+ }
+
+ if (ep == NULL) /* special case 1st element in bucket */
+ hash->mh_entries[hashidx] = e->mhe_next;
+ else
+ ep->mhe_next = e->mhe_next;
+
+ /*
+ * Clean up resources used by the node's key.
+ */
+ MH_KEY_DESTROY(hash, e->mhe_key);
+
+ *val = e->mhe_val;
+ kmem_cache_free(mh_e_cache, e);
+ hash->mh_stat.mhs_nelems--;
+
+ return (0);
+}
+
+int
+mod_hash_remove(mod_hash_t *hash, mod_hash_key_t key, mod_hash_val_t *val)
+{
+ int res;
+
+ rw_enter(&hash->mh_contents, RW_WRITER);
+ res = i_mod_hash_remove_nosync(hash, key, val);
+ rw_exit(&hash->mh_contents);
+
+ return (res);
+}
+
+/*
+ * mod_hash_replace()
+ * atomically remove an existing key-value pair from a hash, and replace
+ * the key and value with the ones supplied. The removed key and value
+ * (if any) are destroyed.
+ */
+int
+mod_hash_replace(mod_hash_t *hash, mod_hash_key_t key, mod_hash_val_t val)
+{
+ int res;
+ mod_hash_val_t v;
+
+ rw_enter(&hash->mh_contents, RW_WRITER);
+
+ if (i_mod_hash_remove_nosync(hash, key, &v) == 0) {
+ /*
+ * mod_hash_remove() takes care of freeing up the key resources.
+ */
+ MH_VAL_DESTROY(hash, v);
+ }
+ res = i_mod_hash_insert_nosync(hash, key, val, (mod_hash_hndl_t)0);
+
+ rw_exit(&hash->mh_contents);
+
+ return (res);
+}
+
+/*
+ * mod_hash_destroy()
+ * Remove an element from the hash table matching 'key', and destroy it.
+ */
+int
+mod_hash_destroy(mod_hash_t *hash, mod_hash_key_t key)
+{
+ mod_hash_val_t val;
+ int rv;
+
+ rw_enter(&hash->mh_contents, RW_WRITER);
+
+ if ((rv = i_mod_hash_remove_nosync(hash, key, &val)) == 0) {
+ /*
+ * mod_hash_remove() takes care of freeing up the key resources.
+ */
+ MH_VAL_DESTROY(hash, val);
+ }
+
+ rw_exit(&hash->mh_contents);
+ return (rv);
+}
+
+/*
+ * i_mod_hash_find_nosync()
+ * mod_hash_find()
+ * Find a value in the hash table corresponding to the given key.
+ */
+int
+i_mod_hash_find_nosync(mod_hash_t *hash, mod_hash_key_t key,
+ mod_hash_val_t *val)
+{
+ uint_t hashidx;
+ struct mod_hash_entry *e;
+
+ hashidx = i_mod_hash(hash, key);
+
+ for (e = hash->mh_entries[hashidx]; e != NULL; e = e->mhe_next) {
+ if (MH_KEYCMP(hash, e->mhe_key, key) == 0) {
+ *val = e->mhe_val;
+ hash->mh_stat.mhs_hit++;
+ return (0);
+ }
+ }
+ hash->mh_stat.mhs_miss++;
+ return (MH_ERR_NOTFOUND);
+}
+
+int
+mod_hash_find(mod_hash_t *hash, mod_hash_key_t key, mod_hash_val_t *val)
+{
+ int res;
+
+ rw_enter(&hash->mh_contents, RW_READER);
+ res = i_mod_hash_find_nosync(hash, key, val);
+ rw_exit(&hash->mh_contents);
+
+ return (res);
+}
+
+int
+mod_hash_find_cb(mod_hash_t *hash, mod_hash_key_t key, mod_hash_val_t *val,
+ void (*find_cb)(mod_hash_key_t, mod_hash_val_t))
+{
+ int res;
+
+ rw_enter(&hash->mh_contents, RW_READER);
+ res = i_mod_hash_find_nosync(hash, key, val);
+ if (res == 0) {
+ find_cb(key, *val);
+ }
+ rw_exit(&hash->mh_contents);
+
+ return (res);
+}
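+
+/*
+ * Illustrative find_cb callback (a sketch; "my_obj_t" and its rc_refcnt
+ * member are hypothetical). The hold is taken while the hash lock is still
+ * held, so the object cannot be removed between the lookup and the hold:
+ *
+ *     static void
+ *     my_obj_hold(mod_hash_key_t key, mod_hash_val_t val)
+ *     {
+ *         ((my_obj_t *)val)->rc_refcnt++;
+ *     }
+ *
+ *     if (mod_hash_find_cb(h, key, &val, my_obj_hold) == 0)
+ *         ... a reference is now held on val ...
+ */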
+
+int
+mod_hash_find_cb_rval(mod_hash_t *hash, mod_hash_key_t key, mod_hash_val_t *val,
+ int (*find_cb)(mod_hash_key_t, mod_hash_val_t), int *cb_rval)
+{
+ int res;
+
+ rw_enter(&hash->mh_contents, RW_READER);
+ res = i_mod_hash_find_nosync(hash, key, val);
+ if (res == 0) {
+ *cb_rval = find_cb(key, *val);
+ }
+ rw_exit(&hash->mh_contents);
+
+ return (res);
+}
+
+void
+i_mod_hash_walk_nosync(mod_hash_t *hash,
+ uint_t (*callback)(mod_hash_key_t, mod_hash_val_t *, void *), void *arg)
+{
+ struct mod_hash_entry *e;
+ uint_t hashidx;
+ int res = MH_WALK_CONTINUE;
+
+ for (hashidx = 0;
+ (hashidx < (hash->mh_nchains - 1)) && (res == MH_WALK_CONTINUE);
+ hashidx++) {
+ e = hash->mh_entries[hashidx];
+ while ((e != NULL) && (res == MH_WALK_CONTINUE)) {
+ res = callback(e->mhe_key, e->mhe_val, arg);
+ e = e->mhe_next;
+ }
+ }
+}
+
+/*
+ * mod_hash_walk()
+ * Walks all the elements in the hashtable and invokes the callback
+ * function with the key/value pair for each element. The hashtable
+ * is locked for readers so the callback function should not attempt
+ * to do any updates to the hashtable. The callback function should
+ * return MH_WALK_CONTINUE to continue walking the hashtable or
+ * MH_WALK_TERMINATE to abort the walk of the hashtable.
+ */
+void
+mod_hash_walk(mod_hash_t *hash,
+ uint_t (*callback)(mod_hash_key_t, mod_hash_val_t *, void *), void *arg)
+{
+ rw_enter(&hash->mh_contents, RW_READER);
+ i_mod_hash_walk_nosync(hash, callback, arg);
+ rw_exit(&hash->mh_contents);
+}
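+
+/*
+ * Illustrative walker (a sketch; the element counting is made up):
+ *
+ *     static uint_t
+ *     count_cb(mod_hash_key_t key, mod_hash_val_t *val, void *arg)
+ *     {
+ *         (*(size_t *)arg)++;
+ *         return (MH_WALK_CONTINUE);
+ *     }
+ *
+ *     size_t nelems = 0;
+ *     mod_hash_walk(h, count_cb, &nelems);
+ */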
+
+
+/*
+ * i_mod_hash_clear_nosync()
+ * mod_hash_clear()
+ * Clears the given hash table by calling the destructor of every hash
+ * element and freeing up all mod_hash_entry's.
+ */
+void
+i_mod_hash_clear_nosync(mod_hash_t *hash)
+{
+ int i;
+ struct mod_hash_entry *e, *old_e;
+
+ for (i = 0; i < hash->mh_nchains; i++) {
+ e = hash->mh_entries[i];
+ while (e != NULL) {
+ MH_KEY_DESTROY(hash, e->mhe_key);
+ MH_VAL_DESTROY(hash, e->mhe_val);
+ old_e = e;
+ e = e->mhe_next;
+ kmem_cache_free(mh_e_cache, old_e);
+ }
+ hash->mh_entries[i] = NULL;
+ }
+ hash->mh_stat.mhs_nelems = 0;
+}
+
+void
+mod_hash_clear(mod_hash_t *hash)
+{
+ ASSERT(hash);
+ rw_enter(&hash->mh_contents, RW_WRITER);
+ i_mod_hash_clear_nosync(hash);
+ rw_exit(&hash->mh_contents);
+}
diff --git a/sys/contrib/openzfs/module/icp/spi/kcf_spi.c b/sys/contrib/openzfs/module/icp/spi/kcf_spi.c
new file mode 100644
index 000000000000..34b36b81c0ab
--- /dev/null
+++ b/sys/contrib/openzfs/module/icp/spi/kcf_spi.c
@@ -0,0 +1,925 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * This file is part of the core Kernel Cryptographic Framework.
+ * It implements the SPI functions exported to cryptographic
+ * providers.
+ */
+
+
+#include <sys/zfs_context.h>
+#include <sys/crypto/common.h>
+#include <sys/crypto/impl.h>
+#include <sys/crypto/sched_impl.h>
+#include <sys/crypto/spi.h>
+
+/*
+ * minalloc and maxalloc values to be used for taskq_create().
+ */
+int crypto_taskq_threads = CRYPTO_TASKQ_THREADS;
+int crypto_taskq_minalloc = CRYPTO_TASKQ_MIN;
+int crypto_taskq_maxalloc = CRYPTO_TASKQ_MAX;
+
+static void remove_provider(kcf_provider_desc_t *);
+static void process_logical_providers(crypto_provider_info_t *,
+ kcf_provider_desc_t *);
+static int init_prov_mechs(crypto_provider_info_t *, kcf_provider_desc_t *);
+static int kcf_prov_kstat_update(kstat_t *, int);
+static void delete_kstat(kcf_provider_desc_t *);
+
+static kcf_prov_stats_t kcf_stats_ks_data_template = {
+ { "kcf_ops_total", KSTAT_DATA_UINT64 },
+ { "kcf_ops_passed", KSTAT_DATA_UINT64 },
+ { "kcf_ops_failed", KSTAT_DATA_UINT64 },
+ { "kcf_ops_returned_busy", KSTAT_DATA_UINT64 }
+};
+
+#define KCF_SPI_COPY_OPS(src, dst, ops) if ((src)->ops != NULL) \
+ *((dst)->ops) = *((src)->ops);
+
+/*
+ * Copy an ops vector from src to dst. Used during provider registration
+ * to copy the ops vector from the provider info structure to the
+ * provider descriptor maintained by KCF.
+ * Copying the ops vector specified by the provider is needed since the
+ * framework does not require the provider info structure to be
+ * persistent.
+ */
+static void
+copy_ops_vector_v1(crypto_ops_t *src_ops, crypto_ops_t *dst_ops)
+{
+ KCF_SPI_COPY_OPS(src_ops, dst_ops, co_control_ops);
+ KCF_SPI_COPY_OPS(src_ops, dst_ops, co_digest_ops);
+ KCF_SPI_COPY_OPS(src_ops, dst_ops, co_cipher_ops);
+ KCF_SPI_COPY_OPS(src_ops, dst_ops, co_mac_ops);
+ KCF_SPI_COPY_OPS(src_ops, dst_ops, co_sign_ops);
+ KCF_SPI_COPY_OPS(src_ops, dst_ops, co_verify_ops);
+ KCF_SPI_COPY_OPS(src_ops, dst_ops, co_dual_ops);
+ KCF_SPI_COPY_OPS(src_ops, dst_ops, co_dual_cipher_mac_ops);
+ KCF_SPI_COPY_OPS(src_ops, dst_ops, co_random_ops);
+ KCF_SPI_COPY_OPS(src_ops, dst_ops, co_session_ops);
+ KCF_SPI_COPY_OPS(src_ops, dst_ops, co_object_ops);
+ KCF_SPI_COPY_OPS(src_ops, dst_ops, co_key_ops);
+ KCF_SPI_COPY_OPS(src_ops, dst_ops, co_provider_ops);
+ KCF_SPI_COPY_OPS(src_ops, dst_ops, co_ctx_ops);
+}
+
+static void
+copy_ops_vector_v2(crypto_ops_t *src_ops, crypto_ops_t *dst_ops)
+{
+ KCF_SPI_COPY_OPS(src_ops, dst_ops, co_mech_ops);
+}
+
+static void
+copy_ops_vector_v3(crypto_ops_t *src_ops, crypto_ops_t *dst_ops)
+{
+ KCF_SPI_COPY_OPS(src_ops, dst_ops, co_nostore_key_ops);
+}
+
+/*
+ * This routine is used to add cryptographic providers to the KCF framework.
+ * Providers pass a crypto_provider_info structure to crypto_register_provider()
+ * and get back a handle. The crypto_provider_info structure contains a
+ * list of mechanisms supported by the provider and an ops vector containing
+ * provider entry points. Hardware providers call this routine in their attach
+ * routines. Software providers call this routine in their _init() routine.
+ */
+int
+crypto_register_provider(crypto_provider_info_t *info,
+ crypto_kcf_provider_handle_t *handle)
+{
+ char *ks_name;
+
+ kcf_provider_desc_t *prov_desc = NULL;
+ int ret = CRYPTO_ARGUMENTS_BAD;
+
+ if (info->pi_interface_version > CRYPTO_SPI_VERSION_3)
+ return (CRYPTO_VERSION_MISMATCH);
+
+ /*
+ * Check provider type, must be software, hardware, or logical.
+ */
+ if (info->pi_provider_type != CRYPTO_HW_PROVIDER &&
+ info->pi_provider_type != CRYPTO_SW_PROVIDER &&
+ info->pi_provider_type != CRYPTO_LOGICAL_PROVIDER)
+ return (CRYPTO_ARGUMENTS_BAD);
+
+ /*
+ * Allocate and initialize a new provider descriptor. We also
+ * hold it and release it when done.
+ */
+ prov_desc = kcf_alloc_provider_desc(info);
+ KCF_PROV_REFHOLD(prov_desc);
+
+ prov_desc->pd_prov_type = info->pi_provider_type;
+
+ /* provider-private handle, opaque to KCF */
+ prov_desc->pd_prov_handle = info->pi_provider_handle;
+
+ /* copy provider description string */
+ if (info->pi_provider_description != NULL) {
+ /*
+ * pi_provider_description is a string that can contain
+ * up to CRYPTO_PROVIDER_DESCR_MAX_LEN + 1 characters
+ * INCLUDING the terminating null character. A bcopy()
+ * is necessary here as pd_description should not have
+ * a null character. See comments in kcf_alloc_provider_desc()
+ * for details on pd_description field.
+ */
+ bcopy(info->pi_provider_description, prov_desc->pd_description,
+ MIN(strlen(info->pi_provider_description),
+ (size_t)CRYPTO_PROVIDER_DESCR_MAX_LEN));
+ }
+
+ if (info->pi_provider_type != CRYPTO_LOGICAL_PROVIDER) {
+ if (info->pi_ops_vector == NULL) {
+ goto bail;
+ }
+ copy_ops_vector_v1(info->pi_ops_vector,
+ prov_desc->pd_ops_vector);
+ if (info->pi_interface_version >= CRYPTO_SPI_VERSION_2) {
+ copy_ops_vector_v2(info->pi_ops_vector,
+ prov_desc->pd_ops_vector);
+ prov_desc->pd_flags = info->pi_flags;
+ }
+ if (info->pi_interface_version == CRYPTO_SPI_VERSION_3) {
+ copy_ops_vector_v3(info->pi_ops_vector,
+ prov_desc->pd_ops_vector);
+ }
+ }
+
+ /* object_ops and nostore_key_ops are mutually exclusive */
+ if (prov_desc->pd_ops_vector->co_object_ops &&
+ prov_desc->pd_ops_vector->co_nostore_key_ops) {
+ goto bail;
+ }
+
+ /* process the mechanisms supported by the provider */
+ if ((ret = init_prov_mechs(info, prov_desc)) != CRYPTO_SUCCESS)
+ goto bail;
+
+ /*
+ * Add provider to providers tables, also sets the descriptor
+ * pd_prov_id field.
+ */
+ if ((ret = kcf_prov_tab_add_provider(prov_desc)) != CRYPTO_SUCCESS) {
+ undo_register_provider(prov_desc, B_FALSE);
+ goto bail;
+ }
+
+ /*
+ * We create a taskq only for a hardware provider. The global
+ * software queue is used for software providers. We handle ordering
+ * of multi-part requests in the taskq routine. So, it is safe to
+ * have multiple threads for the taskq. We pass TASKQ_PREPOPULATE flag
+ * to keep some entries cached to improve performance.
+ */
+ if (prov_desc->pd_prov_type == CRYPTO_HW_PROVIDER)
+ prov_desc->pd_sched_info.ks_taskq = taskq_create("kcf_taskq",
+ crypto_taskq_threads, minclsyspri,
+ crypto_taskq_minalloc, crypto_taskq_maxalloc,
+ TASKQ_PREPOPULATE);
+ else
+ prov_desc->pd_sched_info.ks_taskq = NULL;
+
+ /* no kernel session to logical providers */
+ if (prov_desc->pd_prov_type != CRYPTO_LOGICAL_PROVIDER) {
+ /*
+ * Open a session for session-oriented providers. This session
+ * is used for all kernel consumers. This is fine as a provider
+ * is required to support multiple thread access to a session.
+ * We can do this only after the taskq has been created as we
+ * do a kcf_submit_request() to open the session.
+ */
+ if (KCF_PROV_SESSION_OPS(prov_desc) != NULL) {
+ kcf_req_params_t params;
+
+ KCF_WRAP_SESSION_OPS_PARAMS(&params,
+ KCF_OP_SESSION_OPEN, &prov_desc->pd_sid, 0,
+ CRYPTO_USER, NULL, 0, prov_desc);
+ ret = kcf_submit_request(prov_desc, NULL, NULL, &params,
+ B_FALSE);
+
+ if (ret != CRYPTO_SUCCESS) {
+ undo_register_provider(prov_desc, B_TRUE);
+ ret = CRYPTO_FAILED;
+ goto bail;
+ }
+ }
+ }
+
+ if (prov_desc->pd_prov_type != CRYPTO_LOGICAL_PROVIDER) {
+ /*
+ * Create the kstat for this provider. There is a kstat
+ * installed for each successfully registered provider.
+ * This kstat is deleted when the provider unregisters.
+ */
+ if (prov_desc->pd_prov_type == CRYPTO_SW_PROVIDER) {
+ ks_name = kmem_asprintf("%s_%s",
+ "NONAME", "provider_stats");
+ } else {
+ ks_name = kmem_asprintf("%s_%d_%u_%s",
+ "NONAME", 0, prov_desc->pd_prov_id,
+ "provider_stats");
+ }
+
+ prov_desc->pd_kstat = kstat_create("kcf", 0, ks_name, "crypto",
+ KSTAT_TYPE_NAMED, sizeof (kcf_prov_stats_t) /
+ sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
+
+ if (prov_desc->pd_kstat != NULL) {
+ bcopy(&kcf_stats_ks_data_template,
+ &prov_desc->pd_ks_data,
+ sizeof (kcf_stats_ks_data_template));
+ prov_desc->pd_kstat->ks_data = &prov_desc->pd_ks_data;
+ KCF_PROV_REFHOLD(prov_desc);
+ KCF_PROV_IREFHOLD(prov_desc);
+ prov_desc->pd_kstat->ks_private = prov_desc;
+ prov_desc->pd_kstat->ks_update = kcf_prov_kstat_update;
+ kstat_install(prov_desc->pd_kstat);
+ }
+ kmem_strfree(ks_name);
+ }
+
+ if (prov_desc->pd_prov_type == CRYPTO_HW_PROVIDER)
+ process_logical_providers(info, prov_desc);
+
+ mutex_enter(&prov_desc->pd_lock);
+ prov_desc->pd_state = KCF_PROV_READY;
+ mutex_exit(&prov_desc->pd_lock);
+ kcf_do_notify(prov_desc, B_TRUE);
+
+ *handle = prov_desc->pd_kcf_prov_handle;
+ ret = CRYPTO_SUCCESS;
+
+bail:
+ KCF_PROV_REFRELE(prov_desc);
+ return (ret);
+}
+
+/*
+ * This routine is used to notify the framework when a provider is being
+ * removed. Hardware providers call this routine in their detach routines.
+ * Software providers call this routine in their _fini() routine.
+ */
+int
+crypto_unregister_provider(crypto_kcf_provider_handle_t handle)
+{
+ uint_t mech_idx;
+ kcf_provider_desc_t *desc;
+ kcf_prov_state_t saved_state;
+
+ /* lookup provider descriptor */
+ if ((desc = kcf_prov_tab_lookup((crypto_provider_id_t)handle)) == NULL)
+ return (CRYPTO_UNKNOWN_PROVIDER);
+
+ mutex_enter(&desc->pd_lock);
+ /*
+ * Check if any other thread is disabling or removing
+ * this provider. We return if this is the case.
+ */
+ if (desc->pd_state >= KCF_PROV_DISABLED) {
+ mutex_exit(&desc->pd_lock);
+ /* Release reference held by kcf_prov_tab_lookup(). */
+ KCF_PROV_REFRELE(desc);
+ return (CRYPTO_BUSY);
+ }
+
+ saved_state = desc->pd_state;
+ desc->pd_state = KCF_PROV_REMOVED;
+
+ if (saved_state == KCF_PROV_BUSY) {
+ /*
+ * The per-provider taskq threads may be waiting. We
+ * signal them so that they can start failing requests.
+ */
+ cv_broadcast(&desc->pd_resume_cv);
+ }
+
+ if (desc->pd_prov_type == CRYPTO_SW_PROVIDER) {
+ /*
+ * Check if this provider is currently being used.
+ * pd_irefcnt is the number of holds from the internal
+ * structures. We add one to account for the above lookup.
+ */
+ if (desc->pd_refcnt > desc->pd_irefcnt + 1) {
+ desc->pd_state = saved_state;
+ mutex_exit(&desc->pd_lock);
+ /* Release reference held by kcf_prov_tab_lookup(). */
+ KCF_PROV_REFRELE(desc);
+ /*
+ * The administrator presumably will stop the clients
+ * thus removing the holds, when they get the busy
+ * return value. Any retry will succeed then.
+ */
+ return (CRYPTO_BUSY);
+ }
+ }
+ mutex_exit(&desc->pd_lock);
+
+ if (desc->pd_prov_type != CRYPTO_SW_PROVIDER) {
+ remove_provider(desc);
+ }
+
+ if (desc->pd_prov_type != CRYPTO_LOGICAL_PROVIDER) {
+ /* remove the provider from the mechanisms tables */
+ for (mech_idx = 0; mech_idx < desc->pd_mech_list_count;
+ mech_idx++) {
+ kcf_remove_mech_provider(
+ desc->pd_mechanisms[mech_idx].cm_mech_name, desc);
+ }
+ }
+
+ /* remove provider from providers table */
+ if (kcf_prov_tab_rem_provider((crypto_provider_id_t)handle) !=
+ CRYPTO_SUCCESS) {
+ /* Release reference held by kcf_prov_tab_lookup(). */
+ KCF_PROV_REFRELE(desc);
+ return (CRYPTO_UNKNOWN_PROVIDER);
+ }
+
+ delete_kstat(desc);
+
+ if (desc->pd_prov_type == CRYPTO_SW_PROVIDER) {
+ /* Release reference held by kcf_prov_tab_lookup(). */
+ KCF_PROV_REFRELE(desc);
+
+ /*
+ * Wait until the existing requests complete.
+ */
+ mutex_enter(&desc->pd_lock);
+ while (desc->pd_state != KCF_PROV_FREED)
+ cv_wait(&desc->pd_remove_cv, &desc->pd_lock);
+ mutex_exit(&desc->pd_lock);
+ } else {
+ /*
+ * Wait until requests that have been sent to the provider
+ * complete.
+ */
+ mutex_enter(&desc->pd_lock);
+ while (desc->pd_irefcnt > 0)
+ cv_wait(&desc->pd_remove_cv, &desc->pd_lock);
+ mutex_exit(&desc->pd_lock);
+ }
+
+ kcf_do_notify(desc, B_FALSE);
+
+ if (desc->pd_prov_type == CRYPTO_SW_PROVIDER) {
+ /*
+ * This is the only place where kcf_free_provider_desc()
+ * is called directly. KCF_PROV_REFRELE() should free the
+ * structure in all other places.
+ */
+ ASSERT(desc->pd_state == KCF_PROV_FREED &&
+ desc->pd_refcnt == 0);
+ kcf_free_provider_desc(desc);
+ } else {
+ KCF_PROV_REFRELE(desc);
+ }
+
+ return (CRYPTO_SUCCESS);
+}
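For context, a minimal sketch of the caller side of this routine, i.e. what a software provider's _fini() path might look like. This is not part of the diff; the variable example_prov_handle, the function name, and the errno mapping are illustrative assumptions.

/*
 * Sketch only: software-provider _fini() using the handle saved at
 * registration time (example_prov_handle is a hypothetical name).
 */
static crypto_kcf_provider_handle_t example_prov_handle;

static int
example_provider_fini(void)
{
	int ret;

	ret = crypto_unregister_provider(example_prov_handle);
	if (ret == CRYPTO_BUSY)
		return (EBUSY);		/* clients still hold references */
	if (ret != CRYPTO_SUCCESS)
		return (EINVAL);	/* e.g. CRYPTO_UNKNOWN_PROVIDER */
	return (0);
}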
+
+/*
+ * This routine is used to notify the framework that the state of
+ * a cryptographic provider has changed. Valid state codes are:
+ *
+ * CRYPTO_PROVIDER_READY
+ * The provider indicates that it can process more requests. A provider
+ * sends this event if it has previously notified the framework with
+ * CRYPTO_PROVIDER_BUSY.
+ *
+ * CRYPTO_PROVIDER_BUSY
+ * The provider cannot accept more requests.
+ *
+ * CRYPTO_PROVIDER_FAILED
+ * The provider encountered an internal error. The framework will not
+ * send any more requests to the provider. The provider may send
+ * CRYPTO_PROVIDER_READY if it recovers from the error.
+ *
+ * This routine can be called from user or interrupt context.
+ */
+void
+crypto_provider_notification(crypto_kcf_provider_handle_t handle, uint_t state)
+{
+ kcf_provider_desc_t *pd;
+
+ /* lookup the provider from the given handle */
+ if ((pd = kcf_prov_tab_lookup((crypto_provider_id_t)handle)) == NULL)
+ return;
+
+ mutex_enter(&pd->pd_lock);
+
+ if (pd->pd_state <= KCF_PROV_VERIFICATION_FAILED)
+ goto out;
+
+ if (pd->pd_prov_type == CRYPTO_LOGICAL_PROVIDER) {
+ cmn_err(CE_WARN, "crypto_provider_notification: "
+ "logical provider (%x) ignored\n", handle);
+ goto out;
+ }
+ switch (state) {
+ case CRYPTO_PROVIDER_READY:
+ switch (pd->pd_state) {
+ case KCF_PROV_BUSY:
+ pd->pd_state = KCF_PROV_READY;
+ /*
+ * Signal the per-provider taskq threads that they
+ * can start submitting requests.
+ */
+ cv_broadcast(&pd->pd_resume_cv);
+ break;
+
+ case KCF_PROV_FAILED:
+ /*
+ * The provider recovered from the error. Let us
+ * use it now.
+ */
+ pd->pd_state = KCF_PROV_READY;
+ break;
+ default:
+ break;
+ }
+ break;
+
+ case CRYPTO_PROVIDER_BUSY:
+ switch (pd->pd_state) {
+ case KCF_PROV_READY:
+ pd->pd_state = KCF_PROV_BUSY;
+ break;
+ default:
+ break;
+ }
+ break;
+
+ case CRYPTO_PROVIDER_FAILED:
+ /*
+ * We note the failure and return. The per-provider taskq
+ * threads check this flag and start failing the
+ * requests, if it is set. See process_req_hwp() for details.
+ */
+ switch (pd->pd_state) {
+ case KCF_PROV_READY:
+ pd->pd_state = KCF_PROV_FAILED;
+ break;
+
+ case KCF_PROV_BUSY:
+ pd->pd_state = KCF_PROV_FAILED;
+ /*
+ * The per-provider taskq threads may be waiting. We
+ * signal them so that they can start failing requests.
+ */
+ cv_broadcast(&pd->pd_resume_cv);
+ break;
+ default:
+ break;
+ }
+ break;
+ default:
+ break;
+ }
+out:
+ mutex_exit(&pd->pd_lock);
+ KCF_PROV_REFRELE(pd);
+}
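A short sketch of how a hardware provider might use this notification pair around a transient queue-full condition. It is not part of this commit; example_prov_handle and the function names are hypothetical.

/*
 * Sketch only: report a transient queue-full condition and its recovery.
 */
static void
example_queue_full(void)
{
	/* Stop the framework from dispatching new requests to us. */
	crypto_provider_notification(example_prov_handle, CRYPTO_PROVIDER_BUSY);
}

static void
example_queue_drained(void)
{
	/* Resume; the per-provider taskq threads are woken via pd_resume_cv. */
	crypto_provider_notification(example_prov_handle, CRYPTO_PROVIDER_READY);
}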
+
+/*
+ * This routine is used to notify the framework of the result of
+ * an asynchronous request handled by a provider. Valid error
+ * codes are the same as the CRYPTO_* errors defined in common.h.
+ *
+ * This routine can be called from user or interrupt context.
+ */
+void
+crypto_op_notification(crypto_req_handle_t handle, int error)
+{
+ kcf_call_type_t ctype;
+
+ if (handle == NULL)
+ return;
+
+ if ((ctype = GET_REQ_TYPE(handle)) == CRYPTO_SYNCH) {
+ kcf_sreq_node_t *sreq = (kcf_sreq_node_t *)handle;
+
+ if (error != CRYPTO_SUCCESS)
+ sreq->sn_provider->pd_sched_info.ks_nfails++;
+ KCF_PROV_IREFRELE(sreq->sn_provider);
+ kcf_sop_done(sreq, error);
+ } else {
+ kcf_areq_node_t *areq = (kcf_areq_node_t *)handle;
+
+ ASSERT(ctype == CRYPTO_ASYNCH);
+ if (error != CRYPTO_SUCCESS)
+ areq->an_provider->pd_sched_info.ks_nfails++;
+ KCF_PROV_IREFRELE(areq->an_provider);
+ kcf_aop_done(areq, error);
+ }
+}
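The call pattern on the provider side is simply to hand the request handle back with a CRYPTO_* status once the work finishes, e.g. from an interrupt or taskq completion path. A sketch with hypothetical names, not taken from this commit:

/*
 * Sketch only: a provider's completion path reporting an asynchronous
 * result back to the framework.
 */
static void
example_complete_request(crypto_req_handle_t req, int crypto_status)
{
	/* crypto_status is one of the CRYPTO_* codes from common.h. */
	crypto_op_notification(req, crypto_status);
}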
+
+/*
+ * This routine is used by software providers to determine
+ * whether to use KM_SLEEP or KM_NOSLEEP during memory allocation.
+ * Note that hardware providers can always use KM_SLEEP. So,
+ * they do not need to call this routine.
+ *
+ * This routine can be called from user or interrupt context.
+ */
+int
+crypto_kmflag(crypto_req_handle_t handle)
+{
+ return (REQHNDL2_KMFLAG(handle));
+}
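As a usage sketch (not part of the diff), a software provider would typically thread the request handle through to its allocations and let crypto_kmflag() choose between KM_SLEEP and KM_NOSLEEP; the helper name below is hypothetical.

/*
 * Sketch only: allocate a per-request context with the sleep behaviour
 * appropriate to the calling context of the request.
 */
static void *
example_alloc_ctx(crypto_req_handle_t req, size_t len)
{
	return (kmem_alloc(len, crypto_kmflag(req)));
}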
+
+/*
+ * Process the mechanism info structures specified by the provider
+ * during registration. A NULL crypto_provider_info_t indicates
+ * an already initialized provider descriptor.
+ *
+ * Mechanisms are not added to the kernel's mechanism table if the
+ * provider is a logical provider.
+ *
+ * Returns CRYPTO_SUCCESS on success, CRYPTO_ARGUMENTS_BAD if one
+ * of the specified mechanisms was malformed, or CRYPTO_HOST_MEMORY
+ * if the table of mechanisms is full.
+ */
+static int
+init_prov_mechs(crypto_provider_info_t *info, kcf_provider_desc_t *desc)
+{
+ uint_t mech_idx;
+ uint_t cleanup_idx;
+ int err = CRYPTO_SUCCESS;
+ kcf_prov_mech_desc_t *pmd;
+ int desc_use_count = 0;
+ int mcount = desc->pd_mech_list_count;
+
+ if (desc->pd_prov_type == CRYPTO_LOGICAL_PROVIDER) {
+ if (info != NULL) {
+ ASSERT(info->pi_mechanisms != NULL);
+ bcopy(info->pi_mechanisms, desc->pd_mechanisms,
+ sizeof (crypto_mech_info_t) * mcount);
+ }
+ return (CRYPTO_SUCCESS);
+ }
+
+ /*
+ * Copy the mechanism list from the provider info to the provider
+ * descriptor. desc->pd_mechanisms has an extra crypto_mech_info_t
+ * element if the provider has random_ops since we keep an internal
+ * mechanism, SUN_RANDOM, in this case.
+ */
+ if (info != NULL) {
+ if (info->pi_ops_vector->co_random_ops != NULL) {
+ crypto_mech_info_t *rand_mi;
+
+ /*
+ * Need the following check as it is possible to have
+ * a provider that implements just random_ops and has
+ * pi_mechanisms == NULL.
+ */
+ if (info->pi_mechanisms != NULL) {
+ bcopy(info->pi_mechanisms, desc->pd_mechanisms,
+ sizeof (crypto_mech_info_t) * (mcount - 1));
+ }
+ rand_mi = &desc->pd_mechanisms[mcount - 1];
+
+ bzero(rand_mi, sizeof (crypto_mech_info_t));
+ (void) strncpy(rand_mi->cm_mech_name, SUN_RANDOM,
+ CRYPTO_MAX_MECH_NAME);
+ rand_mi->cm_func_group_mask = CRYPTO_FG_RANDOM;
+ } else {
+ ASSERT(info->pi_mechanisms != NULL);
+ bcopy(info->pi_mechanisms, desc->pd_mechanisms,
+ sizeof (crypto_mech_info_t) * mcount);
+ }
+ }
+
+ /*
+ * For each mechanism supported by the provider, add the provider
+ * to the corresponding KCF mechanism mech_entry chain.
+ */
+ for (mech_idx = 0; mech_idx < desc->pd_mech_list_count; mech_idx++) {
+ crypto_mech_info_t *mi = &desc->pd_mechanisms[mech_idx];
+
+ if ((mi->cm_mech_flags & CRYPTO_KEYSIZE_UNIT_IN_BITS) &&
+ (mi->cm_mech_flags & CRYPTO_KEYSIZE_UNIT_IN_BYTES)) {
+ err = CRYPTO_ARGUMENTS_BAD;
+ break;
+ }
+
+ if (desc->pd_flags & CRYPTO_HASH_NO_UPDATE &&
+ mi->cm_func_group_mask & CRYPTO_FG_DIGEST) {
+ /*
+ * We ask the provider to specify the limit
+ * per hash mechanism. But, in practice, a
+ * hardware limitation means all hash mechanisms
+ * will have the same maximum size allowed for
+ * input data. So, we make it a per provider
+ * limit to keep it simple.
+ */
+ if (mi->cm_max_input_length == 0) {
+ err = CRYPTO_ARGUMENTS_BAD;
+ break;
+ } else {
+ desc->pd_hash_limit = mi->cm_max_input_length;
+ }
+ }
+
+ if ((err = kcf_add_mech_provider(mech_idx, desc, &pmd)) !=
+ KCF_SUCCESS)
+ break;
+
+ if (pmd == NULL)
+ continue;
+
+ /* The provider will be used for this mechanism */
+ desc_use_count++;
+ }
+
+ /*
+ * Don't allow multiple software providers with disabled mechanisms
+ * to register. Subsequent enabling of mechanisms will result in
+ * an unsupported configuration, i.e. multiple software providers
+ * per mechanism.
+ */
+ if (desc_use_count == 0 && desc->pd_prov_type == CRYPTO_SW_PROVIDER)
+ return (CRYPTO_ARGUMENTS_BAD);
+
+ if (err == KCF_SUCCESS)
+ return (CRYPTO_SUCCESS);
+
+ /*
+ * An error occurred while adding a mechanism; clean up
+ * and bail.
+ */
+ for (cleanup_idx = 0; cleanup_idx < mech_idx; cleanup_idx++) {
+ kcf_remove_mech_provider(
+ desc->pd_mechanisms[cleanup_idx].cm_mech_name, desc);
+ }
+
+ if (err == KCF_MECH_TAB_FULL)
+ return (CRYPTO_HOST_MEMORY);
+
+ return (CRYPTO_ARGUMENTS_BAD);
+}
+
+/*
+ * Update routine for the provider kstat. Only privileged users are
+ * allowed to access this information, since it is sensitive: some
+ * cryptographic attacks (e.g. traffic analysis) can make use of it.
+ */
+static int
+kcf_prov_kstat_update(kstat_t *ksp, int rw)
+{
+ kcf_prov_stats_t *ks_data;
+ kcf_provider_desc_t *pd = (kcf_provider_desc_t *)ksp->ks_private;
+
+ if (rw == KSTAT_WRITE)
+ return (EACCES);
+
+ ks_data = ksp->ks_data;
+
+ ks_data->ps_ops_total.value.ui64 = pd->pd_sched_info.ks_ndispatches;
+ ks_data->ps_ops_failed.value.ui64 = pd->pd_sched_info.ks_nfails;
+ ks_data->ps_ops_busy_rval.value.ui64 = pd->pd_sched_info.ks_nbusy_rval;
+ ks_data->ps_ops_passed.value.ui64 =
+ pd->pd_sched_info.ks_ndispatches -
+ pd->pd_sched_info.ks_nfails -
+ pd->pd_sched_info.ks_nbusy_rval;
+
+ return (0);
+}
+
+
+/*
+ * Utility routine called from failure paths in crypto_register_provider()
+ * and from crypto_load_soft_disabled().
+ */
+void
+undo_register_provider(kcf_provider_desc_t *desc, boolean_t remove_prov)
+{
+ uint_t mech_idx;
+
+ /* remove the provider from the mechanisms tables */
+ for (mech_idx = 0; mech_idx < desc->pd_mech_list_count;
+ mech_idx++) {
+ kcf_remove_mech_provider(
+ desc->pd_mechanisms[mech_idx].cm_mech_name, desc);
+ }
+
+ /* remove provider from providers table */
+ if (remove_prov)
+ (void) kcf_prov_tab_rem_provider(desc->pd_prov_id);
+}
+
+/*
+ * Utility routine called from crypto_load_soft_disabled(). Callers
+ * should have done a prior undo_register_provider().
+ */
+void
+redo_register_provider(kcf_provider_desc_t *pd)
+{
+ /* process the mechanisms supported by the provider */
+ (void) init_prov_mechs(NULL, pd);
+
+ /*
+ * Hold provider in providers table. We should not call
+ * kcf_prov_tab_add_provider() here as the provider descriptor
+ * is still valid which means it has an entry in the provider
+ * table.
+ */
+ KCF_PROV_REFHOLD(pd);
+ KCF_PROV_IREFHOLD(pd);
+}
+
+/*
+ * Add provider (p1) to another provider's array of providers (p2).
+ * Hardware and logical providers use this array to cross-reference
+ * each other.
+ */
+static void
+add_provider_to_array(kcf_provider_desc_t *p1, kcf_provider_desc_t *p2)
+{
+ kcf_provider_list_t *new;
+
+ new = kmem_alloc(sizeof (kcf_provider_list_t), KM_SLEEP);
+ mutex_enter(&p2->pd_lock);
+ new->pl_next = p2->pd_provider_list;
+ p2->pd_provider_list = new;
+ KCF_PROV_IREFHOLD(p1);
+ new->pl_provider = p1;
+ mutex_exit(&p2->pd_lock);
+}
+
+/*
+ * Remove provider (p1) from another provider's array of providers (p2).
+ * Hardware and logical providers use this array to cross-reference
+ * each other.
+ */
+static void
+remove_provider_from_array(kcf_provider_desc_t *p1, kcf_provider_desc_t *p2)
+{
+ kcf_provider_list_t *pl = NULL, **prev;
+
+ mutex_enter(&p2->pd_lock);
+ for (pl = p2->pd_provider_list, prev = &p2->pd_provider_list;
+ pl != NULL; prev = &pl->pl_next, pl = pl->pl_next) {
+ if (pl->pl_provider == p1) {
+ break;
+ }
+ }
+
+ /* Nothing to detach if p1 was not found on p2's list. */
+ if (pl == NULL) {
+ mutex_exit(&p2->pd_lock);
+ return;
+ }
+
+ /* detach and free kcf_provider_list structure */
+ KCF_PROV_IREFRELE(p1);
+ *prev = pl->pl_next;
+ kmem_free(pl, sizeof (*pl));
+ mutex_exit(&p2->pd_lock);
+}
+
+/*
+ * Convert an array of logical provider handles (crypto_provider_id)
+ * stored in a crypto_provider_info structure into an array of provider
+ * descriptors (kcf_provider_desc_t) attached to a logical provider.
+ */
+static void
+process_logical_providers(crypto_provider_info_t *info, kcf_provider_desc_t *hp)
+{
+ kcf_provider_desc_t *lp;
+ crypto_provider_id_t handle;
+ int count = info->pi_logical_provider_count;
+ int i;
+
+ /* add hardware provider to each logical provider */
+ for (i = 0; i < count; i++) {
+ handle = info->pi_logical_providers[i];
+ lp = kcf_prov_tab_lookup((crypto_provider_id_t)handle);
+ if (lp == NULL) {
+ continue;
+ }
+ add_provider_to_array(hp, lp);
+ hp->pd_flags |= KCF_LPROV_MEMBER;
+
+ /*
+ * A hardware provider has to have the provider descriptor of
+ * every logical provider it belongs to, so it can be removed
+ * from the logical provider if the hardware provider
+ * unregisters from the framework.
+ */
+ add_provider_to_array(lp, hp);
+ KCF_PROV_REFRELE(lp);
+ }
+}
+
+/*
+ * This routine removes a provider from all of the logical or
+ * hardware providers it belongs to, and frees the provider's
+ * array of pointers to providers.
+ */
+static void
+remove_provider(kcf_provider_desc_t *pp)
+{
+ kcf_provider_desc_t *p;
+ kcf_provider_list_t *e, *next;
+
+ mutex_enter(&pp->pd_lock);
+ for (e = pp->pd_provider_list; e != NULL; e = next) {
+ p = e->pl_provider;
+ remove_provider_from_array(pp, p);
+ if (p->pd_prov_type == CRYPTO_HW_PROVIDER &&
+ p->pd_provider_list == NULL)
+ p->pd_flags &= ~KCF_LPROV_MEMBER;
+ KCF_PROV_IREFRELE(p);
+ next = e->pl_next;
+ kmem_free(e, sizeof (*e));
+ }
+ pp->pd_provider_list = NULL;
+ mutex_exit(&pp->pd_lock);
+}
+
+/*
+ * Dispatch events as needed for a provider. is_added flag tells
+ * whether the provider is registering or unregistering.
+ */
+void
+kcf_do_notify(kcf_provider_desc_t *prov_desc, boolean_t is_added)
+{
+ int i;
+ crypto_notify_event_change_t ec;
+
+ ASSERT(prov_desc->pd_state > KCF_PROV_VERIFICATION_FAILED);
+
+ /*
+ * Inform interested clients of the mechanisms becoming
+ * available/unavailable. We skip this for logical providers
+ * as they do not affect mechanisms.
+ */
+ if (prov_desc->pd_prov_type != CRYPTO_LOGICAL_PROVIDER) {
+ ec.ec_provider_type = prov_desc->pd_prov_type;
+ ec.ec_change = is_added ? CRYPTO_MECH_ADDED :
+ CRYPTO_MECH_REMOVED;
+ for (i = 0; i < prov_desc->pd_mech_list_count; i++) {
+ (void) strlcpy(ec.ec_mech_name,
+ prov_desc->pd_mechanisms[i].cm_mech_name,
+ CRYPTO_MAX_MECH_NAME);
+ kcf_walk_ntfylist(CRYPTO_EVENT_MECHS_CHANGED, &ec);
+ }
+
+ }
+
+ /*
+ * Inform interested clients about the new or departing provider.
+ * In case of a logical provider, we need to notify the event only
+ * for the logical provider and not for the underlying
+ * providers which are known by the KCF_LPROV_MEMBER bit.
+ */
+ if (prov_desc->pd_prov_type == CRYPTO_LOGICAL_PROVIDER ||
+ (prov_desc->pd_flags & KCF_LPROV_MEMBER) == 0) {
+ kcf_walk_ntfylist(is_added ? CRYPTO_EVENT_PROVIDER_REGISTERED :
+ CRYPTO_EVENT_PROVIDER_UNREGISTERED, prov_desc);
+ }
+}
+
+static void
+delete_kstat(kcf_provider_desc_t *desc)
+{
+ /* destroy the kstat created for this provider */
+ if (desc->pd_kstat != NULL) {
+ kcf_provider_desc_t *kspd = desc->pd_kstat->ks_private;
+
+ /* release reference held by desc->pd_kstat->ks_private */
+ ASSERT(desc == kspd);
+ kstat_delete(kspd->pd_kstat);
+ desc->pd_kstat = NULL;
+ KCF_PROV_REFRELE(kspd);
+ KCF_PROV_IREFRELE(kspd);
+ }
+}
diff --git a/sys/contrib/openzfs/module/lua/Makefile.in b/sys/contrib/openzfs/module/lua/Makefile.in
new file mode 100644
index 000000000000..0a74c17e64e8
--- /dev/null
+++ b/sys/contrib/openzfs/module/lua/Makefile.in
@@ -0,0 +1,39 @@
+ifneq ($(KBUILD_EXTMOD),)
+src = @abs_srcdir@
+obj = @abs_builddir@
+endif
+
+MODULE := zlua
+
+obj-$(CONFIG_ZFS) := $(MODULE).o
+
+ccflags-y := -DLUA_USE_LONGLONG
+
+$(MODULE)-objs += lapi.o
+$(MODULE)-objs += lauxlib.o
+$(MODULE)-objs += lbaselib.o
+$(MODULE)-objs += lcode.o
+$(MODULE)-objs += lcompat.o
+$(MODULE)-objs += lcorolib.o
+$(MODULE)-objs += lctype.o
+$(MODULE)-objs += ldebug.o
+$(MODULE)-objs += ldo.o
+$(MODULE)-objs += lfunc.o
+$(MODULE)-objs += lgc.o
+$(MODULE)-objs += llex.o
+$(MODULE)-objs += lmem.o
+$(MODULE)-objs += lobject.o
+$(MODULE)-objs += lopcodes.o
+$(MODULE)-objs += lparser.o
+$(MODULE)-objs += lstate.o
+$(MODULE)-objs += lstring.o
+$(MODULE)-objs += lstrlib.o
+$(MODULE)-objs += ltable.o
+$(MODULE)-objs += ltablib.o
+$(MODULE)-objs += ltm.o
+$(MODULE)-objs += lvm.o
+$(MODULE)-objs += lzio.o
+$(MODULE)-objs += setjmp/setjmp.o
+
+all:
+ mkdir -p setjmp
diff --git a/sys/contrib/openzfs/module/lua/README.zfs b/sys/contrib/openzfs/module/lua/README.zfs
new file mode 100644
index 000000000000..0e22de7a4a18
--- /dev/null
+++ b/sys/contrib/openzfs/module/lua/README.zfs
@@ -0,0 +1,80 @@
+#
+# CDDL HEADER START
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2017 by Delphix. All rights reserved.
+#
+
+Introduction
+------------
+
+This README describes the Lua interpreter source code that lives in the ZFS
+source tree to enable execution of ZFS channel programs, including its
+maintenance policy, the modifications that have been made to it, and how it
+should (and should not) be used.
+
+For a description of the Lua language and features exposed by ZFS channel
+programs, please refer to the zfs-program(1m) man page instead.
+
+
+Maintenance policy
+------------------
+
+The Lua runtime is considered stable software. Channel programs don't need much
+complicated logic, so updates to the Lua runtime from upstream are viewed as
+nice-to-have, but not required for channel programs to be well-supported. As
+such, the Lua runtime in ZFS should be updated on an as-needed basis for
+security vulnerabilities, but not much else.
+
+
+Modifications to Lua
+--------------------
+
+The version of the Lua runtime we're using in ZFS has been modified in a variety
+of ways to make it more useful for the specific purpose of running channel
+programs. These changes include:
+
+1. "Normal" Lua uses floating point for all numbers it stores, but those aren't
+ useful inside ZFS / the kernel. We have changed the runtime to use int64_t
+ throughout for all numbers.
+2. Some of the Lua standard libraries do file I/O or spawn processes, but
+ neither of these make sense from inside channel programs. We have removed
+ those libraries rather than reimplementing them using kernel APIs.
+3. The "normal" Lua runtime handles errors by failing fatally, but since this
+ version of Lua runs inside the kernel we must handle these failures and
+ return meaningful error codes to userland. We have customized the Lua
+ failure paths so that they aren't fatal.
+4. Running poorly-vetted code inside the kernel is always a risk; even if the
+ ability to do so is restricted to the root user, it's still possible to write
+ an incorrect program that results in an infinite loop or massive memory use.
+ We've added new protections into the Lua interpreter to limit the runtime
+ (measured in number of Lua instructions run) and memory overhead of running
+ a channel program.
+5. The Lua bytecode is not designed to be secure / safe, so it would be easy to
+ pass invalid bytecode which can panic the kernel. By comparison, the parser
+ is hardened and fails gracefully on invalid input. Therefore, we only accept
+ Lua source code at the ioctl level and then interpret it inside the kernel.
+
+Each of these modifications has been tested in the zfs-test suite. If / when
+new modifications are made, new tests should be added to the suite located in
+zfs-tests/tests/functional/channel_program/lua_core.
+
+
+How to use this Lua interpreter
+-------------------------------
+
+From the above, it should be clear that this is not a general-purpose Lua
+interpreter. Additional work would be required to extricate this custom version
+of Lua from ZFS and make it usable by other areas of the kernel.
diff --git a/sys/contrib/openzfs/module/lua/lapi.c b/sys/contrib/openzfs/module/lua/lapi.c
new file mode 100644
index 000000000000..6a845c461052
--- /dev/null
+++ b/sys/contrib/openzfs/module/lua/lapi.c
@@ -0,0 +1,1345 @@
+/* BEGIN CSTYLED */
+/*
+** $Id: lapi.c,v 2.171.1.1 2013/04/12 18:48:47 roberto Exp $
+** Lua API
+** See Copyright Notice in lua.h
+*/
+
+
+#define lapi_c
+#define LUA_CORE
+
+#include <sys/lua/lua.h>
+
+#include "lapi.h"
+#include "ldebug.h"
+#include "ldo.h"
+#include "lfunc.h"
+#include "lgc.h"
+#include "lmem.h"
+#include "lobject.h"
+#include "lstate.h"
+#include "lstring.h"
+#include "ltable.h"
+#include "ltm.h"
+#include "lvm.h"
+
+
+
+const char lua_ident[] =
+ "$LuaVersion: " LUA_COPYRIGHT " $"
+ "$LuaAuthors: " LUA_AUTHORS " $";
+
+
+/* value at a non-valid index */
+#define NONVALIDVALUE cast(TValue *, luaO_nilobject)
+
+/* corresponding test */
+#define isvalid(o) ((o) != luaO_nilobject)
+
+/* test for pseudo index */
+#define ispseudo(i) ((i) <= LUA_REGISTRYINDEX)
+
+/* test for valid but not pseudo index */
+#define isstackindex(i, o) (isvalid(o) && !ispseudo(i))
+
+#define api_checkvalidindex(L, o) api_check(L, isvalid(o), "invalid index")
+
+#define api_checkstackindex(L, i, o) \
+ api_check(L, isstackindex(i, o), "index not in the stack")
+
+
+static TValue *index2addr (lua_State *L, int idx) {
+ CallInfo *ci = L->ci;
+ if (idx > 0) {
+ TValue *o = ci->func + idx;
+ api_check(L, idx <= ci->top - (ci->func + 1), "unacceptable index");
+ if (o >= L->top) return NONVALIDVALUE;
+ else return o;
+ }
+ else if (!ispseudo(idx)) { /* negative index */
+ api_check(L, idx != 0 && -idx <= L->top - (ci->func + 1), "invalid index");
+ return L->top + idx;
+ }
+ else if (idx == LUA_REGISTRYINDEX)
+ return &G(L)->l_registry;
+ else { /* upvalues */
+ idx = LUA_REGISTRYINDEX - idx;
+ api_check(L, idx <= MAXUPVAL + 1, "upvalue index too large");
+ if (ttislcf(ci->func)) /* light C function? */
+ return NONVALIDVALUE; /* it has no upvalues */
+ else {
+ CClosure *func = clCvalue(ci->func);
+ return (idx <= func->nupvalues) ? &func->upvalue[idx-1] : NONVALIDVALUE;
+ }
+ }
+}
+
+
+/*
+** to be called by 'lua_checkstack' in protected mode, to grow stack
+** capturing memory errors
+*/
+static void growstack (lua_State *L, void *ud) {
+ int size = *(int *)ud;
+ luaD_growstack(L, size);
+}
+
+
+LUA_API int lua_checkstack (lua_State *L, int size) {
+ int res;
+ CallInfo *ci = L->ci;
+ lua_lock(L);
+ if (L->stack_last - L->top > size) /* stack large enough? */
+ res = 1; /* yes; check is OK */
+ else { /* no; need to grow stack */
+ int inuse = cast_int(L->top - L->stack) + EXTRA_STACK;
+ if (inuse > LUAI_MAXSTACK - size) /* can grow without overflow? */
+ res = 0; /* no */
+ else /* try to grow stack */
+ res = (luaD_rawrunprotected(L, &growstack, &size) == LUA_OK);
+ }
+ if (res && ci->top < L->top + size)
+ ci->top = L->top + size; /* adjust frame top */
+ lua_unlock(L);
+ return res;
+}
+
+
+LUA_API void lua_xmove (lua_State *from, lua_State *to, int n) {
+ int i;
+ if (from == to) return;
+ lua_lock(to);
+ api_checknelems(from, n);
+ api_check(from, G(from) == G(to), "moving among independent states");
+ api_check(from, to->ci->top - to->top >= n, "not enough elements to move");
+ from->top -= n;
+ for (i = 0; i < n; i++) {
+ setobj2s(to, to->top++, from->top + i);
+ }
+ lua_unlock(to);
+}
+
+
+LUA_API lua_CFunction lua_atpanic (lua_State *L, lua_CFunction panicf) {
+ lua_CFunction old;
+ lua_lock(L);
+ old = G(L)->panic;
+ G(L)->panic = panicf;
+ lua_unlock(L);
+ return old;
+}
+
+
+LUA_API const lua_Number *lua_version (lua_State *L) {
+ static const lua_Number version = LUA_VERSION_NUM;
+ if (L == NULL) return &version;
+ else return G(L)->version;
+}
+
+
+
+/*
+** basic stack manipulation
+*/
+
+
+/*
+** convert an acceptable stack index into an absolute index
+*/
+LUA_API int lua_absindex (lua_State *L, int idx) {
+ return (idx > 0 || ispseudo(idx))
+ ? idx
+ : cast_int(L->top - L->ci->func + idx);
+}
+
+
+LUA_API int lua_gettop (lua_State *L) {
+ return cast_int(L->top - (L->ci->func + 1));
+}
+
+
+LUA_API void lua_settop (lua_State *L, int idx) {
+ StkId func = L->ci->func;
+ lua_lock(L);
+ if (idx >= 0) {
+ api_check(L, idx <= L->stack_last - (func + 1), "new top too large");
+ while (L->top < (func + 1) + idx)
+ setnilvalue(L->top++);
+ L->top = (func + 1) + idx;
+ }
+ else {
+ api_check(L, -(idx+1) <= (L->top - (func + 1)), "invalid new top");
+ L->top += idx+1; /* `subtract' index (index is negative) */
+ }
+ lua_unlock(L);
+}
+
+
+LUA_API void lua_remove (lua_State *L, int idx) {
+ StkId p;
+ lua_lock(L);
+ p = index2addr(L, idx);
+ api_checkstackindex(L, idx, p);
+ while (++p < L->top) setobjs2s(L, p-1, p);
+ L->top--;
+ lua_unlock(L);
+}
+
+
+LUA_API void lua_insert (lua_State *L, int idx) {
+ StkId p;
+ StkId q;
+ lua_lock(L);
+ p = index2addr(L, idx);
+ api_checkstackindex(L, idx, p);
+ for (q = L->top; q > p; q--) /* use L->top as a temporary */
+ setobjs2s(L, q, q - 1);
+ setobjs2s(L, p, L->top);
+ lua_unlock(L);
+}
+
+
+static void moveto (lua_State *L, TValue *fr, int idx) {
+ TValue *to = index2addr(L, idx);
+ api_checkvalidindex(L, to);
+ setobj(L, to, fr);
+ if (idx < LUA_REGISTRYINDEX) /* function upvalue? */
+ luaC_barrier(L, clCvalue(L->ci->func), fr);
+ /* LUA_REGISTRYINDEX does not need gc barrier
+ (collector revisits it before finishing collection) */
+}
+
+
+LUA_API void lua_replace (lua_State *L, int idx) {
+ lua_lock(L);
+ api_checknelems(L, 1);
+ moveto(L, L->top - 1, idx);
+ L->top--;
+ lua_unlock(L);
+}
+
+
+LUA_API void lua_copy (lua_State *L, int fromidx, int toidx) {
+ TValue *fr;
+ lua_lock(L);
+ fr = index2addr(L, fromidx);
+ moveto(L, fr, toidx);
+ lua_unlock(L);
+}
+
+
+LUA_API void lua_pushvalue (lua_State *L, int idx) {
+ lua_lock(L);
+ setobj2s(L, L->top, index2addr(L, idx));
+ api_incr_top(L);
+ lua_unlock(L);
+}
+
+
+
+/*
+** access functions (stack -> C)
+*/
+
+
+LUA_API int lua_type (lua_State *L, int idx) {
+ StkId o = index2addr(L, idx);
+ return (isvalid(o) ? ttypenv(o) : LUA_TNONE);
+}
+
+
+LUA_API const char *lua_typename (lua_State *L, int t) {
+ UNUSED(L);
+ return ttypename(t);
+}
+
+
+LUA_API int lua_iscfunction (lua_State *L, int idx) {
+ StkId o = index2addr(L, idx);
+ return (ttislcf(o) || (ttisCclosure(o)));
+}
+
+
+LUA_API int lua_isnumber (lua_State *L, int idx) {
+ TValue n;
+ const TValue *o = index2addr(L, idx);
+ return tonumber(o, &n);
+}
+
+
+LUA_API int lua_isstring (lua_State *L, int idx) {
+ int t = lua_type(L, idx);
+ return (t == LUA_TSTRING || t == LUA_TNUMBER);
+}
+
+
+LUA_API int lua_isuserdata (lua_State *L, int idx) {
+ const TValue *o = index2addr(L, idx);
+ return (ttisuserdata(o) || ttislightuserdata(o));
+}
+
+
+LUA_API int lua_rawequal (lua_State *L, int index1, int index2) {
+ StkId o1 = index2addr(L, index1);
+ StkId o2 = index2addr(L, index2);
+ return (isvalid(o1) && isvalid(o2)) ? luaV_rawequalobj(o1, o2) : 0;
+}
+
+
+LUA_API void lua_arith (lua_State *L, int op) {
+ StkId o1; /* 1st operand */
+ StkId o2; /* 2nd operand */
+ lua_lock(L);
+ if (op != LUA_OPUNM) /* all other operations expect two operands */
+ api_checknelems(L, 2);
+ else { /* for unary minus, add fake 2nd operand */
+ api_checknelems(L, 1);
+ setobjs2s(L, L->top, L->top - 1);
+ L->top++;
+ }
+ o1 = L->top - 2;
+ o2 = L->top - 1;
+ if (ttisnumber(o1) && ttisnumber(o2)) {
+ setnvalue(o1, luaO_arith(op, nvalue(o1), nvalue(o2)));
+ }
+ else
+ luaV_arith(L, o1, o1, o2, cast(TMS, op - LUA_OPADD + TM_ADD));
+ L->top--;
+ lua_unlock(L);
+}
+
+
+LUA_API int lua_compare (lua_State *L, int index1, int index2, int op) {
+ StkId o1, o2;
+ int i = 0;
+ lua_lock(L); /* may call tag method */
+ o1 = index2addr(L, index1);
+ o2 = index2addr(L, index2);
+ if (isvalid(o1) && isvalid(o2)) {
+ switch (op) {
+ case LUA_OPEQ: i = equalobj(L, o1, o2); break;
+ case LUA_OPLT: i = luaV_lessthan(L, o1, o2); break;
+ case LUA_OPLE: i = luaV_lessequal(L, o1, o2); break;
+ default: api_check(L, 0, "invalid option");
+ }
+ }
+ lua_unlock(L);
+ return i;
+}
+
+
+LUA_API lua_Number lua_tonumberx (lua_State *L, int idx, int *isnum) {
+ TValue n;
+ const TValue *o = index2addr(L, idx);
+ if (tonumber(o, &n)) {
+ if (isnum) *isnum = 1;
+ return nvalue(o);
+ }
+ else {
+ if (isnum) *isnum = 0;
+ return 0;
+ }
+}
+
+
+LUA_API lua_Integer lua_tointegerx (lua_State *L, int idx, int *isnum) {
+ TValue n;
+ const TValue *o = index2addr(L, idx);
+ if (tonumber(o, &n)) {
+ lua_Integer res;
+ lua_Number num = nvalue(o);
+ lua_number2integer(res, num);
+ if (isnum) *isnum = 1;
+ return res;
+ }
+ else {
+ if (isnum) *isnum = 0;
+ return 0;
+ }
+}
+
+
+LUA_API lua_Unsigned lua_tounsignedx (lua_State *L, int idx, int *isnum) {
+ TValue n;
+ const TValue *o = index2addr(L, idx);
+ if (tonumber(o, &n)) {
+ lua_Unsigned res;
+ lua_Number num = nvalue(o);
+ lua_number2unsigned(res, num);
+ if (isnum) *isnum = 1;
+ return res;
+ }
+ else {
+ if (isnum) *isnum = 0;
+ return 0;
+ }
+}
+
+
+LUA_API int lua_toboolean (lua_State *L, int idx) {
+ const TValue *o = index2addr(L, idx);
+ return !l_isfalse(o);
+}
+
+
+LUA_API const char *lua_tolstring (lua_State *L, int idx, size_t *len) {
+ StkId o = index2addr(L, idx);
+ if (!ttisstring(o)) {
+ lua_lock(L); /* `luaV_tostring' may create a new string */
+ if (!luaV_tostring(L, o)) { /* conversion failed? */
+ if (len != NULL) *len = 0;
+ lua_unlock(L);
+ return NULL;
+ }
+ luaC_checkGC(L);
+ o = index2addr(L, idx); /* previous call may reallocate the stack */
+ lua_unlock(L);
+ }
+ if (len != NULL) *len = tsvalue(o)->len;
+ return svalue(o);
+}
+
+
+LUA_API size_t lua_rawlen (lua_State *L, int idx) {
+ StkId o = index2addr(L, idx);
+ switch (ttypenv(o)) {
+ case LUA_TSTRING: return tsvalue(o)->len;
+ case LUA_TUSERDATA: return uvalue(o)->len;
+ case LUA_TTABLE: return luaH_getn(hvalue(o));
+ default: return 0;
+ }
+}
+
+
+LUA_API lua_CFunction lua_tocfunction (lua_State *L, int idx) {
+ StkId o = index2addr(L, idx);
+ if (ttislcf(o)) return fvalue(o);
+ else if (ttisCclosure(o))
+ return clCvalue(o)->f;
+ else return NULL; /* not a C function */
+}
+
+
+LUA_API void *lua_touserdata (lua_State *L, int idx) {
+ StkId o = index2addr(L, idx);
+ switch (ttypenv(o)) {
+ case LUA_TUSERDATA: return ((void *)(rawuvalue(o) + 1));
+ case LUA_TLIGHTUSERDATA: return pvalue(o);
+ default: return NULL;
+ }
+}
+
+
+LUA_API lua_State *lua_tothread (lua_State *L, int idx) {
+ StkId o = index2addr(L, idx);
+ return (!ttisthread(o)) ? NULL : thvalue(o);
+}
+
+
+LUA_API const void *lua_topointer (lua_State *L, int idx) {
+ StkId o = index2addr(L, idx);
+ switch (ttype(o)) {
+ case LUA_TTABLE: return hvalue(o);
+ case LUA_TLCL: return clLvalue(o);
+ case LUA_TCCL: return clCvalue(o);
+ case LUA_TLCF: return cast(void *, cast(size_t, fvalue(o)));
+ case LUA_TTHREAD: return thvalue(o);
+ case LUA_TUSERDATA:
+ case LUA_TLIGHTUSERDATA:
+ return lua_touserdata(L, idx);
+ default: return NULL;
+ }
+}
+
+
+
+/*
+** push functions (C -> stack)
+*/
+
+
+LUA_API void lua_pushnil (lua_State *L) {
+ lua_lock(L);
+ setnilvalue(L->top);
+ api_incr_top(L);
+ lua_unlock(L);
+}
+
+
+LUA_API void lua_pushnumber (lua_State *L, lua_Number n) {
+ lua_lock(L);
+ setnvalue(L->top, n);
+ luai_checknum(L, L->top,
+ luaG_runerror(L, "C API - attempt to push a signaling NaN"));
+ api_incr_top(L);
+ lua_unlock(L);
+}
+
+
+LUA_API void lua_pushinteger (lua_State *L, lua_Integer n) {
+ lua_lock(L);
+ setnvalue(L->top, cast_num(n));
+ api_incr_top(L);
+ lua_unlock(L);
+}
+
+
+LUA_API void lua_pushunsigned (lua_State *L, lua_Unsigned u) {
+ lua_Number n;
+ lua_lock(L);
+ n = lua_unsigned2number(u);
+ setnvalue(L->top, n);
+ api_incr_top(L);
+ lua_unlock(L);
+}
+
+
+LUA_API const char *lua_pushlstring (lua_State *L, const char *s, size_t len) {
+ TString *ts;
+ lua_lock(L);
+ luaC_checkGC(L);
+ ts = luaS_newlstr(L, s, len);
+ setsvalue2s(L, L->top, ts);
+ api_incr_top(L);
+ lua_unlock(L);
+ return getstr(ts);
+}
+
+
+LUA_API const char *lua_pushstring (lua_State *L, const char *s) {
+ if (s == NULL) {
+ lua_pushnil(L);
+ return NULL;
+ }
+ else {
+ TString *ts;
+ lua_lock(L);
+ luaC_checkGC(L);
+ ts = luaS_new(L, s);
+ setsvalue2s(L, L->top, ts);
+ api_incr_top(L);
+ lua_unlock(L);
+ return getstr(ts);
+ }
+}
+
+
+LUA_API const char *lua_pushvfstring (lua_State *L, const char *fmt,
+ va_list argp) {
+ const char *ret;
+ lua_lock(L);
+ luaC_checkGC(L);
+ ret = luaO_pushvfstring(L, fmt, argp);
+ lua_unlock(L);
+ return ret;
+}
+
+
+LUA_API const char *lua_pushfstring (lua_State *L, const char *fmt, ...) {
+ const char *ret;
+ va_list argp;
+ lua_lock(L);
+ luaC_checkGC(L);
+ va_start(argp, fmt);
+ ret = luaO_pushvfstring(L, fmt, argp);
+ va_end(argp);
+ lua_unlock(L);
+ return ret;
+}
+
+
+LUA_API void lua_pushcclosure (lua_State *L, lua_CFunction fn, int n) {
+ lua_lock(L);
+ if (n == 0) {
+ setfvalue(L->top, fn);
+ }
+ else {
+ Closure *cl;
+ api_checknelems(L, n);
+ api_check(L, n <= MAXUPVAL, "upvalue index too large");
+ luaC_checkGC(L);
+ cl = luaF_newCclosure(L, n);
+ cl->c.f = fn;
+ L->top -= n;
+ while (n--)
+ setobj2n(L, &cl->c.upvalue[n], L->top + n);
+ setclCvalue(L, L->top, cl);
+ }
+ api_incr_top(L);
+ lua_unlock(L);
+}
+
+
+LUA_API void lua_pushboolean (lua_State *L, int b) {
+ lua_lock(L);
+ setbvalue(L->top, (b != 0)); /* ensure that true is 1 */
+ api_incr_top(L);
+ lua_unlock(L);
+}
+
+
+LUA_API void lua_pushlightuserdata (lua_State *L, void *p) {
+ lua_lock(L);
+ setpvalue(L->top, p);
+ api_incr_top(L);
+ lua_unlock(L);
+}
+
+
+LUA_API int lua_pushthread (lua_State *L) {
+ lua_lock(L);
+ setthvalue(L, L->top, L);
+ api_incr_top(L);
+ lua_unlock(L);
+ return (G(L)->mainthread == L);
+}
+
+
+
+/*
+** get functions (Lua -> stack)
+*/
+
+
+LUA_API void lua_getglobal (lua_State *L, const char *var) {
+ Table *reg = hvalue(&G(L)->l_registry);
+ const TValue *gt; /* global table */
+ lua_lock(L);
+ gt = luaH_getint(reg, LUA_RIDX_GLOBALS);
+ setsvalue2s(L, L->top++, luaS_new(L, var));
+ luaV_gettable(L, gt, L->top - 1, L->top - 1);
+ lua_unlock(L);
+}
+
+
+LUA_API void lua_gettable (lua_State *L, int idx) {
+ StkId t;
+ lua_lock(L);
+ t = index2addr(L, idx);
+ luaV_gettable(L, t, L->top - 1, L->top - 1);
+ lua_unlock(L);
+}
+
+
+LUA_API void lua_getfield (lua_State *L, int idx, const char *k) {
+ StkId t;
+ lua_lock(L);
+ t = index2addr(L, idx);
+ setsvalue2s(L, L->top, luaS_new(L, k));
+ api_incr_top(L);
+ luaV_gettable(L, t, L->top - 1, L->top - 1);
+ lua_unlock(L);
+}
+
+
+LUA_API void lua_rawget (lua_State *L, int idx) {
+ StkId t;
+ lua_lock(L);
+ t = index2addr(L, idx);
+ api_check(L, ttistable(t), "table expected");
+ setobj2s(L, L->top - 1, luaH_get(hvalue(t), L->top - 1));
+ lua_unlock(L);
+}
+
+
+LUA_API void lua_rawgeti (lua_State *L, int idx, int n) {
+ StkId t;
+ lua_lock(L);
+ t = index2addr(L, idx);
+ api_check(L, ttistable(t), "table expected");
+ setobj2s(L, L->top, luaH_getint(hvalue(t), n));
+ api_incr_top(L);
+ lua_unlock(L);
+}
+
+
+LUA_API void lua_rawgetp (lua_State *L, int idx, const void *p) {
+ StkId t;
+ TValue k;
+ lua_lock(L);
+ t = index2addr(L, idx);
+ api_check(L, ttistable(t), "table expected");
+ setpvalue(&k, cast(void *, p));
+ setobj2s(L, L->top, luaH_get(hvalue(t), &k));
+ api_incr_top(L);
+ lua_unlock(L);
+}
+
+
+LUA_API void lua_createtable (lua_State *L, int narray, int nrec) {
+ Table *t;
+ lua_lock(L);
+ luaC_checkGC(L);
+ t = luaH_new(L);
+ sethvalue(L, L->top, t);
+ api_incr_top(L);
+ if (narray > 0 || nrec > 0)
+ luaH_resize(L, t, narray, nrec);
+ lua_unlock(L);
+}
+
+
+LUA_API int lua_getmetatable (lua_State *L, int objindex) {
+ const TValue *obj;
+ Table *mt = NULL;
+ int res;
+ lua_lock(L);
+ obj = index2addr(L, objindex);
+ switch (ttypenv(obj)) {
+ case LUA_TTABLE:
+ mt = hvalue(obj)->metatable;
+ break;
+ case LUA_TUSERDATA:
+ mt = uvalue(obj)->metatable;
+ break;
+ default:
+ mt = G(L)->mt[ttypenv(obj)];
+ break;
+ }
+ if (mt == NULL)
+ res = 0;
+ else {
+ sethvalue(L, L->top, mt);
+ api_incr_top(L);
+ res = 1;
+ }
+ lua_unlock(L);
+ return res;
+}
+
+
+LUA_API void lua_getuservalue (lua_State *L, int idx) {
+ StkId o;
+ lua_lock(L);
+ o = index2addr(L, idx);
+ api_check(L, ttisuserdata(o), "userdata expected");
+ if (uvalue(o)->env) {
+ sethvalue(L, L->top, uvalue(o)->env);
+ } else
+ setnilvalue(L->top);
+ api_incr_top(L);
+ lua_unlock(L);
+}
+
+
+/*
+** set functions (stack -> Lua)
+*/
+
+
+LUA_API void lua_setglobal (lua_State *L, const char *var) {
+ Table *reg = hvalue(&G(L)->l_registry);
+ const TValue *gt; /* global table */
+ lua_lock(L);
+ api_checknelems(L, 1);
+ gt = luaH_getint(reg, LUA_RIDX_GLOBALS);
+ setsvalue2s(L, L->top++, luaS_new(L, var));
+ luaV_settable(L, gt, L->top - 1, L->top - 2);
+ L->top -= 2; /* pop value and key */
+ lua_unlock(L);
+}
+
+
+LUA_API void lua_settable (lua_State *L, int idx) {
+ StkId t;
+ lua_lock(L);
+ api_checknelems(L, 2);
+ t = index2addr(L, idx);
+ luaV_settable(L, t, L->top - 2, L->top - 1);
+ L->top -= 2; /* pop index and value */
+ lua_unlock(L);
+}
+
+
+LUA_API void lua_setfield (lua_State *L, int idx, const char *k) {
+ StkId t;
+ lua_lock(L);
+ api_checknelems(L, 1);
+ t = index2addr(L, idx);
+ setsvalue2s(L, L->top++, luaS_new(L, k));
+ luaV_settable(L, t, L->top - 1, L->top - 2);
+ L->top -= 2; /* pop value and key */
+ lua_unlock(L);
+}
+
+
+LUA_API void lua_rawset (lua_State *L, int idx) {
+ StkId t;
+ lua_lock(L);
+ api_checknelems(L, 2);
+ t = index2addr(L, idx);
+ api_check(L, ttistable(t), "table expected");
+ setobj2t(L, luaH_set(L, hvalue(t), L->top-2), L->top-1);
+ invalidateTMcache(hvalue(t));
+ luaC_barrierback(L, gcvalue(t), L->top-1);
+ L->top -= 2;
+ lua_unlock(L);
+}
+
+
+LUA_API void lua_rawseti (lua_State *L, int idx, int n) {
+ StkId t;
+ lua_lock(L);
+ api_checknelems(L, 1);
+ t = index2addr(L, idx);
+ api_check(L, ttistable(t), "table expected");
+ luaH_setint(L, hvalue(t), n, L->top - 1);
+ luaC_barrierback(L, gcvalue(t), L->top-1);
+ L->top--;
+ lua_unlock(L);
+}
+
+
+LUA_API void lua_rawsetp (lua_State *L, int idx, const void *p) {
+ StkId t;
+ TValue k;
+ lua_lock(L);
+ api_checknelems(L, 1);
+ t = index2addr(L, idx);
+ api_check(L, ttistable(t), "table expected");
+ setpvalue(&k, cast(void *, p));
+ setobj2t(L, luaH_set(L, hvalue(t), &k), L->top - 1);
+ luaC_barrierback(L, gcvalue(t), L->top - 1);
+ L->top--;
+ lua_unlock(L);
+}
+
+
+LUA_API int lua_setmetatable (lua_State *L, int objindex) {
+ TValue *obj;
+ Table *mt;
+ lua_lock(L);
+ api_checknelems(L, 1);
+ obj = index2addr(L, objindex);
+ if (ttisnil(L->top - 1))
+ mt = NULL;
+ else {
+ api_check(L, ttistable(L->top - 1), "table expected");
+ mt = hvalue(L->top - 1);
+ }
+ switch (ttypenv(obj)) {
+ case LUA_TTABLE: {
+ hvalue(obj)->metatable = mt;
+ if (mt) {
+ luaC_objbarrierback(L, gcvalue(obj), mt);
+ luaC_checkfinalizer(L, gcvalue(obj), mt);
+ }
+ break;
+ }
+ case LUA_TUSERDATA: {
+ uvalue(obj)->metatable = mt;
+ if (mt) {
+ luaC_objbarrier(L, rawuvalue(obj), mt);
+ luaC_checkfinalizer(L, gcvalue(obj), mt);
+ }
+ break;
+ }
+ default: {
+ G(L)->mt[ttypenv(obj)] = mt;
+ break;
+ }
+ }
+ L->top--;
+ lua_unlock(L);
+ return 1;
+}
+
+
+LUA_API void lua_setuservalue (lua_State *L, int idx) {
+ StkId o;
+ lua_lock(L);
+ api_checknelems(L, 1);
+ o = index2addr(L, idx);
+ api_check(L, ttisuserdata(o), "userdata expected");
+ if (ttisnil(L->top - 1))
+ uvalue(o)->env = NULL;
+ else {
+ api_check(L, ttistable(L->top - 1), "table expected");
+ uvalue(o)->env = hvalue(L->top - 1);
+ luaC_objbarrier(L, gcvalue(o), hvalue(L->top - 1));
+ }
+ L->top--;
+ lua_unlock(L);
+}
+
+
+/*
+** `load' and `call' functions (run Lua code)
+*/
+
+
+#define checkresults(L,na,nr) \
+ api_check(L, (nr) == LUA_MULTRET || (L->ci->top - L->top >= (nr) - (na)), \
+ "results from function overflow current stack size")
+
+
+LUA_API int lua_getctx (lua_State *L, int *ctx) {
+ if (L->ci->callstatus & CIST_YIELDED) {
+ if (ctx) *ctx = L->ci->u.c.ctx;
+ return L->ci->u.c.status;
+ }
+ else return LUA_OK;
+}
+
+
+LUA_API void lua_callk (lua_State *L, int nargs, int nresults, int ctx,
+ lua_CFunction k) {
+ StkId func;
+ lua_lock(L);
+ api_check(L, k == NULL || !isLua(L->ci),
+ "cannot use continuations inside hooks");
+ api_checknelems(L, nargs+1);
+ api_check(L, L->status == LUA_OK, "cannot do calls on non-normal thread");
+ checkresults(L, nargs, nresults);
+ func = L->top - (nargs+1);
+ if (k != NULL && L->nny == 0) { /* need to prepare continuation? */
+ L->ci->u.c.k = k; /* save continuation */
+ L->ci->u.c.ctx = ctx; /* save context */
+ luaD_call(L, func, nresults, 1); /* do the call */
+ }
+ else /* no continuation or no yieldable */
+ luaD_call(L, func, nresults, 0); /* just do the call */
+ adjustresults(L, nresults);
+ lua_unlock(L);
+}
+
+
+
+/*
+** Execute a protected call.
+*/
+struct CallS { /* data to `f_call' */
+ StkId func;
+ int nresults;
+};
+
+
+static void f_call (lua_State *L, void *ud) {
+ struct CallS *c = cast(struct CallS *, ud);
+ luaD_call(L, c->func, c->nresults, 0);
+}
+
+
+
+LUA_API int lua_pcallk (lua_State *L, int nargs, int nresults, int errfunc,
+ int ctx, lua_CFunction k) {
+ struct CallS c;
+ int status;
+ ptrdiff_t func;
+ lua_lock(L);
+ api_check(L, k == NULL || !isLua(L->ci),
+ "cannot use continuations inside hooks");
+ api_checknelems(L, nargs+1);
+ api_check(L, L->status == LUA_OK, "cannot do calls on non-normal thread");
+ checkresults(L, nargs, nresults);
+ if (errfunc == 0)
+ func = 0;
+ else {
+ StkId o = index2addr(L, errfunc);
+ api_checkstackindex(L, errfunc, o);
+ func = savestack(L, o);
+ }
+ c.func = L->top - (nargs+1); /* function to be called */
+ if (k == NULL || L->nny > 0) { /* no continuation or no yieldable? */
+ c.nresults = nresults; /* do a 'conventional' protected call */
+ status = luaD_pcall(L, f_call, &c, savestack(L, c.func), func);
+ }
+ else { /* prepare continuation (call is already protected by 'resume') */
+ CallInfo *ci = L->ci;
+ ci->u.c.k = k; /* save continuation */
+ ci->u.c.ctx = ctx; /* save context */
+ /* save information for error recovery */
+ ci->extra = savestack(L, c.func);
+ ci->u.c.old_allowhook = L->allowhook;
+ ci->u.c.old_errfunc = L->errfunc;
+ L->errfunc = func;
+ /* mark that function may do error recovery */
+ ci->callstatus |= CIST_YPCALL;
+ luaD_call(L, c.func, nresults, 1); /* do the call */
+ ci->callstatus &= ~CIST_YPCALL;
+ L->errfunc = ci->u.c.old_errfunc;
+ status = LUA_OK; /* if it is here, there were no errors */
+ }
+ adjustresults(L, nresults);
+ lua_unlock(L);
+ return status;
+}
+
+
+LUA_API int lua_load (lua_State *L, lua_Reader reader, void *data,
+ const char *chunkname, const char *mode) {
+ ZIO z;
+ int status;
+ lua_lock(L);
+ if (!chunkname) chunkname = "?";
+ luaZ_init(L, &z, reader, data);
+ status = luaD_protectedparser(L, &z, chunkname, mode);
+ if (status == LUA_OK) { /* no errors? */
+ LClosure *f = clLvalue(L->top - 1); /* get newly created function */
+ if (f->nupvalues == 1) { /* does it have one upvalue? */
+ /* get global table from registry */
+ Table *reg = hvalue(&G(L)->l_registry);
+ const TValue *gt = luaH_getint(reg, LUA_RIDX_GLOBALS);
+ /* set global table as 1st upvalue of 'f' (may be LUA_ENV) */
+ setobj(L, f->upvals[0]->v, gt);
+ luaC_barrier(L, f->upvals[0], gt);
+ }
+ }
+ lua_unlock(L);
+ return status;
+}
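Taken together, lua_load() and lua_pcallk() (via the lua_pcall macro from lua.h) are the path a caller uses to compile and run a source chunk, which matches the source-only policy described in README.zfs. A hedged sketch with a hypothetical one-shot reader follows; it is not how the channel-program code is literally structured.

/*
 * Sketch only: load a Lua source chunk from a buffer and run it in a
 * protected call. The reader, its state struct, and the chunk name are
 * illustrative assumptions.
 */
typedef struct example_reader_state {
	const char	*ers_buf;
	size_t		ers_len;
	int		ers_done;
} example_reader_state_t;

static const char *
example_reader(lua_State *L, void *ud, size_t *size)
{
	example_reader_state_t *st = ud;

	(void) L;
	if (st->ers_done) {
		*size = 0;
		return (NULL);		/* no more input */
	}
	st->ers_done = 1;
	*size = st->ers_len;
	return (st->ers_buf);
}

static int
example_run_chunk(lua_State *L, const char *src, size_t len)
{
	example_reader_state_t st = { src, len, 0 };
	int err;

	/* "t" restricts loading to text (source) chunks, never bytecode. */
	err = lua_load(L, example_reader, &st, "example chunk", "t");
	if (err != LUA_OK)
		return (err);		/* syntax or memory error */

	/* Protected call: no arguments, no results, no message handler. */
	return (lua_pcall(L, 0, 0, 0));
}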
+
+#if defined(LUA_USE_DUMP)
+LUA_API int lua_dump (lua_State *L, lua_Writer writer, void *data) {
+ int status;
+ TValue *o;
+ lua_lock(L);
+ api_checknelems(L, 1);
+ o = L->top - 1;
+ if (isLfunction(o))
+ status = luaU_dump(L, getproto(o), writer, data, 0);
+ else
+ status = 1;
+ lua_unlock(L);
+ return status;
+}
+#endif
+
+LUA_API int lua_status (lua_State *L) {
+ return L->status;
+}
+
+
+/*
+** Garbage-collection function
+*/
+
+LUA_API int lua_gc (lua_State *L, int what, int data) {
+ int res = 0;
+ global_State *g;
+ lua_lock(L);
+ g = G(L);
+ switch (what) {
+ case LUA_GCSTOP: {
+ g->gcrunning = 0;
+ break;
+ }
+ case LUA_GCRESTART: {
+ luaE_setdebt(g, 0);
+ g->gcrunning = 1;
+ break;
+ }
+ case LUA_GCCOLLECT: {
+ luaC_fullgc(L, 0);
+ break;
+ }
+ case LUA_GCCOUNT: {
+ /* GC values are expressed in Kbytes: #bytes/2^10 */
+ res = cast_int(gettotalbytes(g) >> 10);
+ break;
+ }
+ case LUA_GCCOUNTB: {
+ res = cast_int(gettotalbytes(g) & 0x3ff);
+ break;
+ }
+ case LUA_GCSTEP: {
+ if (g->gckind == KGC_GEN) { /* generational mode? */
+ res = (g->GCestimate == 0); /* true if it will do major collection */
+ luaC_forcestep(L); /* do a single step */
+ }
+ else {
+ lu_mem debt = cast(lu_mem, data) * 1024 - GCSTEPSIZE;
+ if (g->gcrunning)
+ debt += g->GCdebt; /* include current debt */
+ luaE_setdebt(g, debt);
+ luaC_forcestep(L);
+ if (g->gcstate == GCSpause) /* end of cycle? */
+ res = 1; /* signal it */
+ }
+ break;
+ }
+ case LUA_GCSETPAUSE: {
+ res = g->gcpause;
+ g->gcpause = data;
+ break;
+ }
+ case LUA_GCSETMAJORINC: {
+ res = g->gcmajorinc;
+ g->gcmajorinc = data;
+ break;
+ }
+ case LUA_GCSETSTEPMUL: {
+ res = g->gcstepmul;
+ g->gcstepmul = data;
+ break;
+ }
+ case LUA_GCISRUNNING: {
+ res = g->gcrunning;
+ break;
+ }
+ case LUA_GCGEN: { /* change collector to generational mode */
+ luaC_changemode(L, KGC_GEN);
+ break;
+ }
+ case LUA_GCINC: { /* change collector to incremental mode */
+ luaC_changemode(L, KGC_NORMAL);
+ break;
+ }
+ default: res = -1; /* invalid option */
+ }
+ lua_unlock(L);
+ return res;
+}
+
+
+
+/*
+** miscellaneous functions
+*/
+
+
+LUA_API int lua_error (lua_State *L) {
+ lua_lock(L);
+ api_checknelems(L, 1);
+ luaG_errormsg(L);
+ /* code unreachable; will unlock when control actually leaves the kernel */
+ return 0; /* to avoid warnings */
+}
+
+
+LUA_API int lua_next (lua_State *L, int idx) {
+ StkId t;
+ int more;
+ lua_lock(L);
+ t = index2addr(L, idx);
+ api_check(L, ttistable(t), "table expected");
+ more = luaH_next(L, hvalue(t), L->top - 1);
+ if (more) {
+ api_incr_top(L);
+ }
+ else /* no more elements */
+ L->top -= 1; /* remove key */
+ lua_unlock(L);
+ return more;
+}
+
+
+LUA_API void lua_concat (lua_State *L, int n) {
+ lua_lock(L);
+ api_checknelems(L, n);
+ if (n >= 2) {
+ luaC_checkGC(L);
+ luaV_concat(L, n);
+ }
+ else if (n == 0) { /* push empty string */
+ setsvalue2s(L, L->top, luaS_newlstr(L, "", 0));
+ api_incr_top(L);
+ }
+ /* else n == 1; nothing to do */
+ lua_unlock(L);
+}
+
+
+LUA_API void lua_len (lua_State *L, int idx) {
+ StkId t;
+ lua_lock(L);
+ t = index2addr(L, idx);
+ luaV_objlen(L, L->top, t);
+ api_incr_top(L);
+ lua_unlock(L);
+}
+
+
+LUA_API lua_Alloc lua_getallocf (lua_State *L, void **ud) {
+ lua_Alloc f;
+ lua_lock(L);
+ if (ud) *ud = G(L)->ud;
+ f = G(L)->frealloc;
+ lua_unlock(L);
+ return f;
+}
+
+
+LUA_API void lua_setallocf (lua_State *L, lua_Alloc f, void *ud) {
+ lua_lock(L);
+ G(L)->ud = ud;
+ G(L)->frealloc = f;
+ lua_unlock(L);
+}
+
+
+LUA_API void *lua_newuserdata (lua_State *L, size_t size) {
+ Udata *u;
+ lua_lock(L);
+ luaC_checkGC(L);
+ u = luaS_newudata(L, size, NULL);
+ setuvalue(L, L->top, u);
+ api_incr_top(L);
+ lua_unlock(L);
+ return u + 1;
+}
+
+
+
+static const char *aux_upvalue (StkId fi, int n, TValue **val,
+ GCObject **owner) {
+ switch (ttype(fi)) {
+ case LUA_TCCL: { /* C closure */
+ CClosure *f = clCvalue(fi);
+ if (!(1 <= n && n <= f->nupvalues)) return NULL;
+ *val = &f->upvalue[n-1];
+ if (owner) *owner = obj2gco(f);
+ return "";
+ }
+ case LUA_TLCL: { /* Lua closure */
+ LClosure *f = clLvalue(fi);
+ TString *name;
+ Proto *p = f->p;
+ if (!(1 <= n && n <= p->sizeupvalues)) return NULL;
+ *val = f->upvals[n-1]->v;
+ if (owner) *owner = obj2gco(f->upvals[n - 1]);
+ name = p->upvalues[n-1].name;
+ return (name == NULL) ? "" : getstr(name);
+ }
+ default: return NULL; /* not a closure */
+ }
+}
+
+
+LUA_API const char *lua_getupvalue (lua_State *L, int funcindex, int n) {
+ const char *name;
+ TValue *val = NULL; /* to avoid warnings */
+ lua_lock(L);
+ name = aux_upvalue(index2addr(L, funcindex), n, &val, NULL);
+ if (name) {
+ setobj2s(L, L->top, val);
+ api_incr_top(L);
+ }
+ lua_unlock(L);
+ return name;
+}
+
+
+LUA_API const char *lua_setupvalue (lua_State *L, int funcindex, int n) {
+ const char *name;
+ TValue *val = NULL; /* to avoid warnings */
+ GCObject *owner = NULL; /* to avoid warnings */
+ StkId fi;
+ lua_lock(L);
+ fi = index2addr(L, funcindex);
+ api_checknelems(L, 1);
+ name = aux_upvalue(fi, n, &val, &owner);
+ if (name) {
+ L->top--;
+ setobj(L, val, L->top);
+ luaC_barrier(L, owner, L->top);
+ }
+ lua_unlock(L);
+ return name;
+}
+
+
+static UpVal **getupvalref (lua_State *L, int fidx, int n, LClosure **pf) {
+ LClosure *f;
+ StkId fi = index2addr(L, fidx);
+ api_check(L, ttisLclosure(fi), "Lua function expected");
+ f = clLvalue(fi);
+ api_check(L, (1 <= n && n <= f->p->sizeupvalues), "invalid upvalue index");
+ if (pf) *pf = f;
+ return &f->upvals[n - 1]; /* get its upvalue pointer */
+}
+
+
+LUA_API void *lua_upvalueid (lua_State *L, int fidx, int n) {
+ StkId fi = index2addr(L, fidx);
+ switch (ttype(fi)) {
+ case LUA_TLCL: { /* lua closure */
+ return *getupvalref(L, fidx, n, NULL);
+ }
+ case LUA_TCCL: { /* C closure */
+ CClosure *f = clCvalue(fi);
+ api_check(L, 1 <= n && n <= f->nupvalues, "invalid upvalue index");
+ return &f->upvalue[n - 1];
+ }
+ default: {
+ api_check(L, 0, "closure expected");
+ return NULL;
+ }
+ }
+}
+
+
+LUA_API void lua_upvaluejoin (lua_State *L, int fidx1, int n1,
+ int fidx2, int n2) {
+ LClosure *f1;
+ UpVal **up1 = getupvalref(L, fidx1, n1, &f1);
+ UpVal **up2 = getupvalref(L, fidx2, n2, NULL);
+ *up1 = *up2;
+ luaC_objbarrier(L, f1, *up2);
+}
+
+#if defined(_KERNEL)
+
+static int __init
+lua_init(void)
+{
+ return (0);
+}
+
+static void __exit
+lua_fini(void)
+{
+}
+
+module_init(lua_init);
+module_exit(lua_fini);
+
+#endif
+/* END CSTYLED */
+
+ZFS_MODULE_DESCRIPTION("Lua Interpreter for ZFS");
+ZFS_MODULE_AUTHOR("Lua.org");
+ZFS_MODULE_LICENSE("Dual MIT/GPL");
+ZFS_MODULE_VERSION(ZFS_META_VERSION "-" ZFS_META_RELEASE);
+
+EXPORT_SYMBOL(lua_absindex);
+EXPORT_SYMBOL(lua_atpanic);
+EXPORT_SYMBOL(lua_checkstack);
+EXPORT_SYMBOL(lua_close);
+EXPORT_SYMBOL(lua_createtable);
+EXPORT_SYMBOL(lua_error);
+EXPORT_SYMBOL(lua_getfield);
+EXPORT_SYMBOL(lua_gettable);
+EXPORT_SYMBOL(lua_gettop);
+EXPORT_SYMBOL(lua_isnumber);
+EXPORT_SYMBOL(lua_isstring);
+EXPORT_SYMBOL(lua_newstate);
+EXPORT_SYMBOL(lua_newuserdata);
+EXPORT_SYMBOL(lua_next);
+EXPORT_SYMBOL(lua_pcallk);
+EXPORT_SYMBOL(lua_pushboolean);
+EXPORT_SYMBOL(lua_pushcclosure);
+EXPORT_SYMBOL(lua_pushfstring);
+EXPORT_SYMBOL(lua_pushinteger);
+EXPORT_SYMBOL(lua_pushlightuserdata);
+EXPORT_SYMBOL(lua_pushnil);
+EXPORT_SYMBOL(lua_pushnumber);
+EXPORT_SYMBOL(lua_pushstring);
+EXPORT_SYMBOL(lua_pushvalue);
+EXPORT_SYMBOL(lua_pushvfstring);
+EXPORT_SYMBOL(lua_remove);
+EXPORT_SYMBOL(lua_replace);
+EXPORT_SYMBOL(lua_setfield);
+EXPORT_SYMBOL(lua_setglobal);
+EXPORT_SYMBOL(lua_sethook);
+EXPORT_SYMBOL(lua_setmetatable);
+EXPORT_SYMBOL(lua_settable);
+EXPORT_SYMBOL(lua_settop);
+EXPORT_SYMBOL(lua_toboolean);
+EXPORT_SYMBOL(lua_tointegerx);
+EXPORT_SYMBOL(lua_tolstring);
+EXPORT_SYMBOL(lua_tonumberx);
+EXPORT_SYMBOL(lua_touserdata);
+EXPORT_SYMBOL(lua_type);
+EXPORT_SYMBOL(lua_typename);
diff --git a/sys/contrib/openzfs/module/lua/lapi.h b/sys/contrib/openzfs/module/lua/lapi.h
new file mode 100644
index 000000000000..509f46f692a7
--- /dev/null
+++ b/sys/contrib/openzfs/module/lua/lapi.h
@@ -0,0 +1,26 @@
+/* BEGIN CSTYLED */
+/*
+** $Id: lapi.h,v 2.7.1.1 2013/04/12 18:48:47 roberto Exp $
+** Auxiliary functions from Lua API
+** See Copyright Notice in lua.h
+*/
+
+#ifndef lapi_h
+#define lapi_h
+
+
+#include "llimits.h"
+#include "lstate.h"
+
+#define api_incr_top(L) {L->top++; api_check(L, L->top <= L->ci->top, \
+ "stack overflow");}
+
+#define adjustresults(L,nres) \
+ { if ((nres) == LUA_MULTRET && L->ci->top < L->top) L->ci->top = L->top; }
+
+#define api_checknelems(L,n) api_check(L, (n) < (L->top - L->ci->func), \
+ "not enough elements in the stack")
+
+
+#endif
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/lua/lauxlib.c b/sys/contrib/openzfs/module/lua/lauxlib.c
new file mode 100644
index 000000000000..1e0356e7c00e
--- /dev/null
+++ b/sys/contrib/openzfs/module/lua/lauxlib.c
@@ -0,0 +1,800 @@
+/* BEGIN CSTYLED */
+/*
+** $Id: lauxlib.c,v 1.248.1.1 2013/04/12 18:48:47 roberto Exp $
+** Auxiliary functions for building Lua libraries
+** See Copyright Notice in lua.h
+*/
+
+
+/* This file uses only the official API of Lua.
+** Any function declared here could be written as an application function.
+*/
+
+#define lauxlib_c
+#define LUA_LIB
+
+#include <sys/lua/lua.h>
+
+#include <sys/lua/lauxlib.h>
+
+
+/*
+** {======================================================
+** Traceback
+** =======================================================
+*/
+
+
+#define LEVELS1 12 /* size of the first part of the stack */
+#define LEVELS2 10 /* size of the second part of the stack */
+
+
+
+/*
+** search for 'objidx' in table at index -1.
+** return 1 + string at top if find a good name.
+*/
+static int findfield (lua_State *L, int objidx, int level) {
+ if (level == 0 || !lua_istable(L, -1))
+ return 0; /* not found */
+ lua_pushnil(L); /* start 'next' loop */
+ while (lua_next(L, -2)) { /* for each pair in table */
+ if (lua_type(L, -2) == LUA_TSTRING) { /* ignore non-string keys */
+ if (lua_rawequal(L, objidx, -1)) { /* found object? */
+ lua_pop(L, 1); /* remove value (but keep name) */
+ return 1;
+ }
+ else if (findfield(L, objidx, level - 1)) { /* try recursively */
+ lua_remove(L, -2); /* remove table (but keep name) */
+ lua_pushliteral(L, ".");
+ lua_insert(L, -2); /* place '.' between the two names */
+ lua_concat(L, 3);
+ return 1;
+ }
+ }
+ lua_pop(L, 1); /* remove value */
+ }
+ return 0; /* not found */
+}
+
+
+static int pushglobalfuncname (lua_State *L, lua_Debug *ar) {
+ int top = lua_gettop(L);
+ lua_getinfo(L, "f", ar); /* push function */
+ lua_pushglobaltable(L);
+ if (findfield(L, top + 1, 2)) {
+ lua_copy(L, -1, top + 1); /* move name to proper place */
+ lua_pop(L, 2); /* remove pushed values */
+ return 1;
+ }
+ else {
+ lua_settop(L, top); /* remove function and global table */
+ return 0;
+ }
+}
+
+
+static void pushfuncname (lua_State *L, lua_Debug *ar) {
+ if (*ar->namewhat != '\0') /* is there a name? */
+ lua_pushfstring(L, "function " LUA_QS, ar->name);
+ else if (*ar->what == 'm') /* main? */
+ lua_pushliteral(L, "main chunk");
+ else if (*ar->what == 'C') {
+ if (pushglobalfuncname(L, ar)) {
+ lua_pushfstring(L, "function " LUA_QS, lua_tostring(L, -1));
+ lua_remove(L, -2); /* remove name */
+ }
+ else
+ lua_pushliteral(L, "?");
+ }
+ else
+ lua_pushfstring(L, "function <%s:%d>", ar->short_src, ar->linedefined);
+}
+
+
+static int countlevels (lua_State *L) {
+ lua_Debug ar;
+ int li = 1, le = 1;
+ /* find an upper bound */
+ while (lua_getstack(L, le, &ar)) { li = le; le *= 2; }
+ /* do a binary search */
+ while (li < le) {
+ int m = (li + le)/2;
+ if (lua_getstack(L, m, &ar)) li = m + 1;
+ else le = m;
+ }
+ return le - 1;
+}
+
+
+LUALIB_API void luaL_traceback (lua_State *L, lua_State *L1,
+ const char *msg, int level) {
+ lua_Debug ar;
+ int top = lua_gettop(L);
+ int numlevels = countlevels(L1);
+ int mark = (numlevels > LEVELS1 + LEVELS2) ? LEVELS1 : 0;
+ if (msg) lua_pushfstring(L, "%s\n", msg);
+ lua_pushliteral(L, "stack traceback:");
+ while (lua_getstack(L1, level++, &ar)) {
+ if (level == mark) { /* too many levels? */
+ lua_pushliteral(L, "\n\t..."); /* add a '...' */
+ level = numlevels - LEVELS2; /* and skip to last ones */
+ }
+ else {
+ lua_getinfo(L1, "Slnt", &ar);
+ lua_pushfstring(L, "\n\t%s:", ar.short_src);
+ if (ar.currentline > 0)
+ lua_pushfstring(L, "%d:", ar.currentline);
+ lua_pushliteral(L, " in ");
+ pushfuncname(L, &ar);
+ if (ar.istailcall)
+ lua_pushliteral(L, "\n\t(...tail calls...)");
+ lua_concat(L, lua_gettop(L) - top);
+ }
+ }
+ lua_concat(L, lua_gettop(L) - top);
+}
+
+/* }====================================================== */
+
+
+/*
+** {======================================================
+** Error-report functions
+** =======================================================
+*/
+
+LUALIB_API int luaL_argerror (lua_State *L, int narg, const char *extramsg) {
+ lua_Debug ar;
+ if (!lua_getstack(L, 0, &ar)) /* no stack frame? */
+ return luaL_error(L, "bad argument #%d (%s)", narg, extramsg);
+ lua_getinfo(L, "n", &ar);
+ if (strcmp(ar.namewhat, "method") == 0) {
+ narg--; /* do not count `self' */
+ if (narg == 0) /* error is in the self argument itself? */
+ return luaL_error(L, "calling " LUA_QS " on bad self (%s)",
+ ar.name, extramsg);
+ }
+ if (ar.name == NULL)
+ ar.name = (pushglobalfuncname(L, &ar)) ? lua_tostring(L, -1) : "?";
+ return luaL_error(L, "bad argument #%d to " LUA_QS " (%s)",
+ narg, ar.name, extramsg);
+}
+
+
+static int typeerror (lua_State *L, int narg, const char *tname) {
+ const char *msg = lua_pushfstring(L, "%s expected, got %s",
+ tname, luaL_typename(L, narg));
+ return luaL_argerror(L, narg, msg);
+}
+
+
+static void tag_error (lua_State *L, int narg, int tag) {
+ typeerror(L, narg, lua_typename(L, tag));
+}
+
+
+LUALIB_API void luaL_where (lua_State *L, int level) {
+ lua_Debug ar;
+ if (lua_getstack(L, level, &ar)) { /* check function at level */
+ lua_getinfo(L, "Sl", &ar); /* get info about it */
+ if (ar.currentline > 0) { /* is there info? */
+ lua_pushfstring(L, "%s:%d: ", ar.short_src, ar.currentline);
+ return;
+ }
+ }
+ lua_pushliteral(L, ""); /* else, no information available... */
+}
+
+
+LUALIB_API int luaL_error (lua_State *L, const char *fmt, ...) {
+ va_list argp;
+ va_start(argp, fmt);
+ luaL_where(L, 1);
+ lua_pushvfstring(L, fmt, argp);
+ va_end(argp);
+ lua_concat(L, 2);
+ return lua_error(L);
+}
+
+
+#if !defined(inspectstat) /* { */
+
+#if defined(LUA_USE_POSIX)
+
+#include <sys/wait.h>
+
+/*
+** use appropriate macros to interpret 'pclose' return status
+*/
+#define inspectstat(stat,what) \
+ if (WIFEXITED(stat)) { stat = WEXITSTATUS(stat); } \
+ else if (WIFSIGNALED(stat)) { stat = WTERMSIG(stat); what = "signal"; }
+
+#else
+
+#define inspectstat(stat,what) /* no op */
+
+#endif
+
+#endif /* } */
+
+
+/* }====================================================== */
+
+
+/*
+** {======================================================
+** Userdata's metatable manipulation
+** =======================================================
+*/
+
+LUALIB_API int luaL_newmetatable (lua_State *L, const char *tname) {
+ luaL_getmetatable(L, tname); /* try to get metatable */
+ if (!lua_isnil(L, -1)) /* name already in use? */
+ return 0; /* leave previous value on top, but return 0 */
+ lua_pop(L, 1);
+ lua_newtable(L); /* create metatable */
+ lua_pushvalue(L, -1);
+ lua_setfield(L, LUA_REGISTRYINDEX, tname); /* registry.name = metatable */
+ return 1;
+}
+
+
+LUALIB_API void luaL_setmetatable (lua_State *L, const char *tname) {
+ luaL_getmetatable(L, tname);
+ lua_setmetatable(L, -2);
+}
+
+
+LUALIB_API void *luaL_testudata (lua_State *L, int ud, const char *tname) {
+ void *p = lua_touserdata(L, ud);
+ if (p != NULL) { /* value is a userdata? */
+ if (lua_getmetatable(L, ud)) { /* does it have a metatable? */
+ luaL_getmetatable(L, tname); /* get correct metatable */
+ if (!lua_rawequal(L, -1, -2)) /* not the same? */
+ p = NULL; /* value is a userdata with wrong metatable */
+ lua_pop(L, 2); /* remove both metatables */
+ return p;
+ }
+ }
+ return NULL; /* value is not a userdata with a metatable */
+}
+
+
+LUALIB_API void *luaL_checkudata (lua_State *L, int ud, const char *tname) {
+ void *p = luaL_testudata(L, ud, tname);
+ if (p == NULL) typeerror(L, ud, tname);
+ return p;
+}
+
+/* }====================================================== */
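Editor's note: the three functions above implement the standard pattern for typed userdata: register a metatable under a name, attach it to freshly created userdata, and verify it on every access. A short embedder-side sketch of that pattern; the type name "MyLib.Point" and the struct are invented for illustration, and the include paths follow the in-tree layout used in this diff:

  #include <sys/lua/lua.h>
  #include <sys/lua/lauxlib.h>

  typedef struct { double x, y; } Point;     /* hypothetical userdata payload */

  static int point_new (lua_State *L) {
    Point *p = (Point *)lua_newuserdata(L, sizeof(Point));
    p->x = luaL_checknumber(L, 1);
    p->y = luaL_checknumber(L, 2);
    luaL_setmetatable(L, "MyLib.Point");     /* attach the registered metatable */
    return 1;
  }

  static int point_getx (lua_State *L) {
    Point *p = (Point *)luaL_checkudata(L, 1, "MyLib.Point");  /* type check */
    lua_pushnumber(L, p->x);
    return 1;
  }

  static void point_register (lua_State *L) {
    luaL_newmetatable(L, "MyLib.Point");     /* registry["MyLib.Point"] = mt */
    lua_pop(L, 1);                           /* metatable not needed on stack */
  }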
+
+
+/*
+** {======================================================
+** Argument check functions
+** =======================================================
+*/
+
+LUALIB_API int luaL_checkoption (lua_State *L, int narg, const char *def,
+ const char *const lst[]) {
+ const char *name = (def) ? luaL_optstring(L, narg, def) :
+ luaL_checkstring(L, narg);
+ int i;
+ for (i=0; lst[i]; i++)
+ if (strcmp(lst[i], name) == 0)
+ return i;
+ return luaL_argerror(L, narg,
+ lua_pushfstring(L, "invalid option " LUA_QS, name));
+}
+
+
+LUALIB_API void luaL_checkstack (lua_State *L, int space, const char *msg) {
+ /* keep some extra space to run error routines, if needed */
+ const int extra = LUA_MINSTACK;
+ if (!lua_checkstack(L, space + extra)) {
+ if (msg)
+ luaL_error(L, "stack overflow (%s)", msg);
+ else
+ luaL_error(L, "stack overflow");
+ }
+}
+
+
+LUALIB_API void luaL_checktype (lua_State *L, int narg, int t) {
+ if (lua_type(L, narg) != t)
+ tag_error(L, narg, t);
+}
+
+
+LUALIB_API void luaL_checkany (lua_State *L, int narg) {
+ if (lua_type(L, narg) == LUA_TNONE)
+ luaL_argerror(L, narg, "value expected");
+}
+
+
+LUALIB_API const char *luaL_checklstring (lua_State *L, int narg, size_t *len) {
+ const char *s = lua_tolstring(L, narg, len);
+ if (!s) tag_error(L, narg, LUA_TSTRING);
+ return s;
+}
+
+
+LUALIB_API const char *luaL_optlstring (lua_State *L, int narg,
+ const char *def, size_t *len) {
+ if (lua_isnoneornil(L, narg)) {
+ if (len)
+ *len = (def ? strlen(def) : 0);
+ return def;
+ }
+ else return luaL_checklstring(L, narg, len);
+}
+
+
+LUALIB_API lua_Number luaL_checknumber (lua_State *L, int narg) {
+ int isnum;
+ lua_Number d = lua_tonumberx(L, narg, &isnum);
+ if (!isnum)
+ tag_error(L, narg, LUA_TNUMBER);
+ return d;
+}
+
+
+LUALIB_API lua_Number luaL_optnumber (lua_State *L, int narg, lua_Number def) {
+ return luaL_opt(L, luaL_checknumber, narg, def);
+}
+
+
+LUALIB_API lua_Integer luaL_checkinteger (lua_State *L, int narg) {
+ int isnum;
+ lua_Integer d = lua_tointegerx(L, narg, &isnum);
+ if (!isnum)
+ tag_error(L, narg, LUA_TNUMBER);
+ return d;
+}
+
+
+LUALIB_API lua_Unsigned luaL_checkunsigned (lua_State *L, int narg) {
+ int isnum;
+ lua_Unsigned d = lua_tounsignedx(L, narg, &isnum);
+ if (!isnum)
+ tag_error(L, narg, LUA_TNUMBER);
+ return d;
+}
+
+
+LUALIB_API lua_Integer luaL_optinteger (lua_State *L, int narg,
+ lua_Integer def) {
+ return luaL_opt(L, luaL_checkinteger, narg, def);
+}
+
+
+LUALIB_API lua_Unsigned luaL_optunsigned (lua_State *L, int narg,
+ lua_Unsigned def) {
+ return luaL_opt(L, luaL_checkunsigned, narg, def);
+}
+
+/* }====================================================== */
+
+
+/*
+** {======================================================
+** Generic Buffer manipulation
+** =======================================================
+*/
+
+/*
+** check whether buffer is using a userdata on the stack as a temporary
+** buffer
+*/
+#define buffonstack(B) ((B)->b != (B)->initb)
+
+
+/*
+** returns a pointer to a free area with at least 'sz' bytes
+*/
+LUALIB_API char *luaL_prepbuffsize (luaL_Buffer *B, size_t sz) {
+ lua_State *L = B->L;
+ if (B->size - B->n < sz) { /* not enough space? */
+ char *newbuff;
+ size_t newsize = B->size * 2; /* double buffer size */
+ if (newsize - B->n < sz) /* not big enough? */
+ newsize = B->n + sz;
+ if (newsize < B->n || newsize - B->n < sz)
+ luaL_error(L, "buffer too large");
+ /* create larger buffer */
+ newbuff = (char *)lua_newuserdata(L, newsize * sizeof(char));
+ /* move content to new buffer */
+ memcpy(newbuff, B->b, B->n * sizeof(char));
+ if (buffonstack(B))
+ lua_remove(L, -2); /* remove old buffer */
+ B->b = newbuff;
+ B->size = newsize;
+ }
+ return &B->b[B->n];
+}
+
+
+LUALIB_API void luaL_addlstring (luaL_Buffer *B, const char *s, size_t l) {
+ char *b = luaL_prepbuffsize(B, l);
+ memcpy(b, s, l * sizeof(char));
+ luaL_addsize(B, l);
+}
+
+
+LUALIB_API void luaL_addstring (luaL_Buffer *B, const char *s) {
+ luaL_addlstring(B, s, strlen(s));
+}
+
+
+LUALIB_API void luaL_pushresult (luaL_Buffer *B) {
+ lua_State *L = B->L;
+ lua_pushlstring(L, B->b, B->n);
+ if (buffonstack(B))
+ lua_remove(L, -2); /* remove old buffer */
+}
+
+
+LUALIB_API void luaL_pushresultsize (luaL_Buffer *B, size_t sz) {
+ luaL_addsize(B, sz);
+ luaL_pushresult(B);
+}
+
+
+LUALIB_API void luaL_addvalue (luaL_Buffer *B) {
+ lua_State *L = B->L;
+ size_t l;
+ const char *s = lua_tolstring(L, -1, &l);
+ if (buffonstack(B))
+ lua_insert(L, -2); /* put value below buffer */
+ luaL_addlstring(B, s, l);
+ lua_remove(L, (buffonstack(B)) ? -2 : -1); /* remove value */
+}
+
+
+LUALIB_API void luaL_buffinit (lua_State *L, luaL_Buffer *B) {
+ B->L = L;
+ B->b = B->initb;
+ B->n = 0;
+ B->size = LUAL_BUFFERSIZE;
+}
+
+
+LUALIB_API char *luaL_buffinitsize (lua_State *L, luaL_Buffer *B, size_t sz) {
+ luaL_buffinit(L, B);
+ return luaL_prepbuffsize(B, sz);
+}
+
+/* }====================================================== */
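Editor's note: the buffer API above accumulates pieces in a private LUAL_BUFFERSIZE array and spills to a userdata on the stack only when that area overflows, so short results never allocate. A hypothetical C function using it to join two string arguments with a separator (all names invented):

  #include <sys/lua/lua.h>
  #include <sys/lua/lauxlib.h>

  /* join(a, b) -> "a,b": illustrative use of luaL_Buffer. */
  static int l_join (lua_State *L) {
    luaL_Buffer b;
    luaL_buffinit(L, &b);
    luaL_addstring(&b, luaL_checkstring(L, 1));
    luaL_addchar(&b, ',');                   /* single-byte append (macro) */
    luaL_addstring(&b, luaL_checkstring(L, 2));
    luaL_pushresult(&b);                     /* collapse into one Lua string */
    return 1;
  }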
+
+
+/*
+** {======================================================
+** Reference system
+** =======================================================
+*/
+
+/* index of free-list header */
+#define freelist 0
+
+
+LUALIB_API int luaL_ref (lua_State *L, int t) {
+ int ref;
+ if (lua_isnil(L, -1)) {
+ lua_pop(L, 1); /* remove from stack */
+ return LUA_REFNIL; /* `nil' has a unique fixed reference */
+ }
+ t = lua_absindex(L, t);
+ lua_rawgeti(L, t, freelist); /* get first free element */
+ ref = (int)lua_tointeger(L, -1); /* ref = t[freelist] */
+ lua_pop(L, 1); /* remove it from stack */
+ if (ref != 0) { /* any free element? */
+ lua_rawgeti(L, t, ref); /* remove it from list */
+ lua_rawseti(L, t, freelist); /* (t[freelist] = t[ref]) */
+ }
+ else /* no free elements */
+ ref = (int)lua_rawlen(L, t) + 1; /* get a new reference */
+ lua_rawseti(L, t, ref);
+ return ref;
+}
+
+
+LUALIB_API void luaL_unref (lua_State *L, int t, int ref) {
+ if (ref >= 0) {
+ t = lua_absindex(L, t);
+ lua_rawgeti(L, t, freelist);
+ lua_rawseti(L, t, ref); /* t[ref] = t[freelist] */
+ lua_pushinteger(L, ref);
+ lua_rawseti(L, t, freelist); /* t[freelist] = ref */
+ }
+}
+
+/* }====================================================== */
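Editor's note: luaL_ref/luaL_unref recycle integer keys through a free list threaded through the same table, with slot 0 (the 'freelist' index) holding the head, so released references are reused before the table grows. A standalone sketch of that bookkeeping over a plain C array, purely illustrative and without bounds checking:

  #include <stdio.h>

  #define NSLOTS 16
  static int slots[NSLOTS];  /* slots[0] is the free-list head (0 = empty) */
  static int used = 0;       /* highest slot handed out so far */

  static int ref_alloc(void) {
    int r = slots[0];
    if (r != 0)
      slots[0] = slots[r];   /* pop a recycled slot off the free list */
    else
      r = ++used;            /* nothing to recycle: extend the array */
    return r;
  }

  static void ref_free(int r) {
    slots[r] = slots[0];     /* push the slot onto the free list */
    slots[0] = r;
  }

  int main(void) {
    int a = ref_alloc(), b = ref_alloc();
    ref_free(a);
    printf("%d %d %d\n", a, b, ref_alloc());  /* prints "1 2 1": slot reused */
    return 0;
  }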
+
+
+/*
+** {======================================================
+** Load functions
+** =======================================================
+*/
+
+typedef struct LoadS {
+ const char *s;
+ size_t size;
+} LoadS;
+
+
+static const char *getS (lua_State *L, void *ud, size_t *size) {
+ LoadS *ls = (LoadS *)ud;
+ (void)L; /* not used */
+ if (ls->size == 0) return NULL;
+ *size = ls->size;
+ ls->size = 0;
+ return ls->s;
+}
+
+
+LUALIB_API int luaL_loadbufferx (lua_State *L, const char *buff, size_t size,
+ const char *name, const char *mode) {
+ LoadS ls;
+ ls.s = buff;
+ ls.size = size;
+ return lua_load(L, getS, &ls, name, mode);
+}
+
+
+LUALIB_API int luaL_loadstring (lua_State *L, const char *s) {
+ return luaL_loadbuffer(L, s, strlen(s), s);
+}
+
+/* }====================================================== */
+
+
+
+LUALIB_API int luaL_getmetafield (lua_State *L, int obj, const char *event) {
+ if (!lua_getmetatable(L, obj)) /* no metatable? */
+ return 0;
+ lua_pushstring(L, event);
+ lua_rawget(L, -2);
+ if (lua_isnil(L, -1)) {
+ lua_pop(L, 2); /* remove metatable and metafield */
+ return 0;
+ }
+ else {
+ lua_remove(L, -2); /* remove only metatable */
+ return 1;
+ }
+}
+
+
+LUALIB_API int luaL_callmeta (lua_State *L, int obj, const char *event) {
+ obj = lua_absindex(L, obj);
+ if (!luaL_getmetafield(L, obj, event)) /* no metafield? */
+ return 0;
+ lua_pushvalue(L, obj);
+ lua_call(L, 1, 1);
+ return 1;
+}
+
+
+LUALIB_API int luaL_len (lua_State *L, int idx) {
+ int l;
+ int isnum;
+ lua_len(L, idx);
+ l = (int)lua_tointegerx(L, -1, &isnum);
+ if (!isnum)
+ luaL_error(L, "object length is not a number");
+ lua_pop(L, 1); /* remove length value */
+ return l;
+}
+
+
+LUALIB_API const char *luaL_tolstring (lua_State *L, int idx, size_t *len) {
+ if (!luaL_callmeta(L, idx, "__tostring")) { /* no metafield? */
+ switch (lua_type(L, idx)) {
+ case LUA_TNUMBER:
+ case LUA_TSTRING:
+ lua_pushvalue(L, idx);
+ break;
+ case LUA_TBOOLEAN:
+ lua_pushstring(L, (lua_toboolean(L, idx) ? "true" : "false"));
+ break;
+ case LUA_TNIL:
+ lua_pushliteral(L, "nil");
+ break;
+ default:
+ lua_pushfstring(L, "%s: %p", luaL_typename(L, idx),
+ lua_topointer(L, idx));
+ break;
+ }
+ }
+ return lua_tolstring(L, -1, len);
+}
+
+
+/*
+** {======================================================
+** Compatibility with 5.1 module functions
+** =======================================================
+*/
+#if defined(LUA_COMPAT_MODULE)
+
+static const char *luaL_findtable (lua_State *L, int idx,
+ const char *fname, int szhint) {
+ const char *e;
+ if (idx) lua_pushvalue(L, idx);
+ do {
+ e = strchr(fname, '.');
+ if (e == NULL) e = fname + strlen(fname);
+ lua_pushlstring(L, fname, e - fname);
+ lua_rawget(L, -2);
+ if (lua_isnil(L, -1)) { /* no such field? */
+ lua_pop(L, 1); /* remove this nil */
+ lua_createtable(L, 0, (*e == '.' ? 1 : szhint)); /* new table for field */
+ lua_pushlstring(L, fname, e - fname);
+ lua_pushvalue(L, -2);
+ lua_settable(L, -4); /* set new table into field */
+ }
+ else if (!lua_istable(L, -1)) { /* field has a non-table value? */
+ lua_pop(L, 2); /* remove table and value */
+ return fname; /* return problematic part of the name */
+ }
+ lua_remove(L, -2); /* remove previous table */
+ fname = e + 1;
+ } while (*e == '.');
+ return NULL;
+}
+
+
+/*
+** Count number of elements in a luaL_Reg list.
+*/
+static int libsize (const luaL_Reg *l) {
+ int size = 0;
+ for (; l && l->name; l++) size++;
+ return size;
+}
+
+
+/*
+** Find or create a module table with a given name. The function
+** first looks in the _LOADED table and, if that fails, tries a
+** global variable with that name. In either case, it leaves the
+** module table on the stack.
+*/
+LUALIB_API void luaL_pushmodule (lua_State *L, const char *modname,
+ int sizehint) {
+ luaL_findtable(L, LUA_REGISTRYINDEX, "_LOADED", 1); /* get _LOADED table */
+ lua_getfield(L, -1, modname); /* get _LOADED[modname] */
+ if (!lua_istable(L, -1)) { /* not found? */
+ lua_pop(L, 1); /* remove previous result */
+ /* try global variable (and create one if it does not exist) */
+ lua_pushglobaltable(L);
+ if (luaL_findtable(L, 0, modname, sizehint) != NULL)
+ luaL_error(L, "name conflict for module " LUA_QS, modname);
+ lua_pushvalue(L, -1);
+ lua_setfield(L, -3, modname); /* _LOADED[modname] = new table */
+ }
+ lua_remove(L, -2); /* remove _LOADED table */
+}
+
+
+LUALIB_API void luaL_openlib (lua_State *L, const char *libname,
+ const luaL_Reg *l, int nup) {
+ luaL_checkversion(L);
+ if (libname) {
+ luaL_pushmodule(L, libname, libsize(l)); /* get/create library table */
+ lua_insert(L, -(nup + 1)); /* move library table to below upvalues */
+ }
+ if (l)
+ luaL_setfuncs(L, l, nup);
+ else
+ lua_pop(L, nup); /* remove upvalues */
+}
+
+#endif
+/* }====================================================== */
+
+/*
+** set functions from list 'l' into table at top - 'nup'; each
+** function gets the 'nup' elements at the top as upvalues.
+** Returns with only the table on the stack.
+*/
+LUALIB_API void luaL_setfuncs (lua_State *L, const luaL_Reg *l, int nup) {
+ luaL_checkversion(L);
+ luaL_checkstack(L, nup, "too many upvalues");
+ for (; l->name != NULL; l++) { /* fill the table with given functions */
+ int i;
+ for (i = 0; i < nup; i++) /* copy upvalues to the top */
+ lua_pushvalue(L, -nup);
+ lua_pushcclosure(L, l->func, nup); /* closure with those upvalues */
+ lua_setfield(L, -(nup + 2), l->name);
+ }
+ lua_pop(L, nup); /* remove upvalues */
+}
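Editor's note: luaL_setfuncs is what lets several library functions share state through upvalues: the caller pushes the shared values once, and every registered closure receives its own copies of them. A hypothetical module opener registering two functions that share one upvalue (a table holding a counter); all names are invented:

  #include <sys/lua/lua.h>
  #include <sys/lua/lauxlib.h>

  static int l_bump (lua_State *L) {
    lua_getfield(L, lua_upvalueindex(1), "n");  /* shared state table */
    lua_pushinteger(L, lua_tointeger(L, -1) + 1);
    lua_setfield(L, lua_upvalueindex(1), "n");
    return 0;
  }

  static int l_value (lua_State *L) {
    lua_getfield(L, lua_upvalueindex(1), "n");
    return 1;
  }

  static const luaL_Reg counter_funcs[] = {
    {"bump", l_bump},
    {"value", l_value},
    {NULL, NULL}
  };

  static int luaopen_counter (lua_State *L) {   /* hypothetical opener */
    lua_newtable(L);                    /* module table */
    lua_newtable(L);                    /* shared upvalue */
    lua_pushinteger(L, 0);
    lua_setfield(L, -2, "n");           /* upvalue.n = 0 */
    luaL_setfuncs(L, counter_funcs, 1); /* each closure gets the upvalue */
    return 1;                           /* only the module table remains */
  }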
+
+
+/*
+** ensure that stack[idx][fname] has a table and push that table
+** into the stack
+*/
+LUALIB_API int luaL_getsubtable (lua_State *L, int idx, const char *fname) {
+ lua_getfield(L, idx, fname);
+ if (lua_istable(L, -1)) return 1; /* table already there */
+ else {
+ lua_pop(L, 1); /* remove previous result */
+ idx = lua_absindex(L, idx);
+ lua_newtable(L);
+ lua_pushvalue(L, -1); /* copy to be left at top */
+ lua_setfield(L, idx, fname); /* assign new table to field */
+ return 0; /* false, because did not find table there */
+ }
+}
+
+
+/*
+** stripped-down 'require'. Calls 'openf' to open a module,
+** registers the result in the 'package.loaded' table and, if 'glb'
+** is true, also registers the result in the global table.
+** Leaves the resulting module on top of the stack.
+*/
+LUALIB_API void luaL_requiref (lua_State *L, const char *modname,
+ lua_CFunction openf, int glb) {
+ lua_pushcfunction(L, openf);
+ lua_pushstring(L, modname); /* argument to open function */
+ lua_call(L, 1, 1); /* open module */
+ luaL_getsubtable(L, LUA_REGISTRYINDEX, "_LOADED");
+ lua_pushvalue(L, -2); /* make copy of module (call result) */
+ lua_setfield(L, -2, modname); /* _LOADED[modname] = module */
+ lua_pop(L, 1); /* remove _LOADED table */
+ if (glb) {
+ lua_pushvalue(L, -1); /* copy of 'mod' */
+ lua_setglobal(L, modname); /* _G[modname] = module */
+ }
+}
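Editor's note: luaL_requiref gives embedders the stripped-down 'require' described above. A hypothetical call sequence loading the coroutine library (added later in this diff, lcorolib.c) into a fresh state and exposing it as a global; the wrapper name is invented:

  #include <sys/lua/lua.h>
  #include <sys/lua/lauxlib.h>
  #include <sys/lua/lualib.h>

  static void open_coroutine_lib (lua_State *L) {
    luaL_requiref(L, "coroutine", luaopen_coroutine, 1 /* glb */);
    lua_pop(L, 1);  /* luaL_requiref leaves the module table on the stack */
  }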
+
+
+LUALIB_API const char *luaL_gsub (lua_State *L, const char *s, const char *p,
+ const char *r) {
+ const char *wild;
+ size_t l = strlen(p);
+ luaL_Buffer b;
+ luaL_buffinit(L, &b);
+ while ((wild = strstr(s, p)) != NULL) {
+ luaL_addlstring(&b, s, wild - s); /* push prefix */
+ luaL_addstring(&b, r); /* push replacement in place of pattern */
+ s = wild + l; /* continue after `p' */
+ }
+ luaL_addstring(&b, s); /* push last suffix */
+ luaL_pushresult(&b);
+ return lua_tostring(L, -1);
+}
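Editor's note: luaL_gsub performs plain (non-pattern) substring replacement, interns the result as a Lua string, and returns a pointer that stays valid while that string remains on the stack. A hypothetical use converting a dotted module name into a path fragment:

  #include <sys/lua/lua.h>
  #include <sys/lua/lauxlib.h>

  /* "a.b.c" -> "a/b/c"; the resulting string is left on the stack. */
  static const char *dots_to_slashes (lua_State *L, const char *modname) {
    return luaL_gsub(L, modname, ".", "/");
  }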
+
+
+LUALIB_API void luaL_checkversion_ (lua_State *L, lua_Number ver) {
+ const lua_Number *v = lua_version(L);
+ if (v != lua_version(NULL))
+ luaL_error(L, "multiple Lua VMs detected");
+ else if (*v != ver)
+ luaL_error(L, "version mismatch: app. needs %f, Lua core provides %f",
+ ver, *v);
+ /* check conversions number -> integer types */
+ lua_pushnumber(L, -(lua_Number)0x1234);
+ if (lua_tointeger(L, -1) != -0x1234 ||
+ lua_tounsigned(L, -1) != (lua_Unsigned)-0x1234)
+ luaL_error(L, "bad conversion number->int;"
+ " must recompile Lua with proper settings");
+ lua_pop(L, 1);
+}
+
+#if defined(_KERNEL)
+
+EXPORT_SYMBOL(luaL_argerror);
+EXPORT_SYMBOL(luaL_error);
+EXPORT_SYMBOL(luaL_loadbufferx);
+EXPORT_SYMBOL(luaL_newmetatable);
+EXPORT_SYMBOL(luaL_traceback);
+
+#endif
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/lua/lbaselib.c b/sys/contrib/openzfs/module/lua/lbaselib.c
new file mode 100644
index 000000000000..854649a0fb4d
--- /dev/null
+++ b/sys/contrib/openzfs/module/lua/lbaselib.c
@@ -0,0 +1,296 @@
+/* BEGIN CSTYLED */
+/*
+** $Id: lbaselib.c,v 1.276.1.1 2013/04/12 18:48:47 roberto Exp $
+** Basic library
+** See Copyright Notice in lua.h
+*/
+
+/* The following built-in lua functions have been removed and are not available
+ * for use in ZFS channel programs:
+ *
+ * dofile
+ * loadfile
+ * load
+ * pcall
+ * print
+ * xpcall
+ */
+
+
+#define lbaselib_c
+#define LUA_LIB
+
+#include <sys/lua/lua.h>
+
+#include <sys/lua/lauxlib.h>
+#include <sys/lua/lualib.h>
+
+#define SPACECHARS " \f\n\r\t\v"
+
+static int luaB_tonumber (lua_State *L) {
+ if (lua_isnoneornil(L, 2)) { /* standard conversion */
+ int isnum;
+ lua_Number n = lua_tonumberx(L, 1, &isnum);
+ if (isnum) {
+ lua_pushnumber(L, n);
+ return 1;
+ } /* else not a number; must be something */
+ luaL_checkany(L, 1);
+ }
+ else {
+ size_t l;
+ const char *s = luaL_checklstring(L, 1, &l);
+ const char *e = s + l; /* end point for 's' */
+ int base = luaL_checkint(L, 2);
+ int neg = 0;
+ luaL_argcheck(L, 2 <= base && base <= 36, 2, "base out of range");
+ s += strspn(s, SPACECHARS); /* skip initial spaces */
+ if (*s == '-') { s++; neg = 1; } /* handle sign */
+ else if (*s == '+') s++;
+ if (isalnum((unsigned char)*s)) {
+ lua_Number n = 0;
+ do {
+ int digit = (isdigit((unsigned char)*s)) ? *s - '0'
+ : toupper((unsigned char)*s) - 'A' + 10;
+ if (digit >= base) break; /* invalid numeral; force a fail */
+ n = n * (lua_Number)base + (lua_Number)digit;
+ s++;
+ } while (isalnum((unsigned char)*s));
+ s += strspn(s, SPACECHARS); /* skip trailing spaces */
+ if (s == e) { /* no invalid trailing characters? */
+ lua_pushnumber(L, (neg) ? -n : n);
+ return 1;
+ } /* else not a number */
+ } /* else not a number */
+ }
+ lua_pushnil(L); /* not a number */
+ return 1;
+}
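Editor's note: when an explicit base is given, luaB_tonumber parses digits by hand, mapping '0'-'9' and letters to values and rejecting any digit >= base. A standalone sketch of that digit loop for a plain C string, with simplified sign/whitespace handling and invented names:

  #include <ctype.h>
  #include <stdio.h>

  /* Parse 's' as an unsigned integer in 'base' (2..36); returns -1 if there
   * is no digit, an invalid digit, or trailing junk. */
  static long parse_in_base(const char *s, int base) {
    long n = 0;
    int any = 0;
    for (; isalnum((unsigned char)*s); s++, any = 1) {
      int digit = isdigit((unsigned char)*s) ? *s - '0'
          : toupper((unsigned char)*s) - 'A' + 10;
      if (digit >= base)
        return -1;                /* e.g. '9' is not an octal digit */
      n = n * base + digit;
    }
    return (any && *s == '\0') ? n : -1;
  }

  int main(void) {
    printf("%ld %ld\n", parse_in_base("ff", 16), parse_in_base("19", 8));
    /* prints "255 -1" */
    return 0;
  }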
+
+
+static int luaB_error (lua_State *L) {
+ int level = luaL_optint(L, 2, 1);
+ lua_settop(L, 1);
+ if (lua_isstring(L, 1) && level > 0) { /* add extra information? */
+ luaL_where(L, level);
+ lua_pushvalue(L, 1);
+ lua_concat(L, 2);
+ }
+ return lua_error(L);
+}
+
+
+static int luaB_getmetatable (lua_State *L) {
+ luaL_checkany(L, 1);
+ if (!lua_getmetatable(L, 1)) {
+ lua_pushnil(L);
+ return 1; /* no metatable */
+ }
+ luaL_getmetafield(L, 1, "__metatable");
+ return 1; /* returns either __metatable field (if present) or metatable */
+}
+
+
+static int luaB_setmetatable (lua_State *L) {
+ int t = lua_type(L, 2);
+ luaL_checktype(L, 1, LUA_TTABLE);
+ luaL_argcheck(L, t == LUA_TNIL || t == LUA_TTABLE, 2,
+ "nil or table expected");
+ if (luaL_getmetafield(L, 1, "__metatable"))
+ return luaL_error(L, "cannot change a protected metatable");
+ lua_settop(L, 2);
+ lua_setmetatable(L, 1);
+ return 1;
+}
+
+
+static int luaB_rawequal (lua_State *L) {
+ luaL_checkany(L, 1);
+ luaL_checkany(L, 2);
+ lua_pushboolean(L, lua_rawequal(L, 1, 2));
+ return 1;
+}
+
+
+static int luaB_rawlen (lua_State *L) {
+ int t = lua_type(L, 1);
+ luaL_argcheck(L, t == LUA_TTABLE || t == LUA_TSTRING, 1,
+ "table or string expected");
+ lua_pushinteger(L, lua_rawlen(L, 1));
+ return 1;
+}
+
+
+static int luaB_rawget (lua_State *L) {
+ luaL_checktype(L, 1, LUA_TTABLE);
+ luaL_checkany(L, 2);
+ lua_settop(L, 2);
+ lua_rawget(L, 1);
+ return 1;
+}
+
+static int luaB_rawset (lua_State *L) {
+ luaL_checktype(L, 1, LUA_TTABLE);
+ luaL_checkany(L, 2);
+ luaL_checkany(L, 3);
+ lua_settop(L, 3);
+ lua_rawset(L, 1);
+ return 1;
+}
+
+
+static int luaB_collectgarbage (lua_State *L) {
+ static const char *const opts[] = {"stop", "restart", "collect",
+ "count", "step", "setpause", "setstepmul",
+ "setmajorinc", "isrunning", "generational", "incremental", NULL};
+ static const int optsnum[] = {LUA_GCSTOP, LUA_GCRESTART, LUA_GCCOLLECT,
+ LUA_GCCOUNT, LUA_GCSTEP, LUA_GCSETPAUSE, LUA_GCSETSTEPMUL,
+ LUA_GCSETMAJORINC, LUA_GCISRUNNING, LUA_GCGEN, LUA_GCINC};
+ int o = optsnum[luaL_checkoption(L, 1, "collect", opts)];
+ int ex = luaL_optint(L, 2, 0);
+ int res = lua_gc(L, o, ex);
+ switch (o) {
+ case LUA_GCCOUNT: {
+ int b = lua_gc(L, LUA_GCCOUNTB, 0);
+ lua_pushnumber(L, res + ((lua_Number)b/1024));
+ lua_pushinteger(L, b);
+ return 2;
+ }
+ case LUA_GCSTEP: case LUA_GCISRUNNING: {
+ lua_pushboolean(L, res);
+ return 1;
+ }
+ default: {
+ lua_pushinteger(L, res);
+ return 1;
+ }
+ }
+}
+
+
+static int luaB_type (lua_State *L) {
+ luaL_checkany(L, 1);
+ lua_pushstring(L, luaL_typename(L, 1));
+ return 1;
+}
+
+
+static int pairsmeta (lua_State *L, const char *method, int iszero,
+ lua_CFunction iter) {
+ if (!luaL_getmetafield(L, 1, method)) { /* no metamethod? */
+ luaL_checktype(L, 1, LUA_TTABLE); /* argument must be a table */
+ lua_pushcfunction(L, iter); /* will return generator, */
+ lua_pushvalue(L, 1); /* state, */
+ if (iszero) lua_pushinteger(L, 0); /* and initial value */
+ else lua_pushnil(L);
+ }
+ else {
+ lua_pushvalue(L, 1); /* argument 'self' to metamethod */
+ lua_call(L, 1, 3); /* get 3 values from metamethod */
+ }
+ return 3;
+}
+
+
+static int luaB_next (lua_State *L) {
+ luaL_checktype(L, 1, LUA_TTABLE);
+ lua_settop(L, 2); /* create a 2nd argument if there isn't one */
+ if (lua_next(L, 1))
+ return 2;
+ else {
+ lua_pushnil(L);
+ return 1;
+ }
+}
+
+
+static int luaB_pairs (lua_State *L) {
+ return pairsmeta(L, "__pairs", 0, luaB_next);
+}
+
+
+static int ipairsaux (lua_State *L) {
+ int i = luaL_checkint(L, 2);
+ luaL_checktype(L, 1, LUA_TTABLE);
+ i++; /* next value */
+ lua_pushinteger(L, i);
+ lua_rawgeti(L, 1, i);
+ return (lua_isnil(L, -1)) ? 1 : 2;
+}
+
+
+static int luaB_ipairs (lua_State *L) {
+ return pairsmeta(L, "__ipairs", 1, ipairsaux);
+}
+
+
+static int luaB_assert (lua_State *L) {
+ if (!lua_toboolean(L, 1))
+ return luaL_error(L, "%s", luaL_optstring(L, 2, "assertion failed!"));
+ return lua_gettop(L);
+}
+
+
+static int luaB_select (lua_State *L) {
+ int n = lua_gettop(L);
+ if (lua_type(L, 1) == LUA_TSTRING && *lua_tostring(L, 1) == '#') {
+ lua_pushinteger(L, n-1);
+ return 1;
+ }
+ else {
+ int i = luaL_checkint(L, 1);
+ if (i < 0) i = n + i;
+ else if (i > n) i = n;
+ luaL_argcheck(L, 1 <= i, 1, "index out of range");
+ return n - i;
+ }
+}
+
+static int luaB_tostring (lua_State *L) {
+ luaL_checkany(L, 1);
+ luaL_tolstring(L, 1, NULL);
+ return 1;
+}
+
+static const luaL_Reg base_funcs[] = {
+ {"assert", luaB_assert},
+ {"collectgarbage", luaB_collectgarbage},
+ {"error", luaB_error},
+ {"getmetatable", luaB_getmetatable},
+ {"ipairs", luaB_ipairs},
+#if defined(LUA_COMPAT_LOADSTRING)
+ {"loadstring", luaB_load},
+#endif
+ {"next", luaB_next},
+ {"pairs", luaB_pairs},
+ {"rawequal", luaB_rawequal},
+ {"rawlen", luaB_rawlen},
+ {"rawget", luaB_rawget},
+ {"rawset", luaB_rawset},
+ {"select", luaB_select},
+ {"setmetatable", luaB_setmetatable},
+ {"tonumber", luaB_tonumber},
+ {"tostring", luaB_tostring},
+ {"type", luaB_type},
+ {NULL, NULL}
+};
+
+
+LUAMOD_API int luaopen_base (lua_State *L) {
+ /* set global _G */
+ lua_pushglobaltable(L);
+ lua_pushglobaltable(L);
+ lua_setfield(L, -2, "_G");
+ /* open lib into global table */
+ luaL_setfuncs(L, base_funcs, 0);
+ lua_pushliteral(L, LUA_VERSION);
+ lua_setfield(L, -2, "_VERSION"); /* set global _VERSION */
+ return 1;
+}
+
+#if defined(_KERNEL)
+
+EXPORT_SYMBOL(luaopen_base);
+
+#endif
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/lua/lcode.c b/sys/contrib/openzfs/module/lua/lcode.c
new file mode 100644
index 000000000000..ae9a3d91d810
--- /dev/null
+++ b/sys/contrib/openzfs/module/lua/lcode.c
@@ -0,0 +1,884 @@
+/* BEGIN CSTYLED */
+/*
+** $Id: lcode.c,v 2.62.1.1 2013/04/12 18:48:47 roberto Exp $
+** Code generator for Lua
+** See Copyright Notice in lua.h
+*/
+
+#define lcode_c
+#define LUA_CORE
+
+#include <sys/lua/lua.h>
+
+#include "lcode.h"
+#include "ldebug.h"
+#include "ldo.h"
+#include "lgc.h"
+#include "llex.h"
+#include "lmem.h"
+#include "lobject.h"
+#include "lopcodes.h"
+#include "lparser.h"
+#include "lstring.h"
+#include "ltable.h"
+#include "lvm.h"
+
+
+#define hasjumps(e) ((e)->t != (e)->f)
+
+
+static int isnumeral(expdesc *e) {
+ return (e->k == VKNUM && e->t == NO_JUMP && e->f == NO_JUMP);
+}
+
+
+void luaK_nil (FuncState *fs, int from, int n) {
+ Instruction *previous;
+ int l = from + n - 1; /* last register to set nil */
+ if (fs->pc > fs->lasttarget) { /* no jumps to current position? */
+ previous = &fs->f->code[fs->pc-1];
+ if (GET_OPCODE(*previous) == OP_LOADNIL) {
+ int pfrom = GETARG_A(*previous);
+ int pl = pfrom + GETARG_B(*previous);
+ if ((pfrom <= from && from <= pl + 1) ||
+ (from <= pfrom && pfrom <= l + 1)) { /* can connect both? */
+ if (pfrom < from) from = pfrom; /* from = min(from, pfrom) */
+ if (pl > l) l = pl; /* l = max(l, pl) */
+ SETARG_A(*previous, from);
+ SETARG_B(*previous, l - from);
+ return;
+ }
+ } /* else go through */
+ }
+ luaK_codeABC(fs, OP_LOADNIL, from, n - 1, 0); /* else no optimization */
+}
+
+
+int luaK_jump (FuncState *fs) {
+ int jpc = fs->jpc; /* save list of jumps to here */
+ int j;
+ fs->jpc = NO_JUMP;
+ j = luaK_codeAsBx(fs, OP_JMP, 0, NO_JUMP);
+ luaK_concat(fs, &j, jpc); /* keep them on hold */
+ return j;
+}
+
+
+void luaK_ret (FuncState *fs, int first, int nret) {
+ luaK_codeABC(fs, OP_RETURN, first, nret+1, 0);
+}
+
+
+static int condjump (FuncState *fs, OpCode op, int A, int B, int C) {
+ luaK_codeABC(fs, op, A, B, C);
+ return luaK_jump(fs);
+}
+
+
+static void fixjump (FuncState *fs, int pc, int dest) {
+ Instruction *jmp = &fs->f->code[pc];
+ int offset = dest-(pc+1);
+ lua_assert(dest != NO_JUMP);
+ if (abs(offset) > MAXARG_sBx)
+ luaX_syntaxerror(fs->ls, "control structure too long");
+ SETARG_sBx(*jmp, offset);
+}
+
+
+/*
+** returns current `pc' and marks it as a jump target (to avoid wrong
+** optimizations with consecutive instructions not in the same basic block).
+*/
+int luaK_getlabel (FuncState *fs) {
+ fs->lasttarget = fs->pc;
+ return fs->pc;
+}
+
+
+static int getjump (FuncState *fs, int pc) {
+ int offset = GETARG_sBx(fs->f->code[pc]);
+ if (offset == NO_JUMP) /* point to itself represents end of list */
+ return NO_JUMP; /* end of list */
+ else
+ return (pc+1)+offset; /* turn offset into absolute position */
+}
+
+
+static Instruction *getjumpcontrol (FuncState *fs, int pc) {
+ Instruction *pi = &fs->f->code[pc];
+ if (pc >= 1 && testTMode(GET_OPCODE(*(pi-1))))
+ return pi-1;
+ else
+ return pi;
+}
+
+
+/*
+** check whether list has any jump that does not produce a value
+** (or produces an inverted value)
+*/
+static int need_value (FuncState *fs, int list) {
+ for (; list != NO_JUMP; list = getjump(fs, list)) {
+ Instruction i = *getjumpcontrol(fs, list);
+ if (GET_OPCODE(i) != OP_TESTSET) return 1;
+ }
+ return 0; /* not found */
+}
+
+
+static int patchtestreg (FuncState *fs, int node, int reg) {
+ Instruction *i = getjumpcontrol(fs, node);
+ if (GET_OPCODE(*i) != OP_TESTSET)
+ return 0; /* cannot patch other instructions */
+ if (reg != NO_REG && reg != GETARG_B(*i))
+ SETARG_A(*i, reg);
+ else /* no register to put value or register already has the value */
+ *i = CREATE_ABC(OP_TEST, GETARG_B(*i), 0, GETARG_C(*i));
+
+ return 1;
+}
+
+
+static void removevalues (FuncState *fs, int list) {
+ for (; list != NO_JUMP; list = getjump(fs, list))
+ patchtestreg(fs, list, NO_REG);
+}
+
+
+static void patchlistaux (FuncState *fs, int list, int vtarget, int reg,
+ int dtarget) {
+ while (list != NO_JUMP) {
+ int next = getjump(fs, list);
+ if (patchtestreg(fs, list, reg))
+ fixjump(fs, list, vtarget);
+ else
+ fixjump(fs, list, dtarget); /* jump to default target */
+ list = next;
+ }
+}
+
+
+static void dischargejpc (FuncState *fs) {
+ patchlistaux(fs, fs->jpc, fs->pc, NO_REG, fs->pc);
+ fs->jpc = NO_JUMP;
+}
+
+
+void luaK_patchlist (FuncState *fs, int list, int target) {
+ if (target == fs->pc)
+ luaK_patchtohere(fs, list);
+ else {
+ lua_assert(target < fs->pc);
+ patchlistaux(fs, list, target, NO_REG, target);
+ }
+}
+
+
+LUAI_FUNC void luaK_patchclose (FuncState *fs, int list, int level) {
+ level++; /* argument is +1 to reserve 0 as non-op */
+ while (list != NO_JUMP) {
+ int next = getjump(fs, list);
+ lua_assert(GET_OPCODE(fs->f->code[list]) == OP_JMP &&
+ (GETARG_A(fs->f->code[list]) == 0 ||
+ GETARG_A(fs->f->code[list]) >= level));
+ SETARG_A(fs->f->code[list], level);
+ list = next;
+ }
+}
+
+
+void luaK_patchtohere (FuncState *fs, int list) {
+ luaK_getlabel(fs);
+ luaK_concat(fs, &fs->jpc, list);
+}
+
+
+void luaK_concat (FuncState *fs, int *l1, int l2) {
+ if (l2 == NO_JUMP) return;
+ else if (*l1 == NO_JUMP)
+ *l1 = l2;
+ else {
+ int list = *l1;
+ int next;
+ while ((next = getjump(fs, list)) != NO_JUMP) /* find last element */
+ list = next;
+ fixjump(fs, list, l2);
+ }
+}
+
+
+static int luaK_code (FuncState *fs, Instruction i) {
+ Proto *f = fs->f;
+ dischargejpc(fs); /* `pc' will change */
+ /* put new instruction in code array */
+ luaM_growvector(fs->ls->L, f->code, fs->pc, f->sizecode, Instruction,
+ MAX_INT, "opcodes");
+ f->code[fs->pc] = i;
+ /* save corresponding line information */
+ luaM_growvector(fs->ls->L, f->lineinfo, fs->pc, f->sizelineinfo, int,
+ MAX_INT, "opcodes");
+ f->lineinfo[fs->pc] = fs->ls->lastline;
+ return fs->pc++;
+}
+
+
+int luaK_codeABC (FuncState *fs, OpCode o, int a, int b, int c) {
+ lua_assert(getOpMode(o) == iABC);
+ lua_assert(getBMode(o) != OpArgN || b == 0);
+ lua_assert(getCMode(o) != OpArgN || c == 0);
+ lua_assert(a <= MAXARG_A && b <= MAXARG_B && c <= MAXARG_C);
+ return luaK_code(fs, CREATE_ABC(o, a, b, c));
+}
+
+
+int luaK_codeABx (FuncState *fs, OpCode o, int a, unsigned int bc) {
+ lua_assert(getOpMode(o) == iABx || getOpMode(o) == iAsBx);
+ lua_assert(getCMode(o) == OpArgN);
+ lua_assert(a <= MAXARG_A && bc <= MAXARG_Bx);
+ return luaK_code(fs, CREATE_ABx(o, a, bc));
+}
+
+
+static int codeextraarg (FuncState *fs, int a) {
+ lua_assert(a <= MAXARG_Ax);
+ return luaK_code(fs, CREATE_Ax(OP_EXTRAARG, a));
+}
+
+
+int luaK_codek (FuncState *fs, int reg, int k) {
+ if (k <= MAXARG_Bx)
+ return luaK_codeABx(fs, OP_LOADK, reg, k);
+ else {
+ int p = luaK_codeABx(fs, OP_LOADKX, reg, 0);
+ codeextraarg(fs, k);
+ return p;
+ }
+}
+
+
+void luaK_checkstack (FuncState *fs, int n) {
+ int newstack = fs->freereg + n;
+ if (newstack > fs->f->maxstacksize) {
+ if (newstack >= MAXSTACK)
+ luaX_syntaxerror(fs->ls, "function or expression too complex");
+ fs->f->maxstacksize = cast_byte(newstack);
+ }
+}
+
+
+void luaK_reserveregs (FuncState *fs, int n) {
+ luaK_checkstack(fs, n);
+ fs->freereg += n;
+}
+
+
+static void freereg (FuncState *fs, int reg) {
+ if (!ISK(reg) && reg >= fs->nactvar) {
+ fs->freereg--;
+ lua_assert(reg == fs->freereg);
+ }
+}
+
+
+static void freeexp (FuncState *fs, expdesc *e) {
+ if (e->k == VNONRELOC)
+ freereg(fs, e->u.info);
+}
+
+
+static int addk (FuncState *fs, TValue *key, TValue *v) {
+ lua_State *L = fs->ls->L;
+ TValue *idx = luaH_set(L, fs->h, key);
+ Proto *f = fs->f;
+ int k, oldsize;
+ if (ttisnumber(idx)) {
+ lua_Number n = nvalue(idx);
+ lua_number2int(k, n);
+ if (luaV_rawequalobj(&f->k[k], v))
+ return k;
+ /* else may be a collision (e.g., between 0.0 and "\0\0\0\0\0\0\0\0");
+ go through and create a new entry for this value */
+ }
+ /* constant not found; create a new entry */
+ oldsize = f->sizek;
+ k = fs->nk;
+ /* numerical value does not need GC barrier;
+ table has no metatable, so it does not need to invalidate cache */
+ setnvalue(idx, cast_num(k));
+ luaM_growvector(L, f->k, k, f->sizek, TValue, MAXARG_Ax, "constants");
+ while (oldsize < f->sizek) setnilvalue(&f->k[oldsize++]);
+ setobj(L, &f->k[k], v);
+ fs->nk++;
+ luaC_barrier(L, f, v);
+ return k;
+}
+
+
+int luaK_stringK (FuncState *fs, TString *s) {
+ TValue o;
+ setsvalue(fs->ls->L, &o, s);
+ return addk(fs, &o, &o);
+}
+
+
+int luaK_numberK (FuncState *fs, lua_Number r) {
+ int n;
+ lua_State *L = fs->ls->L;
+ TValue o;
+ setnvalue(&o, r);
+ if (r == 0 || luai_numisnan(NULL, r)) { /* handle -0 and NaN */
+ /* use raw representation as key to avoid numeric problems */
+ setsvalue(L, L->top++, luaS_newlstr(L, (char *)&r, sizeof(r)));
+ n = addk(fs, L->top - 1, &o);
+ L->top--;
+ }
+ else
+ n = addk(fs, &o, &o); /* regular case */
+ return n;
+}
+
+
+static int boolK (FuncState *fs, int b) {
+ TValue o;
+ setbvalue(&o, b);
+ return addk(fs, &o, &o);
+}
+
+
+static int nilK (FuncState *fs) {
+ TValue k, v;
+ setnilvalue(&v);
+ /* cannot use nil as key; instead use table itself to represent nil */
+ sethvalue(fs->ls->L, &k, fs->h);
+ return addk(fs, &k, &v);
+}
+
+
+void luaK_setreturns (FuncState *fs, expdesc *e, int nresults) {
+ if (e->k == VCALL) { /* expression is an open function call? */
+ SETARG_C(getcode(fs, e), nresults+1);
+ }
+ else if (e->k == VVARARG) {
+ SETARG_B(getcode(fs, e), nresults+1);
+ SETARG_A(getcode(fs, e), fs->freereg);
+ luaK_reserveregs(fs, 1);
+ }
+}
+
+
+void luaK_setoneret (FuncState *fs, expdesc *e) {
+ if (e->k == VCALL) { /* expression is an open function call? */
+ e->k = VNONRELOC;
+ e->u.info = GETARG_A(getcode(fs, e));
+ }
+ else if (e->k == VVARARG) {
+ SETARG_B(getcode(fs, e), 2);
+ e->k = VRELOCABLE; /* can relocate its simple result */
+ }
+}
+
+
+void luaK_dischargevars (FuncState *fs, expdesc *e) {
+ switch (e->k) {
+ case VLOCAL: {
+ e->k = VNONRELOC;
+ break;
+ }
+ case VUPVAL: {
+ e->u.info = luaK_codeABC(fs, OP_GETUPVAL, 0, e->u.info, 0);
+ e->k = VRELOCABLE;
+ break;
+ }
+ case VINDEXED: {
+ OpCode op = OP_GETTABUP; /* assume 't' is in an upvalue */
+ freereg(fs, e->u.ind.idx);
+ if (e->u.ind.vt == VLOCAL) { /* 't' is in a register? */
+ freereg(fs, e->u.ind.t);
+ op = OP_GETTABLE;
+ }
+ e->u.info = luaK_codeABC(fs, op, 0, e->u.ind.t, e->u.ind.idx);
+ e->k = VRELOCABLE;
+ break;
+ }
+ case VVARARG:
+ case VCALL: {
+ luaK_setoneret(fs, e);
+ break;
+ }
+ default: break; /* there is one value available (somewhere) */
+ }
+}
+
+
+static int code_label (FuncState *fs, int A, int b, int jump) {
+ luaK_getlabel(fs); /* those instructions may be jump targets */
+ return luaK_codeABC(fs, OP_LOADBOOL, A, b, jump);
+}
+
+
+static void discharge2reg (FuncState *fs, expdesc *e, int reg) {
+ luaK_dischargevars(fs, e);
+ switch (e->k) {
+ case VNIL: {
+ luaK_nil(fs, reg, 1);
+ break;
+ }
+ case VFALSE: case VTRUE: {
+ luaK_codeABC(fs, OP_LOADBOOL, reg, e->k == VTRUE, 0);
+ break;
+ }
+ case VK: {
+ luaK_codek(fs, reg, e->u.info);
+ break;
+ }
+ case VKNUM: {
+ luaK_codek(fs, reg, luaK_numberK(fs, e->u.nval));
+ break;
+ }
+ case VRELOCABLE: {
+ Instruction *pc = &getcode(fs, e);
+ SETARG_A(*pc, reg);
+ break;
+ }
+ case VNONRELOC: {
+ if (reg != e->u.info)
+ luaK_codeABC(fs, OP_MOVE, reg, e->u.info, 0);
+ break;
+ }
+ default: {
+ lua_assert(e->k == VVOID || e->k == VJMP);
+ return; /* nothing to do... */
+ }
+ }
+ e->u.info = reg;
+ e->k = VNONRELOC;
+}
+
+
+static void discharge2anyreg (FuncState *fs, expdesc *e) {
+ if (e->k != VNONRELOC) {
+ luaK_reserveregs(fs, 1);
+ discharge2reg(fs, e, fs->freereg-1);
+ }
+}
+
+
+static void exp2reg (FuncState *fs, expdesc *e, int reg) {
+ discharge2reg(fs, e, reg);
+ if (e->k == VJMP)
+ luaK_concat(fs, &e->t, e->u.info); /* put this jump in `t' list */
+ if (hasjumps(e)) {
+ int final; /* position after whole expression */
+ int p_f = NO_JUMP; /* position of an eventual LOAD false */
+ int p_t = NO_JUMP; /* position of an eventual LOAD true */
+ if (need_value(fs, e->t) || need_value(fs, e->f)) {
+ int fj = (e->k == VJMP) ? NO_JUMP : luaK_jump(fs);
+ p_f = code_label(fs, reg, 0, 1);
+ p_t = code_label(fs, reg, 1, 0);
+ luaK_patchtohere(fs, fj);
+ }
+ final = luaK_getlabel(fs);
+ patchlistaux(fs, e->f, final, reg, p_f);
+ patchlistaux(fs, e->t, final, reg, p_t);
+ }
+ e->f = e->t = NO_JUMP;
+ e->u.info = reg;
+ e->k = VNONRELOC;
+}
+
+
+void luaK_exp2nextreg (FuncState *fs, expdesc *e) {
+ luaK_dischargevars(fs, e);
+ freeexp(fs, e);
+ luaK_reserveregs(fs, 1);
+ exp2reg(fs, e, fs->freereg - 1);
+}
+
+
+int luaK_exp2anyreg (FuncState *fs, expdesc *e) {
+ luaK_dischargevars(fs, e);
+ if (e->k == VNONRELOC) {
+ if (!hasjumps(e)) return e->u.info; /* exp is already in a register */
+ if (e->u.info >= fs->nactvar) { /* reg. is not a local? */
+ exp2reg(fs, e, e->u.info); /* put value on it */
+ return e->u.info;
+ }
+ }
+ luaK_exp2nextreg(fs, e); /* default */
+ return e->u.info;
+}
+
+
+void luaK_exp2anyregup (FuncState *fs, expdesc *e) {
+ if (e->k != VUPVAL || hasjumps(e))
+ luaK_exp2anyreg(fs, e);
+}
+
+
+void luaK_exp2val (FuncState *fs, expdesc *e) {
+ if (hasjumps(e))
+ luaK_exp2anyreg(fs, e);
+ else
+ luaK_dischargevars(fs, e);
+}
+
+
+int luaK_exp2RK (FuncState *fs, expdesc *e) {
+ luaK_exp2val(fs, e);
+ switch (e->k) {
+ case VTRUE:
+ case VFALSE:
+ case VNIL: {
+ if (fs->nk <= MAXINDEXRK) { /* constant fits in RK operand? */
+ e->u.info = (e->k == VNIL) ? nilK(fs) : boolK(fs, (e->k == VTRUE));
+ e->k = VK;
+ return RKASK(e->u.info);
+ }
+ else break;
+ }
+ case VKNUM: {
+ e->u.info = luaK_numberK(fs, e->u.nval);
+ e->k = VK;
+ /* go through */
+ }
+ case VK: {
+ if (e->u.info <= MAXINDEXRK) /* constant fits in argC? */
+ return RKASK(e->u.info);
+ else break;
+ }
+ default: break;
+ }
+ /* not a constant in the right range: put it in a register */
+ return luaK_exp2anyreg(fs, e);
+}
+
+
+void luaK_storevar (FuncState *fs, expdesc *var, expdesc *ex) {
+ switch (var->k) {
+ case VLOCAL: {
+ freeexp(fs, ex);
+ exp2reg(fs, ex, var->u.info);
+ return;
+ }
+ case VUPVAL: {
+ int e = luaK_exp2anyreg(fs, ex);
+ luaK_codeABC(fs, OP_SETUPVAL, e, var->u.info, 0);
+ break;
+ }
+ case VINDEXED: {
+ OpCode op = (var->u.ind.vt == VLOCAL) ? OP_SETTABLE : OP_SETTABUP;
+ int e = luaK_exp2RK(fs, ex);
+ luaK_codeABC(fs, op, var->u.ind.t, var->u.ind.idx, e);
+ break;
+ }
+ default: {
+ lua_assert(0); /* invalid var kind to store */
+ break;
+ }
+ }
+ freeexp(fs, ex);
+}
+
+
+void luaK_self (FuncState *fs, expdesc *e, expdesc *key) {
+ int ereg;
+ luaK_exp2anyreg(fs, e);
+ ereg = e->u.info; /* register where 'e' was placed */
+ freeexp(fs, e);
+ e->u.info = fs->freereg; /* base register for op_self */
+ e->k = VNONRELOC;
+ luaK_reserveregs(fs, 2); /* function and 'self' produced by op_self */
+ luaK_codeABC(fs, OP_SELF, e->u.info, ereg, luaK_exp2RK(fs, key));
+ freeexp(fs, key);
+}
+
+
+static void invertjump (FuncState *fs, expdesc *e) {
+ Instruction *pc = getjumpcontrol(fs, e->u.info);
+ lua_assert(testTMode(GET_OPCODE(*pc)) && GET_OPCODE(*pc) != OP_TESTSET &&
+ GET_OPCODE(*pc) != OP_TEST);
+ SETARG_A(*pc, !(GETARG_A(*pc)));
+}
+
+
+static int jumponcond (FuncState *fs, expdesc *e, int cond) {
+ if (e->k == VRELOCABLE) {
+ Instruction ie = getcode(fs, e);
+ if (GET_OPCODE(ie) == OP_NOT) {
+ fs->pc--; /* remove previous OP_NOT */
+ return condjump(fs, OP_TEST, GETARG_B(ie), 0, !cond);
+ }
+ /* else go through */
+ }
+ discharge2anyreg(fs, e);
+ freeexp(fs, e);
+ return condjump(fs, OP_TESTSET, NO_REG, e->u.info, cond);
+}
+
+
+void luaK_goiftrue (FuncState *fs, expdesc *e) {
+ int pc; /* pc of last jump */
+ luaK_dischargevars(fs, e);
+ switch (e->k) {
+ case VJMP: {
+ invertjump(fs, e);
+ pc = e->u.info;
+ break;
+ }
+ case VK: case VKNUM: case VTRUE: {
+ pc = NO_JUMP; /* always true; do nothing */
+ break;
+ }
+ default: {
+ pc = jumponcond(fs, e, 0);
+ break;
+ }
+ }
+ luaK_concat(fs, &e->f, pc); /* insert last jump in `f' list */
+ luaK_patchtohere(fs, e->t);
+ e->t = NO_JUMP;
+}
+
+
+void luaK_goiffalse (FuncState *fs, expdesc *e) {
+ int pc; /* pc of last jump */
+ luaK_dischargevars(fs, e);
+ switch (e->k) {
+ case VJMP: {
+ pc = e->u.info;
+ break;
+ }
+ case VNIL: case VFALSE: {
+ pc = NO_JUMP; /* always false; do nothing */
+ break;
+ }
+ default: {
+ pc = jumponcond(fs, e, 1);
+ break;
+ }
+ }
+ luaK_concat(fs, &e->t, pc); /* insert last jump in `t' list */
+ luaK_patchtohere(fs, e->f);
+ e->f = NO_JUMP;
+}
+
+
+static void codenot (FuncState *fs, expdesc *e) {
+ luaK_dischargevars(fs, e);
+ switch (e->k) {
+ case VNIL: case VFALSE: {
+ e->k = VTRUE;
+ break;
+ }
+ case VK: case VKNUM: case VTRUE: {
+ e->k = VFALSE;
+ break;
+ }
+ case VJMP: {
+ invertjump(fs, e);
+ break;
+ }
+ case VRELOCABLE:
+ case VNONRELOC: {
+ discharge2anyreg(fs, e);
+ freeexp(fs, e);
+ e->u.info = luaK_codeABC(fs, OP_NOT, 0, e->u.info, 0);
+ e->k = VRELOCABLE;
+ break;
+ }
+ default: {
+ lua_assert(0); /* cannot happen */
+ break;
+ }
+ }
+ /* interchange true and false lists */
+ { int temp = e->f; e->f = e->t; e->t = temp; }
+ removevalues(fs, e->f);
+ removevalues(fs, e->t);
+}
+
+
+void luaK_indexed (FuncState *fs, expdesc *t, expdesc *k) {
+ lua_assert(!hasjumps(t));
+ t->u.ind.t = t->u.info;
+ t->u.ind.idx = luaK_exp2RK(fs, k);
+ t->u.ind.vt = (t->k == VUPVAL) ? VUPVAL
+ : check_exp(vkisinreg(t->k), VLOCAL);
+ t->k = VINDEXED;
+}
+
+
+static int constfolding (OpCode op, expdesc *e1, expdesc *e2) {
+ lua_Number r;
+ if (!isnumeral(e1) || !isnumeral(e2)) return 0;
+ if ((op == OP_DIV || op == OP_MOD) && e2->u.nval == 0)
+ return 0; /* do not attempt to divide by 0 */
+ /*
+ * Patched: check for MIN_INT / -1
+ */
+ if (op == OP_DIV && e1->u.nval == INT64_MIN && e2->u.nval == -1)
+ return 0;
+ r = luaO_arith(op - OP_ADD + LUA_OPADD, e1->u.nval, e2->u.nval);
+ e1->u.nval = r;
+ return 1;
+}
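Editor's note: the "Patched" guard above exists because folding a constant MIN_INT / -1 at compile time would perform the one signed 64-bit division that overflows: the mathematical result, 2^63, is not representable in int64_t, which is undefined behaviour in C and typically traps on x86 (SIGFPE in user space). A minimal sketch of the same guard an embedder might apply before any signed 64-bit division (names invented):

  #include <stdint.h>

  /* Divide a by b, refusing the two cases that are undefined for int64_t:
   * division by zero and INT64_MIN / -1. Returns 0 on rejection. */
  static int safe_div64(int64_t a, int64_t b, int64_t *out) {
    if (b == 0 || (a == INT64_MIN && b == -1))
      return 0;                 /* caller must handle the error */
    *out = a / b;
    return 1;
  }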
+
+
+static void codearith (FuncState *fs, OpCode op,
+ expdesc *e1, expdesc *e2, int line) {
+ if (constfolding(op, e1, e2))
+ return;
+ else {
+ int o2 = (op != OP_UNM && op != OP_LEN) ? luaK_exp2RK(fs, e2) : 0;
+ int o1 = luaK_exp2RK(fs, e1);
+ if (o1 > o2) {
+ freeexp(fs, e1);
+ freeexp(fs, e2);
+ }
+ else {
+ freeexp(fs, e2);
+ freeexp(fs, e1);
+ }
+ e1->u.info = luaK_codeABC(fs, op, 0, o1, o2);
+ e1->k = VRELOCABLE;
+ luaK_fixline(fs, line);
+ }
+}
+
+
+static void codecomp (FuncState *fs, OpCode op, int cond, expdesc *e1,
+ expdesc *e2) {
+ int o1 = luaK_exp2RK(fs, e1);
+ int o2 = luaK_exp2RK(fs, e2);
+ freeexp(fs, e2);
+ freeexp(fs, e1);
+ if (cond == 0 && op != OP_EQ) {
+ int temp; /* exchange args to replace by `<' or `<=' */
+ temp = o1; o1 = o2; o2 = temp; /* o1 <==> o2 */
+ cond = 1;
+ }
+ e1->u.info = condjump(fs, op, cond, o1, o2);
+ e1->k = VJMP;
+}
+
+
+void luaK_prefix (FuncState *fs, UnOpr op, expdesc *e, int line) {
+ expdesc e2;
+ e2.t = e2.f = NO_JUMP; e2.k = VKNUM; e2.u.nval = 0;
+ switch (op) {
+ case OPR_MINUS: {
+ if (isnumeral(e)) /* minus constant? */
+ e->u.nval = luai_numunm(NULL, e->u.nval); /* fold it */
+ else {
+ luaK_exp2anyreg(fs, e);
+ codearith(fs, OP_UNM, e, &e2, line);
+ }
+ break;
+ }
+ case OPR_NOT: codenot(fs, e); break;
+ case OPR_LEN: {
+ luaK_exp2anyreg(fs, e); /* cannot operate on constants */
+ codearith(fs, OP_LEN, e, &e2, line);
+ break;
+ }
+ default: lua_assert(0);
+ }
+}
+
+
+void luaK_infix (FuncState *fs, BinOpr op, expdesc *v) {
+ switch (op) {
+ case OPR_AND: {
+ luaK_goiftrue(fs, v);
+ break;
+ }
+ case OPR_OR: {
+ luaK_goiffalse(fs, v);
+ break;
+ }
+ case OPR_CONCAT: {
+ luaK_exp2nextreg(fs, v); /* operand must be on the `stack' */
+ break;
+ }
+ case OPR_ADD: case OPR_SUB: case OPR_MUL: case OPR_DIV:
+ case OPR_MOD: case OPR_POW: {
+ if (!isnumeral(v)) luaK_exp2RK(fs, v);
+ break;
+ }
+ default: {
+ luaK_exp2RK(fs, v);
+ break;
+ }
+ }
+}
+
+
+void luaK_posfix (FuncState *fs, BinOpr op,
+ expdesc *e1, expdesc *e2, int line) {
+ switch (op) {
+ case OPR_AND: {
+ lua_assert(e1->t == NO_JUMP); /* list must be closed */
+ luaK_dischargevars(fs, e2);
+ luaK_concat(fs, &e2->f, e1->f);
+ *e1 = *e2;
+ break;
+ }
+ case OPR_OR: {
+ lua_assert(e1->f == NO_JUMP); /* list must be closed */
+ luaK_dischargevars(fs, e2);
+ luaK_concat(fs, &e2->t, e1->t);
+ *e1 = *e2;
+ break;
+ }
+ case OPR_CONCAT: {
+ luaK_exp2val(fs, e2);
+ if (e2->k == VRELOCABLE && GET_OPCODE(getcode(fs, e2)) == OP_CONCAT) {
+ lua_assert(e1->u.info == GETARG_B(getcode(fs, e2))-1);
+ freeexp(fs, e1);
+ SETARG_B(getcode(fs, e2), e1->u.info);
+ e1->k = VRELOCABLE; e1->u.info = e2->u.info;
+ }
+ else {
+ luaK_exp2nextreg(fs, e2); /* operand must be on the 'stack' */
+ codearith(fs, OP_CONCAT, e1, e2, line);
+ }
+ break;
+ }
+ case OPR_ADD: case OPR_SUB: case OPR_MUL: case OPR_DIV:
+ case OPR_MOD: case OPR_POW: {
+ codearith(fs, cast(OpCode, op - OPR_ADD + OP_ADD), e1, e2, line);
+ break;
+ }
+ case OPR_EQ: case OPR_LT: case OPR_LE: {
+ codecomp(fs, cast(OpCode, op - OPR_EQ + OP_EQ), 1, e1, e2);
+ break;
+ }
+ case OPR_NE: case OPR_GT: case OPR_GE: {
+ codecomp(fs, cast(OpCode, op - OPR_NE + OP_EQ), 0, e1, e2);
+ break;
+ }
+ default: lua_assert(0);
+ }
+}
+
+
+void luaK_fixline (FuncState *fs, int line) {
+ fs->f->lineinfo[fs->pc - 1] = line;
+}
+
+
+void luaK_setlist (FuncState *fs, int base, int nelems, int tostore) {
+ int c = (nelems - 1)/LFIELDS_PER_FLUSH + 1;
+ int b = (tostore == LUA_MULTRET) ? 0 : tostore;
+ lua_assert(tostore != 0);
+ if (c <= MAXARG_C)
+ luaK_codeABC(fs, OP_SETLIST, base, b, c);
+ else if (c <= MAXARG_Ax) {
+ luaK_codeABC(fs, OP_SETLIST, base, b, 0);
+ codeextraarg(fs, c);
+ }
+ else
+ luaX_syntaxerror(fs->ls, "constructor too long");
+ fs->freereg = base + 1; /* free registers with list values */
+}
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/lua/lcode.h b/sys/contrib/openzfs/module/lua/lcode.h
new file mode 100644
index 000000000000..fd5fad00df3d
--- /dev/null
+++ b/sys/contrib/openzfs/module/lua/lcode.h
@@ -0,0 +1,85 @@
+/* BEGIN CSTYLED */
+/*
+** $Id: lcode.h,v 1.58.1.1 2013/04/12 18:48:47 roberto Exp $
+** Code generator for Lua
+** See Copyright Notice in lua.h
+*/
+
+#ifndef lcode_h
+#define lcode_h
+
+#include "llex.h"
+#include "lobject.h"
+#include "lopcodes.h"
+#include "lparser.h"
+
+
+/*
+** Marks the end of a patch list. It is an invalid value both as an absolute
+** address, and as a list link (would link an element to itself).
+*/
+#define NO_JUMP (-1)
+
+
+/*
+** grep "ORDER OPR" if you change these enums (ORDER OP)
+*/
+typedef enum BinOpr {
+ OPR_ADD, OPR_SUB, OPR_MUL, OPR_DIV, OPR_MOD, OPR_POW,
+ OPR_CONCAT,
+ OPR_EQ, OPR_LT, OPR_LE,
+ OPR_NE, OPR_GT, OPR_GE,
+ OPR_AND, OPR_OR,
+ OPR_NOBINOPR
+} BinOpr;
+
+
+typedef enum UnOpr { OPR_MINUS, OPR_NOT, OPR_LEN, OPR_NOUNOPR } UnOpr;
+
+
+#define getcode(fs,e) ((fs)->f->code[(e)->u.info])
+
+#define luaK_codeAsBx(fs,o,A,sBx) luaK_codeABx(fs,o,A,(sBx)+MAXARG_sBx)
+
+#define luaK_setmultret(fs,e) luaK_setreturns(fs, e, LUA_MULTRET)
+
+#define luaK_jumpto(fs,t) luaK_patchlist(fs, luaK_jump(fs), t)
+
+LUAI_FUNC int luaK_codeABx (FuncState *fs, OpCode o, int A, unsigned int Bx);
+LUAI_FUNC int luaK_codeABC (FuncState *fs, OpCode o, int A, int B, int C);
+LUAI_FUNC int luaK_codek (FuncState *fs, int reg, int k);
+LUAI_FUNC void luaK_fixline (FuncState *fs, int line);
+LUAI_FUNC void luaK_nil (FuncState *fs, int from, int n);
+LUAI_FUNC void luaK_reserveregs (FuncState *fs, int n);
+LUAI_FUNC void luaK_checkstack (FuncState *fs, int n);
+LUAI_FUNC int luaK_stringK (FuncState *fs, TString *s);
+LUAI_FUNC int luaK_numberK (FuncState *fs, lua_Number r);
+LUAI_FUNC void luaK_dischargevars (FuncState *fs, expdesc *e);
+LUAI_FUNC int luaK_exp2anyreg (FuncState *fs, expdesc *e);
+LUAI_FUNC void luaK_exp2anyregup (FuncState *fs, expdesc *e);
+LUAI_FUNC void luaK_exp2nextreg (FuncState *fs, expdesc *e);
+LUAI_FUNC void luaK_exp2val (FuncState *fs, expdesc *e);
+LUAI_FUNC int luaK_exp2RK (FuncState *fs, expdesc *e);
+LUAI_FUNC void luaK_self (FuncState *fs, expdesc *e, expdesc *key);
+LUAI_FUNC void luaK_indexed (FuncState *fs, expdesc *t, expdesc *k);
+LUAI_FUNC void luaK_goiftrue (FuncState *fs, expdesc *e);
+LUAI_FUNC void luaK_goiffalse (FuncState *fs, expdesc *e);
+LUAI_FUNC void luaK_storevar (FuncState *fs, expdesc *var, expdesc *e);
+LUAI_FUNC void luaK_setreturns (FuncState *fs, expdesc *e, int nresults);
+LUAI_FUNC void luaK_setoneret (FuncState *fs, expdesc *e);
+LUAI_FUNC int luaK_jump (FuncState *fs);
+LUAI_FUNC void luaK_ret (FuncState *fs, int first, int nret);
+LUAI_FUNC void luaK_patchlist (FuncState *fs, int list, int target);
+LUAI_FUNC void luaK_patchtohere (FuncState *fs, int list);
+LUAI_FUNC void luaK_patchclose (FuncState *fs, int list, int level);
+LUAI_FUNC void luaK_concat (FuncState *fs, int *l1, int l2);
+LUAI_FUNC int luaK_getlabel (FuncState *fs);
+LUAI_FUNC void luaK_prefix (FuncState *fs, UnOpr op, expdesc *v, int line);
+LUAI_FUNC void luaK_infix (FuncState *fs, BinOpr op, expdesc *v);
+LUAI_FUNC void luaK_posfix (FuncState *fs, BinOpr op, expdesc *v1,
+ expdesc *v2, int line);
+LUAI_FUNC void luaK_setlist (FuncState *fs, int base, int nelems, int tostore);
+
+
+#endif
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/lua/lcompat.c b/sys/contrib/openzfs/module/lua/lcompat.c
new file mode 100644
index 000000000000..c0a27182c7d8
--- /dev/null
+++ b/sys/contrib/openzfs/module/lua/lcompat.c
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2016 by Delphix. All rights reserved.
+ */
+
+#include <sys/lua/lua.h>
+
+
+ssize_t
+lcompat_sprintf(char *buf, size_t size, const char *fmt, ...)
+{
+ ssize_t res;
+ va_list args;
+
+ va_start(args, fmt);
+ res = vsnprintf(buf, size, fmt, args);
+ va_end(args);
+
+ return (res);
+}
+
+int64_t
+lcompat_strtoll(const char *str, char **ptr)
+{
+ int base;
+ const char *cp;
+ int digits;
+ int64_t value;
+ boolean_t is_negative;
+
+ cp = str;
+ while (*cp == ' ' || *cp == '\t' || *cp == '\n') {
+ cp++;
+ }
+ is_negative = (*cp == '-');
+ if (is_negative) {
+ cp++;
+ }
+ base = 10;
+
+ if (*cp == '0') {
+ base = 8;
+ cp++;
+ if (*cp == 'x' || *cp == 'X') {
+ base = 16;
+ cp++;
+ }
+ }
+
+ value = 0;
+ for (; *cp != '\0'; cp++) {
+ if (*cp >= '0' && *cp <= '9') {
+ digits = *cp - '0';
+ } else if (*cp >= 'a' && *cp <= 'f') {
+ digits = *cp - 'a' + 10;
+ } else if (*cp >= 'A' && *cp <= 'F') {
+ digits = *cp - 'A' + 10;
+ } else {
+ break;
+ }
+ if (digits >= base) {
+ break;
+ }
+ value = (value * base) + digits;
+ }
+
+ if (ptr != NULL) {
+ *ptr = (char *)cp;
+ }
+ if (is_negative) {
+ value = -value;
+ }
+ return (value);
+}
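Editor's note: lcompat_strtoll is a freestanding strtoll replacement with C-style prefix detection: a leading "0" selects octal and "0x"/"0X" selects hexadecimal, otherwise decimal; parsing stops at the first character that is not a valid digit for the chosen base. A few hypothetical sanity checks illustrating that behaviour (the extern declaration simply mirrors the definition above):

  #include <assert.h>
  #include <stdint.h>

  extern int64_t lcompat_strtoll(const char *str, char **ptr);

  static void strtoll_examples(void) {
    assert(lcompat_strtoll("42", NULL) == 42);      /* decimal */
    assert(lcompat_strtoll("052", NULL) == 42);     /* leading 0: octal */
    assert(lcompat_strtoll("0x2a", NULL) == 42);    /* 0x prefix: hexadecimal */
    assert(lcompat_strtoll("-0x10", NULL) == -16);  /* leading '-' handled */
    assert(lcompat_strtoll("42abc", NULL) == 42);   /* stops at first bad digit */
  }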
+
+int64_t
+lcompat_pow(int64_t x, int64_t y)
+{
+ int64_t result = 1;
+ if (y < 0)
+ return (0);
+
+ while (y) {
+ if (y & 1)
+ result *= x;
+ y >>= 1;
+ x *= x;
+ }
+ return (result);
+}
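Editor's note: lcompat_pow implements exponentiation by squaring over int64_t: the exponent is consumed bit by bit, the base is squared every round, and the running result is multiplied in only when the current bit is set (a negative exponent yields 0, and overflow silently wraps). Worked trace for lcompat_pow(3, 5), exponent 5 = binary 101:

  y = 5 (bit set):   result = 1 * 3  = 3;    x = 3 * 3 = 9
  y = 2 (bit clear): result          = 3;    x = 9 * 9 = 81
  y = 1 (bit set):   result = 3 * 81 = 243;  x squared once more, unused
  y = 0: loop ends;  lcompat_pow(3, 5) == 243 == 3^5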
+
+int
+lcompat_hashnum(int64_t x)
+{
+ x = (~x) + (x << 18);
+ x = x ^ (x >> 31);
+ x = x * 21;
+ x = x ^ (x >> 11);
+ x = x + (x << 6);
+ x = x ^ (x >> 22);
+ return ((int)x);
+}
diff --git a/sys/contrib/openzfs/module/lua/lcorolib.c b/sys/contrib/openzfs/module/lua/lcorolib.c
new file mode 100644
index 000000000000..0300e7ee17d5
--- /dev/null
+++ b/sys/contrib/openzfs/module/lua/lcorolib.c
@@ -0,0 +1,159 @@
+/* BEGIN CSTYLED */
+/*
+** $Id: lcorolib.c,v 1.5.1.1 2013/04/12 18:48:47 roberto Exp $
+** Coroutine Library
+** See Copyright Notice in lua.h
+*/
+
+
+#define lcorolib_c
+#define LUA_LIB
+
+#include <sys/lua/lua.h>
+
+#include <sys/lua/lauxlib.h>
+#include <sys/lua/lualib.h>
+
+
+static int auxresume (lua_State *L, lua_State *co, int narg) {
+ int status;
+ if (!lua_checkstack(co, narg)) {
+ lua_pushliteral(L, "too many arguments to resume");
+ return -1; /* error flag */
+ }
+ if (lua_status(co) == LUA_OK && lua_gettop(co) == 0) {
+ lua_pushliteral(L, "cannot resume dead coroutine");
+ return -1; /* error flag */
+ }
+ lua_xmove(L, co, narg);
+ status = lua_resume(co, L, narg);
+ if (status == LUA_OK || status == LUA_YIELD) {
+ int nres = lua_gettop(co);
+ if (!lua_checkstack(L, nres + 1)) {
+ lua_pop(co, nres); /* remove results anyway */
+ lua_pushliteral(L, "too many results to resume");
+ return -1; /* error flag */
+ }
+ lua_xmove(co, L, nres); /* move yielded values */
+ return nres;
+ }
+ else {
+ lua_xmove(co, L, 1); /* move error message */
+ return -1; /* error flag */
+ }
+}
+
+
+static int luaB_coresume (lua_State *L) {
+ lua_State *co = lua_tothread(L, 1);
+ int r;
+ luaL_argcheck(L, co, 1, "coroutine expected");
+ r = auxresume(L, co, lua_gettop(L) - 1);
+ if (r < 0) {
+ lua_pushboolean(L, 0);
+ lua_insert(L, -2);
+ return 2; /* return false + error message */
+ }
+ else {
+ lua_pushboolean(L, 1);
+ lua_insert(L, -(r + 1));
+ return r + 1; /* return true + 'resume' returns */
+ }
+}
+
+
+static int luaB_auxwrap (lua_State *L) {
+ lua_State *co = lua_tothread(L, lua_upvalueindex(1));
+ int r = auxresume(L, co, lua_gettop(L));
+ if (r < 0) {
+ if (lua_isstring(L, -1)) { /* error object is a string? */
+ luaL_where(L, 1); /* add extra info */
+ lua_insert(L, -2);
+ lua_concat(L, 2);
+ }
+ return lua_error(L); /* propagate error */
+ }
+ return r;
+}
+
+
+static int luaB_cocreate (lua_State *L) {
+ lua_State *NL;
+ luaL_checktype(L, 1, LUA_TFUNCTION);
+ NL = lua_newthread(L);
+ lua_pushvalue(L, 1); /* move function to top */
+ lua_xmove(L, NL, 1); /* move function from L to NL */
+ return 1;
+}
+
+
+static int luaB_cowrap (lua_State *L) {
+ luaB_cocreate(L);
+ lua_pushcclosure(L, luaB_auxwrap, 1);
+ return 1;
+}
+
+
+static int luaB_yield (lua_State *L) {
+ return lua_yield(L, lua_gettop(L));
+}
+
+
+static int luaB_costatus (lua_State *L) {
+ lua_State *co = lua_tothread(L, 1);
+ luaL_argcheck(L, co, 1, "coroutine expected");
+ if (L == co) lua_pushliteral(L, "running");
+ else {
+ switch (lua_status(co)) {
+ case LUA_YIELD:
+ lua_pushliteral(L, "suspended");
+ break;
+ case LUA_OK: {
+ lua_Debug ar;
+ if (lua_getstack(co, 0, &ar) > 0) /* does it have frames? */
+ lua_pushliteral(L, "normal"); /* it is running */
+ else if (lua_gettop(co) == 0)
+ lua_pushliteral(L, "dead");
+ else
+ lua_pushliteral(L, "suspended"); /* initial state */
+ break;
+ }
+ default: /* some error occurred */
+ lua_pushliteral(L, "dead");
+ break;
+ }
+ }
+ return 1;
+}
+
+
+static int luaB_corunning (lua_State *L) {
+ int ismain = lua_pushthread(L);
+ lua_pushboolean(L, ismain);
+ return 2;
+}
+
+
+static const luaL_Reg co_funcs[] = {
+ {"create", luaB_cocreate},
+ {"resume", luaB_coresume},
+ {"running", luaB_corunning},
+ {"status", luaB_costatus},
+ {"wrap", luaB_cowrap},
+ {"yield", luaB_yield},
+ {NULL, NULL}
+};
+
+
+
+LUAMOD_API int luaopen_coroutine (lua_State *L) {
+ luaL_newlib(L, co_funcs);
+ return 1;
+}
+
+#if defined(_KERNEL)
+
+EXPORT_SYMBOL(luaopen_coroutine);
+
+#endif
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/lua/lctype.c b/sys/contrib/openzfs/module/lua/lctype.c
new file mode 100644
index 000000000000..028d278ae4da
--- /dev/null
+++ b/sys/contrib/openzfs/module/lua/lctype.c
@@ -0,0 +1,52 @@
+/* BEGIN CSTYLED */
+/*
+** $Id: lctype.c,v 1.11.1.1 2013/04/12 18:48:47 roberto Exp $
+** 'ctype' functions for Lua
+** See Copyright Notice in lua.h
+*/
+
+#define lctype_c
+#define LUA_CORE
+
+#include "lctype.h"
+
+#if !LUA_USE_CTYPE /* { */
+
+LUAI_DDEF const lu_byte luai_ctype_[UCHAR_MAX + 2] = {
+ 0x00, /* EOZ */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0. */
+ 0x00, 0x08, 0x08, 0x08, 0x08, 0x08, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 1. */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x0c, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, /* 2. */
+ 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04,
+ 0x16, 0x16, 0x16, 0x16, 0x16, 0x16, 0x16, 0x16, /* 3. */
+ 0x16, 0x16, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04,
+ 0x04, 0x15, 0x15, 0x15, 0x15, 0x15, 0x15, 0x05, /* 4. */
+ 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,
+ 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, /* 5. */
+ 0x05, 0x05, 0x05, 0x04, 0x04, 0x04, 0x04, 0x05,
+ 0x04, 0x15, 0x15, 0x15, 0x15, 0x15, 0x15, 0x05, /* 6. */
+ 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,
+ 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, /* 7. */
+ 0x05, 0x05, 0x05, 0x04, 0x04, 0x04, 0x04, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 8. */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 9. */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* a. */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* b. */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* c. */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* d. */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* e. */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* f. */
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+};
+
+#endif /* } */
+/* END CSTYLED */
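
Each entry in luai_ctype_ above is a bitmask over the property bits declared in lctype.h: 0x04 is printable, 0x05 printable+alphabetic, 0x15 additionally sets the hex-digit bit (A-F, a-f), 0x16 is printable+digit+hex-digit, 0x08 marks the whitespace control characters, 0x0c is printable+space, and index 0 classifies EOZ (-1), which is why every lookup adds 1. A minimal standalone sketch of the same scheme follows, with made-up contents rather than the real table:

#include <stdio.h>

#define ALPHABIT 0
#define DIGITBIT 1
#define MASK(B)  (1 << (B))

/* index 0 is reserved for EOZ (-1), hence the "+ 1" on every lookup */
static unsigned char ctype_demo[257];

int main(void) {
  int c;
  for (c = '0'; c <= '9'; c++) ctype_demo[c + 1] |= MASK(DIGITBIT);
  for (c = 'A'; c <= 'Z'; c++) ctype_demo[c + 1] |= MASK(ALPHABIT);
  for (c = 'a'; c <= 'z'; c++) ctype_demo[c + 1] |= MASK(ALPHABIT);
  ctype_demo['_' + 1] |= MASK(ALPHABIT);   /* '_' is alphabetic for Lua */
  printf("'7' is a digit: %d\n", (ctype_demo['7' + 1] & MASK(DIGITBIT)) != 0);
  printf("'_' is alpha:   %d\n", (ctype_demo['_' + 1] & MASK(ALPHABIT)) != 0);
  return 0;
}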
diff --git a/sys/contrib/openzfs/module/lua/lctype.h b/sys/contrib/openzfs/module/lua/lctype.h
new file mode 100644
index 000000000000..b16b6bc7dab3
--- /dev/null
+++ b/sys/contrib/openzfs/module/lua/lctype.h
@@ -0,0 +1,94 @@
+/* BEGIN CSTYLED */
+/*
+** $Id: lctype.h,v 1.12.1.1 2013/04/12 18:48:47 roberto Exp $
+** 'ctype' functions for Lua
+** See Copyright Notice in lua.h
+*/
+
+#ifndef lctype_h
+#define lctype_h
+
+#include <sys/lua/lua.h>
+
+
+/*
+** WARNING: the functions defined here do not necessarily correspond
+** to the similar functions in the standard C ctype.h. They are
+** optimized for the specific needs of Lua
+*/
+
+#if !defined(LUA_USE_CTYPE)
+
+#if 'A' == 65 && '0' == 48
+/* ASCII case: can use its own tables; faster and fixed */
+#define LUA_USE_CTYPE 0
+#else
+/* must use standard C ctype */
+#define LUA_USE_CTYPE 1
+#endif
+
+#endif
+
+
+#if !LUA_USE_CTYPE /* { */
+
+#include "llimits.h"
+
+
+#define ALPHABIT 0
+#define DIGITBIT 1
+#define PRINTBIT 2
+#define SPACEBIT 3
+#define XDIGITBIT 4
+
+
+#define MASK(B) (1 << (B))
+
+
+/*
+** add 1 to char to allow index -1 (EOZ)
+*/
+#define testprop(c,p) (luai_ctype_[(lu_byte)(c)+1] & (p))
+
+/*
+** 'lalpha' (Lua alphabetic) and 'lalnum' (Lua alphanumeric) both include '_'
+*/
+#define lislalpha(c) testprop(c, MASK(ALPHABIT))
+#define lislalnum(c) testprop(c, (MASK(ALPHABIT) | MASK(DIGITBIT)))
+#define lisdigit(c) testprop(c, MASK(DIGITBIT))
+#define lisspace(c) testprop(c, MASK(SPACEBIT))
+#define lisprint(c) testprop(c, MASK(PRINTBIT))
+#define lisxdigit(c) testprop(c, MASK(XDIGITBIT))
+
+/*
+** this 'ltolower' only works for alphabetic characters
+*/
+#define ltolower(c) ((c) | ('A' ^ 'a'))
+
+
+/* two more entries for 0 and -1 (EOZ) */
+LUAI_DDEC const lu_byte luai_ctype_[UCHAR_MAX + 2];
+
+
+#else /* }{ */
+
+/*
+** use standard C ctypes
+*/
+
+#include <ctype.h>
+
+
+#define lislalpha(c) (isalpha(c) || (c) == '_')
+#define lislalnum(c) (isalnum(c) || (c) == '_')
+#define lisdigit(c) (isdigit(c))
+#define lisspace(c) (isspace(c))
+#define lisprint(c) (isprint(c))
+#define lisxdigit(c) (isxdigit(c))
+
+#define ltolower(c) (tolower(c))
+
+#endif /* } */
+
+#endif
+/* END CSTYLED */
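
The ltolower() macro above works because ASCII upper and lower case differ only in one bit ('A' ^ 'a' == 0x20), so OR-ing that bit in lower-cases a letter; as the comment warns, the result is only meaningful for characters already known to be alphabetic. A tiny standalone demo of the trick, assuming a hosted C compiler outside the kernel tree:

#include <stdio.h>

#define ltolower_demo(c)  ((c) | ('A' ^ 'a'))   /* same trick as ltolower() */

int main(void) {
  printf("%c %c\n", ltolower_demo('G'), ltolower_demo('g'));  /* g g */
  printf("case bit: 0x%x\n", 'A' ^ 'a');                      /* 0x20 */
  return 0;
}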
diff --git a/sys/contrib/openzfs/module/lua/ldebug.c b/sys/contrib/openzfs/module/lua/ldebug.c
new file mode 100644
index 000000000000..da005c44376e
--- /dev/null
+++ b/sys/contrib/openzfs/module/lua/ldebug.c
@@ -0,0 +1,608 @@
+/* BEGIN CSTYLED */
+/*
+** $Id: ldebug.c,v 2.90.1.4 2015/02/19 17:05:13 roberto Exp $
+** Debug Interface
+** See Copyright Notice in lua.h
+*/
+
+
+#define ldebug_c
+#define LUA_CORE
+
+#include <sys/lua/lua.h>
+
+#include "lapi.h"
+#include "lcode.h"
+#include "ldebug.h"
+#include "ldo.h"
+#include "lfunc.h"
+#include "lobject.h"
+#include "lopcodes.h"
+#include "lstate.h"
+#include "lstring.h"
+#include "ltable.h"
+#include "ltm.h"
+#include "lvm.h"
+
+
+
+#define noLuaClosure(f) ((f) == NULL || (f)->c.tt == LUA_TCCL)
+
+
+static const char *getfuncname (lua_State *L, CallInfo *ci, const char **name);
+
+
+static int currentpc (CallInfo *ci) {
+ lua_assert(isLua(ci));
+ return pcRel(ci->u.l.savedpc, ci_func(ci)->p);
+}
+
+
+static int currentline (CallInfo *ci) {
+ return getfuncline(ci_func(ci)->p, currentpc(ci));
+}
+
+
+static void swapextra (lua_State *L) {
+ if (L->status == LUA_YIELD) {
+ CallInfo *ci = L->ci; /* get function that yielded */
+ StkId temp = ci->func; /* exchange its 'func' and 'extra' values */
+ ci->func = restorestack(L, ci->extra);
+ ci->extra = savestack(L, temp);
+ }
+}
+
+
+/*
+** this function can be called asynchronously (e.g. during a signal)
+*/
+LUA_API int lua_sethook (lua_State *L, lua_Hook func, int mask, int count) {
+ if (func == NULL || mask == 0) { /* turn off hooks? */
+ mask = 0;
+ func = NULL;
+ }
+ if (isLua(L->ci))
+ L->oldpc = L->ci->u.l.savedpc;
+ L->hook = func;
+ L->basehookcount = count;
+ resethookcount(L);
+ L->hookmask = cast_byte(mask);
+ return 1;
+}
+
+
+LUA_API lua_Hook lua_gethook (lua_State *L) {
+ return L->hook;
+}
+
+
+LUA_API int lua_gethookmask (lua_State *L) {
+ return L->hookmask;
+}
+
+
+LUA_API int lua_gethookcount (lua_State *L) {
+ return L->basehookcount;
+}
+
+
+LUA_API int lua_getstack (lua_State *L, int level, lua_Debug *ar) {
+ int status;
+ CallInfo *ci;
+ if (level < 0) return 0; /* invalid (negative) level */
+ lua_lock(L);
+ for (ci = L->ci; level > 0 && ci != &L->base_ci; ci = ci->previous)
+ level--;
+ if (level == 0 && ci != &L->base_ci) { /* level found? */
+ status = 1;
+ ar->i_ci = ci;
+ }
+ else status = 0; /* no such level */
+ lua_unlock(L);
+ return status;
+}
+
+
+static const char *upvalname (Proto *p, int uv) {
+ TString *s = check_exp(uv < p->sizeupvalues, p->upvalues[uv].name);
+ if (s == NULL) return "?";
+ else return getstr(s);
+}
+
+
+static const char *findvararg (CallInfo *ci, int n, StkId *pos) {
+ int nparams = clLvalue(ci->func)->p->numparams;
+ if (n >= ci->u.l.base - ci->func - nparams)
+ return NULL; /* no such vararg */
+ else {
+ *pos = ci->func + nparams + n;
+ return "(*vararg)"; /* generic name for any vararg */
+ }
+}
+
+
+static const char *findlocal (lua_State *L, CallInfo *ci, int n,
+ StkId *pos) {
+ const char *name = NULL;
+ StkId base;
+ if (isLua(ci)) {
+ if (n < 0) /* access to vararg values? */
+ return findvararg(ci, -n, pos);
+ else {
+ base = ci->u.l.base;
+ name = luaF_getlocalname(ci_func(ci)->p, n, currentpc(ci));
+ }
+ }
+ else
+ base = ci->func + 1;
+ if (name == NULL) { /* no 'standard' name? */
+ StkId limit = (ci == L->ci) ? L->top : ci->next->func;
+ if (limit - base >= n && n > 0) /* is 'n' inside 'ci' stack? */
+ name = "(*temporary)"; /* generic name for any valid slot */
+ else
+ return NULL; /* no name */
+ }
+ *pos = base + (n - 1);
+ return name;
+}
+
+
+LUA_API const char *lua_getlocal (lua_State *L, const lua_Debug *ar, int n) {
+ const char *name;
+ lua_lock(L);
+ swapextra(L);
+ if (ar == NULL) { /* information about non-active function? */
+ if (!isLfunction(L->top - 1)) /* not a Lua function? */
+ name = NULL;
+ else /* consider live variables at function start (parameters) */
+ name = luaF_getlocalname(clLvalue(L->top - 1)->p, n, 0);
+ }
+ else { /* active function; get information through 'ar' */
+ StkId pos = 0; /* to avoid warnings */
+ name = findlocal(L, ar->i_ci, n, &pos);
+ if (name) {
+ setobj2s(L, L->top, pos);
+ api_incr_top(L);
+ }
+ }
+ swapextra(L);
+ lua_unlock(L);
+ return name;
+}
+
+
+LUA_API const char *lua_setlocal (lua_State *L, const lua_Debug *ar, int n) {
+ StkId pos = 0; /* to avoid warnings */
+ const char *name;
+ lua_lock(L);
+ swapextra(L);
+ name = findlocal(L, ar->i_ci, n, &pos);
+ if (name)
+ setobjs2s(L, pos, L->top - 1);
+ L->top--; /* pop value */
+ swapextra(L);
+ lua_unlock(L);
+ return name;
+}
+
+
+static void funcinfo (lua_Debug *ar, Closure *cl) {
+ if (noLuaClosure(cl)) {
+ ar->source = "=[C]";
+ ar->linedefined = -1;
+ ar->lastlinedefined = -1;
+ ar->what = "C";
+ }
+ else {
+ Proto *p = cl->l.p;
+ ar->source = p->source ? getstr(p->source) : "=?";
+ ar->linedefined = p->linedefined;
+ ar->lastlinedefined = p->lastlinedefined;
+ ar->what = (ar->linedefined == 0) ? "main" : "Lua";
+ }
+ luaO_chunkid(ar->short_src, ar->source, LUA_IDSIZE);
+}
+
+
+static void collectvalidlines (lua_State *L, Closure *f) {
+ if (noLuaClosure(f)) {
+ setnilvalue(L->top);
+ api_incr_top(L);
+ }
+ else {
+ int i;
+ TValue v;
+ int *lineinfo = f->l.p->lineinfo;
+ Table *t = luaH_new(L); /* new table to store active lines */
+ sethvalue(L, L->top, t); /* push it on stack */
+ api_incr_top(L);
+ setbvalue(&v, 1); /* boolean 'true' to be the value of all indices */
+ for (i = 0; i < f->l.p->sizelineinfo; i++) /* for all lines with code */
+ luaH_setint(L, t, lineinfo[i], &v); /* table[line] = true */
+ }
+}
+
+
+static int auxgetinfo (lua_State *L, const char *what, lua_Debug *ar,
+ Closure *f, CallInfo *ci) {
+ int status = 1;
+ for (; *what; what++) {
+ switch (*what) {
+ case 'S': {
+ funcinfo(ar, f);
+ break;
+ }
+ case 'l': {
+ ar->currentline = (ci && isLua(ci)) ? currentline(ci) : -1;
+ break;
+ }
+ case 'u': {
+ ar->nups = (f == NULL) ? 0 : f->c.nupvalues;
+ if (noLuaClosure(f)) {
+ ar->isvararg = 1;
+ ar->nparams = 0;
+ }
+ else {
+ ar->isvararg = f->l.p->is_vararg;
+ ar->nparams = f->l.p->numparams;
+ }
+ break;
+ }
+ case 't': {
+ ar->istailcall = (ci) ? ci->callstatus & CIST_TAIL : 0;
+ break;
+ }
+ case 'n': {
+ /* calling function is a known Lua function? */
+ if (ci && !(ci->callstatus & CIST_TAIL) && isLua(ci->previous))
+ ar->namewhat = getfuncname(L, ci->previous, &ar->name);
+ else
+ ar->namewhat = NULL;
+ if (ar->namewhat == NULL) {
+ ar->namewhat = ""; /* not found */
+ ar->name = NULL;
+ }
+ break;
+ }
+ case 'L':
+ case 'f': /* handled by lua_getinfo */
+ break;
+ default: status = 0; /* invalid option */
+ }
+ }
+ return status;
+}
+
+
+LUA_API int lua_getinfo (lua_State *L, const char *what, lua_Debug *ar) {
+ int status;
+ Closure *cl;
+ CallInfo *ci;
+ StkId func;
+ lua_lock(L);
+ swapextra(L);
+ if (*what == '>') {
+ ci = NULL;
+ func = L->top - 1;
+ api_check(L, ttisfunction(func), "function expected");
+ what++; /* skip the '>' */
+ L->top--; /* pop function */
+ }
+ else {
+ ci = ar->i_ci;
+ func = ci->func;
+ lua_assert(ttisfunction(ci->func));
+ }
+ cl = ttisclosure(func) ? clvalue(func) : NULL;
+ status = auxgetinfo(L, what, ar, cl, ci);
+ if (strchr(what, 'f')) {
+ setobjs2s(L, L->top, func);
+ api_incr_top(L);
+ }
+ swapextra(L);
+ if (strchr(what, 'L'))
+ collectvalidlines(L, cl);
+ lua_unlock(L);
+ return status;
+}
+
+
+/*
+** {======================================================
+** Symbolic Execution
+** =======================================================
+*/
+
+static const char *getobjname (Proto *p, int lastpc, int reg,
+ const char **name);
+
+
+/*
+** find a "name" for the RK value 'c'
+*/
+static void kname (Proto *p, int pc, int c, const char **name) {
+ if (ISK(c)) { /* is 'c' a constant? */
+ TValue *kvalue = &p->k[INDEXK(c)];
+ if (ttisstring(kvalue)) { /* literal constant? */
+ *name = svalue(kvalue); /* it is its own name */
+ return;
+ }
+ /* else no reasonable name found */
+ }
+ else { /* 'c' is a register */
+ const char *what = getobjname(p, pc, c, name); /* search for 'c' */
+ if (what && *what == 'c') { /* found a constant name? */
+ return; /* 'name' already filled */
+ }
+ /* else no reasonable name found */
+ }
+ *name = "?"; /* no reasonable name found */
+}
+
+
+static int filterpc (int pc, int jmptarget) {
+ if (pc < jmptarget) /* is code conditional (inside a jump)? */
+ return -1; /* cannot know who sets that register */
+ else return pc; /* current position sets that register */
+}
+
+
+/*
+** try to find last instruction before 'lastpc' that modified register 'reg'
+*/
+static int findsetreg (Proto *p, int lastpc, int reg) {
+ int pc;
+ int setreg = -1; /* keep last instruction that changed 'reg' */
+ int jmptarget = 0; /* any code before this address is conditional */
+ for (pc = 0; pc < lastpc; pc++) {
+ Instruction i = p->code[pc];
+ OpCode op = GET_OPCODE(i);
+ int a = GETARG_A(i);
+ switch (op) {
+ case OP_LOADNIL: {
+ int b = GETARG_B(i);
+ if (a <= reg && reg <= a + b) /* set registers from 'a' to 'a+b' */
+ setreg = filterpc(pc, jmptarget);
+ break;
+ }
+ case OP_TFORCALL: {
+ if (reg >= a + 2) /* affect all regs above its base */
+ setreg = filterpc(pc, jmptarget);
+ break;
+ }
+ case OP_CALL:
+ case OP_TAILCALL: {
+ if (reg >= a) /* affect all registers above base */
+ setreg = filterpc(pc, jmptarget);
+ break;
+ }
+ case OP_JMP: {
+ int b = GETARG_sBx(i);
+ int dest = pc + 1 + b;
+        /* jump is forward and does not skip `lastpc'? */
+ if (pc < dest && dest <= lastpc) {
+ if (dest > jmptarget)
+ jmptarget = dest; /* update 'jmptarget' */
+ }
+ break;
+ }
+ case OP_TEST: {
+ if (reg == a) /* jumped code can change 'a' */
+ setreg = filterpc(pc, jmptarget);
+ break;
+ }
+ default:
+ if (testAMode(op) && reg == a) /* any instruction that set A */
+ setreg = filterpc(pc, jmptarget);
+ break;
+ }
+ }
+ return setreg;
+}
+
+
+static const char *getobjname (Proto *p, int lastpc, int reg,
+ const char **name) {
+ int pc;
+ *name = luaF_getlocalname(p, reg + 1, lastpc);
+ if (*name) /* is a local? */
+ return "local";
+ /* else try symbolic execution */
+ pc = findsetreg(p, lastpc, reg);
+ if (pc != -1) { /* could find instruction? */
+ Instruction i = p->code[pc];
+ OpCode op = GET_OPCODE(i);
+ switch (op) {
+ case OP_MOVE: {
+ int b = GETARG_B(i); /* move from 'b' to 'a' */
+ if (b < GETARG_A(i))
+ return getobjname(p, pc, b, name); /* get name for 'b' */
+ break;
+ }
+ case OP_GETTABUP:
+ case OP_GETTABLE: {
+ int k = GETARG_C(i); /* key index */
+ int t = GETARG_B(i); /* table index */
+ const char *vn = (op == OP_GETTABLE) /* name of indexed variable */
+ ? luaF_getlocalname(p, t + 1, pc)
+ : upvalname(p, t);
+ kname(p, pc, k, name);
+ return (vn && strcmp(vn, LUA_ENV) == 0) ? "global" : "field";
+ }
+ case OP_GETUPVAL: {
+ *name = upvalname(p, GETARG_B(i));
+ return "upvalue";
+ }
+ case OP_LOADK:
+ case OP_LOADKX: {
+ int b = (op == OP_LOADK) ? GETARG_Bx(i)
+ : GETARG_Ax(p->code[pc + 1]);
+ if (ttisstring(&p->k[b])) {
+ *name = svalue(&p->k[b]);
+ return "constant";
+ }
+ break;
+ }
+ case OP_SELF: {
+ int k = GETARG_C(i); /* key index */
+ kname(p, pc, k, name);
+ return "method";
+ }
+ default: break; /* go through to return NULL */
+ }
+ }
+ return NULL; /* could not find reasonable name */
+}
+
+
+static const char *getfuncname (lua_State *L, CallInfo *ci, const char **name) {
+ TMS tm;
+ Proto *p = ci_func(ci)->p; /* calling function */
+ int pc = currentpc(ci); /* calling instruction index */
+ Instruction i = p->code[pc]; /* calling instruction */
+ switch (GET_OPCODE(i)) {
+ case OP_CALL:
+ case OP_TAILCALL: /* get function name */
+ return getobjname(p, pc, GETARG_A(i), name);
+ case OP_TFORCALL: { /* for iterator */
+ *name = "for iterator";
+ return "for iterator";
+ }
+ /* all other instructions can call only through metamethods */
+ case OP_SELF:
+ case OP_GETTABUP:
+ case OP_GETTABLE: tm = TM_INDEX; break;
+ case OP_SETTABUP:
+ case OP_SETTABLE: tm = TM_NEWINDEX; break;
+ case OP_EQ: tm = TM_EQ; break;
+ case OP_ADD: tm = TM_ADD; break;
+ case OP_SUB: tm = TM_SUB; break;
+ case OP_MUL: tm = TM_MUL; break;
+ case OP_DIV: tm = TM_DIV; break;
+ case OP_MOD: tm = TM_MOD; break;
+ case OP_POW: tm = TM_POW; break;
+ case OP_UNM: tm = TM_UNM; break;
+ case OP_LEN: tm = TM_LEN; break;
+ case OP_LT: tm = TM_LT; break;
+ case OP_LE: tm = TM_LE; break;
+ case OP_CONCAT: tm = TM_CONCAT; break;
+ default:
+ return NULL; /* else no useful name can be found */
+ }
+ *name = getstr(G(L)->tmname[tm]);
+ return "metamethod";
+}
+
+/* }====================================================== */
+
+
+
+/*
+** only ANSI way to check whether a pointer points to an array
+** (used only for error messages, so efficiency is not a big concern)
+*/
+static int isinstack (CallInfo *ci, const TValue *o) {
+ StkId p;
+ for (p = ci->u.l.base; p < ci->top; p++)
+ if (o == p) return 1;
+ return 0;
+}
+
+
+static const char *getupvalname (CallInfo *ci, const TValue *o,
+ const char **name) {
+ LClosure *c = ci_func(ci);
+ int i;
+ for (i = 0; i < c->nupvalues; i++) {
+ if (c->upvals[i]->v == o) {
+ *name = upvalname(c->p, i);
+ return "upvalue";
+ }
+ }
+ return NULL;
+}
+
+
+l_noret luaG_typeerror (lua_State *L, const TValue *o, const char *op) {
+ CallInfo *ci = L->ci;
+ const char *name = NULL;
+ const char *t = objtypename(o);
+ const char *kind = NULL;
+ if (isLua(ci)) {
+ kind = getupvalname(ci, o, &name); /* check whether 'o' is an upvalue */
+ if (!kind && isinstack(ci, o)) /* no? try a register */
+ kind = getobjname(ci_func(ci)->p, currentpc(ci),
+ cast_int(o - ci->u.l.base), &name);
+ }
+ if (kind)
+ luaG_runerror(L, "attempt to %s %s " LUA_QS " (a %s value)",
+ op, kind, name, t);
+ else
+ luaG_runerror(L, "attempt to %s a %s value", op, t);
+}
+
+
+l_noret luaG_concaterror (lua_State *L, StkId p1, StkId p2) {
+ if (ttisstring(p1) || ttisnumber(p1)) p1 = p2;
+ lua_assert(!ttisstring(p1) && !ttisnumber(p1));
+ luaG_typeerror(L, p1, "concatenate");
+}
+
+
+l_noret luaG_aritherror (lua_State *L, const TValue *p1, const TValue *p2) {
+ TValue temp;
+ if (luaV_tonumber(p1, &temp) == NULL)
+ p2 = p1; /* first operand is wrong */
+ luaG_typeerror(L, p2, "perform arithmetic on");
+}
+
+
+l_noret luaG_ordererror (lua_State *L, const TValue *p1, const TValue *p2) {
+ const char *t1 = objtypename(p1);
+ const char *t2 = objtypename(p2);
+ if (t1 == t2)
+ luaG_runerror(L, "attempt to compare two %s values", t1);
+ else
+ luaG_runerror(L, "attempt to compare %s with %s", t1, t2);
+}
+
+
+static void addinfo (lua_State *L, const char *msg) {
+ CallInfo *ci = L->ci;
+ if (isLua(ci)) { /* is Lua code? */
+ char buff[LUA_IDSIZE]; /* add file:line information */
+ int line = currentline(ci);
+ TString *src = ci_func(ci)->p->source;
+ if (src)
+ luaO_chunkid(buff, getstr(src), LUA_IDSIZE);
+ else { /* no source available; use "?" instead */
+ buff[0] = '?'; buff[1] = '\0';
+ }
+ luaO_pushfstring(L, "%s:%d: %s", buff, line, msg);
+ }
+}
+
+
+l_noret luaG_errormsg (lua_State *L) {
+ if (L->errfunc != 0) { /* is there an error handling function? */
+ StkId errfunc = restorestack(L, L->errfunc);
+ if (!ttisfunction(errfunc)) luaD_throw(L, LUA_ERRERR);
+ setobjs2s(L, L->top, L->top - 1); /* move argument */
+ setobjs2s(L, L->top - 1, errfunc); /* push function */
+ L->top++;
+ luaD_call(L, L->top - 2, 1, 0); /* call it */
+ }
+ luaD_throw(L, LUA_ERRRUN);
+}
+
+
+l_noret luaG_runerror (lua_State *L, const char *fmt, ...) {
+ L->runerror++;
+ va_list argp;
+ va_start(argp, fmt);
+ addinfo(L, luaO_pushvfstring(L, fmt, argp));
+ va_end(argp);
+ luaG_errormsg(L);
+ L->runerror--;
+}
+/* END CSTYLED */
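
The debug interface above has two halves: lua_getstack() walks the CallInfo chain to select an activation record, and lua_getinfo() fills a lua_Debug according to its option string ('S' source info, 'l' current line, 'n' name, 'u' upvalue and parameter counts, 'f'/'L' push values). A small host-side sketch against the stock Lua 5.2 public API, assuming a userland embedder rather than the kernel module: a line hook that prints the 'S' and 'l' fields auxgetinfo() fills.

#include <stdio.h>
#include <lua.h>
#include <lauxlib.h>

static void linehook(lua_State *L, lua_Debug *ar) {
  /* 'ar' already identifies the running function; ask for 'S' and 'l' */
  if (lua_getinfo(L, "Sl", ar))
    printf("%s:%d\n", ar->short_src, ar->currentline);
}

int main(void) {
  lua_State *L = luaL_newstate();
  lua_sethook(L, linehook, LUA_MASKLINE, 0);
  luaL_dostring(L, "local x = 1\nx = x + 1");   /* hook fires on each line */
  lua_close(L);
  return 0;
}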
diff --git a/sys/contrib/openzfs/module/lua/ldebug.h b/sys/contrib/openzfs/module/lua/ldebug.h
new file mode 100644
index 000000000000..36ed396f26c9
--- /dev/null
+++ b/sys/contrib/openzfs/module/lua/ldebug.h
@@ -0,0 +1,36 @@
+/* BEGIN CSTYLED */
+/*
+** $Id: ldebug.h,v 2.7.1.1 2013/04/12 18:48:47 roberto Exp $
+** Auxiliary functions from Debug Interface module
+** See Copyright Notice in lua.h
+*/
+
+#ifndef ldebug_h
+#define ldebug_h
+
+
+#include "lstate.h"
+
+
+#define pcRel(pc, p) (cast(int, (pc) - (p)->code) - 1)
+
+#define getfuncline(f,pc) (((f)->lineinfo) ? (f)->lineinfo[pc] : 0)
+
+#define resethookcount(L) (L->hookcount = L->basehookcount)
+
+/* Active Lua function (given call info) */
+#define ci_func(ci) (clLvalue((ci)->func))
+
+
+LUAI_FUNC l_noret luaG_typeerror (lua_State *L, const TValue *o,
+ const char *opname);
+LUAI_FUNC l_noret luaG_concaterror (lua_State *L, StkId p1, StkId p2);
+LUAI_FUNC l_noret luaG_aritherror (lua_State *L, const TValue *p1,
+ const TValue *p2);
+LUAI_FUNC l_noret luaG_ordererror (lua_State *L, const TValue *p1,
+ const TValue *p2);
+LUAI_FUNC l_noret luaG_runerror (lua_State *L, const char *fmt, ...);
+LUAI_FUNC l_noret luaG_errormsg (lua_State *L);
+
+#endif
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/lua/ldo.c b/sys/contrib/openzfs/module/lua/ldo.c
new file mode 100644
index 000000000000..f3c3dcb4d81a
--- /dev/null
+++ b/sys/contrib/openzfs/module/lua/ldo.c
@@ -0,0 +1,749 @@
+/* BEGIN CSTYLED */
+/*
+** $Id: ldo.c,v 2.108.1.3 2013/11/08 18:22:50 roberto Exp $
+** Stack and Call structure of Lua
+** See Copyright Notice in lua.h
+*/
+
+
+#define ldo_c
+#define LUA_CORE
+
+#include <sys/lua/lua.h>
+
+#include "lapi.h"
+#include "ldebug.h"
+#include "ldo.h"
+#include "lfunc.h"
+#include "lgc.h"
+#include "lmem.h"
+#include "lobject.h"
+#include "lopcodes.h"
+#include "lparser.h"
+#include "lstate.h"
+#include "lstring.h"
+#include "ltable.h"
+#include "ltm.h"
+#include "lvm.h"
+#include "lzio.h"
+
+
+
+/* Return the number of bytes available on the stack. */
+#if defined (_KERNEL) && defined(__linux__)
+#include <asm/current.h>
+static intptr_t stack_remaining(void) {
+ intptr_t local;
+ local = (intptr_t)&local - (intptr_t)current->stack;
+ return local;
+}
+#elif defined (_KERNEL) && defined(__FreeBSD__)
+#include <sys/pcpu.h>
+static intptr_t stack_remaining(void) {
+ intptr_t local;
+ local = (intptr_t)&local - (intptr_t)curthread->td_kstack;
+ return local;
+}
+#else
+static intptr_t stack_remaining(void) {
+ return INTPTR_MAX;
+}
+#endif
+
+/*
+** {======================================================
+** Error-recovery functions
+** =======================================================
+*/
+
+/*
+** LUAI_THROW/LUAI_TRY define how Lua does exception handling. By
+** default, Lua handles errors with exceptions when compiling as
+** C++ code, with _longjmp/_setjmp when asked to use them, and with
+** longjmp/setjmp otherwise.
+*/
+#if !defined(LUAI_THROW)
+
+#ifdef _KERNEL
+
+#ifdef __linux__
+#if defined(__i386__)
+#define JMP_BUF_CNT 6
+#elif defined(__x86_64__)
+#define JMP_BUF_CNT 8
+#elif defined(__sparc__) && defined(__arch64__)
+#define JMP_BUF_CNT 6
+#elif defined(__powerpc__)
+#define JMP_BUF_CNT 26
+#elif defined(__aarch64__)
+#define JMP_BUF_CNT 64
+#elif defined(__arm__)
+#define JMP_BUF_CNT 65
+#elif defined(__mips__)
+#define JMP_BUF_CNT 12
+#elif defined(__s390x__)
+#define JMP_BUF_CNT 18
+#elif defined(__riscv)
+#define JMP_BUF_CNT 64
+#else
+#define JMP_BUF_CNT 1
+#endif
+
+typedef struct _label_t { long long unsigned val[JMP_BUF_CNT]; } label_t;
+
+int setjmp(label_t *) __attribute__ ((__nothrow__));
+extern void longjmp(label_t *) __attribute__((__noreturn__));
+
+#define LUAI_THROW(L,c) longjmp(&(c)->b)
+#define LUAI_TRY(L,c,a) if (setjmp(&(c)->b) == 0) { a }
+#define luai_jmpbuf label_t
+
+/* unsupported arches will build but not be able to run lua programs */
+#if JMP_BUF_CNT == 1
+int setjmp (label_t *buf) {
+ return 1;
+}
+
+void longjmp (label_t * buf) {
+ for (;;);
+}
+#endif
+#else
+#define LUAI_THROW(L,c) longjmp((c)->b, 1)
+#define LUAI_TRY(L,c,a) if (setjmp((c)->b) == 0) { a }
+#define luai_jmpbuf jmp_buf
+#endif
+
+#else /* _KERNEL */
+
+#if defined(__cplusplus) && !defined(LUA_USE_LONGJMP)
+/* C++ exceptions */
+#define LUAI_THROW(L,c) throw(c)
+#define LUAI_TRY(L,c,a) \
+ try { a } catch(...) { if ((c)->status == 0) (c)->status = -1; }
+#define luai_jmpbuf int /* dummy variable */
+
+#elif defined(LUA_USE_ULONGJMP)
+/* in Unix, try _longjmp/_setjmp (more efficient) */
+#define LUAI_THROW(L,c) _longjmp((c)->b, 1)
+#define LUAI_TRY(L,c,a) if (_setjmp((c)->b) == 0) { a }
+#define luai_jmpbuf jmp_buf
+
+#else
+/* default handling with long jumps */
+#define LUAI_THROW(L,c) longjmp((c)->b, 1)
+#define LUAI_TRY(L,c,a) if (setjmp((c)->b) == 0) { a }
+#define luai_jmpbuf jmp_buf
+
+#endif
+
+#endif /* _KERNEL */
+
+#endif /* LUAI_THROW */
+
+
+/* chain list of long jump buffers */
+struct lua_longjmp {
+ struct lua_longjmp *previous;
+ luai_jmpbuf b;
+ volatile int status; /* error code */
+};
+
+
+static void seterrorobj (lua_State *L, int errcode, StkId oldtop) {
+ switch (errcode) {
+ case LUA_ERRMEM: { /* memory error? */
+ setsvalue2s(L, oldtop, G(L)->memerrmsg); /* reuse preregistered msg. */
+ break;
+ }
+ case LUA_ERRERR: {
+ setsvalue2s(L, oldtop, luaS_newliteral(L, "error in error handling"));
+ break;
+ }
+ default: {
+ setobjs2s(L, oldtop, L->top - 1); /* error message on current top */
+ break;
+ }
+ }
+ L->top = oldtop + 1;
+}
+
+
+l_noret luaD_throw (lua_State *L, int errcode) {
+ if (L->errorJmp) { /* thread has an error handler? */
+ L->errorJmp->status = errcode; /* set status */
+ LUAI_THROW(L, L->errorJmp); /* jump to it */
+ }
+ else { /* thread has no error handler */
+ L->status = cast_byte(errcode); /* mark it as dead */
+ if (G(L)->mainthread->errorJmp) { /* main thread has a handler? */
+ setobjs2s(L, G(L)->mainthread->top++, L->top - 1); /* copy error obj. */
+ luaD_throw(G(L)->mainthread, errcode); /* re-throw in main thread */
+ }
+ else { /* no handler at all; abort */
+ if (G(L)->panic) { /* panic function? */
+ lua_unlock(L);
+ G(L)->panic(L); /* call it (last chance to jump out) */
+ }
+ panic("no error handler");
+ }
+ }
+}
+
+
+int luaD_rawrunprotected (lua_State *L, Pfunc f, void *ud) {
+ unsigned short oldnCcalls = L->nCcalls;
+ struct lua_longjmp lj;
+ lj.status = LUA_OK;
+ lj.previous = L->errorJmp; /* chain new error handler */
+ L->errorJmp = &lj;
+ LUAI_TRY(L, &lj,
+ (*f)(L, ud);
+ );
+ L->errorJmp = lj.previous; /* restore old error handler */
+ L->nCcalls = oldnCcalls;
+ return lj.status;
+}
+
+/* }====================================================== */
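
/*
** A minimal standalone sketch of the pattern above, assuming a hosted C
** environment with <setjmp.h> (the kernel build supplies its own setjmp):
** each protected call pushes a jump buffer onto a chain and an error
** longjmp()s back to the innermost one carrying a status code, which is
** what luaD_rawrunprotected()/luaD_throw() do with 'struct lua_longjmp'.
*/
#include <setjmp.h>
#include <stdio.h>

struct protected_frame {
  struct protected_frame *previous;   /* chain, like lua_longjmp.previous */
  jmp_buf b;
  volatile int status;                /* error code; 0 == OK */
};

static struct protected_frame *errorJmp_demo = NULL;  /* innermost handler */

static void throw_demo (int errcode) {             /* cf. luaD_throw() */
  errorJmp_demo->status = errcode;
  longjmp(errorJmp_demo->b, 1);
}

static int run_protected_demo (void (*f)(void)) {  /* cf. luaD_rawrunprotected() */
  struct protected_frame frame;
  frame.status = 0;
  frame.previous = errorJmp_demo;                  /* chain new handler */
  errorJmp_demo = &frame;
  if (setjmp(frame.b) == 0)
    f();
  errorJmp_demo = frame.previous;                  /* restore old handler */
  return frame.status;
}

static void failing_body (void) {
  throw_demo(2);                                   /* e.g. LUA_ERRRUN */
}

int main (void) {
  printf("status = %d\n", run_protected_demo(failing_body));  /* prints 2 */
  return 0;
}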
+
+
+static void correctstack (lua_State *L, TValue *oldstack) {
+ CallInfo *ci;
+ GCObject *up;
+ L->top = (L->top - oldstack) + L->stack;
+ for (up = L->openupval; up != NULL; up = up->gch.next)
+ gco2uv(up)->v = (gco2uv(up)->v - oldstack) + L->stack;
+ for (ci = L->ci; ci != NULL; ci = ci->previous) {
+ ci->top = (ci->top - oldstack) + L->stack;
+ ci->func = (ci->func - oldstack) + L->stack;
+ if (isLua(ci))
+ ci->u.l.base = (ci->u.l.base - oldstack) + L->stack;
+ }
+}
+
+
+/* some space for error handling */
+#define ERRORSTACKSIZE (LUAI_MAXSTACK + 200)
+
+
+void luaD_reallocstack (lua_State *L, int newsize) {
+ TValue *oldstack = L->stack;
+ int lim = L->stacksize;
+ lua_assert(newsize <= LUAI_MAXSTACK || newsize == ERRORSTACKSIZE);
+ lua_assert(L->stack_last - L->stack == L->stacksize - EXTRA_STACK);
+ luaM_reallocvector(L, L->stack, L->stacksize, newsize, TValue);
+ for (; lim < newsize; lim++)
+ setnilvalue(L->stack + lim); /* erase new segment */
+ L->stacksize = newsize;
+ L->stack_last = L->stack + newsize - EXTRA_STACK;
+ correctstack(L, oldstack);
+}
+
+
+void luaD_growstack (lua_State *L, int n) {
+ int size = L->stacksize;
+ if (size > LUAI_MAXSTACK) /* error after extra size? */
+ luaD_throw(L, LUA_ERRERR);
+ else {
+ int needed = cast_int(L->top - L->stack) + n + EXTRA_STACK;
+ int newsize = 2 * size;
+ if (newsize > LUAI_MAXSTACK) newsize = LUAI_MAXSTACK;
+ if (newsize < needed) newsize = needed;
+ if (newsize > LUAI_MAXSTACK) { /* stack overflow? */
+ luaD_reallocstack(L, ERRORSTACKSIZE);
+ luaG_runerror(L, "stack overflow");
+ }
+ else
+ luaD_reallocstack(L, newsize);
+ }
+}
+
+
+static int stackinuse (lua_State *L) {
+ CallInfo *ci;
+ StkId lim = L->top;
+ for (ci = L->ci; ci != NULL; ci = ci->previous) {
+ lua_assert(ci->top <= L->stack_last);
+ if (lim < ci->top) lim = ci->top;
+ }
+ return cast_int(lim - L->stack) + 1; /* part of stack in use */
+}
+
+
+void luaD_shrinkstack (lua_State *L) {
+ int inuse = stackinuse(L);
+ int goodsize = inuse + (inuse / 8) + 2*EXTRA_STACK;
+ if (goodsize > LUAI_MAXSTACK) goodsize = LUAI_MAXSTACK;
+ if (inuse > LUAI_MAXSTACK || /* handling stack overflow? */
+ goodsize >= L->stacksize) /* would grow instead of shrink? */
+ condmovestack(L); /* don't change stack (change only for debugging) */
+ else
+ luaD_reallocstack(L, goodsize); /* shrink it */
+}
+
+
+void luaD_hook (lua_State *L, int event, int line) {
+ lua_Hook hook = L->hook;
+ if (hook && L->allowhook) {
+ CallInfo *ci = L->ci;
+ ptrdiff_t top = savestack(L, L->top);
+ ptrdiff_t ci_top = savestack(L, ci->top);
+ lua_Debug ar;
+ ar.event = event;
+ ar.currentline = line;
+ ar.i_ci = ci;
+ luaD_checkstack(L, LUA_MINSTACK); /* ensure minimum stack size */
+ ci->top = L->top + LUA_MINSTACK;
+ lua_assert(ci->top <= L->stack_last);
+ L->allowhook = 0; /* cannot call hooks inside a hook */
+ ci->callstatus |= CIST_HOOKED;
+ lua_unlock(L);
+ (*hook)(L, &ar);
+ lua_lock(L);
+ lua_assert(!L->allowhook);
+ L->allowhook = 1;
+ ci->top = restorestack(L, ci_top);
+ L->top = restorestack(L, top);
+ ci->callstatus &= ~CIST_HOOKED;
+ }
+}
+
+
+static void callhook (lua_State *L, CallInfo *ci) {
+ int hook = LUA_HOOKCALL;
+ ci->u.l.savedpc++; /* hooks assume 'pc' is already incremented */
+ if (isLua(ci->previous) &&
+ GET_OPCODE(*(ci->previous->u.l.savedpc - 1)) == OP_TAILCALL) {
+ ci->callstatus |= CIST_TAIL;
+ hook = LUA_HOOKTAILCALL;
+ }
+ luaD_hook(L, hook, -1);
+ ci->u.l.savedpc--; /* correct 'pc' */
+}
+
+
+static StkId adjust_varargs (lua_State *L, Proto *p, int actual) {
+ int i;
+ int nfixargs = p->numparams;
+ StkId base, fixed;
+ lua_assert(actual >= nfixargs);
+ /* move fixed parameters to final position */
+ luaD_checkstack(L, p->maxstacksize); /* check again for new 'base' */
+ fixed = L->top - actual; /* first fixed argument */
+ base = L->top; /* final position of first argument */
+ for (i=0; i<nfixargs; i++) {
+ setobjs2s(L, L->top++, fixed + i);
+ setnilvalue(fixed + i);
+ }
+ return base;
+}
+
+
+static StkId tryfuncTM (lua_State *L, StkId func) {
+ const TValue *tm = luaT_gettmbyobj(L, func, TM_CALL);
+ StkId p;
+ ptrdiff_t funcr = savestack(L, func);
+ if (!ttisfunction(tm))
+ luaG_typeerror(L, func, "call");
+ /* Open a hole inside the stack at `func' */
+ for (p = L->top; p > func; p--) setobjs2s(L, p, p-1);
+ incr_top(L);
+ func = restorestack(L, funcr); /* previous call may change stack */
+ setobj2s(L, func, tm); /* tag method is the new function to be called */
+ return func;
+}
+
+
+
+#define next_ci(L) (L->ci = (L->ci->next ? L->ci->next : luaE_extendCI(L)))
+
+
+/*
+** returns true if function has been executed (C function)
+*/
+int luaD_precall (lua_State *L, StkId func, int nresults) {
+ lua_CFunction f;
+ CallInfo *ci;
+ int n; /* number of arguments (Lua) or returns (C) */
+ ptrdiff_t funcr = savestack(L, func);
+ switch (ttype(func)) {
+ case LUA_TLCF: /* light C function */
+ f = fvalue(func);
+ goto Cfunc;
+ case LUA_TCCL: { /* C closure */
+ f = clCvalue(func)->f;
+ Cfunc:
+ luaD_checkstack(L, LUA_MINSTACK); /* ensure minimum stack size */
+ ci = next_ci(L); /* now 'enter' new function */
+ ci->nresults = nresults;
+ ci->func = restorestack(L, funcr);
+ ci->top = L->top + LUA_MINSTACK;
+ lua_assert(ci->top <= L->stack_last);
+ ci->callstatus = 0;
+ luaC_checkGC(L); /* stack grow uses memory */
+ if (L->hookmask & LUA_MASKCALL)
+ luaD_hook(L, LUA_HOOKCALL, -1);
+ lua_unlock(L);
+ n = (*f)(L); /* do the actual call */
+ lua_lock(L);
+ api_checknelems(L, n);
+ luaD_poscall(L, L->top - n);
+ return 1;
+ }
+ case LUA_TLCL: { /* Lua function: prepare its call */
+ StkId base;
+ Proto *p = clLvalue(func)->p;
+ n = cast_int(L->top - func) - 1; /* number of real arguments */
+ luaD_checkstack(L, p->maxstacksize);
+ for (; n < p->numparams; n++)
+ setnilvalue(L->top++); /* complete missing arguments */
+ if (!p->is_vararg) {
+ func = restorestack(L, funcr);
+ base = func + 1;
+ }
+ else {
+ base = adjust_varargs(L, p, n);
+ func = restorestack(L, funcr); /* previous call can change stack */
+ }
+ ci = next_ci(L); /* now 'enter' new function */
+ ci->nresults = nresults;
+ ci->func = func;
+ ci->u.l.base = base;
+ ci->top = base + p->maxstacksize;
+ lua_assert(ci->top <= L->stack_last);
+ ci->u.l.savedpc = p->code; /* starting point */
+ ci->callstatus = CIST_LUA;
+ L->top = ci->top;
+ luaC_checkGC(L); /* stack grow uses memory */
+ if (L->hookmask & LUA_MASKCALL)
+ callhook(L, ci);
+ return 0;
+ }
+ default: { /* not a function */
+ func = tryfuncTM(L, func); /* retry with 'function' tag method */
+ return luaD_precall(L, func, nresults); /* now it must be a function */
+ }
+ }
+}
+
+
+int luaD_poscall (lua_State *L, StkId firstResult) {
+ StkId res;
+ int wanted, i;
+ CallInfo *ci = L->ci;
+ if (L->hookmask & (LUA_MASKRET | LUA_MASKLINE)) {
+ if (L->hookmask & LUA_MASKRET) {
+ ptrdiff_t fr = savestack(L, firstResult); /* hook may change stack */
+ luaD_hook(L, LUA_HOOKRET, -1);
+ firstResult = restorestack(L, fr);
+ }
+ L->oldpc = ci->previous->u.l.savedpc; /* 'oldpc' for caller function */
+ }
+ res = ci->func; /* res == final position of 1st result */
+ wanted = ci->nresults;
+ L->ci = ci = ci->previous; /* back to caller */
+ /* move results to correct place */
+ for (i = wanted; i != 0 && firstResult < L->top; i--)
+ setobjs2s(L, res++, firstResult++);
+ while (i-- > 0)
+ setnilvalue(res++);
+ L->top = res;
+ return (wanted - LUA_MULTRET); /* 0 iff wanted == LUA_MULTRET */
+}
+
+
+/*
+** Call a function (C or Lua). The function to be called is at *func.
+** The arguments are on the stack, right after the function.
+** When returns, all the results are on the stack, starting at the original
+** function position.
+*/
+void luaD_call (lua_State *L, StkId func, int nResults, int allowyield) {
+ if (++L->nCcalls >= LUAI_MAXCCALLS) {
+ if (L->nCcalls == LUAI_MAXCCALLS)
+ luaG_runerror(L, "C stack overflow");
+ else if (L->nCcalls >= (LUAI_MAXCCALLS + (LUAI_MAXCCALLS>>3)))
+ luaD_throw(L, LUA_ERRERR); /* error while handling stack error */
+ }
+ intptr_t remaining = stack_remaining();
+ if (L->runerror == 0 && remaining < LUAI_MINCSTACK)
+ luaG_runerror(L, "C stack overflow");
+ if (L->runerror != 0 && remaining < LUAI_MINCSTACK / 2)
+ luaD_throw(L, LUA_ERRERR); /* error while handling stack error */
+ if (!allowyield) L->nny++;
+ if (!luaD_precall(L, func, nResults)) /* is a Lua function? */
+ luaV_execute(L); /* call it */
+ if (!allowyield) L->nny--;
+ L->nCcalls--;
+}
+
+
+static void finishCcall (lua_State *L) {
+ CallInfo *ci = L->ci;
+ int n;
+ lua_assert(ci->u.c.k != NULL); /* must have a continuation */
+ lua_assert(L->nny == 0);
+ if (ci->callstatus & CIST_YPCALL) { /* was inside a pcall? */
+ ci->callstatus &= ~CIST_YPCALL; /* finish 'lua_pcall' */
+ L->errfunc = ci->u.c.old_errfunc;
+ }
+ /* finish 'lua_callk'/'lua_pcall' */
+ adjustresults(L, ci->nresults);
+ /* call continuation function */
+ if (!(ci->callstatus & CIST_STAT)) /* no call status? */
+ ci->u.c.status = LUA_YIELD; /* 'default' status */
+ lua_assert(ci->u.c.status != LUA_OK);
+ ci->callstatus = (ci->callstatus & ~(CIST_YPCALL | CIST_STAT)) | CIST_YIELDED;
+ lua_unlock(L);
+ n = (*ci->u.c.k)(L);
+ lua_lock(L);
+ api_checknelems(L, n);
+ /* finish 'luaD_precall' */
+ luaD_poscall(L, L->top - n);
+}
+
+
+static void unroll (lua_State *L, void *ud) {
+ UNUSED(ud);
+ for (;;) {
+ if (L->ci == &L->base_ci) /* stack is empty? */
+ return; /* coroutine finished normally */
+ if (!isLua(L->ci)) /* C function? */
+ finishCcall(L);
+ else { /* Lua function */
+ luaV_finishOp(L); /* finish interrupted instruction */
+ luaV_execute(L); /* execute down to higher C 'boundary' */
+ }
+ }
+}
+
+
+/*
+** check whether thread has a suspended protected call
+*/
+static CallInfo *findpcall (lua_State *L) {
+ CallInfo *ci;
+ for (ci = L->ci; ci != NULL; ci = ci->previous) { /* search for a pcall */
+ if (ci->callstatus & CIST_YPCALL)
+ return ci;
+ }
+ return NULL; /* no pending pcall */
+}
+
+
+static int recover (lua_State *L, int status) {
+ StkId oldtop;
+ CallInfo *ci = findpcall(L);
+ if (ci == NULL) return 0; /* no recovery point */
+ /* "finish" luaD_pcall */
+ oldtop = restorestack(L, ci->extra);
+ luaF_close(L, oldtop);
+ seterrorobj(L, status, oldtop);
+ L->ci = ci;
+ L->allowhook = ci->u.c.old_allowhook;
+ L->nny = 0; /* should be zero to be yieldable */
+ luaD_shrinkstack(L);
+ L->errfunc = ci->u.c.old_errfunc;
+ ci->callstatus |= CIST_STAT; /* call has error status */
+ ci->u.c.status = status; /* (here it is) */
+ return 1; /* continue running the coroutine */
+}
+
+
+/*
+** signal an error in the call to 'resume', not in the execution of the
+** coroutine itself. (Such errors should not be handled by any coroutine
+** error handler and should not kill the coroutine.)
+*/
+static l_noret resume_error (lua_State *L, const char *msg, StkId firstArg) {
+ L->top = firstArg; /* remove args from the stack */
+ setsvalue2s(L, L->top, luaS_new(L, msg)); /* push error message */
+ api_incr_top(L);
+ luaD_throw(L, -1); /* jump back to 'lua_resume' */
+}
+
+
+/*
+** do the work for 'lua_resume' in protected mode
+*/
+static void resume_cb (lua_State *L, void *ud) {
+ int nCcalls = L->nCcalls;
+ StkId firstArg = cast(StkId, ud);
+ CallInfo *ci = L->ci;
+ if (nCcalls >= LUAI_MAXCCALLS)
+ resume_error(L, "C stack overflow", firstArg);
+ if (L->status == LUA_OK) { /* may be starting a coroutine */
+ if (ci != &L->base_ci) /* not in base level? */
+ resume_error(L, "cannot resume non-suspended coroutine", firstArg);
+ /* coroutine is in base level; start running it */
+ if (!luaD_precall(L, firstArg - 1, LUA_MULTRET)) /* Lua function? */
+ luaV_execute(L); /* call it */
+ }
+ else if (L->status != LUA_YIELD)
+ resume_error(L, "cannot resume dead coroutine", firstArg);
+ else { /* resuming from previous yield */
+ L->status = LUA_OK;
+ ci->func = restorestack(L, ci->extra);
+ if (isLua(ci)) /* yielded inside a hook? */
+ luaV_execute(L); /* just continue running Lua code */
+ else { /* 'common' yield */
+ if (ci->u.c.k != NULL) { /* does it have a continuation? */
+ int n;
+ ci->u.c.status = LUA_YIELD; /* 'default' status */
+ ci->callstatus |= CIST_YIELDED;
+ lua_unlock(L);
+ n = (*ci->u.c.k)(L); /* call continuation */
+ lua_lock(L);
+ api_checknelems(L, n);
+ firstArg = L->top - n; /* yield results come from continuation */
+ }
+ luaD_poscall(L, firstArg); /* finish 'luaD_precall' */
+ }
+ unroll(L, NULL);
+ }
+ lua_assert(nCcalls == L->nCcalls);
+}
+
+
+LUA_API int lua_resume (lua_State *L, lua_State *from, int nargs) {
+ int status;
+ int oldnny = L->nny; /* save 'nny' */
+ lua_lock(L);
+ luai_userstateresume(L, nargs);
+ L->nCcalls = (from) ? from->nCcalls + 1 : 1;
+ L->nny = 0; /* allow yields */
+ api_checknelems(L, (L->status == LUA_OK) ? nargs + 1 : nargs);
+ status = luaD_rawrunprotected(L, resume_cb, L->top - nargs);
+ if (status == -1) /* error calling 'lua_resume'? */
+ status = LUA_ERRRUN;
+ else { /* yield or regular error */
+ while (status != LUA_OK && status != LUA_YIELD) { /* error? */
+ if (recover(L, status)) /* recover point? */
+ status = luaD_rawrunprotected(L, unroll, NULL); /* run continuation */
+ else { /* unrecoverable error */
+ L->status = cast_byte(status); /* mark thread as `dead' */
+ seterrorobj(L, status, L->top);
+ L->ci->top = L->top;
+ break;
+ }
+ }
+ lua_assert(status == L->status);
+ }
+ L->nny = oldnny; /* restore 'nny' */
+ L->nCcalls--;
+ lua_assert(L->nCcalls == ((from) ? from->nCcalls : 0));
+ lua_unlock(L);
+ return status;
+}
+
+
+LUA_API int lua_yieldk (lua_State *L, int nresults, int ctx, lua_CFunction k) {
+ CallInfo *ci = L->ci;
+ luai_userstateyield(L, nresults);
+ lua_lock(L);
+ api_checknelems(L, nresults);
+ if (L->nny > 0) {
+ if (L != G(L)->mainthread)
+ luaG_runerror(L, "attempt to yield across a C-call boundary");
+ else
+ luaG_runerror(L, "attempt to yield from outside a coroutine");
+ }
+ L->status = LUA_YIELD;
+ ci->extra = savestack(L, ci->func); /* save current 'func' */
+ if (isLua(ci)) { /* inside a hook? */
+ api_check(L, k == NULL, "hooks cannot continue after yielding");
+ }
+ else {
+ if ((ci->u.c.k = k) != NULL) /* is there a continuation? */
+ ci->u.c.ctx = ctx; /* save context */
+ ci->func = L->top - nresults - 1; /* protect stack below results */
+ luaD_throw(L, LUA_YIELD);
+ }
+ lua_assert(ci->callstatus & CIST_HOOKED); /* must be inside a hook */
+ lua_unlock(L);
+ return 0; /* return to 'luaD_hook' */
+}
+
+
+int luaD_pcall (lua_State *L, Pfunc func, void *u,
+ ptrdiff_t old_top, ptrdiff_t ef) {
+ int status;
+ CallInfo *old_ci = L->ci;
+ lu_byte old_allowhooks = L->allowhook;
+ unsigned short old_nny = L->nny;
+ ptrdiff_t old_errfunc = L->errfunc;
+ L->errfunc = ef;
+ status = luaD_rawrunprotected(L, func, u);
+ if (status != LUA_OK) { /* an error occurred? */
+ StkId oldtop = restorestack(L, old_top);
+ luaF_close(L, oldtop); /* close possible pending closures */
+ seterrorobj(L, status, oldtop);
+ L->ci = old_ci;
+ L->allowhook = old_allowhooks;
+ L->nny = old_nny;
+ luaD_shrinkstack(L);
+ }
+ L->errfunc = old_errfunc;
+ return status;
+}
+
+
+
+/*
+** Execute a protected parser.
+*/
+struct SParser { /* data to `f_parser' */
+ ZIO *z;
+ Mbuffer buff; /* dynamic structure used by the scanner */
+ Dyndata dyd; /* dynamic structures used by the parser */
+ const char *mode;
+ const char *name;
+};
+
+
+static void checkmode (lua_State *L, const char *mode, const char *x) {
+ if (mode && strchr(mode, x[0]) == NULL) {
+ luaO_pushfstring(L,
+ "attempt to load a %s chunk (mode is " LUA_QS ")", x, mode);
+ luaD_throw(L, LUA_ERRSYNTAX);
+ }
+}
+
+
+static void f_parser (lua_State *L, void *ud) {
+ int i;
+ Closure *cl;
+ struct SParser *p = cast(struct SParser *, ud);
+ int c = zgetc(p->z); /* read first character */
+ lua_assert(c != LUA_SIGNATURE[0]); /* binary not supported */
+ checkmode(L, p->mode, "text");
+ cl = luaY_parser(L, p->z, &p->buff, &p->dyd, p->name, c);
+ lua_assert(cl->l.nupvalues == cl->l.p->sizeupvalues);
+ for (i = 0; i < cl->l.nupvalues; i++) { /* initialize upvalues */
+ UpVal *up = luaF_newupval(L);
+ cl->l.upvals[i] = up;
+ luaC_objbarrier(L, cl, up);
+ }
+}
+
+
+int luaD_protectedparser (lua_State *L, ZIO *z, const char *name,
+ const char *mode) {
+ struct SParser p;
+ int status;
+ L->nny++; /* cannot yield during parsing */
+ p.z = z; p.name = name; p.mode = mode;
+ p.dyd.actvar.arr = NULL; p.dyd.actvar.size = 0;
+ p.dyd.gt.arr = NULL; p.dyd.gt.size = 0;
+ p.dyd.label.arr = NULL; p.dyd.label.size = 0;
+ luaZ_initbuffer(L, &p.buff);
+ status = luaD_pcall(L, f_parser, &p, savestack(L, L->top), L->errfunc);
+ luaZ_freebuffer(L, &p.buff);
+ luaM_freearray(L, p.dyd.actvar.arr, p.dyd.actvar.size);
+ luaM_freearray(L, p.dyd.gt.arr, p.dyd.gt.size);
+ luaM_freearray(L, p.dyd.label.arr, p.dyd.label.size);
+ L->nny--;
+ return status;
+}
+/* END CSTYLED */
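
luaD_growstack() above sizes the new stack by doubling, clamping at LUAI_MAXSTACK, and never going below what the caller needs; only when even the clamped size is still too small does it fall back to ERRORSTACKSIZE, leaving room for the "stack overflow" error itself to run. A minimal standalone sketch of just that sizing rule, with illustrative constants standing in for the real LUAI_MAXSTACK:

#include <stdio.h>

#define MAXSTACK_DEMO       1000000              /* stand-in for LUAI_MAXSTACK */
#define ERRORSTACKSIZE_DEMO (MAXSTACK_DEMO + 200)

/* same policy as luaD_growstack(): double, clamp, never below 'needed' */
static int grow_size (int size, int needed) {
  int newsize = 2 * size;
  if (newsize > MAXSTACK_DEMO) newsize = MAXSTACK_DEMO;
  if (newsize < needed) newsize = needed;
  if (newsize > MAXSTACK_DEMO)        /* still not enough: stack overflow */
    return ERRORSTACKSIZE_DEMO;       /* extra room to run the error handler */
  return newsize;
}

int main (void) {
  printf("%d\n", grow_size(40, 45));                           /* 80: doubling */
  printf("%d\n", grow_size(800000, 900000));                   /* 1000000: clamped */
  printf("%d\n", grow_size(MAXSTACK_DEMO, MAXSTACK_DEMO + 1)); /* 1000200: overflow */
  return 0;
}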
diff --git a/sys/contrib/openzfs/module/lua/ldo.h b/sys/contrib/openzfs/module/lua/ldo.h
new file mode 100644
index 000000000000..2c0e1704d072
--- /dev/null
+++ b/sys/contrib/openzfs/module/lua/ldo.h
@@ -0,0 +1,47 @@
+/* BEGIN CSTYLED */
+/*
+** $Id: ldo.h,v 2.20.1.1 2013/04/12 18:48:47 roberto Exp $
+** Stack and Call structure of Lua
+** See Copyright Notice in lua.h
+*/
+
+#ifndef ldo_h
+#define ldo_h
+
+
+#include "lobject.h"
+#include "lstate.h"
+#include "lzio.h"
+
+
+#define luaD_checkstack(L,n) if (L->stack_last - L->top <= (n)) \
+ luaD_growstack(L, n); else condmovestack(L);
+
+
+#define incr_top(L) {L->top++; luaD_checkstack(L,0);}
+
+#define savestack(L,p) ((char *)(p) - (char *)L->stack)
+#define restorestack(L,n) ((TValue *)((char *)L->stack + (n)))
+
+
+/* type of protected functions, to be run by `runprotected' */
+typedef void (*Pfunc) (lua_State *L, void *ud);
+
+LUAI_FUNC int luaD_protectedparser (lua_State *L, ZIO *z, const char *name,
+ const char *mode);
+LUAI_FUNC void luaD_hook (lua_State *L, int event, int line);
+LUAI_FUNC int luaD_precall (lua_State *L, StkId func, int nresults);
+LUAI_FUNC void luaD_call (lua_State *L, StkId func, int nResults,
+ int allowyield);
+LUAI_FUNC int luaD_pcall (lua_State *L, Pfunc func, void *u,
+ ptrdiff_t oldtop, ptrdiff_t ef);
+LUAI_FUNC int luaD_poscall (lua_State *L, StkId firstResult);
+LUAI_FUNC void luaD_reallocstack (lua_State *L, int newsize);
+LUAI_FUNC void luaD_growstack (lua_State *L, int n);
+LUAI_FUNC void luaD_shrinkstack (lua_State *L);
+
+LUAI_FUNC l_noret luaD_throw (lua_State *L, int errcode);
+LUAI_FUNC int luaD_rawrunprotected (lua_State *L, Pfunc f, void *ud);
+
+#endif
+/* END CSTYLED */
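
savestack()/restorestack() above record stack positions as byte offsets rather than pointers because luaD_reallocstack() may move the whole TValue array; an offset survives the move, whereas a saved StkId would dangle, which is why ldo.c re-derives 'func' through restorestack() after any call that can grow the stack. A minimal standalone sketch, with a plain int array standing in for the TValue stack (error checks omitted):

#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

/* offsets survive a realloc(); raw pointers into the old block may not */
#define savestack_demo(base, p)    ((char *)(p) - (char *)(base))
#define restorestack_demo(base, n) ((int *)((char *)(base) + (n)))

int main (void) {
  int *stack = malloc(4 * sizeof(int));
  int *slot = &stack[2];                          /* a "StkId" into the stack */
  ptrdiff_t saved = savestack_demo(stack, slot);  /* remember it as an offset */
  stack[2] = 7;
  stack = realloc(stack, 1024 * sizeof(int));     /* the stack may move */
  slot = restorestack_demo(stack, saved);         /* re-derive a valid pointer */
  printf("%d\n", *slot);                          /* prints 7 */
  free(stack);
  return 0;
}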
diff --git a/sys/contrib/openzfs/module/lua/lfunc.c b/sys/contrib/openzfs/module/lua/lfunc.c
new file mode 100644
index 000000000000..1a510831259c
--- /dev/null
+++ b/sys/contrib/openzfs/module/lua/lfunc.c
@@ -0,0 +1,160 @@
+/* BEGIN CSTYLED */
+/*
+** $Id: lfunc.c,v 2.30.1.1 2013/04/12 18:48:47 roberto Exp $
+** Auxiliary functions to manipulate prototypes and closures
+** See Copyright Notice in lua.h
+*/
+
+
+#define lfunc_c
+#define LUA_CORE
+
+#include <sys/lua/lua.h>
+
+#include "lfunc.h"
+#include "lgc.h"
+#include "lmem.h"
+#include "lobject.h"
+#include "lstate.h"
+
+
+
+Closure *luaF_newCclosure (lua_State *L, int n) {
+ Closure *c = &luaC_newobj(L, LUA_TCCL, sizeCclosure(n), NULL, 0)->cl;
+ c->c.nupvalues = cast_byte(n);
+ return c;
+}
+
+
+Closure *luaF_newLclosure (lua_State *L, int n) {
+ Closure *c = &luaC_newobj(L, LUA_TLCL, sizeLclosure(n), NULL, 0)->cl;
+ c->l.p = NULL;
+ c->l.nupvalues = cast_byte(n);
+ while (n--) c->l.upvals[n] = NULL;
+ return c;
+}
+
+
+UpVal *luaF_newupval (lua_State *L) {
+ UpVal *uv = &luaC_newobj(L, LUA_TUPVAL, sizeof(UpVal), NULL, 0)->uv;
+ uv->v = &uv->u.value;
+ setnilvalue(uv->v);
+ return uv;
+}
+
+
+UpVal *luaF_findupval (lua_State *L, StkId level) {
+ global_State *g = G(L);
+ GCObject **pp = &L->openupval;
+ UpVal *p;
+ UpVal *uv;
+ while (*pp != NULL && (p = gco2uv(*pp))->v >= level) {
+ GCObject *o = obj2gco(p);
+ lua_assert(p->v != &p->u.value);
+ lua_assert(!isold(o) || isold(obj2gco(L)));
+ if (p->v == level) { /* found a corresponding upvalue? */
+ if (isdead(g, o)) /* is it dead? */
+ changewhite(o); /* resurrect it */
+ return p;
+ }
+ pp = &p->next;
+ }
+ /* not found: create a new one */
+ uv = &luaC_newobj(L, LUA_TUPVAL, sizeof(UpVal), pp, 0)->uv;
+ uv->v = level; /* current value lives in the stack */
+ uv->u.l.prev = &g->uvhead; /* double link it in `uvhead' list */
+ uv->u.l.next = g->uvhead.u.l.next;
+ uv->u.l.next->u.l.prev = uv;
+ g->uvhead.u.l.next = uv;
+ lua_assert(uv->u.l.next->u.l.prev == uv && uv->u.l.prev->u.l.next == uv);
+ return uv;
+}
+
+
+static void unlinkupval (UpVal *uv) {
+ lua_assert(uv->u.l.next->u.l.prev == uv && uv->u.l.prev->u.l.next == uv);
+ uv->u.l.next->u.l.prev = uv->u.l.prev; /* remove from `uvhead' list */
+ uv->u.l.prev->u.l.next = uv->u.l.next;
+}
+
+
+void luaF_freeupval (lua_State *L, UpVal *uv) {
+ if (uv->v != &uv->u.value) /* is it open? */
+ unlinkupval(uv); /* remove from open list */
+ luaM_free(L, uv); /* free upvalue */
+}
+
+
+void luaF_close (lua_State *L, StkId level) {
+ UpVal *uv;
+ global_State *g = G(L);
+ while (L->openupval != NULL && (uv = gco2uv(L->openupval))->v >= level) {
+ GCObject *o = obj2gco(uv);
+ lua_assert(!isblack(o) && uv->v != &uv->u.value);
+ L->openupval = uv->next; /* remove from `open' list */
+ if (isdead(g, o))
+ luaF_freeupval(L, uv); /* free upvalue */
+ else {
+ unlinkupval(uv); /* remove upvalue from 'uvhead' list */
+ setobj(L, &uv->u.value, uv->v); /* move value to upvalue slot */
+ uv->v = &uv->u.value; /* now current value lives here */
+ gch(o)->next = g->allgc; /* link upvalue into 'allgc' list */
+ g->allgc = o;
+ luaC_checkupvalcolor(g, uv);
+ }
+ }
+}
+
+
+Proto *luaF_newproto (lua_State *L) {
+ Proto *f = &luaC_newobj(L, LUA_TPROTO, sizeof(Proto), NULL, 0)->p;
+ f->k = NULL;
+ f->sizek = 0;
+ f->p = NULL;
+ f->sizep = 0;
+ f->code = NULL;
+ f->cache = NULL;
+ f->sizecode = 0;
+ f->lineinfo = NULL;
+ f->sizelineinfo = 0;
+ f->upvalues = NULL;
+ f->sizeupvalues = 0;
+ f->numparams = 0;
+ f->is_vararg = 0;
+ f->maxstacksize = 0;
+ f->locvars = NULL;
+ f->sizelocvars = 0;
+ f->linedefined = 0;
+ f->lastlinedefined = 0;
+ f->source = NULL;
+ return f;
+}
+
+
+void luaF_freeproto (lua_State *L, Proto *f) {
+ luaM_freearray(L, f->code, f->sizecode);
+ luaM_freearray(L, f->p, f->sizep);
+ luaM_freearray(L, f->k, f->sizek);
+ luaM_freearray(L, f->lineinfo, f->sizelineinfo);
+ luaM_freearray(L, f->locvars, f->sizelocvars);
+ luaM_freearray(L, f->upvalues, f->sizeupvalues);
+ luaM_free(L, f);
+}
+
+
+/*
+** Look for the n-th local variable active at instruction `pc' in function `func'.
+** Returns NULL if not found.
+*/
+const char *luaF_getlocalname (const Proto *f, int local_number, int pc) {
+ int i;
+ for (i = 0; i<f->sizelocvars && f->locvars[i].startpc <= pc; i++) {
+ if (pc < f->locvars[i].endpc) { /* is variable active? */
+ local_number--;
+ if (local_number == 0)
+ return getstr(f->locvars[i].varname);
+ }
+ }
+ return NULL; /* not found */
+}
+/* END CSTYLED */
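
luaF_close() above handles the moment a captured stack slot goes out of scope: while an upvalue is open, uv->v points into the stack; closing copies the value into the upvalue's own storage and repoints uv->v at it, so closures keep a valid reference after the frame unwinds. A minimal standalone sketch of just that open-to-closed switch, using a hypothetical struct rather than Lua's UpVal:

#include <stdio.h>

/* hypothetical stand-in for UpVal: 'v' points either into the stack (open)
   or at the private copy 'value' (closed) */
struct upval_demo {
  int *v;
  int value;
};

static void close_upval (struct upval_demo *uv) {  /* cf. luaF_close() */
  uv->value = *uv->v;    /* move value out of the dying stack slot */
  uv->v = &uv->value;    /* now the current value lives in the upvalue */
}

int main (void) {
  int stack[4] = {0, 0, 11, 0};
  struct upval_demo uv = { &stack[2], 0 };  /* open: points into the stack */
  close_upval(&uv);
  stack[2] = -1;                            /* the frame slot gets reused... */
  printf("%d\n", *uv.v);                    /* ...but this still prints 11 */
  return 0;
}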
diff --git a/sys/contrib/openzfs/module/lua/lfunc.h b/sys/contrib/openzfs/module/lua/lfunc.h
new file mode 100644
index 000000000000..59a4fa75c46e
--- /dev/null
+++ b/sys/contrib/openzfs/module/lua/lfunc.h
@@ -0,0 +1,35 @@
+/* BEGIN CSTYLED */
+/*
+** $Id: lfunc.h,v 2.8.1.1 2013/04/12 18:48:47 roberto Exp $
+** Auxiliary functions to manipulate prototypes and closures
+** See Copyright Notice in lua.h
+*/
+
+#ifndef lfunc_h
+#define lfunc_h
+
+
+#include "lobject.h"
+
+
+#define sizeCclosure(n) (cast(int, sizeof(CClosure)) + \
+ cast(int, sizeof(TValue)*((n)-1)))
+
+#define sizeLclosure(n) (cast(int, sizeof(LClosure)) + \
+ cast(int, sizeof(TValue *)*((n)-1)))
+
+
+LUAI_FUNC Proto *luaF_newproto (lua_State *L);
+LUAI_FUNC Closure *luaF_newCclosure (lua_State *L, int nelems);
+LUAI_FUNC Closure *luaF_newLclosure (lua_State *L, int nelems);
+LUAI_FUNC UpVal *luaF_newupval (lua_State *L);
+LUAI_FUNC UpVal *luaF_findupval (lua_State *L, StkId level);
+LUAI_FUNC void luaF_close (lua_State *L, StkId level);
+LUAI_FUNC void luaF_freeproto (lua_State *L, Proto *f);
+LUAI_FUNC void luaF_freeupval (lua_State *L, UpVal *uv);
+LUAI_FUNC const char *luaF_getlocalname (const Proto *func, int local_number,
+ int pc);
+
+
+#endif
+/* END CSTYLED */
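
sizeCclosure()/sizeLclosure() above size a closure whose upvalue array is declared with one element but allocated with n, hence the (n)-1 in both formulas. A tiny standalone sketch of the same sizing idiom, using a hypothetical struct rather than Lua's CClosure:

#include <stdio.h>
#include <stdlib.h>

struct closure_demo {
  int nupvalues;
  double upvalue[1];   /* one declared element, n allocated (as in CClosure) */
};

/* same formula as sizeCclosure(): base struct + (n-1) extra tail elements */
#define size_closure_demo(n) \
  (sizeof(struct closure_demo) + sizeof(double) * ((n) - 1))

int main (void) {
  int i, n = 3;
  struct closure_demo *c = malloc(size_closure_demo(n));
  c->nupvalues = n;
  for (i = 0; i < n; i++) c->upvalue[i] = i * 1.5;
  printf("%u bytes, last upvalue %.1f\n",
         (unsigned)size_closure_demo(n), c->upvalue[n - 1]);
  free(c);
  return 0;
}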
diff --git a/sys/contrib/openzfs/module/lua/lgc.c b/sys/contrib/openzfs/module/lua/lgc.c
new file mode 100644
index 000000000000..55feb24119d3
--- /dev/null
+++ b/sys/contrib/openzfs/module/lua/lgc.c
@@ -0,0 +1,1218 @@
+/* BEGIN CSTYLED */
+/*
+** $Id: lgc.c,v 2.140.1.3 2014/09/01 16:55:08 roberto Exp $
+** Garbage Collector
+** See Copyright Notice in lua.h
+*/
+
+#define lgc_c
+#define LUA_CORE
+
+#include <sys/lua/lua.h>
+
+#include "ldebug.h"
+#include "ldo.h"
+#include "lfunc.h"
+#include "lgc.h"
+#include "lmem.h"
+#include "lobject.h"
+#include "lstate.h"
+#include "lstring.h"
+#include "ltable.h"
+#include "ltm.h"
+
+
+
+/*
+** cost of sweeping one element (the size of a small object divided
+** by some adjust for the sweep speed)
+*/
+#define GCSWEEPCOST ((sizeof(TString) + 4) / 4)
+
+/* maximum number of elements to sweep in each single step */
+#define GCSWEEPMAX (cast_int((GCSTEPSIZE / GCSWEEPCOST) / 4))
+
+/* maximum number of finalizers to call in each GC step */
+#define GCFINALIZENUM 4
+
+
+/*
+** macro to adjust 'stepmul': 'stepmul' is actually used like
+** 'stepmul / STEPMULADJ' (value chosen by tests)
+*/
+#define STEPMULADJ 200
+
+
+/*
+** macro to adjust 'pause': 'pause' is actually used like
+** 'pause / PAUSEADJ' (value chosen by tests)
+*/
+#define PAUSEADJ 100
+
+
+/*
+** 'makewhite' erases all color bits plus the old bit and then
+** sets only the current white bit
+*/
+#define maskcolors (~(bit2mask(BLACKBIT, OLDBIT) | WHITEBITS))
+#define makewhite(g,x) \
+ (gch(x)->marked = cast_byte((gch(x)->marked & maskcolors) | luaC_white(g)))
+
+#define white2gray(x) resetbits(gch(x)->marked, WHITEBITS)
+#define black2gray(x) resetbit(gch(x)->marked, BLACKBIT)
+
+
+#define isfinalized(x) testbit(gch(x)->marked, FINALIZEDBIT)
+
+#define checkdeadkey(n) lua_assert(!ttisdeadkey(gkey(n)) || ttisnil(gval(n)))
+
+
+#define checkconsistency(obj) \
+ lua_longassert(!iscollectable(obj) || righttt(obj))
+
+
+#define markvalue(g,o) { checkconsistency(o); \
+ if (valiswhite(o)) reallymarkobject(g,gcvalue(o)); }
+
+#define markobject(g,t) { if ((t) && iswhite(obj2gco(t))) \
+ reallymarkobject(g, obj2gco(t)); }
+
+static void reallymarkobject (global_State *g, GCObject *o);
+
+
+/*
+** {======================================================
+** Generic functions
+** =======================================================
+*/
+
+
+/*
+** one after last element in a hash array
+*/
+#define gnodelast(h) gnode(h, cast(size_t, sizenode(h)))
+
+
+/*
+** link table 'h' into list pointed by 'p'
+*/
+#define linktable(h,p) ((h)->gclist = *(p), *(p) = obj2gco(h))
+
+
+/*
+** if key is not marked, mark its entry as dead (therefore removing it
+** from the table)
+*/
+static void removeentry (Node *n) {
+ lua_assert(ttisnil(gval(n)));
+ if (valiswhite(gkey(n)))
+ setdeadvalue(gkey(n)); /* unused and unmarked key; remove it */
+}
+
+
+/*
+** tells whether a key or value can be cleared from a weak
+** table. Non-collectable objects are never removed from weak
+** tables. Strings behave as `values', so they are never removed either. For
+** other objects: if really collected, cannot keep them; for objects
+** being finalized, keep them in keys, but not in values
+*/
+static int iscleared (global_State *g, const TValue *o) {
+ if (!iscollectable(o)) return 0;
+ else if (ttisstring(o)) {
+ markobject(g, rawtsvalue(o)); /* strings are `values', so are never weak */
+ return 0;
+ }
+ else return iswhite(gcvalue(o));
+}
+
+
+/*
+** barrier that moves collector forward, that is, mark the white object
+** being pointed by a black object.
+*/
+void luaC_barrier_ (lua_State *L, GCObject *o, GCObject *v) {
+ global_State *g = G(L);
+ lua_assert(isblack(o) && iswhite(v) && !isdead(g, v) && !isdead(g, o));
+ lua_assert(g->gcstate != GCSpause);
+ lua_assert(gch(o)->tt != LUA_TTABLE);
+ if (keepinvariantout(g)) /* must keep invariant? */
+ reallymarkobject(g, v); /* restore invariant */
+ else { /* sweep phase */
+ lua_assert(issweepphase(g));
+ makewhite(g, o); /* mark main obj. as white to avoid other barriers */
+ }
+}
+
+
+/*
+** barrier that moves collector backward, that is, mark the black object
+** pointing to a white object as gray again. (Current implementation
+** only works for tables; access to 'gclist' is not uniform across
+** different types.)
+*/
+void luaC_barrierback_ (lua_State *L, GCObject *o) {
+ global_State *g = G(L);
+ lua_assert(isblack(o) && !isdead(g, o) && gch(o)->tt == LUA_TTABLE);
+ black2gray(o); /* make object gray (again) */
+ gco2t(o)->gclist = g->grayagain;
+ g->grayagain = o;
+}
+
+
+/*
+** barrier for prototypes. When creating first closure (cache is
+** NULL), use a forward barrier; this may be the only closure of the
+** prototype (if it is a "regular" function, with a single instance)
+** and the prototype may be big, so it is better to avoid traversing
+** it again. Otherwise, use a backward barrier, to avoid marking all
+** possible instances.
+*/
+LUAI_FUNC void luaC_barrierproto_ (lua_State *L, Proto *p, Closure *c) {
+ global_State *g = G(L);
+ lua_assert(isblack(obj2gco(p)));
+ if (p->cache == NULL) { /* first time? */
+ luaC_objbarrier(L, p, c);
+ }
+ else { /* use a backward barrier */
+ black2gray(obj2gco(p)); /* make prototype gray (again) */
+ p->gclist = g->grayagain;
+ g->grayagain = obj2gco(p);
+ }
+}
+
+
+/*
+** check color (and invariants) for an upvalue that was closed,
+** i.e., moved into the 'allgc' list
+*/
+void luaC_checkupvalcolor (global_State *g, UpVal *uv) {
+ GCObject *o = obj2gco(uv);
+ lua_assert(!isblack(o)); /* open upvalues are never black */
+ if (isgray(o)) {
+ if (keepinvariant(g)) {
+ resetoldbit(o); /* see MOVE OLD rule */
+ gray2black(o); /* it is being visited now */
+ markvalue(g, uv->v);
+ }
+ else {
+ lua_assert(issweepphase(g));
+ makewhite(g, o);
+ }
+ }
+}
+
+
+/*
+** create a new collectable object (with given type and size) and link
+** it to '*list'. 'offset' tells how many bytes to allocate before the
+** object itself (used only by states).
+*/
+GCObject *luaC_newobj (lua_State *L, int tt, size_t sz, GCObject **list,
+ int offset) {
+ global_State *g = G(L);
+ char *raw = cast(char *, luaM_newobject(L, novariant(tt), sz));
+ GCObject *o = obj2gco(raw + offset);
+ if (list == NULL)
+ list = &g->allgc; /* standard list for collectable objects */
+ gch(o)->marked = luaC_white(g);
+ gch(o)->tt = tt;
+ gch(o)->next = *list;
+ *list = o;
+ return o;
+}
+
+/* }====================================================== */
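
/*
** A minimal standalone sketch, assuming a hosted C environment and a
** made-up object type rather than Lua's GCObject, of the choice the two
** barriers above implement: when a black parent is made to reference a
** white child, either mark the child right away (forward barrier) or turn
** the parent gray again so it will be rescanned (backward barrier, what
** Lua does for tables).
*/
#include <stdio.h>

enum color_demo { WHITE, GRAY, BLACK };

struct obj_demo {
  enum color_demo color;
  const char *name;
};

/* forward barrier: keep the invariant by marking the white child now */
static void barrier_forward_demo (struct obj_demo *child) {
  if (child->color == WHITE) child->color = GRAY;
}

/* backward barrier: give the parent back to the collector for rescanning */
static void barrier_back_demo (struct obj_demo *parent) {
  if (parent->color == BLACK) parent->color = GRAY;
}

int main (void) {
  struct obj_demo table = { BLACK, "table" };
  struct obj_demo value = { WHITE, "value" };
  barrier_back_demo(&table);      /* what Lua does for a table write */
  barrier_forward_demo(&value);   /* what other object types do instead */
  printf("%s color=%d, %s color=%d\n",
         table.name, (int)table.color, value.name, (int)value.color);
  return 0;
}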
+
+
+
+/*
+** {======================================================
+** Mark functions
+** =======================================================
+*/
+
+
+/*
+** mark an object. Userdata, strings, and closed upvalues are visited
+** and turned black here. Other objects are marked gray and added
+** to appropriate list to be visited (and turned black) later. (Open
+** upvalues are already linked in 'headuv' list.)
+*/
+static void reallymarkobject (global_State *g, GCObject *o) {
+ lu_mem size;
+ white2gray(o);
+ switch (gch(o)->tt) {
+ case LUA_TSHRSTR:
+ case LUA_TLNGSTR: {
+ size = sizestring(gco2ts(o));
+ break; /* nothing else to mark; make it black */
+ }
+ case LUA_TUSERDATA: {
+ Table *mt = gco2u(o)->metatable;
+ markobject(g, mt);
+ markobject(g, gco2u(o)->env);
+ size = sizeudata(gco2u(o));
+ break;
+ }
+ case LUA_TUPVAL: {
+ UpVal *uv = gco2uv(o);
+ markvalue(g, uv->v);
+ if (uv->v != &uv->u.value) /* open? */
+ return; /* open upvalues remain gray */
+ size = sizeof(UpVal);
+ break;
+ }
+ case LUA_TLCL: {
+ gco2lcl(o)->gclist = g->gray;
+ g->gray = o;
+ return;
+ }
+ case LUA_TCCL: {
+ gco2ccl(o)->gclist = g->gray;
+ g->gray = o;
+ return;
+ }
+ case LUA_TTABLE: {
+ linktable(gco2t(o), &g->gray);
+ return;
+ }
+ case LUA_TTHREAD: {
+ gco2th(o)->gclist = g->gray;
+ g->gray = o;
+ return;
+ }
+ case LUA_TPROTO: {
+ gco2p(o)->gclist = g->gray;
+ g->gray = o;
+ return;
+ }
+ default: lua_assert(0); return;
+ }
+ gray2black(o);
+ g->GCmemtrav += size;
+}
+
+
+/*
+** mark metamethods for basic types
+*/
+static void markmt (global_State *g) {
+ int i;
+ for (i=0; i < LUA_NUMTAGS; i++)
+ markobject(g, g->mt[i]);
+}
+
+
+/*
+** mark all objects in the list of objects being finalized
+*/
+static void markbeingfnz (global_State *g) {
+ GCObject *o;
+ for (o = g->tobefnz; o != NULL; o = gch(o)->next) {
+ makewhite(g, o);
+ reallymarkobject(g, o);
+ }
+}
+
+
+/*
+** mark all values stored in marked open upvalues. (See comment in
+** 'lstate.h'.)
+*/
+static void remarkupvals (global_State *g) {
+ UpVal *uv;
+ for (uv = g->uvhead.u.l.next; uv != &g->uvhead; uv = uv->u.l.next) {
+ if (isgray(obj2gco(uv)))
+ markvalue(g, uv->v);
+ }
+}
+
+
+/*
+** mark root set and reset all gray lists, to start a new
+** incremental (or full) collection
+*/
+static void restartcollection (global_State *g) {
+ g->gray = g->grayagain = NULL;
+ g->weak = g->allweak = g->ephemeron = NULL;
+ markobject(g, g->mainthread);
+ markvalue(g, &g->l_registry);
+ markmt(g);
+ markbeingfnz(g); /* mark any finalizing object left from previous cycle */
+}
+
+/* }====================================================== */
+
+
+/*
+** {======================================================
+** Traverse functions
+** =======================================================
+*/
+
+static void traverseweakvalue (global_State *g, Table *h) {
+ Node *n, *limit = gnodelast(h);
+ /* if there is array part, assume it may have white values (do not
+ traverse it just to check) */
+ int hasclears = (h->sizearray > 0);
+ for (n = gnode(h, 0); n < limit; n++) {
+ checkdeadkey(n);
+ if (ttisnil(gval(n))) /* entry is empty? */
+ removeentry(n); /* remove it */
+ else {
+ lua_assert(!ttisnil(gkey(n)));
+ markvalue(g, gkey(n)); /* mark key */
+ if (!hasclears && iscleared(g, gval(n))) /* is there a white value? */
+ hasclears = 1; /* table will have to be cleared */
+ }
+ }
+ if (hasclears)
+ linktable(h, &g->weak); /* has to be cleared later */
+ else /* no white values */
+ linktable(h, &g->grayagain); /* no need to clean */
+}
+
+
+static int traverseephemeron (global_State *g, Table *h) {
+ int marked = 0; /* true if an object is marked in this traversal */
+ int hasclears = 0; /* true if table has white keys */
+ int prop = 0; /* true if table has entry "white-key -> white-value" */
+ Node *n, *limit = gnodelast(h);
+ int i;
+ /* traverse array part (numeric keys are 'strong') */
+ for (i = 0; i < h->sizearray; i++) {
+ if (valiswhite(&h->array[i])) {
+ marked = 1;
+ reallymarkobject(g, gcvalue(&h->array[i]));
+ }
+ }
+ /* traverse hash part */
+ for (n = gnode(h, 0); n < limit; n++) {
+ checkdeadkey(n);
+ if (ttisnil(gval(n))) /* entry is empty? */
+ removeentry(n); /* remove it */
+ else if (iscleared(g, gkey(n))) { /* key is not marked (yet)? */
+ hasclears = 1; /* table must be cleared */
+ if (valiswhite(gval(n))) /* value not marked yet? */
+ prop = 1; /* must propagate again */
+ }
+ else if (valiswhite(gval(n))) { /* value not marked yet? */
+ marked = 1;
+ reallymarkobject(g, gcvalue(gval(n))); /* mark it now */
+ }
+ }
+ if (g->gcstate != GCSatomic || prop)
+ linktable(h, &g->ephemeron); /* have to propagate again */
+ else if (hasclears) /* does table have white keys? */
+ linktable(h, &g->allweak); /* may have to clean white keys */
+ else /* no white keys */
+ linktable(h, &g->grayagain); /* no need to clean */
+ return marked;
+}
+
+
+static void traversestrongtable (global_State *g, Table *h) {
+ Node *n, *limit = gnodelast(h);
+ int i;
+ for (i = 0; i < h->sizearray; i++) /* traverse array part */
+ markvalue(g, &h->array[i]);
+ for (n = gnode(h, 0); n < limit; n++) { /* traverse hash part */
+ checkdeadkey(n);
+ if (ttisnil(gval(n))) /* entry is empty? */
+ removeentry(n); /* remove it */
+ else {
+ lua_assert(!ttisnil(gkey(n)));
+ markvalue(g, gkey(n)); /* mark key */
+ markvalue(g, gval(n)); /* mark value */
+ }
+ }
+}
+
+
+static lu_mem traversetable (global_State *g, Table *h) {
+ const char *weakkey, *weakvalue;
+ const TValue *mode = gfasttm(g, h->metatable, TM_MODE);
+ markobject(g, h->metatable);
+ if (mode && ttisstring(mode) && /* is there a weak mode? */
+ ((weakkey = strchr(svalue(mode), 'k')),
+ (weakvalue = strchr(svalue(mode), 'v')),
+ (weakkey || weakvalue))) { /* is really weak? */
+ black2gray(obj2gco(h)); /* keep table gray */
+ if (!weakkey) /* strong keys? */
+ traverseweakvalue(g, h);
+ else if (!weakvalue) /* strong values? */
+ traverseephemeron(g, h);
+ else /* all weak */
+ linktable(h, &g->allweak); /* nothing to traverse now */
+ }
+ else /* not weak */
+ traversestrongtable(g, h);
+ return sizeof(Table) + sizeof(TValue) * h->sizearray +
+ sizeof(Node) * cast(size_t, sizenode(h));
+}
+
+
+static int traverseproto (global_State *g, Proto *f) {
+ int i;
+ if (f->cache && iswhite(obj2gco(f->cache)))
+ f->cache = NULL; /* allow cache to be collected */
+ markobject(g, f->source);
+ for (i = 0; i < f->sizek; i++) /* mark literals */
+ markvalue(g, &f->k[i]);
+ for (i = 0; i < f->sizeupvalues; i++) /* mark upvalue names */
+ markobject(g, f->upvalues[i].name);
+ for (i = 0; i < f->sizep; i++) /* mark nested protos */
+ markobject(g, f->p[i]);
+ for (i = 0; i < f->sizelocvars; i++) /* mark local-variable names */
+ markobject(g, f->locvars[i].varname);
+ return sizeof(Proto) + sizeof(Instruction) * f->sizecode +
+ sizeof(Proto *) * f->sizep +
+ sizeof(TValue) * f->sizek +
+ sizeof(int) * f->sizelineinfo +
+ sizeof(LocVar) * f->sizelocvars +
+ sizeof(Upvaldesc) * f->sizeupvalues;
+}
+
+
+static lu_mem traverseCclosure (global_State *g, CClosure *cl) {
+ int i;
+ for (i = 0; i < cl->nupvalues; i++) /* mark its upvalues */
+ markvalue(g, &cl->upvalue[i]);
+ return sizeCclosure(cl->nupvalues);
+}
+
+static lu_mem traverseLclosure (global_State *g, LClosure *cl) {
+ int i;
+ markobject(g, cl->p); /* mark its prototype */
+ for (i = 0; i < cl->nupvalues; i++) /* mark its upvalues */
+ markobject(g, cl->upvals[i]);
+ return sizeLclosure(cl->nupvalues);
+}
+
+
+static lu_mem traversestack (global_State *g, lua_State *th) {
+ int n = 0;
+ StkId o = th->stack;
+ if (o == NULL)
+ return 1; /* stack not completely built yet */
+ for (; o < th->top; o++) /* mark live elements in the stack */
+ markvalue(g, o);
+ if (g->gcstate == GCSatomic) { /* final traversal? */
+ StkId lim = th->stack + th->stacksize; /* real end of stack */
+ for (; o < lim; o++) /* clear not-marked stack slice */
+ setnilvalue(o);
+ }
+ else { /* count call infos to compute size */
+ CallInfo *ci;
+ for (ci = &th->base_ci; ci != th->ci; ci = ci->next)
+ n++;
+ }
+ return sizeof(lua_State) + sizeof(TValue) * th->stacksize +
+ sizeof(CallInfo) * n;
+}
+
+
+/*
+** traverse one gray object, turning it to black (except for threads,
+** which are always gray).
+*/
+static void propagatemark (global_State *g) {
+ lu_mem size;
+ GCObject *o = g->gray;
+ lua_assert(isgray(o));
+ gray2black(o);
+ switch (gch(o)->tt) {
+ case LUA_TTABLE: {
+ Table *h = gco2t(o);
+ g->gray = h->gclist; /* remove from 'gray' list */
+ size = traversetable(g, h);
+ break;
+ }
+ case LUA_TLCL: {
+ LClosure *cl = gco2lcl(o);
+ g->gray = cl->gclist; /* remove from 'gray' list */
+ size = traverseLclosure(g, cl);
+ break;
+ }
+ case LUA_TCCL: {
+ CClosure *cl = gco2ccl(o);
+ g->gray = cl->gclist; /* remove from 'gray' list */
+ size = traverseCclosure(g, cl);
+ break;
+ }
+ case LUA_TTHREAD: {
+ lua_State *th = gco2th(o);
+ g->gray = th->gclist; /* remove from 'gray' list */
+ th->gclist = g->grayagain;
+ g->grayagain = o; /* insert into 'grayagain' list */
+ black2gray(o);
+ size = traversestack(g, th);
+ break;
+ }
+ case LUA_TPROTO: {
+ Proto *p = gco2p(o);
+ g->gray = p->gclist; /* remove from 'gray' list */
+ size = traverseproto(g, p);
+ break;
+ }
+ default: lua_assert(0); return;
+ }
+ g->GCmemtrav += size;
+}
+
+
+static void propagateall (global_State *g) {
+ while (g->gray) propagatemark(g);
+}
+
+
+static void propagatelist (global_State *g, GCObject *l) {
+ lua_assert(g->gray == NULL); /* no grays left */
+ g->gray = l;
+ propagateall(g); /* traverse all elements from 'l' */
+}
+
+/*
+** retraverse all gray lists. Because tables may be reinserted in other
+** lists when traversed, traverse the original lists to avoid traversing
+** the same table twice (which is not wrong, but inefficient)
+*/
+static void retraversegrays (global_State *g) {
+ GCObject *weak = g->weak; /* save original lists */
+ GCObject *grayagain = g->grayagain;
+ GCObject *ephemeron = g->ephemeron;
+ g->weak = g->grayagain = g->ephemeron = NULL;
+ propagateall(g); /* traverse main gray list */
+ propagatelist(g, grayagain);
+ propagatelist(g, weak);
+ propagatelist(g, ephemeron);
+}
+
+
+static void convergeephemerons (global_State *g) {
+ int changed;
+ do {
+ GCObject *w;
+ GCObject *next = g->ephemeron; /* get ephemeron list */
+ g->ephemeron = NULL; /* tables will return to this list when traversed */
+ changed = 0;
+ while ((w = next) != NULL) {
+ next = gco2t(w)->gclist;
+ if (traverseephemeron(g, gco2t(w))) { /* traverse marked some value? */
+ propagateall(g); /* propagate changes */
+ changed = 1; /* will have to revisit all ephemeron tables */
+ }
+ }
+ } while (changed);
+}
+
+/* }====================================================== */
+
+
+/*
+** {======================================================
+** Sweep Functions
+** =======================================================
+*/
+
+
+/*
+** clear entries with unmarked keys from all weaktables in list 'l' up
+** to element 'f'
+*/
+static void clearkeys (global_State *g, GCObject *l, GCObject *f) {
+ for (; l != f; l = gco2t(l)->gclist) {
+ Table *h = gco2t(l);
+ Node *n, *limit = gnodelast(h);
+ for (n = gnode(h, 0); n < limit; n++) {
+ if (!ttisnil(gval(n)) && (iscleared(g, gkey(n)))) {
+ setnilvalue(gval(n)); /* remove value ... */
+ removeentry(n); /* and remove entry from table */
+ }
+ }
+ }
+}
+
+
+/*
+** clear entries with unmarked values from all weaktables in list 'l' up
+** to element 'f'
+*/
+static void clearvalues (global_State *g, GCObject *l, GCObject *f) {
+ for (; l != f; l = gco2t(l)->gclist) {
+ Table *h = gco2t(l);
+ Node *n, *limit = gnodelast(h);
+ int i;
+ for (i = 0; i < h->sizearray; i++) {
+ TValue *o = &h->array[i];
+ if (iscleared(g, o)) /* value was collected? */
+ setnilvalue(o); /* remove value */
+ }
+ for (n = gnode(h, 0); n < limit; n++) {
+ if (!ttisnil(gval(n)) && iscleared(g, gval(n))) {
+ setnilvalue(gval(n)); /* remove value ... */
+ removeentry(n); /* and remove entry from table */
+ }
+ }
+ }
+}
+
+
+static void freeobj (lua_State *L, GCObject *o) {
+ switch (gch(o)->tt) {
+ case LUA_TPROTO: luaF_freeproto(L, gco2p(o)); break;
+ case LUA_TLCL: {
+ luaM_freemem(L, o, sizeLclosure(gco2lcl(o)->nupvalues));
+ break;
+ }
+ case LUA_TCCL: {
+ luaM_freemem(L, o, sizeCclosure(gco2ccl(o)->nupvalues));
+ break;
+ }
+ case LUA_TUPVAL: luaF_freeupval(L, gco2uv(o)); break;
+ case LUA_TTABLE: luaH_free(L, gco2t(o)); break;
+ case LUA_TTHREAD: luaE_freethread(L, gco2th(o)); break;
+ case LUA_TUSERDATA: luaM_freemem(L, o, sizeudata(gco2u(o))); break;
+ case LUA_TSHRSTR:
+ G(L)->strt.nuse--;
+ /* FALLTHROUGH */
+ case LUA_TLNGSTR: {
+ luaM_freemem(L, o, sizestring(gco2ts(o)));
+ break;
+ }
+ default: lua_assert(0);
+ }
+}
+
+
+#define sweepwholelist(L,p) sweeplist(L,p,MAX_LUMEM)
+static GCObject **sweeplist (lua_State *L, GCObject **p, lu_mem count);
+
+
+/*
+** sweep the (open) upvalues of a thread and resize its stack and
+** list of call-info structures.
+*/
+static void sweepthread (lua_State *L, lua_State *L1) {
+ if (L1->stack == NULL) return; /* stack not completely built yet */
+ sweepwholelist(L, &L1->openupval); /* sweep open upvalues */
+ luaE_freeCI(L1); /* free extra CallInfo slots */
+ /* should not change the stack during an emergency gc cycle */
+ if (G(L)->gckind != KGC_EMERGENCY)
+ luaD_shrinkstack(L1);
+}
+
+
+/*
+** sweep at most 'count' elements from a list of GCObjects erasing dead
+** objects, where a dead (not alive) object is one marked with the "old"
+** (non current) white and not fixed.
+** In non-generational mode, change all non-dead objects back to white,
+** preparing for next collection cycle.
+** In generational mode, keep black objects black, and also mark them as
+** old; stop when hitting an old object, as all objects after that
+** one will be old too.
+** When object is a thread, sweep its list of open upvalues too.
+*/
+static GCObject **sweeplist (lua_State *L, GCObject **p, lu_mem count) {
+ global_State *g = G(L);
+ int ow = otherwhite(g);
+ int toclear, toset; /* bits to clear and to set in all live objects */
+ int tostop; /* stop sweep when this is true */
+ if (isgenerational(g)) { /* generational mode? */
+ toclear = ~0; /* clear nothing */
+ toset = bitmask(OLDBIT); /* set the old bit of all surviving objects */
+ tostop = bitmask(OLDBIT); /* do not sweep old generation */
+ }
+ else { /* normal mode */
+ toclear = maskcolors; /* clear all color bits + old bit */
+ toset = luaC_white(g); /* make object white */
+ tostop = 0; /* do not stop */
+ }
+ while (*p != NULL && count-- > 0) {
+ GCObject *curr = *p;
+ int marked = gch(curr)->marked;
+ if (isdeadm(ow, marked)) { /* is 'curr' dead? */
+ *p = gch(curr)->next; /* remove 'curr' from list */
+ freeobj(L, curr); /* erase 'curr' */
+ }
+ else {
+ if (testbits(marked, tostop))
+ return NULL; /* stop sweeping this list */
+ if (gch(curr)->tt == LUA_TTHREAD)
+ sweepthread(L, gco2th(curr)); /* sweep thread's upvalues */
+ /* update marks */
+ gch(curr)->marked = cast_byte((marked & toclear) | toset);
+ p = &gch(curr)->next; /* go to next element */
+ }
+ }
+ return (*p == NULL) ? NULL : p;
+}
+
+
+/*
+** sweep a list until a live object (or end of list)
+*/
+static GCObject **sweeptolive (lua_State *L, GCObject **p, int *n) {
+ GCObject ** old = p;
+ int i = 0;
+ do {
+ i++;
+ p = sweeplist(L, p, 1);
+ } while (p == old);
+ if (n) *n += i;
+ return p;
+}
+
+/* }====================================================== */
+
+
+/*
+** {======================================================
+** Finalization
+** =======================================================
+*/
+
+static void checkSizes (lua_State *L) {
+ global_State *g = G(L);
+ if (g->gckind != KGC_EMERGENCY) { /* do not change sizes in emergency */
+ int hs = g->strt.size / 2; /* half the size of the string table */
+ if (g->strt.nuse < cast(lu_int32, hs)) /* using less than that half? */
+ luaS_resize(L, hs); /* halve its size */
+ luaZ_freebuffer(L, &g->buff); /* free concatenation buffer */
+ }
+}
+
+
+static GCObject *udata2finalize (global_State *g) {
+ GCObject *o = g->tobefnz; /* get first element */
+ lua_assert(isfinalized(o));
+ g->tobefnz = gch(o)->next; /* remove it from 'tobefnz' list */
+ gch(o)->next = g->allgc; /* return it to 'allgc' list */
+ g->allgc = o;
+ resetbit(gch(o)->marked, SEPARATED); /* mark that it is not in 'tobefnz' */
+ lua_assert(!isold(o)); /* see MOVE OLD rule */
+ if (!keepinvariantout(g)) /* not keeping invariant? */
+ makewhite(g, o); /* "sweep" object */
+ return o;
+}
+
+
+static void dothecall (lua_State *L, void *ud) {
+ UNUSED(ud);
+ luaD_call(L, L->top - 2, 0, 0);
+}
+
+
+static void GCTM (lua_State *L, int propagateerrors) {
+ global_State *g = G(L);
+ const TValue *tm;
+ TValue v;
+ setgcovalue(L, &v, udata2finalize(g));
+ tm = luaT_gettmbyobj(L, &v, TM_GC);
+ if (tm != NULL && ttisfunction(tm)) { /* is there a finalizer? */
+ int status;
+ lu_byte oldah = L->allowhook;
+ int running = g->gcrunning;
+ L->allowhook = 0; /* stop debug hooks during GC metamethod */
+ g->gcrunning = 0; /* avoid GC steps */
+ setobj2s(L, L->top, tm); /* push finalizer... */
+ setobj2s(L, L->top + 1, &v); /* ... and its argument */
+ L->top += 2; /* and (next line) call the finalizer */
+ status = luaD_pcall(L, dothecall, NULL, savestack(L, L->top - 2), 0);
+ L->allowhook = oldah; /* restore hooks */
+ g->gcrunning = running; /* restore state */
+ if (status != LUA_OK && propagateerrors) { /* error while running __gc? */
+ if (status == LUA_ERRRUN) { /* is there an error object? */
+ const char *msg = (ttisstring(L->top - 1))
+ ? svalue(L->top - 1)
+ : "no message";
+ luaO_pushfstring(L, "error in __gc metamethod (%s)", msg);
+ status = LUA_ERRGCMM; /* error in __gc metamethod */
+ }
+ luaD_throw(L, status); /* re-throw error */
+ }
+ }
+}
+
+
+/*
+** move all unreachable objects (or 'all' objects) that need
+** finalization from list 'finobj' to list 'tobefnz' (to be finalized)
+*/
+static void separatetobefnz (lua_State *L, int all) {
+ global_State *g = G(L);
+ GCObject **p = &g->finobj;
+ GCObject *curr;
+ GCObject **lastnext = &g->tobefnz;
+ /* find last 'next' field in 'tobefnz' list (to add elements in its end) */
+ while (*lastnext != NULL)
+ lastnext = &gch(*lastnext)->next;
+ while ((curr = *p) != NULL) { /* traverse all finalizable objects */
+ lua_assert(!isfinalized(curr));
+ lua_assert(testbit(gch(curr)->marked, SEPARATED));
+ if (!(iswhite(curr) || all)) /* not being collected? */
+ p = &gch(curr)->next; /* don't bother with it */
+ else {
+ l_setbit(gch(curr)->marked, FINALIZEDBIT); /* won't be finalized again */
+ *p = gch(curr)->next; /* remove 'curr' from 'finobj' list */
+ gch(curr)->next = *lastnext; /* link at the end of 'tobefnz' list */
+ *lastnext = curr;
+ lastnext = &gch(curr)->next;
+ }
+ }
+}
+
+
+/*
+** if object 'o' has a finalizer, remove it from 'allgc' list (must
+** search the list to find it) and link it in 'finobj' list.
+*/
+void luaC_checkfinalizer (lua_State *L, GCObject *o, Table *mt) {
+ global_State *g = G(L);
+ if (testbit(gch(o)->marked, SEPARATED) || /* obj. is already separated... */
+ isfinalized(o) || /* ... or is finalized... */
+ gfasttm(g, mt, TM_GC) == NULL) /* or has no finalizer? */
+ return; /* nothing to be done */
+ else { /* move 'o' to 'finobj' list */
+ GCObject **p;
+ GCheader *ho = gch(o);
+ if (g->sweepgc == &ho->next) { /* avoid removing current sweep object */
+ lua_assert(issweepphase(g));
+ g->sweepgc = sweeptolive(L, g->sweepgc, NULL);
+ }
+ /* search for pointer pointing to 'o' */
+ for (p = &g->allgc; *p != o; p = &gch(*p)->next) { /* empty */ }
+ *p = ho->next; /* remove 'o' from root list */
+ ho->next = g->finobj; /* link it in list 'finobj' */
+ g->finobj = o;
+ l_setbit(ho->marked, SEPARATED); /* mark it as such */
+ if (!keepinvariantout(g)) /* not keeping invariant? */
+ makewhite(g, o); /* "sweep" object */
+ else
+ resetoldbit(o); /* see MOVE OLD rule */
+ }
+}
+
+/* }====================================================== */
+
+
+/*
+** {======================================================
+** GC control
+** =======================================================
+*/
+
+
+/*
+** set a reasonable "time" to wait before starting a new GC cycle;
+** cycle will start when memory use hits threshold
+*/
+static void setpause (global_State *g, l_mem estimate) {
+ l_mem debt, threshold;
+ estimate = estimate / PAUSEADJ; /* adjust 'estimate' */
+ threshold = (g->gcpause < MAX_LMEM / estimate) /* overflow? */
+ ? estimate * g->gcpause /* no overflow */
+ : MAX_LMEM; /* overflow; truncate to maximum */
+ debt = -cast(l_mem, threshold - gettotalbytes(g));
+ luaE_setdebt(g, debt);
+}
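For a concrete sense of the arithmetic above: setpause() turns the post-collection estimate into a threshold of roughly (gcpause/100) times the memory still in use, then records the (negative) debt so the next cycle starts once that much new memory has been allocated. Below is a minimal standalone sketch of the same computation, assuming PAUSEADJ == 100 and the default gcpause of 200 (both values are defined outside this excerpt); next_threshold is an invented helper for illustration.

/*
 * Illustrative sketch only: mirrors the pause arithmetic with the
 * assumed defaults PAUSEADJ == 100 and gcpause == 200 (i.e. 200%).
 */
#include <stdio.h>

static long next_threshold(long estimate, int gcpause) {
    long adjusted = estimate / 100;     /* estimate / PAUSEADJ */
    return adjusted * gcpause;          /* overflow clamp omitted */
}

int main(void) {
    long inuse = 1000000;               /* bytes still in use after a cycle */
    long threshold = next_threshold(inuse, 200);
    long debt = inuse - threshold;      /* what luaE_setdebt() would receive */
    printf("threshold=%ld debt=%ld\n", threshold, debt);  /* 2000000, -1000000 */
    return 0;
}

So with the defaults, a cycle that ends with about 1 MB in use leaves the collector paused until roughly another 1 MB has been allocated.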
+
+
+#define sweepphases \
+ (bitmask(GCSsweepstring) | bitmask(GCSsweepudata) | bitmask(GCSsweep))
+
+
+/*
+** enter first sweep phase (strings) and prepare pointers for other
+** sweep phases. The calls to 'sweeptolive' make pointers point to an
+** object inside the list (instead of to the header), so that the real
+** sweep does not need to skip objects created between "now" and the start
+** of the real sweep.
+** Returns how many objects it swept.
+*/
+static int entersweep (lua_State *L) {
+ global_State *g = G(L);
+ int n = 0;
+ g->gcstate = GCSsweepstring;
+ lua_assert(g->sweepgc == NULL && g->sweepfin == NULL);
+ /* prepare to sweep strings, finalizable objects, and regular objects */
+ g->sweepstrgc = 0;
+ g->sweepfin = sweeptolive(L, &g->finobj, &n);
+ g->sweepgc = sweeptolive(L, &g->allgc, &n);
+ return n;
+}
+
+
+/*
+** change GC mode
+*/
+void luaC_changemode (lua_State *L, int mode) {
+ global_State *g = G(L);
+ if (mode == g->gckind) return; /* nothing to change */
+ if (mode == KGC_GEN) { /* change to generational mode */
+ /* make sure gray lists are consistent */
+ luaC_runtilstate(L, bitmask(GCSpropagate));
+ g->GCestimate = gettotalbytes(g);
+ g->gckind = KGC_GEN;
+ }
+ else { /* change to incremental mode */
+ /* sweep all objects to turn them back to white
+ (as white has not changed, nothing extra will be collected) */
+ g->gckind = KGC_NORMAL;
+ entersweep(L);
+ luaC_runtilstate(L, ~sweepphases);
+ }
+}
+
+
+/*
+** call all pending finalizers
+*/
+static void callallpendingfinalizers (lua_State *L, int propagateerrors) {
+ global_State *g = G(L);
+ while (g->tobefnz) {
+ resetoldbit(g->tobefnz);
+ GCTM(L, propagateerrors);
+ }
+}
+
+
+void luaC_freeallobjects (lua_State *L) {
+ global_State *g = G(L);
+ int i;
+ separatetobefnz(L, 1); /* separate all objects with finalizers */
+ lua_assert(g->finobj == NULL);
+ callallpendingfinalizers(L, 0);
+ g->currentwhite = WHITEBITS; /* this "white" makes all objects look dead */
+ g->gckind = KGC_NORMAL;
+ sweepwholelist(L, &g->finobj); /* finalizers can create objs. in 'finobj' */
+ sweepwholelist(L, &g->allgc);
+ for (i = 0; i < g->strt.size; i++) /* free all string lists */
+ sweepwholelist(L, &g->strt.hash[i]);
+ lua_assert(g->strt.nuse == 0);
+}
+
+
+static l_mem atomic (lua_State *L) {
+ global_State *g = G(L);
+ l_mem work = -cast(l_mem, g->GCmemtrav); /* start counting work */
+ GCObject *origweak, *origall;
+ lua_assert(!iswhite(obj2gco(g->mainthread)));
+ markobject(g, L); /* mark running thread */
+ /* registry and global metatables may be changed by API */
+ markvalue(g, &g->l_registry);
+ markmt(g); /* mark basic metatables */
+ /* remark occasional upvalues of (maybe) dead threads */
+ remarkupvals(g);
+ propagateall(g); /* propagate changes */
+ work += g->GCmemtrav; /* stop counting (do not (re)count grays) */
+ /* traverse objects caught by write barrier and by 'remarkupvals' */
+ retraversegrays(g);
+ work -= g->GCmemtrav; /* restart counting */
+ convergeephemerons(g);
+ /* at this point, all strongly accessible objects are marked. */
+ /* clear values from weak tables, before checking finalizers */
+ clearvalues(g, g->weak, NULL);
+ clearvalues(g, g->allweak, NULL);
+ origweak = g->weak; origall = g->allweak;
+ work += g->GCmemtrav; /* stop counting (objects being finalized) */
+ separatetobefnz(L, 0); /* separate objects to be finalized */
+ markbeingfnz(g); /* mark objects that will be finalized */
+ propagateall(g); /* remark, to propagate `preserveness' */
+ work -= g->GCmemtrav; /* restart counting */
+ convergeephemerons(g);
+ /* at this point, all resurrected objects are marked. */
+ /* remove dead objects from weak tables */
+ clearkeys(g, g->ephemeron, NULL); /* clear keys from all ephemeron tables */
+ clearkeys(g, g->allweak, NULL); /* clear keys from all allweak tables */
+ /* clear values from resurrected weak tables */
+ clearvalues(g, g->weak, origweak);
+ clearvalues(g, g->allweak, origall);
+ g->currentwhite = cast_byte(otherwhite(g)); /* flip current white */
+ work += g->GCmemtrav; /* complete counting */
+ return work; /* estimate of memory marked by 'atomic' */
+}
+
+
+static lu_mem singlestep (lua_State *L) {
+ global_State *g = G(L);
+ switch (g->gcstate) {
+ case GCSpause: {
+ /* start to count memory traversed */
+ g->GCmemtrav = g->strt.size * sizeof(GCObject*);
+ lua_assert(!isgenerational(g));
+ restartcollection(g);
+ g->gcstate = GCSpropagate;
+ return g->GCmemtrav;
+ }
+ case GCSpropagate: {
+ if (g->gray) {
+ lu_mem oldtrav = g->GCmemtrav;
+ propagatemark(g);
+ return g->GCmemtrav - oldtrav; /* memory traversed in this step */
+ }
+ else { /* no more `gray' objects */
+ lu_mem work;
+ int sw;
+ g->gcstate = GCSatomic; /* finish mark phase */
+ g->GCestimate = g->GCmemtrav; /* save what was counted */;
+ work = atomic(L); /* add what was traversed by 'atomic' */
+ g->GCestimate += work; /* estimate of total memory traversed */
+ sw = entersweep(L);
+ return work + sw * GCSWEEPCOST;
+ }
+ }
+ case GCSsweepstring: {
+ int i;
+ for (i = 0; i < GCSWEEPMAX && g->sweepstrgc + i < g->strt.size; i++)
+ sweepwholelist(L, &g->strt.hash[g->sweepstrgc + i]);
+ g->sweepstrgc += i;
+ if (g->sweepstrgc >= g->strt.size) /* no more strings to sweep? */
+ g->gcstate = GCSsweepudata;
+ return i * GCSWEEPCOST;
+ }
+ case GCSsweepudata: {
+ if (g->sweepfin) {
+ g->sweepfin = sweeplist(L, g->sweepfin, GCSWEEPMAX);
+ return GCSWEEPMAX*GCSWEEPCOST;
+ }
+ else {
+ g->gcstate = GCSsweep;
+ return 0;
+ }
+ }
+ case GCSsweep: {
+ if (g->sweepgc) {
+ g->sweepgc = sweeplist(L, g->sweepgc, GCSWEEPMAX);
+ return GCSWEEPMAX*GCSWEEPCOST;
+ }
+ else {
+ /* sweep main thread */
+ GCObject *mt = obj2gco(g->mainthread);
+ sweeplist(L, &mt, 1);
+ checkSizes(L);
+ g->gcstate = GCSpause; /* finish collection */
+ return GCSWEEPCOST;
+ }
+ }
+ default: lua_assert(0); return 0;
+ }
+}
+
+
+/*
+** advances the garbage collector until it reaches a state allowed
+** by 'statemask'
+*/
+void luaC_runtilstate (lua_State *L, int statesmask) {
+ global_State *g = G(L);
+ while (!testbit(statesmask, g->gcstate))
+ singlestep(L);
+}
+
+
+static void generationalcollection (lua_State *L) {
+ global_State *g = G(L);
+ lua_assert(g->gcstate == GCSpropagate);
+ if (g->GCestimate == 0) { /* signal for another major collection? */
+ luaC_fullgc(L, 0); /* perform a full regular collection */
+ g->GCestimate = gettotalbytes(g); /* update control */
+ }
+ else {
+ lu_mem estimate = g->GCestimate;
+ luaC_runtilstate(L, bitmask(GCSpause)); /* run complete (minor) cycle */
+ g->gcstate = GCSpropagate; /* skip restart */
+ if (gettotalbytes(g) > (estimate / 100) * g->gcmajorinc)
+ g->GCestimate = 0; /* signal for a major collection */
+ else
+ g->GCestimate = estimate; /* keep estimate from last major coll. */
+
+ }
+ setpause(g, gettotalbytes(g));
+ lua_assert(g->gcstate == GCSpropagate);
+}
+
+
+static void incstep (lua_State *L) {
+ global_State *g = G(L);
+ l_mem debt = g->GCdebt;
+ int stepmul = g->gcstepmul;
+ if (stepmul < 40) stepmul = 40; /* avoid ridiculous low values (and 0) */
+ /* convert debt from Kb to 'work units' (avoid zero debt and overflows) */
+ debt = (debt / STEPMULADJ) + 1;
+ debt = (debt < MAX_LMEM / stepmul) ? debt * stepmul : MAX_LMEM;
+ do { /* always perform at least one single step */
+ lu_mem work = singlestep(L); /* do some work */
+ debt -= work;
+ } while (debt > -GCSTEPSIZE && g->gcstate != GCSpause);
+ if (g->gcstate == GCSpause)
+ setpause(g, g->GCestimate); /* pause until next cycle */
+ else {
+ debt = (debt / stepmul) * STEPMULADJ; /* convert 'work units' to Kb */
+ luaE_setdebt(g, debt);
+ }
+}
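For scale: with the default gcstepmul of 200, and assuming STEPMULADJ == 200 (defined earlier in this file, outside this excerpt), the debt converts to roughly one work unit per byte of debt, so each step performs about that much traversal/sweep work plus GCSTEPSIZE of slack before returning. A small sketch of that conversion, with invented local names:

/*
 * Illustrative sketch of the debt -> work-unit conversion in incstep(),
 * assuming STEPMULADJ == 200 and the default gcstepmul of 200.
 */
#include <stdio.h>

int main(void) {
    long debt = 8000;                   /* bytes of GC debt (g->GCdebt) */
    int stepmul = 200;                  /* g->gcstepmul */
    long units = (debt / 200) + 1;      /* debt / STEPMULADJ, never zero */
    units = units * stepmul;            /* 8200 "work units" for this step */
    printf("%ld\n", units);
    return 0;
}

Raising gcstepmul therefore makes each step do proportionally more work for the same amount of allocation.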
+
+
+/*
+** performs a basic GC step
+*/
+void luaC_forcestep (lua_State *L) {
+ global_State *g = G(L);
+ int i;
+ if (isgenerational(g)) generationalcollection(L);
+ else incstep(L);
+ /* run a few finalizers (or all of them at the end of a collect cycle) */
+ for (i = 0; g->tobefnz && (i < GCFINALIZENUM || g->gcstate == GCSpause); i++)
+ GCTM(L, 1); /* call one finalizer */
+}
+
+
+/*
+** performs a basic GC step only if collector is running
+*/
+void luaC_step (lua_State *L) {
+ global_State *g = G(L);
+ if (g->gcrunning) luaC_forcestep(L);
+ else luaE_setdebt(g, -GCSTEPSIZE); /* avoid being called too often */
+}
+
+
+
+/*
+** performs a full GC cycle; if "isemergency", does not call
+** finalizers (which could change stack positions)
+*/
+void luaC_fullgc (lua_State *L, int isemergency) {
+ global_State *g = G(L);
+ int origkind = g->gckind;
+ lua_assert(origkind != KGC_EMERGENCY);
+ if (isemergency) /* do not run finalizers during emergency GC */
+ g->gckind = KGC_EMERGENCY;
+ else {
+ g->gckind = KGC_NORMAL;
+ callallpendingfinalizers(L, 1);
+ }
+ if (keepinvariant(g)) { /* may there be some black objects? */
+ /* must sweep all objects to turn them back to white
+ (as white has not changed, nothing will be collected) */
+ entersweep(L);
+ }
+ /* finish any pending sweep phase to start a new cycle */
+ luaC_runtilstate(L, bitmask(GCSpause));
+ luaC_runtilstate(L, ~bitmask(GCSpause)); /* start new collection */
+ luaC_runtilstate(L, bitmask(GCSpause)); /* run entire collection */
+ if (origkind == KGC_GEN) { /* generational mode? */
+ /* generational mode must be kept in propagate phase */
+ luaC_runtilstate(L, bitmask(GCSpropagate));
+ }
+ g->gckind = origkind;
+ setpause(g, gettotalbytes(g));
+ if (!isemergency) /* do not run finalizers during emergency GC */
+ callallpendingfinalizers(L, 1);
+}
+
+/* }====================================================== */
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/lua/lgc.h b/sys/contrib/openzfs/module/lua/lgc.h
new file mode 100644
index 000000000000..34097a45edfc
--- /dev/null
+++ b/sys/contrib/openzfs/module/lua/lgc.h
@@ -0,0 +1,159 @@
+/* BEGIN CSTYLED */
+/*
+** $Id: lgc.h,v 2.58.1.1 2013/04/12 18:48:47 roberto Exp $
+** Garbage Collector
+** See Copyright Notice in lua.h
+*/
+
+#ifndef lgc_h
+#define lgc_h
+
+
+#include "lobject.h"
+#include "lstate.h"
+
+/*
+** Collectable objects may have one of three colors: white, which
+** means the object is not marked; gray, which means the
+** object is marked, but its references may be not marked; and
+** black, which means that the object and all its references are marked.
+** The main invariant of the garbage collector, while marking objects,
+** is that a black object can never point to a white one. Moreover,
+** any gray object must be in a "gray list" (gray, grayagain, weak,
+** allweak, ephemeron) so that it can be visited again before finishing
+** the collection cycle. These lists have no meaning when the invariant
+** is not being enforced (e.g., sweep phase).
+*/
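The invariant above is what the write barriers in lgc.c enforce: whenever a black object acquires a reference to a white one, either the white object is marked at once (the forward barrier, luaC_barrier_) or the black object is turned gray again and revisited (the backward barrier, luaC_barrierback_). The following is a deliberately simplified, self-contained model of the forward case; the types and names (Obj, barrier, store) are invented for illustration and are not the real Lua structures.

#include <stdio.h>

typedef enum { WHITE, GRAY, BLACK } color_t;              /* toy colors */

typedef struct Obj { color_t color; struct Obj *ref; } Obj;

/* forward barrier, in the spirit of luaC_barrier_: mark the new child */
static void barrier(Obj *parent, Obj *child) {
    if (parent->color == BLACK && child->color == WHITE)
        child->color = GRAY;        /* will be traversed before the cycle ends */
}

static void store(Obj *parent, Obj *child) {
    parent->ref = child;
    barrier(parent, child);         /* a "black -> white" edge never survives a store */
}

int main(void) {
    Obj parent = { BLACK, NULL };
    Obj child  = { WHITE, NULL };
    store(&parent, &child);
    printf("%d\n", child.color == GRAY);    /* prints 1 */
    return 0;
}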
+
+
+
+/* how much to allocate before next GC step */
+#if !defined(GCSTEPSIZE)
+/* ~100 small strings */
+#define GCSTEPSIZE (cast_int(100 * sizeof(TString)))
+#endif
+
+
+/*
+** Possible states of the Garbage Collector
+*/
+#define GCSpropagate 0
+#define GCSatomic 1
+#define GCSsweepstring 2
+#define GCSsweepudata 3
+#define GCSsweep 4
+#define GCSpause 5
+
+
+#define issweepphase(g) \
+ (GCSsweepstring <= (g)->gcstate && (g)->gcstate <= GCSsweep)
+
+#define isgenerational(g) ((g)->gckind == KGC_GEN)
+
+/*
+** macros to tell when main invariant (white objects cannot point to black
+** ones) must be kept. During a non-generational collection, the sweep
+** phase may break the invariant, as objects turned white may point to
+** still-black objects. The invariant is restored when sweep ends and
+** all objects are white again. During a generational collection, the
+** invariant must be kept all times.
+*/
+
+#define keepinvariant(g) (isgenerational(g) || g->gcstate <= GCSatomic)
+
+
+/*
+** Outside the collector, the state in generational mode is kept in
+** 'propagate', so 'keepinvariant' is always true.
+*/
+#define keepinvariantout(g) \
+ check_exp(g->gcstate == GCSpropagate || !isgenerational(g), \
+ g->gcstate <= GCSatomic)
+
+
+/*
+** some useful bit tricks
+*/
+#define resetbits(x,m) ((x) &= cast(lu_byte, ~(m)))
+#define setbits(x,m) ((x) |= (m))
+#define testbits(x,m) ((x) & (m))
+#define bitmask(b) (1<<(b))
+#define bit2mask(b1,b2) (bitmask(b1) | bitmask(b2))
+#define l_setbit(x,b) setbits(x, bitmask(b))
+#define resetbit(x,b) resetbits(x, bitmask(b))
+#define testbit(x,b) testbits(x, bitmask(b))
+
+
+/* Layout for bit use in `marked' field: */
+#define WHITE0BIT 0 /* object is white (type 0) */
+#define WHITE1BIT 1 /* object is white (type 1) */
+#define BLACKBIT 2 /* object is black */
+#define FINALIZEDBIT 3 /* object has been separated for finalization */
+#define SEPARATED 4 /* object is in 'finobj' list or in 'tobefnz' */
+#define FIXEDBIT 5 /* object is fixed (should not be collected) */
+#define OLDBIT 6 /* object is old (only in generational mode) */
+/* bit 7 is currently used by tests (luaL_checkmemory) */
+
+#define WHITEBITS bit2mask(WHITE0BIT, WHITE1BIT)
+
+
+#define iswhite(x) testbits((x)->gch.marked, WHITEBITS)
+#define isblack(x) testbit((x)->gch.marked, BLACKBIT)
+#define isgray(x) /* neither white nor black */ \
+ (!testbits((x)->gch.marked, WHITEBITS | bitmask(BLACKBIT)))
+
+#define isold(x) testbit((x)->gch.marked, OLDBIT)
+
+/* MOVE OLD rule: whenever an object is moved to the beginning of
+ a GC list, its old bit must be cleared */
+#define resetoldbit(o) resetbit((o)->gch.marked, OLDBIT)
+
+#define otherwhite(g) (g->currentwhite ^ WHITEBITS)
+#define isdeadm(ow,m) (!(((m) ^ WHITEBITS) & (ow)))
+#define isdead(g,v) isdeadm(otherwhite(g), (v)->gch.marked)
+
+#define changewhite(x) ((x)->gch.marked ^= WHITEBITS)
+#define gray2black(x) l_setbit((x)->gch.marked, BLACKBIT)
+
+#define valiswhite(x) (iscollectable(x) && iswhite(gcvalue(x)))
+
+#define luaC_white(g) cast(lu_byte, (g)->currentwhite & WHITEBITS)
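With the bit layout above, the liveness test is easy to follow by hand: an object is dead when its mark still carries the "other" white, that is, the white of the previous cycle. A standalone check of the isdeadm() logic with concrete values follows; the mark values are chosen for illustration (real marks may also carry bits such as FINALIZEDBIT), and the macros are copied from the definitions above.

#include <stdio.h>

#define WHITEBITS        0x03                 /* WHITE0BIT | WHITE1BIT */
#define otherwhite(cw)   ((cw) ^ WHITEBITS)
#define isdeadm(ow, m)   (!(((m) ^ WHITEBITS) & (ow)))

int main(void) {
    int cw = 0x01;                    /* say the current white is "type 0" */
    int ow = otherwhite(cw);          /* 0x02: the white of the previous cycle */
    printf("%d\n", isdeadm(ow, 0x02));  /* 1: old white -> dead */
    printf("%d\n", isdeadm(ow, 0x01));  /* 0: current white -> survives */
    printf("%d\n", isdeadm(ow, 0x04));  /* 0: black (BLACKBIT) -> survives */
    /* luaC_freeallobjects sets currentwhite = WHITEBITS, so 'ow' becomes 0
       and every mark tests as dead: */
    printf("%d\n", isdeadm(otherwhite(WHITEBITS), 0x04));  /* 1 */
    return 0;
}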
+
+
+#define luaC_condGC(L,c) \
+ {if (G(L)->GCdebt > 0) {c;}; condchangemem(L);}
+#define luaC_checkGC(L) luaC_condGC(L, luaC_step(L);)
+
+
+#define luaC_barrier(L,p,v) { if (valiswhite(v) && isblack(obj2gco(p))) \
+ luaC_barrier_(L,obj2gco(p),gcvalue(v)); }
+
+#define luaC_barrierback(L,p,v) { if (valiswhite(v) && isblack(obj2gco(p))) \
+ luaC_barrierback_(L,p); }
+
+#define luaC_objbarrier(L,p,o) \
+ { if (iswhite(obj2gco(o)) && isblack(obj2gco(p))) \
+ luaC_barrier_(L,obj2gco(p),obj2gco(o)); }
+
+#define luaC_objbarrierback(L,p,o) \
+ { if (iswhite(obj2gco(o)) && isblack(obj2gco(p))) luaC_barrierback_(L,p); }
+
+#define luaC_barrierproto(L,p,c) \
+ { if (isblack(obj2gco(p))) luaC_barrierproto_(L,p,c); }
+
+LUAI_FUNC void luaC_freeallobjects (lua_State *L);
+LUAI_FUNC void luaC_step (lua_State *L);
+LUAI_FUNC void luaC_forcestep (lua_State *L);
+LUAI_FUNC void luaC_runtilstate (lua_State *L, int statesmask);
+LUAI_FUNC void luaC_fullgc (lua_State *L, int isemergency);
+LUAI_FUNC GCObject *luaC_newobj (lua_State *L, int tt, size_t sz,
+ GCObject **list, int offset);
+LUAI_FUNC void luaC_barrier_ (lua_State *L, GCObject *o, GCObject *v);
+LUAI_FUNC void luaC_barrierback_ (lua_State *L, GCObject *o);
+LUAI_FUNC void luaC_barrierproto_ (lua_State *L, Proto *p, Closure *c);
+LUAI_FUNC void luaC_checkfinalizer (lua_State *L, GCObject *o, Table *mt);
+LUAI_FUNC void luaC_checkupvalcolor (global_State *g, UpVal *uv);
+LUAI_FUNC void luaC_changemode (lua_State *L, int mode);
+
+#endif
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/lua/llex.c b/sys/contrib/openzfs/module/lua/llex.c
new file mode 100644
index 000000000000..50c301f599f1
--- /dev/null
+++ b/sys/contrib/openzfs/module/lua/llex.c
@@ -0,0 +1,531 @@
+/* BEGIN CSTYLED */
+/*
+** $Id: llex.c,v 2.63.1.3 2015/02/09 17:56:34 roberto Exp $
+** Lexical Analyzer
+** See Copyright Notice in lua.h
+*/
+
+#define llex_c
+#define LUA_CORE
+
+#include <sys/lua/lua.h>
+
+#include "lctype.h"
+#include "ldo.h"
+#include "llex.h"
+#include "lobject.h"
+#include "lparser.h"
+#include "lstate.h"
+#include "lstring.h"
+#include "ltable.h"
+#include "lzio.h"
+
+
+
+#define next(ls) (ls->current = zgetc(ls->z))
+
+
+
+#define currIsNewline(ls) (ls->current == '\n' || ls->current == '\r')
+
+
+/* ORDER RESERVED */
+static const char *const luaX_tokens [] = {
+ "and", "break", "do", "else", "elseif",
+ "end", "false", "for", "function", "goto", "if",
+ "in", "local", "nil", "not", "or", "repeat",
+ "return", "then", "true", "until", "while",
+ "..", "...", "==", ">=", "<=", "~=", "::", "<eof>",
+ "<number>", "<name>", "<string>"
+};
+
+
+#define save_and_next(ls) (save(ls, ls->current), next(ls))
+
+
+static l_noret lexerror (LexState *ls, const char *msg, int token);
+
+
+static void save (LexState *ls, int c) {
+ Mbuffer *b = ls->buff;
+ if (luaZ_bufflen(b) + 1 > luaZ_sizebuffer(b)) {
+ size_t newsize;
+ if (luaZ_sizebuffer(b) >= MAX_SIZET/2)
+ lexerror(ls, "lexical element too long", 0);
+ newsize = luaZ_sizebuffer(b) * 2;
+ luaZ_resizebuffer(ls->L, b, newsize);
+ }
+ b->buffer[luaZ_bufflen(b)++] = cast(char, c);
+}
+
+
+void luaX_init (lua_State *L) {
+ int i;
+ for (i=0; i<NUM_RESERVED; i++) {
+ TString *ts = luaS_new(L, luaX_tokens[i]);
+ luaS_fix(ts); /* reserved words are never collected */
+ ts->tsv.extra = cast_byte(i+1); /* reserved word */
+ }
+}
+
+
+const char *luaX_token2str (LexState *ls, int token) {
+ if (token < FIRST_RESERVED) { /* single-byte symbols? */
+ lua_assert(token == cast(unsigned char, token));
+ return (lisprint(token)) ? luaO_pushfstring(ls->L, LUA_QL("%c"), token) :
+ luaO_pushfstring(ls->L, "char(%d)", token);
+ }
+ else {
+ const char *s = luaX_tokens[token - FIRST_RESERVED];
+ if (token < TK_EOS) /* fixed format (symbols and reserved words)? */
+ return luaO_pushfstring(ls->L, LUA_QS, s);
+ else /* names, strings, and numerals */
+ return s;
+ }
+}
+
+
+static const char *txtToken (LexState *ls, int token) {
+ switch (token) {
+ case TK_NAME:
+ case TK_STRING:
+ case TK_NUMBER:
+ save(ls, '\0');
+ return luaO_pushfstring(ls->L, LUA_QS, luaZ_buffer(ls->buff));
+ default:
+ return luaX_token2str(ls, token);
+ }
+}
+
+
+static l_noret lexerror (LexState *ls, const char *msg, int token) {
+ char buff[LUA_IDSIZE];
+ luaO_chunkid(buff, getstr(ls->source), LUA_IDSIZE);
+ msg = luaO_pushfstring(ls->L, "%s:%d: %s", buff, ls->linenumber, msg);
+ if (token)
+ luaO_pushfstring(ls->L, "%s near %s", msg, txtToken(ls, token));
+ luaD_throw(ls->L, LUA_ERRSYNTAX);
+}
+
+
+l_noret luaX_syntaxerror (LexState *ls, const char *msg) {
+ lexerror(ls, msg, ls->t.token);
+}
+
+
+/*
+** creates a new string and anchors it in function's table so that
+** it will not be collected until the end of the function's compilation
+** (by that time it should be anchored in function's prototype)
+*/
+TString *luaX_newstring (LexState *ls, const char *str, size_t l) {
+ lua_State *L = ls->L;
+ TValue *o; /* entry for `str' */
+ TString *ts = luaS_newlstr(L, str, l); /* create new string */
+ setsvalue2s(L, L->top++, ts); /* temporarily anchor it in stack */
+ o = luaH_set(L, ls->fs->h, L->top - 1);
+ if (ttisnil(o)) { /* not in use yet? (see 'addK') */
+ /* boolean value does not need GC barrier;
+ table has no metatable, so it does not need to invalidate cache */
+ setbvalue(o, 1); /* t[string] = true */
+ luaC_checkGC(L);
+ }
+ else { /* string already present */
+ ts = rawtsvalue(keyfromval(o)); /* re-use value previously stored */
+ }
+ L->top--; /* remove string from stack */
+ return ts;
+}
+
+
+/*
+** increments the line number and skips the newline sequence (any of
+** \n, \r, \n\r, or \r\n)
+*/
+static void inclinenumber (LexState *ls) {
+ int old = ls->current;
+ lua_assert(currIsNewline(ls));
+ next(ls); /* skip `\n' or `\r' */
+ if (currIsNewline(ls) && ls->current != old)
+ next(ls); /* skip `\n\r' or `\r\n' */
+ if (++ls->linenumber >= MAX_INT)
+ lexerror(ls, "chunk has too many lines", 0);
+}
+
+
+void luaX_setinput (lua_State *L, LexState *ls, ZIO *z, TString *source,
+ int firstchar) {
+ ls->decpoint = '.';
+ ls->L = L;
+ ls->current = firstchar;
+ ls->lookahead.token = TK_EOS; /* no look-ahead token */
+ ls->z = z;
+ ls->fs = NULL;
+ ls->linenumber = 1;
+ ls->lastline = 1;
+ ls->source = source;
+ ls->envn = luaS_new(L, LUA_ENV); /* create env name */
+ luaS_fix(ls->envn); /* never collect this name */
+ luaZ_resizebuffer(ls->L, ls->buff, LUA_MINBUFFER); /* initialize buffer */
+}
+
+
+
+/*
+** =======================================================
+** LEXICAL ANALYZER
+** =======================================================
+*/
+
+
+
+static int check_next (LexState *ls, const char *set) {
+ if (ls->current == '\0' || !strchr(set, ls->current))
+ return 0;
+ save_and_next(ls);
+ return 1;
+}
+
+
+/*
+** change all characters 'from' in buffer to 'to'
+*/
+static void buffreplace (LexState *ls, char from, char to) {
+ size_t n = luaZ_bufflen(ls->buff);
+ char *p = luaZ_buffer(ls->buff);
+ while (n--)
+ if (p[n] == from) p[n] = to;
+}
+
+
+#if !defined(getlocaledecpoint)
+#define getlocaledecpoint() (localeconv()->decimal_point[0])
+#endif
+
+
+#define buff2d(b,e) luaO_str2d(luaZ_buffer(b), luaZ_bufflen(b) - 1, e)
+
+/*
+** in case of format error, try to change decimal point separator to
+** the one defined in the current locale and check again
+*/
+static void trydecpoint (LexState *ls, SemInfo *seminfo) {
+ char old = ls->decpoint;
+ ls->decpoint = getlocaledecpoint();
+ buffreplace(ls, old, ls->decpoint); /* try new decimal separator */
+ if (!buff2d(ls->buff, &seminfo->r)) {
+ /* format error with correct decimal point: no more options */
+ buffreplace(ls, ls->decpoint, '.'); /* undo change (for error message) */
+ lexerror(ls, "malformed number", TK_NUMBER);
+ }
+}
+
+
+/* LUA_NUMBER */
+/*
+** this function is quite liberal in what it accepts, as 'luaO_str2d'
+** will reject ill-formed numerals.
+*/
+static void read_numeral (LexState *ls, SemInfo *seminfo) {
+ const char *expo = "Ee";
+ int first = ls->current;
+ lua_assert(lisdigit(ls->current));
+ save_and_next(ls);
+ if (first == '0' && check_next(ls, "Xx")) /* hexadecimal? */
+ expo = "Pp";
+ for (;;) {
+ if (check_next(ls, expo)) /* exponent part? */
+ (void) check_next(ls, "+-"); /* optional exponent sign */
+ if (lisxdigit(ls->current) || ls->current == '.')
+ save_and_next(ls);
+ else break;
+ }
+ save(ls, '\0');
+ buffreplace(ls, '.', ls->decpoint); /* follow locale for decimal point */
+ if (!buff2d(ls->buff, &seminfo->r)) /* format error? */
+ trydecpoint(ls, seminfo); /* try to update decimal point separator */
+}
+
+
+/*
+** skip a sequence '[=*[' or ']=*]' and return its number of '='s or
+** -1 if sequence is malformed
+*/
+static int skip_sep (LexState *ls) {
+ int count = 0;
+ int s = ls->current;
+ lua_assert(s == '[' || s == ']');
+ save_and_next(ls);
+ while (ls->current == '=') {
+ save_and_next(ls);
+ count++;
+ }
+ return (ls->current == s) ? count : (-count) - 1;
+}
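The return convention of skip_sep() is easiest to see on a few inputs: a well-formed opener returns its level (the number of '='s), while a malformed sequence returns a negative value, with -1 reserved for a lone bracket, which llex() then hands back as a plain '[' token. A standalone sketch of the same convention over a C string; sep_level is an invented helper, not part of the lexer.

#include <stdio.h>

/* same convention as skip_sep(), applied to a plain C string */
static int sep_level(const char *s) {       /* s starts at '[' or ']' */
    char open = s[0];
    const char *p = s + 1;
    int count = 0;
    while (*p == '=') { p++; count++; }
    return (*p == open) ? count : (-count) - 1;
}

int main(void) {
    printf("%d\n", sep_level("[==["));      /*  2: level-2 long bracket */
    printf("%d\n", sep_level("[["));        /*  0: plain long bracket */
    printf("%d\n", sep_level("[x"));        /* -1: just a '[' token */
    printf("%d\n", sep_level("[=x"));       /* -2: invalid long-string delimiter */
    return 0;
}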
+
+
+static void read_long_string (LexState *ls, SemInfo *seminfo, int sep) {
+ save_and_next(ls); /* skip 2nd `[' */
+ if (currIsNewline(ls)) /* string starts with a newline? */
+ inclinenumber(ls); /* skip it */
+ for (;;) {
+ switch (ls->current) {
+ case EOZ:
+ lexerror(ls, (seminfo) ? "unfinished long string" :
+ "unfinished long comment", TK_EOS);
+ break; /* to avoid warnings */
+ case ']': {
+ if (skip_sep(ls) == sep) {
+ save_and_next(ls); /* skip 2nd `]' */
+ goto endloop;
+ }
+ break;
+ }
+ case '\n': case '\r': {
+ save(ls, '\n');
+ inclinenumber(ls);
+ if (!seminfo) luaZ_resetbuffer(ls->buff); /* avoid wasting space */
+ break;
+ }
+ default: {
+ if (seminfo) save_and_next(ls);
+ else next(ls);
+ }
+ }
+ } endloop:
+ if (seminfo)
+ seminfo->ts = luaX_newstring(ls, luaZ_buffer(ls->buff) + (2 + sep),
+ luaZ_bufflen(ls->buff) - 2*(2 + sep));
+}
+
+
+static void escerror (LexState *ls, int *c, int n, const char *msg) {
+ int i;
+ luaZ_resetbuffer(ls->buff); /* prepare error message */
+ save(ls, '\\');
+ for (i = 0; i < n && c[i] != EOZ; i++)
+ save(ls, c[i]);
+ lexerror(ls, msg, TK_STRING);
+}
+
+
+static int readhexaesc (LexState *ls) {
+ int c[3], i; /* keep input for error message */
+ int r = 0; /* result accumulator */
+ c[0] = 'x'; /* for error message */
+ for (i = 1; i < 3; i++) { /* read two hexadecimal digits */
+ c[i] = next(ls);
+ if (!lisxdigit(c[i]))
+ escerror(ls, c, i + 1, "hexadecimal digit expected");
+ r = (r << 4) + luaO_hexavalue(c[i]);
+ }
+ return r;
+}
+
+
+static int readdecesc (LexState *ls) {
+ int c[3], i;
+ int r = 0; /* result accumulator */
+ for (i = 0; i < 3 && lisdigit(ls->current); i++) { /* read up to 3 digits */
+ c[i] = ls->current;
+ r = 10*r + c[i] - '0';
+ next(ls);
+ }
+ if (r > UCHAR_MAX)
+ escerror(ls, c, i, "decimal escape too large");
+ return r;
+}
+
+
+static void read_string (LexState *ls, int del, SemInfo *seminfo) {
+ save_and_next(ls); /* keep delimiter (for error messages) */
+ while (ls->current != del) {
+ switch (ls->current) {
+ case EOZ:
+ lexerror(ls, "unfinished string", TK_EOS);
+ break; /* to avoid warnings */
+ case '\n':
+ case '\r':
+ lexerror(ls, "unfinished string", TK_STRING);
+ break; /* to avoid warnings */
+ case '\\': { /* escape sequences */
+ int c; /* final character to be saved */
+ next(ls); /* do not save the `\' */
+ switch (ls->current) {
+ case 'a': c = '\a'; goto read_save;
+ case 'b': c = '\b'; goto read_save;
+ case 'f': c = '\f'; goto read_save;
+ case 'n': c = '\n'; goto read_save;
+ case 'r': c = '\r'; goto read_save;
+ case 't': c = '\t'; goto read_save;
+ case 'v': c = '\v'; goto read_save;
+ case 'x': c = readhexaesc(ls); goto read_save;
+ case '\n': case '\r':
+ inclinenumber(ls); c = '\n'; goto only_save;
+ case '\\': case '\"': case '\'':
+ c = ls->current; goto read_save;
+ case EOZ: goto no_save; /* will raise an error next loop */
+ case 'z': { /* zap following span of spaces */
+ next(ls); /* skip the 'z' */
+ while (lisspace(ls->current)) {
+ if (currIsNewline(ls)) inclinenumber(ls);
+ else next(ls);
+ }
+ goto no_save;
+ }
+ default: {
+ if (!lisdigit(ls->current))
+ escerror(ls, &ls->current, 1, "invalid escape sequence");
+ /* digital escape \ddd */
+ c = readdecesc(ls);
+ goto only_save;
+ }
+ }
+ read_save: next(ls); /* read next character */
+ only_save: save(ls, c); /* save 'c' */
+ no_save: break;
+ }
+ default:
+ save_and_next(ls);
+ }
+ }
+ save_and_next(ls); /* skip delimiter */
+ seminfo->ts = luaX_newstring(ls, luaZ_buffer(ls->buff) + 1,
+ luaZ_bufflen(ls->buff) - 2);
+}
+
+
+static int llex (LexState *ls, SemInfo *seminfo) {
+ luaZ_resetbuffer(ls->buff);
+ for (;;) {
+ switch (ls->current) {
+ case '\n': case '\r': { /* line breaks */
+ inclinenumber(ls);
+ break;
+ }
+ case ' ': case '\f': case '\t': case '\v': { /* spaces */
+ next(ls);
+ break;
+ }
+ case '-': { /* '-' or '--' (comment) */
+ next(ls);
+ if (ls->current != '-') return '-';
+ /* else is a comment */
+ next(ls);
+ if (ls->current == '[') { /* long comment? */
+ int sep = skip_sep(ls);
+ luaZ_resetbuffer(ls->buff); /* `skip_sep' may dirty the buffer */
+ if (sep >= 0) {
+ read_long_string(ls, NULL, sep); /* skip long comment */
+ luaZ_resetbuffer(ls->buff); /* previous call may dirty the buff. */
+ break;
+ }
+ }
+ /* else short comment */
+ while (!currIsNewline(ls) && ls->current != EOZ)
+ next(ls); /* skip until end of line (or end of file) */
+ break;
+ }
+ case '[': { /* long string or simply '[' */
+ int sep = skip_sep(ls);
+ if (sep >= 0) {
+ read_long_string(ls, seminfo, sep);
+ return TK_STRING;
+ } else if (sep == -1) {
+ return '[';
+ } else {
+ lexerror(ls, "invalid long string delimiter", TK_STRING);
+ break;
+ }
+ }
+ case '=': {
+ next(ls);
+ if (ls->current != '=') return '=';
+ else { next(ls); return TK_EQ; }
+ }
+ case '<': {
+ next(ls);
+ if (ls->current != '=') return '<';
+ else { next(ls); return TK_LE; }
+ }
+ case '>': {
+ next(ls);
+ if (ls->current != '=') return '>';
+ else { next(ls); return TK_GE; }
+ }
+ case '~': {
+ next(ls);
+ if (ls->current != '=') return '~';
+ else { next(ls); return TK_NE; }
+ }
+ case ':': {
+ next(ls);
+ if (ls->current != ':') return ':';
+ else { next(ls); return TK_DBCOLON; }
+ }
+ case '"': case '\'': { /* short literal strings */
+ read_string(ls, ls->current, seminfo);
+ return TK_STRING;
+ }
+ case '.': { /* '.', '..', '...', or number */
+ save_and_next(ls);
+ if (check_next(ls, ".")) {
+ if (check_next(ls, "."))
+ return TK_DOTS; /* '...' */
+ else return TK_CONCAT; /* '..' */
+ }
+ else if (!lisdigit(ls->current)) return '.';
+ /* else go through */
+ }
+ /* FALLTHROUGH */
+ case '0': case '1': case '2': case '3': case '4':
+ case '5': case '6': case '7': case '8': case '9': {
+ read_numeral(ls, seminfo);
+ return TK_NUMBER;
+ }
+ case EOZ: {
+ return TK_EOS;
+ }
+ default: {
+ if (lislalpha(ls->current)) { /* identifier or reserved word? */
+ TString *ts;
+ do {
+ save_and_next(ls);
+ } while (lislalnum(ls->current));
+ ts = luaX_newstring(ls, luaZ_buffer(ls->buff),
+ luaZ_bufflen(ls->buff));
+ seminfo->ts = ts;
+ if (isreserved(ts)) /* reserved word? */
+ return ts->tsv.extra - 1 + FIRST_RESERVED;
+ else {
+ return TK_NAME;
+ }
+ }
+ else { /* single-char tokens (+ - / ...) */
+ int c = ls->current;
+ next(ls);
+ return c;
+ }
+ }
+ }
+ }
+}
+
+
+void luaX_next (LexState *ls) {
+ ls->lastline = ls->linenumber;
+ if (ls->lookahead.token != TK_EOS) { /* is there a look-ahead token? */
+ ls->t = ls->lookahead; /* use this one */
+ ls->lookahead.token = TK_EOS; /* and discharge it */
+ }
+ else
+ ls->t.token = llex(ls, &ls->t.seminfo); /* read next token */
+}
+
+
+int luaX_lookahead (LexState *ls) {
+ lua_assert(ls->lookahead.token == TK_EOS);
+ ls->lookahead.token = llex(ls, &ls->lookahead.seminfo);
+ return ls->lookahead.token;
+}
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/lua/llex.h b/sys/contrib/openzfs/module/lua/llex.h
new file mode 100644
index 000000000000..da58203e8dc8
--- /dev/null
+++ b/sys/contrib/openzfs/module/lua/llex.h
@@ -0,0 +1,83 @@
+/* BEGIN CSTYLED */
+/*
+** $Id: llex.h,v 1.72.1.1 2013/04/12 18:48:47 roberto Exp $
+** Lexical Analyzer
+** See Copyright Notice in lua.h
+*/
+
+#ifndef llex_h
+#define llex_h
+
+#include "lobject.h"
+#include "lzio.h"
+
+
+#define FIRST_RESERVED 257
+
+
+
+/*
+* WARNING: if you change the order of this enumeration,
+* grep "ORDER RESERVED"
+*/
+enum RESERVED {
+ /* terminal symbols denoted by reserved words */
+ TK_AND = FIRST_RESERVED, TK_BREAK,
+ TK_DO, TK_ELSE, TK_ELSEIF, TK_END, TK_FALSE, TK_FOR, TK_FUNCTION,
+ TK_GOTO, TK_IF, TK_IN, TK_LOCAL, TK_NIL, TK_NOT, TK_OR, TK_REPEAT,
+ TK_RETURN, TK_THEN, TK_TRUE, TK_UNTIL, TK_WHILE,
+ /* other terminal symbols */
+ TK_CONCAT, TK_DOTS, TK_EQ, TK_GE, TK_LE, TK_NE, TK_DBCOLON, TK_EOS,
+ TK_NUMBER, TK_NAME, TK_STRING
+};
+
+/* number of reserved words */
+#define NUM_RESERVED (cast(int, TK_WHILE-FIRST_RESERVED+1))
+
+
+typedef union {
+ lua_Number r;
+ TString *ts;
+} SemInfo; /* semantics information */
+
+
+typedef struct Token {
+ int token;
+ SemInfo seminfo;
+} Token;
+
+#ifdef current
+#undef current
+#endif
+
+/* state of the lexer plus state of the parser when shared by all
+ functions */
+typedef struct LexState {
+ int current; /* current character (charint) */
+ int linenumber; /* input line counter */
+ int lastline; /* line of last token `consumed' */
+ Token t; /* current token */
+ Token lookahead; /* look ahead token */
+ struct FuncState *fs; /* current function (parser) */
+ struct lua_State *L;
+ ZIO *z; /* input stream */
+ Mbuffer *buff; /* buffer for tokens */
+ struct Dyndata *dyd; /* dynamic structures used by the parser */
+ TString *source; /* current source name */
+ TString *envn; /* environment variable name */
+ char decpoint; /* locale decimal point */
+} LexState;
+
+
+LUAI_FUNC void luaX_init (lua_State *L);
+LUAI_FUNC void luaX_setinput (lua_State *L, LexState *ls, ZIO *z,
+ TString *source, int firstchar);
+LUAI_FUNC TString *luaX_newstring (LexState *ls, const char *str, size_t l);
+LUAI_FUNC void luaX_next (LexState *ls);
+LUAI_FUNC int luaX_lookahead (LexState *ls);
+LUAI_FUNC l_noret luaX_syntaxerror (LexState *ls, const char *s);
+LUAI_FUNC const char *luaX_token2str (LexState *ls, int token);
+
+
+#endif
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/lua/llimits.h b/sys/contrib/openzfs/module/lua/llimits.h
new file mode 100644
index 000000000000..177092fbc228
--- /dev/null
+++ b/sys/contrib/openzfs/module/lua/llimits.h
@@ -0,0 +1,314 @@
+/* BEGIN CSTYLED */
+/*
+** $Id: llimits.h,v 1.103.1.1 2013/04/12 18:48:47 roberto Exp $
+** Limits, basic types, and some other `installation-dependent' definitions
+** See Copyright Notice in lua.h
+*/
+
+#ifndef llimits_h
+#define llimits_h
+
+
+#include <sys/lua/lua.h>
+
+
+typedef unsigned LUA_INT32 lu_int32;
+
+typedef LUAI_UMEM lu_mem;
+
+typedef LUAI_MEM l_mem;
+
+
+
+/* chars used as small naturals (so that `char' is reserved for characters) */
+typedef unsigned char lu_byte;
+
+
+#define MAX_SIZET ((size_t)(~(size_t)0)-2)
+
+#define MAX_LUMEM ((lu_mem)(~(lu_mem)0)-2)
+
+#define MAX_LMEM ((l_mem) ((MAX_LUMEM >> 1) - 2))
+
+
+#define MAX_INT (INT_MAX-2) /* maximum value of an int (-2 for safety) */
+
+/*
+** conversion of pointer to integer
+** this is for hashing only; there is no problem if the integer
+** cannot hold the whole pointer value
+*/
+#define IntPoint(p) ((unsigned int)(lu_mem)(p))
+
+
+
+/* type to ensure maximum alignment */
+#if !defined(LUAI_USER_ALIGNMENT_T)
+#define LUAI_USER_ALIGNMENT_T union { double u; void *s; long l; }
+#endif
+
+typedef LUAI_USER_ALIGNMENT_T L_Umaxalign;
+
+
+/* result of a `usual argument conversion' over lua_Number */
+typedef LUAI_UACNUMBER l_uacNumber;
+
+
+/* internal assertions for in-house debugging */
+#if defined(lua_assert)
+#define check_exp(c,e) (lua_assert(c), (e))
+/* to avoid problems with conditions too long */
+#define lua_longassert(c) { if (!(c)) lua_assert(0); }
+#else
+#define lua_assert(c) ((void)0)
+#define check_exp(c,e) (e)
+#define lua_longassert(c) ((void)0)
+#endif
+
+/*
+** assertion for checking API calls
+*/
+#if !defined(luai_apicheck)
+
+#if defined(LUA_USE_APICHECK)
+#include <assert.h>
+#define luai_apicheck(L,e) assert(e)
+#else
+#define luai_apicheck(L,e) lua_assert(e)
+#endif
+
+#endif
+
+#define api_check(l,e,msg) luai_apicheck(l,(e) && msg)
+
+
+#if !defined(UNUSED)
+#define UNUSED(x) ((void)(x)) /* to avoid warnings */
+#endif
+
+
+#define cast(t, exp) ((t)(exp))
+
+#define cast_byte(i) cast(lu_byte, (i))
+#define cast_num(i) cast(lua_Number, (i))
+#define cast_int(i) cast(int, (i))
+#define cast_uchar(i) cast(unsigned char, (i))
+
+
+/*
+** non-return type
+**
+** Suppress noreturn attribute in kernel builds to avoid objtool check warnings
+*/
+#if defined(__GNUC__) && !defined(_KERNEL)
+#define l_noret void __attribute__((noreturn))
+#elif defined(_MSC_VER)
+#define l_noret void __declspec(noreturn)
+#else
+#define l_noret void
+#endif
+
+
+
+/*
+** maximum depth for nested C calls and syntactical nested non-terminals
+** in a program. (Value must fit in an unsigned short int.)
+**
+** Note: On amd64 platform, the limit has been measured to be 45. We set
+** the maximum lower to give a margin for changing the amount of stack
+** used by various functions involved in parsing and executing code.
+*/
+#if !defined(LUAI_MAXCCALLS)
+#define LUAI_MAXCCALLS 20
+#endif
+
+/*
+ * Minimum amount of available stack space (in bytes) to make a C call. With
+ * gsub() recursion, the stack space between each luaD_call() is 1256 bytes.
+ */
+#define LUAI_MINCSTACK 4096
+
+/*
+** maximum number of upvalues in a closure (both C and Lua). (Value
+** must fit in an unsigned char.)
+*/
+#define MAXUPVAL UCHAR_MAX
+
+
+/*
+** type for virtual-machine instructions
+** must be an unsigned with (at least) 4 bytes (see details in lopcodes.h)
+*/
+typedef lu_int32 Instruction;
+
+
+
+/* maximum stack for a Lua function */
+#define MAXSTACK 250
+
+
+
+/* minimum size for the string table (must be power of 2) */
+#if !defined(MINSTRTABSIZE)
+#define MINSTRTABSIZE 32
+#endif
+
+
+/* minimum size for string buffer */
+#if !defined(LUA_MINBUFFER)
+#define LUA_MINBUFFER 32
+#endif
+
+
+#if !defined(lua_lock)
+#define lua_lock(L) ((void) 0)
+#define lua_unlock(L) ((void) 0)
+#endif
+
+#if !defined(luai_threadyield)
+#define luai_threadyield(L) {lua_unlock(L); lua_lock(L);}
+#endif
+
+
+/*
+** these macros allow user-specific actions on threads when you define
+** LUAI_EXTRASPACE and need to do something extra when a thread is
+** created/deleted/resumed/yielded.
+*/
+#if !defined(luai_userstateopen)
+#define luai_userstateopen(L) ((void)L)
+#endif
+
+#if !defined(luai_userstateclose)
+#define luai_userstateclose(L) ((void)L)
+#endif
+
+#if !defined(luai_userstatethread)
+#define luai_userstatethread(L,L1) ((void)L)
+#endif
+
+#if !defined(luai_userstatefree)
+#define luai_userstatefree(L,L1) ((void)L)
+#endif
+
+#if !defined(luai_userstateresume)
+#define luai_userstateresume(L,n) ((void)L)
+#endif
+
+#if !defined(luai_userstateyield)
+#define luai_userstateyield(L,n) ((void)L)
+#endif
+
+/*
+** lua_number2int is a macro to convert lua_Number to int.
+** lua_number2integer is a macro to convert lua_Number to lua_Integer.
+** lua_number2unsigned is a macro to convert a lua_Number to a lua_Unsigned.
+** lua_unsigned2number is a macro to convert a lua_Unsigned to a lua_Number.
+** luai_hashnum is a macro to hash a lua_Number value into an integer.
+** The hash must be deterministic and give reasonable values for
+** both small and large values (outside the range of integers).
+*/
+
+#if defined(MS_ASMTRICK) || defined(LUA_MSASMTRICK) /* { */
+/* trick with Microsoft assembler for X86 */
+
+#define lua_number2int(i,n) __asm {__asm fld n __asm fistp i}
+#define lua_number2integer(i,n) lua_number2int(i, n)
+#define lua_number2unsigned(i,n) \
+ {__int64 l; __asm {__asm fld n __asm fistp l} i = (unsigned int)l;}
+
+
+#elif defined(LUA_IEEE754TRICK) /* }{ */
+/* the next trick should work on any machine using IEEE754 with
+ a 32-bit int type */
+
+union luai_Cast { double l_d; LUA_INT32 l_p[2]; };
+
+#if !defined(LUA_IEEEENDIAN) /* { */
+#define LUAI_EXTRAIEEE \
+ static const union luai_Cast ieeeendian = {-(33.0 + 6755399441055744.0)};
+#define LUA_IEEEENDIANLOC (ieeeendian.l_p[1] == 33)
+#else
+#define LUA_IEEEENDIANLOC LUA_IEEEENDIAN
+#define LUAI_EXTRAIEEE /* empty */
+#endif /* } */
+
+#define lua_number2int32(i,n,t) \
+ { LUAI_EXTRAIEEE \
+ volatile union luai_Cast u; u.l_d = (n) + 6755399441055744.0; \
+ (i) = (t)u.l_p[LUA_IEEEENDIANLOC]; }
+
+#define luai_hashnum(i,n) \
+ { volatile union luai_Cast u; u.l_d = (n) + 1.0; /* avoid -0 */ \
+  (i) = u.l_p[0]; (i) += u.l_p[1]; } /* add double bits for its hash */
+
+#define lua_number2int(i,n) lua_number2int32(i, n, int)
+#define lua_number2unsigned(i,n) lua_number2int32(i, n, lua_Unsigned)
+
+/* the trick can be expanded to lua_Integer when it is a 32-bit value */
+#if defined(LUA_IEEELL)
+#define lua_number2integer(i,n) lua_number2int32(i, n, lua_Integer)
+#endif
+
+#endif /* } */
+
+
+/* the following definitions always work, but may be slow */
+
+#if !defined(lua_number2int)
+#define lua_number2int(i,n) ((i)=(int)(n))
+#endif
+
+#if !defined(lua_number2integer)
+#define lua_number2integer(i,n) ((i)=(lua_Integer)(n))
+#endif
+
+#if !defined(lua_number2unsigned) /* { */
+/* the following definition ensures proper modulo behavior */
+#if defined(LUA_NUMBER_DOUBLE) || defined(LUA_NUMBER_FLOAT)
+#include <math.h>
+#define SUPUNSIGNED ((lua_Number)(~(lua_Unsigned)0) + 1)
+#define lua_number2unsigned(i,n) \
+ ((i)=(lua_Unsigned)((n) - floor((n)/SUPUNSIGNED)*SUPUNSIGNED))
+#else
+#define lua_number2unsigned(i,n) ((i)=(lua_Unsigned)(n))
+#endif
+#endif /* } */
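+/*
+** Editor's illustrative sketch (not part of upstream Lua or of this port):
+** the fallback above reduces the number modulo 2^N before converting, so
+** negative and out-of-range doubles wrap the same way unsigned arithmetic
+** does.  Assumes a double lua_Number and a 32-bit lua_Unsigned.
+*/
+#if 0	/* illustrative only; never compiled */
+static void example_number2unsigned (void) {
+  lua_Unsigned u;
+  lua_number2unsigned(u, -1.0);           /* -1 wraps modulo 2^32 ...     */
+  lua_assert(u == 4294967295u);           /* ... to 2^32 - 1              */
+  lua_number2unsigned(u, 4294967296.0);   /* 2^32 itself wraps to 0       */
+  lua_assert(u == 0);
+}
+#endif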
+
+
+#if !defined(lua_unsigned2number)
+/* on several machines, coercion from unsigned to double is slow,
+   so it may be worth avoiding */
+#define lua_unsigned2number(u) \
+ (((u) <= (lua_Unsigned)INT_MAX) ? (lua_Number)(int)(u) : (lua_Number)(u))
+#endif
+
+
+
+#if defined(ltable_c) && !defined(luai_hashnum)
+
+#define luai_hashnum(i,n) (i = lcompat_hashnum(n))
+
+#endif
+
+
+
+/*
+** macro to control inclusion of some hard tests on stack reallocation
+*/
+#if !defined(HARDSTACKTESTS)
+#define condmovestack(L) ((void)0)
+#else
+/* realloc stack keeping its size */
+#define condmovestack(L) luaD_reallocstack((L), (L)->stacksize)
+#endif
+
+#if !defined(HARDMEMTESTS)
+#define condchangemem(L) condmovestack(L)
+#else
+#define condchangemem(L) \
+ ((void)(!(G(L)->gcrunning) || (luaC_fullgc(L, 0), 1)))
+#endif
+
+#endif
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/lua/lmem.c b/sys/contrib/openzfs/module/lua/lmem.c
new file mode 100644
index 000000000000..18bb2514cb01
--- /dev/null
+++ b/sys/contrib/openzfs/module/lua/lmem.c
@@ -0,0 +1,98 @@
+/* BEGIN CSTYLED */
+/*
+** $Id: lmem.c,v 1.84.1.1 2013/04/12 18:48:47 roberto Exp $
+** Interface to Memory Manager
+** See Copyright Notice in lua.h
+*/
+
+
+#define lmem_c
+#define LUA_CORE
+
+#include <sys/lua/lua.h>
+
+#include "ldebug.h"
+#include "ldo.h"
+#include "lgc.h"
+#include "lmem.h"
+#include "lobject.h"
+#include "lstate.h"
+
+
+
+/*
+** About the realloc function:
+** void * frealloc (void *ud, void *ptr, size_t osize, size_t nsize);
+** (`osize' is the old size, `nsize' is the new size)
+**
+** * frealloc(ud, NULL, x, s) creates a new block of size `s' (no
+** matter 'x').
+**
+** * frealloc(ud, p, x, 0) frees the block `p'
+** (in this specific case, frealloc must return NULL);
+** in particular, frealloc(ud, NULL, 0, 0) does nothing
+** (which is equivalent to free(NULL) in ANSI C)
+**
+** frealloc returns NULL if it cannot create or reallocate the area
+** (any reallocation to an equal or smaller size cannot fail!)
+*/
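+
+/*
+** Editor's illustrative sketch (not part of upstream Lua or of this port):
+** one hosted-C allocator that honors the frealloc contract described above.
+** The real allocator is supplied by the caller of lua_newstate(); this is
+** only a reference for what the contract requires.
+*/
+#if 0	/* illustrative only; never compiled */
+#include <stdlib.h>
+
+static void *example_frealloc (void *ud, void *ptr, size_t osize, size_t nsize) {
+  (void)ud; (void)osize;        /* this trivial allocator needs neither */
+  if (nsize == 0) {             /* request to free the block */
+    free(ptr);                  /* free(NULL) is a no-op, as required */
+    return NULL;                /* contract: must return NULL here */
+  }
+  return realloc(ptr, nsize);   /* create (ptr == NULL), grow, or shrink */
+}
+#endif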
+
+
+
+#define MINSIZEARRAY 4
+
+
+void *luaM_growaux_ (lua_State *L, void *block, int *size, size_t size_elems,
+ int limit, const char *what) {
+ void *newblock;
+ int newsize;
+ if (*size >= limit/2) { /* cannot double it? */
+ if (*size >= limit) /* cannot grow even a little? */
+ luaG_runerror(L, "too many %s (limit is %d)", what, limit);
+ newsize = limit; /* still have at least one free place */
+ }
+ else {
+ newsize = (*size)*2;
+ if (newsize < MINSIZEARRAY)
+ newsize = MINSIZEARRAY; /* minimum size */
+ }
+ newblock = luaM_reallocv(L, block, *size, newsize, size_elems);
+ *size = newsize; /* update only when everything else is OK */
+ return newblock;
+}
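+
+/*
+** Editor's illustrative sketch (not part of upstream Lua or of this port):
+** callers normally reach luaM_growaux_ through the luaM_growvector macro;
+** starting from an empty vector the size grows 0 -> 4 -> 8 -> 16 -> ... and
+** is clamped at 'limit'.  Names below are hypothetical.
+*/
+#if 0	/* illustrative only; never compiled */
+static void example_grow (lua_State *L) {
+  int size = 0, n = 0;
+  TValue *v = NULL;
+  /* make sure slot 'n' exists; grows the array when n+1 > size */
+  luaM_growvector(L, v, n, size, TValue, MAX_INT, "example elements");
+  setnilvalue(&v[n++]);         /* hypothetical use of the new slot */
+  luaM_freearray(L, v, size);   /* release with matching bookkeeping */
+}
+#endif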
+
+
+l_noret luaM_toobig (lua_State *L) {
+ luaG_runerror(L, "memory allocation error: block too big");
+}
+
+
+
+/*
+** generic allocation routine.
+*/
+void *luaM_realloc_ (lua_State *L, void *block, size_t osize, size_t nsize) {
+ void *newblock;
+ global_State *g = G(L);
+ size_t realosize = (block) ? osize : 0;
+ lua_assert((realosize == 0) == (block == NULL));
+#if defined(HARDMEMTESTS)
+ if (nsize > realosize && g->gcrunning)
+ luaC_fullgc(L, 1); /* force a GC whenever possible */
+#endif
+ newblock = (*g->frealloc)(g->ud, block, osize, nsize);
+ if (newblock == NULL && nsize > 0) {
+ api_check(L, nsize > realosize,
+ "realloc cannot fail when shrinking a block");
+ if (g->gcrunning) {
+ luaC_fullgc(L, 1); /* try to free some memory... */
+ newblock = (*g->frealloc)(g->ud, block, osize, nsize); /* try again */
+ }
+ if (newblock == NULL)
+ luaD_throw(L, LUA_ERRMEM);
+ }
+ lua_assert((nsize == 0) == (newblock == NULL));
+ g->GCdebt = (g->GCdebt + nsize) - realosize;
+ return newblock;
+}
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/lua/lmem.h b/sys/contrib/openzfs/module/lua/lmem.h
new file mode 100644
index 000000000000..22c04c98c863
--- /dev/null
+++ b/sys/contrib/openzfs/module/lua/lmem.h
@@ -0,0 +1,56 @@
+/* BEGIN CSTYLED */
+/*
+** $Id: lmem.h,v 1.40.1.1 2013/04/12 18:48:47 roberto Exp $
+** Interface to Memory Manager
+** See Copyright Notice in lua.h
+*/
+
+#ifndef lmem_h
+#define lmem_h
+
+
+#include "llimits.h"
+#include <sys/lua/lua.h>
+
+
+/*
+** This macro avoids the runtime division MAX_SIZET/(e), as 'e' is
+** always constant.
+** The macro is somewhat complex to avoid warnings:
+** +1 avoids warnings of "comparison has constant result";
+** cast to 'void' avoids warnings of "value unused".
+*/
+#define luaM_reallocv(L,b,on,n,e) \
+ (cast(void, \
+ (cast(size_t, (n)+1) > MAX_SIZET/(e)) ? (luaM_toobig(L), 0) : 0), \
+ luaM_realloc_(L, (b), (on)*(e), (n)*(e)))
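+
+/*
+** Editor's illustrative sketch (not part of upstream Lua or of this port):
+** the guard above rejects any (n, e) pair whose product could overflow
+** size_t before the multiplication is ever performed.
+*/
+#if 0	/* illustrative only; never compiled */
+static void example_overflow_guard (lua_State *L) {
+  /* fits: (64+1) <= MAX_SIZET/sizeof(TValue), so the request goes through */
+  TValue *ok = cast(TValue *, luaM_reallocv(L, NULL, 0, 64, sizeof(TValue)));
+  luaM_freearray(L, ok, 64);
+  /* would overflow: n+1 > MAX_SIZET/e, so luaM_toobig(L) raises an error
+     instead of computing a wrapped-around allocation size */
+  (void)luaM_reallocv(L, NULL, 0, MAX_SIZET/sizeof(TValue), sizeof(TValue));
+}
+#endif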
+
+#define luaM_freemem(L, b, s) luaM_realloc_(L, (b), (s), 0)
+#define luaM_free(L, b) luaM_realloc_(L, (b), sizeof(*(b)), 0)
+#define luaM_freearray(L, b, n) luaM_reallocv(L, (b), n, 0, sizeof((b)[0]))
+
+#define luaM_malloc(L,s) luaM_realloc_(L, NULL, 0, (s))
+#define luaM_new(L,t) cast(t *, luaM_malloc(L, sizeof(t)))
+#define luaM_newvector(L,n,t) \
+ cast(t *, luaM_reallocv(L, NULL, 0, n, sizeof(t)))
+
+#define luaM_newobject(L,tag,s) luaM_realloc_(L, NULL, tag, (s))
+
+#define luaM_growvector(L,v,nelems,size,t,limit,e) \
+ if ((nelems)+1 > (size)) \
+ ((v)=cast(t *, luaM_growaux_(L,v,&(size),sizeof(t),limit,e)))
+
+#define luaM_reallocvector(L, v,oldn,n,t) \
+ ((v)=cast(t *, luaM_reallocv(L, v, oldn, n, sizeof(t))))
+
+LUAI_FUNC l_noret luaM_toobig (lua_State *L);
+
+/* not to be called directly */
+LUAI_FUNC void *luaM_realloc_ (lua_State *L, void *block, size_t oldsize,
+ size_t size);
+LUAI_FUNC void *luaM_growaux_ (lua_State *L, void *block, int *size,
+ size_t size_elem, int limit,
+ const char *what);
+
+#endif
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/lua/lobject.c b/sys/contrib/openzfs/module/lua/lobject.c
new file mode 100644
index 000000000000..024d3199fe24
--- /dev/null
+++ b/sys/contrib/openzfs/module/lua/lobject.c
@@ -0,0 +1,282 @@
+/* BEGIN CSTYLED */
+/*
+** $Id: lobject.c,v 2.58.1.1 2013/04/12 18:48:47 roberto Exp $
+** Some generic functions over Lua objects
+** See Copyright Notice in lua.h
+*/
+
+#define lobject_c
+#define LUA_CORE
+
+#include <sys/lua/lua.h>
+
+#include "lctype.h"
+#include "ldebug.h"
+#include "ldo.h"
+#include "lmem.h"
+#include "lobject.h"
+#include "lstate.h"
+#include "lstring.h"
+#include "lvm.h"
+
+
+
+LUAI_DDEF const TValue luaO_nilobject_ = {NILCONSTANT};
+
+
+/*
+** converts an integer to a "floating point byte", represented as
+** (eeeeexxx), where the real value is (1xxx) * 2^(eeeee - 1) if
+** eeeee != 0 and (xxx) otherwise.
+*/
+int luaO_int2fb (unsigned int x) {
+ int e = 0; /* exponent */
+ if (x < 8) return x;
+ while (x >= 0x10) {
+ x = (x+1) >> 1;
+ e++;
+ }
+ return ((e+1) << 3) | (cast_int(x) - 8);
+}
+
+
+/* converts back */
+int luaO_fb2int (int x) {
+ int e = (x >> 3) & 0x1f;
+ if (e == 0) return x;
+ else return ((x & 7) + 8) << (e - 1);
+}
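+
+/*
+** Editor's illustrative sketch (not part of upstream Lua or of this port):
+** a worked example of the "floating point byte" encoding above.
+*/
+#if 0	/* illustrative only; never compiled */
+static void example_fb (void) {
+  /* small values (here anything below 16) encode as themselves */
+  lua_assert(luaO_int2fb(7) == 7 && luaO_fb2int(7) == 7);
+  /* larger values round up to the nearest representable (1xxx) * 2^(eeeee-1):
+     1000 becomes e=8, xxx=0, i.e. the byte 64, which decodes to 1024 */
+  lua_assert(luaO_int2fb(1000) == 64);
+  lua_assert(luaO_fb2int(64) == 1024);
+}
+#endif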
+
+
+int luaO_ceillog2 (unsigned int x) {
+ static const lu_byte log_2[256] = {
+ 0,1,2,2,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,
+ 6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
+ 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
+ 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
+ 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
+ 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
+ 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
+ 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8
+ };
+ int l = 0;
+ x--;
+ while (x >= 256) { l += 8; x >>= 8; }
+ return l + log_2[x];
+}
+
+
+lua_Number luaO_arith (int op, lua_Number v1, lua_Number v2) {
+ switch (op) {
+ case LUA_OPADD: return luai_numadd(NULL, v1, v2);
+ case LUA_OPSUB: return luai_numsub(NULL, v1, v2);
+ case LUA_OPMUL: return luai_nummul(NULL, v1, v2);
+ case LUA_OPDIV: return luai_numdiv(NULL, v1, v2);
+ case LUA_OPMOD: return luai_nummod(NULL, v1, v2);
+ case LUA_OPPOW: return luai_numpow(NULL, v1, v2);
+ case LUA_OPUNM: return luai_numunm(NULL, v1);
+ default: lua_assert(0); return 0;
+ }
+}
+
+
+int luaO_hexavalue (int c) {
+ if (lisdigit(c)) return c - '0';
+ else return ltolower(c) - 'a' + 10;
+}
+
+
+#if !defined(lua_strx2number)
+
+
+
+static int isneg (const char **s) {
+ if (**s == '-') { (*s)++; return 1; }
+ else if (**s == '+') (*s)++;
+ return 0;
+}
+
+
+static lua_Number readhexa (const char **s, lua_Number r, int *count) {
+ for (; lisxdigit(cast_uchar(**s)); (*s)++) { /* read integer part */
+ r = (r * cast_num(16.0)) + cast_num(luaO_hexavalue(cast_uchar(**s)));
+ (*count)++;
+ }
+ return r;
+}
+
+
+/*
+** convert a hexadecimal numeric string to a number, following the
+** C99 specification for 'strtod'
+*/
+static lua_Number lua_strx2number (const char *s, char **endptr) {
+ lua_Number r = 0.0;
+ int e = 0, i = 0;
+ int neg = 0; /* 1 if number is negative */
+ *endptr = cast(char *, s); /* nothing is valid yet */
+ while (lisspace(cast_uchar(*s))) s++; /* skip initial spaces */
+  neg = isneg(&s); /* check sign */
+ if (!(*s == '0' && (*(s + 1) == 'x' || *(s + 1) == 'X'))) /* check '0x' */
+ return 0.0; /* invalid format (no '0x') */
+ s += 2; /* skip '0x' */
+ r = readhexa(&s, r, &i); /* read integer part */
+ if (*s == '.') {
+ s++; /* skip dot */
+ r = readhexa(&s, r, &e); /* read fractional part */
+ }
+ if (i == 0 && e == 0)
+ return 0.0; /* invalid format (no digit) */
+  e *= -4; /* each fractional digit divides the value by 2^4 */
+ *endptr = cast(char *, s); /* valid up to here */
+ if (*s == 'p' || *s == 'P') { /* exponent part? */
+ int exp1 = 0;
+ int neg1;
+ s++; /* skip 'p' */
+    neg1 = isneg(&s); /* sign */
+ if (!lisdigit(cast_uchar(*s)))
+ goto ret; /* must have at least one digit */
+ while (lisdigit(cast_uchar(*s))) /* read exponent */
+ exp1 = exp1 * 10 + *(s++) - '0';
+ if (neg1) exp1 = -exp1;
+ e += exp1;
+ }
+ *endptr = cast(char *, s); /* valid up to here */
+ ret:
+ if (neg) r = -r;
+ return (r * (1 << e));
+}
+
+#endif
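+
+/*
+** Editor's illustrative sketch (not part of upstream Lua or of this port):
+** expected results of the hexadecimal conversion above.  Note that this
+** port scales the result with an integer shift, (1 << e), so the sketch
+** sticks to inputs whose net binary exponent is non-negative.
+*/
+#if 0	/* illustrative only; never compiled */
+static void example_strx2number (void) {
+  const char *bad = "zzz";
+  char *end;
+  lua_assert(lua_strx2number("0x10", &end) == 16.0);   /* plain hex integer */
+  lua_assert(lua_strx2number("0x1p4", &end) == 16.0);  /* 1 * 2^4 */
+  lua_assert(lua_strx2number(bad, &end) == 0.0 && end == bad);  /* no '0x' */
+}
+#endif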
+
+
+int luaO_str2d (const char *s, size_t len, lua_Number *result) {
+ char *endptr;
+ if (strpbrk(s, "nN")) /* reject 'inf' and 'nan' */
+ return 0;
+ else if (strpbrk(s, "xX")) /* hexa? */
+ *result = lua_strx2number(s, &endptr);
+ else
+ *result = lua_str2number(s, &endptr);
+ if (endptr == s) return 0; /* nothing recognized */
+ while (lisspace(cast_uchar(*endptr))) endptr++;
+ return (endptr == s + len); /* OK if no trailing characters */
+}
+
+
+
+static void pushstr (lua_State *L, const char *str, size_t l) {
+ setsvalue2s(L, L->top++, luaS_newlstr(L, str, l));
+}
+
+
+/* this function handles only the `%d', `%c', `%f', `%p', and `%s' formats */
+const char *luaO_pushvfstring (lua_State *L, const char *fmt, va_list argp) {
+ int n = 0;
+ for (;;) {
+ const char *e = strchr(fmt, '%');
+ if (e == NULL) break;
+ luaD_checkstack(L, 2); /* fmt + item */
+ pushstr(L, fmt, e - fmt);
+ switch (*(e+1)) {
+ case 's': {
+ const char *s = va_arg(argp, char *);
+ if (s == NULL) s = "(null)";
+ pushstr(L, s, strlen(s));
+ break;
+ }
+ case 'c': {
+ char buff;
+ buff = cast(char, va_arg(argp, int));
+ pushstr(L, &buff, 1);
+ break;
+ }
+ case 'd': {
+ setnvalue(L->top++, cast_num(va_arg(argp, int)));
+ break;
+ }
+ case 'f': {
+ setnvalue(L->top++, cast_num(va_arg(argp, l_uacNumber)));
+ break;
+ }
+ case 'p': {
+ char buff[4*sizeof(void *) + 8]; /* should be enough space for a `%p' */
+ int l = lcompat_sprintf(buff, sizeof(buff), "%p", va_arg(argp, void *));
+ pushstr(L, buff, l);
+ break;
+ }
+ case '%': {
+ pushstr(L, "%", 1);
+ break;
+ }
+ default: {
+ luaG_runerror(L,
+ "invalid option " LUA_QL("%%%c") " to " LUA_QL("lua_pushfstring"),
+ *(e + 1));
+ }
+ }
+ n += 2;
+ fmt = e+2;
+ }
+ luaD_checkstack(L, 1);
+ pushstr(L, fmt, strlen(fmt));
+ if (n > 0) luaV_concat(L, n + 1);
+ return svalue(L->top - 1);
+}
+
+
+const char *luaO_pushfstring (lua_State *L, const char *fmt, ...) {
+ const char *msg;
+ va_list argp;
+ va_start(argp, fmt);
+ msg = luaO_pushvfstring(L, fmt, argp);
+ va_end(argp);
+ return msg;
+}
+
+
+/* number of chars of a literal string without the ending \0 */
+#define LL(x) (sizeof(x)/sizeof(char) - 1)
+
+#define RETS "..."
+#define PRE "[string \""
+#define POS "\"]"
+
+#define addstr(a,b,l) ( memcpy(a,b,(l) * sizeof(char)), a += (l) )
+
+void luaO_chunkid (char *out, const char *source, size_t bufflen) {
+ size_t l = strlen(source);
+ if (*source == '=') { /* 'literal' source */
+ if (l <= bufflen) /* small enough? */
+ memcpy(out, source + 1, l * sizeof(char));
+ else { /* truncate it */
+ addstr(out, source + 1, bufflen - 1);
+ *out = '\0';
+ }
+ }
+ else if (*source == '@') { /* file name */
+ if (l <= bufflen) /* small enough? */
+ memcpy(out, source + 1, l * sizeof(char));
+ else { /* add '...' before rest of name */
+ addstr(out, RETS, LL(RETS));
+ bufflen -= LL(RETS);
+ memcpy(out, source + 1 + l - bufflen, bufflen * sizeof(char));
+ }
+ }
+ else { /* string; format as [string "source"] */
+ const char *nl = strchr(source, '\n'); /* find first new line (if any) */
+ addstr(out, PRE, LL(PRE)); /* add prefix */
+ bufflen -= LL(PRE RETS POS) + 1; /* save space for prefix+suffix+'\0' */
+ if (l < bufflen && nl == NULL) { /* small one-line source? */
+ addstr(out, source, l); /* keep it */
+ }
+ else {
+ if (nl != NULL) l = nl - source; /* stop at first newline */
+ if (l > bufflen) l = bufflen;
+ addstr(out, source, l);
+ addstr(out, RETS, LL(RETS));
+ }
+ memcpy(out, POS, (LL(POS) + 1) * sizeof(char));
+ }
+}
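+
+/*
+** Editor's illustrative sketch (not part of upstream Lua or of this port):
+** the three source-name forms handled above and the chunk ids they produce.
+** LUA_IDSIZE is the buffer size normally used by callers (see lua_Debug).
+*/
+#if 0	/* illustrative only; never compiled */
+static void example_chunkid (void) {
+  char buff[LUA_IDSIZE];
+  luaO_chunkid(buff, "=stdin", sizeof(buff));      /* literal: "stdin" */
+  luaO_chunkid(buff, "@test.lua", sizeof(buff));   /* file name: "test.lua" */
+  luaO_chunkid(buff, "print('x')", sizeof(buff));  /* [string "print('x')"] */
+}
+#endif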
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/lua/lobject.h b/sys/contrib/openzfs/module/lua/lobject.h
new file mode 100644
index 000000000000..a16b8d62eb4b
--- /dev/null
+++ b/sys/contrib/openzfs/module/lua/lobject.h
@@ -0,0 +1,605 @@
+/* BEGIN CSTYLED */
+/*
+** $Id: lobject.h,v 2.71.1.2 2014/05/07 14:14:58 roberto Exp $
+** Type definitions for Lua objects
+** See Copyright Notice in lua.h
+*/
+
+
+#ifndef lobject_h
+#define lobject_h
+
+
+#include "llimits.h"
+#include <sys/lua/lua.h>
+
+
+/*
+** Extra tags for non-values
+*/
+#define LUA_TPROTO LUA_NUMTAGS
+#define LUA_TUPVAL (LUA_NUMTAGS+1)
+#define LUA_TDEADKEY (LUA_NUMTAGS+2)
+
+/*
+** number of all possible tags (including LUA_TNONE but excluding DEADKEY)
+*/
+#define LUA_TOTALTAGS (LUA_TUPVAL+2)
+
+
+/*
+** tags for Tagged Values have the following use of bits:
+** bits 0-3: actual tag (a LUA_T* value)
+** bits 4-5: variant bits
+** bit 6: whether value is collectable
+*/
+
+#define VARBITS (3 << 4)
+
+
+/*
+** LUA_TFUNCTION variants:
+** 0 - Lua function
+** 1 - light C function
+** 2 - regular C function (closure)
+*/
+
+/* Variant tags for functions */
+#define LUA_TLCL (LUA_TFUNCTION | (0 << 4)) /* Lua closure */
+#define LUA_TLCF (LUA_TFUNCTION | (1 << 4)) /* light C function */
+#define LUA_TCCL (LUA_TFUNCTION | (2 << 4)) /* C closure */
+
+
+/* Variant tags for strings */
+#define LUA_TSHRSTR (LUA_TSTRING | (0 << 4)) /* short strings */
+#define LUA_TLNGSTR (LUA_TSTRING | (1 << 4)) /* long strings */
+
+
+/* Bit mark for collectable types */
+#define BIT_ISCOLLECTABLE (1 << 6)
+
+/* mark a tag as collectable */
+#define ctb(t) ((t) | BIT_ISCOLLECTABLE)
+
+
+/*
+** Union of all collectable objects
+*/
+typedef union GCObject GCObject;
+
+
+/*
+** Common Header for all collectable objects (in macro form, to be
+** included in other objects)
+*/
+#define CommonHeader GCObject *next; lu_byte tt; lu_byte marked
+
+
+/*
+** Common header in struct form
+*/
+typedef struct GCheader {
+ CommonHeader;
+} GCheader;
+
+
+
+/*
+** Union of all Lua values
+*/
+typedef union Value Value;
+
+
+#define numfield lua_Number n; /* numbers */
+
+
+
+/*
+** Tagged Values. This is the basic representation of values in Lua,
+** an actual value plus a tag with its type.
+*/
+
+#define TValuefields Value value_; int tt_
+
+typedef struct lua_TValue TValue;
+
+
+/* macro defining a nil value */
+#define NILCONSTANT {NULL}, LUA_TNIL
+
+
+#define val_(o) ((o)->value_)
+#define num_(o) (val_(o).n)
+
+
+/* raw type tag of a TValue */
+#define rttype(o) ((o)->tt_)
+
+/* tag with no variants (bits 0-3) */
+#define novariant(x) ((x) & 0x0F)
+
+/* type tag of a TValue (bits 0-3 for tags + variant bits 4-5) */
+#define ttype(o) (rttype(o) & 0x3F)
+
+/* type tag of a TValue with no variants (bits 0-3) */
+#define ttypenv(o) (novariant(rttype(o)))
+
+
+/* Macros to test type */
+#define checktag(o,t) (rttype(o) == (t))
+#define checktype(o,t) (ttypenv(o) == (t))
+#define ttisnumber(o) checktag((o), LUA_TNUMBER)
+#define ttisnil(o) checktag((o), LUA_TNIL)
+#define ttisboolean(o) checktag((o), LUA_TBOOLEAN)
+#define ttislightuserdata(o) checktag((o), LUA_TLIGHTUSERDATA)
+#define ttisstring(o) checktype((o), LUA_TSTRING)
+#define ttisshrstring(o) checktag((o), ctb(LUA_TSHRSTR))
+#define ttislngstring(o) checktag((o), ctb(LUA_TLNGSTR))
+#define ttistable(o) checktag((o), ctb(LUA_TTABLE))
+#define ttisfunction(o) checktype(o, LUA_TFUNCTION)
+#define ttisclosure(o) ((rttype(o) & 0x1F) == LUA_TFUNCTION)
+#define ttisCclosure(o) checktag((o), ctb(LUA_TCCL))
+#define ttisLclosure(o) checktag((o), ctb(LUA_TLCL))
+#define ttislcf(o) checktag((o), LUA_TLCF)
+#define ttisuserdata(o) checktag((o), ctb(LUA_TUSERDATA))
+#define ttisthread(o) checktag((o), ctb(LUA_TTHREAD))
+#define ttisdeadkey(o) checktag((o), LUA_TDEADKEY)
+
+#define ttisequal(o1,o2) (rttype(o1) == rttype(o2))
+
+/* Macros to access values */
+#define nvalue(o) check_exp(ttisnumber(o), num_(o))
+#define gcvalue(o) check_exp(iscollectable(o), val_(o).gc)
+#define pvalue(o) check_exp(ttislightuserdata(o), val_(o).p)
+#define rawtsvalue(o) check_exp(ttisstring(o), &val_(o).gc->ts)
+#define tsvalue(o) (&rawtsvalue(o)->tsv)
+#define rawuvalue(o) check_exp(ttisuserdata(o), &val_(o).gc->u)
+#define uvalue(o) (&rawuvalue(o)->uv)
+#define clvalue(o) check_exp(ttisclosure(o), &val_(o).gc->cl)
+#define clLvalue(o) check_exp(ttisLclosure(o), &val_(o).gc->cl.l)
+#define clCvalue(o) check_exp(ttisCclosure(o), &val_(o).gc->cl.c)
+#define fvalue(o) check_exp(ttislcf(o), val_(o).f)
+#define hvalue(o) check_exp(ttistable(o), &val_(o).gc->h)
+#define bvalue(o) check_exp(ttisboolean(o), val_(o).b)
+#define thvalue(o) check_exp(ttisthread(o), &val_(o).gc->th)
+/* a dead value may get the 'gc' field, but cannot access its contents */
+#define deadvalue(o) check_exp(ttisdeadkey(o), cast(void *, val_(o).gc))
+
+#define l_isfalse(o) (ttisnil(o) || (ttisboolean(o) && bvalue(o) == 0))
+
+
+#define iscollectable(o) (rttype(o) & BIT_ISCOLLECTABLE)
+
+
+/* Macros for internal tests */
+#define righttt(obj) (ttype(obj) == gcvalue(obj)->gch.tt)
+
+#define checkliveness(g,obj) \
+ lua_longassert(!iscollectable(obj) || \
+ (righttt(obj) && !isdead(g,gcvalue(obj))))
+
+
+/* Macros to set values */
+#define settt_(o,t) ((o)->tt_=(t))
+
+#define setnvalue(obj,x) \
+ { TValue *io=(obj); num_(io)=(x); settt_(io, LUA_TNUMBER); }
+
+#define setnilvalue(obj) settt_(obj, LUA_TNIL)
+
+#define setfvalue(obj,x) \
+ { TValue *io=(obj); val_(io).f=(x); settt_(io, LUA_TLCF); }
+
+#define setpvalue(obj,x) \
+ { TValue *io=(obj); val_(io).p=(x); settt_(io, LUA_TLIGHTUSERDATA); }
+
+#define setbvalue(obj,x) \
+ { TValue *io=(obj); val_(io).b=(x); settt_(io, LUA_TBOOLEAN); }
+
+#define setgcovalue(L,obj,x) \
+ { TValue *io=(obj); GCObject *i_g=(x); \
+ val_(io).gc=i_g; settt_(io, ctb(gch(i_g)->tt)); }
+
+#define setsvalue(L,obj,x) \
+ { TValue *io=(obj); \
+ TString *x_ = (x); \
+ val_(io).gc=cast(GCObject *, x_); settt_(io, ctb(x_->tsv.tt)); \
+ checkliveness(G(L),io); }
+
+#define setuvalue(L,obj,x) \
+ { TValue *io=(obj); \
+ val_(io).gc=cast(GCObject *, (x)); settt_(io, ctb(LUA_TUSERDATA)); \
+ checkliveness(G(L),io); }
+
+#define setthvalue(L,obj,x) \
+ { TValue *io=(obj); \
+ val_(io).gc=cast(GCObject *, (x)); settt_(io, ctb(LUA_TTHREAD)); \
+ checkliveness(G(L),io); }
+
+#define setclLvalue(L,obj,x) \
+ { TValue *io=(obj); \
+ val_(io).gc=cast(GCObject *, (x)); settt_(io, ctb(LUA_TLCL)); \
+ checkliveness(G(L),io); }
+
+#define setclCvalue(L,obj,x) \
+ { TValue *io=(obj); \
+ val_(io).gc=cast(GCObject *, (x)); settt_(io, ctb(LUA_TCCL)); \
+ checkliveness(G(L),io); }
+
+#define sethvalue(L,obj,x) \
+ { TValue *io=(obj); \
+ val_(io).gc=cast(GCObject *, (x)); settt_(io, ctb(LUA_TTABLE)); \
+ checkliveness(G(L),io); }
+
+#define setdeadvalue(obj) settt_(obj, LUA_TDEADKEY)
+
+
+
+#define setobj(L,obj1,obj2) \
+ { const TValue *io2=(obj2); TValue *io1=(obj1); \
+ io1->value_ = io2->value_; io1->tt_ = io2->tt_; \
+ checkliveness(G(L),io1); }
+
+
+/*
+** different types of assignments, according to destination
+*/
+
+/* from stack to (same) stack */
+#define setobjs2s setobj
+/* to stack (not from same stack) */
+#define setobj2s setobj
+#define setsvalue2s setsvalue
+#define sethvalue2s sethvalue
+#define setptvalue2s setptvalue
+/* from table to same table */
+#define setobjt2t setobj
+/* to table */
+#define setobj2t setobj
+/* to new object */
+#define setobj2n setobj
+#define setsvalue2n setsvalue
+
+
+/* check whether a number is valid (useful only for NaN trick) */
+#define luai_checknum(L,o,c) { /* empty */ }
+
+
+/*
+** {======================================================
+** NaN Trick
+** =======================================================
+*/
+#if defined(LUA_NANTRICK)
+
+/*
+** numbers are represented in the 'd_' field. All other values have the
+** value (NNMARK | tag) in 'tt__'. A number with such a pattern would be
+** a "signaling NaN", which is never generated by regular CPU operations
+** (nor by 'strtod')
+*/
+
+/* allows for external implementation for part of the trick */
+#if !defined(NNMARK) /* { */
+
+
+#if !defined(LUA_IEEEENDIAN)
+#error option 'LUA_NANTRICK' needs 'LUA_IEEEENDIAN'
+#endif
+
+
+#define NNMARK 0x7FF7A500
+#define NNMASK 0x7FFFFF00
+
+#undef TValuefields
+#undef NILCONSTANT
+
+#if (LUA_IEEEENDIAN == 0) /* { */
+
+/* little endian */
+#define TValuefields \
+ union { struct { Value v__; int tt__; } i; double d__; } u
+#define NILCONSTANT {{{NULL}, tag2tt(LUA_TNIL)}}
+/* field-access macros */
+#define v_(o) ((o)->u.i.v__)
+#define d_(o) ((o)->u.d__)
+#define tt_(o) ((o)->u.i.tt__)
+
+#else /* }{ */
+
+/* big endian */
+#define TValuefields \
+ union { struct { int tt__; Value v__; } i; double d__; } u
+#define NILCONSTANT {{tag2tt(LUA_TNIL), {NULL}}}
+/* field-access macros */
+#define v_(o) ((o)->u.i.v__)
+#define d_(o) ((o)->u.d__)
+#define tt_(o) ((o)->u.i.tt__)
+
+#endif /* } */
+
+#endif /* } */
+
+
+/* correspondence with standard representation */
+#undef val_
+#define val_(o) v_(o)
+#undef num_
+#define num_(o) d_(o)
+
+
+#undef numfield
+#define numfield /* no such field; numbers are the entire struct */
+
+/* basic check to distinguish numbers from non-numbers */
+#undef ttisnumber
+#define ttisnumber(o) ((tt_(o) & NNMASK) != NNMARK)
+
+#define tag2tt(t) (NNMARK | (t))
+
+#undef rttype
+#define rttype(o) (ttisnumber(o) ? LUA_TNUMBER : tt_(o) & 0xff)
+
+#undef settt_
+#define settt_(o,t) (tt_(o) = tag2tt(t))
+
+#undef setnvalue
+#define setnvalue(obj,x) \
+ { TValue *io_=(obj); num_(io_)=(x); lua_assert(ttisnumber(io_)); }
+
+#undef setobj
+#define setobj(L,obj1,obj2) \
+ { const TValue *o2_=(obj2); TValue *o1_=(obj1); \
+ o1_->u = o2_->u; \
+ checkliveness(G(L),o1_); }
+
+
+/*
+** these redefinitions are not mandatory, but these forms are more efficient
+*/
+
+#undef checktag
+#undef checktype
+#define checktag(o,t) (tt_(o) == tag2tt(t))
+#define checktype(o,t) (ctb(tt_(o) | VARBITS) == ctb(tag2tt(t) | VARBITS))
+
+#undef ttisequal
+#define ttisequal(o1,o2) \
+ (ttisnumber(o1) ? ttisnumber(o2) : (tt_(o1) == tt_(o2)))
+
+
+#undef luai_checknum
+#define luai_checknum(L,o,c) { if (!ttisnumber(o)) c; }
+
+#endif
+/* }====================================================== */
+
+
+
+/*
+** {======================================================
+** types and prototypes
+** =======================================================
+*/
+
+
+union Value {
+ GCObject *gc; /* collectable objects */
+ void *p; /* light userdata */
+ int b; /* booleans */
+ lua_CFunction f; /* light C functions */
+ numfield /* numbers */
+};
+
+
+struct lua_TValue {
+ TValuefields;
+};
+
+
+typedef TValue *StkId; /* index to stack elements */
+
+
+
+
+/*
+** Header for string value; string bytes follow the end of this structure
+*/
+typedef union TString {
+ L_Umaxalign dummy; /* ensures maximum alignment for strings */
+ struct {
+ CommonHeader;
+ lu_byte extra; /* reserved words for short strings; "has hash" for longs */
+ unsigned int hash;
+ size_t len; /* number of characters in string */
+ } tsv;
+} TString;
+
+
+/* get the actual string (array of bytes) from a TString */
+#define getstr(ts) cast(const char *, (ts) + 1)
+
+/* get the actual string (array of bytes) from a Lua value */
+#define svalue(o) getstr(rawtsvalue(o))
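+
+/*
+** Editor's illustrative sketch (not part of upstream Lua or of this port):
+** the bytes of a string are stored immediately after its TString header,
+** which is what getstr()/svalue() rely on.  Lua strings may contain
+** embedded '\0', so the 'len' field is authoritative, not strlen().
+*/
+#if 0	/* illustrative only; never compiled */
+static const char *example_getstr (const TValue *o) {
+  TString *ts = rawtsvalue(o);      /* header of the string object */
+  size_t len = ts->tsv.len;         /* byte count kept in the header */
+  const char *bytes = getstr(ts);   /* data starts right after the header */
+  return (len > 0) ? bytes : "";
+}
+#endif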
+
+
+/*
+** Header for userdata; memory area follows the end of this structure
+*/
+typedef union Udata {
+ L_Umaxalign dummy; /* ensures maximum alignment for `local' udata */
+ struct {
+ CommonHeader;
+ struct Table *metatable;
+ struct Table *env;
+ size_t len; /* number of bytes */
+ } uv;
+} Udata;
+
+
+
+/*
+** Description of an upvalue for function prototypes
+*/
+typedef struct Upvaldesc {
+ TString *name; /* upvalue name (for debug information) */
+ lu_byte instack; /* whether it is in stack */
+ lu_byte idx; /* index of upvalue (in stack or in outer function's list) */
+} Upvaldesc;
+
+
+/*
+** Description of a local variable for function prototypes
+** (used for debug information)
+*/
+typedef struct LocVar {
+ TString *varname;
+ int startpc; /* first point where variable is active */
+ int endpc; /* first point where variable is dead */
+} LocVar;
+
+
+/*
+** Function Prototypes
+*/
+typedef struct Proto {
+ CommonHeader;
+ TValue *k; /* constants used by the function */
+ Instruction *code;
+ struct Proto **p; /* functions defined inside the function */
+ int *lineinfo; /* map from opcodes to source lines (debug information) */
+ LocVar *locvars; /* information about local variables (debug information) */
+ Upvaldesc *upvalues; /* upvalue information */
+ union Closure *cache; /* last created closure with this prototype */
+ TString *source; /* used for debug information */
+ int sizeupvalues; /* size of 'upvalues' */
+ int sizek; /* size of `k' */
+ int sizecode;
+ int sizelineinfo;
+ int sizep; /* size of `p' */
+ int sizelocvars;
+ int linedefined;
+ int lastlinedefined;
+ GCObject *gclist;
+ lu_byte numparams; /* number of fixed parameters */
+ lu_byte is_vararg;
+ lu_byte maxstacksize; /* maximum stack used by this function */
+} Proto;
+
+
+
+/*
+** Lua Upvalues
+*/
+typedef struct UpVal {
+ CommonHeader;
+ TValue *v; /* points to stack or to its own value */
+ union {
+ TValue value; /* the value (when closed) */
+ struct { /* double linked list (when open) */
+ struct UpVal *prev;
+ struct UpVal *next;
+ } l;
+ } u;
+} UpVal;
+
+
+/*
+** Closures
+*/
+
+#define ClosureHeader \
+ CommonHeader; lu_byte nupvalues; GCObject *gclist
+
+typedef struct CClosure {
+ ClosureHeader;
+ lua_CFunction f;
+ TValue upvalue[1]; /* list of upvalues */
+} CClosure;
+
+
+typedef struct LClosure {
+ ClosureHeader;
+ struct Proto *p;
+ UpVal *upvals[1]; /* list of upvalues */
+} LClosure;
+
+
+typedef union Closure {
+ CClosure c;
+ LClosure l;
+} Closure;
+
+
+#define isLfunction(o) ttisLclosure(o)
+
+#define getproto(o) (clLvalue(o)->p)
+
+
+/*
+** Tables
+*/
+
+typedef union TKey {
+ struct {
+ TValuefields;
+ struct Node *next; /* for chaining */
+ } nk;
+ TValue tvk;
+} TKey;
+
+
+typedef struct Node {
+ TValue i_val;
+ TKey i_key;
+} Node;
+
+
+typedef struct Table {
+ CommonHeader;
+ lu_byte flags; /* 1<<p means tagmethod(p) is not present */
+ lu_byte lsizenode; /* log2 of size of `node' array */
+ int sizearray; /* size of `array' array */
+ TValue *array; /* array part */
+ Node *node;
+ Node *lastfree; /* any free position is before this position */
+ struct Table *metatable;
+ GCObject *gclist;
+} Table;
+
+
+
+/*
+** `modulo' operation for hashing (size is always a power of 2)
+*/
+#define lmod(s,size) \
+ (check_exp((size&(size-1))==0, (cast(int, (s) & ((size)-1)))))
+
+
+#define twoto(x) (1<<(x))
+#define sizenode(t) (twoto((t)->lsizenode))
+
+
+/*
+** (address of) a fixed nil value
+*/
+#define luaO_nilobject (&luaO_nilobject_)
+
+
+LUAI_DDEC const TValue luaO_nilobject_;
+
+
+LUAI_FUNC int luaO_int2fb (unsigned int x);
+LUAI_FUNC int luaO_fb2int (int x);
+LUAI_FUNC int luaO_ceillog2 (unsigned int x);
+LUAI_FUNC lua_Number luaO_arith (int op, lua_Number v1, lua_Number v2);
+LUAI_FUNC int luaO_str2d (const char *s, size_t len, lua_Number *result);
+LUAI_FUNC int luaO_hexavalue (int c);
+LUAI_FUNC const char *luaO_pushvfstring (lua_State *L, const char *fmt,
+ va_list argp);
+LUAI_FUNC const char *luaO_pushfstring (lua_State *L, const char *fmt, ...);
+LUAI_FUNC void luaO_chunkid (char *out, const char *source, size_t len);
+
+
+#endif
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/lua/lopcodes.c b/sys/contrib/openzfs/module/lua/lopcodes.c
new file mode 100644
index 000000000000..5f34e6d90515
--- /dev/null
+++ b/sys/contrib/openzfs/module/lua/lopcodes.c
@@ -0,0 +1,108 @@
+/* BEGIN CSTYLED */
+/*
+** $Id: lopcodes.c,v 1.49.1.1 2013/04/12 18:48:47 roberto Exp $
+** Opcodes for Lua virtual machine
+** See Copyright Notice in lua.h
+*/
+
+
+#define lopcodes_c
+#define LUA_CORE
+
+
+#include "lopcodes.h"
+
+
+/* ORDER OP */
+
+LUAI_DDEF const char *const luaP_opnames[NUM_OPCODES+1] = {
+ "MOVE",
+ "LOADK",
+ "LOADKX",
+ "LOADBOOL",
+ "LOADNIL",
+ "GETUPVAL",
+ "GETTABUP",
+ "GETTABLE",
+ "SETTABUP",
+ "SETUPVAL",
+ "SETTABLE",
+ "NEWTABLE",
+ "SELF",
+ "ADD",
+ "SUB",
+ "MUL",
+ "DIV",
+ "MOD",
+ "POW",
+ "UNM",
+ "NOT",
+ "LEN",
+ "CONCAT",
+ "JMP",
+ "EQ",
+ "LT",
+ "LE",
+ "TEST",
+ "TESTSET",
+ "CALL",
+ "TAILCALL",
+ "RETURN",
+ "FORLOOP",
+ "FORPREP",
+ "TFORCALL",
+ "TFORLOOP",
+ "SETLIST",
+ "CLOSURE",
+ "VARARG",
+ "EXTRAARG",
+ NULL
+};
+
+
+#define opmode(t,a,b,c,m) (((t)<<7) | ((a)<<6) | ((b)<<4) | ((c)<<2) | (m))
+
+LUAI_DDEF const lu_byte luaP_opmodes[NUM_OPCODES] = {
+/* T A B C mode opcode */
+ opmode(0, 1, OpArgR, OpArgN, iABC) /* OP_MOVE */
+ ,opmode(0, 1, OpArgK, OpArgN, iABx) /* OP_LOADK */
+ ,opmode(0, 1, OpArgN, OpArgN, iABx) /* OP_LOADKX */
+ ,opmode(0, 1, OpArgU, OpArgU, iABC) /* OP_LOADBOOL */
+ ,opmode(0, 1, OpArgU, OpArgN, iABC) /* OP_LOADNIL */
+ ,opmode(0, 1, OpArgU, OpArgN, iABC) /* OP_GETUPVAL */
+ ,opmode(0, 1, OpArgU, OpArgK, iABC) /* OP_GETTABUP */
+ ,opmode(0, 1, OpArgR, OpArgK, iABC) /* OP_GETTABLE */
+ ,opmode(0, 0, OpArgK, OpArgK, iABC) /* OP_SETTABUP */
+ ,opmode(0, 0, OpArgU, OpArgN, iABC) /* OP_SETUPVAL */
+ ,opmode(0, 0, OpArgK, OpArgK, iABC) /* OP_SETTABLE */
+ ,opmode(0, 1, OpArgU, OpArgU, iABC) /* OP_NEWTABLE */
+ ,opmode(0, 1, OpArgR, OpArgK, iABC) /* OP_SELF */
+ ,opmode(0, 1, OpArgK, OpArgK, iABC) /* OP_ADD */
+ ,opmode(0, 1, OpArgK, OpArgK, iABC) /* OP_SUB */
+ ,opmode(0, 1, OpArgK, OpArgK, iABC) /* OP_MUL */
+ ,opmode(0, 1, OpArgK, OpArgK, iABC) /* OP_DIV */
+ ,opmode(0, 1, OpArgK, OpArgK, iABC) /* OP_MOD */
+ ,opmode(0, 1, OpArgK, OpArgK, iABC) /* OP_POW */
+ ,opmode(0, 1, OpArgR, OpArgN, iABC) /* OP_UNM */
+ ,opmode(0, 1, OpArgR, OpArgN, iABC) /* OP_NOT */
+ ,opmode(0, 1, OpArgR, OpArgN, iABC) /* OP_LEN */
+ ,opmode(0, 1, OpArgR, OpArgR, iABC) /* OP_CONCAT */
+ ,opmode(0, 0, OpArgR, OpArgN, iAsBx) /* OP_JMP */
+ ,opmode(1, 0, OpArgK, OpArgK, iABC) /* OP_EQ */
+ ,opmode(1, 0, OpArgK, OpArgK, iABC) /* OP_LT */
+ ,opmode(1, 0, OpArgK, OpArgK, iABC) /* OP_LE */
+ ,opmode(1, 0, OpArgN, OpArgU, iABC) /* OP_TEST */
+ ,opmode(1, 1, OpArgR, OpArgU, iABC) /* OP_TESTSET */
+ ,opmode(0, 1, OpArgU, OpArgU, iABC) /* OP_CALL */
+ ,opmode(0, 1, OpArgU, OpArgU, iABC) /* OP_TAILCALL */
+ ,opmode(0, 0, OpArgU, OpArgN, iABC) /* OP_RETURN */
+ ,opmode(0, 1, OpArgR, OpArgN, iAsBx) /* OP_FORLOOP */
+ ,opmode(0, 1, OpArgR, OpArgN, iAsBx) /* OP_FORPREP */
+ ,opmode(0, 0, OpArgN, OpArgU, iABC) /* OP_TFORCALL */
+ ,opmode(0, 1, OpArgR, OpArgN, iAsBx) /* OP_TFORLOOP */
+ ,opmode(0, 0, OpArgU, OpArgU, iABC) /* OP_SETLIST */
+ ,opmode(0, 1, OpArgU, OpArgN, iABx) /* OP_CLOSURE */
+ ,opmode(0, 1, OpArgU, OpArgN, iABC) /* OP_VARARG */
+ ,opmode(0, 0, OpArgU, OpArgU, iAx) /* OP_EXTRAARG */
+};
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/lua/lopcodes.h b/sys/contrib/openzfs/module/lua/lopcodes.h
new file mode 100644
index 000000000000..02eeec1ecd06
--- /dev/null
+++ b/sys/contrib/openzfs/module/lua/lopcodes.h
@@ -0,0 +1,290 @@
+/* BEGIN CSTYLED */
+/*
+** $Id: lopcodes.h,v 1.142.1.2 2014/10/20 18:32:09 roberto Exp $
+** Opcodes for Lua virtual machine
+** See Copyright Notice in lua.h
+*/
+
+#ifndef lopcodes_h
+#define lopcodes_h
+
+#include "llimits.h"
+
+
+/*===========================================================================
+ We assume that instructions are unsigned numbers.
+ All instructions have an opcode in the first 6 bits.
+ Instructions can have the following fields:
+ `A' : 8 bits
+ `B' : 9 bits
+ `C' : 9 bits
+ 'Ax' : 26 bits ('A', 'B', and 'C' together)
+ `Bx' : 18 bits (`B' and `C' together)
+ `sBx' : signed Bx
+
+ A signed argument is represented in excess K; that is, the number
+ value is the unsigned value minus K. K is exactly the maximum value
+ for that argument (so that -max is represented by 0, and +max is
+ represented by 2*max), which is half the maximum for the corresponding
+ unsigned argument.
+===========================================================================*/
+
+
+enum OpMode {iABC, iABx, iAsBx, iAx}; /* basic instruction format */
+
+
+/*
+** size and position of opcode arguments.
+*/
+#define SIZE_C 9
+#define SIZE_B 9
+#define SIZE_Bx (SIZE_C + SIZE_B)
+#define SIZE_A 8
+#define SIZE_Ax (SIZE_C + SIZE_B + SIZE_A)
+
+#define SIZE_OP 6
+
+#define POS_OP 0
+#define POS_A (POS_OP + SIZE_OP)
+#define POS_C (POS_A + SIZE_A)
+#define POS_B (POS_C + SIZE_C)
+#define POS_Bx POS_C
+#define POS_Ax POS_A
+
+
+/*
+** limits for opcode arguments.
+** we use (signed) int to manipulate most arguments,
+** so they must fit in LUAI_BITSINT-1 bits (-1 for sign)
+*/
+#if SIZE_Bx < LUAI_BITSINT-1
+#define MAXARG_Bx ((1<<SIZE_Bx)-1)
+#define MAXARG_sBx (MAXARG_Bx>>1) /* `sBx' is signed */
+#else
+#define MAXARG_Bx MAX_INT
+#define MAXARG_sBx MAX_INT
+#endif
+
+#if SIZE_Ax < LUAI_BITSINT-1
+#define MAXARG_Ax ((1<<SIZE_Ax)-1)
+#else
+#define MAXARG_Ax MAX_INT
+#endif
+
+
+#define MAXARG_A ((1<<SIZE_A)-1)
+#define MAXARG_B ((1<<SIZE_B)-1)
+#define MAXARG_C ((1<<SIZE_C)-1)
+
+
+/* creates a mask with `n' 1 bits at position `p' */
+#define MASK1(n,p) ((~((~(Instruction)0)<<(n)))<<(p))
+
+/* creates a mask with `n' 0 bits at position `p' */
+#define MASK0(n,p) (~MASK1(n,p))
+
+/*
+** the following macros help to manipulate instructions
+*/
+
+#define GET_OPCODE(i) (cast(OpCode, ((i)>>POS_OP) & MASK1(SIZE_OP,0)))
+#define SET_OPCODE(i,o) ((i) = (((i)&MASK0(SIZE_OP,POS_OP)) | \
+ ((cast(Instruction, o)<<POS_OP)&MASK1(SIZE_OP,POS_OP))))
+
+#define getarg(i,pos,size) (cast(int, ((i)>>pos) & MASK1(size,0)))
+#define setarg(i,v,pos,size) ((i) = (((i)&MASK0(size,pos)) | \
+ ((cast(Instruction, v)<<pos)&MASK1(size,pos))))
+
+#define GETARG_A(i) getarg(i, POS_A, SIZE_A)
+#define SETARG_A(i,v) setarg(i, v, POS_A, SIZE_A)
+
+#define GETARG_B(i) getarg(i, POS_B, SIZE_B)
+#define SETARG_B(i,v) setarg(i, v, POS_B, SIZE_B)
+
+#define GETARG_C(i) getarg(i, POS_C, SIZE_C)
+#define SETARG_C(i,v) setarg(i, v, POS_C, SIZE_C)
+
+#define GETARG_Bx(i) getarg(i, POS_Bx, SIZE_Bx)
+#define SETARG_Bx(i,v) setarg(i, v, POS_Bx, SIZE_Bx)
+
+#define GETARG_Ax(i) getarg(i, POS_Ax, SIZE_Ax)
+#define SETARG_Ax(i,v) setarg(i, v, POS_Ax, SIZE_Ax)
+
+#define GETARG_sBx(i) (GETARG_Bx(i)-MAXARG_sBx)
+#define SETARG_sBx(i,b) SETARG_Bx((i),cast(unsigned int, (b)+MAXARG_sBx))
+
+
+#define CREATE_ABC(o,a,b,c) ((cast(Instruction, o)<<POS_OP) \
+ | (cast(Instruction, a)<<POS_A) \
+ | (cast(Instruction, b)<<POS_B) \
+ | (cast(Instruction, c)<<POS_C))
+
+#define CREATE_ABx(o,a,bc) ((cast(Instruction, o)<<POS_OP) \
+ | (cast(Instruction, a)<<POS_A) \
+ | (cast(Instruction, bc)<<POS_Bx))
+
+#define CREATE_Ax(o,a) ((cast(Instruction, o)<<POS_OP) \
+ | (cast(Instruction, a)<<POS_Ax))
+
+
+/*
+** Macros to operate RK indices
+*/
+
+/* if this bit is 1, the value is a constant index (0 means a register) */
+#define BITRK (1 << (SIZE_B - 1))
+
+/* test whether value is a constant */
+#define ISK(x) ((x) & BITRK)
+
+/* gets the index of the constant */
+#define INDEXK(r) ((int)(r) & ~BITRK)
+
+#define MAXINDEXRK (BITRK - 1)
+
+/* code a constant index as a RK value */
+#define RKASK(x) ((x) | BITRK)
+
+
+/*
+** invalid register that fits in 8 bits
+*/
+#define NO_REG MAXARG_A
+
+
+/*
+** R(x) - register
+** Kst(x) - constant (in constant table)
+** RK(x) == if ISK(x) then Kst(INDEXK(x)) else R(x)
+*/
+
+
+/*
+** grep "ORDER OP" if you change these enums
+*/
+
+typedef enum {
+/*----------------------------------------------------------------------
+name args description
+------------------------------------------------------------------------*/
+OP_MOVE,/* A B R(A) := R(B) */
+OP_LOADK,/* A Bx R(A) := Kst(Bx) */
+OP_LOADKX,/* A R(A) := Kst(extra arg) */
+OP_LOADBOOL,/* A B C R(A) := (Bool)B; if (C) pc++ */
+OP_LOADNIL,/* A B R(A), R(A+1), ..., R(A+B) := nil */
+OP_GETUPVAL,/* A B R(A) := UpValue[B] */
+
+OP_GETTABUP,/* A B C R(A) := UpValue[B][RK(C)] */
+OP_GETTABLE,/* A B C R(A) := R(B)[RK(C)] */
+
+OP_SETTABUP,/* A B C UpValue[A][RK(B)] := RK(C) */
+OP_SETUPVAL,/* A B UpValue[B] := R(A) */
+OP_SETTABLE,/* A B C R(A)[RK(B)] := RK(C) */
+
+OP_NEWTABLE,/* A B C R(A) := {} (size = B,C) */
+
+OP_SELF,/* A B C R(A+1) := R(B); R(A) := R(B)[RK(C)] */
+
+OP_ADD,/* A B C R(A) := RK(B) + RK(C) */
+OP_SUB,/* A B C R(A) := RK(B) - RK(C) */
+OP_MUL,/* A B C R(A) := RK(B) * RK(C) */
+OP_DIV,/* A B C R(A) := RK(B) / RK(C) */
+OP_MOD,/* A B C R(A) := RK(B) % RK(C) */
+OP_POW,/* A B C R(A) := RK(B) ^ RK(C) */
+OP_UNM,/* A B R(A) := -R(B) */
+OP_NOT,/* A B R(A) := not R(B) */
+OP_LEN,/* A B R(A) := length of R(B) */
+
+OP_CONCAT,/* A B C R(A) := R(B).. ... ..R(C) */
+
+OP_JMP,/* A sBx pc+=sBx; if (A) close all upvalues >= R(A - 1) */
+OP_EQ,/* A B C if ((RK(B) == RK(C)) ~= A) then pc++ */
+OP_LT,/* A B C if ((RK(B) < RK(C)) ~= A) then pc++ */
+OP_LE,/* A B C if ((RK(B) <= RK(C)) ~= A) then pc++ */
+
+OP_TEST,/* A C if not (R(A) <=> C) then pc++ */
+OP_TESTSET,/* A B C if (R(B) <=> C) then R(A) := R(B) else pc++ */
+
+OP_CALL,/* A B C R(A), ... ,R(A+C-2) := R(A)(R(A+1), ... ,R(A+B-1)) */
+OP_TAILCALL,/* A B C return R(A)(R(A+1), ... ,R(A+B-1)) */
+OP_RETURN,/* A B return R(A), ... ,R(A+B-2) (see note) */
+
+OP_FORLOOP,/* A sBx R(A)+=R(A+2);
+ if R(A) <?= R(A+1) then { pc+=sBx; R(A+3)=R(A) }*/
+OP_FORPREP,/* A sBx R(A)-=R(A+2); pc+=sBx */
+
+OP_TFORCALL,/* A C R(A+3), ... ,R(A+2+C) := R(A)(R(A+1), R(A+2)); */
+OP_TFORLOOP,/* A sBx if R(A+1) ~= nil then { R(A)=R(A+1); pc += sBx }*/
+
+OP_SETLIST,/* A B C R(A)[(C-1)*FPF+i] := R(A+i), 1 <= i <= B */
+
+OP_CLOSURE,/* A Bx R(A) := closure(KPROTO[Bx]) */
+
+OP_VARARG,/* A B R(A), R(A+1), ..., R(A+B-2) = vararg */
+
+OP_EXTRAARG/* Ax extra (larger) argument for previous opcode */
+} OpCode;
+
+
+#define NUM_OPCODES (cast(int, OP_EXTRAARG) + 1)
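+
+/*
+** Editor's illustrative sketch (not part of upstream Lua or of this port):
+** packing and unpacking an instruction with the field layout described at
+** the top of this header.
+*/
+#if 0	/* illustrative only; never compiled */
+static void example_encode (void) {
+  /* OP_ADD is iABC: R(A) := RK(B) + RK(C) */
+  Instruction i = CREATE_ABC(OP_ADD, 0, 1, RKASK(2));  /* C is constant #2 */
+  lua_assert(GET_OPCODE(i) == OP_ADD && GETARG_A(i) == 0);
+  lua_assert(!ISK(GETARG_B(i)) && ISK(GETARG_C(i)));
+  lua_assert(INDEXK(GETARG_C(i)) == 2);
+  /* sBx arguments are stored in excess-MAXARG_sBx form */
+  i = CREATE_ABx(OP_JMP, 0, 0);
+  SETARG_sBx(i, -3);                /* e.g. a backward jump of 3 */
+  lua_assert(GETARG_sBx(i) == -3);
+}
+#endif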
+
+
+
+/*===========================================================================
+ Notes:
+ (*) In OP_CALL, if (B == 0) then B = top. If (C == 0), then `top' is
+ set to last_result+1, so next open instruction (OP_CALL, OP_RETURN,
+ OP_SETLIST) may use `top'.
+
+ (*) In OP_VARARG, if (B == 0) then use actual number of varargs and
+ set top (like in OP_CALL with C == 0).
+
+ (*) In OP_RETURN, if (B == 0) then return up to `top'.
+
+ (*) In OP_SETLIST, if (B == 0) then B = `top'; if (C == 0) then next
+ 'instruction' is EXTRAARG(real C).
+
+ (*) In OP_LOADKX, the next 'instruction' is always EXTRAARG.
+
+ (*) For comparisons, A specifies what condition the test should accept
+ (true or false).
+
+ (*) All `skips' (pc++) assume that next instruction is a jump.
+
+===========================================================================*/
+
+
+/*
+** masks for instruction properties. The format is:
+** bits 0-1: op mode
+** bits 2-3: C arg mode
+** bits 4-5: B arg mode
+** bit 6: instruction set register A
+** bit 7: operator is a test (next instruction must be a jump)
+*/
+
+enum OpArgMask {
+ OpArgN, /* argument is not used */
+ OpArgU, /* argument is used */
+ OpArgR, /* argument is a register or a jump offset */
+ OpArgK /* argument is a constant or register/constant */
+};
+
+LUAI_DDEC const lu_byte luaP_opmodes[NUM_OPCODES];
+
+#define getOpMode(m) (cast(enum OpMode, luaP_opmodes[m] & 3))
+#define getBMode(m) (cast(enum OpArgMask, (luaP_opmodes[m] >> 4) & 3))
+#define getCMode(m) (cast(enum OpArgMask, (luaP_opmodes[m] >> 2) & 3))
+#define testAMode(m) (luaP_opmodes[m] & (1 << 6))
+#define testTMode(m) (luaP_opmodes[m] & (1 << 7))
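+
+/*
+** Editor's illustrative sketch (not part of upstream Lua or of this port):
+** decoding the packed mode byte for two opcodes (see the table that
+** initializes luaP_opmodes in lopcodes.c).
+*/
+#if 0	/* illustrative only; never compiled */
+static void example_opmodes (void) {
+  /* OP_ADD writes register A, takes two RK operands, and is not a test */
+  lua_assert(getOpMode(OP_ADD) == iABC);
+  lua_assert(getBMode(OP_ADD) == OpArgK && getCMode(OP_ADD) == OpArgK);
+  lua_assert(testAMode(OP_ADD) && !testTMode(OP_ADD));
+  /* OP_EQ is a test: the following instruction must be a jump */
+  lua_assert(testTMode(OP_EQ) && !testAMode(OP_EQ));
+}
+#endif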
+
+
+LUAI_DDEC const char *const luaP_opnames[NUM_OPCODES+1]; /* opcode names */
+
+
+/* number of list items to accumulate before a SETLIST instruction */
+#define LFIELDS_PER_FLUSH 50
+
+
+#endif
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/lua/lparser.c b/sys/contrib/openzfs/module/lua/lparser.c
new file mode 100644
index 000000000000..e1dd88f2f654
--- /dev/null
+++ b/sys/contrib/openzfs/module/lua/lparser.c
@@ -0,0 +1,1643 @@
+/* BEGIN CSTYLED */
+/*
+** $Id: lparser.c,v 2.130.1.1 2013/04/12 18:48:47 roberto Exp $
+** Lua Parser
+** See Copyright Notice in lua.h
+*/
+
+#define lparser_c
+#define LUA_CORE
+
+#include <sys/lua/lua.h>
+
+#include "lcode.h"
+#include "ldebug.h"
+#include "ldo.h"
+#include "lfunc.h"
+#include "llex.h"
+#include "lmem.h"
+#include "lobject.h"
+#include "lopcodes.h"
+#include "lparser.h"
+#include "lstate.h"
+#include "lstring.h"
+#include "ltable.h"
+
+
+
+/* maximum number of local variables per function (must be smaller
+ than 250, due to the bytecode format) */
+#define MAXVARS 200
+
+
+#define hasmultret(k) ((k) == VCALL || (k) == VVARARG)
+
+
+
+/*
+** nodes for block list (list of active blocks)
+*/
+typedef struct BlockCnt {
+ struct BlockCnt *previous; /* chain */
+ short firstlabel; /* index of first label in this block */
+ short firstgoto; /* index of first pending goto in this block */
+ lu_byte nactvar; /* # active locals outside the block */
+ lu_byte upval; /* true if some variable in the block is an upvalue */
+ lu_byte isloop; /* true if `block' is a loop */
+} BlockCnt;
+
+
+
+/*
+** prototypes for recursive non-terminal functions
+*/
+static void statement (LexState *ls);
+static void expr (LexState *ls, expdesc *v);
+
+
+static void anchor_token (LexState *ls) {
+ /* last token from outer function must be EOS */
+ lua_assert(ls->fs != NULL || ls->t.token == TK_EOS);
+ if (ls->t.token == TK_NAME || ls->t.token == TK_STRING) {
+ TString *ts = ls->t.seminfo.ts;
+ luaX_newstring(ls, getstr(ts), ts->tsv.len);
+ }
+}
+
+
+/* semantic error */
+static l_noret semerror (LexState *ls, const char *msg) {
+ ls->t.token = 0; /* remove 'near to' from final message */
+ luaX_syntaxerror(ls, msg);
+}
+
+
+static l_noret error_expected (LexState *ls, int token) {
+ luaX_syntaxerror(ls,
+ luaO_pushfstring(ls->L, "%s expected", luaX_token2str(ls, token)));
+}
+
+
+static l_noret errorlimit (FuncState *fs, int limit, const char *what) {
+ lua_State *L = fs->ls->L;
+ const char *msg;
+ int line = fs->f->linedefined;
+ const char *where = (line == 0)
+ ? "main function"
+ : luaO_pushfstring(L, "function at line %d", line);
+ msg = luaO_pushfstring(L, "too many %s (limit is %d) in %s",
+ what, limit, where);
+ luaX_syntaxerror(fs->ls, msg);
+}
+
+
+static void checklimit (FuncState *fs, int v, int l, const char *what) {
+ if (v > l) errorlimit(fs, l, what);
+}
+
+
+static int testnext (LexState *ls, int c) {
+ if (ls->t.token == c) {
+ luaX_next(ls);
+ return 1;
+ }
+ else return 0;
+}
+
+
+static void check (LexState *ls, int c) {
+ if (ls->t.token != c)
+ error_expected(ls, c);
+}
+
+
+static void checknext (LexState *ls, int c) {
+ check(ls, c);
+ luaX_next(ls);
+}
+
+
+#define check_condition(ls,c,msg) { if (!(c)) luaX_syntaxerror(ls, msg); }
+
+
+
+static void check_match (LexState *ls, int what, int who, int where) {
+ if (!testnext(ls, what)) {
+ if (where == ls->linenumber)
+ error_expected(ls, what);
+ else {
+ luaX_syntaxerror(ls, luaO_pushfstring(ls->L,
+ "%s expected (to close %s at line %d)",
+ luaX_token2str(ls, what), luaX_token2str(ls, who), where));
+ }
+ }
+}
+
+
+static TString *str_checkname (LexState *ls) {
+ TString *ts;
+ check(ls, TK_NAME);
+ ts = ls->t.seminfo.ts;
+ luaX_next(ls);
+ return ts;
+}
+
+
+static void init_exp (expdesc *e, expkind k, int i) {
+ e->f = e->t = NO_JUMP;
+ e->k = k;
+ e->u.info = i;
+}
+
+
+static void codestring (LexState *ls, expdesc *e, TString *s) {
+ init_exp(e, VK, luaK_stringK(ls->fs, s));
+}
+
+
+static void checkname (LexState *ls, expdesc *e) {
+ codestring(ls, e, str_checkname(ls));
+}
+
+
+static int registerlocalvar (LexState *ls, TString *varname) {
+ FuncState *fs = ls->fs;
+ Proto *f = fs->f;
+ int oldsize = f->sizelocvars;
+ luaM_growvector(ls->L, f->locvars, fs->nlocvars, f->sizelocvars,
+ LocVar, SHRT_MAX, "local variables");
+ while (oldsize < f->sizelocvars) f->locvars[oldsize++].varname = NULL;
+ f->locvars[fs->nlocvars].varname = varname;
+ luaC_objbarrier(ls->L, f, varname);
+ return fs->nlocvars++;
+}
+
+
+static void new_localvar (LexState *ls, TString *name) {
+ FuncState *fs = ls->fs;
+ Dyndata *dyd = ls->dyd;
+ int reg = registerlocalvar(ls, name);
+ checklimit(fs, dyd->actvar.n + 1 - fs->firstlocal,
+ MAXVARS, "local variables");
+ luaM_growvector(ls->L, dyd->actvar.arr, dyd->actvar.n + 1,
+ dyd->actvar.size, Vardesc, MAX_INT, "local variables");
+ dyd->actvar.arr[dyd->actvar.n++].idx = cast(short, reg);
+}
+
+
+static void new_localvarliteral_ (LexState *ls, const char *name, size_t sz) {
+ new_localvar(ls, luaX_newstring(ls, name, sz));
+}
+
+#define new_localvarliteral(ls,v) \
+ new_localvarliteral_(ls, "" v, (sizeof(v)/sizeof(char))-1)
+
+
+static LocVar *getlocvar (FuncState *fs, int i) {
+ int idx = fs->ls->dyd->actvar.arr[fs->firstlocal + i].idx;
+ lua_assert(idx < fs->nlocvars);
+ return &fs->f->locvars[idx];
+}
+
+
+static void adjustlocalvars (LexState *ls, int nvars) {
+ FuncState *fs = ls->fs;
+ fs->nactvar = cast_byte(fs->nactvar + nvars);
+ for (; nvars; nvars--) {
+ getlocvar(fs, fs->nactvar - nvars)->startpc = fs->pc;
+ }
+}
+
+
+static void removevars (FuncState *fs, int tolevel) {
+ fs->ls->dyd->actvar.n -= (fs->nactvar - tolevel);
+ while (fs->nactvar > tolevel)
+ getlocvar(fs, --fs->nactvar)->endpc = fs->pc;
+}
+
+
+static int searchupvalue (FuncState *fs, TString *name) {
+ int i;
+ Upvaldesc *up = fs->f->upvalues;
+ for (i = 0; i < fs->nups; i++) {
+ if (luaS_eqstr(up[i].name, name)) return i;
+ }
+ return -1; /* not found */
+}
+
+
+static int newupvalue (FuncState *fs, TString *name, expdesc *v) {
+ Proto *f = fs->f;
+ int oldsize = f->sizeupvalues;
+ checklimit(fs, fs->nups + 1, MAXUPVAL, "upvalues");
+ luaM_growvector(fs->ls->L, f->upvalues, fs->nups, f->sizeupvalues,
+ Upvaldesc, MAXUPVAL, "upvalues");
+ while (oldsize < f->sizeupvalues) f->upvalues[oldsize++].name = NULL;
+ f->upvalues[fs->nups].instack = (v->k == VLOCAL);
+ f->upvalues[fs->nups].idx = cast_byte(v->u.info);
+ f->upvalues[fs->nups].name = name;
+ luaC_objbarrier(fs->ls->L, f, name);
+ return fs->nups++;
+}
+
+
+static int searchvar (FuncState *fs, TString *n) {
+ int i;
+ for (i = cast_int(fs->nactvar) - 1; i >= 0; i--) {
+ if (luaS_eqstr(n, getlocvar(fs, i)->varname))
+ return i;
+ }
+ return -1; /* not found */
+}
+
+
+/*
+ Mark block where variable at given level was defined
+ (to emit close instructions later).
+*/
+static void markupval (FuncState *fs, int level) {
+ BlockCnt *bl = fs->bl;
+ while (bl->nactvar > level) bl = bl->previous;
+ bl->upval = 1;
+}
+
+
+/*
+ Find variable with given name 'n'. If it is an upvalue, add this
+ upvalue into all intermediate functions.
+*/
+static int singlevaraux (FuncState *fs, TString *n, expdesc *var, int base) {
+ if (fs == NULL) /* no more levels? */
+ return VVOID; /* default is global */
+ else {
+ int v = searchvar(fs, n); /* look up locals at current level */
+ if (v >= 0) { /* found? */
+ init_exp(var, VLOCAL, v); /* variable is local */
+ if (!base)
+ markupval(fs, v); /* local will be used as an upval */
+ return VLOCAL;
+ }
+ else { /* not found as local at current level; try upvalues */
+ int idx = searchupvalue(fs, n); /* try existing upvalues */
+ if (idx < 0) { /* not found? */
+ if (singlevaraux(fs->prev, n, var, 0) == VVOID) /* try upper levels */
+ return VVOID; /* not found; is a global */
+ /* else was LOCAL or UPVAL */
+ idx = newupvalue(fs, n, var); /* will be a new upvalue */
+ }
+ init_exp(var, VUPVAL, idx);
+ return VUPVAL;
+ }
+ }
+}
+
+
+static void singlevar (LexState *ls, expdesc *var) {
+ TString *varname = str_checkname(ls);
+ FuncState *fs = ls->fs;
+ if (singlevaraux(fs, varname, var, 1) == VVOID) { /* global name? */
+ expdesc key;
+ singlevaraux(fs, ls->envn, var, 1); /* get environment variable */
+ lua_assert(var->k == VLOCAL || var->k == VUPVAL);
+ codestring(ls, &key, varname); /* key is variable name */
+ luaK_indexed(fs, var, &key); /* env[varname] */
+ }
+}
+
+
+static void adjust_assign (LexState *ls, int nvars, int nexps, expdesc *e) {
+ FuncState *fs = ls->fs;
+ int extra = nvars - nexps;
+ if (hasmultret(e->k)) {
+ extra++; /* includes call itself */
+ if (extra < 0) extra = 0;
+ luaK_setreturns(fs, e, extra); /* last exp. provides the difference */
+ if (extra > 1) luaK_reserveregs(fs, extra-1);
+ }
+ else {
+ if (e->k != VVOID) luaK_exp2nextreg(fs, e); /* close last expression */
+ if (extra > 0) {
+ int reg = fs->freereg;
+ luaK_reserveregs(fs, extra);
+ luaK_nil(fs, reg, extra);
+ }
+ }
+}
+
+
+static void enterlevel (LexState *ls) {
+ lua_State *L = ls->L;
+ ++L->nCcalls;
+ checklimit(ls->fs, L->nCcalls, LUAI_MAXCCALLS, "C levels");
+}
+
+
+#define leavelevel(ls) ((ls)->L->nCcalls--)
+
+
+static void closegoto (LexState *ls, int g, Labeldesc *label) {
+ int i;
+ FuncState *fs = ls->fs;
+ Labellist *gl = &ls->dyd->gt;
+ Labeldesc *gt = &gl->arr[g];
+ lua_assert(luaS_eqstr(gt->name, label->name));
+ if (gt->nactvar < label->nactvar) {
+ TString *vname = getlocvar(fs, gt->nactvar)->varname;
+ const char *msg = luaO_pushfstring(ls->L,
+ "<goto %s> at line %d jumps into the scope of local " LUA_QS,
+ getstr(gt->name), gt->line, getstr(vname));
+ semerror(ls, msg);
+ }
+ luaK_patchlist(fs, gt->pc, label->pc);
+ /* remove goto from pending list */
+ for (i = g; i < gl->n - 1; i++)
+ gl->arr[i] = gl->arr[i + 1];
+ gl->n--;
+}
+
+
+/*
+** try to close a goto with existing labels; this solves backward jumps
+*/
+static int findlabel (LexState *ls, int g) {
+ int i;
+ BlockCnt *bl = ls->fs->bl;
+ Dyndata *dyd = ls->dyd;
+ Labeldesc *gt = &dyd->gt.arr[g];
+ /* check labels in current block for a match */
+ for (i = bl->firstlabel; i < dyd->label.n; i++) {
+ Labeldesc *lb = &dyd->label.arr[i];
+ if (luaS_eqstr(lb->name, gt->name)) { /* correct label? */
+ if (gt->nactvar > lb->nactvar &&
+ (bl->upval || dyd->label.n > bl->firstlabel))
+ luaK_patchclose(ls->fs, gt->pc, lb->nactvar);
+ closegoto(ls, g, lb); /* close it */
+ return 1;
+ }
+ }
+ return 0; /* label not found; cannot close goto */
+}
+
+
+static int newlabelentry (LexState *ls, Labellist *l, TString *name,
+ int line, int pc) {
+ int n = l->n;
+ luaM_growvector(ls->L, l->arr, n, l->size,
+ Labeldesc, SHRT_MAX, "labels/gotos");
+ l->arr[n].name = name;
+ l->arr[n].line = line;
+ l->arr[n].nactvar = ls->fs->nactvar;
+ l->arr[n].pc = pc;
+ l->n++;
+ return n;
+}
+
+
+/*
+** check whether new label 'lb' matches any pending gotos in current
+** block; solves forward jumps
+*/
+static void findgotos (LexState *ls, Labeldesc *lb) {
+ Labellist *gl = &ls->dyd->gt;
+ int i = ls->fs->bl->firstgoto;
+ while (i < gl->n) {
+ if (luaS_eqstr(gl->arr[i].name, lb->name))
+ closegoto(ls, i, lb);
+ else
+ i++;
+ }
+}
+
+
+/*
+** "export" pending gotos to outer level, to check them against
+** outer labels; if the block being exited has upvalues, and
+** the goto exits the scope of any variable (which can be the
+** upvalue), close those variables being exited.
+*/
+static void movegotosout (FuncState *fs, BlockCnt *bl) {
+ int i = bl->firstgoto;
+ Labellist *gl = &fs->ls->dyd->gt;
+  /* correct pending gotos to current block and try to close them
+     with visible labels */
+ while (i < gl->n) {
+ Labeldesc *gt = &gl->arr[i];
+ if (gt->nactvar > bl->nactvar) {
+ if (bl->upval)
+ luaK_patchclose(fs, gt->pc, bl->nactvar);
+ gt->nactvar = bl->nactvar;
+ }
+ if (!findlabel(fs->ls, i))
+ i++; /* move to next one */
+ }
+}
+
+
+static void enterblock (FuncState *fs, BlockCnt *bl, lu_byte isloop) {
+ bl->isloop = isloop;
+ bl->nactvar = fs->nactvar;
+ bl->firstlabel = fs->ls->dyd->label.n;
+ bl->firstgoto = fs->ls->dyd->gt.n;
+ bl->upval = 0;
+ bl->previous = fs->bl;
+ fs->bl = bl;
+ lua_assert(fs->freereg == fs->nactvar);
+}
+
+
+/*
+** create a label named "break" to resolve break statements
+*/
+static void breaklabel (LexState *ls) {
+ TString *n = luaS_new(ls->L, "break");
+ int l = newlabelentry(ls, &ls->dyd->label, n, 0, ls->fs->pc);
+ findgotos(ls, &ls->dyd->label.arr[l]);
+}
+
+/*
+** generates an error for an undefined 'goto'; choose appropriate
+** message when label name is a reserved word (which can only be 'break')
+*/
+static l_noret undefgoto (LexState *ls, Labeldesc *gt) {
+ const char *msg = isreserved(gt->name)
+ ? "<%s> at line %d not inside a loop"
+ : "no visible label " LUA_QS " for <goto> at line %d";
+ msg = luaO_pushfstring(ls->L, msg, getstr(gt->name), gt->line);
+ semerror(ls, msg);
+}
+
+
+static void leaveblock (FuncState *fs) {
+ BlockCnt *bl = fs->bl;
+ LexState *ls = fs->ls;
+ if (bl->previous && bl->upval) {
+ /* create a 'jump to here' to close upvalues */
+ int j = luaK_jump(fs);
+ luaK_patchclose(fs, j, bl->nactvar);
+ luaK_patchtohere(fs, j);
+ }
+ if (bl->isloop)
+ breaklabel(ls); /* close pending breaks */
+ fs->bl = bl->previous;
+ removevars(fs, bl->nactvar);
+ lua_assert(bl->nactvar == fs->nactvar);
+ fs->freereg = fs->nactvar; /* free registers */
+ ls->dyd->label.n = bl->firstlabel; /* remove local labels */
+ if (bl->previous) /* inner block? */
+ movegotosout(fs, bl); /* update pending gotos to outer block */
+ else if (bl->firstgoto < ls->dyd->gt.n) /* pending gotos in outer block? */
+ undefgoto(ls, &ls->dyd->gt.arr[bl->firstgoto]); /* error */
+}
+
+
+/*
+** adds a new prototype into list of prototypes
+*/
+static Proto *addprototype (LexState *ls) {
+ Proto *clp;
+ lua_State *L = ls->L;
+ FuncState *fs = ls->fs;
+ Proto *f = fs->f; /* prototype of current function */
+ if (fs->np >= f->sizep) {
+ int oldsize = f->sizep;
+ luaM_growvector(L, f->p, fs->np, f->sizep, Proto *, MAXARG_Bx, "functions");
+ while (oldsize < f->sizep) f->p[oldsize++] = NULL;
+ }
+ f->p[fs->np++] = clp = luaF_newproto(L);
+ luaC_objbarrier(L, f, clp);
+ return clp;
+}
+
+
+/*
+** codes instruction to create new closure in parent function.
+** The OP_CLOSURE instruction must use the last available register,
+** so that, if it invokes the GC, the GC knows which registers
+** are in use at that time.
+*/
+static void codeclosure (LexState *ls, expdesc *v) {
+ FuncState *fs = ls->fs->prev;
+ init_exp(v, VRELOCABLE, luaK_codeABx(fs, OP_CLOSURE, 0, fs->np - 1));
+ luaK_exp2nextreg(fs, v); /* fix it at the last register */
+}
+
+
+static void open_func (LexState *ls, FuncState *fs, BlockCnt *bl) {
+ lua_State *L = ls->L;
+ Proto *f;
+ fs->prev = ls->fs; /* linked list of funcstates */
+ fs->ls = ls;
+ ls->fs = fs;
+ fs->pc = 0;
+ fs->lasttarget = 0;
+ fs->jpc = NO_JUMP;
+ fs->freereg = 0;
+ fs->nk = 0;
+ fs->np = 0;
+ fs->nups = 0;
+ fs->nlocvars = 0;
+ fs->nactvar = 0;
+ fs->firstlocal = ls->dyd->actvar.n;
+ fs->bl = NULL;
+ f = fs->f;
+ f->source = ls->source;
+ f->maxstacksize = 2; /* registers 0/1 are always valid */
+ fs->h = luaH_new(L);
+ /* anchor table of constants (to avoid being collected) */
+ sethvalue2s(L, L->top, fs->h);
+ incr_top(L);
+ enterblock(fs, bl, 0);
+}
+
+
+static void close_func (LexState *ls) {
+ lua_State *L = ls->L;
+ FuncState *fs = ls->fs;
+ Proto *f = fs->f;
+ luaK_ret(fs, 0, 0); /* final return */
+ leaveblock(fs);
+ luaM_reallocvector(L, f->code, f->sizecode, fs->pc, Instruction);
+ f->sizecode = fs->pc;
+ luaM_reallocvector(L, f->lineinfo, f->sizelineinfo, fs->pc, int);
+ f->sizelineinfo = fs->pc;
+ luaM_reallocvector(L, f->k, f->sizek, fs->nk, TValue);
+ f->sizek = fs->nk;
+ luaM_reallocvector(L, f->p, f->sizep, fs->np, Proto *);
+ f->sizep = fs->np;
+ luaM_reallocvector(L, f->locvars, f->sizelocvars, fs->nlocvars, LocVar);
+ f->sizelocvars = fs->nlocvars;
+ luaM_reallocvector(L, f->upvalues, f->sizeupvalues, fs->nups, Upvaldesc);
+ f->sizeupvalues = fs->nups;
+ lua_assert(fs->bl == NULL);
+ ls->fs = fs->prev;
+ /* last token read was anchored in defunct function; must re-anchor it */
+ anchor_token(ls);
+ L->top--; /* pop table of constants */
+ luaC_checkGC(L);
+}
+
+
+
+/*============================================================*/
+/* GRAMMAR RULES */
+/*============================================================*/
+
+
+/*
+** check whether current token is in the follow set of a block.
+** 'until' closes syntactical blocks, but does not close scope,
+** so it is handled separately.
+*/
+static int block_follow (LexState *ls, int withuntil) {
+ switch (ls->t.token) {
+ case TK_ELSE: case TK_ELSEIF:
+ case TK_END: case TK_EOS:
+ return 1;
+ case TK_UNTIL: return withuntil;
+ default: return 0;
+ }
+}
+
+
+/*
+ * By inlining statlist() and test_then_block() we cut the native
+ * stack usage per nested C call from 272 bytes to 152, which keeps
+ * us within budget for 8K kernel stacks.
+ */
+__attribute__((always_inline)) inline
+static void statlist (LexState *ls) {
+ /* statlist -> { stat [`;'] } */
+ while (!block_follow(ls, 1)) {
+ if (ls->t.token == TK_RETURN) {
+ statement(ls);
+ return; /* 'return' must be last statement */
+ }
+ statement(ls);
+ }
+}
+
+
+static void fieldsel (LexState *ls, expdesc *v) {
+ /* fieldsel -> ['.' | ':'] NAME */
+ FuncState *fs = ls->fs;
+ expdesc key;
+ luaK_exp2anyregup(fs, v);
+ luaX_next(ls); /* skip the dot or colon */
+ checkname(ls, &key);
+ luaK_indexed(fs, v, &key);
+}
+
+
+static void yindex (LexState *ls, expdesc *v) {
+ /* index -> '[' expr ']' */
+ luaX_next(ls); /* skip the '[' */
+ expr(ls, v);
+ luaK_exp2val(ls->fs, v);
+ checknext(ls, ']');
+}
+
+
+/*
+** {======================================================================
+** Rules for Constructors
+** =======================================================================
+*/
+
+
+struct ConsControl {
+ expdesc v; /* last list item read */
+ expdesc *t; /* table descriptor */
+ int nh; /* total number of `record' elements */
+ int na; /* total number of array elements */
+ int tostore; /* number of array elements pending to be stored */
+};
+
+
+static void recfield (LexState *ls, struct ConsControl *cc) {
+ /* recfield -> (NAME | `['exp1`]') = exp1 */
+ FuncState *fs = ls->fs;
+ int reg = ls->fs->freereg;
+ expdesc key, val;
+ int rkkey;
+ if (ls->t.token == TK_NAME) {
+ checklimit(fs, cc->nh, MAX_INT, "items in a constructor");
+ checkname(ls, &key);
+ }
+ else /* ls->t.token == '[' */
+ yindex(ls, &key);
+ cc->nh++;
+ checknext(ls, '=');
+ rkkey = luaK_exp2RK(fs, &key);
+ expr(ls, &val);
+ luaK_codeABC(fs, OP_SETTABLE, cc->t->u.info, rkkey, luaK_exp2RK(fs, &val));
+ fs->freereg = reg; /* free registers */
+}
+
+
+static void closelistfield (FuncState *fs, struct ConsControl *cc) {
+ if (cc->v.k == VVOID) return; /* there is no list item */
+ luaK_exp2nextreg(fs, &cc->v);
+ cc->v.k = VVOID;
+ if (cc->tostore == LFIELDS_PER_FLUSH) {
+ luaK_setlist(fs, cc->t->u.info, cc->na, cc->tostore); /* flush */
+ cc->tostore = 0; /* no more items pending */
+ }
+}
+
+
+static void lastlistfield (FuncState *fs, struct ConsControl *cc) {
+ if (cc->tostore == 0) return;
+ if (hasmultret(cc->v.k)) {
+ luaK_setmultret(fs, &cc->v);
+ luaK_setlist(fs, cc->t->u.info, cc->na, LUA_MULTRET);
+ cc->na--; /* do not count last expression (unknown number of elements) */
+ }
+ else {
+ if (cc->v.k != VVOID)
+ luaK_exp2nextreg(fs, &cc->v);
+ luaK_setlist(fs, cc->t->u.info, cc->na, cc->tostore);
+ }
+}
+
+
+static void listfield (LexState *ls, struct ConsControl *cc) {
+ /* listfield -> exp */
+ expr(ls, &cc->v);
+ checklimit(ls->fs, cc->na, MAX_INT, "items in a constructor");
+ cc->na++;
+ cc->tostore++;
+}
+
+
+static void field (LexState *ls, struct ConsControl *cc) {
+ /* field -> listfield | recfield */
+ switch(ls->t.token) {
+ case TK_NAME: { /* may be 'listfield' or 'recfield' */
+ if (luaX_lookahead(ls) != '=') /* expression? */
+ listfield(ls, cc);
+ else
+ recfield(ls, cc);
+ break;
+ }
+ case '[': {
+ recfield(ls, cc);
+ break;
+ }
+ default: {
+ listfield(ls, cc);
+ break;
+ }
+ }
+}
+
+
+static void constructor (LexState *ls, expdesc *t) {
+ /* constructor -> '{' [ field { sep field } [sep] ] '}'
+ sep -> ',' | ';' */
+ FuncState *fs = ls->fs;
+ int line = ls->linenumber;
+ int pc = luaK_codeABC(fs, OP_NEWTABLE, 0, 0, 0);
+ struct ConsControl cc;
+ cc.na = cc.nh = cc.tostore = 0;
+ cc.t = t;
+ init_exp(t, VRELOCABLE, pc);
+ init_exp(&cc.v, VVOID, 0); /* no value (yet) */
+ luaK_exp2nextreg(ls->fs, t); /* fix it at stack top */
+ checknext(ls, '{');
+ do {
+ lua_assert(cc.v.k == VVOID || cc.tostore > 0);
+ if (ls->t.token == '}') break;
+ closelistfield(fs, &cc);
+ field(ls, &cc);
+ } while (testnext(ls, ',') || testnext(ls, ';'));
+ check_match(ls, '}', '{', line);
+ lastlistfield(fs, &cc);
+ SETARG_B(fs->f->code[pc], luaO_int2fb(cc.na)); /* set initial array size */
+ SETARG_C(fs->f->code[pc], luaO_int2fb(cc.nh)); /* set initial table size */
+}
+
+/* }====================================================================== */
+
+
+
+static void parlist (LexState *ls) {
+ /* parlist -> [ param { `,' param } ] */
+ FuncState *fs = ls->fs;
+ Proto *f = fs->f;
+ int nparams = 0;
+ f->is_vararg = 0;
+ if (ls->t.token != ')') { /* is `parlist' not empty? */
+ do {
+ switch (ls->t.token) {
+ case TK_NAME: { /* param -> NAME */
+ new_localvar(ls, str_checkname(ls));
+ nparams++;
+ break;
+ }
+ case TK_DOTS: { /* param -> `...' */
+ luaX_next(ls);
+ f->is_vararg = 1;
+ break;
+ }
+ default: luaX_syntaxerror(ls, "<name> or " LUA_QL("...") " expected");
+ }
+ } while (!f->is_vararg && testnext(ls, ','));
+ }
+ adjustlocalvars(ls, nparams);
+ f->numparams = cast_byte(fs->nactvar);
+ luaK_reserveregs(fs, fs->nactvar); /* reserve register for parameters */
+}
+
+
+static void body (LexState *ls, expdesc *e, int ismethod, int line) {
+ /* body -> `(' parlist `)' block END */
+ FuncState new_fs;
+ BlockCnt bl;
+ new_fs.f = addprototype(ls);
+ new_fs.f->linedefined = line;
+ open_func(ls, &new_fs, &bl);
+ checknext(ls, '(');
+ if (ismethod) {
+ new_localvarliteral(ls, "self"); /* create 'self' parameter */
+ adjustlocalvars(ls, 1);
+ }
+ parlist(ls);
+ checknext(ls, ')');
+ statlist(ls);
+ new_fs.f->lastlinedefined = ls->linenumber;
+ check_match(ls, TK_END, TK_FUNCTION, line);
+ codeclosure(ls, e);
+ close_func(ls);
+}
+
+
+static int explist (LexState *ls, expdesc *v) {
+ /* explist -> expr { `,' expr } */
+ int n = 1; /* at least one expression */
+ expr(ls, v);
+ while (testnext(ls, ',')) {
+ luaK_exp2nextreg(ls->fs, v);
+ expr(ls, v);
+ n++;
+ }
+ return n;
+}
+
+
+static void funcargs (LexState *ls, expdesc *f, int line) {
+ FuncState *fs = ls->fs;
+ expdesc args;
+ int base, nparams;
+ switch (ls->t.token) {
+ case '(': { /* funcargs -> `(' [ explist ] `)' */
+ luaX_next(ls);
+ if (ls->t.token == ')') /* arg list is empty? */
+ args.k = VVOID;
+ else {
+ explist(ls, &args);
+ luaK_setmultret(fs, &args);
+ }
+ check_match(ls, ')', '(', line);
+ break;
+ }
+ case '{': { /* funcargs -> constructor */
+ constructor(ls, &args);
+ break;
+ }
+ case TK_STRING: { /* funcargs -> STRING */
+ codestring(ls, &args, ls->t.seminfo.ts);
+ luaX_next(ls); /* must use `seminfo' before `next' */
+ break;
+ }
+ default: {
+ luaX_syntaxerror(ls, "function arguments expected");
+ }
+ }
+ lua_assert(f->k == VNONRELOC);
+ base = f->u.info; /* base register for call */
+ if (hasmultret(args.k))
+ nparams = LUA_MULTRET; /* open call */
+ else {
+ if (args.k != VVOID)
+ luaK_exp2nextreg(fs, &args); /* close last argument */
+ nparams = fs->freereg - (base+1);
+ }
+ init_exp(f, VCALL, luaK_codeABC(fs, OP_CALL, base, nparams+1, 2));
+ luaK_fixline(fs, line);
+  fs->freereg = base+1;  /* call removes function and arguments and leaves
+                            (unless changed) one result */
+}
+
+
+
+
+/*
+** {======================================================================
+** Expression parsing
+** =======================================================================
+*/
+
+
+static void primaryexp (LexState *ls, expdesc *v) {
+ /* primaryexp -> NAME | '(' expr ')' */
+ switch (ls->t.token) {
+ case '(': {
+ int line = ls->linenumber;
+ luaX_next(ls);
+ expr(ls, v);
+ check_match(ls, ')', '(', line);
+ luaK_dischargevars(ls->fs, v);
+ return;
+ }
+ case TK_NAME: {
+ singlevar(ls, v);
+ return;
+ }
+ default: {
+ luaX_syntaxerror(ls, "unexpected symbol");
+ }
+ }
+}
+
+
+static void suffixedexp (LexState *ls, expdesc *v) {
+ /* suffixedexp ->
+ primaryexp { '.' NAME | '[' exp ']' | ':' NAME funcargs | funcargs } */
+ FuncState *fs = ls->fs;
+ int line = ls->linenumber;
+ primaryexp(ls, v);
+ for (;;) {
+ switch (ls->t.token) {
+ case '.': { /* fieldsel */
+ fieldsel(ls, v);
+ break;
+ }
+ case '[': { /* `[' exp1 `]' */
+ expdesc key;
+ luaK_exp2anyregup(fs, v);
+ yindex(ls, &key);
+ luaK_indexed(fs, v, &key);
+ break;
+ }
+ case ':': { /* `:' NAME funcargs */
+ expdesc key;
+ luaX_next(ls);
+ checkname(ls, &key);
+ luaK_self(fs, v, &key);
+ funcargs(ls, v, line);
+ break;
+ }
+ case '(': case TK_STRING: case '{': { /* funcargs */
+ luaK_exp2nextreg(fs, v);
+ funcargs(ls, v, line);
+ break;
+ }
+ default: return;
+ }
+ }
+}
+
+
+static void simpleexp (LexState *ls, expdesc *v) {
+ /* simpleexp -> NUMBER | STRING | NIL | TRUE | FALSE | ... |
+ constructor | FUNCTION body | suffixedexp */
+ switch (ls->t.token) {
+ case TK_NUMBER: {
+ init_exp(v, VKNUM, 0);
+ v->u.nval = ls->t.seminfo.r;
+ break;
+ }
+ case TK_STRING: {
+ codestring(ls, v, ls->t.seminfo.ts);
+ break;
+ }
+ case TK_NIL: {
+ init_exp(v, VNIL, 0);
+ break;
+ }
+ case TK_TRUE: {
+ init_exp(v, VTRUE, 0);
+ break;
+ }
+ case TK_FALSE: {
+ init_exp(v, VFALSE, 0);
+ break;
+ }
+ case TK_DOTS: { /* vararg */
+ FuncState *fs = ls->fs;
+ check_condition(ls, fs->f->is_vararg,
+ "cannot use " LUA_QL("...") " outside a vararg function");
+ init_exp(v, VVARARG, luaK_codeABC(fs, OP_VARARG, 0, 1, 0));
+ break;
+ }
+ case '{': { /* constructor */
+ constructor(ls, v);
+ return;
+ }
+ case TK_FUNCTION: {
+ luaX_next(ls);
+ body(ls, v, 0, ls->linenumber);
+ return;
+ }
+ default: {
+ suffixedexp(ls, v);
+ return;
+ }
+ }
+ luaX_next(ls);
+}
+
+
+static UnOpr getunopr (int op) {
+ switch (op) {
+ case TK_NOT: return OPR_NOT;
+ case '-': return OPR_MINUS;
+ case '#': return OPR_LEN;
+ default: return OPR_NOUNOPR;
+ }
+}
+
+
+static BinOpr getbinopr (int op) {
+ switch (op) {
+ case '+': return OPR_ADD;
+ case '-': return OPR_SUB;
+ case '*': return OPR_MUL;
+ case '/': return OPR_DIV;
+ case '%': return OPR_MOD;
+ case '^': return OPR_POW;
+ case TK_CONCAT: return OPR_CONCAT;
+ case TK_NE: return OPR_NE;
+ case TK_EQ: return OPR_EQ;
+ case '<': return OPR_LT;
+ case TK_LE: return OPR_LE;
+ case '>': return OPR_GT;
+ case TK_GE: return OPR_GE;
+ case TK_AND: return OPR_AND;
+ case TK_OR: return OPR_OR;
+ default: return OPR_NOBINOPR;
+ }
+}
+
+
+static const struct {
+ lu_byte left; /* left priority for each binary operator */
+ lu_byte right; /* right priority */
+} priority[] = { /* ORDER OPR */
+ {6, 6}, {6, 6}, {7, 7}, {7, 7}, {7, 7}, /* `+' `-' `*' `/' `%' */
+ {10, 9}, {5, 4}, /* ^, .. (right associative) */
+ {3, 3}, {3, 3}, {3, 3}, /* ==, <, <= */
+ {3, 3}, {3, 3}, {3, 3}, /* ~=, >, >= */
+ {2, 2}, {1, 1} /* and, or */
+};
+
+#define UNARY_PRIORITY 8 /* priority for unary operators */
+
+
+/*
+** subexpr -> (simpleexp | unop subexpr) { binop subexpr }
+** where `binop' is any binary operator with a priority higher than `limit'
+*/
+static BinOpr subexpr (LexState *ls, expdesc *v, int limit) {
+ BinOpr op;
+ UnOpr uop;
+ enterlevel(ls);
+ uop = getunopr(ls->t.token);
+ if (uop != OPR_NOUNOPR) {
+ int line = ls->linenumber;
+ luaX_next(ls);
+ subexpr(ls, v, UNARY_PRIORITY);
+ luaK_prefix(ls->fs, uop, v, line);
+ }
+ else simpleexp(ls, v);
+ /* expand while operators have priorities higher than `limit' */
+ op = getbinopr(ls->t.token);
+ while (op != OPR_NOBINOPR && priority[op].left > limit) {
+ expdesc v2;
+ BinOpr nextop;
+ int line = ls->linenumber;
+ luaX_next(ls);
+ luaK_infix(ls->fs, op, v);
+ /* read sub-expression with higher priority */
+ nextop = subexpr(ls, &v2, priority[op].right);
+ luaK_posfix(ls->fs, op, v, &v2, line);
+ op = nextop;
+ }
+ leavelevel(ls);
+ return op; /* return first untreated operator */
+}
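+
+
+/*
+** Illustrative walk-through (not part of upstream Lua): for the input
+** `1 + 2 ^ 3 ^ 2 .. s', the outer subexpr() call (limit 0) reads `1',
+** sees `+' (left priority 6 > 0) and recurses with limit 6, the right
+** priority of `+'. Inside, `^' (left 10 > 6) recurses with limit 9;
+** the second `^' still satisfies 10 > 9, so exponentiation groups to
+** the right as 2 ^ (3 ^ 2). `..' (left 5) is not > 6, so it is returned
+** untreated and handled by the outer call, giving (1 + (2 ^ (3 ^ 2))) .. s.
+*/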
+
+
+static void expr (LexState *ls, expdesc *v) {
+ subexpr(ls, v, 0);
+}
+
+/* }==================================================================== */
+
+
+
+/*
+** {======================================================================
+** Rules for Statements
+** =======================================================================
+*/
+
+
+static void block (LexState *ls) {
+ /* block -> statlist */
+ FuncState *fs = ls->fs;
+ BlockCnt bl;
+ enterblock(fs, &bl, 0);
+ statlist(ls);
+ leaveblock(fs);
+}
+
+
+/*
+** structure to chain all variables in the left-hand side of an
+** assignment
+*/
+struct LHS_assign {
+ struct LHS_assign *prev;
+ expdesc v; /* variable (global, local, upvalue, or indexed) */
+};
+
+
+/*
+** check whether, in an assignment to an upvalue/local variable, the
+** upvalue/local variable is being used in a previous assignment to a
+** table. If so, save original upvalue/local value in a safe place and
+** use this safe copy in the previous assignment.
+*/
+static void check_conflict (LexState *ls, struct LHS_assign *lh, expdesc *v) {
+ FuncState *fs = ls->fs;
+ int extra = fs->freereg; /* eventual position to save local variable */
+ int conflict = 0;
+ for (; lh; lh = lh->prev) { /* check all previous assignments */
+ if (lh->v.k == VINDEXED) { /* assigning to a table? */
+ /* table is the upvalue/local being assigned now? */
+ if (lh->v.u.ind.vt == v->k && lh->v.u.ind.t == v->u.info) {
+ conflict = 1;
+ lh->v.u.ind.vt = VLOCAL;
+ lh->v.u.ind.t = extra; /* previous assignment will use safe copy */
+ }
+ /* index is the local being assigned? (index cannot be upvalue) */
+ if (v->k == VLOCAL && lh->v.u.ind.idx == v->u.info) {
+ conflict = 1;
+ lh->v.u.ind.idx = extra; /* previous assignment will use safe copy */
+ }
+ }
+ }
+ if (conflict) {
+ /* copy upvalue/local value to a temporary (in position 'extra') */
+ OpCode op = (v->k == VLOCAL) ? OP_MOVE : OP_GETUPVAL;
+ luaK_codeABC(fs, op, extra, v->u.info, 0);
+ luaK_reserveregs(fs, 1);
+ }
+}
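+
+
+/*
+** Illustrative note (not part of upstream Lua): in the multiple
+** assignment `a[i], i = 10, 20' the first target indexes the local `i'
+** that the second target overwrites. While parsing the second target,
+** check_conflict() sees that an earlier VINDEXED entry uses `i' as its
+** index, emits OP_MOVE to copy the current value of `i' into a fresh
+** register, and redirects the earlier assignment to that copy, so
+** a[<old i>] receives 10 even though `i' ends up as 20.
+*/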
+
+
+static void assignment (LexState *ls, struct LHS_assign *lh, int nvars) {
+ expdesc e;
+ check_condition(ls, vkisvar(lh->v.k), "syntax error");
+ if (testnext(ls, ',')) { /* assignment -> ',' suffixedexp assignment */
+ struct LHS_assign nv;
+ nv.prev = lh;
+ suffixedexp(ls, &nv.v);
+ if (nv.v.k != VINDEXED)
+ check_conflict(ls, lh, &nv.v);
+ checklimit(ls->fs, nvars + ls->L->nCcalls, LUAI_MAXCCALLS,
+ "C levels");
+ assignment(ls, &nv, nvars+1);
+ }
+ else { /* assignment -> `=' explist */
+ int nexps;
+ checknext(ls, '=');
+ nexps = explist(ls, &e);
+ if (nexps != nvars) {
+ adjust_assign(ls, nvars, nexps, &e);
+ if (nexps > nvars)
+ ls->fs->freereg -= nexps - nvars; /* remove extra values */
+ }
+ else {
+ luaK_setoneret(ls->fs, &e); /* close last expression */
+ luaK_storevar(ls->fs, &lh->v, &e);
+ return; /* avoid default */
+ }
+ }
+ init_exp(&e, VNONRELOC, ls->fs->freereg-1); /* default assignment */
+ luaK_storevar(ls->fs, &lh->v, &e);
+}
+
+
+static int cond (LexState *ls) {
+ /* cond -> exp */
+ expdesc v;
+ expr(ls, &v); /* read condition */
+ if (v.k == VNIL) v.k = VFALSE; /* `falses' are all equal here */
+ luaK_goiftrue(ls->fs, &v);
+ return v.f;
+}
+
+
+static void gotostat (LexState *ls, int pc) {
+ int line = ls->linenumber;
+ TString *label;
+ int g;
+ if (testnext(ls, TK_GOTO))
+ label = str_checkname(ls);
+ else {
+ luaX_next(ls); /* skip break */
+ label = luaS_new(ls->L, "break");
+ }
+ g = newlabelentry(ls, &ls->dyd->gt, label, line, pc);
+ findlabel(ls, g); /* close it if label already defined */
+}
+
+
+/* check for repeated labels on the same block */
+static void checkrepeated (FuncState *fs, Labellist *ll, TString *label) {
+ int i;
+ for (i = fs->bl->firstlabel; i < ll->n; i++) {
+ if (luaS_eqstr(label, ll->arr[i].name)) {
+ const char *msg = luaO_pushfstring(fs->ls->L,
+ "label " LUA_QS " already defined on line %d",
+ getstr(label), ll->arr[i].line);
+ semerror(fs->ls, msg);
+ }
+ }
+}
+
+
+/* skip no-op statements */
+static void skipnoopstat (LexState *ls) {
+ while (ls->t.token == ';' || ls->t.token == TK_DBCOLON)
+ statement(ls);
+}
+
+
+static void labelstat (LexState *ls, TString *label, int line) {
+ /* label -> '::' NAME '::' */
+ FuncState *fs = ls->fs;
+ Labellist *ll = &ls->dyd->label;
+ int l; /* index of new label being created */
+ checkrepeated(fs, ll, label); /* check for repeated labels */
+ checknext(ls, TK_DBCOLON); /* skip double colon */
+ /* create new entry for this label */
+ l = newlabelentry(ls, ll, label, line, fs->pc);
+ skipnoopstat(ls); /* skip other no-op statements */
+ if (block_follow(ls, 0)) { /* label is last no-op statement in the block? */
+ /* assume that locals are already out of scope */
+ ll->arr[l].nactvar = fs->bl->nactvar;
+ }
+ findgotos(ls, &ll->arr[l]);
+}
+
+
+static void whilestat (LexState *ls, int line) {
+ /* whilestat -> WHILE cond DO block END */
+ FuncState *fs = ls->fs;
+ int whileinit;
+ int condexit;
+ BlockCnt bl;
+ luaX_next(ls); /* skip WHILE */
+ whileinit = luaK_getlabel(fs);
+ condexit = cond(ls);
+ enterblock(fs, &bl, 1);
+ checknext(ls, TK_DO);
+ block(ls);
+ luaK_jumpto(fs, whileinit);
+ check_match(ls, TK_END, TK_WHILE, line);
+ leaveblock(fs);
+ luaK_patchtohere(fs, condexit); /* false conditions finish the loop */
+}
+
+
+static void repeatstat (LexState *ls, int line) {
+ /* repeatstat -> REPEAT block UNTIL cond */
+ int condexit;
+ FuncState *fs = ls->fs;
+ int repeat_init = luaK_getlabel(fs);
+ BlockCnt bl1, bl2;
+ enterblock(fs, &bl1, 1); /* loop block */
+ enterblock(fs, &bl2, 0); /* scope block */
+ luaX_next(ls); /* skip REPEAT */
+ statlist(ls);
+ check_match(ls, TK_UNTIL, TK_REPEAT, line);
+ condexit = cond(ls); /* read condition (inside scope block) */
+ if (bl2.upval) /* upvalues? */
+ luaK_patchclose(fs, condexit, bl2.nactvar);
+ leaveblock(fs); /* finish scope */
+ luaK_patchlist(fs, condexit, repeat_init); /* close the loop */
+ leaveblock(fs); /* finish loop */
+}
+
+
+static int exp1 (LexState *ls) {
+ expdesc e;
+ int reg;
+ expr(ls, &e);
+ luaK_exp2nextreg(ls->fs, &e);
+ lua_assert(e.k == VNONRELOC);
+ reg = e.u.info;
+ return reg;
+}
+
+
+static void forbody (LexState *ls, int base, int line, int nvars, int isnum) {
+ /* forbody -> DO block */
+ BlockCnt bl;
+ FuncState *fs = ls->fs;
+ int prep, endfor;
+ adjustlocalvars(ls, 3); /* control variables */
+ checknext(ls, TK_DO);
+ prep = isnum ? luaK_codeAsBx(fs, OP_FORPREP, base, NO_JUMP) : luaK_jump(fs);
+ enterblock(fs, &bl, 0); /* scope for declared variables */
+ adjustlocalvars(ls, nvars);
+ luaK_reserveregs(fs, nvars);
+ block(ls);
+ leaveblock(fs); /* end of scope for declared variables */
+ luaK_patchtohere(fs, prep);
+ if (isnum) /* numeric for? */
+ endfor = luaK_codeAsBx(fs, OP_FORLOOP, base, NO_JUMP);
+ else { /* generic for */
+ luaK_codeABC(fs, OP_TFORCALL, base, 0, nvars);
+ luaK_fixline(fs, line);
+ endfor = luaK_codeAsBx(fs, OP_TFORLOOP, base + 2, NO_JUMP);
+ }
+ luaK_patchlist(fs, endfor, prep + 1);
+ luaK_fixline(fs, line);
+}
+
+
+static void fornum (LexState *ls, TString *varname, int line) {
+ /* fornum -> NAME = exp1,exp1[,exp1] forbody */
+ FuncState *fs = ls->fs;
+ int base = fs->freereg;
+ new_localvarliteral(ls, "(for index)");
+ new_localvarliteral(ls, "(for limit)");
+ new_localvarliteral(ls, "(for step)");
+ new_localvar(ls, varname);
+ checknext(ls, '=');
+ exp1(ls); /* initial value */
+ checknext(ls, ',');
+ exp1(ls); /* limit */
+ if (testnext(ls, ','))
+ exp1(ls); /* optional step */
+ else { /* default step = 1 */
+ luaK_codek(fs, fs->freereg, luaK_numberK(fs, 1));
+ luaK_reserveregs(fs, 1);
+ }
+ forbody(ls, base, line, 1, 1);
+}
+
+
+static void forlist (LexState *ls, TString *indexname) {
+ /* forlist -> NAME {,NAME} IN explist forbody */
+ FuncState *fs = ls->fs;
+ expdesc e;
+ int nvars = 4; /* gen, state, control, plus at least one declared var */
+ int line;
+ int base = fs->freereg;
+ /* create control variables */
+ new_localvarliteral(ls, "(for generator)");
+ new_localvarliteral(ls, "(for state)");
+ new_localvarliteral(ls, "(for control)");
+ /* create declared variables */
+ new_localvar(ls, indexname);
+ while (testnext(ls, ',')) {
+ new_localvar(ls, str_checkname(ls));
+ nvars++;
+ }
+ checknext(ls, TK_IN);
+ line = ls->linenumber;
+ adjust_assign(ls, 3, explist(ls, &e), &e);
+ luaK_checkstack(fs, 3); /* extra space to call generator */
+ forbody(ls, base, line, nvars - 3, 0);
+}
+
+
+static void forstat (LexState *ls, int line) {
+ /* forstat -> FOR (fornum | forlist) END */
+ FuncState *fs = ls->fs;
+ TString *varname;
+ BlockCnt bl;
+ enterblock(fs, &bl, 1); /* scope for loop and control variables */
+ luaX_next(ls); /* skip `for' */
+ varname = str_checkname(ls); /* first variable name */
+ switch (ls->t.token) {
+ case '=': fornum(ls, varname, line); break;
+ case ',': case TK_IN: forlist(ls, varname); break;
+ default: luaX_syntaxerror(ls, LUA_QL("=") " or " LUA_QL("in") " expected");
+ }
+ check_match(ls, TK_END, TK_FOR, line);
+ leaveblock(fs); /* loop scope (`break' jumps to this point) */
+}
+
+
+__attribute__((always_inline)) inline
+static void test_then_block (LexState *ls, int *escapelist) {
+ /* test_then_block -> [IF | ELSEIF] cond THEN block */
+ BlockCnt bl;
+ FuncState *fs = ls->fs;
+ expdesc v;
+ int jf; /* instruction to skip 'then' code (if condition is false) */
+ luaX_next(ls); /* skip IF or ELSEIF */
+ expr(ls, &v); /* read condition */
+ checknext(ls, TK_THEN);
+ if (ls->t.token == TK_GOTO || ls->t.token == TK_BREAK) {
+ luaK_goiffalse(ls->fs, &v); /* will jump to label if condition is true */
+ enterblock(fs, &bl, 0); /* must enter block before 'goto' */
+ gotostat(ls, v.t); /* handle goto/break */
+ skipnoopstat(ls); /* skip other no-op statements */
+ if (block_follow(ls, 0)) { /* 'goto' is the entire block? */
+ leaveblock(fs);
+ return; /* and that is it */
+ }
+ else /* must skip over 'then' part if condition is false */
+ jf = luaK_jump(fs);
+ }
+ else { /* regular case (not goto/break) */
+ luaK_goiftrue(ls->fs, &v); /* skip over block if condition is false */
+ enterblock(fs, &bl, 0);
+ jf = v.f;
+ }
+ statlist(ls); /* `then' part */
+ leaveblock(fs);
+ if (ls->t.token == TK_ELSE ||
+ ls->t.token == TK_ELSEIF) /* followed by 'else'/'elseif'? */
+ luaK_concat(fs, escapelist, luaK_jump(fs)); /* must jump over it */
+ luaK_patchtohere(fs, jf);
+}
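+
+
+/*
+** Illustrative note (not part of upstream Lua): for the common idiom
+** `if a > b then break end' inside a loop, the branch above reuses the
+** jump taken when the condition is true as the break jump itself
+** (gotostat() receives v.t), instead of emitting a skip-over-block test
+** followed by a separate unconditional jump.
+*/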
+
+
+static void ifstat (LexState *ls, int line) {
+ /* ifstat -> IF cond THEN block {ELSEIF cond THEN block} [ELSE block] END */
+ FuncState *fs = ls->fs;
+ int escapelist = NO_JUMP; /* exit list for finished parts */
+ test_then_block(ls, &escapelist); /* IF cond THEN block */
+ while (ls->t.token == TK_ELSEIF)
+ test_then_block(ls, &escapelist); /* ELSEIF cond THEN block */
+ if (testnext(ls, TK_ELSE))
+ block(ls); /* `else' part */
+ check_match(ls, TK_END, TK_IF, line);
+ luaK_patchtohere(fs, escapelist); /* patch escape list to 'if' end */
+}
+
+
+static void localfunc (LexState *ls) {
+ expdesc b;
+ FuncState *fs = ls->fs;
+ new_localvar(ls, str_checkname(ls)); /* new local variable */
+ adjustlocalvars(ls, 1); /* enter its scope */
+ body(ls, &b, 0, ls->linenumber); /* function created in next register */
+ /* debug information will only see the variable after this point! */
+ getlocvar(fs, b.u.info)->startpc = fs->pc;
+}
+
+
+static void localstat (LexState *ls) {
+ /* stat -> LOCAL NAME {`,' NAME} [`=' explist] */
+ int nvars = 0;
+ int nexps;
+ expdesc e;
+ do {
+ new_localvar(ls, str_checkname(ls));
+ nvars++;
+ } while (testnext(ls, ','));
+ if (testnext(ls, '='))
+ nexps = explist(ls, &e);
+ else {
+ e.k = VVOID;
+ nexps = 0;
+ }
+ adjust_assign(ls, nvars, nexps, &e);
+ adjustlocalvars(ls, nvars);
+}
+
+
+static int funcname (LexState *ls, expdesc *v) {
+ /* funcname -> NAME {fieldsel} [`:' NAME] */
+ int ismethod = 0;
+ singlevar(ls, v);
+ while (ls->t.token == '.')
+ fieldsel(ls, v);
+ if (ls->t.token == ':') {
+ ismethod = 1;
+ fieldsel(ls, v);
+ }
+ return ismethod;
+}
+
+
+static void funcstat (LexState *ls, int line) {
+ /* funcstat -> FUNCTION funcname body */
+ int ismethod;
+ expdesc v, b;
+ luaX_next(ls); /* skip FUNCTION */
+ ismethod = funcname(ls, &v);
+ body(ls, &b, ismethod, line);
+ luaK_storevar(ls->fs, &v, &b);
+ luaK_fixline(ls->fs, line); /* definition `happens' in the first line */
+}
+
+
+static void exprstat (LexState *ls) {
+ /* stat -> func | assignment */
+ FuncState *fs = ls->fs;
+ struct LHS_assign v;
+ suffixedexp(ls, &v.v);
+ if (ls->t.token == '=' || ls->t.token == ',') { /* stat -> assignment ? */
+ v.prev = NULL;
+ assignment(ls, &v, 1);
+ }
+ else { /* stat -> func */
+ check_condition(ls, v.v.k == VCALL, "syntax error");
+ SETARG_C(getcode(fs, &v.v), 1); /* call statement uses no results */
+ }
+}
+
+
+static void retstat (LexState *ls) {
+ /* stat -> RETURN [explist] [';'] */
+ FuncState *fs = ls->fs;
+ expdesc e;
+ int first, nret; /* registers with returned values */
+ if (block_follow(ls, 1) || ls->t.token == ';')
+ first = nret = 0; /* return no values */
+ else {
+ nret = explist(ls, &e); /* optional return values */
+ if (hasmultret(e.k)) {
+ luaK_setmultret(fs, &e);
+ if (e.k == VCALL && nret == 1) { /* tail call? */
+ SET_OPCODE(getcode(fs,&e), OP_TAILCALL);
+ lua_assert(GETARG_A(getcode(fs,&e)) == fs->nactvar);
+ }
+ first = fs->nactvar;
+ nret = LUA_MULTRET; /* return all values */
+ }
+ else {
+ if (nret == 1) /* only one single value? */
+ first = luaK_exp2anyreg(fs, &e);
+ else {
+ luaK_exp2nextreg(fs, &e); /* values must go to the `stack' */
+ first = fs->nactvar; /* return all `active' values */
+ lua_assert(nret == fs->freereg - first);
+ }
+ }
+ }
+ luaK_ret(fs, first, nret);
+ (void) testnext(ls, ';'); /* skip optional semicolon */
+}
+
+
+static void statement (LexState *ls) {
+ int line = ls->linenumber; /* may be needed for error messages */
+ enterlevel(ls);
+ switch (ls->t.token) {
+ case ';': { /* stat -> ';' (empty statement) */
+ luaX_next(ls); /* skip ';' */
+ break;
+ }
+ case TK_IF: { /* stat -> ifstat */
+ ifstat(ls, line);
+ break;
+ }
+ case TK_WHILE: { /* stat -> whilestat */
+ whilestat(ls, line);
+ break;
+ }
+ case TK_DO: { /* stat -> DO block END */
+ luaX_next(ls); /* skip DO */
+ block(ls);
+ check_match(ls, TK_END, TK_DO, line);
+ break;
+ }
+ case TK_FOR: { /* stat -> forstat */
+ forstat(ls, line);
+ break;
+ }
+ case TK_REPEAT: { /* stat -> repeatstat */
+ repeatstat(ls, line);
+ break;
+ }
+ case TK_FUNCTION: { /* stat -> funcstat */
+ funcstat(ls, line);
+ break;
+ }
+ case TK_LOCAL: { /* stat -> localstat */
+ luaX_next(ls); /* skip LOCAL */
+ if (testnext(ls, TK_FUNCTION)) /* local function? */
+ localfunc(ls);
+ else
+ localstat(ls);
+ break;
+ }
+ case TK_DBCOLON: { /* stat -> label */
+ luaX_next(ls); /* skip double colon */
+ labelstat(ls, str_checkname(ls), line);
+ break;
+ }
+ case TK_RETURN: { /* stat -> retstat */
+ luaX_next(ls); /* skip RETURN */
+ retstat(ls);
+ break;
+ }
+ case TK_BREAK: /* stat -> breakstat */
+ case TK_GOTO: { /* stat -> 'goto' NAME */
+ gotostat(ls, luaK_jump(ls->fs));
+ break;
+ }
+ default: { /* stat -> func | assignment */
+ exprstat(ls);
+ break;
+ }
+ }
+ lua_assert(ls->fs->f->maxstacksize >= ls->fs->freereg &&
+ ls->fs->freereg >= ls->fs->nactvar);
+ ls->fs->freereg = ls->fs->nactvar; /* free registers */
+ leavelevel(ls);
+}
+
+/* }====================================================================== */
+
+
+/*
+** compiles the main function, which is a regular vararg function with an
+** upvalue named LUA_ENV
+*/
+static void mainfunc (LexState *ls, FuncState *fs) {
+ BlockCnt bl;
+ expdesc v;
+ open_func(ls, fs, &bl);
+ fs->f->is_vararg = 1; /* main function is always vararg */
+ init_exp(&v, VLOCAL, 0); /* create and... */
+ newupvalue(fs, ls->envn, &v); /* ...set environment upvalue */
+ luaX_next(ls); /* read first token */
+ statlist(ls); /* parse main body */
+ check(ls, TK_EOS);
+ close_func(ls);
+}
+
+
+Closure *luaY_parser (lua_State *L, ZIO *z, Mbuffer *buff,
+ Dyndata *dyd, const char *name, int firstchar) {
+ LexState lexstate;
+ FuncState funcstate;
+ Closure *cl = luaF_newLclosure(L, 1); /* create main closure */
+ /* anchor closure (to avoid being collected) */
+ setclLvalue(L, L->top, cl);
+ incr_top(L);
+ funcstate.f = cl->l.p = luaF_newproto(L);
+ funcstate.f->source = luaS_new(L, name); /* create and anchor TString */
+ lexstate.buff = buff;
+ lexstate.dyd = dyd;
+ dyd->actvar.n = dyd->gt.n = dyd->label.n = 0;
+ luaX_setinput(L, &lexstate, z, funcstate.f->source, firstchar);
+ mainfunc(&lexstate, &funcstate);
+ lua_assert(!funcstate.prev && funcstate.nups == 1 && !lexstate.fs);
+ /* all scopes should be correctly finished */
+ lua_assert(dyd->actvar.n == 0 && dyd->gt.n == 0 && dyd->label.n == 0);
+ return cl; /* it's on the stack too */
+}
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/lua/lparser.h b/sys/contrib/openzfs/module/lua/lparser.h
new file mode 100644
index 000000000000..8aea0523f3e3
--- /dev/null
+++ b/sys/contrib/openzfs/module/lua/lparser.h
@@ -0,0 +1,121 @@
+/* BEGIN CSTYLED */
+/*
+** $Id: lparser.h,v 1.70.1.1 2013/04/12 18:48:47 roberto Exp $
+** Lua Parser
+** See Copyright Notice in lua.h
+*/
+
+#ifndef lparser_h
+#define lparser_h
+
+#include "llimits.h"
+#include "lobject.h"
+#include "lzio.h"
+
+
+/*
+** Expression descriptor
+*/
+
+typedef enum {
+ VVOID, /* no value */
+ VNIL,
+ VTRUE,
+ VFALSE,
+ VK, /* info = index of constant in `k' */
+ VKNUM, /* nval = numerical value */
+ VNONRELOC, /* info = result register */
+ VLOCAL, /* info = local register */
+ VUPVAL, /* info = index of upvalue in 'upvalues' */
+ VINDEXED, /* t = table register/upvalue; idx = index R/K */
+ VJMP, /* info = instruction pc */
+ VRELOCABLE, /* info = instruction pc */
+ VCALL, /* info = instruction pc */
+ VVARARG /* info = instruction pc */
+} expkind;
+
+
+#define vkisvar(k) (VLOCAL <= (k) && (k) <= VINDEXED)
+#define vkisinreg(k) ((k) == VNONRELOC || (k) == VLOCAL)
+
+typedef struct expdesc {
+ expkind k;
+ union {
+ struct { /* for indexed variables (VINDEXED) */
+ short idx; /* index (R/K) */
+ lu_byte t; /* table (register or upvalue) */
+ lu_byte vt; /* whether 't' is register (VLOCAL) or upvalue (VUPVAL) */
+ } ind;
+ int info; /* for generic use */
+ lua_Number nval; /* for VKNUM */
+ } u;
+ int t; /* patch list of `exit when true' */
+ int f; /* patch list of `exit when false' */
+} expdesc;
+
+
+/* description of active local variable */
+typedef struct Vardesc {
+ short idx; /* variable index in stack */
+} Vardesc;
+
+
+/* description of pending goto statements and label statements */
+typedef struct Labeldesc {
+ TString *name; /* label identifier */
+ int pc; /* position in code */
+ int line; /* line where it appeared */
+ lu_byte nactvar; /* local level where it appears in current block */
+} Labeldesc;
+
+
+/* list of labels or gotos */
+typedef struct Labellist {
+ Labeldesc *arr; /* array */
+ int n; /* number of entries in use */
+ int size; /* array size */
+} Labellist;
+
+
+/* dynamic structures used by the parser */
+typedef struct Dyndata {
+ struct { /* list of active local variables */
+ Vardesc *arr;
+ int n;
+ int size;
+ } actvar;
+ Labellist gt; /* list of pending gotos */
+ Labellist label; /* list of active labels */
+} Dyndata;
+
+
+/* control of blocks */
+struct BlockCnt; /* defined in lparser.c */
+
+
+/* state needed to generate code for a given function */
+typedef struct FuncState {
+ Proto *f; /* current function header */
+ Table *h; /* table to find (and reuse) elements in `k' */
+ struct FuncState *prev; /* enclosing function */
+ struct LexState *ls; /* lexical state */
+ struct BlockCnt *bl; /* chain of current blocks */
+ int pc; /* next position to code (equivalent to `ncode') */
+ int lasttarget; /* 'label' of last 'jump label' */
+ int jpc; /* list of pending jumps to `pc' */
+ int nk; /* number of elements in `k' */
+ int np; /* number of elements in `p' */
+ int firstlocal; /* index of first local var (in Dyndata array) */
+ short nlocvars; /* number of elements in 'f->locvars' */
+ lu_byte nactvar; /* number of active local variables */
+ lu_byte nups; /* number of upvalues */
+ lu_byte freereg; /* first free register */
+} FuncState;
+
+
+LUAI_FUNC Closure *luaY_parser (lua_State *L, ZIO *z, Mbuffer *buff,
+ Dyndata *dyd, const char *name, int firstchar);
+
+
+#endif
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/lua/lstate.c b/sys/contrib/openzfs/module/lua/lstate.c
new file mode 100644
index 000000000000..4d196eced6a3
--- /dev/null
+++ b/sys/contrib/openzfs/module/lua/lstate.c
@@ -0,0 +1,320 @@
+/* BEGIN CSTYLED */
+/*
+** $Id: lstate.c,v 2.99.1.2 2013/11/08 17:45:31 roberto Exp $
+** Global State
+** See Copyright Notice in lua.h
+*/
+
+
+#define lstate_c
+#define LUA_CORE
+
+#include <sys/lua/lua.h>
+
+#include "lapi.h"
+#include "ldebug.h"
+#include "ldo.h"
+#include "lfunc.h"
+#include "lgc.h"
+#include "llex.h"
+#include "lmem.h"
+#include "lstate.h"
+#include "lstring.h"
+#include "ltable.h"
+#include "ltm.h"
+
+
+#if !defined(LUAI_GCPAUSE)
+#define LUAI_GCPAUSE 200 /* 200% */
+#endif
+
+#if !defined(LUAI_GCMAJOR)
+#define LUAI_GCMAJOR 200 /* 200% */
+#endif
+
+#if !defined(LUAI_GCMUL)
+#define LUAI_GCMUL 200 /* GC runs 'twice the speed' of memory allocation */
+#endif
+
+
+#define MEMERRMSG "not enough memory"
+
+
+/*
+** a macro to help the creation of a unique random seed when a state is
+** created; the seed is used to randomize hashes.
+*/
+#if !defined(luai_makeseed)
+#define luai_makeseed() cast(unsigned int, gethrtime())
+#endif
+
+
+
+/*
+** thread state + extra space
+*/
+typedef struct LX {
+#if defined(LUAI_EXTRASPACE)
+ char buff[LUAI_EXTRASPACE];
+#endif
+ lua_State l;
+} LX;
+
+
+/*
+** Main thread combines a thread state and the global state
+*/
+typedef struct LG {
+ LX l;
+ global_State g;
+} LG;
+
+
+
+#define fromstate(L) (cast(LX *, cast(lu_byte *, (L)) - offsetof(LX, l)))
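+
+
+/*
+** Illustrative note (not part of upstream Lua): fromstate() recovers the
+** enclosing LX (and, for the main thread, the LG block) from a lua_State
+** pointer, so each thread lives in a single allocation: lua_newthread()
+** allocates sizeof(LX) with the lua_State at offsetof(LX, l), and
+** close_state() frees the whole sizeof(LG) main block in one call.
+*/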
+
+
+/*
+** Compute an initial seed as random as possible. In ANSI, rely on
+** Address Space Layout Randomization (if present) to increase
+** randomness.
+*/
+#define addbuff(b,p,e) \
+ { size_t t = cast(size_t, e); \
+ memcpy(buff + p, &t, sizeof(t)); p += sizeof(t); }
+
+static unsigned int makeseed (lua_State *L) {
+ char buff[4 * sizeof(size_t)];
+ unsigned int h = luai_makeseed();
+ int p = 0;
+ addbuff(buff, p, L); /* heap variable */
+ addbuff(buff, p, &h); /* local variable */
+ addbuff(buff, p, luaO_nilobject); /* global variable */
+ addbuff(buff, p, &lua_newstate); /* public function */
+ lua_assert(p == sizeof(buff));
+ return luaS_hash(buff, p, h);
+}
+
+
+/*
+** set GCdebt to a new value keeping the value (totalbytes + GCdebt)
+** invariant
+*/
+void luaE_setdebt (global_State *g, l_mem debt) {
+ g->totalbytes -= (debt - g->GCdebt);
+ g->GCdebt = debt;
+}
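+
+
+/*
+** Illustrative note (not part of upstream Lua): gettotalbytes(g) is
+** defined as totalbytes + GCdebt, and luaE_setdebt() keeps that sum
+** constant. With totalbytes == 1000 and GCdebt == -200 (800 bytes
+** really allocated), luaE_setdebt(g, 0) leaves totalbytes == 800 and
+** GCdebt == 0; the real allocation count is unchanged.
+*/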
+
+
+CallInfo *luaE_extendCI (lua_State *L) {
+ CallInfo *ci = luaM_new(L, CallInfo);
+ lua_assert(L->ci->next == NULL);
+ L->ci->next = ci;
+ ci->previous = L->ci;
+ ci->next = NULL;
+ return ci;
+}
+
+
+void luaE_freeCI (lua_State *L) {
+ CallInfo *ci = L->ci;
+ CallInfo *next = ci->next;
+ ci->next = NULL;
+ while ((ci = next) != NULL) {
+ next = ci->next;
+ luaM_free(L, ci);
+ }
+}
+
+
+static void stack_init (lua_State *L1, lua_State *L) {
+ int i; CallInfo *ci;
+ /* initialize stack array */
+ L1->stack = luaM_newvector(L, BASIC_STACK_SIZE, TValue);
+ L1->stacksize = BASIC_STACK_SIZE;
+ for (i = 0; i < BASIC_STACK_SIZE; i++)
+ setnilvalue(L1->stack + i); /* erase new stack */
+ L1->top = L1->stack;
+ L1->stack_last = L1->stack + L1->stacksize - EXTRA_STACK;
+ /* initialize first ci */
+ ci = &L1->base_ci;
+ ci->next = ci->previous = NULL;
+ ci->callstatus = 0;
+ ci->func = L1->top;
+ setnilvalue(L1->top++); /* 'function' entry for this 'ci' */
+ ci->top = L1->top + LUA_MINSTACK;
+ L1->ci = ci;
+}
+
+
+static void freestack (lua_State *L) {
+ if (L->stack == NULL)
+ return; /* stack not completely built yet */
+ L->ci = &L->base_ci; /* free the entire 'ci' list */
+ luaE_freeCI(L);
+ luaM_freearray(L, L->stack, L->stacksize); /* free stack array */
+}
+
+
+/*
+** Create registry table and its predefined values
+*/
+static void init_registry (lua_State *L, global_State *g) {
+ TValue mt;
+ /* create registry */
+ Table *registry = luaH_new(L);
+ sethvalue(L, &g->l_registry, registry);
+ luaH_resize(L, registry, LUA_RIDX_LAST, 0);
+ /* registry[LUA_RIDX_MAINTHREAD] = L */
+ setthvalue(L, &mt, L);
+ luaH_setint(L, registry, LUA_RIDX_MAINTHREAD, &mt);
+ /* registry[LUA_RIDX_GLOBALS] = table of globals */
+ sethvalue(L, &mt, luaH_new(L));
+ luaH_setint(L, registry, LUA_RIDX_GLOBALS, &mt);
+}
+
+
+/*
+** open parts of the state that may cause memory-allocation errors
+*/
+static void f_luaopen (lua_State *L, void *ud) {
+ global_State *g = G(L);
+ UNUSED(ud);
+ stack_init(L, L); /* init stack */
+ init_registry(L, g);
+ luaS_resize(L, MINSTRTABSIZE); /* initial size of string table */
+ luaT_init(L);
+ luaX_init(L);
+ /* pre-create memory-error message */
+ g->memerrmsg = luaS_newliteral(L, MEMERRMSG);
+ luaS_fix(g->memerrmsg); /* it should never be collected */
+ g->gcrunning = 1; /* allow gc */
+ g->version = lua_version(NULL);
+ luai_userstateopen(L);
+}
+
+
+/*
+** preinitialize a state with consistent values without allocating
+** any memory (to avoid errors)
+*/
+static void preinit_state (lua_State *L, global_State *g) {
+ G(L) = g;
+ L->stack = NULL;
+ L->ci = NULL;
+ L->stacksize = 0;
+ L->errorJmp = NULL;
+ L->nCcalls = 0;
+ L->hook = NULL;
+ L->hookmask = 0;
+ L->basehookcount = 0;
+ L->allowhook = 1;
+ resethookcount(L);
+ L->openupval = NULL;
+ L->nny = 1;
+ L->status = LUA_OK;
+ L->errfunc = 0;
+ L->runerror = 0;
+}
+
+
+static void close_state (lua_State *L) {
+ global_State *g = G(L);
+ luaF_close(L, L->stack); /* close all upvalues for this thread */
+ luaC_freeallobjects(L); /* collect all objects */
+ if (g->version) /* closing a fully built state? */
+ luai_userstateclose(L);
+ luaM_freearray(L, G(L)->strt.hash, G(L)->strt.size);
+ luaZ_freebuffer(L, &g->buff);
+ freestack(L);
+ lua_assert(gettotalbytes(g) == sizeof(LG));
+ (*g->frealloc)(g->ud, fromstate(L), sizeof(LG), 0); /* free main block */
+}
+
+
+LUA_API lua_State *lua_newthread (lua_State *L) {
+ lua_State *L1;
+ lua_lock(L);
+ luaC_checkGC(L);
+ L1 = &luaC_newobj(L, LUA_TTHREAD, sizeof(LX), NULL, offsetof(LX, l))->th;
+ setthvalue(L, L->top, L1);
+ api_incr_top(L);
+ preinit_state(L1, G(L));
+ L1->hookmask = L->hookmask;
+ L1->basehookcount = L->basehookcount;
+ L1->hook = L->hook;
+ resethookcount(L1);
+ luai_userstatethread(L, L1);
+ stack_init(L1, L); /* init stack */
+ lua_unlock(L);
+ return L1;
+}
+
+
+void luaE_freethread (lua_State *L, lua_State *L1) {
+ LX *l = fromstate(L1);
+ luaF_close(L1, L1->stack); /* close all upvalues for this thread */
+ lua_assert(L1->openupval == NULL);
+ luai_userstatefree(L, L1);
+ freestack(L1);
+ luaM_free(L, l);
+}
+
+
+LUA_API lua_State *lua_newstate (lua_Alloc f, void *ud) {
+ int i;
+ lua_State *L;
+ global_State *g;
+ LG *l = cast(LG *, (*f)(ud, NULL, LUA_TTHREAD, sizeof(LG)));
+ if (l == NULL) return NULL;
+ L = &l->l.l;
+ g = &l->g;
+ L->next = NULL;
+ L->tt = LUA_TTHREAD;
+ g->currentwhite = bit2mask(WHITE0BIT, FIXEDBIT);
+ L->marked = luaC_white(g);
+ g->gckind = KGC_NORMAL;
+ preinit_state(L, g);
+ g->frealloc = f;
+ g->ud = ud;
+ g->mainthread = L;
+ g->seed = makeseed(L);
+ g->uvhead.u.l.prev = &g->uvhead;
+ g->uvhead.u.l.next = &g->uvhead;
+ g->gcrunning = 0; /* no GC while building state */
+ g->GCestimate = 0;
+ g->strt.size = 0;
+ g->strt.nuse = 0;
+ g->strt.hash = NULL;
+ setnilvalue(&g->l_registry);
+ luaZ_initbuffer(L, &g->buff);
+ g->panic = NULL;
+ g->version = NULL;
+ g->gcstate = GCSpause;
+ g->allgc = NULL;
+ g->finobj = NULL;
+ g->tobefnz = NULL;
+ g->sweepgc = g->sweepfin = NULL;
+ g->gray = g->grayagain = NULL;
+ g->weak = g->ephemeron = g->allweak = NULL;
+ g->totalbytes = sizeof(LG);
+ g->GCdebt = 0;
+ g->gcpause = LUAI_GCPAUSE;
+ g->gcmajorinc = LUAI_GCMAJOR;
+ g->gcstepmul = LUAI_GCMUL;
+ for (i=0; i < LUA_NUMTAGS; i++) g->mt[i] = NULL;
+ if (luaD_rawrunprotected(L, f_luaopen, NULL) != LUA_OK) {
+ /* memory allocation error: free partial state */
+ close_state(L);
+ L = NULL;
+ }
+ return L;
+}
+
+
+LUA_API void lua_close (lua_State *L) {
+ L = G(L)->mainthread; /* only the main thread can be closed */
+ lua_lock(L);
+ close_state(L);
+}
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/lua/lstate.h b/sys/contrib/openzfs/module/lua/lstate.h
new file mode 100644
index 000000000000..b636396a6015
--- /dev/null
+++ b/sys/contrib/openzfs/module/lua/lstate.h
@@ -0,0 +1,230 @@
+/* BEGIN CSTYLED */
+/*
+** $Id: lstate.h,v 2.82.1.1 2013/04/12 18:48:47 roberto Exp $
+** Global State
+** See Copyright Notice in lua.h
+*/
+
+#ifndef lstate_h
+#define lstate_h
+
+#include <sys/lua/lua.h>
+
+#include "lobject.h"
+#include "ltm.h"
+#include "lzio.h"
+
+
+/*
+
+** Some notes about garbage-collected objects: All objects in Lua must
+** be kept somehow accessible until being freed.
+**
+** Lua keeps most objects linked in list g->allgc. The link uses field
+** 'next' of the CommonHeader.
+**
+** Strings are kept in several lists headed by the array g->strt.hash.
+**
+** Open upvalues are not subject to independent garbage collection. They
+** are collected together with their respective threads. Lua keeps a
+** double-linked list with all open upvalues (g->uvhead) so that it can
+** mark objects referred by them. (They are always gray, so they must
+** be remarked in the atomic step. Usually their contents would be marked
+** when traversing the respective threads, but the thread may already be
+** dead, while the upvalue is still accessible through closures.)
+**
+** Objects with finalizers are kept in the list g->finobj.
+**
+** The list g->tobefnz links all objects being finalized.
+
+*/
+
+
+struct lua_longjmp; /* defined in ldo.c */
+
+
+
+/* extra stack space to handle TM calls and some other extras */
+#define EXTRA_STACK 5
+
+
+#define BASIC_STACK_SIZE (2*LUA_MINSTACK)
+
+
+/* kinds of Garbage Collection */
+#define KGC_NORMAL 0
+#define KGC_EMERGENCY 1 /* gc was forced by an allocation failure */
+#define KGC_GEN 2 /* generational collection */
+
+
+typedef struct stringtable {
+ GCObject **hash;
+ lu_int32 nuse; /* number of elements */
+ int size;
+} stringtable;
+
+
+/*
+** information about a call
+*/
+typedef struct CallInfo {
+ StkId func; /* function index in the stack */
+ StkId top; /* top for this function */
+ struct CallInfo *previous, *next; /* dynamic call link */
+ short nresults; /* expected number of results from this function */
+ lu_byte callstatus;
+ ptrdiff_t extra;
+ union {
+ struct { /* only for Lua functions */
+ StkId base; /* base for this function */
+ const Instruction *savedpc;
+ } l;
+ struct { /* only for C functions */
+ int ctx; /* context info. in case of yields */
+ lua_CFunction k; /* continuation in case of yields */
+ ptrdiff_t old_errfunc;
+ lu_byte old_allowhook;
+ lu_byte status;
+ } c;
+ } u;
+} CallInfo;
+
+
+/*
+** Bits in CallInfo status
+*/
+#define CIST_LUA (1<<0) /* call is running a Lua function */
+#define CIST_HOOKED (1<<1) /* call is running a debug hook */
+#define CIST_REENTRY (1<<2) /* call is running on same invocation of
+ luaV_execute of previous call */
+#define CIST_YIELDED (1<<3) /* call reentered after suspension */
+#define CIST_YPCALL (1<<4) /* call is a yieldable protected call */
+#define CIST_STAT (1<<5) /* call has an error status (pcall) */
+#define CIST_TAIL (1<<6) /* call was tail called */
+#define CIST_HOOKYIELD (1<<7) /* last hook called yielded */
+
+
+#define isLua(ci) ((ci)->callstatus & CIST_LUA)
+
+
+/*
+** `global state', shared by all threads of this state
+*/
+typedef struct global_State {
+ lua_Alloc frealloc; /* function to reallocate memory */
+ void *ud; /* auxiliary data to `frealloc' */
+ lu_mem totalbytes; /* number of bytes currently allocated - GCdebt */
+ l_mem GCdebt; /* bytes allocated not yet compensated by the collector */
+ lu_mem GCmemtrav; /* memory traversed by the GC */
+ lu_mem GCestimate; /* an estimate of the non-garbage memory in use */
+ stringtable strt; /* hash table for strings */
+ TValue l_registry;
+ unsigned int seed; /* randomized seed for hashes */
+ lu_byte currentwhite;
+ lu_byte gcstate; /* state of garbage collector */
+ lu_byte gckind; /* kind of GC running */
+ lu_byte gcrunning; /* true if GC is running */
+ int sweepstrgc; /* position of sweep in `strt' */
+ GCObject *allgc; /* list of all collectable objects */
+ GCObject *finobj; /* list of collectable objects with finalizers */
+ GCObject **sweepgc; /* current position of sweep in list 'allgc' */
+ GCObject **sweepfin; /* current position of sweep in list 'finobj' */
+ GCObject *gray; /* list of gray objects */
+ GCObject *grayagain; /* list of objects to be traversed atomically */
+ GCObject *weak; /* list of tables with weak values */
+ GCObject *ephemeron; /* list of ephemeron tables (weak keys) */
+ GCObject *allweak; /* list of all-weak tables */
+ GCObject *tobefnz; /* list of userdata to be GC */
+ UpVal uvhead; /* head of double-linked list of all open upvalues */
+ Mbuffer buff; /* temporary buffer for string concatenation */
+ int gcpause; /* size of pause between successive GCs */
+ int gcmajorinc; /* pause between major collections (only in gen. mode) */
+ int gcstepmul; /* GC `granularity' */
+ lua_CFunction panic; /* to be called in unprotected errors */
+ struct lua_State *mainthread;
+ const lua_Number *version; /* pointer to version number */
+ TString *memerrmsg; /* memory-error message */
+ TString *tmname[TM_N]; /* array with tag-method names */
+ struct Table *mt[LUA_NUMTAGS]; /* metatables for basic types */
+} global_State;
+
+
+/*
+** `per thread' state
+*/
+struct lua_State {
+ CommonHeader;
+ lu_byte status;
+ StkId top; /* first free slot in the stack */
+ global_State *l_G;
+ CallInfo *ci; /* call info for current function */
+ const Instruction *oldpc; /* last pc traced */
+ StkId stack_last; /* last free slot in the stack */
+ StkId stack; /* stack base */
+ int stacksize;
+ unsigned short nny; /* number of non-yieldable calls in stack */
+ unsigned short nCcalls; /* number of nested C calls */
+ lu_byte hookmask;
+ lu_byte allowhook;
+ lu_byte runerror; /* handling a runtime error */
+ int basehookcount;
+ int hookcount;
+ lua_Hook hook;
+ GCObject *openupval; /* list of open upvalues in this stack */
+ GCObject *gclist;
+ struct lua_longjmp *errorJmp; /* current error recover point */
+ ptrdiff_t errfunc; /* current error handling function (stack index) */
+ CallInfo base_ci; /* CallInfo for first level (C calling Lua) */
+};
+
+
+#define G(L) (L->l_G)
+
+
+/*
+** Union of all collectable objects
+*/
+union GCObject {
+ GCheader gch; /* common header */
+ union TString ts;
+ union Udata u;
+ union Closure cl;
+ struct Table h;
+ struct Proto p;
+ struct UpVal uv;
+ struct lua_State th; /* thread */
+};
+
+
+#define gch(o) (&(o)->gch)
+
+/* macros to convert a GCObject into a specific value */
+#define rawgco2ts(o) \
+ check_exp(novariant((o)->gch.tt) == LUA_TSTRING, &((o)->ts))
+#define gco2ts(o) (&rawgco2ts(o)->tsv)
+#define rawgco2u(o) check_exp((o)->gch.tt == LUA_TUSERDATA, &((o)->u))
+#define gco2u(o) (&rawgco2u(o)->uv)
+#define gco2lcl(o) check_exp((o)->gch.tt == LUA_TLCL, &((o)->cl.l))
+#define gco2ccl(o) check_exp((o)->gch.tt == LUA_TCCL, &((o)->cl.c))
+#define gco2cl(o) \
+ check_exp(novariant((o)->gch.tt) == LUA_TFUNCTION, &((o)->cl))
+#define gco2t(o) check_exp((o)->gch.tt == LUA_TTABLE, &((o)->h))
+#define gco2p(o) check_exp((o)->gch.tt == LUA_TPROTO, &((o)->p))
+#define gco2uv(o) check_exp((o)->gch.tt == LUA_TUPVAL, &((o)->uv))
+#define gco2th(o) check_exp((o)->gch.tt == LUA_TTHREAD, &((o)->th))
+
+/* macro to convert any Lua object into a GCObject */
+#define obj2gco(v) (cast(GCObject *, (v)))
+
+
+/* actual number of total bytes allocated */
+#define gettotalbytes(g) ((g)->totalbytes + (g)->GCdebt)
+
+LUAI_FUNC void luaE_setdebt (global_State *g, l_mem debt);
+LUAI_FUNC void luaE_freethread (lua_State *L, lua_State *L1);
+LUAI_FUNC CallInfo *luaE_extendCI (lua_State *L);
+LUAI_FUNC void luaE_freeCI (lua_State *L);
+
+
+#endif
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/lua/lstring.c b/sys/contrib/openzfs/module/lua/lstring.c
new file mode 100644
index 000000000000..7fcef3d88aa3
--- /dev/null
+++ b/sys/contrib/openzfs/module/lua/lstring.c
@@ -0,0 +1,186 @@
+/* BEGIN CSTYLED */
+/*
+** $Id: lstring.c,v 2.26.1.1 2013/04/12 18:48:47 roberto Exp $
+** String table (keeps all strings handled by Lua)
+** See Copyright Notice in lua.h
+*/
+
+
+#define lstring_c
+#define LUA_CORE
+
+#include <sys/lua/lua.h>
+
+#include "lmem.h"
+#include "lobject.h"
+#include "lstate.h"
+#include "lstring.h"
+
+
+/*
+** Lua will use at most ~(2^LUAI_HASHLIMIT) bytes from a string to
+** compute its hash
+*/
+#if !defined(LUAI_HASHLIMIT)
+#define LUAI_HASHLIMIT 5
+#endif
+
+
+/*
+** equality for long strings
+*/
+int luaS_eqlngstr (TString *a, TString *b) {
+ size_t len = a->tsv.len;
+ lua_assert(a->tsv.tt == LUA_TLNGSTR && b->tsv.tt == LUA_TLNGSTR);
+ return (a == b) || /* same instance or... */
+ ((len == b->tsv.len) && /* equal length and ... */
+ (memcmp(getstr(a), getstr(b), len) == 0)); /* equal contents */
+}
+
+
+/*
+** equality for strings
+*/
+int luaS_eqstr (TString *a, TString *b) {
+ return (a->tsv.tt == b->tsv.tt) &&
+ (a->tsv.tt == LUA_TSHRSTR ? eqshrstr(a, b) : luaS_eqlngstr(a, b));
+}
+
+
+unsigned int luaS_hash (const char *str, size_t l, unsigned int seed) {
+ unsigned int h = seed ^ cast(unsigned int, l);
+ size_t l1;
+ size_t step = (l >> LUAI_HASHLIMIT) + 1;
+ for (l1 = l; l1 >= step; l1 -= step)
+ h = h ^ ((h<<5) + (h>>2) + cast_byte(str[l1 - 1]));
+ return h;
+}
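+
+
+/*
+** Illustrative note (not part of upstream Lua): with LUAI_HASHLIMIT == 5,
+** step == (l >> 5) + 1, so every byte of a string shorter than 32 bytes
+** is mixed into the hash, while a 1 MiB string is sampled at only about
+** 32 positions (roughly one every 32 KiB), keeping hashing of very long
+** keys cheap.
+*/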
+
+
+/*
+** resizes the string table
+*/
+void luaS_resize (lua_State *L, int newsize) {
+ int i;
+ stringtable *tb = &G(L)->strt;
+ /* cannot resize while GC is traversing strings */
+ luaC_runtilstate(L, ~bitmask(GCSsweepstring));
+ if (newsize > tb->size) {
+ luaM_reallocvector(L, tb->hash, tb->size, newsize, GCObject *);
+ for (i = tb->size; i < newsize; i++) tb->hash[i] = NULL;
+ }
+ /* rehash */
+ for (i=0; i<tb->size; i++) {
+ GCObject *p = tb->hash[i];
+ tb->hash[i] = NULL;
+ while (p) { /* for each node in the list */
+ GCObject *next = gch(p)->next; /* save next */
+ unsigned int h = lmod(gco2ts(p)->hash, newsize); /* new position */
+ gch(p)->next = tb->hash[h]; /* chain it */
+ tb->hash[h] = p;
+ resetoldbit(p); /* see MOVE OLD rule */
+ p = next;
+ }
+ }
+ if (newsize < tb->size) {
+ /* shrinking slice must be empty */
+ lua_assert(tb->hash[newsize] == NULL && tb->hash[tb->size - 1] == NULL);
+ luaM_reallocvector(L, tb->hash, tb->size, newsize, GCObject *);
+ }
+ tb->size = newsize;
+}
+
+
+/*
+** creates a new string object
+*/
+static TString *createstrobj (lua_State *L, const char *str, size_t l,
+ int tag, unsigned int h, GCObject **list) {
+ TString *ts;
+ char *sbuf;
+ size_t totalsize; /* total size of TString object */
+ totalsize = sizeof(TString) + ((l + 1) * sizeof(char));
+ ts = &luaC_newobj(L, tag, totalsize, list, 0)->ts;
+ ts->tsv.len = l;
+ ts->tsv.hash = h;
+ ts->tsv.extra = 0;
+ sbuf = (char *)(TString *)(ts + 1);
+ memcpy(sbuf, str, l*sizeof(char));
+ sbuf[l] = '\0'; /* ending 0 */
+ return ts;
+}
+
+
+/*
+** creates a new short string, inserting it into string table
+*/
+static TString *newshrstr (lua_State *L, const char *str, size_t l,
+ unsigned int h) {
+ GCObject **list; /* (pointer to) list where it will be inserted */
+ stringtable *tb = &G(L)->strt;
+ TString *s;
+ if (tb->nuse >= cast(lu_int32, tb->size) && tb->size <= MAX_INT/2)
+ luaS_resize(L, tb->size*2); /* too crowded */
+ list = &tb->hash[lmod(h, tb->size)];
+ s = createstrobj(L, str, l, LUA_TSHRSTR, h, list);
+ tb->nuse++;
+ return s;
+}
+
+
+/*
+** checks whether short string exists and reuses it or creates a new one
+*/
+static TString *internshrstr (lua_State *L, const char *str, size_t l) {
+ GCObject *o;
+ global_State *g = G(L);
+ unsigned int h = luaS_hash(str, l, g->seed);
+ for (o = g->strt.hash[lmod(h, g->strt.size)];
+ o != NULL;
+ o = gch(o)->next) {
+ TString *ts = rawgco2ts(o);
+ if (h == ts->tsv.hash &&
+ l == ts->tsv.len &&
+ (memcmp(str, getstr(ts), l * sizeof(char)) == 0)) {
+ if (isdead(G(L), o)) /* string is dead (but was not collected yet)? */
+ changewhite(o); /* resurrect it */
+ return ts;
+ }
+ }
+ return newshrstr(L, str, l, h); /* not found; create a new string */
+}
+
+
+/*
+** new string (with explicit length)
+*/
+TString *luaS_newlstr (lua_State *L, const char *str, size_t l) {
+ if (l <= LUAI_MAXSHORTLEN) /* short string? */
+ return internshrstr(L, str, l);
+ else {
+ if (l + 1 > (MAX_SIZET - sizeof(TString))/sizeof(char))
+ luaM_toobig(L);
+ return createstrobj(L, str, l, LUA_TLNGSTR, G(L)->seed, NULL);
+ }
+}
+
+
+/*
+** new zero-terminated string
+*/
+TString *luaS_new (lua_State *L, const char *str) {
+ return luaS_newlstr(L, str, strlen(str));
+}
+
+
+Udata *luaS_newudata (lua_State *L, size_t s, Table *e) {
+ Udata *u;
+ if (s > MAX_SIZET - sizeof(Udata))
+ luaM_toobig(L);
+ u = &luaC_newobj(L, LUA_TUSERDATA, sizeof(Udata) + s, NULL, 0)->u;
+ u->uv.len = s;
+ u->uv.metatable = NULL;
+ u->uv.env = e;
+ return u;
+}
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/lua/lstring.h b/sys/contrib/openzfs/module/lua/lstring.h
new file mode 100644
index 000000000000..66e65379b8e7
--- /dev/null
+++ b/sys/contrib/openzfs/module/lua/lstring.h
@@ -0,0 +1,48 @@
+/* BEGIN CSTYLED */
+/*
+** $Id: lstring.h,v 1.49.1.1 2013/04/12 18:48:47 roberto Exp $
+** String table (keep all strings handled by Lua)
+** See Copyright Notice in lua.h
+*/
+
+#ifndef lstring_h
+#define lstring_h
+
+#include "lgc.h"
+#include "lobject.h"
+#include "lstate.h"
+
+
+#define sizestring(s) (sizeof(union TString)+((s)->len+1)*sizeof(char))
+
+#define sizeudata(u) (sizeof(union Udata)+(u)->len)
+
+#define luaS_newliteral(L, s) (luaS_newlstr(L, "" s, \
+ (sizeof(s)/sizeof(char))-1))
+
+#define luaS_fix(s) l_setbit((s)->tsv.marked, FIXEDBIT)
+
+
+/*
+** test whether a string is a reserved word
+*/
+#define isreserved(s) ((s)->tsv.tt == LUA_TSHRSTR && (s)->tsv.extra > 0)
+
+
+/*
+** equality for short strings, which are always internalized
+*/
+#define eqshrstr(a,b) check_exp((a)->tsv.tt == LUA_TSHRSTR, (a) == (b))
+
+
+LUAI_FUNC unsigned int luaS_hash (const char *str, size_t l, unsigned int seed);
+LUAI_FUNC int luaS_eqlngstr (TString *a, TString *b);
+LUAI_FUNC int luaS_eqstr (TString *a, TString *b);
+LUAI_FUNC void luaS_resize (lua_State *L, int newsize);
+LUAI_FUNC Udata *luaS_newudata (lua_State *L, size_t s, Table *e);
+LUAI_FUNC TString *luaS_newlstr (lua_State *L, const char *str, size_t l);
+LUAI_FUNC TString *luaS_new (lua_State *L, const char *str);
+
+
+#endif
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/lua/lstrlib.c b/sys/contrib/openzfs/module/lua/lstrlib.c
new file mode 100644
index 000000000000..12027757bf53
--- /dev/null
+++ b/sys/contrib/openzfs/module/lua/lstrlib.c
@@ -0,0 +1,1040 @@
+/* BEGIN CSTYLED */
+/*
+** $Id: lstrlib.c,v 1.178.1.1 2013/04/12 18:48:47 roberto Exp $
+** Standard library for string operations and pattern-matching
+** See Copyright Notice in lua.h
+*/
+
+
+#define lstrlib_c
+#define LUA_LIB
+
+#include <sys/lua/lua.h>
+
+#include <sys/lua/lauxlib.h>
+#include <sys/lua/lualib.h>
+
+
+/*
+** maximum number of captures that a pattern can do during
+** pattern-matching. This limit is arbitrary.
+*/
+#if !defined(LUA_MAXCAPTURES)
+#define LUA_MAXCAPTURES 16
+#endif
+
+
+/* macro to `unsign' a character */
+#define uchar(c) ((unsigned char)(c))
+
+/*
+ * The provided version of sprintf returns a char *, but str_format expects
+ * it to return the number of characters printed. This version has the expected
+ * behavior.
+ */
+static size_t str_sprintf(char *buf, const char *fmt, ...) {
+ va_list args;
+ size_t len;
+
+ va_start(args, fmt);
+ len = vsnprintf(buf, INT_MAX, fmt, args);
+ va_end(args);
+
+ return len;
+}
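This wrapper matters because str_format below advances the output buffer by the return value (via luaL_addsize). A tiny userland check of the property being relied on, namely that vsnprintf returns the number of characters it produced; the demo is illustrative only and not part of the module.

#include <stdarg.h>
#include <stdio.h>

static int count_sprintf(char *buf, size_t cap, const char *fmt, ...) {
    va_list ap;
    int len;
    va_start(ap, fmt);
    len = vsnprintf(buf, cap, fmt, ap);  /* returns characters printed */
    va_end(ap);
    return len;
}

int main(void) {
    char buf[64];
    int n = count_sprintf(buf, sizeof(buf), "%5d|%s", 42, "zfs");
    printf("wrote %d chars: \"%s\"\n", n, buf);  /* wrote 9 chars: "   42|zfs" */
    return 0;
}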
+
+
+static int str_len (lua_State *L) {
+ size_t l;
+ luaL_checklstring(L, 1, &l);
+ lua_pushinteger(L, (lua_Integer)l);
+ return 1;
+}
+
+
+/* translate a relative string position: negative means back from end */
+static size_t posrelat (ptrdiff_t pos, size_t len) {
+ if (pos >= 0) return (size_t)pos;
+ else if (0u - (size_t)pos > len) return 0;
+ else return len - ((size_t)-pos) + 1;
+}
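For a string of length 5 this maps positions as follows: 3 stays 3, -1 becomes 5 (the last byte), -5 becomes 1, and -9 clamps to 0 (callers such as str_sub then clamp 0 up to 1). A standalone copy that prints exactly those cases:

#include <stdio.h>
#include <stddef.h>

static size_t posrelat(ptrdiff_t pos, size_t len) {
    if (pos >= 0) return (size_t)pos;
    else if (0u - (size_t)pos > len) return 0;
    else return len - ((size_t)-pos) + 1;
}

int main(void) {
    ptrdiff_t cases[] = { 3, -1, -5, -9 };
    size_t i;
    for (i = 0; i < sizeof(cases) / sizeof(cases[0]); i++)
        printf("posrelat(%td, 5) = %zu\n", cases[i], posrelat(cases[i], 5));
    return 0;  /* prints 3, 5, 1, 0 */
}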
+
+
+static int str_sub (lua_State *L) {
+ size_t l;
+ const char *s = luaL_checklstring(L, 1, &l);
+ size_t start = posrelat(luaL_checkinteger(L, 2), l);
+ size_t end = posrelat(luaL_optinteger(L, 3, -1), l);
+ if (start < 1) start = 1;
+ if (end > l) end = l;
+ if (start <= end)
+ lua_pushlstring(L, s + start - 1, end - start + 1);
+ else lua_pushliteral(L, "");
+ return 1;
+}
+
+
+static int str_reverse (lua_State *L) {
+ size_t l, i;
+ luaL_Buffer b;
+ const char *s = luaL_checklstring(L, 1, &l);
+ char *p = luaL_buffinitsize(L, &b, l);
+ for (i = 0; i < l; i++)
+ p[i] = s[l - i - 1];
+ luaL_pushresultsize(&b, l);
+ return 1;
+}
+
+
+static int str_lower (lua_State *L) {
+ size_t l;
+ size_t i;
+ luaL_Buffer b;
+ const char *s = luaL_checklstring(L, 1, &l);
+ char *p = luaL_buffinitsize(L, &b, l);
+ for (i=0; i<l; i++)
+ p[i] = tolower(uchar(s[i]));
+ luaL_pushresultsize(&b, l);
+ return 1;
+}
+
+
+static int str_upper (lua_State *L) {
+ size_t l;
+ size_t i;
+ luaL_Buffer b;
+ const char *s = luaL_checklstring(L, 1, &l);
+ char *p = luaL_buffinitsize(L, &b, l);
+ for (i=0; i<l; i++)
+ p[i] = toupper(uchar(s[i]));
+ luaL_pushresultsize(&b, l);
+ return 1;
+}
+
+
+/* reasonable limit to avoid arithmetic overflow */
+#define MAXSIZE ((~(size_t)0) >> 1)
+
+static int str_rep (lua_State *L) {
+ size_t l, lsep;
+ const char *s = luaL_checklstring(L, 1, &l);
+ int n = luaL_checkint(L, 2);
+ const char *sep = luaL_optlstring(L, 3, "", &lsep);
+ if (n <= 0) lua_pushliteral(L, "");
+ else if (l + lsep < l || l + lsep >= MAXSIZE / n) /* may overflow? */
+ return luaL_error(L, "resulting string too large");
+ else {
+ size_t totallen = n * l + (n - 1) * lsep;
+ luaL_Buffer b;
+ char *p = luaL_buffinitsize(L, &b, totallen);
+ while (n-- > 1) { /* first n-1 copies (followed by separator) */
+ memcpy(p, s, l * sizeof(char)); p += l;
+ if (lsep > 0) { /* avoid empty 'memcpy' (may be expensive) */
+ memcpy(p, sep, lsep * sizeof(char)); p += lsep;
+ }
+ }
+ memcpy(p, s, l * sizeof(char)); /* last copy (not followed by separator) */
+ luaL_pushresultsize(&b, totallen);
+ }
+ return 1;
+}
+
+
+static int str_byte (lua_State *L) {
+ size_t l;
+ const char *s = luaL_checklstring(L, 1, &l);
+ size_t posi = posrelat(luaL_optinteger(L, 2, 1), l);
+ size_t pose = posrelat(luaL_optinteger(L, 3, posi), l);
+ int n, i;
+ if (posi < 1) posi = 1;
+ if (pose > l) pose = l;
+ if (posi > pose) return 0; /* empty interval; return no values */
+ n = (int)(pose - posi + 1);
+ if (posi + n <= pose) /* (size_t -> int) overflow? */
+ return luaL_error(L, "string slice too long");
+ luaL_checkstack(L, n, "string slice too long");
+ for (i=0; i<n; i++)
+ lua_pushinteger(L, uchar(s[posi+i-1]));
+ return n;
+}
+
+
+static int str_char (lua_State *L) {
+ int n = lua_gettop(L); /* number of arguments */
+ int i;
+ luaL_Buffer b;
+ char *p = luaL_buffinitsize(L, &b, n);
+ for (i=1; i<=n; i++) {
+ int c = luaL_checkint(L, i);
+ luaL_argcheck(L, uchar(c) == c, i, "value out of range");
+ p[i - 1] = uchar(c);
+ }
+ luaL_pushresultsize(&b, n);
+ return 1;
+}
+
+
+#if defined(LUA_USE_DUMP)
+static int writer (lua_State *L, const void* b, size_t size, void* B) {
+ (void)L;
+ luaL_addlstring((luaL_Buffer*) B, (const char *)b, size);
+ return 0;
+}
+
+
+static int str_dump (lua_State *L) {
+ luaL_Buffer b;
+ luaL_checktype(L, 1, LUA_TFUNCTION);
+ lua_settop(L, 1);
+ luaL_buffinit(L,&b);
+ if (lua_dump(L, writer, &b) != 0)
+ return luaL_error(L, "unable to dump given function");
+ luaL_pushresult(&b);
+ return 1;
+}
+#endif
+
+
+/*
+** {======================================================
+** PATTERN MATCHING
+** =======================================================
+*/
+
+
+#define CAP_UNFINISHED (-1)
+#define CAP_POSITION (-2)
+
+
+typedef struct MatchState {
+ int matchdepth; /* control for recursive depth (to avoid C stack overflow) */
+ const char *src_init; /* init of source string */
+ const char *src_end; /* end ('\0') of source string */
+ const char *p_end; /* end ('\0') of pattern */
+ lua_State *L;
+ int level; /* total number of captures (finished or unfinished) */
+ struct {
+ const char *init;
+ ptrdiff_t len;
+ } capture[LUA_MAXCAPTURES];
+} MatchState;
+
+
+/* recursive function */
+static const char *match (MatchState *ms, const char *s, const char *p);
+
+
+/* maximum recursion depth for 'match' */
+#if !defined(MAXCCALLS)
+#define MAXCCALLS 200
+#endif
+
+
+#define L_ESC '%'
+#define SPECIALS "^$*+?.([%-"
+
+
+static int check_capture (MatchState *ms, int l) {
+ l -= '1';
+ if (l < 0 || l >= ms->level || ms->capture[l].len == CAP_UNFINISHED)
+ return luaL_error(ms->L, "invalid capture index %%%d", l + 1);
+ return l;
+}
+
+
+static int capture_to_close (MatchState *ms) {
+ int level = ms->level;
+ for (level--; level>=0; level--)
+ if (ms->capture[level].len == CAP_UNFINISHED) return level;
+ return luaL_error(ms->L, "invalid pattern capture");
+}
+
+
+static const char *classend (MatchState *ms, const char *p) {
+ switch (*p++) {
+ case L_ESC: {
+ if (p == ms->p_end)
+ luaL_error(ms->L, "malformed pattern (ends with " LUA_QL("%%") ")");
+ return p+1;
+ }
+ case '[': {
+ if (*p == '^') p++;
+ do { /* look for a `]' */
+ if (p == ms->p_end)
+ luaL_error(ms->L, "malformed pattern (missing " LUA_QL("]") ")");
+ if (*(p++) == L_ESC && p < ms->p_end)
+ p++; /* skip escapes (e.g. `%]') */
+ } while (*p != ']');
+ return p+1;
+ }
+ default: {
+ return p;
+ }
+ }
+}
+
+
+static int match_class (int c, int cl) {
+ int res;
+ switch (tolower(cl)) {
+ case 'a' : res = isalpha(c); break;
+ case 'c' : res = iscntrl(c); break;
+ case 'd' : res = isdigit(c); break;
+ case 'g' : res = isgraph(c); break;
+ case 'l' : res = islower(c); break;
+ case 'p' : res = ispunct(c); break;
+ case 's' : res = isspace(c); break;
+ case 'u' : res = isupper(c); break;
+ case 'w' : res = isalnum(c); break;
+ case 'x' : res = isxdigit(c); break;
+ case 'z' : res = (c == 0); break; /* deprecated option */
+ default: return (cl == c);
+ }
+ return (islower(cl) ? res : !res);
+}
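The lowercase/uppercase trick is what makes '%a' mean "letter" and '%A' mean "not a letter". A standalone replica exercising a few of the classes above (a hypothetical test program, mirroring the switch but not the full set of classes):

#include <ctype.h>
#include <stdio.h>

/* same dispatch as match_class: a lowercase letter selects the class,
   the uppercase letter selects its complement */
static int match_class(int c, int cl) {
    int res;
    switch (tolower(cl)) {
    case 'a': res = isalpha(c); break;
    case 'd': res = isdigit(c); break;
    case 's': res = isspace(c); break;
    default: return (cl == c);  /* anything else matches literally */
    }
    return (islower(cl) ? res : !res);
}

int main(void) {
    printf("%%a vs 'k': %d\n", match_class('k', 'a') != 0);  /* 1 */
    printf("%%A vs 'k': %d\n", match_class('k', 'A') != 0);  /* 0 */
    printf("%%d vs '7': %d\n", match_class('7', 'd') != 0);  /* 1 */
    printf("%%D vs '7': %d\n", match_class('7', 'D') != 0);  /* 0 */
    return 0;
}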
+
+
+static int matchbracketclass (int c, const char *p, const char *ec) {
+ int sig = 1;
+ if (*(p+1) == '^') {
+ sig = 0;
+ p++; /* skip the `^' */
+ }
+ while (++p < ec) {
+ if (*p == L_ESC) {
+ p++;
+ if (match_class(c, uchar(*p)))
+ return sig;
+ }
+ else if ((*(p+1) == '-') && (p+2 < ec)) {
+ p+=2;
+ if (uchar(*(p-2)) <= c && c <= uchar(*p))
+ return sig;
+ }
+ else if (uchar(*p) == c) return sig;
+ }
+ return !sig;
+}
+
+
+static int singlematch (MatchState *ms, const char *s, const char *p,
+ const char *ep) {
+ if (s >= ms->src_end)
+ return 0;
+ else {
+ int c = uchar(*s);
+ switch (*p) {
+ case '.': return 1; /* matches any char */
+ case L_ESC: return match_class(c, uchar(*(p+1)));
+ case '[': return matchbracketclass(c, p, ep-1);
+ default: return (uchar(*p) == c);
+ }
+ }
+}
+
+
+static const char *matchbalance (MatchState *ms, const char *s,
+ const char *p) {
+ if (p >= ms->p_end - 1)
+ luaL_error(ms->L, "malformed pattern "
+ "(missing arguments to " LUA_QL("%%b") ")");
+ if (*s != *p) return NULL;
+ else {
+ int b = *p;
+ int e = *(p+1);
+ int cont = 1;
+ while (++s < ms->src_end) {
+ if (*s == e) {
+ if (--cont == 0) return s+1;
+ }
+ else if (*s == b) cont++;
+ }
+ }
+ return NULL; /* string ends out of balance */
+}
+
+
+static const char *max_expand (MatchState *ms, const char *s,
+ const char *p, const char *ep) {
+ ptrdiff_t i = 0; /* counts maximum expand for item */
+ while (singlematch(ms, s + i, p, ep))
+ i++;
+ /* keeps trying to match with the maximum repetitions */
+ while (i>=0) {
+ const char *res = match(ms, (s+i), ep+1);
+ if (res) return res;
+ i--; /* else didn't match; reduce 1 repetition to try again */
+ }
+ return NULL;
+}
+
+
+static const char *min_expand (MatchState *ms, const char *s,
+ const char *p, const char *ep) {
+ for (;;) {
+ const char *res = match(ms, s, ep+1);
+ if (res != NULL)
+ return res;
+ else if (singlematch(ms, s, p, ep))
+ s++; /* try with one more repetition */
+ else return NULL;
+ }
+}
+
+
+static const char *start_capture (MatchState *ms, const char *s,
+ const char *p, int what) {
+ const char *res;
+ int level = ms->level;
+ if (level >= LUA_MAXCAPTURES) luaL_error(ms->L, "too many captures");
+ ms->capture[level].init = s;
+ ms->capture[level].len = what;
+ ms->level = level+1;
+ if ((res=match(ms, s, p)) == NULL) /* match failed? */
+ ms->level--; /* undo capture */
+ return res;
+}
+
+
+static const char *end_capture (MatchState *ms, const char *s,
+ const char *p) {
+ int l = capture_to_close(ms);
+ const char *res;
+ ms->capture[l].len = s - ms->capture[l].init; /* close capture */
+ if ((res = match(ms, s, p)) == NULL) /* match failed? */
+ ms->capture[l].len = CAP_UNFINISHED; /* undo capture */
+ return res;
+}
+
+
+static const char *match_capture (MatchState *ms, const char *s, int l) {
+ size_t len;
+ l = check_capture(ms, l);
+ len = ms->capture[l].len;
+ if ((size_t)(ms->src_end-s) >= len &&
+ memcmp(ms->capture[l].init, s, len) == 0)
+ return s+len;
+ else return NULL;
+}
+
+
+static const char *match (MatchState *ms, const char *s, const char *p) {
+ if (ms->matchdepth-- == 0)
+ luaL_error(ms->L, "pattern too complex");
+ init: /* using goto's to optimize tail recursion */
+ if (p != ms->p_end) { /* end of pattern? */
+ switch (*p) {
+ case '(': { /* start capture */
+ if (*(p + 1) == ')') /* position capture? */
+ s = start_capture(ms, s, p + 2, CAP_POSITION);
+ else
+ s = start_capture(ms, s, p + 1, CAP_UNFINISHED);
+ break;
+ }
+ case ')': { /* end capture */
+ s = end_capture(ms, s, p + 1);
+ break;
+ }
+ case '$': {
+ if ((p + 1) != ms->p_end) /* is the `$' the last char in pattern? */
+ goto dflt; /* no; go to default */
+ s = (s == ms->src_end) ? s : NULL; /* check end of string */
+ break;
+ }
+ case L_ESC: { /* escaped sequences not in the format class[*+?-]? */
+ switch (*(p + 1)) {
+ case 'b': { /* balanced string? */
+ s = matchbalance(ms, s, p + 2);
+ if (s != NULL) {
+ p += 4; goto init; /* return match(ms, s, p + 4); */
+ } /* else fail (s == NULL) */
+ break;
+ }
+ case 'f': { /* frontier? */
+ const char *ep; char previous;
+ p += 2;
+ if (*p != '[')
+ luaL_error(ms->L, "missing " LUA_QL("[") " after "
+ LUA_QL("%%f") " in pattern");
+ ep = classend(ms, p); /* points to what is next */
+ previous = (s == ms->src_init) ? '\0' : *(s - 1);
+ if (!matchbracketclass(uchar(previous), p, ep - 1) &&
+ matchbracketclass(uchar(*s), p, ep - 1)) {
+ p = ep; goto init; /* return match(ms, s, ep); */
+ }
+ s = NULL; /* match failed */
+ break;
+ }
+ case '0': case '1': case '2': case '3':
+ case '4': case '5': case '6': case '7':
+ case '8': case '9': { /* capture results (%0-%9)? */
+ s = match_capture(ms, s, uchar(*(p + 1)));
+ if (s != NULL) {
+ p += 2; goto init; /* return match(ms, s, p + 2) */
+ }
+ break;
+ }
+ default: goto dflt;
+ }
+ break;
+ }
+ default: dflt: { /* pattern class plus optional suffix */
+ const char *ep = classend(ms, p); /* points to optional suffix */
+ /* does not match at least once? */
+ if (!singlematch(ms, s, p, ep)) {
+ if (*ep == '*' || *ep == '?' || *ep == '-') { /* accept empty? */
+ p = ep + 1; goto init; /* return match(ms, s, ep + 1); */
+ }
+ else /* '+' or no suffix */
+ s = NULL; /* fail */
+ }
+ else { /* matched once */
+ switch (*ep) { /* handle optional suffix */
+ case '?': { /* optional */
+ const char *res;
+ if ((res = match(ms, s + 1, ep + 1)) != NULL)
+ s = res;
+ else {
+ p = ep + 1; goto init; /* else return match(ms, s, ep + 1); */
+ }
+ break;
+ }
+ case '+': /* 1 or more repetitions */
+ s++; /* 1 match already done */
+ /* FALLTHROUGH */
+ case '*': /* 0 or more repetitions */
+ s = max_expand(ms, s, p, ep);
+ break;
+ case '-': /* 0 or more repetitions (minimum) */
+ s = min_expand(ms, s, p, ep);
+ break;
+ default: /* no suffix */
+ s++; p = ep; goto init; /* return match(ms, s + 1, ep); */
+ }
+ }
+ break;
+ }
+ }
+ }
+ ms->matchdepth++;
+ return s;
+}
+
+
+
+static const char *lmemfind (const char *s1, size_t l1,
+ const char *s2, size_t l2) {
+ if (l2 == 0) return s1; /* empty strings are everywhere */
+ else if (l2 > l1) return NULL; /* avoids a negative `l1' */
+ else {
+ const char *init; /* to search for a `*s2' inside `s1' */
+ l2--; /* 1st char will be checked by `memchr' */
+ l1 = l1-l2; /* `s2' cannot be found after that */
+ while (l1 > 0 && (init = (const char *)memchr(s1, *s2, l1)) != NULL) {
+ init++; /* 1st char is already checked */
+ if (memcmp(init, s2+1, l2) == 0)
+ return init-1;
+ else { /* correct `l1' and `s1' to try again */
+ l1 -= init-s1;
+ s1 = init;
+ }
+ }
+ return NULL; /* not found */
+ }
+}
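lmemfind is the plain (non-pattern) search used by str_find_aux when the caller passes a true fourth argument or the pattern contains no specials. A quick userland check of its result against strstr, using a local copy of the helper (illustrative only):

#include <stdio.h>
#include <string.h>

/* local copy of the helper above, for a userland comparison */
static const char *lmemfind(const char *s1, size_t l1,
                            const char *s2, size_t l2) {
    if (l2 == 0) return s1;
    else if (l2 > l1) return NULL;
    else {
        const char *init;
        l2--;
        l1 = l1 - l2;
        while (l1 > 0 && (init = (const char *)memchr(s1, *s2, l1)) != NULL) {
            init++;
            if (memcmp(init, s2 + 1, l2) == 0)
                return init - 1;
            else {
                l1 -= init - s1;
                s1 = init;
            }
        }
        return NULL;
    }
}

int main(void) {
    const char *hay = "zfs send | zfs receive";
    const char *hit = lmemfind(hay, strlen(hay), "receive", 7);
    printf("offset %td (strstr says %td)\n",
        hit - hay, strstr(hay, "receive") - hay);
    return 0;  /* both print offset 15 */
}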
+
+
+static void push_onecapture (MatchState *ms, int i, const char *s,
+ const char *e) {
+ if (i >= ms->level) {
+ if (i == 0) /* ms->level == 0, too */
+ lua_pushlstring(ms->L, s, e - s); /* add whole match */
+ else
+ luaL_error(ms->L, "invalid capture index");
+ }
+ else {
+ ptrdiff_t l = ms->capture[i].len;
+ if (l == CAP_UNFINISHED) luaL_error(ms->L, "unfinished capture");
+ if (l == CAP_POSITION)
+ lua_pushinteger(ms->L, ms->capture[i].init - ms->src_init + 1);
+ else
+ lua_pushlstring(ms->L, ms->capture[i].init, l);
+ }
+}
+
+
+static int push_captures (MatchState *ms, const char *s, const char *e) {
+ int i;
+ int nlevels = (ms->level == 0 && s) ? 1 : ms->level;
+ luaL_checkstack(ms->L, nlevels, "too many captures");
+ for (i = 0; i < nlevels; i++)
+ push_onecapture(ms, i, s, e);
+ return nlevels; /* number of strings pushed */
+}
+
+
+/* check whether pattern has no special characters */
+static int nospecials (const char *p, size_t l) {
+ size_t upto = 0;
+ do {
+ if (strpbrk(p + upto, SPECIALS))
+ return 0; /* pattern has a special character */
+ upto += strlen(p + upto) + 1; /* may have more after \0 */
+ } while (upto <= l);
+ return 1; /* no special chars found */
+}
+
+
+static int str_find_aux (lua_State *L, int find) {
+ size_t ls, lp;
+ const char *s = luaL_checklstring(L, 1, &ls);
+ const char *p = luaL_checklstring(L, 2, &lp);
+ size_t init = posrelat(luaL_optinteger(L, 3, 1), ls);
+ if (init < 1) init = 1;
+ else if (init > ls + 1) { /* start after string's end? */
+ lua_pushnil(L); /* cannot find anything */
+ return 1;
+ }
+ /* explicit request or no special characters? */
+ if (find && (lua_toboolean(L, 4) || nospecials(p, lp))) {
+ /* do a plain search */
+ const char *s2 = lmemfind(s + init - 1, ls - init + 1, p, lp);
+ if (s2) {
+ lua_pushinteger(L, s2 - s + 1);
+ lua_pushinteger(L, s2 - s + lp);
+ return 2;
+ }
+ }
+ else {
+ MatchState ms;
+ const char *s1 = s + init - 1;
+ int anchor = (*p == '^');
+ if (anchor) {
+ p++; lp--; /* skip anchor character */
+ }
+ ms.L = L;
+ ms.matchdepth = MAXCCALLS;
+ ms.src_init = s;
+ ms.src_end = s + ls;
+ ms.p_end = p + lp;
+ do {
+ const char *res;
+ ms.level = 0;
+ lua_assert(ms.matchdepth == MAXCCALLS);
+ if ((res=match(&ms, s1, p)) != NULL) {
+ if (find) {
+ lua_pushinteger(L, s1 - s + 1); /* start */
+ lua_pushinteger(L, res - s); /* end */
+ return push_captures(&ms, NULL, 0) + 2;
+ }
+ else
+ return push_captures(&ms, s1, res);
+ }
+ } while (s1++ < ms.src_end && !anchor);
+ }
+ lua_pushnil(L); /* not found */
+ return 1;
+}
+
+
+static int str_find (lua_State *L) {
+ return str_find_aux(L, 1);
+}
+
+
+static int str_match (lua_State *L) {
+ return str_find_aux(L, 0);
+}
+
+
+static int gmatch_aux (lua_State *L) {
+ MatchState ms;
+ size_t ls, lp;
+ const char *s = lua_tolstring(L, lua_upvalueindex(1), &ls);
+ const char *p = lua_tolstring(L, lua_upvalueindex(2), &lp);
+ const char *src;
+ ms.L = L;
+ ms.matchdepth = MAXCCALLS;
+ ms.src_init = s;
+ ms.src_end = s+ls;
+ ms.p_end = p + lp;
+ for (src = s + (size_t)lua_tointeger(L, lua_upvalueindex(3));
+ src <= ms.src_end;
+ src++) {
+ const char *e;
+ ms.level = 0;
+ lua_assert(ms.matchdepth == MAXCCALLS);
+ if ((e = match(&ms, src, p)) != NULL) {
+ lua_Integer newstart = e-s;
+ if (e == src) newstart++; /* empty match? go at least one position */
+ lua_pushinteger(L, newstart);
+ lua_replace(L, lua_upvalueindex(3));
+ return push_captures(&ms, src, e);
+ }
+ }
+ return 0; /* not found */
+}
+
+
+static int str_gmatch (lua_State *L) {
+ luaL_checkstring(L, 1);
+ luaL_checkstring(L, 2);
+ lua_settop(L, 2);
+ lua_pushinteger(L, 0);
+ lua_pushcclosure(L, gmatch_aux, 3);
+ return 1;
+}
+
+
+static void add_s (MatchState *ms, luaL_Buffer *b, const char *s,
+ const char *e) {
+ size_t l, i;
+ const char *news = lua_tolstring(ms->L, 3, &l);
+ for (i = 0; i < l; i++) {
+ if (news[i] != L_ESC)
+ luaL_addchar(b, news[i]);
+ else {
+ i++; /* skip ESC */
+ if (!isdigit(uchar(news[i]))) {
+ if (news[i] != L_ESC)
+ luaL_error(ms->L, "invalid use of " LUA_QL("%c")
+ " in replacement string", L_ESC);
+ luaL_addchar(b, news[i]);
+ }
+ else if (news[i] == '0')
+ luaL_addlstring(b, s, e - s);
+ else {
+ push_onecapture(ms, news[i] - '1', s, e);
+ luaL_addvalue(b); /* add capture to accumulated result */
+ }
+ }
+ }
+}
+
+
+static void add_value (MatchState *ms, luaL_Buffer *b, const char *s,
+ const char *e, int tr) {
+ lua_State *L = ms->L;
+ switch (tr) {
+ case LUA_TFUNCTION: {
+ int n;
+ lua_pushvalue(L, 3);
+ n = push_captures(ms, s, e);
+ lua_call(L, n, 1);
+ break;
+ }
+ case LUA_TTABLE: {
+ push_onecapture(ms, 0, s, e);
+ lua_gettable(L, 3);
+ break;
+ }
+ default: { /* LUA_TNUMBER or LUA_TSTRING */
+ add_s(ms, b, s, e);
+ return;
+ }
+ }
+ if (!lua_toboolean(L, -1)) { /* nil or false? */
+ lua_pop(L, 1);
+ lua_pushlstring(L, s, e - s); /* keep original text */
+ }
+ else if (!lua_isstring(L, -1))
+ luaL_error(L, "invalid replacement value (a %s)", luaL_typename(L, -1));
+ luaL_addvalue(b); /* add result to accumulator */
+}
+
+
+static int str_gsub (lua_State *L) {
+ size_t srcl, lp;
+ const char *src = luaL_checklstring(L, 1, &srcl);
+ const char *p = luaL_checklstring(L, 2, &lp);
+ int tr = lua_type(L, 3);
+ size_t max_s = luaL_optinteger(L, 4, srcl+1);
+ int anchor = (*p == '^');
+ size_t n = 0;
+ MatchState ms;
+ luaL_Buffer b;
+ luaL_argcheck(L, tr == LUA_TNUMBER || tr == LUA_TSTRING ||
+ tr == LUA_TFUNCTION || tr == LUA_TTABLE, 3,
+ "string/function/table expected");
+ luaL_buffinit(L, &b);
+ if (anchor) {
+ p++; lp--; /* skip anchor character */
+ }
+ ms.L = L;
+ ms.matchdepth = MAXCCALLS;
+ ms.src_init = src;
+ ms.src_end = src+srcl;
+ ms.p_end = p + lp;
+ while (n < max_s) {
+ const char *e;
+ ms.level = 0;
+ lua_assert(ms.matchdepth == MAXCCALLS);
+ e = match(&ms, src, p);
+ if (e) {
+ n++;
+ add_value(&ms, &b, src, e, tr);
+ }
+ if (e && e>src) /* non empty match? */
+ src = e; /* skip it */
+ else if (src < ms.src_end)
+ luaL_addchar(&b, *src++);
+ else break;
+ if (anchor) break;
+ }
+ luaL_addlstring(&b, src, ms.src_end-src);
+ luaL_pushresult(&b);
+ lua_pushinteger(L, n); /* number of substitutions */
+ return 2;
+}
+
+/* }====================================================== */
+
+
+
+/*
+** {======================================================
+** STRING FORMAT
+** =======================================================
+*/
+
+/*
+** LUA_INTFRMLEN is the length modifier for integer conversions in
+** 'string.format'; LUA_INTFRM_T is the integer type corresponding to
+** the previous length
+*/
+#if !defined(LUA_INTFRMLEN) /* { */
+#if defined(LUA_USE_LONGLONG)
+
+#define LUA_INTFRMLEN "ll"
+#define LUA_INTFRM_T long long
+
+#else
+
+#define LUA_INTFRMLEN "l"
+#define LUA_INTFRM_T long
+
+#endif
+#endif /* } */
+
+
+/*
+** LUA_FLTFRMLEN is the length modifier for float conversions in
+** 'string.format'; LUA_FLTFRM_T is the float type corresponding to
+** the previous length
+*/
+#if !defined(LUA_FLTFRMLEN)
+
+#define LUA_FLTFRMLEN ""
+#define LUA_FLTFRM_T double
+
+#endif
+
+
+/* maximum size of each formatted item (> len(format('%99.99f', -1e308))) */
+#define MAX_ITEM 512
+/* valid flags in a format specification */
+#define FLAGS "-+ #0"
+/*
+** maximum size of each format specification (such as '%-099.99d')
+** (+10 accounts for %99.99x plus margin of error)
+*/
+#define MAX_FORMAT (sizeof(FLAGS) + sizeof(LUA_INTFRMLEN) + 10)
+
+
+static void addquoted (lua_State *L, luaL_Buffer *b, int arg) {
+ size_t l;
+ const char *s = luaL_checklstring(L, arg, &l);
+ luaL_addchar(b, '"');
+ while (l--) {
+ if (*s == '"' || *s == '\\' || *s == '\n') {
+ luaL_addchar(b, '\\');
+ luaL_addchar(b, *s);
+ }
+ else if (*s == '\0' || iscntrl(uchar(*s))) {
+ char buff[10];
+ if (!isdigit(uchar(*(s+1))))
+ snprintf(buff, sizeof(buff), "\\%d", (int)uchar(*s));
+ else
+ snprintf(buff, sizeof(buff), "\\%03d", (int)uchar(*s));
+ luaL_addstring(b, buff);
+ }
+ else
+ luaL_addchar(b, *s);
+ s++;
+ }
+ luaL_addchar(b, '"');
+}
+
+static const char *scanformat (lua_State *L, const char *strfrmt, char *form) {
+ const char *p = strfrmt;
+ while (*p != '\0' && strchr(FLAGS, *p) != NULL) p++; /* skip flags */
+ if ((size_t)(p - strfrmt) >= sizeof(FLAGS)/sizeof(char))
+ luaL_error(L, "invalid format (repeated flags)");
+ if (isdigit(uchar(*p))) p++; /* skip width */
+ if (isdigit(uchar(*p))) p++; /* (2 digits at most) */
+ if (*p == '.') {
+ p++;
+ if (isdigit(uchar(*p))) p++; /* skip precision */
+ if (isdigit(uchar(*p))) p++; /* (2 digits at most) */
+ }
+ if (isdigit(uchar(*p)))
+ luaL_error(L, "invalid format (width or precision too long)");
+ *(form++) = '%';
+ memcpy(form, strfrmt, (p - strfrmt + 1) * sizeof(char));
+ form += p - strfrmt + 1;
+ *form = '\0';
+ return p;
+}
+
+
+/*
+** add length modifier into formats
+*/
+static void addlenmod (char *form, const char *lenmod, size_t size) {
+ size_t l = strlen(form);
+ size_t lm = strlen(lenmod);
+ char spec = form[l - 1];
+ strlcpy(form + l - 1, lenmod, size - (l - 1));
+ form[l + lm - 1] = spec;
+ form[l + lm] = '\0';
+}
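addlenmod splices the length modifier in front of the conversion character, so "%5d" combined with LUA_INTFRMLEN "ll" becomes "%5lld". A userland sketch of the same splice, using memcpy instead of the kernel strlcpy and dropping the size parameter for brevity (names here are illustrative):

#include <stdio.h>
#include <string.h>

/* splice a length modifier just before the conversion character:
   "%5d" + "ll" -> "%5lld" */
static void add_len_mod(char *form, const char *lenmod) {
    size_t l = strlen(form);
    size_t lm = strlen(lenmod);
    char spec = form[l - 1];            /* remember the conversion char */
    memcpy(form + l - 1, lenmod, lm);   /* overwrite it with the modifier */
    form[l + lm - 1] = spec;            /* put the conversion char back */
    form[l + lm] = '\0';
}

int main(void) {
    char form[32] = "%5d";
    char out[64];
    add_len_mod(form, "ll");
    snprintf(out, sizeof(out), form, 1234567890123LL);
    printf("spliced format: %s, output: %s\n", form, out);
    return 0;  /* spliced format: %5lld, output: 1234567890123 */
}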
+
+
+static int str_format (lua_State *L) {
+ int top = lua_gettop(L);
+ int arg = 1;
+ size_t sfl;
+ const char *strfrmt = luaL_checklstring(L, arg, &sfl);
+ const char *strfrmt_end = strfrmt+sfl;
+ luaL_Buffer b;
+ luaL_buffinit(L, &b);
+ while (strfrmt < strfrmt_end) {
+ if (*strfrmt != L_ESC)
+ luaL_addchar(&b, *strfrmt++);
+ else if (*++strfrmt == L_ESC)
+ luaL_addchar(&b, *strfrmt++); /* %% */
+ else { /* format item */
+ char form[MAX_FORMAT]; /* to store the format (`%...') */
+ char *buff = luaL_prepbuffsize(&b, MAX_ITEM); /* to put formatted item */
+ int nb = 0; /* number of bytes in added item */
+ if (++arg > top)
+ luaL_argerror(L, arg, "no value");
+ strfrmt = scanformat(L, strfrmt, form);
+ switch (*strfrmt++) {
+ case 'c': {
+ nb = str_sprintf(buff, form, luaL_checkint(L, arg));
+ break;
+ }
+ case 'd': case 'i': {
+ lua_Number n = luaL_checknumber(L, arg);
+ LUA_INTFRM_T ni = (LUA_INTFRM_T)n;
+ lua_Number diff = n - (lua_Number)ni;
+ luaL_argcheck(L, -1 < diff && diff < 1, arg,
+ "not a number in proper range");
+ addlenmod(form, LUA_INTFRMLEN, MAX_FORMAT);
+ nb = str_sprintf(buff, form, ni);
+ break;
+ }
+ case 'o': case 'u': case 'x': case 'X': {
+ lua_Number n = luaL_checknumber(L, arg);
+ unsigned LUA_INTFRM_T ni = (unsigned LUA_INTFRM_T)n;
+ lua_Number diff = n - (lua_Number)ni;
+ luaL_argcheck(L, -1 < diff && diff < 1, arg,
+ "not a non-negative number in proper range");
+ addlenmod(form, LUA_INTFRMLEN, MAX_FORMAT);
+ nb = str_sprintf(buff, form, ni);
+ break;
+ }
+#if defined(LUA_USE_FLOAT_FORMATS)
+ case 'e': case 'E': case 'f':
+#if defined(LUA_USE_AFORMAT)
+ case 'a': case 'A':
+#endif
+ case 'g': case 'G': {
+ addlenmod(form, LUA_FLTFRMLEN, MAX_FORMAT);
+ nb = str_sprintf(buff, form, (LUA_FLTFRM_T)luaL_checknumber(L, arg));
+ break;
+ }
+#endif
+ case 'q': {
+ addquoted(L, &b, arg);
+ break;
+ }
+ case 's': {
+ size_t l;
+ const char *s = luaL_tolstring(L, arg, &l);
+ if (!strchr(form, '.') && l >= 100) {
+ /* no precision and string is too long to be formatted;
+ keep original string */
+ luaL_addvalue(&b);
+ break;
+ }
+ else {
+ nb = str_sprintf(buff, form, s);
+ lua_pop(L, 1); /* remove result from 'luaL_tolstring' */
+ break;
+ }
+ }
+ default: { /* also treat cases `pnLlh' */
+ return luaL_error(L, "invalid option " LUA_QL("%%%c") " to "
+ LUA_QL("format"), *(strfrmt - 1));
+ }
+ }
+ luaL_addsize(&b, nb);
+ }
+ }
+ luaL_pushresult(&b);
+ return 1;
+}
+
+/* }====================================================== */
+
+
+static const luaL_Reg strlib[] = {
+ {"byte", str_byte},
+ {"char", str_char},
+#if defined(LUA_USE_DUMP)
+ {"dump", str_dump},
+#endif
+ {"find", str_find},
+ {"format", str_format},
+ {"gmatch", str_gmatch},
+ {"gsub", str_gsub},
+ {"len", str_len},
+ {"lower", str_lower},
+ {"match", str_match},
+ {"rep", str_rep},
+ {"reverse", str_reverse},
+ {"sub", str_sub},
+ {"upper", str_upper},
+ {NULL, NULL}
+};
+
+
+static void createmetatable (lua_State *L) {
+ lua_createtable(L, 0, 1); /* table to be metatable for strings */
+ lua_pushliteral(L, ""); /* dummy string */
+ lua_pushvalue(L, -2); /* copy table */
+ lua_setmetatable(L, -2); /* set table as metatable for strings */
+ lua_pop(L, 1); /* pop dummy string */
+ lua_pushvalue(L, -2); /* get string library */
+ lua_setfield(L, -2, "__index"); /* metatable.__index = string */
+ lua_pop(L, 1); /* pop metatable */
+}
+
+
+/*
+** Open string library
+*/
+LUAMOD_API int luaopen_string (lua_State *L) {
+ luaL_newlib(L, strlib);
+ createmetatable(L);
+ return 1;
+}
+
+#if defined(_KERNEL)
+
+EXPORT_SYMBOL(luaopen_string);
+
+#endif
+/* END CSTYLED */
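For reference, the library registered by luaopen_string above behaves like the string library of a stock userland Lua 5.2, except that string.dump and the floating-point formats are compiled in only when the corresponding defines are set. A hypothetical host-side check, linking against a regular liblua 5.2 rather than this kernel build (which uses the <sys/lua/...> headers instead):

#include <stdio.h>
#include <lua.h>
#include <lauxlib.h>
#include <lualib.h>

int main(void) {
    lua_State *L = luaL_newstate();
    luaL_openlibs(L);
    /* exercise format, rep-with-separator and gsub, all provided above */
    if (luaL_dostring(L,
        "return string.format('%3d %s', 7, string.rep('ab', 3, '-')),"
        "       (string.gsub('zfs zfs', 'zfs', 'openzfs'))") != 0) {
        fprintf(stderr, "%s\n", lua_tostring(L, -1));
        return 1;
    }
    printf("%s | %s\n", lua_tostring(L, -2), lua_tostring(L, -1));
    lua_close(L);
    return 0;  /* prints: "  7 ab-ab-ab | openzfs openzfs" */
}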
diff --git a/sys/contrib/openzfs/module/lua/ltable.c b/sys/contrib/openzfs/module/lua/ltable.c
new file mode 100644
index 000000000000..f60418721bef
--- /dev/null
+++ b/sys/contrib/openzfs/module/lua/ltable.c
@@ -0,0 +1,592 @@
+/* BEGIN CSTYLED */
+/*
+** $Id: ltable.c,v 2.72.1.1 2013/04/12 18:48:47 roberto Exp $
+** Lua tables (hash)
+** See Copyright Notice in lua.h
+*/
+
+
+/*
+** Implementation of tables (aka arrays, objects, or hash tables).
+** Tables keep its elements in two parts: an array part and a hash part.
+** Non-negative integer keys are all candidates to be kept in the array
+** part. The actual size of the array is the largest `n' such that at
+** least half the slots between 0 and n are in use.
+** The hash part uses a chained scatter table with Brent's variation.
+** A main invariant of these tables is that, if an element is not
+** in its main position (i.e. the `original' position that its hash gives
+** to it), then the colliding element is in its own main position.
+** Hence even when the load factor reaches 100%, performance remains good.
+*/
+
+
+#define ltable_c
+#define LUA_CORE
+
+#include <sys/lua/lua.h>
+
+#include "ldebug.h"
+#include "ldo.h"
+#include "lgc.h"
+#include "lmem.h"
+#include "lobject.h"
+#include "lstate.h"
+#include "lstring.h"
+#include "ltable.h"
+#include "lvm.h"
+
+
+/*
+** max size of array part is 2^MAXBITS
+*/
+#if LUAI_BITSINT >= 32
+#define MAXBITS 30
+#else
+#define MAXBITS (LUAI_BITSINT-2)
+#endif
+
+#define MAXASIZE (1 << MAXBITS)
+
+
+#define hashpow2(t,n) (gnode(t, lmod((n), sizenode(t))))
+
+#define hashstr(t,str) hashpow2(t, (str)->tsv.hash)
+#define hashboolean(t,p) hashpow2(t, p)
+
+
+/*
+** for some types, it is better to avoid modulus by power of 2, as
+** they tend to have many 2 factors.
+*/
+#define hashmod(t,n) (gnode(t, ((n) % ((sizenode(t)-1)|1))))
+
+
+#define hashpointer(t,p) hashmod(t, IntPoint(p))
+
+
+#define dummynode (&dummynode_)
+
+#define isdummy(n) ((n) == dummynode)
+
+static const Node dummynode_ = {
+ {NILCONSTANT}, /* value */
+ {{NILCONSTANT, NULL}} /* key */
+};
+
+
+/*
+** hash for lua_Numbers
+*/
+static Node *hashnum (const Table *t, lua_Number n) {
+ int i;
+ luai_hashnum(i, n);
+ if (i < 0) {
+ if (cast(unsigned int, i) == 0u - i) /* use unsigned to avoid overflows */
+ i = 0; /* handle INT_MIN */
+ i = -i; /* must be a positive value */
+ }
+ return hashmod(t, i);
+}
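The hashmod macro exists because pointers and converted numbers tend to be multiples of large powers of two, which would collapse a power-of-two mask into very few buckets. A quick userland illustration, hashing 64-byte-aligned addresses into 64 node slots both ways (illustrative only, not the module's code):

#include <stdio.h>

#define NNODES 64
#define NKEYS  256

int main(void) {
    int used_mask = 0, used_mod = 0;
    int seen_mask[NNODES] = {0}, seen_mod[NNODES] = {0};
    unsigned long i;
    for (i = 0; i < NKEYS; i++) {
        /* addresses of 64-byte-aligned allocations */
        unsigned long p = 0x10000UL + i * 64;
        unsigned long mask = p & (NNODES - 1);       /* hashpow2 style */
        unsigned long mod = p % ((NNODES - 1) | 1);  /* hashmod style */
        if (!seen_mask[mask]) { seen_mask[mask] = 1; used_mask++; }
        if (!seen_mod[mod]) { seen_mod[mod] = 1; used_mod++; }
    }
    printf("buckets touched: mask=%d, odd modulus=%d (of %d)\n",
        used_mask, used_mod, NNODES);
    return 0;  /* mask collapses to 1 bucket, odd modulus uses all 63 */
}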
+
+
+
+/*
+** returns the `main' position of an element in a table (that is, the index
+** of its hash value)
+*/
+static Node *mainposition (const Table *t, const TValue *key) {
+ switch (ttype(key)) {
+ case LUA_TNUMBER:
+ return hashnum(t, nvalue(key));
+ case LUA_TLNGSTR: {
+ TString *s = rawtsvalue(key);
+ if (s->tsv.extra == 0) { /* no hash? */
+ s->tsv.hash = luaS_hash(getstr(s), s->tsv.len, s->tsv.hash);
+ s->tsv.extra = 1; /* now it has its hash */
+ }
+ return hashstr(t, rawtsvalue(key));
+ }
+ case LUA_TSHRSTR:
+ return hashstr(t, rawtsvalue(key));
+ case LUA_TBOOLEAN:
+ return hashboolean(t, bvalue(key));
+ case LUA_TLIGHTUSERDATA:
+ return hashpointer(t, pvalue(key));
+ case LUA_TLCF:
+ return hashpointer(t, fvalue(key));
+ default:
+ return hashpointer(t, gcvalue(key));
+ }
+}
+
+
+/*
+** returns the index for `key' if `key' is an appropriate key to live in
+** the array part of the table, -1 otherwise.
+*/
+static int arrayindex (const TValue *key) {
+ if (ttisnumber(key)) {
+ lua_Number n = nvalue(key);
+ int k;
+ lua_number2int(k, n);
+ if (luai_numeq(cast_num(k), n))
+ return k;
+ }
+ return -1; /* `key' did not match some condition */
+}
+
+
+/*
+** returns the index of a `key' for table traversals. First goes all
+** elements in the array part, then elements in the hash part. The
+** beginning of a traversal is signaled by -1.
+*/
+static int findindex (lua_State *L, Table *t, StkId key) {
+ int i;
+ if (ttisnil(key)) return -1; /* first iteration */
+ i = arrayindex(key);
+ if (0 < i && i <= t->sizearray) /* is `key' inside array part? */
+ return i-1; /* yes; that's the index (corrected to C) */
+ else {
+ Node *n = mainposition(t, key);
+ for (;;) { /* check whether `key' is somewhere in the chain */
+ /* key may be dead already, but it is ok to use it in `next' */
+ if (luaV_rawequalobj(gkey(n), key) ||
+ (ttisdeadkey(gkey(n)) && iscollectable(key) &&
+ deadvalue(gkey(n)) == gcvalue(key))) {
+ i = cast_int(n - gnode(t, 0)); /* key index in hash table */
+ /* hash elements are numbered after array ones */
+ return i + t->sizearray;
+ }
+ else n = gnext(n);
+ if (n == NULL)
+ luaG_runerror(L, "invalid key to " LUA_QL("next")); /* key not found */
+ }
+ }
+}
+
+
+int luaH_next (lua_State *L, Table *t, StkId key) {
+ int i = findindex(L, t, key); /* find original element */
+ for (i++; i < t->sizearray; i++) { /* try first array part */
+ if (!ttisnil(&t->array[i])) { /* a non-nil value? */
+ setnvalue(key, cast_num(i+1));
+ setobj2s(L, key+1, &t->array[i]);
+ return 1;
+ }
+ }
+ for (i -= t->sizearray; i < sizenode(t); i++) { /* then hash part */
+ if (!ttisnil(gval(gnode(t, i)))) { /* a non-nil value? */
+ setobj2s(L, key, gkey(gnode(t, i)));
+ setobj2s(L, key+1, gval(gnode(t, i)));
+ return 1;
+ }
+ }
+ return 0; /* no more elements */
+}
+
+
+/*
+** {=============================================================
+** Rehash
+** ==============================================================
+*/
+
+
+static int computesizes (int nums[], int *narray) {
+ int i;
+ int twotoi; /* 2^i */
+ int a = 0; /* number of elements smaller than 2^i */
+ int na = 0; /* number of elements to go to array part */
+ int n = 0; /* optimal size for array part */
+ for (i = 0, twotoi = 1; twotoi/2 < *narray; i++, twotoi *= 2) {
+ if (nums[i] > 0) {
+ a += nums[i];
+ if (a > twotoi/2) { /* more than half elements present? */
+ n = twotoi; /* optimal size (till now) */
+ na = a; /* all elements smaller than n will go to array part */
+ }
+ }
+ if (a == *narray) break; /* all elements already counted */
+ }
+ *narray = n;
+ lua_assert(*narray/2 <= na && na <= *narray);
+ return na;
+}
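computesizes implements the sizing rule described at the top of the file: pick the largest power of two n such that more than half of the first n slots would actually be used. For integer keys {1, 2, 3, 100} the histogram is nums[0] = nums[1] = nums[2] = 1 and nums[7] = 1, and the function settles on an array part of size 4 holding three keys, leaving 100 for the hash part. A standalone replica of that computation; the helper names are made up and luaO_ceillog2 is approximated with a loop.

#include <stdio.h>

#define MAXBITS 30

/* smallest i with 2^i >= k (k >= 1), i.e. what luaO_ceillog2 computes */
static int ceil_log2(unsigned int k) {
    int i = 0;
    unsigned int x = 1;
    while (x < k) { x *= 2; i++; }
    return i;
}

/* same logic as computesizes above */
static int compute_sizes(int nums[], int *narray) {
    int i, twotoi, a = 0, na = 0, n = 0;
    for (i = 0, twotoi = 1; twotoi / 2 < *narray; i++, twotoi *= 2) {
        if (nums[i] > 0) {
            a += nums[i];
            if (a > twotoi / 2) { n = twotoi; na = a; }
        }
        if (a == *narray) break;
    }
    *narray = n;
    return na;
}

int main(void) {
    int nums[MAXBITS + 1] = {0};
    int keys[] = { 1, 2, 3, 100 };
    int narray = 4;  /* all four keys are array-part candidates */
    int i, na;
    for (i = 0; i < 4; i++)
        nums[ceil_log2((unsigned int)keys[i])]++;  /* what countint does */
    na = compute_sizes(nums, &narray);
    printf("array size %d, %d keys in array, %d in hash\n",
        narray, na, 4 - na);
    return 0;  /* array size 4, 3 keys in array, 1 in hash */
}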
+
+
+static int countint (const TValue *key, int *nums) {
+ int k = arrayindex(key);
+ if (0 < k && k <= MAXASIZE) { /* is `key' an appropriate array index? */
+ nums[luaO_ceillog2(k)]++; /* count as such */
+ return 1;
+ }
+ else
+ return 0;
+}
+
+
+static int numusearray (const Table *t, int *nums) {
+ int lg;
+ int ttlg; /* 2^lg */
+ int ause = 0; /* summation of `nums' */
+ int i = 1; /* count to traverse all array keys */
+ for (lg=0, ttlg=1; lg<=MAXBITS; lg++, ttlg*=2) { /* for each slice */
+ int lc = 0; /* counter */
+ int lim = ttlg;
+ if (lim > t->sizearray) {
+ lim = t->sizearray; /* adjust upper limit */
+ if (i > lim)
+ break; /* no more elements to count */
+ }
+ /* count elements in range (2^(lg-1), 2^lg] */
+ for (; i <= lim; i++) {
+ if (!ttisnil(&t->array[i-1]))
+ lc++;
+ }
+ nums[lg] += lc;
+ ause += lc;
+ }
+ return ause;
+}
+
+
+static int numusehash (const Table *t, int *nums, int *pnasize) {
+ int totaluse = 0; /* total number of elements */
+ int ause = 0; /* summation of `nums' */
+ int i = sizenode(t);
+ while (i--) {
+ Node *n = &t->node[i];
+ if (!ttisnil(gval(n))) {
+ ause += countint(gkey(n), nums);
+ totaluse++;
+ }
+ }
+ *pnasize += ause;
+ return totaluse;
+}
+
+
+static void setarrayvector (lua_State *L, Table *t, int size) {
+ int i;
+ luaM_reallocvector(L, t->array, t->sizearray, size, TValue);
+ for (i=t->sizearray; i<size; i++)
+ setnilvalue(&t->array[i]);
+ t->sizearray = size;
+}
+
+
+static void setnodevector (lua_State *L, Table *t, int size) {
+ int lsize;
+ if (size == 0) { /* no elements to hash part? */
+ t->node = cast(Node *, dummynode); /* use common `dummynode' */
+ lsize = 0;
+ }
+ else {
+ int i;
+ lsize = luaO_ceillog2(size);
+ if (lsize > MAXBITS)
+ luaG_runerror(L, "table overflow");
+ size = twoto(lsize);
+ t->node = luaM_newvector(L, size, Node);
+ for (i=0; i<size; i++) {
+ Node *n = gnode(t, i);
+ gnext(n) = NULL;
+ setnilvalue(gkey(n));
+ setnilvalue(gval(n));
+ }
+ }
+ t->lsizenode = cast_byte(lsize);
+ t->lastfree = gnode(t, size); /* all positions are free */
+}
+
+
+void luaH_resize (lua_State *L, Table *t, int nasize, int nhsize) {
+ int i;
+ int oldasize = t->sizearray;
+ int oldhsize = t->lsizenode;
+ Node *nold = t->node; /* save old hash ... */
+ if (nasize > oldasize) /* array part must grow? */
+ setarrayvector(L, t, nasize);
+ /* create new hash part with appropriate size */
+ setnodevector(L, t, nhsize);
+ if (nasize < oldasize) { /* array part must shrink? */
+ t->sizearray = nasize;
+ /* re-insert elements from vanishing slice */
+ for (i=nasize; i<oldasize; i++) {
+ if (!ttisnil(&t->array[i]))
+ luaH_setint(L, t, i + 1, &t->array[i]);
+ }
+ /* shrink array */
+ luaM_reallocvector(L, t->array, oldasize, nasize, TValue);
+ }
+ /* re-insert elements from hash part */
+ for (i = twoto(oldhsize) - 1; i >= 0; i--) {
+ Node *old = nold+i;
+ if (!ttisnil(gval(old))) {
+ /* doesn't need barrier/invalidate cache, as entry was
+ already present in the table */
+ setobjt2t(L, luaH_set(L, t, gkey(old)), gval(old));
+ }
+ }
+ if (!isdummy(nold))
+ luaM_freearray(L, nold, cast(size_t, twoto(oldhsize))); /* free old array */
+}
+
+
+void luaH_resizearray (lua_State *L, Table *t, int nasize) {
+ int nsize = isdummy(t->node) ? 0 : sizenode(t);
+ luaH_resize(L, t, nasize, nsize);
+}
+
+
+static void rehash (lua_State *L, Table *t, const TValue *ek) {
+ int nasize, na;
+ int nums[MAXBITS+1]; /* nums[i] = number of keys with 2^(i-1) < k <= 2^i */
+ int i;
+ int totaluse;
+ for (i=0; i<=MAXBITS; i++) nums[i] = 0; /* reset counts */
+ nasize = numusearray(t, nums); /* count keys in array part */
+ totaluse = nasize; /* all those keys are integer keys */
+ totaluse += numusehash(t, nums, &nasize); /* count keys in hash part */
+ /* count extra key */
+ nasize += countint(ek, nums);
+ totaluse++;
+ /* compute new size for array part */
+ na = computesizes(nums, &nasize);
+ /* resize the table to new computed sizes */
+ luaH_resize(L, t, nasize, totaluse - na);
+}
+
+
+
+/*
+** }=============================================================
+*/
+
+
+Table *luaH_new (lua_State *L) {
+ Table *t = &luaC_newobj(L, LUA_TTABLE, sizeof(Table), NULL, 0)->h;
+ t->metatable = NULL;
+ t->flags = cast_byte(~0);
+ t->array = NULL;
+ t->sizearray = 0;
+ setnodevector(L, t, 0);
+ return t;
+}
+
+
+void luaH_free (lua_State *L, Table *t) {
+ if (!isdummy(t->node))
+ luaM_freearray(L, t->node, cast(size_t, sizenode(t)));
+ luaM_freearray(L, t->array, t->sizearray);
+ luaM_free(L, t);
+}
+
+
+static Node *getfreepos (Table *t) {
+ while (t->lastfree > t->node) {
+ t->lastfree--;
+ if (ttisnil(gkey(t->lastfree)))
+ return t->lastfree;
+ }
+ return NULL; /* could not find a free place */
+}
+
+
+
+/*
+** inserts a new key into a hash table; first, check whether key's main
+** position is free. If not, check whether colliding node is in its main
+** position or not: if it is not, move colliding node to an empty place and
+** put new key in its main position; otherwise (colliding node is in its main
+** position), new key goes to an empty position.
+*/
+TValue *luaH_newkey (lua_State *L, Table *t, const TValue *key) {
+ Node *mp;
+ if (ttisnil(key)) luaG_runerror(L, "table index is nil");
+#if defined LUA_HAS_FLOAT_NUMBERS
+ else if (ttisnumber(key) && luai_numisnan(L, nvalue(key)))
+ luaG_runerror(L, "table index is NaN");
+#endif
+ mp = mainposition(t, key);
+ if (!ttisnil(gval(mp)) || isdummy(mp)) { /* main position is taken? */
+ Node *othern;
+ Node *n = getfreepos(t); /* get a free place */
+ if (n == NULL) { /* cannot find a free place? */
+ rehash(L, t, key); /* grow table */
+ /* whatever called 'newkey' takes care of the TM cache and GC barrier */
+ return luaH_set(L, t, key); /* insert key into grown table */
+ }
+ lua_assert(!isdummy(n));
+ othern = mainposition(t, gkey(mp));
+ if (othern != mp) { /* is colliding node out of its main position? */
+ /* yes; move colliding node into free position */
+ while (gnext(othern) != mp) othern = gnext(othern); /* find previous */
+ gnext(othern) = n; /* redo the chain with `n' in place of `mp' */
+ *n = *mp; /* copy colliding node into free pos. (mp->next also goes) */
+ gnext(mp) = NULL; /* now `mp' is free */
+ setnilvalue(gval(mp));
+ }
+ else { /* colliding node is in its own main position */
+ /* new node will go into free position */
+ gnext(n) = gnext(mp); /* chain new position */
+ gnext(mp) = n;
+ mp = n;
+ }
+ }
+ setobj2t(L, gkey(mp), key);
+ luaC_barrierback(L, obj2gco(t), key);
+ lua_assert(ttisnil(gval(mp)));
+ return gval(mp);
+}
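The collision policy above preserves the invariant stated at the top of the file: a node that is not in its main position always sits in the chain of the element that displaced it. A much-simplified userland model with integer keys shows the two branches, relocating a squatter when it does not live in its own main position; every name below is hypothetical and deletion, GC and rehashing are left out.

#include <stdio.h>

#define NSLOT 8  /* hash part size */

typedef struct MiniNode {
    int key;   /* 0 means empty */
    int val;
    int next;  /* chain link, -1 means none */
} MiniNode;

static MiniNode node[NSLOT];
static int lastfree = NSLOT;

static int mainpos(int key) { return key % NSLOT; }

static int getfreepos(void) {
    while (lastfree > 0) {
        lastfree--;
        if (node[lastfree].key == 0)
            return lastfree;
    }
    return -1;  /* full; a real table would rehash and retry */
}

static void insert(int key, int val) {
    int mp = mainpos(key);
    if (node[mp].key != 0) {                /* main position taken? */
        int n = getfreepos();
        int othern = mainpos(node[mp].key);
        if (n < 0)
            return;                         /* table full in this toy model */
        if (othern != mp) {                 /* squatter is out of place */
            while (node[othern].next != mp)
                othern = node[othern].next; /* find its predecessor */
            node[othern].next = n;          /* relink the chain through n */
            node[n] = node[mp];             /* move squatter (keeps .next) */
            node[mp].next = -1;
            node[mp].key = 0;               /* mp is now free for the new key */
        } else {                            /* squatter owns this position */
            node[n].next = node[mp].next;   /* new key goes to the free slot */
            node[mp].next = n;
            mp = n;
        }
    }
    node[mp].key = key;
    node[mp].val = val;
}

static int lookup(int key) {
    int i = mainpos(key);
    while (i != -1 && node[i].key != 0) {
        if (node[i].key == key)
            return node[i].val;
        i = node[i].next;
    }
    return -1;
}

int main(void) {
    int i;
    for (i = 0; i < NSLOT; i++) { node[i].key = 0; node[i].next = -1; }
    insert(3, 30);    /* lands in its main position 3 */
    insert(11, 110);  /* collides with 3, chained into a free slot */
    insert(7, 70);    /* slot 7 holds the squatter 11, which gets relocated */
    printf("3 -> %d, 11 -> %d, 7 -> %d\n", lookup(3), lookup(11), lookup(7));
    return 0;  /* 3 -> 30, 11 -> 110, 7 -> 70 */
}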
+
+
+/*
+** search function for integers
+*/
+const TValue *luaH_getint (Table *t, int key) {
+ /* (1 <= key && key <= t->sizearray) */
+ if (cast(unsigned int, key-1) < cast(unsigned int, t->sizearray))
+ return &t->array[key-1];
+ else {
+ lua_Number nk = cast_num(key);
+ Node *n = hashnum(t, nk);
+ do { /* check whether `key' is somewhere in the chain */
+ if (ttisnumber(gkey(n)) && luai_numeq(nvalue(gkey(n)), nk))
+ return gval(n); /* that's it */
+ else n = gnext(n);
+ } while (n);
+ return luaO_nilobject;
+ }
+}
+
+
+/*
+** search function for short strings
+*/
+const TValue *luaH_getstr (Table *t, TString *key) {
+ Node *n = hashstr(t, key);
+ lua_assert(key->tsv.tt == LUA_TSHRSTR);
+ do { /* check whether `key' is somewhere in the chain */
+ if (ttisshrstring(gkey(n)) && eqshrstr(rawtsvalue(gkey(n)), key))
+ return gval(n); /* that's it */
+ else n = gnext(n);
+ } while (n);
+ return luaO_nilobject;
+}
+
+
+/*
+** main search function
+*/
+const TValue *luaH_get (Table *t, const TValue *key) {
+ switch (ttype(key)) {
+ case LUA_TSHRSTR: return luaH_getstr(t, rawtsvalue(key));
+ case LUA_TNIL: return luaO_nilobject;
+ case LUA_TNUMBER: {
+ int k;
+ lua_Number n = nvalue(key);
+ lua_number2int(k, n);
+ if (luai_numeq(cast_num(k), n)) /* index is int? */
+ return luaH_getint(t, k); /* use specialized version */
+ /* else go through */
+ }
+ /* FALLTHROUGH */
+ default: {
+ Node *n = mainposition(t, key);
+ do { /* check whether `key' is somewhere in the chain */
+ if (luaV_rawequalobj(gkey(n), key))
+ return gval(n); /* that's it */
+ else n = gnext(n);
+ } while (n);
+ return luaO_nilobject;
+ }
+ }
+}
+
+
+/*
+** beware: when using this function you probably need to check a GC
+** barrier and invalidate the TM cache.
+*/
+TValue *luaH_set (lua_State *L, Table *t, const TValue *key) {
+ const TValue *p = luaH_get(t, key);
+ if (p != luaO_nilobject)
+ return cast(TValue *, p);
+ else return luaH_newkey(L, t, key);
+}
+
+
+void luaH_setint (lua_State *L, Table *t, int key, TValue *value) {
+ const TValue *p = luaH_getint(t, key);
+ TValue *cell;
+ if (p != luaO_nilobject)
+ cell = cast(TValue *, p);
+ else {
+ TValue k;
+ setnvalue(&k, cast_num(key));
+ cell = luaH_newkey(L, t, &k);
+ }
+ setobj2t(L, cell, value);
+}
+
+
+static int unbound_search (Table *t, unsigned int j) {
+ unsigned int i = j; /* i is zero or a present index */
+ j++;
+ /* find `i' and `j' such that i is present and j is not */
+ while (!ttisnil(luaH_getint(t, j))) {
+ i = j;
+ j *= 2;
+ if (j > cast(unsigned int, MAX_INT)) { /* overflow? */
+ /* table was built with bad purposes: resort to linear search */
+ i = 1;
+ while (!ttisnil(luaH_getint(t, i))) i++;
+ return i - 1;
+ }
+ }
+ /* now do a binary search between them */
+ while (j - i > 1) {
+ unsigned int m = (i+j)/2;
+ if (ttisnil(luaH_getint(t, m))) j = m;
+ else i = m;
+ }
+ return i;
+}
+
+
+/*
+** Try to find a boundary in table `t'. A `boundary' is an integer index
+** such that t[i] is non-nil and t[i+1] is nil (and 0 if t[1] is nil).
+*/
+int luaH_getn (Table *t) {
+ unsigned int j = t->sizearray;
+ if (j > 0 && ttisnil(&t->array[j - 1])) {
+ /* there is a boundary in the array part: (binary) search for it */
+ unsigned int i = 0;
+ while (j - i > 1) {
+ unsigned int m = (i+j)/2;
+ if (ttisnil(&t->array[m - 1])) j = m;
+ else i = m;
+ }
+ return i;
+ }
+ /* else must find a boundary in hash part */
+ else if (isdummy(t->node)) /* hash part is empty? */
+ return j; /* that is easy... */
+ else return unbound_search(t, j);
+}
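The length operator therefore only needs some index i with t[i] non-nil and t[i+1] nil, not the count of elements. A standalone version of the array-part binary search, modelling nil as 0; when the last slot is non-nil it simply returns the array size, whereas the real function then consults the hash part (illustrative only):

#include <stdio.h>

/* binary search for a boundary: values[i-1] non-nil (non-zero here)
   and values[i] nil, mirroring the array-part branch of luaH_getn */
static unsigned int boundary(const int *values, unsigned int size) {
    unsigned int i = 0, j = size;
    if (size == 0 || values[size - 1] != 0)
        return size;  /* no trailing nil inside the array part */
    while (j - i > 1) {
        unsigned int m = (i + j) / 2;
        if (values[m - 1] == 0) j = m;
        else i = m;
    }
    return i;
}

int main(void) {
    int values[] = { 10, 20, 30, 0, 0, 0, 0, 0 };  /* t[4..8] are nil */
    printf("#t = %u\n", boundary(values, 8));
    return 0;  /* #t = 3 */
}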
+
+
+
+#if defined(LUA_DEBUG)
+
+Node *luaH_mainposition (const Table *t, const TValue *key) {
+ return mainposition(t, key);
+}
+
+int luaH_isdummy (Node *n) { return isdummy(n); }
+
+#endif
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/lua/ltable.h b/sys/contrib/openzfs/module/lua/ltable.h
new file mode 100644
index 000000000000..ea877ebf4eb0
--- /dev/null
+++ b/sys/contrib/openzfs/module/lua/ltable.h
@@ -0,0 +1,47 @@
+/* BEGIN CSTYLED */
+/*
+** $Id: ltable.h,v 2.16.1.2 2013/08/30 15:49:41 roberto Exp $
+** Lua tables (hash)
+** See Copyright Notice in lua.h
+*/
+
+#ifndef ltable_h
+#define ltable_h
+
+#include "lobject.h"
+
+
+#define gnode(t,i) ((Node *)&(t)->node[i])
+#define gkey(n) (&(n)->i_key.tvk)
+#define gval(n) (&(n)->i_val)
+#define gnext(n) ((n)->i_key.nk.next)
+
+#define invalidateTMcache(t) ((t)->flags = 0)
+
+/* returns the key, given the value of a table entry */
+#define keyfromval(v) \
+ (gkey(cast(Node *, cast(char *, (v)) - offsetof(Node, i_val))))
+
+
+LUAI_FUNC const TValue *luaH_getint (Table *t, int key);
+LUAI_FUNC void luaH_setint (lua_State *L, Table *t, int key, TValue *value);
+LUAI_FUNC const TValue *luaH_getstr (Table *t, TString *key);
+LUAI_FUNC const TValue *luaH_get (Table *t, const TValue *key);
+LUAI_FUNC TValue *luaH_newkey (lua_State *L, Table *t, const TValue *key);
+LUAI_FUNC TValue *luaH_set (lua_State *L, Table *t, const TValue *key);
+LUAI_FUNC Table *luaH_new (lua_State *L);
+LUAI_FUNC void luaH_resize (lua_State *L, Table *t, int nasize, int nhsize);
+LUAI_FUNC void luaH_resizearray (lua_State *L, Table *t, int nasize);
+LUAI_FUNC void luaH_free (lua_State *L, Table *t);
+LUAI_FUNC int luaH_next (lua_State *L, Table *t, StkId key);
+LUAI_FUNC int luaH_getn (Table *t);
+
+
+#if defined(LUA_DEBUG)
+LUAI_FUNC Node *luaH_mainposition (const Table *t, const TValue *key);
+LUAI_FUNC int luaH_isdummy (Node *n);
+#endif
+
+
+#endif
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/lua/ltablib.c b/sys/contrib/openzfs/module/lua/ltablib.c
new file mode 100644
index 000000000000..51cafffaafcd
--- /dev/null
+++ b/sys/contrib/openzfs/module/lua/ltablib.c
@@ -0,0 +1,289 @@
+/* BEGIN CSTYLED */
+/*
+** $Id: ltablib.c,v 1.65.1.2 2014/05/07 16:32:55 roberto Exp $
+** Library for Table Manipulation
+** See Copyright Notice in lua.h
+*/
+
+
+#define ltablib_c
+#define LUA_LIB
+
+#include <sys/lua/lua.h>
+
+#include <sys/lua/lauxlib.h>
+#include <sys/lua/lualib.h>
+
+
+#define aux_getn(L,n) (luaL_checktype(L, n, LUA_TTABLE), luaL_len(L, n))
+
+
+
+#if defined(LUA_COMPAT_MAXN)
+static int maxn (lua_State *L) {
+ lua_Number max = 0;
+ luaL_checktype(L, 1, LUA_TTABLE);
+ lua_pushnil(L); /* first key */
+ while (lua_next(L, 1)) {
+ lua_pop(L, 1); /* remove value */
+ if (lua_type(L, -1) == LUA_TNUMBER) {
+ lua_Number v = lua_tonumber(L, -1);
+ if (v > max) max = v;
+ }
+ }
+ lua_pushnumber(L, max);
+ return 1;
+}
+#endif
+
+
+static int tinsert (lua_State *L) {
+ int e = aux_getn(L, 1) + 1; /* first empty element */
+ int pos; /* where to insert new element */
+ switch (lua_gettop(L)) {
+ case 2: { /* called with only 2 arguments */
+ pos = e; /* insert new element at the end */
+ break;
+ }
+ case 3: {
+ int i;
+ pos = luaL_checkint(L, 2); /* 2nd argument is the position */
+ luaL_argcheck(L, 1 <= pos && pos <= e, 2, "position out of bounds");
+ for (i = e; i > pos; i--) { /* move up elements */
+ lua_rawgeti(L, 1, i-1);
+ lua_rawseti(L, 1, i); /* t[i] = t[i-1] */
+ }
+ break;
+ }
+ default: {
+ return luaL_error(L, "wrong number of arguments to " LUA_QL("insert"));
+ }
+ }
+ lua_rawseti(L, 1, pos); /* t[pos] = v */
+ return 0;
+}
+
+
+static int tremove (lua_State *L) {
+ int size = aux_getn(L, 1);
+ int pos = luaL_optint(L, 2, size);
+ if (pos != size) /* validate 'pos' if given */
+ luaL_argcheck(L, 1 <= pos && pos <= size + 1, 1, "position out of bounds");
+ lua_rawgeti(L, 1, pos); /* result = t[pos] */
+ for ( ; pos < size; pos++) {
+ lua_rawgeti(L, 1, pos+1);
+ lua_rawseti(L, 1, pos); /* t[pos] = t[pos+1] */
+ }
+ lua_pushnil(L);
+ lua_rawseti(L, 1, pos); /* t[pos] = nil */
+ return 1;
+}
+
+
+static void addfield (lua_State *L, luaL_Buffer *b, int i) {
+ lua_rawgeti(L, 1, i);
+ if (!lua_isstring(L, -1))
+ luaL_error(L, "invalid value (%s) at index %d in table for "
+ LUA_QL("concat"), luaL_typename(L, -1), i);
+ luaL_addvalue(b);
+}
+
+
+static int tconcat (lua_State *L) {
+ luaL_Buffer b;
+ size_t lsep;
+ int i, last;
+ const char *sep = luaL_optlstring(L, 2, "", &lsep);
+ luaL_checktype(L, 1, LUA_TTABLE);
+ i = luaL_optint(L, 3, 1);
+ last = luaL_opt(L, luaL_checkint, 4, luaL_len(L, 1));
+ luaL_buffinit(L, &b);
+ for (; i < last; i++) {
+ addfield(L, &b, i);
+ luaL_addlstring(&b, sep, lsep);
+ }
+ if (i == last) /* add last value (if interval was not empty) */
+ addfield(L, &b, i);
+ luaL_pushresult(&b);
+ return 1;
+}
+
+
+/*
+** {======================================================
+** Pack/unpack
+** =======================================================
+*/
+
+static int pack (lua_State *L) {
+ int n = lua_gettop(L); /* number of elements to pack */
+ lua_createtable(L, n, 1); /* create result table */
+ lua_pushinteger(L, n);
+ lua_setfield(L, -2, "n"); /* t.n = number of elements */
+ if (n > 0) { /* at least one element? */
+ int i;
+ lua_pushvalue(L, 1);
+ lua_rawseti(L, -2, 1); /* insert first element */
+ lua_replace(L, 1); /* move table into index 1 */
+ for (i = n; i >= 2; i--) /* assign other elements */
+ lua_rawseti(L, 1, i);
+ }
+ return 1; /* return table */
+}
+
+
+static int unpack (lua_State *L) {
+ int i, e;
+ unsigned int n;
+ luaL_checktype(L, 1, LUA_TTABLE);
+ i = luaL_optint(L, 2, 1);
+ e = luaL_opt(L, luaL_checkint, 3, luaL_len(L, 1));
+ if (i > e) return 0; /* empty range */
+ n = (unsigned int)e - (unsigned int)i; /* number of elements minus 1 */
+ if (n > (INT_MAX - 10) || !lua_checkstack(L, ++n))
+ return luaL_error(L, "too many results to unpack");
+ lua_rawgeti(L, 1, i); /* push arg[i] (avoiding overflow problems) */
+ while (i++ < e) /* push arg[i + 1...e] */
+ lua_rawgeti(L, 1, i);
+ return n;
+}
+
+/* }====================================================== */
+
+
+
+/*
+** {======================================================
+** Quicksort
+** (based on `Algorithms in MODULA-3', Robert Sedgewick;
+** Addison-Wesley, 1993.)
+** =======================================================
+*/
+
+
+static void set2 (lua_State *L, int i, int j) {
+ lua_rawseti(L, 1, i);
+ lua_rawseti(L, 1, j);
+}
+
+static int sort_comp (lua_State *L, int a, int b) {
+ if (!lua_isnil(L, 2)) { /* function? */
+ int res;
+ lua_pushvalue(L, 2);
+ lua_pushvalue(L, a-1); /* -1 to compensate function */
+ lua_pushvalue(L, b-2); /* -2 to compensate function and `a' */
+ lua_call(L, 2, 1);
+ res = lua_toboolean(L, -1);
+ lua_pop(L, 1);
+ return res;
+ }
+ else /* a < b? */
+ return lua_compare(L, a, b, LUA_OPLT);
+}
+
+static void auxsort (lua_State *L, int l, int u) {
+ while (l < u) { /* for tail recursion */
+ int i, j;
+ /* sort elements a[l], a[(l+u)/2] and a[u] */
+ lua_rawgeti(L, 1, l);
+ lua_rawgeti(L, 1, u);
+ if (sort_comp(L, -1, -2)) /* a[u] < a[l]? */
+ set2(L, l, u); /* swap a[l] - a[u] */
+ else
+ lua_pop(L, 2);
+ if (u-l == 1) break; /* only 2 elements */
+ i = (l+u)/2;
+ lua_rawgeti(L, 1, i);
+ lua_rawgeti(L, 1, l);
+ if (sort_comp(L, -2, -1)) /* a[i]<a[l]? */
+ set2(L, i, l);
+ else {
+ lua_pop(L, 1); /* remove a[l] */
+ lua_rawgeti(L, 1, u);
+ if (sort_comp(L, -1, -2)) /* a[u]<a[i]? */
+ set2(L, i, u);
+ else
+ lua_pop(L, 2);
+ }
+ if (u-l == 2) break; /* only 3 elements */
+ lua_rawgeti(L, 1, i); /* Pivot */
+ lua_pushvalue(L, -1);
+ lua_rawgeti(L, 1, u-1);
+ set2(L, i, u-1);
+ /* a[l] <= P == a[u-1] <= a[u], only need to sort from l+1 to u-2 */
+ i = l; j = u-1;
+ for (;;) { /* invariant: a[l..i] <= P <= a[j..u] */
+ /* repeat ++i until a[i] >= P */
+ while (lua_rawgeti(L, 1, ++i), sort_comp(L, -1, -2)) {
+ if (i>=u) luaL_error(L, "invalid order function for sorting");
+ lua_pop(L, 1); /* remove a[i] */
+ }
+ /* repeat --j until a[j] <= P */
+ while (lua_rawgeti(L, 1, --j), sort_comp(L, -3, -1)) {
+ if (j<=l) luaL_error(L, "invalid order function for sorting");
+ lua_pop(L, 1); /* remove a[j] */
+ }
+ if (j<i) {
+ lua_pop(L, 3); /* pop pivot, a[i], a[j] */
+ break;
+ }
+ set2(L, i, j);
+ }
+ lua_rawgeti(L, 1, u-1);
+ lua_rawgeti(L, 1, i);
+ set2(L, u-1, i); /* swap pivot (a[u-1]) with a[i] */
+ /* a[l..i-1] <= a[i] == P <= a[i+1..u] */
+ /* adjust so that smaller half is in [j..i] and larger one in [l..u] */
+ if (i-l < u-i) {
+ j=l; i=i-1; l=i+2;
+ }
+ else {
+ j=i+1; i=u; u=j-2;
+ }
+ auxsort(L, j, i); /* call recursively the smaller one */
+ } /* repeat the routine for the larger one */
+}
+
+static int tsort (lua_State *L) {
+ int n = aux_getn(L, 1);
+ luaL_checkstack(L, 40, ""); /* assume array is smaller than 2^40 */
+ if (!lua_isnoneornil(L, 2)) /* is there a 2nd argument? */
+ luaL_checktype(L, 2, LUA_TFUNCTION);
+ lua_settop(L, 2); /* make sure there are two arguments */
+ auxsort(L, 1, n);
+ return 0;
+}
+
+/* }====================================================== */
+
+
+static const luaL_Reg tab_funcs[] = {
+ {"concat", tconcat},
+#if defined(LUA_COMPAT_MAXN)
+ {"maxn", maxn},
+#endif
+ {"insert", tinsert},
+ {"pack", pack},
+ {"unpack", unpack},
+ {"remove", tremove},
+ {"sort", tsort},
+ {NULL, NULL}
+};
+
+
+LUAMOD_API int luaopen_table (lua_State *L) {
+ luaL_newlib(L, tab_funcs);
+#if defined(LUA_COMPAT_UNPACK)
+ /* _G.unpack = table.unpack */
+ lua_getfield(L, -1, "unpack");
+ lua_setglobal(L, "unpack");
+#endif
+ return 1;
+}
+
+#if defined(_KERNEL)
+
+EXPORT_SYMBOL(luaopen_table);
+
+#endif
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/lua/ltm.c b/sys/contrib/openzfs/module/lua/ltm.c
new file mode 100644
index 000000000000..94f29f7d96d5
--- /dev/null
+++ b/sys/contrib/openzfs/module/lua/ltm.c
@@ -0,0 +1,76 @@
+/* BEGIN CSTYLED */
+/*
+** $Id: ltm.c,v 2.14.1.1 2013/04/12 18:48:47 roberto Exp $
+** Tag methods
+** See Copyright Notice in lua.h
+*/
+
+
+#define ltm_c
+#define LUA_CORE
+
+#include <sys/lua/lua.h>
+
+#include "lobject.h"
+#include "lstate.h"
+#include "lstring.h"
+#include "ltable.h"
+#include "ltm.h"
+
+
+static const char udatatypename[] = "userdata";
+
+LUAI_DDEF const char *const luaT_typenames_[LUA_TOTALTAGS] = {
+ "no value",
+ "nil", "boolean", udatatypename, "number",
+ "string", "table", "function", udatatypename, "thread",
+ "proto", "upval" /* these last two cases are used for tests only */
+};
+
+
+void luaT_init (lua_State *L) {
+ static const char *const luaT_eventname[] = { /* ORDER TM */
+ "__index", "__newindex",
+ "__gc", "__mode", "__len", "__eq",
+ "__add", "__sub", "__mul", "__div", "__mod",
+ "__pow", "__unm", "__lt", "__le",
+ "__concat", "__call"
+ };
+ int i;
+ for (i=0; i<TM_N; i++) {
+ G(L)->tmname[i] = luaS_new(L, luaT_eventname[i]);
+ luaS_fix(G(L)->tmname[i]); /* never collect these names */
+ }
+}
+
+
+/*
+** function to be used with macro "fasttm": optimized for absence of
+** tag methods
+*/
+const TValue *luaT_gettm (Table *events, TMS event, TString *ename) {
+ const TValue *tm = luaH_getstr(events, ename);
+ lua_assert(event <= TM_EQ);
+ if (ttisnil(tm)) { /* no tag method? */
+ events->flags |= cast_byte(1u<<event); /* cache this fact */
+ return NULL;
+ }
+ else return tm;
+}
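The flags byte consulted by gfasttm is a per-table negative cache, "this metamethod is known to be absent", set here and cleared by invalidateTMcache (ltable.h) whenever the table may have changed. A compact sketch of that caching pattern with hypothetical types, not the module's:

#include <stdio.h>

enum { EV_INDEX, EV_NEWINDEX, EV_GC, EV_LEN, EV_N };

typedef struct MiniTable {
    unsigned char flags;         /* bit set => event known to be absent */
    const char *handlers[EV_N];  /* stand-in for metamethod lookups */
    int lookups;                 /* counts slow-path lookups */
} MiniTable;

static const char *get_event(MiniTable *t, int ev) {
    if (t->flags & (1u << ev))   /* fasttm: cached "no handler" */
        return NULL;
    t->lookups++;                /* slow path: real table lookup */
    if (t->handlers[ev] == NULL) {
        t->flags |= (unsigned char)(1u << ev);  /* cache the absence */
        return NULL;
    }
    return t->handlers[ev];
}

int main(void) {
    MiniTable t = { 0, { NULL, NULL, "collect", NULL }, 0 };
    const char *gc;
    int i;
    for (i = 0; i < 3; i++)      /* repeated misses hit the cache after once */
        get_event(&t, EV_LEN);
    gc = get_event(&t, EV_GC);
    printf("slow lookups: %d, __gc handler: %s\n", t.lookups, gc);
    return 0;  /* slow lookups: 2, __gc handler: collect */
}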
+
+
+const TValue *luaT_gettmbyobj (lua_State *L, const TValue *o, TMS event) {
+ Table *mt;
+ switch (ttypenv(o)) {
+ case LUA_TTABLE:
+ mt = hvalue(o)->metatable;
+ break;
+ case LUA_TUSERDATA:
+ mt = uvalue(o)->metatable;
+ break;
+ default:
+ mt = G(L)->mt[ttypenv(o)];
+ }
+ return (mt ? luaH_getstr(mt, G(L)->tmname[event]) : luaO_nilobject);
+}
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/lua/ltm.h b/sys/contrib/openzfs/module/lua/ltm.h
new file mode 100644
index 000000000000..c056f4637353
--- /dev/null
+++ b/sys/contrib/openzfs/module/lua/ltm.h
@@ -0,0 +1,59 @@
+/* BEGIN CSTYLED */
+/*
+** $Id: ltm.h,v 2.11.1.1 2013/04/12 18:48:47 roberto Exp $
+** Tag methods
+** See Copyright Notice in lua.h
+*/
+
+#ifndef ltm_h
+#define ltm_h
+
+
+#include "lobject.h"
+
+
+/*
+* WARNING: if you change the order of this enumeration,
+* grep "ORDER TM"
+*/
+typedef enum {
+ TM_INDEX,
+ TM_NEWINDEX,
+ TM_GC,
+ TM_MODE,
+ TM_LEN,
+ TM_EQ, /* last tag method with `fast' access */
+ TM_ADD,
+ TM_SUB,
+ TM_MUL,
+ TM_DIV,
+ TM_MOD,
+ TM_POW,
+ TM_UNM,
+ TM_LT,
+ TM_LE,
+ TM_CONCAT,
+ TM_CALL,
+ TM_N /* number of elements in the enum */
+} TMS;
+
+
+
+#define gfasttm(g,et,e) ((et) == NULL ? NULL : \
+ ((et)->flags & (1u<<(e))) ? NULL : luaT_gettm(et, e, (g)->tmname[e]))
+
+#define fasttm(l,et,e) gfasttm(G(l), et, e)
+
+#define ttypename(x) luaT_typenames_[(x) + 1]
+#define objtypename(x) ttypename(ttypenv(x))
+
+LUAI_DDEC const char *const luaT_typenames_[LUA_TOTALTAGS];
+
+
+LUAI_FUNC const TValue *luaT_gettm (Table *events, TMS event, TString *ename);
+LUAI_FUNC const TValue *luaT_gettmbyobj (lua_State *L, const TValue *o,
+ TMS event);
+LUAI_FUNC void luaT_init (lua_State *L);
+
+#endif
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/lua/lvm.c b/sys/contrib/openzfs/module/lua/lvm.c
new file mode 100644
index 000000000000..4685be52b449
--- /dev/null
+++ b/sys/contrib/openzfs/module/lua/lvm.c
@@ -0,0 +1,932 @@
+/* BEGIN CSTYLED */
+/*
+** $Id: lvm.c,v 2.155.1.1 2013/04/12 18:48:47 roberto Exp $
+** Lua virtual machine
+** See Copyright Notice in lua.h
+*/
+
+
+#define lvm_c
+#define LUA_CORE
+
+#include <sys/lua/lua.h>
+
+#include "ldebug.h"
+#include "ldo.h"
+#include "lfunc.h"
+#include "lgc.h"
+#include "lobject.h"
+#include "lopcodes.h"
+#include "lstate.h"
+#include "lstring.h"
+#include "ltable.h"
+#include "ltm.h"
+#include "lvm.h"
+
+#ifdef _KERNEL
+#define strcoll(l,r) (strcmp((l),(r)))
+#endif
+
+/* limit for table tag-method chains (to avoid loops) */
+#define MAXTAGLOOP 100
+
+
+const TValue *luaV_tonumber (const TValue *obj, TValue *n) {
+ lua_Number num;
+ if (ttisnumber(obj)) return obj;
+ if (ttisstring(obj) && luaO_str2d(svalue(obj), tsvalue(obj)->len, &num)) {
+ setnvalue(n, num);
+ return n;
+ }
+ else
+ return NULL;
+}
+
+
+int luaV_tostring (lua_State *L, StkId obj) {
+ if (!ttisnumber(obj))
+ return 0;
+ else {
+ char s[LUAI_MAXNUMBER2STR];
+ lua_Number n = nvalue(obj);
+ int l = lua_number2str(s, n);
+ setsvalue2s(L, obj, luaS_newlstr(L, s, l));
+ return 1;
+ }
+}
+
+
+static void traceexec (lua_State *L) {
+ CallInfo *ci = L->ci;
+ lu_byte mask = L->hookmask;
+ int counthook = ((mask & LUA_MASKCOUNT) && L->hookcount == 0);
+ if (counthook)
+ resethookcount(L); /* reset count */
+ if (ci->callstatus & CIST_HOOKYIELD) { /* called hook last time? */
+ ci->callstatus &= ~CIST_HOOKYIELD; /* erase mark */
+ return; /* do not call hook again (VM yielded, so it did not move) */
+ }
+ if (counthook)
+ luaD_hook(L, LUA_HOOKCOUNT, -1); /* call count hook */
+ if (mask & LUA_MASKLINE) {
+ Proto *p = ci_func(ci)->p;
+ int npc = pcRel(ci->u.l.savedpc, p);
+ int newline = getfuncline(p, npc);
+    if (npc == 0 ||  /* call line hook when entering a new function, */
+        ci->u.l.savedpc <= L->oldpc ||  /* when jumping back (loop), or when */
+        newline != getfuncline(p, pcRel(L->oldpc, p)))  /* entering a new line */
+ luaD_hook(L, LUA_HOOKLINE, newline); /* call line hook */
+ }
+ L->oldpc = ci->u.l.savedpc;
+ if (L->status == LUA_YIELD) { /* did hook yield? */
+ if (counthook)
+ L->hookcount = 1; /* undo decrement to zero */
+ ci->u.l.savedpc--; /* undo increment (resume will increment it again) */
+ ci->callstatus |= CIST_HOOKYIELD; /* mark that it yielded */
+ ci->func = L->top - 1; /* protect stack below results */
+ luaD_throw(L, LUA_YIELD);
+ }
+}
+
+
+static void callTM (lua_State *L, const TValue *f, const TValue *p1,
+ const TValue *p2, TValue *p3, int hasres) {
+ if (L == NULL) return;
+
+ ptrdiff_t result = savestack(L, p3);
+ setobj2s(L, L->top++, f); /* push function */
+ setobj2s(L, L->top++, p1); /* 1st argument */
+ setobj2s(L, L->top++, p2); /* 2nd argument */
+ if (!hasres) /* no result? 'p3' is third argument */
+ setobj2s(L, L->top++, p3); /* 3rd argument */
+ /* metamethod may yield only when called from Lua code */
+ luaD_call(L, L->top - (4 - hasres), hasres, isLua(L->ci));
+ if (hasres) { /* if has result, move it to its place */
+ p3 = restorestack(L, result);
+ setobjs2s(L, p3, --L->top);
+ }
+}
+
+
+void luaV_gettable (lua_State *L, const TValue *t, TValue *key, StkId val) {
+ int loop;
+ for (loop = 0; loop < MAXTAGLOOP; loop++) {
+ const TValue *tm;
+ if (ttistable(t)) { /* `t' is a table? */
+ Table *h = hvalue(t);
+ const TValue *res = luaH_get(h, key); /* do a primitive get */
+ if (!ttisnil(res) || /* result is not nil? */
+ (tm = fasttm(L, h->metatable, TM_INDEX)) == NULL) { /* or no TM? */
+ setobj2s(L, val, res);
+ return;
+ }
+ /* else will try the tag method */
+ }
+ else if (ttisnil(tm = luaT_gettmbyobj(L, t, TM_INDEX)))
+ luaG_typeerror(L, t, "index");
+ if (ttisfunction(tm)) {
+ callTM(L, tm, t, key, val, 1);
+ return;
+ }
+ t = tm; /* else repeat with 'tm' */
+ }
+ luaG_runerror(L, "loop in gettable");
+}
+
+
+void luaV_settable (lua_State *L, const TValue *t, TValue *key, StkId val) {
+ int loop;
+ for (loop = 0; loop < MAXTAGLOOP; loop++) {
+ const TValue *tm;
+ if (ttistable(t)) { /* `t' is a table? */
+ Table *h = hvalue(t);
+ TValue *oldval = cast(TValue *, luaH_get(h, key));
+ /* if previous value is not nil, there must be a previous entry
+ in the table; moreover, a metamethod has no relevance */
+ if (!ttisnil(oldval) ||
+ /* previous value is nil; must check the metamethod */
+ ((tm = fasttm(L, h->metatable, TM_NEWINDEX)) == NULL &&
+ /* no metamethod; is there a previous entry in the table? */
+ (oldval != luaO_nilobject ||
+ /* no previous entry; must create one. (The next test is
+ always true; we only need the assignment.) */
+ (oldval = luaH_newkey(L, h, key), 1)))) {
+ /* no metamethod and (now) there is an entry with given key */
+ setobj2t(L, oldval, val); /* assign new value to that entry */
+ invalidateTMcache(h);
+ luaC_barrierback(L, obj2gco(h), val);
+ return;
+ }
+ /* else will try the metamethod */
+ }
+ else /* not a table; check metamethod */
+ if (ttisnil(tm = luaT_gettmbyobj(L, t, TM_NEWINDEX)))
+ luaG_typeerror(L, t, "index");
+ /* there is a metamethod */
+ if (ttisfunction(tm)) {
+ callTM(L, tm, t, key, val, 0);
+ return;
+ }
+ t = tm; /* else repeat with 'tm' */
+ }
+ luaG_runerror(L, "loop in settable");
+}
+
+
+static int call_binTM (lua_State *L, const TValue *p1, const TValue *p2,
+ StkId res, TMS event) {
+ const TValue *tm = luaT_gettmbyobj(L, p1, event); /* try first operand */
+ if (ttisnil(tm))
+ tm = luaT_gettmbyobj(L, p2, event); /* try second operand */
+ if (ttisnil(tm)) return 0;
+ callTM(L, tm, p1, p2, res, 1);
+ return 1;
+}
+
+
+static const TValue *get_equalTM (lua_State *L, Table *mt1, Table *mt2,
+ TMS event) {
+ const TValue *tm1 = fasttm(L, mt1, event);
+ const TValue *tm2;
+ if (tm1 == NULL) return NULL; /* no metamethod */
+ if (mt1 == mt2) return tm1; /* same metatables => same metamethods */
+ tm2 = fasttm(L, mt2, event);
+ if (tm2 == NULL) return NULL; /* no metamethod */
+ if (luaV_rawequalobj(tm1, tm2)) /* same metamethods? */
+ return tm1;
+ return NULL;
+}
+
+
+static int call_orderTM (lua_State *L, const TValue *p1, const TValue *p2,
+ TMS event) {
+ if (!call_binTM(L, p1, p2, L->top, event))
+ return -1; /* no metamethod */
+ else
+ return !l_isfalse(L->top);
+}
+
+
+static int l_strcmp (const TString *ls, const TString *rs) {
+ const char *l = getstr(ls);
+ size_t ll = ls->tsv.len;
+ const char *r = getstr(rs);
+ size_t lr = rs->tsv.len;
+ for (;;) {
+ int temp = strcoll(l, r);
+ if (temp != 0) return temp;
+ else { /* strings are equal up to a `\0' */
+ size_t len = strlen(l); /* index of first `\0' in both strings */
+ if (len == lr) /* r is finished? */
+ return (len == ll) ? 0 : 1;
+ else if (len == ll) /* l is finished? */
+ return -1; /* l is smaller than r (because r is not finished) */
+ /* both strings longer than `len'; go on comparing (after the `\0') */
+ len++;
+ l += len; ll -= len; r += len; lr -= len;
+ }
+ }
+}
+
+
+int luaV_lessthan (lua_State *L, const TValue *l, const TValue *r) {
+ int res;
+ if (ttisnumber(l) && ttisnumber(r))
+ return luai_numlt(L, nvalue(l), nvalue(r));
+ else if (ttisstring(l) && ttisstring(r))
+ return l_strcmp(rawtsvalue(l), rawtsvalue(r)) < 0;
+ else if ((res = call_orderTM(L, l, r, TM_LT)) < 0)
+ luaG_ordererror(L, l, r);
+ return res;
+}
+
+
+int luaV_lessequal (lua_State *L, const TValue *l, const TValue *r) {
+ int res;
+ if (ttisnumber(l) && ttisnumber(r))
+ return luai_numle(L, nvalue(l), nvalue(r));
+ else if (ttisstring(l) && ttisstring(r))
+ return l_strcmp(rawtsvalue(l), rawtsvalue(r)) <= 0;
+ else if ((res = call_orderTM(L, l, r, TM_LE)) >= 0) /* first try `le' */
+ return res;
+ else if ((res = call_orderTM(L, r, l, TM_LT)) < 0) /* else try `lt' */
+ luaG_ordererror(L, l, r);
+ return !res;
+}
+
+
+/*
+** equality of Lua values. L == NULL means raw equality (no metamethods)
+*/
+int luaV_equalobj_ (lua_State *L, const TValue *t1, const TValue *t2) {
+ const TValue *tm;
+ lua_assert(ttisequal(t1, t2));
+ switch (ttype(t1)) {
+ case LUA_TNIL: return 1;
+ case LUA_TNUMBER: return luai_numeq(nvalue(t1), nvalue(t2));
+ case LUA_TBOOLEAN: return bvalue(t1) == bvalue(t2); /* true must be 1 !! */
+ case LUA_TLIGHTUSERDATA: return pvalue(t1) == pvalue(t2);
+ case LUA_TLCF: return fvalue(t1) == fvalue(t2);
+ case LUA_TSHRSTR: return eqshrstr(rawtsvalue(t1), rawtsvalue(t2));
+ case LUA_TLNGSTR: return luaS_eqlngstr(rawtsvalue(t1), rawtsvalue(t2));
+ case LUA_TUSERDATA: {
+ if (uvalue(t1) == uvalue(t2)) return 1;
+ else if (L == NULL) return 0;
+ tm = get_equalTM(L, uvalue(t1)->metatable, uvalue(t2)->metatable, TM_EQ);
+ break; /* will try TM */
+ }
+ case LUA_TTABLE: {
+ if (hvalue(t1) == hvalue(t2)) return 1;
+ else if (L == NULL) return 0;
+ tm = get_equalTM(L, hvalue(t1)->metatable, hvalue(t2)->metatable, TM_EQ);
+ break; /* will try TM */
+ }
+ default:
+ lua_assert(iscollectable(t1));
+ return gcvalue(t1) == gcvalue(t2);
+ }
+ if (tm == NULL || L == NULL) return 0; /* no TM? */
+ callTM(L, tm, t1, t2, L->top, 1); /* call TM */
+ return !l_isfalse(L->top);
+}
+
+
+void luaV_concat (lua_State *L, int total) {
+ lua_assert(total >= 2);
+ do {
+ StkId top = L->top;
+ int n = 2; /* number of elements handled in this pass (at least 2) */
+ if (!(ttisstring(top-2) || ttisnumber(top-2)) || !tostring(L, top-1)) {
+ if (!call_binTM(L, top-2, top-1, top-2, TM_CONCAT))
+ luaG_concaterror(L, top-2, top-1);
+ }
+ else if (tsvalue(top-1)->len == 0) /* second operand is empty? */
+ (void)tostring(L, top - 2); /* result is first operand */
+ else if (ttisstring(top-2) && tsvalue(top-2)->len == 0) {
+ setobjs2s(L, top - 2, top - 1); /* result is second op. */
+ }
+ else {
+ /* at least two non-empty string values; get as many as possible */
+ size_t tl = tsvalue(top-1)->len;
+ char *buffer;
+ int i;
+ /* collect total length */
+ for (i = 1; i < total && tostring(L, top-i-1); i++) {
+ size_t l = tsvalue(top-i-1)->len;
+ if (l >= (MAX_SIZET/sizeof(char)) - tl)
+ luaG_runerror(L, "string length overflow");
+ tl += l;
+ }
+ buffer = luaZ_openspace(L, &G(L)->buff, tl);
+ tl = 0;
+ n = i;
+ do { /* concat all strings */
+ size_t l = tsvalue(top-i)->len;
+ memcpy(buffer+tl, svalue(top-i), l * sizeof(char));
+ tl += l;
+ } while (--i > 0);
+ setsvalue2s(L, top-n, luaS_newlstr(L, buffer, tl));
+ }
+ total -= n-1; /* got 'n' strings to create 1 new */
+ L->top -= n-1; /* popped 'n' strings and pushed one */
+ } while (total > 1); /* repeat until only 1 result left */
+}
+
+
+void luaV_objlen (lua_State *L, StkId ra, const TValue *rb) {
+ const TValue *tm;
+ switch (ttypenv(rb)) {
+ case LUA_TTABLE: {
+ Table *h = hvalue(rb);
+ tm = fasttm(L, h->metatable, TM_LEN);
+ if (tm) break; /* metamethod? break switch to call it */
+ setnvalue(ra, cast_num(luaH_getn(h))); /* else primitive len */
+ return;
+ }
+ case LUA_TSTRING: {
+ setnvalue(ra, cast_num(tsvalue(rb)->len));
+ return;
+ }
+ default: { /* try metamethod */
+ tm = luaT_gettmbyobj(L, rb, TM_LEN);
+ if (ttisnil(tm)) /* no metamethod? */
+ luaG_typeerror(L, rb, "get length of");
+ break;
+ }
+ }
+ callTM(L, tm, rb, rb, ra, 1);
+}
+
+/*
+ * luaV_div and luaV_mod patched in from Lua 5.3.2 in order to properly handle
+ * div/mod by zero (instead of crashing, which is the default behavior in
+ * Lua 5.2)
+ */
+
+/*
+** Integer division; return 'm // n', that is, floor(m/n).
+** C division truncates its result (rounds towards zero).
+** 'floor(q) == trunc(q)' when 'q >= 0' or when 'q' is integer,
+** otherwise 'floor(q) == trunc(q) - 1'.
+*/
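+/*
+** Worked example (illustrative values, not from the original source):
+** 7/2 gives 3 == floor(3.5) directly, while -7/2 truncates to -3; since the
+** operands differ in sign and the remainder is nonzero, 1 is subtracted to
+** obtain floor(-3.5) == -4.
+*/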
+static lua_Number luaV_div (lua_State *L, lua_Number m, lua_Number n) {
+ if ((lua_Unsigned)(n) + 1u <= 1u) { /* special cases: -1 or 0 */
+ if (n == 0)
+ luaG_runerror(L, "attempt to divide by zero");
+ return (0 - m); /* n==-1; avoid overflow with 0x80000...//-1 */
+ }
+ else {
+ lua_Number q = m / n; /* perform C division */
+ if ((m ^ n) < 0 && m % n != 0) /* 'm/n' would be negative non-integer? */
+ q -= 1; /* correct result for different rounding */
+ return q;
+ }
+}
+
+
+/*
+** Integer modulus; return 'm % n'. (Assume that C '%' with
+** negative operands follows C99 behavior. See previous comment
+** about luaV_div.)
+*/
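+/*
+** Worked example (illustrative values, not from the original source):
+** in C99, -7 % 2 == -1; the remainder is nonzero and the operands differ in
+** sign, so 'n' is added to give -1 + 2 == 1, matching Lua's floor-style
+** modulus.
+*/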
+static lua_Number luaV_mod (lua_State *L, lua_Number m, lua_Number n) {
+ if ((lua_Unsigned)(n) + 1u <= 1u) { /* special cases: -1 or 0 */
+ if (n == 0)
+ luaG_runerror(L, "attempt to perform 'n%%0'");
+ return 0; /* m % -1 == 0; avoid overflow with 0x80000...%-1 */
+ }
+ else {
+ lua_Number r = m % n;
+ if (r != 0 && (m ^ n) < 0) /* 'm/n' would be non-integer negative? */
+ r += n; /* correct result for different rounding */
+ return r;
+ }
+}
+
+/*
+ * End patch from 5.3.2
+ */
+
+void luaV_arith (lua_State *L, StkId ra, const TValue *rb,
+ const TValue *rc, TMS op) {
+ TValue tempb, tempc;
+ const TValue *b, *c;
+ if ((b = luaV_tonumber(rb, &tempb)) != NULL &&
+ (c = luaV_tonumber(rc, &tempc)) != NULL) {
+ /*
+ * Patched: if dividing or modding, use patched functions from 5.3
+ */
+ lua_Number res;
+ int lop = op - TM_ADD + LUA_OPADD;
+ if (lop == LUA_OPDIV) {
+ res = luaV_div(L, nvalue(b), nvalue(c));
+ } else if (lop == LUA_OPMOD) {
+ res = luaV_mod(L, nvalue(b), nvalue(c));
+ } else {
+ res = luaO_arith(op - TM_ADD + LUA_OPADD, nvalue(b), nvalue(c));
+ }
+ setnvalue(ra, res);
+ }
+ else if (!call_binTM(L, rb, rc, ra, op))
+ luaG_aritherror(L, rb, rc);
+}
+
+
+/*
+** check whether cached closure in prototype 'p' may be reused, that is,
+** whether there is a cached closure with the same upvalues needed by
+** new closure to be created.
+*/
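+/*
+** Illustrative note (not part of the original source): the cache hits only
+** when every upvalue of the cached closure still points at the same TValue
+** address (the same open stack slot or the same enclosing UpVal); otherwise
+** a fresh closure must be built by pushclosure() below.
+*/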
+static Closure *getcached (Proto *p, UpVal **encup, StkId base) {
+ Closure *c = p->cache;
+ if (c != NULL) { /* is there a cached closure? */
+ int nup = p->sizeupvalues;
+ Upvaldesc *uv = p->upvalues;
+ int i;
+ for (i = 0; i < nup; i++) { /* check whether it has right upvalues */
+ TValue *v = uv[i].instack ? base + uv[i].idx : encup[uv[i].idx]->v;
+ if (c->l.upvals[i]->v != v)
+ return NULL; /* wrong upvalue; cannot reuse closure */
+ }
+ }
+ return c; /* return cached closure (or NULL if no cached closure) */
+}
+
+
+/*
+** create a new Lua closure, push it in the stack, and initialize
+** its upvalues. Note that the call to 'luaC_barrierproto' must come
+** before the assignment to 'p->cache', as the function needs the
+** original value of that field.
+*/
+static void pushclosure (lua_State *L, Proto *p, UpVal **encup, StkId base,
+ StkId ra) {
+ int nup = p->sizeupvalues;
+ Upvaldesc *uv = p->upvalues;
+ int i;
+ Closure *ncl = luaF_newLclosure(L, nup);
+ ncl->l.p = p;
+ setclLvalue(L, ra, ncl); /* anchor new closure in stack */
+ for (i = 0; i < nup; i++) { /* fill in its upvalues */
+ if (uv[i].instack) /* upvalue refers to local variable? */
+ ncl->l.upvals[i] = luaF_findupval(L, base + uv[i].idx);
+ else /* get upvalue from enclosing function */
+ ncl->l.upvals[i] = encup[uv[i].idx];
+ }
+ luaC_barrierproto(L, p, ncl);
+ p->cache = ncl; /* save it on cache for reuse */
+}
+
+
+/*
+** finish execution of an opcode interrupted by a yield
+*/
+void luaV_finishOp (lua_State *L) {
+ CallInfo *ci = L->ci;
+ StkId base = ci->u.l.base;
+ Instruction inst = *(ci->u.l.savedpc - 1); /* interrupted instruction */
+ OpCode op = GET_OPCODE(inst);
+ switch (op) { /* finish its execution */
+ case OP_ADD: case OP_SUB: case OP_MUL: case OP_DIV:
+ case OP_MOD: case OP_POW: case OP_UNM: case OP_LEN:
+ case OP_GETTABUP: case OP_GETTABLE: case OP_SELF: {
+ setobjs2s(L, base + GETARG_A(inst), --L->top);
+ break;
+ }
+ case OP_LE: case OP_LT: case OP_EQ: {
+ int res = !l_isfalse(L->top - 1);
+ L->top--;
+ /* metamethod should not be called when operand is K */
+ lua_assert(!ISK(GETARG_B(inst)));
+ if (op == OP_LE && /* "<=" using "<" instead? */
+ ttisnil(luaT_gettmbyobj(L, base + GETARG_B(inst), TM_LE)))
+ res = !res; /* invert result */
+ lua_assert(GET_OPCODE(*ci->u.l.savedpc) == OP_JMP);
+ if (res != GETARG_A(inst)) /* condition failed? */
+ ci->u.l.savedpc++; /* skip jump instruction */
+ break;
+ }
+ case OP_CONCAT: {
+ StkId top = L->top - 1; /* top when 'call_binTM' was called */
+ int b = GETARG_B(inst); /* first element to concatenate */
+ int total = cast_int(top - 1 - (base + b)); /* yet to concatenate */
+ setobj2s(L, top - 2, top); /* put TM result in proper position */
+ if (total > 1) { /* are there elements to concat? */
+ L->top = top - 1; /* top is one after last element (at top-2) */
+ luaV_concat(L, total); /* concat them (may yield again) */
+ }
+ /* move final result to final position */
+ setobj2s(L, ci->u.l.base + GETARG_A(inst), L->top - 1);
+ L->top = ci->top; /* restore top */
+ break;
+ }
+ case OP_TFORCALL: {
+ lua_assert(GET_OPCODE(*ci->u.l.savedpc) == OP_TFORLOOP);
+ L->top = ci->top; /* correct top */
+ break;
+ }
+ case OP_CALL: {
+ if (GETARG_C(inst) - 1 >= 0) /* nresults >= 0? */
+ L->top = ci->top; /* adjust results */
+ break;
+ }
+ case OP_TAILCALL: case OP_SETTABUP: case OP_SETTABLE:
+ break;
+ default: lua_assert(0);
+ }
+}
+
+
+
+/*
+** some macros for common tasks in `luaV_execute'
+*/
+
+#if !defined luai_runtimecheck
+#define luai_runtimecheck(L, c) /* void */
+#endif
+
+
+#define RA(i) (base+GETARG_A(i))
+/* to be used after possible stack reallocation */
+#define RB(i) check_exp(getBMode(GET_OPCODE(i)) == OpArgR, base+GETARG_B(i))
+#define RC(i) check_exp(getCMode(GET_OPCODE(i)) == OpArgR, base+GETARG_C(i))
+#define RKB(i) check_exp(getBMode(GET_OPCODE(i)) == OpArgK, \
+ ISK(GETARG_B(i)) ? k+INDEXK(GETARG_B(i)) : base+GETARG_B(i))
+#define RKC(i) check_exp(getCMode(GET_OPCODE(i)) == OpArgK, \
+ ISK(GETARG_C(i)) ? k+INDEXK(GETARG_C(i)) : base+GETARG_C(i))
+#define KBx(i) \
+ (k + (GETARG_Bx(i) != 0 ? GETARG_Bx(i) - 1 : GETARG_Ax(*ci->u.l.savedpc++)))
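+/*
+** Illustrative note (not from the original source): for an instruction such
+** as OP_ADD A B C, RKB/RKC resolve each operand either to a stack register
+** (base + index) or, when its ISK bit is set, to an entry of the constant
+** table 'k'; e.g. `local x = y + 1` typically reads 'y' from a register and
+** '1' from a constant.
+*/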
+
+
+/* execute a jump instruction */
+#define dojump(ci,i,e) \
+ { int a = GETARG_A(i); \
+ if (a > 0) luaF_close(L, ci->u.l.base + a - 1); \
+ ci->u.l.savedpc += GETARG_sBx(i) + e; }
+
+/* for test instructions, execute the jump instruction that follows it */
+#define donextjump(ci) { i = *ci->u.l.savedpc; dojump(ci, i, 1); }
+
+
+#define Protect(x) { {x;}; base = ci->u.l.base; }
+
+#define checkGC(L,c) \
+ Protect( luaC_condGC(L,{L->top = (c); /* limit of live values */ \
+ luaC_step(L); \
+ L->top = ci->top;}) /* restore top */ \
+ luai_threadyield(L); )
+
+
+#define arith_op(op,tm) { \
+ TValue *rb = RKB(i); \
+ TValue *rc = RKC(i); \
+ if (ttisnumber(rb) && ttisnumber(rc)) { \
+ lua_Number nb = nvalue(rb), nc = nvalue(rc); \
+ setnvalue(ra, op(L, nb, nc)); \
+ } \
+ else { Protect(luaV_arith(L, ra, rb, rc, tm)); } }
+
+
+#define vmdispatch(o) switch(o)
+#define vmcase(l,b) case l: {b} break;
+#define vmcasenb(l,b) case l: {b} /* nb = no break */
+
+void luaV_execute (lua_State *L) {
+ CallInfo *ci = L->ci;
+ LClosure *cl;
+ TValue *k;
+ StkId base;
+ newframe: /* reentry point when frame changes (call/return) */
+ lua_assert(ci == L->ci);
+ cl = clLvalue(ci->func);
+ k = cl->p->k;
+ base = ci->u.l.base;
+ /* main loop of interpreter */
+ for (;;) {
+ Instruction i = *(ci->u.l.savedpc++);
+ StkId ra;
+ if ((L->hookmask & (LUA_MASKLINE | LUA_MASKCOUNT)) &&
+ (--L->hookcount == 0 || L->hookmask & LUA_MASKLINE)) {
+ Protect(traceexec(L));
+ }
+ /* WARNING: several calls may realloc the stack and invalidate `ra' */
+ ra = RA(i);
+ lua_assert(base == ci->u.l.base);
+ lua_assert(base <= L->top && L->top < L->stack + L->stacksize);
+ vmdispatch (GET_OPCODE(i)) {
+ vmcase(OP_MOVE,
+ setobjs2s(L, ra, RB(i));
+ )
+ vmcase(OP_LOADK,
+ TValue *rb = k + GETARG_Bx(i);
+ setobj2s(L, ra, rb);
+ )
+ vmcase(OP_LOADKX,
+ TValue *rb;
+ lua_assert(GET_OPCODE(*ci->u.l.savedpc) == OP_EXTRAARG);
+ rb = k + GETARG_Ax(*ci->u.l.savedpc++);
+ setobj2s(L, ra, rb);
+ )
+ vmcase(OP_LOADBOOL,
+ setbvalue(ra, GETARG_B(i));
+ if (GETARG_C(i)) ci->u.l.savedpc++; /* skip next instruction (if C) */
+ )
+ vmcase(OP_LOADNIL,
+ int b = GETARG_B(i);
+ do {
+ setnilvalue(ra++);
+ } while (b--);
+ )
+ vmcase(OP_GETUPVAL,
+ int b = GETARG_B(i);
+ setobj2s(L, ra, cl->upvals[b]->v);
+ )
+ vmcase(OP_GETTABUP,
+ int b = GETARG_B(i);
+ Protect(luaV_gettable(L, cl->upvals[b]->v, RKC(i), ra));
+ )
+ vmcase(OP_GETTABLE,
+ Protect(luaV_gettable(L, RB(i), RKC(i), ra));
+ )
+ vmcase(OP_SETTABUP,
+ int a = GETARG_A(i);
+ Protect(luaV_settable(L, cl->upvals[a]->v, RKB(i), RKC(i)));
+ )
+ vmcase(OP_SETUPVAL,
+ UpVal *uv = cl->upvals[GETARG_B(i)];
+ setobj(L, uv->v, ra);
+ luaC_barrier(L, uv, ra);
+ )
+ vmcase(OP_SETTABLE,
+ Protect(luaV_settable(L, ra, RKB(i), RKC(i)));
+ )
+ vmcase(OP_NEWTABLE,
+ int b = GETARG_B(i);
+ int c = GETARG_C(i);
+ Table *t = luaH_new(L);
+ sethvalue(L, ra, t);
+ if (b != 0 || c != 0)
+ luaH_resize(L, t, luaO_fb2int(b), luaO_fb2int(c));
+ checkGC(L, ra + 1);
+ )
+ vmcase(OP_SELF,
+ StkId rb = RB(i);
+ setobjs2s(L, ra+1, rb);
+ Protect(luaV_gettable(L, rb, RKC(i), ra));
+ )
+ vmcase(OP_ADD,
+ arith_op(luai_numadd, TM_ADD);
+ )
+ vmcase(OP_SUB,
+ arith_op(luai_numsub, TM_SUB);
+ )
+ vmcase(OP_MUL,
+ arith_op(luai_nummul, TM_MUL);
+ )
+ /*
+ * Patched: use luaV_* instead of luai_* to handle div/mod by 0
+ */
+ vmcase(OP_DIV,
+ arith_op(luaV_div, TM_DIV);
+ )
+ vmcase(OP_MOD,
+ arith_op(luaV_mod, TM_MOD);
+ )
+ vmcase(OP_POW,
+ arith_op(luai_numpow, TM_POW);
+ )
+ vmcase(OP_UNM,
+ TValue *rb = RB(i);
+ if (ttisnumber(rb)) {
+ lua_Number nb = nvalue(rb);
+ setnvalue(ra, luai_numunm(L, nb));
+ }
+ else {
+ Protect(luaV_arith(L, ra, rb, rb, TM_UNM));
+ }
+ )
+ vmcase(OP_NOT,
+ TValue *rb = RB(i);
+ int res = l_isfalse(rb); /* next assignment may change this value */
+ setbvalue(ra, res);
+ )
+ vmcase(OP_LEN,
+ Protect(luaV_objlen(L, ra, RB(i)));
+ )
+ vmcase(OP_CONCAT,
+ int b = GETARG_B(i);
+ int c = GETARG_C(i);
+ StkId rb;
+ L->top = base + c + 1; /* mark the end of concat operands */
+ Protect(luaV_concat(L, c - b + 1));
+        ra = RA(i);  /* 'luaV_concat' may invoke TMs and move the stack */
+ rb = b + base;
+ setobjs2s(L, ra, rb);
+ checkGC(L, (ra >= rb ? ra + 1 : rb));
+ L->top = ci->top; /* restore top */
+ )
+ vmcase(OP_JMP,
+ dojump(ci, i, 0);
+ )
+ vmcase(OP_EQ,
+ TValue *rb = RKB(i);
+ TValue *rc = RKC(i);
+ Protect(
+ if (cast_int(equalobj(L, rb, rc)) != GETARG_A(i))
+ ci->u.l.savedpc++;
+ else
+ donextjump(ci);
+ )
+ )
+ vmcase(OP_LT,
+ Protect(
+ if (luaV_lessthan(L, RKB(i), RKC(i)) != GETARG_A(i))
+ ci->u.l.savedpc++;
+ else
+ donextjump(ci);
+ )
+ )
+ vmcase(OP_LE,
+ Protect(
+ if (luaV_lessequal(L, RKB(i), RKC(i)) != GETARG_A(i))
+ ci->u.l.savedpc++;
+ else
+ donextjump(ci);
+ )
+ )
+ vmcase(OP_TEST,
+ if (GETARG_C(i) ? l_isfalse(ra) : !l_isfalse(ra))
+ ci->u.l.savedpc++;
+ else
+ donextjump(ci);
+ )
+ vmcase(OP_TESTSET,
+ TValue *rb = RB(i);
+ if (GETARG_C(i) ? l_isfalse(rb) : !l_isfalse(rb))
+ ci->u.l.savedpc++;
+ else {
+ setobjs2s(L, ra, rb);
+ donextjump(ci);
+ }
+ )
+ vmcase(OP_CALL,
+ int b = GETARG_B(i);
+ int nresults = GETARG_C(i) - 1;
+ if (b != 0) L->top = ra+b; /* else previous instruction set top */
+ if (luaD_precall(L, ra, nresults)) { /* C function? */
+ if (nresults >= 0) L->top = ci->top; /* adjust results */
+ base = ci->u.l.base;
+ }
+ else { /* Lua function */
+ ci = L->ci;
+ ci->callstatus |= CIST_REENTRY;
+ goto newframe; /* restart luaV_execute over new Lua function */
+ }
+ )
+ vmcase(OP_TAILCALL,
+ int b = GETARG_B(i);
+ if (b != 0) L->top = ra+b; /* else previous instruction set top */
+ lua_assert(GETARG_C(i) - 1 == LUA_MULTRET);
+ if (luaD_precall(L, ra, LUA_MULTRET)) /* C function? */
+ base = ci->u.l.base;
+ else {
+ /* tail call: put called frame (n) in place of caller one (o) */
+ CallInfo *nci = L->ci; /* called frame */
+ CallInfo *oci = nci->previous; /* caller frame */
+ StkId nfunc = nci->func; /* called function */
+ StkId ofunc = oci->func; /* caller function */
+ /* last stack slot filled by 'precall' */
+ StkId lim = nci->u.l.base + getproto(nfunc)->numparams;
+ int aux;
+ /* close all upvalues from previous call */
+ if (cl->p->sizep > 0) luaF_close(L, oci->u.l.base);
+ /* move new frame into old one */
+ for (aux = 0; nfunc + aux < lim; aux++)
+ setobjs2s(L, ofunc + aux, nfunc + aux);
+ oci->u.l.base = ofunc + (nci->u.l.base - nfunc); /* correct base */
+ oci->top = L->top = ofunc + (L->top - nfunc); /* correct top */
+ oci->u.l.savedpc = nci->u.l.savedpc;
+ oci->callstatus |= CIST_TAIL; /* function was tail called */
+ ci = L->ci = oci; /* remove new frame */
+ lua_assert(L->top == oci->u.l.base + getproto(ofunc)->maxstacksize);
+ goto newframe; /* restart luaV_execute over new Lua function */
+ }
+ )
+ vmcasenb(OP_RETURN,
+ int b = GETARG_B(i);
+ if (b != 0) L->top = ra+b-1;
+ if (cl->p->sizep > 0) luaF_close(L, base);
+ b = luaD_poscall(L, ra);
+ if (!(ci->callstatus & CIST_REENTRY)) /* 'ci' still the called one */
+ return; /* external invocation: return */
+ else { /* invocation via reentry: continue execution */
+ ci = L->ci;
+ if (b) L->top = ci->top;
+ lua_assert(isLua(ci));
+ lua_assert(GET_OPCODE(*((ci)->u.l.savedpc - 1)) == OP_CALL);
+ goto newframe; /* restart luaV_execute over new Lua function */
+ }
+ )
+ vmcase(OP_FORLOOP,
+ lua_Number step = nvalue(ra+2);
+ lua_Number idx = luai_numadd(L, nvalue(ra), step); /* increment index */
+ lua_Number limit = nvalue(ra+1);
+ if (luai_numlt(L, 0, step) ? luai_numle(L, idx, limit)
+ : luai_numle(L, limit, idx)) {
+ ci->u.l.savedpc += GETARG_sBx(i); /* jump back */
+ setnvalue(ra, idx); /* update internal index... */
+ setnvalue(ra+3, idx); /* ...and external index */
+ }
+ )
+ vmcase(OP_FORPREP,
+ const TValue *init = ra;
+ const TValue *plimit = ra+1;
+ const TValue *pstep = ra+2;
+ if (!tonumber(init, ra))
+ luaG_runerror(L, LUA_QL("for") " initial value must be a number");
+ else if (!tonumber(plimit, ra+1))
+ luaG_runerror(L, LUA_QL("for") " limit must be a number");
+ else if (!tonumber(pstep, ra+2))
+ luaG_runerror(L, LUA_QL("for") " step must be a number");
+ setnvalue(ra, luai_numsub(L, nvalue(ra), nvalue(pstep)));
+ ci->u.l.savedpc += GETARG_sBx(i);
+ )
+ vmcasenb(OP_TFORCALL,
+ StkId cb = ra + 3; /* call base */
+ setobjs2s(L, cb+2, ra+2);
+ setobjs2s(L, cb+1, ra+1);
+ setobjs2s(L, cb, ra);
+ L->top = cb + 3; /* func. + 2 args (state and index) */
+ Protect(luaD_call(L, cb, GETARG_C(i), 1));
+ L->top = ci->top;
+ i = *(ci->u.l.savedpc++); /* go to next instruction */
+ ra = RA(i);
+ lua_assert(GET_OPCODE(i) == OP_TFORLOOP);
+ goto l_tforloop;
+ )
+ vmcase(OP_TFORLOOP,
+ l_tforloop:
+ if (!ttisnil(ra + 1)) { /* continue loop? */
+ setobjs2s(L, ra, ra + 1); /* save control variable */
+ ci->u.l.savedpc += GETARG_sBx(i); /* jump back */
+ }
+ )
+ vmcase(OP_SETLIST,
+ int n = GETARG_B(i);
+ int c = GETARG_C(i);
+ int last;
+ Table *h;
+ if (n == 0) n = cast_int(L->top - ra) - 1;
+ if (c == 0) {
+ lua_assert(GET_OPCODE(*ci->u.l.savedpc) == OP_EXTRAARG);
+ c = GETARG_Ax(*ci->u.l.savedpc++);
+ }
+ luai_runtimecheck(L, ttistable(ra));
+ h = hvalue(ra);
+ last = ((c-1)*LFIELDS_PER_FLUSH) + n;
+ if (last > h->sizearray) /* needs more space? */
+ luaH_resizearray(L, h, last); /* pre-allocate it at once */
+ for (; n > 0; n--) {
+ TValue *val = ra+n;
+ luaH_setint(L, h, last--, val);
+ luaC_barrierback(L, obj2gco(h), val);
+ }
+ L->top = ci->top; /* correct top (in case of previous open call) */
+ )
+ vmcase(OP_CLOSURE,
+ Proto *p = cl->p->p[GETARG_Bx(i)];
+ Closure *ncl = getcached(p, cl->upvals, base); /* cached closure */
+ if (ncl == NULL) /* no match? */
+ pushclosure(L, p, cl->upvals, base, ra); /* create a new one */
+ else
+          setclLvalue(L, ra, ncl);  /* push cached closure */
+ checkGC(L, ra + 1);
+ )
+ vmcase(OP_VARARG,
+ int b = GETARG_B(i) - 1;
+ int j;
+ int n = cast_int(base - ci->func) - cl->p->numparams - 1;
+ if (b < 0) { /* B == 0? */
+ b = n; /* get all var. arguments */
+ Protect(luaD_checkstack(L, n));
+ ra = RA(i); /* previous call may change the stack */
+ L->top = ra + n;
+ }
+ for (j = 0; j < b; j++) {
+ if (j < n) {
+ setobjs2s(L, ra + j, base - n + j);
+ }
+ else {
+ setnilvalue(ra + j);
+ }
+ }
+ )
+ vmcase(OP_EXTRAARG,
+ lua_assert(0);
+ )
+ }
+ }
+}
+
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/lua/lvm.h b/sys/contrib/openzfs/module/lua/lvm.h
new file mode 100644
index 000000000000..2d2be9836f69
--- /dev/null
+++ b/sys/contrib/openzfs/module/lua/lvm.h
@@ -0,0 +1,46 @@
+/* BEGIN CSTYLED */
+/*
+** $Id: lvm.h,v 2.18.1.1 2013/04/12 18:48:47 roberto Exp $
+** Lua virtual machine
+** See Copyright Notice in lua.h
+*/
+
+#ifndef lvm_h
+#define lvm_h
+
+
+#include "ldo.h"
+#include "lobject.h"
+#include "ltm.h"
+
+
+#define tostring(L,o) (ttisstring(o) || (luaV_tostring(L, o)))
+
+#define tonumber(o,n) (ttisnumber(o) || (((o) = luaV_tonumber(o,n)) != NULL))
+
+#define equalobj(L,o1,o2) (ttisequal(o1, o2) && luaV_equalobj_(L, o1, o2))
+
+#define luaV_rawequalobj(o1,o2) equalobj(NULL,o1,o2)
+
+
+/* not to be called directly */
+LUAI_FUNC int luaV_equalobj_ (lua_State *L, const TValue *t1, const TValue *t2);
+
+
+LUAI_FUNC int luaV_lessthan (lua_State *L, const TValue *l, const TValue *r);
+LUAI_FUNC int luaV_lessequal (lua_State *L, const TValue *l, const TValue *r);
+LUAI_FUNC const TValue *luaV_tonumber (const TValue *obj, TValue *n);
+LUAI_FUNC int luaV_tostring (lua_State *L, StkId obj);
+LUAI_FUNC void luaV_gettable (lua_State *L, const TValue *t, TValue *key,
+ StkId val);
+LUAI_FUNC void luaV_settable (lua_State *L, const TValue *t, TValue *key,
+ StkId val);
+LUAI_FUNC void luaV_finishOp (lua_State *L);
+LUAI_FUNC void luaV_execute (lua_State *L);
+LUAI_FUNC void luaV_concat (lua_State *L, int total);
+LUAI_FUNC void luaV_arith (lua_State *L, StkId ra, const TValue *rb,
+ const TValue *rc, TMS op);
+LUAI_FUNC void luaV_objlen (lua_State *L, StkId ra, const TValue *rb);
+
+#endif
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/lua/lzio.c b/sys/contrib/openzfs/module/lua/lzio.c
new file mode 100644
index 000000000000..bfbb41cf8ed3
--- /dev/null
+++ b/sys/contrib/openzfs/module/lua/lzio.c
@@ -0,0 +1,74 @@
+/* BEGIN CSTYLED */
+/*
+** $Id: lzio.c,v 1.35.1.1 2013/04/12 18:48:47 roberto Exp $
+** Buffered streams
+** See Copyright Notice in lua.h
+*/
+
+
+#define lzio_c
+#define LUA_CORE
+
+#include <sys/lua/lua.h>
+
+#include "llimits.h"
+#include "lmem.h"
+#include "lstate.h"
+#include "lzio.h"
+
+
+int luaZ_fill (ZIO *z) {
+ size_t size;
+ lua_State *L = z->L;
+ const char *buff;
+ lua_unlock(L);
+ buff = z->reader(L, z->data, &size);
+ lua_lock(L);
+ if (buff == NULL || size == 0)
+ return EOZ;
+ z->n = size - 1; /* discount char being returned */
+ z->p = buff;
+ return cast_uchar(*(z->p++));
+}
+
+
+void luaZ_init (lua_State *L, ZIO *z, lua_Reader reader, void *data) {
+ z->L = L;
+ z->reader = reader;
+ z->data = data;
+ z->n = 0;
+ z->p = NULL;
+}
+
+
+/* --------------------------------------------------------------- read --- */
+size_t luaZ_read (ZIO *z, void *b, size_t n) {
+ while (n) {
+ size_t m;
+ if (z->n == 0) { /* no bytes in buffer? */
+ if (luaZ_fill(z) == EOZ) /* try to read more */
+ return n; /* no more input; return number of missing bytes */
+ else {
+ z->n++; /* luaZ_fill consumed first byte; put it back */
+ z->p--;
+ }
+ }
+ m = (n <= z->n) ? n : z->n; /* min. between n and z->n */
+ memcpy(b, z->p, m);
+ z->n -= m;
+ z->p += m;
+ b = (char *)b + m;
+ n -= m;
+ }
+ return 0;
+}
+
+/* ------------------------------------------------------------------------ */
+char *luaZ_openspace (lua_State *L, Mbuffer *buff, size_t n) {
+ if (n > buff->buffsize) {
+ if (n < LUA_MINBUFFER) n = LUA_MINBUFFER;
+ luaZ_resizebuffer(L, buff, n);
+ }
+ return buff->buffer;
+}
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/lua/lzio.h b/sys/contrib/openzfs/module/lua/lzio.h
new file mode 100644
index 000000000000..27908759d509
--- /dev/null
+++ b/sys/contrib/openzfs/module/lua/lzio.h
@@ -0,0 +1,67 @@
+/* BEGIN CSTYLED */
+/*
+** $Id: lzio.h,v 1.26.1.1 2013/04/12 18:48:47 roberto Exp $
+** Buffered streams
+** See Copyright Notice in lua.h
+*/
+
+
+#ifndef lzio_h
+#define lzio_h
+
+#include <sys/lua/lua.h>
+
+#include "lmem.h"
+
+
+#define EOZ (-1) /* end of stream */
+
+typedef struct Zio ZIO;
+
+#define zgetc(z) (((z)->n--)>0 ? cast_uchar(*(z)->p++) : luaZ_fill(z))
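+/*
+** zgetc consumes one buffered byte when available; once the buffer runs
+** empty it falls back to luaZ_fill(), which asks the reader callback for the
+** next block of input (or returns EOZ at end of stream).
+*/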
+
+
+typedef struct Mbuffer {
+ char *buffer;
+ size_t n;
+ size_t buffsize;
+} Mbuffer;
+
+#define luaZ_initbuffer(L, buff) ((buff)->buffer = NULL, (buff)->buffsize = 0)
+
+#define luaZ_buffer(buff) ((buff)->buffer)
+#define luaZ_sizebuffer(buff) ((buff)->buffsize)
+#define luaZ_bufflen(buff) ((buff)->n)
+
+#define luaZ_resetbuffer(buff) ((buff)->n = 0)
+
+
+#define luaZ_resizebuffer(L, buff, size) \
+ (luaM_reallocvector(L, (buff)->buffer, (buff)->buffsize, size, char), \
+ (buff)->buffsize = size)
+
+#define luaZ_freebuffer(L, buff) luaZ_resizebuffer(L, buff, 0)
+
+
+LUAI_FUNC char *luaZ_openspace (lua_State *L, Mbuffer *buff, size_t n);
+LUAI_FUNC void luaZ_init (lua_State *L, ZIO *z, lua_Reader reader,
+ void *data);
+LUAI_FUNC size_t luaZ_read (ZIO* z, void* b, size_t n); /* read next n bytes */
+
+
+
+/* --------- Private Part ------------------ */
+
+struct Zio {
+ size_t n; /* bytes still unread */
+ const char *p; /* current position in buffer */
+ lua_Reader reader; /* reader function */
+ void* data; /* additional data */
+ lua_State *L; /* Lua state (for reader) */
+};
+
+
+LUAI_FUNC int luaZ_fill (ZIO *z);
+
+#endif
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/lua/setjmp/setjmp.S b/sys/contrib/openzfs/module/lua/setjmp/setjmp.S
new file mode 100644
index 000000000000..1f461a0a4ef3
--- /dev/null
+++ b/sys/contrib/openzfs/module/lua/setjmp/setjmp.S
@@ -0,0 +1,19 @@
+#if defined(__x86_64__)
+#include "setjmp_x86_64.S"
+#elif defined(__i386__)
+#include "setjmp_i386.S"
+#elif defined(__aarch64__)
+#include "setjmp_aarch64.S"
+#elif defined(__arm__)
+#include "setjmp_arm.S"
+#elif defined(__sparc__) && defined(__arch64__)
+#include "setjmp_sparc64.S"
+#elif defined(__powerpc__)
+#include "setjmp_ppc.S"
+#elif defined(__mips__)
+#include "setjmp_mips.S"
+#elif defined(__s390x__)
+#include "setjmp_s390x.S"
+#elif defined(__riscv)
+#include "setjmp_rv64g.S"
+#endif
diff --git a/sys/contrib/openzfs/module/lua/setjmp/setjmp_aarch64.S b/sys/contrib/openzfs/module/lua/setjmp/setjmp_aarch64.S
new file mode 100644
index 000000000000..a5a9a85fd57e
--- /dev/null
+++ b/sys/contrib/openzfs/module/lua/setjmp/setjmp_aarch64.S
@@ -0,0 +1,86 @@
+/*-
+ * Copyright (c) 2014 Andrew Turner
+ * Copyright (c) 2014-2015 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * Portions of this software were developed by Andrew Turner
+ * under sponsorship from the FreeBSD Foundation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+
+
+#ifdef __aarch64__
+
+#define ENTRY(sym) \
+ .text; \
+ .globl sym; \
+ .align 2; \
+ .type sym,#function; \
+sym:
+
+#define END(sym) \
+ .size sym, . - sym
+
+
+ENTRY(setjmp)
+ /* Store the stack pointer */
+ mov x8, sp
+ str x8, [x0], #8
+
+ /* Store the general purpose registers and lr */
+ stp x19, x20, [x0], #16
+ stp x21, x22, [x0], #16
+ stp x23, x24, [x0], #16
+ stp x25, x26, [x0], #16
+ stp x27, x28, [x0], #16
+ stp x29, x30, [x0], #16
+
+ /* Return value */
+ mov x0, #0
+ ret
+END(setjmp)
+
+ENTRY(longjmp)
+ /* Restore the stack pointer */
+ ldr x8, [x0], #8
+ mov sp, x8
+
+ /* Restore the general purpose registers and lr */
+ ldp x19, x20, [x0], #16
+ ldp x21, x22, [x0], #16
+ ldp x23, x24, [x0], #16
+ ldp x25, x26, [x0], #16
+ ldp x27, x28, [x0], #16
+ ldp x29, x30, [x0], #16
+
+ /* Load the return value */
+ mov x0, x1
+ ret
+END(longjmp)
+
+#ifdef __ELF__
+.section .note.GNU-stack,"",%progbits
+#endif
+
+#endif /* __aarch64__ */
diff --git a/sys/contrib/openzfs/module/lua/setjmp/setjmp_arm.S b/sys/contrib/openzfs/module/lua/setjmp/setjmp_arm.S
new file mode 100644
index 000000000000..78bc3e0b347d
--- /dev/null
+++ b/sys/contrib/openzfs/module/lua/setjmp/setjmp_arm.S
@@ -0,0 +1,84 @@
+/*-
+ * Copyright 2004-2014 Olivier Houchard <cognet@FreeBSD.org>
+ * Copyright 2012-2014 Ian Lepore <ian@FreeBSD.org>
+ * Copyright 2013-2014 Andrew Turner <andrew@FreeBSD.org>
+ * Copyright 2014 Svatopluk Kraus <onwahe@gmail.com>
+ * Copyright 2014 Michal Meloun <meloun@miracle.cz>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+
+#if defined(__arm__) && !defined(__aarch64__)
+
+#if defined(__thumb2__)
+#define _FUNC_MODE .code 16; .thumb_func
+#else
+#define _FUNC_MODE .code 32
+#endif
+
+#define ENTRY(x) \
+ .text; \
+ .syntax unified; \
+ .align 2; \
+ .global x; \
+ .type x,#function; \
+ _FUNC_MODE; \
+x:
+
+#define END(x) \
+ .size x, . - x;
+
+#define RET bx lr
+
+
+/*
+ * setjmp + longjmp
+ */
+ENTRY(setjmp)
+#if defined(__thumb2__)
+ mov ip, sp
+ stmia r0, {r4-r12,r14}
+#else
+ stmia r0, {r4-r14}
+#endif
+ mov r0, #0x00000000
+ RET
+END(setjmp)
+
+ENTRY(longjmp)
+#if defined(__thumb2__)
+ ldmia r0, {r4-r12,r14}
+ mov sp, ip
+#else
+ ldmia r0, {r4-r14}
+#endif
+ mov r0, #0x00000001
+ RET
+END(longjmp)
+
+#ifdef __ELF__
+.section .note.GNU-stack,"",%progbits
+#endif
+
+#endif
diff --git a/sys/contrib/openzfs/module/lua/setjmp/setjmp_i386.S b/sys/contrib/openzfs/module/lua/setjmp/setjmp_i386.S
new file mode 100644
index 000000000000..6d6a5f332688
--- /dev/null
+++ b/sys/contrib/openzfs/module/lua/setjmp/setjmp_i386.S
@@ -0,0 +1,69 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#define ENTRY(x) \
+ .text; \
+ .align 8; \
+ .globl x; \
+ .type x, @function; \
+x:
+
+#define SET_SIZE(x) \
+ .size x, [.-x]
+
+/*
+ * Setjmp and longjmp implement non-local gotos using state vectors
+ * type label_t.
+ */
+#ifdef __i386__
+
+ ENTRY(setjmp) /* save area is passed in eax */
+ movl %ebp, 0(%eax) /* save ebp */
+ movl %ebx, 4(%eax) /* save ebx */
+ movl %esi, 8(%eax) /* save esi */
+ movl %edi, 12(%eax) /* save edi */
+ movl %esp, 16(%eax) /* save esp */
+ movl (%esp), %ecx /* %eip (return address) */
+ movl %ecx, 20(%eax) /* save eip */
+ subl %eax, %eax /* return 0 */
+ ret
+ SET_SIZE(setjmp)
+
+ ENTRY(longjmp) /* save area is passed in eax */
+ movl 0(%eax), %ebp /* restore ebp */
+ movl 4(%eax), %ebx /* restore ebx */
+ movl 8(%eax), %esi /* restore esi */
+ movl 12(%eax), %edi /* restore edi */
+ movl 16(%eax), %esp /* restore esp */
+ movl 20(%eax), %ecx /* %eip (return address) */
+ addl $4, %esp /* pop ret adr */
+ jmp *%ecx /* indirect jump */
+ SET_SIZE(longjmp)
+
+#ifdef __ELF__
+.section .note.GNU-stack,"",%progbits
+#endif
+
+#endif /* __i386__ */
diff --git a/sys/contrib/openzfs/module/lua/setjmp/setjmp_mips.S b/sys/contrib/openzfs/module/lua/setjmp/setjmp_mips.S
new file mode 100644
index 000000000000..0084fbfa4bec
--- /dev/null
+++ b/sys/contrib/openzfs/module/lua/setjmp/setjmp_mips.S
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2000, 2001, 2002, 2003, 2004, 2005, 2008, 2009
+ * The President and Fellows of Harvard College.
+ * Copyright (c) 2017 MIPS Technologies, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <asm/asm.h>
+#include <asm/regdef.h>
+
+/*
+ * setjmp and longjmp for MIPS.
+ */
+
+ .text
+ .set noreorder
+
+ /*
+ * int setjmp(jmp_buf jb);
+ *
+ * Save the current state so we can return again from the call later
+ * if/when longjmp is called. (If the function that called setjmp
+ * returns before longjmp is called, the results are undefined. We
+ * only need to save registers, not the whole contents of the stack.)
+ */
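+ /*
+ * Illustrative usage (hypothetical caller, not part of this file): the
+ * caller conceptually does
+ *   if (setjmp(jb) == 0) { run_protected_code(); } else { handle_error(); }
+ * and a later longjmp(jb, code) resumes at the setjmp return with 'code'
+ * (forced to 1 if 0 was passed).
+ */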
+LEAF(setjmp)
+ /*
+ * jmp_buf is in a0. We need to save s0-s8, sp, gp, and ra in it.
+ * Don't store more registers without adjusting machine/setjmp.h.
+ */
+
+ REG_S sp, 0(a0) /* save registers */
+ REG_S ra, 1*SZREG(a0)
+ REG_S gp, 2*SZREG(a0)
+ REG_S s0, 3*SZREG(a0)
+ REG_S s1, 4*SZREG(a0)
+ REG_S s2, 5*SZREG(a0)
+ REG_S s3, 6*SZREG(a0)
+ REG_S s4, 7*SZREG(a0)
+ REG_S s5, 8*SZREG(a0)
+ REG_S s6, 9*SZREG(a0)
+ REG_S s7, 10*SZREG(a0)
+ REG_S s8, 11*SZREG(a0)
+
+ jr ra /* done */
+ move v0, zero /* return 0 (in delay slot) */
+END(setjmp)
+
+
+ /*
+ * void longjmp(jmp_buf jb, int code);
+ */
+LEAF(longjmp)
+ /*
+ * jmp_buf is in a0. Return code is in a1.
+ * We need to restore s0-s8, sp, gp, and ra from the jmp_buf.
+ * The return code is forced to 1 if 0 is passed in.
+ */
+
+ sltiu t0, a1, 1 /* set t0 to 1 if return code is 0... otherwise 0 */
+ addu a1, a1, t0 /* update the return code */
+
+ REG_L sp, 0(a0) /* restore registers */
+ REG_L ra, 1*SZREG(a0)
+ REG_L gp, 2*SZREG(a0)
+ REG_L s0, 3*SZREG(a0)
+ REG_L s1, 4*SZREG(a0)
+ REG_L s2, 5*SZREG(a0)
+ REG_L s3, 6*SZREG(a0)
+ REG_L s4, 7*SZREG(a0)
+ REG_L s5, 8*SZREG(a0)
+ REG_L s6, 9*SZREG(a0)
+ REG_L s7, 10*SZREG(a0)
+ REG_L s8, 11*SZREG(a0)
+
+ jr ra /* return, to where setjmp was called from */
+ move v0, a1 /* set return value */
+END(longjmp)
+
+#ifdef __ELF__
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/sys/contrib/openzfs/module/lua/setjmp/setjmp_ppc.S b/sys/contrib/openzfs/module/lua/setjmp/setjmp_ppc.S
new file mode 100644
index 000000000000..72aa5d5ab5b0
--- /dev/null
+++ b/sys/contrib/openzfs/module/lua/setjmp/setjmp_ppc.S
@@ -0,0 +1,165 @@
+/* $FreeBSD$ */
+/* from: NetBSD: setjmp.S,v 1.1 1998/01/27 15:13:12 sakamoto Exp $ */
+/* from: OpenBSD: setjmp.S,v 1.2 1996/12/28 06:22:18 rahnds Exp */
+/* kernel version of this file, does not have signal goop */
+/* int setjmp(jmp_buf env) */
+
+#define _ASM
+#include <asm/types.h>
+
+#ifdef __powerpc64__
+#if !defined(PPC64_ELF_ABI_v2) && !defined(PPC64_ELF_ABI_v1)
+#if defined(_CALL_ELF) && _CALL_ELF == 2
+#define PPC64_ELF_ABI_v2
+#endif /* _CALL_ELF */
+#endif /* PPC64_ELF_ABI_ */
+#endif /* __powerpc64__ */
+
+#ifdef __powerpc64__
+#define LD_REG ld
+#define ST_REG std
+#define REGWIDTH 8
+#else
+#define LD_REG lwz
+#define ST_REG stw
+#define REGWIDTH 4
+#endif /* __powerpc64__ */
+
+#define JMP_r1 1*REGWIDTH
+#define JMP_r2 2*REGWIDTH
+#define JMP_r14 3*REGWIDTH
+#define JMP_r15 4*REGWIDTH
+#define JMP_r16 5*REGWIDTH
+#define JMP_r17 6*REGWIDTH
+#define JMP_r18 7*REGWIDTH
+#define JMP_r19 8*REGWIDTH
+#define JMP_r20 9*REGWIDTH
+#define JMP_r21 10*REGWIDTH
+#define JMP_r22 11*REGWIDTH
+#define JMP_r23 12*REGWIDTH
+#define JMP_r24 13*REGWIDTH
+#define JMP_r25 14*REGWIDTH
+#define JMP_r26 15*REGWIDTH
+#define JMP_r27 16*REGWIDTH
+#define JMP_r28 17*REGWIDTH
+#define JMP_r29 18*REGWIDTH
+#define JMP_r30 19*REGWIDTH
+#define JMP_r31 20*REGWIDTH
+#define JMP_lr 21*REGWIDTH
+#define JMP_cr 22*REGWIDTH
+#define JMP_ctr 23*REGWIDTH
+#define JMP_xer 24*REGWIDTH
+
+#ifdef __powerpc64__
+#ifdef PPC64_ELF_ABI_v2
+
+#define ENTRY(name) \
+ .align 2 ; \
+ .type name,@function; \
+ .weak name; \
+name:
+
+#else /* PPC64_ELF_ABI_v1 */
+
+#define XGLUE(a,b) a##b
+#define GLUE(a,b) XGLUE(a,b)
+#define ENTRY(name) \
+ .align 2 ; \
+ .weak name; \
+ .weak GLUE(.,name); \
+ .pushsection ".opd","aw"; \
+name: \
+ .quad GLUE(.,name); \
+ .quad .TOC.@tocbase; \
+ .quad 0; \
+ .popsection; \
+ .type GLUE(.,name),@function; \
+GLUE(.,name):
+
+#endif /* PPC64_ELF_ABI_v2 */
+
+#else /* 32-bit */
+
+#define ENTRY(name) \
+ .text; \
+ .p2align 4; \
+ .weak name; \
+ .type name,@function; \
+name:
+
+#endif /* __powerpc64__ */
+
+
+ENTRY(setjmp)
+ ST_REG 31, JMP_r31(3)
+ /* r1, r2, r14-r30 */
+ ST_REG 1, JMP_r1 (3)
+ ST_REG 2, JMP_r2 (3)
+ ST_REG 14, JMP_r14(3)
+ ST_REG 15, JMP_r15(3)
+ ST_REG 16, JMP_r16(3)
+ ST_REG 17, JMP_r17(3)
+ ST_REG 18, JMP_r18(3)
+ ST_REG 19, JMP_r19(3)
+ ST_REG 20, JMP_r20(3)
+ ST_REG 21, JMP_r21(3)
+ ST_REG 22, JMP_r22(3)
+ ST_REG 23, JMP_r23(3)
+ ST_REG 24, JMP_r24(3)
+ ST_REG 25, JMP_r25(3)
+ ST_REG 26, JMP_r26(3)
+ ST_REG 27, JMP_r27(3)
+ ST_REG 28, JMP_r28(3)
+ ST_REG 29, JMP_r29(3)
+ ST_REG 30, JMP_r30(3)
+ /* cr, lr, ctr, xer */
+ mfcr 0
+ ST_REG 0, JMP_cr(3)
+ mflr 0
+ ST_REG 0, JMP_lr(3)
+ mfctr 0
+ ST_REG 0, JMP_ctr(3)
+ mfxer 0
+ ST_REG 0, JMP_xer(3)
+ /* f14-f31, fpscr */
+ li 3, 0
+ blr
+
+ENTRY(longjmp)
+ LD_REG 31, JMP_r31(3)
+ /* r1, r2, r14-r30 */
+ LD_REG 1, JMP_r1 (3)
+ LD_REG 2, JMP_r2 (3)
+ LD_REG 14, JMP_r14(3)
+ LD_REG 15, JMP_r15(3)
+ LD_REG 16, JMP_r16(3)
+ LD_REG 17, JMP_r17(3)
+ LD_REG 18, JMP_r18(3)
+ LD_REG 19, JMP_r19(3)
+ LD_REG 20, JMP_r20(3)
+ LD_REG 21, JMP_r21(3)
+ LD_REG 22, JMP_r22(3)
+ LD_REG 23, JMP_r23(3)
+ LD_REG 24, JMP_r24(3)
+ LD_REG 25, JMP_r25(3)
+ LD_REG 26, JMP_r26(3)
+ LD_REG 27, JMP_r27(3)
+ LD_REG 28, JMP_r28(3)
+ LD_REG 29, JMP_r29(3)
+ LD_REG 30, JMP_r30(3)
+ /* cr, lr, ctr, xer */
+ LD_REG 0, JMP_cr(3)
+ mtcr 0
+ LD_REG 0, JMP_lr(3)
+ mtlr 0
+ LD_REG 0, JMP_ctr(3)
+ mtctr 0
+ LD_REG 0, JMP_xer(3)
+ mtxer 0
+ /* f14-f31, fpscr */
+ mr 3, 4
+ blr
+
+#ifdef __ELF__
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/sys/contrib/openzfs/module/lua/setjmp/setjmp_rv64g.S b/sys/contrib/openzfs/module/lua/setjmp/setjmp_rv64g.S
new file mode 100644
index 000000000000..7f6c50d25a4c
--- /dev/null
+++ b/sys/contrib/openzfs/module/lua/setjmp/setjmp_rv64g.S
@@ -0,0 +1,91 @@
+/*-
+ * Copyright (c) 2015-2016 Ruslan Bukin <br@bsdpad.com>
+ * All rights reserved.
+ *
+ * Portions of this software were developed by SRI International and the
+ * University of Cambridge Computer Laboratory under DARPA/AFRL contract
+ * FA8750-10-C-0237 ("CTSRD"), as part of the DARPA CRASH research programme.
+ *
+ * Portions of this software were developed by the University of Cambridge
+ * Computer Laboratory as part of the CTSRD Project, with support from the
+ * UK Higher Education Innovation Fund (HEIF).
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#define ENTRY(sym) \
+ .text; .globl sym; .type sym,@function; sym:
+#define END(sym) .size sym, . - sym
+
+
+ENTRY(setjmp)
+ /* Store the stack pointer */
+ sd sp, (0 * 8)(a0)
+ addi a0, a0, (1 * 8)
+
+ /* Store the general purpose registers and ra */
+ sd s0, (0 * 8)(a0)
+ sd s1, (1 * 8)(a0)
+ sd s2, (2 * 8)(a0)
+ sd s3, (3 * 8)(a0)
+ sd s4, (4 * 8)(a0)
+ sd s5, (5 * 8)(a0)
+ sd s6, (6 * 8)(a0)
+ sd s7, (7 * 8)(a0)
+ sd s8, (8 * 8)(a0)
+ sd s9, (9 * 8)(a0)
+ sd s10, (10 * 8)(a0)
+ sd s11, (11 * 8)(a0)
+ sd ra, (12 * 8)(a0)
+ addi a0, a0, (13 * 8)
+
+ /* Return value */
+ li a0, 0
+ ret
+END(setjmp)
+
+ENTRY(longjmp)
+ /* Restore the stack pointer */
+ ld t0, 0(a0)
+ mv sp, t0
+ addi a0, a0, (1 * 8)
+
+ /* Restore the general purpose registers and ra */
+ ld s0, (0 * 8)(a0)
+ ld s1, (1 * 8)(a0)
+ ld s2, (2 * 8)(a0)
+ ld s3, (3 * 8)(a0)
+ ld s4, (4 * 8)(a0)
+ ld s5, (5 * 8)(a0)
+ ld s6, (6 * 8)(a0)
+ ld s7, (7 * 8)(a0)
+ ld s8, (8 * 8)(a0)
+ ld s9, (9 * 8)(a0)
+ ld s10, (10 * 8)(a0)
+ ld s11, (11 * 8)(a0)
+ ld ra, (12 * 8)(a0)
+ addi a0, a0, (13 * 8)
+
+ /* Load the return value */
+ mv a0, a1
+ ret
+END(longjmp)
diff --git a/sys/contrib/openzfs/module/lua/setjmp/setjmp_s390x.S b/sys/contrib/openzfs/module/lua/setjmp/setjmp_s390x.S
new file mode 100644
index 000000000000..336c66c08b51
--- /dev/null
+++ b/sys/contrib/openzfs/module/lua/setjmp/setjmp_s390x.S
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2005-2014 Rich Felker, et al.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+ .global setjmp
+ .type setjmp,@function
+setjmp:
+ stmg %r6, %r15, 0(%r2)
+
+ std %f8, 10*8(%r2)
+ std %f9, 11*8(%r2)
+ std %f10, 12*8(%r2)
+ std %f11, 13*8(%r2)
+ std %f12, 14*8(%r2)
+ std %f13, 15*8(%r2)
+ std %f14, 16*8(%r2)
+ std %f15, 17*8(%r2)
+
+ lghi %r2, 0
+ br %r14
+
+ .global longjmp
+ .type longjmp,@function
+longjmp:
+
+1:
+ lmg %r6, %r15, 0(%r2)
+
+ ld %f8, 10*8(%r2)
+ ld %f9, 11*8(%r2)
+ ld %f10, 12*8(%r2)
+ ld %f11, 13*8(%r2)
+ ld %f12, 14*8(%r2)
+ ld %f13, 15*8(%r2)
+ ld %f14, 16*8(%r2)
+ ld %f15, 17*8(%r2)
+
+ ltgr %r2, %r3
+ bnzr %r14
+ lhi %r2, 1
+ br %r14
+
+#ifdef __ELF__
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/sys/contrib/openzfs/module/lua/setjmp/setjmp_sparc64.S b/sys/contrib/openzfs/module/lua/setjmp/setjmp_sparc64.S
new file mode 100644
index 000000000000..a37a71cbce33
--- /dev/null
+++ b/sys/contrib/openzfs/module/lua/setjmp/setjmp_sparc64.S
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 1992, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This software was developed by the Computer Systems Engineering group
+ * at Lawrence Berkeley Laboratory under DARPA contract BG 91-66 and
+ * contributed to Berkeley.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Header: _setjmp.s,v 1.1 91/07/06 16:45:53 torek Exp
+ */
+
+#if defined(LIBC_SCCS) && !defined(lint)
+#if 0
+ .asciz "@(#)_setjmp.s 8.1 (Berkeley) 6/4/93"
+#else
+ RCSID("$NetBSD: _setjmp.S,v 1.4 1998/10/08 02:27:59 eeh Exp $")
+#endif
+#endif /* LIBC_SCCS and not lint */
+
+#define _JB_FP 0x0
+#define _JB_PC 0x8
+#define _JB_SP 0x10
+
+ .register %g2,#ignore
+ .register %g3,#ignore
+
+#define ENTRY(x) \
+ .text ; \
+ .align 32 ; \
+ .globl x ; \
+ .type x,@function ; \
+x:
+
+#define END(x) \
+ .size x, . - x
+
+/*
+ * C library -- setjmp, longjmp
+ *
+ * longjmp(a,v)
+ * will generate a "return(v?v:1)" from
+ * the last call to
+ * setjmp(a)
+ * by restoring the previous context.
+ */
+
+ENTRY(setjmp)
+ stx %sp, [%o0 + _JB_SP]
+ stx %o7, [%o0 + _JB_PC]
+ stx %fp, [%o0 + _JB_FP]
+ retl
+ clr %o0
+END(setjmp)
+
+ENTRY(longjmp)
+ mov 1, %g1
+ movrnz %o1, %o1, %g1
+ mov %o0, %g2
+ ldx [%g2 + _JB_FP], %g3
+1: cmp %fp, %g3
+ bl,a 1b
+ restore
+ be,a 2f
+ ldx [%g2 + _JB_SP], %o0
+
+.Lbotch:
+ illtrap
+
+2: cmp %o0, %sp
+ bge,a 3f
+ mov %o0, %sp
+ b,a .Lbotch
+ nop
+3: ldx [%g2 + _JB_PC], %o7
+ retl
+ mov %g1, %o0
+END(longjmp)
+
+#ifdef __ELF__
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/sys/contrib/openzfs/module/lua/setjmp/setjmp_x86_64.S b/sys/contrib/openzfs/module/lua/setjmp/setjmp_x86_64.S
new file mode 100644
index 000000000000..a469cbad780e
--- /dev/null
+++ b/sys/contrib/openzfs/module/lua/setjmp/setjmp_x86_64.S
@@ -0,0 +1,77 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+
+#define ENTRY(x) \
+ .text; \
+ .align 8; \
+ .globl x; \
+ .type x, @function; \
+x:
+
+#define SET_SIZE(x) \
+ .size x, [.-x]
+
+
+/*
+ * Setjmp and longjmp implement non-local gotos using state vectors
+ * of type label_t.
+ */
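+
+/*
+ * Illustrative sketch of the C-level contract (prototypes assumed from
+ * the conventional kernel setjmp interface, not taken from this file):
+ *
+ *	extern int setjmp(label_t *);
+ *	extern void longjmp(label_t *);
+ *
+ *	label_t env;
+ *	if (setjmp(&env) == 0) {
+ *		... normal path, which may later call longjmp(&env) ...
+ *	} else {
+ *		... control resumes here, setjmp() returning nonzero ...
+ *	}
+ */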
+#ifdef __x86_64__
+
+ ENTRY(setjmp)
+ movq %rsp, 0(%rdi)
+ movq %rbp, 8(%rdi)
+ movq %rbx, 16(%rdi)
+ movq %r12, 24(%rdi)
+ movq %r13, 32(%rdi)
+ movq %r14, 40(%rdi)
+ movq %r15, 48(%rdi)
+ movq 0(%rsp), %rdx /* return address */
+ movq %rdx, 56(%rdi) /* rip */
+ xorl %eax, %eax /* return 0 */
+ ret
+ SET_SIZE(setjmp)
+
+ ENTRY(longjmp)
+ movq 0(%rdi), %rsp
+ movq 8(%rdi), %rbp
+ movq 16(%rdi), %rbx
+ movq 24(%rdi), %r12
+ movq 32(%rdi), %r13
+ movq 40(%rdi), %r14
+ movq 48(%rdi), %r15
+ movq 56(%rdi), %rdx /* return address */
+ movq %rdx, 0(%rsp)
+ xorl %eax, %eax
+ incl %eax /* return 1 */
+ ret
+ SET_SIZE(longjmp)
+
+#ifdef __ELF__
+.section .note.GNU-stack,"",%progbits
+#endif
+
+#endif /* __x86_64__ */
diff --git a/sys/contrib/openzfs/module/nvpair/Makefile.in b/sys/contrib/openzfs/module/nvpair/Makefile.in
new file mode 100644
index 000000000000..d8145236674b
--- /dev/null
+++ b/sys/contrib/openzfs/module/nvpair/Makefile.in
@@ -0,0 +1,13 @@
+ifneq ($(KBUILD_EXTMOD),)
+src = @abs_srcdir@
+obj = @abs_builddir@
+endif
+
+MODULE := znvpair
+
+obj-$(CONFIG_ZFS) := $(MODULE).o
+
+$(MODULE)-objs += nvpair.o
+$(MODULE)-objs += fnvpair.o
+$(MODULE)-objs += nvpair_alloc_spl.o
+$(MODULE)-objs += nvpair_alloc_fixed.o
diff --git a/sys/contrib/openzfs/module/nvpair/fnvpair.c b/sys/contrib/openzfs/module/nvpair/fnvpair.c
new file mode 100644
index 000000000000..dc8257e48594
--- /dev/null
+++ b/sys/contrib/openzfs/module/nvpair/fnvpair.c
@@ -0,0 +1,660 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
+ */
+
+#include <sys/nvpair.h>
+#include <sys/kmem.h>
+#include <sys/debug.h>
+#include <sys/param.h>
+#ifndef _KERNEL
+#include <stdlib.h>
+#endif
+
+/*
+ * "Force" nvlist wrapper.
+ *
+ * These functions wrap the nvlist_* functions with assertions that assume
+ * the operation is successful. This allows the caller's code to be much
+ * more readable, especially for the fnvlist_lookup_* and fnvpair_value_*
+ * functions, which can return the requested value (rather than filling in
+ * a pointer).
+ *
+ * These functions use NV_UNIQUE_NAME, encoding NV_ENCODE_NATIVE, and allocate
+ * with KM_SLEEP.
+ *
+ * More wrappers should be added as needed -- for example
+ * nvlist_lookup_*_array and nvpair_value_*_array.
+ */
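+
+/*
+ * A minimal usage sketch (the key name "refcount" is arbitrary and only
+ * for illustration); the caller checks no return values because each
+ * wrapper VERIFYs success internally:
+ *
+ *	nvlist_t *nvl = fnvlist_alloc();
+ *	fnvlist_add_uint64(nvl, "refcount", 1);
+ *	uint64_t v = fnvlist_lookup_uint64(nvl, "refcount");
+ *	fnvlist_free(nvl);
+ */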
+
+nvlist_t *
+fnvlist_alloc(void)
+{
+ nvlist_t *nvl;
+ VERIFY0(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP));
+ return (nvl);
+}
+
+void
+fnvlist_free(nvlist_t *nvl)
+{
+ nvlist_free(nvl);
+}
+
+size_t
+fnvlist_size(nvlist_t *nvl)
+{
+ size_t size;
+ VERIFY0(nvlist_size(nvl, &size, NV_ENCODE_NATIVE));
+ return (size);
+}
+
+/*
+ * Returns allocated buffer of size *sizep. Caller must free the buffer with
+ * fnvlist_pack_free().
+ */
+char *
+fnvlist_pack(nvlist_t *nvl, size_t *sizep)
+{
+ char *packed = 0;
+ VERIFY3U(nvlist_pack(nvl, &packed, sizep, NV_ENCODE_NATIVE,
+ KM_SLEEP), ==, 0);
+ return (packed);
+}
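+
+/*
+ * Usage sketch of the pack/free pairing described above ("nvl" stands
+ * for any caller-owned nvlist; the transport step is elided):
+ *
+ *	size_t sz;
+ *	char *buf = fnvlist_pack(nvl, &sz);
+ *	... copy or transmit buf[0 .. sz - 1] ...
+ *	fnvlist_pack_free(buf, sz);
+ */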
+
+/*ARGSUSED*/
+void
+fnvlist_pack_free(char *pack, size_t size)
+{
+#ifdef _KERNEL
+ kmem_free(pack, size);
+#else
+ free(pack);
+#endif
+}
+
+nvlist_t *
+fnvlist_unpack(char *buf, size_t buflen)
+{
+ nvlist_t *rv;
+ VERIFY0(nvlist_unpack(buf, buflen, &rv, KM_SLEEP));
+ return (rv);
+}
+
+nvlist_t *
+fnvlist_dup(nvlist_t *nvl)
+{
+ nvlist_t *rv;
+ VERIFY0(nvlist_dup(nvl, &rv, KM_SLEEP));
+ return (rv);
+}
+
+void
+fnvlist_merge(nvlist_t *dst, nvlist_t *src)
+{
+ VERIFY0(nvlist_merge(dst, src, KM_SLEEP));
+}
+
+size_t
+fnvlist_num_pairs(nvlist_t *nvl)
+{
+ size_t count = 0;
+ nvpair_t *pair;
+
+ for (pair = nvlist_next_nvpair(nvl, 0); pair != NULL;
+ pair = nvlist_next_nvpair(nvl, pair))
+ count++;
+ return (count);
+}
+
+void
+fnvlist_add_boolean(nvlist_t *nvl, const char *name)
+{
+ VERIFY0(nvlist_add_boolean(nvl, name));
+}
+
+void
+fnvlist_add_boolean_value(nvlist_t *nvl, const char *name, boolean_t val)
+{
+ VERIFY0(nvlist_add_boolean_value(nvl, name, val));
+}
+
+void
+fnvlist_add_byte(nvlist_t *nvl, const char *name, uchar_t val)
+{
+ VERIFY0(nvlist_add_byte(nvl, name, val));
+}
+
+void
+fnvlist_add_int8(nvlist_t *nvl, const char *name, int8_t val)
+{
+ VERIFY0(nvlist_add_int8(nvl, name, val));
+}
+
+void
+fnvlist_add_uint8(nvlist_t *nvl, const char *name, uint8_t val)
+{
+ VERIFY0(nvlist_add_uint8(nvl, name, val));
+}
+
+void
+fnvlist_add_int16(nvlist_t *nvl, const char *name, int16_t val)
+{
+ VERIFY0(nvlist_add_int16(nvl, name, val));
+}
+
+void
+fnvlist_add_uint16(nvlist_t *nvl, const char *name, uint16_t val)
+{
+ VERIFY0(nvlist_add_uint16(nvl, name, val));
+}
+
+void
+fnvlist_add_int32(nvlist_t *nvl, const char *name, int32_t val)
+{
+ VERIFY0(nvlist_add_int32(nvl, name, val));
+}
+
+void
+fnvlist_add_uint32(nvlist_t *nvl, const char *name, uint32_t val)
+{
+ VERIFY0(nvlist_add_uint32(nvl, name, val));
+}
+
+void
+fnvlist_add_int64(nvlist_t *nvl, const char *name, int64_t val)
+{
+ VERIFY0(nvlist_add_int64(nvl, name, val));
+}
+
+void
+fnvlist_add_uint64(nvlist_t *nvl, const char *name, uint64_t val)
+{
+ VERIFY0(nvlist_add_uint64(nvl, name, val));
+}
+
+void
+fnvlist_add_string(nvlist_t *nvl, const char *name, const char *val)
+{
+ VERIFY0(nvlist_add_string(nvl, name, val));
+}
+
+void
+fnvlist_add_nvlist(nvlist_t *nvl, const char *name, nvlist_t *val)
+{
+ VERIFY0(nvlist_add_nvlist(nvl, name, val));
+}
+
+void
+fnvlist_add_nvpair(nvlist_t *nvl, nvpair_t *pair)
+{
+ VERIFY0(nvlist_add_nvpair(nvl, pair));
+}
+
+void
+fnvlist_add_boolean_array(nvlist_t *nvl, const char *name,
+ boolean_t *val, uint_t n)
+{
+ VERIFY0(nvlist_add_boolean_array(nvl, name, val, n));
+}
+
+void
+fnvlist_add_byte_array(nvlist_t *nvl, const char *name, uchar_t *val, uint_t n)
+{
+ VERIFY0(nvlist_add_byte_array(nvl, name, val, n));
+}
+
+void
+fnvlist_add_int8_array(nvlist_t *nvl, const char *name, int8_t *val, uint_t n)
+{
+ VERIFY0(nvlist_add_int8_array(nvl, name, val, n));
+}
+
+void
+fnvlist_add_uint8_array(nvlist_t *nvl, const char *name, uint8_t *val, uint_t n)
+{
+ VERIFY0(nvlist_add_uint8_array(nvl, name, val, n));
+}
+
+void
+fnvlist_add_int16_array(nvlist_t *nvl, const char *name, int16_t *val, uint_t n)
+{
+ VERIFY0(nvlist_add_int16_array(nvl, name, val, n));
+}
+
+void
+fnvlist_add_uint16_array(nvlist_t *nvl, const char *name,
+ uint16_t *val, uint_t n)
+{
+ VERIFY0(nvlist_add_uint16_array(nvl, name, val, n));
+}
+
+void
+fnvlist_add_int32_array(nvlist_t *nvl, const char *name, int32_t *val, uint_t n)
+{
+ VERIFY0(nvlist_add_int32_array(nvl, name, val, n));
+}
+
+void
+fnvlist_add_uint32_array(nvlist_t *nvl, const char *name,
+ uint32_t *val, uint_t n)
+{
+ VERIFY0(nvlist_add_uint32_array(nvl, name, val, n));
+}
+
+void
+fnvlist_add_int64_array(nvlist_t *nvl, const char *name, int64_t *val, uint_t n)
+{
+ VERIFY0(nvlist_add_int64_array(nvl, name, val, n));
+}
+
+void
+fnvlist_add_uint64_array(nvlist_t *nvl, const char *name,
+ uint64_t *val, uint_t n)
+{
+ VERIFY0(nvlist_add_uint64_array(nvl, name, val, n));
+}
+
+void
+fnvlist_add_string_array(nvlist_t *nvl, const char *name,
+ char * const *val, uint_t n)
+{
+ VERIFY0(nvlist_add_string_array(nvl, name, val, n));
+}
+
+void
+fnvlist_add_nvlist_array(nvlist_t *nvl, const char *name,
+ nvlist_t **val, uint_t n)
+{
+ VERIFY0(nvlist_add_nvlist_array(nvl, name, val, n));
+}
+
+void
+fnvlist_remove(nvlist_t *nvl, const char *name)
+{
+ VERIFY0(nvlist_remove_all(nvl, name));
+}
+
+void
+fnvlist_remove_nvpair(nvlist_t *nvl, nvpair_t *pair)
+{
+ VERIFY0(nvlist_remove_nvpair(nvl, pair));
+}
+
+nvpair_t *
+fnvlist_lookup_nvpair(nvlist_t *nvl, const char *name)
+{
+ nvpair_t *rv;
+ VERIFY0(nvlist_lookup_nvpair(nvl, name, &rv));
+ return (rv);
+}
+
+/* returns B_TRUE if the entry exists */
+boolean_t
+fnvlist_lookup_boolean(nvlist_t *nvl, const char *name)
+{
+ return (nvlist_lookup_boolean(nvl, name) == 0);
+}
+
+boolean_t
+fnvlist_lookup_boolean_value(nvlist_t *nvl, const char *name)
+{
+ boolean_t rv;
+ VERIFY0(nvlist_lookup_boolean_value(nvl, name, &rv));
+ return (rv);
+}
+
+uchar_t
+fnvlist_lookup_byte(nvlist_t *nvl, const char *name)
+{
+ uchar_t rv;
+ VERIFY0(nvlist_lookup_byte(nvl, name, &rv));
+ return (rv);
+}
+
+int8_t
+fnvlist_lookup_int8(nvlist_t *nvl, const char *name)
+{
+ int8_t rv;
+ VERIFY0(nvlist_lookup_int8(nvl, name, &rv));
+ return (rv);
+}
+
+int16_t
+fnvlist_lookup_int16(nvlist_t *nvl, const char *name)
+{
+ int16_t rv;
+ VERIFY0(nvlist_lookup_int16(nvl, name, &rv));
+ return (rv);
+}
+
+int32_t
+fnvlist_lookup_int32(nvlist_t *nvl, const char *name)
+{
+ int32_t rv;
+ VERIFY0(nvlist_lookup_int32(nvl, name, &rv));
+ return (rv);
+}
+
+int64_t
+fnvlist_lookup_int64(nvlist_t *nvl, const char *name)
+{
+ int64_t rv;
+ VERIFY0(nvlist_lookup_int64(nvl, name, &rv));
+ return (rv);
+}
+
+uint8_t
+fnvlist_lookup_uint8(nvlist_t *nvl, const char *name)
+{
+ uint8_t rv;
+ VERIFY0(nvlist_lookup_uint8(nvl, name, &rv));
+ return (rv);
+}
+
+uint16_t
+fnvlist_lookup_uint16(nvlist_t *nvl, const char *name)
+{
+ uint16_t rv;
+ VERIFY0(nvlist_lookup_uint16(nvl, name, &rv));
+ return (rv);
+}
+
+uint32_t
+fnvlist_lookup_uint32(nvlist_t *nvl, const char *name)
+{
+ uint32_t rv;
+ VERIFY0(nvlist_lookup_uint32(nvl, name, &rv));
+ return (rv);
+}
+
+uint64_t
+fnvlist_lookup_uint64(nvlist_t *nvl, const char *name)
+{
+ uint64_t rv;
+ VERIFY0(nvlist_lookup_uint64(nvl, name, &rv));
+ return (rv);
+}
+
+char *
+fnvlist_lookup_string(nvlist_t *nvl, const char *name)
+{
+ char *rv;
+ VERIFY0(nvlist_lookup_string(nvl, name, &rv));
+ return (rv);
+}
+
+nvlist_t *
+fnvlist_lookup_nvlist(nvlist_t *nvl, const char *name)
+{
+ nvlist_t *rv;
+ VERIFY0(nvlist_lookup_nvlist(nvl, name, &rv));
+ return (rv);
+}
+
+boolean_t *
+fnvlist_lookup_boolean_array(nvlist_t *nvl, const char *name, uint_t *n)
+{
+ boolean_t *rv;
+ VERIFY0(nvlist_lookup_boolean_array(nvl, name, &rv, n));
+ return (rv);
+}
+
+uchar_t *
+fnvlist_lookup_byte_array(nvlist_t *nvl, const char *name, uint_t *n)
+{
+ uchar_t *rv;
+ VERIFY0(nvlist_lookup_byte_array(nvl, name, &rv, n));
+ return (rv);
+}
+
+int8_t *
+fnvlist_lookup_int8_array(nvlist_t *nvl, const char *name, uint_t *n)
+{
+ int8_t *rv;
+ VERIFY0(nvlist_lookup_int8_array(nvl, name, &rv, n));
+ return (rv);
+}
+
+uint8_t *
+fnvlist_lookup_uint8_array(nvlist_t *nvl, const char *name, uint_t *n)
+{
+ uint8_t *rv;
+ VERIFY0(nvlist_lookup_uint8_array(nvl, name, &rv, n));
+ return (rv);
+}
+
+int16_t *
+fnvlist_lookup_int16_array(nvlist_t *nvl, const char *name, uint_t *n)
+{
+ int16_t *rv;
+ VERIFY0(nvlist_lookup_int16_array(nvl, name, &rv, n));
+ return (rv);
+}
+
+uint16_t *
+fnvlist_lookup_uint16_array(nvlist_t *nvl, const char *name, uint_t *n)
+{
+ uint16_t *rv;
+ VERIFY0(nvlist_lookup_uint16_array(nvl, name, &rv, n));
+ return (rv);
+}
+
+int32_t *
+fnvlist_lookup_int32_array(nvlist_t *nvl, const char *name, uint_t *n)
+{
+ int32_t *rv;
+ VERIFY0(nvlist_lookup_int32_array(nvl, name, &rv, n));
+ return (rv);
+}
+
+uint32_t *
+fnvlist_lookup_uint32_array(nvlist_t *nvl, const char *name, uint_t *n)
+{
+ uint32_t *rv;
+ VERIFY0(nvlist_lookup_uint32_array(nvl, name, &rv, n));
+ return (rv);
+}
+
+int64_t *
+fnvlist_lookup_int64_array(nvlist_t *nvl, const char *name, uint_t *n)
+{
+ int64_t *rv;
+ VERIFY0(nvlist_lookup_int64_array(nvl, name, &rv, n));
+ return (rv);
+}
+
+uint64_t *
+fnvlist_lookup_uint64_array(nvlist_t *nvl, const char *name, uint_t *n)
+{
+ uint64_t *rv;
+ VERIFY0(nvlist_lookup_uint64_array(nvl, name, &rv, n));
+ return (rv);
+}
+
+boolean_t
+fnvpair_value_boolean_value(nvpair_t *nvp)
+{
+ boolean_t rv;
+ VERIFY0(nvpair_value_boolean_value(nvp, &rv));
+ return (rv);
+}
+
+uchar_t
+fnvpair_value_byte(nvpair_t *nvp)
+{
+ uchar_t rv;
+ VERIFY0(nvpair_value_byte(nvp, &rv));
+ return (rv);
+}
+
+int8_t
+fnvpair_value_int8(nvpair_t *nvp)
+{
+ int8_t rv;
+ VERIFY0(nvpair_value_int8(nvp, &rv));
+ return (rv);
+}
+
+int16_t
+fnvpair_value_int16(nvpair_t *nvp)
+{
+ int16_t rv;
+ VERIFY0(nvpair_value_int16(nvp, &rv));
+ return (rv);
+}
+
+int32_t
+fnvpair_value_int32(nvpair_t *nvp)
+{
+ int32_t rv;
+ VERIFY0(nvpair_value_int32(nvp, &rv));
+ return (rv);
+}
+
+int64_t
+fnvpair_value_int64(nvpair_t *nvp)
+{
+ int64_t rv;
+ VERIFY0(nvpair_value_int64(nvp, &rv));
+ return (rv);
+}
+
+uint8_t
+fnvpair_value_uint8(nvpair_t *nvp)
+{
+ uint8_t rv;
+ VERIFY0(nvpair_value_uint8(nvp, &rv));
+ return (rv);
+}
+
+uint16_t
+fnvpair_value_uint16(nvpair_t *nvp)
+{
+ uint16_t rv;
+ VERIFY0(nvpair_value_uint16(nvp, &rv));
+ return (rv);
+}
+
+uint32_t
+fnvpair_value_uint32(nvpair_t *nvp)
+{
+ uint32_t rv;
+ VERIFY0(nvpair_value_uint32(nvp, &rv));
+ return (rv);
+}
+
+uint64_t
+fnvpair_value_uint64(nvpair_t *nvp)
+{
+ uint64_t rv;
+ VERIFY0(nvpair_value_uint64(nvp, &rv));
+ return (rv);
+}
+
+char *
+fnvpair_value_string(nvpair_t *nvp)
+{
+ char *rv;
+ VERIFY0(nvpair_value_string(nvp, &rv));
+ return (rv);
+}
+
+nvlist_t *
+fnvpair_value_nvlist(nvpair_t *nvp)
+{
+ nvlist_t *rv;
+ VERIFY0(nvpair_value_nvlist(nvp, &rv));
+ return (rv);
+}
+
+#if defined(_KERNEL)
+
+EXPORT_SYMBOL(fnvlist_alloc);
+EXPORT_SYMBOL(fnvlist_free);
+EXPORT_SYMBOL(fnvlist_size);
+EXPORT_SYMBOL(fnvlist_pack);
+EXPORT_SYMBOL(fnvlist_pack_free);
+EXPORT_SYMBOL(fnvlist_unpack);
+EXPORT_SYMBOL(fnvlist_dup);
+EXPORT_SYMBOL(fnvlist_merge);
+
+EXPORT_SYMBOL(fnvlist_add_nvpair);
+EXPORT_SYMBOL(fnvlist_add_boolean);
+EXPORT_SYMBOL(fnvlist_add_boolean_value);
+EXPORT_SYMBOL(fnvlist_add_byte);
+EXPORT_SYMBOL(fnvlist_add_int8);
+EXPORT_SYMBOL(fnvlist_add_uint8);
+EXPORT_SYMBOL(fnvlist_add_int16);
+EXPORT_SYMBOL(fnvlist_add_uint16);
+EXPORT_SYMBOL(fnvlist_add_int32);
+EXPORT_SYMBOL(fnvlist_add_uint32);
+EXPORT_SYMBOL(fnvlist_add_int64);
+EXPORT_SYMBOL(fnvlist_add_uint64);
+EXPORT_SYMBOL(fnvlist_add_string);
+EXPORT_SYMBOL(fnvlist_add_nvlist);
+EXPORT_SYMBOL(fnvlist_add_boolean_array);
+EXPORT_SYMBOL(fnvlist_add_byte_array);
+EXPORT_SYMBOL(fnvlist_add_int8_array);
+EXPORT_SYMBOL(fnvlist_add_uint8_array);
+EXPORT_SYMBOL(fnvlist_add_int16_array);
+EXPORT_SYMBOL(fnvlist_add_uint16_array);
+EXPORT_SYMBOL(fnvlist_add_int32_array);
+EXPORT_SYMBOL(fnvlist_add_uint32_array);
+EXPORT_SYMBOL(fnvlist_add_int64_array);
+EXPORT_SYMBOL(fnvlist_add_uint64_array);
+EXPORT_SYMBOL(fnvlist_add_string_array);
+EXPORT_SYMBOL(fnvlist_add_nvlist_array);
+
+EXPORT_SYMBOL(fnvlist_remove);
+EXPORT_SYMBOL(fnvlist_remove_nvpair);
+
+EXPORT_SYMBOL(fnvlist_lookup_nvpair);
+EXPORT_SYMBOL(fnvlist_lookup_boolean);
+EXPORT_SYMBOL(fnvlist_lookup_boolean_value);
+EXPORT_SYMBOL(fnvlist_lookup_byte);
+EXPORT_SYMBOL(fnvlist_lookup_int8);
+EXPORT_SYMBOL(fnvlist_lookup_uint8);
+EXPORT_SYMBOL(fnvlist_lookup_int16);
+EXPORT_SYMBOL(fnvlist_lookup_uint16);
+EXPORT_SYMBOL(fnvlist_lookup_int32);
+EXPORT_SYMBOL(fnvlist_lookup_uint32);
+EXPORT_SYMBOL(fnvlist_lookup_int64);
+EXPORT_SYMBOL(fnvlist_lookup_uint64);
+EXPORT_SYMBOL(fnvlist_lookup_string);
+EXPORT_SYMBOL(fnvlist_lookup_nvlist);
+
+EXPORT_SYMBOL(fnvpair_value_boolean_value);
+EXPORT_SYMBOL(fnvpair_value_byte);
+EXPORT_SYMBOL(fnvpair_value_int8);
+EXPORT_SYMBOL(fnvpair_value_uint8);
+EXPORT_SYMBOL(fnvpair_value_int16);
+EXPORT_SYMBOL(fnvpair_value_uint16);
+EXPORT_SYMBOL(fnvpair_value_int32);
+EXPORT_SYMBOL(fnvpair_value_uint32);
+EXPORT_SYMBOL(fnvpair_value_int64);
+EXPORT_SYMBOL(fnvpair_value_uint64);
+EXPORT_SYMBOL(fnvpair_value_string);
+EXPORT_SYMBOL(fnvpair_value_nvlist);
+EXPORT_SYMBOL(fnvlist_num_pairs);
+
+#endif
diff --git a/sys/contrib/openzfs/module/nvpair/nvpair.c b/sys/contrib/openzfs/module/nvpair/nvpair.c
new file mode 100644
index 000000000000..990a4482c993
--- /dev/null
+++ b/sys/contrib/openzfs/module/nvpair/nvpair.c
@@ -0,0 +1,3738 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2015, 2017 by Delphix. All rights reserved.
+ * Copyright 2018 RackTop Systems.
+ */
+
+/*
+ * Links to Illumos.org for more information on Interface Libraries:
+ * [1] https://illumos.org/man/3lib/libnvpair
+ * [2] https://illumos.org/man/3nvpair/nvlist_alloc
+ * [3] https://illumos.org/man/9f/nvlist_alloc
+ * [4] https://illumos.org/man/9f/nvlist_next_nvpair
+ * [5] https://illumos.org/man/9f/nvpair_value_byte
+ */
+
+#include <sys/debug.h>
+#include <sys/isa_defs.h>
+#include <sys/nvpair.h>
+#include <sys/nvpair_impl.h>
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/strings.h>
+#include <rpc/xdr.h>
+#include <sys/mod.h>
+
+#if defined(_KERNEL)
+#include <sys/sunddi.h>
+#include <sys/sysmacros.h>
+#else
+#include <stdarg.h>
+#include <stdlib.h>
+#include <stddef.h>
+#endif
+
+#define skip_whitespace(p) while ((*(p) == ' ') || (*(p) == '\t')) p++
+
+/*
+ * nvpair.c - Provides kernel & userland interfaces for manipulating
+ * name-value pairs.
+ *
+ * Overview Diagram
+ *
+ * +--------------+
+ * | nvlist_t |
+ * |--------------|
+ * | nvl_version |
+ * | nvl_nvflag |
+ * | nvl_priv -+-+
+ * | nvl_flag | |
+ * | nvl_pad | |
+ * +--------------+ |
+ * V
+ * +--------------+ last i_nvp in list
+ * | nvpriv_t | +--------------------->
+ * |--------------| |
+ * +--+- nvp_list | | +------------+
+ * | | nvp_last -+--+ + nv_alloc_t |
+ * | | nvp_curr | |------------|
+ * | | nvp_nva -+----> | nva_ops |
+ * | | nvp_stat | | nva_arg |
+ * | +--------------+ +------------+
+ * |
+ * +-------+
+ * V
+ * +---------------------+ +-------------------+
+ * | i_nvp_t | +-->| i_nvp_t | +-->
+ * |---------------------| | |-------------------| |
+ * | nvi_next -+--+ | nvi_next -+--+
+ * | nvi_prev (NULL) | <----+ nvi_prev |
+ * | . . . . . . . . . . | | . . . . . . . . . |
+ * | nvp (nvpair_t) | | nvp (nvpair_t) |
+ * | - nvp_size | | - nvp_size |
+ * | - nvp_name_sz | | - nvp_name_sz |
+ * | - nvp_value_elem | | - nvp_value_elem |
+ * | - nvp_type | | - nvp_type |
+ * | - data ... | | - data ... |
+ * +---------------------+ +-------------------+
+ *
+ *
+ *
+ * +---------------------+ +---------------------+
+ * | i_nvp_t | +--> +-->| i_nvp_t (last) |
+ * |---------------------| | | |---------------------|
+ * | nvi_next -+--+ ... --+ | nvi_next (NULL) |
+ * <-+- nvi_prev |<-- ... <----+ nvi_prev |
+ * | . . . . . . . . . | | . . . . . . . . . |
+ * | nvp (nvpair_t) | | nvp (nvpair_t) |
+ * | - nvp_size | | - nvp_size |
+ * | - nvp_name_sz | | - nvp_name_sz |
+ * | - nvp_value_elem | | - nvp_value_elem |
+ * | - DATA_TYPE_NVLIST | | - nvp_type |
+ * | - data (embedded) | | - data ... |
+ * | nvlist name | +---------------------+
+ * | +--------------+ |
+ * | | nvlist_t | |
+ * | |--------------| |
+ * | | nvl_version | |
+ * | | nvl_nvflag | |
+ * | | nvl_priv --+---+---->
+ * | | nvl_flag | |
+ * | | nvl_pad | |
+ * | +--------------+ |
+ * +---------------------+
+ *
+ *
+ * N.B. nvpair_t may be aligned on 4 byte boundary, so +4 will
+ * allow value to be aligned on 8 byte boundary
+ *
+ * name_len is the length of the name string including the null terminator
+ * so it must be >= 1
+ */
+#define NVP_SIZE_CALC(name_len, data_len) \
+ (NV_ALIGN((sizeof (nvpair_t)) + name_len) + NV_ALIGN(data_len))
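+
+/*
+ * Worked example of NVP_SIZE_CALC(), assuming 8-byte NV_ALIGN and a
+ * 16-byte nvpair_t header: a pair named "pool" (name_len == 5 with the
+ * terminator) holding one uint64_t (data_len == 8) needs
+ * NV_ALIGN(16 + 5) + NV_ALIGN(8) == 24 + 8 == 32 bytes.
+ */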
+
+static int i_get_value_size(data_type_t type, const void *data, uint_t nelem);
+static int nvlist_add_common(nvlist_t *nvl, const char *name, data_type_t type,
+ uint_t nelem, const void *data);
+
+#define NV_STAT_EMBEDDED 0x1
+#define EMBEDDED_NVL(nvp) ((nvlist_t *)(void *)NVP_VALUE(nvp))
+#define EMBEDDED_NVL_ARRAY(nvp) ((nvlist_t **)(void *)NVP_VALUE(nvp))
+
+#define NVP_VALOFF(nvp) (NV_ALIGN(sizeof (nvpair_t) + (nvp)->nvp_name_sz))
+#define NVPAIR2I_NVP(nvp) \
+ ((i_nvp_t *)((size_t)(nvp) - offsetof(i_nvp_t, nvi_nvp)))
+
+#ifdef _KERNEL
+int nvpair_max_recursion = 20;
+#else
+int nvpair_max_recursion = 100;
+#endif
+
+uint64_t nvlist_hashtable_init_size = (1 << 4);
+
+int
+nv_alloc_init(nv_alloc_t *nva, const nv_alloc_ops_t *nvo, /* args */ ...)
+{
+ va_list valist;
+ int err = 0;
+
+ nva->nva_ops = nvo;
+ nva->nva_arg = NULL;
+
+ va_start(valist, nvo);
+ if (nva->nva_ops->nv_ao_init != NULL)
+ err = nva->nva_ops->nv_ao_init(nva, valist);
+ va_end(valist);
+
+ return (err);
+}
+
+void
+nv_alloc_reset(nv_alloc_t *nva)
+{
+ if (nva->nva_ops->nv_ao_reset != NULL)
+ nva->nva_ops->nv_ao_reset(nva);
+}
+
+void
+nv_alloc_fini(nv_alloc_t *nva)
+{
+ if (nva->nva_ops->nv_ao_fini != NULL)
+ nva->nva_ops->nv_ao_fini(nva);
+}
+
+nv_alloc_t *
+nvlist_lookup_nv_alloc(nvlist_t *nvl)
+{
+ nvpriv_t *priv;
+
+ if (nvl == NULL ||
+ (priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL)
+ return (NULL);
+
+ return (priv->nvp_nva);
+}
+
+static void *
+nv_mem_zalloc(nvpriv_t *nvp, size_t size)
+{
+ nv_alloc_t *nva = nvp->nvp_nva;
+ void *buf;
+
+ if ((buf = nva->nva_ops->nv_ao_alloc(nva, size)) != NULL)
+ bzero(buf, size);
+
+ return (buf);
+}
+
+static void
+nv_mem_free(nvpriv_t *nvp, void *buf, size_t size)
+{
+ nv_alloc_t *nva = nvp->nvp_nva;
+
+ nva->nva_ops->nv_ao_free(nva, buf, size);
+}
+
+static void
+nv_priv_init(nvpriv_t *priv, nv_alloc_t *nva, uint32_t stat)
+{
+ bzero(priv, sizeof (nvpriv_t));
+
+ priv->nvp_nva = nva;
+ priv->nvp_stat = stat;
+}
+
+static nvpriv_t *
+nv_priv_alloc(nv_alloc_t *nva)
+{
+ nvpriv_t *priv;
+
+ /*
+	 * nv_mem_alloc() cannot be called here because it needs the priv
+ * argument.
+ */
+ if ((priv = nva->nva_ops->nv_ao_alloc(nva, sizeof (nvpriv_t))) == NULL)
+ return (NULL);
+
+ nv_priv_init(priv, nva, 0);
+
+ return (priv);
+}
+
+/*
+ * Embedded lists need their own nvpriv_t's. We create a new
+ * nvpriv_t using the parameters and allocator from the parent
+ * list's nvpriv_t.
+ */
+static nvpriv_t *
+nv_priv_alloc_embedded(nvpriv_t *priv)
+{
+ nvpriv_t *emb_priv;
+
+ if ((emb_priv = nv_mem_zalloc(priv, sizeof (nvpriv_t))) == NULL)
+ return (NULL);
+
+ nv_priv_init(emb_priv, priv->nvp_nva, NV_STAT_EMBEDDED);
+
+ return (emb_priv);
+}
+
+static int
+nvt_tab_alloc(nvpriv_t *priv, uint64_t buckets)
+{
+ ASSERT3P(priv->nvp_hashtable, ==, NULL);
+ ASSERT0(priv->nvp_nbuckets);
+ ASSERT0(priv->nvp_nentries);
+
+ i_nvp_t **tab = nv_mem_zalloc(priv, buckets * sizeof (i_nvp_t *));
+ if (tab == NULL)
+ return (ENOMEM);
+
+ priv->nvp_hashtable = tab;
+ priv->nvp_nbuckets = buckets;
+ return (0);
+}
+
+static void
+nvt_tab_free(nvpriv_t *priv)
+{
+ i_nvp_t **tab = priv->nvp_hashtable;
+ if (tab == NULL) {
+ ASSERT0(priv->nvp_nbuckets);
+ ASSERT0(priv->nvp_nentries);
+ return;
+ }
+
+ nv_mem_free(priv, tab, priv->nvp_nbuckets * sizeof (i_nvp_t *));
+
+ priv->nvp_hashtable = NULL;
+ priv->nvp_nbuckets = 0;
+ priv->nvp_nentries = 0;
+}
+
+static uint32_t
+nvt_hash(const char *p)
+{
+ uint32_t g, hval = 0;
+
+ while (*p) {
+ hval = (hval << 4) + *p++;
+ if ((g = (hval & 0xf0000000)) != 0)
+ hval ^= g >> 24;
+ hval &= ~g;
+ }
+ return (hval);
+}
+
+static boolean_t
+nvt_nvpair_match(nvpair_t *nvp1, nvpair_t *nvp2, uint32_t nvflag)
+{
+ boolean_t match = B_FALSE;
+ if (nvflag & NV_UNIQUE_NAME_TYPE) {
+ if (strcmp(NVP_NAME(nvp1), NVP_NAME(nvp2)) == 0 &&
+ NVP_TYPE(nvp1) == NVP_TYPE(nvp2))
+ match = B_TRUE;
+ } else {
+ ASSERT(nvflag == 0 || nvflag & NV_UNIQUE_NAME);
+ if (strcmp(NVP_NAME(nvp1), NVP_NAME(nvp2)) == 0)
+ match = B_TRUE;
+ }
+ return (match);
+}
+
+static nvpair_t *
+nvt_lookup_name_type(nvlist_t *nvl, const char *name, data_type_t type)
+{
+ nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv;
+ ASSERT(priv != NULL);
+
+ i_nvp_t **tab = priv->nvp_hashtable;
+
+ if (tab == NULL) {
+ ASSERT3P(priv->nvp_list, ==, NULL);
+ ASSERT0(priv->nvp_nbuckets);
+ ASSERT0(priv->nvp_nentries);
+ return (NULL);
+ } else {
+ ASSERT(priv->nvp_nbuckets != 0);
+ }
+
+ uint64_t hash = nvt_hash(name);
+ uint64_t index = hash & (priv->nvp_nbuckets - 1);
+
+ ASSERT3U(index, <, priv->nvp_nbuckets);
+ i_nvp_t *entry = tab[index];
+
+ for (i_nvp_t *e = entry; e != NULL; e = e->nvi_hashtable_next) {
+ if (strcmp(NVP_NAME(&e->nvi_nvp), name) == 0 &&
+ (type == DATA_TYPE_DONTCARE ||
+ NVP_TYPE(&e->nvi_nvp) == type))
+ return (&e->nvi_nvp);
+ }
+ return (NULL);
+}
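+
+/*
+ * Note on the lookup above: because the bucket count is always a power
+ * of two, masking with (nvp_nbuckets - 1) is equivalent to taking the
+ * hash modulo the bucket count.  For example, with 16 buckets a name
+ * hashing to 0x2b is placed in bucket 0x2b & 0xf == 0xb.
+ */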
+
+static nvpair_t *
+nvt_lookup_name(nvlist_t *nvl, const char *name)
+{
+ return (nvt_lookup_name_type(nvl, name, DATA_TYPE_DONTCARE));
+}
+
+static int
+nvt_resize(nvpriv_t *priv, uint32_t new_size)
+{
+ i_nvp_t **tab = priv->nvp_hashtable;
+
+ /*
+ * Migrate all the entries from the current table
+ * to a newly-allocated table with the new size by
+ * re-adjusting the pointers of their entries.
+ */
+ uint32_t size = priv->nvp_nbuckets;
+ uint32_t new_mask = new_size - 1;
+ ASSERT(ISP2(new_size));
+
+ i_nvp_t **new_tab = nv_mem_zalloc(priv, new_size * sizeof (i_nvp_t *));
+ if (new_tab == NULL)
+ return (ENOMEM);
+
+ uint32_t nentries = 0;
+ for (uint32_t i = 0; i < size; i++) {
+ i_nvp_t *next, *e = tab[i];
+
+ while (e != NULL) {
+ next = e->nvi_hashtable_next;
+
+ uint32_t hash = nvt_hash(NVP_NAME(&e->nvi_nvp));
+ uint32_t index = hash & new_mask;
+
+ e->nvi_hashtable_next = new_tab[index];
+ new_tab[index] = e;
+ nentries++;
+
+ e = next;
+ }
+ tab[i] = NULL;
+ }
+ ASSERT3U(nentries, ==, priv->nvp_nentries);
+
+ nvt_tab_free(priv);
+
+ priv->nvp_hashtable = new_tab;
+ priv->nvp_nbuckets = new_size;
+ priv->nvp_nentries = nentries;
+
+ return (0);
+}
+
+static boolean_t
+nvt_needs_togrow(nvpriv_t *priv)
+{
+ /*
+ * Grow only when we have more elements than buckets
+ * and the # of buckets doesn't overflow.
+ */
+ return (priv->nvp_nentries > priv->nvp_nbuckets &&
+ (UINT32_MAX >> 1) >= priv->nvp_nbuckets);
+}
+
+/*
+ * Allocate a new table that's twice the size of the old one,
+ * and migrate all the entries from the old one to the new
+ * one by re-adjusting their pointers.
+ */
+static int
+nvt_grow(nvpriv_t *priv)
+{
+ uint32_t current_size = priv->nvp_nbuckets;
+ /* ensure we won't overflow */
+ ASSERT3U(UINT32_MAX >> 1, >=, current_size);
+ return (nvt_resize(priv, current_size << 1));
+}
+
+static boolean_t
+nvt_needs_toshrink(nvpriv_t *priv)
+{
+ /*
+ * Shrink only when the # of elements is less than or
+ * equal to 1/4 the # of buckets. Never shrink less than
+ * nvlist_hashtable_init_size.
+ */
+ ASSERT3U(priv->nvp_nbuckets, >=, nvlist_hashtable_init_size);
+ if (priv->nvp_nbuckets == nvlist_hashtable_init_size)
+ return (B_FALSE);
+ return (priv->nvp_nentries <= (priv->nvp_nbuckets >> 2));
+}
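+
+/*
+ * Worked example of the resize policy: starting from
+ * nvlist_hashtable_init_size (16 buckets), the table is doubled to 32
+ * buckets once it holds more entries than buckets, and a 32-bucket
+ * table is halved back to 16 on a removal that finds 8 or fewer
+ * entries.  It never shrinks below the initial 16 buckets.
+ */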
+
+/*
+ * Allocate a new table that's half the size of the old one,
+ * and migrate all the entries from the old one to the new
+ * one by re-adjusting their pointers.
+ */
+static int
+nvt_shrink(nvpriv_t *priv)
+{
+ uint32_t current_size = priv->nvp_nbuckets;
+ /* ensure we won't overflow */
+ ASSERT3U(current_size, >=, nvlist_hashtable_init_size);
+ return (nvt_resize(priv, current_size >> 1));
+}
+
+static int
+nvt_remove_nvpair(nvlist_t *nvl, nvpair_t *nvp)
+{
+ nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv;
+
+ if (nvt_needs_toshrink(priv)) {
+ int err = nvt_shrink(priv);
+ if (err != 0)
+ return (err);
+ }
+ i_nvp_t **tab = priv->nvp_hashtable;
+
+ char *name = NVP_NAME(nvp);
+ uint64_t hash = nvt_hash(name);
+ uint64_t index = hash & (priv->nvp_nbuckets - 1);
+
+ ASSERT3U(index, <, priv->nvp_nbuckets);
+ i_nvp_t *bucket = tab[index];
+
+ for (i_nvp_t *prev = NULL, *e = bucket;
+ e != NULL; prev = e, e = e->nvi_hashtable_next) {
+ if (nvt_nvpair_match(&e->nvi_nvp, nvp, nvl->nvl_nvflag)) {
+ if (prev != NULL) {
+ prev->nvi_hashtable_next =
+ e->nvi_hashtable_next;
+ } else {
+ ASSERT3P(e, ==, bucket);
+ tab[index] = e->nvi_hashtable_next;
+ }
+ e->nvi_hashtable_next = NULL;
+ priv->nvp_nentries--;
+ break;
+ }
+ }
+
+ return (0);
+}
+
+static int
+nvt_add_nvpair(nvlist_t *nvl, nvpair_t *nvp)
+{
+ nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv;
+
+ /* initialize nvpair table now if it doesn't exist. */
+ if (priv->nvp_hashtable == NULL) {
+ int err = nvt_tab_alloc(priv, nvlist_hashtable_init_size);
+ if (err != 0)
+ return (err);
+ }
+
+ /*
+ * if we don't allow duplicate entries, make sure to
+ * unlink any existing entries from the table.
+ */
+ if (nvl->nvl_nvflag != 0) {
+ int err = nvt_remove_nvpair(nvl, nvp);
+ if (err != 0)
+ return (err);
+ }
+
+ if (nvt_needs_togrow(priv)) {
+ int err = nvt_grow(priv);
+ if (err != 0)
+ return (err);
+ }
+ i_nvp_t **tab = priv->nvp_hashtable;
+
+ char *name = NVP_NAME(nvp);
+ uint64_t hash = nvt_hash(name);
+ uint64_t index = hash & (priv->nvp_nbuckets - 1);
+
+ ASSERT3U(index, <, priv->nvp_nbuckets);
+ i_nvp_t *bucket = tab[index];
+
+ /* insert link at the beginning of the bucket */
+ i_nvp_t *new_entry = NVPAIR2I_NVP(nvp);
+ ASSERT3P(new_entry->nvi_hashtable_next, ==, NULL);
+ new_entry->nvi_hashtable_next = bucket;
+ tab[index] = new_entry;
+
+ priv->nvp_nentries++;
+ return (0);
+}
+
+static void
+nvlist_init(nvlist_t *nvl, uint32_t nvflag, nvpriv_t *priv)
+{
+ nvl->nvl_version = NV_VERSION;
+ nvl->nvl_nvflag = nvflag & (NV_UNIQUE_NAME|NV_UNIQUE_NAME_TYPE);
+ nvl->nvl_priv = (uint64_t)(uintptr_t)priv;
+ nvl->nvl_flag = 0;
+ nvl->nvl_pad = 0;
+}
+
+uint_t
+nvlist_nvflag(nvlist_t *nvl)
+{
+ return (nvl->nvl_nvflag);
+}
+
+static nv_alloc_t *
+nvlist_nv_alloc(int kmflag)
+{
+#if defined(_KERNEL)
+ switch (kmflag) {
+ case KM_SLEEP:
+ return (nv_alloc_sleep);
+ case KM_NOSLEEP:
+ return (nv_alloc_nosleep);
+ default:
+ return (nv_alloc_pushpage);
+ }
+#else
+ return (nv_alloc_nosleep);
+#endif /* _KERNEL */
+}
+
+/*
+ * nvlist_alloc - Allocate nvlist.
+ */
+int
+nvlist_alloc(nvlist_t **nvlp, uint_t nvflag, int kmflag)
+{
+ return (nvlist_xalloc(nvlp, nvflag, nvlist_nv_alloc(kmflag)));
+}
+
+int
+nvlist_xalloc(nvlist_t **nvlp, uint_t nvflag, nv_alloc_t *nva)
+{
+ nvpriv_t *priv;
+
+ if (nvlp == NULL || nva == NULL)
+ return (EINVAL);
+
+ if ((priv = nv_priv_alloc(nva)) == NULL)
+ return (ENOMEM);
+
+ if ((*nvlp = nv_mem_zalloc(priv,
+ NV_ALIGN(sizeof (nvlist_t)))) == NULL) {
+ nv_mem_free(priv, priv, sizeof (nvpriv_t));
+ return (ENOMEM);
+ }
+
+ nvlist_init(*nvlp, nvflag, priv);
+
+ return (0);
+}
+
+/*
+ * nvp_buf_alloc - Allocate i_nvp_t for storing a new nv pair.
+ */
+static nvpair_t *
+nvp_buf_alloc(nvlist_t *nvl, size_t len)
+{
+ nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv;
+ i_nvp_t *buf;
+ nvpair_t *nvp;
+ size_t nvsize;
+
+ /*
+ * Allocate the buffer
+ */
+ nvsize = len + offsetof(i_nvp_t, nvi_nvp);
+
+ if ((buf = nv_mem_zalloc(priv, nvsize)) == NULL)
+ return (NULL);
+
+ nvp = &buf->nvi_nvp;
+ nvp->nvp_size = len;
+
+ return (nvp);
+}
+
+/*
+ * nvp_buf_free - de-allocate an i_nvp_t.
+ */
+static void
+nvp_buf_free(nvlist_t *nvl, nvpair_t *nvp)
+{
+ nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv;
+ size_t nvsize = nvp->nvp_size + offsetof(i_nvp_t, nvi_nvp);
+
+ nv_mem_free(priv, NVPAIR2I_NVP(nvp), nvsize);
+}
+
+/*
+ * nvp_buf_link - link a new nv pair into the nvlist.
+ */
+static void
+nvp_buf_link(nvlist_t *nvl, nvpair_t *nvp)
+{
+ nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv;
+ i_nvp_t *curr = NVPAIR2I_NVP(nvp);
+
+ /* Put element at end of nvlist */
+ if (priv->nvp_list == NULL) {
+ priv->nvp_list = priv->nvp_last = curr;
+ } else {
+ curr->nvi_prev = priv->nvp_last;
+ priv->nvp_last->nvi_next = curr;
+ priv->nvp_last = curr;
+ }
+}
+
+/*
+ * nvp_buf_unlink - unlink a removed nvpair from the nvlist.
+ */
+static void
+nvp_buf_unlink(nvlist_t *nvl, nvpair_t *nvp)
+{
+ nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv;
+ i_nvp_t *curr = NVPAIR2I_NVP(nvp);
+
+ /*
+ * protect nvlist_next_nvpair() against walking on freed memory.
+ */
+ if (priv->nvp_curr == curr)
+ priv->nvp_curr = curr->nvi_next;
+
+ if (curr == priv->nvp_list)
+ priv->nvp_list = curr->nvi_next;
+ else
+ curr->nvi_prev->nvi_next = curr->nvi_next;
+
+ if (curr == priv->nvp_last)
+ priv->nvp_last = curr->nvi_prev;
+ else
+ curr->nvi_next->nvi_prev = curr->nvi_prev;
+}
+
+/*
+ * Take an nvpair type and a number of elements and make sure they are valid.
+ */
+static int
+i_validate_type_nelem(data_type_t type, uint_t nelem)
+{
+ switch (type) {
+ case DATA_TYPE_BOOLEAN:
+ if (nelem != 0)
+ return (EINVAL);
+ break;
+ case DATA_TYPE_BOOLEAN_VALUE:
+ case DATA_TYPE_BYTE:
+ case DATA_TYPE_INT8:
+ case DATA_TYPE_UINT8:
+ case DATA_TYPE_INT16:
+ case DATA_TYPE_UINT16:
+ case DATA_TYPE_INT32:
+ case DATA_TYPE_UINT32:
+ case DATA_TYPE_INT64:
+ case DATA_TYPE_UINT64:
+ case DATA_TYPE_STRING:
+ case DATA_TYPE_HRTIME:
+ case DATA_TYPE_NVLIST:
+#if !defined(_KERNEL)
+ case DATA_TYPE_DOUBLE:
+#endif
+ if (nelem != 1)
+ return (EINVAL);
+ break;
+ case DATA_TYPE_BOOLEAN_ARRAY:
+ case DATA_TYPE_BYTE_ARRAY:
+ case DATA_TYPE_INT8_ARRAY:
+ case DATA_TYPE_UINT8_ARRAY:
+ case DATA_TYPE_INT16_ARRAY:
+ case DATA_TYPE_UINT16_ARRAY:
+ case DATA_TYPE_INT32_ARRAY:
+ case DATA_TYPE_UINT32_ARRAY:
+ case DATA_TYPE_INT64_ARRAY:
+ case DATA_TYPE_UINT64_ARRAY:
+ case DATA_TYPE_STRING_ARRAY:
+ case DATA_TYPE_NVLIST_ARRAY:
+ /* we allow arrays with 0 elements */
+ break;
+ default:
+ return (EINVAL);
+ }
+ return (0);
+}
+
+/*
+ * Verify nvp_name_sz and check the name string length.
+ */
+static int
+i_validate_nvpair_name(nvpair_t *nvp)
+{
+ if ((nvp->nvp_name_sz <= 0) ||
+ (nvp->nvp_size < NVP_SIZE_CALC(nvp->nvp_name_sz, 0)))
+ return (EFAULT);
+
+	/* verify the name string, make sure it's terminated */
+ if (NVP_NAME(nvp)[nvp->nvp_name_sz - 1] != '\0')
+ return (EFAULT);
+
+ return (strlen(NVP_NAME(nvp)) == nvp->nvp_name_sz - 1 ? 0 : EFAULT);
+}
+
+static int
+i_validate_nvpair_value(data_type_t type, uint_t nelem, const void *data)
+{
+ switch (type) {
+ case DATA_TYPE_BOOLEAN_VALUE:
+ if (*(boolean_t *)data != B_TRUE &&
+ *(boolean_t *)data != B_FALSE)
+ return (EINVAL);
+ break;
+ case DATA_TYPE_BOOLEAN_ARRAY: {
+ int i;
+
+ for (i = 0; i < nelem; i++)
+ if (((boolean_t *)data)[i] != B_TRUE &&
+ ((boolean_t *)data)[i] != B_FALSE)
+ return (EINVAL);
+ break;
+ }
+ default:
+ break;
+ }
+
+ return (0);
+}
+
+/*
+ * This function takes a pointer to what should be an nvpair and its size
+ * and then verifies that all the nvpair fields make sense and can be
+ * trusted. This function is used when decoding packed nvpairs.
+ */
+static int
+i_validate_nvpair(nvpair_t *nvp)
+{
+ data_type_t type = NVP_TYPE(nvp);
+ int size1, size2;
+
+ /* verify nvp_name_sz, check the name string length */
+ if (i_validate_nvpair_name(nvp) != 0)
+ return (EFAULT);
+
+ if (i_validate_nvpair_value(type, NVP_NELEM(nvp), NVP_VALUE(nvp)) != 0)
+ return (EFAULT);
+
+ /*
+ * verify nvp_type, nvp_value_elem, and also possibly
+ * verify string values and get the value size.
+ */
+ size2 = i_get_value_size(type, NVP_VALUE(nvp), NVP_NELEM(nvp));
+ size1 = nvp->nvp_size - NVP_VALOFF(nvp);
+ if (size2 < 0 || size1 != NV_ALIGN(size2))
+ return (EFAULT);
+
+ return (0);
+}
+
+static int
+nvlist_copy_pairs(nvlist_t *snvl, nvlist_t *dnvl)
+{
+ nvpriv_t *priv;
+ i_nvp_t *curr;
+
+ if ((priv = (nvpriv_t *)(uintptr_t)snvl->nvl_priv) == NULL)
+ return (EINVAL);
+
+ for (curr = priv->nvp_list; curr != NULL; curr = curr->nvi_next) {
+ nvpair_t *nvp = &curr->nvi_nvp;
+ int err;
+
+ if ((err = nvlist_add_common(dnvl, NVP_NAME(nvp), NVP_TYPE(nvp),
+ NVP_NELEM(nvp), NVP_VALUE(nvp))) != 0)
+ return (err);
+ }
+
+ return (0);
+}
+
+/*
+ * Frees all memory allocated for an nvpair (like embedded lists) with
+ * the exception of the nvpair buffer itself.
+ */
+static void
+nvpair_free(nvpair_t *nvp)
+{
+ switch (NVP_TYPE(nvp)) {
+ case DATA_TYPE_NVLIST:
+ nvlist_free(EMBEDDED_NVL(nvp));
+ break;
+ case DATA_TYPE_NVLIST_ARRAY: {
+ nvlist_t **nvlp = EMBEDDED_NVL_ARRAY(nvp);
+ int i;
+
+ for (i = 0; i < NVP_NELEM(nvp); i++)
+ if (nvlp[i] != NULL)
+ nvlist_free(nvlp[i]);
+ break;
+ }
+ default:
+ break;
+ }
+}
+
+/*
+ * nvlist_free - free an unpacked nvlist
+ */
+void
+nvlist_free(nvlist_t *nvl)
+{
+ nvpriv_t *priv;
+ i_nvp_t *curr;
+
+ if (nvl == NULL ||
+ (priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL)
+ return;
+
+ /*
+	 * Unpacked nvlists are linked through i_nvp_t
+ */
+ curr = priv->nvp_list;
+ while (curr != NULL) {
+ nvpair_t *nvp = &curr->nvi_nvp;
+ curr = curr->nvi_next;
+
+ nvpair_free(nvp);
+ nvp_buf_free(nvl, nvp);
+ }
+
+ if (!(priv->nvp_stat & NV_STAT_EMBEDDED))
+ nv_mem_free(priv, nvl, NV_ALIGN(sizeof (nvlist_t)));
+ else
+ nvl->nvl_priv = 0;
+
+ nvt_tab_free(priv);
+ nv_mem_free(priv, priv, sizeof (nvpriv_t));
+}
+
+static int
+nvlist_contains_nvp(nvlist_t *nvl, nvpair_t *nvp)
+{
+ nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv;
+ i_nvp_t *curr;
+
+ if (nvp == NULL)
+ return (0);
+
+ for (curr = priv->nvp_list; curr != NULL; curr = curr->nvi_next)
+ if (&curr->nvi_nvp == nvp)
+ return (1);
+
+ return (0);
+}
+
+/*
+ * Make a copy of nvlist
+ */
+int
+nvlist_dup(nvlist_t *nvl, nvlist_t **nvlp, int kmflag)
+{
+ return (nvlist_xdup(nvl, nvlp, nvlist_nv_alloc(kmflag)));
+}
+
+int
+nvlist_xdup(nvlist_t *nvl, nvlist_t **nvlp, nv_alloc_t *nva)
+{
+ int err;
+ nvlist_t *ret;
+
+ if (nvl == NULL || nvlp == NULL)
+ return (EINVAL);
+
+ if ((err = nvlist_xalloc(&ret, nvl->nvl_nvflag, nva)) != 0)
+ return (err);
+
+ if ((err = nvlist_copy_pairs(nvl, ret)) != 0)
+ nvlist_free(ret);
+ else
+ *nvlp = ret;
+
+ return (err);
+}
+
+/*
+ * Remove all with matching name
+ */
+int
+nvlist_remove_all(nvlist_t *nvl, const char *name)
+{
+ int error = ENOENT;
+
+ if (nvl == NULL || name == NULL || nvl->nvl_priv == 0)
+ return (EINVAL);
+
+ nvpair_t *nvp;
+ while ((nvp = nvt_lookup_name(nvl, name)) != NULL) {
+ VERIFY0(nvlist_remove_nvpair(nvl, nvp));
+ error = 0;
+ }
+
+ return (error);
+}
+
+/*
+ * Remove first one with matching name and type
+ */
+int
+nvlist_remove(nvlist_t *nvl, const char *name, data_type_t type)
+{
+ if (nvl == NULL || name == NULL || nvl->nvl_priv == 0)
+ return (EINVAL);
+
+ nvpair_t *nvp = nvt_lookup_name_type(nvl, name, type);
+ if (nvp == NULL)
+ return (ENOENT);
+
+ return (nvlist_remove_nvpair(nvl, nvp));
+}
+
+int
+nvlist_remove_nvpair(nvlist_t *nvl, nvpair_t *nvp)
+{
+ if (nvl == NULL || nvp == NULL)
+ return (EINVAL);
+
+ int err = nvt_remove_nvpair(nvl, nvp);
+ if (err != 0)
+ return (err);
+
+ nvp_buf_unlink(nvl, nvp);
+ nvpair_free(nvp);
+ nvp_buf_free(nvl, nvp);
+ return (0);
+}
+
+/*
+ * This function calculates the size of an nvpair value.
+ *
+ * The data argument controls the behavior in case of the data types
+ * DATA_TYPE_STRING and
+ * DATA_TYPE_STRING_ARRAY
+ * If data == NULL then the size of the string(s) is excluded.
+ */
+static int
+i_get_value_size(data_type_t type, const void *data, uint_t nelem)
+{
+ uint64_t value_sz;
+
+ if (i_validate_type_nelem(type, nelem) != 0)
+ return (-1);
+
+ /* Calculate required size for holding value */
+ switch (type) {
+ case DATA_TYPE_BOOLEAN:
+ value_sz = 0;
+ break;
+ case DATA_TYPE_BOOLEAN_VALUE:
+ value_sz = sizeof (boolean_t);
+ break;
+ case DATA_TYPE_BYTE:
+ value_sz = sizeof (uchar_t);
+ break;
+ case DATA_TYPE_INT8:
+ value_sz = sizeof (int8_t);
+ break;
+ case DATA_TYPE_UINT8:
+ value_sz = sizeof (uint8_t);
+ break;
+ case DATA_TYPE_INT16:
+ value_sz = sizeof (int16_t);
+ break;
+ case DATA_TYPE_UINT16:
+ value_sz = sizeof (uint16_t);
+ break;
+ case DATA_TYPE_INT32:
+ value_sz = sizeof (int32_t);
+ break;
+ case DATA_TYPE_UINT32:
+ value_sz = sizeof (uint32_t);
+ break;
+ case DATA_TYPE_INT64:
+ value_sz = sizeof (int64_t);
+ break;
+ case DATA_TYPE_UINT64:
+ value_sz = sizeof (uint64_t);
+ break;
+#if !defined(_KERNEL)
+ case DATA_TYPE_DOUBLE:
+ value_sz = sizeof (double);
+ break;
+#endif
+ case DATA_TYPE_STRING:
+ if (data == NULL)
+ value_sz = 0;
+ else
+ value_sz = strlen(data) + 1;
+ break;
+ case DATA_TYPE_BOOLEAN_ARRAY:
+ value_sz = (uint64_t)nelem * sizeof (boolean_t);
+ break;
+ case DATA_TYPE_BYTE_ARRAY:
+ value_sz = (uint64_t)nelem * sizeof (uchar_t);
+ break;
+ case DATA_TYPE_INT8_ARRAY:
+ value_sz = (uint64_t)nelem * sizeof (int8_t);
+ break;
+ case DATA_TYPE_UINT8_ARRAY:
+ value_sz = (uint64_t)nelem * sizeof (uint8_t);
+ break;
+ case DATA_TYPE_INT16_ARRAY:
+ value_sz = (uint64_t)nelem * sizeof (int16_t);
+ break;
+ case DATA_TYPE_UINT16_ARRAY:
+ value_sz = (uint64_t)nelem * sizeof (uint16_t);
+ break;
+ case DATA_TYPE_INT32_ARRAY:
+ value_sz = (uint64_t)nelem * sizeof (int32_t);
+ break;
+ case DATA_TYPE_UINT32_ARRAY:
+ value_sz = (uint64_t)nelem * sizeof (uint32_t);
+ break;
+ case DATA_TYPE_INT64_ARRAY:
+ value_sz = (uint64_t)nelem * sizeof (int64_t);
+ break;
+ case DATA_TYPE_UINT64_ARRAY:
+ value_sz = (uint64_t)nelem * sizeof (uint64_t);
+ break;
+ case DATA_TYPE_STRING_ARRAY:
+ value_sz = (uint64_t)nelem * sizeof (uint64_t);
+
+ if (data != NULL) {
+ char *const *strs = data;
+ uint_t i;
+
+ /* no alignment requirement for strings */
+ for (i = 0; i < nelem; i++) {
+ if (strs[i] == NULL)
+ return (-1);
+ value_sz += strlen(strs[i]) + 1;
+ }
+ }
+ break;
+ case DATA_TYPE_HRTIME:
+ value_sz = sizeof (hrtime_t);
+ break;
+ case DATA_TYPE_NVLIST:
+ value_sz = NV_ALIGN(sizeof (nvlist_t));
+ break;
+ case DATA_TYPE_NVLIST_ARRAY:
+ value_sz = (uint64_t)nelem * sizeof (uint64_t) +
+ (uint64_t)nelem * NV_ALIGN(sizeof (nvlist_t));
+ break;
+ default:
+ return (-1);
+ }
+
+ return (value_sz > INT32_MAX ? -1 : (int)value_sz);
+}
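+
+/*
+ * Worked examples of the sizing rules above:
+ * i_get_value_size(DATA_TYPE_STRING, "on", 1) returns 3 (strlen + 1),
+ * while passing data == NULL for the same type returns 0 because the
+ * string bytes are excluded.  For a DATA_TYPE_STRING_ARRAY of the two
+ * strings "a" and "bc", the result is
+ * 2 * sizeof (uint64_t) + 2 + 3 == 21 (pointer slots plus string bytes).
+ */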
+
+static int
+nvlist_copy_embedded(nvlist_t *nvl, nvlist_t *onvl, nvlist_t *emb_nvl)
+{
+ nvpriv_t *priv;
+ int err;
+
+ if ((priv = nv_priv_alloc_embedded((nvpriv_t *)(uintptr_t)
+ nvl->nvl_priv)) == NULL)
+ return (ENOMEM);
+
+ nvlist_init(emb_nvl, onvl->nvl_nvflag, priv);
+
+ if ((err = nvlist_copy_pairs(onvl, emb_nvl)) != 0) {
+ nvlist_free(emb_nvl);
+ emb_nvl->nvl_priv = 0;
+ }
+
+ return (err);
+}
+
+/*
+ * nvlist_add_common - Add new <name,value> pair to nvlist
+ */
+static int
+nvlist_add_common(nvlist_t *nvl, const char *name,
+ data_type_t type, uint_t nelem, const void *data)
+{
+ nvpair_t *nvp;
+ uint_t i;
+
+ int nvp_sz, name_sz, value_sz;
+ int err = 0;
+
+ if (name == NULL || nvl == NULL || nvl->nvl_priv == 0)
+ return (EINVAL);
+
+ if (nelem != 0 && data == NULL)
+ return (EINVAL);
+
+ /*
+ * Verify type and nelem and get the value size.
+	 * For data types DATA_TYPE_STRING and DATA_TYPE_STRING_ARRAY,
+	 * the size of the string(s) is included.
+ */
+ if ((value_sz = i_get_value_size(type, data, nelem)) < 0)
+ return (EINVAL);
+
+ if (i_validate_nvpair_value(type, nelem, data) != 0)
+ return (EINVAL);
+
+ /*
+ * If we're adding an nvlist or nvlist array, ensure that we are not
+ * adding the input nvlist to itself, which would cause recursion,
+ * and ensure that no NULL nvlist pointers are present.
+ */
+ switch (type) {
+ case DATA_TYPE_NVLIST:
+ if (data == nvl || data == NULL)
+ return (EINVAL);
+ break;
+ case DATA_TYPE_NVLIST_ARRAY: {
+ nvlist_t **onvlp = (nvlist_t **)data;
+ for (i = 0; i < nelem; i++) {
+ if (onvlp[i] == nvl || onvlp[i] == NULL)
+ return (EINVAL);
+ }
+ break;
+ }
+ default:
+ break;
+ }
+
+ /* calculate sizes of the nvpair elements and the nvpair itself */
+ name_sz = strlen(name) + 1;
+ if (name_sz >= 1ULL << (sizeof (nvp->nvp_name_sz) * NBBY - 1))
+ return (EINVAL);
+
+ nvp_sz = NVP_SIZE_CALC(name_sz, value_sz);
+
+ if ((nvp = nvp_buf_alloc(nvl, nvp_sz)) == NULL)
+ return (ENOMEM);
+
+ ASSERT(nvp->nvp_size == nvp_sz);
+ nvp->nvp_name_sz = name_sz;
+ nvp->nvp_value_elem = nelem;
+ nvp->nvp_type = type;
+ bcopy(name, NVP_NAME(nvp), name_sz);
+
+ switch (type) {
+ case DATA_TYPE_BOOLEAN:
+ break;
+ case DATA_TYPE_STRING_ARRAY: {
+ char *const *strs = data;
+ char *buf = NVP_VALUE(nvp);
+ char **cstrs = (void *)buf;
+
+ /* skip pre-allocated space for pointer array */
+ buf += nelem * sizeof (uint64_t);
+ for (i = 0; i < nelem; i++) {
+ int slen = strlen(strs[i]) + 1;
+ bcopy(strs[i], buf, slen);
+ cstrs[i] = buf;
+ buf += slen;
+ }
+ break;
+ }
+ case DATA_TYPE_NVLIST: {
+ nvlist_t *nnvl = EMBEDDED_NVL(nvp);
+ nvlist_t *onvl = (nvlist_t *)data;
+
+ if ((err = nvlist_copy_embedded(nvl, onvl, nnvl)) != 0) {
+ nvp_buf_free(nvl, nvp);
+ return (err);
+ }
+ break;
+ }
+ case DATA_TYPE_NVLIST_ARRAY: {
+ nvlist_t **onvlp = (nvlist_t **)data;
+ nvlist_t **nvlp = EMBEDDED_NVL_ARRAY(nvp);
+ nvlist_t *embedded = (nvlist_t *)
+ ((uintptr_t)nvlp + nelem * sizeof (uint64_t));
+
+ for (i = 0; i < nelem; i++) {
+ if ((err = nvlist_copy_embedded(nvl,
+ onvlp[i], embedded)) != 0) {
+ /*
+ * Free any successfully created lists
+ */
+ nvpair_free(nvp);
+ nvp_buf_free(nvl, nvp);
+ return (err);
+ }
+
+ nvlp[i] = embedded++;
+ }
+ break;
+ }
+ default:
+ bcopy(data, NVP_VALUE(nvp), value_sz);
+ }
+
+ /* if unique name, remove before add */
+ if (nvl->nvl_nvflag & NV_UNIQUE_NAME)
+ (void) nvlist_remove_all(nvl, name);
+ else if (nvl->nvl_nvflag & NV_UNIQUE_NAME_TYPE)
+ (void) nvlist_remove(nvl, name, type);
+
+ err = nvt_add_nvpair(nvl, nvp);
+ if (err != 0) {
+ nvpair_free(nvp);
+ nvp_buf_free(nvl, nvp);
+ return (err);
+ }
+ nvp_buf_link(nvl, nvp);
+
+ return (0);
+}
+
+int
+nvlist_add_boolean(nvlist_t *nvl, const char *name)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_BOOLEAN, 0, NULL));
+}
+
+int
+nvlist_add_boolean_value(nvlist_t *nvl, const char *name, boolean_t val)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_BOOLEAN_VALUE, 1, &val));
+}
+
+int
+nvlist_add_byte(nvlist_t *nvl, const char *name, uchar_t val)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_BYTE, 1, &val));
+}
+
+int
+nvlist_add_int8(nvlist_t *nvl, const char *name, int8_t val)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_INT8, 1, &val));
+}
+
+int
+nvlist_add_uint8(nvlist_t *nvl, const char *name, uint8_t val)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_UINT8, 1, &val));
+}
+
+int
+nvlist_add_int16(nvlist_t *nvl, const char *name, int16_t val)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_INT16, 1, &val));
+}
+
+int
+nvlist_add_uint16(nvlist_t *nvl, const char *name, uint16_t val)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_UINT16, 1, &val));
+}
+
+int
+nvlist_add_int32(nvlist_t *nvl, const char *name, int32_t val)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_INT32, 1, &val));
+}
+
+int
+nvlist_add_uint32(nvlist_t *nvl, const char *name, uint32_t val)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_UINT32, 1, &val));
+}
+
+int
+nvlist_add_int64(nvlist_t *nvl, const char *name, int64_t val)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_INT64, 1, &val));
+}
+
+int
+nvlist_add_uint64(nvlist_t *nvl, const char *name, uint64_t val)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_UINT64, 1, &val));
+}
+
+#if !defined(_KERNEL)
+int
+nvlist_add_double(nvlist_t *nvl, const char *name, double val)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_DOUBLE, 1, &val));
+}
+#endif
+
+int
+nvlist_add_string(nvlist_t *nvl, const char *name, const char *val)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_STRING, 1, (void *)val));
+}
+
+int
+nvlist_add_boolean_array(nvlist_t *nvl, const char *name,
+ boolean_t *a, uint_t n)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_BOOLEAN_ARRAY, n, a));
+}
+
+int
+nvlist_add_byte_array(nvlist_t *nvl, const char *name, uchar_t *a, uint_t n)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_BYTE_ARRAY, n, a));
+}
+
+int
+nvlist_add_int8_array(nvlist_t *nvl, const char *name, int8_t *a, uint_t n)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_INT8_ARRAY, n, a));
+}
+
+int
+nvlist_add_uint8_array(nvlist_t *nvl, const char *name, uint8_t *a, uint_t n)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_UINT8_ARRAY, n, a));
+}
+
+int
+nvlist_add_int16_array(nvlist_t *nvl, const char *name, int16_t *a, uint_t n)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_INT16_ARRAY, n, a));
+}
+
+int
+nvlist_add_uint16_array(nvlist_t *nvl, const char *name, uint16_t *a, uint_t n)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_UINT16_ARRAY, n, a));
+}
+
+int
+nvlist_add_int32_array(nvlist_t *nvl, const char *name, int32_t *a, uint_t n)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_INT32_ARRAY, n, a));
+}
+
+int
+nvlist_add_uint32_array(nvlist_t *nvl, const char *name, uint32_t *a, uint_t n)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_UINT32_ARRAY, n, a));
+}
+
+int
+nvlist_add_int64_array(nvlist_t *nvl, const char *name, int64_t *a, uint_t n)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_INT64_ARRAY, n, a));
+}
+
+int
+nvlist_add_uint64_array(nvlist_t *nvl, const char *name, uint64_t *a, uint_t n)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_UINT64_ARRAY, n, a));
+}
+
+int
+nvlist_add_string_array(nvlist_t *nvl, const char *name,
+ char *const *a, uint_t n)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_STRING_ARRAY, n, a));
+}
+
+int
+nvlist_add_hrtime(nvlist_t *nvl, const char *name, hrtime_t val)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_HRTIME, 1, &val));
+}
+
+int
+nvlist_add_nvlist(nvlist_t *nvl, const char *name, nvlist_t *val)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_NVLIST, 1, val));
+}
+
+int
+nvlist_add_nvlist_array(nvlist_t *nvl, const char *name, nvlist_t **a, uint_t n)
+{
+ return (nvlist_add_common(nvl, name, DATA_TYPE_NVLIST_ARRAY, n, a));
+}
+
+/* reading name-value pairs */
+nvpair_t *
+nvlist_next_nvpair(nvlist_t *nvl, nvpair_t *nvp)
+{
+ nvpriv_t *priv;
+ i_nvp_t *curr;
+
+ if (nvl == NULL ||
+ (priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL)
+ return (NULL);
+
+ curr = NVPAIR2I_NVP(nvp);
+
+ /*
+ * Ensure that nvp is a valid nvpair on this nvlist.
+ * NB: nvp_curr is used only as a hint so that we don't always
+ * have to walk the list to determine if nvp is still on the list.
+ */
+ if (nvp == NULL)
+ curr = priv->nvp_list;
+ else if (priv->nvp_curr == curr || nvlist_contains_nvp(nvl, nvp))
+ curr = curr->nvi_next;
+ else
+ curr = NULL;
+
+ priv->nvp_curr = curr;
+
+ return (curr != NULL ? &curr->nvi_nvp : NULL);
+}
+
+nvpair_t *
+nvlist_prev_nvpair(nvlist_t *nvl, nvpair_t *nvp)
+{
+ nvpriv_t *priv;
+ i_nvp_t *curr;
+
+ if (nvl == NULL ||
+ (priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL)
+ return (NULL);
+
+ curr = NVPAIR2I_NVP(nvp);
+
+ if (nvp == NULL)
+ curr = priv->nvp_last;
+ else if (priv->nvp_curr == curr || nvlist_contains_nvp(nvl, nvp))
+ curr = curr->nvi_prev;
+ else
+ curr = NULL;
+
+ priv->nvp_curr = curr;
+
+ return (curr != NULL ? &curr->nvi_nvp : NULL);
+}
+
+boolean_t
+nvlist_empty(nvlist_t *nvl)
+{
+ nvpriv_t *priv;
+
+ if (nvl == NULL ||
+ (priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL)
+ return (B_TRUE);
+
+ return (priv->nvp_list == NULL);
+}
+
+char *
+nvpair_name(nvpair_t *nvp)
+{
+ return (NVP_NAME(nvp));
+}
+
+data_type_t
+nvpair_type(nvpair_t *nvp)
+{
+ return (NVP_TYPE(nvp));
+}
+
+int
+nvpair_type_is_array(nvpair_t *nvp)
+{
+ data_type_t type = NVP_TYPE(nvp);
+
+ if ((type == DATA_TYPE_BYTE_ARRAY) ||
+ (type == DATA_TYPE_INT8_ARRAY) ||
+ (type == DATA_TYPE_UINT8_ARRAY) ||
+ (type == DATA_TYPE_INT16_ARRAY) ||
+ (type == DATA_TYPE_UINT16_ARRAY) ||
+ (type == DATA_TYPE_INT32_ARRAY) ||
+ (type == DATA_TYPE_UINT32_ARRAY) ||
+ (type == DATA_TYPE_INT64_ARRAY) ||
+ (type == DATA_TYPE_UINT64_ARRAY) ||
+ (type == DATA_TYPE_BOOLEAN_ARRAY) ||
+ (type == DATA_TYPE_STRING_ARRAY) ||
+ (type == DATA_TYPE_NVLIST_ARRAY))
+ return (1);
+ return (0);
+}
+
+static int
+nvpair_value_common(nvpair_t *nvp, data_type_t type, uint_t *nelem, void *data)
+{
+ int value_sz;
+
+ if (nvp == NULL || nvpair_type(nvp) != type)
+ return (EINVAL);
+
+ /*
+ * For non-array types, we copy the data.
+ * For array types (including string), we set a pointer.
+ */
+ switch (type) {
+ case DATA_TYPE_BOOLEAN:
+ if (nelem != NULL)
+ *nelem = 0;
+ break;
+
+ case DATA_TYPE_BOOLEAN_VALUE:
+ case DATA_TYPE_BYTE:
+ case DATA_TYPE_INT8:
+ case DATA_TYPE_UINT8:
+ case DATA_TYPE_INT16:
+ case DATA_TYPE_UINT16:
+ case DATA_TYPE_INT32:
+ case DATA_TYPE_UINT32:
+ case DATA_TYPE_INT64:
+ case DATA_TYPE_UINT64:
+ case DATA_TYPE_HRTIME:
+#if !defined(_KERNEL)
+ case DATA_TYPE_DOUBLE:
+#endif
+ if (data == NULL)
+ return (EINVAL);
+ if ((value_sz = i_get_value_size(type, NULL, 1)) < 0)
+ return (EINVAL);
+ bcopy(NVP_VALUE(nvp), data, (size_t)value_sz);
+ if (nelem != NULL)
+ *nelem = 1;
+ break;
+
+ case DATA_TYPE_NVLIST:
+ case DATA_TYPE_STRING:
+ if (data == NULL)
+ return (EINVAL);
+ *(void **)data = (void *)NVP_VALUE(nvp);
+ if (nelem != NULL)
+ *nelem = 1;
+ break;
+
+ case DATA_TYPE_BOOLEAN_ARRAY:
+ case DATA_TYPE_BYTE_ARRAY:
+ case DATA_TYPE_INT8_ARRAY:
+ case DATA_TYPE_UINT8_ARRAY:
+ case DATA_TYPE_INT16_ARRAY:
+ case DATA_TYPE_UINT16_ARRAY:
+ case DATA_TYPE_INT32_ARRAY:
+ case DATA_TYPE_UINT32_ARRAY:
+ case DATA_TYPE_INT64_ARRAY:
+ case DATA_TYPE_UINT64_ARRAY:
+ case DATA_TYPE_STRING_ARRAY:
+ case DATA_TYPE_NVLIST_ARRAY:
+ if (nelem == NULL || data == NULL)
+ return (EINVAL);
+ if ((*nelem = NVP_NELEM(nvp)) != 0)
+ *(void **)data = (void *)NVP_VALUE(nvp);
+ else
+ *(void **)data = NULL;
+ break;
+
+ default:
+ return (ENOTSUP);
+ }
+
+ return (0);
+}
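+
+/*
+ * Sketch of the copy-vs-pointer contract above, using the public
+ * nvpair_value_*() wrappers (declared in sys/nvpair.h) that funnel
+ * into nvpair_value_common():
+ *
+ *	uint64_t v;
+ *	char *s;
+ *
+ *	VERIFY0(nvpair_value_uint64(nvp, &v));
+ *	VERIFY0(nvpair_value_string(nvp2, &s));
+ *
+ * After these calls v holds a private copy of the scalar, while s points
+ * into nvp2's own buffer and is released together with the enclosing
+ * nvlist rather than freed by the caller.
+ */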
+
+static int
+nvlist_lookup_common(nvlist_t *nvl, const char *name, data_type_t type,
+ uint_t *nelem, void *data)
+{
+ if (name == NULL || nvl == NULL || nvl->nvl_priv == 0)
+ return (EINVAL);
+
+ if (!(nvl->nvl_nvflag & (NV_UNIQUE_NAME | NV_UNIQUE_NAME_TYPE)))
+ return (ENOTSUP);
+
+ nvpair_t *nvp = nvt_lookup_name_type(nvl, name, type);
+ if (nvp == NULL)
+ return (ENOENT);
+
+ return (nvpair_value_common(nvp, type, nelem, data));
+}
+
+int
+nvlist_lookup_boolean(nvlist_t *nvl, const char *name)
+{
+ return (nvlist_lookup_common(nvl, name, DATA_TYPE_BOOLEAN, NULL, NULL));
+}
+
+int
+nvlist_lookup_boolean_value(nvlist_t *nvl, const char *name, boolean_t *val)
+{
+ return (nvlist_lookup_common(nvl, name,
+ DATA_TYPE_BOOLEAN_VALUE, NULL, val));
+}
+
+int
+nvlist_lookup_byte(nvlist_t *nvl, const char *name, uchar_t *val)
+{
+ return (nvlist_lookup_common(nvl, name, DATA_TYPE_BYTE, NULL, val));
+}
+
+int
+nvlist_lookup_int8(nvlist_t *nvl, const char *name, int8_t *val)
+{
+ return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT8, NULL, val));
+}
+
+int
+nvlist_lookup_uint8(nvlist_t *nvl, const char *name, uint8_t *val)
+{
+ return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT8, NULL, val));
+}
+
+int
+nvlist_lookup_int16(nvlist_t *nvl, const char *name, int16_t *val)
+{
+ return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT16, NULL, val));
+}
+
+int
+nvlist_lookup_uint16(nvlist_t *nvl, const char *name, uint16_t *val)
+{
+ return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT16, NULL, val));
+}
+
+int
+nvlist_lookup_int32(nvlist_t *nvl, const char *name, int32_t *val)
+{
+ return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT32, NULL, val));
+}
+
+int
+nvlist_lookup_uint32(nvlist_t *nvl, const char *name, uint32_t *val)
+{
+ return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT32, NULL, val));
+}
+
+int
+nvlist_lookup_int64(nvlist_t *nvl, const char *name, int64_t *val)
+{
+ return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT64, NULL, val));
+}
+
+int
+nvlist_lookup_uint64(nvlist_t *nvl, const char *name, uint64_t *val)
+{
+ return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT64, NULL, val));
+}
+
+#if !defined(_KERNEL)
+int
+nvlist_lookup_double(nvlist_t *nvl, const char *name, double *val)
+{
+ return (nvlist_lookup_common(nvl, name, DATA_TYPE_DOUBLE, NULL, val));
+}
+#endif
+
+int
+nvlist_lookup_string(nvlist_t *nvl, const char *name, char **val)
+{
+ return (nvlist_lookup_common(nvl, name, DATA_TYPE_STRING, NULL, val));
+}
+
+int
+nvlist_lookup_nvlist(nvlist_t *nvl, const char *name, nvlist_t **val)
+{
+ return (nvlist_lookup_common(nvl, name, DATA_TYPE_NVLIST, NULL, val));
+}
+
+int
+nvlist_lookup_boolean_array(nvlist_t *nvl, const char *name,
+ boolean_t **a, uint_t *n)
+{
+ return (nvlist_lookup_common(nvl, name,
+ DATA_TYPE_BOOLEAN_ARRAY, n, a));
+}
+
+int
+nvlist_lookup_byte_array(nvlist_t *nvl, const char *name,
+ uchar_t **a, uint_t *n)
+{
+ return (nvlist_lookup_common(nvl, name, DATA_TYPE_BYTE_ARRAY, n, a));
+}
+
+int
+nvlist_lookup_int8_array(nvlist_t *nvl, const char *name, int8_t **a, uint_t *n)
+{
+ return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT8_ARRAY, n, a));
+}
+
+int
+nvlist_lookup_uint8_array(nvlist_t *nvl, const char *name,
+ uint8_t **a, uint_t *n)
+{
+ return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT8_ARRAY, n, a));
+}
+
+int
+nvlist_lookup_int16_array(nvlist_t *nvl, const char *name,
+ int16_t **a, uint_t *n)
+{
+ return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT16_ARRAY, n, a));
+}
+
+int
+nvlist_lookup_uint16_array(nvlist_t *nvl, const char *name,
+ uint16_t **a, uint_t *n)
+{
+ return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT16_ARRAY, n, a));
+}
+
+int
+nvlist_lookup_int32_array(nvlist_t *nvl, const char *name,
+ int32_t **a, uint_t *n)
+{
+ return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT32_ARRAY, n, a));
+}
+
+int
+nvlist_lookup_uint32_array(nvlist_t *nvl, const char *name,
+ uint32_t **a, uint_t *n)
+{
+ return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT32_ARRAY, n, a));
+}
+
+int
+nvlist_lookup_int64_array(nvlist_t *nvl, const char *name,
+ int64_t **a, uint_t *n)
+{
+ return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT64_ARRAY, n, a));
+}
+
+int
+nvlist_lookup_uint64_array(nvlist_t *nvl, const char *name,
+ uint64_t **a, uint_t *n)
+{
+ return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT64_ARRAY, n, a));
+}
+
+int
+nvlist_lookup_string_array(nvlist_t *nvl, const char *name,
+ char ***a, uint_t *n)
+{
+ return (nvlist_lookup_common(nvl, name, DATA_TYPE_STRING_ARRAY, n, a));
+}
+
+int
+nvlist_lookup_nvlist_array(nvlist_t *nvl, const char *name,
+ nvlist_t ***a, uint_t *n)
+{
+ return (nvlist_lookup_common(nvl, name, DATA_TYPE_NVLIST_ARRAY, n, a));
+}
+
+int
+nvlist_lookup_hrtime(nvlist_t *nvl, const char *name, hrtime_t *val)
+{
+ return (nvlist_lookup_common(nvl, name, DATA_TYPE_HRTIME, NULL, val));
+}
+
+int
+nvlist_lookup_pairs(nvlist_t *nvl, int flag, ...)
+{
+ va_list ap;
+ char *name;
+ int noentok = (flag & NV_FLAG_NOENTOK ? 1 : 0);
+ int ret = 0;
+
+ va_start(ap, flag);
+ while (ret == 0 && (name = va_arg(ap, char *)) != NULL) {
+ data_type_t type;
+ void *val;
+ uint_t *nelem;
+
+ switch (type = va_arg(ap, data_type_t)) {
+ case DATA_TYPE_BOOLEAN:
+ ret = nvlist_lookup_common(nvl, name, type, NULL, NULL);
+ break;
+
+ case DATA_TYPE_BOOLEAN_VALUE:
+ case DATA_TYPE_BYTE:
+ case DATA_TYPE_INT8:
+ case DATA_TYPE_UINT8:
+ case DATA_TYPE_INT16:
+ case DATA_TYPE_UINT16:
+ case DATA_TYPE_INT32:
+ case DATA_TYPE_UINT32:
+ case DATA_TYPE_INT64:
+ case DATA_TYPE_UINT64:
+ case DATA_TYPE_HRTIME:
+ case DATA_TYPE_STRING:
+ case DATA_TYPE_NVLIST:
+#if !defined(_KERNEL)
+ case DATA_TYPE_DOUBLE:
+#endif
+ val = va_arg(ap, void *);
+ ret = nvlist_lookup_common(nvl, name, type, NULL, val);
+ break;
+
+ case DATA_TYPE_BYTE_ARRAY:
+ case DATA_TYPE_BOOLEAN_ARRAY:
+ case DATA_TYPE_INT8_ARRAY:
+ case DATA_TYPE_UINT8_ARRAY:
+ case DATA_TYPE_INT16_ARRAY:
+ case DATA_TYPE_UINT16_ARRAY:
+ case DATA_TYPE_INT32_ARRAY:
+ case DATA_TYPE_UINT32_ARRAY:
+ case DATA_TYPE_INT64_ARRAY:
+ case DATA_TYPE_UINT64_ARRAY:
+ case DATA_TYPE_STRING_ARRAY:
+ case DATA_TYPE_NVLIST_ARRAY:
+ val = va_arg(ap, void *);
+ nelem = va_arg(ap, uint_t *);
+ ret = nvlist_lookup_common(nvl, name, type, nelem, val);
+ break;
+
+ default:
+ ret = EINVAL;
+ }
+
+ if (ret == ENOENT && noentok)
+ ret = 0;
+ }
+ va_end(ap);
+
+ return (ret);
+}
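+
+/*
+ * Usage sketch (illustrative, not part of the original source; "guid"
+ * and "comment" are hypothetical pair names): scalar types pass a single
+ * value pointer, array types pass a value pointer followed by a uint_t
+ * pointer for the element count, and the argument list is terminated by
+ * a NULL name.
+ *
+ *	uint64_t guid;
+ *	char *comment;
+ *
+ *	error = nvlist_lookup_pairs(nvl, NV_FLAG_NOENTOK,
+ *	    "guid", DATA_TYPE_UINT64, &guid,
+ *	    "comment", DATA_TYPE_STRING, &comment,
+ *	    NULL);
+ */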
+
+/*
+ * Find the 'name'ed nvpair in the nvlist 'nvl'. If 'name' found, the function
+ * returns zero and a pointer to the matching nvpair is returned in '*ret'
+ * (given 'ret' is non-NULL). If 'sep' is specified then 'name' will penetrate
+ * multiple levels of embedded nvlists, with 'sep' as the separator. As an
+ * example, if sep is '.', name might look like: "a" or "a.b" or "a.c[3]" or
+ * "a.d[3].e[1]". This matches the C syntax for array embed (for convenience,
+ * code also supports "a.d[3]e[1]" syntax).
+ *
+ * If 'ip' is non-NULL and the last name component is an array, return the
+ * value of the "...[index]" array index in *ip. For an array reference that
+ * is not indexed, *ip will be returned as -1. If there is a syntax error in
+ * 'name', and 'ep' is non-NULL then *ep will be set to point to the location
+ * inside the 'name' string where the syntax error was detected.
+ */
+static int
+nvlist_lookup_nvpair_ei_sep(nvlist_t *nvl, const char *name, const char sep,
+ nvpair_t **ret, int *ip, char **ep)
+{
+ nvpair_t *nvp;
+ const char *np;
+ char *sepp = NULL;
+ char *idxp, *idxep;
+ nvlist_t **nva;
+ long idx = 0;
+ int n;
+
+ if (ip)
+ *ip = -1; /* not indexed */
+ if (ep)
+ *ep = NULL;
+
+ if ((nvl == NULL) || (name == NULL))
+ return (EINVAL);
+
+ sepp = NULL;
+ idx = 0;
+ /* step through components of name */
+ for (np = name; np && *np; np = sepp) {
+ /* ensure unique names */
+ if (!(nvl->nvl_nvflag & NV_UNIQUE_NAME))
+ return (ENOTSUP);
+
+ /* skip white space */
+ skip_whitespace(np);
+ if (*np == 0)
+ break;
+
+ /* set 'sepp' to end of current component 'np' */
+ if (sep)
+ sepp = strchr(np, sep);
+ else
+ sepp = NULL;
+
+ /* find start of next "[ index ]..." */
+ idxp = strchr(np, '[');
+
+ /* if sepp comes first, set idxp to NULL */
+ if (sepp && idxp && (sepp < idxp))
+ idxp = NULL;
+
+ /*
+ * At this point 'idxp' is set if there is an index
+ * expected for the current component.
+ */
+ if (idxp) {
+ /* set 'n' to length of current 'np' name component */
+ n = idxp++ - np;
+
+ /* keep sepp up to date for *ep use as we advance */
+ skip_whitespace(idxp);
+ sepp = idxp;
+
+ /* determine the index value */
+#if defined(_KERNEL)
+ if (ddi_strtol(idxp, &idxep, 0, &idx))
+ goto fail;
+#else
+ idx = strtol(idxp, &idxep, 0);
+#endif
+ if (idxep == idxp)
+ goto fail;
+
+ /* keep sepp up to date for *ep use as we advance */
+ sepp = idxep;
+
+ /* skip whitespace after the index value and check for ']' */
+ skip_whitespace(sepp);
+ if (*sepp++ != ']')
+ goto fail;
+
+ /* for embedded arrays, support C syntax: "a[1].b" */
+ skip_whitespace(sepp);
+ if (sep && (*sepp == sep))
+ sepp++;
+ } else if (sepp) {
+ n = sepp++ - np;
+ } else {
+ n = strlen(np);
+ }
+
+ /* trim trailing whitespace by reducing length of 'np' */
+ if (n == 0)
+ goto fail;
+ for (n--; (np[n] == ' ') || (np[n] == '\t'); n--)
+ ;
+ n++;
+
+ /* skip whitespace, and set sepp to NULL if complete */
+ if (sepp) {
+ skip_whitespace(sepp);
+ if (*sepp == 0)
+ sepp = NULL;
+ }
+
+ /*
+ * At this point:
+ * o 'n' is the length of current 'np' component.
+ * o 'idxp' is set if there was an index, and its value is in 'idx'.
+ * o 'sepp' is set to the beginning of the next component,
+ * and set to NULL if we have no more components.
+ *
+ * Search for nvpair with matching component name.
+ */
+ for (nvp = nvlist_next_nvpair(nvl, NULL); nvp != NULL;
+ nvp = nvlist_next_nvpair(nvl, nvp)) {
+
+ /* continue if no match on name */
+ if (strncmp(np, nvpair_name(nvp), n) ||
+ (strlen(nvpair_name(nvp)) != n))
+ continue;
+
+ /* if indexed, verify type is array oriented */
+ if (idxp && !nvpair_type_is_array(nvp))
+ goto fail;
+
+ /*
+ * Full match found, return nvp and idx if this
+ * was the last component.
+ */
+ if (sepp == NULL) {
+ if (ret)
+ *ret = nvp;
+ if (ip && idxp)
+ *ip = (int)idx; /* return index */
+ return (0); /* found */
+ }
+
+ /*
+ * More components: current match must be
+ * of DATA_TYPE_NVLIST or DATA_TYPE_NVLIST_ARRAY
+ * to support going deeper.
+ */
+ if (nvpair_type(nvp) == DATA_TYPE_NVLIST) {
+ nvl = EMBEDDED_NVL(nvp);
+ break;
+ } else if (nvpair_type(nvp) == DATA_TYPE_NVLIST_ARRAY) {
+ (void) nvpair_value_nvlist_array(nvp,
+ &nva, (uint_t *)&n);
+ if ((n < 0) || (idx >= n))
+ goto fail;
+ nvl = nva[idx];
+ break;
+ }
+
+ /* type does not support more levels */
+ goto fail;
+ }
+ if (nvp == NULL)
+ goto fail; /* 'name' not found */
+
+ /* search for match of next component in embedded 'nvl' list */
+ }
+
+fail: if (ep && sepp)
+ *ep = sepp;
+ return (EINVAL);
+}
+
+/*
+ * Return pointer to nvpair with specified 'name'.
+ */
+int
+nvlist_lookup_nvpair(nvlist_t *nvl, const char *name, nvpair_t **ret)
+{
+ return (nvlist_lookup_nvpair_ei_sep(nvl, name, 0, ret, NULL, NULL));
+}
+
+/*
+ * Determine if named nvpair exists in nvlist (use embedded separator of '.'
+ * and return array index). See nvlist_lookup_nvpair_ei_sep for more detailed
+ * description.
+ */
+int nvlist_lookup_nvpair_embedded_index(nvlist_t *nvl,
+ const char *name, nvpair_t **ret, int *ip, char **ep)
+{
+ return (nvlist_lookup_nvpair_ei_sep(nvl, name, '.', ret, ip, ep));
+}
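+
+/*
+ * Usage sketch (illustrative, not part of the original source; the pair
+ * names are hypothetical and the list must use NV_UNIQUE_NAME): with the
+ * '.' separator a single call descends through embedded nvlists and
+ * nvlist arrays.
+ *
+ *	nvpair_t *nvp;
+ *	int idx;
+ *
+ *	if (nvlist_lookup_nvpair_embedded_index(nvl, "top.children[2]",
+ *	    &nvp, &idx, NULL) == 0) {
+ *		... nvp is the "children" nvlist-array pair, idx == 2 ...
+ *	}
+ */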
+
+boolean_t
+nvlist_exists(nvlist_t *nvl, const char *name)
+{
+ nvpriv_t *priv;
+ nvpair_t *nvp;
+ i_nvp_t *curr;
+
+ if (name == NULL || nvl == NULL ||
+ (priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL)
+ return (B_FALSE);
+
+ for (curr = priv->nvp_list; curr != NULL; curr = curr->nvi_next) {
+ nvp = &curr->nvi_nvp;
+
+ if (strcmp(name, NVP_NAME(nvp)) == 0)
+ return (B_TRUE);
+ }
+
+ return (B_FALSE);
+}
+
+int
+nvpair_value_boolean_value(nvpair_t *nvp, boolean_t *val)
+{
+ return (nvpair_value_common(nvp, DATA_TYPE_BOOLEAN_VALUE, NULL, val));
+}
+
+int
+nvpair_value_byte(nvpair_t *nvp, uchar_t *val)
+{
+ return (nvpair_value_common(nvp, DATA_TYPE_BYTE, NULL, val));
+}
+
+int
+nvpair_value_int8(nvpair_t *nvp, int8_t *val)
+{
+ return (nvpair_value_common(nvp, DATA_TYPE_INT8, NULL, val));
+}
+
+int
+nvpair_value_uint8(nvpair_t *nvp, uint8_t *val)
+{
+ return (nvpair_value_common(nvp, DATA_TYPE_UINT8, NULL, val));
+}
+
+int
+nvpair_value_int16(nvpair_t *nvp, int16_t *val)
+{
+ return (nvpair_value_common(nvp, DATA_TYPE_INT16, NULL, val));
+}
+
+int
+nvpair_value_uint16(nvpair_t *nvp, uint16_t *val)
+{
+ return (nvpair_value_common(nvp, DATA_TYPE_UINT16, NULL, val));
+}
+
+int
+nvpair_value_int32(nvpair_t *nvp, int32_t *val)
+{
+ return (nvpair_value_common(nvp, DATA_TYPE_INT32, NULL, val));
+}
+
+int
+nvpair_value_uint32(nvpair_t *nvp, uint32_t *val)
+{
+ return (nvpair_value_common(nvp, DATA_TYPE_UINT32, NULL, val));
+}
+
+int
+nvpair_value_int64(nvpair_t *nvp, int64_t *val)
+{
+ return (nvpair_value_common(nvp, DATA_TYPE_INT64, NULL, val));
+}
+
+int
+nvpair_value_uint64(nvpair_t *nvp, uint64_t *val)
+{
+ return (nvpair_value_common(nvp, DATA_TYPE_UINT64, NULL, val));
+}
+
+#if !defined(_KERNEL)
+int
+nvpair_value_double(nvpair_t *nvp, double *val)
+{
+ return (nvpair_value_common(nvp, DATA_TYPE_DOUBLE, NULL, val));
+}
+#endif
+
+int
+nvpair_value_string(nvpair_t *nvp, char **val)
+{
+ return (nvpair_value_common(nvp, DATA_TYPE_STRING, NULL, val));
+}
+
+int
+nvpair_value_nvlist(nvpair_t *nvp, nvlist_t **val)
+{
+ return (nvpair_value_common(nvp, DATA_TYPE_NVLIST, NULL, val));
+}
+
+int
+nvpair_value_boolean_array(nvpair_t *nvp, boolean_t **val, uint_t *nelem)
+{
+ return (nvpair_value_common(nvp, DATA_TYPE_BOOLEAN_ARRAY, nelem, val));
+}
+
+int
+nvpair_value_byte_array(nvpair_t *nvp, uchar_t **val, uint_t *nelem)
+{
+ return (nvpair_value_common(nvp, DATA_TYPE_BYTE_ARRAY, nelem, val));
+}
+
+int
+nvpair_value_int8_array(nvpair_t *nvp, int8_t **val, uint_t *nelem)
+{
+ return (nvpair_value_common(nvp, DATA_TYPE_INT8_ARRAY, nelem, val));
+}
+
+int
+nvpair_value_uint8_array(nvpair_t *nvp, uint8_t **val, uint_t *nelem)
+{
+ return (nvpair_value_common(nvp, DATA_TYPE_UINT8_ARRAY, nelem, val));
+}
+
+int
+nvpair_value_int16_array(nvpair_t *nvp, int16_t **val, uint_t *nelem)
+{
+ return (nvpair_value_common(nvp, DATA_TYPE_INT16_ARRAY, nelem, val));
+}
+
+int
+nvpair_value_uint16_array(nvpair_t *nvp, uint16_t **val, uint_t *nelem)
+{
+ return (nvpair_value_common(nvp, DATA_TYPE_UINT16_ARRAY, nelem, val));
+}
+
+int
+nvpair_value_int32_array(nvpair_t *nvp, int32_t **val, uint_t *nelem)
+{
+ return (nvpair_value_common(nvp, DATA_TYPE_INT32_ARRAY, nelem, val));
+}
+
+int
+nvpair_value_uint32_array(nvpair_t *nvp, uint32_t **val, uint_t *nelem)
+{
+ return (nvpair_value_common(nvp, DATA_TYPE_UINT32_ARRAY, nelem, val));
+}
+
+int
+nvpair_value_int64_array(nvpair_t *nvp, int64_t **val, uint_t *nelem)
+{
+ return (nvpair_value_common(nvp, DATA_TYPE_INT64_ARRAY, nelem, val));
+}
+
+int
+nvpair_value_uint64_array(nvpair_t *nvp, uint64_t **val, uint_t *nelem)
+{
+ return (nvpair_value_common(nvp, DATA_TYPE_UINT64_ARRAY, nelem, val));
+}
+
+int
+nvpair_value_string_array(nvpair_t *nvp, char ***val, uint_t *nelem)
+{
+ return (nvpair_value_common(nvp, DATA_TYPE_STRING_ARRAY, nelem, val));
+}
+
+int
+nvpair_value_nvlist_array(nvpair_t *nvp, nvlist_t ***val, uint_t *nelem)
+{
+ return (nvpair_value_common(nvp, DATA_TYPE_NVLIST_ARRAY, nelem, val));
+}
+
+int
+nvpair_value_hrtime(nvpair_t *nvp, hrtime_t *val)
+{
+ return (nvpair_value_common(nvp, DATA_TYPE_HRTIME, NULL, val));
+}
+
+/*
+ * Add specified pair to the list.
+ */
+int
+nvlist_add_nvpair(nvlist_t *nvl, nvpair_t *nvp)
+{
+ if (nvl == NULL || nvp == NULL)
+ return (EINVAL);
+
+ return (nvlist_add_common(nvl, NVP_NAME(nvp), NVP_TYPE(nvp),
+ NVP_NELEM(nvp), NVP_VALUE(nvp)));
+}
+
+/*
+ * Merge the supplied nvlists and put the result in dst.
+ * The merged list will contain all names specified in both lists;
+ * in the case of duplicates the values are taken from nvl.
+ * Return 0 on success.
+ */
+/*ARGSUSED*/
+int
+nvlist_merge(nvlist_t *dst, nvlist_t *nvl, int flag)
+{
+ if (nvl == NULL || dst == NULL)
+ return (EINVAL);
+
+ if (dst != nvl)
+ return (nvlist_copy_pairs(nvl, dst));
+
+ return (0);
+}
+
+/*
+ * Encoding related routines
+ */
+#define NVS_OP_ENCODE 0
+#define NVS_OP_DECODE 1
+#define NVS_OP_GETSIZE 2
+
+typedef struct nvs_ops nvs_ops_t;
+
+typedef struct {
+ int nvs_op;
+ const nvs_ops_t *nvs_ops;
+ void *nvs_private;
+ nvpriv_t *nvs_priv;
+ int nvs_recursion;
+} nvstream_t;
+
+/*
+ * nvs operations are:
+ * - nvs_nvlist
+ * encoding / decoding of an nvlist header (nvlist_t)
+ * calculates the size used for header and end detection
+ *
+ * - nvs_nvpair
+ * responsible for the first part of encoding / decoding of an nvpair
+ * calculates the decoded size of an nvpair
+ *
+ * - nvs_nvp_op
+ * second part of encoding / decoding of an nvpair
+ *
+ * - nvs_nvp_size
+ * calculates the encoding size of an nvpair
+ *
+ * - nvs_nvl_fini
+ * encodes the end detection mark (zeros).
+ */
+struct nvs_ops {
+ int (*nvs_nvlist)(nvstream_t *, nvlist_t *, size_t *);
+ int (*nvs_nvpair)(nvstream_t *, nvpair_t *, size_t *);
+ int (*nvs_nvp_op)(nvstream_t *, nvpair_t *);
+ int (*nvs_nvp_size)(nvstream_t *, nvpair_t *, size_t *);
+ int (*nvs_nvl_fini)(nvstream_t *);
+};
+
+typedef struct {
+ char nvh_encoding; /* nvs encoding method */
+ char nvh_endian; /* nvs endian */
+ char nvh_reserved1; /* reserved for future use */
+ char nvh_reserved2; /* reserved for future use */
+} nvs_header_t;
+
+static int
+nvs_encode_pairs(nvstream_t *nvs, nvlist_t *nvl)
+{
+ nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv;
+ i_nvp_t *curr;
+
+ /*
+ * Walk nvpair in list and encode each nvpair
+ */
+ for (curr = priv->nvp_list; curr != NULL; curr = curr->nvi_next)
+ if (nvs->nvs_ops->nvs_nvpair(nvs, &curr->nvi_nvp, NULL) != 0)
+ return (EFAULT);
+
+ return (nvs->nvs_ops->nvs_nvl_fini(nvs));
+}
+
+static int
+nvs_decode_pairs(nvstream_t *nvs, nvlist_t *nvl)
+{
+ nvpair_t *nvp;
+ size_t nvsize;
+ int err;
+
+ /*
+ * Get decoded size of next pair in stream, alloc
+ * memory for nvpair_t, then decode the nvpair
+ */
+ while ((err = nvs->nvs_ops->nvs_nvpair(nvs, NULL, &nvsize)) == 0) {
+ if (nvsize == 0) /* end of list */
+ break;
+
+ /* make sure len makes sense */
+ if (nvsize < NVP_SIZE_CALC(1, 0))
+ return (EFAULT);
+
+ if ((nvp = nvp_buf_alloc(nvl, nvsize)) == NULL)
+ return (ENOMEM);
+
+ if ((err = nvs->nvs_ops->nvs_nvp_op(nvs, nvp)) != 0) {
+ nvp_buf_free(nvl, nvp);
+ return (err);
+ }
+
+ if (i_validate_nvpair(nvp) != 0) {
+ nvpair_free(nvp);
+ nvp_buf_free(nvl, nvp);
+ return (EFAULT);
+ }
+
+ err = nvt_add_nvpair(nvl, nvp);
+ if (err != 0) {
+ nvpair_free(nvp);
+ nvp_buf_free(nvl, nvp);
+ return (err);
+ }
+ nvp_buf_link(nvl, nvp);
+ }
+ return (err);
+}
+
+static int
+nvs_getsize_pairs(nvstream_t *nvs, nvlist_t *nvl, size_t *buflen)
+{
+ nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv;
+ i_nvp_t *curr;
+ uint64_t nvsize = *buflen;
+ size_t size;
+
+ /*
+ * Get encoded size of nvpairs in nvlist
+ */
+ for (curr = priv->nvp_list; curr != NULL; curr = curr->nvi_next) {
+ if (nvs->nvs_ops->nvs_nvp_size(nvs, &curr->nvi_nvp, &size) != 0)
+ return (EINVAL);
+
+ if ((nvsize += size) > INT32_MAX)
+ return (EINVAL);
+ }
+
+ *buflen = nvsize;
+ return (0);
+}
+
+static int
+nvs_operation(nvstream_t *nvs, nvlist_t *nvl, size_t *buflen)
+{
+ int err;
+
+ if (nvl->nvl_priv == 0)
+ return (EFAULT);
+
+ /*
+ * Perform the operation, starting with header, then each nvpair
+ */
+ if ((err = nvs->nvs_ops->nvs_nvlist(nvs, nvl, buflen)) != 0)
+ return (err);
+
+ switch (nvs->nvs_op) {
+ case NVS_OP_ENCODE:
+ err = nvs_encode_pairs(nvs, nvl);
+ break;
+
+ case NVS_OP_DECODE:
+ err = nvs_decode_pairs(nvs, nvl);
+ break;
+
+ case NVS_OP_GETSIZE:
+ err = nvs_getsize_pairs(nvs, nvl, buflen);
+ break;
+
+ default:
+ err = EINVAL;
+ }
+
+ return (err);
+}
+
+static int
+nvs_embedded(nvstream_t *nvs, nvlist_t *embedded)
+{
+ switch (nvs->nvs_op) {
+ case NVS_OP_ENCODE: {
+ int err;
+
+ if (nvs->nvs_recursion >= nvpair_max_recursion)
+ return (EINVAL);
+ nvs->nvs_recursion++;
+ err = nvs_operation(nvs, embedded, NULL);
+ nvs->nvs_recursion--;
+ return (err);
+ }
+ case NVS_OP_DECODE: {
+ nvpriv_t *priv;
+ int err;
+
+ if (embedded->nvl_version != NV_VERSION)
+ return (ENOTSUP);
+
+ if ((priv = nv_priv_alloc_embedded(nvs->nvs_priv)) == NULL)
+ return (ENOMEM);
+
+ nvlist_init(embedded, embedded->nvl_nvflag, priv);
+
+ if (nvs->nvs_recursion >= nvpair_max_recursion) {
+ nvlist_free(embedded);
+ return (EINVAL);
+ }
+ nvs->nvs_recursion++;
+ if ((err = nvs_operation(nvs, embedded, NULL)) != 0)
+ nvlist_free(embedded);
+ nvs->nvs_recursion--;
+ return (err);
+ }
+ default:
+ break;
+ }
+
+ return (EINVAL);
+}
+
+static int
+nvs_embedded_nvl_array(nvstream_t *nvs, nvpair_t *nvp, size_t *size)
+{
+ size_t nelem = NVP_NELEM(nvp);
+ nvlist_t **nvlp = EMBEDDED_NVL_ARRAY(nvp);
+ int i;
+
+ switch (nvs->nvs_op) {
+ case NVS_OP_ENCODE:
+ for (i = 0; i < nelem; i++)
+ if (nvs_embedded(nvs, nvlp[i]) != 0)
+ return (EFAULT);
+ break;
+
+ case NVS_OP_DECODE: {
+ size_t len = nelem * sizeof (uint64_t);
+ nvlist_t *embedded = (nvlist_t *)((uintptr_t)nvlp + len);
+
+ bzero(nvlp, len); /* don't trust packed data */
+ for (i = 0; i < nelem; i++) {
+ if (nvs_embedded(nvs, embedded) != 0) {
+ nvpair_free(nvp);
+ return (EFAULT);
+ }
+
+ nvlp[i] = embedded++;
+ }
+ break;
+ }
+ case NVS_OP_GETSIZE: {
+ uint64_t nvsize = 0;
+
+ for (i = 0; i < nelem; i++) {
+ size_t nvp_sz = 0;
+
+ if (nvs_operation(nvs, nvlp[i], &nvp_sz) != 0)
+ return (EINVAL);
+
+ if ((nvsize += nvp_sz) > INT32_MAX)
+ return (EINVAL);
+ }
+
+ *size = nvsize;
+ break;
+ }
+ default:
+ return (EINVAL);
+ }
+
+ return (0);
+}
+
+static int nvs_native(nvstream_t *, nvlist_t *, char *, size_t *);
+static int nvs_xdr(nvstream_t *, nvlist_t *, char *, size_t *);
+
+/*
+ * Common routine for nvlist operations:
+ * encode, decode, getsize (encoded size).
+ */
+static int
+nvlist_common(nvlist_t *nvl, char *buf, size_t *buflen, int encoding,
+ int nvs_op)
+{
+ int err = 0;
+ nvstream_t nvs;
+ int nvl_endian;
+#if defined(_ZFS_LITTLE_ENDIAN)
+ int host_endian = 1;
+#elif defined(_ZFS_BIG_ENDIAN)
+ int host_endian = 0;
+#else
+#error "No endian defined!"
+#endif /* _ZFS_LITTLE_ENDIAN */
+ nvs_header_t *nvh;
+
+ if (buflen == NULL || nvl == NULL ||
+ (nvs.nvs_priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL)
+ return (EINVAL);
+
+ nvs.nvs_op = nvs_op;
+ nvs.nvs_recursion = 0;
+
+ /*
+ * For NVS_OP_ENCODE and NVS_OP_DECODE make sure an nvlist and
+ * a buffer are allocated. The first 4 bytes in the buffer are
+ * used for encoding method and host endian.
+ */
+ switch (nvs_op) {
+ case NVS_OP_ENCODE:
+ if (buf == NULL || *buflen < sizeof (nvs_header_t))
+ return (EINVAL);
+
+ nvh = (void *)buf;
+ nvh->nvh_encoding = encoding;
+ nvh->nvh_endian = nvl_endian = host_endian;
+ nvh->nvh_reserved1 = 0;
+ nvh->nvh_reserved2 = 0;
+ break;
+
+ case NVS_OP_DECODE:
+ if (buf == NULL || *buflen < sizeof (nvs_header_t))
+ return (EINVAL);
+
+ /* get method of encoding from first byte */
+ nvh = (void *)buf;
+ encoding = nvh->nvh_encoding;
+ nvl_endian = nvh->nvh_endian;
+ break;
+
+ case NVS_OP_GETSIZE:
+ nvl_endian = host_endian;
+
+ /*
+ * add the size for encoding
+ */
+ *buflen = sizeof (nvs_header_t);
+ break;
+
+ default:
+ return (ENOTSUP);
+ }
+
+ /*
+ * Create an nvstream with proper encoding method
+ */
+ switch (encoding) {
+ case NV_ENCODE_NATIVE:
+ /*
+ * check endianness, in case we are unpacking
+ * from a file
+ */
+ if (nvl_endian != host_endian)
+ return (ENOTSUP);
+ err = nvs_native(&nvs, nvl, buf, buflen);
+ break;
+ case NV_ENCODE_XDR:
+ err = nvs_xdr(&nvs, nvl, buf, buflen);
+ break;
+ default:
+ err = ENOTSUP;
+ break;
+ }
+
+ return (err);
+}
+
+int
+nvlist_size(nvlist_t *nvl, size_t *size, int encoding)
+{
+ return (nvlist_common(nvl, NULL, size, encoding, NVS_OP_GETSIZE));
+}
+
+/*
+ * Pack nvlist into contiguous memory
+ */
+int
+nvlist_pack(nvlist_t *nvl, char **bufp, size_t *buflen, int encoding,
+ int kmflag)
+{
+ return (nvlist_xpack(nvl, bufp, buflen, encoding,
+ nvlist_nv_alloc(kmflag)));
+}
+
+int
+nvlist_xpack(nvlist_t *nvl, char **bufp, size_t *buflen, int encoding,
+ nv_alloc_t *nva)
+{
+ nvpriv_t nvpriv;
+ size_t alloc_size;
+ char *buf;
+ int err;
+
+ if (nva == NULL || nvl == NULL || bufp == NULL || buflen == NULL)
+ return (EINVAL);
+
+ if (*bufp != NULL)
+ return (nvlist_common(nvl, *bufp, buflen, encoding,
+ NVS_OP_ENCODE));
+
+ /*
+ * Here is a difficult situation:
+ * 1. The nvlist has fixed allocator properties.
+ * All other nvlist routines (like nvlist_add_*, ...) use
+ * these properties.
+ * 2. When using nvlist_pack() the user can specify their own
+ * allocator properties (e.g. by using KM_NOSLEEP).
+ *
+ * We use the user-specified properties (2). A clearer solution
+ * would be to remove the kmflag from nvlist_pack(), but we will
+ * not change the interface.
+ */
+ nv_priv_init(&nvpriv, nva, 0);
+
+ if ((err = nvlist_size(nvl, &alloc_size, encoding)))
+ return (err);
+
+ if ((buf = nv_mem_zalloc(&nvpriv, alloc_size)) == NULL)
+ return (ENOMEM);
+
+ if ((err = nvlist_common(nvl, buf, &alloc_size, encoding,
+ NVS_OP_ENCODE)) != 0) {
+ nv_mem_free(&nvpriv, buf, alloc_size);
+ } else {
+ *buflen = alloc_size;
+ *bufp = buf;
+ }
+
+ return (err);
+}
+
+/*
+ * Unpack buf into an nvlist_t
+ */
+int
+nvlist_unpack(char *buf, size_t buflen, nvlist_t **nvlp, int kmflag)
+{
+ return (nvlist_xunpack(buf, buflen, nvlp, nvlist_nv_alloc(kmflag)));
+}
+
+int
+nvlist_xunpack(char *buf, size_t buflen, nvlist_t **nvlp, nv_alloc_t *nva)
+{
+ nvlist_t *nvl;
+ int err;
+
+ if (nvlp == NULL)
+ return (EINVAL);
+
+ if ((err = nvlist_xalloc(&nvl, 0, nva)) != 0)
+ return (err);
+
+ if ((err = nvlist_common(nvl, buf, &buflen, NV_ENCODE_NATIVE,
+ NVS_OP_DECODE)) != 0)
+ nvlist_free(nvl);
+ else
+ *nvlp = nvl;
+
+ return (err);
+}
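+
+/*
+ * Round-trip sketch (illustrative, not part of the original source): with
+ * *bufp == NULL, nvlist_pack() sizes and allocates the buffer itself, and
+ * nvlist_unpack() reads the encoding method back out of the buffer header.
+ *
+ *	char *buf = NULL;
+ *	size_t len = 0;
+ *	nvlist_t *copy = NULL;
+ *
+ *	error = nvlist_pack(nvl, &buf, &len, NV_ENCODE_NATIVE, KM_SLEEP);
+ *	if (error == 0)
+ *		error = nvlist_unpack(buf, len, &copy, KM_SLEEP);
+ */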
+
+/*
+ * Native encoding functions
+ */
+typedef struct {
+ /*
+ * This structure is used when decoding a packed nvpair in
+ * the native format. n_base points to a buffer containing the
+ * packed nvpair. n_end is a pointer to the end of the buffer.
+ * (n_end actually points to the first byte past the end of the
+ * buffer.) n_curr is a pointer that lies between n_base and n_end.
+ * It points to the current data that we are decoding.
+ * The amount of data left in the buffer is equal to n_end - n_curr.
+ * n_flag is used to recognize a packed embedded list.
+ */
+ caddr_t n_base;
+ caddr_t n_end;
+ caddr_t n_curr;
+ uint_t n_flag;
+} nvs_native_t;
+
+static int
+nvs_native_create(nvstream_t *nvs, nvs_native_t *native, char *buf,
+ size_t buflen)
+{
+ switch (nvs->nvs_op) {
+ case NVS_OP_ENCODE:
+ case NVS_OP_DECODE:
+ nvs->nvs_private = native;
+ native->n_curr = native->n_base = buf;
+ native->n_end = buf + buflen;
+ native->n_flag = 0;
+ return (0);
+
+ case NVS_OP_GETSIZE:
+ nvs->nvs_private = native;
+ native->n_curr = native->n_base = native->n_end = NULL;
+ native->n_flag = 0;
+ return (0);
+ default:
+ return (EINVAL);
+ }
+}
+
+/*ARGSUSED*/
+static void
+nvs_native_destroy(nvstream_t *nvs)
+{
+}
+
+static int
+native_cp(nvstream_t *nvs, void *buf, size_t size)
+{
+ nvs_native_t *native = (nvs_native_t *)nvs->nvs_private;
+
+ if (native->n_curr + size > native->n_end)
+ return (EFAULT);
+
+ /*
+ * The bcopy() below eliminates the alignment requirement
+ * on the buffer (stream) and is preferred over direct access.
+ */
+ switch (nvs->nvs_op) {
+ case NVS_OP_ENCODE:
+ bcopy(buf, native->n_curr, size);
+ break;
+ case NVS_OP_DECODE:
+ bcopy(native->n_curr, buf, size);
+ break;
+ default:
+ return (EINVAL);
+ }
+
+ native->n_curr += size;
+ return (0);
+}
+
+/*
+ * operate on nvlist_t header
+ */
+static int
+nvs_native_nvlist(nvstream_t *nvs, nvlist_t *nvl, size_t *size)
+{
+ nvs_native_t *native = nvs->nvs_private;
+
+ switch (nvs->nvs_op) {
+ case NVS_OP_ENCODE:
+ case NVS_OP_DECODE:
+ if (native->n_flag)
+ return (0); /* packed embedded list */
+
+ native->n_flag = 1;
+
+ /* copy version and nvflag of the nvlist_t */
+ if (native_cp(nvs, &nvl->nvl_version, sizeof (int32_t)) != 0 ||
+ native_cp(nvs, &nvl->nvl_nvflag, sizeof (int32_t)) != 0)
+ return (EFAULT);
+
+ return (0);
+
+ case NVS_OP_GETSIZE:
+ /*
+ * if calculate for packed embedded list
+ * 4 for end of the embedded list
+ * else
+ * 2 * sizeof (int32_t) for nvl_version and nvl_nvflag
+ * and 4 for end of the entire list
+ */
+ if (native->n_flag) {
+ *size += 4;
+ } else {
+ native->n_flag = 1;
+ *size += 2 * sizeof (int32_t) + 4;
+ }
+
+ return (0);
+
+ default:
+ return (EINVAL);
+ }
+}
+
+static int
+nvs_native_nvl_fini(nvstream_t *nvs)
+{
+ if (nvs->nvs_op == NVS_OP_ENCODE) {
+ nvs_native_t *native = (nvs_native_t *)nvs->nvs_private;
+ /*
+ * Add 4 zero bytes at end of nvlist. They are used
+ * for end detection by the decode routine.
+ */
+ if (native->n_curr + sizeof (int) > native->n_end)
+ return (EFAULT);
+
+ bzero(native->n_curr, sizeof (int));
+ native->n_curr += sizeof (int);
+ }
+
+ return (0);
+}
+
+static int
+nvpair_native_embedded(nvstream_t *nvs, nvpair_t *nvp)
+{
+ if (nvs->nvs_op == NVS_OP_ENCODE) {
+ nvs_native_t *native = (nvs_native_t *)nvs->nvs_private;
+ nvlist_t *packed = (void *)
+ (native->n_curr - nvp->nvp_size + NVP_VALOFF(nvp));
+ /*
+ * Null out the pointer that is meaningless in the packed
+ * structure. The address may not be aligned, so we have
+ * to use bzero.
+ */
+ bzero((char *)packed + offsetof(nvlist_t, nvl_priv),
+ sizeof (uint64_t));
+ }
+
+ return (nvs_embedded(nvs, EMBEDDED_NVL(nvp)));
+}
+
+static int
+nvpair_native_embedded_array(nvstream_t *nvs, nvpair_t *nvp)
+{
+ if (nvs->nvs_op == NVS_OP_ENCODE) {
+ nvs_native_t *native = (nvs_native_t *)nvs->nvs_private;
+ char *value = native->n_curr - nvp->nvp_size + NVP_VALOFF(nvp);
+ size_t len = NVP_NELEM(nvp) * sizeof (uint64_t);
+ nvlist_t *packed = (nvlist_t *)((uintptr_t)value + len);
+ int i;
+ /*
+ * Null out pointers that are meaningless in the packed
+ * structure. The addresses may not be aligned, so we have
+ * to use bzero.
+ */
+ bzero(value, len);
+
+ for (i = 0; i < NVP_NELEM(nvp); i++, packed++)
+ /*
+ * Null out the pointer that is meaningless in the
+ * packed structure. The address may not be aligned,
+ * so we have to use bzero.
+ */
+ bzero((char *)packed + offsetof(nvlist_t, nvl_priv),
+ sizeof (uint64_t));
+ }
+
+ return (nvs_embedded_nvl_array(nvs, nvp, NULL));
+}
+
+static void
+nvpair_native_string_array(nvstream_t *nvs, nvpair_t *nvp)
+{
+ switch (nvs->nvs_op) {
+ case NVS_OP_ENCODE: {
+ nvs_native_t *native = (nvs_native_t *)nvs->nvs_private;
+ uint64_t *strp = (void *)
+ (native->n_curr - nvp->nvp_size + NVP_VALOFF(nvp));
+ /*
+ * Null out pointers that are meaningless in the packed
+ * structure. The addresses may not be aligned, so we have
+ * to use bzero.
+ */
+ bzero(strp, NVP_NELEM(nvp) * sizeof (uint64_t));
+ break;
+ }
+ case NVS_OP_DECODE: {
+ char **strp = (void *)NVP_VALUE(nvp);
+ char *buf = ((char *)strp + NVP_NELEM(nvp) * sizeof (uint64_t));
+ int i;
+
+ for (i = 0; i < NVP_NELEM(nvp); i++) {
+ strp[i] = buf;
+ buf += strlen(buf) + 1;
+ }
+ break;
+ }
+ }
+}
+
+static int
+nvs_native_nvp_op(nvstream_t *nvs, nvpair_t *nvp)
+{
+ data_type_t type;
+ int value_sz;
+ int ret = 0;
+
+ /*
+ * We do the initial bcopy of the data before we look at
+ * the nvpair type, because when we're decoding, we won't
+ * have the correct values for the pair until we do the bcopy.
+ */
+ switch (nvs->nvs_op) {
+ case NVS_OP_ENCODE:
+ case NVS_OP_DECODE:
+ if (native_cp(nvs, nvp, nvp->nvp_size) != 0)
+ return (EFAULT);
+ break;
+ default:
+ return (EINVAL);
+ }
+
+ /* verify nvp_name_sz, check the name string length */
+ if (i_validate_nvpair_name(nvp) != 0)
+ return (EFAULT);
+
+ type = NVP_TYPE(nvp);
+
+ /*
+ * Verify type and nelem and get the value size.
+ * For DATA_TYPE_STRING and DATA_TYPE_STRING_ARRAY the size of
+ * the string(s) is excluded.
+ */
+ if ((value_sz = i_get_value_size(type, NULL, NVP_NELEM(nvp))) < 0)
+ return (EFAULT);
+
+ if (NVP_SIZE_CALC(nvp->nvp_name_sz, value_sz) > nvp->nvp_size)
+ return (EFAULT);
+
+ switch (type) {
+ case DATA_TYPE_NVLIST:
+ ret = nvpair_native_embedded(nvs, nvp);
+ break;
+ case DATA_TYPE_NVLIST_ARRAY:
+ ret = nvpair_native_embedded_array(nvs, nvp);
+ break;
+ case DATA_TYPE_STRING_ARRAY:
+ nvpair_native_string_array(nvs, nvp);
+ break;
+ default:
+ break;
+ }
+
+ return (ret);
+}
+
+static int
+nvs_native_nvp_size(nvstream_t *nvs, nvpair_t *nvp, size_t *size)
+{
+ uint64_t nvp_sz = nvp->nvp_size;
+
+ switch (NVP_TYPE(nvp)) {
+ case DATA_TYPE_NVLIST: {
+ size_t nvsize = 0;
+
+ if (nvs_operation(nvs, EMBEDDED_NVL(nvp), &nvsize) != 0)
+ return (EINVAL);
+
+ nvp_sz += nvsize;
+ break;
+ }
+ case DATA_TYPE_NVLIST_ARRAY: {
+ size_t nvsize;
+
+ if (nvs_embedded_nvl_array(nvs, nvp, &nvsize) != 0)
+ return (EINVAL);
+
+ nvp_sz += nvsize;
+ break;
+ }
+ default:
+ break;
+ }
+
+ if (nvp_sz > INT32_MAX)
+ return (EINVAL);
+
+ *size = nvp_sz;
+
+ return (0);
+}
+
+static int
+nvs_native_nvpair(nvstream_t *nvs, nvpair_t *nvp, size_t *size)
+{
+ switch (nvs->nvs_op) {
+ case NVS_OP_ENCODE:
+ return (nvs_native_nvp_op(nvs, nvp));
+
+ case NVS_OP_DECODE: {
+ nvs_native_t *native = (nvs_native_t *)nvs->nvs_private;
+ int32_t decode_len;
+
+ /* try to read the size value from the stream */
+ if (native->n_curr + sizeof (int32_t) > native->n_end)
+ return (EFAULT);
+ bcopy(native->n_curr, &decode_len, sizeof (int32_t));
+
+ /* sanity check the size value */
+ if (decode_len < 0 ||
+ decode_len > native->n_end - native->n_curr)
+ return (EFAULT);
+
+ *size = decode_len;
+
+ /*
+ * If at the end of the stream then move the cursor
+ * forward, otherwise nvs_native_nvp_op() will read
+ * the entire nvpair at the same cursor position.
+ */
+ if (*size == 0)
+ native->n_curr += sizeof (int32_t);
+ break;
+ }
+
+ default:
+ return (EINVAL);
+ }
+
+ return (0);
+}
+
+static const nvs_ops_t nvs_native_ops = {
+ .nvs_nvlist = nvs_native_nvlist,
+ .nvs_nvpair = nvs_native_nvpair,
+ .nvs_nvp_op = nvs_native_nvp_op,
+ .nvs_nvp_size = nvs_native_nvp_size,
+ .nvs_nvl_fini = nvs_native_nvl_fini
+};
+
+static int
+nvs_native(nvstream_t *nvs, nvlist_t *nvl, char *buf, size_t *buflen)
+{
+ nvs_native_t native;
+ int err;
+
+ nvs->nvs_ops = &nvs_native_ops;
+
+ if ((err = nvs_native_create(nvs, &native, buf + sizeof (nvs_header_t),
+ *buflen - sizeof (nvs_header_t))) != 0)
+ return (err);
+
+ err = nvs_operation(nvs, nvl, buflen);
+
+ nvs_native_destroy(nvs);
+
+ return (err);
+}
+
+/*
+ * XDR encoding functions
+ *
+ * An xdr packed nvlist is encoded as:
+ *
+ * - encoding method and host endian (4 bytes)
+ * - nvl_version (4 bytes)
+ * - nvl_nvflag (4 bytes)
+ *
+ * - encoded nvpairs, the format of one xdr encoded nvpair is:
+ * - encoded size of the nvpair (4 bytes)
+ * - decoded size of the nvpair (4 bytes)
+ * - name string (4 + NV_ALIGN4(strlen(string)) bytes;
+ * a string is coded as its length (4 bytes) followed by the padded data)
+ * - data type (4 bytes)
+ * - number of elements in the nvpair (4 bytes)
+ * - data
+ *
+ * - 2 zero's for end of the entire list (8 bytes)
+ */
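+
+/*
+ * Worked example (illustrative, not part of the original source): an
+ * nvlist holding a single uint32 pair named "foo" packs under
+ * NV_ENCODE_XDR as
+ *
+ *	4	header (encoding method, endian, 2 reserved bytes)
+ *	4 + 4	nvl_version and nvl_nvflag
+ *	4 + 4	encoded size and decoded size of the nvpair
+ *	4 + 4	name length and "foo" padded to a 4 byte boundary
+ *	4 + 4	data type and nelem
+ *	4	the uint32 value
+ *	8	two zero words ending the list
+ *
+ * for 48 bytes in total, which is the figure nvlist_size(nvl, &sz,
+ * NV_ENCODE_XDR) reports for such a list.
+ */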
+static int
+nvs_xdr_create(nvstream_t *nvs, XDR *xdr, char *buf, size_t buflen)
+{
+ /* xdr data must be 4 byte aligned */
+ if ((ulong_t)buf % 4 != 0)
+ return (EFAULT);
+
+ switch (nvs->nvs_op) {
+ case NVS_OP_ENCODE:
+ xdrmem_create(xdr, buf, (uint_t)buflen, XDR_ENCODE);
+ nvs->nvs_private = xdr;
+ return (0);
+ case NVS_OP_DECODE:
+ xdrmem_create(xdr, buf, (uint_t)buflen, XDR_DECODE);
+ nvs->nvs_private = xdr;
+ return (0);
+ case NVS_OP_GETSIZE:
+ nvs->nvs_private = NULL;
+ return (0);
+ default:
+ return (EINVAL);
+ }
+}
+
+static void
+nvs_xdr_destroy(nvstream_t *nvs)
+{
+ switch (nvs->nvs_op) {
+ case NVS_OP_ENCODE:
+ case NVS_OP_DECODE:
+ xdr_destroy((XDR *)nvs->nvs_private);
+ break;
+ default:
+ break;
+ }
+}
+
+static int
+nvs_xdr_nvlist(nvstream_t *nvs, nvlist_t *nvl, size_t *size)
+{
+ switch (nvs->nvs_op) {
+ case NVS_OP_ENCODE:
+ case NVS_OP_DECODE: {
+ XDR *xdr = nvs->nvs_private;
+
+ if (!xdr_int(xdr, &nvl->nvl_version) ||
+ !xdr_u_int(xdr, &nvl->nvl_nvflag))
+ return (EFAULT);
+ break;
+ }
+ case NVS_OP_GETSIZE: {
+ /*
+ * 2 * 4 for nvl_version + nvl_nvflag
+ * and 8 for end of the entire list
+ */
+ *size += 2 * 4 + 8;
+ break;
+ }
+ default:
+ return (EINVAL);
+ }
+ return (0);
+}
+
+static int
+nvs_xdr_nvl_fini(nvstream_t *nvs)
+{
+ if (nvs->nvs_op == NVS_OP_ENCODE) {
+ XDR *xdr = nvs->nvs_private;
+ int zero = 0;
+
+ if (!xdr_int(xdr, &zero) || !xdr_int(xdr, &zero))
+ return (EFAULT);
+ }
+
+ return (0);
+}
+
+/*
+ * The format of xdr encoded nvpair is:
+ * encode_size, decode_size, name string, data type, nelem, data
+ */
+static int
+nvs_xdr_nvp_op(nvstream_t *nvs, nvpair_t *nvp)
+{
+ data_type_t type;
+ char *buf;
+ char *buf_end = (char *)nvp + nvp->nvp_size;
+ int value_sz;
+ uint_t nelem, buflen;
+ bool_t ret = FALSE;
+ XDR *xdr = nvs->nvs_private;
+
+ ASSERT(xdr != NULL && nvp != NULL);
+
+ /* name string */
+ if ((buf = NVP_NAME(nvp)) >= buf_end)
+ return (EFAULT);
+ buflen = buf_end - buf;
+
+ if (!xdr_string(xdr, &buf, buflen - 1))
+ return (EFAULT);
+ nvp->nvp_name_sz = strlen(buf) + 1;
+
+ /* type and nelem */
+ if (!xdr_int(xdr, (int *)&nvp->nvp_type) ||
+ !xdr_int(xdr, &nvp->nvp_value_elem))
+ return (EFAULT);
+
+ type = NVP_TYPE(nvp);
+ nelem = nvp->nvp_value_elem;
+
+ /*
+ * Verify type and nelem and get the value size.
+ * For DATA_TYPE_STRING and DATA_TYPE_STRING_ARRAY the size of
+ * the string(s) is excluded.
+ */
+ if ((value_sz = i_get_value_size(type, NULL, nelem)) < 0)
+ return (EFAULT);
+
+ /* if there is no data to extract then return */
+ if (nelem == 0)
+ return (0);
+
+ /* value */
+ if ((buf = NVP_VALUE(nvp)) >= buf_end)
+ return (EFAULT);
+ buflen = buf_end - buf;
+
+ if (buflen < value_sz)
+ return (EFAULT);
+
+ switch (type) {
+ case DATA_TYPE_NVLIST:
+ if (nvs_embedded(nvs, (void *)buf) == 0)
+ return (0);
+ break;
+
+ case DATA_TYPE_NVLIST_ARRAY:
+ if (nvs_embedded_nvl_array(nvs, nvp, NULL) == 0)
+ return (0);
+ break;
+
+ case DATA_TYPE_BOOLEAN:
+ ret = TRUE;
+ break;
+
+ case DATA_TYPE_BYTE:
+ case DATA_TYPE_INT8:
+ case DATA_TYPE_UINT8:
+ ret = xdr_char(xdr, buf);
+ break;
+
+ case DATA_TYPE_INT16:
+ ret = xdr_short(xdr, (void *)buf);
+ break;
+
+ case DATA_TYPE_UINT16:
+ ret = xdr_u_short(xdr, (void *)buf);
+ break;
+
+ case DATA_TYPE_BOOLEAN_VALUE:
+ case DATA_TYPE_INT32:
+ ret = xdr_int(xdr, (void *)buf);
+ break;
+
+ case DATA_TYPE_UINT32:
+ ret = xdr_u_int(xdr, (void *)buf);
+ break;
+
+ case DATA_TYPE_INT64:
+ ret = xdr_longlong_t(xdr, (void *)buf);
+ break;
+
+ case DATA_TYPE_UINT64:
+ ret = xdr_u_longlong_t(xdr, (void *)buf);
+ break;
+
+ case DATA_TYPE_HRTIME:
+ /*
+ * NOTE: must expose the definition of hrtime_t here
+ */
+ ret = xdr_longlong_t(xdr, (void *)buf);
+ break;
+#if !defined(_KERNEL)
+ case DATA_TYPE_DOUBLE:
+ ret = xdr_double(xdr, (void *)buf);
+ break;
+#endif
+ case DATA_TYPE_STRING:
+ ret = xdr_string(xdr, &buf, buflen - 1);
+ break;
+
+ case DATA_TYPE_BYTE_ARRAY:
+ ret = xdr_opaque(xdr, buf, nelem);
+ break;
+
+ case DATA_TYPE_INT8_ARRAY:
+ case DATA_TYPE_UINT8_ARRAY:
+ ret = xdr_array(xdr, &buf, &nelem, buflen, sizeof (int8_t),
+ (xdrproc_t)xdr_char);
+ break;
+
+ case DATA_TYPE_INT16_ARRAY:
+ ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (int16_t),
+ sizeof (int16_t), (xdrproc_t)xdr_short);
+ break;
+
+ case DATA_TYPE_UINT16_ARRAY:
+ ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (uint16_t),
+ sizeof (uint16_t), (xdrproc_t)xdr_u_short);
+ break;
+
+ case DATA_TYPE_BOOLEAN_ARRAY:
+ case DATA_TYPE_INT32_ARRAY:
+ ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (int32_t),
+ sizeof (int32_t), (xdrproc_t)xdr_int);
+ break;
+
+ case DATA_TYPE_UINT32_ARRAY:
+ ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (uint32_t),
+ sizeof (uint32_t), (xdrproc_t)xdr_u_int);
+ break;
+
+ case DATA_TYPE_INT64_ARRAY:
+ ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (int64_t),
+ sizeof (int64_t), (xdrproc_t)xdr_longlong_t);
+ break;
+
+ case DATA_TYPE_UINT64_ARRAY:
+ ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (uint64_t),
+ sizeof (uint64_t), (xdrproc_t)xdr_u_longlong_t);
+ break;
+
+ case DATA_TYPE_STRING_ARRAY: {
+ size_t len = nelem * sizeof (uint64_t);
+ char **strp = (void *)buf;
+ int i;
+
+ if (nvs->nvs_op == NVS_OP_DECODE)
+ bzero(buf, len); /* don't trust packed data */
+
+ for (i = 0; i < nelem; i++) {
+ if (buflen <= len)
+ return (EFAULT);
+
+ buf += len;
+ buflen -= len;
+
+ if (xdr_string(xdr, &buf, buflen - 1) != TRUE)
+ return (EFAULT);
+
+ if (nvs->nvs_op == NVS_OP_DECODE)
+ strp[i] = buf;
+ len = strlen(buf) + 1;
+ }
+ ret = TRUE;
+ break;
+ }
+ default:
+ break;
+ }
+
+ return (ret == TRUE ? 0 : EFAULT);
+}
+
+static int
+nvs_xdr_nvp_size(nvstream_t *nvs, nvpair_t *nvp, size_t *size)
+{
+ data_type_t type = NVP_TYPE(nvp);
+ /*
+ * encode_size + decode_size + name string size + data type + nelem
+ * where name string size = 4 + NV_ALIGN4(strlen(NVP_NAME(nvp)))
+ */
+ uint64_t nvp_sz = 4 + 4 + 4 + NV_ALIGN4(strlen(NVP_NAME(nvp))) + 4 + 4;
+
+ switch (type) {
+ case DATA_TYPE_BOOLEAN:
+ break;
+
+ case DATA_TYPE_BOOLEAN_VALUE:
+ case DATA_TYPE_BYTE:
+ case DATA_TYPE_INT8:
+ case DATA_TYPE_UINT8:
+ case DATA_TYPE_INT16:
+ case DATA_TYPE_UINT16:
+ case DATA_TYPE_INT32:
+ case DATA_TYPE_UINT32:
+ nvp_sz += 4; /* 4 is the minimum xdr unit */
+ break;
+
+ case DATA_TYPE_INT64:
+ case DATA_TYPE_UINT64:
+ case DATA_TYPE_HRTIME:
+#if !defined(_KERNEL)
+ case DATA_TYPE_DOUBLE:
+#endif
+ nvp_sz += 8;
+ break;
+
+ case DATA_TYPE_STRING:
+ nvp_sz += 4 + NV_ALIGN4(strlen((char *)NVP_VALUE(nvp)));
+ break;
+
+ case DATA_TYPE_BYTE_ARRAY:
+ nvp_sz += NV_ALIGN4(NVP_NELEM(nvp));
+ break;
+
+ case DATA_TYPE_BOOLEAN_ARRAY:
+ case DATA_TYPE_INT8_ARRAY:
+ case DATA_TYPE_UINT8_ARRAY:
+ case DATA_TYPE_INT16_ARRAY:
+ case DATA_TYPE_UINT16_ARRAY:
+ case DATA_TYPE_INT32_ARRAY:
+ case DATA_TYPE_UINT32_ARRAY:
+ nvp_sz += 4 + 4 * (uint64_t)NVP_NELEM(nvp);
+ break;
+
+ case DATA_TYPE_INT64_ARRAY:
+ case DATA_TYPE_UINT64_ARRAY:
+ nvp_sz += 4 + 8 * (uint64_t)NVP_NELEM(nvp);
+ break;
+
+ case DATA_TYPE_STRING_ARRAY: {
+ int i;
+ char **strs = (void *)NVP_VALUE(nvp);
+
+ for (i = 0; i < NVP_NELEM(nvp); i++)
+ nvp_sz += 4 + NV_ALIGN4(strlen(strs[i]));
+
+ break;
+ }
+
+ case DATA_TYPE_NVLIST:
+ case DATA_TYPE_NVLIST_ARRAY: {
+ size_t nvsize = 0;
+ int old_nvs_op = nvs->nvs_op;
+ int err;
+
+ nvs->nvs_op = NVS_OP_GETSIZE;
+ if (type == DATA_TYPE_NVLIST)
+ err = nvs_operation(nvs, EMBEDDED_NVL(nvp), &nvsize);
+ else
+ err = nvs_embedded_nvl_array(nvs, nvp, &nvsize);
+ nvs->nvs_op = old_nvs_op;
+
+ if (err != 0)
+ return (EINVAL);
+
+ nvp_sz += nvsize;
+ break;
+ }
+
+ default:
+ return (EINVAL);
+ }
+
+ if (nvp_sz > INT32_MAX)
+ return (EINVAL);
+
+ *size = nvp_sz;
+
+ return (0);
+}
+
+
+/*
+ * The NVS_XDR_MAX_LEN macro takes a packed xdr buffer of size x and estimates
+ * the largest nvpair that could be encoded in the buffer.
+ *
+ * See the comments above nvs_xdr_nvp_op() for the format of xdr encoding.
+ * The size of an xdr packed nvpair without any data is 5 words.
+ *
+ * Using the size of the data directly as an estimate would be ok
+ * in all cases except one. If the data type is DATA_TYPE_STRING_ARRAY
+ * then the actual nvpair has space for an array of pointers to index
+ * the strings. These pointers are not encoded into the packed xdr buffer.
+ *
+ * If the data is of type DATA_TYPE_STRING_ARRAY and all the strings are
+ * of length 0, then each string is encoded in xdr format as a single word.
+ * Therefore when expanded to an nvpair there will be 2.25 words used for
+ * each string (an int64_t allocated for pointer usage, and a single char
+ * for the null termination).
+ *
+ * This is the calculation performed by the NVS_XDR_MAX_LEN macro.
+ */
+#define NVS_XDR_HDR_LEN ((size_t)(5 * 4))
+#define NVS_XDR_DATA_LEN(y) (((size_t)(y) <= NVS_XDR_HDR_LEN) ? \
+ 0 : ((size_t)(y) - NVS_XDR_HDR_LEN))
+#define NVS_XDR_MAX_LEN(x) (NVP_SIZE_CALC(1, 0) + \
+ (NVS_XDR_DATA_LEN(x) * 2) + \
+ NV_ALIGN4((NVS_XDR_DATA_LEN(x) / 4)))
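+
+/*
+ * Worked arithmetic for the macros above (illustrative, not part of the
+ * original source): a zero-length string costs one 4-byte XDR word in the
+ * packed buffer but expands to a pointer slot (8 bytes) plus its NUL
+ * terminator (1 byte) once unpacked, i.e. 9 bytes or 2.25 words per packed
+ * word. NVS_XDR_MAX_LEN() mirrors that worst case: twice the packed data
+ * length for the pointer slots plus a quarter of it, rounded up to a
+ * 4 byte boundary, for the terminators, on top of the size of an empty
+ * nvpair (NVP_SIZE_CALC(1, 0)).
+ */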
+
+static int
+nvs_xdr_nvpair(nvstream_t *nvs, nvpair_t *nvp, size_t *size)
+{
+ XDR *xdr = nvs->nvs_private;
+ int32_t encode_len, decode_len;
+
+ switch (nvs->nvs_op) {
+ case NVS_OP_ENCODE: {
+ size_t nvsize;
+
+ if (nvs_xdr_nvp_size(nvs, nvp, &nvsize) != 0)
+ return (EFAULT);
+
+ decode_len = nvp->nvp_size;
+ encode_len = nvsize;
+ if (!xdr_int(xdr, &encode_len) || !xdr_int(xdr, &decode_len))
+ return (EFAULT);
+
+ return (nvs_xdr_nvp_op(nvs, nvp));
+ }
+ case NVS_OP_DECODE: {
+ struct xdr_bytesrec bytesrec;
+
+ /* get the encode and decode size */
+ if (!xdr_int(xdr, &encode_len) || !xdr_int(xdr, &decode_len))
+ return (EFAULT);
+ *size = decode_len;
+
+ /* are we at the end of the stream? */
+ if (*size == 0)
+ return (0);
+
+ /* sanity check the size parameter */
+ if (!xdr_control(xdr, XDR_GET_BYTES_AVAIL, &bytesrec))
+ return (EFAULT);
+
+ if (*size > NVS_XDR_MAX_LEN(bytesrec.xc_num_avail))
+ return (EFAULT);
+ break;
+ }
+
+ default:
+ return (EINVAL);
+ }
+ return (0);
+}
+
+static const struct nvs_ops nvs_xdr_ops = {
+ .nvs_nvlist = nvs_xdr_nvlist,
+ .nvs_nvpair = nvs_xdr_nvpair,
+ .nvs_nvp_op = nvs_xdr_nvp_op,
+ .nvs_nvp_size = nvs_xdr_nvp_size,
+ .nvs_nvl_fini = nvs_xdr_nvl_fini
+};
+
+static int
+nvs_xdr(nvstream_t *nvs, nvlist_t *nvl, char *buf, size_t *buflen)
+{
+ XDR xdr;
+ int err;
+
+ nvs->nvs_ops = &nvs_xdr_ops;
+
+ if ((err = nvs_xdr_create(nvs, &xdr, buf + sizeof (nvs_header_t),
+ *buflen - sizeof (nvs_header_t))) != 0)
+ return (err);
+
+ err = nvs_operation(nvs, nvl, buflen);
+
+ nvs_xdr_destroy(nvs);
+
+ return (err);
+}
+
+#if defined(_KERNEL)
+static int __init
+nvpair_init(void)
+{
+ return (0);
+}
+
+static void __exit
+nvpair_fini(void)
+{
+}
+
+module_init(nvpair_init);
+module_exit(nvpair_fini);
+#endif
+
+ZFS_MODULE_DESCRIPTION("Generic name/value pair implementation");
+ZFS_MODULE_AUTHOR(ZFS_META_AUTHOR);
+ZFS_MODULE_LICENSE(ZFS_META_LICENSE);
+ZFS_MODULE_VERSION(ZFS_META_VERSION "-" ZFS_META_RELEASE);
+
+EXPORT_SYMBOL(nv_alloc_init);
+EXPORT_SYMBOL(nv_alloc_reset);
+EXPORT_SYMBOL(nv_alloc_fini);
+
+/* list management */
+EXPORT_SYMBOL(nvlist_alloc);
+EXPORT_SYMBOL(nvlist_free);
+EXPORT_SYMBOL(nvlist_size);
+EXPORT_SYMBOL(nvlist_pack);
+EXPORT_SYMBOL(nvlist_unpack);
+EXPORT_SYMBOL(nvlist_dup);
+EXPORT_SYMBOL(nvlist_merge);
+
+EXPORT_SYMBOL(nvlist_xalloc);
+EXPORT_SYMBOL(nvlist_xpack);
+EXPORT_SYMBOL(nvlist_xunpack);
+EXPORT_SYMBOL(nvlist_xdup);
+EXPORT_SYMBOL(nvlist_lookup_nv_alloc);
+
+EXPORT_SYMBOL(nvlist_add_nvpair);
+EXPORT_SYMBOL(nvlist_add_boolean);
+EXPORT_SYMBOL(nvlist_add_boolean_value);
+EXPORT_SYMBOL(nvlist_add_byte);
+EXPORT_SYMBOL(nvlist_add_int8);
+EXPORT_SYMBOL(nvlist_add_uint8);
+EXPORT_SYMBOL(nvlist_add_int16);
+EXPORT_SYMBOL(nvlist_add_uint16);
+EXPORT_SYMBOL(nvlist_add_int32);
+EXPORT_SYMBOL(nvlist_add_uint32);
+EXPORT_SYMBOL(nvlist_add_int64);
+EXPORT_SYMBOL(nvlist_add_uint64);
+EXPORT_SYMBOL(nvlist_add_string);
+EXPORT_SYMBOL(nvlist_add_nvlist);
+EXPORT_SYMBOL(nvlist_add_boolean_array);
+EXPORT_SYMBOL(nvlist_add_byte_array);
+EXPORT_SYMBOL(nvlist_add_int8_array);
+EXPORT_SYMBOL(nvlist_add_uint8_array);
+EXPORT_SYMBOL(nvlist_add_int16_array);
+EXPORT_SYMBOL(nvlist_add_uint16_array);
+EXPORT_SYMBOL(nvlist_add_int32_array);
+EXPORT_SYMBOL(nvlist_add_uint32_array);
+EXPORT_SYMBOL(nvlist_add_int64_array);
+EXPORT_SYMBOL(nvlist_add_uint64_array);
+EXPORT_SYMBOL(nvlist_add_string_array);
+EXPORT_SYMBOL(nvlist_add_nvlist_array);
+EXPORT_SYMBOL(nvlist_next_nvpair);
+EXPORT_SYMBOL(nvlist_prev_nvpair);
+EXPORT_SYMBOL(nvlist_empty);
+EXPORT_SYMBOL(nvlist_add_hrtime);
+
+EXPORT_SYMBOL(nvlist_remove);
+EXPORT_SYMBOL(nvlist_remove_nvpair);
+EXPORT_SYMBOL(nvlist_remove_all);
+
+EXPORT_SYMBOL(nvlist_lookup_boolean);
+EXPORT_SYMBOL(nvlist_lookup_boolean_value);
+EXPORT_SYMBOL(nvlist_lookup_byte);
+EXPORT_SYMBOL(nvlist_lookup_int8);
+EXPORT_SYMBOL(nvlist_lookup_uint8);
+EXPORT_SYMBOL(nvlist_lookup_int16);
+EXPORT_SYMBOL(nvlist_lookup_uint16);
+EXPORT_SYMBOL(nvlist_lookup_int32);
+EXPORT_SYMBOL(nvlist_lookup_uint32);
+EXPORT_SYMBOL(nvlist_lookup_int64);
+EXPORT_SYMBOL(nvlist_lookup_uint64);
+EXPORT_SYMBOL(nvlist_lookup_string);
+EXPORT_SYMBOL(nvlist_lookup_nvlist);
+EXPORT_SYMBOL(nvlist_lookup_boolean_array);
+EXPORT_SYMBOL(nvlist_lookup_byte_array);
+EXPORT_SYMBOL(nvlist_lookup_int8_array);
+EXPORT_SYMBOL(nvlist_lookup_uint8_array);
+EXPORT_SYMBOL(nvlist_lookup_int16_array);
+EXPORT_SYMBOL(nvlist_lookup_uint16_array);
+EXPORT_SYMBOL(nvlist_lookup_int32_array);
+EXPORT_SYMBOL(nvlist_lookup_uint32_array);
+EXPORT_SYMBOL(nvlist_lookup_int64_array);
+EXPORT_SYMBOL(nvlist_lookup_uint64_array);
+EXPORT_SYMBOL(nvlist_lookup_string_array);
+EXPORT_SYMBOL(nvlist_lookup_nvlist_array);
+EXPORT_SYMBOL(nvlist_lookup_hrtime);
+EXPORT_SYMBOL(nvlist_lookup_pairs);
+
+EXPORT_SYMBOL(nvlist_lookup_nvpair);
+EXPORT_SYMBOL(nvlist_exists);
+
+/* processing nvpair */
+EXPORT_SYMBOL(nvpair_name);
+EXPORT_SYMBOL(nvpair_type);
+EXPORT_SYMBOL(nvpair_value_boolean_value);
+EXPORT_SYMBOL(nvpair_value_byte);
+EXPORT_SYMBOL(nvpair_value_int8);
+EXPORT_SYMBOL(nvpair_value_uint8);
+EXPORT_SYMBOL(nvpair_value_int16);
+EXPORT_SYMBOL(nvpair_value_uint16);
+EXPORT_SYMBOL(nvpair_value_int32);
+EXPORT_SYMBOL(nvpair_value_uint32);
+EXPORT_SYMBOL(nvpair_value_int64);
+EXPORT_SYMBOL(nvpair_value_uint64);
+EXPORT_SYMBOL(nvpair_value_string);
+EXPORT_SYMBOL(nvpair_value_nvlist);
+EXPORT_SYMBOL(nvpair_value_boolean_array);
+EXPORT_SYMBOL(nvpair_value_byte_array);
+EXPORT_SYMBOL(nvpair_value_int8_array);
+EXPORT_SYMBOL(nvpair_value_uint8_array);
+EXPORT_SYMBOL(nvpair_value_int16_array);
+EXPORT_SYMBOL(nvpair_value_uint16_array);
+EXPORT_SYMBOL(nvpair_value_int32_array);
+EXPORT_SYMBOL(nvpair_value_uint32_array);
+EXPORT_SYMBOL(nvpair_value_int64_array);
+EXPORT_SYMBOL(nvpair_value_uint64_array);
+EXPORT_SYMBOL(nvpair_value_string_array);
+EXPORT_SYMBOL(nvpair_value_nvlist_array);
+EXPORT_SYMBOL(nvpair_value_hrtime);
diff --git a/sys/contrib/openzfs/module/nvpair/nvpair_alloc_fixed.c b/sys/contrib/openzfs/module/nvpair/nvpair_alloc_fixed.c
new file mode 100644
index 000000000000..c8a604a2bfac
--- /dev/null
+++ b/sys/contrib/openzfs/module/nvpair/nvpair_alloc_fixed.c
@@ -0,0 +1,115 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/isa_defs.h>
+#include <sys/nvpair.h>
+#include <sys/sysmacros.h>
+
+/*
+ * This allocator is very simple.
+ * - it uses a pre-allocated buffer for memory allocations.
+ * - it does _not_ free memory in the pre-allocated buffer.
+ *
+ * The reason for the selected implementation is simplicity.
+ * This allocator is designed for the usage in interrupt context when
+ * the caller may not wait for free memory.
+ */
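+
+/*
+ * Usage sketch (illustrative, not part of the original source): hand the
+ * allocator a caller-owned buffer and build an nvlist inside it without
+ * any further memory allocation.
+ *
+ *	static char buf[1024];
+ *	nv_alloc_t nva;
+ *	nvlist_t *nvl;
+ *
+ *	if (nv_alloc_init(&nva, nv_fixed_ops, buf, sizeof (buf)) == 0 &&
+ *	    nvlist_xalloc(&nvl, NV_UNIQUE_NAME, &nva) == 0) {
+ *		... add pairs, use the list, then nvlist_free(nvl) ...
+ *		nv_alloc_fini(&nva);
+ *	}
+ */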
+
+/* pre-allocated buffer for memory allocations */
+typedef struct nvbuf {
+ uintptr_t nvb_buf; /* address of pre-allocated buffer */
+ uintptr_t nvb_lim; /* limit address in the buffer */
+ uintptr_t nvb_cur; /* current address in the buffer */
+} nvbuf_t;
+
+/*
+ * Initialize the pre-allocated buffer allocator. The caller needs to supply
+ *
+ * buf address of pre-allocated buffer
+ * bufsz size of pre-allocated buffer
+ *
+ * nv_fixed_init() calculates the remaining members of nvbuf_t.
+ */
+static int
+nv_fixed_init(nv_alloc_t *nva, va_list valist)
+{
+ uintptr_t base = va_arg(valist, uintptr_t);
+ uintptr_t lim = base + va_arg(valist, size_t);
+ nvbuf_t *nvb = (nvbuf_t *)P2ROUNDUP(base, sizeof (uintptr_t));
+
+ if (base == 0 || (uintptr_t)&nvb[1] > lim)
+ return (EINVAL);
+
+ nvb->nvb_buf = (uintptr_t)&nvb[0];
+ nvb->nvb_cur = (uintptr_t)&nvb[1];
+ nvb->nvb_lim = lim;
+ nva->nva_arg = nvb;
+
+ return (0);
+}
+
+static void *
+nv_fixed_alloc(nv_alloc_t *nva, size_t size)
+{
+ nvbuf_t *nvb = nva->nva_arg;
+ uintptr_t new = nvb->nvb_cur;
+
+ if (size == 0 || new + size > nvb->nvb_lim)
+ return (NULL);
+
+ nvb->nvb_cur = P2ROUNDUP(new + size, sizeof (uintptr_t));
+
+ return ((void *)new);
+}
+
+/*ARGSUSED*/
+static void
+nv_fixed_free(nv_alloc_t *nva, void *buf, size_t size)
+{
+ /* don't free memory in the pre-allocated buffer */
+}
+
+static void
+nv_fixed_reset(nv_alloc_t *nva)
+{
+ nvbuf_t *nvb = nva->nva_arg;
+
+ nvb->nvb_cur = (uintptr_t)&nvb[1];
+}
+
+const nv_alloc_ops_t nv_fixed_ops_def = {
+ .nv_ao_init = nv_fixed_init,
+ .nv_ao_fini = NULL,
+ .nv_ao_alloc = nv_fixed_alloc,
+ .nv_ao_free = nv_fixed_free,
+ .nv_ao_reset = nv_fixed_reset
+};
+
+const nv_alloc_ops_t *nv_fixed_ops = &nv_fixed_ops_def;
+
+#if defined(_KERNEL)
+EXPORT_SYMBOL(nv_fixed_ops);
+#endif
diff --git a/sys/contrib/openzfs/module/nvpair/nvpair_alloc_spl.c b/sys/contrib/openzfs/module/nvpair/nvpair_alloc_spl.c
new file mode 100644
index 000000000000..ed8fa4d09402
--- /dev/null
+++ b/sys/contrib/openzfs/module/nvpair/nvpair_alloc_spl.c
@@ -0,0 +1,96 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at * usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/nvpair.h>
+#include <sys/kmem.h>
+#include <sys/vmem.h>
+
+static void *
+nv_alloc_sleep_spl(nv_alloc_t *nva, size_t size)
+{
+ return (vmem_alloc(size, KM_SLEEP));
+}
+
+static void *
+nv_alloc_pushpage_spl(nv_alloc_t *nva, size_t size)
+{
+ return (vmem_alloc(size, KM_PUSHPAGE));
+}
+
+static void *
+nv_alloc_nosleep_spl(nv_alloc_t *nva, size_t size)
+{
+ return (kmem_alloc(size, KM_NOSLEEP));
+}
+
+static void
+nv_free_spl(nv_alloc_t *nva, void *buf, size_t size)
+{
+ kmem_free(buf, size);
+}
+
+const nv_alloc_ops_t spl_sleep_ops_def = {
+ .nv_ao_init = NULL,
+ .nv_ao_fini = NULL,
+ .nv_ao_alloc = nv_alloc_sleep_spl,
+ .nv_ao_free = nv_free_spl,
+ .nv_ao_reset = NULL
+};
+
+const nv_alloc_ops_t spl_pushpage_ops_def = {
+ .nv_ao_init = NULL,
+ .nv_ao_fini = NULL,
+ .nv_ao_alloc = nv_alloc_pushpage_spl,
+ .nv_ao_free = nv_free_spl,
+ .nv_ao_reset = NULL
+};
+
+const nv_alloc_ops_t spl_nosleep_ops_def = {
+ .nv_ao_init = NULL,
+ .nv_ao_fini = NULL,
+ .nv_ao_alloc = nv_alloc_nosleep_spl,
+ .nv_ao_free = nv_free_spl,
+ .nv_ao_reset = NULL
+};
+
+nv_alloc_t nv_alloc_sleep_def = {
+ &spl_sleep_ops_def,
+ NULL
+};
+
+nv_alloc_t nv_alloc_pushpage_def = {
+ &spl_pushpage_ops_def,
+ NULL
+};
+
+nv_alloc_t nv_alloc_nosleep_def = {
+ &spl_nosleep_ops_def,
+ NULL
+};
+
+nv_alloc_t *nv_alloc_sleep = &nv_alloc_sleep_def;
+nv_alloc_t *nv_alloc_pushpage = &nv_alloc_pushpage_def;
+nv_alloc_t *nv_alloc_nosleep = &nv_alloc_nosleep_def;
diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/acl_common.c b/sys/contrib/openzfs/module/os/freebsd/spl/acl_common.c
new file mode 100644
index 000000000000..66e27cefa396
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/spl/acl_common.c
@@ -0,0 +1,1709 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ */
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/avl.h>
+#include <sys/misc.h>
+#if defined(_KERNEL)
+#include <sys/kmem.h>
+#include <sys/systm.h>
+#include <sys/sysmacros.h>
+#include <acl/acl_common.h>
+#include <sys/debug.h>
+#else
+#include <errno.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <strings.h>
+#include <unistd.h>
+#include <assert.h>
+#include <grp.h>
+#include <pwd.h>
+#include <acl_common.h>
+#define ASSERT assert
+#endif
+
+#define ACE_POSIX_SUPPORTED_BITS (ACE_READ_DATA | \
+ ACE_WRITE_DATA | ACE_APPEND_DATA | ACE_EXECUTE | \
+ ACE_READ_ATTRIBUTES | ACE_READ_ACL | ACE_WRITE_ACL)
+
+
+#define ACL_SYNCHRONIZE_SET_DENY 0x0000001
+#define ACL_SYNCHRONIZE_SET_ALLOW 0x0000002
+#define ACL_SYNCHRONIZE_ERR_DENY 0x0000004
+#define ACL_SYNCHRONIZE_ERR_ALLOW 0x0000008
+
+#define ACL_WRITE_OWNER_SET_DENY 0x0000010
+#define ACL_WRITE_OWNER_SET_ALLOW 0x0000020
+#define ACL_WRITE_OWNER_ERR_DENY 0x0000040
+#define ACL_WRITE_OWNER_ERR_ALLOW 0x0000080
+
+#define ACL_DELETE_SET_DENY 0x0000100
+#define ACL_DELETE_SET_ALLOW 0x0000200
+#define ACL_DELETE_ERR_DENY 0x0000400
+#define ACL_DELETE_ERR_ALLOW 0x0000800
+
+#define ACL_WRITE_ATTRS_OWNER_SET_DENY 0x0001000
+#define ACL_WRITE_ATTRS_OWNER_SET_ALLOW 0x0002000
+#define ACL_WRITE_ATTRS_OWNER_ERR_DENY 0x0004000
+#define ACL_WRITE_ATTRS_OWNER_ERR_ALLOW 0x0008000
+
+#define ACL_WRITE_ATTRS_WRITER_SET_DENY 0x0010000
+#define ACL_WRITE_ATTRS_WRITER_SET_ALLOW 0x0020000
+#define ACL_WRITE_ATTRS_WRITER_ERR_DENY 0x0040000
+#define ACL_WRITE_ATTRS_WRITER_ERR_ALLOW 0x0080000
+
+#define ACL_WRITE_NAMED_WRITER_SET_DENY 0x0100000
+#define ACL_WRITE_NAMED_WRITER_SET_ALLOW 0x0200000
+#define ACL_WRITE_NAMED_WRITER_ERR_DENY 0x0400000
+#define ACL_WRITE_NAMED_WRITER_ERR_ALLOW 0x0800000
+
+#define ACL_READ_NAMED_READER_SET_DENY 0x1000000
+#define ACL_READ_NAMED_READER_SET_ALLOW 0x2000000
+#define ACL_READ_NAMED_READER_ERR_DENY 0x4000000
+#define ACL_READ_NAMED_READER_ERR_ALLOW 0x8000000
+
+
+#define ACE_VALID_MASK_BITS (\
+ ACE_READ_DATA | \
+ ACE_LIST_DIRECTORY | \
+ ACE_WRITE_DATA | \
+ ACE_ADD_FILE | \
+ ACE_APPEND_DATA | \
+ ACE_ADD_SUBDIRECTORY | \
+ ACE_READ_NAMED_ATTRS | \
+ ACE_WRITE_NAMED_ATTRS | \
+ ACE_EXECUTE | \
+ ACE_DELETE_CHILD | \
+ ACE_READ_ATTRIBUTES | \
+ ACE_WRITE_ATTRIBUTES | \
+ ACE_DELETE | \
+ ACE_READ_ACL | \
+ ACE_WRITE_ACL | \
+ ACE_WRITE_OWNER | \
+ ACE_SYNCHRONIZE)
+
+#define ACE_MASK_UNDEFINED 0x80000000
+
+#define ACE_VALID_FLAG_BITS (ACE_FILE_INHERIT_ACE | \
+ ACE_DIRECTORY_INHERIT_ACE | \
+ ACE_NO_PROPAGATE_INHERIT_ACE | ACE_INHERIT_ONLY_ACE | \
+ ACE_SUCCESSFUL_ACCESS_ACE_FLAG | ACE_FAILED_ACCESS_ACE_FLAG | \
+ ACE_IDENTIFIER_GROUP | ACE_OWNER | ACE_GROUP | ACE_EVERYONE)
+
+/*
+ * ACL conversion helpers
+ */
+
+typedef enum {
+ ace_unused,
+ ace_user_obj,
+ ace_user,
+ ace_group, /* includes GROUP and GROUP_OBJ */
+ ace_other_obj
+} ace_to_aent_state_t;
+
+typedef struct acevals {
+ uid_t key;
+ avl_node_t avl;
+ uint32_t mask;
+ uint32_t allowed;
+ uint32_t denied;
+ int aent_type;
+} acevals_t;
+
+typedef struct ace_list {
+ acevals_t user_obj;
+ avl_tree_t user;
+ int numusers;
+ acevals_t group_obj;
+ avl_tree_t group;
+ int numgroups;
+ acevals_t other_obj;
+ uint32_t acl_mask;
+ int hasmask;
+ int dfacl_flag;
+ ace_to_aent_state_t state;
+ int seen; /* bitmask of all aclent_t a_type values seen */
+} ace_list_t;
+
+/*
+ * Generic shellsort, from K&R (1st ed., p. 58), somewhat modified.
+ * v = ptr to array/vector of objs
+ * n = # of objs in the array
+ * s = size of each obj (must be a multiple of the word size)
+ * f = ptr to function to compare two objs; it returns
+ *     -1 (less than), 0 (equal), or 1 (greater than)
+ */
+void
+ksort(caddr_t v, int n, int s, int (*f)(void *, void *))
+{
+ int g, i, j, ii;
+ unsigned int *p1, *p2;
+ unsigned int tmp;
+
+ /* No work to do */
+ if (v == NULL || n <= 1)
+ return;
+
+ /* Sanity check on arguments */
+ ASSERT(((uintptr_t)v & 0x3) == 0 && (s & 0x3) == 0);
+ ASSERT(s > 0);
+ for (g = n / 2; g > 0; g /= 2) {
+ for (i = g; i < n; i++) {
+ for (j = i - g; j >= 0 &&
+ (*f)(v + j * s, v + (j + g) * s) == 1;
+ j -= g) {
+ p1 = (void *)(v + j * s);
+ p2 = (void *)(v + (j + g) * s);
+ for (ii = 0; ii < s / 4; ii++) {
+ tmp = *p1;
+ *p1++ = *p2;
+ *p2++ = tmp;
+ }
+ }
+ }
+ }
+}
+
+/*
+ * Compare two acls, all fields. Returns:
+ * -1 (less than)
+ * 0 (equal)
+ * +1 (greater than)
+ */
+int
+cmp2acls(void *a, void *b)
+{
+ aclent_t *x = (aclent_t *)a;
+ aclent_t *y = (aclent_t *)b;
+
+ /* Compare types */
+ if (x->a_type < y->a_type)
+ return (-1);
+ if (x->a_type > y->a_type)
+ return (1);
+ /* Equal types; compare id's */
+ if (x->a_id < y->a_id)
+ return (-1);
+ if (x->a_id > y->a_id)
+ return (1);
+ /* Equal ids; compare perms */
+ if (x->a_perm < y->a_perm)
+ return (-1);
+ if (x->a_perm > y->a_perm)
+ return (1);
+ /* Totally equal */
+ return (0);
+}
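As a quick illustration of the two helpers above, the sketch below sorts a deliberately out-of-order aclent_t array with ksort()/cmp2acls(); the entries and permissions are made up, and <sys/acl.h> is assumed to supply aclent_t and the *_OBJ type constants:

static void
ksort_example(void)
{
	/* Deliberately out of order; USER_OBJ < GROUP_OBJ < OTHER_OBJ numerically. */
	aclent_t ents[3] = {
		{ .a_type = OTHER_OBJ, .a_id = 0, .a_perm = 4 },
		{ .a_type = USER_OBJ,  .a_id = 0, .a_perm = 7 },
		{ .a_type = GROUP_OBJ, .a_id = 0, .a_perm = 5 },
	};

	/* Shellsort the array using the full-field comparator. */
	ksort((caddr_t)ents, 3, sizeof (aclent_t), cmp2acls);

	/* ents[] is now ordered USER_OBJ, GROUP_OBJ, OTHER_OBJ. */
}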
+
+static int
+cacl_malloc(void **ptr, size_t size)
+{
+ *ptr = kmem_zalloc(size, KM_SLEEP);
+ return (0);
+}
+
+
+#if !defined(_KERNEL)
+acl_t *
+acl_alloc(enum acl_type type)
+{
+ acl_t *aclp;
+
+ if (cacl_malloc((void **)&aclp, sizeof (acl_t)) != 0)
+ return (NULL);
+
+ aclp->acl_aclp = NULL;
+ aclp->acl_cnt = 0;
+
+ switch (type) {
+ case ACE_T:
+ aclp->acl_type = ACE_T;
+ aclp->acl_entry_size = sizeof (ace_t);
+ break;
+ case ACLENT_T:
+ aclp->acl_type = ACLENT_T;
+ aclp->acl_entry_size = sizeof (aclent_t);
+ break;
+ default:
+ acl_free(aclp);
+ aclp = NULL;
+ }
+ return (aclp);
+}
+
+/*
+ * Free acl_t structure
+ */
+void
+acl_free(acl_t *aclp)
+{
+ int acl_size;
+
+ if (aclp == NULL)
+ return;
+
+ if (aclp->acl_aclp) {
+ acl_size = aclp->acl_cnt * aclp->acl_entry_size;
+ cacl_free(aclp->acl_aclp, acl_size);
+ }
+
+ cacl_free(aclp, sizeof (acl_t));
+}
+
+static uint32_t
+access_mask_set(int haswriteperm, int hasreadperm, int isowner, int isallow)
+{
+ uint32_t access_mask = 0;
+ int acl_produce;
+ int synchronize_set = 0, write_owner_set = 0;
+ int delete_set = 0, write_attrs_set = 0;
+ int read_named_set = 0, write_named_set = 0;
+
+ acl_produce = (ACL_SYNCHRONIZE_SET_ALLOW |
+ ACL_WRITE_ATTRS_OWNER_SET_ALLOW |
+ ACL_WRITE_ATTRS_WRITER_SET_DENY);
+
+ if (isallow) {
+ synchronize_set = ACL_SYNCHRONIZE_SET_ALLOW;
+ write_owner_set = ACL_WRITE_OWNER_SET_ALLOW;
+ delete_set = ACL_DELETE_SET_ALLOW;
+ if (hasreadperm)
+ read_named_set = ACL_READ_NAMED_READER_SET_ALLOW;
+ if (haswriteperm)
+ write_named_set = ACL_WRITE_NAMED_WRITER_SET_ALLOW;
+ if (isowner)
+ write_attrs_set = ACL_WRITE_ATTRS_OWNER_SET_ALLOW;
+ else if (haswriteperm)
+ write_attrs_set = ACL_WRITE_ATTRS_WRITER_SET_ALLOW;
+ } else {
+
+ synchronize_set = ACL_SYNCHRONIZE_SET_DENY;
+ write_owner_set = ACL_WRITE_OWNER_SET_DENY;
+ delete_set = ACL_DELETE_SET_DENY;
+ if (hasreadperm)
+ read_named_set = ACL_READ_NAMED_READER_SET_DENY;
+ if (haswriteperm)
+ write_named_set = ACL_WRITE_NAMED_WRITER_SET_DENY;
+ if (isowner)
+ write_attrs_set = ACL_WRITE_ATTRS_OWNER_SET_DENY;
+ else if (haswriteperm)
+ write_attrs_set = ACL_WRITE_ATTRS_WRITER_SET_DENY;
+ else
+ /*
+ * If the entity is not the owner and does not
+ * have write permissions ACE_WRITE_ATTRIBUTES will
+ * always go in the DENY ACE.
+ */
+ access_mask |= ACE_WRITE_ATTRIBUTES;
+ }
+
+ if (acl_produce & synchronize_set)
+ access_mask |= ACE_SYNCHRONIZE;
+ if (acl_produce & write_owner_set)
+ access_mask |= ACE_WRITE_OWNER;
+ if (acl_produce & delete_set)
+ access_mask |= ACE_DELETE;
+ if (acl_produce & write_attrs_set)
+ access_mask |= ACE_WRITE_ATTRIBUTES;
+ if (acl_produce & read_named_set)
+ access_mask |= ACE_READ_NAMED_ATTRS;
+ if (acl_produce & write_named_set)
+ access_mask |= ACE_WRITE_NAMED_ATTRS;
+
+ return (access_mask);
+}
+
+/*
+ * Given a mode_t, convert it into an access_mask as used
+ * by nfsace, assuming aclent_t -> nfsace semantics.
+ */
+static uint32_t
+mode_to_ace_access(mode_t mode, boolean_t isdir, int isowner, int isallow)
+{
+ uint32_t access = 0;
+ int haswriteperm = 0;
+ int hasreadperm = 0;
+
+ if (isallow) {
+ haswriteperm = (mode & S_IWOTH);
+ hasreadperm = (mode & S_IROTH);
+ } else {
+ haswriteperm = !(mode & S_IWOTH);
+ hasreadperm = !(mode & S_IROTH);
+ }
+
+ /*
+ * The following call takes care of correctly setting the following
+ * mask bits in the access_mask:
+ * ACE_SYNCHRONIZE, ACE_WRITE_OWNER, ACE_DELETE,
+ * ACE_WRITE_ATTRIBUTES, ACE_WRITE_NAMED_ATTRS, ACE_READ_NAMED_ATTRS
+ */
+ access = access_mask_set(haswriteperm, hasreadperm, isowner, isallow);
+
+ if (isallow) {
+ access |= ACE_READ_ACL | ACE_READ_ATTRIBUTES;
+ if (isowner)
+ access |= ACE_WRITE_ACL;
+ } else {
+ if (! isowner)
+ access |= ACE_WRITE_ACL;
+ }
+
+ /* read */
+ if (mode & S_IROTH) {
+ access |= ACE_READ_DATA;
+ }
+ /* write */
+ if (mode & S_IWOTH) {
+ access |= ACE_WRITE_DATA |
+ ACE_APPEND_DATA;
+ if (isdir)
+ access |= ACE_DELETE_CHILD;
+ }
+ /* exec */
+ if (mode & S_IXOTH) {
+ access |= ACE_EXECUTE;
+ }
+
+ return (access);
+}
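For a concrete sense of the mapping, tracing the routine above together with access_mask_set() for a non-directory owner entry with full rwx permissions (a_perm = 07, isowner = 1, isallow = 1) yields the mask below; this is a worked illustration of the code paths, not a normative table:

/*
 * mode_to_ace_access(07, B_FALSE, 1, 1) ==
 *	ACE_READ_DATA | ACE_WRITE_DATA | ACE_APPEND_DATA | ACE_EXECUTE |
 *	ACE_READ_ATTRIBUTES | ACE_READ_ACL | ACE_WRITE_ACL |
 *	ACE_WRITE_ATTRIBUTES | ACE_SYNCHRONIZE
 */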
+
+/*
+ * Given an nfsace (presumably an ALLOW entry), make a
+ * corresponding DENY entry at the address given.
+ */
+static void
+ace_make_deny(ace_t *allow, ace_t *deny, int isdir, int isowner)
+{
+ (void) memcpy(deny, allow, sizeof (ace_t));
+
+ deny->a_who = allow->a_who;
+
+ deny->a_type = ACE_ACCESS_DENIED_ACE_TYPE;
+ deny->a_access_mask ^= ACE_POSIX_SUPPORTED_BITS;
+ if (isdir)
+ deny->a_access_mask ^= ACE_DELETE_CHILD;
+
+ deny->a_access_mask &= ~(ACE_SYNCHRONIZE | ACE_WRITE_OWNER |
+ ACE_DELETE | ACE_WRITE_ATTRIBUTES | ACE_READ_NAMED_ATTRS |
+ ACE_WRITE_NAMED_ATTRS);
+ deny->a_access_mask |= access_mask_set((allow->a_access_mask &
+ ACE_WRITE_DATA), (allow->a_access_mask & ACE_READ_DATA), isowner,
+ B_FALSE);
+}
+/*
+ * Make an initial pass over an array of aclent_t's. Gather
+ * information such as an ACL_MASK (if any), number of users,
+ * number of groups, and whether the array needs to be sorted.
+ */
+static int
+ln_aent_preprocess(aclent_t *aclent, int n,
+ int *hasmask, mode_t *mask,
+ int *numuser, int *numgroup, int *needsort)
+{
+ int error = 0;
+ int i;
+ int curtype = 0;
+
+ *hasmask = 0;
+ *mask = 07;
+ *needsort = 0;
+ *numuser = 0;
+ *numgroup = 0;
+
+ for (i = 0; i < n; i++) {
+ if (aclent[i].a_type < curtype)
+ *needsort = 1;
+ else if (aclent[i].a_type > curtype)
+ curtype = aclent[i].a_type;
+ if (aclent[i].a_type & USER)
+ (*numuser)++;
+ if (aclent[i].a_type & (GROUP | GROUP_OBJ))
+ (*numgroup)++;
+ if (aclent[i].a_type & CLASS_OBJ) {
+ if (*hasmask) {
+ error = EINVAL;
+ goto out;
+ } else {
+ *hasmask = 1;
+ *mask = aclent[i].a_perm;
+ }
+ }
+ }
+
+ if ((! *hasmask) && (*numuser + *numgroup > 1)) {
+ error = EINVAL;
+ goto out;
+ }
+
+out:
+ return (error);
+}
+
+/*
+ * Convert an array of aclent_t into an array of nfsace entries,
+ * following POSIX draft -> nfsv4 conversion semantics as outlined in
+ * the IETF draft.
+ */
+static int
+ln_aent_to_ace(aclent_t *aclent, int n, ace_t **acepp, int *rescount, int isdir)
+{
+ int error = 0;
+ mode_t mask;
+ int numuser, numgroup, needsort;
+ int resultsize = 0;
+ int i, groupi = 0, skip;
+ ace_t *acep, *result = NULL;
+ int hasmask;
+
+ error = ln_aent_preprocess(aclent, n, &hasmask, &mask,
+ &numuser, &numgroup, &needsort);
+ if (error != 0)
+ goto out;
+
+ /* allow + deny for each aclent */
+ resultsize = n * 2;
+ if (hasmask) {
+ /*
+ * stick extra deny on the group_obj and on each
+ * user|group for the mask (the group_obj was added
+ * into the count for numgroup)
+ */
+ resultsize += numuser + numgroup;
+ /* ... and don't count the mask itself */
+ resultsize -= 2;
+ }
+
+ /* sort the source if necessary */
+ if (needsort)
+ ksort((caddr_t)aclent, n, sizeof (aclent_t), cmp2acls);
+
+ if (cacl_malloc((void **)&result, resultsize * sizeof (ace_t)) != 0)
+ goto out;
+
+ acep = result;
+
+ for (i = 0; i < n; i++) {
+ /*
+ * don't process CLASS_OBJ (mask); mask was grabbed in
+ * ln_aent_preprocess()
+ */
+ if (aclent[i].a_type & CLASS_OBJ)
+ continue;
+
+ /* If we need an ACL_MASK emulator, prepend it now */
+ if ((hasmask) &&
+ (aclent[i].a_type & (USER | GROUP | GROUP_OBJ))) {
+ acep->a_type = ACE_ACCESS_DENIED_ACE_TYPE;
+ acep->a_flags = 0;
+ if (aclent[i].a_type & GROUP_OBJ) {
+ acep->a_who = (uid_t)-1;
+ acep->a_flags |=
+ (ACE_IDENTIFIER_GROUP|ACE_GROUP);
+ } else if (aclent[i].a_type & USER) {
+ acep->a_who = aclent[i].a_id;
+ } else {
+ acep->a_who = aclent[i].a_id;
+ acep->a_flags |= ACE_IDENTIFIER_GROUP;
+ }
+ if (aclent[i].a_type & ACL_DEFAULT) {
+ acep->a_flags |= ACE_INHERIT_ONLY_ACE |
+ ACE_FILE_INHERIT_ACE |
+ ACE_DIRECTORY_INHERIT_ACE;
+ }
+ /*
+ * Set the access mask for the prepended deny
+ * ace. To do this, we invert the mask (found
+			 * in ln_aent_preprocess()), then convert it to a
+			 * DENY ace access_mask.
+ */
+ acep->a_access_mask = mode_to_ace_access((mask ^ 07),
+ isdir, 0, 0);
+ acep += 1;
+ }
+
+ /* handle a_perm -> access_mask */
+ acep->a_access_mask = mode_to_ace_access(aclent[i].a_perm,
+ isdir, aclent[i].a_type & USER_OBJ, 1);
+
+ /* emulate a default aclent */
+ if (aclent[i].a_type & ACL_DEFAULT) {
+ acep->a_flags |= ACE_INHERIT_ONLY_ACE |
+ ACE_FILE_INHERIT_ACE |
+ ACE_DIRECTORY_INHERIT_ACE;
+ }
+
+ /*
+ * handle a_perm and a_id
+ *
+ * this must be done last, since it involves the
+ * corresponding deny aces, which are handled
+ * differently for each different a_type.
+ */
+ if (aclent[i].a_type & USER_OBJ) {
+ acep->a_who = (uid_t)-1;
+ acep->a_flags |= ACE_OWNER;
+ ace_make_deny(acep, acep + 1, isdir, B_TRUE);
+ acep += 2;
+ } else if (aclent[i].a_type & USER) {
+ acep->a_who = aclent[i].a_id;
+ ace_make_deny(acep, acep + 1, isdir, B_FALSE);
+ acep += 2;
+ } else if (aclent[i].a_type & (GROUP_OBJ | GROUP)) {
+ if (aclent[i].a_type & GROUP_OBJ) {
+ acep->a_who = (uid_t)-1;
+ acep->a_flags |= ACE_GROUP;
+ } else {
+ acep->a_who = aclent[i].a_id;
+ }
+ acep->a_flags |= ACE_IDENTIFIER_GROUP;
+ /*
+ * Set the corresponding deny for the group ace.
+ *
+ * The deny aces go after all of the groups, unlike
+ * everything else, where they immediately follow
+ * the allow ace.
+ *
+ * We calculate "skip", the number of slots to
+ * skip ahead for the deny ace, here.
+ *
+ * The pattern is:
+ * MD1 A1 MD2 A2 MD3 A3 D1 D2 D3
+ * thus, skip is
+ * (2 * numgroup) - 1 - groupi
+ * (2 * numgroup) to account for MD + A
+ * - 1 to account for the fact that we're on the
+ * access (A), not the mask (MD)
+ * - groupi to account for the fact that we have
+ * passed up groupi number of MD's.
+ */
+ skip = (2 * numgroup) - 1 - groupi;
+ ace_make_deny(acep, acep + skip, isdir, B_FALSE);
+ /*
+ * If we just did the last group, skip acep past
+ * all of the denies; else, just move ahead one.
+ */
+ if (++groupi >= numgroup)
+ acep += numgroup + 1;
+ else
+ acep += 1;
+ } else if (aclent[i].a_type & OTHER_OBJ) {
+ acep->a_who = (uid_t)-1;
+ acep->a_flags |= ACE_EVERYONE;
+ ace_make_deny(acep, acep + 1, isdir, B_FALSE);
+ acep += 2;
+ } else {
+ error = EINVAL;
+ goto out;
+ }
+ }
+
+ *acepp = result;
+ *rescount = resultsize;
+
+out:
+ if (error != 0) {
+ if ((result != NULL) && (resultsize > 0)) {
+ cacl_free(result, resultsize * sizeof (ace_t));
+ }
+ }
+
+ return (error);
+}
+
+static int
+convert_aent_to_ace(aclent_t *aclentp, int aclcnt, boolean_t isdir,
+ ace_t **retacep, int *retacecnt)
+{
+ ace_t *acep;
+ ace_t *dfacep;
+ int acecnt = 0;
+ int dfacecnt = 0;
+ int dfaclstart = 0;
+ int dfaclcnt = 0;
+ aclent_t *aclp;
+ int i;
+ int error;
+ int acesz, dfacesz;
+
+ ksort((caddr_t)aclentp, aclcnt, sizeof (aclent_t), cmp2acls);
+
+ for (i = 0, aclp = aclentp; i < aclcnt; aclp++, i++) {
+ if (aclp->a_type & ACL_DEFAULT)
+ break;
+ }
+
+ if (i < aclcnt) {
+ dfaclstart = i;
+ dfaclcnt = aclcnt - i;
+ }
+
+ if (dfaclcnt && !isdir) {
+ return (EINVAL);
+ }
+
+ error = ln_aent_to_ace(aclentp, i, &acep, &acecnt, isdir);
+ if (error)
+ return (error);
+
+ if (dfaclcnt) {
+ error = ln_aent_to_ace(&aclentp[dfaclstart], dfaclcnt,
+ &dfacep, &dfacecnt, isdir);
+ if (error) {
+ if (acep) {
+ cacl_free(acep, acecnt * sizeof (ace_t));
+ }
+ return (error);
+ }
+ }
+
+ if (dfacecnt != 0) {
+ acesz = sizeof (ace_t) * acecnt;
+ dfacesz = sizeof (ace_t) * dfacecnt;
+ acep = cacl_realloc(acep, acesz, acesz + dfacesz);
+ if (acep == NULL)
+ return (ENOMEM);
+ if (dfaclcnt) {
+ (void) memcpy(acep + acecnt, dfacep, dfacesz);
+ }
+ }
+ if (dfaclcnt)
+ cacl_free(dfacep, dfacecnt * sizeof (ace_t));
+
+ *retacecnt = acecnt + dfacecnt;
+ *retacep = acep;
+ return (0);
+}
+
+static int
+ace_mask_to_mode(uint32_t mask, o_mode_t *modep, boolean_t isdir)
+{
+ int error = 0;
+ o_mode_t mode = 0;
+ uint32_t bits, wantbits;
+
+ /* read */
+ if (mask & ACE_READ_DATA)
+ mode |= S_IROTH;
+
+ /* write */
+ wantbits = (ACE_WRITE_DATA | ACE_APPEND_DATA);
+ if (isdir)
+ wantbits |= ACE_DELETE_CHILD;
+ bits = mask & wantbits;
+ if (bits != 0) {
+ if (bits != wantbits) {
+ error = ENOTSUP;
+ goto out;
+ }
+ mode |= S_IWOTH;
+ }
+
+ /* exec */
+ if (mask & ACE_EXECUTE) {
+ mode |= S_IXOTH;
+ }
+
+ *modep = mode;
+
+out:
+ return (error);
+}
+
+static void
+acevals_init(acevals_t *vals, uid_t key)
+{
+ bzero(vals, sizeof (*vals));
+ vals->allowed = ACE_MASK_UNDEFINED;
+ vals->denied = ACE_MASK_UNDEFINED;
+ vals->mask = ACE_MASK_UNDEFINED;
+ vals->key = key;
+}
+
+static void
+ace_list_init(ace_list_t *al, int dfacl_flag)
+{
+ acevals_init(&al->user_obj, 0);
+ acevals_init(&al->group_obj, 0);
+ acevals_init(&al->other_obj, 0);
+ al->numusers = 0;
+ al->numgroups = 0;
+ al->acl_mask = 0;
+ al->hasmask = 0;
+ al->state = ace_unused;
+ al->seen = 0;
+ al->dfacl_flag = dfacl_flag;
+}
+
+/*
+ * Find or create an acevals holder for a given id and avl tree.
+ *
+ * Note that only one thread will ever touch these avl trees, so
+ * there is no need for locking.
+ */
+static acevals_t *
+acevals_find(ace_t *ace, avl_tree_t *avl, int *num)
+{
+ acevals_t key, *rc;
+ avl_index_t where;
+
+ key.key = ace->a_who;
+ rc = avl_find(avl, &key, &where);
+ if (rc != NULL)
+ return (rc);
+
+ /* this memory is freed by ln_ace_to_aent()->ace_list_free() */
+ if (cacl_malloc((void **)&rc, sizeof (acevals_t)) != 0)
+ return (NULL);
+
+ acevals_init(rc, ace->a_who);
+ avl_insert(avl, rc, where);
+ (*num)++;
+
+ return (rc);
+}
+
+static int
+access_mask_check(ace_t *acep, int mask_bit, int isowner)
+{
+ int set_deny, err_deny;
+ int set_allow, err_allow;
+ int acl_consume;
+ int haswriteperm, hasreadperm;
+
+ if (acep->a_type == ACE_ACCESS_DENIED_ACE_TYPE) {
+ haswriteperm = (acep->a_access_mask & ACE_WRITE_DATA) ? 0 : 1;
+ hasreadperm = (acep->a_access_mask & ACE_READ_DATA) ? 0 : 1;
+ } else {
+ haswriteperm = (acep->a_access_mask & ACE_WRITE_DATA) ? 1 : 0;
+ hasreadperm = (acep->a_access_mask & ACE_READ_DATA) ? 1 : 0;
+ }
+
+ acl_consume = (ACL_SYNCHRONIZE_ERR_DENY |
+ ACL_DELETE_ERR_DENY |
+ ACL_WRITE_OWNER_ERR_DENY |
+ ACL_WRITE_OWNER_ERR_ALLOW |
+ ACL_WRITE_ATTRS_OWNER_SET_ALLOW |
+ ACL_WRITE_ATTRS_OWNER_ERR_DENY |
+ ACL_WRITE_ATTRS_WRITER_SET_DENY |
+ ACL_WRITE_ATTRS_WRITER_ERR_ALLOW |
+ ACL_WRITE_NAMED_WRITER_ERR_DENY |
+ ACL_READ_NAMED_READER_ERR_DENY);
+
+ if (mask_bit == ACE_SYNCHRONIZE) {
+ set_deny = ACL_SYNCHRONIZE_SET_DENY;
+ err_deny = ACL_SYNCHRONIZE_ERR_DENY;
+ set_allow = ACL_SYNCHRONIZE_SET_ALLOW;
+ err_allow = ACL_SYNCHRONIZE_ERR_ALLOW;
+ } else if (mask_bit == ACE_WRITE_OWNER) {
+ set_deny = ACL_WRITE_OWNER_SET_DENY;
+ err_deny = ACL_WRITE_OWNER_ERR_DENY;
+ set_allow = ACL_WRITE_OWNER_SET_ALLOW;
+ err_allow = ACL_WRITE_OWNER_ERR_ALLOW;
+ } else if (mask_bit == ACE_DELETE) {
+ set_deny = ACL_DELETE_SET_DENY;
+ err_deny = ACL_DELETE_ERR_DENY;
+ set_allow = ACL_DELETE_SET_ALLOW;
+ err_allow = ACL_DELETE_ERR_ALLOW;
+ } else if (mask_bit == ACE_WRITE_ATTRIBUTES) {
+ if (isowner) {
+ set_deny = ACL_WRITE_ATTRS_OWNER_SET_DENY;
+ err_deny = ACL_WRITE_ATTRS_OWNER_ERR_DENY;
+ set_allow = ACL_WRITE_ATTRS_OWNER_SET_ALLOW;
+ err_allow = ACL_WRITE_ATTRS_OWNER_ERR_ALLOW;
+ } else if (haswriteperm) {
+ set_deny = ACL_WRITE_ATTRS_WRITER_SET_DENY;
+ err_deny = ACL_WRITE_ATTRS_WRITER_ERR_DENY;
+ set_allow = ACL_WRITE_ATTRS_WRITER_SET_ALLOW;
+ err_allow = ACL_WRITE_ATTRS_WRITER_ERR_ALLOW;
+ } else {
+ if ((acep->a_access_mask & mask_bit) &&
+ (acep->a_type & ACE_ACCESS_ALLOWED_ACE_TYPE)) {
+ return (ENOTSUP);
+ }
+ return (0);
+ }
+ } else if (mask_bit == ACE_READ_NAMED_ATTRS) {
+ if (!hasreadperm)
+ return (0);
+
+ set_deny = ACL_READ_NAMED_READER_SET_DENY;
+ err_deny = ACL_READ_NAMED_READER_ERR_DENY;
+ set_allow = ACL_READ_NAMED_READER_SET_ALLOW;
+ err_allow = ACL_READ_NAMED_READER_ERR_ALLOW;
+ } else if (mask_bit == ACE_WRITE_NAMED_ATTRS) {
+ if (!haswriteperm)
+ return (0);
+
+ set_deny = ACL_WRITE_NAMED_WRITER_SET_DENY;
+ err_deny = ACL_WRITE_NAMED_WRITER_ERR_DENY;
+ set_allow = ACL_WRITE_NAMED_WRITER_SET_ALLOW;
+ err_allow = ACL_WRITE_NAMED_WRITER_ERR_ALLOW;
+ } else {
+ return (EINVAL);
+ }
+
+ if (acep->a_type == ACE_ACCESS_DENIED_ACE_TYPE) {
+ if (acl_consume & set_deny) {
+ if (!(acep->a_access_mask & mask_bit)) {
+ return (ENOTSUP);
+ }
+ } else if (acl_consume & err_deny) {
+ if (acep->a_access_mask & mask_bit) {
+ return (ENOTSUP);
+ }
+ }
+ } else {
+ /* ACE_ACCESS_ALLOWED_ACE_TYPE */
+ if (acl_consume & set_allow) {
+ if (!(acep->a_access_mask & mask_bit)) {
+ return (ENOTSUP);
+ }
+ } else if (acl_consume & err_allow) {
+ if (acep->a_access_mask & mask_bit) {
+ return (ENOTSUP);
+ }
+ }
+ }
+ return (0);
+}
+
+static int
+ace_to_aent_legal(ace_t *acep)
+{
+ int error = 0;
+ int isowner;
+
+ /* only ALLOW or DENY */
+ if ((acep->a_type != ACE_ACCESS_ALLOWED_ACE_TYPE) &&
+ (acep->a_type != ACE_ACCESS_DENIED_ACE_TYPE)) {
+ error = ENOTSUP;
+ goto out;
+ }
+
+ /* check for invalid flags */
+ if (acep->a_flags & ~(ACE_VALID_FLAG_BITS)) {
+ error = EINVAL;
+ goto out;
+ }
+
+ /* some flags are illegal */
+ if (acep->a_flags & (ACE_SUCCESSFUL_ACCESS_ACE_FLAG |
+ ACE_FAILED_ACCESS_ACE_FLAG |
+ ACE_NO_PROPAGATE_INHERIT_ACE)) {
+ error = ENOTSUP;
+ goto out;
+ }
+
+ /* check for invalid masks */
+ if (acep->a_access_mask & ~(ACE_VALID_MASK_BITS)) {
+ error = EINVAL;
+ goto out;
+ }
+
+ if ((acep->a_flags & ACE_OWNER)) {
+ isowner = 1;
+ } else {
+ isowner = 0;
+ }
+
+ error = access_mask_check(acep, ACE_SYNCHRONIZE, isowner);
+ if (error)
+ goto out;
+
+ error = access_mask_check(acep, ACE_WRITE_OWNER, isowner);
+ if (error)
+ goto out;
+
+ error = access_mask_check(acep, ACE_DELETE, isowner);
+ if (error)
+ goto out;
+
+ error = access_mask_check(acep, ACE_WRITE_ATTRIBUTES, isowner);
+ if (error)
+ goto out;
+
+ error = access_mask_check(acep, ACE_READ_NAMED_ATTRS, isowner);
+ if (error)
+ goto out;
+
+ error = access_mask_check(acep, ACE_WRITE_NAMED_ATTRS, isowner);
+ if (error)
+ goto out;
+
+ /* more detailed checking of masks */
+ if (acep->a_type == ACE_ACCESS_ALLOWED_ACE_TYPE) {
+ if (! (acep->a_access_mask & ACE_READ_ATTRIBUTES)) {
+ error = ENOTSUP;
+ goto out;
+ }
+ if ((acep->a_access_mask & ACE_WRITE_DATA) &&
+ (! (acep->a_access_mask & ACE_APPEND_DATA))) {
+ error = ENOTSUP;
+ goto out;
+ }
+ if ((! (acep->a_access_mask & ACE_WRITE_DATA)) &&
+ (acep->a_access_mask & ACE_APPEND_DATA)) {
+ error = ENOTSUP;
+ goto out;
+ }
+ }
+
+ /* ACL enforcement */
+ if ((acep->a_access_mask & ACE_READ_ACL) &&
+ (acep->a_type != ACE_ACCESS_ALLOWED_ACE_TYPE)) {
+ error = ENOTSUP;
+ goto out;
+ }
+ if (acep->a_access_mask & ACE_WRITE_ACL) {
+ if ((acep->a_type == ACE_ACCESS_DENIED_ACE_TYPE) &&
+ (isowner)) {
+ error = ENOTSUP;
+ goto out;
+ }
+ if ((acep->a_type == ACE_ACCESS_ALLOWED_ACE_TYPE) &&
+ (! isowner)) {
+ error = ENOTSUP;
+ goto out;
+ }
+ }
+
+out:
+ return (error);
+}
+
+static int
+ace_allow_to_mode(uint32_t mask, o_mode_t *modep, boolean_t isdir)
+{
+ /* ACE_READ_ACL and ACE_READ_ATTRIBUTES must both be set */
+ if ((mask & (ACE_READ_ACL | ACE_READ_ATTRIBUTES)) !=
+ (ACE_READ_ACL | ACE_READ_ATTRIBUTES)) {
+ return (ENOTSUP);
+ }
+
+ return (ace_mask_to_mode(mask, modep, isdir));
+}
+
+static int
+acevals_to_aent(acevals_t *vals, aclent_t *dest, ace_list_t *list,
+ uid_t owner, gid_t group, boolean_t isdir)
+{
+ int error;
+ uint32_t flips = ACE_POSIX_SUPPORTED_BITS;
+
+ if (isdir)
+ flips |= ACE_DELETE_CHILD;
+ if (vals->allowed != (vals->denied ^ flips)) {
+ error = ENOTSUP;
+ goto out;
+ }
+ if ((list->hasmask) && (list->acl_mask != vals->mask) &&
+ (vals->aent_type & (USER | GROUP | GROUP_OBJ))) {
+ error = ENOTSUP;
+ goto out;
+ }
+ error = ace_allow_to_mode(vals->allowed, &dest->a_perm, isdir);
+ if (error != 0)
+ goto out;
+ dest->a_type = vals->aent_type;
+ if (dest->a_type & (USER | GROUP)) {
+ dest->a_id = vals->key;
+ } else if (dest->a_type & USER_OBJ) {
+ dest->a_id = owner;
+ } else if (dest->a_type & GROUP_OBJ) {
+ dest->a_id = group;
+ } else if (dest->a_type & OTHER_OBJ) {
+ dest->a_id = 0;
+ } else {
+ error = EINVAL;
+ goto out;
+ }
+
+out:
+ return (error);
+}
+
+
+static int
+ace_list_to_aent(ace_list_t *list, aclent_t **aclentp, int *aclcnt,
+ uid_t owner, gid_t group, boolean_t isdir)
+{
+ int error = 0;
+ aclent_t *aent, *result = NULL;
+ acevals_t *vals;
+ int resultcount;
+
+ if ((list->seen & (USER_OBJ | GROUP_OBJ | OTHER_OBJ)) !=
+ (USER_OBJ | GROUP_OBJ | OTHER_OBJ)) {
+ error = ENOTSUP;
+ goto out;
+ }
+ if ((! list->hasmask) && (list->numusers + list->numgroups > 0)) {
+ error = ENOTSUP;
+ goto out;
+ }
+
+ resultcount = 3 + list->numusers + list->numgroups;
+ /*
+ * This must be the same condition as below, when we add the CLASS_OBJ
+ * (aka ACL mask)
+ */
+ if ((list->hasmask) || (! list->dfacl_flag))
+ resultcount += 1;
+
+ if (cacl_malloc((void **)&result,
+ resultcount * sizeof (aclent_t)) != 0) {
+ error = ENOMEM;
+ goto out;
+ }
+ aent = result;
+
+ /* USER_OBJ */
+ if (!(list->user_obj.aent_type & USER_OBJ)) {
+ error = EINVAL;
+ goto out;
+ }
+
+ error = acevals_to_aent(&list->user_obj, aent, list, owner, group,
+ isdir);
+
+ if (error != 0)
+ goto out;
+ ++aent;
+ /* USER */
+ vals = NULL;
+ for (vals = avl_first(&list->user); vals != NULL;
+ vals = AVL_NEXT(&list->user, vals)) {
+ if (!(vals->aent_type & USER)) {
+ error = EINVAL;
+ goto out;
+ }
+ error = acevals_to_aent(vals, aent, list, owner, group,
+ isdir);
+ if (error != 0)
+ goto out;
+ ++aent;
+ }
+ /* GROUP_OBJ */
+ if (!(list->group_obj.aent_type & GROUP_OBJ)) {
+ error = EINVAL;
+ goto out;
+ }
+ error = acevals_to_aent(&list->group_obj, aent, list, owner, group,
+ isdir);
+ if (error != 0)
+ goto out;
+ ++aent;
+ /* GROUP */
+ vals = NULL;
+ for (vals = avl_first(&list->group); vals != NULL;
+ vals = AVL_NEXT(&list->group, vals)) {
+ if (!(vals->aent_type & GROUP)) {
+ error = EINVAL;
+ goto out;
+ }
+ error = acevals_to_aent(vals, aent, list, owner, group,
+ isdir);
+ if (error != 0)
+ goto out;
+ ++aent;
+ }
+ /*
+ * CLASS_OBJ (aka ACL_MASK)
+ *
+ * An ACL_MASK is not fabricated if the ACL is a default ACL.
+ * This is to follow UFS's behavior.
+ */
+ if ((list->hasmask) || (! list->dfacl_flag)) {
+ if (list->hasmask) {
+ uint32_t flips = ACE_POSIX_SUPPORTED_BITS;
+ if (isdir)
+ flips |= ACE_DELETE_CHILD;
+ error = ace_mask_to_mode(list->acl_mask ^ flips,
+ &aent->a_perm, isdir);
+ if (error != 0)
+ goto out;
+ } else {
+ /* fabricate the ACL_MASK from the group permissions */
+ error = ace_mask_to_mode(list->group_obj.allowed,
+ &aent->a_perm, isdir);
+ if (error != 0)
+ goto out;
+ }
+ aent->a_id = 0;
+ aent->a_type = CLASS_OBJ | list->dfacl_flag;
+ ++aent;
+ }
+ /* OTHER_OBJ */
+ if (!(list->other_obj.aent_type & OTHER_OBJ)) {
+ error = EINVAL;
+ goto out;
+ }
+ error = acevals_to_aent(&list->other_obj, aent, list, owner, group,
+ isdir);
+ if (error != 0)
+ goto out;
+ ++aent;
+
+ *aclentp = result;
+ *aclcnt = resultcount;
+
+out:
+ if (error != 0) {
+ if (result != NULL)
+ cacl_free(result, resultcount * sizeof (aclent_t));
+ }
+
+ return (error);
+}
+
+
+/*
+ * free all data associated with an ace_list
+ */
+static void
+ace_list_free(ace_list_t *al)
+{
+ acevals_t *node;
+ void *cookie;
+
+ if (al == NULL)
+ return;
+
+ cookie = NULL;
+ while ((node = avl_destroy_nodes(&al->user, &cookie)) != NULL)
+ cacl_free(node, sizeof (acevals_t));
+ cookie = NULL;
+ while ((node = avl_destroy_nodes(&al->group, &cookie)) != NULL)
+ cacl_free(node, sizeof (acevals_t));
+
+ avl_destroy(&al->user);
+ avl_destroy(&al->group);
+
+ /* free the container itself */
+ cacl_free(al, sizeof (ace_list_t));
+}
+
+static int
+acevals_compare(const void *va, const void *vb)
+{
+ const acevals_t *a = va, *b = vb;
+
+ if (a->key == b->key)
+ return (0);
+
+ if (a->key > b->key)
+ return (1);
+
+ else
+ return (-1);
+}
+
+/*
+ * Convert a list of ace_t entries to equivalent regular and default
+ * aclent_t lists. Return error (ENOTSUP) when conversion is not possible.
+ */
+static int
+ln_ace_to_aent(ace_t *ace, int n, uid_t owner, gid_t group,
+ aclent_t **aclentp, int *aclcnt, aclent_t **dfaclentp, int *dfaclcnt,
+ boolean_t isdir)
+{
+ int error = 0;
+ ace_t *acep;
+ uint32_t bits;
+ int i;
+ ace_list_t *normacl = NULL, *dfacl = NULL, *acl;
+ acevals_t *vals;
+
+ *aclentp = NULL;
+ *aclcnt = 0;
+ *dfaclentp = NULL;
+ *dfaclcnt = 0;
+
+ /* we need at least user_obj, group_obj, and other_obj */
+ if (n < 6) {
+ error = ENOTSUP;
+ goto out;
+ }
+ if (ace == NULL) {
+ error = EINVAL;
+ goto out;
+ }
+
+ error = cacl_malloc((void **)&normacl, sizeof (ace_list_t));
+ if (error != 0)
+ goto out;
+
+ avl_create(&normacl->user, acevals_compare, sizeof (acevals_t),
+ offsetof(acevals_t, avl));
+ avl_create(&normacl->group, acevals_compare, sizeof (acevals_t),
+ offsetof(acevals_t, avl));
+
+ ace_list_init(normacl, 0);
+
+ error = cacl_malloc((void **)&dfacl, sizeof (ace_list_t));
+ if (error != 0)
+ goto out;
+
+ avl_create(&dfacl->user, acevals_compare, sizeof (acevals_t),
+ offsetof(acevals_t, avl));
+ avl_create(&dfacl->group, acevals_compare, sizeof (acevals_t),
+ offsetof(acevals_t, avl));
+ ace_list_init(dfacl, ACL_DEFAULT);
+
+ /* process every ace_t... */
+ for (i = 0; i < n; i++) {
+ acep = &ace[i];
+
+ /* rule out certain cases quickly */
+ error = ace_to_aent_legal(acep);
+ if (error != 0)
+ goto out;
+
+ /*
+		 * Turn off these bits so that we do not have to worry about
+		 * them when checking for complements.
+ */
+ acep->a_access_mask &= ~(ACE_WRITE_OWNER | ACE_DELETE |
+ ACE_SYNCHRONIZE | ACE_WRITE_ATTRIBUTES |
+ ACE_READ_NAMED_ATTRS | ACE_WRITE_NAMED_ATTRS);
+
+ /* see if this should be a regular or default acl */
+ bits = acep->a_flags &
+ (ACE_INHERIT_ONLY_ACE |
+ ACE_FILE_INHERIT_ACE |
+ ACE_DIRECTORY_INHERIT_ACE);
+ if (bits != 0) {
+ /* all or nothing on these inherit bits */
+ if (bits != (ACE_INHERIT_ONLY_ACE |
+ ACE_FILE_INHERIT_ACE |
+ ACE_DIRECTORY_INHERIT_ACE)) {
+ error = ENOTSUP;
+ goto out;
+ }
+ acl = dfacl;
+ } else {
+ acl = normacl;
+ }
+
+ if ((acep->a_flags & ACE_OWNER)) {
+ if (acl->state > ace_user_obj) {
+ error = ENOTSUP;
+ goto out;
+ }
+ acl->state = ace_user_obj;
+ acl->seen |= USER_OBJ;
+ vals = &acl->user_obj;
+ vals->aent_type = USER_OBJ | acl->dfacl_flag;
+ } else if ((acep->a_flags & ACE_EVERYONE)) {
+ acl->state = ace_other_obj;
+ acl->seen |= OTHER_OBJ;
+ vals = &acl->other_obj;
+ vals->aent_type = OTHER_OBJ | acl->dfacl_flag;
+ } else if (acep->a_flags & ACE_IDENTIFIER_GROUP) {
+ if (acl->state > ace_group) {
+ error = ENOTSUP;
+ goto out;
+ }
+ if ((acep->a_flags & ACE_GROUP)) {
+ acl->seen |= GROUP_OBJ;
+ vals = &acl->group_obj;
+ vals->aent_type = GROUP_OBJ | acl->dfacl_flag;
+ } else {
+ acl->seen |= GROUP;
+ vals = acevals_find(acep, &acl->group,
+ &acl->numgroups);
+ if (vals == NULL) {
+ error = ENOMEM;
+ goto out;
+ }
+ vals->aent_type = GROUP | acl->dfacl_flag;
+ }
+ acl->state = ace_group;
+ } else {
+ if (acl->state > ace_user) {
+ error = ENOTSUP;
+ goto out;
+ }
+ acl->state = ace_user;
+ acl->seen |= USER;
+ vals = acevals_find(acep, &acl->user,
+ &acl->numusers);
+ if (vals == NULL) {
+ error = ENOMEM;
+ goto out;
+ }
+ vals->aent_type = USER | acl->dfacl_flag;
+ }
+
+ if (!(acl->state > ace_unused)) {
+ error = EINVAL;
+ goto out;
+ }
+
+ if (acep->a_type == ACE_ACCESS_ALLOWED_ACE_TYPE) {
+ /* no more than one allowed per aclent_t */
+ if (vals->allowed != ACE_MASK_UNDEFINED) {
+ error = ENOTSUP;
+ goto out;
+ }
+ vals->allowed = acep->a_access_mask;
+ } else {
+ /*
+ * it's a DENY; if there was a previous DENY, it
+ * must have been an ACL_MASK.
+ */
+ if (vals->denied != ACE_MASK_UNDEFINED) {
+ /* ACL_MASK is for USER and GROUP only */
+ if ((acl->state != ace_user) &&
+ (acl->state != ace_group)) {
+ error = ENOTSUP;
+ goto out;
+ }
+
+ if (! acl->hasmask) {
+ acl->hasmask = 1;
+ acl->acl_mask = vals->denied;
+ /* check for mismatched ACL_MASK emulations */
+ } else if (acl->acl_mask != vals->denied) {
+ error = ENOTSUP;
+ goto out;
+ }
+ vals->mask = vals->denied;
+ }
+ vals->denied = acep->a_access_mask;
+ }
+ }
+
+ /* done collating; produce the aclent_t lists */
+ if (normacl->state != ace_unused) {
+ error = ace_list_to_aent(normacl, aclentp, aclcnt,
+ owner, group, isdir);
+ if (error != 0) {
+ goto out;
+ }
+ }
+ if (dfacl->state != ace_unused) {
+ error = ace_list_to_aent(dfacl, dfaclentp, dfaclcnt,
+ owner, group, isdir);
+ if (error != 0) {
+ goto out;
+ }
+ }
+
+out:
+ if (normacl != NULL)
+ ace_list_free(normacl);
+ if (dfacl != NULL)
+ ace_list_free(dfacl);
+
+ return (error);
+}
+
+static int
+convert_ace_to_aent(ace_t *acebufp, int acecnt, boolean_t isdir,
+ uid_t owner, gid_t group, aclent_t **retaclentp, int *retaclcnt)
+{
+ int error = 0;
+ aclent_t *aclentp, *dfaclentp;
+ int aclcnt, dfaclcnt;
+ int aclsz, dfaclsz;
+
+ error = ln_ace_to_aent(acebufp, acecnt, owner, group,
+ &aclentp, &aclcnt, &dfaclentp, &dfaclcnt, isdir);
+
+ if (error)
+ return (error);
+
+
+ if (dfaclcnt != 0) {
+ /*
+ * Slap aclentp and dfaclentp into a single array.
+ */
+ aclsz = sizeof (aclent_t) * aclcnt;
+ dfaclsz = sizeof (aclent_t) * dfaclcnt;
+ aclentp = cacl_realloc(aclentp, aclsz, aclsz + dfaclsz);
+ if (aclentp != NULL) {
+ (void) memcpy(aclentp + aclcnt, dfaclentp, dfaclsz);
+ } else {
+ error = ENOMEM;
+ }
+ }
+
+ if (aclentp) {
+ *retaclentp = aclentp;
+ *retaclcnt = aclcnt + dfaclcnt;
+ }
+
+ if (dfaclentp)
+ cacl_free(dfaclentp, dfaclsz);
+
+ return (error);
+}
+
+
+int
+acl_translate(acl_t *aclp, int target_flavor, boolean_t isdir, uid_t owner,
+ gid_t group)
+{
+ int aclcnt;
+ void *acldata;
+ int error;
+
+ /*
+ * See if we need to translate
+ */
+ if ((target_flavor == _ACL_ACE_ENABLED && aclp->acl_type == ACE_T) ||
+ (target_flavor == _ACL_ACLENT_ENABLED &&
+ aclp->acl_type == ACLENT_T))
+ return (0);
+
+ if (target_flavor == -1) {
+ error = EINVAL;
+ goto out;
+ }
+
+ if (target_flavor == _ACL_ACE_ENABLED &&
+ aclp->acl_type == ACLENT_T) {
+ error = convert_aent_to_ace(aclp->acl_aclp,
+ aclp->acl_cnt, isdir, (ace_t **)&acldata, &aclcnt);
+ if (error)
+ goto out;
+
+ } else if (target_flavor == _ACL_ACLENT_ENABLED &&
+ aclp->acl_type == ACE_T) {
+ error = convert_ace_to_aent(aclp->acl_aclp, aclp->acl_cnt,
+ isdir, owner, group, (aclent_t **)&acldata, &aclcnt);
+ if (error)
+ goto out;
+ } else {
+ error = ENOTSUP;
+ goto out;
+ }
+
+ /*
+ * replace old acl with newly translated acl
+ */
+ cacl_free(aclp->acl_aclp, aclp->acl_cnt * aclp->acl_entry_size);
+ aclp->acl_aclp = acldata;
+ aclp->acl_cnt = aclcnt;
+ if (target_flavor == _ACL_ACE_ENABLED) {
+ aclp->acl_type = ACE_T;
+ aclp->acl_entry_size = sizeof (ace_t);
+ } else {
+ aclp->acl_type = ACLENT_T;
+ aclp->acl_entry_size = sizeof (aclent_t);
+ }
+ return (0);
+
+out:
+
+#if !defined(_KERNEL)
+ errno = error;
+ return (-1);
+#else
+ return (error);
+#endif
+}
+#endif /* !_KERNEL */
+
+#define SET_ACE(acl, index, who, mask, type, flags) { \
+ acl[0][index].a_who = (uint32_t)who; \
+ acl[0][index].a_type = type; \
+ acl[0][index].a_flags = flags; \
+ acl[0][index++].a_access_mask = mask; \
+}
+
+void
+acl_trivial_access_masks(mode_t mode, boolean_t isdir, trivial_acl_t *masks)
+{
+ uint32_t read_mask = ACE_READ_DATA;
+ uint32_t write_mask = ACE_WRITE_DATA|ACE_APPEND_DATA;
+ uint32_t execute_mask = ACE_EXECUTE;
+
+ (void) isdir; /* will need this later */
+
+ masks->deny1 = 0;
+ if (!(mode & S_IRUSR) && (mode & (S_IRGRP|S_IROTH)))
+ masks->deny1 |= read_mask;
+ if (!(mode & S_IWUSR) && (mode & (S_IWGRP|S_IWOTH)))
+ masks->deny1 |= write_mask;
+ if (!(mode & S_IXUSR) && (mode & (S_IXGRP|S_IXOTH)))
+ masks->deny1 |= execute_mask;
+
+ masks->deny2 = 0;
+ if (!(mode & S_IRGRP) && (mode & S_IROTH))
+ masks->deny2 |= read_mask;
+ if (!(mode & S_IWGRP) && (mode & S_IWOTH))
+ masks->deny2 |= write_mask;
+ if (!(mode & S_IXGRP) && (mode & S_IXOTH))
+ masks->deny2 |= execute_mask;
+
+ masks->allow0 = 0;
+ if ((mode & S_IRUSR) && (!(mode & S_IRGRP) && (mode & S_IROTH)))
+ masks->allow0 |= read_mask;
+ if ((mode & S_IWUSR) && (!(mode & S_IWGRP) && (mode & S_IWOTH)))
+ masks->allow0 |= write_mask;
+ if ((mode & S_IXUSR) && (!(mode & S_IXGRP) && (mode & S_IXOTH)))
+ masks->allow0 |= execute_mask;
+
+ masks->owner = ACE_WRITE_ATTRIBUTES|ACE_WRITE_OWNER|ACE_WRITE_ACL|
+ ACE_WRITE_NAMED_ATTRS|ACE_READ_ACL|ACE_READ_ATTRIBUTES|
+ ACE_READ_NAMED_ATTRS|ACE_SYNCHRONIZE;
+ if (mode & S_IRUSR)
+ masks->owner |= read_mask;
+ if (mode & S_IWUSR)
+ masks->owner |= write_mask;
+ if (mode & S_IXUSR)
+ masks->owner |= execute_mask;
+
+ masks->group = ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_READ_NAMED_ATTRS|
+ ACE_SYNCHRONIZE;
+ if (mode & S_IRGRP)
+ masks->group |= read_mask;
+ if (mode & S_IWGRP)
+ masks->group |= write_mask;
+ if (mode & S_IXGRP)
+ masks->group |= execute_mask;
+
+ masks->everyone = ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_READ_NAMED_ATTRS|
+ ACE_SYNCHRONIZE;
+ if (mode & S_IROTH)
+ masks->everyone |= read_mask;
+ if (mode & S_IWOTH)
+ masks->everyone |= write_mask;
+ if (mode & S_IXOTH)
+ masks->everyone |= execute_mask;
+}
+
+int
+acl_trivial_create(mode_t mode, boolean_t isdir, ace_t **acl, int *count)
+{
+ int index = 0;
+ int error;
+ trivial_acl_t masks;
+
+ *count = 3;
+ acl_trivial_access_masks(mode, isdir, &masks);
+
+ if (masks.allow0)
+ (*count)++;
+ if (masks.deny1)
+ (*count)++;
+ if (masks.deny2)
+ (*count)++;
+
+ if ((error = cacl_malloc((void **)acl, *count * sizeof (ace_t))) != 0)
+ return (error);
+
+ if (masks.allow0) {
+ SET_ACE(acl, index, -1, masks.allow0,
+ ACE_ACCESS_ALLOWED_ACE_TYPE, ACE_OWNER);
+ }
+ if (masks.deny1) {
+ SET_ACE(acl, index, -1, masks.deny1,
+ ACE_ACCESS_DENIED_ACE_TYPE, ACE_OWNER);
+ }
+ if (masks.deny2) {
+ SET_ACE(acl, index, -1, masks.deny2,
+ ACE_ACCESS_DENIED_ACE_TYPE, ACE_GROUP|ACE_IDENTIFIER_GROUP);
+ }
+
+ SET_ACE(acl, index, -1, masks.owner, ACE_ACCESS_ALLOWED_ACE_TYPE,
+ ACE_OWNER);
+ SET_ACE(acl, index, -1, masks.group, ACE_ACCESS_ALLOWED_ACE_TYPE,
+ ACE_IDENTIFIER_GROUP|ACE_GROUP);
+ SET_ACE(acl, index, -1, masks.everyone, ACE_ACCESS_ALLOWED_ACE_TYPE,
+ ACE_EVERYONE);
+
+ return (0);
+}
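A short sketch of how a caller might use acl_trivial_create() for a plain 0644 file; for that mode none of the allow0/deny1/deny2 masks are set, so exactly the three owner@/group@/everyone@ entries come back. The helper name is invented for illustration, and cacl_free() is assumed to be the matching release routine from acl_common.h:

static int
trivial_acl_example(void)
{
	ace_t *acl;
	int count, error;

	error = acl_trivial_create(0644, B_FALSE, &acl, &count);
	if (error != 0)
		return (error);

	ASSERT(count == 3);	/* owner@, group@, everyone@ */
	/* ... consume acl[0 .. count - 1] ... */

	cacl_free(acl, count * sizeof (ace_t));
	return (0);
}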
+
+/*
+ * ace_trivial:
+ * determine whether an ace_t acl is trivial
+ *
+ * An ACL is trivial when it is composed of only owner@, group@, and
+ * everyone@ entries, it does not deny read_acl anywhere, and
+ * write_owner/write_acl/write_attributes appear only on the owner@
+ * entry.
+ */
+int
+ace_trivial_common(void *acep, int aclcnt,
+ uint64_t (*walk)(void *, uint64_t, int aclcnt,
+ uint16_t *, uint16_t *, uint32_t *))
+{
+ uint16_t flags;
+ uint32_t mask;
+ uint16_t type;
+ uint64_t cookie = 0;
+
+ while ((cookie = walk(acep, cookie, aclcnt, &flags, &type, &mask))) {
+ switch (flags & ACE_TYPE_FLAGS) {
+ case ACE_OWNER:
+ case ACE_GROUP|ACE_IDENTIFIER_GROUP:
+ case ACE_EVERYONE:
+ break;
+ default:
+ return (1);
+
+ }
+
+ if (flags & (ACE_FILE_INHERIT_ACE|
+ ACE_DIRECTORY_INHERIT_ACE|ACE_NO_PROPAGATE_INHERIT_ACE|
+ ACE_INHERIT_ONLY_ACE))
+ return (1);
+
+ /*
+ * Special check for some special bits
+ *
+		 * Don't allow anybody to deny reading basic
+		 * attributes or a file's ACL.
+ */
+ if ((mask & (ACE_READ_ACL|ACE_READ_ATTRIBUTES)) &&
+ (type == ACE_ACCESS_DENIED_ACE_TYPE))
+ return (1);
+
+ /*
+ * Delete permissions are never set by default
+ */
+ if (mask & (ACE_DELETE|ACE_DELETE_CHILD))
+ return (1);
+ /*
+ * only allow owner@ to have
+ * write_acl/write_owner/write_attributes/write_xattr/
+ */
+ if (type == ACE_ACCESS_ALLOWED_ACE_TYPE &&
+ (!(flags & ACE_OWNER) && (mask &
+ (ACE_WRITE_OWNER|ACE_WRITE_ACL| ACE_WRITE_ATTRIBUTES|
+ ACE_WRITE_NAMED_ATTRS))))
+ return (1);
+
+ }
+ return (0);
+}
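The walk callback reports one ACE per call and returns a non-zero cookie until the list is exhausted, at which point it returns 0. The real callers pass ZFS-specific walkers; the sketch below is a purely illustrative walker over a flat ace_t array:

static uint64_t
ace_array_walk(void *datap, uint64_t cookie, int aclcnt,
    uint16_t *flags, uint16_t *type, uint32_t *mask)
{
	ace_t *acep = datap;

	/* The incoming cookie is the index of the entry to report. */
	if (cookie >= (uint64_t)aclcnt)
		return (0);

	*flags = acep[cookie].a_flags;
	*type = acep[cookie].a_type;
	*mask = acep[cookie].a_access_mask;

	/* A non-zero return keeps ace_trivial_common() walking; 0 stops it. */
	return (cookie + 1);
}

/* Usage: trivial = (ace_trivial_common(acep, aclcnt, ace_array_walk) == 0); */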
diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/callb.c b/sys/contrib/openzfs/module/os/freebsd/spl/callb.c
new file mode 100644
index 000000000000..fffa85b6b91b
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/spl/callb.c
@@ -0,0 +1,373 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/time.h>
+#include <sys/sysmacros.h>
+#include <sys/systm.h>
+#include <sys/proc.h>
+#include <sys/mutex.h>
+#include <sys/condvar.h>
+#include <sys/callb.h>
+#include <sys/kmem.h>
+#include <sys/cmn_err.h>
+#include <sys/debug.h>
+#include <sys/kobj.h>
+#include <sys/systm.h> /* for delay() */
+#include <sys/taskq.h> /* For TASKQ_NAMELEN */
+#include <sys/kernel.h>
+
+#define CB_MAXNAME TASKQ_NAMELEN
+
+/*
+ * The callb mechanism provides generic event scheduling/echoing.
+ * A callb function is registered and called on behalf of the event.
+ */
+typedef struct callb {
+ struct callb *c_next; /* next in class or on freelist */
+ kthread_id_t c_thread; /* ptr to caller's thread struct */
+ char c_flag; /* info about the callb state */
+ uchar_t c_class; /* this callb's class */
+ kcondvar_t c_done_cv; /* signal callb completion */
+ boolean_t (*c_func)(void *, int);
+ /* cb function: returns true if ok */
+ void *c_arg; /* arg to c_func */
+ char c_name[CB_MAXNAME+1]; /* debug:max func name length */
+} callb_t;
+
+/*
+ * callb c_flag bitmap definitions
+ */
+#define CALLB_FREE 0x0
+#define CALLB_TAKEN 0x1
+#define CALLB_EXECUTING 0x2
+
+/*
+ * Basic structure for a callb table.
+ * All callbs are organized into class groups described by the
+ * ct_first_cb array.
+ * The callbs within a class are singly linked and normally run
+ * serially.
+ */
+typedef struct callb_table {
+ kmutex_t ct_lock; /* protect all callb states */
+ callb_t *ct_freelist; /* free callb structures */
+ int ct_busy; /* != 0 prevents additions */
+ kcondvar_t ct_busy_cv; /* to wait for not busy */
+ int ct_ncallb; /* num of callbs allocated */
+ callb_t *ct_first_cb[NCBCLASS]; /* ptr to 1st callb in a class */
+} callb_table_t;
+
+int callb_timeout_sec = CPR_KTHREAD_TIMEOUT_SEC;
+
+static callb_id_t callb_add_common(boolean_t (*)(void *, int),
+ void *, int, char *, kthread_id_t);
+
+static callb_table_t callb_table; /* system level callback table */
+static callb_table_t *ct = &callb_table;
+static kmutex_t callb_safe_mutex;
+callb_cpr_t callb_cprinfo_safe = {
+ &callb_safe_mutex, CALLB_CPR_ALWAYS_SAFE, 0, {0, 0} };
+
+/*
+ * Init all callb tables in the system.
+ */
+static void
+callb_init(void *dummy __unused)
+{
+ callb_table.ct_busy = 0; /* mark table open for additions */
+ mutex_init(&callb_safe_mutex, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&callb_table.ct_lock, NULL, MUTEX_DEFAULT, NULL);
+}
+
+static void
+callb_fini(void *dummy __unused)
+{
+ callb_t *cp;
+ int i;
+
+ mutex_enter(&ct->ct_lock);
+ for (i = 0; i < 16; i++) {
+ while ((cp = ct->ct_freelist) != NULL) {
+ ct->ct_freelist = cp->c_next;
+ ct->ct_ncallb--;
+ kmem_free(cp, sizeof (callb_t));
+ }
+ if (ct->ct_ncallb == 0)
+ break;
+ /* Not all callbacks finished, waiting for the rest. */
+ mutex_exit(&ct->ct_lock);
+ tsleep(ct, 0, "callb", hz / 4);
+ mutex_enter(&ct->ct_lock);
+ }
+ if (ct->ct_ncallb > 0)
+ printf("%s: Leaked %d callbacks!\n", __func__, ct->ct_ncallb);
+ mutex_exit(&ct->ct_lock);
+ mutex_destroy(&callb_safe_mutex);
+ mutex_destroy(&callb_table.ct_lock);
+}
+
+/*
+ * callb_add_common() is called to register func() to be called later.
+ */
+static callb_id_t
+callb_add_common(boolean_t (*func)(void *arg, int code),
+ void *arg, int class, char *name, kthread_id_t t)
+{
+ callb_t *cp;
+
+ ASSERT(class < NCBCLASS);
+
+ mutex_enter(&ct->ct_lock);
+ while (ct->ct_busy)
+ cv_wait(&ct->ct_busy_cv, &ct->ct_lock);
+ if ((cp = ct->ct_freelist) == NULL) {
+ ct->ct_ncallb++;
+ cp = (callb_t *)kmem_zalloc(sizeof (callb_t), KM_SLEEP);
+ }
+ ct->ct_freelist = cp->c_next;
+ cp->c_thread = t;
+ cp->c_func = func;
+ cp->c_arg = arg;
+ cp->c_class = (uchar_t)class;
+ cp->c_flag |= CALLB_TAKEN;
+#ifdef ZFS_DEBUG
+ if (strlen(name) > CB_MAXNAME)
+ cmn_err(CE_WARN, "callb_add: name of callback function '%s' "
+ "too long -- truncated to %d chars",
+ name, CB_MAXNAME);
+#endif
+ (void) strncpy(cp->c_name, name, CB_MAXNAME);
+ cp->c_name[CB_MAXNAME] = '\0';
+
+ /*
+ * Insert the new callb at the head of its class list.
+ */
+ cp->c_next = ct->ct_first_cb[class];
+ ct->ct_first_cb[class] = cp;
+
+ mutex_exit(&ct->ct_lock);
+ return ((callb_id_t)cp);
+}
+
+/*
+ * The default function to add an entry to the callback table. Since
+ * it uses curthread as the thread identifier to store in the table,
+ * it should be used for the normal case of a thread which is calling
+ * to add ITSELF to the table.
+ */
+callb_id_t
+callb_add(boolean_t (*func)(void *arg, int code),
+ void *arg, int class, char *name)
+{
+ return (callb_add_common(func, arg, class, name, curthread));
+}
+
+/*
+ * A special version of callb_add() above for use by threads which
+ * might be adding an entry to the table on behalf of some other
+ * thread (for example, one which is constructed but not yet running).
+ * In this version the thread id is an argument.
+ */
+callb_id_t
+callb_add_thread(boolean_t (*func)(void *arg, int code),
+ void *arg, int class, char *name, kthread_id_t t)
+{
+ return (callb_add_common(func, arg, class, name, t));
+}
+
+/*
+ * callb_delete() is called to remove an entry identified by id
+ * that was originally placed there by a call to callb_add().
+ * Returns -1 if it fails to delete the callb entry, otherwise 0.
+ */
+int
+callb_delete(callb_id_t id)
+{
+ callb_t **pp;
+ callb_t *me = (callb_t *)id;
+
+ mutex_enter(&ct->ct_lock);
+
+ for (;;) {
+ pp = &ct->ct_first_cb[me->c_class];
+ while (*pp != NULL && *pp != me)
+ pp = &(*pp)->c_next;
+
+#ifdef ZFS_DEBUG
+ if (*pp != me) {
+ cmn_err(CE_WARN, "callb delete bogus entry 0x%p",
+ (void *)me);
+ mutex_exit(&ct->ct_lock);
+ return (-1);
+ }
+#endif /* ZFS_DEBUG */
+
+ /*
+		 * A callb must not be deleted while it is executing;
+		 * otherwise callb_execute_class() would get confused.
+ */
+ if (!(me->c_flag & CALLB_EXECUTING))
+ break;
+
+ cv_wait(&me->c_done_cv, &ct->ct_lock);
+ }
+ /* relink the class list */
+ *pp = me->c_next;
+
+ /* clean up myself and return the free callb to the head of freelist */
+ me->c_flag = CALLB_FREE;
+ me->c_next = ct->ct_freelist;
+ ct->ct_freelist = me;
+
+ mutex_exit(&ct->ct_lock);
+ return (0);
+}
+
+/*
+ * class:	execute all callbs in this class;
+ * code: optional argument for the callb functions.
+ * return: = 0: success
+ * != 0: ptr to string supplied when callback was registered
+ */
+void *
+callb_execute_class(int class, int code)
+{
+ callb_t *cp;
+ void *ret = NULL;
+
+ ASSERT(class < NCBCLASS);
+
+ mutex_enter(&ct->ct_lock);
+
+ for (cp = ct->ct_first_cb[class];
+ cp != NULL && ret == 0; cp = cp->c_next) {
+ while (cp->c_flag & CALLB_EXECUTING)
+ cv_wait(&cp->c_done_cv, &ct->ct_lock);
+ /*
+		 * Continue if the callb was deleted while we were sleeping.
+ */
+ if (cp->c_flag == CALLB_FREE)
+ continue;
+ cp->c_flag |= CALLB_EXECUTING;
+
+#ifdef CALLB_DEBUG
+ printf("callb_execute: name=%s func=%p arg=%p\n",
+ cp->c_name, (void *)cp->c_func, (void *)cp->c_arg);
+#endif /* CALLB_DEBUG */
+
+ mutex_exit(&ct->ct_lock);
+ /* If callback function fails, pass back client's name */
+ if (!(*cp->c_func)(cp->c_arg, code))
+ ret = cp->c_name;
+ mutex_enter(&ct->ct_lock);
+
+ cp->c_flag &= ~CALLB_EXECUTING;
+ cv_broadcast(&cp->c_done_cv);
+ }
+ mutex_exit(&ct->ct_lock);
+ return (ret);
+}
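Putting the registration, execution, and removal entry points together: a subsystem registers a handler, some other context later fires every callb in that class, and the handler is removed on teardown. This is a hedged sketch; the handler, its argument, and the choice of the CB_CL_CPR_DAEMON class and CB_CODE_CPR_CHKPT code (both from sys/callb.h) are illustrative:

static boolean_t
my_handler(void *arg, int code)
{
	/* Return B_TRUE on success; B_FALSE makes the class walk report c_name. */
	return (B_TRUE);
}

static void
callb_roundtrip(void *arg)
{
	callb_id_t id;
	void *failed;

	id = callb_add(my_handler, arg, CB_CL_CPR_DAEMON, "my_handler");

	/* Typically run from a different context, e.g. suspend/checkpoint. */
	failed = callb_execute_class(CB_CL_CPR_DAEMON, CB_CODE_CPR_CHKPT);
	if (failed != NULL)
		cmn_err(CE_WARN, "callback %s failed", (char *)failed);

	(void) callb_delete(id);
}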
+
+/*
+ * Callers must make sure there are no recursive entries to this function.
+ * cp->cc_lockp is registered by callb_add to protect the callb_cpr_t structure.
+ *
+ * When calling to stop a kernel thread (code == CB_CODE_CPR_CHKPT) we
+ * use a cv_timedwait() in case the kernel thread is blocked.
+ *
+ * Note that this is a generic callback handler for daemon CPR and
+ * should NOT be changed to accommodate any specific requirement in a daemon.
+ * Individual daemons that require changes to the handler shall write
+ * callback routines in their own daemon modules.
+ */
+boolean_t
+callb_generic_cpr(void *arg, int code)
+{
+ callb_cpr_t *cp = (callb_cpr_t *)arg;
+ clock_t ret = 0; /* assume success */
+
+ mutex_enter(cp->cc_lockp);
+
+ switch (code) {
+ case CB_CODE_CPR_CHKPT:
+ cp->cc_events |= CALLB_CPR_START;
+#ifdef CPR_NOT_THREAD_SAFE
+ while (!(cp->cc_events & CALLB_CPR_SAFE))
+ /* cv_timedwait() returns -1 if it times out. */
+ if ((ret = cv_reltimedwait(&cp->cc_callb_cv,
+ cp->cc_lockp, (callb_timeout_sec * hz),
+ TR_CLOCK_TICK)) == -1)
+ break;
+#endif
+ break;
+
+ case CB_CODE_CPR_RESUME:
+ cp->cc_events &= ~CALLB_CPR_START;
+ cv_signal(&cp->cc_stop_cv);
+ break;
+ }
+ mutex_exit(cp->cc_lockp);
+ return (ret != -1);
+}
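callb_generic_cpr() is rarely called directly; long-running kernel threads reach it through the CALLB_CPR_* macros in sys/callb.h, which register it via callb_add() and toggle the CALLB_CPR_SAFE state around sleep points. A condensed sketch of that pattern (the thread body, lock, and names are illustrative):

static void
my_daemon_thread(void *arg)
{
	callb_cpr_t cpr;
	kmutex_t lock;
	kcondvar_t cv;

	mutex_init(&lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&cv, NULL, CV_DEFAULT, NULL);

	/* Registers callb_generic_cpr for this thread. */
	CALLB_CPR_INIT(&cpr, &lock, callb_generic_cpr, "my_daemon");

	mutex_enter(&lock);
	for (;;) {
		/* Safe to checkpoint while waiting for work. */
		CALLB_CPR_SAFE_BEGIN(&cpr);
		cv_wait(&cv, &lock);
		CALLB_CPR_SAFE_END(&cpr, &lock);

		/* ... handle the wakeup with the lock held ... */
		break;
	}
	/* Unregisters the callback and drops the lock. */
	CALLB_CPR_EXIT(&cpr);

	cv_destroy(&cv);
	mutex_destroy(&lock);
}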
+
+/*
+ * The generic callback function associated with kernel threads which
+ * are always considered safe.
+ */
+/* ARGSUSED */
+boolean_t
+callb_generic_cpr_safe(void *arg, int code)
+{
+ return (B_TRUE);
+}
+/*
+ * Prevent additions to callback table.
+ */
+void
+callb_lock_table(void)
+{
+ mutex_enter(&ct->ct_lock);
+ ASSERT(ct->ct_busy == 0);
+ ct->ct_busy = 1;
+ mutex_exit(&ct->ct_lock);
+}
+
+/*
+ * Allow additions to callback table.
+ */
+void
+callb_unlock_table(void)
+{
+ mutex_enter(&ct->ct_lock);
+ ASSERT(ct->ct_busy != 0);
+ ct->ct_busy = 0;
+ cv_broadcast(&ct->ct_busy_cv);
+ mutex_exit(&ct->ct_lock);
+}
+
+SYSINIT(sol_callb, SI_SUB_DRIVERS, SI_ORDER_FIRST, callb_init, NULL);
+SYSUNINIT(sol_callb, SI_SUB_DRIVERS, SI_ORDER_FIRST, callb_fini, NULL);
diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/list.c b/sys/contrib/openzfs/module/os/freebsd/spl/list.c
new file mode 100644
index 000000000000..0f5ae629126c
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/spl/list.c
@@ -0,0 +1,244 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * Generic doubly-linked list implementation
+ */
+
+#include <sys/param.h>
+#include <sys/list.h>
+#include <sys/list_impl.h>
+#include <sys/types.h>
+#include <sys/debug.h>
+
+#define list_d2l(a, obj) ((list_node_t *)(((char *)obj) + (a)->list_offset))
+#define list_object(a, node) ((void *)(((char *)node) - (a)->list_offset))
+#define list_empty(a) ((a)->list_head.list_next == &(a)->list_head)
+
+#define list_insert_after_node(list, node, object) { \
+ list_node_t *lnew = list_d2l(list, object); \
+ lnew->list_prev = (node); \
+ lnew->list_next = (node)->list_next; \
+ (node)->list_next->list_prev = lnew; \
+ (node)->list_next = lnew; \
+}
+
+#define list_insert_before_node(list, node, object) { \
+ list_node_t *lnew = list_d2l(list, object); \
+ lnew->list_next = (node); \
+ lnew->list_prev = (node)->list_prev; \
+ (node)->list_prev->list_next = lnew; \
+ (node)->list_prev = lnew; \
+}
+
+#define list_remove_node(node) \
+ (node)->list_prev->list_next = (node)->list_next; \
+ (node)->list_next->list_prev = (node)->list_prev; \
+ (node)->list_next = (node)->list_prev = NULL
+
+void
+list_create(list_t *list, size_t size, size_t offset)
+{
+ ASSERT(list);
+ ASSERT(size > 0);
+ ASSERT(size >= offset + sizeof (list_node_t));
+
+ list->list_size = size;
+ list->list_offset = offset;
+ list->list_head.list_next = list->list_head.list_prev =
+ &list->list_head;
+}
+
+void
+list_destroy(list_t *list)
+{
+ list_node_t *node = &list->list_head;
+
+ ASSERT(list);
+ ASSERT(list->list_head.list_next == node);
+ ASSERT(list->list_head.list_prev == node);
+
+ node->list_next = node->list_prev = NULL;
+}
+
+void
+list_insert_after(list_t *list, void *object, void *nobject)
+{
+ if (object == NULL) {
+ list_insert_head(list, nobject);
+ } else {
+ list_node_t *lold = list_d2l(list, object);
+ list_insert_after_node(list, lold, nobject);
+ }
+}
+
+void
+list_insert_before(list_t *list, void *object, void *nobject)
+{
+ if (object == NULL) {
+ list_insert_tail(list, nobject);
+ } else {
+ list_node_t *lold = list_d2l(list, object);
+ list_insert_before_node(list, lold, nobject);
+ }
+}
+
+void
+list_insert_head(list_t *list, void *object)
+{
+ list_node_t *lold = &list->list_head;
+ list_insert_after_node(list, lold, object);
+}
+
+void
+list_insert_tail(list_t *list, void *object)
+{
+ list_node_t *lold = &list->list_head;
+ list_insert_before_node(list, lold, object);
+}
+
+void
+list_remove(list_t *list, void *object)
+{
+ list_node_t *lold = list_d2l(list, object);
+ ASSERT(!list_empty(list));
+ ASSERT(lold->list_next != NULL);
+ list_remove_node(lold);
+}
+
+void *
+list_remove_head(list_t *list)
+{
+ list_node_t *head = list->list_head.list_next;
+ if (head == &list->list_head)
+ return (NULL);
+ list_remove_node(head);
+ return (list_object(list, head));
+}
+
+void *
+list_remove_tail(list_t *list)
+{
+ list_node_t *tail = list->list_head.list_prev;
+ if (tail == &list->list_head)
+ return (NULL);
+ list_remove_node(tail);
+ return (list_object(list, tail));
+}
+
+void *
+list_head(list_t *list)
+{
+ if (list_empty(list))
+ return (NULL);
+ return (list_object(list, list->list_head.list_next));
+}
+
+void *
+list_tail(list_t *list)
+{
+ if (list_empty(list))
+ return (NULL);
+ return (list_object(list, list->list_head.list_prev));
+}
+
+void *
+list_next(list_t *list, void *object)
+{
+ list_node_t *node = list_d2l(list, object);
+
+ if (node->list_next != &list->list_head)
+ return (list_object(list, node->list_next));
+
+ return (NULL);
+}
+
+void *
+list_prev(list_t *list, void *object)
+{
+ list_node_t *node = list_d2l(list, object);
+
+ if (node->list_prev != &list->list_head)
+ return (list_object(list, node->list_prev));
+
+ return (NULL);
+}
+
+/*
+ * Append the src list to the tail of the dst list, leaving src empty.
+ */
+void
+list_move_tail(list_t *dst, list_t *src)
+{
+ list_node_t *dstnode = &dst->list_head;
+ list_node_t *srcnode = &src->list_head;
+
+ ASSERT(dst->list_size == src->list_size);
+ ASSERT(dst->list_offset == src->list_offset);
+
+ if (list_empty(src))
+ return;
+
+ dstnode->list_prev->list_next = srcnode->list_next;
+ srcnode->list_next->list_prev = dstnode->list_prev;
+ dstnode->list_prev = srcnode->list_prev;
+ srcnode->list_prev->list_next = dstnode;
+
+ /* empty src list */
+ srcnode->list_next = srcnode->list_prev = srcnode;
+}
+
+void
+list_link_replace(list_node_t *lold, list_node_t *lnew)
+{
+ ASSERT(list_link_active(lold));
+ ASSERT(!list_link_active(lnew));
+
+ lnew->list_next = lold->list_next;
+ lnew->list_prev = lold->list_prev;
+ lold->list_prev->list_next = lnew;
+ lold->list_next->list_prev = lnew;
+ lold->list_next = lold->list_prev = NULL;
+}
+
+void
+list_link_init(list_node_t *link)
+{
+ link->list_next = NULL;
+ link->list_prev = NULL;
+}
+
+int
+list_link_active(list_node_t *link)
+{
+ EQUIV(link->list_next == NULL, link->list_prev == NULL);
+ return (link->list_next != NULL);
+}
+
+int
+list_is_empty(list_t *list)
+{
+ return (list_empty(list));
+}
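Editorial note, not part of the patch: the list functions above implement the usual illumos-style intrusive list, where each element embeds a list_node_t and list_create() records its offset. A minimal user-space sketch of how callers typically use this API; the element type my_elem_t and its fields are hypothetical, and the list.h declarations from this change are assumed to be in scope.

#include <stddef.h>	/* offsetof */

typedef struct my_elem {
	int		me_value;	/* hypothetical payload */
	list_node_t	me_node;	/* embedded linkage */
} my_elem_t;

static void
example(void)
{
	list_t l;
	my_elem_t a, b, *e;

	/* Size of the element plus the offset of its embedded node. */
	list_create(&l, sizeof (my_elem_t), offsetof(my_elem_t, me_node));
	list_link_init(&a.me_node);
	list_link_init(&b.me_node);

	list_insert_tail(&l, &a);
	list_insert_tail(&l, &b);

	/* Walk the list head to tail. */
	for (e = list_head(&l); e != NULL; e = list_next(&l, e))
		(void) e->me_value;

	/* The list must be emptied before list_destroy(). */
	while ((e = list_remove_head(&l)) != NULL)
		;
	list_destroy(&l);
}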
diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/sha224.h b/sys/contrib/openzfs/module/os/freebsd/spl/sha224.h
new file mode 100644
index 000000000000..0abd43068708
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/spl/sha224.h
@@ -0,0 +1,96 @@
+/*
+ * Copyright 2005 Colin Percival
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _SHA224_H_
+#define _SHA224_H_
+
+#ifndef _KERNEL
+#include <sys/types.h>
+#endif
+
+#define SHA224_BLOCK_LENGTH 64
+#define SHA224_DIGEST_LENGTH 28
+#define SHA224_DIGEST_STRING_LENGTH (SHA224_DIGEST_LENGTH * 2 + 1)
+
+typedef struct SHA224Context {
+ uint32_t state[8];
+ uint64_t count;
+ uint8_t buf[SHA224_BLOCK_LENGTH];
+} SHA224_CTX;
+
+__BEGIN_DECLS
+
+/* Ensure libmd symbols do not clash with libcrypto */
+
+#ifndef SHA224_Init
+#define SHA224_Init _libmd_SHA224_Init
+#endif
+#ifndef SHA224_Update
+#define SHA224_Update _libmd_SHA224_Update
+#endif
+#ifndef SHA224_Final
+#define SHA224_Final _libmd_SHA224_Final
+#endif
+#ifndef SHA224_End
+#define SHA224_End _libmd_SHA224_End
+#endif
+#ifndef SHA224_Fd
+#define SHA224_Fd _libmd_SHA224_Fd
+#endif
+#ifndef SHA224_FdChunk
+#define SHA224_FdChunk _libmd_SHA224_FdChunk
+#endif
+#ifndef SHA224_File
+#define SHA224_File _libmd_SHA224_File
+#endif
+#ifndef SHA224_FileChunk
+#define SHA224_FileChunk _libmd_SHA224_FileChunk
+#endif
+#ifndef SHA224_Data
+#define SHA224_Data _libmd_SHA224_Data
+#endif
+
+#ifndef SHA224_version
+#define SHA224_version _libmd_SHA224_version
+#endif
+
+void SHA224_Init(SHA224_CTX *);
+void SHA224_Update(SHA224_CTX *, const void *, size_t);
+void SHA224_Final(unsigned char [__min_size(SHA224_DIGEST_LENGTH)],
+ SHA224_CTX *);
+#ifndef _KERNEL
+char *SHA224_End(SHA224_CTX *, char *);
+char *SHA224_Data(const void *, unsigned int, char *);
+char *SHA224_Fd(int, char *);
+char *SHA224_FdChunk(int, char *, off_t, off_t);
+char *SHA224_File(const char *, char *);
+char *SHA224_FileChunk(const char *, char *, off_t, off_t);
+#endif
+__END_DECLS
+
+#endif /* !_SHA224_H_ */
diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/sha256.h b/sys/contrib/openzfs/module/os/freebsd/spl/sha256.h
new file mode 100644
index 000000000000..193c0c025120
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/spl/sha256.h
@@ -0,0 +1,99 @@
+/*
+ * Copyright 2005 Colin Percival
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _SHA256_H_
+#define _SHA256_H_
+
+#ifndef _KERNEL
+#include <sys/types.h>
+#endif
+
+#define SHA256_BLOCK_LENGTH 64
+#define SHA256_DIGEST_LENGTH 32
+#define SHA256_DIGEST_STRING_LENGTH (SHA256_DIGEST_LENGTH * 2 + 1)
+
+typedef struct SHA256Context {
+ uint32_t state[8];
+ uint64_t count;
+ uint8_t buf[SHA256_BLOCK_LENGTH];
+} SHA256_CTX;
+
+__BEGIN_DECLS
+
+/* Ensure libmd symbols do not clash with libcrypto */
+
+#ifndef SHA256_Init
+#define SHA256_Init _libmd_SHA256_Init
+#endif
+#ifndef SHA256_Update
+#define SHA256_Update _libmd_SHA256_Update
+#endif
+#ifndef SHA256_Final
+#define SHA256_Final _libmd_SHA256_Final
+#endif
+#ifndef SHA256_End
+#define SHA256_End _libmd_SHA256_End
+#endif
+#ifndef SHA256_Fd
+#define SHA256_Fd _libmd_SHA256_Fd
+#endif
+#ifndef SHA256_FdChunk
+#define SHA256_FdChunk _libmd_SHA256_FdChunk
+#endif
+#ifndef SHA256_File
+#define SHA256_File _libmd_SHA256_File
+#endif
+#ifndef SHA256_FileChunk
+#define SHA256_FileChunk _libmd_SHA256_FileChunk
+#endif
+#ifndef SHA256_Data
+#define SHA256_Data _libmd_SHA256_Data
+#endif
+
+#ifndef SHA256_Transform
+#define SHA256_Transform _libmd_SHA256_Transform
+#endif
+#ifndef SHA256_version
+#define SHA256_version _libmd_SHA256_version
+#endif
+
+void SHA256_Init(SHA256_CTX *);
+void SHA256_Update(SHA256_CTX *, const void *, size_t);
+void SHA256_Final(unsigned char [__min_size(SHA256_DIGEST_LENGTH)],
+ SHA256_CTX *);
+#ifndef _KERNEL
+char *SHA256_End(SHA256_CTX *, char *);
+char *SHA256_Data(const void *, unsigned int, char *);
+char *SHA256_Fd(int, char *);
+char *SHA256_FdChunk(int, char *, off_t, off_t);
+char *SHA256_File(const char *, char *);
+char *SHA256_FileChunk(const char *, char *, off_t, off_t);
+#endif
+__END_DECLS
+
+#endif /* !_SHA256_H_ */
diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/sha256c.c b/sys/contrib/openzfs/module/os/freebsd/spl/sha256c.c
new file mode 100644
index 000000000000..241cf8c9ae76
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/spl/sha256c.c
@@ -0,0 +1,378 @@
+/*
+ * Copyright 2005 Colin Percival
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+
+#ifdef _KERNEL
+#include <sys/systm.h>
+#else
+#include <string.h>
+#endif
+
+
+#include <sys/byteorder.h>
+#include <sys/endian.h>
+#include "sha224.h"
+#include "sha256.h"
+
+#if BYTE_ORDER == BIG_ENDIAN
+
+/* Copy a vector of big-endian uint32_t into a vector of bytes */
+#define be32enc_vect(dst, src, len) \
+ memcpy((void *)dst, (const void *)src, (size_t)len)
+
+/* Copy a vector of bytes into a vector of big-endian uint32_t */
+#define be32dec_vect(dst, src, len) \
+ memcpy((void *)dst, (const void *)src, (size_t)len)
+
+#else /* BYTE_ORDER != BIG_ENDIAN */
+
+/*
+ * Encode a length len/4 vector of (uint32_t) into a length len vector of
+ * (unsigned char) in big-endian form. Assumes len is a multiple of 4.
+ */
+static void
+be32enc_vect(unsigned char *dst, const uint32_t *src, size_t len)
+{
+ size_t i;
+
+ for (i = 0; i < len / 4; i++)
+ be32enc(dst + i * 4, src[i]);
+}
+
+/*
+ * Decode a big-endian length len vector of (unsigned char) into a length
+ * len/4 vector of (uint32_t). Assumes len is a multiple of 4.
+ */
+static void
+be32dec_vect(uint32_t *dst, const unsigned char *src, size_t len)
+{
+ size_t i;
+
+ for (i = 0; i < len / 4; i++)
+ dst[i] = be32dec(src + i * 4);
+}
+
+#endif /* BYTE_ORDER != BIG_ENDIAN */
+
+/* SHA256 round constants. */
+static const uint32_t K[64] = {
+ 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
+ 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
+ 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
+ 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
+ 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
+ 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
+ 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
+ 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
+ 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
+ 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
+ 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
+ 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
+ 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
+ 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
+ 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
+ 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
+};
+
+/* Elementary functions used by SHA256 */
+#define Ch(x, y, z) ((x & (y ^ z)) ^ z)
+#define Maj(x, y, z) ((x & (y | z)) | (y & z))
+#define SHR(x, n) (x >> n)
+#define ROTR(x, n) ((x >> n) | (x << (32 - n)))
+#define S0(x) (ROTR(x, 2) ^ ROTR(x, 13) ^ ROTR(x, 22))
+#define S1(x) (ROTR(x, 6) ^ ROTR(x, 11) ^ ROTR(x, 25))
+#define s0(x) (ROTR(x, 7) ^ ROTR(x, 18) ^ SHR(x, 3))
+#define s1(x) (ROTR(x, 17) ^ ROTR(x, 19) ^ SHR(x, 10))
+
+/* SHA256 round function */
+#define RND(a, b, c, d, e, f, g, h, k) \
+ h += S1(e) + Ch(e, f, g) + k; \
+ d += h; \
+ h += S0(a) + Maj(a, b, c);
+
+/* Adjusted round function for rotating state */
+#define RNDr(S, W, i, ii) \
+ RND(S[(64 - i) % 8], S[(65 - i) % 8], \
+ S[(66 - i) % 8], S[(67 - i) % 8], \
+ S[(68 - i) % 8], S[(69 - i) % 8], \
+ S[(70 - i) % 8], S[(71 - i) % 8], \
+ W[i + ii] + K[i + ii])
+
+/* Message schedule computation */
+#define MSCH(W, ii, i) \
+ W[i + ii + 16] = s1(W[i + ii + 14]) + W[i + ii + 9] + \
+ s0(W[i + ii + 1]) + W[i + ii]
+
+/*
+ * SHA256 block compression function. The 256-bit state is transformed via
+ * the 512-bit input block to produce a new state.
+ */
+static void
+SHA256_Transform(uint32_t *state, const unsigned char block[64])
+{
+ uint32_t W[64];
+ uint32_t S[8];
+ int i;
+
+ /* 1. Prepare the first part of the message schedule W. */
+ be32dec_vect(W, block, 64);
+
+ /* 2. Initialize working variables. */
+ memcpy(S, state, 32);
+
+ /* 3. Mix. */
+ for (i = 0; i < 64; i += 16) {
+ RNDr(S, W, 0, i);
+ RNDr(S, W, 1, i);
+ RNDr(S, W, 2, i);
+ RNDr(S, W, 3, i);
+ RNDr(S, W, 4, i);
+ RNDr(S, W, 5, i);
+ RNDr(S, W, 6, i);
+ RNDr(S, W, 7, i);
+ RNDr(S, W, 8, i);
+ RNDr(S, W, 9, i);
+ RNDr(S, W, 10, i);
+ RNDr(S, W, 11, i);
+ RNDr(S, W, 12, i);
+ RNDr(S, W, 13, i);
+ RNDr(S, W, 14, i);
+ RNDr(S, W, 15, i);
+
+ if (i == 48)
+ break;
+ MSCH(W, 0, i);
+ MSCH(W, 1, i);
+ MSCH(W, 2, i);
+ MSCH(W, 3, i);
+ MSCH(W, 4, i);
+ MSCH(W, 5, i);
+ MSCH(W, 6, i);
+ MSCH(W, 7, i);
+ MSCH(W, 8, i);
+ MSCH(W, 9, i);
+ MSCH(W, 10, i);
+ MSCH(W, 11, i);
+ MSCH(W, 12, i);
+ MSCH(W, 13, i);
+ MSCH(W, 14, i);
+ MSCH(W, 15, i);
+ }
+
+ /* 4. Mix local working variables into global state */
+ for (i = 0; i < 8; i++)
+ state[i] += S[i];
+}
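Editorial note, not part of the patch: the RNDr macro above avoids shuffling eight working variables every round by rotating which slot of S[] plays the role of a..h. A small stand-alone C sketch that prints this mapping for the first few rounds, assuming nothing beyond standard C:

#include <stdio.h>

int
main(void)
{
	/*
	 * For round i, RNDr assigns role v (0 = 'a' ... 7 = 'h') to
	 * S[(64 - i + v) % 8]; each round the whole assignment rotates
	 * by one slot instead of copying h = g, g = f, and so on.
	 */
	for (int i = 0; i < 4; i++) {
		printf("round %d:", i);
		for (int v = 0; v < 8; v++)
			printf(" %c=S[%d]", 'a' + v, (64 - i + v) % 8);
		printf("\n");
	}
	return (0);
}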
+
+static unsigned char PAD[64] = {
+ 0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+};
+
+/* Add padding and terminating bit-count. */
+static void
+SHA256_Pad(SHA256_CTX * ctx)
+{
+ size_t r;
+
+ /* Figure out how many bytes we have buffered. */
+ r = (ctx->count >> 3) & 0x3f;
+
+ /* Pad to 56 mod 64, transforming if we finish a block en route. */
+ if (r < 56) {
+ /* Pad to 56 mod 64. */
+ memcpy(&ctx->buf[r], PAD, 56 - r);
+ } else {
+ /* Finish the current block and mix. */
+ memcpy(&ctx->buf[r], PAD, 64 - r);
+ SHA256_Transform(ctx->state, ctx->buf);
+
+ /* The start of the final block is all zeroes. */
+ memset(&ctx->buf[0], 0, 56);
+ }
+
+ /* Add the terminating bit-count. */
+ be64enc(&ctx->buf[56], ctx->count);
+
+ /* Mix in the final block. */
+ SHA256_Transform(ctx->state, ctx->buf);
+}
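Editorial note, not part of the patch: SHA256_Pad() above appends the 0x80 marker, zero bytes up to 56 mod 64, and the original bit-count as a 64-bit big-endian value, so the padded length is always a multiple of 64 bytes. A stand-alone sketch of that length arithmetic only, using plain C:

#include <stdio.h>
#include <stdint.h>

/* Total length after padding a message of 'len' bytes, per SHA256_Pad(). */
static uint64_t
padded_len(uint64_t len)
{
	uint64_t r = len % 64;

	/* 0x80 plus zeros to 56 mod 64, then the 8-byte bit count. */
	return (len + (r < 56 ? 56 - r : 120 - r) + 8);
}

int
main(void)
{
	printf("%llu %llu %llu\n",
	    (unsigned long long)padded_len(0),		/* 64 */
	    (unsigned long long)padded_len(55),		/* 64 */
	    (unsigned long long)padded_len(56));	/* 128 */
	return (0);
}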
+
+/* SHA-256 initialization. Begins a SHA-256 operation. */
+void
+SHA256_Init(SHA256_CTX * ctx)
+{
+
+ /* Zero bits processed so far */
+ ctx->count = 0;
+
+ /* Magic initialization constants */
+ ctx->state[0] = 0x6A09E667;
+ ctx->state[1] = 0xBB67AE85;
+ ctx->state[2] = 0x3C6EF372;
+ ctx->state[3] = 0xA54FF53A;
+ ctx->state[4] = 0x510E527F;
+ ctx->state[5] = 0x9B05688C;
+ ctx->state[6] = 0x1F83D9AB;
+ ctx->state[7] = 0x5BE0CD19;
+}
+
+/* Add bytes into the hash */
+void
+SHA256_Update(SHA256_CTX * ctx, const void *in, size_t len)
+{
+ uint64_t bitlen;
+ uint32_t r;
+ const unsigned char *src = in;
+
+ /* Number of bytes left in the buffer from previous updates */
+ r = (ctx->count >> 3) & 0x3f;
+
+ /* Convert the length into a number of bits */
+ bitlen = len << 3;
+
+ /* Update number of bits */
+ ctx->count += bitlen;
+
+ /* Handle the case where we don't need to perform any transforms */
+ if (len < 64 - r) {
+ memcpy(&ctx->buf[r], src, len);
+ return;
+ }
+
+ /* Finish the current block */
+ memcpy(&ctx->buf[r], src, 64 - r);
+ SHA256_Transform(ctx->state, ctx->buf);
+ src += 64 - r;
+ len -= 64 - r;
+
+ /* Perform complete blocks */
+ while (len >= 64) {
+ SHA256_Transform(ctx->state, src);
+ src += 64;
+ len -= 64;
+ }
+
+ /* Copy left over data into buffer */
+ memcpy(ctx->buf, src, len);
+}
+
+/*
+ * SHA-256 finalization. Pads the input data, exports the hash value,
+ * and clears the context state.
+ */
+void
+SHA256_Final(unsigned char digest[static SHA256_DIGEST_LENGTH], SHA256_CTX *ctx)
+{
+
+ /* Add padding */
+ SHA256_Pad(ctx);
+
+ /* Write the hash */
+ be32enc_vect(digest, ctx->state, SHA256_DIGEST_LENGTH);
+
+ /* Clear the context state */
+ explicit_bzero(ctx, sizeof (*ctx));
+}
+
+/* SHA-224: ******************************************************* */
+/*
+ * the SHA224 and SHA256 transforms are identical
+ */
+
+/* SHA-224 initialization. Begins a SHA-224 operation. */
+void
+SHA224_Init(SHA224_CTX * ctx)
+{
+
+ /* Zero bits processed so far */
+ ctx->count = 0;
+
+ /* Magic initialization constants */
+ ctx->state[0] = 0xC1059ED8;
+ ctx->state[1] = 0x367CD507;
+ ctx->state[2] = 0x3070DD17;
+ ctx->state[3] = 0xF70E5939;
+ ctx->state[4] = 0xFFC00B31;
+ ctx->state[5] = 0x68581511;
+ ctx->state[6] = 0x64F98FA7;
+ ctx->state[7] = 0xBEFA4FA4;
+}
+
+/* Add bytes into the SHA-224 hash */
+void
+SHA224_Update(SHA224_CTX * ctx, const void *in, size_t len)
+{
+
+ SHA256_Update((SHA256_CTX *)ctx, in, len);
+}
+
+/*
+ * SHA-224 finalization. Pads the input data, exports the hash value,
+ * and clears the context state.
+ */
+void
+SHA224_Final(unsigned char digest[static SHA224_DIGEST_LENGTH], SHA224_CTX *ctx)
+{
+
+ /* Add padding */
+ SHA256_Pad((SHA256_CTX *)ctx);
+
+ /* Write the hash */
+ be32enc_vect(digest, ctx->state, SHA224_DIGEST_LENGTH);
+
+ /* Clear the context state */
+ explicit_bzero(ctx, sizeof (*ctx));
+}
+
+#ifdef WEAK_REFS
+/*
+ * When building libmd, provide weak references. Note: this is not
+ * activated in the context of compiling these sources for internal
+ * use in libcrypt.
+ */
+#undef SHA256_Init
+__weak_reference(_libmd_SHA256_Init, SHA256_Init);
+#undef SHA256_Update
+__weak_reference(_libmd_SHA256_Update, SHA256_Update);
+#undef SHA256_Final
+__weak_reference(_libmd_SHA256_Final, SHA256_Final);
+#undef SHA256_Transform
+__weak_reference(_libmd_SHA256_Transform, SHA256_Transform);
+
+#undef SHA224_Init
+__weak_reference(_libmd_SHA224_Init, SHA224_Init);
+#undef SHA224_Update
+__weak_reference(_libmd_SHA224_Update, SHA224_Update);
+#undef SHA224_Final
+__weak_reference(_libmd_SHA224_Final, SHA224_Final);
+#endif
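Editorial note, not part of the patch: the headers and implementation above expose the usual Init/Update/Final streaming interface. A minimal user-space sketch of hashing a message in two chunks, assuming sha256.h and sha256c.c from this change are compiled into the program:

#include <stdio.h>
#include <string.h>

#include "sha256.h"

int
main(void)
{
	SHA256_CTX ctx;
	unsigned char digest[SHA256_DIGEST_LENGTH];
	const char *msg = "hello, world";

	SHA256_Init(&ctx);
	/* Update() may be called any number of times with partial input. */
	SHA256_Update(&ctx, msg, 5);
	SHA256_Update(&ctx, msg + 5, strlen(msg) - 5);
	SHA256_Final(digest, &ctx);	/* also zeroes the context */

	for (int i = 0; i < SHA256_DIGEST_LENGTH; i++)
		printf("%02x", digest[i]);
	printf("\n");
	return (0);
}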
diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/sha384.h b/sys/contrib/openzfs/module/os/freebsd/spl/sha384.h
new file mode 100644
index 000000000000..67250cee0313
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/spl/sha384.h
@@ -0,0 +1,96 @@
+/*
+ * Copyright 2005 Colin Percival
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _SHA384_H_
+#define _SHA384_H_
+
+#ifndef _KERNEL
+#include <sys/types.h>
+#endif
+
+#define SHA384_BLOCK_LENGTH 128
+#define SHA384_DIGEST_LENGTH 48
+#define SHA384_DIGEST_STRING_LENGTH (SHA384_DIGEST_LENGTH * 2 + 1)
+
+typedef struct SHA384Context {
+ uint64_t state[8];
+ uint64_t count[2];
+ uint8_t buf[SHA384_BLOCK_LENGTH];
+} SHA384_CTX;
+
+__BEGIN_DECLS
+
+/* Ensure libmd symbols do not clash with libcrypto */
+#ifndef SHA384_Init
+#define SHA384_Init _libmd_SHA384_Init
+#endif
+#ifndef SHA384_Update
+#define SHA384_Update _libmd_SHA384_Update
+#endif
+#ifndef SHA384_Final
+#define SHA384_Final _libmd_SHA384_Final
+#endif
+#ifndef SHA384_End
+#define SHA384_End _libmd_SHA384_End
+#endif
+#ifndef SHA384_Fd
+#define SHA384_Fd _libmd_SHA384_Fd
+#endif
+#ifndef SHA384_FdChunk
+#define SHA384_FdChunk _libmd_SHA384_FdChunk
+#endif
+#ifndef SHA384_File
+#define SHA384_File _libmd_SHA384_File
+#endif
+#ifndef SHA384_FileChunk
+#define SHA384_FileChunk _libmd_SHA384_FileChunk
+#endif
+#ifndef SHA384_Data
+#define SHA384_Data _libmd_SHA384_Data
+#endif
+
+#ifndef SHA384_version
+#define SHA384_version _libmd_SHA384_version
+#endif
+
+void SHA384_Init(SHA384_CTX *);
+void SHA384_Update(SHA384_CTX *, const void *, size_t);
+void SHA384_Final(unsigned char [__min_size(SHA384_DIGEST_LENGTH)],
+ SHA384_CTX *);
+#ifndef _KERNEL
+char *SHA384_End(SHA384_CTX *, char *);
+char *SHA384_Data(const void *, unsigned int, char *);
+char *SHA384_Fd(int, char *);
+char *SHA384_FdChunk(int, char *, off_t, off_t);
+char *SHA384_File(const char *, char *);
+char *SHA384_FileChunk(const char *, char *, off_t, off_t);
+#endif
+
+__END_DECLS
+
+#endif /* !_SHA384_H_ */
diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/sha512.h b/sys/contrib/openzfs/module/os/freebsd/spl/sha512.h
new file mode 100644
index 000000000000..b6fb733ca54e
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/spl/sha512.h
@@ -0,0 +1,101 @@
+/*
+ * Copyright 2005 Colin Percival
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _SHA512_H_
+#define _SHA512_H_
+
+#ifndef _KERNEL
+#include <sys/types.h>
+#endif
+
+#define SHA512_BLOCK_LENGTH 128
+#define SHA512_DIGEST_LENGTH 64
+#define SHA512_DIGEST_STRING_LENGTH (SHA512_DIGEST_LENGTH * 2 + 1)
+
+typedef struct SHA512Context {
+ uint64_t state[8];
+ uint64_t count[2];
+ uint8_t buf[SHA512_BLOCK_LENGTH];
+} SHA512_CTX;
+
+__BEGIN_DECLS
+
+/* Ensure libmd symbols do not clash with libcrypto */
+#if 0
+#ifndef SHA512_Init
+#define SHA512_Init _libmd_SHA512_Init
+#endif
+#ifndef SHA512_Update
+#define SHA512_Update _libmd_SHA512_Update
+#endif
+#ifndef SHA512_Final
+#define SHA512_Final _libmd_SHA512_Final
+#endif
+#endif
+#ifndef SHA512_End
+#define SHA512_End _libmd_SHA512_End
+#endif
+#ifndef SHA512_Fd
+#define SHA512_Fd _libmd_SHA512_Fd
+#endif
+#ifndef SHA512_FdChunk
+#define SHA512_FdChunk _libmd_SHA512_FdChunk
+#endif
+#ifndef SHA512_File
+#define SHA512_File _libmd_SHA512_File
+#endif
+#ifndef SHA512_FileChunk
+#define SHA512_FileChunk _libmd_SHA512_FileChunk
+#endif
+#ifndef SHA512_Data
+#define SHA512_Data _libmd_SHA512_Data
+#endif
+
+#ifndef SHA512_Transform
+#define SHA512_Transform _libmd_SHA512_Transform
+#endif
+#ifndef SHA512_version
+#define SHA512_version _libmd_SHA512_version
+#endif
+
+void SHA512_Init(SHA512_CTX *);
+void SHA512_Update(SHA512_CTX *, const void *, size_t);
+void SHA512_Final(unsigned char [__min_size(SHA512_DIGEST_LENGTH)],
+ SHA512_CTX *);
+#ifndef _KERNEL
+char *SHA512_End(SHA512_CTX *, char *);
+char *SHA512_Data(const void *, unsigned int, char *);
+char *SHA512_Fd(int, char *);
+char *SHA512_FdChunk(int, char *, off_t, off_t);
+char *SHA512_File(const char *, char *);
+char *SHA512_FileChunk(const char *, char *, off_t, off_t);
+#endif
+
+__END_DECLS
+
+#endif /* !_SHA512_H_ */
diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/sha512c.c b/sys/contrib/openzfs/module/os/freebsd/spl/sha512c.c
new file mode 100644
index 000000000000..146f338f0ed4
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/spl/sha512c.c
@@ -0,0 +1,508 @@
+/*
+ * Copyright 2005 Colin Percival
+ * Copyright (c) 2015 Allan Jude <allanjude@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/endian.h>
+#include <sys/types.h>
+
+#ifdef _KERNEL
+#include <sys/systm.h>
+#else
+#include <string.h>
+#endif
+
+#include "sha512.h"
+#include "sha512t.h"
+#include "sha384.h"
+
+#if BYTE_ORDER == BIG_ENDIAN
+
+/* Copy a vector of big-endian uint64_t into a vector of bytes */
+#define be64enc_vect(dst, src, len) \
+ memcpy((void *)dst, (const void *)src, (size_t)len)
+
+/* Copy a vector of bytes into a vector of big-endian uint64_t */
+#define be64dec_vect(dst, src, len) \
+ memcpy((void *)dst, (const void *)src, (size_t)len)
+
+#else /* BYTE_ORDER != BIG_ENDIAN */
+
+/*
+ * Encode a length len/8 vector of (uint64_t) into a length len vector of
+ * (unsigned char) in big-endian form. Assumes len is a multiple of 8.
+ */
+static void
+be64enc_vect(unsigned char *dst, const uint64_t *src, size_t len)
+{
+ size_t i;
+
+ for (i = 0; i < len / 8; i++)
+ be64enc(dst + i * 8, src[i]);
+}
+
+/*
+ * Decode a big-endian length len vector of (unsigned char) into a length
+ * len/8 vector of (uint64_t). Assumes len is a multiple of 8.
+ */
+static void
+be64dec_vect(uint64_t *dst, const unsigned char *src, size_t len)
+{
+ size_t i;
+
+ for (i = 0; i < len / 8; i++)
+ dst[i] = be64dec(src + i * 8);
+}
+
+#endif /* BYTE_ORDER != BIG_ENDIAN */
+
+/* SHA512 round constants. */
+static const uint64_t K[80] = {
+ 0x428a2f98d728ae22ULL, 0x7137449123ef65cdULL,
+ 0xb5c0fbcfec4d3b2fULL, 0xe9b5dba58189dbbcULL,
+ 0x3956c25bf348b538ULL, 0x59f111f1b605d019ULL,
+ 0x923f82a4af194f9bULL, 0xab1c5ed5da6d8118ULL,
+ 0xd807aa98a3030242ULL, 0x12835b0145706fbeULL,
+ 0x243185be4ee4b28cULL, 0x550c7dc3d5ffb4e2ULL,
+ 0x72be5d74f27b896fULL, 0x80deb1fe3b1696b1ULL,
+ 0x9bdc06a725c71235ULL, 0xc19bf174cf692694ULL,
+ 0xe49b69c19ef14ad2ULL, 0xefbe4786384f25e3ULL,
+ 0x0fc19dc68b8cd5b5ULL, 0x240ca1cc77ac9c65ULL,
+ 0x2de92c6f592b0275ULL, 0x4a7484aa6ea6e483ULL,
+ 0x5cb0a9dcbd41fbd4ULL, 0x76f988da831153b5ULL,
+ 0x983e5152ee66dfabULL, 0xa831c66d2db43210ULL,
+ 0xb00327c898fb213fULL, 0xbf597fc7beef0ee4ULL,
+ 0xc6e00bf33da88fc2ULL, 0xd5a79147930aa725ULL,
+ 0x06ca6351e003826fULL, 0x142929670a0e6e70ULL,
+ 0x27b70a8546d22ffcULL, 0x2e1b21385c26c926ULL,
+ 0x4d2c6dfc5ac42aedULL, 0x53380d139d95b3dfULL,
+ 0x650a73548baf63deULL, 0x766a0abb3c77b2a8ULL,
+ 0x81c2c92e47edaee6ULL, 0x92722c851482353bULL,
+ 0xa2bfe8a14cf10364ULL, 0xa81a664bbc423001ULL,
+ 0xc24b8b70d0f89791ULL, 0xc76c51a30654be30ULL,
+ 0xd192e819d6ef5218ULL, 0xd69906245565a910ULL,
+ 0xf40e35855771202aULL, 0x106aa07032bbd1b8ULL,
+ 0x19a4c116b8d2d0c8ULL, 0x1e376c085141ab53ULL,
+ 0x2748774cdf8eeb99ULL, 0x34b0bcb5e19b48a8ULL,
+ 0x391c0cb3c5c95a63ULL, 0x4ed8aa4ae3418acbULL,
+ 0x5b9cca4f7763e373ULL, 0x682e6ff3d6b2b8a3ULL,
+ 0x748f82ee5defb2fcULL, 0x78a5636f43172f60ULL,
+ 0x84c87814a1f0ab72ULL, 0x8cc702081a6439ecULL,
+ 0x90befffa23631e28ULL, 0xa4506cebde82bde9ULL,
+ 0xbef9a3f7b2c67915ULL, 0xc67178f2e372532bULL,
+ 0xca273eceea26619cULL, 0xd186b8c721c0c207ULL,
+ 0xeada7dd6cde0eb1eULL, 0xf57d4f7fee6ed178ULL,
+ 0x06f067aa72176fbaULL, 0x0a637dc5a2c898a6ULL,
+ 0x113f9804bef90daeULL, 0x1b710b35131c471bULL,
+ 0x28db77f523047d84ULL, 0x32caab7b40c72493ULL,
+ 0x3c9ebe0a15c9bebcULL, 0x431d67c49c100d4cULL,
+ 0x4cc5d4becb3e42b6ULL, 0x597f299cfc657e2aULL,
+ 0x5fcb6fab3ad6faecULL, 0x6c44198c4a475817ULL
+};
+
+/* Elementary functions used by SHA512 */
+#define Ch(x, y, z) ((x & (y ^ z)) ^ z)
+#define Maj(x, y, z) ((x & (y | z)) | (y & z))
+#define SHR(x, n) (x >> n)
+#define ROTR(x, n) ((x >> n) | (x << (64 - n)))
+#define S0(x) (ROTR(x, 28) ^ ROTR(x, 34) ^ ROTR(x, 39))
+#define S1(x) (ROTR(x, 14) ^ ROTR(x, 18) ^ ROTR(x, 41))
+#define s0(x) (ROTR(x, 1) ^ ROTR(x, 8) ^ SHR(x, 7))
+#define s1(x) (ROTR(x, 19) ^ ROTR(x, 61) ^ SHR(x, 6))
+
+/* SHA512 round function */
+#define RND(a, b, c, d, e, f, g, h, k) \
+ h += S1(e) + Ch(e, f, g) + k; \
+ d += h; \
+ h += S0(a) + Maj(a, b, c);
+
+/* Adjusted round function for rotating state */
+#define RNDr(S, W, i, ii) \
+ RND(S[(80 - i) % 8], S[(81 - i) % 8], \
+ S[(82 - i) % 8], S[(83 - i) % 8], \
+ S[(84 - i) % 8], S[(85 - i) % 8], \
+ S[(86 - i) % 8], S[(87 - i) % 8], \
+ W[i + ii] + K[i + ii])
+
+/* Message schedule computation */
+#define MSCH(W, ii, i) \
+ W[i + ii + 16] = s1(W[i + ii + 14]) + W[i + ii + 9] + \
+ s0(W[i + ii + 1]) + W[i + ii]
+
+/*
+ * SHA512 block compression function. The 512-bit state is transformed via
+ * the 1024-bit input block to produce a new state.
+ */
+static void
+SHA512_Transform(uint64_t *state,
+ const unsigned char block[SHA512_BLOCK_LENGTH])
+{
+ uint64_t W[80];
+ uint64_t S[8];
+ int i;
+
+ /* 1. Prepare the first part of the message schedule W. */
+ be64dec_vect(W, block, SHA512_BLOCK_LENGTH);
+
+ /* 2. Initialize working variables. */
+ memcpy(S, state, SHA512_DIGEST_LENGTH);
+
+ /* 3. Mix. */
+ for (i = 0; i < 80; i += 16) {
+ RNDr(S, W, 0, i);
+ RNDr(S, W, 1, i);
+ RNDr(S, W, 2, i);
+ RNDr(S, W, 3, i);
+ RNDr(S, W, 4, i);
+ RNDr(S, W, 5, i);
+ RNDr(S, W, 6, i);
+ RNDr(S, W, 7, i);
+ RNDr(S, W, 8, i);
+ RNDr(S, W, 9, i);
+ RNDr(S, W, 10, i);
+ RNDr(S, W, 11, i);
+ RNDr(S, W, 12, i);
+ RNDr(S, W, 13, i);
+ RNDr(S, W, 14, i);
+ RNDr(S, W, 15, i);
+
+ if (i == 64)
+ break;
+ MSCH(W, 0, i);
+ MSCH(W, 1, i);
+ MSCH(W, 2, i);
+ MSCH(W, 3, i);
+ MSCH(W, 4, i);
+ MSCH(W, 5, i);
+ MSCH(W, 6, i);
+ MSCH(W, 7, i);
+ MSCH(W, 8, i);
+ MSCH(W, 9, i);
+ MSCH(W, 10, i);
+ MSCH(W, 11, i);
+ MSCH(W, 12, i);
+ MSCH(W, 13, i);
+ MSCH(W, 14, i);
+ MSCH(W, 15, i);
+ }
+
+ /* 4. Mix local working variables into global state */
+ for (i = 0; i < 8; i++)
+ state[i] += S[i];
+}
+
+static unsigned char PAD[SHA512_BLOCK_LENGTH] = {
+ 0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+};
+
+/* Add padding and terminating bit-count. */
+static void
+SHA512_Pad(SHA512_CTX * ctx)
+{
+ size_t r;
+
+ /* Figure out how many bytes we have buffered. */
+ r = (ctx->count[1] >> 3) & 0x7f;
+
+ /* Pad to 112 mod 128, transforming if we finish a block en route. */
+ if (r < 112) {
+ /* Pad to 112 mod 128. */
+ memcpy(&ctx->buf[r], PAD, 112 - r);
+ } else {
+ /* Finish the current block and mix. */
+ memcpy(&ctx->buf[r], PAD, 128 - r);
+ SHA512_Transform(ctx->state, ctx->buf);
+
+ /* The start of the final block is all zeroes. */
+ memset(&ctx->buf[0], 0, 112);
+ }
+
+ /* Add the terminating bit-count. */
+ be64enc_vect(&ctx->buf[112], ctx->count, 16);
+
+ /* Mix in the final block. */
+ SHA512_Transform(ctx->state, ctx->buf);
+}
+
+/* SHA-512 initialization. Begins a SHA-512 operation. */
+void
+SHA512_Init(SHA512_CTX * ctx)
+{
+
+ /* Zero bits processed so far */
+ ctx->count[0] = ctx->count[1] = 0;
+
+ /* Magic initialization constants */
+ ctx->state[0] = 0x6a09e667f3bcc908ULL;
+ ctx->state[1] = 0xbb67ae8584caa73bULL;
+ ctx->state[2] = 0x3c6ef372fe94f82bULL;
+ ctx->state[3] = 0xa54ff53a5f1d36f1ULL;
+ ctx->state[4] = 0x510e527fade682d1ULL;
+ ctx->state[5] = 0x9b05688c2b3e6c1fULL;
+ ctx->state[6] = 0x1f83d9abfb41bd6bULL;
+ ctx->state[7] = 0x5be0cd19137e2179ULL;
+}
+
+/* Add bytes into the hash */
+void
+SHA512_Update(SHA512_CTX * ctx, const void *in, size_t len)
+{
+ uint64_t bitlen[2];
+ uint64_t r;
+ const unsigned char *src = in;
+
+ /* Number of bytes left in the buffer from previous updates */
+ r = (ctx->count[1] >> 3) & 0x7f;
+
+ /* Convert the length into a number of bits */
+ bitlen[1] = ((uint64_t)len) << 3;
+ bitlen[0] = ((uint64_t)len) >> 61;
+
+ /* Update number of bits */
+ if ((ctx->count[1] += bitlen[1]) < bitlen[1])
+ ctx->count[0]++;
+ ctx->count[0] += bitlen[0];
+
+ /* Handle the case where we don't need to perform any transforms */
+ if (len < SHA512_BLOCK_LENGTH - r) {
+ memcpy(&ctx->buf[r], src, len);
+ return;
+ }
+
+ /* Finish the current block */
+ memcpy(&ctx->buf[r], src, SHA512_BLOCK_LENGTH - r);
+ SHA512_Transform(ctx->state, ctx->buf);
+ src += SHA512_BLOCK_LENGTH - r;
+ len -= SHA512_BLOCK_LENGTH - r;
+
+ /* Perform complete blocks */
+ while (len >= SHA512_BLOCK_LENGTH) {
+ SHA512_Transform(ctx->state, src);
+ src += SHA512_BLOCK_LENGTH;
+ len -= SHA512_BLOCK_LENGTH;
+ }
+
+ /* Copy left over data into buffer */
+ memcpy(ctx->buf, src, len);
+}
+
+/*
+ * SHA-512 finalization. Pads the input data, exports the hash value,
+ * and clears the context state.
+ */
+void
+SHA512_Final(unsigned char digest[static SHA512_DIGEST_LENGTH], SHA512_CTX *ctx)
+{
+
+ /* Add padding */
+ SHA512_Pad(ctx);
+
+ /* Write the hash */
+ be64enc_vect(digest, ctx->state, SHA512_DIGEST_LENGTH);
+
+ /* Clear the context state */
+ explicit_bzero(ctx, sizeof (*ctx));
+}
+
+/* SHA-512t: ******************************************************** */
+/*
+ * the SHA512t transforms are identical to SHA512 so reuse the existing function
+ */
+void
+SHA512_224_Init(SHA512_CTX * ctx)
+{
+
+ /* Zero bits processed so far */
+ ctx->count[0] = ctx->count[1] = 0;
+
+ /* Magic initialization constants */
+ ctx->state[0] = 0x8c3d37c819544da2ULL;
+ ctx->state[1] = 0x73e1996689dcd4d6ULL;
+ ctx->state[2] = 0x1dfab7ae32ff9c82ULL;
+ ctx->state[3] = 0x679dd514582f9fcfULL;
+ ctx->state[4] = 0x0f6d2b697bd44da8ULL;
+ ctx->state[5] = 0x77e36f7304c48942ULL;
+ ctx->state[6] = 0x3f9d85a86a1d36c8ULL;
+ ctx->state[7] = 0x1112e6ad91d692a1ULL;
+}
+
+void
+SHA512_224_Update(SHA512_CTX * ctx, const void *in, size_t len)
+{
+
+ SHA512_Update(ctx, in, len);
+}
+
+void
+SHA512_224_Final(unsigned char digest[static SHA512_224_DIGEST_LENGTH],
+ SHA512_CTX *ctx)
+{
+
+ /* Add padding */
+ SHA512_Pad(ctx);
+
+ /* Write the hash */
+ be64enc_vect(digest, ctx->state, SHA512_224_DIGEST_LENGTH);
+
+ /* Clear the context state */
+ explicit_bzero(ctx, sizeof (*ctx));
+}
+
+void
+SHA512_256_Init(SHA512_CTX * ctx)
+{
+
+ /* Zero bits processed so far */
+ ctx->count[0] = ctx->count[1] = 0;
+
+ /* Magic initialization constants */
+ ctx->state[0] = 0x22312194fc2bf72cULL;
+ ctx->state[1] = 0x9f555fa3c84c64c2ULL;
+ ctx->state[2] = 0x2393b86b6f53b151ULL;
+ ctx->state[3] = 0x963877195940eabdULL;
+ ctx->state[4] = 0x96283ee2a88effe3ULL;
+ ctx->state[5] = 0xbe5e1e2553863992ULL;
+ ctx->state[6] = 0x2b0199fc2c85b8aaULL;
+ ctx->state[7] = 0x0eb72ddc81c52ca2ULL;
+}
+
+void
+SHA512_256_Update(SHA512_CTX * ctx, const void *in, size_t len)
+{
+
+ SHA512_Update(ctx, in, len);
+}
+
+void
+SHA512_256_Final(unsigned char digest[static SHA512_256_DIGEST_LENGTH],
+ SHA512_CTX * ctx)
+{
+
+ /* Add padding */
+ SHA512_Pad(ctx);
+
+ /* Write the hash */
+ be64enc_vect(digest, ctx->state, SHA512_256_DIGEST_LENGTH);
+
+ /* Clear the context state */
+ explicit_bzero(ctx, sizeof (*ctx));
+}
+
+/* SHA-384: ********************************************************** */
+/*
+ * the SHA384 and SHA512 transforms are identical, so SHA384 is skipped
+ */
+
+/* SHA-384 initialization. Begins a SHA-384 operation. */
+void
+SHA384_Init(SHA384_CTX * ctx)
+{
+
+ /* Zero bits processed so far */
+ ctx->count[0] = ctx->count[1] = 0;
+
+ /* Magic initialization constants */
+ ctx->state[0] = 0xcbbb9d5dc1059ed8ULL;
+ ctx->state[1] = 0x629a292a367cd507ULL;
+ ctx->state[2] = 0x9159015a3070dd17ULL;
+ ctx->state[3] = 0x152fecd8f70e5939ULL;
+ ctx->state[4] = 0x67332667ffc00b31ULL;
+ ctx->state[5] = 0x8eb44a8768581511ULL;
+ ctx->state[6] = 0xdb0c2e0d64f98fa7ULL;
+ ctx->state[7] = 0x47b5481dbefa4fa4ULL;
+}
+
+/* Add bytes into the SHA-384 hash */
+void
+SHA384_Update(SHA384_CTX * ctx, const void *in, size_t len)
+{
+
+ SHA512_Update((SHA512_CTX *)ctx, in, len);
+}
+
+/*
+ * SHA-384 finalization. Pads the input data, exports the hash value,
+ * and clears the context state.
+ */
+void
+SHA384_Final(unsigned char digest[static SHA384_DIGEST_LENGTH], SHA384_CTX *ctx)
+{
+
+ /* Add padding */
+ SHA512_Pad((SHA512_CTX *)ctx);
+
+ /* Write the hash */
+ be64enc_vect(digest, ctx->state, SHA384_DIGEST_LENGTH);
+
+ /* Clear the context state */
+ explicit_bzero(ctx, sizeof (*ctx));
+}
+
+#if 0
+/*
+ * When building libmd, provide weak references. Note: this is not
+ * activated in the context of compiling these sources for internal
+ * use in libcrypt.
+ */
+#undef SHA512_Init
+__weak_reference(_libmd_SHA512_Init, SHA512_Init);
+#undef SHA512_Update
+__weak_reference(_libmd_SHA512_Update, SHA512_Update);
+#undef SHA512_Final
+__weak_reference(_libmd_SHA512_Final, SHA512_Final);
+#undef SHA512_Transform
+__weak_reference(_libmd_SHA512_Transform, SHA512_Transform);
+
+#undef SHA512_224_Init
+__weak_reference(_libmd_SHA512_224_Init, SHA512_224_Init);
+#undef SHA512_224_Update
+__weak_reference(_libmd_SHA512_224_Update, SHA512_224_Update);
+#undef SHA512_224_Final
+__weak_reference(_libmd_SHA512_224_Final, SHA512_224_Final);
+
+#undef SHA512_256_Init
+__weak_reference(_libmd_SHA512_256_Init, SHA512_256_Init);
+#undef SHA512_256_Update
+__weak_reference(_libmd_SHA512_256_Update, SHA512_256_Update);
+#undef SHA512_256_Final
+__weak_reference(_libmd_SHA512_256_Final, SHA512_256_Final);
+
+#undef SHA384_Init
+__weak_reference(_libmd_SHA384_Init, SHA384_Init);
+#undef SHA384_Update
+__weak_reference(_libmd_SHA384_Update, SHA384_Update);
+#undef SHA384_Final
+__weak_reference(_libmd_SHA384_Final, SHA384_Final);
+#endif
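Editorial note, not part of the patch: SHA-512/224, SHA-512/256 and SHA-384 above reuse the SHA-512 compression function; they differ only in their initialization constants and in how many bytes of the final state are written out. A user-space sketch of computing a SHA-512/256 digest, assuming sha512t.h and sha512c.c from this change are available to the build:

#include <stdio.h>
#include <string.h>

#include "sha512t.h"

int
main(void)
{
	SHA512_CTX ctx;		/* the truncated variants share SHA512_CTX */
	unsigned char digest[SHA512_256_DIGEST_LENGTH];
	const char *msg = "hello, world";

	SHA512_256_Init(&ctx);
	SHA512_256_Update(&ctx, msg, strlen(msg));
	SHA512_256_Final(digest, &ctx);	/* emits only the first 32 bytes */

	for (int i = 0; i < SHA512_256_DIGEST_LENGTH; i++)
		printf("%02x", digest[i]);
	printf("\n");
	return (0);
}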
diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/sha512t.h b/sys/contrib/openzfs/module/os/freebsd/spl/sha512t.h
new file mode 100644
index 000000000000..703867fc0288
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/spl/sha512t.h
@@ -0,0 +1,143 @@
+/*
+ * Copyright (c) 2015 Allan Jude <allanjude@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _SHA512T_H_
+#define _SHA512T_H_
+
+#include "sha512.h"
+
+#ifndef _KERNEL
+#include <sys/types.h>
+#endif
+
+#define SHA512_224_DIGEST_LENGTH 28
+#define SHA512_224_DIGEST_STRING_LENGTH (SHA512_224_DIGEST_LENGTH * 2 + 1)
+#define SHA512_256_DIGEST_LENGTH 32
+#define SHA512_256_DIGEST_STRING_LENGTH (SHA512_256_DIGEST_LENGTH * 2 + 1)
+
+__BEGIN_DECLS
+
+/* Ensure libmd symbols do not clash with libcrypto */
+#ifndef SHA512_224_Init
+#define SHA512_224_Init _libmd_SHA512_224_Init
+#endif
+#ifndef SHA512_224_Update
+#define SHA512_224_Update _libmd_SHA512_224_Update
+#endif
+#ifndef SHA512_224_Final
+#define SHA512_224_Final _libmd_SHA512_224_Final
+#endif
+#ifndef SHA512_224_End
+#define SHA512_224_End _libmd_SHA512_224_End
+#endif
+#ifndef SHA512_224_Fd
+#define SHA512_224_Fd _libmd_SHA512_224_Fd
+#endif
+#ifndef SHA512_224_FdChunk
+#define SHA512_224_FdChunk _libmd_SHA512_224_FdChunk
+#endif
+#ifndef SHA512_224_File
+#define SHA512_224_File _libmd_SHA512_224_File
+#endif
+#ifndef SHA512_224_FileChunk
+#define SHA512_224_FileChunk _libmd_SHA512_224_FileChunk
+#endif
+#ifndef SHA512_224_Data
+#define SHA512_224_Data _libmd_SHA512_224_Data
+#endif
+
+#ifndef SHA512_224_Transform
+#define SHA512_224_Transform _libmd_SHA512_224_Transform
+#endif
+#ifndef SHA512_224_version
+#define SHA512_224_version _libmd_SHA512_224_version
+#endif
+
+#ifndef SHA512_256_Init
+#define SHA512_256_Init _libmd_SHA512_256_Init
+#endif
+#ifndef SHA512_256_Update
+#define SHA512_256_Update _libmd_SHA512_256_Update
+#endif
+#ifndef SHA512_256_Final
+#define SHA512_256_Final _libmd_SHA512_256_Final
+#endif
+#ifndef SHA512_256_End
+#define SHA512_256_End _libmd_SHA512_256_End
+#endif
+#ifndef SHA512_256_Fd
+#define SHA512_256_Fd _libmd_SHA512_256_Fd
+#endif
+#ifndef SHA512_256_FdChunk
+#define SHA512_256_FdChunk _libmd_SHA512_256_FdChunk
+#endif
+#ifndef SHA512_256_File
+#define SHA512_256_File _libmd_SHA512_256_File
+#endif
+#ifndef SHA512_256_FileChunk
+#define SHA512_256_FileChunk _libmd_SHA512_256_FileChunk
+#endif
+#ifndef SHA512_256_Data
+#define SHA512_256_Data _libmd_SHA512_256_Data
+#endif
+
+#ifndef SHA512_256_Transform
+#define SHA512_256_Transform _libmd_SHA512_256_Transform
+#endif
+#ifndef SHA512_256_version
+#define SHA512_256_version _libmd_SHA512_256_version
+#endif
+
+void SHA512_224_Init(SHA512_CTX *);
+void SHA512_224_Update(SHA512_CTX *, const void *, size_t);
+void SHA512_224_Final(unsigned char [__min_size(SHA512_224_DIGEST_LENGTH)],
+ SHA512_CTX *);
+#ifndef _KERNEL
+char *SHA512_224_End(SHA512_CTX *, char *);
+char *SHA512_224_Data(const void *, unsigned int, char *);
+char *SHA512_224_Fd(int, char *);
+char *SHA512_224_FdChunk(int, char *, off_t, off_t);
+char *SHA512_224_File(const char *, char *);
+char *SHA512_224_FileChunk(const char *, char *, off_t, off_t);
+#endif
+void SHA512_256_Init(SHA512_CTX *);
+void SHA512_256_Update(SHA512_CTX *, const void *, size_t);
+void SHA512_256_Final(unsigned char [__min_size(SHA512_256_DIGEST_LENGTH)],
+ SHA512_CTX *);
+#ifndef _KERNEL
+char *SHA512_256_End(SHA512_CTX *, char *);
+char *SHA512_256_Data(const void *, unsigned int, char *);
+char *SHA512_256_Fd(int, char *);
+char *SHA512_256_FdChunk(int, char *, off_t, off_t);
+char *SHA512_256_File(const char *, char *);
+char *SHA512_256_FileChunk(const char *, char *, off_t, off_t);
+#endif
+
+__END_DECLS
+
+#endif /* !_SHA512T_H_ */
diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/spl_acl.c b/sys/contrib/openzfs/module/os/freebsd/spl/spl_acl.c
new file mode 100644
index 000000000000..74c26d03f87f
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/spl/spl_acl.c
@@ -0,0 +1,222 @@
+/*
+ * Copyright (c) 2008, 2009 Edward Tomasz Napierała <trasz@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/types.h>
+#include <sys/malloc.h>
+#include <sys/errno.h>
+#include <sys/zfs_acl.h>
+#include <sys/acl.h>
+
+struct zfs2bsd {
+ uint32_t zb_zfs;
+ int zb_bsd;
+};
+
+struct zfs2bsd perms[] = {{ACE_READ_DATA, ACL_READ_DATA},
+ {ACE_WRITE_DATA, ACL_WRITE_DATA},
+ {ACE_EXECUTE, ACL_EXECUTE},
+ {ACE_APPEND_DATA, ACL_APPEND_DATA},
+ {ACE_DELETE_CHILD, ACL_DELETE_CHILD},
+ {ACE_DELETE, ACL_DELETE},
+ {ACE_READ_ATTRIBUTES, ACL_READ_ATTRIBUTES},
+ {ACE_WRITE_ATTRIBUTES, ACL_WRITE_ATTRIBUTES},
+ {ACE_READ_NAMED_ATTRS, ACL_READ_NAMED_ATTRS},
+ {ACE_WRITE_NAMED_ATTRS, ACL_WRITE_NAMED_ATTRS},
+ {ACE_READ_ACL, ACL_READ_ACL},
+ {ACE_WRITE_ACL, ACL_WRITE_ACL},
+ {ACE_WRITE_OWNER, ACL_WRITE_OWNER},
+ {ACE_SYNCHRONIZE, ACL_SYNCHRONIZE},
+ {0, 0}};
+
+struct zfs2bsd flags[] = {{ACE_FILE_INHERIT_ACE,
+ ACL_ENTRY_FILE_INHERIT},
+ {ACE_DIRECTORY_INHERIT_ACE,
+ ACL_ENTRY_DIRECTORY_INHERIT},
+ {ACE_NO_PROPAGATE_INHERIT_ACE,
+ ACL_ENTRY_NO_PROPAGATE_INHERIT},
+ {ACE_INHERIT_ONLY_ACE,
+ ACL_ENTRY_INHERIT_ONLY},
+ {ACE_INHERITED_ACE,
+ ACL_ENTRY_INHERITED},
+ {ACE_SUCCESSFUL_ACCESS_ACE_FLAG,
+ ACL_ENTRY_SUCCESSFUL_ACCESS},
+ {ACE_FAILED_ACCESS_ACE_FLAG,
+ ACL_ENTRY_FAILED_ACCESS},
+ {0, 0}};
+
+static int
+_bsd_from_zfs(uint32_t zfs, const struct zfs2bsd *table)
+{
+ const struct zfs2bsd *tmp;
+ int bsd = 0;
+
+ for (tmp = table; tmp->zb_zfs != 0; tmp++) {
+ if (zfs & tmp->zb_zfs)
+ bsd |= tmp->zb_bsd;
+ }
+
+ return (bsd);
+}
+
+static uint32_t
+_zfs_from_bsd(int bsd, const struct zfs2bsd *table)
+{
+ const struct zfs2bsd *tmp;
+ uint32_t zfs = 0;
+
+ for (tmp = table; tmp->zb_bsd != 0; tmp++) {
+ if (bsd & tmp->zb_bsd)
+ zfs |= tmp->zb_zfs;
+ }
+
+ return (zfs);
+}
+
+int
+acl_from_aces(struct acl *aclp, const ace_t *aces, int nentries)
+{
+ int i;
+ struct acl_entry *entry;
+ const ace_t *ace;
+
+ if (nentries < 1) {
+ printf("acl_from_aces: empty ZFS ACL; returning EINVAL.\n");
+ return (EINVAL);
+ }
+
+ if (nentries > ACL_MAX_ENTRIES) {
+ /*
+ * I believe it may happen only when moving a pool
+ * from SunOS to FreeBSD.
+ */
+ printf("acl_from_aces: ZFS ACL too big to fit "
+ "into 'struct acl'; returning EINVAL.\n");
+ return (EINVAL);
+ }
+
+ bzero(aclp, sizeof (*aclp));
+ aclp->acl_maxcnt = ACL_MAX_ENTRIES;
+ aclp->acl_cnt = nentries;
+
+ for (i = 0; i < nentries; i++) {
+ entry = &(aclp->acl_entry[i]);
+ ace = &(aces[i]);
+
+ if (ace->a_flags & ACE_OWNER)
+ entry->ae_tag = ACL_USER_OBJ;
+ else if (ace->a_flags & ACE_GROUP)
+ entry->ae_tag = ACL_GROUP_OBJ;
+ else if (ace->a_flags & ACE_EVERYONE)
+ entry->ae_tag = ACL_EVERYONE;
+ else if (ace->a_flags & ACE_IDENTIFIER_GROUP)
+ entry->ae_tag = ACL_GROUP;
+ else
+ entry->ae_tag = ACL_USER;
+
+ if (entry->ae_tag == ACL_USER || entry->ae_tag == ACL_GROUP)
+ entry->ae_id = ace->a_who;
+ else
+ entry->ae_id = ACL_UNDEFINED_ID;
+
+ entry->ae_perm = _bsd_from_zfs(ace->a_access_mask, perms);
+ entry->ae_flags = _bsd_from_zfs(ace->a_flags, flags);
+
+ switch (ace->a_type) {
+ case ACE_ACCESS_ALLOWED_ACE_TYPE:
+ entry->ae_entry_type = ACL_ENTRY_TYPE_ALLOW;
+ break;
+ case ACE_ACCESS_DENIED_ACE_TYPE:
+ entry->ae_entry_type = ACL_ENTRY_TYPE_DENY;
+ break;
+ case ACE_SYSTEM_AUDIT_ACE_TYPE:
+ entry->ae_entry_type = ACL_ENTRY_TYPE_AUDIT;
+ break;
+ case ACE_SYSTEM_ALARM_ACE_TYPE:
+ entry->ae_entry_type = ACL_ENTRY_TYPE_ALARM;
+ break;
+ default:
+ panic("acl_from_aces: a_type is 0x%x", ace->a_type);
+ }
+ }
+
+ return (0);
+}
+
+void
+aces_from_acl(ace_t *aces, int *nentries, const struct acl *aclp)
+{
+ int i;
+ const struct acl_entry *entry;
+ ace_t *ace;
+
+ bzero(aces, sizeof (*aces) * aclp->acl_cnt);
+
+ *nentries = aclp->acl_cnt;
+
+ for (i = 0; i < aclp->acl_cnt; i++) {
+ entry = &(aclp->acl_entry[i]);
+ ace = &(aces[i]);
+
+ ace->a_who = entry->ae_id;
+
+ if (entry->ae_tag == ACL_USER_OBJ)
+ ace->a_flags = ACE_OWNER;
+ else if (entry->ae_tag == ACL_GROUP_OBJ)
+ ace->a_flags = (ACE_GROUP | ACE_IDENTIFIER_GROUP);
+ else if (entry->ae_tag == ACL_GROUP)
+ ace->a_flags = ACE_IDENTIFIER_GROUP;
+ else if (entry->ae_tag == ACL_EVERYONE)
+ ace->a_flags = ACE_EVERYONE;
+ else /* ACL_USER */
+ ace->a_flags = 0;
+
+ ace->a_access_mask = _zfs_from_bsd(entry->ae_perm, perms);
+ ace->a_flags |= _zfs_from_bsd(entry->ae_flags, flags);
+
+ switch (entry->ae_entry_type) {
+ case ACL_ENTRY_TYPE_ALLOW:
+ ace->a_type = ACE_ACCESS_ALLOWED_ACE_TYPE;
+ break;
+ case ACL_ENTRY_TYPE_DENY:
+ ace->a_type = ACE_ACCESS_DENIED_ACE_TYPE;
+ break;
+ case ACL_ENTRY_TYPE_ALARM:
+ ace->a_type = ACE_SYSTEM_ALARM_ACE_TYPE;
+ break;
+ case ACL_ENTRY_TYPE_AUDIT:
+ ace->a_type = ACE_SYSTEM_AUDIT_ACE_TYPE;
+ break;
+ default:
+ panic("aces_from_acl: ae_entry_type is 0x%x",
+ entry->ae_entry_type);
+ }
+ }
+}
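Editorial note, not part of the patch: _bsd_from_zfs() and _zfs_from_bsd() above are table-driven bitmask translators over the zero-terminated perms[] and flags[] tables. A self-contained sketch of the same pattern; the SRC_*/DST_* flag values here are made up purely for illustration:

#include <stdio.h>
#include <stdint.h>

struct map { uint32_t from; int to; };

/* Hypothetical flag values, for illustration only. */
#define SRC_READ	0x01
#define SRC_WRITE	0x02
#define DST_READ	0x10
#define DST_WRITE	0x20

static const struct map table[] = {
	{ SRC_READ,	DST_READ },
	{ SRC_WRITE,	DST_WRITE },
	{ 0,		0 }		/* terminator, as in perms[]/flags[] */
};

static int
translate(uint32_t src)
{
	int dst = 0;

	/* OR in the destination bit for every source bit that is set. */
	for (const struct map *m = table; m->from != 0; m++)
		if (src & m->from)
			dst |= m->to;
	return (dst);
}

int
main(void)
{
	printf("0x%x\n", translate(SRC_READ | SRC_WRITE));	/* 0x30 */
	return (0);
}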
diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/spl_atomic.c b/sys/contrib/openzfs/module/os/freebsd/spl/spl_atomic.c
new file mode 100644
index 000000000000..80040fc6a3e3
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/spl/spl_atomic.c
@@ -0,0 +1,123 @@
+/*
+ * Copyright (c) 2007 Pawel Jakub Dawidek <pjd@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/atomic.h>
+
+#if !defined(__LP64__) && !defined(__mips_n32) && \
+ !defined(ARM_HAVE_ATOMIC64) && !defined(I386_HAVE_ATOMIC64) && \
+ !defined(HAS_EMULATED_ATOMIC64)
+
+#ifdef _KERNEL
+#include <sys/kernel.h>
+
+struct mtx atomic_mtx;
+MTX_SYSINIT(atomic, &atomic_mtx, "atomic", MTX_DEF);
+#else
+#include <pthread.h>
+
+#define mtx_lock(lock) pthread_mutex_lock(lock)
+#define mtx_unlock(lock) pthread_mutex_unlock(lock)
+
+static pthread_mutex_t atomic_mtx;
+
+static __attribute__((constructor)) void
+atomic_init(void)
+{
+ pthread_mutex_init(&atomic_mtx, NULL);
+}
+#endif
+
+void
+atomic_add_64(volatile uint64_t *target, int64_t delta)
+{
+
+ mtx_lock(&atomic_mtx);
+ *target += delta;
+ mtx_unlock(&atomic_mtx);
+}
+
+void
+atomic_dec_64(volatile uint64_t *target)
+{
+
+ mtx_lock(&atomic_mtx);
+ *target -= 1;
+ mtx_unlock(&atomic_mtx);
+}
+
+uint64_t
+atomic_swap_64(volatile uint64_t *a, uint64_t value)
+{
+ uint64_t ret;
+
+ mtx_lock(&atomic_mtx);
+ ret = *a;
+ *a = value;
+ mtx_unlock(&atomic_mtx);
+ return (ret);
+}
+
+uint64_t
+atomic_load_64(volatile uint64_t *a)
+{
+ uint64_t ret;
+
+ mtx_lock(&atomic_mtx);
+ ret = *a;
+ mtx_unlock(&atomic_mtx);
+ return (ret);
+}
+
+uint64_t
+atomic_add_64_nv(volatile uint64_t *target, int64_t delta)
+{
+ uint64_t newval;
+
+ mtx_lock(&atomic_mtx);
+ newval = (*target += delta);
+ mtx_unlock(&atomic_mtx);
+ return (newval);
+}
+
+uint64_t
+atomic_cas_64(volatile uint64_t *target, uint64_t cmp, uint64_t newval)
+{
+ uint64_t oldval;
+
+ mtx_lock(&atomic_mtx);
+ oldval = *target;
+ if (oldval == cmp)
+ *target = newval;
+ mtx_unlock(&atomic_mtx);
+ return (oldval);
+}
+#endif
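Editorial note, not part of the patch: on platforms without native 64-bit atomics the operations above are emulated with a single global mutex. The usual caller-side pattern for atomic_cas_64() is a compare-and-swap retry loop. A stand-alone sketch that mirrors the mutex-based fallback with pthreads and then uses it in such a loop; the local atomic_cas_64() here is a stand-in, not the SPL symbol:

#include <stdio.h>
#include <stdint.h>
#include <pthread.h>

static pthread_mutex_t atomic_mtx = PTHREAD_MUTEX_INITIALIZER;

/* Minimal stand-in for the mutex-emulated atomic_cas_64() above. */
static uint64_t
atomic_cas_64(volatile uint64_t *target, uint64_t cmp, uint64_t newval)
{
	uint64_t oldval;

	pthread_mutex_lock(&atomic_mtx);
	oldval = *target;
	if (oldval == cmp)
		*target = newval;
	pthread_mutex_unlock(&atomic_mtx);
	return (oldval);
}

int
main(void)
{
	volatile uint64_t counter = 5;
	uint64_t oldv, newv;

	/* Classic CAS retry loop: recompute and retry until unchanged. */
	do {
		oldv = counter;
		newv = oldv + 3;
	} while (atomic_cas_64(&counter, oldv, newv) != oldv);

	printf("%llu\n", (unsigned long long)counter);	/* prints 8 */
	return (0);
}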
diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/spl_cmn_err.c b/sys/contrib/openzfs/module/os/freebsd/spl/spl_cmn_err.c
new file mode 100644
index 000000000000..22c7338b7399
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/spl/spl_cmn_err.c
@@ -0,0 +1,77 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ *
+ * $FreeBSD$
+ */
+/*
+ * Copyright 2007 John Birrell <jb@FreeBSD.org>. All rights reserved.
+ * Copyright 2012 Martin Matuska <mm@FreeBSD.org>. All rights reserved.
+ */
+
+#include <sys/cdefs.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/cmn_err.h>
+
+void
+vcmn_err(int ce, const char *fmt, va_list adx)
+{
+ char buf[256];
+ const char *prefix;
+
+ prefix = NULL; /* silence unwitty compilers */
+ switch (ce) {
+ case CE_CONT:
+ prefix = "Solaris(cont): ";
+ break;
+ case CE_NOTE:
+ prefix = "Solaris: NOTICE: ";
+ break;
+ case CE_WARN:
+ prefix = "Solaris: WARNING: ";
+ break;
+ case CE_PANIC:
+ prefix = "Solaris(panic): ";
+ break;
+ case CE_IGNORE:
+ break;
+ default:
+ panic("Solaris: unknown severity level");
+ }
+ if (ce == CE_PANIC) {
+ vsnprintf(buf, sizeof (buf), fmt, adx);
+ panic("%s%s", prefix, buf);
+ }
+ if (ce != CE_IGNORE) {
+ printf("%s", prefix);
+ vprintf(fmt, adx);
+ printf("\n");
+ }
+}
+
+void
+cmn_err(int type, const char *fmt, ...)
+{
+ va_list ap;
+
+ va_start(ap, fmt);
+ vcmn_err(type, fmt, ap);
+ va_end(ap);
+}
diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/spl_dtrace.c b/sys/contrib/openzfs/module/os/freebsd/spl/spl_dtrace.c
new file mode 100644
index 000000000000..6b2872bcc066
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/spl/spl_dtrace.c
@@ -0,0 +1,38 @@
+/*
+ * Copyright 2014 The FreeBSD Project.
+ * All rights reserved.
+ *
+ * This software was developed by Steven Hartland.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/queue.h>
+#include <sys/sdt.h>
+
+/* CSTYLED */
+SDT_PROBE_DEFINE1(sdt, , , set__error, "int");
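+
+/*
+ * Illustrative use (command hypothetical): the probe above is visible to
+ * dtrace(1) as sdt:::set-error and can be traced with, e.g.,
+ *
+ *	dtrace -n 'sdt:::set-error { printf("%d", arg0); }'
+ */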
diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/spl_kmem.c b/sys/contrib/openzfs/module/os/freebsd/spl/spl_kmem.c
new file mode 100644
index 000000000000..cfc61dd7fc2a
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/spl/spl_kmem.c
@@ -0,0 +1,352 @@
+/*
+ * Copyright (c) 2006-2007 Pawel Jakub Dawidek <pjd@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/byteorder.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/kmem.h>
+#include <sys/kmem_cache.h>
+#include <sys/debug.h>
+#include <sys/mutex.h>
+#include <sys/vmmeter.h>
+
+
+#include <vm/vm_page.h>
+#include <vm/vm_object.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_map.h>
+
+#ifdef KMEM_DEBUG
+#include <sys/queue.h>
+#include <sys/stack.h>
+#endif
+
+#ifdef _KERNEL
+MALLOC_DEFINE(M_SOLARIS, "solaris", "Solaris");
+#else
+#define malloc(size, type, flags) malloc(size)
+#define free(addr, type) free(addr)
+#endif
+
+#ifdef KMEM_DEBUG
+struct kmem_item {
+ struct stack stack;
+ LIST_ENTRY(kmem_item) next;
+};
+static LIST_HEAD(, kmem_item) kmem_items;
+static struct mtx kmem_items_mtx;
+MTX_SYSINIT(kmem_items_mtx, &kmem_items_mtx, "kmem_items", MTX_DEF);
+#endif /* KMEM_DEBUG */
+
+#include <sys/vmem.h>
+
+void *
+zfs_kmem_alloc(size_t size, int kmflags)
+{
+ void *p;
+#ifdef KMEM_DEBUG
+ struct kmem_item *i;
+
+ size += sizeof (struct kmem_item);
+#endif
+ p = malloc(MAX(size, 16), M_SOLARIS, kmflags);
+#ifndef _KERNEL
+ if (kmflags & KM_SLEEP)
+ assert(p != NULL);
+#endif
+#ifdef KMEM_DEBUG
+ if (p != NULL) {
+ i = p;
+ p = (uint8_t *)p + sizeof (struct kmem_item);
+ stack_save(&i->stack);
+ mtx_lock(&kmem_items_mtx);
+ LIST_INSERT_HEAD(&kmem_items, i, next);
+ mtx_unlock(&kmem_items_mtx);
+ }
+#endif
+ return (p);
+}
+
+void
+zfs_kmem_free(void *buf, size_t size __unused)
+{
+#ifdef KMEM_DEBUG
+ if (buf == NULL) {
+ printf("%s: attempt to free NULL\n", __func__);
+ return;
+ }
+ struct kmem_item *i;
+
+ buf = (uint8_t *)buf - sizeof (struct kmem_item);
+ mtx_lock(&kmem_items_mtx);
+ LIST_FOREACH(i, &kmem_items, next) {
+ if (i == buf)
+ break;
+ }
+ ASSERT(i != NULL);
+ LIST_REMOVE(i, next);
+ mtx_unlock(&kmem_items_mtx);
+ memset(buf, 0xDC, MAX(size, 16));
+#endif
+ free(buf, M_SOLARIS);
+}
+
+static uint64_t kmem_size_val;
+
+static void
+kmem_size_init(void *unused __unused)
+{
+
+ kmem_size_val = (uint64_t)vm_cnt.v_page_count * PAGE_SIZE;
+ if (kmem_size_val > vm_kmem_size)
+ kmem_size_val = vm_kmem_size;
+}
+SYSINIT(kmem_size_init, SI_SUB_KMEM, SI_ORDER_ANY, kmem_size_init, NULL);
+
+uint64_t
+kmem_size(void)
+{
+
+ return (kmem_size_val);
+}
+
+static int
+kmem_std_constructor(void *mem, int size __unused, void *private, int flags)
+{
+ struct kmem_cache *cache = private;
+
+ return (cache->kc_constructor(mem, cache->kc_private, flags));
+}
+
+static void
+kmem_std_destructor(void *mem, int size __unused, void *private)
+{
+ struct kmem_cache *cache = private;
+
+ cache->kc_destructor(mem, cache->kc_private);
+}
+
+kmem_cache_t *
+kmem_cache_create(char *name, size_t bufsize, size_t align,
+ int (*constructor)(void *, void *, int), void (*destructor)(void *, void *),
+ void (*reclaim)(void *) __unused, void *private, vmem_t *vmp, int cflags)
+{
+ kmem_cache_t *cache;
+
+ ASSERT(vmp == NULL);
+
+ cache = kmem_alloc(sizeof (*cache), KM_SLEEP);
+ strlcpy(cache->kc_name, name, sizeof (cache->kc_name));
+ cache->kc_constructor = constructor;
+ cache->kc_destructor = destructor;
+ cache->kc_private = private;
+#if defined(_KERNEL) && !defined(KMEM_DEBUG)
+ cache->kc_zone = uma_zcreate(cache->kc_name, bufsize,
+ constructor != NULL ? kmem_std_constructor : NULL,
+ destructor != NULL ? kmem_std_destructor : NULL,
+ NULL, NULL, align > 0 ? align - 1 : 0, cflags);
+#else
+ cache->kc_size = bufsize;
+#endif
+
+ return (cache);
+}
+
+void
+kmem_cache_destroy(kmem_cache_t *cache)
+{
+#if defined(_KERNEL) && !defined(KMEM_DEBUG)
+ uma_zdestroy(cache->kc_zone);
+#endif
+ kmem_free(cache, sizeof (*cache));
+}
+
+void *
+kmem_cache_alloc(kmem_cache_t *cache, int flags)
+{
+#if defined(_KERNEL) && !defined(KMEM_DEBUG)
+ return (uma_zalloc_arg(cache->kc_zone, cache, flags));
+#else
+ void *p;
+
+ p = kmem_alloc(cache->kc_size, flags);
+ if (p != NULL && cache->kc_constructor != NULL)
+ kmem_std_constructor(p, cache->kc_size, cache, flags);
+ return (p);
+#endif
+}
+
+void
+kmem_cache_free(kmem_cache_t *cache, void *buf)
+{
+#if defined(_KERNEL) && !defined(KMEM_DEBUG)
+ uma_zfree_arg(cache->kc_zone, buf, cache);
+#else
+ if (cache->kc_destructor != NULL)
+ kmem_std_destructor(buf, cache->kc_size, cache);
+ kmem_free(buf, cache->kc_size);
+#endif
+}
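+
+/*
+ * Illustrative lifecycle sketch (names hypothetical): callers pair
+ * kmem_cache_create()/kmem_cache_destroy() and allocate objects from the
+ * cache in between:
+ *
+ *	cache = kmem_cache_create("foo_cache", sizeof (foo_t), 0,
+ *	    foo_ctor, foo_dtor, NULL, NULL, NULL, 0);
+ *	foo = kmem_cache_alloc(cache, KM_SLEEP);
+ *	...
+ *	kmem_cache_free(cache, foo);
+ *	kmem_cache_destroy(cache);
+ */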
+
+/*
+ * Allow our caller to determine if there are running reaps.
+ *
+ * This call is very conservative and may return B_TRUE even when
+ * reaping activity isn't active. If it returns B_FALSE, then reaping
+ * activity is definitely inactive.
+ */
+boolean_t
+kmem_cache_reap_active(void)
+{
+
+ return (B_FALSE);
+}
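+
+/*
+ * Illustrative sketch (hypothetical caller): combine the two calls below to
+ * avoid piling up reap tasks:
+ *
+ *	if (!kmem_cache_reap_active())
+ *		kmem_cache_reap_soon(cache);
+ */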
+
+/*
+ * Reap (almost) everything soon.
+ *
+ * Note: this does not wait for the reap-tasks to complete. Caller
+ * should use kmem_cache_reap_active() (above) and/or moderation to
+ * avoid scheduling too many reap-tasks.
+ */
+#ifdef _KERNEL
+void
+kmem_cache_reap_soon(kmem_cache_t *cache)
+{
+#ifndef KMEM_DEBUG
+#if __FreeBSD_version >= 1300043
+ uma_zone_reclaim(cache->kc_zone, UMA_RECLAIM_DRAIN);
+#else
+ zone_drain(cache->kc_zone);
+#endif
+#endif
+}
+
+void
+kmem_reap(void)
+{
+#if __FreeBSD_version >= 1300043
+ uma_reclaim(UMA_RECLAIM_TRIM);
+#else
+ uma_reclaim();
+#endif
+}
+#else
+void
+kmem_cache_reap_soon(kmem_cache_t *cache __unused)
+{
+}
+
+void
+kmem_reap(void)
+{
+}
+#endif
+
+int
+kmem_debugging(void)
+{
+ return (0);
+}
+
+void *
+calloc(size_t n, size_t s)
+{
+ return (kmem_zalloc(n * s, KM_NOSLEEP));
+}
+
+char *
+kmem_vasprintf(const char *fmt, va_list adx)
+{
+ char *msg;
+ va_list adx2;
+
+ va_copy(adx2, adx);
+ msg = kmem_alloc(vsnprintf(NULL, 0, fmt, adx) + 1, KM_SLEEP);
+ (void) vsprintf(msg, fmt, adx2);
+ va_end(adx2);
+
+ return (msg);
+}
+
+#include <vm/uma.h>
+#include <vm/uma_int.h>
+#ifdef KMEM_DEBUG
+#error "KMEM_DEBUG not currently supported"
+#endif
+
+uint64_t
+spl_kmem_cache_inuse(kmem_cache_t *cache)
+{
+ return (uma_zone_get_cur(cache->kc_zone));
+}
+
+uint64_t
+spl_kmem_cache_entry_size(kmem_cache_t *cache)
+{
+ return (cache->kc_zone->uz_size);
+}
+
+/*
+ * Register a move callback for cache defragmentation.
+ * XXX: Unimplemented but harmless to stub out for now.
+ */
+void
+spl_kmem_cache_set_move(kmem_cache_t *skc,
+ kmem_cbrc_t (move)(void *, void *, size_t, void *))
+{
+ ASSERT(move != NULL);
+}
+
+#ifdef KMEM_DEBUG
+void kmem_show(void *);
+void
+kmem_show(void *dummy __unused)
+{
+ struct kmem_item *i;
+
+ mtx_lock(&kmem_items_mtx);
+ if (LIST_EMPTY(&kmem_items))
+ printf("KMEM_DEBUG: No leaked elements.\n");
+ else {
+ printf("KMEM_DEBUG: Leaked elements:\n\n");
+ LIST_FOREACH(i, &kmem_items, next) {
+ printf("address=%p\n", i);
+ stack_print_ddb(&i->stack);
+ printf("\n");
+ }
+ }
+ mtx_unlock(&kmem_items_mtx);
+}
+
+SYSUNINIT(sol_kmem, SI_SUB_CPU, SI_ORDER_FIRST, kmem_show, NULL);
+#endif /* KMEM_DEBUG */
diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/spl_kstat.c b/sys/contrib/openzfs/module/os/freebsd/spl/spl_kstat.c
new file mode 100644
index 000000000000..6bdef466c253
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/spl/spl_kstat.c
@@ -0,0 +1,575 @@
+/*
+ * Copyright (c) 2007 Pawel Jakub Dawidek <pjd@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Links to Illumos.org for more information on kstat function:
+ * [1] https://illumos.org/man/1M/kstat
+ * [2] https://illumos.org/man/9f/kstat_create
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/sysctl.h>
+#include <sys/kstat.h>
+#include <sys/sbuf.h>
+
+static MALLOC_DEFINE(M_KSTAT, "kstat_data", "Kernel statistics");
+
+SYSCTL_ROOT_NODE(OID_AUTO, kstat, CTLFLAG_RW, 0, "Kernel statistics");
+
+void
+__kstat_set_raw_ops(kstat_t *ksp,
+ int (*headers)(char *buf, size_t size),
+ int (*data)(char *buf, size_t size, void *data),
+ void *(*addr)(kstat_t *ksp, loff_t index))
+{
+ ksp->ks_raw_ops.headers = headers;
+ ksp->ks_raw_ops.data = data;
+ ksp->ks_raw_ops.addr = addr;
+}
+
+void
+__kstat_set_seq_raw_ops(kstat_t *ksp,
+ int (*headers)(struct seq_file *f),
+ int (*data)(char *buf, size_t size, void *data),
+ void *(*addr)(kstat_t *ksp, loff_t index))
+{
+ ksp->ks_raw_ops.seq_headers = headers;
+ ksp->ks_raw_ops.data = data;
+ ksp->ks_raw_ops.addr = addr;
+}
+
+static int
+kstat_default_update(kstat_t *ksp, int rw)
+{
+ ASSERT(ksp != NULL);
+
+ if (rw == KSTAT_WRITE)
+ return (EACCES);
+
+ return (0);
+}
+
+static int
+kstat_resize_raw(kstat_t *ksp)
+{
+ if (ksp->ks_raw_bufsize == KSTAT_RAW_MAX)
+ return (ENOMEM);
+
+ free(ksp->ks_raw_buf, M_TEMP);
+ ksp->ks_raw_bufsize = MIN(ksp->ks_raw_bufsize * 2, KSTAT_RAW_MAX);
+ ksp->ks_raw_buf = malloc(ksp->ks_raw_bufsize, M_TEMP, M_WAITOK);
+
+ return (0);
+}
+
+static void *
+kstat_raw_default_addr(kstat_t *ksp, loff_t n)
+{
+ if (n == 0)
+ return (ksp->ks_data);
+ return (NULL);
+}
+
+static int
+kstat_sysctl(SYSCTL_HANDLER_ARGS)
+{
+ kstat_t *ksp = arg1;
+ kstat_named_t *ksent;
+ uint64_t val;
+
+ ksent = ksp->ks_data;
+ /* Select the correct element */
+ ksent += arg2;
+ /* Update the aggsums before reading */
+ (void) ksp->ks_update(ksp, KSTAT_READ);
+ val = ksent->value.ui64;
+
+ return (sysctl_handle_64(oidp, &val, 0, req));
+}
+
+static int
+kstat_sysctl_string(SYSCTL_HANDLER_ARGS)
+{
+ kstat_t *ksp = arg1;
+ kstat_named_t *ksent = ksp->ks_data;
+ char *val;
+ uint32_t len = 0;
+
+ /* Select the correct element */
+ ksent += arg2;
+ /* Update the aggsums before reading */
+ (void) ksp->ks_update(ksp, KSTAT_READ);
+ val = KSTAT_NAMED_STR_PTR(ksent);
+ len = KSTAT_NAMED_STR_BUFLEN(ksent);
+ val[len-1] = '\0';
+
+ return (sysctl_handle_string(oidp, val, len, req));
+}
+
+static int
+kstat_sysctl_io(SYSCTL_HANDLER_ARGS)
+{
+ struct sbuf *sb;
+ kstat_t *ksp = arg1;
+ kstat_io_t *kip = ksp->ks_data;
+ int rc;
+
+ sb = sbuf_new_auto();
+ if (sb == NULL)
+ return (ENOMEM);
+ /* Update the aggsums before reading */
+ (void) ksp->ks_update(ksp, KSTAT_READ);
+
+ /* though wlentime & friends are signed, they will never be negative */
+ sbuf_printf(sb,
+ "%-8llu %-8llu %-8u %-8u %-8llu %-8llu "
+ "%-8llu %-8llu %-8llu %-8llu %-8u %-8u\n",
+ kip->nread, kip->nwritten,
+ kip->reads, kip->writes,
+ kip->wtime, kip->wlentime, kip->wlastupdate,
+ kip->rtime, kip->rlentime, kip->rlastupdate,
+ kip->wcnt, kip->rcnt);
+ rc = sbuf_finish(sb);
+ if (rc == 0)
+ rc = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb));
+ sbuf_delete(sb);
+ return (rc);
+}
+
+static int
+kstat_sysctl_raw(SYSCTL_HANDLER_ARGS)
+{
+ struct sbuf *sb;
+ void *data;
+ kstat_t *ksp = arg1;
+ void *(*addr_op)(kstat_t *ksp, loff_t index);
+ int n, has_header, rc = 0;
+
+ sb = sbuf_new_auto();
+ if (sb == NULL)
+ return (ENOMEM);
+
+ if (ksp->ks_raw_ops.addr)
+ addr_op = ksp->ks_raw_ops.addr;
+ else
+ addr_op = kstat_raw_default_addr;
+
+ mutex_enter(ksp->ks_lock);
+
+ /* Update the aggsums before reading */
+ (void) ksp->ks_update(ksp, KSTAT_READ);
+
+ ksp->ks_raw_bufsize = PAGE_SIZE;
+ ksp->ks_raw_buf = malloc(PAGE_SIZE, M_TEMP, M_WAITOK);
+
+ n = 0;
+ has_header = (ksp->ks_raw_ops.headers ||
+ ksp->ks_raw_ops.seq_headers);
+
+restart_headers:
+ if (ksp->ks_raw_ops.headers) {
+ rc = ksp->ks_raw_ops.headers(
+ ksp->ks_raw_buf, ksp->ks_raw_bufsize);
+ } else if (ksp->ks_raw_ops.seq_headers) {
+ struct seq_file f;
+
+ f.sf_buf = ksp->ks_raw_buf;
+ f.sf_size = ksp->ks_raw_bufsize;
+ rc = ksp->ks_raw_ops.seq_headers(&f);
+ }
+ if (has_header) {
+ if (rc == ENOMEM && !kstat_resize_raw(ksp))
+ goto restart_headers;
+ if (rc == 0)
+ sbuf_printf(sb, "\n%s", ksp->ks_raw_buf);
+ }
+
+ while ((data = addr_op(ksp, n)) != NULL) {
+restart:
+ if (ksp->ks_raw_ops.data) {
+ rc = ksp->ks_raw_ops.data(ksp->ks_raw_buf,
+ ksp->ks_raw_bufsize, data);
+ if (rc == ENOMEM && !kstat_resize_raw(ksp))
+ goto restart;
+ if (rc == 0)
+ sbuf_printf(sb, "%s", ksp->ks_raw_buf);
+
+ } else {
+ ASSERT(ksp->ks_ndata == 1);
+ sbuf_hexdump(sb, ksp->ks_data,
+ ksp->ks_data_size, NULL, 0);
+ }
+ n++;
+ }
+ free(ksp->ks_raw_buf, M_TEMP);
+ mutex_exit(ksp->ks_lock);
+ sbuf_trim(sb);
+ rc = sbuf_finish(sb);
+ if (rc == 0)
+ rc = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb));
+ sbuf_delete(sb);
+ return (rc);
+}
+
+kstat_t *
+__kstat_create(const char *module, int instance, const char *name,
+ const char *class, uchar_t ks_type, uint_t ks_ndata, uchar_t flags)
+{
+ char buf[KSTAT_STRLEN];
+ struct sysctl_oid *root;
+ kstat_t *ksp;
+ char *pool;
+
+ KASSERT(instance == 0, ("instance=%d", instance));
+ if ((ks_type == KSTAT_TYPE_INTR) || (ks_type == KSTAT_TYPE_IO))
+ ASSERT(ks_ndata == 1);
+
+ if (class == NULL)
+ class = "misc";
+
+ /*
+ * Allocate the main structure. We don't need to keep a copy of
+ * module in here, because it is only used for sysctl node creation
+ * done in this function.
+ */
+ ksp = malloc(sizeof (*ksp), M_KSTAT, M_WAITOK|M_ZERO);
+
+ ksp->ks_crtime = gethrtime();
+ ksp->ks_snaptime = ksp->ks_crtime;
+ ksp->ks_instance = instance;
+ (void) strlcpy(ksp->ks_name, name, KSTAT_STRLEN);
+ (void) strlcpy(ksp->ks_class, class, KSTAT_STRLEN);
+ ksp->ks_type = ks_type;
+ ksp->ks_flags = flags;
+ ksp->ks_update = kstat_default_update;
+
+ mutex_init(&ksp->ks_private_lock, NULL, MUTEX_DEFAULT, NULL);
+ ksp->ks_lock = &ksp->ks_private_lock;
+
+ switch (ksp->ks_type) {
+ case KSTAT_TYPE_RAW:
+ ksp->ks_ndata = 1;
+ ksp->ks_data_size = ks_ndata;
+ break;
+ case KSTAT_TYPE_NAMED:
+ ksp->ks_ndata = ks_ndata;
+ ksp->ks_data_size = ks_ndata * sizeof (kstat_named_t);
+ break;
+ case KSTAT_TYPE_INTR:
+ ksp->ks_ndata = ks_ndata;
+ ksp->ks_data_size = ks_ndata * sizeof (kstat_intr_t);
+ break;
+ case KSTAT_TYPE_IO:
+ ksp->ks_ndata = ks_ndata;
+ ksp->ks_data_size = ks_ndata * sizeof (kstat_io_t);
+ break;
+ case KSTAT_TYPE_TIMER:
+ ksp->ks_ndata = ks_ndata;
+ ksp->ks_data_size = ks_ndata * sizeof (kstat_timer_t);
+ break;
+ default:
+ panic("Undefined kstat type %d\n", ksp->ks_type);
+ }
+
+ if (ksp->ks_flags & KSTAT_FLAG_VIRTUAL) {
+ ksp->ks_data = NULL;
+ } else {
+ ksp->ks_data = kmem_zalloc(ksp->ks_data_size, KM_SLEEP);
+ if (ksp->ks_data == NULL) {
+ kmem_free(ksp, sizeof (*ksp));
+ ksp = NULL;
+ }
+ }
+
+ /*
+ * Some kstats use a module name like "zfs/poolname" to distinguish a
+ * set of kstats belonging to a specific pool. Split on '/' to add an
+ * extra node for the pool name if needed.
+ */
+ (void) strlcpy(buf, module, KSTAT_STRLEN);
+ module = buf;
+ pool = strchr(module, '/');
+ if (pool != NULL)
+ *pool++ = '\0';
+
+ /*
+ * Create sysctl tree for those statistics:
+ *
+ * kstat.<module>[.<pool>].<class>.<name>
+ */
+ sysctl_ctx_init(&ksp->ks_sysctl_ctx);
+ root = SYSCTL_ADD_NODE(&ksp->ks_sysctl_ctx,
+ SYSCTL_STATIC_CHILDREN(_kstat), OID_AUTO, module, CTLFLAG_RW, 0,
+ "");
+ if (root == NULL) {
+ printf("%s: Cannot create kstat.%s tree!\n", __func__, module);
+ sysctl_ctx_free(&ksp->ks_sysctl_ctx);
+ free(ksp, M_KSTAT);
+ return (NULL);
+ }
+ if (pool != NULL) {
+ root = SYSCTL_ADD_NODE(&ksp->ks_sysctl_ctx,
+ SYSCTL_CHILDREN(root), OID_AUTO, pool, CTLFLAG_RW, 0, "");
+ if (root == NULL) {
+ printf("%s: Cannot create kstat.%s.%s tree!\n",
+ __func__, module, pool);
+ sysctl_ctx_free(&ksp->ks_sysctl_ctx);
+ free(ksp, M_KSTAT);
+ return (NULL);
+ }
+ }
+ root = SYSCTL_ADD_NODE(&ksp->ks_sysctl_ctx, SYSCTL_CHILDREN(root),
+ OID_AUTO, class, CTLFLAG_RW, 0, "");
+ if (root == NULL) {
+ if (pool != NULL)
+ printf("%s: Cannot create kstat.%s.%s.%s tree!\n",
+ __func__, module, pool, class);
+ else
+ printf("%s: Cannot create kstat.%s.%s tree!\n",
+ __func__, module, class);
+ sysctl_ctx_free(&ksp->ks_sysctl_ctx);
+ free(ksp, M_KSTAT);
+ return (NULL);
+ }
+ if (ksp->ks_type == KSTAT_TYPE_NAMED) {
+ root = SYSCTL_ADD_NODE(&ksp->ks_sysctl_ctx,
+ SYSCTL_CHILDREN(root),
+ OID_AUTO, name, CTLFLAG_RW, 0, "");
+ if (root == NULL) {
+ if (pool != NULL)
+ printf("%s: Cannot create kstat.%s.%s.%s.%s "
+ "tree!\n", __func__, module, pool, class,
+ name);
+ else
+ printf("%s: Cannot create kstat.%s.%s.%s "
+ "tree!\n", __func__, module, class, name);
+ sysctl_ctx_free(&ksp->ks_sysctl_ctx);
+ free(ksp, M_KSTAT);
+ return (NULL);
+ }
+
+ }
+ ksp->ks_sysctl_root = root;
+
+ return (ksp);
+}
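+
+/*
+ * Illustrative mapping (names hypothetical): a named kstat created as
+ *
+ *	kstat_create("zfs/mypool", 0, "iostats", "misc", KSTAT_TYPE_NAMED,
+ *	    n, 0);
+ *
+ * appears in the sysctl tree as kstat.zfs.mypool.misc.iostats.<stat>,
+ * following the kstat.<module>[.<pool>].<class>.<name> scheme above.
+ */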
+
+static void
+kstat_install_named(kstat_t *ksp)
+{
+ kstat_named_t *ksent;
+ char *namelast;
+ int typelast;
+
+ ksent = ksp->ks_data;
+
+ VERIFY((ksp->ks_flags & KSTAT_FLAG_VIRTUAL) || ksent != NULL);
+
+ typelast = 0;
+ namelast = NULL;
+
+ for (int i = 0; i < ksp->ks_ndata; i++, ksent++) {
+ if (ksent->data_type != 0) {
+ typelast = ksent->data_type;
+ namelast = ksent->name;
+ }
+ switch (typelast) {
+ case KSTAT_DATA_CHAR:
+ /* Not Implemented */
+ break;
+ case KSTAT_DATA_INT32:
+ SYSCTL_ADD_PROC(&ksp->ks_sysctl_ctx,
+ SYSCTL_CHILDREN(ksp->ks_sysctl_root),
+ OID_AUTO, namelast,
+ CTLTYPE_S32 | CTLFLAG_RD | CTLFLAG_MPSAFE,
+ ksp, i, kstat_sysctl, "I", namelast);
+ break;
+ case KSTAT_DATA_UINT32:
+ SYSCTL_ADD_PROC(&ksp->ks_sysctl_ctx,
+ SYSCTL_CHILDREN(ksp->ks_sysctl_root),
+ OID_AUTO, namelast,
+ CTLTYPE_U32 | CTLFLAG_RD | CTLFLAG_MPSAFE,
+ ksp, i, kstat_sysctl, "IU", namelast);
+ break;
+ case KSTAT_DATA_INT64:
+ SYSCTL_ADD_PROC(&ksp->ks_sysctl_ctx,
+ SYSCTL_CHILDREN(ksp->ks_sysctl_root),
+ OID_AUTO, namelast,
+ CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE,
+ ksp, i, kstat_sysctl, "Q", namelast);
+ break;
+ case KSTAT_DATA_UINT64:
+ SYSCTL_ADD_PROC(&ksp->ks_sysctl_ctx,
+ SYSCTL_CHILDREN(ksp->ks_sysctl_root),
+ OID_AUTO, namelast,
+ CTLTYPE_U64 | CTLFLAG_RD | CTLFLAG_MPSAFE,
+ ksp, i, kstat_sysctl, "QU", namelast);
+ break;
+ case KSTAT_DATA_LONG:
+ SYSCTL_ADD_PROC(&ksp->ks_sysctl_ctx,
+ SYSCTL_CHILDREN(ksp->ks_sysctl_root),
+ OID_AUTO, namelast,
+ CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE,
+ ksp, i, kstat_sysctl, "L", namelast);
+ break;
+ case KSTAT_DATA_ULONG:
+ SYSCTL_ADD_PROC(&ksp->ks_sysctl_ctx,
+ SYSCTL_CHILDREN(ksp->ks_sysctl_root),
+ OID_AUTO, namelast,
+ CTLTYPE_ULONG | CTLFLAG_RD | CTLFLAG_MPSAFE,
+ ksp, i, kstat_sysctl, "LU", namelast);
+ break;
+ case KSTAT_DATA_STRING:
+ SYSCTL_ADD_PROC(&ksp->ks_sysctl_ctx,
+ SYSCTL_CHILDREN(ksp->ks_sysctl_root),
+ OID_AUTO, namelast,
+ CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE,
+ ksp, i, kstat_sysctl_string, "A", namelast);
+ break;
+ default:
+ panic("unsupported type: %d", typelast);
+ }
+ }
+}
+
+void
+kstat_install(kstat_t *ksp)
+{
+ struct sysctl_oid *root;
+
+ if (ksp->ks_ndata == UINT32_MAX)
+ VERIFY(ksp->ks_type == KSTAT_TYPE_RAW);
+
+ switch (ksp->ks_type) {
+ case KSTAT_TYPE_NAMED:
+ return (kstat_install_named(ksp));
+ case KSTAT_TYPE_RAW:
+ if (ksp->ks_raw_ops.data) {
+ root = SYSCTL_ADD_PROC(&ksp->ks_sysctl_ctx,
+ SYSCTL_CHILDREN(ksp->ks_sysctl_root),
+ OID_AUTO, ksp->ks_name, CTLTYPE_STRING | CTLFLAG_RD
+ | CTLFLAG_MPSAFE | CTLFLAG_SKIP,
+ ksp, 0, kstat_sysctl_raw, "A", ksp->ks_name);
+ } else {
+ root = SYSCTL_ADD_PROC(&ksp->ks_sysctl_ctx,
+ SYSCTL_CHILDREN(ksp->ks_sysctl_root),
+ OID_AUTO, ksp->ks_name, CTLTYPE_OPAQUE | CTLFLAG_RD
+ | CTLFLAG_MPSAFE | CTLFLAG_SKIP,
+ ksp, 0, kstat_sysctl_raw, "", ksp->ks_name);
+ }
+ break;
+ case KSTAT_TYPE_IO:
+ root = SYSCTL_ADD_PROC(&ksp->ks_sysctl_ctx,
+ SYSCTL_CHILDREN(ksp->ks_sysctl_root),
+ OID_AUTO, ksp->ks_name,
+ CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE,
+ ksp, 0, kstat_sysctl_io, "A", ksp->ks_name);
+ break;
+ case KSTAT_TYPE_TIMER:
+ case KSTAT_TYPE_INTR:
+ default:
+ panic("unsupported kstat type %d\n", ksp->ks_type);
+ }
+ VERIFY(root != NULL);
+ ksp->ks_sysctl_root = root;
+}
+
+void
+kstat_delete(kstat_t *ksp)
+{
+
+ sysctl_ctx_free(&ksp->ks_sysctl_ctx);
+ ksp->ks_lock = NULL;
+ mutex_destroy(&ksp->ks_private_lock);
+ free(ksp, M_KSTAT);
+}
+
+void
+kstat_waitq_enter(kstat_io_t *kiop)
+{
+ hrtime_t new, delta;
+ ulong_t wcnt;
+
+ new = gethrtime();
+ delta = new - kiop->wlastupdate;
+ kiop->wlastupdate = new;
+ wcnt = kiop->wcnt++;
+ if (wcnt != 0) {
+ kiop->wlentime += delta * wcnt;
+ kiop->wtime += delta;
+ }
+}
+
+void
+kstat_waitq_exit(kstat_io_t *kiop)
+{
+ hrtime_t new, delta;
+ ulong_t wcnt;
+
+ new = gethrtime();
+ delta = new - kiop->wlastupdate;
+ kiop->wlastupdate = new;
+ wcnt = kiop->wcnt--;
+ ASSERT((int)wcnt > 0);
+ kiop->wlentime += delta * wcnt;
+ kiop->wtime += delta;
+}
+
+void
+kstat_runq_enter(kstat_io_t *kiop)
+{
+ hrtime_t new, delta;
+ ulong_t rcnt;
+
+ new = gethrtime();
+ delta = new - kiop->rlastupdate;
+ kiop->rlastupdate = new;
+ rcnt = kiop->rcnt++;
+ if (rcnt != 0) {
+ kiop->rlentime += delta * rcnt;
+ kiop->rtime += delta;
+ }
+}
+
+void
+kstat_runq_exit(kstat_io_t *kiop)
+{
+ hrtime_t new, delta;
+ ulong_t rcnt;
+
+ new = gethrtime();
+ delta = new - kiop->rlastupdate;
+ kiop->rlastupdate = new;
+ rcnt = kiop->rcnt--;
+ ASSERT((int)rcnt > 0);
+ kiop->rlentime += delta * rcnt;
+ kiop->rtime += delta;
+}
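+
+/*
+ * Note on the accounting above (standard kstat I/O semantics, stated here as
+ * a reading aid): wlentime/rlentime accumulate queue-length * time products
+ * and wtime/rtime accumulate non-empty-queue time, so consumers can derive
+ * average queue length and busy percentage from deltas of these counters.
+ */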
diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/spl_misc.c b/sys/contrib/openzfs/module/os/freebsd/spl/spl_misc.c
new file mode 100644
index 000000000000..0354b986cd5f
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/spl/spl_misc.c
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2007 Pawel Jakub Dawidek <pjd@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/jail.h>
+#include <sys/kernel.h>
+#include <sys/libkern.h>
+#include <sys/limits.h>
+#include <sys/misc.h>
+#include <sys/sysctl.h>
+#include <sys/vnode.h>
+
+#include <sys/zfs_context.h>
+
+static struct opensolaris_utsname hw_utsname = {
+ .machine = MACHINE
+};
+
+#ifndef KERNEL_STATIC
+char hw_serial[11] = "0";
+
+utsname_t *
+utsname(void)
+{
+ return (&hw_utsname);
+}
+#endif
+
+static void
+opensolaris_utsname_init(void *arg)
+{
+
+ hw_utsname.sysname = ostype;
+ hw_utsname.nodename = prison0.pr_hostname;
+ hw_utsname.release = osrelease;
+ snprintf(hw_utsname.version, sizeof (hw_utsname.version),
+ "%d", osreldate);
+}
+
+char *
+kmem_strdup(const char *s)
+{
+ char *buf;
+
+ buf = kmem_alloc(strlen(s) + 1, KM_SLEEP);
+ strcpy(buf, s);
+ return (buf);
+}
+
+int
+ddi_copyin(const void *from, void *to, size_t len, int flags)
+{
+ /* Fake ioctl() issued by kernel, 'from' is a kernel address */
+ if (flags & FKIOCTL) {
+ memcpy(to, from, len);
+ return (0);
+ }
+
+ return (copyin(from, to, len));
+}
+
+int
+ddi_copyout(const void *from, void *to, size_t len, int flags)
+{
+ /* Fake ioctl() issued by kernel, 'to' is a kernel address */
+ if (flags & FKIOCTL) {
+ memcpy(to, from, len);
+ return (0);
+ }
+
+ return (copyout(from, to, len));
+}
+
+int
+spl_panic(const char *file, const char *func, int line, const char *fmt, ...)
+{
+ va_list ap;
+
+ va_start(ap, fmt);
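+ /* vpanic(9) does not return. */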
+ vpanic(fmt, ap);
+ va_end(ap);
+}
+
+
+SYSINIT(opensolaris_utsname_init, SI_SUB_TUNABLES, SI_ORDER_ANY,
+ opensolaris_utsname_init, NULL);
diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/spl_policy.c b/sys/contrib/openzfs/module/os/freebsd/spl/spl_policy.c
new file mode 100644
index 000000000000..5ecd3d310361
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/spl/spl_policy.c
@@ -0,0 +1,438 @@
+/*
+ * Copyright (c) 2007 Pawel Jakub Dawidek <pjd@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/priv.h>
+#include <sys/vnode.h>
+#include <sys/mntent.h>
+#include <sys/mount.h>
+#include <sys/stat.h>
+#include <sys/jail.h>
+#include <sys/policy.h>
+#include <sys/zfs_vfsops.h>
+#include <sys/zfs_znode.h>
+
+
+int
+secpolicy_nfs(cred_t *cr)
+{
+
+ return (spl_priv_check_cred(cr, PRIV_NFS_DAEMON));
+}
+
+int
+secpolicy_zfs(cred_t *cr)
+{
+
+ return (spl_priv_check_cred(cr, PRIV_VFS_MOUNT));
+}
+
+int
+secpolicy_zfs_proc(cred_t *cr, proc_t *proc)
+{
+
+ return (spl_priv_check_cred(cr, PRIV_VFS_MOUNT));
+}
+
+int
+secpolicy_sys_config(cred_t *cr, int checkonly __unused)
+{
+
+ return (spl_priv_check_cred(cr, PRIV_ZFS_POOL_CONFIG));
+}
+
+int
+secpolicy_zinject(cred_t *cr)
+{
+
+ return (spl_priv_check_cred(cr, PRIV_ZFS_INJECT));
+}
+
+int
+secpolicy_fs_unmount(cred_t *cr, struct mount *vfsp __unused)
+{
+
+ return (spl_priv_check_cred(cr, PRIV_VFS_UNMOUNT));
+}
+
+int
+secpolicy_fs_owner(struct mount *mp, cred_t *cr)
+{
+
+ if (zfs_super_owner) {
+ if (cr->cr_uid == mp->mnt_cred->cr_uid &&
+ cr->cr_prison == mp->mnt_cred->cr_prison) {
+ return (0);
+ }
+ }
+ return (EPERM);
+}
+
+/*
+ * This check is done in kern_link(), so we could just return 0 here.
+ */
+extern int hardlink_check_uid;
+int
+secpolicy_basic_link(vnode_t *vp, cred_t *cr)
+{
+
+ if (!hardlink_check_uid)
+ return (0);
+ if (secpolicy_fs_owner(vp->v_mount, cr) == 0)
+ return (0);
+ return (spl_priv_check_cred(cr, PRIV_VFS_LINK));
+}
+
+int
+secpolicy_vnode_stky_modify(cred_t *cr)
+{
+
+ return (EPERM);
+}
+
+int
+secpolicy_vnode_remove(vnode_t *vp, cred_t *cr)
+{
+
+ if (secpolicy_fs_owner(vp->v_mount, cr) == 0)
+ return (0);
+ return (spl_priv_check_cred(cr, PRIV_VFS_ADMIN));
+}
+
+int
+secpolicy_vnode_access(cred_t *cr, vnode_t *vp, uid_t owner, accmode_t accmode)
+{
+
+ if (secpolicy_fs_owner(vp->v_mount, cr) == 0)
+ return (0);
+
+ if ((accmode & VREAD) && spl_priv_check_cred(cr, PRIV_VFS_READ) != 0)
+ return (EACCES);
+ if ((accmode & VWRITE) &&
+ spl_priv_check_cred(cr, PRIV_VFS_WRITE) != 0) {
+ return (EACCES);
+ }
+ if (accmode & VEXEC) {
+ if (vp->v_type == VDIR) {
+ if (spl_priv_check_cred(cr, PRIV_VFS_LOOKUP) != 0)
+ return (EACCES);
+ } else {
+ if (spl_priv_check_cred(cr, PRIV_VFS_EXEC) != 0)
+ return (EACCES);
+ }
+ }
+ return (0);
+}
+
+/*
+ * Like secpolicy_vnode_access() but we get the actual wanted mode and the
+ * current mode of the file, not the missing bits.
+ */
+int
+secpolicy_vnode_access2(cred_t *cr, vnode_t *vp, uid_t owner,
+ accmode_t curmode, accmode_t wantmode)
+{
+ accmode_t mode;
+
+ mode = ~curmode & wantmode;
+
+ if (mode == 0)
+ return (0);
+
+ return (secpolicy_vnode_access(cr, vp, owner, mode));
+}
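+
+/*
+ * Worked example (illustrative): with curmode = VREAD and
+ * wantmode = VREAD | VWRITE, mode = ~curmode & wantmode = VWRITE, so only
+ * the missing VWRITE permission is checked.
+ */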
+
+int
+secpolicy_vnode_any_access(cred_t *cr, vnode_t *vp, uid_t owner)
+{
+ static int privs[] = {
+ PRIV_VFS_ADMIN,
+ PRIV_VFS_READ,
+ PRIV_VFS_WRITE,
+ PRIV_VFS_EXEC,
+ PRIV_VFS_LOOKUP
+ };
+ int i;
+
+ if (secpolicy_fs_owner(vp->v_mount, cr) == 0)
+ return (0);
+
+ /* Same as secpolicy_vnode_setdac */
+ if (owner == cr->cr_uid)
+ return (0);
+
+ for (i = 0; i < sizeof (privs)/sizeof (int); i++) {
+ int priv;
+
+ switch (priv = privs[i]) {
+ case PRIV_VFS_EXEC:
+ if (vp->v_type == VDIR)
+ continue;
+ break;
+ case PRIV_VFS_LOOKUP:
+ if (vp->v_type != VDIR)
+ continue;
+ break;
+ }
+ if (spl_priv_check_cred(cr, priv) == 0)
+ return (0);
+ }
+ return (EPERM);
+}
+
+int
+secpolicy_vnode_setdac(vnode_t *vp, cred_t *cr, uid_t owner)
+{
+
+ if (owner == cr->cr_uid)
+ return (0);
+ if (secpolicy_fs_owner(vp->v_mount, cr) == 0)
+ return (0);
+ return (spl_priv_check_cred(cr, PRIV_VFS_ADMIN));
+}
+
+int
+secpolicy_vnode_setattr(cred_t *cr, vnode_t *vp, struct vattr *vap,
+ const struct vattr *ovap, int flags,
+ int unlocked_access(void *, int, cred_t *), void *node)
+{
+ int mask = vap->va_mask;
+ int error;
+
+ if (mask & AT_SIZE) {
+ if (vp->v_type == VDIR)
+ return (EISDIR);
+ error = unlocked_access(node, VWRITE, cr);
+ if (error)
+ return (error);
+ }
+ if (mask & AT_MODE) {
+ /*
+ * If not the owner of the file, check privilege for two
+ * things: the privilege to set the mode at all and, when
+ * setting the set-uid bit, the additional permission to add
+ * it. Creating a set-uid root file requires still more
+ * privilege.
+ */
+ error = secpolicy_vnode_setdac(vp, cr, ovap->va_uid);
+ if (error)
+ return (error);
+ error = secpolicy_setid_setsticky_clear(vp, vap, ovap, cr);
+ if (error)
+ return (error);
+ } else {
+ vap->va_mode = ovap->va_mode;
+ }
+ if (mask & (AT_UID | AT_GID)) {
+ error = secpolicy_vnode_setdac(vp, cr, ovap->va_uid);
+ if (error)
+ return (error);
+
+ /*
+ * To change the owner of a file, or change the group of
+ * a file to a group of which we are not a member, the
+ * caller must have privilege.
+ */
+ if (((mask & AT_UID) && vap->va_uid != ovap->va_uid) ||
+ ((mask & AT_GID) && vap->va_gid != ovap->va_gid &&
+ !groupmember(vap->va_gid, cr))) {
+ if (secpolicy_fs_owner(vp->v_mount, cr) != 0) {
+ error = spl_priv_check_cred(cr, PRIV_VFS_CHOWN);
+ if (error)
+ return (error);
+ }
+ }
+
+ if (((mask & AT_UID) && vap->va_uid != ovap->va_uid) ||
+ ((mask & AT_GID) && vap->va_gid != ovap->va_gid)) {
+ secpolicy_setid_clear(vap, vp, cr);
+ }
+ }
+ if (mask & (AT_ATIME | AT_MTIME)) {
+ /*
+ * From utimes(2):
+ * If times is NULL, ... The caller must be the owner of
+ * the file, have permission to write the file, or be the
+ * super-user.
+ * If times is non-NULL, ... The caller must be the owner of
+ * the file or be the super-user.
+ */
+ error = secpolicy_vnode_setdac(vp, cr, ovap->va_uid);
+ if (error && (vap->va_vaflags & VA_UTIMES_NULL))
+ error = unlocked_access(node, VWRITE, cr);
+ if (error)
+ return (error);
+ }
+ return (0);
+}
+
+int
+secpolicy_vnode_create_gid(cred_t *cr)
+{
+
+ return (EPERM);
+}
+
+int
+secpolicy_vnode_setids_setgids(vnode_t *vp, cred_t *cr, gid_t gid)
+{
+
+ if (groupmember(gid, cr))
+ return (0);
+ if (secpolicy_fs_owner(vp->v_mount, cr) == 0)
+ return (0);
+ return (spl_priv_check_cred(cr, PRIV_VFS_SETGID));
+}
+
+int
+secpolicy_vnode_setid_retain(znode_t *zp, cred_t *cr,
+ boolean_t issuidroot __unused)
+{
+
+ if (secpolicy_fs_owner(ZTOV(zp)->v_mount, cr) == 0)
+ return (0);
+ return (spl_priv_check_cred(cr, PRIV_VFS_RETAINSUGID));
+}
+
+void
+secpolicy_setid_clear(struct vattr *vap, vnode_t *vp, cred_t *cr)
+{
+
+ if (secpolicy_fs_owner(vp->v_mount, cr) == 0)
+ return;
+
+ if ((vap->va_mode & (S_ISUID | S_ISGID)) != 0) {
+ if (spl_priv_check_cred(cr, PRIV_VFS_RETAINSUGID)) {
+ vap->va_mask |= AT_MODE;
+ vap->va_mode &= ~(S_ISUID|S_ISGID);
+ }
+ }
+}
+
+int
+secpolicy_setid_setsticky_clear(vnode_t *vp, struct vattr *vap,
+ const struct vattr *ovap, cred_t *cr)
+{
+ int error;
+
+ if (secpolicy_fs_owner(vp->v_mount, cr) == 0)
+ return (0);
+
+ /*
+ * Privileged processes may set the sticky bit on non-directories,
+ * as well as set the setgid bit on a file with a group that the process
+ * is not a member of. Both of these are allowed in jail(8).
+ */
+ if (vp->v_type != VDIR && (vap->va_mode & S_ISTXT)) {
+ if (spl_priv_check_cred(cr, PRIV_VFS_STICKYFILE))
+ return (EFTYPE);
+ }
+ /*
+ * Check for privilege if attempting to set the
+ * group-id bit.
+ */
+ if ((vap->va_mode & S_ISGID) != 0) {
+ error = secpolicy_vnode_setids_setgids(vp, cr, ovap->va_gid);
+ if (error)
+ return (error);
+ }
+ /*
+ * Deny setting setuid if we are not the file owner.
+ */
+ if ((vap->va_mode & S_ISUID) && ovap->va_uid != cr->cr_uid) {
+ error = spl_priv_check_cred(cr, PRIV_VFS_ADMIN);
+ if (error)
+ return (error);
+ }
+ return (0);
+}
+
+int
+secpolicy_fs_mount(cred_t *cr, vnode_t *mvp, struct mount *vfsp)
+{
+
+ return (spl_priv_check_cred(cr, PRIV_VFS_MOUNT));
+}
+
+int
+secpolicy_vnode_owner(vnode_t *vp, cred_t *cr, uid_t owner)
+{
+
+ if (owner == cr->cr_uid)
+ return (0);
+ if (secpolicy_fs_owner(vp->v_mount, cr) == 0)
+ return (0);
+
+ /* XXX: vfs_suser()? */
+ return (spl_priv_check_cred(cr, PRIV_VFS_MOUNT_OWNER));
+}
+
+int
+secpolicy_vnode_chown(vnode_t *vp, cred_t *cr, uid_t owner)
+{
+
+ if (secpolicy_fs_owner(vp->v_mount, cr) == 0)
+ return (0);
+ return (spl_priv_check_cred(cr, PRIV_VFS_CHOWN));
+}
+
+void
+secpolicy_fs_mount_clearopts(cred_t *cr, struct mount *vfsp)
+{
+
+ if (spl_priv_check_cred(cr, PRIV_VFS_MOUNT_NONUSER) != 0) {
+ MNT_ILOCK(vfsp);
+ vfsp->vfs_flag |= VFS_NOSETUID | MNT_USER;
+ vfs_clearmntopt(vfsp, MNTOPT_SETUID);
+ vfs_setmntopt(vfsp, MNTOPT_NOSETUID, NULL, 0);
+ MNT_IUNLOCK(vfsp);
+ }
+}
+
+/*
+ * Check privileges for setting xvattr attributes
+ */
+int
+secpolicy_xvattr(vnode_t *vp, xvattr_t *xvap, uid_t owner, cred_t *cr,
+ vtype_t vtype)
+{
+
+ if (secpolicy_fs_owner(vp->v_mount, cr) == 0)
+ return (0);
+ return (spl_priv_check_cred(cr, PRIV_VFS_SYSFLAGS));
+}
+
+int
+secpolicy_smb(cred_t *cr)
+{
+
+ return (spl_priv_check_cred(cr, PRIV_NETSMB));
+}
diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/spl_procfs_list.c b/sys/contrib/openzfs/module/os/freebsd/spl/spl_procfs_list.c
new file mode 100644
index 000000000000..e8448ce00686
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/spl/spl_procfs_list.c
@@ -0,0 +1,161 @@
+/*
+ * Copyright (c) 2020 iXsystems, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/list.h>
+#include <sys/mutex.h>
+#include <sys/procfs_list.h>
+
+typedef struct procfs_list_iter {
+ procfs_list_t *pli_pl;
+ void *pli_elt;
+} pli_t;
+
+void
+seq_printf(struct seq_file *f, const char *fmt, ...)
+{
+ va_list adx;
+
+ va_start(adx, fmt);
+ (void) vsnprintf(f->sf_buf, f->sf_size, fmt, adx);
+ va_end(adx);
+}
+
+static int
+procfs_list_update(kstat_t *ksp, int rw)
+{
+ procfs_list_t *pl = ksp->ks_private;
+
+ if (rw == KSTAT_WRITE)
+ pl->pl_clear(pl);
+
+ return (0);
+}
+
+static int
+procfs_list_data(char *buf, size_t size, void *data)
+{
+ pli_t *p;
+ void *elt;
+ procfs_list_t *pl;
+ struct seq_file f;
+
+ p = data;
+ pl = p->pli_pl;
+ elt = p->pli_elt;
+ free(p, M_TEMP);
+ f.sf_buf = buf;
+ f.sf_size = size;
+ return (pl->pl_show(&f, elt));
+}
+
+static void *
+procfs_list_addr(kstat_t *ksp, loff_t n)
+{
+ procfs_list_t *pl = ksp->ks_private;
+ void *elt = ksp->ks_private1;
+ pli_t *p = NULL;
+
+
+ if (n == 0)
+ ksp->ks_private1 = list_head(&pl->pl_list);
+ else if (elt)
+ ksp->ks_private1 = list_next(&pl->pl_list, elt);
+
+ if (ksp->ks_private1) {
+ p = malloc(sizeof (*p), M_TEMP, M_WAITOK);
+ p->pli_pl = pl;
+ p->pli_elt = ksp->ks_private1;
+ }
+
+ return (p);
+}
+
+void
+procfs_list_install(const char *module,
+ const char *submodule,
+ const char *name,
+ mode_t mode,
+ procfs_list_t *procfs_list,
+ int (*show)(struct seq_file *f, void *p),
+ int (*show_header)(struct seq_file *f),
+ int (*clear)(procfs_list_t *procfs_list),
+ size_t procfs_list_node_off)
+{
+ kstat_t *procfs_kstat;
+
+ mutex_init(&procfs_list->pl_lock, NULL, MUTEX_DEFAULT, NULL);
+ list_create(&procfs_list->pl_list,
+ procfs_list_node_off + sizeof (procfs_list_node_t),
+ procfs_list_node_off + offsetof(procfs_list_node_t, pln_link));
+ procfs_list->pl_show = show;
+ procfs_list->pl_show_header = show_header;
+ procfs_list->pl_clear = clear;
+ procfs_list->pl_next_id = 1;
+ procfs_list->pl_node_offset = procfs_list_node_off;
+
+ procfs_kstat = kstat_create(module, 0, name, submodule,
+ KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL);
+
+ if (procfs_kstat) {
+ procfs_kstat->ks_lock = &procfs_list->pl_lock;
+ procfs_kstat->ks_ndata = UINT32_MAX;
+ procfs_kstat->ks_private = procfs_list;
+ procfs_kstat->ks_update = procfs_list_update;
+ kstat_set_seq_raw_ops(procfs_kstat, show_header,
+ procfs_list_data, procfs_list_addr);
+ kstat_install(procfs_kstat);
+ procfs_list->pl_private = procfs_kstat;
+ }
+}
+
+void
+procfs_list_uninstall(procfs_list_t *procfs_list)
+{}
+
+void
+procfs_list_destroy(procfs_list_t *procfs_list)
+{
+ ASSERT(list_is_empty(&procfs_list->pl_list));
+ kstat_delete(procfs_list->pl_private);
+ list_destroy(&procfs_list->pl_list);
+ mutex_destroy(&procfs_list->pl_lock);
+}
+
+#define NODE_ID(procfs_list, obj) \
+ (((procfs_list_node_t *)(((char *)obj) + \
+ (procfs_list)->pl_node_offset))->pln_id)
+
+void
+procfs_list_add(procfs_list_t *procfs_list, void *p)
+{
+ ASSERT(MUTEX_HELD(&procfs_list->pl_lock));
+ NODE_ID(procfs_list, p) = procfs_list->pl_next_id++;
+ list_insert_tail(&procfs_list->pl_list, p);
+}
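+
+/*
+ * Illustrative sketch (names hypothetical): after procfs_list_install(),
+ * entries are added under the list lock:
+ *
+ *	mutex_enter(&pl->pl_lock);
+ *	procfs_list_add(pl, node);
+ *	mutex_exit(&pl->pl_lock);
+ */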
diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/spl_string.c b/sys/contrib/openzfs/module/os/freebsd/spl/spl_string.c
new file mode 100644
index 000000000000..d13b64b4cd26
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/spl/spl_string.c
@@ -0,0 +1,107 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ *
+ * $FreeBSD$
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/string.h>
+#include <sys/kmem.h>
+#include <machine/stdarg.h>
+
+#define IS_DIGIT(c) ((c) >= '0' && (c) <= '9')
+
+#define IS_ALPHA(c) \
+ (((c) >= 'a' && (c) <= 'z') || ((c) >= 'A' && (c) <= 'Z'))
+
+char *
+strpbrk(const char *s, const char *b)
+{
+ const char *p;
+
+ do {
+ for (p = b; *p != '\0' && *p != *s; ++p)
+ ;
+ if (*p != '\0')
+ return ((char *)s);
+ } while (*s++);
+
+ return (NULL);
+}
+
+/*
+ * Convert a string into a valid C identifier by replacing invalid
+ * characters with '_'. Also makes sure the string is nul-terminated
+ * and takes up at most n bytes.
+ */
+void
+strident_canon(char *s, size_t n)
+{
+ char c;
+ char *end = s + n - 1;
+
+ if ((c = *s) == 0)
+ return;
+
+ if (!IS_ALPHA(c) && c != '_')
+ *s = '_';
+
+ while (s < end && ((c = *(++s)) != 0)) {
+ if (!IS_ALPHA(c) && !IS_DIGIT(c) && c != '_')
+ *s = '_';
+ }
+ *s = 0;
+}
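+
+/*
+ * Worked example (illustrative): strident_canon() turns "2tank/data" into
+ * "_tank_data", since a leading digit and the '/' are not valid identifier
+ * characters.
+ */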
+
+/*
+ * Do not change the length of the returned string; it must be freed
+ * with kmem_strfree(), which relies on strlen() to recover the
+ * allocation size.
+ */
+char *
+kmem_asprintf(const char *fmt, ...)
+{
+ int size;
+ va_list adx;
+ char *buf;
+
+ va_start(adx, fmt);
+ size = vsnprintf(NULL, 0, fmt, adx) + 1;
+ va_end(adx);
+
+ buf = kmem_alloc(size, KM_SLEEP);
+
+ va_start(adx, fmt);
+ (void) vsnprintf(buf, size, fmt, adx);
+ va_end(adx);
+
+ return (buf);
+}
+
+void
+kmem_strfree(char *str)
+{
+ ASSERT(str != NULL);
+ kmem_free(str, strlen(str) + 1);
+}
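+
+/*
+ * Illustrative pairing (format and arguments hypothetical):
+ *
+ *	char *msg = kmem_asprintf("vdev %llu failed", (u_longlong_t)guid);
+ *	...
+ *	kmem_strfree(msg);
+ */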
diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/spl_sunddi.c b/sys/contrib/openzfs/module/os/freebsd/spl/spl_sunddi.c
new file mode 100644
index 000000000000..ebec77bdb37f
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/spl/spl_sunddi.c
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2010 Pawel Jakub Dawidek <pjd@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/jail.h>
+#include <sys/kernel.h>
+#include <sys/libkern.h>
+#include <sys/limits.h>
+#include <sys/misc.h>
+#include <sys/sunddi.h>
+#include <sys/sysctl.h>
+
+int
+ddi_strtol(const char *str, char **nptr, int base, long *result)
+{
+
+ *result = strtol(str, nptr, base);
+ return (0);
+}
+
+int
+ddi_strtoul(const char *str, char **nptr, int base, unsigned long *result)
+{
+
+ if (str == hw_serial) {
+ *result = prison0.pr_hostid;
+ return (0);
+ }
+
+ *result = strtoul(str, nptr, base);
+ return (0);
+}
+
+int
+ddi_strtoull(const char *str, char **nptr, int base, unsigned long long *result)
+{
+
+ *result = (unsigned long long)strtouq(str, nptr, base);
+ return (0);
+}
+
+int
+ddi_strtoll(const char *str, char **nptr, int base, long long *result)
+{
+
+ *result = (long long)strtoq(str, nptr, base);
+ return (0);
+}
diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/spl_sysevent.c b/sys/contrib/openzfs/module/os/freebsd/spl/spl_sysevent.c
new file mode 100644
index 000000000000..8c0e495681e9
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/spl/spl_sysevent.c
@@ -0,0 +1,262 @@
+/*
+ * Copyright (c) 2010 Pawel Jakub Dawidek <pjd@FreeBSD.org>
+ * Copyright (c) 2020 iXsystems, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/kmem.h>
+#include <sys/list.h>
+#include <sys/proc.h>
+#include <sys/sbuf.h>
+#include <sys/nvpair.h>
+#include <sys/sunddi.h>
+#include <sys/sysevent.h>
+#include <sys/fm/protocol.h>
+#include <sys/fm/util.h>
+#include <sys/bus.h>
+
+static int
+log_sysevent(nvlist_t *event)
+{
+ struct sbuf *sb;
+ const char *type;
+ char typestr[128];
+ nvpair_t *elem = NULL;
+
+ sb = sbuf_new_auto();
+ if (sb == NULL)
+ return (ENOMEM);
+ type = NULL;
+
+ while ((elem = nvlist_next_nvpair(event, elem)) != NULL) {
+ switch (nvpair_type(elem)) {
+ case DATA_TYPE_BOOLEAN:
+ {
+ boolean_t value;
+
+ (void) nvpair_value_boolean_value(elem, &value);
+ sbuf_printf(sb, " %s=%s", nvpair_name(elem),
+ value ? "true" : "false");
+ break;
+ }
+ case DATA_TYPE_UINT8:
+ {
+ uint8_t value;
+
+ (void) nvpair_value_uint8(elem, &value);
+ sbuf_printf(sb, " %s=%hhu", nvpair_name(elem), value);
+ break;
+ }
+ case DATA_TYPE_INT32:
+ {
+ int32_t value;
+
+ (void) nvpair_value_int32(elem, &value);
+ sbuf_printf(sb, " %s=%jd", nvpair_name(elem),
+ (intmax_t)value);
+ break;
+ }
+ case DATA_TYPE_UINT32:
+ {
+ uint32_t value;
+
+ (void) nvpair_value_uint32(elem, &value);
+ sbuf_printf(sb, " %s=%ju", nvpair_name(elem),
+ (uintmax_t)value);
+ break;
+ }
+ case DATA_TYPE_INT64:
+ {
+ int64_t value;
+
+ (void) nvpair_value_int64(elem, &value);
+ sbuf_printf(sb, " %s=%jd", nvpair_name(elem),
+ (intmax_t)value);
+ break;
+ }
+ case DATA_TYPE_UINT64:
+ {
+ uint64_t value;
+
+ (void) nvpair_value_uint64(elem, &value);
+ sbuf_printf(sb, " %s=%ju", nvpair_name(elem),
+ (uintmax_t)value);
+ break;
+ }
+ case DATA_TYPE_STRING:
+ {
+ char *value;
+
+ (void) nvpair_value_string(elem, &value);
+ sbuf_printf(sb, " %s=%s", nvpair_name(elem), value);
+ if (strcmp(FM_CLASS, nvpair_name(elem)) == 0)
+ type = value;
+ break;
+ }
+ case DATA_TYPE_UINT8_ARRAY:
+ {
+ uint8_t *value;
+ uint_t ii, nelem;
+
+ (void) nvpair_value_uint8_array(elem, &value, &nelem);
+ sbuf_printf(sb, " %s=", nvpair_name(elem));
+ for (ii = 0; ii < nelem; ii++)
+ sbuf_printf(sb, "%02hhx", value[ii]);
+ break;
+ }
+ case DATA_TYPE_UINT16_ARRAY:
+ {
+ uint16_t *value;
+ uint_t ii, nelem;
+
+ (void) nvpair_value_uint16_array(elem, &value, &nelem);
+ sbuf_printf(sb, " %s=", nvpair_name(elem));
+ for (ii = 0; ii < nelem; ii++)
+ sbuf_printf(sb, "%04hx", value[ii]);
+ break;
+ }
+ case DATA_TYPE_UINT32_ARRAY:
+ {
+ uint32_t *value;
+ uint_t ii, nelem;
+
+ (void) nvpair_value_uint32_array(elem, &value, &nelem);
+ sbuf_printf(sb, " %s=", nvpair_name(elem));
+ for (ii = 0; ii < nelem; ii++)
+ sbuf_printf(sb, "%08jx", (uintmax_t)value[ii]);
+ break;
+ }
+ case DATA_TYPE_INT64_ARRAY:
+ {
+ int64_t *value;
+ uint_t ii, nelem;
+
+ (void) nvpair_value_int64_array(elem, &value, &nelem);
+ sbuf_printf(sb, " %s=", nvpair_name(elem));
+ for (ii = 0; ii < nelem; ii++)
+ sbuf_printf(sb, "%016lld",
+ (long long)value[ii]);
+ break;
+ }
+ case DATA_TYPE_UINT64_ARRAY:
+ {
+ uint64_t *value;
+ uint_t ii, nelem;
+
+ (void) nvpair_value_uint64_array(elem, &value, &nelem);
+ sbuf_printf(sb, " %s=", nvpair_name(elem));
+ for (ii = 0; ii < nelem; ii++)
+ sbuf_printf(sb, "%016jx", (uintmax_t)value[ii]);
+ break;
+ }
+ case DATA_TYPE_STRING_ARRAY:
+ {
+ char **strarr;
+ uint_t ii, nelem;
+
+ (void) nvpair_value_string_array(elem, &strarr, &nelem);
+
+ for (ii = 0; ii < nelem; ii++) {
+ if (strarr[ii] == NULL) {
+ sbuf_printf(sb, " <NULL>");
+ continue;
+ }
+
+ sbuf_printf(sb, " %s", strarr[ii]);
+ if (strcmp(FM_CLASS, strarr[ii]) == 0)
+ type = strarr[ii];
+ }
+ break;
+ }
+ case DATA_TYPE_NVLIST:
+ /* XXX - requires recursing in log_sysevent */
+ break;
+ default:
+ printf("%s: type %d is not implemented\n", __func__,
+ nvpair_type(elem));
+ break;
+ }
+ }
+
+ if (sbuf_finish(sb) != 0) {
+ sbuf_delete(sb);
+ return (ENOMEM);
+ }
+
+ if (type == NULL)
+ type = "";
+ if (strncmp(type, "ESC_ZFS_", 8) == 0) {
+ snprintf(typestr, sizeof (typestr), "misc.fs.zfs.%s", type + 8);
+ type = typestr;
+ }
+ devctl_notify("ZFS", "ZFS", type, sbuf_data(sb));
+ sbuf_delete(sb);
+
+ return (0);
+}
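+
+/*
+ * The notification above is delivered through devctl with both the system
+ * and subsystem set to "ZFS"; devd(8) rules can match on those fields and
+ * on the translated type (e.g. misc.fs.zfs.*). The exact wire format is
+ * devctl's concern and is not assumed here.
+ */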
+
+static void
+sysevent_worker(void *arg __unused)
+{
+ zfs_zevent_t *ze;
+ nvlist_t *event;
+ uint64_t dropped = 0;
+ uint64_t dst_size;
+ int error;
+
+ zfs_zevent_init(&ze);
+ for (;;) {
+ dst_size = 131072;
+ dropped = 0;
+ event = NULL;
+ error = zfs_zevent_next(ze, &event,
+ &dst_size, &dropped);
+ if (error) {
+ error = zfs_zevent_wait(ze);
+ if (error == ESHUTDOWN)
+ break;
+ } else {
+ VERIFY(event != NULL);
+ log_sysevent(event);
+ nvlist_free(event);
+ }
+ }
+ zfs_zevent_destroy(ze);
+ kthread_exit();
+}
+
+void
+ddi_sysevent_init(void)
+{
+ kproc_kthread_add(sysevent_worker, NULL, &system_proc, NULL, 0, 0,
+ "zfskern", "sysevent");
+}
diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/spl_taskq.c b/sys/contrib/openzfs/module/os/freebsd/spl/spl_taskq.c
new file mode 100644
index 000000000000..8ad6de9b5e9f
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/spl/spl_taskq.c
@@ -0,0 +1,444 @@
+/*
+ * Copyright (c) 2009 Pawel Jakub Dawidek <pjd@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Copyright (c) 2012 Spectra Logic Corporation. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/ck.h>
+#include <sys/epoch.h>
+#include <sys/kernel.h>
+#include <sys/kmem.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/queue.h>
+#include <sys/taskq.h>
+#include <sys/taskqueue.h>
+#include <sys/zfs_context.h>
+
+#if defined(__i386__) || defined(__amd64__) || defined(__aarch64__)
+#include <machine/pcb.h>
+#endif
+
+#include <vm/uma.h>
+
+#if __FreeBSD_version < 1201522
+#define taskqueue_start_threads_in_proc(tqp, count, pri, proc, name, ...) \
+ taskqueue_start_threads(tqp, count, pri, name, __VA_ARGS__)
+#endif
+
+static uint_t taskq_tsd;
+static uma_zone_t taskq_zone;
+
+taskq_t *system_taskq = NULL;
+taskq_t *system_delay_taskq = NULL;
+taskq_t *dynamic_taskq = NULL;
+
+proc_t *system_proc;
+
+extern int uma_align_cache;
+
+static MALLOC_DEFINE(M_TASKQ, "taskq", "taskq structures");
+
+static CK_LIST_HEAD(tqenthashhead, taskq_ent) *tqenthashtbl;
+static unsigned long tqenthash;
+static unsigned long tqenthashlock;
+static struct sx *tqenthashtbl_lock;
+
+static taskqid_t tqidnext;
+
+#define TQIDHASH(tqid) (&tqenthashtbl[(tqid) & tqenthash])
+#define TQIDHASHLOCK(tqid) (&tqenthashtbl_lock[((tqid) & tqenthashlock)])
+
+#define TIMEOUT_TASK 1
+#define NORMAL_TASK 2
+
+static void
+system_taskq_init(void *arg)
+{
+ int i;
+
+ tsd_create(&taskq_tsd, NULL);
+ tqenthashtbl = hashinit(mp_ncpus * 8, M_TASKQ, &tqenthash);
+ tqenthashlock = (tqenthash + 1) / 8;
+ if (tqenthashlock > 0)
+ tqenthashlock--;
+ tqenthashtbl_lock =
+ malloc(sizeof (*tqenthashtbl_lock) * (tqenthashlock + 1),
+ M_TASKQ, M_WAITOK | M_ZERO);
+ for (i = 0; i < tqenthashlock + 1; i++)
+ sx_init_flags(&tqenthashtbl_lock[i], "tqenthash", SX_DUPOK);
+ taskq_zone = uma_zcreate("taskq_zone", sizeof (taskq_ent_t),
+ NULL, NULL, NULL, NULL,
+ UMA_ALIGN_CACHE, 0);
+ system_taskq = taskq_create("system_taskq", mp_ncpus, minclsyspri,
+ 0, 0, 0);
+ system_delay_taskq = taskq_create("system_delay_taskq", mp_ncpus,
+ minclsyspri, 0, 0, 0);
+}
+SYSINIT(system_taskq_init, SI_SUB_CONFIGURE, SI_ORDER_ANY, system_taskq_init,
+ NULL);
+
+static void
+system_taskq_fini(void *arg)
+{
+ int i;
+
+ taskq_destroy(system_delay_taskq);
+ taskq_destroy(system_taskq);
+ uma_zdestroy(taskq_zone);
+ tsd_destroy(&taskq_tsd);
+ for (i = 0; i < tqenthashlock + 1; i++)
+ sx_destroy(&tqenthashtbl_lock[i]);
+ for (i = 0; i < tqenthash + 1; i++)
+ VERIFY(CK_LIST_EMPTY(&tqenthashtbl[i]));
+ free(tqenthashtbl_lock, M_TASKQ);
+ free(tqenthashtbl, M_TASKQ);
+}
+SYSUNINIT(system_taskq_fini, SI_SUB_CONFIGURE, SI_ORDER_ANY, system_taskq_fini,
+ NULL);
+
+#ifdef __LP64__
+static taskqid_t
+__taskq_genid(void)
+{
+ taskqid_t tqid;
+
+ /*
+ * Assume a 64-bit counter will not wrap in practice.
+ */
+ tqid = atomic_add_64_nv(&tqidnext, 1);
+ VERIFY(tqid);
+ return (tqid);
+}
+#else
+static taskqid_t
+__taskq_genid(void)
+{
+ taskqid_t tqid;
+
+ for (;;) {
+ tqid = atomic_add_32_nv(&tqidnext, 1);
+ if (__predict_true(tqid != 0))
+ break;
+ }
+ VERIFY(tqid);
+ return (tqid);
+}
+#endif
+
+static taskq_ent_t *
+taskq_lookup(taskqid_t tqid)
+{
+ taskq_ent_t *ent = NULL;
+
+ sx_xlock(TQIDHASHLOCK(tqid));
+ CK_LIST_FOREACH(ent, TQIDHASH(tqid), tqent_hash) {
+ if (ent->tqent_id == tqid)
+ break;
+ }
+ if (ent != NULL)
+ refcount_acquire(&ent->tqent_rc);
+ sx_xunlock(TQIDHASHLOCK(tqid));
+ return (ent);
+}
+
+static taskqid_t
+taskq_insert(taskq_ent_t *ent)
+{
+ taskqid_t tqid;
+
+ tqid = __taskq_genid();
+ ent->tqent_id = tqid;
+ ent->tqent_registered = B_TRUE;
+ sx_xlock(TQIDHASHLOCK(tqid));
+ CK_LIST_INSERT_HEAD(TQIDHASH(tqid), ent, tqent_hash);
+ sx_xunlock(TQIDHASHLOCK(tqid));
+ return (tqid);
+}
+
+static void
+taskq_remove(taskq_ent_t *ent)
+{
+ taskqid_t tqid = ent->tqent_id;
+
+ if (!ent->tqent_registered)
+ return;
+
+ sx_xlock(TQIDHASHLOCK(tqid));
+ CK_LIST_REMOVE(ent, tqent_hash);
+ sx_xunlock(TQIDHASHLOCK(tqid));
+ ent->tqent_registered = B_FALSE;
+}
+
+static void
+taskq_tsd_set(void *context)
+{
+ taskq_t *tq = context;
+
+#if defined(__amd64__) || defined(__aarch64__)
+ if (context != NULL && tsd_get(taskq_tsd) == NULL)
+ fpu_kern_thread(FPU_KERN_NORMAL);
+#endif
+ tsd_set(taskq_tsd, tq);
+}
+
+static taskq_t *
+taskq_create_impl(const char *name, int nthreads, pri_t pri,
+ proc_t *proc __maybe_unused, uint_t flags)
+{
+ taskq_t *tq;
+
+ if ((flags & TASKQ_THREADS_CPU_PCT) != 0)
+ nthreads = MAX((mp_ncpus * nthreads) / 100, 1);
+
+ tq = kmem_alloc(sizeof (*tq), KM_SLEEP);
+ tq->tq_queue = taskqueue_create(name, M_WAITOK,
+ taskqueue_thread_enqueue, &tq->tq_queue);
+ taskqueue_set_callback(tq->tq_queue, TASKQUEUE_CALLBACK_TYPE_INIT,
+ taskq_tsd_set, tq);
+ taskqueue_set_callback(tq->tq_queue, TASKQUEUE_CALLBACK_TYPE_SHUTDOWN,
+ taskq_tsd_set, NULL);
+ (void) taskqueue_start_threads_in_proc(&tq->tq_queue, nthreads, pri,
+ proc, "%s", name);
+
+ return ((taskq_t *)tq);
+}
+
+taskq_t *
+taskq_create(const char *name, int nthreads, pri_t pri, int minalloc __unused,
+ int maxalloc __unused, uint_t flags)
+{
+ return (taskq_create_impl(name, nthreads, pri, system_proc, flags));
+}
+
+taskq_t *
+taskq_create_proc(const char *name, int nthreads, pri_t pri,
+ int minalloc __unused, int maxalloc __unused, proc_t *proc, uint_t flags)
+{
+ return (taskq_create_impl(name, nthreads, pri, proc, flags));
+}
+
+void
+taskq_destroy(taskq_t *tq)
+{
+
+ taskqueue_free(tq->tq_queue);
+ kmem_free(tq, sizeof (*tq));
+}
+
+int
+taskq_member(taskq_t *tq, kthread_t *thread)
+{
+
+ return (taskqueue_member(tq->tq_queue, thread));
+}
+
+taskq_t *
+taskq_of_curthread(void)
+{
+ return (tsd_get(taskq_tsd));
+}
+
+static void
+taskq_free(taskq_ent_t *task)
+{
+ taskq_remove(task);
+ if (refcount_release(&task->tqent_rc))
+ uma_zfree(taskq_zone, task);
+}
+
+int
+taskq_cancel_id(taskq_t *tq, taskqid_t tid)
+{
+ uint32_t pend;
+ int rc;
+ taskq_ent_t *ent;
+
+ if (tid == 0)
+ return (0);
+
+ if ((ent = taskq_lookup(tid)) == NULL)
+ return (0);
+
+ ent->tqent_cancelled = B_TRUE;
+ if (ent->tqent_type == TIMEOUT_TASK) {
+ rc = taskqueue_cancel_timeout(tq->tq_queue,
+ &ent->tqent_timeout_task, &pend);
+ } else
+ rc = taskqueue_cancel(tq->tq_queue, &ent->tqent_task, &pend);
+ if (rc == EBUSY) {
+ taskqueue_drain(tq->tq_queue, &ent->tqent_task);
+ } else if (pend) {
+ /*
+ * Tasks normally free themselves when run, but here the task
+ * was cancelled so it did not free itself.
+ */
+ taskq_free(ent);
+ }
+ /* Free the extra reference we added with taskq_lookup. */
+ taskq_free(ent);
+ return (rc);
+}
+
+static void
+taskq_run(void *arg, int pending __unused)
+{
+ taskq_ent_t *task = arg;
+
+ if (!task->tqent_cancelled)
+ task->tqent_func(task->tqent_arg);
+ taskq_free(task);
+}
+
+taskqid_t
+taskq_dispatch_delay(taskq_t *tq, task_func_t func, void *arg,
+ uint_t flags, clock_t expire_time)
+{
+ taskq_ent_t *task;
+ taskqid_t tqid;
+ clock_t timo;
+ int mflag;
+
+ timo = expire_time - ddi_get_lbolt();
+ if (timo <= 0)
+ return (taskq_dispatch(tq, func, arg, flags));
+
+ if ((flags & (TQ_SLEEP | TQ_NOQUEUE)) == TQ_SLEEP)
+ mflag = M_WAITOK;
+ else
+ mflag = M_NOWAIT;
+
+ task = uma_zalloc(taskq_zone, mflag);
+ if (task == NULL)
+ return (0);
+ task->tqent_func = func;
+ task->tqent_arg = arg;
+ task->tqent_type = TIMEOUT_TASK;
+ task->tqent_cancelled = B_FALSE;
+ refcount_init(&task->tqent_rc, 1);
+ tqid = taskq_insert(task);
+ TIMEOUT_TASK_INIT(tq->tq_queue, &task->tqent_timeout_task, 0,
+ taskq_run, task);
+
+ taskqueue_enqueue_timeout(tq->tq_queue, &task->tqent_timeout_task,
+ timo);
+ return (tqid);
+}
+
+taskqid_t
+taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t flags)
+{
+ taskq_ent_t *task;
+ int mflag, prio;
+ taskqid_t tqid;
+
+ if ((flags & (TQ_SLEEP | TQ_NOQUEUE)) == TQ_SLEEP)
+ mflag = M_WAITOK;
+ else
+ mflag = M_NOWAIT;
+ /*
+ * If TQ_FRONT is given, we want higher priority for this task, so it
+ * can go at the front of the queue.
+ */
+ prio = !!(flags & TQ_FRONT);
+
+ task = uma_zalloc(taskq_zone, mflag);
+ if (task == NULL)
+ return (0);
+ refcount_init(&task->tqent_rc, 1);
+ task->tqent_func = func;
+ task->tqent_arg = arg;
+ task->tqent_cancelled = B_FALSE;
+ task->tqent_type = NORMAL_TASK;
+ tqid = taskq_insert(task);
+ TASK_INIT(&task->tqent_task, prio, taskq_run, task);
+ taskqueue_enqueue(tq->tq_queue, &task->tqent_task);
+ return (tqid);
+}
+
+static void
+taskq_run_ent(void *arg, int pending __unused)
+{
+ taskq_ent_t *task = arg;
+
+ task->tqent_func(task->tqent_arg);
+}
+
+void
+taskq_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint32_t flags,
+ taskq_ent_t *task)
+{
+ int prio;
+
+ /*
+ * If TQ_FRONT is given, we want higher priority for this task, so it
+ * can go at the front of the queue.
+ */
+ prio = !!(flags & TQ_FRONT);
+ task->tqent_cancelled = B_FALSE;
+ task->tqent_registered = B_FALSE;
+ task->tqent_id = 0;
+ task->tqent_func = func;
+ task->tqent_arg = arg;
+
+ TASK_INIT(&task->tqent_task, prio, taskq_run_ent, task);
+ taskqueue_enqueue(tq->tq_queue, &task->tqent_task);
+}
+
+void
+taskq_wait(taskq_t *tq)
+{
+ taskqueue_quiesce(tq->tq_queue);
+}
+
+void
+taskq_wait_id(taskq_t *tq, taskqid_t tid)
+{
+ taskq_ent_t *ent;
+
+ if (tid == 0)
+ return;
+ if ((ent = taskq_lookup(tid)) == NULL)
+ return;
+
+ taskqueue_drain(tq->tq_queue, &ent->tqent_task);
+ taskq_free(ent);
+}
+
+void
+taskq_wait_outstanding(taskq_t *tq, taskqid_t id __unused)
+{
+ taskqueue_drain_all(tq->tq_queue);
+}
+
+int
+taskq_empty_ent(taskq_ent_t *t)
+{
+ return (t->tqent_task.ta_pending == 0);
+}
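+
+/*
+ * Usage sketch for the wrappers above (illustrative only; my_func and
+ * my_arg are hypothetical):
+ *
+ *     taskq_t *tq = taskq_create("example_taskq", 4, minclsyspri, 0, 0, 0);
+ *     taskqid_t id;
+ *
+ *     id = taskq_dispatch(tq, my_func, my_arg, TQ_SLEEP);
+ *     (a TQ_NOSLEEP dispatch instead returns 0 if the entry could not
+ *     be allocated)
+ *     (void) taskq_dispatch_delay(tq, my_func, my_arg, TQ_SLEEP,
+ *         ddi_get_lbolt() + 5 * hz);    runs roughly five seconds later
+ *     taskq_wait(tq);                   quiesce everything queued so far
+ *     taskq_destroy(tq);
+ */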
diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/spl_uio.c b/sys/contrib/openzfs/module/os/freebsd/spl/spl_uio.c
new file mode 100644
index 000000000000..f5f3524f7b9d
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/spl/spl_uio.c
@@ -0,0 +1,100 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
+
+/*
+ * University Copyright- Copyright (c) 1982, 1986, 1988
+ * The Regents of the University of California
+ * All Rights Reserved
+ *
+ * University Acknowledgment- Portions of this document are derived from
+ * software developed by the University of California, Berkeley, and its
+ * contributors.
+ */
+
+/*
+ * $FreeBSD$
+ */
+
+#include <sys/param.h>
+#include <sys/uio.h>
+#include <sys/vnode.h>
+#include <sys/zfs_znode.h>
+
+/*
+ * Same as zfs_uiomove(), but doesn't modify the uio structure.
+ * Returns in cbytes how many bytes were copied.
+ */
+int
+zfs_uiocopy(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio, size_t *cbytes)
+{
+ struct iovec small_iovec[1];
+ struct uio small_uio_clone;
+ struct uio *uio_clone;
+ int error;
+
+ ASSERT3U(zfs_uio_rw(uio), ==, rw);
+ if (zfs_uio_iovcnt(uio) == 1) {
+ small_uio_clone = *(GET_UIO_STRUCT(uio));
+ small_iovec[0] = *(GET_UIO_STRUCT(uio)->uio_iov);
+ small_uio_clone.uio_iov = small_iovec;
+ uio_clone = &small_uio_clone;
+ } else {
+ uio_clone = cloneuio(GET_UIO_STRUCT(uio));
+ }
+
+ error = vn_io_fault_uiomove(p, n, uio_clone);
+ *cbytes = zfs_uio_resid(uio) - uio_clone->uio_resid;
+ if (uio_clone != &small_uio_clone)
+ free(uio_clone, M_IOV);
+ return (error);
+}
+
+/*
+ * Drop the next n chars out of *uiop.
+ */
+void
+zfs_uioskip(zfs_uio_t *uio, size_t n)
+{
+ zfs_uio_seg_t segflg;
+
+ /* For full compatibility with illumos. */
+ if (n > zfs_uio_resid(uio))
+ return;
+
+ segflg = zfs_uio_segflg(uio);
+ zfs_uio_segflg(uio) = UIO_NOCOPY;
+ zfs_uiomove(NULL, n, zfs_uio_rw(uio), uio);
+ zfs_uio_segflg(uio) = segflg;
+}
+
+int
+zfs_uio_fault_move(void *p, size_t n, zfs_uio_rw_t dir, zfs_uio_t *uio)
+{
+ ASSERT(zfs_uio_rw(uio) == dir);
+ return (vn_io_fault_uiomove(p, n, GET_UIO_STRUCT(uio)));
+}
diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/spl_vfs.c b/sys/contrib/openzfs/module/os/freebsd/spl/spl_vfs.c
new file mode 100644
index 000000000000..09c8401267df
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/spl/spl_vfs.c
@@ -0,0 +1,287 @@
+/*
+ * Copyright (c) 2006-2007 Pawel Jakub Dawidek <pjd@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/mount.h>
+#include <sys/cred.h>
+#include <sys/vfs.h>
+#include <sys/priv.h>
+#include <sys/libkern.h>
+
+#include <sys/mutex.h>
+#include <sys/vnode.h>
+#include <sys/taskq.h>
+
+#include <sys/ccompat.h>
+
+MALLOC_DECLARE(M_MOUNT);
+
+void
+vfs_setmntopt(vfs_t *vfsp, const char *name, const char *arg,
+ int flags __unused)
+{
+ struct vfsopt *opt;
+ size_t namesize;
+ int locked;
+
+ if (!(locked = mtx_owned(MNT_MTX(vfsp))))
+ MNT_ILOCK(vfsp);
+
+ if (vfsp->mnt_opt == NULL) {
+ void *opts;
+
+ MNT_IUNLOCK(vfsp);
+ opts = malloc(sizeof (*vfsp->mnt_opt), M_MOUNT, M_WAITOK);
+ MNT_ILOCK(vfsp);
+ if (vfsp->mnt_opt == NULL) {
+ vfsp->mnt_opt = opts;
+ TAILQ_INIT(vfsp->mnt_opt);
+ } else {
+ free(opts, M_MOUNT);
+ }
+ }
+
+ MNT_IUNLOCK(vfsp);
+
+ opt = malloc(sizeof (*opt), M_MOUNT, M_WAITOK);
+ namesize = strlen(name) + 1;
+ opt->name = malloc(namesize, M_MOUNT, M_WAITOK);
+ strlcpy(opt->name, name, namesize);
+ opt->pos = -1;
+ opt->seen = 1;
+ if (arg == NULL) {
+ opt->value = NULL;
+ opt->len = 0;
+ } else {
+ opt->len = strlen(arg) + 1;
+ opt->value = malloc(opt->len, M_MOUNT, M_WAITOK);
+ bcopy(arg, opt->value, opt->len);
+ }
+
+ MNT_ILOCK(vfsp);
+ TAILQ_INSERT_TAIL(vfsp->mnt_opt, opt, link);
+ if (!locked)
+ MNT_IUNLOCK(vfsp);
+}
+
+void
+vfs_clearmntopt(vfs_t *vfsp, const char *name)
+{
+ int locked;
+
+ if (!(locked = mtx_owned(MNT_MTX(vfsp))))
+ MNT_ILOCK(vfsp);
+ vfs_deleteopt(vfsp->mnt_opt, name);
+ if (!locked)
+ MNT_IUNLOCK(vfsp);
+}
+
+int
+vfs_optionisset(const vfs_t *vfsp, const char *opt, char **argp)
+{
+ struct vfsoptlist *opts = vfsp->mnt_optnew;
+ int error;
+
+ if (opts == NULL)
+ return (0);
+ error = vfs_getopt(opts, opt, (void **)argp, NULL);
+ return (error != 0 ? 0 : 1);
+}
+
+int
+mount_snapshot(kthread_t *td, vnode_t **vpp, const char *fstype, char *fspath,
+ char *fspec, int fsflags)
+{
+ struct vfsconf *vfsp;
+ struct mount *mp;
+ vnode_t *vp, *mvp;
+ struct ucred *cr;
+ int error;
+
+ ASSERT_VOP_ELOCKED(*vpp, "mount_snapshot");
+
+ vp = *vpp;
+ *vpp = NULL;
+ error = 0;
+
+ /*
+ * Be ultra-paranoid about making sure the type and fspath
+ * variables will fit in our mp buffers, including the
+ * terminating NUL.
+ */
+ if (strlen(fstype) >= MFSNAMELEN || strlen(fspath) >= MNAMELEN)
+ error = ENAMETOOLONG;
+ if (error == 0 && (vfsp = vfs_byname_kld(fstype, td, &error)) == NULL)
+ error = ENODEV;
+ if (error == 0 && vp->v_type != VDIR)
+ error = ENOTDIR;
+ /*
+ * We need vnode lock to protect v_mountedhere and vnode interlock
+ * to protect v_iflag.
+ */
+ if (error == 0) {
+ VI_LOCK(vp);
+ if ((vp->v_iflag & VI_MOUNT) == 0 && vp->v_mountedhere == NULL)
+ vp->v_iflag |= VI_MOUNT;
+ else
+ error = EBUSY;
+ VI_UNLOCK(vp);
+ }
+ if (error != 0) {
+ vput(vp);
+ return (error);
+ }
+ vn_seqc_write_begin(vp);
+ VOP_UNLOCK1(vp);
+
+ /*
+ * Allocate and initialize the filesystem.
+ * We don't want the regular user that triggered the snapshot mount to be
+ * able to unmount it, so pass the credentials of the parent mount.
+ */
+ mp = vfs_mount_alloc(vp, vfsp, fspath, vp->v_mount->mnt_cred);
+
+ mp->mnt_optnew = NULL;
+ vfs_setmntopt(mp, "from", fspec, 0);
+ mp->mnt_optnew = mp->mnt_opt;
+ mp->mnt_opt = NULL;
+
+ /*
+ * Set the mount level flags.
+ */
+ mp->mnt_flag = fsflags & MNT_UPDATEMASK;
+ /*
+ * Snapshots are always read-only.
+ */
+ mp->mnt_flag |= MNT_RDONLY;
+ /*
+ * We don't want snapshots to allow access to vulnerable setuid
+ * programs, so we turn off setuid when mounting snapshots.
+ */
+ mp->mnt_flag |= MNT_NOSUID;
+ /*
+ * We don't want snapshots to be visible in regular
+ * mount(8) and df(1) output.
+ */
+ mp->mnt_flag |= MNT_IGNORE;
+ /*
+ * XXX: This is evil, but we can't mount a snapshot as a regular user.
+ * XXX: Is it safe when the snapshot is mounted from within a jail?
+ */
+ cr = td->td_ucred;
+ td->td_ucred = kcred;
+ error = VFS_MOUNT(mp);
+ td->td_ucred = cr;
+
+ if (error != 0) {
+ /*
+ * Clear VI_MOUNT and decrement the use count "atomically",
+ * under the vnode lock. This is not strictly required,
+ * but makes it easier to reason about the life-cycle and
+ * ownership of the covered vnode.
+ */
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+ VI_LOCK(vp);
+ vp->v_iflag &= ~VI_MOUNT;
+ VI_UNLOCK(vp);
+ vn_seqc_write_end(vp);
+ vput(vp);
+ vfs_unbusy(mp);
+ vfs_freeopts(mp->mnt_optnew);
+ mp->mnt_vnodecovered = NULL;
+ vfs_mount_destroy(mp);
+ return (error);
+ }
+
+ if (mp->mnt_opt != NULL)
+ vfs_freeopts(mp->mnt_opt);
+ mp->mnt_opt = mp->mnt_optnew;
+ (void) VFS_STATFS(mp, &mp->mnt_stat);
+
+ /*
+ * Prevent external consumers of mount options from reading
+ * mnt_optnew.
+ */
+ mp->mnt_optnew = NULL;
+
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+#ifdef FREEBSD_NAMECACHE
+ cache_purge(vp);
+#endif
+ VI_LOCK(vp);
+ vp->v_iflag &= ~VI_MOUNT;
+#ifdef VIRF_MOUNTPOINT
+ vn_irflag_set_locked(vp, VIRF_MOUNTPOINT);
+#endif
+ vp->v_mountedhere = mp;
+ VI_UNLOCK(vp);
+ /* Put the new filesystem on the mount list. */
+ mtx_lock(&mountlist_mtx);
+ TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list);
+ mtx_unlock(&mountlist_mtx);
+ vfs_event_signal(NULL, VQ_MOUNT, 0);
+ if (VFS_ROOT(mp, LK_EXCLUSIVE, &mvp))
+ panic("mount: lost mount");
+ vn_seqc_write_end(vp);
+ VOP_UNLOCK1(vp);
+#if __FreeBSD_version >= 1300048
+ vfs_op_exit(mp);
+#endif
+ vfs_unbusy(mp);
+ *vpp = mvp;
+ return (0);
+}
+
+/*
+ * Like vn_rele() except if we are going to call VOP_INACTIVE() then do it
+ * asynchronously using a taskq. This can avoid deadlocks caused by re-entering
+ * the file system as a result of releasing the vnode. Note, file systems
+ * already have to handle the race where the vnode is incremented before the
+ * inactive routine is called and does its locking.
+ *
+ * Warning: Excessive use of this routine can lead to performance problems.
+ * This is because taskqs throttle back allocation if too many are created.
+ */
+void
+vn_rele_async(vnode_t *vp, taskq_t *taskq)
+{
+ VERIFY(vp->v_count > 0);
+ if (refcount_release_if_not_last(&vp->v_usecount)) {
+#if __FreeBSD_version < 1300045
+ vdrop(vp);
+#endif
+ return;
+ }
+ VERIFY(taskq_dispatch((taskq_t *)taskq,
+ (task_func_t *)vrele, vp, TQ_SLEEP) != 0);
+}
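+
+/*
+ * For example (illustrative, hypothetical caller): a thread that must not
+ * re-enter the filesystem while holding its own locks can defer the final
+ * release to a taskq, e.g. the system taskq created in spl_taskq.c:
+ *
+ *     vn_rele_async(vp, system_taskq);
+ */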
diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/spl_vm.c b/sys/contrib/openzfs/module/os/freebsd/spl/spl_vm.c
new file mode 100644
index 000000000000..739ddb05e895
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/spl/spl_vm.c
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2013 EMC Corp.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/counter.h>
+
+#include <sys/byteorder.h>
+#include <sys/lock.h>
+#include <sys/freebsd_rwlock.h>
+#include <sys/vm.h>
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/vm_object.h>
+#include <vm/vm_page.h>
+#include <vm/vm_pager.h>
+
+const int zfs_vm_pagerret_bad = VM_PAGER_BAD;
+const int zfs_vm_pagerret_error = VM_PAGER_ERROR;
+const int zfs_vm_pagerret_ok = VM_PAGER_OK;
+const int zfs_vm_pagerput_sync = VM_PAGER_PUT_SYNC;
+const int zfs_vm_pagerput_inval = VM_PAGER_PUT_INVAL;
+
+void
+zfs_vmobject_assert_wlocked(vm_object_t object)
+{
+
+ /*
+ * This is not ideal because the FILE/LINE reported by assertions will
+ * not be very helpful, but it must be a hard (non-inline) function
+ * for compatibility reasons.
+ */
+ VM_OBJECT_ASSERT_WLOCKED(object);
+}
+
+void
+zfs_vmobject_wlock(vm_object_t object)
+{
+
+ VM_OBJECT_WLOCK(object);
+}
+
+void
+zfs_vmobject_wunlock(vm_object_t object)
+{
+
+ VM_OBJECT_WUNLOCK(object);
+}
diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/spl_zlib.c b/sys/contrib/openzfs/module/os/freebsd/spl/spl_zlib.c
new file mode 100644
index 000000000000..3644eba77ca1
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/spl/spl_zlib.c
@@ -0,0 +1,242 @@
+/*
+ * Copyright (c) 2020 iXsystems, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/kmem.h>
+#include <sys/kmem_cache.h>
+#include <sys/zmod.h>
+#if __FreeBSD_version >= 1300041
+#include <contrib/zlib/zlib.h>
+#else
+#include <sys/zlib.h>
+#endif
+#include <sys/kobj.h>
+
+
+/*ARGSUSED*/
+static void *
+zcalloc(void *opaque, uint_t items, uint_t size)
+{
+
+ return (malloc((size_t)items*size, M_SOLARIS, M_NOWAIT));
+}
+
+/*ARGSUSED*/
+static void
+zcfree(void *opaque, void *ptr)
+{
+
+ free(ptr, M_SOLARIS);
+}
+
+static int
+zlib_deflateInit(z_stream *stream, int level)
+{
+
+ stream->zalloc = zcalloc;
+ stream->opaque = NULL;
+ stream->zfree = zcfree;
+
+ return (deflateInit(stream, level));
+}
+
+static int
+zlib_deflate(z_stream *stream, int flush)
+{
+ return (deflate(stream, flush));
+}
+
+static int
+zlib_deflateEnd(z_stream *stream)
+{
+ return (deflateEnd(stream));
+}
+
+static int
+zlib_inflateInit(z_stream *stream)
+{
+ stream->zalloc = zcalloc;
+ stream->opaque = NULL;
+ stream->zfree = zcfree;
+
+ return (inflateInit(stream));
+}
+
+static int
+zlib_inflate(z_stream *stream, int finish)
+{
+#if __FreeBSD_version >= 1300024
+ return (inflate(stream, finish));
+#else
+ return (_zlib104_inflate(stream, finish));
+#endif
+}
+
+
+static int
+zlib_inflateEnd(z_stream *stream)
+{
+ return (inflateEnd(stream));
+}
+
+/*
+ * A kmem_cache is used for the zlib workspaces to avoid having to vmalloc
+ * and vfree for every call. Using a kmem_cache also has the advantage
+ * that it improves the odds that the memory used will be local to this cpu.
+ * To further improve things it might be wise to create a dedicated per-cpu
+ * workspace for use. This would take some additional care because we then
+ * must disable preemption around the critical section, and verify that
+ * zlib_deflate* and zlib_inflate* never internally call schedule().
+ */
+static void *
+zlib_workspace_alloc(int flags)
+{
+ // return (kmem_cache_alloc(zlib_workspace_cache, flags));
+ return (NULL);
+}
+
+static void
+zlib_workspace_free(void *workspace)
+{
+ // kmem_cache_free(zlib_workspace_cache, workspace);
+}
+
+/*
+ * Compresses the source buffer into the destination buffer. The level
+ * parameter has the same meaning as in deflateInit. sourceLen is the byte
+ * length of the source buffer. Upon entry, destLen is the total size of the
+ * destination buffer, which must be at least 0.1% larger than sourceLen plus
+ * 12 bytes. Upon exit, destLen is the actual size of the compressed buffer.
+ *
+ * compress2 returns Z_OK if success, Z_MEM_ERROR if there was not enough
+ * memory, Z_BUF_ERROR if there was not enough room in the output buffer,
+ * Z_STREAM_ERROR if the level parameter is invalid.
+ */
+int
+z_compress_level(void *dest, size_t *destLen, const void *source,
+ size_t sourceLen, int level)
+{
+ z_stream stream;
+ int err;
+
+ bzero(&stream, sizeof (stream));
+ stream.next_in = (Byte *)source;
+ stream.avail_in = (uInt)sourceLen;
+ stream.next_out = dest;
+ stream.avail_out = (uInt)*destLen;
+ stream.opaque = NULL;
+
+ if ((size_t)stream.avail_out != *destLen)
+ return (Z_BUF_ERROR);
+
+ stream.opaque = zlib_workspace_alloc(KM_SLEEP);
+#if 0
+ if (!stream.opaque)
+ return (Z_MEM_ERROR);
+#endif
+ err = zlib_deflateInit(&stream, level);
+ if (err != Z_OK) {
+ zlib_workspace_free(stream.opaque);
+ return (err);
+ }
+
+ err = zlib_deflate(&stream, Z_FINISH);
+ if (err != Z_STREAM_END) {
+ zlib_deflateEnd(&stream);
+ zlib_workspace_free(stream.opaque);
+ return (err == Z_OK ? Z_BUF_ERROR : err);
+ }
+ *destLen = stream.total_out;
+
+ err = zlib_deflateEnd(&stream);
+ zlib_workspace_free(stream.opaque);
+ return (err);
+}
+
+/*
+ * Decompresses the source buffer into the destination buffer. sourceLen is
+ * the byte length of the source buffer. Upon entry, destLen is the total
+ * size of the destination buffer, which must be large enough to hold the
+ * entire uncompressed data. (The size of the uncompressed data must have
+ * been saved previously by the compressor and transmitted to the decompressor
+ * by some mechanism outside the scope of this compression library.)
+ * Upon exit, destLen is the actual size of the uncompressed data.
+ * This function can be used to decompress a whole file at once if the
+ * input file is mmap'ed.
+ *
+ * uncompress returns Z_OK if success, Z_MEM_ERROR if there was not
+ * enough memory, Z_BUF_ERROR if there was not enough room in the output
+ * buffer, or Z_DATA_ERROR if the input data was corrupted.
+ */
+int
+z_uncompress(void *dest, size_t *destLen, const void *source, size_t sourceLen)
+{
+ z_stream stream;
+ int err;
+
+ bzero(&stream, sizeof (stream));
+
+ stream.next_in = (Byte *)source;
+ stream.avail_in = (uInt)sourceLen;
+ stream.next_out = dest;
+ stream.avail_out = (uInt)*destLen;
+
+ if ((size_t)stream.avail_out != *destLen)
+ return (Z_BUF_ERROR);
+
+ stream.opaque = zlib_workspace_alloc(KM_SLEEP);
+#if 0
+ if (!stream.opaque)
+ return (Z_MEM_ERROR);
+#endif
+ err = zlib_inflateInit(&stream);
+ if (err != Z_OK) {
+ zlib_workspace_free(stream.opaque);
+ return (err);
+ }
+
+ err = zlib_inflate(&stream, Z_FINISH);
+ if (err != Z_STREAM_END) {
+ zlib_inflateEnd(&stream);
+ zlib_workspace_free(stream.opaque);
+
+ if (err == Z_NEED_DICT ||
+ (err == Z_BUF_ERROR && stream.avail_in == 0))
+ return (Z_DATA_ERROR);
+
+ return (err);
+ }
+ *destLen = stream.total_out;
+
+ err = zlib_inflateEnd(&stream);
+ zlib_workspace_free(stream.opaque);
+
+ return (err);
+}
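+
+/*
+ * Round-trip sketch (illustrative only; src and srclen are hypothetical):
+ *
+ *     size_t cbuflen = srclen + srclen / 1000 + 12;   sizing rule above
+ *     size_t clen = cbuflen, dlen = srclen;
+ *     void *cbuf = kmem_alloc(cbuflen, KM_SLEEP);
+ *     void *dbuf = kmem_alloc(srclen, KM_SLEEP);
+ *
+ *     VERIFY3S(z_compress_level(cbuf, &clen, src, srclen, Z_BEST_SPEED),
+ *         ==, Z_OK);
+ *     VERIFY3S(z_uncompress(dbuf, &dlen, cbuf, clen), ==, Z_OK);
+ *     VERIFY3U(dlen, ==, srclen);
+ *
+ *     kmem_free(dbuf, srclen);
+ *     kmem_free(cbuf, cbuflen);
+ */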
diff --git a/sys/contrib/openzfs/module/os/freebsd/spl/spl_zone.c b/sys/contrib/openzfs/module/os/freebsd/spl/spl_zone.c
new file mode 100644
index 000000000000..bd3f019b2fa6
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/spl/spl_zone.c
@@ -0,0 +1,260 @@
+/*
+ * Copyright (c) 2007 Pawel Jakub Dawidek <pjd@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/proc.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/sx.h>
+#include <sys/malloc.h>
+#include <sys/queue.h>
+#include <sys/jail.h>
+#include <sys/osd.h>
+#include <sys/priv.h>
+#include <sys/zone.h>
+
+#include <sys/policy.h>
+
+static MALLOC_DEFINE(M_ZONES, "zones_data", "Zones data");
+
+/*
+ * Structure to record list of ZFS datasets exported to a zone.
+ */
+typedef struct zone_dataset {
+ LIST_ENTRY(zone_dataset) zd_next;
+ char zd_dataset[0];
+} zone_dataset_t;
+
+LIST_HEAD(zone_dataset_head, zone_dataset);
+
+static int zone_slot;
+
+int
+zone_dataset_attach(struct ucred *cred, const char *dataset, int jailid)
+{
+ struct zone_dataset_head *head;
+ zone_dataset_t *zd, *zd2;
+ struct prison *pr;
+ int dofree, error;
+
+ if ((error = spl_priv_check_cred(cred, PRIV_ZFS_JAIL)) != 0)
+ return (error);
+
+ /* Allocate memory before we grab prison's mutex. */
+ zd = malloc(sizeof (*zd) + strlen(dataset) + 1, M_ZONES, M_WAITOK);
+
+ sx_slock(&allprison_lock);
+ pr = prison_find(jailid); /* Locks &pr->pr_mtx. */
+ sx_sunlock(&allprison_lock);
+ if (pr == NULL) {
+ free(zd, M_ZONES);
+ return (ENOENT);
+ }
+
+ head = osd_jail_get(pr, zone_slot);
+ if (head != NULL) {
+ dofree = 0;
+ LIST_FOREACH(zd2, head, zd_next) {
+ if (strcmp(dataset, zd2->zd_dataset) == 0) {
+ free(zd, M_ZONES);
+ error = EEXIST;
+ goto end;
+ }
+ }
+ } else {
+ dofree = 1;
+ prison_hold_locked(pr);
+ mtx_unlock(&pr->pr_mtx);
+ head = malloc(sizeof (*head), M_ZONES, M_WAITOK);
+ LIST_INIT(head);
+ mtx_lock(&pr->pr_mtx);
+ error = osd_jail_set(pr, zone_slot, head);
+ KASSERT(error == 0, ("osd_jail_set() failed (error=%d)",
+ error));
+ }
+ strcpy(zd->zd_dataset, dataset);
+ LIST_INSERT_HEAD(head, zd, zd_next);
+end:
+ if (dofree)
+ prison_free_locked(pr);
+ else
+ mtx_unlock(&pr->pr_mtx);
+ return (error);
+}
+
+int
+zone_dataset_detach(struct ucred *cred, const char *dataset, int jailid)
+{
+ struct zone_dataset_head *head;
+ zone_dataset_t *zd;
+ struct prison *pr;
+ int error;
+
+ if ((error = spl_priv_check_cred(cred, PRIV_ZFS_JAIL)) != 0)
+ return (error);
+
+ sx_slock(&allprison_lock);
+ pr = prison_find(jailid);
+ sx_sunlock(&allprison_lock);
+ if (pr == NULL)
+ return (ENOENT);
+ head = osd_jail_get(pr, zone_slot);
+ if (head == NULL) {
+ error = ENOENT;
+ goto end;
+ }
+ LIST_FOREACH(zd, head, zd_next) {
+ if (strcmp(dataset, zd->zd_dataset) == 0)
+ break;
+ }
+ if (zd == NULL)
+ error = ENOENT;
+ else {
+ LIST_REMOVE(zd, zd_next);
+ free(zd, M_ZONES);
+ if (LIST_EMPTY(head))
+ osd_jail_del(pr, zone_slot);
+ error = 0;
+ }
+end:
+ mtx_unlock(&pr->pr_mtx);
+ return (error);
+}
+
+/*
+ * Returns true if the named dataset is visible in the current zone.
+ * The 'write' parameter is set to 1 if the dataset is also writable.
+ */
+int
+zone_dataset_visible(const char *dataset, int *write)
+{
+ struct zone_dataset_head *head;
+ zone_dataset_t *zd;
+ struct prison *pr;
+ size_t len;
+ int ret = 0;
+
+ if (dataset[0] == '\0')
+ return (0);
+ if (INGLOBALZONE(curproc)) {
+ if (write != NULL)
+ *write = 1;
+ return (1);
+ }
+ pr = curthread->td_ucred->cr_prison;
+ mtx_lock(&pr->pr_mtx);
+ head = osd_jail_get(pr, zone_slot);
+ if (head == NULL)
+ goto end;
+
+ /*
+ * Walk the list once, looking for datasets which match exactly, or
+ * specify a dataset underneath an exported dataset. If found, return
+ * true and note that it is writable.
+ */
+ LIST_FOREACH(zd, head, zd_next) {
+ len = strlen(zd->zd_dataset);
+ if (strlen(dataset) >= len &&
+ bcmp(dataset, zd->zd_dataset, len) == 0 &&
+ (dataset[len] == '\0' || dataset[len] == '/' ||
+ dataset[len] == '@')) {
+ if (write)
+ *write = 1;
+ ret = 1;
+ goto end;
+ }
+ }
+
+ /*
+ * Walk the list a second time, searching for datasets which are parents
+ * of exported datasets. These should be visible, but read-only.
+ *
+ * Note that we also have to support forms such as 'pool/dataset/', with
+ * a trailing slash.
+ */
+ LIST_FOREACH(zd, head, zd_next) {
+ len = strlen(dataset);
+ if (dataset[len - 1] == '/')
+ len--; /* Ignore trailing slash */
+ if (len < strlen(zd->zd_dataset) &&
+ bcmp(dataset, zd->zd_dataset, len) == 0 &&
+ zd->zd_dataset[len] == '/') {
+ if (write)
+ *write = 0;
+ ret = 1;
+ goto end;
+ }
+ }
+end:
+ mtx_unlock(&pr->pr_mtx);
+ return (ret);
+}
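+
+/*
+ * Worked example (illustrative): if the dataset "tank/jailed" has been
+ * attached to the current jail, then from within that jail
+ *
+ *     int writable;
+ *
+ *     zone_dataset_visible("tank/jailed/data", &writable)
+ *
+ * returns 1 with writable set to 1 (it lies underneath the exported
+ * dataset), while
+ *
+ *     zone_dataset_visible("tank", &writable)
+ *
+ * returns 1 with writable set to 0 (a parent is visible but read-only),
+ * and an unrelated dataset such as "dozer" returns 0.
+ */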
+
+static void
+zone_destroy(void *arg)
+{
+ struct zone_dataset_head *head;
+ zone_dataset_t *zd;
+
+ head = arg;
+ while ((zd = LIST_FIRST(head)) != NULL) {
+ LIST_REMOVE(zd, zd_next);
+ free(zd, M_ZONES);
+ }
+ free(head, M_ZONES);
+}
+
+uint32_t
+zone_get_hostid(void *ptr)
+{
+
+ KASSERT(ptr == NULL, ("only NULL pointer supported in %s", __func__));
+
+ return ((uint32_t)curthread->td_ucred->cr_prison->pr_hostid);
+}
+
+static void
+zone_sysinit(void *arg __unused)
+{
+
+ zone_slot = osd_jail_register(zone_destroy, NULL);
+}
+
+static void
+zone_sysuninit(void *arg __unused)
+{
+
+ osd_jail_deregister(zone_slot);
+}
+
+SYSINIT(zone_sysinit, SI_SUB_DRIVERS, SI_ORDER_ANY, zone_sysinit, NULL);
+SYSUNINIT(zone_sysuninit, SI_SUB_DRIVERS, SI_ORDER_ANY, zone_sysuninit, NULL);
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/abd_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/abd_os.c
new file mode 100644
index 000000000000..ff4d80ef1dfd
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/abd_os.c
@@ -0,0 +1,487 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2014 by Chunwei Chen. All rights reserved.
+ * Copyright (c) 2016 by Delphix. All rights reserved.
+ */
+
+/*
+ * See abd.c for a general overview of the ARC buffered data (ABD).
+ *
+ * Using a large proportion of scattered ABDs decreases ARC fragmentation:
+ * when we are at the limit of allocatable space, using equal-size chunks
+ * allows us to quickly reclaim enough space for a new large allocation
+ * (assuming it is also scattered).
+ *
+ * ABDs are allocated scattered by default unless the caller uses
+ * abd_alloc_linear() or zfs_abd_scatter_enabled is disabled.
+ */
+
+#include <sys/abd_impl.h>
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/zio.h>
+#include <sys/zfs_context.h>
+#include <sys/zfs_znode.h>
+
+typedef struct abd_stats {
+ kstat_named_t abdstat_struct_size;
+ kstat_named_t abdstat_scatter_cnt;
+ kstat_named_t abdstat_scatter_data_size;
+ kstat_named_t abdstat_scatter_chunk_waste;
+ kstat_named_t abdstat_linear_cnt;
+ kstat_named_t abdstat_linear_data_size;
+} abd_stats_t;
+
+static abd_stats_t abd_stats = {
+ /* Amount of memory occupied by all of the abd_t struct allocations */
+ { "struct_size", KSTAT_DATA_UINT64 },
+ /*
+ * The number of scatter ABDs which are currently allocated, excluding
+ * ABDs which don't own their data (for instance the ones which were
+ * allocated through abd_get_offset()).
+ */
+ { "scatter_cnt", KSTAT_DATA_UINT64 },
+ /* Amount of data stored in all scatter ABDs tracked by scatter_cnt */
+ { "scatter_data_size", KSTAT_DATA_UINT64 },
+ /*
+ * The amount of space wasted at the end of the last chunk across all
+ * scatter ABDs tracked by scatter_cnt.
+ */
+ { "scatter_chunk_waste", KSTAT_DATA_UINT64 },
+ /*
+ * The number of linear ABDs which are currently allocated, excluding
+ * ABDs which don't own their data (for instance the ones which were
+ * allocated through abd_get_offset() and abd_get_from_buf()). If an
+ * ABD takes ownership of its buf then it will become tracked.
+ */
+ { "linear_cnt", KSTAT_DATA_UINT64 },
+ /* Amount of data stored in all linear ABDs tracked by linear_cnt */
+ { "linear_data_size", KSTAT_DATA_UINT64 },
+};
+
+/*
+ * The size of the chunks ABD allocates. Because the sizes allocated from the
+ * kmem_cache can't change, this tunable can only be modified at boot. Changing
+ * it at runtime would cause ABD iteration to work incorrectly for ABDs which
+ * were allocated with the old size, so a safeguard has been put in place which
+ * will cause the machine to panic if you change it and try to access the data
+ * within a scattered ABD.
+ */
+size_t zfs_abd_chunk_size = 4096;
+
+#if defined(_KERNEL)
+SYSCTL_DECL(_vfs_zfs);
+
+SYSCTL_INT(_vfs_zfs, OID_AUTO, abd_scatter_enabled, CTLFLAG_RWTUN,
+ &zfs_abd_scatter_enabled, 0, "Enable scattered ARC data buffers");
+SYSCTL_ULONG(_vfs_zfs, OID_AUTO, abd_chunk_size, CTLFLAG_RDTUN,
+ &zfs_abd_chunk_size, 0, "The size of the chunks ABD allocates");
+#endif
+
+kmem_cache_t *abd_chunk_cache;
+static kstat_t *abd_ksp;
+
+/*
+ * We use a scattered SPA_MAXBLOCKSIZE-sized ABD whose chunks all point to
+ * a single zeroed buffer of zfs_abd_chunk_size bytes. This
+ * allows us to conserve memory by only using a single zero buffer
+ * for the scatter chunks.
+ */
+abd_t *abd_zero_scatter = NULL;
+static char *abd_zero_buf = NULL;
+
+static void
+abd_free_chunk(void *c)
+{
+ kmem_cache_free(abd_chunk_cache, c);
+}
+
+static uint_t
+abd_chunkcnt_for_bytes(size_t size)
+{
+ return (P2ROUNDUP(size, zfs_abd_chunk_size) / zfs_abd_chunk_size);
+}
+
+static inline uint_t
+abd_scatter_chunkcnt(abd_t *abd)
+{
+ ASSERT(!abd_is_linear(abd));
+ return (abd_chunkcnt_for_bytes(
+ ABD_SCATTER(abd).abd_offset + abd->abd_size));
+}
+
+boolean_t
+abd_size_alloc_linear(size_t size)
+{
+ return (size <= zfs_abd_chunk_size ? B_TRUE : B_FALSE);
+}
+
+void
+abd_update_scatter_stats(abd_t *abd, abd_stats_op_t op)
+{
+ uint_t n = abd_scatter_chunkcnt(abd);
+ ASSERT(op == ABDSTAT_INCR || op == ABDSTAT_DECR);
+ int waste = n * zfs_abd_chunk_size - abd->abd_size;
+ if (op == ABDSTAT_INCR) {
+ ABDSTAT_BUMP(abdstat_scatter_cnt);
+ ABDSTAT_INCR(abdstat_scatter_data_size, abd->abd_size);
+ ABDSTAT_INCR(abdstat_scatter_chunk_waste, waste);
+ arc_space_consume(waste, ARC_SPACE_ABD_CHUNK_WASTE);
+ } else {
+ ABDSTAT_BUMPDOWN(abdstat_scatter_cnt);
+ ABDSTAT_INCR(abdstat_scatter_data_size, -(int)abd->abd_size);
+ ABDSTAT_INCR(abdstat_scatter_chunk_waste, -waste);
+ arc_space_return(waste, ARC_SPACE_ABD_CHUNK_WASTE);
+ }
+}
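+
+/*
+ * Worked example (illustrative): with the default zfs_abd_chunk_size of
+ * 4096, a 10240-byte scatter ABD uses abd_chunkcnt_for_bytes(10240) = 3
+ * chunks, so the waste accounted above is 3 * 4096 - 10240 = 2048 bytes
+ * at the end of the last chunk.
+ */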
+
+void
+abd_update_linear_stats(abd_t *abd, abd_stats_op_t op)
+{
+ ASSERT(op == ABDSTAT_INCR || op == ABDSTAT_DECR);
+ if (op == ABDSTAT_INCR) {
+ ABDSTAT_BUMP(abdstat_linear_cnt);
+ ABDSTAT_INCR(abdstat_linear_data_size, abd->abd_size);
+ } else {
+ ABDSTAT_BUMPDOWN(abdstat_linear_cnt);
+ ABDSTAT_INCR(abdstat_linear_data_size, -(int)abd->abd_size);
+ }
+}
+
+void
+abd_verify_scatter(abd_t *abd)
+{
+ uint_t i, n;
+
+ /*
+ * There are no scatter linear pages in FreeBSD, so it is an
+ * error if the ABD has been marked as a linear page.
+ */
+ ASSERT(!abd_is_linear_page(abd));
+ ASSERT3U(ABD_SCATTER(abd).abd_offset, <,
+ zfs_abd_chunk_size);
+ n = abd_scatter_chunkcnt(abd);
+ for (i = 0; i < n; i++) {
+ ASSERT3P(ABD_SCATTER(abd).abd_chunks[i], !=, NULL);
+ }
+}
+
+void
+abd_alloc_chunks(abd_t *abd, size_t size)
+{
+ uint_t i, n;
+
+ n = abd_chunkcnt_for_bytes(size);
+ for (i = 0; i < n; i++) {
+ void *c = kmem_cache_alloc(abd_chunk_cache, KM_PUSHPAGE);
+ ASSERT3P(c, !=, NULL);
+ ABD_SCATTER(abd).abd_chunks[i] = c;
+ }
+ ABD_SCATTER(abd).abd_chunk_size = zfs_abd_chunk_size;
+}
+
+void
+abd_free_chunks(abd_t *abd)
+{
+ uint_t i, n;
+
+ n = abd_scatter_chunkcnt(abd);
+ for (i = 0; i < n; i++) {
+ abd_free_chunk(ABD_SCATTER(abd).abd_chunks[i]);
+ }
+}
+
+abd_t *
+abd_alloc_struct_impl(size_t size)
+{
+ uint_t chunkcnt = abd_chunkcnt_for_bytes(size);
+ /*
+ * In the event we are allocating a gang ABD, the size passed in
+ * will be 0. We must make sure to set abd_size to the size of an
+ * ABD struct as opposed to an ABD scatter with 0 chunks. The gang
+ * ABD struct allocation accounts for an additional 24 bytes over
+ * a scatter ABD with 0 chunks.
+ */
+ size_t abd_size = MAX(sizeof (abd_t),
+ offsetof(abd_t, abd_u.abd_scatter.abd_chunks[chunkcnt]));
+ abd_t *abd = kmem_alloc(abd_size, KM_PUSHPAGE);
+ ASSERT3P(abd, !=, NULL);
+ ABDSTAT_INCR(abdstat_struct_size, abd_size);
+
+ return (abd);
+}
+
+void
+abd_free_struct_impl(abd_t *abd)
+{
+ uint_t chunkcnt = abd_is_linear(abd) || abd_is_gang(abd) ? 0 :
+ abd_scatter_chunkcnt(abd);
+ ssize_t size = MAX(sizeof (abd_t),
+ offsetof(abd_t, abd_u.abd_scatter.abd_chunks[chunkcnt]));
+ kmem_free(abd, size);
+ ABDSTAT_INCR(abdstat_struct_size, -size);
+}
+
+/*
+ * Allocate scatter ABD of size SPA_MAXBLOCKSIZE, where
+ * each chunk in the scatterlist will be set to abd_zero_buf.
+ */
+static void
+abd_alloc_zero_scatter(void)
+{
+ uint_t i, n;
+
+ n = abd_chunkcnt_for_bytes(SPA_MAXBLOCKSIZE);
+ abd_zero_buf = kmem_zalloc(zfs_abd_chunk_size, KM_SLEEP);
+ abd_zero_scatter = abd_alloc_struct(SPA_MAXBLOCKSIZE);
+
+ abd_zero_scatter->abd_flags |= ABD_FLAG_OWNER | ABD_FLAG_ZEROS;
+ abd_zero_scatter->abd_size = SPA_MAXBLOCKSIZE;
+
+ ABD_SCATTER(abd_zero_scatter).abd_offset = 0;
+ ABD_SCATTER(abd_zero_scatter).abd_chunk_size =
+ zfs_abd_chunk_size;
+
+ for (i = 0; i < n; i++) {
+ ABD_SCATTER(abd_zero_scatter).abd_chunks[i] =
+ abd_zero_buf;
+ }
+
+ ABDSTAT_BUMP(abdstat_scatter_cnt);
+ ABDSTAT_INCR(abdstat_scatter_data_size, zfs_abd_chunk_size);
+}
+
+static void
+abd_free_zero_scatter(void)
+{
+ ABDSTAT_BUMPDOWN(abdstat_scatter_cnt);
+ ABDSTAT_INCR(abdstat_scatter_data_size, -(int)zfs_abd_chunk_size);
+
+ abd_free_struct(abd_zero_scatter);
+ abd_zero_scatter = NULL;
+ kmem_free(abd_zero_buf, zfs_abd_chunk_size);
+}
+
+void
+abd_init(void)
+{
+ abd_chunk_cache = kmem_cache_create("abd_chunk", zfs_abd_chunk_size, 0,
+ NULL, NULL, NULL, NULL, 0, KMC_NODEBUG);
+
+ abd_ksp = kstat_create("zfs", 0, "abdstats", "misc", KSTAT_TYPE_NAMED,
+ sizeof (abd_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
+ if (abd_ksp != NULL) {
+ abd_ksp->ks_data = &abd_stats;
+ kstat_install(abd_ksp);
+ }
+
+ abd_alloc_zero_scatter();
+}
+
+void
+abd_fini(void)
+{
+ abd_free_zero_scatter();
+
+ if (abd_ksp != NULL) {
+ kstat_delete(abd_ksp);
+ abd_ksp = NULL;
+ }
+
+ kmem_cache_destroy(abd_chunk_cache);
+ abd_chunk_cache = NULL;
+}
+
+void
+abd_free_linear_page(abd_t *abd)
+{
+ /*
+ * FreeBSD does not have scatter linear pages,
+ * so reaching this is an error.
+ */
+ VERIFY(0);
+}
+
+/*
+ * If we're going to use this ABD for doing I/O using the block layer, the
+ * consumer of the ABD data doesn't care if it's scattered or not, and we don't
+ * plan to store this ABD in memory for a long period of time, we should
+ * allocate the ABD type that requires the least data copying to do the I/O.
+ *
+ * Currently this is linear ABDs, however if ldi_strategy() can ever issue I/Os
+ * using a scatter/gather list we should switch to that and replace this call
+ * with vanilla abd_alloc().
+ */
+abd_t *
+abd_alloc_for_io(size_t size, boolean_t is_metadata)
+{
+ return (abd_alloc_linear(size, is_metadata));
+}
+
+abd_t *
+abd_get_offset_scatter(abd_t *abd, abd_t *sabd, size_t off)
+{
+ abd_verify(sabd);
+ ASSERT3U(off, <=, sabd->abd_size);
+
+ size_t new_offset = ABD_SCATTER(sabd).abd_offset + off;
+ uint_t chunkcnt = abd_scatter_chunkcnt(sabd) -
+ (new_offset / zfs_abd_chunk_size);
+
+ /*
+ * If an abd struct is provided, it is only the minimum size. If we
+ * need additional chunks, we need to allocate a new struct.
+ */
+ if (abd != NULL &&
+ offsetof(abd_t, abd_u.abd_scatter.abd_chunks[chunkcnt]) >
+ sizeof (abd_t)) {
+ abd = NULL;
+ }
+
+ if (abd == NULL)
+ abd = abd_alloc_struct(chunkcnt * zfs_abd_chunk_size);
+
+ /*
+ * Even if this buf is filesystem metadata, we only track that
+ * if we own the underlying data buffer, which is not true in
+ * this case. Therefore, we don't ever use ABD_FLAG_META here.
+ */
+
+ ABD_SCATTER(abd).abd_offset = new_offset % zfs_abd_chunk_size;
+ ABD_SCATTER(abd).abd_chunk_size = zfs_abd_chunk_size;
+
+ /* Copy the scatterlist starting at the correct offset */
+ (void) memcpy(&ABD_SCATTER(abd).abd_chunks,
+ &ABD_SCATTER(sabd).abd_chunks[new_offset /
+ zfs_abd_chunk_size],
+ chunkcnt * sizeof (void *));
+
+ return (abd);
+}
+
+static inline size_t
+abd_iter_scatter_chunk_offset(struct abd_iter *aiter)
+{
+ ASSERT(!abd_is_linear(aiter->iter_abd));
+ return ((ABD_SCATTER(aiter->iter_abd).abd_offset +
+ aiter->iter_pos) % zfs_abd_chunk_size);
+}
+
+static inline size_t
+abd_iter_scatter_chunk_index(struct abd_iter *aiter)
+{
+ ASSERT(!abd_is_linear(aiter->iter_abd));
+ return ((ABD_SCATTER(aiter->iter_abd).abd_offset +
+ aiter->iter_pos) / zfs_abd_chunk_size);
+}
+
+/*
+ * Initialize the abd_iter.
+ */
+void
+abd_iter_init(struct abd_iter *aiter, abd_t *abd)
+{
+ ASSERT(!abd_is_gang(abd));
+ abd_verify(abd);
+ aiter->iter_abd = abd;
+ aiter->iter_pos = 0;
+ aiter->iter_mapaddr = NULL;
+ aiter->iter_mapsize = 0;
+}
+
+/*
+ * This is just a helper function to see if we have exhausted the
+ * abd_iter and reached the end.
+ */
+boolean_t
+abd_iter_at_end(struct abd_iter *aiter)
+{
+ return (aiter->iter_pos == aiter->iter_abd->abd_size);
+}
+
+/*
+ * Advance the iterator by a certain amount. Cannot be called when a chunk is
+ * in use. This can be safely called when the aiter has already been
+ * exhausted, in which case this does nothing.
+ */
+void
+abd_iter_advance(struct abd_iter *aiter, size_t amount)
+{
+ ASSERT3P(aiter->iter_mapaddr, ==, NULL);
+ ASSERT0(aiter->iter_mapsize);
+
+ /* There's nothing left to advance to, so do nothing */
+ if (abd_iter_at_end(aiter))
+ return;
+
+ aiter->iter_pos += amount;
+}
+
+/*
+ * Map the current chunk into aiter. This can be safely called when the aiter
+ * has already been exhausted, in which case this does nothing.
+ */
+void
+abd_iter_map(struct abd_iter *aiter)
+{
+ void *paddr;
+ size_t offset = 0;
+
+ ASSERT3P(aiter->iter_mapaddr, ==, NULL);
+ ASSERT0(aiter->iter_mapsize);
+
+ /* Panic if someone has changed zfs_abd_chunk_size */
+ IMPLY(!abd_is_linear(aiter->iter_abd), zfs_abd_chunk_size ==
+ ABD_SCATTER(aiter->iter_abd).abd_chunk_size);
+
+ /* There's nothing left to iterate over, so do nothing */
+ if (abd_iter_at_end(aiter))
+ return;
+
+ if (abd_is_linear(aiter->iter_abd)) {
+ offset = aiter->iter_pos;
+ aiter->iter_mapsize = aiter->iter_abd->abd_size - offset;
+ paddr = ABD_LINEAR_BUF(aiter->iter_abd);
+ } else {
+ size_t index = abd_iter_scatter_chunk_index(aiter);
+ offset = abd_iter_scatter_chunk_offset(aiter);
+ aiter->iter_mapsize = MIN(zfs_abd_chunk_size - offset,
+ aiter->iter_abd->abd_size - aiter->iter_pos);
+ paddr = ABD_SCATTER(aiter->iter_abd).abd_chunks[index];
+ }
+ aiter->iter_mapaddr = (char *)paddr + offset;
+}
+
+/*
+ * Unmap the current chunk from aiter. This can be safely called when the aiter
+ * has already been exhausted, in which case this does nothing.
+ */
+void
+abd_iter_unmap(struct abd_iter *aiter)
+{
+ /* There's nothing left to unmap, so do nothing */
+ if (abd_iter_at_end(aiter))
+ return;
+
+ ASSERT3P(aiter->iter_mapaddr, !=, NULL);
+ ASSERT3U(aiter->iter_mapsize, >, 0);
+
+ aiter->iter_mapaddr = NULL;
+ aiter->iter_mapsize = 0;
+}
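+
+/*
+ * Iteration sketch for a (non-gang) ABD using the helpers above
+ * (illustrative only; process() is hypothetical):
+ *
+ *     struct abd_iter aiter;
+ *
+ *     abd_iter_init(&aiter, abd);
+ *     while (!abd_iter_at_end(&aiter)) {
+ *         size_t len;
+ *
+ *         abd_iter_map(&aiter);
+ *         len = aiter.iter_mapsize;
+ *         process(aiter.iter_mapaddr, len);
+ *         abd_iter_unmap(&aiter);
+ *         abd_iter_advance(&aiter, len);
+ *     }
+ */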
+
+void
+abd_cache_reap_now(void)
+{
+ kmem_cache_reap_soon(abd_chunk_cache);
+}
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/arc_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/arc_os.c
new file mode 100644
index 000000000000..4fc7468bfa47
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/arc_os.c
@@ -0,0 +1,255 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+#include <sys/spa.h>
+#include <sys/zio.h>
+#include <sys/spa_impl.h>
+#include <sys/counter.h>
+#include <sys/zio_compress.h>
+#include <sys/zio_checksum.h>
+#include <sys/zfs_context.h>
+#include <sys/arc.h>
+#include <sys/zfs_refcount.h>
+#include <sys/vdev.h>
+#include <sys/vdev_trim.h>
+#include <sys/vdev_impl.h>
+#include <sys/dsl_pool.h>
+#include <sys/zio_checksum.h>
+#include <sys/multilist.h>
+#include <sys/abd.h>
+#include <sys/zil.h>
+#include <sys/fm/fs/zfs.h>
+#include <sys/eventhandler.h>
+#include <sys/callb.h>
+#include <sys/kstat.h>
+#include <sys/zthr.h>
+#include <zfs_fletcher.h>
+#include <sys/arc_impl.h>
+#include <sys/sdt.h>
+#include <sys/aggsum.h>
+#include <sys/vnode.h>
+#include <cityhash.h>
+#include <machine/vmparam.h>
+#include <sys/vm.h>
+#include <sys/vmmeter.h>
+
+extern struct vfsops zfs_vfsops;
+
+uint_t zfs_arc_free_target = 0;
+
+static void
+arc_free_target_init(void *unused __unused)
+{
+ zfs_arc_free_target = vm_cnt.v_free_target;
+}
+SYSINIT(arc_free_target_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_ANY,
+ arc_free_target_init, NULL);
+
+/*
+ * We don't have a tunable for arc_free_target due to the dependency on
+ * pagedaemon initialisation.
+ */
+static int
+sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS)
+{
+ uint_t val;
+ int err;
+
+ val = zfs_arc_free_target;
+ err = sysctl_handle_int(oidp, &val, 0, req);
+ if (err != 0 || req->newptr == NULL)
+ return (err);
+
+ if (val < minfree)
+ return (EINVAL);
+ if (val > vm_cnt.v_page_count)
+ return (EINVAL);
+
+ zfs_arc_free_target = val;
+
+ return (0);
+}
+SYSCTL_DECL(_vfs_zfs);
+/* BEGIN CSTYLED */
+SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_free_target,
+ CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof (uint_t),
+ sysctl_vfs_zfs_arc_free_target, "IU",
+ "Desired number of free pages below which ARC triggers reclaim");
+/* END CSTYLED */
+
+int64_t
+arc_available_memory(void)
+{
+ int64_t lowest = INT64_MAX;
+ int64_t n __unused;
+
+ /*
+ * Cooperate with pagedaemon when it's time for it to scan
+ * and reclaim some pages.
+ */
+ n = PAGESIZE * ((int64_t)freemem - zfs_arc_free_target);
+ if (n < lowest) {
+ lowest = n;
+ }
+#if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC)
+ /*
+ * If we're on an i386 platform, it's possible that we'll exhaust the
+ * kernel heap space before we ever run out of available physical
+ * memory. Most checks of the size of the heap_area compare against
+ * tune.t_minarmem, which is the minimum available real memory that we
+ * can have in the system. However, this is generally fixed at 25 pages
+ * which is so low that it's useless. In this comparison, we seek to
+ * calculate the total heap-size, and reclaim if more than 3/4ths of the
+ * heap is allocated. (Or, in the calculation, if less than 1/4th is
+ * free)
+ */
+ n = uma_avail() - (long)(uma_limit() / 4);
+ if (n < lowest) {
+ lowest = n;
+ }
+#endif
+
+ DTRACE_PROBE1(arc__available_memory, int64_t, lowest);
+ return (lowest);
+}
+
+/*
+ * Return a default max arc size based on the amount of physical memory.
+ */
+uint64_t
+arc_default_max(uint64_t min, uint64_t allmem)
+{
+ uint64_t size;
+
+ if (allmem >= 1 << 30)
+ size = allmem - (1 << 30);
+ else
+ size = min;
+ return (MAX(allmem * 5 / 8, size));
+}
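+
+/*
+ * Worked example (illustrative): with allmem = 16 GiB, size is
+ * 16 GiB - 1 GiB = 15 GiB and allmem * 5 / 8 is 10 GiB, so the default
+ * maximum ARC size is 15 GiB (all memory minus 1 GiB); with less than
+ * 1 GiB of memory the result falls back to MAX(allmem * 5 / 8, min).
+ */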
+
+/*
+ * Helper function for arc_prune_async(); it is responsible for safely
+ * handling the execution of a registered arc_prune_func_t.
+ */
+static void
+arc_prune_task(void *arg)
+{
+ int64_t nr_scan = *(int64_t *)arg;
+
+ arc_reduce_target_size(ptob(nr_scan));
+ free(arg, M_TEMP);
+ vnlru_free(nr_scan, &zfs_vfsops);
+}
+
+/*
+ * Notify registered consumers they must drop holds on a portion of the ARC
+ * buffers they reference. This provides a mechanism to ensure the ARC can
+ * honor the arc_meta_limit and reclaim otherwise pinned ARC buffers. This
+ * is analogous to dnlc_reduce_cache() but more generic.
+ *
+ * This operation is performed asynchronously so it may be safely called
+ * in the context of the arc_reclaim_thread(). A reference is taken here
+ * for each registered arc_prune_t and the arc_prune_task() is responsible
+ * for releasing it once the registered arc_prune_func_t has completed.
+ */
+void
+arc_prune_async(int64_t adjust)
+{
+
+ int64_t *adjustptr;
+
+ if ((adjustptr = malloc(sizeof (int64_t), M_TEMP, M_NOWAIT)) == NULL)
+ return;
+
+ *adjustptr = adjust;
+ taskq_dispatch(arc_prune_taskq, arc_prune_task, adjustptr, TQ_SLEEP);
+ ARCSTAT_BUMP(arcstat_prune);
+}
+
+uint64_t
+arc_all_memory(void)
+{
+ return (ptob(physmem));
+}
+
+int
+arc_memory_throttle(spa_t *spa, uint64_t reserve, uint64_t txg)
+{
+ return (0);
+}
+
+uint64_t
+arc_free_memory(void)
+{
+ return (ptob(freemem));
+}
+
+static eventhandler_tag arc_event_lowmem = NULL;
+
+static void
+arc_lowmem(void *arg __unused, int howto __unused)
+{
+ int64_t free_memory, to_free;
+
+ arc_no_grow = B_TRUE;
+ arc_warm = B_TRUE;
+ arc_growtime = gethrtime() + SEC2NSEC(arc_grow_retry);
+ free_memory = arc_available_memory();
+ to_free = (arc_c >> arc_shrink_shift) - MIN(free_memory, 0);
+ DTRACE_PROBE2(arc__needfree, int64_t, free_memory, int64_t, to_free);
+ arc_reduce_target_size(to_free);
+
+ /*
+ * It is unsafe to block here in arbitrary threads, because we can come
+ * here from ARC itself and may hold ARC locks and thus risk a deadlock
+ * with ARC reclaim thread.
+ */
+ if (curproc == pageproc)
+ arc_wait_for_eviction(to_free);
+ else
+ arc_wait_for_eviction(0);
+}
+
+void
+arc_lowmem_init(void)
+{
+ arc_event_lowmem = EVENTHANDLER_REGISTER(vm_lowmem, arc_lowmem, NULL,
+ EVENTHANDLER_PRI_FIRST);
+
+}
+
+void
+arc_lowmem_fini(void)
+{
+ if (arc_event_lowmem != NULL)
+ EVENTHANDLER_DEREGISTER(vm_lowmem, arc_event_lowmem);
+}
+
+void
+arc_register_hotplug(void)
+{
+}
+
+void
+arc_unregister_hotplug(void)
+{
+}
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/crypto_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/crypto_os.c
new file mode 100644
index 000000000000..fbf998416234
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/crypto_os.c
@@ -0,0 +1,611 @@
+/*
+ * Copyright (c) 2005-2010 Pawel Jakub Dawidek <pjd@FreeBSD.org>
+ * Copyright (c) 2018 Sean Eric Fagan <sef@ixsystems.com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Portions of this file are derived from sys/geom/eli/g_eli_hmac.c
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/errno.h>
+
+#ifdef _KERNEL
+#include <sys/libkern.h>
+#include <sys/malloc.h>
+#include <sys/sysctl.h>
+#include <opencrypto/cryptodev.h>
+#include <opencrypto/xform.h>
+#else
+#include <strings.h>
+#endif
+
+#include <sys/zio_crypt.h>
+#include <sys/fs/zfs.h>
+#include <sys/zio.h>
+
+#include <sys/freebsd_crypto.h>
+
+#define SHA512_HMAC_BLOCK_SIZE 128
+
+static int crypt_sessions = 0;
+SYSCTL_DECL(_vfs_zfs);
+SYSCTL_INT(_vfs_zfs, OID_AUTO, crypt_sessions, CTLFLAG_RD,
+ &crypt_sessions, 0, "Number of cryptographic sessions created");
+
+void
+crypto_mac_init(struct hmac_ctx *ctx, const crypto_key_t *c_key)
+{
+ uint8_t k_ipad[SHA512_HMAC_BLOCK_SIZE],
+ k_opad[SHA512_HMAC_BLOCK_SIZE],
+ key[SHA512_HMAC_BLOCK_SIZE];
+ SHA512_CTX lctx;
+ int i;
+ size_t cl_bytes = CRYPTO_BITS2BYTES(c_key->ck_length);
+
+ /*
+ * This code is based on the similar code in geom/eli/g_eli_hmac.c
+ */
+ explicit_bzero(key, sizeof (key));
+ if (c_key->ck_length == 0)
+ /* do nothing */;
+ else if (cl_bytes <= SHA512_HMAC_BLOCK_SIZE)
+ bcopy(c_key->ck_data, key, cl_bytes);
+ else {
+ /*
+ * If the key is longer than 128 bytes, reset it to
+ * key = SHA512(key).
+ */
+ SHA512_Init(&lctx);
+ SHA512_Update(&lctx, c_key->ck_data, cl_bytes);
+ SHA512_Final(key, &lctx);
+ }
+
+ /* XOR key with ipad and opad values. */
+ for (i = 0; i < sizeof (key); i++) {
+ k_ipad[i] = key[i] ^ 0x36;
+ k_opad[i] = key[i] ^ 0x5c;
+ }
+ explicit_bzero(key, sizeof (key));
+
+ /* Start inner SHA512. */
+ SHA512_Init(&ctx->innerctx);
+ SHA512_Update(&ctx->innerctx, k_ipad, sizeof (k_ipad));
+ explicit_bzero(k_ipad, sizeof (k_ipad));
+ /* Start outer SHA512. */
+ SHA512_Init(&ctx->outerctx);
+ SHA512_Update(&ctx->outerctx, k_opad, sizeof (k_opad));
+ explicit_bzero(k_opad, sizeof (k_opad));
+}
+
+void
+crypto_mac_update(struct hmac_ctx *ctx, const void *data, size_t datasize)
+{
+ SHA512_Update(&ctx->innerctx, data, datasize);
+}
+
+void
+crypto_mac_final(struct hmac_ctx *ctx, void *md, size_t mdsize)
+{
+ uint8_t digest[SHA512_DIGEST_LENGTH];
+
+ /* Complete inner hash */
+ SHA512_Final(digest, &ctx->innerctx);
+
+ /* Complete outer hash */
+ SHA512_Update(&ctx->outerctx, digest, sizeof (digest));
+ SHA512_Final(digest, &ctx->outerctx);
+
+ explicit_bzero(ctx, sizeof (*ctx));
+ /* mdsize == 0 means "Give me the whole hash!" */
+ if (mdsize == 0)
+ mdsize = SHA512_DIGEST_LENGTH;
+ bcopy(digest, md, mdsize);
+ explicit_bzero(digest, sizeof (digest));
+}
+
+void
+crypto_mac(const crypto_key_t *key, const void *in_data, size_t in_data_size,
+ void *out_data, size_t out_data_size)
+{
+ struct hmac_ctx ctx;
+
+ crypto_mac_init(&ctx, key);
+ crypto_mac_update(&ctx, in_data, in_data_size);
+ crypto_mac_final(&ctx, out_data, out_data_size);
+}
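
For illustration, a caller-side sketch of a one-shot HMAC-SHA512 built on the
helpers above; the wrapper name and buffer parameters are hypothetical, but the
same pattern is used by hkdf_sha512_extract() in hkdf.c later in this change:

	static void
	example_hmac_sha512(const void *msg, size_t msglen,
	    uint8_t *keybuf, size_t keylen,
	    uint8_t mac[SHA512_DIGEST_LENGTH])
	{
		crypto_key_t key;

		/* Wrap the raw key bytes; ck_length is expressed in bits. */
		key.ck_format = CRYPTO_KEY_RAW;
		key.ck_data = keybuf;
		key.ck_length = CRYPTO_BYTES2BITS(keylen);

		/* Passing the full digest size copies out the whole MAC. */
		crypto_mac(&key, msg, msglen, mac, SHA512_DIGEST_LENGTH);
	}
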
+
+static int
+freebsd_zfs_crypt_done(struct cryptop *crp)
+{
+ freebsd_crypt_session_t *ses;
+
+ ses = crp->crp_opaque;
+ mtx_lock(&ses->fs_lock);
+ ses->fs_done = true;
+ mtx_unlock(&ses->fs_lock);
+ wakeup(crp);
+ return (0);
+}
+
+void
+freebsd_crypt_freesession(freebsd_crypt_session_t *sess)
+{
+ mtx_destroy(&sess->fs_lock);
+ crypto_freesession(sess->fs_sid);
+ explicit_bzero(sess, sizeof (*sess));
+}
+
+static int
+zfs_crypto_dispatch(freebsd_crypt_session_t *session, struct cryptop *crp)
+{
+ int error;
+
+ crp->crp_opaque = session;
+ crp->crp_callback = freebsd_zfs_crypt_done;
+ for (;;) {
+ error = crypto_dispatch(crp);
+ if (error)
+ break;
+ mtx_lock(&session->fs_lock);
+ while (session->fs_done == false)
+ msleep(crp, &session->fs_lock, PRIBIO,
+ "zfs_crypto", hz/5);
+ mtx_unlock(&session->fs_lock);
+
+ if (crp->crp_etype != EAGAIN) {
+ error = crp->crp_etype;
+ break;
+ }
+ crp->crp_etype = 0;
+ crp->crp_flags &= ~CRYPTO_F_DONE;
+ session->fs_done = false;
+#if __FreeBSD_version < 1300087
+ /*
+ * Session ID changed, so we should record that,
+ * and try again
+ */
+ session->fs_sid = crp->crp_session;
+#endif
+ }
+ return (error);
+}
+static void
+freebsd_crypt_uio_debug_log(boolean_t encrypt,
+ freebsd_crypt_session_t *input_sessionp,
+ struct zio_crypt_info *c_info,
+ zfs_uio_t *data_uio,
+ crypto_key_t *key,
+ uint8_t *ivbuf,
+ size_t datalen,
+ size_t auth_len)
+{
+#ifdef FCRYPTO_DEBUG
+ struct cryptodesc *crd;
+ uint8_t *p = NULL;
+ size_t total = 0;
+
+ printf("%s(%s, %p, { %s, %d, %d, %s }, %p, { %d, %p, %u }, "
+ "%p, %u, %u)\n",
+ __FUNCTION__, encrypt ? "encrypt" : "decrypt", input_sessionp,
+ c_info->ci_algname, c_info->ci_crypt_type,
+ (unsigned int)c_info->ci_keylen, c_info->ci_name,
+ data_uio, key->ck_format, key->ck_data,
+ (unsigned int)key->ck_length,
+ ivbuf, (unsigned int)datalen, (unsigned int)auth_len);
+ printf("\tkey = { ");
+ for (int i = 0; i < key->ck_length / 8; i++) {
+ uint8_t *b = (uint8_t *)key->ck_data;
+ printf("%02x ", b[i]);
+ }
+ printf("}\n");
+ for (int i = 0; i < zfs_uio_iovcnt(data_uio); i++) {
+ printf("\tiovec #%d: <%p, %u>\n", i,
+ zfs_uio_iovbase(data_uio, i),
+ (unsigned int)zfs_uio_iovlen(data_uio, i));
+ total += zfs_uio_iovlen(data_uio, i);
+ }
+ zfs_uio_resid(data_uio) = total;
+#endif
+}
+/*
+ * Create a new cryptographic session. This should
+ * happen every time the key changes (including when
+ * it's first loaded).
+ */
+#if __FreeBSD_version >= 1300087
+int
+freebsd_crypt_newsession(freebsd_crypt_session_t *sessp,
+ struct zio_crypt_info *c_info, crypto_key_t *key)
+{
+ struct crypto_session_params csp;
+ int error = 0;
+
+#ifdef FCRYPTO_DEBUG
+ printf("%s(%p, { %s, %d, %d, %s }, { %d, %p, %u })\n",
+ __FUNCTION__, sessp,
+ c_info->ci_algname, c_info->ci_crypt_type,
+ (unsigned int)c_info->ci_keylen, c_info->ci_name,
+ key->ck_format, key->ck_data, (unsigned int)key->ck_length);
+ printf("\tkey = { ");
+ for (int i = 0; i < key->ck_length / 8; i++) {
+ uint8_t *b = (uint8_t *)key->ck_data;
+ printf("%02x ", b[i]);
+ }
+ printf("}\n");
+#endif
+ bzero(&csp, sizeof (csp));
+ csp.csp_mode = CSP_MODE_AEAD;
+ csp.csp_cipher_key = key->ck_data;
+ csp.csp_cipher_klen = key->ck_length / 8;
+ switch (c_info->ci_crypt_type) {
+ case ZC_TYPE_GCM:
+ csp.csp_cipher_alg = CRYPTO_AES_NIST_GCM_16;
+ csp.csp_ivlen = AES_GCM_IV_LEN;
+ switch (key->ck_length/8) {
+ case AES_128_GMAC_KEY_LEN:
+ case AES_192_GMAC_KEY_LEN:
+ case AES_256_GMAC_KEY_LEN:
+ break;
+ default:
+ error = EINVAL;
+ goto bad;
+ }
+ break;
+ case ZC_TYPE_CCM:
+ csp.csp_cipher_alg = CRYPTO_AES_CCM_16;
+ csp.csp_ivlen = AES_CCM_IV_LEN;
+ switch (key->ck_length/8) {
+ case AES_128_CBC_MAC_KEY_LEN:
+ case AES_192_CBC_MAC_KEY_LEN:
+ case AES_256_CBC_MAC_KEY_LEN:
+ break;
+ default:
+ error = EINVAL;
+ goto bad;
+ break;
+ }
+ break;
+ default:
+ error = ENOTSUP;
+ goto bad;
+ }
+ error = crypto_newsession(&sessp->fs_sid, &csp,
+ CRYPTOCAP_F_HARDWARE | CRYPTOCAP_F_SOFTWARE);
+ mtx_init(&sessp->fs_lock, "FreeBSD Cryptographic Session Lock",
+ NULL, MTX_DEF);
+ crypt_sessions++;
+bad:
+#ifdef FCRYPTO_DEBUG
+ if (error)
+ printf("%s: returning error %d\n", __FUNCTION__, error);
+#endif
+ return (error);
+}
+
+int
+freebsd_crypt_uio(boolean_t encrypt,
+ freebsd_crypt_session_t *input_sessionp,
+ struct zio_crypt_info *c_info,
+ zfs_uio_t *data_uio,
+ crypto_key_t *key,
+ uint8_t *ivbuf,
+ size_t datalen,
+ size_t auth_len)
+{
+ struct cryptop *crp;
+ freebsd_crypt_session_t *session = NULL;
+ int error = 0;
+ size_t total = 0;
+
+ freebsd_crypt_uio_debug_log(encrypt, input_sessionp, c_info, data_uio,
+ key, ivbuf, datalen, auth_len);
+ for (int i = 0; i < zfs_uio_iovcnt(data_uio); i++)
+ total += zfs_uio_iovlen(data_uio, i);
+ zfs_uio_resid(data_uio) = total;
+ if (input_sessionp == NULL) {
+ session = kmem_zalloc(sizeof (*session), KM_SLEEP);
+ error = freebsd_crypt_newsession(session, c_info, key);
+ if (error)
+ goto out;
+ } else
+ session = input_sessionp;
+
+ crp = crypto_getreq(session->fs_sid, M_WAITOK);
+ if (encrypt) {
+ crp->crp_op = CRYPTO_OP_ENCRYPT |
+ CRYPTO_OP_COMPUTE_DIGEST;
+ } else {
+ crp->crp_op = CRYPTO_OP_DECRYPT |
+ CRYPTO_OP_VERIFY_DIGEST;
+ }
+ crp->crp_flags = CRYPTO_F_CBIFSYNC | CRYPTO_F_IV_SEPARATE;
+ crypto_use_uio(crp, GET_UIO_STRUCT(data_uio));
+
+ crp->crp_aad_start = 0;
+ crp->crp_aad_length = auth_len;
+ crp->crp_payload_start = auth_len;
+ crp->crp_payload_length = datalen;
+ crp->crp_digest_start = auth_len + datalen;
+
+ bcopy(ivbuf, crp->crp_iv, ZIO_DATA_IV_LEN);
+ error = zfs_crypto_dispatch(session, crp);
+ crypto_freereq(crp);
+out:
+#ifdef FCRYPTO_DEBUG
+ if (error)
+ printf("%s: returning error %d\n", __FUNCTION__, error);
+#endif
+ if (input_sessionp == NULL) {
+ freebsd_crypt_freesession(session);
+ kmem_free(session, sizeof (*session));
+ }
+ return (error);
+}
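
A concrete reading of the request layout set up above: the supplied uio is
expected to hold the AAD, the payload and the authentication tag back to back
(possibly split across several iovecs), since crp_aad_start is 0,
crp_payload_start is auth_len and crp_digest_start is auth_len + datalen.
For example, with auth_len = 96 and datalen = 4096 the tag is verified
(decrypt) or written (encrypt) starting at byte offset 4192 of the buffer
described by data_uio.
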
+
+#else
+int
+freebsd_crypt_newsession(freebsd_crypt_session_t *sessp,
+ struct zio_crypt_info *c_info, crypto_key_t *key)
+{
+ struct cryptoini cria, crie, *crip;
+ struct enc_xform *xform;
+ struct auth_hash *xauth;
+ int error = 0;
+ crypto_session_t sid;
+
+#ifdef FCRYPTO_DEBUG
+ printf("%s(%p, { %s, %d, %d, %s }, { %d, %p, %u })\n",
+ __FUNCTION__, sessp,
+ c_info->ci_algname, c_info->ci_crypt_type,
+ (unsigned int)c_info->ci_keylen, c_info->ci_name,
+ key->ck_format, key->ck_data, (unsigned int)key->ck_length);
+ printf("\tkey = { ");
+ for (int i = 0; i < key->ck_length / 8; i++) {
+ uint8_t *b = (uint8_t *)key->ck_data;
+ printf("%02x ", b[i]);
+ }
+ printf("}\n");
+#endif
+ switch (c_info->ci_crypt_type) {
+ case ZC_TYPE_GCM:
+ xform = &enc_xform_aes_nist_gcm;
+ switch (key->ck_length/8) {
+ case AES_128_GMAC_KEY_LEN:
+ xauth = &auth_hash_nist_gmac_aes_128;
+ break;
+ case AES_192_GMAC_KEY_LEN:
+ xauth = &auth_hash_nist_gmac_aes_192;
+ break;
+ case AES_256_GMAC_KEY_LEN:
+ xauth = &auth_hash_nist_gmac_aes_256;
+ break;
+ default:
+ error = EINVAL;
+ goto bad;
+ }
+ break;
+ case ZC_TYPE_CCM:
+ xform = &enc_xform_ccm;
+ switch (key->ck_length/8) {
+ case AES_128_CBC_MAC_KEY_LEN:
+ xauth = &auth_hash_ccm_cbc_mac_128;
+ break;
+ case AES_192_CBC_MAC_KEY_LEN:
+ xauth = &auth_hash_ccm_cbc_mac_192;
+ break;
+ case AES_256_CBC_MAC_KEY_LEN:
+ xauth = &auth_hash_ccm_cbc_mac_256;
+ break;
+ default:
+ error = EINVAL;
+ goto bad;
+ break;
+ }
+ break;
+ default:
+ error = ENOTSUP;
+ goto bad;
+ }
+#ifdef FCRYPTO_DEBUG
+ printf("%s(%d): Using crypt %s (key length %u [%u bytes]), "
+ "auth %s (key length %d)\n",
+ __FUNCTION__, __LINE__,
+ xform->name, (unsigned int)key->ck_length,
+ (unsigned int)key->ck_length/8,
+ xauth->name, xauth->keysize);
+#endif
+
+ bzero(&crie, sizeof (crie));
+ bzero(&cria, sizeof (cria));
+
+ crie.cri_alg = xform->type;
+ crie.cri_key = key->ck_data;
+ crie.cri_klen = key->ck_length;
+
+ cria.cri_alg = xauth->type;
+ cria.cri_key = key->ck_data;
+ cria.cri_klen = key->ck_length;
+
+ cria.cri_next = &crie;
+ crie.cri_next = NULL;
+ crip = &cria;
+ // Everything else is bzero'd
+
+ error = crypto_newsession(&sid, crip,
+ CRYPTOCAP_F_HARDWARE | CRYPTOCAP_F_SOFTWARE);
+ if (error != 0) {
+ printf("%s(%d): crypto_newsession failed with %d\n",
+ __FUNCTION__, __LINE__, error);
+ goto bad;
+ }
+ sessp->fs_sid = sid;
+ mtx_init(&sessp->fs_lock, "FreeBSD Cryptographic Session Lock",
+ NULL, MTX_DEF);
+ crypt_sessions++;
+bad:
+ return (error);
+}
+
+/*
+ * The meat of encryption/decryption.
+ * If input_sessionp is NULL, a temporary
+ * cryptographic session is created and
+ * released when done.
+ */
+int
+freebsd_crypt_uio(boolean_t encrypt,
+ freebsd_crypt_session_t *input_sessionp,
+ struct zio_crypt_info *c_info,
+ zfs_uio_t *data_uio,
+ crypto_key_t *key,
+ uint8_t *ivbuf,
+ size_t datalen,
+ size_t auth_len)
+{
+ struct cryptop *crp;
+ struct cryptodesc *enc_desc, *auth_desc;
+ struct enc_xform *xform;
+ struct auth_hash *xauth;
+ freebsd_crypt_session_t *session = NULL;
+ int error;
+
+ freebsd_crypt_uio_debug_log(encrypt, input_sessionp, c_info, data_uio,
+ key, ivbuf, datalen, auth_len);
+ switch (c_info->ci_crypt_type) {
+ case ZC_TYPE_GCM:
+ xform = &enc_xform_aes_nist_gcm;
+ switch (key->ck_length/8) {
+ case AES_128_GMAC_KEY_LEN:
+ xauth = &auth_hash_nist_gmac_aes_128;
+ break;
+ case AES_192_GMAC_KEY_LEN:
+ xauth = &auth_hash_nist_gmac_aes_192;
+ break;
+ case AES_256_GMAC_KEY_LEN:
+ xauth = &auth_hash_nist_gmac_aes_256;
+ break;
+ default:
+ error = EINVAL;
+ goto bad;
+ }
+ break;
+ case ZC_TYPE_CCM:
+ xform = &enc_xform_ccm;
+ switch (key->ck_length/8) {
+ case AES_128_CBC_MAC_KEY_LEN:
+ xauth = &auth_hash_ccm_cbc_mac_128;
+ break;
+ case AES_192_CBC_MAC_KEY_LEN:
+ xauth = &auth_hash_ccm_cbc_mac_192;
+ break;
+ case AES_256_CBC_MAC_KEY_LEN:
+ xauth = &auth_hash_ccm_cbc_mac_256;
+ break;
+ default:
+ error = EINVAL;
+ goto bad;
+ break;
+ }
+ break;
+ default:
+ error = ENOTSUP;
+ goto bad;
+ }
+
+#ifdef FCRYPTO_DEBUG
+ printf("%s(%d): Using crypt %s (key length %u [%u bytes]), "
+ "auth %s (key length %d)\n",
+ __FUNCTION__, __LINE__,
+ xform->name, (unsigned int)key->ck_length,
+ (unsigned int)key->ck_length/8,
+ xauth->name, xauth->keysize);
+#endif
+
+ if (input_sessionp == NULL) {
+ session = kmem_zalloc(sizeof (*session), KM_SLEEP);
+ error = freebsd_crypt_newsession(session, c_info, key);
+ if (error)
+ goto out;
+ } else
+ session = input_sessionp;
+
+ crp = crypto_getreq(2);
+ if (crp == NULL) {
+ error = ENOMEM;
+ goto bad;
+ }
+
+ auth_desc = crp->crp_desc;
+ enc_desc = auth_desc->crd_next;
+
+ crp->crp_session = session->fs_sid;
+ crp->crp_ilen = auth_len + datalen;
+ crp->crp_buf = (void*)GET_UIO_STRUCT(data_uio);
+ crp->crp_flags = CRYPTO_F_IOV | CRYPTO_F_CBIFSYNC;
+
+ auth_desc->crd_skip = 0;
+ auth_desc->crd_len = auth_len;
+ auth_desc->crd_inject = auth_len + datalen;
+ auth_desc->crd_alg = xauth->type;
+#ifdef FCRYPTO_DEBUG
+ printf("%s: auth: skip = %u, len = %u, inject = %u\n",
+ __FUNCTION__, auth_desc->crd_skip, auth_desc->crd_len,
+ auth_desc->crd_inject);
+#endif
+
+ enc_desc->crd_skip = auth_len;
+ enc_desc->crd_len = datalen;
+ enc_desc->crd_inject = auth_len;
+ enc_desc->crd_alg = xform->type;
+ enc_desc->crd_flags = CRD_F_IV_EXPLICIT | CRD_F_IV_PRESENT;
+ bcopy(ivbuf, enc_desc->crd_iv, ZIO_DATA_IV_LEN);
+ enc_desc->crd_next = NULL;
+
+#ifdef FCRYPTO_DEBUG
+ printf("%s: enc: skip = %u, len = %u, inject = %u\n",
+ __FUNCTION__, enc_desc->crd_skip, enc_desc->crd_len,
+ enc_desc->crd_inject);
+#endif
+
+ if (encrypt)
+ enc_desc->crd_flags |= CRD_F_ENCRYPT;
+
+ error = zfs_crypto_dispatch(session, crp);
+ crypto_freereq(crp);
+out:
+ if (input_sessionp == NULL) {
+ freebsd_crypt_freesession(session);
+ kmem_free(session, sizeof (*session));
+ }
+bad:
+#ifdef FCRYPTO_DEBUG
+ if (error)
+ printf("%s: returning error %d\n", __FUNCTION__, error);
+#endif
+ return (error);
+}
+#endif
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/dmu_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/dmu_os.c
new file mode 100644
index 000000000000..8e412d9c1359
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/dmu_os.c
@@ -0,0 +1,349 @@
+/*
+ * Copyright (c) 2020 iXsystems, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/dmu.h>
+#include <sys/dmu_impl.h>
+#include <sys/dmu_tx.h>
+#include <sys/dbuf.h>
+#include <sys/dnode.h>
+#include <sys/zfs_context.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_traverse.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_synctask.h>
+#include <sys/dsl_prop.h>
+#include <sys/dmu_zfetch.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zap.h>
+#include <sys/zio_checksum.h>
+#include <sys/zio_compress.h>
+#include <sys/sa.h>
+#include <sys/zfeature.h>
+#include <sys/abd.h>
+#include <sys/zfs_rlock.h>
+#include <sys/racct.h>
+#include <sys/vm.h>
+#include <sys/zfs_znode.h>
+#include <sys/zfs_vnops.h>
+
+#include <sys/ccompat.h>
+
+#ifndef IDX_TO_OFF
+#define IDX_TO_OFF(idx) (((vm_ooffset_t)(idx)) << PAGE_SHIFT)
+#endif
+
+#if __FreeBSD_version < 1300051
+#define VM_ALLOC_BUSY_FLAGS VM_ALLOC_NOBUSY
+#else
+#define VM_ALLOC_BUSY_FLAGS VM_ALLOC_SBUSY | VM_ALLOC_IGN_SBUSY
+#endif
+
+
+#if __FreeBSD_version < 1300072
+#define dmu_page_lock(m) vm_page_lock(m)
+#define dmu_page_unlock(m) vm_page_unlock(m)
+#else
+#define dmu_page_lock(m)
+#define dmu_page_unlock(m)
+#endif
+
+static int
+dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset,
+ uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp)
+{
+ dnode_t *dn;
+ int err;
+
+ err = dnode_hold(os, object, FTAG, &dn);
+ if (err)
+ return (err);
+
+ err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag,
+ numbufsp, dbpp, DMU_READ_PREFETCH);
+
+ dnode_rele(dn, FTAG);
+
+ return (err);
+}
+
+int
+dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
+ vm_page_t *ma, dmu_tx_t *tx)
+{
+ dmu_buf_t **dbp;
+ struct sf_buf *sf;
+ int numbufs, i;
+ int err;
+
+ if (size == 0)
+ return (0);
+
+ err = dmu_buf_hold_array(os, object, offset, size,
+ FALSE, FTAG, &numbufs, &dbp);
+ if (err)
+ return (err);
+
+ for (i = 0; i < numbufs; i++) {
+ int tocpy, copied, thiscpy;
+ int bufoff;
+ dmu_buf_t *db = dbp[i];
+ caddr_t va;
+
+ ASSERT(size > 0);
+ ASSERT3U(db->db_size, >=, PAGESIZE);
+
+ bufoff = offset - db->db_offset;
+ tocpy = (int)MIN(db->db_size - bufoff, size);
+
+ ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
+
+ if (tocpy == db->db_size)
+ dmu_buf_will_fill(db, tx);
+ else
+ dmu_buf_will_dirty(db, tx);
+
+ for (copied = 0; copied < tocpy; copied += PAGESIZE) {
+ ASSERT3U(ptoa((*ma)->pindex), ==,
+ db->db_offset + bufoff);
+ thiscpy = MIN(PAGESIZE, tocpy - copied);
+ va = zfs_map_page(*ma, &sf);
+ bcopy(va, (char *)db->db_data + bufoff, thiscpy);
+ zfs_unmap_page(sf);
+ ma += 1;
+ bufoff += PAGESIZE;
+ }
+
+ if (tocpy == db->db_size)
+ dmu_buf_fill_done(db, tx);
+
+ offset += tocpy;
+ size -= tocpy;
+ }
+ dmu_buf_rele_array(dbp, numbufs, FTAG);
+ return (err);
+}
+
+int
+dmu_read_pages(objset_t *os, uint64_t object, vm_page_t *ma, int count,
+ int *rbehind, int *rahead, int last_size)
+{
+ struct sf_buf *sf;
+ vm_object_t vmobj;
+ vm_page_t m;
+ dmu_buf_t **dbp;
+ dmu_buf_t *db;
+ caddr_t va;
+ int numbufs, i;
+ int bufoff, pgoff, tocpy;
+ int mi, di;
+ int err;
+
+ ASSERT3U(ma[0]->pindex + count - 1, ==, ma[count - 1]->pindex);
+ ASSERT(last_size <= PAGE_SIZE);
+
+ err = dmu_buf_hold_array(os, object, IDX_TO_OFF(ma[0]->pindex),
+ IDX_TO_OFF(count - 1) + last_size, TRUE, FTAG, &numbufs, &dbp);
+ if (err != 0)
+ return (err);
+
+#ifdef ZFS_DEBUG
+ IMPLY(last_size < PAGE_SIZE, *rahead == 0);
+ if (dbp[0]->db_offset != 0 || numbufs > 1) {
+ for (i = 0; i < numbufs; i++) {
+ ASSERT(ISP2(dbp[i]->db_size));
+ ASSERT((dbp[i]->db_offset % dbp[i]->db_size) == 0);
+ ASSERT3U(dbp[i]->db_size, ==, dbp[0]->db_size);
+ }
+ }
+#endif
+
+ vmobj = ma[0]->object;
+ zfs_vmobject_wlock_12(vmobj);
+
+ db = dbp[0];
+ for (i = 0; i < *rbehind; i++) {
+ m = vm_page_grab_unlocked(vmobj, ma[0]->pindex - 1 - i,
+ VM_ALLOC_NORMAL | VM_ALLOC_NOWAIT | VM_ALLOC_BUSY_FLAGS);
+ if (m == NULL)
+ break;
+ if (!vm_page_none_valid(m)) {
+ ASSERT3U(m->valid, ==, VM_PAGE_BITS_ALL);
+ vm_page_do_sunbusy(m);
+ break;
+ }
+ ASSERT(m->dirty == 0);
+ ASSERT(!pmap_page_is_write_mapped(m));
+
+ ASSERT(db->db_size > PAGE_SIZE);
+ bufoff = IDX_TO_OFF(m->pindex) % db->db_size;
+ va = zfs_map_page(m, &sf);
+ bcopy((char *)db->db_data + bufoff, va, PAGESIZE);
+ zfs_unmap_page(sf);
+ vm_page_valid(m);
+ dmu_page_lock(m);
+ if ((m->busy_lock & VPB_BIT_WAITERS) != 0)
+ vm_page_activate(m);
+ else
+ vm_page_deactivate(m);
+ dmu_page_unlock(m);
+ vm_page_do_sunbusy(m);
+ }
+ *rbehind = i;
+
+ bufoff = IDX_TO_OFF(ma[0]->pindex) % db->db_size;
+ pgoff = 0;
+ for (mi = 0, di = 0; mi < count && di < numbufs; ) {
+ if (pgoff == 0) {
+ m = ma[mi];
+ if (m != bogus_page) {
+ vm_page_assert_xbusied(m);
+ ASSERT(vm_page_none_valid(m));
+ ASSERT(m->dirty == 0);
+ ASSERT(!pmap_page_is_write_mapped(m));
+ va = zfs_map_page(m, &sf);
+ }
+ }
+ if (bufoff == 0)
+ db = dbp[di];
+
+ if (m != bogus_page) {
+ ASSERT3U(IDX_TO_OFF(m->pindex) + pgoff, ==,
+ db->db_offset + bufoff);
+ }
+
+ /*
+ * We do not need to clamp the copy size by the file
+ * size as the last block is zero-filled beyond the
+ * end of file anyway.
+ */
+ tocpy = MIN(db->db_size - bufoff, PAGESIZE - pgoff);
+ if (m != bogus_page)
+ bcopy((char *)db->db_data + bufoff, va + pgoff, tocpy);
+
+ pgoff += tocpy;
+ ASSERT(pgoff <= PAGESIZE);
+ if (pgoff == PAGESIZE) {
+ if (m != bogus_page) {
+ zfs_unmap_page(sf);
+ vm_page_valid(m);
+ }
+ ASSERT(mi < count);
+ mi++;
+ pgoff = 0;
+ }
+
+ bufoff += tocpy;
+ ASSERT(bufoff <= db->db_size);
+ if (bufoff == db->db_size) {
+ ASSERT(di < numbufs);
+ di++;
+ bufoff = 0;
+ }
+ }
+
+#ifdef ZFS_DEBUG
+ /*
+ * Three possibilities:
+ * - last requested page ends at a buffer boundary and, thus,
+ * all pages and buffers have been iterated;
+ * - all requested pages are filled, but the last buffer
+ * has not been exhausted;
+ * the read-ahead is possible only in this case;
+ * - all buffers have been read, but the last page has not been
+ * fully filled;
+ * this is only possible if the file has only a single buffer
+ * with a size that is not a multiple of the page size.
+ */
+ if (mi == count) {
+ ASSERT(di >= numbufs - 1);
+ IMPLY(*rahead != 0, di == numbufs - 1);
+ IMPLY(*rahead != 0, bufoff != 0);
+ ASSERT(pgoff == 0);
+ }
+ if (di == numbufs) {
+ ASSERT(mi >= count - 1);
+ ASSERT(*rahead == 0);
+ IMPLY(pgoff == 0, mi == count);
+ if (pgoff != 0) {
+ ASSERT(mi == count - 1);
+ ASSERT((dbp[0]->db_size & PAGE_MASK) != 0);
+ }
+ }
+#endif
+ if (pgoff != 0) {
+ ASSERT(m != bogus_page);
+ bzero(va + pgoff, PAGESIZE - pgoff);
+ zfs_unmap_page(sf);
+ vm_page_valid(m);
+ }
+
+ for (i = 0; i < *rahead; i++) {
+ m = vm_page_grab_unlocked(vmobj, ma[count - 1]->pindex + 1 + i,
+ VM_ALLOC_NORMAL | VM_ALLOC_NOWAIT | VM_ALLOC_BUSY_FLAGS);
+ if (m == NULL)
+ break;
+ if (!vm_page_none_valid(m)) {
+ ASSERT3U(m->valid, ==, VM_PAGE_BITS_ALL);
+ vm_page_do_sunbusy(m);
+ break;
+ }
+ ASSERT(m->dirty == 0);
+ ASSERT(!pmap_page_is_mapped(m));
+
+ ASSERT(db->db_size > PAGE_SIZE);
+ bufoff = IDX_TO_OFF(m->pindex) % db->db_size;
+ tocpy = MIN(db->db_size - bufoff, PAGESIZE);
+ va = zfs_map_page(m, &sf);
+ bcopy((char *)db->db_data + bufoff, va, tocpy);
+ if (tocpy < PAGESIZE) {
+ ASSERT(i == *rahead - 1);
+ ASSERT((db->db_size & PAGE_MASK) != 0);
+ bzero(va + tocpy, PAGESIZE - tocpy);
+ }
+ zfs_unmap_page(sf);
+ vm_page_valid(m);
+ dmu_page_lock(m);
+ if ((m->busy_lock & VPB_BIT_WAITERS) != 0)
+ vm_page_activate(m);
+ else
+ vm_page_deactivate(m);
+ dmu_page_unlock(m);
+ vm_page_do_sunbusy(m);
+ }
+ *rahead = i;
+ zfs_vmobject_wunlock_12(vmobj);
+
+ dmu_buf_rele_array(dbp, numbufs, FTAG);
+ return (0);
+}
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/hkdf.c b/sys/contrib/openzfs/module/os/freebsd/zfs/hkdf.c
new file mode 100644
index 000000000000..8324ff2319b6
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/hkdf.c
@@ -0,0 +1,102 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2017, Datto, Inc. All rights reserved.
+ */
+
+#include <sys/dmu.h>
+#include <sys/hkdf.h>
+#include <sys/freebsd_crypto.h>
+#include <sys/hkdf.h>
+
+static int
+hkdf_sha512_extract(uint8_t *salt, uint_t salt_len, uint8_t *key_material,
+ uint_t km_len, uint8_t *out_buf)
+{
+ crypto_key_t key;
+
+ /* initialize the salt as a crypto key */
+ key.ck_format = CRYPTO_KEY_RAW;
+ key.ck_length = CRYPTO_BYTES2BITS(salt_len);
+ key.ck_data = salt;
+
+ crypto_mac(&key, key_material, km_len, out_buf, SHA512_DIGEST_LENGTH);
+
+ return (0);
+}
+
+static int
+hkdf_sha512_expand(uint8_t *extract_key, uint8_t *info, uint_t info_len,
+ uint8_t *out_buf, uint_t out_len)
+{
+ struct hmac_ctx ctx;
+ crypto_key_t key;
+ uint_t i, T_len = 0, pos = 0;
+ uint8_t c;
+ uint_t N = (out_len + SHA512_DIGEST_LENGTH) / SHA512_DIGEST_LENGTH;
+ uint8_t T[SHA512_DIGEST_LENGTH];
+
+ if (N > 255)
+ return (SET_ERROR(EINVAL));
+
+ /* initialize the salt as a crypto key */
+ key.ck_format = CRYPTO_KEY_RAW;
+ key.ck_length = CRYPTO_BYTES2BITS(SHA512_DIGEST_LENGTH);
+ key.ck_data = extract_key;
+
+ for (i = 1; i <= N; i++) {
+ c = i;
+
+ crypto_mac_init(&ctx, &key);
+ crypto_mac_update(&ctx, T, T_len);
+ crypto_mac_update(&ctx, info, info_len);
+ crypto_mac_update(&ctx, &c, 1);
+ crypto_mac_final(&ctx, T, SHA512_DIGEST_LENGTH);
+ T_len = SHA512_DIGEST_LENGTH;
+ bcopy(T, out_buf + pos,
+ (i != N) ? SHA512_DIGEST_LENGTH : (out_len - pos));
+ pos += SHA512_DIGEST_LENGTH;
+ }
+
+ return (0);
+}
+
+/*
+ * HKDF is designed to be a relatively fast function for deriving keys from a
+ * master key + a salt. We use this function to generate new encryption keys
+ * so as to avoid hitting the cryptographic limits of the underlying
+ * encryption modes. Note that, for the sake of deriving encryption keys, the
+ * info parameter is called the "salt" everywhere else in the code.
+ */
+int
+hkdf_sha512(uint8_t *key_material, uint_t km_len, uint8_t *salt,
+ uint_t salt_len, uint8_t *info, uint_t info_len, uint8_t *output_key,
+ uint_t out_len)
+{
+ int ret;
+ uint8_t extract_key[SHA512_DIGEST_LENGTH];
+
+ ret = hkdf_sha512_extract(salt, salt_len, key_material, km_len,
+ extract_key);
+ if (ret != 0)
+ return (ret);
+
+ ret = hkdf_sha512_expand(extract_key, info, info_len, output_key,
+ out_len);
+ if (ret != 0)
+ return (ret);
+
+ return (0);
+}
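
A minimal sketch of deriving a 256-bit key with hkdf_sha512(); the buffer names
and sizes here are illustrative only and assume the caller already holds the
master key material and salt:

	uint8_t master[32], salt[8], out_key[32];
	uint8_t info[] = "illustrative context";	/* the "salt" elsewhere in the code */
	int err;

	/* master and salt are caller-supplied secrets/nonces in real use. */
	err = hkdf_sha512(master, sizeof (master), salt, sizeof (salt),
	    info, sizeof (info) - 1, out_key, sizeof (out_key));
	if (err != 0)
		return (err);

With out_len = 32 the expand step runs a single HMAC round
(N = (32 + 64) / 64 = 1), well under the 255-round limit enforced above.
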
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/kmod_core.c b/sys/contrib/openzfs/module/os/freebsd/zfs/kmod_core.c
new file mode 100644
index 000000000000..c11d4dbcf660
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/kmod_core.c
@@ -0,0 +1,375 @@
+/*
+ * Copyright (c) 2020 iXsystems, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/buf.h>
+#include <sys/cmn_err.h>
+#include <sys/conf.h>
+#include <sys/dmu.h>
+#include <sys/dmu_impl.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_send.h>
+#include <sys/dmu_tx.h>
+#include <sys/dsl_bookmark.h>
+#include <sys/dsl_crypt.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_deleg.h>
+#include <sys/dsl_destroy.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_prop.h>
+#include <sys/dsl_scan.h>
+#include <sys/dsl_userhold.h>
+#include <sys/errno.h>
+#include <sys/eventhandler.h>
+#include <sys/file.h>
+#include <sys/fm/util.h>
+#include <sys/fs/zfs.h>
+#include <sys/kernel.h>
+#include <sys/kmem.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mount.h>
+#include <sys/mutex.h>
+#include <sys/nvpair.h>
+#include <sys/policy.h>
+#include <sys/proc.h>
+#include <sys/sdt.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/stat.h>
+#include <sys/sunddi.h>
+#include <sys/systm.h>
+#include <sys/taskqueue.h>
+#include <sys/uio.h>
+#include <sys/vdev.h>
+#include <sys/vdev_removal.h>
+#include <sys/zap.h>
+#include <sys/zcp.h>
+#include <sys/zfeature.h>
+#include <sys/zfs_context.h>
+#include <sys/zfs_ctldir.h>
+#include <sys/zfs_dir.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zfs_ioctl_compat.h>
+#include <sys/zfs_ioctl_impl.h>
+#include <sys/zfs_onexit.h>
+#include <sys/zfs_vfsops.h>
+#include <sys/zfs_znode.h>
+#include <sys/zio_checksum.h>
+#include <sys/zone.h>
+#include <sys/zvol.h>
+
+#include "zfs_comutil.h"
+#include "zfs_deleg.h"
+#include "zfs_namecheck.h"
+#include "zfs_prop.h"
+
+SYSCTL_DECL(_vfs_zfs);
+SYSCTL_DECL(_vfs_zfs_vdev);
+
+extern uint_t rrw_tsd_key;
+static int zfs_version_ioctl = ZFS_IOCVER_OZFS;
+SYSCTL_DECL(_vfs_zfs_version);
+SYSCTL_INT(_vfs_zfs_version, OID_AUTO, ioctl, CTLFLAG_RD, &zfs_version_ioctl,
+ 0, "ZFS_IOCTL_VERSION");
+
+static struct cdev *zfsdev;
+
+static struct root_hold_token *zfs_root_token;
+
+extern uint_t rrw_tsd_key;
+extern uint_t zfs_allow_log_key;
+extern uint_t zfs_geom_probe_vdev_key;
+
+static int zfs__init(void);
+static int zfs__fini(void);
+static void zfs_shutdown(void *, int);
+
+static eventhandler_tag zfs_shutdown_event_tag;
+extern zfsdev_state_t *zfsdev_state_list;
+
+#define ZFS_MIN_KSTACK_PAGES 4
+
+static int
+zfsdev_ioctl(struct cdev *dev, ulong_t zcmd, caddr_t arg, int flag,
+ struct thread *td)
+{
+ uint_t len;
+ int vecnum;
+ zfs_iocparm_t *zp;
+ zfs_cmd_t *zc;
+ zfs_cmd_legacy_t *zcl;
+ int rc, error;
+ void *uaddr;
+
+ len = IOCPARM_LEN(zcmd);
+ vecnum = zcmd & 0xff;
+ zp = (void *)arg;
+ uaddr = (void *)zp->zfs_cmd;
+ error = 0;
+ zcl = NULL;
+
+ if (len != sizeof (zfs_iocparm_t)) {
+ printf("len %d vecnum: %d sizeof (zfs_cmd_t) %ju\n",
+ len, vecnum, (uintmax_t)sizeof (zfs_cmd_t));
+ return (EINVAL);
+ }
+
+ zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP);
+ /*
+ * Remap ioctl code for legacy user binaries
+ */
+ if (zp->zfs_ioctl_version == ZFS_IOCVER_LEGACY) {
+ vecnum = zfs_ioctl_legacy_to_ozfs(vecnum);
+ if (vecnum < 0) {
+ kmem_free(zc, sizeof (zfs_cmd_t));
+ return (ENOTSUP);
+ }
+ zcl = kmem_zalloc(sizeof (zfs_cmd_legacy_t), KM_SLEEP);
+ if (copyin(uaddr, zcl, sizeof (zfs_cmd_legacy_t))) {
+ error = SET_ERROR(EFAULT);
+ goto out;
+ }
+ zfs_cmd_legacy_to_ozfs(zcl, zc);
+ } else if (copyin(uaddr, zc, sizeof (zfs_cmd_t))) {
+ error = SET_ERROR(EFAULT);
+ goto out;
+ }
+ error = zfsdev_ioctl_common(vecnum, zc, 0);
+ if (zcl) {
+ zfs_cmd_ozfs_to_legacy(zc, zcl);
+ rc = copyout(zcl, uaddr, sizeof (*zcl));
+ } else {
+ rc = copyout(zc, uaddr, sizeof (*zc));
+ }
+ if (error == 0 && rc != 0)
+ error = SET_ERROR(EFAULT);
+out:
+ if (zcl)
+ kmem_free(zcl, sizeof (zfs_cmd_legacy_t));
+ kmem_free(zc, sizeof (zfs_cmd_t));
+ MPASS(tsd_get(rrw_tsd_key) == NULL);
+ return (error);
+}
+
+static void
+zfsdev_close(void *data)
+{
+ zfsdev_state_t *zs, *zsp = data;
+
+ mutex_enter(&zfsdev_state_lock);
+ for (zs = zfsdev_state_list; zs != NULL; zs = zs->zs_next) {
+ if (zs == zsp)
+ break;
+ }
+ if (zs == NULL || zs->zs_minor <= 0) {
+ mutex_exit(&zfsdev_state_lock);
+ return;
+ }
+ zs->zs_minor = -1;
+ zfs_onexit_destroy(zs->zs_onexit);
+ zfs_zevent_destroy(zs->zs_zevent);
+ mutex_exit(&zfsdev_state_lock);
+ zs->zs_onexit = NULL;
+ zs->zs_zevent = NULL;
+}
+
+static int
+zfs_ctldev_init(struct cdev *devp)
+{
+ boolean_t newzs = B_FALSE;
+ minor_t minor;
+ zfsdev_state_t *zs, *zsprev = NULL;
+
+ ASSERT(MUTEX_HELD(&zfsdev_state_lock));
+
+ minor = zfsdev_minor_alloc();
+ if (minor == 0)
+ return (SET_ERROR(ENXIO));
+
+ for (zs = zfsdev_state_list; zs != NULL; zs = zs->zs_next) {
+ if (zs->zs_minor == -1)
+ break;
+ zsprev = zs;
+ }
+
+ if (!zs) {
+ zs = kmem_zalloc(sizeof (zfsdev_state_t), KM_SLEEP);
+ newzs = B_TRUE;
+ }
+
+ devfs_set_cdevpriv(zs, zfsdev_close);
+
+ zfs_onexit_init((zfs_onexit_t **)&zs->zs_onexit);
+ zfs_zevent_init((zfs_zevent_t **)&zs->zs_zevent);
+
+ if (newzs) {
+ zs->zs_minor = minor;
+ wmb();
+ zsprev->zs_next = zs;
+ } else {
+ wmb();
+ zs->zs_minor = minor;
+ }
+ return (0);
+}
+
+static int
+zfsdev_open(struct cdev *devp, int flag, int mode, struct thread *td)
+{
+ int error;
+
+ mutex_enter(&zfsdev_state_lock);
+ error = zfs_ctldev_init(devp);
+ mutex_exit(&zfsdev_state_lock);
+
+ return (error);
+}
+
+static struct cdevsw zfs_cdevsw = {
+ .d_version = D_VERSION,
+ .d_open = zfsdev_open,
+ .d_ioctl = zfsdev_ioctl,
+ .d_name = ZFS_DRIVER
+};
+
+int
+zfsdev_attach(void)
+{
+ zfsdev = make_dev(&zfs_cdevsw, 0x0, UID_ROOT, GID_OPERATOR, 0666,
+ ZFS_DRIVER);
+ return (0);
+}
+
+void
+zfsdev_detach(void)
+{
+ if (zfsdev != NULL)
+ destroy_dev(zfsdev);
+}
+
+int
+zfs__init(void)
+{
+ int error;
+
+#if KSTACK_PAGES < ZFS_MIN_KSTACK_PAGES
+ printf("ZFS NOTICE: KSTACK_PAGES is %d which could result in stack "
+ "overflow panic!\nPlease consider adding "
+ "'options KSTACK_PAGES=%d' to your kernel config\n", KSTACK_PAGES,
+ ZFS_MIN_KSTACK_PAGES);
+#endif
+ zfs_root_token = root_mount_hold("ZFS");
+ if ((error = zfs_kmod_init()) != 0) {
+ printf("ZFS: Failed to Load ZFS Filesystem"
+ ", rc = %d\n", error);
+ root_mount_rel(zfs_root_token);
+ return (error);
+ }
+
+
+ tsd_create(&zfs_geom_probe_vdev_key, NULL);
+
+ printf("ZFS storage pool version: features support ("
+ SPA_VERSION_STRING ")\n");
+ root_mount_rel(zfs_root_token);
+ ddi_sysevent_init();
+ return (0);
+}
+
+int
+zfs__fini(void)
+{
+ if (zfs_busy() || zvol_busy() ||
+ zio_injection_enabled) {
+ return (EBUSY);
+ }
+ zfs_kmod_fini();
+ tsd_destroy(&zfs_geom_probe_vdev_key);
+ return (0);
+}
+
+static void
+zfs_shutdown(void *arg __unused, int howto __unused)
+{
+
+ /*
+ * ZFS fini routines cannot work properly in a panicked system.
+ */
+ if (panicstr == NULL)
+ zfs__fini();
+}
+
+static int
+zfs_modevent(module_t mod, int type, void *unused __unused)
+{
+ int err;
+
+ switch (type) {
+ case MOD_LOAD:
+ err = zfs__init();
+ if (err == 0)
+ zfs_shutdown_event_tag = EVENTHANDLER_REGISTER(
+ shutdown_post_sync, zfs_shutdown, NULL,
+ SHUTDOWN_PRI_FIRST);
+ return (err);
+ case MOD_UNLOAD:
+ err = zfs__fini();
+ if (err == 0 && zfs_shutdown_event_tag != NULL)
+ EVENTHANDLER_DEREGISTER(shutdown_post_sync,
+ zfs_shutdown_event_tag);
+ return (err);
+ case MOD_SHUTDOWN:
+ return (0);
+ default:
+ break;
+ }
+ return (EOPNOTSUPP);
+}
+
+static moduledata_t zfs_mod = {
+ "zfsctrl",
+ zfs_modevent,
+ 0
+};
+
+#ifdef _KERNEL
+EVENTHANDLER_DEFINE(mountroot, spa_boot_init, NULL, 0);
+#endif
+
+DECLARE_MODULE(zfsctrl, zfs_mod, SI_SUB_CLOCKS, SI_ORDER_ANY);
+MODULE_VERSION(zfsctrl, 1);
+#if __FreeBSD_version > 1300092
+MODULE_DEPEND(zfsctrl, xdr, 1, 1, 1);
+#else
+MODULE_DEPEND(zfsctrl, krpc, 1, 1, 1);
+#endif
+MODULE_DEPEND(zfsctrl, acl_nfs4, 1, 1, 1);
+MODULE_DEPEND(zfsctrl, crypto, 1, 1, 1);
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/spa_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/spa_os.c
new file mode 100644
index 000000000000..2bc78cb451e8
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/spa_os.c
@@ -0,0 +1,281 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011 by Delphix. All rights reserved.
+ * Copyright (c) 2013 Martin Matuska <mm@FreeBSD.org>. All rights reserved.
+ */
+
+
+#include <sys/zfs_context.h>
+#include <sys/fm/fs/zfs.h>
+#include <sys/spa_impl.h>
+#include <sys/zio.h>
+#include <sys/zio_checksum.h>
+#include <sys/dmu.h>
+#include <sys/dmu_tx.h>
+#include <sys/zap.h>
+#include <sys/zil.h>
+#include <sys/ddt.h>
+#include <sys/vdev_impl.h>
+#include <sys/vdev_os.h>
+#include <sys/vdev_removal.h>
+#include <sys/vdev_indirect_mapping.h>
+#include <sys/vdev_indirect_births.h>
+#include <sys/metaslab.h>
+#include <sys/metaslab_impl.h>
+#include <sys/uberblock_impl.h>
+#include <sys/txg.h>
+#include <sys/avl.h>
+#include <sys/bpobj.h>
+#include <sys/dmu_traverse.h>
+#include <sys/dmu_objset.h>
+#include <sys/unique.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_prop.h>
+#include <sys/dsl_synctask.h>
+#include <sys/fs/zfs.h>
+#include <sys/arc.h>
+#include <sys/callb.h>
+#include <sys/spa_boot.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/dsl_scan.h>
+#include <sys/dmu_send.h>
+#include <sys/dsl_destroy.h>
+#include <sys/dsl_userhold.h>
+#include <sys/zfeature.h>
+#include <sys/zvol.h>
+#include <sys/abd.h>
+#include <sys/callb.h>
+#include <sys/zone.h>
+
+#include "zfs_prop.h"
+#include "zfs_comutil.h"
+
+static nvlist_t *
+spa_generate_rootconf(const char *name)
+{
+ nvlist_t **configs, **tops;
+ nvlist_t *config;
+ nvlist_t *best_cfg, *nvtop, *nvroot;
+ uint64_t *holes;
+ uint64_t best_txg;
+ uint64_t nchildren;
+ uint64_t pgid;
+ uint64_t count;
+ uint64_t i;
+ uint_t nholes;
+
+ if (vdev_geom_read_pool_label(name, &configs, &count) != 0)
+ return (NULL);
+
+ ASSERT3U(count, !=, 0);
+ best_txg = 0;
+ for (i = 0; i < count; i++) {
+ uint64_t txg;
+
+ VERIFY(nvlist_lookup_uint64(configs[i], ZPOOL_CONFIG_POOL_TXG,
+ &txg) == 0);
+ if (txg > best_txg) {
+ best_txg = txg;
+ best_cfg = configs[i];
+ }
+ }
+
+ nchildren = 1;
+ nvlist_lookup_uint64(best_cfg, ZPOOL_CONFIG_VDEV_CHILDREN, &nchildren);
+ holes = NULL;
+ nvlist_lookup_uint64_array(best_cfg, ZPOOL_CONFIG_HOLE_ARRAY,
+ &holes, &nholes);
+
+ tops = kmem_zalloc(nchildren * sizeof (void *), KM_SLEEP);
+ for (i = 0; i < nchildren; i++) {
+ if (i >= count)
+ break;
+ if (configs[i] == NULL)
+ continue;
+ VERIFY(nvlist_lookup_nvlist(configs[i], ZPOOL_CONFIG_VDEV_TREE,
+ &nvtop) == 0);
+ nvlist_dup(nvtop, &tops[i], KM_SLEEP);
+ }
+ for (i = 0; holes != NULL && i < nholes; i++) {
+ if (i >= nchildren)
+ continue;
+ if (tops[holes[i]] != NULL)
+ continue;
+ nvlist_alloc(&tops[holes[i]], NV_UNIQUE_NAME, KM_SLEEP);
+ VERIFY(nvlist_add_string(tops[holes[i]], ZPOOL_CONFIG_TYPE,
+ VDEV_TYPE_HOLE) == 0);
+ VERIFY(nvlist_add_uint64(tops[holes[i]], ZPOOL_CONFIG_ID,
+ holes[i]) == 0);
+ VERIFY(nvlist_add_uint64(tops[holes[i]], ZPOOL_CONFIG_GUID,
+ 0) == 0);
+ }
+ for (i = 0; i < nchildren; i++) {
+ if (tops[i] != NULL)
+ continue;
+ nvlist_alloc(&tops[i], NV_UNIQUE_NAME, KM_SLEEP);
+ VERIFY(nvlist_add_string(tops[i], ZPOOL_CONFIG_TYPE,
+ VDEV_TYPE_MISSING) == 0);
+ VERIFY(nvlist_add_uint64(tops[i], ZPOOL_CONFIG_ID,
+ i) == 0);
+ VERIFY(nvlist_add_uint64(tops[i], ZPOOL_CONFIG_GUID,
+ 0) == 0);
+ }
+
+ /*
+ * Create pool config based on the best vdev config.
+ */
+ nvlist_dup(best_cfg, &config, KM_SLEEP);
+
+ /*
+ * Put this pool's top-level vdevs into a root vdev.
+ */
+ VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
+ &pgid) == 0);
+ VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+ VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE,
+ VDEV_TYPE_ROOT) == 0);
+ VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0);
+ VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0);
+ VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
+ tops, nchildren) == 0);
+
+ /*
+ * Replace the existing vdev_tree with the new root vdev in
+ * this pool's configuration (remove the old, add the new).
+ */
+ VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0);
+
+ /*
+ * Drop vdev config elements that should not be present at pool level.
+ */
+ nvlist_remove(config, ZPOOL_CONFIG_GUID, DATA_TYPE_UINT64);
+ nvlist_remove(config, ZPOOL_CONFIG_TOP_GUID, DATA_TYPE_UINT64);
+
+ for (i = 0; i < count; i++)
+ nvlist_free(configs[i]);
+ kmem_free(configs, count * sizeof (void *));
+ for (i = 0; i < nchildren; i++)
+ nvlist_free(tops[i]);
+ kmem_free(tops, nchildren * sizeof (void *));
+ nvlist_free(nvroot);
+ return (config);
+}
+
+int
+spa_import_rootpool(const char *name, bool checkpointrewind)
+{
+ spa_t *spa;
+ vdev_t *rvd;
+ nvlist_t *config, *nvtop;
+ uint64_t txg;
+ char *pname;
+ int error;
+
+ /*
+ * Read the label from the boot device and generate a configuration.
+ */
+ config = spa_generate_rootconf(name);
+
+ mutex_enter(&spa_namespace_lock);
+ if (config != NULL) {
+ VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
+ &pname) == 0 && strcmp(name, pname) == 0);
+ VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg)
+ == 0);
+
+ if ((spa = spa_lookup(pname)) != NULL) {
+ /*
+ * The pool could already be imported,
+ * e.g., after reboot -r.
+ */
+ if (spa->spa_state == POOL_STATE_ACTIVE) {
+ mutex_exit(&spa_namespace_lock);
+ nvlist_free(config);
+ return (0);
+ }
+
+ /*
+ * Remove the existing root pool from the namespace so
+ * that we can replace it with the correct config
+ * we just read in.
+ */
+ spa_remove(spa);
+ }
+ spa = spa_add(pname, config, NULL);
+
+ /*
+ * Set spa_ubsync.ub_version as it can be used in vdev_alloc()
+ * via spa_version().
+ */
+ if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
+ &spa->spa_ubsync.ub_version) != 0)
+ spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL;
+ } else if ((spa = spa_lookup(name)) == NULL) {
+ mutex_exit(&spa_namespace_lock);
+ nvlist_free(config);
+ cmn_err(CE_NOTE, "Cannot find the pool label for '%s'",
+ name);
+ return (EIO);
+ } else {
+ VERIFY(nvlist_dup(spa->spa_config, &config, KM_SLEEP) == 0);
+ }
+ spa->spa_is_root = B_TRUE;
+ spa->spa_import_flags = ZFS_IMPORT_VERBATIM;
+ if (checkpointrewind) {
+ spa->spa_import_flags |= ZFS_IMPORT_CHECKPOINT;
+ }
+
+ /*
+ * Build up a vdev tree based on the boot device's label config.
+ */
+ VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
+ &nvtop) == 0);
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+ error = spa_config_parse(spa, &rvd, nvtop, NULL, 0,
+ VDEV_ALLOC_ROOTPOOL);
+ spa_config_exit(spa, SCL_ALL, FTAG);
+ if (error) {
+ mutex_exit(&spa_namespace_lock);
+ nvlist_free(config);
+ cmn_err(CE_NOTE, "Can not parse the config for pool '%s'",
+ pname);
+ return (error);
+ }
+
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+ vdev_free(rvd);
+ spa_config_exit(spa, SCL_ALL, FTAG);
+ mutex_exit(&spa_namespace_lock);
+
+ nvlist_free(config);
+ return (0);
+}
+
+const char *
+spa_history_zone(void)
+{
+ return ("freebsd");
+}
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c
new file mode 100644
index 000000000000..647c1463ba14
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c
@@ -0,0 +1,700 @@
+/*
+ * Copyright (c) 2020 iXsystems, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/conf.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/errno.h>
+#include <sys/uio.h>
+#include <sys/buf.h>
+#include <sys/file.h>
+#include <sys/kmem.h>
+#include <sys/conf.h>
+#include <sys/cmn_err.h>
+#include <sys/stat.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zfs_vfsops.h>
+#include <sys/zfs_znode.h>
+#include <sys/zap.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/vdev.h>
+#include <sys/vdev_impl.h>
+#include <sys/dmu.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_prop.h>
+#include <sys/dsl_deleg.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_impl.h>
+#include <sys/dmu_tx.h>
+#include <sys/sunddi.h>
+#include <sys/policy.h>
+#include <sys/zone.h>
+#include <sys/nvpair.h>
+#include <sys/mount.h>
+#include <sys/taskqueue.h>
+#include <sys/sdt.h>
+#include <sys/fs/zfs.h>
+#include <sys/zfs_ctldir.h>
+#include <sys/zfs_dir.h>
+#include <sys/zfs_onexit.h>
+#include <sys/zvol.h>
+#include <sys/dsl_scan.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_send.h>
+#include <sys/dsl_destroy.h>
+#include <sys/dsl_bookmark.h>
+#include <sys/dsl_userhold.h>
+#include <sys/zfeature.h>
+#include <sys/zcp.h>
+#include <sys/zio_checksum.h>
+#include <sys/vdev_removal.h>
+#include <sys/dsl_crypt.h>
+
+#include <sys/zfs_ioctl_compat.h>
+#include <sys/zfs_context.h>
+
+#include <sys/arc_impl.h>
+#include <sys/dsl_pool.h>
+
+
+/* BEGIN CSTYLED */
+SYSCTL_DECL(_vfs_zfs);
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, arc, CTLFLAG_RW, 0, "ZFS adaptive replacement cache");
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, condense, CTLFLAG_RW, 0, "ZFS condense");
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, dbuf, CTLFLAG_RW, 0, "ZFS disk buf cache");
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, dbuf_cache, CTLFLAG_RW, 0, "ZFS disk buf cache");
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, deadman, CTLFLAG_RW, 0, "ZFS deadman");
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, dedup, CTLFLAG_RW, 0, "ZFS dedup");
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, l2arc, CTLFLAG_RW, 0, "ZFS l2arc");
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, livelist, CTLFLAG_RW, 0, "ZFS livelist");
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, lua, CTLFLAG_RW, 0, "ZFS lua");
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, metaslab, CTLFLAG_RW, 0, "ZFS metaslab");
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, mg, CTLFLAG_RW, 0, "ZFS metaslab group");
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, multihost, CTLFLAG_RW, 0, "ZFS multihost protection");
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, prefetch, CTLFLAG_RW, 0, "ZFS prefetch");
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, reconstruct, CTLFLAG_RW, 0, "ZFS reconstruct");
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, recv, CTLFLAG_RW, 0, "ZFS receive");
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, send, CTLFLAG_RW, 0, "ZFS send");
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, spa, CTLFLAG_RW, 0, "ZFS space allocation");
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, trim, CTLFLAG_RW, 0, "ZFS TRIM");
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, txg, CTLFLAG_RW, 0, "ZFS transaction group");
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, vdev, CTLFLAG_RW, 0, "ZFS VDEV");
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, vnops, CTLFLAG_RW, 0, "ZFS VNOPS");
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, zevent, CTLFLAG_RW, 0, "ZFS event");
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, zil, CTLFLAG_RW, 0, "ZFS ZIL");
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, zio, CTLFLAG_RW, 0, "ZFS ZIO");
+
+SYSCTL_NODE(_vfs_zfs_livelist, OID_AUTO, condense, CTLFLAG_RW, 0,
+ "ZFS livelist condense");
+SYSCTL_NODE(_vfs_zfs_vdev, OID_AUTO, cache, CTLFLAG_RW, 0, "ZFS VDEV Cache");
+SYSCTL_NODE(_vfs_zfs_vdev, OID_AUTO, file, CTLFLAG_RW, 0, "ZFS VDEV file");
+SYSCTL_NODE(_vfs_zfs_vdev, OID_AUTO, mirror, CTLFLAG_RD, 0,
+ "ZFS VDEV mirror");
+
+SYSCTL_DECL(_vfs_zfs_version);
+SYSCTL_CONST_STRING(_vfs_zfs_version, OID_AUTO, module, CTLFLAG_RD,
+ (ZFS_META_VERSION "-" ZFS_META_RELEASE), "OpenZFS module version");
+
+extern arc_state_t ARC_anon;
+extern arc_state_t ARC_mru;
+extern arc_state_t ARC_mru_ghost;
+extern arc_state_t ARC_mfu;
+extern arc_state_t ARC_mfu_ghost;
+extern arc_state_t ARC_l2c_only;
+
+/*
+ * minimum lifespan of a prefetch block in clock ticks
+ * (initialized in arc_init())
+ */
+
+/* arc.c */
+
+/* legacy compat */
+extern uint64_t l2arc_write_max; /* def max write size */
+extern uint64_t l2arc_write_boost; /* extra warmup write */
+extern uint64_t l2arc_headroom; /* # of dev writes */
+extern uint64_t l2arc_headroom_boost;
+extern uint64_t l2arc_feed_secs; /* interval seconds */
+extern uint64_t l2arc_feed_min_ms; /* min interval msecs */
+extern int l2arc_noprefetch; /* don't cache prefetch bufs */
+extern int l2arc_feed_again; /* turbo warmup */
+extern int l2arc_norw; /* no reads during writes */
+
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_max, CTLFLAG_RW,
+ &l2arc_write_max, 0, "max write size (LEGACY)");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_boost, CTLFLAG_RW,
+ &l2arc_write_boost, 0, "extra write during warmup (LEGACY)");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_headroom, CTLFLAG_RW,
+ &l2arc_headroom, 0, "number of dev writes (LEGACY)");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_secs, CTLFLAG_RW,
+ &l2arc_feed_secs, 0, "interval seconds (LEGACY)");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_min_ms, CTLFLAG_RW,
+ &l2arc_feed_min_ms, 0, "min interval milliseconds (LEGACY)");
+
+SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_noprefetch, CTLFLAG_RW,
+ &l2arc_noprefetch, 0, "don't cache prefetch bufs (LEGACY)");
+SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_feed_again, CTLFLAG_RW,
+ &l2arc_feed_again, 0, "turbo warmup (LEGACY)");
+SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_norw, CTLFLAG_RW,
+ &l2arc_norw, 0, "no reads during writes (LEGACY)");
+#if 0
+extern int zfs_compressed_arc_enabled;
+SYSCTL_INT(_vfs_zfs, OID_AUTO, compressed_arc_enabled, CTLFLAG_RW,
+ &zfs_compressed_arc_enabled, 1, "compressed arc buffers (LEGACY)");
+#endif
+
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_size, CTLFLAG_RD,
+ &ARC_anon.arcs_size.rc_count, 0, "size of anonymous state");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_metadata_esize, CTLFLAG_RD,
+ &ARC_anon.arcs_esize[ARC_BUFC_METADATA].rc_count, 0,
+ "size of anonymous state");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_data_esize, CTLFLAG_RD,
+ &ARC_anon.arcs_esize[ARC_BUFC_DATA].rc_count, 0,
+ "size of anonymous state");
+
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_size, CTLFLAG_RD,
+ &ARC_mru.arcs_size.rc_count, 0, "size of mru state");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_metadata_esize, CTLFLAG_RD,
+ &ARC_mru.arcs_esize[ARC_BUFC_METADATA].rc_count, 0,
+ "size of metadata in mru state");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_data_esize, CTLFLAG_RD,
+ &ARC_mru.arcs_esize[ARC_BUFC_DATA].rc_count, 0,
+ "size of data in mru state");
+
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_size, CTLFLAG_RD,
+ &ARC_mru_ghost.arcs_size.rc_count, 0, "size of mru ghost state");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_metadata_esize, CTLFLAG_RD,
+ &ARC_mru_ghost.arcs_esize[ARC_BUFC_METADATA].rc_count, 0,
+ "size of metadata in mru ghost state");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_data_esize, CTLFLAG_RD,
+ &ARC_mru_ghost.arcs_esize[ARC_BUFC_DATA].rc_count, 0,
+ "size of data in mru ghost state");
+
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_size, CTLFLAG_RD,
+ &ARC_mfu.arcs_size.rc_count, 0, "size of mfu state");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_metadata_esize, CTLFLAG_RD,
+ &ARC_mfu.arcs_esize[ARC_BUFC_METADATA].rc_count, 0,
+ "size of metadata in mfu state");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_data_esize, CTLFLAG_RD,
+ &ARC_mfu.arcs_esize[ARC_BUFC_DATA].rc_count, 0,
+ "size of data in mfu state");
+
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_size, CTLFLAG_RD,
+ &ARC_mfu_ghost.arcs_size.rc_count, 0, "size of mfu ghost state");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_metadata_esize, CTLFLAG_RD,
+ &ARC_mfu_ghost.arcs_esize[ARC_BUFC_METADATA].rc_count, 0,
+ "size of metadata in mfu ghost state");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_data_esize, CTLFLAG_RD,
+ &ARC_mfu_ghost.arcs_esize[ARC_BUFC_DATA].rc_count, 0,
+ "size of data in mfu ghost state");
+
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2c_only_size, CTLFLAG_RD,
+ &ARC_l2c_only.arcs_size.rc_count, 0, "size of l2c_only state");
+
+static int
+sysctl_vfs_zfs_arc_no_grow_shift(SYSCTL_HANDLER_ARGS)
+{
+ int err, val;
+
+ val = arc_no_grow_shift;
+ err = sysctl_handle_int(oidp, &val, 0, req);
+ if (err != 0 || req->newptr == NULL)
+ return (err);
+
+ if (val < 0 || val >= arc_shrink_shift)
+ return (EINVAL);
+
+ arc_no_grow_shift = val;
+ return (0);
+}
+
+SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_no_grow_shift,
+ CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, NULL, sizeof (int),
+ sysctl_vfs_zfs_arc_no_grow_shift, "I",
+ "log2(fraction of ARC which must be free to allow growing)");
+
+int
+param_set_arc_long(SYSCTL_HANDLER_ARGS)
+{
+ int err;
+
+ err = sysctl_handle_long(oidp, arg1, 0, req);
+ if (err != 0 || req->newptr == NULL)
+ return (err);
+
+ arc_tuning_update(B_TRUE);
+
+ return (0);
+}
+
+int
+param_set_arc_int(SYSCTL_HANDLER_ARGS)
+{
+ int err;
+
+ err = sysctl_handle_int(oidp, arg1, 0, req);
+ if (err != 0 || req->newptr == NULL)
+ return (err);
+
+ arc_tuning_update(B_TRUE);
+
+ return (0);
+}
+
+SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_min,
+ CTLTYPE_ULONG | CTLFLAG_RWTUN | CTLFLAG_MPSAFE,
+ &zfs_arc_min, sizeof (zfs_arc_min), param_set_arc_long, "LU",
+ "min arc size (LEGACY)");
+SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_max,
+ CTLTYPE_ULONG | CTLFLAG_RWTUN | CTLFLAG_MPSAFE,
+ &zfs_arc_max, sizeof (zfs_arc_max), param_set_arc_long, "LU",
+ "max arc size (LEGACY)");
+
+/* dbuf.c */
+
+
+/* dmu.c */
+
+/* dmu_zfetch.c */
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, zfetch, CTLFLAG_RW, 0, "ZFS ZFETCH (LEGACY)");
+
+/* max bytes to prefetch per stream (default 8MB) */
+extern uint32_t zfetch_max_distance;
+SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, max_distance, CTLFLAG_RWTUN,
+ &zfetch_max_distance, 0, "Max bytes to prefetch per stream (LEGACY)");
+
+/* max bytes to prefetch indirects for per stream (default 64MB) */
+extern uint32_t zfetch_max_idistance;
+SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, max_idistance, CTLFLAG_RWTUN,
+ &zfetch_max_idistance, 0,
+ "Max bytes to prefetch indirects for per stream (LEGACY)");
+
+/* dsl_pool.c */
+
+/* dnode.c */
+extern int zfs_default_bs;
+SYSCTL_INT(_vfs_zfs, OID_AUTO, default_bs, CTLFLAG_RWTUN,
+ &zfs_default_bs, 0, "Default dnode block shift");
+
+extern int zfs_default_ibs;
+SYSCTL_INT(_vfs_zfs, OID_AUTO, default_ibs, CTLFLAG_RWTUN,
+ &zfs_default_ibs, 0, "Default dnode indirect block shift");
+
+
+/* dsl_scan.c */
+
+/* metaslab.c */
+
+/*
+ * In pools where the log space map feature is not enabled we touch
+ * multiple metaslabs (and their respective space maps) with each
+ * transaction group. Thus, we benefit from having a small space map
+ * block size since it allows us to issue more I/O operations scattered
+ * around the disk. So a sane default for the space map block size
+ * is 8~16K.
+ */
+extern int zfs_metaslab_sm_blksz_no_log;
+SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, sm_blksz_no_log, CTLFLAG_RDTUN,
+ &zfs_metaslab_sm_blksz_no_log, 0,
+ "Block size for space map in pools with log space map disabled. "
+ "Power of 2 and greater than 4096.");
+
+/*
+ * When the log space map feature is enabled, we accumulate a lot of
+ * changes per metaslab that are flushed once in a while so we benefit
+ * from a bigger block size like 128K for the metaslab space maps.
+ */
+extern int zfs_metaslab_sm_blksz_with_log;
+SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, sm_blksz_with_log, CTLFLAG_RDTUN,
+ &zfs_metaslab_sm_blksz_with_log, 0,
+ "Block size for space map in pools with log space map enabled. "
+ "Power of 2 and greater than 4096.");
+
+/*
+ * The in-core space map representation is more compact than its on-disk form.
+ * The zfs_condense_pct determines how much more compact the in-core
+ * space map representation must be before we compact it on-disk.
+ * Values should be greater than or equal to 100.
+ */
+extern int zfs_condense_pct;
+SYSCTL_INT(_vfs_zfs, OID_AUTO, condense_pct, CTLFLAG_RWTUN,
+ &zfs_condense_pct, 0,
+ "Condense on-disk spacemap when it is more than this many percents"
+ " of in-memory counterpart");
+
+extern int zfs_remove_max_segment;
+SYSCTL_INT(_vfs_zfs, OID_AUTO, remove_max_segment, CTLFLAG_RWTUN,
+ &zfs_remove_max_segment, 0, "Largest contiguous segment ZFS will attempt to"
+ " allocate when removing a device");
+
+extern int zfs_removal_suspend_progress;
+SYSCTL_INT(_vfs_zfs, OID_AUTO, removal_suspend_progress, CTLFLAG_RWTUN,
+ &zfs_removal_suspend_progress, 0, "Ensures certain actions can happen while"
+ " in the middle of a removal");
+
+
+/*
+ * Minimum size which forces the dynamic allocator to change
+ * its allocation strategy. Once the space map cannot satisfy
+ * an allocation of this size then it switches to using a more
+ * aggressive strategy (i.e. search by size rather than offset).
+ */
+extern uint64_t metaslab_df_alloc_threshold;
+SYSCTL_QUAD(_vfs_zfs_metaslab, OID_AUTO, df_alloc_threshold, CTLFLAG_RWTUN,
+ &metaslab_df_alloc_threshold, 0,
+ "Minimum size which forces the dynamic allocator to change it's allocation strategy");
+
+/*
+ * The minimum free space, in percent, which must be available
+ * in a space map to continue allocations in a first-fit fashion.
+ * Once the space map's free space drops below this level we dynamically
+ * switch to using best-fit allocations.
+ */
+extern int metaslab_df_free_pct;
+SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, df_free_pct, CTLFLAG_RWTUN,
+ &metaslab_df_free_pct, 0,
+ "The minimum free space, in percent, which must be available in a "
+ "space map to continue allocations in a first-fit fashion");
+
+/*
+ * Percentage of all cpus that can be used by the metaslab taskq.
+ */
+extern int metaslab_load_pct;
+SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, load_pct, CTLFLAG_RWTUN,
+ &metaslab_load_pct, 0,
+ "Percentage of cpus that can be used by the metaslab taskq");
+
+/*
+ * Max number of metaslabs per group to preload.
+ */
+extern int metaslab_preload_limit;
+SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, preload_limit, CTLFLAG_RWTUN,
+ &metaslab_preload_limit, 0,
+ "Max number of metaslabs per group to preload");
+
+/* refcount.c */
+extern int reference_tracking_enable;
+SYSCTL_INT(_vfs_zfs, OID_AUTO, reference_tracking_enable, CTLFLAG_RDTUN,
+ &reference_tracking_enable, 0,
+ "Track reference holders to refcount_t objects, used mostly by ZFS");
+
+/* spa.c */
+extern int zfs_ccw_retry_interval;
+SYSCTL_INT(_vfs_zfs, OID_AUTO, ccw_retry_interval, CTLFLAG_RWTUN,
+ &zfs_ccw_retry_interval, 0,
+ "Configuration cache file write, retry after failure, interval (seconds)");
+
+extern uint64_t zfs_max_missing_tvds_cachefile;
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, max_missing_tvds_cachefile, CTLFLAG_RWTUN,
+ &zfs_max_missing_tvds_cachefile, 0,
+ "allow importing pools with missing top-level vdevs in cache file");
+
+extern uint64_t zfs_max_missing_tvds_scan;
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, max_missing_tvds_scan, CTLFLAG_RWTUN,
+ &zfs_max_missing_tvds_scan, 0,
+ "allow importing pools with missing top-level vdevs during scan");
+
+/* spa_misc.c */
+extern int zfs_flags;
+static int
+sysctl_vfs_zfs_debug_flags(SYSCTL_HANDLER_ARGS)
+{
+ int err, val;
+
+ val = zfs_flags;
+ err = sysctl_handle_int(oidp, &val, 0, req);
+ if (err != 0 || req->newptr == NULL)
+ return (err);
+
+ /*
+ * ZFS_DEBUG_MODIFY must be enabled prior to boot so all
+ * arc buffers in the system have the necessary additional
+ * checksum data. However, it is safe to disable at any
+ * time.
+ */
+ if (!(zfs_flags & ZFS_DEBUG_MODIFY))
+ val &= ~ZFS_DEBUG_MODIFY;
+ zfs_flags = val;
+
+ return (0);
+}
+
+SYSCTL_PROC(_vfs_zfs, OID_AUTO, debugflags,
+ CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RWTUN, NULL, 0,
+ sysctl_vfs_zfs_debug_flags, "IU", "Debug flags for ZFS testing.");
+
+int
+param_set_deadman_synctime(SYSCTL_HANDLER_ARGS)
+{
+ unsigned long val;
+ int err;
+
+ val = zfs_deadman_synctime_ms;
+ err = sysctl_handle_long(oidp, &val, 0, req);
+ if (err != 0 || req->newptr == NULL)
+ return (err);
+ zfs_deadman_synctime_ms = val;
+
+ spa_set_deadman_synctime(MSEC2NSEC(zfs_deadman_synctime_ms));
+
+ return (0);
+}
+
+int
+param_set_deadman_ziotime(SYSCTL_HANDLER_ARGS)
+{
+ unsigned long val;
+ int err;
+
+ val = zfs_deadman_ziotime_ms;
+ err = sysctl_handle_long(oidp, &val, 0, req);
+ if (err != 0 || req->newptr == NULL)
+ return (err);
+ zfs_deadman_ziotime_ms = val;
+
+ spa_set_deadman_ziotime(MSEC2NSEC(zfs_deadman_ziotime_ms));
+
+ return (0);
+}
+
+int
+param_set_deadman_failmode(SYSCTL_HANDLER_ARGS)
+{
+ char buf[16];
+ int rc;
+
+ if (req->newptr == NULL)
+ strlcpy(buf, zfs_deadman_failmode, sizeof (buf));
+
+ rc = sysctl_handle_string(oidp, buf, sizeof (buf), req);
+ if (rc || req->newptr == NULL)
+ return (rc);
+ if (strcmp(buf, zfs_deadman_failmode) == 0)
+ return (0);
+ if (!strcmp(buf, "wait"))
+ zfs_deadman_failmode = "wait";
+ if (!strcmp(buf, "continue"))
+ zfs_deadman_failmode = "continue";
+ if (!strcmp(buf, "panic"))
+ zfs_deadman_failmode = "panic";
+
+ return (-param_set_deadman_failmode_common(buf));
+}
+
+
+/* spacemap.c */
+extern int space_map_ibs;
+SYSCTL_INT(_vfs_zfs, OID_AUTO, space_map_ibs, CTLFLAG_RWTUN,
+ &space_map_ibs, 0, "Space map indirect block shift");
+
+
+/* vdev.c */
+int
+param_set_min_auto_ashift(SYSCTL_HANDLER_ARGS)
+{
+ uint64_t val;
+ int err;
+
+ val = zfs_vdev_min_auto_ashift;
+ err = sysctl_handle_64(oidp, &val, 0, req);
+ if (err != 0 || req->newptr == NULL)
+ return (SET_ERROR(err));
+
+ if (val < ASHIFT_MIN || val > zfs_vdev_max_auto_ashift)
+ return (SET_ERROR(EINVAL));
+
+ zfs_vdev_min_auto_ashift = val;
+
+ return (0);
+}
+
+int
+param_set_max_auto_ashift(SYSCTL_HANDLER_ARGS)
+{
+ uint64_t val;
+ int err;
+
+ val = zfs_vdev_max_auto_ashift;
+ err = sysctl_handle_64(oidp, &val, 0, req);
+ if (err != 0 || req->newptr == NULL)
+ return (SET_ERROR(err));
+
+ if (val > ASHIFT_MAX || val < zfs_vdev_min_auto_ashift)
+ return (SET_ERROR(EINVAL));
+
+ zfs_vdev_max_auto_ashift = val;
+
+ return (0);
+}
+
+SYSCTL_PROC(_vfs_zfs, OID_AUTO, min_auto_ashift,
+ CTLTYPE_U64 | CTLFLAG_RWTUN | CTLFLAG_MPSAFE,
+ &zfs_vdev_min_auto_ashift, sizeof (zfs_vdev_min_auto_ashift),
+ param_set_min_auto_ashift, "QU",
+ "Min ashift used when creating new top-level vdev. (LEGACY)");
+SYSCTL_PROC(_vfs_zfs, OID_AUTO, max_auto_ashift,
+ CTLTYPE_U64 | CTLFLAG_RWTUN | CTLFLAG_MPSAFE,
+ &zfs_vdev_max_auto_ashift, sizeof (zfs_vdev_max_auto_ashift),
+ param_set_max_auto_ashift, "QU",
+ "Max ashift used when optimizing for logical -> physical sector size on "
+ "new top-level vdevs. (LEGACY)");
+
+/*
+ * Since the DTL space map of a vdev is not expected to have a lot of
+ * entries, we default its block size to 4K.
+ */
+extern int zfs_vdev_dtl_sm_blksz;
+SYSCTL_INT(_vfs_zfs, OID_AUTO, dtl_sm_blksz, CTLFLAG_RDTUN,
+ &zfs_vdev_dtl_sm_blksz, 0,
+ "Block size for DTL space map. Power of 2 and greater than 4096.");
+
+/*
+ * vdev-wide space maps that have lots of entries written to them at
+ * the end of each transaction can benefit from a higher I/O bandwidth
+ * (e.g. vdev_obsolete_sm), thus we default their block size to 128K.
+ */
+extern int zfs_vdev_standard_sm_blksz;
+SYSCTL_INT(_vfs_zfs, OID_AUTO, standard_sm_blksz, CTLFLAG_RDTUN,
+ &zfs_vdev_standard_sm_blksz, 0,
+ "Block size for standard space map. Power of 2 and greater than 4096.");
+
+extern int vdev_validate_skip;
+SYSCTL_INT(_vfs_zfs, OID_AUTO, validate_skip, CTLFLAG_RDTUN,
+ &vdev_validate_skip, 0,
+ "Enable to bypass vdev_validate().");
+
+
+/* vdev_cache.c */
+
+/* vdev_mirror.c */
+/*
+ * The load configuration settings below are tuned by default for
+ * the case where all devices are of the same rotational type.
+ *
+ * If there is a mixture of rotating and non-rotating media, setting
+ * non_rotating_seek_inc to 0 may well provide better results as it
+ * will direct more reads to the non-rotating vdevs which are more
+ * likely to have higher performance.
+ */
+
+
+/* vdev_queue.c */
+#define ZFS_VDEV_QUEUE_KNOB_MIN(name) \
+extern uint32_t zfs_vdev_ ## name ## _min_active; \
+SYSCTL_UINT(_vfs_zfs_vdev, OID_AUTO, name ## _min_active, CTLFLAG_RWTUN,\
+ &zfs_vdev_ ## name ## _min_active, 0, \
+ "Initial number of I/O requests of type " #name \
+ " active for each device");
+
+#define ZFS_VDEV_QUEUE_KNOB_MAX(name) \
+extern uint32_t zfs_vdev_ ## name ## _max_active; \
+SYSCTL_UINT(_vfs_zfs_vdev, OID_AUTO, name ## _max_active, CTLFLAG_RWTUN, \
+ &zfs_vdev_ ## name ## _max_active, 0, \
+ "Maximum number of I/O requests of type " #name \
+ " active for each device");
+
+
+#undef ZFS_VDEV_QUEUE_KNOB
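The ZFS_VDEV_QUEUE_KNOB_MIN/MAX helpers above are only defined here; no instantiation appears in this hunk. As a sketch of what a hypothetical instantiation would expand to, ZFS_VDEV_QUEUE_KNOB_MIN(sync_read) (the name sync_read is chosen purely for illustration) produces:

    extern uint32_t zfs_vdev_sync_read_min_active;
    SYSCTL_UINT(_vfs_zfs_vdev, OID_AUTO, sync_read_min_active, CTLFLAG_RWTUN,
        &zfs_vdev_sync_read_min_active, 0,
        "Initial number of I/O requests of type sync_read active for each device");

i.e. a vfs.zfs.vdev.sync_read_min_active tunable backed by the corresponding zfs_vdev_*_min_active variable.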
+
+extern uint32_t zfs_vdev_max_active;
+SYSCTL_UINT(_vfs_zfs, OID_AUTO, top_maxinflight, CTLFLAG_RWTUN,
+ &zfs_vdev_max_active, 0,
+ "The maximum number of I/Os of all types active for each device. (LEGACY)");
+
+extern int zfs_vdev_def_queue_depth;
+SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, def_queue_depth, CTLFLAG_RWTUN,
+ &zfs_vdev_def_queue_depth, 0,
+ "Default queue depth for each allocator");
+
+/*extern uint64_t zfs_multihost_history;
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, multihost_history, CTLFLAG_RWTUN,
+ &zfs_multihost_history, 0,
+ "Historical staticists for the last N multihost updates");*/
+
+#ifdef notyet
+SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, trim_on_init, CTLFLAG_RW,
+ &vdev_trim_on_init, 0, "Enable/disable full vdev trim on initialisation");
+#endif
+
+
+/* zio.c */
+#if defined(__LP64__)
+int zio_use_uma = 1;
+#else
+int zio_use_uma = 0;
+#endif
+
+SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, use_uma, CTLFLAG_RDTUN, &zio_use_uma, 0,
+ "Use uma(9) for ZIO allocations");
+SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, exclude_metadata, CTLFLAG_RDTUN, &zio_exclude_metadata, 0,
+ "Exclude metadata buffers from dumps as well");
+
+int
+param_set_slop_shift(SYSCTL_HANDLER_ARGS)
+{
+ int val;
+ int err;
+
+ val = *(int *)arg1;
+
+ err = sysctl_handle_int(oidp, &val, 0, req);
+ if (err != 0 || req->newptr == NULL)
+ return (err);
+
+ if (val < 1 || val > 31)
+ return (EINVAL);
+
+ *(int *)arg1 = val;
+
+ return (0);
+}
+
+int
+param_set_multihost_interval(SYSCTL_HANDLER_ARGS)
+{
+ int err;
+
+ err = sysctl_handle_long(oidp, arg1, 0, req);
+ if (err != 0 || req->newptr == NULL)
+ return (err);
+
+ if (spa_mode_global != SPA_MODE_UNINIT)
+ mmp_signal_all_threads();
+
+ return (0);
+}
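param_set_slop_shift() and param_set_multihost_interval() are generic handlers; their registration is not part of this hunk. A hypothetical wiring, following the same SYSCTL_PROC pattern used for arc_min/arc_max earlier in this file (the backing variable and description string here are assumptions for illustration only):

    /* Hypothetical registration -- the real one lives outside this hunk. */
    extern int spa_slop_shift;
    SYSCTL_PROC(_vfs_zfs, OID_AUTO, spa_slop_shift,
        CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE,
        &spa_slop_shift, sizeof (spa_slop_shift), param_set_slop_shift, "I",
        "Reserved free space in pool");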
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/vdev_file.c b/sys/contrib/openzfs/module/os/freebsd/zfs/vdev_file.c
new file mode 100644
index 000000000000..825bd706e0c0
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/vdev_file.c
@@ -0,0 +1,354 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2020 by Delphix. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/file.h>
+#include <sys/vdev_file.h>
+#include <sys/vdev_impl.h>
+#include <sys/zio.h>
+#include <sys/fs/zfs.h>
+#include <sys/fm/fs/zfs.h>
+#include <sys/abd.h>
+#include <sys/stat.h>
+
+/*
+ * Virtual device vector for files.
+ */
+
+static taskq_t *vdev_file_taskq;
+
+unsigned long vdev_file_logical_ashift = SPA_MINBLOCKSHIFT;
+unsigned long vdev_file_physical_ashift = SPA_MINBLOCKSHIFT;
+
+void
+vdev_file_init(void)
+{
+ vdev_file_taskq = taskq_create("z_vdev_file", MAX(max_ncpus, 16),
+ minclsyspri, max_ncpus, INT_MAX, 0);
+}
+
+void
+vdev_file_fini(void)
+{
+ taskq_destroy(vdev_file_taskq);
+}
+
+static void
+vdev_file_hold(vdev_t *vd)
+{
+ ASSERT(vd->vdev_path != NULL);
+}
+
+static void
+vdev_file_rele(vdev_t *vd)
+{
+ ASSERT(vd->vdev_path != NULL);
+}
+
+static mode_t
+vdev_file_open_mode(spa_mode_t spa_mode)
+{
+ mode_t mode = 0;
+
+ if ((spa_mode & SPA_MODE_READ) && (spa_mode & SPA_MODE_WRITE)) {
+ mode = O_RDWR;
+ } else if (spa_mode & SPA_MODE_READ) {
+ mode = O_RDONLY;
+ } else if (spa_mode & SPA_MODE_WRITE) {
+ mode = O_WRONLY;
+ }
+
+ return (mode | O_LARGEFILE);
+}
+
+static int
+vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
+ uint64_t *logical_ashift, uint64_t *physical_ashift)
+{
+ vdev_file_t *vf;
+ zfs_file_t *fp;
+ zfs_file_attr_t zfa;
+ int error;
+
+ /*
+ * Rotational optimizations only make sense on block devices.
+ */
+ vd->vdev_nonrot = B_TRUE;
+
+ /*
+ * Allow TRIM on file based vdevs. This may not always be supported,
+ * since it depends on your kernel version and underlying filesystem
+ * type but it is always safe to attempt.
+ */
+ vd->vdev_has_trim = B_TRUE;
+
+ /*
+ * Disable secure TRIM on file based vdevs. There is no way to
+ * request this behavior from the underlying filesystem.
+ */
+ vd->vdev_has_securetrim = B_FALSE;
+
+ /*
+ * We must have a pathname, and it must be absolute.
+ */
+ if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') {
+ vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
+ return (SET_ERROR(EINVAL));
+ }
+
+ /*
+ * Reopen the device if it's not currently open. Otherwise,
+ * just update the physical size of the device.
+ */
+ if (vd->vdev_tsd != NULL) {
+ ASSERT(vd->vdev_reopening);
+ vf = vd->vdev_tsd;
+ goto skip_open;
+ }
+
+ vf = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_file_t), KM_SLEEP);
+
+ /*
+ * We always open the files from the root of the global zone, even if
+ * we're in a local zone. If the user has gotten to this point, the
+ * administrator has already decided that the pool should be available
+ * to local zone users, so the underlying devices should be as well.
+ */
+ ASSERT(vd->vdev_path != NULL && vd->vdev_path[0] == '/');
+
+ error = zfs_file_open(vd->vdev_path,
+ vdev_file_open_mode(spa_mode(vd->vdev_spa)), 0, &fp);
+ if (error) {
+ vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
+ return (error);
+ }
+
+ vf->vf_file = fp;
+
+#ifdef _KERNEL
+ /*
+ * Make sure it's a regular file.
+ */
+ if (zfs_file_getattr(fp, &zfa)) {
+ return (SET_ERROR(ENODEV));
+ }
+ if (!S_ISREG(zfa.zfa_mode)) {
+ vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
+ return (SET_ERROR(ENODEV));
+ }
+#endif
+
+skip_open:
+
+ error = zfs_file_getattr(vf->vf_file, &zfa);
+ if (error) {
+ vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
+ return (error);
+ }
+
+ *max_psize = *psize = zfa.zfa_size;
+ *logical_ashift = vdev_file_logical_ashift;
+ *physical_ashift = vdev_file_physical_ashift;
+
+ return (0);
+}
+
+static void
+vdev_file_close(vdev_t *vd)
+{
+ vdev_file_t *vf = vd->vdev_tsd;
+
+ if (vd->vdev_reopening || vf == NULL)
+ return;
+
+ if (vf->vf_file != NULL) {
+ zfs_file_close(vf->vf_file);
+ }
+
+ vd->vdev_delayed_close = B_FALSE;
+ kmem_free(vf, sizeof (vdev_file_t));
+ vd->vdev_tsd = NULL;
+}
+
+/*
+ * Implements the interrupt side for file vdev types. This routine will be
+ * called when the I/O completes allowing us to transfer the I/O to the
+ * interrupt taskqs. For consistency, the code structure mimics disk vdev
+ * types.
+ */
+static void
+vdev_file_io_intr(zio_t *zio)
+{
+ zio_delay_interrupt(zio);
+}
+
+static void
+vdev_file_io_strategy(void *arg)
+{
+ zio_t *zio = arg;
+ vdev_t *vd = zio->io_vd;
+ vdev_file_t *vf;
+ void *buf;
+ ssize_t resid;
+ loff_t off;
+ ssize_t size;
+ int err;
+
+ off = zio->io_offset;
+ size = zio->io_size;
+ resid = 0;
+
+ vf = vd->vdev_tsd;
+
+ ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);
+ if (zio->io_type == ZIO_TYPE_READ) {
+ buf = abd_borrow_buf(zio->io_abd, zio->io_size);
+ err = zfs_file_pread(vf->vf_file, buf, size, off, &resid);
+ abd_return_buf_copy(zio->io_abd, buf, size);
+ } else {
+ buf = abd_borrow_buf_copy(zio->io_abd, zio->io_size);
+ err = zfs_file_pwrite(vf->vf_file, buf, size, off, &resid);
+ abd_return_buf(zio->io_abd, buf, size);
+ }
+ zio->io_error = err;
+ if (resid != 0 && zio->io_error == 0)
+ zio->io_error = SET_ERROR(ENOSPC);
+
+ vdev_file_io_intr(zio);
+}
+
+static void
+vdev_file_io_start(zio_t *zio)
+{
+ vdev_t *vd = zio->io_vd;
+ vdev_file_t *vf = vd->vdev_tsd;
+
+ if (zio->io_type == ZIO_TYPE_IOCTL) {
+ /* XXPOLICY */
+ if (!vdev_readable(vd)) {
+ zio->io_error = SET_ERROR(ENXIO);
+ zio_interrupt(zio);
+ return;
+ }
+
+ switch (zio->io_cmd) {
+ case DKIOCFLUSHWRITECACHE:
+ zio->io_error = zfs_file_fsync(vf->vf_file,
+ O_SYNC|O_DSYNC);
+ break;
+ default:
+ zio->io_error = SET_ERROR(ENOTSUP);
+ }
+
+ zio_execute(zio);
+ return;
+ } else if (zio->io_type == ZIO_TYPE_TRIM) {
+#ifdef notyet
+ int mode = 0;
+
+ ASSERT3U(zio->io_size, !=, 0);
+
+ /* XXX FreeBSD has no fallocate routine in file ops */
+ zio->io_error = zfs_file_fallocate(vf->vf_file,
+ mode, zio->io_offset, zio->io_size);
+#endif
+ zio->io_error = SET_ERROR(ENOTSUP);
+ zio_execute(zio);
+ return;
+ }
+ ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);
+ zio->io_target_timestamp = zio_handle_io_delay(zio);
+
+ VERIFY3U(taskq_dispatch(vdev_file_taskq, vdev_file_io_strategy, zio,
+ TQ_SLEEP), !=, 0);
+}
+
+/* ARGSUSED */
+static void
+vdev_file_io_done(zio_t *zio)
+{
+}
+
+vdev_ops_t vdev_file_ops = {
+ .vdev_op_init = NULL,
+ .vdev_op_fini = NULL,
+ .vdev_op_open = vdev_file_open,
+ .vdev_op_close = vdev_file_close,
+ .vdev_op_asize = vdev_default_asize,
+ .vdev_op_min_asize = vdev_default_min_asize,
+ .vdev_op_min_alloc = NULL,
+ .vdev_op_io_start = vdev_file_io_start,
+ .vdev_op_io_done = vdev_file_io_done,
+ .vdev_op_state_change = NULL,
+ .vdev_op_need_resilver = NULL,
+ .vdev_op_hold = vdev_file_hold,
+ .vdev_op_rele = vdev_file_rele,
+ .vdev_op_remap = NULL,
+ .vdev_op_xlate = vdev_default_xlate,
+ .vdev_op_rebuild_asize = NULL,
+ .vdev_op_metaslab_init = NULL,
+ .vdev_op_config_generate = NULL,
+ .vdev_op_nparity = NULL,
+ .vdev_op_ndisks = NULL,
+ .vdev_op_type = VDEV_TYPE_FILE, /* name of this vdev type */
+ .vdev_op_leaf = B_TRUE /* leaf vdev */
+};
+
+/*
+ * From userland we access disks just like files.
+ */
+#ifndef _KERNEL
+
+vdev_ops_t vdev_disk_ops = {
+ .vdev_op_init = NULL,
+ .vdev_op_fini = NULL,
+ .vdev_op_open = vdev_file_open,
+ .vdev_op_close = vdev_file_close,
+ .vdev_op_asize = vdev_default_asize,
+ .vdev_op_min_asize = vdev_default_min_asize,
+ .vdev_op_min_alloc = NULL,
+ .vdev_op_io_start = vdev_file_io_start,
+ .vdev_op_io_done = vdev_file_io_done,
+ .vdev_op_state_change = NULL,
+ .vdev_op_need_resilver = NULL,
+ .vdev_op_hold = vdev_file_hold,
+ .vdev_op_rele = vdev_file_rele,
+ .vdev_op_remap = NULL,
+ .vdev_op_xlate = vdev_default_xlate,
+ .vdev_op_rebuild_asize = NULL,
+ .vdev_op_metaslab_init = NULL,
+ .vdev_op_config_generate = NULL,
+ .vdev_op_nparity = NULL,
+ .vdev_op_ndisks = NULL,
+ .vdev_op_type = VDEV_TYPE_DISK, /* name of this vdev type */
+ .vdev_op_leaf = B_TRUE /* leaf vdev */
+};
+
+#endif
+
+ZFS_MODULE_PARAM(zfs_vdev_file, vdev_file_, logical_ashift, ULONG, ZMOD_RW,
+ "Logical ashift for file-based devices");
+ZFS_MODULE_PARAM(zfs_vdev_file, vdev_file_, physical_ashift, ULONG, ZMOD_RW,
+ "Physical ashift for file-based devices");
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/vdev_geom.c b/sys/contrib/openzfs/module/os/freebsd/zfs/vdev_geom.c
new file mode 100644
index 000000000000..c9e8e21982cf
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/vdev_geom.c
@@ -0,0 +1,1214 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2006 Pawel Jakub Dawidek <pjd@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Portions Copyright (c) 2012 Martin Matuska <mm@FreeBSD.org>
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/bio.h>
+#include <sys/file.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/vdev_impl.h>
+#include <sys/vdev_os.h>
+#include <sys/fs/zfs.h>
+#include <sys/zio.h>
+#include <geom/geom.h>
+#include <geom/geom_disk.h>
+#include <geom/geom_int.h>
+
+#ifndef g_topology_locked
+#define g_topology_locked() sx_xlocked(&topology_lock)
+#endif
+
+/*
+ * Virtual device vector for GEOM.
+ */
+
+static g_attrchanged_t vdev_geom_attrchanged;
+struct g_class zfs_vdev_class = {
+ .name = "ZFS::VDEV",
+ .version = G_VERSION,
+ .attrchanged = vdev_geom_attrchanged,
+};
+
+struct consumer_vdev_elem {
+ SLIST_ENTRY(consumer_vdev_elem) elems;
+ vdev_t *vd;
+};
+
+SLIST_HEAD(consumer_priv_t, consumer_vdev_elem);
+/* BEGIN CSTYLED */
+_Static_assert(sizeof (((struct g_consumer *)NULL)->private)
+ == sizeof (struct consumer_priv_t*),
+ "consumer_priv_t* can't be stored in g_consumer.private");
+
+DECLARE_GEOM_CLASS(zfs_vdev_class, zfs_vdev);
+
+SYSCTL_DECL(_vfs_zfs_vdev);
+/* Don't send BIO_FLUSH. */
+static int vdev_geom_bio_flush_disable;
+SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, bio_flush_disable, CTLFLAG_RWTUN,
+ &vdev_geom_bio_flush_disable, 0, "Disable BIO_FLUSH");
+/* Don't send BIO_DELETE. */
+static int vdev_geom_bio_delete_disable;
+SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, bio_delete_disable, CTLFLAG_RWTUN,
+ &vdev_geom_bio_delete_disable, 0, "Disable BIO_DELETE");
+/* END CSTYLED */
+
+/* Declare local functions */
+static void vdev_geom_detach(struct g_consumer *cp, boolean_t open_for_read);
+
+/*
+ * Thread local storage used to indicate when a thread is probing geoms
+ * for their guids. If NULL, this thread is not tasting geoms. If non-NULL,
+ * it is looking for a replacement for the vdev_t* that is its value.
+ */
+uint_t zfs_geom_probe_vdev_key;
+
+static void
+vdev_geom_set_physpath(vdev_t *vd, struct g_consumer *cp,
+ boolean_t do_null_update)
+{
+ boolean_t needs_update = B_FALSE;
+ char *physpath;
+ int error, physpath_len;
+
+ physpath_len = MAXPATHLEN;
+ physpath = g_malloc(physpath_len, M_WAITOK|M_ZERO);
+ error = g_io_getattr("GEOM::physpath", cp, &physpath_len, physpath);
+ if (error == 0) {
+ char *old_physpath;
+
+ /* g_topology lock ensures that vdev has not been closed */
+ g_topology_assert();
+ old_physpath = vd->vdev_physpath;
+ vd->vdev_physpath = spa_strdup(physpath);
+
+ if (old_physpath != NULL) {
+ needs_update = (strcmp(old_physpath,
+ vd->vdev_physpath) != 0);
+ spa_strfree(old_physpath);
+ } else
+ needs_update = do_null_update;
+ }
+ g_free(physpath);
+
+ /*
+ * If the physical path changed, update the config.
+ * Only request an update for previously unset physpaths if
+ * requested by the caller.
+ */
+ if (needs_update)
+ spa_async_request(vd->vdev_spa, SPA_ASYNC_CONFIG_UPDATE);
+
+}
+
+static void
+vdev_geom_attrchanged(struct g_consumer *cp, const char *attr)
+{
+ struct consumer_priv_t *priv;
+ struct consumer_vdev_elem *elem;
+
+ priv = (struct consumer_priv_t *)&cp->private;
+ if (SLIST_EMPTY(priv))
+ return;
+
+ SLIST_FOREACH(elem, priv, elems) {
+ vdev_t *vd = elem->vd;
+ if (strcmp(attr, "GEOM::physpath") == 0) {
+ vdev_geom_set_physpath(vd, cp, /* null_update */B_TRUE);
+ return;
+ }
+ }
+}
+
+static void
+vdev_geom_resize(struct g_consumer *cp)
+{
+ struct consumer_priv_t *priv;
+ struct consumer_vdev_elem *elem;
+ spa_t *spa;
+ vdev_t *vd;
+
+ priv = (struct consumer_priv_t *)&cp->private;
+ if (SLIST_EMPTY(priv))
+ return;
+
+ SLIST_FOREACH(elem, priv, elems) {
+ vd = elem->vd;
+ if (vd->vdev_state != VDEV_STATE_HEALTHY)
+ continue;
+ spa = vd->vdev_spa;
+ if (!spa->spa_autoexpand)
+ continue;
+ vdev_online(spa, vd->vdev_guid, ZFS_ONLINE_EXPAND, NULL);
+ }
+}
+
+static void
+vdev_geom_orphan(struct g_consumer *cp)
+{
+ struct consumer_priv_t *priv;
+ // cppcheck-suppress uninitvar
+ struct consumer_vdev_elem *elem;
+
+ g_topology_assert();
+
+ priv = (struct consumer_priv_t *)&cp->private;
+ if (SLIST_EMPTY(priv))
+ /* Vdev close in progress. Ignore the event. */
+ return;
+
+ /*
+ * Orphan callbacks occur from the GEOM event thread.
+ * Concurrent with this call, new I/O requests may be
+ * working their way through GEOM about to find out
+ * (only once executed by the g_down thread) that we've
+ * been orphaned from our disk provider. These I/Os
+ * must be retired before we can detach our consumer.
+ * This is most easily achieved by acquiring the
+ * SPA ZIO configuration lock as a writer, but doing
+ * so with the GEOM topology lock held would cause
+ * a lock order reversal. Instead, rely on the SPA's
+ * async removal support to invoke a close on this
+ * vdev once it is safe to do so.
+ */
+ // cppcheck-suppress All
+ SLIST_FOREACH(elem, priv, elems) {
+ // cppcheck-suppress uninitvar
+ vdev_t *vd = elem->vd;
+
+ vd->vdev_remove_wanted = B_TRUE;
+ spa_async_request(vd->vdev_spa, SPA_ASYNC_REMOVE);
+ }
+}
+
+static struct g_consumer *
+vdev_geom_attach(struct g_provider *pp, vdev_t *vd, boolean_t sanity)
+{
+ struct g_geom *gp;
+ struct g_consumer *cp;
+ int error;
+
+ g_topology_assert();
+
+ ZFS_LOG(1, "Attaching to %s.", pp->name);
+
+ if (sanity) {
+ if (pp->sectorsize > VDEV_PAD_SIZE || !ISP2(pp->sectorsize)) {
+ ZFS_LOG(1, "Failing attach of %s. "
+ "Incompatible sectorsize %d\n",
+ pp->name, pp->sectorsize);
+ return (NULL);
+ } else if (pp->mediasize < SPA_MINDEVSIZE) {
+ ZFS_LOG(1, "Failing attach of %s. "
+ "Incompatible mediasize %ju\n",
+ pp->name, pp->mediasize);
+ return (NULL);
+ }
+ }
+
+ /* Do we have geom already? No? Create one. */
+ LIST_FOREACH(gp, &zfs_vdev_class.geom, geom) {
+ if (gp->flags & G_GEOM_WITHER)
+ continue;
+ if (strcmp(gp->name, "zfs::vdev") != 0)
+ continue;
+ break;
+ }
+ if (gp == NULL) {
+ gp = g_new_geomf(&zfs_vdev_class, "zfs::vdev");
+ gp->orphan = vdev_geom_orphan;
+ gp->attrchanged = vdev_geom_attrchanged;
+ gp->resize = vdev_geom_resize;
+ cp = g_new_consumer(gp);
+ error = g_attach(cp, pp);
+ if (error != 0) {
+ ZFS_LOG(1, "%s(%d): g_attach failed: %d\n", __func__,
+ __LINE__, error);
+ vdev_geom_detach(cp, B_FALSE);
+ return (NULL);
+ }
+ error = g_access(cp, 1, 0, 1);
+ if (error != 0) {
+ ZFS_LOG(1, "%s(%d): g_access failed: %d\n", __func__,
+ __LINE__, error);
+ vdev_geom_detach(cp, B_FALSE);
+ return (NULL);
+ }
+ ZFS_LOG(1, "Created geom and consumer for %s.", pp->name);
+ } else {
+ /* Check if we are already connected to this provider. */
+ LIST_FOREACH(cp, &gp->consumer, consumer) {
+ if (cp->provider == pp) {
+ ZFS_LOG(1, "Found consumer for %s.", pp->name);
+ break;
+ }
+ }
+ if (cp == NULL) {
+ cp = g_new_consumer(gp);
+ error = g_attach(cp, pp);
+ if (error != 0) {
+ ZFS_LOG(1, "%s(%d): g_attach failed: %d\n",
+ __func__, __LINE__, error);
+ vdev_geom_detach(cp, B_FALSE);
+ return (NULL);
+ }
+ error = g_access(cp, 1, 0, 1);
+ if (error != 0) {
+ ZFS_LOG(1, "%s(%d): g_access failed: %d\n",
+ __func__, __LINE__, error);
+ vdev_geom_detach(cp, B_FALSE);
+ return (NULL);
+ }
+ ZFS_LOG(1, "Created consumer for %s.", pp->name);
+ } else {
+ error = g_access(cp, 1, 0, 1);
+ if (error != 0) {
+ ZFS_LOG(1, "%s(%d): g_access failed: %d\n",
+ __func__, __LINE__, error);
+ return (NULL);
+ }
+ ZFS_LOG(1, "Used existing consumer for %s.", pp->name);
+ }
+ }
+
+ if (vd != NULL)
+ vd->vdev_tsd = cp;
+
+ cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE;
+ return (cp);
+}
+
+static void
+vdev_geom_detach(struct g_consumer *cp, boolean_t open_for_read)
+{
+ struct g_geom *gp;
+
+ g_topology_assert();
+
+ ZFS_LOG(1, "Detaching from %s.",
+ cp->provider && cp->provider->name ? cp->provider->name : "NULL");
+
+ gp = cp->geom;
+ if (open_for_read)
+ g_access(cp, -1, 0, -1);
+ /* Destroy consumer on last close. */
+ if (cp->acr == 0 && cp->ace == 0) {
+ if (cp->acw > 0)
+ g_access(cp, 0, -cp->acw, 0);
+ if (cp->provider != NULL) {
+ ZFS_LOG(1, "Destroying consumer for %s.",
+ cp->provider->name ? cp->provider->name : "NULL");
+ g_detach(cp);
+ }
+ g_destroy_consumer(cp);
+ }
+ /* Destroy geom if there are no consumers left. */
+ if (LIST_EMPTY(&gp->consumer)) {
+ ZFS_LOG(1, "Destroyed geom %s.", gp->name);
+ g_wither_geom(gp, ENXIO);
+ }
+}
+
+static void
+vdev_geom_close_locked(vdev_t *vd)
+{
+ struct g_consumer *cp;
+ struct consumer_priv_t *priv;
+ struct consumer_vdev_elem *elem, *elem_temp;
+
+ g_topology_assert();
+
+ cp = vd->vdev_tsd;
+ vd->vdev_delayed_close = B_FALSE;
+ if (cp == NULL)
+ return;
+
+ ZFS_LOG(1, "Closing access to %s.", cp->provider->name);
+ KASSERT(cp->private != NULL, ("%s: cp->private is NULL", __func__));
+ priv = (struct consumer_priv_t *)&cp->private;
+ vd->vdev_tsd = NULL;
+ SLIST_FOREACH_SAFE(elem, priv, elems, elem_temp) {
+ if (elem->vd == vd) {
+ SLIST_REMOVE(priv, elem, consumer_vdev_elem, elems);
+ g_free(elem);
+ }
+ }
+
+ vdev_geom_detach(cp, B_TRUE);
+}
+
+/*
+ * Issue one or more bios to the vdev in parallel.
+ * cmds, datas, offsets, errors, and sizes are arrays of length ncmds. Each I/O
+ * operation is described by parallel entries from each array. There may be
+ * more bios actually issued than entries in the arrays.
+ */
+static void
+vdev_geom_io(struct g_consumer *cp, int *cmds, void **datas, off_t *offsets,
+ off_t *sizes, int *errors, int ncmds)
+{
+ struct bio **bios;
+ uint8_t *p;
+ off_t off, maxio, s, end;
+ int i, n_bios, j;
+ size_t bios_size;
+
+ maxio = maxphys - (maxphys % cp->provider->sectorsize);
+ n_bios = 0;
+
+ /* How many bios are required for all commands ? */
+ for (i = 0; i < ncmds; i++)
+ n_bios += (sizes[i] + maxio - 1) / maxio;
+
+ /* Allocate memory for the bios */
+ bios_size = n_bios * sizeof (struct bio *);
+ bios = kmem_zalloc(bios_size, KM_SLEEP);
+
+ /* Prepare and issue all of the bios */
+ for (i = j = 0; i < ncmds; i++) {
+ off = offsets[i];
+ p = datas[i];
+ s = sizes[i];
+ end = off + s;
+ ASSERT((off % cp->provider->sectorsize) == 0);
+ ASSERT((s % cp->provider->sectorsize) == 0);
+
+ for (; off < end; off += maxio, p += maxio, s -= maxio, j++) {
+ bios[j] = g_alloc_bio();
+ bios[j]->bio_cmd = cmds[i];
+ bios[j]->bio_done = NULL;
+ bios[j]->bio_offset = off;
+ bios[j]->bio_length = MIN(s, maxio);
+ bios[j]->bio_data = (caddr_t)p;
+ g_io_request(bios[j], cp);
+ }
+ }
+ ASSERT(j == n_bios);
+
+ /* Wait for all of the bios to complete, and clean them up */
+ for (i = j = 0; i < ncmds; i++) {
+ off = offsets[i];
+ s = sizes[i];
+ end = off + s;
+
+ for (; off < end; off += maxio, s -= maxio, j++) {
+ errors[i] = biowait(bios[j], "vdev_geom_io") ||
+ errors[i];
+ g_destroy_bio(bios[j]);
+ }
+ }
+ kmem_free(bios, bios_size);
+}
+
+/*
+ * Read the vdev config from a device. Return the number of valid labels that
+ * were found. The vdev config will be returned in config if and only if at
+ * least one valid label was found.
+ */
+static int
+vdev_geom_read_config(struct g_consumer *cp, nvlist_t **configp)
+{
+ struct g_provider *pp;
+ nvlist_t *config;
+ vdev_phys_t *vdev_lists[VDEV_LABELS];
+ char *buf;
+ size_t buflen;
+ uint64_t psize, state, txg;
+ off_t offsets[VDEV_LABELS];
+ off_t size;
+ off_t sizes[VDEV_LABELS];
+ int cmds[VDEV_LABELS];
+ int errors[VDEV_LABELS];
+ int l, nlabels;
+
+ g_topology_assert_not();
+
+ pp = cp->provider;
+ ZFS_LOG(1, "Reading config from %s...", pp->name);
+
+ psize = pp->mediasize;
+ psize = P2ALIGN(psize, (uint64_t)sizeof (vdev_label_t));
+
+ size = sizeof (*vdev_lists[0]) + pp->sectorsize -
+ ((sizeof (*vdev_lists[0]) - 1) % pp->sectorsize) - 1;
+
+ buflen = sizeof (vdev_lists[0]->vp_nvlist);
+
+ /* Create all of the IO requests */
+ for (l = 0; l < VDEV_LABELS; l++) {
+ cmds[l] = BIO_READ;
+ vdev_lists[l] = kmem_alloc(size, KM_SLEEP);
+ offsets[l] = vdev_label_offset(psize, l, 0) + VDEV_SKIP_SIZE;
+ sizes[l] = size;
+ errors[l] = 0;
+ ASSERT(offsets[l] % pp->sectorsize == 0);
+ }
+
+ /* Issue the IO requests */
+ vdev_geom_io(cp, cmds, (void**)vdev_lists, offsets, sizes, errors,
+ VDEV_LABELS);
+
+ /* Parse the labels */
+ config = *configp = NULL;
+ nlabels = 0;
+ for (l = 0; l < VDEV_LABELS; l++) {
+ if (errors[l] != 0)
+ continue;
+
+ buf = vdev_lists[l]->vp_nvlist;
+
+ if (nvlist_unpack(buf, buflen, &config, 0) != 0)
+ continue;
+
+ if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE,
+ &state) != 0 || state > POOL_STATE_L2CACHE) {
+ nvlist_free(config);
+ continue;
+ }
+
+ if (state != POOL_STATE_SPARE &&
+ state != POOL_STATE_L2CACHE &&
+ (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
+ &txg) != 0 || txg == 0)) {
+ nvlist_free(config);
+ continue;
+ }
+
+ if (*configp != NULL)
+ nvlist_free(*configp);
+ *configp = config;
+ nlabels++;
+ }
+
+ /* Free the label storage */
+ for (l = 0; l < VDEV_LABELS; l++)
+ kmem_free(vdev_lists[l], size);
+
+ return (nlabels);
+}
+
+static void
+resize_configs(nvlist_t ***configs, uint64_t *count, uint64_t id)
+{
+ nvlist_t **new_configs;
+ uint64_t i;
+
+ if (id < *count)
+ return;
+ new_configs = kmem_zalloc((id + 1) * sizeof (nvlist_t *),
+ KM_SLEEP);
+ for (i = 0; i < *count; i++)
+ new_configs[i] = (*configs)[i];
+ if (*configs != NULL)
+ kmem_free(*configs, *count * sizeof (void *));
+ *configs = new_configs;
+ *count = id + 1;
+}
+
+static void
+process_vdev_config(nvlist_t ***configs, uint64_t *count, nvlist_t *cfg,
+ const char *name, uint64_t *known_pool_guid)
+{
+ nvlist_t *vdev_tree;
+ uint64_t pool_guid;
+ uint64_t vdev_guid;
+ uint64_t id, txg, known_txg;
+ char *pname;
+
+ if (nvlist_lookup_string(cfg, ZPOOL_CONFIG_POOL_NAME, &pname) != 0 ||
+ strcmp(pname, name) != 0)
+ goto ignore;
+
+ if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_GUID, &pool_guid) != 0)
+ goto ignore;
+
+ if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_TOP_GUID, &vdev_guid) != 0)
+ goto ignore;
+
+ if (nvlist_lookup_nvlist(cfg, ZPOOL_CONFIG_VDEV_TREE, &vdev_tree) != 0)
+ goto ignore;
+
+ if (nvlist_lookup_uint64(vdev_tree, ZPOOL_CONFIG_ID, &id) != 0)
+ goto ignore;
+
+ VERIFY(nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_TXG, &txg) == 0);
+
+ if (*known_pool_guid != 0) {
+ if (pool_guid != *known_pool_guid)
+ goto ignore;
+ } else
+ *known_pool_guid = pool_guid;
+
+ resize_configs(configs, count, id);
+
+ if ((*configs)[id] != NULL) {
+ VERIFY(nvlist_lookup_uint64((*configs)[id],
+ ZPOOL_CONFIG_POOL_TXG, &known_txg) == 0);
+ if (txg <= known_txg)
+ goto ignore;
+ nvlist_free((*configs)[id]);
+ }
+
+ (*configs)[id] = cfg;
+ return;
+
+ignore:
+ nvlist_free(cfg);
+}
+
+int
+vdev_geom_read_pool_label(const char *name,
+ nvlist_t ***configs, uint64_t *count)
+{
+ struct g_class *mp;
+ struct g_geom *gp;
+ struct g_provider *pp;
+ struct g_consumer *zcp;
+ nvlist_t *vdev_cfg;
+ uint64_t pool_guid;
+ int nlabels;
+
+ DROP_GIANT();
+ g_topology_lock();
+
+ *configs = NULL;
+ *count = 0;
+ pool_guid = 0;
+ LIST_FOREACH(mp, &g_classes, class) {
+ if (mp == &zfs_vdev_class)
+ continue;
+ LIST_FOREACH(gp, &mp->geom, geom) {
+ if (gp->flags & G_GEOM_WITHER)
+ continue;
+ LIST_FOREACH(pp, &gp->provider, provider) {
+ if (pp->flags & G_PF_WITHER)
+ continue;
+ zcp = vdev_geom_attach(pp, NULL, B_TRUE);
+ if (zcp == NULL)
+ continue;
+ g_topology_unlock();
+ nlabels = vdev_geom_read_config(zcp, &vdev_cfg);
+ g_topology_lock();
+ vdev_geom_detach(zcp, B_TRUE);
+ if (nlabels == 0)
+ continue;
+ ZFS_LOG(1, "successfully read vdev config");
+
+ process_vdev_config(configs, count,
+ vdev_cfg, name, &pool_guid);
+ }
+ }
+ }
+ g_topology_unlock();
+ PICKUP_GIANT();
+
+ return (*count > 0 ? 0 : ENOENT);
+}
+
+enum match {
+ NO_MATCH = 0, /* No matching labels found */
+ TOPGUID_MATCH = 1, /* Labels match top guid, not vdev guid */
+ ZERO_MATCH = 1, /* Should never be returned */
+ ONE_MATCH = 2, /* 1 label matching the vdev_guid */
+ TWO_MATCH = 3, /* 2 labels matching the vdev_guid */
+ THREE_MATCH = 4, /* 3 labels matching the vdev_guid */
+ FULL_MATCH = 5 /* all labels match the vdev_guid */
+};
+
+static enum match
+vdev_attach_ok(vdev_t *vd, struct g_provider *pp)
+{
+ nvlist_t *config;
+ uint64_t pool_guid, top_guid, vdev_guid;
+ struct g_consumer *cp;
+ int nlabels;
+
+ cp = vdev_geom_attach(pp, NULL, B_TRUE);
+ if (cp == NULL) {
+ ZFS_LOG(1, "Unable to attach tasting instance to %s.",
+ pp->name);
+ return (NO_MATCH);
+ }
+ g_topology_unlock();
+ nlabels = vdev_geom_read_config(cp, &config);
+ g_topology_lock();
+ vdev_geom_detach(cp, B_TRUE);
+ if (nlabels == 0) {
+ ZFS_LOG(1, "Unable to read config from %s.", pp->name);
+ return (NO_MATCH);
+ }
+
+ pool_guid = 0;
+ (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid);
+ top_guid = 0;
+ (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_TOP_GUID, &top_guid);
+ vdev_guid = 0;
+ (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, &vdev_guid);
+ nvlist_free(config);
+
+ /*
+ * Check that the label's pool guid matches the desired guid.
+ * Inactive spares and L2ARCs do not have any pool guid in the label.
+ */
+ if (pool_guid != 0 && pool_guid != spa_guid(vd->vdev_spa)) {
+ ZFS_LOG(1, "pool guid mismatch for provider %s: %ju != %ju.",
+ pp->name,
+ (uintmax_t)spa_guid(vd->vdev_spa), (uintmax_t)pool_guid);
+ return (NO_MATCH);
+ }
+
+ /*
+ * Check that the label's vdev guid matches the desired guid.
+ * The second condition handles possible race on vdev detach, when
+ * remaining vdev receives GUID of destroyed top level mirror vdev.
+ */
+ if (vdev_guid == vd->vdev_guid) {
+ ZFS_LOG(1, "guids match for provider %s.", pp->name);
+ return (ZERO_MATCH + nlabels);
+ } else if (top_guid == vd->vdev_guid && vd == vd->vdev_top) {
+ ZFS_LOG(1, "top vdev guid match for provider %s.", pp->name);
+ return (TOPGUID_MATCH);
+ }
+ ZFS_LOG(1, "vdev guid mismatch for provider %s: %ju != %ju.",
+ pp->name, (uintmax_t)vd->vdev_guid, (uintmax_t)vdev_guid);
+ return (NO_MATCH);
+}
+
+static struct g_consumer *
+vdev_geom_attach_by_guids(vdev_t *vd)
+{
+ struct g_class *mp;
+ struct g_geom *gp;
+ struct g_provider *pp, *best_pp;
+ struct g_consumer *cp;
+ const char *vdpath;
+ enum match match, best_match;
+
+ g_topology_assert();
+
+ vdpath = vd->vdev_path + sizeof ("/dev/") - 1;
+ cp = NULL;
+ best_pp = NULL;
+ best_match = NO_MATCH;
+ LIST_FOREACH(mp, &g_classes, class) {
+ if (mp == &zfs_vdev_class)
+ continue;
+ LIST_FOREACH(gp, &mp->geom, geom) {
+ if (gp->flags & G_GEOM_WITHER)
+ continue;
+ LIST_FOREACH(pp, &gp->provider, provider) {
+ match = vdev_attach_ok(vd, pp);
+ if (match > best_match) {
+ best_match = match;
+ best_pp = pp;
+ } else if (match == best_match) {
+ if (strcmp(pp->name, vdpath) == 0) {
+ best_pp = pp;
+ }
+ }
+ if (match == FULL_MATCH)
+ goto out;
+ }
+ }
+ }
+
+out:
+ if (best_pp) {
+ cp = vdev_geom_attach(best_pp, vd, B_TRUE);
+ if (cp == NULL) {
+ printf("ZFS WARNING: Unable to attach to %s.\n",
+ best_pp->name);
+ }
+ }
+ return (cp);
+}
+
+static struct g_consumer *
+vdev_geom_open_by_guids(vdev_t *vd)
+{
+ struct g_consumer *cp;
+ char *buf;
+ size_t len;
+
+ g_topology_assert();
+
+ ZFS_LOG(1, "Searching by guids [%ju:%ju].",
+ (uintmax_t)spa_guid(vd->vdev_spa), (uintmax_t)vd->vdev_guid);
+ cp = vdev_geom_attach_by_guids(vd);
+ if (cp != NULL) {
+ len = strlen(cp->provider->name) + strlen("/dev/") + 1;
+ buf = kmem_alloc(len, KM_SLEEP);
+
+ snprintf(buf, len, "/dev/%s", cp->provider->name);
+ spa_strfree(vd->vdev_path);
+ vd->vdev_path = buf;
+
+ ZFS_LOG(1, "Attach by guid [%ju:%ju] succeeded, provider %s.",
+ (uintmax_t)spa_guid(vd->vdev_spa),
+ (uintmax_t)vd->vdev_guid, cp->provider->name);
+ } else {
+ ZFS_LOG(1, "Search by guid [%ju:%ju] failed.",
+ (uintmax_t)spa_guid(vd->vdev_spa),
+ (uintmax_t)vd->vdev_guid);
+ }
+
+ return (cp);
+}
+
+static struct g_consumer *
+vdev_geom_open_by_path(vdev_t *vd, int check_guid)
+{
+ struct g_provider *pp;
+ struct g_consumer *cp;
+
+ g_topology_assert();
+
+ cp = NULL;
+ pp = g_provider_by_name(vd->vdev_path + sizeof ("/dev/") - 1);
+ if (pp != NULL) {
+ ZFS_LOG(1, "Found provider by name %s.", vd->vdev_path);
+ if (!check_guid || vdev_attach_ok(vd, pp) == FULL_MATCH)
+ cp = vdev_geom_attach(pp, vd, B_FALSE);
+ }
+
+ return (cp);
+}
+
+static int
+vdev_geom_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
+ uint64_t *logical_ashift, uint64_t *physical_ashift)
+{
+ struct g_provider *pp;
+ struct g_consumer *cp;
+ int error, has_trim;
+ uint16_t rate;
+
+ /*
+ * Set the TLS to indicate downstack that we
+ * should not access zvols
+ */
+ VERIFY(tsd_set(zfs_geom_probe_vdev_key, vd) == 0);
+
+ /*
+ * We must have a pathname, and it must be absolute.
+ */
+ if (vd->vdev_path == NULL || strncmp(vd->vdev_path, "/dev/", 5) != 0) {
+ vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
+ return (EINVAL);
+ }
+
+ /*
+ * Reopen the device if it's not currently open. Otherwise,
+ * just update the physical size of the device.
+ */
+ if ((cp = vd->vdev_tsd) != NULL) {
+ ASSERT(vd->vdev_reopening);
+ goto skip_open;
+ }
+
+ DROP_GIANT();
+ g_topology_lock();
+ error = 0;
+
+ if (vd->vdev_spa->spa_is_splitting ||
+ ((vd->vdev_prevstate == VDEV_STATE_UNKNOWN &&
+ (vd->vdev_spa->spa_load_state == SPA_LOAD_NONE ||
+ vd->vdev_spa->spa_load_state == SPA_LOAD_CREATE)))) {
+ /*
+ * We are dealing with a vdev that hasn't been previously
+ * opened (since boot), and we are not loading an
+ * existing pool configuration. This looks like a
+ * vdev add operation to a new or existing pool.
+ * Assume the user really wants to do this, and find
+ * GEOM provider by its name, ignoring GUID mismatches.
+ *
+ * XXPOLICY: It would be safer to only allow a device
+ * that is unlabeled or labeled but missing
+ * GUID information to be opened in this fashion,
+ * unless we are doing a split, in which case we
+ * should allow any guid.
+ */
+ cp = vdev_geom_open_by_path(vd, 0);
+ } else {
+ /*
+ * Try using the recorded path for this device, but only
+ * accept it if its label data contains the expected GUIDs.
+ */
+ cp = vdev_geom_open_by_path(vd, 1);
+ if (cp == NULL) {
+ /*
+ * The device at vd->vdev_path doesn't have the
+ * expected GUIDs. The disks might have merely
+ * moved around so try all other GEOM providers
+ * to find one with the right GUIDs.
+ */
+ cp = vdev_geom_open_by_guids(vd);
+ }
+ }
+
+ /* Clear the TLS now that tasting is done */
+ VERIFY(tsd_set(zfs_geom_probe_vdev_key, NULL) == 0);
+
+ if (cp == NULL) {
+ ZFS_LOG(1, "Vdev %s not found.", vd->vdev_path);
+ error = ENOENT;
+ } else {
+ struct consumer_priv_t *priv;
+ struct consumer_vdev_elem *elem;
+ int spamode;
+
+ priv = (struct consumer_priv_t *)&cp->private;
+ if (cp->private == NULL)
+ SLIST_INIT(priv);
+ elem = g_malloc(sizeof (*elem), M_WAITOK|M_ZERO);
+ elem->vd = vd;
+ SLIST_INSERT_HEAD(priv, elem, elems);
+
+ spamode = spa_mode(vd->vdev_spa);
+ if (cp->provider->sectorsize > VDEV_PAD_SIZE ||
+ !ISP2(cp->provider->sectorsize)) {
+ ZFS_LOG(1, "Provider %s has unsupported sectorsize.",
+ cp->provider->name);
+
+ vdev_geom_close_locked(vd);
+ error = EINVAL;
+ cp = NULL;
+ } else if (cp->acw == 0 && (spamode & FWRITE) != 0) {
+ int i;
+
+ for (i = 0; i < 5; i++) {
+ error = g_access(cp, 0, 1, 0);
+ if (error == 0)
+ break;
+ g_topology_unlock();
+ tsleep(vd, 0, "vdev", hz / 2);
+ g_topology_lock();
+ }
+ if (error != 0) {
+ printf("ZFS WARNING: Unable to open %s for "
+ "writing (error=%d).\n",
+ cp->provider->name, error);
+ vdev_geom_close_locked(vd);
+ cp = NULL;
+ }
+ }
+ }
+
+ /* Fetch initial physical path information for this device. */
+ if (cp != NULL) {
+ vdev_geom_attrchanged(cp, "GEOM::physpath");
+
+ /* Set other GEOM characteristics */
+ vdev_geom_set_physpath(vd, cp, /* do_null_update */B_FALSE);
+ }
+
+ g_topology_unlock();
+ PICKUP_GIANT();
+ if (cp == NULL) {
+ vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
+ vdev_dbgmsg(vd, "vdev_geom_open: failed to open [error=%d]",
+ error);
+ return (error);
+ }
+skip_open:
+ pp = cp->provider;
+
+ /*
+ * Determine the actual size of the device.
+ */
+ *max_psize = *psize = pp->mediasize;
+
+ /*
+ * Determine the device's minimum transfer size and preferred
+ * transfer size.
+ */
+ *logical_ashift = highbit(MAX(pp->sectorsize, SPA_MINBLOCKSIZE)) - 1;
+ *physical_ashift = 0;
+ if (pp->stripesize && pp->stripesize > (1 << *logical_ashift) &&
+ ISP2(pp->stripesize) && pp->stripesize <= (1 << ASHIFT_MAX) &&
+ pp->stripeoffset == 0)
+ *physical_ashift = highbit(pp->stripesize) - 1;
+
+ /*
+ * Clear the nowritecache settings, so that on a vdev_reopen()
+ * we will try again.
+ */
+ vd->vdev_nowritecache = B_FALSE;
+
+ /* Inform the ZIO pipeline that we are non-rotational. */
+ error = g_getattr("GEOM::rotation_rate", cp, &rate);
+ if (error == 0 && rate == DISK_RR_NON_ROTATING)
+ vd->vdev_nonrot = B_TRUE;
+ else
+ vd->vdev_nonrot = B_FALSE;
+
+ /* Set when device reports it supports TRIM. */
+ error = g_getattr("GEOM::candelete", cp, &has_trim);
+ vd->vdev_has_trim = (error == 0 && has_trim);
+
+ /* Set when device reports it supports secure TRIM. */
+ /* unavailable on FreeBSD */
+ vd->vdev_has_securetrim = B_FALSE;
+
+ return (0);
+}
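As a worked illustration of the ashift computation above (assuming a provider that reports sectorsize 512, stripesize 4096, and stripeoffset 0): highbit(512) - 1 = 9, so *logical_ashift becomes 9; 4096 is a power of two, larger than 1 << 9 and no larger than 1 << ASHIFT_MAX, so highbit(4096) - 1 = 12 and *physical_ashift becomes 12.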
+
+static void
+vdev_geom_close(vdev_t *vd)
+{
+ struct g_consumer *cp;
+ boolean_t locked;
+
+ cp = vd->vdev_tsd;
+
+ DROP_GIANT();
+ locked = g_topology_locked();
+ if (!locked)
+ g_topology_lock();
+
+ if (!vd->vdev_reopening ||
+ (cp != NULL && ((cp->flags & G_CF_ORPHAN) != 0 ||
+ (cp->provider != NULL && cp->provider->error != 0))))
+ vdev_geom_close_locked(vd);
+
+ if (!locked)
+ g_topology_unlock();
+ PICKUP_GIANT();
+}
+
+static void
+vdev_geom_io_intr(struct bio *bp)
+{
+ vdev_t *vd;
+ zio_t *zio;
+
+ zio = bp->bio_caller1;
+ vd = zio->io_vd;
+ zio->io_error = bp->bio_error;
+ if (zio->io_error == 0 && bp->bio_resid != 0)
+ zio->io_error = SET_ERROR(EIO);
+
+ switch (zio->io_error) {
+ case ENOTSUP:
+ /*
+ * If we get ENOTSUP for BIO_FLUSH or BIO_DELETE we know
+ * that future attempts will never succeed. In this case
+ * we set a persistent flag so that we don't bother with
+ * requests in the future.
+ */
+ switch (bp->bio_cmd) {
+ case BIO_FLUSH:
+ vd->vdev_nowritecache = B_TRUE;
+ break;
+ case BIO_DELETE:
+ break;
+ }
+ break;
+ case ENXIO:
+ if (!vd->vdev_remove_wanted) {
+ /*
+ * If provider's error is set we assume it is being
+ * removed.
+ */
+ if (bp->bio_to->error != 0) {
+ vd->vdev_remove_wanted = B_TRUE;
+ spa_async_request(zio->io_spa,
+ SPA_ASYNC_REMOVE);
+ } else if (!vd->vdev_delayed_close) {
+ vd->vdev_delayed_close = B_TRUE;
+ }
+ }
+ break;
+ }
+
+ /*
+ * We have to split bio freeing into two parts, because the ABD code
+ * cannot be called in this context and vdev_op_io_done is not called
+ * for ZIO_TYPE_IOCTL zio-s.
+ */
+ if (zio->io_type != ZIO_TYPE_READ && zio->io_type != ZIO_TYPE_WRITE) {
+ g_destroy_bio(bp);
+ zio->io_bio = NULL;
+ }
+ zio_delay_interrupt(zio);
+}
+
+static void
+vdev_geom_io_start(zio_t *zio)
+{
+ vdev_t *vd;
+ struct g_consumer *cp;
+ struct bio *bp;
+
+ vd = zio->io_vd;
+
+ switch (zio->io_type) {
+ case ZIO_TYPE_IOCTL:
+ /* XXPOLICY */
+ if (!vdev_readable(vd)) {
+ zio->io_error = SET_ERROR(ENXIO);
+ zio_interrupt(zio);
+ return;
+ } else {
+ switch (zio->io_cmd) {
+ case DKIOCFLUSHWRITECACHE:
+ if (zfs_nocacheflush ||
+ vdev_geom_bio_flush_disable)
+ break;
+ if (vd->vdev_nowritecache) {
+ zio->io_error = SET_ERROR(ENOTSUP);
+ break;
+ }
+ goto sendreq;
+ default:
+ zio->io_error = SET_ERROR(ENOTSUP);
+ }
+ }
+
+ zio_execute(zio);
+ return;
+ case ZIO_TYPE_TRIM:
+ if (!vdev_geom_bio_delete_disable) {
+ goto sendreq;
+ }
+ zio_execute(zio);
+ return;
+ default:
+ ;
+ /* PASSTHROUGH --- placate compiler */
+ }
+sendreq:
+ ASSERT(zio->io_type == ZIO_TYPE_READ ||
+ zio->io_type == ZIO_TYPE_WRITE ||
+ zio->io_type == ZIO_TYPE_TRIM ||
+ zio->io_type == ZIO_TYPE_IOCTL);
+
+ cp = vd->vdev_tsd;
+ if (cp == NULL) {
+ zio->io_error = SET_ERROR(ENXIO);
+ zio_interrupt(zio);
+ return;
+ }
+ bp = g_alloc_bio();
+ bp->bio_caller1 = zio;
+ switch (zio->io_type) {
+ case ZIO_TYPE_READ:
+ case ZIO_TYPE_WRITE:
+ zio->io_target_timestamp = zio_handle_io_delay(zio);
+ bp->bio_offset = zio->io_offset;
+ bp->bio_length = zio->io_size;
+ if (zio->io_type == ZIO_TYPE_READ) {
+ bp->bio_cmd = BIO_READ;
+ bp->bio_data =
+ abd_borrow_buf(zio->io_abd, zio->io_size);
+ } else {
+ bp->bio_cmd = BIO_WRITE;
+ bp->bio_data =
+ abd_borrow_buf_copy(zio->io_abd, zio->io_size);
+ }
+ break;
+ case ZIO_TYPE_TRIM:
+ bp->bio_cmd = BIO_DELETE;
+ bp->bio_data = NULL;
+ bp->bio_offset = zio->io_offset;
+ bp->bio_length = zio->io_size;
+ break;
+ case ZIO_TYPE_IOCTL:
+ bp->bio_cmd = BIO_FLUSH;
+ bp->bio_data = NULL;
+ bp->bio_offset = cp->provider->mediasize;
+ bp->bio_length = 0;
+ break;
+ default:
+ panic("invalid zio->io_type: %d\n", zio->io_type);
+ }
+ bp->bio_done = vdev_geom_io_intr;
+ zio->io_bio = bp;
+
+ g_io_request(bp, cp);
+}
+
+static void
+vdev_geom_io_done(zio_t *zio)
+{
+ struct bio *bp = zio->io_bio;
+
+ if (zio->io_type != ZIO_TYPE_READ && zio->io_type != ZIO_TYPE_WRITE) {
+ ASSERT(bp == NULL);
+ return;
+ }
+
+ if (bp == NULL) {
+ ASSERT3S(zio->io_error, ==, ENXIO);
+ return;
+ }
+
+ if (zio->io_type == ZIO_TYPE_READ)
+ abd_return_buf_copy(zio->io_abd, bp->bio_data, zio->io_size);
+ else
+ abd_return_buf(zio->io_abd, bp->bio_data, zio->io_size);
+
+ g_destroy_bio(bp);
+ zio->io_bio = NULL;
+}
+
+static void
+vdev_geom_hold(vdev_t *vd)
+{
+}
+
+static void
+vdev_geom_rele(vdev_t *vd)
+{
+}
+
+vdev_ops_t vdev_disk_ops = {
+ .vdev_op_init = NULL,
+ .vdev_op_fini = NULL,
+ .vdev_op_open = vdev_geom_open,
+ .vdev_op_close = vdev_geom_close,
+ .vdev_op_asize = vdev_default_asize,
+ .vdev_op_min_asize = vdev_default_min_asize,
+ .vdev_op_min_alloc = NULL,
+ .vdev_op_io_start = vdev_geom_io_start,
+ .vdev_op_io_done = vdev_geom_io_done,
+ .vdev_op_state_change = NULL,
+ .vdev_op_need_resilver = NULL,
+ .vdev_op_hold = vdev_geom_hold,
+ .vdev_op_rele = vdev_geom_rele,
+ .vdev_op_remap = NULL,
+ .vdev_op_xlate = vdev_default_xlate,
+ .vdev_op_rebuild_asize = NULL,
+ .vdev_op_metaslab_init = NULL,
+ .vdev_op_config_generate = NULL,
+ .vdev_op_nparity = NULL,
+ .vdev_op_ndisks = NULL,
+ .vdev_op_type = VDEV_TYPE_DISK, /* name of this vdev type */
+ .vdev_op_leaf = B_TRUE /* leaf vdev */
+};
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/vdev_label_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/vdev_label_os.c
new file mode 100644
index 000000000000..97cb201934dc
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/vdev_label_os.c
@@ -0,0 +1,74 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/dmu.h>
+#include <sys/zap.h>
+#include <sys/vdev.h>
+#include <sys/vdev_os.h>
+#include <sys/vdev_impl.h>
+#include <sys/uberblock_impl.h>
+#include <sys/metaslab.h>
+#include <sys/metaslab_impl.h>
+#include <sys/zio.h>
+#include <sys/dsl_scan.h>
+#include <sys/abd.h>
+#include <sys/fs/zfs.h>
+
+int
+vdev_label_write_pad2(vdev_t *vd, const char *buf, size_t size)
+{
+ spa_t *spa = vd->vdev_spa;
+ zio_t *zio;
+ abd_t *pad2;
+ int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL;
+ int error;
+
+ if (size > VDEV_PAD_SIZE)
+ return (EINVAL);
+
+ if (!vd->vdev_ops->vdev_op_leaf)
+ return (ENODEV);
+ if (vdev_is_dead(vd))
+ return (ENXIO);
+
+ ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
+
+ pad2 = abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE);
+ abd_zero(pad2, VDEV_PAD_SIZE);
+ abd_copy_from_buf(pad2, buf, size);
+
+retry:
+ zio = zio_root(spa, NULL, NULL, flags);
+ vdev_label_write(zio, vd, 0, pad2,
+ offsetof(vdev_label_t, vl_be),
+ VDEV_PAD_SIZE, NULL, NULL, flags);
+ error = zio_wait(zio);
+ if (error != 0 && !(flags & ZIO_FLAG_TRYHARD)) {
+ flags |= ZIO_FLAG_TRYHARD;
+ goto retry;
+ }
+
+ abd_free(pad2);
+ return (error);
+}
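
vdev_label_write_pad2() above caps the payload at VDEV_PAD_SIZE, zero-fills a pad-sized ABD, and writes the vl_be area of label 0 once with ZIO_FLAG_CANFAIL; only if that attempt fails does it retry exactly once more with ZIO_FLAG_TRYHARD added. A sketch of that try-soft-then-retry-hard idiom in isolation, where issue_write() and the flag values are hypothetical stand-ins for the ZIO machinery:

/* Sketch of the retry-once-with-a-stronger-flag idiom used above.
 * issue_write() and the flag values are hypothetical stand-ins. */
#include <stdio.h>

#define DEMO_FLAG_CANFAIL	0x1
#define DEMO_FLAG_TRYHARD	0x2

static int
demo_write_with_retry(int (*issue_write)(int flags))
{
	int flags = DEMO_FLAG_CANFAIL;
	int error;

retry:
	error = issue_write(flags);
	if (error != 0 && !(flags & DEMO_FLAG_TRYHARD)) {
		flags |= DEMO_FLAG_TRYHARD;
		goto retry;
	}
	return (error);
}

static int
demo_flaky_write(int flags)
{
	/* Pretend the device only succeeds when pushed hard. */
	return ((flags & DEMO_FLAG_TRYHARD) ? 0 : 5 /* EIO-ish */);
}

int
main(void)
{
	printf("final error: %d\n", demo_write_with_retry(demo_flaky_write));
	return (0);
}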
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_acl.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_acl.c
new file mode 100644
index 000000000000..23b87de8bd0d
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_acl.c
@@ -0,0 +1,2700 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright 2017 Nexenta Systems, Inc. All rights reserved.
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/time.h>
+#include <sys/systm.h>
+#include <sys/sysmacros.h>
+#include <sys/resource.h>
+#include <sys/vfs.h>
+#include <sys/vnode.h>
+#include <sys/file.h>
+#include <sys/stat.h>
+#include <sys/kmem.h>
+#include <sys/cmn_err.h>
+#include <sys/errno.h>
+#include <sys/unistd.h>
+#include <sys/sdt.h>
+#include <sys/fs/zfs.h>
+#include <sys/policy.h>
+#include <sys/zfs_znode.h>
+#include <sys/zfs_fuid.h>
+#include <sys/zfs_acl.h>
+#include <sys/zfs_dir.h>
+#include <sys/zfs_quota.h>
+#include <sys/zfs_vfsops.h>
+#include <sys/dmu.h>
+#include <sys/dnode.h>
+#include <sys/zap.h>
+#include <sys/sa.h>
+#include <acl/acl_common.h>
+
+
+#define ALLOW ACE_ACCESS_ALLOWED_ACE_TYPE
+#define DENY ACE_ACCESS_DENIED_ACE_TYPE
+#define MAX_ACE_TYPE ACE_SYSTEM_ALARM_CALLBACK_OBJECT_ACE_TYPE
+#define MIN_ACE_TYPE ALLOW
+
+#define OWNING_GROUP (ACE_GROUP|ACE_IDENTIFIER_GROUP)
+#define EVERYONE_ALLOW_MASK (ACE_READ_ACL|ACE_READ_ATTRIBUTES | \
+ ACE_READ_NAMED_ATTRS|ACE_SYNCHRONIZE)
+#define EVERYONE_DENY_MASK (ACE_WRITE_ACL|ACE_WRITE_OWNER | \
+ ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS)
+#define OWNER_ALLOW_MASK (ACE_WRITE_ACL | ACE_WRITE_OWNER | \
+ ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS)
+
+#define ZFS_CHECKED_MASKS (ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_READ_DATA| \
+ ACE_READ_NAMED_ATTRS|ACE_WRITE_DATA|ACE_WRITE_ATTRIBUTES| \
+ ACE_WRITE_NAMED_ATTRS|ACE_APPEND_DATA|ACE_EXECUTE|ACE_WRITE_OWNER| \
+ ACE_WRITE_ACL|ACE_DELETE|ACE_DELETE_CHILD|ACE_SYNCHRONIZE)
+
+#define WRITE_MASK_DATA (ACE_WRITE_DATA|ACE_APPEND_DATA|ACE_WRITE_NAMED_ATTRS)
+#define WRITE_MASK_ATTRS (ACE_WRITE_ACL|ACE_WRITE_OWNER|ACE_WRITE_ATTRIBUTES| \
+ ACE_DELETE|ACE_DELETE_CHILD)
+#define WRITE_MASK (WRITE_MASK_DATA|WRITE_MASK_ATTRS)
+
+#define OGE_CLEAR (ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \
+ ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_EXECUTE)
+
+#define OKAY_MASK_BITS (ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \
+ ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_EXECUTE)
+
+#define ALL_INHERIT (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE | \
+ ACE_NO_PROPAGATE_INHERIT_ACE|ACE_INHERIT_ONLY_ACE|ACE_INHERITED_ACE)
+
+#define RESTRICTED_CLEAR (ACE_WRITE_ACL|ACE_WRITE_OWNER)
+
+#define V4_ACL_WIDE_FLAGS (ZFS_ACL_AUTO_INHERIT|ZFS_ACL_DEFAULTED|\
+ ZFS_ACL_PROTECTED)
+
+#define ZFS_ACL_WIDE_FLAGS (V4_ACL_WIDE_FLAGS|ZFS_ACL_TRIVIAL|ZFS_INHERIT_ACE|\
+ ZFS_ACL_OBJ_ACE)
+
+#define ALL_MODE_EXECS (S_IXUSR | S_IXGRP | S_IXOTH)
+
+static uint16_t
+zfs_ace_v0_get_type(void *acep)
+{
+ return (((zfs_oldace_t *)acep)->z_type);
+}
+
+static uint16_t
+zfs_ace_v0_get_flags(void *acep)
+{
+ return (((zfs_oldace_t *)acep)->z_flags);
+}
+
+static uint32_t
+zfs_ace_v0_get_mask(void *acep)
+{
+ return (((zfs_oldace_t *)acep)->z_access_mask);
+}
+
+static uint64_t
+zfs_ace_v0_get_who(void *acep)
+{
+ return (((zfs_oldace_t *)acep)->z_fuid);
+}
+
+static void
+zfs_ace_v0_set_type(void *acep, uint16_t type)
+{
+ ((zfs_oldace_t *)acep)->z_type = type;
+}
+
+static void
+zfs_ace_v0_set_flags(void *acep, uint16_t flags)
+{
+ ((zfs_oldace_t *)acep)->z_flags = flags;
+}
+
+static void
+zfs_ace_v0_set_mask(void *acep, uint32_t mask)
+{
+ ((zfs_oldace_t *)acep)->z_access_mask = mask;
+}
+
+static void
+zfs_ace_v0_set_who(void *acep, uint64_t who)
+{
+ ((zfs_oldace_t *)acep)->z_fuid = who;
+}
+
+/*ARGSUSED*/
+static size_t
+zfs_ace_v0_size(void *acep)
+{
+ return (sizeof (zfs_oldace_t));
+}
+
+static size_t
+zfs_ace_v0_abstract_size(void)
+{
+ return (sizeof (zfs_oldace_t));
+}
+
+static int
+zfs_ace_v0_mask_off(void)
+{
+ return (offsetof(zfs_oldace_t, z_access_mask));
+}
+
+/*ARGSUSED*/
+static int
+zfs_ace_v0_data(void *acep, void **datap)
+{
+ *datap = NULL;
+ return (0);
+}
+
+static acl_ops_t zfs_acl_v0_ops = {
+ zfs_ace_v0_get_mask,
+ zfs_ace_v0_set_mask,
+ zfs_ace_v0_get_flags,
+ zfs_ace_v0_set_flags,
+ zfs_ace_v0_get_type,
+ zfs_ace_v0_set_type,
+ zfs_ace_v0_get_who,
+ zfs_ace_v0_set_who,
+ zfs_ace_v0_size,
+ zfs_ace_v0_abstract_size,
+ zfs_ace_v0_mask_off,
+ zfs_ace_v0_data
+};
+
+static uint16_t
+zfs_ace_fuid_get_type(void *acep)
+{
+ return (((zfs_ace_hdr_t *)acep)->z_type);
+}
+
+static uint16_t
+zfs_ace_fuid_get_flags(void *acep)
+{
+ return (((zfs_ace_hdr_t *)acep)->z_flags);
+}
+
+static uint32_t
+zfs_ace_fuid_get_mask(void *acep)
+{
+ return (((zfs_ace_hdr_t *)acep)->z_access_mask);
+}
+
+static uint64_t
+zfs_ace_fuid_get_who(void *args)
+{
+ uint16_t entry_type;
+ zfs_ace_t *acep = args;
+
+ entry_type = acep->z_hdr.z_flags & ACE_TYPE_FLAGS;
+
+ if (entry_type == ACE_OWNER || entry_type == OWNING_GROUP ||
+ entry_type == ACE_EVERYONE)
+ return (-1);
+ return (((zfs_ace_t *)acep)->z_fuid);
+}
+
+static void
+zfs_ace_fuid_set_type(void *acep, uint16_t type)
+{
+ ((zfs_ace_hdr_t *)acep)->z_type = type;
+}
+
+static void
+zfs_ace_fuid_set_flags(void *acep, uint16_t flags)
+{
+ ((zfs_ace_hdr_t *)acep)->z_flags = flags;
+}
+
+static void
+zfs_ace_fuid_set_mask(void *acep, uint32_t mask)
+{
+ ((zfs_ace_hdr_t *)acep)->z_access_mask = mask;
+}
+
+static void
+zfs_ace_fuid_set_who(void *arg, uint64_t who)
+{
+ zfs_ace_t *acep = arg;
+
+ uint16_t entry_type = acep->z_hdr.z_flags & ACE_TYPE_FLAGS;
+
+ if (entry_type == ACE_OWNER || entry_type == OWNING_GROUP ||
+ entry_type == ACE_EVERYONE)
+ return;
+ acep->z_fuid = who;
+}
+
+static size_t
+zfs_ace_fuid_size(void *acep)
+{
+ zfs_ace_hdr_t *zacep = acep;
+ uint16_t entry_type;
+
+ switch (zacep->z_type) {
+ case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE:
+ case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE:
+ case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE:
+ case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE:
+ return (sizeof (zfs_object_ace_t));
+ case ALLOW:
+ case DENY:
+ entry_type =
+ (((zfs_ace_hdr_t *)acep)->z_flags & ACE_TYPE_FLAGS);
+ if (entry_type == ACE_OWNER ||
+ entry_type == OWNING_GROUP ||
+ entry_type == ACE_EVERYONE)
+ return (sizeof (zfs_ace_hdr_t));
+ /*FALLTHROUGH*/
+ default:
+ return (sizeof (zfs_ace_t));
+ }
+}
+
+static size_t
+zfs_ace_fuid_abstract_size(void)
+{
+ return (sizeof (zfs_ace_hdr_t));
+}
+
+static int
+zfs_ace_fuid_mask_off(void)
+{
+ return (offsetof(zfs_ace_hdr_t, z_access_mask));
+}
+
+static int
+zfs_ace_fuid_data(void *acep, void **datap)
+{
+ zfs_ace_t *zacep = acep;
+ zfs_object_ace_t *zobjp;
+
+ switch (zacep->z_hdr.z_type) {
+ case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE:
+ case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE:
+ case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE:
+ case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE:
+ zobjp = acep;
+ *datap = (caddr_t)zobjp + sizeof (zfs_ace_t);
+ return (sizeof (zfs_object_ace_t) - sizeof (zfs_ace_t));
+ default:
+ *datap = NULL;
+ return (0);
+ }
+}
+
+static acl_ops_t zfs_acl_fuid_ops = {
+ zfs_ace_fuid_get_mask,
+ zfs_ace_fuid_set_mask,
+ zfs_ace_fuid_get_flags,
+ zfs_ace_fuid_set_flags,
+ zfs_ace_fuid_get_type,
+ zfs_ace_fuid_set_type,
+ zfs_ace_fuid_get_who,
+ zfs_ace_fuid_set_who,
+ zfs_ace_fuid_size,
+ zfs_ace_fuid_abstract_size,
+ zfs_ace_fuid_mask_off,
+ zfs_ace_fuid_data
+};
+
+/*
+ * The following three functions are provided for compatibility with
+ * older ZPL versions, in order to determine whether the file used to
+ * have an external ACL and which version of ACL previously existed on
+ * the file. Would really be nice to not need this, sigh.
+ */
+uint64_t
+zfs_external_acl(znode_t *zp)
+{
+ zfs_acl_phys_t acl_phys;
+ int error;
+
+ if (zp->z_is_sa)
+ return (0);
+
+ /*
+ * Need to deal with a potential
+ * race where zfs_sa_upgrade could cause
+	 * z_is_sa to change.
+ *
+ * If the lookup fails then the state of z_is_sa should have
+ * changed.
+ */
+
+ if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(zp->z_zfsvfs),
+ &acl_phys, sizeof (acl_phys))) == 0)
+ return (acl_phys.z_acl_extern_obj);
+ else {
+ /*
+ * after upgrade the SA_ZPL_ZNODE_ACL should have been
+ * removed
+ */
+ VERIFY(zp->z_is_sa && error == ENOENT);
+ return (0);
+ }
+}
+
+/*
+ * Determine size of ACL in bytes
+ *
+ * This is more complicated than it should be since we have to deal
+ * with old external ACLs.
+ */
+static int
+zfs_acl_znode_info(znode_t *zp, int *aclsize, int *aclcount,
+ zfs_acl_phys_t *aclphys)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ uint64_t acl_count;
+ int size;
+ int error;
+
+ ASSERT(MUTEX_HELD(&zp->z_acl_lock));
+ if (zp->z_is_sa) {
+ if ((error = sa_size(zp->z_sa_hdl, SA_ZPL_DACL_ACES(zfsvfs),
+ &size)) != 0)
+ return (error);
+ *aclsize = size;
+ if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_DACL_COUNT(zfsvfs),
+ &acl_count, sizeof (acl_count))) != 0)
+ return (error);
+ *aclcount = acl_count;
+ } else {
+ if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(zfsvfs),
+ aclphys, sizeof (*aclphys))) != 0)
+ return (error);
+
+ if (aclphys->z_acl_version == ZFS_ACL_VERSION_INITIAL) {
+ *aclsize = ZFS_ACL_SIZE(aclphys->z_acl_size);
+ *aclcount = aclphys->z_acl_size;
+ } else {
+ *aclsize = aclphys->z_acl_size;
+ *aclcount = aclphys->z_acl_count;
+ }
+ }
+ return (0);
+}
+
+int
+zfs_znode_acl_version(znode_t *zp)
+{
+ zfs_acl_phys_t acl_phys;
+
+ if (zp->z_is_sa)
+ return (ZFS_ACL_VERSION_FUID);
+ else {
+ int error;
+
+ /*
+ * Need to deal with a potential
+ * race where zfs_sa_upgrade could cause
+		 * z_is_sa to change.
+ *
+ * If the lookup fails then the state of z_is_sa should have
+ * changed.
+ */
+ if ((error = sa_lookup(zp->z_sa_hdl,
+ SA_ZPL_ZNODE_ACL(zp->z_zfsvfs),
+ &acl_phys, sizeof (acl_phys))) == 0)
+ return (acl_phys.z_acl_version);
+ else {
+ /*
+ * After upgrade SA_ZPL_ZNODE_ACL should have
+ * been removed.
+ */
+ VERIFY(zp->z_is_sa && error == ENOENT);
+ return (ZFS_ACL_VERSION_FUID);
+ }
+ }
+}
+
+static int
+zfs_acl_version(int version)
+{
+ if (version < ZPL_VERSION_FUID)
+ return (ZFS_ACL_VERSION_INITIAL);
+ else
+ return (ZFS_ACL_VERSION_FUID);
+}
+
+static int
+zfs_acl_version_zp(znode_t *zp)
+{
+ return (zfs_acl_version(zp->z_zfsvfs->z_version));
+}
+
+zfs_acl_t *
+zfs_acl_alloc(int vers)
+{
+ zfs_acl_t *aclp;
+
+ aclp = kmem_zalloc(sizeof (zfs_acl_t), KM_SLEEP);
+ list_create(&aclp->z_acl, sizeof (zfs_acl_node_t),
+ offsetof(zfs_acl_node_t, z_next));
+ aclp->z_version = vers;
+ if (vers == ZFS_ACL_VERSION_FUID)
+ aclp->z_ops = &zfs_acl_fuid_ops;
+ else
+ aclp->z_ops = &zfs_acl_v0_ops;
+ return (aclp);
+}
+
+zfs_acl_node_t *
+zfs_acl_node_alloc(size_t bytes)
+{
+ zfs_acl_node_t *aclnode;
+
+ aclnode = kmem_zalloc(sizeof (zfs_acl_node_t), KM_SLEEP);
+ if (bytes) {
+ aclnode->z_acldata = kmem_alloc(bytes, KM_SLEEP);
+ aclnode->z_allocdata = aclnode->z_acldata;
+ aclnode->z_allocsize = bytes;
+ aclnode->z_size = bytes;
+ }
+
+ return (aclnode);
+}
+
+static void
+zfs_acl_node_free(zfs_acl_node_t *aclnode)
+{
+ if (aclnode->z_allocsize)
+ kmem_free(aclnode->z_allocdata, aclnode->z_allocsize);
+ kmem_free(aclnode, sizeof (zfs_acl_node_t));
+}
+
+static void
+zfs_acl_release_nodes(zfs_acl_t *aclp)
+{
+ zfs_acl_node_t *aclnode;
+
+ while ((aclnode = list_head(&aclp->z_acl))) {
+ list_remove(&aclp->z_acl, aclnode);
+ zfs_acl_node_free(aclnode);
+ }
+ aclp->z_acl_count = 0;
+ aclp->z_acl_bytes = 0;
+}
+
+void
+zfs_acl_free(zfs_acl_t *aclp)
+{
+ zfs_acl_release_nodes(aclp);
+ list_destroy(&aclp->z_acl);
+ kmem_free(aclp, sizeof (zfs_acl_t));
+}
+
+static boolean_t
+zfs_acl_valid_ace_type(uint_t type, uint_t flags)
+{
+ uint16_t entry_type;
+
+ switch (type) {
+ case ALLOW:
+ case DENY:
+ case ACE_SYSTEM_AUDIT_ACE_TYPE:
+ case ACE_SYSTEM_ALARM_ACE_TYPE:
+ entry_type = flags & ACE_TYPE_FLAGS;
+ return (entry_type == ACE_OWNER ||
+ entry_type == OWNING_GROUP ||
+ entry_type == ACE_EVERYONE || entry_type == 0 ||
+ entry_type == ACE_IDENTIFIER_GROUP);
+ default:
+ if (type >= MIN_ACE_TYPE && type <= MAX_ACE_TYPE)
+ return (B_TRUE);
+ }
+ return (B_FALSE);
+}
+
+static boolean_t
+zfs_ace_valid(vtype_t obj_type, zfs_acl_t *aclp, uint16_t type, uint16_t iflags)
+{
+ /*
+ * first check type of entry
+ */
+
+ if (!zfs_acl_valid_ace_type(type, iflags))
+ return (B_FALSE);
+
+ switch (type) {
+ case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE:
+ case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE:
+ case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE:
+ case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE:
+ if (aclp->z_version < ZFS_ACL_VERSION_FUID)
+ return (B_FALSE);
+ aclp->z_hints |= ZFS_ACL_OBJ_ACE;
+ }
+
+ /*
+ * next check inheritance level flags
+ */
+
+ if (obj_type == VDIR &&
+ (iflags & (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE)))
+ aclp->z_hints |= ZFS_INHERIT_ACE;
+
+ if (iflags & (ACE_INHERIT_ONLY_ACE|ACE_NO_PROPAGATE_INHERIT_ACE)) {
+ if ((iflags & (ACE_FILE_INHERIT_ACE|
+ ACE_DIRECTORY_INHERIT_ACE)) == 0) {
+ return (B_FALSE);
+ }
+ }
+
+ return (B_TRUE);
+}
+
+static void *
+zfs_acl_next_ace(zfs_acl_t *aclp, void *start, uint64_t *who,
+ uint32_t *access_mask, uint16_t *iflags, uint16_t *type)
+{
+ zfs_acl_node_t *aclnode;
+
+ ASSERT(aclp);
+
+ if (start == NULL) {
+ aclnode = list_head(&aclp->z_acl);
+ if (aclnode == NULL)
+ return (NULL);
+
+ aclp->z_next_ace = aclnode->z_acldata;
+ aclp->z_curr_node = aclnode;
+ aclnode->z_ace_idx = 0;
+ }
+
+ aclnode = aclp->z_curr_node;
+
+ if (aclnode == NULL)
+ return (NULL);
+
+ if (aclnode->z_ace_idx >= aclnode->z_ace_count) {
+ aclnode = list_next(&aclp->z_acl, aclnode);
+ if (aclnode == NULL)
+ return (NULL);
+ else {
+ aclp->z_curr_node = aclnode;
+ aclnode->z_ace_idx = 0;
+ aclp->z_next_ace = aclnode->z_acldata;
+ }
+ }
+
+ if (aclnode->z_ace_idx < aclnode->z_ace_count) {
+ void *acep = aclp->z_next_ace;
+ size_t ace_size;
+
+ /*
+ * Make sure we don't overstep our bounds
+ */
+ ace_size = aclp->z_ops->ace_size(acep);
+
+ if (((caddr_t)acep + ace_size) >
+ ((caddr_t)aclnode->z_acldata + aclnode->z_size)) {
+ return (NULL);
+ }
+
+ *iflags = aclp->z_ops->ace_flags_get(acep);
+ *type = aclp->z_ops->ace_type_get(acep);
+ *access_mask = aclp->z_ops->ace_mask_get(acep);
+ *who = aclp->z_ops->ace_who_get(acep);
+ aclp->z_next_ace = (caddr_t)aclp->z_next_ace + ace_size;
+ aclnode->z_ace_idx++;
+
+ return ((void *)acep);
+ }
+ return (NULL);
+}
+
+/*ARGSUSED*/
+static uint64_t
+zfs_ace_walk(void *datap, uint64_t cookie, int aclcnt,
+ uint16_t *flags, uint16_t *type, uint32_t *mask)
+{
+ zfs_acl_t *aclp = datap;
+ zfs_ace_hdr_t *acep = (zfs_ace_hdr_t *)(uintptr_t)cookie;
+ uint64_t who;
+
+ acep = zfs_acl_next_ace(aclp, acep, &who, mask,
+ flags, type);
+ return ((uint64_t)(uintptr_t)acep);
+}
+
+/*
+ * Copy ACE to internal ZFS format.
+ * While processing the ACL each ACE will be validated for correctness.
+ * ACE FUIDs will be created later.
+ */
+static int
+zfs_copy_ace_2_fuid(zfsvfs_t *zfsvfs, vtype_t obj_type, zfs_acl_t *aclp,
+ void *datap, zfs_ace_t *z_acl, uint64_t aclcnt, size_t *size,
+ zfs_fuid_info_t **fuidp, cred_t *cr)
+{
+ int i;
+ uint16_t entry_type;
+ zfs_ace_t *aceptr = z_acl;
+ ace_t *acep = datap;
+ zfs_object_ace_t *zobjacep;
+ ace_object_t *aceobjp;
+
+ for (i = 0; i != aclcnt; i++) {
+ aceptr->z_hdr.z_access_mask = acep->a_access_mask;
+ aceptr->z_hdr.z_flags = acep->a_flags;
+ aceptr->z_hdr.z_type = acep->a_type;
+ entry_type = aceptr->z_hdr.z_flags & ACE_TYPE_FLAGS;
+ if (entry_type != ACE_OWNER && entry_type != OWNING_GROUP &&
+ entry_type != ACE_EVERYONE) {
+ aceptr->z_fuid = zfs_fuid_create(zfsvfs, acep->a_who,
+ cr, (entry_type == 0) ?
+ ZFS_ACE_USER : ZFS_ACE_GROUP, fuidp);
+ }
+
+ /*
+ * Make sure ACE is valid
+ */
+ if (zfs_ace_valid(obj_type, aclp, aceptr->z_hdr.z_type,
+ aceptr->z_hdr.z_flags) != B_TRUE)
+ return (SET_ERROR(EINVAL));
+
+ switch (acep->a_type) {
+ case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE:
+ case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE:
+ case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE:
+ case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE:
+ zobjacep = (zfs_object_ace_t *)aceptr;
+ aceobjp = (ace_object_t *)acep;
+
+ bcopy(aceobjp->a_obj_type, zobjacep->z_object_type,
+ sizeof (aceobjp->a_obj_type));
+ bcopy(aceobjp->a_inherit_obj_type,
+ zobjacep->z_inherit_type,
+ sizeof (aceobjp->a_inherit_obj_type));
+ acep = (ace_t *)((caddr_t)acep + sizeof (ace_object_t));
+ break;
+ default:
+ acep = (ace_t *)((caddr_t)acep + sizeof (ace_t));
+ }
+
+ aceptr = (zfs_ace_t *)((caddr_t)aceptr +
+ aclp->z_ops->ace_size(aceptr));
+ }
+
+ *size = (caddr_t)aceptr - (caddr_t)z_acl;
+
+ return (0);
+}
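
The copy loop in zfs_copy_ace_2_fuid() advances both cursors by a per-entry size rather than a fixed stride: object ACE types carry extra object/inherit GUID data and occupy the larger ace_object_t and zfs_object_ace_t records, while everything else uses the smaller fixed layouts. A standalone sketch of that variable-size walk, with hypothetical record layouts in place of the real ACE structures:

/* Sketch of walking a packed array of variable-size entries, as the
 * ACE copy loop above does; the record layouts here are hypothetical. */
#include <stdio.h>
#include <string.h>
#include <stdint.h>

struct demo_hdr { uint16_t type; uint16_t size; };
struct demo_small { struct demo_hdr hdr; uint32_t mask; };
struct demo_large { struct demo_hdr hdr; uint32_t mask; uint8_t guid[16]; };

int
main(void)
{
	uint8_t buf[sizeof (struct demo_small) + sizeof (struct demo_large)];
	struct demo_small s = { { 0, sizeof (s) }, 0x1 };
	struct demo_large l = { { 5, sizeof (l) }, 0x2, { 0 } };
	size_t off = 0;

	memcpy(buf, &s, sizeof (s));
	memcpy(buf + sizeof (s), &l, sizeof (l));

	/* Advance by each record's own size, never by a fixed stride. */
	while (off < sizeof (buf)) {
		struct demo_hdr hdr;

		memcpy(&hdr, buf + off, sizeof (hdr));
		printf("type %u, %u bytes\n",
		    (unsigned)hdr.type, (unsigned)hdr.size);
		off += hdr.size;
	}
	return (0);
}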
+
+/*
+ * Copy ZFS ACEs to fixed size ace_t layout
+ */
+static void
+zfs_copy_fuid_2_ace(zfsvfs_t *zfsvfs, zfs_acl_t *aclp, cred_t *cr,
+ void *datap, int filter)
+{
+ uint64_t who;
+ uint32_t access_mask;
+ uint16_t iflags, type;
+ zfs_ace_hdr_t *zacep = NULL;
+ ace_t *acep = datap;
+ ace_object_t *objacep;
+ zfs_object_ace_t *zobjacep;
+ size_t ace_size;
+ uint16_t entry_type;
+
+ while ((zacep = zfs_acl_next_ace(aclp, zacep,
+ &who, &access_mask, &iflags, &type))) {
+
+ switch (type) {
+ case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE:
+ case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE:
+ case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE:
+ case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE:
+ if (filter) {
+ continue;
+ }
+ zobjacep = (zfs_object_ace_t *)zacep;
+ objacep = (ace_object_t *)acep;
+ bcopy(zobjacep->z_object_type,
+ objacep->a_obj_type,
+ sizeof (zobjacep->z_object_type));
+ bcopy(zobjacep->z_inherit_type,
+ objacep->a_inherit_obj_type,
+ sizeof (zobjacep->z_inherit_type));
+ ace_size = sizeof (ace_object_t);
+ break;
+ default:
+ ace_size = sizeof (ace_t);
+ break;
+ }
+
+ entry_type = (iflags & ACE_TYPE_FLAGS);
+ if ((entry_type != ACE_OWNER &&
+ entry_type != OWNING_GROUP &&
+ entry_type != ACE_EVERYONE)) {
+ acep->a_who = zfs_fuid_map_id(zfsvfs, who,
+ cr, (entry_type & ACE_IDENTIFIER_GROUP) ?
+ ZFS_ACE_GROUP : ZFS_ACE_USER);
+ } else {
+ acep->a_who = (uid_t)(int64_t)who;
+ }
+ acep->a_access_mask = access_mask;
+ acep->a_flags = iflags;
+ acep->a_type = type;
+ acep = (ace_t *)((caddr_t)acep + ace_size);
+ }
+}
+
+static int
+zfs_copy_ace_2_oldace(vtype_t obj_type, zfs_acl_t *aclp, ace_t *acep,
+ zfs_oldace_t *z_acl, int aclcnt, size_t *size)
+{
+ int i;
+ zfs_oldace_t *aceptr = z_acl;
+
+ for (i = 0; i != aclcnt; i++, aceptr++) {
+ aceptr->z_access_mask = acep[i].a_access_mask;
+ aceptr->z_type = acep[i].a_type;
+ aceptr->z_flags = acep[i].a_flags;
+ aceptr->z_fuid = acep[i].a_who;
+ /*
+ * Make sure ACE is valid
+ */
+ if (zfs_ace_valid(obj_type, aclp, aceptr->z_type,
+ aceptr->z_flags) != B_TRUE)
+ return (SET_ERROR(EINVAL));
+ }
+ *size = (caddr_t)aceptr - (caddr_t)z_acl;
+ return (0);
+}
+
+/*
+ * convert old ACL format to new
+ */
+void
+zfs_acl_xform(znode_t *zp, zfs_acl_t *aclp, cred_t *cr)
+{
+ zfs_oldace_t *oldaclp;
+ int i;
+ uint16_t type, iflags;
+ uint32_t access_mask;
+ uint64_t who;
+ void *cookie = NULL;
+ zfs_acl_node_t *newaclnode;
+
+ ASSERT(aclp->z_version == ZFS_ACL_VERSION_INITIAL);
+ /*
+ * First create the ACE in a contiguous piece of memory
+ * for zfs_copy_ace_2_fuid().
+ *
+ * We only convert an ACL once, so this won't happen
+	 * every time.
+ */
+ oldaclp = kmem_alloc(sizeof (zfs_oldace_t) * aclp->z_acl_count,
+ KM_SLEEP);
+ i = 0;
+ while ((cookie = zfs_acl_next_ace(aclp, cookie, &who,
+ &access_mask, &iflags, &type))) {
+ oldaclp[i].z_flags = iflags;
+ oldaclp[i].z_type = type;
+ oldaclp[i].z_fuid = who;
+ oldaclp[i++].z_access_mask = access_mask;
+ }
+
+ newaclnode = zfs_acl_node_alloc(aclp->z_acl_count *
+ sizeof (zfs_object_ace_t));
+ aclp->z_ops = &zfs_acl_fuid_ops;
+ VERIFY(zfs_copy_ace_2_fuid(zp->z_zfsvfs, ZTOV(zp)->v_type, aclp,
+ oldaclp, newaclnode->z_acldata, aclp->z_acl_count,
+ &newaclnode->z_size, NULL, cr) == 0);
+ newaclnode->z_ace_count = aclp->z_acl_count;
+ aclp->z_version = ZFS_ACL_VERSION;
+ kmem_free(oldaclp, aclp->z_acl_count * sizeof (zfs_oldace_t));
+
+ /*
+ * Release all previous ACL nodes
+ */
+
+ zfs_acl_release_nodes(aclp);
+
+ list_insert_head(&aclp->z_acl, newaclnode);
+
+ aclp->z_acl_bytes = newaclnode->z_size;
+ aclp->z_acl_count = newaclnode->z_ace_count;
+
+}
+
+/*
+ * Convert unix access mask to v4 access mask
+ */
+static uint32_t
+zfs_unix_to_v4(uint32_t access_mask)
+{
+ uint32_t new_mask = 0;
+
+ if (access_mask & S_IXOTH)
+ new_mask |= ACE_EXECUTE;
+ if (access_mask & S_IWOTH)
+ new_mask |= ACE_WRITE_DATA;
+ if (access_mask & S_IROTH)
+ new_mask |= ACE_READ_DATA;
+ return (new_mask);
+}
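
zfs_unix_to_v4() only inspects the low rwx triplet (the S_I?OTH bits), so a caller converting the owner or group bits would shift them down into that position first. A tiny runnable demo of the mapping, where the DEMO_ACE_* values are stand-ins for the real ACE_* constants:

/* Demo of the rwx -> v4-mask mapping; DEMO_ACE_* values are stand-ins
 * for the real ACE_* constants. */
#include <stdio.h>
#include <stdint.h>

#define DEMO_ACE_READ_DATA	0x00000001u
#define DEMO_ACE_WRITE_DATA	0x00000002u
#define DEMO_ACE_EXECUTE	0x00000004u

static uint32_t
demo_unix_to_v4(uint32_t bits)	/* low three bits: r=4 w=2 x=1 */
{
	uint32_t mask = 0;

	if (bits & 1)
		mask |= DEMO_ACE_EXECUTE;
	if (bits & 2)
		mask |= DEMO_ACE_WRITE_DATA;
	if (bits & 4)
		mask |= DEMO_ACE_READ_DATA;
	return (mask);
}

int
main(void)
{
	/* 0755: owner rwx=7, group r-x=5, other r-x=5 */
	printf("owner 0x%x group 0x%x other 0x%x\n",
	    demo_unix_to_v4((0755 >> 6) & 7),
	    demo_unix_to_v4((0755 >> 3) & 7),
	    demo_unix_to_v4(0755 & 7));
	return (0);
}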
+
+static void
+zfs_set_ace(zfs_acl_t *aclp, void *acep, uint32_t access_mask,
+ uint16_t access_type, uint64_t fuid, uint16_t entry_type)
+{
+ uint16_t type = entry_type & ACE_TYPE_FLAGS;
+
+ aclp->z_ops->ace_mask_set(acep, access_mask);
+ aclp->z_ops->ace_type_set(acep, access_type);
+ aclp->z_ops->ace_flags_set(acep, entry_type);
+ if ((type != ACE_OWNER && type != OWNING_GROUP &&
+ type != ACE_EVERYONE))
+ aclp->z_ops->ace_who_set(acep, fuid);
+}
+
+/*
+ * Determine mode of file based on ACL.
+ */
+uint64_t
+zfs_mode_compute(uint64_t fmode, zfs_acl_t *aclp,
+ uint64_t *pflags, uint64_t fuid, uint64_t fgid)
+{
+ int entry_type;
+ mode_t mode;
+ mode_t seen = 0;
+ zfs_ace_hdr_t *acep = NULL;
+ uint64_t who;
+ uint16_t iflags, type;
+ uint32_t access_mask;
+ boolean_t an_exec_denied = B_FALSE;
+
+ mode = (fmode & (S_IFMT | S_ISUID | S_ISGID | S_ISVTX));
+
+ while ((acep = zfs_acl_next_ace(aclp, acep, &who,
+ &access_mask, &iflags, &type))) {
+
+ if (!zfs_acl_valid_ace_type(type, iflags))
+ continue;
+
+ entry_type = (iflags & ACE_TYPE_FLAGS);
+
+ /*
+ * Skip over any inherit_only ACEs
+ */
+ if (iflags & ACE_INHERIT_ONLY_ACE)
+ continue;
+
+ if (entry_type == ACE_OWNER || (entry_type == 0 &&
+ who == fuid)) {
+ if ((access_mask & ACE_READ_DATA) &&
+ (!(seen & S_IRUSR))) {
+ seen |= S_IRUSR;
+ if (type == ALLOW) {
+ mode |= S_IRUSR;
+ }
+ }
+ if ((access_mask & ACE_WRITE_DATA) &&
+ (!(seen & S_IWUSR))) {
+ seen |= S_IWUSR;
+ if (type == ALLOW) {
+ mode |= S_IWUSR;
+ }
+ }
+ if ((access_mask & ACE_EXECUTE) &&
+ (!(seen & S_IXUSR))) {
+ seen |= S_IXUSR;
+ if (type == ALLOW) {
+ mode |= S_IXUSR;
+ }
+ }
+ } else if (entry_type == OWNING_GROUP ||
+ (entry_type == ACE_IDENTIFIER_GROUP && who == fgid)) {
+ if ((access_mask & ACE_READ_DATA) &&
+ (!(seen & S_IRGRP))) {
+ seen |= S_IRGRP;
+ if (type == ALLOW) {
+ mode |= S_IRGRP;
+ }
+ }
+ if ((access_mask & ACE_WRITE_DATA) &&
+ (!(seen & S_IWGRP))) {
+ seen |= S_IWGRP;
+ if (type == ALLOW) {
+ mode |= S_IWGRP;
+ }
+ }
+ if ((access_mask & ACE_EXECUTE) &&
+ (!(seen & S_IXGRP))) {
+ seen |= S_IXGRP;
+ if (type == ALLOW) {
+ mode |= S_IXGRP;
+ }
+ }
+ } else if (entry_type == ACE_EVERYONE) {
+ if ((access_mask & ACE_READ_DATA)) {
+ if (!(seen & S_IRUSR)) {
+ seen |= S_IRUSR;
+ if (type == ALLOW) {
+ mode |= S_IRUSR;
+ }
+ }
+ if (!(seen & S_IRGRP)) {
+ seen |= S_IRGRP;
+ if (type == ALLOW) {
+ mode |= S_IRGRP;
+ }
+ }
+ if (!(seen & S_IROTH)) {
+ seen |= S_IROTH;
+ if (type == ALLOW) {
+ mode |= S_IROTH;
+ }
+ }
+ }
+ if ((access_mask & ACE_WRITE_DATA)) {
+ if (!(seen & S_IWUSR)) {
+ seen |= S_IWUSR;
+ if (type == ALLOW) {
+ mode |= S_IWUSR;
+ }
+ }
+ if (!(seen & S_IWGRP)) {
+ seen |= S_IWGRP;
+ if (type == ALLOW) {
+ mode |= S_IWGRP;
+ }
+ }
+ if (!(seen & S_IWOTH)) {
+ seen |= S_IWOTH;
+ if (type == ALLOW) {
+ mode |= S_IWOTH;
+ }
+ }
+ }
+ if ((access_mask & ACE_EXECUTE)) {
+ if (!(seen & S_IXUSR)) {
+ seen |= S_IXUSR;
+ if (type == ALLOW) {
+ mode |= S_IXUSR;
+ }
+ }
+ if (!(seen & S_IXGRP)) {
+ seen |= S_IXGRP;
+ if (type == ALLOW) {
+ mode |= S_IXGRP;
+ }
+ }
+ if (!(seen & S_IXOTH)) {
+ seen |= S_IXOTH;
+ if (type == ALLOW) {
+ mode |= S_IXOTH;
+ }
+ }
+ }
+ } else {
+ /*
+ * Only care if this IDENTIFIER_GROUP or
+ * USER ACE denies execute access to someone,
+			 * USER ACE denies execute access to someone;
+			 * the mode is not affected.
+ if ((access_mask & ACE_EXECUTE) && type == DENY)
+ an_exec_denied = B_TRUE;
+ }
+ }
+
+ /*
+ * Failure to allow is effectively a deny, so execute permission
+ * is denied if it was never mentioned or if we explicitly
+ * weren't allowed it.
+ */
+ if (!an_exec_denied &&
+ ((seen & ALL_MODE_EXECS) != ALL_MODE_EXECS ||
+ (mode & ALL_MODE_EXECS) != ALL_MODE_EXECS))
+ an_exec_denied = B_TRUE;
+
+ if (an_exec_denied)
+ *pflags &= ~ZFS_NO_EXECS_DENIED;
+ else
+ *pflags |= ZFS_NO_EXECS_DENIED;
+
+ return (mode);
+}
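
The mode computation above is first-match-wins per permission bit: the seen mask records which rwx bits have already been decided, an ALLOW ACE turns the bit on in mode, and a DENY ACE merely marks it seen so a later ALLOW cannot re-grant it. A compact standalone model of that accumulation, with a hypothetical ACE list:

/* Sketch of the first-match-wins accumulation used by zfs_mode_compute();
 * the entry list and bit values here are hypothetical. */
#include <stdio.h>

struct demo_entry { int allow; unsigned bit; };

int
main(void)
{
	/* DENY write first, then ALLOW read and (too late) ALLOW write. */
	struct demo_entry aces[] = {
		{ 0, 0200 },	/* deny  w */
		{ 1, 0400 },	/* allow r */
		{ 1, 0200 },	/* allow w -- ignored, already seen */
	};
	int n = sizeof (aces) / sizeof (aces[0]);
	unsigned mode = 0, seen = 0;

	for (int i = 0; i < n; i++) {
		if (seen & aces[i].bit)
			continue;	/* first ACE for this bit wins */
		seen |= aces[i].bit;
		if (aces[i].allow)
			mode |= aces[i].bit;
	}
	printf("mode 0%o\n", mode);	/* prints 0400 */
	return (0);
}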
+
+/*
+ * Read an external acl object. If the intent is to modify, always
+ * create a new acl and leave any cached acl in place.
+ */
+int
+zfs_acl_node_read(znode_t *zp, boolean_t have_lock, zfs_acl_t **aclpp,
+ boolean_t will_modify)
+{
+ zfs_acl_t *aclp;
+ int aclsize;
+ int acl_count;
+ zfs_acl_node_t *aclnode;
+ zfs_acl_phys_t znode_acl;
+ int version;
+ int error;
+
+ ASSERT(MUTEX_HELD(&zp->z_acl_lock));
+ if (zp->z_zfsvfs->z_replay == B_FALSE)
+ ASSERT_VOP_LOCKED(ZTOV(zp), __func__);
+
+ if (zp->z_acl_cached && !will_modify) {
+ *aclpp = zp->z_acl_cached;
+ return (0);
+ }
+
+ version = zfs_znode_acl_version(zp);
+
+ if ((error = zfs_acl_znode_info(zp, &aclsize,
+ &acl_count, &znode_acl)) != 0) {
+ goto done;
+ }
+
+ aclp = zfs_acl_alloc(version);
+
+ aclp->z_acl_count = acl_count;
+ aclp->z_acl_bytes = aclsize;
+
+ aclnode = zfs_acl_node_alloc(aclsize);
+ aclnode->z_ace_count = aclp->z_acl_count;
+ aclnode->z_size = aclsize;
+
+ if (!zp->z_is_sa) {
+ if (znode_acl.z_acl_extern_obj) {
+ error = dmu_read(zp->z_zfsvfs->z_os,
+ znode_acl.z_acl_extern_obj, 0, aclnode->z_size,
+ aclnode->z_acldata, DMU_READ_PREFETCH);
+ } else {
+ bcopy(znode_acl.z_ace_data, aclnode->z_acldata,
+ aclnode->z_size);
+ }
+ } else {
+ error = sa_lookup(zp->z_sa_hdl, SA_ZPL_DACL_ACES(zp->z_zfsvfs),
+ aclnode->z_acldata, aclnode->z_size);
+ }
+
+ if (error != 0) {
+ zfs_acl_free(aclp);
+ zfs_acl_node_free(aclnode);
+ /* convert checksum errors into IO errors */
+ if (error == ECKSUM)
+ error = SET_ERROR(EIO);
+ goto done;
+ }
+
+ list_insert_head(&aclp->z_acl, aclnode);
+
+ *aclpp = aclp;
+ if (!will_modify)
+ zp->z_acl_cached = aclp;
+done:
+ return (error);
+}
+
+/*ARGSUSED*/
+void
+zfs_acl_data_locator(void **dataptr, uint32_t *length, uint32_t buflen,
+ boolean_t start, void *userdata)
+{
+ zfs_acl_locator_cb_t *cb = (zfs_acl_locator_cb_t *)userdata;
+
+ if (start) {
+ cb->cb_acl_node = list_head(&cb->cb_aclp->z_acl);
+ } else {
+ cb->cb_acl_node = list_next(&cb->cb_aclp->z_acl,
+ cb->cb_acl_node);
+ }
+ *dataptr = cb->cb_acl_node->z_acldata;
+ *length = cb->cb_acl_node->z_size;
+}
+
+int
+zfs_acl_chown_setattr(znode_t *zp)
+{
+ int error;
+ zfs_acl_t *aclp;
+
+ if (zp->z_zfsvfs->z_replay == B_FALSE)
+ ASSERT_VOP_ELOCKED(ZTOV(zp), __func__);
+ ASSERT(MUTEX_HELD(&zp->z_acl_lock));
+ ASSERT_VOP_IN_SEQC(ZTOV(zp));
+
+ if ((error = zfs_acl_node_read(zp, B_TRUE, &aclp, B_FALSE)) == 0)
+ zp->z_mode = zfs_mode_compute(zp->z_mode, aclp,
+ &zp->z_pflags, zp->z_uid, zp->z_gid);
+ return (error);
+}
+
+/*
+ * Common code for setting ACLs.
+ *
+ * This function is called from zfs_mode_update, zfs_perm_init, and zfs_setacl.
+ */
+int
+zfs_aclset_common(znode_t *zp, zfs_acl_t *aclp, cred_t *cr, dmu_tx_t *tx)
+{
+ int error;
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ dmu_object_type_t otype;
+ zfs_acl_locator_cb_t locate = { 0 };
+ uint64_t mode;
+ sa_bulk_attr_t bulk[5];
+ uint64_t ctime[2];
+ int count = 0;
+ zfs_acl_phys_t acl_phys;
+
+ ASSERT_VOP_IN_SEQC(ZTOV(zp));
+
+ mode = zp->z_mode;
+
+ mode = zfs_mode_compute(mode, aclp, &zp->z_pflags,
+ zp->z_uid, zp->z_gid);
+
+ zp->z_mode = mode;
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
+ &mode, sizeof (mode));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+ &zp->z_pflags, sizeof (zp->z_pflags));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
+ &ctime, sizeof (ctime));
+
+ if (zp->z_acl_cached) {
+ zfs_acl_free(zp->z_acl_cached);
+ zp->z_acl_cached = NULL;
+ }
+
+ /*
+ * Upgrade needed?
+ */
+ if (!zfsvfs->z_use_fuids) {
+ otype = DMU_OT_OLDACL;
+ } else {
+ if ((aclp->z_version == ZFS_ACL_VERSION_INITIAL) &&
+ (zfsvfs->z_version >= ZPL_VERSION_FUID))
+ zfs_acl_xform(zp, aclp, cr);
+ ASSERT(aclp->z_version >= ZFS_ACL_VERSION_FUID);
+ otype = DMU_OT_ACL;
+ }
+
+ /*
+	 * Arrgh, we have to handle the old on-disk format
+	 * as well as the newer (preferred) SA format.
+ */
+
+ if (zp->z_is_sa) { /* the easy case, just update the ACL attribute */
+ locate.cb_aclp = aclp;
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_DACL_ACES(zfsvfs),
+ zfs_acl_data_locator, &locate, aclp->z_acl_bytes);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_DACL_COUNT(zfsvfs),
+ NULL, &aclp->z_acl_count, sizeof (uint64_t));
+ } else { /* Painful legacy way */
+ zfs_acl_node_t *aclnode;
+ uint64_t off = 0;
+ uint64_t aoid;
+
+ if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(zfsvfs),
+ &acl_phys, sizeof (acl_phys))) != 0)
+ return (error);
+
+ aoid = acl_phys.z_acl_extern_obj;
+
+ if (aclp->z_acl_bytes > ZFS_ACE_SPACE) {
+ /*
+ * If ACL was previously external and we are now
+ * converting to new ACL format then release old
+ * ACL object and create a new one.
+ */
+ if (aoid &&
+ aclp->z_version != acl_phys.z_acl_version) {
+ error = dmu_object_free(zfsvfs->z_os, aoid, tx);
+ if (error)
+ return (error);
+ aoid = 0;
+ }
+ if (aoid == 0) {
+ aoid = dmu_object_alloc(zfsvfs->z_os,
+ otype, aclp->z_acl_bytes,
+ otype == DMU_OT_ACL ?
+ DMU_OT_SYSACL : DMU_OT_NONE,
+ otype == DMU_OT_ACL ?
+ DN_OLD_MAX_BONUSLEN : 0, tx);
+ } else {
+ (void) dmu_object_set_blocksize(zfsvfs->z_os,
+ aoid, aclp->z_acl_bytes, 0, tx);
+ }
+ acl_phys.z_acl_extern_obj = aoid;
+ for (aclnode = list_head(&aclp->z_acl); aclnode;
+ aclnode = list_next(&aclp->z_acl, aclnode)) {
+ if (aclnode->z_ace_count == 0)
+ continue;
+ dmu_write(zfsvfs->z_os, aoid, off,
+ aclnode->z_size, aclnode->z_acldata, tx);
+ off += aclnode->z_size;
+ }
+ } else {
+ void *start = acl_phys.z_ace_data;
+ /*
+ * Migrating back embedded?
+ */
+ if (acl_phys.z_acl_extern_obj) {
+ error = dmu_object_free(zfsvfs->z_os,
+ acl_phys.z_acl_extern_obj, tx);
+ if (error)
+ return (error);
+ acl_phys.z_acl_extern_obj = 0;
+ }
+
+ for (aclnode = list_head(&aclp->z_acl); aclnode;
+ aclnode = list_next(&aclp->z_acl, aclnode)) {
+ if (aclnode->z_ace_count == 0)
+ continue;
+ bcopy(aclnode->z_acldata, start,
+ aclnode->z_size);
+ start = (caddr_t)start + aclnode->z_size;
+ }
+ }
+ /*
+		 * If old version, then swap count/bytes to match old
+ * layout of znode_acl_phys_t.
+ */
+ if (aclp->z_version == ZFS_ACL_VERSION_INITIAL) {
+ acl_phys.z_acl_size = aclp->z_acl_count;
+ acl_phys.z_acl_count = aclp->z_acl_bytes;
+ } else {
+ acl_phys.z_acl_size = aclp->z_acl_bytes;
+ acl_phys.z_acl_count = aclp->z_acl_count;
+ }
+ acl_phys.z_acl_version = aclp->z_version;
+
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ZNODE_ACL(zfsvfs), NULL,
+ &acl_phys, sizeof (acl_phys));
+ }
+
+ /*
+ * Replace ACL wide bits, but first clear them.
+ */
+ zp->z_pflags &= ~ZFS_ACL_WIDE_FLAGS;
+
+ zp->z_pflags |= aclp->z_hints;
+
+ if (ace_trivial_common(aclp, 0, zfs_ace_walk) == 0)
+ zp->z_pflags |= ZFS_ACL_TRIVIAL;
+
+ zfs_tstamp_update_setup(zp, STATE_CHANGED, NULL, ctime);
+ return (sa_bulk_update(zp->z_sa_hdl, bulk, count, tx));
+}
+
+static void
+zfs_acl_chmod(vtype_t vtype, uint64_t mode, boolean_t split, boolean_t trim,
+ zfs_acl_t *aclp)
+{
+ void *acep = NULL;
+ uint64_t who;
+ int new_count, new_bytes;
+ int ace_size;
+ int entry_type;
+ uint16_t iflags, type;
+ uint32_t access_mask;
+ zfs_acl_node_t *newnode;
+ size_t abstract_size = aclp->z_ops->ace_abstract_size();
+ void *zacep;
+ boolean_t isdir;
+ trivial_acl_t masks;
+
+ new_count = new_bytes = 0;
+
+ isdir = (vtype == VDIR);
+
+ acl_trivial_access_masks((mode_t)mode, isdir, &masks);
+
+ newnode = zfs_acl_node_alloc((abstract_size * 6) + aclp->z_acl_bytes);
+
+ zacep = newnode->z_acldata;
+ if (masks.allow0) {
+ zfs_set_ace(aclp, zacep, masks.allow0, ALLOW, -1, ACE_OWNER);
+ zacep = (void *)((uintptr_t)zacep + abstract_size);
+ new_count++;
+ new_bytes += abstract_size;
+ }
+ if (masks.deny1) {
+ zfs_set_ace(aclp, zacep, masks.deny1, DENY, -1, ACE_OWNER);
+ zacep = (void *)((uintptr_t)zacep + abstract_size);
+ new_count++;
+ new_bytes += abstract_size;
+ }
+ if (masks.deny2) {
+ zfs_set_ace(aclp, zacep, masks.deny2, DENY, -1, OWNING_GROUP);
+ zacep = (void *)((uintptr_t)zacep + abstract_size);
+ new_count++;
+ new_bytes += abstract_size;
+ }
+
+ while ((acep = zfs_acl_next_ace(aclp, acep, &who, &access_mask,
+ &iflags, &type))) {
+ entry_type = (iflags & ACE_TYPE_FLAGS);
+ /*
+ * ACEs used to represent the file mode may be divided
+ * into an equivalent pair of inherit-only and regular
+ * ACEs, if they are inheritable.
+ * Skip regular ACEs, which are replaced by the new mode.
+ */
+ if (split && (entry_type == ACE_OWNER ||
+ entry_type == OWNING_GROUP ||
+ entry_type == ACE_EVERYONE)) {
+ if (!isdir || !(iflags &
+ (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE)))
+ continue;
+ /*
+			 * We preserve owner@, group@, or everyone@
+ * permissions, if they are inheritable, by
+ * copying them to inherit_only ACEs. This
+ * prevents inheritable permissions from being
+ * altered along with the file mode.
+ */
+ iflags |= ACE_INHERIT_ONLY_ACE;
+ }
+
+ /*
+ * If this ACL has any inheritable ACEs, mark that in
+ * the hints (which are later masked into the pflags)
+ * so create knows to do inheritance.
+ */
+ if (isdir && (iflags &
+ (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE)))
+ aclp->z_hints |= ZFS_INHERIT_ACE;
+
+ if ((type != ALLOW && type != DENY) ||
+ (iflags & ACE_INHERIT_ONLY_ACE)) {
+ switch (type) {
+ case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE:
+ case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE:
+ case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE:
+ case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE:
+ aclp->z_hints |= ZFS_ACL_OBJ_ACE;
+ break;
+ }
+ } else {
+ /*
+ * Limit permissions granted by ACEs to be no greater
+ * than permissions of the requested group mode.
+ * Applies when the "aclmode" property is set to
+ * "groupmask".
+ */
+ if ((type == ALLOW) && trim)
+ access_mask &= masks.group;
+ }
+ zfs_set_ace(aclp, zacep, access_mask, type, who, iflags);
+ ace_size = aclp->z_ops->ace_size(acep);
+ zacep = (void *)((uintptr_t)zacep + ace_size);
+ new_count++;
+ new_bytes += ace_size;
+ }
+ zfs_set_ace(aclp, zacep, masks.owner, ALLOW, -1, ACE_OWNER);
+ zacep = (void *)((uintptr_t)zacep + abstract_size);
+ zfs_set_ace(aclp, zacep, masks.group, ALLOW, -1, OWNING_GROUP);
+ zacep = (void *)((uintptr_t)zacep + abstract_size);
+ zfs_set_ace(aclp, zacep, masks.everyone, ALLOW, -1, ACE_EVERYONE);
+
+ new_count += 3;
+ new_bytes += abstract_size * 3;
+ zfs_acl_release_nodes(aclp);
+ aclp->z_acl_count = new_count;
+ aclp->z_acl_bytes = new_bytes;
+ newnode->z_ace_count = new_count;
+ newnode->z_size = new_bytes;
+ list_insert_tail(&aclp->z_acl, newnode);
+}
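
zfs_acl_chmod() rebuilds the ACL into one new node whose worst-case size is six abstract entries plus the existing ACE bytes: up to three leading entries derived from the mode (allow0/deny1/deny2), the preserved explicit ACEs in the middle, and the three closing owner@/group@/everyone@ allow entries. A trivial sketch of that sizing arithmetic, with illustrative numbers in place of the real ace_abstract_size() and z_acl_bytes values:

/* Sketch of the worst-case allocation for the rebuilt ACL node: up to 3
 * leading plus exactly 3 trailing abstract entries around the preserved
 * explicit ACEs. The sizes below are illustrative only. */
#include <stdio.h>
#include <stddef.h>

int
main(void)
{
	size_t abstract_size = 16;	/* stand-in for ace_abstract_size() */
	size_t old_acl_bytes = 200;	/* stand-in for aclp->z_acl_bytes */
	size_t worst_case = abstract_size * 6 + old_acl_bytes;

	printf("allocate %zu bytes for the new node\n", worst_case);
	return (0);
}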
+
+int
+zfs_acl_chmod_setattr(znode_t *zp, zfs_acl_t **aclp, uint64_t mode)
+{
+ int error = 0;
+
+ mutex_enter(&zp->z_acl_lock);
+ if (zp->z_zfsvfs->z_replay == B_FALSE)
+ ASSERT_VOP_ELOCKED(ZTOV(zp), __func__);
+ if (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_DISCARD)
+ *aclp = zfs_acl_alloc(zfs_acl_version_zp(zp));
+ else
+ error = zfs_acl_node_read(zp, B_TRUE, aclp, B_TRUE);
+
+ if (error == 0) {
+ (*aclp)->z_hints = zp->z_pflags & V4_ACL_WIDE_FLAGS;
+ zfs_acl_chmod(ZTOV(zp)->v_type, mode, B_TRUE,
+ (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_GROUPMASK), *aclp);
+ }
+ mutex_exit(&zp->z_acl_lock);
+
+ return (error);
+}
+
+/*
+ * Should ACE be inherited?
+ */
+static int
+zfs_ace_can_use(vtype_t vtype, uint16_t acep_flags)
+{
+ int iflags = (acep_flags & 0xf);
+
+ if ((vtype == VDIR) && (iflags & ACE_DIRECTORY_INHERIT_ACE))
+ return (1);
+ else if (iflags & ACE_FILE_INHERIT_ACE)
+ return (!((vtype == VDIR) &&
+ (iflags & ACE_NO_PROPAGATE_INHERIT_ACE)));
+ return (0);
+}
+
+/*
+ * inherit inheritable ACEs from parent
+ */
+static zfs_acl_t *
+zfs_acl_inherit(zfsvfs_t *zfsvfs, vtype_t vtype, zfs_acl_t *paclp,
+ uint64_t mode, boolean_t *need_chmod)
+{
+ void *pacep = NULL;
+ void *acep;
+ zfs_acl_node_t *aclnode;
+ zfs_acl_t *aclp = NULL;
+ uint64_t who;
+ uint32_t access_mask;
+ uint16_t iflags, newflags, type;
+ size_t ace_size;
+ void *data1, *data2;
+ size_t data1sz, data2sz;
+ uint_t aclinherit;
+ boolean_t isdir = (vtype == VDIR);
+ boolean_t isreg = (vtype == VREG);
+
+ *need_chmod = B_TRUE;
+
+ aclp = zfs_acl_alloc(paclp->z_version);
+ aclinherit = zfsvfs->z_acl_inherit;
+ if (aclinherit == ZFS_ACL_DISCARD || vtype == VLNK)
+ return (aclp);
+
+ while ((pacep = zfs_acl_next_ace(paclp, pacep, &who,
+ &access_mask, &iflags, &type))) {
+
+ /*
+ * don't inherit bogus ACEs
+ */
+ if (!zfs_acl_valid_ace_type(type, iflags))
+ continue;
+
+ /*
+ * Check if ACE is inheritable by this vnode
+ */
+ if ((aclinherit == ZFS_ACL_NOALLOW && type == ALLOW) ||
+ !zfs_ace_can_use(vtype, iflags))
+ continue;
+
+ /*
+		 * If an owner@, group@, or everyone@ ACE is inheritable,
+		 * then zfs_acl_chmod() isn't needed.
+ */
+ if ((aclinherit == ZFS_ACL_PASSTHROUGH ||
+ aclinherit == ZFS_ACL_PASSTHROUGH_X) &&
+ ((iflags & (ACE_OWNER|ACE_EVERYONE)) ||
+ ((iflags & OWNING_GROUP) == OWNING_GROUP)) &&
+ (isreg || (isdir && (iflags & ACE_DIRECTORY_INHERIT_ACE))))
+ *need_chmod = B_FALSE;
+
+ /*
+ * Strip inherited execute permission from file if
+ * not in mode
+ */
+ if (aclinherit == ZFS_ACL_PASSTHROUGH_X && type == ALLOW &&
+ !isdir && ((mode & (S_IXUSR|S_IXGRP|S_IXOTH)) == 0)) {
+ access_mask &= ~ACE_EXECUTE;
+ }
+
+ /*
+ * Strip write_acl and write_owner from permissions
+ * when inheriting an ACE
+ */
+ if (aclinherit == ZFS_ACL_RESTRICTED && type == ALLOW) {
+ access_mask &= ~RESTRICTED_CLEAR;
+ }
+
+ ace_size = aclp->z_ops->ace_size(pacep);
+ aclnode = zfs_acl_node_alloc(ace_size);
+ list_insert_tail(&aclp->z_acl, aclnode);
+ acep = aclnode->z_acldata;
+
+ zfs_set_ace(aclp, acep, access_mask, type,
+ who, iflags|ACE_INHERITED_ACE);
+
+ /*
+ * Copy special opaque data if any
+ */
+ if ((data1sz = paclp->z_ops->ace_data(pacep, &data1)) != 0) {
+ VERIFY((data2sz = aclp->z_ops->ace_data(acep,
+ &data2)) == data1sz);
+ bcopy(data1, data2, data2sz);
+ }
+
+ aclp->z_acl_count++;
+ aclnode->z_ace_count++;
+ aclp->z_acl_bytes += aclnode->z_size;
+ newflags = aclp->z_ops->ace_flags_get(acep);
+
+ /*
+ * If ACE is not to be inherited further, or if the vnode is
+ * not a directory, remove all inheritance flags
+ */
+ if (!isdir || (iflags & ACE_NO_PROPAGATE_INHERIT_ACE)) {
+ newflags &= ~ALL_INHERIT;
+ aclp->z_ops->ace_flags_set(acep,
+ newflags|ACE_INHERITED_ACE);
+ continue;
+ }
+
+ /*
+ * This directory has an inheritable ACE
+ */
+ aclp->z_hints |= ZFS_INHERIT_ACE;
+
+ /*
+ * If only FILE_INHERIT is set then turn on
+ * inherit_only
+ */
+ if ((iflags & (ACE_FILE_INHERIT_ACE |
+ ACE_DIRECTORY_INHERIT_ACE)) == ACE_FILE_INHERIT_ACE) {
+ newflags |= ACE_INHERIT_ONLY_ACE;
+ aclp->z_ops->ace_flags_set(acep,
+ newflags|ACE_INHERITED_ACE);
+ } else {
+ newflags &= ~ACE_INHERIT_ONLY_ACE;
+ aclp->z_ops->ace_flags_set(acep,
+ newflags|ACE_INHERITED_ACE);
+ }
+ }
+ if (zfsvfs->z_acl_mode == ZFS_ACL_RESTRICTED &&
+ aclp->z_acl_count != 0) {
+ *need_chmod = B_FALSE;
+ }
+
+ return (aclp);
+}
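
The subtle part of zfs_acl_inherit() is how the inheritance flags are rewritten on each inherited copy: if the child is not a directory, or the source ACE says no-propagate, every inheritance bit is stripped; if only FILE_INHERIT is set on a directory child, the copy becomes inherit-only so it affects future files but not the directory itself; and every copy is stamped ACE_INHERITED_ACE. A standalone sketch of that rewrite, with stand-in flag values:

/* Sketch of the inheritance-flag rewrite applied to each inherited ACE;
 * the flag values are stand-ins for the real ACE_* inheritance bits. */
#include <stdio.h>

#define F_FILE_INHERIT	0x01u
#define F_DIR_INHERIT	0x02u
#define F_NO_PROPAGATE	0x04u
#define F_INHERIT_ONLY	0x08u
#define F_INHERITED	0x80u
#define F_ALL_INHERIT	(F_FILE_INHERIT|F_DIR_INHERIT|F_NO_PROPAGATE|F_INHERIT_ONLY)

static unsigned
demo_inherit_flags(unsigned iflags, int child_is_dir)
{
	unsigned newflags = iflags;

	if (!child_is_dir || (iflags & F_NO_PROPAGATE)) {
		/* Terminal copy: strip every inheritance bit. */
		newflags &= ~F_ALL_INHERIT;
	} else if ((iflags & (F_FILE_INHERIT | F_DIR_INHERIT)) ==
	    F_FILE_INHERIT) {
		/* Applies only to files below, not to this directory. */
		newflags |= F_INHERIT_ONLY;
	} else {
		newflags &= ~F_INHERIT_ONLY;
	}
	return (newflags | F_INHERITED);
}

int
main(void)
{
	printf("file child:     0x%x\n", demo_inherit_flags(F_FILE_INHERIT, 0));
	printf("dir, file-only: 0x%x\n", demo_inherit_flags(F_FILE_INHERIT, 1));
	printf("dir, both:      0x%x\n",
	    demo_inherit_flags(F_FILE_INHERIT | F_DIR_INHERIT, 1));
	return (0);
}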
+
+/*
+ * Create file system object initial permissions
+ * including inheritable ACEs.
+ * Also, create FUIDs for owner and group.
+ */
+int
+zfs_acl_ids_create(znode_t *dzp, int flag, vattr_t *vap, cred_t *cr,
+ vsecattr_t *vsecp, zfs_acl_ids_t *acl_ids)
+{
+ int error;
+ zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+ zfs_acl_t *paclp;
+ gid_t gid;
+ boolean_t need_chmod = B_TRUE;
+ boolean_t trim = B_FALSE;
+ boolean_t inherited = B_FALSE;
+
+ if ((flag & IS_ROOT_NODE) == 0) {
+ if (zfsvfs->z_replay == B_FALSE)
+ ASSERT_VOP_ELOCKED(ZTOV(dzp), __func__);
+ } else
+ ASSERT(dzp->z_vnode == NULL);
+ bzero(acl_ids, sizeof (zfs_acl_ids_t));
+ acl_ids->z_mode = MAKEIMODE(vap->va_type, vap->va_mode);
+
+ if (vsecp)
+ if ((error = zfs_vsec_2_aclp(zfsvfs, vap->va_type, vsecp, cr,
+ &acl_ids->z_fuidp, &acl_ids->z_aclp)) != 0)
+ return (error);
+ /*
+ * Determine uid and gid.
+ */
+ if ((flag & IS_ROOT_NODE) || zfsvfs->z_replay ||
+ ((flag & IS_XATTR) && (vap->va_type == VDIR))) {
+ acl_ids->z_fuid = zfs_fuid_create(zfsvfs,
+ (uint64_t)vap->va_uid, cr,
+ ZFS_OWNER, &acl_ids->z_fuidp);
+ acl_ids->z_fgid = zfs_fuid_create(zfsvfs,
+ (uint64_t)vap->va_gid, cr,
+ ZFS_GROUP, &acl_ids->z_fuidp);
+ gid = vap->va_gid;
+ } else {
+ acl_ids->z_fuid = zfs_fuid_create_cred(zfsvfs, ZFS_OWNER,
+ cr, &acl_ids->z_fuidp);
+ acl_ids->z_fgid = 0;
+ if (vap->va_mask & AT_GID) {
+ acl_ids->z_fgid = zfs_fuid_create(zfsvfs,
+ (uint64_t)vap->va_gid,
+ cr, ZFS_GROUP, &acl_ids->z_fuidp);
+ gid = vap->va_gid;
+ if (acl_ids->z_fgid != dzp->z_gid &&
+ !groupmember(vap->va_gid, cr) &&
+ secpolicy_vnode_create_gid(cr) != 0)
+ acl_ids->z_fgid = 0;
+ }
+ if (acl_ids->z_fgid == 0) {
+ char *domain;
+ uint32_t rid;
+
+ acl_ids->z_fgid = dzp->z_gid;
+ gid = zfs_fuid_map_id(zfsvfs, acl_ids->z_fgid,
+ cr, ZFS_GROUP);
+
+ if (zfsvfs->z_use_fuids &&
+ IS_EPHEMERAL(acl_ids->z_fgid)) {
+ domain =
+ zfs_fuid_idx_domain(&zfsvfs->z_fuid_idx,
+ FUID_INDEX(acl_ids->z_fgid));
+ rid = FUID_RID(acl_ids->z_fgid);
+ zfs_fuid_node_add(&acl_ids->z_fuidp,
+ domain, rid, FUID_INDEX(acl_ids->z_fgid),
+ acl_ids->z_fgid, ZFS_GROUP);
+ }
+ }
+ }
+
+ /*
+ * If we're creating a directory, and the parent directory has the
+	 * set-GID bit set, set it on the new directory.
+ * Otherwise, if the user is neither privileged nor a member of the
+ * file's new group, clear the file's set-GID bit.
+ */
+
+ if (!(flag & IS_ROOT_NODE) && (dzp->z_mode & S_ISGID) &&
+ (vap->va_type == VDIR)) {
+ acl_ids->z_mode |= S_ISGID;
+ } else {
+ if ((acl_ids->z_mode & S_ISGID) &&
+ secpolicy_vnode_setids_setgids(ZTOV(dzp), cr, gid) != 0)
+ acl_ids->z_mode &= ~S_ISGID;
+ }
+
+ if (acl_ids->z_aclp == NULL) {
+ mutex_enter(&dzp->z_acl_lock);
+ if (!(flag & IS_ROOT_NODE) &&
+ (dzp->z_pflags & ZFS_INHERIT_ACE) &&
+ !(dzp->z_pflags & ZFS_XATTR)) {
+ VERIFY0(zfs_acl_node_read(dzp, B_TRUE,
+ &paclp, B_FALSE));
+ acl_ids->z_aclp = zfs_acl_inherit(zfsvfs,
+ vap->va_type, paclp, acl_ids->z_mode, &need_chmod);
+ inherited = B_TRUE;
+ } else {
+ acl_ids->z_aclp =
+ zfs_acl_alloc(zfs_acl_version_zp(dzp));
+ acl_ids->z_aclp->z_hints |= ZFS_ACL_TRIVIAL;
+ }
+ mutex_exit(&dzp->z_acl_lock);
+
+ if (need_chmod) {
+ if (vap->va_type == VDIR)
+ acl_ids->z_aclp->z_hints |=
+ ZFS_ACL_AUTO_INHERIT;
+
+ if (zfsvfs->z_acl_mode == ZFS_ACL_GROUPMASK &&
+ zfsvfs->z_acl_inherit != ZFS_ACL_PASSTHROUGH &&
+ zfsvfs->z_acl_inherit != ZFS_ACL_PASSTHROUGH_X)
+ trim = B_TRUE;
+ zfs_acl_chmod(vap->va_type, acl_ids->z_mode, B_FALSE,
+ trim, acl_ids->z_aclp);
+ }
+ }
+
+ if (inherited || vsecp) {
+ acl_ids->z_mode = zfs_mode_compute(acl_ids->z_mode,
+ acl_ids->z_aclp, &acl_ids->z_aclp->z_hints,
+ acl_ids->z_fuid, acl_ids->z_fgid);
+ if (ace_trivial_common(acl_ids->z_aclp, 0, zfs_ace_walk) == 0)
+ acl_ids->z_aclp->z_hints |= ZFS_ACL_TRIVIAL;
+ }
+
+ return (0);
+}
+
+/*
+ * Free ACL and fuid_infop, but not the acl_ids structure
+ */
+void
+zfs_acl_ids_free(zfs_acl_ids_t *acl_ids)
+{
+ if (acl_ids->z_aclp)
+ zfs_acl_free(acl_ids->z_aclp);
+ if (acl_ids->z_fuidp)
+ zfs_fuid_info_free(acl_ids->z_fuidp);
+ acl_ids->z_aclp = NULL;
+ acl_ids->z_fuidp = NULL;
+}
+
+boolean_t
+zfs_acl_ids_overquota(zfsvfs_t *zv, zfs_acl_ids_t *acl_ids, uint64_t projid)
+{
+ return (zfs_id_overquota(zv, DMU_USERUSED_OBJECT, acl_ids->z_fuid) ||
+ zfs_id_overquota(zv, DMU_GROUPUSED_OBJECT, acl_ids->z_fgid) ||
+ (projid != ZFS_DEFAULT_PROJID && projid != ZFS_INVALID_PROJID &&
+ zfs_id_overquota(zv, DMU_PROJECTUSED_OBJECT, projid)));
+}
+
+/*
+ * Retrieve a file's ACL
+ */
+int
+zfs_getacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr)
+{
+ zfs_acl_t *aclp;
+ ulong_t mask;
+ int error;
+ int count = 0;
+ int largeace = 0;
+
+ mask = vsecp->vsa_mask & (VSA_ACE | VSA_ACECNT |
+ VSA_ACE_ACLFLAGS | VSA_ACE_ALLTYPES);
+
+ if (mask == 0)
+ return (SET_ERROR(ENOSYS));
+
+ if ((error = zfs_zaccess(zp, ACE_READ_ACL, 0, skipaclchk, cr)))
+ return (error);
+
+ mutex_enter(&zp->z_acl_lock);
+
+ if (zp->z_zfsvfs->z_replay == B_FALSE)
+ ASSERT_VOP_LOCKED(ZTOV(zp), __func__);
+ error = zfs_acl_node_read(zp, B_TRUE, &aclp, B_FALSE);
+ if (error != 0) {
+ mutex_exit(&zp->z_acl_lock);
+ return (error);
+ }
+
+ /*
+ * Scan ACL to determine number of ACEs
+ */
+ if ((zp->z_pflags & ZFS_ACL_OBJ_ACE) && !(mask & VSA_ACE_ALLTYPES)) {
+ void *zacep = NULL;
+ uint64_t who;
+ uint32_t access_mask;
+ uint16_t type, iflags;
+
+ while ((zacep = zfs_acl_next_ace(aclp, zacep,
+ &who, &access_mask, &iflags, &type))) {
+ switch (type) {
+ case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE:
+ case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE:
+ case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE:
+ case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE:
+ largeace++;
+ continue;
+ default:
+ count++;
+ }
+ }
+ vsecp->vsa_aclcnt = count;
+ } else
+ count = (int)aclp->z_acl_count;
+
+ if (mask & VSA_ACECNT) {
+ vsecp->vsa_aclcnt = count;
+ }
+
+ if (mask & VSA_ACE) {
+ size_t aclsz;
+
+ aclsz = count * sizeof (ace_t) +
+ sizeof (ace_object_t) * largeace;
+
+ vsecp->vsa_aclentp = kmem_alloc(aclsz, KM_SLEEP);
+ vsecp->vsa_aclentsz = aclsz;
+
+ if (aclp->z_version == ZFS_ACL_VERSION_FUID)
+ zfs_copy_fuid_2_ace(zp->z_zfsvfs, aclp, cr,
+ vsecp->vsa_aclentp, !(mask & VSA_ACE_ALLTYPES));
+ else {
+ zfs_acl_node_t *aclnode;
+ void *start = vsecp->vsa_aclentp;
+
+ for (aclnode = list_head(&aclp->z_acl); aclnode;
+ aclnode = list_next(&aclp->z_acl, aclnode)) {
+ bcopy(aclnode->z_acldata, start,
+ aclnode->z_size);
+ start = (caddr_t)start + aclnode->z_size;
+ }
+ ASSERT((caddr_t)start - (caddr_t)vsecp->vsa_aclentp ==
+ aclp->z_acl_bytes);
+ }
+ }
+ if (mask & VSA_ACE_ACLFLAGS) {
+ vsecp->vsa_aclflags = 0;
+ if (zp->z_pflags & ZFS_ACL_DEFAULTED)
+ vsecp->vsa_aclflags |= ACL_DEFAULTED;
+ if (zp->z_pflags & ZFS_ACL_PROTECTED)
+ vsecp->vsa_aclflags |= ACL_PROTECTED;
+ if (zp->z_pflags & ZFS_ACL_AUTO_INHERIT)
+ vsecp->vsa_aclflags |= ACL_AUTO_INHERIT;
+ }
+
+ mutex_exit(&zp->z_acl_lock);
+
+ return (0);
+}
+
+int
+zfs_vsec_2_aclp(zfsvfs_t *zfsvfs, umode_t obj_type,
+ vsecattr_t *vsecp, cred_t *cr, zfs_fuid_info_t **fuidp, zfs_acl_t **zaclp)
+{
+ zfs_acl_t *aclp;
+ zfs_acl_node_t *aclnode;
+ int aclcnt = vsecp->vsa_aclcnt;
+ int error;
+
+ if (vsecp->vsa_aclcnt > MAX_ACL_ENTRIES || vsecp->vsa_aclcnt <= 0)
+ return (SET_ERROR(EINVAL));
+
+ aclp = zfs_acl_alloc(zfs_acl_version(zfsvfs->z_version));
+
+ aclp->z_hints = 0;
+ aclnode = zfs_acl_node_alloc(aclcnt * sizeof (zfs_object_ace_t));
+ if (aclp->z_version == ZFS_ACL_VERSION_INITIAL) {
+ if ((error = zfs_copy_ace_2_oldace(obj_type, aclp,
+ (ace_t *)vsecp->vsa_aclentp, aclnode->z_acldata,
+ aclcnt, &aclnode->z_size)) != 0) {
+ zfs_acl_free(aclp);
+ zfs_acl_node_free(aclnode);
+ return (error);
+ }
+ } else {
+ if ((error = zfs_copy_ace_2_fuid(zfsvfs, obj_type, aclp,
+ vsecp->vsa_aclentp, aclnode->z_acldata, aclcnt,
+ &aclnode->z_size, fuidp, cr)) != 0) {
+ zfs_acl_free(aclp);
+ zfs_acl_node_free(aclnode);
+ return (error);
+ }
+ }
+ aclp->z_acl_bytes = aclnode->z_size;
+ aclnode->z_ace_count = aclcnt;
+ aclp->z_acl_count = aclcnt;
+ list_insert_head(&aclp->z_acl, aclnode);
+
+ /*
+ * If flags are being set then add them to z_hints
+ */
+ if (vsecp->vsa_mask & VSA_ACE_ACLFLAGS) {
+ if (vsecp->vsa_aclflags & ACL_PROTECTED)
+ aclp->z_hints |= ZFS_ACL_PROTECTED;
+ if (vsecp->vsa_aclflags & ACL_DEFAULTED)
+ aclp->z_hints |= ZFS_ACL_DEFAULTED;
+ if (vsecp->vsa_aclflags & ACL_AUTO_INHERIT)
+ aclp->z_hints |= ZFS_ACL_AUTO_INHERIT;
+ }
+
+ *zaclp = aclp;
+
+ return (0);
+}
+
+/*
+ * Set a file's ACL
+ */
+int
+zfs_setacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ zilog_t *zilog = zfsvfs->z_log;
+ ulong_t mask = vsecp->vsa_mask & (VSA_ACE | VSA_ACECNT);
+ dmu_tx_t *tx;
+ int error;
+ zfs_acl_t *aclp;
+ zfs_fuid_info_t *fuidp = NULL;
+ boolean_t fuid_dirtied;
+ uint64_t acl_obj;
+
+ if (zp->z_zfsvfs->z_replay == B_FALSE)
+ ASSERT_VOP_ELOCKED(ZTOV(zp), __func__);
+ if (mask == 0)
+ return (SET_ERROR(ENOSYS));
+
+ if (zp->z_pflags & ZFS_IMMUTABLE)
+ return (SET_ERROR(EPERM));
+
+ if ((error = zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr)))
+ return (error);
+
+ error = zfs_vsec_2_aclp(zfsvfs, ZTOV(zp)->v_type, vsecp, cr, &fuidp,
+ &aclp);
+ if (error)
+ return (error);
+
+ /*
+ * If ACL wide flags aren't being set then preserve any
+ * existing flags.
+ */
+ if (!(vsecp->vsa_mask & VSA_ACE_ACLFLAGS)) {
+ aclp->z_hints |=
+ (zp->z_pflags & V4_ACL_WIDE_FLAGS);
+ }
+top:
+ mutex_enter(&zp->z_acl_lock);
+
+ tx = dmu_tx_create(zfsvfs->z_os);
+
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
+
+ fuid_dirtied = zfsvfs->z_fuid_dirty;
+ if (fuid_dirtied)
+ zfs_fuid_txhold(zfsvfs, tx);
+
+ /*
+ * If old version and ACL won't fit in bonus and we aren't
+ * upgrading then take out necessary DMU holds
+ */
+
+ if ((acl_obj = zfs_external_acl(zp)) != 0) {
+ if (zfsvfs->z_version >= ZPL_VERSION_FUID &&
+ zfs_znode_acl_version(zp) <= ZFS_ACL_VERSION_INITIAL) {
+ dmu_tx_hold_free(tx, acl_obj, 0,
+ DMU_OBJECT_END);
+ dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
+ aclp->z_acl_bytes);
+ } else {
+ dmu_tx_hold_write(tx, acl_obj, 0, aclp->z_acl_bytes);
+ }
+ } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) {
+ dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, aclp->z_acl_bytes);
+ }
+
+ zfs_sa_upgrade_txholds(tx, zp);
+ error = dmu_tx_assign(tx, TXG_NOWAIT);
+ if (error) {
+ mutex_exit(&zp->z_acl_lock);
+
+ if (error == ERESTART) {
+ dmu_tx_wait(tx);
+ dmu_tx_abort(tx);
+ goto top;
+ }
+ dmu_tx_abort(tx);
+ zfs_acl_free(aclp);
+ return (error);
+ }
+
+ error = zfs_aclset_common(zp, aclp, cr, tx);
+ ASSERT(error == 0);
+ ASSERT(zp->z_acl_cached == NULL);
+ zp->z_acl_cached = aclp;
+
+ if (fuid_dirtied)
+ zfs_fuid_sync(zfsvfs, tx);
+
+ zfs_log_acl(zilog, tx, zp, vsecp, fuidp);
+
+ if (fuidp)
+ zfs_fuid_info_free(fuidp);
+ dmu_tx_commit(tx);
+ mutex_exit(&zp->z_acl_lock);
+
+ return (error);
+}
+
+/*
+ * Check accesses of interest (AoI) against attributes of the dataset
+ * such as read-only. Returns zero if no AoI conflict with dataset
+ * attributes, otherwise an appropriate errno is returned.
+ */
+static int
+zfs_zaccess_dataset_check(znode_t *zp, uint32_t v4_mode)
+{
+ if ((v4_mode & WRITE_MASK) &&
+ (zp->z_zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) &&
+ (!IS_DEVVP(ZTOV(zp)) ||
+ (IS_DEVVP(ZTOV(zp)) && (v4_mode & WRITE_MASK_ATTRS)))) {
+ return (SET_ERROR(EROFS));
+ }
+
+ /*
+ * Intentionally allow ZFS_READONLY through here.
+ * See zfs_zaccess_common().
+ */
+ if ((v4_mode & WRITE_MASK_DATA) &&
+ (zp->z_pflags & ZFS_IMMUTABLE)) {
+ return (SET_ERROR(EPERM));
+ }
+
+ /*
+	 * On FreeBSD we allow modifying a directory's contents if ZFS_NOUNLINK
+	 * (sunlnk) is set. We just don't allow directory removal, which is
+ * handled in zfs_zaccess_delete().
+ */
+ if ((v4_mode & ACE_DELETE) &&
+ (zp->z_pflags & ZFS_NOUNLINK)) {
+ return (EPERM);
+ }
+
+ if (((v4_mode & (ACE_READ_DATA|ACE_EXECUTE)) &&
+ (zp->z_pflags & ZFS_AV_QUARANTINED))) {
+ return (SET_ERROR(EACCES));
+ }
+
+ return (0);
+}
+
+/*
+ * The primary usage of this function is to loop through all of the
+ * ACEs in the znode, determining what accesses of interest (AoI) to
+ * the caller are allowed or denied. The AoI are expressed as bits in
+ * the working_mode parameter. As each ACE is processed, bits covered
+ * by that ACE are removed from the working_mode. This removal
+ * facilitates two things. The first is that when the working mode is
+ * empty (= 0), we know we've looked at all the AoI. The second is
+ * that the ACE interpretation rules don't allow a later ACE to undo
+ * something granted or denied by an earlier ACE. Removing the
+ * discovered access or denial enforces this rule. At the end of
+ * processing the ACEs, all AoI that were found to be denied are
+ * placed into the working_mode, giving the caller a mask of denied
+ * accesses. Returns:
+ * 0 if all AoI granted
+ *	EACCES if the denied mask is non-zero
+ * other error if abnormal failure (e.g., IO error)
+ *
+ * A secondary usage of the function is to determine if any of the
+ * AoI are granted. If an ACE grants any access in
+ * the working_mode, we immediately short circuit out of the function.
+ * This mode is chosen by setting anyaccess to B_TRUE. The
+ * working_mode is not a denied access mask upon exit if the function
+ * is used in this manner.
+ */
+static int
+zfs_zaccess_aces_check(znode_t *zp, uint32_t *working_mode,
+ boolean_t anyaccess, cred_t *cr)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ zfs_acl_t *aclp;
+ int error;
+ uid_t uid = crgetuid(cr);
+ uint64_t who;
+ uint16_t type, iflags;
+ uint16_t entry_type;
+ uint32_t access_mask;
+ uint32_t deny_mask = 0;
+ zfs_ace_hdr_t *acep = NULL;
+ boolean_t checkit;
+ uid_t gowner;
+ uid_t fowner;
+
+ zfs_fuid_map_ids(zp, cr, &fowner, &gowner);
+
+ mutex_enter(&zp->z_acl_lock);
+
+ if (zp->z_zfsvfs->z_replay == B_FALSE)
+ ASSERT_VOP_LOCKED(ZTOV(zp), __func__);
+ error = zfs_acl_node_read(zp, B_TRUE, &aclp, B_FALSE);
+ if (error != 0) {
+ mutex_exit(&zp->z_acl_lock);
+ return (error);
+ }
+
+ ASSERT(zp->z_acl_cached);
+
+ while ((acep = zfs_acl_next_ace(aclp, acep, &who, &access_mask,
+ &iflags, &type))) {
+ uint32_t mask_matched;
+
+ if (!zfs_acl_valid_ace_type(type, iflags))
+ continue;
+
+ if (ZTOV(zp)->v_type == VDIR && (iflags & ACE_INHERIT_ONLY_ACE))
+ continue;
+
+ /* Skip ACE if it does not affect any AoI */
+ mask_matched = (access_mask & *working_mode);
+ if (!mask_matched)
+ continue;
+
+ entry_type = (iflags & ACE_TYPE_FLAGS);
+
+ checkit = B_FALSE;
+
+ switch (entry_type) {
+ case ACE_OWNER:
+ if (uid == fowner)
+ checkit = B_TRUE;
+ break;
+ case OWNING_GROUP:
+ who = gowner;
+ /*FALLTHROUGH*/
+ case ACE_IDENTIFIER_GROUP:
+ checkit = zfs_groupmember(zfsvfs, who, cr);
+ break;
+ case ACE_EVERYONE:
+ checkit = B_TRUE;
+ break;
+
+ /* USER Entry */
+ default:
+ if (entry_type == 0) {
+ uid_t newid;
+
+ newid = zfs_fuid_map_id(zfsvfs, who, cr,
+ ZFS_ACE_USER);
+ if (newid != UID_NOBODY &&
+ uid == newid)
+ checkit = B_TRUE;
+ break;
+ } else {
+ mutex_exit(&zp->z_acl_lock);
+ return (SET_ERROR(EIO));
+ }
+ }
+
+ if (checkit) {
+ if (type == DENY) {
+ DTRACE_PROBE3(zfs__ace__denies,
+ znode_t *, zp,
+ zfs_ace_hdr_t *, acep,
+ uint32_t, mask_matched);
+ deny_mask |= mask_matched;
+ } else {
+ DTRACE_PROBE3(zfs__ace__allows,
+ znode_t *, zp,
+ zfs_ace_hdr_t *, acep,
+ uint32_t, mask_matched);
+ if (anyaccess) {
+ mutex_exit(&zp->z_acl_lock);
+ return (0);
+ }
+ }
+ *working_mode &= ~mask_matched;
+ }
+
+ /* Are we done? */
+ if (*working_mode == 0)
+ break;
+ }
+
+ mutex_exit(&zp->z_acl_lock);
+
+ /* Put the found 'denies' back on the working mode */
+ if (deny_mask) {
+ *working_mode |= deny_mask;
+ return (SET_ERROR(EACCES));
+ } else if (*working_mode) {
+ return (-1);
+ }
+
+ return (0);
+}
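
To make the mask bookkeeping concrete, here is a minimal userland sketch of the same walk. The struct and helper names are hypothetical, and the per-entry matching (owner, group, everyone) is reduced to a single "applies" flag; the early exit when the working mode reaches zero mirrors the "Are we done?" check in the real loop.

/* Hypothetical, simplified model of the working_mode walk above. */
#include <stdbool.h>
#include <stdint.h>

struct simple_ace {
	uint32_t mask;		/* access bits covered by this entry */
	bool deny;		/* DENY entry if true, ALLOW otherwise */
	bool applies;		/* does the entry match the caller? */
};

/*
 * Strip matched bits from *working_mode as entries are processed.  On
 * return, *working_mode holds the denied bits plus any bits that no
 * entry covered; zero means every requested bit was explicitly allowed.
 */
static uint32_t
check_aces(const struct simple_ace *aces, int n, uint32_t *working_mode)
{
	uint32_t deny_mask = 0;
	int i;

	for (i = 0; i < n && *working_mode != 0; i++) {
		uint32_t matched = aces[i].mask & *working_mode;

		if (matched == 0 || !aces[i].applies)
			continue;
		if (aces[i].deny)
			deny_mask |= matched;
		*working_mode &= ~matched;
	}
	*working_mode |= deny_mask;
	return (*working_mode);
}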
+
+/*
+ * Return true if any access whatsoever is granted; we don't actually
+ * care what access is granted.
+ */
+boolean_t
+zfs_has_access(znode_t *zp, cred_t *cr)
+{
+ uint32_t have = ACE_ALL_PERMS;
+
+ if (zfs_zaccess_aces_check(zp, &have, B_TRUE, cr) != 0) {
+ uid_t owner;
+
+ owner = zfs_fuid_map_id(zp->z_zfsvfs, zp->z_uid, cr, ZFS_OWNER);
+ return (secpolicy_vnode_any_access(cr, ZTOV(zp), owner) == 0);
+ }
+ return (B_TRUE);
+}
+
+static int
+zfs_zaccess_common(znode_t *zp, uint32_t v4_mode, uint32_t *working_mode,
+ boolean_t *check_privs, boolean_t skipaclchk, cred_t *cr)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ int err;
+
+ *working_mode = v4_mode;
+ *check_privs = B_TRUE;
+
+ /*
+ * Short circuit empty requests
+ */
+ if (v4_mode == 0 || zfsvfs->z_replay) {
+ *working_mode = 0;
+ return (0);
+ }
+
+ if ((err = zfs_zaccess_dataset_check(zp, v4_mode)) != 0) {
+ *check_privs = B_FALSE;
+ return (err);
+ }
+
+ /*
+ * The caller requested that the ACL check be skipped. This
+ * would only happen if the caller checked VOP_ACCESS() with a
+ * 32 bit ACE mask and already had the appropriate permissions.
+ */
+ if (skipaclchk) {
+ *working_mode = 0;
+ return (0);
+ }
+
+ /*
+ * Note: ZFS_READONLY represents the "DOS R/O" attribute.
+ * When that flag is set, we should behave as if write access
+ * were not granted by anything in the ACL. In particular:
+ * We _must_ allow writes after opening the file r/w, then
+ * setting the DOS R/O attribute, and writing some more.
+ * (Similar to how you can write after fchmod(fd, 0444).)
+ *
+ * Therefore ZFS_READONLY is ignored in the dataset check
+ * above, and checked here as if part of the ACL check.
+ * Also note: DOS R/O is ignored for directories.
+ */
+ if ((v4_mode & WRITE_MASK_DATA) &&
+ (ZTOV(zp)->v_type != VDIR) &&
+ (zp->z_pflags & ZFS_READONLY)) {
+ return (SET_ERROR(EPERM));
+ }
+
+ return (zfs_zaccess_aces_check(zp, working_mode, B_FALSE, cr));
+}
+
+static int
+zfs_zaccess_append(znode_t *zp, uint32_t *working_mode, boolean_t *check_privs,
+ cred_t *cr)
+{
+ if (*working_mode != ACE_WRITE_DATA)
+ return (SET_ERROR(EACCES));
+
+ return (zfs_zaccess_common(zp, ACE_APPEND_DATA, working_mode,
+ check_privs, B_FALSE, cr));
+}
+
+/*
+ * Check if VEXEC is allowed.
+ *
+ * This routine is based on zfs_fastaccesschk_execute, whose slowpath
+ * calls zfs_zaccess. That would be incorrect on FreeBSD (see
+ * zfs_freebsd_access for the difference). Thus this variant lets the
+ * caller handle the slowpath (if necessary).
+ *
+ * On top of that we perform a lockless check for ZFS_NO_EXECS_DENIED.
+ *
+ * Safe access to znode_t is provided by the vnode lock.
+ */
+int
+zfs_fastaccesschk_execute(znode_t *zdp, cred_t *cr)
+{
+ boolean_t is_attr;
+
+ if (zdp->z_pflags & ZFS_AV_QUARANTINED)
+ return (1);
+
+ is_attr = ((zdp->z_pflags & ZFS_XATTR) &&
+ (ZTOV(zdp)->v_type == VDIR));
+ if (is_attr)
+ return (1);
+
+ if (zdp->z_pflags & ZFS_NO_EXECS_DENIED)
+ return (0);
+
+ return (1);
+}
+
+
+/*
+ * Determine whether Access should be granted/denied.
+ *
+ * The least priv subsystem is always consulted as a basic privilege
+ * can define any form of access.
+ */
+int
+zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr)
+{
+ uint32_t working_mode;
+ int error;
+ int is_attr;
+ boolean_t check_privs;
+ znode_t *xzp = NULL;
+ znode_t *check_zp = zp;
+ mode_t needed_bits;
+ uid_t owner;
+
+ is_attr = ((zp->z_pflags & ZFS_XATTR) && (ZTOV(zp)->v_type == VDIR));
+
+#ifdef __FreeBSD_kernel__
+ /*
+ * In FreeBSD, we don't care about permissions of individual ADS.
+ * Note that not checking them is not just an optimization - without
+ * this shortcut, EA operations may bogusly fail with EACCES.
+ */
+ if (zp->z_pflags & ZFS_XATTR)
+ return (0);
+#else
+ /*
+ * If attribute then validate against base file
+ */
+ if (is_attr) {
+ uint64_t parent;
+
+ if ((error = sa_lookup(zp->z_sa_hdl,
+ SA_ZPL_PARENT(zp->z_zfsvfs), &parent,
+ sizeof (parent))) != 0)
+ return (error);
+
+ if ((error = zfs_zget(zp->z_zfsvfs,
+ parent, &xzp)) != 0) {
+ return (error);
+ }
+
+ check_zp = xzp;
+
+ /*
+ * fixup mode to map to xattr perms
+ */
+
+ if (mode & (ACE_WRITE_DATA|ACE_APPEND_DATA)) {
+ mode &= ~(ACE_WRITE_DATA|ACE_APPEND_DATA);
+ mode |= ACE_WRITE_NAMED_ATTRS;
+ }
+
+ if (mode & (ACE_READ_DATA|ACE_EXECUTE)) {
+ mode &= ~(ACE_READ_DATA|ACE_EXECUTE);
+ mode |= ACE_READ_NAMED_ATTRS;
+ }
+ }
+#endif
+
+ owner = zfs_fuid_map_id(zp->z_zfsvfs, zp->z_uid, cr, ZFS_OWNER);
+ /*
+ * Map the bits required to the standard vnode flags VREAD|VWRITE|VEXEC
+ * in needed_bits. Map the bits mapped by working_mode (currently
+ * missing) in missing_bits.
+ * Call secpolicy_vnode_access2() with (needed_bits & ~checkmode),
+ * needed_bits.
+ */
+ needed_bits = 0;
+
+ working_mode = mode;
+ if ((working_mode & (ACE_READ_ACL|ACE_READ_ATTRIBUTES)) &&
+ owner == crgetuid(cr))
+ working_mode &= ~(ACE_READ_ACL|ACE_READ_ATTRIBUTES);
+
+ if (working_mode & (ACE_READ_DATA|ACE_READ_NAMED_ATTRS|
+ ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_SYNCHRONIZE))
+ needed_bits |= VREAD;
+ if (working_mode & (ACE_WRITE_DATA|ACE_WRITE_NAMED_ATTRS|
+ ACE_APPEND_DATA|ACE_WRITE_ATTRIBUTES|ACE_SYNCHRONIZE))
+ needed_bits |= VWRITE;
+ if (working_mode & ACE_EXECUTE)
+ needed_bits |= VEXEC;
+
+ if ((error = zfs_zaccess_common(check_zp, mode, &working_mode,
+ &check_privs, skipaclchk, cr)) == 0) {
+ if (is_attr)
+ VN_RELE(ZTOV(xzp));
+ return (secpolicy_vnode_access2(cr, ZTOV(zp), owner,
+ needed_bits, needed_bits));
+ }
+
+ if (error && !check_privs) {
+ if (is_attr)
+ VN_RELE(ZTOV(xzp));
+ return (error);
+ }
+
+ if (error && (flags & V_APPEND)) {
+ error = zfs_zaccess_append(zp, &working_mode, &check_privs, cr);
+ }
+
+ if (error && check_privs) {
+ mode_t checkmode = 0;
+ vnode_t *check_vp = ZTOV(check_zp);
+
+ /*
+ * First check for implicit owner permission on
+ * read_acl/read_attributes
+ */
+
+ error = 0;
+ ASSERT(working_mode != 0);
+
+ if ((working_mode & (ACE_READ_ACL|ACE_READ_ATTRIBUTES) &&
+ owner == crgetuid(cr)))
+ working_mode &= ~(ACE_READ_ACL|ACE_READ_ATTRIBUTES);
+
+ if (working_mode & (ACE_READ_DATA|ACE_READ_NAMED_ATTRS|
+ ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_SYNCHRONIZE))
+ checkmode |= VREAD;
+ if (working_mode & (ACE_WRITE_DATA|ACE_WRITE_NAMED_ATTRS|
+ ACE_APPEND_DATA|ACE_WRITE_ATTRIBUTES|ACE_SYNCHRONIZE))
+ checkmode |= VWRITE;
+ if (working_mode & ACE_EXECUTE)
+ checkmode |= VEXEC;
+
+ error = secpolicy_vnode_access2(cr, check_vp, owner,
+ needed_bits & ~checkmode, needed_bits);
+
+ if (error == 0 && (working_mode & ACE_WRITE_OWNER))
+ error = secpolicy_vnode_chown(check_vp, cr, owner);
+ if (error == 0 && (working_mode & ACE_WRITE_ACL))
+ error = secpolicy_vnode_setdac(check_vp, cr, owner);
+
+ if (error == 0 && (working_mode &
+ (ACE_DELETE|ACE_DELETE_CHILD)))
+ error = secpolicy_vnode_remove(check_vp, cr);
+
+ if (error == 0 && (working_mode & ACE_SYNCHRONIZE)) {
+ error = secpolicy_vnode_chown(check_vp, cr, owner);
+ }
+ if (error == 0) {
+ /*
+ * See if any bits other than those already checked
+ * for are still present. If so then return EACCES
+ */
+ if (working_mode & ~(ZFS_CHECKED_MASKS)) {
+ error = SET_ERROR(EACCES);
+ }
+ }
+ } else if (error == 0) {
+ error = secpolicy_vnode_access2(cr, ZTOV(zp), owner,
+ needed_bits, needed_bits);
+ }
+
+ if (is_attr)
+ VN_RELE(ZTOV(xzp));
+
+ return (error);
+}
+
+/*
+ * Translate traditional unix VREAD/VWRITE/VEXEC mode into
+ * NFSv4-style ZFS ACL format and call zfs_zaccess()
+ */
+int
+zfs_zaccess_rwx(znode_t *zp, mode_t mode, int flags, cred_t *cr)
+{
+ return (zfs_zaccess(zp, zfs_unix_to_v4(mode >> 6), flags, B_FALSE, cr));
+}
+
+/*
+ * Access function for secpolicy_vnode_setattr
+ */
+int
+zfs_zaccess_unix(znode_t *zp, mode_t mode, cred_t *cr)
+{
+ int v4_mode = zfs_unix_to_v4(mode >> 6);
+
+ return (zfs_zaccess(zp, v4_mode, 0, B_FALSE, cr));
+}
+
+static int
+zfs_delete_final_check(znode_t *zp, znode_t *dzp,
+ mode_t available_perms, cred_t *cr)
+{
+ int error;
+ uid_t downer;
+
+ downer = zfs_fuid_map_id(dzp->z_zfsvfs, dzp->z_uid, cr, ZFS_OWNER);
+
+ error = secpolicy_vnode_access2(cr, ZTOV(dzp),
+ downer, available_perms, VWRITE|VEXEC);
+
+ if (error == 0)
+ error = zfs_sticky_remove_access(dzp, zp, cr);
+
+ return (error);
+}
+
+/*
+ * Determine whether Access should be granted/denied, without
+ * consulting the least priv subsystem.
+ *
+ * The following chart is the recommended NFSv4 enforcement for
+ * ability to delete an object.
+ *
+ * -------------------------------------------------------
+ * | Parent Dir | Target Object Permissions |
+ * | permissions | |
+ * -------------------------------------------------------
+ * | | ACL Allows | ACL Denies| Delete |
+ * | | Delete | Delete | unspecified|
+ * -------------------------------------------------------
+ * | ACL Allows | Permit | Permit | Permit |
+ * | DELETE_CHILD | |
+ * -------------------------------------------------------
+ * | ACL Denies | Permit | Deny | Deny |
+ * | DELETE_CHILD | | | |
+ * -------------------------------------------------------
+ * | ACL specifies | | | |
+ * | only allow | Permit | Permit | Permit |
+ * | write and | | | |
+ * | execute | | | |
+ * -------------------------------------------------------
+ * | ACL denies | | | |
+ * | write and | Permit | Deny | Deny |
+ * | execute | | | |
+ * -------------------------------------------------------
+ * ^
+ * |
+ * No search privilege, can't even look up file?
+ *
+ */
+int
+zfs_zaccess_delete(znode_t *dzp, znode_t *zp, cred_t *cr)
+{
+ uint32_t dzp_working_mode = 0;
+ uint32_t zp_working_mode = 0;
+ int dzp_error, zp_error;
+ mode_t available_perms;
+ boolean_t dzpcheck_privs = B_TRUE;
+ boolean_t zpcheck_privs = B_TRUE;
+
+ /*
+ * We want specific DELETE permissions to
+ * take precedence over WRITE/EXECUTE. We don't
+ * want an ACL such as this to mess us up.
+ * user:joe:write_data:deny,user:joe:delete:allow
+ *
+ * However, deny permissions may ultimately be overridden
+ * by secpolicy_vnode_access().
+ *
+ * We will ask for all of the necessary permissions and then
+ * look at the working modes from the directory and target object
+ * to determine what was found.
+ */
+
+ if (zp->z_pflags & (ZFS_IMMUTABLE | ZFS_NOUNLINK))
+ return (SET_ERROR(EPERM));
+
+ /*
+ * First row
+ * If the directory permissions allow the delete, we are done.
+ */
+ if ((dzp_error = zfs_zaccess_common(dzp, ACE_DELETE_CHILD,
+ &dzp_working_mode, &dzpcheck_privs, B_FALSE, cr)) == 0)
+ return (0);
+
+ /*
+ * If target object has delete permission then we are done
+ */
+ if ((zp_error = zfs_zaccess_common(zp, ACE_DELETE, &zp_working_mode,
+ &zpcheck_privs, B_FALSE, cr)) == 0)
+ return (0);
+
+ ASSERT(dzp_error && zp_error);
+
+ if (!dzpcheck_privs)
+ return (dzp_error);
+ if (!zpcheck_privs)
+ return (zp_error);
+
+ /*
+ * Second row
+ *
+ * If directory returns EACCES then delete_child was denied
+ * due to deny delete_child. In this case send the request through
+ * secpolicy_vnode_remove(). We don't use zfs_delete_final_check()
+ * since that *could* allow the delete based on write/execute permission
+ * and we want delete permissions to override write/execute.
+ */
+
+ if (dzp_error == EACCES) {
+ /* XXXPJD: s/dzp/zp/ ? */
+ return (secpolicy_vnode_remove(ZTOV(dzp), cr));
+ }
+ /*
+ * Third Row
+ * only need to see if we have write/execute on directory.
+ */
+
+ dzp_error = zfs_zaccess_common(dzp, ACE_EXECUTE|ACE_WRITE_DATA,
+ &dzp_working_mode, &dzpcheck_privs, B_FALSE, cr);
+
+ if (dzp_error != 0 && !dzpcheck_privs)
+ return (dzp_error);
+
+ /*
+ * Fourth row
+ */
+
+ available_perms = (dzp_working_mode & ACE_WRITE_DATA) ? 0 : VWRITE;
+ available_perms |= (dzp_working_mode & ACE_EXECUTE) ? 0 : VEXEC;
+
+ return (zfs_delete_final_check(zp, dzp, available_perms, cr));
+}
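
The chart collapses into a short decision. The sketch below models it with plain booleans; the names are hypothetical, and the later privilege overrides (secpolicy_vnode_remove() and friends) are deliberately ignored.

/* Hypothetical boolean model of the delete chart; privilege checks omitted. */
#include <stdbool.h>

static bool
may_delete(bool dir_allows_delete_child, bool dir_denies_delete_child,
    bool obj_allows_delete, bool dir_allows_write_exec)
{
	/* First row, plus any explicit DELETE grant on the target. */
	if (dir_allows_delete_child || obj_allows_delete)
		return (true);
	/* Second row: an explicit deny of delete_child wins. */
	if (dir_denies_delete_child)
		return (false);
	/* Third and fourth rows: fall back to write+execute on the parent. */
	return (dir_allows_write_exec);
}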
+
+int
+zfs_zaccess_rename(znode_t *sdzp, znode_t *szp, znode_t *tdzp,
+ znode_t *tzp, cred_t *cr)
+{
+ int add_perm;
+ int error;
+
+ if (szp->z_pflags & ZFS_AV_QUARANTINED)
+ return (SET_ERROR(EACCES));
+
+ add_perm = (ZTOV(szp)->v_type == VDIR) ?
+ ACE_ADD_SUBDIRECTORY : ACE_ADD_FILE;
+
+ /*
+ * Rename permissions are a combination of delete permission +
+ * add file/subdir permission.
+ *
+ * BSD operating systems also require write permission
+ * on the directory being moved from one parent directory
+ * to another.
+ */
+ if (ZTOV(szp)->v_type == VDIR && ZTOV(sdzp) != ZTOV(tdzp)) {
+ if ((error = zfs_zaccess(szp, ACE_WRITE_DATA, 0, B_FALSE, cr)))
+ return (error);
+ }
+
+ /*
+ * first make sure we do the delete portion.
+ *
+ * If that succeeds then check for add_file/add_subdir permissions
+ */
+
+ if ((error = zfs_zaccess_delete(sdzp, szp, cr)))
+ return (error);
+
+ /*
+ * If we have a tzp, see if we can delete it?
+ */
+ if (tzp && (error = zfs_zaccess_delete(tdzp, tzp, cr)))
+ return (error);
+
+ /*
+ * Now check for add permissions
+ */
+ error = zfs_zaccess(tdzp, add_perm, 0, B_FALSE, cr);
+
+ return (error);
+}
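
Putting the pieces together, a rename is the conjunction of the checks performed above. A hypothetical boolean summary (the extra FreeBSD write check on a directory that changes parents is left out for brevity):

/* Hypothetical composition of the rename checks; names are illustrative. */
#include <stdbool.h>

static bool
may_rename(bool can_delete_source, bool target_exists,
    bool can_delete_target, bool can_add_to_target_dir)
{
	if (!can_delete_source)
		return (false);
	if (target_exists && !can_delete_target)
		return (false);
	return (can_add_to_target_dir);
}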
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ctldir.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ctldir.c
new file mode 100644
index 000000000000..f472aecdbafb
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ctldir.c
@@ -0,0 +1,1360 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+ * Copyright 2015, OmniTI Computer Consulting, Inc. All rights reserved.
+ */
+
+/*
+ * ZFS control directory (a.k.a. ".zfs")
+ *
+ * This directory provides a common location for all ZFS meta-objects.
+ * Currently, this is only the 'snapshot' directory, but this may expand in the
+ * future. The elements are built using the GFS primitives, as the hierarchy
+ * does not actually exist on disk.
+ *
+ * For 'snapshot', we don't want to have all snapshots always mounted, because
+ * this would take up a huge amount of space in /etc/mnttab. We have three
+ * types of objects:
+ *
+ * ctldir ------> snapshotdir -------> snapshot
+ * |
+ * |
+ * V
+ * mounted fs
+ *
+ * The 'snapshot' node contains just enough information to lookup '..' and act
+ * as a mountpoint for the snapshot. Whenever we lookup a specific snapshot, we
+ * perform an automount of the underlying filesystem and return the
+ * corresponding vnode.
+ *
+ * All mounts are handled automatically by the kernel, but unmounts are
+ * (currently) handled from user land. The main reason is that there is no
+ * reliable way to auto-unmount the filesystem when it's "no longer in use".
+ * When the user unmounts a filesystem, we call zfsctl_unmount(), which
+ * unmounts any snapshots within the snapshot directory.
+ *
+ * The '.zfs', '.zfs/snapshot', and all directories created under
+ * '.zfs/snapshot' (ie: '.zfs/snapshot/<snapname>') are all GFS nodes and
+ * share the same vfs_t as the head filesystem (what '.zfs' lives under).
+ *
+ * File systems mounted on top of the GFS nodes '.zfs/snapshot/<snapname>'
+ * (ie: snapshots) are ZFS nodes and have their own unique vfs_t.
+ * However, vnodes within these mounted-on file systems have their v_vfsp
+ * fields set to the head filesystem to make NFS happy (see
+ * zfsctl_snapdir_lookup()). We VFS_HOLD the head filesystem's vfs_t
+ * so that it cannot be freed until all snapshots have been unmounted.
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/libkern.h>
+#include <sys/dirent.h>
+#include <sys/zfs_context.h>
+#include <sys/zfs_ctldir.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zfs_vfsops.h>
+#include <sys/namei.h>
+#include <sys/stat.h>
+#include <sys/dmu.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_destroy.h>
+#include <sys/dsl_deleg.h>
+#include <sys/mount.h>
+#include <sys/zap.h>
+#include <sys/sysproto.h>
+
+#include "zfs_namecheck.h"
+
+#include <sys/kernel.h>
+#include <sys/ccompat.h>
+
+/* Common access mode for all virtual directories under the ctldir */
+const uint16_t zfsctl_ctldir_mode = S_IRUSR | S_IXUSR | S_IRGRP | S_IXGRP |
+ S_IROTH | S_IXOTH;
+
+/*
+ * "Synthetic" filesystem implementation.
+ */
+
+/*
+ * Assert that A implies B.
+ */
+#define KASSERT_IMPLY(A, B, msg) KASSERT(!(A) || (B), (msg));
+
+static MALLOC_DEFINE(M_SFSNODES, "sfs_nodes", "synthetic-fs nodes");
+
+typedef struct sfs_node {
+ char sn_name[ZFS_MAX_DATASET_NAME_LEN];
+ uint64_t sn_parent_id;
+ uint64_t sn_id;
+} sfs_node_t;
+
+/*
+ * Check the parent's ID as well as the node's to account for a chance
+ * that IDs originating from different domains (snapshot IDs, artificial
+ * IDs, znode IDs) may clash.
+ */
+static int
+sfs_compare_ids(struct vnode *vp, void *arg)
+{
+ sfs_node_t *n1 = vp->v_data;
+ sfs_node_t *n2 = arg;
+ bool equal;
+
+ equal = n1->sn_id == n2->sn_id &&
+ n1->sn_parent_id == n2->sn_parent_id;
+
+ /* Zero means equality. */
+ return (!equal);
+}
+
+static int
+sfs_vnode_get(const struct mount *mp, int flags, uint64_t parent_id,
+ uint64_t id, struct vnode **vpp)
+{
+ sfs_node_t search;
+ int err;
+
+ search.sn_id = id;
+ search.sn_parent_id = parent_id;
+ err = vfs_hash_get(mp, (uint32_t)id, flags, curthread, vpp,
+ sfs_compare_ids, &search);
+ return (err);
+}
+
+static int
+sfs_vnode_insert(struct vnode *vp, int flags, uint64_t parent_id,
+ uint64_t id, struct vnode **vpp)
+{
+ int err;
+
+ KASSERT(vp->v_data != NULL, ("sfs_vnode_insert with NULL v_data"));
+ err = vfs_hash_insert(vp, (uint32_t)id, flags, curthread, vpp,
+ sfs_compare_ids, vp->v_data);
+ return (err);
+}
+
+static void
+sfs_vnode_remove(struct vnode *vp)
+{
+ vfs_hash_remove(vp);
+}
+
+typedef void sfs_vnode_setup_fn(vnode_t *vp, void *arg);
+
+static int
+sfs_vgetx(struct mount *mp, int flags, uint64_t parent_id, uint64_t id,
+ const char *tag, struct vop_vector *vops,
+ sfs_vnode_setup_fn setup, void *arg,
+ struct vnode **vpp)
+{
+ struct vnode *vp;
+ int error;
+
+ error = sfs_vnode_get(mp, flags, parent_id, id, vpp);
+ if (error != 0 || *vpp != NULL) {
+ KASSERT_IMPLY(error == 0, (*vpp)->v_data != NULL,
+ "sfs vnode with no data");
+ return (error);
+ }
+
+ /* Allocate a new vnode/inode. */
+ error = getnewvnode(tag, mp, vops, &vp);
+ if (error != 0) {
+ *vpp = NULL;
+ return (error);
+ }
+
+ /*
+ * Exclusively lock the vnode while it's being constructed.
+ */
+ lockmgr(vp->v_vnlock, LK_EXCLUSIVE, NULL);
+ error = insmntque(vp, mp);
+ if (error != 0) {
+ *vpp = NULL;
+ return (error);
+ }
+
+ setup(vp, arg);
+
+ error = sfs_vnode_insert(vp, flags, parent_id, id, vpp);
+ if (error != 0 || *vpp != NULL) {
+ KASSERT_IMPLY(error == 0, (*vpp)->v_data != NULL,
+ "sfs vnode with no data");
+ return (error);
+ }
+
+ *vpp = vp;
+ return (0);
+}
+
+static void
+sfs_print_node(sfs_node_t *node)
+{
+ printf("\tname = %s\n", node->sn_name);
+ printf("\tparent_id = %ju\n", (uintmax_t)node->sn_parent_id);
+ printf("\tid = %ju\n", (uintmax_t)node->sn_id);
+}
+
+static sfs_node_t *
+sfs_alloc_node(size_t size, const char *name, uint64_t parent_id, uint64_t id)
+{
+ struct sfs_node *node;
+
+ KASSERT(strlen(name) < sizeof (node->sn_name),
+ ("sfs node name is too long"));
+ KASSERT(size >= sizeof (*node), ("sfs node size is too small"));
+ node = malloc(size, M_SFSNODES, M_WAITOK | M_ZERO);
+ strlcpy(node->sn_name, name, sizeof (node->sn_name));
+ node->sn_parent_id = parent_id;
+ node->sn_id = id;
+
+ return (node);
+}
+
+static void
+sfs_destroy_node(sfs_node_t *node)
+{
+ free(node, M_SFSNODES);
+}
+
+static void *
+sfs_reclaim_vnode(vnode_t *vp)
+{
+ void *data;
+
+ sfs_vnode_remove(vp);
+ data = vp->v_data;
+ vp->v_data = NULL;
+ return (data);
+}
+
+static int
+sfs_readdir_common(uint64_t parent_id, uint64_t id, struct vop_readdir_args *ap,
+ zfs_uio_t *uio, off_t *offp)
+{
+ struct dirent entry;
+ int error;
+
+ /* Reset ncookies for subsequent use of vfs_read_dirent. */
+ if (ap->a_ncookies != NULL)
+ *ap->a_ncookies = 0;
+
+ if (zfs_uio_resid(uio) < sizeof (entry))
+ return (SET_ERROR(EINVAL));
+
+ if (zfs_uio_offset(uio) < 0)
+ return (SET_ERROR(EINVAL));
+ if (zfs_uio_offset(uio) == 0) {
+ entry.d_fileno = id;
+ entry.d_type = DT_DIR;
+ entry.d_name[0] = '.';
+ entry.d_name[1] = '\0';
+ entry.d_namlen = 1;
+ entry.d_reclen = sizeof (entry);
+ error = vfs_read_dirent(ap, &entry, zfs_uio_offset(uio));
+ if (error != 0)
+ return (SET_ERROR(error));
+ }
+
+ if (zfs_uio_offset(uio) < sizeof (entry))
+ return (SET_ERROR(EINVAL));
+ if (zfs_uio_offset(uio) == sizeof (entry)) {
+ entry.d_fileno = parent_id;
+ entry.d_type = DT_DIR;
+ entry.d_name[0] = '.';
+ entry.d_name[1] = '.';
+ entry.d_name[2] = '\0';
+ entry.d_namlen = 2;
+ entry.d_reclen = sizeof (entry);
+ error = vfs_read_dirent(ap, &entry, zfs_uio_offset(uio));
+ if (error != 0)
+ return (SET_ERROR(error));
+ }
+
+ if (offp != NULL)
+ *offp = 2 * sizeof (entry);
+ return (0);
+}
+
+
+/*
+ * .zfs inode namespace
+ *
+ * We need to generate unique inode numbers for all files and directories
+ * within the .zfs pseudo-filesystem. We use the following scheme:
+ *
+ * ENTRY ZFSCTL_INODE
+ * .zfs 1
+ * .zfs/snapshot 2
+ * .zfs/snapshot/<snap> objectid(snap)
+ */
+#define ZFSCTL_INO_SNAP(id) (id)
+
+static struct vop_vector zfsctl_ops_root;
+static struct vop_vector zfsctl_ops_snapdir;
+static struct vop_vector zfsctl_ops_snapshot;
+
+void
+zfsctl_init(void)
+{
+}
+
+void
+zfsctl_fini(void)
+{
+}
+
+boolean_t
+zfsctl_is_node(vnode_t *vp)
+{
+ return (vn_matchops(vp, zfsctl_ops_root) ||
+ vn_matchops(vp, zfsctl_ops_snapdir) ||
+ vn_matchops(vp, zfsctl_ops_snapshot));
+}
+
+typedef struct zfsctl_root {
+ sfs_node_t node;
+ sfs_node_t *snapdir;
+ timestruc_t cmtime;
+} zfsctl_root_t;
+
+
+/*
+ * Create the '.zfs' directory.
+ */
+void
+zfsctl_create(zfsvfs_t *zfsvfs)
+{
+ zfsctl_root_t *dot_zfs;
+ sfs_node_t *snapdir;
+ vnode_t *rvp;
+ uint64_t crtime[2];
+
+ ASSERT(zfsvfs->z_ctldir == NULL);
+
+ snapdir = sfs_alloc_node(sizeof (*snapdir), "snapshot", ZFSCTL_INO_ROOT,
+ ZFSCTL_INO_SNAPDIR);
+ dot_zfs = (zfsctl_root_t *)sfs_alloc_node(sizeof (*dot_zfs), ".zfs", 0,
+ ZFSCTL_INO_ROOT);
+ dot_zfs->snapdir = snapdir;
+
+ VERIFY(VFS_ROOT(zfsvfs->z_vfs, LK_EXCLUSIVE, &rvp) == 0);
+ VERIFY(0 == sa_lookup(VTOZ(rvp)->z_sa_hdl, SA_ZPL_CRTIME(zfsvfs),
+ &crtime, sizeof (crtime)));
+ ZFS_TIME_DECODE(&dot_zfs->cmtime, crtime);
+ vput(rvp);
+
+ zfsvfs->z_ctldir = dot_zfs;
+}
+
+/*
+ * Destroy the '.zfs' directory. Only called when the filesystem is unmounted.
+ * The nodes must not have any associated vnodes by now as they should be
+ * vflush-ed.
+ */
+void
+zfsctl_destroy(zfsvfs_t *zfsvfs)
+{
+ sfs_destroy_node(zfsvfs->z_ctldir->snapdir);
+ sfs_destroy_node((sfs_node_t *)zfsvfs->z_ctldir);
+ zfsvfs->z_ctldir = NULL;
+}
+
+static int
+zfsctl_fs_root_vnode(struct mount *mp, void *arg __unused, int flags,
+ struct vnode **vpp)
+{
+ return (VFS_ROOT(mp, flags, vpp));
+}
+
+static void
+zfsctl_common_vnode_setup(vnode_t *vp, void *arg)
+{
+ ASSERT_VOP_ELOCKED(vp, __func__);
+
+ /* We support shared locking. */
+ VN_LOCK_ASHARE(vp);
+ vp->v_type = VDIR;
+ vp->v_data = arg;
+}
+
+static int
+zfsctl_root_vnode(struct mount *mp, void *arg __unused, int flags,
+ struct vnode **vpp)
+{
+ void *node;
+ int err;
+
+ node = ((zfsvfs_t *)mp->mnt_data)->z_ctldir;
+ err = sfs_vgetx(mp, flags, 0, ZFSCTL_INO_ROOT, "zfs", &zfsctl_ops_root,
+ zfsctl_common_vnode_setup, node, vpp);
+ return (err);
+}
+
+static int
+zfsctl_snapdir_vnode(struct mount *mp, void *arg __unused, int flags,
+ struct vnode **vpp)
+{
+ void *node;
+ int err;
+
+ node = ((zfsvfs_t *)mp->mnt_data)->z_ctldir->snapdir;
+ err = sfs_vgetx(mp, flags, ZFSCTL_INO_ROOT, ZFSCTL_INO_SNAPDIR, "zfs",
+ &zfsctl_ops_snapdir, zfsctl_common_vnode_setup, node, vpp);
+ return (err);
+}
+
+/*
+ * Given a root znode, retrieve the associated .zfs directory.
+ * Add a hold to the vnode and return it.
+ */
+int
+zfsctl_root(zfsvfs_t *zfsvfs, int flags, vnode_t **vpp)
+{
+ int error;
+
+ error = zfsctl_root_vnode(zfsvfs->z_vfs, NULL, flags, vpp);
+ return (error);
+}
+
+/*
+ * Common open routine. Disallow any write access.
+ */
+static int
+zfsctl_common_open(struct vop_open_args *ap)
+{
+ int flags = ap->a_mode;
+
+ if (flags & FWRITE)
+ return (SET_ERROR(EACCES));
+
+ return (0);
+}
+
+/*
+ * Common close routine. Nothing to do here.
+ */
+/* ARGSUSED */
+static int
+zfsctl_common_close(struct vop_close_args *ap)
+{
+ return (0);
+}
+
+/*
+ * Common access routine. Disallow writes.
+ */
+static int
+zfsctl_common_access(struct vop_access_args *ap)
+{
+ accmode_t accmode = ap->a_accmode;
+
+ if (accmode & VWRITE)
+ return (SET_ERROR(EACCES));
+ return (0);
+}
+
+/*
+ * Common getattr function. Fill in basic information.
+ */
+static void
+zfsctl_common_getattr(vnode_t *vp, vattr_t *vap)
+{
+ timestruc_t now;
+ sfs_node_t *node;
+
+ node = vp->v_data;
+
+ vap->va_uid = 0;
+ vap->va_gid = 0;
+ vap->va_rdev = 0;
+ /*
+ * We are a purely virtual object, so we have no
+ * blocksize or allocated blocks.
+ */
+ vap->va_blksize = 0;
+ vap->va_nblocks = 0;
+ vap->va_seq = 0;
+ vn_fsid(vp, vap);
+ vap->va_mode = zfsctl_ctldir_mode;
+ vap->va_type = VDIR;
+ /*
+ * We live in the now (for atime).
+ */
+ gethrestime(&now);
+ vap->va_atime = now;
+ /* FreeBSD: Reset chflags(2) flags. */
+ vap->va_flags = 0;
+
+ vap->va_nodeid = node->sn_id;
+
+ /* At least '.' and '..'. */
+ vap->va_nlink = 2;
+}
+
+#ifndef _OPENSOLARIS_SYS_VNODE_H_
+struct vop_fid_args {
+ struct vnode *a_vp;
+ struct fid *a_fid;
+};
+#endif
+
+static int
+zfsctl_common_fid(struct vop_fid_args *ap)
+{
+ vnode_t *vp = ap->a_vp;
+ fid_t *fidp = (void *)ap->a_fid;
+ sfs_node_t *node = vp->v_data;
+ uint64_t object = node->sn_id;
+ zfid_short_t *zfid;
+ int i;
+
+ zfid = (zfid_short_t *)fidp;
+ zfid->zf_len = SHORT_FID_LEN;
+
+ for (i = 0; i < sizeof (zfid->zf_object); i++)
+ zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
+
+ /* .zfs nodes always have a generation number of 0 */
+ for (i = 0; i < sizeof (zfid->zf_gen); i++)
+ zfid->zf_gen[i] = 0;
+
+ return (0);
+}
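
The object id is written into the fid one byte at a time, least-significant byte first. A standalone, illustrative encoder/decoder pair for that byte layout (hypothetical helpers, parameterized on the field width) might look like this:

/* Illustrative little-endian byte packing, as used for zf_object above. */
#include <stddef.h>
#include <stdint.h>

static void
pack_id(uint64_t object, uint8_t *out, size_t len)
{
	size_t i;

	for (i = 0; i < len; i++)
		out[i] = (uint8_t)(object >> (8 * i));
}

static uint64_t
unpack_id(const uint8_t *in, size_t len)
{
	uint64_t object = 0;
	size_t i;

	for (i = 0; i < len; i++)
		object |= (uint64_t)in[i] << (8 * i);
	return (object);
}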
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_reclaim_args {
+ struct vnode *a_vp;
+ struct thread *a_td;
+};
+#endif
+
+static int
+zfsctl_common_reclaim(struct vop_reclaim_args *ap)
+{
+ vnode_t *vp = ap->a_vp;
+
+ (void) sfs_reclaim_vnode(vp);
+ return (0);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_print_args {
+ struct vnode *a_vp;
+};
+#endif
+
+static int
+zfsctl_common_print(struct vop_print_args *ap)
+{
+ sfs_print_node(ap->a_vp->v_data);
+ return (0);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_getattr_args {
+ struct vnode *a_vp;
+ struct vattr *a_vap;
+ struct ucred *a_cred;
+};
+#endif
+
+/*
+ * Get root directory attributes.
+ */
+static int
+zfsctl_root_getattr(struct vop_getattr_args *ap)
+{
+ struct vnode *vp = ap->a_vp;
+ struct vattr *vap = ap->a_vap;
+ zfsctl_root_t *node = vp->v_data;
+
+ zfsctl_common_getattr(vp, vap);
+ vap->va_ctime = node->cmtime;
+ vap->va_mtime = vap->va_ctime;
+ vap->va_birthtime = vap->va_ctime;
+ vap->va_nlink += 1; /* snapdir */
+ vap->va_size = vap->va_nlink;
+ return (0);
+}
+
+/*
+ * When we lookup "." we still can be asked to lock it
+ * differently, can't we?
+ */
+static int
+zfsctl_relock_dot(vnode_t *dvp, int ltype)
+{
+ vref(dvp);
+ if (ltype != VOP_ISLOCKED(dvp)) {
+ if (ltype == LK_EXCLUSIVE)
+ vn_lock(dvp, LK_UPGRADE | LK_RETRY);
+ else /* if (ltype == LK_SHARED) */
+ vn_lock(dvp, LK_DOWNGRADE | LK_RETRY);
+
+ /* Relocking for the "." case may have left us with a reclaimed vnode. */
+ if (VN_IS_DOOMED(dvp)) {
+ vrele(dvp);
+ return (SET_ERROR(ENOENT));
+ }
+ }
+ return (0);
+}
+
+/*
+ * Special case the handling of "..".
+ */
+static int
+zfsctl_root_lookup(struct vop_lookup_args *ap)
+{
+ struct componentname *cnp = ap->a_cnp;
+ vnode_t *dvp = ap->a_dvp;
+ vnode_t **vpp = ap->a_vpp;
+ int flags = ap->a_cnp->cn_flags;
+ int lkflags = ap->a_cnp->cn_lkflags;
+ int nameiop = ap->a_cnp->cn_nameiop;
+ int err;
+
+ ASSERT(dvp->v_type == VDIR);
+
+ if ((flags & ISLASTCN) != 0 && nameiop != LOOKUP)
+ return (SET_ERROR(ENOTSUP));
+
+ if (cnp->cn_namelen == 1 && *cnp->cn_nameptr == '.') {
+ err = zfsctl_relock_dot(dvp, lkflags & LK_TYPE_MASK);
+ if (err == 0)
+ *vpp = dvp;
+ } else if ((flags & ISDOTDOT) != 0) {
+ err = vn_vget_ino_gen(dvp, zfsctl_fs_root_vnode, NULL,
+ lkflags, vpp);
+ } else if (strncmp(cnp->cn_nameptr, "snapshot", cnp->cn_namelen) == 0) {
+ err = zfsctl_snapdir_vnode(dvp->v_mount, NULL, lkflags, vpp);
+ } else {
+ err = SET_ERROR(ENOENT);
+ }
+ if (err != 0)
+ *vpp = NULL;
+ return (err);
+}
+
+static int
+zfsctl_root_readdir(struct vop_readdir_args *ap)
+{
+ struct dirent entry;
+ vnode_t *vp = ap->a_vp;
+ zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
+ zfsctl_root_t *node = vp->v_data;
+ zfs_uio_t uio;
+ int *eofp = ap->a_eofflag;
+ off_t dots_offset;
+ int error;
+
+ zfs_uio_init(&uio, ap->a_uio);
+
+ ASSERT(vp->v_type == VDIR);
+
+ error = sfs_readdir_common(zfsvfs->z_root, ZFSCTL_INO_ROOT, ap, &uio,
+ &dots_offset);
+ if (error != 0) {
+ if (error == ENAMETOOLONG) /* ran out of destination space */
+ error = 0;
+ return (error);
+ }
+ if (zfs_uio_offset(&uio) != dots_offset)
+ return (SET_ERROR(EINVAL));
+
+ CTASSERT(sizeof (node->snapdir->sn_name) <= sizeof (entry.d_name));
+ entry.d_fileno = node->snapdir->sn_id;
+ entry.d_type = DT_DIR;
+ strcpy(entry.d_name, node->snapdir->sn_name);
+ entry.d_namlen = strlen(entry.d_name);
+ entry.d_reclen = sizeof (entry);
+ error = vfs_read_dirent(ap, &entry, zfs_uio_offset(&uio));
+ if (error != 0) {
+ if (error == ENAMETOOLONG)
+ error = 0;
+ return (SET_ERROR(error));
+ }
+ if (eofp != NULL)
+ *eofp = 1;
+ return (0);
+}
+
+static int
+zfsctl_root_vptocnp(struct vop_vptocnp_args *ap)
+{
+ static const char dotzfs_name[4] = ".zfs";
+ vnode_t *dvp;
+ int error;
+
+ if (*ap->a_buflen < sizeof (dotzfs_name))
+ return (SET_ERROR(ENOMEM));
+
+ error = vn_vget_ino_gen(ap->a_vp, zfsctl_fs_root_vnode, NULL,
+ LK_SHARED, &dvp);
+ if (error != 0)
+ return (SET_ERROR(error));
+
+ VOP_UNLOCK1(dvp);
+ *ap->a_vpp = dvp;
+ *ap->a_buflen -= sizeof (dotzfs_name);
+ bcopy(dotzfs_name, ap->a_buf + *ap->a_buflen, sizeof (dotzfs_name));
+ return (0);
+}
+
+static int
+zfsctl_common_pathconf(struct vop_pathconf_args *ap)
+{
+ /*
+ * We care about ACL variables so that user land utilities like ls
+ * can display them correctly. Since the ctldir's st_dev is set to be
+ * the same as the parent dataset, we must support all variables that
+ * it supports.
+ */
+ switch (ap->a_name) {
+ case _PC_LINK_MAX:
+ *ap->a_retval = MIN(LONG_MAX, ZFS_LINK_MAX);
+ return (0);
+
+ case _PC_FILESIZEBITS:
+ *ap->a_retval = 64;
+ return (0);
+
+ case _PC_MIN_HOLE_SIZE:
+ *ap->a_retval = (int)SPA_MINBLOCKSIZE;
+ return (0);
+
+ case _PC_ACL_EXTENDED:
+ *ap->a_retval = 0;
+ return (0);
+
+ case _PC_ACL_NFS4:
+ *ap->a_retval = 1;
+ return (0);
+
+ case _PC_ACL_PATH_MAX:
+ *ap->a_retval = ACL_MAX_ENTRIES;
+ return (0);
+
+ case _PC_NAME_MAX:
+ *ap->a_retval = NAME_MAX;
+ return (0);
+
+ default:
+ return (vop_stdpathconf(ap));
+ }
+}
+
+/*
+ * Returns a trivial ACL
+ */
+static int
+zfsctl_common_getacl(struct vop_getacl_args *ap)
+{
+ int i;
+
+ if (ap->a_type != ACL_TYPE_NFS4)
+ return (EINVAL);
+
+ acl_nfs4_sync_acl_from_mode(ap->a_aclp, zfsctl_ctldir_mode, 0);
+ /*
+ * acl_nfs4_sync_acl_from_mode assumes that the owner can always modify
+ * attributes. That is not the case for the ctldir, so we must clear
+ * those bits. We also must clear ACL_READ_NAMED_ATTRS, because xattrs
+ * aren't supported by the ctldir.
+ */
+ for (i = 0; i < ap->a_aclp->acl_cnt; i++) {
+ struct acl_entry *entry;
+ entry = &(ap->a_aclp->acl_entry[i]);
+ entry->ae_perm &= ~(ACL_WRITE_ACL | ACL_WRITE_OWNER |
+ ACL_WRITE_ATTRIBUTES | ACL_WRITE_NAMED_ATTRS |
+ ACL_READ_NAMED_ATTRS);
+ }
+
+ return (0);
+}
+
+static struct vop_vector zfsctl_ops_root = {
+ .vop_default = &default_vnodeops,
+#if __FreeBSD_version >= 1300121
+ .vop_fplookup_vexec = VOP_EAGAIN,
+#endif
+ .vop_open = zfsctl_common_open,
+ .vop_close = zfsctl_common_close,
+ .vop_ioctl = VOP_EINVAL,
+ .vop_getattr = zfsctl_root_getattr,
+ .vop_access = zfsctl_common_access,
+ .vop_readdir = zfsctl_root_readdir,
+ .vop_lookup = zfsctl_root_lookup,
+ .vop_inactive = VOP_NULL,
+ .vop_reclaim = zfsctl_common_reclaim,
+ .vop_fid = zfsctl_common_fid,
+ .vop_print = zfsctl_common_print,
+ .vop_vptocnp = zfsctl_root_vptocnp,
+ .vop_pathconf = zfsctl_common_pathconf,
+ .vop_getacl = zfsctl_common_getacl,
+};
+VFS_VOP_VECTOR_REGISTER(zfsctl_ops_root);
+
+static int
+zfsctl_snapshot_zname(vnode_t *vp, const char *name, int len, char *zname)
+{
+ objset_t *os = ((zfsvfs_t *)((vp)->v_vfsp->vfs_data))->z_os;
+
+ dmu_objset_name(os, zname);
+ if (strlen(zname) + 1 + strlen(name) >= len)
+ return (SET_ERROR(ENAMETOOLONG));
+ (void) strcat(zname, "@");
+ (void) strcat(zname, name);
+ return (0);
+}
+
+static int
+zfsctl_snapshot_lookup(vnode_t *vp, const char *name, uint64_t *id)
+{
+ objset_t *os = ((zfsvfs_t *)((vp)->v_vfsp->vfs_data))->z_os;
+ int err;
+
+ err = dsl_dataset_snap_lookup(dmu_objset_ds(os), name, id);
+ return (err);
+}
+
+/*
+ * Given a vnode, get the root vnode of a filesystem mounted on top of
+ * it, if any. The root vnode is referenced and locked.
+ * If no filesystem is mounted then the original vnode remains referenced
+ * and locked. If any error happens the original vnode is unlocked and
+ * released.
+ */
+static int
+zfsctl_mounted_here(vnode_t **vpp, int flags)
+{
+ struct mount *mp;
+ int err;
+
+ ASSERT_VOP_LOCKED(*vpp, __func__);
+ ASSERT3S((*vpp)->v_type, ==, VDIR);
+
+ if ((mp = (*vpp)->v_mountedhere) != NULL) {
+ err = vfs_busy(mp, 0);
+ KASSERT(err == 0, ("vfs_busy(mp, 0) failed with %d", err));
+ KASSERT(vrefcnt(*vpp) > 1, ("unreferenced mountpoint"));
+ vput(*vpp);
+ err = VFS_ROOT(mp, flags, vpp);
+ vfs_unbusy(mp);
+ return (err);
+ }
+ return (EJUSTRETURN);
+}
+
+typedef struct {
+ const char *snap_name;
+ uint64_t snap_id;
+} snapshot_setup_arg_t;
+
+static void
+zfsctl_snapshot_vnode_setup(vnode_t *vp, void *arg)
+{
+ snapshot_setup_arg_t *ssa = arg;
+ sfs_node_t *node;
+
+ ASSERT_VOP_ELOCKED(vp, __func__);
+
+ node = sfs_alloc_node(sizeof (sfs_node_t),
+ ssa->snap_name, ZFSCTL_INO_SNAPDIR, ssa->snap_id);
+ zfsctl_common_vnode_setup(vp, node);
+
+ /* We have to support recursive locking. */
+ VN_LOCK_AREC(vp);
+}
+
+/*
+ * Lookup entry point for the 'snapshot' directory. Try to open the
+ * snapshot if it exists, creating the pseudo filesystem vnode as necessary.
+ * Perform a mount of the associated dataset on top of the vnode.
+ * There are four possibilities:
+ * - the snapshot node and vnode do not exist
+ * - the snapshot vnode is covered by the mounted snapshot
+ * - the snapshot vnode is not covered yet, the mount operation is in progress
+ * - the snapshot vnode is not covered, because the snapshot has been unmounted
+ * The last two states are transient and should be relatively short-lived.
+ */
+static int
+zfsctl_snapdir_lookup(struct vop_lookup_args *ap)
+{
+ vnode_t *dvp = ap->a_dvp;
+ vnode_t **vpp = ap->a_vpp;
+ struct componentname *cnp = ap->a_cnp;
+ char name[NAME_MAX + 1];
+ char fullname[ZFS_MAX_DATASET_NAME_LEN];
+ char *mountpoint;
+ size_t mountpoint_len;
+ zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data;
+ uint64_t snap_id;
+ int nameiop = cnp->cn_nameiop;
+ int lkflags = cnp->cn_lkflags;
+ int flags = cnp->cn_flags;
+ int err;
+
+ ASSERT(dvp->v_type == VDIR);
+
+ if ((flags & ISLASTCN) != 0 && nameiop != LOOKUP)
+ return (SET_ERROR(ENOTSUP));
+
+ if (cnp->cn_namelen == 1 && *cnp->cn_nameptr == '.') {
+ err = zfsctl_relock_dot(dvp, lkflags & LK_TYPE_MASK);
+ if (err == 0)
+ *vpp = dvp;
+ return (err);
+ }
+ if (flags & ISDOTDOT) {
+ err = vn_vget_ino_gen(dvp, zfsctl_root_vnode, NULL, lkflags,
+ vpp);
+ return (err);
+ }
+
+ if (cnp->cn_namelen >= sizeof (name))
+ return (SET_ERROR(ENAMETOOLONG));
+
+ strlcpy(name, ap->a_cnp->cn_nameptr, ap->a_cnp->cn_namelen + 1);
+ err = zfsctl_snapshot_lookup(dvp, name, &snap_id);
+ if (err != 0)
+ return (SET_ERROR(ENOENT));
+
+ for (;;) {
+ snapshot_setup_arg_t ssa;
+
+ ssa.snap_name = name;
+ ssa.snap_id = snap_id;
+ err = sfs_vgetx(dvp->v_mount, LK_SHARED, ZFSCTL_INO_SNAPDIR,
+ snap_id, "zfs", &zfsctl_ops_snapshot,
+ zfsctl_snapshot_vnode_setup, &ssa, vpp);
+ if (err != 0)
+ return (err);
+
+ /* Check if a new vnode has just been created. */
+ if (VOP_ISLOCKED(*vpp) == LK_EXCLUSIVE)
+ break;
+
+ /*
+ * Check if a snapshot is already mounted on top of the vnode.
+ */
+ err = zfsctl_mounted_here(vpp, lkflags);
+ if (err != EJUSTRETURN)
+ return (err);
+
+ /*
+ * If the vnode is not covered, then either the mount operation
+ * is in progress or the snapshot has already been unmounted
+ * but the vnode hasn't been inactivated and reclaimed yet.
+ * We can try to re-use the vnode in the latter case.
+ */
+ VI_LOCK(*vpp);
+ if (((*vpp)->v_iflag & VI_MOUNT) == 0) {
+ /*
+ * Upgrade to exclusive lock in order to:
+ * - avoid race conditions
+ * - satisfy the contract of mount_snapshot()
+ */
+ err = VOP_LOCK(*vpp, LK_TRYUPGRADE | LK_INTERLOCK);
+ if (err == 0)
+ break;
+ } else {
+ VI_UNLOCK(*vpp);
+ }
+
+ /*
+ * In this state we can loop on uncontested locks and starve
+ * the thread doing the lengthy, non-trivial mount operation.
+ * So, yield to prevent that from happening.
+ */
+ vput(*vpp);
+ kern_yield(PRI_USER);
+ }
+
+ VERIFY0(zfsctl_snapshot_zname(dvp, name, sizeof (fullname), fullname));
+
+ mountpoint_len = strlen(dvp->v_vfsp->mnt_stat.f_mntonname) +
+ strlen("/" ZFS_CTLDIR_NAME "/snapshot/") + strlen(name) + 1;
+ mountpoint = kmem_alloc(mountpoint_len, KM_SLEEP);
+ (void) snprintf(mountpoint, mountpoint_len,
+ "%s/" ZFS_CTLDIR_NAME "/snapshot/%s",
+ dvp->v_vfsp->mnt_stat.f_mntonname, name);
+
+ err = mount_snapshot(curthread, vpp, "zfs", mountpoint, fullname, 0);
+ kmem_free(mountpoint, mountpoint_len);
+ if (err == 0) {
+ /*
+ * Fix up the root vnode mounted on .zfs/snapshot/<snapname>.
+ *
+ * This is where we lie about our v_vfsp in order to
+ * make .zfs/snapshot/<snapname> accessible over NFS
+ * without requiring manual mounts of <snapname>.
+ */
+ ASSERT(VTOZ(*vpp)->z_zfsvfs != zfsvfs);
+ VTOZ(*vpp)->z_zfsvfs->z_parent = zfsvfs;
+
+ /* Clear the root flag (set via VFS_ROOT) as well. */
+ (*vpp)->v_vflag &= ~VV_ROOT;
+ }
+
+ if (err != 0)
+ *vpp = NULL;
+ return (err);
+}
+
+static int
+zfsctl_snapdir_readdir(struct vop_readdir_args *ap)
+{
+ char snapname[ZFS_MAX_DATASET_NAME_LEN];
+ struct dirent entry;
+ vnode_t *vp = ap->a_vp;
+ zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
+ zfs_uio_t uio;
+ int *eofp = ap->a_eofflag;
+ off_t dots_offset;
+ int error;
+
+ zfs_uio_init(&uio, ap->a_uio);
+
+ ASSERT(vp->v_type == VDIR);
+
+ error = sfs_readdir_common(ZFSCTL_INO_ROOT, ZFSCTL_INO_SNAPDIR, ap,
+ &uio, &dots_offset);
+ if (error != 0) {
+ if (error == ENAMETOOLONG) /* ran out of destination space */
+ error = 0;
+ return (error);
+ }
+
+ ZFS_ENTER(zfsvfs);
+ for (;;) {
+ uint64_t cookie;
+ uint64_t id;
+
+ cookie = zfs_uio_offset(&uio) - dots_offset;
+
+ dsl_pool_config_enter(dmu_objset_pool(zfsvfs->z_os), FTAG);
+ error = dmu_snapshot_list_next(zfsvfs->z_os, sizeof (snapname),
+ snapname, &id, &cookie, NULL);
+ dsl_pool_config_exit(dmu_objset_pool(zfsvfs->z_os), FTAG);
+ if (error != 0) {
+ if (error == ENOENT) {
+ if (eofp != NULL)
+ *eofp = 1;
+ error = 0;
+ }
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ entry.d_fileno = id;
+ entry.d_type = DT_DIR;
+ strcpy(entry.d_name, snapname);
+ entry.d_namlen = strlen(entry.d_name);
+ entry.d_reclen = sizeof (entry);
+ error = vfs_read_dirent(ap, &entry, zfs_uio_offset(&uio));
+ if (error != 0) {
+ if (error == ENAMETOOLONG)
+ error = 0;
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(error));
+ }
+ zfs_uio_setoffset(&uio, cookie + dots_offset);
+ }
+ /* NOTREACHED */
+}
+
+static int
+zfsctl_snapdir_getattr(struct vop_getattr_args *ap)
+{
+ vnode_t *vp = ap->a_vp;
+ vattr_t *vap = ap->a_vap;
+ zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
+ dsl_dataset_t *ds;
+ uint64_t snap_count;
+ int err;
+
+ ZFS_ENTER(zfsvfs);
+ ds = dmu_objset_ds(zfsvfs->z_os);
+ zfsctl_common_getattr(vp, vap);
+ vap->va_ctime = dmu_objset_snap_cmtime(zfsvfs->z_os);
+ vap->va_mtime = vap->va_ctime;
+ vap->va_birthtime = vap->va_ctime;
+ if (dsl_dataset_phys(ds)->ds_snapnames_zapobj != 0) {
+ err = zap_count(dmu_objset_pool(ds->ds_objset)->dp_meta_objset,
+ dsl_dataset_phys(ds)->ds_snapnames_zapobj, &snap_count);
+ if (err != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (err);
+ }
+ vap->va_nlink += snap_count;
+ }
+ vap->va_size = vap->va_nlink;
+
+ ZFS_EXIT(zfsvfs);
+ return (0);
+}
+
+static struct vop_vector zfsctl_ops_snapdir = {
+ .vop_default = &default_vnodeops,
+#if __FreeBSD_version >= 1300121
+ .vop_fplookup_vexec = VOP_EAGAIN,
+#endif
+ .vop_open = zfsctl_common_open,
+ .vop_close = zfsctl_common_close,
+ .vop_getattr = zfsctl_snapdir_getattr,
+ .vop_access = zfsctl_common_access,
+ .vop_readdir = zfsctl_snapdir_readdir,
+ .vop_lookup = zfsctl_snapdir_lookup,
+ .vop_reclaim = zfsctl_common_reclaim,
+ .vop_fid = zfsctl_common_fid,
+ .vop_print = zfsctl_common_print,
+ .vop_pathconf = zfsctl_common_pathconf,
+ .vop_getacl = zfsctl_common_getacl,
+};
+VFS_VOP_VECTOR_REGISTER(zfsctl_ops_snapdir);
+
+
+static int
+zfsctl_snapshot_inactive(struct vop_inactive_args *ap)
+{
+ vnode_t *vp = ap->a_vp;
+
+ VERIFY(vrecycle(vp) == 1);
+ return (0);
+}
+
+static int
+zfsctl_snapshot_reclaim(struct vop_reclaim_args *ap)
+{
+ vnode_t *vp = ap->a_vp;
+ void *data = vp->v_data;
+
+ sfs_reclaim_vnode(vp);
+ sfs_destroy_node(data);
+ return (0);
+}
+
+static int
+zfsctl_snapshot_vptocnp(struct vop_vptocnp_args *ap)
+{
+ struct mount *mp;
+ vnode_t *dvp;
+ vnode_t *vp;
+ sfs_node_t *node;
+ size_t len;
+ int locked;
+ int error;
+
+ vp = ap->a_vp;
+ node = vp->v_data;
+ len = strlen(node->sn_name);
+ if (*ap->a_buflen < len)
+ return (SET_ERROR(ENOMEM));
+
+ /*
+ * Prevent unmounting of the snapshot while the vnode lock
+ * is not held. That is not strictly required, but allows
+ * us to assert that an uncovered snapshot vnode is never
+ * "leaked".
+ */
+ mp = vp->v_mountedhere;
+ if (mp == NULL)
+ return (SET_ERROR(ENOENT));
+ error = vfs_busy(mp, 0);
+ KASSERT(error == 0, ("vfs_busy(mp, 0) failed with %d", error));
+
+ /*
+ * We can vput the vnode as we can now depend on the reference owned
+ * by the busied mp. But we also need to hold the vnode, because
+ * the reference may go after vfs_unbusy() which has to be called
+ * before we can lock the vnode again.
+ */
+ locked = VOP_ISLOCKED(vp);
+#if __FreeBSD_version >= 1300045
+ enum vgetstate vs = vget_prep(vp);
+#else
+ vhold(vp);
+#endif
+ vput(vp);
+
+ /* Look up .zfs/snapshot, our parent. */
+ error = zfsctl_snapdir_vnode(vp->v_mount, NULL, LK_SHARED, &dvp);
+ if (error == 0) {
+ VOP_UNLOCK1(dvp);
+ *ap->a_vpp = dvp;
+ *ap->a_buflen -= len;
+ bcopy(node->sn_name, ap->a_buf + *ap->a_buflen, len);
+ }
+ vfs_unbusy(mp);
+#if __FreeBSD_version >= 1300045
+ vget_finish(vp, locked | LK_RETRY, vs);
+#else
+ vget(vp, locked | LK_VNHELD | LK_RETRY, curthread);
+#endif
+ return (error);
+}
+
+/*
+ * These VP's should never see the light of day. They should always
+ * be covered.
+ */
+static struct vop_vector zfsctl_ops_snapshot = {
+#if __FreeBSD_version >= 1300121
+ .vop_fplookup_vexec = VOP_EAGAIN,
+#endif
+ .vop_inactive = zfsctl_snapshot_inactive,
+#if __FreeBSD_version >= 1300045
+ .vop_need_inactive = vop_stdneed_inactive,
+#endif
+ .vop_reclaim = zfsctl_snapshot_reclaim,
+ .vop_vptocnp = zfsctl_snapshot_vptocnp,
+ .vop_lock1 = vop_stdlock,
+ .vop_unlock = vop_stdunlock,
+ .vop_islocked = vop_stdislocked,
+ .vop_advlockpurge = vop_stdadvlockpurge, /* called by vgone */
+ .vop_print = zfsctl_common_print,
+};
+VFS_VOP_VECTOR_REGISTER(zfsctl_ops_snapshot);
+
+int
+zfsctl_lookup_objset(vfs_t *vfsp, uint64_t objsetid, zfsvfs_t **zfsvfsp)
+{
+ zfsvfs_t *zfsvfs __unused = vfsp->vfs_data;
+ vnode_t *vp;
+ int error;
+
+ ASSERT(zfsvfs->z_ctldir != NULL);
+ *zfsvfsp = NULL;
+ error = sfs_vnode_get(vfsp, LK_EXCLUSIVE,
+ ZFSCTL_INO_SNAPDIR, objsetid, &vp);
+ if (error == 0 && vp != NULL) {
+ /*
+ * XXX Probably need to at least reference, if not busy, the mp.
+ */
+ if (vp->v_mountedhere != NULL)
+ *zfsvfsp = vp->v_mountedhere->mnt_data;
+ vput(vp);
+ }
+ if (*zfsvfsp == NULL)
+ return (SET_ERROR(EINVAL));
+ return (0);
+}
+
+/*
+ * Unmount any snapshots for the given filesystem. This is called from
+ * zfs_umount() - if we have a ctldir, then go through and unmount all the
+ * snapshots.
+ */
+int
+zfsctl_umount_snapshots(vfs_t *vfsp, int fflags, cred_t *cr)
+{
+ char snapname[ZFS_MAX_DATASET_NAME_LEN];
+ zfsvfs_t *zfsvfs = vfsp->vfs_data;
+ struct mount *mp;
+ vnode_t *vp;
+ uint64_t cookie;
+ int error;
+
+ ASSERT(zfsvfs->z_ctldir != NULL);
+
+ cookie = 0;
+ for (;;) {
+ uint64_t id;
+
+ dsl_pool_config_enter(dmu_objset_pool(zfsvfs->z_os), FTAG);
+ error = dmu_snapshot_list_next(zfsvfs->z_os, sizeof (snapname),
+ snapname, &id, &cookie, NULL);
+ dsl_pool_config_exit(dmu_objset_pool(zfsvfs->z_os), FTAG);
+ if (error != 0) {
+ if (error == ENOENT)
+ error = 0;
+ break;
+ }
+
+ for (;;) {
+ error = sfs_vnode_get(vfsp, LK_EXCLUSIVE,
+ ZFSCTL_INO_SNAPDIR, id, &vp);
+ if (error != 0 || vp == NULL)
+ break;
+
+ mp = vp->v_mountedhere;
+
+ /*
+ * v_mountedhere being NULL means that the
+ * (uncovered) vnode is in a transient state
+ * (mounting or unmounting), so loop until it
+ * settles down.
+ */
+ if (mp != NULL)
+ break;
+ vput(vp);
+ }
+ if (error != 0)
+ break;
+ if (vp == NULL)
+ continue; /* no mountpoint, nothing to do */
+
+ /*
+ * The mount-point vnode is kept locked to avoid spurious EBUSY
+ * from a concurrent umount.
+ * The vnode lock must have recursive locking enabled.
+ */
+ vfs_ref(mp);
+ error = dounmount(mp, fflags, curthread);
+ KASSERT_IMPLY(error == 0, vrefcnt(vp) == 1,
+ ("extra references after unmount"));
+ vput(vp);
+ if (error != 0)
+ break;
+ }
+ KASSERT_IMPLY((fflags & MS_FORCE) != 0, error == 0,
+ ("force unmounting failed"));
+ return (error);
+}
+
+int
+zfsctl_snapshot_unmount(const char *snapname, int flags __unused)
+{
+ vfs_t *vfsp = NULL;
+ zfsvfs_t *zfsvfs = NULL;
+
+ if (strchr(snapname, '@') == NULL)
+ return (0);
+
+ int err = getzfsvfs(snapname, &zfsvfs);
+ if (err != 0) {
+ ASSERT3P(zfsvfs, ==, NULL);
+ return (0);
+ }
+ vfsp = zfsvfs->z_vfs;
+
+ ASSERT(!dsl_pool_config_held(dmu_objset_pool(zfsvfs->z_os)));
+
+ vfs_ref(vfsp);
+ vfs_unbusy(vfsp);
+ return (dounmount(vfsp, MS_FORCE, curthread));
+}
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_debug.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_debug.c
new file mode 100644
index 000000000000..74742ad3669f
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_debug.c
@@ -0,0 +1,251 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/kstat.h>
+
+typedef struct zfs_dbgmsg {
+ list_node_t zdm_node;
+ time_t zdm_timestamp;
+ int zdm_size;
+ char zdm_msg[1]; /* variable length allocation */
+} zfs_dbgmsg_t;
+
+list_t zfs_dbgmsgs;
+int zfs_dbgmsg_size = 0;
+kmutex_t zfs_dbgmsgs_lock;
+int zfs_dbgmsg_maxsize = 4<<20; /* 4MB */
+kstat_t *zfs_dbgmsg_kstat;
+
+/*
+ * Internal ZFS debug messages are enabled by default.
+ *
+ * # Print debug messages
+ * dtrace -n 'zfs-dbgmsg { print(stringof(arg0)); }'
+ *
+ * # Disable the kernel debug message log.
+ * sysctl vfs.zfs.dbgmsg_enable=0
+ */
+int zfs_dbgmsg_enable = 1;
+
+static int
+zfs_dbgmsg_headers(char *buf, size_t size)
+{
+ (void) snprintf(buf, size, "%-12s %-8s\n", "timestamp", "message");
+
+ return (0);
+}
+
+static int
+zfs_dbgmsg_data(char *buf, size_t size, void *data)
+{
+ zfs_dbgmsg_t *zdm = (zfs_dbgmsg_t *)data;
+
+ (void) snprintf(buf, size, "%-12llu %-s\n",
+ (u_longlong_t)zdm->zdm_timestamp, zdm->zdm_msg);
+
+ return (0);
+}
+
+static void *
+zfs_dbgmsg_addr(kstat_t *ksp, loff_t n)
+{
+ zfs_dbgmsg_t *zdm = (zfs_dbgmsg_t *)ksp->ks_private;
+
+ ASSERT(MUTEX_HELD(&zfs_dbgmsgs_lock));
+
+ if (n == 0)
+ ksp->ks_private = list_head(&zfs_dbgmsgs);
+ else if (zdm)
+ ksp->ks_private = list_next(&zfs_dbgmsgs, zdm);
+
+ return (ksp->ks_private);
+}
+
+static void
+zfs_dbgmsg_purge(int max_size)
+{
+ zfs_dbgmsg_t *zdm;
+ int size;
+
+ ASSERT(MUTEX_HELD(&zfs_dbgmsgs_lock));
+
+ while (zfs_dbgmsg_size > max_size) {
+ zdm = list_remove_head(&zfs_dbgmsgs);
+ if (zdm == NULL)
+ return;
+
+ size = zdm->zdm_size;
+ kmem_free(zdm, size);
+ zfs_dbgmsg_size -= size;
+ }
+}
+
+static int
+zfs_dbgmsg_update(kstat_t *ksp, int rw)
+{
+ if (rw == KSTAT_WRITE)
+ zfs_dbgmsg_purge(0);
+
+ return (0);
+}
+
+void
+zfs_dbgmsg_init(void)
+{
+ list_create(&zfs_dbgmsgs, sizeof (zfs_dbgmsg_t),
+ offsetof(zfs_dbgmsg_t, zdm_node));
+ mutex_init(&zfs_dbgmsgs_lock, NULL, MUTEX_DEFAULT, NULL);
+
+ zfs_dbgmsg_kstat = kstat_create("zfs", 0, "dbgmsg", "misc",
+ KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL);
+ if (zfs_dbgmsg_kstat) {
+ zfs_dbgmsg_kstat->ks_lock = &zfs_dbgmsgs_lock;
+ zfs_dbgmsg_kstat->ks_ndata = UINT32_MAX;
+ zfs_dbgmsg_kstat->ks_private = NULL;
+ zfs_dbgmsg_kstat->ks_update = zfs_dbgmsg_update;
+ kstat_set_raw_ops(zfs_dbgmsg_kstat, zfs_dbgmsg_headers,
+ zfs_dbgmsg_data, zfs_dbgmsg_addr);
+ kstat_install(zfs_dbgmsg_kstat);
+ }
+}
+
+void
+zfs_dbgmsg_fini(void)
+{
+ if (zfs_dbgmsg_kstat)
+ kstat_delete(zfs_dbgmsg_kstat);
+ /*
+ * TODO - decide how to make this permanent
+ */
+#ifdef _KERNEL
+ mutex_enter(&zfs_dbgmsgs_lock);
+ zfs_dbgmsg_purge(0);
+ mutex_exit(&zfs_dbgmsgs_lock);
+ mutex_destroy(&zfs_dbgmsgs_lock);
+#endif
+}
+
+void
+__zfs_dbgmsg(char *buf)
+{
+ zfs_dbgmsg_t *zdm;
+ int size;
+
+ DTRACE_PROBE1(zfs__dbgmsg, char *, buf);
+
+ size = sizeof (zfs_dbgmsg_t) + strlen(buf);
+ zdm = kmem_zalloc(size, KM_SLEEP);
+ zdm->zdm_size = size;
+ zdm->zdm_timestamp = gethrestime_sec();
+ strcpy(zdm->zdm_msg, buf);
+
+ mutex_enter(&zfs_dbgmsgs_lock);
+ list_insert_tail(&zfs_dbgmsgs, zdm);
+ zfs_dbgmsg_size += size;
+ zfs_dbgmsg_purge(MAX(zfs_dbgmsg_maxsize, 0));
+ mutex_exit(&zfs_dbgmsgs_lock);
+}
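
The debug log is a size-bounded FIFO: messages are appended at the tail, and the oldest entries are dropped from the head until the total byte count is back under the cap. A minimal userland sketch of the same structure, with hypothetical names and plain malloc in place of kmem:

#include <stdlib.h>
#include <string.h>
#include <sys/queue.h>

struct msg {
	STAILQ_ENTRY(msg) link;
	size_t size;
	char text[];		/* variable length, NUL-terminated */
};

static STAILQ_HEAD(, msg) msgs = STAILQ_HEAD_INITIALIZER(msgs);
static size_t msgs_total;

static void
msgs_purge(size_t cap)
{
	while (msgs_total > cap) {
		struct msg *m = STAILQ_FIRST(&msgs);

		if (m == NULL)
			return;
		STAILQ_REMOVE_HEAD(&msgs, link);
		msgs_total -= m->size;
		free(m);
	}
}

static void
msgs_append(const char *text, size_t cap)
{
	size_t size = sizeof (struct msg) + strlen(text) + 1;
	struct msg *m = malloc(size);

	if (m == NULL)
		return;		/* drop the message on allocation failure */
	m->size = size;
	strcpy(m->text, text);
	STAILQ_INSERT_TAIL(&msgs, m, link);
	msgs_total += size;
	msgs_purge(cap);
}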
+
+void
+__set_error(const char *file, const char *func, int line, int err)
+{
+ /*
+ * To enable this:
+ *
+ * $ echo 512 >/sys/module/zfs/parameters/zfs_flags
+ */
+ if (zfs_flags & ZFS_DEBUG_SET_ERROR)
+ __dprintf(B_FALSE, file, func, line, "error %d", err);
+}
+
+#ifdef _KERNEL
+void
+__dprintf(boolean_t dprint, const char *file, const char *func,
+ int line, const char *fmt, ...)
+{
+ const char *newfile;
+ va_list adx;
+ size_t size;
+ char *buf;
+ char *nl;
+ int i;
+
+ size = 1024;
+ buf = kmem_alloc(size, KM_SLEEP);
+
+ /*
+ * Get rid of annoying prefix to filename.
+ */
+ newfile = strrchr(file, '/');
+ if (newfile != NULL) {
+ newfile = newfile + 1; /* Get rid of leading / */
+ } else {
+ newfile = file;
+ }
+
+ i = snprintf(buf, size, "%s:%d:%s(): ", newfile, line, func);
+
+ if (i < size) {
+ va_start(adx, fmt);
+ (void) vsnprintf(buf + i, size - i, fmt, adx);
+ va_end(adx);
+ }
+
+ /*
+ * Get rid of trailing newline.
+ */
+ nl = strrchr(buf, '\n');
+ if (nl != NULL)
+ *nl = '\0';
+
+ __zfs_dbgmsg(buf);
+
+ kmem_free(buf, size);
+}
+
+#else
+
+void
+zfs_dbgmsg_print(const char *tag)
+{
+ zfs_dbgmsg_t *zdm;
+
+ (void) printf("ZFS_DBGMSG(%s):\n", tag);
+ mutex_enter(&zfs_dbgmsgs_lock);
+ for (zdm = list_head(&zfs_dbgmsgs); zdm;
+ zdm = list_next(&zfs_dbgmsgs, zdm))
+ (void) printf("%s\n", zdm->zdm_msg);
+ mutex_exit(&zfs_dbgmsgs_lock);
+}
+#endif /* _KERNEL */
+
+/* BEGIN CSTYLED */
+ZFS_MODULE_PARAM(zfs, zfs_, dbgmsg_enable, INT, ZMOD_RW,
+ "Enable ZFS debug message log");
+
+ZFS_MODULE_PARAM(zfs, zfs_, dbgmsg_maxsize, INT, ZMOD_RW,
+ "Maximum ZFS debug log size");
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_dir.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_dir.c
new file mode 100644
index 000000000000..fb01012dd6e7
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_dir.c
@@ -0,0 +1,968 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013, 2016 by Delphix. All rights reserved.
+ * Copyright 2017 Nexenta Systems, Inc.
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/time.h>
+#include <sys/systm.h>
+#include <sys/sysmacros.h>
+#include <sys/resource.h>
+#include <sys/vfs.h>
+#include <sys/vnode.h>
+#include <sys/extdirent.h>
+#include <sys/file.h>
+#include <sys/kmem.h>
+#include <sys/uio.h>
+#include <sys/cmn_err.h>
+#include <sys/errno.h>
+#include <sys/stat.h>
+#include <sys/unistd.h>
+#include <sys/sunddi.h>
+#include <sys/random.h>
+#include <sys/policy.h>
+#include <sys/condvar.h>
+#include <sys/callb.h>
+#include <sys/smp.h>
+#include <sys/zfs_dir.h>
+#include <sys/zfs_acl.h>
+#include <sys/fs/zfs.h>
+#include <sys/zap.h>
+#include <sys/dmu.h>
+#include <sys/atomic.h>
+#include <sys/zfs_ctldir.h>
+#include <sys/zfs_fuid.h>
+#include <sys/sa.h>
+#include <sys/zfs_sa.h>
+#include <sys/dmu_objset.h>
+#include <sys/dsl_dir.h>
+
+#include <sys/ccompat.h>
+
+/*
+ * zfs_match_find() is used by zfs_dirent_lookup() to perform zap lookups
+ * of names after deciding which is the appropriate lookup interface.
+ */
+static int
+zfs_match_find(zfsvfs_t *zfsvfs, znode_t *dzp, const char *name,
+ matchtype_t mt, uint64_t *zoid)
+{
+ int error;
+
+ if (zfsvfs->z_norm) {
+
+ /*
+ * In the non-mixed case we only expect there would ever
+ * be one match, but we need to use the normalizing lookup.
+ */
+ error = zap_lookup_norm(zfsvfs->z_os, dzp->z_id, name, 8, 1,
+ zoid, mt, NULL, 0, NULL);
+ } else {
+ error = zap_lookup(zfsvfs->z_os, dzp->z_id, name, 8, 1, zoid);
+ }
+ *zoid = ZFS_DIRENT_OBJ(*zoid);
+
+ return (error);
+}
+
+/*
+ * Look up a directory entry under a locked vnode.
+ * dvp being locked gives us a guarantee that there are no concurrent
+ * modifications of the directory and, thus, if a node can be found in
+ * the directory, then it must not be unlinked.
+ *
+ * Input arguments:
+ * dzp - znode for directory
+ * name - name of entry to lock
+ * flag - ZNEW: if the entry already exists, fail with EEXIST.
+ * ZEXISTS: if the entry does not exist, fail with ENOENT.
+ * ZXATTR: we want dzp's xattr directory
+ *
+ * Output arguments:
+ * zpp - pointer to the znode for the entry (NULL if there isn't one)
+ *
+ * Return value: 0 on success or errno on failure.
+ *
+ * NOTE: Always checks for, and rejects, '.' and '..'.
+ */
+int
+zfs_dirent_lookup(znode_t *dzp, const char *name, znode_t **zpp, int flag)
+{
+ zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+ znode_t *zp;
+ matchtype_t mt = 0;
+ uint64_t zoid;
+ int error = 0;
+
+ if (zfsvfs->z_replay == B_FALSE)
+ ASSERT_VOP_LOCKED(ZTOV(dzp), __func__);
+
+ *zpp = NULL;
+
+ /*
+ * Verify that we are not trying to lock '.', '..', or '.zfs'
+ */
+ if (name[0] == '.' &&
+ (((name[1] == '\0') || (name[1] == '.' && name[2] == '\0')) ||
+ (zfs_has_ctldir(dzp) && strcmp(name, ZFS_CTLDIR_NAME) == 0)))
+ return (SET_ERROR(EEXIST));
+
+ /*
+ * Case sensitivity and normalization preferences are set when
+ * the file system is created. These are stored in the
+ * zfsvfs->z_case and zfsvfs->z_norm fields. These choices
+ * affect how we perform zap lookups.
+ *
+ * When matching we may need to normalize & change case according to
+ * FS settings.
+ *
+ * Note that a normalized match is necessary for a case-insensitive
+ * filesystem when the lookup request is not exact, because normalization
+ * can fold case independently of normalizing code point sequences.
+ *
+ * See the table above zfs_dropname().
+ */
+ if (zfsvfs->z_norm != 0) {
+ mt = MT_NORMALIZE;
+
+ /*
+ * Determine if the match needs to honor the case specified in
+ * lookup, and if so keep track of that so that during
+ * normalization we don't fold case.
+ */
+ if (zfsvfs->z_case == ZFS_CASE_MIXED) {
+ mt |= MT_MATCH_CASE;
+ }
+ }
+
+ /*
+ * Only look in or update the DNLC if we are looking for the
+ * name on a file system that does not require normalization
+ * or case folding. We can also look there if we happen to be
+ * on a non-normalizing, mixed sensitivity file system IF we
+ * are looking for the exact name.
+ *
+ * NB: we do not need to worry about this flag for ZFS_CASE_SENSITIVE
+ * because in that case MT_EXACT and MT_FIRST should produce exactly
+ * the same result.
+ */
+
+ if (dzp->z_unlinked && !(flag & ZXATTR))
+ return (ENOENT);
+ if (flag & ZXATTR) {
+ error = sa_lookup(dzp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &zoid,
+ sizeof (zoid));
+ if (error == 0)
+ error = (zoid == 0 ? ENOENT : 0);
+ } else {
+ error = zfs_match_find(zfsvfs, dzp, name, mt, &zoid);
+ }
+ if (error) {
+ if (error != ENOENT || (flag & ZEXISTS)) {
+ return (error);
+ }
+ } else {
+ if (flag & ZNEW) {
+ return (SET_ERROR(EEXIST));
+ }
+ error = zfs_zget(zfsvfs, zoid, &zp);
+ if (error)
+ return (error);
+ ASSERT(!zp->z_unlinked);
+ *zpp = zp;
+ }
+
+ return (0);
+}
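
A sketch of the flag contract from the caller's side may help; this is illustrative only (it is not part of the patched file), assumes dzp is a directory znode locked as the comment above requires, and trims all error handling:

static void
example_dirent_lookup_flags(znode_t *dzp)
{
	znode_t *zp;
	int error;

	/* Creation path: an existing entry is an error (EEXIST). */
	error = zfs_dirent_lookup(dzp, "newname", &zp, ZNEW);

	/* Lookup/removal path: a missing entry is an error (ENOENT). */
	error = zfs_dirent_lookup(dzp, "oldname", &zp, ZEXISTS);

	/*
	 * ZXATTR ignores the name: zp is set to dzp's extended attribute
	 * directory, or left NULL if none has been created yet.
	 */
	error = zfs_dirent_lookup(dzp, "", &zp, ZXATTR);
	(void) error;
}
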
+
+static int
+zfs_dd_lookup(znode_t *dzp, znode_t **zpp)
+{
+ zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+ znode_t *zp;
+ uint64_t parent;
+ int error;
+
+#ifdef ZFS_DEBUG
+ if (zfsvfs->z_replay == B_FALSE)
+ ASSERT_VOP_LOCKED(ZTOV(dzp), __func__);
+ ASSERT(ZFS_TEARDOWN_READ_HELD(zfsvfs));
+#endif
+ if (dzp->z_unlinked)
+ return (ENOENT);
+
+ if ((error = sa_lookup(dzp->z_sa_hdl,
+ SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0)
+ return (error);
+
+ error = zfs_zget(zfsvfs, parent, &zp);
+ if (error == 0)
+ *zpp = zp;
+ return (error);
+}
+
+int
+zfs_dirlook(znode_t *dzp, const char *name, znode_t **zpp)
+{
+ zfsvfs_t *zfsvfs __unused = dzp->z_zfsvfs;
+ znode_t *zp = NULL;
+ int error = 0;
+
+#ifdef ZFS_DEBUG
+ if (zfsvfs->z_replay == B_FALSE)
+ ASSERT_VOP_LOCKED(ZTOV(dzp), __func__);
+ ASSERT(ZFS_TEARDOWN_READ_HELD(zfsvfs));
+#endif
+ if (dzp->z_unlinked)
+ return (SET_ERROR(ENOENT));
+
+ if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) {
+ *zpp = dzp;
+ } else if (name[0] == '.' && name[1] == '.' && name[2] == 0) {
+ error = zfs_dd_lookup(dzp, &zp);
+ if (error == 0)
+ *zpp = zp;
+ } else {
+ error = zfs_dirent_lookup(dzp, name, &zp, ZEXISTS);
+ if (error == 0) {
+ dzp->z_zn_prefetch = B_TRUE; /* enable prefetching */
+ *zpp = zp;
+ }
+ }
+ return (error);
+}
+
+/*
+ * Unlinked Set (formerly known as the "delete queue") Error Handling
+ *
+ * When dealing with the unlinked set, we dmu_tx_hold_zap(), but we
+ * don't specify the name of the entry that we will be manipulating. We
+ * also fib and say that we won't be adding any new entries to the
+ * unlinked set, even though we might (this is to lower the minimum file
+ * size that can be deleted in a full filesystem). So on the small
+ * chance that the unlinked set is using a fat zap (i.e. has more than
+ * 2000 entries), we *may* not pre-read a block that's needed.
+ * Therefore it is remotely possible for some of the assertions
+ * regarding the unlinked set below to fail due to i/o error. On a
+ * nondebug system, this will result in the space being leaked.
+ */
+void
+zfs_unlinked_add(znode_t *zp, dmu_tx_t *tx)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+
+ ASSERT(zp->z_unlinked);
+ ASSERT(zp->z_links == 0);
+
+ VERIFY3U(0, ==,
+ zap_add_int(zfsvfs->z_os, zfsvfs->z_unlinkedobj, zp->z_id, tx));
+
+ dataset_kstats_update_nunlinks_kstat(&zfsvfs->z_kstat, 1);
+}
+
+/*
+ * Clean up any znodes that had no links when we either crashed or
+ * (force) umounted the file system.
+ */
+void
+zfs_unlinked_drain(zfsvfs_t *zfsvfs)
+{
+ zap_cursor_t zc;
+ zap_attribute_t zap;
+ dmu_object_info_t doi;
+ znode_t *zp;
+ dmu_tx_t *tx;
+ int error;
+
+ /*
+ * Iterate over the contents of the unlinked set.
+ */
+ for (zap_cursor_init(&zc, zfsvfs->z_os, zfsvfs->z_unlinkedobj);
+ zap_cursor_retrieve(&zc, &zap) == 0;
+ zap_cursor_advance(&zc)) {
+
+ /*
+ * See what kind of object we have in list
+ */
+
+ error = dmu_object_info(zfsvfs->z_os,
+ zap.za_first_integer, &doi);
+ if (error != 0)
+ continue;
+
+ ASSERT((doi.doi_type == DMU_OT_PLAIN_FILE_CONTENTS) ||
+ (doi.doi_type == DMU_OT_DIRECTORY_CONTENTS));
+ /*
+ * We need to re-mark these list entries for deletion,
+ * so we pull them back into core and set zp->z_unlinked.
+ */
+ error = zfs_zget(zfsvfs, zap.za_first_integer, &zp);
+
+ /*
+ * We may pick up znodes that are already marked for deletion.
+ * This could happen during the purge of an extended attribute
+ * directory. All we need to do is skip over them, since they
+ * are already in the system marked z_unlinked.
+ */
+ if (error != 0)
+ continue;
+
+ vn_lock(ZTOV(zp), LK_EXCLUSIVE | LK_RETRY);
+
+ /*
+ * Due to changes in zfs_rmnode we need to make sure the
+ * link count is set to zero here.
+ */
+ if (zp->z_links != 0) {
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error != 0) {
+ dmu_tx_abort(tx);
+ vput(ZTOV(zp));
+ continue;
+ }
+ zp->z_links = 0;
+ VERIFY0(sa_update(zp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs),
+ &zp->z_links, sizeof (zp->z_links), tx));
+ dmu_tx_commit(tx);
+ }
+
+ zp->z_unlinked = B_TRUE;
+ vput(ZTOV(zp));
+ }
+ zap_cursor_fini(&zc);
+}
+
+/*
+ * Delete the entire contents of a directory. Return a count
+ * of the number of entries that could not be deleted. If we encounter
+ * an error, return a count of at least one so that the directory stays
+ * in the unlinked set.
+ *
+ * NOTE: this function assumes that the directory is inactive,
+ * so there is no need to lock its entries before deletion.
+ * Also, it assumes the directory contents are *only* regular
+ * files.
+ */
+static int
+zfs_purgedir(znode_t *dzp)
+{
+ zap_cursor_t zc;
+ zap_attribute_t zap;
+ znode_t *xzp;
+ dmu_tx_t *tx;
+ zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+ int skipped = 0;
+ int error;
+
+ for (zap_cursor_init(&zc, zfsvfs->z_os, dzp->z_id);
+ (error = zap_cursor_retrieve(&zc, &zap)) == 0;
+ zap_cursor_advance(&zc)) {
+ error = zfs_zget(zfsvfs,
+ ZFS_DIRENT_OBJ(zap.za_first_integer), &xzp);
+ if (error) {
+ skipped += 1;
+ continue;
+ }
+
+ vn_lock(ZTOV(xzp), LK_EXCLUSIVE | LK_RETRY);
+ ASSERT((ZTOV(xzp)->v_type == VREG) ||
+ (ZTOV(xzp)->v_type == VLNK));
+
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
+ dmu_tx_hold_zap(tx, dzp->z_id, FALSE, zap.za_name);
+ dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
+ dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
+ /* Is this really needed ? */
+ zfs_sa_upgrade_txholds(tx, xzp);
+ dmu_tx_mark_netfree(tx);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ dmu_tx_abort(tx);
+ vput(ZTOV(xzp));
+ skipped += 1;
+ continue;
+ }
+
+ error = zfs_link_destroy(dzp, zap.za_name, xzp, tx, 0, NULL);
+ if (error)
+ skipped += 1;
+ dmu_tx_commit(tx);
+
+ vput(ZTOV(xzp));
+ }
+ zap_cursor_fini(&zc);
+ if (error != ENOENT)
+ skipped += 1;
+ return (skipped);
+}
+
+extern taskq_t *zfsvfs_taskq;
+
+void
+zfs_rmnode(znode_t *zp)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ objset_t *os = zfsvfs->z_os;
+ dmu_tx_t *tx;
+ uint64_t acl_obj;
+ uint64_t xattr_obj;
+ uint64_t count;
+ int error;
+
+ ASSERT(zp->z_links == 0);
+ if (zfsvfs->z_replay == B_FALSE)
+ ASSERT_VOP_ELOCKED(ZTOV(zp), __func__);
+
+ /*
+ * If this is an attribute directory, purge its contents.
+ */
+ if (ZTOV(zp) != NULL && ZTOV(zp)->v_type == VDIR &&
+ (zp->z_pflags & ZFS_XATTR)) {
+ if (zfs_purgedir(zp) != 0) {
+ /*
+ * Not enough space to delete some xattrs.
+ * Leave it in the unlinked set.
+ */
+ zfs_znode_dmu_fini(zp);
+ zfs_znode_free(zp);
+ return;
+ }
+ } else {
+ /*
+ * Free up all the data in the file. We don't do this for
+ * XATTR directories because we need truncate and remove to be
+ * in the same tx, like in zfs_znode_delete(). Otherwise, if
+ * we crash here we'll end up with an inconsistent truncated
+ * zap object in the delete queue. Note a truncated file is
+ * harmless since it only contains user data.
+ */
+ error = dmu_free_long_range(os, zp->z_id, 0, DMU_OBJECT_END);
+ if (error) {
+ /*
+ * Not enough space or we were interrupted by unmount.
+ * Leave the file in the unlinked set.
+ */
+ zfs_znode_dmu_fini(zp);
+ zfs_znode_free(zp);
+ return;
+ }
+ }
+
+ /*
+ * If the file has extended attributes, we're going to unlink
+ * the xattr dir.
+ */
+ error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
+ &xattr_obj, sizeof (xattr_obj));
+ if (error)
+ xattr_obj = 0;
+
+ acl_obj = zfs_external_acl(zp);
+
+ /*
+ * Set up the final transaction.
+ */
+ tx = dmu_tx_create(os);
+ dmu_tx_hold_free(tx, zp->z_id, 0, DMU_OBJECT_END);
+ dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
+ if (xattr_obj)
+ dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, TRUE, NULL);
+ if (acl_obj)
+ dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
+
+ zfs_sa_upgrade_txholds(tx, zp);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ /*
+ * Not enough space to delete the file. Leave it in the
+ * unlinked set, leaking it until the fs is remounted (at
+ * which point we'll call zfs_unlinked_drain() to process it).
+ */
+ dmu_tx_abort(tx);
+ zfs_znode_dmu_fini(zp);
+ zfs_znode_free(zp);
+ return;
+ }
+
+ /*
+ * FreeBSD's implementation of zfs_zget requires a vnode to back it.
+ * This means that we could end up calling into getnewvnode while
+ * calling zfs_rmnode as a result of a prior call to getnewvnode
+ * trying to clear vnodes out of the cache. If this repeats we can
+ * recurse enough that we overflow our stack. To avoid this, we
+ * avoid calling zfs_zget on the xattr znode and instead simply add
+ * it to the unlinked set and schedule a call to zfs_unlinked_drain.
+ */
+ if (xattr_obj) {
+ /* Add extended attribute directory to the unlinked set. */
+ VERIFY3U(0, ==,
+ zap_add_int(os, zfsvfs->z_unlinkedobj, xattr_obj, tx));
+ }
+
+ mutex_enter(&os->os_dsl_dataset->ds_dir->dd_activity_lock);
+
+ /* Remove this znode from the unlinked set */
+ VERIFY3U(0, ==,
+ zap_remove_int(os, zfsvfs->z_unlinkedobj, zp->z_id, tx));
+
+ if (zap_count(os, zfsvfs->z_unlinkedobj, &count) == 0 && count == 0) {
+ cv_broadcast(&os->os_dsl_dataset->ds_dir->dd_activity_cv);
+ }
+
+ mutex_exit(&os->os_dsl_dataset->ds_dir->dd_activity_lock);
+
+ dataset_kstats_update_nunlinked_kstat(&zfsvfs->z_kstat, 1);
+
+ zfs_znode_delete(zp, tx);
+
+ dmu_tx_commit(tx);
+
+ if (xattr_obj) {
+ /*
+ * We're using the FreeBSD taskqueue API here instead of
+ * the Solaris taskq API since the FreeBSD API allows for a
+ * task to be enqueued multiple times but executed once.
+ */
+ taskqueue_enqueue(zfsvfs_taskq->tq_queue,
+ &zfsvfs->z_unlinked_drain_task);
+ }
+}
+
+static uint64_t
+zfs_dirent(znode_t *zp, uint64_t mode)
+{
+ uint64_t de = zp->z_id;
+
+ if (zp->z_zfsvfs->z_version >= ZPL_VERSION_DIRENT_TYPE)
+ de |= IFTODT(mode) << 60;
+ return (de);
+}
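
When the ZPL version supports typed entries, zfs_dirent() stores the DT_* file type (from IFTODT(mode)) in the top four bits of the directory ZAP value, with the object number in the low bits; zfs_match_find() above strips the type back off with ZFS_DIRENT_OBJ(). A small worked example of the packing, assuming DT_DIR has its conventional value of 4 (values are illustrative, not from the patch):

	uint64_t obj = 0x1234;
	uint64_t de  = obj | ((uint64_t)DT_DIR << 60);	/* 0x4000000000001234 */
	uint64_t typ = de >> 60;			/* 4, i.e. DT_DIR */
	uint64_t num = de & ((1ULL << 60) - 1);		/* 0x1234: the value ZFS_DIRENT_OBJ() recovers */
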
+
+/*
+ * Link zp into dzp. Can only fail if zp has been unlinked.
+ */
+int
+zfs_link_create(znode_t *dzp, const char *name, znode_t *zp, dmu_tx_t *tx,
+ int flag)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ vnode_t *vp = ZTOV(zp);
+ uint64_t value;
+ int zp_is_dir = (vp->v_type == VDIR);
+ sa_bulk_attr_t bulk[5];
+ uint64_t mtime[2], ctime[2];
+ int count = 0;
+ int error;
+
+ if (zfsvfs->z_replay == B_FALSE) {
+ ASSERT_VOP_ELOCKED(ZTOV(dzp), __func__);
+ ASSERT_VOP_ELOCKED(ZTOV(zp), __func__);
+ }
+ if (zp_is_dir) {
+ if (dzp->z_links >= ZFS_LINK_MAX)
+ return (SET_ERROR(EMLINK));
+ }
+ if (!(flag & ZRENAMING)) {
+ if (zp->z_unlinked) { /* no new links to unlinked zp */
+ ASSERT(!(flag & (ZNEW | ZEXISTS)));
+ return (SET_ERROR(ENOENT));
+ }
+ if (zp->z_links >= ZFS_LINK_MAX - zp_is_dir) {
+ return (SET_ERROR(EMLINK));
+ }
+ zp->z_links++;
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL,
+ &zp->z_links, sizeof (zp->z_links));
+
+ } else {
+ ASSERT(zp->z_unlinked == 0);
+ }
+ value = zfs_dirent(zp, zp->z_mode);
+ error = zap_add(zp->z_zfsvfs->z_os, dzp->z_id, name,
+ 8, 1, &value, tx);
+
+ /*
+ * zap_add could fail to add the entry if it exceeds the capacity of the
+ * leaf-block and zap_leaf_split() failed to help.
+ * The caller of this routine is responsible for failing the transaction
+ * which will rollback the SA updates done above.
+ */
+ if (error != 0) {
+ if (!(flag & ZRENAMING) && !(flag & ZNEW))
+ zp->z_links--;
+ return (error);
+ }
+
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL,
+ &dzp->z_id, sizeof (dzp->z_id));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+ &zp->z_pflags, sizeof (zp->z_pflags));
+
+ if (!(flag & ZNEW)) {
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
+ ctime, sizeof (ctime));
+ zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime,
+ ctime);
+ }
+ error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
+ ASSERT0(error);
+
+ dzp->z_size++;
+ dzp->z_links += zp_is_dir;
+ count = 0;
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
+ &dzp->z_size, sizeof (dzp->z_size));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL,
+ &dzp->z_links, sizeof (dzp->z_links));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
+ mtime, sizeof (mtime));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
+ ctime, sizeof (ctime));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+ &dzp->z_pflags, sizeof (dzp->z_pflags));
+ zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime);
+ error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx);
+ ASSERT0(error);
+ return (0);
+}
+
+/*
+ * The match type in the code for this function should conform to:
+ *
+ * ------------------------------------------------------------------------
+ * fs type | z_norm | lookup type | match type
+ * ---------|-------------|-------------|----------------------------------
+ * CS !norm | 0 | 0 | 0 (exact)
+ * CS norm | formX | 0 | MT_NORMALIZE
+ * CI !norm | upper | !ZCIEXACT | MT_NORMALIZE
+ * CI !norm | upper | ZCIEXACT | MT_NORMALIZE | MT_MATCH_CASE
+ * CI norm | upper|formX | !ZCIEXACT | MT_NORMALIZE
+ * CI norm | upper|formX | ZCIEXACT | MT_NORMALIZE | MT_MATCH_CASE
+ * CM !norm | upper | !ZCILOOK | MT_NORMALIZE | MT_MATCH_CASE
+ * CM !norm | upper | ZCILOOK | MT_NORMALIZE
+ * CM norm | upper|formX | !ZCILOOK | MT_NORMALIZE | MT_MATCH_CASE
+ * CM norm | upper|formX | ZCILOOK | MT_NORMALIZE
+ *
+ * Abbreviations:
+ * CS = Case Sensitive, CI = Case Insensitive, CM = Case Mixed
+ * upper = case folding set by fs type on creation (U8_TEXTPREP_TOUPPER)
+ * formX = unicode normalization form set on fs creation
+ */
+static int
+zfs_dropname(znode_t *dzp, const char *name, znode_t *zp, dmu_tx_t *tx,
+ int flag)
+{
+ int error;
+
+ if (zp->z_zfsvfs->z_norm) {
+ matchtype_t mt = MT_NORMALIZE;
+
+ if (zp->z_zfsvfs->z_case == ZFS_CASE_MIXED) {
+ mt |= MT_MATCH_CASE;
+ }
+
+ error = zap_remove_norm(zp->z_zfsvfs->z_os, dzp->z_id,
+ name, mt, tx);
+ } else {
+ error = zap_remove(zp->z_zfsvfs->z_os, dzp->z_id, name, tx);
+ }
+
+ return (error);
+}
+
+/*
+ * Unlink zp from dzp, and mark zp for deletion if this was the last link.
+ * Can fail if zp is a mount point (EBUSY) or a non-empty directory
+ * (ENOTEMPTY).
+ * If 'unlinkedp' is NULL, we put unlinked znodes on the unlinked list.
+ * If it's non-NULL, we use it to indicate whether the znode needs deletion,
+ * and it's the caller's job to do it.
+ */
+int
+zfs_link_destroy(znode_t *dzp, const char *name, znode_t *zp, dmu_tx_t *tx,
+ int flag, boolean_t *unlinkedp)
+{
+ zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+ vnode_t *vp = ZTOV(zp);
+ int zp_is_dir = (vp->v_type == VDIR);
+ boolean_t unlinked = B_FALSE;
+ sa_bulk_attr_t bulk[5];
+ uint64_t mtime[2], ctime[2];
+ int count = 0;
+ int error;
+
+ if (zfsvfs->z_replay == B_FALSE) {
+ ASSERT_VOP_ELOCKED(ZTOV(dzp), __func__);
+ ASSERT_VOP_ELOCKED(ZTOV(zp), __func__);
+ }
+ if (!(flag & ZRENAMING)) {
+
+ if (zp_is_dir && !zfs_dirempty(zp))
+ return (SET_ERROR(ENOTEMPTY));
+
+ /*
+ * If we get here, we are going to try to remove the object.
+ * First try removing the name from the directory; if that
+ * fails, return the error.
+ */
+ error = zfs_dropname(dzp, name, zp, tx, flag);
+ if (error != 0) {
+ return (error);
+ }
+
+ if (zp->z_links <= zp_is_dir) {
+ zfs_panic_recover("zfs: link count on vnode %p is %u, "
+ "should be at least %u", zp->z_vnode,
+ (int)zp->z_links,
+ zp_is_dir + 1);
+ zp->z_links = zp_is_dir + 1;
+ }
+ if (--zp->z_links == zp_is_dir) {
+ zp->z_unlinked = B_TRUE;
+ zp->z_links = 0;
+ unlinked = B_TRUE;
+ } else {
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs),
+ NULL, &ctime, sizeof (ctime));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
+ NULL, &zp->z_pflags, sizeof (zp->z_pflags));
+ zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime,
+ ctime);
+ }
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs),
+ NULL, &zp->z_links, sizeof (zp->z_links));
+ error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
+ count = 0;
+ ASSERT0(error);
+ } else {
+ ASSERT(zp->z_unlinked == 0);
+ error = zfs_dropname(dzp, name, zp, tx, flag);
+ if (error != 0)
+ return (error);
+ }
+
+ dzp->z_size--; /* one dirent removed */
+ dzp->z_links -= zp_is_dir; /* ".." link from zp */
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs),
+ NULL, &dzp->z_links, sizeof (dzp->z_links));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs),
+ NULL, &dzp->z_size, sizeof (dzp->z_size));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs),
+ NULL, ctime, sizeof (ctime));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs),
+ NULL, mtime, sizeof (mtime));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
+ NULL, &dzp->z_pflags, sizeof (dzp->z_pflags));
+ zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime);
+ error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx);
+ ASSERT0(error);
+
+ if (unlinkedp != NULL)
+ *unlinkedp = unlinked;
+ else if (unlinked)
+ zfs_unlinked_add(zp, tx);
+
+ return (0);
+}
+
+/*
+ * Indicate whether the directory is empty.
+ */
+boolean_t
+zfs_dirempty(znode_t *dzp)
+{
+ return (dzp->z_size == 2);
+}
+
+int
+zfs_make_xattrdir(znode_t *zp, vattr_t *vap, znode_t **xvpp, cred_t *cr)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ znode_t *xzp;
+ dmu_tx_t *tx;
+ int error;
+ zfs_acl_ids_t acl_ids;
+ boolean_t fuid_dirtied;
+ uint64_t parent __unused;
+
+ *xvpp = NULL;
+
+ if ((error = zfs_acl_ids_create(zp, IS_XATTR, vap, cr, NULL,
+ &acl_ids)) != 0)
+ return (error);
+ if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, 0)) {
+ zfs_acl_ids_free(&acl_ids);
+ return (SET_ERROR(EDQUOT));
+ }
+
+ getnewvnode_reserve_();
+
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
+ ZFS_SA_BASE_ATTR_SIZE);
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
+ dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
+ fuid_dirtied = zfsvfs->z_fuid_dirty;
+ if (fuid_dirtied)
+ zfs_fuid_txhold(zfsvfs, tx);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ zfs_acl_ids_free(&acl_ids);
+ dmu_tx_abort(tx);
+ getnewvnode_drop_reserve();
+ return (error);
+ }
+ zfs_mknode(zp, vap, tx, cr, IS_XATTR, &xzp, &acl_ids);
+
+ if (fuid_dirtied)
+ zfs_fuid_sync(zfsvfs, tx);
+
+#ifdef ZFS_DEBUG
+ error = sa_lookup(xzp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
+ &parent, sizeof (parent));
+ ASSERT(error == 0 && parent == zp->z_id);
+#endif
+
+ VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xzp->z_id,
+ sizeof (xzp->z_id), tx));
+
+ (void) zfs_log_create(zfsvfs->z_log, tx, TX_MKXATTR, zp,
+ xzp, "", NULL, acl_ids.z_fuidp, vap);
+
+ zfs_acl_ids_free(&acl_ids);
+ dmu_tx_commit(tx);
+
+ getnewvnode_drop_reserve();
+
+ *xvpp = xzp;
+
+ return (0);
+}
+
+/*
+ * Return a znode for the extended attribute directory for zp.
+ * ** If the directory does not already exist, it is created **
+ *
+ * IN: zp - znode to obtain attribute directory from
+ * cr - credentials of caller
+ * flags - flags from the VOP_LOOKUP call
+ *
+ * OUT: xzpp - pointer to extended attribute znode
+ *
+ * RETURN: 0 on success
+ * error number on failure
+ */
+int
+zfs_get_xattrdir(znode_t *zp, znode_t **xzpp, cred_t *cr, int flags)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ znode_t *xzp;
+ vattr_t va;
+ int error;
+top:
+ error = zfs_dirent_lookup(zp, "", &xzp, ZXATTR);
+ if (error)
+ return (error);
+
+ if (xzp != NULL) {
+ *xzpp = xzp;
+ return (0);
+ }
+
+
+ if (!(flags & CREATE_XATTR_DIR))
+ return (SET_ERROR(ENOATTR));
+
+ if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
+ return (SET_ERROR(EROFS));
+ }
+
+ /*
+ * The ability to 'create' files in an attribute
+ * directory comes from the write_xattr permission on the base file.
+ *
+ * The ability to 'search' an attribute directory requires
+ * read_xattr permission on the base file.
+ *
+ * Once in a directory the ability to read/write attributes
+ * is controlled by the permissions on the attribute file.
+ */
+ va.va_mask = AT_MODE | AT_UID | AT_GID;
+ va.va_type = VDIR;
+ va.va_mode = S_IFDIR | S_ISVTX | 0777;
+ zfs_fuid_map_ids(zp, cr, &va.va_uid, &va.va_gid);
+
+ error = zfs_make_xattrdir(zp, &va, xzpp, cr);
+
+ if (error == ERESTART) {
+ /* NB: we already did dmu_tx_wait() if necessary */
+ goto top;
+ }
+ if (error == 0)
+ VOP_UNLOCK1(ZTOV(*xzpp));
+
+ return (error);
+}
+
+/*
+ * Decide whether it is okay to remove within a sticky directory.
+ *
+ * In sticky directories, write access is not sufficient;
+ * you can remove entries from a directory only if:
+ *
+ * you own the directory,
+ * you own the entry,
+ * the entry is a plain file and you have write access,
+ * or you are privileged (checked in secpolicy...).
+ *
+ * The function returns 0 if remove access is granted.
+ */
+int
+zfs_sticky_remove_access(znode_t *zdp, znode_t *zp, cred_t *cr)
+{
+ uid_t uid;
+ uid_t downer;
+ uid_t fowner;
+ zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
+
+ if (zdp->z_zfsvfs->z_replay)
+ return (0);
+
+ if ((zdp->z_mode & S_ISVTX) == 0)
+ return (0);
+
+ downer = zfs_fuid_map_id(zfsvfs, zdp->z_uid, cr, ZFS_OWNER);
+ fowner = zfs_fuid_map_id(zfsvfs, zp->z_uid, cr, ZFS_OWNER);
+
+ if ((uid = crgetuid(cr)) == downer || uid == fowner ||
+ (ZTOV(zp)->v_type == VREG &&
+ zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr) == 0))
+ return (0);
+ else
+ return (secpolicy_vnode_remove(ZTOV(zp), cr));
+}
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_file_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_file_os.c
new file mode 100644
index 000000000000..06546c12e420
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_file_os.c
@@ -0,0 +1,308 @@
+/*
+ * Copyright (c) 2020 iXsystems, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/dmu.h>
+#include <sys/dmu_impl.h>
+#include <sys/dmu_recv.h>
+#include <sys/dmu_tx.h>
+#include <sys/dbuf.h>
+#include <sys/dnode.h>
+#include <sys/zfs_context.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_traverse.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_synctask.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zap.h>
+#include <sys/zio_checksum.h>
+#include <sys/zfs_znode.h>
+#include <sys/zfs_file.h>
+#include <sys/buf.h>
+#include <sys/stat.h>
+
+int
+zfs_file_open(const char *path, int flags, int mode, zfs_file_t **fpp)
+{
+ struct thread *td;
+ int rc, fd;
+
+ td = curthread;
+ pwd_ensure_dirs();
+ /* 12.x doesn't take a const char * */
+ rc = kern_openat(td, AT_FDCWD, __DECONST(char *, path),
+ UIO_SYSSPACE, flags, mode);
+ if (rc)
+ return (SET_ERROR(rc));
+ fd = td->td_retval[0];
+ td->td_retval[0] = 0;
+ if (fget(curthread, fd, &cap_no_rights, fpp))
+ kern_close(td, fd);
+ return (0);
+}
+
+void
+zfs_file_close(zfs_file_t *fp)
+{
+ fo_close(fp, curthread);
+}
+
+static int
+zfs_file_write_impl(zfs_file_t *fp, const void *buf, size_t count, loff_t *offp,
+ ssize_t *resid)
+{
+ ssize_t rc;
+ struct uio auio;
+ struct thread *td;
+ struct iovec aiov;
+
+ td = curthread;
+ aiov.iov_base = (void *)(uintptr_t)buf;
+ aiov.iov_len = count;
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_segflg = UIO_SYSSPACE;
+ auio.uio_resid = count;
+ auio.uio_rw = UIO_WRITE;
+ auio.uio_td = td;
+ auio.uio_offset = *offp;
+
+ if ((fp->f_flag & FWRITE) == 0)
+ return (SET_ERROR(EBADF));
+
+ if (fp->f_type == DTYPE_VNODE)
+ bwillwrite();
+
+ rc = fo_write(fp, &auio, td->td_ucred, FOF_OFFSET, td);
+ if (rc)
+ return (SET_ERROR(rc));
+ if (resid)
+ *resid = auio.uio_resid;
+ else if (auio.uio_resid)
+ return (SET_ERROR(EIO));
+ *offp += count - auio.uio_resid;
+ return (rc);
+}
+
+int
+zfs_file_write(zfs_file_t *fp, const void *buf, size_t count, ssize_t *resid)
+{
+ loff_t off = fp->f_offset;
+ ssize_t rc;
+
+ rc = zfs_file_write_impl(fp, buf, count, &off, resid);
+ if (rc == 0)
+ fp->f_offset = off;
+
+ return (SET_ERROR(rc));
+}
+
+int
+zfs_file_pwrite(zfs_file_t *fp, const void *buf, size_t count, loff_t off,
+ ssize_t *resid)
+{
+ return (zfs_file_write_impl(fp, buf, count, &off, resid));
+}
+
+static int
+zfs_file_read_impl(zfs_file_t *fp, void *buf, size_t count, loff_t *offp,
+ ssize_t *resid)
+{
+ ssize_t rc;
+ struct uio auio;
+ struct thread *td;
+ struct iovec aiov;
+
+ td = curthread;
+ aiov.iov_base = (void *)(uintptr_t)buf;
+ aiov.iov_len = count;
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_segflg = UIO_SYSSPACE;
+ auio.uio_resid = count;
+ auio.uio_rw = UIO_READ;
+ auio.uio_td = td;
+ auio.uio_offset = *offp;
+
+ if ((fp->f_flag & FREAD) == 0)
+ return (SET_ERROR(EBADF));
+
+ rc = fo_read(fp, &auio, td->td_ucred, FOF_OFFSET, td);
+ if (rc)
+ return (SET_ERROR(rc));
+ if (resid)
+ *resid = auio.uio_resid;
+ *offp += count - auio.uio_resid;
+ return (SET_ERROR(0));
+}
+
+int
+zfs_file_read(zfs_file_t *fp, void *buf, size_t count, ssize_t *resid)
+{
+ loff_t off = fp->f_offset;
+ ssize_t rc;
+
+ rc = zfs_file_read_impl(fp, buf, count, &off, resid);
+ if (rc == 0)
+ fp->f_offset = off;
+ return (rc);
+}
+
+int
+zfs_file_pread(zfs_file_t *fp, void *buf, size_t count, loff_t off,
+ ssize_t *resid)
+{
+ return (zfs_file_read_impl(fp, buf, count, &off, resid));
+}
+
+int
+zfs_file_seek(zfs_file_t *fp, loff_t *offp, int whence)
+{
+ int rc;
+ struct thread *td;
+
+ td = curthread;
+ if ((fp->f_ops->fo_flags & DFLAG_SEEKABLE) == 0)
+ return (SET_ERROR(ESPIPE));
+ rc = fo_seek(fp, *offp, whence, td);
+ if (rc == 0)
+ *offp = td->td_uretoff.tdu_off;
+ return (SET_ERROR(rc));
+}
+
+int
+zfs_file_getattr(zfs_file_t *fp, zfs_file_attr_t *zfattr)
+{
+ struct thread *td;
+ struct stat sb;
+ int rc;
+
+ td = curthread;
+
+ rc = fo_stat(fp, &sb, td->td_ucred, td);
+ if (rc)
+ return (SET_ERROR(rc));
+ zfattr->zfa_size = sb.st_size;
+ zfattr->zfa_mode = sb.st_mode;
+
+ return (0);
+}
+
+static __inline int
+zfs_vop_fsync(vnode_t *vp)
+{
+ struct mount *mp;
+ int error;
+
+ if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
+ goto drop;
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+ error = VOP_FSYNC(vp, MNT_WAIT, curthread);
+ VOP_UNLOCK1(vp);
+ vn_finished_write(mp);
+drop:
+ return (SET_ERROR(error));
+}
+
+int
+zfs_file_fsync(zfs_file_t *fp, int flags)
+{
+ if (fp->f_type != DTYPE_VNODE)
+ return (EINVAL);
+
+ return (zfs_vop_fsync(fp->f_vnode));
+}
+
+int
+zfs_file_get(int fd, zfs_file_t **fpp)
+{
+ struct file *fp;
+
+ if (fget(curthread, fd, &cap_no_rights, &fp))
+ return (SET_ERROR(EBADF));
+
+ *fpp = fp;
+ return (0);
+}
+
+void
+zfs_file_put(int fd)
+{
+ struct file *fp;
+
+ /* No CAP_ rights required, as we're only releasing. */
+ if (fget(curthread, fd, &cap_no_rights, &fp) == 0) {
+ fdrop(fp, curthread);
+ fdrop(fp, curthread);
+ }
+}
+
+loff_t
+zfs_file_off(zfs_file_t *fp)
+{
+ return (fp->f_offset);
+}
+
+void *
+zfs_file_private(zfs_file_t *fp)
+{
+ file_t *tmpfp;
+ void *data;
+ int error;
+
+ tmpfp = curthread->td_fpop;
+ curthread->td_fpop = fp;
+ error = devfs_get_cdevpriv(&data);
+ curthread->td_fpop = tmpfp;
+ if (error != 0)
+ return (NULL);
+ return (data);
+}
+
+int
+zfs_file_unlink(const char *fnamep)
+{
+ zfs_uio_seg_t seg = UIO_SYSSPACE;
+ int rc;
+
+#if __FreeBSD_version >= 1300018
+ rc = kern_funlinkat(curthread, AT_FDCWD, fnamep, FD_NONE, seg, 0, 0);
+#else
+#ifdef AT_BENEATH
+ rc = kern_unlinkat(curthread, AT_FDCWD, __DECONST(char *, fnamep),
+ seg, 0, 0);
+#else
+ rc = kern_unlinkat(curthread, AT_FDCWD, __DECONST(char *, fnamep),
+ seg, 0);
+#endif
+#endif
+ return (SET_ERROR(rc));
+}
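
Taken together, these wrappers give the rest of the module a small kernel-side file API on top of FreeBSD's fo_*() operations. A minimal usage sketch (not code from the patch; the path, flags and buffer are placeholders and error handling is trimmed):

static int
example_write_file(const char *path, const void *buf, size_t len)
{
	zfs_file_t *fp;
	ssize_t resid;
	int error;

	error = zfs_file_open(path, O_CREAT | O_WRONLY, 0644, &fp);
	if (error != 0)
		return (error);

	/* Write at offset 0 without moving fp's cached file offset. */
	error = zfs_file_pwrite(fp, buf, len, 0, &resid);

	zfs_file_close(fp);
	return (error);
}
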
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ioctl_compat.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ioctl_compat.c
new file mode 100644
index 000000000000..81967bed73f9
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ioctl_compat.c
@@ -0,0 +1,363 @@
+/*
+ * Copyright (c) 2020 iXsystems, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/conf.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/errno.h>
+#include <sys/cmn_err.h>
+#include <sys/zfs_ioctl_compat.h>
+
+enum zfs_ioc_legacy {
+ ZFS_IOC_LEGACY_NONE = -1,
+ ZFS_IOC_LEGACY_FIRST = 0,
+ ZFS_LEGACY_IOC = ZFS_IOC_LEGACY_FIRST,
+ ZFS_IOC_LEGACY_POOL_CREATE = ZFS_IOC_LEGACY_FIRST,
+ ZFS_IOC_LEGACY_POOL_DESTROY,
+ ZFS_IOC_LEGACY_POOL_IMPORT,
+ ZFS_IOC_LEGACY_POOL_EXPORT,
+ ZFS_IOC_LEGACY_POOL_CONFIGS,
+ ZFS_IOC_LEGACY_POOL_STATS,
+ ZFS_IOC_LEGACY_POOL_TRYIMPORT,
+ ZFS_IOC_LEGACY_POOL_SCAN,
+ ZFS_IOC_LEGACY_POOL_FREEZE,
+ ZFS_IOC_LEGACY_POOL_UPGRADE,
+ ZFS_IOC_LEGACY_POOL_GET_HISTORY,
+ ZFS_IOC_LEGACY_VDEV_ADD,
+ ZFS_IOC_LEGACY_VDEV_REMOVE,
+ ZFS_IOC_LEGACY_VDEV_SET_STATE,
+ ZFS_IOC_LEGACY_VDEV_ATTACH,
+ ZFS_IOC_LEGACY_VDEV_DETACH,
+ ZFS_IOC_LEGACY_VDEV_SETPATH,
+ ZFS_IOC_LEGACY_VDEV_SETFRU,
+ ZFS_IOC_LEGACY_OBJSET_STATS,
+ ZFS_IOC_LEGACY_OBJSET_ZPLPROPS,
+ ZFS_IOC_LEGACY_DATASET_LIST_NEXT,
+ ZFS_IOC_LEGACY_SNAPSHOT_LIST_NEXT,
+ ZFS_IOC_LEGACY_SET_PROP,
+ ZFS_IOC_LEGACY_CREATE,
+ ZFS_IOC_LEGACY_DESTROY,
+ ZFS_IOC_LEGACY_ROLLBACK,
+ ZFS_IOC_LEGACY_RENAME,
+ ZFS_IOC_LEGACY_RECV,
+ ZFS_IOC_LEGACY_SEND,
+ ZFS_IOC_LEGACY_INJECT_FAULT,
+ ZFS_IOC_LEGACY_CLEAR_FAULT,
+ ZFS_IOC_LEGACY_INJECT_LIST_NEXT,
+ ZFS_IOC_LEGACY_ERROR_LOG,
+ ZFS_IOC_LEGACY_CLEAR,
+ ZFS_IOC_LEGACY_PROMOTE,
+ ZFS_IOC_LEGACY_DESTROY_SNAPS,
+ ZFS_IOC_LEGACY_SNAPSHOT,
+ ZFS_IOC_LEGACY_DSOBJ_TO_DSNAME,
+ ZFS_IOC_LEGACY_OBJ_TO_PATH,
+ ZFS_IOC_LEGACY_POOL_SET_PROPS,
+ ZFS_IOC_LEGACY_POOL_GET_PROPS,
+ ZFS_IOC_LEGACY_SET_FSACL,
+ ZFS_IOC_LEGACY_GET_FSACL,
+ ZFS_IOC_LEGACY_SHARE,
+ ZFS_IOC_LEGACY_INHERIT_PROP,
+ ZFS_IOC_LEGACY_SMB_ACL,
+ ZFS_IOC_LEGACY_USERSPACE_ONE,
+ ZFS_IOC_LEGACY_USERSPACE_MANY,
+ ZFS_IOC_LEGACY_USERSPACE_UPGRADE,
+ ZFS_IOC_LEGACY_HOLD,
+ ZFS_IOC_LEGACY_RELEASE,
+ ZFS_IOC_LEGACY_GET_HOLDS,
+ ZFS_IOC_LEGACY_OBJSET_RECVD_PROPS,
+ ZFS_IOC_LEGACY_VDEV_SPLIT,
+ ZFS_IOC_LEGACY_NEXT_OBJ,
+ ZFS_IOC_LEGACY_DIFF,
+ ZFS_IOC_LEGACY_TMP_SNAPSHOT,
+ ZFS_IOC_LEGACY_OBJ_TO_STATS,
+ ZFS_IOC_LEGACY_JAIL,
+ ZFS_IOC_LEGACY_UNJAIL,
+ ZFS_IOC_LEGACY_POOL_REGUID,
+ ZFS_IOC_LEGACY_SPACE_WRITTEN,
+ ZFS_IOC_LEGACY_SPACE_SNAPS,
+ ZFS_IOC_LEGACY_SEND_PROGRESS,
+ ZFS_IOC_LEGACY_POOL_REOPEN,
+ ZFS_IOC_LEGACY_LOG_HISTORY,
+ ZFS_IOC_LEGACY_SEND_NEW,
+ ZFS_IOC_LEGACY_SEND_SPACE,
+ ZFS_IOC_LEGACY_CLONE,
+ ZFS_IOC_LEGACY_BOOKMARK,
+ ZFS_IOC_LEGACY_GET_BOOKMARKS,
+ ZFS_IOC_LEGACY_DESTROY_BOOKMARKS,
+ ZFS_IOC_LEGACY_NEXTBOOT,
+ ZFS_IOC_LEGACY_CHANNEL_PROGRAM,
+ ZFS_IOC_LEGACY_REMAP,
+ ZFS_IOC_LEGACY_POOL_CHECKPOINT,
+ ZFS_IOC_LEGACY_POOL_DISCARD_CHECKPOINT,
+ ZFS_IOC_LEGACY_POOL_INITIALIZE,
+ ZFS_IOC_LEGACY_POOL_SYNC,
+ ZFS_IOC_LEGACY_LAST
+};
+
+unsigned static long zfs_ioctl_legacy_to_ozfs_[] = {
+ ZFS_IOC_POOL_CREATE, /* 0x00 */
+ ZFS_IOC_POOL_DESTROY, /* 0x01 */
+ ZFS_IOC_POOL_IMPORT, /* 0x02 */
+ ZFS_IOC_POOL_EXPORT, /* 0x03 */
+ ZFS_IOC_POOL_CONFIGS, /* 0x04 */
+ ZFS_IOC_POOL_STATS, /* 0x05 */
+ ZFS_IOC_POOL_TRYIMPORT, /* 0x06 */
+ ZFS_IOC_POOL_SCAN, /* 0x07 */
+ ZFS_IOC_POOL_FREEZE, /* 0x08 */
+ ZFS_IOC_POOL_UPGRADE, /* 0x09 */
+ ZFS_IOC_POOL_GET_HISTORY, /* 0x0a */
+ ZFS_IOC_VDEV_ADD, /* 0x0b */
+ ZFS_IOC_VDEV_REMOVE, /* 0x0c */
+ ZFS_IOC_VDEV_SET_STATE, /* 0x0d */
+ ZFS_IOC_VDEV_ATTACH, /* 0x0e */
+ ZFS_IOC_VDEV_DETACH, /* 0x0f */
+ ZFS_IOC_VDEV_SETPATH, /* 0x10 */
+ ZFS_IOC_VDEV_SETFRU, /* 0x11 */
+ ZFS_IOC_OBJSET_STATS, /* 0x12 */
+ ZFS_IOC_OBJSET_ZPLPROPS, /* 0x13 */
+ ZFS_IOC_DATASET_LIST_NEXT, /* 0x14 */
+ ZFS_IOC_SNAPSHOT_LIST_NEXT, /* 0x15 */
+ ZFS_IOC_SET_PROP, /* 0x16 */
+ ZFS_IOC_CREATE, /* 0x17 */
+ ZFS_IOC_DESTROY, /* 0x18 */
+ ZFS_IOC_ROLLBACK, /* 0x19 */
+ ZFS_IOC_RENAME, /* 0x1a */
+ ZFS_IOC_RECV, /* 0x1b */
+ ZFS_IOC_SEND, /* 0x1c */
+ ZFS_IOC_INJECT_FAULT, /* 0x1d */
+ ZFS_IOC_CLEAR_FAULT, /* 0x1e */
+ ZFS_IOC_INJECT_LIST_NEXT, /* 0x1f */
+ ZFS_IOC_ERROR_LOG, /* 0x20 */
+ ZFS_IOC_CLEAR, /* 0x21 */
+ ZFS_IOC_PROMOTE, /* 0x22 */
+ /* start of mismatch */
+
+ ZFS_IOC_DESTROY_SNAPS, /* 0x23:0x3b */
+ ZFS_IOC_SNAPSHOT, /* 0x24:0x23 */
+ ZFS_IOC_DSOBJ_TO_DSNAME, /* 0x25:0x24 */
+ ZFS_IOC_OBJ_TO_PATH, /* 0x26:0x25 */
+ ZFS_IOC_POOL_SET_PROPS, /* 0x27:0x26 */
+ ZFS_IOC_POOL_GET_PROPS, /* 0x28:0x27 */
+ ZFS_IOC_SET_FSACL, /* 0x29:0x28 */
+ ZFS_IOC_GET_FSACL, /* 0x30:0x29 */
+ ZFS_IOC_SHARE, /* 0x2b:0x2a */
+ ZFS_IOC_INHERIT_PROP, /* 0x2c:0x2b */
+ ZFS_IOC_SMB_ACL, /* 0x2d:0x2c */
+ ZFS_IOC_USERSPACE_ONE, /* 0x2e:0x2d */
+ ZFS_IOC_USERSPACE_MANY, /* 0x2f:0x2e */
+ ZFS_IOC_USERSPACE_UPGRADE, /* 0x30:0x2f */
+ ZFS_IOC_HOLD, /* 0x31:0x30 */
+ ZFS_IOC_RELEASE, /* 0x32:0x31 */
+ ZFS_IOC_GET_HOLDS, /* 0x33:0x32 */
+ ZFS_IOC_OBJSET_RECVD_PROPS, /* 0x34:0x33 */
+ ZFS_IOC_VDEV_SPLIT, /* 0x35:0x34 */
+ ZFS_IOC_NEXT_OBJ, /* 0x36:0x35 */
+ ZFS_IOC_DIFF, /* 0x37:0x36 */
+ ZFS_IOC_TMP_SNAPSHOT, /* 0x38:0x37 */
+ ZFS_IOC_OBJ_TO_STATS, /* 0x39:0x38 */
+ ZFS_IOC_JAIL, /* 0x3a:0xc2 */
+ ZFS_IOC_UNJAIL, /* 0x3b:0xc3 */
+ ZFS_IOC_POOL_REGUID, /* 0x3c:0x3c */
+ ZFS_IOC_SPACE_WRITTEN, /* 0x3d:0x39 */
+ ZFS_IOC_SPACE_SNAPS, /* 0x3e:0x3a */
+ ZFS_IOC_SEND_PROGRESS, /* 0x3f:0x3e */
+ ZFS_IOC_POOL_REOPEN, /* 0x40:0x3d */
+ ZFS_IOC_LOG_HISTORY, /* 0x41:0x3f */
+ ZFS_IOC_SEND_NEW, /* 0x42:0x40 */
+ ZFS_IOC_SEND_SPACE, /* 0x43:0x41 */
+ ZFS_IOC_CLONE, /* 0x44:0x42 */
+ ZFS_IOC_BOOKMARK, /* 0x45:0x43 */
+ ZFS_IOC_GET_BOOKMARKS, /* 0x46:0x44 */
+ ZFS_IOC_DESTROY_BOOKMARKS, /* 0x47:0x45 */
+ ZFS_IOC_NEXTBOOT, /* 0x48:0xc1 */
+ ZFS_IOC_CHANNEL_PROGRAM, /* 0x49:0x48 */
+ ZFS_IOC_REMAP, /* 0x4a:0x4c */
+ ZFS_IOC_POOL_CHECKPOINT, /* 0x4b:0x4d */
+ ZFS_IOC_POOL_DISCARD_CHECKPOINT, /* 0x4c:0x4e */
+ ZFS_IOC_POOL_INITIALIZE, /* 0x4d:0x4f */
+};
+
+unsigned static long zfs_ioctl_ozfs_to_legacy_common_[] = {
+ ZFS_IOC_POOL_CREATE, /* 0x00 */
+ ZFS_IOC_POOL_DESTROY, /* 0x01 */
+ ZFS_IOC_POOL_IMPORT, /* 0x02 */
+ ZFS_IOC_POOL_EXPORT, /* 0x03 */
+ ZFS_IOC_POOL_CONFIGS, /* 0x04 */
+ ZFS_IOC_POOL_STATS, /* 0x05 */
+ ZFS_IOC_POOL_TRYIMPORT, /* 0x06 */
+ ZFS_IOC_POOL_SCAN, /* 0x07 */
+ ZFS_IOC_POOL_FREEZE, /* 0x08 */
+ ZFS_IOC_POOL_UPGRADE, /* 0x09 */
+ ZFS_IOC_POOL_GET_HISTORY, /* 0x0a */
+ ZFS_IOC_VDEV_ADD, /* 0x0b */
+ ZFS_IOC_VDEV_REMOVE, /* 0x0c */
+ ZFS_IOC_VDEV_SET_STATE, /* 0x0d */
+ ZFS_IOC_VDEV_ATTACH, /* 0x0e */
+ ZFS_IOC_VDEV_DETACH, /* 0x0f */
+ ZFS_IOC_VDEV_SETPATH, /* 0x10 */
+ ZFS_IOC_VDEV_SETFRU, /* 0x11 */
+ ZFS_IOC_OBJSET_STATS, /* 0x12 */
+ ZFS_IOC_OBJSET_ZPLPROPS, /* 0x13 */
+ ZFS_IOC_DATASET_LIST_NEXT, /* 0x14 */
+ ZFS_IOC_SNAPSHOT_LIST_NEXT, /* 0x15 */
+ ZFS_IOC_SET_PROP, /* 0x16 */
+ ZFS_IOC_CREATE, /* 0x17 */
+ ZFS_IOC_DESTROY, /* 0x18 */
+ ZFS_IOC_ROLLBACK, /* 0x19 */
+ ZFS_IOC_RENAME, /* 0x1a */
+ ZFS_IOC_RECV, /* 0x1b */
+ ZFS_IOC_SEND, /* 0x1c */
+ ZFS_IOC_INJECT_FAULT, /* 0x1d */
+ ZFS_IOC_CLEAR_FAULT, /* 0x1e */
+ ZFS_IOC_INJECT_LIST_NEXT, /* 0x1f */
+ ZFS_IOC_ERROR_LOG, /* 0x20 */
+ ZFS_IOC_CLEAR, /* 0x21 */
+ ZFS_IOC_PROMOTE, /* 0x22 */
+ /* start of mismatch */
+ ZFS_IOC_LEGACY_SNAPSHOT, /* 0x23 */
+ ZFS_IOC_LEGACY_DSOBJ_TO_DSNAME, /* 0x24 */
+ ZFS_IOC_LEGACY_OBJ_TO_PATH, /* 0x25 */
+ ZFS_IOC_LEGACY_POOL_SET_PROPS, /* 0x26 */
+ ZFS_IOC_LEGACY_POOL_GET_PROPS, /* 0x27 */
+ ZFS_IOC_LEGACY_SET_FSACL, /* 0x28 */
+ ZFS_IOC_LEGACY_GET_FSACL, /* 0x29 */
+ ZFS_IOC_LEGACY_SHARE, /* 0x2a */
+ ZFS_IOC_LEGACY_INHERIT_PROP, /* 0x2b */
+ ZFS_IOC_LEGACY_SMB_ACL, /* 0x2c */
+ ZFS_IOC_LEGACY_USERSPACE_ONE, /* 0x2d */
+ ZFS_IOC_LEGACY_USERSPACE_MANY, /* 0x2e */
+ ZFS_IOC_LEGACY_USERSPACE_UPGRADE, /* 0x2f */
+ ZFS_IOC_LEGACY_HOLD, /* 0x30 */
+ ZFS_IOC_LEGACY_RELEASE, /* 0x31 */
+ ZFS_IOC_LEGACY_GET_HOLDS, /* 0x32 */
+ ZFS_IOC_LEGACY_OBJSET_RECVD_PROPS, /* 0x33 */
+ ZFS_IOC_LEGACY_VDEV_SPLIT, /* 0x34 */
+ ZFS_IOC_LEGACY_NEXT_OBJ, /* 0x35 */
+ ZFS_IOC_LEGACY_DIFF, /* 0x36 */
+ ZFS_IOC_LEGACY_TMP_SNAPSHOT, /* 0x37 */
+ ZFS_IOC_LEGACY_OBJ_TO_STATS, /* 0x38 */
+ ZFS_IOC_LEGACY_SPACE_WRITTEN, /* 0x39 */
+ ZFS_IOC_LEGACY_SPACE_SNAPS, /* 0x3a */
+ ZFS_IOC_LEGACY_DESTROY_SNAPS, /* 0x3b */
+ ZFS_IOC_LEGACY_POOL_REGUID, /* 0x3c */
+ ZFS_IOC_LEGACY_POOL_REOPEN, /* 0x3d */
+ ZFS_IOC_LEGACY_SEND_PROGRESS, /* 0x3e */
+ ZFS_IOC_LEGACY_LOG_HISTORY, /* 0x3f */
+ ZFS_IOC_LEGACY_SEND_NEW, /* 0x40 */
+ ZFS_IOC_LEGACY_SEND_SPACE, /* 0x41 */
+ ZFS_IOC_LEGACY_CLONE, /* 0x42 */
+ ZFS_IOC_LEGACY_BOOKMARK, /* 0x43 */
+ ZFS_IOC_LEGACY_GET_BOOKMARKS, /* 0x44 */
+ ZFS_IOC_LEGACY_DESTROY_BOOKMARKS, /* 0x45 */
+ ZFS_IOC_LEGACY_NONE, /* ZFS_IOC_RECV_NEW */
+ ZFS_IOC_LEGACY_POOL_SYNC, /* 0x47 */
+ ZFS_IOC_LEGACY_CHANNEL_PROGRAM, /* 0x48 */
+ ZFS_IOC_LEGACY_NONE, /* ZFS_IOC_LOAD_KEY */
+ ZFS_IOC_LEGACY_NONE, /* ZFS_IOC_UNLOAD_KEY */
+ ZFS_IOC_LEGACY_NONE, /* ZFS_IOC_CHANGE_KEY */
+ ZFS_IOC_LEGACY_REMAP, /* 0x4c */
+ ZFS_IOC_LEGACY_POOL_CHECKPOINT, /* 0x4d */
+ ZFS_IOC_LEGACY_POOL_DISCARD_CHECKPOINT, /* 0x4e */
+ ZFS_IOC_LEGACY_POOL_INITIALIZE, /* 0x4f */
+ ZFS_IOC_LEGACY_NONE, /* ZFS_IOC_POOL_TRIM */
+ ZFS_IOC_LEGACY_NONE, /* ZFS_IOC_REDACT */
+ ZFS_IOC_LEGACY_NONE, /* ZFS_IOC_GET_BOOKMARK_PROPS */
+ ZFS_IOC_LEGACY_NONE, /* ZFS_IOC_WAIT */
+ ZFS_IOC_LEGACY_NONE, /* ZFS_IOC_WAIT_FS */
+};
+
+unsigned static long zfs_ioctl_ozfs_to_legacy_platform_[] = {
+ ZFS_IOC_LEGACY_NONE, /* ZFS_IOC_EVENTS_NEXT */
+ ZFS_IOC_LEGACY_NONE, /* ZFS_IOC_EVENTS_CLEAR */
+ ZFS_IOC_LEGACY_NONE, /* ZFS_IOC_EVENTS_SEEK */
+ ZFS_IOC_LEGACY_NEXTBOOT,
+ ZFS_IOC_LEGACY_JAIL,
+ ZFS_IOC_LEGACY_UNJAIL,
+ ZFS_IOC_LEGACY_NONE, /* ZFS_IOC_SET_BOOTENV */
+ ZFS_IOC_LEGACY_NONE, /* ZFS_IOC_GET_BOOTENV */
+};
+
+int
+zfs_ioctl_legacy_to_ozfs(int request)
+{
+ if (request >= sizeof (zfs_ioctl_legacy_to_ozfs_)/sizeof (long))
+ return (-1);
+ return (zfs_ioctl_legacy_to_ozfs_[request]);
+}
+
+int
+zfs_ioctl_ozfs_to_legacy(int request)
+{
+ if (request > ZFS_IOC_LAST)
+ return (-1);
+
+ if (request > ZFS_IOC_PLATFORM) {
+ request -= ZFS_IOC_PLATFORM + 1;
+ return (zfs_ioctl_ozfs_to_legacy_platform_[request]);
+ }
+ if (request >= sizeof (zfs_ioctl_ozfs_to_legacy_common_)/sizeof (long))
+ return (-1);
+ return (zfs_ioctl_ozfs_to_legacy_common_[request]);
+}
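
Both directions are table lookups, and anything outside the tables (or mapped to ZFS_IOC_LEGACY_NONE) comes back as -1, which a caller has to treat as "no equivalent ioctl". A hedged sketch of that checking pattern; the helper name and the ENOTSUP choice are illustrative, not lifted from the dispatcher:

static int
example_translate_request(int legacy_request, int *ozfs_request)
{
	*ozfs_request = zfs_ioctl_legacy_to_ozfs(legacy_request);
	if (*ozfs_request == -1) {
		/* Legacy FreeBSD ioctl with no OpenZFS equivalent. */
		return (SET_ERROR(ENOTSUP));
	}
	return (0);
}
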
+
+void
+zfs_cmd_legacy_to_ozfs(zfs_cmd_legacy_t *src, zfs_cmd_t *dst)
+{
+ memcpy(dst, src, offsetof(zfs_cmd_t, zc_objset_stats));
+ *&dst->zc_objset_stats = *&src->zc_objset_stats;
+ memcpy(&dst->zc_begin_record, &src->zc_begin_record,
+ offsetof(zfs_cmd_t, zc_sendobj) -
+ offsetof(zfs_cmd_t, zc_begin_record));
+ memcpy(&dst->zc_sendobj, &src->zc_sendobj,
+ sizeof (zfs_cmd_t) - 8 - offsetof(zfs_cmd_t, zc_sendobj));
+ dst->zc_zoneid = src->zc_jailid;
+}
+
+void
+zfs_cmd_ozfs_to_legacy(zfs_cmd_t *src, zfs_cmd_legacy_t *dst)
+{
+ memcpy(dst, src, offsetof(zfs_cmd_t, zc_objset_stats));
+ *&dst->zc_objset_stats = *&src->zc_objset_stats;
+ *&dst->zc_begin_record.drr_u.drr_begin = *&src->zc_begin_record;
+ dst->zc_begin_record.drr_payloadlen = 0;
+ dst->zc_begin_record.drr_type = 0;
+
+ memcpy(&dst->zc_inject_record, &src->zc_inject_record,
+ offsetof(zfs_cmd_t, zc_sendobj) -
+ offsetof(zfs_cmd_t, zc_inject_record));
+ dst->zc_resumable = B_FALSE;
+ memcpy(&dst->zc_sendobj, &src->zc_sendobj,
+ sizeof (zfs_cmd_t) - 8 - offsetof(zfs_cmd_t, zc_sendobj));
+ dst->zc_jailid = src->zc_zoneid;
+}
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ioctl_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ioctl_os.c
new file mode 100644
index 000000000000..0e0c16033b15
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_ioctl_os.c
@@ -0,0 +1,161 @@
+/*
+ * Copyright (c) 2020 iXsystems, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <sys/errno.h>
+#include <sys/nvpair.h>
+#include <sys/spa_impl.h>
+#include <sys/vdev_os.h>
+#include <sys/zfs_vfsops.h>
+#include <sys/zone.h>
+#include <vm/vm_pageout.h>
+
+#include <sys/zfs_ioctl_impl.h>
+
+#if __FreeBSD_version < 1201517
+#define vm_page_max_user_wired vm_page_max_wired
+#endif
+
+int
+zfs_vfs_ref(zfsvfs_t **zfvp)
+{
+ int error = 0;
+
+ if (*zfvp == NULL)
+ return (SET_ERROR(ESRCH));
+
+ error = vfs_busy((*zfvp)->z_vfs, 0);
+ if (error != 0) {
+ *zfvp = NULL;
+ error = SET_ERROR(ESRCH);
+ }
+ return (error);
+}
+
+int
+zfs_vfs_held(zfsvfs_t *zfsvfs)
+{
+ return (zfsvfs->z_vfs != NULL);
+}
+
+void
+zfs_vfs_rele(zfsvfs_t *zfsvfs)
+{
+ vfs_unbusy(zfsvfs->z_vfs);
+}
+
+static const zfs_ioc_key_t zfs_keys_nextboot[] = {
+ {"command", DATA_TYPE_STRING, 0},
+ { ZPOOL_CONFIG_POOL_GUID, DATA_TYPE_UINT64, 0},
+ { ZPOOL_CONFIG_GUID, DATA_TYPE_UINT64, 0}
+};
+
+static int
+zfs_ioc_jail(zfs_cmd_t *zc)
+{
+
+ return (zone_dataset_attach(curthread->td_ucred, zc->zc_name,
+ (int)zc->zc_zoneid));
+}
+
+static int
+zfs_ioc_unjail(zfs_cmd_t *zc)
+{
+
+ return (zone_dataset_detach(curthread->td_ucred, zc->zc_name,
+ (int)zc->zc_zoneid));
+}
+
+static int
+zfs_ioc_nextboot(const char *unused, nvlist_t *innvl, nvlist_t *outnvl)
+{
+ char name[MAXNAMELEN];
+ spa_t *spa;
+ vdev_t *vd;
+ char *command;
+ uint64_t pool_guid;
+ uint64_t vdev_guid;
+ int error;
+
+ if (nvlist_lookup_uint64(innvl,
+ ZPOOL_CONFIG_POOL_GUID, &pool_guid) != 0)
+ return (EINVAL);
+ if (nvlist_lookup_uint64(innvl,
+ ZPOOL_CONFIG_GUID, &vdev_guid) != 0)
+ return (EINVAL);
+ if (nvlist_lookup_string(innvl,
+ "command", &command) != 0)
+ return (EINVAL);
+
+ mutex_enter(&spa_namespace_lock);
+ spa = spa_by_guid(pool_guid, vdev_guid);
+ if (spa != NULL)
+ strcpy(name, spa_name(spa));
+ mutex_exit(&spa_namespace_lock);
+ if (spa == NULL)
+ return (ENOENT);
+
+ if ((error = spa_open(name, &spa, FTAG)) != 0)
+ return (error);
+ spa_vdev_state_enter(spa, SCL_ALL);
+ vd = spa_lookup_by_guid(spa, vdev_guid, B_TRUE);
+ if (vd == NULL) {
+ (void) spa_vdev_state_exit(spa, NULL, ENXIO);
+ spa_close(spa, FTAG);
+ return (ENODEV);
+ }
+ error = vdev_label_write_pad2(vd, command, strlen(command));
+ (void) spa_vdev_state_exit(spa, NULL, 0);
+ txg_wait_synced(spa->spa_dsl_pool, 0);
+ spa_close(spa, FTAG);
+ return (error);
+}
+
+uint64_t
+zfs_max_nvlist_src_size_os(void)
+{
+ if (zfs_max_nvlist_src_size != 0)
+ return (zfs_max_nvlist_src_size);
+
+ return (ptob(vm_page_max_user_wired) / 4);
+}
+
+void
+zfs_ioctl_init_os(void)
+{
+ zfs_ioctl_register_dataset_nolog(ZFS_IOC_JAIL, zfs_ioc_jail,
+ zfs_secpolicy_config, POOL_CHECK_NONE);
+ zfs_ioctl_register_dataset_nolog(ZFS_IOC_UNJAIL, zfs_ioc_unjail,
+ zfs_secpolicy_config, POOL_CHECK_NONE);
+ zfs_ioctl_register("fbsd_nextboot", ZFS_IOC_NEXTBOOT,
+ zfs_ioc_nextboot, zfs_secpolicy_config, NO_NAME,
+ POOL_CHECK_NONE, B_FALSE, B_FALSE, zfs_keys_nextboot, 3);
+
+}
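
zfs_keys_nextboot above is the input contract for the "fbsd_nextboot" ioctl registered here: a "command" string plus the pool and vdev GUIDs that zfs_ioc_nextboot() resolves before writing the command into the vdev's pad2 label area. A minimal sketch of building such an input nvlist with the usual fnvlist_* wrappers; the command string and GUIDs are placeholders:

static nvlist_t *
example_nextboot_args(uint64_t pool_guid, uint64_t vdev_guid)
{
	nvlist_t *innvl = fnvlist_alloc();

	/* Placeholder command; the real string comes from the boot tooling. */
	fnvlist_add_string(innvl, "command", "example-boot-command");
	fnvlist_add_uint64(innvl, ZPOOL_CONFIG_POOL_GUID, pool_guid);
	fnvlist_add_uint64(innvl, ZPOOL_CONFIG_GUID, vdev_guid);

	/* Caller passes this as innvl and fnvlist_free()s it afterwards. */
	return (innvl);
}
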
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vfsops.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vfsops.c
new file mode 100644
index 000000000000..7bc6b83d0272
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vfsops.c
@@ -0,0 +1,2301 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011 Pawel Jakub Dawidek <pawel@dawidek.net>.
+ * All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
+ * Copyright 2016 Nexenta Systems, Inc. All rights reserved.
+ */
+
+/* Portions Copyright 2010 Robert Milkowski */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/sysmacros.h>
+#include <sys/kmem.h>
+#include <sys/acl.h>
+#include <sys/vnode.h>
+#include <sys/vfs.h>
+#include <sys/mntent.h>
+#include <sys/mount.h>
+#include <sys/cmn_err.h>
+#include <sys/zfs_znode.h>
+#include <sys/zfs_vnops.h>
+#include <sys/zfs_dir.h>
+#include <sys/zil.h>
+#include <sys/fs/zfs.h>
+#include <sys/dmu.h>
+#include <sys/dsl_prop.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_deleg.h>
+#include <sys/spa.h>
+#include <sys/zap.h>
+#include <sys/sa.h>
+#include <sys/sa_impl.h>
+#include <sys/policy.h>
+#include <sys/atomic.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zfs_ctldir.h>
+#include <sys/zfs_fuid.h>
+#include <sys/sunddi.h>
+#include <sys/dmu_objset.h>
+#include <sys/dsl_dir.h>
+#include <sys/spa_boot.h>
+#include <sys/jail.h>
+#include <ufs/ufs/quota.h>
+#include <sys/zfs_quota.h>
+
+#include "zfs_comutil.h"
+
+#ifndef MNTK_VMSETSIZE_BUG
+#define MNTK_VMSETSIZE_BUG 0
+#endif
+#ifndef MNTK_NOMSYNC
+#define MNTK_NOMSYNC 8
+#endif
+
+/* BEGIN CSTYLED */
+struct mtx zfs_debug_mtx;
+MTX_SYSINIT(zfs_debug_mtx, &zfs_debug_mtx, "zfs_debug", MTX_DEF);
+
+SYSCTL_NODE(_vfs, OID_AUTO, zfs, CTLFLAG_RW, 0, "ZFS file system");
+
+int zfs_super_owner;
+SYSCTL_INT(_vfs_zfs, OID_AUTO, super_owner, CTLFLAG_RW, &zfs_super_owner, 0,
+ "File system owner can perform privileged operation on his file systems");
+
+int zfs_debug_level;
+SYSCTL_INT(_vfs_zfs, OID_AUTO, debug, CTLFLAG_RWTUN, &zfs_debug_level, 0,
+ "Debug level");
+
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, version, CTLFLAG_RD, 0, "ZFS versions");
+static int zfs_version_acl = ZFS_ACL_VERSION;
+SYSCTL_INT(_vfs_zfs_version, OID_AUTO, acl, CTLFLAG_RD, &zfs_version_acl, 0,
+ "ZFS_ACL_VERSION");
+static int zfs_version_spa = SPA_VERSION;
+SYSCTL_INT(_vfs_zfs_version, OID_AUTO, spa, CTLFLAG_RD, &zfs_version_spa, 0,
+ "SPA_VERSION");
+static int zfs_version_zpl = ZPL_VERSION;
+SYSCTL_INT(_vfs_zfs_version, OID_AUTO, zpl, CTLFLAG_RD, &zfs_version_zpl, 0,
+ "ZPL_VERSION");
+/* END CSTYLED */
+
+static int zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg);
+static int zfs_mount(vfs_t *vfsp);
+static int zfs_umount(vfs_t *vfsp, int fflag);
+static int zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp);
+static int zfs_statfs(vfs_t *vfsp, struct statfs *statp);
+static int zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp);
+static int zfs_sync(vfs_t *vfsp, int waitfor);
+#if __FreeBSD_version >= 1300098
+static int zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, uint64_t *extflagsp,
+ struct ucred **credanonp, int *numsecflavors, int *secflavors);
+#else
+static int zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, int *extflagsp,
+ struct ucred **credanonp, int *numsecflavors, int **secflavors);
+#endif
+static int zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp);
+static void zfs_freevfs(vfs_t *vfsp);
+
+struct vfsops zfs_vfsops = {
+ .vfs_mount = zfs_mount,
+ .vfs_unmount = zfs_umount,
+#if __FreeBSD_version >= 1300049
+ .vfs_root = vfs_cache_root,
+ .vfs_cachedroot = zfs_root,
+#else
+ .vfs_root = zfs_root,
+#endif
+ .vfs_statfs = zfs_statfs,
+ .vfs_vget = zfs_vget,
+ .vfs_sync = zfs_sync,
+ .vfs_checkexp = zfs_checkexp,
+ .vfs_fhtovp = zfs_fhtovp,
+ .vfs_quotactl = zfs_quotactl,
+};
+
+VFS_SET(zfs_vfsops, zfs, VFCF_JAIL | VFCF_DELEGADMIN);
+
+/*
+ * We need to keep a count of active fs's.
+ * This is necessary to prevent our module
+ * from being unloaded after a umount -f
+ */
+static uint32_t zfs_active_fs_count = 0;
+
+int
+zfs_get_temporary_prop(dsl_dataset_t *ds, zfs_prop_t zfs_prop, uint64_t *val,
+ char *setpoint)
+{
+ int error;
+ zfsvfs_t *zfvp;
+ vfs_t *vfsp;
+ objset_t *os;
+ uint64_t tmp = *val;
+
+ error = dmu_objset_from_ds(ds, &os);
+ if (error != 0)
+ return (error);
+
+ error = getzfsvfs_impl(os, &zfvp);
+ if (error != 0)
+ return (error);
+ if (zfvp == NULL)
+ return (ENOENT);
+ vfsp = zfvp->z_vfs;
+ switch (zfs_prop) {
+ case ZFS_PROP_ATIME:
+ if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL))
+ tmp = 0;
+ if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL))
+ tmp = 1;
+ break;
+ case ZFS_PROP_DEVICES:
+ if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL))
+ tmp = 0;
+ if (vfs_optionisset(vfsp, MNTOPT_DEVICES, NULL))
+ tmp = 1;
+ break;
+ case ZFS_PROP_EXEC:
+ if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL))
+ tmp = 0;
+ if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL))
+ tmp = 1;
+ break;
+ case ZFS_PROP_SETUID:
+ if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL))
+ tmp = 0;
+ if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL))
+ tmp = 1;
+ break;
+ case ZFS_PROP_READONLY:
+ if (vfs_optionisset(vfsp, MNTOPT_RW, NULL))
+ tmp = 0;
+ if (vfs_optionisset(vfsp, MNTOPT_RO, NULL))
+ tmp = 1;
+ break;
+ case ZFS_PROP_XATTR:
+ if (zfvp->z_flags & ZSB_XATTR)
+ tmp = zfvp->z_xattr;
+ break;
+ case ZFS_PROP_NBMAND:
+ if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL))
+ tmp = 0;
+ if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL))
+ tmp = 1;
+ break;
+ default:
+ vfs_unbusy(vfsp);
+ return (ENOENT);
+ }
+
+ vfs_unbusy(vfsp);
+ if (tmp != *val) {
+ (void) strcpy(setpoint, "temporary");
+ *val = tmp;
+ }
+ return (0);
+}
+
+static int
+zfs_getquota(zfsvfs_t *zfsvfs, uid_t id, int isgroup, struct dqblk64 *dqp)
+{
+ int error = 0;
+ char buf[32];
+ uint64_t usedobj, quotaobj;
+ uint64_t quota, used = 0;
+ timespec_t now;
+
+ usedobj = isgroup ? DMU_GROUPUSED_OBJECT : DMU_USERUSED_OBJECT;
+ quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj;
+
+ if (quotaobj == 0 || zfsvfs->z_replay) {
+ error = ENOENT;
+ goto done;
+ }
+ (void) sprintf(buf, "%llx", (longlong_t)id);
+ if ((error = zap_lookup(zfsvfs->z_os, quotaobj,
+ buf, sizeof (quota), 1, &quota)) != 0) {
+ dprintf("%s(%d): quotaobj lookup failed\n",
+ __FUNCTION__, __LINE__);
+ goto done;
+ }
+ /*
+ * quota(8) uses bsoftlimit as "quota", and hardlimit as "limit".
+ * So we set them to be the same.
+ */
+ dqp->dqb_bsoftlimit = dqp->dqb_bhardlimit = btodb(quota);
+ error = zap_lookup(zfsvfs->z_os, usedobj, buf, sizeof (used), 1, &used);
+ if (error && error != ENOENT) {
+ dprintf("%s(%d): usedobj failed; %d\n",
+ __FUNCTION__, __LINE__, error);
+ goto done;
+ }
+ dqp->dqb_curblocks = btodb(used);
+ dqp->dqb_ihardlimit = dqp->dqb_isoftlimit = 0;
+ vfs_timestamp(&now);
+ /*
+ * Setting this to 0 causes FreeBSD quota(8) to print
+ * the number of days since the epoch, which isn't
+ * particularly useful.
+ */
+ dqp->dqb_btime = dqp->dqb_itime = now.tv_sec;
+done:
+ return (error);
+}
+
+static int
+zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg)
+{
+ zfsvfs_t *zfsvfs = vfsp->vfs_data;
+ struct thread *td;
+ int cmd, type, error = 0;
+ int bitsize;
+ zfs_userquota_prop_t quota_type;
+ struct dqblk64 dqblk = { 0 };
+
+ td = curthread;
+ cmd = cmds >> SUBCMDSHIFT;
+ type = cmds & SUBCMDMASK;
+
+ ZFS_ENTER(zfsvfs);
+ if (id == -1) {
+ switch (type) {
+ case USRQUOTA:
+ id = td->td_ucred->cr_ruid;
+ break;
+ case GRPQUOTA:
+ id = td->td_ucred->cr_rgid;
+ break;
+ default:
+ error = EINVAL;
+ if (cmd == Q_QUOTAON || cmd == Q_QUOTAOFF)
+ vfs_unbusy(vfsp);
+ goto done;
+ }
+ }
+ /*
+ * Map BSD type to:
+ * ZFS_PROP_USERUSED,
+ * ZFS_PROP_USERQUOTA,
+ * ZFS_PROP_GROUPUSED,
+ * ZFS_PROP_GROUPQUOTA
+ */
+ switch (cmd) {
+ case Q_SETQUOTA:
+ case Q_SETQUOTA32:
+ if (type == USRQUOTA)
+ quota_type = ZFS_PROP_USERQUOTA;
+ else if (type == GRPQUOTA)
+ quota_type = ZFS_PROP_GROUPQUOTA;
+ else
+ error = EINVAL;
+ break;
+ case Q_GETQUOTA:
+ case Q_GETQUOTA32:
+ if (type == USRQUOTA)
+ quota_type = ZFS_PROP_USERUSED;
+ else if (type == GRPQUOTA)
+ quota_type = ZFS_PROP_GROUPUSED;
+ else
+ error = EINVAL;
+ break;
+ }
+
+ /*
+ * Depending on the cmd, we may need to get
+ * the ruid and domain (see fuidstr_to_sid?),
+ * the fuid (how?), or other information.
+ * Create fuid using zfs_fuid_create(zfsvfs, id,
+ * ZFS_OWNER or ZFS_GROUP, cr, &fuidp)?
+ * I think I can use just the id?
+ *
+ * Look at zfs_id_overquota() to look up a quota.
+ * zap_lookup(something, quotaobj, fuidstring,
+ * sizeof (long long), 1, &quota)
+ *
+ * See zfs_set_userquota() to set a quota.
+ */
+ if ((uint32_t)type >= MAXQUOTAS) {
+ error = EINVAL;
+ goto done;
+ }
+
+ switch (cmd) {
+ case Q_GETQUOTASIZE:
+ bitsize = 64;
+ error = copyout(&bitsize, arg, sizeof (int));
+ break;
+ case Q_QUOTAON:
+ /* As far as I can tell, you can't turn quotas on or off on zfs. */
+ error = 0;
+ vfs_unbusy(vfsp);
+ break;
+ case Q_QUOTAOFF:
+ error = ENOTSUP;
+ vfs_unbusy(vfsp);
+ break;
+ case Q_SETQUOTA:
+ error = copyin(arg, &dqblk, sizeof (dqblk));
+ if (error == 0)
+ error = zfs_set_userquota(zfsvfs, quota_type,
+ "", id, dbtob(dqblk.dqb_bhardlimit));
+ break;
+ case Q_GETQUOTA:
+ error = zfs_getquota(zfsvfs, id, type == GRPQUOTA, &dqblk);
+ if (error == 0)
+ error = copyout(&dqblk, arg, sizeof (dqblk));
+ break;
+ default:
+ error = EINVAL;
+ break;
+ }
+done:
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
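+
+/*
+ * Sketch of a hypothetical userland caller (the path is made up and the
+ * standard quotactl(2)/<ufs/ufs/quota.h> interface is assumed; this is
+ * illustrative only, not part of the original change):
+ *
+ *     struct dqblk64 dq;
+ *     if (quotactl("/tank/home", QCMD(Q_GETQUOTA, USRQUOTA),
+ *         getuid(), &dq) == 0)
+ *             printf("%ju blocks in use\n", (uintmax_t)dq.dqb_curblocks);
+ */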
+
+
+boolean_t
+zfs_is_readonly(zfsvfs_t *zfsvfs)
+{
+ return (!!(zfsvfs->z_vfs->vfs_flag & VFS_RDONLY));
+}
+
+/*ARGSUSED*/
+static int
+zfs_sync(vfs_t *vfsp, int waitfor)
+{
+
+ /*
+ * Data integrity is job one. We don't want a compromised kernel
+ * writing to the storage pool, so we never sync during panic.
+ */
+ if (panicstr)
+ return (0);
+
+ /*
+ * Ignore the system syncher. ZFS already commits async data
+ * at zfs_txg_timeout intervals.
+ */
+ if (waitfor == MNT_LAZY)
+ return (0);
+
+ if (vfsp != NULL) {
+ /*
+ * Sync a specific filesystem.
+ */
+ zfsvfs_t *zfsvfs = vfsp->vfs_data;
+ dsl_pool_t *dp;
+ int error;
+
+ error = vfs_stdsync(vfsp, waitfor);
+ if (error != 0)
+ return (error);
+
+ ZFS_ENTER(zfsvfs);
+ dp = dmu_objset_pool(zfsvfs->z_os);
+
+ /*
+ * If the system is shutting down, then skip any
+ * filesystems which may exist on a suspended pool.
+ */
+ if (rebooting && spa_suspended(dp->dp_spa)) {
+ ZFS_EXIT(zfsvfs);
+ return (0);
+ }
+
+ if (zfsvfs->z_log != NULL)
+ zil_commit(zfsvfs->z_log, 0);
+
+ ZFS_EXIT(zfsvfs);
+ } else {
+ /*
+ * Sync all ZFS filesystems. This is what happens when you
+ * run sync(8). Unlike other filesystems, ZFS honors the
+ * request by waiting for all pools to commit all dirty data.
+ */
+ spa_sync_allpools();
+ }
+
+ return (0);
+}
+
+static void
+atime_changed_cb(void *arg, uint64_t newval)
+{
+ zfsvfs_t *zfsvfs = arg;
+
+ if (newval == TRUE) {
+ zfsvfs->z_atime = TRUE;
+ zfsvfs->z_vfs->vfs_flag &= ~MNT_NOATIME;
+ vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME);
+ vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_ATIME, NULL, 0);
+ } else {
+ zfsvfs->z_atime = FALSE;
+ zfsvfs->z_vfs->vfs_flag |= MNT_NOATIME;
+ vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_ATIME);
+ vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME, NULL, 0);
+ }
+}
+
+static void
+xattr_changed_cb(void *arg, uint64_t newval)
+{
+ zfsvfs_t *zfsvfs = arg;
+
+ if (newval == ZFS_XATTR_OFF) {
+ zfsvfs->z_flags &= ~ZSB_XATTR;
+ } else {
+ zfsvfs->z_flags |= ZSB_XATTR;
+
+ if (newval == ZFS_XATTR_SA)
+ zfsvfs->z_xattr_sa = B_TRUE;
+ else
+ zfsvfs->z_xattr_sa = B_FALSE;
+ }
+}
+
+static void
+blksz_changed_cb(void *arg, uint64_t newval)
+{
+ zfsvfs_t *zfsvfs = arg;
+ ASSERT3U(newval, <=, spa_maxblocksize(dmu_objset_spa(zfsvfs->z_os)));
+ ASSERT3U(newval, >=, SPA_MINBLOCKSIZE);
+ ASSERT(ISP2(newval));
+
+ zfsvfs->z_max_blksz = newval;
+ zfsvfs->z_vfs->mnt_stat.f_iosize = newval;
+}
+
+static void
+readonly_changed_cb(void *arg, uint64_t newval)
+{
+ zfsvfs_t *zfsvfs = arg;
+
+ if (newval) {
+ /* XXX locking on vfs_flag? */
+ zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY;
+ vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RW);
+ vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RO, NULL, 0);
+ } else {
+ /* XXX locking on vfs_flag? */
+ zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
+ vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RO);
+ vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RW, NULL, 0);
+ }
+}
+
+static void
+setuid_changed_cb(void *arg, uint64_t newval)
+{
+ zfsvfs_t *zfsvfs = arg;
+
+ if (newval == FALSE) {
+ zfsvfs->z_vfs->vfs_flag |= VFS_NOSETUID;
+ vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_SETUID);
+ vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID, NULL, 0);
+ } else {
+ zfsvfs->z_vfs->vfs_flag &= ~VFS_NOSETUID;
+ vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID);
+ vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_SETUID, NULL, 0);
+ }
+}
+
+static void
+exec_changed_cb(void *arg, uint64_t newval)
+{
+ zfsvfs_t *zfsvfs = arg;
+
+ if (newval == FALSE) {
+ zfsvfs->z_vfs->vfs_flag |= VFS_NOEXEC;
+ vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_EXEC);
+ vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC, NULL, 0);
+ } else {
+ zfsvfs->z_vfs->vfs_flag &= ~VFS_NOEXEC;
+ vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC);
+ vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_EXEC, NULL, 0);
+ }
+}
+
+/*
+ * The nbmand mount option can only be changed at mount time.
+ * We can't allow it to be toggled on live file systems, or incorrect
+ * behavior may be seen from CIFS clients.
+ *
+ * This property isn't registered via dsl_prop_register(), but this callback
+ * will be called when a file system is first mounted.
+ */
+static void
+nbmand_changed_cb(void *arg, uint64_t newval)
+{
+ zfsvfs_t *zfsvfs = arg;
+ if (newval == FALSE) {
+ vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND);
+ vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND, NULL, 0);
+ } else {
+ vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND);
+ vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND, NULL, 0);
+ }
+}
+
+static void
+snapdir_changed_cb(void *arg, uint64_t newval)
+{
+ zfsvfs_t *zfsvfs = arg;
+
+ zfsvfs->z_show_ctldir = newval;
+}
+
+static void
+vscan_changed_cb(void *arg, uint64_t newval)
+{
+ zfsvfs_t *zfsvfs = arg;
+
+ zfsvfs->z_vscan = newval;
+}
+
+static void
+acl_mode_changed_cb(void *arg, uint64_t newval)
+{
+ zfsvfs_t *zfsvfs = arg;
+
+ zfsvfs->z_acl_mode = newval;
+}
+
+static void
+acl_inherit_changed_cb(void *arg, uint64_t newval)
+{
+ zfsvfs_t *zfsvfs = arg;
+
+ zfsvfs->z_acl_inherit = newval;
+}
+
+static void
+acl_type_changed_cb(void *arg, uint64_t newval)
+{
+ zfsvfs_t *zfsvfs = arg;
+
+ zfsvfs->z_acl_type = newval;
+}
+
+static int
+zfs_register_callbacks(vfs_t *vfsp)
+{
+ struct dsl_dataset *ds = NULL;
+ objset_t *os = NULL;
+ zfsvfs_t *zfsvfs = NULL;
+ uint64_t nbmand;
+ boolean_t readonly = B_FALSE;
+ boolean_t do_readonly = B_FALSE;
+ boolean_t setuid = B_FALSE;
+ boolean_t do_setuid = B_FALSE;
+ boolean_t exec = B_FALSE;
+ boolean_t do_exec = B_FALSE;
+ boolean_t xattr = B_FALSE;
+ boolean_t atime = B_FALSE;
+ boolean_t do_atime = B_FALSE;
+ boolean_t do_xattr = B_FALSE;
+ int error = 0;
+
+ ASSERT(vfsp);
+ zfsvfs = vfsp->vfs_data;
+ ASSERT(zfsvfs);
+ os = zfsvfs->z_os;
+
+ /*
+ * This function can be called for a snapshot when we update the
+ * snapshot's mount point, which isn't really supported.
+ */
+ if (dmu_objset_is_snapshot(os))
+ return (EOPNOTSUPP);
+
+ /*
+ * The act of registering our callbacks will destroy any mount
+ * options we may have. In order to enable temporary overrides
+ * of mount options, we stash away the current values and
+ * restore them after we register the callbacks.
+ */
+ if (vfs_optionisset(vfsp, MNTOPT_RO, NULL) ||
+ !spa_writeable(dmu_objset_spa(os))) {
+ readonly = B_TRUE;
+ do_readonly = B_TRUE;
+ } else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) {
+ readonly = B_FALSE;
+ do_readonly = B_TRUE;
+ }
+ if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) {
+ setuid = B_FALSE;
+ do_setuid = B_TRUE;
+ } else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) {
+ setuid = B_TRUE;
+ do_setuid = B_TRUE;
+ }
+ if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) {
+ exec = B_FALSE;
+ do_exec = B_TRUE;
+ } else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) {
+ exec = B_TRUE;
+ do_exec = B_TRUE;
+ }
+ if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) {
+ zfsvfs->z_xattr = xattr = ZFS_XATTR_OFF;
+ do_xattr = B_TRUE;
+ } else if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) {
+ zfsvfs->z_xattr = xattr = ZFS_XATTR_DIR;
+ do_xattr = B_TRUE;
+ } else if (vfs_optionisset(vfsp, MNTOPT_DIRXATTR, NULL)) {
+ zfsvfs->z_xattr = xattr = ZFS_XATTR_DIR;
+ do_xattr = B_TRUE;
+ } else if (vfs_optionisset(vfsp, MNTOPT_SAXATTR, NULL)) {
+ zfsvfs->z_xattr = xattr = ZFS_XATTR_SA;
+ do_xattr = B_TRUE;
+ }
+ if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL)) {
+ atime = B_FALSE;
+ do_atime = B_TRUE;
+ } else if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL)) {
+ atime = B_TRUE;
+ do_atime = B_TRUE;
+ }
+
+ /*
+ * We need to enter pool configuration here, so that we can use
+ * dsl_prop_get_int_ds() to handle the special nbmand property below.
+ * dsl_prop_get_integer() cannot be used, because it has to acquire
+ * spa_namespace_lock and we cannot do that because we already hold
+ * z_teardown_lock. The problem is that spa_write_cachefile() is called
+ * with spa_namespace_lock held and the function calls ZFS vnode
+ * operations to write the cache file and thus z_teardown_lock is
+ * acquired after spa_namespace_lock.
+ */
+ ds = dmu_objset_ds(os);
+ dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
+
+ /*
+ * nbmand is a special property. It can only be changed at
+ * mount time, so we determine its value here and apply it via
+ * nbmand_changed_cb() below rather than registering a callback.
+ */
+ if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) {
+ nbmand = B_FALSE;
+ } else if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) {
+ nbmand = B_TRUE;
+ } else if ((error = dsl_prop_get_int_ds(ds, "nbmand", &nbmand) != 0)) {
+ dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
+ return (error);
+ }
+
+ /*
+ * Register property callbacks.
+ *
+ * It would probably be fine to just check for i/o error from
+ * the first prop_register(), but I guess I like to go
+ * overboard...
+ */
+ error = dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_ATIME), atime_changed_cb, zfsvfs);
+ error = error ? error : dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_XATTR), xattr_changed_cb, zfsvfs);
+ error = error ? error : dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_RECORDSIZE), blksz_changed_cb, zfsvfs);
+ error = error ? error : dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_READONLY), readonly_changed_cb, zfsvfs);
+ error = error ? error : dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_SETUID), setuid_changed_cb, zfsvfs);
+ error = error ? error : dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_EXEC), exec_changed_cb, zfsvfs);
+ error = error ? error : dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_SNAPDIR), snapdir_changed_cb, zfsvfs);
+ error = error ? error : dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_ACLTYPE), acl_type_changed_cb, zfsvfs);
+ error = error ? error : dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_ACLMODE), acl_mode_changed_cb, zfsvfs);
+ error = error ? error : dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_ACLINHERIT), acl_inherit_changed_cb,
+ zfsvfs);
+ error = error ? error : dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_VSCAN), vscan_changed_cb, zfsvfs);
+ dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
+ if (error)
+ goto unregister;
+
+ /*
+ * Invoke our callbacks to restore temporary mount options.
+ */
+ if (do_readonly)
+ readonly_changed_cb(zfsvfs, readonly);
+ if (do_setuid)
+ setuid_changed_cb(zfsvfs, setuid);
+ if (do_exec)
+ exec_changed_cb(zfsvfs, exec);
+ if (do_xattr)
+ xattr_changed_cb(zfsvfs, xattr);
+ if (do_atime)
+ atime_changed_cb(zfsvfs, atime);
+
+ nbmand_changed_cb(zfsvfs, nbmand);
+
+ return (0);
+
+unregister:
+ dsl_prop_unregister_all(ds, zfsvfs);
+ return (error);
+}
+
+/*
+ * Associate this zfsvfs with the given objset, which must be owned.
+ * This will cache a bunch of on-disk state from the objset in the
+ * zfsvfs.
+ */
+static int
+zfsvfs_init(zfsvfs_t *zfsvfs, objset_t *os)
+{
+ int error;
+ uint64_t val;
+
+ zfsvfs->z_max_blksz = SPA_OLD_MAXBLOCKSIZE;
+ zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE;
+ zfsvfs->z_os = os;
+
+ error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version);
+ if (error != 0)
+ return (error);
+ if (zfsvfs->z_version >
+ zfs_zpl_version_map(spa_version(dmu_objset_spa(os)))) {
+ (void) printf("Can't mount a version %lld file system "
+ "on a version %lld pool\n. Pool must be upgraded to mount "
+ "this file system.", (u_longlong_t)zfsvfs->z_version,
+ (u_longlong_t)spa_version(dmu_objset_spa(os)));
+ return (SET_ERROR(ENOTSUP));
+ }
+ error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &val);
+ if (error != 0)
+ return (error);
+ zfsvfs->z_norm = (int)val;
+
+ error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &val);
+ if (error != 0)
+ return (error);
+ zfsvfs->z_utf8 = (val != 0);
+
+ error = zfs_get_zplprop(os, ZFS_PROP_CASE, &val);
+ if (error != 0)
+ return (error);
+ zfsvfs->z_case = (uint_t)val;
+
+ error = zfs_get_zplprop(os, ZFS_PROP_ACLTYPE, &val);
+ if (error != 0)
+ return (error);
+ zfsvfs->z_acl_type = (uint_t)val;
+
+ /*
+ * Fold case on file systems that are always or sometimes case
+ * insensitive.
+ */
+ if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
+ zfsvfs->z_case == ZFS_CASE_MIXED)
+ zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER;
+
+ zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
+ zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os);
+
+ uint64_t sa_obj = 0;
+ if (zfsvfs->z_use_sa) {
+ /* should either have both of these objects or none */
+ error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1,
+ &sa_obj);
+ if (error != 0)
+ return (error);
+ }
+
+ error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END,
+ &zfsvfs->z_attr_table);
+ if (error != 0)
+ return (error);
+
+ if (zfsvfs->z_version >= ZPL_VERSION_SA)
+ sa_register_update_callback(os, zfs_sa_upgrade);
+
+ error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1,
+ &zfsvfs->z_root);
+ if (error != 0)
+ return (error);
+ ASSERT(zfsvfs->z_root != 0);
+
+ error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1,
+ &zfsvfs->z_unlinkedobj);
+ if (error != 0)
+ return (error);
+
+ error = zap_lookup(os, MASTER_NODE_OBJ,
+ zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA],
+ 8, 1, &zfsvfs->z_userquota_obj);
+ if (error == ENOENT)
+ zfsvfs->z_userquota_obj = 0;
+ else if (error != 0)
+ return (error);
+
+ error = zap_lookup(os, MASTER_NODE_OBJ,
+ zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA],
+ 8, 1, &zfsvfs->z_groupquota_obj);
+ if (error == ENOENT)
+ zfsvfs->z_groupquota_obj = 0;
+ else if (error != 0)
+ return (error);
+
+ error = zap_lookup(os, MASTER_NODE_OBJ,
+ zfs_userquota_prop_prefixes[ZFS_PROP_PROJECTQUOTA],
+ 8, 1, &zfsvfs->z_projectquota_obj);
+ if (error == ENOENT)
+ zfsvfs->z_projectquota_obj = 0;
+ else if (error != 0)
+ return (error);
+
+ error = zap_lookup(os, MASTER_NODE_OBJ,
+ zfs_userquota_prop_prefixes[ZFS_PROP_USEROBJQUOTA],
+ 8, 1, &zfsvfs->z_userobjquota_obj);
+ if (error == ENOENT)
+ zfsvfs->z_userobjquota_obj = 0;
+ else if (error != 0)
+ return (error);
+
+ error = zap_lookup(os, MASTER_NODE_OBJ,
+ zfs_userquota_prop_prefixes[ZFS_PROP_GROUPOBJQUOTA],
+ 8, 1, &zfsvfs->z_groupobjquota_obj);
+ if (error == ENOENT)
+ zfsvfs->z_groupobjquota_obj = 0;
+ else if (error != 0)
+ return (error);
+
+ error = zap_lookup(os, MASTER_NODE_OBJ,
+ zfs_userquota_prop_prefixes[ZFS_PROP_PROJECTOBJQUOTA],
+ 8, 1, &zfsvfs->z_projectobjquota_obj);
+ if (error == ENOENT)
+ zfsvfs->z_projectobjquota_obj = 0;
+ else if (error != 0)
+ return (error);
+
+ error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1,
+ &zfsvfs->z_fuid_obj);
+ if (error == ENOENT)
+ zfsvfs->z_fuid_obj = 0;
+ else if (error != 0)
+ return (error);
+
+ error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SHARES_DIR, 8, 1,
+ &zfsvfs->z_shares_dir);
+ if (error == ENOENT)
+ zfsvfs->z_shares_dir = 0;
+ else if (error != 0)
+ return (error);
+
+ /*
+ * Only use the name cache if we are looking for a
+ * name on a file system that does not require normalization
+ * or case folding. We can also look there if we happen to be
+ * on a non-normalizing, mixed sensitivity file system IF we
+ * are looking for the exact name (which is always the case on
+ * FreeBSD).
+ */
+ zfsvfs->z_use_namecache = !zfsvfs->z_norm ||
+ ((zfsvfs->z_case == ZFS_CASE_MIXED) &&
+ !(zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER));
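+
+ /*
+ * For illustration (hypothetical property settings): with
+ * casesensitivity=sensitive and normalization=none, z_norm is 0 and
+ * the namecache is used; with casesensitivity=mixed and no
+ * normalization beyond the case folding added above, it is also
+ * used; with casesensitivity=insensitive, or with formC/formD
+ * normalization, it is not.
+ */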
+
+ return (0);
+}
+
+taskq_t *zfsvfs_taskq;
+
+static void
+zfsvfs_task_unlinked_drain(void *context, int pending __unused)
+{
+
+ zfs_unlinked_drain((zfsvfs_t *)context);
+}
+
+int
+zfsvfs_create(const char *osname, boolean_t readonly, zfsvfs_t **zfvp)
+{
+ objset_t *os;
+ zfsvfs_t *zfsvfs;
+ int error;
+ boolean_t ro = (readonly || (strchr(osname, '@') != NULL));
+
+ /*
+ * XXX: Fix struct statfs so this isn't necessary!
+ *
+ * The 'osname' is used as the filesystem's special node, which means
+ * it must fit in statfs.f_mntfromname, or else it can't be
+ * enumerated, so libzfs_mnttab_find() returns NULL, which causes
+ * 'zfs unmount' to think it's not mounted when it is.
+ */
+ if (strlen(osname) >= MNAMELEN)
+ return (SET_ERROR(ENAMETOOLONG));
+
+ zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
+
+ error = dmu_objset_own(osname, DMU_OST_ZFS, ro, B_TRUE, zfsvfs,
+ &os);
+ if (error != 0) {
+ kmem_free(zfsvfs, sizeof (zfsvfs_t));
+ return (error);
+ }
+
+ error = zfsvfs_create_impl(zfvp, zfsvfs, os);
+
+ return (error);
+}
+
+
+int
+zfsvfs_create_impl(zfsvfs_t **zfvp, zfsvfs_t *zfsvfs, objset_t *os)
+{
+ int error;
+
+ zfsvfs->z_vfs = NULL;
+ zfsvfs->z_parent = zfsvfs;
+
+ mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL);
+ list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
+ offsetof(znode_t, z_link_node));
+ TASK_INIT(&zfsvfs->z_unlinked_drain_task, 0,
+ zfsvfs_task_unlinked_drain, zfsvfs);
+ ZFS_TEARDOWN_INIT(zfsvfs);
+ ZFS_TEARDOWN_INACTIVE_INIT(zfsvfs);
+ rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL);
+ for (int i = 0; i != ZFS_OBJ_MTX_SZ; i++)
+ mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);
+
+ error = zfsvfs_init(zfsvfs, os);
+ if (error != 0) {
+ dmu_objset_disown(os, B_TRUE, zfsvfs);
+ *zfvp = NULL;
+ kmem_free(zfsvfs, sizeof (zfsvfs_t));
+ return (error);
+ }
+
+ *zfvp = zfsvfs;
+ return (0);
+}
+
+static int
+zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting)
+{
+ int error;
+
+ /*
+ * Check for a bad on-disk format version now since we
+ * lied about owning the dataset readonly before.
+ */
+ if (!(zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) &&
+ dmu_objset_incompatible_encryption_version(zfsvfs->z_os))
+ return (SET_ERROR(EROFS));
+
+ error = zfs_register_callbacks(zfsvfs->z_vfs);
+ if (error)
+ return (error);
+
+ zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data);
+
+ /*
+ * If we are not mounting (i.e. online recv), then we don't
+ * have to worry about replaying the log as we blocked all
+ * operations out since we closed the ZIL.
+ */
+ if (mounting) {
+ boolean_t readonly;
+
+ ASSERT3P(zfsvfs->z_kstat.dk_kstats, ==, NULL);
+ dataset_kstats_create(&zfsvfs->z_kstat, zfsvfs->z_os);
+
+ /*
+ * During replay we remove the read only flag to
+ * allow replays to succeed.
+ */
+ readonly = zfsvfs->z_vfs->vfs_flag & VFS_RDONLY;
+ if (readonly != 0) {
+ zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
+ } else {
+ dsl_dir_t *dd;
+ zap_stats_t zs;
+
+ if (zap_get_stats(zfsvfs->z_os, zfsvfs->z_unlinkedobj,
+ &zs) == 0) {
+ dataset_kstats_update_nunlinks_kstat(
+ &zfsvfs->z_kstat, zs.zs_num_entries);
+ dprintf_ds(zfsvfs->z_os->os_dsl_dataset,
+ "num_entries in unlinked set: %llu",
+ zs.zs_num_entries);
+ }
+
+ zfs_unlinked_drain(zfsvfs);
+ dd = zfsvfs->z_os->os_dsl_dataset->ds_dir;
+ dd->dd_activity_cancelled = B_FALSE;
+ }
+
+ /*
+ * Parse and replay the intent log.
+ *
+ * Because of ziltest, this must be done after
+ * zfs_unlinked_drain(). (Further note: ziltest
+ * doesn't use readonly mounts, where
+ * zfs_unlinked_drain() isn't called.) This is because
+ * ziltest causes spa_sync() to think it's committed,
+ * but actually it is not, so the intent log contains
+ * many txg's worth of changes.
+ *
+ * In particular, if object N is in the unlinked set in
+ * the last txg to actually sync, then it could be
+ * actually freed in a later txg and then reallocated
+ * in a yet later txg. This would write a "create
+ * object N" record to the intent log. Normally, this
+ * would be fine because the spa_sync() would have
+ * written out the fact that object N is free, before
+ * we could write the "create object N" intent log
+ * record.
+ *
+ * But when we are in ziltest mode, we advance the "open
+ * txg" without actually spa_sync()-ing the changes to
+ * disk. So we would see that object N is still
+ * allocated and in the unlinked set, and there is an
+ * intent log record saying to allocate it.
+ */
+ if (spa_writeable(dmu_objset_spa(zfsvfs->z_os))) {
+ if (zil_replay_disable) {
+ zil_destroy(zfsvfs->z_log, B_FALSE);
+ } else {
+ boolean_t use_nc = zfsvfs->z_use_namecache;
+ zfsvfs->z_use_namecache = B_FALSE;
+ zfsvfs->z_replay = B_TRUE;
+ zil_replay(zfsvfs->z_os, zfsvfs,
+ zfs_replay_vector);
+ zfsvfs->z_replay = B_FALSE;
+ zfsvfs->z_use_namecache = use_nc;
+ }
+ }
+
+ /* restore readonly bit */
+ if (readonly != 0)
+ zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY;
+ }
+
+ /*
+ * Set the objset user_ptr to track its zfsvfs.
+ */
+ mutex_enter(&zfsvfs->z_os->os_user_ptr_lock);
+ dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
+ mutex_exit(&zfsvfs->z_os->os_user_ptr_lock);
+
+ return (0);
+}
+
+void
+zfsvfs_free(zfsvfs_t *zfsvfs)
+{
+ int i;
+
+ zfs_fuid_destroy(zfsvfs);
+
+ mutex_destroy(&zfsvfs->z_znodes_lock);
+ mutex_destroy(&zfsvfs->z_lock);
+ ASSERT(zfsvfs->z_nr_znodes == 0);
+ list_destroy(&zfsvfs->z_all_znodes);
+ ZFS_TEARDOWN_DESTROY(zfsvfs);
+ ZFS_TEARDOWN_INACTIVE_DESTROY(zfsvfs);
+ rw_destroy(&zfsvfs->z_fuid_lock);
+ for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
+ mutex_destroy(&zfsvfs->z_hold_mtx[i]);
+ dataset_kstats_destroy(&zfsvfs->z_kstat);
+ kmem_free(zfsvfs, sizeof (zfsvfs_t));
+}
+
+static void
+zfs_set_fuid_feature(zfsvfs_t *zfsvfs)
+{
+ zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
+ if (zfsvfs->z_vfs) {
+ if (zfsvfs->z_use_fuids) {
+ vfs_set_feature(zfsvfs->z_vfs, VFSFT_XVATTR);
+ vfs_set_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS);
+ vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS);
+ vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE);
+ vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER);
+ vfs_set_feature(zfsvfs->z_vfs, VFSFT_REPARSE);
+ } else {
+ vfs_clear_feature(zfsvfs->z_vfs, VFSFT_XVATTR);
+ vfs_clear_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS);
+ vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS);
+ vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE);
+ vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER);
+ vfs_clear_feature(zfsvfs->z_vfs, VFSFT_REPARSE);
+ }
+ }
+ zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os);
+}
+
+static int
+zfs_domount(vfs_t *vfsp, char *osname)
+{
+ uint64_t recordsize, fsid_guid;
+ int error = 0;
+ zfsvfs_t *zfsvfs;
+
+ ASSERT(vfsp);
+ ASSERT(osname);
+
+ error = zfsvfs_create(osname, vfsp->mnt_flag & MNT_RDONLY, &zfsvfs);
+ if (error)
+ return (error);
+ zfsvfs->z_vfs = vfsp;
+
+ if ((error = dsl_prop_get_integer(osname,
+ "recordsize", &recordsize, NULL)))
+ goto out;
+ zfsvfs->z_vfs->vfs_bsize = SPA_MINBLOCKSIZE;
+ zfsvfs->z_vfs->mnt_stat.f_iosize = recordsize;
+
+ vfsp->vfs_data = zfsvfs;
+ vfsp->mnt_flag |= MNT_LOCAL;
+ vfsp->mnt_kern_flag |= MNTK_LOOKUP_SHARED;
+ vfsp->mnt_kern_flag |= MNTK_SHARED_WRITES;
+ vfsp->mnt_kern_flag |= MNTK_EXTENDED_SHARED;
+ /*
+ * This can cause a loss of coherence between ARC and the page cache
+ * on ZoF; it is unclear whether the problem is in FreeBSD or ZoF.
+ */
+ vfsp->mnt_kern_flag |= MNTK_NO_IOPF; /* vn_io_fault can be used */
+ vfsp->mnt_kern_flag |= MNTK_NOMSYNC;
+ vfsp->mnt_kern_flag |= MNTK_VMSETSIZE_BUG;
+
+#if defined(_KERNEL) && !defined(KMEM_DEBUG)
+ vfsp->mnt_kern_flag |= MNTK_FPLOOKUP;
+#endif
+ /*
+ * The fsid is 64 bits, composed of an 8-bit fs type, which
+ * separates our fsid from any other filesystem types, and a
+ * 56-bit objset unique ID. The objset unique ID is unique to
+ * all objsets open on this system, provided by unique_create().
+ * The 8-bit fs type must be put in the low bits of fsid[1]
+ * because that's where other Solaris filesystems put it.
+ */
+ fsid_guid = dmu_objset_fsid_guid(zfsvfs->z_os);
+ ASSERT((fsid_guid & ~((1ULL<<56)-1)) == 0);
+ vfsp->vfs_fsid.val[0] = fsid_guid;
+ vfsp->vfs_fsid.val[1] = ((fsid_guid>>32) << 8) |
+ (vfsp->mnt_vfc->vfc_typenum & 0xFF);
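+
+ /*
+ * Worked example with made-up values: for an fsid_guid of
+ * 0x00123456789abcde (56 bits) and a vfc_typenum of 0x2e, val[0]
+ * holds the low 32 bits (0x789abcde) and val[1] becomes
+ * (0x00123456 << 8) | 0x2e == 0x1234562e.
+ */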
+
+ /*
+ * Set features for file system.
+ */
+ zfs_set_fuid_feature(zfsvfs);
+ if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) {
+ vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS);
+ vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE);
+ vfs_set_feature(vfsp, VFSFT_NOCASESENSITIVE);
+ } else if (zfsvfs->z_case == ZFS_CASE_MIXED) {
+ vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS);
+ vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE);
+ }
+ vfs_set_feature(vfsp, VFSFT_ZEROCOPY_SUPPORTED);
+
+ if (dmu_objset_is_snapshot(zfsvfs->z_os)) {
+ uint64_t pval;
+
+ atime_changed_cb(zfsvfs, B_FALSE);
+ readonly_changed_cb(zfsvfs, B_TRUE);
+ if ((error = dsl_prop_get_integer(osname,
+ "xattr", &pval, NULL)))
+ goto out;
+ xattr_changed_cb(zfsvfs, pval);
+ if ((error = dsl_prop_get_integer(osname,
+ "acltype", &pval, NULL)))
+ goto out;
+ acl_type_changed_cb(zfsvfs, pval);
+ zfsvfs->z_issnap = B_TRUE;
+ zfsvfs->z_os->os_sync = ZFS_SYNC_DISABLED;
+
+ mutex_enter(&zfsvfs->z_os->os_user_ptr_lock);
+ dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
+ mutex_exit(&zfsvfs->z_os->os_user_ptr_lock);
+ } else {
+ if ((error = zfsvfs_setup(zfsvfs, B_TRUE)))
+ goto out;
+ }
+
+ vfs_mountedfrom(vfsp, osname);
+
+ if (!zfsvfs->z_issnap)
+ zfsctl_create(zfsvfs);
+out:
+ if (error) {
+ dmu_objset_disown(zfsvfs->z_os, B_TRUE, zfsvfs);
+ zfsvfs_free(zfsvfs);
+ } else {
+ atomic_inc_32(&zfs_active_fs_count);
+ }
+
+ return (error);
+}
+
+static void
+zfs_unregister_callbacks(zfsvfs_t *zfsvfs)
+{
+ objset_t *os = zfsvfs->z_os;
+
+ if (!dmu_objset_is_snapshot(os))
+ dsl_prop_unregister_all(dmu_objset_ds(os), zfsvfs);
+}
+
+static int
+getpoolname(const char *osname, char *poolname)
+{
+ char *p;
+
+ p = strchr(osname, '/');
+ if (p == NULL) {
+ if (strlen(osname) >= MAXNAMELEN)
+ return (ENAMETOOLONG);
+ (void) strcpy(poolname, osname);
+ } else {
+ if (p - osname >= MAXNAMELEN)
+ return (ENAMETOOLONG);
+ (void) strncpy(poolname, osname, p - osname);
+ poolname[p - osname] = '\0';
+ }
+ return (0);
+}
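+
+/*
+ * For example (hypothetical names): getpoolname("tank/home/user", buf)
+ * stores "tank" in buf, while getpoolname("tank", buf) copies the whole
+ * name unchanged.
+ */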
+
+/*ARGSUSED*/
+static int
+zfs_mount(vfs_t *vfsp)
+{
+ kthread_t *td = curthread;
+ vnode_t *mvp = vfsp->mnt_vnodecovered;
+ cred_t *cr = td->td_ucred;
+ char *osname;
+ int error = 0;
+ int canwrite;
+
+ if (vfs_getopt(vfsp->mnt_optnew, "from", (void **)&osname, NULL))
+ return (SET_ERROR(EINVAL));
+
+ /*
+ * If full-owner-access is enabled and delegated administration is
+ * turned on, we must set nosuid.
+ */
+ if (zfs_super_owner &&
+ dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != ECANCELED) {
+ secpolicy_fs_mount_clearopts(cr, vfsp);
+ }
+
+ /*
+ * Check for mount privilege?
+ *
+ * If we don't have privilege, then see if
+ * we have local permission to allow it.
+ */
+ error = secpolicy_fs_mount(cr, mvp, vfsp);
+ if (error) {
+ if (dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != 0)
+ goto out;
+
+ if (!(vfsp->vfs_flag & MS_REMOUNT)) {
+ vattr_t vattr;
+
+ /*
+ * Make sure user is the owner of the mount point
+ * or has sufficient privileges.
+ */
+
+ vattr.va_mask = AT_UID;
+
+ vn_lock(mvp, LK_SHARED | LK_RETRY);
+ if (VOP_GETATTR(mvp, &vattr, cr)) {
+ VOP_UNLOCK1(mvp);
+ goto out;
+ }
+
+ if (secpolicy_vnode_owner(mvp, cr, vattr.va_uid) != 0 &&
+ VOP_ACCESS(mvp, VWRITE, cr, td) != 0) {
+ VOP_UNLOCK1(mvp);
+ goto out;
+ }
+ VOP_UNLOCK1(mvp);
+ }
+
+ secpolicy_fs_mount_clearopts(cr, vfsp);
+ }
+
+ /*
+ * Refuse to mount a filesystem if we are in a local zone and the
+ * dataset is not visible.
+ */
+ if (!INGLOBALZONE(curproc) &&
+ (!zone_dataset_visible(osname, &canwrite) || !canwrite)) {
+ error = SET_ERROR(EPERM);
+ goto out;
+ }
+
+ vfsp->vfs_flag |= MNT_NFS4ACLS;
+
+ /*
+ * When doing a remount, we simply refresh our temporary properties
+ * according to those options set in the current VFS options.
+ */
+ if (vfsp->vfs_flag & MS_REMOUNT) {
+ zfsvfs_t *zfsvfs = vfsp->vfs_data;
+
+ /*
+ * Refresh mount options with z_teardown_lock blocking I/O while
+ * the filesystem is in an inconsistent state.
+ * The lock also serializes this code with filesystem
+ * manipulations between entry to zfs_suspend_fs() and return
+ * from zfs_resume_fs().
+ */
+ ZFS_TEARDOWN_ENTER_WRITE(zfsvfs, FTAG);
+ zfs_unregister_callbacks(zfsvfs);
+ error = zfs_register_callbacks(vfsp);
+ ZFS_TEARDOWN_EXIT(zfsvfs, FTAG);
+ goto out;
+ }
+
+ /* Initial root mount: try hard to import the requested root pool. */
+ if ((vfsp->vfs_flag & MNT_ROOTFS) != 0 &&
+ (vfsp->vfs_flag & MNT_UPDATE) == 0) {
+ char pname[MAXNAMELEN];
+
+ error = getpoolname(osname, pname);
+ if (error == 0)
+ error = spa_import_rootpool(pname, false);
+ if (error)
+ goto out;
+ }
+ DROP_GIANT();
+ error = zfs_domount(vfsp, osname);
+ PICKUP_GIANT();
+
+out:
+ return (error);
+}
+
+static int
+zfs_statfs(vfs_t *vfsp, struct statfs *statp)
+{
+ zfsvfs_t *zfsvfs = vfsp->vfs_data;
+ uint64_t refdbytes, availbytes, usedobjs, availobjs;
+
+ statp->f_version = STATFS_VERSION;
+
+ ZFS_ENTER(zfsvfs);
+
+ dmu_objset_space(zfsvfs->z_os,
+ &refdbytes, &availbytes, &usedobjs, &availobjs);
+
+ /*
+ * The underlying storage pool actually uses multiple block sizes.
+ * We report the fragsize as the smallest block size we support,
+ * and we report our blocksize as the filesystem's maximum blocksize.
+ */
+ statp->f_bsize = SPA_MINBLOCKSIZE;
+ statp->f_iosize = zfsvfs->z_vfs->mnt_stat.f_iosize;
+
+ /*
+ * The following report "total" blocks of various kinds in the
+ * file system, but reported in terms of f_frsize - the
+ * "fragment" size.
+ */
+
+ statp->f_blocks = (refdbytes + availbytes) >> SPA_MINBLOCKSHIFT;
+ statp->f_bfree = availbytes / statp->f_bsize;
+ statp->f_bavail = statp->f_bfree; /* no root reservation */
+
+ /*
+ * statvfs() should really be called statufs(), because it assumes
+ * static metadata. ZFS doesn't preallocate files, so the best
+ * we can do is report the max that could possibly fit in f_files,
+ * and that minus the number actually used in f_ffree.
+ * For f_ffree, report the smaller of the number of objects available
+ * and the number of blocks (each object will take at least a block).
+ */
+ statp->f_ffree = MIN(availobjs, statp->f_bfree);
+ statp->f_files = statp->f_ffree + usedobjs;
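+
+ /*
+ * Worked example with made-up numbers: for refdbytes = 10 GiB,
+ * availbytes = 90 GiB and availobjs = 1000000, f_blocks is
+ * 100 GiB >> 9 == 209715200 512-byte blocks, f_bfree is
+ * 90 GiB / 512 == 188743680, and f_ffree is the smaller of
+ * availobjs and f_bfree, i.e. 1000000.
+ */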
+
+ /*
+ * We're a zfs filesystem.
+ */
+ strlcpy(statp->f_fstypename, "zfs",
+ sizeof (statp->f_fstypename));
+
+ strlcpy(statp->f_mntfromname, vfsp->mnt_stat.f_mntfromname,
+ sizeof (statp->f_mntfromname));
+ strlcpy(statp->f_mntonname, vfsp->mnt_stat.f_mntonname,
+ sizeof (statp->f_mntonname));
+
+ statp->f_namemax = MAXNAMELEN - 1;
+
+ ZFS_EXIT(zfsvfs);
+ return (0);
+}
+
+static int
+zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp)
+{
+ zfsvfs_t *zfsvfs = vfsp->vfs_data;
+ znode_t *rootzp;
+ int error;
+
+ ZFS_ENTER(zfsvfs);
+
+ error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp);
+ if (error == 0)
+ *vpp = ZTOV(rootzp);
+
+ ZFS_EXIT(zfsvfs);
+
+ if (error == 0) {
+ error = vn_lock(*vpp, flags);
+ if (error != 0) {
+ VN_RELE(*vpp);
+ *vpp = NULL;
+ }
+ }
+ return (error);
+}
+
+/*
+ * Teardown the zfsvfs::z_os.
+ *
+ * Note, if 'unmounting' is FALSE, we return with the 'z_teardown_lock'
+ * and 'z_teardown_inactive_lock' held.
+ */
+static int
+zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting)
+{
+ znode_t *zp;
+ dsl_dir_t *dd;
+
+ /*
+ * If someone has not already unmounted this file system,
+ * drain the zrele_taskq to ensure all active references to the
+ * zfsvfs_t have been handled; only then can it be safely destroyed.
+ */
+ if (zfsvfs->z_os) {
+ /*
+ * If we're unmounting we have to wait for the list to
+ * drain completely.
+ *
+ * If we're not unmounting there's no guarantee the list
+ * will drain completely, but zreles run from the taskq
+ * may add the parents of dir-based xattrs to the taskq
+ * so we want to wait for these.
+ *
+ * We can safely read z_nr_znodes without locking because the
+ * VFS has already blocked operations which add to the
+ * z_all_znodes list and thus increment z_nr_znodes.
+ */
+ int round = 0;
+ while (zfsvfs->z_nr_znodes > 0) {
+ taskq_wait_outstanding(dsl_pool_zrele_taskq(
+ dmu_objset_pool(zfsvfs->z_os)), 0);
+ if (++round > 1 && !unmounting)
+ break;
+ }
+ }
+ ZFS_TEARDOWN_ENTER_WRITE(zfsvfs, FTAG);
+
+ if (!unmounting) {
+ /*
+ * We purge the parent filesystem's vfsp as the parent
+ * filesystem and all of its snapshots have their vnode's
+ * v_vfsp set to the parent filesystem's vfsp. Note,
+ * 'z_parent' is self-referential for non-snapshots.
+ */
+#ifdef FREEBSD_NAMECACHE
+#if __FreeBSD_version >= 1300117
+ cache_purgevfs(zfsvfs->z_parent->z_vfs);
+#else
+ cache_purgevfs(zfsvfs->z_parent->z_vfs, true);
+#endif
+#endif
+ }
+
+ /*
+ * Close the zil. NB: Can't close the zil while zfs_inactive
+ * threads are blocked as zil_close can call zfs_inactive.
+ */
+ if (zfsvfs->z_log) {
+ zil_close(zfsvfs->z_log);
+ zfsvfs->z_log = NULL;
+ }
+
+ ZFS_TEARDOWN_INACTIVE_ENTER_WRITE(zfsvfs);
+
+ /*
+ * If we are not unmounting (i.e. online recv) and someone already
+ * unmounted this file system while we were doing the switcheroo,
+ * or a reopen of z_os failed, then just bail out now.
+ */
+ if (!unmounting && (zfsvfs->z_unmounted || zfsvfs->z_os == NULL)) {
+ ZFS_TEARDOWN_INACTIVE_EXIT_WRITE(zfsvfs);
+ ZFS_TEARDOWN_EXIT(zfsvfs, FTAG);
+ return (SET_ERROR(EIO));
+ }
+
+ /*
+ * At this point there are no vops active, and any new vops will
+ * fail with EIO since we have z_teardown_lock for writer (only
+ * relevant for forced unmount).
+ *
+ * Release all holds on dbufs.
+ */
+ mutex_enter(&zfsvfs->z_znodes_lock);
+ for (zp = list_head(&zfsvfs->z_all_znodes); zp != NULL;
+ zp = list_next(&zfsvfs->z_all_znodes, zp))
+ if (zp->z_sa_hdl) {
+ ASSERT(ZTOV(zp)->v_count >= 0);
+ zfs_znode_dmu_fini(zp);
+ }
+ mutex_exit(&zfsvfs->z_znodes_lock);
+
+ /*
+ * If we are unmounting, set the unmounted flag and let new vops
+ * unblock. zfs_inactive will have the unmounted behavior, and all
+ * other vops will fail with EIO.
+ */
+ if (unmounting) {
+ zfsvfs->z_unmounted = B_TRUE;
+ ZFS_TEARDOWN_INACTIVE_EXIT_WRITE(zfsvfs);
+ ZFS_TEARDOWN_EXIT(zfsvfs, FTAG);
+ }
+
+ /*
+ * z_os will be NULL if there was an error in attempting to reopen
+ * zfsvfs, so just return as the properties had already been
+ * unregistered and cached data had been evicted before.
+ */
+ if (zfsvfs->z_os == NULL)
+ return (0);
+
+ /*
+ * Unregister properties.
+ */
+ zfs_unregister_callbacks(zfsvfs);
+
+ /*
+ * Evict cached data
+ */
+ if (!zfs_is_readonly(zfsvfs))
+ txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
+ dmu_objset_evict_dbufs(zfsvfs->z_os);
+ dd = zfsvfs->z_os->os_dsl_dataset->ds_dir;
+ dsl_dir_cancel_waiters(dd);
+
+ return (0);
+}
+
+/*ARGSUSED*/
+static int
+zfs_umount(vfs_t *vfsp, int fflag)
+{
+ kthread_t *td = curthread;
+ zfsvfs_t *zfsvfs = vfsp->vfs_data;
+ objset_t *os;
+ cred_t *cr = td->td_ucred;
+ int ret;
+
+ ret = secpolicy_fs_unmount(cr, vfsp);
+ if (ret) {
+ if (dsl_deleg_access((char *)vfsp->vfs_resource,
+ ZFS_DELEG_PERM_MOUNT, cr))
+ return (ret);
+ }
+
+ /*
+ * Unmount any snapshots mounted under .zfs before unmounting the
+ * dataset itself.
+ */
+ if (zfsvfs->z_ctldir != NULL) {
+ if ((ret = zfsctl_umount_snapshots(vfsp, fflag, cr)) != 0)
+ return (ret);
+ }
+
+ if (fflag & MS_FORCE) {
+ /*
+ * Mark file system as unmounted before calling
+ * vflush(FORCECLOSE). This way we ensure no future vnops
+ * will be called and risk operating on DOOMED vnodes.
+ */
+ ZFS_TEARDOWN_ENTER_WRITE(zfsvfs, FTAG);
+ zfsvfs->z_unmounted = B_TRUE;
+ ZFS_TEARDOWN_EXIT(zfsvfs, FTAG);
+ }
+
+ /*
+ * Flush all the files.
+ */
+ ret = vflush(vfsp, 0, (fflag & MS_FORCE) ? FORCECLOSE : 0, td);
+ if (ret != 0)
+ return (ret);
+ while (taskqueue_cancel(zfsvfs_taskq->tq_queue,
+ &zfsvfs->z_unlinked_drain_task, NULL) != 0)
+ taskqueue_drain(zfsvfs_taskq->tq_queue,
+ &zfsvfs->z_unlinked_drain_task);
+
+ VERIFY(zfsvfs_teardown(zfsvfs, B_TRUE) == 0);
+ os = zfsvfs->z_os;
+
+ /*
+ * z_os will be NULL if there was an error in
+ * attempting to reopen zfsvfs.
+ */
+ if (os != NULL) {
+ /*
+ * Unset the objset user_ptr.
+ */
+ mutex_enter(&os->os_user_ptr_lock);
+ dmu_objset_set_user(os, NULL);
+ mutex_exit(&os->os_user_ptr_lock);
+
+ /*
+ * Finally release the objset
+ */
+ dmu_objset_disown(os, B_TRUE, zfsvfs);
+ }
+
+ /*
+ * We can now safely destroy the '.zfs' directory node.
+ */
+ if (zfsvfs->z_ctldir != NULL)
+ zfsctl_destroy(zfsvfs);
+ zfs_freevfs(vfsp);
+
+ return (0);
+}
+
+static int
+zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp)
+{
+ zfsvfs_t *zfsvfs = vfsp->vfs_data;
+ znode_t *zp;
+ int err;
+
+ /*
+ * zfs_zget() can't operate on virtual entries like .zfs/ or
+ * .zfs/snapshot/ directories, which is why we return EOPNOTSUPP.
+ * This makes NFS switch to LOOKUP instead of using VGET.
+ */
+ if (ino == ZFSCTL_INO_ROOT || ino == ZFSCTL_INO_SNAPDIR ||
+ (zfsvfs->z_shares_dir != 0 && ino == zfsvfs->z_shares_dir))
+ return (EOPNOTSUPP);
+
+ ZFS_ENTER(zfsvfs);
+ err = zfs_zget(zfsvfs, ino, &zp);
+ if (err == 0 && zp->z_unlinked) {
+ vrele(ZTOV(zp));
+ err = EINVAL;
+ }
+ if (err == 0)
+ *vpp = ZTOV(zp);
+ ZFS_EXIT(zfsvfs);
+ if (err == 0) {
+ err = vn_lock(*vpp, flags);
+ if (err != 0)
+ vrele(*vpp);
+ }
+ if (err != 0)
+ *vpp = NULL;
+ return (err);
+}
+
+static int
+#if __FreeBSD_version >= 1300098
+zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, uint64_t *extflagsp,
+ struct ucred **credanonp, int *numsecflavors, int *secflavors)
+#else
+zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, int *extflagsp,
+ struct ucred **credanonp, int *numsecflavors, int **secflavors)
+#endif
+{
+ zfsvfs_t *zfsvfs = vfsp->vfs_data;
+
+ /*
+ * If this is a regular file system, vfsp is the same as
+ * zfsvfs->z_parent->z_vfs; but if it is a snapshot,
+ * zfsvfs->z_parent->z_vfs represents the parent file system,
+ * which we have to use here because only that file system
+ * has mnt_export configured.
+ */
+ return (vfs_stdcheckexp(zfsvfs->z_parent->z_vfs, nam, extflagsp,
+ credanonp, numsecflavors, secflavors));
+}
+
+CTASSERT(SHORT_FID_LEN <= sizeof (struct fid));
+CTASSERT(LONG_FID_LEN <= sizeof (struct fid));
+
+static int
+zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp)
+{
+ struct componentname cn;
+ zfsvfs_t *zfsvfs = vfsp->vfs_data;
+ znode_t *zp;
+ vnode_t *dvp;
+ uint64_t object = 0;
+ uint64_t fid_gen = 0;
+ uint64_t gen_mask;
+ uint64_t zp_gen;
+ int i, err;
+
+ *vpp = NULL;
+
+ ZFS_ENTER(zfsvfs);
+
+ /*
+ * On FreeBSD we can get a snapshot's mount point or its parent file
+ * system's mount point, depending on whether the snapshot is mounted.
+ */
+ if (zfsvfs->z_parent == zfsvfs && fidp->fid_len == LONG_FID_LEN) {
+ zfid_long_t *zlfid = (zfid_long_t *)fidp;
+ uint64_t objsetid = 0;
+ uint64_t setgen = 0;
+
+ for (i = 0; i < sizeof (zlfid->zf_setid); i++)
+ objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i);
+
+ for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
+ setgen |= ((uint64_t)zlfid->zf_setgen[i]) << (8 * i);
+
+ ZFS_EXIT(zfsvfs);
+
+ err = zfsctl_lookup_objset(vfsp, objsetid, &zfsvfs);
+ if (err)
+ return (SET_ERROR(EINVAL));
+ ZFS_ENTER(zfsvfs);
+ }
+
+ if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) {
+ zfid_short_t *zfid = (zfid_short_t *)fidp;
+
+ for (i = 0; i < sizeof (zfid->zf_object); i++)
+ object |= ((uint64_t)zfid->zf_object[i]) << (8 * i);
+
+ for (i = 0; i < sizeof (zfid->zf_gen); i++)
+ fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i);
+ } else {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EINVAL));
+ }
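+
+ /*
+ * Decoding sketch with a hypothetical short fid: if the first byte
+ * of zf_object is 0x04 and the rest are zero, and the first byte of
+ * zf_gen is 0x02 and the rest are zero, the little-endian
+ * reassembly above yields object == 4 and fid_gen == 2; gen_mask
+ * below then spans 8 * sizeof (zf_gen) bits.
+ */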
+
+ /*
+ * A zero fid_gen means we are in .zfs or the .zfs/snapshot
+ * directory tree. If the object == zfsvfs->z_shares_dir, then
+ * we are in the .zfs/shares directory tree.
+ */
+ if ((fid_gen == 0 &&
+ (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) ||
+ (zfsvfs->z_shares_dir != 0 && object == zfsvfs->z_shares_dir)) {
+ ZFS_EXIT(zfsvfs);
+ VERIFY0(zfsctl_root(zfsvfs, LK_SHARED, &dvp));
+ if (object == ZFSCTL_INO_SNAPDIR) {
+ cn.cn_nameptr = "snapshot";
+ cn.cn_namelen = strlen(cn.cn_nameptr);
+ cn.cn_nameiop = LOOKUP;
+ cn.cn_flags = ISLASTCN | LOCKLEAF;
+ cn.cn_lkflags = flags;
+ VERIFY0(VOP_LOOKUP(dvp, vpp, &cn));
+ vput(dvp);
+ } else if (object == zfsvfs->z_shares_dir) {
+ /*
+ * XXX This branch must not be taken;
+ * if it is, then the lookup below will
+ * explode.
+ */
+ cn.cn_nameptr = "shares";
+ cn.cn_namelen = strlen(cn.cn_nameptr);
+ cn.cn_nameiop = LOOKUP;
+ cn.cn_flags = ISLASTCN;
+ cn.cn_lkflags = flags;
+ VERIFY0(VOP_LOOKUP(dvp, vpp, &cn));
+ vput(dvp);
+ } else {
+ *vpp = dvp;
+ }
+ return (err);
+ }
+
+ gen_mask = -1ULL >> (64 - 8 * i);
+
+ dprintf("getting %llu [%u mask %llx]\n", object, fid_gen, gen_mask);
+ if ((err = zfs_zget(zfsvfs, object, &zp))) {
+ ZFS_EXIT(zfsvfs);
+ return (err);
+ }
+ (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &zp_gen,
+ sizeof (uint64_t));
+ zp_gen = zp_gen & gen_mask;
+ if (zp_gen == 0)
+ zp_gen = 1;
+ if (zp->z_unlinked || zp_gen != fid_gen) {
+ dprintf("znode gen (%u) != fid gen (%u)\n", zp_gen, fid_gen);
+ vrele(ZTOV(zp));
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EINVAL));
+ }
+
+ *vpp = ZTOV(zp);
+ ZFS_EXIT(zfsvfs);
+ err = vn_lock(*vpp, flags);
+ if (err == 0)
+ vnode_create_vobject(*vpp, zp->z_size, curthread);
+ else
+ *vpp = NULL;
+ return (err);
+}
+
+/*
+ * Block out VOPs and close zfsvfs_t::z_os
+ *
+ * Note, if successful, then we return with the 'z_teardown_lock' and
+ * 'z_teardown_inactive_lock' write held. We leave ownership of the underlying
+ * dataset and objset intact so that they can be atomically handed off during
+ * a subsequent rollback or recv operation and the resume thereafter.
+ */
+int
+zfs_suspend_fs(zfsvfs_t *zfsvfs)
+{
+ int error;
+
+ if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0)
+ return (error);
+
+ return (0);
+}
+
+/*
+ * Rebuild SA and release VOPs. Note that ownership of the underlying dataset
+ * is an invariant across any of the operations that can be performed while the
+ * filesystem was suspended. Whether it succeeded or failed, the preconditions
+ * are the same: the relevant objset and associated dataset are owned by
+ * zfsvfs, held, and long held on entry.
+ */
+int
+zfs_resume_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds)
+{
+ int err;
+ znode_t *zp;
+
+ ASSERT(ZFS_TEARDOWN_WRITE_HELD(zfsvfs));
+ ASSERT(ZFS_TEARDOWN_INACTIVE_WRITE_HELD(zfsvfs));
+
+ /*
+ * We already own this, so just update the objset_t, as the one we
+ * had before may have been evicted.
+ */
+ objset_t *os;
+ VERIFY3P(ds->ds_owner, ==, zfsvfs);
+ VERIFY(dsl_dataset_long_held(ds));
+ dsl_pool_t *dp = spa_get_dsl(dsl_dataset_get_spa(ds));
+ dsl_pool_config_enter(dp, FTAG);
+ VERIFY0(dmu_objset_from_ds(ds, &os));
+ dsl_pool_config_exit(dp, FTAG);
+
+ err = zfsvfs_init(zfsvfs, os);
+ if (err != 0)
+ goto bail;
+
+ ds->ds_dir->dd_activity_cancelled = B_FALSE;
+ VERIFY(zfsvfs_setup(zfsvfs, B_FALSE) == 0);
+
+ zfs_set_fuid_feature(zfsvfs);
+
+ /*
+ * Attempt to re-establish all the active znodes with
+ * their dbufs. If a zfs_rezget() fails, then we'll let
+ * any potential callers discover that via ZFS_ENTER_VERIFY_VP
+ * when they try to use their znode.
+ */
+ mutex_enter(&zfsvfs->z_znodes_lock);
+ for (zp = list_head(&zfsvfs->z_all_znodes); zp;
+ zp = list_next(&zfsvfs->z_all_znodes, zp)) {
+ (void) zfs_rezget(zp);
+ }
+ mutex_exit(&zfsvfs->z_znodes_lock);
+
+bail:
+ /* release the VOPs */
+ ZFS_TEARDOWN_INACTIVE_EXIT_WRITE(zfsvfs);
+ ZFS_TEARDOWN_EXIT(zfsvfs, FTAG);
+
+ if (err) {
+ /*
+ * Since we couldn't setup the sa framework, try to force
+ * unmount this file system.
+ */
+ if (vn_vfswlock(zfsvfs->z_vfs->vfs_vnodecovered) == 0) {
+ vfs_ref(zfsvfs->z_vfs);
+ (void) dounmount(zfsvfs->z_vfs, MS_FORCE, curthread);
+ }
+ }
+ return (err);
+}
+
+static void
+zfs_freevfs(vfs_t *vfsp)
+{
+ zfsvfs_t *zfsvfs = vfsp->vfs_data;
+
+ zfsvfs_free(zfsvfs);
+
+ atomic_dec_32(&zfs_active_fs_count);
+}
+
+#ifdef __i386__
+static int desiredvnodes_backup;
+#include <sys/vmmeter.h>
+
+
+#include <vm/vm_page.h>
+#include <vm/vm_object.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_map.h>
+#endif
+
+static void
+zfs_vnodes_adjust(void)
+{
+#ifdef __i386__
+ int newdesiredvnodes;
+
+ desiredvnodes_backup = desiredvnodes;
+
+ /*
+ * We calculate newdesiredvnodes the same way it is done in
+ * vntblinit(). If it is equal to desiredvnodes, it means that
+ * it wasn't tuned by the administrator and we can tune it down.
+ */
+ newdesiredvnodes = min(maxproc + vm_cnt.v_page_count / 4, 2 *
+ vm_kmem_size / (5 * (sizeof (struct vm_object) +
+ sizeof (struct vnode))));
+ if (newdesiredvnodes == desiredvnodes)
+ desiredvnodes = (3 * newdesiredvnodes) / 4;
+#endif
+}
+
+static void
+zfs_vnodes_adjust_back(void)
+{
+
+#ifdef __i386__
+ desiredvnodes = desiredvnodes_backup;
+#endif
+}
+
+void
+zfs_init(void)
+{
+
+ printf("ZFS filesystem version: " ZPL_VERSION_STRING "\n");
+
+ /*
+ * Initialize .zfs directory structures
+ */
+ zfsctl_init();
+
+ /*
+ * Initialize znode cache, vnode ops, etc...
+ */
+ zfs_znode_init();
+
+ /*
+ * Reduce the number of vnodes. The default is calculated with UFS
+ * inodes in mind, which is too big for ZFS on i386, so we reduce
+ * it here.
+ */
+ zfs_vnodes_adjust();
+
+ dmu_objset_register_type(DMU_OST_ZFS, zpl_get_file_info);
+
+ zfsvfs_taskq = taskq_create("zfsvfs", 1, minclsyspri, 0, 0, 0);
+}
+
+void
+zfs_fini(void)
+{
+ taskq_destroy(zfsvfs_taskq);
+ zfsctl_fini();
+ zfs_znode_fini();
+ zfs_vnodes_adjust_back();
+}
+
+int
+zfs_busy(void)
+{
+ return (zfs_active_fs_count != 0);
+}
+
+/*
+ * Release VOPs and unmount a suspended filesystem.
+ */
+int
+zfs_end_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds)
+{
+ ASSERT(ZFS_TEARDOWN_WRITE_HELD(zfsvfs));
+ ASSERT(ZFS_TEARDOWN_INACTIVE_WRITE_HELD(zfsvfs));
+
+ /*
+ * We already own this, so just hold and rele it to update the
+ * objset_t, as the one we had before may have been evicted.
+ */
+ objset_t *os;
+ VERIFY3P(ds->ds_owner, ==, zfsvfs);
+ VERIFY(dsl_dataset_long_held(ds));
+ dsl_pool_t *dp = spa_get_dsl(dsl_dataset_get_spa(ds));
+ dsl_pool_config_enter(dp, FTAG);
+ VERIFY0(dmu_objset_from_ds(ds, &os));
+ dsl_pool_config_exit(dp, FTAG);
+ zfsvfs->z_os = os;
+
+ /* release the VOPs */
+ ZFS_TEARDOWN_INACTIVE_EXIT_WRITE(zfsvfs);
+ ZFS_TEARDOWN_EXIT(zfsvfs, FTAG);
+
+ /*
+ * Try to force unmount this file system.
+ */
+ (void) zfs_umount(zfsvfs->z_vfs, 0);
+ zfsvfs->z_unmounted = B_TRUE;
+ return (0);
+}
+
+int
+zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers)
+{
+ int error;
+ objset_t *os = zfsvfs->z_os;
+ dmu_tx_t *tx;
+
+ if (newvers < ZPL_VERSION_INITIAL || newvers > ZPL_VERSION)
+ return (SET_ERROR(EINVAL));
+
+ if (newvers < zfsvfs->z_version)
+ return (SET_ERROR(EINVAL));
+
+ if (zfs_spa_version_map(newvers) >
+ spa_version(dmu_objset_spa(zfsvfs->z_os)))
+ return (SET_ERROR(ENOTSUP));
+
+ tx = dmu_tx_create(os);
+ dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_FALSE, ZPL_VERSION_STR);
+ if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) {
+ dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE,
+ ZFS_SA_ATTRS);
+ dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
+ }
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ dmu_tx_abort(tx);
+ return (error);
+ }
+
+ error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR,
+ 8, 1, &newvers, tx);
+
+ if (error) {
+ dmu_tx_commit(tx);
+ return (error);
+ }
+
+ if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) {
+ uint64_t sa_obj;
+
+ ASSERT3U(spa_version(dmu_objset_spa(zfsvfs->z_os)), >=,
+ SPA_VERSION_SA);
+ sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE,
+ DMU_OT_NONE, 0, tx);
+
+ error = zap_add(os, MASTER_NODE_OBJ,
+ ZFS_SA_ATTRS, 8, 1, &sa_obj, tx);
+ ASSERT0(error);
+
+ VERIFY(0 == sa_set_sa_object(os, sa_obj));
+ sa_register_update_callback(os, zfs_sa_upgrade);
+ }
+
+ spa_history_log_internal_ds(dmu_objset_ds(os), "upgrade", tx,
+ "from %ju to %ju", (uintmax_t)zfsvfs->z_version,
+ (uintmax_t)newvers);
+ dmu_tx_commit(tx);
+
+ zfsvfs->z_version = newvers;
+ os->os_version = newvers;
+
+ zfs_set_fuid_feature(zfsvfs);
+
+ return (0);
+}
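+
+/*
+ * Typically this path is driven from userland by the zfs(8) "upgrade"
+ * subcommand, which bumps the on-disk ZPL_VERSION_STR entry through the
+ * transaction assembled above.
+ */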
+
+/*
+ * Read a property stored within the master node.
+ */
+int
+zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value)
+{
+ uint64_t *cached_copy = NULL;
+
+ /*
+ * Figure out where in the objset_t the cached copy would live, if it
+ * is available for the requested property.
+ */
+ if (os != NULL) {
+ switch (prop) {
+ case ZFS_PROP_VERSION:
+ cached_copy = &os->os_version;
+ break;
+ case ZFS_PROP_NORMALIZE:
+ cached_copy = &os->os_normalization;
+ break;
+ case ZFS_PROP_UTF8ONLY:
+ cached_copy = &os->os_utf8only;
+ break;
+ case ZFS_PROP_CASE:
+ cached_copy = &os->os_casesensitivity;
+ break;
+ default:
+ break;
+ }
+ }
+ if (cached_copy != NULL && *cached_copy != OBJSET_PROP_UNINITIALIZED) {
+ *value = *cached_copy;
+ return (0);
+ }
+
+ /*
+ * If the property wasn't cached, look up the file system's value for
+ * the property. For the version property, we look up a slightly
+ * different string.
+ */
+ const char *pname;
+ int error = ENOENT;
+ if (prop == ZFS_PROP_VERSION) {
+ pname = ZPL_VERSION_STR;
+ } else {
+ pname = zfs_prop_to_name(prop);
+ }
+
+ if (os != NULL) {
+ ASSERT3U(os->os_phys->os_type, ==, DMU_OST_ZFS);
+ error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value);
+ }
+
+ if (error == ENOENT) {
+ /* No value set, use the default value */
+ switch (prop) {
+ case ZFS_PROP_VERSION:
+ *value = ZPL_VERSION;
+ break;
+ case ZFS_PROP_NORMALIZE:
+ case ZFS_PROP_UTF8ONLY:
+ *value = 0;
+ break;
+ case ZFS_PROP_CASE:
+ *value = ZFS_CASE_SENSITIVE;
+ break;
+ case ZFS_PROP_ACLTYPE:
+ *value = ZFS_ACLTYPE_NFSV4;
+ break;
+ default:
+ return (error);
+ }
+ error = 0;
+ }
+
+ /*
+ * If one of the methods for getting the property value above worked,
+ * copy it into the objset_t's cache.
+ */
+ if (error == 0 && cached_copy != NULL) {
+ *cached_copy = *value;
+ }
+
+ return (error);
+}
+
+/*
+ * Return true if the corresponding vfs's unmounted flag is set;
+ * otherwise return false. If this function returns true, we know
+ * that a VFS unmount has been initiated.
+ */
+boolean_t
+zfs_get_vfs_flag_unmounted(objset_t *os)
+{
+ zfsvfs_t *zfvp;
+ boolean_t unmounted = B_FALSE;
+
+ ASSERT(dmu_objset_type(os) == DMU_OST_ZFS);
+
+ mutex_enter(&os->os_user_ptr_lock);
+ zfvp = dmu_objset_get_user(os);
+ if (zfvp != NULL && zfvp->z_vfs != NULL &&
+ (zfvp->z_vfs->mnt_kern_flag & MNTK_UNMOUNT))
+ unmounted = B_TRUE;
+ mutex_exit(&os->os_user_ptr_lock);
+
+ return (unmounted);
+}
+
+#ifdef _KERNEL
+void
+zfsvfs_update_fromname(const char *oldname, const char *newname)
+{
+ char tmpbuf[MAXPATHLEN];
+ struct mount *mp;
+ char *fromname;
+ size_t oldlen;
+
+ oldlen = strlen(oldname);
+
+ mtx_lock(&mountlist_mtx);
+ TAILQ_FOREACH(mp, &mountlist, mnt_list) {
+ fromname = mp->mnt_stat.f_mntfromname;
+ if (strcmp(fromname, oldname) == 0) {
+ (void) strlcpy(fromname, newname,
+ sizeof (mp->mnt_stat.f_mntfromname));
+ continue;
+ }
+ if (strncmp(fromname, oldname, oldlen) == 0 &&
+ (fromname[oldlen] == '/' || fromname[oldlen] == '@')) {
+ (void) snprintf(tmpbuf, sizeof (tmpbuf), "%s%s",
+ newname, fromname + oldlen);
+ (void) strlcpy(fromname, tmpbuf,
+ sizeof (mp->mnt_stat.f_mntfromname));
+ continue;
+ }
+ }
+ mtx_unlock(&mountlist_mtx);
+}
+#endif
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c
new file mode 100644
index 000000000000..d5f0da9ecd4b
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c
@@ -0,0 +1,5888 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
+ * Copyright 2017 Nexenta Systems, Inc.
+ */
+
+/* Portions Copyright 2007 Jeremy Teo */
+/* Portions Copyright 2010 Robert Milkowski */
+
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/time.h>
+#include <sys/systm.h>
+#include <sys/sysmacros.h>
+#include <sys/resource.h>
+#include <sys/vfs.h>
+#include <sys/endian.h>
+#include <sys/vm.h>
+#include <sys/vnode.h>
+#if __FreeBSD_version >= 1300102
+#include <sys/smr.h>
+#endif
+#include <sys/dirent.h>
+#include <sys/file.h>
+#include <sys/stat.h>
+#include <sys/kmem.h>
+#include <sys/taskq.h>
+#include <sys/uio.h>
+#include <sys/atomic.h>
+#include <sys/namei.h>
+#include <sys/mman.h>
+#include <sys/cmn_err.h>
+#include <sys/kdb.h>
+#include <sys/sysproto.h>
+#include <sys/errno.h>
+#include <sys/unistd.h>
+#include <sys/zfs_dir.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/fs/zfs.h>
+#include <sys/dmu.h>
+#include <sys/dmu_objset.h>
+#include <sys/spa.h>
+#include <sys/txg.h>
+#include <sys/dbuf.h>
+#include <sys/zap.h>
+#include <sys/sa.h>
+#include <sys/policy.h>
+#include <sys/sunddi.h>
+#include <sys/filio.h>
+#include <sys/sid.h>
+#include <sys/zfs_ctldir.h>
+#include <sys/zfs_fuid.h>
+#include <sys/zfs_quota.h>
+#include <sys/zfs_sa.h>
+#include <sys/zfs_rlock.h>
+#include <sys/extdirent.h>
+#include <sys/bio.h>
+#include <sys/buf.h>
+#include <sys/sched.h>
+#include <sys/acl.h>
+#include <sys/vmmeter.h>
+#include <vm/vm_param.h>
+#include <sys/zil.h>
+#include <sys/zfs_vnops.h>
+
+#include <vm/vm_object.h>
+
+#include <sys/extattr.h>
+#include <sys/priv.h>
+
+#ifndef VN_OPEN_INVFS
+#define VN_OPEN_INVFS 0x0
+#endif
+
+VFS_SMR_DECLARE;
+
+#if __FreeBSD_version >= 1300047
+#define vm_page_wire_lock(pp)
+#define vm_page_wire_unlock(pp)
+#else
+#define vm_page_wire_lock(pp) vm_page_lock(pp)
+#define vm_page_wire_unlock(pp) vm_page_unlock(pp)
+#endif
+
+#ifdef DEBUG_VFS_LOCKS
+#define VNCHECKREF(vp) \
+ VNASSERT((vp)->v_holdcnt > 0 && (vp)->v_usecount > 0, vp, \
+ ("%s: wrong ref counts", __func__));
+#else
+#define VNCHECKREF(vp)
+#endif
+
+/*
+ * Programming rules.
+ *
+ * Each vnode op performs some logical unit of work. To do this, the ZPL must
+ * properly lock its in-core state, create a DMU transaction, do the work,
+ * record this work in the intent log (ZIL), commit the DMU transaction,
+ * and wait for the intent log to commit if it is a synchronous operation.
+ * Moreover, the vnode ops must work in both normal and log replay context.
+ * The ordering of events is important to avoid deadlocks and references
+ * to freed memory. The example below illustrates the following Big Rules:
+ *
+ * (1) A check must be made in each zfs thread for a mounted file system.
+ * This is done while avoiding races by using ZFS_ENTER(zfsvfs).
+ * A ZFS_EXIT(zfsvfs) is needed before all returns. Any znodes
+ * must be checked with ZFS_VERIFY_ZP(zp). Both of these macros
+ * can return EIO from the calling function.
+ *
+ * (2) VN_RELE() should always be the last thing except for zil_commit()
+ * (if necessary) and ZFS_EXIT(). This is for 3 reasons:
+ * First, if it's the last reference, the vnode/znode
+ * can be freed, so the zp may point to freed memory. Second, the last
+ * reference will call zfs_zinactive(), which may induce a lot of work --
+ * pushing cached pages (which acquires range locks) and syncing out
+ * cached atime changes. Third, zfs_zinactive() may require a new tx,
+ * which could deadlock the system if you were already holding one.
+ * If you must call VN_RELE() within a tx then use VN_RELE_ASYNC().
+ *
+ * (3) All range locks must be grabbed before calling dmu_tx_assign(),
+ * as they can span dmu_tx_assign() calls.
+ *
+ * (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to
+ * dmu_tx_assign(). This is critical because we don't want to block
+ * while holding locks.
+ *
+ * If no ZPL locks are held (aside from ZFS_ENTER()), use TXG_WAIT. This
+ * reduces lock contention and CPU usage when we must wait (note that if
+ * throughput is constrained by the storage, nearly every transaction
+ * must wait).
+ *
+ * Note, in particular, that if a lock is sometimes acquired before
+ * the tx assigns, and sometimes after (e.g. z_lock), then failing
+ * to use a non-blocking assign can deadlock the system. The scenario:
+ *
+ * Thread A has grabbed a lock before calling dmu_tx_assign().
+ * Thread B is in an already-assigned tx, and blocks for this lock.
+ * Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open()
+ * forever, because the previous txg can't quiesce until B's tx commits.
+ *
+ * If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT,
+ * then drop all locks, call dmu_tx_wait(), and try again. On subsequent
+ * calls to dmu_tx_assign(), pass TXG_NOTHROTTLE in addition to TXG_NOWAIT,
+ * to indicate that this operation has already called dmu_tx_wait().
+ * This will ensure that we don't retry forever, waiting a short bit
+ * each time.
+ *
+ * (5) If the operation succeeded, generate the intent log entry for it
+ * before dropping locks. This ensures that the ordering of events
+ * in the intent log matches the order in which they actually occurred.
+ * During ZIL replay the zfs_log_* functions will update the sequence
+ * number to indicate the zil transaction has replayed.
+ *
+ * (6) At the end of each vnode op, the DMU tx must always commit,
+ * regardless of whether there were any errors.
+ *
+ * (7) After dropping all locks, invoke zil_commit(zilog, foid)
+ * to ensure that synchronous semantics are provided when necessary.
+ *
+ * In general, this is how things should be ordered in each vnode op:
+ *
+ * ZFS_ENTER(zfsvfs); // exit if unmounted
+ * top:
+ * zfs_dirent_lookup(&dl, ...) // lock directory entry (may VN_HOLD())
+ * rw_enter(...); // grab any other locks you need
+ * tx = dmu_tx_create(...); // get DMU tx
+ * dmu_tx_hold_*(); // hold each object you might modify
+ * error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
+ * if (error) {
+ * rw_exit(...); // drop locks
+ * zfs_dirent_unlock(dl); // unlock directory entry
+ * VN_RELE(...); // release held vnodes
+ * if (error == ERESTART) {
+ * waited = B_TRUE;
+ * dmu_tx_wait(tx);
+ * dmu_tx_abort(tx);
+ * goto top;
+ * }
+ * dmu_tx_abort(tx); // abort DMU tx
+ * ZFS_EXIT(zfsvfs); // finished in zfs
+ * return (error); // really out of space
+ * }
+ * error = do_real_work(); // do whatever this VOP does
+ * if (error == 0)
+ * zfs_log_*(...); // on success, make ZIL entry
+ * dmu_tx_commit(tx); // commit DMU tx -- error or not
+ * rw_exit(...); // drop locks
+ * zfs_dirent_unlock(dl); // unlock directory entry
+ * VN_RELE(...); // release held vnodes
+ * zil_commit(zilog, foid); // synchronous when necessary
+ * ZFS_EXIT(zfsvfs); // finished in zfs
+ * return (error); // done, report error
+ */
+
+/* ARGSUSED */
+static int
+zfs_open(vnode_t **vpp, int flag, cred_t *cr)
+{
+ znode_t *zp = VTOZ(*vpp);
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ if ((flag & FWRITE) && (zp->z_pflags & ZFS_APPENDONLY) &&
+ ((flag & FAPPEND) == 0)) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EPERM));
+ }
+
+ if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
+ ZTOV(zp)->v_type == VREG &&
+ !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) {
+ if (fs_vscan(*vpp, cr, 0) != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EACCES));
+ }
+ }
+
+ /* Keep a count of the synchronous opens in the znode */
+ if (flag & (FSYNC | FDSYNC))
+ atomic_inc_32(&zp->z_sync_cnt);
+
+ ZFS_EXIT(zfsvfs);
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr)
+{
+ znode_t *zp = VTOZ(vp);
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ /* Decrement the synchronous opens in the znode */
+ if ((flag & (FSYNC | FDSYNC)) && (count == 1))
+ atomic_dec_32(&zp->z_sync_cnt);
+
+ if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
+ ZTOV(zp)->v_type == VREG &&
+ !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0)
+ VERIFY(fs_vscan(vp, cr, 1) == 0);
+
+ ZFS_EXIT(zfsvfs);
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+zfs_ioctl(vnode_t *vp, ulong_t com, intptr_t data, int flag, cred_t *cred,
+ int *rvalp)
+{
+ loff_t off;
+ int error;
+
+ switch (com) {
+ case _FIOFFS:
+ {
+ return (0);
+
+ /*
+		 * The following two ioctls are used by bfu.  Faking them out
+		 * is necessary to avoid bfu errors.
+ */
+ }
+ case _FIOGDIO:
+ case _FIOSDIO:
+ {
+ return (0);
+ }
+
+ case F_SEEK_DATA:
+ case F_SEEK_HOLE:
+ {
+ off = *(offset_t *)data;
+ /* offset parameter is in/out */
+ error = zfs_holey(VTOZ(vp), com, &off);
+ if (error)
+ return (error);
+ *(offset_t *)data = off;
+ return (0);
+ }
+ }
+ return (SET_ERROR(ENOTTY));
+}
+
+static vm_page_t
+page_busy(vnode_t *vp, int64_t start, int64_t off, int64_t nbytes)
+{
+ vm_object_t obj;
+ vm_page_t pp;
+ int64_t end;
+
+ /*
+ * At present vm_page_clear_dirty extends the cleared range to DEV_BSIZE
+ * aligned boundaries, if the range is not aligned. As a result a
+ * DEV_BSIZE subrange with partially dirty data may get marked as clean.
+ * It may happen that all DEV_BSIZE subranges are marked clean and thus
+	 * the whole page would be considered clean despite having some
+ * dirty data.
+ * For this reason we should shrink the range to DEV_BSIZE aligned
+ * boundaries before calling vm_page_clear_dirty.
+ */
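+	/*
+	 * Illustrative example (assuming DEV_BSIZE is 512): for off = 100 and
+	 * nbytes = 1000 the written bytes span [100, 1100); roundup2() and
+	 * rounddown2() shrink that to [512, 1024), so only the fully covered
+	 * 512-byte block is cleared and the partially written blocks at both
+	 * ends remain dirty.
+	 */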
+ end = rounddown2(off + nbytes, DEV_BSIZE);
+ off = roundup2(off, DEV_BSIZE);
+ nbytes = end - off;
+
+ obj = vp->v_object;
+ zfs_vmobject_assert_wlocked_12(obj);
+#if __FreeBSD_version < 1300050
+ for (;;) {
+ if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
+ pp->valid) {
+ if (vm_page_xbusied(pp)) {
+ /*
+ * Reference the page before unlocking and
+ * sleeping so that the page daemon is less
+ * likely to reclaim it.
+ */
+ vm_page_reference(pp);
+ vm_page_lock(pp);
+ zfs_vmobject_wunlock(obj);
+ vm_page_busy_sleep(pp, "zfsmwb", true);
+ zfs_vmobject_wlock(obj);
+ continue;
+ }
+ vm_page_sbusy(pp);
+ } else if (pp != NULL) {
+ ASSERT(!pp->valid);
+ pp = NULL;
+ }
+ if (pp != NULL) {
+ ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
+ vm_object_pip_add(obj, 1);
+ pmap_remove_write(pp);
+ if (nbytes != 0)
+ vm_page_clear_dirty(pp, off, nbytes);
+ }
+ break;
+ }
+#else
+ vm_page_grab_valid_unlocked(&pp, obj, OFF_TO_IDX(start),
+ VM_ALLOC_NOCREAT | VM_ALLOC_SBUSY | VM_ALLOC_NORMAL |
+ VM_ALLOC_IGN_SBUSY);
+ if (pp != NULL) {
+ ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
+ vm_object_pip_add(obj, 1);
+ pmap_remove_write(pp);
+ if (nbytes != 0)
+ vm_page_clear_dirty(pp, off, nbytes);
+ }
+#endif
+ return (pp);
+}
+
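+/*
+ * Undo page_busy(): drop the shared-busy reference on the page and release
+ * the paging-in-progress count taken on its VM object.
+ */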
+static void
+page_unbusy(vm_page_t pp)
+{
+
+ vm_page_sunbusy(pp);
+#if __FreeBSD_version >= 1300041
+ vm_object_pip_wakeup(pp->object);
+#else
+ vm_object_pip_subtract(pp->object, 1);
+#endif
+}
+
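+/*
+ * Take a reference (a wiring on newer FreeBSD, a hold on older versions) on
+ * the resident, valid page backing the given offset so that mappedread() can
+ * copy from it after dropping the object lock.  Returns NULL if no such page
+ * is present.
+ */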
+#if __FreeBSD_version > 1300051
+static vm_page_t
+page_hold(vnode_t *vp, int64_t start)
+{
+ vm_object_t obj;
+ vm_page_t m;
+
+ obj = vp->v_object;
+ vm_page_grab_valid_unlocked(&m, obj, OFF_TO_IDX(start),
+ VM_ALLOC_NOCREAT | VM_ALLOC_WIRED | VM_ALLOC_IGN_SBUSY |
+ VM_ALLOC_NOBUSY);
+ return (m);
+}
+#else
+static vm_page_t
+page_hold(vnode_t *vp, int64_t start)
+{
+ vm_object_t obj;
+ vm_page_t pp;
+
+ obj = vp->v_object;
+ zfs_vmobject_assert_wlocked(obj);
+
+ for (;;) {
+ if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
+ pp->valid) {
+ if (vm_page_xbusied(pp)) {
+ /*
+ * Reference the page before unlocking and
+ * sleeping so that the page daemon is less
+ * likely to reclaim it.
+ */
+ vm_page_reference(pp);
+ vm_page_lock(pp);
+ zfs_vmobject_wunlock(obj);
+ vm_page_busy_sleep(pp, "zfsmwb", true);
+ zfs_vmobject_wlock(obj);
+ continue;
+ }
+
+ ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
+ vm_page_wire_lock(pp);
+ vm_page_hold(pp);
+ vm_page_wire_unlock(pp);
+
+ } else
+ pp = NULL;
+ break;
+ }
+ return (pp);
+}
+#endif
+
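+/*
+ * Drop the wiring/hold acquired by page_hold().
+ */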
+static void
+page_unhold(vm_page_t pp)
+{
+
+ vm_page_wire_lock(pp);
+#if __FreeBSD_version >= 1300035
+ vm_page_unwire(pp, PQ_ACTIVE);
+#else
+ vm_page_unhold(pp);
+#endif
+ vm_page_wire_unlock(pp);
+}
+
+/*
+ * When a file is memory mapped, we must keep the IO data synchronized
+ * between the DMU cache and the memory mapped pages. What this means:
+ *
+ * On Write: If we find a memory mapped page, we write to *both*
+ * the page and the dmu buffer.
+ */
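+/*
+ * Implementation note: the loop below walks the written range one page at a
+ * time; every page that is resident in the vnode's VM object is
+ * shared-busied, mapped through an sf_buf, and refreshed from the DMU with
+ * dmu_read() so the mapped copy never goes stale.
+ */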
+void
+update_pages(znode_t *zp, int64_t start, int len, objset_t *os)
+{
+ vm_object_t obj;
+ struct sf_buf *sf;
+ vnode_t *vp = ZTOV(zp);
+ caddr_t va;
+ int off;
+
+ ASSERT(vp->v_mount != NULL);
+ obj = vp->v_object;
+ ASSERT(obj != NULL);
+
+ off = start & PAGEOFFSET;
+ zfs_vmobject_wlock_12(obj);
+#if __FreeBSD_version >= 1300041
+ vm_object_pip_add(obj, 1);
+#endif
+ for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
+ vm_page_t pp;
+ int nbytes = imin(PAGESIZE - off, len);
+
+ if ((pp = page_busy(vp, start, off, nbytes)) != NULL) {
+ zfs_vmobject_wunlock_12(obj);
+
+ va = zfs_map_page(pp, &sf);
+ (void) dmu_read(os, zp->z_id, start + off, nbytes,
+ va + off, DMU_READ_PREFETCH);
+ zfs_unmap_page(sf);
+
+ zfs_vmobject_wlock_12(obj);
+ page_unbusy(pp);
+ }
+ len -= nbytes;
+ off = 0;
+ }
+#if __FreeBSD_version >= 1300041
+ vm_object_pip_wakeup(obj);
+#else
+ vm_object_pip_wakeupn(obj, 0);
+#endif
+ zfs_vmobject_wunlock_12(obj);
+}
+
+/*
+ * Read with UIO_NOCOPY flag means that sendfile(2) requests
+ * ZFS to populate a range of page cache pages with data.
+ *
+ * NOTE: this function could be optimized to pre-allocate
+ * all pages in advance, drain exclusive busy on all of them,
+ * map them into contiguous KVA region and populate them
+ * in one single dmu_read() call.
+ */
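+/*
+ * Implementation note: each page in the range is grabbed shared-busy; a page
+ * that is not yet valid is filled from the DMU (with any tail past the
+ * requested bytes zeroed) and, if the read succeeded, marked valid and
+ * activated.  Pages that are already valid are simply unbusied again.
+ */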
+int
+mappedread_sf(znode_t *zp, int nbytes, zfs_uio_t *uio)
+{
+ vnode_t *vp = ZTOV(zp);
+ objset_t *os = zp->z_zfsvfs->z_os;
+ struct sf_buf *sf;
+ vm_object_t obj;
+ vm_page_t pp;
+ int64_t start;
+ caddr_t va;
+ int len = nbytes;
+ int error = 0;
+
+ ASSERT(zfs_uio_segflg(uio) == UIO_NOCOPY);
+ ASSERT(vp->v_mount != NULL);
+ obj = vp->v_object;
+ ASSERT(obj != NULL);
+ ASSERT((zfs_uio_offset(uio) & PAGEOFFSET) == 0);
+
+ zfs_vmobject_wlock_12(obj);
+ for (start = zfs_uio_offset(uio); len > 0; start += PAGESIZE) {
+ int bytes = MIN(PAGESIZE, len);
+
+ pp = vm_page_grab_unlocked(obj, OFF_TO_IDX(start),
+ VM_ALLOC_SBUSY | VM_ALLOC_NORMAL | VM_ALLOC_IGN_SBUSY);
+ if (vm_page_none_valid(pp)) {
+ zfs_vmobject_wunlock_12(obj);
+ va = zfs_map_page(pp, &sf);
+ error = dmu_read(os, zp->z_id, start, bytes, va,
+ DMU_READ_PREFETCH);
+ if (bytes != PAGESIZE && error == 0)
+ bzero(va + bytes, PAGESIZE - bytes);
+ zfs_unmap_page(sf);
+ zfs_vmobject_wlock_12(obj);
+#if __FreeBSD_version >= 1300081
+ if (error == 0) {
+ vm_page_valid(pp);
+ vm_page_activate(pp);
+ vm_page_do_sunbusy(pp);
+ } else {
+ zfs_vmobject_wlock(obj);
+ if (!vm_page_wired(pp) && pp->valid == 0 &&
+ vm_page_busy_tryupgrade(pp))
+ vm_page_free(pp);
+ else
+ vm_page_sunbusy(pp);
+ zfs_vmobject_wunlock(obj);
+ }
+#else
+ vm_page_do_sunbusy(pp);
+ vm_page_lock(pp);
+ if (error) {
+ if (pp->wire_count == 0 && pp->valid == 0 &&
+ !vm_page_busied(pp))
+ vm_page_free(pp);
+ } else {
+ pp->valid = VM_PAGE_BITS_ALL;
+ vm_page_activate(pp);
+ }
+ vm_page_unlock(pp);
+#endif
+ } else {
+ ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
+ vm_page_do_sunbusy(pp);
+ }
+ if (error)
+ break;
+ zfs_uio_advance(uio, bytes);
+ len -= bytes;
+ }
+ zfs_vmobject_wunlock_12(obj);
+ return (error);
+}
+
+/*
+ * When a file is memory mapped, we must keep the IO data synchronized
+ * between the DMU cache and the memory mapped pages. What this means:
+ *
+ * On Read: We "read" preferentially from memory mapped pages,
+ *		else we fall back to the dmu buffer.
+ *
+ * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
+ * the file is memory mapped.
+ */
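+/*
+ * Implementation note: for each page-sized chunk we first try page_hold();
+ * if the page is resident, its contents are copied out with
+ * vn_io_fault_uiomove(), otherwise the chunk is read straight from the DMU
+ * via dmu_read_uio_dbuf().
+ */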
+int
+mappedread(znode_t *zp, int nbytes, zfs_uio_t *uio)
+{
+ vnode_t *vp = ZTOV(zp);
+ vm_object_t obj;
+ int64_t start;
+ int len = nbytes;
+ int off;
+ int error = 0;
+
+ ASSERT(vp->v_mount != NULL);
+ obj = vp->v_object;
+ ASSERT(obj != NULL);
+
+ start = zfs_uio_offset(uio);
+ off = start & PAGEOFFSET;
+ zfs_vmobject_wlock_12(obj);
+ for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
+ vm_page_t pp;
+ uint64_t bytes = MIN(PAGESIZE - off, len);
+
+ if ((pp = page_hold(vp, start))) {
+ struct sf_buf *sf;
+ caddr_t va;
+
+ zfs_vmobject_wunlock_12(obj);
+ va = zfs_map_page(pp, &sf);
+ error = vn_io_fault_uiomove(va + off, bytes,
+ GET_UIO_STRUCT(uio));
+ zfs_unmap_page(sf);
+ zfs_vmobject_wlock_12(obj);
+ page_unhold(pp);
+ } else {
+ zfs_vmobject_wunlock_12(obj);
+ error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
+ uio, bytes);
+ zfs_vmobject_wlock_12(obj);
+ }
+ len -= bytes;
+ off = 0;
+ if (error)
+ break;
+ }
+ zfs_vmobject_wunlock_12(obj);
+ return (error);
+}
+
+int
+zfs_write_simple(znode_t *zp, const void *data, size_t len,
+ loff_t pos, size_t *presid)
+{
+ int error = 0;
+ ssize_t resid;
+
+ error = vn_rdwr(UIO_WRITE, ZTOV(zp), __DECONST(void *, data), len, pos,
+ UIO_SYSSPACE, IO_SYNC, kcred, NOCRED, &resid, curthread);
+
+ if (error) {
+ return (SET_ERROR(error));
+ } else if (presid == NULL) {
+ if (resid != 0) {
+ error = SET_ERROR(EIO);
+ }
+ } else {
+ *presid = resid;
+ }
+ return (error);
+}
+
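+/*
+ * Release a znode's vnode reference, deferring what may be the final release
+ * to the dsl pool's zrele taskq.  This keeps zfs_zinactive() work, which may
+ * need its own transaction, out of the caller's context (see Big Rule (2)
+ * above).
+ */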
+void
+zfs_zrele_async(znode_t *zp)
+{
+ vnode_t *vp = ZTOV(zp);
+ objset_t *os = ITOZSB(vp)->z_os;
+
+ VN_RELE_ASYNC(vp, dsl_pool_zrele_taskq(dmu_objset_pool(os)));
+}
+
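+/*
+ * Callback for vn_vget_ino_gen(), used by zfs_lookup_lock() for ".."
+ * lookups: lock the vnode passed in via 'arg' with the requested lock flags
+ * and drop the reference if locking fails.
+ */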
+static int
+zfs_dd_callback(struct mount *mp, void *arg, int lkflags, struct vnode **vpp)
+{
+ int error;
+
+ *vpp = arg;
+ error = vn_lock(*vpp, lkflags);
+ if (error != 0)
+ vrele(*vpp);
+ return (error);
+}
+
+static int
+zfs_lookup_lock(vnode_t *dvp, vnode_t *vp, const char *name, int lkflags)
+{
+ znode_t *zdp = VTOZ(dvp);
+ zfsvfs_t *zfsvfs __unused = zdp->z_zfsvfs;
+ int error;
+ int ltype;
+
+ if (zfsvfs->z_replay == B_FALSE)
+ ASSERT_VOP_LOCKED(dvp, __func__);
+
+ if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) {
+ ASSERT3P(dvp, ==, vp);
+ vref(dvp);
+ ltype = lkflags & LK_TYPE_MASK;
+ if (ltype != VOP_ISLOCKED(dvp)) {
+ if (ltype == LK_EXCLUSIVE)
+ vn_lock(dvp, LK_UPGRADE | LK_RETRY);
+ else /* if (ltype == LK_SHARED) */
+ vn_lock(dvp, LK_DOWNGRADE | LK_RETRY);
+
+ /*
+ * Relock for the "." case could leave us with
+ * reclaimed vnode.
+ */
+ if (VN_IS_DOOMED(dvp)) {
+ vrele(dvp);
+ return (SET_ERROR(ENOENT));
+ }
+ }
+ return (0);
+ } else if (name[0] == '.' && name[1] == '.' && name[2] == 0) {
+ /*
+ * Note that in this case, dvp is the child vnode, and we
+ * are looking up the parent vnode - exactly reverse from
+ * normal operation. Unlocking dvp requires some rather
+ * tricky unlock/relock dance to prevent mp from being freed;
+ * use vn_vget_ino_gen() which takes care of all that.
+ *
+ * XXX Note that there is a time window when both vnodes are
+ * unlocked. It is possible, although highly unlikely, that
+ * during that window the parent-child relationship between
+ * the vnodes may change, for example, get reversed.
+ * In that case we would have a wrong lock order for the vnodes.
+ * All other filesystems seem to ignore this problem, so we
+ * do the same here.
+ * A potential solution could be implemented as follows:
+ * - using LK_NOWAIT when locking the second vnode and retrying
+ * if necessary
+ * - checking that the parent-child relationship still holds
+ * after locking both vnodes and retrying if it doesn't
+ */
+ error = vn_vget_ino_gen(dvp, zfs_dd_callback, vp, lkflags, &vp);
+ return (error);
+ } else {
+ error = vn_lock(vp, lkflags);
+ if (error != 0)
+ vrele(vp);
+ return (error);
+ }
+}
+
+/*
+ * Lookup an entry in a directory, or an extended attribute directory.
+ * If it exists, return a held vnode reference for it.
+ *
+ * IN: dvp - vnode of directory to search.
+ * nm - name of entry to lookup.
+ * pnp - full pathname to lookup [UNUSED].
+ * flags - LOOKUP_XATTR set if looking for an attribute.
+ * rdir - root directory vnode [UNUSED].
+ * cr - credentials of caller.
+ * ct - caller context
+ *
+ * OUT: vpp - vnode of located entry, NULL if not found.
+ *
+ * RETURN: 0 on success, error code on failure.
+ *
+ * Timestamps:
+ * NA
+ */
+/* ARGSUSED */
+static int
+zfs_lookup(vnode_t *dvp, const char *nm, vnode_t **vpp,
+ struct componentname *cnp, int nameiop, cred_t *cr, kthread_t *td,
+ int flags, boolean_t cached)
+{
+ znode_t *zdp = VTOZ(dvp);
+ znode_t *zp;
+ zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
+ int error = 0;
+
+ /*
+	 * Fast path lookup; however, we must skip the DNLC lookup
+ * for case folding or normalizing lookups because the
+ * DNLC code only stores the passed in name. This means
+ * creating 'a' and removing 'A' on a case insensitive
+ * file system would work, but DNLC still thinks 'a'
+ * exists and won't let you create it again on the next
+ * pass through fast path.
+ */
+ if (!(flags & LOOKUP_XATTR)) {
+ if (dvp->v_type != VDIR) {
+ return (SET_ERROR(ENOTDIR));
+ } else if (zdp->z_sa_hdl == NULL) {
+ return (SET_ERROR(EIO));
+ }
+ }
+
+ DTRACE_PROBE2(zfs__fastpath__lookup__miss, vnode_t *, dvp,
+ const char *, nm);
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zdp);
+
+ *vpp = NULL;
+
+ if (flags & LOOKUP_XATTR) {
+ /*
+ * If the xattr property is off, refuse the lookup request.
+ */
+ if (!(zfsvfs->z_flags & ZSB_XATTR)) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EOPNOTSUPP));
+ }
+
+ /*
+ * We don't allow recursive attributes..
+ * Maybe someday we will.
+ */
+ if (zdp->z_pflags & ZFS_XATTR) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EINVAL));
+ }
+
+ if ((error = zfs_get_xattrdir(VTOZ(dvp), &zp, cr, flags))) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+ *vpp = ZTOV(zp);
+
+ /*
+ * Do we have permission to get into attribute directory?
+ */
+ error = zfs_zaccess(zp, ACE_EXECUTE, 0, B_FALSE, cr);
+ if (error) {
+ vrele(ZTOV(zp));
+ }
+
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ /*
+ * Check accessibility of directory if we're not coming in via
+ * VOP_CACHEDLOOKUP.
+ */
+ if (!cached) {
+#ifdef NOEXECCHECK
+ if ((cnp->cn_flags & NOEXECCHECK) != 0) {
+ cnp->cn_flags &= ~NOEXECCHECK;
+ } else
+#endif
+ if ((error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr))) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+ }
+
+ if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm),
+ NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EILSEQ));
+ }
+
+
+ /*
+ * First handle the special cases.
+ */
+ if ((cnp->cn_flags & ISDOTDOT) != 0) {
+ /*
+ * If we are a snapshot mounted under .zfs, return
+ * the vp for the snapshot directory.
+ */
+ if (zdp->z_id == zfsvfs->z_root && zfsvfs->z_parent != zfsvfs) {
+ struct componentname cn;
+ vnode_t *zfsctl_vp;
+ int ltype;
+
+ ZFS_EXIT(zfsvfs);
+ ltype = VOP_ISLOCKED(dvp);
+ VOP_UNLOCK1(dvp);
+ error = zfsctl_root(zfsvfs->z_parent, LK_SHARED,
+ &zfsctl_vp);
+ if (error == 0) {
+ cn.cn_nameptr = "snapshot";
+ cn.cn_namelen = strlen(cn.cn_nameptr);
+ cn.cn_nameiop = cnp->cn_nameiop;
+ cn.cn_flags = cnp->cn_flags & ~ISDOTDOT;
+ cn.cn_lkflags = cnp->cn_lkflags;
+ error = VOP_LOOKUP(zfsctl_vp, vpp, &cn);
+ vput(zfsctl_vp);
+ }
+ vn_lock(dvp, ltype | LK_RETRY);
+ return (error);
+ }
+ }
+ if (zfs_has_ctldir(zdp) && strcmp(nm, ZFS_CTLDIR_NAME) == 0) {
+ ZFS_EXIT(zfsvfs);
+ if ((cnp->cn_flags & ISLASTCN) != 0 && nameiop != LOOKUP)
+ return (SET_ERROR(ENOTSUP));
+ error = zfsctl_root(zfsvfs, cnp->cn_lkflags, vpp);
+ return (error);
+ }
+
+ /*
+	 * The loop retries the lookup if the parent-child relationship
+	 * changes while we work through the dot-dot locking complexities.
+ */
+ for (;;) {
+ uint64_t parent;
+
+ error = zfs_dirlook(zdp, nm, &zp);
+ if (error == 0)
+ *vpp = ZTOV(zp);
+
+ ZFS_EXIT(zfsvfs);
+ if (error != 0)
+ break;
+
+ error = zfs_lookup_lock(dvp, *vpp, nm, cnp->cn_lkflags);
+ if (error != 0) {
+ /*
+ * If we've got a locking error, then the vnode
+ * got reclaimed because of a force unmount.
+ * We never enter doomed vnodes into the name cache.
+ */
+ *vpp = NULL;
+ return (error);
+ }
+
+ if ((cnp->cn_flags & ISDOTDOT) == 0)
+ break;
+
+ ZFS_ENTER(zfsvfs);
+ if (zdp->z_sa_hdl == NULL) {
+ error = SET_ERROR(EIO);
+ } else {
+ error = sa_lookup(zdp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
+ &parent, sizeof (parent));
+ }
+ if (error != 0) {
+ ZFS_EXIT(zfsvfs);
+ vput(ZTOV(zp));
+ break;
+ }
+ if (zp->z_id == parent) {
+ ZFS_EXIT(zfsvfs);
+ break;
+ }
+ vput(ZTOV(zp));
+ }
+
+ if (error != 0)
+ *vpp = NULL;
+
+ /* Translate errors and add SAVENAME when needed. */
+ if (cnp->cn_flags & ISLASTCN) {
+ switch (nameiop) {
+ case CREATE:
+ case RENAME:
+ if (error == ENOENT) {
+ error = EJUSTRETURN;
+ cnp->cn_flags |= SAVENAME;
+ break;
+ }
+ /* FALLTHROUGH */
+ case DELETE:
+ if (error == 0)
+ cnp->cn_flags |= SAVENAME;
+ break;
+ }
+ }
+
+ /* Insert name into cache (as non-existent) if appropriate. */
+ if (zfsvfs->z_use_namecache && !zfsvfs->z_replay &&
+ error == ENOENT && (cnp->cn_flags & MAKEENTRY) != 0)
+ cache_enter(dvp, NULL, cnp);
+
+ /* Insert name into cache if appropriate. */
+ if (zfsvfs->z_use_namecache && !zfsvfs->z_replay &&
+ error == 0 && (cnp->cn_flags & MAKEENTRY)) {
+ if (!(cnp->cn_flags & ISLASTCN) ||
+ (nameiop != DELETE && nameiop != RENAME)) {
+ cache_enter(dvp, *vpp, cnp);
+ }
+ }
+
+ return (error);
+}
+
+/*
+ * Attempt to create a new entry in a directory. If the entry
+ * already exists, truncate the file if permissible, else return
+ * an error. Return the vp of the created or trunc'd file.
+ *
+ * IN: dvp - vnode of directory to put new file entry in.
+ * name - name of new file entry.
+ * vap - attributes of new file.
+ * excl - flag indicating exclusive or non-exclusive mode.
+ * mode - mode to open file with.
+ * cr - credentials of caller.
+ * flag - large file flag [UNUSED].
+ * ct - caller context
+ * vsecp - ACL to be set
+ *
+ * OUT: vpp - vnode of created or trunc'd entry.
+ *
+ * RETURN: 0 on success, error code on failure.
+ *
+ * Timestamps:
+ * dvp - ctime|mtime updated if new entry created
+ * vp - ctime|mtime always, atime if new
+ */
+
+/* ARGSUSED */
+int
+zfs_create(znode_t *dzp, const char *name, vattr_t *vap, int excl, int mode,
+ znode_t **zpp, cred_t *cr, int flag, vsecattr_t *vsecp)
+{
+ znode_t *zp;
+ zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+ zilog_t *zilog;
+ objset_t *os;
+ dmu_tx_t *tx;
+ int error;
+ ksid_t *ksid;
+ uid_t uid;
+ gid_t gid = crgetgid(cr);
+ uint64_t projid = ZFS_DEFAULT_PROJID;
+ zfs_acl_ids_t acl_ids;
+ boolean_t fuid_dirtied;
+ uint64_t txtype;
+#ifdef DEBUG_VFS_LOCKS
+ vnode_t *dvp = ZTOV(dzp);
+#endif
+
+ /*
+ * If we have an ephemeral id, ACL, or XVATTR then
+ * make sure file system is at proper version
+ */
+
+ ksid = crgetsid(cr, KSID_OWNER);
+ if (ksid)
+ uid = ksid_getid(ksid);
+ else
+ uid = crgetuid(cr);
+
+ if (zfsvfs->z_use_fuids == B_FALSE &&
+ (vsecp || (vap->va_mask & AT_XVATTR) ||
+ IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
+ return (SET_ERROR(EINVAL));
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(dzp);
+ os = zfsvfs->z_os;
+ zilog = zfsvfs->z_log;
+
+ if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
+ NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EILSEQ));
+ }
+
+ if (vap->va_mask & AT_XVATTR) {
+ if ((error = secpolicy_xvattr(ZTOV(dzp), (xvattr_t *)vap,
+ crgetuid(cr), cr, vap->va_type)) != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+ }
+
+ *zpp = NULL;
+
+ if ((vap->va_mode & S_ISVTX) && secpolicy_vnode_stky_modify(cr))
+ vap->va_mode &= ~S_ISVTX;
+
+ error = zfs_dirent_lookup(dzp, name, &zp, ZNEW);
+ if (error) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+ ASSERT3P(zp, ==, NULL);
+
+ /*
+ * Create a new file object and update the directory
+ * to reference it.
+ */
+ if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) {
+ goto out;
+ }
+
+ /*
+ * We only support the creation of regular files in
+ * extended attribute directories.
+ */
+
+ if ((dzp->z_pflags & ZFS_XATTR) &&
+ (vap->va_type != VREG)) {
+ error = SET_ERROR(EINVAL);
+ goto out;
+ }
+
+ if ((error = zfs_acl_ids_create(dzp, 0, vap,
+ cr, vsecp, &acl_ids)) != 0)
+ goto out;
+
+ if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode))
+ projid = zfs_inherit_projid(dzp);
+ if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, projid)) {
+ zfs_acl_ids_free(&acl_ids);
+ error = SET_ERROR(EDQUOT);
+ goto out;
+ }
+
+ getnewvnode_reserve_();
+
+ tx = dmu_tx_create(os);
+
+ dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
+ ZFS_SA_BASE_ATTR_SIZE);
+
+ fuid_dirtied = zfsvfs->z_fuid_dirty;
+ if (fuid_dirtied)
+ zfs_fuid_txhold(zfsvfs, tx);
+ dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
+ dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
+ if (!zfsvfs->z_use_sa &&
+ acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
+ dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
+ 0, acl_ids.z_aclp->z_acl_bytes);
+ }
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ zfs_acl_ids_free(&acl_ids);
+ dmu_tx_abort(tx);
+ getnewvnode_drop_reserve();
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+ zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
+ if (fuid_dirtied)
+ zfs_fuid_sync(zfsvfs, tx);
+
+ (void) zfs_link_create(dzp, name, zp, tx, ZNEW);
+ txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
+ zfs_log_create(zilog, tx, txtype, dzp, zp, name,
+ vsecp, acl_ids.z_fuidp, vap);
+ zfs_acl_ids_free(&acl_ids);
+ dmu_tx_commit(tx);
+
+ getnewvnode_drop_reserve();
+
+out:
+ VNCHECKREF(dvp);
+ if (error == 0) {
+ *zpp = zp;
+ }
+
+ if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ zil_commit(zilog, 0);
+
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+/*
+ * Remove an entry from a directory.
+ *
+ * IN: dvp - vnode of directory to remove entry from.
+ * name - name of entry to remove.
+ * cr - credentials of caller.
+ * ct - caller context
+ * flags - case flags
+ *
+ * RETURN: 0 on success, error code on failure.
+ *
+ * Timestamps:
+ * dvp - ctime|mtime
+ * vp - ctime (if nlink > 0)
+ */
+
+/*ARGSUSED*/
+static int
+zfs_remove_(vnode_t *dvp, vnode_t *vp, const char *name, cred_t *cr)
+{
+ znode_t *dzp = VTOZ(dvp);
+ znode_t *zp;
+ znode_t *xzp;
+ zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+ zilog_t *zilog;
+ uint64_t xattr_obj;
+ uint64_t obj = 0;
+ dmu_tx_t *tx;
+ boolean_t unlinked;
+ uint64_t txtype;
+ int error;
+
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(dzp);
+ zp = VTOZ(vp);
+ ZFS_VERIFY_ZP(zp);
+ zilog = zfsvfs->z_log;
+
+ xattr_obj = 0;
+ xzp = NULL;
+
+ if ((error = zfs_zaccess_delete(dzp, zp, cr))) {
+ goto out;
+ }
+
+ /*
+ * Need to use rmdir for removing directories.
+ */
+ if (vp->v_type == VDIR) {
+ error = SET_ERROR(EPERM);
+ goto out;
+ }
+
+ vnevent_remove(vp, dvp, name, ct);
+
+ obj = zp->z_id;
+
+ /* are there any extended attributes? */
+ error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
+ &xattr_obj, sizeof (xattr_obj));
+ if (error == 0 && xattr_obj) {
+ error = zfs_zget(zfsvfs, xattr_obj, &xzp);
+ ASSERT0(error);
+ }
+
+ /*
+ * We may delete the znode now, or we may put it in the unlinked set;
+ * it depends on whether we're the last link, and on whether there are
+ * other holds on the vnode. So we dmu_tx_hold() the right things to
+ * allow for either case.
+ */
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+ zfs_sa_upgrade_txholds(tx, zp);
+ zfs_sa_upgrade_txholds(tx, dzp);
+
+ if (xzp) {
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
+ dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
+ }
+
+ /* charge as an update -- would be nice not to charge at all */
+ dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
+
+ /*
+ * Mark this transaction as typically resulting in a net free of space
+ */
+ dmu_tx_mark_netfree(tx);
+
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ dmu_tx_abort(tx);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ /*
+ * Remove the directory entry.
+ */
+ error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, &unlinked);
+
+ if (error) {
+ dmu_tx_commit(tx);
+ goto out;
+ }
+
+ if (unlinked) {
+ zfs_unlinked_add(zp, tx);
+ vp->v_vflag |= VV_NOSYNC;
+ }
+ /* XXX check changes to linux vnops */
+ txtype = TX_REMOVE;
+ zfs_log_remove(zilog, tx, txtype, dzp, name, obj, unlinked);
+
+ dmu_tx_commit(tx);
+out:
+
+ if (xzp)
+ vrele(ZTOV(xzp));
+
+ if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ zil_commit(zilog, 0);
+
+
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+
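+/*
+ * Resolve 'name' under dzp the way a VOP_LOOKUP would: set up a
+ * componentname for the given nameiop and either go through the name cache
+ * (vfs_cache_lookup()) when it is enabled and we are not replaying the ZIL,
+ * or call zfs_lookup() directly.  Used by zfs_remove() and zfs_rmdir() to
+ * turn name-based operations into a locked vnode.
+ */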
+static int
+zfs_lookup_internal(znode_t *dzp, const char *name, vnode_t **vpp,
+ struct componentname *cnp, int nameiop)
+{
+ zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+ int error;
+
+ cnp->cn_nameptr = __DECONST(char *, name);
+ cnp->cn_namelen = strlen(name);
+ cnp->cn_nameiop = nameiop;
+ cnp->cn_flags = ISLASTCN | SAVENAME;
+ cnp->cn_lkflags = LK_EXCLUSIVE | LK_RETRY;
+ cnp->cn_cred = kcred;
+ cnp->cn_thread = curthread;
+
+ if (zfsvfs->z_use_namecache && !zfsvfs->z_replay) {
+ struct vop_lookup_args a;
+
+ a.a_gen.a_desc = &vop_lookup_desc;
+ a.a_dvp = ZTOV(dzp);
+ a.a_vpp = vpp;
+ a.a_cnp = cnp;
+ error = vfs_cache_lookup(&a);
+ } else {
+ error = zfs_lookup(ZTOV(dzp), name, vpp, cnp, nameiop, kcred,
+ curthread, 0, B_FALSE);
+ }
+#ifdef ZFS_DEBUG
+ if (error) {
+ printf("got error %d on name %s on op %d\n", error, name,
+ nameiop);
+ kdb_backtrace();
+ }
+#endif
+ return (error);
+}
+
+int
+zfs_remove(znode_t *dzp, const char *name, cred_t *cr, int flags)
+{
+ vnode_t *vp;
+ int error;
+ struct componentname cn;
+
+ if ((error = zfs_lookup_internal(dzp, name, &vp, &cn, DELETE)))
+ return (error);
+
+ error = zfs_remove_(ZTOV(dzp), vp, name, cr);
+ vput(vp);
+ return (error);
+}
+/*
+ * Create a new directory and insert it into dvp using the name
+ * provided. Return a pointer to the inserted directory.
+ *
+ * IN: dvp - vnode of directory to add subdir to.
+ * dirname - name of new directory.
+ * vap - attributes of new directory.
+ * cr - credentials of caller.
+ * ct - caller context
+ * flags - case flags
+ * vsecp - ACL to be set
+ *
+ * OUT: vpp - vnode of created directory.
+ *
+ * RETURN: 0 on success, error code on failure.
+ *
+ * Timestamps:
+ * dvp - ctime|mtime updated
+ * vp - ctime|mtime|atime updated
+ */
+/*ARGSUSED*/
+int
+zfs_mkdir(znode_t *dzp, const char *dirname, vattr_t *vap, znode_t **zpp,
+ cred_t *cr, int flags, vsecattr_t *vsecp)
+{
+ znode_t *zp;
+ zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+ zilog_t *zilog;
+ uint64_t txtype;
+ dmu_tx_t *tx;
+ int error;
+ ksid_t *ksid;
+ uid_t uid;
+ gid_t gid = crgetgid(cr);
+ zfs_acl_ids_t acl_ids;
+ boolean_t fuid_dirtied;
+
+ ASSERT(vap->va_type == VDIR);
+
+ /*
+ * If we have an ephemeral id, ACL, or XVATTR then
+ * make sure file system is at proper version
+ */
+
+ ksid = crgetsid(cr, KSID_OWNER);
+ if (ksid)
+ uid = ksid_getid(ksid);
+ else
+ uid = crgetuid(cr);
+ if (zfsvfs->z_use_fuids == B_FALSE &&
+ ((vap->va_mask & AT_XVATTR) ||
+ IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
+ return (SET_ERROR(EINVAL));
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(dzp);
+ zilog = zfsvfs->z_log;
+
+ if (dzp->z_pflags & ZFS_XATTR) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EINVAL));
+ }
+
+ if (zfsvfs->z_utf8 && u8_validate(dirname,
+ strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EILSEQ));
+ }
+
+ if (vap->va_mask & AT_XVATTR) {
+ if ((error = secpolicy_xvattr(ZTOV(dzp), (xvattr_t *)vap,
+ crgetuid(cr), cr, vap->va_type)) != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+ }
+
+ if ((error = zfs_acl_ids_create(dzp, 0, vap, cr,
+ NULL, &acl_ids)) != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ /*
+ * First make sure the new directory doesn't exist.
+ *
+ * Existence is checked first to make sure we don't return
+ * EACCES instead of EEXIST which can cause some applications
+ * to fail.
+ */
+ *zpp = NULL;
+
+ if ((error = zfs_dirent_lookup(dzp, dirname, &zp, ZNEW))) {
+ zfs_acl_ids_free(&acl_ids);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+ ASSERT3P(zp, ==, NULL);
+
+ if ((error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr))) {
+ zfs_acl_ids_free(&acl_ids);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, zfs_inherit_projid(dzp))) {
+ zfs_acl_ids_free(&acl_ids);
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EDQUOT));
+ }
+
+ /*
+ * Add a new entry to the directory.
+ */
+ getnewvnode_reserve_();
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
+ dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
+ fuid_dirtied = zfsvfs->z_fuid_dirty;
+ if (fuid_dirtied)
+ zfs_fuid_txhold(zfsvfs, tx);
+ if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
+ dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
+ acl_ids.z_aclp->z_acl_bytes);
+ }
+
+ dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
+ ZFS_SA_BASE_ATTR_SIZE);
+
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ zfs_acl_ids_free(&acl_ids);
+ dmu_tx_abort(tx);
+ getnewvnode_drop_reserve();
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ /*
+ * Create new node.
+ */
+ zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
+
+ if (fuid_dirtied)
+ zfs_fuid_sync(zfsvfs, tx);
+
+ /*
+ * Now put new name in parent dir.
+ */
+ (void) zfs_link_create(dzp, dirname, zp, tx, ZNEW);
+
+ *zpp = zp;
+
+ txtype = zfs_log_create_txtype(Z_DIR, NULL, vap);
+ zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, NULL,
+ acl_ids.z_fuidp, vap);
+
+ zfs_acl_ids_free(&acl_ids);
+
+ dmu_tx_commit(tx);
+
+ getnewvnode_drop_reserve();
+
+ if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ zil_commit(zilog, 0);
+
+ ZFS_EXIT(zfsvfs);
+ return (0);
+}
+
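+/*
+ * Compatibility shim: older FreeBSD lacks cache_vop_rmdir(), so emulate it
+ * by purging the name cache entries of both the parent directory and the
+ * directory being removed.
+ */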
+#if __FreeBSD_version < 1300124
+static void
+cache_vop_rmdir(struct vnode *dvp, struct vnode *vp)
+{
+
+ cache_purge(dvp);
+ cache_purge(vp);
+}
+#endif
+
+/*
+ * Remove a directory subdir entry. If the current working
+ * directory is the same as the subdir to be removed, the
+ * remove will fail.
+ *
+ * IN: dvp - vnode of directory to remove from.
+ * name - name of directory to be removed.
+ * cwd - vnode of current working directory.
+ * cr - credentials of caller.
+ * ct - caller context
+ * flags - case flags
+ *
+ * RETURN: 0 on success, error code on failure.
+ *
+ * Timestamps:
+ * dvp - ctime|mtime updated
+ */
+/*ARGSUSED*/
+static int
+zfs_rmdir_(vnode_t *dvp, vnode_t *vp, const char *name, cred_t *cr)
+{
+ znode_t *dzp = VTOZ(dvp);
+ znode_t *zp = VTOZ(vp);
+ zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+ zilog_t *zilog;
+ dmu_tx_t *tx;
+ int error;
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(dzp);
+ ZFS_VERIFY_ZP(zp);
+ zilog = zfsvfs->z_log;
+
+
+ if ((error = zfs_zaccess_delete(dzp, zp, cr))) {
+ goto out;
+ }
+
+ if (vp->v_type != VDIR) {
+ error = SET_ERROR(ENOTDIR);
+ goto out;
+ }
+
+ vnevent_rmdir(vp, dvp, name, ct);
+
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+ dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
+ zfs_sa_upgrade_txholds(tx, zp);
+ zfs_sa_upgrade_txholds(tx, dzp);
+ dmu_tx_mark_netfree(tx);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ dmu_tx_abort(tx);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, NULL);
+
+ if (error == 0) {
+ uint64_t txtype = TX_RMDIR;
+ zfs_log_remove(zilog, tx, txtype, dzp, name,
+ ZFS_NO_OBJECT, B_FALSE);
+ }
+
+ dmu_tx_commit(tx);
+
+ cache_vop_rmdir(dvp, vp);
+out:
+ if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ zil_commit(zilog, 0);
+
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+int
+zfs_rmdir(znode_t *dzp, const char *name, znode_t *cwd, cred_t *cr, int flags)
+{
+ struct componentname cn;
+ vnode_t *vp;
+ int error;
+
+ if ((error = zfs_lookup_internal(dzp, name, &vp, &cn, DELETE)))
+ return (error);
+
+ error = zfs_rmdir_(ZTOV(dzp), vp, name, cr);
+ vput(vp);
+ return (error);
+}
+
+/*
+ * Read as many directory entries as will fit into the provided
+ * buffer from the given directory cursor position (specified in
+ * the uio structure).
+ *
+ * IN: vp - vnode of directory to read.
+ * uio - structure supplying read location, range info,
+ * and return buffer.
+ * cr - credentials of caller.
+ * ct - caller context
+ * flags - case flags
+ *
+ * OUT: uio - updated offset and range, buffer filled.
+ * eofp - set to true if end-of-file detected.
+ *
+ * RETURN: 0 on success, error code on failure.
+ *
+ * Timestamps:
+ * vp - atime updated
+ *
+ * Note that the low 4 bits of the cookie returned by zap are always zero.
+ * This allows us to use the low range for "special" directory entries:
+ * We use 0 for '.', and 1 for '..'. If this is the root of the filesystem,
+ * we use the offset 2 for the '.zfs' directory.
+ */
+/* ARGSUSED */
+static int
+zfs_readdir(vnode_t *vp, zfs_uio_t *uio, cred_t *cr, int *eofp,
+ int *ncookies, ulong_t **cookies)
+{
+ znode_t *zp = VTOZ(vp);
+ iovec_t *iovp;
+ edirent_t *eodp;
+ dirent64_t *odp;
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ objset_t *os;
+ caddr_t outbuf;
+ size_t bufsize;
+ zap_cursor_t zc;
+ zap_attribute_t zap;
+ uint_t bytes_wanted;
+ uint64_t offset; /* must be unsigned; checks for < 1 */
+ uint64_t parent;
+ int local_eof;
+ int outcount;
+ int error;
+ uint8_t prefetch;
+ boolean_t check_sysattrs;
+ uint8_t type;
+ int ncooks;
+ ulong_t *cooks = NULL;
+ int flags = 0;
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
+ &parent, sizeof (parent))) != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ /*
+ * If we are not given an eof variable,
+ * use a local one.
+ */
+ if (eofp == NULL)
+ eofp = &local_eof;
+
+ /*
+ * Check for valid iov_len.
+ */
+ if (GET_UIO_STRUCT(uio)->uio_iov->iov_len <= 0) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EINVAL));
+ }
+
+ /*
+ * Quit if directory has been removed (posix)
+ */
+ if ((*eofp = zp->z_unlinked) != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (0);
+ }
+
+ error = 0;
+ os = zfsvfs->z_os;
+ offset = zfs_uio_offset(uio);
+ prefetch = zp->z_zn_prefetch;
+
+ /*
+ * Initialize the iterator cursor.
+ */
+ if (offset <= 3) {
+ /*
+ * Start iteration from the beginning of the directory.
+ */
+ zap_cursor_init(&zc, os, zp->z_id);
+ } else {
+ /*
+ * The offset is a serialized cursor.
+ */
+ zap_cursor_init_serialized(&zc, os, zp->z_id, offset);
+ }
+
+ /*
+ * Get space to change directory entries into fs independent format.
+ */
+ iovp = GET_UIO_STRUCT(uio)->uio_iov;
+ bytes_wanted = iovp->iov_len;
+ if (zfs_uio_segflg(uio) != UIO_SYSSPACE || zfs_uio_iovcnt(uio) != 1) {
+ bufsize = bytes_wanted;
+ outbuf = kmem_alloc(bufsize, KM_SLEEP);
+ odp = (struct dirent64 *)outbuf;
+ } else {
+ bufsize = bytes_wanted;
+ outbuf = NULL;
+ odp = (struct dirent64 *)iovp->iov_base;
+ }
+ eodp = (struct edirent *)odp;
+
+ if (ncookies != NULL) {
+ /*
+ * Minimum entry size is dirent size and 1 byte for a file name.
+ */
+ ncooks = zfs_uio_resid(uio) / (sizeof (struct dirent) -
+ sizeof (((struct dirent *)NULL)->d_name) + 1);
+ cooks = malloc(ncooks * sizeof (ulong_t), M_TEMP, M_WAITOK);
+ *cookies = cooks;
+ *ncookies = ncooks;
+ }
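+	/*
+	 * Each cookie handed back to the VFS is the offset of the *next*
+	 * entry: either the serialized ZAP cursor position or one of the
+	 * reserved low offsets used for the '.', '..' and '.zfs' entries.
+	 * Unused slots are subtracted from *ncookies after the loop below.
+	 */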
+ /*
+ * If this VFS supports the system attribute view interface; and
+ * we're looking at an extended attribute directory; and we care
+ * about normalization conflicts on this vfs; then we must check
+ * for normalization conflicts with the sysattr name space.
+ */
+#ifdef TODO
+ check_sysattrs = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
+ (vp->v_flag & V_XATTRDIR) && zfsvfs->z_norm &&
+ (flags & V_RDDIR_ENTFLAGS);
+#else
+ check_sysattrs = 0;
+#endif
+
+ /*
+ * Transform to file-system independent format
+ */
+ outcount = 0;
+ while (outcount < bytes_wanted) {
+ ino64_t objnum;
+ ushort_t reclen;
+ off64_t *next = NULL;
+
+ /*
+ * Special case `.', `..', and `.zfs'.
+ */
+ if (offset == 0) {
+ (void) strcpy(zap.za_name, ".");
+ zap.za_normalization_conflict = 0;
+ objnum = zp->z_id;
+ type = DT_DIR;
+ } else if (offset == 1) {
+ (void) strcpy(zap.za_name, "..");
+ zap.za_normalization_conflict = 0;
+ objnum = parent;
+ type = DT_DIR;
+ } else if (offset == 2 && zfs_show_ctldir(zp)) {
+ (void) strcpy(zap.za_name, ZFS_CTLDIR_NAME);
+ zap.za_normalization_conflict = 0;
+ objnum = ZFSCTL_INO_ROOT;
+ type = DT_DIR;
+ } else {
+ /*
+ * Grab next entry.
+ */
+ if ((error = zap_cursor_retrieve(&zc, &zap))) {
+ if ((*eofp = (error == ENOENT)) != 0)
+ break;
+ else
+ goto update;
+ }
+
+ if (zap.za_integer_length != 8 ||
+ zap.za_num_integers != 1) {
+ cmn_err(CE_WARN, "zap_readdir: bad directory "
+ "entry, obj = %lld, offset = %lld\n",
+ (u_longlong_t)zp->z_id,
+ (u_longlong_t)offset);
+ error = SET_ERROR(ENXIO);
+ goto update;
+ }
+
+ objnum = ZFS_DIRENT_OBJ(zap.za_first_integer);
+ /*
+ * MacOS X can extract the object type here such as:
+ * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer);
+ */
+ type = ZFS_DIRENT_TYPE(zap.za_first_integer);
+
+ if (check_sysattrs && !zap.za_normalization_conflict) {
+#ifdef TODO
+ zap.za_normalization_conflict =
+ xattr_sysattr_casechk(zap.za_name);
+#else
+ panic("%s:%u: TODO", __func__, __LINE__);
+#endif
+ }
+ }
+
+ if (flags & V_RDDIR_ACCFILTER) {
+ /*
+ * If we have no access at all, don't include
+ * this entry in the returned information
+ */
+ znode_t *ezp;
+ if (zfs_zget(zp->z_zfsvfs, objnum, &ezp) != 0)
+ goto skip_entry;
+ if (!zfs_has_access(ezp, cr)) {
+ vrele(ZTOV(ezp));
+ goto skip_entry;
+ }
+ vrele(ZTOV(ezp));
+ }
+
+ if (flags & V_RDDIR_ENTFLAGS)
+ reclen = EDIRENT_RECLEN(strlen(zap.za_name));
+ else
+ reclen = DIRENT64_RECLEN(strlen(zap.za_name));
+
+ /*
+ * Will this entry fit in the buffer?
+ */
+ if (outcount + reclen > bufsize) {
+ /*
+ * Did we manage to fit anything in the buffer?
+ */
+ if (!outcount) {
+ error = SET_ERROR(EINVAL);
+ goto update;
+ }
+ break;
+ }
+ if (flags & V_RDDIR_ENTFLAGS) {
+ /*
+ * Add extended flag entry:
+ */
+ eodp->ed_ino = objnum;
+ eodp->ed_reclen = reclen;
+ /* NOTE: ed_off is the offset for the *next* entry */
+ next = &(eodp->ed_off);
+ eodp->ed_eflags = zap.za_normalization_conflict ?
+ ED_CASE_CONFLICT : 0;
+ (void) strncpy(eodp->ed_name, zap.za_name,
+ EDIRENT_NAMELEN(reclen));
+ eodp = (edirent_t *)((intptr_t)eodp + reclen);
+ } else {
+ /*
+ * Add normal entry:
+ */
+ odp->d_ino = objnum;
+ odp->d_reclen = reclen;
+ odp->d_namlen = strlen(zap.za_name);
+ /* NOTE: d_off is the offset for the *next* entry. */
+ next = &odp->d_off;
+ strlcpy(odp->d_name, zap.za_name, odp->d_namlen + 1);
+ odp->d_type = type;
+ dirent_terminate(odp);
+ odp = (dirent64_t *)((intptr_t)odp + reclen);
+ }
+ outcount += reclen;
+
+ ASSERT(outcount <= bufsize);
+
+ /* Prefetch znode */
+ if (prefetch)
+ dmu_prefetch(os, objnum, 0, 0, 0,
+ ZIO_PRIORITY_SYNC_READ);
+
+ skip_entry:
+ /*
+ * Move to the next entry, fill in the previous offset.
+ */
+ if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) {
+ zap_cursor_advance(&zc);
+ offset = zap_cursor_serialize(&zc);
+ } else {
+ offset += 1;
+ }
+
+ /* Fill the offset right after advancing the cursor. */
+ if (next != NULL)
+ *next = offset;
+ if (cooks != NULL) {
+ *cooks++ = offset;
+ ncooks--;
+ KASSERT(ncooks >= 0, ("ncookies=%d", ncooks));
+ }
+ }
+ zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */
+
+ /* Subtract unused cookies */
+ if (ncookies != NULL)
+ *ncookies -= ncooks;
+
+ if (zfs_uio_segflg(uio) == UIO_SYSSPACE && zfs_uio_iovcnt(uio) == 1) {
+ iovp->iov_base += outcount;
+ iovp->iov_len -= outcount;
+ zfs_uio_resid(uio) -= outcount;
+ } else if ((error =
+ zfs_uiomove(outbuf, (long)outcount, UIO_READ, uio))) {
+ /*
+ * Reset the pointer.
+ */
+ offset = zfs_uio_offset(uio);
+ }
+
+update:
+ zap_cursor_fini(&zc);
+ if (zfs_uio_segflg(uio) != UIO_SYSSPACE || zfs_uio_iovcnt(uio) != 1)
+ kmem_free(outbuf, bufsize);
+
+ if (error == ENOENT)
+ error = 0;
+
+ ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
+
+ zfs_uio_setoffset(uio, offset);
+ ZFS_EXIT(zfsvfs);
+ if (error != 0 && cookies != NULL) {
+ free(*cookies, M_TEMP);
+ *cookies = NULL;
+ *ncookies = 0;
+ }
+ return (error);
+}
+
+/*
+ * Get the requested file attributes and place them in the provided
+ * vattr structure.
+ *
+ * IN: vp - vnode of file.
+ * vap - va_mask identifies requested attributes.
+ * If AT_XVATTR set, then optional attrs are requested
+ * flags - ATTR_NOACLCHECK (CIFS server context)
+ * cr - credentials of caller.
+ *
+ * OUT: vap - attribute values.
+ *
+ * RETURN:	0 on success, error code on failure.
+ */
+/* ARGSUSED */
+static int
+zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr)
+{
+ znode_t *zp = VTOZ(vp);
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ int error = 0;
+ uint32_t blksize;
+ u_longlong_t nblocks;
+ uint64_t mtime[2], ctime[2], crtime[2], rdev;
+ xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */
+ xoptattr_t *xoap = NULL;
+ boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
+ sa_bulk_attr_t bulk[4];
+ int count = 0;
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid);
+
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16);
+ if (vp->v_type == VBLK || vp->v_type == VCHR)
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_RDEV(zfsvfs), NULL,
+ &rdev, 8);
+
+ if ((error = sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ /*
+ * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES.
+ * Also, if we are the owner don't bother, since owner should
+ * always be allowed to read basic attributes of file.
+ */
+ if (!(zp->z_pflags & ZFS_ACL_TRIVIAL) &&
+ (vap->va_uid != crgetuid(cr))) {
+ if ((error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0,
+ skipaclchk, cr))) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+ }
+
+ /*
+ * Return all attributes. It's cheaper to provide the answer
+ * than to determine whether we were asked the question.
+ */
+
+ vap->va_type = IFTOVT(zp->z_mode);
+ vap->va_mode = zp->z_mode & ~S_IFMT;
+ vn_fsid(vp, vap);
+ vap->va_nodeid = zp->z_id;
+ vap->va_nlink = zp->z_links;
+ if ((vp->v_flag & VROOT) && zfs_show_ctldir(zp) &&
+ zp->z_links < ZFS_LINK_MAX)
+ vap->va_nlink++;
+ vap->va_size = zp->z_size;
+ if (vp->v_type == VBLK || vp->v_type == VCHR)
+ vap->va_rdev = zfs_cmpldev(rdev);
+ vap->va_seq = zp->z_seq;
+ vap->va_flags = 0; /* FreeBSD: Reset chflags(2) flags. */
+ vap->va_filerev = zp->z_seq;
+
+ /*
+ * Add in any requested optional attributes and the create time.
+ * Also set the corresponding bits in the returned attribute bitmap.
+ */
+ if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) {
+ if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
+ xoap->xoa_archive =
+ ((zp->z_pflags & ZFS_ARCHIVE) != 0);
+ XVA_SET_RTN(xvap, XAT_ARCHIVE);
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
+ xoap->xoa_readonly =
+ ((zp->z_pflags & ZFS_READONLY) != 0);
+ XVA_SET_RTN(xvap, XAT_READONLY);
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
+ xoap->xoa_system =
+ ((zp->z_pflags & ZFS_SYSTEM) != 0);
+ XVA_SET_RTN(xvap, XAT_SYSTEM);
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
+ xoap->xoa_hidden =
+ ((zp->z_pflags & ZFS_HIDDEN) != 0);
+ XVA_SET_RTN(xvap, XAT_HIDDEN);
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
+ xoap->xoa_nounlink =
+ ((zp->z_pflags & ZFS_NOUNLINK) != 0);
+ XVA_SET_RTN(xvap, XAT_NOUNLINK);
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
+ xoap->xoa_immutable =
+ ((zp->z_pflags & ZFS_IMMUTABLE) != 0);
+ XVA_SET_RTN(xvap, XAT_IMMUTABLE);
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
+ xoap->xoa_appendonly =
+ ((zp->z_pflags & ZFS_APPENDONLY) != 0);
+ XVA_SET_RTN(xvap, XAT_APPENDONLY);
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
+ xoap->xoa_nodump =
+ ((zp->z_pflags & ZFS_NODUMP) != 0);
+ XVA_SET_RTN(xvap, XAT_NODUMP);
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
+ xoap->xoa_opaque =
+ ((zp->z_pflags & ZFS_OPAQUE) != 0);
+ XVA_SET_RTN(xvap, XAT_OPAQUE);
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
+ xoap->xoa_av_quarantined =
+ ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0);
+ XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
+ xoap->xoa_av_modified =
+ ((zp->z_pflags & ZFS_AV_MODIFIED) != 0);
+ XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) &&
+ vp->v_type == VREG) {
+ zfs_sa_get_scanstamp(zp, xvap);
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
+ xoap->xoa_reparse = ((zp->z_pflags & ZFS_REPARSE) != 0);
+ XVA_SET_RTN(xvap, XAT_REPARSE);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_GEN)) {
+ xoap->xoa_generation = zp->z_gen;
+ XVA_SET_RTN(xvap, XAT_GEN);
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) {
+ xoap->xoa_offline =
+ ((zp->z_pflags & ZFS_OFFLINE) != 0);
+ XVA_SET_RTN(xvap, XAT_OFFLINE);
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) {
+ xoap->xoa_sparse =
+ ((zp->z_pflags & ZFS_SPARSE) != 0);
+ XVA_SET_RTN(xvap, XAT_SPARSE);
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) {
+ xoap->xoa_projinherit =
+ ((zp->z_pflags & ZFS_PROJINHERIT) != 0);
+ XVA_SET_RTN(xvap, XAT_PROJINHERIT);
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_PROJID)) {
+ xoap->xoa_projid = zp->z_projid;
+ XVA_SET_RTN(xvap, XAT_PROJID);
+ }
+ }
+
+ ZFS_TIME_DECODE(&vap->va_atime, zp->z_atime);
+ ZFS_TIME_DECODE(&vap->va_mtime, mtime);
+ ZFS_TIME_DECODE(&vap->va_ctime, ctime);
+ ZFS_TIME_DECODE(&vap->va_birthtime, crtime);
+
+
+ sa_object_size(zp->z_sa_hdl, &blksize, &nblocks);
+ vap->va_blksize = blksize;
+ vap->va_bytes = nblocks << 9; /* nblocks * 512 */
+
+ if (zp->z_blksz == 0) {
+ /*
+ * Block size hasn't been set; suggest maximal I/O transfers.
+ */
+ vap->va_blksize = zfsvfs->z_max_blksz;
+ }
+
+ ZFS_EXIT(zfsvfs);
+ return (0);
+}
+
+/*
+ * Set the file attributes to the values contained in the
+ * vattr structure.
+ *
+ * IN: zp - znode of file to be modified.
+ * vap - new attribute values.
+ * If AT_XVATTR set, then optional attrs are being set
+ * flags - ATTR_UTIME set if non-default time values provided.
+ * - ATTR_NOACLCHECK (CIFS context only).
+ * cr - credentials of caller.
+ * ct - caller context
+ *
+ * RETURN: 0 on success, error code on failure.
+ *
+ * Timestamps:
+ * vp - ctime updated, mtime updated if size changed.
+ */
+/* ARGSUSED */
+int
+zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr)
+{
+ vnode_t *vp = ZTOV(zp);
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ objset_t *os = zfsvfs->z_os;
+ zilog_t *zilog;
+ dmu_tx_t *tx;
+ vattr_t oldva;
+ xvattr_t tmpxvattr;
+ uint_t mask = vap->va_mask;
+ uint_t saved_mask = 0;
+ uint64_t saved_mode;
+ int trim_mask = 0;
+ uint64_t new_mode;
+ uint64_t new_uid, new_gid;
+ uint64_t xattr_obj;
+ uint64_t mtime[2], ctime[2];
+ uint64_t projid = ZFS_INVALID_PROJID;
+ znode_t *attrzp;
+ int need_policy = FALSE;
+ int err, err2;
+ zfs_fuid_info_t *fuidp = NULL;
+ xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */
+ xoptattr_t *xoap;
+ zfs_acl_t *aclp;
+ boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
+ boolean_t fuid_dirtied = B_FALSE;
+ sa_bulk_attr_t bulk[7], xattr_bulk[7];
+ int count = 0, xattr_count = 0;
+
+ if (mask == 0)
+ return (0);
+
+ if (mask & AT_NOSET)
+ return (SET_ERROR(EINVAL));
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ zilog = zfsvfs->z_log;
+
+ /*
+ * Make sure that if we have ephemeral uid/gid or xvattr specified
+ * that file system is at proper version level
+ */
+
+ if (zfsvfs->z_use_fuids == B_FALSE &&
+ (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) ||
+ ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid)) ||
+ (mask & AT_XVATTR))) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EINVAL));
+ }
+
+ if (mask & AT_SIZE && vp->v_type == VDIR) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EISDIR));
+ }
+
+ if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EINVAL));
+ }
+
+ /*
+ * If this is an xvattr_t, then get a pointer to the structure of
+ * optional attributes. If this is NULL, then we have a vattr_t.
+ */
+ xoap = xva_getxoptattr(xvap);
+
+ xva_init(&tmpxvattr);
+
+ /*
+ * Immutable files can only alter immutable bit and atime
+ */
+ if ((zp->z_pflags & ZFS_IMMUTABLE) &&
+ ((mask & (AT_SIZE|AT_UID|AT_GID|AT_MTIME|AT_MODE)) ||
+ ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EPERM));
+ }
+
+ /*
+ * Note: ZFS_READONLY is handled in zfs_zaccess_common.
+ */
+
+ /*
+	 * Verify that the timestamps don't overflow 32 bits.
+	 * ZFS can handle large timestamps, but 32-bit syscalls can't
+ * handle times greater than 2039. This check should be removed
+ * once large timestamps are fully supported.
+ */
+ if (mask & (AT_ATIME | AT_MTIME)) {
+ if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) ||
+ ((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EOVERFLOW));
+ }
+ }
+ if (xoap != NULL && (mask & AT_XVATTR)) {
+ if (XVA_ISSET_REQ(xvap, XAT_CREATETIME) &&
+ TIMESPEC_OVERFLOW(&vap->va_birthtime)) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EOVERFLOW));
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_PROJID)) {
+ if (!dmu_objset_projectquota_enabled(os) ||
+ (!S_ISREG(zp->z_mode) && !S_ISDIR(zp->z_mode))) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EOPNOTSUPP));
+ }
+
+ projid = xoap->xoa_projid;
+ if (unlikely(projid == ZFS_INVALID_PROJID)) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EINVAL));
+ }
+
+ if (projid == zp->z_projid && zp->z_pflags & ZFS_PROJID)
+ projid = ZFS_INVALID_PROJID;
+ else
+ need_policy = TRUE;
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT) &&
+ (xoap->xoa_projinherit !=
+ ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) &&
+ (!dmu_objset_projectquota_enabled(os) ||
+ (!S_ISREG(zp->z_mode) && !S_ISDIR(zp->z_mode)))) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EOPNOTSUPP));
+ }
+ }
+
+ attrzp = NULL;
+ aclp = NULL;
+
+ if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EROFS));
+ }
+
+ /*
+ * First validate permissions
+ */
+
+ if (mask & AT_SIZE) {
+ /*
+ * XXX - Note, we are not providing any open
+ * mode flags here (like FNDELAY), so we may
+ * block if there are locks present... this
+ * should be addressed in openat().
+ */
+ /* XXX - would it be OK to generate a log record here? */
+ err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE);
+ if (err) {
+ ZFS_EXIT(zfsvfs);
+ return (err);
+ }
+ }
+
+ if (mask & (AT_ATIME|AT_MTIME) ||
+ ((mask & AT_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) ||
+ XVA_ISSET_REQ(xvap, XAT_READONLY) ||
+ XVA_ISSET_REQ(xvap, XAT_ARCHIVE) ||
+ XVA_ISSET_REQ(xvap, XAT_OFFLINE) ||
+ XVA_ISSET_REQ(xvap, XAT_SPARSE) ||
+ XVA_ISSET_REQ(xvap, XAT_CREATETIME) ||
+ XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) {
+ need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0,
+ skipaclchk, cr);
+ }
+
+ if (mask & (AT_UID|AT_GID)) {
+ int idmask = (mask & (AT_UID|AT_GID));
+ int take_owner;
+ int take_group;
+
+ /*
+ * NOTE: even if a new mode is being set,
+ * we may clear S_ISUID/S_ISGID bits.
+ */
+
+ if (!(mask & AT_MODE))
+ vap->va_mode = zp->z_mode;
+
+ /*
+ * Take ownership or chgrp to group we are a member of
+ */
+
+ take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr));
+ take_group = (mask & AT_GID) &&
+ zfs_groupmember(zfsvfs, vap->va_gid, cr);
+
+ /*
+ * If both AT_UID and AT_GID are set then take_owner and
+ * take_group must both be set in order to allow taking
+ * ownership.
+ *
+ * Otherwise, send the check through secpolicy_vnode_setattr()
+ *
+ */
+
+ if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) ||
+ ((idmask == AT_UID) && take_owner) ||
+ ((idmask == AT_GID) && take_group)) {
+ if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0,
+ skipaclchk, cr) == 0) {
+ /*
+ * Remove setuid/setgid for non-privileged users
+ */
+ secpolicy_setid_clear(vap, vp, cr);
+ trim_mask = (mask & (AT_UID|AT_GID));
+ } else {
+ need_policy = TRUE;
+ }
+ } else {
+ need_policy = TRUE;
+ }
+ }
+
+ oldva.va_mode = zp->z_mode;
+ zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid);
+ if (mask & AT_XVATTR) {
+ /*
+ * Update xvattr mask to include only those attributes
+ * that are actually changing.
+ *
+		 * The bits will be restored prior to actually setting
+ * the attributes so the caller thinks they were set.
+ */
+ if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
+ if (xoap->xoa_appendonly !=
+ ((zp->z_pflags & ZFS_APPENDONLY) != 0)) {
+ need_policy = TRUE;
+ } else {
+ XVA_CLR_REQ(xvap, XAT_APPENDONLY);
+ XVA_SET_REQ(&tmpxvattr, XAT_APPENDONLY);
+ }
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) {
+ if (xoap->xoa_projinherit !=
+ ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) {
+ need_policy = TRUE;
+ } else {
+ XVA_CLR_REQ(xvap, XAT_PROJINHERIT);
+ XVA_SET_REQ(&tmpxvattr, XAT_PROJINHERIT);
+ }
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
+ if (xoap->xoa_nounlink !=
+ ((zp->z_pflags & ZFS_NOUNLINK) != 0)) {
+ need_policy = TRUE;
+ } else {
+ XVA_CLR_REQ(xvap, XAT_NOUNLINK);
+ XVA_SET_REQ(&tmpxvattr, XAT_NOUNLINK);
+ }
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
+ if (xoap->xoa_immutable !=
+ ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) {
+ need_policy = TRUE;
+ } else {
+ XVA_CLR_REQ(xvap, XAT_IMMUTABLE);
+ XVA_SET_REQ(&tmpxvattr, XAT_IMMUTABLE);
+ }
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
+ if (xoap->xoa_nodump !=
+ ((zp->z_pflags & ZFS_NODUMP) != 0)) {
+ need_policy = TRUE;
+ } else {
+ XVA_CLR_REQ(xvap, XAT_NODUMP);
+ XVA_SET_REQ(&tmpxvattr, XAT_NODUMP);
+ }
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
+ if (xoap->xoa_av_modified !=
+ ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) {
+ need_policy = TRUE;
+ } else {
+ XVA_CLR_REQ(xvap, XAT_AV_MODIFIED);
+ XVA_SET_REQ(&tmpxvattr, XAT_AV_MODIFIED);
+ }
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
+ if ((vp->v_type != VREG &&
+ xoap->xoa_av_quarantined) ||
+ xoap->xoa_av_quarantined !=
+ ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) {
+ need_policy = TRUE;
+ } else {
+ XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED);
+ XVA_SET_REQ(&tmpxvattr, XAT_AV_QUARANTINED);
+ }
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EPERM));
+ }
+
+ if (need_policy == FALSE &&
+ (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) ||
+ XVA_ISSET_REQ(xvap, XAT_OPAQUE))) {
+ need_policy = TRUE;
+ }
+ }
+
+ if (mask & AT_MODE) {
+ if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) {
+ err = secpolicy_setid_setsticky_clear(vp, vap,
+ &oldva, cr);
+ if (err) {
+ ZFS_EXIT(zfsvfs);
+ return (err);
+ }
+ trim_mask |= AT_MODE;
+ } else {
+ need_policy = TRUE;
+ }
+ }
+
+ if (need_policy) {
+ /*
+ * If trim_mask is set then take ownership
+ * has been granted or write_acl is present and user
+ * has the ability to modify mode. In that case remove
+		 * UID|GID and/or MODE from mask so that
+ * secpolicy_vnode_setattr() doesn't revoke it.
+ */
+
+ if (trim_mask) {
+ saved_mask = vap->va_mask;
+ vap->va_mask &= ~trim_mask;
+ if (trim_mask & AT_MODE) {
+ /*
+ * Save the mode, as secpolicy_vnode_setattr()
+				 * will overwrite it with oldva.va_mode.
+ */
+ saved_mode = vap->va_mode;
+ }
+ }
+ err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags,
+ (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp);
+ if (err) {
+ ZFS_EXIT(zfsvfs);
+ return (err);
+ }
+
+ if (trim_mask) {
+ vap->va_mask |= saved_mask;
+ if (trim_mask & AT_MODE) {
+ /*
+ * Recover the mode after
+ * secpolicy_vnode_setattr().
+ */
+ vap->va_mode = saved_mode;
+ }
+ }
+ }
+
+ /*
+	 * secpolicy_vnode_setattr() or the take-ownership path above
+	 * may have changed va_mask.
+ */
+ mask = vap->va_mask;
+
+ if ((mask & (AT_UID | AT_GID)) || projid != ZFS_INVALID_PROJID) {
+ err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
+ &xattr_obj, sizeof (xattr_obj));
+
+ if (err == 0 && xattr_obj) {
+ err = zfs_zget(zp->z_zfsvfs, xattr_obj, &attrzp);
+ if (err == 0) {
+ err = vn_lock(ZTOV(attrzp), LK_EXCLUSIVE);
+ if (err != 0)
+ vrele(ZTOV(attrzp));
+ }
+ if (err)
+ goto out2;
+ }
+ if (mask & AT_UID) {
+ new_uid = zfs_fuid_create(zfsvfs,
+ (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp);
+ if (new_uid != zp->z_uid &&
+ zfs_id_overquota(zfsvfs, DMU_USERUSED_OBJECT,
+ new_uid)) {
+ if (attrzp)
+ vput(ZTOV(attrzp));
+ err = SET_ERROR(EDQUOT);
+ goto out2;
+ }
+ }
+
+ if (mask & AT_GID) {
+ new_gid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid,
+ cr, ZFS_GROUP, &fuidp);
+ if (new_gid != zp->z_gid &&
+ zfs_id_overquota(zfsvfs, DMU_GROUPUSED_OBJECT,
+ new_gid)) {
+ if (attrzp)
+ vput(ZTOV(attrzp));
+ err = SET_ERROR(EDQUOT);
+ goto out2;
+ }
+ }
+
+ if (projid != ZFS_INVALID_PROJID &&
+ zfs_id_overquota(zfsvfs, DMU_PROJECTUSED_OBJECT, projid)) {
+ if (attrzp)
+ vput(ZTOV(attrzp));
+ err = SET_ERROR(EDQUOT);
+ goto out2;
+ }
+ }
+ tx = dmu_tx_create(os);
+
+ if (mask & AT_MODE) {
+ uint64_t pmode = zp->z_mode;
+ uint64_t acl_obj;
+ new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT);
+
+ if (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_RESTRICTED &&
+ !(zp->z_pflags & ZFS_ACL_TRIVIAL)) {
+ err = SET_ERROR(EPERM);
+ goto out;
+ }
+
+ if ((err = zfs_acl_chmod_setattr(zp, &aclp, new_mode)))
+ goto out;
+
+ if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) {
+ /*
+ * Are we upgrading ACL from old V0 format
+ * to V1 format?
+ */
+ if (zfsvfs->z_version >= ZPL_VERSION_FUID &&
+ zfs_znode_acl_version(zp) ==
+ ZFS_ACL_VERSION_INITIAL) {
+ dmu_tx_hold_free(tx, acl_obj, 0,
+ DMU_OBJECT_END);
+ dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
+ 0, aclp->z_acl_bytes);
+ } else {
+ dmu_tx_hold_write(tx, acl_obj, 0,
+ aclp->z_acl_bytes);
+ }
+ } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) {
+ dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
+ 0, aclp->z_acl_bytes);
+ }
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
+ } else {
+ if (((mask & AT_XVATTR) &&
+ XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) ||
+ (projid != ZFS_INVALID_PROJID &&
+ !(zp->z_pflags & ZFS_PROJID)))
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
+ else
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+ }
+
+ if (attrzp) {
+ dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE);
+ }
+
+ fuid_dirtied = zfsvfs->z_fuid_dirty;
+ if (fuid_dirtied)
+ zfs_fuid_txhold(zfsvfs, tx);
+
+ zfs_sa_upgrade_txholds(tx, zp);
+
+ err = dmu_tx_assign(tx, TXG_WAIT);
+ if (err)
+ goto out;
+
+ count = 0;
+ /*
+ * Set each attribute requested.
+ * We group settings according to the locks they need to acquire.
+ *
+ * Note: you cannot set ctime directly, although it will be
+ * updated as a side-effect of calling this function.
+ */
+
+ if (projid != ZFS_INVALID_PROJID && !(zp->z_pflags & ZFS_PROJID)) {
+ /*
+		 * An existing object created on an older system has no slot
+		 * in its on-disk layout for the project ID attribute. The
+		 * quota accounting logic needs to access that slot directly
+		 * by offset, so adjust the old object's layout to place the
+		 * project ID at a unified, fixed offset.
+ */
+ if (attrzp)
+ err = sa_add_projid(attrzp->z_sa_hdl, tx, projid);
+ if (err == 0)
+ err = sa_add_projid(zp->z_sa_hdl, tx, projid);
+
+ if (unlikely(err == EEXIST))
+ err = 0;
+ else if (err != 0)
+ goto out;
+ else
+ projid = ZFS_INVALID_PROJID;
+ }
+
+ if (mask & (AT_UID|AT_GID|AT_MODE))
+ mutex_enter(&zp->z_acl_lock);
+
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+ &zp->z_pflags, sizeof (zp->z_pflags));
+
+ if (attrzp) {
+ if (mask & (AT_UID|AT_GID|AT_MODE))
+ mutex_enter(&attrzp->z_acl_lock);
+ SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
+ SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags,
+ sizeof (attrzp->z_pflags));
+ if (projid != ZFS_INVALID_PROJID) {
+ attrzp->z_projid = projid;
+ SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
+ SA_ZPL_PROJID(zfsvfs), NULL, &attrzp->z_projid,
+ sizeof (attrzp->z_projid));
+ }
+ }
+
+ if (mask & (AT_UID|AT_GID)) {
+
+ if (mask & AT_UID) {
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
+ &new_uid, sizeof (new_uid));
+ zp->z_uid = new_uid;
+ if (attrzp) {
+ SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
+ SA_ZPL_UID(zfsvfs), NULL, &new_uid,
+ sizeof (new_uid));
+ attrzp->z_uid = new_uid;
+ }
+ }
+
+ if (mask & AT_GID) {
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs),
+ NULL, &new_gid, sizeof (new_gid));
+ zp->z_gid = new_gid;
+ if (attrzp) {
+ SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
+ SA_ZPL_GID(zfsvfs), NULL, &new_gid,
+ sizeof (new_gid));
+ attrzp->z_gid = new_gid;
+ }
+ }
+ if (!(mask & AT_MODE)) {
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs),
+ NULL, &new_mode, sizeof (new_mode));
+ new_mode = zp->z_mode;
+ }
+ err = zfs_acl_chown_setattr(zp);
+ ASSERT(err == 0);
+ if (attrzp) {
+ err = zfs_acl_chown_setattr(attrzp);
+ ASSERT(err == 0);
+ }
+ }
+
+ if (mask & AT_MODE) {
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
+ &new_mode, sizeof (new_mode));
+ zp->z_mode = new_mode;
+ ASSERT3U((uintptr_t)aclp, !=, 0);
+ err = zfs_aclset_common(zp, aclp, cr, tx);
+ ASSERT0(err);
+ if (zp->z_acl_cached)
+ zfs_acl_free(zp->z_acl_cached);
+ zp->z_acl_cached = aclp;
+ aclp = NULL;
+ }
+
+ if (mask & AT_ATIME) {
+ ZFS_TIME_ENCODE(&vap->va_atime, zp->z_atime);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
+ &zp->z_atime, sizeof (zp->z_atime));
+ }
+
+ if (mask & AT_MTIME) {
+ ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
+ mtime, sizeof (mtime));
+ }
+
+ if (projid != ZFS_INVALID_PROJID) {
+ zp->z_projid = projid;
+ SA_ADD_BULK_ATTR(bulk, count,
+ SA_ZPL_PROJID(zfsvfs), NULL, &zp->z_projid,
+ sizeof (zp->z_projid));
+ }
+
+ /* XXX - shouldn't this be done *before* the ATIME/MTIME checks? */
+ if (mask & AT_SIZE && !(mask & AT_MTIME)) {
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs),
+ NULL, mtime, sizeof (mtime));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
+ &ctime, sizeof (ctime));
+ zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime);
+ } else if (mask != 0) {
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
+ &ctime, sizeof (ctime));
+ zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime);
+ if (attrzp) {
+ SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
+ SA_ZPL_CTIME(zfsvfs), NULL,
+ &ctime, sizeof (ctime));
+ zfs_tstamp_update_setup(attrzp, STATE_CHANGED,
+ mtime, ctime);
+ }
+ }
+
+ /*
+ * Do this after setting timestamps to prevent timestamp
+	 * update from toggling the bit.
+ */
+
+ if (xoap && (mask & AT_XVATTR)) {
+
+ if (XVA_ISSET_REQ(xvap, XAT_CREATETIME))
+ xoap->xoa_createtime = vap->va_birthtime;
+ /*
+		 * Restore the trimmed-off masks so that the return
+		 * masks can be set for the caller.
+ */
+
+ if (XVA_ISSET_REQ(&tmpxvattr, XAT_APPENDONLY)) {
+ XVA_SET_REQ(xvap, XAT_APPENDONLY);
+ }
+ if (XVA_ISSET_REQ(&tmpxvattr, XAT_NOUNLINK)) {
+ XVA_SET_REQ(xvap, XAT_NOUNLINK);
+ }
+ if (XVA_ISSET_REQ(&tmpxvattr, XAT_IMMUTABLE)) {
+ XVA_SET_REQ(xvap, XAT_IMMUTABLE);
+ }
+ if (XVA_ISSET_REQ(&tmpxvattr, XAT_NODUMP)) {
+ XVA_SET_REQ(xvap, XAT_NODUMP);
+ }
+ if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_MODIFIED)) {
+ XVA_SET_REQ(xvap, XAT_AV_MODIFIED);
+ }
+ if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_QUARANTINED)) {
+ XVA_SET_REQ(xvap, XAT_AV_QUARANTINED);
+ }
+ if (XVA_ISSET_REQ(&tmpxvattr, XAT_PROJINHERIT)) {
+ XVA_SET_REQ(xvap, XAT_PROJINHERIT);
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
+ ASSERT(vp->v_type == VREG);
+
+ zfs_xvattr_set(zp, xvap, tx);
+ }
+
+ if (fuid_dirtied)
+ zfs_fuid_sync(zfsvfs, tx);
+
+ if (mask != 0)
+ zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp);
+
+ if (mask & (AT_UID|AT_GID|AT_MODE))
+ mutex_exit(&zp->z_acl_lock);
+
+ if (attrzp) {
+ if (mask & (AT_UID|AT_GID|AT_MODE))
+ mutex_exit(&attrzp->z_acl_lock);
+ }
+out:
+ if (err == 0 && attrzp) {
+ err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk,
+ xattr_count, tx);
+ ASSERT(err2 == 0);
+ }
+
+ if (attrzp)
+ vput(ZTOV(attrzp));
+
+ if (aclp)
+ zfs_acl_free(aclp);
+
+ if (fuidp) {
+ zfs_fuid_info_free(fuidp);
+ fuidp = NULL;
+ }
+
+ if (err) {
+ dmu_tx_abort(tx);
+ } else {
+ err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
+ dmu_tx_commit(tx);
+ }
+
+out2:
+ if (os->os_sync == ZFS_SYNC_ALWAYS)
+ zil_commit(zilog, 0);
+
+ ZFS_EXIT(zfsvfs);
+ return (err);
+}
+
+/*
+ * We acquire all locks other than the lock on sdvp using non-blocking
+ * acquisitions. If we fail to acquire any lock in the path we will drop
+ * all held locks, acquire the new lock in a blocking fashion, and then
+ * release it and restart the rename. This acquire/release step ensures
+ * that we do not spin on a lock waiting for its release. On error we
+ * release all vnode locks and decrement references the way
+ * tmpfs_rename() would do.
+ */
+static int
+zfs_rename_relock(struct vnode *sdvp, struct vnode **svpp,
+ struct vnode *tdvp, struct vnode **tvpp,
+ const struct componentname *scnp, const struct componentname *tcnp)
+{
+ zfsvfs_t *zfsvfs;
+ struct vnode *nvp, *svp, *tvp;
+ znode_t *sdzp, *tdzp, *szp, *tzp;
+ const char *snm = scnp->cn_nameptr;
+ const char *tnm = tcnp->cn_nameptr;
+ int error;
+
+ VOP_UNLOCK1(tdvp);
+ if (*tvpp != NULL && *tvpp != tdvp)
+ VOP_UNLOCK1(*tvpp);
+
+relock:
+ error = vn_lock(sdvp, LK_EXCLUSIVE);
+ if (error)
+ goto out;
+ sdzp = VTOZ(sdvp);
+
+ error = vn_lock(tdvp, LK_EXCLUSIVE | LK_NOWAIT);
+ if (error != 0) {
+ VOP_UNLOCK1(sdvp);
+ if (error != EBUSY)
+ goto out;
+ error = vn_lock(tdvp, LK_EXCLUSIVE);
+ if (error)
+ goto out;
+ VOP_UNLOCK1(tdvp);
+ goto relock;
+ }
+ tdzp = VTOZ(tdvp);
+
+ /*
+ * Before using sdzp and tdzp we must ensure that they are live.
+ * As a porting legacy from illumos we have two things to worry
+ * about. One is typical for FreeBSD and it is that the vnode is
+ * not reclaimed (doomed). The other is that the znode is live.
+ * The current code can invalidate the znode without acquiring the
+ * corresponding vnode lock if the object represented by the znode
+ * and vnode is no longer valid after a rollback or receive operation.
+ * z_teardown_lock hidden behind ZFS_ENTER and ZFS_EXIT is the lock
+ * that protects the znodes from the invalidation.
+ */
+ zfsvfs = sdzp->z_zfsvfs;
+ ASSERT3P(zfsvfs, ==, tdzp->z_zfsvfs);
+ ZFS_ENTER(zfsvfs);
+
+ /*
+	 * We cannot use ZFS_VERIFY_ZP() here because it could return directly,
+ * bypassing the cleanup code in the case of an error.
+ */
+ if (tdzp->z_sa_hdl == NULL || sdzp->z_sa_hdl == NULL) {
+ ZFS_EXIT(zfsvfs);
+ VOP_UNLOCK1(sdvp);
+ VOP_UNLOCK1(tdvp);
+ error = SET_ERROR(EIO);
+ goto out;
+ }
+
+ /*
+ * Re-resolve svp to be certain it still exists and fetch the
+ * correct vnode.
+ */
+ error = zfs_dirent_lookup(sdzp, snm, &szp, ZEXISTS);
+ if (error != 0) {
+ /* Source entry invalid or not there. */
+ ZFS_EXIT(zfsvfs);
+ VOP_UNLOCK1(sdvp);
+ VOP_UNLOCK1(tdvp);
+ if ((scnp->cn_flags & ISDOTDOT) != 0 ||
+ (scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.'))
+ error = SET_ERROR(EINVAL);
+ goto out;
+ }
+ svp = ZTOV(szp);
+
+ /*
+	 * Re-resolve tvp; if it disappeared we just carry on.
+ */
+ error = zfs_dirent_lookup(tdzp, tnm, &tzp, 0);
+ if (error != 0) {
+ ZFS_EXIT(zfsvfs);
+ VOP_UNLOCK1(sdvp);
+ VOP_UNLOCK1(tdvp);
+ vrele(svp);
+ if ((tcnp->cn_flags & ISDOTDOT) != 0)
+ error = SET_ERROR(EINVAL);
+ goto out;
+ }
+ if (tzp != NULL)
+ tvp = ZTOV(tzp);
+ else
+ tvp = NULL;
+
+ /*
+ * At present the vnode locks must be acquired before z_teardown_lock,
+ * although it would be more logical to use the opposite order.
+ */
+ ZFS_EXIT(zfsvfs);
+
+ /*
+	 * Now try to acquire locks on svp and tvp.
+ */
+ nvp = svp;
+ error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT);
+ if (error != 0) {
+ VOP_UNLOCK1(sdvp);
+ VOP_UNLOCK1(tdvp);
+ if (tvp != NULL)
+ vrele(tvp);
+ if (error != EBUSY) {
+ vrele(nvp);
+ goto out;
+ }
+ error = vn_lock(nvp, LK_EXCLUSIVE);
+ if (error != 0) {
+ vrele(nvp);
+ goto out;
+ }
+ VOP_UNLOCK1(nvp);
+ /*
+ * Concurrent rename race.
+ * XXX ?
+ */
+ if (nvp == tdvp) {
+ vrele(nvp);
+ error = SET_ERROR(EINVAL);
+ goto out;
+ }
+ vrele(*svpp);
+ *svpp = nvp;
+ goto relock;
+ }
+ vrele(*svpp);
+ *svpp = nvp;
+
+ if (*tvpp != NULL)
+ vrele(*tvpp);
+ *tvpp = NULL;
+ if (tvp != NULL) {
+ nvp = tvp;
+ error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT);
+ if (error != 0) {
+ VOP_UNLOCK1(sdvp);
+ VOP_UNLOCK1(tdvp);
+ VOP_UNLOCK1(*svpp);
+ if (error != EBUSY) {
+ vrele(nvp);
+ goto out;
+ }
+ error = vn_lock(nvp, LK_EXCLUSIVE);
+ if (error != 0) {
+ vrele(nvp);
+ goto out;
+ }
+ vput(nvp);
+ goto relock;
+ }
+ *tvpp = nvp;
+ }
+
+ return (0);
+
+out:
+ return (error);
+}
+
+/*
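+ * Check that the target directory (tdzp) is not a descendant of the source
+ * object (szp), i.e. that the rename would not move a directory underneath
+ * itself.
+ *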
+ * Note that we must use VRELE_ASYNC in this function as it walks
+ * up the directory tree and vrele may need to acquire an exclusive
+ * lock if the last reference to a vnode is dropped.
+ */
+static int
+zfs_rename_check(znode_t *szp, znode_t *sdzp, znode_t *tdzp)
+{
+ zfsvfs_t *zfsvfs;
+ znode_t *zp, *zp1;
+ uint64_t parent;
+ int error;
+
+ zfsvfs = tdzp->z_zfsvfs;
+ if (tdzp == szp)
+ return (SET_ERROR(EINVAL));
+ if (tdzp == sdzp)
+ return (0);
+ if (tdzp->z_id == zfsvfs->z_root)
+ return (0);
+ zp = tdzp;
+ for (;;) {
+ ASSERT(!zp->z_unlinked);
+ if ((error = sa_lookup(zp->z_sa_hdl,
+ SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0)
+ break;
+
+ if (parent == szp->z_id) {
+ error = SET_ERROR(EINVAL);
+ break;
+ }
+ if (parent == zfsvfs->z_root)
+ break;
+ if (parent == sdzp->z_id)
+ break;
+
+ error = zfs_zget(zfsvfs, parent, &zp1);
+ if (error != 0)
+ break;
+
+ if (zp != tdzp)
+ VN_RELE_ASYNC(ZTOV(zp),
+ dsl_pool_zrele_taskq(
+ dmu_objset_pool(zfsvfs->z_os)));
+ zp = zp1;
+ }
+
+ if (error == ENOTDIR)
+ panic("checkpath: .. not a directory\n");
+ if (zp != tdzp)
+ VN_RELE_ASYNC(ZTOV(zp),
+ dsl_pool_zrele_taskq(dmu_objset_pool(zfsvfs->z_os)));
+ return (error);
+}
+
+#if __FreeBSD_version < 1300124
+static void
+cache_vop_rename(struct vnode *fdvp, struct vnode *fvp, struct vnode *tdvp,
+ struct vnode *tvp, struct componentname *fcnp, struct componentname *tcnp)
+{
+
+ cache_purge(fvp);
+ if (tvp != NULL)
+ cache_purge(tvp);
+ cache_purge_negative(tdvp);
+}
+#endif
+
+/*
+ * Move an entry from the provided source directory to the target
+ * directory. Change the entry name as indicated.
+ *
+ * IN: sdvp - Source directory containing the "old entry".
+ * snm - Old entry name.
+ * tdvp - Target directory to contain the "new entry".
+ * tnm - New entry name.
+ * cr - credentials of caller.
+ * ct - caller context
+ * flags - case flags
+ *
+ * RETURN: 0 on success, error code on failure.
+ *
+ * Timestamps:
+ * sdvp,tdvp - ctime|mtime updated
+ */
+/*ARGSUSED*/
+static int
+zfs_rename_(vnode_t *sdvp, vnode_t **svpp, struct componentname *scnp,
+ vnode_t *tdvp, vnode_t **tvpp, struct componentname *tcnp,
+ cred_t *cr, int log)
+{
+ zfsvfs_t *zfsvfs;
+ znode_t *sdzp, *tdzp, *szp, *tzp;
+ zilog_t *zilog = NULL;
+ dmu_tx_t *tx;
+ const char *snm = scnp->cn_nameptr;
+ const char *tnm = tcnp->cn_nameptr;
+ int error = 0;
+ bool want_seqc_end __maybe_unused = false;
+
+ /* Reject renames across filesystems. */
+ if ((*svpp)->v_mount != tdvp->v_mount ||
+ ((*tvpp) != NULL && (*svpp)->v_mount != (*tvpp)->v_mount)) {
+ error = SET_ERROR(EXDEV);
+ goto out;
+ }
+
+ if (zfsctl_is_node(tdvp)) {
+ error = SET_ERROR(EXDEV);
+ goto out;
+ }
+
+ /*
+ * Lock all four vnodes to ensure safety and semantics of renaming.
+ */
+ error = zfs_rename_relock(sdvp, svpp, tdvp, tvpp, scnp, tcnp);
+ if (error != 0) {
+ /* no vnodes are locked in the case of error here */
+ return (error);
+ }
+
+ tdzp = VTOZ(tdvp);
+ sdzp = VTOZ(sdvp);
+ zfsvfs = tdzp->z_zfsvfs;
+ zilog = zfsvfs->z_log;
+
+ /*
+	 * After we call ZFS_ENTER() again we will have to revalidate all
+ * znodes involved.
+ */
+ ZFS_ENTER(zfsvfs);
+
+ if (zfsvfs->z_utf8 && u8_validate(tnm,
+ strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
+ error = SET_ERROR(EILSEQ);
+ goto unlockout;
+ }
+
+ /* If source and target are the same file, there is nothing to do. */
+ if ((*svpp) == (*tvpp)) {
+ error = 0;
+ goto unlockout;
+ }
+
+ if (((*svpp)->v_type == VDIR && (*svpp)->v_mountedhere != NULL) ||
+ ((*tvpp) != NULL && (*tvpp)->v_type == VDIR &&
+ (*tvpp)->v_mountedhere != NULL)) {
+ error = SET_ERROR(EXDEV);
+ goto unlockout;
+ }
+
+ /*
+	 * We cannot use ZFS_VERIFY_ZP() here because it could return directly,
+ * bypassing the cleanup code in the case of an error.
+ */
+ if (tdzp->z_sa_hdl == NULL || sdzp->z_sa_hdl == NULL) {
+ error = SET_ERROR(EIO);
+ goto unlockout;
+ }
+
+ szp = VTOZ(*svpp);
+ tzp = *tvpp == NULL ? NULL : VTOZ(*tvpp);
+ if (szp->z_sa_hdl == NULL || (tzp != NULL && tzp->z_sa_hdl == NULL)) {
+ error = SET_ERROR(EIO);
+ goto unlockout;
+ }
+
+ /*
+ * This is to prevent the creation of links into attribute space
+	 * by renaming a linked file into/out of an attribute directory.
+ * See the comment in zfs_link() for why this is considered bad.
+ */
+ if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) {
+ error = SET_ERROR(EINVAL);
+ goto unlockout;
+ }
+
+ /*
+	 * If project inheritance is in use, i.e. the directory has
+	 * ZFS_PROJINHERIT set, then its descendant directories inherit
+	 * not only the project ID but also the ZFS_PROJINHERIT flag.
+	 * In that case we only allow renames into our tree when the
+	 * project IDs are the same.
+ */
+ if (tdzp->z_pflags & ZFS_PROJINHERIT &&
+ tdzp->z_projid != szp->z_projid) {
+ error = SET_ERROR(EXDEV);
+ goto unlockout;
+ }
+
+ /*
+ * Must have write access at the source to remove the old entry
+ * and write access at the target to create the new entry.
+ * Note that if target and source are the same, this can be
+ * done in a single check.
+ */
+ if ((error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr)))
+ goto unlockout;
+
+ if ((*svpp)->v_type == VDIR) {
+ /*
+ * Avoid ".", "..", and aliases of "." for obvious reasons.
+ */
+ if ((scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.') ||
+ sdzp == szp ||
+ (scnp->cn_flags | tcnp->cn_flags) & ISDOTDOT) {
+ error = EINVAL;
+ goto unlockout;
+ }
+
+ /*
+ * Check to make sure rename is valid.
+ * Can't do a move like this: /usr/a/b to /usr/a/b/c/d
+ */
+ if ((error = zfs_rename_check(szp, sdzp, tdzp)))
+ goto unlockout;
+ }
+
+ /*
+ * Does target exist?
+ */
+ if (tzp) {
+ /*
+ * Source and target must be the same type.
+ */
+ if ((*svpp)->v_type == VDIR) {
+ if ((*tvpp)->v_type != VDIR) {
+ error = SET_ERROR(ENOTDIR);
+ goto unlockout;
+ } else {
+ cache_purge(tdvp);
+ if (sdvp != tdvp)
+ cache_purge(sdvp);
+ }
+ } else {
+ if ((*tvpp)->v_type == VDIR) {
+ error = SET_ERROR(EISDIR);
+ goto unlockout;
+ }
+ }
+ }
+
+ vn_seqc_write_begin(*svpp);
+ vn_seqc_write_begin(sdvp);
+ if (*tvpp != NULL)
+ vn_seqc_write_begin(*tvpp);
+ if (tdvp != *tvpp)
+ vn_seqc_write_begin(tdvp);
+#if __FreeBSD_version >= 1300102
+ want_seqc_end = true;
+#endif
+ vnevent_rename_src(*svpp, sdvp, scnp->cn_nameptr, ct);
+ if (tzp)
+ vnevent_rename_dest(*tvpp, tdvp, tnm, ct);
+
+ /*
+	 * Notify the target directory if it is not the same
+	 * as the source directory.
+ */
+ if (tdvp != sdvp) {
+ vnevent_rename_dest_dir(tdvp, ct);
+ }
+
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
+ dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE);
+ dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm);
+ dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
+ if (sdzp != tdzp) {
+ dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE);
+ zfs_sa_upgrade_txholds(tx, tdzp);
+ }
+ if (tzp) {
+ dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE);
+ zfs_sa_upgrade_txholds(tx, tzp);
+ }
+
+ zfs_sa_upgrade_txholds(tx, szp);
+ dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ dmu_tx_abort(tx);
+ goto unlockout;
+ }
+
+ if (tzp) /* Attempt to remove the existing target */
+ error = zfs_link_destroy(tdzp, tnm, tzp, tx, 0, NULL);
+
+ if (error == 0) {
+ error = zfs_link_create(tdzp, tnm, szp, tx, ZRENAMING);
+ if (error == 0) {
+ szp->z_pflags |= ZFS_AV_MODIFIED;
+
+ error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs),
+ (void *)&szp->z_pflags, sizeof (uint64_t), tx);
+ ASSERT0(error);
+
+ error = zfs_link_destroy(sdzp, snm, szp, tx, ZRENAMING,
+ NULL);
+ if (error == 0) {
+ zfs_log_rename(zilog, tx, TX_RENAME, sdzp,
+ snm, tdzp, tnm, szp);
+
+ /*
+ * Update path information for the target vnode
+ */
+ vn_renamepath(tdvp, *svpp, tnm, strlen(tnm));
+ } else {
+ /*
+ * At this point, we have successfully created
+ * the target name, but have failed to remove
+ * the source name. Since the create was done
+ * with the ZRENAMING flag, there are
+ * complications; for one, the link count is
+ * wrong. The easiest way to deal with this
+ * is to remove the newly created target, and
+ * return the original error. This must
+ * succeed; fortunately, it is very unlikely to
+ * fail, since we just created it.
+ */
+ VERIFY3U(zfs_link_destroy(tdzp, tnm, szp, tx,
+ ZRENAMING, NULL), ==, 0);
+ }
+ }
+ if (error == 0) {
+ cache_vop_rename(sdvp, *svpp, tdvp, *tvpp, scnp, tcnp);
+ }
+ }
+
+ dmu_tx_commit(tx);
+
+unlockout: /* all 4 vnodes are locked, ZFS_ENTER called */
+ ZFS_EXIT(zfsvfs);
+ if (want_seqc_end) {
+ vn_seqc_write_end(*svpp);
+ vn_seqc_write_end(sdvp);
+ if (*tvpp != NULL)
+ vn_seqc_write_end(*tvpp);
+ if (tdvp != *tvpp)
+ vn_seqc_write_end(tdvp);
+ want_seqc_end = false;
+ }
+ VOP_UNLOCK1(*svpp);
+ VOP_UNLOCK1(sdvp);
+
+out: /* original two vnodes are locked */
+ MPASS(!want_seqc_end);
+ if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ zil_commit(zilog, 0);
+
+ if (*tvpp != NULL)
+ VOP_UNLOCK1(*tvpp);
+ if (tdvp != *tvpp)
+ VOP_UNLOCK1(tdvp);
+ return (error);
+}
+
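+/*
+ * Name-based wrapper around zfs_rename_() used by callers that operate on
+ * znodes and names (such as ZIL replay): resolve the source and target
+ * entries, arrange the vnode references and locks that zfs_rename_()
+ * expects, and drop the references when done.
+ */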
+int
+zfs_rename(znode_t *sdzp, const char *sname, znode_t *tdzp, const char *tname,
+ cred_t *cr, int flags)
+{
+ struct componentname scn, tcn;
+ vnode_t *sdvp, *tdvp;
+ vnode_t *svp, *tvp;
+ int error;
+ svp = tvp = NULL;
+
+ sdvp = ZTOV(sdzp);
+ tdvp = ZTOV(tdzp);
+ error = zfs_lookup_internal(sdzp, sname, &svp, &scn, DELETE);
+ if (sdzp->z_zfsvfs->z_replay == B_FALSE)
+ VOP_UNLOCK1(sdvp);
+ if (error != 0)
+ goto fail;
+ VOP_UNLOCK1(svp);
+
+ vn_lock(tdvp, LK_EXCLUSIVE | LK_RETRY);
+ error = zfs_lookup_internal(tdzp, tname, &tvp, &tcn, RENAME);
+ if (error == EJUSTRETURN)
+ tvp = NULL;
+ else if (error != 0) {
+ VOP_UNLOCK1(tdvp);
+ goto fail;
+ }
+
+ error = zfs_rename_(sdvp, &svp, &scn, tdvp, &tvp, &tcn, cr, 0);
+fail:
+ if (svp != NULL)
+ vrele(svp);
+ if (tvp != NULL)
+ vrele(tvp);
+
+ return (error);
+}
+
+/*
+ * Insert the indicated symbolic reference entry into the directory.
+ *
+ * IN:	dzp	- Directory to contain new symbolic link.
+ *	name	- Name of directory entry in dzp.
+ *	vap	- Attributes of new entry.
+ *	link	- Target path of the symlink.
+ *	cr	- credentials of caller.
+ *	flags	- case flags
+ *
+ * RETURN: 0 on success, error code on failure.
+ *
+ * Timestamps:
+ *	dzp - ctime|mtime updated
+ */
+/*ARGSUSED*/
+int
+zfs_symlink(znode_t *dzp, const char *name, vattr_t *vap,
+ const char *link, znode_t **zpp, cred_t *cr, int flags)
+{
+ znode_t *zp;
+ dmu_tx_t *tx;
+ zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+ zilog_t *zilog;
+ uint64_t len = strlen(link);
+ int error;
+ zfs_acl_ids_t acl_ids;
+ boolean_t fuid_dirtied;
+ uint64_t txtype = TX_SYMLINK;
+
+ ASSERT(vap->va_type == VLNK);
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(dzp);
+ zilog = zfsvfs->z_log;
+
+ if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
+ NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EILSEQ));
+ }
+
+ if (len > MAXPATHLEN) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(ENAMETOOLONG));
+ }
+
+ if ((error = zfs_acl_ids_create(dzp, 0,
+ vap, cr, NULL, &acl_ids)) != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ /*
+ * Attempt to lock directory; fail if entry already exists.
+ */
+ error = zfs_dirent_lookup(dzp, name, &zp, ZNEW);
+ if (error) {
+ zfs_acl_ids_free(&acl_ids);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) {
+ zfs_acl_ids_free(&acl_ids);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ if (zfs_acl_ids_overquota(zfsvfs, &acl_ids,
+ 0 /* projid */)) {
+ zfs_acl_ids_free(&acl_ids);
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EDQUOT));
+ }
+
+ getnewvnode_reserve_();
+ tx = dmu_tx_create(zfsvfs->z_os);
+ fuid_dirtied = zfsvfs->z_fuid_dirty;
+ dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
+ dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
+ dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
+ ZFS_SA_BASE_ATTR_SIZE + len);
+ dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
+ if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
+ dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
+ acl_ids.z_aclp->z_acl_bytes);
+ }
+ if (fuid_dirtied)
+ zfs_fuid_txhold(zfsvfs, tx);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ zfs_acl_ids_free(&acl_ids);
+ dmu_tx_abort(tx);
+ getnewvnode_drop_reserve();
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ /*
+ * Create a new object for the symlink.
+	 * For version 4 ZPL datasets the symlink will be an SA attribute.
+ */
+ zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
+
+ if (fuid_dirtied)
+ zfs_fuid_sync(zfsvfs, tx);
+
+ if (zp->z_is_sa)
+ error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs),
+ __DECONST(void *, link), len, tx);
+ else
+ zfs_sa_symlink(zp, __DECONST(char *, link), len, tx);
+
+ zp->z_size = len;
+ (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
+ &zp->z_size, sizeof (zp->z_size), tx);
+ /*
+ * Insert the new object into the directory.
+ */
+ (void) zfs_link_create(dzp, name, zp, tx, ZNEW);
+
+ zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
+ *zpp = zp;
+
+ zfs_acl_ids_free(&acl_ids);
+
+ dmu_tx_commit(tx);
+
+ getnewvnode_drop_reserve();
+
+ if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ zil_commit(zilog, 0);
+
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+/*
+ * Return, in the buffer contained in the provided uio structure,
+ * the symbolic path referred to by vp.
+ *
+ * IN: vp - vnode of symbolic link.
+ * uio - structure to contain the link path.
+ * cr - credentials of caller.
+ * ct - caller context
+ *
+ * OUT: uio - structure containing the link path.
+ *
+ * RETURN: 0 on success, error code on failure.
+ *
+ * Timestamps:
+ * vp - atime updated
+ */
+/* ARGSUSED */
+static int
+zfs_readlink(vnode_t *vp, zfs_uio_t *uio, cred_t *cr, caller_context_t *ct)
+{
+ znode_t *zp = VTOZ(vp);
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ int error;
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ if (zp->z_is_sa)
+ error = sa_lookup_uio(zp->z_sa_hdl,
+ SA_ZPL_SYMLINK(zfsvfs), uio);
+ else
+ error = zfs_sa_readlink(zp, uio);
+
+ ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
+
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+/*
+ * Insert a new entry into directory tdvp referencing svp.
+ *
+ * IN: tdvp - Directory to contain new entry.
+ * svp - vnode of new entry.
+ * name - name of new entry.
+ * cr - credentials of caller.
+ *
+ * RETURN: 0 on success, error code on failure.
+ *
+ * Timestamps:
+ * tdvp - ctime|mtime updated
+ * svp - ctime updated
+ */
+/* ARGSUSED */
+int
+zfs_link(znode_t *tdzp, znode_t *szp, const char *name, cred_t *cr,
+ int flags)
+{
+ znode_t *tzp;
+ zfsvfs_t *zfsvfs = tdzp->z_zfsvfs;
+ zilog_t *zilog;
+ dmu_tx_t *tx;
+ int error;
+ uint64_t parent;
+ uid_t owner;
+
+ ASSERT(ZTOV(tdzp)->v_type == VDIR);
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(tdzp);
+ zilog = zfsvfs->z_log;
+
+ /*
+ * POSIX dictates that we return EPERM here.
+ * Better choices include ENOTSUP or EISDIR.
+ */
+ if (ZTOV(szp)->v_type == VDIR) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EPERM));
+ }
+
+ ZFS_VERIFY_ZP(szp);
+
+ /*
+	 * If project inheritance is in use, i.e. the directory has
+	 * ZFS_PROJINHERIT set, then its descendant directories inherit
+	 * not only the project ID but also the ZFS_PROJINHERIT flag.
+	 * In that case we only allow hard link creation in our tree
+	 * when the project IDs are the same.
+ */
+ if (tdzp->z_pflags & ZFS_PROJINHERIT &&
+ tdzp->z_projid != szp->z_projid) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EXDEV));
+ }
+
+ if (szp->z_pflags & (ZFS_APPENDONLY |
+ ZFS_IMMUTABLE | ZFS_READONLY)) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EPERM));
+ }
+
+ /* Prevent links to .zfs/shares files */
+
+ if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
+ &parent, sizeof (uint64_t))) != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+ if (parent == zfsvfs->z_shares_dir) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EPERM));
+ }
+
+ if (zfsvfs->z_utf8 && u8_validate(name,
+ strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EILSEQ));
+ }
+
+ /*
+ * We do not support links between attributes and non-attributes
+ * because of the potential security risk of creating links
+ * into "normal" file space in order to circumvent restrictions
+ * imposed in attribute space.
+ */
+ if ((szp->z_pflags & ZFS_XATTR) != (tdzp->z_pflags & ZFS_XATTR)) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EINVAL));
+ }
+
+ owner = zfs_fuid_map_id(zfsvfs, szp->z_uid, cr, ZFS_OWNER);
+ if (owner != crgetuid(cr) && secpolicy_basic_link(ZTOV(szp), cr) != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EPERM));
+ }
+
+ if ((error = zfs_zaccess(tdzp, ACE_ADD_FILE, 0, B_FALSE, cr))) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ /*
+ * Attempt to lock directory; fail if entry already exists.
+ */
+ error = zfs_dirent_lookup(tdzp, name, &tzp, ZNEW);
+ if (error) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
+ dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, name);
+ zfs_sa_upgrade_txholds(tx, szp);
+ zfs_sa_upgrade_txholds(tx, tdzp);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ dmu_tx_abort(tx);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ error = zfs_link_create(tdzp, name, szp, tx, 0);
+
+ if (error == 0) {
+ uint64_t txtype = TX_LINK;
+ zfs_log_link(zilog, tx, txtype, tdzp, szp, name);
+ }
+
+ dmu_tx_commit(tx);
+
+ if (error == 0) {
+ vnevent_link(ZTOV(szp), ct);
+ }
+
+ if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ zil_commit(zilog, 0);
+
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+/*
+ * Free or allocate space in a file. Currently, this function only
+ * supports the `F_FREESP' command. However, this command is somewhat
+ * misnamed, as its functionality includes the ability to allocate as
+ * well as free space.
+ *
+ * IN:	zp	- znode of file to free data in.
+ * cmd - action to take (only F_FREESP supported).
+ * bfp - section of file to free/alloc.
+ * flag - current file open mode flags.
+ * offset - current file offset.
+ * cr - credentials of caller.
+ *
+ * RETURN: 0 on success, error code on failure.
+ *
+ * Timestamps:
+ *	zp - ctime|mtime updated
+ */
+/* ARGSUSED */
+int
+zfs_space(znode_t *zp, int cmd, flock64_t *bfp, int flag,
+ offset_t offset, cred_t *cr)
+{
+ zfsvfs_t *zfsvfs = ZTOZSB(zp);
+ uint64_t off, len;
+ int error;
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ if (cmd != F_FREESP) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EINVAL));
+ }
+
+ /*
+ * Callers might not be able to detect properly that we are read-only,
+ * so check it explicitly here.
+ */
+ if (zfs_is_readonly(zfsvfs)) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EROFS));
+ }
+
+ if (bfp->l_len < 0) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EINVAL));
+ }
+
+ /*
+ * Permissions aren't checked on Solaris because on this OS
+ * zfs_space() can only be called with an opened file handle.
+ * On Linux we can get here through truncate_range() which
+ * operates directly on inodes, so we need to check access rights.
+ */
+ if ((error = zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr))) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ off = bfp->l_start;
+ len = bfp->l_len; /* 0 means from off to end of file */
+
+ error = zfs_freesp(zp, off, len, flag, TRUE);
+
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
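+/*
+ * Called when a vnode's use count drops to zero: push out a dirty atime if
+ * needed and recycle the vnode if the file was unlinked or the filesystem
+ * has been torn down.
+ */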
+/*ARGSUSED*/
+static void
+zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
+{
+ znode_t *zp = VTOZ(vp);
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ int error;
+
+ ZFS_TEARDOWN_INACTIVE_ENTER_READ(zfsvfs);
+ if (zp->z_sa_hdl == NULL) {
+ /*
+ * The fs has been unmounted, or we did a
+ * suspend/resume and this file no longer exists.
+ */
+ ZFS_TEARDOWN_INACTIVE_EXIT_READ(zfsvfs);
+ vrecycle(vp);
+ return;
+ }
+
+ if (zp->z_unlinked) {
+ /*
+ * Fast path to recycle a vnode of a removed file.
+ */
+ ZFS_TEARDOWN_INACTIVE_EXIT_READ(zfsvfs);
+ vrecycle(vp);
+ return;
+ }
+
+ if (zp->z_atime_dirty && zp->z_unlinked == 0) {
+ dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
+
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+ zfs_sa_upgrade_txholds(tx, zp);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ dmu_tx_abort(tx);
+ } else {
+ (void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs),
+ (void *)&zp->z_atime, sizeof (zp->z_atime), tx);
+ zp->z_atime_dirty = 0;
+ dmu_tx_commit(tx);
+ }
+ }
+ ZFS_TEARDOWN_INACTIVE_EXIT_READ(zfsvfs);
+}
+
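+/* Both ZFS file handle layouts must fit within the generic struct fid. */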
+CTASSERT(sizeof (struct zfid_short) <= sizeof (struct fid));
+CTASSERT(sizeof (struct zfid_long) <= sizeof (struct fid));
+
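+/*
+ * Construct an NFS-style file handle for the vnode: encode the object number
+ * and generation number, and for filesystems that are not their own parent
+ * (e.g. snapshots under .zfs) use the long form that also carries the
+ * objset id.
+ */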
+/*ARGSUSED*/
+static int
+zfs_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
+{
+ znode_t *zp = VTOZ(vp);
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ uint32_t gen;
+ uint64_t gen64;
+ uint64_t object = zp->z_id;
+ zfid_short_t *zfid;
+ int size, i, error;
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs),
+ &gen64, sizeof (uint64_t))) != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ gen = (uint32_t)gen64;
+
+ size = (zfsvfs->z_parent != zfsvfs) ? LONG_FID_LEN : SHORT_FID_LEN;
+ fidp->fid_len = size;
+
+ zfid = (zfid_short_t *)fidp;
+
+ zfid->zf_len = size;
+
+ for (i = 0; i < sizeof (zfid->zf_object); i++)
+ zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
+
+ /* Must have a non-zero generation number to distinguish from .zfs */
+ if (gen == 0)
+ gen = 1;
+ for (i = 0; i < sizeof (zfid->zf_gen); i++)
+ zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i));
+
+ if (size == LONG_FID_LEN) {
+ uint64_t objsetid = dmu_objset_id(zfsvfs->z_os);
+ zfid_long_t *zlfid;
+
+ zlfid = (zfid_long_t *)fidp;
+
+ for (i = 0; i < sizeof (zlfid->zf_setid); i++)
+ zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i));
+
+ /* XXX - this should be the generation number for the objset */
+ for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
+ zlfid->zf_setgen[i] = 0;
+ }
+
+ ZFS_EXIT(zfsvfs);
+ return (0);
+}
+
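+/*
+ * Report filesystem limits and ACL capabilities for pathconf(2)/fpathconf(2).
+ */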
+static int
+zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
+ caller_context_t *ct)
+{
+ znode_t *zp;
+ zfsvfs_t *zfsvfs;
+
+ switch (cmd) {
+ case _PC_LINK_MAX:
+ *valp = MIN(LONG_MAX, ZFS_LINK_MAX);
+ return (0);
+
+ case _PC_FILESIZEBITS:
+ *valp = 64;
+ return (0);
+ case _PC_MIN_HOLE_SIZE:
+ *valp = (int)SPA_MINBLOCKSIZE;
+ return (0);
+ case _PC_ACL_EXTENDED:
+#if 0 /* POSIX ACLs are not implemented for ZFS on FreeBSD yet. */
+ zp = VTOZ(vp);
+ zfsvfs = zp->z_zfsvfs;
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+		*valp = zfsvfs->z_acl_type == ZFS_ACLTYPE_POSIX ? 1 : 0;
+ ZFS_EXIT(zfsvfs);
+#else
+ *valp = 0;
+#endif
+ return (0);
+
+ case _PC_ACL_NFS4:
+ zp = VTOZ(vp);
+ zfsvfs = zp->z_zfsvfs;
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+ *valp = zfsvfs->z_acl_type == ZFS_ACLTYPE_NFSV4 ? 1 : 0;
+ ZFS_EXIT(zfsvfs);
+ return (0);
+
+ case _PC_ACL_PATH_MAX:
+ *valp = ACL_MAX_ENTRIES;
+ return (0);
+
+ default:
+ return (EOPNOTSUPP);
+ }
+}
+
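+/*
+ * VOP_GETPAGES worker: read the requested pages, plus any read-behind and
+ * read-ahead pages that fit, from the DMU while holding a range lock that
+ * covers the affected blocks.
+ */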
+static int
+zfs_getpages(struct vnode *vp, vm_page_t *ma, int count, int *rbehind,
+ int *rahead)
+{
+ znode_t *zp = VTOZ(vp);
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ objset_t *os = zp->z_zfsvfs->z_os;
+ zfs_locked_range_t *lr;
+ vm_object_t object;
+ off_t start, end, obj_size;
+ uint_t blksz;
+ int pgsin_b, pgsin_a;
+ int error;
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ start = IDX_TO_OFF(ma[0]->pindex);
+ end = IDX_TO_OFF(ma[count - 1]->pindex + 1);
+
+ /*
+ * Lock a range covering all required and optional pages.
+ * Note that we need to handle the case of the block size growing.
+ */
+ for (;;) {
+ blksz = zp->z_blksz;
+ lr = zfs_rangelock_tryenter(&zp->z_rangelock,
+ rounddown(start, blksz),
+ roundup(end, blksz) - rounddown(start, blksz), RL_READER);
+ if (lr == NULL) {
+ if (rahead != NULL) {
+ *rahead = 0;
+ rahead = NULL;
+ }
+ if (rbehind != NULL) {
+ *rbehind = 0;
+ rbehind = NULL;
+ }
+ break;
+ }
+ if (blksz == zp->z_blksz)
+ break;
+ zfs_rangelock_exit(lr);
+ }
+
+ object = ma[0]->object;
+ zfs_vmobject_wlock(object);
+ obj_size = object->un_pager.vnp.vnp_size;
+ zfs_vmobject_wunlock(object);
+ if (IDX_TO_OFF(ma[count - 1]->pindex) >= obj_size) {
+ if (lr != NULL)
+ zfs_rangelock_exit(lr);
+ ZFS_EXIT(zfsvfs);
+ return (zfs_vm_pagerret_bad);
+ }
+
+ pgsin_b = 0;
+ if (rbehind != NULL) {
+ pgsin_b = OFF_TO_IDX(start - rounddown(start, blksz));
+ pgsin_b = MIN(*rbehind, pgsin_b);
+ }
+
+ pgsin_a = 0;
+ if (rahead != NULL) {
+ pgsin_a = OFF_TO_IDX(roundup(end, blksz) - end);
+ if (end + IDX_TO_OFF(pgsin_a) >= obj_size)
+ pgsin_a = OFF_TO_IDX(round_page(obj_size) - end);
+ pgsin_a = MIN(*rahead, pgsin_a);
+ }
+
+ /*
+ * NB: we need to pass the exact byte size of the data that we expect
+ * to read after accounting for the file size. This is required because
+ * ZFS will panic if we request DMU to read beyond the end of the last
+ * allocated block.
+ */
+ error = dmu_read_pages(os, zp->z_id, ma, count, &pgsin_b, &pgsin_a,
+ MIN(end, obj_size) - (end - PAGE_SIZE));
+
+ if (lr != NULL)
+ zfs_rangelock_exit(lr);
+ ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
+ ZFS_EXIT(zfsvfs);
+
+ if (error != 0)
+ return (zfs_vm_pagerret_error);
+
+ VM_CNT_INC(v_vnodein);
+ VM_CNT_ADD(v_vnodepgsin, count + pgsin_b + pgsin_a);
+ if (rbehind != NULL)
+ *rbehind = pgsin_b;
+ if (rahead != NULL)
+ *rahead = pgsin_a;
+ return (zfs_vm_pagerret_ok);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_getpages_args {
+ struct vnode *a_vp;
+ vm_page_t *a_m;
+ int a_count;
+ int *a_rbehind;
+ int *a_rahead;
+};
+#endif
+
+static int
+zfs_freebsd_getpages(struct vop_getpages_args *ap)
+{
+
+ return (zfs_getpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_rbehind,
+ ap->a_rahead));
+}
+
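+/*
+ * VOP_PUTPAGES worker: write the dirty pages back to the DMU under a range
+ * lock, update the timestamps, log the write, and mark the pages clean.
+ */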
+static int
+zfs_putpages(struct vnode *vp, vm_page_t *ma, size_t len, int flags,
+ int *rtvals)
+{
+ znode_t *zp = VTOZ(vp);
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ zfs_locked_range_t *lr;
+ dmu_tx_t *tx;
+ struct sf_buf *sf;
+ vm_object_t object;
+ vm_page_t m;
+ caddr_t va;
+ size_t tocopy;
+ size_t lo_len;
+ vm_ooffset_t lo_off;
+ vm_ooffset_t off;
+ uint_t blksz;
+ int ncount;
+ int pcount;
+ int err;
+ int i;
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ object = vp->v_object;
+ pcount = btoc(len);
+ ncount = pcount;
+
+ KASSERT(ma[0]->object == object, ("mismatching object"));
+ KASSERT(len > 0 && (len & PAGE_MASK) == 0, ("unexpected length"));
+
+ for (i = 0; i < pcount; i++)
+ rtvals[i] = zfs_vm_pagerret_error;
+
+ off = IDX_TO_OFF(ma[0]->pindex);
+ blksz = zp->z_blksz;
+ lo_off = rounddown(off, blksz);
+ lo_len = roundup(len + (off - lo_off), blksz);
+ lr = zfs_rangelock_enter(&zp->z_rangelock, lo_off, lo_len, RL_WRITER);
+
+ zfs_vmobject_wlock(object);
+ if (len + off > object->un_pager.vnp.vnp_size) {
+ if (object->un_pager.vnp.vnp_size > off) {
+ int pgoff;
+
+ len = object->un_pager.vnp.vnp_size - off;
+ ncount = btoc(len);
+ if ((pgoff = (int)len & PAGE_MASK) != 0) {
+ /*
+ * If the object is locked and the following
+ * conditions hold, then the page's dirty
+ * field cannot be concurrently changed by a
+ * pmap operation.
+ */
+ m = ma[ncount - 1];
+ vm_page_assert_sbusied(m);
+ KASSERT(!pmap_page_is_write_mapped(m),
+ ("zfs_putpages: page %p is not read-only",
+ m));
+ vm_page_clear_dirty(m, pgoff, PAGE_SIZE -
+ pgoff);
+ }
+ } else {
+ len = 0;
+ ncount = 0;
+ }
+ if (ncount < pcount) {
+ for (i = ncount; i < pcount; i++) {
+ rtvals[i] = zfs_vm_pagerret_bad;
+ }
+ }
+ }
+ zfs_vmobject_wunlock(object);
+
+ if (ncount == 0)
+ goto out;
+
+ if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT, zp->z_uid) ||
+ zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT, zp->z_gid) ||
+ (zp->z_projid != ZFS_DEFAULT_PROJID &&
+ zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT,
+ zp->z_projid))) {
+ goto out;
+ }
+
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_write(tx, zp->z_id, off, len);
+
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+ zfs_sa_upgrade_txholds(tx, zp);
+ err = dmu_tx_assign(tx, TXG_WAIT);
+ if (err != 0) {
+ dmu_tx_abort(tx);
+ goto out;
+ }
+
+ if (zp->z_blksz < PAGE_SIZE) {
+ for (i = 0; len > 0; off += tocopy, len -= tocopy, i++) {
+ tocopy = len > PAGE_SIZE ? PAGE_SIZE : len;
+ va = zfs_map_page(ma[i], &sf);
+ dmu_write(zfsvfs->z_os, zp->z_id, off, tocopy, va, tx);
+ zfs_unmap_page(sf);
+ }
+ } else {
+ err = dmu_write_pages(zfsvfs->z_os, zp->z_id, off, len, ma, tx);
+ }
+
+ if (err == 0) {
+ uint64_t mtime[2], ctime[2];
+ sa_bulk_attr_t bulk[3];
+ int count = 0;
+
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
+ &mtime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
+ &ctime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+ &zp->z_pflags, 8);
+ zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime);
+ err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
+ ASSERT0(err);
+ /*
+ * XXX we should be passing a callback to undirty
+ * but that would make the locking messier
+ */
+ zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off,
+ len, 0, NULL, NULL);
+
+ zfs_vmobject_wlock(object);
+ for (i = 0; i < ncount; i++) {
+ rtvals[i] = zfs_vm_pagerret_ok;
+ vm_page_undirty(ma[i]);
+ }
+ zfs_vmobject_wunlock(object);
+ VM_CNT_INC(v_vnodeout);
+ VM_CNT_ADD(v_vnodepgsout, ncount);
+ }
+ dmu_tx_commit(tx);
+
+out:
+ zfs_rangelock_exit(lr);
+ if ((flags & (zfs_vm_pagerput_sync | zfs_vm_pagerput_inval)) != 0 ||
+ zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ zil_commit(zfsvfs->z_log, zp->z_id);
+ ZFS_EXIT(zfsvfs);
+ return (rtvals[0]);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_putpages_args {
+ struct vnode *a_vp;
+ vm_page_t *a_m;
+ int a_count;
+ int a_sync;
+ int *a_rtvals;
+};
+#endif
+
+static int
+zfs_freebsd_putpages(struct vop_putpages_args *ap)
+{
+
+ return (zfs_putpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_sync,
+ ap->a_rtvals));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_bmap_args {
+ struct vnode *a_vp;
+ daddr_t a_bn;
+ struct bufobj **a_bop;
+ daddr_t *a_bnp;
+ int *a_runp;
+ int *a_runb;
+};
+#endif
+
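+/*
+ * ZFS caches file data in the ARC rather than the buffer cache, so VOP_BMAP
+ * simply reports an identity block mapping with no read-ahead/read-behind
+ * runs.
+ */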
+static int
+zfs_freebsd_bmap(struct vop_bmap_args *ap)
+{
+
+ if (ap->a_bop != NULL)
+ *ap->a_bop = &ap->a_vp->v_bufobj;
+ if (ap->a_bnp != NULL)
+ *ap->a_bnp = ap->a_bn;
+ if (ap->a_runp != NULL)
+ *ap->a_runp = 0;
+ if (ap->a_runb != NULL)
+ *ap->a_runb = 0;
+
+ return (0);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_open_args {
+ struct vnode *a_vp;
+ int a_mode;
+ struct ucred *a_cred;
+ struct thread *a_td;
+};
+#endif
+
+static int
+zfs_freebsd_open(struct vop_open_args *ap)
+{
+ vnode_t *vp = ap->a_vp;
+ znode_t *zp = VTOZ(vp);
+ int error;
+
+ error = zfs_open(&vp, ap->a_mode, ap->a_cred);
+ if (error == 0)
+ vnode_create_vobject(vp, zp->z_size, ap->a_td);
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_close_args {
+ struct vnode *a_vp;
+ int a_fflag;
+ struct ucred *a_cred;
+ struct thread *a_td;
+};
+#endif
+
+static int
+zfs_freebsd_close(struct vop_close_args *ap)
+{
+
+ return (zfs_close(ap->a_vp, ap->a_fflag, 1, 0, ap->a_cred));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_ioctl_args {
+ struct vnode *a_vp;
+ ulong_t a_command;
+ caddr_t a_data;
+ int a_fflag;
+ struct ucred *cred;
+ struct thread *td;
+};
+#endif
+
+static int
+zfs_freebsd_ioctl(struct vop_ioctl_args *ap)
+{
+
+ return (zfs_ioctl(ap->a_vp, ap->a_command, (intptr_t)ap->a_data,
+ ap->a_fflag, ap->a_cred, NULL));
+}
+
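+/*
+ * Translate FreeBSD ioflag bits (IO_APPEND, IO_NDELAY, IO_SYNC) into the
+ * FAPPEND/FNONBLOCK/FSYNC-style flags used by the common ZFS read/write
+ * code.
+ */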
+static int
+ioflags(int ioflags)
+{
+ int flags = 0;
+
+ if (ioflags & IO_APPEND)
+ flags |= FAPPEND;
+ if (ioflags & IO_NDELAY)
+ flags |= FNONBLOCK;
+ if (ioflags & IO_SYNC)
+ flags |= (FSYNC | FDSYNC | FRSYNC);
+
+ return (flags);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_read_args {
+ struct vnode *a_vp;
+ struct uio *a_uio;
+ int a_ioflag;
+ struct ucred *a_cred;
+};
+#endif
+
+static int
+zfs_freebsd_read(struct vop_read_args *ap)
+{
+ zfs_uio_t uio;
+ zfs_uio_init(&uio, ap->a_uio);
+ return (zfs_read(VTOZ(ap->a_vp), &uio, ioflags(ap->a_ioflag),
+ ap->a_cred));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_write_args {
+ struct vnode *a_vp;
+ struct uio *a_uio;
+ int a_ioflag;
+ struct ucred *a_cred;
+};
+#endif
+
+static int
+zfs_freebsd_write(struct vop_write_args *ap)
+{
+ zfs_uio_t uio;
+ zfs_uio_init(&uio, ap->a_uio);
+ return (zfs_write(VTOZ(ap->a_vp), &uio, ioflags(ap->a_ioflag),
+ ap->a_cred));
+}
+
+#if __FreeBSD_version >= 1300102
+/*
+ * VOP_FPLOOKUP_VEXEC routines are subject to special circumstances, see
+ * the comment above cache_fplookup for details.
+ */
+static int
+zfs_freebsd_fplookup_vexec(struct vop_fplookup_vexec_args *v)
+{
+ vnode_t *vp;
+ znode_t *zp;
+ uint64_t pflags;
+
+ vp = v->a_vp;
+ zp = VTOZ_SMR(vp);
+ if (__predict_false(zp == NULL))
+ return (EAGAIN);
+ pflags = atomic_load_64(&zp->z_pflags);
+ if (pflags & ZFS_AV_QUARANTINED)
+ return (EAGAIN);
+ if (pflags & ZFS_XATTR)
+ return (EAGAIN);
+ if ((pflags & ZFS_NO_EXECS_DENIED) == 0)
+ return (EAGAIN);
+ return (0);
+}
+#endif
+
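+/*
+ * Lockless (SMR) symlink resolution for the fast-path lookup: only succeed
+ * when a cached copy of the link target is available; otherwise return
+ * EAGAIN to fall back to the slow path.
+ */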
+static int
+zfs_freebsd_fplookup_symlink(struct vop_fplookup_symlink_args *v)
+{
+ vnode_t *vp;
+ znode_t *zp;
+ char *target;
+
+ vp = v->a_vp;
+ zp = VTOZ_SMR(vp);
+ if (__predict_false(zp == NULL)) {
+ return (EAGAIN);
+ }
+
+ target = atomic_load_consume_ptr(&zp->z_cached_symlink);
+ if (target == NULL) {
+ return (EAGAIN);
+ }
+ return (cache_symlink_resolve(v->a_fpl, target, strlen(target)));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_access_args {
+ struct vnode *a_vp;
+ accmode_t a_accmode;
+ struct ucred *a_cred;
+ struct thread *a_td;
+};
+#endif
+
+static int
+zfs_freebsd_access(struct vop_access_args *ap)
+{
+ vnode_t *vp = ap->a_vp;
+ znode_t *zp = VTOZ(vp);
+ accmode_t accmode;
+ int error = 0;
+
+ if (ap->a_accmode == VEXEC) {
+ if (zfs_fastaccesschk_execute(zp, ap->a_cred) == 0)
+ return (0);
+ }
+
+ /*
+	 * ZFS itself only knows about VREAD, VWRITE, VEXEC and VAPPEND.
+ */
+ accmode = ap->a_accmode & (VREAD|VWRITE|VEXEC|VAPPEND);
+ if (accmode != 0)
+ error = zfs_access(zp, accmode, 0, ap->a_cred);
+
+ /*
+ * VADMIN has to be handled by vaccess().
+ */
+ if (error == 0) {
+ accmode = ap->a_accmode & ~(VREAD|VWRITE|VEXEC|VAPPEND);
+ if (accmode != 0) {
+#if __FreeBSD_version >= 1300105
+ error = vaccess(vp->v_type, zp->z_mode, zp->z_uid,
+ zp->z_gid, accmode, ap->a_cred);
+#else
+ error = vaccess(vp->v_type, zp->z_mode, zp->z_uid,
+ zp->z_gid, accmode, ap->a_cred, NULL);
+#endif
+ }
+ }
+
+ /*
+ * For VEXEC, ensure that at least one execute bit is set for
+ * non-directories.
+ */
+ if (error == 0 && (ap->a_accmode & VEXEC) != 0 && vp->v_type != VDIR &&
+ (zp->z_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0) {
+ error = EACCES;
+ }
+
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_lookup_args {
+ struct vnode *a_dvp;
+ struct vnode **a_vpp;
+ struct componentname *a_cnp;
+};
+#endif
+
+static int
+zfs_freebsd_lookup(struct vop_lookup_args *ap, boolean_t cached)
+{
+ struct componentname *cnp = ap->a_cnp;
+ char nm[NAME_MAX + 1];
+
+ ASSERT(cnp->cn_namelen < sizeof (nm));
+ strlcpy(nm, cnp->cn_nameptr, MIN(cnp->cn_namelen + 1, sizeof (nm)));
+
+ return (zfs_lookup(ap->a_dvp, nm, ap->a_vpp, cnp, cnp->cn_nameiop,
+ cnp->cn_cred, cnp->cn_thread, 0, cached));
+}
+
+static int
+zfs_freebsd_cachedlookup(struct vop_cachedlookup_args *ap)
+{
+
+ return (zfs_freebsd_lookup((struct vop_lookup_args *)ap, B_TRUE));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_lookup_args {
+ struct vnode *a_dvp;
+ struct vnode **a_vpp;
+ struct componentname *a_cnp;
+};
+#endif
+
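+/*
+ * Dispatch lookups through the name cache when it is enabled for this
+ * dataset (z_use_namecache); otherwise call zfs_lookup() directly.
+ */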
+static int
+zfs_cache_lookup(struct vop_lookup_args *ap)
+{
+ zfsvfs_t *zfsvfs;
+
+ zfsvfs = ap->a_dvp->v_mount->mnt_data;
+ if (zfsvfs->z_use_namecache)
+ return (vfs_cache_lookup(ap));
+ else
+ return (zfs_freebsd_lookup(ap, B_FALSE));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_create_args {
+ struct vnode *a_dvp;
+ struct vnode **a_vpp;
+ struct componentname *a_cnp;
+ struct vattr *a_vap;
+};
+#endif
+
+static int
+zfs_freebsd_create(struct vop_create_args *ap)
+{
+ zfsvfs_t *zfsvfs;
+ struct componentname *cnp = ap->a_cnp;
+ vattr_t *vap = ap->a_vap;
+ znode_t *zp = NULL;
+ int rc, mode;
+
+ ASSERT(cnp->cn_flags & SAVENAME);
+
+ vattr_init_mask(vap);
+ mode = vap->va_mode & ALLPERMS;
+ zfsvfs = ap->a_dvp->v_mount->mnt_data;
+ *ap->a_vpp = NULL;
+
+ rc = zfs_create(VTOZ(ap->a_dvp), cnp->cn_nameptr, vap, !EXCL, mode,
+ &zp, cnp->cn_cred, 0 /* flag */, NULL /* vsecattr */);
+ if (rc == 0)
+ *ap->a_vpp = ZTOV(zp);
+ if (zfsvfs->z_use_namecache &&
+ rc == 0 && (cnp->cn_flags & MAKEENTRY) != 0)
+ cache_enter(ap->a_dvp, *ap->a_vpp, cnp);
+
+ return (rc);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_remove_args {
+ struct vnode *a_dvp;
+ struct vnode *a_vp;
+ struct componentname *a_cnp;
+};
+#endif
+
+static int
+zfs_freebsd_remove(struct vop_remove_args *ap)
+{
+
+ ASSERT(ap->a_cnp->cn_flags & SAVENAME);
+
+ return (zfs_remove_(ap->a_dvp, ap->a_vp, ap->a_cnp->cn_nameptr,
+ ap->a_cnp->cn_cred));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_mkdir_args {
+ struct vnode *a_dvp;
+ struct vnode **a_vpp;
+ struct componentname *a_cnp;
+ struct vattr *a_vap;
+};
+#endif
+
+static int
+zfs_freebsd_mkdir(struct vop_mkdir_args *ap)
+{
+ vattr_t *vap = ap->a_vap;
+ znode_t *zp = NULL;
+ int rc;
+
+ ASSERT(ap->a_cnp->cn_flags & SAVENAME);
+
+ vattr_init_mask(vap);
+ *ap->a_vpp = NULL;
+
+ rc = zfs_mkdir(VTOZ(ap->a_dvp), ap->a_cnp->cn_nameptr, vap, &zp,
+ ap->a_cnp->cn_cred, 0, NULL);
+
+ if (rc == 0)
+ *ap->a_vpp = ZTOV(zp);
+ return (rc);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_rmdir_args {
+ struct vnode *a_dvp;
+ struct vnode *a_vp;
+ struct componentname *a_cnp;
+};
+#endif
+
+static int
+zfs_freebsd_rmdir(struct vop_rmdir_args *ap)
+{
+ struct componentname *cnp = ap->a_cnp;
+
+ ASSERT(cnp->cn_flags & SAVENAME);
+
+ return (zfs_rmdir_(ap->a_dvp, ap->a_vp, cnp->cn_nameptr, cnp->cn_cred));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_readdir_args {
+ struct vnode *a_vp;
+ struct uio *a_uio;
+ struct ucred *a_cred;
+ int *a_eofflag;
+ int *a_ncookies;
+ ulong_t **a_cookies;
+};
+#endif
+
+static int
+zfs_freebsd_readdir(struct vop_readdir_args *ap)
+{
+ zfs_uio_t uio;
+ zfs_uio_init(&uio, ap->a_uio);
+ return (zfs_readdir(ap->a_vp, &uio, ap->a_cred, ap->a_eofflag,
+ ap->a_ncookies, ap->a_cookies));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_fsync_args {
+ struct vnode *a_vp;
+ int a_waitfor;
+ struct thread *a_td;
+};
+#endif
+
+static int
+zfs_freebsd_fsync(struct vop_fsync_args *ap)
+{
+
+ vop_stdfsync(ap);
+ return (zfs_fsync(VTOZ(ap->a_vp), 0, ap->a_td->td_ucred));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_getattr_args {
+ struct vnode *a_vp;
+ struct vattr *a_vap;
+ struct ucred *a_cred;
+};
+#endif
+
+static int
+zfs_freebsd_getattr(struct vop_getattr_args *ap)
+{
+ vattr_t *vap = ap->a_vap;
+ xvattr_t xvap;
+ ulong_t fflags = 0;
+ int error;
+
+ xva_init(&xvap);
+ xvap.xva_vattr = *vap;
+ xvap.xva_vattr.va_mask |= AT_XVATTR;
+
+ /* Convert chflags into ZFS-type flags. */
+	/* XXX: what about SF_SETTABLE? */
+ XVA_SET_REQ(&xvap, XAT_IMMUTABLE);
+ XVA_SET_REQ(&xvap, XAT_APPENDONLY);
+ XVA_SET_REQ(&xvap, XAT_NOUNLINK);
+ XVA_SET_REQ(&xvap, XAT_NODUMP);
+ XVA_SET_REQ(&xvap, XAT_READONLY);
+ XVA_SET_REQ(&xvap, XAT_ARCHIVE);
+ XVA_SET_REQ(&xvap, XAT_SYSTEM);
+ XVA_SET_REQ(&xvap, XAT_HIDDEN);
+ XVA_SET_REQ(&xvap, XAT_REPARSE);
+ XVA_SET_REQ(&xvap, XAT_OFFLINE);
+ XVA_SET_REQ(&xvap, XAT_SPARSE);
+
+ error = zfs_getattr(ap->a_vp, (vattr_t *)&xvap, 0, ap->a_cred);
+ if (error != 0)
+ return (error);
+
+ /* Convert ZFS xattr into chflags. */
+#define FLAG_CHECK(fflag, xflag, xfield) do { \
+ if (XVA_ISSET_RTN(&xvap, (xflag)) && (xfield) != 0) \
+ fflags |= (fflag); \
+} while (0)
+ FLAG_CHECK(SF_IMMUTABLE, XAT_IMMUTABLE,
+ xvap.xva_xoptattrs.xoa_immutable);
+ FLAG_CHECK(SF_APPEND, XAT_APPENDONLY,
+ xvap.xva_xoptattrs.xoa_appendonly);
+ FLAG_CHECK(SF_NOUNLINK, XAT_NOUNLINK,
+ xvap.xva_xoptattrs.xoa_nounlink);
+ FLAG_CHECK(UF_ARCHIVE, XAT_ARCHIVE,
+ xvap.xva_xoptattrs.xoa_archive);
+ FLAG_CHECK(UF_NODUMP, XAT_NODUMP,
+ xvap.xva_xoptattrs.xoa_nodump);
+ FLAG_CHECK(UF_READONLY, XAT_READONLY,
+ xvap.xva_xoptattrs.xoa_readonly);
+ FLAG_CHECK(UF_SYSTEM, XAT_SYSTEM,
+ xvap.xva_xoptattrs.xoa_system);
+ FLAG_CHECK(UF_HIDDEN, XAT_HIDDEN,
+ xvap.xva_xoptattrs.xoa_hidden);
+ FLAG_CHECK(UF_REPARSE, XAT_REPARSE,
+ xvap.xva_xoptattrs.xoa_reparse);
+ FLAG_CHECK(UF_OFFLINE, XAT_OFFLINE,
+ xvap.xva_xoptattrs.xoa_offline);
+ FLAG_CHECK(UF_SPARSE, XAT_SPARSE,
+ xvap.xva_xoptattrs.xoa_sparse);
+
+#undef FLAG_CHECK
+ *vap = xvap.xva_vattr;
+ vap->va_flags = fflags;
+ return (0);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_setattr_args {
+ struct vnode *a_vp;
+ struct vattr *a_vap;
+ struct ucred *a_cred;
+};
+#endif
+
+static int
+zfs_freebsd_setattr(struct vop_setattr_args *ap)
+{
+ vnode_t *vp = ap->a_vp;
+ vattr_t *vap = ap->a_vap;
+ cred_t *cred = ap->a_cred;
+ xvattr_t xvap;
+ ulong_t fflags;
+ uint64_t zflags;
+
+ vattr_init_mask(vap);
+ vap->va_mask &= ~AT_NOSET;
+
+ xva_init(&xvap);
+ xvap.xva_vattr = *vap;
+
+ zflags = VTOZ(vp)->z_pflags;
+
+ if (vap->va_flags != VNOVAL) {
+ zfsvfs_t *zfsvfs = VTOZ(vp)->z_zfsvfs;
+ int error;
+
+ if (zfsvfs->z_use_fuids == B_FALSE)
+ return (EOPNOTSUPP);
+
+ fflags = vap->va_flags;
+ /*
+ * XXX KDM
+ * We need to figure out whether it makes sense to allow
+ * UF_REPARSE through, since we don't really have other
+ * facilities to handle reparse points and zfs_setattr()
+ * doesn't currently allow setting that attribute anyway.
+ */
+ if ((fflags & ~(SF_IMMUTABLE|SF_APPEND|SF_NOUNLINK|UF_ARCHIVE|
+ UF_NODUMP|UF_SYSTEM|UF_HIDDEN|UF_READONLY|UF_REPARSE|
+ UF_OFFLINE|UF_SPARSE)) != 0)
+ return (EOPNOTSUPP);
+ /*
+ * Unprivileged processes are not permitted to unset system
+ * flags, or modify flags if any system flags are set.
+ * Privileged non-jail processes may not modify system flags
+ * if securelevel > 0 and any existing system flags are set.
+ * Privileged jail processes behave like privileged non-jail
+ * processes if the PR_ALLOW_CHFLAGS permission bit is set;
+ * otherwise, they behave like unprivileged processes.
+ */
+ if (secpolicy_fs_owner(vp->v_mount, cred) == 0 ||
+ spl_priv_check_cred(cred, PRIV_VFS_SYSFLAGS) == 0) {
+ if (zflags &
+ (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) {
+ error = securelevel_gt(cred, 0);
+ if (error != 0)
+ return (error);
+ }
+ } else {
+ /*
+ * Callers may only modify the file flags on
+ * objects they have VADMIN rights for.
+ */
+ if ((error = VOP_ACCESS(vp, VADMIN, cred,
+ curthread)) != 0)
+ return (error);
+ if (zflags &
+ (ZFS_IMMUTABLE | ZFS_APPENDONLY |
+ ZFS_NOUNLINK)) {
+ return (EPERM);
+ }
+ if (fflags &
+ (SF_IMMUTABLE | SF_APPEND | SF_NOUNLINK)) {
+ return (EPERM);
+ }
+ }
+
+#define FLAG_CHANGE(fflag, zflag, xflag, xfield) do { \
+ if (((fflags & (fflag)) && !(zflags & (zflag))) || \
+ ((zflags & (zflag)) && !(fflags & (fflag)))) { \
+ XVA_SET_REQ(&xvap, (xflag)); \
+ (xfield) = ((fflags & (fflag)) != 0); \
+ } \
+} while (0)
+ /* Convert chflags into ZFS-type flags. */
+ /* XXX: what about SF_SETTABLE? */
+ FLAG_CHANGE(SF_IMMUTABLE, ZFS_IMMUTABLE, XAT_IMMUTABLE,
+ xvap.xva_xoptattrs.xoa_immutable);
+ FLAG_CHANGE(SF_APPEND, ZFS_APPENDONLY, XAT_APPENDONLY,
+ xvap.xva_xoptattrs.xoa_appendonly);
+ FLAG_CHANGE(SF_NOUNLINK, ZFS_NOUNLINK, XAT_NOUNLINK,
+ xvap.xva_xoptattrs.xoa_nounlink);
+ FLAG_CHANGE(UF_ARCHIVE, ZFS_ARCHIVE, XAT_ARCHIVE,
+ xvap.xva_xoptattrs.xoa_archive);
+ FLAG_CHANGE(UF_NODUMP, ZFS_NODUMP, XAT_NODUMP,
+ xvap.xva_xoptattrs.xoa_nodump);
+ FLAG_CHANGE(UF_READONLY, ZFS_READONLY, XAT_READONLY,
+ xvap.xva_xoptattrs.xoa_readonly);
+ FLAG_CHANGE(UF_SYSTEM, ZFS_SYSTEM, XAT_SYSTEM,
+ xvap.xva_xoptattrs.xoa_system);
+ FLAG_CHANGE(UF_HIDDEN, ZFS_HIDDEN, XAT_HIDDEN,
+ xvap.xva_xoptattrs.xoa_hidden);
+ FLAG_CHANGE(UF_REPARSE, ZFS_REPARSE, XAT_REPARSE,
+ xvap.xva_xoptattrs.xoa_reparse);
+ FLAG_CHANGE(UF_OFFLINE, ZFS_OFFLINE, XAT_OFFLINE,
+ xvap.xva_xoptattrs.xoa_offline);
+ FLAG_CHANGE(UF_SPARSE, ZFS_SPARSE, XAT_SPARSE,
+ xvap.xva_xoptattrs.xoa_sparse);
+#undef FLAG_CHANGE
+ }
+ if (vap->va_birthtime.tv_sec != VNOVAL) {
+ xvap.xva_vattr.va_mask |= AT_XVATTR;
+ XVA_SET_REQ(&xvap, XAT_CREATETIME);
+ }
+ return (zfs_setattr(VTOZ(vp), (vattr_t *)&xvap, 0, cred));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_rename_args {
+ struct vnode *a_fdvp;
+ struct vnode *a_fvp;
+ struct componentname *a_fcnp;
+ struct vnode *a_tdvp;
+ struct vnode *a_tvp;
+ struct componentname *a_tcnp;
+};
+#endif
+
+static int
+zfs_freebsd_rename(struct vop_rename_args *ap)
+{
+ vnode_t *fdvp = ap->a_fdvp;
+ vnode_t *fvp = ap->a_fvp;
+ vnode_t *tdvp = ap->a_tdvp;
+ vnode_t *tvp = ap->a_tvp;
+ int error;
+
+ ASSERT(ap->a_fcnp->cn_flags & (SAVENAME|SAVESTART));
+ ASSERT(ap->a_tcnp->cn_flags & (SAVENAME|SAVESTART));
+
+ error = zfs_rename_(fdvp, &fvp, ap->a_fcnp, tdvp, &tvp,
+ ap->a_tcnp, ap->a_fcnp->cn_cred, 1);
+
+ vrele(fdvp);
+ vrele(fvp);
+ vrele(tdvp);
+ if (tvp != NULL)
+ vrele(tvp);
+
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_symlink_args {
+ struct vnode *a_dvp;
+ struct vnode **a_vpp;
+ struct componentname *a_cnp;
+ struct vattr *a_vap;
+ char *a_target;
+};
+#endif
+
+static int
+zfs_freebsd_symlink(struct vop_symlink_args *ap)
+{
+ struct componentname *cnp = ap->a_cnp;
+ vattr_t *vap = ap->a_vap;
+ znode_t *zp = NULL;
+ char *symlink;
+ size_t symlink_len;
+ int rc;
+
+ ASSERT(cnp->cn_flags & SAVENAME);
+
+ vap->va_type = VLNK; /* FreeBSD: Syscall only sets va_mode. */
+ vattr_init_mask(vap);
+ *ap->a_vpp = NULL;
+
+ rc = zfs_symlink(VTOZ(ap->a_dvp), cnp->cn_nameptr, vap,
+ ap->a_target, &zp, cnp->cn_cred, 0 /* flags */);
+ if (rc == 0) {
+ *ap->a_vpp = ZTOV(zp);
+ ASSERT_VOP_ELOCKED(ZTOV(zp), __func__);
+ MPASS(zp->z_cached_symlink == NULL);
+ symlink_len = strlen(ap->a_target);
+ symlink = cache_symlink_alloc(symlink_len + 1, M_WAITOK);
+ if (symlink != NULL) {
+ memcpy(symlink, ap->a_target, symlink_len);
+ symlink[symlink_len] = '\0';
+ atomic_store_rel_ptr((uintptr_t *)&zp->z_cached_symlink,
+ (uintptr_t)symlink);
+ }
+ }
+ return (rc);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_readlink_args {
+ struct vnode *a_vp;
+ struct uio *a_uio;
+ struct ucred *a_cred;
+};
+#endif
+
+static int
+zfs_freebsd_readlink(struct vop_readlink_args *ap)
+{
+ zfs_uio_t uio;
+ znode_t *zp = VTOZ(ap->a_vp);
+ char *symlink, *base;
+ size_t symlink_len;
+ int error;
+ bool trycache;
+
+ zfs_uio_init(&uio, ap->a_uio);
+ trycache = false;
+ if (zfs_uio_segflg(&uio) == UIO_SYSSPACE &&
+ zfs_uio_iovcnt(&uio) == 1) {
+ base = zfs_uio_iovbase(&uio, 0);
+ symlink_len = zfs_uio_iovlen(&uio, 0);
+ trycache = true;
+ }
+ error = zfs_readlink(ap->a_vp, &uio, ap->a_cred, NULL);
+ if (atomic_load_ptr(&zp->z_cached_symlink) != NULL ||
+ error != 0 || !trycache) {
+ return (error);
+ }
+ symlink_len -= zfs_uio_resid(&uio);
+ symlink = cache_symlink_alloc(symlink_len + 1, M_WAITOK);
+ if (symlink != NULL) {
+ memcpy(symlink, base, symlink_len);
+ symlink[symlink_len] = '\0';
+ if (!atomic_cmpset_rel_ptr((uintptr_t *)&zp->z_cached_symlink,
+ (uintptr_t)NULL, (uintptr_t)symlink)) {
+ cache_symlink_free(symlink, symlink_len + 1);
+ }
+ }
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_link_args {
+ struct vnode *a_tdvp;
+ struct vnode *a_vp;
+ struct componentname *a_cnp;
+};
+#endif
+
+static int
+zfs_freebsd_link(struct vop_link_args *ap)
+{
+ struct componentname *cnp = ap->a_cnp;
+ vnode_t *vp = ap->a_vp;
+ vnode_t *tdvp = ap->a_tdvp;
+
+ if (tdvp->v_mount != vp->v_mount)
+ return (EXDEV);
+
+ ASSERT(cnp->cn_flags & SAVENAME);
+
+ return (zfs_link(VTOZ(tdvp), VTOZ(vp),
+ cnp->cn_nameptr, cnp->cn_cred, 0));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_inactive_args {
+ struct vnode *a_vp;
+ struct thread *a_td;
+};
+#endif
+
+static int
+zfs_freebsd_inactive(struct vop_inactive_args *ap)
+{
+ vnode_t *vp = ap->a_vp;
+
+#if __FreeBSD_version >= 1300123
+ zfs_inactive(vp, curthread->td_ucred, NULL);
+#else
+ zfs_inactive(vp, ap->a_td->td_ucred, NULL);
+#endif
+ return (0);
+}
+
+#if __FreeBSD_version >= 1300042
+#ifndef _SYS_SYSPROTO_H_
+struct vop_need_inactive_args {
+ struct vnode *a_vp;
+};
+#endif
+
+static int
+zfs_freebsd_need_inactive(struct vop_need_inactive_args *ap)
+{
+ vnode_t *vp = ap->a_vp;
+ znode_t *zp = VTOZ(vp);
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ int need;
+
+ if (vn_need_pageq_flush(vp))
+ return (1);
+
+ if (!ZFS_TEARDOWN_INACTIVE_TRY_ENTER_READ(zfsvfs))
+ return (1);
+ need = (zp->z_sa_hdl == NULL || zp->z_unlinked || zp->z_atime_dirty);
+ ZFS_TEARDOWN_INACTIVE_EXIT_READ(zfsvfs);
+
+ return (need);
+}
+#endif
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_reclaim_args {
+ struct vnode *a_vp;
+ struct thread *a_td;
+};
+#endif
+
+static int
+zfs_freebsd_reclaim(struct vop_reclaim_args *ap)
+{
+ vnode_t *vp = ap->a_vp;
+ znode_t *zp = VTOZ(vp);
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+
+ ASSERT(zp != NULL);
+
+#if __FreeBSD_version < 1300042
+ /* Destroy the vm object and flush associated pages. */
+ vnode_destroy_vobject(vp);
+#endif
+ /*
+ * z_teardown_inactive_lock protects from a race with
+ * zfs_znode_dmu_fini in zfsvfs_teardown during
+ * force unmount.
+ */
+ ZFS_TEARDOWN_INACTIVE_ENTER_READ(zfsvfs);
+ if (zp->z_sa_hdl == NULL)
+ zfs_znode_free(zp);
+ else
+ zfs_zinactive(zp);
+ ZFS_TEARDOWN_INACTIVE_EXIT_READ(zfsvfs);
+
+ vp->v_data = NULL;
+ return (0);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_fid_args {
+ struct vnode *a_vp;
+ struct fid *a_fid;
+};
+#endif
+
+static int
+zfs_freebsd_fid(struct vop_fid_args *ap)
+{
+
+ return (zfs_fid(ap->a_vp, (void *)ap->a_fid, NULL));
+}
+
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_pathconf_args {
+ struct vnode *a_vp;
+ int a_name;
+ register_t *a_retval;
+};
+#endif
+
+static int
+zfs_freebsd_pathconf(struct vop_pathconf_args *ap)
+{
+ ulong_t val;
+ int error;
+
+ error = zfs_pathconf(ap->a_vp, ap->a_name, &val,
+ curthread->td_ucred, NULL);
+ if (error == 0) {
+ *ap->a_retval = val;
+ return (error);
+ }
+ if (error != EOPNOTSUPP)
+ return (error);
+
+ switch (ap->a_name) {
+ case _PC_NAME_MAX:
+ *ap->a_retval = NAME_MAX;
+ return (0);
+ case _PC_PIPE_BUF:
+ if (ap->a_vp->v_type == VDIR || ap->a_vp->v_type == VFIFO) {
+ *ap->a_retval = PIPE_BUF;
+ return (0);
+ }
+ return (EINVAL);
+ default:
+ return (vop_stdpathconf(ap));
+ }
+}
+
+/*
+ * FreeBSD's extended attribute namespaces are mapped to file name prefixes
+ * for ZFS extended attribute names:
+ *
+ * NAMESPACE PREFIX
+ * system freebsd:system:
+ * user (none, can be used to access ZFS fsattr(5) attributes
+ * created on Solaris)
+ */
+static int
+zfs_create_attrname(int attrnamespace, const char *name, char *attrname,
+ size_t size)
+{
+ const char *namespace, *prefix, *suffix;
+
+ /* We don't allow the '/' character in attribute names. */
+ if (strchr(name, '/') != NULL)
+ return (EINVAL);
+ /* We don't allow attribute names that start with the "freebsd:" prefix. */
+ if (strncmp(name, "freebsd:", 8) == 0)
+ return (EINVAL);
+
+ bzero(attrname, size);
+
+ switch (attrnamespace) {
+ case EXTATTR_NAMESPACE_USER:
+#if 0
+ prefix = "freebsd:";
+ namespace = EXTATTR_NAMESPACE_USER_STRING;
+ suffix = ":";
+#else
+ /*
+ * This is the default namespace by which we can access all
+ * attributes created on Solaris.
+ */
+ prefix = namespace = suffix = "";
+#endif
+ break;
+ case EXTATTR_NAMESPACE_SYSTEM:
+ prefix = "freebsd:";
+ namespace = EXTATTR_NAMESPACE_SYSTEM_STRING;
+ suffix = ":";
+ break;
+ case EXTATTR_NAMESPACE_EMPTY:
+ default:
+ return (EINVAL);
+ }
+ if (snprintf(attrname, size, "%s%s%s%s", prefix, namespace, suffix,
+ name) >= size) {
+ return (ENAMETOOLONG);
+ }
+ return (0);
+}
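
An illustrative userland sketch of the namespace-to-prefix mapping implemented above (not part of the patch); attrname_demo() and its sample inputs are hypothetical, and only the prefix scheme and the '/' / "freebsd:" rejections mirror zfs_create_attrname().

        #include <stdio.h>
        #include <string.h>

        /* Hypothetical userland restatement of the prefix rules used above. */
        static int
        attrname_demo(int is_system, const char *name, char *out, size_t size)
        {
                /* "system" namespace gets "freebsd:system:", "user" gets none. */
                const char *prefix = is_system ? "freebsd:system:" : "";

                if (strchr(name, '/') != NULL ||
                    strncmp(name, "freebsd:", 8) == 0)
                        return (-1);
                if ((size_t)snprintf(out, size, "%s%s", prefix, name) >= size)
                        return (-1);
                return (0);
        }

        int
        main(void)
        {
                char buf[64];

                (void) attrname_demo(1, "md5", buf, sizeof (buf));
                printf("%s\n", buf);    /* freebsd:system:md5 */
                (void) attrname_demo(0, "md5", buf, sizeof (buf));
                printf("%s\n", buf);    /* md5 (Solaris-compatible, no prefix) */
                return (0);
        }
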
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_getextattr {
+ IN struct vnode *a_vp;
+ IN int a_attrnamespace;
+ IN const char *a_name;
+ INOUT struct uio *a_uio;
+ OUT size_t *a_size;
+ IN struct ucred *a_cred;
+ IN struct thread *a_td;
+};
+#endif
+
+/*
+ * Vnode operation to retrieve a named extended attribute.
+ */
+static int
+zfs_getextattr(struct vop_getextattr_args *ap)
+{
+ zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
+ struct thread *td = ap->a_td;
+ struct nameidata nd;
+ char attrname[255];
+ struct vattr va;
+ vnode_t *xvp = NULL, *vp;
+ int error, flags;
+
+ /*
+ * If the xattr property is off, refuse the request.
+ */
+ if (!(zfsvfs->z_flags & ZSB_XATTR)) {
+ return (SET_ERROR(EOPNOTSUPP));
+ }
+
+ error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
+ ap->a_cred, ap->a_td, VREAD);
+ if (error != 0)
+ return (error);
+
+ error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
+ sizeof (attrname));
+ if (error != 0)
+ return (error);
+
+ ZFS_ENTER(zfsvfs);
+
+ error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
+ LOOKUP_XATTR, B_FALSE);
+ if (error != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ flags = FREAD;
+ NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname,
+ xvp, td);
+ error = vn_open_cred(&nd, &flags, 0, VN_OPEN_INVFS, ap->a_cred, NULL);
+ vp = nd.ni_vp;
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ if (error != 0) {
+ ZFS_EXIT(zfsvfs);
+ if (error == ENOENT)
+ error = ENOATTR;
+ return (error);
+ }
+
+ if (ap->a_size != NULL) {
+ error = VOP_GETATTR(vp, &va, ap->a_cred);
+ if (error == 0)
+ *ap->a_size = (size_t)va.va_size;
+ } else if (ap->a_uio != NULL)
+ error = VOP_READ(vp, ap->a_uio, IO_UNIT, ap->a_cred);
+
+ VOP_UNLOCK1(vp);
+ vn_close(vp, flags, ap->a_cred, td);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_deleteextattr {
+ IN struct vnode *a_vp;
+ IN int a_attrnamespace;
+ IN const char *a_name;
+ IN struct ucred *a_cred;
+ IN struct thread *a_td;
+};
+#endif
+
+/*
+ * Vnode operation to remove a named attribute.
+ */
+static int
+zfs_deleteextattr(struct vop_deleteextattr_args *ap)
+{
+ zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
+ struct thread *td = ap->a_td;
+ struct nameidata nd;
+ char attrname[255];
+ vnode_t *xvp = NULL, *vp;
+ int error;
+
+ /*
+ * If the xattr property is off, refuse the request.
+ */
+ if (!(zfsvfs->z_flags & ZSB_XATTR)) {
+ return (SET_ERROR(EOPNOTSUPP));
+ }
+
+ error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
+ ap->a_cred, ap->a_td, VWRITE);
+ if (error != 0)
+ return (error);
+
+ error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
+ sizeof (attrname));
+ if (error != 0)
+ return (error);
+
+ ZFS_ENTER(zfsvfs);
+
+ error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
+ LOOKUP_XATTR, B_FALSE);
+ if (error != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ NDINIT_ATVP(&nd, DELETE, NOFOLLOW | LOCKPARENT | LOCKLEAF,
+ UIO_SYSSPACE, attrname, xvp, td);
+ error = namei(&nd);
+ vp = nd.ni_vp;
+ if (error != 0) {
+ ZFS_EXIT(zfsvfs);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ if (error == ENOENT)
+ error = ENOATTR;
+ return (error);
+ }
+
+ error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+
+ vput(nd.ni_dvp);
+ if (vp == nd.ni_dvp)
+ vrele(vp);
+ else
+ vput(vp);
+ ZFS_EXIT(zfsvfs);
+
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_setextattr {
+ IN struct vnode *a_vp;
+ IN int a_attrnamespace;
+ IN const char *a_name;
+ INOUT struct uio *a_uio;
+ IN struct ucred *a_cred;
+ IN struct thread *a_td;
+};
+#endif
+
+/*
+ * Vnode operation to set a named attribute.
+ */
+static int
+zfs_setextattr(struct vop_setextattr_args *ap)
+{
+ zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
+ struct thread *td = ap->a_td;
+ struct nameidata nd;
+ char attrname[255];
+ struct vattr va;
+ vnode_t *xvp = NULL, *vp;
+ int error, flags;
+
+ /*
+ * If the xattr property is off, refuse the request.
+ */
+ if (!(zfsvfs->z_flags & ZSB_XATTR)) {
+ return (SET_ERROR(EOPNOTSUPP));
+ }
+
+ error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
+ ap->a_cred, ap->a_td, VWRITE);
+ if (error != 0)
+ return (error);
+ error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
+ sizeof (attrname));
+ if (error != 0)
+ return (error);
+
+ ZFS_ENTER(zfsvfs);
+
+ error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
+ LOOKUP_XATTR | CREATE_XATTR_DIR, B_FALSE);
+ if (error != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ flags = FFLAGS(O_WRONLY | O_CREAT);
+ NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname,
+ xvp, td);
+ error = vn_open_cred(&nd, &flags, 0600, VN_OPEN_INVFS, ap->a_cred,
+ NULL);
+ vp = nd.ni_vp;
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ if (error != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ VATTR_NULL(&va);
+ va.va_size = 0;
+ error = VOP_SETATTR(vp, &va, ap->a_cred);
+ if (error == 0)
+ error = VOP_WRITE(vp, ap->a_uio, IO_UNIT, ap->a_cred);
+
+ VOP_UNLOCK1(vp);
+ vn_close(vp, flags, ap->a_cred, td);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_listextattr {
+ IN struct vnode *a_vp;
+ IN int a_attrnamespace;
+ INOUT struct uio *a_uio;
+ OUT size_t *a_size;
+ IN struct ucred *a_cred;
+ IN struct thread *a_td;
+};
+#endif
+
+/*
+ * Vnode operation to retrieve extended attributes on a vnode.
+ */
+static int
+zfs_listextattr(struct vop_listextattr_args *ap)
+{
+ zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
+ struct thread *td = ap->a_td;
+ struct nameidata nd;
+ char attrprefix[16];
+ uint8_t dirbuf[sizeof (struct dirent)];
+ struct dirent *dp;
+ struct iovec aiov;
+ struct uio auio;
+ size_t *sizep = ap->a_size;
+ size_t plen;
+ vnode_t *xvp = NULL, *vp;
+ int done, error, eof, pos;
+ zfs_uio_t uio;
+
+ zfs_uio_init(&uio, ap->a_uio);
+
+ /*
+ * If the xattr property is off, refuse the request.
+ */
+ if (!(zfsvfs->z_flags & ZSB_XATTR)) {
+ return (SET_ERROR(EOPNOTSUPP));
+ }
+
+ error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
+ ap->a_cred, ap->a_td, VREAD);
+ if (error != 0)
+ return (error);
+
+ error = zfs_create_attrname(ap->a_attrnamespace, "", attrprefix,
+ sizeof (attrprefix));
+ if (error != 0)
+ return (error);
+ plen = strlen(attrprefix);
+
+ ZFS_ENTER(zfsvfs);
+
+ if (sizep != NULL)
+ *sizep = 0;
+
+ error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
+ LOOKUP_XATTR, B_FALSE);
+ if (error != 0) {
+ ZFS_EXIT(zfsvfs);
+ /*
+ * ENOATTR means that the EA directory does not yet exist,
+ * i.e. there are no extended attributes there.
+ */
+ if (error == ENOATTR)
+ error = 0;
+ return (error);
+ }
+
+ NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | LOCKSHARED,
+ UIO_SYSSPACE, ".", xvp, td);
+ error = namei(&nd);
+ vp = nd.ni_vp;
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ if (error != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_segflg = UIO_SYSSPACE;
+ auio.uio_td = td;
+ auio.uio_rw = UIO_READ;
+ auio.uio_offset = 0;
+
+ do {
+ uint8_t nlen;
+
+ aiov.iov_base = (void *)dirbuf;
+ aiov.iov_len = sizeof (dirbuf);
+ auio.uio_resid = sizeof (dirbuf);
+ error = VOP_READDIR(vp, &auio, ap->a_cred, &eof, NULL, NULL);
+ done = sizeof (dirbuf) - auio.uio_resid;
+ if (error != 0)
+ break;
+ for (pos = 0; pos < done; ) {
+ dp = (struct dirent *)(dirbuf + pos);
+ pos += dp->d_reclen;
+ /*
+ * XXX: Temporarily we also accept DT_UNKNOWN, as this
+ * is what we get when an attribute was created on Solaris.
+ */
+ if (dp->d_type != DT_REG && dp->d_type != DT_UNKNOWN)
+ continue;
+ if (plen == 0 &&
+ strncmp(dp->d_name, "freebsd:", 8) == 0)
+ continue;
+ else if (strncmp(dp->d_name, attrprefix, plen) != 0)
+ continue;
+ nlen = dp->d_namlen - plen;
+ if (sizep != NULL)
+ *sizep += 1 + nlen;
+ else if (GET_UIO_STRUCT(&uio) != NULL) {
+ /*
+ * Format of extattr name entry is one byte for
+ * length and the rest for name.
+ */
+ error = zfs_uiomove(&nlen, 1, zfs_uio_rw(&uio),
+ &uio);
+ if (error == 0) {
+ error = zfs_uiomove(dp->d_name + plen,
+ nlen, zfs_uio_rw(&uio), &uio);
+ }
+ if (error != 0)
+ break;
+ }
+ }
+ } while (!eof && error == 0);
+
+ vput(vp);
+ ZFS_EXIT(zfsvfs);
+
+ return (error);
+}
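
The one-byte-length-plus-name record format produced by the loop above can be decoded with this standalone sketch; demo_walk_list() and the sample buffer are invented for illustration only.

        #include <stdint.h>
        #include <stdio.h>

        /* Walk a length-prefixed extattr list: each entry is one length byte
         * followed by that many name bytes, with no NUL terminator. */
        static void
        demo_walk_list(const uint8_t *buf, size_t len)
        {
                size_t pos = 0;

                while (pos < len) {
                        uint8_t nlen = buf[pos++];

                        printf("%.*s\n", (int)nlen, (const char *)&buf[pos]);
                        pos += nlen;
                }
        }

        int
        main(void)
        {
                /* Two attributes: "md5" and "comment". */
                const uint8_t buf[] = {
                        3, 'm', 'd', '5',
                        7, 'c', 'o', 'm', 'm', 'e', 'n', 't'
                };

                demo_walk_list(buf, sizeof (buf));
                return (0);
        }
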
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_getacl_args {
+ struct vnode *a_vp;
+ acl_type_t a_type;
+ struct acl *a_aclp;
+ struct ucred *a_cred;
+ struct thread *a_td;
+};
+#endif
+
+static int
+zfs_freebsd_getacl(struct vop_getacl_args *ap)
+{
+ int error;
+ vsecattr_t vsecattr;
+
+ if (ap->a_type != ACL_TYPE_NFS4)
+ return (EINVAL);
+
+ vsecattr.vsa_mask = VSA_ACE | VSA_ACECNT;
+ if ((error = zfs_getsecattr(VTOZ(ap->a_vp),
+ &vsecattr, 0, ap->a_cred)))
+ return (error);
+
+ error = acl_from_aces(ap->a_aclp, vsecattr.vsa_aclentp,
+ vsecattr.vsa_aclcnt);
+ if (vsecattr.vsa_aclentp != NULL)
+ kmem_free(vsecattr.vsa_aclentp, vsecattr.vsa_aclentsz);
+
+ return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_setacl_args {
+ struct vnode *a_vp;
+ acl_type_t a_type;
+ struct acl *a_aclp;
+ struct ucred *a_cred;
+ struct thread *a_td;
+};
+#endif
+
+static int
+zfs_freebsd_setacl(struct vop_setacl_args *ap)
+{
+ int error;
+ vsecattr_t vsecattr;
+ int aclbsize; /* size of acl list in bytes */
+ aclent_t *aaclp;
+
+ if (ap->a_type != ACL_TYPE_NFS4)
+ return (EINVAL);
+
+ if (ap->a_aclp == NULL)
+ return (EINVAL);
+
+ if (ap->a_aclp->acl_cnt < 1 || ap->a_aclp->acl_cnt > MAX_ACL_ENTRIES)
+ return (EINVAL);
+
+ /*
+ * With NFSv4 ACLs, chmod(2) may need to add additional entries,
+ * splitting every entry into two and appending "canonical six"
+ * entries at the end. Don't allow for setting an ACL that would
+ * cause chmod(2) to run out of ACL entries.
+ */
+ if (ap->a_aclp->acl_cnt * 2 + 6 > ACL_MAX_ENTRIES)
+ return (ENOSPC);
+
+ error = acl_nfs4_check(ap->a_aclp, ap->a_vp->v_type == VDIR);
+ if (error != 0)
+ return (error);
+
+ vsecattr.vsa_mask = VSA_ACE;
+ aclbsize = ap->a_aclp->acl_cnt * sizeof (ace_t);
+ vsecattr.vsa_aclentp = kmem_alloc(aclbsize, KM_SLEEP);
+ aaclp = vsecattr.vsa_aclentp;
+ vsecattr.vsa_aclentsz = aclbsize;
+
+ aces_from_acl(vsecattr.vsa_aclentp, &vsecattr.vsa_aclcnt, ap->a_aclp);
+ error = zfs_setsecattr(VTOZ(ap->a_vp), &vsecattr, 0, ap->a_cred);
+ kmem_free(aaclp, aclbsize);
+
+ return (error);
+}
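
A worked example of the entry-count guard in zfs_freebsd_setacl() above, assuming an ACL_MAX_ENTRIES of 254 purely for illustration; demo_fits_after_chmod() is hypothetical and only mirrors the acl_cnt * 2 + 6 bound.

        #include <stdio.h>

        /* Worst case after chmod(2): every entry may split in two and the
         * "canonical six" entries are appended, i.e. acl_cnt * 2 + 6 total. */
        static int
        demo_fits_after_chmod(int acl_cnt, int acl_max_entries)
        {
                return (acl_cnt * 2 + 6 <= acl_max_entries);
        }

        int
        main(void)
        {
                printf("%d\n", demo_fits_after_chmod(124, 254)); /* 1: fits */
                printf("%d\n", demo_fits_after_chmod(125, 254)); /* 0: ENOSPC */
                return (0);
        }
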
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_aclcheck_args {
+ struct vnode *a_vp;
+ acl_type_t a_type;
+ struct acl *a_aclp;
+ struct ucred *a_cred;
+ struct thread *a_td;
+};
+#endif
+
+static int
+zfs_freebsd_aclcheck(struct vop_aclcheck_args *ap)
+{
+
+ return (EOPNOTSUPP);
+}
+
+static int
+zfs_vptocnp(struct vop_vptocnp_args *ap)
+{
+ vnode_t *covered_vp;
+ vnode_t *vp = ap->a_vp;
+ zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
+ znode_t *zp = VTOZ(vp);
+ int ltype;
+ int error;
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ /*
+ * If we are a snapshot mounted under .zfs, run the operation
+ * on the covered vnode.
+ */
+ if (zp->z_id != zfsvfs->z_root || zfsvfs->z_parent == zfsvfs) {
+ char name[MAXNAMLEN + 1];
+ znode_t *dzp;
+ size_t len;
+
+ error = zfs_znode_parent_and_name(zp, &dzp, name);
+ if (error == 0) {
+ len = strlen(name);
+ if (*ap->a_buflen < len)
+ error = SET_ERROR(ENOMEM);
+ }
+ if (error == 0) {
+ *ap->a_buflen -= len;
+ bcopy(name, ap->a_buf + *ap->a_buflen, len);
+ *ap->a_vpp = ZTOV(dzp);
+ }
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+ ZFS_EXIT(zfsvfs);
+
+ covered_vp = vp->v_mount->mnt_vnodecovered;
+#if __FreeBSD_version >= 1300045
+ enum vgetstate vs = vget_prep(covered_vp);
+#else
+ vhold(covered_vp);
+#endif
+ ltype = VOP_ISLOCKED(vp);
+ VOP_UNLOCK1(vp);
+#if __FreeBSD_version >= 1300045
+ error = vget_finish(covered_vp, LK_SHARED, vs);
+#else
+ error = vget(covered_vp, LK_SHARED | LK_VNHELD, curthread);
+#endif
+ if (error == 0) {
+#if __FreeBSD_version >= 1300123
+ error = VOP_VPTOCNP(covered_vp, ap->a_vpp, ap->a_buf,
+ ap->a_buflen);
+#else
+ error = VOP_VPTOCNP(covered_vp, ap->a_vpp, ap->a_cred,
+ ap->a_buf, ap->a_buflen);
+#endif
+ vput(covered_vp);
+ }
+ vn_lock(vp, ltype | LK_RETRY);
+ if (VN_IS_DOOMED(vp))
+ error = SET_ERROR(ENOENT);
+ return (error);
+}
+
+struct vop_vector zfs_vnodeops;
+struct vop_vector zfs_fifoops;
+struct vop_vector zfs_shareops;
+
+struct vop_vector zfs_vnodeops = {
+ .vop_default = &default_vnodeops,
+ .vop_inactive = zfs_freebsd_inactive,
+#if __FreeBSD_version >= 1300042
+ .vop_need_inactive = zfs_freebsd_need_inactive,
+#endif
+ .vop_reclaim = zfs_freebsd_reclaim,
+#if __FreeBSD_version >= 1300102
+ .vop_fplookup_vexec = zfs_freebsd_fplookup_vexec,
+#endif
+ .vop_fplookup_symlink = zfs_freebsd_fplookup_symlink,
+ .vop_access = zfs_freebsd_access,
+ .vop_allocate = VOP_EINVAL,
+ .vop_lookup = zfs_cache_lookup,
+ .vop_cachedlookup = zfs_freebsd_cachedlookup,
+ .vop_getattr = zfs_freebsd_getattr,
+ .vop_setattr = zfs_freebsd_setattr,
+ .vop_create = zfs_freebsd_create,
+ .vop_mknod = (vop_mknod_t *)zfs_freebsd_create,
+ .vop_mkdir = zfs_freebsd_mkdir,
+ .vop_readdir = zfs_freebsd_readdir,
+ .vop_fsync = zfs_freebsd_fsync,
+ .vop_open = zfs_freebsd_open,
+ .vop_close = zfs_freebsd_close,
+ .vop_rmdir = zfs_freebsd_rmdir,
+ .vop_ioctl = zfs_freebsd_ioctl,
+ .vop_link = zfs_freebsd_link,
+ .vop_symlink = zfs_freebsd_symlink,
+ .vop_readlink = zfs_freebsd_readlink,
+ .vop_read = zfs_freebsd_read,
+ .vop_write = zfs_freebsd_write,
+ .vop_remove = zfs_freebsd_remove,
+ .vop_rename = zfs_freebsd_rename,
+ .vop_pathconf = zfs_freebsd_pathconf,
+ .vop_bmap = zfs_freebsd_bmap,
+ .vop_fid = zfs_freebsd_fid,
+ .vop_getextattr = zfs_getextattr,
+ .vop_deleteextattr = zfs_deleteextattr,
+ .vop_setextattr = zfs_setextattr,
+ .vop_listextattr = zfs_listextattr,
+ .vop_getacl = zfs_freebsd_getacl,
+ .vop_setacl = zfs_freebsd_setacl,
+ .vop_aclcheck = zfs_freebsd_aclcheck,
+ .vop_getpages = zfs_freebsd_getpages,
+ .vop_putpages = zfs_freebsd_putpages,
+ .vop_vptocnp = zfs_vptocnp,
+#if __FreeBSD_version >= 1300064
+ .vop_lock1 = vop_lock,
+ .vop_unlock = vop_unlock,
+ .vop_islocked = vop_islocked,
+#endif
+};
+VFS_VOP_VECTOR_REGISTER(zfs_vnodeops);
+
+struct vop_vector zfs_fifoops = {
+ .vop_default = &fifo_specops,
+ .vop_fsync = zfs_freebsd_fsync,
+#if __FreeBSD_version >= 1300102
+ .vop_fplookup_vexec = zfs_freebsd_fplookup_vexec,
+#endif
+ .vop_fplookup_symlink = zfs_freebsd_fplookup_symlink,
+ .vop_access = zfs_freebsd_access,
+ .vop_getattr = zfs_freebsd_getattr,
+ .vop_inactive = zfs_freebsd_inactive,
+ .vop_read = VOP_PANIC,
+ .vop_reclaim = zfs_freebsd_reclaim,
+ .vop_setattr = zfs_freebsd_setattr,
+ .vop_write = VOP_PANIC,
+ .vop_pathconf = zfs_freebsd_pathconf,
+ .vop_fid = zfs_freebsd_fid,
+ .vop_getacl = zfs_freebsd_getacl,
+ .vop_setacl = zfs_freebsd_setacl,
+ .vop_aclcheck = zfs_freebsd_aclcheck,
+};
+VFS_VOP_VECTOR_REGISTER(zfs_fifoops);
+
+/*
+ * Special share hidden files vnode operations template.
+ */
+struct vop_vector zfs_shareops = {
+ .vop_default = &default_vnodeops,
+#if __FreeBSD_version >= 1300121
+ .vop_fplookup_vexec = VOP_EAGAIN,
+#endif
+ .vop_fplookup_symlink = VOP_EAGAIN,
+ .vop_access = zfs_freebsd_access,
+ .vop_inactive = zfs_freebsd_inactive,
+ .vop_reclaim = zfs_freebsd_reclaim,
+ .vop_fid = zfs_freebsd_fid,
+ .vop_pathconf = zfs_freebsd_pathconf,
+};
+VFS_VOP_VECTOR_REGISTER(zfs_shareops);
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_znode.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_znode.c
new file mode 100644
index 000000000000..0491b2ff3e28
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_znode.c
@@ -0,0 +1,2067 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
+ */
+
+/* Portions Copyright 2007 Jeremy Teo */
+/* Portions Copyright 2011 Martin Matuska <mm@FreeBSD.org> */
+
+#ifdef _KERNEL
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/time.h>
+#include <sys/systm.h>
+#include <sys/sysmacros.h>
+#include <sys/resource.h>
+#include <sys/mntent.h>
+#include <sys/u8_textprep.h>
+#include <sys/dsl_dataset.h>
+#include <sys/vfs.h>
+#include <sys/vnode.h>
+#include <sys/file.h>
+#include <sys/kmem.h>
+#include <sys/errno.h>
+#include <sys/unistd.h>
+#include <sys/atomic.h>
+#include <sys/zfs_dir.h>
+#include <sys/zfs_acl.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zfs_rlock.h>
+#include <sys/zfs_fuid.h>
+#include <sys/dnode.h>
+#include <sys/fs/zfs.h>
+#endif /* _KERNEL */
+
+#include <sys/dmu.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_tx.h>
+#include <sys/zfs_refcount.h>
+#include <sys/stat.h>
+#include <sys/zap.h>
+#include <sys/zfs_znode.h>
+#include <sys/sa.h>
+#include <sys/zfs_sa.h>
+#include <sys/zfs_stat.h>
+
+#include "zfs_prop.h"
+#include "zfs_comutil.h"
+
+/* Used by fstat(1). */
+SYSCTL_INT(_debug_sizeof, OID_AUTO, znode, CTLFLAG_RD,
+ SYSCTL_NULL_INT_PTR, sizeof (znode_t), "sizeof(znode_t)");
+
+/*
+ * Define ZNODE_STATS to turn on statistic gathering. By default, it is only
+ * turned on when ZFS_DEBUG is also defined.
+ */
+#ifdef ZFS_DEBUG
+#define ZNODE_STATS
+#endif /* ZFS_DEBUG */
+
+#ifdef ZNODE_STATS
+#define ZNODE_STAT_ADD(stat) ((stat)++)
+#else
+#define ZNODE_STAT_ADD(stat) /* nothing */
+#endif /* ZNODE_STATS */
+
+/*
+ * Functions needed for userland (i.e. libzpool) are not put under
+ * #ifdef _KERNEL; the rest of the functions have dependencies
+ * (such as VFS logic) that will not compile easily in userland.
+ */
+#ifdef _KERNEL
+#if !defined(KMEM_DEBUG) && __FreeBSD_version >= 1300102
+#define _ZFS_USE_SMR
+static uma_zone_t znode_uma_zone;
+#else
+static kmem_cache_t *znode_cache = NULL;
+#endif
+
+extern struct vop_vector zfs_vnodeops;
+extern struct vop_vector zfs_fifoops;
+extern struct vop_vector zfs_shareops;
+
+
+/*
+ * This callback is invoked when acquiring a RL_WRITER or RL_APPEND lock on
+ * z_rangelock. It will modify the offset and length of the lock to reflect
+ * znode-specific information, and convert RL_APPEND to RL_WRITER. This is
+ * called with the rangelock_t's rl_lock held, which avoids races.
+ */
+static void
+zfs_rangelock_cb(zfs_locked_range_t *new, void *arg)
+{
+ znode_t *zp = arg;
+
+ /*
+ * If in append mode, convert to writer and lock starting at the
+ * current end of file.
+ */
+ if (new->lr_type == RL_APPEND) {
+ new->lr_offset = zp->z_size;
+ new->lr_type = RL_WRITER;
+ }
+
+ /*
+ * If we need to grow the block size then lock the whole file range.
+ */
+ uint64_t end_size = MAX(zp->z_size, new->lr_offset + new->lr_length);
+ if (end_size > zp->z_blksz && (!ISP2(zp->z_blksz) ||
+ zp->z_blksz < ZTOZSB(zp)->z_max_blksz)) {
+ new->lr_offset = 0;
+ new->lr_length = UINT64_MAX;
+ }
+}
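
A minimal sketch of the RL_APPEND-to-RL_WRITER conversion performed by zfs_rangelock_cb() above, using hypothetical stand-in types (demo_lr, demo_rl_type); the block-size growth case handled at the end of the callback is intentionally omitted.

        #include <stdint.h>
        #include <stdio.h>

        enum demo_rl_type { DEMO_RL_READER, DEMO_RL_WRITER, DEMO_RL_APPEND };

        struct demo_lr {
                uint64_t lr_offset;
                uint64_t lr_length;
                enum demo_rl_type lr_type;
        };

        /* Append locks become writer locks starting at the current EOF. */
        static void
        demo_rangelock_cb(struct demo_lr *lr, uint64_t file_size)
        {
                if (lr->lr_type == DEMO_RL_APPEND) {
                        lr->lr_offset = file_size;
                        lr->lr_type = DEMO_RL_WRITER;
                }
        }

        int
        main(void)
        {
                struct demo_lr lr = { 0, 4096, DEMO_RL_APPEND };

                demo_rangelock_cb(&lr, 1048576);        /* file is 1 MiB long */
                printf("offset=%llu writer=%d\n",
                    (unsigned long long)lr.lr_offset,
                    lr.lr_type == DEMO_RL_WRITER);
                return (0);
        }
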
+
+static int
+zfs_znode_cache_constructor(void *buf, void *arg, int kmflags)
+{
+ znode_t *zp = buf;
+
+ POINTER_INVALIDATE(&zp->z_zfsvfs);
+
+ list_link_init(&zp->z_link_node);
+
+ mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL);
+
+ zfs_rangelock_init(&zp->z_rangelock, zfs_rangelock_cb, zp);
+
+ zp->z_acl_cached = NULL;
+ zp->z_vnode = NULL;
+ return (0);
+}
+
+/*ARGSUSED*/
+static void
+zfs_znode_cache_destructor(void *buf, void *arg)
+{
+ znode_t *zp = buf;
+
+ ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs));
+ ASSERT3P(zp->z_vnode, ==, NULL);
+ ASSERT(!list_link_active(&zp->z_link_node));
+ mutex_destroy(&zp->z_acl_lock);
+ zfs_rangelock_fini(&zp->z_rangelock);
+
+ ASSERT(zp->z_acl_cached == NULL);
+}
+
+
+#ifdef _ZFS_USE_SMR
+VFS_SMR_DECLARE;
+
+static int
+zfs_znode_cache_constructor_smr(void *mem, int size __unused, void *private,
+ int flags)
+{
+
+ return (zfs_znode_cache_constructor(mem, private, flags));
+}
+
+static void
+zfs_znode_cache_destructor_smr(void *mem, int size __unused, void *private)
+{
+
+ zfs_znode_cache_destructor(mem, private);
+}
+
+void
+zfs_znode_init(void)
+{
+ /*
+ * Initialize zcache
+ */
+ ASSERT(znode_uma_zone == NULL);
+ znode_uma_zone = uma_zcreate("zfs_znode_cache",
+ sizeof (znode_t), zfs_znode_cache_constructor_smr,
+ zfs_znode_cache_destructor_smr, NULL, NULL, 0, 0);
+ VFS_SMR_ZONE_SET(znode_uma_zone);
+}
+
+static znode_t *
+zfs_znode_alloc_kmem(int flags)
+{
+
+ return (uma_zalloc_smr(znode_uma_zone, flags));
+}
+
+static void
+zfs_znode_free_kmem(znode_t *zp)
+{
+
+ uma_zfree_smr(znode_uma_zone, zp);
+}
+#else
+void
+zfs_znode_init(void)
+{
+ /*
+ * Initialize zcache
+ */
+ ASSERT(znode_cache == NULL);
+ znode_cache = kmem_cache_create("zfs_znode_cache",
+ sizeof (znode_t), 0, zfs_znode_cache_constructor,
+ zfs_znode_cache_destructor, NULL, NULL, NULL, 0);
+}
+
+static znode_t *
+zfs_znode_alloc_kmem(int flags)
+{
+
+ return (kmem_cache_alloc(znode_cache, flags));
+}
+
+static void
+zfs_znode_free_kmem(znode_t *zp)
+{
+
+ kmem_cache_free(znode_cache, zp);
+}
+#endif
+
+void
+zfs_znode_fini(void)
+{
+ /*
+ * Cleanup zcache
+ */
+#ifdef _ZFS_USE_SMR
+ if (znode_uma_zone) {
+ uma_zdestroy(znode_uma_zone);
+ znode_uma_zone = NULL;
+ }
+#else
+ if (znode_cache) {
+ kmem_cache_destroy(znode_cache);
+ znode_cache = NULL;
+ }
+#endif
+}
+
+
+static int
+zfs_create_share_dir(zfsvfs_t *zfsvfs, dmu_tx_t *tx)
+{
+ zfs_acl_ids_t acl_ids;
+ vattr_t vattr;
+ znode_t *sharezp;
+ znode_t *zp;
+ int error;
+
+ vattr.va_mask = AT_MODE|AT_UID|AT_GID;
+ vattr.va_type = VDIR;
+ vattr.va_mode = S_IFDIR|0555;
+ vattr.va_uid = crgetuid(kcred);
+ vattr.va_gid = crgetgid(kcred);
+
+ sharezp = zfs_znode_alloc_kmem(KM_SLEEP);
+ ASSERT(!POINTER_IS_VALID(sharezp->z_zfsvfs));
+ sharezp->z_unlinked = 0;
+ sharezp->z_atime_dirty = 0;
+ sharezp->z_zfsvfs = zfsvfs;
+ sharezp->z_is_sa = zfsvfs->z_use_sa;
+
+ VERIFY(0 == zfs_acl_ids_create(sharezp, IS_ROOT_NODE, &vattr,
+ kcred, NULL, &acl_ids));
+ zfs_mknode(sharezp, &vattr, tx, kcred, IS_ROOT_NODE, &zp, &acl_ids);
+ ASSERT3P(zp, ==, sharezp);
+ POINTER_INVALIDATE(&sharezp->z_zfsvfs);
+ error = zap_add(zfsvfs->z_os, MASTER_NODE_OBJ,
+ ZFS_SHARES_DIR, 8, 1, &sharezp->z_id, tx);
+ zfsvfs->z_shares_dir = sharezp->z_id;
+
+ zfs_acl_ids_free(&acl_ids);
+ sa_handle_destroy(sharezp->z_sa_hdl);
+ zfs_znode_free_kmem(sharezp);
+
+ return (error);
+}
+
+/*
+ * Define a couple of values we need available
+ * for both 64-bit and 32-bit environments.
+ */
+#ifndef NBITSMINOR64
+#define NBITSMINOR64 32
+#endif
+#ifndef MAXMAJ64
+#define MAXMAJ64 0xffffffffUL
+#endif
+#ifndef MAXMIN64
+#define MAXMIN64 0xffffffffUL
+#endif
+
+/*
+ * Create special expldev for ZFS private use.
+ * Can't use standard expldev since it doesn't do
+ * what we want. The standard expldev() takes a
+ * dev32_t in LP64 and expands it to a long dev_t.
+ * We need an interface that takes a dev32_t in ILP32
+ * and expands it to a long dev_t.
+ */
+static uint64_t
+zfs_expldev(dev_t dev)
+{
+ return (((uint64_t)major(dev) << NBITSMINOR64) | minor(dev));
+}
+/*
+ * Special cmpldev for ZFS private use.
+ * Can't use standard cmpldev since it takes
+ * a long dev_t and compresses it to dev32_t in
+ * LP64. We need to do a compaction of a long dev_t
+ * to a dev32_t in ILP32.
+ */
+dev_t
+zfs_cmpldev(uint64_t dev)
+{
+ return (makedev((dev >> NBITSMINOR64), (dev & MAXMIN64)));
+}
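
The 32/32-bit split used by zfs_expldev() and zfs_cmpldev() above can be checked with this standalone sketch; demo_expldev() is a hypothetical restatement, not the kernel routine.

        #include <stdint.h>
        #include <stdio.h>

        #define DEMO_NBITSMINOR64       32      /* same split as NBITSMINOR64 */

        /* Pack a 32-bit major/minor pair into the 64-bit on-disk rdev layout. */
        static uint64_t
        demo_expldev(uint32_t maj, uint32_t min)
        {
                return (((uint64_t)maj << DEMO_NBITSMINOR64) | min);
        }

        int
        main(void)
        {
                uint64_t rdev = demo_expldev(13, 5);

                /* Major lives in the high 32 bits, minor in the low 32 bits. */
                printf("rdev=0x%016llx major=%llu minor=%llu\n",
                    (unsigned long long)rdev,
                    (unsigned long long)(rdev >> DEMO_NBITSMINOR64),
                    (unsigned long long)(rdev & 0xffffffffULL));
                return (0);
        }
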
+
+static void
+zfs_znode_sa_init(zfsvfs_t *zfsvfs, znode_t *zp,
+ dmu_buf_t *db, dmu_object_type_t obj_type, sa_handle_t *sa_hdl)
+{
+ ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs) || (zfsvfs == zp->z_zfsvfs));
+ ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zfsvfs, zp->z_id)));
+
+ ASSERT(zp->z_sa_hdl == NULL);
+ ASSERT(zp->z_acl_cached == NULL);
+ if (sa_hdl == NULL) {
+ VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, zp,
+ SA_HDL_SHARED, &zp->z_sa_hdl));
+ } else {
+ zp->z_sa_hdl = sa_hdl;
+ sa_set_userp(sa_hdl, zp);
+ }
+
+ zp->z_is_sa = (obj_type == DMU_OT_SA) ? B_TRUE : B_FALSE;
+
+ /*
+ * Slap on VROOT if we are the root znode unless we are the root
+ * node of a snapshot mounted under .zfs.
+ */
+ if (zp->z_id == zfsvfs->z_root && zfsvfs->z_parent == zfsvfs)
+ ZTOV(zp)->v_flag |= VROOT;
+
+ vn_exists(ZTOV(zp));
+}
+
+void
+zfs_znode_dmu_fini(znode_t *zp)
+{
+ ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zp->z_zfsvfs, zp->z_id)) ||
+ zp->z_unlinked ||
+ ZFS_TEARDOWN_INACTIVE_WRITE_HELD(zp->z_zfsvfs));
+
+ sa_handle_destroy(zp->z_sa_hdl);
+ zp->z_sa_hdl = NULL;
+}
+
+static void
+zfs_vnode_forget(vnode_t *vp)
+{
+
+ /* copied from insmntque_stddtr */
+ vp->v_data = NULL;
+ vp->v_op = &dead_vnodeops;
+ vgone(vp);
+ vput(vp);
+}
+
+/*
+ * Construct a new znode/vnode and initialize.
+ *
+ * This does not do a call to dmu_set_user() that is
+ * up to the caller to do, in case you don't want to
+ * return the znode
+ */
+static znode_t *
+zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz,
+ dmu_object_type_t obj_type, sa_handle_t *hdl)
+{
+ znode_t *zp;
+ vnode_t *vp;
+ uint64_t mode;
+ uint64_t parent;
+#ifdef notyet
+ uint64_t mtime[2], ctime[2];
+#endif
+ uint64_t projid = ZFS_DEFAULT_PROJID;
+ sa_bulk_attr_t bulk[9];
+ int count = 0;
+ int error;
+
+ zp = zfs_znode_alloc_kmem(KM_SLEEP);
+
+#ifndef _ZFS_USE_SMR
+ KASSERT((zfsvfs->z_parent->z_vfs->mnt_kern_flag & MNTK_FPLOOKUP) == 0,
+ ("%s: fast path lookup enabled without smr", __func__));
+#endif
+
+#if __FreeBSD_version >= 1300076
+ KASSERT(curthread->td_vp_reserved != NULL,
+ ("zfs_znode_alloc: getnewvnode without any vnodes reserved"));
+#else
+ KASSERT(curthread->td_vp_reserv > 0,
+ ("zfs_znode_alloc: getnewvnode without any vnodes reserved"));
+#endif
+ error = getnewvnode("zfs", zfsvfs->z_parent->z_vfs, &zfs_vnodeops, &vp);
+ if (error != 0) {
+ zfs_znode_free_kmem(zp);
+ return (NULL);
+ }
+ zp->z_vnode = vp;
+ vp->v_data = zp;
+
+ ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs));
+
+ zp->z_sa_hdl = NULL;
+ zp->z_unlinked = 0;
+ zp->z_atime_dirty = 0;
+ zp->z_mapcnt = 0;
+ zp->z_id = db->db_object;
+ zp->z_blksz = blksz;
+ zp->z_seq = 0x7A4653;
+ zp->z_sync_cnt = 0;
+ atomic_store_ptr(&zp->z_cached_symlink, NULL);
+
+ vp = ZTOV(zp);
+
+ zfs_znode_sa_init(zfsvfs, zp, db, obj_type, hdl);
+
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL, &zp->z_gen, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
+ &zp->z_size, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL,
+ &zp->z_links, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+ &zp->z_pflags, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL, &parent, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
+ &zp->z_atime, 16);
+#ifdef notyet
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
+ &mtime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
+ &ctime, 16);
+#endif
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
+ &zp->z_uid, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL,
+ &zp->z_gid, 8);
+
+ if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count) != 0 || zp->z_gen == 0 ||
+ (dmu_objset_projectquota_enabled(zfsvfs->z_os) &&
+ (zp->z_pflags & ZFS_PROJID) &&
+ sa_lookup(zp->z_sa_hdl, SA_ZPL_PROJID(zfsvfs), &projid, 8) != 0)) {
+ if (hdl == NULL)
+ sa_handle_destroy(zp->z_sa_hdl);
+ zfs_vnode_forget(vp);
+ zp->z_vnode = NULL;
+ zfs_znode_free_kmem(zp);
+ return (NULL);
+ }
+
+ zp->z_projid = projid;
+ zp->z_mode = mode;
+
+ /* Cache the xattr parent id */
+ if (zp->z_pflags & ZFS_XATTR)
+ zp->z_xattr_parent = parent;
+
+ vp->v_type = IFTOVT((mode_t)mode);
+
+ switch (vp->v_type) {
+ case VDIR:
+ zp->z_zn_prefetch = B_TRUE; /* z_prefetch default is enabled */
+ break;
+ case VFIFO:
+ vp->v_op = &zfs_fifoops;
+ break;
+ case VREG:
+ if (parent == zfsvfs->z_shares_dir) {
+ ASSERT(zp->z_uid == 0 && zp->z_gid == 0);
+ vp->v_op = &zfs_shareops;
+ }
+ break;
+ default:
+ break;
+ }
+
+ mutex_enter(&zfsvfs->z_znodes_lock);
+ list_insert_tail(&zfsvfs->z_all_znodes, zp);
+ zfsvfs->z_nr_znodes++;
+ zp->z_zfsvfs = zfsvfs;
+ mutex_exit(&zfsvfs->z_znodes_lock);
+
+ /*
+ * Acquire vnode lock before making it available to the world.
+ */
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+ VN_LOCK_AREC(vp);
+ if (vp->v_type != VFIFO)
+ VN_LOCK_ASHARE(vp);
+
+ return (zp);
+}
+
+static uint64_t empty_xattr;
+static uint64_t pad[4];
+static zfs_acl_phys_t acl_phys;
+/*
+ * Create a new DMU object to hold a zfs znode.
+ *
+ * IN: dzp - parent directory for new znode
+ * vap - file attributes for new znode
+ * tx - dmu transaction id for zap operations
+ * cr - credentials of caller
+ * flag - flags:
+ * IS_ROOT_NODE - new object will be root
+ * IS_XATTR - new object is an attribute
+ * acl_ids - ACL ids / fuid information for the new znode
+ *
+ * OUT: zpp - allocated znode
+ *
+ */
+void
+zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
+ uint_t flag, znode_t **zpp, zfs_acl_ids_t *acl_ids)
+{
+ uint64_t crtime[2], atime[2], mtime[2], ctime[2];
+ uint64_t mode, size, links, parent, pflags;
+ uint64_t dzp_pflags = 0;
+ uint64_t rdev = 0;
+ zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+ dmu_buf_t *db;
+ timestruc_t now;
+ uint64_t gen, obj;
+ int err;
+ int bonuslen;
+ int dnodesize;
+ sa_handle_t *sa_hdl;
+ dmu_object_type_t obj_type;
+ sa_bulk_attr_t *sa_attrs;
+ int cnt = 0;
+ zfs_acl_locator_cb_t locate = { 0 };
+
+ ASSERT(vap && ((vap->va_mask & AT_MODE) == AT_MODE));
+
+ if (zfsvfs->z_replay) {
+ obj = vap->va_nodeid;
+ now = vap->va_ctime; /* see zfs_replay_create() */
+ gen = vap->va_nblocks; /* ditto */
+ dnodesize = vap->va_fsid; /* ditto */
+ } else {
+ obj = 0;
+ vfs_timestamp(&now);
+ gen = dmu_tx_get_txg(tx);
+ dnodesize = dmu_objset_dnodesize(zfsvfs->z_os);
+ }
+
+ if (dnodesize == 0)
+ dnodesize = DNODE_MIN_SIZE;
+
+ obj_type = zfsvfs->z_use_sa ? DMU_OT_SA : DMU_OT_ZNODE;
+ bonuslen = (obj_type == DMU_OT_SA) ?
+ DN_BONUS_SIZE(dnodesize) : ZFS_OLD_ZNODE_PHYS_SIZE;
+
+ /*
+ * Create a new DMU object.
+ */
+ /*
+ * There's currently no mechanism for pre-reading the blocks that will
+ * be needed to allocate a new object, so we accept the small chance
+ * that there will be an i/o error and we will fail one of the
+ * assertions below.
+ */
+ if (vap->va_type == VDIR) {
+ if (zfsvfs->z_replay) {
+ VERIFY0(zap_create_claim_norm_dnsize(zfsvfs->z_os, obj,
+ zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS,
+ obj_type, bonuslen, dnodesize, tx));
+ } else {
+ obj = zap_create_norm_dnsize(zfsvfs->z_os,
+ zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS,
+ obj_type, bonuslen, dnodesize, tx);
+ }
+ } else {
+ if (zfsvfs->z_replay) {
+ VERIFY0(dmu_object_claim_dnsize(zfsvfs->z_os, obj,
+ DMU_OT_PLAIN_FILE_CONTENTS, 0,
+ obj_type, bonuslen, dnodesize, tx));
+ } else {
+ obj = dmu_object_alloc_dnsize(zfsvfs->z_os,
+ DMU_OT_PLAIN_FILE_CONTENTS, 0,
+ obj_type, bonuslen, dnodesize, tx);
+ }
+ }
+
+ ZFS_OBJ_HOLD_ENTER(zfsvfs, obj);
+ VERIFY(0 == sa_buf_hold(zfsvfs->z_os, obj, NULL, &db));
+
+ /*
+ * If this is the root, fix up the half-initialized parent pointer
+ * to reference the just-allocated physical data area.
+ */
+ if (flag & IS_ROOT_NODE) {
+ dzp->z_id = obj;
+ } else {
+ dzp_pflags = dzp->z_pflags;
+ }
+
+ /*
+ * If parent is an xattr, so am I.
+ */
+ if (dzp_pflags & ZFS_XATTR) {
+ flag |= IS_XATTR;
+ }
+
+ if (zfsvfs->z_use_fuids)
+ pflags = ZFS_ARCHIVE | ZFS_AV_MODIFIED;
+ else
+ pflags = 0;
+
+ if (vap->va_type == VDIR) {
+ size = 2; /* contents ("." and "..") */
+ links = (flag & (IS_ROOT_NODE | IS_XATTR)) ? 2 : 1;
+ } else {
+ size = links = 0;
+ }
+
+ if (vap->va_type == VBLK || vap->va_type == VCHR) {
+ rdev = zfs_expldev(vap->va_rdev);
+ }
+
+ parent = dzp->z_id;
+ mode = acl_ids->z_mode;
+ if (flag & IS_XATTR)
+ pflags |= ZFS_XATTR;
+
+ /*
+ * No execs denied will be determined when zfs_mode_compute() is called.
+ */
+ pflags |= acl_ids->z_aclp->z_hints &
+ (ZFS_ACL_TRIVIAL|ZFS_INHERIT_ACE|ZFS_ACL_AUTO_INHERIT|
+ ZFS_ACL_DEFAULTED|ZFS_ACL_PROTECTED);
+
+ ZFS_TIME_ENCODE(&now, crtime);
+ ZFS_TIME_ENCODE(&now, ctime);
+
+ if (vap->va_mask & AT_ATIME) {
+ ZFS_TIME_ENCODE(&vap->va_atime, atime);
+ } else {
+ ZFS_TIME_ENCODE(&now, atime);
+ }
+
+ if (vap->va_mask & AT_MTIME) {
+ ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
+ } else {
+ ZFS_TIME_ENCODE(&now, mtime);
+ }
+
+ /* Now add in all of the "SA" attributes */
+ VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, NULL, SA_HDL_SHARED,
+ &sa_hdl));
+
+ /*
+ * Set up the array of attributes to be replaced/set on the new file.
+ *
+ * The order for DMU_OT_ZNODE is critical since it needs to be constructed
+ * in the old znode_phys_t format. Don't change this ordering.
+ */
+ sa_attrs = kmem_alloc(sizeof (sa_bulk_attr_t) * ZPL_END, KM_SLEEP);
+
+ if (obj_type == DMU_OT_ZNODE) {
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs),
+ NULL, &atime, 16);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs),
+ NULL, &mtime, 16);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs),
+ NULL, &ctime, 16);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs),
+ NULL, &crtime, 16);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs),
+ NULL, &gen, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs),
+ NULL, &mode, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs),
+ NULL, &size, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs),
+ NULL, &parent, 8);
+ } else {
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs),
+ NULL, &mode, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs),
+ NULL, &size, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs),
+ NULL, &gen, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs),
+ NULL, &acl_ids->z_fuid, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs),
+ NULL, &acl_ids->z_fgid, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs),
+ NULL, &parent, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs),
+ NULL, &pflags, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs),
+ NULL, &atime, 16);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs),
+ NULL, &mtime, 16);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs),
+ NULL, &ctime, 16);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs),
+ NULL, &crtime, 16);
+ }
+
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_LINKS(zfsvfs), NULL, &links, 8);
+
+ if (obj_type == DMU_OT_ZNODE) {
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_XATTR(zfsvfs), NULL,
+ &empty_xattr, 8);
+ }
+ if (obj_type == DMU_OT_ZNODE ||
+ (vap->va_type == VBLK || vap->va_type == VCHR)) {
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_RDEV(zfsvfs),
+ NULL, &rdev, 8);
+
+ }
+ if (obj_type == DMU_OT_ZNODE) {
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs),
+ NULL, &pflags, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs), NULL,
+ &acl_ids->z_fuid, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs), NULL,
+ &acl_ids->z_fgid, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PAD(zfsvfs), NULL, pad,
+ sizeof (uint64_t) * 4);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ZNODE_ACL(zfsvfs), NULL,
+ &acl_phys, sizeof (zfs_acl_phys_t));
+ } else if (acl_ids->z_aclp->z_version >= ZFS_ACL_VERSION_FUID) {
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_COUNT(zfsvfs), NULL,
+ &acl_ids->z_aclp->z_acl_count, 8);
+ locate.cb_aclp = acl_ids->z_aclp;
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_ACES(zfsvfs),
+ zfs_acl_data_locator, &locate,
+ acl_ids->z_aclp->z_acl_bytes);
+ mode = zfs_mode_compute(mode, acl_ids->z_aclp, &pflags,
+ acl_ids->z_fuid, acl_ids->z_fgid);
+ }
+
+ VERIFY(sa_replace_all_by_template(sa_hdl, sa_attrs, cnt, tx) == 0);
+
+ if (!(flag & IS_ROOT_NODE)) {
+ *zpp = zfs_znode_alloc(zfsvfs, db, 0, obj_type, sa_hdl);
+ ASSERT(*zpp != NULL);
+ } else {
+ /*
+ * If we are creating the root node, the "parent" we
+ * passed in is the znode for the root.
+ */
+ *zpp = dzp;
+
+ (*zpp)->z_sa_hdl = sa_hdl;
+ }
+
+ (*zpp)->z_pflags = pflags;
+ (*zpp)->z_mode = mode;
+ (*zpp)->z_dnodesize = dnodesize;
+
+ if (vap->va_mask & AT_XVATTR)
+ zfs_xvattr_set(*zpp, (xvattr_t *)vap, tx);
+
+ if (obj_type == DMU_OT_ZNODE ||
+ acl_ids->z_aclp->z_version < ZFS_ACL_VERSION_FUID) {
+ VERIFY0(zfs_aclset_common(*zpp, acl_ids->z_aclp, cr, tx));
+ }
+ if (!(flag & IS_ROOT_NODE)) {
+ vnode_t *vp;
+
+ vp = ZTOV(*zpp);
+ vp->v_vflag |= VV_FORCEINSMQ;
+ err = insmntque(vp, zfsvfs->z_vfs);
+ vp->v_vflag &= ~VV_FORCEINSMQ;
+ KASSERT(err == 0, ("insmntque() failed: error %d", err));
+ }
+ kmem_free(sa_attrs, sizeof (sa_bulk_attr_t) * ZPL_END);
+ ZFS_OBJ_HOLD_EXIT(zfsvfs, obj);
+}
+
+/*
+ * Update in-core attributes. It is assumed the caller will be doing an
+ * sa_bulk_update to push the changes out.
+ */
+void
+zfs_xvattr_set(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx)
+{
+ xoptattr_t *xoap;
+
+ xoap = xva_getxoptattr(xvap);
+ ASSERT(xoap);
+
+ ASSERT_VOP_IN_SEQC(ZTOV(zp));
+
+ if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) {
+ uint64_t times[2];
+ ZFS_TIME_ENCODE(&xoap->xoa_createtime, times);
+ (void) sa_update(zp->z_sa_hdl, SA_ZPL_CRTIME(zp->z_zfsvfs),
+ &times, sizeof (times), tx);
+ XVA_SET_RTN(xvap, XAT_CREATETIME);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
+ ZFS_ATTR_SET(zp, ZFS_READONLY, xoap->xoa_readonly,
+ zp->z_pflags, tx);
+ XVA_SET_RTN(xvap, XAT_READONLY);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
+ ZFS_ATTR_SET(zp, ZFS_HIDDEN, xoap->xoa_hidden,
+ zp->z_pflags, tx);
+ XVA_SET_RTN(xvap, XAT_HIDDEN);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
+ ZFS_ATTR_SET(zp, ZFS_SYSTEM, xoap->xoa_system,
+ zp->z_pflags, tx);
+ XVA_SET_RTN(xvap, XAT_SYSTEM);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
+ ZFS_ATTR_SET(zp, ZFS_ARCHIVE, xoap->xoa_archive,
+ zp->z_pflags, tx);
+ XVA_SET_RTN(xvap, XAT_ARCHIVE);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
+ ZFS_ATTR_SET(zp, ZFS_IMMUTABLE, xoap->xoa_immutable,
+ zp->z_pflags, tx);
+ XVA_SET_RTN(xvap, XAT_IMMUTABLE);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
+ ZFS_ATTR_SET(zp, ZFS_NOUNLINK, xoap->xoa_nounlink,
+ zp->z_pflags, tx);
+ XVA_SET_RTN(xvap, XAT_NOUNLINK);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
+ ZFS_ATTR_SET(zp, ZFS_APPENDONLY, xoap->xoa_appendonly,
+ zp->z_pflags, tx);
+ XVA_SET_RTN(xvap, XAT_APPENDONLY);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
+ ZFS_ATTR_SET(zp, ZFS_NODUMP, xoap->xoa_nodump,
+ zp->z_pflags, tx);
+ XVA_SET_RTN(xvap, XAT_NODUMP);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
+ ZFS_ATTR_SET(zp, ZFS_OPAQUE, xoap->xoa_opaque,
+ zp->z_pflags, tx);
+ XVA_SET_RTN(xvap, XAT_OPAQUE);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
+ ZFS_ATTR_SET(zp, ZFS_AV_QUARANTINED,
+ xoap->xoa_av_quarantined, zp->z_pflags, tx);
+ XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
+ ZFS_ATTR_SET(zp, ZFS_AV_MODIFIED, xoap->xoa_av_modified,
+ zp->z_pflags, tx);
+ XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) {
+ zfs_sa_set_scanstamp(zp, xvap, tx);
+ XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
+ ZFS_ATTR_SET(zp, ZFS_REPARSE, xoap->xoa_reparse,
+ zp->z_pflags, tx);
+ XVA_SET_RTN(xvap, XAT_REPARSE);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) {
+ ZFS_ATTR_SET(zp, ZFS_OFFLINE, xoap->xoa_offline,
+ zp->z_pflags, tx);
+ XVA_SET_RTN(xvap, XAT_OFFLINE);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) {
+ ZFS_ATTR_SET(zp, ZFS_SPARSE, xoap->xoa_sparse,
+ zp->z_pflags, tx);
+ XVA_SET_RTN(xvap, XAT_SPARSE);
+ }
+}
+
+int
+zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp)
+{
+ dmu_object_info_t doi;
+ dmu_buf_t *db;
+ znode_t *zp;
+ vnode_t *vp;
+ sa_handle_t *hdl;
+ struct thread *td;
+ int locked;
+ int err;
+
+ td = curthread;
+ getnewvnode_reserve_();
+again:
+ *zpp = NULL;
+ ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num);
+
+ err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db);
+ if (err) {
+ ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
+ getnewvnode_drop_reserve();
+ return (err);
+ }
+
+ dmu_object_info_from_db(db, &doi);
+ if (doi.doi_bonus_type != DMU_OT_SA &&
+ (doi.doi_bonus_type != DMU_OT_ZNODE ||
+ (doi.doi_bonus_type == DMU_OT_ZNODE &&
+ doi.doi_bonus_size < sizeof (znode_phys_t)))) {
+ sa_buf_rele(db, NULL);
+ ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
+ getnewvnode_drop_reserve();
+ return (SET_ERROR(EINVAL));
+ }
+
+ hdl = dmu_buf_get_user(db);
+ if (hdl != NULL) {
+ zp = sa_get_userdata(hdl);
+
+ /*
+ * Since "SA" does immediate eviction we
+ * should never find a sa handle that doesn't
+ * know about the znode.
+ */
+ ASSERT3P(zp, !=, NULL);
+ ASSERT3U(zp->z_id, ==, obj_num);
+ if (zp->z_unlinked) {
+ err = SET_ERROR(ENOENT);
+ } else {
+ vp = ZTOV(zp);
+ /*
+ * Don't let the vnode disappear after
+ * ZFS_OBJ_HOLD_EXIT.
+ */
+ VN_HOLD(vp);
+ *zpp = zp;
+ err = 0;
+ }
+
+ sa_buf_rele(db, NULL);
+ ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
+
+ if (err) {
+ getnewvnode_drop_reserve();
+ return (err);
+ }
+
+ locked = VOP_ISLOCKED(vp);
+ VI_LOCK(vp);
+ if (VN_IS_DOOMED(vp) && locked != LK_EXCLUSIVE) {
+ /*
+ * The vnode is doomed and this thread doesn't
+ * hold the exclusive lock on it, so the vnode
+ * must be being reclaimed by another thread.
+ * Otherwise the doomed vnode is being reclaimed
+ * by this thread and zfs_zget is called from
+ * ZIL internals.
+ */
+ VI_UNLOCK(vp);
+
+ /*
+ * XXX vrele() locks the vnode when the last reference
+ * is dropped. Although in this case the vnode is
+ * doomed / dead and so no inactivation is required,
+ * the vnode lock is still acquired. That could result
+ * in a LOR with z_teardown_lock if another thread holds
+ * the vnode's lock and tries to take z_teardown_lock.
+ * But that is only possible if the other thread performs
+ * a ZFS vnode operation on the vnode. That either
+ * should not happen if the vnode is dead or the thread
+ * should also have a reference to the vnode and thus
+ * our reference is not last.
+ */
+ VN_RELE(vp);
+ goto again;
+ }
+ VI_UNLOCK(vp);
+ getnewvnode_drop_reserve();
+ return (err);
+ }
+
+ /*
+ * Not found; create a new znode/vnode,
+ * but only if the file exists.
+ *
+ * There is a small window where zfs_vget() could
+ * find this object while a file create is still in
+ * progress. This is checked for in zfs_znode_alloc().
+ *
+ * If zfs_znode_alloc() fails it will drop the hold on the
+ * bonus buffer.
+ */
+ zp = zfs_znode_alloc(zfsvfs, db, doi.doi_data_block_size,
+ doi.doi_bonus_type, NULL);
+ if (zp == NULL) {
+ err = SET_ERROR(ENOENT);
+ } else {
+ *zpp = zp;
+ }
+ if (err == 0) {
+ vnode_t *vp = ZTOV(zp);
+
+ err = insmntque(vp, zfsvfs->z_vfs);
+ if (err == 0) {
+ vp->v_hash = obj_num;
+ VOP_UNLOCK1(vp);
+ } else {
+ zp->z_vnode = NULL;
+ zfs_znode_dmu_fini(zp);
+ zfs_znode_free(zp);
+ *zpp = NULL;
+ }
+ }
+ ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
+ getnewvnode_drop_reserve();
+ return (err);
+}
+
+int
+zfs_rezget(znode_t *zp)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ dmu_object_info_t doi;
+ dmu_buf_t *db;
+ vnode_t *vp;
+ uint64_t obj_num = zp->z_id;
+ uint64_t mode, size;
+ sa_bulk_attr_t bulk[8];
+ int err;
+ int count = 0;
+ uint64_t gen;
+
+ /*
+ * Remove cached pages before reloading the znode, so that they are not
+ * lingering after we run into any error. Ideally, we should vgone()
+ * the vnode in case of error, but currently we cannot do that
+ * because of the LOR between the vnode lock and z_teardown_lock.
+ * So, instead, we have to "doom" the znode in the illumos style.
+ */
+ vp = ZTOV(zp);
+ vn_pages_remove(vp, 0, 0);
+
+ ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num);
+
+ mutex_enter(&zp->z_acl_lock);
+ if (zp->z_acl_cached) {
+ zfs_acl_free(zp->z_acl_cached);
+ zp->z_acl_cached = NULL;
+ }
+
+ mutex_exit(&zp->z_acl_lock);
+ ASSERT(zp->z_sa_hdl == NULL);
+ err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db);
+ if (err) {
+ ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
+ return (err);
+ }
+
+ dmu_object_info_from_db(db, &doi);
+ if (doi.doi_bonus_type != DMU_OT_SA &&
+ (doi.doi_bonus_type != DMU_OT_ZNODE ||
+ (doi.doi_bonus_type == DMU_OT_ZNODE &&
+ doi.doi_bonus_size < sizeof (znode_phys_t)))) {
+ sa_buf_rele(db, NULL);
+ ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
+ return (SET_ERROR(EINVAL));
+ }
+
+ zfs_znode_sa_init(zfsvfs, zp, db, doi.doi_bonus_type, NULL);
+ size = zp->z_size;
+
+ /* reload cached values */
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL,
+ &gen, sizeof (gen));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
+ &zp->z_size, sizeof (zp->z_size));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL,
+ &zp->z_links, sizeof (zp->z_links));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+ &zp->z_pflags, sizeof (zp->z_pflags));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
+ &zp->z_atime, sizeof (zp->z_atime));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
+ &zp->z_uid, sizeof (zp->z_uid));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL,
+ &zp->z_gid, sizeof (zp->z_gid));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
+ &mode, sizeof (mode));
+
+ if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) {
+ zfs_znode_dmu_fini(zp);
+ ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
+ return (SET_ERROR(EIO));
+ }
+
+ zp->z_mode = mode;
+
+ if (gen != zp->z_gen) {
+ zfs_znode_dmu_fini(zp);
+ ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
+ return (SET_ERROR(EIO));
+ }
+
+ /*
+ * It is highly improbable but still quite possible that two
+ * objects in different datasets are created with the same
+ * object numbers and in transaction groups with the same
+ * numbers. znodes corresponding to those objects would
+ * have the same z_id and z_gen, but their other attributes
+ * may be different.
+ * zfs recv -F may replace one of such objects with the other.
+ * As a result file properties recorded in the replaced
+ * object's vnode may no longer match the received object's
+ * properties. At present the only cached property is the
+ * file's type recorded in v_type.
+ * So, handle this case by leaving the old vnode and znode
+ * disassociated from the actual object. A new vnode and a
+ * znode will be created if the object is accessed
+ * (e.g. via a look-up). The old vnode and znode will be
+ * recycled when the last vnode reference is dropped.
+ */
+ if (vp->v_type != IFTOVT((mode_t)zp->z_mode)) {
+ zfs_znode_dmu_fini(zp);
+ ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
+ return (SET_ERROR(EIO));
+ }
+
+ /*
+ * If the file has zero links, then it has been unlinked on the send
+ * side and it must be in the received unlinked set.
+ * We call zfs_znode_dmu_fini() now to prevent any accesses to the
+ * stale data and to prevent automatic removal of the file in
+ * zfs_zinactive(). The file will be removed either when it is removed
+ * on the send side and the next incremental stream is received or
+ * when the unlinked set gets processed.
+ */
+ zp->z_unlinked = (zp->z_links == 0);
+ if (zp->z_unlinked) {
+ zfs_znode_dmu_fini(zp);
+ ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
+ return (0);
+ }
+
+ zp->z_blksz = doi.doi_data_block_size;
+ if (zp->z_size != size)
+ vnode_pager_setsize(vp, zp->z_size);
+
+ ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
+
+ return (0);
+}
+
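+/*
+ * Free the on-disk object backing this znode (and its external ACL object,
+ * if any) as part of the given transaction, then tear down the znode's DMU
+ * state and free the in-core znode.
+ */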
+void
+zfs_znode_delete(znode_t *zp, dmu_tx_t *tx)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ objset_t *os = zfsvfs->z_os;
+ uint64_t obj = zp->z_id;
+ uint64_t acl_obj = zfs_external_acl(zp);
+
+ ZFS_OBJ_HOLD_ENTER(zfsvfs, obj);
+ if (acl_obj) {
+ VERIFY(!zp->z_is_sa);
+ VERIFY(0 == dmu_object_free(os, acl_obj, tx));
+ }
+ VERIFY(0 == dmu_object_free(os, obj, tx));
+ zfs_znode_dmu_fini(zp);
+ ZFS_OBJ_HOLD_EXIT(zfsvfs, obj);
+ zfs_znode_free(zp);
+}
+
+void
+zfs_zinactive(znode_t *zp)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ uint64_t z_id = zp->z_id;
+
+ ASSERT(zp->z_sa_hdl);
+
+ /*
+	 * Don't allow a zfs_zget() while we're trying to release this znode.
+ */
+ ZFS_OBJ_HOLD_ENTER(zfsvfs, z_id);
+
+ /*
+ * If this was the last reference to a file with no links, remove
+ * the file from the file system unless the file system is mounted
+ * read-only. That can happen, for example, if the file system was
+ * originally read-write, the file was opened, then unlinked and
+ * the file system was made read-only before the file was finally
+ * closed. The file will remain in the unlinked set.
+ */
+ if (zp->z_unlinked) {
+ ASSERT(!zfsvfs->z_issnap);
+ if ((zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) == 0) {
+ ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
+ zfs_rmnode(zp);
+ return;
+ }
+ }
+
+ zfs_znode_dmu_fini(zp);
+ ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
+ zfs_znode_free(zp);
+}
+
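+/*
+ * Final teardown of an in-core znode: detach it from the per-filesystem
+ * znode list, free any cached symlink target and cached ACL, and return
+ * the znode to the kmem cache.  The SA handle must already be gone
+ * (zp->z_sa_hdl == NULL).
+ */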
+void
+zfs_znode_free(znode_t *zp)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ char *symlink;
+
+ ASSERT(zp->z_sa_hdl == NULL);
+ zp->z_vnode = NULL;
+ mutex_enter(&zfsvfs->z_znodes_lock);
+ POINTER_INVALIDATE(&zp->z_zfsvfs);
+ list_remove(&zfsvfs->z_all_znodes, zp);
+ zfsvfs->z_nr_znodes--;
+ mutex_exit(&zfsvfs->z_znodes_lock);
+ symlink = atomic_load_ptr(&zp->z_cached_symlink);
+ if (symlink != NULL) {
+ atomic_store_rel_ptr((uintptr_t *)&zp->z_cached_symlink, (uintptr_t)NULL);
+ cache_symlink_free(symlink, strlen(symlink) + 1);
+ }
+
+ if (zp->z_acl_cached) {
+ zfs_acl_free(zp->z_acl_cached);
+ zp->z_acl_cached = NULL;
+ }
+
+ zfs_znode_free_kmem(zp);
+}
+
+void
+zfs_tstamp_update_setup_ext(znode_t *zp, uint_t flag, uint64_t mtime[2],
+ uint64_t ctime[2], boolean_t have_tx)
+{
+ timestruc_t now;
+
+ vfs_timestamp(&now);
+
+ if (have_tx) { /* will sa_bulk_update happen really soon? */
+ zp->z_atime_dirty = 0;
+ zp->z_seq++;
+ } else {
+ zp->z_atime_dirty = 1;
+ }
+
+ if (flag & AT_ATIME) {
+ ZFS_TIME_ENCODE(&now, zp->z_atime);
+ }
+
+ if (flag & AT_MTIME) {
+ ZFS_TIME_ENCODE(&now, mtime);
+ if (zp->z_zfsvfs->z_use_fuids) {
+ zp->z_pflags |= (ZFS_ARCHIVE |
+ ZFS_AV_MODIFIED);
+ }
+ }
+
+ if (flag & AT_CTIME) {
+ ZFS_TIME_ENCODE(&now, ctime);
+ if (zp->z_zfsvfs->z_use_fuids)
+ zp->z_pflags |= ZFS_ARCHIVE;
+ }
+}
+
+
+void
+zfs_tstamp_update_setup(znode_t *zp, uint_t flag, uint64_t mtime[2],
+ uint64_t ctime[2])
+{
+ zfs_tstamp_update_setup_ext(zp, flag, mtime, ctime, B_TRUE);
+}
+/*
+ * Grow the block size for a file.
+ *
+ * IN: zp - znode of file to free data in.
+ * size - requested block size
+ * tx - open transaction.
+ *
+ * NOTE: this function assumes that the znode is write locked.
+ */
+void
+zfs_grow_blocksize(znode_t *zp, uint64_t size, dmu_tx_t *tx)
+{
+ int error;
+ u_longlong_t dummy;
+
+ if (size <= zp->z_blksz)
+ return;
+ /*
+ * If the file size is already greater than the current blocksize,
+ * we will not grow. If there is more than one block in a file,
+ * the blocksize cannot change.
+ */
+ if (zp->z_blksz && zp->z_size > zp->z_blksz)
+ return;
+
+ error = dmu_object_set_blocksize(zp->z_zfsvfs->z_os, zp->z_id,
+ size, 0, tx);
+
+ if (error == ENOTSUP)
+ return;
+ ASSERT0(error);
+
+ /* What blocksize did we actually get? */
+ dmu_object_size_from_db(sa_get_db(zp->z_sa_hdl), &zp->z_blksz, &dummy);
+}
+
+/*
+ * Increase the file length
+ *
+ * IN: zp - znode of file to free data in.
+ * end - new end-of-file
+ *
+ * RETURN: 0 on success, error code on failure
+ */
+static int
+zfs_extend(znode_t *zp, uint64_t end)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ dmu_tx_t *tx;
+ zfs_locked_range_t *lr;
+ uint64_t newblksz;
+ int error;
+
+ /*
+ * We will change zp_size, lock the whole file.
+ */
+ lr = zfs_rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER);
+
+ /*
+ * Nothing to do if file already at desired length.
+ */
+ if (end <= zp->z_size) {
+ zfs_rangelock_exit(lr);
+ return (0);
+ }
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+ zfs_sa_upgrade_txholds(tx, zp);
+ if (end > zp->z_blksz &&
+ (!ISP2(zp->z_blksz) || zp->z_blksz < zfsvfs->z_max_blksz)) {
+ /*
+ * We are growing the file past the current block size.
+ */
+ if (zp->z_blksz > zp->z_zfsvfs->z_max_blksz) {
+ /*
+ * File's blocksize is already larger than the
+ * "recordsize" property. Only let it grow to
+ * the next power of 2.
+ */
+ ASSERT(!ISP2(zp->z_blksz));
+ newblksz = MIN(end, 1 << highbit64(zp->z_blksz));
+ } else {
+ newblksz = MIN(end, zp->z_zfsvfs->z_max_blksz);
+ }
+ dmu_tx_hold_write(tx, zp->z_id, 0, newblksz);
+ } else {
+ newblksz = 0;
+ }
+
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ dmu_tx_abort(tx);
+ zfs_rangelock_exit(lr);
+ return (error);
+ }
+
+ if (newblksz)
+ zfs_grow_blocksize(zp, newblksz, tx);
+
+ zp->z_size = end;
+
+ VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zp->z_zfsvfs),
+ &zp->z_size, sizeof (zp->z_size), tx));
+
+ vnode_pager_setsize(ZTOV(zp), end);
+
+ zfs_rangelock_exit(lr);
+
+ dmu_tx_commit(tx);
+
+ return (0);
+}
+
+/*
+ * Free space in a file.
+ *
+ * IN: zp - znode of file to free data in.
+ * off - start of section to free.
+ * len - length of section to free.
+ *
+ * RETURN: 0 on success, error code on failure
+ */
+static int
+zfs_free_range(znode_t *zp, uint64_t off, uint64_t len)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ zfs_locked_range_t *lr;
+ int error;
+
+ /*
+ * Lock the range being freed.
+ */
+ lr = zfs_rangelock_enter(&zp->z_rangelock, off, len, RL_WRITER);
+
+ /*
+ * Nothing to do if file already at desired length.
+ */
+ if (off >= zp->z_size) {
+ zfs_rangelock_exit(lr);
+ return (0);
+ }
+
+ if (off + len > zp->z_size)
+ len = zp->z_size - off;
+
+ error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, off, len);
+
+ if (error == 0) {
+ /*
+		 * In FreeBSD we cannot free a block in the middle of a file,
+		 * only at the end of a file, so this code path should
+		 * never happen.
+ */
+ vnode_pager_setsize(ZTOV(zp), off);
+ }
+
+ zfs_rangelock_exit(lr);
+
+ return (error);
+}
+
+/*
+ * Truncate a file
+ *
+ * IN: zp - znode of file to free data in.
+ * end - new end-of-file.
+ *
+ * RETURN: 0 on success, error code on failure
+ */
+static int
+zfs_trunc(znode_t *zp, uint64_t end)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ vnode_t *vp = ZTOV(zp);
+ dmu_tx_t *tx;
+ zfs_locked_range_t *lr;
+ int error;
+ sa_bulk_attr_t bulk[2];
+ int count = 0;
+
+ /*
+ * We will change zp_size, lock the whole file.
+ */
+ lr = zfs_rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER);
+
+ /*
+ * Nothing to do if file already at desired length.
+ */
+ if (end >= zp->z_size) {
+ zfs_rangelock_exit(lr);
+ return (0);
+ }
+
+ error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, end,
+ DMU_OBJECT_END);
+ if (error) {
+ zfs_rangelock_exit(lr);
+ return (error);
+ }
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+ zfs_sa_upgrade_txholds(tx, zp);
+ dmu_tx_mark_netfree(tx);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ dmu_tx_abort(tx);
+ zfs_rangelock_exit(lr);
+ return (error);
+ }
+
+ zp->z_size = end;
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs),
+ NULL, &zp->z_size, sizeof (zp->z_size));
+
+ if (end == 0) {
+ zp->z_pflags &= ~ZFS_SPARSE;
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
+ NULL, &zp->z_pflags, 8);
+ }
+ VERIFY(sa_bulk_update(zp->z_sa_hdl, bulk, count, tx) == 0);
+
+ dmu_tx_commit(tx);
+
+ /*
+ * Clear any mapped pages in the truncated region. This has to
+ * happen outside of the transaction to avoid the possibility of
+ * a deadlock with someone trying to push a page that we are
+ * about to invalidate.
+ */
+ vnode_pager_setsize(vp, end);
+
+ zfs_rangelock_exit(lr);
+
+ return (0);
+}
+
+/*
+ * Free space in a file
+ *
+ * IN: zp - znode of file to free data in.
+ * off - start of range
+ * len - end of range (0 => EOF)
+ * flag - current file open mode flags.
+ * log - TRUE if this action should be logged
+ *
+ * RETURN: 0 on success, error code on failure
+ */
+int
+zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log)
+{
+ dmu_tx_t *tx;
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ zilog_t *zilog = zfsvfs->z_log;
+ uint64_t mode;
+ uint64_t mtime[2], ctime[2];
+ sa_bulk_attr_t bulk[3];
+ int count = 0;
+ int error;
+
+ if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), &mode,
+ sizeof (mode))) != 0)
+ return (error);
+
+ if (off > zp->z_size) {
+ error = zfs_extend(zp, off+len);
+ if (error == 0 && log)
+ goto log;
+ else
+ return (error);
+ }
+
+ if (len == 0) {
+ error = zfs_trunc(zp, off);
+ } else {
+ if ((error = zfs_free_range(zp, off, len)) == 0 &&
+ off + len > zp->z_size)
+ error = zfs_extend(zp, off+len);
+ }
+ if (error || !log)
+ return (error);
+log:
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+ zfs_sa_upgrade_txholds(tx, zp);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ dmu_tx_abort(tx);
+ return (error);
+ }
+
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, ctime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
+ NULL, &zp->z_pflags, 8);
+ zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime);
+ error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
+ ASSERT(error == 0);
+
+ zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len);
+
+ dmu_tx_commit(tx);
+ return (0);
+}
+
+void
+zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
+{
+ uint64_t moid, obj, sa_obj, version;
+ uint64_t sense = ZFS_CASE_SENSITIVE;
+ uint64_t norm = 0;
+ nvpair_t *elem;
+ int error;
+ int i;
+ znode_t *rootzp = NULL;
+ zfsvfs_t *zfsvfs;
+ vattr_t vattr;
+ znode_t *zp;
+ zfs_acl_ids_t acl_ids;
+
+ /*
+ * First attempt to create master node.
+ */
+ /*
+ * In an empty objset, there are no blocks to read and thus
+ * there can be no i/o errors (which we assert below).
+ */
+ moid = MASTER_NODE_OBJ;
+ error = zap_create_claim(os, moid, DMU_OT_MASTER_NODE,
+ DMU_OT_NONE, 0, tx);
+ ASSERT(error == 0);
+
+ /*
+ * Set starting attributes.
+ */
+ version = zfs_zpl_version_map(spa_version(dmu_objset_spa(os)));
+ elem = NULL;
+ while ((elem = nvlist_next_nvpair(zplprops, elem)) != NULL) {
+ /* For the moment we expect all zpl props to be uint64_ts */
+ uint64_t val;
+ char *name;
+
+ ASSERT(nvpair_type(elem) == DATA_TYPE_UINT64);
+ VERIFY(nvpair_value_uint64(elem, &val) == 0);
+ name = nvpair_name(elem);
+ if (strcmp(name, zfs_prop_to_name(ZFS_PROP_VERSION)) == 0) {
+ if (val < version)
+ version = val;
+ } else {
+ error = zap_update(os, moid, name, 8, 1, &val, tx);
+ }
+ ASSERT(error == 0);
+ if (strcmp(name, zfs_prop_to_name(ZFS_PROP_NORMALIZE)) == 0)
+ norm = val;
+ else if (strcmp(name, zfs_prop_to_name(ZFS_PROP_CASE)) == 0)
+ sense = val;
+ }
+ ASSERT(version != 0);
+ error = zap_update(os, moid, ZPL_VERSION_STR, 8, 1, &version, tx);
+
+ /*
+ * Create zap object used for SA attribute registration
+ */
+
+ if (version >= ZPL_VERSION_SA) {
+ sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE,
+ DMU_OT_NONE, 0, tx);
+ error = zap_add(os, moid, ZFS_SA_ATTRS, 8, 1, &sa_obj, tx);
+ ASSERT(error == 0);
+ } else {
+ sa_obj = 0;
+ }
+ /*
+ * Create a delete queue.
+ */
+ obj = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx);
+
+ error = zap_add(os, moid, ZFS_UNLINKED_SET, 8, 1, &obj, tx);
+ ASSERT(error == 0);
+
+ /*
+ * Create root znode. Create minimal znode/vnode/zfsvfs
+ * to allow zfs_mknode to work.
+ */
+ VATTR_NULL(&vattr);
+ vattr.va_mask = AT_MODE|AT_UID|AT_GID;
+ vattr.va_type = VDIR;
+ vattr.va_mode = S_IFDIR|0755;
+ vattr.va_uid = crgetuid(cr);
+ vattr.va_gid = crgetgid(cr);
+
+ zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
+
+ rootzp = zfs_znode_alloc_kmem(KM_SLEEP);
+ ASSERT(!POINTER_IS_VALID(rootzp->z_zfsvfs));
+ rootzp->z_unlinked = 0;
+ rootzp->z_atime_dirty = 0;
+ rootzp->z_is_sa = USE_SA(version, os);
+
+ zfsvfs->z_os = os;
+ zfsvfs->z_parent = zfsvfs;
+ zfsvfs->z_version = version;
+ zfsvfs->z_use_fuids = USE_FUIDS(version, os);
+ zfsvfs->z_use_sa = USE_SA(version, os);
+ zfsvfs->z_norm = norm;
+
+ error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END,
+ &zfsvfs->z_attr_table);
+
+ ASSERT(error == 0);
+
+ /*
+ * Fold case on file systems that are always or sometimes case
+ * insensitive.
+ */
+ if (sense == ZFS_CASE_INSENSITIVE || sense == ZFS_CASE_MIXED)
+ zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER;
+
+ mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
+ list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
+ offsetof(znode_t, z_link_node));
+
+ for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
+ mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);
+
+ rootzp->z_zfsvfs = zfsvfs;
+ VERIFY(0 == zfs_acl_ids_create(rootzp, IS_ROOT_NODE, &vattr,
+ cr, NULL, &acl_ids));
+ zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, &acl_ids);
+ ASSERT3P(zp, ==, rootzp);
+ error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &rootzp->z_id, tx);
+ ASSERT(error == 0);
+ zfs_acl_ids_free(&acl_ids);
+ POINTER_INVALIDATE(&rootzp->z_zfsvfs);
+
+ sa_handle_destroy(rootzp->z_sa_hdl);
+ zfs_znode_free_kmem(rootzp);
+
+ /*
+ * Create shares directory
+ */
+
+ error = zfs_create_share_dir(zfsvfs, tx);
+
+ ASSERT(error == 0);
+
+ for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
+ mutex_destroy(&zfsvfs->z_hold_mtx[i]);
+ kmem_free(zfsvfs, sizeof (zfsvfs_t));
+}
+#endif /* _KERNEL */
+
+static int
+zfs_sa_setup(objset_t *osp, sa_attr_type_t **sa_table)
+{
+ uint64_t sa_obj = 0;
+ int error;
+
+ error = zap_lookup(osp, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, &sa_obj);
+ if (error != 0 && error != ENOENT)
+ return (error);
+
+ error = sa_setup(osp, sa_obj, zfs_attr_table, ZPL_END, sa_table);
+ return (error);
+}
+
+static int
+zfs_grab_sa_handle(objset_t *osp, uint64_t obj, sa_handle_t **hdlp,
+ dmu_buf_t **db, void *tag)
+{
+ dmu_object_info_t doi;
+ int error;
+
+ if ((error = sa_buf_hold(osp, obj, tag, db)) != 0)
+ return (error);
+
+ dmu_object_info_from_db(*db, &doi);
+ if ((doi.doi_bonus_type != DMU_OT_SA &&
+ doi.doi_bonus_type != DMU_OT_ZNODE) ||
+ (doi.doi_bonus_type == DMU_OT_ZNODE &&
+ doi.doi_bonus_size < sizeof (znode_phys_t))) {
+ sa_buf_rele(*db, tag);
+ return (SET_ERROR(ENOTSUP));
+ }
+
+ error = sa_handle_get(osp, obj, NULL, SA_HDL_PRIVATE, hdlp);
+ if (error != 0) {
+ sa_buf_rele(*db, tag);
+ return (error);
+ }
+
+ return (0);
+}
+
+static void
+zfs_release_sa_handle(sa_handle_t *hdl, dmu_buf_t *db, void *tag)
+{
+ sa_handle_destroy(hdl);
+ sa_buf_rele(db, tag);
+}
+
+/*
+ * Given an object number, return its parent object number and whether
+ * or not the object is an extended attribute directory.
+ */
+static int
+zfs_obj_to_pobj(objset_t *osp, sa_handle_t *hdl, sa_attr_type_t *sa_table,
+ uint64_t *pobjp, int *is_xattrdir)
+{
+ uint64_t parent;
+ uint64_t pflags;
+ uint64_t mode;
+ uint64_t parent_mode;
+ sa_bulk_attr_t bulk[3];
+ sa_handle_t *sa_hdl;
+ dmu_buf_t *sa_db;
+ int count = 0;
+ int error;
+
+ SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_PARENT], NULL,
+ &parent, sizeof (parent));
+ SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_FLAGS], NULL,
+ &pflags, sizeof (pflags));
+ SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL,
+ &mode, sizeof (mode));
+
+ if ((error = sa_bulk_lookup(hdl, bulk, count)) != 0)
+ return (error);
+
+ /*
+ * When a link is removed its parent pointer is not changed and will
+ * be invalid. There are two cases where a link is removed but the
+	 * file stays around: when it goes to the delete queue and when there
+ * are additional links.
+ */
+ error = zfs_grab_sa_handle(osp, parent, &sa_hdl, &sa_db, FTAG);
+ if (error != 0)
+ return (error);
+
+ error = sa_lookup(sa_hdl, ZPL_MODE, &parent_mode, sizeof (parent_mode));
+ zfs_release_sa_handle(sa_hdl, sa_db, FTAG);
+ if (error != 0)
+ return (error);
+
+ *is_xattrdir = ((pflags & ZFS_XATTR) != 0) && S_ISDIR(mode);
+
+ /*
+ * Extended attributes can be applied to files, directories, etc.
+ * Otherwise the parent must be a directory.
+ */
+ if (!*is_xattrdir && !S_ISDIR(parent_mode))
+ return (SET_ERROR(EINVAL));
+
+ *pobjp = parent;
+
+ return (0);
+}
+
+/*
+ * Given an object number, return some zpl level statistics
+ */
+static int
+zfs_obj_to_stats_impl(sa_handle_t *hdl, sa_attr_type_t *sa_table,
+ zfs_stat_t *sb)
+{
+ sa_bulk_attr_t bulk[4];
+ int count = 0;
+
+ SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL,
+ &sb->zs_mode, sizeof (sb->zs_mode));
+ SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_GEN], NULL,
+ &sb->zs_gen, sizeof (sb->zs_gen));
+ SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_LINKS], NULL,
+ &sb->zs_links, sizeof (sb->zs_links));
+ SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_CTIME], NULL,
+ &sb->zs_ctime, sizeof (sb->zs_ctime));
+
+ return (sa_bulk_lookup(hdl, bulk, count));
+}
+
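+/*
+ * Reconstruct the path of an object by walking its parent pointers up to
+ * the root.  The path is assembled backwards: each "/component" is
+ * prepended at the tail of buf and the finished string is moved to the
+ * front of buf on success.  Objects in the unlinked (delete) set are
+ * reported as ESTALE since they no longer have a valid path.
+ */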
+static int
+zfs_obj_to_path_impl(objset_t *osp, uint64_t obj, sa_handle_t *hdl,
+ sa_attr_type_t *sa_table, char *buf, int len)
+{
+ sa_handle_t *sa_hdl;
+ sa_handle_t *prevhdl = NULL;
+ dmu_buf_t *prevdb = NULL;
+ dmu_buf_t *sa_db = NULL;
+ char *path = buf + len - 1;
+ int error;
+
+ *path = '\0';
+ sa_hdl = hdl;
+
+ uint64_t deleteq_obj;
+ VERIFY0(zap_lookup(osp, MASTER_NODE_OBJ,
+ ZFS_UNLINKED_SET, sizeof (uint64_t), 1, &deleteq_obj));
+ error = zap_lookup_int(osp, deleteq_obj, obj);
+ if (error == 0) {
+ return (ESTALE);
+ } else if (error != ENOENT) {
+ return (error);
+ }
+ error = 0;
+
+ for (;;) {
+ uint64_t pobj;
+ char component[MAXNAMELEN + 2];
+ size_t complen;
+ int is_xattrdir;
+
+ if (prevdb) {
+ ASSERT(prevhdl != NULL);
+ zfs_release_sa_handle(prevhdl, prevdb, FTAG);
+ }
+
+ if ((error = zfs_obj_to_pobj(osp, sa_hdl, sa_table, &pobj,
+ &is_xattrdir)) != 0)
+ break;
+
+ if (pobj == obj) {
+ if (path[0] != '/')
+ *--path = '/';
+ break;
+ }
+
+ component[0] = '/';
+ if (is_xattrdir) {
+ (void) sprintf(component + 1, "<xattrdir>");
+ } else {
+ error = zap_value_search(osp, pobj, obj,
+ ZFS_DIRENT_OBJ(-1ULL), component + 1);
+ if (error != 0)
+ break;
+ }
+
+ complen = strlen(component);
+ path -= complen;
+ ASSERT(path >= buf);
+ bcopy(component, path, complen);
+ obj = pobj;
+
+ if (sa_hdl != hdl) {
+ prevhdl = sa_hdl;
+ prevdb = sa_db;
+ }
+ error = zfs_grab_sa_handle(osp, obj, &sa_hdl, &sa_db, FTAG);
+ if (error != 0) {
+ sa_hdl = prevhdl;
+ sa_db = prevdb;
+ break;
+ }
+ }
+
+ if (sa_hdl != NULL && sa_hdl != hdl) {
+ ASSERT(sa_db != NULL);
+ zfs_release_sa_handle(sa_hdl, sa_db, FTAG);
+ }
+
+ if (error == 0)
+ (void) memmove(buf, path, buf + len - path);
+
+ return (error);
+}
+
+int
+zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len)
+{
+ sa_attr_type_t *sa_table;
+ sa_handle_t *hdl;
+ dmu_buf_t *db;
+ int error;
+
+ error = zfs_sa_setup(osp, &sa_table);
+ if (error != 0)
+ return (error);
+
+ error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG);
+ if (error != 0)
+ return (error);
+
+ error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table, buf, len);
+
+ zfs_release_sa_handle(hdl, db, FTAG);
+ return (error);
+}
+
+int
+zfs_obj_to_stats(objset_t *osp, uint64_t obj, zfs_stat_t *sb,
+ char *buf, int len)
+{
+ char *path = buf + len - 1;
+ sa_attr_type_t *sa_table;
+ sa_handle_t *hdl;
+ dmu_buf_t *db;
+ int error;
+
+ *path = '\0';
+
+ error = zfs_sa_setup(osp, &sa_table);
+ if (error != 0)
+ return (error);
+
+ error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG);
+ if (error != 0)
+ return (error);
+
+ error = zfs_obj_to_stats_impl(hdl, sa_table, sb);
+ if (error != 0) {
+ zfs_release_sa_handle(hdl, db, FTAG);
+ return (error);
+ }
+
+ error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table, buf, len);
+
+ zfs_release_sa_handle(hdl, db, FTAG);
+ return (error);
+}
+
+
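+/*
+ * Propagate the current ZFS file size (z_size) to the vnode pager's VM
+ * object when the two disagree, so mapped pages see the correct
+ * end-of-file.
+ */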
+void
+zfs_znode_update_vfs(znode_t *zp)
+{
+ vm_object_t object;
+
+ if ((object = ZTOV(zp)->v_object) == NULL ||
+ zp->z_size == object->un_pager.vnp.vnp_size)
+ return;
+
+ vnode_pager_setsize(ZTOV(zp), zp->z_size);
+}
+
+
+#ifdef _KERNEL
+int
+zfs_znode_parent_and_name(znode_t *zp, znode_t **dzpp, char *buf)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ uint64_t parent;
+ int is_xattrdir;
+ int err;
+
+ /* Extended attributes should not be visible as regular files. */
+ if ((zp->z_pflags & ZFS_XATTR) != 0)
+ return (SET_ERROR(EINVAL));
+
+ err = zfs_obj_to_pobj(zfsvfs->z_os, zp->z_sa_hdl, zfsvfs->z_attr_table,
+ &parent, &is_xattrdir);
+ if (err != 0)
+ return (err);
+ ASSERT0(is_xattrdir);
+
+ /* No name as this is a root object. */
+ if (parent == zp->z_id)
+ return (SET_ERROR(EINVAL));
+
+ err = zap_value_search(zfsvfs->z_os, parent, zp->z_id,
+ ZFS_DIRENT_OBJ(-1ULL), buf);
+ if (err != 0)
+ return (err);
+ err = zfs_zget(zfsvfs, parent, dzpp);
+ return (err);
+}
+#endif /* _KERNEL */
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zio_crypt.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zio_crypt.c
new file mode 100644
index 000000000000..9fe678d2574f
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zio_crypt.c
@@ -0,0 +1,1839 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2017, Datto, Inc. All rights reserved.
+ */
+
+#include <sys/zio_crypt.h>
+#include <sys/dmu.h>
+#include <sys/dmu_objset.h>
+#include <sys/dnode.h>
+#include <sys/fs/zfs.h>
+#include <sys/zio.h>
+#include <sys/zil.h>
+#include <sys/sha2.h>
+#include <sys/hkdf.h>
+
+/*
+ * This file is responsible for handling all of the details of generating
+ * encryption parameters and performing encryption and authentication.
+ *
+ * BLOCK ENCRYPTION PARAMETERS:
+ * Encryption /Authentication Algorithm Suite (crypt):
+ * The encryption algorithm, mode, and key length we are going to use. We
+ * currently support AES in either GCM or CCM modes with 128, 192, and 256 bit
+ * keys. All authentication is currently done with SHA512-HMAC.
+ *
+ * Plaintext:
+ * The unencrypted data that we want to encrypt.
+ *
+ * Initialization Vector (IV):
+ * An initialization vector for the encryption algorithms. This is used to
+ * "tweak" the encryption algorithms so that two blocks of the same data are
+ * encrypted into different ciphertext outputs, thus obfuscating block patterns.
+ * The supported encryption modes (AES-GCM and AES-CCM) require that an IV is
+ * never reused with the same encryption key. This value is stored unencrypted
+ * and must simply be provided to the decryption function. We use a 96 bit IV
+ * (as recommended by NIST) for all block encryption. For non-dedup blocks we
+ * derive the IV randomly. The first 64 bits of the IV are stored in the second
+ * word of DVA[2] and the remaining 32 bits are stored in the upper 32 bits of
+ * blk_fill. This is safe because encrypted blocks can't use the upper 32 bits
+ * of blk_fill. We only encrypt level 0 blocks, which normally have a fill count
+ * of 1. The only exception is for DMU_OT_DNODE objects, where the fill count of
+ * level 0 blocks is the number of allocated dnodes in that block. The on-disk
+ * format supports at most 2^15 slots per L0 dnode block, because the maximum
+ * block size is 16MB (2^24). In either case, for level 0 blocks this number
+ * will still be smaller than UINT32_MAX so it is safe to store the IV in the
+ * top 32 bits of blk_fill, while leaving the bottom 32 bits of the fill count
+ * for the dnode code.
+ *
+ * Master key:
+ * This is the most important secret data of an encrypted dataset. It is used
+ * along with the salt to generate the actual encryption keys via HKDF. We
+ * do not use the master key to directly encrypt any data because there are
+ * theoretical limits on how much data can actually be safely encrypted with
+ * any encryption mode. The master key is stored encrypted on disk with the
+ * user's wrapping key. Its length is determined by the encryption algorithm.
+ * For details on how this is stored see the block comment in dsl_crypt.c
+ *
+ * Salt:
+ * Used as an input to the HKDF function, along with the master key. We use a
+ * 64 bit salt, stored unencrypted in the first word of DVA[2]. Any given salt
+ * can be used for encrypting many blocks, so we cache the current salt and the
+ * associated derived key in zio_crypt_t so we do not need to derive it again
+ * needlessly.
+ *
+ * Encryption Key:
+ * A secret binary key, generated from an HKDF function used to encrypt and
+ * decrypt data.
+ *
+ * Message Authentication Code (MAC)
+ * The MAC is an output of authenticated encryption modes such as AES-GCM and
+ * AES-CCM. Its purpose is to ensure that an attacker cannot modify encrypted
+ * data on disk and return garbage to the application. Effectively, it is a
+ * checksum that cannot be reproduced by an attacker. We store the MAC in the
+ * second 128 bits of blk_cksum, leaving the first 128 bits for a truncated
+ * regular checksum of the ciphertext which can be used for scrubbing.
+ *
+ * OBJECT AUTHENTICATION:
+ * Some object types, such as DMU_OT_MASTER_NODE cannot be encrypted because
+ * they contain some info that always needs to be readable. To prevent this
+ * data from being altered, we authenticate this data using SHA512-HMAC. This
+ * will produce a MAC (similar to the one produced via encryption) which can
+ * be used to verify the object was not modified. HMACs do not require key
+ * rotation or IVs, so we can keep up to the full 3 copies of authenticated
+ * data.
+ *
+ * ZIL ENCRYPTION:
+ * ZIL blocks have their bp written to disk ahead of the associated data, so we
+ * cannot store the MAC there as we normally do. For these blocks the MAC is
+ * stored in the embedded checksum within the zil_chain_t header. The salt and
+ * IV are generated for the block on bp allocation instead of at encryption
+ * time. In addition, ZIL blocks have some pieces that must be left in plaintext
+ * for claiming even though all of the sensitive user data still needs to be
+ * encrypted. The function zio_crypt_init_uios_zil() handles parsing which
+ * pieces of the block need to be encrypted. All data that is not encrypted is
+ * authenticated using the AAD mechanisms that the supported encryption modes
+ * provide for. In order to preserve the semantics of the ZIL for encrypted
+ * datasets, the ZIL is not protected at the objset level as described below.
+ *
+ * DNODE ENCRYPTION:
+ * Similarly to ZIL blocks, the core part of each dnode_phys_t needs to be left
+ * in plaintext for scrubbing and claiming, but the bonus buffers might contain
+ * sensitive user data. The function zio_crypt_init_uios_dnode() handles parsing
+ * which pieces of the block need to be encrypted. For more details about
+ * dnode authentication and encryption, see zio_crypt_init_uios_dnode().
+ *
+ * OBJECT SET AUTHENTICATION:
+ * Up to this point, everything we have encrypted and authenticated has been
+ * at level 0 (or -2 for the ZIL). If we did not do any further work the
+ * on-disk format would be susceptible to attacks that deleted or rearranged
+ * the order of level 0 blocks. Ideally, the cleanest solution would be to
+ * maintain a tree of authentication MACs going up the bp tree. However, this
+ * presents a problem for raw sends. Send files do not send information about
+ * indirect blocks so there would be no convenient way to transfer the MACs and
+ * they cannot be recalculated on the receive side without the master key which
+ * would defeat one of the purposes of raw sends in the first place. Instead,
+ * for the indirect levels of the bp tree, we use a regular SHA512 of the MACs
+ * from the level below. We also include some portable fields from blk_prop such
+ * as the lsize and compression algorithm to prevent the data from being
+ * misinterpreted.
+ *
+ * At the objset level, we maintain 2 separate 256 bit MACs in the
+ * objset_phys_t. The first one is "portable" and is the logical root of the
+ * MAC tree maintained in the metadnode's bps. The second is "local" and is
+ * used as the root MAC for the user accounting objects, which are also not
+ * transferred via "zfs send". The portable MAC is sent in the DRR_BEGIN payload
+ * of the send file. The useraccounting code ensures that the useraccounting
+ * info is not present upon a receive, so the local MAC can simply be cleared
+ * out at that time. For more info about objset_phys_t authentication, see
+ * zio_crypt_do_objset_hmacs().
+ *
+ * CONSIDERATIONS FOR DEDUP:
+ * In order for dedup to work, blocks that we want to dedup with one another
+ * need to use the same IV and encryption key, so that they will have the same
+ * ciphertext. Normally, one should never reuse an IV with the same encryption
+ * key or else AES-GCM and AES-CCM can both actually leak the plaintext of both
+ * blocks. In this case, however, since we are using the same plaintext as
+ * well, all that we end up with is a duplicate of the original ciphertext we
+ * already had. As a result, an attacker with read access to the raw disk will
+ * be able to tell which blocks are the same but this information is given away
+ * by dedup anyway. In order to get the same IVs and encryption keys for
+ * equivalent blocks of data we use an HMAC of the plaintext. We use an HMAC
+ * here so that a reproducible checksum of the plaintext is never available to
+ * the attacker. The HMAC key is kept alongside the master key, encrypted on
+ * disk. The first 64 bits of the HMAC are used in place of the random salt, and
+ * the next 96 bits are used as the IV. As a result of this mechanism, dedup
+ * will only work within a clone family since encrypted dedup requires use of
+ * the same master and HMAC keys.
+ */
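+
+/*
+ * Quick reference for where the parameters described above are stored on
+ * disk (see zio_crypt_encode_params_bp(), zio_crypt_encode_mac_bp() and
+ * zio_crypt_encode_mac_zil() below):
+ *
+ *	salt (64 bits):		blk_dva[2].dva_word[0]
+ *	IV bits 0..63:		blk_dva[2].dva_word[1]
+ *	IV bits 64..95:		upper 32 bits of blk_fill (IV2)
+ *	block MAC (128 bits):	blk_cksum words 2 and 3
+ *	ZIL block MAC:		embedded checksum in the zil_chain_t header
+ *	objset MACs:		portable and local MACs in the objset_phys_t
+ */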
+
+/*
+ * After encrypting many blocks with the same key we may start to run up
+ * against the theoretical limits of how much data can securely be encrypted
+ * with a single key using the supported encryption modes. The most obvious
+ * limitation is that our risk of generating 2 equivalent 96 bit IVs increases
+ * the more IVs we generate (which both GCM and CCM modes strictly forbid).
+ * This risk actually grows surprisingly quickly over time according to the
+ * Birthday Problem. With a total IV space of 2^96 values, and assuming we
+ * have generated n IVs with a cryptographically secure RNG, the approximate
+ * probability p(n) of a collision is given as:
+ *
+ * p(n) ~= 1 - e^(-n*(n-1)/(2*(2^96)))
+ *
+ * [http://www.math.cornell.edu/~mec/2008-2009/TianyiZheng/Birthday.html]
+ *
+ * Assuming that we want to ensure that p(n) never goes over 1 / 1 trillion
+ * we must not write more than 398,065,730 blocks with the same encryption key.
+ * Therefore, we rotate our keys after 400,000,000 blocks have been written by
+ * generating a new random 64 bit salt for our HKDF encryption key generation
+ * function. A worked example of this arithmetic follows the definitions
+ * below.
+ */
+#define ZFS_KEY_MAX_SALT_USES_DEFAULT 400000000
+#define ZFS_CURRENT_MAX_SALT_USES \
+ (MIN(zfs_key_max_salt_uses, ZFS_KEY_MAX_SALT_USES_DEFAULT))
+unsigned long zfs_key_max_salt_uses = ZFS_KEY_MAX_SALT_USES_DEFAULT;
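+
+/*
+ * Worked example of the bound above (editorial illustration only; nothing
+ * in the code depends on it): with n = 398,065,730,
+ *
+ *	n*(n-1)/2          ~= 7.92e16
+ *	2^96               ~= 7.92e28
+ *	n*(n-1)/(2*(2^96)) ~= 1.0e-12
+ *
+ * so p(n) ~= 1 - e^(-1.0e-12) ~= 1.0e-12, i.e. one chance in a trillion.
+ * The 400,000,000 default above is that figure rounded up.
+ */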
+
+/*
+ * Set to a nonzero value to cause zio_do_crypt_uio() to fail 1/this many
+ * calls, to test decryption error handling code paths.
+ */
+uint64_t zio_decrypt_fail_fraction = 0;
+
+typedef struct blkptr_auth_buf {
+ uint64_t bab_prop; /* blk_prop - portable mask */
+ uint8_t bab_mac[ZIO_DATA_MAC_LEN]; /* MAC from blk_cksum */
+ uint64_t bab_pad; /* reserved for future use */
+} blkptr_auth_buf_t;
+
+zio_crypt_info_t zio_crypt_table[ZIO_CRYPT_FUNCTIONS] = {
+ {"", ZC_TYPE_NONE, 0, "inherit"},
+ {"", ZC_TYPE_NONE, 0, "on"},
+ {"", ZC_TYPE_NONE, 0, "off"},
+ {SUN_CKM_AES_CCM, ZC_TYPE_CCM, 16, "aes-128-ccm"},
+ {SUN_CKM_AES_CCM, ZC_TYPE_CCM, 24, "aes-192-ccm"},
+ {SUN_CKM_AES_CCM, ZC_TYPE_CCM, 32, "aes-256-ccm"},
+ {SUN_CKM_AES_GCM, ZC_TYPE_GCM, 16, "aes-128-gcm"},
+ {SUN_CKM_AES_GCM, ZC_TYPE_GCM, 24, "aes-192-gcm"},
+ {SUN_CKM_AES_GCM, ZC_TYPE_GCM, 32, "aes-256-gcm"}
+};
+
+static void
+zio_crypt_key_destroy_early(zio_crypt_key_t *key)
+{
+ rw_destroy(&key->zk_salt_lock);
+
+ /* free crypto templates */
+ bzero(&key->zk_session, sizeof (key->zk_session));
+
+ /* zero out sensitive data */
+ bzero(key, sizeof (zio_crypt_key_t));
+
+void
+zio_crypt_key_destroy(zio_crypt_key_t *key)
+{
+
+ freebsd_crypt_freesession(&key->zk_session);
+ zio_crypt_key_destroy_early(key);
+}
+
+int
+zio_crypt_key_init(uint64_t crypt, zio_crypt_key_t *key)
+{
+ int ret;
+ crypto_mechanism_t mech __unused;
+ uint_t keydata_len;
+ zio_crypt_info_t *ci = NULL;
+
+ ASSERT(key != NULL);
+ ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS);
+
+ ci = &zio_crypt_table[crypt];
+ if (ci->ci_crypt_type != ZC_TYPE_GCM &&
+ ci->ci_crypt_type != ZC_TYPE_CCM)
+ return (ENOTSUP);
+
+ keydata_len = zio_crypt_table[crypt].ci_keylen;
+ bzero(key, sizeof (zio_crypt_key_t));
+ rw_init(&key->zk_salt_lock, NULL, RW_DEFAULT, NULL);
+
+ /* fill keydata buffers and salt with random data */
+ ret = random_get_bytes((uint8_t *)&key->zk_guid, sizeof (uint64_t));
+ if (ret != 0)
+ goto error;
+
+ ret = random_get_bytes(key->zk_master_keydata, keydata_len);
+ if (ret != 0)
+ goto error;
+
+ ret = random_get_bytes(key->zk_hmac_keydata, SHA512_HMAC_KEYLEN);
+ if (ret != 0)
+ goto error;
+
+ ret = random_get_bytes(key->zk_salt, ZIO_DATA_SALT_LEN);
+ if (ret != 0)
+ goto error;
+
+ /* derive the current key from the master key */
+ ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0,
+ key->zk_salt, ZIO_DATA_SALT_LEN, key->zk_current_keydata,
+ keydata_len);
+ if (ret != 0)
+ goto error;
+
+ /* initialize keys for the ICP */
+ key->zk_current_key.ck_format = CRYPTO_KEY_RAW;
+ key->zk_current_key.ck_data = key->zk_current_keydata;
+ key->zk_current_key.ck_length = CRYPTO_BYTES2BITS(keydata_len);
+
+ key->zk_hmac_key.ck_format = CRYPTO_KEY_RAW;
+	key->zk_hmac_key.ck_data = key->zk_hmac_keydata;
+ key->zk_hmac_key.ck_length = CRYPTO_BYTES2BITS(SHA512_HMAC_KEYLEN);
+
+ ci = &zio_crypt_table[crypt];
+ if (ci->ci_crypt_type != ZC_TYPE_GCM &&
+ ci->ci_crypt_type != ZC_TYPE_CCM)
+ return (ENOTSUP);
+
+ ret = freebsd_crypt_newsession(&key->zk_session, ci,
+ &key->zk_current_key);
+ if (ret)
+ goto error;
+
+ key->zk_crypt = crypt;
+ key->zk_version = ZIO_CRYPT_KEY_CURRENT_VERSION;
+ key->zk_salt_count = 0;
+
+ return (0);
+
+error:
+ zio_crypt_key_destroy_early(key);
+ return (ret);
+}
+
+static int
+zio_crypt_key_change_salt(zio_crypt_key_t *key)
+{
+ int ret = 0;
+ uint8_t salt[ZIO_DATA_SALT_LEN];
+ crypto_mechanism_t mech __unused;
+
+ uint_t keydata_len = zio_crypt_table[key->zk_crypt].ci_keylen;
+
+ /* generate a new salt */
+ ret = random_get_bytes(salt, ZIO_DATA_SALT_LEN);
+ if (ret != 0)
+ goto error;
+
+ rw_enter(&key->zk_salt_lock, RW_WRITER);
+
+ /* someone beat us to the salt rotation, just unlock and return */
+ if (key->zk_salt_count < ZFS_CURRENT_MAX_SALT_USES)
+ goto out_unlock;
+
+ /* derive the current key from the master key and the new salt */
+ ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0,
+ salt, ZIO_DATA_SALT_LEN, key->zk_current_keydata, keydata_len);
+ if (ret != 0)
+ goto out_unlock;
+
+ /* assign the salt and reset the usage count */
+ bcopy(salt, key->zk_salt, ZIO_DATA_SALT_LEN);
+ key->zk_salt_count = 0;
+
+ freebsd_crypt_freesession(&key->zk_session);
+ ret = freebsd_crypt_newsession(&key->zk_session,
+ &zio_crypt_table[key->zk_crypt], &key->zk_current_key);
+ if (ret != 0)
+ goto out_unlock;
+
+ rw_exit(&key->zk_salt_lock);
+
+ return (0);
+
+out_unlock:
+ rw_exit(&key->zk_salt_lock);
+error:
+ return (ret);
+}
+
+/* See comment above zfs_key_max_salt_uses definition for details */
+int
+zio_crypt_key_get_salt(zio_crypt_key_t *key, uint8_t *salt)
+{
+ int ret;
+ boolean_t salt_change;
+
+ rw_enter(&key->zk_salt_lock, RW_READER);
+
+ bcopy(key->zk_salt, salt, ZIO_DATA_SALT_LEN);
+ salt_change = (atomic_inc_64_nv(&key->zk_salt_count) >=
+ ZFS_CURRENT_MAX_SALT_USES);
+
+ rw_exit(&key->zk_salt_lock);
+
+ if (salt_change) {
+ ret = zio_crypt_key_change_salt(key);
+ if (ret != 0)
+ goto error;
+ }
+
+ return (0);
+
+error:
+ return (ret);
+}
+
+void *failed_decrypt_buf;
+int failed_decrypt_size;
+
+/*
+ * This function handles all encryption and decryption in zfs. When
+ * encrypting it expects puio to reference the plaintext and cuio to
+ * reference the ciphertext. cuio must have enough space for the
+ * ciphertext + room for a MAC. datalen should be the length of the
+ * plaintext / ciphertext alone.
+ */
+/*
+ * The implementation for FreeBSD's OpenCrypto.
+ *
+ * The big difference between ICP and FOC is that FOC uses a single
+ * buffer for input and output. This means that (for AES-GCM, the
+ * only one supported right now) the source must be copied into the
+ * destination, and the destination must have the AAD, and the tag/MAC,
+ * already associated with it. (Both implementations can use a uio.)
+ *
+ * Since the auth data is part of the iovec array, all we need to know
+ * is the length: 0 means there's no AAD.
+ *
+ */
+static int
+zio_do_crypt_uio_opencrypto(boolean_t encrypt, freebsd_crypt_session_t *sess,
+ uint64_t crypt, crypto_key_t *key, uint8_t *ivbuf, uint_t datalen,
+ zfs_uio_t *uio, uint_t auth_len)
+{
+ zio_crypt_info_t *ci;
+ int ret;
+
+ ci = &zio_crypt_table[crypt];
+ if (ci->ci_crypt_type != ZC_TYPE_GCM &&
+ ci->ci_crypt_type != ZC_TYPE_CCM)
+ return (ENOTSUP);
+
+
+ ret = freebsd_crypt_uio(encrypt, sess, ci, uio, key, ivbuf,
+ datalen, auth_len);
+ if (ret != 0) {
+#ifdef FCRYPTO_DEBUG
+ printf("%s(%d): Returning error %s\n",
+ __FUNCTION__, __LINE__, encrypt ? "EIO" : "ECKSUM");
+#endif
+ ret = SET_ERROR(encrypt ? EIO : ECKSUM);
+ }
+
+ return (ret);
+}
+
+int
+zio_crypt_key_wrap(crypto_key_t *cwkey, zio_crypt_key_t *key, uint8_t *iv,
+ uint8_t *mac, uint8_t *keydata_out, uint8_t *hmac_keydata_out)
+{
+ int ret;
+ uint64_t aad[3];
+ /*
+ * With OpenCrypto in FreeBSD, the same buffer is used for
+	 * input and output. Also, the AAD (for AES-GCM at least)
+ * needs to logically go in front.
+ */
+ zfs_uio_t cuio;
+ struct uio cuio_s;
+ iovec_t iovecs[4];
+ uint64_t crypt = key->zk_crypt;
+ uint_t enc_len, keydata_len, aad_len;
+
+ ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS);
+ ASSERT3U(cwkey->ck_format, ==, CRYPTO_KEY_RAW);
+
+ zfs_uio_init(&cuio, &cuio_s);
+
+ keydata_len = zio_crypt_table[crypt].ci_keylen;
+
+ /* generate iv for wrapping the master and hmac key */
+ ret = random_get_pseudo_bytes(iv, WRAPPING_IV_LEN);
+ if (ret != 0)
+ goto error;
+
+ /*
+ * Since we only support one buffer, we need to copy
+ * the plain text (source) to the cipher buffer (dest).
+ * We set iovecs[0] -- the authentication data -- below.
+ */
+ bcopy((void*)key->zk_master_keydata, keydata_out, keydata_len);
+ bcopy((void*)key->zk_hmac_keydata, hmac_keydata_out,
+ SHA512_HMAC_KEYLEN);
+ iovecs[1].iov_base = keydata_out;
+ iovecs[1].iov_len = keydata_len;
+ iovecs[2].iov_base = hmac_keydata_out;
+ iovecs[2].iov_len = SHA512_HMAC_KEYLEN;
+ iovecs[3].iov_base = mac;
+ iovecs[3].iov_len = WRAPPING_MAC_LEN;
+
+ /*
+ * Although we don't support writing to the old format, we do
+ * support rewrapping the key so that the user can move and
+ * quarantine datasets on the old format.
+ */
+ if (key->zk_version == 0) {
+ aad_len = sizeof (uint64_t);
+ aad[0] = LE_64(key->zk_guid);
+ } else {
+ ASSERT3U(key->zk_version, ==, ZIO_CRYPT_KEY_CURRENT_VERSION);
+ aad_len = sizeof (uint64_t) * 3;
+ aad[0] = LE_64(key->zk_guid);
+ aad[1] = LE_64(crypt);
+ aad[2] = LE_64(key->zk_version);
+ }
+
+ iovecs[0].iov_base = aad;
+ iovecs[0].iov_len = aad_len;
+ enc_len = zio_crypt_table[crypt].ci_keylen + SHA512_HMAC_KEYLEN;
+
+ GET_UIO_STRUCT(&cuio)->uio_iov = iovecs;
+ zfs_uio_iovcnt(&cuio) = 4;
+ zfs_uio_segflg(&cuio) = UIO_SYSSPACE;
+
+ /* encrypt the keys and store the resulting ciphertext and mac */
+ ret = zio_do_crypt_uio_opencrypto(B_TRUE, NULL, crypt, cwkey,
+ iv, enc_len, &cuio, aad_len);
+ if (ret != 0)
+ goto error;
+
+ return (0);
+
+error:
+ return (ret);
+}
+
+int
+zio_crypt_key_unwrap(crypto_key_t *cwkey, uint64_t crypt, uint64_t version,
+ uint64_t guid, uint8_t *keydata, uint8_t *hmac_keydata, uint8_t *iv,
+ uint8_t *mac, zio_crypt_key_t *key)
+{
+ int ret;
+ uint64_t aad[3];
+ /*
+ * With OpenCrypto in FreeBSD, the same buffer is used for
+	 * input and output. Also, the AAD (for AES-GCM at least)
+ * needs to logically go in front.
+ */
+ zfs_uio_t cuio;
+ struct uio cuio_s;
+ iovec_t iovecs[4];
+ void *src, *dst;
+ uint_t enc_len, keydata_len, aad_len;
+
+ ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS);
+ ASSERT3U(cwkey->ck_format, ==, CRYPTO_KEY_RAW);
+
+ keydata_len = zio_crypt_table[crypt].ci_keylen;
+ rw_init(&key->zk_salt_lock, NULL, RW_DEFAULT, NULL);
+
+ zfs_uio_init(&cuio, &cuio_s);
+
+ /*
+ * Since we only support one buffer, we need to copy
+ * the encrypted buffer (source) to the plain buffer
+ * (dest). We set iovecs[0] -- the authentication data --
+ * below.
+ */
+ dst = key->zk_master_keydata;
+ src = keydata;
+
+ bcopy(src, dst, keydata_len);
+
+ dst = key->zk_hmac_keydata;
+ src = hmac_keydata;
+ bcopy(src, dst, SHA512_HMAC_KEYLEN);
+
+ iovecs[1].iov_base = key->zk_master_keydata;
+ iovecs[1].iov_len = keydata_len;
+ iovecs[2].iov_base = key->zk_hmac_keydata;
+ iovecs[2].iov_len = SHA512_HMAC_KEYLEN;
+ iovecs[3].iov_base = mac;
+ iovecs[3].iov_len = WRAPPING_MAC_LEN;
+
+ if (version == 0) {
+ aad_len = sizeof (uint64_t);
+ aad[0] = LE_64(guid);
+ } else {
+ ASSERT3U(version, ==, ZIO_CRYPT_KEY_CURRENT_VERSION);
+ aad_len = sizeof (uint64_t) * 3;
+ aad[0] = LE_64(guid);
+ aad[1] = LE_64(crypt);
+ aad[2] = LE_64(version);
+ }
+
+ enc_len = keydata_len + SHA512_HMAC_KEYLEN;
+ iovecs[0].iov_base = aad;
+ iovecs[0].iov_len = aad_len;
+
+ GET_UIO_STRUCT(&cuio)->uio_iov = iovecs;
+ zfs_uio_iovcnt(&cuio) = 4;
+ zfs_uio_segflg(&cuio) = UIO_SYSSPACE;
+
+ /* decrypt the keys and store the result in the output buffers */
+ ret = zio_do_crypt_uio_opencrypto(B_FALSE, NULL, crypt, cwkey,
+ iv, enc_len, &cuio, aad_len);
+
+ if (ret != 0)
+ goto error;
+
+ /* generate a fresh salt */
+ ret = random_get_bytes(key->zk_salt, ZIO_DATA_SALT_LEN);
+ if (ret != 0)
+ goto error;
+
+ /* derive the current key from the master key */
+ ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0,
+ key->zk_salt, ZIO_DATA_SALT_LEN, key->zk_current_keydata,
+ keydata_len);
+ if (ret != 0)
+ goto error;
+
+ /* initialize keys for ICP */
+ key->zk_current_key.ck_format = CRYPTO_KEY_RAW;
+ key->zk_current_key.ck_data = key->zk_current_keydata;
+ key->zk_current_key.ck_length = CRYPTO_BYTES2BITS(keydata_len);
+
+ key->zk_hmac_key.ck_format = CRYPTO_KEY_RAW;
+ key->zk_hmac_key.ck_data = key->zk_hmac_keydata;
+ key->zk_hmac_key.ck_length = CRYPTO_BYTES2BITS(SHA512_HMAC_KEYLEN);
+
+ ret = freebsd_crypt_newsession(&key->zk_session,
+ &zio_crypt_table[crypt], &key->zk_current_key);
+ if (ret != 0)
+ goto error;
+
+ key->zk_crypt = crypt;
+ key->zk_version = version;
+ key->zk_guid = guid;
+ key->zk_salt_count = 0;
+
+ return (0);
+
+error:
+ zio_crypt_key_destroy_early(key);
+ return (ret);
+}
+
+int
+zio_crypt_generate_iv(uint8_t *ivbuf)
+{
+ int ret;
+
+ /* randomly generate the IV */
+ ret = random_get_pseudo_bytes(ivbuf, ZIO_DATA_IV_LEN);
+ if (ret != 0)
+ goto error;
+
+ return (0);
+
+error:
+ bzero(ivbuf, ZIO_DATA_IV_LEN);
+ return (ret);
+}
+
+int
+zio_crypt_do_hmac(zio_crypt_key_t *key, uint8_t *data, uint_t datalen,
+ uint8_t *digestbuf, uint_t digestlen)
+{
+ uint8_t raw_digestbuf[SHA512_DIGEST_LENGTH];
+
+ ASSERT3U(digestlen, <=, SHA512_DIGEST_LENGTH);
+
+ crypto_mac(&key->zk_hmac_key, data, datalen,
+ raw_digestbuf, SHA512_DIGEST_LENGTH);
+
+ bcopy(raw_digestbuf, digestbuf, digestlen);
+
+ return (0);
+}
+
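+/*
+ * Derive a deterministic salt and IV for a dedup block from an HMAC of its
+ * plaintext (see "CONSIDERATIONS FOR DEDUP" above): the first
+ * ZIO_DATA_SALT_LEN bytes of the digest become the salt and the next
+ * ZIO_DATA_IV_LEN bytes become the IV, so equal plaintexts within a clone
+ * family produce equal ciphertexts.
+ */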
+int
+zio_crypt_generate_iv_salt_dedup(zio_crypt_key_t *key, uint8_t *data,
+ uint_t datalen, uint8_t *ivbuf, uint8_t *salt)
+{
+ int ret;
+ uint8_t digestbuf[SHA512_DIGEST_LENGTH];
+
+ ret = zio_crypt_do_hmac(key, data, datalen,
+ digestbuf, SHA512_DIGEST_LENGTH);
+ if (ret != 0)
+ return (ret);
+
+ bcopy(digestbuf, salt, ZIO_DATA_SALT_LEN);
+ bcopy(digestbuf + ZIO_DATA_SALT_LEN, ivbuf, ZIO_DATA_IV_LEN);
+
+ return (0);
+}
+
+/*
+ * The following functions are used to encode and decode encryption parameters
+ * into blkptr_t and zil_header_t. The ICP wants to use these parameters as
+ * byte strings, which normally means that these strings would not need to deal
+ * with byteswapping at all. However, both blkptr_t and zil_header_t may be
+ * byteswapped by lower layers and so we must "undo" that byteswap here upon
+ * decoding and encoding in a non-native byteorder. These functions require
+ * that the byteorder bit is correct before being called.
+ */
+void
+zio_crypt_encode_params_bp(blkptr_t *bp, uint8_t *salt, uint8_t *iv)
+{
+ uint64_t val64;
+ uint32_t val32;
+
+ ASSERT(BP_IS_ENCRYPTED(bp));
+
+ if (!BP_SHOULD_BYTESWAP(bp)) {
+ bcopy(salt, &bp->blk_dva[2].dva_word[0], sizeof (uint64_t));
+ bcopy(iv, &bp->blk_dva[2].dva_word[1], sizeof (uint64_t));
+ bcopy(iv + sizeof (uint64_t), &val32, sizeof (uint32_t));
+ BP_SET_IV2(bp, val32);
+ } else {
+ bcopy(salt, &val64, sizeof (uint64_t));
+ bp->blk_dva[2].dva_word[0] = BSWAP_64(val64);
+
+ bcopy(iv, &val64, sizeof (uint64_t));
+ bp->blk_dva[2].dva_word[1] = BSWAP_64(val64);
+
+ bcopy(iv + sizeof (uint64_t), &val32, sizeof (uint32_t));
+ BP_SET_IV2(bp, BSWAP_32(val32));
+ }
+}
+
+void
+zio_crypt_decode_params_bp(const blkptr_t *bp, uint8_t *salt, uint8_t *iv)
+{
+ uint64_t val64;
+ uint32_t val32;
+
+ ASSERT(BP_IS_PROTECTED(bp));
+
+ /* for convenience, so callers don't need to check */
+ if (BP_IS_AUTHENTICATED(bp)) {
+ bzero(salt, ZIO_DATA_SALT_LEN);
+ bzero(iv, ZIO_DATA_IV_LEN);
+ return;
+ }
+
+ if (!BP_SHOULD_BYTESWAP(bp)) {
+ bcopy(&bp->blk_dva[2].dva_word[0], salt, sizeof (uint64_t));
+ bcopy(&bp->blk_dva[2].dva_word[1], iv, sizeof (uint64_t));
+
+ val32 = (uint32_t)BP_GET_IV2(bp);
+ bcopy(&val32, iv + sizeof (uint64_t), sizeof (uint32_t));
+ } else {
+ val64 = BSWAP_64(bp->blk_dva[2].dva_word[0]);
+ bcopy(&val64, salt, sizeof (uint64_t));
+
+ val64 = BSWAP_64(bp->blk_dva[2].dva_word[1]);
+ bcopy(&val64, iv, sizeof (uint64_t));
+
+ val32 = BSWAP_32((uint32_t)BP_GET_IV2(bp));
+ bcopy(&val32, iv + sizeof (uint64_t), sizeof (uint32_t));
+ }
+}
+
+void
+zio_crypt_encode_mac_bp(blkptr_t *bp, uint8_t *mac)
+{
+ uint64_t val64;
+
+ ASSERT(BP_USES_CRYPT(bp));
+ ASSERT3U(BP_GET_TYPE(bp), !=, DMU_OT_OBJSET);
+
+ if (!BP_SHOULD_BYTESWAP(bp)) {
+ bcopy(mac, &bp->blk_cksum.zc_word[2], sizeof (uint64_t));
+ bcopy(mac + sizeof (uint64_t), &bp->blk_cksum.zc_word[3],
+ sizeof (uint64_t));
+ } else {
+ bcopy(mac, &val64, sizeof (uint64_t));
+ bp->blk_cksum.zc_word[2] = BSWAP_64(val64);
+
+ bcopy(mac + sizeof (uint64_t), &val64, sizeof (uint64_t));
+ bp->blk_cksum.zc_word[3] = BSWAP_64(val64);
+ }
+}
+
+void
+zio_crypt_decode_mac_bp(const blkptr_t *bp, uint8_t *mac)
+{
+ uint64_t val64;
+
+ ASSERT(BP_USES_CRYPT(bp) || BP_IS_HOLE(bp));
+
+ /* for convenience, so callers don't need to check */
+ if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
+ bzero(mac, ZIO_DATA_MAC_LEN);
+ return;
+ }
+
+ if (!BP_SHOULD_BYTESWAP(bp)) {
+ bcopy(&bp->blk_cksum.zc_word[2], mac, sizeof (uint64_t));
+ bcopy(&bp->blk_cksum.zc_word[3], mac + sizeof (uint64_t),
+ sizeof (uint64_t));
+ } else {
+ val64 = BSWAP_64(bp->blk_cksum.zc_word[2]);
+ bcopy(&val64, mac, sizeof (uint64_t));
+
+ val64 = BSWAP_64(bp->blk_cksum.zc_word[3]);
+ bcopy(&val64, mac + sizeof (uint64_t), sizeof (uint64_t));
+ }
+}
+
+void
+zio_crypt_encode_mac_zil(void *data, uint8_t *mac)
+{
+ zil_chain_t *zilc = data;
+
+ bcopy(mac, &zilc->zc_eck.zec_cksum.zc_word[2], sizeof (uint64_t));
+ bcopy(mac + sizeof (uint64_t), &zilc->zc_eck.zec_cksum.zc_word[3],
+ sizeof (uint64_t));
+}
+
+void
+zio_crypt_decode_mac_zil(const void *data, uint8_t *mac)
+{
+ /*
+ * The ZIL MAC is embedded in the block it protects, which will
+ * not have been byteswapped by the time this function has been called.
+ * As a result, we don't need to worry about byteswapping the MAC.
+ */
+ const zil_chain_t *zilc = data;
+
+ bcopy(&zilc->zc_eck.zec_cksum.zc_word[2], mac, sizeof (uint64_t));
+ bcopy(&zilc->zc_eck.zec_cksum.zc_word[3], mac + sizeof (uint64_t),
+ sizeof (uint64_t));
+}
+
+/*
+ * This routine takes a block of dnodes (src_abd) and copies only the bonus
+ * buffers to the same offsets in the dst buffer. datalen should be the size
+ * of both the src_abd and the dst buffer (not just the length of the bonus
+ * buffers).
+ */
+void
+zio_crypt_copy_dnode_bonus(abd_t *src_abd, uint8_t *dst, uint_t datalen)
+{
+ uint_t i, max_dnp = datalen >> DNODE_SHIFT;
+ uint8_t *src;
+ dnode_phys_t *dnp, *sdnp, *ddnp;
+
+ src = abd_borrow_buf_copy(src_abd, datalen);
+
+ sdnp = (dnode_phys_t *)src;
+ ddnp = (dnode_phys_t *)dst;
+
+ for (i = 0; i < max_dnp; i += sdnp[i].dn_extra_slots + 1) {
+ dnp = &sdnp[i];
+ if (dnp->dn_type != DMU_OT_NONE &&
+ DMU_OT_IS_ENCRYPTED(dnp->dn_bonustype) &&
+ dnp->dn_bonuslen != 0) {
+ bcopy(DN_BONUS(dnp), DN_BONUS(&ddnp[i]),
+ DN_MAX_BONUS_LEN(dnp));
+ }
+ }
+
+ abd_return_buf(src_abd, src, datalen);
+}
+
+/*
+ * This function decides which fields from blk_prop are included in
+ * the MACs computed by the various on-disk MAC algorithms.
+ */
+static void
+zio_crypt_bp_zero_nonportable_blkprop(blkptr_t *bp, uint64_t version)
+{
+ int avoidlint = SPA_MINBLOCKSIZE;
+ /*
+ * Version 0 did not properly zero out all non-portable fields
+ * as it should have done. We maintain this code so that we can
+ * do read-only imports of pools on this version.
+ */
+ if (version == 0) {
+ BP_SET_DEDUP(bp, 0);
+ BP_SET_CHECKSUM(bp, 0);
+ BP_SET_PSIZE(bp, avoidlint);
+ return;
+ }
+
+ ASSERT3U(version, ==, ZIO_CRYPT_KEY_CURRENT_VERSION);
+
+ /*
+ * The hole_birth feature might set these fields even if this bp
+ * is a hole. We zero them out here to guarantee that raw sends
+ * will function with or without the feature.
+ */
+ if (BP_IS_HOLE(bp)) {
+ bp->blk_prop = 0ULL;
+ return;
+ }
+
+ /*
+ * At L0 we want to verify these fields to ensure that data blocks
+ * can not be reinterpreted. For instance, we do not want an attacker
+ * to trick us into returning raw lz4 compressed data to the user
+ * by modifying the compression bits. At higher levels, we cannot
+ * enforce this policy since raw sends do not convey any information
+ * about indirect blocks, so these values might be different on the
+ * receive side. Fortunately, this does not open any new attack
+ * vectors, since any alterations that can be made to a higher level
+ * bp must still verify the correct order of the layer below it.
+ */
+ if (BP_GET_LEVEL(bp) != 0) {
+ BP_SET_BYTEORDER(bp, 0);
+ BP_SET_COMPRESS(bp, 0);
+
+ /*
+ * psize cannot be set to zero or it will trigger
+ * asserts, but the value doesn't really matter as
+ * long as it is constant.
+ */
+ BP_SET_PSIZE(bp, avoidlint);
+ }
+
+ BP_SET_DEDUP(bp, 0);
+ BP_SET_CHECKSUM(bp, 0);
+}
+
+static void
+zio_crypt_bp_auth_init(uint64_t version, boolean_t should_bswap, blkptr_t *bp,
+ blkptr_auth_buf_t *bab, uint_t *bab_len)
+{
+ blkptr_t tmpbp = *bp;
+
+ if (should_bswap)
+ byteswap_uint64_array(&tmpbp, sizeof (blkptr_t));
+
+ ASSERT(BP_USES_CRYPT(&tmpbp) || BP_IS_HOLE(&tmpbp));
+ ASSERT0(BP_IS_EMBEDDED(&tmpbp));
+
+ zio_crypt_decode_mac_bp(&tmpbp, bab->bab_mac);
+
+ /*
+ * We always MAC blk_prop in LE to ensure portability. This
+ * must be done after decoding the mac, since the endianness
+ * will get zero'd out here.
+ */
+ zio_crypt_bp_zero_nonportable_blkprop(&tmpbp, version);
+ bab->bab_prop = LE_64(tmpbp.blk_prop);
+ bab->bab_pad = 0ULL;
+
+ /* version 0 did not include the padding */
+ *bab_len = sizeof (blkptr_auth_buf_t);
+ if (version == 0)
+ *bab_len -= sizeof (uint64_t);
+}
+
+static int
+zio_crypt_bp_do_hmac_updates(crypto_context_t ctx, uint64_t version,
+ boolean_t should_bswap, blkptr_t *bp)
+{
+ uint_t bab_len;
+ blkptr_auth_buf_t bab;
+
+ zio_crypt_bp_auth_init(version, should_bswap, bp, &bab, &bab_len);
+ crypto_mac_update(ctx, &bab, bab_len);
+
+ return (0);
+}
+
+static void
+zio_crypt_bp_do_indrect_checksum_updates(SHA2_CTX *ctx, uint64_t version,
+ boolean_t should_bswap, blkptr_t *bp)
+{
+ uint_t bab_len;
+ blkptr_auth_buf_t bab;
+
+ zio_crypt_bp_auth_init(version, should_bswap, bp, &bab, &bab_len);
+ SHA2Update(ctx, &bab, bab_len);
+}
+
+static void
+zio_crypt_bp_do_aad_updates(uint8_t **aadp, uint_t *aad_len, uint64_t version,
+ boolean_t should_bswap, blkptr_t *bp)
+{
+ uint_t bab_len;
+ blkptr_auth_buf_t bab;
+
+ zio_crypt_bp_auth_init(version, should_bswap, bp, &bab, &bab_len);
+ bcopy(&bab, *aadp, bab_len);
+ *aadp += bab_len;
+ *aad_len += bab_len;
+}
+
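+/*
+ * Feed the portable fields of a dnode into an in-progress HMAC: the core
+ * dnode (with dn_used cleared, non-portable flags masked out, and
+ * multi-byte fields byteswapped as needed so the MAC is endian-independent),
+ * followed by each block pointer and the spill block pointer, if present.
+ */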
+static int
+zio_crypt_do_dnode_hmac_updates(crypto_context_t ctx, uint64_t version,
+ boolean_t should_bswap, dnode_phys_t *dnp)
+{
+ int ret, i;
+ dnode_phys_t *adnp;
+ boolean_t le_bswap = (should_bswap == ZFS_HOST_BYTEORDER);
+ uint8_t tmp_dncore[offsetof(dnode_phys_t, dn_blkptr)];
+
+ /* authenticate the core dnode (masking out non-portable bits) */
+ bcopy(dnp, tmp_dncore, sizeof (tmp_dncore));
+ adnp = (dnode_phys_t *)tmp_dncore;
+ if (le_bswap) {
+ adnp->dn_datablkszsec = BSWAP_16(adnp->dn_datablkszsec);
+ adnp->dn_bonuslen = BSWAP_16(adnp->dn_bonuslen);
+ adnp->dn_maxblkid = BSWAP_64(adnp->dn_maxblkid);
+ adnp->dn_used = BSWAP_64(adnp->dn_used);
+ }
+ adnp->dn_flags &= DNODE_CRYPT_PORTABLE_FLAGS_MASK;
+ adnp->dn_used = 0;
+
+ crypto_mac_update(ctx, adnp, sizeof (tmp_dncore));
+
+ for (i = 0; i < dnp->dn_nblkptr; i++) {
+ ret = zio_crypt_bp_do_hmac_updates(ctx, version,
+ should_bswap, &dnp->dn_blkptr[i]);
+ if (ret != 0)
+ goto error;
+ }
+
+ if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
+ ret = zio_crypt_bp_do_hmac_updates(ctx, version,
+ should_bswap, DN_SPILL_BLKPTR(dnp));
+ if (ret != 0)
+ goto error;
+ }
+
+ return (0);
+
+error:
+ return (ret);
+}
+
+/*
+ * objset_phys_t blocks introduce a number of exceptions to the normal
+ * authentication process. objset_phys_t's contain 2 separate HMACS for
+ * protecting the integrity of their data. The portable_mac protects the
+ * metadnode. This MAC can be sent with a raw send and protects against
+ * reordering of data within the metadnode. The local_mac protects the user
+ * accounting objects which are not sent from one system to another.
+ *
+ * In addition, objset blocks are the only blocks that can be modified and
+ * written to disk without the key loaded under certain circumstances. During
+ * zil_claim() we need to be able to update the zil_header_t to complete
+ * claiming log blocks and during raw receives we need to write out the
+ * portable_mac from the send file. Both of these actions are possible
+ * because these fields are not protected by either MAC so neither one will
+ * need to modify the MACs without the key. However, when the modified blocks
+ * are written out they will be byteswapped into the host machine's native
+ * endianness which will modify fields protected by the MAC. As a result, MAC
+ * calculation for objset blocks works slightly differently from other block
+ * types. Where other block types MAC the data in whatever endianness is
+ * written to disk, objset blocks always MAC the little endian version of their
+ * values. In the code, should_bswap is the value from BP_SHOULD_BYTESWAP()
+ * and le_bswap indicates whether a byteswap is needed to get this block
+ * into little endian format.
+ */
+/* ARGSUSED */
+int
+zio_crypt_do_objset_hmacs(zio_crypt_key_t *key, void *data, uint_t datalen,
+ boolean_t should_bswap, uint8_t *portable_mac, uint8_t *local_mac)
+{
+ int ret;
+ struct hmac_ctx hash_ctx;
+ struct hmac_ctx *ctx = &hash_ctx;
+ objset_phys_t *osp = data;
+ uint64_t intval;
+ boolean_t le_bswap = (should_bswap == ZFS_HOST_BYTEORDER);
+ uint8_t raw_portable_mac[SHA512_DIGEST_LENGTH];
+ uint8_t raw_local_mac[SHA512_DIGEST_LENGTH];
+
+
+ /* calculate the portable MAC from the portable fields and metadnode */
+ crypto_mac_init(ctx, &key->zk_hmac_key);
+
+ /* add in the os_type */
+ intval = (le_bswap) ? osp->os_type : BSWAP_64(osp->os_type);
+ crypto_mac_update(ctx, &intval, sizeof (uint64_t));
+
+ /* add in the portable os_flags */
+ intval = osp->os_flags;
+ if (should_bswap)
+ intval = BSWAP_64(intval);
+ intval &= OBJSET_CRYPT_PORTABLE_FLAGS_MASK;
+ /* CONSTCOND */
+ if (!ZFS_HOST_BYTEORDER)
+ intval = BSWAP_64(intval);
+
+ crypto_mac_update(ctx, &intval, sizeof (uint64_t));
+
+ /* add in fields from the metadnode */
+ ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version,
+ should_bswap, &osp->os_meta_dnode);
+ if (ret)
+ goto error;
+
+ crypto_mac_final(ctx, raw_portable_mac, SHA512_DIGEST_LENGTH);
+
+ bcopy(raw_portable_mac, portable_mac, ZIO_OBJSET_MAC_LEN);
+
+ /*
+ * This is necessary here as we check next whether
+ * OBJSET_FLAG_USERACCOUNTING_COMPLETE or
+ * OBJSET_FLAG_USEROBJACCOUNTING_COMPLETE are set in order to
+ * decide if the local_mac should be zeroed out.
+ */
+ intval = osp->os_flags;
+ if (should_bswap)
+ intval = BSWAP_64(intval);
+
+ /*
+ * The local MAC protects the user, group and project accounting.
+ * If these objects are not present, the local MAC is zeroed out.
+ */
+ if ((datalen >= OBJSET_PHYS_SIZE_V3 &&
+ osp->os_userused_dnode.dn_type == DMU_OT_NONE &&
+ osp->os_groupused_dnode.dn_type == DMU_OT_NONE &&
+ osp->os_projectused_dnode.dn_type == DMU_OT_NONE) ||
+ (datalen >= OBJSET_PHYS_SIZE_V2 &&
+ osp->os_userused_dnode.dn_type == DMU_OT_NONE &&
+ osp->os_groupused_dnode.dn_type == DMU_OT_NONE) ||
+ (datalen <= OBJSET_PHYS_SIZE_V1) ||
+ (((intval & OBJSET_FLAG_USERACCOUNTING_COMPLETE) == 0 ||
+ (intval & OBJSET_FLAG_USEROBJACCOUNTING_COMPLETE) == 0) &&
+ key->zk_version > 0)) {
+ bzero(local_mac, ZIO_OBJSET_MAC_LEN);
+ return (0);
+ }
+
+ /* calculate the local MAC from the userused and groupused dnodes */
+ crypto_mac_init(ctx, &key->zk_hmac_key);
+
+ /* add in the non-portable os_flags */
+ intval = osp->os_flags;
+ if (should_bswap)
+ intval = BSWAP_64(intval);
+ intval &= ~OBJSET_CRYPT_PORTABLE_FLAGS_MASK;
+ /* CONSTCOND */
+ if (!ZFS_HOST_BYTEORDER)
+ intval = BSWAP_64(intval);
+
+ crypto_mac_update(ctx, &intval, sizeof (uint64_t));
+
+ /* XXX check dnode type ... */
+ /* add in fields from the user accounting dnodes */
+ if (osp->os_userused_dnode.dn_type != DMU_OT_NONE) {
+ ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version,
+ should_bswap, &osp->os_userused_dnode);
+ if (ret)
+ goto error;
+ }
+
+ if (osp->os_groupused_dnode.dn_type != DMU_OT_NONE) {
+ ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version,
+ should_bswap, &osp->os_groupused_dnode);
+ if (ret)
+ goto error;
+ }
+
+ if (osp->os_projectused_dnode.dn_type != DMU_OT_NONE &&
+ datalen >= OBJSET_PHYS_SIZE_V3) {
+ ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version,
+ should_bswap, &osp->os_projectused_dnode);
+ if (ret)
+ goto error;
+ }
+
+ crypto_mac_final(ctx, raw_local_mac, SHA512_DIGEST_LENGTH);
+
+ bcopy(raw_local_mac, local_mac, ZIO_OBJSET_MAC_LEN);
+
+ return (0);
+
+error:
+ bzero(portable_mac, ZIO_OBJSET_MAC_LEN);
+ bzero(local_mac, ZIO_OBJSET_MAC_LEN);
+ return (ret);
+}
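
The comment above zio_crypt_do_objset_hmacs() states the rule that objset blocks always MAC the little endian version of their protected fields, regardless of the byte order written to disk. A rough userland sketch of that canonicalization, not part of the patch itself (mac_update() is a hypothetical stand-in for crypto_mac_update()):

#include <stdint.h>
#include <stdio.h>

static void
mac_update(const void *buf, size_t len)
{
	/* hypothetical stand-in: a real caller would feed an HMAC context */
	(void) buf; (void) len;
}

/* feed a 64-bit field to the MAC in little-endian byte order */
static void
mac_update_le64(uint64_t v)
{
	uint8_t le[8];

	for (int i = 0; i < 8; i++)
		le[i] = (uint8_t)(v >> (8 * i));	/* byte 0 = LSB */
	mac_update(le, sizeof (le));
}

int
main(void)
{
	uint64_t os_flags = 0x2;	/* hypothetical portable flag bits */

	mac_update_le64(os_flags);	/* same MAC input on LE and BE hosts */
	printf("MAC'd canonical little-endian bytes\n");
	return (0);
}

The kernel code reaches the same result by byteswapping the host value only when the host is big endian (the !ZFS_HOST_BYTEORDER checks above).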
+
+static void
+zio_crypt_destroy_uio(zfs_uio_t *uio)
+{
+ if (GET_UIO_STRUCT(uio)->uio_iov)
+ kmem_free(GET_UIO_STRUCT(uio)->uio_iov,
+ zfs_uio_iovcnt(uio) * sizeof (iovec_t));
+}
+
+/*
+ * This function parses an uncompressed indirect block and returns a checksum
+ * of all the portable fields from all of the contained bps. The portable
+ * fields are the MAC and all of the fields from blk_prop except for the dedup,
+ * checksum, and psize bits. For an explanation of the purpose of this, see
+ * the comment block on object set authentication.
+ */
+static int
+zio_crypt_do_indirect_mac_checksum_impl(boolean_t generate, void *buf,
+ uint_t datalen, uint64_t version, boolean_t byteswap, uint8_t *cksum)
+{
+ blkptr_t *bp;
+ int i, epb = datalen >> SPA_BLKPTRSHIFT;
+ SHA2_CTX ctx;
+ uint8_t digestbuf[SHA512_DIGEST_LENGTH];
+
+ /* checksum all of the MACs from the layer below */
+ SHA2Init(SHA512, &ctx);
+ for (i = 0, bp = buf; i < epb; i++, bp++) {
+ zio_crypt_bp_do_indrect_checksum_updates(&ctx, version,
+ byteswap, bp);
+ }
+ SHA2Final(digestbuf, &ctx);
+
+ if (generate) {
+ bcopy(digestbuf, cksum, ZIO_DATA_MAC_LEN);
+ return (0);
+ }
+
+ if (bcmp(digestbuf, cksum, ZIO_DATA_MAC_LEN) != 0) {
+#ifdef FCRYPTO_DEBUG
+ printf("%s(%d): Setting ECKSUM\n", __FUNCTION__, __LINE__);
+#endif
+ return (SET_ERROR(ECKSUM));
+ }
+ return (0);
+}
+
+int
+zio_crypt_do_indirect_mac_checksum(boolean_t generate, void *buf,
+ uint_t datalen, boolean_t byteswap, uint8_t *cksum)
+{
+ int ret;
+
+ /*
+ * Unfortunately, callers of this function will not always have
+ * easy access to the on-disk format version. This info is
+ * normally found in the DSL Crypto Key, but the checksum-of-MACs
+ * is expected to be verifiable even when the key isn't loaded.
+ * Here, instead of doing a ZAP lookup for the version for each
+ * zio, we simply try both existing formats.
+ */
+ ret = zio_crypt_do_indirect_mac_checksum_impl(generate, buf,
+ datalen, ZIO_CRYPT_KEY_CURRENT_VERSION, byteswap, cksum);
+ if (ret == ECKSUM) {
+ ASSERT(!generate);
+ ret = zio_crypt_do_indirect_mac_checksum_impl(generate,
+ buf, datalen, 0, byteswap, cksum);
+ }
+
+ return (ret);
+}
+
+int
+zio_crypt_do_indirect_mac_checksum_abd(boolean_t generate, abd_t *abd,
+ uint_t datalen, boolean_t byteswap, uint8_t *cksum)
+{
+ int ret;
+ void *buf;
+
+ buf = abd_borrow_buf_copy(abd, datalen);
+ ret = zio_crypt_do_indirect_mac_checksum(generate, buf, datalen,
+ byteswap, cksum);
+ abd_return_buf(abd, buf, datalen);
+
+ return (ret);
+}
+
+/*
+ * Special case handling routine for encrypting / decrypting ZIL blocks.
+ * We do not check for the older ZIL chain because the encryption feature
+ * was not available before the newer ZIL chain was introduced. The goal
+ * here is to encrypt everything except the blkptr_t of a lr_write_t and
+ * the zil_chain_t header. Everything that is not encrypted is authenticated.
+ */
+/*
+ * The OpenCrypto used in FreeBSD does not use separate source and
+ * destination buffers; instead, the same buffer is used. Further, to
+ * accommodate some of the drivers, the authbuf needs to be logically before
+ * the data. This means that we need to copy the source to the destination,
+ * and set up an extra iovec_t at the beginning to handle the authbuf.
+ * It also means we'll only return one zfs_uio_t.
+ */
+
+/* ARGSUSED */
+static int
+zio_crypt_init_uios_zil(boolean_t encrypt, uint8_t *plainbuf,
+ uint8_t *cipherbuf, uint_t datalen, boolean_t byteswap, zfs_uio_t *puio,
+ zfs_uio_t *out_uio, uint_t *enc_len, uint8_t **authbuf, uint_t *auth_len,
+ boolean_t *no_crypt)
+{
+ uint8_t *aadbuf = zio_buf_alloc(datalen);
+ uint8_t *src, *dst, *slrp, *dlrp, *blkend, *aadp;
+ iovec_t *dst_iovecs;
+ zil_chain_t *zilc;
+ lr_t *lr;
+ uint64_t txtype, lr_len;
+ uint_t crypt_len, nr_iovecs, vec;
+ uint_t aad_len = 0, total_len = 0;
+
+ if (encrypt) {
+ src = plainbuf;
+ dst = cipherbuf;
+ } else {
+ src = cipherbuf;
+ dst = plainbuf;
+ }
+ bcopy(src, dst, datalen);
+
+ /* Find the start and end record of the log block. */
+ zilc = (zil_chain_t *)src;
+ slrp = src + sizeof (zil_chain_t);
+ aadp = aadbuf;
+ blkend = src + ((byteswap) ? BSWAP_64(zilc->zc_nused) : zilc->zc_nused);
+
+ /*
+ * Calculate the number of encrypted iovecs we will need.
+ */
+
+ /* We need at least two iovecs -- one for the AAD, one for the MAC. */
+ nr_iovecs = 2;
+
+ for (; slrp < blkend; slrp += lr_len) {
+ lr = (lr_t *)slrp;
+
+ if (byteswap) {
+ txtype = BSWAP_64(lr->lrc_txtype);
+ lr_len = BSWAP_64(lr->lrc_reclen);
+ } else {
+ txtype = lr->lrc_txtype;
+ lr_len = lr->lrc_reclen;
+ }
+
+ nr_iovecs++;
+ if (txtype == TX_WRITE && lr_len != sizeof (lr_write_t))
+ nr_iovecs++;
+ }
+
+ dst_iovecs = kmem_alloc(nr_iovecs * sizeof (iovec_t), KM_SLEEP);
+
+ /*
+ * Copy the plain zil header over and authenticate everything except
+ * the checksum that will store our MAC. If we are writing the data
+ * the embedded checksum will not have been calculated yet, so we don't
+ * authenticate that.
+ */
+ bcopy(src, aadp, sizeof (zil_chain_t) - sizeof (zio_eck_t));
+ aadp += sizeof (zil_chain_t) - sizeof (zio_eck_t);
+ aad_len += sizeof (zil_chain_t) - sizeof (zio_eck_t);
+
+ slrp = src + sizeof (zil_chain_t);
+ dlrp = dst + sizeof (zil_chain_t);
+
+ /*
+ * Loop over records again, filling in iovecs.
+ */
+
+ /* The first iovec will contain the authbuf. */
+ vec = 1;
+
+ for (; slrp < blkend; slrp += lr_len, dlrp += lr_len) {
+ lr = (lr_t *)slrp;
+
+ if (!byteswap) {
+ txtype = lr->lrc_txtype;
+ lr_len = lr->lrc_reclen;
+ } else {
+ txtype = BSWAP_64(lr->lrc_txtype);
+ lr_len = BSWAP_64(lr->lrc_reclen);
+ }
+
+ /* copy the common lr_t */
+ bcopy(slrp, dlrp, sizeof (lr_t));
+ bcopy(slrp, aadp, sizeof (lr_t));
+ aadp += sizeof (lr_t);
+ aad_len += sizeof (lr_t);
+
+ /*
+ * If this is a TX_WRITE record we want to encrypt everything
+ * except the bp, if one exists. If the bp does exist we want to
+ * authenticate it.
+ */
+ if (txtype == TX_WRITE) {
+ crypt_len = sizeof (lr_write_t) -
+ sizeof (lr_t) - sizeof (blkptr_t);
+ dst_iovecs[vec].iov_base = (char *)dlrp +
+ sizeof (lr_t);
+ dst_iovecs[vec].iov_len = crypt_len;
+
+ /* copy the bp now since it will not be encrypted */
+ bcopy(slrp + sizeof (lr_write_t) - sizeof (blkptr_t),
+ dlrp + sizeof (lr_write_t) - sizeof (blkptr_t),
+ sizeof (blkptr_t));
+ bcopy(slrp + sizeof (lr_write_t) - sizeof (blkptr_t),
+ aadp, sizeof (blkptr_t));
+ aadp += sizeof (blkptr_t);
+ aad_len += sizeof (blkptr_t);
+ vec++;
+ total_len += crypt_len;
+
+ if (lr_len != sizeof (lr_write_t)) {
+ crypt_len = lr_len - sizeof (lr_write_t);
+ dst_iovecs[vec].iov_base = (char *)
+ dlrp + sizeof (lr_write_t);
+ dst_iovecs[vec].iov_len = crypt_len;
+ vec++;
+ total_len += crypt_len;
+ }
+ } else {
+ crypt_len = lr_len - sizeof (lr_t);
+ dst_iovecs[vec].iov_base = (char *)dlrp +
+ sizeof (lr_t);
+ dst_iovecs[vec].iov_len = crypt_len;
+ vec++;
+ total_len += crypt_len;
+ }
+ }
+
+ /* The last iovec will contain the MAC. */
+ ASSERT3U(vec, ==, nr_iovecs - 1);
+
+ /* AAD */
+ dst_iovecs[0].iov_base = aadbuf;
+ dst_iovecs[0].iov_len = aad_len;
+ /* MAC */
+ dst_iovecs[vec].iov_base = 0;
+ dst_iovecs[vec].iov_len = 0;
+
+ *no_crypt = (vec == 1);
+ *enc_len = total_len;
+ *authbuf = aadbuf;
+ *auth_len = aad_len;
+ GET_UIO_STRUCT(out_uio)->uio_iov = dst_iovecs;
+ zfs_uio_iovcnt(out_uio) = nr_iovecs;
+
+ return (0);
+}
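
As described in the comment before zio_crypt_init_uios_zil(), FreeBSD's OpenCrypto path works on a single uio whose first iovec carries the AAD and whose last iovec carries the MAC, with the pieces to be encrypted in between. A minimal userland sketch of that layout, with made-up sizes and POSIX struct iovec rather than the kernel iovec_t:

#include <sys/uio.h>
#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
	/* made-up buffers standing in for the AAD, two payloads and the MAC */
	static char aad[64], rec0[32], rec1[48], mac[16];
	size_t nvec = 2 + 2;		/* AAD + MAC + two encrypted pieces */
	struct iovec *iov = calloc(nvec, sizeof (*iov));

	if (iov == NULL)
		return (1);
	iov[0].iov_base = aad;  iov[0].iov_len = sizeof (aad);	/* AAD first */
	iov[1].iov_base = rec0; iov[1].iov_len = sizeof (rec0);	/* encrypted */
	iov[2].iov_base = rec1; iov[2].iov_len = sizeof (rec1);	/* encrypted */
	iov[3].iov_base = mac;  iov[3].iov_len = sizeof (mac);	/* MAC last */

	size_t enc_len = 0;
	for (size_t i = 1; i < nvec - 1; i++)	/* middle vectors only */
		enc_len += iov[i].iov_len;
	printf("enc_len = %zu over %zu iovecs\n", enc_len, nvec);
	free(iov);
	return (0);
}

This is why vec starts at 1 in the function above: slot 0 is reserved for the authbuf and the final slot for the MAC.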
+
+/*
+ * Special case handling routine for encrypting / decrypting dnode blocks.
+ */
+static int
+zio_crypt_init_uios_dnode(boolean_t encrypt, uint64_t version,
+ uint8_t *plainbuf, uint8_t *cipherbuf, uint_t datalen, boolean_t byteswap,
+ zfs_uio_t *puio, zfs_uio_t *out_uio, uint_t *enc_len, uint8_t **authbuf,
+ uint_t *auth_len, boolean_t *no_crypt)
+{
+ uint8_t *aadbuf = zio_buf_alloc(datalen);
+ uint8_t *src, *dst, *aadp;
+ dnode_phys_t *dnp, *adnp, *sdnp, *ddnp;
+ iovec_t *dst_iovecs;
+ uint_t nr_iovecs, crypt_len, vec;
+ uint_t aad_len = 0, total_len = 0;
+ uint_t i, j, max_dnp = datalen >> DNODE_SHIFT;
+
+ if (encrypt) {
+ src = plainbuf;
+ dst = cipherbuf;
+ } else {
+ src = cipherbuf;
+ dst = plainbuf;
+ }
+ bcopy(src, dst, datalen);
+
+ sdnp = (dnode_phys_t *)src;
+ ddnp = (dnode_phys_t *)dst;
+ aadp = aadbuf;
+
+ /*
+ * Count the number of iovecs we will need to do the encryption by
+ * counting the number of bonus buffers that need to be encrypted.
+ */
+
+ /* We need at least two iovecs -- one for the AAD, one for the MAC. */
+ nr_iovecs = 2;
+
+ for (i = 0; i < max_dnp; i += sdnp[i].dn_extra_slots + 1) {
+ /*
+ * This block may still be byteswapped. However, all of the
+ * values we use are either uint8_t's (for which byteswapping
+ * is a noop) or a * != 0 check, which will work regardless
+ * of whether or not we byteswap.
+ */
+ if (sdnp[i].dn_type != DMU_OT_NONE &&
+ DMU_OT_IS_ENCRYPTED(sdnp[i].dn_bonustype) &&
+ sdnp[i].dn_bonuslen != 0) {
+ nr_iovecs++;
+ }
+ }
+
+ dst_iovecs = kmem_alloc(nr_iovecs * sizeof (iovec_t), KM_SLEEP);
+
+ /*
+ * Iterate through the dnodes again, this time filling in the uios
+ * we allocated earlier. We also concatenate any data we want to
+ * authenticate onto aadbuf.
+ */
+
+ /* The first iovec will contain the authbuf. */
+ vec = 1;
+
+ for (i = 0; i < max_dnp; i += sdnp[i].dn_extra_slots + 1) {
+ dnp = &sdnp[i];
+
+ /* copy over the core fields and blkptrs (kept as plaintext) */
+ bcopy(dnp, &ddnp[i], (uint8_t *)DN_BONUS(dnp) - (uint8_t *)dnp);
+
+ if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
+ bcopy(DN_SPILL_BLKPTR(dnp), DN_SPILL_BLKPTR(&ddnp[i]),
+ sizeof (blkptr_t));
+ }
+
+ /*
+ * Handle authenticated data. We authenticate everything in
+ * the dnode that can be brought over when we do a raw send.
+ * This includes all of the core fields as well as the MACs
+ * stored in the bp checksums and all of the portable bits
+ * from blk_prop. We include the dnode padding here in case it
+ * ever gets used in the future. Some dn_flags and dn_used are
+ * not portable, so we mask those values out of the
+ * authenticated data.
+ */
+ crypt_len = offsetof(dnode_phys_t, dn_blkptr);
+ bcopy(dnp, aadp, crypt_len);
+ adnp = (dnode_phys_t *)aadp;
+ adnp->dn_flags &= DNODE_CRYPT_PORTABLE_FLAGS_MASK;
+ adnp->dn_used = 0;
+ aadp += crypt_len;
+ aad_len += crypt_len;
+
+ for (j = 0; j < dnp->dn_nblkptr; j++) {
+ zio_crypt_bp_do_aad_updates(&aadp, &aad_len,
+ version, byteswap, &dnp->dn_blkptr[j]);
+ }
+
+ if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
+ zio_crypt_bp_do_aad_updates(&aadp, &aad_len,
+ version, byteswap, DN_SPILL_BLKPTR(dnp));
+ }
+
+ /*
+ * If this bonus buffer needs to be encrypted, we prepare an
+ * iovec_t. The encryption / decryption functions will fill
+ * this in for us with the encrypted or decrypted data.
+ * Otherwise we add the bonus buffer to the authenticated
+ * data buffer and copy it over to the destination. The
+ * encrypted iovec extends to DN_MAX_BONUS_LEN(dnp) so that
+ * we can guarantee alignment with the AES block size
+ * (128 bits).
+ */
+ crypt_len = DN_MAX_BONUS_LEN(dnp);
+ if (dnp->dn_type != DMU_OT_NONE &&
+ DMU_OT_IS_ENCRYPTED(dnp->dn_bonustype) &&
+ dnp->dn_bonuslen != 0) {
+ dst_iovecs[vec].iov_base = DN_BONUS(&ddnp[i]);
+ dst_iovecs[vec].iov_len = crypt_len;
+
+ vec++;
+ total_len += crypt_len;
+ } else {
+ bcopy(DN_BONUS(dnp), DN_BONUS(&ddnp[i]), crypt_len);
+ bcopy(DN_BONUS(dnp), aadp, crypt_len);
+ aadp += crypt_len;
+ aad_len += crypt_len;
+ }
+ }
+
+ /* The last iovec will contain the MAC. */
+ ASSERT3U(vec, ==, nr_iovecs - 1);
+
+ /* AAD */
+ dst_iovecs[0].iov_base = aadbuf;
+ dst_iovecs[0].iov_len = aad_len;
+ /* MAC */
+ dst_iovecs[vec].iov_base = 0;
+ dst_iovecs[vec].iov_len = 0;
+
+ *no_crypt = (vec == 1);
+ *enc_len = total_len;
+ *authbuf = aadbuf;
+ *auth_len = aad_len;
+ GET_UIO_STRUCT(out_uio)->uio_iov = dst_iovecs;
+ zfs_uio_iovcnt(out_uio) = nr_iovecs;
+
+ return (0);
+}
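
The bonus-buffer comment above notes that the encrypted iovec extends to DN_MAX_BONUS_LEN(dnp) so the region stays aligned with the 128-bit AES block. A small sketch of what such block alignment means in practice; the round-up helper below is hypothetical, not an OpenZFS interface:

#include <stdint.h>
#include <stdio.h>

#define AES_BLOCK_LEN	16u	/* 128-bit AES block */

/* round a length up to the next multiple of the AES block size */
static uint32_t
aes_block_roundup(uint32_t len)
{
	return ((len + AES_BLOCK_LEN - 1) / AES_BLOCK_LEN * AES_BLOCK_LEN);
}

int
main(void)
{
	uint32_t bonuslen = 200;	/* hypothetical dn_bonuslen */

	printf("bonuslen %u -> block-aligned length %u\n",
	    bonuslen, aes_block_roundup(bonuslen));
	return (0);
}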
+
+/* ARGSUSED */
+static int
+zio_crypt_init_uios_normal(boolean_t encrypt, uint8_t *plainbuf,
+ uint8_t *cipherbuf, uint_t datalen, zfs_uio_t *puio, zfs_uio_t *out_uio,
+ uint_t *enc_len)
+{
+ int ret;
+ uint_t nr_plain = 1, nr_cipher = 2;
+ iovec_t *plain_iovecs = NULL, *cipher_iovecs = NULL;
+ void *src, *dst;
+
+ cipher_iovecs = kmem_alloc(nr_cipher * sizeof (iovec_t),
+ KM_SLEEP);
+ if (!cipher_iovecs) {
+ ret = SET_ERROR(ENOMEM);
+ goto error;
+ }
+ bzero(cipher_iovecs, nr_cipher * sizeof (iovec_t));
+
+ if (encrypt) {
+ src = plainbuf;
+ dst = cipherbuf;
+ } else {
+ src = cipherbuf;
+ dst = plainbuf;
+ }
+ bcopy(src, dst, datalen);
+ cipher_iovecs[0].iov_base = dst;
+ cipher_iovecs[0].iov_len = datalen;
+
+ *enc_len = datalen;
+ GET_UIO_STRUCT(out_uio)->uio_iov = cipher_iovecs;
+ zfs_uio_iovcnt(out_uio) = nr_cipher;
+
+ return (0);
+
+error:
+ if (plain_iovecs != NULL)
+ kmem_free(plain_iovecs, nr_plain * sizeof (iovec_t));
+ if (cipher_iovecs != NULL)
+ kmem_free(cipher_iovecs, nr_cipher * sizeof (iovec_t));
+
+ *enc_len = 0;
+ GET_UIO_STRUCT(out_uio)->uio_iov = NULL;
+ zfs_uio_iovcnt(out_uio) = 0;
+
+ return (ret);
+}
+
+/*
+ * This function builds up the plaintext (puio) and ciphertext (cuio) uios so
+ * that they can be used for encryption and decryption by zio_do_crypt_uio().
+ * Most blocks will use zio_crypt_init_uios_normal(), with ZIL and dnode blocks
+ * requiring special handling to parse out pieces that are to be encrypted. The
+ * authbuf is used by these special cases to store additional authenticated
+ * data (AAD) for the encryption modes.
+ */
+static int
+zio_crypt_init_uios(boolean_t encrypt, uint64_t version, dmu_object_type_t ot,
+ uint8_t *plainbuf, uint8_t *cipherbuf, uint_t datalen, boolean_t byteswap,
+ uint8_t *mac, zfs_uio_t *puio, zfs_uio_t *cuio, uint_t *enc_len,
+ uint8_t **authbuf, uint_t *auth_len, boolean_t *no_crypt)
+{
+ int ret;
+ iovec_t *mac_iov;
+
+ ASSERT(DMU_OT_IS_ENCRYPTED(ot) || ot == DMU_OT_NONE);
+
+ /* route to handler */
+ switch (ot) {
+ case DMU_OT_INTENT_LOG:
+ ret = zio_crypt_init_uios_zil(encrypt, plainbuf, cipherbuf,
+ datalen, byteswap, puio, cuio, enc_len, authbuf, auth_len,
+ no_crypt);
+ break;
+ case DMU_OT_DNODE:
+ ret = zio_crypt_init_uios_dnode(encrypt, version, plainbuf,
+ cipherbuf, datalen, byteswap, puio, cuio, enc_len, authbuf,
+ auth_len, no_crypt);
+ break;
+ default:
+ ret = zio_crypt_init_uios_normal(encrypt, plainbuf, cipherbuf,
+ datalen, puio, cuio, enc_len);
+ *authbuf = NULL;
+ *auth_len = 0;
+ *no_crypt = B_FALSE;
+ break;
+ }
+
+ if (ret != 0)
+ goto error;
+
+ /* populate the uios */
+ zfs_uio_segflg(cuio) = UIO_SYSSPACE;
+
+ mac_iov =
+ ((iovec_t *)&(GET_UIO_STRUCT(cuio)->
+ uio_iov[zfs_uio_iovcnt(cuio) - 1]));
+ mac_iov->iov_base = (void *)mac;
+ mac_iov->iov_len = ZIO_DATA_MAC_LEN;
+
+ return (0);
+
+error:
+ return (ret);
+}
+
+void *failed_decrypt_buf;
+int failed_decrypt_size;
+
+/*
+ * Primary encryption / decryption entrypoint for zio data.
+ */
+int
+zio_do_crypt_data(boolean_t encrypt, zio_crypt_key_t *key,
+ dmu_object_type_t ot, boolean_t byteswap, uint8_t *salt, uint8_t *iv,
+ uint8_t *mac, uint_t datalen, uint8_t *plainbuf, uint8_t *cipherbuf,
+ boolean_t *no_crypt)
+{
+ int ret;
+ boolean_t locked = B_FALSE;
+ uint64_t crypt = key->zk_crypt;
+ uint_t keydata_len = zio_crypt_table[crypt].ci_keylen;
+ uint_t enc_len, auth_len;
+ zfs_uio_t puio, cuio;
+ struct uio puio_s, cuio_s;
+ uint8_t enc_keydata[MASTER_KEY_MAX_LEN];
+ crypto_key_t tmp_ckey, *ckey = NULL;
+ freebsd_crypt_session_t *tmpl = NULL;
+ uint8_t *authbuf = NULL;
+
+
+ zfs_uio_init(&puio, &puio_s);
+ zfs_uio_init(&cuio, &cuio_s);
+ bzero(GET_UIO_STRUCT(&puio), sizeof (struct uio));
+ bzero(GET_UIO_STRUCT(&cuio), sizeof (struct uio));
+
+#ifdef FCRYPTO_DEBUG
+ printf("%s(%s, %p, %p, %d, %p, %p, %u, %s, %p, %p, %p)\n",
+ __FUNCTION__,
+ encrypt ? "encrypt" : "decrypt",
+ key, salt, ot, iv, mac, datalen,
+ byteswap ? "byteswap" : "native_endian", plainbuf,
+ cipherbuf, no_crypt);
+
+ printf("\tkey = {");
+ for (int i = 0; i < key->zk_current_key.ck_length/8; i++)
+ printf("%02x ", ((uint8_t *)key->zk_current_key.ck_data)[i]);
+ printf("}\n");
+#endif
+ /* create uios for encryption */
+ ret = zio_crypt_init_uios(encrypt, key->zk_version, ot, plainbuf,
+ cipherbuf, datalen, byteswap, mac, &puio, &cuio, &enc_len,
+ &authbuf, &auth_len, no_crypt);
+ if (ret != 0)
+ return (ret);
+
+ /*
+ * If the needed key is the current one, just use it. Otherwise we
+ * need to generate a temporary one from the given salt + master key.
+ * If we are encrypting, we must return a copy of the current salt
+ * so that it can be stored in the blkptr_t.
+ */
+ rw_enter(&key->zk_salt_lock, RW_READER);
+ locked = B_TRUE;
+
+ if (bcmp(salt, key->zk_salt, ZIO_DATA_SALT_LEN) == 0) {
+ ckey = &key->zk_current_key;
+ tmpl = &key->zk_session;
+ } else {
+ rw_exit(&key->zk_salt_lock);
+ locked = B_FALSE;
+
+ ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0,
+ salt, ZIO_DATA_SALT_LEN, enc_keydata, keydata_len);
+ if (ret != 0)
+ goto error;
+ tmp_ckey.ck_format = CRYPTO_KEY_RAW;
+ tmp_ckey.ck_data = enc_keydata;
+ tmp_ckey.ck_length = CRYPTO_BYTES2BITS(keydata_len);
+
+ ckey = &tmp_ckey;
+ tmpl = NULL;
+ }
+
+ /* perform the encryption / decryption */
+ ret = zio_do_crypt_uio_opencrypto(encrypt, tmpl, key->zk_crypt,
+ ckey, iv, enc_len, &cuio, auth_len);
+ if (ret != 0)
+ goto error;
+ if (locked) {
+ rw_exit(&key->zk_salt_lock);
+ locked = B_FALSE;
+ }
+
+ if (authbuf != NULL)
+ zio_buf_free(authbuf, datalen);
+ if (ckey == &tmp_ckey)
+ bzero(enc_keydata, keydata_len);
+ zio_crypt_destroy_uio(&puio);
+ zio_crypt_destroy_uio(&cuio);
+
+ return (0);
+
+error:
+ if (!encrypt) {
+ if (failed_decrypt_buf != NULL)
+ kmem_free(failed_decrypt_buf, failed_decrypt_size);
+ failed_decrypt_buf = kmem_alloc(datalen, KM_SLEEP);
+ failed_decrypt_size = datalen;
+ bcopy(cipherbuf, failed_decrypt_buf, datalen);
+ }
+ if (locked)
+ rw_exit(&key->zk_salt_lock);
+ if (authbuf != NULL)
+ zio_buf_free(authbuf, datalen);
+ if (ckey == &tmp_ckey)
+ bzero(enc_keydata, keydata_len);
+ zio_crypt_destroy_uio(&puio);
+ zio_crypt_destroy_uio(&cuio);
+ return (SET_ERROR(ret));
+}
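
The salt handling in zio_do_crypt_data() above boils down to: if the block's salt matches the key's current salt, reuse the cached session key; otherwise derive a one-off key from the master key and that salt. A compressed sketch of that decision, where derive_key() is a deliberately fake stand-in for hkdf_sha512() and not a real KDF:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define SALT_LEN	8	/* stands in for ZIO_DATA_SALT_LEN */

/* NOT a real KDF: stand-in for hkdf_sha512(), purely for illustration */
static void
derive_key(const uint8_t *master, size_t mlen, const uint8_t *salt,
    uint8_t *out, size_t outlen)
{
	for (size_t i = 0; i < outlen; i++)
		out[i] = master[i % mlen] ^ salt[i % SALT_LEN];
}

int
main(void)
{
	uint8_t cur_salt[SALT_LEN] = { 1, 2, 3, 4, 5, 6, 7, 8 };
	uint8_t blk_salt[SALT_LEN] = { 1, 2, 3, 4, 5, 6, 7, 9 };
	uint8_t master[32] = { 0xaa }, tmp[32];

	if (memcmp(blk_salt, cur_salt, SALT_LEN) == 0) {
		printf("salt matches: reuse the current session key\n");
	} else {
		derive_key(master, sizeof (master), blk_salt,
		    tmp, sizeof (tmp));
		printf("salt differs: derived a temporary key\n");
	}
	return (0);
}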
+
+/*
+ * Simple wrapper around zio_do_crypt_data() to work with abd's instead of
+ * linear buffers.
+ */
+int
+zio_do_crypt_abd(boolean_t encrypt, zio_crypt_key_t *key, dmu_object_type_t ot,
+ boolean_t byteswap, uint8_t *salt, uint8_t *iv, uint8_t *mac,
+ uint_t datalen, abd_t *pabd, abd_t *cabd, boolean_t *no_crypt)
+{
+ int ret;
+ void *ptmp, *ctmp;
+
+ if (encrypt) {
+ ptmp = abd_borrow_buf_copy(pabd, datalen);
+ ctmp = abd_borrow_buf(cabd, datalen);
+ } else {
+ ptmp = abd_borrow_buf(pabd, datalen);
+ ctmp = abd_borrow_buf_copy(cabd, datalen);
+ }
+
+ ret = zio_do_crypt_data(encrypt, key, ot, byteswap, salt, iv, mac,
+ datalen, ptmp, ctmp, no_crypt);
+ if (ret != 0)
+ goto error;
+
+ if (encrypt) {
+ abd_return_buf(pabd, ptmp, datalen);
+ abd_return_buf_copy(cabd, ctmp, datalen);
+ } else {
+ abd_return_buf_copy(pabd, ptmp, datalen);
+ abd_return_buf(cabd, ctmp, datalen);
+ }
+
+ return (0);
+
+error:
+ if (encrypt) {
+ abd_return_buf(pabd, ptmp, datalen);
+ abd_return_buf_copy(cabd, ctmp, datalen);
+ } else {
+ abd_return_buf_copy(pabd, ptmp, datalen);
+ abd_return_buf(cabd, ctmp, datalen);
+ }
+
+ return (SET_ERROR(ret));
+}
+
+#if defined(_KERNEL) && defined(HAVE_SPL)
+/* BEGIN CSTYLED */
+module_param(zfs_key_max_salt_uses, ulong, 0644);
+MODULE_PARM_DESC(zfs_key_max_salt_uses, "Max number of times a salt value "
+ "can be used for generating encryption keys before it is rotated");
+/* END CSTYLED */
+#endif
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c
new file mode 100644
index 000000000000..2389b1a06355
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c
@@ -0,0 +1,1525 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Copyright (c) 2006-2010 Pawel Jakub Dawidek <pjd@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Portions Copyright 2010 Robert Milkowski
+ *
+ * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
+ */
+
+/* Portions Copyright 2011 Martin Matuska <mm@FreeBSD.org> */
+
+/*
+ * ZFS volume emulation driver.
+ *
+ * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes.
+ * Volumes are accessed through the symbolic links named:
+ *
+ * /dev/zvol/<pool_name>/<dataset_name>
+ *
+ * Volumes are persistent through reboot. No user command needs to be
+ * run before opening and using a device.
+ *
+ * On FreeBSD ZVOLs are simply GEOM providers like any other storage device
+ * in the system. Except when they're simply character devices (volmode=dev).
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/errno.h>
+#include <sys/uio.h>
+#include <sys/bio.h>
+#include <sys/buf.h>
+#include <sys/kmem.h>
+#include <sys/conf.h>
+#include <sys/cmn_err.h>
+#include <sys/stat.h>
+#include <sys/proc.h>
+#include <sys/zap.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/zio.h>
+#include <sys/disk.h>
+#include <sys/dmu_traverse.h>
+#include <sys/dnode.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_prop.h>
+#include <sys/dsl_dir.h>
+#include <sys/byteorder.h>
+#include <sys/sunddi.h>
+#include <sys/dirent.h>
+#include <sys/policy.h>
+#include <sys/queue.h>
+#include <sys/fs/zfs.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zil.h>
+#include <sys/zfs_znode.h>
+#include <sys/zfs_rlock.h>
+#include <sys/vdev_impl.h>
+#include <sys/vdev_raidz.h>
+#include <sys/zvol.h>
+#include <sys/zil_impl.h>
+#include <sys/dataset_kstats.h>
+#include <sys/dbuf.h>
+#include <sys/dmu_tx.h>
+#include <sys/zfeature.h>
+#include <sys/zio_checksum.h>
+#include <sys/zil_impl.h>
+#include <sys/filio.h>
+
+#include <geom/geom.h>
+#include <sys/zvol.h>
+#include <sys/zvol_impl.h>
+
+#include "zfs_namecheck.h"
+
+#define ZVOL_DUMPSIZE "dumpsize"
+
+#ifdef ZVOL_LOCK_DEBUG
+#define ZVOL_RW_READER RW_WRITER
+#define ZVOL_RW_READ_HELD RW_WRITE_HELD
+#else
+#define ZVOL_RW_READER RW_READER
+#define ZVOL_RW_READ_HELD RW_READ_HELD
+#endif
+
+enum zvol_geom_state {
+ ZVOL_GEOM_UNINIT,
+ ZVOL_GEOM_STOPPED,
+ ZVOL_GEOM_RUNNING,
+};
+
+struct zvol_state_os {
+#define zso_dev _zso_state._zso_dev
+#define zso_geom _zso_state._zso_geom
+ union {
+ /* volmode=dev */
+ struct zvol_state_dev {
+ struct cdev *zsd_cdev;
+ uint64_t zsd_sync_cnt;
+ } _zso_dev;
+
+ /* volmode=geom */
+ struct zvol_state_geom {
+ struct g_provider *zsg_provider;
+ struct bio_queue_head zsg_queue;
+ struct mtx zsg_queue_mtx;
+ enum zvol_geom_state zsg_state;
+ } _zso_geom;
+ } _zso_state;
+ int zso_dying;
+};
+
+static uint32_t zvol_minors;
+
+SYSCTL_DECL(_vfs_zfs);
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, vol, CTLFLAG_RW, 0, "ZFS VOLUME");
+SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, mode, CTLFLAG_RWTUN, &zvol_volmode, 0,
+ "Expose as GEOM providers (1), device files (2) or neither");
+static boolean_t zpool_on_zvol = B_FALSE;
+SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, recursive, CTLFLAG_RWTUN, &zpool_on_zvol, 0,
+ "Allow zpools to use zvols as vdevs (DANGEROUS)");
+
+/*
+ * Toggle unmap functionality.
+ */
+boolean_t zvol_unmap_enabled = B_TRUE;
+
+SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, unmap_enabled, CTLFLAG_RWTUN,
+ &zvol_unmap_enabled, 0, "Enable UNMAP functionality");
+
+/*
+ * zvol maximum transfer in one DMU tx.
+ */
+int zvol_maxphys = DMU_MAX_ACCESS / 2;
+
+static void zvol_ensure_zilog(zvol_state_t *zv);
+
+static d_open_t zvol_cdev_open;
+static d_close_t zvol_cdev_close;
+static d_ioctl_t zvol_cdev_ioctl;
+static d_read_t zvol_cdev_read;
+static d_write_t zvol_cdev_write;
+static d_strategy_t zvol_geom_bio_strategy;
+
+static struct cdevsw zvol_cdevsw = {
+ .d_name = "zvol",
+ .d_version = D_VERSION,
+ .d_flags = D_DISK | D_TRACKCLOSE,
+ .d_open = zvol_cdev_open,
+ .d_close = zvol_cdev_close,
+ .d_ioctl = zvol_cdev_ioctl,
+ .d_read = zvol_cdev_read,
+ .d_write = zvol_cdev_write,
+ .d_strategy = zvol_geom_bio_strategy,
+};
+
+extern uint_t zfs_geom_probe_vdev_key;
+
+struct g_class zfs_zvol_class = {
+ .name = "ZFS::ZVOL",
+ .version = G_VERSION,
+};
+
+DECLARE_GEOM_CLASS(zfs_zvol_class, zfs_zvol);
+
+static int zvol_geom_open(struct g_provider *pp, int flag, int count);
+static int zvol_geom_close(struct g_provider *pp, int flag, int count);
+static void zvol_geom_run(zvol_state_t *zv);
+static void zvol_geom_destroy(zvol_state_t *zv);
+static int zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace);
+static void zvol_geom_worker(void *arg);
+static void zvol_geom_bio_start(struct bio *bp);
+static int zvol_geom_bio_getattr(struct bio *bp);
+/* static d_strategy_t zvol_geom_bio_strategy; (declared elsewhere) */
+
+/*
+ * GEOM mode implementation
+ */
+
+/*ARGSUSED*/
+static int
+zvol_geom_open(struct g_provider *pp, int flag, int count)
+{
+ zvol_state_t *zv;
+ int err = 0;
+ boolean_t drop_suspend = B_FALSE;
+ boolean_t drop_namespace = B_FALSE;
+
+ if (!zpool_on_zvol && tsd_get(zfs_geom_probe_vdev_key) != NULL) {
+ /*
+ * if zfs_geom_probe_vdev_key is set, that means that zfs is
+ * attempting to probe geom providers while looking for a
+ * replacement for a missing VDEV. In this case, the
+ * spa_namespace_lock will not be held, but it is still illegal
+ * to use a zvol as a vdev. Deadlocks can result if another
+ * thread has spa_namespace_lock.
+ */
+ return (SET_ERROR(EOPNOTSUPP));
+ }
+
+retry:
+ rw_enter(&zvol_state_lock, ZVOL_RW_READER);
+ zv = pp->private;
+ if (zv == NULL) {
+ rw_exit(&zvol_state_lock);
+ err = SET_ERROR(ENXIO);
+ goto out_locked;
+ }
+
+ if (zv->zv_open_count == 0 && !mutex_owned(&spa_namespace_lock)) {
+ /*
+ * We need to guarantee that the namespace lock is held
+ * to avoid spurious failures in zvol_first_open.
+ */
+ drop_namespace = B_TRUE;
+ if (!mutex_tryenter(&spa_namespace_lock)) {
+ rw_exit(&zvol_state_lock);
+ mutex_enter(&spa_namespace_lock);
+ goto retry;
+ }
+ }
+ mutex_enter(&zv->zv_state_lock);
+ if (zv->zv_zso->zso_dying) {
+ rw_exit(&zvol_state_lock);
+ err = SET_ERROR(ENXIO);
+ goto out_zv_locked;
+ }
+ ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);
+
+ /*
+ * make sure zvol is not suspended during first open
+ * (hold zv_suspend_lock) and respect proper lock acquisition
+ * ordering - zv_suspend_lock before zv_state_lock
+ */
+ if (zv->zv_open_count == 0) {
+ drop_suspend = B_TRUE;
+ if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
+ mutex_exit(&zv->zv_state_lock);
+ rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
+ mutex_enter(&zv->zv_state_lock);
+ /* check to see if zv_suspend_lock is needed */
+ if (zv->zv_open_count != 0) {
+ rw_exit(&zv->zv_suspend_lock);
+ drop_suspend = B_FALSE;
+ }
+ }
+ }
+ rw_exit(&zvol_state_lock);
+
+ ASSERT(MUTEX_HELD(&zv->zv_state_lock));
+
+ if (zv->zv_open_count == 0) {
+ ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
+ err = zvol_first_open(zv, !(flag & FWRITE));
+ if (err)
+ goto out_zv_locked;
+ pp->mediasize = zv->zv_volsize;
+ pp->stripeoffset = 0;
+ pp->stripesize = zv->zv_volblocksize;
+ }
+
+ /*
+ * Check for a bad on-disk format version now since we
+ * lied about owning the dataset readonly before.
+ */
+ if ((flag & FWRITE) && ((zv->zv_flags & ZVOL_RDONLY) ||
+ dmu_objset_incompatible_encryption_version(zv->zv_objset))) {
+ err = SET_ERROR(EROFS);
+ goto out_opened;
+ }
+ if (zv->zv_flags & ZVOL_EXCL) {
+ err = SET_ERROR(EBUSY);
+ goto out_opened;
+ }
+#ifdef FEXCL
+ if (flag & FEXCL) {
+ if (zv->zv_open_count != 0) {
+ err = SET_ERROR(EBUSY);
+ goto out_opened;
+ }
+ zv->zv_flags |= ZVOL_EXCL;
+ }
+#endif
+
+ zv->zv_open_count += count;
+out_opened:
+ if (zv->zv_open_count == 0) {
+ zvol_last_close(zv);
+ wakeup(zv);
+ }
+out_zv_locked:
+ mutex_exit(&zv->zv_state_lock);
+out_locked:
+ if (drop_namespace)
+ mutex_exit(&spa_namespace_lock);
+ if (drop_suspend)
+ rw_exit(&zv->zv_suspend_lock);
+ return (err);
+}
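
zvol_geom_open() and the cdev open/close routines below all follow the same lock-ordering dance spelled out in their comments: zv_suspend_lock must be taken before zv_state_lock, so when the state lock is already held the code tries a non-blocking acquire, and on failure drops the state lock, takes the locks in the documented order, and re-checks zv_open_count. A stripped-down pthreads sketch of that retry pattern; the names and types are made up and are not the kernel primitives:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t state_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_rwlock_t suspend_lock = PTHREAD_RWLOCK_INITIALIZER;
static int open_count;

/*
 * Take suspend_lock (reader) for a first open without violating the
 * suspend-before-state lock order, re-checking after any retry gap.
 */
static int
first_open_locks(void)
{
	int need_suspend = 0;

	pthread_mutex_lock(&state_lock);
	if (open_count == 0) {
		need_suspend = 1;
		if (pthread_rwlock_tryrdlock(&suspend_lock) != 0) {
			/* drop, take the locks in order, then re-check */
			pthread_mutex_unlock(&state_lock);
			pthread_rwlock_rdlock(&suspend_lock);
			pthread_mutex_lock(&state_lock);
			if (open_count != 0) {	/* lost a race: not first */
				pthread_rwlock_unlock(&suspend_lock);
				need_suspend = 0;
			}
		}
	}
	open_count++;
	pthread_mutex_unlock(&state_lock);
	return (need_suspend);
}

int
main(void)
{
	int held = first_open_locks();

	printf("suspend lock held for first open: %s\n", held ? "yes" : "no");
	if (held)
		pthread_rwlock_unlock(&suspend_lock);
	return (0);
}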
+
+/*ARGSUSED*/
+static int
+zvol_geom_close(struct g_provider *pp, int flag, int count)
+{
+ zvol_state_t *zv;
+ boolean_t drop_suspend = B_TRUE;
+ int new_open_count;
+
+ rw_enter(&zvol_state_lock, ZVOL_RW_READER);
+ zv = pp->private;
+ if (zv == NULL) {
+ rw_exit(&zvol_state_lock);
+ return (SET_ERROR(ENXIO));
+ }
+
+ mutex_enter(&zv->zv_state_lock);
+ if (zv->zv_flags & ZVOL_EXCL) {
+ ASSERT3U(zv->zv_open_count, ==, 1);
+ zv->zv_flags &= ~ZVOL_EXCL;
+ }
+
+ ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);
+
+ /*
+ * If the open count is zero, this is a spurious close.
+ * That indicates a bug in the kernel / DDI framework.
+ */
+ ASSERT3U(zv->zv_open_count, >, 0);
+
+ /*
+ * make sure zvol is not suspended during last close
+ * (hold zv_suspend_lock) and respect proper lock acquisition
+ * ordering - zv_suspend_lock before zv_state_lock
+ */
+ new_open_count = zv->zv_open_count - count;
+ if (new_open_count == 0) {
+ if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
+ mutex_exit(&zv->zv_state_lock);
+ rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
+ mutex_enter(&zv->zv_state_lock);
+ /* check to see if zv_suspend_lock is needed */
+ new_open_count = zv->zv_open_count - count;
+ if (new_open_count != 0) {
+ rw_exit(&zv->zv_suspend_lock);
+ drop_suspend = B_FALSE;
+ }
+ }
+ } else {
+ drop_suspend = B_FALSE;
+ }
+ rw_exit(&zvol_state_lock);
+
+ ASSERT(MUTEX_HELD(&zv->zv_state_lock));
+
+ /*
+ * You may get multiple opens, but only one close.
+ */
+ zv->zv_open_count = new_open_count;
+ if (zv->zv_open_count == 0) {
+ ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
+ zvol_last_close(zv);
+ wakeup(zv);
+ }
+
+ mutex_exit(&zv->zv_state_lock);
+
+ if (drop_suspend)
+ rw_exit(&zv->zv_suspend_lock);
+ return (0);
+}
+
+static void
+zvol_geom_run(zvol_state_t *zv)
+{
+ struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
+ struct g_provider *pp = zsg->zsg_provider;
+
+ ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);
+
+ g_error_provider(pp, 0);
+
+ kproc_kthread_add(zvol_geom_worker, zv, &system_proc, NULL, 0, 0,
+ "zfskern", "zvol %s", pp->name + sizeof (ZVOL_DRIVER));
+}
+
+static void
+zvol_geom_destroy(zvol_state_t *zv)
+{
+ struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
+ struct g_provider *pp = zsg->zsg_provider;
+
+ ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);
+
+ g_topology_assert();
+
+ mutex_enter(&zv->zv_state_lock);
+ VERIFY(zsg->zsg_state == ZVOL_GEOM_RUNNING);
+ mutex_exit(&zv->zv_state_lock);
+ zsg->zsg_provider = NULL;
+ g_wither_geom(pp->geom, ENXIO);
+}
+
+void
+zvol_wait_close(zvol_state_t *zv)
+{
+
+ if (zv->zv_volmode != ZFS_VOLMODE_GEOM)
+ return;
+ mutex_enter(&zv->zv_state_lock);
+ zv->zv_zso->zso_dying = B_TRUE;
+
+ if (zv->zv_open_count)
+ msleep(zv, &zv->zv_state_lock,
+ PRIBIO, "zvol:dying", 10*hz);
+ mutex_exit(&zv->zv_state_lock);
+}
+
+
+static int
+zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace)
+{
+ int count, error, flags;
+
+ g_topology_assert();
+
+ /*
+ * To make it easier we expect either open or close, but not both
+ * at the same time.
+ */
+ KASSERT((acr >= 0 && acw >= 0 && ace >= 0) ||
+ (acr <= 0 && acw <= 0 && ace <= 0),
+ ("Unsupported access request to %s (acr=%d, acw=%d, ace=%d).",
+ pp->name, acr, acw, ace));
+
+ if (pp->private == NULL) {
+ if (acr <= 0 && acw <= 0 && ace <= 0)
+ return (0);
+ return (pp->error);
+ }
+
+ /*
+ * We don't pass FEXCL flag to zvol_geom_open()/zvol_geom_close() if
+ * ace != 0, because GEOM already handles that and handles it a bit
+ * differently. GEOM allows for multiple read/exclusive consumers and
+ * ZFS allows only one exclusive consumer, no matter if it is reader or
+ * writer. I like the way GEOM works better, so I'll leave it to GEOM
+ * to decide what to do.
+ */
+
+ count = acr + acw + ace;
+ if (count == 0)
+ return (0);
+
+ flags = 0;
+ if (acr != 0 || ace != 0)
+ flags |= FREAD;
+ if (acw != 0)
+ flags |= FWRITE;
+
+ g_topology_unlock();
+ if (count > 0)
+ error = zvol_geom_open(pp, flags, count);
+ else
+ error = zvol_geom_close(pp, flags, -count);
+ g_topology_lock();
+ return (error);
+}
+
+static void
+zvol_geom_worker(void *arg)
+{
+ zvol_state_t *zv = arg;
+ struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
+ struct bio *bp;
+
+ ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);
+
+ thread_lock(curthread);
+ sched_prio(curthread, PRIBIO);
+ thread_unlock(curthread);
+
+ for (;;) {
+ mtx_lock(&zsg->zsg_queue_mtx);
+ bp = bioq_takefirst(&zsg->zsg_queue);
+ if (bp == NULL) {
+ if (zsg->zsg_state == ZVOL_GEOM_STOPPED) {
+ zsg->zsg_state = ZVOL_GEOM_RUNNING;
+ wakeup(&zsg->zsg_state);
+ mtx_unlock(&zsg->zsg_queue_mtx);
+ kthread_exit();
+ }
+ msleep(&zsg->zsg_queue, &zsg->zsg_queue_mtx,
+ PRIBIO | PDROP, "zvol:io", 0);
+ continue;
+ }
+ mtx_unlock(&zsg->zsg_queue_mtx);
+ zvol_geom_bio_strategy(bp);
+ }
+}
+
+static void
+zvol_geom_bio_start(struct bio *bp)
+{
+ zvol_state_t *zv = bp->bio_to->private;
+ struct zvol_state_geom *zsg;
+ boolean_t first;
+
+ if (zv == NULL) {
+ g_io_deliver(bp, ENXIO);
+ return;
+ }
+ if (bp->bio_cmd == BIO_GETATTR) {
+ if (zvol_geom_bio_getattr(bp))
+ g_io_deliver(bp, EOPNOTSUPP);
+ return;
+ }
+
+ if (!THREAD_CAN_SLEEP()) {
+ zsg = &zv->zv_zso->zso_geom;
+ mtx_lock(&zsg->zsg_queue_mtx);
+ first = (bioq_first(&zsg->zsg_queue) == NULL);
+ bioq_insert_tail(&zsg->zsg_queue, bp);
+ mtx_unlock(&zsg->zsg_queue_mtx);
+ if (first)
+ wakeup_one(&zsg->zsg_queue);
+ return;
+ }
+
+ zvol_geom_bio_strategy(bp);
+}
+
+static int
+zvol_geom_bio_getattr(struct bio *bp)
+{
+ zvol_state_t *zv;
+
+ zv = bp->bio_to->private;
+ ASSERT3P(zv, !=, NULL);
+
+ spa_t *spa = dmu_objset_spa(zv->zv_objset);
+ uint64_t refd, avail, usedobjs, availobjs;
+
+ if (g_handleattr_int(bp, "GEOM::candelete", 1))
+ return (0);
+ if (strcmp(bp->bio_attribute, "blocksavail") == 0) {
+ dmu_objset_space(zv->zv_objset, &refd, &avail,
+ &usedobjs, &availobjs);
+ if (g_handleattr_off_t(bp, "blocksavail", avail / DEV_BSIZE))
+ return (0);
+ } else if (strcmp(bp->bio_attribute, "blocksused") == 0) {
+ dmu_objset_space(zv->zv_objset, &refd, &avail,
+ &usedobjs, &availobjs);
+ if (g_handleattr_off_t(bp, "blocksused", refd / DEV_BSIZE))
+ return (0);
+ } else if (strcmp(bp->bio_attribute, "poolblocksavail") == 0) {
+ avail = metaslab_class_get_space(spa_normal_class(spa));
+ avail -= metaslab_class_get_alloc(spa_normal_class(spa));
+ if (g_handleattr_off_t(bp, "poolblocksavail",
+ avail / DEV_BSIZE))
+ return (0);
+ } else if (strcmp(bp->bio_attribute, "poolblocksused") == 0) {
+ refd = metaslab_class_get_alloc(spa_normal_class(spa));
+ if (g_handleattr_off_t(bp, "poolblocksused", refd / DEV_BSIZE))
+ return (0);
+ }
+ return (1);
+}
+
+static void
+zvol_geom_bio_strategy(struct bio *bp)
+{
+ zvol_state_t *zv;
+ uint64_t off, volsize;
+ size_t resid;
+ char *addr;
+ objset_t *os;
+ zfs_locked_range_t *lr;
+ int error = 0;
+ boolean_t doread = B_FALSE;
+ boolean_t is_dumpified;
+ boolean_t sync;
+
+ if (bp->bio_to)
+ zv = bp->bio_to->private;
+ else
+ zv = bp->bio_dev->si_drv2;
+
+ if (zv == NULL) {
+ error = SET_ERROR(ENXIO);
+ goto out;
+ }
+
+ rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
+
+ switch (bp->bio_cmd) {
+ case BIO_READ:
+ doread = B_TRUE;
+ break;
+ case BIO_WRITE:
+ case BIO_FLUSH:
+ case BIO_DELETE:
+ if (zv->zv_flags & ZVOL_RDONLY) {
+ error = SET_ERROR(EROFS);
+ goto resume;
+ }
+ zvol_ensure_zilog(zv);
+ if (bp->bio_cmd == BIO_FLUSH)
+ goto sync;
+ break;
+ default:
+ error = SET_ERROR(EOPNOTSUPP);
+ goto resume;
+ }
+
+ off = bp->bio_offset;
+ volsize = zv->zv_volsize;
+
+ os = zv->zv_objset;
+ ASSERT3P(os, !=, NULL);
+
+ addr = bp->bio_data;
+ resid = bp->bio_length;
+
+ if (resid > 0 && off >= volsize) {
+ error = SET_ERROR(EIO);
+ goto resume;
+ }
+
+ is_dumpified = B_FALSE;
+ sync = !doread && !is_dumpified &&
+ zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;
+
+ /*
+ * There must be no buffer changes when doing a dmu_sync() because
+ * we can't change the data whilst calculating the checksum.
+ */
+ lr = zfs_rangelock_enter(&zv->zv_rangelock, off, resid,
+ doread ? RL_READER : RL_WRITER);
+
+ if (bp->bio_cmd == BIO_DELETE) {
+ dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error != 0) {
+ dmu_tx_abort(tx);
+ } else {
+ zvol_log_truncate(zv, tx, off, resid, sync);
+ dmu_tx_commit(tx);
+ error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
+ off, resid);
+ resid = 0;
+ }
+ goto unlock;
+ }
+ while (resid != 0 && off < volsize) {
+ size_t size = MIN(resid, zvol_maxphys);
+ if (doread) {
+ error = dmu_read(os, ZVOL_OBJ, off, size, addr,
+ DMU_READ_PREFETCH);
+ } else {
+ dmu_tx_t *tx = dmu_tx_create(os);
+ dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, size);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ dmu_tx_abort(tx);
+ } else {
+ dmu_write(os, ZVOL_OBJ, off, size, addr, tx);
+ zvol_log_write(zv, tx, off, size, sync);
+ dmu_tx_commit(tx);
+ }
+ }
+ if (error) {
+ /* convert checksum errors into IO errors */
+ if (error == ECKSUM)
+ error = SET_ERROR(EIO);
+ break;
+ }
+ off += size;
+ addr += size;
+ resid -= size;
+ }
+unlock:
+ zfs_rangelock_exit(lr);
+
+ bp->bio_completed = bp->bio_length - resid;
+ if (bp->bio_completed < bp->bio_length && off > volsize)
+ error = SET_ERROR(EINVAL);
+
+ switch (bp->bio_cmd) {
+ case BIO_FLUSH:
+ break;
+ case BIO_READ:
+ dataset_kstats_update_read_kstats(&zv->zv_kstat,
+ bp->bio_completed);
+ break;
+ case BIO_WRITE:
+ dataset_kstats_update_write_kstats(&zv->zv_kstat,
+ bp->bio_completed);
+ break;
+ case BIO_DELETE:
+ break;
+ default:
+ break;
+ }
+
+ if (sync) {
+sync:
+ zil_commit(zv->zv_zilog, ZVOL_OBJ);
+ }
+resume:
+ rw_exit(&zv->zv_suspend_lock);
+out:
+ if (bp->bio_to)
+ g_io_deliver(bp, error);
+ else
+ biofinish(bp, NULL, error);
+}
+
+/*
+ * Character device mode implementation
+ */
+
+static int
+zvol_cdev_read(struct cdev *dev, struct uio *uio_s, int ioflag)
+{
+ zvol_state_t *zv;
+ uint64_t volsize;
+ zfs_locked_range_t *lr;
+ int error = 0;
+ zfs_uio_t uio;
+
+ zfs_uio_init(&uio, uio_s);
+
+ zv = dev->si_drv2;
+
+ volsize = zv->zv_volsize;
+ /*
+ * uio_loffset == volsize isn't an error as
+ * it's required for EOF processing.
+ */
+ if (zfs_uio_resid(&uio) > 0 &&
+ (zfs_uio_offset(&uio) < 0 || zfs_uio_offset(&uio) > volsize))
+ return (SET_ERROR(EIO));
+
+ lr = zfs_rangelock_enter(&zv->zv_rangelock, zfs_uio_offset(&uio),
+ zfs_uio_resid(&uio), RL_READER);
+ while (zfs_uio_resid(&uio) > 0 && zfs_uio_offset(&uio) < volsize) {
+ uint64_t bytes = MIN(zfs_uio_resid(&uio), DMU_MAX_ACCESS >> 1);
+
+ /* don't read past the end */
+ if (bytes > volsize - zfs_uio_offset(&uio))
+ bytes = volsize - zfs_uio_offset(&uio);
+
+ error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes);
+ if (error) {
+ /* convert checksum errors into IO errors */
+ if (error == ECKSUM)
+ error = SET_ERROR(EIO);
+ break;
+ }
+ }
+ zfs_rangelock_exit(lr);
+
+ return (error);
+}
+
+static int
+zvol_cdev_write(struct cdev *dev, struct uio *uio_s, int ioflag)
+{
+ zvol_state_t *zv;
+ uint64_t volsize;
+ zfs_locked_range_t *lr;
+ int error = 0;
+ boolean_t sync;
+ zfs_uio_t uio;
+
+ zv = dev->si_drv2;
+
+ volsize = zv->zv_volsize;
+
+ zfs_uio_init(&uio, uio_s);
+
+ if (zfs_uio_resid(&uio) > 0 &&
+ (zfs_uio_offset(&uio) < 0 || zfs_uio_offset(&uio) > volsize))
+ return (SET_ERROR(EIO));
+
+ sync = (ioflag & IO_SYNC) ||
+ (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);
+
+ rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
+ zvol_ensure_zilog(zv);
+
+ lr = zfs_rangelock_enter(&zv->zv_rangelock, zfs_uio_offset(&uio),
+ zfs_uio_resid(&uio), RL_WRITER);
+ while (zfs_uio_resid(&uio) > 0 && zfs_uio_offset(&uio) < volsize) {
+ uint64_t bytes = MIN(zfs_uio_resid(&uio), DMU_MAX_ACCESS >> 1);
+ uint64_t off = zfs_uio_offset(&uio);
+ dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
+
+ if (bytes > volsize - off) /* don't write past the end */
+ bytes = volsize - off;
+
+ dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, bytes);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ dmu_tx_abort(tx);
+ break;
+ }
+ error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx);
+ if (error == 0)
+ zvol_log_write(zv, tx, off, bytes, sync);
+ dmu_tx_commit(tx);
+
+ if (error)
+ break;
+ }
+ zfs_rangelock_exit(lr);
+ if (sync)
+ zil_commit(zv->zv_zilog, ZVOL_OBJ);
+ rw_exit(&zv->zv_suspend_lock);
+ return (error);
+}
+
+static int
+zvol_cdev_open(struct cdev *dev, int flags, int fmt, struct thread *td)
+{
+ zvol_state_t *zv;
+ struct zvol_state_dev *zsd;
+ int err = 0;
+ boolean_t drop_suspend = B_FALSE;
+ boolean_t drop_namespace = B_FALSE;
+
+retry:
+ rw_enter(&zvol_state_lock, ZVOL_RW_READER);
+ zv = dev->si_drv2;
+ if (zv == NULL) {
+ rw_exit(&zvol_state_lock);
+ err = SET_ERROR(ENXIO);
+ goto out_locked;
+ }
+
+ if (zv->zv_open_count == 0 && !mutex_owned(&spa_namespace_lock)) {
+ /*
+ * We need to guarantee that the namespace lock is held
+ * to avoid spurious failures in zvol_first_open.
+ */
+ drop_namespace = B_TRUE;
+ if (!mutex_tryenter(&spa_namespace_lock)) {
+ rw_exit(&zvol_state_lock);
+ mutex_enter(&spa_namespace_lock);
+ goto retry;
+ }
+ }
+ mutex_enter(&zv->zv_state_lock);
+
+ ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_DEV);
+
+ /*
+ * make sure zvol is not suspended during first open
+ * (hold zv_suspend_lock) and respect proper lock acquisition
+ * ordering - zv_suspend_lock before zv_state_lock
+ */
+ if (zv->zv_open_count == 0) {
+ drop_suspend = B_TRUE;
+ if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
+ mutex_exit(&zv->zv_state_lock);
+ rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
+ mutex_enter(&zv->zv_state_lock);
+ /* check to see if zv_suspend_lock is needed */
+ if (zv->zv_open_count != 0) {
+ rw_exit(&zv->zv_suspend_lock);
+ drop_suspend = B_FALSE;
+ }
+ }
+ }
+ rw_exit(&zvol_state_lock);
+
+ ASSERT(MUTEX_HELD(&zv->zv_state_lock));
+
+ if (zv->zv_open_count == 0) {
+ ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
+ err = zvol_first_open(zv, !(flags & FWRITE));
+ if (err)
+ goto out_zv_locked;
+ }
+
+ if ((flags & FWRITE) && (zv->zv_flags & ZVOL_RDONLY)) {
+ err = SET_ERROR(EROFS);
+ goto out_opened;
+ }
+ if (zv->zv_flags & ZVOL_EXCL) {
+ err = SET_ERROR(EBUSY);
+ goto out_opened;
+ }
+#ifdef FEXCL
+ if (flags & FEXCL) {
+ if (zv->zv_open_count != 0) {
+ err = SET_ERROR(EBUSY);
+ goto out_opened;
+ }
+ zv->zv_flags |= ZVOL_EXCL;
+ }
+#endif
+
+ zv->zv_open_count++;
+ if (flags & (FSYNC | FDSYNC)) {
+ zsd = &zv->zv_zso->zso_dev;
+ zsd->zsd_sync_cnt++;
+ if (zsd->zsd_sync_cnt == 1 &&
+ (zv->zv_flags & ZVOL_WRITTEN_TO) != 0)
+ zil_async_to_sync(zv->zv_zilog, ZVOL_OBJ);
+ }
+out_opened:
+ if (zv->zv_open_count == 0) {
+ zvol_last_close(zv);
+ wakeup(zv);
+ }
+out_zv_locked:
+ mutex_exit(&zv->zv_state_lock);
+out_locked:
+ if (drop_namespace)
+ mutex_exit(&spa_namespace_lock);
+ if (drop_suspend)
+ rw_exit(&zv->zv_suspend_lock);
+ return (err);
+}
+
+static int
+zvol_cdev_close(struct cdev *dev, int flags, int fmt, struct thread *td)
+{
+ zvol_state_t *zv;
+ struct zvol_state_dev *zsd;
+ boolean_t drop_suspend = B_TRUE;
+
+ rw_enter(&zvol_state_lock, ZVOL_RW_READER);
+ zv = dev->si_drv2;
+ if (zv == NULL) {
+ rw_exit(&zvol_state_lock);
+ return (SET_ERROR(ENXIO));
+ }
+
+ mutex_enter(&zv->zv_state_lock);
+ if (zv->zv_flags & ZVOL_EXCL) {
+ ASSERT3U(zv->zv_open_count, ==, 1);
+ zv->zv_flags &= ~ZVOL_EXCL;
+ }
+
+ ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_DEV);
+
+ /*
+ * If the open count is zero, this is a spurious close.
+ * That indicates a bug in the kernel / DDI framework.
+ */
+ ASSERT3U(zv->zv_open_count, >, 0);
+ /*
+ * make sure zvol is not suspended during last close
+ * (hold zv_suspend_lock) and respect proper lock acquisition
+ * ordering - zv_suspend_lock before zv_state_lock
+ */
+ if (zv->zv_open_count == 1) {
+ if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
+ mutex_exit(&zv->zv_state_lock);
+ rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
+ mutex_enter(&zv->zv_state_lock);
+ /* check to see if zv_suspend_lock is needed */
+ if (zv->zv_open_count != 1) {
+ rw_exit(&zv->zv_suspend_lock);
+ drop_suspend = B_FALSE;
+ }
+ }
+ } else {
+ drop_suspend = B_FALSE;
+ }
+ rw_exit(&zvol_state_lock);
+
+ ASSERT(MUTEX_HELD(&zv->zv_state_lock));
+
+ /*
+ * You may get multiple opens, but only one close.
+ */
+ zv->zv_open_count--;
+ if (flags & (FSYNC | FDSYNC)) {
+ zsd = &zv->zv_zso->zso_dev;
+ zsd->zsd_sync_cnt--;
+ }
+
+ if (zv->zv_open_count == 0) {
+ ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
+ zvol_last_close(zv);
+ wakeup(zv);
+ }
+
+ mutex_exit(&zv->zv_state_lock);
+
+ if (drop_suspend)
+ rw_exit(&zv->zv_suspend_lock);
+ return (0);
+}
+
+static int
+zvol_cdev_ioctl(struct cdev *dev, ulong_t cmd, caddr_t data,
+ int fflag, struct thread *td)
+{
+ zvol_state_t *zv;
+ zfs_locked_range_t *lr;
+ off_t offset, length;
+ int i, error;
+ boolean_t sync;
+
+ zv = dev->si_drv2;
+
+ error = 0;
+ KASSERT(zv->zv_open_count > 0,
+ ("Device with zero access count in %s", __func__));
+
+ i = IOCPARM_LEN(cmd);
+ switch (cmd) {
+ case DIOCGSECTORSIZE:
+ *(uint32_t *)data = DEV_BSIZE;
+ break;
+ case DIOCGMEDIASIZE:
+ *(off_t *)data = zv->zv_volsize;
+ break;
+ case DIOCGFLUSH:
+ rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
+ if (zv->zv_zilog != NULL)
+ zil_commit(zv->zv_zilog, ZVOL_OBJ);
+ rw_exit(&zv->zv_suspend_lock);
+ break;
+ case DIOCGDELETE:
+ if (!zvol_unmap_enabled)
+ break;
+
+ offset = ((off_t *)data)[0];
+ length = ((off_t *)data)[1];
+ if ((offset % DEV_BSIZE) != 0 || (length % DEV_BSIZE) != 0 ||
+ offset < 0 || offset >= zv->zv_volsize ||
+ length <= 0) {
+ printf("%s: offset=%jd length=%jd\n", __func__, offset,
+ length);
+ error = SET_ERROR(EINVAL);
+ break;
+ }
+ rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
+ zvol_ensure_zilog(zv);
+ lr = zfs_rangelock_enter(&zv->zv_rangelock, offset, length,
+ RL_WRITER);
+ dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error != 0) {
+ sync = FALSE;
+ dmu_tx_abort(tx);
+ } else {
+ sync = (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);
+ zvol_log_truncate(zv, tx, offset, length, sync);
+ dmu_tx_commit(tx);
+ error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
+ offset, length);
+ }
+ zfs_rangelock_exit(lr);
+ if (sync)
+ zil_commit(zv->zv_zilog, ZVOL_OBJ);
+ rw_exit(&zv->zv_suspend_lock);
+ break;
+ case DIOCGSTRIPESIZE:
+ *(off_t *)data = zv->zv_volblocksize;
+ break;
+ case DIOCGSTRIPEOFFSET:
+ *(off_t *)data = 0;
+ break;
+ case DIOCGATTR: {
+ spa_t *spa = dmu_objset_spa(zv->zv_objset);
+ struct diocgattr_arg *arg = (struct diocgattr_arg *)data;
+ uint64_t refd, avail, usedobjs, availobjs;
+
+ if (strcmp(arg->name, "GEOM::candelete") == 0)
+ arg->value.i = 1;
+ else if (strcmp(arg->name, "blocksavail") == 0) {
+ dmu_objset_space(zv->zv_objset, &refd, &avail,
+ &usedobjs, &availobjs);
+ arg->value.off = avail / DEV_BSIZE;
+ } else if (strcmp(arg->name, "blocksused") == 0) {
+ dmu_objset_space(zv->zv_objset, &refd, &avail,
+ &usedobjs, &availobjs);
+ arg->value.off = refd / DEV_BSIZE;
+ } else if (strcmp(arg->name, "poolblocksavail") == 0) {
+ avail = metaslab_class_get_space(spa_normal_class(spa));
+ avail -= metaslab_class_get_alloc(
+ spa_normal_class(spa));
+ arg->value.off = avail / DEV_BSIZE;
+ } else if (strcmp(arg->name, "poolblocksused") == 0) {
+ refd = metaslab_class_get_alloc(spa_normal_class(spa));
+ arg->value.off = refd / DEV_BSIZE;
+ } else
+ error = SET_ERROR(ENOIOCTL);
+ break;
+ }
+ case FIOSEEKHOLE:
+ case FIOSEEKDATA: {
+ off_t *off = (off_t *)data;
+ uint64_t noff;
+ boolean_t hole;
+
+ hole = (cmd == FIOSEEKHOLE);
+ noff = *off;
+ error = dmu_offset_next(zv->zv_objset, ZVOL_OBJ, hole, &noff);
+ *off = noff;
+ break;
+ }
+ default:
+ error = SET_ERROR(ENOIOCTL);
+ }
+
+ return (error);
+}
+
+/*
+ * Misc. helpers
+ */
+
+static void
+zvol_ensure_zilog(zvol_state_t *zv)
+{
+ ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
+
+ /*
+ * Open a ZIL if this is the first time we have written to this
+ * zvol. We protect zv->zv_zilog with zv_suspend_lock rather
+ * than zv_state_lock so that we don't need to acquire an
+ * additional lock in this path.
+ */
+ if (zv->zv_zilog == NULL) {
+ if (!rw_tryupgrade(&zv->zv_suspend_lock)) {
+ rw_exit(&zv->zv_suspend_lock);
+ rw_enter(&zv->zv_suspend_lock, RW_WRITER);
+ }
+ if (zv->zv_zilog == NULL) {
+ zv->zv_zilog = zil_open(zv->zv_objset,
+ zvol_get_data);
+ zv->zv_flags |= ZVOL_WRITTEN_TO;
+ }
+ rw_downgrade(&zv->zv_suspend_lock);
+ }
+}
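
zvol_ensure_zilog() is a lazy, double-checked initialization guarded by zv_suspend_lock: readers peek at zv_zilog, and only the first writer upgrades to the write lock, re-checks, opens the ZIL, and downgrades. POSIX rwlocks cannot upgrade or downgrade in place, so the sketch below (with made-up names) drops and retakes the lock instead, which is also the fallback the kernel code uses when rw_tryupgrade() fails:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

static pthread_rwlock_t suspend_lock = PTHREAD_RWLOCK_INITIALIZER;
static void *zilog;	/* stands in for zv->zv_zilog */

static void *
open_zilog(void)
{
	return (malloc(1));	/* placeholder for zil_open() */
}

/* lazily initialize zilog; callers normally hold only the read lock */
static void
ensure_zilog(void)
{
	pthread_rwlock_rdlock(&suspend_lock);
	if (zilog == NULL) {
		/* POSIX rwlocks can't upgrade: drop and retake as writer */
		pthread_rwlock_unlock(&suspend_lock);
		pthread_rwlock_wrlock(&suspend_lock);
		if (zilog == NULL)	/* re-check after the gap */
			zilog = open_zilog();
	}
	/* the kernel keeps the lock (downgraded); this sketch just drops it */
	pthread_rwlock_unlock(&suspend_lock);
}

int
main(void)
{
	ensure_zilog();
	printf("zilog %s\n", zilog != NULL ? "initialized" : "missing");
	free(zilog);
	return (0);
}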
+
+static boolean_t
+zvol_is_zvol_impl(const char *device)
+{
+ return (device && strncmp(device, ZVOL_DIR, strlen(ZVOL_DIR)) == 0);
+}
+
+static void
+zvol_rename_minor(zvol_state_t *zv, const char *newname)
+{
+ ASSERT(RW_LOCK_HELD(&zvol_state_lock));
+ ASSERT(MUTEX_HELD(&zv->zv_state_lock));
+
+ /* move to new hashtable entry */
+ zv->zv_hash = zvol_name_hash(zv->zv_name);
+ hlist_del(&zv->zv_hlink);
+ hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash));
+
+ if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
+ struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
+ struct g_provider *pp = zsg->zsg_provider;
+ struct g_geom *gp;
+
+ g_topology_lock();
+ gp = pp->geom;
+ ASSERT3P(gp, !=, NULL);
+
+ zsg->zsg_provider = NULL;
+ g_wither_provider(pp, ENXIO);
+
+ pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, newname);
+ pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
+ pp->sectorsize = DEV_BSIZE;
+ pp->mediasize = zv->zv_volsize;
+ pp->private = zv;
+ zsg->zsg_provider = pp;
+ g_error_provider(pp, 0);
+ g_topology_unlock();
+ } else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
+ struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
+ struct cdev *dev;
+ struct make_dev_args args;
+
+ dev = zsd->zsd_cdev;
+ if (dev != NULL) {
+ destroy_dev(dev);
+ dev = zsd->zsd_cdev = NULL;
+ if (zv->zv_open_count > 0) {
+ zv->zv_flags &= ~ZVOL_EXCL;
+ zv->zv_open_count = 0;
+ /* XXX need suspend lock but lock order */
+ zvol_last_close(zv);
+ }
+ }
+
+ make_dev_args_init(&args);
+ args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
+ args.mda_devsw = &zvol_cdevsw;
+ args.mda_cr = NULL;
+ args.mda_uid = UID_ROOT;
+ args.mda_gid = GID_OPERATOR;
+ args.mda_mode = 0640;
+ args.mda_si_drv2 = zv;
+ if (make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, newname)
+ == 0) {
+ dev->si_iosize_max = maxphys;
+ zsd->zsd_cdev = dev;
+ }
+ }
+ strlcpy(zv->zv_name, newname, sizeof (zv->zv_name));
+}
+
+/*
+ * Remove minor node for the specified volume.
+ */
+static void
+zvol_free(zvol_state_t *zv)
+{
+ ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
+ ASSERT(!MUTEX_HELD(&zv->zv_state_lock));
+ ASSERT0(zv->zv_open_count);
+
+ ZFS_LOG(1, "ZVOL %s destroyed.", zv->zv_name);
+
+ rw_destroy(&zv->zv_suspend_lock);
+ zfs_rangelock_fini(&zv->zv_rangelock);
+
+ if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
+ struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
+ struct g_provider *pp __maybe_unused = zsg->zsg_provider;
+
+ ASSERT3P(pp->private, ==, NULL);
+
+ g_topology_lock();
+ zvol_geom_destroy(zv);
+ g_topology_unlock();
+ mtx_destroy(&zsg->zsg_queue_mtx);
+ } else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
+ struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
+ struct cdev *dev = zsd->zsd_cdev;
+
+ ASSERT3P(dev->si_drv2, ==, NULL);
+
+ destroy_dev(dev);
+ }
+
+ mutex_destroy(&zv->zv_state_lock);
+ dataset_kstats_destroy(&zv->zv_kstat);
+ kmem_free(zv->zv_zso, sizeof (struct zvol_state_os));
+ kmem_free(zv, sizeof (zvol_state_t));
+ zvol_minors--;
+}
+
+/*
+ * Create a minor node (plus a whole lot more) for the specified volume.
+ */
+static int
+zvol_create_minor_impl(const char *name)
+{
+ zvol_state_t *zv;
+ objset_t *os;
+ dmu_object_info_t *doi;
+ uint64_t volsize;
+ uint64_t volmode, hash;
+ int error;
+
+ ZFS_LOG(1, "Creating ZVOL %s...", name);
+ hash = zvol_name_hash(name);
+ if ((zv = zvol_find_by_name_hash(name, hash, RW_NONE)) != NULL) {
+ ASSERT(MUTEX_HELD(&zv->zv_state_lock));
+ mutex_exit(&zv->zv_state_lock);
+ return (SET_ERROR(EEXIST));
+ }
+
+ DROP_GIANT();
+
+ doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP);
+
+ /* lie and say we're read-only */
+ error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os);
+ if (error)
+ goto out_doi;
+
+ error = dmu_object_info(os, ZVOL_OBJ, doi);
+ if (error)
+ goto out_dmu_objset_disown;
+
+ error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
+ if (error)
+ goto out_dmu_objset_disown;
+
+ error = dsl_prop_get_integer(name,
+ zfs_prop_to_name(ZFS_PROP_VOLMODE), &volmode, NULL);
+ if (error || volmode == ZFS_VOLMODE_DEFAULT)
+ volmode = zvol_volmode;
+ error = 0;
+
+ /*
+ * zvol_alloc equivalent ...
+ */
+ zv = kmem_zalloc(sizeof (*zv), KM_SLEEP);
+ zv->zv_hash = hash;
+ mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL);
+ zv->zv_zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP);
+ zv->zv_volmode = volmode;
+ if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
+ struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
+ struct g_provider *pp;
+ struct g_geom *gp;
+
+ zsg->zsg_state = ZVOL_GEOM_UNINIT;
+ mtx_init(&zsg->zsg_queue_mtx, "zvol", NULL, MTX_DEF);
+
+ g_topology_lock();
+ gp = g_new_geomf(&zfs_zvol_class, "zfs::zvol::%s", name);
+ gp->start = zvol_geom_bio_start;
+ gp->access = zvol_geom_access;
+ pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, name);
+ pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
+ pp->sectorsize = DEV_BSIZE;
+ pp->mediasize = 0;
+ pp->private = zv;
+
+ zsg->zsg_provider = pp;
+ bioq_init(&zsg->zsg_queue);
+ } else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
+ struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
+ struct cdev *dev;
+ struct make_dev_args args;
+
+ make_dev_args_init(&args);
+ args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
+ args.mda_devsw = &zvol_cdevsw;
+ args.mda_cr = NULL;
+ args.mda_uid = UID_ROOT;
+ args.mda_gid = GID_OPERATOR;
+ args.mda_mode = 0640;
+ args.mda_si_drv2 = zv;
+ error = make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, name);
+ if (error) {
+ kmem_free(zv->zv_zso, sizeof (struct zvol_state_os));
+ mutex_destroy(&zv->zv_state_lock);
+ kmem_free(zv, sizeof (*zv));
+ dmu_objset_disown(os, B_TRUE, FTAG);
+ goto out_doi;
+ }
+ dev->si_iosize_max = maxphys;
+ zsd->zsd_cdev = dev;
+ }
+ (void) strlcpy(zv->zv_name, name, MAXPATHLEN);
+ rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL);
+ zfs_rangelock_init(&zv->zv_rangelock, NULL, NULL);
+
+ if (dmu_objset_is_snapshot(os) || !spa_writeable(dmu_objset_spa(os)))
+ zv->zv_flags |= ZVOL_RDONLY;
+
+ zv->zv_volblocksize = doi->doi_data_block_size;
+ zv->zv_volsize = volsize;
+ zv->zv_objset = os;
+
+ if (spa_writeable(dmu_objset_spa(os))) {
+ if (zil_replay_disable)
+ zil_destroy(dmu_objset_zil(os), B_FALSE);
+ else
+ zil_replay(os, zv, zvol_replay_vector);
+ }
+ ASSERT3P(zv->zv_kstat.dk_kstats, ==, NULL);
+ dataset_kstats_create(&zv->zv_kstat, zv->zv_objset);
+
+ /* TODO: prefetch for geom tasting */
+
+ zv->zv_objset = NULL;
+out_dmu_objset_disown:
+ dmu_objset_disown(os, B_TRUE, FTAG);
+
+ if (error == 0 && volmode == ZFS_VOLMODE_GEOM) {
+ zvol_geom_run(zv);
+ g_topology_unlock();
+ }
+out_doi:
+ kmem_free(doi, sizeof (dmu_object_info_t));
+ if (error == 0) {
+ rw_enter(&zvol_state_lock, RW_WRITER);
+ zvol_insert(zv);
+ zvol_minors++;
+ rw_exit(&zvol_state_lock);
+ ZFS_LOG(1, "ZVOL %s created.", name);
+ }
+ PICKUP_GIANT();
+ return (error);
+}
+
+static void
+zvol_clear_private(zvol_state_t *zv)
+{
+ ASSERT(RW_LOCK_HELD(&zvol_state_lock));
+ if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
+ struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
+ struct g_provider *pp = zsg->zsg_provider;
+
+ if (pp->private == NULL) /* already cleared */
+ return;
+
+ mtx_lock(&zsg->zsg_queue_mtx);
+ zsg->zsg_state = ZVOL_GEOM_STOPPED;
+ pp->private = NULL;
+ wakeup_one(&zsg->zsg_queue);
+ while (zsg->zsg_state != ZVOL_GEOM_RUNNING)
+ msleep(&zsg->zsg_state, &zsg->zsg_queue_mtx,
+ 0, "zvol:w", 0);
+ mtx_unlock(&zsg->zsg_queue_mtx);
+ ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
+ } else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
+ struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
+ struct cdev *dev = zsd->zsd_cdev;
+
+ dev->si_drv2 = NULL;
+ }
+}
+
+static int
+zvol_update_volsize(zvol_state_t *zv, uint64_t volsize)
+{
+ zv->zv_volsize = volsize;
+ if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
+ struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
+ struct g_provider *pp = zsg->zsg_provider;
+
+ g_topology_lock();
+
+ if (pp->private == NULL) {
+ g_topology_unlock();
+ return (SET_ERROR(ENXIO));
+ }
+
+ /*
+ * Do not invoke the resize event when the initial size was zero.
+ * ZVOL initializes the size on first open; this is not
+ * real resizing.
+ */
+ if (pp->mediasize == 0)
+ pp->mediasize = zv->zv_volsize;
+ else
+ g_resize_provider(pp, zv->zv_volsize);
+
+ g_topology_unlock();
+ }
+ return (0);
+}
+
+static void
+zvol_set_disk_ro_impl(zvol_state_t *zv, int flags)
+{
+ // XXX? set_disk_ro(zv->zv_zso->zvo_disk, flags);
+}
+
+static void
+zvol_set_capacity_impl(zvol_state_t *zv, uint64_t capacity)
+{
+ // XXX? set_capacity(zv->zv_zso->zvo_disk, capacity);
+}
+
+static const zvol_platform_ops_t zvol_freebsd_ops = {
+ .zv_free = zvol_free,
+ .zv_rename_minor = zvol_rename_minor,
+ .zv_create_minor = zvol_create_minor_impl,
+ .zv_update_volsize = zvol_update_volsize,
+ .zv_clear_private = zvol_clear_private,
+ .zv_is_zvol = zvol_is_zvol_impl,
+ .zv_set_disk_ro = zvol_set_disk_ro_impl,
+ .zv_set_capacity = zvol_set_capacity_impl,
+};
+
+/*
+ * Public interfaces
+ */
+
+int
+zvol_busy(void)
+{
+ return (zvol_minors != 0);
+}
+
+int
+zvol_init(void)
+{
+ zvol_init_impl();
+ zvol_register_ops(&zvol_freebsd_ops);
+ return (0);
+}
+
+void
+zvol_fini(void)
+{
+ zvol_fini_impl();
+}
diff --git a/sys/contrib/openzfs/module/os/linux/spl/Makefile.in b/sys/contrib/openzfs/module/os/linux/spl/Makefile.in
new file mode 100644
index 000000000000..b2325f91b4a7
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/spl/Makefile.in
@@ -0,0 +1,17 @@
+$(MODULE)-objs += ../os/linux/spl/spl-atomic.o
+$(MODULE)-objs += ../os/linux/spl/spl-condvar.o
+$(MODULE)-objs += ../os/linux/spl/spl-cred.o
+$(MODULE)-objs += ../os/linux/spl/spl-err.o
+$(MODULE)-objs += ../os/linux/spl/spl-generic.o
+$(MODULE)-objs += ../os/linux/spl/spl-kmem.o
+$(MODULE)-objs += ../os/linux/spl/spl-kmem-cache.o
+$(MODULE)-objs += ../os/linux/spl/spl-kstat.o
+$(MODULE)-objs += ../os/linux/spl/spl-proc.o
+$(MODULE)-objs += ../os/linux/spl/spl-procfs-list.o
+$(MODULE)-objs += ../os/linux/spl/spl-taskq.o
+$(MODULE)-objs += ../os/linux/spl/spl-thread.o
+$(MODULE)-objs += ../os/linux/spl/spl-trace.o
+$(MODULE)-objs += ../os/linux/spl/spl-tsd.o
+$(MODULE)-objs += ../os/linux/spl/spl-vmem.o
+$(MODULE)-objs += ../os/linux/spl/spl-xdr.o
+$(MODULE)-objs += ../os/linux/spl/spl-zlib.o
diff --git a/sys/contrib/openzfs/module/os/linux/spl/README.md b/sys/contrib/openzfs/module/os/linux/spl/README.md
new file mode 100644
index 000000000000..906530bcf2ad
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/spl/README.md
@@ -0,0 +1,16 @@
+The Solaris Porting Layer, SPL, is a Linux kernel module which provides a
+compatibility layer used by the [OpenZFS](https://github.com/openzfs/zfs) project.
+
+# Installation
+
+The latest version of the SPL is maintained as part of this repository.
+Only when building ZFS version 0.7.x or earlier must an external SPL release
+be used. These releases can be found at:
+
+ * Version 0.7.x: https://github.com/zfsonlinux/spl/tree/spl-0.7-release
+ * Version 0.6.5.x: https://github.com/zfsonlinux/spl/tree/spl-0.6.5-release
+
+# Release
+
+The SPL is released under a GPLv2 license.
+For more details see the NOTICE and THIRDPARTYLICENSE files (release code `UCRL-CODE-235197`).
diff --git a/sys/contrib/openzfs/module/os/linux/spl/THIRDPARTYLICENSE.gplv2 b/sys/contrib/openzfs/module/os/linux/spl/THIRDPARTYLICENSE.gplv2
new file mode 100644
index 000000000000..d159169d1050
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/spl/THIRDPARTYLICENSE.gplv2
@@ -0,0 +1,339 @@
+ GNU GENERAL PUBLIC LICENSE
+ Version 2, June 1991
+
+ Copyright (C) 1989, 1991 Free Software Foundation, Inc.,
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+ Preamble
+
+ The licenses for most software are designed to take away your
+freedom to share and change it. By contrast, the GNU General Public
+License is intended to guarantee your freedom to share and change free
+software--to make sure the software is free for all its users. This
+General Public License applies to most of the Free Software
+Foundation's software and to any other program whose authors commit to
+using it. (Some other Free Software Foundation software is covered by
+the GNU Lesser General Public License instead.) You can apply it to
+your programs, too.
+
+ When we speak of free software, we are referring to freedom, not
+price. Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+this service if you wish), that you receive source code or can get it
+if you want it, that you can change the software or use pieces of it
+in new free programs; and that you know you can do these things.
+
+ To protect your rights, we need to make restrictions that forbid
+anyone to deny you these rights or to ask you to surrender the rights.
+These restrictions translate to certain responsibilities for you if you
+distribute copies of the software, or if you modify it.
+
+ For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must give the recipients all the rights that
+you have. You must make sure that they, too, receive or can get the
+source code. And you must show them these terms so they know their
+rights.
+
+ We protect your rights with two steps: (1) copyright the software, and
+(2) offer you this license which gives you legal permission to copy,
+distribute and/or modify the software.
+
+ Also, for each author's protection and ours, we want to make certain
+that everyone understands that there is no warranty for this free
+software. If the software is modified by someone else and passed on, we
+want its recipients to know that what they have is not the original, so
+that any problems introduced by others will not reflect on the original
+authors' reputations.
+
+ Finally, any free program is threatened constantly by software
+patents. We wish to avoid the danger that redistributors of a free
+program will individually obtain patent licenses, in effect making the
+program proprietary. To prevent this, we have made it clear that any
+patent must be licensed for everyone's free use or not licensed at all.
+
+ The precise terms and conditions for copying, distribution and
+modification follow.
+
+ GNU GENERAL PUBLIC LICENSE
+ TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+ 0. This License applies to any program or other work which contains
+a notice placed by the copyright holder saying it may be distributed
+under the terms of this General Public License. The "Program", below,
+refers to any such program or work, and a "work based on the Program"
+means either the Program or any derivative work under copyright law:
+that is to say, a work containing the Program or a portion of it,
+either verbatim or with modifications and/or translated into another
+language. (Hereinafter, translation is included without limitation in
+the term "modification".) Each licensee is addressed as "you".
+
+Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope. The act of
+running the Program is not restricted, and the output from the Program
+is covered only if its contents constitute a work based on the
+Program (independent of having been made by running the Program).
+Whether that is true depends on what the Program does.
+
+ 1. You may copy and distribute verbatim copies of the Program's
+source code as you receive it, in any medium, provided that you
+conspicuously and appropriately publish on each copy an appropriate
+copyright notice and disclaimer of warranty; keep intact all the
+notices that refer to this License and to the absence of any warranty;
+and give any other recipients of the Program a copy of this License
+along with the Program.
+
+You may charge a fee for the physical act of transferring a copy, and
+you may at your option offer warranty protection in exchange for a fee.
+
+ 2. You may modify your copy or copies of the Program or any portion
+of it, thus forming a work based on the Program, and copy and
+distribute such modifications or work under the terms of Section 1
+above, provided that you also meet all of these conditions:
+
+ a) You must cause the modified files to carry prominent notices
+ stating that you changed the files and the date of any change.
+
+ b) You must cause any work that you distribute or publish, that in
+ whole or in part contains or is derived from the Program or any
+ part thereof, to be licensed as a whole at no charge to all third
+ parties under the terms of this License.
+
+ c) If the modified program normally reads commands interactively
+ when run, you must cause it, when started running for such
+ interactive use in the most ordinary way, to print or display an
+ announcement including an appropriate copyright notice and a
+ notice that there is no warranty (or else, saying that you provide
+ a warranty) and that users may redistribute the program under
+ these conditions, and telling the user how to view a copy of this
+ License. (Exception: if the Program itself is interactive but
+ does not normally print such an announcement, your work based on
+ the Program is not required to print an announcement.)
+
+These requirements apply to the modified work as a whole. If
+identifiable sections of that work are not derived from the Program,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works. But when you
+distribute the same sections as part of a whole which is a work based
+on the Program, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Program.
+
+In addition, mere aggregation of another work not based on the Program
+with the Program (or with a work based on the Program) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+
+ 3. You may copy and distribute the Program (or a work based on it,
+under Section 2) in object code or executable form under the terms of
+Sections 1 and 2 above provided that you also do one of the following:
+
+ a) Accompany it with the complete corresponding machine-readable
+ source code, which must be distributed under the terms of Sections
+ 1 and 2 above on a medium customarily used for software interchange; or,
+
+ b) Accompany it with a written offer, valid for at least three
+ years, to give any third party, for a charge no more than your
+ cost of physically performing source distribution, a complete
+ machine-readable copy of the corresponding source code, to be
+ distributed under the terms of Sections 1 and 2 above on a medium
+ customarily used for software interchange; or,
+
+ c) Accompany it with the information you received as to the offer
+ to distribute corresponding source code. (This alternative is
+ allowed only for noncommercial distribution and only if you
+ received the program in object code or executable form with such
+ an offer, in accord with Subsection b above.)
+
+The source code for a work means the preferred form of the work for
+making modifications to it. For an executable work, complete source
+code means all the source code for all modules it contains, plus any
+associated interface definition files, plus the scripts used to
+control compilation and installation of the executable. However, as a
+special exception, the source code distributed need not include
+anything that is normally distributed (in either source or binary
+form) with the major components (compiler, kernel, and so on) of the
+operating system on which the executable runs, unless that component
+itself accompanies the executable.
+
+If distribution of executable or object code is made by offering
+access to copy from a designated place, then offering equivalent
+access to copy the source code from the same place counts as
+distribution of the source code, even though third parties are not
+compelled to copy the source along with the object code.
+
+ 4. You may not copy, modify, sublicense, or distribute the Program
+except as expressly provided under this License. Any attempt
+otherwise to copy, modify, sublicense or distribute the Program is
+void, and will automatically terminate your rights under this License.
+However, parties who have received copies, or rights, from you under
+this License will not have their licenses terminated so long as such
+parties remain in full compliance.
+
+ 5. You are not required to accept this License, since you have not
+signed it. However, nothing else grants you permission to modify or
+distribute the Program or its derivative works. These actions are
+prohibited by law if you do not accept this License. Therefore, by
+modifying or distributing the Program (or any work based on the
+Program), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Program or works based on it.
+
+ 6. Each time you redistribute the Program (or any work based on the
+Program), the recipient automatically receives a license from the
+original licensor to copy, distribute or modify the Program subject to
+these terms and conditions. You may not impose any further
+restrictions on the recipients' exercise of the rights granted herein.
+You are not responsible for enforcing compliance by third parties to
+this License.
+
+ 7. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License. If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Program at all. For example, if a patent
+license would not permit royalty-free redistribution of the Program by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Program.
+
+If any portion of this section is held invalid or unenforceable under
+any particular circumstance, the balance of the section is intended to
+apply and the section as a whole is intended to apply in other
+circumstances.
+
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system, which is
+implemented by public license practices. Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+
+ 8. If the distribution and/or use of the Program is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Program under this License
+may add an explicit geographical distribution limitation excluding
+those countries, so that distribution is permitted only in or among
+countries not thus excluded. In such case, this License incorporates
+the limitation as if written in the body of this License.
+
+ 9. The Free Software Foundation may publish revised and/or new versions
+of the General Public License from time to time. Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+Each version is given a distinguishing version number. If the Program
+specifies a version number of this License which applies to it and "any
+later version", you have the option of following the terms and conditions
+either of that version or of any later version published by the Free
+Software Foundation. If the Program does not specify a version number of
+this License, you may choose any version ever published by the Free Software
+Foundation.
+
+ 10. If you wish to incorporate parts of the Program into other free
+programs whose distribution conditions are different, write to the author
+to ask for permission. For software which is copyrighted by the Free
+Software Foundation, write to the Free Software Foundation; we sometimes
+make exceptions for this. Our decision will be guided by the two goals
+of preserving the free status of all derivatives of our free software and
+of promoting the sharing and reuse of software generally.
+
+ NO WARRANTY
+
+ 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
+FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN
+OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
+PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
+OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS
+TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE
+PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
+REPAIR OR CORRECTION.
+
+ 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
+REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
+INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
+OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
+TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
+YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
+PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGES.
+
+ END OF TERMS AND CONDITIONS
+
+ How to Apply These Terms to Your New Programs
+
+ If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+ To do so, attach the following notices to the program. It is safest
+to attach them to the start of each source file to most effectively
+convey the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+ <one line to give the program's name and a brief idea of what it does.>
+ Copyright (C) <year> <name of author>
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License along
+ with this program; if not, write to the Free Software Foundation, Inc.,
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+Also add information on how to contact you by electronic and paper mail.
+
+If the program is interactive, make it output a short notice like this
+when it starts in an interactive mode:
+
+ Gnomovision version 69, Copyright (C) year name of author
+ Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+ This is free software, and you are welcome to redistribute it
+ under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License. Of course, the commands you use may
+be called something other than `show w' and `show c'; they could even be
+mouse-clicks or menu items--whatever suits your program.
+
+You should also get your employer (if you work as a programmer) or your
+school, if any, to sign a "copyright disclaimer" for the program, if
+necessary. Here is a sample; alter the names:
+
+ Yoyodyne, Inc., hereby disclaims all copyright interest in the program
+ `Gnomovision' (which makes passes at compilers) written by James Hacker.
+
+ <signature of Ty Coon>, 1 April 1989
+ Ty Coon, President of Vice
+
+This General Public License does not permit incorporating your program into
+proprietary programs. If your program is a subroutine library, you may
+consider it more useful to permit linking proprietary applications with the
+library. If this is what you want to do, use the GNU Lesser General
+Public License instead of this License.
diff --git a/sys/contrib/openzfs/module/os/linux/spl/THIRDPARTYLICENSE.gplv2.descrip b/sys/contrib/openzfs/module/os/linux/spl/THIRDPARTYLICENSE.gplv2.descrip
new file mode 100644
index 000000000000..78535a8ee133
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/spl/THIRDPARTYLICENSE.gplv2.descrip
@@ -0,0 +1 @@
+COMPATIBILITY LAYER FOR OPENZFS ON LINUX
diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-atomic.c b/sys/contrib/openzfs/module/os/linux/spl/spl-atomic.c
new file mode 100644
index 000000000000..accf656fbcc6
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/spl/spl-atomic.c
@@ -0,0 +1,35 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Solaris Porting Layer (SPL) Atomic Implementation.
+ */
+
+#include <sys/atomic.h>
+
+#ifdef ATOMIC_SPINLOCK
+/* Global atomic lock declarations */
+DEFINE_SPINLOCK(atomic32_lock);
+DEFINE_SPINLOCK(atomic64_lock);
+
+EXPORT_SYMBOL(atomic32_lock);
+EXPORT_SYMBOL(atomic64_lock);
+#endif /* ATOMIC_SPINLOCK */
diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-condvar.c b/sys/contrib/openzfs/module/os/linux/spl/spl-condvar.c
new file mode 100644
index 000000000000..d0461a9f1298
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/spl/spl-condvar.c
@@ -0,0 +1,509 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Solaris Porting Layer (SPL) Condition Variables Implementation.
+ */
+
+#include <sys/condvar.h>
+#include <sys/time.h>
+#include <sys/sysmacros.h>
+#include <linux/hrtimer.h>
+#include <linux/compiler_compat.h>
+#include <linux/mod_compat.h>
+
+#include <linux/sched.h>
+
+#ifdef HAVE_SCHED_SIGNAL_HEADER
+#include <linux/sched/signal.h>
+#endif
+
+#define MAX_HRTIMEOUT_SLACK_US 1000
+unsigned int spl_schedule_hrtimeout_slack_us = 0;
+
+static int
+param_set_hrtimeout_slack(const char *buf, zfs_kernel_param_t *kp)
+{
+ unsigned long val;
+ int error;
+
+ error = kstrtoul(buf, 0, &val);
+ if (error)
+ return (error);
+
+ if (val > MAX_HRTIMEOUT_SLACK_US)
+ return (-EINVAL);
+
+ error = param_set_uint(buf, kp);
+ if (error < 0)
+ return (error);
+
+ return (0);
+}
+
+module_param_call(spl_schedule_hrtimeout_slack_us, param_set_hrtimeout_slack,
+ param_get_uint, &spl_schedule_hrtimeout_slack_us, 0644);
+MODULE_PARM_DESC(spl_schedule_hrtimeout_slack_us,
+ "schedule_hrtimeout_range() delta/slack value in us, default(0)");
+
+void
+__cv_init(kcondvar_t *cvp, char *name, kcv_type_t type, void *arg)
+{
+ ASSERT(cvp);
+ ASSERT(name == NULL);
+ ASSERT(type == CV_DEFAULT);
+ ASSERT(arg == NULL);
+
+ cvp->cv_magic = CV_MAGIC;
+ init_waitqueue_head(&cvp->cv_event);
+ init_waitqueue_head(&cvp->cv_destroy);
+ atomic_set(&cvp->cv_waiters, 0);
+ atomic_set(&cvp->cv_refs, 1);
+ cvp->cv_mutex = NULL;
+}
+EXPORT_SYMBOL(__cv_init);
+
+static int
+cv_destroy_wakeup(kcondvar_t *cvp)
+{
+ if (!atomic_read(&cvp->cv_waiters) && !atomic_read(&cvp->cv_refs)) {
+ ASSERT(cvp->cv_mutex == NULL);
+ ASSERT(!waitqueue_active(&cvp->cv_event));
+ return (1);
+ }
+
+ return (0);
+}
+
+void
+__cv_destroy(kcondvar_t *cvp)
+{
+ ASSERT(cvp);
+ ASSERT(cvp->cv_magic == CV_MAGIC);
+
+ cvp->cv_magic = CV_DESTROY;
+ atomic_dec(&cvp->cv_refs);
+
+ /* Block until all waiters are woken and references dropped. */
+ while (cv_destroy_wakeup(cvp) == 0)
+ wait_event_timeout(cvp->cv_destroy, cv_destroy_wakeup(cvp), 1);
+
+ ASSERT3P(cvp->cv_mutex, ==, NULL);
+ ASSERT3S(atomic_read(&cvp->cv_refs), ==, 0);
+ ASSERT3S(atomic_read(&cvp->cv_waiters), ==, 0);
+ ASSERT3S(waitqueue_active(&cvp->cv_event), ==, 0);
+}
+EXPORT_SYMBOL(__cv_destroy);
+
+static void
+cv_wait_common(kcondvar_t *cvp, kmutex_t *mp, int state, int io)
+{
+ DEFINE_WAIT(wait);
+ kmutex_t *m;
+
+ ASSERT(cvp);
+ ASSERT(mp);
+ ASSERT(cvp->cv_magic == CV_MAGIC);
+ ASSERT(mutex_owned(mp));
+ atomic_inc(&cvp->cv_refs);
+
+ m = READ_ONCE(cvp->cv_mutex);
+ if (!m)
+ m = xchg(&cvp->cv_mutex, mp);
+ /* Ensure the same mutex is used by all callers */
+ ASSERT(m == NULL || m == mp);
+
+ prepare_to_wait_exclusive(&cvp->cv_event, &wait, state);
+ atomic_inc(&cvp->cv_waiters);
+
+ /*
+ * The mutex should be dropped after prepare_to_wait(); this
+ * ensures we're linked into the waiters list and avoids the
+ * race where 'cvp->cv_waiters > 0' but the list is empty.
+ */
+ mutex_exit(mp);
+ if (io)
+ io_schedule();
+ else
+ schedule();
+
+ /* No more waiters; a different mutex could be used */
+ if (atomic_dec_and_test(&cvp->cv_waiters)) {
+ /*
+ * This is set without any lock, so it's racy. But this is
+ * just for debug anyway, so make it best-effort
+ */
+ cvp->cv_mutex = NULL;
+ wake_up(&cvp->cv_destroy);
+ }
+
+ finish_wait(&cvp->cv_event, &wait);
+ atomic_dec(&cvp->cv_refs);
+
+ /*
+ * Reacquire the mutex only after we release the cvp; otherwise we
+ * could deadlock with a thread holding the mutex and calling cv_destroy().
+ */
+ mutex_enter(mp);
+}
+
+void
+__cv_wait(kcondvar_t *cvp, kmutex_t *mp)
+{
+ cv_wait_common(cvp, mp, TASK_UNINTERRUPTIBLE, 0);
+}
+EXPORT_SYMBOL(__cv_wait);
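+
+/*
+ * Minimal usage sketch (illustrative only; struct cv_example and its
+ * members are hypothetical names, assumed initialized elsewhere with
+ * mutex_init()/cv_init()).  cv_wait() drops and re-takes the caller's
+ * mutex, so the predicate must be re-checked in a loop.
+ */
+struct cv_example {
+ kmutex_t ce_lock;
+ kcondvar_t ce_cv;
+ boolean_t ce_ready;
+};
+
+static void
+cv_example_wait_ready(struct cv_example *ce)
+{
+ mutex_enter(&ce->ce_lock);
+ while (!ce->ce_ready)
+ cv_wait(&ce->ce_cv, &ce->ce_lock);
+ mutex_exit(&ce->ce_lock);
+}
+
+static void
+cv_example_mark_ready(struct cv_example *ce)
+{
+ mutex_enter(&ce->ce_lock);
+ ce->ce_ready = B_TRUE;
+ cv_signal(&ce->ce_cv); /* wakes a single exclusive waiter */
+ mutex_exit(&ce->ce_lock);
+}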
+
+void
+__cv_wait_io(kcondvar_t *cvp, kmutex_t *mp)
+{
+ cv_wait_common(cvp, mp, TASK_UNINTERRUPTIBLE, 1);
+}
+EXPORT_SYMBOL(__cv_wait_io);
+
+int
+__cv_wait_io_sig(kcondvar_t *cvp, kmutex_t *mp)
+{
+ cv_wait_common(cvp, mp, TASK_INTERRUPTIBLE, 1);
+
+ return (signal_pending(current) ? 0 : 1);
+}
+EXPORT_SYMBOL(__cv_wait_io_sig);
+
+int
+__cv_wait_sig(kcondvar_t *cvp, kmutex_t *mp)
+{
+ cv_wait_common(cvp, mp, TASK_INTERRUPTIBLE, 0);
+
+ return (signal_pending(current) ? 0 : 1);
+}
+EXPORT_SYMBOL(__cv_wait_sig);
+
+void
+__cv_wait_idle(kcondvar_t *cvp, kmutex_t *mp)
+{
+ sigset_t blocked, saved;
+
+ sigfillset(&blocked);
+ (void) sigprocmask(SIG_BLOCK, &blocked, &saved);
+ cv_wait_common(cvp, mp, TASK_INTERRUPTIBLE, 0);
+ (void) sigprocmask(SIG_SETMASK, &saved, NULL);
+}
+EXPORT_SYMBOL(__cv_wait_idle);
+
+#if defined(HAVE_IO_SCHEDULE_TIMEOUT)
+#define spl_io_schedule_timeout(t) io_schedule_timeout(t)
+#else
+
+struct spl_task_timer {
+ struct timer_list timer;
+ struct task_struct *task;
+};
+
+static void
+__cv_wakeup(spl_timer_list_t t)
+{
+ struct timer_list *tmr = (struct timer_list *)t;
+ struct spl_task_timer *task_timer = from_timer(task_timer, tmr, timer);
+
+ wake_up_process(task_timer->task);
+}
+
+static long
+spl_io_schedule_timeout(long time_left)
+{
+ long expire_time = jiffies + time_left;
+ struct spl_task_timer task_timer;
+ struct timer_list *timer = &task_timer.timer;
+
+ task_timer.task = current;
+
+ timer_setup(timer, __cv_wakeup, 0);
+
+ timer->expires = expire_time;
+ add_timer(timer);
+
+ io_schedule();
+
+ del_timer_sync(timer);
+
+ time_left = expire_time - jiffies;
+
+ return (time_left < 0 ? 0 : time_left);
+}
+#endif
+
+/*
+ * 'expire_time' argument is an absolute wall clock time in jiffies.
+ * Return value is 1 if woken before the timeout expired, or -1 if the timeout occurred.
+ */
+static clock_t
+__cv_timedwait_common(kcondvar_t *cvp, kmutex_t *mp, clock_t expire_time,
+ int state, int io)
+{
+ DEFINE_WAIT(wait);
+ kmutex_t *m;
+ clock_t time_left;
+
+ ASSERT(cvp);
+ ASSERT(mp);
+ ASSERT(cvp->cv_magic == CV_MAGIC);
+ ASSERT(mutex_owned(mp));
+
+ /* XXX - Does not handle jiffie wrap properly */
+ time_left = expire_time - jiffies;
+ if (time_left <= 0)
+ return (-1);
+
+ atomic_inc(&cvp->cv_refs);
+ m = READ_ONCE(cvp->cv_mutex);
+ if (!m)
+ m = xchg(&cvp->cv_mutex, mp);
+ /* Ensure the same mutex is used by all callers */
+ ASSERT(m == NULL || m == mp);
+
+ prepare_to_wait_exclusive(&cvp->cv_event, &wait, state);
+ atomic_inc(&cvp->cv_waiters);
+
+ /*
+ * The mutex should be dropped after prepare_to_wait(); this
+ * ensures we're linked into the waiters list and avoids the
+ * race where 'cvp->cv_waiters > 0' but the list is empty.
+ */
+ mutex_exit(mp);
+ if (io)
+ time_left = spl_io_schedule_timeout(time_left);
+ else
+ time_left = schedule_timeout(time_left);
+
+ /* No more waiters; a different mutex could be used */
+ if (atomic_dec_and_test(&cvp->cv_waiters)) {
+ /*
+ * This is set without any lock, so it's racy. But this is
+ * just for debug anyway, so make it best-effort
+ */
+ cvp->cv_mutex = NULL;
+ wake_up(&cvp->cv_destroy);
+ }
+
+ finish_wait(&cvp->cv_event, &wait);
+ atomic_dec(&cvp->cv_refs);
+
+ /*
+ * Reacquire the mutex only after we release the cvp; otherwise we
+ * could deadlock with a thread holding the mutex and calling cv_destroy().
+ */
+ mutex_enter(mp);
+ return (time_left > 0 ? 1 : -1);
+}
+
+int
+__cv_timedwait(kcondvar_t *cvp, kmutex_t *mp, clock_t exp_time)
+{
+ return (__cv_timedwait_common(cvp, mp, exp_time,
+ TASK_UNINTERRUPTIBLE, 0));
+}
+EXPORT_SYMBOL(__cv_timedwait);
+
+int
+__cv_timedwait_io(kcondvar_t *cvp, kmutex_t *mp, clock_t exp_time)
+{
+ return (__cv_timedwait_common(cvp, mp, exp_time,
+ TASK_UNINTERRUPTIBLE, 1));
+}
+EXPORT_SYMBOL(__cv_timedwait_io);
+
+int
+__cv_timedwait_sig(kcondvar_t *cvp, kmutex_t *mp, clock_t exp_time)
+{
+ int rc;
+
+ rc = __cv_timedwait_common(cvp, mp, exp_time, TASK_INTERRUPTIBLE, 0);
+ return (signal_pending(current) ? 0 : rc);
+}
+EXPORT_SYMBOL(__cv_timedwait_sig);
+
+int
+__cv_timedwait_idle(kcondvar_t *cvp, kmutex_t *mp, clock_t exp_time)
+{
+ sigset_t blocked, saved;
+ int rc;
+
+ sigfillset(&blocked);
+ (void) sigprocmask(SIG_BLOCK, &blocked, &saved);
+ rc = __cv_timedwait_common(cvp, mp, exp_time,
+ TASK_INTERRUPTIBLE, 0);
+ (void) sigprocmask(SIG_SETMASK, &saved, NULL);
+
+ return (rc);
+}
+EXPORT_SYMBOL(__cv_timedwait_idle);
+/*
+ * 'expire_time' argument is an absolute clock time in nanoseconds.
+ * Return value is 1 if woken before the deadline, or -1 if the timeout occurred.
+ */
+static clock_t
+__cv_timedwait_hires(kcondvar_t *cvp, kmutex_t *mp, hrtime_t expire_time,
+ hrtime_t res, int state)
+{
+ DEFINE_WAIT(wait);
+ kmutex_t *m;
+ hrtime_t time_left;
+ ktime_t ktime_left;
+ u64 slack = 0;
+ int rc;
+
+ ASSERT(cvp);
+ ASSERT(mp);
+ ASSERT(cvp->cv_magic == CV_MAGIC);
+ ASSERT(mutex_owned(mp));
+
+ time_left = expire_time - gethrtime();
+ if (time_left <= 0)
+ return (-1);
+
+ atomic_inc(&cvp->cv_refs);
+ m = READ_ONCE(cvp->cv_mutex);
+ if (!m)
+ m = xchg(&cvp->cv_mutex, mp);
+ /* Ensure the same mutex is used by all callers */
+ ASSERT(m == NULL || m == mp);
+
+ prepare_to_wait_exclusive(&cvp->cv_event, &wait, state);
+ atomic_inc(&cvp->cv_waiters);
+
+ /*
+ * The mutex should be dropped after prepare_to_wait(); this
+ * ensures we're linked into the waiters list and avoids the
+ * race where 'cvp->cv_waiters > 0' but the list is empty.
+ */
+ mutex_exit(mp);
+
+ ktime_left = ktime_set(0, time_left);
+ slack = MIN(MAX(res, spl_schedule_hrtimeout_slack_us * NSEC_PER_USEC),
+ MAX_HRTIMEOUT_SLACK_US * NSEC_PER_USEC);
+ rc = schedule_hrtimeout_range(&ktime_left, slack, HRTIMER_MODE_REL);
+
+ /* No more waiters; a different mutex could be used */
+ if (atomic_dec_and_test(&cvp->cv_waiters)) {
+ /*
+ * This is set without any lock, so it's racy. But this is
+ * just for debug anyway, so make it best-effort
+ */
+ cvp->cv_mutex = NULL;
+ wake_up(&cvp->cv_destroy);
+ }
+
+ finish_wait(&cvp->cv_event, &wait);
+ atomic_dec(&cvp->cv_refs);
+
+ mutex_enter(mp);
+ return (rc == -EINTR ? 1 : -1);
+}
+
+/*
+ * Compatibility wrapper for the cv_timedwait_hires() Illumos interface.
+ */
+static int
+cv_timedwait_hires_common(kcondvar_t *cvp, kmutex_t *mp, hrtime_t tim,
+ hrtime_t res, int flag, int state)
+{
+ if (!(flag & CALLOUT_FLAG_ABSOLUTE))
+ tim += gethrtime();
+
+ return (__cv_timedwait_hires(cvp, mp, tim, res, state));
+}
+
+int
+cv_timedwait_hires(kcondvar_t *cvp, kmutex_t *mp, hrtime_t tim, hrtime_t res,
+ int flag)
+{
+ return (cv_timedwait_hires_common(cvp, mp, tim, res, flag,
+ TASK_UNINTERRUPTIBLE));
+}
+EXPORT_SYMBOL(cv_timedwait_hires);
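+
+/*
+ * Usage sketch (illustrative only): a bounded wait built on the hires
+ * variant, reusing the hypothetical struct cv_example from the sketch
+ * above.  CALLOUT_FLAG_ABSOLUTE makes 'tim' an absolute gethrtime()
+ * deadline; passing 0 would make it relative instead.
+ */
+static boolean_t
+cv_example_wait_ready_100ms(struct cv_example *ce)
+{
+ hrtime_t deadline = gethrtime() + MSEC2NSEC(100);
+ boolean_t ready;
+ int rc = 1;
+
+ mutex_enter(&ce->ce_lock);
+ while (!ce->ce_ready && rc > 0) {
+ /* returns -1 on timeout, >= 1 when signalled */
+ rc = cv_timedwait_hires(&ce->ce_cv, &ce->ce_lock,
+ deadline, MSEC2NSEC(1), CALLOUT_FLAG_ABSOLUTE);
+ }
+ ready = ce->ce_ready;
+ mutex_exit(&ce->ce_lock);
+
+ return (ready);
+}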
+
+int
+cv_timedwait_sig_hires(kcondvar_t *cvp, kmutex_t *mp, hrtime_t tim,
+ hrtime_t res, int flag)
+{
+ int rc;
+
+ rc = cv_timedwait_hires_common(cvp, mp, tim, res, flag,
+ TASK_INTERRUPTIBLE);
+ return (signal_pending(current) ? 0 : rc);
+}
+EXPORT_SYMBOL(cv_timedwait_sig_hires);
+
+int
+cv_timedwait_idle_hires(kcondvar_t *cvp, kmutex_t *mp, hrtime_t tim,
+ hrtime_t res, int flag)
+{
+ sigset_t blocked, saved;
+ int rc;
+
+ sigfillset(&blocked);
+ (void) sigprocmask(SIG_BLOCK, &blocked, &saved);
+ rc = cv_timedwait_hires_common(cvp, mp, tim, res, flag,
+ TASK_INTERRUPTIBLE);
+ (void) sigprocmask(SIG_SETMASK, &saved, NULL);
+
+ return (rc);
+}
+EXPORT_SYMBOL(cv_timedwait_idle_hires);
+
+void
+__cv_signal(kcondvar_t *cvp)
+{
+ ASSERT(cvp);
+ ASSERT(cvp->cv_magic == CV_MAGIC);
+ atomic_inc(&cvp->cv_refs);
+
+ /*
+ * All waiters are added with WQ_FLAG_EXCLUSIVE so only one
+ * waiter will be set runnable with each call to wake_up().
+ * Additionally wake_up() holds a spin_lock associated with
+ * the wait queue to ensure we don't race waking up processes.
+ */
+ if (atomic_read(&cvp->cv_waiters) > 0)
+ wake_up(&cvp->cv_event);
+
+ atomic_dec(&cvp->cv_refs);
+}
+EXPORT_SYMBOL(__cv_signal);
+
+void
+__cv_broadcast(kcondvar_t *cvp)
+{
+ ASSERT(cvp);
+ ASSERT(cvp->cv_magic == CV_MAGIC);
+ atomic_inc(&cvp->cv_refs);
+
+ /*
+ * wake_up_all() will wake up all waiters, even those which
+ * have the WQ_FLAG_EXCLUSIVE flag set.
+ */
+ if (atomic_read(&cvp->cv_waiters) > 0)
+ wake_up_all(&cvp->cv_event);
+
+ atomic_dec(&cvp->cv_refs);
+}
+EXPORT_SYMBOL(__cv_broadcast);
diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-cred.c b/sys/contrib/openzfs/module/os/linux/spl/spl-cred.c
new file mode 100644
index 000000000000..8fe1cc30ba99
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/spl/spl-cred.c
@@ -0,0 +1,195 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Solaris Porting Layer (SPL) Credential Implementation.
+ */
+
+#include <sys/cred.h>
+
+static int
+cr_groups_search(const struct group_info *group_info, kgid_t grp)
+{
+ unsigned int left, right, mid;
+ int cmp;
+
+ if (!group_info)
+ return (0);
+
+ left = 0;
+ right = group_info->ngroups;
+ while (left < right) {
+ mid = (left + right) / 2;
+ cmp = KGID_TO_SGID(grp) -
+ KGID_TO_SGID(GROUP_AT(group_info, mid));
+
+ if (cmp > 0)
+ left = mid + 1;
+ else if (cmp < 0)
+ right = mid;
+ else
+ return (1);
+ }
+ return (0);
+}
+
+/* Hold a reference on the credential */
+void
+crhold(cred_t *cr)
+{
+ (void) get_cred((const cred_t *)cr);
+}
+
+/* Free a reference on the credential */
+void
+crfree(cred_t *cr)
+{
+ put_cred((const cred_t *)cr);
+}
+
+/* Return the number of supplemental groups */
+int
+crgetngroups(const cred_t *cr)
+{
+ struct group_info *gi;
+ int rc;
+
+ gi = cr->group_info;
+ rc = gi->ngroups;
+#ifndef HAVE_GROUP_INFO_GID
+ /*
+ * For Linux <= 4.8, crgetgroups() will only return gi->blocks[0],
+ * which contains only
+ * the first NGROUPS_PER_BLOCK groups.
+ */
+ if (rc > NGROUPS_PER_BLOCK) {
+ WARN_ON_ONCE(1);
+ rc = NGROUPS_PER_BLOCK;
+ }
+#endif
+ return (rc);
+}
+
+/*
+ * Return an array of supplemental gids. The returned address is safe
+ * to use as long as the caller has taken a reference with crhold().
+ *
+ * Linux 4.9 API change, group_info changed from 2d array via ->blocks to 1d
+ * array via ->gid.
+ */
+gid_t *
+crgetgroups(const cred_t *cr)
+{
+ struct group_info *gi;
+ gid_t *gids = NULL;
+
+ gi = cr->group_info;
+#ifdef HAVE_GROUP_INFO_GID
+ gids = KGIDP_TO_SGIDP(gi->gid);
+#else
+ if (gi->nblocks > 0)
+ gids = KGIDP_TO_SGIDP(gi->blocks[0]);
+#endif
+ return (gids);
+}
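+
+/*
+ * Usage sketch (illustrative only, hypothetical helper): walking the
+ * supplemental groups of a credential.  The crhold()/crfree() pair keeps
+ * the array returned by crgetgroups() valid for the duration of the walk;
+ * groupmember() below is the preferred way to test membership, this
+ * simply demonstrates the accessors.
+ */
+static boolean_t
+cred_example_has_gid(cred_t *cr, gid_t gid)
+{
+ gid_t *groups;
+ boolean_t found = B_FALSE;
+ int i, n;
+
+ crhold(cr);
+ n = crgetngroups(cr);
+ groups = crgetgroups(cr);
+ for (i = 0; i < n; i++) {
+ if (groups[i] == gid) {
+ found = B_TRUE;
+ break;
+ }
+ }
+ crfree(cr);
+
+ return (found);
+}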
+
+/* Check if the passed gid is available in supplied credential. */
+int
+groupmember(gid_t gid, const cred_t *cr)
+{
+ struct group_info *gi;
+ int rc;
+
+ gi = cr->group_info;
+ rc = cr_groups_search(gi, SGID_TO_KGID(gid));
+
+ return (rc);
+}
+
+/* Return the effective user id */
+uid_t
+crgetuid(const cred_t *cr)
+{
+ return (KUID_TO_SUID(cr->euid));
+}
+
+/* Return the real user id */
+uid_t
+crgetruid(const cred_t *cr)
+{
+ return (KUID_TO_SUID(cr->uid));
+}
+
+/* Return the saved user id */
+uid_t
+crgetsuid(const cred_t *cr)
+{
+ return (KUID_TO_SUID(cr->suid));
+}
+
+/* Return the filesystem user id */
+uid_t
+crgetfsuid(const cred_t *cr)
+{
+ return (KUID_TO_SUID(cr->fsuid));
+}
+
+/* Return the effective group id */
+gid_t
+crgetgid(const cred_t *cr)
+{
+ return (KGID_TO_SGID(cr->egid));
+}
+
+/* Return the real group id */
+gid_t
+crgetrgid(const cred_t *cr)
+{
+ return (KGID_TO_SGID(cr->gid));
+}
+
+/* Return the saved group id */
+gid_t
+crgetsgid(const cred_t *cr)
+{
+ return (KGID_TO_SGID(cr->sgid));
+}
+
+/* Return the filesystem group id */
+gid_t
+crgetfsgid(const cred_t *cr)
+{
+ return (KGID_TO_SGID(cr->fsgid));
+}
+
+EXPORT_SYMBOL(crhold);
+EXPORT_SYMBOL(crfree);
+EXPORT_SYMBOL(crgetuid);
+EXPORT_SYMBOL(crgetruid);
+EXPORT_SYMBOL(crgetsuid);
+EXPORT_SYMBOL(crgetfsuid);
+EXPORT_SYMBOL(crgetgid);
+EXPORT_SYMBOL(crgetrgid);
+EXPORT_SYMBOL(crgetsgid);
+EXPORT_SYMBOL(crgetfsgid);
+EXPORT_SYMBOL(crgetngroups);
+EXPORT_SYMBOL(crgetgroups);
+EXPORT_SYMBOL(groupmember);
diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-err.c b/sys/contrib/openzfs/module/os/linux/spl/spl-err.c
new file mode 100644
index 000000000000..10b768d57360
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/spl/spl-err.c
@@ -0,0 +1,123 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Solaris Porting Layer (SPL) Error Implementation.
+ */
+
+#include <sys/sysmacros.h>
+#include <sys/cmn_err.h>
+
+/*
+ * It is often useful to actually have the panic crash the node so you
+ * can then get notified of the event, get the crashdump for later
+ * analysis and other such goodies.
+ * But we still default to not doing that.
+ */
+/* BEGIN CSTYLED */
+unsigned int spl_panic_halt;
+module_param(spl_panic_halt, uint, 0644);
+MODULE_PARM_DESC(spl_panic_halt, "Cause kernel panic on assertion failures");
+/* END CSTYLED */
+
+void
+spl_dumpstack(void)
+{
+ printk("Showing stack for process %d\n", current->pid);
+ dump_stack();
+}
+EXPORT_SYMBOL(spl_dumpstack);
+
+int
+spl_panic(const char *file, const char *func, int line, const char *fmt, ...)
+{
+ const char *newfile;
+ char msg[MAXMSGLEN];
+ va_list ap;
+
+ newfile = strrchr(file, '/');
+ if (newfile != NULL)
+ newfile = newfile + 1;
+ else
+ newfile = file;
+
+ va_start(ap, fmt);
+ (void) vsnprintf(msg, sizeof (msg), fmt, ap);
+ va_end(ap);
+
+ printk(KERN_EMERG "%s", msg);
+ printk(KERN_EMERG "PANIC at %s:%d:%s()\n", newfile, line, func);
+ if (spl_panic_halt)
+ panic("%s", msg);
+
+ spl_dumpstack();
+
+ /* Halt the thread to facilitate further debugging */
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ while (1)
+ schedule();
+
+ /* Unreachable */
+ return (1);
+}
+EXPORT_SYMBOL(spl_panic);
+
+void
+vcmn_err(int ce, const char *fmt, va_list ap)
+{
+ char msg[MAXMSGLEN];
+
+ vsnprintf(msg, MAXMSGLEN, fmt, ap);
+
+ switch (ce) {
+ case CE_IGNORE:
+ break;
+ case CE_CONT:
+ printk("%s", msg);
+ break;
+ case CE_NOTE:
+ printk(KERN_NOTICE "NOTICE: %s\n", msg);
+ break;
+ case CE_WARN:
+ printk(KERN_WARNING "WARNING: %s\n", msg);
+ break;
+ case CE_PANIC:
+ printk(KERN_EMERG "PANIC: %s\n", msg);
+ spl_dumpstack();
+
+ /* Halt the thread to facilitate further debugging */
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ while (1)
+ schedule();
+ }
+} /* vcmn_err() */
+EXPORT_SYMBOL(vcmn_err);
+
+void
+cmn_err(int ce, const char *fmt, ...)
+{
+ va_list ap;
+
+ va_start(ap, fmt);
+ vcmn_err(ce, fmt, ap);
+ va_end(ap);
+} /* cmn_err() */
+EXPORT_SYMBOL(cmn_err);
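+
+/*
+ * Usage sketch (illustrative only, hypothetical helper): the severity
+ * levels map onto printk() levels as implemented above, and a trailing
+ * newline is appended automatically for CE_NOTE/CE_WARN/CE_PANIC.
+ */
+static void
+cmn_err_example(int error)
+{
+ if (error != 0)
+ cmn_err(CE_WARN, "operation failed with error %d", error);
+ else
+ cmn_err(CE_NOTE, "operation completed");
+}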
diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-generic.c b/sys/contrib/openzfs/module/os/linux/spl/spl-generic.c
new file mode 100644
index 000000000000..36fdff72a133
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/spl/spl-generic.c
@@ -0,0 +1,841 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Solaris Porting Layer (SPL) Generic Implementation.
+ */
+
+#include <sys/sysmacros.h>
+#include <sys/systeminfo.h>
+#include <sys/vmsystm.h>
+#include <sys/kmem.h>
+#include <sys/kmem_cache.h>
+#include <sys/vmem.h>
+#include <sys/mutex.h>
+#include <sys/rwlock.h>
+#include <sys/taskq.h>
+#include <sys/tsd.h>
+#include <sys/zmod.h>
+#include <sys/debug.h>
+#include <sys/proc.h>
+#include <sys/kstat.h>
+#include <sys/file.h>
+#include <sys/sunddi.h>
+#include <linux/ctype.h>
+#include <sys/disp.h>
+#include <sys/random.h>
+#include <sys/strings.h>
+#include <linux/kmod.h>
+#include "zfs_gitrev.h"
+#include <linux/mod_compat.h>
+#include <sys/cred.h>
+#include <sys/vnode.h>
+
+char spl_gitrev[64] = ZFS_META_GITREV;
+
+/* BEGIN CSTYLED */
+unsigned long spl_hostid = 0;
+EXPORT_SYMBOL(spl_hostid);
+module_param(spl_hostid, ulong, 0644);
+MODULE_PARM_DESC(spl_hostid, "The system hostid.");
+/* END CSTYLED */
+
+proc_t p0;
+EXPORT_SYMBOL(p0);
+
+/*
+ * Xorshift Pseudo Random Number Generator based on work by Sebastiano Vigna
+ *
+ * "Further scramblings of Marsaglia's xorshift generators"
+ * http://vigna.di.unimi.it/ftp/papers/xorshiftplus.pdf
+ *
+ * random_get_pseudo_bytes() is an API function on Illumos whose sole purpose
+ * is to provide bytes containing random numbers. It is mapped to /dev/urandom
+ * on Illumos, which uses a "FIPS 186-2 algorithm". No user of the SPL's
+ * random_get_pseudo_bytes() needs bytes that are of cryptographic quality, so
+ * we can implement it using a fast PRNG that we seed using Linux' actual
+ * equivalent to random_get_pseudo_bytes(). We do this by providing each CPU
+ * with an independent seed so that all calls to random_get_pseudo_bytes() are
+ * free of atomic instructions.
+ *
+ * A consequence of using a fast PRNG is that using random_get_pseudo_bytes()
+ * to generate words larger than 128 bits will paradoxically be limited to
+ * `2^128 - 1` possibilities. This is because we have a sequence of `2^128 - 1`
+ * 128-bit words and selecting the first will implicitly select the second. If
+ * a caller finds this behavior undesirable, random_get_bytes() should be used
+ * instead.
+ *
+ * XXX: Linux interrupt handlers that trigger within the critical section
+ * formed by `s[1] = xp[1];` and `xp[0] = s[0];` and call this function will
+ * see the same numbers. Nothing in the code currently calls this in an
+ * interrupt handler, so this is considered to be okay. If that becomes a
+ * problem, we could create a set of per-cpu variables for interrupt handlers
+ * and use them when in_interrupt() from linux/preempt_mask.h evaluates to
+ * true.
+ */
+void __percpu *spl_pseudo_entropy;
+
+/*
+ * spl_rand_next()/spl_rand_jump() are copied from the following CC-0 licensed
+ * file:
+ *
+ * http://xorshift.di.unimi.it/xorshift128plus.c
+ */
+
+static inline uint64_t
+spl_rand_next(uint64_t *s)
+{
+ uint64_t s1 = s[0];
+ const uint64_t s0 = s[1];
+ s[0] = s0;
+ s1 ^= s1 << 23; // a
+ s[1] = s1 ^ s0 ^ (s1 >> 18) ^ (s0 >> 5); // b, c
+ return (s[1] + s0);
+}
+
+static inline void
+spl_rand_jump(uint64_t *s)
+{
+ static const uint64_t JUMP[] =
+ { 0x8a5cd789635d2dff, 0x121fd2155c472f96 };
+
+ uint64_t s0 = 0;
+ uint64_t s1 = 0;
+ int i, b;
+ for (i = 0; i < sizeof (JUMP) / sizeof (*JUMP); i++)
+ for (b = 0; b < 64; b++) {
+ if (JUMP[i] & 1ULL << b) {
+ s0 ^= s[0];
+ s1 ^= s[1];
+ }
+ (void) spl_rand_next(s);
+ }
+
+ s[0] = s0;
+ s[1] = s1;
+}
+
+int
+random_get_pseudo_bytes(uint8_t *ptr, size_t len)
+{
+ uint64_t *xp, s[2];
+
+ ASSERT(ptr);
+
+ xp = get_cpu_ptr(spl_pseudo_entropy);
+
+ s[0] = xp[0];
+ s[1] = xp[1];
+
+ while (len) {
+ union {
+ uint64_t ui64;
+ uint8_t byte[sizeof (uint64_t)];
+ } entropy;
+ int i = MIN(len, sizeof (uint64_t));
+
+ len -= i;
+ entropy.ui64 = spl_rand_next(s);
+
+ while (i--)
+ *ptr++ = entropy.byte[i];
+ }
+
+ xp[0] = s[0];
+ xp[1] = s[1];
+
+ put_cpu_ptr(spl_pseudo_entropy);
+
+ return (0);
+}
+
+EXPORT_SYMBOL(random_get_pseudo_bytes);
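+
+/*
+ * Usage sketch (illustrative only, hypothetical helper): fill a buffer
+ * with fast, non-cryptographic pseudo-random bytes.  Callers that need
+ * cryptographic quality must use random_get_bytes() instead, as noted
+ * above.
+ */
+static void
+random_example_fill(uint8_t *buf, size_t len)
+{
+ VERIFY0(random_get_pseudo_bytes(buf, len));
+}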
+
+#if BITS_PER_LONG == 32
+
+/*
+ * Support 64/64 => 64 division on a 32-bit platform. While the kernel
+ * provides a div64_u64() function for this we do not use it because the
+ * implementation is flawed. There are cases which return incorrect
+ * results as late as linux-2.6.35. Until this is fixed upstream the
+ * spl must provide its own implementation.
+ *
+ * This implementation is a slightly modified version of the algorithm
+ * proposed by the book 'Hacker's Delight'. The original source can be
+ * found here and is available for use without restriction.
+ *
+ * http://www.hackersdelight.org/HDcode/newCode/divDouble.c
+ */
+
+/*
+ * Calculate the number of leading zeros for a 64-bit value.
+ */
+static int
+nlz64(uint64_t x)
+{
+ register int n = 0;
+
+ if (x == 0)
+ return (64);
+
+ if (x <= 0x00000000FFFFFFFFULL) { n = n + 32; x = x << 32; }
+ if (x <= 0x0000FFFFFFFFFFFFULL) { n = n + 16; x = x << 16; }
+ if (x <= 0x00FFFFFFFFFFFFFFULL) { n = n + 8; x = x << 8; }
+ if (x <= 0x0FFFFFFFFFFFFFFFULL) { n = n + 4; x = x << 4; }
+ if (x <= 0x3FFFFFFFFFFFFFFFULL) { n = n + 2; x = x << 2; }
+ if (x <= 0x7FFFFFFFFFFFFFFFULL) { n = n + 1; }
+
+ return (n);
+}
+
+/*
+ * Newer kernels have a div_u64() function but we define our own
+ * to simplify portability between kernel versions.
+ */
+static inline uint64_t
+__div_u64(uint64_t u, uint32_t v)
+{
+ (void) do_div(u, v);
+ return (u);
+}
+
+/*
+ * Turn off missing prototypes warning for these functions. They are
+ * replacements for libgcc-provided functions and will never be called
+ * directly.
+ */
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wmissing-prototypes"
+
+/*
+ * Implementation of 64-bit unsigned division for 32-bit machines.
+ *
+ * First the procedure takes care of the case in which the divisor is a
+ * 32-bit quantity. There are two subcases: (1) If the left half of the
+ * dividend is less than the divisor, one execution of do_div() is all that
+ * is required (overflow is not possible). (2) Otherwise it does two
+ * divisions, using the grade school method.
+ */
+uint64_t
+__udivdi3(uint64_t u, uint64_t v)
+{
+ uint64_t u0, u1, v1, q0, q1, k;
+ int n;
+
+ if (v >> 32 == 0) { // If v < 2**32:
+ if (u >> 32 < v) { // If u/v cannot overflow,
+ return (__div_u64(u, v)); // just do one division.
+ } else { // If u/v would overflow:
+ u1 = u >> 32; // Break u into two halves.
+ u0 = u & 0xFFFFFFFF;
+ q1 = __div_u64(u1, v); // First quotient digit.
+ k = u1 - q1 * v; // First remainder, < v.
+ u0 += (k << 32);
+ q0 = __div_u64(u0, v); // Second quotient digit.
+ return ((q1 << 32) + q0);
+ }
+ } else { // If v >= 2**32:
+ n = nlz64(v); // 0 <= n <= 31.
+ v1 = (v << n) >> 32; // Normalize divisor, MSB is 1.
+ u1 = u >> 1; // To ensure no overflow.
+ q1 = __div_u64(u1, v1); // Get quotient from
+ q0 = (q1 << n) >> 31; // Undo normalization and
+ // division of u by 2.
+ if (q0 != 0) // Make q0 correct or
+ q0 = q0 - 1; // too small by 1.
+ if ((u - q0 * v) >= v)
+ q0 = q0 + 1; // Now q0 is correct.
+
+ return (q0);
+ }
+}
+EXPORT_SYMBOL(__udivdi3);
+
+/* BEGIN CSTYLED */
+#ifndef abs64
+#define abs64(x) ({ uint64_t t = (x) >> 63; ((x) ^ t) - t; })
+#endif
+/* END CSTYLED */
+
+/*
+ * Implementation of 64-bit signed division for 32-bit machines.
+ */
+int64_t
+__divdi3(int64_t u, int64_t v)
+{
+ int64_t q, t;
+ q = __udivdi3(abs64(u), abs64(v));
+ t = (u ^ v) >> 63; // If u, v have different
+ return ((q ^ t) - t); // signs, negate q.
+}
+EXPORT_SYMBOL(__divdi3);
+
+/*
+ * Implementation of 64-bit unsigned modulo for 32-bit machines.
+ */
+uint64_t
+__umoddi3(uint64_t dividend, uint64_t divisor)
+{
+ return (dividend - (divisor * __udivdi3(dividend, divisor)));
+}
+EXPORT_SYMBOL(__umoddi3);
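+
+/*
+ * Sanity sketch (illustrative only, hypothetical helper): on 32-bit
+ * builds the compiler emits calls to these helpers for 64-bit '/' and
+ * '%'; the quotient and remainder always satisfy n == q * d + r.
+ */
+static void
+udivmod_example_check(uint64_t n, uint64_t d)
+{
+ uint64_t q = __udivdi3(n, d);
+ uint64_t r = __umoddi3(n, d);
+
+ ASSERT3U(n, ==, q * d + r);
+}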
+
+/* 64-bit signed modulo for 32-bit machines. */
+int64_t
+__moddi3(int64_t n, int64_t d)
+{
+ int64_t q;
+ boolean_t nn = B_FALSE;
+
+ if (n < 0) {
+ nn = B_TRUE;
+ n = -n;
+ }
+ if (d < 0)
+ d = -d;
+
+ q = __umoddi3(n, d);
+
+ return (nn ? -q : q);
+}
+EXPORT_SYMBOL(__moddi3);
+
+/*
+ * Implementation of 64-bit unsigned division/modulo for 32-bit machines.
+ */
+uint64_t
+__udivmoddi4(uint64_t n, uint64_t d, uint64_t *r)
+{
+ uint64_t q = __udivdi3(n, d);
+ if (r)
+ *r = n - d * q;
+ return (q);
+}
+EXPORT_SYMBOL(__udivmoddi4);
+
+/*
+ * Implementation of 64-bit signed division/modulo for 32-bit machines.
+ */
+int64_t
+__divmoddi4(int64_t n, int64_t d, int64_t *r)
+{
+ int64_t q, rr;
+ boolean_t nn = B_FALSE;
+ boolean_t nd = B_FALSE;
+ if (n < 0) {
+ nn = B_TRUE;
+ n = -n;
+ }
+ if (d < 0) {
+ nd = B_TRUE;
+ d = -d;
+ }
+
+ q = __udivmoddi4(n, d, (uint64_t *)&rr);
+
+ if (nn != nd)
+ q = -q;
+ if (nn)
+ rr = -rr;
+ if (r)
+ *r = rr;
+ return (q);
+}
+EXPORT_SYMBOL(__divmoddi4);
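+
+/*
+ * Like native C99 division, the signed helpers above truncate toward zero
+ * and give the remainder the sign of the dividend. For example,
+ * __divmoddi4(-7, 3, &r) returns -2 with r == -1, matching -7 / 3 and
+ * -7 % 3 on a 64-bit build.
+ */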
+
+#if defined(__arm) || defined(__arm__)
+/*
+ * Implementation of 64-bit (un)signed division for 32-bit arm machines.
+ *
+ * Run-time ABI for the ARM Architecture (page 20). A pair of (unsigned)
+ * long longs is returned in {{r0, r1}, {r2, r3}}, the quotient in {r0, r1},
+ * and the remainder in {r2, r3}. The return type is specifically left
+ * set to 'void' to ensure the compiler does not overwrite these registers
+ * during the return. All results are in registers as per the ABI.
+ */
+void
+__aeabi_uldivmod(uint64_t u, uint64_t v)
+{
+ uint64_t res;
+ uint64_t mod;
+
+ res = __udivdi3(u, v);
+ mod = __umoddi3(u, v);
+ {
+ register uint32_t r0 asm("r0") = (res & 0xFFFFFFFF);
+ register uint32_t r1 asm("r1") = (res >> 32);
+ register uint32_t r2 asm("r2") = (mod & 0xFFFFFFFF);
+ register uint32_t r3 asm("r3") = (mod >> 32);
+
+ /* BEGIN CSTYLED */
+ asm volatile(""
+ : "+r"(r0), "+r"(r1), "+r"(r2),"+r"(r3) /* output */
+ : "r"(r0), "r"(r1), "r"(r2), "r"(r3)); /* input */
+ /* END CSTYLED */
+
+ return; /* r0; */
+ }
+}
+EXPORT_SYMBOL(__aeabi_uldivmod);
+
+void
+__aeabi_ldivmod(int64_t u, int64_t v)
+{
+ int64_t res;
+ uint64_t mod;
+
+ res = __divdi3(u, v);
+ mod = __umoddi3(u, v);
+ {
+ register uint32_t r0 asm("r0") = (res & 0xFFFFFFFF);
+ register uint32_t r1 asm("r1") = (res >> 32);
+ register uint32_t r2 asm("r2") = (mod & 0xFFFFFFFF);
+ register uint32_t r3 asm("r3") = (mod >> 32);
+
+ /* BEGIN CSTYLED */
+ asm volatile(""
+ : "+r"(r0), "+r"(r1), "+r"(r2),"+r"(r3) /* output */
+ : "r"(r0), "r"(r1), "r"(r2), "r"(r3)); /* input */
+ /* END CSTYLED */
+
+ return; /* r0; */
+ }
+}
+EXPORT_SYMBOL(__aeabi_ldivmod);
+#endif /* __arm || __arm__ */
+
+#pragma GCC diagnostic pop
+
+#endif /* BITS_PER_LONG */
+
+/*
+ * NOTE: The strtoxx behavior is solely based on my reading of the Solaris
+ * ddi_strtol(9F) man page. I have not verified the behavior of these
+ * functions against their Solaris counterparts. It is possible that I
+ * misinterpreted the man page or that the man page is incorrect.
+ */
+int ddi_strtoul(const char *, char **, int, unsigned long *);
+int ddi_strtol(const char *, char **, int, long *);
+int ddi_strtoull(const char *, char **, int, unsigned long long *);
+int ddi_strtoll(const char *, char **, int, long long *);
+
+#define define_ddi_strtoux(type, valtype) \
+int ddi_strtou##type(const char *str, char **endptr, \
+ int base, valtype *result) \
+{ \
+ valtype last_value, value = 0; \
+ char *ptr = (char *)str; \
+ int flag = 1, digit; \
+ \
+ if (strlen(ptr) == 0) \
+ return (EINVAL); \
+ \
+ /* Auto-detect base based on prefix */ \
+ if (!base) { \
+ if (str[0] == '0') { \
+ if (tolower(str[1]) == 'x' && isxdigit(str[2])) { \
+ base = 16; /* hex */ \
+ ptr += 2; \
+ } else if (str[1] >= '0' && str[1] <= '7') { \
+ base = 8; /* octal */ \
+ ptr += 1; \
+ } else { \
+ return (EINVAL); \
+ } \
+ } else { \
+ base = 10; /* decimal */ \
+ } \
+ } \
+ \
+ while (1) { \
+ if (isdigit(*ptr)) \
+ digit = *ptr - '0'; \
+ else if (isalpha(*ptr)) \
+ digit = tolower(*ptr) - 'a' + 10; \
+ else \
+ break; \
+ \
+ if (digit >= base) \
+ break; \
+ \
+ last_value = value; \
+ value = value * base + digit; \
+ if (last_value > value) /* Overflow */ \
+ return (ERANGE); \
+ \
+ flag = 1; \
+ ptr++; \
+ } \
+ \
+ if (flag) \
+ *result = value; \
+ \
+ if (endptr) \
+ *endptr = (char *)(flag ? ptr : str); \
+ \
+ return (0); \
+} \
+
+#define define_ddi_strtox(type, valtype) \
+int ddi_strto##type(const char *str, char **endptr, \
+ int base, valtype *result) \
+{ \
+ int rc; \
+ \
+ if (*str == '-') { \
+ rc = ddi_strtou##type(str + 1, endptr, base, result); \
+ if (!rc) { \
+ if (*endptr == str + 1) \
+ *endptr = (char *)str; \
+ else \
+ *result = -*result; \
+ } \
+ } else { \
+ rc = ddi_strtou##type(str, endptr, base, result); \
+ } \
+ \
+ return (rc); \
+}
+
+define_ddi_strtoux(l, unsigned long)
+define_ddi_strtox(l, long)
+define_ddi_strtoux(ll, unsigned long long)
+define_ddi_strtox(ll, long long)
+
+EXPORT_SYMBOL(ddi_strtoul);
+EXPORT_SYMBOL(ddi_strtol);
+EXPORT_SYMBOL(ddi_strtoll);
+EXPORT_SYMBOL(ddi_strtoull);
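+
+/*
+ * Hypothetical usage sketch (identifiers are illustrative only):
+ *
+ * unsigned long val;
+ * char *end;
+ * int err = ddi_strtoul("0x1f", &end, 0, &val);
+ *
+ * On success err == 0, val == 31, and end points past the final digit.
+ * A base of 0 auto-detects hexadecimal ("0x"), octal ("0"), or decimal
+ * input as implemented above.
+ */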
+
+int
+ddi_copyin(const void *from, void *to, size_t len, int flags)
+{
+ /* Fake ioctl() issued by kernel, 'from' is a kernel address */
+ if (flags & FKIOCTL) {
+ memcpy(to, from, len);
+ return (0);
+ }
+
+ return (copyin(from, to, len));
+}
+EXPORT_SYMBOL(ddi_copyin);
+
+int
+ddi_copyout(const void *from, void *to, size_t len, int flags)
+{
+ /* Fake ioctl() issued by kernel, 'to' is a kernel address */
+ if (flags & FKIOCTL) {
+ memcpy(to, from, len);
+ return (0);
+ }
+
+ return (copyout(from, to, len));
+}
+EXPORT_SYMBOL(ddi_copyout);
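+
+/*
+ * Hypothetical usage note: a caller that fabricates an ioctl entirely in
+ * kernel context passes FKIOCTL so the transfers above degenerate to a
+ * plain memcpy(); a regular ioctl issued from user space passes 0 in
+ * 'flags' and goes through copyin()/copyout() with full access checking.
+ */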
+
+static ssize_t
+spl_kernel_read(struct file *file, void *buf, size_t count, loff_t *pos)
+{
+#if defined(HAVE_KERNEL_READ_PPOS)
+ return (kernel_read(file, buf, count, pos));
+#else
+ mm_segment_t saved_fs;
+ ssize_t ret;
+
+ saved_fs = get_fs();
+ set_fs(KERNEL_DS);
+
+ ret = vfs_read(file, (void __user *)buf, count, pos);
+
+ set_fs(saved_fs);
+
+ return (ret);
+#endif
+}
+
+static int
+spl_getattr(struct file *filp, struct kstat *stat)
+{
+ int rc;
+
+ ASSERT(filp);
+ ASSERT(stat);
+
+#if defined(HAVE_4ARGS_VFS_GETATTR)
+ rc = vfs_getattr(&filp->f_path, stat, STATX_BASIC_STATS,
+ AT_STATX_SYNC_AS_STAT);
+#elif defined(HAVE_2ARGS_VFS_GETATTR)
+ rc = vfs_getattr(&filp->f_path, stat);
+#else
+ rc = vfs_getattr(filp->f_path.mnt, filp->f_dentry, stat);
+#endif
+ if (rc)
+ return (-rc);
+
+ return (0);
+}
+
+/*
+ * Read the unique system identifier from the /etc/hostid file.
+ *
+ * The behavior of /usr/bin/hostid on Linux systems with the
+ * regular eglibc and coreutils is:
+ *
+ * 1. Generate the value if the /etc/hostid file does not exist
+ * or if the /etc/hostid file is less than four bytes in size.
+ *
+ * 2. If the /etc/hostid file is at least 4 bytes, then return
+ * the first four bytes [0..3] in native endian order.
+ *
+ * 3. Always ignore bytes [4..] if they exist in the file.
+ *
+ * Only the first four bytes are significant, even on systems that
+ * have a 64-bit word size.
+ *
+ * See:
+ *
+ * eglibc: sysdeps/unix/sysv/linux/gethostid.c
+ * coreutils: src/hostid.c
+ *
+ * Notes:
+ *
+ * The /etc/hostid file on Solaris is a text file that often reads:
+ *
+ * # DO NOT EDIT
+ * "0123456789"
+ *
+ * Directly copying this file to Linux results in a constant
+ * hostid of 4f442023 because the default comment constitutes
+ * the first four bytes of the file.
+ *
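+ * For example, a binary /etc/hostid whose first four bytes are
+ * 0x12 0x34 0x56 0x78 yields a hostid of 0x78563412 on a little-endian
+ * system and 0x12345678 on a big-endian system, because the bytes are
+ * interpreted in native endian order.
+ *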
+ */
+
+char *spl_hostid_path = HW_HOSTID_PATH;
+module_param(spl_hostid_path, charp, 0444);
+MODULE_PARM_DESC(spl_hostid_path, "The system hostid file (/etc/hostid)");
+
+static int
+hostid_read(uint32_t *hostid)
+{
+ uint64_t size;
+ uint32_t value = 0;
+ int error;
+ loff_t off;
+ struct file *filp;
+ struct kstat stat;
+
+ filp = filp_open(spl_hostid_path, 0, 0);
+
+ if (IS_ERR(filp))
+ return (ENOENT);
+
+ error = spl_getattr(filp, &stat);
+ if (error) {
+ filp_close(filp, 0);
+ return (error);
+ }
+ size = stat.size;
+ if (size < sizeof (HW_HOSTID_MASK)) {
+ filp_close(filp, 0);
+ return (EINVAL);
+ }
+
+ off = 0;
+ /*
+ * Read directly into the variable like eglibc does.
+ * Short reads are okay; native behavior is preserved.
+ */
+ error = spl_kernel_read(filp, &value, sizeof (value), &off);
+ if (error < 0) {
+ filp_close(filp, 0);
+ return (EIO);
+ }
+
+ /* Mask down to 32 bits like coreutils does. */
+ *hostid = (value & HW_HOSTID_MASK);
+ filp_close(filp, 0);
+
+ return (0);
+}
+
+/*
+ * Return the system hostid. Preferentially use the spl_hostid module option
+ * when set, otherwise use the value in the /etc/hostid file.
+ */
+uint32_t
+zone_get_hostid(void *zone)
+{
+ uint32_t hostid;
+
+ ASSERT3P(zone, ==, NULL);
+
+ if (spl_hostid != 0)
+ return ((uint32_t)(spl_hostid & HW_HOSTID_MASK));
+
+ if (hostid_read(&hostid) == 0)
+ return (hostid);
+
+ return (0);
+}
+EXPORT_SYMBOL(zone_get_hostid);
+
+static int
+spl_kvmem_init(void)
+{
+ int rc = 0;
+
+ rc = spl_kmem_init();
+ if (rc)
+ return (rc);
+
+ rc = spl_vmem_init();
+ if (rc) {
+ spl_kmem_fini();
+ return (rc);
+ }
+
+ return (rc);
+}
+
+/*
+ * We initialize the random number generator with 128 bits of entropy from the
+ * system random number generator. In the improbable case that the seed is
+ * zero, we fall back to the system jiffies; if that is also zero, we use a
+ * preprogrammed seed. We step forward by 2^64 iterations to initialize each
+ * of the per-cpu seeds so that the sequences generated on each CPU are
+ * guaranteed to never overlap in practice.
+ */
+static void __init
+spl_random_init(void)
+{
+ uint64_t s[2];
+ int i = 0;
+
+ spl_pseudo_entropy = __alloc_percpu(2 * sizeof (uint64_t),
+ sizeof (uint64_t));
+
+ get_random_bytes(s, sizeof (s));
+
+ if (s[0] == 0 && s[1] == 0) {
+ if (jiffies != 0) {
+ s[0] = jiffies;
+ s[1] = ~0 - jiffies;
+ } else {
+ (void) memcpy(s, "improbable seed", sizeof (s));
+ }
+ printk("SPL: get_random_bytes() returned 0 "
+ "when generating random seed. Setting initial seed to "
+ "0x%016llx%016llx.\n", cpu_to_be64(s[0]),
+ cpu_to_be64(s[1]));
+ }
+
+ for_each_possible_cpu(i) {
+ uint64_t *wordp = per_cpu_ptr(spl_pseudo_entropy, i);
+
+ spl_rand_jump(s);
+
+ wordp[0] = s[0];
+ wordp[1] = s[1];
+ }
+}
+
+static void
+spl_random_fini(void)
+{
+ free_percpu(spl_pseudo_entropy);
+}
+
+static void
+spl_kvmem_fini(void)
+{
+ spl_vmem_fini();
+ spl_kmem_fini();
+}
+
+static int __init
+spl_init(void)
+{
+ int rc = 0;
+
+ bzero(&p0, sizeof (proc_t));
+ spl_random_init();
+
+ if ((rc = spl_kvmem_init()))
+ goto out1;
+
+ if ((rc = spl_tsd_init()))
+ goto out2;
+
+ if ((rc = spl_taskq_init()))
+ goto out3;
+
+ if ((rc = spl_kmem_cache_init()))
+ goto out4;
+
+ if ((rc = spl_proc_init()))
+ goto out5;
+
+ if ((rc = spl_kstat_init()))
+ goto out6;
+
+ if ((rc = spl_zlib_init()))
+ goto out7;
+
+ return (rc);
+
+out7:
+ spl_kstat_fini();
+out6:
+ spl_proc_fini();
+out5:
+ spl_kmem_cache_fini();
+out4:
+ spl_taskq_fini();
+out3:
+ spl_tsd_fini();
+out2:
+ spl_kvmem_fini();
+out1:
+ return (rc);
+}
+
+static void __exit
+spl_fini(void)
+{
+ spl_zlib_fini();
+ spl_kstat_fini();
+ spl_proc_fini();
+ spl_kmem_cache_fini();
+ spl_taskq_fini();
+ spl_tsd_fini();
+ spl_kvmem_fini();
+ spl_random_fini();
+}
+
+module_init(spl_init);
+module_exit(spl_fini);
+
+ZFS_MODULE_DESCRIPTION("Solaris Porting Layer");
+ZFS_MODULE_AUTHOR(ZFS_META_AUTHOR);
+ZFS_MODULE_LICENSE("GPL");
+ZFS_MODULE_VERSION(ZFS_META_VERSION "-" ZFS_META_RELEASE);
diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-kmem-cache.c b/sys/contrib/openzfs/module/os/linux/spl/spl-kmem-cache.c
new file mode 100644
index 000000000000..6b3d559ffc1c
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/spl/spl-kmem-cache.c
@@ -0,0 +1,1468 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/percpu_compat.h>
+#include <sys/kmem.h>
+#include <sys/kmem_cache.h>
+#include <sys/taskq.h>
+#include <sys/timer.h>
+#include <sys/vmem.h>
+#include <sys/wait.h>
+#include <linux/slab.h>
+#include <linux/swap.h>
+#include <linux/prefetch.h>
+
+/*
+ * Within the scope of this file the kmem_cache_* definitions
+ * are removed to allow access to the real Linux slab allocator.
+ */
+#undef kmem_cache_destroy
+#undef kmem_cache_create
+#undef kmem_cache_alloc
+#undef kmem_cache_free
+
+
+/*
+ * Linux 3.16 replaced smp_mb__{before,after}_{atomic,clear}_{dec,inc,bit}()
+ * with smp_mb__{before,after}_atomic() because they were redundant. This is
+ * only used inside our SLAB allocator, so we implement an internal wrapper
+ * here to give us smp_mb__{before,after}_atomic() on older kernels.
+ */
+#ifndef smp_mb__before_atomic
+#define smp_mb__before_atomic(x) smp_mb__before_clear_bit(x)
+#endif
+
+#ifndef smp_mb__after_atomic
+#define smp_mb__after_atomic(x) smp_mb__after_clear_bit(x)
+#endif
+
+/* BEGIN CSTYLED */
+
+/*
+ * Cache magazines are an optimization designed to minimize the cost of
+ * allocating memory. They do this by keeping a per-cpu cache of recently
+ * freed objects, which can then be reallocated without taking a lock. This
+ * can improve performance on highly contended caches. However, because
+ * objects in magazines will prevent otherwise empty slabs from being
+ * immediately released, this may not be ideal for low memory machines.
+ *
+ * For this reason spl_kmem_cache_magazine_size can be used to set a maximum
+ * magazine size. When this value is set to 0 the magazine size will be
+ * automatically determined based on the object size. Otherwise magazines
+ * will be limited to 2-256 objects per magazine (i.e. per CPU). Magazines
+ * may never be entirely disabled in this implementation.
+ */
+unsigned int spl_kmem_cache_magazine_size = 0;
+module_param(spl_kmem_cache_magazine_size, uint, 0444);
+MODULE_PARM_DESC(spl_kmem_cache_magazine_size,
+ "Default magazine size (2-256), set automatically (0)");
+
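+/*
+ * For example (illustrative), a memory constrained system could cap the
+ * per-cpu magazines at 8 objects via /etc/modprobe.d/spl.conf:
+ *
+ *     options spl spl_kmem_cache_magazine_size=8
+ */
+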
+/*
+ * The default behavior is to report the number of objects remaining in the
+ * cache. This allows the Linux VM to repeatedly reclaim objects from the
+ * cache when memory is low to satisfy other memory allocations. Alternatively,
+ * setting this value to KMC_RECLAIM_ONCE limits how aggressively the cache
+ * is reclaimed. This may increase the likelihood of out of memory events.
+ */
+unsigned int spl_kmem_cache_reclaim = 0 /* KMC_RECLAIM_ONCE */;
+module_param(spl_kmem_cache_reclaim, uint, 0644);
+MODULE_PARM_DESC(spl_kmem_cache_reclaim, "Single reclaim pass (0x1)");
+
+unsigned int spl_kmem_cache_obj_per_slab = SPL_KMEM_CACHE_OBJ_PER_SLAB;
+module_param(spl_kmem_cache_obj_per_slab, uint, 0644);
+MODULE_PARM_DESC(spl_kmem_cache_obj_per_slab, "Number of objects per slab");
+
+unsigned int spl_kmem_cache_max_size = SPL_KMEM_CACHE_MAX_SIZE;
+module_param(spl_kmem_cache_max_size, uint, 0644);
+MODULE_PARM_DESC(spl_kmem_cache_max_size, "Maximum size of slab in MB");
+
+/*
+ * For small objects the Linux slab allocator should be used to make the most
+ * efficient use of the memory. However, large objects are not supported by
+ * the Linux slab and therefore the SPL implementation is preferred. A cutoff
+ * of 16K was determined to be optimal for architectures using 4K pages.
+ */
+#if PAGE_SIZE == 4096
+unsigned int spl_kmem_cache_slab_limit = 16384;
+#else
+unsigned int spl_kmem_cache_slab_limit = 0;
+#endif
+module_param(spl_kmem_cache_slab_limit, uint, 0644);
+MODULE_PARM_DESC(spl_kmem_cache_slab_limit,
+ "Objects less than N bytes use the Linux slab");
+
+/*
+ * The number of threads available to allocate new slabs for caches. This
+ * should not need to be tuned but it is available for performance analysis.
+ */
+unsigned int spl_kmem_cache_kmem_threads = 4;
+module_param(spl_kmem_cache_kmem_threads, uint, 0444);
+MODULE_PARM_DESC(spl_kmem_cache_kmem_threads,
+ "Number of spl_kmem_cache threads");
+/* END CSTYLED */
+
+/*
+ * Slab allocation interfaces
+ *
+ * While the Linux slab implementation was inspired by the Solaris
+ * implementation I cannot use it to emulate the Solaris APIs. I
+ * require two features which are not provided by the Linux slab.
+ *
+ * 1) Constructors AND destructors. Recent versions of the Linux
+ * kernel have removed support for destructors. This is a deal
+ * breaker for the SPL which contains particularly expensive
+ * initializers for mutexes, condition variables, etc. We also
+ * require a minimal level of cleanup for these data types, unlike
+ * many Linux data types which do not need to be explicitly destroyed.
+ *
+ * 2) Virtual address space backed slab. Callers of the Solaris slab
+ * expect it to work well for both small and very large allocations.
+ * Because of memory fragmentation the Linux slab which is backed
+ * by kmalloc'ed memory performs very badly when confronted with
+ * large numbers of large allocations. Basing the slab on the
+ * virtual address space removes the need for contiguous pages
+ * and greatly improves performance for large allocations.
+ *
+ * For these reasons, the SPL has its own slab implementation with
+ * the needed features. It is not as highly optimized as either the
+ * Solaris or Linux slabs, but it should get me most of what is
+ * needed until it can be optimized or obsoleted by another approach.
+ *
+ * One serious concern I do have about this method is the relatively
+ * small virtual address space on 32bit arches. This will seriously
+ * constrain the size of the slab caches and their performance.
+ */
+
+struct list_head spl_kmem_cache_list; /* List of caches */
+struct rw_semaphore spl_kmem_cache_sem; /* Cache list lock */
+taskq_t *spl_kmem_cache_taskq; /* Task queue for aging / reclaim */
+
+static void spl_cache_shrink(spl_kmem_cache_t *skc, void *obj);
+
+static void *
+kv_alloc(spl_kmem_cache_t *skc, int size, int flags)
+{
+ gfp_t lflags = kmem_flags_convert(flags);
+ void *ptr;
+
+ ptr = spl_vmalloc(size, lflags | __GFP_HIGHMEM);
+
+ /* Resulting allocated memory will be page aligned */
+ ASSERT(IS_P2ALIGNED(ptr, PAGE_SIZE));
+
+ return (ptr);
+}
+
+static void
+kv_free(spl_kmem_cache_t *skc, void *ptr, int size)
+{
+ ASSERT(IS_P2ALIGNED(ptr, PAGE_SIZE));
+
+ /*
+ * The Linux direct reclaim path uses this out of band value to
+ * determine if forward progress is being made. Normally this is
+ * incremented by kmem_freepages() which is part of the various
+ * Linux slab implementations. However, since we are using none
+ * of that infrastructure we are responsible for incrementing it.
+ */
+ if (current->reclaim_state)
+ current->reclaim_state->reclaimed_slab += size >> PAGE_SHIFT;
+
+ vfree(ptr);
+}
+
+/*
+ * Required space for each aligned sks.
+ */
+static inline uint32_t
+spl_sks_size(spl_kmem_cache_t *skc)
+{
+ return (P2ROUNDUP_TYPED(sizeof (spl_kmem_slab_t),
+ skc->skc_obj_align, uint32_t));
+}
+
+/*
+ * Required space for each aligned object.
+ */
+static inline uint32_t
+spl_obj_size(spl_kmem_cache_t *skc)
+{
+ uint32_t align = skc->skc_obj_align;
+
+ return (P2ROUNDUP_TYPED(skc->skc_obj_size, align, uint32_t) +
+ P2ROUNDUP_TYPED(sizeof (spl_kmem_obj_t), align, uint32_t));
+}
+
+uint64_t
+spl_kmem_cache_inuse(kmem_cache_t *cache)
+{
+ return (cache->skc_obj_total);
+}
+EXPORT_SYMBOL(spl_kmem_cache_inuse);
+
+uint64_t
+spl_kmem_cache_entry_size(kmem_cache_t *cache)
+{
+ return (cache->skc_obj_size);
+}
+EXPORT_SYMBOL(spl_kmem_cache_entry_size);
+
+/*
+ * Look up the spl_kmem_obj_t for an object given a pointer to that object.
+ */
+static inline spl_kmem_obj_t *
+spl_sko_from_obj(spl_kmem_cache_t *skc, void *obj)
+{
+ return (obj + P2ROUNDUP_TYPED(skc->skc_obj_size,
+ skc->skc_obj_align, uint32_t));
+}
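+
+/*
+ * Worked example (illustrative values): with skc_obj_size = 100 and
+ * skc_obj_align = 32, spl_sko_from_obj() returns obj + 128, i.e. the
+ * spl_kmem_obj_t sits immediately after the object rounded up to its
+ * alignment, and spl_obj_size() adds a further aligned sizeof
+ * (spl_kmem_obj_t) to give the per-object stride within a slab.
+ */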
+
+/*
+ * It's important that we pack the spl_kmem_obj_t structure and the
+ * actual objects into one large address space to minimize the number
+ * of calls to the allocator. It is far better to do a few large
+ * allocations and then subdivide them ourselves. Now which allocator
+ * we use requires balancing a few trade-offs.
+ *
+ * For small objects we use kmem_alloc() because as long as you are
+ * only requesting a small number of pages (ideally just one) it's cheap.
+ * However, when you start requesting multiple pages with kmem_alloc()
+ * it gets increasingly expensive since it requires contiguous pages.
+ * For this reason we shift to vmem_alloc() for slabs of large objects
+ * which removes the need for contiguous pages. We do not use
+ * vmem_alloc() in all cases because there is significant locking
+ * overhead in __get_vm_area_node(). This function takes a single
+ * global lock when acquiring an available virtual address range which
+ * serializes all vmem_alloc()'s for all slab caches. Using slightly
+ * different allocation functions for small and large objects should
+ * give us the best of both worlds.
+ *
+ * +------------------------+
+ * | spl_kmem_slab_t --+-+ |
+ * | skc_obj_size <-+ | |
+ * | spl_kmem_obj_t | |
+ * | skc_obj_size <---+ |
+ * | spl_kmem_obj_t | |
+ * | ... v |
+ * +------------------------+
+ */
+static spl_kmem_slab_t *
+spl_slab_alloc(spl_kmem_cache_t *skc, int flags)
+{
+ spl_kmem_slab_t *sks;
+ void *base;
+ uint32_t obj_size;
+
+ base = kv_alloc(skc, skc->skc_slab_size, flags);
+ if (base == NULL)
+ return (NULL);
+
+ sks = (spl_kmem_slab_t *)base;
+ sks->sks_magic = SKS_MAGIC;
+ sks->sks_objs = skc->skc_slab_objs;
+ sks->sks_age = jiffies;
+ sks->sks_cache = skc;
+ INIT_LIST_HEAD(&sks->sks_list);
+ INIT_LIST_HEAD(&sks->sks_free_list);
+ sks->sks_ref = 0;
+ obj_size = spl_obj_size(skc);
+
+ for (int i = 0; i < sks->sks_objs; i++) {
+ void *obj = base + spl_sks_size(skc) + (i * obj_size);
+
+ ASSERT(IS_P2ALIGNED(obj, skc->skc_obj_align));
+ spl_kmem_obj_t *sko = spl_sko_from_obj(skc, obj);
+ sko->sko_addr = obj;
+ sko->sko_magic = SKO_MAGIC;
+ sko->sko_slab = sks;
+ INIT_LIST_HEAD(&sko->sko_list);
+ list_add_tail(&sko->sko_list, &sks->sks_free_list);
+ }
+
+ return (sks);
+}
+
+/*
+ * Remove a slab from the complete or partial list; it must be called with
+ * the 'skc->skc_lock' held but the actual free must be performed
+ * outside the lock to prevent deadlocking on vmem addresses.
+ */
+static void
+spl_slab_free(spl_kmem_slab_t *sks,
+ struct list_head *sks_list, struct list_head *sko_list)
+{
+ spl_kmem_cache_t *skc;
+
+ ASSERT(sks->sks_magic == SKS_MAGIC);
+ ASSERT(sks->sks_ref == 0);
+
+ skc = sks->sks_cache;
+ ASSERT(skc->skc_magic == SKC_MAGIC);
+
+ /*
+ * Update slab/objects counters in the cache, then remove the
+ * slab from the skc->skc_partial_list. Finally add the slab
+ * and all its objects in to the private work lists where the
+ * destructors will be called and the memory freed to the system.
+ */
+ skc->skc_obj_total -= sks->sks_objs;
+ skc->skc_slab_total--;
+ list_del(&sks->sks_list);
+ list_add(&sks->sks_list, sks_list);
+ list_splice_init(&sks->sks_free_list, sko_list);
+}
+
+/*
+ * Reclaim empty slabs at the end of the partial list.
+ */
+static void
+spl_slab_reclaim(spl_kmem_cache_t *skc)
+{
+ spl_kmem_slab_t *sks = NULL, *m = NULL;
+ spl_kmem_obj_t *sko = NULL, *n = NULL;
+ LIST_HEAD(sks_list);
+ LIST_HEAD(sko_list);
+
+ /*
+ * Empty slabs and objects must be moved to a private list so they
+ * can be safely freed outside the spin lock. All empty slabs are
+ * at the end of skc->skc_partial_list, therefore once a non-empty
+ * slab is found we can stop scanning.
+ */
+ spin_lock(&skc->skc_lock);
+ list_for_each_entry_safe_reverse(sks, m,
+ &skc->skc_partial_list, sks_list) {
+
+ if (sks->sks_ref > 0)
+ break;
+
+ spl_slab_free(sks, &sks_list, &sko_list);
+ }
+ spin_unlock(&skc->skc_lock);
+
+ /*
+ * The following two loops ensure all the object destructors are run,
+ * and the slabs themselves are freed. This is all done outside the
+ * skc->skc_lock since this allows the destructor to sleep, and
+ * allows us to perform a conditional reschedule when freeing a
+ * large number of objects and slabs back to the system.
+ */
+
+ list_for_each_entry_safe(sko, n, &sko_list, sko_list) {
+ ASSERT(sko->sko_magic == SKO_MAGIC);
+ }
+
+ list_for_each_entry_safe(sks, m, &sks_list, sks_list) {
+ ASSERT(sks->sks_magic == SKS_MAGIC);
+ kv_free(skc, sks, skc->skc_slab_size);
+ }
+}
+
+static spl_kmem_emergency_t *
+spl_emergency_search(struct rb_root *root, void *obj)
+{
+ struct rb_node *node = root->rb_node;
+ spl_kmem_emergency_t *ske;
+ unsigned long address = (unsigned long)obj;
+
+ while (node) {
+ ske = container_of(node, spl_kmem_emergency_t, ske_node);
+
+ if (address < ske->ske_obj)
+ node = node->rb_left;
+ else if (address > ske->ske_obj)
+ node = node->rb_right;
+ else
+ return (ske);
+ }
+
+ return (NULL);
+}
+
+static int
+spl_emergency_insert(struct rb_root *root, spl_kmem_emergency_t *ske)
+{
+ struct rb_node **new = &(root->rb_node), *parent = NULL;
+ spl_kmem_emergency_t *ske_tmp;
+ unsigned long address = ske->ske_obj;
+
+ while (*new) {
+ ske_tmp = container_of(*new, spl_kmem_emergency_t, ske_node);
+
+ parent = *new;
+ if (address < ske_tmp->ske_obj)
+ new = &((*new)->rb_left);
+ else if (address > ske_tmp->ske_obj)
+ new = &((*new)->rb_right);
+ else
+ return (0);
+ }
+
+ rb_link_node(&ske->ske_node, parent, new);
+ rb_insert_color(&ske->ske_node, root);
+
+ return (1);
+}
+
+/*
+ * Allocate a single emergency object and track it in a red black tree.
+ */
+static int
+spl_emergency_alloc(spl_kmem_cache_t *skc, int flags, void **obj)
+{
+ gfp_t lflags = kmem_flags_convert(flags);
+ spl_kmem_emergency_t *ske;
+ int order = get_order(skc->skc_obj_size);
+ int empty;
+
+ /* Last chance: use a partial slab if one now exists */
+ spin_lock(&skc->skc_lock);
+ empty = list_empty(&skc->skc_partial_list);
+ spin_unlock(&skc->skc_lock);
+ if (!empty)
+ return (-EEXIST);
+
+ ske = kmalloc(sizeof (*ske), lflags);
+ if (ske == NULL)
+ return (-ENOMEM);
+
+ ske->ske_obj = __get_free_pages(lflags, order);
+ if (ske->ske_obj == 0) {
+ kfree(ske);
+ return (-ENOMEM);
+ }
+
+ spin_lock(&skc->skc_lock);
+ empty = spl_emergency_insert(&skc->skc_emergency_tree, ske);
+ if (likely(empty)) {
+ skc->skc_obj_total++;
+ skc->skc_obj_emergency++;
+ if (skc->skc_obj_emergency > skc->skc_obj_emergency_max)
+ skc->skc_obj_emergency_max = skc->skc_obj_emergency;
+ }
+ spin_unlock(&skc->skc_lock);
+
+ if (unlikely(!empty)) {
+ free_pages(ske->ske_obj, order);
+ kfree(ske);
+ return (-EINVAL);
+ }
+
+ *obj = (void *)ske->ske_obj;
+
+ return (0);
+}
+
+/*
+ * Locate the passed object in the red black tree and free it.
+ */
+static int
+spl_emergency_free(spl_kmem_cache_t *skc, void *obj)
+{
+ spl_kmem_emergency_t *ske;
+ int order = get_order(skc->skc_obj_size);
+
+ spin_lock(&skc->skc_lock);
+ ske = spl_emergency_search(&skc->skc_emergency_tree, obj);
+ if (ske) {
+ rb_erase(&ske->ske_node, &skc->skc_emergency_tree);
+ skc->skc_obj_emergency--;
+ skc->skc_obj_total--;
+ }
+ spin_unlock(&skc->skc_lock);
+
+ if (ske == NULL)
+ return (-ENOENT);
+
+ free_pages(ske->ske_obj, order);
+ kfree(ske);
+
+ return (0);
+}
+
+/*
+ * Release objects from the per-cpu magazine back to their slab. The flush
+ * argument contains the max number of entries to remove from the magazine.
+ */
+static void
+spl_cache_flush(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flush)
+{
+ spin_lock(&skc->skc_lock);
+
+ ASSERT(skc->skc_magic == SKC_MAGIC);
+ ASSERT(skm->skm_magic == SKM_MAGIC);
+
+ int count = MIN(flush, skm->skm_avail);
+ for (int i = 0; i < count; i++)
+ spl_cache_shrink(skc, skm->skm_objs[i]);
+
+ skm->skm_avail -= count;
+ memmove(skm->skm_objs, &(skm->skm_objs[count]),
+ sizeof (void *) * skm->skm_avail);
+
+ spin_unlock(&skc->skc_lock);
+}
+
+/*
+ * Size a slab based on the size of each aligned object plus spl_kmem_obj_t.
+ * When on-slab we want to target spl_kmem_cache_obj_per_slab. However,
+ * for very small objects we may end up with more than this so as not
+ * to waste space in the minimal allocation of a single page. Also for
+ * very large objects we may end up with only a few objects per slab;
+ * if not even a single object fits, the sizing fails.
+ */
+static int
+spl_slab_size(spl_kmem_cache_t *skc, uint32_t *objs, uint32_t *size)
+{
+ uint32_t sks_size, obj_size, max_size, tgt_size, tgt_objs;
+
+ sks_size = spl_sks_size(skc);
+ obj_size = spl_obj_size(skc);
+ max_size = (spl_kmem_cache_max_size * 1024 * 1024);
+ tgt_size = (spl_kmem_cache_obj_per_slab * obj_size + sks_size);
+
+ if (tgt_size <= max_size) {
+ tgt_objs = (tgt_size - sks_size) / obj_size;
+ } else {
+ tgt_objs = (max_size - sks_size) / obj_size;
+ tgt_size = (tgt_objs * obj_size) + sks_size;
+ }
+
+ if (tgt_objs == 0)
+ return (-ENOSPC);
+
+ *objs = tgt_objs;
+ *size = tgt_size;
+
+ return (0);
+}
+
+/*
+ * Make a guess at a reasonable per-cpu magazine size based on the size of
+ * each object and the cost of caching N of them in each magazine. Long
+ * term this should really adapt based on an observed usage heuristic.
+ */
+static int
+spl_magazine_size(spl_kmem_cache_t *skc)
+{
+ uint32_t obj_size = spl_obj_size(skc);
+ int size;
+
+ if (spl_kmem_cache_magazine_size > 0)
+ return (MAX(MIN(spl_kmem_cache_magazine_size, 256), 2));
+
+ /* Per-magazine sizes below assume a 4 KiB page size */
+ if (obj_size > (PAGE_SIZE * 256))
+ size = 4; /* Minimum 4 MiB per-magazine */
+ else if (obj_size > (PAGE_SIZE * 32))
+ size = 16; /* Minimum 2 MiB per-magazine */
+ else if (obj_size > (PAGE_SIZE))
+ size = 64; /* Minimum 256 KiB per-magazine */
+ else if (obj_size > (PAGE_SIZE / 4))
+ size = 128; /* Minimum 128 KiB per-magazine */
+ else
+ size = 256;
+
+ return (size);
+}
+
+/*
+ * Allocate a per-cpu magazine to associate with a specific core.
+ */
+static spl_kmem_magazine_t *
+spl_magazine_alloc(spl_kmem_cache_t *skc, int cpu)
+{
+ spl_kmem_magazine_t *skm;
+ int size = sizeof (spl_kmem_magazine_t) +
+ sizeof (void *) * skc->skc_mag_size;
+
+ skm = kmalloc_node(size, GFP_KERNEL, cpu_to_node(cpu));
+ if (skm) {
+ skm->skm_magic = SKM_MAGIC;
+ skm->skm_avail = 0;
+ skm->skm_size = skc->skc_mag_size;
+ skm->skm_refill = skc->skc_mag_refill;
+ skm->skm_cache = skc;
+ skm->skm_cpu = cpu;
+ }
+
+ return (skm);
+}
+
+/*
+ * Free a per-cpu magazine associated with a specific core.
+ */
+static void
+spl_magazine_free(spl_kmem_magazine_t *skm)
+{
+ ASSERT(skm->skm_magic == SKM_MAGIC);
+ ASSERT(skm->skm_avail == 0);
+ kfree(skm);
+}
+
+/*
+ * Create all per-cpu magazines of reasonable sizes.
+ */
+static int
+spl_magazine_create(spl_kmem_cache_t *skc)
+{
+ int i = 0;
+
+ ASSERT((skc->skc_flags & KMC_SLAB) == 0);
+
+ skc->skc_mag = kzalloc(sizeof (spl_kmem_magazine_t *) *
+ num_possible_cpus(), kmem_flags_convert(KM_SLEEP));
+ skc->skc_mag_size = spl_magazine_size(skc);
+ skc->skc_mag_refill = (skc->skc_mag_size + 1) / 2;
+
+ for_each_possible_cpu(i) {
+ skc->skc_mag[i] = spl_magazine_alloc(skc, i);
+ if (!skc->skc_mag[i]) {
+ for (i--; i >= 0; i--)
+ spl_magazine_free(skc->skc_mag[i]);
+
+ kfree(skc->skc_mag);
+ return (-ENOMEM);
+ }
+ }
+
+ return (0);
+}
+
+/*
+ * Destroy all per-cpu magazines.
+ */
+static void
+spl_magazine_destroy(spl_kmem_cache_t *skc)
+{
+ spl_kmem_magazine_t *skm;
+ int i = 0;
+
+ ASSERT((skc->skc_flags & KMC_SLAB) == 0);
+
+ for_each_possible_cpu(i) {
+ skm = skc->skc_mag[i];
+ spl_cache_flush(skc, skm, skm->skm_avail);
+ spl_magazine_free(skm);
+ }
+
+ kfree(skc->skc_mag);
+}
+
+/*
+ * Create an object cache based on the following arguments:
+ * name cache name
+ * size cache object size
+ * align cache object alignment
+ * ctor cache object constructor
+ * dtor cache object destructor
+ * reclaim cache object reclaim
+ * priv cache private data for ctor/dtor/reclaim
+ * vmp unused, must be NULL
+ * flags
+ * KMC_KVMEM Force kvmem backed SPL cache
+ * KMC_SLAB Force Linux slab backed cache
+ * KMC_NODEBUG Disable debugging (unsupported)
+ */
+spl_kmem_cache_t *
+spl_kmem_cache_create(char *name, size_t size, size_t align,
+ spl_kmem_ctor_t ctor, spl_kmem_dtor_t dtor, void *reclaim,
+ void *priv, void *vmp, int flags)
+{
+ gfp_t lflags = kmem_flags_convert(KM_SLEEP);
+ spl_kmem_cache_t *skc;
+ int rc;
+
+ /*
+ * Unsupported flags
+ */
+ ASSERT(vmp == NULL);
+ ASSERT(reclaim == NULL);
+
+ might_sleep();
+
+ skc = kzalloc(sizeof (*skc), lflags);
+ if (skc == NULL)
+ return (NULL);
+
+ skc->skc_magic = SKC_MAGIC;
+ skc->skc_name_size = strlen(name) + 1;
+ skc->skc_name = (char *)kmalloc(skc->skc_name_size, lflags);
+ if (skc->skc_name == NULL) {
+ kfree(skc);
+ return (NULL);
+ }
+ strncpy(skc->skc_name, name, skc->skc_name_size);
+
+ skc->skc_ctor = ctor;
+ skc->skc_dtor = dtor;
+ skc->skc_private = priv;
+ skc->skc_vmp = vmp;
+ skc->skc_linux_cache = NULL;
+ skc->skc_flags = flags;
+ skc->skc_obj_size = size;
+ skc->skc_obj_align = SPL_KMEM_CACHE_ALIGN;
+ atomic_set(&skc->skc_ref, 0);
+
+ INIT_LIST_HEAD(&skc->skc_list);
+ INIT_LIST_HEAD(&skc->skc_complete_list);
+ INIT_LIST_HEAD(&skc->skc_partial_list);
+ skc->skc_emergency_tree = RB_ROOT;
+ spin_lock_init(&skc->skc_lock);
+ init_waitqueue_head(&skc->skc_waitq);
+ skc->skc_slab_fail = 0;
+ skc->skc_slab_create = 0;
+ skc->skc_slab_destroy = 0;
+ skc->skc_slab_total = 0;
+ skc->skc_slab_alloc = 0;
+ skc->skc_slab_max = 0;
+ skc->skc_obj_total = 0;
+ skc->skc_obj_alloc = 0;
+ skc->skc_obj_max = 0;
+ skc->skc_obj_deadlock = 0;
+ skc->skc_obj_emergency = 0;
+ skc->skc_obj_emergency_max = 0;
+
+ rc = percpu_counter_init_common(&skc->skc_linux_alloc, 0,
+ GFP_KERNEL);
+ if (rc != 0) {
+ kfree(skc->skc_name);
+ kfree(skc);
+ return (NULL);
+ }
+
+ /*
+ * Verify the requested alignment restriction is sane.
+ */
+ if (align) {
+ VERIFY(ISP2(align));
+ VERIFY3U(align, >=, SPL_KMEM_CACHE_ALIGN);
+ VERIFY3U(align, <=, PAGE_SIZE);
+ skc->skc_obj_align = align;
+ }
+
+ /*
+ * When no specific type of slab is requested (kmem, vmem, or
+ * linuxslab) then select a cache type based on the object size
+ * and default tunables.
+ */
+ if (!(skc->skc_flags & (KMC_SLAB | KMC_KVMEM))) {
+ if (spl_kmem_cache_slab_limit &&
+ size <= (size_t)spl_kmem_cache_slab_limit) {
+ /*
+ * Objects smaller than spl_kmem_cache_slab_limit can
+ * use the Linux slab for better space-efficiency.
+ */
+ skc->skc_flags |= KMC_SLAB;
+ } else {
+ /*
+ * All other objects are considered large and are
+ * placed on kvmem backed slabs.
+ */
+ skc->skc_flags |= KMC_KVMEM;
+ }
+ }
+
+ /*
+ * Given the type of slab allocate the required resources.
+ */
+ if (skc->skc_flags & KMC_KVMEM) {
+ rc = spl_slab_size(skc,
+ &skc->skc_slab_objs, &skc->skc_slab_size);
+ if (rc)
+ goto out;
+
+ rc = spl_magazine_create(skc);
+ if (rc)
+ goto out;
+ } else {
+ unsigned long slabflags = 0;
+
+ if (size > (SPL_MAX_KMEM_ORDER_NR_PAGES * PAGE_SIZE)) {
+ rc = EINVAL;
+ goto out;
+ }
+
+#if defined(SLAB_USERCOPY)
+ /*
+ * Required for PAX-enabled kernels if the slab is to be
+ * used for copying between user and kernel space.
+ */
+ slabflags |= SLAB_USERCOPY;
+#endif
+
+#if defined(HAVE_KMEM_CACHE_CREATE_USERCOPY)
+ /*
+ * Newer grsec patchset uses kmem_cache_create_usercopy()
+ * instead of SLAB_USERCOPY flag
+ */
+ skc->skc_linux_cache = kmem_cache_create_usercopy(
+ skc->skc_name, size, align, slabflags, 0, size, NULL);
+#else
+ skc->skc_linux_cache = kmem_cache_create(
+ skc->skc_name, size, align, slabflags, NULL);
+#endif
+ if (skc->skc_linux_cache == NULL) {
+ rc = ENOMEM;
+ goto out;
+ }
+ }
+
+ down_write(&spl_kmem_cache_sem);
+ list_add_tail(&skc->skc_list, &spl_kmem_cache_list);
+ up_write(&spl_kmem_cache_sem);
+
+ return (skc);
+out:
+ kfree(skc->skc_name);
+ percpu_counter_destroy(&skc->skc_linux_alloc);
+ kfree(skc);
+ return (NULL);
+}
+EXPORT_SYMBOL(spl_kmem_cache_create);
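+
+/*
+ * Minimal usage sketch (hypothetical consumer; my_obj_t and the callbacks
+ * are illustrative, with prototypes matching how skc_ctor/skc_dtor are
+ * invoked in this file):
+ *
+ * static int my_ctor(void *obj, void *priv, int kmflags)
+ * { memset(obj, 0, sizeof (my_obj_t)); return (0); }
+ * static void my_dtor(void *obj, void *priv) { }
+ *
+ * spl_kmem_cache_t *skc = spl_kmem_cache_create("my_cache",
+ *     sizeof (my_obj_t), 0, my_ctor, my_dtor, NULL, NULL, NULL, KMC_KVMEM);
+ * my_obj_t *obj = spl_kmem_cache_alloc(skc, KM_SLEEP);
+ * ...
+ * spl_kmem_cache_free(skc, obj);
+ * spl_kmem_cache_destroy(skc);
+ */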
+
+/*
+ * Register a move callback for cache defragmentation.
+ * XXX: Unimplemented but harmless to stub out for now.
+ */
+void
+spl_kmem_cache_set_move(spl_kmem_cache_t *skc,
+ kmem_cbrc_t (move)(void *, void *, size_t, void *))
+{
+ ASSERT(move != NULL);
+}
+EXPORT_SYMBOL(spl_kmem_cache_set_move);
+
+/*
+ * Destroy a cache and all objects associated with the cache.
+ */
+void
+spl_kmem_cache_destroy(spl_kmem_cache_t *skc)
+{
+ DECLARE_WAIT_QUEUE_HEAD(wq);
+ taskqid_t id;
+
+ ASSERT(skc->skc_magic == SKC_MAGIC);
+ ASSERT(skc->skc_flags & (KMC_KVMEM | KMC_SLAB));
+
+ down_write(&spl_kmem_cache_sem);
+ list_del_init(&skc->skc_list);
+ up_write(&spl_kmem_cache_sem);
+
+ /* Cancel and wait for any pending delayed tasks */
+ VERIFY(!test_and_set_bit(KMC_BIT_DESTROY, &skc->skc_flags));
+
+ spin_lock(&skc->skc_lock);
+ id = skc->skc_taskqid;
+ spin_unlock(&skc->skc_lock);
+
+ taskq_cancel_id(spl_kmem_cache_taskq, id);
+
+ /*
+ * Wait until all current callers complete, this is mainly
+ * to catch the case where a low memory situation triggers a
+ * cache reaping action which races with this destroy.
+ */
+ wait_event(wq, atomic_read(&skc->skc_ref) == 0);
+
+ if (skc->skc_flags & KMC_KVMEM) {
+ spl_magazine_destroy(skc);
+ spl_slab_reclaim(skc);
+ } else {
+ ASSERT(skc->skc_flags & KMC_SLAB);
+ kmem_cache_destroy(skc->skc_linux_cache);
+ }
+
+ spin_lock(&skc->skc_lock);
+
+ /*
+ * Validate there are no objects in use and free all the
+ * spl_kmem_slab_t, spl_kmem_obj_t, and object buffers.
+ */
+ ASSERT3U(skc->skc_slab_alloc, ==, 0);
+ ASSERT3U(skc->skc_obj_alloc, ==, 0);
+ ASSERT3U(skc->skc_slab_total, ==, 0);
+ ASSERT3U(skc->skc_obj_total, ==, 0);
+ ASSERT3U(skc->skc_obj_emergency, ==, 0);
+ ASSERT(list_empty(&skc->skc_complete_list));
+
+ ASSERT3U(percpu_counter_sum(&skc->skc_linux_alloc), ==, 0);
+ percpu_counter_destroy(&skc->skc_linux_alloc);
+
+ spin_unlock(&skc->skc_lock);
+
+ kfree(skc->skc_name);
+ kfree(skc);
+}
+EXPORT_SYMBOL(spl_kmem_cache_destroy);
+
+/*
+ * Allocate an object from a slab attached to the cache. This is used to
+ * repopulate the per-cpu magazine caches in batches when they run low.
+ */
+static void *
+spl_cache_obj(spl_kmem_cache_t *skc, spl_kmem_slab_t *sks)
+{
+ spl_kmem_obj_t *sko;
+
+ ASSERT(skc->skc_magic == SKC_MAGIC);
+ ASSERT(sks->sks_magic == SKS_MAGIC);
+
+ sko = list_entry(sks->sks_free_list.next, spl_kmem_obj_t, sko_list);
+ ASSERT(sko->sko_magic == SKO_MAGIC);
+ ASSERT(sko->sko_addr != NULL);
+
+ /* Remove from sks_free_list */
+ list_del_init(&sko->sko_list);
+
+ sks->sks_age = jiffies;
+ sks->sks_ref++;
+ skc->skc_obj_alloc++;
+
+ /* Track max obj usage statistics */
+ if (skc->skc_obj_alloc > skc->skc_obj_max)
+ skc->skc_obj_max = skc->skc_obj_alloc;
+
+ /* Track max slab usage statistics */
+ if (sks->sks_ref == 1) {
+ skc->skc_slab_alloc++;
+
+ if (skc->skc_slab_alloc > skc->skc_slab_max)
+ skc->skc_slab_max = skc->skc_slab_alloc;
+ }
+
+ return (sko->sko_addr);
+}
+
+/*
+ * Generic slab allocation function to be run by the global work queues.
+ * It is responsible for allocating a new slab, linking it in to the list
+ * of partial slabs, and then waking any waiters.
+ */
+static int
+__spl_cache_grow(spl_kmem_cache_t *skc, int flags)
+{
+ spl_kmem_slab_t *sks;
+
+ fstrans_cookie_t cookie = spl_fstrans_mark();
+ sks = spl_slab_alloc(skc, flags);
+ spl_fstrans_unmark(cookie);
+
+ spin_lock(&skc->skc_lock);
+ if (sks) {
+ skc->skc_slab_total++;
+ skc->skc_obj_total += sks->sks_objs;
+ list_add_tail(&sks->sks_list, &skc->skc_partial_list);
+
+ smp_mb__before_atomic();
+ clear_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags);
+ smp_mb__after_atomic();
+ }
+ spin_unlock(&skc->skc_lock);
+
+ return (sks == NULL ? -ENOMEM : 0);
+}
+
+static void
+spl_cache_grow_work(void *data)
+{
+ spl_kmem_alloc_t *ska = (spl_kmem_alloc_t *)data;
+ spl_kmem_cache_t *skc = ska->ska_cache;
+
+ int error = __spl_cache_grow(skc, ska->ska_flags);
+
+ atomic_dec(&skc->skc_ref);
+ smp_mb__before_atomic();
+ clear_bit(KMC_BIT_GROWING, &skc->skc_flags);
+ smp_mb__after_atomic();
+ if (error == 0)
+ wake_up_all(&skc->skc_waitq);
+
+ kfree(ska);
+}
+
+/*
+ * Returns non-zero when a new slab should be available.
+ */
+static int
+spl_cache_grow_wait(spl_kmem_cache_t *skc)
+{
+ return (!test_bit(KMC_BIT_GROWING, &skc->skc_flags));
+}
+
+/*
+ * No available objects on any slabs, create a new slab. Note that this
+ * functionality is disabled for KMC_SLAB caches which are backed by the
+ * Linux slab.
+ */
+static int
+spl_cache_grow(spl_kmem_cache_t *skc, int flags, void **obj)
+{
+ int remaining, rc = 0;
+
+ ASSERT0(flags & ~KM_PUBLIC_MASK);
+ ASSERT(skc->skc_magic == SKC_MAGIC);
+ ASSERT((skc->skc_flags & KMC_SLAB) == 0);
+ might_sleep();
+ *obj = NULL;
+
+ /*
+ * Before allocating a new slab wait for any reaping to complete and
+ * then return so the local magazine can be rechecked for new objects.
+ */
+ if (test_bit(KMC_BIT_REAPING, &skc->skc_flags)) {
+ rc = spl_wait_on_bit(&skc->skc_flags, KMC_BIT_REAPING,
+ TASK_UNINTERRUPTIBLE);
+ return (rc ? rc : -EAGAIN);
+ }
+
+ /*
+ * Note: It would be nice to reduce the overhead of context switch
+ * and improve NUMA locality, by trying to allocate a new slab in the
+ * current process context with KM_NOSLEEP flag.
+ *
+ * However, this can't be applied to vmem/kvmem due to a bug that
+ * spl_vmalloc() doesn't honor gfp flags in page table allocation.
+ */
+
+ /*
+ * This is handled by dispatching a work request to the global work
+ * queue. This allows us to asynchronously allocate a new slab while
+ * retaining the ability to safely fall back to smaller synchronous
+ * allocations to ensure forward progress is always maintained.
+ */
+ if (test_and_set_bit(KMC_BIT_GROWING, &skc->skc_flags) == 0) {
+ spl_kmem_alloc_t *ska;
+
+ ska = kmalloc(sizeof (*ska), kmem_flags_convert(flags));
+ if (ska == NULL) {
+ clear_bit_unlock(KMC_BIT_GROWING, &skc->skc_flags);
+ smp_mb__after_atomic();
+ wake_up_all(&skc->skc_waitq);
+ return (-ENOMEM);
+ }
+
+ atomic_inc(&skc->skc_ref);
+ ska->ska_cache = skc;
+ ska->ska_flags = flags;
+ taskq_init_ent(&ska->ska_tqe);
+ taskq_dispatch_ent(spl_kmem_cache_taskq,
+ spl_cache_grow_work, ska, 0, &ska->ska_tqe);
+ }
+
+ /*
+ * The goal here is to only detect the rare case where a virtual slab
+ * allocation has deadlocked. We must be careful to minimize the use
+ * of emergency objects which are more expensive to track. Therefore,
+ * we set a very long timeout for the asynchronous allocation and if
+ * the timeout is reached the cache is flagged as deadlocked. From
+ * this point only new emergency objects will be allocated until the
+ * asynchronous allocation completes and clears the deadlocked flag.
+ */
+ if (test_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags)) {
+ rc = spl_emergency_alloc(skc, flags, obj);
+ } else {
+ remaining = wait_event_timeout(skc->skc_waitq,
+ spl_cache_grow_wait(skc), HZ / 10);
+
+ if (!remaining) {
+ spin_lock(&skc->skc_lock);
+ if (test_bit(KMC_BIT_GROWING, &skc->skc_flags)) {
+ set_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags);
+ skc->skc_obj_deadlock++;
+ }
+ spin_unlock(&skc->skc_lock);
+ }
+
+ rc = -ENOMEM;
+ }
+
+ return (rc);
+}
+
+/*
+ * Refill a per-cpu magazine with objects from the slabs for this cache.
+ * Ideally the magazine can be repopulated using existing objects which have
+ * been released, however if we are unable to locate enough free objects new
+ * slabs of objects will be created. On success NULL is returned, otherwise
+ * the address of a single emergency object is returned for use by the caller.
+ */
+static void *
+spl_cache_refill(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flags)
+{
+ spl_kmem_slab_t *sks;
+ int count = 0, rc, refill;
+ void *obj = NULL;
+
+ ASSERT(skc->skc_magic == SKC_MAGIC);
+ ASSERT(skm->skm_magic == SKM_MAGIC);
+
+ refill = MIN(skm->skm_refill, skm->skm_size - skm->skm_avail);
+ spin_lock(&skc->skc_lock);
+
+ while (refill > 0) {
+ /* No slabs available we may need to grow the cache */
+ if (list_empty(&skc->skc_partial_list)) {
+ spin_unlock(&skc->skc_lock);
+
+ local_irq_enable();
+ rc = spl_cache_grow(skc, flags, &obj);
+ local_irq_disable();
+
+ /* Emergency object for immediate use by caller */
+ if (rc == 0 && obj != NULL)
+ return (obj);
+
+ if (rc)
+ goto out;
+
+ /* Rescheduled to a different CPU, skm is not local */
+ if (skm != skc->skc_mag[smp_processor_id()])
+ goto out;
+
+ /*
+ * Potentially rescheduled to the same CPU but
+ * allocations may have occurred from this CPU while
+ * we were sleeping so recalculate max refill.
+ */
+ refill = MIN(refill, skm->skm_size - skm->skm_avail);
+
+ spin_lock(&skc->skc_lock);
+ continue;
+ }
+
+ /* Grab the next available slab */
+ sks = list_entry((&skc->skc_partial_list)->next,
+ spl_kmem_slab_t, sks_list);
+ ASSERT(sks->sks_magic == SKS_MAGIC);
+ ASSERT(sks->sks_ref < sks->sks_objs);
+ ASSERT(!list_empty(&sks->sks_free_list));
+
+ /*
+ * Consume as many objects as needed to refill the requested
+ * cache. We must also be careful not to overfill it.
+ */
+ while (sks->sks_ref < sks->sks_objs && refill-- > 0 &&
+ ++count) {
+ ASSERT(skm->skm_avail < skm->skm_size);
+ ASSERT(count < skm->skm_size);
+ skm->skm_objs[skm->skm_avail++] =
+ spl_cache_obj(skc, sks);
+ }
+
+ /* Move slab to skc_complete_list when full */
+ if (sks->sks_ref == sks->sks_objs) {
+ list_del(&sks->sks_list);
+ list_add(&sks->sks_list, &skc->skc_complete_list);
+ }
+ }
+
+ spin_unlock(&skc->skc_lock);
+out:
+ return (NULL);
+}
+
+/*
+ * Release an object back to the slab from which it came.
+ */
+static void
+spl_cache_shrink(spl_kmem_cache_t *skc, void *obj)
+{
+ spl_kmem_slab_t *sks = NULL;
+ spl_kmem_obj_t *sko = NULL;
+
+ ASSERT(skc->skc_magic == SKC_MAGIC);
+
+ sko = spl_sko_from_obj(skc, obj);
+ ASSERT(sko->sko_magic == SKO_MAGIC);
+ sks = sko->sko_slab;
+ ASSERT(sks->sks_magic == SKS_MAGIC);
+ ASSERT(sks->sks_cache == skc);
+ list_add(&sko->sko_list, &sks->sks_free_list);
+
+ sks->sks_age = jiffies;
+ sks->sks_ref--;
+ skc->skc_obj_alloc--;
+
+ /*
+ * Move slab to skc_partial_list when no longer full. Slabs
+ * are added to the head to keep the partial list in quasi-full
+ * sorted order. Fuller at the head, emptier at the tail.
+ */
+ if (sks->sks_ref == (sks->sks_objs - 1)) {
+ list_del(&sks->sks_list);
+ list_add(&sks->sks_list, &skc->skc_partial_list);
+ }
+
+ /*
+ * Move empty slabs to the end of the partial list so
+ * they can be easily found and freed during reclamation.
+ */
+ if (sks->sks_ref == 0) {
+ list_del(&sks->sks_list);
+ list_add_tail(&sks->sks_list, &skc->skc_partial_list);
+ skc->skc_slab_alloc--;
+ }
+}
+
+/*
+ * Allocate an object from the per-cpu magazine, or if the magazine
+ * is empty directly allocate from a slab and repopulate the magazine.
+ */
+void *
+spl_kmem_cache_alloc(spl_kmem_cache_t *skc, int flags)
+{
+ spl_kmem_magazine_t *skm;
+ void *obj = NULL;
+
+ ASSERT0(flags & ~KM_PUBLIC_MASK);
+ ASSERT(skc->skc_magic == SKC_MAGIC);
+ ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags));
+
+ /*
+ * Allocate directly from a Linux slab. All optimizations are left
+ * to the underlying cache; we only need to guarantee that KM_SLEEP
+ * callers will never fail.
+ */
+ if (skc->skc_flags & KMC_SLAB) {
+ struct kmem_cache *slc = skc->skc_linux_cache;
+ do {
+ obj = kmem_cache_alloc(slc, kmem_flags_convert(flags));
+ } while ((obj == NULL) && !(flags & KM_NOSLEEP));
+
+ if (obj != NULL) {
+ /*
+ * Even though we leave everything up to the
+ * underlying cache we still keep track of
+ * how many objects we've allocated in it for
+ * better debuggability.
+ */
+ percpu_counter_inc(&skc->skc_linux_alloc);
+ }
+ goto ret;
+ }
+
+ local_irq_disable();
+
+restart:
+ /*
+ * Safe to update per-cpu structure without lock, but
+ * in the restart case we must be careful to reacquire
+ * the local magazine since this may have changed
+ * when we need to grow the cache.
+ */
+ skm = skc->skc_mag[smp_processor_id()];
+ ASSERT(skm->skm_magic == SKM_MAGIC);
+
+ if (likely(skm->skm_avail)) {
+ /* Object available in CPU cache, use it */
+ obj = skm->skm_objs[--skm->skm_avail];
+ } else {
+ obj = spl_cache_refill(skc, skm, flags);
+ if ((obj == NULL) && !(flags & KM_NOSLEEP))
+ goto restart;
+
+ local_irq_enable();
+ goto ret;
+ }
+
+ local_irq_enable();
+ ASSERT(obj);
+ ASSERT(IS_P2ALIGNED(obj, skc->skc_obj_align));
+
+ret:
+ /* Pre-emptively migrate object to CPU L1 cache */
+ if (obj) {
+ if (skc->skc_ctor)
+ skc->skc_ctor(obj, skc->skc_private, flags);
+ else
+ prefetchw(obj);
+ }
+
+ return (obj);
+}
+EXPORT_SYMBOL(spl_kmem_cache_alloc);
+
+/*
+ * Free an object back to the local per-cpu magazine; there is no
+ * guarantee that this is the same magazine the object was originally
+ * allocated from. We may need to flush entries from the magazine
+ * back to the slabs to make space.
+ */
+void
+spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj)
+{
+ spl_kmem_magazine_t *skm;
+ unsigned long flags;
+ int do_reclaim = 0;
+ int do_emergency = 0;
+
+ ASSERT(skc->skc_magic == SKC_MAGIC);
+ ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags));
+
+ /*
+ * Run the destructor
+ */
+ if (skc->skc_dtor)
+ skc->skc_dtor(obj, skc->skc_private);
+
+ /*
+ * Free the object back to the underlying Linux slab.
+ */
+ if (skc->skc_flags & KMC_SLAB) {
+ kmem_cache_free(skc->skc_linux_cache, obj);
+ percpu_counter_dec(&skc->skc_linux_alloc);
+ return;
+ }
+
+ /*
+ * While a cache has outstanding emergency objects all freed objects
+ * must be checked. However, since emergency objects will never use
+ * a virtual address these objects can be safely excluded as an
+ * optimization.
+ */
+ if (!is_vmalloc_addr(obj)) {
+ spin_lock(&skc->skc_lock);
+ do_emergency = (skc->skc_obj_emergency > 0);
+ spin_unlock(&skc->skc_lock);
+
+ if (do_emergency && (spl_emergency_free(skc, obj) == 0))
+ return;
+ }
+
+ local_irq_save(flags);
+
+ /*
+ * Safe to update the per-cpu structure without a lock, but
+ * since no remote memory allocation tracking is performed
+ * it is entirely possible to allocate an object from one
+ * CPU cache and return it to another.
+ */
+ skm = skc->skc_mag[smp_processor_id()];
+ ASSERT(skm->skm_magic == SKM_MAGIC);
+
+ /*
+ * Per-CPU cache full, flush it to make space for this object,
+ * this may result in an empty slab which can be reclaimed once
+ * interrupts are re-enabled.
+ */
+ if (unlikely(skm->skm_avail >= skm->skm_size)) {
+ spl_cache_flush(skc, skm, skm->skm_refill);
+ do_reclaim = 1;
+ }
+
+ /* Available space in cache, use it */
+ skm->skm_objs[skm->skm_avail++] = obj;
+
+ local_irq_restore(flags);
+
+ if (do_reclaim)
+ spl_slab_reclaim(skc);
+}
+EXPORT_SYMBOL(spl_kmem_cache_free);
+
+/*
+ * Depending on how many and which objects are released it may simply
+ * repopulate the local magazine which will then need to age-out. Objects
+ * which cannot fit in the magazine will be released back to their slabs
+ * which will also need to age out before being released. This is all just
+ * best effort and we do not want to thrash creating and destroying slabs.
+ */
+void
+spl_kmem_cache_reap_now(spl_kmem_cache_t *skc)
+{
+ ASSERT(skc->skc_magic == SKC_MAGIC);
+ ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags));
+
+ if (skc->skc_flags & KMC_SLAB)
+ return;
+
+ atomic_inc(&skc->skc_ref);
+
+ /*
+ * Prevent concurrent cache reaping when contended.
+ */
+ if (test_and_set_bit(KMC_BIT_REAPING, &skc->skc_flags))
+ goto out;
+
+ /* Reclaim from the magazine and free all now empty slabs. */
+ unsigned long irq_flags;
+ local_irq_save(irq_flags);
+ spl_kmem_magazine_t *skm = skc->skc_mag[smp_processor_id()];
+ spl_cache_flush(skc, skm, skm->skm_avail);
+ local_irq_restore(irq_flags);
+
+ spl_slab_reclaim(skc);
+ clear_bit_unlock(KMC_BIT_REAPING, &skc->skc_flags);
+ smp_mb__after_atomic();
+ wake_up_bit(&skc->skc_flags, KMC_BIT_REAPING);
+out:
+ atomic_dec(&skc->skc_ref);
+}
+EXPORT_SYMBOL(spl_kmem_cache_reap_now);
+
+/*
+ * This is stubbed out for code consistency with other platforms. There
+ * is existing logic to prevent concurrent reaping so while this is ugly
+ * it should do no harm.
+ */
+int
+spl_kmem_cache_reap_active()
+{
+ return (0);
+}
+EXPORT_SYMBOL(spl_kmem_cache_reap_active);
+
+/*
+ * Reap all free slabs from all registered caches.
+ */
+void
+spl_kmem_reap(void)
+{
+ spl_kmem_cache_t *skc = NULL;
+
+ down_read(&spl_kmem_cache_sem);
+ list_for_each_entry(skc, &spl_kmem_cache_list, skc_list) {
+ spl_kmem_cache_reap_now(skc);
+ }
+ up_read(&spl_kmem_cache_sem);
+}
+EXPORT_SYMBOL(spl_kmem_reap);
+
+int
+spl_kmem_cache_init(void)
+{
+ init_rwsem(&spl_kmem_cache_sem);
+ INIT_LIST_HEAD(&spl_kmem_cache_list);
+ spl_kmem_cache_taskq = taskq_create("spl_kmem_cache",
+ spl_kmem_cache_kmem_threads, maxclsyspri,
+ spl_kmem_cache_kmem_threads * 8, INT_MAX,
+ TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
+
+ return (0);
+}
+
+void
+spl_kmem_cache_fini(void)
+{
+ taskq_destroy(spl_kmem_cache_taskq);
+}
diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-kmem.c b/sys/contrib/openzfs/module/os/linux/spl/spl-kmem.c
new file mode 100644
index 000000000000..943966cbb17a
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/spl/spl-kmem.c
@@ -0,0 +1,617 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <sys/debug.h>
+#include <sys/sysmacros.h>
+#include <sys/kmem.h>
+#include <sys/vmem.h>
+
+/*
+ * As a general rule kmem_alloc() allocations should be small, preferably
+ * just a few pages since they must be physically contiguous. Therefore, a
+ * rate limited warning will be printed to the console for any kmem_alloc()
+ * which exceeds a reasonable threshold.
+ *
+ * The default warning threshold is set to sixteen pages but capped at 64K to
+ * accommodate systems using large pages. This value was selected to be small
+ * enough to ensure the largest allocations are quickly noticed and fixed,
+ * but large enough to avoid logging any warnings when an allocation size is
+ * larger than optimal but not a serious concern. Since this value is tunable,
+ * developers are encouraged to set it lower when testing so any new largish
+ * allocations are quickly caught. These warnings may be disabled by setting
+ * the threshold to zero.
+ */
+/* BEGIN CSTYLED */
+unsigned int spl_kmem_alloc_warn = MIN(16 * PAGE_SIZE, 64 * 1024);
+module_param(spl_kmem_alloc_warn, uint, 0644);
+MODULE_PARM_DESC(spl_kmem_alloc_warn,
+ "Warning threshold in bytes for a kmem_alloc()");
+EXPORT_SYMBOL(spl_kmem_alloc_warn);
+
+/*
+ * Large kmem_alloc() allocations will fail if they exceed KMALLOC_MAX_SIZE.
+ * Allocations which are marginally smaller than this limit may succeed but
+ * should still be avoided due to the expense of locating a contiguous range
+ * of free pages. Therefore, a maximum kmem size with a reasonable safety
+ * margin of 4x is set. Kmem_alloc() allocations larger than this maximum
+ * will quickly fail. Vmem_alloc() allocations less than or equal to this
+ * value will use kmalloc(), but shift to vmalloc() when exceeding this value.
+ */
+unsigned int spl_kmem_alloc_max = (KMALLOC_MAX_SIZE >> 2);
+module_param(spl_kmem_alloc_max, uint, 0644);
+MODULE_PARM_DESC(spl_kmem_alloc_max,
+ "Maximum size in bytes for a kmem_alloc()");
+EXPORT_SYMBOL(spl_kmem_alloc_max);
+/* END CSTYLED */
+
+int
+kmem_debugging(void)
+{
+ return (0);
+}
+EXPORT_SYMBOL(kmem_debugging);
+
+char *
+kmem_vasprintf(const char *fmt, va_list ap)
+{
+ va_list aq;
+ char *ptr;
+
+ do {
+ va_copy(aq, ap);
+ ptr = kvasprintf(kmem_flags_convert(KM_SLEEP), fmt, aq);
+ va_end(aq);
+ } while (ptr == NULL);
+
+ return (ptr);
+}
+EXPORT_SYMBOL(kmem_vasprintf);
+
+char *
+kmem_asprintf(const char *fmt, ...)
+{
+ va_list ap;
+ char *ptr;
+
+ do {
+ va_start(ap, fmt);
+ ptr = kvasprintf(kmem_flags_convert(KM_SLEEP), fmt, ap);
+ va_end(ap);
+ } while (ptr == NULL);
+
+ return (ptr);
+}
+EXPORT_SYMBOL(kmem_asprintf);
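+
+/*
+ * Illustrative sketch (not part of this change): the usual pairing of
+ * kmem_asprintf() with kmem_strfree(). The function name below is
+ * hypothetical.
+ */
+#if 0
+static void
+example_asprintf_usage(const char *pool, int id)
+{
+ /* Retries internally with KM_SLEEP semantics, so it never returns NULL. */
+ char *name = kmem_asprintf("%s-%d", pool, id);
+ /* ... use name ... */
+ kmem_strfree(name);
+}
+#endif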
+
+static char *
+__strdup(const char *str, int flags)
+{
+ char *ptr;
+ int n;
+
+ n = strlen(str);
+ ptr = kmalloc(n + 1, kmem_flags_convert(flags));
+ if (ptr)
+ memcpy(ptr, str, n + 1);
+
+ return (ptr);
+}
+
+char *
+kmem_strdup(const char *str)
+{
+ return (__strdup(str, KM_SLEEP));
+}
+EXPORT_SYMBOL(kmem_strdup);
+
+void
+kmem_strfree(char *str)
+{
+ kfree(str);
+}
+EXPORT_SYMBOL(kmem_strfree);
+
+void *
+spl_kvmalloc(size_t size, gfp_t lflags)
+{
+#ifdef HAVE_KVMALLOC
+ /*
+ * GFP_KERNEL allocations can safely use kvmalloc which may
+ * improve performance by avoiding a) high latency caused by
+ * vmalloc's on-access allocation, b) performance loss due to
+ * MMU memory address mapping and c) vmalloc locking overhead.
+ * This has the side-effect that the slab statistics will
+ * incorrectly report this as a vmem allocation, but that is
+ * purely cosmetic.
+ */
+ if ((lflags & GFP_KERNEL) == GFP_KERNEL)
+ return (kvmalloc(size, lflags));
+#endif
+
+ gfp_t kmalloc_lflags = lflags;
+
+ if (size > PAGE_SIZE) {
+ /*
+ * We need to set __GFP_NOWARN here since spl_kvmalloc is not
+ * only called by spl_kmem_alloc_impl but can be called
+ * directly with custom lflags, too. In that case
+ * kmem_flags_convert does not get called, which would
+ * implicitly set __GFP_NOWARN.
+ */
+ kmalloc_lflags |= __GFP_NOWARN;
+
+ /*
+ * N.B. __GFP_RETRY_MAYFAIL is supported only for large
+ * (>32kB) allocations.
+ *
+ * We have to override __GFP_RETRY_MAYFAIL by __GFP_NORETRY
+ * for !costly requests because there is no other way to tell
+ * the allocator that we want to fail rather than retry
+ * endlessly.
+ */
+ if (!(kmalloc_lflags & __GFP_RETRY_MAYFAIL) ||
+ (size <= PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
+ kmalloc_lflags |= __GFP_NORETRY;
+ }
+ }
+
+ /*
+ * We first try kmalloc - even for big sizes - and fall back to
+ * spl_vmalloc if that fails.
+ *
+ * For non-__GFP_RECLAIM allocations we always stick to
+ * kmalloc_node, and fail when kmalloc is not successful (returns
+ * NULL).
+ * We cannot fall back to spl_vmalloc in this case because spl_vmalloc
+ * internally uses GFP_KERNEL allocations.
+ */
+ void *ptr = kmalloc_node(size, kmalloc_lflags, NUMA_NO_NODE);
+ if (ptr || size <= PAGE_SIZE ||
+ (lflags & __GFP_RECLAIM) != __GFP_RECLAIM) {
+ return (ptr);
+ }
+
+ return (spl_vmalloc(size, lflags | __GFP_HIGHMEM));
+}
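+
+/*
+ * Illustrative sketch (not part of this change): calling spl_kvmalloc()
+ * directly with custom GFP flags, as mentioned in the comments above. The
+ * buffer may come from kmalloc() or vmalloc(), so it is released through
+ * spl_kmem_free_impl(), which checks is_vmalloc_addr(). Names below are
+ * hypothetical.
+ */
+#if 0
+static void
+example_kvmalloc_usage(size_t size)
+{
+ void *buf = spl_kvmalloc(size, GFP_KERNEL);
+ if (buf != NULL) {
+ /* ... use buf ... */
+ spl_kmem_free_impl(buf, size);
+ }
+}
+#endif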
+
+/*
+ * General purpose unified implementation of kmem_alloc(). It is an
+ * amalgamation of Linux and Illumos allocator design. It should never be
+ * exported to ensure that code using kmem_alloc()/kmem_zalloc() remains
+ * relatively portable. Consumers may only access this function through
+ * wrappers that enforce the common flags to ensure portability.
+ */
+inline void *
+spl_kmem_alloc_impl(size_t size, int flags, int node)
+{
+ gfp_t lflags = kmem_flags_convert(flags);
+ void *ptr;
+
+ /*
+ * Log abnormally large allocations and rate limit the console output.
+ * Allocations larger than spl_kmem_alloc_warn should be performed
+ * through the vmem_alloc()/vmem_zalloc() interfaces.
+ */
+ if ((spl_kmem_alloc_warn > 0) && (size > spl_kmem_alloc_warn) &&
+ !(flags & KM_VMEM)) {
+ printk(KERN_WARNING
+ "Large kmem_alloc(%lu, 0x%x), please file an issue at:\n"
+ "https://github.com/openzfs/zfs/issues/new\n",
+ (unsigned long)size, flags);
+ dump_stack();
+ }
+
+ /*
+ * Use a loop because kmalloc_node() can fail when GFP_KERNEL is used
+ * unlike kmem_alloc() with KM_SLEEP on Illumos.
+ */
+ do {
+ /*
+ * Calling kmalloc_node() when the size >= spl_kmem_alloc_max
+ * is unsafe. This must fail for all kmem_alloc() and
+ * kmem_zalloc() callers.
+ *
+ * For vmem_alloc() and vmem_zalloc() callers it is permissible
+ * to use spl_vmalloc(). However, in general use of
+ * spl_vmalloc() is strongly discouraged because a global lock
+ * must be acquired. Contention on this lock can significantly
+ * impact performance so frequently manipulating the virtual
+ * address space is strongly discouraged.
+ */
+ if (size > spl_kmem_alloc_max) {
+ if (flags & KM_VMEM) {
+ ptr = spl_vmalloc(size, lflags | __GFP_HIGHMEM);
+ } else {
+ return (NULL);
+ }
+ } else {
+ if (flags & KM_VMEM) {
+ ptr = spl_kvmalloc(size, lflags);
+ } else {
+ ptr = kmalloc_node(size, lflags, node);
+ }
+ }
+
+ if (likely(ptr) || (flags & KM_NOSLEEP))
+ return (ptr);
+
+ /*
+ * Try hard to satisfy the allocation. However, when progress
+ * cannot be made, the allocation is allowed to fail.
+ */
+ if ((lflags & GFP_KERNEL) == GFP_KERNEL)
+ lflags |= __GFP_RETRY_MAYFAIL;
+
+ /*
+ * Use cond_resched() instead of congestion_wait() to avoid
+ * deadlocking systems where there are no block devices.
+ */
+ cond_resched();
+ } while (1);
+
+ return (NULL);
+}
+
+inline void
+spl_kmem_free_impl(const void *buf, size_t size)
+{
+ if (is_vmalloc_addr(buf))
+ vfree(buf);
+ else
+ kfree(buf);
+}
+
+/*
+ * Memory allocation and accounting for kmem_* style allocations. When
+ * DEBUG_KMEM is enabled the total memory allocated will be tracked and
+ * any memory leaked will be reported during module unload.
+ *
+ * ./configure --enable-debug-kmem
+ */
+#ifdef DEBUG_KMEM
+
+/* Shim layer memory accounting */
+#ifdef HAVE_ATOMIC64_T
+atomic64_t kmem_alloc_used = ATOMIC64_INIT(0);
+unsigned long long kmem_alloc_max = 0;
+#else /* HAVE_ATOMIC64_T */
+atomic_t kmem_alloc_used = ATOMIC_INIT(0);
+unsigned long long kmem_alloc_max = 0;
+#endif /* HAVE_ATOMIC64_T */
+
+EXPORT_SYMBOL(kmem_alloc_used);
+EXPORT_SYMBOL(kmem_alloc_max);
+
+inline void *
+spl_kmem_alloc_debug(size_t size, int flags, int node)
+{
+ void *ptr;
+
+ ptr = spl_kmem_alloc_impl(size, flags, node);
+ if (ptr) {
+ kmem_alloc_used_add(size);
+ if (unlikely(kmem_alloc_used_read() > kmem_alloc_max))
+ kmem_alloc_max = kmem_alloc_used_read();
+ }
+
+ return (ptr);
+}
+
+inline void
+spl_kmem_free_debug(const void *ptr, size_t size)
+{
+ kmem_alloc_used_sub(size);
+ spl_kmem_free_impl(ptr, size);
+}
+
+/*
+ * When DEBUG_KMEM_TRACKING is enabled not only will total bytes be tracked
+ * but also the location of every alloc and free. When the SPL module is
+ * unloaded a list of all leaked addresses and where they were allocated
+ * will be dumped to the console. Enabling this feature has a significant
+ * impact on performance, but it makes finding memory leaks straightforward.
+ *
+ * Not surprisingly, with debugging enabled the xmem_locks are very highly
+ * contended, particularly on xfree(). If we want to run with this detailed
+ * debugging enabled for anything other than debugging we need to minimize
+ * the contention by moving to a lock per xmem_table entry model.
+ *
+ * ./configure --enable-debug-kmem-tracking
+ */
+#ifdef DEBUG_KMEM_TRACKING
+
+#include <linux/hash.h>
+#include <linux/ctype.h>
+
+#define KMEM_HASH_BITS 10
+#define KMEM_TABLE_SIZE (1 << KMEM_HASH_BITS)
+
+typedef struct kmem_debug {
+ struct hlist_node kd_hlist; /* Hash node linkage */
+ struct list_head kd_list; /* List of all allocations */
+ void *kd_addr; /* Allocation pointer */
+ size_t kd_size; /* Allocation size */
+ const char *kd_func; /* Allocation function */
+ int kd_line; /* Allocation line */
+} kmem_debug_t;
+
+static spinlock_t kmem_lock;
+static struct hlist_head kmem_table[KMEM_TABLE_SIZE];
+static struct list_head kmem_list;
+
+static kmem_debug_t *
+kmem_del_init(spinlock_t *lock, struct hlist_head *table,
+ int bits, const void *addr)
+{
+ struct hlist_head *head;
+ struct hlist_node *node = NULL;
+ struct kmem_debug *p;
+ unsigned long flags;
+
+ spin_lock_irqsave(lock, flags);
+
+ head = &table[hash_ptr((void *)addr, bits)];
+ hlist_for_each(node, head) {
+ p = list_entry(node, struct kmem_debug, kd_hlist);
+ if (p->kd_addr == addr) {
+ hlist_del_init(&p->kd_hlist);
+ list_del_init(&p->kd_list);
+ spin_unlock_irqrestore(lock, flags);
+ return (p);
+ }
+ }
+
+ spin_unlock_irqrestore(lock, flags);
+
+ return (NULL);
+}
+
+inline void *
+spl_kmem_alloc_track(size_t size, int flags,
+ const char *func, int line, int node)
+{
+ void *ptr = NULL;
+ kmem_debug_t *dptr;
+ unsigned long irq_flags;
+
+ dptr = kmalloc(sizeof (kmem_debug_t), kmem_flags_convert(flags));
+ if (dptr == NULL)
+ return (NULL);
+
+ dptr->kd_func = __strdup(func, flags);
+ if (dptr->kd_func == NULL) {
+ kfree(dptr);
+ return (NULL);
+ }
+
+ ptr = spl_kmem_alloc_debug(size, flags, node);
+ if (ptr == NULL) {
+ kfree(dptr->kd_func);
+ kfree(dptr);
+ return (NULL);
+ }
+
+ INIT_HLIST_NODE(&dptr->kd_hlist);
+ INIT_LIST_HEAD(&dptr->kd_list);
+
+ dptr->kd_addr = ptr;
+ dptr->kd_size = size;
+ dptr->kd_line = line;
+
+ spin_lock_irqsave(&kmem_lock, irq_flags);
+ hlist_add_head(&dptr->kd_hlist,
+ &kmem_table[hash_ptr(ptr, KMEM_HASH_BITS)]);
+ list_add_tail(&dptr->kd_list, &kmem_list);
+ spin_unlock_irqrestore(&kmem_lock, irq_flags);
+
+ return (ptr);
+}
+
+inline void
+spl_kmem_free_track(const void *ptr, size_t size)
+{
+ kmem_debug_t *dptr;
+
+ /* Ignore NULL pointer since we haven't tracked it at all */
+ if (ptr == NULL)
+ return;
+
+ /* Must exist in hash due to kmem_alloc() */
+ dptr = kmem_del_init(&kmem_lock, kmem_table, KMEM_HASH_BITS, ptr);
+ ASSERT3P(dptr, !=, NULL);
+ ASSERT3S(dptr->kd_size, ==, size);
+
+ kfree(dptr->kd_func);
+ kfree(dptr);
+
+ spl_kmem_free_debug(ptr, size);
+}
+#endif /* DEBUG_KMEM_TRACKING */
+#endif /* DEBUG_KMEM */
+
+/*
+ * Public kmem_alloc(), kmem_zalloc() and kmem_free() interfaces.
+ */
+void *
+spl_kmem_alloc(size_t size, int flags, const char *func, int line)
+{
+ ASSERT0(flags & ~KM_PUBLIC_MASK);
+
+#if !defined(DEBUG_KMEM)
+ return (spl_kmem_alloc_impl(size, flags, NUMA_NO_NODE));
+#elif !defined(DEBUG_KMEM_TRACKING)
+ return (spl_kmem_alloc_debug(size, flags, NUMA_NO_NODE));
+#else
+ return (spl_kmem_alloc_track(size, flags, func, line, NUMA_NO_NODE));
+#endif
+}
+EXPORT_SYMBOL(spl_kmem_alloc);
+
+void *
+spl_kmem_zalloc(size_t size, int flags, const char *func, int line)
+{
+ ASSERT0(flags & ~KM_PUBLIC_MASK);
+
+ flags |= KM_ZERO;
+
+#if !defined(DEBUG_KMEM)
+ return (spl_kmem_alloc_impl(size, flags, NUMA_NO_NODE));
+#elif !defined(DEBUG_KMEM_TRACKING)
+ return (spl_kmem_alloc_debug(size, flags, NUMA_NO_NODE));
+#else
+ return (spl_kmem_alloc_track(size, flags, func, line, NUMA_NO_NODE));
+#endif
+}
+EXPORT_SYMBOL(spl_kmem_zalloc);
+
+void
+spl_kmem_free(const void *buf, size_t size)
+{
+#if !defined(DEBUG_KMEM)
+ return (spl_kmem_free_impl(buf, size));
+#elif !defined(DEBUG_KMEM_TRACKING)
+ return (spl_kmem_free_debug(buf, size));
+#else
+ return (spl_kmem_free_track(buf, size));
+#endif
+}
+EXPORT_SYMBOL(spl_kmem_free);
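+
+/*
+ * Illustrative sketch (not part of this change): the exported interfaces
+ * above are normally reached through the kmem_alloc()/kmem_zalloc()/
+ * kmem_free() macros in sys/kmem.h, which pass __func__ and __LINE__ for
+ * the optional leak tracking. The structure and function names below are
+ * hypothetical.
+ */
+#if 0
+typedef struct example_ctx {
+ int ec_id;
+ char ec_name[32];
+} example_ctx_t;
+
+static void
+example_kmem_usage(void)
+{
+ example_ctx_t *ctx = kmem_zalloc(sizeof (*ctx), KM_SLEEP);
+ ctx->ec_id = 1;
+ /* ... */
+ kmem_free(ctx, sizeof (*ctx));
+}
+#endif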
+
+#if defined(DEBUG_KMEM) && defined(DEBUG_KMEM_TRACKING)
+static char *
+spl_sprintf_addr(kmem_debug_t *kd, char *str, int len, int min)
+{
+ int size = ((len - 1) < kd->kd_size) ? (len - 1) : kd->kd_size;
+ int i, flag = 1;
+
+ ASSERT(str != NULL && len >= 17);
+ memset(str, 0, len);
+
+ /*
+ * Check for a fully printable string, and while we are at
+ * it place the printable characters in the passed buffer.
+ */
+ for (i = 0; i < size; i++) {
+ str[i] = ((char *)(kd->kd_addr))[i];
+ if (isprint(str[i])) {
+ continue;
+ } else {
+ /*
+ * Minimum number of printable characters found
+ * to make it worthwhile to print this as ascii.
+ */
+ if (i > min)
+ break;
+
+ flag = 0;
+ break;
+ }
+ }
+
+ if (!flag) {
+ sprintf(str, "%02x%02x%02x%02x%02x%02x%02x%02x",
+ *((uint8_t *)kd->kd_addr),
+ *((uint8_t *)kd->kd_addr + 2),
+ *((uint8_t *)kd->kd_addr + 4),
+ *((uint8_t *)kd->kd_addr + 6),
+ *((uint8_t *)kd->kd_addr + 8),
+ *((uint8_t *)kd->kd_addr + 10),
+ *((uint8_t *)kd->kd_addr + 12),
+ *((uint8_t *)kd->kd_addr + 14));
+ }
+
+ return (str);
+}
+
+static int
+spl_kmem_init_tracking(struct list_head *list, spinlock_t *lock, int size)
+{
+ int i;
+
+ spin_lock_init(lock);
+ INIT_LIST_HEAD(list);
+
+ for (i = 0; i < size; i++)
+ INIT_HLIST_HEAD(&kmem_table[i]);
+
+ return (0);
+}
+
+static void
+spl_kmem_fini_tracking(struct list_head *list, spinlock_t *lock)
+{
+ unsigned long flags;
+ kmem_debug_t *kd = NULL;
+ char str[17];
+
+ spin_lock_irqsave(lock, flags);
+ if (!list_empty(list))
+ printk(KERN_WARNING "%-16s %-5s %-16s %s:%s\n", "address",
+ "size", "data", "func", "line");
+
+ list_for_each_entry(kd, list, kd_list) {
+ printk(KERN_WARNING "%p %-5d %-16s %s:%d\n", kd->kd_addr,
+ (int)kd->kd_size, spl_sprintf_addr(kd, str, 17, 8),
+ kd->kd_func, kd->kd_line);
+ }
+
+ spin_unlock_irqrestore(lock, flags);
+}
+#endif /* DEBUG_KMEM && DEBUG_KMEM_TRACKING */
+
+int
+spl_kmem_init(void)
+{
+#ifdef DEBUG_KMEM
+ kmem_alloc_used_set(0);
+
+#ifdef DEBUG_KMEM_TRACKING
+ spl_kmem_init_tracking(&kmem_list, &kmem_lock, KMEM_TABLE_SIZE);
+#endif /* DEBUG_KMEM_TRACKING */
+#endif /* DEBUG_KMEM */
+
+ return (0);
+}
+
+void
+spl_kmem_fini(void)
+{
+#ifdef DEBUG_KMEM
+ /*
+ * Display all unreclaimed memory addresses, including the
+ * allocation size and the first few bytes of what's located
+ * at that address to aid in debugging. Performance is not
+ * a serious concern here since it is module unload time.
+ */
+ if (kmem_alloc_used_read() != 0)
+ printk(KERN_WARNING "kmem leaked %ld/%llu bytes\n",
+ (unsigned long)kmem_alloc_used_read(), kmem_alloc_max);
+
+#ifdef DEBUG_KMEM_TRACKING
+ spl_kmem_fini_tracking(&kmem_list, &kmem_lock);
+#endif /* DEBUG_KMEM_TRACKING */
+#endif /* DEBUG_KMEM */
+}
diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-kstat.c b/sys/contrib/openzfs/module/os/linux/spl/spl-kstat.c
new file mode 100644
index 000000000000..c7f1aadf784e
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/spl/spl-kstat.c
@@ -0,0 +1,781 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Solaris Porting Layer (SPL) Kstat Implementation.
+ *
+ * Links to Illumos.org for more information on kstat function:
+ * [1] https://illumos.org/man/1M/kstat
+ * [2] https://illumos.org/man/9f/kstat_create
+ */
+
+#include <linux/seq_file.h>
+#include <sys/kstat.h>
+#include <sys/vmem.h>
+#include <sys/cmn_err.h>
+#include <sys/sysmacros.h>
+
+static kmutex_t kstat_module_lock;
+static struct list_head kstat_module_list;
+static kid_t kstat_id;
+
+static int
+kstat_resize_raw(kstat_t *ksp)
+{
+ if (ksp->ks_raw_bufsize == KSTAT_RAW_MAX)
+ return (ENOMEM);
+
+ vmem_free(ksp->ks_raw_buf, ksp->ks_raw_bufsize);
+ ksp->ks_raw_bufsize = MIN(ksp->ks_raw_bufsize * 2, KSTAT_RAW_MAX);
+ ksp->ks_raw_buf = vmem_alloc(ksp->ks_raw_bufsize, KM_SLEEP);
+
+ return (0);
+}
+
+void
+kstat_waitq_enter(kstat_io_t *kiop)
+{
+ hrtime_t new, delta;
+ ulong_t wcnt;
+
+ new = gethrtime();
+ delta = new - kiop->wlastupdate;
+ kiop->wlastupdate = new;
+ wcnt = kiop->wcnt++;
+ if (wcnt != 0) {
+ kiop->wlentime += delta * wcnt;
+ kiop->wtime += delta;
+ }
+}
+EXPORT_SYMBOL(kstat_waitq_enter);
+
+void
+kstat_waitq_exit(kstat_io_t *kiop)
+{
+ hrtime_t new, delta;
+ ulong_t wcnt;
+
+ new = gethrtime();
+ delta = new - kiop->wlastupdate;
+ kiop->wlastupdate = new;
+ wcnt = kiop->wcnt--;
+ ASSERT((int)wcnt > 0);
+ kiop->wlentime += delta * wcnt;
+ kiop->wtime += delta;
+}
+EXPORT_SYMBOL(kstat_waitq_exit);
+
+void
+kstat_runq_enter(kstat_io_t *kiop)
+{
+ hrtime_t new, delta;
+ ulong_t rcnt;
+
+ new = gethrtime();
+ delta = new - kiop->rlastupdate;
+ kiop->rlastupdate = new;
+ rcnt = kiop->rcnt++;
+ if (rcnt != 0) {
+ kiop->rlentime += delta * rcnt;
+ kiop->rtime += delta;
+ }
+}
+EXPORT_SYMBOL(kstat_runq_enter);
+
+void
+kstat_runq_exit(kstat_io_t *kiop)
+{
+ hrtime_t new, delta;
+ ulong_t rcnt;
+
+ new = gethrtime();
+ delta = new - kiop->rlastupdate;
+ kiop->rlastupdate = new;
+ rcnt = kiop->rcnt--;
+ ASSERT((int)rcnt > 0);
+ kiop->rlentime += delta * rcnt;
+ kiop->rtime += delta;
+}
+EXPORT_SYMBOL(kstat_runq_exit);
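+
+/*
+ * Illustrative sketch (not part of this change): a typical pairing of the
+ * wait/run queue hooks above around a single read. Holding the kstat's
+ * ks_lock around each update is an assumption made for this example; the
+ * names are hypothetical.
+ */
+#if 0
+static void
+example_io_accounting(kstat_t *ksp, kstat_io_t *kiop, size_t nbytes)
+{
+ mutex_enter(ksp->ks_lock);
+ kstat_waitq_enter(kiop); /* request queued */
+ mutex_exit(ksp->ks_lock);
+
+ /* ... request waits to be serviced ... */
+
+ mutex_enter(ksp->ks_lock);
+ kstat_waitq_exit(kiop);
+ kstat_runq_enter(kiop); /* request now being serviced */
+ mutex_exit(ksp->ks_lock);
+
+ /* ... perform the read ... */
+
+ mutex_enter(ksp->ks_lock);
+ kstat_runq_exit(kiop);
+ kiop->reads++;
+ kiop->nread += nbytes;
+ mutex_exit(ksp->ks_lock);
+}
+#endif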
+
+static int
+kstat_seq_show_headers(struct seq_file *f)
+{
+ kstat_t *ksp = (kstat_t *)f->private;
+ int rc = 0;
+
+ ASSERT(ksp->ks_magic == KS_MAGIC);
+
+ seq_printf(f, "%d %d 0x%02x %d %d %lld %lld\n",
+ ksp->ks_kid, ksp->ks_type, ksp->ks_flags,
+ ksp->ks_ndata, (int)ksp->ks_data_size,
+ ksp->ks_crtime, ksp->ks_snaptime);
+
+ switch (ksp->ks_type) {
+ case KSTAT_TYPE_RAW:
+restart:
+ if (ksp->ks_raw_ops.headers) {
+ rc = ksp->ks_raw_ops.headers(
+ ksp->ks_raw_buf, ksp->ks_raw_bufsize);
+ if (rc == ENOMEM && !kstat_resize_raw(ksp))
+ goto restart;
+ if (!rc)
+ seq_puts(f, ksp->ks_raw_buf);
+ } else {
+ seq_printf(f, "raw data\n");
+ }
+ break;
+ case KSTAT_TYPE_NAMED:
+ seq_printf(f, "%-31s %-4s %s\n",
+ "name", "type", "data");
+ break;
+ case KSTAT_TYPE_INTR:
+ seq_printf(f, "%-8s %-8s %-8s %-8s %-8s\n",
+ "hard", "soft", "watchdog",
+ "spurious", "multsvc");
+ break;
+ case KSTAT_TYPE_IO:
+ seq_printf(f,
+ "%-8s %-8s %-8s %-8s %-8s %-8s "
+ "%-8s %-8s %-8s %-8s %-8s %-8s\n",
+ "nread", "nwritten", "reads", "writes",
+ "wtime", "wlentime", "wupdate",
+ "rtime", "rlentime", "rupdate",
+ "wcnt", "rcnt");
+ break;
+ case KSTAT_TYPE_TIMER:
+ seq_printf(f,
+ "%-31s %-8s "
+ "%-8s %-8s %-8s %-8s %-8s\n",
+ "name", "events", "elapsed",
+ "min", "max", "start", "stop");
+ break;
+ default:
+ PANIC("Undefined kstat type %d\n", ksp->ks_type);
+ }
+
+ return (-rc);
+}
+
+static int
+kstat_seq_show_raw(struct seq_file *f, unsigned char *p, int l)
+{
+ int i, j;
+
+ for (i = 0; ; i++) {
+ seq_printf(f, "%03x:", i);
+
+ for (j = 0; j < 16; j++) {
+ if (i * 16 + j >= l) {
+ seq_printf(f, "\n");
+ goto out;
+ }
+
+ seq_printf(f, " %02x", (unsigned char)p[i * 16 + j]);
+ }
+ seq_printf(f, "\n");
+ }
+out:
+ return (0);
+}
+
+static int
+kstat_seq_show_named(struct seq_file *f, kstat_named_t *knp)
+{
+ seq_printf(f, "%-31s %-4d ", knp->name, knp->data_type);
+
+ switch (knp->data_type) {
+ case KSTAT_DATA_CHAR:
+ knp->value.c[15] = '\0'; /* NULL terminate */
+ seq_printf(f, "%-16s", knp->value.c);
+ break;
+ /*
+ * NOTE - We need to be more careful about what tokens are
+ * used for each arch; for now this is correct for x86_64.
+ */
+ case KSTAT_DATA_INT32:
+ seq_printf(f, "%d", knp->value.i32);
+ break;
+ case KSTAT_DATA_UINT32:
+ seq_printf(f, "%u", knp->value.ui32);
+ break;
+ case KSTAT_DATA_INT64:
+ seq_printf(f, "%lld", (signed long long)knp->value.i64);
+ break;
+ case KSTAT_DATA_UINT64:
+ seq_printf(f, "%llu",
+ (unsigned long long)knp->value.ui64);
+ break;
+ case KSTAT_DATA_LONG:
+ seq_printf(f, "%ld", knp->value.l);
+ break;
+ case KSTAT_DATA_ULONG:
+ seq_printf(f, "%lu", knp->value.ul);
+ break;
+ case KSTAT_DATA_STRING:
+ KSTAT_NAMED_STR_PTR(knp)
+ [KSTAT_NAMED_STR_BUFLEN(knp)-1] = '\0';
+ seq_printf(f, "%s", KSTAT_NAMED_STR_PTR(knp));
+ break;
+ default:
+ PANIC("Undefined kstat data type %d\n", knp->data_type);
+ }
+
+ seq_printf(f, "\n");
+
+ return (0);
+}
+
+static int
+kstat_seq_show_intr(struct seq_file *f, kstat_intr_t *kip)
+{
+ seq_printf(f, "%-8u %-8u %-8u %-8u %-8u\n",
+ kip->intrs[KSTAT_INTR_HARD],
+ kip->intrs[KSTAT_INTR_SOFT],
+ kip->intrs[KSTAT_INTR_WATCHDOG],
+ kip->intrs[KSTAT_INTR_SPURIOUS],
+ kip->intrs[KSTAT_INTR_MULTSVC]);
+
+ return (0);
+}
+
+static int
+kstat_seq_show_io(struct seq_file *f, kstat_io_t *kip)
+{
+ /* though wlentime & friends are signed, they will never be negative */
+ seq_printf(f,
+ "%-8llu %-8llu %-8u %-8u %-8llu %-8llu "
+ "%-8llu %-8llu %-8llu %-8llu %-8u %-8u\n",
+ kip->nread, kip->nwritten,
+ kip->reads, kip->writes,
+ kip->wtime, kip->wlentime, kip->wlastupdate,
+ kip->rtime, kip->rlentime, kip->rlastupdate,
+ kip->wcnt, kip->rcnt);
+
+ return (0);
+}
+
+static int
+kstat_seq_show_timer(struct seq_file *f, kstat_timer_t *ktp)
+{
+ seq_printf(f,
+ "%-31s %-8llu %-8llu %-8llu %-8llu %-8llu %-8llu\n",
+ ktp->name, ktp->num_events, ktp->elapsed_time,
+ ktp->min_time, ktp->max_time,
+ ktp->start_time, ktp->stop_time);
+
+ return (0);
+}
+
+static int
+kstat_seq_show(struct seq_file *f, void *p)
+{
+ kstat_t *ksp = (kstat_t *)f->private;
+ int rc = 0;
+
+ ASSERT(ksp->ks_magic == KS_MAGIC);
+
+ switch (ksp->ks_type) {
+ case KSTAT_TYPE_RAW:
+restart:
+ if (ksp->ks_raw_ops.data) {
+ rc = ksp->ks_raw_ops.data(
+ ksp->ks_raw_buf, ksp->ks_raw_bufsize, p);
+ if (rc == ENOMEM && !kstat_resize_raw(ksp))
+ goto restart;
+ if (!rc)
+ seq_puts(f, ksp->ks_raw_buf);
+ } else {
+ ASSERT(ksp->ks_ndata == 1);
+ rc = kstat_seq_show_raw(f, ksp->ks_data,
+ ksp->ks_data_size);
+ }
+ break;
+ case KSTAT_TYPE_NAMED:
+ rc = kstat_seq_show_named(f, (kstat_named_t *)p);
+ break;
+ case KSTAT_TYPE_INTR:
+ rc = kstat_seq_show_intr(f, (kstat_intr_t *)p);
+ break;
+ case KSTAT_TYPE_IO:
+ rc = kstat_seq_show_io(f, (kstat_io_t *)p);
+ break;
+ case KSTAT_TYPE_TIMER:
+ rc = kstat_seq_show_timer(f, (kstat_timer_t *)p);
+ break;
+ default:
+ PANIC("Undefined kstat type %d\n", ksp->ks_type);
+ }
+
+ return (-rc);
+}
+
+static int
+kstat_default_update(kstat_t *ksp, int rw)
+{
+ ASSERT(ksp != NULL);
+
+ if (rw == KSTAT_WRITE)
+ return (EACCES);
+
+ return (0);
+}
+
+static void *
+kstat_seq_data_addr(kstat_t *ksp, loff_t n)
+{
+ void *rc = NULL;
+
+ switch (ksp->ks_type) {
+ case KSTAT_TYPE_RAW:
+ if (ksp->ks_raw_ops.addr)
+ rc = ksp->ks_raw_ops.addr(ksp, n);
+ else
+ rc = ksp->ks_data;
+ break;
+ case KSTAT_TYPE_NAMED:
+ rc = ksp->ks_data + n * sizeof (kstat_named_t);
+ break;
+ case KSTAT_TYPE_INTR:
+ rc = ksp->ks_data + n * sizeof (kstat_intr_t);
+ break;
+ case KSTAT_TYPE_IO:
+ rc = ksp->ks_data + n * sizeof (kstat_io_t);
+ break;
+ case KSTAT_TYPE_TIMER:
+ rc = ksp->ks_data + n * sizeof (kstat_timer_t);
+ break;
+ default:
+ PANIC("Undefined kstat type %d\n", ksp->ks_type);
+ }
+
+ return (rc);
+}
+
+static void *
+kstat_seq_start(struct seq_file *f, loff_t *pos)
+{
+ loff_t n = *pos;
+ kstat_t *ksp = (kstat_t *)f->private;
+ ASSERT(ksp->ks_magic == KS_MAGIC);
+
+ mutex_enter(ksp->ks_lock);
+
+ if (ksp->ks_type == KSTAT_TYPE_RAW) {
+ ksp->ks_raw_bufsize = PAGE_SIZE;
+ ksp->ks_raw_buf = vmem_alloc(ksp->ks_raw_bufsize, KM_SLEEP);
+ }
+
+ /* Dynamically update the kstat; on error the existing kstat data is used */
+ (void) ksp->ks_update(ksp, KSTAT_READ);
+
+ ksp->ks_snaptime = gethrtime();
+
+ if (!(ksp->ks_flags & KSTAT_FLAG_NO_HEADERS) && !n &&
+ kstat_seq_show_headers(f))
+ return (NULL);
+
+ if (n >= ksp->ks_ndata)
+ return (NULL);
+
+ return (kstat_seq_data_addr(ksp, n));
+}
+
+static void *
+kstat_seq_next(struct seq_file *f, void *p, loff_t *pos)
+{
+ kstat_t *ksp = (kstat_t *)f->private;
+ ASSERT(ksp->ks_magic == KS_MAGIC);
+
+ ++*pos;
+ if (*pos >= ksp->ks_ndata)
+ return (NULL);
+
+ return (kstat_seq_data_addr(ksp, *pos));
+}
+
+static void
+kstat_seq_stop(struct seq_file *f, void *v)
+{
+ kstat_t *ksp = (kstat_t *)f->private;
+ ASSERT(ksp->ks_magic == KS_MAGIC);
+
+ if (ksp->ks_type == KSTAT_TYPE_RAW)
+ vmem_free(ksp->ks_raw_buf, ksp->ks_raw_bufsize);
+
+ mutex_exit(ksp->ks_lock);
+}
+
+static struct seq_operations kstat_seq_ops = {
+ .show = kstat_seq_show,
+ .start = kstat_seq_start,
+ .next = kstat_seq_next,
+ .stop = kstat_seq_stop,
+};
+
+static kstat_module_t *
+kstat_find_module(char *name)
+{
+ kstat_module_t *module = NULL;
+
+ list_for_each_entry(module, &kstat_module_list, ksm_module_list) {
+ if (strncmp(name, module->ksm_name, KSTAT_STRLEN) == 0)
+ return (module);
+ }
+
+ return (NULL);
+}
+
+static kstat_module_t *
+kstat_create_module(char *name)
+{
+ kstat_module_t *module;
+ struct proc_dir_entry *pde;
+
+ pde = proc_mkdir(name, proc_spl_kstat);
+ if (pde == NULL)
+ return (NULL);
+
+ module = kmem_alloc(sizeof (kstat_module_t), KM_SLEEP);
+ module->ksm_proc = pde;
+ strlcpy(module->ksm_name, name, KSTAT_STRLEN+1);
+ INIT_LIST_HEAD(&module->ksm_kstat_list);
+ list_add_tail(&module->ksm_module_list, &kstat_module_list);
+
+ return (module);
+
+}
+
+static void
+kstat_delete_module(kstat_module_t *module)
+{
+ ASSERT(list_empty(&module->ksm_kstat_list));
+ remove_proc_entry(module->ksm_name, proc_spl_kstat);
+ list_del(&module->ksm_module_list);
+ kmem_free(module, sizeof (kstat_module_t));
+}
+
+static int
+proc_kstat_open(struct inode *inode, struct file *filp)
+{
+ struct seq_file *f;
+ int rc;
+
+ rc = seq_open(filp, &kstat_seq_ops);
+ if (rc)
+ return (rc);
+
+ f = filp->private_data;
+ f->private = PDE_DATA(inode);
+
+ return (0);
+}
+
+static ssize_t
+proc_kstat_write(struct file *filp, const char __user *buf, size_t len,
+ loff_t *ppos)
+{
+ struct seq_file *f = filp->private_data;
+ kstat_t *ksp = f->private;
+ int rc;
+
+ ASSERT(ksp->ks_magic == KS_MAGIC);
+
+ mutex_enter(ksp->ks_lock);
+ rc = ksp->ks_update(ksp, KSTAT_WRITE);
+ mutex_exit(ksp->ks_lock);
+
+ if (rc)
+ return (-rc);
+
+ *ppos += len;
+ return (len);
+}
+
+static const kstat_proc_op_t proc_kstat_operations = {
+#ifdef HAVE_PROC_OPS_STRUCT
+ .proc_open = proc_kstat_open,
+ .proc_write = proc_kstat_write,
+ .proc_read = seq_read,
+ .proc_lseek = seq_lseek,
+ .proc_release = seq_release,
+#else
+ .open = proc_kstat_open,
+ .write = proc_kstat_write,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+#endif
+};
+
+void
+__kstat_set_raw_ops(kstat_t *ksp,
+ int (*headers)(char *buf, size_t size),
+ int (*data)(char *buf, size_t size, void *data),
+ void *(*addr)(kstat_t *ksp, loff_t index))
+{
+ ksp->ks_raw_ops.headers = headers;
+ ksp->ks_raw_ops.data = data;
+ ksp->ks_raw_ops.addr = addr;
+}
+EXPORT_SYMBOL(__kstat_set_raw_ops);
+
+void
+kstat_proc_entry_init(kstat_proc_entry_t *kpep, const char *module,
+ const char *name)
+{
+ kpep->kpe_owner = NULL;
+ kpep->kpe_proc = NULL;
+ INIT_LIST_HEAD(&kpep->kpe_list);
+ strncpy(kpep->kpe_module, module, KSTAT_STRLEN);
+ strncpy(kpep->kpe_name, name, KSTAT_STRLEN);
+}
+EXPORT_SYMBOL(kstat_proc_entry_init);
+
+kstat_t *
+__kstat_create(const char *ks_module, int ks_instance, const char *ks_name,
+ const char *ks_class, uchar_t ks_type, uint_t ks_ndata,
+ uchar_t ks_flags)
+{
+ kstat_t *ksp;
+
+ ASSERT(ks_module);
+ ASSERT(ks_instance == 0);
+ ASSERT(ks_name);
+
+ if ((ks_type == KSTAT_TYPE_INTR) || (ks_type == KSTAT_TYPE_IO))
+ ASSERT(ks_ndata == 1);
+
+ ksp = kmem_zalloc(sizeof (*ksp), KM_SLEEP);
+ if (ksp == NULL)
+ return (ksp);
+
+ mutex_enter(&kstat_module_lock);
+ ksp->ks_kid = kstat_id;
+ kstat_id++;
+ mutex_exit(&kstat_module_lock);
+
+ ksp->ks_magic = KS_MAGIC;
+ mutex_init(&ksp->ks_private_lock, NULL, MUTEX_DEFAULT, NULL);
+ ksp->ks_lock = &ksp->ks_private_lock;
+
+ ksp->ks_crtime = gethrtime();
+ ksp->ks_snaptime = ksp->ks_crtime;
+ ksp->ks_instance = ks_instance;
+ strncpy(ksp->ks_class, ks_class, KSTAT_STRLEN);
+ ksp->ks_type = ks_type;
+ ksp->ks_flags = ks_flags;
+ ksp->ks_update = kstat_default_update;
+ ksp->ks_private = NULL;
+ ksp->ks_raw_ops.headers = NULL;
+ ksp->ks_raw_ops.data = NULL;
+ ksp->ks_raw_ops.addr = NULL;
+ ksp->ks_raw_buf = NULL;
+ ksp->ks_raw_bufsize = 0;
+ kstat_proc_entry_init(&ksp->ks_proc, ks_module, ks_name);
+
+ switch (ksp->ks_type) {
+ case KSTAT_TYPE_RAW:
+ ksp->ks_ndata = 1;
+ ksp->ks_data_size = ks_ndata;
+ break;
+ case KSTAT_TYPE_NAMED:
+ ksp->ks_ndata = ks_ndata;
+ ksp->ks_data_size = ks_ndata * sizeof (kstat_named_t);
+ break;
+ case KSTAT_TYPE_INTR:
+ ksp->ks_ndata = ks_ndata;
+ ksp->ks_data_size = ks_ndata * sizeof (kstat_intr_t);
+ break;
+ case KSTAT_TYPE_IO:
+ ksp->ks_ndata = ks_ndata;
+ ksp->ks_data_size = ks_ndata * sizeof (kstat_io_t);
+ break;
+ case KSTAT_TYPE_TIMER:
+ ksp->ks_ndata = ks_ndata;
+ ksp->ks_data_size = ks_ndata * sizeof (kstat_timer_t);
+ break;
+ default:
+ PANIC("Undefined kstat type %d\n", ksp->ks_type);
+ }
+
+ if (ksp->ks_flags & KSTAT_FLAG_VIRTUAL) {
+ ksp->ks_data = NULL;
+ } else {
+ ksp->ks_data = kmem_zalloc(ksp->ks_data_size, KM_SLEEP);
+ if (ksp->ks_data == NULL) {
+ kmem_free(ksp, sizeof (*ksp));
+ ksp = NULL;
+ }
+ }
+
+ return (ksp);
+}
+EXPORT_SYMBOL(__kstat_create);
+
+static int
+kstat_detect_collision(kstat_proc_entry_t *kpep)
+{
+ kstat_module_t *module;
+ kstat_proc_entry_t *tmp = NULL;
+ char *parent;
+ char *cp;
+
+ parent = kmem_asprintf("%s", kpep->kpe_module);
+
+ if ((cp = strrchr(parent, '/')) == NULL) {
+ kmem_strfree(parent);
+ return (0);
+ }
+
+ cp[0] = '\0';
+ if ((module = kstat_find_module(parent)) != NULL) {
+ list_for_each_entry(tmp, &module->ksm_kstat_list, kpe_list) {
+ if (strncmp(tmp->kpe_name, cp+1, KSTAT_STRLEN) == 0) {
+ kmem_strfree(parent);
+ return (EEXIST);
+ }
+ }
+ }
+
+ kmem_strfree(parent);
+ return (0);
+}
+
+/*
+ * Add a file to the proc filesystem under the kstat namespace (i.e.
+ * /proc/spl/kstat/). The file need not be implemented as a
+ * kstat.
+ */
+void
+kstat_proc_entry_install(kstat_proc_entry_t *kpep, mode_t mode,
+ const kstat_proc_op_t *proc_ops, void *data)
+{
+ kstat_module_t *module;
+ kstat_proc_entry_t *tmp = NULL;
+
+ ASSERT(kpep);
+
+ mutex_enter(&kstat_module_lock);
+
+ module = kstat_find_module(kpep->kpe_module);
+ if (module == NULL) {
+ if (kstat_detect_collision(kpep) != 0) {
+ cmn_err(CE_WARN, "kstat_create('%s', '%s'): namespace" \
+ " collision", kpep->kpe_module, kpep->kpe_name);
+ goto out;
+ }
+ module = kstat_create_module(kpep->kpe_module);
+ if (module == NULL)
+ goto out;
+ }
+
+ /*
+ * Only one entry by this name per-module, on failure the module
+ * shouldn't be deleted because we know it has at least one entry.
+ */
+ list_for_each_entry(tmp, &module->ksm_kstat_list, kpe_list) {
+ if (strncmp(tmp->kpe_name, kpep->kpe_name, KSTAT_STRLEN) == 0)
+ goto out;
+ }
+
+ list_add_tail(&kpep->kpe_list, &module->ksm_kstat_list);
+
+ kpep->kpe_owner = module;
+ kpep->kpe_proc = proc_create_data(kpep->kpe_name, mode,
+ module->ksm_proc, proc_ops, data);
+ if (kpep->kpe_proc == NULL) {
+ list_del_init(&kpep->kpe_list);
+ if (list_empty(&module->ksm_kstat_list))
+ kstat_delete_module(module);
+ }
+out:
+ mutex_exit(&kstat_module_lock);
+
+}
+EXPORT_SYMBOL(kstat_proc_entry_install);
+
+void
+__kstat_install(kstat_t *ksp)
+{
+ ASSERT(ksp);
+ mode_t mode;
+ /* Specify permission modes for different kstats */
+ if (strncmp(ksp->ks_proc.kpe_name, "dbufs", KSTAT_STRLEN) == 0) {
+ mode = 0600;
+ } else {
+ mode = 0644;
+ }
+ kstat_proc_entry_install(
+ &ksp->ks_proc, mode, &proc_kstat_operations, ksp);
+}
+EXPORT_SYMBOL(__kstat_install);
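+
+/*
+ * Illustrative sketch (not part of this change): creating and installing a
+ * small KSTAT_TYPE_NAMED kstat through the kstat_create()/kstat_install()
+ * wrappers declared in sys/kstat.h. The module and statistic names are
+ * hypothetical.
+ */
+#if 0
+static kstat_named_t example_stats[] = {
+ { "hits", KSTAT_DATA_UINT64 },
+ { "misses", KSTAT_DATA_UINT64 },
+};
+
+static kstat_t *example_ksp;
+
+static void
+example_kstat_setup(void)
+{
+ example_ksp = kstat_create("example", 0, "stats", "misc",
+ KSTAT_TYPE_NAMED, sizeof (example_stats) / sizeof (kstat_named_t),
+ KSTAT_FLAG_VIRTUAL);
+ if (example_ksp != NULL) {
+ example_ksp->ks_data = example_stats;
+ kstat_install(example_ksp);
+ }
+}
+#endif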
+
+void
+kstat_proc_entry_delete(kstat_proc_entry_t *kpep)
+{
+ kstat_module_t *module = kpep->kpe_owner;
+ if (kpep->kpe_proc)
+ remove_proc_entry(kpep->kpe_name, module->ksm_proc);
+
+ mutex_enter(&kstat_module_lock);
+ list_del_init(&kpep->kpe_list);
+
+ /*
+ * Remove top level module directory if it wasn't empty before, but now
+ * is.
+ */
+ if (kpep->kpe_proc && list_empty(&module->ksm_kstat_list))
+ kstat_delete_module(module);
+ mutex_exit(&kstat_module_lock);
+
+}
+EXPORT_SYMBOL(kstat_proc_entry_delete);
+
+void
+__kstat_delete(kstat_t *ksp)
+{
+ kstat_proc_entry_delete(&ksp->ks_proc);
+
+ if (!(ksp->ks_flags & KSTAT_FLAG_VIRTUAL))
+ kmem_free(ksp->ks_data, ksp->ks_data_size);
+
+ ksp->ks_lock = NULL;
+ mutex_destroy(&ksp->ks_private_lock);
+ kmem_free(ksp, sizeof (*ksp));
+}
+EXPORT_SYMBOL(__kstat_delete);
+
+int
+spl_kstat_init(void)
+{
+ mutex_init(&kstat_module_lock, NULL, MUTEX_DEFAULT, NULL);
+ INIT_LIST_HEAD(&kstat_module_list);
+ kstat_id = 0;
+ return (0);
+}
+
+void
+spl_kstat_fini(void)
+{
+ ASSERT(list_empty(&kstat_module_list));
+ mutex_destroy(&kstat_module_lock);
+}
diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-proc.c b/sys/contrib/openzfs/module/os/linux/spl/spl-proc.c
new file mode 100644
index 000000000000..3e58598d43f8
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/spl/spl-proc.c
@@ -0,0 +1,790 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Solaris Porting Layer (SPL) Proc Implementation.
+ */
+
+#include <sys/systeminfo.h>
+#include <sys/kstat.h>
+#include <sys/kmem.h>
+#include <sys/kmem_cache.h>
+#include <sys/vmem.h>
+#include <sys/taskq.h>
+#include <sys/proc.h>
+#include <linux/ctype.h>
+#include <linux/kmod.h>
+#include <linux/seq_file.h>
+#include <linux/uaccess.h>
+#include <linux/version.h>
+
+#if defined(CONSTIFY_PLUGIN) && LINUX_VERSION_CODE >= KERNEL_VERSION(3, 8, 0)
+typedef struct ctl_table __no_const spl_ctl_table;
+#else
+typedef struct ctl_table spl_ctl_table;
+#endif
+
+static unsigned long table_min = 0;
+static unsigned long table_max = ~0;
+
+static struct ctl_table_header *spl_header = NULL;
+static struct proc_dir_entry *proc_spl = NULL;
+static struct proc_dir_entry *proc_spl_kmem = NULL;
+static struct proc_dir_entry *proc_spl_kmem_slab = NULL;
+static struct proc_dir_entry *proc_spl_taskq_all = NULL;
+static struct proc_dir_entry *proc_spl_taskq = NULL;
+struct proc_dir_entry *proc_spl_kstat = NULL;
+
+static int
+proc_copyin_string(char *kbuffer, int kbuffer_size, const char *ubuffer,
+ int ubuffer_size)
+{
+ int size;
+
+ if (ubuffer_size > kbuffer_size)
+ return (-EOVERFLOW);
+
+ if (copy_from_user((void *)kbuffer, (void *)ubuffer, ubuffer_size))
+ return (-EFAULT);
+
+ /* strip trailing whitespace */
+ size = strnlen(kbuffer, ubuffer_size);
+ while (size-- >= 0)
+ if (!isspace(kbuffer[size]))
+ break;
+
+ /* empty string */
+ if (size < 0)
+ return (-EINVAL);
+
+ /* no space to terminate */
+ if (size == kbuffer_size)
+ return (-EOVERFLOW);
+
+ kbuffer[size + 1] = 0;
+ return (0);
+}
+
+static int
+proc_copyout_string(char *ubuffer, int ubuffer_size, const char *kbuffer,
+ char *append)
+{
+ /*
+ * NB if 'append' != NULL, it's a single character to append to the
+ * copied out string - usually "\n", for /proc entries and ""
+ * (i.e. a terminating zero byte) for sysctl entries
+ */
+ int size = MIN(strlen(kbuffer), ubuffer_size);
+
+ if (copy_to_user(ubuffer, kbuffer, size))
+ return (-EFAULT);
+
+ if (append != NULL && size < ubuffer_size) {
+ if (copy_to_user(ubuffer + size, append, 1))
+ return (-EFAULT);
+
+ size++;
+ }
+
+ return (size);
+}
+
+#ifdef DEBUG_KMEM
+static int
+proc_domemused(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ int rc = 0;
+ unsigned long min = 0, max = ~0, val;
+ spl_ctl_table dummy = *table;
+
+ dummy.data = &val;
+ dummy.proc_handler = &proc_dointvec;
+ dummy.extra1 = &min;
+ dummy.extra2 = &max;
+
+ if (write) {
+ *ppos += *lenp;
+ } else {
+#ifdef HAVE_ATOMIC64_T
+ val = atomic64_read((atomic64_t *)table->data);
+#else
+ val = atomic_read((atomic_t *)table->data);
+#endif /* HAVE_ATOMIC64_T */
+ rc = proc_doulongvec_minmax(&dummy, write, buffer, lenp, ppos);
+ }
+
+ return (rc);
+}
+#endif /* DEBUG_KMEM */
+
+static int
+proc_doslab(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ int rc = 0;
+ unsigned long min = 0, max = ~0, val = 0, mask;
+ spl_ctl_table dummy = *table;
+ spl_kmem_cache_t *skc = NULL;
+
+ dummy.data = &val;
+ dummy.proc_handler = &proc_dointvec;
+ dummy.extra1 = &min;
+ dummy.extra2 = &max;
+
+ if (write) {
+ *ppos += *lenp;
+ } else {
+ down_read(&spl_kmem_cache_sem);
+ mask = (unsigned long)table->data;
+
+ list_for_each_entry(skc, &spl_kmem_cache_list, skc_list) {
+
+ /* Only use slabs of the correct kmem/vmem type */
+ if (!(skc->skc_flags & mask))
+ continue;
+
+ /* Sum the specified field for selected slabs */
+ switch (mask & (KMC_TOTAL | KMC_ALLOC | KMC_MAX)) {
+ case KMC_TOTAL:
+ val += skc->skc_slab_size * skc->skc_slab_total;
+ break;
+ case KMC_ALLOC:
+ val += skc->skc_obj_size * skc->skc_obj_alloc;
+ break;
+ case KMC_MAX:
+ val += skc->skc_obj_size * skc->skc_obj_max;
+ break;
+ }
+ }
+
+ up_read(&spl_kmem_cache_sem);
+ rc = proc_doulongvec_minmax(&dummy, write, buffer, lenp, ppos);
+ }
+
+ return (rc);
+}
+
+static int
+proc_dohostid(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ int len, rc = 0;
+ char *end, str[32];
+
+ if (write) {
+ /*
+ * We can't use proc_doulongvec_minmax() in the write
+ * case here because hostid, while a hex value, has no
+ * leading 0x, which confuses the helper function.
+ */
+ rc = proc_copyin_string(str, sizeof (str), buffer, *lenp);
+ if (rc < 0)
+ return (rc);
+
+ spl_hostid = simple_strtoul(str, &end, 16);
+ if (str == end)
+ return (-EINVAL);
+
+ } else {
+ len = snprintf(str, sizeof (str), "%lx",
+ (unsigned long) zone_get_hostid(NULL));
+ if (*ppos >= len)
+ rc = 0;
+ else
+ rc = proc_copyout_string(buffer,
+ *lenp, str + *ppos, "\n");
+
+ if (rc >= 0) {
+ *lenp = rc;
+ *ppos += rc;
+ }
+ }
+
+ return (rc);
+}
+
+static void
+taskq_seq_show_headers(struct seq_file *f)
+{
+ seq_printf(f, "%-25s %5s %5s %5s %5s %5s %5s %12s %5s %10s\n",
+ "taskq", "act", "nthr", "spwn", "maxt", "pri",
+ "mina", "maxa", "cura", "flags");
+}
+
+/* indices into the lheads array below */
+#define LHEAD_PEND 0
+#define LHEAD_PRIO 1
+#define LHEAD_DELAY 2
+#define LHEAD_WAIT 3
+#define LHEAD_ACTIVE 4
+#define LHEAD_SIZE 5
+
+/* BEGIN CSTYLED */
+static unsigned int spl_max_show_tasks = 512;
+module_param(spl_max_show_tasks, uint, 0644);
+MODULE_PARM_DESC(spl_max_show_tasks, "Max number of tasks shown in taskq proc");
+/* END CSTYLED */
+
+static int
+taskq_seq_show_impl(struct seq_file *f, void *p, boolean_t allflag)
+{
+ taskq_t *tq = p;
+ taskq_thread_t *tqt = NULL;
+ spl_wait_queue_entry_t *wq;
+ struct task_struct *tsk;
+ taskq_ent_t *tqe;
+ char name[100];
+ struct list_head *lheads[LHEAD_SIZE], *lh;
+ static char *list_names[LHEAD_SIZE] =
+ {"pend", "prio", "delay", "wait", "active" };
+ int i, j, have_lheads = 0;
+ unsigned long wflags, flags;
+
+ spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class);
+ spin_lock_irqsave(&tq->tq_wait_waitq.lock, wflags);
+
+ /* get the various lists and check whether they're empty */
+ lheads[LHEAD_PEND] = &tq->tq_pend_list;
+ lheads[LHEAD_PRIO] = &tq->tq_prio_list;
+ lheads[LHEAD_DELAY] = &tq->tq_delay_list;
+#ifdef HAVE_WAIT_QUEUE_HEAD_ENTRY
+ lheads[LHEAD_WAIT] = &tq->tq_wait_waitq.head;
+#else
+ lheads[LHEAD_WAIT] = &tq->tq_wait_waitq.task_list;
+#endif
+ lheads[LHEAD_ACTIVE] = &tq->tq_active_list;
+
+ for (i = 0; i < LHEAD_SIZE; ++i) {
+ if (list_empty(lheads[i]))
+ lheads[i] = NULL;
+ else
+ ++have_lheads;
+ }
+
+ /* early return in non-"all" mode if lists are all empty */
+ if (!allflag && !have_lheads) {
+ spin_unlock_irqrestore(&tq->tq_wait_waitq.lock, wflags);
+ spin_unlock_irqrestore(&tq->tq_lock, flags);
+ return (0);
+ }
+
+ /* unlock the waitq quickly */
+ if (!lheads[LHEAD_WAIT])
+ spin_unlock_irqrestore(&tq->tq_wait_waitq.lock, wflags);
+
+ /* show the base taskq contents */
+ snprintf(name, sizeof (name), "%s/%d", tq->tq_name, tq->tq_instance);
+ seq_printf(f, "%-25s ", name);
+ seq_printf(f, "%5d %5d %5d %5d %5d %5d %12d %5d %10x\n",
+ tq->tq_nactive, tq->tq_nthreads, tq->tq_nspawn,
+ tq->tq_maxthreads, tq->tq_pri, tq->tq_minalloc, tq->tq_maxalloc,
+ tq->tq_nalloc, tq->tq_flags);
+
+ /* show the active list */
+ if (lheads[LHEAD_ACTIVE]) {
+ j = 0;
+ list_for_each_entry(tqt, &tq->tq_active_list, tqt_active_list) {
+ if (j == 0)
+ seq_printf(f, "\t%s:",
+ list_names[LHEAD_ACTIVE]);
+ else if (j == 2) {
+ seq_printf(f, "\n\t ");
+ j = 0;
+ }
+ seq_printf(f, " [%d]%pf(%ps)",
+ tqt->tqt_thread->pid,
+ tqt->tqt_task->tqent_func,
+ tqt->tqt_task->tqent_arg);
+ ++j;
+ }
+ seq_printf(f, "\n");
+ }
+
+ for (i = LHEAD_PEND; i <= LHEAD_WAIT; ++i)
+ if (lheads[i]) {
+ j = 0;
+ list_for_each(lh, lheads[i]) {
+ if (spl_max_show_tasks != 0 &&
+ j >= spl_max_show_tasks) {
+ seq_printf(f, "\n\t(truncated)");
+ break;
+ }
+ /* show the wait waitq list */
+ if (i == LHEAD_WAIT) {
+#ifdef HAVE_WAIT_QUEUE_HEAD_ENTRY
+ wq = list_entry(lh,
+ spl_wait_queue_entry_t, entry);
+#else
+ wq = list_entry(lh,
+ spl_wait_queue_entry_t, task_list);
+#endif
+ if (j == 0)
+ seq_printf(f, "\t%s:",
+ list_names[i]);
+ else if (j % 8 == 0)
+ seq_printf(f, "\n\t ");
+
+ tsk = wq->private;
+ seq_printf(f, " %d", tsk->pid);
+ /* pend, prio and delay lists */
+ } else {
+ tqe = list_entry(lh, taskq_ent_t,
+ tqent_list);
+ if (j == 0)
+ seq_printf(f, "\t%s:",
+ list_names[i]);
+ else if (j % 2 == 0)
+ seq_printf(f, "\n\t ");
+
+ seq_printf(f, " %pf(%ps)",
+ tqe->tqent_func,
+ tqe->tqent_arg);
+ }
+ ++j;
+ }
+ seq_printf(f, "\n");
+ }
+ if (lheads[LHEAD_WAIT])
+ spin_unlock_irqrestore(&tq->tq_wait_waitq.lock, wflags);
+ spin_unlock_irqrestore(&tq->tq_lock, flags);
+
+ return (0);
+}
+
+static int
+taskq_all_seq_show(struct seq_file *f, void *p)
+{
+ return (taskq_seq_show_impl(f, p, B_TRUE));
+}
+
+static int
+taskq_seq_show(struct seq_file *f, void *p)
+{
+ return (taskq_seq_show_impl(f, p, B_FALSE));
+}
+
+static void *
+taskq_seq_start(struct seq_file *f, loff_t *pos)
+{
+ struct list_head *p;
+ loff_t n = *pos;
+
+ down_read(&tq_list_sem);
+ if (!n)
+ taskq_seq_show_headers(f);
+
+ p = tq_list.next;
+ while (n--) {
+ p = p->next;
+ if (p == &tq_list)
+ return (NULL);
+ }
+
+ return (list_entry(p, taskq_t, tq_taskqs));
+}
+
+static void *
+taskq_seq_next(struct seq_file *f, void *p, loff_t *pos)
+{
+ taskq_t *tq = p;
+
+ ++*pos;
+ return ((tq->tq_taskqs.next == &tq_list) ?
+ NULL : list_entry(tq->tq_taskqs.next, taskq_t, tq_taskqs));
+}
+
+static void
+slab_seq_show_headers(struct seq_file *f)
+{
+ seq_printf(f,
+ "--------------------- cache ----------"
+ "--------------------------------------------- "
+ "----- slab ------ "
+ "---- object ----- "
+ "--- emergency ---\n");
+ seq_printf(f,
+ "name "
+ " flags size alloc slabsize objsize "
+ "total alloc max "
+ "total alloc max "
+ "dlock alloc max\n");
+}
+
+static int
+slab_seq_show(struct seq_file *f, void *p)
+{
+ spl_kmem_cache_t *skc = p;
+
+ ASSERT(skc->skc_magic == SKC_MAGIC);
+
+ if (skc->skc_flags & KMC_SLAB) {
+ /*
+ * This cache is backed by a generic Linux kmem cache which
+ * has its own accounting. For these caches we only track
+ * the number of active allocated objects that exist within
+ * the underlying Linux slabs. For the overall statistics of
+ * the underlying Linux cache please refer to /proc/slabinfo.
+ */
+ spin_lock(&skc->skc_lock);
+ uint64_t objs_allocated =
+ percpu_counter_sum(&skc->skc_linux_alloc);
+ seq_printf(f, "%-36s ", skc->skc_name);
+ seq_printf(f, "0x%05lx %9s %9lu %8s %8u "
+ "%5s %5s %5s %5s %5lu %5s %5s %5s %5s\n",
+ (long unsigned)skc->skc_flags,
+ "-",
+ (long unsigned)(skc->skc_obj_size * objs_allocated),
+ "-",
+ (unsigned)skc->skc_obj_size,
+ "-", "-", "-", "-",
+ (long unsigned)objs_allocated,
+ "-", "-", "-", "-");
+ spin_unlock(&skc->skc_lock);
+ return (0);
+ }
+
+ spin_lock(&skc->skc_lock);
+ seq_printf(f, "%-36s ", skc->skc_name);
+ seq_printf(f, "0x%05lx %9lu %9lu %8u %8u "
+ "%5lu %5lu %5lu %5lu %5lu %5lu %5lu %5lu %5lu\n",
+ (long unsigned)skc->skc_flags,
+ (long unsigned)(skc->skc_slab_size * skc->skc_slab_total),
+ (long unsigned)(skc->skc_obj_size * skc->skc_obj_alloc),
+ (unsigned)skc->skc_slab_size,
+ (unsigned)skc->skc_obj_size,
+ (long unsigned)skc->skc_slab_total,
+ (long unsigned)skc->skc_slab_alloc,
+ (long unsigned)skc->skc_slab_max,
+ (long unsigned)skc->skc_obj_total,
+ (long unsigned)skc->skc_obj_alloc,
+ (long unsigned)skc->skc_obj_max,
+ (long unsigned)skc->skc_obj_deadlock,
+ (long unsigned)skc->skc_obj_emergency,
+ (long unsigned)skc->skc_obj_emergency_max);
+ spin_unlock(&skc->skc_lock);
+ return (0);
+}
+
+static void *
+slab_seq_start(struct seq_file *f, loff_t *pos)
+{
+ struct list_head *p;
+ loff_t n = *pos;
+
+ down_read(&spl_kmem_cache_sem);
+ if (!n)
+ slab_seq_show_headers(f);
+
+ p = spl_kmem_cache_list.next;
+ while (n--) {
+ p = p->next;
+ if (p == &spl_kmem_cache_list)
+ return (NULL);
+ }
+
+ return (list_entry(p, spl_kmem_cache_t, skc_list));
+}
+
+static void *
+slab_seq_next(struct seq_file *f, void *p, loff_t *pos)
+{
+ spl_kmem_cache_t *skc = p;
+
+ ++*pos;
+ return ((skc->skc_list.next == &spl_kmem_cache_list) ?
+ NULL : list_entry(skc->skc_list.next, spl_kmem_cache_t, skc_list));
+}
+
+static void
+slab_seq_stop(struct seq_file *f, void *v)
+{
+ up_read(&spl_kmem_cache_sem);
+}
+
+static struct seq_operations slab_seq_ops = {
+ .show = slab_seq_show,
+ .start = slab_seq_start,
+ .next = slab_seq_next,
+ .stop = slab_seq_stop,
+};
+
+static int
+proc_slab_open(struct inode *inode, struct file *filp)
+{
+ return (seq_open(filp, &slab_seq_ops));
+}
+
+static const kstat_proc_op_t proc_slab_operations = {
+#ifdef HAVE_PROC_OPS_STRUCT
+ .proc_open = proc_slab_open,
+ .proc_read = seq_read,
+ .proc_lseek = seq_lseek,
+ .proc_release = seq_release,
+#else
+ .open = proc_slab_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+#endif
+};
+
+static void
+taskq_seq_stop(struct seq_file *f, void *v)
+{
+ up_read(&tq_list_sem);
+}
+
+static struct seq_operations taskq_all_seq_ops = {
+ .show = taskq_all_seq_show,
+ .start = taskq_seq_start,
+ .next = taskq_seq_next,
+ .stop = taskq_seq_stop,
+};
+
+static struct seq_operations taskq_seq_ops = {
+ .show = taskq_seq_show,
+ .start = taskq_seq_start,
+ .next = taskq_seq_next,
+ .stop = taskq_seq_stop,
+};
+
+static int
+proc_taskq_all_open(struct inode *inode, struct file *filp)
+{
+ return (seq_open(filp, &taskq_all_seq_ops));
+}
+
+static int
+proc_taskq_open(struct inode *inode, struct file *filp)
+{
+ return (seq_open(filp, &taskq_seq_ops));
+}
+
+static const kstat_proc_op_t proc_taskq_all_operations = {
+#ifdef HAVE_PROC_OPS_STRUCT
+ .proc_open = proc_taskq_all_open,
+ .proc_read = seq_read,
+ .proc_lseek = seq_lseek,
+ .proc_release = seq_release,
+#else
+ .open = proc_taskq_all_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+#endif
+};
+
+static const kstat_proc_op_t proc_taskq_operations = {
+#ifdef HAVE_PROC_OPS_STRUCT
+ .proc_open = proc_taskq_open,
+ .proc_read = seq_read,
+ .proc_lseek = seq_lseek,
+ .proc_release = seq_release,
+#else
+ .open = proc_taskq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+#endif
+};
+
+static struct ctl_table spl_kmem_table[] = {
+#ifdef DEBUG_KMEM
+ {
+ .procname = "kmem_used",
+ .data = &kmem_alloc_used,
+#ifdef HAVE_ATOMIC64_T
+ .maxlen = sizeof (atomic64_t),
+#else
+ .maxlen = sizeof (atomic_t),
+#endif /* HAVE_ATOMIC64_T */
+ .mode = 0444,
+ .proc_handler = &proc_domemused,
+ },
+ {
+ .procname = "kmem_max",
+ .data = &kmem_alloc_max,
+ .maxlen = sizeof (unsigned long),
+ .extra1 = &table_min,
+ .extra2 = &table_max,
+ .mode = 0444,
+ .proc_handler = &proc_doulongvec_minmax,
+ },
+#endif /* DEBUG_KMEM */
+ {
+ .procname = "slab_kvmem_total",
+ .data = (void *)(KMC_KVMEM | KMC_TOTAL),
+ .maxlen = sizeof (unsigned long),
+ .extra1 = &table_min,
+ .extra2 = &table_max,
+ .mode = 0444,
+ .proc_handler = &proc_doslab,
+ },
+ {
+ .procname = "slab_kvmem_alloc",
+ .data = (void *)(KMC_KVMEM | KMC_ALLOC),
+ .maxlen = sizeof (unsigned long),
+ .extra1 = &table_min,
+ .extra2 = &table_max,
+ .mode = 0444,
+ .proc_handler = &proc_doslab,
+ },
+ {
+ .procname = "slab_kvmem_max",
+ .data = (void *)(KMC_KVMEM | KMC_MAX),
+ .maxlen = sizeof (unsigned long),
+ .extra1 = &table_min,
+ .extra2 = &table_max,
+ .mode = 0444,
+ .proc_handler = &proc_doslab,
+ },
+ {},
+};
+
+static struct ctl_table spl_kstat_table[] = {
+ {},
+};
+
+static struct ctl_table spl_table[] = {
+ /*
+ * NB No .strategy entries have been provided since
+ * sysctl(8) prefers to go via /proc for portability.
+ */
+ {
+ .procname = "gitrev",
+ .data = spl_gitrev,
+ .maxlen = sizeof (spl_gitrev),
+ .mode = 0444,
+ .proc_handler = &proc_dostring,
+ },
+ {
+ .procname = "hostid",
+ .data = &spl_hostid,
+ .maxlen = sizeof (unsigned long),
+ .mode = 0644,
+ .proc_handler = &proc_dohostid,
+ },
+ {
+ .procname = "kmem",
+ .mode = 0555,
+ .child = spl_kmem_table,
+ },
+ {
+ .procname = "kstat",
+ .mode = 0555,
+ .child = spl_kstat_table,
+ },
+ {},
+};
+
+static struct ctl_table spl_dir[] = {
+ {
+ .procname = "spl",
+ .mode = 0555,
+ .child = spl_table,
+ },
+ {}
+};
+
+static struct ctl_table spl_root[] = {
+ {
+ .procname = "kernel",
+ .mode = 0555,
+ .child = spl_dir,
+ },
+ {}
+};
+
+int
+spl_proc_init(void)
+{
+ int rc = 0;
+
+ spl_header = register_sysctl_table(spl_root);
+ if (spl_header == NULL)
+ return (-EUNATCH);
+
+ proc_spl = proc_mkdir("spl", NULL);
+ if (proc_spl == NULL) {
+ rc = -EUNATCH;
+ goto out;
+ }
+
+ proc_spl_taskq_all = proc_create_data("taskq-all", 0444, proc_spl,
+ &proc_taskq_all_operations, NULL);
+ if (proc_spl_taskq_all == NULL) {
+ rc = -EUNATCH;
+ goto out;
+ }
+
+ proc_spl_taskq = proc_create_data("taskq", 0444, proc_spl,
+ &proc_taskq_operations, NULL);
+ if (proc_spl_taskq == NULL) {
+ rc = -EUNATCH;
+ goto out;
+ }
+
+ proc_spl_kmem = proc_mkdir("kmem", proc_spl);
+ if (proc_spl_kmem == NULL) {
+ rc = -EUNATCH;
+ goto out;
+ }
+
+ proc_spl_kmem_slab = proc_create_data("slab", 0444, proc_spl_kmem,
+ &proc_slab_operations, NULL);
+ if (proc_spl_kmem_slab == NULL) {
+ rc = -EUNATCH;
+ goto out;
+ }
+
+ proc_spl_kstat = proc_mkdir("kstat", proc_spl);
+ if (proc_spl_kstat == NULL) {
+ rc = -EUNATCH;
+ goto out;
+ }
+out:
+ if (rc) {
+ remove_proc_entry("kstat", proc_spl);
+ remove_proc_entry("slab", proc_spl_kmem);
+ remove_proc_entry("kmem", proc_spl);
+ remove_proc_entry("taskq-all", proc_spl);
+ remove_proc_entry("taskq", proc_spl);
+ remove_proc_entry("spl", NULL);
+ unregister_sysctl_table(spl_header);
+ }
+
+ return (rc);
+}
+
+void
+spl_proc_fini(void)
+{
+ remove_proc_entry("kstat", proc_spl);
+ remove_proc_entry("slab", proc_spl_kmem);
+ remove_proc_entry("kmem", proc_spl);
+ remove_proc_entry("taskq-all", proc_spl);
+ remove_proc_entry("taskq", proc_spl);
+ remove_proc_entry("spl", NULL);
+
+ ASSERT(spl_header != NULL);
+ unregister_sysctl_table(spl_header);
+}
diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-procfs-list.c b/sys/contrib/openzfs/module/os/linux/spl/spl-procfs-list.c
new file mode 100644
index 000000000000..cae13228c62c
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/spl/spl-procfs-list.c
@@ -0,0 +1,284 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2018 by Delphix. All rights reserved.
+ */
+
+#include <sys/list.h>
+#include <sys/mutex.h>
+#include <sys/procfs_list.h>
+#include <linux/proc_fs.h>
+
+/*
+ * A procfs_list is a wrapper around a linked list which implements the seq_file
+ * interface, allowing the contents of the list to be exposed through procfs.
+ * The kernel already has some utilities to help implement the seq_file
+ * interface for linked lists (seq_list_*), but they aren't appropriate for use
+ * with lists that have many entries, because seq_list_start walks the list at
+ * the start of each read syscall to find where it left off, so reading a file
+ * ends up being quadratic in the number of entries in the list.
+ *
+ * This implementation avoids this penalty by maintaining a separate cursor into
+ * the list per instance of the file that is open. It also maintains some extra
+ * information in each node of the list to prevent reads of entries that have
+ * been dropped from the list.
+ *
+ * Callers should only add elements to the list using procfs_list_add, which
+ * adds an element to the tail of the list. Other operations can be performed
+ * directly on the wrapped list using the normal list manipulation functions,
+ * but elements should only be removed from the head of the list.
+ */
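+
+/*
+ * Illustrative sketch (not part of this change): how a consumer typically
+ * wires up a procfs_list with procfs_list_install() and appends entries with
+ * procfs_list_add(), as described above. The entry type, show callback and
+ * names are hypothetical.
+ */
+#if 0
+typedef struct example_entry {
+ procfs_list_node_t ee_node; /* must be embedded in each entry */
+ uint64_t ee_value;
+} example_entry_t;
+
+static procfs_list_t example_pl;
+
+static int
+example_show(struct seq_file *f, void *p)
+{
+ example_entry_t *ee = p;
+ seq_printf(f, "%llu\n", (u_longlong_t)ee->ee_value);
+ return (0);
+}
+
+static void
+example_install(void)
+{
+ procfs_list_install("example", NULL, "entries", 0444, &example_pl,
+ example_show, NULL, NULL, offsetof(example_entry_t, ee_node));
+}
+
+static void
+example_append(example_entry_t *ee)
+{
+ mutex_enter(&example_pl.pl_lock);
+ procfs_list_add(&example_pl, ee);
+ mutex_exit(&example_pl.pl_lock);
+}
+#endif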
+
+#define NODE_ID(procfs_list, obj) \
+ (((procfs_list_node_t *)(((char *)obj) + \
+ (procfs_list)->pl_node_offset))->pln_id)
+
+typedef struct procfs_list_cursor {
+ procfs_list_t *procfs_list; /* List into which this cursor points */
+ void *cached_node; /* Most recently accessed node */
+ loff_t cached_pos; /* Position of cached_node */
+} procfs_list_cursor_t;
+
+static int
+procfs_list_seq_show(struct seq_file *f, void *p)
+{
+ procfs_list_cursor_t *cursor = f->private;
+ procfs_list_t *procfs_list = cursor->procfs_list;
+
+ ASSERT(MUTEX_HELD(&procfs_list->pl_lock));
+ if (p == SEQ_START_TOKEN) {
+ if (procfs_list->pl_show_header != NULL)
+ return (procfs_list->pl_show_header(f));
+ else
+ return (0);
+ }
+ return (procfs_list->pl_show(f, p));
+}
+
+static void *
+procfs_list_next_node(procfs_list_cursor_t *cursor, loff_t *pos)
+{
+ void *next_node;
+ procfs_list_t *procfs_list = cursor->procfs_list;
+
+ if (cursor->cached_node == SEQ_START_TOKEN)
+ next_node = list_head(&procfs_list->pl_list);
+ else
+ next_node = list_next(&procfs_list->pl_list,
+ cursor->cached_node);
+
+ if (next_node != NULL) {
+ cursor->cached_node = next_node;
+ cursor->cached_pos = NODE_ID(procfs_list, cursor->cached_node);
+ *pos = cursor->cached_pos;
+ } else {
+ /*
+ * seq_read() expects ->next() to update the position even
+ * when there are no more entries. Advance the position to
+ * prevent a warning from being logged.
+ */
+ cursor->cached_node = NULL;
+ cursor->cached_pos++;
+ *pos = cursor->cached_pos;
+ }
+
+ return (next_node);
+}
+
+static void *
+procfs_list_seq_start(struct seq_file *f, loff_t *pos)
+{
+ procfs_list_cursor_t *cursor = f->private;
+ procfs_list_t *procfs_list = cursor->procfs_list;
+
+ mutex_enter(&procfs_list->pl_lock);
+
+ if (*pos == 0) {
+ cursor->cached_node = SEQ_START_TOKEN;
+ cursor->cached_pos = 0;
+ return (SEQ_START_TOKEN);
+ } else if (cursor->cached_node == NULL) {
+ return (NULL);
+ }
+
+ /*
+ * Check if our cached pointer has become stale, which happens if the
+ * message where we left off has been dropped from the list since
+ * the last read syscall completed.
+ */
+ void *oldest_node = list_head(&procfs_list->pl_list);
+ if (cursor->cached_node != SEQ_START_TOKEN && (oldest_node == NULL ||
+ NODE_ID(procfs_list, oldest_node) > cursor->cached_pos))
+ return (ERR_PTR(-EIO));
+
+ /*
+ * If it isn't starting from the beginning of the file, the seq_file
+ * code will either pick up at the same position it visited last or the
+ * following one.
+ */
+ if (*pos == cursor->cached_pos) {
+ return (cursor->cached_node);
+ } else {
+ ASSERT3U(*pos, ==, cursor->cached_pos + 1);
+ return (procfs_list_next_node(cursor, pos));
+ }
+}
+
+static void *
+procfs_list_seq_next(struct seq_file *f, void *p, loff_t *pos)
+{
+ procfs_list_cursor_t *cursor = f->private;
+ ASSERT(MUTEX_HELD(&cursor->procfs_list->pl_lock));
+ return (procfs_list_next_node(cursor, pos));
+}
+
+static void
+procfs_list_seq_stop(struct seq_file *f, void *p)
+{
+ procfs_list_cursor_t *cursor = f->private;
+ procfs_list_t *procfs_list = cursor->procfs_list;
+ mutex_exit(&procfs_list->pl_lock);
+}
+
+static struct seq_operations procfs_list_seq_ops = {
+ .show = procfs_list_seq_show,
+ .start = procfs_list_seq_start,
+ .next = procfs_list_seq_next,
+ .stop = procfs_list_seq_stop,
+};
+
+static int
+procfs_list_open(struct inode *inode, struct file *filp)
+{
+ int rc = seq_open_private(filp, &procfs_list_seq_ops,
+ sizeof (procfs_list_cursor_t));
+ if (rc != 0)
+ return (rc);
+
+ struct seq_file *f = filp->private_data;
+ procfs_list_cursor_t *cursor = f->private;
+ cursor->procfs_list = PDE_DATA(inode);
+ cursor->cached_node = NULL;
+ cursor->cached_pos = 0;
+
+ return (0);
+}
+
+static ssize_t
+procfs_list_write(struct file *filp, const char __user *buf, size_t len,
+ loff_t *ppos)
+{
+ struct seq_file *f = filp->private_data;
+ procfs_list_cursor_t *cursor = f->private;
+ procfs_list_t *procfs_list = cursor->procfs_list;
+ int rc;
+
+ if (procfs_list->pl_clear != NULL &&
+ (rc = procfs_list->pl_clear(procfs_list)) != 0)
+ return (-rc);
+ return (len);
+}
+
+static const kstat_proc_op_t procfs_list_operations = {
+#ifdef HAVE_PROC_OPS_STRUCT
+ .proc_open = procfs_list_open,
+ .proc_write = procfs_list_write,
+ .proc_read = seq_read,
+ .proc_lseek = seq_lseek,
+ .proc_release = seq_release_private,
+#else
+ .open = procfs_list_open,
+ .write = procfs_list_write,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_private,
+#endif
+};
+
+/*
+ * Initialize a procfs_list and create a file for it in the proc filesystem
+ * under the kstat namespace.
+ */
+void
+procfs_list_install(const char *module,
+ const char *submodule,
+ const char *name,
+ mode_t mode,
+ procfs_list_t *procfs_list,
+ int (*show)(struct seq_file *f, void *p),
+ int (*show_header)(struct seq_file *f),
+ int (*clear)(procfs_list_t *procfs_list),
+ size_t procfs_list_node_off)
+{
+ char *modulestr;
+
+ if (submodule != NULL)
+ modulestr = kmem_asprintf("%s/%s", module, submodule);
+ else
+ modulestr = kmem_asprintf("%s", module);
+ mutex_init(&procfs_list->pl_lock, NULL, MUTEX_DEFAULT, NULL);
+ list_create(&procfs_list->pl_list,
+ procfs_list_node_off + sizeof (procfs_list_node_t),
+ procfs_list_node_off + offsetof(procfs_list_node_t, pln_link));
+ procfs_list->pl_next_id = 1; /* Save id 0 for SEQ_START_TOKEN */
+ procfs_list->pl_show = show;
+ procfs_list->pl_show_header = show_header;
+ procfs_list->pl_clear = clear;
+ procfs_list->pl_node_offset = procfs_list_node_off;
+
+ kstat_proc_entry_init(&procfs_list->pl_kstat_entry, modulestr, name);
+ kstat_proc_entry_install(&procfs_list->pl_kstat_entry, mode,
+ &procfs_list_operations, procfs_list);
+ kmem_strfree(modulestr);
+}
+EXPORT_SYMBOL(procfs_list_install);
+
+/* Remove the proc filesystem file corresponding to the given list */
+void
+procfs_list_uninstall(procfs_list_t *procfs_list)
+{
+ kstat_proc_entry_delete(&procfs_list->pl_kstat_entry);
+}
+EXPORT_SYMBOL(procfs_list_uninstall);
+
+void
+procfs_list_destroy(procfs_list_t *procfs_list)
+{
+ ASSERT(list_is_empty(&procfs_list->pl_list));
+ list_destroy(&procfs_list->pl_list);
+ mutex_destroy(&procfs_list->pl_lock);
+}
+EXPORT_SYMBOL(procfs_list_destroy);
+
+/*
+ * Add a new node to the tail of the list. While the standard list manipulation
+ * functions can be used for all other operations, adding elements to the list
+ * should only be done using this helper so that the id of the new node is set
+ * correctly.
+ */
+void
+procfs_list_add(procfs_list_t *procfs_list, void *p)
+{
+ ASSERT(MUTEX_HELD(&procfs_list->pl_lock));
+ NODE_ID(procfs_list, p) = procfs_list->pl_next_id++;
+ list_insert_tail(&procfs_list->pl_list, p);
+}
+EXPORT_SYMBOL(procfs_list_add);
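Putting the pieces together, a consumer would typically install the list once, append entries with procfs_list_add() under pl_lock, and tear everything down on unload. The sketch below reuses the hypothetical my_entry_t/my_clear() from the earlier sketch; the module name, file name, and my_* helpers are illustrative assumptions, not part of this change.

static procfs_list_t my_list;

static int
my_show(struct seq_file *f, void *p)
{
	my_entry_t *e = p;

	seq_printf(f, "%llu\n", (u_longlong_t)e->me_payload);
	return (0);
}

static void
my_init(void)
{
	/* Typically ends up under /proc/spl/kstat/my_module/my_list. */
	procfs_list_install("my_module", NULL, "my_list", 0644, &my_list,
	    my_show, NULL, my_clear, offsetof(my_entry_t, me_node));
}

static void
my_log(uint64_t payload)
{
	my_entry_t *e = kmem_alloc(sizeof (*e), KM_SLEEP);

	e->me_payload = payload;
	mutex_enter(&my_list.pl_lock);
	procfs_list_add(&my_list, e);
	mutex_exit(&my_list.pl_lock);
}

static void
my_fini(void)
{
	procfs_list_uninstall(&my_list);
	(void) my_clear(&my_list);	/* free any remaining entries */
	procfs_list_destroy(&my_list);
}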
diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-taskq.c b/sys/contrib/openzfs/module/os/linux/spl/spl-taskq.c
new file mode 100644
index 000000000000..61631256c858
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/spl/spl-taskq.c
@@ -0,0 +1,1428 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Solaris Porting Layer (SPL) Task Queue Implementation.
+ */
+
+#include <sys/timer.h>
+#include <sys/taskq.h>
+#include <sys/kmem.h>
+#include <sys/tsd.h>
+#include <sys/trace_spl.h>
+#ifdef HAVE_CPU_HOTPLUG
+#include <linux/cpuhotplug.h>
+#endif
+
+int spl_taskq_thread_bind = 0;
+module_param(spl_taskq_thread_bind, int, 0644);
+MODULE_PARM_DESC(spl_taskq_thread_bind, "Bind taskq thread to CPU by default");
+
+
+int spl_taskq_thread_dynamic = 1;
+module_param(spl_taskq_thread_dynamic, int, 0444);
+MODULE_PARM_DESC(spl_taskq_thread_dynamic, "Allow dynamic taskq threads");
+
+int spl_taskq_thread_priority = 1;
+module_param(spl_taskq_thread_priority, int, 0644);
+MODULE_PARM_DESC(spl_taskq_thread_priority,
+ "Allow non-default priority for taskq threads");
+
+int spl_taskq_thread_sequential = 4;
+module_param(spl_taskq_thread_sequential, int, 0644);
+MODULE_PARM_DESC(spl_taskq_thread_sequential,
+ "Create new taskq threads after N sequential tasks");
+
+/* Global system-wide dynamic task queue available for all consumers */
+taskq_t *system_taskq;
+EXPORT_SYMBOL(system_taskq);
+/* Global dynamic task queue for long delay */
+taskq_t *system_delay_taskq;
+EXPORT_SYMBOL(system_delay_taskq);
+
+/* Private dedicated taskq for creating new taskq threads on demand. */
+static taskq_t *dynamic_taskq;
+static taskq_thread_t *taskq_thread_create(taskq_t *);
+
+#ifdef HAVE_CPU_HOTPLUG
+/* Multi-callback id for cpu hotplugging. */
+static int spl_taskq_cpuhp_state;
+#endif
+
+/* List of all taskqs */
+LIST_HEAD(tq_list);
+struct rw_semaphore tq_list_sem;
+static uint_t taskq_tsd;
+
+static int
+task_km_flags(uint_t flags)
+{
+ if (flags & TQ_NOSLEEP)
+ return (KM_NOSLEEP);
+
+ if (flags & TQ_PUSHPAGE)
+ return (KM_PUSHPAGE);
+
+ return (KM_SLEEP);
+}
+
+/*
+ * taskq_find_by_name - Find the largest instance number of a named taskq.
+ */
+static int
+taskq_find_by_name(const char *name)
+{
+ struct list_head *tql = NULL;
+ taskq_t *tq;
+
+ list_for_each_prev(tql, &tq_list) {
+ tq = list_entry(tql, taskq_t, tq_taskqs);
+ if (strcmp(name, tq->tq_name) == 0)
+ return (tq->tq_instance);
+ }
+ return (-1);
+}
+
+/*
+ * NOTE: Must be called with tq->tq_lock held, returns a taskq_ent_t which
+ * is not attached to the free, work, or pending taskq lists.
+ */
+static taskq_ent_t *
+task_alloc(taskq_t *tq, uint_t flags, unsigned long *irqflags)
+{
+ taskq_ent_t *t;
+ int count = 0;
+
+ ASSERT(tq);
+retry:
+ /* Acquire taskq_ent_t's from free list if available */
+ if (!list_empty(&tq->tq_free_list) && !(flags & TQ_NEW)) {
+ t = list_entry(tq->tq_free_list.next, taskq_ent_t, tqent_list);
+
+ ASSERT(!(t->tqent_flags & TQENT_FLAG_PREALLOC));
+ ASSERT(!(t->tqent_flags & TQENT_FLAG_CANCEL));
+ ASSERT(!timer_pending(&t->tqent_timer));
+
+ list_del_init(&t->tqent_list);
+ return (t);
+ }
+
+ /* Free list is empty and memory allocations are prohibited */
+ if (flags & TQ_NOALLOC)
+ return (NULL);
+
+ /* Hit maximum taskq_ent_t pool size */
+ if (tq->tq_nalloc >= tq->tq_maxalloc) {
+ if (flags & TQ_NOSLEEP)
+ return (NULL);
+
+ /*
+ * Sleep periodically polling the free list for an available
+ * taskq_ent_t. Dispatching with TQ_SLEEP should always succeed
+		 * but we cannot block forever waiting for a taskq_ent_t to
+ * show up in the free list, otherwise a deadlock can happen.
+ *
+ * Therefore, we need to allocate a new task even if the number
+ * of allocated tasks is above tq->tq_maxalloc, but we still
+ * end up delaying the task allocation by one second, thereby
+ * throttling the task dispatch rate.
+ */
+ spin_unlock_irqrestore(&tq->tq_lock, *irqflags);
+ schedule_timeout(HZ / 100);
+ spin_lock_irqsave_nested(&tq->tq_lock, *irqflags,
+ tq->tq_lock_class);
+ if (count < 100) {
+ count++;
+ goto retry;
+ }
+ }
+
+ spin_unlock_irqrestore(&tq->tq_lock, *irqflags);
+ t = kmem_alloc(sizeof (taskq_ent_t), task_km_flags(flags));
+ spin_lock_irqsave_nested(&tq->tq_lock, *irqflags, tq->tq_lock_class);
+
+ if (t) {
+ taskq_init_ent(t);
+ tq->tq_nalloc++;
+ }
+
+ return (t);
+}
+
+/*
+ * NOTE: Must be called with tq->tq_lock held, expects the taskq_ent_t
+ * to already be removed from the free, work, or pending taskq lists.
+ */
+static void
+task_free(taskq_t *tq, taskq_ent_t *t)
+{
+ ASSERT(tq);
+ ASSERT(t);
+ ASSERT(list_empty(&t->tqent_list));
+ ASSERT(!timer_pending(&t->tqent_timer));
+
+ kmem_free(t, sizeof (taskq_ent_t));
+ tq->tq_nalloc--;
+}
+
+/*
+ * NOTE: Must be called with tq->tq_lock held, either destroys the
+ * taskq_ent_t if too many exist or moves it to the free list for later use.
+ */
+static void
+task_done(taskq_t *tq, taskq_ent_t *t)
+{
+ ASSERT(tq);
+ ASSERT(t);
+
+ /* Wake tasks blocked in taskq_wait_id() */
+ wake_up_all(&t->tqent_waitq);
+
+ list_del_init(&t->tqent_list);
+
+ if (tq->tq_nalloc <= tq->tq_minalloc) {
+ t->tqent_id = TASKQID_INVALID;
+ t->tqent_func = NULL;
+ t->tqent_arg = NULL;
+ t->tqent_flags = 0;
+
+ list_add_tail(&t->tqent_list, &tq->tq_free_list);
+ } else {
+ task_free(tq, t);
+ }
+}
+
+/*
+ * When a delayed task timer expires remove it from the delay list and
+ * add it to the priority list in order for immediate processing.
+ */
+static void
+task_expire_impl(taskq_ent_t *t)
+{
+ taskq_ent_t *w;
+ taskq_t *tq = t->tqent_taskq;
+ struct list_head *l = NULL;
+ unsigned long flags;
+
+ spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class);
+
+ if (t->tqent_flags & TQENT_FLAG_CANCEL) {
+ ASSERT(list_empty(&t->tqent_list));
+ spin_unlock_irqrestore(&tq->tq_lock, flags);
+ return;
+ }
+
+ t->tqent_birth = jiffies;
+ DTRACE_PROBE1(taskq_ent__birth, taskq_ent_t *, t);
+
+ /*
+ * The priority list must be maintained in strict task id order
+ * from lowest to highest for lowest_id to be easily calculable.
+ */
+ list_del(&t->tqent_list);
+ list_for_each_prev(l, &tq->tq_prio_list) {
+ w = list_entry(l, taskq_ent_t, tqent_list);
+ if (w->tqent_id < t->tqent_id) {
+ list_add(&t->tqent_list, l);
+ break;
+ }
+ }
+ if (l == &tq->tq_prio_list)
+ list_add(&t->tqent_list, &tq->tq_prio_list);
+
+ spin_unlock_irqrestore(&tq->tq_lock, flags);
+
+ wake_up(&tq->tq_work_waitq);
+}
+
+static void
+task_expire(spl_timer_list_t tl)
+{
+ struct timer_list *tmr = (struct timer_list *)tl;
+ taskq_ent_t *t = from_timer(t, tmr, tqent_timer);
+ task_expire_impl(t);
+}
+
+/*
+ * Returns the lowest incomplete taskqid_t. The taskqid_t may
+ * be queued on the pending list, on the priority list, on the
+ * delay list, or on the work list currently being handled, but
+ * it is not 100% complete yet.
+ */
+static taskqid_t
+taskq_lowest_id(taskq_t *tq)
+{
+ taskqid_t lowest_id = tq->tq_next_id;
+ taskq_ent_t *t;
+ taskq_thread_t *tqt;
+
+ if (!list_empty(&tq->tq_pend_list)) {
+ t = list_entry(tq->tq_pend_list.next, taskq_ent_t, tqent_list);
+ lowest_id = MIN(lowest_id, t->tqent_id);
+ }
+
+ if (!list_empty(&tq->tq_prio_list)) {
+ t = list_entry(tq->tq_prio_list.next, taskq_ent_t, tqent_list);
+ lowest_id = MIN(lowest_id, t->tqent_id);
+ }
+
+ if (!list_empty(&tq->tq_delay_list)) {
+ t = list_entry(tq->tq_delay_list.next, taskq_ent_t, tqent_list);
+ lowest_id = MIN(lowest_id, t->tqent_id);
+ }
+
+ if (!list_empty(&tq->tq_active_list)) {
+ tqt = list_entry(tq->tq_active_list.next, taskq_thread_t,
+ tqt_active_list);
+ ASSERT(tqt->tqt_id != TASKQID_INVALID);
+ lowest_id = MIN(lowest_id, tqt->tqt_id);
+ }
+
+ return (lowest_id);
+}
+
+/*
+ * Insert a task into a list keeping the list sorted by increasing taskqid.
+ */
+static void
+taskq_insert_in_order(taskq_t *tq, taskq_thread_t *tqt)
+{
+ taskq_thread_t *w;
+ struct list_head *l = NULL;
+
+ ASSERT(tq);
+ ASSERT(tqt);
+
+ list_for_each_prev(l, &tq->tq_active_list) {
+ w = list_entry(l, taskq_thread_t, tqt_active_list);
+ if (w->tqt_id < tqt->tqt_id) {
+ list_add(&tqt->tqt_active_list, l);
+ break;
+ }
+ }
+ if (l == &tq->tq_active_list)
+ list_add(&tqt->tqt_active_list, &tq->tq_active_list);
+}
+
+/*
+ * Find and return a task from the given list if it exists. The list
+ * must be in lowest to highest task id order.
+ */
+static taskq_ent_t *
+taskq_find_list(taskq_t *tq, struct list_head *lh, taskqid_t id)
+{
+ struct list_head *l = NULL;
+ taskq_ent_t *t;
+
+ list_for_each(l, lh) {
+ t = list_entry(l, taskq_ent_t, tqent_list);
+
+ if (t->tqent_id == id)
+ return (t);
+
+ if (t->tqent_id > id)
+ break;
+ }
+
+ return (NULL);
+}
+
+/*
+ * Find an already dispatched task given the task id regardless of what
+ * state it is in. If a task is still pending it will be returned.
+ * If a task is executing, then ERR_PTR(-EBUSY) will be returned instead.
+ * If the task has already been run then NULL is returned.
+ */
+static taskq_ent_t *
+taskq_find(taskq_t *tq, taskqid_t id)
+{
+ taskq_thread_t *tqt;
+ struct list_head *l = NULL;
+ taskq_ent_t *t;
+
+ t = taskq_find_list(tq, &tq->tq_delay_list, id);
+ if (t)
+ return (t);
+
+ t = taskq_find_list(tq, &tq->tq_prio_list, id);
+ if (t)
+ return (t);
+
+ t = taskq_find_list(tq, &tq->tq_pend_list, id);
+ if (t)
+ return (t);
+
+ list_for_each(l, &tq->tq_active_list) {
+ tqt = list_entry(l, taskq_thread_t, tqt_active_list);
+ if (tqt->tqt_id == id) {
+ /*
+ * Instead of returning tqt_task, we just return a non
+ * NULL value to prevent misuse, since tqt_task only
+ * has two valid fields.
+ */
+ return (ERR_PTR(-EBUSY));
+ }
+ }
+
+ return (NULL);
+}
+
+/*
+ * Theory for the taskq_wait_id(), taskq_wait_outstanding(), and
+ * taskq_wait() functions below.
+ *
+ * Taskq waiting is accomplished by tracking the lowest outstanding task
+ * id and the next available task id. As tasks are dispatched they are
+ * added to the tail of the pending, priority, or delay lists. As worker
+ * threads become available the tasks are removed from the heads of these
+ * lists and linked to the worker threads. This ensures the lists are
+ * kept sorted by lowest to highest task id.
+ *
+ * Therefore the lowest outstanding task id can be quickly determined by
+ * checking the head item from all of these lists. This value is stored
+ * with the taskq as the lowest id. It only needs to be recalculated when
+ * either the task with the current lowest id completes or is canceled.
+ *
+ * By blocking until the lowest task id exceeds the passed task id the
+ * taskq_wait_outstanding() function can be easily implemented. Similarly,
+ * by blocking until the lowest task id matches the next task id taskq_wait()
+ * can be implemented.
+ *
+ * Callers should be aware that when there are multiple worker threads it
+ * is possible for larger task ids to complete before smaller ones. Also
+ * when the taskq contains delay tasks with small task ids callers may
+ * block for a considerable length of time waiting for them to expire and
+ * execute.
+ */
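For example, under these semantics a caller can dispatch a batch of tasks and then block until everything dispatched so far has completed. This is a hedged sketch; my_work() and its arguments are placeholders, not functions defined in this file.

static void
my_work(void *arg)
{
	/* ... perform one unit of work ... */
}

static void
my_batch(taskq_t *tq, void *args[], int n)
{
	int i;

	for (i = 0; i < n; i++)
		VERIFY3U(taskq_dispatch(tq, my_work, args[i], TQ_SLEEP), !=,
		    TASKQID_INVALID);

	/* Block until every task dispatched above has finished. */
	taskq_wait_outstanding(tq, 0);
}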
+static int
+taskq_wait_id_check(taskq_t *tq, taskqid_t id)
+{
+ int rc;
+ unsigned long flags;
+
+ spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class);
+ rc = (taskq_find(tq, id) == NULL);
+ spin_unlock_irqrestore(&tq->tq_lock, flags);
+
+ return (rc);
+}
+
+/*
+ * The taskq_wait_id() function blocks until the passed task id completes.
+ * This does not guarantee that all lower task ids have completed.
+ */
+void
+taskq_wait_id(taskq_t *tq, taskqid_t id)
+{
+ wait_event(tq->tq_wait_waitq, taskq_wait_id_check(tq, id));
+}
+EXPORT_SYMBOL(taskq_wait_id);
+
+static int
+taskq_wait_outstanding_check(taskq_t *tq, taskqid_t id)
+{
+ int rc;
+ unsigned long flags;
+
+ spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class);
+ rc = (id < tq->tq_lowest_id);
+ spin_unlock_irqrestore(&tq->tq_lock, flags);
+
+ return (rc);
+}
+
+/*
+ * The taskq_wait_outstanding() function will block until all tasks with a
+ * lower taskqid than the passed 'id' have been completed. Note that all
+ * task id's are assigned monotonically at dispatch time. Zero may be
+ * passed for the id to indicate all tasks dispatch up to this point,
+ * but not after, should be waited for.
+ */
+void
+taskq_wait_outstanding(taskq_t *tq, taskqid_t id)
+{
+ id = id ? id : tq->tq_next_id - 1;
+ wait_event(tq->tq_wait_waitq, taskq_wait_outstanding_check(tq, id));
+}
+EXPORT_SYMBOL(taskq_wait_outstanding);
+
+static int
+taskq_wait_check(taskq_t *tq)
+{
+ int rc;
+ unsigned long flags;
+
+ spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class);
+ rc = (tq->tq_lowest_id == tq->tq_next_id);
+ spin_unlock_irqrestore(&tq->tq_lock, flags);
+
+ return (rc);
+}
+
+/*
+ * The taskq_wait() function will block until the taskq is empty.
+ * This means that if a taskq re-dispatches work to itself taskq_wait()
+ * callers will block indefinitely.
+ */
+void
+taskq_wait(taskq_t *tq)
+{
+ wait_event(tq->tq_wait_waitq, taskq_wait_check(tq));
+}
+EXPORT_SYMBOL(taskq_wait);
+
+int
+taskq_member(taskq_t *tq, kthread_t *t)
+{
+ return (tq == (taskq_t *)tsd_get_by_thread(taskq_tsd, t));
+}
+EXPORT_SYMBOL(taskq_member);
+
+taskq_t *
+taskq_of_curthread(void)
+{
+ return (tsd_get(taskq_tsd));
+}
+EXPORT_SYMBOL(taskq_of_curthread);
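These two helpers are mostly useful for assertions; for instance, a function that must only ever run from a worker thread of a particular taskq could check that as sketched below (the argument convention is hypothetical).

static void
my_task(void *arg)
{
	taskq_t *tq = arg;	/* assume the dispatching taskq is passed in */

	/* Verify we are running in the context of one of tq's workers. */
	ASSERT(taskq_member(tq, curthread));
	ASSERT3P(taskq_of_curthread(), ==, tq);
}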
+
+/*
+ * Cancel an already dispatched task given the task id. Still pending tasks
+ * will be immediately canceled, and if the task is active the function will
+ * block until it completes. Preallocated tasks which are canceled must be
+ * freed by the caller.
+ */
+int
+taskq_cancel_id(taskq_t *tq, taskqid_t id)
+{
+ taskq_ent_t *t;
+ int rc = ENOENT;
+ unsigned long flags;
+
+ ASSERT(tq);
+
+ spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class);
+ t = taskq_find(tq, id);
+ if (t && t != ERR_PTR(-EBUSY)) {
+ list_del_init(&t->tqent_list);
+ t->tqent_flags |= TQENT_FLAG_CANCEL;
+
+ /*
+ * When canceling the lowest outstanding task id we
+ * must recalculate the new lowest outstanding id.
+ */
+ if (tq->tq_lowest_id == t->tqent_id) {
+ tq->tq_lowest_id = taskq_lowest_id(tq);
+ ASSERT3S(tq->tq_lowest_id, >, t->tqent_id);
+ }
+
+ /*
+		 * The task_expire() function takes the tq->tq_lock so drop
+		 * the lock before synchronously cancelling the timer.
+ */
+ if (timer_pending(&t->tqent_timer)) {
+ spin_unlock_irqrestore(&tq->tq_lock, flags);
+ del_timer_sync(&t->tqent_timer);
+ spin_lock_irqsave_nested(&tq->tq_lock, flags,
+ tq->tq_lock_class);
+ }
+
+ if (!(t->tqent_flags & TQENT_FLAG_PREALLOC))
+ task_done(tq, t);
+
+ rc = 0;
+ }
+ spin_unlock_irqrestore(&tq->tq_lock, flags);
+
+ if (t == ERR_PTR(-EBUSY)) {
+ taskq_wait_id(tq, id);
+ rc = EBUSY;
+ }
+
+ return (rc);
+}
+EXPORT_SYMBOL(taskq_cancel_id);
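A caller holding on to the taskqid returned at dispatch time can then cancel it and distinguish the three possible outcomes, roughly as in this sketch (how each case is handled is up to the consumer):

static void
my_cancel(taskq_t *tq, taskqid_t id)
{
	int error = taskq_cancel_id(tq, id);

	if (error == 0) {
		/* Pending task was removed before it ever ran. */
	} else if (error == EBUSY) {
		/* Task was already running; the call waited for it. */
	} else {
		/* ENOENT: task already completed (or was never queued). */
		ASSERT3S(error, ==, ENOENT);
	}
}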
+
+static int taskq_thread_spawn(taskq_t *tq);
+
+taskqid_t
+taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t flags)
+{
+ taskq_ent_t *t;
+ taskqid_t rc = TASKQID_INVALID;
+ unsigned long irqflags;
+
+ ASSERT(tq);
+ ASSERT(func);
+
+ spin_lock_irqsave_nested(&tq->tq_lock, irqflags, tq->tq_lock_class);
+
+ /* Taskq being destroyed and all tasks drained */
+ if (!(tq->tq_flags & TASKQ_ACTIVE))
+ goto out;
+
+	/* Do not queue the task unless there is an idle thread for it */
+ ASSERT(tq->tq_nactive <= tq->tq_nthreads);
+ if ((flags & TQ_NOQUEUE) && (tq->tq_nactive == tq->tq_nthreads)) {
+ /* Dynamic taskq may be able to spawn another thread */
+ if (!(tq->tq_flags & TASKQ_DYNAMIC) ||
+ taskq_thread_spawn(tq) == 0)
+ goto out;
+ }
+
+ if ((t = task_alloc(tq, flags, &irqflags)) == NULL)
+ goto out;
+
+ spin_lock(&t->tqent_lock);
+
+ /* Queue to the front of the list to enforce TQ_NOQUEUE semantics */
+ if (flags & TQ_NOQUEUE)
+ list_add(&t->tqent_list, &tq->tq_prio_list);
+ /* Queue to the priority list instead of the pending list */
+ else if (flags & TQ_FRONT)
+ list_add_tail(&t->tqent_list, &tq->tq_prio_list);
+ else
+ list_add_tail(&t->tqent_list, &tq->tq_pend_list);
+
+ t->tqent_id = rc = tq->tq_next_id;
+ tq->tq_next_id++;
+ t->tqent_func = func;
+ t->tqent_arg = arg;
+ t->tqent_taskq = tq;
+ t->tqent_timer.function = NULL;
+ t->tqent_timer.expires = 0;
+
+ t->tqent_birth = jiffies;
+ DTRACE_PROBE1(taskq_ent__birth, taskq_ent_t *, t);
+
+ ASSERT(!(t->tqent_flags & TQENT_FLAG_PREALLOC));
+
+ spin_unlock(&t->tqent_lock);
+
+ wake_up(&tq->tq_work_waitq);
+out:
+ /* Spawn additional taskq threads if required. */
+ if (!(flags & TQ_NOQUEUE) && tq->tq_nactive == tq->tq_nthreads)
+ (void) taskq_thread_spawn(tq);
+
+ spin_unlock_irqrestore(&tq->tq_lock, irqflags);
+ return (rc);
+}
+EXPORT_SYMBOL(taskq_dispatch);
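Callers dispatching from a context that cannot sleep must be prepared for TASKQID_INVALID, since a TQ_NOSLEEP allocation may fail. A common pattern, sketched here with the placeholder my_work() from the earlier example, is to fall back to running the function synchronously:

static void
my_dispatch_nosleep(taskq_t *tq, void *arg)
{
	if (taskq_dispatch(tq, my_work, arg, TQ_NOSLEEP) == TASKQID_INVALID)
		my_work(arg);	/* fall back to synchronous execution */
}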
+
+taskqid_t
+taskq_dispatch_delay(taskq_t *tq, task_func_t func, void *arg,
+ uint_t flags, clock_t expire_time)
+{
+ taskqid_t rc = TASKQID_INVALID;
+ taskq_ent_t *t;
+ unsigned long irqflags;
+
+ ASSERT(tq);
+ ASSERT(func);
+
+ spin_lock_irqsave_nested(&tq->tq_lock, irqflags, tq->tq_lock_class);
+
+ /* Taskq being destroyed and all tasks drained */
+ if (!(tq->tq_flags & TASKQ_ACTIVE))
+ goto out;
+
+ if ((t = task_alloc(tq, flags, &irqflags)) == NULL)
+ goto out;
+
+ spin_lock(&t->tqent_lock);
+
+ /* Queue to the delay list for subsequent execution */
+ list_add_tail(&t->tqent_list, &tq->tq_delay_list);
+
+ t->tqent_id = rc = tq->tq_next_id;
+ tq->tq_next_id++;
+ t->tqent_func = func;
+ t->tqent_arg = arg;
+ t->tqent_taskq = tq;
+ t->tqent_timer.function = task_expire;
+ t->tqent_timer.expires = (unsigned long)expire_time;
+ add_timer(&t->tqent_timer);
+
+ ASSERT(!(t->tqent_flags & TQENT_FLAG_PREALLOC));
+
+ spin_unlock(&t->tqent_lock);
+out:
+ /* Spawn additional taskq threads if required. */
+ if (tq->tq_nactive == tq->tq_nthreads)
+ (void) taskq_thread_spawn(tq);
+ spin_unlock_irqrestore(&tq->tq_lock, irqflags);
+ return (rc);
+}
+EXPORT_SYMBOL(taskq_dispatch_delay);
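The expire_time argument is an absolute tick count; it is written straight into tqent_timer.expires, so callers normally derive it from the current lbolt value. A hedged sketch, with an arbitrary five second delay and the placeholder my_work() again:

static taskqid_t
my_dispatch_later(taskq_t *tq, void *arg)
{
	/* ddi_get_lbolt() returns the current jiffies-based tick count. */
	clock_t when = ddi_get_lbolt() + 5 * HZ;

	return (taskq_dispatch_delay(tq, my_work, arg, TQ_SLEEP, when));
}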
+
+void
+taskq_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint_t flags,
+ taskq_ent_t *t)
+{
+ unsigned long irqflags;
+ ASSERT(tq);
+ ASSERT(func);
+
+ spin_lock_irqsave_nested(&tq->tq_lock, irqflags,
+ tq->tq_lock_class);
+
+ /* Taskq being destroyed and all tasks drained */
+ if (!(tq->tq_flags & TASKQ_ACTIVE)) {
+ t->tqent_id = TASKQID_INVALID;
+ goto out;
+ }
+
+ if ((flags & TQ_NOQUEUE) && (tq->tq_nactive == tq->tq_nthreads)) {
+ /* Dynamic taskq may be able to spawn another thread */
+ if (!(tq->tq_flags & TASKQ_DYNAMIC) ||
+ taskq_thread_spawn(tq) == 0)
+ goto out2;
+ flags |= TQ_FRONT;
+ }
+
+ spin_lock(&t->tqent_lock);
+
+ /*
+ * Make sure the entry is not on some other taskq; it is important to
+ * ASSERT() under lock
+ */
+ ASSERT(taskq_empty_ent(t));
+
+ /*
+ * Mark it as a prealloc'd task. This is important
+ * to ensure that we don't free it later.
+ */
+ t->tqent_flags |= TQENT_FLAG_PREALLOC;
+
+ /* Queue to the priority list instead of the pending list */
+ if (flags & TQ_FRONT)
+ list_add_tail(&t->tqent_list, &tq->tq_prio_list);
+ else
+ list_add_tail(&t->tqent_list, &tq->tq_pend_list);
+
+ t->tqent_id = tq->tq_next_id;
+ tq->tq_next_id++;
+ t->tqent_func = func;
+ t->tqent_arg = arg;
+ t->tqent_taskq = tq;
+
+ t->tqent_birth = jiffies;
+ DTRACE_PROBE1(taskq_ent__birth, taskq_ent_t *, t);
+
+ spin_unlock(&t->tqent_lock);
+
+ wake_up(&tq->tq_work_waitq);
+out:
+ /* Spawn additional taskq threads if required. */
+ if (tq->tq_nactive == tq->tq_nthreads)
+ (void) taskq_thread_spawn(tq);
+out2:
+ spin_unlock_irqrestore(&tq->tq_lock, irqflags);
+}
+EXPORT_SYMBOL(taskq_dispatch_ent);
+
+int
+taskq_empty_ent(taskq_ent_t *t)
+{
+ return (list_empty(&t->tqent_list));
+}
+EXPORT_SYMBOL(taskq_empty_ent);
+
+void
+taskq_init_ent(taskq_ent_t *t)
+{
+ spin_lock_init(&t->tqent_lock);
+ init_waitqueue_head(&t->tqent_waitq);
+ timer_setup(&t->tqent_timer, NULL, 0);
+ INIT_LIST_HEAD(&t->tqent_list);
+ t->tqent_id = 0;
+ t->tqent_func = NULL;
+ t->tqent_arg = NULL;
+ t->tqent_flags = 0;
+ t->tqent_taskq = NULL;
+}
+EXPORT_SYMBOL(taskq_init_ent);
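Preallocated entries skip the task_alloc() path entirely. The usual pattern is to embed a taskq_ent_t in a longer-lived structure, initialize it once, and hand it to taskq_dispatch_ent() whenever work needs to be queued; the my_obj_t structure below is a hypothetical illustration reusing the placeholder my_work().

typedef struct my_obj {
	taskq_ent_t mo_tqent;	/* embedded, preallocated entry */
	int mo_data;
} my_obj_t;

static void
my_obj_init(my_obj_t *obj)
{
	taskq_init_ent(&obj->mo_tqent);
}

static void
my_obj_queue(taskq_t *tq, my_obj_t *obj)
{
	/* The entry must not already be queued on some taskq. */
	ASSERT(taskq_empty_ent(&obj->mo_tqent));
	taskq_dispatch_ent(tq, my_work, obj, TQ_SLEEP, &obj->mo_tqent);
}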
+
+/*
+ * Return the next pending task, preference is given to tasks on the
+ * priority list which were dispatched with TQ_FRONT.
+ */
+static taskq_ent_t *
+taskq_next_ent(taskq_t *tq)
+{
+ struct list_head *list;
+
+ if (!list_empty(&tq->tq_prio_list))
+ list = &tq->tq_prio_list;
+ else if (!list_empty(&tq->tq_pend_list))
+ list = &tq->tq_pend_list;
+ else
+ return (NULL);
+
+ return (list_entry(list->next, taskq_ent_t, tqent_list));
+}
+
+/*
+ * Spawns a new thread for the specified taskq.
+ */
+static void
+taskq_thread_spawn_task(void *arg)
+{
+ taskq_t *tq = (taskq_t *)arg;
+ unsigned long flags;
+
+ if (taskq_thread_create(tq) == NULL) {
+ /* restore spawning count if failed */
+ spin_lock_irqsave_nested(&tq->tq_lock, flags,
+ tq->tq_lock_class);
+ tq->tq_nspawn--;
+ spin_unlock_irqrestore(&tq->tq_lock, flags);
+ }
+}
+
+/*
+ * Spawn additional threads for dynamic taskqs (TASKQ_DYNAMIC) when the
+ * current number of threads is insufficient to handle the pending tasks. These
+ * new threads must be created by the dedicated dynamic_taskq to avoid
+ * deadlocks between thread creation and memory reclaim. The system_taskq
+ * which is also a dynamic taskq cannot be safely used for this.
+ */
+static int
+taskq_thread_spawn(taskq_t *tq)
+{
+ int spawning = 0;
+
+ if (!(tq->tq_flags & TASKQ_DYNAMIC))
+ return (0);
+
+ if ((tq->tq_nthreads + tq->tq_nspawn < tq->tq_maxthreads) &&
+ (tq->tq_flags & TASKQ_ACTIVE)) {
+ spawning = (++tq->tq_nspawn);
+ taskq_dispatch(dynamic_taskq, taskq_thread_spawn_task,
+ tq, TQ_NOSLEEP);
+ }
+
+ return (spawning);
+}
+
+/*
+ * Threads in a dynamic taskq should only exit once it has been completely
+ * drained and no other threads are actively servicing tasks. This prevents
+ * threads from being created and destroyed more than is required.
+ *
+ * The first thread in the thread list is treated as the primary thread.
+ * There is nothing special about the primary thread, but in order to keep
+ * the taskq pids from changing we opt to make it long running.
+ */
+static int
+taskq_thread_should_stop(taskq_t *tq, taskq_thread_t *tqt)
+{
+ if (!(tq->tq_flags & TASKQ_DYNAMIC))
+ return (0);
+
+ if (list_first_entry(&(tq->tq_thread_list), taskq_thread_t,
+ tqt_thread_list) == tqt)
+ return (0);
+
+ return
+ ((tq->tq_nspawn == 0) && /* No threads are being spawned */
+ (tq->tq_nactive == 0) && /* No threads are handling tasks */
+ (tq->tq_nthreads > 1) && /* More than 1 thread is running */
+ (!taskq_next_ent(tq)) && /* There are no pending tasks */
+ (spl_taskq_thread_dynamic)); /* Dynamic taskqs are allowed */
+}
+
+static int
+taskq_thread(void *args)
+{
+ DECLARE_WAITQUEUE(wait, current);
+ sigset_t blocked;
+ taskq_thread_t *tqt = args;
+ taskq_t *tq;
+ taskq_ent_t *t;
+ int seq_tasks = 0;
+ unsigned long flags;
+ taskq_ent_t dup_task = {};
+
+ ASSERT(tqt);
+ ASSERT(tqt->tqt_tq);
+ tq = tqt->tqt_tq;
+ current->flags |= PF_NOFREEZE;
+
+ (void) spl_fstrans_mark();
+
+ sigfillset(&blocked);
+ sigprocmask(SIG_BLOCK, &blocked, NULL);
+ flush_signals(current);
+
+ tsd_set(taskq_tsd, tq);
+ spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class);
+ /*
+ * If we are dynamically spawned, decrease spawning count. Note that
+ * we could be created during taskq_create, in which case we shouldn't
+ * do the decrement. But it's fine because taskq_create will reset
+ * tq_nspawn later.
+ */
+ if (tq->tq_flags & TASKQ_DYNAMIC)
+ tq->tq_nspawn--;
+
+ /* Immediately exit if more threads than allowed were created. */
+ if (tq->tq_nthreads >= tq->tq_maxthreads)
+ goto error;
+
+ tq->tq_nthreads++;
+ list_add_tail(&tqt->tqt_thread_list, &tq->tq_thread_list);
+ wake_up(&tq->tq_wait_waitq);
+ set_current_state(TASK_INTERRUPTIBLE);
+
+ while (!kthread_should_stop()) {
+
+ if (list_empty(&tq->tq_pend_list) &&
+ list_empty(&tq->tq_prio_list)) {
+
+ if (taskq_thread_should_stop(tq, tqt)) {
+ wake_up_all(&tq->tq_wait_waitq);
+ break;
+ }
+
+ add_wait_queue_exclusive(&tq->tq_work_waitq, &wait);
+ spin_unlock_irqrestore(&tq->tq_lock, flags);
+
+ schedule();
+ seq_tasks = 0;
+
+ spin_lock_irqsave_nested(&tq->tq_lock, flags,
+ tq->tq_lock_class);
+ remove_wait_queue(&tq->tq_work_waitq, &wait);
+ } else {
+ __set_current_state(TASK_RUNNING);
+ }
+
+ if ((t = taskq_next_ent(tq)) != NULL) {
+ list_del_init(&t->tqent_list);
+
+ /*
+ * A TQENT_FLAG_PREALLOC task may be reused or freed
+ * during the task function call. Store tqent_id and
+ * tqent_flags here.
+ *
+ * Also use an on stack taskq_ent_t for tqt_task
+ * assignment in this case; we want to make sure
+ * to duplicate all fields, so the values are
+ * correct when it's accessed via DTRACE_PROBE*.
+ */
+ tqt->tqt_id = t->tqent_id;
+ tqt->tqt_flags = t->tqent_flags;
+
+ if (t->tqent_flags & TQENT_FLAG_PREALLOC) {
+ dup_task = *t;
+ t = &dup_task;
+ }
+ tqt->tqt_task = t;
+
+ taskq_insert_in_order(tq, tqt);
+ tq->tq_nactive++;
+ spin_unlock_irqrestore(&tq->tq_lock, flags);
+
+ DTRACE_PROBE1(taskq_ent__start, taskq_ent_t *, t);
+
+ /* Perform the requested task */
+ t->tqent_func(t->tqent_arg);
+
+ DTRACE_PROBE1(taskq_ent__finish, taskq_ent_t *, t);
+
+ spin_lock_irqsave_nested(&tq->tq_lock, flags,
+ tq->tq_lock_class);
+ tq->tq_nactive--;
+ list_del_init(&tqt->tqt_active_list);
+ tqt->tqt_task = NULL;
+
+ /* For prealloc'd tasks, we don't free anything. */
+ if (!(tqt->tqt_flags & TQENT_FLAG_PREALLOC))
+ task_done(tq, t);
+
+ /*
+ * When the current lowest outstanding taskqid is
+			 * done, recalculate the new lowest outstanding id.
+ */
+ if (tq->tq_lowest_id == tqt->tqt_id) {
+ tq->tq_lowest_id = taskq_lowest_id(tq);
+ ASSERT3S(tq->tq_lowest_id, >, tqt->tqt_id);
+ }
+
+ /* Spawn additional taskq threads if required. */
+ if ((++seq_tasks) > spl_taskq_thread_sequential &&
+ taskq_thread_spawn(tq))
+ seq_tasks = 0;
+
+ tqt->tqt_id = TASKQID_INVALID;
+ tqt->tqt_flags = 0;
+ wake_up_all(&tq->tq_wait_waitq);
+ } else {
+ if (taskq_thread_should_stop(tq, tqt))
+ break;
+ }
+
+ set_current_state(TASK_INTERRUPTIBLE);
+
+ }
+
+ __set_current_state(TASK_RUNNING);
+ tq->tq_nthreads--;
+ list_del_init(&tqt->tqt_thread_list);
+error:
+ kmem_free(tqt, sizeof (taskq_thread_t));
+ spin_unlock_irqrestore(&tq->tq_lock, flags);
+
+ tsd_set(taskq_tsd, NULL);
+ thread_exit();
+
+ return (0);
+}
+
+static taskq_thread_t *
+taskq_thread_create(taskq_t *tq)
+{
+ static int last_used_cpu = 0;
+ taskq_thread_t *tqt;
+
+ tqt = kmem_alloc(sizeof (*tqt), KM_PUSHPAGE);
+ INIT_LIST_HEAD(&tqt->tqt_thread_list);
+ INIT_LIST_HEAD(&tqt->tqt_active_list);
+ tqt->tqt_tq = tq;
+ tqt->tqt_id = TASKQID_INVALID;
+
+ tqt->tqt_thread = spl_kthread_create(taskq_thread, tqt,
+ "%s", tq->tq_name);
+ if (tqt->tqt_thread == NULL) {
+ kmem_free(tqt, sizeof (taskq_thread_t));
+ return (NULL);
+ }
+
+ if (spl_taskq_thread_bind) {
+ last_used_cpu = (last_used_cpu + 1) % num_online_cpus();
+ kthread_bind(tqt->tqt_thread, last_used_cpu);
+ }
+
+ if (spl_taskq_thread_priority)
+ set_user_nice(tqt->tqt_thread, PRIO_TO_NICE(tq->tq_pri));
+
+ wake_up_process(tqt->tqt_thread);
+
+ return (tqt);
+}
+
+taskq_t *
+taskq_create(const char *name, int threads_arg, pri_t pri,
+ int minalloc, int maxalloc, uint_t flags)
+{
+ taskq_t *tq;
+ taskq_thread_t *tqt;
+ int count = 0, rc = 0, i;
+ unsigned long irqflags;
+ int nthreads = threads_arg;
+
+ ASSERT(name != NULL);
+ ASSERT(minalloc >= 0);
+ ASSERT(maxalloc <= INT_MAX);
+ ASSERT(!(flags & (TASKQ_CPR_SAFE))); /* Unsupported */
+
+ /* Scale the number of threads using nthreads as a percentage */
+ if (flags & TASKQ_THREADS_CPU_PCT) {
+ ASSERT(nthreads <= 100);
+ ASSERT(nthreads >= 0);
+ nthreads = MIN(threads_arg, 100);
+ nthreads = MAX(nthreads, 0);
+		nthreads = MAX((num_online_cpus() * nthreads) / 100, 1);
+ }
+
+ tq = kmem_alloc(sizeof (*tq), KM_PUSHPAGE);
+ if (tq == NULL)
+ return (NULL);
+
+ tq->tq_hp_support = B_FALSE;
+#ifdef HAVE_CPU_HOTPLUG
+ if (flags & TASKQ_THREADS_CPU_PCT) {
+ tq->tq_hp_support = B_TRUE;
+ if (cpuhp_state_add_instance_nocalls(spl_taskq_cpuhp_state,
+ &tq->tq_hp_cb_node) != 0) {
+ kmem_free(tq, sizeof (*tq));
+ return (NULL);
+ }
+ }
+#endif
+
+ spin_lock_init(&tq->tq_lock);
+ INIT_LIST_HEAD(&tq->tq_thread_list);
+ INIT_LIST_HEAD(&tq->tq_active_list);
+ tq->tq_name = kmem_strdup(name);
+ tq->tq_nactive = 0;
+ tq->tq_nthreads = 0;
+ tq->tq_nspawn = 0;
+ tq->tq_maxthreads = nthreads;
+ tq->tq_cpu_pct = threads_arg;
+ tq->tq_pri = pri;
+ tq->tq_minalloc = minalloc;
+ tq->tq_maxalloc = maxalloc;
+ tq->tq_nalloc = 0;
+ tq->tq_flags = (flags | TASKQ_ACTIVE);
+ tq->tq_next_id = TASKQID_INITIAL;
+ tq->tq_lowest_id = TASKQID_INITIAL;
+ INIT_LIST_HEAD(&tq->tq_free_list);
+ INIT_LIST_HEAD(&tq->tq_pend_list);
+ INIT_LIST_HEAD(&tq->tq_prio_list);
+ INIT_LIST_HEAD(&tq->tq_delay_list);
+ init_waitqueue_head(&tq->tq_work_waitq);
+ init_waitqueue_head(&tq->tq_wait_waitq);
+ tq->tq_lock_class = TQ_LOCK_GENERAL;
+ INIT_LIST_HEAD(&tq->tq_taskqs);
+
+ if (flags & TASKQ_PREPOPULATE) {
+ spin_lock_irqsave_nested(&tq->tq_lock, irqflags,
+ tq->tq_lock_class);
+
+ for (i = 0; i < minalloc; i++)
+ task_done(tq, task_alloc(tq, TQ_PUSHPAGE | TQ_NEW,
+ &irqflags));
+
+ spin_unlock_irqrestore(&tq->tq_lock, irqflags);
+ }
+
+ if ((flags & TASKQ_DYNAMIC) && spl_taskq_thread_dynamic)
+ nthreads = 1;
+
+ for (i = 0; i < nthreads; i++) {
+ tqt = taskq_thread_create(tq);
+ if (tqt == NULL)
+ rc = 1;
+ else
+ count++;
+ }
+
+ /* Wait for all threads to be started before potential destroy */
+ wait_event(tq->tq_wait_waitq, tq->tq_nthreads == count);
+ /*
+	 * taskq_thread might have touched nspawn, but these threads are not
+	 * dynamically spawned so they should not count towards it. Reset it
+	 * to 0.
+ */
+ tq->tq_nspawn = 0;
+
+ if (rc) {
+ taskq_destroy(tq);
+ tq = NULL;
+ } else {
+ down_write(&tq_list_sem);
+ tq->tq_instance = taskq_find_by_name(name) + 1;
+ list_add_tail(&tq->tq_taskqs, &tq_list);
+ up_write(&tq_list_sem);
+ }
+
+ return (tq);
+}
+EXPORT_SYMBOL(taskq_create);
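As an example of the sizing logic above, a consumer that wants roughly half as many workers as online CPUs, created on demand, might set up and later tear down its queue as sketched here (the queue name and percentage are illustrative assumptions):

static taskq_t *my_tq;

static int
my_module_init(void)
{
	/* 50% of online CPUs; worker threads are spawned on demand. */
	my_tq = taskq_create("my_taskq", 50, defclsyspri, boot_ncpus,
	    INT_MAX, TASKQ_THREADS_CPU_PCT | TASKQ_DYNAMIC);
	if (my_tq == NULL)
		return (ENOMEM);

	return (0);
}

static void
my_module_fini(void)
{
	taskq_wait(my_tq);	/* drain anything still queued */
	taskq_destroy(my_tq);
}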
+
+void
+taskq_destroy(taskq_t *tq)
+{
+ struct task_struct *thread;
+ taskq_thread_t *tqt;
+ taskq_ent_t *t;
+ unsigned long flags;
+
+ ASSERT(tq);
+ spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class);
+ tq->tq_flags &= ~TASKQ_ACTIVE;
+ spin_unlock_irqrestore(&tq->tq_lock, flags);
+
+#ifdef HAVE_CPU_HOTPLUG
+ if (tq->tq_hp_support) {
+ VERIFY0(cpuhp_state_remove_instance_nocalls(
+ spl_taskq_cpuhp_state, &tq->tq_hp_cb_node));
+ }
+#endif
+ /*
+ * When TASKQ_ACTIVE is clear new tasks may not be added nor may
+	 * new worker threads be spawned for a dynamic taskq.
+ */
+ if (dynamic_taskq != NULL)
+ taskq_wait_outstanding(dynamic_taskq, 0);
+
+ taskq_wait(tq);
+
+ /* remove taskq from global list used by the kstats */
+ down_write(&tq_list_sem);
+ list_del(&tq->tq_taskqs);
+ up_write(&tq_list_sem);
+
+ spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class);
+ /* wait for spawning threads to insert themselves to the list */
+ while (tq->tq_nspawn) {
+ spin_unlock_irqrestore(&tq->tq_lock, flags);
+ schedule_timeout_interruptible(1);
+ spin_lock_irqsave_nested(&tq->tq_lock, flags,
+ tq->tq_lock_class);
+ }
+
+ /*
+ * Signal each thread to exit and block until it does. Each thread
+ * is responsible for removing itself from the list and freeing its
+ * taskq_thread_t. This allows for idle threads to opt to remove
+ * themselves from the taskq. They can be recreated as needed.
+ */
+ while (!list_empty(&tq->tq_thread_list)) {
+ tqt = list_entry(tq->tq_thread_list.next,
+ taskq_thread_t, tqt_thread_list);
+ thread = tqt->tqt_thread;
+ spin_unlock_irqrestore(&tq->tq_lock, flags);
+
+ kthread_stop(thread);
+
+ spin_lock_irqsave_nested(&tq->tq_lock, flags,
+ tq->tq_lock_class);
+ }
+
+ while (!list_empty(&tq->tq_free_list)) {
+ t = list_entry(tq->tq_free_list.next, taskq_ent_t, tqent_list);
+
+ ASSERT(!(t->tqent_flags & TQENT_FLAG_PREALLOC));
+
+ list_del_init(&t->tqent_list);
+ task_free(tq, t);
+ }
+
+ ASSERT0(tq->tq_nthreads);
+ ASSERT0(tq->tq_nalloc);
+ ASSERT0(tq->tq_nspawn);
+ ASSERT(list_empty(&tq->tq_thread_list));
+ ASSERT(list_empty(&tq->tq_active_list));
+ ASSERT(list_empty(&tq->tq_free_list));
+ ASSERT(list_empty(&tq->tq_pend_list));
+ ASSERT(list_empty(&tq->tq_prio_list));
+ ASSERT(list_empty(&tq->tq_delay_list));
+
+ spin_unlock_irqrestore(&tq->tq_lock, flags);
+
+ kmem_strfree(tq->tq_name);
+ kmem_free(tq, sizeof (taskq_t));
+}
+EXPORT_SYMBOL(taskq_destroy);
+
+static unsigned int spl_taskq_kick = 0;
+
+/*
+ * 2.6.36 API Change
+ * module_param_cb is introduced to take kernel_param_ops and
+ * module_param_call is marked as obsolete. Also set and get operations
+ * were changed to take a 'const struct kernel_param *'.
+ */
+static int
+#ifdef module_param_cb
+param_set_taskq_kick(const char *val, const struct kernel_param *kp)
+#else
+param_set_taskq_kick(const char *val, struct kernel_param *kp)
+#endif
+{
+ int ret;
+ taskq_t *tq = NULL;
+ taskq_ent_t *t;
+ unsigned long flags;
+
+ ret = param_set_uint(val, kp);
+ if (ret < 0 || !spl_taskq_kick)
+ return (ret);
+ /* reset value */
+ spl_taskq_kick = 0;
+
+ down_read(&tq_list_sem);
+ list_for_each_entry(tq, &tq_list, tq_taskqs) {
+ spin_lock_irqsave_nested(&tq->tq_lock, flags,
+ tq->tq_lock_class);
+ /* Check if the first pending is older than 5 seconds */
+ t = taskq_next_ent(tq);
+ if (t && time_after(jiffies, t->tqent_birth + 5*HZ)) {
+ (void) taskq_thread_spawn(tq);
+ printk(KERN_INFO "spl: Kicked taskq %s/%d\n",
+ tq->tq_name, tq->tq_instance);
+ }
+ spin_unlock_irqrestore(&tq->tq_lock, flags);
+ }
+ up_read(&tq_list_sem);
+ return (ret);
+}
+
+#ifdef module_param_cb
+static const struct kernel_param_ops param_ops_taskq_kick = {
+ .set = param_set_taskq_kick,
+ .get = param_get_uint,
+};
+module_param_cb(spl_taskq_kick, &param_ops_taskq_kick, &spl_taskq_kick, 0644);
+#else
+module_param_call(spl_taskq_kick, param_set_taskq_kick, param_get_uint,
+ &spl_taskq_kick, 0644);
+#endif
+MODULE_PARM_DESC(spl_taskq_kick,
+ "Write nonzero to kick stuck taskqs to spawn more threads");
+
+#ifdef HAVE_CPU_HOTPLUG
+/*
+ * This callback will be called exactly once for each core that comes online,
+ * for each dynamic taskq. We attempt to expand taskqs that have
+ * TASKQ_THREADS_CPU_PCT set. We need to redo the percentage calculation every
+ * time, to correctly determine whether or not to add a thread.
+ */
+static int
+spl_taskq_expand(unsigned int cpu, struct hlist_node *node)
+{
+ taskq_t *tq = list_entry(node, taskq_t, tq_hp_cb_node);
+ unsigned long flags;
+ int err = 0;
+
+ ASSERT(tq);
+ spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class);
+
+ if (!(tq->tq_flags & TASKQ_ACTIVE))
+ goto out;
+
+ ASSERT(tq->tq_flags & TASKQ_THREADS_CPU_PCT);
+ int nthreads = MIN(tq->tq_cpu_pct, 100);
+ nthreads = MAX(((num_online_cpus() + 1) * nthreads) / 100, 1);
+ tq->tq_maxthreads = nthreads;
+
+ if (!((tq->tq_flags & TASKQ_DYNAMIC) && spl_taskq_thread_dynamic) &&
+ tq->tq_maxthreads > tq->tq_nthreads) {
+ ASSERT3U(tq->tq_maxthreads, ==, tq->tq_nthreads + 1);
+ taskq_thread_t *tqt = taskq_thread_create(tq);
+ if (tqt == NULL)
+ err = -1;
+ }
+
+out:
+ spin_unlock_irqrestore(&tq->tq_lock, flags);
+ return (err);
+}
+
+/*
+ * While we don't support offlining CPUs, it is possible that CPUs will fail
+ * to online successfully. We do need to be able to handle this case
+ * gracefully.
+ */
+static int
+spl_taskq_prepare_down(unsigned int cpu, struct hlist_node *node)
+{
+ taskq_t *tq = list_entry(node, taskq_t, tq_hp_cb_node);
+ unsigned long flags;
+
+ ASSERT(tq);
+ spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class);
+
+ if (!(tq->tq_flags & TASKQ_ACTIVE))
+ goto out;
+
+ ASSERT(tq->tq_flags & TASKQ_THREADS_CPU_PCT);
+ int nthreads = MIN(tq->tq_cpu_pct, 100);
+ nthreads = MAX(((num_online_cpus()) * nthreads) / 100, 1);
+ tq->tq_maxthreads = nthreads;
+
+ if (!((tq->tq_flags & TASKQ_DYNAMIC) && spl_taskq_thread_dynamic) &&
+ tq->tq_maxthreads < tq->tq_nthreads) {
+ ASSERT3U(tq->tq_maxthreads, ==, tq->tq_nthreads - 1);
+ taskq_thread_t *tqt = list_entry(tq->tq_thread_list.next,
+ taskq_thread_t, tqt_thread_list);
+ struct task_struct *thread = tqt->tqt_thread;
+ spin_unlock_irqrestore(&tq->tq_lock, flags);
+
+ kthread_stop(thread);
+
+ return (0);
+ }
+
+out:
+ spin_unlock_irqrestore(&tq->tq_lock, flags);
+ return (0);
+}
+#endif
+
+int
+spl_taskq_init(void)
+{
+ init_rwsem(&tq_list_sem);
+ tsd_create(&taskq_tsd, NULL);
+
+#ifdef HAVE_CPU_HOTPLUG
+ spl_taskq_cpuhp_state = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN,
+ "fs/spl_taskq:online", spl_taskq_expand, spl_taskq_prepare_down);
+#endif
+
+ system_taskq = taskq_create("spl_system_taskq", MAX(boot_ncpus, 64),
+ maxclsyspri, boot_ncpus, INT_MAX, TASKQ_PREPOPULATE|TASKQ_DYNAMIC);
+ if (system_taskq == NULL)
+ return (1);
+
+ system_delay_taskq = taskq_create("spl_delay_taskq", MAX(boot_ncpus, 4),
+ maxclsyspri, boot_ncpus, INT_MAX, TASKQ_PREPOPULATE|TASKQ_DYNAMIC);
+ if (system_delay_taskq == NULL) {
+#ifdef HAVE_CPU_HOTPLUG
+ cpuhp_remove_multi_state(spl_taskq_cpuhp_state);
+#endif
+ taskq_destroy(system_taskq);
+ return (1);
+ }
+
+ dynamic_taskq = taskq_create("spl_dynamic_taskq", 1,
+ maxclsyspri, boot_ncpus, INT_MAX, TASKQ_PREPOPULATE);
+ if (dynamic_taskq == NULL) {
+#ifdef HAVE_CPU_HOTPLUG
+ cpuhp_remove_multi_state(spl_taskq_cpuhp_state);
+#endif
+ taskq_destroy(system_taskq);
+ taskq_destroy(system_delay_taskq);
+ return (1);
+ }
+
+ /*
+ * This is used to annotate tq_lock, so
+ * taskq_dispatch -> taskq_thread_spawn -> taskq_dispatch
+ * does not trigger a lockdep warning re: possible recursive locking
+ */
+ dynamic_taskq->tq_lock_class = TQ_LOCK_DYNAMIC;
+
+ return (0);
+}
+
+void
+spl_taskq_fini(void)
+{
+ taskq_destroy(dynamic_taskq);
+ dynamic_taskq = NULL;
+
+ taskq_destroy(system_delay_taskq);
+ system_delay_taskq = NULL;
+
+ taskq_destroy(system_taskq);
+ system_taskq = NULL;
+
+ tsd_destroy(&taskq_tsd);
+
+#ifdef HAVE_CPU_HOTPLUG
+ cpuhp_remove_multi_state(spl_taskq_cpuhp_state);
+ spl_taskq_cpuhp_state = 0;
+#endif
+}
diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-thread.c b/sys/contrib/openzfs/module/os/linux/spl/spl-thread.c
new file mode 100644
index 000000000000..db23fb64a298
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/spl/spl-thread.c
@@ -0,0 +1,160 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Solaris Porting Layer (SPL) Thread Implementation.
+ */
+
+#include <sys/thread.h>
+#include <sys/kmem.h>
+#include <sys/tsd.h>
+
+/*
+ * Thread interfaces
+ */
+typedef struct thread_priv_s {
+ unsigned long tp_magic; /* Magic */
+ int tp_name_size; /* Name size */
+ char *tp_name; /* Name (without _thread suffix) */
+ void (*tp_func)(void *); /* Registered function */
+ void *tp_args; /* Args to be passed to function */
+ size_t tp_len; /* Len to be passed to function */
+ int tp_state; /* State to start thread at */
+ pri_t tp_pri; /* Priority to start threat at */
+} thread_priv_t;
+
+static int
+thread_generic_wrapper(void *arg)
+{
+ thread_priv_t *tp = (thread_priv_t *)arg;
+ void (*func)(void *);
+ void *args;
+
+ ASSERT(tp->tp_magic == TP_MAGIC);
+ func = tp->tp_func;
+ args = tp->tp_args;
+ set_current_state(tp->tp_state);
+ set_user_nice((kthread_t *)current, PRIO_TO_NICE(tp->tp_pri));
+ kmem_free(tp->tp_name, tp->tp_name_size);
+ kmem_free(tp, sizeof (thread_priv_t));
+
+ if (func)
+ func(args);
+
+ return (0);
+}
+
+void
+__thread_exit(void)
+{
+ tsd_exit();
+ complete_and_exit(NULL, 0);
+ /* Unreachable */
+}
+EXPORT_SYMBOL(__thread_exit);
+
+/*
+ * thread_create() may block forever if it cannot create a thread or
+ * allocate memory. This is preferable to returning a NULL which Solaris
+ * style callers likely never check for... since it can't fail.
+ */
+kthread_t *
+__thread_create(caddr_t stk, size_t stksize, thread_func_t func,
+ const char *name, void *args, size_t len, proc_t *pp, int state, pri_t pri)
+{
+ thread_priv_t *tp;
+ struct task_struct *tsk;
+ char *p;
+
+ /* Option pp is simply ignored */
+ /* Variable stack size unsupported */
+ ASSERT(stk == NULL);
+
+ tp = kmem_alloc(sizeof (thread_priv_t), KM_PUSHPAGE);
+ if (tp == NULL)
+ return (NULL);
+
+ tp->tp_magic = TP_MAGIC;
+ tp->tp_name_size = strlen(name) + 1;
+
+ tp->tp_name = kmem_alloc(tp->tp_name_size, KM_PUSHPAGE);
+ if (tp->tp_name == NULL) {
+ kmem_free(tp, sizeof (thread_priv_t));
+ return (NULL);
+ }
+
+ strncpy(tp->tp_name, name, tp->tp_name_size);
+
+ /*
+ * Strip trailing "_thread" from passed name which will be the func
+ * name since the exposed API has no parameter for passing a name.
+ */
+ p = strstr(tp->tp_name, "_thread");
+ if (p)
+ p[0] = '\0';
+
+ tp->tp_func = func;
+ tp->tp_args = args;
+ tp->tp_len = len;
+ tp->tp_state = state;
+ tp->tp_pri = pri;
+
+ tsk = spl_kthread_create(thread_generic_wrapper, (void *)tp,
+ "%s", tp->tp_name);
+ if (IS_ERR(tsk))
+ return (NULL);
+
+ wake_up_process(tsk);
+ return ((kthread_t *)tsk);
+}
+EXPORT_SYMBOL(__thread_create);
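In practice this is reached through the SPL's thread_create() compatibility wrappers; a direct call, using the signature above, would look roughly like the following sketch (my_worker_thread() is a placeholder and the priority is arbitrary):

static void
my_worker_thread(void *arg)
{
	/* ... perform work ... */
	thread_exit();
}

static kthread_t *
my_start_worker(void *arg)
{
	/*
	 * Creates a runnable kernel thread named "my_worker"; the
	 * "_thread" suffix is stripped from the supplied name.
	 */
	return (__thread_create(NULL, 0, my_worker_thread,
	    "my_worker_thread", arg, 0, NULL, TS_RUN, minclsyspri));
}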
+
+/*
+ * spl_kthread_create - Wrapper providing pre-3.13 semantics for
+ * kthread_create() in which it is not killable and less likely
+ * to return -ENOMEM.
+ */
+struct task_struct *
+spl_kthread_create(int (*func)(void *), void *data, const char namefmt[], ...)
+{
+ struct task_struct *tsk;
+ va_list args;
+ char name[TASK_COMM_LEN];
+
+ va_start(args, namefmt);
+ vsnprintf(name, sizeof (name), namefmt, args);
+ va_end(args);
+ do {
+ tsk = kthread_create(func, data, "%s", name);
+ if (IS_ERR(tsk)) {
+ if (signal_pending(current)) {
+ clear_thread_flag(TIF_SIGPENDING);
+ continue;
+ }
+ if (PTR_ERR(tsk) == -ENOMEM)
+ continue;
+ return (NULL);
+ } else {
+ return (tsk);
+ }
+ } while (1);
+}
+EXPORT_SYMBOL(spl_kthread_create);
diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-trace.c b/sys/contrib/openzfs/module/os/linux/spl/spl-trace.c
new file mode 100644
index 000000000000..7912a381294d
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/spl/spl-trace.c
@@ -0,0 +1,33 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Each DTRACE_PROBE must define its trace point in one (and only one)
+ * source file, so this dummy file exists for that purpose.
+ */
+
+#include <sys/taskq.h>
+
+#ifdef _KERNEL
+#define CREATE_TRACE_POINTS
+#include <sys/trace.h>
+#include <sys/trace_taskq.h>
+#endif
diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-tsd.c b/sys/contrib/openzfs/module/os/linux/spl/spl-tsd.c
new file mode 100644
index 000000000000..546db9ab8bd7
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/spl/spl-tsd.c
@@ -0,0 +1,719 @@
+/*
+ * Copyright (C) 2010 Lawrence Livermore National Security, LLC.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ *
+ *
+ * Solaris Porting Layer (SPL) Thread Specific Data Implementation.
+ *
+ * Thread specific data is implemented using a hash table, which avoids
+ * the need to add a member to the task structure and allows maximum
+ * portability between kernels. This implementation has been optimized
+ * to keep the tsd_set() and tsd_get() times as small as possible.
+ *
+ * The majority of the entries in the hash table are for specific tsd
+ * entries. These entries are hashed by the product of their key and
+ * pid because by design the key and pid are guaranteed to be unique.
+ * Their product also has the desirable property that it will be uniformly
+ * distributed over the hash bins provided neither the pid nor key is zero.
+ * Under Linux the zero pid is reserved for the idle task and thus won't be
+ * used, and this implementation is careful never to assign a zero key.
+ * By default the hash table is sized to 512 bins which is expected to
+ * be sufficient for light to moderate usage of thread specific data.
+ *
+ * The hash table contains two additional types of entries. The first
+ * type is called a 'key' entry and it is added to the hash during
+ * tsd_create(). It is used to store the address of the destructor function
+ * and it is used as an anchor point. All tsd entries which use the same
+ * key will be linked to this entry. This is used during tsd_destroy() to
+ * quickly call the destructor function for all tsd associated with the key.
+ * The 'key' entry may be looked up with tsd_hash_search() by passing the
+ * key you wish to lookup and DTOR_PID constant as the pid.
+ *
+ * The second type of entry is called a 'pid' entry and it is added to the
+ * hash the first time a process sets a key. The 'pid' entry is also used
+ * as an anchor and all tsd for the process will be linked to it. This
+ * list is used during tsd_exit() to ensure all registered destructors
+ * are run for the process. The 'pid' entry may be looked up with
+ * tsd_hash_search() by passing the PID_KEY constant as the key, and
+ * the process pid. Note that tsd_exit() is called by thread_exit()
+ * so if you're using the Solaris thread API you should not need to call
+ * tsd_exit() directly.
+ *
+ */
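From a consumer's point of view the interesting entry points are just tsd_create(), tsd_set(), tsd_get(), and tsd_destroy(); a hedged usage sketch follows (the key, value, and destructor below are placeholders, not part of this file):

static uint_t my_tsd_key;

static void
my_tsd_dtor(void *value)
{
	kmem_free(value, sizeof (uint64_t));
}

static void
my_tsd_example(void)
{
	uint64_t *v;

	tsd_create(&my_tsd_key, my_tsd_dtor);

	/* Associate a value with the current thread. */
	v = kmem_alloc(sizeof (*v), KM_SLEEP);
	*v = 42;
	VERIFY0(tsd_set(my_tsd_key, v));

	/* Later, from the same thread. */
	ASSERT3P(tsd_get(my_tsd_key), ==, v);

	/* Run any registered destructors and retire the key. */
	tsd_destroy(&my_tsd_key);
}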
+
+#include <sys/kmem.h>
+#include <sys/thread.h>
+#include <sys/tsd.h>
+#include <linux/hash.h>
+
+typedef struct tsd_hash_bin {
+ spinlock_t hb_lock;
+ struct hlist_head hb_head;
+} tsd_hash_bin_t;
+
+typedef struct tsd_hash_table {
+ spinlock_t ht_lock;
+ uint_t ht_bits;
+ uint_t ht_key;
+ tsd_hash_bin_t *ht_bins;
+} tsd_hash_table_t;
+
+typedef struct tsd_hash_entry {
+ uint_t he_key;
+ pid_t he_pid;
+ dtor_func_t he_dtor;
+ void *he_value;
+ struct hlist_node he_list;
+ struct list_head he_key_list;
+ struct list_head he_pid_list;
+} tsd_hash_entry_t;
+
+static tsd_hash_table_t *tsd_hash_table = NULL;
+
+
+/*
+ * tsd_hash_search - searches hash table for tsd_hash_entry
+ * @table: hash table
+ * @key: search key
+ * @pid: search pid
+ */
+static tsd_hash_entry_t *
+tsd_hash_search(tsd_hash_table_t *table, uint_t key, pid_t pid)
+{
+ struct hlist_node *node = NULL;
+ tsd_hash_entry_t *entry;
+ tsd_hash_bin_t *bin;
+ ulong_t hash;
+
+ hash = hash_long((ulong_t)key * (ulong_t)pid, table->ht_bits);
+ bin = &table->ht_bins[hash];
+ spin_lock(&bin->hb_lock);
+ hlist_for_each(node, &bin->hb_head) {
+ entry = list_entry(node, tsd_hash_entry_t, he_list);
+ if ((entry->he_key == key) && (entry->he_pid == pid)) {
+ spin_unlock(&bin->hb_lock);
+ return (entry);
+ }
+ }
+
+ spin_unlock(&bin->hb_lock);
+ return (NULL);
+}
+
+/*
+ * tsd_hash_dtor - call the destructor and free all entries on the list
+ * @work: list of hash entries
+ *
+ * For a list of entries which have all already been removed from the
+ * hash call their registered destructor then free the associated memory.
+ */
+static void
+tsd_hash_dtor(struct hlist_head *work)
+{
+ tsd_hash_entry_t *entry;
+
+ while (!hlist_empty(work)) {
+ entry = hlist_entry(work->first, tsd_hash_entry_t, he_list);
+ hlist_del(&entry->he_list);
+
+ if (entry->he_dtor && entry->he_pid != DTOR_PID)
+ entry->he_dtor(entry->he_value);
+
+ kmem_free(entry, sizeof (tsd_hash_entry_t));
+ }
+}
+
+/*
+ * tsd_hash_add - adds an entry to hash table
+ * @table: hash table
+ * @key: search key
+ * @pid: search pid
+ *
+ * The caller is responsible for ensuring the unique key/pid do not
+ * already exist in the hash table. This is possible because all entries
+ * are thread specific, thus a concurrent thread will never attempt to
+ * add this key/pid. Because multiple bins must be checked to add
+ * links to the dtor and pid entries the entire table is locked.
+ */
+static int
+tsd_hash_add(tsd_hash_table_t *table, uint_t key, pid_t pid, void *value)
+{
+ tsd_hash_entry_t *entry, *dtor_entry, *pid_entry;
+ tsd_hash_bin_t *bin;
+ ulong_t hash;
+ int rc = 0;
+
+ ASSERT3P(tsd_hash_search(table, key, pid), ==, NULL);
+
+ /* New entry allocate structure, set value, and add to hash */
+ entry = kmem_alloc(sizeof (tsd_hash_entry_t), KM_PUSHPAGE);
+ if (entry == NULL)
+ return (ENOMEM);
+
+ entry->he_key = key;
+ entry->he_pid = pid;
+ entry->he_value = value;
+ INIT_HLIST_NODE(&entry->he_list);
+ INIT_LIST_HEAD(&entry->he_key_list);
+ INIT_LIST_HEAD(&entry->he_pid_list);
+
+ spin_lock(&table->ht_lock);
+
+ /* Destructor entry must exist for all valid keys */
+ dtor_entry = tsd_hash_search(table, entry->he_key, DTOR_PID);
+ ASSERT3P(dtor_entry, !=, NULL);
+ entry->he_dtor = dtor_entry->he_dtor;
+
+ /* Process entry must exist for all valid processes */
+ pid_entry = tsd_hash_search(table, PID_KEY, entry->he_pid);
+ ASSERT3P(pid_entry, !=, NULL);
+
+ hash = hash_long((ulong_t)key * (ulong_t)pid, table->ht_bits);
+ bin = &table->ht_bins[hash];
+ spin_lock(&bin->hb_lock);
+
+ /* Add to the hash, key, and pid lists */
+ hlist_add_head(&entry->he_list, &bin->hb_head);
+ list_add(&entry->he_key_list, &dtor_entry->he_key_list);
+ list_add(&entry->he_pid_list, &pid_entry->he_pid_list);
+
+ spin_unlock(&bin->hb_lock);
+ spin_unlock(&table->ht_lock);
+
+ return (rc);
+}
+
+/*
+ * tsd_hash_add_key - adds a destructor entry to the hash table
+ * @table: hash table
+ * @keyp: search key
+ * @dtor: key destructor
+ *
+ * For every unique key there is a single entry in the hash which is used
+ * as an anchor. All other thread specific entries for this key are linked
+ * to this anchor via the 'he_key_list' list head. On return *keyp
+ * will be set to the next available key for the hash table.
+ */
+static int
+tsd_hash_add_key(tsd_hash_table_t *table, uint_t *keyp, dtor_func_t dtor)
+{
+ tsd_hash_entry_t *tmp_entry, *entry;
+ tsd_hash_bin_t *bin;
+ ulong_t hash;
+ int keys_checked = 0;
+
+ ASSERT3P(table, !=, NULL);
+
+ /* Allocate entry to be used as a destructor for this key */
+ entry = kmem_alloc(sizeof (tsd_hash_entry_t), KM_PUSHPAGE);
+ if (entry == NULL)
+ return (ENOMEM);
+
+ /* Determine next available key value */
+ spin_lock(&table->ht_lock);
+ do {
+ /* Limited to TSD_KEYS_MAX concurrent unique keys */
+ if (table->ht_key++ > TSD_KEYS_MAX)
+ table->ht_key = 1;
+
+ /* Ensure failure when all TSD_KEYS_MAX keys are in use */
+ if (keys_checked++ >= TSD_KEYS_MAX) {
+ spin_unlock(&table->ht_lock);
+ return (ENOENT);
+ }
+
+ tmp_entry = tsd_hash_search(table, table->ht_key, DTOR_PID);
+ } while (tmp_entry);
+
+ /* Add destructor entry in to hash table */
+ entry->he_key = *keyp = table->ht_key;
+ entry->he_pid = DTOR_PID;
+ entry->he_dtor = dtor;
+ entry->he_value = NULL;
+ INIT_HLIST_NODE(&entry->he_list);
+ INIT_LIST_HEAD(&entry->he_key_list);
+ INIT_LIST_HEAD(&entry->he_pid_list);
+
+ hash = hash_long((ulong_t)*keyp * (ulong_t)DTOR_PID, table->ht_bits);
+ bin = &table->ht_bins[hash];
+ spin_lock(&bin->hb_lock);
+
+ hlist_add_head(&entry->he_list, &bin->hb_head);
+
+ spin_unlock(&bin->hb_lock);
+ spin_unlock(&table->ht_lock);
+
+ return (0);
+}
+
+/*
+ * tsd_hash_add_pid - adds a process entry to the hash table
+ * @table: hash table
+ * @pid: search pid
+ *
+ * For every process there is a single entry in the hash which is used
+ * as an anchor. All other thread specific entries for this process are
+ * linked to this anchor via the 'he_pid_list' list head.
+ */
+static int
+tsd_hash_add_pid(tsd_hash_table_t *table, pid_t pid)
+{
+ tsd_hash_entry_t *entry;
+ tsd_hash_bin_t *bin;
+ ulong_t hash;
+
+ /* Allocate entry to be used as the process reference */
+ entry = kmem_alloc(sizeof (tsd_hash_entry_t), KM_PUSHPAGE);
+ if (entry == NULL)
+ return (ENOMEM);
+
+ spin_lock(&table->ht_lock);
+ entry->he_key = PID_KEY;
+ entry->he_pid = pid;
+ entry->he_dtor = NULL;
+ entry->he_value = NULL;
+ INIT_HLIST_NODE(&entry->he_list);
+ INIT_LIST_HEAD(&entry->he_key_list);
+ INIT_LIST_HEAD(&entry->he_pid_list);
+
+ hash = hash_long((ulong_t)PID_KEY * (ulong_t)pid, table->ht_bits);
+ bin = &table->ht_bins[hash];
+ spin_lock(&bin->hb_lock);
+
+ hlist_add_head(&entry->he_list, &bin->hb_head);
+
+ spin_unlock(&bin->hb_lock);
+ spin_unlock(&table->ht_lock);
+
+ return (0);
+}
+
+/*
+ * tsd_hash_del - delete an entry from hash table, key, and pid lists
+ * @table: hash table
+ * @key: search key
+ * @pid: search pid
+ */
+static void
+tsd_hash_del(tsd_hash_table_t *table, tsd_hash_entry_t *entry)
+{
+ hlist_del(&entry->he_list);
+ list_del_init(&entry->he_key_list);
+ list_del_init(&entry->he_pid_list);
+}
+
+/*
+ * tsd_hash_table_init - allocate a hash table
+ * @bits: hash table size
+ *
+ * A hash table with 2^bits bins will be created. It may not be resized
+ * after the fact and must be freed with tsd_hash_table_fini().
+ */
+static tsd_hash_table_t *
+tsd_hash_table_init(uint_t bits)
+{
+ tsd_hash_table_t *table;
+ int hash, size = (1 << bits);
+
+ table = kmem_zalloc(sizeof (tsd_hash_table_t), KM_SLEEP);
+ if (table == NULL)
+ return (NULL);
+
+ table->ht_bins = kmem_zalloc(sizeof (tsd_hash_bin_t) * size, KM_SLEEP);
+ if (table->ht_bins == NULL) {
+ kmem_free(table, sizeof (tsd_hash_table_t));
+ return (NULL);
+ }
+
+ for (hash = 0; hash < size; hash++) {
+ spin_lock_init(&table->ht_bins[hash].hb_lock);
+ INIT_HLIST_HEAD(&table->ht_bins[hash].hb_head);
+ }
+
+ spin_lock_init(&table->ht_lock);
+ table->ht_bits = bits;
+ table->ht_key = 1;
+
+ return (table);
+}
+
+/*
+ * tsd_hash_table_fini - free a hash table
+ * @table: hash table
+ *
+ * Free a hash table allocated by tsd_hash_table_init(). If the hash
+ * table is not empty this function will call the proper destructor for
+ * all remaining entries before freeing the memory used by those entries.
+ */
+static void
+tsd_hash_table_fini(tsd_hash_table_t *table)
+{
+ HLIST_HEAD(work);
+ tsd_hash_bin_t *bin;
+ tsd_hash_entry_t *entry;
+ int size, i;
+
+ ASSERT3P(table, !=, NULL);
+ spin_lock(&table->ht_lock);
+ for (i = 0, size = (1 << table->ht_bits); i < size; i++) {
+ bin = &table->ht_bins[i];
+ spin_lock(&bin->hb_lock);
+ while (!hlist_empty(&bin->hb_head)) {
+ entry = hlist_entry(bin->hb_head.first,
+ tsd_hash_entry_t, he_list);
+ tsd_hash_del(table, entry);
+ hlist_add_head(&entry->he_list, &work);
+ }
+ spin_unlock(&bin->hb_lock);
+ }
+ spin_unlock(&table->ht_lock);
+
+ tsd_hash_dtor(&work);
+ kmem_free(table->ht_bins, sizeof (tsd_hash_bin_t)*(1<<table->ht_bits));
+ kmem_free(table, sizeof (tsd_hash_table_t));
+}
+
+/*
+ * tsd_remove_entry - remove a tsd entry for this thread
+ * @entry: entry to remove
+ *
+ * Remove the thread specific data @entry for this thread.
+ * If this is the last entry for this thread, also remove the PID entry.
+ */
+static void
+tsd_remove_entry(tsd_hash_entry_t *entry)
+{
+ HLIST_HEAD(work);
+ tsd_hash_table_t *table;
+ tsd_hash_entry_t *pid_entry;
+ tsd_hash_bin_t *pid_entry_bin, *entry_bin;
+ ulong_t hash;
+
+ table = tsd_hash_table;
+ ASSERT3P(table, !=, NULL);
+ ASSERT3P(entry, !=, NULL);
+
+ spin_lock(&table->ht_lock);
+
+ hash = hash_long((ulong_t)entry->he_key *
+ (ulong_t)entry->he_pid, table->ht_bits);
+ entry_bin = &table->ht_bins[hash];
+
+ /* save the possible pid_entry */
+ pid_entry = list_entry(entry->he_pid_list.next, tsd_hash_entry_t,
+ he_pid_list);
+
+ /* remove entry */
+ spin_lock(&entry_bin->hb_lock);
+ tsd_hash_del(table, entry);
+ hlist_add_head(&entry->he_list, &work);
+ spin_unlock(&entry_bin->hb_lock);
+
+	/* if the saved entry really is the pid anchor, remove it when empty */
+ if (pid_entry->he_key == PID_KEY &&
+ list_empty(&pid_entry->he_pid_list)) {
+ hash = hash_long((ulong_t)pid_entry->he_key *
+ (ulong_t)pid_entry->he_pid, table->ht_bits);
+ pid_entry_bin = &table->ht_bins[hash];
+
+ spin_lock(&pid_entry_bin->hb_lock);
+ tsd_hash_del(table, pid_entry);
+ hlist_add_head(&pid_entry->he_list, &work);
+ spin_unlock(&pid_entry_bin->hb_lock);
+ }
+
+ spin_unlock(&table->ht_lock);
+
+ tsd_hash_dtor(&work);
+}
+
+/*
+ * tsd_set - set thread specific data
+ * @key: lookup key
+ * @value: value to set
+ *
+ * Caller must prevent racing tsd_create() or tsd_destroy(); it is protected
+ * from racing tsd_get() or tsd_set() because the data is thread specific.
+ * This function has been optimized to be fast for the update case.
+ * When setting the tsd initially it will be slower due to additional
+ * required locking and potential memory allocations.
+ */
+int
+tsd_set(uint_t key, void *value)
+{
+ tsd_hash_table_t *table;
+ tsd_hash_entry_t *entry;
+ pid_t pid;
+ int rc;
+ /* mark remove if value is NULL */
+ boolean_t remove = (value == NULL);
+
+ table = tsd_hash_table;
+ pid = curthread->pid;
+ ASSERT3P(table, !=, NULL);
+
+ if ((key == 0) || (key > TSD_KEYS_MAX))
+ return (EINVAL);
+
+	/* Entry already exists in the hash table; update its value */
+ entry = tsd_hash_search(table, key, pid);
+ if (entry) {
+ entry->he_value = value;
+ /* remove the entry */
+ if (remove)
+ tsd_remove_entry(entry);
+ return (0);
+ }
+
+ /* don't create entry if value is NULL */
+ if (remove)
+ return (0);
+
+	/* Add a process entry to the hash if one does not yet exist */
+ entry = tsd_hash_search(table, PID_KEY, pid);
+ if (entry == NULL) {
+ rc = tsd_hash_add_pid(table, pid);
+ if (rc)
+ return (rc);
+ }
+
+ rc = tsd_hash_add(table, key, pid, value);
+ return (rc);
+}
+EXPORT_SYMBOL(tsd_set);
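+
+/*
+ * Illustrative sketch of the interface above using a hypothetical consumer:
+ * the my_state_t type, my_tsd_key, and the my_*() functions are examples
+ * only. The key is registered once, each thread lazily attaches its own
+ * state, and the destructor is run by tsd_destroy() or tsd_exit().
+ */
+typedef struct my_state { int ms_busy; } my_state_t;
+
+static uint_t my_tsd_key = 0;
+
+static void
+my_tsd_dtor(void *arg)
+{
+	kmem_free(arg, sizeof (my_state_t));
+}
+
+static inline void
+my_tsd_init(void)
+{
+	/* Register the key once; the dtor runs from tsd_destroy()/tsd_exit() */
+	tsd_create(&my_tsd_key, my_tsd_dtor);
+}
+
+static inline int
+my_thread_work(void)
+{
+	my_state_t *state = tsd_get(my_tsd_key);
+	int rc;
+
+	if (state == NULL) {
+		/* First use on this thread: allocate and attach state */
+		state = kmem_zalloc(sizeof (my_state_t), KM_SLEEP);
+		rc = tsd_set(my_tsd_key, state);
+		if (rc != 0) {
+			kmem_free(state, sizeof (my_state_t));
+			return (rc);
+		}
+	}
+
+	ASSERT3P(tsd_get(my_tsd_key), ==, state);
+	return (0);
+}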
+
+/*
+ * tsd_get - get thread specific data
+ * @key: lookup key
+ *
+ * Caller must prevent racing tsd_create() or tsd_destroy(). This
+ * implementation is designed to be fast and scalable; it does not
+ * lock the entire table, only a single hash bin.
+ */
+void *
+tsd_get(uint_t key)
+{
+ tsd_hash_entry_t *entry;
+
+ ASSERT3P(tsd_hash_table, !=, NULL);
+
+ if ((key == 0) || (key > TSD_KEYS_MAX))
+ return (NULL);
+
+ entry = tsd_hash_search(tsd_hash_table, key, curthread->pid);
+ if (entry == NULL)
+ return (NULL);
+
+ return (entry->he_value);
+}
+EXPORT_SYMBOL(tsd_get);
+
+/*
+ * tsd_get_by_thread - get thread specific data for specified thread
+ * @key: lookup key
+ * @thread: thread to lookup
+ *
+ * Caller must prevent racing tsd_create() or tsd_destroy(). This
+ * implementation is designed to be fast and scalable; it does not
+ * lock the entire table, only a single hash bin.
+ */
+void *
+tsd_get_by_thread(uint_t key, kthread_t *thread)
+{
+ tsd_hash_entry_t *entry;
+
+ ASSERT3P(tsd_hash_table, !=, NULL);
+
+ if ((key == 0) || (key > TSD_KEYS_MAX))
+ return (NULL);
+
+ entry = tsd_hash_search(tsd_hash_table, key, thread->pid);
+ if (entry == NULL)
+ return (NULL);
+
+ return (entry->he_value);
+}
+EXPORT_SYMBOL(tsd_get_by_thread);
+
+/*
+ * tsd_create - create thread specific data key
+ * @keyp: lookup key address
+ * @dtor: destructor called during tsd_destroy() or tsd_exit()
+ *
+ * The provided key must be set to 0 or it is assumed to be already in use.
+ * The dtor is allowed to be NULL in which case no additional cleanup
+ * for the data is performed during tsd_destroy() or tsd_exit().
+ *
+ * Caller must prevent racing tsd_set() or tsd_get(); this function is
+ * safe from racing tsd_create(), tsd_destroy(), and tsd_exit().
+ */
+void
+tsd_create(uint_t *keyp, dtor_func_t dtor)
+{
+ ASSERT3P(keyp, !=, NULL);
+ if (*keyp)
+ return;
+
+ (void) tsd_hash_add_key(tsd_hash_table, keyp, dtor);
+}
+EXPORT_SYMBOL(tsd_create);
+
+/*
+ * tsd_destroy - destroy thread specific data
+ * @keyp: lookup key address
+ *
+ * Destroys the thread specific data on all threads which use this key.
+ *
+ * Caller must prevent racing tsd_set() or tsd_get(); this function is
+ * safe from racing tsd_create(), tsd_destroy(), and tsd_exit().
+ */
+void
+tsd_destroy(uint_t *keyp)
+{
+ HLIST_HEAD(work);
+ tsd_hash_table_t *table;
+ tsd_hash_entry_t *dtor_entry, *entry;
+ tsd_hash_bin_t *dtor_entry_bin, *entry_bin;
+ ulong_t hash;
+
+ table = tsd_hash_table;
+ ASSERT3P(table, !=, NULL);
+
+ spin_lock(&table->ht_lock);
+ dtor_entry = tsd_hash_search(table, *keyp, DTOR_PID);
+ if (dtor_entry == NULL) {
+ spin_unlock(&table->ht_lock);
+ return;
+ }
+
+ /*
+ * All threads which use this key must be linked off of the
+ * DTOR_PID entry. They are removed from the hash table and
+ * linked in to a private working list to be destroyed.
+ */
+ while (!list_empty(&dtor_entry->he_key_list)) {
+ entry = list_entry(dtor_entry->he_key_list.next,
+ tsd_hash_entry_t, he_key_list);
+ ASSERT3U(dtor_entry->he_key, ==, entry->he_key);
+ ASSERT3P(dtor_entry->he_dtor, ==, entry->he_dtor);
+
+ hash = hash_long((ulong_t)entry->he_key *
+ (ulong_t)entry->he_pid, table->ht_bits);
+ entry_bin = &table->ht_bins[hash];
+
+ spin_lock(&entry_bin->hb_lock);
+ tsd_hash_del(table, entry);
+ hlist_add_head(&entry->he_list, &work);
+ spin_unlock(&entry_bin->hb_lock);
+ }
+
+ hash = hash_long((ulong_t)dtor_entry->he_key *
+ (ulong_t)dtor_entry->he_pid, table->ht_bits);
+ dtor_entry_bin = &table->ht_bins[hash];
+
+ spin_lock(&dtor_entry_bin->hb_lock);
+ tsd_hash_del(table, dtor_entry);
+ hlist_add_head(&dtor_entry->he_list, &work);
+ spin_unlock(&dtor_entry_bin->hb_lock);
+ spin_unlock(&table->ht_lock);
+
+ tsd_hash_dtor(&work);
+ *keyp = 0;
+}
+EXPORT_SYMBOL(tsd_destroy);
+
+/*
+ * tsd_exit - destroys all thread specific data for this thread
+ *
+ * Destroys all the thread specific data for this thread.
+ *
+ * Caller must prevent racing tsd_set() or tsd_get(); this function is
+ * safe from racing tsd_create(), tsd_destroy(), and tsd_exit().
+ */
+void
+tsd_exit(void)
+{
+ HLIST_HEAD(work);
+ tsd_hash_table_t *table;
+ tsd_hash_entry_t *pid_entry, *entry;
+ tsd_hash_bin_t *pid_entry_bin, *entry_bin;
+ ulong_t hash;
+
+ table = tsd_hash_table;
+ ASSERT3P(table, !=, NULL);
+
+ spin_lock(&table->ht_lock);
+ pid_entry = tsd_hash_search(table, PID_KEY, curthread->pid);
+ if (pid_entry == NULL) {
+ spin_unlock(&table->ht_lock);
+ return;
+ }
+
+ /*
+ * All keys associated with this pid must be linked off of the
+ * PID_KEY entry. They are removed from the hash table and
+ * linked in to a private working list to be destroyed.
+ */
+
+ while (!list_empty(&pid_entry->he_pid_list)) {
+ entry = list_entry(pid_entry->he_pid_list.next,
+ tsd_hash_entry_t, he_pid_list);
+ ASSERT3U(pid_entry->he_pid, ==, entry->he_pid);
+
+ hash = hash_long((ulong_t)entry->he_key *
+ (ulong_t)entry->he_pid, table->ht_bits);
+ entry_bin = &table->ht_bins[hash];
+
+ spin_lock(&entry_bin->hb_lock);
+ tsd_hash_del(table, entry);
+ hlist_add_head(&entry->he_list, &work);
+ spin_unlock(&entry_bin->hb_lock);
+ }
+
+ hash = hash_long((ulong_t)pid_entry->he_key *
+ (ulong_t)pid_entry->he_pid, table->ht_bits);
+ pid_entry_bin = &table->ht_bins[hash];
+
+ spin_lock(&pid_entry_bin->hb_lock);
+ tsd_hash_del(table, pid_entry);
+ hlist_add_head(&pid_entry->he_list, &work);
+ spin_unlock(&pid_entry_bin->hb_lock);
+ spin_unlock(&table->ht_lock);
+
+ tsd_hash_dtor(&work);
+}
+EXPORT_SYMBOL(tsd_exit);
+
+int
+spl_tsd_init(void)
+{
+ tsd_hash_table = tsd_hash_table_init(TSD_HASH_TABLE_BITS_DEFAULT);
+ if (tsd_hash_table == NULL)
+ return (1);
+
+ return (0);
+}
+
+void
+spl_tsd_fini(void)
+{
+ tsd_hash_table_fini(tsd_hash_table);
+ tsd_hash_table = NULL;
+}
diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-vmem.c b/sys/contrib/openzfs/module/os/linux/spl/spl-vmem.c
new file mode 100644
index 000000000000..cab3e9549cfe
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/spl/spl-vmem.c
@@ -0,0 +1,90 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/percpu_compat.h>
+#include <sys/debug.h>
+#include <sys/vmem.h>
+#include <sys/kmem_cache.h>
+#include <sys/shrinker.h>
+#include <linux/module.h>
+
+/*
+ * Public vmem_alloc(), vmem_zalloc() and vmem_free() interfaces.
+ */
+void *
+spl_vmem_alloc(size_t size, int flags, const char *func, int line)
+{
+ ASSERT0(flags & ~KM_PUBLIC_MASK);
+
+ flags |= KM_VMEM;
+
+#if !defined(DEBUG_KMEM)
+ return (spl_kmem_alloc_impl(size, flags, NUMA_NO_NODE));
+#elif !defined(DEBUG_KMEM_TRACKING)
+ return (spl_kmem_alloc_debug(size, flags, NUMA_NO_NODE));
+#else
+ return (spl_kmem_alloc_track(size, flags, func, line, NUMA_NO_NODE));
+#endif
+}
+EXPORT_SYMBOL(spl_vmem_alloc);
+
+void *
+spl_vmem_zalloc(size_t size, int flags, const char *func, int line)
+{
+ ASSERT0(flags & ~KM_PUBLIC_MASK);
+
+ flags |= (KM_VMEM | KM_ZERO);
+
+#if !defined(DEBUG_KMEM)
+ return (spl_kmem_alloc_impl(size, flags, NUMA_NO_NODE));
+#elif !defined(DEBUG_KMEM_TRACKING)
+ return (spl_kmem_alloc_debug(size, flags, NUMA_NO_NODE));
+#else
+ return (spl_kmem_alloc_track(size, flags, func, line, NUMA_NO_NODE));
+#endif
+}
+EXPORT_SYMBOL(spl_vmem_zalloc);
+
+void
+spl_vmem_free(const void *buf, size_t size)
+{
+#if !defined(DEBUG_KMEM)
+ return (spl_kmem_free_impl(buf, size));
+#elif !defined(DEBUG_KMEM_TRACKING)
+ return (spl_kmem_free_debug(buf, size));
+#else
+ return (spl_kmem_free_track(buf, size));
+#endif
+}
+EXPORT_SYMBOL(spl_vmem_free);
+
+int
+spl_vmem_init(void)
+{
+ return (0);
+}
+
+void
+spl_vmem_fini(void)
+{
+}
diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-xdr.c b/sys/contrib/openzfs/module/os/linux/spl/spl-xdr.c
new file mode 100644
index 000000000000..5e763c25606f
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/spl/spl-xdr.c
@@ -0,0 +1,512 @@
+/*
+ * Copyright (c) 2008-2010 Sun Microsystems, Inc.
+ * Written by Ricardo Correia <Ricardo.M.Correia@Sun.COM>
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Solaris Porting Layer (SPL) XDR Implementation.
+ */
+
+#include <linux/string.h>
+#include <sys/kmem.h>
+#include <sys/debug.h>
+#include <sys/types.h>
+#include <sys/sysmacros.h>
+#include <rpc/xdr.h>
+
+/*
+ * SPL's XDR mem implementation.
+ *
+ * This is used by libnvpair to serialize/deserialize the name-value pair data
+ * structures into byte arrays in a well-defined and portable manner.
+ *
+ * These data structures are used by the DMU/ZFS to flexibly manipulate various
+ * information in memory and later serialize it/deserialize it to disk.
+ * Examples of usages include the pool configuration, lists of pool and dataset
+ * properties, etc.
+ *
+ * Reference documentation for the XDR representation and XDR operations can be
+ * found in RFC 1832 and xdr(3), respectively.
+ *
+ * === Implementation shortcomings ===
+ *
+ * It is assumed that the following C types have the following sizes:
+ *
+ * char/unsigned char: 1 byte
+ * short/unsigned short: 2 bytes
+ * int/unsigned int: 4 bytes
+ * longlong_t/u_longlong_t: 8 bytes
+ *
+ * The C standard allows these types to be larger (and in the case of ints,
+ * shorter), so if that is the case on some compiler/architecture, the build
+ * will fail (on purpose).
+ *
+ * If someone wants to fix the code to work properly on such environments, then:
+ *
+ * 1) Preconditions should be added to xdrmem_enc functions to make sure the
+ * caller doesn't pass arguments which exceed the expected range.
+ * 2) Functions which take signed integers should be changed to properly do
+ * sign extension.
+ * 3) For ints with less than 32 bits, well.. I suspect you'll have bigger
+ * problems than this implementation.
+ *
+ * It is also assumed that:
+ *
+ * 1) Chars have 8 bits.
+ * 2) We can always do 32-bit-aligned int memory accesses and byte-aligned
+ * memcpy, memset and memcmp.
+ * 3) Arrays passed to xdr_array() are packed and the compiler/architecture
+ * supports element-sized-aligned memory accesses.
+ * 4) Negative integers are natively stored in two's complement binary
+ * representation.
+ *
+ * No checks are done for the 4 assumptions above, though.
+ *
+ * === Caller expectations ===
+ *
+ * Existing documentation does not describe the semantics of XDR operations very
+ * well. Therefore, some assumptions about failure semantics will be made and
+ * will be described below:
+ *
+ * 1) If any encoding operation fails (e.g., due to lack of buffer space), the
+ * stream should be considered valid only up to the encoding operation
+ * previous to the one that first failed. However, the stream size as returned
+ * by xdr_control() cannot be considered to be strictly correct (it may be
+ * bigger).
+ *
+ * Putting it another way, if there is an encoding failure it's undefined
+ * whether anything is added to the stream in that operation and therefore
+ * neither xdr_control() nor future encoding operations on the same stream can
+ * be relied upon to produce correct results.
+ *
+ * 2) If a decoding operation fails, it's undefined whether anything will be
+ * decoded into passed buffers/pointers during that operation, or what the
+ * values on those buffers will look like.
+ *
+ * Future decoding operations on the same stream will also have similar
+ * undefined behavior.
+ *
+ * 3) When the first decoding operation fails it is OK to trust the results of
+ * previous decoding operations on the same stream, as long as the caller
+ * expects a failure to be possible (e.g. due to end-of-stream).
+ *
+ * However, this is highly discouraged because the caller should know the
+ * stream size and should be coded to expect any decoding failure to be data
+ * corruption due to hardware, accidental or even malicious causes, which should
+ * be handled gracefully in all cases.
+ *
+ * In very rare situations where there are strong reasons to believe the data
+ * can be trusted to be valid and non-tampered with, then the caller may assume
+ * a decoding failure to be a bug (e.g. due to mismatched data types) and may
+ * fail non-gracefully.
+ *
+ * 4) Non-zero padding bytes will cause the decoding operation to fail.
+ *
+ * 5) Zero bytes on string types will also cause the decoding operation to fail.
+ *
+ * 6) It is assumed that either the pointer to the stream buffer given by the
+ * caller is 32-bit aligned or the architecture supports non-32-bit-aligned int
+ * memory accesses.
+ *
+ * 7) The stream buffer and encoding/decoding buffers/ptrs should not overlap.
+ *
+ * 8) If a caller passes pointers to non-kernel memory (e.g., pointers to user
+ * space or MMIO space), the computer may explode.
+ */
+
+static struct xdr_ops xdrmem_encode_ops;
+static struct xdr_ops xdrmem_decode_ops;
+
+void
+xdrmem_create(XDR *xdrs, const caddr_t addr, const uint_t size,
+ const enum xdr_op op)
+{
+ switch (op) {
+ case XDR_ENCODE:
+ xdrs->x_ops = &xdrmem_encode_ops;
+ break;
+ case XDR_DECODE:
+ xdrs->x_ops = &xdrmem_decode_ops;
+ break;
+ default:
+ xdrs->x_ops = NULL; /* Let the caller know we failed */
+ return;
+ }
+
+ xdrs->x_op = op;
+ xdrs->x_addr = addr;
+ xdrs->x_addr_end = addr + size;
+
+ if (xdrs->x_addr_end < xdrs->x_addr) {
+ xdrs->x_ops = NULL;
+ }
+}
+EXPORT_SYMBOL(xdrmem_create);
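+
+/*
+ * Illustrative encode/decode round trip (hypothetical helper): the direct
+ * x_ops calls stand in for the xdr_*() wrappers consumers such as libnvpair
+ * normally use, and the 64-byte buffer and test value are arbitrary.
+ */
+static inline int
+xdrmem_roundtrip_example(void)
+{
+	char buf[64];
+	XDR enc, dec;
+	unsigned int out = 1234, in = 0;
+	struct xdr_bytesrec rec;
+
+	xdrmem_create(&enc, buf, sizeof (buf), XDR_ENCODE);
+	if (enc.x_ops == NULL || !enc.x_ops->xdr_u_int(&enc, &out))
+		return (EINVAL);
+
+	/* Remaining buffer space can be queried through the control op */
+	if (!enc.x_ops->xdr_control(&enc, XDR_GET_BYTES_AVAIL, &rec))
+		return (EINVAL);
+	ASSERT3U(rec.xc_num_avail, ==, sizeof (buf) - sizeof (uint32_t));
+
+	xdrmem_create(&dec, buf, sizeof (buf), XDR_DECODE);
+	if (dec.x_ops == NULL || !dec.x_ops->xdr_u_int(&dec, &in))
+		return (EINVAL);
+
+	return (in == out ? 0 : EINVAL);
+}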
+
+static bool_t
+xdrmem_control(XDR *xdrs, int req, void *info)
+{
+ struct xdr_bytesrec *rec = (struct xdr_bytesrec *)info;
+
+ if (req != XDR_GET_BYTES_AVAIL)
+ return (FALSE);
+
+ rec->xc_is_last_record = TRUE; /* always TRUE in xdrmem streams */
+ rec->xc_num_avail = xdrs->x_addr_end - xdrs->x_addr;
+
+ return (TRUE);
+}
+
+static bool_t
+xdrmem_enc_bytes(XDR *xdrs, caddr_t cp, const uint_t cnt)
+{
+ uint_t size = roundup(cnt, 4);
+ uint_t pad;
+
+ if (size < cnt)
+ return (FALSE); /* Integer overflow */
+
+ if (xdrs->x_addr > xdrs->x_addr_end)
+ return (FALSE);
+
+ if (xdrs->x_addr_end - xdrs->x_addr < size)
+ return (FALSE);
+
+ memcpy(xdrs->x_addr, cp, cnt);
+
+ xdrs->x_addr += cnt;
+
+ pad = size - cnt;
+ if (pad > 0) {
+ memset(xdrs->x_addr, 0, pad);
+ xdrs->x_addr += pad;
+ }
+
+ return (TRUE);
+}
+
+static bool_t
+xdrmem_dec_bytes(XDR *xdrs, caddr_t cp, const uint_t cnt)
+{
+ static uint32_t zero = 0;
+ uint_t size = roundup(cnt, 4);
+ uint_t pad;
+
+ if (size < cnt)
+ return (FALSE); /* Integer overflow */
+
+ if (xdrs->x_addr > xdrs->x_addr_end)
+ return (FALSE);
+
+ if (xdrs->x_addr_end - xdrs->x_addr < size)
+ return (FALSE);
+
+ memcpy(cp, xdrs->x_addr, cnt);
+ xdrs->x_addr += cnt;
+
+ pad = size - cnt;
+ if (pad > 0) {
+ /* An inverted memchr() would be useful here... */
+ if (memcmp(&zero, xdrs->x_addr, pad) != 0)
+ return (FALSE);
+
+ xdrs->x_addr += pad;
+ }
+
+ return (TRUE);
+}
+
+static bool_t
+xdrmem_enc_uint32(XDR *xdrs, uint32_t val)
+{
+ if (xdrs->x_addr + sizeof (uint32_t) > xdrs->x_addr_end)
+ return (FALSE);
+
+ *((uint32_t *)xdrs->x_addr) = cpu_to_be32(val);
+
+ xdrs->x_addr += sizeof (uint32_t);
+
+ return (TRUE);
+}
+
+static bool_t
+xdrmem_dec_uint32(XDR *xdrs, uint32_t *val)
+{
+ if (xdrs->x_addr + sizeof (uint32_t) > xdrs->x_addr_end)
+ return (FALSE);
+
+ *val = be32_to_cpu(*((uint32_t *)xdrs->x_addr));
+
+ xdrs->x_addr += sizeof (uint32_t);
+
+ return (TRUE);
+}
+
+static bool_t
+xdrmem_enc_char(XDR *xdrs, char *cp)
+{
+ uint32_t val;
+
+ BUILD_BUG_ON(sizeof (char) != 1);
+ val = *((unsigned char *) cp);
+
+ return (xdrmem_enc_uint32(xdrs, val));
+}
+
+static bool_t
+xdrmem_dec_char(XDR *xdrs, char *cp)
+{
+ uint32_t val;
+
+ BUILD_BUG_ON(sizeof (char) != 1);
+
+ if (!xdrmem_dec_uint32(xdrs, &val))
+ return (FALSE);
+
+ /*
+ * If any of the 3 other bytes are non-zero then val will be greater
+ * than 0xff and we fail because according to the RFC, this block does
+ * not have a char encoded in it.
+ */
+ if (val > 0xff)
+ return (FALSE);
+
+ *((unsigned char *) cp) = val;
+
+ return (TRUE);
+}
+
+static bool_t
+xdrmem_enc_ushort(XDR *xdrs, unsigned short *usp)
+{
+ BUILD_BUG_ON(sizeof (unsigned short) != 2);
+
+ return (xdrmem_enc_uint32(xdrs, *usp));
+}
+
+static bool_t
+xdrmem_dec_ushort(XDR *xdrs, unsigned short *usp)
+{
+ uint32_t val;
+
+ BUILD_BUG_ON(sizeof (unsigned short) != 2);
+
+ if (!xdrmem_dec_uint32(xdrs, &val))
+ return (FALSE);
+
+ /*
+ * Short ints are not in the RFC, but we assume similar logic as in
+ * xdrmem_dec_char().
+ */
+ if (val > 0xffff)
+ return (FALSE);
+
+ *usp = val;
+
+ return (TRUE);
+}
+
+static bool_t
+xdrmem_enc_uint(XDR *xdrs, unsigned *up)
+{
+ BUILD_BUG_ON(sizeof (unsigned) != 4);
+
+ return (xdrmem_enc_uint32(xdrs, *up));
+}
+
+static bool_t
+xdrmem_dec_uint(XDR *xdrs, unsigned *up)
+{
+ BUILD_BUG_ON(sizeof (unsigned) != 4);
+
+ return (xdrmem_dec_uint32(xdrs, (uint32_t *)up));
+}
+
+static bool_t
+xdrmem_enc_ulonglong(XDR *xdrs, u_longlong_t *ullp)
+{
+ BUILD_BUG_ON(sizeof (u_longlong_t) != 8);
+
+ if (!xdrmem_enc_uint32(xdrs, *ullp >> 32))
+ return (FALSE);
+
+ return (xdrmem_enc_uint32(xdrs, *ullp & 0xffffffff));
+}
+
+static bool_t
+xdrmem_dec_ulonglong(XDR *xdrs, u_longlong_t *ullp)
+{
+ uint32_t low, high;
+
+ BUILD_BUG_ON(sizeof (u_longlong_t) != 8);
+
+ if (!xdrmem_dec_uint32(xdrs, &high))
+ return (FALSE);
+ if (!xdrmem_dec_uint32(xdrs, &low))
+ return (FALSE);
+
+ *ullp = ((u_longlong_t)high << 32) | low;
+
+ return (TRUE);
+}
+
+static bool_t
+xdr_enc_array(XDR *xdrs, caddr_t *arrp, uint_t *sizep, const uint_t maxsize,
+ const uint_t elsize, const xdrproc_t elproc)
+{
+ uint_t i;
+ caddr_t addr = *arrp;
+
+ if (*sizep > maxsize || *sizep > UINT_MAX / elsize)
+ return (FALSE);
+
+ if (!xdrmem_enc_uint(xdrs, sizep))
+ return (FALSE);
+
+ for (i = 0; i < *sizep; i++) {
+ if (!elproc(xdrs, addr))
+ return (FALSE);
+ addr += elsize;
+ }
+
+ return (TRUE);
+}
+
+static bool_t
+xdr_dec_array(XDR *xdrs, caddr_t *arrp, uint_t *sizep, const uint_t maxsize,
+ const uint_t elsize, const xdrproc_t elproc)
+{
+ uint_t i, size;
+ bool_t alloc = FALSE;
+ caddr_t addr;
+
+ if (!xdrmem_dec_uint(xdrs, sizep))
+ return (FALSE);
+
+ size = *sizep;
+
+ if (size > maxsize || size > UINT_MAX / elsize)
+ return (FALSE);
+
+ /*
+ * The Solaris man page says: "If *arrp is NULL when decoding,
+ * xdr_array() allocates memory and *arrp points to it".
+ */
+ if (*arrp == NULL) {
+ BUILD_BUG_ON(sizeof (uint_t) > sizeof (size_t));
+
+ *arrp = kmem_alloc(size * elsize, KM_NOSLEEP);
+ if (*arrp == NULL)
+ return (FALSE);
+
+ alloc = TRUE;
+ }
+
+ addr = *arrp;
+
+ for (i = 0; i < size; i++) {
+ if (!elproc(xdrs, addr)) {
+ if (alloc)
+ kmem_free(*arrp, size * elsize);
+ return (FALSE);
+ }
+ addr += elsize;
+ }
+
+ return (TRUE);
+}
+
+static bool_t
+xdr_enc_string(XDR *xdrs, char **sp, const uint_t maxsize)
+{
+ size_t slen = strlen(*sp);
+ uint_t len;
+
+ if (slen > maxsize)
+ return (FALSE);
+
+ len = slen;
+
+ if (!xdrmem_enc_uint(xdrs, &len))
+ return (FALSE);
+
+ return (xdrmem_enc_bytes(xdrs, *sp, len));
+}
+
+static bool_t
+xdr_dec_string(XDR *xdrs, char **sp, const uint_t maxsize)
+{
+ uint_t size;
+ bool_t alloc = FALSE;
+
+ if (!xdrmem_dec_uint(xdrs, &size))
+ return (FALSE);
+
+ if (size > maxsize || size > UINT_MAX - 1)
+ return (FALSE);
+
+ /*
+ * Solaris man page: "If *sp is NULL when decoding, xdr_string()
+ * allocates memory and *sp points to it".
+ */
+ if (*sp == NULL) {
+ BUILD_BUG_ON(sizeof (uint_t) > sizeof (size_t));
+
+ *sp = kmem_alloc(size + 1, KM_NOSLEEP);
+ if (*sp == NULL)
+ return (FALSE);
+
+ alloc = TRUE;
+ }
+
+ if (!xdrmem_dec_bytes(xdrs, *sp, size))
+ goto fail;
+
+ if (memchr(*sp, 0, size) != NULL)
+ goto fail;
+
+ (*sp)[size] = '\0';
+
+ return (TRUE);
+
+fail:
+ if (alloc)
+ kmem_free(*sp, size + 1);
+
+ return (FALSE);
+}
+
+static struct xdr_ops xdrmem_encode_ops = {
+ .xdr_control = xdrmem_control,
+ .xdr_char = xdrmem_enc_char,
+ .xdr_u_short = xdrmem_enc_ushort,
+ .xdr_u_int = xdrmem_enc_uint,
+ .xdr_u_longlong_t = xdrmem_enc_ulonglong,
+ .xdr_opaque = xdrmem_enc_bytes,
+ .xdr_string = xdr_enc_string,
+ .xdr_array = xdr_enc_array
+};
+
+static struct xdr_ops xdrmem_decode_ops = {
+ .xdr_control = xdrmem_control,
+ .xdr_char = xdrmem_dec_char,
+ .xdr_u_short = xdrmem_dec_ushort,
+ .xdr_u_int = xdrmem_dec_uint,
+ .xdr_u_longlong_t = xdrmem_dec_ulonglong,
+ .xdr_opaque = xdrmem_dec_bytes,
+ .xdr_string = xdr_dec_string,
+ .xdr_array = xdr_dec_array
+};
diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-zlib.c b/sys/contrib/openzfs/module/os/linux/spl/spl-zlib.c
new file mode 100644
index 000000000000..589496da0c78
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/spl/spl-zlib.c
@@ -0,0 +1,217 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ *
+ *
+ * z_compress_level/z_uncompress are nearly identical copies of the
+ * compress2/uncompress functions provided by the official zlib package
+ * available at http://zlib.net/. The only changes made we to slightly
+ * adapt the functions called to match the linux kernel implementation
+ * of zlib. The full zlib license follows:
+ *
+ * zlib.h -- interface of the 'zlib' general purpose compression library
+ * version 1.2.5, April 19th, 2010
+ *
+ * Copyright (C) 1995-2010 Jean-loup Gailly and Mark Adler
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ *
+ * Jean-loup Gailly
+ * Mark Adler
+ */
+
+
+#include <linux/percpu_compat.h>
+#include <sys/kmem.h>
+#include <sys/kmem_cache.h>
+#include <sys/zmod.h>
+
+static spl_kmem_cache_t *zlib_workspace_cache;
+
+/*
+ * A kmem_cache is used for the zlib workspaces to avoid having to vmalloc
+ * and vfree for every call. Using a kmem_cache also has the advantage
+ * of improving the odds that the memory used will be local to this cpu.
+ * To further improve things it might be wise to create a dedicated per-cpu
+ * workspace for use. This would take some additional care because we then
+ * must disable preemption around the critical section, and verify that
+ * zlib_deflate* and zlib_inflate* never internally call schedule().
+ */
+static void *
+zlib_workspace_alloc(int flags)
+{
+ return (kmem_cache_alloc(zlib_workspace_cache, flags & ~(__GFP_FS)));
+}
+
+static void
+zlib_workspace_free(void *workspace)
+{
+ kmem_cache_free(zlib_workspace_cache, workspace);
+}
+
+/*
+ * Compresses the source buffer into the destination buffer. The level
+ * parameter has the same meaning as in deflateInit. sourceLen is the byte
+ * length of the source buffer. Upon entry, destLen is the total size of the
+ * destination buffer, which must be at least 0.1% larger than sourceLen plus
+ * 12 bytes. Upon exit, destLen is the actual size of the compressed buffer.
+ *
+ * compress2 returns Z_OK if success, Z_MEM_ERROR if there was not enough
+ * memory, Z_BUF_ERROR if there was not enough room in the output buffer,
+ * Z_STREAM_ERROR if the level parameter is invalid.
+ */
+int
+z_compress_level(void *dest, size_t *destLen, const void *source,
+ size_t sourceLen, int level)
+{
+ z_stream stream;
+ int err;
+
+ stream.next_in = (Byte *)source;
+ stream.avail_in = (uInt)sourceLen;
+ stream.next_out = dest;
+ stream.avail_out = (uInt)*destLen;
+
+ if ((size_t)stream.avail_out != *destLen)
+ return (Z_BUF_ERROR);
+
+ stream.workspace = zlib_workspace_alloc(KM_SLEEP);
+ if (!stream.workspace)
+ return (Z_MEM_ERROR);
+
+ err = zlib_deflateInit(&stream, level);
+ if (err != Z_OK) {
+ zlib_workspace_free(stream.workspace);
+ return (err);
+ }
+
+ err = zlib_deflate(&stream, Z_FINISH);
+ if (err != Z_STREAM_END) {
+ zlib_deflateEnd(&stream);
+ zlib_workspace_free(stream.workspace);
+ return (err == Z_OK ? Z_BUF_ERROR : err);
+ }
+ *destLen = stream.total_out;
+
+ err = zlib_deflateEnd(&stream);
+ zlib_workspace_free(stream.workspace);
+
+ return (err);
+}
+EXPORT_SYMBOL(z_compress_level);
+
+/*
+ * Decompresses the source buffer into the destination buffer. sourceLen is
+ * the byte length of the source buffer. Upon entry, destLen is the total
+ * size of the destination buffer, which must be large enough to hold the
+ * entire uncompressed data. (The size of the uncompressed data must have
+ * been saved previously by the compressor and transmitted to the decompressor
+ * by some mechanism outside the scope of this compression library.)
+ * Upon exit, destLen is the actual size of the uncompressed data.
+ * This function can be used to decompress a whole file at once if the
+ * input file is mmap'ed.
+ *
+ * uncompress returns Z_OK if success, Z_MEM_ERROR if there was not
+ * enough memory, Z_BUF_ERROR if there was not enough room in the output
+ * buffer, or Z_DATA_ERROR if the input data was corrupted.
+ */
+int
+z_uncompress(void *dest, size_t *destLen, const void *source, size_t sourceLen)
+{
+ z_stream stream;
+ int err;
+
+ stream.next_in = (Byte *)source;
+ stream.avail_in = (uInt)sourceLen;
+ stream.next_out = dest;
+ stream.avail_out = (uInt)*destLen;
+
+ if ((size_t)stream.avail_out != *destLen)
+ return (Z_BUF_ERROR);
+
+ stream.workspace = zlib_workspace_alloc(KM_SLEEP);
+ if (!stream.workspace)
+ return (Z_MEM_ERROR);
+
+ err = zlib_inflateInit(&stream);
+ if (err != Z_OK) {
+ zlib_workspace_free(stream.workspace);
+ return (err);
+ }
+
+ err = zlib_inflate(&stream, Z_FINISH);
+ if (err != Z_STREAM_END) {
+ zlib_inflateEnd(&stream);
+ zlib_workspace_free(stream.workspace);
+
+ if (err == Z_NEED_DICT ||
+ (err == Z_BUF_ERROR && stream.avail_in == 0))
+ return (Z_DATA_ERROR);
+
+ return (err);
+ }
+ *destLen = stream.total_out;
+
+ err = zlib_inflateEnd(&stream);
+ zlib_workspace_free(stream.workspace);
+
+ return (err);
+}
+EXPORT_SYMBOL(z_uncompress);
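+
+/*
+ * Illustrative round trip (hypothetical helper): per the sizing rule above
+ * the compressed buffer is at least 0.1% larger than the source plus 12
+ * bytes; kmem_alloc() is used here only for brevity.
+ */
+static inline int
+zlib_roundtrip_example(const void *src, size_t srclen, int level)
+{
+	size_t cbufsz = srclen + srclen / 1000 + 12;
+	size_t clen = cbufsz, dlen = srclen;
+	void *cbuf = kmem_alloc(cbufsz, KM_SLEEP);
+	void *dbuf = kmem_alloc(srclen, KM_SLEEP);
+	int err;
+
+	err = z_compress_level(cbuf, &clen, src, srclen, level);
+	if (err == Z_OK)
+		err = z_uncompress(dbuf, &dlen, cbuf, clen);
+
+	kmem_free(cbuf, cbufsz);
+	kmem_free(dbuf, srclen);
+
+	return (err == Z_OK && dlen == srclen ? 0 : EIO);
+}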
+
+int
+spl_zlib_init(void)
+{
+ int size;
+
+ size = MAX(zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL),
+ zlib_inflate_workspacesize());
+
+ zlib_workspace_cache = kmem_cache_create(
+ "spl_zlib_workspace_cache",
+ size, 0, NULL, NULL, NULL, NULL, NULL,
+ KMC_KVMEM);
+ if (!zlib_workspace_cache)
+ return (1);
+
+ return (0);
+}
+
+void
+spl_zlib_fini(void)
+{
+ kmem_cache_destroy(zlib_workspace_cache);
+ zlib_workspace_cache = NULL;
+}
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/Makefile.in b/sys/contrib/openzfs/module/os/linux/zfs/Makefile.in
new file mode 100644
index 000000000000..75bec52c94e2
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/zfs/Makefile.in
@@ -0,0 +1,37 @@
+#
+# Linux specific sources included from module/zfs/Makefile.in
+#
+
+# Suppress unused-value warnings in sparc64 architecture headers
+ccflags-$(CONFIG_SPARC64) += -Wno-unused-value
+
+$(MODULE)-objs += ../os/linux/zfs/abd_os.o
+$(MODULE)-objs += ../os/linux/zfs/arc_os.o
+$(MODULE)-objs += ../os/linux/zfs/mmp_os.o
+$(MODULE)-objs += ../os/linux/zfs/policy.o
+$(MODULE)-objs += ../os/linux/zfs/trace.o
+$(MODULE)-objs += ../os/linux/zfs/qat.o
+$(MODULE)-objs += ../os/linux/zfs/qat_compress.o
+$(MODULE)-objs += ../os/linux/zfs/qat_crypt.o
+$(MODULE)-objs += ../os/linux/zfs/spa_misc_os.o
+$(MODULE)-objs += ../os/linux/zfs/vdev_disk.o
+$(MODULE)-objs += ../os/linux/zfs/vdev_file.o
+$(MODULE)-objs += ../os/linux/zfs/zfs_acl.o
+$(MODULE)-objs += ../os/linux/zfs/zfs_ctldir.o
+$(MODULE)-objs += ../os/linux/zfs/zfs_debug.o
+$(MODULE)-objs += ../os/linux/zfs/zfs_dir.o
+$(MODULE)-objs += ../os/linux/zfs/zfs_file_os.o
+$(MODULE)-objs += ../os/linux/zfs/zfs_ioctl_os.o
+$(MODULE)-objs += ../os/linux/zfs/zfs_sysfs.o
+$(MODULE)-objs += ../os/linux/zfs/zfs_uio.o
+$(MODULE)-objs += ../os/linux/zfs/zfs_vfsops.o
+$(MODULE)-objs += ../os/linux/zfs/zfs_vnops_os.o
+$(MODULE)-objs += ../os/linux/zfs/zfs_znode.o
+$(MODULE)-objs += ../os/linux/zfs/zio_crypt.o
+$(MODULE)-objs += ../os/linux/zfs/zpl_ctldir.o
+$(MODULE)-objs += ../os/linux/zfs/zpl_export.o
+$(MODULE)-objs += ../os/linux/zfs/zpl_file.o
+$(MODULE)-objs += ../os/linux/zfs/zpl_inode.o
+$(MODULE)-objs += ../os/linux/zfs/zpl_super.o
+$(MODULE)-objs += ../os/linux/zfs/zpl_xattr.o
+$(MODULE)-objs += ../os/linux/zfs/zvol_os.o
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/abd_os.c b/sys/contrib/openzfs/module/os/linux/zfs/abd_os.c
new file mode 100644
index 000000000000..d82e5f4dcf15
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/zfs/abd_os.c
@@ -0,0 +1,1073 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2014 by Chunwei Chen. All rights reserved.
+ * Copyright (c) 2019 by Delphix. All rights reserved.
+ */
+
+/*
+ * See abd.c for a general overview of the arc buffered data (ABD).
+ *
+ * Linear buffers act exactly like normal buffers and are always mapped into the
+ * kernel's virtual memory space, while scattered ABD data chunks are allocated
+ * as physical pages and then mapped in only while they are actually being
+ * accessed through one of the abd_* library functions. Using scattered ABDs
+ * provides several benefits:
+ *
+ * (1) They avoid use of kmem_*, preventing performance problems where running
+ * kmem_reap on very large memory systems never finishes and causes
+ * constant TLB shootdowns.
+ *
+ * (2) Fragmentation is less of an issue since when we are at the limit of
+ * allocatable space, we won't have to search around for a long free
+ * hole in the VA space for large ARC allocations. Each chunk is mapped in
+ * individually, so even if we are using HIGHMEM (see next point) we
+ * wouldn't need to worry about finding a contiguous address range.
+ *
+ * (3) If we are not using HIGHMEM, then all physical memory is always
+ * mapped into the kernel's address space, so we also avoid the map /
+ * unmap costs on each ABD access.
+ *
+ * If we are not using HIGHMEM, scattered buffers which have only one chunk
+ * can be treated as linear buffers, because they are contiguous in the
+ * kernel's virtual address space. See abd_alloc_chunks() for details.
+ */
+
+#include <sys/abd_impl.h>
+#include <sys/param.h>
+#include <sys/zio.h>
+#include <sys/arc.h>
+#include <sys/zfs_context.h>
+#include <sys/zfs_znode.h>
+#ifdef _KERNEL
+#include <linux/kmap_compat.h>
+#include <linux/scatterlist.h>
+#else
+#define MAX_ORDER 1
+#endif
+
+typedef struct abd_stats {
+ kstat_named_t abdstat_struct_size;
+ kstat_named_t abdstat_linear_cnt;
+ kstat_named_t abdstat_linear_data_size;
+ kstat_named_t abdstat_scatter_cnt;
+ kstat_named_t abdstat_scatter_data_size;
+ kstat_named_t abdstat_scatter_chunk_waste;
+ kstat_named_t abdstat_scatter_orders[MAX_ORDER];
+ kstat_named_t abdstat_scatter_page_multi_chunk;
+ kstat_named_t abdstat_scatter_page_multi_zone;
+ kstat_named_t abdstat_scatter_page_alloc_retry;
+ kstat_named_t abdstat_scatter_sg_table_retry;
+} abd_stats_t;
+
+static abd_stats_t abd_stats = {
+ /* Amount of memory occupied by all of the abd_t struct allocations */
+ { "struct_size", KSTAT_DATA_UINT64 },
+ /*
+ * The number of linear ABDs which are currently allocated, excluding
+ * ABDs which don't own their data (for instance the ones which were
+ * allocated through abd_get_offset() and abd_get_from_buf()). If an
+ * ABD takes ownership of its buf then it will become tracked.
+ */
+ { "linear_cnt", KSTAT_DATA_UINT64 },
+ /* Amount of data stored in all linear ABDs tracked by linear_cnt */
+ { "linear_data_size", KSTAT_DATA_UINT64 },
+ /*
+ * The number of scatter ABDs which are currently allocated, excluding
+ * ABDs which don't own their data (for instance the ones which were
+ * allocated through abd_get_offset()).
+ */
+ { "scatter_cnt", KSTAT_DATA_UINT64 },
+ /* Amount of data stored in all scatter ABDs tracked by scatter_cnt */
+ { "scatter_data_size", KSTAT_DATA_UINT64 },
+ /*
+ * The amount of space wasted at the end of the last chunk across all
+ * scatter ABDs tracked by scatter_cnt.
+ */
+ { "scatter_chunk_waste", KSTAT_DATA_UINT64 },
+ /*
+ * The number of compound allocations of a given order. These
+ * allocations are spread over all currently allocated ABDs, and
+ * act as a measure of memory fragmentation.
+ */
+ { { "scatter_order_N", KSTAT_DATA_UINT64 } },
+ /*
+ * The number of scatter ABDs which contain multiple chunks.
+ * ABDs are preferentially allocated from the minimum number of
+ * contiguous multi-page chunks; a single chunk is optimal.
+ */
+ { "scatter_page_multi_chunk", KSTAT_DATA_UINT64 },
+ /*
+ * The number of scatter ABDs which are split across memory zones.
+ * ABDs are preferentially allocated using pages from a single zone.
+ */
+ { "scatter_page_multi_zone", KSTAT_DATA_UINT64 },
+ /*
+ * The total number of retries encountered when attempting to
+ * allocate the pages to populate the scatter ABD.
+ */
+ { "scatter_page_alloc_retry", KSTAT_DATA_UINT64 },
+ /*
+ * The total number of retries encountered when attempting to
+ * allocate the sg table for an ABD.
+ */
+ { "scatter_sg_table_retry", KSTAT_DATA_UINT64 },
+};
+
+#define abd_for_each_sg(abd, sg, n, i) \
+ for_each_sg(ABD_SCATTER(abd).abd_sgl, sg, n, i)
+
+unsigned zfs_abd_scatter_max_order = MAX_ORDER - 1;
+
+/*
+ * zfs_abd_scatter_min_size is the minimum allocation size to use scatter
+ * ABD's. Smaller allocations will use linear ABD's, which use
+ * zio_[data_]buf_alloc().
+ *
+ * Scatter ABD's use at least one page each, so sub-page allocations waste
+ * some space when allocated as scatter (e.g. 2KB scatter allocation wastes
+ * half of each page). Using linear ABD's for small allocations means that
+ * they will be put on slabs which contain many allocations. This can
+ * improve memory efficiency, but it also makes it much harder for ARC
+ * evictions to actually free pages, because all the buffers on one slab need
+ * to be freed in order for the slab (and underlying pages) to be freed.
+ * Typically, 512B and 1KB kmem caches have 16 buffers per slab, so it's
+ * possible for them to actually waste more memory than scatter (one page per
+ * buf = wasting 3/4 or 7/8th; one buf per slab = wasting 15/16th).
+ *
+ * Spill blocks are typically 512B and are heavily used on systems running
+ * selinux with the default dnode size and the `xattr=sa` property set.
+ *
+ * By default we use linear allocations for 512B and 1KB, and scatter
+ * allocations for larger (1.5KB and up).
+ */
+int zfs_abd_scatter_min_size = 512 * 3;
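+
+/*
+ * Hypothetical helper mirroring the waste computation performed by
+ * abd_update_scatter_stats(): the bytes left unused in the final page of a
+ * scatter allocation. With 4K pages a 2K scatter buffer wastes 2K, while a
+ * 1.5K buffer (the default zfs_abd_scatter_min_size) would waste 2.5K.
+ */
+static inline size_t
+abd_scatter_waste_example(size_t size)
+{
+	return (P2ROUNDUP(size, PAGESIZE) - size);
+}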
+
+/*
+ * We use a scattered SPA_MAXBLOCKSIZE sized ABD whose pages are
+ * just a single zero'd page. This allows us to conserve memory by
+ * only using a single zero page for the scatterlist.
+ */
+abd_t *abd_zero_scatter = NULL;
+
+struct page;
+/*
+ * abd_zero_page will be an allocated zero'd PAGESIZE buffer, which is
+ * assigned to each of the pages of abd_zero_scatter.
+ */
+static struct page *abd_zero_page = NULL;
+
+static kmem_cache_t *abd_cache = NULL;
+static kstat_t *abd_ksp;
+
+static uint_t
+abd_chunkcnt_for_bytes(size_t size)
+{
+ return (P2ROUNDUP(size, PAGESIZE) / PAGESIZE);
+}
+
+abd_t *
+abd_alloc_struct_impl(size_t size)
+{
+ /*
+ * In Linux we do not use the size passed in during ABD
+ * allocation, so we just ignore it.
+ */
+ abd_t *abd = kmem_cache_alloc(abd_cache, KM_PUSHPAGE);
+ ASSERT3P(abd, !=, NULL);
+ ABDSTAT_INCR(abdstat_struct_size, sizeof (abd_t));
+
+ return (abd);
+}
+
+void
+abd_free_struct_impl(abd_t *abd)
+{
+ kmem_cache_free(abd_cache, abd);
+ ABDSTAT_INCR(abdstat_struct_size, -(int)sizeof (abd_t));
+}
+
+#ifdef _KERNEL
+/*
+ * Mark zfs data pages so they can be excluded from kernel crash dumps
+ */
+#ifdef _LP64
+#define ABD_FILE_CACHE_PAGE 0x2F5ABDF11ECAC4E
+
+static inline void
+abd_mark_zfs_page(struct page *page)
+{
+ get_page(page);
+ SetPagePrivate(page);
+ set_page_private(page, ABD_FILE_CACHE_PAGE);
+}
+
+static inline void
+abd_unmark_zfs_page(struct page *page)
+{
+ set_page_private(page, 0UL);
+ ClearPagePrivate(page);
+ put_page(page);
+}
+#else
+#define abd_mark_zfs_page(page)
+#define abd_unmark_zfs_page(page)
+#endif /* _LP64 */
+
+#ifndef CONFIG_HIGHMEM
+
+#ifndef __GFP_RECLAIM
+#define __GFP_RECLAIM __GFP_WAIT
+#endif
+
+/*
+ * The goal is to minimize fragmentation by preferentially populating ABDs
+ * with higher order compound pages from a single zone. Allocation size is
+ * progressively decreased until it can be satisfied without performing
+ * reclaim or compaction. When necessary this function will degenerate to
+ * allocating individual pages and allowing reclaim to satisfy allocations.
+ */
+void
+abd_alloc_chunks(abd_t *abd, size_t size)
+{
+ struct list_head pages;
+ struct sg_table table;
+ struct scatterlist *sg;
+ struct page *page, *tmp_page = NULL;
+ gfp_t gfp = __GFP_NOWARN | GFP_NOIO;
+ gfp_t gfp_comp = (gfp | __GFP_NORETRY | __GFP_COMP) & ~__GFP_RECLAIM;
+ int max_order = MIN(zfs_abd_scatter_max_order, MAX_ORDER - 1);
+ int nr_pages = abd_chunkcnt_for_bytes(size);
+ int chunks = 0, zones = 0;
+ size_t remaining_size;
+ int nid = NUMA_NO_NODE;
+ int alloc_pages = 0;
+
+ INIT_LIST_HEAD(&pages);
+
+ while (alloc_pages < nr_pages) {
+ unsigned chunk_pages;
+ int order;
+
+ order = MIN(highbit64(nr_pages - alloc_pages) - 1, max_order);
+ chunk_pages = (1U << order);
+
+ page = alloc_pages_node(nid, order ? gfp_comp : gfp, order);
+ if (page == NULL) {
+ if (order == 0) {
+ ABDSTAT_BUMP(abdstat_scatter_page_alloc_retry);
+ schedule_timeout_interruptible(1);
+ } else {
+ max_order = MAX(0, order - 1);
+ }
+ continue;
+ }
+
+ list_add_tail(&page->lru, &pages);
+
+ if ((nid != NUMA_NO_NODE) && (page_to_nid(page) != nid))
+ zones++;
+
+ nid = page_to_nid(page);
+ ABDSTAT_BUMP(abdstat_scatter_orders[order]);
+ chunks++;
+ alloc_pages += chunk_pages;
+ }
+
+ ASSERT3S(alloc_pages, ==, nr_pages);
+
+ while (sg_alloc_table(&table, chunks, gfp)) {
+ ABDSTAT_BUMP(abdstat_scatter_sg_table_retry);
+ schedule_timeout_interruptible(1);
+ }
+
+ sg = table.sgl;
+ remaining_size = size;
+ list_for_each_entry_safe(page, tmp_page, &pages, lru) {
+ size_t sg_size = MIN(PAGESIZE << compound_order(page),
+ remaining_size);
+ sg_set_page(sg, page, sg_size, 0);
+ abd_mark_zfs_page(page);
+ remaining_size -= sg_size;
+
+ sg = sg_next(sg);
+ list_del(&page->lru);
+ }
+
+ /*
+ * These conditions ensure that a possible transformation to a linear
+ * ABD would be valid.
+ */
+ ASSERT(!PageHighMem(sg_page(table.sgl)));
+ ASSERT0(ABD_SCATTER(abd).abd_offset);
+
+ if (table.nents == 1) {
+ /*
+ * Since there is only one entry, this ABD can be represented
+ * as a linear buffer. All single-page (4K) ABD's can be
+ * represented this way. Some multi-page ABD's can also be
+ * represented this way, if we were able to allocate a single
+ * "chunk" (higher-order "page" which represents a power-of-2
+ * series of physically-contiguous pages). This is often the
+ * case for 2-page (8K) ABD's.
+ *
+ * Representing a single-entry scatter ABD as a linear ABD
+ * has the performance advantage of avoiding the copy (and
+ * allocation) in abd_borrow_buf_copy / abd_return_buf_copy.
+ * A performance increase of around 5% has been observed for
+ * ARC-cached reads (of small blocks which can take advantage
+ * of this).
+ *
+ * Note that this optimization is only possible because the
+ * pages are always mapped into the kernel's address space.
+ * This is not the case for highmem pages, so the
+ * optimization can not be made there.
+ */
+ abd->abd_flags |= ABD_FLAG_LINEAR;
+ abd->abd_flags |= ABD_FLAG_LINEAR_PAGE;
+ abd->abd_u.abd_linear.abd_sgl = table.sgl;
+ ABD_LINEAR_BUF(abd) = page_address(sg_page(table.sgl));
+ } else if (table.nents > 1) {
+ ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk);
+ abd->abd_flags |= ABD_FLAG_MULTI_CHUNK;
+
+ if (zones) {
+ ABDSTAT_BUMP(abdstat_scatter_page_multi_zone);
+ abd->abd_flags |= ABD_FLAG_MULTI_ZONE;
+ }
+
+ ABD_SCATTER(abd).abd_sgl = table.sgl;
+ ABD_SCATTER(abd).abd_nents = table.nents;
+ }
+}
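+
+/*
+ * Hypothetical side computation (success path only) recording which compound
+ * page orders the loop above requests. For a 25 page allocation with a max
+ * order of 5 it yields orders 4, 3 and 0: one 16 page chunk, one 8 page
+ * chunk, and a single page, three chunks in total. In the real loop each
+ * failed compound allocation instead lowers max_order by one and retries.
+ */
+static inline int
+abd_order_plan_example(int nr_pages, int max_order, int *orders, int maxn)
+{
+	int alloc_pages = 0, n = 0;
+
+	while (alloc_pages < nr_pages && n < maxn) {
+		int order = MIN(highbit64(nr_pages - alloc_pages) - 1,
+		    max_order);
+
+		orders[n++] = order;
+		alloc_pages += (1 << order);
+	}
+
+	return (n);
+}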
+#else
+
+/*
+ * Allocate N individual pages to construct a scatter ABD. This function
+ * makes no attempt to request contiguous pages and requires the minimal
+ * number of kernel interfaces. It's designed for maximum compatibility.
+ */
+void
+abd_alloc_chunks(abd_t *abd, size_t size)
+{
+ struct scatterlist *sg = NULL;
+ struct sg_table table;
+ struct page *page;
+ gfp_t gfp = __GFP_NOWARN | GFP_NOIO;
+ int nr_pages = abd_chunkcnt_for_bytes(size);
+ int i = 0;
+
+ while (sg_alloc_table(&table, nr_pages, gfp)) {
+ ABDSTAT_BUMP(abdstat_scatter_sg_table_retry);
+ schedule_timeout_interruptible(1);
+ }
+
+ ASSERT3U(table.nents, ==, nr_pages);
+ ABD_SCATTER(abd).abd_sgl = table.sgl;
+ ABD_SCATTER(abd).abd_nents = nr_pages;
+
+ abd_for_each_sg(abd, sg, nr_pages, i) {
+ while ((page = __page_cache_alloc(gfp)) == NULL) {
+ ABDSTAT_BUMP(abdstat_scatter_page_alloc_retry);
+ schedule_timeout_interruptible(1);
+ }
+
+ ABDSTAT_BUMP(abdstat_scatter_orders[0]);
+ sg_set_page(sg, page, PAGESIZE, 0);
+ abd_mark_zfs_page(page);
+ }
+
+ if (nr_pages > 1) {
+ ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk);
+ abd->abd_flags |= ABD_FLAG_MULTI_CHUNK;
+ }
+}
+#endif /* !CONFIG_HIGHMEM */
+
+/*
+ * This must be called if any of the sg_table allocation functions
+ * are called.
+ */
+static void
+abd_free_sg_table(abd_t *abd)
+{
+ struct sg_table table;
+
+ table.sgl = ABD_SCATTER(abd).abd_sgl;
+ table.nents = table.orig_nents = ABD_SCATTER(abd).abd_nents;
+ sg_free_table(&table);
+}
+
+void
+abd_free_chunks(abd_t *abd)
+{
+ struct scatterlist *sg = NULL;
+ struct page *page;
+ int nr_pages = ABD_SCATTER(abd).abd_nents;
+ int order, i = 0;
+
+ if (abd->abd_flags & ABD_FLAG_MULTI_ZONE)
+ ABDSTAT_BUMPDOWN(abdstat_scatter_page_multi_zone);
+
+ if (abd->abd_flags & ABD_FLAG_MULTI_CHUNK)
+ ABDSTAT_BUMPDOWN(abdstat_scatter_page_multi_chunk);
+
+ abd_for_each_sg(abd, sg, nr_pages, i) {
+ page = sg_page(sg);
+ abd_unmark_zfs_page(page);
+ order = compound_order(page);
+ __free_pages(page, order);
+ ASSERT3U(sg->length, <=, PAGE_SIZE << order);
+ ABDSTAT_BUMPDOWN(abdstat_scatter_orders[order]);
+ }
+ abd_free_sg_table(abd);
+}
+
+/*
+ * Allocate a scatter ABD of size SPA_MAXBLOCKSIZE, where each page in
+ * the scatterlist will be set to the zero'd out buffer abd_zero_page.
+ */
+static void
+abd_alloc_zero_scatter(void)
+{
+ struct scatterlist *sg = NULL;
+ struct sg_table table;
+ gfp_t gfp = __GFP_NOWARN | GFP_NOIO;
+ gfp_t gfp_zero_page = gfp | __GFP_ZERO;
+ int nr_pages = abd_chunkcnt_for_bytes(SPA_MAXBLOCKSIZE);
+ int i = 0;
+
+ while ((abd_zero_page = __page_cache_alloc(gfp_zero_page)) == NULL) {
+ ABDSTAT_BUMP(abdstat_scatter_page_alloc_retry);
+ schedule_timeout_interruptible(1);
+ }
+ abd_mark_zfs_page(abd_zero_page);
+
+ while (sg_alloc_table(&table, nr_pages, gfp)) {
+ ABDSTAT_BUMP(abdstat_scatter_sg_table_retry);
+ schedule_timeout_interruptible(1);
+ }
+ ASSERT3U(table.nents, ==, nr_pages);
+
+ abd_zero_scatter = abd_alloc_struct(SPA_MAXBLOCKSIZE);
+ abd_zero_scatter->abd_flags |= ABD_FLAG_OWNER;
+ ABD_SCATTER(abd_zero_scatter).abd_offset = 0;
+ ABD_SCATTER(abd_zero_scatter).abd_sgl = table.sgl;
+ ABD_SCATTER(abd_zero_scatter).abd_nents = nr_pages;
+ abd_zero_scatter->abd_size = SPA_MAXBLOCKSIZE;
+ abd_zero_scatter->abd_flags |= ABD_FLAG_MULTI_CHUNK | ABD_FLAG_ZEROS;
+
+ abd_for_each_sg(abd_zero_scatter, sg, nr_pages, i) {
+ sg_set_page(sg, abd_zero_page, PAGESIZE, 0);
+ }
+
+ ABDSTAT_BUMP(abdstat_scatter_cnt);
+ ABDSTAT_INCR(abdstat_scatter_data_size, PAGESIZE);
+ ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk);
+}
+
+#else /* _KERNEL */
+
+#ifndef PAGE_SHIFT
+#define PAGE_SHIFT (highbit64(PAGESIZE)-1)
+#endif
+
+#define zfs_kmap_atomic(chunk, km) ((void *)chunk)
+#define zfs_kunmap_atomic(addr, km) do { (void)(addr); } while (0)
+#define local_irq_save(flags) do { (void)(flags); } while (0)
+#define local_irq_restore(flags) do { (void)(flags); } while (0)
+#define nth_page(pg, i) \
+ ((struct page *)((void *)(pg) + (i) * PAGESIZE))
+
+struct scatterlist {
+ struct page *page;
+ int length;
+ int end;
+};
+
+static void
+sg_init_table(struct scatterlist *sg, int nr)
+{
+ memset(sg, 0, nr * sizeof (struct scatterlist));
+ sg[nr - 1].end = 1;
+}
+
+/*
+ * This must be called if any of the sg_table allocation functions
+ * are called.
+ */
+static void
+abd_free_sg_table(abd_t *abd)
+{
+ int nents = ABD_SCATTER(abd).abd_nents;
+ vmem_free(ABD_SCATTER(abd).abd_sgl,
+ nents * sizeof (struct scatterlist));
+}
+
+#define for_each_sg(sgl, sg, nr, i) \
+ for ((i) = 0, (sg) = (sgl); (i) < (nr); (i)++, (sg) = sg_next(sg))
+
+static inline void
+sg_set_page(struct scatterlist *sg, struct page *page, unsigned int len,
+ unsigned int offset)
+{
+ /* currently we don't use offset */
+ ASSERT(offset == 0);
+ sg->page = page;
+ sg->length = len;
+}
+
+static inline struct page *
+sg_page(struct scatterlist *sg)
+{
+ return (sg->page);
+}
+
+static inline struct scatterlist *
+sg_next(struct scatterlist *sg)
+{
+ if (sg->end)
+ return (NULL);
+
+ return (sg + 1);
+}
+
+void
+abd_alloc_chunks(abd_t *abd, size_t size)
+{
+ unsigned nr_pages = abd_chunkcnt_for_bytes(size);
+ struct scatterlist *sg;
+ int i;
+
+ ABD_SCATTER(abd).abd_sgl = vmem_alloc(nr_pages *
+ sizeof (struct scatterlist), KM_SLEEP);
+ sg_init_table(ABD_SCATTER(abd).abd_sgl, nr_pages);
+
+ abd_for_each_sg(abd, sg, nr_pages, i) {
+ struct page *p = umem_alloc_aligned(PAGESIZE, 64, KM_SLEEP);
+ sg_set_page(sg, p, PAGESIZE, 0);
+ }
+ ABD_SCATTER(abd).abd_nents = nr_pages;
+}
+
+void
+abd_free_chunks(abd_t *abd)
+{
+ int i, n = ABD_SCATTER(abd).abd_nents;
+ struct scatterlist *sg;
+
+ abd_for_each_sg(abd, sg, n, i) {
+ for (int j = 0; j < sg->length; j += PAGESIZE) {
+ struct page *p = nth_page(sg_page(sg), j >> PAGE_SHIFT);
+ umem_free(p, PAGESIZE);
+ }
+ }
+ abd_free_sg_table(abd);
+}
+
+static void
+abd_alloc_zero_scatter(void)
+{
+ unsigned nr_pages = abd_chunkcnt_for_bytes(SPA_MAXBLOCKSIZE);
+ struct scatterlist *sg;
+ int i;
+
+ abd_zero_page = umem_alloc_aligned(PAGESIZE, 64, KM_SLEEP);
+ memset(abd_zero_page, 0, PAGESIZE);
+ abd_zero_scatter = abd_alloc_struct(SPA_MAXBLOCKSIZE);
+ abd_zero_scatter->abd_flags |= ABD_FLAG_OWNER;
+ abd_zero_scatter->abd_flags |= ABD_FLAG_MULTI_CHUNK | ABD_FLAG_ZEROS;
+ ABD_SCATTER(abd_zero_scatter).abd_offset = 0;
+ ABD_SCATTER(abd_zero_scatter).abd_nents = nr_pages;
+ abd_zero_scatter->abd_size = SPA_MAXBLOCKSIZE;
+ zfs_refcount_create(&abd_zero_scatter->abd_children);
+ ABD_SCATTER(abd_zero_scatter).abd_sgl = vmem_alloc(nr_pages *
+ sizeof (struct scatterlist), KM_SLEEP);
+
+ sg_init_table(ABD_SCATTER(abd_zero_scatter).abd_sgl, nr_pages);
+
+ abd_for_each_sg(abd_zero_scatter, sg, nr_pages, i) {
+ sg_set_page(sg, abd_zero_page, PAGESIZE, 0);
+ }
+
+ ABDSTAT_BUMP(abdstat_scatter_cnt);
+ ABDSTAT_INCR(abdstat_scatter_data_size, PAGESIZE);
+ ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk);
+}
+
+#endif /* _KERNEL */
+
+boolean_t
+abd_size_alloc_linear(size_t size)
+{
+ return (size < zfs_abd_scatter_min_size ? B_TRUE : B_FALSE);
+}
+
+void
+abd_update_scatter_stats(abd_t *abd, abd_stats_op_t op)
+{
+ ASSERT(op == ABDSTAT_INCR || op == ABDSTAT_DECR);
+ int waste = P2ROUNDUP(abd->abd_size, PAGESIZE) - abd->abd_size;
+ if (op == ABDSTAT_INCR) {
+ ABDSTAT_BUMP(abdstat_scatter_cnt);
+ ABDSTAT_INCR(abdstat_scatter_data_size, abd->abd_size);
+ ABDSTAT_INCR(abdstat_scatter_chunk_waste, waste);
+ arc_space_consume(waste, ARC_SPACE_ABD_CHUNK_WASTE);
+ } else {
+ ABDSTAT_BUMPDOWN(abdstat_scatter_cnt);
+ ABDSTAT_INCR(abdstat_scatter_data_size, -(int)abd->abd_size);
+ ABDSTAT_INCR(abdstat_scatter_chunk_waste, -waste);
+ arc_space_return(waste, ARC_SPACE_ABD_CHUNK_WASTE);
+ }
+}
+
+void
+abd_update_linear_stats(abd_t *abd, abd_stats_op_t op)
+{
+ ASSERT(op == ABDSTAT_INCR || op == ABDSTAT_DECR);
+ if (op == ABDSTAT_INCR) {
+ ABDSTAT_BUMP(abdstat_linear_cnt);
+ ABDSTAT_INCR(abdstat_linear_data_size, abd->abd_size);
+ } else {
+ ABDSTAT_BUMPDOWN(abdstat_linear_cnt);
+ ABDSTAT_INCR(abdstat_linear_data_size, -(int)abd->abd_size);
+ }
+}
+
+void
+abd_verify_scatter(abd_t *abd)
+{
+ size_t n;
+ int i = 0;
+ struct scatterlist *sg = NULL;
+
+ ASSERT3U(ABD_SCATTER(abd).abd_nents, >, 0);
+ ASSERT3U(ABD_SCATTER(abd).abd_offset, <,
+ ABD_SCATTER(abd).abd_sgl->length);
+ n = ABD_SCATTER(abd).abd_nents;
+ abd_for_each_sg(abd, sg, n, i) {
+ ASSERT3P(sg_page(sg), !=, NULL);
+ }
+}
+
+static void
+abd_free_zero_scatter(void)
+{
+ ABDSTAT_BUMPDOWN(abdstat_scatter_cnt);
+ ABDSTAT_INCR(abdstat_scatter_data_size, -(int)PAGESIZE);
+ ABDSTAT_BUMPDOWN(abdstat_scatter_page_multi_chunk);
+
+ abd_free_sg_table(abd_zero_scatter);
+ abd_free_struct(abd_zero_scatter);
+ abd_zero_scatter = NULL;
+ ASSERT3P(abd_zero_page, !=, NULL);
+#if defined(_KERNEL)
+ abd_unmark_zfs_page(abd_zero_page);
+ __free_page(abd_zero_page);
+#else
+ umem_free(abd_zero_page, PAGESIZE);
+#endif /* _KERNEL */
+}
+
+void
+abd_init(void)
+{
+ int i;
+
+ abd_cache = kmem_cache_create("abd_t", sizeof (abd_t),
+ 0, NULL, NULL, NULL, NULL, NULL, 0);
+
+ abd_ksp = kstat_create("zfs", 0, "abdstats", "misc", KSTAT_TYPE_NAMED,
+ sizeof (abd_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
+ if (abd_ksp != NULL) {
+ for (i = 0; i < MAX_ORDER; i++) {
+ snprintf(abd_stats.abdstat_scatter_orders[i].name,
+ KSTAT_STRLEN, "scatter_order_%d", i);
+ abd_stats.abdstat_scatter_orders[i].data_type =
+ KSTAT_DATA_UINT64;
+ }
+ abd_ksp->ks_data = &abd_stats;
+ kstat_install(abd_ksp);
+ }
+
+ abd_alloc_zero_scatter();
+}
+
+void
+abd_fini(void)
+{
+ abd_free_zero_scatter();
+
+ if (abd_ksp != NULL) {
+ kstat_delete(abd_ksp);
+ abd_ksp = NULL;
+ }
+
+ if (abd_cache) {
+ kmem_cache_destroy(abd_cache);
+ abd_cache = NULL;
+ }
+}
+
+void
+abd_free_linear_page(abd_t *abd)
+{
+ /* Transform it back into a scatter ABD for freeing */
+ struct scatterlist *sg = abd->abd_u.abd_linear.abd_sgl;
+ abd->abd_flags &= ~ABD_FLAG_LINEAR;
+ abd->abd_flags &= ~ABD_FLAG_LINEAR_PAGE;
+ ABD_SCATTER(abd).abd_nents = 1;
+ ABD_SCATTER(abd).abd_offset = 0;
+ ABD_SCATTER(abd).abd_sgl = sg;
+ abd_free_chunks(abd);
+
+ abd_update_scatter_stats(abd, ABDSTAT_DECR);
+}
+
+/*
+ * If we're going to use this ABD for doing I/O using the block layer, the
+ * consumer of the ABD data doesn't care if it's scattered or not, and we don't
+ * plan to store this ABD in memory for a long period of time, we should
+ * allocate the ABD type that requires the least data copying to do the I/O.
+ *
+ * On Linux the optimal thing to do would be to use abd_get_offset() and
+ * construct a new ABD which shares the original pages thereby eliminating
+ * the copy. But for the moment a new linear ABD is allocated until this
+ * performance optimization can be implemented.
+ */
+abd_t *
+abd_alloc_for_io(size_t size, boolean_t is_metadata)
+{
+ return (abd_alloc(size, is_metadata));
+}
+
+abd_t *
+abd_get_offset_scatter(abd_t *abd, abd_t *sabd, size_t off)
+{
+ int i = 0;
+ struct scatterlist *sg = NULL;
+
+ abd_verify(sabd);
+ ASSERT3U(off, <=, sabd->abd_size);
+
+ size_t new_offset = ABD_SCATTER(sabd).abd_offset + off;
+
+ if (abd == NULL)
+ abd = abd_alloc_struct(0);
+
+ /*
+ * Even if this buf is filesystem metadata, we only track that
+ * if we own the underlying data buffer, which is not true in
+ * this case. Therefore, we don't ever use ABD_FLAG_META here.
+ */
+
+ abd_for_each_sg(sabd, sg, ABD_SCATTER(sabd).abd_nents, i) {
+ if (new_offset < sg->length)
+ break;
+ new_offset -= sg->length;
+ }
+
+ ABD_SCATTER(abd).abd_sgl = sg;
+ ABD_SCATTER(abd).abd_offset = new_offset;
+ ABD_SCATTER(abd).abd_nents = ABD_SCATTER(sabd).abd_nents - i;
+
+ return (abd);
+}
+
+/*
+ * Initialize the abd_iter.
+ */
+void
+abd_iter_init(struct abd_iter *aiter, abd_t *abd)
+{
+ ASSERT(!abd_is_gang(abd));
+ abd_verify(abd);
+ aiter->iter_abd = abd;
+ aiter->iter_mapaddr = NULL;
+ aiter->iter_mapsize = 0;
+ aiter->iter_pos = 0;
+ if (abd_is_linear(abd)) {
+ aiter->iter_offset = 0;
+ aiter->iter_sg = NULL;
+ } else {
+ aiter->iter_offset = ABD_SCATTER(abd).abd_offset;
+ aiter->iter_sg = ABD_SCATTER(abd).abd_sgl;
+ }
+}
+
+/*
+ * This is just a helper function to see if we have exhausted the
+ * abd_iter and reached the end.
+ */
+boolean_t
+abd_iter_at_end(struct abd_iter *aiter)
+{
+ return (aiter->iter_pos == aiter->iter_abd->abd_size);
+}
+
+/*
+ * Advance the iterator by a certain amount. Cannot be called when a chunk is
+ * in use. This can be safely called when the aiter has already been
+ * exhausted, in which case this does nothing.
+ */
+void
+abd_iter_advance(struct abd_iter *aiter, size_t amount)
+{
+ ASSERT3P(aiter->iter_mapaddr, ==, NULL);
+ ASSERT0(aiter->iter_mapsize);
+
+ /* There's nothing left to advance to, so do nothing */
+ if (abd_iter_at_end(aiter))
+ return;
+
+ aiter->iter_pos += amount;
+ aiter->iter_offset += amount;
+ if (!abd_is_linear(aiter->iter_abd)) {
+ while (aiter->iter_offset >= aiter->iter_sg->length) {
+ aiter->iter_offset -= aiter->iter_sg->length;
+ aiter->iter_sg = sg_next(aiter->iter_sg);
+ if (aiter->iter_sg == NULL) {
+ ASSERT0(aiter->iter_offset);
+ break;
+ }
+ }
+ }
+}
+
+/*
+ * Map the current chunk into aiter. This can be safely called when the aiter
+ * has already been exhausted, in which case this does nothing.
+ */
+void
+abd_iter_map(struct abd_iter *aiter)
+{
+ void *paddr;
+ size_t offset = 0;
+
+ ASSERT3P(aiter->iter_mapaddr, ==, NULL);
+ ASSERT0(aiter->iter_mapsize);
+
+ /* There's nothing left to iterate over, so do nothing */
+ if (abd_iter_at_end(aiter))
+ return;
+
+ if (abd_is_linear(aiter->iter_abd)) {
+ ASSERT3U(aiter->iter_pos, ==, aiter->iter_offset);
+ offset = aiter->iter_offset;
+ aiter->iter_mapsize = aiter->iter_abd->abd_size - offset;
+ paddr = ABD_LINEAR_BUF(aiter->iter_abd);
+ } else {
+ offset = aiter->iter_offset;
+ aiter->iter_mapsize = MIN(aiter->iter_sg->length - offset,
+ aiter->iter_abd->abd_size - aiter->iter_pos);
+
+ paddr = zfs_kmap_atomic(sg_page(aiter->iter_sg),
+ km_table[aiter->iter_km]);
+ }
+
+ aiter->iter_mapaddr = (char *)paddr + offset;
+}
+
+/*
+ * Unmap the current chunk from aiter. This can be safely called when the aiter
+ * has already been exhausted, in which case this does nothing.
+ */
+void
+abd_iter_unmap(struct abd_iter *aiter)
+{
+ /* There's nothing left to unmap, so do nothing */
+ if (abd_iter_at_end(aiter))
+ return;
+
+ if (!abd_is_linear(aiter->iter_abd)) {
+ /* LINTED E_FUNC_SET_NOT_USED */
+ zfs_kunmap_atomic(aiter->iter_mapaddr - aiter->iter_offset,
+ km_table[aiter->iter_km]);
+ }
+
+ ASSERT3P(aiter->iter_mapaddr, !=, NULL);
+ ASSERT3U(aiter->iter_mapsize, >, 0);
+
+ aiter->iter_mapaddr = NULL;
+ aiter->iter_mapsize = 0;
+}
+
+void
+abd_cache_reap_now(void)
+{
+}
+
+#if defined(_KERNEL)
+/*
+ * bio_nr_pages for ABD.
+ * @off is the offset in @abd
+ */
+unsigned long
+abd_nr_pages_off(abd_t *abd, unsigned int size, size_t off)
+{
+ unsigned long pos;
+
+ if (abd_is_gang(abd)) {
+ unsigned long count = 0;
+
+ for (abd_t *cabd = abd_gang_get_offset(abd, &off);
+ cabd != NULL && size != 0;
+ cabd = list_next(&ABD_GANG(abd).abd_gang_chain, cabd)) {
+ ASSERT3U(off, <, cabd->abd_size);
+ int mysize = MIN(size, cabd->abd_size - off);
+ count += abd_nr_pages_off(cabd, mysize, off);
+ size -= mysize;
+ off = 0;
+ }
+ return (count);
+ }
+
+ if (abd_is_linear(abd))
+ pos = (unsigned long)abd_to_buf(abd) + off;
+ else
+ pos = ABD_SCATTER(abd).abd_offset + off;
+
+ return (((pos + size + PAGESIZE - 1) >> PAGE_SHIFT) -
+ (pos >> PAGE_SHIFT));
+}
+
+static unsigned int
+bio_map(struct bio *bio, void *buf_ptr, unsigned int bio_size)
+{
+ unsigned int offset, size, i;
+ struct page *page;
+
+ offset = offset_in_page(buf_ptr);
+ for (i = 0; i < bio->bi_max_vecs; i++) {
+ size = PAGE_SIZE - offset;
+
+ if (bio_size <= 0)
+ break;
+
+ if (size > bio_size)
+ size = bio_size;
+
+ if (is_vmalloc_addr(buf_ptr))
+ page = vmalloc_to_page(buf_ptr);
+ else
+ page = virt_to_page(buf_ptr);
+
+ /*
+ * Some network-related block devices use tcp_sendpage, which
+ * doesn't behave well when given a 0-count page; this is a
+ * safety net to catch them.
+ */
+ ASSERT3S(page_count(page), >, 0);
+
+ if (bio_add_page(bio, page, size, offset) != size)
+ break;
+
+ buf_ptr += size;
+ bio_size -= size;
+ offset = 0;
+ }
+
+ return (bio_size);
+}
+
+/*
+ * bio_map for gang ABD.
+ */
+static unsigned int
+abd_gang_bio_map_off(struct bio *bio, abd_t *abd,
+ unsigned int io_size, size_t off)
+{
+ ASSERT(abd_is_gang(abd));
+
+ for (abd_t *cabd = abd_gang_get_offset(abd, &off);
+ cabd != NULL;
+ cabd = list_next(&ABD_GANG(abd).abd_gang_chain, cabd)) {
+ ASSERT3U(off, <, cabd->abd_size);
+ int size = MIN(io_size, cabd->abd_size - off);
+ int remainder = abd_bio_map_off(bio, cabd, size, off);
+ io_size -= (size - remainder);
+ if (io_size == 0 || remainder > 0)
+ return (io_size);
+ off = 0;
+ }
+ ASSERT0(io_size);
+ return (io_size);
+}
+
+/*
+ * bio_map for ABD.
+ * @off is the offset in @abd
+ * Remaining IO size is returned
+ */
+unsigned int
+abd_bio_map_off(struct bio *bio, abd_t *abd,
+ unsigned int io_size, size_t off)
+{
+ struct abd_iter aiter;
+
+ ASSERT3U(io_size, <=, abd->abd_size - off);
+ if (abd_is_linear(abd))
+ return (bio_map(bio, ((char *)abd_to_buf(abd)) + off, io_size));
+
+ ASSERT(!abd_is_linear(abd));
+ if (abd_is_gang(abd))
+ return (abd_gang_bio_map_off(bio, abd, io_size, off));
+
+ abd_iter_init(&aiter, abd);
+ abd_iter_advance(&aiter, off);
+
+ for (int i = 0; i < bio->bi_max_vecs; i++) {
+ struct page *pg;
+ size_t len, sgoff, pgoff;
+ struct scatterlist *sg;
+
+ if (io_size <= 0)
+ break;
+
+ sg = aiter.iter_sg;
+ sgoff = aiter.iter_offset;
+ pgoff = sgoff & (PAGESIZE - 1);
+ len = MIN(io_size, PAGESIZE - pgoff);
+ ASSERT(len > 0);
+
+ pg = nth_page(sg_page(sg), sgoff >> PAGE_SHIFT);
+ if (bio_add_page(bio, pg, len, pgoff) != len)
+ break;
+
+ io_size -= len;
+ abd_iter_advance(&aiter, len);
+ }
+
+ return (io_size);
+}
+
+/* Tunable Parameters */
+module_param(zfs_abd_scatter_enabled, int, 0644);
+MODULE_PARM_DESC(zfs_abd_scatter_enabled,
+ "Toggle whether ABD allocations must be linear.");
+module_param(zfs_abd_scatter_min_size, int, 0644);
+MODULE_PARM_DESC(zfs_abd_scatter_min_size,
+ "Minimum size of scatter allocations.");
+/* CSTYLED */
+module_param(zfs_abd_scatter_max_order, uint, 0644);
+MODULE_PARM_DESC(zfs_abd_scatter_max_order,
+ "Maximum order allocation used for a scatter ABD.");
+#endif
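
The page-count expression at the end of abd_nr_pages_off() above,
((pos + size + PAGESIZE - 1) >> PAGE_SHIFT) - (pos >> PAGE_SHIFT), counts
every page a byte range touches, including partially covered first and last
pages. A minimal user-space sketch of the same arithmetic, assuming a 4 KiB
page size (the helper name is illustrative only and not part of the patch):

    #include <stdio.h>

    #define PAGESIZE   4096UL
    #define PAGE_SHIFT 12

    /* Pages touched by the byte range [pos, pos + size). */
    static unsigned long
    pages_touched(unsigned long pos, unsigned long size)
    {
            return (((pos + size + PAGESIZE - 1) >> PAGE_SHIFT) -
                (pos >> PAGE_SHIFT));
    }

    int
    main(void)
    {
            /* 100 bytes starting 50 bytes before a page boundary: 2 pages. */
            printf("%lu\n", pages_touched(PAGESIZE - 50, 100));
            /* A page-aligned 8 KiB range: exactly 2 pages. */
            printf("%lu\n", pages_touched(2 * PAGESIZE, 2 * PAGESIZE));
            return (0);
    }
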
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/arc_os.c b/sys/contrib/openzfs/module/os/linux/zfs/arc_os.c
new file mode 100644
index 000000000000..83d4a3d8496c
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/zfs/arc_os.c
@@ -0,0 +1,530 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2018, Joyent, Inc.
+ * Copyright (c) 2011, 2019 by Delphix. All rights reserved.
+ * Copyright (c) 2014 by Saso Kiselkov. All rights reserved.
+ * Copyright 2017 Nexenta Systems, Inc. All rights reserved.
+ */
+
+#include <sys/spa.h>
+#include <sys/zio.h>
+#include <sys/spa_impl.h>
+#include <sys/zio_compress.h>
+#include <sys/zio_checksum.h>
+#include <sys/zfs_context.h>
+#include <sys/arc.h>
+#include <sys/zfs_refcount.h>
+#include <sys/vdev.h>
+#include <sys/vdev_trim.h>
+#include <sys/vdev_impl.h>
+#include <sys/dsl_pool.h>
+#include <sys/zio_checksum.h>
+#include <sys/multilist.h>
+#include <sys/abd.h>
+#include <sys/zil.h>
+#include <sys/fm/fs/zfs.h>
+#ifdef _KERNEL
+#include <sys/shrinker.h>
+#include <sys/vmsystm.h>
+#include <sys/zpl.h>
+#include <linux/page_compat.h>
+#include <linux/notifier.h>
+#include <linux/memory.h>
+#endif
+#include <sys/callb.h>
+#include <sys/kstat.h>
+#include <sys/zthr.h>
+#include <zfs_fletcher.h>
+#include <sys/arc_impl.h>
+#include <sys/trace_zfs.h>
+#include <sys/aggsum.h>
+
+/*
+ * This is a limit on how many pages the ARC shrinker makes available for
+ * eviction in response to one page allocation attempt. Note that in
+ * practice, the kernel's shrinker can ask us to evict up to about 4x this
+ * for one allocation attempt.
+ *
+ * The default limit of 10,000 (in practice, 160MB per allocation attempt
+ * with 4K pages) limits the amount of time spent attempting to reclaim ARC
+ * memory to less than 100ms per allocation attempt, even with a small
+ * average compressed block size of ~8KB.
+ *
+ * See also the comment in arc_shrinker_count().
+ * Set to 0 to disable limit.
+ */
+int zfs_arc_shrinker_limit = 10000;
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+static struct notifier_block arc_hotplug_callback_mem_nb;
+#endif
+
+/*
+ * Return a default max arc size based on the amount of physical memory.
+ */
+uint64_t
+arc_default_max(uint64_t min, uint64_t allmem)
+{
+ /* Default to 1/2 of all memory. */
+ return (MAX(allmem / 2, min));
+}
+
+#ifdef _KERNEL
+/*
+ * Return the maximum amount of memory that we could possibly use. Reduced
+ * to half of all memory in user space, which is primarily used for testing.
+ */
+uint64_t
+arc_all_memory(void)
+{
+#ifdef CONFIG_HIGHMEM
+ return (ptob(zfs_totalram_pages - zfs_totalhigh_pages));
+#else
+ return (ptob(zfs_totalram_pages));
+#endif /* CONFIG_HIGHMEM */
+}
+
+/*
+ * Return the amount of memory that is considered free. In user space,
+ * which is primarily used for testing, we pretend that free memory ranges
+ * from 0-20% of all memory.
+ */
+uint64_t
+arc_free_memory(void)
+{
+#ifdef CONFIG_HIGHMEM
+ struct sysinfo si;
+ si_meminfo(&si);
+ return (ptob(si.freeram - si.freehigh));
+#else
+ return (ptob(nr_free_pages() +
+ nr_inactive_file_pages()));
+#endif /* CONFIG_HIGHMEM */
+}
+
+/*
+ * Return the amount of memory that can be consumed before reclaim will be
+ * needed. Positive if there is sufficient free memory, negative indicates
+ * the amount of memory that needs to be freed up.
+ */
+int64_t
+arc_available_memory(void)
+{
+ return (arc_free_memory() - arc_sys_free);
+}
+
+static uint64_t
+arc_evictable_memory(void)
+{
+ int64_t asize = aggsum_value(&arc_size);
+ uint64_t arc_clean =
+ zfs_refcount_count(&arc_mru->arcs_esize[ARC_BUFC_DATA]) +
+ zfs_refcount_count(&arc_mru->arcs_esize[ARC_BUFC_METADATA]) +
+ zfs_refcount_count(&arc_mfu->arcs_esize[ARC_BUFC_DATA]) +
+ zfs_refcount_count(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]);
+ uint64_t arc_dirty = MAX((int64_t)asize - (int64_t)arc_clean, 0);
+
+ /*
+ * Scale reported evictable memory in proportion to page cache, cap
+ * at specified min/max.
+ */
+ uint64_t min = (ptob(nr_file_pages()) / 100) * zfs_arc_pc_percent;
+ min = MAX(arc_c_min, MIN(arc_c_max, min));
+
+ if (arc_dirty >= min)
+ return (arc_clean);
+
+ return (MAX((int64_t)asize - (int64_t)min, 0));
+}
+
+/*
+ * The _count() function returns the number of free-able objects.
+ * The _scan() function returns the number of objects that were freed.
+ */
+static unsigned long
+arc_shrinker_count(struct shrinker *shrink, struct shrink_control *sc)
+{
+ /*
+ * __GFP_FS won't be set if we are called from ZFS code (see
+ * kmem_flags_convert(), which removes it). To avoid a deadlock, we
+ * don't allow evicting in this case. We return 0 rather than
+ * SHRINK_STOP so that the shrinker logic doesn't accumulate a
+ * deficit against us.
+ */
+ if (!(sc->gfp_mask & __GFP_FS)) {
+ return (0);
+ }
+
+ /*
+ * This code is reached in the "direct reclaim" case, where the
+ * kernel (outside ZFS) is trying to allocate a page, and the system
+ * is low on memory.
+ *
+ * The kernel's shrinker code doesn't understand how many pages the
+ * ARC's callback actually frees, so it may ask the ARC to shrink a
+ * lot for one page allocation. This is problematic because it may
+ * take a long time, thus delaying the page allocation, and because
+ * it may force the ARC to unnecessarily shrink very small.
+ *
+ * Therefore, we limit the amount of data that we say is evictable,
+ * which limits the amount that the shrinker will ask us to evict for
+ * one page allocation attempt.
+ *
+ * In practice, we may be asked to shrink 4x the limit to satisfy one
+ * page allocation, before the kernel's shrinker code gives up on us.
+ * When that happens, we rely on the kernel code to find the pages
+ * that we freed before invoking the OOM killer. This happens in
+ * __alloc_pages_slowpath(), which retries and finds the pages we
+ * freed when it calls get_page_from_freelist().
+ *
+ * See also the comment above zfs_arc_shrinker_limit.
+ */
+ int64_t limit = zfs_arc_shrinker_limit != 0 ?
+ zfs_arc_shrinker_limit : INT64_MAX;
+ return (MIN(limit, btop((int64_t)arc_evictable_memory())));
+}
+
+static unsigned long
+arc_shrinker_scan(struct shrinker *shrink, struct shrink_control *sc)
+{
+ ASSERT((sc->gfp_mask & __GFP_FS) != 0);
+
+ /* The arc is considered warm once reclaim has occurred */
+ if (unlikely(arc_warm == B_FALSE))
+ arc_warm = B_TRUE;
+
+ /*
+ * Evict the requested number of pages by reducing arc_c and waiting
+ * for the requested amount of data to be evicted.
+ */
+ arc_reduce_target_size(ptob(sc->nr_to_scan));
+ arc_wait_for_eviction(ptob(sc->nr_to_scan));
+ if (current->reclaim_state != NULL)
+ current->reclaim_state->reclaimed_slab += sc->nr_to_scan;
+
+ /*
+ * We are experiencing memory pressure which the arc_evict_zthr was
+ * unable to keep up with. Set arc_no_grow to briefly pause arc
+ * growth to avoid compounding the memory pressure.
+ */
+ arc_no_grow = B_TRUE;
+
+ /*
+ * When direct reclaim is observed it usually indicates a rapid
+ * increase in memory pressure. This occurs because the kswapd
+ * threads were unable to asynchronously keep enough free memory
+ * available.
+ */
+ if (current_is_kswapd()) {
+ ARCSTAT_BUMP(arcstat_memory_indirect_count);
+ } else {
+ ARCSTAT_BUMP(arcstat_memory_direct_count);
+ }
+
+ return (sc->nr_to_scan);
+}
+
+SPL_SHRINKER_DECLARE(arc_shrinker,
+ arc_shrinker_count, arc_shrinker_scan, DEFAULT_SEEKS);
+
+int
+arc_memory_throttle(spa_t *spa, uint64_t reserve, uint64_t txg)
+{
+ uint64_t free_memory = arc_free_memory();
+
+ if (free_memory > arc_all_memory() * arc_lotsfree_percent / 100)
+ return (0);
+
+ if (txg > spa->spa_lowmem_last_txg) {
+ spa->spa_lowmem_last_txg = txg;
+ spa->spa_lowmem_page_load = 0;
+ }
+ /*
+ * If we are in pageout, we know that memory is already tight and
+ * the arc is already going to be evicting, so we just want to
+ * continue to let page writes occur as quickly as possible.
+ */
+ if (current_is_kswapd()) {
+ if (spa->spa_lowmem_page_load >
+ MAX(arc_sys_free / 4, free_memory) / 4) {
+ DMU_TX_STAT_BUMP(dmu_tx_memory_reclaim);
+ return (SET_ERROR(ERESTART));
+ }
+ /* Note: reserve is inflated, so we deflate */
+ atomic_add_64(&spa->spa_lowmem_page_load, reserve / 8);
+ return (0);
+ } else if (spa->spa_lowmem_page_load > 0 && arc_reclaim_needed()) {
+ /* memory is low, delay before restarting */
+ ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
+ DMU_TX_STAT_BUMP(dmu_tx_memory_reclaim);
+ return (SET_ERROR(EAGAIN));
+ }
+ spa->spa_lowmem_page_load = 0;
+ return (0);
+}
+
+static void
+arc_set_sys_free(uint64_t allmem)
+{
+ /*
+ * The ARC tries to keep at least this much memory available for the
+ * system. This gives the ARC time to shrink in response to memory
+ * pressure, before running completely out of memory and invoking the
+ * direct-reclaim ARC shrinker.
+ *
+ * This should be more than twice high_wmark_pages(), so that
+ * arc_wait_for_eviction() will wait until at least the
+ * high_wmark_pages() are free (see arc_evict_state_impl()).
+ *
+ * Note: Even when the system is very low on memory, the kernel's
+ * shrinker code may only ask for one "batch" of pages (512KB) to be
+ * evicted. If concurrent allocations consume these pages, there may
+ * still be insufficient free pages, and the OOM killer takes action.
+ *
+ * By setting arc_sys_free large enough, and having
+ * arc_wait_for_eviction() wait until there is at least arc_sys_free/2
+ * free memory, it is much less likely that concurrent allocations can
+ * consume all the memory that was evicted before checking for
+ * OOM.
+ *
+ * It's hard to iterate the zones from a linux kernel module, which
+ * makes it difficult to determine the watermark dynamically. Instead
+ * we compute the maximum high watermark for this system, based
+ * on the amount of memory, assuming default parameters on Linux kernel
+ * 5.3.
+ */
+
+ /*
+ * Base wmark_low is 4 * the square root of Kbytes of RAM.
+ */
+ long wmark = 4 * int_sqrt(allmem/1024) * 1024;
+
+ /*
+ * Clamp to between 128K and 64MB.
+ */
+ wmark = MAX(wmark, 128 * 1024);
+ wmark = MIN(wmark, 64 * 1024 * 1024);
+
+ /*
+ * watermark_boost can increase the wmark by up to 150%.
+ */
+ wmark += wmark * 150 / 100;
+
+ /*
+ * arc_sys_free needs to be more than 2x the watermark, because
+ * arc_wait_for_eviction() waits for half of arc_sys_free. Bump this up
+ * to 3x to ensure we're above it.
+ */
+ arc_sys_free = wmark * 3 + allmem / 32;
+}
+
+void
+arc_lowmem_init(void)
+{
+ uint64_t allmem = arc_all_memory();
+
+ /*
+ * Register a shrinker to support synchronous (direct) memory
+ * reclaim from the arc. This is done to prevent kswapd from
+ * swapping out pages when it is preferable to shrink the arc.
+ */
+ spl_register_shrinker(&arc_shrinker);
+ arc_set_sys_free(allmem);
+}
+
+void
+arc_lowmem_fini(void)
+{
+ spl_unregister_shrinker(&arc_shrinker);
+}
+
+int
+param_set_arc_long(const char *buf, zfs_kernel_param_t *kp)
+{
+ int error;
+
+ error = param_set_long(buf, kp);
+ if (error < 0)
+ return (SET_ERROR(error));
+
+ arc_tuning_update(B_TRUE);
+
+ return (0);
+}
+
+int
+param_set_arc_int(const char *buf, zfs_kernel_param_t *kp)
+{
+ int error;
+
+ error = param_set_int(buf, kp);
+ if (error < 0)
+ return (SET_ERROR(error));
+
+ arc_tuning_update(B_TRUE);
+
+ return (0);
+}
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+/* ARGSUSED */
+static int
+arc_hotplug_callback(struct notifier_block *self, unsigned long action,
+ void *arg)
+{
+ uint64_t allmem = arc_all_memory();
+ if (action != MEM_ONLINE)
+ return (NOTIFY_OK);
+
+ arc_set_limits(allmem);
+
+#ifdef __LP64__
+ if (zfs_dirty_data_max_max == 0)
+ zfs_dirty_data_max_max = MIN(4ULL * 1024 * 1024 * 1024,
+ allmem * zfs_dirty_data_max_max_percent / 100);
+#else
+ if (zfs_dirty_data_max_max == 0)
+ zfs_dirty_data_max_max = MIN(1ULL * 1024 * 1024 * 1024,
+ allmem * zfs_dirty_data_max_max_percent / 100);
+#endif
+
+ arc_set_sys_free(allmem);
+ return (NOTIFY_OK);
+}
+#endif
+
+void
+arc_register_hotplug(void)
+{
+#ifdef CONFIG_MEMORY_HOTPLUG
+ arc_hotplug_callback_mem_nb.notifier_call = arc_hotplug_callback;
+ /* There is no significance to the value 100 */
+ arc_hotplug_callback_mem_nb.priority = 100;
+ register_memory_notifier(&arc_hotplug_callback_mem_nb);
+#endif
+}
+
+void
+arc_unregister_hotplug(void)
+{
+#ifdef CONFIG_MEMORY_HOTPLUG
+ unregister_memory_notifier(&arc_hotplug_callback_mem_nb);
+#endif
+}
+#else /* _KERNEL */
+int64_t
+arc_available_memory(void)
+{
+ int64_t lowest = INT64_MAX;
+
+ /* Every 100 calls, free a small amount */
+ if (spa_get_random(100) == 0)
+ lowest = -1024;
+
+ return (lowest);
+}
+
+int
+arc_memory_throttle(spa_t *spa, uint64_t reserve, uint64_t txg)
+{
+ return (0);
+}
+
+uint64_t
+arc_all_memory(void)
+{
+ return (ptob(physmem) / 2);
+}
+
+uint64_t
+arc_free_memory(void)
+{
+ return (spa_get_random(arc_all_memory() * 20 / 100));
+}
+
+void
+arc_register_hotplug(void)
+{
+}
+
+void
+arc_unregister_hotplug(void)
+{
+}
+#endif /* _KERNEL */
+
+/*
+ * Helper function for arc_prune_async(); it is responsible for safely
+ * handling the execution of a registered arc_prune_func_t.
+ */
+static void
+arc_prune_task(void *ptr)
+{
+ arc_prune_t *ap = (arc_prune_t *)ptr;
+ arc_prune_func_t *func = ap->p_pfunc;
+
+ if (func != NULL)
+ func(ap->p_adjust, ap->p_private);
+
+ zfs_refcount_remove(&ap->p_refcnt, func);
+}
+
+/*
+ * Notify registered consumers they must drop holds on a portion of the ARC
+ * buffers they reference. This provides a mechanism to ensure the ARC can
+ * honor the arc_meta_limit and reclaim otherwise pinned ARC buffers. This
+ * is analogous to dnlc_reduce_cache() but more generic.
+ *
+ * This operation is performed asynchronously so it may be safely called
+ * in the context of the arc_reclaim_thread(). A reference is taken here
+ * for each registered arc_prune_t and the arc_prune_task() is responsible
+ * for releasing it once the registered arc_prune_func_t has completed.
+ */
+void
+arc_prune_async(int64_t adjust)
+{
+ arc_prune_t *ap;
+
+ mutex_enter(&arc_prune_mtx);
+ for (ap = list_head(&arc_prune_list); ap != NULL;
+ ap = list_next(&arc_prune_list, ap)) {
+
+ if (zfs_refcount_count(&ap->p_refcnt) >= 2)
+ continue;
+
+ zfs_refcount_add(&ap->p_refcnt, ap->p_pfunc);
+ ap->p_adjust = adjust;
+ if (taskq_dispatch(arc_prune_taskq, arc_prune_task,
+ ap, TQ_SLEEP) == TASKQID_INVALID) {
+ zfs_refcount_remove(&ap->p_refcnt, ap->p_pfunc);
+ continue;
+ }
+ ARCSTAT_BUMP(arcstat_prune);
+ }
+ mutex_exit(&arc_prune_mtx);
+}
+
+/* BEGIN CSTYLED */
+ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, shrinker_limit, INT, ZMOD_RW,
+ "Limit on number of pages that ARC shrinker can reclaim at once");
+/* END CSTYLED */
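
As a rough illustration of the arc_set_sys_free() arithmetic above, a
user-space sketch of the same computation follows (int_sqrt() is modeled
here with floor(sqrt()); the 16 GiB figure is only an example and not part
of the patch). For a 16 GiB machine the floor works out to roughly 632 MiB:

    #include <stdio.h>
    #include <stdint.h>
    #include <math.h>

    #define MAX(a, b) ((a) > (b) ? (a) : (b))
    #define MIN(a, b) ((a) < (b) ? (a) : (b))

    /* Mirror of arc_set_sys_free(): the free-memory floor the ARC targets. */
    static uint64_t
    estimate_arc_sys_free(uint64_t allmem)
    {
            /* Base wmark_low: 4 * sqrt(Kbytes of RAM), converted to bytes. */
            int64_t wmark = 4 * (int64_t)sqrt((double)(allmem / 1024)) * 1024;

            /* Clamp to between 128K and 64MB. */
            wmark = MAX(wmark, 128 * 1024);
            wmark = MIN(wmark, 64 * 1024 * 1024);

            /* watermark_boost can raise the wmark by up to 150%. */
            wmark += wmark * 150 / 100;

            /* 3x the boosted wmark plus 1/32 of all memory. */
            return (wmark * 3 + allmem / 32);
    }

    int
    main(void)
    {
            uint64_t allmem = 16ULL << 30;  /* 16 GiB */

            /* Prints "632 MiB" for a 16 GiB machine. */
            printf("%llu MiB\n",
                (unsigned long long)(estimate_arc_sys_free(allmem) >> 20));
            return (0);
    }
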
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/mmp_os.c b/sys/contrib/openzfs/module/os/linux/zfs/mmp_os.c
new file mode 100644
index 000000000000..ff3ef1bf6ad9
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/zfs/mmp_os.c
@@ -0,0 +1,41 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2017 by Lawrence Livermore National Security, LLC.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/mmp.h>
+
+int
+param_set_multihost_interval(const char *val, zfs_kernel_param_t *kp)
+{
+ int ret;
+
+ ret = param_set_ulong(val, kp);
+ if (ret < 0)
+ return (ret);
+
+ if (spa_mode_global != SPA_MODE_UNINIT)
+ mmp_signal_all_threads();
+
+ return (ret);
+}
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/policy.c b/sys/contrib/openzfs/module/os/linux/zfs/policy.c
new file mode 100644
index 000000000000..8780d7f6c70a
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/zfs/policy.c
@@ -0,0 +1,375 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2013, Joyent, Inc. All rights reserved.
+ * Copyright (C) 2016 Lawrence Livermore National Security, LLC.
+ *
+ * For Linux the vast majority of this enforcement is already handled via
+ * the standard Linux VFS permission checks. However certain administrative
+ * commands which bypass the standard mechanisms may need to make use of
+ * this functionality.
+ */
+
+#include <sys/policy.h>
+#include <linux/security.h>
+#include <linux/vfs_compat.h>
+
+/*
+ * The passed credentials cannot be directly verified because Linux only
+ * provides an interface to check the *current* process credentials. In
+ * order to handle this the capable() test is only run when the passed
+ * credentials match the current process credentials or the kcred. In
+ * all other cases this function must fail and return the passed err.
+ */
+static int
+priv_policy_ns(const cred_t *cr, int capability, int err,
+ struct user_namespace *ns)
+{
+ if (cr != CRED() && (cr != kcred))
+ return (err);
+
+#if defined(CONFIG_USER_NS)
+ if (!(ns ? ns_capable(ns, capability) : capable(capability)))
+#else
+ if (!capable(capability))
+#endif
+ return (err);
+
+ return (0);
+}
+
+static int
+priv_policy(const cred_t *cr, int capability, int err)
+{
+ return (priv_policy_ns(cr, capability, err, NULL));
+}
+
+static int
+priv_policy_user(const cred_t *cr, int capability, int err)
+{
+ /*
+ * All priv_policy_user checks are preceded by kuid/kgid_has_mapping()
+ * checks. If we cannot do them, we shouldn't be using ns_capable()
+ * since we don't know whether the affected files are valid in our
+ * namespace.
+ */
+#if defined(CONFIG_USER_NS)
+ return (priv_policy_ns(cr, capability, err, cr->user_ns));
+#else
+ return (priv_policy_ns(cr, capability, err, NULL));
+#endif
+}
+
+/*
+ * Checks for operations that are either client-only or are used by
+ * both clients and servers.
+ */
+int
+secpolicy_nfs(const cred_t *cr)
+{
+ return (priv_policy(cr, CAP_SYS_ADMIN, EPERM));
+}
+
+/*
+ * Catch all system configuration.
+ */
+int
+secpolicy_sys_config(const cred_t *cr, boolean_t checkonly)
+{
+ return (priv_policy(cr, CAP_SYS_ADMIN, EPERM));
+}
+
+/*
+ * Like secpolicy_vnode_access() but we get the actual wanted mode and the
+ * current mode of the file, not the missing bits.
+ *
+ * Enforced in the Linux VFS.
+ */
+int
+secpolicy_vnode_access2(const cred_t *cr, struct inode *ip, uid_t owner,
+ mode_t curmode, mode_t wantmode)
+{
+ return (0);
+}
+
+/*
+ * This is a special routine for ZFS; it is used to determine whether
+ * any of the privileges in effect allow any form of access to the
+ * file. There's no reason to audit this or any reason to record
+ * this. More work is needed to do the "KPLD" stuff.
+ */
+int
+secpolicy_vnode_any_access(const cred_t *cr, struct inode *ip, uid_t owner)
+{
+ if (crgetfsuid(cr) == owner)
+ return (0);
+
+ if (inode_owner_or_capable(ip))
+ return (0);
+
+#if defined(CONFIG_USER_NS)
+ if (!kuid_has_mapping(cr->user_ns, SUID_TO_KUID(owner)))
+ return (EPERM);
+#endif
+
+ if (priv_policy_user(cr, CAP_DAC_OVERRIDE, EPERM) == 0)
+ return (0);
+
+ if (priv_policy_user(cr, CAP_DAC_READ_SEARCH, EPERM) == 0)
+ return (0);
+
+ return (EPERM);
+}
+
+/*
+ * Determine if the subject can change the ownership of a file.
+ */
+int
+secpolicy_vnode_chown(const cred_t *cr, uid_t owner)
+{
+ if (crgetfsuid(cr) == owner)
+ return (0);
+
+#if defined(CONFIG_USER_NS)
+ if (!kuid_has_mapping(cr->user_ns, SUID_TO_KUID(owner)))
+ return (EPERM);
+#endif
+
+ return (priv_policy_user(cr, CAP_FOWNER, EPERM));
+}
+
+/*
+ * Determine if subject can change group ownership of a file.
+ */
+int
+secpolicy_vnode_create_gid(const cred_t *cr)
+{
+ return (priv_policy(cr, CAP_SETGID, EPERM));
+}
+
+/*
+ * Policy determines whether we can remove an entry from a directory,
+ * regardless of permission bits.
+ */
+int
+secpolicy_vnode_remove(const cred_t *cr)
+{
+ return (priv_policy(cr, CAP_FOWNER, EPERM));
+}
+
+/*
+ * Determine that the subject can modify the mode of a file. The allzone
+ * privilege is needed when modifying a root-owned object.
+ */
+int
+secpolicy_vnode_setdac(const cred_t *cr, uid_t owner)
+{
+ if (crgetfsuid(cr) == owner)
+ return (0);
+
+#if defined(CONFIG_USER_NS)
+ if (!kuid_has_mapping(cr->user_ns, SUID_TO_KUID(owner)))
+ return (EPERM);
+#endif
+
+ return (priv_policy_user(cr, CAP_FOWNER, EPERM));
+}
+
+/*
+ * Are we allowed to retain the set-uid/set-gid bits when
+ * changing ownership or when writing to a file?
+ * "issuid" should be true when set-uid; only in that case
+ * root ownership is checked (setgid is assumed).
+ *
+ * Enforced in the Linux VFS.
+ */
+int
+secpolicy_vnode_setid_retain(struct znode *zp __maybe_unused, const cred_t *cr,
+ boolean_t issuidroot)
+{
+ return (priv_policy_user(cr, CAP_FSETID, EPERM));
+}
+
+/*
+ * Determine that subject can set the file setgid flag.
+ */
+int
+secpolicy_vnode_setids_setgids(const cred_t *cr, gid_t gid)
+{
+#if defined(CONFIG_USER_NS)
+ if (!kgid_has_mapping(cr->user_ns, SGID_TO_KGID(gid)))
+ return (EPERM);
+#endif
+ if (crgetfsgid(cr) != gid && !groupmember(gid, cr))
+ return (priv_policy_user(cr, CAP_FSETID, EPERM));
+
+ return (0);
+}
+
+/*
+ * Determine if the subject can inject faults in the ZFS fault injection
+ * framework. Requires all privileges.
+ */
+int
+secpolicy_zinject(const cred_t *cr)
+{
+ return (priv_policy(cr, CAP_SYS_ADMIN, EACCES));
+}
+
+/*
+ * Determine if the subject has permission to manipulate ZFS datasets
+ * (not pools). Equivalent to the SYS_MOUNT privilege.
+ */
+int
+secpolicy_zfs(const cred_t *cr)
+{
+ return (priv_policy(cr, CAP_SYS_ADMIN, EACCES));
+}
+
+/*
+ * Equivalent to secpolicy_zfs(), but works even if the cred_t is not that of
+ * the current process. Takes both cred_t and proc_t so that this can work
+ * easily on all platforms.
+ *
+ * The has_capability() function was first exported in the 4.10 Linux kernel
+ * and then backported to some LTS kernels. Prior to this change there was no
+ * mechanism to perform this check, so EACCES is returned when the
+ * functionality is not present in the kernel.
+ */
+int
+secpolicy_zfs_proc(const cred_t *cr, proc_t *proc)
+{
+#if defined(HAVE_HAS_CAPABILITY)
+ if (!has_capability(proc, CAP_SYS_ADMIN))
+ return (EACCES);
+ return (0);
+#else
+ return (EACCES);
+#endif
+}
+
+void
+secpolicy_setid_clear(vattr_t *vap, cred_t *cr)
+{
+ if ((vap->va_mode & (S_ISUID | S_ISGID)) != 0 &&
+ secpolicy_vnode_setid_retain(NULL, cr,
+ (vap->va_mode & S_ISUID) != 0 &&
+ (vap->va_mask & AT_UID) != 0 && vap->va_uid == 0) != 0) {
+ vap->va_mask |= AT_MODE;
+ vap->va_mode &= ~(S_ISUID|S_ISGID);
+ }
+}
+
+/*
+ * Determine that subject can set the file setid flags.
+ */
+static int
+secpolicy_vnode_setid_modify(const cred_t *cr, uid_t owner)
+{
+ if (crgetfsuid(cr) == owner)
+ return (0);
+
+#if defined(CONFIG_USER_NS)
+ if (!kuid_has_mapping(cr->user_ns, SUID_TO_KUID(owner)))
+ return (EPERM);
+#endif
+
+ return (priv_policy_user(cr, CAP_FSETID, EPERM));
+}
+
+/*
+ * Determine that the subject can make a file "sticky".
+ *
+ * Enforced in the Linux VFS.
+ */
+static int
+secpolicy_vnode_stky_modify(const cred_t *cr)
+{
+ return (0);
+}
+
+int
+secpolicy_setid_setsticky_clear(struct inode *ip, vattr_t *vap,
+ const vattr_t *ovap, cred_t *cr)
+{
+ int error;
+
+ if ((vap->va_mode & S_ISUID) != 0 &&
+ (error = secpolicy_vnode_setid_modify(cr,
+ ovap->va_uid)) != 0) {
+ return (error);
+ }
+
+ /*
+ * Check privilege if attempting to set the
+ * sticky bit on a non-directory.
+ */
+ if (!S_ISDIR(ip->i_mode) && (vap->va_mode & S_ISVTX) != 0 &&
+ secpolicy_vnode_stky_modify(cr) != 0) {
+ vap->va_mode &= ~S_ISVTX;
+ }
+
+ /*
+ * Check for privilege if attempting to set the
+ * group-id bit.
+ */
+ if ((vap->va_mode & S_ISGID) != 0 &&
+ secpolicy_vnode_setids_setgids(cr, ovap->va_gid) != 0) {
+ vap->va_mode &= ~S_ISGID;
+ }
+
+ return (0);
+}
+
+/*
+ * Check privileges for setting xvattr attributes
+ */
+int
+secpolicy_xvattr(xvattr_t *xvap, uid_t owner, cred_t *cr, mode_t type)
+{
+ return (secpolicy_vnode_chown(cr, owner));
+}
+
+/*
+ * Check privileges for setattr attributes.
+ *
+ * Enforced in the Linux VFS.
+ */
+int
+secpolicy_vnode_setattr(cred_t *cr, struct inode *ip, struct vattr *vap,
+ const struct vattr *ovap, int flags,
+ int unlocked_access(void *, int, cred_t *), void *node)
+{
+ return (0);
+}
+
+/*
+ * Check privileges for links.
+ *
+ * Enforced in the Linux VFS.
+ */
+int
+secpolicy_basic_link(const cred_t *cr)
+{
+ return (0);
+}
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/qat.c b/sys/contrib/openzfs/module/os/linux/zfs/qat.c
new file mode 100644
index 000000000000..08613b3a2042
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/zfs/qat.c
@@ -0,0 +1,105 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+#if defined(_KERNEL) && defined(HAVE_QAT)
+#include <sys/zfs_context.h>
+#include <sys/qat.h>
+
+qat_stats_t qat_stats = {
+ { "comp_requests", KSTAT_DATA_UINT64 },
+ { "comp_total_in_bytes", KSTAT_DATA_UINT64 },
+ { "comp_total_out_bytes", KSTAT_DATA_UINT64 },
+ { "decomp_requests", KSTAT_DATA_UINT64 },
+ { "decomp_total_in_bytes", KSTAT_DATA_UINT64 },
+ { "decomp_total_out_bytes", KSTAT_DATA_UINT64 },
+ { "dc_fails", KSTAT_DATA_UINT64 },
+ { "encrypt_requests", KSTAT_DATA_UINT64 },
+ { "encrypt_total_in_bytes", KSTAT_DATA_UINT64 },
+ { "encrypt_total_out_bytes", KSTAT_DATA_UINT64 },
+ { "decrypt_requests", KSTAT_DATA_UINT64 },
+ { "decrypt_total_in_bytes", KSTAT_DATA_UINT64 },
+ { "decrypt_total_out_bytes", KSTAT_DATA_UINT64 },
+ { "crypt_fails", KSTAT_DATA_UINT64 },
+ { "cksum_requests", KSTAT_DATA_UINT64 },
+ { "cksum_total_in_bytes", KSTAT_DATA_UINT64 },
+ { "cksum_fails", KSTAT_DATA_UINT64 },
+};
+
+static kstat_t *qat_ksp = NULL;
+
+CpaStatus
+qat_mem_alloc_contig(void **pp_mem_addr, Cpa32U size_bytes)
+{
+ *pp_mem_addr = kmalloc(size_bytes, GFP_KERNEL);
+ if (*pp_mem_addr == NULL)
+ return (CPA_STATUS_RESOURCE);
+ return (CPA_STATUS_SUCCESS);
+}
+
+void
+qat_mem_free_contig(void **pp_mem_addr)
+{
+ if (*pp_mem_addr != NULL) {
+ kfree(*pp_mem_addr);
+ *pp_mem_addr = NULL;
+ }
+}
+
+int
+qat_init(void)
+{
+ qat_ksp = kstat_create("zfs", 0, "qat", "misc",
+ KSTAT_TYPE_NAMED, sizeof (qat_stats) / sizeof (kstat_named_t),
+ KSTAT_FLAG_VIRTUAL);
+ if (qat_ksp != NULL) {
+ qat_ksp->ks_data = &qat_stats;
+ kstat_install(qat_ksp);
+ }
+
+ /*
+ * If QAT initialization fails, just set the disable flag; QAT can be
+ * turned back on after the zfs module is loaded, e.g.:
+ * echo 0 > /sys/module/zfs/parameters/zfs_qat_compress_disable
+ */
+ if (qat_dc_init() != 0)
+ zfs_qat_compress_disable = 1;
+
+ if (qat_cy_init() != 0) {
+ zfs_qat_checksum_disable = 1;
+ zfs_qat_encrypt_disable = 1;
+ }
+
+ return (0);
+}
+
+void
+qat_fini(void)
+{
+ if (qat_ksp != NULL) {
+ kstat_delete(qat_ksp);
+ qat_ksp = NULL;
+ }
+
+ qat_cy_fini();
+ qat_dc_fini();
+}
+
+#endif
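
Once qat_init() has registered the kstat above, the counters in qat_stats
are exposed through the SPL kstat interface; on Linux, named kstats created
with kstat_create("zfs", ...) normally surface under /proc/spl/kstat/zfs/,
so the QAT statistics can be read from /proc/spl/kstat/zfs/qat. A minimal
user-space reader (illustrative only, not part of the patch):

    #include <stdio.h>

    int
    main(void)
    {
            char line[256];
            FILE *fp = fopen("/proc/spl/kstat/zfs/qat", "r");

            if (fp == NULL) {
                    perror("open qat kstat");
                    return (1);
            }
            /* Dump each counter line, e.g. "comp_requests ... <n>". */
            while (fgets(line, sizeof (line), fp) != NULL)
                    fputs(line, stdout);
            fclose(fp);
            return (0);
    }
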
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/qat_compress.c b/sys/contrib/openzfs/module/os/linux/zfs/qat_compress.c
new file mode 100644
index 000000000000..ad3ead3b16e3
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/zfs/qat_compress.c
@@ -0,0 +1,569 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+#if defined(_KERNEL) && defined(HAVE_QAT)
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/pagemap.h>
+#include <linux/completion.h>
+#include <sys/zfs_context.h>
+#include <sys/byteorder.h>
+#include <sys/zio.h>
+#include <sys/qat.h>
+
+/*
+ * Max instances in a QAT device. Each instance is a channel for submitting
+ * jobs to the QAT hardware. This is only for pre-allocating the instance and
+ * session arrays; the actual number of instances is defined in the
+ * QAT driver's configuration file.
+ */
+#define QAT_DC_MAX_INSTANCES 48
+
+/*
+ * ZLIB head and foot size
+ */
+#define ZLIB_HEAD_SZ 2
+#define ZLIB_FOOT_SZ 4
+
+static CpaInstanceHandle dc_inst_handles[QAT_DC_MAX_INSTANCES];
+static CpaDcSessionHandle session_handles[QAT_DC_MAX_INSTANCES];
+static CpaBufferList **buffer_array[QAT_DC_MAX_INSTANCES];
+static Cpa16U num_inst = 0;
+static Cpa32U inst_num = 0;
+static boolean_t qat_dc_init_done = B_FALSE;
+int zfs_qat_compress_disable = 0;
+
+boolean_t
+qat_dc_use_accel(size_t s_len)
+{
+ return (!zfs_qat_compress_disable &&
+ qat_dc_init_done &&
+ s_len >= QAT_MIN_BUF_SIZE &&
+ s_len <= QAT_MAX_BUF_SIZE);
+}
+
+static void
+qat_dc_callback(void *p_callback, CpaStatus status)
+{
+ if (p_callback != NULL)
+ complete((struct completion *)p_callback);
+}
+
+static void
+qat_dc_clean(void)
+{
+ Cpa16U buff_num = 0;
+ Cpa16U num_inter_buff_lists = 0;
+
+ for (Cpa16U i = 0; i < num_inst; i++) {
+ cpaDcStopInstance(dc_inst_handles[i]);
+ QAT_PHYS_CONTIG_FREE(session_handles[i]);
+ /* free intermediate buffers */
+ if (buffer_array[i] != NULL) {
+ cpaDcGetNumIntermediateBuffers(
+ dc_inst_handles[i], &num_inter_buff_lists);
+ for (buff_num = 0; buff_num < num_inter_buff_lists;
+ buff_num++) {
+ CpaBufferList *buffer_inter =
+ buffer_array[i][buff_num];
+ if (buffer_inter->pBuffers) {
+ QAT_PHYS_CONTIG_FREE(
+ buffer_inter->pBuffers->pData);
+ QAT_PHYS_CONTIG_FREE(
+ buffer_inter->pBuffers);
+ }
+ QAT_PHYS_CONTIG_FREE(
+ buffer_inter->pPrivateMetaData);
+ QAT_PHYS_CONTIG_FREE(buffer_inter);
+ }
+ }
+ }
+
+ num_inst = 0;
+ qat_dc_init_done = B_FALSE;
+}
+
+int
+qat_dc_init(void)
+{
+ CpaStatus status = CPA_STATUS_SUCCESS;
+ Cpa32U sess_size = 0;
+ Cpa32U ctx_size = 0;
+ Cpa16U num_inter_buff_lists = 0;
+ Cpa16U buff_num = 0;
+ Cpa32U buff_meta_size = 0;
+ CpaDcSessionSetupData sd = {0};
+
+ if (qat_dc_init_done)
+ return (0);
+
+ status = cpaDcGetNumInstances(&num_inst);
+ if (status != CPA_STATUS_SUCCESS)
+ return (-1);
+
+ /* if the user has configured no QAT compression units just return */
+ if (num_inst == 0)
+ return (0);
+
+ if (num_inst > QAT_DC_MAX_INSTANCES)
+ num_inst = QAT_DC_MAX_INSTANCES;
+
+ status = cpaDcGetInstances(num_inst, &dc_inst_handles[0]);
+ if (status != CPA_STATUS_SUCCESS)
+ return (-1);
+
+ for (Cpa16U i = 0; i < num_inst; i++) {
+ cpaDcSetAddressTranslation(dc_inst_handles[i],
+ (void*)virt_to_phys);
+
+ status = cpaDcBufferListGetMetaSize(dc_inst_handles[i],
+ 1, &buff_meta_size);
+
+ if (status == CPA_STATUS_SUCCESS)
+ status = cpaDcGetNumIntermediateBuffers(
+ dc_inst_handles[i], &num_inter_buff_lists);
+
+ if (status == CPA_STATUS_SUCCESS && num_inter_buff_lists != 0)
+ status = QAT_PHYS_CONTIG_ALLOC(&buffer_array[i],
+ num_inter_buff_lists *
+ sizeof (CpaBufferList *));
+
+ for (buff_num = 0; buff_num < num_inter_buff_lists;
+ buff_num++) {
+ if (status == CPA_STATUS_SUCCESS)
+ status = QAT_PHYS_CONTIG_ALLOC(
+ &buffer_array[i][buff_num],
+ sizeof (CpaBufferList));
+
+ if (status == CPA_STATUS_SUCCESS)
+ status = QAT_PHYS_CONTIG_ALLOC(
+ &buffer_array[i][buff_num]->
+ pPrivateMetaData,
+ buff_meta_size);
+
+ if (status == CPA_STATUS_SUCCESS)
+ status = QAT_PHYS_CONTIG_ALLOC(
+ &buffer_array[i][buff_num]->pBuffers,
+ sizeof (CpaFlatBuffer));
+
+ if (status == CPA_STATUS_SUCCESS) {
+ /*
+ * implementation requires an intermediate
+ * buffer approximately twice the size of
+ * output buffer, which is 2x max buffer
+ * size here.
+ */
+ status = QAT_PHYS_CONTIG_ALLOC(
+ &buffer_array[i][buff_num]->pBuffers->
+ pData, 2 * QAT_MAX_BUF_SIZE);
+ if (status != CPA_STATUS_SUCCESS)
+ goto fail;
+
+ buffer_array[i][buff_num]->numBuffers = 1;
+ buffer_array[i][buff_num]->pBuffers->
+ dataLenInBytes = 2 * QAT_MAX_BUF_SIZE;
+ }
+ }
+
+ status = cpaDcStartInstance(dc_inst_handles[i],
+ num_inter_buff_lists, buffer_array[i]);
+ if (status != CPA_STATUS_SUCCESS)
+ goto fail;
+
+ sd.compLevel = CPA_DC_L1;
+ sd.compType = CPA_DC_DEFLATE;
+ sd.huffType = CPA_DC_HT_FULL_DYNAMIC;
+ sd.sessDirection = CPA_DC_DIR_COMBINED;
+ sd.sessState = CPA_DC_STATELESS;
+ sd.deflateWindowSize = 7;
+ sd.checksum = CPA_DC_ADLER32;
+ status = cpaDcGetSessionSize(dc_inst_handles[i],
+ &sd, &sess_size, &ctx_size);
+ if (status != CPA_STATUS_SUCCESS)
+ goto fail;
+
+ QAT_PHYS_CONTIG_ALLOC(&session_handles[i], sess_size);
+ if (session_handles[i] == NULL)
+ goto fail;
+
+ status = cpaDcInitSession(dc_inst_handles[i],
+ session_handles[i],
+ &sd, NULL, qat_dc_callback);
+ if (status != CPA_STATUS_SUCCESS)
+ goto fail;
+ }
+
+ qat_dc_init_done = B_TRUE;
+ return (0);
+fail:
+ qat_dc_clean();
+ return (-1);
+}
+
+void
+qat_dc_fini(void)
+{
+ if (!qat_dc_init_done)
+ return;
+
+ qat_dc_clean();
+}
+
+/*
+ * The "add" parameter is an additional buffer which is passed
+ * to QAT as a scratch buffer alongside the destination buffer
+ * in case the "compressed" data ends up being larger than the
+ * original source data. This is necessary to prevent QAT from
+ * generating buffer overflow warnings for incompressible data.
+ */
+static int
+qat_compress_impl(qat_compress_dir_t dir, char *src, int src_len,
+ char *dst, int dst_len, char *add, int add_len, size_t *c_len)
+{
+ CpaInstanceHandle dc_inst_handle;
+ CpaDcSessionHandle session_handle;
+ CpaBufferList *buf_list_src = NULL;
+ CpaBufferList *buf_list_dst = NULL;
+ CpaFlatBuffer *flat_buf_src = NULL;
+ CpaFlatBuffer *flat_buf_dst = NULL;
+ Cpa8U *buffer_meta_src = NULL;
+ Cpa8U *buffer_meta_dst = NULL;
+ Cpa32U buffer_meta_size = 0;
+ CpaDcRqResults dc_results;
+ CpaStatus status = CPA_STATUS_FAIL;
+ Cpa32U hdr_sz = 0;
+ Cpa32U compressed_sz;
+ Cpa32U num_src_buf = (src_len >> PAGE_SHIFT) + 2;
+ Cpa32U num_dst_buf = (dst_len >> PAGE_SHIFT) + 2;
+ Cpa32U num_add_buf = (add_len >> PAGE_SHIFT) + 2;
+ Cpa32U bytes_left;
+ Cpa32U dst_pages = 0;
+ Cpa32U adler32 = 0;
+ char *data;
+ struct page *page;
+ struct page **in_pages = NULL;
+ struct page **out_pages = NULL;
+ struct page **add_pages = NULL;
+ Cpa32U page_off = 0;
+ struct completion complete;
+ Cpa32U page_num = 0;
+ Cpa16U i;
+
+ /*
+ * We increment num_src_buf and num_dst_buf by 2 to allow
+ * us to handle non page-aligned buffer addresses and buffers
+ * whose sizes are not divisible by PAGE_SIZE.
+ */
+ Cpa32U src_buffer_list_mem_size = sizeof (CpaBufferList) +
+ (num_src_buf * sizeof (CpaFlatBuffer));
+ Cpa32U dst_buffer_list_mem_size = sizeof (CpaBufferList) +
+ ((num_dst_buf + num_add_buf) * sizeof (CpaFlatBuffer));
+
+ status = QAT_PHYS_CONTIG_ALLOC(&in_pages,
+ num_src_buf * sizeof (struct page *));
+ if (status != CPA_STATUS_SUCCESS)
+ goto fail;
+
+ status = QAT_PHYS_CONTIG_ALLOC(&out_pages,
+ num_dst_buf * sizeof (struct page *));
+ if (status != CPA_STATUS_SUCCESS)
+ goto fail;
+
+ status = QAT_PHYS_CONTIG_ALLOC(&add_pages,
+ num_add_buf * sizeof (struct page *));
+ if (status != CPA_STATUS_SUCCESS)
+ goto fail;
+
+ i = (Cpa32U)atomic_inc_32_nv(&inst_num) % num_inst;
+ dc_inst_handle = dc_inst_handles[i];
+ session_handle = session_handles[i];
+
+ cpaDcBufferListGetMetaSize(dc_inst_handle, num_src_buf,
+ &buffer_meta_size);
+ status = QAT_PHYS_CONTIG_ALLOC(&buffer_meta_src, buffer_meta_size);
+ if (status != CPA_STATUS_SUCCESS)
+ goto fail;
+
+ cpaDcBufferListGetMetaSize(dc_inst_handle, num_dst_buf + num_add_buf,
+ &buffer_meta_size);
+ status = QAT_PHYS_CONTIG_ALLOC(&buffer_meta_dst, buffer_meta_size);
+ if (status != CPA_STATUS_SUCCESS)
+ goto fail;
+
+ /* build source buffer list */
+ status = QAT_PHYS_CONTIG_ALLOC(&buf_list_src, src_buffer_list_mem_size);
+ if (status != CPA_STATUS_SUCCESS)
+ goto fail;
+
+ flat_buf_src = (CpaFlatBuffer *)(buf_list_src + 1);
+
+ buf_list_src->pBuffers = flat_buf_src; /* always point to first one */
+
+ /* build destination buffer list */
+ status = QAT_PHYS_CONTIG_ALLOC(&buf_list_dst, dst_buffer_list_mem_size);
+ if (status != CPA_STATUS_SUCCESS)
+ goto fail;
+
+ flat_buf_dst = (CpaFlatBuffer *)(buf_list_dst + 1);
+
+ buf_list_dst->pBuffers = flat_buf_dst; /* always point to first one */
+
+ buf_list_src->numBuffers = 0;
+ buf_list_src->pPrivateMetaData = buffer_meta_src;
+ bytes_left = src_len;
+ data = src;
+ page_num = 0;
+ while (bytes_left > 0) {
+ page_off = ((long)data & ~PAGE_MASK);
+ page = qat_mem_to_page(data);
+ in_pages[page_num] = page;
+ flat_buf_src->pData = kmap(page) + page_off;
+ flat_buf_src->dataLenInBytes =
+ min((long)PAGE_SIZE - page_off, (long)bytes_left);
+
+ bytes_left -= flat_buf_src->dataLenInBytes;
+ data += flat_buf_src->dataLenInBytes;
+ flat_buf_src++;
+ buf_list_src->numBuffers++;
+ page_num++;
+ }
+
+ buf_list_dst->numBuffers = 0;
+ buf_list_dst->pPrivateMetaData = buffer_meta_dst;
+ bytes_left = dst_len;
+ data = dst;
+ page_num = 0;
+ while (bytes_left > 0) {
+ page_off = ((long)data & ~PAGE_MASK);
+ page = qat_mem_to_page(data);
+ flat_buf_dst->pData = kmap(page) + page_off;
+ out_pages[page_num] = page;
+ flat_buf_dst->dataLenInBytes =
+ min((long)PAGE_SIZE - page_off, (long)bytes_left);
+
+ bytes_left -= flat_buf_dst->dataLenInBytes;
+ data += flat_buf_dst->dataLenInBytes;
+ flat_buf_dst++;
+ buf_list_dst->numBuffers++;
+ page_num++;
+ dst_pages++;
+ }
+
+ /* map additional scratch pages into the destination buffer list */
+ bytes_left = add_len;
+ data = add;
+ page_num = 0;
+ while (bytes_left > 0) {
+ page_off = ((long)data & ~PAGE_MASK);
+ page = qat_mem_to_page(data);
+ flat_buf_dst->pData = kmap(page) + page_off;
+ add_pages[page_num] = page;
+ flat_buf_dst->dataLenInBytes =
+ min((long)PAGE_SIZE - page_off, (long)bytes_left);
+
+ bytes_left -= flat_buf_dst->dataLenInBytes;
+ data += flat_buf_dst->dataLenInBytes;
+ flat_buf_dst++;
+ buf_list_dst->numBuffers++;
+ page_num++;
+ }
+
+ init_completion(&complete);
+
+ if (dir == QAT_COMPRESS) {
+ QAT_STAT_BUMP(comp_requests);
+ QAT_STAT_INCR(comp_total_in_bytes, src_len);
+
+ cpaDcGenerateHeader(session_handle,
+ buf_list_dst->pBuffers, &hdr_sz);
+ buf_list_dst->pBuffers->pData += hdr_sz;
+ buf_list_dst->pBuffers->dataLenInBytes -= hdr_sz;
+ status = cpaDcCompressData(
+ dc_inst_handle, session_handle,
+ buf_list_src, buf_list_dst,
+ &dc_results, CPA_DC_FLUSH_FINAL,
+ &complete);
+ if (status != CPA_STATUS_SUCCESS) {
+ goto fail;
+ }
+
+ /* we now wait until the completion of the operation. */
+ wait_for_completion(&complete);
+
+ if (dc_results.status != CPA_STATUS_SUCCESS) {
+ status = CPA_STATUS_FAIL;
+ goto fail;
+ }
+
+ compressed_sz = dc_results.produced;
+ if (compressed_sz + hdr_sz + ZLIB_FOOT_SZ > dst_len) {
+ status = CPA_STATUS_INCOMPRESSIBLE;
+ goto fail;
+ }
+
+ flat_buf_dst = (CpaFlatBuffer *)(buf_list_dst + 1);
+ /* move to the last page */
+ flat_buf_dst += (compressed_sz + hdr_sz) >> PAGE_SHIFT;
+
+ /* no space for gzip footer in the last page */
+ if (((compressed_sz + hdr_sz) % PAGE_SIZE)
+ + ZLIB_FOOT_SZ > PAGE_SIZE) {
+ status = CPA_STATUS_INCOMPRESSIBLE;
+ goto fail;
+ }
+
+ /* jump to the end of the buffer and append footer */
+ flat_buf_dst->pData =
+ (char *)((unsigned long)flat_buf_dst->pData & PAGE_MASK)
+ + ((compressed_sz + hdr_sz) % PAGE_SIZE);
+ flat_buf_dst->dataLenInBytes = ZLIB_FOOT_SZ;
+
+ dc_results.produced = 0;
+ status = cpaDcGenerateFooter(session_handle,
+ flat_buf_dst, &dc_results);
+ if (status != CPA_STATUS_SUCCESS)
+ goto fail;
+
+ *c_len = compressed_sz + dc_results.produced + hdr_sz;
+ QAT_STAT_INCR(comp_total_out_bytes, *c_len);
+ } else {
+ ASSERT3U(dir, ==, QAT_DECOMPRESS);
+ QAT_STAT_BUMP(decomp_requests);
+ QAT_STAT_INCR(decomp_total_in_bytes, src_len);
+
+ buf_list_src->pBuffers->pData += ZLIB_HEAD_SZ;
+ buf_list_src->pBuffers->dataLenInBytes -= ZLIB_HEAD_SZ;
+ status = cpaDcDecompressData(dc_inst_handle, session_handle,
+ buf_list_src, buf_list_dst, &dc_results, CPA_DC_FLUSH_FINAL,
+ &complete);
+
+ if (CPA_STATUS_SUCCESS != status) {
+ status = CPA_STATUS_FAIL;
+ goto fail;
+ }
+
+ /* we now wait until the completion of the operation. */
+ wait_for_completion(&complete);
+
+ if (dc_results.status != CPA_STATUS_SUCCESS) {
+ status = CPA_STATUS_FAIL;
+ goto fail;
+ }
+
+ /* verify adler checksum */
+ adler32 = *(Cpa32U *)(src + dc_results.consumed + ZLIB_HEAD_SZ);
+ if (adler32 != BSWAP_32(dc_results.checksum)) {
+ status = CPA_STATUS_FAIL;
+ goto fail;
+ }
+ *c_len = dc_results.produced;
+ QAT_STAT_INCR(decomp_total_out_bytes, *c_len);
+ }
+
+fail:
+ if (status != CPA_STATUS_SUCCESS && status != CPA_STATUS_INCOMPRESSIBLE)
+ QAT_STAT_BUMP(dc_fails);
+
+ if (in_pages) {
+ for (page_num = 0;
+ page_num < buf_list_src->numBuffers;
+ page_num++) {
+ kunmap(in_pages[page_num]);
+ }
+ QAT_PHYS_CONTIG_FREE(in_pages);
+ }
+
+ if (out_pages) {
+ for (page_num = 0; page_num < dst_pages; page_num++) {
+ kunmap(out_pages[page_num]);
+ }
+ QAT_PHYS_CONTIG_FREE(out_pages);
+ }
+
+ if (add_pages) {
+ for (page_num = 0;
+ page_num < buf_list_dst->numBuffers - dst_pages;
+ page_num++) {
+ kunmap(add_pages[page_num]);
+ }
+ QAT_PHYS_CONTIG_FREE(add_pages);
+ }
+
+ QAT_PHYS_CONTIG_FREE(buffer_meta_src);
+ QAT_PHYS_CONTIG_FREE(buffer_meta_dst);
+ QAT_PHYS_CONTIG_FREE(buf_list_src);
+ QAT_PHYS_CONTIG_FREE(buf_list_dst);
+
+ return (status);
+}
+
+/*
+ * Entry point for QAT accelerated compression / decompression.
+ */
+int
+qat_compress(qat_compress_dir_t dir, char *src, int src_len,
+ char *dst, int dst_len, size_t *c_len)
+{
+ int ret;
+ size_t add_len = 0;
+ void *add = NULL;
+
+ if (dir == QAT_COMPRESS) {
+ add_len = dst_len;
+ add = zio_data_buf_alloc(add_len);
+ }
+
+ ret = qat_compress_impl(dir, src, src_len, dst,
+ dst_len, add, add_len, c_len);
+
+ if (dir == QAT_COMPRESS)
+ zio_data_buf_free(add, add_len);
+
+ return (ret);
+}
+
+static int
+param_set_qat_compress(const char *val, zfs_kernel_param_t *kp)
+{
+ int ret;
+ int *pvalue = kp->arg;
+ ret = param_set_int(val, kp);
+ if (ret)
+ return (ret);
+ /*
+ * zfs_qat_compress_disable = 0: enable QAT compression.
+ * Try to initialize the QAT instance if that has not been done yet.
+ */
+ if (*pvalue == 0 && !qat_dc_init_done) {
+ ret = qat_dc_init();
+ if (ret != 0) {
+ zfs_qat_compress_disable = 1;
+ return (ret);
+ }
+ }
+ return (ret);
+}
+
+module_param_call(zfs_qat_compress_disable, param_set_qat_compress,
+ param_get_int, &zfs_qat_compress_disable, 0644);
+MODULE_PARM_DESC(zfs_qat_compress_disable, "Enable/Disable QAT compression");
+
+#endif
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/qat_crypt.c b/sys/contrib/openzfs/module/os/linux/zfs/qat_crypt.c
new file mode 100644
index 000000000000..4771b2f3bec5
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/zfs/qat_crypt.c
@@ -0,0 +1,630 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * This file represents the QAT implementation of checksums and encryption.
+ * Internally, QAT shares the same cryptographic instances for both of these
+ * operations, so the code has been combined here. QAT data compression uses
+ * separate compression instances, so that code lives in qat_compress.c.
+ */
+
+#if defined(_KERNEL) && defined(HAVE_QAT)
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/pagemap.h>
+#include <linux/completion.h>
+#include <sys/zfs_context.h>
+#include <sys/zio_crypt.h>
+#include "lac/cpa_cy_im.h"
+#include "lac/cpa_cy_common.h"
+#include <sys/qat.h>
+
+/*
+ * Max instances in a QAT device. Each instance is a channel used to submit
+ * jobs to the QAT hardware. This limit is only used to pre-allocate the
+ * instance and session arrays; the actual number of instances is defined
+ * in the QAT driver's configuration file.
+ */
+#define QAT_CRYPT_MAX_INSTANCES 48
+
+#define MAX_PAGE_NUM 1024
+
+static Cpa32U inst_num = 0;
+static Cpa16U num_inst = 0;
+static CpaInstanceHandle cy_inst_handles[QAT_CRYPT_MAX_INSTANCES];
+static boolean_t qat_cy_init_done = B_FALSE;
+int zfs_qat_encrypt_disable = 0;
+int zfs_qat_checksum_disable = 0;
+
+typedef struct cy_callback {
+ CpaBoolean verify_result;
+ struct completion complete;
+} cy_callback_t;
+
+static void
+symcallback(void *p_callback, CpaStatus status, const CpaCySymOp operation,
+ void *op_data, CpaBufferList *buf_list_dst, CpaBoolean verify)
+{
+ cy_callback_t *cb = p_callback;
+
+ if (cb != NULL) {
+ /* indicate that the function has been called */
+ cb->verify_result = verify;
+ complete(&cb->complete);
+ }
+}
+
+boolean_t
+qat_crypt_use_accel(size_t s_len)
+{
+ return (!zfs_qat_encrypt_disable &&
+ qat_cy_init_done &&
+ s_len >= QAT_MIN_BUF_SIZE &&
+ s_len <= QAT_MAX_BUF_SIZE);
+}
+
+boolean_t
+qat_checksum_use_accel(size_t s_len)
+{
+ return (!zfs_qat_checksum_disable &&
+ qat_cy_init_done &&
+ s_len >= QAT_MIN_BUF_SIZE &&
+ s_len <= QAT_MAX_BUF_SIZE);
+}
+
+void
+qat_cy_clean(void)
+{
+ for (Cpa16U i = 0; i < num_inst; i++)
+ cpaCyStopInstance(cy_inst_handles[i]);
+
+ num_inst = 0;
+ qat_cy_init_done = B_FALSE;
+}
+
+int
+qat_cy_init(void)
+{
+ CpaStatus status = CPA_STATUS_FAIL;
+
+ if (qat_cy_init_done)
+ return (0);
+
+ status = cpaCyGetNumInstances(&num_inst);
+ if (status != CPA_STATUS_SUCCESS)
+ return (-1);
+
+	/* if the user has configured no QAT encryption units, just return */
+ if (num_inst == 0)
+ return (0);
+
+ if (num_inst > QAT_CRYPT_MAX_INSTANCES)
+ num_inst = QAT_CRYPT_MAX_INSTANCES;
+
+ status = cpaCyGetInstances(num_inst, &cy_inst_handles[0]);
+ if (status != CPA_STATUS_SUCCESS)
+ return (-1);
+
+ for (Cpa16U i = 0; i < num_inst; i++) {
+ status = cpaCySetAddressTranslation(cy_inst_handles[i],
+ (void *)virt_to_phys);
+ if (status != CPA_STATUS_SUCCESS)
+ goto error;
+
+ status = cpaCyStartInstance(cy_inst_handles[i]);
+ if (status != CPA_STATUS_SUCCESS)
+ goto error;
+ }
+
+ qat_cy_init_done = B_TRUE;
+ return (0);
+
+error:
+ qat_cy_clean();
+ return (-1);
+}
+
+void
+qat_cy_fini(void)
+{
+ if (!qat_cy_init_done)
+ return;
+
+ qat_cy_clean();
+}
+
+static CpaStatus
+qat_init_crypt_session_ctx(qat_encrypt_dir_t dir, CpaInstanceHandle inst_handle,
+ CpaCySymSessionCtx **cy_session_ctx, crypto_key_t *key,
+ Cpa64U crypt, Cpa32U aad_len)
+{
+ CpaStatus status = CPA_STATUS_SUCCESS;
+ Cpa32U ctx_size;
+	Cpa32U cipher_algorithm;
+ Cpa32U hash_algorithm;
+ CpaCySymSessionSetupData sd = { 0 };
+
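+	/*
+	 * Only AES-GCM is supported here; CCM requests are rejected so the
+	 * caller can fall back to the software implementation.
+	 */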
+ if (zio_crypt_table[crypt].ci_crypt_type == ZC_TYPE_CCM) {
+ return (CPA_STATUS_FAIL);
+ } else {
+		cipher_algorithm = CPA_CY_SYM_CIPHER_AES_GCM;
+ hash_algorithm = CPA_CY_SYM_HASH_AES_GCM;
+ }
+
+	sd.cipherSetupData.cipherAlgorithm = cipher_algorithm;
+ sd.cipherSetupData.pCipherKey = key->ck_data;
+ sd.cipherSetupData.cipherKeyLenInBytes = key->ck_length / 8;
+ sd.hashSetupData.hashAlgorithm = hash_algorithm;
+ sd.hashSetupData.hashMode = CPA_CY_SYM_HASH_MODE_AUTH;
+ sd.hashSetupData.digestResultLenInBytes = ZIO_DATA_MAC_LEN;
+ sd.hashSetupData.authModeSetupData.aadLenInBytes = aad_len;
+ sd.sessionPriority = CPA_CY_PRIORITY_NORMAL;
+ sd.symOperation = CPA_CY_SYM_OP_ALGORITHM_CHAINING;
+ sd.digestIsAppended = CPA_FALSE;
+ sd.verifyDigest = CPA_FALSE;
+
+ if (dir == QAT_ENCRYPT) {
+ sd.cipherSetupData.cipherDirection =
+ CPA_CY_SYM_CIPHER_DIRECTION_ENCRYPT;
+ sd.algChainOrder =
+ CPA_CY_SYM_ALG_CHAIN_ORDER_HASH_THEN_CIPHER;
+ } else {
+ ASSERT3U(dir, ==, QAT_DECRYPT);
+ sd.cipherSetupData.cipherDirection =
+ CPA_CY_SYM_CIPHER_DIRECTION_DECRYPT;
+ sd.algChainOrder =
+ CPA_CY_SYM_ALG_CHAIN_ORDER_CIPHER_THEN_HASH;
+ }
+
+ status = cpaCySymSessionCtxGetSize(inst_handle, &sd, &ctx_size);
+ if (status != CPA_STATUS_SUCCESS)
+ return (status);
+
+ status = QAT_PHYS_CONTIG_ALLOC(cy_session_ctx, ctx_size);
+ if (status != CPA_STATUS_SUCCESS)
+ return (status);
+
+ status = cpaCySymInitSession(inst_handle, symcallback, &sd,
+ *cy_session_ctx);
+ if (status != CPA_STATUS_SUCCESS) {
+ QAT_PHYS_CONTIG_FREE(*cy_session_ctx);
+ return (status);
+ }
+
+ return (CPA_STATUS_SUCCESS);
+}
+
+static CpaStatus
+qat_init_checksum_session_ctx(CpaInstanceHandle inst_handle,
+ CpaCySymSessionCtx **cy_session_ctx, Cpa64U cksum)
+{
+ CpaStatus status = CPA_STATUS_SUCCESS;
+ Cpa32U ctx_size;
+ Cpa32U hash_algorithm;
+ CpaCySymSessionSetupData sd = { 0 };
+
+ /*
+ * ZFS's SHA512 checksum is actually SHA512/256, which uses
+ * a different IV from standard SHA512. QAT does not support
+ * SHA512/256, so we can only support SHA256.
+ */
+ if (cksum == ZIO_CHECKSUM_SHA256)
+ hash_algorithm = CPA_CY_SYM_HASH_SHA256;
+ else
+ return (CPA_STATUS_FAIL);
+
+ sd.sessionPriority = CPA_CY_PRIORITY_NORMAL;
+ sd.symOperation = CPA_CY_SYM_OP_HASH;
+ sd.hashSetupData.hashAlgorithm = hash_algorithm;
+ sd.hashSetupData.hashMode = CPA_CY_SYM_HASH_MODE_PLAIN;
+ sd.hashSetupData.digestResultLenInBytes = sizeof (zio_cksum_t);
+ sd.digestIsAppended = CPA_FALSE;
+ sd.verifyDigest = CPA_FALSE;
+
+ status = cpaCySymSessionCtxGetSize(inst_handle, &sd, &ctx_size);
+ if (status != CPA_STATUS_SUCCESS)
+ return (status);
+
+ status = QAT_PHYS_CONTIG_ALLOC(cy_session_ctx, ctx_size);
+ if (status != CPA_STATUS_SUCCESS)
+ return (status);
+
+ status = cpaCySymInitSession(inst_handle, symcallback, &sd,
+ *cy_session_ctx);
+ if (status != CPA_STATUS_SUCCESS) {
+ QAT_PHYS_CONTIG_FREE(*cy_session_ctx);
+ return (status);
+ }
+
+ return (CPA_STATUS_SUCCESS);
+}
+
+static CpaStatus
+qat_init_cy_buffer_lists(CpaInstanceHandle inst_handle, uint32_t nr_bufs,
+ CpaBufferList *src, CpaBufferList *dst)
+{
+ CpaStatus status = CPA_STATUS_SUCCESS;
+ Cpa32U meta_size = 0;
+
+ status = cpaCyBufferListGetMetaSize(inst_handle, nr_bufs, &meta_size);
+ if (status != CPA_STATUS_SUCCESS)
+ return (status);
+
+ status = QAT_PHYS_CONTIG_ALLOC(&src->pPrivateMetaData, meta_size);
+ if (status != CPA_STATUS_SUCCESS)
+ goto error;
+
+ if (src != dst) {
+ status = QAT_PHYS_CONTIG_ALLOC(&dst->pPrivateMetaData,
+ meta_size);
+ if (status != CPA_STATUS_SUCCESS)
+ goto error;
+ }
+
+ return (CPA_STATUS_SUCCESS);
+
+error:
+ QAT_PHYS_CONTIG_FREE(src->pPrivateMetaData);
+ if (src != dst)
+ QAT_PHYS_CONTIG_FREE(dst->pPrivateMetaData);
+
+ return (status);
+}
+
+int
+qat_crypt(qat_encrypt_dir_t dir, uint8_t *src_buf, uint8_t *dst_buf,
+ uint8_t *aad_buf, uint32_t aad_len, uint8_t *iv_buf, uint8_t *digest_buf,
+ crypto_key_t *key, uint64_t crypt, uint32_t enc_len)
+{
+ CpaStatus status = CPA_STATUS_SUCCESS;
+ Cpa16U i;
+ CpaInstanceHandle cy_inst_handle;
+ Cpa16U nr_bufs = (enc_len >> PAGE_SHIFT) + 2;
+ Cpa32U bytes_left = 0;
+ Cpa8S *data = NULL;
+ CpaCySymSessionCtx *cy_session_ctx = NULL;
+ cy_callback_t cb;
+ CpaCySymOpData op_data = { 0 };
+ CpaBufferList src_buffer_list = { 0 };
+ CpaBufferList dst_buffer_list = { 0 };
+ CpaFlatBuffer *flat_src_buf_array = NULL;
+ CpaFlatBuffer *flat_src_buf = NULL;
+ CpaFlatBuffer *flat_dst_buf_array = NULL;
+ CpaFlatBuffer *flat_dst_buf = NULL;
+ struct page *in_pages[MAX_PAGE_NUM];
+ struct page *out_pages[MAX_PAGE_NUM];
+ Cpa32U in_page_num = 0;
+ Cpa32U out_page_num = 0;
+ Cpa32U in_page_off = 0;
+ Cpa32U out_page_off = 0;
+
+ if (dir == QAT_ENCRYPT) {
+ QAT_STAT_BUMP(encrypt_requests);
+ QAT_STAT_INCR(encrypt_total_in_bytes, enc_len);
+ } else {
+ QAT_STAT_BUMP(decrypt_requests);
+ QAT_STAT_INCR(decrypt_total_in_bytes, enc_len);
+ }
+
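+	/* Spread requests across the available instances round-robin. */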
+ i = (Cpa32U)atomic_inc_32_nv(&inst_num) % num_inst;
+ cy_inst_handle = cy_inst_handles[i];
+
+ status = qat_init_crypt_session_ctx(dir, cy_inst_handle,
+ &cy_session_ctx, key, crypt, aad_len);
+ if (status != CPA_STATUS_SUCCESS) {
+ /* don't count CCM as a failure since it's not supported */
+ if (zio_crypt_table[crypt].ci_crypt_type == ZC_TYPE_GCM)
+ QAT_STAT_BUMP(crypt_fails);
+ return (status);
+ }
+
+ /*
+ * We increment nr_bufs by 2 to allow us to handle non
+ * page-aligned buffer addresses and buffers whose sizes
+ * are not divisible by PAGE_SIZE.
+ */
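+	/*
+	 * For example (assuming 4 KiB pages), a 128 KiB buffer that starts
+	 * mid-page spans 33 pages, one more than enc_len >> PAGE_SHIFT; the
+	 * second spare entry is needed when the length is additionally not
+	 * a multiple of PAGE_SIZE.
+	 */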
+ status = qat_init_cy_buffer_lists(cy_inst_handle, nr_bufs,
+ &src_buffer_list, &dst_buffer_list);
+ if (status != CPA_STATUS_SUCCESS)
+ goto fail;
+
+ status = QAT_PHYS_CONTIG_ALLOC(&flat_src_buf_array,
+ nr_bufs * sizeof (CpaFlatBuffer));
+ if (status != CPA_STATUS_SUCCESS)
+ goto fail;
+ status = QAT_PHYS_CONTIG_ALLOC(&flat_dst_buf_array,
+ nr_bufs * sizeof (CpaFlatBuffer));
+ if (status != CPA_STATUS_SUCCESS)
+ goto fail;
+ status = QAT_PHYS_CONTIG_ALLOC(&op_data.pDigestResult,
+ ZIO_DATA_MAC_LEN);
+ if (status != CPA_STATUS_SUCCESS)
+ goto fail;
+ status = QAT_PHYS_CONTIG_ALLOC(&op_data.pIv,
+ ZIO_DATA_IV_LEN);
+ if (status != CPA_STATUS_SUCCESS)
+ goto fail;
+ if (aad_len > 0) {
+ status = QAT_PHYS_CONTIG_ALLOC(&op_data.pAdditionalAuthData,
+ aad_len);
+ if (status != CPA_STATUS_SUCCESS)
+ goto fail;
+ bcopy(aad_buf, op_data.pAdditionalAuthData, aad_len);
+ }
+
+ bytes_left = enc_len;
+ data = src_buf;
+ flat_src_buf = flat_src_buf_array;
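+	/*
+	 * Walk the source buffer a page at a time, kmap()ing each backing
+	 * page and describing it with one CpaFlatBuffer in the
+	 * scatter-gather list handed to the hardware.
+	 */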
+ while (bytes_left > 0) {
+ in_page_off = ((long)data & ~PAGE_MASK);
+ in_pages[in_page_num] = qat_mem_to_page(data);
+ flat_src_buf->pData = kmap(in_pages[in_page_num]) + in_page_off;
+ flat_src_buf->dataLenInBytes =
+ min((long)PAGE_SIZE - in_page_off, (long)bytes_left);
+ data += flat_src_buf->dataLenInBytes;
+ bytes_left -= flat_src_buf->dataLenInBytes;
+ flat_src_buf++;
+ in_page_num++;
+ }
+ src_buffer_list.pBuffers = flat_src_buf_array;
+ src_buffer_list.numBuffers = in_page_num;
+
+ bytes_left = enc_len;
+ data = dst_buf;
+ flat_dst_buf = flat_dst_buf_array;
+ while (bytes_left > 0) {
+ out_page_off = ((long)data & ~PAGE_MASK);
+ out_pages[out_page_num] = qat_mem_to_page(data);
+ flat_dst_buf->pData = kmap(out_pages[out_page_num]) +
+ out_page_off;
+ flat_dst_buf->dataLenInBytes =
+ min((long)PAGE_SIZE - out_page_off, (long)bytes_left);
+ data += flat_dst_buf->dataLenInBytes;
+ bytes_left -= flat_dst_buf->dataLenInBytes;
+ flat_dst_buf++;
+ out_page_num++;
+ }
+ dst_buffer_list.pBuffers = flat_dst_buf_array;
+ dst_buffer_list.numBuffers = out_page_num;
+
+ op_data.sessionCtx = cy_session_ctx;
+ op_data.packetType = CPA_CY_SYM_PACKET_TYPE_FULL;
+ op_data.cryptoStartSrcOffsetInBytes = 0;
+ op_data.messageLenToCipherInBytes = 0;
+ op_data.hashStartSrcOffsetInBytes = 0;
+ op_data.messageLenToHashInBytes = 0;
+ op_data.messageLenToCipherInBytes = enc_len;
+ op_data.ivLenInBytes = ZIO_DATA_IV_LEN;
+ bcopy(iv_buf, op_data.pIv, ZIO_DATA_IV_LEN);
+ /* if dir is QAT_DECRYPT, copy digest_buf to pDigestResult */
+ if (dir == QAT_DECRYPT)
+ bcopy(digest_buf, op_data.pDigestResult, ZIO_DATA_MAC_LEN);
+
+ cb.verify_result = CPA_FALSE;
+ init_completion(&cb.complete);
+ status = cpaCySymPerformOp(cy_inst_handle, &cb, &op_data,
+ &src_buffer_list, &dst_buffer_list, NULL);
+ if (status != CPA_STATUS_SUCCESS)
+ goto fail;
+
+ /* we now wait until the completion of the operation. */
+ wait_for_completion(&cb.complete);
+
+ if (cb.verify_result == CPA_FALSE) {
+ status = CPA_STATUS_FAIL;
+ goto fail;
+ }
+
+ if (dir == QAT_ENCRYPT) {
+ /* if dir is QAT_ENCRYPT, save pDigestResult to digest_buf */
+ bcopy(op_data.pDigestResult, digest_buf, ZIO_DATA_MAC_LEN);
+ QAT_STAT_INCR(encrypt_total_out_bytes, enc_len);
+ } else {
+ QAT_STAT_INCR(decrypt_total_out_bytes, enc_len);
+ }
+
+fail:
+ if (status != CPA_STATUS_SUCCESS)
+ QAT_STAT_BUMP(crypt_fails);
+
+ for (i = 0; i < in_page_num; i++)
+ kunmap(in_pages[i]);
+ for (i = 0; i < out_page_num; i++)
+ kunmap(out_pages[i]);
+
+ cpaCySymRemoveSession(cy_inst_handle, cy_session_ctx);
+ if (aad_len > 0)
+ QAT_PHYS_CONTIG_FREE(op_data.pAdditionalAuthData);
+ QAT_PHYS_CONTIG_FREE(op_data.pIv);
+ QAT_PHYS_CONTIG_FREE(op_data.pDigestResult);
+ QAT_PHYS_CONTIG_FREE(src_buffer_list.pPrivateMetaData);
+ QAT_PHYS_CONTIG_FREE(dst_buffer_list.pPrivateMetaData);
+ QAT_PHYS_CONTIG_FREE(cy_session_ctx);
+ QAT_PHYS_CONTIG_FREE(flat_src_buf_array);
+ QAT_PHYS_CONTIG_FREE(flat_dst_buf_array);
+
+ return (status);
+}
+
+int
+qat_checksum(uint64_t cksum, uint8_t *buf, uint64_t size, zio_cksum_t *zcp)
+{
+ CpaStatus status;
+ Cpa16U i;
+ CpaInstanceHandle cy_inst_handle;
+ Cpa16U nr_bufs = (size >> PAGE_SHIFT) + 2;
+ Cpa32U bytes_left = 0;
+ Cpa8S *data = NULL;
+ CpaCySymSessionCtx *cy_session_ctx = NULL;
+ cy_callback_t cb;
+ Cpa8U *digest_buffer = NULL;
+ CpaCySymOpData op_data = { 0 };
+ CpaBufferList src_buffer_list = { 0 };
+ CpaFlatBuffer *flat_src_buf_array = NULL;
+ CpaFlatBuffer *flat_src_buf = NULL;
+ struct page *in_pages[MAX_PAGE_NUM];
+ Cpa32U page_num = 0;
+ Cpa32U page_off = 0;
+
+ QAT_STAT_BUMP(cksum_requests);
+ QAT_STAT_INCR(cksum_total_in_bytes, size);
+
+ i = (Cpa32U)atomic_inc_32_nv(&inst_num) % num_inst;
+ cy_inst_handle = cy_inst_handles[i];
+
+ status = qat_init_checksum_session_ctx(cy_inst_handle,
+ &cy_session_ctx, cksum);
+ if (status != CPA_STATUS_SUCCESS) {
+ /* don't count unsupported checksums as a failure */
+ if (cksum == ZIO_CHECKSUM_SHA256 ||
+ cksum == ZIO_CHECKSUM_SHA512)
+ QAT_STAT_BUMP(cksum_fails);
+ return (status);
+ }
+
+ /*
+ * We increment nr_bufs by 2 to allow us to handle non
+ * page-aligned buffer addresses and buffers whose sizes
+ * are not divisible by PAGE_SIZE.
+ */
+ status = qat_init_cy_buffer_lists(cy_inst_handle, nr_bufs,
+ &src_buffer_list, &src_buffer_list);
+ if (status != CPA_STATUS_SUCCESS)
+ goto fail;
+
+ status = QAT_PHYS_CONTIG_ALLOC(&flat_src_buf_array,
+ nr_bufs * sizeof (CpaFlatBuffer));
+ if (status != CPA_STATUS_SUCCESS)
+ goto fail;
+ status = QAT_PHYS_CONTIG_ALLOC(&digest_buffer,
+ sizeof (zio_cksum_t));
+ if (status != CPA_STATUS_SUCCESS)
+ goto fail;
+
+ bytes_left = size;
+ data = buf;
+ flat_src_buf = flat_src_buf_array;
+ while (bytes_left > 0) {
+ page_off = ((long)data & ~PAGE_MASK);
+ in_pages[page_num] = qat_mem_to_page(data);
+ flat_src_buf->pData = kmap(in_pages[page_num]) + page_off;
+ flat_src_buf->dataLenInBytes =
+ min((long)PAGE_SIZE - page_off, (long)bytes_left);
+ data += flat_src_buf->dataLenInBytes;
+ bytes_left -= flat_src_buf->dataLenInBytes;
+ flat_src_buf++;
+ page_num++;
+ }
+ src_buffer_list.pBuffers = flat_src_buf_array;
+ src_buffer_list.numBuffers = page_num;
+
+ op_data.sessionCtx = cy_session_ctx;
+ op_data.packetType = CPA_CY_SYM_PACKET_TYPE_FULL;
+ op_data.hashStartSrcOffsetInBytes = 0;
+ op_data.messageLenToHashInBytes = size;
+ op_data.pDigestResult = digest_buffer;
+
+ cb.verify_result = CPA_FALSE;
+ init_completion(&cb.complete);
+ status = cpaCySymPerformOp(cy_inst_handle, &cb, &op_data,
+ &src_buffer_list, &src_buffer_list, NULL);
+ if (status != CPA_STATUS_SUCCESS)
+ goto fail;
+
+ /* we now wait until the completion of the operation. */
+ wait_for_completion(&cb.complete);
+
+ if (cb.verify_result == CPA_FALSE) {
+ status = CPA_STATUS_FAIL;
+ goto fail;
+ }
+
+ bcopy(digest_buffer, zcp, sizeof (zio_cksum_t));
+
+fail:
+ if (status != CPA_STATUS_SUCCESS)
+ QAT_STAT_BUMP(cksum_fails);
+
+ for (i = 0; i < page_num; i++)
+ kunmap(in_pages[i]);
+
+ cpaCySymRemoveSession(cy_inst_handle, cy_session_ctx);
+ QAT_PHYS_CONTIG_FREE(digest_buffer);
+ QAT_PHYS_CONTIG_FREE(src_buffer_list.pPrivateMetaData);
+ QAT_PHYS_CONTIG_FREE(cy_session_ctx);
+ QAT_PHYS_CONTIG_FREE(flat_src_buf_array);
+
+ return (status);
+}
+
+static int
+param_set_qat_encrypt(const char *val, zfs_kernel_param_t *kp)
+{
+ int ret;
+ int *pvalue = kp->arg;
+ ret = param_set_int(val, kp);
+ if (ret)
+ return (ret);
+ /*
+ * zfs_qat_encrypt_disable = 0: enable qat encrypt
+ * try to initialize qat instance if it has not been done
+ */
+ if (*pvalue == 0 && !qat_cy_init_done) {
+ ret = qat_cy_init();
+ if (ret != 0) {
+ zfs_qat_encrypt_disable = 1;
+ return (ret);
+ }
+ }
+ return (ret);
+}
+
+static int
+param_set_qat_checksum(const char *val, zfs_kernel_param_t *kp)
+{
+ int ret;
+ int *pvalue = kp->arg;
+ ret = param_set_int(val, kp);
+ if (ret)
+ return (ret);
+ /*
+	 * zfs_qat_checksum_disable = 0: enable qat checksum
+ * try to initialize qat instance if it has not been done
+ */
+ if (*pvalue == 0 && !qat_cy_init_done) {
+ ret = qat_cy_init();
+ if (ret != 0) {
+ zfs_qat_checksum_disable = 1;
+ return (ret);
+ }
+ }
+ return (ret);
+}
+
+module_param_call(zfs_qat_encrypt_disable, param_set_qat_encrypt,
+ param_get_int, &zfs_qat_encrypt_disable, 0644);
+MODULE_PARM_DESC(zfs_qat_encrypt_disable, "Enable/Disable QAT encryption");
+
+module_param_call(zfs_qat_checksum_disable, param_set_qat_checksum,
+ param_get_int, &zfs_qat_checksum_disable, 0644);
+MODULE_PARM_DESC(zfs_qat_checksum_disable, "Enable/Disable QAT checksumming");
+
+#endif
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/spa_misc_os.c b/sys/contrib/openzfs/module/os/linux/zfs/spa_misc_os.c
new file mode 100644
index 000000000000..5672cd6d5c5e
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/zfs/spa_misc_os.c
@@ -0,0 +1,110 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2019 by Delphix. All rights reserved.
+ * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright 2013 Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2017 Datto Inc.
+ * Copyright (c) 2017, Intel Corporation.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa_impl.h>
+#include <sys/spa.h>
+#include <sys/txg.h>
+#include <sys/unique.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_prop.h>
+#include <sys/fm/util.h>
+#include <sys/dsl_scan.h>
+#include <sys/fs/zfs.h>
+#include <sys/kstat.h>
+#include "zfs_prop.h"
+
+
+int
+param_set_deadman_failmode(const char *val, zfs_kernel_param_t *kp)
+{
+ int error;
+
+ error = -param_set_deadman_failmode_common(val);
+ if (error == 0)
+ error = param_set_charp(val, kp);
+
+ return (error);
+}
+
+int
+param_set_deadman_ziotime(const char *val, zfs_kernel_param_t *kp)
+{
+ int error;
+
+ error = param_set_ulong(val, kp);
+ if (error < 0)
+ return (SET_ERROR(error));
+
+ spa_set_deadman_ziotime(MSEC2NSEC(zfs_deadman_ziotime_ms));
+
+ return (0);
+}
+
+int
+param_set_deadman_synctime(const char *val, zfs_kernel_param_t *kp)
+{
+ int error;
+
+ error = param_set_ulong(val, kp);
+ if (error < 0)
+ return (SET_ERROR(error));
+
+ spa_set_deadman_synctime(MSEC2NSEC(zfs_deadman_synctime_ms));
+
+ return (0);
+}
+
+int
+param_set_slop_shift(const char *buf, zfs_kernel_param_t *kp)
+{
+ unsigned long val;
+ int error;
+
+ error = kstrtoul(buf, 0, &val);
+ if (error)
+ return (SET_ERROR(error));
+
+ if (val < 1 || val > 31)
+ return (SET_ERROR(-EINVAL));
+
+ error = param_set_int(buf, kp);
+ if (error < 0)
+ return (SET_ERROR(error));
+
+ return (0);
+}
+
+const char *
+spa_history_zone(void)
+{
+ return ("linux");
+}
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/trace.c b/sys/contrib/openzfs/module/os/linux/zfs/trace.c
new file mode 100644
index 000000000000..a690822ae14c
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/zfs/trace.c
@@ -0,0 +1,55 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Each DTRACE_PROBE must define its trace point in one (and only one)
+ * source file, so this dummy file exists for that purpose.
+ */
+
+#include <sys/multilist.h>
+#include <sys/arc_impl.h>
+#include <sys/vdev_impl.h>
+#include <sys/zio.h>
+#include <sys/dbuf.h>
+#include <sys/dmu_objset.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dmu_tx.h>
+#include <sys/dnode.h>
+#include <sys/zfs_znode.h>
+#include <sys/zil_impl.h>
+
+#ifdef _KERNEL
+#define CREATE_TRACE_POINTS
+#include <sys/trace.h>
+#include <sys/trace_acl.h>
+#include <sys/trace_arc.h>
+#include <sys/trace_dbgmsg.h>
+#include <sys/trace_dbuf.h>
+#include <sys/trace_dmu.h>
+#include <sys/trace_dnode.h>
+#include <sys/trace_multilist.h>
+#include <sys/trace_rrwlock.h>
+#include <sys/trace_txg.h>
+#include <sys/trace_vdev.h>
+#include <sys/trace_zil.h>
+#include <sys/trace_zio.h>
+#include <sys/trace_zrlock.h>
+#endif
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c b/sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c
new file mode 100644
index 000000000000..b373f2c2e83c
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/zfs/vdev_disk.c
@@ -0,0 +1,919 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (C) 2008-2010 Lawrence Livermore National Security, LLC.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Rewritten for Linux by Brian Behlendorf <behlendorf1@llnl.gov>.
+ * LLNL-CODE-403049.
+ * Copyright (c) 2012, 2019 by Delphix. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa_impl.h>
+#include <sys/vdev_disk.h>
+#include <sys/vdev_impl.h>
+#include <sys/vdev_trim.h>
+#include <sys/abd.h>
+#include <sys/fs/zfs.h>
+#include <sys/zio.h>
+#include <linux/blkpg.h>
+#include <linux/msdos_fs.h>
+#include <linux/vfs_compat.h>
+
+typedef struct vdev_disk {
+ struct block_device *vd_bdev;
+ krwlock_t vd_lock;
+} vdev_disk_t;
+
+/*
+ * Unique identifier for the exclusive vdev holder.
+ */
+static void *zfs_vdev_holder = VDEV_HOLDER;
+
+/*
+ * Wait up to zfs_vdev_open_timeout_ms milliseconds before determining the
+ * device is missing. The missing path may be transient since the links
+ * can be briefly removed and recreated in response to udev events.
+ */
+static unsigned zfs_vdev_open_timeout_ms = 1000;
+
+/*
+ * Size of the "reserved" partition, in blocks.
+ */
+#define EFI_MIN_RESV_SIZE (16 * 1024)
+
+/*
+ * Virtual device vector for disks.
+ */
+typedef struct dio_request {
+ zio_t *dr_zio; /* Parent ZIO */
+ atomic_t dr_ref; /* References */
+ int dr_error; /* Bio error */
+ int dr_bio_count; /* Count of bio's */
+ struct bio *dr_bio[0]; /* Attached bio's */
+} dio_request_t;
+
+static fmode_t
+vdev_bdev_mode(spa_mode_t spa_mode)
+{
+ fmode_t mode = 0;
+
+ if (spa_mode & SPA_MODE_READ)
+ mode |= FMODE_READ;
+
+ if (spa_mode & SPA_MODE_WRITE)
+ mode |= FMODE_WRITE;
+
+ return (mode);
+}
+
+/*
+ * Returns the usable capacity (in bytes) for the partition or disk.
+ */
+static uint64_t
+bdev_capacity(struct block_device *bdev)
+{
+ return (i_size_read(bdev->bd_inode));
+}
+
+#if !defined(HAVE_BDEV_WHOLE)
+static inline struct block_device *
+bdev_whole(struct block_device *bdev)
+{
+ return (bdev->bd_contains);
+}
+#endif
+
+/*
+ * Returns the maximum expansion capacity of the block device (in bytes).
+ *
+ * It is possible to expand a vdev when it has been created as a wholedisk
+ * and the containing block device has increased in capacity. Or when the
+ * partition containing the pool has been manually increased in size.
+ *
+ * This function is only responsible for calculating the potential expansion
+ * size so it can be reported by 'zpool list'. efi_use_whole_disk() is
+ * responsible for verifying the expected partition layout in the wholedisk
+ * case, and updating the partition table if appropriate. Once the partition
+ * size has been increased, the additional capacity will be visible using
+ * bdev_capacity().
+ *
+ * The returned maximum expansion capacity is always expected to be larger
+ * than, or at the very least equal to, the usable capacity, to prevent
+ * overestimating the pool expandsize.
+ */
+static uint64_t
+bdev_max_capacity(struct block_device *bdev, uint64_t wholedisk)
+{
+ uint64_t psize;
+ int64_t available;
+
+ if (wholedisk && bdev != bdev_whole(bdev)) {
+ /*
+		 * When reporting maximum expansion capacity for a wholedisk,
+		 * deduct any capacity which is expected to be lost due to
+		 * alignment restrictions. Over-reporting this value isn't
+		 * harmful and would only result in slightly less capacity
+ * than expected post expansion.
+ * The estimated available space may be slightly smaller than
+ * bdev_capacity() for devices where the number of sectors is
+ * not a multiple of the alignment size and the partition layout
+ * is keeping less than PARTITION_END_ALIGNMENT bytes after the
+ * "reserved" EFI partition: in such cases return the device
+ * usable capacity.
+ */
+ available = i_size_read(bdev_whole(bdev)->bd_inode) -
+ ((EFI_MIN_RESV_SIZE + NEW_START_BLOCK +
+ PARTITION_END_ALIGNMENT) << SECTOR_BITS);
+ psize = MAX(available, bdev_capacity(bdev));
+ } else {
+ psize = bdev_capacity(bdev);
+ }
+
+ return (psize);
+}
+
+static void
+vdev_disk_error(zio_t *zio)
+{
+ /*
+ * This function can be called in interrupt context, for instance while
+ * handling IRQs coming from a misbehaving disk device; use printk()
+ * which is safe from any context.
+ */
+ printk(KERN_WARNING "zio pool=%s vdev=%s error=%d type=%d "
+ "offset=%llu size=%llu flags=%x\n", spa_name(zio->io_spa),
+ zio->io_vd->vdev_path, zio->io_error, zio->io_type,
+ (u_longlong_t)zio->io_offset, (u_longlong_t)zio->io_size,
+ zio->io_flags);
+}
+
+static int
+vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize,
+ uint64_t *logical_ashift, uint64_t *physical_ashift)
+{
+ struct block_device *bdev;
+ fmode_t mode = vdev_bdev_mode(spa_mode(v->vdev_spa));
+ hrtime_t timeout = MSEC2NSEC(zfs_vdev_open_timeout_ms);
+ vdev_disk_t *vd;
+
+ /* Must have a pathname and it must be absolute. */
+ if (v->vdev_path == NULL || v->vdev_path[0] != '/') {
+ v->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
+ vdev_dbgmsg(v, "invalid vdev_path");
+ return (SET_ERROR(EINVAL));
+ }
+
+ /*
+	 * Reopen the device if it is currently open. When expanding a
+	 * partition, force a re-scan of the partition table if userland
+	 * did not take care of this already. We need to do this while closed
+	 * in order to get an accurate updated block device size. Then,
+	 * since udev may need to recreate the device links, increase the
+	 * open retry timeout before reporting the device as unavailable.
+ */
+ vd = v->vdev_tsd;
+ if (vd) {
+ char disk_name[BDEVNAME_SIZE + 6] = "/dev/";
+ boolean_t reread_part = B_FALSE;
+
+ rw_enter(&vd->vd_lock, RW_WRITER);
+ bdev = vd->vd_bdev;
+ vd->vd_bdev = NULL;
+
+ if (bdev) {
+ if (v->vdev_expanding && bdev != bdev_whole(bdev)) {
+ bdevname(bdev_whole(bdev), disk_name + 5);
+ /*
+ * If userland has BLKPG_RESIZE_PARTITION,
+ * then it should have updated the partition
+ * table already. We can detect this by
+ * comparing our current physical size
+ * with that of the device. If they are
+ * the same, then we must not have
+ * BLKPG_RESIZE_PARTITION or it failed to
+				 * update the partition table online. We
+				 * fall back to rescanning the partition
+				 * table from the kernel below. However,
+ * if the capacity already reflects the
+ * updated partition, then we skip
+ * rescanning the partition table here.
+ */
+ if (v->vdev_psize == bdev_capacity(bdev))
+ reread_part = B_TRUE;
+ }
+
+ blkdev_put(bdev, mode | FMODE_EXCL);
+ }
+
+ if (reread_part) {
+ bdev = blkdev_get_by_path(disk_name, mode | FMODE_EXCL,
+ zfs_vdev_holder);
+ if (!IS_ERR(bdev)) {
+ int error = vdev_bdev_reread_part(bdev);
+ blkdev_put(bdev, mode | FMODE_EXCL);
+ if (error == 0) {
+ timeout = MSEC2NSEC(
+ zfs_vdev_open_timeout_ms * 2);
+ }
+ }
+ }
+ } else {
+ vd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP);
+
+ rw_init(&vd->vd_lock, NULL, RW_DEFAULT, NULL);
+ rw_enter(&vd->vd_lock, RW_WRITER);
+ }
+
+ /*
+ * Devices are always opened by the path provided at configuration
+ * time. This means that if the provided path is a udev by-id path
+ * then drives may be re-cabled without an issue. If the provided
+ * path is a udev by-path path, then the physical location information
+ * will be preserved. This can be critical for more complicated
+ * configurations where drives are located in specific physical
+	 * locations to maximize the system's tolerance to component failure.
+ *
+ * Alternatively, you can provide your own udev rule to flexibly map
+ * the drives as you see fit. It is not advised that you use the
+ * /dev/[hd]d devices which may be reordered due to probing order.
+ * Devices in the wrong locations will be detected by the higher
+ * level vdev validation.
+ *
+ * The specified paths may be briefly removed and recreated in
+ * response to udev events. This should be exceptionally unlikely
+ * because the zpool command makes every effort to verify these paths
+ * have already settled prior to reaching this point. Therefore,
+	 * an ENOENT failure at this point is highly likely to be transient
+ * and it is reasonable to sleep and retry before giving up. In
+ * practice delays have been observed to be on the order of 100ms.
+ */
+ hrtime_t start = gethrtime();
+ bdev = ERR_PTR(-ENXIO);
+ while (IS_ERR(bdev) && ((gethrtime() - start) < timeout)) {
+ bdev = blkdev_get_by_path(v->vdev_path, mode | FMODE_EXCL,
+ zfs_vdev_holder);
+ if (unlikely(PTR_ERR(bdev) == -ENOENT)) {
+ schedule_timeout(MSEC_TO_TICK(10));
+ } else if (IS_ERR(bdev)) {
+ break;
+ }
+ }
+
+ if (IS_ERR(bdev)) {
+ int error = -PTR_ERR(bdev);
+ vdev_dbgmsg(v, "open error=%d timeout=%llu/%llu", error,
+ (u_longlong_t)(gethrtime() - start),
+ (u_longlong_t)timeout);
+ vd->vd_bdev = NULL;
+ v->vdev_tsd = vd;
+ rw_exit(&vd->vd_lock);
+ return (SET_ERROR(error));
+ } else {
+ vd->vd_bdev = bdev;
+ v->vdev_tsd = vd;
+ rw_exit(&vd->vd_lock);
+ }
+
+ struct request_queue *q = bdev_get_queue(vd->vd_bdev);
+
+ /* Determine the physical block size */
+ int physical_block_size = bdev_physical_block_size(vd->vd_bdev);
+
+ /* Determine the logical block size */
+ int logical_block_size = bdev_logical_block_size(vd->vd_bdev);
+
+ /* Clear the nowritecache bit, causes vdev_reopen() to try again. */
+ v->vdev_nowritecache = B_FALSE;
+
+ /* Set when device reports it supports TRIM. */
+ v->vdev_has_trim = !!blk_queue_discard(q);
+
+ /* Set when device reports it supports secure TRIM. */
+ v->vdev_has_securetrim = !!blk_queue_discard_secure(q);
+
+ /* Inform the ZIO pipeline that we are non-rotational */
+ v->vdev_nonrot = blk_queue_nonrot(q);
+
+ /* Physical volume size in bytes for the partition */
+ *psize = bdev_capacity(vd->vd_bdev);
+
+ /* Physical volume size in bytes including possible expansion space */
+ *max_psize = bdev_max_capacity(vd->vd_bdev, v->vdev_wholedisk);
+
+ /* Based on the minimum sector size set the block size */
+ *physical_ashift = highbit64(MAX(physical_block_size,
+ SPA_MINBLOCKSIZE)) - 1;
+
+ *logical_ashift = highbit64(MAX(logical_block_size,
+ SPA_MINBLOCKSIZE)) - 1;
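+	/*
+	 * Illustrative example: a 4096-byte physical block size yields
+	 * highbit64(4096) - 1 = 12, i.e. an ashift of 12.
+	 */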
+
+ return (0);
+}
+
+static void
+vdev_disk_close(vdev_t *v)
+{
+ vdev_disk_t *vd = v->vdev_tsd;
+
+ if (v->vdev_reopening || vd == NULL)
+ return;
+
+ if (vd->vd_bdev != NULL) {
+ blkdev_put(vd->vd_bdev,
+ vdev_bdev_mode(spa_mode(v->vdev_spa)) | FMODE_EXCL);
+ }
+
+ rw_destroy(&vd->vd_lock);
+ kmem_free(vd, sizeof (vdev_disk_t));
+ v->vdev_tsd = NULL;
+}
+
+static dio_request_t *
+vdev_disk_dio_alloc(int bio_count)
+{
+ dio_request_t *dr = kmem_zalloc(sizeof (dio_request_t) +
+ sizeof (struct bio *) * bio_count, KM_SLEEP);
+ atomic_set(&dr->dr_ref, 0);
+ dr->dr_bio_count = bio_count;
+ dr->dr_error = 0;
+
+ for (int i = 0; i < dr->dr_bio_count; i++)
+ dr->dr_bio[i] = NULL;
+
+ return (dr);
+}
+
+static void
+vdev_disk_dio_free(dio_request_t *dr)
+{
+ int i;
+
+ for (i = 0; i < dr->dr_bio_count; i++)
+ if (dr->dr_bio[i])
+ bio_put(dr->dr_bio[i]);
+
+ kmem_free(dr, sizeof (dio_request_t) +
+ sizeof (struct bio *) * dr->dr_bio_count);
+}
+
+static void
+vdev_disk_dio_get(dio_request_t *dr)
+{
+ atomic_inc(&dr->dr_ref);
+}
+
+static int
+vdev_disk_dio_put(dio_request_t *dr)
+{
+ int rc = atomic_dec_return(&dr->dr_ref);
+
+ /*
+	 * Free the dio_request when the last reference is dropped and
+	 * ensure zio_delay_interrupt() is called only once with the
+	 * correct zio.
+ */
+ if (rc == 0) {
+ zio_t *zio = dr->dr_zio;
+ int error = dr->dr_error;
+
+ vdev_disk_dio_free(dr);
+
+ if (zio) {
+ zio->io_error = error;
+ ASSERT3S(zio->io_error, >=, 0);
+ if (zio->io_error)
+ vdev_disk_error(zio);
+
+ zio_delay_interrupt(zio);
+ }
+ }
+
+ return (rc);
+}
+
+BIO_END_IO_PROTO(vdev_disk_physio_completion, bio, error)
+{
+ dio_request_t *dr = bio->bi_private;
+ int rc;
+
+ if (dr->dr_error == 0) {
+#ifdef HAVE_1ARG_BIO_END_IO_T
+ dr->dr_error = BIO_END_IO_ERROR(bio);
+#else
+ if (error)
+ dr->dr_error = -(error);
+ else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
+ dr->dr_error = EIO;
+#endif
+ }
+
+ /* Drop reference acquired by __vdev_disk_physio */
+ rc = vdev_disk_dio_put(dr);
+}
+
+static inline void
+vdev_submit_bio_impl(struct bio *bio)
+{
+#ifdef HAVE_1ARG_SUBMIT_BIO
+ submit_bio(bio);
+#else
+ submit_bio(0, bio);
+#endif
+}
+
+/*
+ * preempt_schedule_notrace is GPL-only which breaks the ZFS build, so
+ * replace it with preempt_schedule under the following condition:
+ */
+#if defined(CONFIG_ARM64) && \
+ defined(CONFIG_PREEMPTION) && \
+ defined(CONFIG_BLK_CGROUP)
+#define preempt_schedule_notrace(x) preempt_schedule(x)
+#endif
+
+#ifdef HAVE_BIO_SET_DEV
+#if defined(CONFIG_BLK_CGROUP) && defined(HAVE_BIO_SET_DEV_GPL_ONLY)
+/*
+ * The Linux 5.5 kernel updated percpu_ref_tryget() which is inlined by
+ * blkg_tryget() to use rcu_read_lock() instead of rcu_read_lock_sched().
+ * As a side effect the function was converted to GPL-only. Define our
+ * own version when needed which uses rcu_read_lock_sched().
+ */
+#if defined(HAVE_BLKG_TRYGET_GPL_ONLY)
+static inline bool
+vdev_blkg_tryget(struct blkcg_gq *blkg)
+{
+ struct percpu_ref *ref = &blkg->refcnt;
+ unsigned long __percpu *count;
+ bool rc;
+
+ rcu_read_lock_sched();
+
+ if (__ref_is_percpu(ref, &count)) {
+ this_cpu_inc(*count);
+ rc = true;
+ } else {
+#ifdef ZFS_PERCPU_REF_COUNT_IN_DATA
+ rc = atomic_long_inc_not_zero(&ref->data->count);
+#else
+ rc = atomic_long_inc_not_zero(&ref->count);
+#endif
+ }
+
+ rcu_read_unlock_sched();
+
+ return (rc);
+}
+#elif defined(HAVE_BLKG_TRYGET)
+#define vdev_blkg_tryget(bg) blkg_tryget(bg)
+#endif
+/*
+ * The Linux 5.0 kernel updated the bio_set_dev() macro so it calls the
+ * GPL-only bio_associate_blkg() symbol thus inadvertently converting
+ * the entire macro. Provide a minimal version which always assigns the
+ * request queue's root_blkg to the bio.
+ */
+static inline void
+vdev_bio_associate_blkg(struct bio *bio)
+{
+ struct request_queue *q = bio->bi_disk->queue;
+
+ ASSERT3P(q, !=, NULL);
+ ASSERT3P(bio->bi_blkg, ==, NULL);
+
+ if (q->root_blkg && vdev_blkg_tryget(q->root_blkg))
+ bio->bi_blkg = q->root_blkg;
+}
+#define bio_associate_blkg vdev_bio_associate_blkg
+#endif
+#else
+/*
+ * Provide a bio_set_dev() helper macro for pre-Linux 4.14 kernels.
+ */
+static inline void
+bio_set_dev(struct bio *bio, struct block_device *bdev)
+{
+ bio->bi_bdev = bdev;
+}
+#endif /* HAVE_BIO_SET_DEV */
+
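+/*
+ * Submit the bio with current->bio_list temporarily cleared so the block
+ * layer dispatches it immediately instead of deferring it onto the calling
+ * task's bio list (see the recursion handling in generic_make_request()).
+ */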
+static inline void
+vdev_submit_bio(struct bio *bio)
+{
+ struct bio_list *bio_list = current->bio_list;
+ current->bio_list = NULL;
+ vdev_submit_bio_impl(bio);
+ current->bio_list = bio_list;
+}
+
+static int
+__vdev_disk_physio(struct block_device *bdev, zio_t *zio,
+ size_t io_size, uint64_t io_offset, int rw, int flags)
+{
+ dio_request_t *dr;
+ uint64_t abd_offset;
+ uint64_t bio_offset;
+ int bio_size;
+ int bio_count = 16;
+ int error = 0;
+ struct blk_plug plug;
+
+ /*
+ * Accessing outside the block device is never allowed.
+ */
+ if (io_offset + io_size > bdev->bd_inode->i_size) {
+ vdev_dbgmsg(zio->io_vd,
+ "Illegal access %llu size %llu, device size %llu",
+ io_offset, io_size, i_size_read(bdev->bd_inode));
+ return (SET_ERROR(EIO));
+ }
+
+retry:
+ dr = vdev_disk_dio_alloc(bio_count);
+
+ if (zio && !(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)))
+ bio_set_flags_failfast(bdev, &flags);
+
+ dr->dr_zio = zio;
+
+ /*
+	 * Since bios can have up to BIO_MAX_PAGES=256 iovecs, each of which
+	 * is at least 512 bytes and at most PAGESIZE (typically 4K), one bio
+	 * can cover at least 128KB and at most 1MB. When the required number
+	 * of iovecs exceeds this, we are forced to break the I/O into
+	 * multiple bios and wait for them all to complete. This is likely if the
+ * recordsize property is increased beyond 1MB. The default
+ * bio_count=16 should typically accommodate the maximum-size zio of
+ * 16MB.
+ */
+
+ abd_offset = 0;
+ bio_offset = io_offset;
+ bio_size = io_size;
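+	/*
+	 * Note the inclusive loop bound: the extra pass detects that
+	 * bio_count bios were not enough, in which case the dio is freed
+	 * and the request is retried above with a doubled bio_count.
+	 */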
+ for (int i = 0; i <= dr->dr_bio_count; i++) {
+
+ /* Finished constructing bio's for given buffer */
+ if (bio_size <= 0)
+ break;
+
+ /*
+ * If additional bio's are required, we have to retry, but
+ * this should be rare - see the comment above.
+ */
+ if (dr->dr_bio_count == i) {
+ vdev_disk_dio_free(dr);
+ bio_count *= 2;
+ goto retry;
+ }
+
+ /* bio_alloc() with __GFP_WAIT never returns NULL */
+ dr->dr_bio[i] = bio_alloc(GFP_NOIO,
+ MIN(abd_nr_pages_off(zio->io_abd, bio_size, abd_offset),
+ BIO_MAX_PAGES));
+ if (unlikely(dr->dr_bio[i] == NULL)) {
+ vdev_disk_dio_free(dr);
+ return (SET_ERROR(ENOMEM));
+ }
+
+ /* Matching put called by vdev_disk_physio_completion */
+ vdev_disk_dio_get(dr);
+
+ bio_set_dev(dr->dr_bio[i], bdev);
+ BIO_BI_SECTOR(dr->dr_bio[i]) = bio_offset >> 9;
+ dr->dr_bio[i]->bi_end_io = vdev_disk_physio_completion;
+ dr->dr_bio[i]->bi_private = dr;
+ bio_set_op_attrs(dr->dr_bio[i], rw, flags);
+
+ /* Remaining size is returned to become the new size */
+ bio_size = abd_bio_map_off(dr->dr_bio[i], zio->io_abd,
+ bio_size, abd_offset);
+
+ /* Advance in buffer and construct another bio if needed */
+ abd_offset += BIO_BI_SIZE(dr->dr_bio[i]);
+ bio_offset += BIO_BI_SIZE(dr->dr_bio[i]);
+ }
+
+ /* Extra reference to protect dio_request during vdev_submit_bio */
+ vdev_disk_dio_get(dr);
+
+ if (dr->dr_bio_count > 1)
+ blk_start_plug(&plug);
+
+ /* Submit all bio's associated with this dio */
+ for (int i = 0; i < dr->dr_bio_count; i++) {
+ if (dr->dr_bio[i])
+ vdev_submit_bio(dr->dr_bio[i]);
+ }
+
+ if (dr->dr_bio_count > 1)
+ blk_finish_plug(&plug);
+
+ (void) vdev_disk_dio_put(dr);
+
+ return (error);
+}
+
+BIO_END_IO_PROTO(vdev_disk_io_flush_completion, bio, error)
+{
+ zio_t *zio = bio->bi_private;
+#ifdef HAVE_1ARG_BIO_END_IO_T
+ zio->io_error = BIO_END_IO_ERROR(bio);
+#else
+ zio->io_error = -error;
+#endif
+
+ if (zio->io_error && (zio->io_error == EOPNOTSUPP))
+ zio->io_vd->vdev_nowritecache = B_TRUE;
+
+ bio_put(bio);
+ ASSERT3S(zio->io_error, >=, 0);
+ if (zio->io_error)
+ vdev_disk_error(zio);
+ zio_interrupt(zio);
+}
+
+static int
+vdev_disk_io_flush(struct block_device *bdev, zio_t *zio)
+{
+ struct request_queue *q;
+ struct bio *bio;
+
+ q = bdev_get_queue(bdev);
+ if (!q)
+ return (SET_ERROR(ENXIO));
+
+ bio = bio_alloc(GFP_NOIO, 0);
+ /* bio_alloc() with __GFP_WAIT never returns NULL */
+ if (unlikely(bio == NULL))
+ return (SET_ERROR(ENOMEM));
+
+ bio->bi_end_io = vdev_disk_io_flush_completion;
+ bio->bi_private = zio;
+ bio_set_dev(bio, bdev);
+ bio_set_flush(bio);
+ vdev_submit_bio(bio);
+ invalidate_bdev(bdev);
+
+ return (0);
+}
+
+static void
+vdev_disk_io_start(zio_t *zio)
+{
+ vdev_t *v = zio->io_vd;
+ vdev_disk_t *vd = v->vdev_tsd;
+ unsigned long trim_flags = 0;
+ int rw, error;
+
+ /*
+ * If the vdev is closed, it's likely in the REMOVED or FAULTED state.
+ * Nothing to be done here but return failure.
+ */
+ if (vd == NULL) {
+ zio->io_error = ENXIO;
+ zio_interrupt(zio);
+ return;
+ }
+
+ rw_enter(&vd->vd_lock, RW_READER);
+
+ /*
+ * If the vdev is closed, it's likely due to a failed reopen and is
+ * in the UNAVAIL state. Nothing to be done here but return failure.
+ */
+ if (vd->vd_bdev == NULL) {
+ rw_exit(&vd->vd_lock);
+ zio->io_error = ENXIO;
+ zio_interrupt(zio);
+ return;
+ }
+
+ switch (zio->io_type) {
+ case ZIO_TYPE_IOCTL:
+
+ if (!vdev_readable(v)) {
+ rw_exit(&vd->vd_lock);
+ zio->io_error = SET_ERROR(ENXIO);
+ zio_interrupt(zio);
+ return;
+ }
+
+ switch (zio->io_cmd) {
+ case DKIOCFLUSHWRITECACHE:
+
+ if (zfs_nocacheflush)
+ break;
+
+ if (v->vdev_nowritecache) {
+ zio->io_error = SET_ERROR(ENOTSUP);
+ break;
+ }
+
+ error = vdev_disk_io_flush(vd->vd_bdev, zio);
+ if (error == 0) {
+ rw_exit(&vd->vd_lock);
+ return;
+ }
+
+ zio->io_error = error;
+
+ break;
+
+ default:
+ zio->io_error = SET_ERROR(ENOTSUP);
+ }
+
+ rw_exit(&vd->vd_lock);
+ zio_execute(zio);
+ return;
+ case ZIO_TYPE_WRITE:
+ rw = WRITE;
+ break;
+
+ case ZIO_TYPE_READ:
+ rw = READ;
+ break;
+
+ case ZIO_TYPE_TRIM:
+#if defined(BLKDEV_DISCARD_SECURE)
+ if (zio->io_trim_flags & ZIO_TRIM_SECURE)
+ trim_flags |= BLKDEV_DISCARD_SECURE;
+#endif
+ zio->io_error = -blkdev_issue_discard(vd->vd_bdev,
+ zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS,
+ trim_flags);
+
+ rw_exit(&vd->vd_lock);
+ zio_interrupt(zio);
+ return;
+
+ default:
+ rw_exit(&vd->vd_lock);
+ zio->io_error = SET_ERROR(ENOTSUP);
+ zio_interrupt(zio);
+ return;
+ }
+
+ zio->io_target_timestamp = zio_handle_io_delay(zio);
+ error = __vdev_disk_physio(vd->vd_bdev, zio,
+ zio->io_size, zio->io_offset, rw, 0);
+ rw_exit(&vd->vd_lock);
+
+ if (error) {
+ zio->io_error = error;
+ zio_interrupt(zio);
+ return;
+ }
+}
+
+static void
+vdev_disk_io_done(zio_t *zio)
+{
+ /*
+ * If the device returned EIO, we revalidate the media. If it is
+ * determined the media has changed this triggers the asynchronous
+ * removal of the device from the configuration.
+ */
+ if (zio->io_error == EIO) {
+ vdev_t *v = zio->io_vd;
+ vdev_disk_t *vd = v->vdev_tsd;
+
+ if (zfs_check_media_change(vd->vd_bdev)) {
+ invalidate_bdev(vd->vd_bdev);
+ v->vdev_remove_wanted = B_TRUE;
+ spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE);
+ }
+ }
+}
+
+static void
+vdev_disk_hold(vdev_t *vd)
+{
+ ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));
+
+ /* We must have a pathname, and it must be absolute. */
+ if (vd->vdev_path == NULL || vd->vdev_path[0] != '/')
+ return;
+
+ /*
+ * Only prefetch path and devid info if the device has
+ * never been opened.
+ */
+ if (vd->vdev_tsd != NULL)
+ return;
+
+}
+
+static void
+vdev_disk_rele(vdev_t *vd)
+{
+ ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));
+
+ /* XXX: Implement me as a vnode rele for the device */
+}
+
+vdev_ops_t vdev_disk_ops = {
+ .vdev_op_init = NULL,
+ .vdev_op_fini = NULL,
+ .vdev_op_open = vdev_disk_open,
+ .vdev_op_close = vdev_disk_close,
+ .vdev_op_asize = vdev_default_asize,
+ .vdev_op_min_asize = vdev_default_min_asize,
+ .vdev_op_min_alloc = NULL,
+ .vdev_op_io_start = vdev_disk_io_start,
+ .vdev_op_io_done = vdev_disk_io_done,
+ .vdev_op_state_change = NULL,
+ .vdev_op_need_resilver = NULL,
+ .vdev_op_hold = vdev_disk_hold,
+ .vdev_op_rele = vdev_disk_rele,
+ .vdev_op_remap = NULL,
+ .vdev_op_xlate = vdev_default_xlate,
+ .vdev_op_rebuild_asize = NULL,
+ .vdev_op_metaslab_init = NULL,
+ .vdev_op_config_generate = NULL,
+ .vdev_op_nparity = NULL,
+ .vdev_op_ndisks = NULL,
+ .vdev_op_type = VDEV_TYPE_DISK, /* name of this vdev type */
+ .vdev_op_leaf = B_TRUE /* leaf vdev */
+};
+
+/*
+ * The zfs_vdev_scheduler module option has been deprecated. Setting this
+ * value no longer has any effect. It has not yet been entirely removed
+ * to allow the module to be loaded if this option is specified in the
+ * /etc/modprobe.d/zfs.conf file. The following message will be logged.
+ */
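+/*
+ * For example, a (hypothetical) line such as
+ * "options zfs zfs_vdev_scheduler=none" in /etc/modprobe.d/zfs.conf is
+ * still accepted, but only results in the message below being printed.
+ */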
+static int
+param_set_vdev_scheduler(const char *val, zfs_kernel_param_t *kp)
+{
+ int error = param_set_charp(val, kp);
+ if (error == 0) {
+ printk(KERN_INFO "The 'zfs_vdev_scheduler' module option "
+ "is not supported.\n");
+ }
+
+ return (error);
+}
+
+char *zfs_vdev_scheduler = "unused";
+module_param_call(zfs_vdev_scheduler, param_set_vdev_scheduler,
+ param_get_charp, &zfs_vdev_scheduler, 0644);
+MODULE_PARM_DESC(zfs_vdev_scheduler, "I/O scheduler");
+
+int
+param_set_min_auto_ashift(const char *buf, zfs_kernel_param_t *kp)
+{
+ uint64_t val;
+ int error;
+
+ error = kstrtoull(buf, 0, &val);
+ if (error < 0)
+ return (SET_ERROR(error));
+
+ if (val < ASHIFT_MIN || val > zfs_vdev_max_auto_ashift)
+ return (SET_ERROR(-EINVAL));
+
+ error = param_set_ulong(buf, kp);
+ if (error < 0)
+ return (SET_ERROR(error));
+
+ return (0);
+}
+
+int
+param_set_max_auto_ashift(const char *buf, zfs_kernel_param_t *kp)
+{
+ uint64_t val;
+ int error;
+
+ error = kstrtoull(buf, 0, &val);
+ if (error < 0)
+ return (SET_ERROR(error));
+
+ if (val > ASHIFT_MAX || val < zfs_vdev_min_auto_ashift)
+ return (SET_ERROR(-EINVAL));
+
+ error = param_set_ulong(buf, kp);
+ if (error < 0)
+ return (SET_ERROR(error));
+
+ return (0);
+}
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/vdev_file.c b/sys/contrib/openzfs/module/os/linux/zfs/vdev_file.c
new file mode 100644
index 000000000000..bf8a13ae6154
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/zfs/vdev_file.c
@@ -0,0 +1,382 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2020 by Delphix. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/vdev_file.h>
+#include <sys/vdev_impl.h>
+#include <sys/vdev_trim.h>
+#include <sys/zio.h>
+#include <sys/fs/zfs.h>
+#include <sys/fm/fs/zfs.h>
+#include <sys/abd.h>
+#include <sys/fcntl.h>
+#include <sys/vnode.h>
+#include <sys/zfs_file.h>
+#ifdef _KERNEL
+#include <linux/falloc.h>
+#endif
+/*
+ * Virtual device vector for files.
+ */
+
+static taskq_t *vdev_file_taskq;
+
+/*
+ * By default, the logical/physical ashift for file vdevs is set to
+ * SPA_MINBLOCKSHIFT (9). This allows all file vdevs to use 512B (1 << 9)
+ * blocksizes. Users may opt to change one or both of these for testing
+ * or performance reasons. Care should be taken as these values will
+ * impact the vdev_ashift setting which can only be set at vdev creation
+ * time.
+ */
+unsigned long vdev_file_logical_ashift = SPA_MINBLOCKSHIFT;
+unsigned long vdev_file_physical_ashift = SPA_MINBLOCKSHIFT;
+
+static void
+vdev_file_hold(vdev_t *vd)
+{
+ ASSERT(vd->vdev_path != NULL);
+}
+
+static void
+vdev_file_rele(vdev_t *vd)
+{
+ ASSERT(vd->vdev_path != NULL);
+}
+
+static mode_t
+vdev_file_open_mode(spa_mode_t spa_mode)
+{
+ mode_t mode = 0;
+
+ if ((spa_mode & SPA_MODE_READ) && (spa_mode & SPA_MODE_WRITE)) {
+ mode = O_RDWR;
+ } else if (spa_mode & SPA_MODE_READ) {
+ mode = O_RDONLY;
+ } else if (spa_mode & SPA_MODE_WRITE) {
+ mode = O_WRONLY;
+ }
+
+ return (mode | O_LARGEFILE);
+}
+
+static int
+vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
+ uint64_t *logical_ashift, uint64_t *physical_ashift)
+{
+ vdev_file_t *vf;
+ zfs_file_t *fp;
+ zfs_file_attr_t zfa;
+ int error;
+
+ /*
+ * Rotational optimizations only make sense on block devices.
+ */
+ vd->vdev_nonrot = B_TRUE;
+
+ /*
+ * Allow TRIM on file based vdevs. This may not always be supported,
+ * since it depends on your kernel version and underlying filesystem
+	 * type, but it is always safe to attempt.
+ */
+ vd->vdev_has_trim = B_TRUE;
+
+ /*
+ * Disable secure TRIM on file based vdevs. There is no way to
+ * request this behavior from the underlying filesystem.
+ */
+ vd->vdev_has_securetrim = B_FALSE;
+
+ /*
+ * We must have a pathname, and it must be absolute.
+ */
+ if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') {
+ vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
+ return (SET_ERROR(EINVAL));
+ }
+
+ /*
+ * Reopen the device if it's not currently open. Otherwise,
+ * just update the physical size of the device.
+ */
+ if (vd->vdev_tsd != NULL) {
+ ASSERT(vd->vdev_reopening);
+ vf = vd->vdev_tsd;
+ goto skip_open;
+ }
+
+ vf = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_file_t), KM_SLEEP);
+
+ /*
+ * We always open the files from the root of the global zone, even if
+ * we're in a local zone. If the user has gotten to this point, the
+ * administrator has already decided that the pool should be available
+ * to local zone users, so the underlying devices should be as well.
+ */
+ ASSERT(vd->vdev_path != NULL && vd->vdev_path[0] == '/');
+
+ error = zfs_file_open(vd->vdev_path,
+ vdev_file_open_mode(spa_mode(vd->vdev_spa)), 0, &fp);
+ if (error) {
+ vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
+ return (error);
+ }
+
+ vf->vf_file = fp;
+
+#ifdef _KERNEL
+ /*
+ * Make sure it's a regular file.
+ */
+ if (zfs_file_getattr(fp, &zfa)) {
+ return (SET_ERROR(ENODEV));
+ }
+ if (!S_ISREG(zfa.zfa_mode)) {
+ vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
+ return (SET_ERROR(ENODEV));
+ }
+#endif
+
+skip_open:
+
+ error = zfs_file_getattr(vf->vf_file, &zfa);
+ if (error) {
+ vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
+ return (error);
+ }
+
+ *max_psize = *psize = zfa.zfa_size;
+ *logical_ashift = vdev_file_logical_ashift;
+ *physical_ashift = vdev_file_physical_ashift;
+
+ return (0);
+}
+
+static void
+vdev_file_close(vdev_t *vd)
+{
+ vdev_file_t *vf = vd->vdev_tsd;
+
+ if (vd->vdev_reopening || vf == NULL)
+ return;
+
+ if (vf->vf_file != NULL) {
+ (void) zfs_file_close(vf->vf_file);
+ }
+
+ vd->vdev_delayed_close = B_FALSE;
+ kmem_free(vf, sizeof (vdev_file_t));
+ vd->vdev_tsd = NULL;
+}
+
+static void
+vdev_file_io_strategy(void *arg)
+{
+ zio_t *zio = (zio_t *)arg;
+ vdev_t *vd = zio->io_vd;
+ vdev_file_t *vf = vd->vdev_tsd;
+ ssize_t resid;
+ void *buf;
+ loff_t off;
+ ssize_t size;
+ int err;
+
+ off = zio->io_offset;
+ size = zio->io_size;
+ resid = 0;
+
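+	/*
+	 * Reads borrow a scratch buffer and copy the data back into the abd
+	 * when it is returned; writes borrow a copy of the abd contents and
+	 * return it without copying back.
+	 */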
+ if (zio->io_type == ZIO_TYPE_READ) {
+ buf = abd_borrow_buf(zio->io_abd, zio->io_size);
+ err = zfs_file_pread(vf->vf_file, buf, size, off, &resid);
+ abd_return_buf_copy(zio->io_abd, buf, size);
+ } else {
+ buf = abd_borrow_buf_copy(zio->io_abd, zio->io_size);
+ err = zfs_file_pwrite(vf->vf_file, buf, size, off, &resid);
+ abd_return_buf(zio->io_abd, buf, size);
+ }
+ zio->io_error = err;
+ if (resid != 0 && zio->io_error == 0)
+ zio->io_error = SET_ERROR(ENOSPC);
+
+ zio_delay_interrupt(zio);
+}
+
+static void
+vdev_file_io_fsync(void *arg)
+{
+ zio_t *zio = (zio_t *)arg;
+ vdev_file_t *vf = zio->io_vd->vdev_tsd;
+
+ zio->io_error = zfs_file_fsync(vf->vf_file, O_SYNC | O_DSYNC);
+
+ zio_interrupt(zio);
+}
+
+static void
+vdev_file_io_start(zio_t *zio)
+{
+ vdev_t *vd = zio->io_vd;
+ vdev_file_t *vf = vd->vdev_tsd;
+
+ if (zio->io_type == ZIO_TYPE_IOCTL) {
+ /* XXPOLICY */
+ if (!vdev_readable(vd)) {
+ zio->io_error = SET_ERROR(ENXIO);
+ zio_interrupt(zio);
+ return;
+ }
+
+ switch (zio->io_cmd) {
+ case DKIOCFLUSHWRITECACHE:
+
+ if (zfs_nocacheflush)
+ break;
+
+ /*
+ * We cannot safely call vfs_fsync() when PF_FSTRANS
+ * is set in the current context. Filesystems like
+ * XFS include sanity checks to verify it is not
+ * already set, see xfs_vm_writepage(). Therefore
+ * the sync must be dispatched to a different context.
+ */
+ if (__spl_pf_fstrans_check()) {
+ VERIFY3U(taskq_dispatch(vdev_file_taskq,
+ vdev_file_io_fsync, zio, TQ_SLEEP), !=,
+ TASKQID_INVALID);
+ return;
+ }
+
+ zio->io_error = zfs_file_fsync(vf->vf_file,
+ O_SYNC | O_DSYNC);
+ break;
+ default:
+ zio->io_error = SET_ERROR(ENOTSUP);
+ }
+
+ zio_execute(zio);
+ return;
+ } else if (zio->io_type == ZIO_TYPE_TRIM) {
+ int mode = 0;
+
+ ASSERT3U(zio->io_size, !=, 0);
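+		/*
+		 * TRIM is implemented by punching a hole in the backing
+		 * file (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE on Linux).
+		 */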
+#ifdef __linux__
+ mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE;
+#endif
+ zio->io_error = zfs_file_fallocate(vf->vf_file,
+ mode, zio->io_offset, zio->io_size);
+ zio_execute(zio);
+ return;
+ }
+
+ zio->io_target_timestamp = zio_handle_io_delay(zio);
+
+ VERIFY3U(taskq_dispatch(vdev_file_taskq, vdev_file_io_strategy, zio,
+ TQ_SLEEP), !=, TASKQID_INVALID);
+}
+
+/* ARGSUSED */
+static void
+vdev_file_io_done(zio_t *zio)
+{
+}
+
+vdev_ops_t vdev_file_ops = {
+ .vdev_op_init = NULL,
+ .vdev_op_fini = NULL,
+ .vdev_op_open = vdev_file_open,
+ .vdev_op_close = vdev_file_close,
+ .vdev_op_asize = vdev_default_asize,
+ .vdev_op_min_asize = vdev_default_min_asize,
+ .vdev_op_min_alloc = NULL,
+ .vdev_op_io_start = vdev_file_io_start,
+ .vdev_op_io_done = vdev_file_io_done,
+ .vdev_op_state_change = NULL,
+ .vdev_op_need_resilver = NULL,
+ .vdev_op_hold = vdev_file_hold,
+ .vdev_op_rele = vdev_file_rele,
+ .vdev_op_remap = NULL,
+ .vdev_op_xlate = vdev_default_xlate,
+ .vdev_op_rebuild_asize = NULL,
+ .vdev_op_metaslab_init = NULL,
+ .vdev_op_config_generate = NULL,
+ .vdev_op_nparity = NULL,
+ .vdev_op_ndisks = NULL,
+ .vdev_op_type = VDEV_TYPE_FILE, /* name of this vdev type */
+ .vdev_op_leaf = B_TRUE /* leaf vdev */
+};
+
+void
+vdev_file_init(void)
+{
+ vdev_file_taskq = taskq_create("z_vdev_file", MAX(boot_ncpus, 16),
+ minclsyspri, boot_ncpus, INT_MAX, TASKQ_DYNAMIC);
+
+ VERIFY(vdev_file_taskq);
+}
+
+void
+vdev_file_fini(void)
+{
+ taskq_destroy(vdev_file_taskq);
+}
+
+/*
+ * From userland we access disks just like files.
+ */
+#ifndef _KERNEL
+
+vdev_ops_t vdev_disk_ops = {
+ .vdev_op_init = NULL,
+ .vdev_op_fini = NULL,
+ .vdev_op_open = vdev_file_open,
+ .vdev_op_close = vdev_file_close,
+ .vdev_op_asize = vdev_default_asize,
+ .vdev_op_min_asize = vdev_default_min_asize,
+ .vdev_op_min_alloc = NULL,
+ .vdev_op_io_start = vdev_file_io_start,
+ .vdev_op_io_done = vdev_file_io_done,
+ .vdev_op_state_change = NULL,
+ .vdev_op_need_resilver = NULL,
+ .vdev_op_hold = vdev_file_hold,
+ .vdev_op_rele = vdev_file_rele,
+ .vdev_op_remap = NULL,
+ .vdev_op_xlate = vdev_default_xlate,
+ .vdev_op_rebuild_asize = NULL,
+ .vdev_op_metaslab_init = NULL,
+ .vdev_op_config_generate = NULL,
+ .vdev_op_nparity = NULL,
+ .vdev_op_ndisks = NULL,
+ .vdev_op_type = VDEV_TYPE_DISK, /* name of this vdev type */
+ .vdev_op_leaf = B_TRUE /* leaf vdev */
+};
+
+#endif
+
+ZFS_MODULE_PARAM(zfs_vdev_file, vdev_file_, logical_ashift, ULONG, ZMOD_RW,
+ "Logical ashift for file-based devices");
+ZFS_MODULE_PARAM(zfs_vdev_file, vdev_file_, physical_ashift, ULONG, ZMOD_RW,
+ "Physical ashift for file-based devices");
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_acl.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_acl.c
new file mode 100644
index 000000000000..2628325c0ba9
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_acl.c
@@ -0,0 +1,2932 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright 2014 Nexenta Systems, Inc. All rights reserved.
+ */
+
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/time.h>
+#include <sys/sysmacros.h>
+#include <sys/vfs.h>
+#include <sys/vnode.h>
+#include <sys/sid.h>
+#include <sys/file.h>
+#include <sys/stat.h>
+#include <sys/kmem.h>
+#include <sys/cmn_err.h>
+#include <sys/errno.h>
+#include <sys/fs/zfs.h>
+#include <sys/policy.h>
+#include <sys/zfs_znode.h>
+#include <sys/zfs_fuid.h>
+#include <sys/zfs_acl.h>
+#include <sys/zfs_dir.h>
+#include <sys/zfs_quota.h>
+#include <sys/zfs_vfsops.h>
+#include <sys/dmu.h>
+#include <sys/dnode.h>
+#include <sys/zap.h>
+#include <sys/sa.h>
+#include <sys/trace_acl.h>
+#include <sys/zpl.h>
+
+#define ALLOW ACE_ACCESS_ALLOWED_ACE_TYPE
+#define DENY ACE_ACCESS_DENIED_ACE_TYPE
+#define MAX_ACE_TYPE ACE_SYSTEM_ALARM_CALLBACK_OBJECT_ACE_TYPE
+#define MIN_ACE_TYPE ALLOW
+
+#define OWNING_GROUP (ACE_GROUP|ACE_IDENTIFIER_GROUP)
+#define EVERYONE_ALLOW_MASK (ACE_READ_ACL|ACE_READ_ATTRIBUTES | \
+ ACE_READ_NAMED_ATTRS|ACE_SYNCHRONIZE)
+#define EVERYONE_DENY_MASK (ACE_WRITE_ACL|ACE_WRITE_OWNER | \
+ ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS)
+#define OWNER_ALLOW_MASK (ACE_WRITE_ACL | ACE_WRITE_OWNER | \
+ ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS)
+
+#define ZFS_CHECKED_MASKS (ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_READ_DATA| \
+ ACE_READ_NAMED_ATTRS|ACE_WRITE_DATA|ACE_WRITE_ATTRIBUTES| \
+ ACE_WRITE_NAMED_ATTRS|ACE_APPEND_DATA|ACE_EXECUTE|ACE_WRITE_OWNER| \
+ ACE_WRITE_ACL|ACE_DELETE|ACE_DELETE_CHILD|ACE_SYNCHRONIZE)
+
+#define WRITE_MASK_DATA (ACE_WRITE_DATA|ACE_APPEND_DATA|ACE_WRITE_NAMED_ATTRS)
+#define WRITE_MASK_ATTRS (ACE_WRITE_ACL|ACE_WRITE_OWNER|ACE_WRITE_ATTRIBUTES| \
+ ACE_DELETE|ACE_DELETE_CHILD)
+#define WRITE_MASK (WRITE_MASK_DATA|WRITE_MASK_ATTRS)
+
+#define OGE_CLEAR (ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \
+ ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_EXECUTE)
+
+#define OKAY_MASK_BITS (ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \
+ ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_EXECUTE)
+
+#define ALL_INHERIT (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE | \
+ ACE_NO_PROPAGATE_INHERIT_ACE|ACE_INHERIT_ONLY_ACE|ACE_INHERITED_ACE)
+
+#define RESTRICTED_CLEAR (ACE_WRITE_ACL|ACE_WRITE_OWNER)
+
+#define V4_ACL_WIDE_FLAGS (ZFS_ACL_AUTO_INHERIT|ZFS_ACL_DEFAULTED|\
+ ZFS_ACL_PROTECTED)
+
+#define ZFS_ACL_WIDE_FLAGS (V4_ACL_WIDE_FLAGS|ZFS_ACL_TRIVIAL|ZFS_INHERIT_ACE|\
+ ZFS_ACL_OBJ_ACE)
+
+#define ALL_MODE_EXECS (S_IXUSR | S_IXGRP | S_IXOTH)
+
+#define IDMAP_WK_CREATOR_OWNER_UID 2147483648U
+
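+/*
+ * Accessor vtables for the two on-disk ACE layouts: zfs_acl_v0_ops
+ * operates on the original fixed-size zfs_oldace_t entries
+ * (ZFS_ACL_VERSION_INITIAL), while zfs_acl_fuid_ops below operates on
+ * the FUID-aware, variable-size zfs_ace_t/zfs_object_ace_t entries.
+ * Generic ACL code goes through aclp->z_ops so it never needs to know
+ * which layout it is walking.
+ */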
+static uint16_t
+zfs_ace_v0_get_type(void *acep)
+{
+ return (((zfs_oldace_t *)acep)->z_type);
+}
+
+static uint16_t
+zfs_ace_v0_get_flags(void *acep)
+{
+ return (((zfs_oldace_t *)acep)->z_flags);
+}
+
+static uint32_t
+zfs_ace_v0_get_mask(void *acep)
+{
+ return (((zfs_oldace_t *)acep)->z_access_mask);
+}
+
+static uint64_t
+zfs_ace_v0_get_who(void *acep)
+{
+ return (((zfs_oldace_t *)acep)->z_fuid);
+}
+
+static void
+zfs_ace_v0_set_type(void *acep, uint16_t type)
+{
+ ((zfs_oldace_t *)acep)->z_type = type;
+}
+
+static void
+zfs_ace_v0_set_flags(void *acep, uint16_t flags)
+{
+ ((zfs_oldace_t *)acep)->z_flags = flags;
+}
+
+static void
+zfs_ace_v0_set_mask(void *acep, uint32_t mask)
+{
+ ((zfs_oldace_t *)acep)->z_access_mask = mask;
+}
+
+static void
+zfs_ace_v0_set_who(void *acep, uint64_t who)
+{
+ ((zfs_oldace_t *)acep)->z_fuid = who;
+}
+
+/*ARGSUSED*/
+static size_t
+zfs_ace_v0_size(void *acep)
+{
+ return (sizeof (zfs_oldace_t));
+}
+
+static size_t
+zfs_ace_v0_abstract_size(void)
+{
+ return (sizeof (zfs_oldace_t));
+}
+
+static int
+zfs_ace_v0_mask_off(void)
+{
+ return (offsetof(zfs_oldace_t, z_access_mask));
+}
+
+/*ARGSUSED*/
+static int
+zfs_ace_v0_data(void *acep, void **datap)
+{
+ *datap = NULL;
+ return (0);
+}
+
+static acl_ops_t zfs_acl_v0_ops = {
+ .ace_mask_get = zfs_ace_v0_get_mask,
+ .ace_mask_set = zfs_ace_v0_set_mask,
+ .ace_flags_get = zfs_ace_v0_get_flags,
+ .ace_flags_set = zfs_ace_v0_set_flags,
+ .ace_type_get = zfs_ace_v0_get_type,
+ .ace_type_set = zfs_ace_v0_set_type,
+ .ace_who_get = zfs_ace_v0_get_who,
+ .ace_who_set = zfs_ace_v0_set_who,
+ .ace_size = zfs_ace_v0_size,
+ .ace_abstract_size = zfs_ace_v0_abstract_size,
+ .ace_mask_off = zfs_ace_v0_mask_off,
+ .ace_data = zfs_ace_v0_data
+};
+
+static uint16_t
+zfs_ace_fuid_get_type(void *acep)
+{
+ return (((zfs_ace_hdr_t *)acep)->z_type);
+}
+
+static uint16_t
+zfs_ace_fuid_get_flags(void *acep)
+{
+ return (((zfs_ace_hdr_t *)acep)->z_flags);
+}
+
+static uint32_t
+zfs_ace_fuid_get_mask(void *acep)
+{
+ return (((zfs_ace_hdr_t *)acep)->z_access_mask);
+}
+
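+/*
+ * owner@, group@, and everyone@ entries are abstract and carry no FUID,
+ * so report -1 for them; all other entries return the stored FUID.
+ */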
+static uint64_t
+zfs_ace_fuid_get_who(void *args)
+{
+ uint16_t entry_type;
+ zfs_ace_t *acep = args;
+
+ entry_type = acep->z_hdr.z_flags & ACE_TYPE_FLAGS;
+
+ if (entry_type == ACE_OWNER || entry_type == OWNING_GROUP ||
+ entry_type == ACE_EVERYONE)
+ return (-1);
+ return (((zfs_ace_t *)acep)->z_fuid);
+}
+
+static void
+zfs_ace_fuid_set_type(void *acep, uint16_t type)
+{
+ ((zfs_ace_hdr_t *)acep)->z_type = type;
+}
+
+static void
+zfs_ace_fuid_set_flags(void *acep, uint16_t flags)
+{
+ ((zfs_ace_hdr_t *)acep)->z_flags = flags;
+}
+
+static void
+zfs_ace_fuid_set_mask(void *acep, uint32_t mask)
+{
+ ((zfs_ace_hdr_t *)acep)->z_access_mask = mask;
+}
+
+static void
+zfs_ace_fuid_set_who(void *arg, uint64_t who)
+{
+ zfs_ace_t *acep = arg;
+
+ uint16_t entry_type = acep->z_hdr.z_flags & ACE_TYPE_FLAGS;
+
+ if (entry_type == ACE_OWNER || entry_type == OWNING_GROUP ||
+ entry_type == ACE_EVERYONE)
+ return;
+ acep->z_fuid = who;
+}
+
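+/*
+ * FUID-style ACEs are variable length: object ACEs carry extra GUID
+ * data (zfs_object_ace_t), abstract owner@/group@/everyone@ ALLOW and
+ * DENY entries need only the header (zfs_ace_hdr_t), and everything
+ * else is a regular zfs_ace_t with an embedded FUID.
+ */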
+static size_t
+zfs_ace_fuid_size(void *acep)
+{
+ zfs_ace_hdr_t *zacep = acep;
+ uint16_t entry_type;
+
+ switch (zacep->z_type) {
+ case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE:
+ case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE:
+ case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE:
+ case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE:
+ return (sizeof (zfs_object_ace_t));
+ case ALLOW:
+ case DENY:
+ entry_type =
+ (((zfs_ace_hdr_t *)acep)->z_flags & ACE_TYPE_FLAGS);
+ if (entry_type == ACE_OWNER ||
+ entry_type == OWNING_GROUP ||
+ entry_type == ACE_EVERYONE)
+ return (sizeof (zfs_ace_hdr_t));
+ /*FALLTHROUGH*/
+ default:
+ return (sizeof (zfs_ace_t));
+ }
+}
+
+static size_t
+zfs_ace_fuid_abstract_size(void)
+{
+ return (sizeof (zfs_ace_hdr_t));
+}
+
+static int
+zfs_ace_fuid_mask_off(void)
+{
+ return (offsetof(zfs_ace_hdr_t, z_access_mask));
+}
+
+static int
+zfs_ace_fuid_data(void *acep, void **datap)
+{
+ zfs_ace_t *zacep = acep;
+ zfs_object_ace_t *zobjp;
+
+ switch (zacep->z_hdr.z_type) {
+ case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE:
+ case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE:
+ case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE:
+ case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE:
+ zobjp = acep;
+ *datap = (caddr_t)zobjp + sizeof (zfs_ace_t);
+ return (sizeof (zfs_object_ace_t) - sizeof (zfs_ace_t));
+ default:
+ *datap = NULL;
+ return (0);
+ }
+}
+
+static acl_ops_t zfs_acl_fuid_ops = {
+ .ace_mask_get = zfs_ace_fuid_get_mask,
+ .ace_mask_set = zfs_ace_fuid_set_mask,
+ .ace_flags_get = zfs_ace_fuid_get_flags,
+ .ace_flags_set = zfs_ace_fuid_set_flags,
+ .ace_type_get = zfs_ace_fuid_get_type,
+ .ace_type_set = zfs_ace_fuid_set_type,
+ .ace_who_get = zfs_ace_fuid_get_who,
+ .ace_who_set = zfs_ace_fuid_set_who,
+ .ace_size = zfs_ace_fuid_size,
+ .ace_abstract_size = zfs_ace_fuid_abstract_size,
+ .ace_mask_off = zfs_ace_fuid_mask_off,
+ .ace_data = zfs_ace_fuid_data
+};
+
+/*
+ * The following three functions are provided for compatibility with
+ * older ZPL versions in order to determine if the file used to have
+ * an external ACL and what version of ACL previously existed on the
+ * file. It would really be nice to not need this, sigh.
+ */
+uint64_t
+zfs_external_acl(znode_t *zp)
+{
+ zfs_acl_phys_t acl_phys;
+ int error;
+
+ if (zp->z_is_sa)
+ return (0);
+
+ /*
+ * Need to deal with a potential
+ * race where zfs_sa_upgrade could cause
+ * z_is_sa to change.
+ *
+ * If the lookup fails then the state of z_is_sa should have
+ * changed.
+ */
+
+ if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(ZTOZSB(zp)),
+ &acl_phys, sizeof (acl_phys))) == 0)
+ return (acl_phys.z_acl_extern_obj);
+ else {
+ /*
+ * After upgrade the SA_ZPL_ZNODE_ACL should have been
+ * removed.
+ */
+ VERIFY(zp->z_is_sa && error == ENOENT);
+ return (0);
+ }
+}
+
+/*
+ * Determine size of ACL in bytes
+ *
+ * This is more complicated than it should be since we have to deal
+ * with old external ACLs.
+ */
+static int
+zfs_acl_znode_info(znode_t *zp, int *aclsize, int *aclcount,
+ zfs_acl_phys_t *aclphys)
+{
+ zfsvfs_t *zfsvfs = ZTOZSB(zp);
+ uint64_t acl_count;
+ int size;
+ int error;
+
+ ASSERT(MUTEX_HELD(&zp->z_acl_lock));
+ if (zp->z_is_sa) {
+ if ((error = sa_size(zp->z_sa_hdl, SA_ZPL_DACL_ACES(zfsvfs),
+ &size)) != 0)
+ return (error);
+ *aclsize = size;
+ if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_DACL_COUNT(zfsvfs),
+ &acl_count, sizeof (acl_count))) != 0)
+ return (error);
+ *aclcount = acl_count;
+ } else {
+ if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(zfsvfs),
+ aclphys, sizeof (*aclphys))) != 0)
+ return (error);
+
+ if (aclphys->z_acl_version == ZFS_ACL_VERSION_INITIAL) {
+ *aclsize = ZFS_ACL_SIZE(aclphys->z_acl_size);
+ *aclcount = aclphys->z_acl_size;
+ } else {
+ *aclsize = aclphys->z_acl_size;
+ *aclcount = aclphys->z_acl_count;
+ }
+ }
+ return (0);
+}
+
+int
+zfs_znode_acl_version(znode_t *zp)
+{
+ zfs_acl_phys_t acl_phys;
+
+ if (zp->z_is_sa)
+ return (ZFS_ACL_VERSION_FUID);
+ else {
+ int error;
+
+ /*
+ * Need to deal with a potential
+ * race where zfs_sa_upgrade could cause
+ * z_is_sa to change.
+ *
+ * If the lookup fails then the state of z_is_sa should have
+ * changed.
+ */
+ if ((error = sa_lookup(zp->z_sa_hdl,
+ SA_ZPL_ZNODE_ACL(ZTOZSB(zp)),
+ &acl_phys, sizeof (acl_phys))) == 0)
+ return (acl_phys.z_acl_version);
+ else {
+ /*
+ * After upgrade SA_ZPL_ZNODE_ACL should have
+ * been removed.
+ */
+ VERIFY(zp->z_is_sa && error == ENOENT);
+ return (ZFS_ACL_VERSION_FUID);
+ }
+ }
+}
+
+static int
+zfs_acl_version(int version)
+{
+ if (version < ZPL_VERSION_FUID)
+ return (ZFS_ACL_VERSION_INITIAL);
+ else
+ return (ZFS_ACL_VERSION_FUID);
+}
+
+static int
+zfs_acl_version_zp(znode_t *zp)
+{
+ return (zfs_acl_version(ZTOZSB(zp)->z_version));
+}
+
+zfs_acl_t *
+zfs_acl_alloc(int vers)
+{
+ zfs_acl_t *aclp;
+
+ aclp = kmem_zalloc(sizeof (zfs_acl_t), KM_SLEEP);
+ list_create(&aclp->z_acl, sizeof (zfs_acl_node_t),
+ offsetof(zfs_acl_node_t, z_next));
+ aclp->z_version = vers;
+ if (vers == ZFS_ACL_VERSION_FUID)
+ aclp->z_ops = &zfs_acl_fuid_ops;
+ else
+ aclp->z_ops = &zfs_acl_v0_ops;
+ return (aclp);
+}
+
+zfs_acl_node_t *
+zfs_acl_node_alloc(size_t bytes)
+{
+ zfs_acl_node_t *aclnode;
+
+ aclnode = kmem_zalloc(sizeof (zfs_acl_node_t), KM_SLEEP);
+ if (bytes) {
+ aclnode->z_acldata = kmem_alloc(bytes, KM_SLEEP);
+ aclnode->z_allocdata = aclnode->z_acldata;
+ aclnode->z_allocsize = bytes;
+ aclnode->z_size = bytes;
+ }
+
+ return (aclnode);
+}
+
+static void
+zfs_acl_node_free(zfs_acl_node_t *aclnode)
+{
+ if (aclnode->z_allocsize)
+ kmem_free(aclnode->z_allocdata, aclnode->z_allocsize);
+ kmem_free(aclnode, sizeof (zfs_acl_node_t));
+}
+
+static void
+zfs_acl_release_nodes(zfs_acl_t *aclp)
+{
+ zfs_acl_node_t *aclnode;
+
+ while ((aclnode = list_head(&aclp->z_acl))) {
+ list_remove(&aclp->z_acl, aclnode);
+ zfs_acl_node_free(aclnode);
+ }
+ aclp->z_acl_count = 0;
+ aclp->z_acl_bytes = 0;
+}
+
+void
+zfs_acl_free(zfs_acl_t *aclp)
+{
+ zfs_acl_release_nodes(aclp);
+ list_destroy(&aclp->z_acl);
+ kmem_free(aclp, sizeof (zfs_acl_t));
+}
+
+static boolean_t
+zfs_acl_valid_ace_type(uint_t type, uint_t flags)
+{
+ uint16_t entry_type;
+
+ switch (type) {
+ case ALLOW:
+ case DENY:
+ case ACE_SYSTEM_AUDIT_ACE_TYPE:
+ case ACE_SYSTEM_ALARM_ACE_TYPE:
+ entry_type = flags & ACE_TYPE_FLAGS;
+ return (entry_type == ACE_OWNER ||
+ entry_type == OWNING_GROUP ||
+ entry_type == ACE_EVERYONE || entry_type == 0 ||
+ entry_type == ACE_IDENTIFIER_GROUP);
+ default:
+ if (type >= MIN_ACE_TYPE && type <= MAX_ACE_TYPE)
+ return (B_TRUE);
+ }
+ return (B_FALSE);
+}
+
+static boolean_t
+zfs_ace_valid(umode_t obj_mode, zfs_acl_t *aclp, uint16_t type, uint16_t iflags)
+{
+ /*
+ * first check type of entry
+ */
+
+ if (!zfs_acl_valid_ace_type(type, iflags))
+ return (B_FALSE);
+
+ switch (type) {
+ case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE:
+ case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE:
+ case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE:
+ case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE:
+ if (aclp->z_version < ZFS_ACL_VERSION_FUID)
+ return (B_FALSE);
+ aclp->z_hints |= ZFS_ACL_OBJ_ACE;
+ }
+
+ /*
+ * next check inheritance level flags
+ */
+
+ if (S_ISDIR(obj_mode) &&
+ (iflags & (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE)))
+ aclp->z_hints |= ZFS_INHERIT_ACE;
+
+ if (iflags & (ACE_INHERIT_ONLY_ACE|ACE_NO_PROPAGATE_INHERIT_ACE)) {
+ if ((iflags & (ACE_FILE_INHERIT_ACE|
+ ACE_DIRECTORY_INHERIT_ACE)) == 0) {
+ return (B_FALSE);
+ }
+ }
+
+ return (B_TRUE);
+}
+
+static void *
+zfs_acl_next_ace(zfs_acl_t *aclp, void *start, uint64_t *who,
+ uint32_t *access_mask, uint16_t *iflags, uint16_t *type)
+{
+ zfs_acl_node_t *aclnode;
+
+ ASSERT(aclp);
+
+ if (start == NULL) {
+ aclnode = list_head(&aclp->z_acl);
+ if (aclnode == NULL)
+ return (NULL);
+
+ aclp->z_next_ace = aclnode->z_acldata;
+ aclp->z_curr_node = aclnode;
+ aclnode->z_ace_idx = 0;
+ }
+
+ aclnode = aclp->z_curr_node;
+
+ if (aclnode == NULL)
+ return (NULL);
+
+ if (aclnode->z_ace_idx >= aclnode->z_ace_count) {
+ aclnode = list_next(&aclp->z_acl, aclnode);
+ if (aclnode == NULL)
+ return (NULL);
+ else {
+ aclp->z_curr_node = aclnode;
+ aclnode->z_ace_idx = 0;
+ aclp->z_next_ace = aclnode->z_acldata;
+ }
+ }
+
+ if (aclnode->z_ace_idx < aclnode->z_ace_count) {
+ void *acep = aclp->z_next_ace;
+ size_t ace_size;
+
+ /*
+ * Make sure we don't overstep our bounds
+ */
+ ace_size = aclp->z_ops->ace_size(acep);
+
+ if (((caddr_t)acep + ace_size) >
+ ((caddr_t)aclnode->z_acldata + aclnode->z_size)) {
+ return (NULL);
+ }
+
+ *iflags = aclp->z_ops->ace_flags_get(acep);
+ *type = aclp->z_ops->ace_type_get(acep);
+ *access_mask = aclp->z_ops->ace_mask_get(acep);
+ *who = aclp->z_ops->ace_who_get(acep);
+ aclp->z_next_ace = (caddr_t)aclp->z_next_ace + ace_size;
+ aclnode->z_ace_idx++;
+
+ return ((void *)acep);
+ }
+ return (NULL);
+}
+
+/*ARGSUSED*/
+static uint64_t
+zfs_ace_walk(void *datap, uint64_t cookie, int aclcnt,
+ uint16_t *flags, uint16_t *type, uint32_t *mask)
+{
+ zfs_acl_t *aclp = datap;
+ zfs_ace_hdr_t *acep = (zfs_ace_hdr_t *)(uintptr_t)cookie;
+ uint64_t who;
+
+ acep = zfs_acl_next_ace(aclp, acep, &who, mask,
+ flags, type);
+ return ((uint64_t)(uintptr_t)acep);
+}
+
+/*
+ * Copy ACEs to internal ZFS format.
+ * While processing the ACL, each ACE will be validated for correctness.
+ * ACE FUIDs will be created later.
+ */
+static int
+zfs_copy_ace_2_fuid(zfsvfs_t *zfsvfs, umode_t obj_mode, zfs_acl_t *aclp,
+ void *datap, zfs_ace_t *z_acl, uint64_t aclcnt, size_t *size,
+ zfs_fuid_info_t **fuidp, cred_t *cr)
+{
+ int i;
+ uint16_t entry_type;
+ zfs_ace_t *aceptr = z_acl;
+ ace_t *acep = datap;
+ zfs_object_ace_t *zobjacep;
+ ace_object_t *aceobjp;
+
+ for (i = 0; i != aclcnt; i++) {
+ aceptr->z_hdr.z_access_mask = acep->a_access_mask;
+ aceptr->z_hdr.z_flags = acep->a_flags;
+ aceptr->z_hdr.z_type = acep->a_type;
+ entry_type = aceptr->z_hdr.z_flags & ACE_TYPE_FLAGS;
+ if (entry_type != ACE_OWNER && entry_type != OWNING_GROUP &&
+ entry_type != ACE_EVERYONE) {
+ aceptr->z_fuid = zfs_fuid_create(zfsvfs, acep->a_who,
+ cr, (entry_type == 0) ?
+ ZFS_ACE_USER : ZFS_ACE_GROUP, fuidp);
+ }
+
+ /*
+ * Make sure ACE is valid
+ */
+ if (zfs_ace_valid(obj_mode, aclp, aceptr->z_hdr.z_type,
+ aceptr->z_hdr.z_flags) != B_TRUE)
+ return (SET_ERROR(EINVAL));
+
+ switch (acep->a_type) {
+ case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE:
+ case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE:
+ case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE:
+ case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE:
+ zobjacep = (zfs_object_ace_t *)aceptr;
+ aceobjp = (ace_object_t *)acep;
+
+ bcopy(aceobjp->a_obj_type, zobjacep->z_object_type,
+ sizeof (aceobjp->a_obj_type));
+ bcopy(aceobjp->a_inherit_obj_type,
+ zobjacep->z_inherit_type,
+ sizeof (aceobjp->a_inherit_obj_type));
+ acep = (ace_t *)((caddr_t)acep + sizeof (ace_object_t));
+ break;
+ default:
+ acep = (ace_t *)((caddr_t)acep + sizeof (ace_t));
+ }
+
+ aceptr = (zfs_ace_t *)((caddr_t)aceptr +
+ aclp->z_ops->ace_size(aceptr));
+ }
+
+ *size = (caddr_t)aceptr - (caddr_t)z_acl;
+
+ return (0);
+}
+
+/*
+ * Copy ZFS ACEs to fixed size ace_t layout
+ */
+static void
+zfs_copy_fuid_2_ace(zfsvfs_t *zfsvfs, zfs_acl_t *aclp, cred_t *cr,
+ void *datap, int filter)
+{
+ uint64_t who;
+ uint32_t access_mask;
+ uint16_t iflags, type;
+ zfs_ace_hdr_t *zacep = NULL;
+ ace_t *acep = datap;
+ ace_object_t *objacep;
+ zfs_object_ace_t *zobjacep;
+ size_t ace_size;
+ uint16_t entry_type;
+
+ while ((zacep = zfs_acl_next_ace(aclp, zacep,
+ &who, &access_mask, &iflags, &type))) {
+
+ switch (type) {
+ case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE:
+ case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE:
+ case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE:
+ case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE:
+ if (filter) {
+ continue;
+ }
+ zobjacep = (zfs_object_ace_t *)zacep;
+ objacep = (ace_object_t *)acep;
+ bcopy(zobjacep->z_object_type,
+ objacep->a_obj_type,
+ sizeof (zobjacep->z_object_type));
+ bcopy(zobjacep->z_inherit_type,
+ objacep->a_inherit_obj_type,
+ sizeof (zobjacep->z_inherit_type));
+ ace_size = sizeof (ace_object_t);
+ break;
+ default:
+ ace_size = sizeof (ace_t);
+ break;
+ }
+
+ entry_type = (iflags & ACE_TYPE_FLAGS);
+ if ((entry_type != ACE_OWNER &&
+ entry_type != OWNING_GROUP &&
+ entry_type != ACE_EVERYONE)) {
+ acep->a_who = zfs_fuid_map_id(zfsvfs, who,
+ cr, (entry_type & ACE_IDENTIFIER_GROUP) ?
+ ZFS_ACE_GROUP : ZFS_ACE_USER);
+ } else {
+ acep->a_who = (uid_t)(int64_t)who;
+ }
+ acep->a_access_mask = access_mask;
+ acep->a_flags = iflags;
+ acep->a_type = type;
+ acep = (ace_t *)((caddr_t)acep + ace_size);
+ }
+}
+
+static int
+zfs_copy_ace_2_oldace(umode_t obj_mode, zfs_acl_t *aclp, ace_t *acep,
+ zfs_oldace_t *z_acl, int aclcnt, size_t *size)
+{
+ int i;
+ zfs_oldace_t *aceptr = z_acl;
+
+ for (i = 0; i != aclcnt; i++, aceptr++) {
+ aceptr->z_access_mask = acep[i].a_access_mask;
+ aceptr->z_type = acep[i].a_type;
+ aceptr->z_flags = acep[i].a_flags;
+ aceptr->z_fuid = acep[i].a_who;
+ /*
+ * Make sure ACE is valid
+ */
+ if (zfs_ace_valid(obj_mode, aclp, aceptr->z_type,
+ aceptr->z_flags) != B_TRUE)
+ return (SET_ERROR(EINVAL));
+ }
+ *size = (caddr_t)aceptr - (caddr_t)z_acl;
+ return (0);
+}
+
+/*
+ * convert old ACL format to new
+ */
+void
+zfs_acl_xform(znode_t *zp, zfs_acl_t *aclp, cred_t *cr)
+{
+ zfs_oldace_t *oldaclp;
+ int i;
+ uint16_t type, iflags;
+ uint32_t access_mask;
+ uint64_t who;
+ void *cookie = NULL;
+ zfs_acl_node_t *newaclnode;
+
+ ASSERT(aclp->z_version == ZFS_ACL_VERSION_INITIAL);
+ /*
+ * First copy the ACEs into a contiguous piece of memory
+ * for zfs_copy_ace_2_fuid().
+ *
+ * We only convert an ACL once, so this won't happen
+ * every time.
+ */
+ oldaclp = kmem_alloc(sizeof (zfs_oldace_t) * aclp->z_acl_count,
+ KM_SLEEP);
+ i = 0;
+ while ((cookie = zfs_acl_next_ace(aclp, cookie, &who,
+ &access_mask, &iflags, &type))) {
+ oldaclp[i].z_flags = iflags;
+ oldaclp[i].z_type = type;
+ oldaclp[i].z_fuid = who;
+ oldaclp[i++].z_access_mask = access_mask;
+ }
+
+ newaclnode = zfs_acl_node_alloc(aclp->z_acl_count *
+ sizeof (zfs_object_ace_t));
+ aclp->z_ops = &zfs_acl_fuid_ops;
+ VERIFY(zfs_copy_ace_2_fuid(ZTOZSB(zp), ZTOI(zp)->i_mode,
+ aclp, oldaclp, newaclnode->z_acldata, aclp->z_acl_count,
+ &newaclnode->z_size, NULL, cr) == 0);
+ newaclnode->z_ace_count = aclp->z_acl_count;
+ aclp->z_version = ZFS_ACL_VERSION;
+ kmem_free(oldaclp, aclp->z_acl_count * sizeof (zfs_oldace_t));
+
+ /*
+ * Release all previous ACL nodes
+ */
+
+ zfs_acl_release_nodes(aclp);
+
+ list_insert_head(&aclp->z_acl, newaclnode);
+
+ aclp->z_acl_bytes = newaclnode->z_size;
+ aclp->z_acl_count = newaclnode->z_ace_count;
+
+}
+
+/*
+ * Convert unix access mask to v4 access mask
+ */
+static uint32_t
+zfs_unix_to_v4(uint32_t access_mask)
+{
+ uint32_t new_mask = 0;
+
+ if (access_mask & S_IXOTH)
+ new_mask |= ACE_EXECUTE;
+ if (access_mask & S_IWOTH)
+ new_mask |= ACE_WRITE_DATA;
+ if (access_mask & S_IROTH)
+ new_mask |= ACE_READ_DATA;
+ return (new_mask);
+}
+
+static void
+zfs_set_ace(zfs_acl_t *aclp, void *acep, uint32_t access_mask,
+ uint16_t access_type, uint64_t fuid, uint16_t entry_type)
+{
+ uint16_t type = entry_type & ACE_TYPE_FLAGS;
+
+ aclp->z_ops->ace_mask_set(acep, access_mask);
+ aclp->z_ops->ace_type_set(acep, access_type);
+ aclp->z_ops->ace_flags_set(acep, entry_type);
+ if ((type != ACE_OWNER && type != OWNING_GROUP &&
+ type != ACE_EVERYONE))
+ aclp->z_ops->ace_who_set(acep, fuid);
+}
+
+/*
+ * Determine mode of file based on ACL.
+ */
+uint64_t
+zfs_mode_compute(uint64_t fmode, zfs_acl_t *aclp,
+ uint64_t *pflags, uint64_t fuid, uint64_t fgid)
+{
+ int entry_type;
+ mode_t mode;
+ mode_t seen = 0;
+ zfs_ace_hdr_t *acep = NULL;
+ uint64_t who;
+ uint16_t iflags, type;
+ uint32_t access_mask;
+ boolean_t an_exec_denied = B_FALSE;
+
+ mode = (fmode & (S_IFMT | S_ISUID | S_ISGID | S_ISVTX));
+
+ while ((acep = zfs_acl_next_ace(aclp, acep, &who,
+ &access_mask, &iflags, &type))) {
+
+ if (!zfs_acl_valid_ace_type(type, iflags))
+ continue;
+
+ entry_type = (iflags & ACE_TYPE_FLAGS);
+
+ /*
+ * Skip over any inherit_only ACEs
+ */
+ if (iflags & ACE_INHERIT_ONLY_ACE)
+ continue;
+
+ if (entry_type == ACE_OWNER || (entry_type == 0 &&
+ who == fuid)) {
+ if ((access_mask & ACE_READ_DATA) &&
+ (!(seen & S_IRUSR))) {
+ seen |= S_IRUSR;
+ if (type == ALLOW) {
+ mode |= S_IRUSR;
+ }
+ }
+ if ((access_mask & ACE_WRITE_DATA) &&
+ (!(seen & S_IWUSR))) {
+ seen |= S_IWUSR;
+ if (type == ALLOW) {
+ mode |= S_IWUSR;
+ }
+ }
+ if ((access_mask & ACE_EXECUTE) &&
+ (!(seen & S_IXUSR))) {
+ seen |= S_IXUSR;
+ if (type == ALLOW) {
+ mode |= S_IXUSR;
+ }
+ }
+ } else if (entry_type == OWNING_GROUP ||
+ (entry_type == ACE_IDENTIFIER_GROUP && who == fgid)) {
+ if ((access_mask & ACE_READ_DATA) &&
+ (!(seen & S_IRGRP))) {
+ seen |= S_IRGRP;
+ if (type == ALLOW) {
+ mode |= S_IRGRP;
+ }
+ }
+ if ((access_mask & ACE_WRITE_DATA) &&
+ (!(seen & S_IWGRP))) {
+ seen |= S_IWGRP;
+ if (type == ALLOW) {
+ mode |= S_IWGRP;
+ }
+ }
+ if ((access_mask & ACE_EXECUTE) &&
+ (!(seen & S_IXGRP))) {
+ seen |= S_IXGRP;
+ if (type == ALLOW) {
+ mode |= S_IXGRP;
+ }
+ }
+ } else if (entry_type == ACE_EVERYONE) {
+ if ((access_mask & ACE_READ_DATA)) {
+ if (!(seen & S_IRUSR)) {
+ seen |= S_IRUSR;
+ if (type == ALLOW) {
+ mode |= S_IRUSR;
+ }
+ }
+ if (!(seen & S_IRGRP)) {
+ seen |= S_IRGRP;
+ if (type == ALLOW) {
+ mode |= S_IRGRP;
+ }
+ }
+ if (!(seen & S_IROTH)) {
+ seen |= S_IROTH;
+ if (type == ALLOW) {
+ mode |= S_IROTH;
+ }
+ }
+ }
+ if ((access_mask & ACE_WRITE_DATA)) {
+ if (!(seen & S_IWUSR)) {
+ seen |= S_IWUSR;
+ if (type == ALLOW) {
+ mode |= S_IWUSR;
+ }
+ }
+ if (!(seen & S_IWGRP)) {
+ seen |= S_IWGRP;
+ if (type == ALLOW) {
+ mode |= S_IWGRP;
+ }
+ }
+ if (!(seen & S_IWOTH)) {
+ seen |= S_IWOTH;
+ if (type == ALLOW) {
+ mode |= S_IWOTH;
+ }
+ }
+ }
+ if ((access_mask & ACE_EXECUTE)) {
+ if (!(seen & S_IXUSR)) {
+ seen |= S_IXUSR;
+ if (type == ALLOW) {
+ mode |= S_IXUSR;
+ }
+ }
+ if (!(seen & S_IXGRP)) {
+ seen |= S_IXGRP;
+ if (type == ALLOW) {
+ mode |= S_IXGRP;
+ }
+ }
+ if (!(seen & S_IXOTH)) {
+ seen |= S_IXOTH;
+ if (type == ALLOW) {
+ mode |= S_IXOTH;
+ }
+ }
+ }
+ } else {
+ /*
+ * We only care if this IDENTIFIER_GROUP or USER ACE
+ * denies execute access to someone; the mode is not
+ * affected.
+ */
+ if ((access_mask & ACE_EXECUTE) && type == DENY)
+ an_exec_denied = B_TRUE;
+ }
+ }
+
+ /*
+ * Failure to allow is effectively a deny, so execute permission
+ * is denied if it was never mentioned or if we explicitly
+ * weren't allowed it.
+ */
+ if (!an_exec_denied &&
+ ((seen & ALL_MODE_EXECS) != ALL_MODE_EXECS ||
+ (mode & ALL_MODE_EXECS) != ALL_MODE_EXECS))
+ an_exec_denied = B_TRUE;
+
+ if (an_exec_denied)
+ *pflags &= ~ZFS_NO_EXECS_DENIED;
+ else
+ *pflags |= ZFS_NO_EXECS_DENIED;
+
+ return (mode);
+}
+
+/*
+ * Read an external ACL object. If the intent is to modify, always
+ * create a new ACL and leave any cached ACL in place.
+ */
+int
+zfs_acl_node_read(struct znode *zp, boolean_t have_lock, zfs_acl_t **aclpp,
+ boolean_t will_modify)
+{
+ zfs_acl_t *aclp;
+ int aclsize = 0;
+ int acl_count = 0;
+ zfs_acl_node_t *aclnode;
+ zfs_acl_phys_t znode_acl;
+ int version;
+ int error;
+ boolean_t drop_lock = B_FALSE;
+
+ ASSERT(MUTEX_HELD(&zp->z_acl_lock));
+
+ if (zp->z_acl_cached && !will_modify) {
+ *aclpp = zp->z_acl_cached;
+ return (0);
+ }
+
+ /*
+ * Close the race where the znode could be upgraded while we
+ * are trying to read its attributes.
+ *
+ * This can only happen if the file isn't already an SA
+ * znode.
+ */
+ if (!zp->z_is_sa && !have_lock) {
+ mutex_enter(&zp->z_lock);
+ drop_lock = B_TRUE;
+ }
+ version = zfs_znode_acl_version(zp);
+
+ if ((error = zfs_acl_znode_info(zp, &aclsize,
+ &acl_count, &znode_acl)) != 0) {
+ goto done;
+ }
+
+ aclp = zfs_acl_alloc(version);
+
+ aclp->z_acl_count = acl_count;
+ aclp->z_acl_bytes = aclsize;
+
+ aclnode = zfs_acl_node_alloc(aclsize);
+ aclnode->z_ace_count = aclp->z_acl_count;
+ aclnode->z_size = aclsize;
+
+ if (!zp->z_is_sa) {
+ if (znode_acl.z_acl_extern_obj) {
+ error = dmu_read(ZTOZSB(zp)->z_os,
+ znode_acl.z_acl_extern_obj, 0, aclnode->z_size,
+ aclnode->z_acldata, DMU_READ_PREFETCH);
+ } else {
+ bcopy(znode_acl.z_ace_data, aclnode->z_acldata,
+ aclnode->z_size);
+ }
+ } else {
+ error = sa_lookup(zp->z_sa_hdl, SA_ZPL_DACL_ACES(ZTOZSB(zp)),
+ aclnode->z_acldata, aclnode->z_size);
+ }
+
+ if (error != 0) {
+ zfs_acl_free(aclp);
+ zfs_acl_node_free(aclnode);
+ /* convert checksum errors into IO errors */
+ if (error == ECKSUM)
+ error = SET_ERROR(EIO);
+ goto done;
+ }
+
+ list_insert_head(&aclp->z_acl, aclnode);
+
+ *aclpp = aclp;
+ if (!will_modify)
+ zp->z_acl_cached = aclp;
+done:
+ if (drop_lock)
+ mutex_exit(&zp->z_lock);
+ return (error);
+}
+
+/*ARGSUSED*/
+void
+zfs_acl_data_locator(void **dataptr, uint32_t *length, uint32_t buflen,
+ boolean_t start, void *userdata)
+{
+ zfs_acl_locator_cb_t *cb = (zfs_acl_locator_cb_t *)userdata;
+
+ if (start) {
+ cb->cb_acl_node = list_head(&cb->cb_aclp->z_acl);
+ } else {
+ cb->cb_acl_node = list_next(&cb->cb_aclp->z_acl,
+ cb->cb_acl_node);
+ }
+ *dataptr = cb->cb_acl_node->z_acldata;
+ *length = cb->cb_acl_node->z_size;
+}
+
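+/*
+ * Recompute the cached mode from the ACL after an ownership change so
+ * that owner@/group@ entries are evaluated against the new uid/gid.
+ * POSIX-ACL datasets are skipped, and a missing ACL SA (ENOENT) is not
+ * treated as an error so chown/chgrp can still succeed.
+ */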
+int
+zfs_acl_chown_setattr(znode_t *zp)
+{
+ int error;
+ zfs_acl_t *aclp;
+
+ if (ZTOZSB(zp)->z_acl_type == ZFS_ACLTYPE_POSIX)
+ return (0);
+
+ ASSERT(MUTEX_HELD(&zp->z_lock));
+ ASSERT(MUTEX_HELD(&zp->z_acl_lock));
+
+ error = zfs_acl_node_read(zp, B_TRUE, &aclp, B_FALSE);
+ if (error == 0 && aclp->z_acl_count > 0)
+ zp->z_mode = ZTOI(zp)->i_mode =
+ zfs_mode_compute(zp->z_mode, aclp,
+ &zp->z_pflags, KUID_TO_SUID(ZTOI(zp)->i_uid),
+ KGID_TO_SGID(ZTOI(zp)->i_gid));
+
+ /*
+ * Some ZFS implementations (ZEVO) create neither a ZNODE_ACL
+ * nor a DACL_ACES SA in which case ENOENT is returned from
+ * zfs_acl_node_read() when the SA can't be located.
+ * Allow chown/chgrp to succeed in these cases rather than
+ * returning an error that makes no sense in the context of
+ * the caller.
+ */
+ if (error == ENOENT)
+ return (0);
+
+ return (error);
+}
+
+typedef struct trivial_acl {
+ uint32_t allow0; /* allow mask for bits only in owner */
+ uint32_t deny1; /* deny mask for bits not in owner */
+ uint32_t deny2; /* deny mask for bits not in group */
+ uint32_t owner; /* allow mask matching mode */
+ uint32_t group; /* allow mask matching mode */
+ uint32_t everyone; /* allow mask matching mode */
+} trivial_acl_t;
+
+static void
+acl_trivial_access_masks(mode_t mode, boolean_t isdir, trivial_acl_t *masks)
+{
+ uint32_t read_mask = ACE_READ_DATA;
+ uint32_t write_mask = ACE_WRITE_DATA|ACE_APPEND_DATA;
+ uint32_t execute_mask = ACE_EXECUTE;
+
+ if (isdir)
+ write_mask |= ACE_DELETE_CHILD;
+
+ masks->deny1 = 0;
+
+ if (!(mode & S_IRUSR) && (mode & (S_IRGRP|S_IROTH)))
+ masks->deny1 |= read_mask;
+ if (!(mode & S_IWUSR) && (mode & (S_IWGRP|S_IWOTH)))
+ masks->deny1 |= write_mask;
+ if (!(mode & S_IXUSR) && (mode & (S_IXGRP|S_IXOTH)))
+ masks->deny1 |= execute_mask;
+
+ masks->deny2 = 0;
+ if (!(mode & S_IRGRP) && (mode & S_IROTH))
+ masks->deny2 |= read_mask;
+ if (!(mode & S_IWGRP) && (mode & S_IWOTH))
+ masks->deny2 |= write_mask;
+ if (!(mode & S_IXGRP) && (mode & S_IXOTH))
+ masks->deny2 |= execute_mask;
+
+ masks->allow0 = 0;
+ if ((mode & S_IRUSR) && (!(mode & S_IRGRP) && (mode & S_IROTH)))
+ masks->allow0 |= read_mask;
+ if ((mode & S_IWUSR) && (!(mode & S_IWGRP) && (mode & S_IWOTH)))
+ masks->allow0 |= write_mask;
+ if ((mode & S_IXUSR) && (!(mode & S_IXGRP) && (mode & S_IXOTH)))
+ masks->allow0 |= execute_mask;
+
+ masks->owner = ACE_WRITE_ATTRIBUTES|ACE_WRITE_OWNER|ACE_WRITE_ACL|
+ ACE_WRITE_NAMED_ATTRS|ACE_READ_ACL|ACE_READ_ATTRIBUTES|
+ ACE_READ_NAMED_ATTRS|ACE_SYNCHRONIZE;
+ if (mode & S_IRUSR)
+ masks->owner |= read_mask;
+ if (mode & S_IWUSR)
+ masks->owner |= write_mask;
+ if (mode & S_IXUSR)
+ masks->owner |= execute_mask;
+
+ masks->group = ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_READ_NAMED_ATTRS|
+ ACE_SYNCHRONIZE;
+ if (mode & S_IRGRP)
+ masks->group |= read_mask;
+ if (mode & S_IWGRP)
+ masks->group |= write_mask;
+ if (mode & S_IXGRP)
+ masks->group |= execute_mask;
+
+ masks->everyone = ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_READ_NAMED_ATTRS|
+ ACE_SYNCHRONIZE;
+ if (mode & S_IROTH)
+ masks->everyone |= read_mask;
+ if (mode & S_IWOTH)
+ masks->everyone |= write_mask;
+ if (mode & S_IXOTH)
+ masks->everyone |= execute_mask;
+}
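+
+/*
+ * For example, a regular file with mode 0604 (rw----r--) yields
+ * allow0 = ACE_READ_DATA (explicitly allow the owner read ahead of the
+ * group deny, in case the owner is also a member of the owning group),
+ * deny1 = 0, deny2 = ACE_READ_DATA (the owning group lacks a read bit
+ * that everyone@ is granted), and owner/group/everyone allow masks
+ * that mirror the rw-, ---, and r-- mode bits on top of the baseline
+ * read_acl/read_attributes/read_named_attrs/synchronize bits (owner@
+ * additionally gets write_acl/write_owner/write_attributes/
+ * write_named_attrs).
+ */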
+
+/*
+ * ace_trivial:
+ * determine whether an ace_t ACL is trivial
+ *
+ * Trivialness implies that the ACL is composed of only
+ * owner@, group@, and everyone@ entries, that read_acl is
+ * never denied, and that write_owner/write_acl/write_attributes
+ * may only appear on the owner@ entry.
+ */
+static int
+ace_trivial_common(void *acep, int aclcnt,
+ uint64_t (*walk)(void *, uint64_t, int aclcnt,
+ uint16_t *, uint16_t *, uint32_t *))
+{
+ uint16_t flags;
+ uint32_t mask;
+ uint16_t type;
+ uint64_t cookie = 0;
+
+ while ((cookie = walk(acep, cookie, aclcnt, &flags, &type, &mask))) {
+ switch (flags & ACE_TYPE_FLAGS) {
+ case ACE_OWNER:
+ case ACE_GROUP|ACE_IDENTIFIER_GROUP:
+ case ACE_EVERYONE:
+ break;
+ default:
+ return (1);
+ }
+
+ if (flags & (ACE_FILE_INHERIT_ACE|
+ ACE_DIRECTORY_INHERIT_ACE|ACE_NO_PROPAGATE_INHERIT_ACE|
+ ACE_INHERIT_ONLY_ACE))
+ return (1);
+
+ /*
+ * Special checks for certain bits:
+ *
+ * Don't allow anybody to deny reading basic
+ * attributes or a file's ACL.
+ */
+ if ((mask & (ACE_READ_ACL|ACE_READ_ATTRIBUTES)) &&
+ (type == ACE_ACCESS_DENIED_ACE_TYPE))
+ return (1);
+
+ /*
+ * Delete permission is never set by default
+ */
+ if (mask & ACE_DELETE)
+ return (1);
+
+ /*
+ * Child delete permission should be accompanied by write
+ */
+ if ((mask & ACE_DELETE_CHILD) && !(mask & ACE_WRITE_DATA))
+ return (1);
+
+ /*
+ * Only allow owner@ to have
+ * write_acl/write_owner/write_attributes/write_xattr.
+ */
+ if (type == ACE_ACCESS_ALLOWED_ACE_TYPE &&
+ (!(flags & ACE_OWNER) && (mask &
+ (ACE_WRITE_OWNER|ACE_WRITE_ACL| ACE_WRITE_ATTRIBUTES|
+ ACE_WRITE_NAMED_ATTRS))))
+ return (1);
+
+ }
+
+ return (0);
+}
+
+/*
+ * Common code for setting ACLs.
+ *
+ * This function is called from zfs_mode_update, zfs_perm_init, and zfs_setacl.
+ * It stores the ACL (in either SA or legacy znode_acl_phys_t form) along with
+ * the recomputed mode, pflags, and ctime in the given transaction.
+ */
+int
+zfs_aclset_common(znode_t *zp, zfs_acl_t *aclp, cred_t *cr, dmu_tx_t *tx)
+{
+ int error;
+ zfsvfs_t *zfsvfs = ZTOZSB(zp);
+ dmu_object_type_t otype;
+ zfs_acl_locator_cb_t locate = { 0 };
+ uint64_t mode;
+ sa_bulk_attr_t bulk[5];
+ uint64_t ctime[2];
+ int count = 0;
+ zfs_acl_phys_t acl_phys;
+
+ mode = zp->z_mode;
+
+ mode = zfs_mode_compute(mode, aclp, &zp->z_pflags,
+ KUID_TO_SUID(ZTOI(zp)->i_uid), KGID_TO_SGID(ZTOI(zp)->i_gid));
+
+ zp->z_mode = ZTOI(zp)->i_mode = mode;
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
+ &mode, sizeof (mode));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+ &zp->z_pflags, sizeof (zp->z_pflags));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
+ &ctime, sizeof (ctime));
+
+ if (zp->z_acl_cached) {
+ zfs_acl_free(zp->z_acl_cached);
+ zp->z_acl_cached = NULL;
+ }
+
+ /*
+ * Upgrade needed?
+ */
+ if (!zfsvfs->z_use_fuids) {
+ otype = DMU_OT_OLDACL;
+ } else {
+ if ((aclp->z_version == ZFS_ACL_VERSION_INITIAL) &&
+ (zfsvfs->z_version >= ZPL_VERSION_FUID))
+ zfs_acl_xform(zp, aclp, cr);
+ ASSERT(aclp->z_version >= ZFS_ACL_VERSION_FUID);
+ otype = DMU_OT_ACL;
+ }
+
+ /*
+ * Arrgh, we have to handle the old on-disk format
+ * as well as the newer (preferred) SA format.
+ */
+
+ if (zp->z_is_sa) { /* the easy case, just update the ACL attribute */
+ locate.cb_aclp = aclp;
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_DACL_ACES(zfsvfs),
+ zfs_acl_data_locator, &locate, aclp->z_acl_bytes);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_DACL_COUNT(zfsvfs),
+ NULL, &aclp->z_acl_count, sizeof (uint64_t));
+ } else { /* Painful legacy way */
+ zfs_acl_node_t *aclnode;
+ uint64_t off = 0;
+ uint64_t aoid;
+
+ if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(zfsvfs),
+ &acl_phys, sizeof (acl_phys))) != 0)
+ return (error);
+
+ aoid = acl_phys.z_acl_extern_obj;
+
+ if (aclp->z_acl_bytes > ZFS_ACE_SPACE) {
+ /*
+ * If ACL was previously external and we are now
+ * converting to new ACL format then release old
+ * ACL object and create a new one.
+ */
+ if (aoid &&
+ aclp->z_version != acl_phys.z_acl_version) {
+ error = dmu_object_free(zfsvfs->z_os, aoid, tx);
+ if (error)
+ return (error);
+ aoid = 0;
+ }
+ if (aoid == 0) {
+ aoid = dmu_object_alloc(zfsvfs->z_os,
+ otype, aclp->z_acl_bytes,
+ otype == DMU_OT_ACL ?
+ DMU_OT_SYSACL : DMU_OT_NONE,
+ otype == DMU_OT_ACL ?
+ DN_OLD_MAX_BONUSLEN : 0, tx);
+ } else {
+ (void) dmu_object_set_blocksize(zfsvfs->z_os,
+ aoid, aclp->z_acl_bytes, 0, tx);
+ }
+ acl_phys.z_acl_extern_obj = aoid;
+ for (aclnode = list_head(&aclp->z_acl); aclnode;
+ aclnode = list_next(&aclp->z_acl, aclnode)) {
+ if (aclnode->z_ace_count == 0)
+ continue;
+ dmu_write(zfsvfs->z_os, aoid, off,
+ aclnode->z_size, aclnode->z_acldata, tx);
+ off += aclnode->z_size;
+ }
+ } else {
+ void *start = acl_phys.z_ace_data;
+ /*
+ * Migrating back embedded?
+ */
+ if (acl_phys.z_acl_extern_obj) {
+ error = dmu_object_free(zfsvfs->z_os,
+ acl_phys.z_acl_extern_obj, tx);
+ if (error)
+ return (error);
+ acl_phys.z_acl_extern_obj = 0;
+ }
+
+ for (aclnode = list_head(&aclp->z_acl); aclnode;
+ aclnode = list_next(&aclp->z_acl, aclnode)) {
+ if (aclnode->z_ace_count == 0)
+ continue;
+ bcopy(aclnode->z_acldata, start,
+ aclnode->z_size);
+ start = (caddr_t)start + aclnode->z_size;
+ }
+ }
+ /*
+ * If old version then swap count/bytes to match the old
+ * layout of znode_acl_phys_t.
+ */
+ if (aclp->z_version == ZFS_ACL_VERSION_INITIAL) {
+ acl_phys.z_acl_size = aclp->z_acl_count;
+ acl_phys.z_acl_count = aclp->z_acl_bytes;
+ } else {
+ acl_phys.z_acl_size = aclp->z_acl_bytes;
+ acl_phys.z_acl_count = aclp->z_acl_count;
+ }
+ acl_phys.z_acl_version = aclp->z_version;
+
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ZNODE_ACL(zfsvfs), NULL,
+ &acl_phys, sizeof (acl_phys));
+ }
+
+ /*
+ * Replace ACL wide bits, but first clear them.
+ */
+ zp->z_pflags &= ~ZFS_ACL_WIDE_FLAGS;
+
+ zp->z_pflags |= aclp->z_hints;
+
+ if (ace_trivial_common(aclp, 0, zfs_ace_walk) == 0)
+ zp->z_pflags |= ZFS_ACL_TRIVIAL;
+
+ zfs_tstamp_update_setup(zp, STATE_CHANGED, NULL, ctime);
+ return (sa_bulk_update(zp->z_sa_hdl, bulk, count, tx));
+}
+
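+/*
+ * Rebuild an ACL to reflect a new mode.  The canonical allow0/deny1/
+ * deny2 entries computed by acl_trivial_access_masks() are emitted
+ * first (when nonzero), the existing ACEs are then copied over
+ * (optionally splitting inheritable owner@/group@/everyone@ entries
+ * into inherit-only copies and trimming ALLOW masks to the group
+ * permissions), and the final owner@/group@/everyone@ ALLOW entries
+ * matching the mode are appended.
+ */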
+static void
+zfs_acl_chmod(boolean_t isdir, uint64_t mode, boolean_t split, boolean_t trim,
+ zfs_acl_t *aclp)
+{
+ void *acep = NULL;
+ uint64_t who;
+ int new_count, new_bytes;
+ int ace_size;
+ int entry_type;
+ uint16_t iflags, type;
+ uint32_t access_mask;
+ zfs_acl_node_t *newnode;
+ size_t abstract_size = aclp->z_ops->ace_abstract_size();
+ void *zacep;
+ trivial_acl_t masks;
+
+ new_count = new_bytes = 0;
+
+ acl_trivial_access_masks((mode_t)mode, isdir, &masks);
+
+ newnode = zfs_acl_node_alloc((abstract_size * 6) + aclp->z_acl_bytes);
+
+ zacep = newnode->z_acldata;
+ if (masks.allow0) {
+ zfs_set_ace(aclp, zacep, masks.allow0, ALLOW, -1, ACE_OWNER);
+ zacep = (void *)((uintptr_t)zacep + abstract_size);
+ new_count++;
+ new_bytes += abstract_size;
+ }
+ if (masks.deny1) {
+ zfs_set_ace(aclp, zacep, masks.deny1, DENY, -1, ACE_OWNER);
+ zacep = (void *)((uintptr_t)zacep + abstract_size);
+ new_count++;
+ new_bytes += abstract_size;
+ }
+ if (masks.deny2) {
+ zfs_set_ace(aclp, zacep, masks.deny2, DENY, -1, OWNING_GROUP);
+ zacep = (void *)((uintptr_t)zacep + abstract_size);
+ new_count++;
+ new_bytes += abstract_size;
+ }
+
+ while ((acep = zfs_acl_next_ace(aclp, acep, &who, &access_mask,
+ &iflags, &type))) {
+ entry_type = (iflags & ACE_TYPE_FLAGS);
+ /*
+ * ACEs used to represent the file mode may be divided
+ * into an equivalent pair of inherit-only and regular
+ * ACEs, if they are inheritable.
+ * Skip regular ACEs, which are replaced by the new mode.
+ */
+ if (split && (entry_type == ACE_OWNER ||
+ entry_type == OWNING_GROUP ||
+ entry_type == ACE_EVERYONE)) {
+ if (!isdir || !(iflags &
+ (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE)))
+ continue;
+ /*
+ * We preserve owner@, group@, or everyone@
+ * permissions, if they are inheritable, by
+ * copying them to inherit_only ACEs. This
+ * prevents inheritable permissions from being
+ * altered along with the file mode.
+ */
+ iflags |= ACE_INHERIT_ONLY_ACE;
+ }
+
+ /*
+ * If this ACL has any inheritable ACEs, mark that in
+ * the hints (which are later masked into the pflags)
+ * so create knows to do inheritance.
+ */
+ if (isdir && (iflags &
+ (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE)))
+ aclp->z_hints |= ZFS_INHERIT_ACE;
+
+ if ((type != ALLOW && type != DENY) ||
+ (iflags & ACE_INHERIT_ONLY_ACE)) {
+ switch (type) {
+ case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE:
+ case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE:
+ case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE:
+ case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE:
+ aclp->z_hints |= ZFS_ACL_OBJ_ACE;
+ break;
+ }
+ } else {
+ /*
+ * Limit permissions to be no greater than
+ * group permissions.
+ * The "aclinherit" and "aclmode" properties
+ * affect policy for create and chmod(2),
+ * respectively.
+ */
+ if ((type == ALLOW) && trim)
+ access_mask &= masks.group;
+ }
+ zfs_set_ace(aclp, zacep, access_mask, type, who, iflags);
+ ace_size = aclp->z_ops->ace_size(acep);
+ zacep = (void *)((uintptr_t)zacep + ace_size);
+ new_count++;
+ new_bytes += ace_size;
+ }
+ zfs_set_ace(aclp, zacep, masks.owner, ALLOW, -1, ACE_OWNER);
+ zacep = (void *)((uintptr_t)zacep + abstract_size);
+ zfs_set_ace(aclp, zacep, masks.group, ALLOW, -1, OWNING_GROUP);
+ zacep = (void *)((uintptr_t)zacep + abstract_size);
+ zfs_set_ace(aclp, zacep, masks.everyone, ALLOW, -1, ACE_EVERYONE);
+
+ new_count += 3;
+ new_bytes += abstract_size * 3;
+ zfs_acl_release_nodes(aclp);
+ aclp->z_acl_count = new_count;
+ aclp->z_acl_bytes = new_bytes;
+ newnode->z_ace_count = new_count;
+ newnode->z_size = new_bytes;
+ list_insert_tail(&aclp->z_acl, newnode);
+}
+
+int
+zfs_acl_chmod_setattr(znode_t *zp, zfs_acl_t **aclp, uint64_t mode)
+{
+ int error = 0;
+
+ mutex_enter(&zp->z_acl_lock);
+ mutex_enter(&zp->z_lock);
+ if (ZTOZSB(zp)->z_acl_mode == ZFS_ACL_DISCARD)
+ *aclp = zfs_acl_alloc(zfs_acl_version_zp(zp));
+ else
+ error = zfs_acl_node_read(zp, B_TRUE, aclp, B_TRUE);
+
+ if (error == 0) {
+ (*aclp)->z_hints = zp->z_pflags & V4_ACL_WIDE_FLAGS;
+ zfs_acl_chmod(S_ISDIR(ZTOI(zp)->i_mode), mode, B_TRUE,
+ (ZTOZSB(zp)->z_acl_mode == ZFS_ACL_GROUPMASK), *aclp);
+ }
+ mutex_exit(&zp->z_lock);
+ mutex_exit(&zp->z_acl_lock);
+
+ return (error);
+}
+
+/*
+ * Should ACE be inherited?
+ */
+static int
+zfs_ace_can_use(umode_t obj_mode, uint16_t acep_flags)
+{
+ int iflags = (acep_flags & 0xf);
+
+ if (S_ISDIR(obj_mode) && (iflags & ACE_DIRECTORY_INHERIT_ACE))
+ return (1);
+ else if (iflags & ACE_FILE_INHERIT_ACE)
+ return (!(S_ISDIR(obj_mode) &&
+ (iflags & ACE_NO_PROPAGATE_INHERIT_ACE)));
+ return (0);
+}
+
+/*
+ * inherit inheritable ACEs from parent
+ */
+static zfs_acl_t *
+zfs_acl_inherit(zfsvfs_t *zfsvfs, umode_t va_mode, zfs_acl_t *paclp,
+ uint64_t mode, boolean_t *need_chmod)
+{
+ void *pacep = NULL;
+ void *acep;
+ zfs_acl_node_t *aclnode;
+ zfs_acl_t *aclp = NULL;
+ uint64_t who;
+ uint32_t access_mask;
+ uint16_t iflags, newflags, type;
+ size_t ace_size;
+ void *data1, *data2;
+ size_t data1sz, data2sz;
+ uint_t aclinherit;
+ boolean_t isdir = S_ISDIR(va_mode);
+ boolean_t isreg = S_ISREG(va_mode);
+
+ *need_chmod = B_TRUE;
+
+ aclp = zfs_acl_alloc(paclp->z_version);
+ aclinherit = zfsvfs->z_acl_inherit;
+ if (aclinherit == ZFS_ACL_DISCARD || S_ISLNK(va_mode))
+ return (aclp);
+
+ while ((pacep = zfs_acl_next_ace(paclp, pacep, &who,
+ &access_mask, &iflags, &type))) {
+
+ /*
+ * don't inherit bogus ACEs
+ */
+ if (!zfs_acl_valid_ace_type(type, iflags))
+ continue;
+
+ /*
+ * Check if ACE is inheritable by this vnode
+ */
+ if ((aclinherit == ZFS_ACL_NOALLOW && type == ALLOW) ||
+ !zfs_ace_can_use(va_mode, iflags))
+ continue;
+
+ /*
+ * If owner@, group@, or everyone@ ACEs are inheritable
+ * then zfs_acl_chmod() isn't needed.
+ */
+ if ((aclinherit == ZFS_ACL_PASSTHROUGH ||
+ aclinherit == ZFS_ACL_PASSTHROUGH_X) &&
+ ((iflags & (ACE_OWNER|ACE_EVERYONE)) ||
+ ((iflags & OWNING_GROUP) == OWNING_GROUP)) &&
+ (isreg || (isdir && (iflags & ACE_DIRECTORY_INHERIT_ACE))))
+ *need_chmod = B_FALSE;
+
+ /*
+ * Strip inherited execute permission from file if
+ * not in mode
+ */
+ if (aclinherit == ZFS_ACL_PASSTHROUGH_X && type == ALLOW &&
+ !isdir && ((mode & (S_IXUSR|S_IXGRP|S_IXOTH)) == 0)) {
+ access_mask &= ~ACE_EXECUTE;
+ }
+
+ /*
+ * Strip write_acl and write_owner from permissions
+ * when inheriting an ACE
+ */
+ if (aclinherit == ZFS_ACL_RESTRICTED && type == ALLOW) {
+ access_mask &= ~RESTRICTED_CLEAR;
+ }
+
+ ace_size = aclp->z_ops->ace_size(pacep);
+ aclnode = zfs_acl_node_alloc(ace_size);
+ list_insert_tail(&aclp->z_acl, aclnode);
+ acep = aclnode->z_acldata;
+
+ zfs_set_ace(aclp, acep, access_mask, type,
+ who, iflags|ACE_INHERITED_ACE);
+
+ /*
+ * Copy special opaque data if any
+ */
+ if ((data1sz = paclp->z_ops->ace_data(pacep, &data1)) != 0) {
+ VERIFY((data2sz = aclp->z_ops->ace_data(acep,
+ &data2)) == data1sz);
+ bcopy(data1, data2, data2sz);
+ }
+
+ aclp->z_acl_count++;
+ aclnode->z_ace_count++;
+ aclp->z_acl_bytes += aclnode->z_size;
+ newflags = aclp->z_ops->ace_flags_get(acep);
+
+ /*
+ * If ACE is not to be inherited further, or if the vnode is
+ * not a directory, remove all inheritance flags
+ */
+ if (!isdir || (iflags & ACE_NO_PROPAGATE_INHERIT_ACE)) {
+ newflags &= ~ALL_INHERIT;
+ aclp->z_ops->ace_flags_set(acep,
+ newflags|ACE_INHERITED_ACE);
+ continue;
+ }
+
+ /*
+ * This directory has an inheritable ACE
+ */
+ aclp->z_hints |= ZFS_INHERIT_ACE;
+
+ /*
+ * If only FILE_INHERIT is set then turn on
+ * inherit_only
+ */
+ if ((iflags & (ACE_FILE_INHERIT_ACE |
+ ACE_DIRECTORY_INHERIT_ACE)) == ACE_FILE_INHERIT_ACE) {
+ newflags |= ACE_INHERIT_ONLY_ACE;
+ aclp->z_ops->ace_flags_set(acep,
+ newflags|ACE_INHERITED_ACE);
+ } else {
+ newflags &= ~ACE_INHERIT_ONLY_ACE;
+ aclp->z_ops->ace_flags_set(acep,
+ newflags|ACE_INHERITED_ACE);
+ }
+ }
+ if (zfsvfs->z_acl_mode == ZFS_ACL_RESTRICTED &&
+ aclp->z_acl_count != 0) {
+ *need_chmod = B_FALSE;
+ }
+
+ return (aclp);
+}
+
+/*
+ * Create file system object initial permissions
+ * including inheritable ACEs.
+ * Also, create FUIDs for owner and group.
+ */
+int
+zfs_acl_ids_create(znode_t *dzp, int flag, vattr_t *vap, cred_t *cr,
+ vsecattr_t *vsecp, zfs_acl_ids_t *acl_ids)
+{
+ int error;
+ zfsvfs_t *zfsvfs = ZTOZSB(dzp);
+ zfs_acl_t *paclp;
+ gid_t gid = vap->va_gid;
+ boolean_t need_chmod = B_TRUE;
+ boolean_t trim = B_FALSE;
+ boolean_t inherited = B_FALSE;
+
+ bzero(acl_ids, sizeof (zfs_acl_ids_t));
+ acl_ids->z_mode = vap->va_mode;
+
+ if (vsecp)
+ if ((error = zfs_vsec_2_aclp(zfsvfs, vap->va_mode, vsecp,
+ cr, &acl_ids->z_fuidp, &acl_ids->z_aclp)) != 0)
+ return (error);
+
+ acl_ids->z_fuid = vap->va_uid;
+ acl_ids->z_fgid = vap->va_gid;
+#ifdef HAVE_KSID
+ /*
+ * Determine uid and gid.
+ */
+ if ((flag & IS_ROOT_NODE) || zfsvfs->z_replay ||
+ ((flag & IS_XATTR) && (S_ISDIR(vap->va_mode)))) {
+ acl_ids->z_fuid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_uid,
+ cr, ZFS_OWNER, &acl_ids->z_fuidp);
+ acl_ids->z_fgid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid,
+ cr, ZFS_GROUP, &acl_ids->z_fuidp);
+ gid = vap->va_gid;
+ } else {
+ acl_ids->z_fuid = zfs_fuid_create_cred(zfsvfs, ZFS_OWNER,
+ cr, &acl_ids->z_fuidp);
+ acl_ids->z_fgid = 0;
+ if (vap->va_mask & AT_GID) {
+ acl_ids->z_fgid = zfs_fuid_create(zfsvfs,
+ (uint64_t)vap->va_gid,
+ cr, ZFS_GROUP, &acl_ids->z_fuidp);
+ gid = vap->va_gid;
+ if (acl_ids->z_fgid != KGID_TO_SGID(ZTOI(dzp)->i_gid) &&
+ !groupmember(vap->va_gid, cr) &&
+ secpolicy_vnode_create_gid(cr) != 0)
+ acl_ids->z_fgid = 0;
+ }
+ if (acl_ids->z_fgid == 0) {
+ if (dzp->z_mode & S_ISGID) {
+ char *domain;
+ uint32_t rid;
+
+ acl_ids->z_fgid = KGID_TO_SGID(
+ ZTOI(dzp)->i_gid);
+ gid = zfs_fuid_map_id(zfsvfs, acl_ids->z_fgid,
+ cr, ZFS_GROUP);
+
+ if (zfsvfs->z_use_fuids &&
+ IS_EPHEMERAL(acl_ids->z_fgid)) {
+ domain = zfs_fuid_idx_domain(
+ &zfsvfs->z_fuid_idx,
+ FUID_INDEX(acl_ids->z_fgid));
+ rid = FUID_RID(acl_ids->z_fgid);
+ zfs_fuid_node_add(&acl_ids->z_fuidp,
+ domain, rid,
+ FUID_INDEX(acl_ids->z_fgid),
+ acl_ids->z_fgid, ZFS_GROUP);
+ }
+ } else {
+ acl_ids->z_fgid = zfs_fuid_create_cred(zfsvfs,
+ ZFS_GROUP, cr, &acl_ids->z_fuidp);
+ gid = crgetgid(cr);
+ }
+ }
+ }
+#endif /* HAVE_KSID */
+
+ /*
+ * If we're creating a directory, and the parent directory has the
+ * set-GID bit set, set it on the new directory.
+ * Otherwise, if the user is neither privileged nor a member of the
+ * file's new group, clear the file's set-GID bit.
+ */
+
+ if (!(flag & IS_ROOT_NODE) && (dzp->z_mode & S_ISGID) &&
+ (S_ISDIR(vap->va_mode))) {
+ acl_ids->z_mode |= S_ISGID;
+ } else {
+ if ((acl_ids->z_mode & S_ISGID) &&
+ secpolicy_vnode_setids_setgids(cr, gid) != 0)
+ acl_ids->z_mode &= ~S_ISGID;
+ }
+
+ if (acl_ids->z_aclp == NULL) {
+ mutex_enter(&dzp->z_acl_lock);
+ mutex_enter(&dzp->z_lock);
+ if (!(flag & IS_ROOT_NODE) &&
+ (dzp->z_pflags & ZFS_INHERIT_ACE) &&
+ !(dzp->z_pflags & ZFS_XATTR)) {
+ VERIFY(0 == zfs_acl_node_read(dzp, B_TRUE,
+ &paclp, B_FALSE));
+ acl_ids->z_aclp = zfs_acl_inherit(zfsvfs,
+ vap->va_mode, paclp, acl_ids->z_mode, &need_chmod);
+ inherited = B_TRUE;
+ } else {
+ acl_ids->z_aclp =
+ zfs_acl_alloc(zfs_acl_version_zp(dzp));
+ acl_ids->z_aclp->z_hints |= ZFS_ACL_TRIVIAL;
+ }
+ mutex_exit(&dzp->z_lock);
+ mutex_exit(&dzp->z_acl_lock);
+
+ if (need_chmod) {
+ if (S_ISDIR(vap->va_mode))
+ acl_ids->z_aclp->z_hints |=
+ ZFS_ACL_AUTO_INHERIT;
+
+ if (zfsvfs->z_acl_mode == ZFS_ACL_GROUPMASK &&
+ zfsvfs->z_acl_inherit != ZFS_ACL_PASSTHROUGH &&
+ zfsvfs->z_acl_inherit != ZFS_ACL_PASSTHROUGH_X)
+ trim = B_TRUE;
+ zfs_acl_chmod(vap->va_mode, acl_ids->z_mode, B_FALSE,
+ trim, acl_ids->z_aclp);
+ }
+ }
+
+ if (inherited || vsecp) {
+ acl_ids->z_mode = zfs_mode_compute(acl_ids->z_mode,
+ acl_ids->z_aclp, &acl_ids->z_aclp->z_hints,
+ acl_ids->z_fuid, acl_ids->z_fgid);
+ if (ace_trivial_common(acl_ids->z_aclp, 0, zfs_ace_walk) == 0)
+ acl_ids->z_aclp->z_hints |= ZFS_ACL_TRIVIAL;
+ }
+
+ return (0);
+}
+
+/*
+ * Free ACL and fuid_infop, but not the acl_ids structure
+ */
+void
+zfs_acl_ids_free(zfs_acl_ids_t *acl_ids)
+{
+ if (acl_ids->z_aclp)
+ zfs_acl_free(acl_ids->z_aclp);
+ if (acl_ids->z_fuidp)
+ zfs_fuid_info_free(acl_ids->z_fuidp);
+ acl_ids->z_aclp = NULL;
+ acl_ids->z_fuidp = NULL;
+}
+
+boolean_t
+zfs_acl_ids_overquota(zfsvfs_t *zv, zfs_acl_ids_t *acl_ids, uint64_t projid)
+{
+ return (zfs_id_overquota(zv, DMU_USERUSED_OBJECT, acl_ids->z_fuid) ||
+ zfs_id_overquota(zv, DMU_GROUPUSED_OBJECT, acl_ids->z_fgid) ||
+ (projid != ZFS_DEFAULT_PROJID && projid != ZFS_INVALID_PROJID &&
+ zfs_id_overquota(zv, DMU_PROJECTUSED_OBJECT, projid)));
+}
+
+/*
+ * Retrieve a file's ACL
+ */
+int
+zfs_getacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr)
+{
+ zfs_acl_t *aclp;
+ ulong_t mask;
+ int error;
+ int count = 0;
+ int largeace = 0;
+
+ mask = vsecp->vsa_mask & (VSA_ACE | VSA_ACECNT |
+ VSA_ACE_ACLFLAGS | VSA_ACE_ALLTYPES);
+
+ if (mask == 0)
+ return (SET_ERROR(ENOSYS));
+
+ if ((error = zfs_zaccess(zp, ACE_READ_ACL, 0, skipaclchk, cr)))
+ return (error);
+
+ mutex_enter(&zp->z_acl_lock);
+
+ error = zfs_acl_node_read(zp, B_FALSE, &aclp, B_FALSE);
+ if (error != 0) {
+ mutex_exit(&zp->z_acl_lock);
+ return (error);
+ }
+
+ /*
+ * Scan ACL to determine number of ACEs
+ */
+ if ((zp->z_pflags & ZFS_ACL_OBJ_ACE) && !(mask & VSA_ACE_ALLTYPES)) {
+ void *zacep = NULL;
+ uint64_t who;
+ uint32_t access_mask;
+ uint16_t type, iflags;
+
+ while ((zacep = zfs_acl_next_ace(aclp, zacep,
+ &who, &access_mask, &iflags, &type))) {
+ switch (type) {
+ case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE:
+ case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE:
+ case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE:
+ case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE:
+ largeace++;
+ continue;
+ default:
+ count++;
+ }
+ }
+ vsecp->vsa_aclcnt = count;
+ } else
+ count = (int)aclp->z_acl_count;
+
+ if (mask & VSA_ACECNT) {
+ vsecp->vsa_aclcnt = count;
+ }
+
+ if (mask & VSA_ACE) {
+ size_t aclsz;
+
+ aclsz = count * sizeof (ace_t) +
+ sizeof (ace_object_t) * largeace;
+
+ vsecp->vsa_aclentp = kmem_alloc(aclsz, KM_SLEEP);
+ vsecp->vsa_aclentsz = aclsz;
+
+ if (aclp->z_version == ZFS_ACL_VERSION_FUID)
+ zfs_copy_fuid_2_ace(ZTOZSB(zp), aclp, cr,
+ vsecp->vsa_aclentp, !(mask & VSA_ACE_ALLTYPES));
+ else {
+ zfs_acl_node_t *aclnode;
+ void *start = vsecp->vsa_aclentp;
+
+ for (aclnode = list_head(&aclp->z_acl); aclnode;
+ aclnode = list_next(&aclp->z_acl, aclnode)) {
+ bcopy(aclnode->z_acldata, start,
+ aclnode->z_size);
+ start = (caddr_t)start + aclnode->z_size;
+ }
+ ASSERT((caddr_t)start - (caddr_t)vsecp->vsa_aclentp ==
+ aclp->z_acl_bytes);
+ }
+ }
+ if (mask & VSA_ACE_ACLFLAGS) {
+ vsecp->vsa_aclflags = 0;
+ if (zp->z_pflags & ZFS_ACL_DEFAULTED)
+ vsecp->vsa_aclflags |= ACL_DEFAULTED;
+ if (zp->z_pflags & ZFS_ACL_PROTECTED)
+ vsecp->vsa_aclflags |= ACL_PROTECTED;
+ if (zp->z_pflags & ZFS_ACL_AUTO_INHERIT)
+ vsecp->vsa_aclflags |= ACL_AUTO_INHERIT;
+ }
+
+ mutex_exit(&zp->z_acl_lock);
+
+ return (0);
+}
+
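+/*
+ * Convert a caller-supplied vsecattr_t (an array of ace_t entries plus
+ * optional ACL-wide flags) into an internal zfs_acl_t of the version
+ * appropriate for this filesystem, creating FUID mappings as needed.
+ */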
+int
+zfs_vsec_2_aclp(zfsvfs_t *zfsvfs, umode_t obj_mode,
+ vsecattr_t *vsecp, cred_t *cr, zfs_fuid_info_t **fuidp, zfs_acl_t **zaclp)
+{
+ zfs_acl_t *aclp;
+ zfs_acl_node_t *aclnode;
+ int aclcnt = vsecp->vsa_aclcnt;
+ int error;
+
+ if (vsecp->vsa_aclcnt > MAX_ACL_ENTRIES || vsecp->vsa_aclcnt <= 0)
+ return (SET_ERROR(EINVAL));
+
+ aclp = zfs_acl_alloc(zfs_acl_version(zfsvfs->z_version));
+
+ aclp->z_hints = 0;
+ aclnode = zfs_acl_node_alloc(aclcnt * sizeof (zfs_object_ace_t));
+ if (aclp->z_version == ZFS_ACL_VERSION_INITIAL) {
+ if ((error = zfs_copy_ace_2_oldace(obj_mode, aclp,
+ (ace_t *)vsecp->vsa_aclentp, aclnode->z_acldata,
+ aclcnt, &aclnode->z_size)) != 0) {
+ zfs_acl_free(aclp);
+ zfs_acl_node_free(aclnode);
+ return (error);
+ }
+ } else {
+ if ((error = zfs_copy_ace_2_fuid(zfsvfs, obj_mode, aclp,
+ vsecp->vsa_aclentp, aclnode->z_acldata, aclcnt,
+ &aclnode->z_size, fuidp, cr)) != 0) {
+ zfs_acl_free(aclp);
+ zfs_acl_node_free(aclnode);
+ return (error);
+ }
+ }
+ aclp->z_acl_bytes = aclnode->z_size;
+ aclnode->z_ace_count = aclcnt;
+ aclp->z_acl_count = aclcnt;
+ list_insert_head(&aclp->z_acl, aclnode);
+
+ /*
+ * If flags are being set then add them to z_hints
+ */
+ if (vsecp->vsa_mask & VSA_ACE_ACLFLAGS) {
+ if (vsecp->vsa_aclflags & ACL_PROTECTED)
+ aclp->z_hints |= ZFS_ACL_PROTECTED;
+ if (vsecp->vsa_aclflags & ACL_DEFAULTED)
+ aclp->z_hints |= ZFS_ACL_DEFAULTED;
+ if (vsecp->vsa_aclflags & ACL_AUTO_INHERIT)
+ aclp->z_hints |= ZFS_ACL_AUTO_INHERIT;
+ }
+
+ *zaclp = aclp;
+
+ return (0);
+}
+
+/*
+ * Set a file's ACL
+ */
+int
+zfs_setacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr)
+{
+ zfsvfs_t *zfsvfs = ZTOZSB(zp);
+ zilog_t *zilog = zfsvfs->z_log;
+ ulong_t mask = vsecp->vsa_mask & (VSA_ACE | VSA_ACECNT);
+ dmu_tx_t *tx;
+ int error;
+ zfs_acl_t *aclp;
+ zfs_fuid_info_t *fuidp = NULL;
+ boolean_t fuid_dirtied;
+ uint64_t acl_obj;
+
+ if (mask == 0)
+ return (SET_ERROR(ENOSYS));
+
+ if (zp->z_pflags & ZFS_IMMUTABLE)
+ return (SET_ERROR(EPERM));
+
+ if ((error = zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr)))
+ return (error);
+
+ error = zfs_vsec_2_aclp(zfsvfs, ZTOI(zp)->i_mode, vsecp, cr, &fuidp,
+ &aclp);
+ if (error)
+ return (error);
+
+ /*
+ * If ACL wide flags aren't being set then preserve any
+ * existing flags.
+ */
+ if (!(vsecp->vsa_mask & VSA_ACE_ACLFLAGS)) {
+ aclp->z_hints |=
+ (zp->z_pflags & V4_ACL_WIDE_FLAGS);
+ }
+top:
+ mutex_enter(&zp->z_acl_lock);
+ mutex_enter(&zp->z_lock);
+
+ tx = dmu_tx_create(zfsvfs->z_os);
+
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
+
+ fuid_dirtied = zfsvfs->z_fuid_dirty;
+ if (fuid_dirtied)
+ zfs_fuid_txhold(zfsvfs, tx);
+
+ /*
+ * If old version and ACL won't fit in bonus and we aren't
+ * upgrading then take out necessary DMU holds
+ */
+
+ if ((acl_obj = zfs_external_acl(zp)) != 0) {
+ if (zfsvfs->z_version >= ZPL_VERSION_FUID &&
+ zfs_znode_acl_version(zp) <= ZFS_ACL_VERSION_INITIAL) {
+ dmu_tx_hold_free(tx, acl_obj, 0,
+ DMU_OBJECT_END);
+ dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
+ aclp->z_acl_bytes);
+ } else {
+ dmu_tx_hold_write(tx, acl_obj, 0, aclp->z_acl_bytes);
+ }
+ } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) {
+ dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, aclp->z_acl_bytes);
+ }
+
+ zfs_sa_upgrade_txholds(tx, zp);
+ error = dmu_tx_assign(tx, TXG_NOWAIT);
+ if (error) {
+ mutex_exit(&zp->z_acl_lock);
+ mutex_exit(&zp->z_lock);
+
+ if (error == ERESTART) {
+ dmu_tx_wait(tx);
+ dmu_tx_abort(tx);
+ goto top;
+ }
+ dmu_tx_abort(tx);
+ zfs_acl_free(aclp);
+ return (error);
+ }
+
+ error = zfs_aclset_common(zp, aclp, cr, tx);
+ ASSERT(error == 0);
+ ASSERT(zp->z_acl_cached == NULL);
+ zp->z_acl_cached = aclp;
+
+ if (fuid_dirtied)
+ zfs_fuid_sync(zfsvfs, tx);
+
+ zfs_log_acl(zilog, tx, zp, vsecp, fuidp);
+
+ if (fuidp)
+ zfs_fuid_info_free(fuidp);
+ dmu_tx_commit(tx);
+
+ mutex_exit(&zp->z_lock);
+ mutex_exit(&zp->z_acl_lock);
+
+ return (error);
+}
+
+/*
+ * Check accesses of interest (AoI) against attributes of the dataset
+ * such as read-only. Returns zero if no AoI conflict with dataset
+ * attributes, otherwise an appropriate errno is returned.
+ */
+static int
+zfs_zaccess_dataset_check(znode_t *zp, uint32_t v4_mode)
+{
+ if ((v4_mode & WRITE_MASK) && (zfs_is_readonly(ZTOZSB(zp))) &&
+ (!Z_ISDEV(ZTOI(zp)->i_mode) ||
+ (Z_ISDEV(ZTOI(zp)->i_mode) && (v4_mode & WRITE_MASK_ATTRS)))) {
+ return (SET_ERROR(EROFS));
+ }
+
+ /*
+ * Only check for READONLY on non-directories.
+ */
+ if ((v4_mode & WRITE_MASK_DATA) &&
+ ((!S_ISDIR(ZTOI(zp)->i_mode) &&
+ (zp->z_pflags & (ZFS_READONLY | ZFS_IMMUTABLE))) ||
+ (S_ISDIR(ZTOI(zp)->i_mode) &&
+ (zp->z_pflags & ZFS_IMMUTABLE)))) {
+ return (SET_ERROR(EPERM));
+ }
+
+ if ((v4_mode & (ACE_DELETE | ACE_DELETE_CHILD)) &&
+ (zp->z_pflags & ZFS_NOUNLINK)) {
+ return (SET_ERROR(EPERM));
+ }
+
+ if (((v4_mode & (ACE_READ_DATA|ACE_EXECUTE)) &&
+ (zp->z_pflags & ZFS_AV_QUARANTINED))) {
+ return (SET_ERROR(EACCES));
+ }
+
+ return (0);
+}
+
+/*
+ * The primary usage of this function is to loop through all of the
+ * ACEs in the znode, determining what accesses of interest (AoI) to
+ * the caller are allowed or denied. The AoI are expressed as bits in
+ * the working_mode parameter. As each ACE is processed, bits covered
+ * by that ACE are removed from the working_mode. This removal
+ * facilitates two things. The first is that when the working mode is
+ * empty (= 0), we know we've looked at all the AoI. The second is
+ * that the ACE interpretation rules don't allow a later ACE to undo
+ * something granted or denied by an earlier ACE. Removing the
+ * discovered access or denial enforces this rule. At the end of
+ * processing the ACEs, all AoI that were found to be denied are
+ * placed into the working_mode, giving the caller a mask of denied
+ * accesses. Returns:
+ * 0 if all AoI granted
+ * EACCES if the denied mask is non-zero
+ * other error if abnormal failure (e.g., IO error)
+ *
+ * A secondary usage of the function is to determine if any of the
+ * AoI are granted. If an ACE grants any access in
+ * the working_mode, we immediately short circuit out of the function.
+ * This mode is chosen by setting anyaccess to B_TRUE. The
+ * working_mode is not a denied access mask upon exit if the function
+ * is used in this manner.
+ */
+static int
+zfs_zaccess_aces_check(znode_t *zp, uint32_t *working_mode,
+ boolean_t anyaccess, cred_t *cr)
+{
+ zfsvfs_t *zfsvfs = ZTOZSB(zp);
+ zfs_acl_t *aclp;
+ int error;
+ uid_t uid = crgetuid(cr);
+ uint64_t who;
+ uint16_t type, iflags;
+ uint16_t entry_type;
+ uint32_t access_mask;
+ uint32_t deny_mask = 0;
+ zfs_ace_hdr_t *acep = NULL;
+ boolean_t checkit;
+ uid_t gowner;
+ uid_t fowner;
+
+ zfs_fuid_map_ids(zp, cr, &fowner, &gowner);
+
+ mutex_enter(&zp->z_acl_lock);
+
+ error = zfs_acl_node_read(zp, B_FALSE, &aclp, B_FALSE);
+ if (error != 0) {
+ mutex_exit(&zp->z_acl_lock);
+ return (error);
+ }
+
+ ASSERT(zp->z_acl_cached);
+
+ while ((acep = zfs_acl_next_ace(aclp, acep, &who, &access_mask,
+ &iflags, &type))) {
+ uint32_t mask_matched;
+
+ if (!zfs_acl_valid_ace_type(type, iflags))
+ continue;
+
+ if (S_ISDIR(ZTOI(zp)->i_mode) &&
+ (iflags & ACE_INHERIT_ONLY_ACE))
+ continue;
+
+ /* Skip ACE if it does not affect any AoI */
+ mask_matched = (access_mask & *working_mode);
+ if (!mask_matched)
+ continue;
+
+ entry_type = (iflags & ACE_TYPE_FLAGS);
+
+ checkit = B_FALSE;
+
+ switch (entry_type) {
+ case ACE_OWNER:
+ if (uid == fowner)
+ checkit = B_TRUE;
+ break;
+ case OWNING_GROUP:
+ who = gowner;
+ /*FALLTHROUGH*/
+ case ACE_IDENTIFIER_GROUP:
+ checkit = zfs_groupmember(zfsvfs, who, cr);
+ break;
+ case ACE_EVERYONE:
+ checkit = B_TRUE;
+ break;
+
+ /* USER Entry */
+ default:
+ if (entry_type == 0) {
+ uid_t newid;
+
+ newid = zfs_fuid_map_id(zfsvfs, who, cr,
+ ZFS_ACE_USER);
+ if (newid != IDMAP_WK_CREATOR_OWNER_UID &&
+ uid == newid)
+ checkit = B_TRUE;
+ break;
+ } else {
+ mutex_exit(&zp->z_acl_lock);
+ return (SET_ERROR(EIO));
+ }
+ }
+
+ if (checkit) {
+ if (type == DENY) {
+ DTRACE_PROBE3(zfs__ace__denies,
+ znode_t *, zp,
+ zfs_ace_hdr_t *, acep,
+ uint32_t, mask_matched);
+ deny_mask |= mask_matched;
+ } else {
+ DTRACE_PROBE3(zfs__ace__allows,
+ znode_t *, zp,
+ zfs_ace_hdr_t *, acep,
+ uint32_t, mask_matched);
+ if (anyaccess) {
+ mutex_exit(&zp->z_acl_lock);
+ return (0);
+ }
+ }
+ *working_mode &= ~mask_matched;
+ }
+
+ /* Are we done? */
+ if (*working_mode == 0)
+ break;
+ }
+
+ mutex_exit(&zp->z_acl_lock);
+
+ /* Put the found 'denies' back on the working mode */
+ if (deny_mask) {
+ *working_mode |= deny_mask;
+ return (SET_ERROR(EACCES));
+ } else if (*working_mode) {
+ return (-1);
+ }
+
+ return (0);
+}
+
+/*
+ * Return true if any access whatsoever is granted; we don't actually
+ * care what access is granted.
+ */
+boolean_t
+zfs_has_access(znode_t *zp, cred_t *cr)
+{
+ uint32_t have = ACE_ALL_PERMS;
+
+ if (zfs_zaccess_aces_check(zp, &have, B_TRUE, cr) != 0) {
+ uid_t owner;
+
+ owner = zfs_fuid_map_id(ZTOZSB(zp),
+ KUID_TO_SUID(ZTOI(zp)->i_uid), cr, ZFS_OWNER);
+ return (secpolicy_vnode_any_access(cr, ZTOI(zp), owner) == 0);
+ }
+ return (B_TRUE);
+}
+
+static int
+zfs_zaccess_common(znode_t *zp, uint32_t v4_mode, uint32_t *working_mode,
+ boolean_t *check_privs, boolean_t skipaclchk, cred_t *cr)
+{
+ zfsvfs_t *zfsvfs = ZTOZSB(zp);
+ int err;
+
+ *working_mode = v4_mode;
+ *check_privs = B_TRUE;
+
+ /*
+ * Short circuit empty requests
+ */
+ if (v4_mode == 0 || zfsvfs->z_replay) {
+ *working_mode = 0;
+ return (0);
+ }
+
+ if ((err = zfs_zaccess_dataset_check(zp, v4_mode)) != 0) {
+ *check_privs = B_FALSE;
+ return (err);
+ }
+
+ /*
+ * The caller requested that the ACL check be skipped. This
+ * would only happen if the caller checked VOP_ACCESS() with a
+ * 32 bit ACE mask and already had the appropriate permissions.
+ */
+ if (skipaclchk) {
+ *working_mode = 0;
+ return (0);
+ }
+
+ return (zfs_zaccess_aces_check(zp, working_mode, B_FALSE, cr));
+}
+
+static int
+zfs_zaccess_append(znode_t *zp, uint32_t *working_mode, boolean_t *check_privs,
+ cred_t *cr)
+{
+ if (*working_mode != ACE_WRITE_DATA)
+ return (SET_ERROR(EACCES));
+
+ return (zfs_zaccess_common(zp, ACE_APPEND_DATA, working_mode,
+ check_privs, B_FALSE, cr));
+}
+
+int
+zfs_fastaccesschk_execute(znode_t *zdp, cred_t *cr)
+{
+ boolean_t owner = B_FALSE;
+ boolean_t groupmbr = B_FALSE;
+ boolean_t is_attr;
+ uid_t uid = crgetuid(cr);
+ int error;
+
+ if (zdp->z_pflags & ZFS_AV_QUARANTINED)
+ return (SET_ERROR(EACCES));
+
+ is_attr = ((zdp->z_pflags & ZFS_XATTR) &&
+ (S_ISDIR(ZTOI(zdp)->i_mode)));
+ if (is_attr)
+ goto slow;
+
+
+ mutex_enter(&zdp->z_acl_lock);
+
+ if (zdp->z_pflags & ZFS_NO_EXECS_DENIED) {
+ mutex_exit(&zdp->z_acl_lock);
+ return (0);
+ }
+
+ if (KUID_TO_SUID(ZTOI(zdp)->i_uid) != 0 ||
+ KGID_TO_SGID(ZTOI(zdp)->i_gid) != 0) {
+ mutex_exit(&zdp->z_acl_lock);
+ goto slow;
+ }
+
+ if (uid == KUID_TO_SUID(ZTOI(zdp)->i_uid)) {
+ owner = B_TRUE;
+ if (zdp->z_mode & S_IXUSR) {
+ mutex_exit(&zdp->z_acl_lock);
+ return (0);
+ } else {
+ mutex_exit(&zdp->z_acl_lock);
+ goto slow;
+ }
+ }
+ if (groupmember(KGID_TO_SGID(ZTOI(zdp)->i_gid), cr)) {
+ groupmbr = B_TRUE;
+ if (zdp->z_mode & S_IXGRP) {
+ mutex_exit(&zdp->z_acl_lock);
+ return (0);
+ } else {
+ mutex_exit(&zdp->z_acl_lock);
+ goto slow;
+ }
+ }
+ if (!owner && !groupmbr) {
+ if (zdp->z_mode & S_IXOTH) {
+ mutex_exit(&zdp->z_acl_lock);
+ return (0);
+ }
+ }
+
+ mutex_exit(&zdp->z_acl_lock);
+
+slow:
+ DTRACE_PROBE(zfs__fastpath__execute__access__miss);
+ ZFS_ENTER(ZTOZSB(zdp));
+ error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr);
+ ZFS_EXIT(ZTOZSB(zdp));
+ return (error);
+}
+
+/*
+ * Determine whether access should be granted/denied.
+ *
+ * The least priv subsystem is always consulted as a basic privilege
+ * can define any form of access.
+ */
+int
+zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr)
+{
+ uint32_t working_mode;
+ int error;
+ int is_attr;
+ boolean_t check_privs;
+ znode_t *xzp;
+ znode_t *check_zp = zp;
+ mode_t needed_bits;
+ uid_t owner;
+
+ is_attr = ((zp->z_pflags & ZFS_XATTR) && S_ISDIR(ZTOI(zp)->i_mode));
+
+ /*
+ * If attribute then validate against base file
+ */
+ if (is_attr) {
+ if ((error = zfs_zget(ZTOZSB(zp),
+ zp->z_xattr_parent, &xzp)) != 0) {
+ return (error);
+ }
+
+ check_zp = xzp;
+
+ /*
+ * fixup mode to map to xattr perms
+ */
+
+ if (mode & (ACE_WRITE_DATA|ACE_APPEND_DATA)) {
+ mode &= ~(ACE_WRITE_DATA|ACE_APPEND_DATA);
+ mode |= ACE_WRITE_NAMED_ATTRS;
+ }
+
+ if (mode & (ACE_READ_DATA|ACE_EXECUTE)) {
+ mode &= ~(ACE_READ_DATA|ACE_EXECUTE);
+ mode |= ACE_READ_NAMED_ATTRS;
+ }
+ }
+
+ owner = zfs_fuid_map_id(ZTOZSB(zp), KUID_TO_SUID(ZTOI(zp)->i_uid),
+ cr, ZFS_OWNER);
+ /*
+ * Map the bits required to the standard inode flags
+ * S_IRUSR|S_IWUSR|S_IXUSR in the needed_bits. Map the bits
+ * mapped by working_mode (currently missing) in missing_bits.
+ * Call secpolicy_vnode_access2() with (needed_bits & ~checkmode),
+ * needed_bits.
+ */
+ needed_bits = 0;
+
+ working_mode = mode;
+ if ((working_mode & (ACE_READ_ACL|ACE_READ_ATTRIBUTES)) &&
+ owner == crgetuid(cr))
+ working_mode &= ~(ACE_READ_ACL|ACE_READ_ATTRIBUTES);
+
+ if (working_mode & (ACE_READ_DATA|ACE_READ_NAMED_ATTRS|
+ ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_SYNCHRONIZE))
+ needed_bits |= S_IRUSR;
+ if (working_mode & (ACE_WRITE_DATA|ACE_WRITE_NAMED_ATTRS|
+ ACE_APPEND_DATA|ACE_WRITE_ATTRIBUTES|ACE_SYNCHRONIZE))
+ needed_bits |= S_IWUSR;
+ if (working_mode & ACE_EXECUTE)
+ needed_bits |= S_IXUSR;
+
+ if ((error = zfs_zaccess_common(check_zp, mode, &working_mode,
+ &check_privs, skipaclchk, cr)) == 0) {
+ if (is_attr)
+ zrele(xzp);
+ return (secpolicy_vnode_access2(cr, ZTOI(zp), owner,
+ needed_bits, needed_bits));
+ }
+
+ if (error && !check_privs) {
+ if (is_attr)
+ zrele(xzp);
+ return (error);
+ }
+
+ if (error && (flags & V_APPEND)) {
+ error = zfs_zaccess_append(zp, &working_mode, &check_privs, cr);
+ }
+
+ if (error && check_privs) {
+ mode_t checkmode = 0;
+
+ /*
+ * First check for implicit owner permission on
+ * read_acl/read_attributes
+ */
+
+ error = 0;
+ ASSERT(working_mode != 0);
+
+ if ((working_mode & (ACE_READ_ACL|ACE_READ_ATTRIBUTES) &&
+ owner == crgetuid(cr)))
+ working_mode &= ~(ACE_READ_ACL|ACE_READ_ATTRIBUTES);
+
+ if (working_mode & (ACE_READ_DATA|ACE_READ_NAMED_ATTRS|
+ ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_SYNCHRONIZE))
+ checkmode |= S_IRUSR;
+ if (working_mode & (ACE_WRITE_DATA|ACE_WRITE_NAMED_ATTRS|
+ ACE_APPEND_DATA|ACE_WRITE_ATTRIBUTES|ACE_SYNCHRONIZE))
+ checkmode |= S_IWUSR;
+ if (working_mode & ACE_EXECUTE)
+ checkmode |= S_IXUSR;
+
+ error = secpolicy_vnode_access2(cr, ZTOI(check_zp), owner,
+ needed_bits & ~checkmode, needed_bits);
+
+ if (error == 0 && (working_mode & ACE_WRITE_OWNER))
+ error = secpolicy_vnode_chown(cr, owner);
+ if (error == 0 && (working_mode & ACE_WRITE_ACL))
+ error = secpolicy_vnode_setdac(cr, owner);
+
+ if (error == 0 && (working_mode &
+ (ACE_DELETE|ACE_DELETE_CHILD)))
+ error = secpolicy_vnode_remove(cr);
+
+ if (error == 0 && (working_mode & ACE_SYNCHRONIZE)) {
+ error = secpolicy_vnode_chown(cr, owner);
+ }
+ if (error == 0) {
+ /*
+ * See if any bits other than those already checked
+ * for are still present. If so then return EACCES
+ */
+ if (working_mode & ~(ZFS_CHECKED_MASKS)) {
+ error = SET_ERROR(EACCES);
+ }
+ }
+ } else if (error == 0) {
+ error = secpolicy_vnode_access2(cr, ZTOI(zp), owner,
+ needed_bits, needed_bits);
+ }
+
+ if (is_attr)
+ zrele(xzp);
+
+ return (error);
+}
+
+/*
+ * Translate traditional unix S_IRUSR/S_IWUSR/S_IXUSR mode into
+ * NFSv4-style ZFS ACL format and call zfs_zaccess()
+ */
+int
+zfs_zaccess_rwx(znode_t *zp, mode_t mode, int flags, cred_t *cr)
+{
+ return (zfs_zaccess(zp, zfs_unix_to_v4(mode >> 6), flags, B_FALSE, cr));
+}
+
+/*
+ * Access function for secpolicy_vnode_setattr
+ */
+int
+zfs_zaccess_unix(znode_t *zp, mode_t mode, cred_t *cr)
+{
+ int v4_mode = zfs_unix_to_v4(mode >> 6);
+
+ return (zfs_zaccess(zp, v4_mode, 0, B_FALSE, cr));
+}
+
+/* See zfs_zaccess_delete() */
+int zfs_write_implies_delete_child = 1;
+
+/*
+ * Determine whether delete access should be granted.
+ *
+ * The following chart outlines how we handle delete permissions which is
+ * how recent versions of Windows (e.g. Windows 2008) handle it. The efficiency
+ * comes from not having to check the parent ACL where the object itself grants
+ * delete:
+ *
+ * -------------------------------------------------------
+ * | Parent Dir | Target Object Permissions |
+ * | permissions | |
+ * -------------------------------------------------------
+ * | | ACL Allows | ACL Denies| Delete |
+ * | | Delete | Delete | unspecified|
+ * -------------------------------------------------------
+ * | ACL Allows | Permit | Deny * | Permit |
+ * | DELETE_CHILD | | | |
+ * -------------------------------------------------------
+ * | ACL Denies | Permit | Deny | Deny |
+ * | DELETE_CHILD | | | |
+ * -------------------------------------------------------
+ * | ACL specifies | | | |
+ * | only allow | Permit | Deny * | Permit |
+ * | write and | | | |
+ * | execute | | | |
+ * -------------------------------------------------------
+ * | ACL denies | | | |
+ * | write and | Permit | Deny | Deny |
+ * | execute | | | |
+ * -------------------------------------------------------
+ * ^
+ * |
+ * Re. execute permission on the directory: if that's missing,
+ * the vnode lookup of the target will fail before we get here.
+ *
+ * Re [*] in the table above: NFSv4 would normally Permit delete for
+ * these two cells of the matrix.
+ * See acl.h for notes on which ACE_... flags should be checked for which
+ * operations. Specifically, the NFSv4 committee recommendation is in
+ * conflict with the Windows interpretation of DENY ACEs, where DENY ACEs
+ * should take precedence over ALLOW ACEs.
+ *
+ * This implementation always consults the target object's ACL first.
+ * If a DENY ACE is present on the target object that specifies ACE_DELETE,
+ * delete access is denied. If an ALLOW ACE with ACE_DELETE is present on
+ * the target object, access is allowed. If and only if no entries with
+ * ACE_DELETE are present in the object's ACL, check the container's ACL
+ * for entries with ACE_DELETE_CHILD.
+ *
+ * A summary of the logic implemented from the table above is as follows:
+ *
+ * First check for DENY ACEs that apply.
+ * If either target or container has a deny, EACCES.
+ *
+ * Delete access can then be summarized as follows:
+ * 1: The object to be deleted grants ACE_DELETE, or
+ * 2: The containing directory grants ACE_DELETE_CHILD.
+ * In a Windows system, that would be the end of the story.
+ * In this system, (2) has some complications...
+ * 2a: "sticky" bit on a directory adds restrictions, and
+ * 2b: existing ACEs from previous versions of ZFS may
+ * not carry ACE_DELETE_CHILD where they should, so we
+ * also allow delete when ACE_WRITE_DATA is granted.
+ *
+ * Note: 2b is technically a work-around for a prior bug,
+ * which hopefully can go away some day. For those who
+ * no longer need the work around, and for testing, this
+ * work-around is made conditional via the tunable:
+ * zfs_write_implies_delete_child
+ */
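+/*
+ * Worked example of the table above (illustration only): an explicit DENY
+ * of ACE_DELETE on the target object results in EACCES even when the
+ * parent directory grants ACE_DELETE_CHILD. If the target's ACL says
+ * nothing about delete, the decision falls through to the parent's
+ * ACE_DELETE_CHILD (or ACE_WRITE_DATA when zfs_write_implies_delete_child
+ * is set), subject to the sticky bit check at the end of the function.
+ */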
+int
+zfs_zaccess_delete(znode_t *dzp, znode_t *zp, cred_t *cr)
+{
+ uint32_t wanted_dirperms;
+ uint32_t dzp_working_mode = 0;
+ uint32_t zp_working_mode = 0;
+ int dzp_error, zp_error;
+ boolean_t dzpcheck_privs;
+ boolean_t zpcheck_privs;
+
+ if (zp->z_pflags & (ZFS_IMMUTABLE | ZFS_NOUNLINK))
+ return (SET_ERROR(EPERM));
+
+ /*
+ * Case 1:
+ * If target object grants ACE_DELETE then we are done. This is
+ * indicated by a return value of 0. For this case we don't worry
+ * about the sticky bit because sticky only applies to the parent
+ * directory and this is the child access result.
+ *
+ * If we encounter a DENY ACE here, we're also done (EACCES).
+ * Note that if we hit a DENY ACE here (on the target) it should
+ * take precedence over a DENY ACE on the container, so that when
+ * we have more complete auditing support we will be able to
+ * report an access failure against the specific target.
+ * (This is part of why we're checking the target first.)
+ */
+ zp_error = zfs_zaccess_common(zp, ACE_DELETE, &zp_working_mode,
+ &zpcheck_privs, B_FALSE, cr);
+ if (zp_error == EACCES) {
+ /* We hit a DENY ACE. */
+ if (!zpcheck_privs)
+ return (SET_ERROR(zp_error));
+ return (secpolicy_vnode_remove(cr));
+
+ }
+ if (zp_error == 0)
+ return (0);
+
+ /*
+ * Case 2:
+ * If the containing directory grants ACE_DELETE_CHILD,
+ * or we're in backward compatibility mode and the
+ * containing directory has ACE_WRITE_DATA, allow.
+ * Case 2b is handled with wanted_dirperms.
+ */
+ wanted_dirperms = ACE_DELETE_CHILD;
+ if (zfs_write_implies_delete_child)
+ wanted_dirperms |= ACE_WRITE_DATA;
+ dzp_error = zfs_zaccess_common(dzp, wanted_dirperms,
+ &dzp_working_mode, &dzpcheck_privs, B_FALSE, cr);
+ if (dzp_error == EACCES) {
+ /* We hit a DENY ACE. */
+ if (!dzpcheck_privs)
+ return (SET_ERROR(dzp_error));
+ return (secpolicy_vnode_remove(cr));
+ }
+
+ /*
+ * Cases 2a, 2b (continued)
+ *
+ * Note: dzp_working_mode now contains any permissions
+ * that were NOT granted. Therefore, if any of the
+ * wanted_dirperms WERE granted, we will have:
+ * dzp_working_mode != wanted_dirperms
+ * We're really asking if ANY of those permissions
+ * were granted, and if so, grant delete access.
+ */
+ if (dzp_working_mode != wanted_dirperms)
+ dzp_error = 0;
+
+ /*
+ * dzp_error is 0 if the container granted us permissions to "modify".
+ * If we do not have permission via one or more ACEs, our current
+ * privileges may still permit us to modify the container.
+ *
+	 * dzpcheck_privs is false when, for example, the FS is read-only.
+ * Otherwise, do privilege checks for the container.
+ */
+ if (dzp_error != 0 && dzpcheck_privs) {
+ uid_t owner;
+
+ /*
+ * The secpolicy call needs the requested access and
+ * the current access mode of the container, but it
+ * only knows about Unix-style modes (VEXEC, VWRITE),
+ * so this must condense the fine-grained ACE bits into
+ * Unix modes.
+ *
+ * The VEXEC flag is easy, because we know that has
+ * always been checked before we get here (during the
+ * lookup of the target vnode). The container has not
+ * granted us permissions to "modify", so we do not set
+ * the VWRITE flag in the current access mode.
+ */
+ owner = zfs_fuid_map_id(ZTOZSB(dzp),
+ KUID_TO_SUID(ZTOI(dzp)->i_uid), cr, ZFS_OWNER);
+ dzp_error = secpolicy_vnode_access2(cr, ZTOI(dzp),
+ owner, S_IXUSR, S_IWUSR|S_IXUSR);
+ }
+ if (dzp_error != 0) {
+ /*
+ * Note: We may have dzp_error = -1 here (from
+		 * zfs_zaccess_common). Don't return that.
+ */
+ return (SET_ERROR(EACCES));
+ }
+
+
+ /*
+ * At this point, we know that the directory permissions allow
+ * us to modify, but we still need to check for the additional
+ * restrictions that apply when the "sticky bit" is set.
+ *
+ * Yes, zfs_sticky_remove_access() also checks this bit, but
+ * checking it here and skipping the call below is nice when
+ * you're watching all of this with dtrace.
+ */
+ if ((dzp->z_mode & S_ISVTX) == 0)
+ return (0);
+
+ /*
+ * zfs_sticky_remove_access will succeed if:
+ * 1. The sticky bit is absent.
+ * 2. We pass the sticky bit restrictions.
+ * 3. We have privileges that always allow file removal.
+ */
+ return (zfs_sticky_remove_access(dzp, zp, cr));
+}
+
+int
+zfs_zaccess_rename(znode_t *sdzp, znode_t *szp, znode_t *tdzp,
+ znode_t *tzp, cred_t *cr)
+{
+ int add_perm;
+ int error;
+
+ if (szp->z_pflags & ZFS_AV_QUARANTINED)
+ return (SET_ERROR(EACCES));
+
+ add_perm = S_ISDIR(ZTOI(szp)->i_mode) ?
+ ACE_ADD_SUBDIRECTORY : ACE_ADD_FILE;
+
+ /*
+	 * Rename permissions are a combination of delete permission +
+ * add file/subdir permission.
+ */
+
+ /*
+ * first make sure we do the delete portion.
+ *
+ * If that succeeds then check for add_file/add_subdir permissions
+ */
+
+ if ((error = zfs_zaccess_delete(sdzp, szp, cr)))
+ return (error);
+
+ /*
+	 * If we have a tzp, see if we can delete it.
+ */
+ if (tzp) {
+ if ((error = zfs_zaccess_delete(tdzp, tzp, cr)))
+ return (error);
+ }
+
+ /*
+ * Now check for add permissions
+ */
+ error = zfs_zaccess(tdzp, add_perm, 0, B_FALSE, cr);
+
+ return (error);
+}
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_ctldir.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_ctldir.c
new file mode 100644
index 000000000000..a1668e46e4f9
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_ctldir.c
@@ -0,0 +1,1260 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ *
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (C) 2011 Lawrence Livermore National Security, LLC.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * LLNL-CODE-403049.
+ * Rewritten for Linux by:
+ * Rohan Puri <rohan.puri15@gmail.com>
+ * Brian Behlendorf <behlendorf1@llnl.gov>
+ * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright 2015, OmniTI Computer Consulting, Inc. All rights reserved.
+ * Copyright (c) 2018 George Melikov. All Rights Reserved.
+ * Copyright (c) 2019 Datto, Inc. All rights reserved.
+ * Copyright (c) 2020 The MathWorks, Inc. All rights reserved.
+ */
+
+/*
+ * ZFS control directory (a.k.a. ".zfs")
+ *
+ * This directory provides a common location for all ZFS meta-objects.
+ * Currently, this is only the 'snapshot' and 'shares' directory, but this may
+ * expand in the future. The elements are built dynamically, as the hierarchy
+ * does not actually exist on disk.
+ *
+ * For 'snapshot', we don't want to have all snapshots always mounted, because
+ * this would take up a huge amount of space in /etc/mnttab. We have three
+ * types of objects:
+ *
+ * ctldir ------> snapshotdir -------> snapshot
+ * |
+ * |
+ * V
+ * mounted fs
+ *
+ * The 'snapshot' node contains just enough information to lookup '..' and act
+ * as a mountpoint for the snapshot. Whenever we lookup a specific snapshot, we
+ * perform an automount of the underlying filesystem and return the
+ * corresponding inode.
+ *
+ * All mounts are handled automatically by a user mode helper which invokes
+ * the mount procedure. Unmounts are handled by allowing the mount
+ * point to expire so the kernel may automatically unmount it.
+ *
+ * The '.zfs', '.zfs/snapshot', and all directories created under
+ * '.zfs/snapshot' (ie: '.zfs/snapshot/<snapname>') all share the same
+ * zfsvfs_t as the head filesystem (what '.zfs' lives under).
+ *
+ * File systems mounted on top of the '.zfs/snapshot/<snapname>' paths
+ * (ie: snapshots) are complete ZFS filesystems and have their own unique
+ * zfsvfs_t. However, the fsid reported by these mounts will be the same
+ * as that used by the parent zfsvfs_t to make NFS happy.
+ */
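+/*
+ * Reserved inode numbers identify the control directory entries created
+ * below: ZFSCTL_INO_ROOT for '.zfs', ZFSCTL_INO_SNAPDIR for
+ * '.zfs/snapshot', ZFSCTL_INO_SHARES for '.zfs/shares', and
+ * ZFSCTL_INO_SNAPDIRS - objsetid for each '.zfs/snapshot/<snapname>'
+ * directory (see zfsctl_snapdir_lookup() and zfsctl_snapdir_fid()).
+ */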
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/time.h>
+#include <sys/sysmacros.h>
+#include <sys/pathname.h>
+#include <sys/vfs.h>
+#include <sys/zfs_ctldir.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zfs_vfsops.h>
+#include <sys/zfs_vnops.h>
+#include <sys/stat.h>
+#include <sys/dmu.h>
+#include <sys/dmu_objset.h>
+#include <sys/dsl_destroy.h>
+#include <sys/dsl_deleg.h>
+#include <sys/zpl.h>
+#include <sys/mntent.h>
+#include "zfs_namecheck.h"
+
+/*
+ * Two AVL trees are maintained which contain all currently automounted
+ * snapshots. Every automounted snapshot maps to a single zfs_snapentry_t
+ * entry which MUST:
+ *
+ * - be attached to both trees, and
+ * - be unique, no duplicate entries are allowed.
+ *
+ * The zfs_snapshots_by_name tree is indexed by the full dataset name
+ * while the zfs_snapshots_by_objsetid tree is indexed by the unique
+ * objsetid. This allows for fast lookups either by name or objsetid.
+ */
+static avl_tree_t zfs_snapshots_by_name;
+static avl_tree_t zfs_snapshots_by_objsetid;
+static krwlock_t zfs_snapshot_lock;
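+
+/*
+ * Illustrative sketch (not compiled): a typical read-side lookup against
+ * these trees using the helpers defined below; the snapshot name is only
+ * an example.
+ *
+ *	rw_enter(&zfs_snapshot_lock, RW_READER);
+ *	zfs_snapentry_t *se = zfsctl_snapshot_find_by_name("pool/fs@snap");
+ *	if (se != NULL) {
+ *		... use se->se_path or se->se_objsetid ...
+ *		zfsctl_snapshot_rele(se);
+ *	}
+ *	rw_exit(&zfs_snapshot_lock);
+ */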
+
+/*
+ * Control Directory Tunables (.zfs)
+ */
+int zfs_expire_snapshot = ZFSCTL_EXPIRE_SNAPSHOT;
+int zfs_admin_snapshot = 0;
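+
+/*
+ * Both tunables are exposed as module parameters (see the bottom of this
+ * file). For example, assuming the module is loaded as 'zfs':
+ *
+ *	echo 1   > /sys/module/zfs/parameters/zfs_admin_snapshot
+ *	echo 300 > /sys/module/zfs/parameters/zfs_expire_snapshot
+ */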
+
+typedef struct {
+ char *se_name; /* full snapshot name */
+ char *se_path; /* full mount path */
+ spa_t *se_spa; /* pool spa */
+ uint64_t se_objsetid; /* snapshot objset id */
+ struct dentry *se_root_dentry; /* snapshot root dentry */
+ taskqid_t se_taskqid; /* scheduled unmount taskqid */
+ avl_node_t se_node_name; /* zfs_snapshots_by_name link */
+ avl_node_t se_node_objsetid; /* zfs_snapshots_by_objsetid link */
+ zfs_refcount_t se_refcount; /* reference count */
+} zfs_snapentry_t;
+
+static void zfsctl_snapshot_unmount_delay_impl(zfs_snapentry_t *se, int delay);
+
+/*
+ * Allocate a new zfs_snapentry_t being careful to make a copy of
+ * the snapshot name and provided mount point. No reference is taken.
+ */
+static zfs_snapentry_t *
+zfsctl_snapshot_alloc(const char *full_name, const char *full_path, spa_t *spa,
+ uint64_t objsetid, struct dentry *root_dentry)
+{
+ zfs_snapentry_t *se;
+
+ se = kmem_zalloc(sizeof (zfs_snapentry_t), KM_SLEEP);
+
+ se->se_name = kmem_strdup(full_name);
+ se->se_path = kmem_strdup(full_path);
+ se->se_spa = spa;
+ se->se_objsetid = objsetid;
+ se->se_root_dentry = root_dentry;
+ se->se_taskqid = TASKQID_INVALID;
+
+ zfs_refcount_create(&se->se_refcount);
+
+ return (se);
+}
+
+/*
+ * Free a zfs_snapentry_t; the caller must ensure there are no active
+ * references.
+ */
+static void
+zfsctl_snapshot_free(zfs_snapentry_t *se)
+{
+ zfs_refcount_destroy(&se->se_refcount);
+ kmem_strfree(se->se_name);
+ kmem_strfree(se->se_path);
+
+ kmem_free(se, sizeof (zfs_snapentry_t));
+}
+
+/*
+ * Hold a reference on the zfs_snapentry_t.
+ */
+static void
+zfsctl_snapshot_hold(zfs_snapentry_t *se)
+{
+ zfs_refcount_add(&se->se_refcount, NULL);
+}
+
+/*
+ * Release a reference on the zfs_snapentry_t. When the number of
+ * references drops to zero the structure will be freed.
+ */
+static void
+zfsctl_snapshot_rele(zfs_snapentry_t *se)
+{
+ if (zfs_refcount_remove(&se->se_refcount, NULL) == 0)
+ zfsctl_snapshot_free(se);
+}
+
+/*
+ * Add a zfs_snapentry_t to both the zfs_snapshots_by_name and
+ * zfs_snapshots_by_objsetid trees. While the zfs_snapentry_t is part
+ * of the trees a reference is held.
+ */
+static void
+zfsctl_snapshot_add(zfs_snapentry_t *se)
+{
+ ASSERT(RW_WRITE_HELD(&zfs_snapshot_lock));
+ zfsctl_snapshot_hold(se);
+ avl_add(&zfs_snapshots_by_name, se);
+ avl_add(&zfs_snapshots_by_objsetid, se);
+}
+
+/*
+ * Remove a zfs_snapentry_t from both the zfs_snapshots_by_name and
+ * zfs_snapshots_by_objsetid trees. Upon removal a reference is dropped,
+ * this can result in the structure being freed if that was the last
+ * remaining reference.
+ */
+static void
+zfsctl_snapshot_remove(zfs_snapentry_t *se)
+{
+ ASSERT(RW_WRITE_HELD(&zfs_snapshot_lock));
+ avl_remove(&zfs_snapshots_by_name, se);
+ avl_remove(&zfs_snapshots_by_objsetid, se);
+ zfsctl_snapshot_rele(se);
+}
+
+/*
+ * Snapshot name comparison function for the zfs_snapshots_by_name.
+ */
+static int
+snapentry_compare_by_name(const void *a, const void *b)
+{
+ const zfs_snapentry_t *se_a = a;
+ const zfs_snapentry_t *se_b = b;
+ int ret;
+
+ ret = strcmp(se_a->se_name, se_b->se_name);
+
+ if (ret < 0)
+ return (-1);
+ else if (ret > 0)
+ return (1);
+ else
+ return (0);
+}
+
+/*
+ * Snapshot objsetid comparison function for zfs_snapshots_by_objsetid.
+ */
+static int
+snapentry_compare_by_objsetid(const void *a, const void *b)
+{
+ const zfs_snapentry_t *se_a = a;
+ const zfs_snapentry_t *se_b = b;
+
+ if (se_a->se_spa != se_b->se_spa)
+ return ((ulong_t)se_a->se_spa < (ulong_t)se_b->se_spa ? -1 : 1);
+
+ if (se_a->se_objsetid < se_b->se_objsetid)
+ return (-1);
+ else if (se_a->se_objsetid > se_b->se_objsetid)
+ return (1);
+ else
+ return (0);
+}
+
+/*
+ * Find a zfs_snapentry_t in zfs_snapshots_by_name. If the snapname
+ * is found a pointer to the zfs_snapentry_t is returned and a reference
+ * taken on the structure. The caller is responsible for dropping the
+ * reference with zfsctl_snapshot_rele(). If the snapname is not found
+ * NULL will be returned.
+ */
+static zfs_snapentry_t *
+zfsctl_snapshot_find_by_name(const char *snapname)
+{
+ zfs_snapentry_t *se, search;
+
+ ASSERT(RW_LOCK_HELD(&zfs_snapshot_lock));
+
+ search.se_name = (char *)snapname;
+ se = avl_find(&zfs_snapshots_by_name, &search, NULL);
+ if (se)
+ zfsctl_snapshot_hold(se);
+
+ return (se);
+}
+
+/*
+ * Find a zfs_snapentry_t in zfs_snapshots_by_objsetid given the objset id
+ * rather than the snapname. In all other respects it behaves the same
+ * as zfsctl_snapshot_find_by_name().
+ */
+static zfs_snapentry_t *
+zfsctl_snapshot_find_by_objsetid(spa_t *spa, uint64_t objsetid)
+{
+ zfs_snapentry_t *se, search;
+
+ ASSERT(RW_LOCK_HELD(&zfs_snapshot_lock));
+
+ search.se_spa = spa;
+ search.se_objsetid = objsetid;
+ se = avl_find(&zfs_snapshots_by_objsetid, &search, NULL);
+ if (se)
+ zfsctl_snapshot_hold(se);
+
+ return (se);
+}
+
+/*
+ * Rename a zfs_snapentry_t in the zfs_snapshots_by_name. The structure is
+ * removed, renamed, and added back to the new correct location in the tree.
+ */
+static int
+zfsctl_snapshot_rename(const char *old_snapname, const char *new_snapname)
+{
+ zfs_snapentry_t *se;
+
+ ASSERT(RW_WRITE_HELD(&zfs_snapshot_lock));
+
+ se = zfsctl_snapshot_find_by_name(old_snapname);
+ if (se == NULL)
+ return (SET_ERROR(ENOENT));
+
+ zfsctl_snapshot_remove(se);
+ kmem_strfree(se->se_name);
+ se->se_name = kmem_strdup(new_snapname);
+ zfsctl_snapshot_add(se);
+ zfsctl_snapshot_rele(se);
+
+ return (0);
+}
+
+/*
+ * Delayed task responsible for unmounting an expired automounted snapshot.
+ */
+static void
+snapentry_expire(void *data)
+{
+ zfs_snapentry_t *se = (zfs_snapentry_t *)data;
+ spa_t *spa = se->se_spa;
+ uint64_t objsetid = se->se_objsetid;
+
+ if (zfs_expire_snapshot <= 0) {
+ zfsctl_snapshot_rele(se);
+ return;
+ }
+
+ se->se_taskqid = TASKQID_INVALID;
+ (void) zfsctl_snapshot_unmount(se->se_name, MNT_EXPIRE);
+ zfsctl_snapshot_rele(se);
+
+ /*
+ * Reschedule the unmount if the zfs_snapentry_t wasn't removed.
+ * This can occur when the snapshot is busy.
+ */
+ rw_enter(&zfs_snapshot_lock, RW_READER);
+ if ((se = zfsctl_snapshot_find_by_objsetid(spa, objsetid)) != NULL) {
+ zfsctl_snapshot_unmount_delay_impl(se, zfs_expire_snapshot);
+ zfsctl_snapshot_rele(se);
+ }
+ rw_exit(&zfs_snapshot_lock);
+}
+
+/*
+ * Cancel an automatic unmount of a snapname. This callback is responsible
+ * for dropping the reference on the zfs_snapentry_t which was taken
+ * during dispatch.
+ */
+static void
+zfsctl_snapshot_unmount_cancel(zfs_snapentry_t *se)
+{
+ if (taskq_cancel_id(system_delay_taskq, se->se_taskqid) == 0) {
+ se->se_taskqid = TASKQID_INVALID;
+ zfsctl_snapshot_rele(se);
+ }
+}
+
+/*
+ * Dispatch the unmount task for delayed handling with a hold protecting it.
+ */
+static void
+zfsctl_snapshot_unmount_delay_impl(zfs_snapentry_t *se, int delay)
+{
+ ASSERT3S(se->se_taskqid, ==, TASKQID_INVALID);
+
+ if (delay <= 0)
+ return;
+
+ zfsctl_snapshot_hold(se);
+ se->se_taskqid = taskq_dispatch_delay(system_delay_taskq,
+ snapentry_expire, se, TQ_SLEEP, ddi_get_lbolt() + delay * HZ);
+}
+
+/*
+ * Schedule an automatic unmount of objset id to occur in delay seconds from
+ * now. Any previous delayed unmount will be cancelled in favor of the
+ * updated deadline. A reference is taken by zfsctl_snapshot_find_by_objsetid()
+ * and held until the outstanding task is handled or cancelled.
+ */
+int
+zfsctl_snapshot_unmount_delay(spa_t *spa, uint64_t objsetid, int delay)
+{
+ zfs_snapentry_t *se;
+ int error = ENOENT;
+
+ rw_enter(&zfs_snapshot_lock, RW_READER);
+ if ((se = zfsctl_snapshot_find_by_objsetid(spa, objsetid)) != NULL) {
+ zfsctl_snapshot_unmount_cancel(se);
+ zfsctl_snapshot_unmount_delay_impl(se, delay);
+ zfsctl_snapshot_rele(se);
+ error = 0;
+ }
+ rw_exit(&zfs_snapshot_lock);
+
+ return (error);
+}
+
+/*
+ * Check if snapname is currently mounted. Returns B_TRUE when mounted
+ * and B_FALSE when unmounted.
+ */
+static boolean_t
+zfsctl_snapshot_ismounted(const char *snapname)
+{
+ zfs_snapentry_t *se;
+ boolean_t ismounted = B_FALSE;
+
+ rw_enter(&zfs_snapshot_lock, RW_READER);
+ if ((se = zfsctl_snapshot_find_by_name(snapname)) != NULL) {
+ zfsctl_snapshot_rele(se);
+ ismounted = B_TRUE;
+ }
+ rw_exit(&zfs_snapshot_lock);
+
+ return (ismounted);
+}
+
+/*
+ * Check if the given inode is a part of the virtual .zfs directory.
+ */
+boolean_t
+zfsctl_is_node(struct inode *ip)
+{
+ return (ITOZ(ip)->z_is_ctldir);
+}
+
+/*
+ * Check if the given inode is a '.zfs/snapshot/<snapname>' directory.
+ */
+boolean_t
+zfsctl_is_snapdir(struct inode *ip)
+{
+ return (zfsctl_is_node(ip) && (ip->i_ino <= ZFSCTL_INO_SNAPDIRS));
+}
+
+/*
+ * Allocate a new inode with the passed id and ops.
+ */
+static struct inode *
+zfsctl_inode_alloc(zfsvfs_t *zfsvfs, uint64_t id,
+ const struct file_operations *fops, const struct inode_operations *ops)
+{
+ inode_timespec_t now;
+ struct inode *ip;
+ znode_t *zp;
+
+ ip = new_inode(zfsvfs->z_sb);
+ if (ip == NULL)
+ return (NULL);
+
+ now = current_time(ip);
+ zp = ITOZ(ip);
+ ASSERT3P(zp->z_dirlocks, ==, NULL);
+ ASSERT3P(zp->z_acl_cached, ==, NULL);
+ ASSERT3P(zp->z_xattr_cached, ==, NULL);
+ zp->z_id = id;
+ zp->z_unlinked = B_FALSE;
+ zp->z_atime_dirty = B_FALSE;
+ zp->z_zn_prefetch = B_FALSE;
+ zp->z_is_sa = B_FALSE;
+ zp->z_is_mapped = B_FALSE;
+ zp->z_is_ctldir = B_TRUE;
+ zp->z_is_stale = B_FALSE;
+ zp->z_sa_hdl = NULL;
+ zp->z_blksz = 0;
+ zp->z_seq = 0;
+ zp->z_mapcnt = 0;
+ zp->z_size = 0;
+ zp->z_pflags = 0;
+ zp->z_mode = 0;
+ zp->z_sync_cnt = 0;
+ ip->i_generation = 0;
+ ip->i_ino = id;
+ ip->i_mode = (S_IFDIR | S_IRWXUGO);
+ ip->i_uid = SUID_TO_KUID(0);
+ ip->i_gid = SGID_TO_KGID(0);
+ ip->i_blkbits = SPA_MINBLOCKSHIFT;
+ ip->i_atime = now;
+ ip->i_mtime = now;
+ ip->i_ctime = now;
+ ip->i_fop = fops;
+ ip->i_op = ops;
+#if defined(IOP_XATTR)
+ ip->i_opflags &= ~IOP_XATTR;
+#endif
+
+ if (insert_inode_locked(ip)) {
+ unlock_new_inode(ip);
+ iput(ip);
+ return (NULL);
+ }
+
+ mutex_enter(&zfsvfs->z_znodes_lock);
+ list_insert_tail(&zfsvfs->z_all_znodes, zp);
+ zfsvfs->z_nr_znodes++;
+ membar_producer();
+ mutex_exit(&zfsvfs->z_znodes_lock);
+
+ unlock_new_inode(ip);
+
+ return (ip);
+}
+
+/*
+ * Lookup the inode with given id, it will be allocated if needed.
+ */
+static struct inode *
+zfsctl_inode_lookup(zfsvfs_t *zfsvfs, uint64_t id,
+ const struct file_operations *fops, const struct inode_operations *ops)
+{
+ struct inode *ip = NULL;
+
+ while (ip == NULL) {
+ ip = ilookup(zfsvfs->z_sb, (unsigned long)id);
+ if (ip)
+ break;
+
+ /* May fail due to concurrent zfsctl_inode_alloc() */
+ ip = zfsctl_inode_alloc(zfsvfs, id, fops, ops);
+ }
+
+ return (ip);
+}
+
+/*
+ * Create the '.zfs' directory. This directory is cached as part of the VFS
+ * structure. This results in a hold on the zfsvfs_t. The code in zfs_umount()
+ * therefore checks against a vfs_count of 2 instead of 1. This reference
+ * is removed when the ctldir is destroyed in the unmount. All other entities
+ * under the '.zfs' directory are created dynamically as needed.
+ *
+ * Because the dynamically created '.zfs' directory entries assume the use
+ * of 64-bit inode numbers, this support must be disabled on 32-bit systems.
+ */
+int
+zfsctl_create(zfsvfs_t *zfsvfs)
+{
+ ASSERT(zfsvfs->z_ctldir == NULL);
+
+ zfsvfs->z_ctldir = zfsctl_inode_alloc(zfsvfs, ZFSCTL_INO_ROOT,
+ &zpl_fops_root, &zpl_ops_root);
+ if (zfsvfs->z_ctldir == NULL)
+ return (SET_ERROR(ENOENT));
+
+ return (0);
+}
+
+/*
+ * Destroy the '.zfs' directory or remove a snapshot from zfs_snapshots_by_name.
+ * Only called when the filesystem is unmounted.
+ */
+void
+zfsctl_destroy(zfsvfs_t *zfsvfs)
+{
+ if (zfsvfs->z_issnap) {
+ zfs_snapentry_t *se;
+ spa_t *spa = zfsvfs->z_os->os_spa;
+ uint64_t objsetid = dmu_objset_id(zfsvfs->z_os);
+
+ rw_enter(&zfs_snapshot_lock, RW_WRITER);
+ se = zfsctl_snapshot_find_by_objsetid(spa, objsetid);
+ if (se != NULL)
+ zfsctl_snapshot_remove(se);
+ rw_exit(&zfs_snapshot_lock);
+ if (se != NULL) {
+ zfsctl_snapshot_unmount_cancel(se);
+ zfsctl_snapshot_rele(se);
+ }
+ } else if (zfsvfs->z_ctldir) {
+ iput(zfsvfs->z_ctldir);
+ zfsvfs->z_ctldir = NULL;
+ }
+}
+
+/*
+ * Given a root znode, retrieve the associated .zfs directory.
+ * Add a hold to the vnode and return it.
+ */
+struct inode *
+zfsctl_root(znode_t *zp)
+{
+ ASSERT(zfs_has_ctldir(zp));
+ igrab(ZTOZSB(zp)->z_ctldir);
+ return (ZTOZSB(zp)->z_ctldir);
+}
+
+/*
+ * Generate a long fid to indicate a snapdir. We encode whether the snapdir
+ * is already mounted in the gen field. We do this because an nfsd lookup
+ * will not trigger the automount. The next time nfsd does fh_to_dentry, we
+ * will notice this, perform the automount, and return ESTALE to force nfsd
+ * to revalidate and follow the mount.
+ */
+static int
+zfsctl_snapdir_fid(struct inode *ip, fid_t *fidp)
+{
+ zfid_short_t *zfid = (zfid_short_t *)fidp;
+ zfid_long_t *zlfid = (zfid_long_t *)fidp;
+ uint32_t gen = 0;
+ uint64_t object;
+ uint64_t objsetid;
+ int i;
+ struct dentry *dentry;
+
+ if (fidp->fid_len < LONG_FID_LEN) {
+ fidp->fid_len = LONG_FID_LEN;
+ return (SET_ERROR(ENOSPC));
+ }
+
+ object = ip->i_ino;
+ objsetid = ZFSCTL_INO_SNAPDIRS - ip->i_ino;
+ zfid->zf_len = LONG_FID_LEN;
+
+ dentry = d_obtain_alias(igrab(ip));
+ if (!IS_ERR(dentry)) {
+ gen = !!d_mountpoint(dentry);
+ dput(dentry);
+ }
+
+ for (i = 0; i < sizeof (zfid->zf_object); i++)
+ zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
+
+ for (i = 0; i < sizeof (zfid->zf_gen); i++)
+ zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i));
+
+ for (i = 0; i < sizeof (zlfid->zf_setid); i++)
+ zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i));
+
+ for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
+ zlfid->zf_setgen[i] = 0;
+
+ return (0);
+}
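+
+/*
+ * For reference, the long fid produced above is laid out as follows:
+ * zf_object holds the snapdir entry's inode number, zf_gen is 1 when the
+ * snapdir is already a mountpoint and 0 otherwise, zf_setid holds the
+ * snapshot's objsetid, and zf_setgen is always 0. zfsctl_snapdir_vget()
+ * compares the gen bit against the current mount state and returns ENOENT
+ * on a mismatch so that nfsd revalidates.
+ */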
+
+/*
+ * Generate an appropriate fid for an entry in the .zfs directory.
+ */
+int
+zfsctl_fid(struct inode *ip, fid_t *fidp)
+{
+ znode_t *zp = ITOZ(ip);
+ zfsvfs_t *zfsvfs = ITOZSB(ip);
+ uint64_t object = zp->z_id;
+ zfid_short_t *zfid;
+ int i;
+
+ ZFS_ENTER(zfsvfs);
+
+ if (zfsctl_is_snapdir(ip)) {
+ ZFS_EXIT(zfsvfs);
+ return (zfsctl_snapdir_fid(ip, fidp));
+ }
+
+ if (fidp->fid_len < SHORT_FID_LEN) {
+ fidp->fid_len = SHORT_FID_LEN;
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(ENOSPC));
+ }
+
+ zfid = (zfid_short_t *)fidp;
+
+ zfid->zf_len = SHORT_FID_LEN;
+
+ for (i = 0; i < sizeof (zfid->zf_object); i++)
+ zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
+
+ /* .zfs znodes always have a generation number of 0 */
+ for (i = 0; i < sizeof (zfid->zf_gen); i++)
+ zfid->zf_gen[i] = 0;
+
+ ZFS_EXIT(zfsvfs);
+ return (0);
+}
+
+/*
+ * Construct a full dataset name in full_name: "pool/dataset@snap_name"
+ */
+static int
+zfsctl_snapshot_name(zfsvfs_t *zfsvfs, const char *snap_name, int len,
+ char *full_name)
+{
+ objset_t *os = zfsvfs->z_os;
+
+ if (zfs_component_namecheck(snap_name, NULL, NULL) != 0)
+ return (SET_ERROR(EILSEQ));
+
+ dmu_objset_name(os, full_name);
+ if ((strlen(full_name) + 1 + strlen(snap_name)) >= len)
+ return (SET_ERROR(ENAMETOOLONG));
+
+ (void) strcat(full_name, "@");
+ (void) strcat(full_name, snap_name);
+
+ return (0);
+}
+
+/*
+ * Returns full path in full_path: "/pool/dataset/.zfs/snapshot/snap_name/"
+ */
+static int
+zfsctl_snapshot_path_objset(zfsvfs_t *zfsvfs, uint64_t objsetid,
+ int path_len, char *full_path)
+{
+ objset_t *os = zfsvfs->z_os;
+ fstrans_cookie_t cookie;
+ char *snapname;
+ boolean_t case_conflict;
+ uint64_t id, pos = 0;
+ int error = 0;
+
+ if (zfsvfs->z_vfs->vfs_mntpoint == NULL)
+ return (SET_ERROR(ENOENT));
+
+ cookie = spl_fstrans_mark();
+ snapname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
+
+ while (error == 0) {
+ dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
+ error = dmu_snapshot_list_next(zfsvfs->z_os,
+ ZFS_MAX_DATASET_NAME_LEN, snapname, &id, &pos,
+ &case_conflict);
+ dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
+ if (error)
+ goto out;
+
+ if (id == objsetid)
+ break;
+ }
+
+ snprintf(full_path, path_len, "%s/.zfs/snapshot/%s",
+ zfsvfs->z_vfs->vfs_mntpoint, snapname);
+out:
+ kmem_free(snapname, ZFS_MAX_DATASET_NAME_LEN);
+ spl_fstrans_unmark(cookie);
+
+ return (error);
+}
+
+/*
+ * Special case the handling of "..".
+ */
+int
+zfsctl_root_lookup(struct inode *dip, const char *name, struct inode **ipp,
+ int flags, cred_t *cr, int *direntflags, pathname_t *realpnp)
+{
+ zfsvfs_t *zfsvfs = ITOZSB(dip);
+ int error = 0;
+
+ ZFS_ENTER(zfsvfs);
+
+ if (strcmp(name, "..") == 0) {
+ *ipp = dip->i_sb->s_root->d_inode;
+ } else if (strcmp(name, ZFS_SNAPDIR_NAME) == 0) {
+ *ipp = zfsctl_inode_lookup(zfsvfs, ZFSCTL_INO_SNAPDIR,
+ &zpl_fops_snapdir, &zpl_ops_snapdir);
+ } else if (strcmp(name, ZFS_SHAREDIR_NAME) == 0) {
+ *ipp = zfsctl_inode_lookup(zfsvfs, ZFSCTL_INO_SHARES,
+ &zpl_fops_shares, &zpl_ops_shares);
+ } else {
+ *ipp = NULL;
+ }
+
+ if (*ipp == NULL)
+ error = SET_ERROR(ENOENT);
+
+ ZFS_EXIT(zfsvfs);
+
+ return (error);
+}
+
+/*
+ * Lookup entry point for the 'snapshot' directory. Try to open the
+ * snapshot if it exists, creating the pseudo filesystem inode as necessary.
+ */
+int
+zfsctl_snapdir_lookup(struct inode *dip, const char *name, struct inode **ipp,
+ int flags, cred_t *cr, int *direntflags, pathname_t *realpnp)
+{
+ zfsvfs_t *zfsvfs = ITOZSB(dip);
+ uint64_t id;
+ int error;
+
+ ZFS_ENTER(zfsvfs);
+
+ error = dmu_snapshot_lookup(zfsvfs->z_os, name, &id);
+ if (error) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ *ipp = zfsctl_inode_lookup(zfsvfs, ZFSCTL_INO_SNAPDIRS - id,
+ &simple_dir_operations, &simple_dir_inode_operations);
+ if (*ipp == NULL)
+ error = SET_ERROR(ENOENT);
+
+ ZFS_EXIT(zfsvfs);
+
+ return (error);
+}
+
+/*
+ * Renaming a directory under '.zfs/snapshot' will automatically trigger
+ * a rename of the snapshot to the new given name. The rename is confined
+ * to the '.zfs/snapshot' directory; snapshots cannot be moved elsewhere.
+ */
+int
+zfsctl_snapdir_rename(struct inode *sdip, const char *snm,
+ struct inode *tdip, const char *tnm, cred_t *cr, int flags)
+{
+ zfsvfs_t *zfsvfs = ITOZSB(sdip);
+ char *to, *from, *real, *fsname;
+ int error;
+
+ if (!zfs_admin_snapshot)
+ return (SET_ERROR(EACCES));
+
+ ZFS_ENTER(zfsvfs);
+
+ to = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
+ from = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
+ real = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
+ fsname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
+
+ if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) {
+ error = dmu_snapshot_realname(zfsvfs->z_os, snm, real,
+ ZFS_MAX_DATASET_NAME_LEN, NULL);
+ if (error == 0) {
+ snm = real;
+ } else if (error != ENOTSUP) {
+ goto out;
+ }
+ }
+
+ dmu_objset_name(zfsvfs->z_os, fsname);
+
+ error = zfsctl_snapshot_name(ITOZSB(sdip), snm,
+ ZFS_MAX_DATASET_NAME_LEN, from);
+ if (error == 0)
+ error = zfsctl_snapshot_name(ITOZSB(tdip), tnm,
+ ZFS_MAX_DATASET_NAME_LEN, to);
+ if (error == 0)
+ error = zfs_secpolicy_rename_perms(from, to, cr);
+ if (error != 0)
+ goto out;
+
+ /*
+ * Cannot move snapshots out of the snapdir.
+ */
+ if (sdip != tdip) {
+ error = SET_ERROR(EINVAL);
+ goto out;
+ }
+
+ /*
+ * No-op when names are identical.
+ */
+ if (strcmp(snm, tnm) == 0) {
+ error = 0;
+ goto out;
+ }
+
+ rw_enter(&zfs_snapshot_lock, RW_WRITER);
+
+ error = dsl_dataset_rename_snapshot(fsname, snm, tnm, B_FALSE);
+ if (error == 0)
+ (void) zfsctl_snapshot_rename(snm, tnm);
+
+ rw_exit(&zfs_snapshot_lock);
+out:
+ kmem_free(from, ZFS_MAX_DATASET_NAME_LEN);
+ kmem_free(to, ZFS_MAX_DATASET_NAME_LEN);
+ kmem_free(real, ZFS_MAX_DATASET_NAME_LEN);
+ kmem_free(fsname, ZFS_MAX_DATASET_NAME_LEN);
+
+ ZFS_EXIT(zfsvfs);
+
+ return (error);
+}
+
+/*
+ * Removing a directory under '.zfs/snapshot' will automatically trigger
+ * the removal of the snapshot with the given name.
+ */
+int
+zfsctl_snapdir_remove(struct inode *dip, const char *name, cred_t *cr,
+ int flags)
+{
+ zfsvfs_t *zfsvfs = ITOZSB(dip);
+ char *snapname, *real;
+ int error;
+
+ if (!zfs_admin_snapshot)
+ return (SET_ERROR(EACCES));
+
+ ZFS_ENTER(zfsvfs);
+
+ snapname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
+ real = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
+
+ if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) {
+ error = dmu_snapshot_realname(zfsvfs->z_os, name, real,
+ ZFS_MAX_DATASET_NAME_LEN, NULL);
+ if (error == 0) {
+ name = real;
+ } else if (error != ENOTSUP) {
+ goto out;
+ }
+ }
+
+ error = zfsctl_snapshot_name(ITOZSB(dip), name,
+ ZFS_MAX_DATASET_NAME_LEN, snapname);
+ if (error == 0)
+ error = zfs_secpolicy_destroy_perms(snapname, cr);
+ if (error != 0)
+ goto out;
+
+ error = zfsctl_snapshot_unmount(snapname, MNT_FORCE);
+ if ((error == 0) || (error == ENOENT))
+ error = dsl_destroy_snapshot(snapname, B_FALSE);
+out:
+ kmem_free(snapname, ZFS_MAX_DATASET_NAME_LEN);
+ kmem_free(real, ZFS_MAX_DATASET_NAME_LEN);
+
+ ZFS_EXIT(zfsvfs);
+
+ return (error);
+}
+
+/*
+ * Creating a directory under '.zfs/snapshot' will automatically trigger
+ * the creation of a new snapshot with the given name.
+ */
+int
+zfsctl_snapdir_mkdir(struct inode *dip, const char *dirname, vattr_t *vap,
+ struct inode **ipp, cred_t *cr, int flags)
+{
+ zfsvfs_t *zfsvfs = ITOZSB(dip);
+ char *dsname;
+ int error;
+
+ if (!zfs_admin_snapshot)
+ return (SET_ERROR(EACCES));
+
+ dsname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
+
+ if (zfs_component_namecheck(dirname, NULL, NULL) != 0) {
+ error = SET_ERROR(EILSEQ);
+ goto out;
+ }
+
+ dmu_objset_name(zfsvfs->z_os, dsname);
+
+ error = zfs_secpolicy_snapshot_perms(dsname, cr);
+ if (error != 0)
+ goto out;
+
+ if (error == 0) {
+ error = dmu_objset_snapshot_one(dsname, dirname);
+ if (error != 0)
+ goto out;
+
+ error = zfsctl_snapdir_lookup(dip, dirname, ipp,
+ 0, cr, NULL, NULL);
+ }
+out:
+ kmem_free(dsname, ZFS_MAX_DATASET_NAME_LEN);
+
+ return (error);
+}
+
+/*
+ * Flush everything out of the kernel's export table and such.
+ * This is needed as once the snapshot is used over NFS, its
+ * entries in the svc_export and svc_expkey caches hold a reference
+ * to the snapshot mount point. There is no known way of flushing
+ * only the entries related to the snapshot.
+ */
+static void
+exportfs_flush(void)
+{
+ char *argv[] = { "/usr/sbin/exportfs", "-f", NULL };
+ char *envp[] = { NULL };
+
+ (void) call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
+}
+
+/*
+ * Attempt to unmount a snapshot by making a call to user space.
+ * There is no assurance that this can or will succeed; it is just a
+ * best effort. In the case where it does fail, perhaps because
+ * it's in use, the unmount will fail harmlessly.
+ */
+int
+zfsctl_snapshot_unmount(const char *snapname, int flags)
+{
+ char *argv[] = { "/usr/bin/env", "umount", "-t", "zfs", "-n", NULL,
+ NULL };
+ char *envp[] = { NULL };
+ zfs_snapentry_t *se;
+ int error;
+
+ rw_enter(&zfs_snapshot_lock, RW_READER);
+ if ((se = zfsctl_snapshot_find_by_name(snapname)) == NULL) {
+ rw_exit(&zfs_snapshot_lock);
+ return (SET_ERROR(ENOENT));
+ }
+ rw_exit(&zfs_snapshot_lock);
+
+ exportfs_flush();
+
+ if (flags & MNT_FORCE)
+ argv[4] = "-fn";
+ argv[5] = se->se_path;
+ dprintf("unmount; path=%s\n", se->se_path);
+ error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
+ zfsctl_snapshot_rele(se);
+
+
+ /*
+ * The umount system utility will return 256 on error. We must
+	 * assume this error is because the file system is busy, so it is
+ * converted to the more sensible EBUSY.
+ */
+ if (error)
+ error = SET_ERROR(EBUSY);
+
+ return (error);
+}
+
+int
+zfsctl_snapshot_mount(struct path *path, int flags)
+{
+ struct dentry *dentry = path->dentry;
+ struct inode *ip = dentry->d_inode;
+ zfsvfs_t *zfsvfs;
+ zfsvfs_t *snap_zfsvfs;
+ zfs_snapentry_t *se;
+ char *full_name, *full_path;
+ char *argv[] = { "/usr/bin/env", "mount", "-t", "zfs", "-n", NULL, NULL,
+ NULL };
+ char *envp[] = { NULL };
+ int error;
+ struct path spath;
+
+ if (ip == NULL)
+ return (SET_ERROR(EISDIR));
+
+ zfsvfs = ITOZSB(ip);
+ ZFS_ENTER(zfsvfs);
+
+ full_name = kmem_zalloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
+ full_path = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
+
+ error = zfsctl_snapshot_name(zfsvfs, dname(dentry),
+ ZFS_MAX_DATASET_NAME_LEN, full_name);
+ if (error)
+ goto error;
+
+ /*
+ * Construct a mount point path from sb of the ctldir inode and dirent
+ * name, instead of from d_path(), so that chroot'd process doesn't fail
+ * on mount.zfs(8).
+ */
+ snprintf(full_path, MAXPATHLEN, "%s/.zfs/snapshot/%s",
+ zfsvfs->z_vfs->vfs_mntpoint ? zfsvfs->z_vfs->vfs_mntpoint : "",
+ dname(dentry));
+
+ /*
+ * Multiple concurrent automounts of a snapshot are never allowed.
+ * The snapshot may be manually mounted as many times as desired.
+ */
+ if (zfsctl_snapshot_ismounted(full_name)) {
+ error = 0;
+ goto error;
+ }
+
+ /*
+ * Attempt to mount the snapshot from user space. Normally this
+ * would be done using the vfs_kern_mount() function, however that
+	 * function is marked GPL-only and cannot be used. On error we are
+ * careful to log the real error to the console and return EISDIR
+ * to safely abort the automount. This should be very rare.
+ *
+ * If the user mode helper happens to return EBUSY, a concurrent
+ * mount is already in progress in which case the error is ignored.
+ * Take note that if the program was executed successfully the return
+ * value from call_usermodehelper() will be (exitcode << 8 + signal).
+ */
+ dprintf("mount; name=%s path=%s\n", full_name, full_path);
+ argv[5] = full_name;
+ argv[6] = full_path;
+ error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
+ if (error) {
+ if (!(error & MOUNT_BUSY << 8)) {
+ zfs_dbgmsg("Unable to automount %s error=%d",
+ full_path, error);
+ error = SET_ERROR(EISDIR);
+ } else {
+ /*
+ * EBUSY, this could mean a concurrent mount, or the
+			 * snapshot has already been mounted at a completely
+			 * different place. We return 0 so the VFS will retry. In
+			 * the latter case the VFS will retry several times
+			 * and return ELOOP, which is probably not very good
+			 * behavior.
+ */
+ error = 0;
+ }
+ goto error;
+ }
+
+ /*
+ * Follow down in to the mounted snapshot and set MNT_SHRINKABLE
+ * to identify this as an automounted filesystem.
+ */
+ spath = *path;
+ path_get(&spath);
+ if (follow_down_one(&spath)) {
+ snap_zfsvfs = ITOZSB(spath.dentry->d_inode);
+ snap_zfsvfs->z_parent = zfsvfs;
+ dentry = spath.dentry;
+ spath.mnt->mnt_flags |= MNT_SHRINKABLE;
+
+ rw_enter(&zfs_snapshot_lock, RW_WRITER);
+ se = zfsctl_snapshot_alloc(full_name, full_path,
+ snap_zfsvfs->z_os->os_spa, dmu_objset_id(snap_zfsvfs->z_os),
+ dentry);
+ zfsctl_snapshot_add(se);
+ zfsctl_snapshot_unmount_delay_impl(se, zfs_expire_snapshot);
+ rw_exit(&zfs_snapshot_lock);
+ }
+ path_put(&spath);
+error:
+ kmem_free(full_name, ZFS_MAX_DATASET_NAME_LEN);
+ kmem_free(full_path, MAXPATHLEN);
+
+ ZFS_EXIT(zfsvfs);
+
+ return (error);
+}
+
+/*
+ * Get the snapdir inode from fid
+ */
+int
+zfsctl_snapdir_vget(struct super_block *sb, uint64_t objsetid, int gen,
+ struct inode **ipp)
+{
+ int error;
+ struct path path;
+ char *mnt;
+ struct dentry *dentry;
+
+ mnt = kmem_alloc(MAXPATHLEN, KM_SLEEP);
+
+ error = zfsctl_snapshot_path_objset(sb->s_fs_info, objsetid,
+ MAXPATHLEN, mnt);
+ if (error)
+ goto out;
+
+ /* Trigger automount */
+ error = -kern_path(mnt, LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &path);
+ if (error)
+ goto out;
+
+ path_put(&path);
+ /*
+ * Get the snapdir inode. Note, we don't want to use the above
+ * path because it contains the root of the snapshot rather
+ * than the snapdir.
+ */
+ *ipp = ilookup(sb, ZFSCTL_INO_SNAPDIRS - objsetid);
+ if (*ipp == NULL) {
+ error = SET_ERROR(ENOENT);
+ goto out;
+ }
+
+ /* check gen, see zfsctl_snapdir_fid */
+ dentry = d_obtain_alias(igrab(*ipp));
+ if (gen != (!IS_ERR(dentry) && d_mountpoint(dentry))) {
+ iput(*ipp);
+ *ipp = NULL;
+ error = SET_ERROR(ENOENT);
+ }
+ if (!IS_ERR(dentry))
+ dput(dentry);
+out:
+ kmem_free(mnt, MAXPATHLEN);
+ return (error);
+}
+
+int
+zfsctl_shares_lookup(struct inode *dip, char *name, struct inode **ipp,
+ int flags, cred_t *cr, int *direntflags, pathname_t *realpnp)
+{
+ zfsvfs_t *zfsvfs = ITOZSB(dip);
+ znode_t *zp;
+ znode_t *dzp;
+ int error;
+
+ ZFS_ENTER(zfsvfs);
+
+ if (zfsvfs->z_shares_dir == 0) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(ENOTSUP));
+ }
+
+ if ((error = zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp)) == 0) {
+ error = zfs_lookup(dzp, name, &zp, 0, cr, NULL, NULL);
+ zrele(dzp);
+ }
+
+ ZFS_EXIT(zfsvfs);
+
+ return (error);
+}
+
+/*
+ * Initialize the various pieces we'll need to create and manipulate .zfs
+ * directories. Currently this is unused but available.
+ */
+void
+zfsctl_init(void)
+{
+ avl_create(&zfs_snapshots_by_name, snapentry_compare_by_name,
+ sizeof (zfs_snapentry_t), offsetof(zfs_snapentry_t,
+ se_node_name));
+ avl_create(&zfs_snapshots_by_objsetid, snapentry_compare_by_objsetid,
+ sizeof (zfs_snapentry_t), offsetof(zfs_snapentry_t,
+ se_node_objsetid));
+ rw_init(&zfs_snapshot_lock, NULL, RW_DEFAULT, NULL);
+}
+
+/*
+ * Cleanup the various pieces we needed for .zfs directories. In particular
+ * ensure the expiry timer is canceled safely.
+ */
+void
+zfsctl_fini(void)
+{
+ avl_destroy(&zfs_snapshots_by_name);
+ avl_destroy(&zfs_snapshots_by_objsetid);
+ rw_destroy(&zfs_snapshot_lock);
+}
+
+module_param(zfs_admin_snapshot, int, 0644);
+MODULE_PARM_DESC(zfs_admin_snapshot, "Enable mkdir/rmdir/mv in .zfs/snapshot");
+
+module_param(zfs_expire_snapshot, int, 0644);
+MODULE_PARM_DESC(zfs_expire_snapshot, "Seconds to expire .zfs/snapshot");
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_debug.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_debug.c
new file mode 100644
index 000000000000..8d7f04097da8
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_debug.c
@@ -0,0 +1,255 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/trace_zfs.h>
+
+typedef struct zfs_dbgmsg {
+ procfs_list_node_t zdm_node;
+ uint64_t zdm_timestamp;
+ int zdm_size;
+ char zdm_msg[1]; /* variable length allocation */
+} zfs_dbgmsg_t;
+
+procfs_list_t zfs_dbgmsgs;
+int zfs_dbgmsg_size = 0;
+int zfs_dbgmsg_maxsize = 4<<20; /* 4MB */
+
+/*
+ * Internal ZFS debug messages are enabled by default.
+ *
+ * # Print debug messages
+ * cat /proc/spl/kstat/zfs/dbgmsg
+ *
+ * # Disable the kernel debug message log.
+ * echo 0 > /sys/module/zfs/parameters/zfs_dbgmsg_enable
+ *
+ * # Clear the kernel debug message log.
+ * echo 0 >/proc/spl/kstat/zfs/dbgmsg
+ */
+int zfs_dbgmsg_enable = 1;
+
+static int
+zfs_dbgmsg_show_header(struct seq_file *f)
+{
+ seq_printf(f, "%-12s %-8s\n", "timestamp", "message");
+ return (0);
+}
+
+static int
+zfs_dbgmsg_show(struct seq_file *f, void *p)
+{
+ zfs_dbgmsg_t *zdm = (zfs_dbgmsg_t *)p;
+ seq_printf(f, "%-12llu %-s\n",
+ (u_longlong_t)zdm->zdm_timestamp, zdm->zdm_msg);
+ return (0);
+}
+
+static void
+zfs_dbgmsg_purge(int max_size)
+{
+ while (zfs_dbgmsg_size > max_size) {
+ zfs_dbgmsg_t *zdm = list_remove_head(&zfs_dbgmsgs.pl_list);
+ if (zdm == NULL)
+ return;
+
+ int size = zdm->zdm_size;
+ kmem_free(zdm, size);
+ zfs_dbgmsg_size -= size;
+ }
+}
+
+static int
+zfs_dbgmsg_clear(procfs_list_t *procfs_list)
+{
+ mutex_enter(&zfs_dbgmsgs.pl_lock);
+ zfs_dbgmsg_purge(0);
+ mutex_exit(&zfs_dbgmsgs.pl_lock);
+ return (0);
+}
+
+void
+zfs_dbgmsg_init(void)
+{
+ procfs_list_install("zfs",
+ NULL,
+ "dbgmsg",
+ 0600,
+ &zfs_dbgmsgs,
+ zfs_dbgmsg_show,
+ zfs_dbgmsg_show_header,
+ zfs_dbgmsg_clear,
+ offsetof(zfs_dbgmsg_t, zdm_node));
+}
+
+void
+zfs_dbgmsg_fini(void)
+{
+ procfs_list_uninstall(&zfs_dbgmsgs);
+ zfs_dbgmsg_purge(0);
+
+ /*
+ * TODO - decide how to make this permanent
+ */
+#ifdef _KERNEL
+ procfs_list_destroy(&zfs_dbgmsgs);
+#endif
+}
+
+void
+__set_error(const char *file, const char *func, int line, int err)
+{
+ /*
+ * To enable this:
+ *
+ * $ echo 512 >/sys/module/zfs/parameters/zfs_flags
+ */
+ if (zfs_flags & ZFS_DEBUG_SET_ERROR)
+		__dprintf(B_FALSE, file, func, line, "error %d", err);
+}
+
+void
+__zfs_dbgmsg(char *buf)
+{
+ int size = sizeof (zfs_dbgmsg_t) + strlen(buf);
+ zfs_dbgmsg_t *zdm = kmem_zalloc(size, KM_SLEEP);
+ zdm->zdm_size = size;
+ zdm->zdm_timestamp = gethrestime_sec();
+ strcpy(zdm->zdm_msg, buf);
+
+ mutex_enter(&zfs_dbgmsgs.pl_lock);
+ procfs_list_add(&zfs_dbgmsgs, zdm);
+ zfs_dbgmsg_size += size;
+ zfs_dbgmsg_purge(MAX(zfs_dbgmsg_maxsize, 0));
+ mutex_exit(&zfs_dbgmsgs.pl_lock);
+}
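+
+/*
+ * Illustrative usage (a sketch, not an exhaustive reference): callers
+ * normally reach __zfs_dbgmsg() through the zfs_dbgmsg() and, in debug
+ * builds, dprintf() macros, which format into a buffer via __dprintf():
+ *
+ *    zfs_dbgmsg("async zfs_unlinked_drain dispatch failed");
+ *    dprintf("freeing object %llu\n", (u_longlong_t)obj);
+ *
+ * 'obj' above is a hypothetical variable; the resulting text is readable
+ * from /proc/spl/kstat/zfs/dbgmsg as described at the top of this file.
+ */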
+
+#ifdef _KERNEL
+
+void
+__dprintf(boolean_t dprint, const char *file, const char *func,
+ int line, const char *fmt, ...)
+{
+ const char *newfile;
+ va_list adx;
+ size_t size;
+ char *buf;
+ char *nl;
+ int i;
+ char *prefix = (dprint) ? "dprintf: " : "";
+
+ size = 1024;
+ buf = kmem_alloc(size, KM_SLEEP);
+
+ /*
+ * Get rid of annoying prefix to filename.
+ */
+ newfile = strrchr(file, '/');
+ if (newfile != NULL) {
+ newfile = newfile + 1; /* Get rid of leading / */
+ } else {
+ newfile = file;
+ }
+
+ i = snprintf(buf, size, "%s%s:%d:%s(): ", prefix, newfile, line, func);
+
+ if (i < size) {
+ va_start(adx, fmt);
+ (void) vsnprintf(buf + i, size - i, fmt, adx);
+ va_end(adx);
+ }
+
+ /*
+ * Get rid of trailing newline for dprintf logs.
+ */
+ if (dprint && buf[0] != '\0') {
+ nl = &buf[strlen(buf) - 1];
+ if (*nl == '\n')
+ *nl = '\0';
+ }
+
+ /*
+ * To get this data enable the zfs__dprintf trace point as shown:
+ *
+ * # Enable zfs__dprintf tracepoint, clear the tracepoint ring buffer
+ * $ echo 1 > /sys/kernel/debug/tracing/events/zfs/enable
+ * $ echo 0 > /sys/kernel/debug/tracing/trace
+ *
+ * # Dump the ring buffer.
+ * $ cat /sys/kernel/debug/tracing/trace
+ */
+ DTRACE_PROBE1(zfs__dprintf, char *, buf);
+
+ /*
+ * To get this data:
+ *
+ * $ cat /proc/spl/kstat/zfs/dbgmsg
+ *
+ * To clear the buffer:
+ * $ echo 0 > /proc/spl/kstat/zfs/dbgmsg
+ */
+ __zfs_dbgmsg(buf);
+
+ kmem_free(buf, size);
+}
+
+#else
+
+void
+zfs_dbgmsg_print(const char *tag)
+{
+ ssize_t ret __attribute__((unused));
+
+ /*
+ * We use write() in this function instead of printf()
+ * so it is safe to call from a signal handler.
+ */
+ ret = write(STDOUT_FILENO, "ZFS_DBGMSG(", 11);
+ ret = write(STDOUT_FILENO, tag, strlen(tag));
+ ret = write(STDOUT_FILENO, ") START:\n", 9);
+
+ mutex_enter(&zfs_dbgmsgs.pl_lock);
+ for (zfs_dbgmsg_t *zdm = list_head(&zfs_dbgmsgs.pl_list); zdm != NULL;
+ zdm = list_next(&zfs_dbgmsgs.pl_list, zdm)) {
+ ret = write(STDOUT_FILENO, zdm->zdm_msg,
+ strlen(zdm->zdm_msg));
+ ret = write(STDOUT_FILENO, "\n", 1);
+ }
+
+ ret = write(STDOUT_FILENO, "ZFS_DBGMSG(", 11);
+ ret = write(STDOUT_FILENO, tag, strlen(tag));
+ ret = write(STDOUT_FILENO, ") END\n", 6);
+
+ mutex_exit(&zfs_dbgmsgs.pl_lock);
+}
+#endif /* _KERNEL */
+
+#ifdef _KERNEL
+module_param(zfs_dbgmsg_enable, int, 0644);
+MODULE_PARM_DESC(zfs_dbgmsg_enable, "Enable ZFS debug message log");
+
+module_param(zfs_dbgmsg_maxsize, int, 0644);
+MODULE_PARM_DESC(zfs_dbgmsg_maxsize, "Maximum ZFS debug log size");
+#endif
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_dir.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_dir.c
new file mode 100644
index 000000000000..207a51d75bc9
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_dir.c
@@ -0,0 +1,1225 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013, 2016 by Delphix. All rights reserved.
+ * Copyright 2017 Nexenta Systems, Inc.
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/time.h>
+#include <sys/sysmacros.h>
+#include <sys/vfs.h>
+#include <sys/vnode.h>
+#include <sys/file.h>
+#include <sys/kmem.h>
+#include <sys/uio.h>
+#include <sys/pathname.h>
+#include <sys/cmn_err.h>
+#include <sys/errno.h>
+#include <sys/stat.h>
+#include <sys/sunddi.h>
+#include <sys/random.h>
+#include <sys/policy.h>
+#include <sys/zfs_dir.h>
+#include <sys/zfs_acl.h>
+#include <sys/zfs_vnops.h>
+#include <sys/fs/zfs.h>
+#include <sys/zap.h>
+#include <sys/dmu.h>
+#include <sys/atomic.h>
+#include <sys/zfs_ctldir.h>
+#include <sys/zfs_fuid.h>
+#include <sys/sa.h>
+#include <sys/zfs_sa.h>
+#include <sys/dmu_objset.h>
+#include <sys/dsl_dir.h>
+
+/*
+ * zfs_match_find() is used by zfs_dirent_lock() to perform zap lookups
+ * of names after deciding which is the appropriate lookup interface.
+ */
+static int
+zfs_match_find(zfsvfs_t *zfsvfs, znode_t *dzp, const char *name,
+ matchtype_t mt, boolean_t update, int *deflags, pathname_t *rpnp,
+ uint64_t *zoid)
+{
+ boolean_t conflict = B_FALSE;
+ int error;
+
+ if (zfsvfs->z_norm) {
+ size_t bufsz = 0;
+ char *buf = NULL;
+
+ if (rpnp) {
+ buf = rpnp->pn_buf;
+ bufsz = rpnp->pn_bufsize;
+ }
+
+ /*
+ * In the non-mixed case we only expect there would ever
+ * be one match, but we need to use the normalizing lookup.
+ */
+ error = zap_lookup_norm(zfsvfs->z_os, dzp->z_id, name, 8, 1,
+ zoid, mt, buf, bufsz, &conflict);
+ } else {
+ error = zap_lookup(zfsvfs->z_os, dzp->z_id, name, 8, 1, zoid);
+ }
+
+ /*
+ * Allow multiple entries provided the first entry is
+ * the object id. Non-zpl consumers may safely make
+ * use of the additional space.
+ *
+ * XXX: This should be a feature flag for compatibility
+ */
+ if (error == EOVERFLOW)
+ error = 0;
+
+ if (zfsvfs->z_norm && !error && deflags)
+ *deflags = conflict ? ED_CASE_CONFLICT : 0;
+
+ *zoid = ZFS_DIRENT_OBJ(*zoid);
+
+ return (error);
+}
+
+/*
+ * Lock a directory entry. A dirlock on <dzp, name> protects that name
+ * in dzp's directory zap object. As long as you hold a dirlock, you can
+ * assume two things: (1) dzp cannot be reaped, and (2) no other thread
+ * can change the zap entry for (i.e. link or unlink) this name.
+ *
+ * Input arguments:
+ * dzp - znode for directory
+ * name - name of entry to lock
+ * flag - ZNEW: if the entry already exists, fail with EEXIST.
+ * ZEXISTS: if the entry does not exist, fail with ENOENT.
+ * ZSHARED: allow concurrent access with other ZSHARED callers.
+ * ZXATTR: we want dzp's xattr directory
+ * ZCILOOK: On a mixed sensitivity file system,
+ * this lookup should be case-insensitive.
+ * ZCIEXACT: On a purely case-insensitive file system,
+ * this lookup should be case-sensitive.
+ * ZRENAMING: we are locking for renaming, force narrow locks
+ * ZHAVELOCK: Don't grab the z_name_lock for this call. The
+ * current thread already holds it.
+ *
+ * Output arguments:
+ * zpp - pointer to the znode for the entry (NULL if there isn't one)
+ * dlpp - pointer to the dirlock for this entry (NULL on error)
+ * direntflags - (case-insensitive lookup only)
+ * flags if multiple case-sensitive matches exist in directory
+ * realpnp - (case-insensitive lookup only)
+ * actual name matched within the directory
+ *
+ * Return value: 0 on success or errno on failure.
+ *
+ * NOTE: Always checks for, and rejects, '.' and '..'.
+ * NOTE: For case-insensitive file systems we take wide locks (see below),
+ * but return znode pointers to a single match.
+ */
+int
+zfs_dirent_lock(zfs_dirlock_t **dlpp, znode_t *dzp, char *name,
+ znode_t **zpp, int flag, int *direntflags, pathname_t *realpnp)
+{
+ zfsvfs_t *zfsvfs = ZTOZSB(dzp);
+ zfs_dirlock_t *dl;
+ boolean_t update;
+ matchtype_t mt = 0;
+ uint64_t zoid;
+ int error = 0;
+ int cmpflags;
+
+ *zpp = NULL;
+ *dlpp = NULL;
+
+ /*
+ * Verify that we are not trying to lock '.', '..', or '.zfs'
+ */
+ if ((name[0] == '.' &&
+ (name[1] == '\0' || (name[1] == '.' && name[2] == '\0'))) ||
+ (zfs_has_ctldir(dzp) && strcmp(name, ZFS_CTLDIR_NAME) == 0))
+ return (SET_ERROR(EEXIST));
+
+ /*
+ * Case sensitivity and normalization preferences are set when
+ * the file system is created. These are stored in the
+ * zfsvfs->z_case and zfsvfs->z_norm fields. These choices
+ * affect what vnodes can be cached in the DNLC, how we
+ * perform zap lookups, and the "width" of our dirlocks.
+ *
+ * A normal dirlock locks a single name. Note that with
+ * normalization a name can be composed multiple ways, but
+ * when normalized, these names all compare equal. A wide
+ * dirlock locks multiple names. We need these when the file
+ * system is supporting mixed-mode access. It is sometimes
+ * necessary to lock all case permutations of file name at
+ * once so that simultaneous case-insensitive/case-sensitive
+ * behaves as rationally as possible.
+ */
+
+ /*
+ * When matching we may need to normalize & change case according to
+ * FS settings.
+ *
+ * Note that a normalized match is necessary for a case insensitive
+ * filesystem when the lookup request is not exact because normalization
+ * can fold case independent of normalizing code point sequences.
+ *
+ * See the table above zfs_dropname().
+ */
+ if (zfsvfs->z_norm != 0) {
+ mt = MT_NORMALIZE;
+
+ /*
+ * Determine if the match needs to honor the case specified in
+ * lookup, and if so keep track of that so that during
+ * normalization we don't fold case.
+ */
+ if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE &&
+ (flag & ZCIEXACT)) ||
+ (zfsvfs->z_case == ZFS_CASE_MIXED && !(flag & ZCILOOK))) {
+ mt |= MT_MATCH_CASE;
+ }
+ }
+
+ /*
+ * Only look in or update the DNLC if we are looking for the
+ * name on a file system that does not require normalization
+ * or case folding. We can also look there if we happen to be
+ * on a non-normalizing, mixed sensitivity file system IF we
+ * are looking for the exact name.
+ *
+ * Maybe can add TO-UPPERed version of name to dnlc in ci-only
+ * case for performance improvement?
+ */
+ update = !zfsvfs->z_norm ||
+ (zfsvfs->z_case == ZFS_CASE_MIXED &&
+ !(zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER) && !(flag & ZCILOOK));
+
+ /*
+ * ZRENAMING indicates we are in a situation where we should
+ * take narrow locks regardless of the file system's
+ * preferences for normalizing and case folding. This will
+ * prevent us deadlocking trying to grab the same wide lock
+ * twice if the two names happen to be case-insensitive
+ * matches.
+ */
+ if (flag & ZRENAMING)
+ cmpflags = 0;
+ else
+ cmpflags = zfsvfs->z_norm;
+
+ /*
+ * Wait until there are no locks on this name.
+ *
+ * Don't grab the lock if it is already held. However, cannot
+ * have both ZSHARED and ZHAVELOCK together.
+ */
+ ASSERT(!(flag & ZSHARED) || !(flag & ZHAVELOCK));
+ if (!(flag & ZHAVELOCK))
+ rw_enter(&dzp->z_name_lock, RW_READER);
+
+ mutex_enter(&dzp->z_lock);
+ for (;;) {
+ if (dzp->z_unlinked && !(flag & ZXATTR)) {
+ mutex_exit(&dzp->z_lock);
+ if (!(flag & ZHAVELOCK))
+ rw_exit(&dzp->z_name_lock);
+ return (SET_ERROR(ENOENT));
+ }
+ for (dl = dzp->z_dirlocks; dl != NULL; dl = dl->dl_next) {
+ if ((u8_strcmp(name, dl->dl_name, 0, cmpflags,
+ U8_UNICODE_LATEST, &error) == 0) || error != 0)
+ break;
+ }
+ if (error != 0) {
+ mutex_exit(&dzp->z_lock);
+ if (!(flag & ZHAVELOCK))
+ rw_exit(&dzp->z_name_lock);
+ return (SET_ERROR(ENOENT));
+ }
+ if (dl == NULL) {
+ /*
+ * Allocate a new dirlock and add it to the list.
+ */
+ dl = kmem_alloc(sizeof (zfs_dirlock_t), KM_SLEEP);
+ cv_init(&dl->dl_cv, NULL, CV_DEFAULT, NULL);
+ dl->dl_name = name;
+ dl->dl_sharecnt = 0;
+ dl->dl_namelock = 0;
+ dl->dl_namesize = 0;
+ dl->dl_dzp = dzp;
+ dl->dl_next = dzp->z_dirlocks;
+ dzp->z_dirlocks = dl;
+ break;
+ }
+ if ((flag & ZSHARED) && dl->dl_sharecnt != 0)
+ break;
+ cv_wait(&dl->dl_cv, &dzp->z_lock);
+ }
+
+ /*
+	 * If the caller already held the z_name_lock (ZHAVELOCK), record that
+	 * so zfs_dirent_unlock() will not drop it.
+ */
+ if (flag & ZHAVELOCK)
+ dl->dl_namelock = 1;
+
+ if ((flag & ZSHARED) && ++dl->dl_sharecnt > 1 && dl->dl_namesize == 0) {
+ /*
+ * We're the second shared reference to dl. Make a copy of
+ * dl_name in case the first thread goes away before we do.
+ * Note that we initialize the new name before storing its
+ * pointer into dl_name, because the first thread may load
+ * dl->dl_name at any time. It'll either see the old value,
+ * which belongs to it, or the new shared copy; either is OK.
+ */
+ dl->dl_namesize = strlen(dl->dl_name) + 1;
+ name = kmem_alloc(dl->dl_namesize, KM_SLEEP);
+ bcopy(dl->dl_name, name, dl->dl_namesize);
+ dl->dl_name = name;
+ }
+
+ mutex_exit(&dzp->z_lock);
+
+ /*
+ * We have a dirlock on the name. (Note that it is the dirlock,
+ * not the dzp's z_lock, that protects the name in the zap object.)
+ * See if there's an object by this name; if so, put a hold on it.
+ */
+ if (flag & ZXATTR) {
+ error = sa_lookup(dzp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &zoid,
+ sizeof (zoid));
+ if (error == 0)
+ error = (zoid == 0 ? SET_ERROR(ENOENT) : 0);
+ } else {
+ error = zfs_match_find(zfsvfs, dzp, name, mt,
+ update, direntflags, realpnp, &zoid);
+ }
+ if (error) {
+ if (error != ENOENT || (flag & ZEXISTS)) {
+ zfs_dirent_unlock(dl);
+ return (error);
+ }
+ } else {
+ if (flag & ZNEW) {
+ zfs_dirent_unlock(dl);
+ return (SET_ERROR(EEXIST));
+ }
+ error = zfs_zget(zfsvfs, zoid, zpp);
+ if (error) {
+ zfs_dirent_unlock(dl);
+ return (error);
+ }
+ }
+
+ *dlpp = dl;
+
+ return (0);
+}
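+
+/*
+ * Typical calling pattern (an illustrative sketch; it mirrors the use in
+ * zfs_dirlook() below):
+ *
+ *    zfs_dirlock_t *dl;
+ *    znode_t *zp;
+ *
+ *    error = zfs_dirent_lock(&dl, dzp, name, &zp, ZEXISTS | ZSHARED,
+ *        NULL, NULL);
+ *    if (error == 0) {
+ *            ... use zp while the name is protected by dl ...
+ *            zfs_dirent_unlock(dl);
+ *    }
+ */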
+
+/*
+ * Unlock this directory entry and wake anyone who was waiting for it.
+ */
+void
+zfs_dirent_unlock(zfs_dirlock_t *dl)
+{
+ znode_t *dzp = dl->dl_dzp;
+ zfs_dirlock_t **prev_dl, *cur_dl;
+
+ mutex_enter(&dzp->z_lock);
+
+ if (!dl->dl_namelock)
+ rw_exit(&dzp->z_name_lock);
+
+ if (dl->dl_sharecnt > 1) {
+ dl->dl_sharecnt--;
+ mutex_exit(&dzp->z_lock);
+ return;
+ }
+ prev_dl = &dzp->z_dirlocks;
+ while ((cur_dl = *prev_dl) != dl)
+ prev_dl = &cur_dl->dl_next;
+ *prev_dl = dl->dl_next;
+ cv_broadcast(&dl->dl_cv);
+ mutex_exit(&dzp->z_lock);
+
+ if (dl->dl_namesize != 0)
+ kmem_free(dl->dl_name, dl->dl_namesize);
+ cv_destroy(&dl->dl_cv);
+ kmem_free(dl, sizeof (*dl));
+}
+
+/*
+ * Look up an entry in a directory.
+ *
+ * NOTE: '.' and '..' are handled as special cases because
+ * no directory entries are actually stored for them. If this is
+ * the root of a filesystem, then '.zfs' is also treated as a
+ * special pseudo-directory.
+ */
+int
+zfs_dirlook(znode_t *dzp, char *name, znode_t **zpp, int flags,
+ int *deflg, pathname_t *rpnp)
+{
+ zfs_dirlock_t *dl;
+ znode_t *zp;
+ struct inode *ip;
+ int error = 0;
+ uint64_t parent;
+
+ if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) {
+ *zpp = dzp;
+ zhold(*zpp);
+ } else if (name[0] == '.' && name[1] == '.' && name[2] == 0) {
+ zfsvfs_t *zfsvfs = ZTOZSB(dzp);
+
+ /*
+ * If we are a snapshot mounted under .zfs, return
+ * the inode pointer for the snapshot directory.
+ */
+ if ((error = sa_lookup(dzp->z_sa_hdl,
+ SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0)
+ return (error);
+
+ if (parent == dzp->z_id && zfsvfs->z_parent != zfsvfs) {
+ error = zfsctl_root_lookup(zfsvfs->z_parent->z_ctldir,
+ "snapshot", &ip, 0, kcred, NULL, NULL);
+ *zpp = ITOZ(ip);
+ return (error);
+ }
+ rw_enter(&dzp->z_parent_lock, RW_READER);
+ error = zfs_zget(zfsvfs, parent, &zp);
+ if (error == 0)
+ *zpp = zp;
+ rw_exit(&dzp->z_parent_lock);
+ } else if (zfs_has_ctldir(dzp) && strcmp(name, ZFS_CTLDIR_NAME) == 0) {
+ ip = zfsctl_root(dzp);
+ *zpp = ITOZ(ip);
+ } else {
+ int zf;
+
+ zf = ZEXISTS | ZSHARED;
+ if (flags & FIGNORECASE)
+ zf |= ZCILOOK;
+
+ error = zfs_dirent_lock(&dl, dzp, name, &zp, zf, deflg, rpnp);
+ if (error == 0) {
+ *zpp = zp;
+ zfs_dirent_unlock(dl);
+ dzp->z_zn_prefetch = B_TRUE; /* enable prefetching */
+ }
+ rpnp = NULL;
+ }
+
+ if ((flags & FIGNORECASE) && rpnp && !error)
+ (void) strlcpy(rpnp->pn_buf, name, rpnp->pn_bufsize);
+
+ return (error);
+}
+
+/*
+ * unlinked Set (formerly known as the "delete queue") Error Handling
+ *
+ * When dealing with the unlinked set, we dmu_tx_hold_zap(), but we
+ * don't specify the name of the entry that we will be manipulating. We
+ * also fib and say that we won't be adding any new entries to the
+ * unlinked set, even though we might (this is to lower the minimum file
+ * size that can be deleted in a full filesystem). So on the small
+ * chance that the nlink list is using a fat zap (ie. has more than
+ * 2000 entries), we *may* not pre-read a block that's needed.
+ * Therefore it is remotely possible for some of the assertions
+ * regarding the unlinked set below to fail due to i/o error. On a
+ * nondebug system, this will result in the space being leaked.
+ */
+void
+zfs_unlinked_add(znode_t *zp, dmu_tx_t *tx)
+{
+ zfsvfs_t *zfsvfs = ZTOZSB(zp);
+
+ ASSERT(zp->z_unlinked);
+ ASSERT(ZTOI(zp)->i_nlink == 0);
+
+ VERIFY3U(0, ==,
+ zap_add_int(zfsvfs->z_os, zfsvfs->z_unlinkedobj, zp->z_id, tx));
+
+ dataset_kstats_update_nunlinks_kstat(&zfsvfs->z_kstat, 1);
+}
+
+/*
+ * Clean up any znodes that had no links when we either crashed or
+ * (force) umounted the file system.
+ */
+static void
+zfs_unlinked_drain_task(void *arg)
+{
+ zfsvfs_t *zfsvfs = arg;
+ zap_cursor_t zc;
+ zap_attribute_t zap;
+ dmu_object_info_t doi;
+ znode_t *zp;
+ int error;
+
+ ASSERT3B(zfsvfs->z_draining, ==, B_TRUE);
+
+ /*
+ * Iterate over the contents of the unlinked set.
+ */
+ for (zap_cursor_init(&zc, zfsvfs->z_os, zfsvfs->z_unlinkedobj);
+ zap_cursor_retrieve(&zc, &zap) == 0 && !zfsvfs->z_drain_cancel;
+ zap_cursor_advance(&zc)) {
+
+ /*
+ * See what kind of object we have in list
+ */
+
+ error = dmu_object_info(zfsvfs->z_os,
+ zap.za_first_integer, &doi);
+ if (error != 0)
+ continue;
+
+ ASSERT((doi.doi_type == DMU_OT_PLAIN_FILE_CONTENTS) ||
+ (doi.doi_type == DMU_OT_DIRECTORY_CONTENTS));
+ /*
+ * We need to re-mark these list entries for deletion,
+ * so we pull them back into core and set zp->z_unlinked.
+ */
+ error = zfs_zget(zfsvfs, zap.za_first_integer, &zp);
+
+ /*
+ * We may pick up znodes that are already marked for deletion.
+ * This could happen during the purge of an extended attribute
+ * directory. All we need to do is skip over them, since they
+ * are already in the system marked z_unlinked.
+ */
+ if (error != 0)
+ continue;
+
+ zp->z_unlinked = B_TRUE;
+
+ /*
+ * zrele() decrements the znode's ref count and may cause
+ * it to be synchronously freed. We interrupt freeing
+ * of this znode by checking the return value of
+ * dmu_objset_zfs_unmounting() in dmu_free_long_range()
+ * when an unmount is requested.
+ */
+ zrele(zp);
+ ASSERT3B(zfsvfs->z_unmounted, ==, B_FALSE);
+ }
+ zap_cursor_fini(&zc);
+
+ zfsvfs->z_draining = B_FALSE;
+ zfsvfs->z_drain_task = TASKQID_INVALID;
+}
+
+/*
+ * Sets z_draining then tries to dispatch async unlinked drain.
+ * If that fails executes synchronous unlinked drain.
+ */
+void
+zfs_unlinked_drain(zfsvfs_t *zfsvfs)
+{
+ ASSERT3B(zfsvfs->z_unmounted, ==, B_FALSE);
+ ASSERT3B(zfsvfs->z_draining, ==, B_FALSE);
+
+ zfsvfs->z_draining = B_TRUE;
+ zfsvfs->z_drain_cancel = B_FALSE;
+
+ zfsvfs->z_drain_task = taskq_dispatch(
+ dsl_pool_unlinked_drain_taskq(dmu_objset_pool(zfsvfs->z_os)),
+ zfs_unlinked_drain_task, zfsvfs, TQ_SLEEP);
+ if (zfsvfs->z_drain_task == TASKQID_INVALID) {
+ zfs_dbgmsg("async zfs_unlinked_drain dispatch failed");
+ zfs_unlinked_drain_task(zfsvfs);
+ }
+}
+
+/*
+ * Wait for the unlinked drain taskq task to stop. This will interrupt the
+ * unlinked set processing if it is in progress.
+ */
+void
+zfs_unlinked_drain_stop_wait(zfsvfs_t *zfsvfs)
+{
+ ASSERT3B(zfsvfs->z_unmounted, ==, B_FALSE);
+
+ if (zfsvfs->z_draining) {
+ zfsvfs->z_drain_cancel = B_TRUE;
+ taskq_cancel_id(dsl_pool_unlinked_drain_taskq(
+ dmu_objset_pool(zfsvfs->z_os)), zfsvfs->z_drain_task);
+ zfsvfs->z_drain_task = TASKQID_INVALID;
+ zfsvfs->z_draining = B_FALSE;
+ }
+}
+
+/*
+ * Delete the entire contents of a directory. Return a count
+ * of the number of entries that could not be deleted. If we encounter
+ * an error, return a count of at least one so that the directory stays
+ * in the unlinked set.
+ *
+ * NOTE: this function assumes that the directory is inactive,
+ * so there is no need to lock its entries before deletion.
+ * Also, it assumes the directory contains *only* regular
+ * files.
+ */
+static int
+zfs_purgedir(znode_t *dzp)
+{
+ zap_cursor_t zc;
+ zap_attribute_t zap;
+ znode_t *xzp;
+ dmu_tx_t *tx;
+ zfsvfs_t *zfsvfs = ZTOZSB(dzp);
+ zfs_dirlock_t dl;
+ int skipped = 0;
+ int error;
+
+ for (zap_cursor_init(&zc, zfsvfs->z_os, dzp->z_id);
+ (error = zap_cursor_retrieve(&zc, &zap)) == 0;
+ zap_cursor_advance(&zc)) {
+ error = zfs_zget(zfsvfs,
+ ZFS_DIRENT_OBJ(zap.za_first_integer), &xzp);
+ if (error) {
+ skipped += 1;
+ continue;
+ }
+
+ ASSERT(S_ISREG(ZTOI(xzp)->i_mode) ||
+ S_ISLNK(ZTOI(xzp)->i_mode));
+
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
+ dmu_tx_hold_zap(tx, dzp->z_id, FALSE, zap.za_name);
+ dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
+ dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
+ /* Is this really needed ? */
+ zfs_sa_upgrade_txholds(tx, xzp);
+ dmu_tx_mark_netfree(tx);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ dmu_tx_abort(tx);
+ zfs_zrele_async(xzp);
+ skipped += 1;
+ continue;
+ }
+ bzero(&dl, sizeof (dl));
+ dl.dl_dzp = dzp;
+ dl.dl_name = zap.za_name;
+
+ error = zfs_link_destroy(&dl, xzp, tx, 0, NULL);
+ if (error)
+ skipped += 1;
+ dmu_tx_commit(tx);
+
+ zfs_zrele_async(xzp);
+ }
+ zap_cursor_fini(&zc);
+ if (error != ENOENT)
+ skipped += 1;
+ return (skipped);
+}
+
+void
+zfs_rmnode(znode_t *zp)
+{
+ zfsvfs_t *zfsvfs = ZTOZSB(zp);
+ objset_t *os = zfsvfs->z_os;
+ znode_t *xzp = NULL;
+ dmu_tx_t *tx;
+ uint64_t acl_obj;
+ uint64_t xattr_obj;
+ uint64_t links;
+ int error;
+
+ ASSERT(ZTOI(zp)->i_nlink == 0);
+ ASSERT(atomic_read(&ZTOI(zp)->i_count) == 0);
+
+ /*
+ * If this is an attribute directory, purge its contents.
+ */
+ if (S_ISDIR(ZTOI(zp)->i_mode) && (zp->z_pflags & ZFS_XATTR)) {
+ if (zfs_purgedir(zp) != 0) {
+ /*
+ * Not enough space to delete some xattrs.
+ * Leave it in the unlinked set.
+ */
+ zfs_znode_dmu_fini(zp);
+
+ return;
+ }
+ }
+
+ /*
+ * Free up all the data in the file. We don't do this for directories
+ * because we need truncate and remove to be in the same tx, like in
+ * zfs_znode_delete(). Otherwise, if we crash here we'll end up with
+ * an inconsistent truncated zap object in the delete queue. Note a
+ * truncated file is harmless since it only contains user data.
+ */
+ if (S_ISREG(ZTOI(zp)->i_mode)) {
+ error = dmu_free_long_range(os, zp->z_id, 0, DMU_OBJECT_END);
+ if (error) {
+ /*
+ * Not enough space or we were interrupted by unmount.
+ * Leave the file in the unlinked set.
+ */
+ zfs_znode_dmu_fini(zp);
+ return;
+ }
+ }
+
+ /*
+ * If the file has extended attributes, we're going to unlink
+ * the xattr dir.
+ */
+ error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
+ &xattr_obj, sizeof (xattr_obj));
+ if (error == 0 && xattr_obj) {
+ error = zfs_zget(zfsvfs, xattr_obj, &xzp);
+ ASSERT(error == 0);
+ }
+
+ acl_obj = zfs_external_acl(zp);
+
+ /*
+ * Set up the final transaction.
+ */
+ tx = dmu_tx_create(os);
+ dmu_tx_hold_free(tx, zp->z_id, 0, DMU_OBJECT_END);
+ dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
+ if (xzp) {
+ dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, TRUE, NULL);
+ dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
+ }
+ if (acl_obj)
+ dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
+
+ zfs_sa_upgrade_txholds(tx, zp);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ /*
+ * Not enough space to delete the file. Leave it in the
+ * unlinked set, leaking it until the fs is remounted (at
+ * which point we'll call zfs_unlinked_drain() to process it).
+ */
+ dmu_tx_abort(tx);
+ zfs_znode_dmu_fini(zp);
+ goto out;
+ }
+
+ if (xzp) {
+ ASSERT(error == 0);
+ mutex_enter(&xzp->z_lock);
+ xzp->z_unlinked = B_TRUE; /* mark xzp for deletion */
+ clear_nlink(ZTOI(xzp)); /* no more links to it */
+ links = 0;
+ VERIFY(0 == sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs),
+ &links, sizeof (links), tx));
+ mutex_exit(&xzp->z_lock);
+ zfs_unlinked_add(xzp, tx);
+ }
+
+ mutex_enter(&os->os_dsl_dataset->ds_dir->dd_activity_lock);
+
+ /*
+	 * Remove this znode from the unlinked set. If a rollback has
+	 * occurred while a file was open and unlinked, then when the file
+	 * is closed post rollback it will not exist in the rolled back
+	 * version of the unlinked object.
+ */
+ error = zap_remove_int(zfsvfs->z_os, zfsvfs->z_unlinkedobj,
+ zp->z_id, tx);
+ VERIFY(error == 0 || error == ENOENT);
+
+ uint64_t count;
+ if (zap_count(os, zfsvfs->z_unlinkedobj, &count) == 0 && count == 0) {
+ cv_broadcast(&os->os_dsl_dataset->ds_dir->dd_activity_cv);
+ }
+
+ mutex_exit(&os->os_dsl_dataset->ds_dir->dd_activity_lock);
+
+ dataset_kstats_update_nunlinked_kstat(&zfsvfs->z_kstat, 1);
+
+ zfs_znode_delete(zp, tx);
+
+ dmu_tx_commit(tx);
+out:
+ if (xzp)
+ zfs_zrele_async(xzp);
+}
+
+static uint64_t
+zfs_dirent(znode_t *zp, uint64_t mode)
+{
+ uint64_t de = zp->z_id;
+
+ if (ZTOZSB(zp)->z_version >= ZPL_VERSION_DIRENT_TYPE)
+ de |= IFTODT(mode) << 60;
+ return (de);
+}
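+
+/*
+ * For illustration (editor's sketch): the low 48 bits of the returned
+ * value carry the object number and, when z_version is at least
+ * ZPL_VERSION_DIRENT_TYPE, the top 4 bits carry the IFTODT() file type,
+ * so a consumer can recover both:
+ *
+ *    uint64_t de = zfs_dirent(zp, zp->z_mode);
+ *    uint64_t obj = ZFS_DIRENT_OBJ(de);     see zfs_match_find() above
+ *    uint64_t dtype = de >> 60;             DT_* value, 0 on old versions
+ */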
+
+/*
+ * Link zp into dl. Can fail in the following cases :
+ * - if zp has been unlinked.
+ * - if the number of entries with the same hash (aka. colliding entries)
+ * exceed the capacity of a leaf-block of fatzap and splitting of the
+ * leaf-block does not help.
+ */
+int
+zfs_link_create(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag)
+{
+ znode_t *dzp = dl->dl_dzp;
+ zfsvfs_t *zfsvfs = ZTOZSB(zp);
+ uint64_t value;
+ int zp_is_dir = S_ISDIR(ZTOI(zp)->i_mode);
+ sa_bulk_attr_t bulk[5];
+ uint64_t mtime[2], ctime[2];
+ uint64_t links;
+ int count = 0;
+ int error;
+
+ mutex_enter(&zp->z_lock);
+
+ if (!(flag & ZRENAMING)) {
+ if (zp->z_unlinked) { /* no new links to unlinked zp */
+ ASSERT(!(flag & (ZNEW | ZEXISTS)));
+ mutex_exit(&zp->z_lock);
+ return (SET_ERROR(ENOENT));
+ }
+ if (!(flag & ZNEW)) {
+ /*
+ * ZNEW nodes come from zfs_mknode() where the link
+ * count has already been initialised
+ */
+ inc_nlink(ZTOI(zp));
+ links = ZTOI(zp)->i_nlink;
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs),
+ NULL, &links, sizeof (links));
+ }
+ }
+
+ value = zfs_dirent(zp, zp->z_mode);
+ error = zap_add(ZTOZSB(zp)->z_os, dzp->z_id, dl->dl_name, 8, 1,
+ &value, tx);
+
+ /*
+ * zap_add could fail to add the entry if it exceeds the capacity of the
+ * leaf-block and zap_leaf_split() failed to help.
+ * The caller of this routine is responsible for failing the transaction
+ * which will rollback the SA updates done above.
+ */
+ if (error != 0) {
+ if (!(flag & ZRENAMING) && !(flag & ZNEW))
+ drop_nlink(ZTOI(zp));
+ mutex_exit(&zp->z_lock);
+ return (error);
+ }
+
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL,
+ &dzp->z_id, sizeof (dzp->z_id));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+ &zp->z_pflags, sizeof (zp->z_pflags));
+
+ if (!(flag & ZNEW)) {
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
+ ctime, sizeof (ctime));
+ zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime,
+ ctime);
+ }
+ error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
+ ASSERT(error == 0);
+
+ mutex_exit(&zp->z_lock);
+
+ mutex_enter(&dzp->z_lock);
+ dzp->z_size++;
+ if (zp_is_dir)
+ inc_nlink(ZTOI(dzp));
+ links = ZTOI(dzp)->i_nlink;
+ count = 0;
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
+ &dzp->z_size, sizeof (dzp->z_size));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL,
+ &links, sizeof (links));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
+ mtime, sizeof (mtime));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
+ ctime, sizeof (ctime));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+ &dzp->z_pflags, sizeof (dzp->z_pflags));
+ zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime);
+ error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx);
+ ASSERT(error == 0);
+ mutex_exit(&dzp->z_lock);
+
+ return (0);
+}
+
+/*
+ * The match type in the code for this function should conform to:
+ *
+ * ------------------------------------------------------------------------
+ * fs type | z_norm | lookup type | match type
+ * ---------|-------------|-------------|----------------------------------
+ * CS !norm | 0 | 0 | 0 (exact)
+ * CS norm | formX | 0 | MT_NORMALIZE
+ * CI !norm | upper | !ZCIEXACT | MT_NORMALIZE
+ * CI !norm | upper | ZCIEXACT | MT_NORMALIZE | MT_MATCH_CASE
+ * CI norm | upper|formX | !ZCIEXACT | MT_NORMALIZE
+ * CI norm | upper|formX | ZCIEXACT | MT_NORMALIZE | MT_MATCH_CASE
+ * CM !norm | upper | !ZCILOOK | MT_NORMALIZE | MT_MATCH_CASE
+ * CM !norm | upper | ZCILOOK | MT_NORMALIZE
+ * CM norm | upper|formX | !ZCILOOK | MT_NORMALIZE | MT_MATCH_CASE
+ * CM norm | upper|formX | ZCILOOK | MT_NORMALIZE
+ *
+ * Abbreviations:
+ * CS = Case Sensitive, CI = Case Insensitive, CM = Case Mixed
+ * upper = case folding set by fs type on creation (U8_TEXTPREP_TOUPPER)
+ * formX = unicode normalization form set on fs creation
+ */
+static int
+zfs_dropname(zfs_dirlock_t *dl, znode_t *zp, znode_t *dzp, dmu_tx_t *tx,
+ int flag)
+{
+ int error;
+
+ if (ZTOZSB(zp)->z_norm) {
+ matchtype_t mt = MT_NORMALIZE;
+
+ if ((ZTOZSB(zp)->z_case == ZFS_CASE_INSENSITIVE &&
+ (flag & ZCIEXACT)) ||
+ (ZTOZSB(zp)->z_case == ZFS_CASE_MIXED &&
+ !(flag & ZCILOOK))) {
+ mt |= MT_MATCH_CASE;
+ }
+
+ error = zap_remove_norm(ZTOZSB(zp)->z_os, dzp->z_id,
+ dl->dl_name, mt, tx);
+ } else {
+ error = zap_remove(ZTOZSB(zp)->z_os, dzp->z_id, dl->dl_name,
+ tx);
+ }
+
+ return (error);
+}
+
+/*
+ * Unlink zp from dl, and mark zp for deletion if this was the last link. Can
+ * fail if zp is a mount point (EBUSY) or a non-empty directory (ENOTEMPTY).
+ * If 'unlinkedp' is NULL, we put unlinked znodes on the unlinked list.
+ * If it's non-NULL, we use it to indicate whether the znode needs deletion,
+ * and it's the caller's job to do it.
+ */
+int
+zfs_link_destroy(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag,
+ boolean_t *unlinkedp)
+{
+ znode_t *dzp = dl->dl_dzp;
+ zfsvfs_t *zfsvfs = ZTOZSB(dzp);
+ int zp_is_dir = S_ISDIR(ZTOI(zp)->i_mode);
+ boolean_t unlinked = B_FALSE;
+ sa_bulk_attr_t bulk[5];
+ uint64_t mtime[2], ctime[2];
+ uint64_t links;
+ int count = 0;
+ int error;
+
+ if (!(flag & ZRENAMING)) {
+ mutex_enter(&zp->z_lock);
+
+ if (zp_is_dir && !zfs_dirempty(zp)) {
+ mutex_exit(&zp->z_lock);
+ return (SET_ERROR(ENOTEMPTY));
+ }
+
+ /*
+ * If we get here, we are going to try to remove the object.
+ * First try removing the name from the directory; if that
+ * fails, return the error.
+ */
+ error = zfs_dropname(dl, zp, dzp, tx, flag);
+ if (error != 0) {
+ mutex_exit(&zp->z_lock);
+ return (error);
+ }
+
+ if (ZTOI(zp)->i_nlink <= zp_is_dir) {
+ zfs_panic_recover("zfs: link count on %lu is %u, "
+ "should be at least %u", zp->z_id,
+ (int)ZTOI(zp)->i_nlink, zp_is_dir + 1);
+ set_nlink(ZTOI(zp), zp_is_dir + 1);
+ }
+ drop_nlink(ZTOI(zp));
+ if (ZTOI(zp)->i_nlink == zp_is_dir) {
+ zp->z_unlinked = B_TRUE;
+ clear_nlink(ZTOI(zp));
+ unlinked = B_TRUE;
+ } else {
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs),
+ NULL, &ctime, sizeof (ctime));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
+ NULL, &zp->z_pflags, sizeof (zp->z_pflags));
+ zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime,
+ ctime);
+ }
+ links = ZTOI(zp)->i_nlink;
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs),
+ NULL, &links, sizeof (links));
+ error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
+ count = 0;
+ ASSERT(error == 0);
+ mutex_exit(&zp->z_lock);
+ } else {
+ error = zfs_dropname(dl, zp, dzp, tx, flag);
+ if (error != 0)
+ return (error);
+ }
+
+ mutex_enter(&dzp->z_lock);
+ dzp->z_size--; /* one dirent removed */
+ if (zp_is_dir)
+ drop_nlink(ZTOI(dzp)); /* ".." link from zp */
+ links = ZTOI(dzp)->i_nlink;
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs),
+ NULL, &links, sizeof (links));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs),
+ NULL, &dzp->z_size, sizeof (dzp->z_size));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs),
+ NULL, ctime, sizeof (ctime));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs),
+ NULL, mtime, sizeof (mtime));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
+ NULL, &dzp->z_pflags, sizeof (dzp->z_pflags));
+ zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime);
+ error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx);
+ ASSERT(error == 0);
+ mutex_exit(&dzp->z_lock);
+
+ if (unlinkedp != NULL)
+ *unlinkedp = unlinked;
+ else if (unlinked)
+ zfs_unlinked_add(zp, tx);
+
+ return (0);
+}
+
+/*
+ * Indicate whether the directory is empty. Works with or without z_lock
+ * held, but can only be considered a hint in the latter case. Returns true
+ * if only "." and ".." remain and there's no work in progress.
+ *
+ * The internal ZAP size, rather than zp->z_size, needs to be checked since
+ * some consumers (Lustre) do not strictly maintain an accurate SA_ZPL_SIZE.
+ */
+boolean_t
+zfs_dirempty(znode_t *dzp)
+{
+ zfsvfs_t *zfsvfs = ZTOZSB(dzp);
+ uint64_t count;
+ int error;
+
+ if (dzp->z_dirlocks != NULL)
+ return (B_FALSE);
+
+ error = zap_count(zfsvfs->z_os, dzp->z_id, &count);
+ if (error != 0 || count != 0)
+ return (B_FALSE);
+
+ return (B_TRUE);
+}
+
+int
+zfs_make_xattrdir(znode_t *zp, vattr_t *vap, znode_t **xzpp, cred_t *cr)
+{
+ zfsvfs_t *zfsvfs = ZTOZSB(zp);
+ znode_t *xzp;
+ dmu_tx_t *tx;
+ int error;
+ zfs_acl_ids_t acl_ids;
+ boolean_t fuid_dirtied;
+#ifdef ZFS_DEBUG
+ uint64_t parent;
+#endif
+
+ *xzpp = NULL;
+
+ if ((error = zfs_zaccess(zp, ACE_WRITE_NAMED_ATTRS, 0, B_FALSE, cr)))
+ return (error);
+
+ if ((error = zfs_acl_ids_create(zp, IS_XATTR, vap, cr, NULL,
+ &acl_ids)) != 0)
+ return (error);
+ if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, zp->z_projid)) {
+ zfs_acl_ids_free(&acl_ids);
+ return (SET_ERROR(EDQUOT));
+ }
+
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
+ ZFS_SA_BASE_ATTR_SIZE);
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
+ dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
+ fuid_dirtied = zfsvfs->z_fuid_dirty;
+ if (fuid_dirtied)
+ zfs_fuid_txhold(zfsvfs, tx);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ zfs_acl_ids_free(&acl_ids);
+ dmu_tx_abort(tx);
+ return (error);
+ }
+ zfs_mknode(zp, vap, tx, cr, IS_XATTR, &xzp, &acl_ids);
+
+ if (fuid_dirtied)
+ zfs_fuid_sync(zfsvfs, tx);
+
+#ifdef ZFS_DEBUG
+ error = sa_lookup(xzp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
+ &parent, sizeof (parent));
+ ASSERT(error == 0 && parent == zp->z_id);
+#endif
+
+ VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xzp->z_id,
+ sizeof (xzp->z_id), tx));
+
+ if (!zp->z_unlinked)
+ (void) zfs_log_create(zfsvfs->z_log, tx, TX_MKXATTR, zp,
+ xzp, "", NULL, acl_ids.z_fuidp, vap);
+
+ zfs_acl_ids_free(&acl_ids);
+ dmu_tx_commit(tx);
+
+ *xzpp = xzp;
+
+ return (0);
+}
+
+/*
+ * Return a znode for the extended attribute directory for zp.
+ * ** If the directory does not already exist, it is created **
+ *
+ * IN: zp - znode to obtain attribute directory from
+ * cr - credentials of caller
+ * flags - flags from the VOP_LOOKUP call
+ *
+ * OUT: xipp - pointer to extended attribute znode
+ *
+ * RETURN: 0 on success
+ * error number on failure
+ */
+int
+zfs_get_xattrdir(znode_t *zp, znode_t **xzpp, cred_t *cr, int flags)
+{
+ zfsvfs_t *zfsvfs = ZTOZSB(zp);
+ znode_t *xzp;
+ zfs_dirlock_t *dl;
+ vattr_t va;
+ int error;
+top:
+ error = zfs_dirent_lock(&dl, zp, "", &xzp, ZXATTR, NULL, NULL);
+ if (error)
+ return (error);
+
+ if (xzp != NULL) {
+ *xzpp = xzp;
+ zfs_dirent_unlock(dl);
+ return (0);
+ }
+
+ if (!(flags & CREATE_XATTR_DIR)) {
+ zfs_dirent_unlock(dl);
+ return (SET_ERROR(ENOENT));
+ }
+
+ if (zfs_is_readonly(zfsvfs)) {
+ zfs_dirent_unlock(dl);
+ return (SET_ERROR(EROFS));
+ }
+
+ /*
+ * The ability to 'create' files in an attribute
+ * directory comes from the write_xattr permission on the base file.
+ *
+ * The ability to 'search' an attribute directory requires
+ * read_xattr permission on the base file.
+ *
+ * Once in a directory the ability to read/write attributes
+ * is controlled by the permissions on the attribute file.
+ */
+ va.va_mask = ATTR_MODE | ATTR_UID | ATTR_GID;
+ va.va_mode = S_IFDIR | S_ISVTX | 0777;
+ zfs_fuid_map_ids(zp, cr, &va.va_uid, &va.va_gid);
+
+ va.va_dentry = NULL;
+ error = zfs_make_xattrdir(zp, &va, xzpp, cr);
+ zfs_dirent_unlock(dl);
+
+ if (error == ERESTART) {
+ /* NB: we already did dmu_tx_wait() if necessary */
+ goto top;
+ }
+
+ return (error);
+}
+
+/*
+ * Decide whether it is okay to remove within a sticky directory.
+ *
+ * In sticky directories, write access is not sufficient;
+ * you can remove entries from a directory only if:
+ *
+ * you own the directory,
+ * you own the entry,
+ * you have write access to the entry,
+ * or you are privileged (checked in secpolicy...).
+ *
+ * The function returns 0 if remove access is granted.
+ */
+int
+zfs_sticky_remove_access(znode_t *zdp, znode_t *zp, cred_t *cr)
+{
+ uid_t uid;
+ uid_t downer;
+ uid_t fowner;
+ zfsvfs_t *zfsvfs = ZTOZSB(zdp);
+
+ if (zfsvfs->z_replay)
+ return (0);
+
+ if ((zdp->z_mode & S_ISVTX) == 0)
+ return (0);
+
+ downer = zfs_fuid_map_id(zfsvfs, KUID_TO_SUID(ZTOI(zdp)->i_uid),
+ cr, ZFS_OWNER);
+ fowner = zfs_fuid_map_id(zfsvfs, KUID_TO_SUID(ZTOI(zp)->i_uid),
+ cr, ZFS_OWNER);
+
+ if ((uid = crgetuid(cr)) == downer || uid == fowner ||
+ zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr) == 0)
+ return (0);
+ else
+ return (secpolicy_vnode_remove(cr));
+}
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_file_os.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_file_os.c
new file mode 100644
index 000000000000..99c6ffc95940
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_file_os.c
@@ -0,0 +1,440 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/zfs_file.h>
+#include <sys/stat.h>
+#include <sys/file.h>
+#include <linux/falloc.h>
+#include <linux/fs.h>
+#include <linux/uaccess.h>
+#ifdef HAVE_FDTABLE_HEADER
+#include <linux/fdtable.h>
+#endif
+
+/*
+ * Open file
+ *
+ * path - fully qualified path to file
+ * flags - open(2) flags, e.g. O_RDONLY / O_WRONLY / O_CREAT / O_EXCL
+ * fpp - pointer to return file pointer
+ *
+ * Returns 0 on success underlying error on failure.
+ */
+int
+zfs_file_open(const char *path, int flags, int mode, zfs_file_t **fpp)
+{
+ struct file *filp;
+ int saved_umask;
+
+ if (!(flags & O_CREAT) && (flags & O_WRONLY))
+ flags |= O_EXCL;
+
+ if (flags & O_CREAT)
+ saved_umask = xchg(&current->fs->umask, 0);
+
+ filp = filp_open(path, flags, mode);
+
+ if (flags & O_CREAT)
+ (void) xchg(&current->fs->umask, saved_umask);
+
+ if (IS_ERR(filp))
+ return (-PTR_ERR(filp));
+
+ *fpp = filp;
+ return (0);
+}
+
+void
+zfs_file_close(zfs_file_t *fp)
+{
+ filp_close(fp, 0);
+}
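+
+/*
+ * Illustrative usage of the open/read/close interface (a sketch only;
+ * the path and buffer are hypothetical):
+ *
+ *    zfs_file_t *fp;
+ *    char buf[32];
+ *    ssize_t resid;
+ *
+ *    if (zfs_file_open("/etc/hostid", O_RDONLY, 0, &fp) == 0) {
+ *            (void) zfs_file_read(fp, buf, sizeof (buf), &resid);
+ *            zfs_file_close(fp);
+ *    }
+ */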
+
+static ssize_t
+zfs_file_write_impl(zfs_file_t *fp, const void *buf, size_t count, loff_t *off)
+{
+#if defined(HAVE_KERNEL_WRITE_PPOS)
+ return (kernel_write(fp, buf, count, off));
+#else
+ mm_segment_t saved_fs;
+ ssize_t rc;
+
+ saved_fs = get_fs();
+ set_fs(KERNEL_DS);
+
+	rc = vfs_write(fp, (__force const char __user *)buf, count, off);
+
+ set_fs(saved_fs);
+
+ return (rc);
+#endif
+}
+
+/*
+ * Stateful write - use os internal file pointer to determine where to
+ * write and update on successful completion.
+ *
+ * fp - pointer to file (pipe, socket, etc) to write to
+ * buf - buffer to write
+ * count - # of bytes to write
+ * resid - pointer to count of unwritten bytes (if short write)
+ *
+ * Returns 0 on success errno on failure.
+ */
+int
+zfs_file_write(zfs_file_t *fp, const void *buf, size_t count, ssize_t *resid)
+{
+ loff_t off = fp->f_pos;
+ ssize_t rc;
+
+ rc = zfs_file_write_impl(fp, buf, count, &off);
+ if (rc < 0)
+ return (-rc);
+
+ fp->f_pos = off;
+
+ if (resid) {
+ *resid = count - rc;
+ } else if (rc != count) {
+ return (EIO);
+ }
+
+ return (0);
+}
+
+/*
+ * Stateless write - os internal file pointer is not updated.
+ *
+ * fp - pointer to file (pipe, socket, etc) to write to
+ * buf - buffer to write
+ * count - # of bytes to write
+ * off - file offset to write to (only valid for seekable types)
+ * resid - pointer to count of unwritten bytes
+ *
+ * Returns 0 on success errno on failure.
+ */
+int
+zfs_file_pwrite(zfs_file_t *fp, const void *buf, size_t count, loff_t off,
+ ssize_t *resid)
+{
+ ssize_t rc;
+
+ rc = zfs_file_write_impl(fp, buf, count, &off);
+ if (rc < 0)
+ return (-rc);
+
+ if (resid) {
+ *resid = count - rc;
+ } else if (rc != count) {
+ return (EIO);
+ }
+
+ return (0);
+}
+
+static ssize_t
+zfs_file_read_impl(zfs_file_t *fp, void *buf, size_t count, loff_t *off)
+{
+#if defined(HAVE_KERNEL_READ_PPOS)
+ return (kernel_read(fp, buf, count, off));
+#else
+ mm_segment_t saved_fs;
+ ssize_t rc;
+
+ saved_fs = get_fs();
+ set_fs(KERNEL_DS);
+
+ rc = vfs_read(fp, (void __user *)buf, count, off);
+ set_fs(saved_fs);
+
+ return (rc);
+#endif
+}
+
+/*
+ * Stateful read - use os internal file pointer to determine where to
+ * read and update on successful completion.
+ *
+ * fp - pointer to file (pipe, socket, etc) to read from
+ * buf - buffer to read into
+ * count - # of bytes to read
+ * resid - pointer to count of unread bytes (if short read)
+ *
+ * Returns 0 on success errno on failure.
+ */
+int
+zfs_file_read(zfs_file_t *fp, void *buf, size_t count, ssize_t *resid)
+{
+ loff_t off = fp->f_pos;
+ ssize_t rc;
+
+ rc = zfs_file_read_impl(fp, buf, count, &off);
+ if (rc < 0)
+ return (-rc);
+
+ fp->f_pos = off;
+
+ if (resid) {
+ *resid = count - rc;
+ } else if (rc != count) {
+ return (EIO);
+ }
+
+ return (0);
+}
+
+/*
+ * Stateless read - os internal file pointer is not updated.
+ *
+ * fp - pointer to file (pipe, socket, etc) to read from
+ * buf - buffer to read into
+ * count - # of bytes to read
+ * off - file offset to read from (only valid for seekable types)
+ * resid - pointer to count of unread bytes (if short read)
+ *
+ * Returns 0 on success errno on failure.
+ */
+int
+zfs_file_pread(zfs_file_t *fp, void *buf, size_t count, loff_t off,
+ ssize_t *resid)
+{
+ ssize_t rc;
+
+ rc = zfs_file_read_impl(fp, buf, count, &off);
+ if (rc < 0)
+ return (-rc);
+
+ if (resid) {
+ *resid = count - rc;
+ } else if (rc != count) {
+ return (EIO);
+ }
+
+ return (0);
+}
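+
+/*
+ * Example of the resid convention used by the read/write helpers above
+ * (a sketch only): when a resid pointer is passed a short transfer is
+ * not an error, so the caller must check how much was left over.
+ *
+ *    error = zfs_file_pread(fp, buf, len, off, &resid);
+ *    if (error == 0 && resid != 0)
+ *            ... only (len - resid) bytes were read at 'off' ...
+ */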
+
+/*
+ * lseek - set / get file pointer
+ *
+ * fp - pointer to file (pipe, socket, etc) to read from
+ * offp - value to seek to, returns current value plus passed offset
+ * whence - see man pages for standard lseek whence values
+ *
+ * Returns 0 on success errno on failure (ESPIPE for non seekable types)
+ */
+int
+zfs_file_seek(zfs_file_t *fp, loff_t *offp, int whence)
+{
+ loff_t rc;
+
+ if (*offp < 0 || *offp > MAXOFFSET_T)
+ return (EINVAL);
+
+ rc = vfs_llseek(fp, *offp, whence);
+ if (rc < 0)
+ return (-rc);
+
+ *offp = rc;
+
+ return (0);
+}
+
+/*
+ * Get file attributes
+ *
+ * filp - file pointer
+ * zfattr - pointer to file attr structure
+ *
+ * Currently only used for fetching size and file mode.
+ *
+ * Returns 0 on success or error code of underlying getattr call on failure.
+ */
+int
+zfs_file_getattr(zfs_file_t *filp, zfs_file_attr_t *zfattr)
+{
+ struct kstat stat;
+ int rc;
+
+#if defined(HAVE_4ARGS_VFS_GETATTR)
+ rc = vfs_getattr(&filp->f_path, &stat, STATX_BASIC_STATS,
+ AT_STATX_SYNC_AS_STAT);
+#elif defined(HAVE_2ARGS_VFS_GETATTR)
+ rc = vfs_getattr(&filp->f_path, &stat);
+#else
+ rc = vfs_getattr(filp->f_path.mnt, filp->f_dentry, &stat);
+#endif
+ if (rc)
+ return (-rc);
+
+ zfattr->zfa_size = stat.size;
+ zfattr->zfa_mode = stat.mode;
+
+ return (0);
+}
+
+/*
+ * Sync file to disk
+ *
+ * filp - file pointer
+ * flags - O_SYNC and or O_DSYNC
+ *
+ * Returns 0 on success or error code of underlying sync call on failure.
+ */
+int
+zfs_file_fsync(zfs_file_t *filp, int flags)
+{
+ int datasync = 0;
+ int error;
+ int fstrans;
+
+ if (flags & O_DSYNC)
+ datasync = 1;
+
+ /*
+ * May enter XFS which generates a warning when PF_FSTRANS is set.
+	 * To avoid this the flag is cleared over vfs_fsync() and then reset.
+ */
+ fstrans = __spl_pf_fstrans_check();
+ if (fstrans)
+ current->flags &= ~(__SPL_PF_FSTRANS);
+
+ error = -vfs_fsync(filp, datasync);
+
+ if (fstrans)
+ current->flags |= __SPL_PF_FSTRANS;
+
+ return (error);
+}
+
+/*
+ * fallocate - allocate or free space on disk
+ *
+ * fp - file pointer
+ * mode (non-standard options for hole punching etc)
+ * offset - offset to start allocating or freeing from
+ * len - length to free / allocate
+ *
+ * OPTIONAL
+ */
+int
+zfs_file_fallocate(zfs_file_t *fp, int mode, loff_t offset, loff_t len)
+{
+ /*
+ * May enter XFS which generates a warning when PF_FSTRANS is set.
+	 * To avoid this the flag is cleared over the fallocate() call and
+	 * then reset.
+ */
+ int fstrans = __spl_pf_fstrans_check();
+ if (fstrans)
+ current->flags &= ~(__SPL_PF_FSTRANS);
+
+ /*
+ * When supported by the underlying file system preferentially
+ * use the fallocate() callback to preallocate the space.
+ */
+ int error = EOPNOTSUPP;
+ if (fp->f_op->fallocate)
+ error = fp->f_op->fallocate(fp, mode, offset, len);
+
+ if (fstrans)
+ current->flags |= __SPL_PF_FSTRANS;
+
+ return (error);
+}
+
+/*
+ * Request current file pointer offset
+ *
+ * fp - pointer to file
+ *
+ * Returns current file offset.
+ */
+loff_t
+zfs_file_off(zfs_file_t *fp)
+{
+ return (fp->f_pos);
+}
+
+/*
+ * Request file pointer private data
+ *
+ * fp - pointer to file
+ *
+ * Returns pointer to file private data.
+ */
+void *
+zfs_file_private(zfs_file_t *fp)
+{
+ return (fp->private_data);
+}
+
+/*
+ * unlink file
+ *
+ * path - fully qualified file path
+ *
+ * Returns 0 on success.
+ *
+ * OPTIONAL
+ */
+int
+zfs_file_unlink(const char *path)
+{
+ return (EOPNOTSUPP);
+}
+
+/*
+ * Get reference to file pointer
+ *
+ * fd - input file descriptor
+ * fpp - pointer to file pointer
+ *
+ * Returns 0 on success EBADF on failure.
+ */
+int
+zfs_file_get(int fd, zfs_file_t **fpp)
+{
+ zfs_file_t *fp;
+
+ fp = fget(fd);
+ if (fp == NULL)
+ return (EBADF);
+
+ *fpp = fp;
+
+ return (0);
+}
+
+/*
+ * Drop reference to file pointer
+ *
+ * fd - input file descriptor
+ */
+void
+zfs_file_put(int fd)
+{
+ struct file *fp;
+
+ if ((fp = fget(fd)) != NULL) {
+ fput(fp);
+ fput(fp);
+ }
+}
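+
+/*
+ * Illustrative pairing of zfs_file_get() and zfs_file_put() (a sketch
+ * only; 'fd' is a user supplied file descriptor, e.g. from an ioctl):
+ *
+ *    zfs_file_t *fp;
+ *
+ *    if (zfs_file_get(fd, &fp) == 0) {
+ *            ... use fp ...
+ *            zfs_file_put(fd);
+ *    }
+ */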
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_ioctl_os.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_ioctl_os.c
new file mode 100644
index 000000000000..b88e0497d000
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_ioctl_os.c
@@ -0,0 +1,329 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Portions Copyright 2011 Martin Matuska
+ * Copyright 2015, OmniTI Computer Consulting, Inc. All rights reserved.
+ * Portions Copyright 2012 Pawel Jakub Dawidek <pawel@dawidek.net>
+ * Copyright (c) 2014, 2016 Joyent, Inc. All rights reserved.
+ * Copyright 2016 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2014, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2013 Steven Hartland. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
+ * Copyright 2016 Toomas Soome <tsoome@me.com>
+ * Copyright (c) 2016 Actifio, Inc. All rights reserved.
+ * Copyright (c) 2018, loli10K <ezomori.nozomu@gmail.com>. All rights reserved.
+ * Copyright 2017 RackTop Systems.
+ * Copyright (c) 2017 Open-E, Inc. All Rights Reserved.
+ * Copyright (c) 2019 Datto Inc.
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/errno.h>
+#include <sys/uio.h>
+#include <sys/file.h>
+#include <sys/kmem.h>
+#include <sys/stat.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zfs_vfsops.h>
+#include <sys/zap.h>
+#include <sys/spa.h>
+#include <sys/nvpair.h>
+#include <sys/fs/zfs.h>
+#include <sys/zfs_ctldir.h>
+#include <sys/zfs_dir.h>
+#include <sys/zfs_onexit.h>
+#include <sys/zvol.h>
+#include <sys/fm/util.h>
+#include <sys/dsl_crypt.h>
+
+#include <sys/zfs_ioctl_impl.h>
+
+#include <sys/zfs_sysfs.h>
+#include <linux/miscdevice.h>
+#include <linux/slab.h>
+
+boolean_t
+zfs_vfs_held(zfsvfs_t *zfsvfs)
+{
+ return (zfsvfs->z_sb != NULL);
+}
+
+int
+zfs_vfs_ref(zfsvfs_t **zfvp)
+{
+ if (*zfvp == NULL || (*zfvp)->z_sb == NULL ||
+ !atomic_inc_not_zero(&((*zfvp)->z_sb->s_active))) {
+ return (SET_ERROR(ESRCH));
+ }
+ return (0);
+}
+
+void
+zfs_vfs_rele(zfsvfs_t *zfsvfs)
+{
+ deactivate_super(zfsvfs->z_sb);
+}
+
+static int
+zfsdev_state_init(struct file *filp)
+{
+ zfsdev_state_t *zs, *zsprev = NULL;
+ minor_t minor;
+ boolean_t newzs = B_FALSE;
+
+ ASSERT(MUTEX_HELD(&zfsdev_state_lock));
+
+ minor = zfsdev_minor_alloc();
+ if (minor == 0)
+ return (SET_ERROR(ENXIO));
+
+ for (zs = zfsdev_state_list; zs != NULL; zs = zs->zs_next) {
+ if (zs->zs_minor == -1)
+ break;
+ zsprev = zs;
+ }
+
+ if (!zs) {
+ zs = kmem_zalloc(sizeof (zfsdev_state_t), KM_SLEEP);
+ newzs = B_TRUE;
+ }
+
+ filp->private_data = zs;
+
+ zfs_onexit_init((zfs_onexit_t **)&zs->zs_onexit);
+ zfs_zevent_init((zfs_zevent_t **)&zs->zs_zevent);
+
+ /*
+ * In order to provide for lock-free concurrent read access
+ * to the minor list in zfsdev_get_state_impl(), new entries
+ * must be completely written before linking them into the
+ * list whereas existing entries are already linked; the last
+ * operation must be updating zs_minor (from -1 to the new
+ * value).
+ */
+ if (newzs) {
+ zs->zs_minor = minor;
+ smp_wmb();
+ zsprev->zs_next = zs;
+ } else {
+ smp_wmb();
+ zs->zs_minor = minor;
+ }
+
+ return (0);
+}
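+
+/*
+ * Reader side of the publication protocol described above (an editor's
+ * sketch; the real lookup lives in zfsdev_get_state_impl()): because
+ * zs_minor is written last, a lock-free walker can safely skip entries
+ * that are still being constructed.
+ *
+ *    for (zs = zfsdev_state_list; zs != NULL; zs = zs->zs_next) {
+ *            if (zs->zs_minor == minor)
+ *                    return (zs);
+ *    }
+ */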
+
+static int
+zfsdev_state_destroy(struct file *filp)
+{
+ zfsdev_state_t *zs;
+
+ ASSERT(MUTEX_HELD(&zfsdev_state_lock));
+ ASSERT(filp->private_data != NULL);
+
+ zs = filp->private_data;
+ zs->zs_minor = -1;
+ zfs_onexit_destroy(zs->zs_onexit);
+ zfs_zevent_destroy(zs->zs_zevent);
+ zs->zs_onexit = NULL;
+ zs->zs_zevent = NULL;
+
+ return (0);
+}
+
+static int
+zfsdev_open(struct inode *ino, struct file *filp)
+{
+ int error;
+
+ mutex_enter(&zfsdev_state_lock);
+ error = zfsdev_state_init(filp);
+ mutex_exit(&zfsdev_state_lock);
+
+ return (-error);
+}
+
+static int
+zfsdev_release(struct inode *ino, struct file *filp)
+{
+ int error;
+
+ mutex_enter(&zfsdev_state_lock);
+ error = zfsdev_state_destroy(filp);
+ mutex_exit(&zfsdev_state_lock);
+
+ return (-error);
+}
+
+static long
+zfsdev_ioctl(struct file *filp, unsigned cmd, unsigned long arg)
+{
+ uint_t vecnum;
+ zfs_cmd_t *zc;
+ int error, rc;
+
+ vecnum = cmd - ZFS_IOC_FIRST;
+
+ zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP);
+
+ if (ddi_copyin((void *)(uintptr_t)arg, zc, sizeof (zfs_cmd_t), 0)) {
+ error = -SET_ERROR(EFAULT);
+ goto out;
+ }
+ error = -zfsdev_ioctl_common(vecnum, zc, 0);
+ rc = ddi_copyout(zc, (void *)(uintptr_t)arg, sizeof (zfs_cmd_t), 0);
+ if (error == 0 && rc != 0)
+ error = -SET_ERROR(EFAULT);
+out:
+ kmem_free(zc, sizeof (zfs_cmd_t));
+ return (error);
+
+}
+
+uint64_t
+zfs_max_nvlist_src_size_os(void)
+{
+ if (zfs_max_nvlist_src_size != 0)
+ return (zfs_max_nvlist_src_size);
+
+ return (KMALLOC_MAX_SIZE);
+}
+
+void
+zfs_ioctl_init_os(void)
+{
+}
+
+#ifdef CONFIG_COMPAT
+static long
+zfsdev_compat_ioctl(struct file *filp, unsigned cmd, unsigned long arg)
+{
+ return (zfsdev_ioctl(filp, cmd, arg));
+}
+#else
+#define zfsdev_compat_ioctl NULL
+#endif
+
+static const struct file_operations zfsdev_fops = {
+ .open = zfsdev_open,
+ .release = zfsdev_release,
+ .unlocked_ioctl = zfsdev_ioctl,
+ .compat_ioctl = zfsdev_compat_ioctl,
+ .owner = THIS_MODULE,
+};
+
+static struct miscdevice zfs_misc = {
+ .minor = ZFS_DEVICE_MINOR,
+ .name = ZFS_DRIVER,
+ .fops = &zfsdev_fops,
+};
+
+MODULE_ALIAS_MISCDEV(ZFS_DEVICE_MINOR);
+MODULE_ALIAS("devname:zfs");
+
+int
+zfsdev_attach(void)
+{
+ int error;
+
+ error = misc_register(&zfs_misc);
+ if (error == -EBUSY) {
+ /*
+ * Fallback to dynamic minor allocation in the event of a
+ * collision with a reserved minor in linux/miscdevice.h.
+ * In this case the kernel modules must be manually loaded.
+ */
+ printk(KERN_INFO "ZFS: misc_register() with static minor %d "
+ "failed %d, retrying with MISC_DYNAMIC_MINOR\n",
+ ZFS_DEVICE_MINOR, error);
+
+ zfs_misc.minor = MISC_DYNAMIC_MINOR;
+ error = misc_register(&zfs_misc);
+ }
+
+ if (error)
+ printk(KERN_INFO "ZFS: misc_register() failed %d\n", error);
+
+ return (error);
+}
+
+void
+zfsdev_detach(void)
+{
+ misc_deregister(&zfs_misc);
+}
+
+#ifdef ZFS_DEBUG
+#define ZFS_DEBUG_STR " (DEBUG mode)"
+#else
+#define ZFS_DEBUG_STR ""
+#endif
+
+static int __init
+_init(void)
+{
+ int error;
+
+ if ((error = zfs_kmod_init()) != 0) {
+ printk(KERN_NOTICE "ZFS: Failed to Load ZFS Filesystem v%s-%s%s"
+ ", rc = %d\n", ZFS_META_VERSION, ZFS_META_RELEASE,
+ ZFS_DEBUG_STR, error);
+
+ return (-error);
+ }
+
+ zfs_sysfs_init();
+
+ printk(KERN_NOTICE "ZFS: Loaded module v%s-%s%s, "
+ "ZFS pool version %s, ZFS filesystem version %s\n",
+ ZFS_META_VERSION, ZFS_META_RELEASE, ZFS_DEBUG_STR,
+ SPA_VERSION_STRING, ZPL_VERSION_STRING);
+#ifndef CONFIG_FS_POSIX_ACL
+ printk(KERN_NOTICE "ZFS: Posix ACLs disabled by kernel\n");
+#endif /* CONFIG_FS_POSIX_ACL */
+
+ return (0);
+}
+
+static void __exit
+_fini(void)
+{
+ zfs_sysfs_fini();
+ zfs_kmod_fini();
+
+ printk(KERN_NOTICE "ZFS: Unloaded module v%s-%s%s\n",
+ ZFS_META_VERSION, ZFS_META_RELEASE, ZFS_DEBUG_STR);
+}
+
+#if defined(_KERNEL)
+module_init(_init);
+module_exit(_fini);
+#endif
+
+ZFS_MODULE_DESCRIPTION("ZFS");
+ZFS_MODULE_AUTHOR(ZFS_META_AUTHOR);
+ZFS_MODULE_LICENSE(ZFS_META_LICENSE);
+ZFS_MODULE_VERSION(ZFS_META_VERSION "-" ZFS_META_RELEASE);
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_sysfs.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_sysfs.c
new file mode 100644
index 000000000000..fb7c68987360
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_sysfs.c
@@ -0,0 +1,662 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2018, 2019 by Delphix. All rights reserved.
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/zfeature.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zfs_sysfs.h>
+#include <sys/kmem.h>
+#include <sys/fs/zfs.h>
+#include <linux/kobject.h>
+
+#include "zfs_prop.h"
+
+#if !defined(_KERNEL)
+#error kernel builds only
+#endif
+
+/*
+ * ZFS Module sysfs support
+ *
+ * This extends our sysfs '/sys/module/zfs' entry to include feature
+ * and property attributes. The primary consumer of this information
+ * is user processes, like the zfs CLI, that need to know what the
+ * currently loaded ZFS module supports. The libzfs binary will consult
+ * this information when instantiating the zfs|zpool property tables
+ * and the pool features table.
+ *
+ * The added top-level directories are:
+ * /sys/module/zfs
+ * ├── features.kernel
+ * ├── features.pool
+ * ├── properties.dataset
+ * └── properties.pool
+ *
+ * The local interface for the zfs kobjects includes:
+ * zfs_kobj_init()
+ * zfs_kobj_add()
+ * zfs_kobj_release()
+ * zfs_kobj_add_attr()
+ * zfs_kobj_fini()
+ */
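+
+/*
+ * Illustrative sketch (not part of this change): the typical lifecycle of
+ * the local kobject helpers listed above.  The "example" node and its
+ * "supported" attribute are hypothetical; see kernel_feature_to_kobj()
+ * further down for a real caller.  zfs_mod_kobj_t and the helpers are
+ * defined below in this file.
+ */
+#if 0
+static zfs_mod_kobj_t example_kobj;
+
+static void
+example_kobj_sketch(struct kobject *parent, sysfs_show_func show)
+{
+	/* one attribute slot, no children, and the show callback */
+	if (zfs_kobj_init(&example_kobj, 1, 0, show) != 0)
+		return;
+
+	/* name the attribute, then publish the node under the parent */
+	zfs_kobj_add_attr(&example_kobj, 0, "supported");
+	if (zfs_kobj_add(&example_kobj, parent, "example") != 0)
+		zfs_kobj_release(&example_kobj.zko_kobj);
+
+	/* later, zfs_kobj_fini(&example_kobj) removes and releases it */
+}
+#endif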
+
+/*
+ * A zfs_mod_kobj_t represents a zfs kobject under '/sys/module/zfs'
+ */
+struct zfs_mod_kobj;
+typedef struct zfs_mod_kobj zfs_mod_kobj_t;
+
+struct zfs_mod_kobj {
+ struct kobject zko_kobj;
+ struct kobj_type zko_kobj_type;
+ struct sysfs_ops zko_sysfs_ops;
+ size_t zko_attr_count;
+ struct attribute *zko_attr_list; /* allocated */
+ struct attribute **zko_default_attrs; /* allocated */
+ size_t zko_child_count;
+ zfs_mod_kobj_t *zko_children; /* allocated */
+};
+
+#define ATTR_TABLE_SIZE(cnt) (sizeof (struct attribute) * (cnt))
+/* Note +1 for NULL terminator slot */
+#define DEFAULT_ATTR_SIZE(cnt) (sizeof (struct attribute *) * (cnt + 1))
+#define CHILD_TABLE_SIZE(cnt) (sizeof (zfs_mod_kobj_t) * (cnt))
+
+/*
+ * These are the top-level kobjects under '/sys/module/zfs/'
+ */
+static zfs_mod_kobj_t kernel_features_kobj;
+static zfs_mod_kobj_t pool_features_kobj;
+static zfs_mod_kobj_t dataset_props_kobj;
+static zfs_mod_kobj_t pool_props_kobj;
+
+/*
+ * The show function is used to provide the content
+ * of an attribute into a PAGE_SIZE buffer.
+ */
+typedef ssize_t (*sysfs_show_func)(struct kobject *, struct attribute *,
+ char *);
+
+static void
+zfs_kobj_fini(zfs_mod_kobj_t *zkobj)
+{
+ /* finalize any child kobjects */
+ if (zkobj->zko_child_count != 0) {
+ ASSERT(zkobj->zko_children);
+ for (int i = 0; i < zkobj->zko_child_count; i++)
+ zfs_kobj_fini(&zkobj->zko_children[i]);
+ }
+
+ /* kobject_put() will call zfs_kobj_release() to release memory */
+ kobject_del(&zkobj->zko_kobj);
+ kobject_put(&zkobj->zko_kobj);
+}
+
+static void
+zfs_kobj_release(struct kobject *kobj)
+{
+ zfs_mod_kobj_t *zkobj = container_of(kobj, zfs_mod_kobj_t, zko_kobj);
+
+ if (zkobj->zko_attr_list != NULL) {
+ ASSERT3S(zkobj->zko_attr_count, !=, 0);
+ kmem_free(zkobj->zko_attr_list,
+ ATTR_TABLE_SIZE(zkobj->zko_attr_count));
+ zkobj->zko_attr_list = NULL;
+ }
+
+ if (zkobj->zko_default_attrs != NULL) {
+ kmem_free(zkobj->zko_default_attrs,
+ DEFAULT_ATTR_SIZE(zkobj->zko_attr_count));
+ zkobj->zko_default_attrs = NULL;
+ }
+
+ if (zkobj->zko_child_count != 0) {
+ ASSERT(zkobj->zko_children);
+
+ kmem_free(zkobj->zko_children,
+ CHILD_TABLE_SIZE(zkobj->zko_child_count));
+ zkobj->zko_child_count = 0;
+ zkobj->zko_children = NULL;
+ }
+
+ zkobj->zko_attr_count = 0;
+}
+
+#ifndef sysfs_attr_init
+#define sysfs_attr_init(attr) do {} while (0)
+#endif
+
+static void
+zfs_kobj_add_attr(zfs_mod_kobj_t *zkobj, int attr_num, const char *attr_name)
+{
+ VERIFY3U(attr_num, <, zkobj->zko_attr_count);
+ ASSERT(zkobj->zko_attr_list);
+ ASSERT(zkobj->zko_default_attrs);
+
+ zkobj->zko_attr_list[attr_num].name = attr_name;
+ zkobj->zko_attr_list[attr_num].mode = 0444;
+ zkobj->zko_default_attrs[attr_num] = &zkobj->zko_attr_list[attr_num];
+ sysfs_attr_init(&zkobj->zko_attr_list[attr_num]);
+}
+
+static int
+zfs_kobj_init(zfs_mod_kobj_t *zkobj, int attr_cnt, int child_cnt,
+ sysfs_show_func show_func)
+{
+ /*
+ * Initialize object's attributes. Count can be zero.
+ */
+ if (attr_cnt > 0) {
+ zkobj->zko_attr_list = kmem_zalloc(ATTR_TABLE_SIZE(attr_cnt),
+ KM_SLEEP);
+ if (zkobj->zko_attr_list == NULL)
+ return (ENOMEM);
+ }
+ /* this will always have at least one slot for NULL termination */
+ zkobj->zko_default_attrs = kmem_zalloc(DEFAULT_ATTR_SIZE(attr_cnt),
+ KM_SLEEP);
+ if (zkobj->zko_default_attrs == NULL) {
+ if (zkobj->zko_attr_list != NULL) {
+ kmem_free(zkobj->zko_attr_list,
+ ATTR_TABLE_SIZE(attr_cnt));
+ }
+ return (ENOMEM);
+ }
+ zkobj->zko_attr_count = attr_cnt;
+ zkobj->zko_kobj_type.default_attrs = zkobj->zko_default_attrs;
+
+ if (child_cnt > 0) {
+ zkobj->zko_children = kmem_zalloc(CHILD_TABLE_SIZE(child_cnt),
+ KM_SLEEP);
+ if (zkobj->zko_children == NULL) {
+ if (zkobj->zko_default_attrs != NULL) {
+ kmem_free(zkobj->zko_default_attrs,
+ DEFAULT_ATTR_SIZE(attr_cnt));
+ }
+ if (zkobj->zko_attr_list != NULL) {
+ kmem_free(zkobj->zko_attr_list,
+ ATTR_TABLE_SIZE(attr_cnt));
+ }
+ return (ENOMEM);
+ }
+ zkobj->zko_child_count = child_cnt;
+ }
+
+ zkobj->zko_sysfs_ops.show = show_func;
+ zkobj->zko_kobj_type.sysfs_ops = &zkobj->zko_sysfs_ops;
+ zkobj->zko_kobj_type.release = zfs_kobj_release;
+
+ return (0);
+}
+
+static int
+zfs_kobj_add(zfs_mod_kobj_t *zkobj, struct kobject *parent, const char *name)
+{
+ /* zko_default_attrs must be NULL terminated */
+ ASSERT(zkobj->zko_default_attrs != NULL);
+ ASSERT(zkobj->zko_default_attrs[zkobj->zko_attr_count] == NULL);
+
+ kobject_init(&zkobj->zko_kobj, &zkobj->zko_kobj_type);
+ return (kobject_add(&zkobj->zko_kobj, parent, name));
+}
+
+/*
+ * Each zfs property has these common attributes
+ */
+static const char *zprop_attrs[] = {
+ "type",
+ "readonly",
+ "setonce",
+ "visible",
+ "values",
+ "default",
+ "datasets" /* zfs properties only */
+};
+
+#define ZFS_PROP_ATTR_COUNT ARRAY_SIZE(zprop_attrs)
+#define ZPOOL_PROP_ATTR_COUNT (ZFS_PROP_ATTR_COUNT - 1)
+
+static const char *zprop_types[] = {
+ "number",
+ "string",
+ "index",
+};
+
+typedef struct zfs_type_map {
+ zfs_type_t ztm_type;
+ const char *ztm_name;
+} zfs_type_map_t;
+
+static zfs_type_map_t type_map[] = {
+ {ZFS_TYPE_FILESYSTEM, "filesystem"},
+ {ZFS_TYPE_SNAPSHOT, "snapshot"},
+ {ZFS_TYPE_VOLUME, "volume"},
+ {ZFS_TYPE_BOOKMARK, "bookmark"}
+};
+
+/*
+ * Show the content for a zfs property attribute
+ */
+static ssize_t
+zprop_sysfs_show(const char *attr_name, const zprop_desc_t *property,
+ char *buf, size_t buflen)
+{
+ const char *show_str;
+ char number[32];
+
+ /* For dataset properties list the dataset types that apply */
+ if (strcmp(attr_name, "datasets") == 0 &&
+ property->pd_types != ZFS_TYPE_POOL) {
+ int len = 0;
+
+ for (int i = 0; i < ARRAY_SIZE(type_map); i++) {
+ if (type_map[i].ztm_type & property->pd_types) {
+ len += snprintf(buf + len, buflen - len, "%s ",
+ type_map[i].ztm_name);
+ }
+ }
+ len += snprintf(buf + len, buflen - len, "\n");
+ return (len);
+ }
+
+ if (strcmp(attr_name, "type") == 0) {
+ show_str = zprop_types[property->pd_proptype];
+ } else if (strcmp(attr_name, "readonly") == 0) {
+ show_str = property->pd_attr == PROP_READONLY ? "1" : "0";
+ } else if (strcmp(attr_name, "setonce") == 0) {
+ show_str = property->pd_attr == PROP_ONETIME ? "1" : "0";
+ } else if (strcmp(attr_name, "visible") == 0) {
+ show_str = property->pd_visible ? "1" : "0";
+ } else if (strcmp(attr_name, "values") == 0) {
+ show_str = property->pd_values ? property->pd_values : "";
+ } else if (strcmp(attr_name, "default") == 0) {
+ switch (property->pd_proptype) {
+ case PROP_TYPE_NUMBER:
+ (void) snprintf(number, sizeof (number), "%llu",
+ (u_longlong_t)property->pd_numdefault);
+ show_str = number;
+ break;
+ case PROP_TYPE_STRING:
+ show_str = property->pd_strdefault ?
+ property->pd_strdefault : "";
+ break;
+ case PROP_TYPE_INDEX:
+ if (zprop_index_to_string(property->pd_propnum,
+ property->pd_numdefault, &show_str,
+ property->pd_types) != 0) {
+ show_str = "";
+ }
+ break;
+ default:
+ return (0);
+ }
+ } else {
+ return (0);
+ }
+
+ return (snprintf(buf, buflen, "%s\n", show_str));
+}
+
+static ssize_t
+dataset_property_show(struct kobject *kobj, struct attribute *attr, char *buf)
+{
+ zfs_prop_t prop = zfs_name_to_prop(kobject_name(kobj));
+ zprop_desc_t *prop_tbl = zfs_prop_get_table();
+ ssize_t len;
+
+ ASSERT3U(prop, <, ZFS_NUM_PROPS);
+
+ len = zprop_sysfs_show(attr->name, &prop_tbl[prop], buf, PAGE_SIZE);
+
+ return (len);
+}
+
+static ssize_t
+pool_property_show(struct kobject *kobj, struct attribute *attr, char *buf)
+{
+ zpool_prop_t prop = zpool_name_to_prop(kobject_name(kobj));
+ zprop_desc_t *prop_tbl = zpool_prop_get_table();
+ ssize_t len;
+
+ ASSERT3U(prop, <, ZPOOL_NUM_PROPS);
+
+ len = zprop_sysfs_show(attr->name, &prop_tbl[prop], buf, PAGE_SIZE);
+
+ return (len);
+}
+
+/*
+ * ZFS kernel feature attributes for '/sys/module/zfs/features.kernel'
+ *
+ * This list is intended for kernel features that don't have a pool feature
+ * association or that extend existing user/kernel interfaces.
+ *
+ * A user process can easily check if the running zfs kernel module
+ * supports the new feature.
+ */
+static const char *zfs_kernel_features[] = {
+ /* --> Add new kernel features here */
+ "com.delphix:vdev_initialize",
+ "org.zfsonlinux:vdev_trim",
+ "org.openzfs:l2arc_persistent",
+};
+
+#define KERNEL_FEATURE_COUNT ARRAY_SIZE(zfs_kernel_features)
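+
+/*
+ * Illustrative sketch (userspace, not part of this module): a process can
+ * test for a kernel feature by reading the feature's "supported" attribute,
+ * which kernel_feature_show() below answers with "yes".  The path layout
+ * follows the directories documented above; the helper name and buffer
+ * sizes are assumptions for the example only.
+ */
+#if 0
+#include <stdio.h>
+#include <string.h>
+
+static int
+zfs_kernel_feature_supported(const char *feature)
+{
+	char path[256], buf[8] = "";
+	FILE *fp;
+
+	(void) snprintf(path, sizeof (path),
+	    "/sys/module/zfs/features.kernel/%s/supported", feature);
+	if ((fp = fopen(path, "r")) == NULL)
+		return (0);
+	(void) fgets(buf, sizeof (buf), fp);
+	(void) fclose(fp);
+	return (strncmp(buf, "yes", 3) == 0);
+}
+#endif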
+
+static ssize_t
+kernel_feature_show(struct kobject *kobj, struct attribute *attr, char *buf)
+{
+ if (strcmp(attr->name, "supported") == 0)
+ return (snprintf(buf, PAGE_SIZE, "yes\n"));
+ return (0);
+}
+
+static void
+kernel_feature_to_kobj(zfs_mod_kobj_t *parent, int slot, const char *name)
+{
+ zfs_mod_kobj_t *zfs_kobj = &parent->zko_children[slot];
+
+ ASSERT3U(slot, <, KERNEL_FEATURE_COUNT);
+ ASSERT(name);
+
+ int err = zfs_kobj_init(zfs_kobj, 1, 0, kernel_feature_show);
+ if (err)
+ return;
+
+ zfs_kobj_add_attr(zfs_kobj, 0, "supported");
+
+ err = zfs_kobj_add(zfs_kobj, &parent->zko_kobj, name);
+ if (err)
+ zfs_kobj_release(&zfs_kobj->zko_kobj);
+}
+
+static int
+zfs_kernel_features_init(zfs_mod_kobj_t *zfs_kobj, struct kobject *parent)
+{
+ /*
+ * Create a parent kobject to host kernel features.
+ *
+ * '/sys/module/zfs/features.kernel'
+ */
+ int err = zfs_kobj_init(zfs_kobj, 0, KERNEL_FEATURE_COUNT,
+ kernel_feature_show);
+ if (err)
+ return (err);
+ err = zfs_kobj_add(zfs_kobj, parent, ZFS_SYSFS_KERNEL_FEATURES);
+ if (err) {
+ zfs_kobj_release(&zfs_kobj->zko_kobj);
+ return (err);
+ }
+
+ /*
+ * Now create a kobject for each feature.
+ *
+ * '/sys/module/zfs/features.kernel/<feature>'
+ */
+ for (int f = 0; f < KERNEL_FEATURE_COUNT; f++)
+ kernel_feature_to_kobj(zfs_kobj, f, zfs_kernel_features[f]);
+
+ return (0);
+}
+
+/*
+ * Each pool feature has these common attributes
+ */
+static const char *pool_feature_attrs[] = {
+ "description",
+ "guid",
+ "uname",
+ "readonly_compatible",
+ "required_for_mos",
+ "activate_on_enable",
+ "per_dataset"
+};
+
+#define ZPOOL_FEATURE_ATTR_COUNT ARRAY_SIZE(pool_feature_attrs)
+
+/*
+ * Show the content for the given zfs pool feature attribute
+ */
+static ssize_t
+pool_feature_show(struct kobject *kobj, struct attribute *attr, char *buf)
+{
+ spa_feature_t fid;
+
+ if (zfeature_lookup_guid(kobject_name(kobj), &fid) != 0)
+ return (0);
+
+ ASSERT3U(fid, <, SPA_FEATURES);
+
+ zfeature_flags_t flags = spa_feature_table[fid].fi_flags;
+ const char *show_str = NULL;
+
+ if (strcmp(attr->name, "description") == 0) {
+ show_str = spa_feature_table[fid].fi_desc;
+ } else if (strcmp(attr->name, "guid") == 0) {
+ show_str = spa_feature_table[fid].fi_guid;
+ } else if (strcmp(attr->name, "uname") == 0) {
+ show_str = spa_feature_table[fid].fi_uname;
+ } else if (strcmp(attr->name, "readonly_compatible") == 0) {
+ show_str = flags & ZFEATURE_FLAG_READONLY_COMPAT ? "1" : "0";
+ } else if (strcmp(attr->name, "required_for_mos") == 0) {
+ show_str = flags & ZFEATURE_FLAG_MOS ? "1" : "0";
+ } else if (strcmp(attr->name, "activate_on_enable") == 0) {
+ show_str = flags & ZFEATURE_FLAG_ACTIVATE_ON_ENABLE ? "1" : "0";
+ } else if (strcmp(attr->name, "per_dataset") == 0) {
+ show_str = flags & ZFEATURE_FLAG_PER_DATASET ? "1" : "0";
+ }
+ if (show_str == NULL)
+ return (0);
+
+ return (snprintf(buf, PAGE_SIZE, "%s\n", show_str));
+}
+
+static void
+pool_feature_to_kobj(zfs_mod_kobj_t *parent, spa_feature_t fid,
+ const char *name)
+{
+ zfs_mod_kobj_t *zfs_kobj = &parent->zko_children[fid];
+
+ ASSERT3U(fid, <, SPA_FEATURES);
+ ASSERT(name);
+
+ int err = zfs_kobj_init(zfs_kobj, ZPOOL_FEATURE_ATTR_COUNT, 0,
+ pool_feature_show);
+ if (err)
+ return;
+
+ for (int i = 0; i < ZPOOL_FEATURE_ATTR_COUNT; i++)
+ zfs_kobj_add_attr(zfs_kobj, i, pool_feature_attrs[i]);
+
+ err = zfs_kobj_add(zfs_kobj, &parent->zko_kobj, name);
+ if (err)
+ zfs_kobj_release(&zfs_kobj->zko_kobj);
+}
+
+static int
+zfs_pool_features_init(zfs_mod_kobj_t *zfs_kobj, struct kobject *parent)
+{
+ /*
+ * Create a parent kobject to host pool features.
+ *
+ * '/sys/module/zfs/features.pool'
+ */
+ int err = zfs_kobj_init(zfs_kobj, 0, SPA_FEATURES, pool_feature_show);
+ if (err)
+ return (err);
+ err = zfs_kobj_add(zfs_kobj, parent, ZFS_SYSFS_POOL_FEATURES);
+ if (err) {
+ zfs_kobj_release(&zfs_kobj->zko_kobj);
+ return (err);
+ }
+
+ /*
+ * Now create a kobject for each feature.
+ *
+ * '/sys/module/zfs/features.pool/<feature>'
+ */
+ for (spa_feature_t i = 0; i < SPA_FEATURES; i++)
+ pool_feature_to_kobj(zfs_kobj, i, spa_feature_table[i].fi_guid);
+
+ return (0);
+}
+
+typedef struct prop_to_kobj_arg {
+ zprop_desc_t *p2k_table;
+ zfs_mod_kobj_t *p2k_parent;
+ sysfs_show_func p2k_show_func;
+ int p2k_attr_count;
+} prop_to_kobj_arg_t;
+
+static int
+zprop_to_kobj(int prop, void *args)
+{
+ prop_to_kobj_arg_t *data = args;
+ zfs_mod_kobj_t *parent = data->p2k_parent;
+ zfs_mod_kobj_t *zfs_kobj = &parent->zko_children[prop];
+ const char *name = data->p2k_table[prop].pd_name;
+ int err;
+
+ ASSERT(name);
+
+ err = zfs_kobj_init(zfs_kobj, data->p2k_attr_count, 0,
+ data->p2k_show_func);
+ if (err)
+ return (ZPROP_CONT);
+
+ for (int i = 0; i < data->p2k_attr_count; i++)
+ zfs_kobj_add_attr(zfs_kobj, i, zprop_attrs[i]);
+
+ err = zfs_kobj_add(zfs_kobj, &parent->zko_kobj, name);
+ if (err)
+ zfs_kobj_release(&zfs_kobj->zko_kobj);
+
+ return (ZPROP_CONT);
+}
+
+static int
+zfs_sysfs_properties_init(zfs_mod_kobj_t *zfs_kobj, struct kobject *parent,
+ zfs_type_t type)
+{
+ prop_to_kobj_arg_t context;
+ const char *name;
+ int err;
+
+ /*
+ * Create a parent kobject to host properties.
+ *
+ * '/sys/module/zfs/properties.<type>'
+ */
+ if (type == ZFS_TYPE_POOL) {
+ name = ZFS_SYSFS_POOL_PROPERTIES;
+ context.p2k_table = zpool_prop_get_table();
+ context.p2k_attr_count = ZPOOL_PROP_ATTR_COUNT;
+ context.p2k_parent = zfs_kobj;
+ context.p2k_show_func = pool_property_show;
+ err = zfs_kobj_init(zfs_kobj, 0, ZPOOL_NUM_PROPS,
+ pool_property_show);
+ } else {
+ name = ZFS_SYSFS_DATASET_PROPERTIES;
+ context.p2k_table = zfs_prop_get_table();
+ context.p2k_attr_count = ZFS_PROP_ATTR_COUNT;
+ context.p2k_parent = zfs_kobj;
+ context.p2k_show_func = dataset_property_show;
+ err = zfs_kobj_init(zfs_kobj, 0, ZFS_NUM_PROPS,
+ dataset_property_show);
+ }
+
+ if (err)
+ return (err);
+
+ err = zfs_kobj_add(zfs_kobj, parent, name);
+ if (err) {
+ zfs_kobj_release(&zfs_kobj->zko_kobj);
+ return (err);
+ }
+
+ /*
+ * Create a kobject for each property.
+ *
+ * '/sys/module/zfs/properties.<type>/<property>'
+ */
+ (void) zprop_iter_common(zprop_to_kobj, &context, B_TRUE,
+ B_FALSE, type);
+
+ return (err);
+}
+
+void
+zfs_sysfs_init(void)
+{
+ struct kobject *parent;
+#if defined(CONFIG_ZFS) && !defined(CONFIG_ZFS_MODULE)
+ parent = kobject_create_and_add("zfs", fs_kobj);
+#else
+ parent = &(((struct module *)(THIS_MODULE))->mkobj).kobj;
+#endif
+ int err;
+
+ if (parent == NULL)
+ return;
+
+ err = zfs_kernel_features_init(&kernel_features_kobj, parent);
+ if (err)
+ return;
+
+ err = zfs_pool_features_init(&pool_features_kobj, parent);
+ if (err) {
+ zfs_kobj_fini(&kernel_features_kobj);
+ return;
+ }
+
+ err = zfs_sysfs_properties_init(&pool_props_kobj, parent,
+ ZFS_TYPE_POOL);
+ if (err) {
+ zfs_kobj_fini(&kernel_features_kobj);
+ zfs_kobj_fini(&pool_features_kobj);
+ return;
+ }
+
+ err = zfs_sysfs_properties_init(&dataset_props_kobj, parent,
+ ZFS_TYPE_FILESYSTEM);
+ if (err) {
+ zfs_kobj_fini(&kernel_features_kobj);
+ zfs_kobj_fini(&pool_features_kobj);
+ zfs_kobj_fini(&pool_props_kobj);
+ return;
+ }
+}
+
+void
+zfs_sysfs_fini(void)
+{
+ /*
+	 * Remove top-level kobjects; each will remove any child kobjects
+ */
+ zfs_kobj_fini(&kernel_features_kobj);
+ zfs_kobj_fini(&pool_features_kobj);
+ zfs_kobj_fini(&dataset_props_kobj);
+ zfs_kobj_fini(&pool_props_kobj);
+}
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_uio.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_uio.c
new file mode 100644
index 000000000000..3b0f824115f8
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_uio.c
@@ -0,0 +1,333 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
+
+/*
+ * University Copyright- Copyright (c) 1982, 1986, 1988
+ * The Regents of the University of California
+ * All Rights Reserved
+ *
+ * University Acknowledgment- Portions of this document are derived from
+ * software developed by the University of California, Berkeley, and its
+ * contributors.
+ */
+/*
+ * Copyright (c) 2015 by Chunwei Chen. All rights reserved.
+ */
+
+#ifdef _KERNEL
+
+#include <sys/types.h>
+#include <sys/uio_impl.h>
+#include <sys/sysmacros.h>
+#include <sys/strings.h>
+#include <linux/kmap_compat.h>
+#include <linux/uaccess.h>
+
+/*
+ * Move "n" bytes at byte address "p"; "rw" indicates the direction
+ * of the move, and the I/O parameters are provided in "uio", which is
+ * updated to reflect the data that was moved. Returns 0 on success or
+ * a non-zero errno on failure.
+ */
+static int
+zfs_uiomove_iov(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio)
+{
+ const struct iovec *iov = uio->uio_iov;
+ size_t skip = uio->uio_skip;
+ ulong_t cnt;
+
+ while (n && uio->uio_resid) {
+ cnt = MIN(iov->iov_len - skip, n);
+ switch (uio->uio_segflg) {
+ case UIO_USERSPACE:
+ /*
+ * p = kernel data pointer
+ * iov->iov_base = user data pointer
+ */
+ if (rw == UIO_READ) {
+ if (copy_to_user(iov->iov_base+skip, p, cnt))
+ return (EFAULT);
+ } else {
+ unsigned long b_left = 0;
+ if (uio->uio_fault_disable) {
+ if (!zfs_access_ok(VERIFY_READ,
+ (iov->iov_base + skip), cnt)) {
+ return (EFAULT);
+ }
+ pagefault_disable();
+ b_left =
+ __copy_from_user_inatomic(p,
+ (iov->iov_base + skip), cnt);
+ pagefault_enable();
+ } else {
+ b_left =
+ copy_from_user(p,
+ (iov->iov_base + skip), cnt);
+ }
+ if (b_left > 0) {
+ unsigned long c_bytes =
+ cnt - b_left;
+ uio->uio_skip += c_bytes;
+ ASSERT3U(uio->uio_skip, <,
+ iov->iov_len);
+ uio->uio_resid -= c_bytes;
+ uio->uio_loffset += c_bytes;
+ return (EFAULT);
+ }
+ }
+ break;
+ case UIO_SYSSPACE:
+ if (rw == UIO_READ)
+ bcopy(p, iov->iov_base + skip, cnt);
+ else
+ bcopy(iov->iov_base + skip, p, cnt);
+ break;
+ default:
+ ASSERT(0);
+ }
+ skip += cnt;
+ if (skip == iov->iov_len) {
+ skip = 0;
+ uio->uio_iov = (++iov);
+ uio->uio_iovcnt--;
+ }
+ uio->uio_skip = skip;
+ uio->uio_resid -= cnt;
+ uio->uio_loffset += cnt;
+ p = (caddr_t)p + cnt;
+ n -= cnt;
+ }
+ return (0);
+}
+
+static int
+zfs_uiomove_bvec(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio)
+{
+ const struct bio_vec *bv = uio->uio_bvec;
+ size_t skip = uio->uio_skip;
+ ulong_t cnt;
+
+ while (n && uio->uio_resid) {
+ void *paddr;
+ cnt = MIN(bv->bv_len - skip, n);
+
+ paddr = zfs_kmap_atomic(bv->bv_page, KM_USER1);
+ if (rw == UIO_READ)
+ bcopy(p, paddr + bv->bv_offset + skip, cnt);
+ else
+ bcopy(paddr + bv->bv_offset + skip, p, cnt);
+ zfs_kunmap_atomic(paddr, KM_USER1);
+
+ skip += cnt;
+ if (skip == bv->bv_len) {
+ skip = 0;
+ uio->uio_bvec = (++bv);
+ uio->uio_iovcnt--;
+ }
+ uio->uio_skip = skip;
+ uio->uio_resid -= cnt;
+ uio->uio_loffset += cnt;
+ p = (caddr_t)p + cnt;
+ n -= cnt;
+ }
+ return (0);
+}
+
+#if defined(HAVE_VFS_IOV_ITER)
+static int
+zfs_uiomove_iter(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio,
+ boolean_t revert)
+{
+ size_t cnt = MIN(n, uio->uio_resid);
+
+ if (uio->uio_skip)
+ iov_iter_advance(uio->uio_iter, uio->uio_skip);
+
+ if (rw == UIO_READ)
+ cnt = copy_to_iter(p, cnt, uio->uio_iter);
+ else
+ cnt = copy_from_iter(p, cnt, uio->uio_iter);
+
+ /*
+	 * When operating on a full pipe no bytes are processed; in that
+	 * case return EFAULT, which is converted to EAGAIN by the kernel's
+	 * generic_file_splice_read() function.
+ */
+ if (cnt == 0)
+ return (EFAULT);
+
+ /*
+ * Revert advancing the uio_iter. This is set by zfs_uiocopy()
+ * to avoid consuming the uio and its iov_iter structure.
+ */
+ if (revert)
+ iov_iter_revert(uio->uio_iter, cnt);
+
+ uio->uio_resid -= cnt;
+ uio->uio_loffset += cnt;
+
+ return (0);
+}
+#endif
+
+int
+zfs_uiomove(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio)
+{
+ if (uio->uio_segflg == UIO_BVEC)
+ return (zfs_uiomove_bvec(p, n, rw, uio));
+#if defined(HAVE_VFS_IOV_ITER)
+ else if (uio->uio_segflg == UIO_ITER)
+ return (zfs_uiomove_iter(p, n, rw, uio, B_FALSE));
+#endif
+ else
+ return (zfs_uiomove_iov(p, n, rw, uio));
+}
+EXPORT_SYMBOL(zfs_uiomove);
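+
+/*
+ * Illustrative sketch (not part of this change): gathering data out of a
+ * kernel iovec with zfs_uiomove().  Direct field initialization is used
+ * here for brevity; real callers normally use the zfs_uio_*_init helpers
+ * from sys/uio_impl.h.
+ */
+#if 0
+static int
+zfs_uiomove_sketch(void *dst, struct iovec *iov, int iovcnt, size_t len)
+{
+	zfs_uio_t uio;
+
+	bzero(&uio, sizeof (uio));
+	uio.uio_iov = iov;		/* kernel source segments */
+	uio.uio_iovcnt = iovcnt;
+	uio.uio_segflg = UIO_SYSSPACE;	/* iovec points at kernel memory */
+	uio.uio_resid = len;		/* total bytes available */
+
+	/* UIO_WRITE copies from the iovec into the flat buffer "dst" */
+	return (zfs_uiomove(dst, len, UIO_WRITE, &uio));
+}
+#endif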
+
+/*
+ * Fault in the pages of the first n bytes specified by the uio structure.
+ * One byte in each page is touched and the uio struct is left unmodified.
+ * Any error terminates the prefault early, since this is only a best-effort
+ * attempt to get the pages resident.
+ */
+int
+zfs_uio_prefaultpages(ssize_t n, zfs_uio_t *uio)
+{
+ if (uio->uio_segflg == UIO_SYSSPACE || uio->uio_segflg == UIO_BVEC) {
+ /* There's never a need to fault in kernel pages */
+ return (0);
+#if defined(HAVE_VFS_IOV_ITER)
+ } else if (uio->uio_segflg == UIO_ITER) {
+ /*
+		 * Since at least the Linux 4.9 kernel,
+		 * iov_iter_fault_in_readable() can be relied on to fault in
+		 * user pages when referenced.
+ */
+ if (iov_iter_fault_in_readable(uio->uio_iter, n))
+ return (EFAULT);
+#endif
+ } else {
+ /* Fault in all user pages */
+ ASSERT3S(uio->uio_segflg, ==, UIO_USERSPACE);
+ const struct iovec *iov = uio->uio_iov;
+ int iovcnt = uio->uio_iovcnt;
+ size_t skip = uio->uio_skip;
+ uint8_t tmp;
+ caddr_t p;
+
+ for (; n > 0 && iovcnt > 0; iov++, iovcnt--, skip = 0) {
+ ulong_t cnt = MIN(iov->iov_len - skip, n);
+ /* empty iov */
+ if (cnt == 0)
+ continue;
+ n -= cnt;
+ /* touch each page in this segment. */
+ p = iov->iov_base + skip;
+ while (cnt) {
+ if (get_user(tmp, (uint8_t *)p))
+ return (EFAULT);
+ ulong_t incr = MIN(cnt, PAGESIZE);
+ p += incr;
+ cnt -= incr;
+ }
+ /* touch the last byte in case it straddles a page. */
+ p--;
+ if (get_user(tmp, (uint8_t *)p))
+ return (EFAULT);
+ }
+ }
+
+ return (0);
+}
+EXPORT_SYMBOL(zfs_uio_prefaultpages);
+
+/*
+ * The same as zfs_uiomove() but doesn't modify the uio structure.
+ * Returns in cbytes how many bytes were copied.
+ */
+int
+zfs_uiocopy(void *p, size_t n, zfs_uio_rw_t rw, zfs_uio_t *uio, size_t *cbytes)
+{
+ zfs_uio_t uio_copy;
+ int ret;
+
+ bcopy(uio, &uio_copy, sizeof (zfs_uio_t));
+
+ if (uio->uio_segflg == UIO_BVEC)
+ ret = zfs_uiomove_bvec(p, n, rw, &uio_copy);
+#if defined(HAVE_VFS_IOV_ITER)
+ else if (uio->uio_segflg == UIO_ITER)
+ ret = zfs_uiomove_iter(p, n, rw, &uio_copy, B_TRUE);
+#endif
+ else
+ ret = zfs_uiomove_iov(p, n, rw, &uio_copy);
+
+ *cbytes = uio->uio_resid - uio_copy.uio_resid;
+
+ return (ret);
+}
+EXPORT_SYMBOL(zfs_uiocopy);
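+
+/*
+ * Illustrative sketch (not part of this change): zfs_uiocopy() lets a
+ * caller peek at the next bytes without consuming the uio; the bytes can
+ * then be consumed explicitly with zfs_uioskip(), defined below.
+ */
+#if 0
+static int
+zfs_uio_peek_sketch(zfs_uio_t *uio, void *buf, size_t len)
+{
+	size_t cbytes = 0;
+	int error;
+
+	/* copy out of the uio without advancing it */
+	error = zfs_uiocopy(buf, len, UIO_WRITE, uio, &cbytes);
+	if (error == 0 && cbytes > 0)
+		zfs_uioskip(uio, cbytes);	/* now consume what was read */
+
+	return (error);
+}
+#endif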
+
+/*
+ * Drop the next n bytes out of *uio.
+ */
+void
+zfs_uioskip(zfs_uio_t *uio, size_t n)
+{
+ if (n > uio->uio_resid)
+ return;
+
+ if (uio->uio_segflg == UIO_BVEC) {
+ uio->uio_skip += n;
+ while (uio->uio_iovcnt &&
+ uio->uio_skip >= uio->uio_bvec->bv_len) {
+ uio->uio_skip -= uio->uio_bvec->bv_len;
+ uio->uio_bvec++;
+ uio->uio_iovcnt--;
+ }
+#if defined(HAVE_VFS_IOV_ITER)
+ } else if (uio->uio_segflg == UIO_ITER) {
+ iov_iter_advance(uio->uio_iter, n);
+#endif
+ } else {
+ uio->uio_skip += n;
+ while (uio->uio_iovcnt &&
+ uio->uio_skip >= uio->uio_iov->iov_len) {
+ uio->uio_skip -= uio->uio_iov->iov_len;
+ uio->uio_iov++;
+ uio->uio_iovcnt--;
+ }
+ }
+ uio->uio_loffset += n;
+ uio->uio_resid -= n;
+}
+EXPORT_SYMBOL(zfs_uioskip);
+
+#endif /* _KERNEL */
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c
new file mode 100644
index 000000000000..3cc4b560e477
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c
@@ -0,0 +1,2176 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
+ */
+
+/* Portions Copyright 2010 Robert Milkowski */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/sysmacros.h>
+#include <sys/kmem.h>
+#include <sys/pathname.h>
+#include <sys/vnode.h>
+#include <sys/vfs.h>
+#include <sys/mntent.h>
+#include <sys/cmn_err.h>
+#include <sys/zfs_znode.h>
+#include <sys/zfs_vnops.h>
+#include <sys/zfs_dir.h>
+#include <sys/zil.h>
+#include <sys/fs/zfs.h>
+#include <sys/dmu.h>
+#include <sys/dsl_prop.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_deleg.h>
+#include <sys/spa.h>
+#include <sys/zap.h>
+#include <sys/sa.h>
+#include <sys/sa_impl.h>
+#include <sys/policy.h>
+#include <sys/atomic.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zfs_ctldir.h>
+#include <sys/zfs_fuid.h>
+#include <sys/zfs_quota.h>
+#include <sys/sunddi.h>
+#include <sys/dmu_objset.h>
+#include <sys/dsl_dir.h>
+#include <sys/spa_boot.h>
+#include <sys/objlist.h>
+#include <sys/zpl.h>
+#include <linux/vfs_compat.h>
+#include "zfs_comutil.h"
+
+enum {
+ TOKEN_RO,
+ TOKEN_RW,
+ TOKEN_SETUID,
+ TOKEN_NOSETUID,
+ TOKEN_EXEC,
+ TOKEN_NOEXEC,
+ TOKEN_DEVICES,
+ TOKEN_NODEVICES,
+ TOKEN_DIRXATTR,
+ TOKEN_SAXATTR,
+ TOKEN_XATTR,
+ TOKEN_NOXATTR,
+ TOKEN_ATIME,
+ TOKEN_NOATIME,
+ TOKEN_RELATIME,
+ TOKEN_NORELATIME,
+ TOKEN_NBMAND,
+ TOKEN_NONBMAND,
+ TOKEN_MNTPOINT,
+ TOKEN_LAST,
+};
+
+static const match_table_t zpl_tokens = {
+ { TOKEN_RO, MNTOPT_RO },
+ { TOKEN_RW, MNTOPT_RW },
+ { TOKEN_SETUID, MNTOPT_SETUID },
+ { TOKEN_NOSETUID, MNTOPT_NOSETUID },
+ { TOKEN_EXEC, MNTOPT_EXEC },
+ { TOKEN_NOEXEC, MNTOPT_NOEXEC },
+ { TOKEN_DEVICES, MNTOPT_DEVICES },
+ { TOKEN_NODEVICES, MNTOPT_NODEVICES },
+ { TOKEN_DIRXATTR, MNTOPT_DIRXATTR },
+ { TOKEN_SAXATTR, MNTOPT_SAXATTR },
+ { TOKEN_XATTR, MNTOPT_XATTR },
+ { TOKEN_NOXATTR, MNTOPT_NOXATTR },
+ { TOKEN_ATIME, MNTOPT_ATIME },
+ { TOKEN_NOATIME, MNTOPT_NOATIME },
+ { TOKEN_RELATIME, MNTOPT_RELATIME },
+ { TOKEN_NORELATIME, MNTOPT_NORELATIME },
+ { TOKEN_NBMAND, MNTOPT_NBMAND },
+ { TOKEN_NONBMAND, MNTOPT_NONBMAND },
+ { TOKEN_MNTPOINT, MNTOPT_MNTPOINT "=%s" },
+ { TOKEN_LAST, NULL },
+};
+
+static void
+zfsvfs_vfs_free(vfs_t *vfsp)
+{
+ if (vfsp != NULL) {
+ if (vfsp->vfs_mntpoint != NULL)
+ kmem_strfree(vfsp->vfs_mntpoint);
+
+ kmem_free(vfsp, sizeof (vfs_t));
+ }
+}
+
+static int
+zfsvfs_parse_option(char *option, int token, substring_t *args, vfs_t *vfsp)
+{
+ switch (token) {
+ case TOKEN_RO:
+ vfsp->vfs_readonly = B_TRUE;
+ vfsp->vfs_do_readonly = B_TRUE;
+ break;
+ case TOKEN_RW:
+ vfsp->vfs_readonly = B_FALSE;
+ vfsp->vfs_do_readonly = B_TRUE;
+ break;
+ case TOKEN_SETUID:
+ vfsp->vfs_setuid = B_TRUE;
+ vfsp->vfs_do_setuid = B_TRUE;
+ break;
+ case TOKEN_NOSETUID:
+ vfsp->vfs_setuid = B_FALSE;
+ vfsp->vfs_do_setuid = B_TRUE;
+ break;
+ case TOKEN_EXEC:
+ vfsp->vfs_exec = B_TRUE;
+ vfsp->vfs_do_exec = B_TRUE;
+ break;
+ case TOKEN_NOEXEC:
+ vfsp->vfs_exec = B_FALSE;
+ vfsp->vfs_do_exec = B_TRUE;
+ break;
+ case TOKEN_DEVICES:
+ vfsp->vfs_devices = B_TRUE;
+ vfsp->vfs_do_devices = B_TRUE;
+ break;
+ case TOKEN_NODEVICES:
+ vfsp->vfs_devices = B_FALSE;
+ vfsp->vfs_do_devices = B_TRUE;
+ break;
+ case TOKEN_DIRXATTR:
+ vfsp->vfs_xattr = ZFS_XATTR_DIR;
+ vfsp->vfs_do_xattr = B_TRUE;
+ break;
+ case TOKEN_SAXATTR:
+ vfsp->vfs_xattr = ZFS_XATTR_SA;
+ vfsp->vfs_do_xattr = B_TRUE;
+ break;
+ case TOKEN_XATTR:
+ vfsp->vfs_xattr = ZFS_XATTR_DIR;
+ vfsp->vfs_do_xattr = B_TRUE;
+ break;
+ case TOKEN_NOXATTR:
+ vfsp->vfs_xattr = ZFS_XATTR_OFF;
+ vfsp->vfs_do_xattr = B_TRUE;
+ break;
+ case TOKEN_ATIME:
+ vfsp->vfs_atime = B_TRUE;
+ vfsp->vfs_do_atime = B_TRUE;
+ break;
+ case TOKEN_NOATIME:
+ vfsp->vfs_atime = B_FALSE;
+ vfsp->vfs_do_atime = B_TRUE;
+ break;
+ case TOKEN_RELATIME:
+ vfsp->vfs_relatime = B_TRUE;
+ vfsp->vfs_do_relatime = B_TRUE;
+ break;
+ case TOKEN_NORELATIME:
+ vfsp->vfs_relatime = B_FALSE;
+ vfsp->vfs_do_relatime = B_TRUE;
+ break;
+ case TOKEN_NBMAND:
+ vfsp->vfs_nbmand = B_TRUE;
+ vfsp->vfs_do_nbmand = B_TRUE;
+ break;
+ case TOKEN_NONBMAND:
+ vfsp->vfs_nbmand = B_FALSE;
+ vfsp->vfs_do_nbmand = B_TRUE;
+ break;
+ case TOKEN_MNTPOINT:
+ vfsp->vfs_mntpoint = match_strdup(&args[0]);
+ if (vfsp->vfs_mntpoint == NULL)
+ return (SET_ERROR(ENOMEM));
+
+ break;
+ default:
+ break;
+ }
+
+ return (0);
+}
+
+/*
+ * Parse the raw mntopts and return a vfs_t describing the options.
+ */
+static int
+zfsvfs_parse_options(char *mntopts, vfs_t **vfsp)
+{
+ vfs_t *tmp_vfsp;
+ int error;
+
+ tmp_vfsp = kmem_zalloc(sizeof (vfs_t), KM_SLEEP);
+
+ if (mntopts != NULL) {
+ substring_t args[MAX_OPT_ARGS];
+ char *tmp_mntopts, *p, *t;
+ int token;
+
+ tmp_mntopts = t = kmem_strdup(mntopts);
+ if (tmp_mntopts == NULL)
+ return (SET_ERROR(ENOMEM));
+
+ while ((p = strsep(&t, ",")) != NULL) {
+ if (!*p)
+ continue;
+
+ args[0].to = args[0].from = NULL;
+ token = match_token(p, zpl_tokens, args);
+ error = zfsvfs_parse_option(p, token, args, tmp_vfsp);
+ if (error) {
+ kmem_strfree(tmp_mntopts);
+ zfsvfs_vfs_free(tmp_vfsp);
+ return (error);
+ }
+ }
+
+ kmem_strfree(tmp_mntopts);
+ }
+
+ *vfsp = tmp_vfsp;
+
+ return (0);
+}
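+
+/*
+ * Illustrative sketch (not part of this change): how the parser above is
+ * typically consumed.  The literal option strings are assumed to match the
+ * usual MNTOPT_* definitions ("ro", "noatime") from sys/mntent.h.
+ */
+#if 0
+static void
+zfsvfs_parse_options_sketch(void)
+{
+	char opts[] = "ro,noatime";
+	vfs_t *vfsp = NULL;
+
+	if (zfsvfs_parse_options(opts, &vfsp) == 0) {
+		/*
+		 * Both the value and its "do" flag are recorded so that
+		 * zfs_register_callbacks() can re-apply the temporary
+		 * overrides after the property callbacks are registered.
+		 */
+		ASSERT(vfsp->vfs_do_readonly && vfsp->vfs_readonly);
+		ASSERT(vfsp->vfs_do_atime && !vfsp->vfs_atime);
+		zfsvfs_vfs_free(vfsp);
+	}
+}
+#endif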
+
+boolean_t
+zfs_is_readonly(zfsvfs_t *zfsvfs)
+{
+ return (!!(zfsvfs->z_sb->s_flags & SB_RDONLY));
+}
+
+/*ARGSUSED*/
+int
+zfs_sync(struct super_block *sb, int wait, cred_t *cr)
+{
+ zfsvfs_t *zfsvfs = sb->s_fs_info;
+
+ /*
+ * Semantically, the only requirement is that the sync be initiated.
+ * The DMU syncs out txgs frequently, so there's nothing to do.
+ */
+ if (!wait)
+ return (0);
+
+ if (zfsvfs != NULL) {
+ /*
+ * Sync a specific filesystem.
+ */
+ dsl_pool_t *dp;
+
+ ZFS_ENTER(zfsvfs);
+ dp = dmu_objset_pool(zfsvfs->z_os);
+
+ /*
+ * If the system is shutting down, then skip any
+ * filesystems which may exist on a suspended pool.
+ */
+ if (spa_suspended(dp->dp_spa)) {
+ ZFS_EXIT(zfsvfs);
+ return (0);
+ }
+
+ if (zfsvfs->z_log != NULL)
+ zil_commit(zfsvfs->z_log, 0);
+
+ ZFS_EXIT(zfsvfs);
+ } else {
+ /*
+ * Sync all ZFS filesystems. This is what happens when you
+ * run sync(1). Unlike other filesystems, ZFS honors the
+ * request by waiting for all pools to commit all dirty data.
+ */
+ spa_sync_allpools();
+ }
+
+ return (0);
+}
+
+static void
+atime_changed_cb(void *arg, uint64_t newval)
+{
+ zfsvfs_t *zfsvfs = arg;
+ struct super_block *sb = zfsvfs->z_sb;
+
+ if (sb == NULL)
+ return;
+ /*
+ * Update SB_NOATIME bit in VFS super block. Since atime update is
+ * determined by atime_needs_update(), atime_needs_update() needs to
+ * return false if atime is turned off, and not unconditionally return
+ * false if atime is turned on.
+ */
+ if (newval)
+ sb->s_flags &= ~SB_NOATIME;
+ else
+ sb->s_flags |= SB_NOATIME;
+}
+
+static void
+relatime_changed_cb(void *arg, uint64_t newval)
+{
+ ((zfsvfs_t *)arg)->z_relatime = newval;
+}
+
+static void
+xattr_changed_cb(void *arg, uint64_t newval)
+{
+ zfsvfs_t *zfsvfs = arg;
+
+ if (newval == ZFS_XATTR_OFF) {
+ zfsvfs->z_flags &= ~ZSB_XATTR;
+ } else {
+ zfsvfs->z_flags |= ZSB_XATTR;
+
+ if (newval == ZFS_XATTR_SA)
+ zfsvfs->z_xattr_sa = B_TRUE;
+ else
+ zfsvfs->z_xattr_sa = B_FALSE;
+ }
+}
+
+static void
+acltype_changed_cb(void *arg, uint64_t newval)
+{
+ zfsvfs_t *zfsvfs = arg;
+
+ switch (newval) {
+ case ZFS_ACLTYPE_NFSV4:
+ case ZFS_ACLTYPE_OFF:
+ zfsvfs->z_acl_type = ZFS_ACLTYPE_OFF;
+ zfsvfs->z_sb->s_flags &= ~SB_POSIXACL;
+ break;
+ case ZFS_ACLTYPE_POSIX:
+#ifdef CONFIG_FS_POSIX_ACL
+ zfsvfs->z_acl_type = ZFS_ACLTYPE_POSIX;
+ zfsvfs->z_sb->s_flags |= SB_POSIXACL;
+#else
+ zfsvfs->z_acl_type = ZFS_ACLTYPE_OFF;
+ zfsvfs->z_sb->s_flags &= ~SB_POSIXACL;
+#endif /* CONFIG_FS_POSIX_ACL */
+ break;
+ default:
+ break;
+ }
+}
+
+static void
+blksz_changed_cb(void *arg, uint64_t newval)
+{
+ zfsvfs_t *zfsvfs = arg;
+ ASSERT3U(newval, <=, spa_maxblocksize(dmu_objset_spa(zfsvfs->z_os)));
+ ASSERT3U(newval, >=, SPA_MINBLOCKSIZE);
+ ASSERT(ISP2(newval));
+
+ zfsvfs->z_max_blksz = newval;
+}
+
+static void
+readonly_changed_cb(void *arg, uint64_t newval)
+{
+ zfsvfs_t *zfsvfs = arg;
+ struct super_block *sb = zfsvfs->z_sb;
+
+ if (sb == NULL)
+ return;
+
+ if (newval)
+ sb->s_flags |= SB_RDONLY;
+ else
+ sb->s_flags &= ~SB_RDONLY;
+}
+
+static void
+devices_changed_cb(void *arg, uint64_t newval)
+{
+}
+
+static void
+setuid_changed_cb(void *arg, uint64_t newval)
+{
+}
+
+static void
+exec_changed_cb(void *arg, uint64_t newval)
+{
+}
+
+static void
+nbmand_changed_cb(void *arg, uint64_t newval)
+{
+ zfsvfs_t *zfsvfs = arg;
+ struct super_block *sb = zfsvfs->z_sb;
+
+ if (sb == NULL)
+ return;
+
+ if (newval == TRUE)
+ sb->s_flags |= SB_MANDLOCK;
+ else
+ sb->s_flags &= ~SB_MANDLOCK;
+}
+
+static void
+snapdir_changed_cb(void *arg, uint64_t newval)
+{
+ ((zfsvfs_t *)arg)->z_show_ctldir = newval;
+}
+
+static void
+vscan_changed_cb(void *arg, uint64_t newval)
+{
+ ((zfsvfs_t *)arg)->z_vscan = newval;
+}
+
+static void
+acl_mode_changed_cb(void *arg, uint64_t newval)
+{
+ zfsvfs_t *zfsvfs = arg;
+
+ zfsvfs->z_acl_mode = newval;
+}
+
+static void
+acl_inherit_changed_cb(void *arg, uint64_t newval)
+{
+ ((zfsvfs_t *)arg)->z_acl_inherit = newval;
+}
+
+static int
+zfs_register_callbacks(vfs_t *vfsp)
+{
+ struct dsl_dataset *ds = NULL;
+ objset_t *os = NULL;
+ zfsvfs_t *zfsvfs = NULL;
+ int error = 0;
+
+ ASSERT(vfsp);
+ zfsvfs = vfsp->vfs_data;
+ ASSERT(zfsvfs);
+ os = zfsvfs->z_os;
+
+ /*
+ * The act of registering our callbacks will destroy any mount
+ * options we may have. In order to enable temporary overrides
+ * of mount options, we stash away the current values and
+ * restore them after we register the callbacks.
+ */
+ if (zfs_is_readonly(zfsvfs) || !spa_writeable(dmu_objset_spa(os))) {
+ vfsp->vfs_do_readonly = B_TRUE;
+ vfsp->vfs_readonly = B_TRUE;
+ }
+
+ /*
+ * Register property callbacks.
+ *
+ * It would probably be fine to just check for i/o error from
+ * the first prop_register(), but I guess I like to go
+ * overboard...
+ */
+ ds = dmu_objset_ds(os);
+ dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
+ error = dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_ATIME), atime_changed_cb, zfsvfs);
+ error = error ? error : dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_RELATIME), relatime_changed_cb, zfsvfs);
+ error = error ? error : dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_XATTR), xattr_changed_cb, zfsvfs);
+ error = error ? error : dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_RECORDSIZE), blksz_changed_cb, zfsvfs);
+ error = error ? error : dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_READONLY), readonly_changed_cb, zfsvfs);
+ error = error ? error : dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_DEVICES), devices_changed_cb, zfsvfs);
+ error = error ? error : dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_SETUID), setuid_changed_cb, zfsvfs);
+ error = error ? error : dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_EXEC), exec_changed_cb, zfsvfs);
+ error = error ? error : dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_SNAPDIR), snapdir_changed_cb, zfsvfs);
+ error = error ? error : dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_ACLTYPE), acltype_changed_cb, zfsvfs);
+ error = error ? error : dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_ACLMODE), acl_mode_changed_cb, zfsvfs);
+ error = error ? error : dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_ACLINHERIT), acl_inherit_changed_cb,
+ zfsvfs);
+ error = error ? error : dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_VSCAN), vscan_changed_cb, zfsvfs);
+ error = error ? error : dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_NBMAND), nbmand_changed_cb, zfsvfs);
+ dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
+ if (error)
+ goto unregister;
+
+ /*
+ * Invoke our callbacks to restore temporary mount options.
+ */
+ if (vfsp->vfs_do_readonly)
+ readonly_changed_cb(zfsvfs, vfsp->vfs_readonly);
+ if (vfsp->vfs_do_setuid)
+ setuid_changed_cb(zfsvfs, vfsp->vfs_setuid);
+ if (vfsp->vfs_do_exec)
+ exec_changed_cb(zfsvfs, vfsp->vfs_exec);
+ if (vfsp->vfs_do_devices)
+ devices_changed_cb(zfsvfs, vfsp->vfs_devices);
+ if (vfsp->vfs_do_xattr)
+ xattr_changed_cb(zfsvfs, vfsp->vfs_xattr);
+ if (vfsp->vfs_do_atime)
+ atime_changed_cb(zfsvfs, vfsp->vfs_atime);
+ if (vfsp->vfs_do_relatime)
+ relatime_changed_cb(zfsvfs, vfsp->vfs_relatime);
+ if (vfsp->vfs_do_nbmand)
+ nbmand_changed_cb(zfsvfs, vfsp->vfs_nbmand);
+
+ return (0);
+
+unregister:
+ dsl_prop_unregister_all(ds, zfsvfs);
+ return (error);
+}
+
+/*
+ * Takes a dataset, a property, a value and that value's setpoint as
+ * found in the ZAP. Checks if the property has been changed in the vfs.
+ * If so, val and setpoint will be overwritten with updated content.
+ * Otherwise, they are left unchanged.
+ */
+int
+zfs_get_temporary_prop(dsl_dataset_t *ds, zfs_prop_t zfs_prop, uint64_t *val,
+ char *setpoint)
+{
+ int error;
+ zfsvfs_t *zfvp;
+ vfs_t *vfsp;
+ objset_t *os;
+ uint64_t tmp = *val;
+
+ error = dmu_objset_from_ds(ds, &os);
+ if (error != 0)
+ return (error);
+
+ if (dmu_objset_type(os) != DMU_OST_ZFS)
+ return (EINVAL);
+
+ mutex_enter(&os->os_user_ptr_lock);
+ zfvp = dmu_objset_get_user(os);
+ mutex_exit(&os->os_user_ptr_lock);
+ if (zfvp == NULL)
+ return (ESRCH);
+
+ vfsp = zfvp->z_vfs;
+
+ switch (zfs_prop) {
+ case ZFS_PROP_ATIME:
+ if (vfsp->vfs_do_atime)
+ tmp = vfsp->vfs_atime;
+ break;
+ case ZFS_PROP_RELATIME:
+ if (vfsp->vfs_do_relatime)
+ tmp = vfsp->vfs_relatime;
+ break;
+ case ZFS_PROP_DEVICES:
+ if (vfsp->vfs_do_devices)
+ tmp = vfsp->vfs_devices;
+ break;
+ case ZFS_PROP_EXEC:
+ if (vfsp->vfs_do_exec)
+ tmp = vfsp->vfs_exec;
+ break;
+ case ZFS_PROP_SETUID:
+ if (vfsp->vfs_do_setuid)
+ tmp = vfsp->vfs_setuid;
+ break;
+ case ZFS_PROP_READONLY:
+ if (vfsp->vfs_do_readonly)
+ tmp = vfsp->vfs_readonly;
+ break;
+ case ZFS_PROP_XATTR:
+ if (vfsp->vfs_do_xattr)
+ tmp = vfsp->vfs_xattr;
+ break;
+ case ZFS_PROP_NBMAND:
+ if (vfsp->vfs_do_nbmand)
+ tmp = vfsp->vfs_nbmand;
+ break;
+ default:
+ return (ENOENT);
+ }
+
+ if (tmp != *val) {
+ (void) strcpy(setpoint, "temporary");
+ *val = tmp;
+ }
+ return (0);
+}
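+
+/*
+ * Illustrative sketch (not part of this change): how a caller consumes
+ * zfs_get_temporary_prop().  The dataset and the on-disk value are assumed
+ * to have been obtained elsewhere (e.g. via dsl_prop_get_ds()).
+ */
+#if 0
+static void
+zfs_temporary_prop_sketch(dsl_dataset_t *ds, uint64_t ondisk_atime)
+{
+	uint64_t val = ondisk_atime;
+	char setpoint[ZFS_MAX_DATASET_NAME_LEN] = "";
+
+	/*
+	 * If "atime" was temporarily overridden at mount time, val is
+	 * replaced with the value in effect and setpoint becomes
+	 * "temporary"; otherwise both are left untouched.
+	 */
+	(void) zfs_get_temporary_prop(ds, ZFS_PROP_ATIME, &val, setpoint);
+}
+#endif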
+
+/*
+ * Associate this zfsvfs with the given objset, which must be owned.
+ * This will cache a bunch of on-disk state from the objset in the
+ * zfsvfs.
+ */
+static int
+zfsvfs_init(zfsvfs_t *zfsvfs, objset_t *os)
+{
+ int error;
+ uint64_t val;
+
+ zfsvfs->z_max_blksz = SPA_OLD_MAXBLOCKSIZE;
+ zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE;
+ zfsvfs->z_os = os;
+
+ error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version);
+ if (error != 0)
+ return (error);
+ if (zfsvfs->z_version >
+ zfs_zpl_version_map(spa_version(dmu_objset_spa(os)))) {
+ (void) printk("Can't mount a version %lld file system "
+		    "on a version %lld pool. Pool must be upgraded to mount "
+ "this file system.\n", (u_longlong_t)zfsvfs->z_version,
+ (u_longlong_t)spa_version(dmu_objset_spa(os)));
+ return (SET_ERROR(ENOTSUP));
+ }
+ error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &val);
+ if (error != 0)
+ return (error);
+ zfsvfs->z_norm = (int)val;
+
+ error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &val);
+ if (error != 0)
+ return (error);
+ zfsvfs->z_utf8 = (val != 0);
+
+ error = zfs_get_zplprop(os, ZFS_PROP_CASE, &val);
+ if (error != 0)
+ return (error);
+ zfsvfs->z_case = (uint_t)val;
+
+ if ((error = zfs_get_zplprop(os, ZFS_PROP_ACLTYPE, &val)) != 0)
+ return (error);
+ zfsvfs->z_acl_type = (uint_t)val;
+
+ /*
+ * Fold case on file systems that are always or sometimes case
+ * insensitive.
+ */
+ if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
+ zfsvfs->z_case == ZFS_CASE_MIXED)
+ zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER;
+
+ zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
+ zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os);
+
+ uint64_t sa_obj = 0;
+ if (zfsvfs->z_use_sa) {
+ /* should either have both of these objects or none */
+ error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1,
+ &sa_obj);
+ if (error != 0)
+ return (error);
+
+ error = zfs_get_zplprop(os, ZFS_PROP_XATTR, &val);
+ if ((error == 0) && (val == ZFS_XATTR_SA))
+ zfsvfs->z_xattr_sa = B_TRUE;
+ }
+
+ error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1,
+ &zfsvfs->z_root);
+ if (error != 0)
+ return (error);
+ ASSERT(zfsvfs->z_root != 0);
+
+ error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1,
+ &zfsvfs->z_unlinkedobj);
+ if (error != 0)
+ return (error);
+
+ error = zap_lookup(os, MASTER_NODE_OBJ,
+ zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA],
+ 8, 1, &zfsvfs->z_userquota_obj);
+ if (error == ENOENT)
+ zfsvfs->z_userquota_obj = 0;
+ else if (error != 0)
+ return (error);
+
+ error = zap_lookup(os, MASTER_NODE_OBJ,
+ zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA],
+ 8, 1, &zfsvfs->z_groupquota_obj);
+ if (error == ENOENT)
+ zfsvfs->z_groupquota_obj = 0;
+ else if (error != 0)
+ return (error);
+
+ error = zap_lookup(os, MASTER_NODE_OBJ,
+ zfs_userquota_prop_prefixes[ZFS_PROP_PROJECTQUOTA],
+ 8, 1, &zfsvfs->z_projectquota_obj);
+ if (error == ENOENT)
+ zfsvfs->z_projectquota_obj = 0;
+ else if (error != 0)
+ return (error);
+
+ error = zap_lookup(os, MASTER_NODE_OBJ,
+ zfs_userquota_prop_prefixes[ZFS_PROP_USEROBJQUOTA],
+ 8, 1, &zfsvfs->z_userobjquota_obj);
+ if (error == ENOENT)
+ zfsvfs->z_userobjquota_obj = 0;
+ else if (error != 0)
+ return (error);
+
+ error = zap_lookup(os, MASTER_NODE_OBJ,
+ zfs_userquota_prop_prefixes[ZFS_PROP_GROUPOBJQUOTA],
+ 8, 1, &zfsvfs->z_groupobjquota_obj);
+ if (error == ENOENT)
+ zfsvfs->z_groupobjquota_obj = 0;
+ else if (error != 0)
+ return (error);
+
+ error = zap_lookup(os, MASTER_NODE_OBJ,
+ zfs_userquota_prop_prefixes[ZFS_PROP_PROJECTOBJQUOTA],
+ 8, 1, &zfsvfs->z_projectobjquota_obj);
+ if (error == ENOENT)
+ zfsvfs->z_projectobjquota_obj = 0;
+ else if (error != 0)
+ return (error);
+
+ error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1,
+ &zfsvfs->z_fuid_obj);
+ if (error == ENOENT)
+ zfsvfs->z_fuid_obj = 0;
+ else if (error != 0)
+ return (error);
+
+ error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SHARES_DIR, 8, 1,
+ &zfsvfs->z_shares_dir);
+ if (error == ENOENT)
+ zfsvfs->z_shares_dir = 0;
+ else if (error != 0)
+ return (error);
+
+ error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END,
+ &zfsvfs->z_attr_table);
+ if (error != 0)
+ return (error);
+
+ if (zfsvfs->z_version >= ZPL_VERSION_SA)
+ sa_register_update_callback(os, zfs_sa_upgrade);
+
+ return (0);
+}
+
+int
+zfsvfs_create(const char *osname, boolean_t readonly, zfsvfs_t **zfvp)
+{
+ objset_t *os;
+ zfsvfs_t *zfsvfs;
+ int error;
+ boolean_t ro = (readonly || (strchr(osname, '@') != NULL));
+
+ zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
+
+ error = dmu_objset_own(osname, DMU_OST_ZFS, ro, B_TRUE, zfsvfs, &os);
+ if (error != 0) {
+ kmem_free(zfsvfs, sizeof (zfsvfs_t));
+ return (error);
+ }
+
+ error = zfsvfs_create_impl(zfvp, zfsvfs, os);
+ if (error != 0) {
+ dmu_objset_disown(os, B_TRUE, zfsvfs);
+ }
+ return (error);
+}
+
+
+/*
+ * Note: zfsvfs is assumed to be malloc'd, and will be freed by this function
+ * on a failure. Do not pass in a statically allocated zfsvfs.
+ */
+int
+zfsvfs_create_impl(zfsvfs_t **zfvp, zfsvfs_t *zfsvfs, objset_t *os)
+{
+ int error;
+
+ zfsvfs->z_vfs = NULL;
+ zfsvfs->z_sb = NULL;
+ zfsvfs->z_parent = zfsvfs;
+
+ mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL);
+ list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
+ offsetof(znode_t, z_link_node));
+ ZFS_TEARDOWN_INIT(zfsvfs);
+ rw_init(&zfsvfs->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL);
+ rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL);
+
+ int size = MIN(1 << (highbit64(zfs_object_mutex_size) - 1),
+ ZFS_OBJ_MTX_MAX);
+ zfsvfs->z_hold_size = size;
+ zfsvfs->z_hold_trees = vmem_zalloc(sizeof (avl_tree_t) * size,
+ KM_SLEEP);
+ zfsvfs->z_hold_locks = vmem_zalloc(sizeof (kmutex_t) * size, KM_SLEEP);
+ for (int i = 0; i != size; i++) {
+ avl_create(&zfsvfs->z_hold_trees[i], zfs_znode_hold_compare,
+ sizeof (znode_hold_t), offsetof(znode_hold_t, zh_node));
+ mutex_init(&zfsvfs->z_hold_locks[i], NULL, MUTEX_DEFAULT, NULL);
+ }
+
+ error = zfsvfs_init(zfsvfs, os);
+ if (error != 0) {
+ *zfvp = NULL;
+ zfsvfs_free(zfsvfs);
+ return (error);
+ }
+
+ zfsvfs->z_drain_task = TASKQID_INVALID;
+ zfsvfs->z_draining = B_FALSE;
+ zfsvfs->z_drain_cancel = B_TRUE;
+
+ *zfvp = zfsvfs;
+ return (0);
+}
+
+static int
+zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting)
+{
+ int error;
+ boolean_t readonly = zfs_is_readonly(zfsvfs);
+
+ error = zfs_register_callbacks(zfsvfs->z_vfs);
+ if (error)
+ return (error);
+
+ zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data);
+
+ /*
+ * If we are not mounting (ie: online recv), then we don't
+ * have to worry about replaying the log as we blocked all
+ * operations out since we closed the ZIL.
+ */
+ if (mounting) {
+ ASSERT3P(zfsvfs->z_kstat.dk_kstats, ==, NULL);
+ dataset_kstats_create(&zfsvfs->z_kstat, zfsvfs->z_os);
+
+ /*
+ * During replay we remove the read only flag to
+ * allow replays to succeed.
+ */
+ if (readonly != 0) {
+ readonly_changed_cb(zfsvfs, B_FALSE);
+ } else {
+ zap_stats_t zs;
+ if (zap_get_stats(zfsvfs->z_os, zfsvfs->z_unlinkedobj,
+ &zs) == 0) {
+ dataset_kstats_update_nunlinks_kstat(
+ &zfsvfs->z_kstat, zs.zs_num_entries);
+ dprintf_ds(zfsvfs->z_os->os_dsl_dataset,
+ "num_entries in unlinked set: %llu",
+ zs.zs_num_entries);
+ }
+ zfs_unlinked_drain(zfsvfs);
+ dsl_dir_t *dd = zfsvfs->z_os->os_dsl_dataset->ds_dir;
+ dd->dd_activity_cancelled = B_FALSE;
+ }
+
+ /*
+ * Parse and replay the intent log.
+ *
+ * Because of ziltest, this must be done after
+ * zfs_unlinked_drain(). (Further note: ziltest
+ * doesn't use readonly mounts, where
+ * zfs_unlinked_drain() isn't called.) This is because
+ * ziltest causes spa_sync() to think it's committed,
+ * but actually it is not, so the intent log contains
+ * many txg's worth of changes.
+ *
+ * In particular, if object N is in the unlinked set in
+ * the last txg to actually sync, then it could be
+ * actually freed in a later txg and then reallocated
+ * in a yet later txg. This would write a "create
+ * object N" record to the intent log. Normally, this
+ * would be fine because the spa_sync() would have
+ * written out the fact that object N is free, before
+ * we could write the "create object N" intent log
+ * record.
+ *
+ * But when we are in ziltest mode, we advance the "open
+ * txg" without actually spa_sync()-ing the changes to
+ * disk. So we would see that object N is still
+ * allocated and in the unlinked set, and there is an
+ * intent log record saying to allocate it.
+ */
+ if (spa_writeable(dmu_objset_spa(zfsvfs->z_os))) {
+ if (zil_replay_disable) {
+ zil_destroy(zfsvfs->z_log, B_FALSE);
+ } else {
+ zfsvfs->z_replay = B_TRUE;
+ zil_replay(zfsvfs->z_os, zfsvfs,
+ zfs_replay_vector);
+ zfsvfs->z_replay = B_FALSE;
+ }
+ }
+
+ /* restore readonly bit */
+ if (readonly != 0)
+ readonly_changed_cb(zfsvfs, B_TRUE);
+ }
+
+ /*
+ * Set the objset user_ptr to track its zfsvfs.
+ */
+ mutex_enter(&zfsvfs->z_os->os_user_ptr_lock);
+ dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
+ mutex_exit(&zfsvfs->z_os->os_user_ptr_lock);
+
+ return (0);
+}
+
+void
+zfsvfs_free(zfsvfs_t *zfsvfs)
+{
+ int i, size = zfsvfs->z_hold_size;
+
+ zfs_fuid_destroy(zfsvfs);
+
+ mutex_destroy(&zfsvfs->z_znodes_lock);
+ mutex_destroy(&zfsvfs->z_lock);
+ list_destroy(&zfsvfs->z_all_znodes);
+ ZFS_TEARDOWN_DESTROY(zfsvfs);
+ rw_destroy(&zfsvfs->z_teardown_inactive_lock);
+ rw_destroy(&zfsvfs->z_fuid_lock);
+ for (i = 0; i != size; i++) {
+ avl_destroy(&zfsvfs->z_hold_trees[i]);
+ mutex_destroy(&zfsvfs->z_hold_locks[i]);
+ }
+ vmem_free(zfsvfs->z_hold_trees, sizeof (avl_tree_t) * size);
+ vmem_free(zfsvfs->z_hold_locks, sizeof (kmutex_t) * size);
+ zfsvfs_vfs_free(zfsvfs->z_vfs);
+ dataset_kstats_destroy(&zfsvfs->z_kstat);
+ kmem_free(zfsvfs, sizeof (zfsvfs_t));
+}
+
+static void
+zfs_set_fuid_feature(zfsvfs_t *zfsvfs)
+{
+ zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
+ zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os);
+}
+
+static void
+zfs_unregister_callbacks(zfsvfs_t *zfsvfs)
+{
+ objset_t *os = zfsvfs->z_os;
+
+ if (!dmu_objset_is_snapshot(os))
+ dsl_prop_unregister_all(dmu_objset_ds(os), zfsvfs);
+}
+
+#ifdef HAVE_MLSLABEL
+/*
+ * Check that the hex label string is appropriate for the dataset being
+ * mounted into the global_zone proper.
+ *
+ * Return an error if the hex label string is not default or
+ * admin_low/admin_high. For admin_low labels, the corresponding
+ * dataset must be readonly.
+ */
+int
+zfs_check_global_label(const char *dsname, const char *hexsl)
+{
+ if (strcasecmp(hexsl, ZFS_MLSLABEL_DEFAULT) == 0)
+ return (0);
+ if (strcasecmp(hexsl, ADMIN_HIGH) == 0)
+ return (0);
+ if (strcasecmp(hexsl, ADMIN_LOW) == 0) {
+ /* must be readonly */
+ uint64_t rdonly;
+
+ if (dsl_prop_get_integer(dsname,
+ zfs_prop_to_name(ZFS_PROP_READONLY), &rdonly, NULL))
+ return (SET_ERROR(EACCES));
+ return (rdonly ? 0 : SET_ERROR(EACCES));
+ }
+ return (SET_ERROR(EACCES));
+}
+#endif /* HAVE_MLSLABEL */
+
+static int
+zfs_statfs_project(zfsvfs_t *zfsvfs, znode_t *zp, struct kstatfs *statp,
+ uint32_t bshift)
+{
+ char buf[20 + DMU_OBJACCT_PREFIX_LEN];
+ uint64_t offset = DMU_OBJACCT_PREFIX_LEN;
+ uint64_t quota;
+ uint64_t used;
+ int err;
+
+ strlcpy(buf, DMU_OBJACCT_PREFIX, DMU_OBJACCT_PREFIX_LEN + 1);
+ err = zfs_id_to_fuidstr(zfsvfs, NULL, zp->z_projid, buf + offset,
+ sizeof (buf) - offset, B_FALSE);
+ if (err)
+ return (err);
+
+ if (zfsvfs->z_projectquota_obj == 0)
+ goto objs;
+
+ err = zap_lookup(zfsvfs->z_os, zfsvfs->z_projectquota_obj,
+ buf + offset, 8, 1, &quota);
+ if (err == ENOENT)
+ goto objs;
+ else if (err)
+ return (err);
+
+ err = zap_lookup(zfsvfs->z_os, DMU_PROJECTUSED_OBJECT,
+ buf + offset, 8, 1, &used);
+ if (unlikely(err == ENOENT)) {
+ uint32_t blksize;
+ u_longlong_t nblocks;
+
+ /*
+		 * Quota accounting is async, so a race is possible here;
+		 * there is at least one object with the given project ID.
+ */
+ sa_object_size(zp->z_sa_hdl, &blksize, &nblocks);
+ if (unlikely(zp->z_blksz == 0))
+ blksize = zfsvfs->z_max_blksz;
+
+ used = blksize * nblocks;
+ } else if (err) {
+ return (err);
+ }
+
+ statp->f_blocks = quota >> bshift;
+ statp->f_bfree = (quota > used) ? ((quota - used) >> bshift) : 0;
+ statp->f_bavail = statp->f_bfree;
+
+objs:
+ if (zfsvfs->z_projectobjquota_obj == 0)
+ return (0);
+
+ err = zap_lookup(zfsvfs->z_os, zfsvfs->z_projectobjquota_obj,
+ buf + offset, 8, 1, &quota);
+ if (err == ENOENT)
+ return (0);
+ else if (err)
+ return (err);
+
+ err = zap_lookup(zfsvfs->z_os, DMU_PROJECTUSED_OBJECT,
+ buf, 8, 1, &used);
+ if (unlikely(err == ENOENT)) {
+ /*
+		 * Quota accounting is async, so a race is possible here;
+		 * there is at least one object with the given project ID.
+ */
+ used = 1;
+ } else if (err) {
+ return (err);
+ }
+
+ statp->f_files = quota;
+ statp->f_ffree = (quota > used) ? (quota - used) : 0;
+
+ return (0);
+}
+
+int
+zfs_statvfs(struct inode *ip, struct kstatfs *statp)
+{
+ zfsvfs_t *zfsvfs = ITOZSB(ip);
+ uint64_t refdbytes, availbytes, usedobjs, availobjs;
+ int err = 0;
+
+ ZFS_ENTER(zfsvfs);
+
+ dmu_objset_space(zfsvfs->z_os,
+ &refdbytes, &availbytes, &usedobjs, &availobjs);
+
+ uint64_t fsid = dmu_objset_fsid_guid(zfsvfs->z_os);
+ /*
+ * The underlying storage pool actually uses multiple block
+	 * sizes. Under Solaris, frsize (fragment size) is reported as
+ * the smallest block size we support, and bsize (block size)
+ * as the filesystem's maximum block size. Unfortunately,
+ * under Linux the fragment size and block size are often used
+ * interchangeably. Thus we are forced to report both of them
+ * as the filesystem's maximum block size.
+ */
+ statp->f_frsize = zfsvfs->z_max_blksz;
+ statp->f_bsize = zfsvfs->z_max_blksz;
+ uint32_t bshift = fls(statp->f_bsize) - 1;
+
+ /*
+ * The following report "total" blocks of various kinds in
+ * the file system, but reported in terms of f_bsize - the
+ * "preferred" size.
+ */
+
+ /* Round up so we never have a filesystem using 0 blocks. */
+ refdbytes = P2ROUNDUP(refdbytes, statp->f_bsize);
+ statp->f_blocks = (refdbytes + availbytes) >> bshift;
+ statp->f_bfree = availbytes >> bshift;
+ statp->f_bavail = statp->f_bfree; /* no root reservation */
+
+ /*
+ * statvfs() should really be called statufs(), because it assumes
+ * static metadata. ZFS doesn't preallocate files, so the best
+ * we can do is report the max that could possibly fit in f_files,
+ * and that minus the number actually used in f_ffree.
+ * For f_ffree, report the smaller of the number of objects available
+ * and the number of blocks (each object will take at least a block).
+ */
+ statp->f_ffree = MIN(availobjs, availbytes >> DNODE_SHIFT);
+ statp->f_files = statp->f_ffree + usedobjs;
+ statp->f_fsid.val[0] = (uint32_t)fsid;
+ statp->f_fsid.val[1] = (uint32_t)(fsid >> 32);
+ statp->f_type = ZFS_SUPER_MAGIC;
+ statp->f_namelen = MAXNAMELEN - 1;
+
+ /*
+ * We have all of 40 characters to stuff a string here.
+ * Is there anything useful we could/should provide?
+ */
+ bzero(statp->f_spare, sizeof (statp->f_spare));
+
+ if (dmu_objset_projectquota_enabled(zfsvfs->z_os) &&
+ dmu_objset_projectquota_present(zfsvfs->z_os)) {
+ znode_t *zp = ITOZ(ip);
+
+ if (zp->z_pflags & ZFS_PROJINHERIT && zp->z_projid &&
+ zpl_is_valid_projid(zp->z_projid))
+ err = zfs_statfs_project(zfsvfs, zp, statp, bshift);
+ }
+
+ ZFS_EXIT(zfsvfs);
+ return (err);
+}
+
+static int
+zfs_root(zfsvfs_t *zfsvfs, struct inode **ipp)
+{
+ znode_t *rootzp;
+ int error;
+
+ ZFS_ENTER(zfsvfs);
+
+ error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp);
+ if (error == 0)
+ *ipp = ZTOI(rootzp);
+
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+/*
+ * Linux kernels older than 3.1 do not support a per-filesystem shrinker.
+ * To accommodate this we must improvise and manually walk the list of znodes
+ * attempting to prune dentries in order to be able to drop the inodes.
+ *
+ * To avoid scanning the same znodes multiple times they are always rotated
+ * to the end of the z_all_znodes list. New znodes are inserted at the
+ * end of the list so we're always scanning the oldest znodes first.
+ */
+static int
+zfs_prune_aliases(zfsvfs_t *zfsvfs, unsigned long nr_to_scan)
+{
+ znode_t **zp_array, *zp;
+ int max_array = MIN(nr_to_scan, PAGE_SIZE * 8 / sizeof (znode_t *));
+ int objects = 0;
+ int i = 0, j = 0;
+
+ zp_array = kmem_zalloc(max_array * sizeof (znode_t *), KM_SLEEP);
+
+ mutex_enter(&zfsvfs->z_znodes_lock);
+ while ((zp = list_head(&zfsvfs->z_all_znodes)) != NULL) {
+
+ if ((i++ > nr_to_scan) || (j >= max_array))
+ break;
+
+ ASSERT(list_link_active(&zp->z_link_node));
+ list_remove(&zfsvfs->z_all_znodes, zp);
+ list_insert_tail(&zfsvfs->z_all_znodes, zp);
+
+ /* Skip active znodes and .zfs entries */
+ if (MUTEX_HELD(&zp->z_lock) || zp->z_is_ctldir)
+ continue;
+
+ if (igrab(ZTOI(zp)) == NULL)
+ continue;
+
+ zp_array[j] = zp;
+ j++;
+ }
+ mutex_exit(&zfsvfs->z_znodes_lock);
+
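+ /*
+ * Prune dentries of the collected znodes outside of z_znodes_lock,
+ * then drop the reference taken by igrab() above.
+ */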
+ for (i = 0; i < j; i++) {
+ zp = zp_array[i];
+
+ ASSERT3P(zp, !=, NULL);
+ d_prune_aliases(ZTOI(zp));
+
+ if (atomic_read(&ZTOI(zp)->i_count) == 1)
+ objects++;
+
+ zrele(zp);
+ }
+
+ kmem_free(zp_array, max_array * sizeof (znode_t *));
+
+ return (objects);
+}
+
+/*
+ * The ARC has requested that the filesystem drop entries from the dentry
+ * and inode caches. This can occur when the ARC needs to free metadata
+ * blocks but can't because they are all pinned by entries in these caches.
+ */
+int
+zfs_prune(struct super_block *sb, unsigned long nr_to_scan, int *objects)
+{
+ zfsvfs_t *zfsvfs = sb->s_fs_info;
+ int error = 0;
+ struct shrinker *shrinker = &sb->s_shrink;
+ struct shrink_control sc = {
+ .nr_to_scan = nr_to_scan,
+ .gfp_mask = GFP_KERNEL,
+ };
+
+ ZFS_ENTER(zfsvfs);
+
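+ /*
+ * Invoke the kernel's per-superblock shrinker using whichever
+ * callback interface this kernel provides; fall back to manually
+ * pruning via zfs_prune_aliases() when no shrinker is available.
+ */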
+#if defined(HAVE_SPLIT_SHRINKER_CALLBACK) && \
+ defined(SHRINK_CONTROL_HAS_NID) && \
+ defined(SHRINKER_NUMA_AWARE)
+ if (sb->s_shrink.flags & SHRINKER_NUMA_AWARE) {
+ *objects = 0;
+ for_each_online_node(sc.nid) {
+ *objects += (*shrinker->scan_objects)(shrinker, &sc);
+ }
+ } else {
+ *objects = (*shrinker->scan_objects)(shrinker, &sc);
+ }
+
+#elif defined(HAVE_SPLIT_SHRINKER_CALLBACK)
+ *objects = (*shrinker->scan_objects)(shrinker, &sc);
+#elif defined(HAVE_SINGLE_SHRINKER_CALLBACK)
+ *objects = (*shrinker->shrink)(shrinker, &sc);
+#elif defined(HAVE_D_PRUNE_ALIASES)
+#define D_PRUNE_ALIASES_IS_DEFAULT
+ *objects = zfs_prune_aliases(zfsvfs, nr_to_scan);
+#else
+#error "No available dentry and inode cache pruning mechanism."
+#endif
+
+#if defined(HAVE_D_PRUNE_ALIASES) && !defined(D_PRUNE_ALIASES_IS_DEFAULT)
+#undef D_PRUNE_ALIASES_IS_DEFAULT
+ /*
+ * Fall back to zfs_prune_aliases if the kernel's per-superblock
+ * shrinker couldn't free anything, possibly due to the inodes being
+ * allocated in a different memcg.
+ */
+ if (*objects == 0)
+ *objects = zfs_prune_aliases(zfsvfs, nr_to_scan);
+#endif
+
+ ZFS_EXIT(zfsvfs);
+
+ dprintf_ds(zfsvfs->z_os->os_dsl_dataset,
+ "pruning, nr_to_scan=%lu objects=%d error=%d\n",
+ nr_to_scan, *objects, error);
+
+ return (error);
+}
+
+/*
+ * Teardown the zfsvfs_t.
+ *
+ * Note, if 'unmounting' is FALSE, we return with the 'z_teardown_lock'
+ * and 'z_teardown_inactive_lock' held.
+ */
+static int
+zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting)
+{
+ znode_t *zp;
+
+ zfs_unlinked_drain_stop_wait(zfsvfs);
+
+ /*
+ * If someone has not already unmounted this file system,
+ * drain the zrele_taskq to ensure all active references to the
+ * zfsvfs_t have been handled; only then can it be safely destroyed.
+ */
+ if (zfsvfs->z_os) {
+ /*
+ * If we're unmounting we have to wait for the list to
+ * drain completely.
+ *
+ * If we're not unmounting there's no guarantee the list
+ * will drain completely, but iputs run from the taskq
+ * may add the parents of dir-based xattrs to the taskq
+ * so we want to wait for these.
+ *
+ * We can safely read z_nr_znodes without locking because the
+ * VFS has already blocked operations which add to the
+ * z_all_znodes list and thus increment z_nr_znodes.
+ */
+ int round = 0;
+ while (zfsvfs->z_nr_znodes > 0) {
+ taskq_wait_outstanding(dsl_pool_zrele_taskq(
+ dmu_objset_pool(zfsvfs->z_os)), 0);
+ if (++round > 1 && !unmounting)
+ break;
+ }
+ }
+
+ ZFS_TEARDOWN_ENTER_WRITE(zfsvfs, FTAG);
+
+ if (!unmounting) {
+ /*
+ * We purge the parent filesystem's super block as the
+ * parent filesystem and all of its snapshots have their
+ * inode's super block set to the parent's filesystem's
+ * super block. Note, 'z_parent' is self referential
+ * for non-snapshots.
+ */
+ shrink_dcache_sb(zfsvfs->z_parent->z_sb);
+ }
+
+ /*
+ * Close the zil. NB: Can't close the zil while zfs_inactive
+ * threads are blocked as zil_close can call zfs_inactive.
+ */
+ if (zfsvfs->z_log) {
+ zil_close(zfsvfs->z_log);
+ zfsvfs->z_log = NULL;
+ }
+
+ rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_WRITER);
+
+ /*
+ * If we are not unmounting (ie: online recv) and someone already
+ * unmounted this file system while we were doing the switcheroo,
+ * or a reopen of z_os failed then just bail out now.
+ */
+ if (!unmounting && (zfsvfs->z_unmounted || zfsvfs->z_os == NULL)) {
+ rw_exit(&zfsvfs->z_teardown_inactive_lock);
+ ZFS_TEARDOWN_EXIT(zfsvfs, FTAG);
+ return (SET_ERROR(EIO));
+ }
+
+ /*
+ * At this point there are no VFS ops active, and any new VFS ops
+ * will fail with EIO since we have z_teardown_lock for writer (only
+ * relevant for forced unmount).
+ *
+ * Release all holds on dbufs. We also grab an extra reference to all
+ * the remaining inodes so that the kernel does not attempt to free
+ * any inodes of a suspended fs. This can cause deadlocks since the
+ * zfs_resume_fs() process may involve starting threads, which might
+ * attempt to free unreferenced inodes to free up memory for the new
+ * thread.
+ */
+ if (!unmounting) {
+ mutex_enter(&zfsvfs->z_znodes_lock);
+ for (zp = list_head(&zfsvfs->z_all_znodes); zp != NULL;
+ zp = list_next(&zfsvfs->z_all_znodes, zp)) {
+ if (zp->z_sa_hdl)
+ zfs_znode_dmu_fini(zp);
+ if (igrab(ZTOI(zp)) != NULL)
+ zp->z_suspended = B_TRUE;
+
+ }
+ mutex_exit(&zfsvfs->z_znodes_lock);
+ }
+
+ /*
+ * If we are unmounting, set the unmounted flag and let new VFS ops
+ * unblock. zfs_inactive will have the unmounted behavior, and all
+ * other VFS ops will fail with EIO.
+ */
+ if (unmounting) {
+ zfsvfs->z_unmounted = B_TRUE;
+ rw_exit(&zfsvfs->z_teardown_inactive_lock);
+ ZFS_TEARDOWN_EXIT(zfsvfs, FTAG);
+ }
+
+ /*
+ * z_os will be NULL if there was an error in attempting to reopen
+ * zfsvfs, so just return as the properties had already been
+ * unregistered and cached data had been evicted before.
+ */
+ if (zfsvfs->z_os == NULL)
+ return (0);
+
+ /*
+ * Unregister properties.
+ */
+ zfs_unregister_callbacks(zfsvfs);
+
+ /*
+ * Evict cached data. We must write out any dirty data before
+ * disowning the dataset.
+ */
+ objset_t *os = zfsvfs->z_os;
+ boolean_t os_dirty = B_FALSE;
+ for (int t = 0; t < TXG_SIZE; t++) {
+ if (dmu_objset_is_dirty(os, t)) {
+ os_dirty = B_TRUE;
+ break;
+ }
+ }
+ if (!zfs_is_readonly(zfsvfs) && os_dirty) {
+ txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
+ }
+ dmu_objset_evict_dbufs(zfsvfs->z_os);
+ dsl_dir_t *dd = os->os_dsl_dataset->ds_dir;
+ dsl_dir_cancel_waiters(dd);
+
+ return (0);
+}
+
+#if defined(HAVE_SUPER_SETUP_BDI_NAME)
+atomic_long_t zfs_bdi_seq = ATOMIC_LONG_INIT(0);
+#endif
+
+int
+zfs_domount(struct super_block *sb, zfs_mnt_t *zm, int silent)
+{
+ const char *osname = zm->mnt_osname;
+ struct inode *root_inode = NULL;
+ uint64_t recordsize;
+ int error = 0;
+ zfsvfs_t *zfsvfs = NULL;
+ vfs_t *vfs = NULL;
+
+ ASSERT(zm);
+ ASSERT(osname);
+
+ error = zfsvfs_parse_options(zm->mnt_data, &vfs);
+ if (error)
+ return (error);
+
+ error = zfsvfs_create(osname, vfs->vfs_readonly, &zfsvfs);
+ if (error) {
+ zfsvfs_vfs_free(vfs);
+ goto out;
+ }
+
+ if ((error = dsl_prop_get_integer(osname, "recordsize",
+ &recordsize, NULL))) {
+ zfsvfs_vfs_free(vfs);
+ goto out;
+ }
+
+ vfs->vfs_data = zfsvfs;
+ zfsvfs->z_vfs = vfs;
+ zfsvfs->z_sb = sb;
+ sb->s_fs_info = zfsvfs;
+ sb->s_magic = ZFS_SUPER_MAGIC;
+ sb->s_maxbytes = MAX_LFS_FILESIZE;
+ sb->s_time_gran = 1;
+ sb->s_blocksize = recordsize;
+ sb->s_blocksize_bits = ilog2(recordsize);
+
+ error = -zpl_bdi_setup(sb, "zfs");
+ if (error)
+ goto out;
+
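+ /* ZFS does its own prefetching, so disable generic VFS readahead. */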
+ sb->s_bdi->ra_pages = 0;
+
+ /* Set callback operations for the file system. */
+ sb->s_op = &zpl_super_operations;
+ sb->s_xattr = zpl_xattr_handlers;
+ sb->s_export_op = &zpl_export_operations;
+ sb->s_d_op = &zpl_dentry_operations;
+
+ /* Set features for file system. */
+ zfs_set_fuid_feature(zfsvfs);
+
+ if (dmu_objset_is_snapshot(zfsvfs->z_os)) {
+ uint64_t pval;
+
+ atime_changed_cb(zfsvfs, B_FALSE);
+ readonly_changed_cb(zfsvfs, B_TRUE);
+ if ((error = dsl_prop_get_integer(osname,
+ "xattr", &pval, NULL)))
+ goto out;
+ xattr_changed_cb(zfsvfs, pval);
+ if ((error = dsl_prop_get_integer(osname,
+ "acltype", &pval, NULL)))
+ goto out;
+ acltype_changed_cb(zfsvfs, pval);
+ zfsvfs->z_issnap = B_TRUE;
+ zfsvfs->z_os->os_sync = ZFS_SYNC_DISABLED;
+ zfsvfs->z_snap_defer_time = jiffies;
+
+ mutex_enter(&zfsvfs->z_os->os_user_ptr_lock);
+ dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
+ mutex_exit(&zfsvfs->z_os->os_user_ptr_lock);
+ } else {
+ if ((error = zfsvfs_setup(zfsvfs, B_TRUE)))
+ goto out;
+ }
+
+ /* Allocate a root inode for the filesystem. */
+ error = zfs_root(zfsvfs, &root_inode);
+ if (error) {
+ (void) zfs_umount(sb);
+ goto out;
+ }
+
+ /* Allocate a root dentry for the filesystem */
+ sb->s_root = d_make_root(root_inode);
+ if (sb->s_root == NULL) {
+ (void) zfs_umount(sb);
+ error = SET_ERROR(ENOMEM);
+ goto out;
+ }
+
+ if (!zfsvfs->z_issnap)
+ zfsctl_create(zfsvfs);
+
+ zfsvfs->z_arc_prune = arc_add_prune_callback(zpl_prune_sb, sb);
+out:
+ if (error) {
+ if (zfsvfs != NULL) {
+ dmu_objset_disown(zfsvfs->z_os, B_TRUE, zfsvfs);
+ zfsvfs_free(zfsvfs);
+ }
+ /*
+ * make sure we don't have dangling sb->s_fs_info which
+ * zfs_preumount will use.
+ */
+ sb->s_fs_info = NULL;
+ }
+
+ return (error);
+}
+
+/*
+ * Called when an unmount is requested and certain sanity checks have
+ * already passed. At this point no dentries or inodes have been reclaimed
+ * from their respective caches. We drop the extra reference on the .zfs
+ * control directory to allow everything to be reclaimed. All snapshots
+ * must already have been unmounted to reach this point.
+ */
+void
+zfs_preumount(struct super_block *sb)
+{
+ zfsvfs_t *zfsvfs = sb->s_fs_info;
+
+ /* zfsvfs is NULL when zfs_domount fails during mount */
+ if (zfsvfs) {
+ zfs_unlinked_drain_stop_wait(zfsvfs);
+ zfsctl_destroy(sb->s_fs_info);
+ /*
+ * Wait for zrele_async before entering evict_inodes in
+ * generic_shutdown_super. The reason we must finish before
+ * evict_inodes is that, when lazytime is on or when zfs_purgedir
+ * calls zfs_zget, zrele would bump i_count from 0 to 1. This
+ * would race with the i_count check in evict_inodes. This means
+ * it could destroy the inode while we are still using it.
+ *
+ * We wait for two passes. xattr directories in the first pass
+ * may add xattr entries in zfs_purgedir, so in the second pass
+ * we wait for them. We don't use taskq_wait here because it is
+ * a pool wide taskq. Other mounted filesystems can constantly
+ * do zrele_async and there's no guarantee when taskq will be
+ * empty.
+ */
+ taskq_wait_outstanding(dsl_pool_zrele_taskq(
+ dmu_objset_pool(zfsvfs->z_os)), 0);
+ taskq_wait_outstanding(dsl_pool_zrele_taskq(
+ dmu_objset_pool(zfsvfs->z_os)), 0);
+ }
+}
+
+/*
+ * Called once all other unmount-related teardown has occurred.
+ * It is our responsibility to release any remaining infrastructure.
+ */
+/*ARGSUSED*/
+int
+zfs_umount(struct super_block *sb)
+{
+ zfsvfs_t *zfsvfs = sb->s_fs_info;
+ objset_t *os;
+
+ if (zfsvfs->z_arc_prune != NULL)
+ arc_remove_prune_callback(zfsvfs->z_arc_prune);
+ VERIFY(zfsvfs_teardown(zfsvfs, B_TRUE) == 0);
+ os = zfsvfs->z_os;
+ zpl_bdi_destroy(sb);
+
+ /*
+ * z_os will be NULL if there was an error in
+ * attempting to reopen zfsvfs.
+ */
+ if (os != NULL) {
+ /*
+ * Unset the objset user_ptr.
+ */
+ mutex_enter(&os->os_user_ptr_lock);
+ dmu_objset_set_user(os, NULL);
+ mutex_exit(&os->os_user_ptr_lock);
+
+ /*
+ * Finally release the objset
+ */
+ dmu_objset_disown(os, B_TRUE, zfsvfs);
+ }
+
+ zfsvfs_free(zfsvfs);
+ return (0);
+}
+
+int
+zfs_remount(struct super_block *sb, int *flags, zfs_mnt_t *zm)
+{
+ zfsvfs_t *zfsvfs = sb->s_fs_info;
+ vfs_t *vfsp;
+ boolean_t issnap = dmu_objset_is_snapshot(zfsvfs->z_os);
+ int error;
+
+ if ((issnap || !spa_writeable(dmu_objset_spa(zfsvfs->z_os))) &&
+ !(*flags & SB_RDONLY)) {
+ *flags |= SB_RDONLY;
+ return (EROFS);
+ }
+
+ error = zfsvfs_parse_options(zm->mnt_data, &vfsp);
+ if (error)
+ return (error);
+
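+ /* When switching to read-only, flush any dirty data to disk first. */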
+ if (!zfs_is_readonly(zfsvfs) && (*flags & SB_RDONLY))
+ txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
+
+ zfs_unregister_callbacks(zfsvfs);
+ zfsvfs_vfs_free(zfsvfs->z_vfs);
+
+ vfsp->vfs_data = zfsvfs;
+ zfsvfs->z_vfs = vfsp;
+ if (!issnap)
+ (void) zfs_register_callbacks(vfsp);
+
+ return (error);
+}
+
+int
+zfs_vget(struct super_block *sb, struct inode **ipp, fid_t *fidp)
+{
+ zfsvfs_t *zfsvfs = sb->s_fs_info;
+ znode_t *zp;
+ uint64_t object = 0;
+ uint64_t fid_gen = 0;
+ uint64_t gen_mask;
+ uint64_t zp_gen;
+ int i, err;
+
+ *ipp = NULL;
+
+ if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) {
+ zfid_short_t *zfid = (zfid_short_t *)fidp;
+
+ for (i = 0; i < sizeof (zfid->zf_object); i++)
+ object |= ((uint64_t)zfid->zf_object[i]) << (8 * i);
+
+ for (i = 0; i < sizeof (zfid->zf_gen); i++)
+ fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i);
+ } else {
+ return (SET_ERROR(EINVAL));
+ }
+
+ /* LONG_FID_LEN means snapdirs */
+ if (fidp->fid_len == LONG_FID_LEN) {
+ zfid_long_t *zlfid = (zfid_long_t *)fidp;
+ uint64_t objsetid = 0;
+ uint64_t setgen = 0;
+
+ for (i = 0; i < sizeof (zlfid->zf_setid); i++)
+ objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i);
+
+ for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
+ setgen |= ((uint64_t)zlfid->zf_setgen[i]) << (8 * i);
+
+ if (objsetid != ZFSCTL_INO_SNAPDIRS - object) {
+ dprintf("snapdir fid: objsetid (%llu) != "
+ "ZFSCTL_INO_SNAPDIRS (%llu) - object (%llu)\n",
+ objsetid, ZFSCTL_INO_SNAPDIRS, object);
+
+ return (SET_ERROR(EINVAL));
+ }
+
+ if (fid_gen > 1 || setgen != 0) {
+ dprintf("snapdir fid: fid_gen (%llu) and setgen "
+ "(%llu)\n", fid_gen, setgen);
+ return (SET_ERROR(EINVAL));
+ }
+
+ return (zfsctl_snapdir_vget(sb, objsetid, fid_gen, ipp));
+ }
+
+ ZFS_ENTER(zfsvfs);
+ /* A zero fid_gen means we are in the .zfs control directories */
+ if (fid_gen == 0 &&
+ (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) {
+ *ipp = zfsvfs->z_ctldir;
+ ASSERT(*ipp != NULL);
+ if (object == ZFSCTL_INO_SNAPDIR) {
+ VERIFY(zfsctl_root_lookup(*ipp, "snapshot", ipp,
+ 0, kcred, NULL, NULL) == 0);
+ } else {
+ igrab(*ipp);
+ }
+ ZFS_EXIT(zfsvfs);
+ return (0);
+ }
+
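+ /*
+ * 'i' still holds sizeof (zfid->zf_gen) from the loop above, so
+ * the mask covers exactly the generation bytes stored in the fid.
+ */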
+ gen_mask = -1ULL >> (64 - 8 * i);
+
+ dprintf("getting %llu [%llu mask %llx]\n", object, fid_gen, gen_mask);
+ if ((err = zfs_zget(zfsvfs, object, &zp))) {
+ ZFS_EXIT(zfsvfs);
+ return (err);
+ }
+
+ /* Don't export xattr stuff */
+ if (zp->z_pflags & ZFS_XATTR) {
+ zrele(zp);
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(ENOENT));
+ }
+
+ (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &zp_gen,
+ sizeof (uint64_t));
+ zp_gen = zp_gen & gen_mask;
+ if (zp_gen == 0)
+ zp_gen = 1;
+ if ((fid_gen == 0) && (zfsvfs->z_root == object))
+ fid_gen = zp_gen;
+ if (zp->z_unlinked || zp_gen != fid_gen) {
+ dprintf("znode gen (%llu) != fid gen (%llu)\n", zp_gen,
+ fid_gen);
+ zrele(zp);
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(ENOENT));
+ }
+
+ *ipp = ZTOI(zp);
+ if (*ipp)
+ zfs_znode_update_vfs(ITOZ(*ipp));
+
+ ZFS_EXIT(zfsvfs);
+ return (0);
+}
+
+/*
+ * Block out VFS ops and close zfsvfs_t
+ *
+ * Note, if successful, then we return with the 'z_teardown_lock' and
+ * 'z_teardown_inactive_lock' write held. We leave ownership of the underlying
+ * dataset and objset intact so that they can be atomically handed off during
+ * a subsequent rollback or recv operation and the resume thereafter.
+ */
+int
+zfs_suspend_fs(zfsvfs_t *zfsvfs)
+{
+ int error;
+
+ if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0)
+ return (error);
+
+ return (0);
+}
+
+/*
+ * Rebuild SA and release VOPs. Note that ownership of the underlying dataset
+ * is an invariant across any of the operations that can be performed while the
+ * filesystem was suspended. Whether it succeeded or failed, the preconditions
+ * are the same: the relevant objset and associated dataset are owned by
+ * zfsvfs, held, and long held on entry.
+ */
+int
+zfs_resume_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds)
+{
+ int err, err2;
+ znode_t *zp;
+
+ ASSERT(ZFS_TEARDOWN_WRITE_HELD(zfsvfs));
+ ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock));
+
+ /*
+ * We already own this, so just update the objset_t, as the one we
+ * had before may have been evicted.
+ */
+ objset_t *os;
+ VERIFY3P(ds->ds_owner, ==, zfsvfs);
+ VERIFY(dsl_dataset_long_held(ds));
+ dsl_pool_t *dp = spa_get_dsl(dsl_dataset_get_spa(ds));
+ dsl_pool_config_enter(dp, FTAG);
+ VERIFY0(dmu_objset_from_ds(ds, &os));
+ dsl_pool_config_exit(dp, FTAG);
+
+ err = zfsvfs_init(zfsvfs, os);
+ if (err != 0)
+ goto bail;
+
+ ds->ds_dir->dd_activity_cancelled = B_FALSE;
+ VERIFY(zfsvfs_setup(zfsvfs, B_FALSE) == 0);
+
+ zfs_set_fuid_feature(zfsvfs);
+ zfsvfs->z_rollback_time = jiffies;
+
+ /*
+ * Attempt to re-establish all the active inodes with their
+ * dbufs. If a zfs_rezget() fails, then we unhash the inode
+ * and mark it stale. This prevents a collision if a new
+ * inode/object is created which must use the same inode
+ * number. The stale inode will be released when the
+ * VFS prunes the dentry holding the remaining references
+ * on the stale inode.
+ */
+ mutex_enter(&zfsvfs->z_znodes_lock);
+ for (zp = list_head(&zfsvfs->z_all_znodes); zp;
+ zp = list_next(&zfsvfs->z_all_znodes, zp)) {
+ err2 = zfs_rezget(zp);
+ if (err2) {
+ remove_inode_hash(ZTOI(zp));
+ zp->z_is_stale = B_TRUE;
+ }
+
+ /* see comment in zfs_suspend_fs() */
+ if (zp->z_suspended) {
+ zfs_zrele_async(zp);
+ zp->z_suspended = B_FALSE;
+ }
+ }
+ mutex_exit(&zfsvfs->z_znodes_lock);
+
+ if (!zfs_is_readonly(zfsvfs) && !zfsvfs->z_unmounted) {
+ /*
+ * zfs_suspend_fs() could have interrupted freeing
+ * of dnodes. We need to restart this freeing so
+ * that we don't "leak" the space.
+ */
+ zfs_unlinked_drain(zfsvfs);
+ }
+
+ /*
+ * Most of the time zfs_suspend_fs is used for changing the contents
+ * of the underlying dataset. ZFS rollback and receive operations
+ * might create files for which negative dentries are present in
+ * the cache. Since walking the dcache would require a lot of GPL-only
+ * code duplication, it's much easier on these rather rare occasions
+ * just to flush the whole dcache for the given dataset/filesystem.
+ */
+ shrink_dcache_sb(zfsvfs->z_sb);
+
+bail:
+ if (err != 0)
+ zfsvfs->z_unmounted = B_TRUE;
+
+ /* release the VFS ops */
+ rw_exit(&zfsvfs->z_teardown_inactive_lock);
+ ZFS_TEARDOWN_EXIT(zfsvfs, FTAG);
+
+ if (err != 0) {
+ /*
+ * Since we couldn't setup the sa framework, try to force
+ * unmount this file system.
+ */
+ if (zfsvfs->z_os)
+ (void) zfs_umount(zfsvfs->z_sb);
+ }
+ return (err);
+}
+
+/*
+ * Release VOPs and unmount a suspended filesystem.
+ */
+int
+zfs_end_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds)
+{
+ ASSERT(ZFS_TEARDOWN_WRITE_HELD(zfsvfs));
+ ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock));
+
+ /*
+ * We already own this, so just hold and rele it to update the
+ * objset_t, as the one we had before may have been evicted.
+ */
+ objset_t *os;
+ VERIFY3P(ds->ds_owner, ==, zfsvfs);
+ VERIFY(dsl_dataset_long_held(ds));
+ dsl_pool_t *dp = spa_get_dsl(dsl_dataset_get_spa(ds));
+ dsl_pool_config_enter(dp, FTAG);
+ VERIFY0(dmu_objset_from_ds(ds, &os));
+ dsl_pool_config_exit(dp, FTAG);
+ zfsvfs->z_os = os;
+
+ /* release the VOPs */
+ rw_exit(&zfsvfs->z_teardown_inactive_lock);
+ ZFS_TEARDOWN_EXIT(zfsvfs, FTAG);
+
+ /*
+ * Try to force unmount this file system.
+ */
+ (void) zfs_umount(zfsvfs->z_sb);
+ zfsvfs->z_unmounted = B_TRUE;
+ return (0);
+}
+
+/*
+ * Automounted snapshots rely on periodic revalidation
+ * to defer their automatic unmounting while they remain in use.
+ */
+
+inline void
+zfs_exit_fs(zfsvfs_t *zfsvfs)
+{
+ if (!zfsvfs->z_issnap)
+ return;
+
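+ /*
+ * Push the snapshot's auto-unmount deadline forward, at most once
+ * per half of the expiry interval.
+ */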
+ if (time_after(jiffies, zfsvfs->z_snap_defer_time +
+ MAX(zfs_expire_snapshot * HZ / 2, HZ))) {
+ zfsvfs->z_snap_defer_time = jiffies;
+ zfsctl_snapshot_unmount_delay(zfsvfs->z_os->os_spa,
+ dmu_objset_id(zfsvfs->z_os),
+ zfs_expire_snapshot);
+ }
+}
+
+int
+zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers)
+{
+ int error;
+ objset_t *os = zfsvfs->z_os;
+ dmu_tx_t *tx;
+
+ if (newvers < ZPL_VERSION_INITIAL || newvers > ZPL_VERSION)
+ return (SET_ERROR(EINVAL));
+
+ if (newvers < zfsvfs->z_version)
+ return (SET_ERROR(EINVAL));
+
+ if (zfs_spa_version_map(newvers) >
+ spa_version(dmu_objset_spa(zfsvfs->z_os)))
+ return (SET_ERROR(ENOTSUP));
+
+ tx = dmu_tx_create(os);
+ dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_FALSE, ZPL_VERSION_STR);
+ if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) {
+ dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE,
+ ZFS_SA_ATTRS);
+ dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
+ }
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ dmu_tx_abort(tx);
+ return (error);
+ }
+
+ error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR,
+ 8, 1, &newvers, tx);
+
+ if (error) {
+ dmu_tx_commit(tx);
+ return (error);
+ }
+
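+ /*
+ * When upgrading to a version that supports system attributes,
+ * create the SA master node and register the SA upgrade callback.
+ */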
+ if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) {
+ uint64_t sa_obj;
+
+ ASSERT3U(spa_version(dmu_objset_spa(zfsvfs->z_os)), >=,
+ SPA_VERSION_SA);
+ sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE,
+ DMU_OT_NONE, 0, tx);
+
+ error = zap_add(os, MASTER_NODE_OBJ,
+ ZFS_SA_ATTRS, 8, 1, &sa_obj, tx);
+ ASSERT0(error);
+
+ VERIFY(0 == sa_set_sa_object(os, sa_obj));
+ sa_register_update_callback(os, zfs_sa_upgrade);
+ }
+
+ spa_history_log_internal_ds(dmu_objset_ds(os), "upgrade", tx,
+ "from %llu to %llu", zfsvfs->z_version, newvers);
+
+ dmu_tx_commit(tx);
+
+ zfsvfs->z_version = newvers;
+ os->os_version = newvers;
+
+ zfs_set_fuid_feature(zfsvfs);
+
+ return (0);
+}
+
+/*
+ * Read a property stored within the master node.
+ */
+int
+zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value)
+{
+ uint64_t *cached_copy = NULL;
+
+ /*
+ * Figure out where in the objset_t the cached copy would live, if it
+ * is available for the requested property.
+ */
+ if (os != NULL) {
+ switch (prop) {
+ case ZFS_PROP_VERSION:
+ cached_copy = &os->os_version;
+ break;
+ case ZFS_PROP_NORMALIZE:
+ cached_copy = &os->os_normalization;
+ break;
+ case ZFS_PROP_UTF8ONLY:
+ cached_copy = &os->os_utf8only;
+ break;
+ case ZFS_PROP_CASE:
+ cached_copy = &os->os_casesensitivity;
+ break;
+ default:
+ break;
+ }
+ }
+ if (cached_copy != NULL && *cached_copy != OBJSET_PROP_UNINITIALIZED) {
+ *value = *cached_copy;
+ return (0);
+ }
+
+ /*
+ * If the property wasn't cached, look up the file system's value for
+ * the property. For the version property, we look up a slightly
+ * different string.
+ */
+ const char *pname;
+ int error = ENOENT;
+ if (prop == ZFS_PROP_VERSION)
+ pname = ZPL_VERSION_STR;
+ else
+ pname = zfs_prop_to_name(prop);
+
+ if (os != NULL) {
+ ASSERT3U(os->os_phys->os_type, ==, DMU_OST_ZFS);
+ error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value);
+ }
+
+ if (error == ENOENT) {
+ /* No value set, use the default value */
+ switch (prop) {
+ case ZFS_PROP_VERSION:
+ *value = ZPL_VERSION;
+ break;
+ case ZFS_PROP_NORMALIZE:
+ case ZFS_PROP_UTF8ONLY:
+ *value = 0;
+ break;
+ case ZFS_PROP_CASE:
+ *value = ZFS_CASE_SENSITIVE;
+ break;
+ case ZFS_PROP_ACLTYPE:
+ *value = ZFS_ACLTYPE_OFF;
+ break;
+ default:
+ return (error);
+ }
+ error = 0;
+ }
+
+ /*
+ * If one of the methods for getting the property value above worked,
+ * copy it into the objset_t's cache.
+ */
+ if (error == 0 && cached_copy != NULL) {
+ *cached_copy = *value;
+ }
+
+ return (error);
+}
+
+/*
+ * Return true if the corresponding vfs's unmounted flag is set.
+ * Otherwise return false.
+ * If this function returns true we know VFS unmount has been initiated.
+ */
+boolean_t
+zfs_get_vfs_flag_unmounted(objset_t *os)
+{
+ zfsvfs_t *zfvp;
+ boolean_t unmounted = B_FALSE;
+
+ ASSERT(dmu_objset_type(os) == DMU_OST_ZFS);
+
+ mutex_enter(&os->os_user_ptr_lock);
+ zfvp = dmu_objset_get_user(os);
+ if (zfvp != NULL && zfvp->z_unmounted)
+ unmounted = B_TRUE;
+ mutex_exit(&os->os_user_ptr_lock);
+
+ return (unmounted);
+}
+
+/*ARGSUSED*/
+void
+zfsvfs_update_fromname(const char *oldname, const char *newname)
+{
+ /*
+ * We don't need to do anything here; the devname is always current by
+ * virtue of zfsvfs->z_sb->s_op->show_devname.
+ */
+}
+
+void
+zfs_init(void)
+{
+ zfsctl_init();
+ zfs_znode_init();
+ dmu_objset_register_type(DMU_OST_ZFS, zpl_get_file_info);
+ register_filesystem(&zpl_fs_type);
+}
+
+void
+zfs_fini(void)
+{
+ /*
+ * We don't use taskq_wait_outstanding() here because
+ * zpl_posix_acl_free might add more tasks.
+ */
+ taskq_wait(system_delay_taskq);
+ taskq_wait(system_taskq);
+ unregister_filesystem(&zpl_fs_type);
+ zfs_znode_fini();
+ zfsctl_fini();
+}
+
+#if defined(_KERNEL)
+EXPORT_SYMBOL(zfs_suspend_fs);
+EXPORT_SYMBOL(zfs_resume_fs);
+EXPORT_SYMBOL(zfs_set_version);
+EXPORT_SYMBOL(zfsvfs_create);
+EXPORT_SYMBOL(zfsvfs_free);
+EXPORT_SYMBOL(zfs_is_readonly);
+EXPORT_SYMBOL(zfs_domount);
+EXPORT_SYMBOL(zfs_preumount);
+EXPORT_SYMBOL(zfs_umount);
+EXPORT_SYMBOL(zfs_remount);
+EXPORT_SYMBOL(zfs_statvfs);
+EXPORT_SYMBOL(zfs_vget);
+EXPORT_SYMBOL(zfs_prune);
+#endif
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c
new file mode 100644
index 000000000000..84c33b541ea3
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c
@@ -0,0 +1,4010 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2015 by Chunwei Chen. All rights reserved.
+ * Copyright 2017 Nexenta Systems, Inc.
+ */
+
+/* Portions Copyright 2007 Jeremy Teo */
+/* Portions Copyright 2010 Robert Milkowski */
+
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/time.h>
+#include <sys/sysmacros.h>
+#include <sys/vfs.h>
+#include <sys/file.h>
+#include <sys/stat.h>
+#include <sys/kmem.h>
+#include <sys/taskq.h>
+#include <sys/uio.h>
+#include <sys/vmsystm.h>
+#include <sys/atomic.h>
+#include <sys/pathname.h>
+#include <sys/cmn_err.h>
+#include <sys/errno.h>
+#include <sys/zfs_dir.h>
+#include <sys/zfs_acl.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/fs/zfs.h>
+#include <sys/dmu.h>
+#include <sys/dmu_objset.h>
+#include <sys/spa.h>
+#include <sys/txg.h>
+#include <sys/dbuf.h>
+#include <sys/zap.h>
+#include <sys/sa.h>
+#include <sys/policy.h>
+#include <sys/sunddi.h>
+#include <sys/sid.h>
+#include <sys/zfs_ctldir.h>
+#include <sys/zfs_fuid.h>
+#include <sys/zfs_quota.h>
+#include <sys/zfs_sa.h>
+#include <sys/zfs_vnops.h>
+#include <sys/zfs_rlock.h>
+#include <sys/cred.h>
+#include <sys/zpl.h>
+#include <sys/zil.h>
+#include <sys/sa_impl.h>
+
+/*
+ * Programming rules.
+ *
+ * Each vnode op performs some logical unit of work. To do this, the ZPL must
+ * properly lock its in-core state, create a DMU transaction, do the work,
+ * record this work in the intent log (ZIL), commit the DMU transaction,
+ * and wait for the intent log to commit if it is a synchronous operation.
+ * Moreover, the vnode ops must work in both normal and log replay context.
+ * The ordering of events is important to avoid deadlocks and references
+ * to freed memory. The example below illustrates the following Big Rules:
+ *
+ * (1) A check must be made in each zfs thread for a mounted file system.
+ * This is done while avoiding races by using ZFS_ENTER(zfsvfs).
+ * A ZFS_EXIT(zfsvfs) is needed before all returns. Any znodes
+ * must be checked with ZFS_VERIFY_ZP(zp). Both of these macros
+ * can return EIO from the calling function.
+ *
+ * (2) zrele() should always be the last thing except for zil_commit() (if
+ * necessary) and ZFS_EXIT(). This is for 3 reasons: First, if it's the
+ * last reference, the vnode/znode can be freed, so the zp may point to
+ * freed memory. Second, the last reference will call zfs_zinactive(),
+ * which may induce a lot of work -- pushing cached pages (which acquires
+ * range locks) and syncing out cached atime changes. Third,
+ * zfs_zinactive() may require a new tx, which could deadlock the system
+ * if you were already holding one. This deadlock occurs because the tx
+ * currently being operated on prevents a txg from syncing, which
+ * prevents the new tx from progressing, resulting in a deadlock. If you
+ * must call zrele() within a tx, use zfs_zrele_async(). Note that iput()
+ * is a synonym for zrele().
+ *
+ * (3) All range locks must be grabbed before calling dmu_tx_assign(),
+ * as they can span dmu_tx_assign() calls.
+ *
+ * (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to
+ * dmu_tx_assign(). This is critical because we don't want to block
+ * while holding locks.
+ *
+ * If no ZPL locks are held (aside from ZFS_ENTER()), use TXG_WAIT. This
+ * reduces lock contention and CPU usage when we must wait (note that if
+ * throughput is constrained by the storage, nearly every transaction
+ * must wait).
+ *
+ * Note, in particular, that if a lock is sometimes acquired before
+ * the tx assigns, and sometimes after (e.g. z_lock), then failing
+ * to use a non-blocking assign can deadlock the system. The scenario:
+ *
+ * Thread A has grabbed a lock before calling dmu_tx_assign().
+ * Thread B is in an already-assigned tx, and blocks for this lock.
+ * Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open()
+ * forever, because the previous txg can't quiesce until B's tx commits.
+ *
+ * If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT,
+ * then drop all locks, call dmu_tx_wait(), and try again. On subsequent
+ * calls to dmu_tx_assign(), pass TXG_NOTHROTTLE in addition to TXG_NOWAIT,
+ * to indicate that this operation has already called dmu_tx_wait().
+ * This will ensure that we don't retry forever, waiting a short bit
+ * each time.
+ *
+ * (5) If the operation succeeded, generate the intent log entry for it
+ * before dropping locks. This ensures that the ordering of events
+ * in the intent log matches the order in which they actually occurred.
+ * During ZIL replay the zfs_log_* functions will update the sequence
+ * number to indicate the zil transaction has replayed.
+ *
+ * (6) At the end of each vnode op, the DMU tx must always commit,
+ * regardless of whether there were any errors.
+ *
+ * (7) After dropping all locks, invoke zil_commit(zilog, foid)
+ * to ensure that synchronous semantics are provided when necessary.
+ *
+ * In general, this is how things should be ordered in each vnode op:
+ *
+ * ZFS_ENTER(zfsvfs); // exit if unmounted
+ * top:
+ * zfs_dirent_lock(&dl, ...) // lock directory entry (may igrab())
+ * rw_enter(...); // grab any other locks you need
+ * tx = dmu_tx_create(...); // get DMU tx
+ * dmu_tx_hold_*(); // hold each object you might modify
+ * error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
+ * if (error) {
+ * rw_exit(...); // drop locks
+ * zfs_dirent_unlock(dl); // unlock directory entry
+ * zrele(...); // release held znodes
+ * if (error == ERESTART) {
+ * waited = B_TRUE;
+ * dmu_tx_wait(tx);
+ * dmu_tx_abort(tx);
+ * goto top;
+ * }
+ * dmu_tx_abort(tx); // abort DMU tx
+ * ZFS_EXIT(zfsvfs); // finished in zfs
+ * return (error); // really out of space
+ * }
+ * error = do_real_work(); // do whatever this VOP does
+ * if (error == 0)
+ * zfs_log_*(...); // on success, make ZIL entry
+ * dmu_tx_commit(tx); // commit DMU tx -- error or not
+ * rw_exit(...); // drop locks
+ * zfs_dirent_unlock(dl); // unlock directory entry
+ * zrele(...); // release held znodes
+ * zil_commit(zilog, foid); // synchronous when necessary
+ * ZFS_EXIT(zfsvfs); // finished in zfs
+ * return (error); // done, report error
+ */
+
+/*
+ * Virus scanning is unsupported. It would be possible to add a hook
+ * here to perform the required virus scan. This could be done
+ * entirely in the kernel or potentially as an update to invoke a
+ * scanning utility.
+ */
+static int
+zfs_vscan(struct inode *ip, cred_t *cr, int async)
+{
+ return (0);
+}
+
+/* ARGSUSED */
+int
+zfs_open(struct inode *ip, int mode, int flag, cred_t *cr)
+{
+ znode_t *zp = ITOZ(ip);
+ zfsvfs_t *zfsvfs = ITOZSB(ip);
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ /* Honor ZFS_APPENDONLY file attribute */
+ if ((mode & FMODE_WRITE) && (zp->z_pflags & ZFS_APPENDONLY) &&
+ ((flag & O_APPEND) == 0)) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EPERM));
+ }
+
+ /* Virus scan eligible files on open */
+ if (!zfs_has_ctldir(zp) && zfsvfs->z_vscan && S_ISREG(ip->i_mode) &&
+ !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) {
+ if (zfs_vscan(ip, cr, 0) != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EACCES));
+ }
+ }
+
+ /* Keep a count of the synchronous opens in the znode */
+ if (flag & O_SYNC)
+ atomic_inc_32(&zp->z_sync_cnt);
+
+ ZFS_EXIT(zfsvfs);
+ return (0);
+}
+
+/* ARGSUSED */
+int
+zfs_close(struct inode *ip, int flag, cred_t *cr)
+{
+ znode_t *zp = ITOZ(ip);
+ zfsvfs_t *zfsvfs = ITOZSB(ip);
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ /* Decrement the synchronous opens in the znode */
+ if (flag & O_SYNC)
+ atomic_dec_32(&zp->z_sync_cnt);
+
+ if (!zfs_has_ctldir(zp) && zfsvfs->z_vscan && S_ISREG(ip->i_mode) &&
+ !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0)
+ VERIFY(zfs_vscan(ip, cr, 1) == 0);
+
+ ZFS_EXIT(zfsvfs);
+ return (0);
+}
+
+#if defined(_KERNEL)
+/*
+ * When a file is memory mapped, we must keep the IO data synchronized
+ * between the DMU cache and the memory mapped pages. What this means:
+ *
+ * On Write: If we find a memory mapped page, we write to *both*
+ * the page and the dmu buffer.
+ */
+void
+update_pages(znode_t *zp, int64_t start, int len, objset_t *os)
+{
+ struct inode *ip = ZTOI(zp);
+ struct address_space *mp = ip->i_mapping;
+ struct page *pp;
+ uint64_t nbytes;
+ int64_t off;
+ void *pb;
+
+ off = start & (PAGE_SIZE-1);
+ for (start &= PAGE_MASK; len > 0; start += PAGE_SIZE) {
+ nbytes = MIN(PAGE_SIZE - off, len);
+
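+ /*
+ * If this range is resident in the page cache, copy the freshly
+ * written data from the DMU back into the page so that mmap()ed
+ * views of the file stay coherent.
+ */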
+ pp = find_lock_page(mp, start >> PAGE_SHIFT);
+ if (pp) {
+ if (mapping_writably_mapped(mp))
+ flush_dcache_page(pp);
+
+ pb = kmap(pp);
+ (void) dmu_read(os, zp->z_id, start + off, nbytes,
+ pb + off, DMU_READ_PREFETCH);
+ kunmap(pp);
+
+ if (mapping_writably_mapped(mp))
+ flush_dcache_page(pp);
+
+ mark_page_accessed(pp);
+ SetPageUptodate(pp);
+ ClearPageError(pp);
+ unlock_page(pp);
+ put_page(pp);
+ }
+
+ len -= nbytes;
+ off = 0;
+ }
+}
+
+/*
+ * When a file is memory mapped, we must keep the IO data synchronized
+ * between the DMU cache and the memory mapped pages. What this means:
+ *
+ * On Read: We "read" preferentially from memory mapped pages,
+ * otherwise we fall back to the dmu buffer.
+ *
+ * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
+ * the file is memory mapped.
+ */
+int
+mappedread(znode_t *zp, int nbytes, zfs_uio_t *uio)
+{
+ struct inode *ip = ZTOI(zp);
+ struct address_space *mp = ip->i_mapping;
+ struct page *pp;
+ int64_t start, off;
+ uint64_t bytes;
+ int len = nbytes;
+ int error = 0;
+ void *pb;
+
+ start = uio->uio_loffset;
+ off = start & (PAGE_SIZE-1);
+ for (start &= PAGE_MASK; len > 0; start += PAGE_SIZE) {
+ bytes = MIN(PAGE_SIZE - off, len);
+
+ pp = find_lock_page(mp, start >> PAGE_SHIFT);
+ if (pp) {
+ ASSERT(PageUptodate(pp));
+ unlock_page(pp);
+
+ pb = kmap(pp);
+ error = zfs_uiomove(pb + off, bytes, UIO_READ, uio);
+ kunmap(pp);
+
+ if (mapping_writably_mapped(mp))
+ flush_dcache_page(pp);
+
+ mark_page_accessed(pp);
+ put_page(pp);
+ } else {
+ error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
+ uio, bytes);
+ }
+
+ len -= bytes;
+ off = 0;
+ if (error)
+ break;
+ }
+ return (error);
+}
+#endif /* _KERNEL */
+
+unsigned long zfs_delete_blocks = DMU_MAX_DELETEBLKCNT;
+
+/*
+ * Write the bytes to a file.
+ *
+ * IN: zp - znode of file to be written to
+ * data - bytes to write
+ * len - number of bytes to write
+ * pos - offset to start writing at
+ *
+ * OUT: resid - remaining bytes to write
+ *
+ * RETURN: 0 if success
+ * positive error code if failure. EIO is returned
+ * for a short write when residp isn't provided.
+ *
+ * Timestamps:
+ * zp - ctime|mtime updated if byte count > 0
+ */
+int
+zfs_write_simple(znode_t *zp, const void *data, size_t len,
+ loff_t pos, size_t *residp)
+{
+ fstrans_cookie_t cookie;
+ int error;
+
+ struct iovec iov;
+ iov.iov_base = (void *)data;
+ iov.iov_len = len;
+
+ zfs_uio_t uio;
+ zfs_uio_iovec_init(&uio, &iov, 1, pos, UIO_SYSSPACE, len, 0);
+
+ cookie = spl_fstrans_mark();
+ error = zfs_write(zp, &uio, 0, kcred);
+ spl_fstrans_unmark(cookie);
+
+ if (error == 0) {
+ if (residp != NULL)
+ *residp = zfs_uio_resid(&uio);
+ else if (zfs_uio_resid(&uio) != 0)
+ error = SET_ERROR(EIO);
+ }
+
+ return (error);
+}
+
+void
+zfs_zrele_async(znode_t *zp)
+{
+ struct inode *ip = ZTOI(zp);
+ objset_t *os = ITOZSB(ip)->z_os;
+
+ ASSERT(atomic_read(&ip->i_count) > 0);
+ ASSERT(os != NULL);
+
+ /*
+ * If decrementing the count would put us at 0, we can't do it inline
+ * here, because that would be synchronous. Instead, dispatch an iput
+ * to run later.
+ *
+ * For more information on the dangers of a synchronous iput, see the
+ * header comment of this file.
+ */
+ if (!atomic_add_unless(&ip->i_count, -1, 1)) {
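+ /*
+ * atomic_add_unless() refused to decrement, so i_count was 1 and
+ * we hold the last reference; hand the final iput to the taskq.
+ */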
+ VERIFY(taskq_dispatch(dsl_pool_zrele_taskq(dmu_objset_pool(os)),
+ (task_func_t *)iput, ip, TQ_SLEEP) != TASKQID_INVALID);
+ }
+}
+
+
+/*
+ * Lookup an entry in a directory, or an extended attribute directory.
+ * If it exists, return a held inode reference for it.
+ *
+ * IN: zdp - znode of directory to search.
+ * nm - name of entry to lookup.
+ * flags - LOOKUP_XATTR set if looking for an attribute.
+ * cr - credentials of caller.
+ * direntflags - directory lookup flags
+ * realpnp - returned pathname.
+ *
+ * OUT: zpp - znode of located entry, NULL if not found.
+ *
+ * RETURN: 0 on success, error code on failure.
+ *
+ * Timestamps:
+ * NA
+ */
+/* ARGSUSED */
+int
+zfs_lookup(znode_t *zdp, char *nm, znode_t **zpp, int flags, cred_t *cr,
+ int *direntflags, pathname_t *realpnp)
+{
+ zfsvfs_t *zfsvfs = ZTOZSB(zdp);
+ int error = 0;
+
+ /*
+ * Fast path lookup, however we must skip DNLC lookup
+ * for case folding or normalizing lookups because the
+ * DNLC code only stores the passed in name. This means
+ * creating 'a' and removing 'A' on a case insensitive
+ * file system would work, but DNLC still thinks 'a'
+ * exists and won't let you create it again on the next
+ * pass through fast path.
+ */
+ if (!(flags & (LOOKUP_XATTR | FIGNORECASE))) {
+
+ if (!S_ISDIR(ZTOI(zdp)->i_mode)) {
+ return (SET_ERROR(ENOTDIR));
+ } else if (zdp->z_sa_hdl == NULL) {
+ return (SET_ERROR(EIO));
+ }
+
+ if (nm[0] == 0 || (nm[0] == '.' && nm[1] == '\0')) {
+ error = zfs_fastaccesschk_execute(zdp, cr);
+ if (!error) {
+ *zpp = zdp;
+ zhold(*zpp);
+ return (0);
+ }
+ return (error);
+ }
+ }
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zdp);
+
+ *zpp = NULL;
+
+ if (flags & LOOKUP_XATTR) {
+ /*
+ * We don't allow recursive attributes.
+ * Maybe someday we will.
+ */
+ if (zdp->z_pflags & ZFS_XATTR) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EINVAL));
+ }
+
+ if ((error = zfs_get_xattrdir(zdp, zpp, cr, flags))) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ /*
+ * Do we have permission to get into the attribute directory?
+ */
+
+ if ((error = zfs_zaccess(*zpp, ACE_EXECUTE, 0,
+ B_FALSE, cr))) {
+ zrele(*zpp);
+ *zpp = NULL;
+ }
+
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ if (!S_ISDIR(ZTOI(zdp)->i_mode)) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(ENOTDIR));
+ }
+
+ /*
+ * Check accessibility of directory.
+ */
+
+ if ((error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr))) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm),
+ NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EILSEQ));
+ }
+
+ error = zfs_dirlook(zdp, nm, zpp, flags, direntflags, realpnp);
+ if ((error == 0) && (*zpp))
+ zfs_znode_update_vfs(*zpp);
+
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+/*
+ * Attempt to create a new entry in a directory. If the entry
+ * already exists, truncate the file if permissible, else return
+ * an error. Return the ip of the created or trunc'd file.
+ *
+ * IN: dzp - znode of directory to put new file entry in.
+ * name - name of new file entry.
+ * vap - attributes of new file.
+ * excl - flag indicating exclusive or non-exclusive mode.
+ * mode - mode to open file with.
+ * cr - credentials of caller.
+ * flag - file flag.
+ * vsecp - ACL to be set
+ *
+ * OUT: zpp - znode of created or trunc'd entry.
+ *
+ * RETURN: 0 on success, error code on failure.
+ *
+ * Timestamps:
+ * dzp - ctime|mtime updated if new entry created
+ * zp - ctime|mtime always, atime if new
+ */
+
+/* ARGSUSED */
+int
+zfs_create(znode_t *dzp, char *name, vattr_t *vap, int excl,
+ int mode, znode_t **zpp, cred_t *cr, int flag, vsecattr_t *vsecp)
+{
+ znode_t *zp;
+ zfsvfs_t *zfsvfs = ZTOZSB(dzp);
+ zilog_t *zilog;
+ objset_t *os;
+ zfs_dirlock_t *dl;
+ dmu_tx_t *tx;
+ int error;
+ uid_t uid;
+ gid_t gid;
+ zfs_acl_ids_t acl_ids;
+ boolean_t fuid_dirtied;
+ boolean_t have_acl = B_FALSE;
+ boolean_t waited = B_FALSE;
+
+ /*
+ * If we have an ephemeral id, ACL, or XVATTR then
+ * make sure the file system is at the proper version.
+ */
+
+ gid = crgetgid(cr);
+ uid = crgetuid(cr);
+
+ if (zfsvfs->z_use_fuids == B_FALSE &&
+ (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
+ return (SET_ERROR(EINVAL));
+
+ if (name == NULL)
+ return (SET_ERROR(EINVAL));
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(dzp);
+ os = zfsvfs->z_os;
+ zilog = zfsvfs->z_log;
+
+ if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
+ NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EILSEQ));
+ }
+
+ if (vap->va_mask & ATTR_XVATTR) {
+ if ((error = secpolicy_xvattr((xvattr_t *)vap,
+ crgetuid(cr), cr, vap->va_mode)) != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+ }
+
+top:
+ *zpp = NULL;
+ if (*name == '\0') {
+ /*
+ * Null component name refers to the directory itself.
+ */
+ zhold(dzp);
+ zp = dzp;
+ dl = NULL;
+ error = 0;
+ } else {
+ /* possible igrab(zp) */
+ int zflg = 0;
+
+ if (flag & FIGNORECASE)
+ zflg |= ZCILOOK;
+
+ error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
+ NULL, NULL);
+ if (error) {
+ if (have_acl)
+ zfs_acl_ids_free(&acl_ids);
+ if (strcmp(name, "..") == 0)
+ error = SET_ERROR(EISDIR);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+ }
+
+ if (zp == NULL) {
+ uint64_t txtype;
+ uint64_t projid = ZFS_DEFAULT_PROJID;
+
+ /*
+ * Create a new file object and update the directory
+ * to reference it.
+ */
+ if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) {
+ if (have_acl)
+ zfs_acl_ids_free(&acl_ids);
+ goto out;
+ }
+
+ /*
+ * We only support the creation of regular files in
+ * extended attribute directories.
+ */
+
+ if ((dzp->z_pflags & ZFS_XATTR) && !S_ISREG(vap->va_mode)) {
+ if (have_acl)
+ zfs_acl_ids_free(&acl_ids);
+ error = SET_ERROR(EINVAL);
+ goto out;
+ }
+
+ if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap,
+ cr, vsecp, &acl_ids)) != 0)
+ goto out;
+ have_acl = B_TRUE;
+
+ if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode))
+ projid = zfs_inherit_projid(dzp);
+ if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, projid)) {
+ zfs_acl_ids_free(&acl_ids);
+ error = SET_ERROR(EDQUOT);
+ goto out;
+ }
+
+ tx = dmu_tx_create(os);
+
+ dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
+ ZFS_SA_BASE_ATTR_SIZE);
+
+ fuid_dirtied = zfsvfs->z_fuid_dirty;
+ if (fuid_dirtied)
+ zfs_fuid_txhold(zfsvfs, tx);
+ dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
+ dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
+ if (!zfsvfs->z_use_sa &&
+ acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
+ dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
+ 0, acl_ids.z_aclp->z_acl_bytes);
+ }
+
+ error = dmu_tx_assign(tx,
+ (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
+ if (error) {
+ zfs_dirent_unlock(dl);
+ if (error == ERESTART) {
+ waited = B_TRUE;
+ dmu_tx_wait(tx);
+ dmu_tx_abort(tx);
+ goto top;
+ }
+ zfs_acl_ids_free(&acl_ids);
+ dmu_tx_abort(tx);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+ zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
+
+ error = zfs_link_create(dl, zp, tx, ZNEW);
+ if (error != 0) {
+ /*
+ * Since we failed to add the directory entry for it,
+ * delete the newly created dnode.
+ */
+ zfs_znode_delete(zp, tx);
+ remove_inode_hash(ZTOI(zp));
+ zfs_acl_ids_free(&acl_ids);
+ dmu_tx_commit(tx);
+ goto out;
+ }
+
+ if (fuid_dirtied)
+ zfs_fuid_sync(zfsvfs, tx);
+
+ txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
+ if (flag & FIGNORECASE)
+ txtype |= TX_CI;
+ zfs_log_create(zilog, tx, txtype, dzp, zp, name,
+ vsecp, acl_ids.z_fuidp, vap);
+ zfs_acl_ids_free(&acl_ids);
+ dmu_tx_commit(tx);
+ } else {
+ int aflags = (flag & O_APPEND) ? V_APPEND : 0;
+
+ if (have_acl)
+ zfs_acl_ids_free(&acl_ids);
+ have_acl = B_FALSE;
+
+ /*
+ * A directory entry already exists for this name.
+ */
+ /*
+ * Can't truncate an existing file if in exclusive mode.
+ */
+ if (excl) {
+ error = SET_ERROR(EEXIST);
+ goto out;
+ }
+ /*
+ * Can't open a directory for writing.
+ */
+ if (S_ISDIR(ZTOI(zp)->i_mode)) {
+ error = SET_ERROR(EISDIR);
+ goto out;
+ }
+ /*
+ * Verify requested access to file.
+ */
+ if (mode && (error = zfs_zaccess_rwx(zp, mode, aflags, cr))) {
+ goto out;
+ }
+
+ mutex_enter(&dzp->z_lock);
+ dzp->z_seq++;
+ mutex_exit(&dzp->z_lock);
+
+ /*
+ * Truncate regular files if requested.
+ */
+ if (S_ISREG(ZTOI(zp)->i_mode) &&
+ (vap->va_mask & ATTR_SIZE) && (vap->va_size == 0)) {
+ /* we can't hold any locks when calling zfs_freesp() */
+ if (dl) {
+ zfs_dirent_unlock(dl);
+ dl = NULL;
+ }
+ error = zfs_freesp(zp, 0, 0, mode, TRUE);
+ }
+ }
+out:
+
+ if (dl)
+ zfs_dirent_unlock(dl);
+
+ if (error) {
+ if (zp)
+ zrele(zp);
+ } else {
+ zfs_znode_update_vfs(dzp);
+ zfs_znode_update_vfs(zp);
+ *zpp = zp;
+ }
+
+ if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ zil_commit(zilog, 0);
+
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+/* ARGSUSED */
+int
+zfs_tmpfile(struct inode *dip, vattr_t *vap, int excl,
+ int mode, struct inode **ipp, cred_t *cr, int flag, vsecattr_t *vsecp)
+{
+ znode_t *zp = NULL, *dzp = ITOZ(dip);
+ zfsvfs_t *zfsvfs = ITOZSB(dip);
+ objset_t *os;
+ dmu_tx_t *tx;
+ int error;
+ uid_t uid;
+ gid_t gid;
+ zfs_acl_ids_t acl_ids;
+ uint64_t projid = ZFS_DEFAULT_PROJID;
+ boolean_t fuid_dirtied;
+ boolean_t have_acl = B_FALSE;
+ boolean_t waited = B_FALSE;
+
+ /*
+ * If we have an ephemeral id, ACL, or XVATTR then
+ * make sure the file system is at the proper version.
+ */
+
+ gid = crgetgid(cr);
+ uid = crgetuid(cr);
+
+ if (zfsvfs->z_use_fuids == B_FALSE &&
+ (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
+ return (SET_ERROR(EINVAL));
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(dzp);
+ os = zfsvfs->z_os;
+
+ if (vap->va_mask & ATTR_XVATTR) {
+ if ((error = secpolicy_xvattr((xvattr_t *)vap,
+ crgetuid(cr), cr, vap->va_mode)) != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+ }
+
+top:
+ *ipp = NULL;
+
+ /*
+ * Create a new file object and update the directory
+ * to reference it.
+ */
+ if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) {
+ if (have_acl)
+ zfs_acl_ids_free(&acl_ids);
+ goto out;
+ }
+
+ if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap,
+ cr, vsecp, &acl_ids)) != 0)
+ goto out;
+ have_acl = B_TRUE;
+
+ if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode))
+ projid = zfs_inherit_projid(dzp);
+ if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, projid)) {
+ zfs_acl_ids_free(&acl_ids);
+ error = SET_ERROR(EDQUOT);
+ goto out;
+ }
+
+ tx = dmu_tx_create(os);
+
+ dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
+ ZFS_SA_BASE_ATTR_SIZE);
+ dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
+
+ fuid_dirtied = zfsvfs->z_fuid_dirty;
+ if (fuid_dirtied)
+ zfs_fuid_txhold(zfsvfs, tx);
+ if (!zfsvfs->z_use_sa &&
+ acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
+ dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
+ 0, acl_ids.z_aclp->z_acl_bytes);
+ }
+ error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
+ if (error) {
+ if (error == ERESTART) {
+ waited = B_TRUE;
+ dmu_tx_wait(tx);
+ dmu_tx_abort(tx);
+ goto top;
+ }
+ zfs_acl_ids_free(&acl_ids);
+ dmu_tx_abort(tx);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+ zfs_mknode(dzp, vap, tx, cr, IS_TMPFILE, &zp, &acl_ids);
+
+ if (fuid_dirtied)
+ zfs_fuid_sync(zfsvfs, tx);
+
+ /* Add to unlinked set */
+ zp->z_unlinked = B_TRUE;
+ zfs_unlinked_add(zp, tx);
+ zfs_acl_ids_free(&acl_ids);
+ dmu_tx_commit(tx);
+out:
+
+ if (error) {
+ if (zp)
+ zrele(zp);
+ } else {
+ zfs_znode_update_vfs(dzp);
+ zfs_znode_update_vfs(zp);
+ *ipp = ZTOI(zp);
+ }
+
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+/*
+ * Remove an entry from a directory.
+ *
+ * IN: dzp - znode of directory to remove entry from.
+ * name - name of entry to remove.
+ * cr - credentials of caller.
+ * flags - case flags.
+ *
+ * RETURN: 0 if success
+ * error code if failure
+ *
+ * Timestamps:
+ * dzp - ctime|mtime
+ * ip - ctime (if nlink > 0)
+ */
+
+uint64_t null_xattr = 0;
+
+/*ARGSUSED*/
+int
+zfs_remove(znode_t *dzp, char *name, cred_t *cr, int flags)
+{
+ znode_t *zp;
+ znode_t *xzp;
+ zfsvfs_t *zfsvfs = ZTOZSB(dzp);
+ zilog_t *zilog;
+ uint64_t acl_obj, xattr_obj;
+ uint64_t xattr_obj_unlinked = 0;
+ uint64_t obj = 0;
+ uint64_t links;
+ zfs_dirlock_t *dl;
+ dmu_tx_t *tx;
+ boolean_t may_delete_now, delete_now = FALSE;
+ boolean_t unlinked, toobig = FALSE;
+ uint64_t txtype;
+ pathname_t *realnmp = NULL;
+ pathname_t realnm;
+ int error;
+ int zflg = ZEXISTS;
+ boolean_t waited = B_FALSE;
+
+ if (name == NULL)
+ return (SET_ERROR(EINVAL));
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(dzp);
+ zilog = zfsvfs->z_log;
+
+ if (flags & FIGNORECASE) {
+ zflg |= ZCILOOK;
+ pn_alloc(&realnm);
+ realnmp = &realnm;
+ }
+
+top:
+ xattr_obj = 0;
+ xzp = NULL;
+ /*
+ * Attempt to lock directory; fail if entry doesn't exist.
+ */
+ if ((error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
+ NULL, realnmp))) {
+ if (realnmp)
+ pn_free(realnmp);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ if ((error = zfs_zaccess_delete(dzp, zp, cr))) {
+ goto out;
+ }
+
+ /*
+ * Need to use rmdir for removing directories.
+ */
+ if (S_ISDIR(ZTOI(zp)->i_mode)) {
+ error = SET_ERROR(EPERM);
+ goto out;
+ }
+
+ mutex_enter(&zp->z_lock);
+ may_delete_now = atomic_read(&ZTOI(zp)->i_count) == 1 &&
+ !(zp->z_is_mapped);
+ mutex_exit(&zp->z_lock);
+
+ /*
+ * We may delete the znode now, or we may put it in the unlinked set;
+ * it depends on whether we're the last link, and on whether there are
+ * other holds on the inode. So we dmu_tx_hold() the right things to
+ * allow for either case.
+ */
+ obj = zp->z_id;
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+ zfs_sa_upgrade_txholds(tx, zp);
+ zfs_sa_upgrade_txholds(tx, dzp);
+ if (may_delete_now) {
+ toobig = zp->z_size > zp->z_blksz * zfs_delete_blocks;
+ /* if the file is too big, only hold_free a token amount */
+ dmu_tx_hold_free(tx, zp->z_id, 0,
+ (toobig ? DMU_MAX_ACCESS : DMU_OBJECT_END));
+ }
+
+ /* are there any extended attributes? */
+ error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
+ &xattr_obj, sizeof (xattr_obj));
+ if (error == 0 && xattr_obj) {
+ error = zfs_zget(zfsvfs, xattr_obj, &xzp);
+ ASSERT0(error);
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
+ dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
+ }
+
+ mutex_enter(&zp->z_lock);
+ if ((acl_obj = zfs_external_acl(zp)) != 0 && may_delete_now)
+ dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
+ mutex_exit(&zp->z_lock);
+
+ /* charge as an update -- would be nice not to charge at all */
+ dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
+
+ /*
+ * Mark this transaction as typically resulting in a net free of space
+ */
+ dmu_tx_mark_netfree(tx);
+
+ error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
+ if (error) {
+ zfs_dirent_unlock(dl);
+ if (error == ERESTART) {
+ waited = B_TRUE;
+ dmu_tx_wait(tx);
+ dmu_tx_abort(tx);
+ zrele(zp);
+ if (xzp)
+ zrele(xzp);
+ goto top;
+ }
+ if (realnmp)
+ pn_free(realnmp);
+ dmu_tx_abort(tx);
+ zrele(zp);
+ if (xzp)
+ zrele(xzp);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ /*
+ * Remove the directory entry.
+ */
+ error = zfs_link_destroy(dl, zp, tx, zflg, &unlinked);
+
+ if (error) {
+ dmu_tx_commit(tx);
+ goto out;
+ }
+
+ if (unlinked) {
+ /*
+ * Hold z_lock so that we can make sure that the ACL obj
+ * hasn't changed. Could have been deleted due to
+ * zfs_sa_upgrade().
+ */
+ mutex_enter(&zp->z_lock);
+ (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
+ &xattr_obj_unlinked, sizeof (xattr_obj_unlinked));
+ delete_now = may_delete_now && !toobig &&
+ atomic_read(&ZTOI(zp)->i_count) == 1 &&
+ !(zp->z_is_mapped) && xattr_obj == xattr_obj_unlinked &&
+ zfs_external_acl(zp) == acl_obj;
+ }
+
+ if (delete_now) {
+ if (xattr_obj_unlinked) {
+ ASSERT3U(ZTOI(xzp)->i_nlink, ==, 2);
+ mutex_enter(&xzp->z_lock);
+ xzp->z_unlinked = B_TRUE;
+ clear_nlink(ZTOI(xzp));
+ links = 0;
+ error = sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs),
+ &links, sizeof (links), tx);
+ ASSERT3U(error, ==, 0);
+ mutex_exit(&xzp->z_lock);
+ zfs_unlinked_add(xzp, tx);
+
+ if (zp->z_is_sa)
+ error = sa_remove(zp->z_sa_hdl,
+ SA_ZPL_XATTR(zfsvfs), tx);
+ else
+ error = sa_update(zp->z_sa_hdl,
+ SA_ZPL_XATTR(zfsvfs), &null_xattr,
+ sizeof (uint64_t), tx);
+ ASSERT0(error);
+ }
+ /*
+ * Add to the unlinked set because a new reference could be
+ * taken concurrently resulting in a deferred destruction.
+ */
+ zfs_unlinked_add(zp, tx);
+ mutex_exit(&zp->z_lock);
+ } else if (unlinked) {
+ mutex_exit(&zp->z_lock);
+ zfs_unlinked_add(zp, tx);
+ }
+
+ txtype = TX_REMOVE;
+ if (flags & FIGNORECASE)
+ txtype |= TX_CI;
+ zfs_log_remove(zilog, tx, txtype, dzp, name, obj, unlinked);
+
+ dmu_tx_commit(tx);
+out:
+ if (realnmp)
+ pn_free(realnmp);
+
+ zfs_dirent_unlock(dl);
+ zfs_znode_update_vfs(dzp);
+ zfs_znode_update_vfs(zp);
+
+ if (delete_now)
+ zrele(zp);
+ else
+ zfs_zrele_async(zp);
+
+ if (xzp) {
+ zfs_znode_update_vfs(xzp);
+ zfs_zrele_async(xzp);
+ }
+
+ if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ zil_commit(zilog, 0);
+
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+/*
+ * Create a new directory and insert it into dzp using the name
+ * provided. Return a pointer to the inserted directory.
+ *
+ * IN: dzp - znode of directory to add subdir to.
+ * dirname - name of new directory.
+ * vap - attributes of new directory.
+ * cr - credentials of caller.
+ * flags - case flags.
+ * vsecp - ACL to be set
+ *
+ * OUT: zpp - znode of created directory.
+ *
+ * RETURN: 0 if success
+ * error code if failure
+ *
+ * Timestamps:
+ * dzp - ctime|mtime updated
+ * zpp - ctime|mtime|atime updated
+ */
+/*ARGSUSED*/
+int
+zfs_mkdir(znode_t *dzp, char *dirname, vattr_t *vap, znode_t **zpp,
+ cred_t *cr, int flags, vsecattr_t *vsecp)
+{
+ znode_t *zp;
+ zfsvfs_t *zfsvfs = ZTOZSB(dzp);
+ zilog_t *zilog;
+ zfs_dirlock_t *dl;
+ uint64_t txtype;
+ dmu_tx_t *tx;
+ int error;
+ int zf = ZNEW;
+ uid_t uid;
+ gid_t gid = crgetgid(cr);
+ zfs_acl_ids_t acl_ids;
+ boolean_t fuid_dirtied;
+ boolean_t waited = B_FALSE;
+
+ ASSERT(S_ISDIR(vap->va_mode));
+
+ /*
+	 * If we have an ephemeral id, ACL, or XVATTR then
+	 * make sure the file system is at the proper version.
+ */
+
+ uid = crgetuid(cr);
+ if (zfsvfs->z_use_fuids == B_FALSE &&
+ (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
+ return (SET_ERROR(EINVAL));
+
+ if (dirname == NULL)
+ return (SET_ERROR(EINVAL));
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(dzp);
+ zilog = zfsvfs->z_log;
+
+ if (dzp->z_pflags & ZFS_XATTR) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EINVAL));
+ }
+
+ if (zfsvfs->z_utf8 && u8_validate(dirname,
+ strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EILSEQ));
+ }
+ if (flags & FIGNORECASE)
+ zf |= ZCILOOK;
+
+ if (vap->va_mask & ATTR_XVATTR) {
+ if ((error = secpolicy_xvattr((xvattr_t *)vap,
+ crgetuid(cr), cr, vap->va_mode)) != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+ }
+
+ if ((error = zfs_acl_ids_create(dzp, 0, vap, cr,
+ vsecp, &acl_ids)) != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+ /*
+ * First make sure the new directory doesn't exist.
+ *
+ * Existence is checked first to make sure we don't return
+ * EACCES instead of EEXIST which can cause some applications
+ * to fail.
+ */
+top:
+ *zpp = NULL;
+
+ if ((error = zfs_dirent_lock(&dl, dzp, dirname, &zp, zf,
+ NULL, NULL))) {
+ zfs_acl_ids_free(&acl_ids);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ if ((error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr))) {
+ zfs_acl_ids_free(&acl_ids);
+ zfs_dirent_unlock(dl);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, zfs_inherit_projid(dzp))) {
+ zfs_acl_ids_free(&acl_ids);
+ zfs_dirent_unlock(dl);
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EDQUOT));
+ }
+
+ /*
+ * Add a new entry to the directory.
+ */
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
+ dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
+ fuid_dirtied = zfsvfs->z_fuid_dirty;
+ if (fuid_dirtied)
+ zfs_fuid_txhold(zfsvfs, tx);
+ if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
+ dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
+ acl_ids.z_aclp->z_acl_bytes);
+ }
+
+ dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
+ ZFS_SA_BASE_ATTR_SIZE);
+
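+	/*
+	 * Try to assign the transaction without blocking.  If it would
+	 * have to wait (ERESTART), drop the directory lock, wait for the
+	 * condition to clear, and retry from the top; the retry is exempt
+	 * from the write throttle (TXG_NOTHROTTLE).
+	 */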
+ error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
+ if (error) {
+ zfs_dirent_unlock(dl);
+ if (error == ERESTART) {
+ waited = B_TRUE;
+ dmu_tx_wait(tx);
+ dmu_tx_abort(tx);
+ goto top;
+ }
+ zfs_acl_ids_free(&acl_ids);
+ dmu_tx_abort(tx);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ /*
+ * Create new node.
+ */
+ zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
+
+ /*
+ * Now put new name in parent dir.
+ */
+ error = zfs_link_create(dl, zp, tx, ZNEW);
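+	/*
+	 * If the entry could not be inserted, destroy the just-created
+	 * znode and drop it from the inode hash; the held reference is
+	 * released in the error path below.
+	 */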
+ if (error != 0) {
+ zfs_znode_delete(zp, tx);
+ remove_inode_hash(ZTOI(zp));
+ goto out;
+ }
+
+ if (fuid_dirtied)
+ zfs_fuid_sync(zfsvfs, tx);
+
+ *zpp = zp;
+
+ txtype = zfs_log_create_txtype(Z_DIR, vsecp, vap);
+ if (flags & FIGNORECASE)
+ txtype |= TX_CI;
+ zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp,
+ acl_ids.z_fuidp, vap);
+
+out:
+ zfs_acl_ids_free(&acl_ids);
+
+ dmu_tx_commit(tx);
+
+ zfs_dirent_unlock(dl);
+
+ if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ zil_commit(zilog, 0);
+
+ if (error != 0) {
+ zrele(zp);
+ } else {
+ zfs_znode_update_vfs(dzp);
+ zfs_znode_update_vfs(zp);
+ }
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+/*
+ * Remove a directory subdir entry. If the current working
+ * directory is the same as the subdir to be removed, the
+ * remove will fail.
+ *
+ * IN: dzp - znode of directory to remove from.
+ * name - name of directory to be removed.
+ *	cwd	- znode of current working directory.
+ * cr - credentials of caller.
+ * flags - case flags
+ *
+ * RETURN: 0 on success, error code on failure.
+ *
+ * Timestamps:
+ * dzp - ctime|mtime updated
+ */
+/*ARGSUSED*/
+int
+zfs_rmdir(znode_t *dzp, char *name, znode_t *cwd, cred_t *cr,
+ int flags)
+{
+ znode_t *zp;
+ zfsvfs_t *zfsvfs = ZTOZSB(dzp);
+ zilog_t *zilog;
+ zfs_dirlock_t *dl;
+ dmu_tx_t *tx;
+ int error;
+ int zflg = ZEXISTS;
+ boolean_t waited = B_FALSE;
+
+ if (name == NULL)
+ return (SET_ERROR(EINVAL));
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(dzp);
+ zilog = zfsvfs->z_log;
+
+ if (flags & FIGNORECASE)
+ zflg |= ZCILOOK;
+top:
+ zp = NULL;
+
+ /*
+ * Attempt to lock directory; fail if entry doesn't exist.
+ */
+ if ((error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
+ NULL, NULL))) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ if ((error = zfs_zaccess_delete(dzp, zp, cr))) {
+ goto out;
+ }
+
+ if (!S_ISDIR(ZTOI(zp)->i_mode)) {
+ error = SET_ERROR(ENOTDIR);
+ goto out;
+ }
+
+ if (zp == cwd) {
+ error = SET_ERROR(EINVAL);
+ goto out;
+ }
+
+ /*
+ * Grab a lock on the directory to make sure that no one is
+ * trying to add (or lookup) entries while we are removing it.
+ */
+ rw_enter(&zp->z_name_lock, RW_WRITER);
+
+ /*
+ * Grab a lock on the parent pointer to make sure we play well
+ * with the treewalk and directory rename code.
+ */
+ rw_enter(&zp->z_parent_lock, RW_WRITER);
+
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+ dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
+ zfs_sa_upgrade_txholds(tx, zp);
+ zfs_sa_upgrade_txholds(tx, dzp);
+ dmu_tx_mark_netfree(tx);
+ error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
+ if (error) {
+ rw_exit(&zp->z_parent_lock);
+ rw_exit(&zp->z_name_lock);
+ zfs_dirent_unlock(dl);
+ if (error == ERESTART) {
+ waited = B_TRUE;
+ dmu_tx_wait(tx);
+ dmu_tx_abort(tx);
+ zrele(zp);
+ goto top;
+ }
+ dmu_tx_abort(tx);
+ zrele(zp);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ error = zfs_link_destroy(dl, zp, tx, zflg, NULL);
+
+ if (error == 0) {
+ uint64_t txtype = TX_RMDIR;
+ if (flags & FIGNORECASE)
+ txtype |= TX_CI;
+ zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT,
+ B_FALSE);
+ }
+
+ dmu_tx_commit(tx);
+
+ rw_exit(&zp->z_parent_lock);
+ rw_exit(&zp->z_name_lock);
+out:
+ zfs_dirent_unlock(dl);
+
+ zfs_znode_update_vfs(dzp);
+ zfs_znode_update_vfs(zp);
+ zrele(zp);
+
+ if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ zil_commit(zilog, 0);
+
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+/*
+ * Read directory entries from the given directory cursor position and emit
+ * name and position for each entry.
+ *
+ * IN: ip - inode of directory to read.
+ * ctx - directory entry context.
+ * cr - credentials of caller.
+ *
+ * RETURN: 0 if success
+ * error code if failure
+ *
+ * Timestamps:
+ * ip - atime updated
+ *
+ * Note that the low 4 bits of the cookie returned by zap are always zero.
+ * This allows us to use the low range for "special" directory entries:
+ * We use 0 for '.', and 1 for '..'. If this is the root of the filesystem,
+ * we use the offset 2 for the '.zfs' directory.
+ */
+/* ARGSUSED */
+int
+zfs_readdir(struct inode *ip, zpl_dir_context_t *ctx, cred_t *cr)
+{
+ znode_t *zp = ITOZ(ip);
+ zfsvfs_t *zfsvfs = ITOZSB(ip);
+ objset_t *os;
+ zap_cursor_t zc;
+ zap_attribute_t zap;
+ int error;
+ uint8_t prefetch;
+ uint8_t type;
+ int done = 0;
+ uint64_t parent;
+ uint64_t offset; /* must be unsigned; checks for < 1 */
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
+ &parent, sizeof (parent))) != 0)
+ goto out;
+
+ /*
+ * Quit if directory has been removed (posix)
+ */
+ if (zp->z_unlinked)
+ goto out;
+
+ error = 0;
+ os = zfsvfs->z_os;
+ offset = ctx->pos;
+ prefetch = zp->z_zn_prefetch;
+
+ /*
+ * Initialize the iterator cursor.
+ */
+ if (offset <= 3) {
+ /*
+ * Start iteration from the beginning of the directory.
+ */
+ zap_cursor_init(&zc, os, zp->z_id);
+ } else {
+ /*
+ * The offset is a serialized cursor.
+ */
+ zap_cursor_init_serialized(&zc, os, zp->z_id, offset);
+ }
+
+ /*
+ * Transform to file-system independent format
+ */
+ while (!done) {
+ uint64_t objnum;
+ /*
+ * Special case `.', `..', and `.zfs'.
+ */
+ if (offset == 0) {
+ (void) strcpy(zap.za_name, ".");
+ zap.za_normalization_conflict = 0;
+ objnum = zp->z_id;
+ type = DT_DIR;
+ } else if (offset == 1) {
+ (void) strcpy(zap.za_name, "..");
+ zap.za_normalization_conflict = 0;
+ objnum = parent;
+ type = DT_DIR;
+ } else if (offset == 2 && zfs_show_ctldir(zp)) {
+ (void) strcpy(zap.za_name, ZFS_CTLDIR_NAME);
+ zap.za_normalization_conflict = 0;
+ objnum = ZFSCTL_INO_ROOT;
+ type = DT_DIR;
+ } else {
+ /*
+ * Grab next entry.
+ */
+ if ((error = zap_cursor_retrieve(&zc, &zap))) {
+ if (error == ENOENT)
+ break;
+ else
+ goto update;
+ }
+
+ /*
+ * Allow multiple entries provided the first entry is
+ * the object id. Non-zpl consumers may safely make
+ * use of the additional space.
+ *
+ * XXX: This should be a feature flag for compatibility
+ */
+ if (zap.za_integer_length != 8 ||
+ zap.za_num_integers == 0) {
+ cmn_err(CE_WARN, "zap_readdir: bad directory "
+ "entry, obj = %lld, offset = %lld, "
+ "length = %d, num = %lld\n",
+ (u_longlong_t)zp->z_id,
+ (u_longlong_t)offset,
+ zap.za_integer_length,
+ (u_longlong_t)zap.za_num_integers);
+ error = SET_ERROR(ENXIO);
+ goto update;
+ }
+
+ objnum = ZFS_DIRENT_OBJ(zap.za_first_integer);
+ type = ZFS_DIRENT_TYPE(zap.za_first_integer);
+ }
+
+ done = !zpl_dir_emit(ctx, zap.za_name, strlen(zap.za_name),
+ objnum, type);
+ if (done)
+ break;
+
+ /* Prefetch znode */
+ if (prefetch) {
+ dmu_prefetch(os, objnum, 0, 0, 0,
+ ZIO_PRIORITY_SYNC_READ);
+ }
+
+ /*
+ * Move to the next entry, fill in the previous offset.
+ */
+ if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) {
+ zap_cursor_advance(&zc);
+ offset = zap_cursor_serialize(&zc);
+ } else {
+ offset += 1;
+ }
+ ctx->pos = offset;
+ }
+ zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */
+
+update:
+ zap_cursor_fini(&zc);
+ if (error == ENOENT)
+ error = 0;
+out:
+ ZFS_EXIT(zfsvfs);
+
+ return (error);
+}
+
+/*
+ * Get the basic file attributes and place them in the provided kstat
+ * structure. The inode is assumed to be the authoritative source
+ * for most of the attributes. However, the znode currently has the
+ * authoritative atime, blksize, and block count.
+ *
+ * IN: ip - inode of file.
+ *
+ * OUT: sp - kstat values.
+ *
+ * RETURN: 0 (always succeeds)
+ */
+/* ARGSUSED */
+int
+zfs_getattr_fast(struct inode *ip, struct kstat *sp)
+{
+ znode_t *zp = ITOZ(ip);
+ zfsvfs_t *zfsvfs = ITOZSB(ip);
+ uint32_t blksize;
+ u_longlong_t nblocks;
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ mutex_enter(&zp->z_lock);
+
+ generic_fillattr(ip, sp);
+ /*
+ * +1 link count for root inode with visible '.zfs' directory.
+ */
+ if ((zp->z_id == zfsvfs->z_root) && zfs_show_ctldir(zp))
+ if (sp->nlink < ZFS_LINK_MAX)
+ sp->nlink++;
+
+ sa_object_size(zp->z_sa_hdl, &blksize, &nblocks);
+ sp->blksize = blksize;
+ sp->blocks = nblocks;
+
+ if (unlikely(zp->z_blksz == 0)) {
+ /*
+ * Block size hasn't been set; suggest maximal I/O transfers.
+ */
+ sp->blksize = zfsvfs->z_max_blksz;
+ }
+
+ mutex_exit(&zp->z_lock);
+
+ /*
+	 * Required to prevent the NFS client from detecting different inode
+	 * numbers for the snapshot root dentry before and after the snapshot
+	 * is mounted.
+ */
+ if (zfsvfs->z_issnap) {
+ if (ip->i_sb->s_root->d_inode == ip)
+ sp->ino = ZFSCTL_INO_SNAPDIRS -
+ dmu_objset_id(zfsvfs->z_os);
+ }
+
+ ZFS_EXIT(zfsvfs);
+
+ return (0);
+}
+
+/*
+ * For the operation of changing a file's user/group/project, we need to
+ * handle not only the main object that is assigned to the file directly,
+ * but also the objects that are used by the file via its hidden xattr
+ * directory.
+ *
+ * Because the xattr directory may contain many EA entries, it may be
+ * impossible to change all of them within the transaction that changes the
+ * main object's user/group/project attributes.  Instead we change them one
+ * by one via separate, independent transactions.  This is not an ideal
+ * solution, but we have no better one yet.
+ */
+static int
+zfs_setattr_dir(znode_t *dzp)
+{
+ struct inode *dxip = ZTOI(dzp);
+ struct inode *xip = NULL;
+ zfsvfs_t *zfsvfs = ZTOZSB(dzp);
+ objset_t *os = zfsvfs->z_os;
+ zap_cursor_t zc;
+ zap_attribute_t zap;
+ zfs_dirlock_t *dl;
+ znode_t *zp = NULL;
+ dmu_tx_t *tx = NULL;
+ uint64_t uid, gid;
+ sa_bulk_attr_t bulk[4];
+ int count;
+ int err;
+
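+	/*
+	 * Walk every entry in the xattr directory.  For each xattr znode
+	 * whose uid, gid, or project ID differs from the parent's, copy
+	 * the parent's values in its own small transaction; entries that
+	 * disappear concurrently (ENOENT) are simply skipped.
+	 */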
+ zap_cursor_init(&zc, os, dzp->z_id);
+ while ((err = zap_cursor_retrieve(&zc, &zap)) == 0) {
+ count = 0;
+ if (zap.za_integer_length != 8 || zap.za_num_integers != 1) {
+ err = ENXIO;
+ break;
+ }
+
+ err = zfs_dirent_lock(&dl, dzp, (char *)zap.za_name, &zp,
+ ZEXISTS, NULL, NULL);
+ if (err == ENOENT)
+ goto next;
+ if (err)
+ break;
+
+ xip = ZTOI(zp);
+ if (KUID_TO_SUID(xip->i_uid) == KUID_TO_SUID(dxip->i_uid) &&
+ KGID_TO_SGID(xip->i_gid) == KGID_TO_SGID(dxip->i_gid) &&
+ zp->z_projid == dzp->z_projid)
+ goto next;
+
+ tx = dmu_tx_create(os);
+ if (!(zp->z_pflags & ZFS_PROJID))
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
+ else
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+
+ err = dmu_tx_assign(tx, TXG_WAIT);
+ if (err)
+ break;
+
+ mutex_enter(&dzp->z_lock);
+
+ if (KUID_TO_SUID(xip->i_uid) != KUID_TO_SUID(dxip->i_uid)) {
+ xip->i_uid = dxip->i_uid;
+ uid = zfs_uid_read(dxip);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
+ &uid, sizeof (uid));
+ }
+
+ if (KGID_TO_SGID(xip->i_gid) != KGID_TO_SGID(dxip->i_gid)) {
+ xip->i_gid = dxip->i_gid;
+ gid = zfs_gid_read(dxip);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL,
+ &gid, sizeof (gid));
+ }
+
+ if (zp->z_projid != dzp->z_projid) {
+ if (!(zp->z_pflags & ZFS_PROJID)) {
+ zp->z_pflags |= ZFS_PROJID;
+ SA_ADD_BULK_ATTR(bulk, count,
+ SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags,
+ sizeof (zp->z_pflags));
+ }
+
+ zp->z_projid = dzp->z_projid;
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PROJID(zfsvfs),
+ NULL, &zp->z_projid, sizeof (zp->z_projid));
+ }
+
+ mutex_exit(&dzp->z_lock);
+
+ if (likely(count > 0)) {
+ err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
+ dmu_tx_commit(tx);
+ } else {
+ dmu_tx_abort(tx);
+ }
+ tx = NULL;
+ if (err != 0 && err != ENOENT)
+ break;
+
+next:
+ if (zp) {
+ zrele(zp);
+ zp = NULL;
+ zfs_dirent_unlock(dl);
+ }
+ zap_cursor_advance(&zc);
+ }
+
+ if (tx)
+ dmu_tx_abort(tx);
+ if (zp) {
+ zrele(zp);
+ zfs_dirent_unlock(dl);
+ }
+ zap_cursor_fini(&zc);
+
+ return (err == ENOENT ? 0 : err);
+}
+
+/*
+ * Set the file attributes to the values contained in the
+ * vattr structure.
+ *
+ * IN: zp - znode of file to be modified.
+ * vap - new attribute values.
+ * If ATTR_XVATTR set, then optional attrs are being set
+ * flags - ATTR_UTIME set if non-default time values provided.
+ * - ATTR_NOACLCHECK (CIFS context only).
+ * cr - credentials of caller.
+ *
+ * RETURN: 0 if success
+ * error code if failure
+ *
+ * Timestamps:
+ * ip - ctime updated, mtime updated if size changed.
+ */
+/* ARGSUSED */
+int
+zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr)
+{
+ struct inode *ip;
+ zfsvfs_t *zfsvfs = ZTOZSB(zp);
+ objset_t *os = zfsvfs->z_os;
+ zilog_t *zilog;
+ dmu_tx_t *tx;
+ vattr_t oldva;
+ xvattr_t *tmpxvattr;
+ uint_t mask = vap->va_mask;
+ uint_t saved_mask = 0;
+ int trim_mask = 0;
+ uint64_t new_mode;
+ uint64_t new_kuid = 0, new_kgid = 0, new_uid, new_gid;
+ uint64_t xattr_obj;
+ uint64_t mtime[2], ctime[2], atime[2];
+ uint64_t projid = ZFS_INVALID_PROJID;
+ znode_t *attrzp;
+ int need_policy = FALSE;
+ int err, err2 = 0;
+ zfs_fuid_info_t *fuidp = NULL;
+ xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */
+ xoptattr_t *xoap;
+ zfs_acl_t *aclp;
+ boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
+ boolean_t fuid_dirtied = B_FALSE;
+ boolean_t handle_eadir = B_FALSE;
+ sa_bulk_attr_t *bulk, *xattr_bulk;
+ int count = 0, xattr_count = 0, bulks = 8;
+
+ if (mask == 0)
+ return (0);
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+ ip = ZTOI(zp);
+
+ /*
+	 * If this is an xvattr_t, then get a pointer to the structure of
+ * optional attributes. If this is NULL, then we have a vattr_t.
+ */
+ xoap = xva_getxoptattr(xvap);
+ if (xoap != NULL && (mask & ATTR_XVATTR)) {
+ if (XVA_ISSET_REQ(xvap, XAT_PROJID)) {
+ if (!dmu_objset_projectquota_enabled(os) ||
+ (!S_ISREG(ip->i_mode) && !S_ISDIR(ip->i_mode))) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(ENOTSUP));
+ }
+
+ projid = xoap->xoa_projid;
+ if (unlikely(projid == ZFS_INVALID_PROJID)) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EINVAL));
+ }
+
+ if (projid == zp->z_projid && zp->z_pflags & ZFS_PROJID)
+ projid = ZFS_INVALID_PROJID;
+ else
+ need_policy = TRUE;
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT) &&
+ (xoap->xoa_projinherit !=
+ ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) &&
+ (!dmu_objset_projectquota_enabled(os) ||
+ (!S_ISREG(ip->i_mode) && !S_ISDIR(ip->i_mode)))) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(ENOTSUP));
+ }
+ }
+
+ zilog = zfsvfs->z_log;
+
+ /*
+	 * Make sure that if an ephemeral uid/gid or an xvattr is specified,
+	 * the file system is at the proper version level.
+ */
+
+ if (zfsvfs->z_use_fuids == B_FALSE &&
+ (((mask & ATTR_UID) && IS_EPHEMERAL(vap->va_uid)) ||
+ ((mask & ATTR_GID) && IS_EPHEMERAL(vap->va_gid)) ||
+ (mask & ATTR_XVATTR))) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EINVAL));
+ }
+
+ if (mask & ATTR_SIZE && S_ISDIR(ip->i_mode)) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EISDIR));
+ }
+
+ if (mask & ATTR_SIZE && !S_ISREG(ip->i_mode) && !S_ISFIFO(ip->i_mode)) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EINVAL));
+ }
+
+ tmpxvattr = kmem_alloc(sizeof (xvattr_t), KM_SLEEP);
+ xva_init(tmpxvattr);
+
+ bulk = kmem_alloc(sizeof (sa_bulk_attr_t) * bulks, KM_SLEEP);
+ xattr_bulk = kmem_alloc(sizeof (sa_bulk_attr_t) * bulks, KM_SLEEP);
+
+ /*
+	 * For immutable files, only the immutable bit and atime may be altered.
+ */
+ if ((zp->z_pflags & ZFS_IMMUTABLE) &&
+ ((mask & (ATTR_SIZE|ATTR_UID|ATTR_GID|ATTR_MTIME|ATTR_MODE)) ||
+ ((mask & ATTR_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) {
+ err = SET_ERROR(EPERM);
+ goto out3;
+ }
+
+ if ((mask & ATTR_SIZE) && (zp->z_pflags & ZFS_READONLY)) {
+ err = SET_ERROR(EPERM);
+ goto out3;
+ }
+
+ /*
+	 * Verify that the timestamps don't overflow 32 bits.
+	 * ZFS can handle large timestamps, but 32-bit syscalls can't
+	 * handle times greater than 2039.  This check should be removed
+ * once large timestamps are fully supported.
+ */
+ if (mask & (ATTR_ATIME | ATTR_MTIME)) {
+ if (((mask & ATTR_ATIME) &&
+ TIMESPEC_OVERFLOW(&vap->va_atime)) ||
+ ((mask & ATTR_MTIME) &&
+ TIMESPEC_OVERFLOW(&vap->va_mtime))) {
+ err = SET_ERROR(EOVERFLOW);
+ goto out3;
+ }
+ }
+
+top:
+ attrzp = NULL;
+ aclp = NULL;
+
+ /* Can this be moved to before the top label? */
+ if (zfs_is_readonly(zfsvfs)) {
+ err = SET_ERROR(EROFS);
+ goto out3;
+ }
+
+ /*
+ * First validate permissions
+ */
+
+ if (mask & ATTR_SIZE) {
+ err = zfs_zaccess(zp, ACE_WRITE_DATA, 0, skipaclchk, cr);
+ if (err)
+ goto out3;
+
+ /*
+ * XXX - Note, we are not providing any open
+ * mode flags here (like FNDELAY), so we may
+ * block if there are locks present... this
+ * should be addressed in openat().
+ */
+ /* XXX - would it be OK to generate a log record here? */
+ err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE);
+ if (err)
+ goto out3;
+ }
+
+ if (mask & (ATTR_ATIME|ATTR_MTIME) ||
+ ((mask & ATTR_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) ||
+ XVA_ISSET_REQ(xvap, XAT_READONLY) ||
+ XVA_ISSET_REQ(xvap, XAT_ARCHIVE) ||
+ XVA_ISSET_REQ(xvap, XAT_OFFLINE) ||
+ XVA_ISSET_REQ(xvap, XAT_SPARSE) ||
+ XVA_ISSET_REQ(xvap, XAT_CREATETIME) ||
+ XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) {
+ need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0,
+ skipaclchk, cr);
+ }
+
+ if (mask & (ATTR_UID|ATTR_GID)) {
+ int idmask = (mask & (ATTR_UID|ATTR_GID));
+ int take_owner;
+ int take_group;
+
+ /*
+ * NOTE: even if a new mode is being set,
+ * we may clear S_ISUID/S_ISGID bits.
+ */
+
+ if (!(mask & ATTR_MODE))
+ vap->va_mode = zp->z_mode;
+
+ /*
+ * Take ownership or chgrp to group we are a member of
+ */
+
+ take_owner = (mask & ATTR_UID) && (vap->va_uid == crgetuid(cr));
+ take_group = (mask & ATTR_GID) &&
+ zfs_groupmember(zfsvfs, vap->va_gid, cr);
+
+ /*
+ * If both ATTR_UID and ATTR_GID are set then take_owner and
+ * take_group must both be set in order to allow taking
+ * ownership.
+ *
+ * Otherwise, send the check through secpolicy_vnode_setattr()
+ *
+ */
+
+ if (((idmask == (ATTR_UID|ATTR_GID)) &&
+ take_owner && take_group) ||
+ ((idmask == ATTR_UID) && take_owner) ||
+ ((idmask == ATTR_GID) && take_group)) {
+ if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0,
+ skipaclchk, cr) == 0) {
+ /*
+ * Remove setuid/setgid for non-privileged users
+ */
+ (void) secpolicy_setid_clear(vap, cr);
+ trim_mask = (mask & (ATTR_UID|ATTR_GID));
+ } else {
+ need_policy = TRUE;
+ }
+ } else {
+ need_policy = TRUE;
+ }
+ }
+
+ mutex_enter(&zp->z_lock);
+ oldva.va_mode = zp->z_mode;
+ zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid);
+ if (mask & ATTR_XVATTR) {
+ /*
+ * Update xvattr mask to include only those attributes
+ * that are actually changing.
+ *
+ * the bits will be restored prior to actually setting
+ * the attributes so the caller thinks they were set.
+ */
+ if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
+ if (xoap->xoa_appendonly !=
+ ((zp->z_pflags & ZFS_APPENDONLY) != 0)) {
+ need_policy = TRUE;
+ } else {
+ XVA_CLR_REQ(xvap, XAT_APPENDONLY);
+ XVA_SET_REQ(tmpxvattr, XAT_APPENDONLY);
+ }
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) {
+ if (xoap->xoa_projinherit !=
+ ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) {
+ need_policy = TRUE;
+ } else {
+ XVA_CLR_REQ(xvap, XAT_PROJINHERIT);
+ XVA_SET_REQ(tmpxvattr, XAT_PROJINHERIT);
+ }
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
+ if (xoap->xoa_nounlink !=
+ ((zp->z_pflags & ZFS_NOUNLINK) != 0)) {
+ need_policy = TRUE;
+ } else {
+ XVA_CLR_REQ(xvap, XAT_NOUNLINK);
+ XVA_SET_REQ(tmpxvattr, XAT_NOUNLINK);
+ }
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
+ if (xoap->xoa_immutable !=
+ ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) {
+ need_policy = TRUE;
+ } else {
+ XVA_CLR_REQ(xvap, XAT_IMMUTABLE);
+ XVA_SET_REQ(tmpxvattr, XAT_IMMUTABLE);
+ }
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
+ if (xoap->xoa_nodump !=
+ ((zp->z_pflags & ZFS_NODUMP) != 0)) {
+ need_policy = TRUE;
+ } else {
+ XVA_CLR_REQ(xvap, XAT_NODUMP);
+ XVA_SET_REQ(tmpxvattr, XAT_NODUMP);
+ }
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
+ if (xoap->xoa_av_modified !=
+ ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) {
+ need_policy = TRUE;
+ } else {
+ XVA_CLR_REQ(xvap, XAT_AV_MODIFIED);
+ XVA_SET_REQ(tmpxvattr, XAT_AV_MODIFIED);
+ }
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
+ if ((!S_ISREG(ip->i_mode) &&
+ xoap->xoa_av_quarantined) ||
+ xoap->xoa_av_quarantined !=
+ ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) {
+ need_policy = TRUE;
+ } else {
+ XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED);
+ XVA_SET_REQ(tmpxvattr, XAT_AV_QUARANTINED);
+ }
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
+ mutex_exit(&zp->z_lock);
+ err = SET_ERROR(EPERM);
+ goto out3;
+ }
+
+ if (need_policy == FALSE &&
+ (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) ||
+ XVA_ISSET_REQ(xvap, XAT_OPAQUE))) {
+ need_policy = TRUE;
+ }
+ }
+
+ mutex_exit(&zp->z_lock);
+
+ if (mask & ATTR_MODE) {
+ if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) {
+ err = secpolicy_setid_setsticky_clear(ip, vap,
+ &oldva, cr);
+ if (err)
+ goto out3;
+
+ trim_mask |= ATTR_MODE;
+ } else {
+ need_policy = TRUE;
+ }
+ }
+
+ if (need_policy) {
+ /*
+		 * If trim_mask is set then take-ownership has been granted
+		 * or write_acl is present and the user has the ability to
+		 * modify the mode.  In that case remove UID|GID and/or MODE
+		 * from the mask so that secpolicy_vnode_setattr() doesn't
+		 * revoke them.
+ */
+
+ if (trim_mask) {
+ saved_mask = vap->va_mask;
+ vap->va_mask &= ~trim_mask;
+ }
+ err = secpolicy_vnode_setattr(cr, ip, vap, &oldva, flags,
+ (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp);
+ if (err)
+ goto out3;
+
+ if (trim_mask)
+ vap->va_mask |= saved_mask;
+ }
+
+ /*
+	 * secpolicy_vnode_setattr() or the take-ownership path may have
+	 * changed va_mask.
+ */
+ mask = vap->va_mask;
+
+ if ((mask & (ATTR_UID | ATTR_GID)) || projid != ZFS_INVALID_PROJID) {
+ handle_eadir = B_TRUE;
+ err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
+ &xattr_obj, sizeof (xattr_obj));
+
+ if (err == 0 && xattr_obj) {
+ err = zfs_zget(ZTOZSB(zp), xattr_obj, &attrzp);
+ if (err)
+ goto out2;
+ }
+ if (mask & ATTR_UID) {
+ new_kuid = zfs_fuid_create(zfsvfs,
+ (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp);
+ if (new_kuid != KUID_TO_SUID(ZTOI(zp)->i_uid) &&
+ zfs_id_overquota(zfsvfs, DMU_USERUSED_OBJECT,
+ new_kuid)) {
+ if (attrzp)
+ zrele(attrzp);
+ err = SET_ERROR(EDQUOT);
+ goto out2;
+ }
+ }
+
+ if (mask & ATTR_GID) {
+ new_kgid = zfs_fuid_create(zfsvfs,
+ (uint64_t)vap->va_gid, cr, ZFS_GROUP, &fuidp);
+ if (new_kgid != KGID_TO_SGID(ZTOI(zp)->i_gid) &&
+ zfs_id_overquota(zfsvfs, DMU_GROUPUSED_OBJECT,
+ new_kgid)) {
+ if (attrzp)
+ zrele(attrzp);
+ err = SET_ERROR(EDQUOT);
+ goto out2;
+ }
+ }
+
+ if (projid != ZFS_INVALID_PROJID &&
+ zfs_id_overquota(zfsvfs, DMU_PROJECTUSED_OBJECT, projid)) {
+ if (attrzp)
+ zrele(attrzp);
+ err = EDQUOT;
+ goto out2;
+ }
+ }
+ tx = dmu_tx_create(os);
+
+ if (mask & ATTR_MODE) {
+ uint64_t pmode = zp->z_mode;
+ uint64_t acl_obj;
+ new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT);
+
+ if (ZTOZSB(zp)->z_acl_mode == ZFS_ACL_RESTRICTED &&
+ !(zp->z_pflags & ZFS_ACL_TRIVIAL)) {
+ err = EPERM;
+ goto out;
+ }
+
+ if ((err = zfs_acl_chmod_setattr(zp, &aclp, new_mode)))
+ goto out;
+
+ mutex_enter(&zp->z_lock);
+ if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) {
+ /*
+ * Are we upgrading ACL from old V0 format
+ * to V1 format?
+ */
+ if (zfsvfs->z_version >= ZPL_VERSION_FUID &&
+ zfs_znode_acl_version(zp) ==
+ ZFS_ACL_VERSION_INITIAL) {
+ dmu_tx_hold_free(tx, acl_obj, 0,
+ DMU_OBJECT_END);
+ dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
+ 0, aclp->z_acl_bytes);
+ } else {
+ dmu_tx_hold_write(tx, acl_obj, 0,
+ aclp->z_acl_bytes);
+ }
+ } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) {
+ dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
+ 0, aclp->z_acl_bytes);
+ }
+ mutex_exit(&zp->z_lock);
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
+ } else {
+ if (((mask & ATTR_XVATTR) &&
+ XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) ||
+ (projid != ZFS_INVALID_PROJID &&
+ !(zp->z_pflags & ZFS_PROJID)))
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
+ else
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+ }
+
+ if (attrzp) {
+ dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE);
+ }
+
+ fuid_dirtied = zfsvfs->z_fuid_dirty;
+ if (fuid_dirtied)
+ zfs_fuid_txhold(zfsvfs, tx);
+
+ zfs_sa_upgrade_txholds(tx, zp);
+
+ err = dmu_tx_assign(tx, TXG_WAIT);
+ if (err)
+ goto out;
+
+ count = 0;
+ /*
+ * Set each attribute requested.
+ * We group settings according to the locks they need to acquire.
+ *
+ * Note: you cannot set ctime directly, although it will be
+ * updated as a side-effect of calling this function.
+ */
+
+ if (projid != ZFS_INVALID_PROJID && !(zp->z_pflags & ZFS_PROJID)) {
+ /*
+		 * For an existing object upgraded from an older system, the
+		 * on-disk layout has no slot for the project ID attribute.
+		 * But the quota accounting logic needs to access the related
+		 * slots by offset directly, so we adjust the old object's
+		 * layout to place the project ID at a unified, fixed offset.
+ */
+ if (attrzp)
+ err = sa_add_projid(attrzp->z_sa_hdl, tx, projid);
+ if (err == 0)
+ err = sa_add_projid(zp->z_sa_hdl, tx, projid);
+
+ if (unlikely(err == EEXIST))
+ err = 0;
+ else if (err != 0)
+ goto out;
+ else
+ projid = ZFS_INVALID_PROJID;
+ }
+
+ if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE))
+ mutex_enter(&zp->z_acl_lock);
+ mutex_enter(&zp->z_lock);
+
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+ &zp->z_pflags, sizeof (zp->z_pflags));
+
+ if (attrzp) {
+ if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE))
+ mutex_enter(&attrzp->z_acl_lock);
+ mutex_enter(&attrzp->z_lock);
+ SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
+ SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags,
+ sizeof (attrzp->z_pflags));
+ if (projid != ZFS_INVALID_PROJID) {
+ attrzp->z_projid = projid;
+ SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
+ SA_ZPL_PROJID(zfsvfs), NULL, &attrzp->z_projid,
+ sizeof (attrzp->z_projid));
+ }
+ }
+
+ if (mask & (ATTR_UID|ATTR_GID)) {
+
+ if (mask & ATTR_UID) {
+ ZTOI(zp)->i_uid = SUID_TO_KUID(new_kuid);
+ new_uid = zfs_uid_read(ZTOI(zp));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
+ &new_uid, sizeof (new_uid));
+ if (attrzp) {
+ SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
+ SA_ZPL_UID(zfsvfs), NULL, &new_uid,
+ sizeof (new_uid));
+ ZTOI(attrzp)->i_uid = SUID_TO_KUID(new_uid);
+ }
+ }
+
+ if (mask & ATTR_GID) {
+ ZTOI(zp)->i_gid = SGID_TO_KGID(new_kgid);
+ new_gid = zfs_gid_read(ZTOI(zp));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs),
+ NULL, &new_gid, sizeof (new_gid));
+ if (attrzp) {
+ SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
+ SA_ZPL_GID(zfsvfs), NULL, &new_gid,
+ sizeof (new_gid));
+ ZTOI(attrzp)->i_gid = SGID_TO_KGID(new_kgid);
+ }
+ }
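+		/*
+		 * When the mode itself is not being changed, the current
+		 * mode is still added to the bulk update.  SA_ADD_BULK_ATTR()
+		 * records only the address of new_mode; the value is read
+		 * later by sa_bulk_update(), so it is safe to assign new_mode
+		 * after queueing it.
+		 */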
+ if (!(mask & ATTR_MODE)) {
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs),
+ NULL, &new_mode, sizeof (new_mode));
+ new_mode = zp->z_mode;
+ }
+ err = zfs_acl_chown_setattr(zp);
+ ASSERT(err == 0);
+ if (attrzp) {
+ err = zfs_acl_chown_setattr(attrzp);
+ ASSERT(err == 0);
+ }
+ }
+
+ if (mask & ATTR_MODE) {
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
+ &new_mode, sizeof (new_mode));
+ zp->z_mode = ZTOI(zp)->i_mode = new_mode;
+ ASSERT3P(aclp, !=, NULL);
+ err = zfs_aclset_common(zp, aclp, cr, tx);
+ ASSERT0(err);
+ if (zp->z_acl_cached)
+ zfs_acl_free(zp->z_acl_cached);
+ zp->z_acl_cached = aclp;
+ aclp = NULL;
+ }
+
+ if ((mask & ATTR_ATIME) || zp->z_atime_dirty) {
+ zp->z_atime_dirty = B_FALSE;
+ ZFS_TIME_ENCODE(&ip->i_atime, atime);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
+ &atime, sizeof (atime));
+ }
+
+ if (mask & (ATTR_MTIME | ATTR_SIZE)) {
+ ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
+ ZTOI(zp)->i_mtime = zpl_inode_timestamp_truncate(
+ vap->va_mtime, ZTOI(zp));
+
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
+ mtime, sizeof (mtime));
+ }
+
+ if (mask & (ATTR_CTIME | ATTR_SIZE)) {
+ ZFS_TIME_ENCODE(&vap->va_ctime, ctime);
+ ZTOI(zp)->i_ctime = zpl_inode_timestamp_truncate(vap->va_ctime,
+ ZTOI(zp));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
+ ctime, sizeof (ctime));
+ }
+
+ if (projid != ZFS_INVALID_PROJID) {
+ zp->z_projid = projid;
+ SA_ADD_BULK_ATTR(bulk, count,
+ SA_ZPL_PROJID(zfsvfs), NULL, &zp->z_projid,
+ sizeof (zp->z_projid));
+ }
+
+ if (attrzp && mask) {
+ SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
+ SA_ZPL_CTIME(zfsvfs), NULL, &ctime,
+ sizeof (ctime));
+ }
+
+ /*
+ * Do this after setting timestamps to prevent timestamp
+ * update from toggling bit
+ */
+
+ if (xoap && (mask & ATTR_XVATTR)) {
+
+ /*
+		 * Restore the trimmed-off mask bits so that the return
+		 * masks can be set for the caller.
+ */
+
+ if (XVA_ISSET_REQ(tmpxvattr, XAT_APPENDONLY)) {
+ XVA_SET_REQ(xvap, XAT_APPENDONLY);
+ }
+ if (XVA_ISSET_REQ(tmpxvattr, XAT_NOUNLINK)) {
+ XVA_SET_REQ(xvap, XAT_NOUNLINK);
+ }
+ if (XVA_ISSET_REQ(tmpxvattr, XAT_IMMUTABLE)) {
+ XVA_SET_REQ(xvap, XAT_IMMUTABLE);
+ }
+ if (XVA_ISSET_REQ(tmpxvattr, XAT_NODUMP)) {
+ XVA_SET_REQ(xvap, XAT_NODUMP);
+ }
+ if (XVA_ISSET_REQ(tmpxvattr, XAT_AV_MODIFIED)) {
+ XVA_SET_REQ(xvap, XAT_AV_MODIFIED);
+ }
+ if (XVA_ISSET_REQ(tmpxvattr, XAT_AV_QUARANTINED)) {
+ XVA_SET_REQ(xvap, XAT_AV_QUARANTINED);
+ }
+ if (XVA_ISSET_REQ(tmpxvattr, XAT_PROJINHERIT)) {
+ XVA_SET_REQ(xvap, XAT_PROJINHERIT);
+ }
+
+ if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
+ ASSERT(S_ISREG(ip->i_mode));
+
+ zfs_xvattr_set(zp, xvap, tx);
+ }
+
+ if (fuid_dirtied)
+ zfs_fuid_sync(zfsvfs, tx);
+
+ if (mask != 0)
+ zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp);
+
+ mutex_exit(&zp->z_lock);
+ if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE))
+ mutex_exit(&zp->z_acl_lock);
+
+ if (attrzp) {
+ if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE))
+ mutex_exit(&attrzp->z_acl_lock);
+ mutex_exit(&attrzp->z_lock);
+ }
+out:
+ if (err == 0 && xattr_count > 0) {
+ err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk,
+ xattr_count, tx);
+ ASSERT(err2 == 0);
+ }
+
+ if (aclp)
+ zfs_acl_free(aclp);
+
+ if (fuidp) {
+ zfs_fuid_info_free(fuidp);
+ fuidp = NULL;
+ }
+
+ if (err) {
+ dmu_tx_abort(tx);
+ if (attrzp)
+ zrele(attrzp);
+ if (err == ERESTART)
+ goto top;
+ } else {
+ if (count > 0)
+ err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
+ dmu_tx_commit(tx);
+ if (attrzp) {
+ if (err2 == 0 && handle_eadir)
+ err2 = zfs_setattr_dir(attrzp);
+ zrele(attrzp);
+ }
+ zfs_znode_update_vfs(zp);
+ }
+
+out2:
+ if (os->os_sync == ZFS_SYNC_ALWAYS)
+ zil_commit(zilog, 0);
+
+out3:
+ kmem_free(xattr_bulk, sizeof (sa_bulk_attr_t) * bulks);
+ kmem_free(bulk, sizeof (sa_bulk_attr_t) * bulks);
+ kmem_free(tmpxvattr, sizeof (xvattr_t));
+ ZFS_EXIT(zfsvfs);
+ return (err);
+}
+
+typedef struct zfs_zlock {
+ krwlock_t *zl_rwlock; /* lock we acquired */
+ znode_t *zl_znode; /* znode we held */
+ struct zfs_zlock *zl_next; /* next in list */
+} zfs_zlock_t;
+
+/*
+ * Drop locks and release vnodes that were held by zfs_rename_lock().
+ */
+static void
+zfs_rename_unlock(zfs_zlock_t **zlpp)
+{
+ zfs_zlock_t *zl;
+
+ while ((zl = *zlpp) != NULL) {
+ if (zl->zl_znode != NULL)
+ zfs_zrele_async(zl->zl_znode);
+ rw_exit(zl->zl_rwlock);
+ *zlpp = zl->zl_next;
+ kmem_free(zl, sizeof (*zl));
+ }
+}
+
+/*
+ * Search back through the directory tree, using the ".." entries.
+ * Lock each directory in the chain to prevent concurrent renames.
+ * Fail any attempt to move a directory into one of its own descendants.
+ * XXX - z_parent_lock can overlap with map or grow locks
+ */
+static int
+zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp)
+{
+ zfs_zlock_t *zl;
+ znode_t *zp = tdzp;
+ uint64_t rootid = ZTOZSB(zp)->z_root;
+ uint64_t oidp = zp->z_id;
+ krwlock_t *rwlp = &szp->z_parent_lock;
+ krw_t rw = RW_WRITER;
+
+ /*
+ * First pass write-locks szp and compares to zp->z_id.
+ * Later passes read-lock zp and compare to zp->z_parent.
+ */
+ do {
+ if (!rw_tryenter(rwlp, rw)) {
+ /*
+ * Another thread is renaming in this path.
+ * Note that if we are a WRITER, we don't have any
+ * parent_locks held yet.
+ */
+ if (rw == RW_READER && zp->z_id > szp->z_id) {
+ /*
+ * Drop our locks and restart
+ */
+ zfs_rename_unlock(&zl);
+ *zlpp = NULL;
+ zp = tdzp;
+ oidp = zp->z_id;
+ rwlp = &szp->z_parent_lock;
+ rw = RW_WRITER;
+ continue;
+ } else {
+ /*
+ * Wait for other thread to drop its locks
+ */
+ rw_enter(rwlp, rw);
+ }
+ }
+
+ zl = kmem_alloc(sizeof (*zl), KM_SLEEP);
+ zl->zl_rwlock = rwlp;
+ zl->zl_znode = NULL;
+ zl->zl_next = *zlpp;
+ *zlpp = zl;
+
+ if (oidp == szp->z_id) /* We're a descendant of szp */
+ return (SET_ERROR(EINVAL));
+
+ if (oidp == rootid) /* We've hit the top */
+ return (0);
+
+ if (rw == RW_READER) { /* i.e. not the first pass */
+ int error = zfs_zget(ZTOZSB(zp), oidp, &zp);
+ if (error)
+ return (error);
+ zl->zl_znode = zp;
+ }
+ (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(ZTOZSB(zp)),
+ &oidp, sizeof (oidp));
+ rwlp = &zp->z_parent_lock;
+ rw = RW_READER;
+
+ } while (zp->z_id != sdzp->z_id);
+
+ return (0);
+}
+
+/*
+ * Move an entry from the provided source directory to the target
+ * directory. Change the entry name as indicated.
+ *
+ * IN: sdzp - Source directory containing the "old entry".
+ * snm - Old entry name.
+ * tdzp - Target directory to contain the "new entry".
+ * tnm - New entry name.
+ * cr - credentials of caller.
+ * flags - case flags
+ *
+ * RETURN: 0 on success, error code on failure.
+ *
+ * Timestamps:
+ * sdzp,tdzp - ctime|mtime updated
+ */
+/*ARGSUSED*/
+int
+zfs_rename(znode_t *sdzp, char *snm, znode_t *tdzp, char *tnm,
+ cred_t *cr, int flags)
+{
+ znode_t *szp, *tzp;
+ zfsvfs_t *zfsvfs = ZTOZSB(sdzp);
+ zilog_t *zilog;
+ zfs_dirlock_t *sdl, *tdl;
+ dmu_tx_t *tx;
+ zfs_zlock_t *zl;
+ int cmp, serr, terr;
+ int error = 0;
+ int zflg = 0;
+ boolean_t waited = B_FALSE;
+
+ if (snm == NULL || tnm == NULL)
+ return (SET_ERROR(EINVAL));
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(sdzp);
+ zilog = zfsvfs->z_log;
+
+ ZFS_VERIFY_ZP(tdzp);
+
+ /*
+ * We check i_sb because snapshots and the ctldir must have different
+ * super blocks.
+ */
+ if (ZTOI(tdzp)->i_sb != ZTOI(sdzp)->i_sb ||
+ zfsctl_is_node(ZTOI(tdzp))) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EXDEV));
+ }
+
+ if (zfsvfs->z_utf8 && u8_validate(tnm,
+ strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EILSEQ));
+ }
+
+ if (flags & FIGNORECASE)
+ zflg |= ZCILOOK;
+
+top:
+ szp = NULL;
+ tzp = NULL;
+ zl = NULL;
+
+ /*
+ * This is to prevent the creation of links into attribute space
+ * by renaming a linked file into/outof an attribute directory.
+	 * by renaming a linked file into/out of an attribute directory.
+ */
+ if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EINVAL));
+ }
+
+ /*
+ * Lock source and target directory entries. To prevent deadlock,
+ * a lock ordering must be defined. We lock the directory with
+ * the smallest object id first, or if it's a tie, the one with
+ * the lexically first name.
+ */
+ if (sdzp->z_id < tdzp->z_id) {
+ cmp = -1;
+ } else if (sdzp->z_id > tdzp->z_id) {
+ cmp = 1;
+ } else {
+ /*
+ * First compare the two name arguments without
+ * considering any case folding.
+ */
+ int nofold = (zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER);
+
+ cmp = u8_strcmp(snm, tnm, 0, nofold, U8_UNICODE_LATEST, &error);
+ ASSERT(error == 0 || !zfsvfs->z_utf8);
+ if (cmp == 0) {
+ /*
+ * POSIX: "If the old argument and the new argument
+ * both refer to links to the same existing file,
+ * the rename() function shall return successfully
+ * and perform no other action."
+ */
+ ZFS_EXIT(zfsvfs);
+ return (0);
+ }
+ /*
+ * If the file system is case-folding, then we may
+ * have some more checking to do. A case-folding file
+ * system is either supporting mixed case sensitivity
+ * access or is completely case-insensitive. Note
+ * that the file system is always case preserving.
+ *
+ * In mixed sensitivity mode case sensitive behavior
+ * is the default. FIGNORECASE must be used to
+ * explicitly request case insensitive behavior.
+ *
+ * If the source and target names provided differ only
+ * by case (e.g., a request to rename 'tim' to 'Tim'),
+ * we will treat this as a special case in the
+ * case-insensitive mode: as long as the source name
+ * is an exact match, we will allow this to proceed as
+ * a name-change request.
+ */
+ if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
+ (zfsvfs->z_case == ZFS_CASE_MIXED &&
+ flags & FIGNORECASE)) &&
+ u8_strcmp(snm, tnm, 0, zfsvfs->z_norm, U8_UNICODE_LATEST,
+ &error) == 0) {
+ /*
+ * case preserving rename request, require exact
+ * name matches
+ */
+ zflg |= ZCIEXACT;
+ zflg &= ~ZCILOOK;
+ }
+ }
+
+ /*
+ * If the source and destination directories are the same, we should
+ * grab the z_name_lock of that directory only once.
+ */
+ if (sdzp == tdzp) {
+ zflg |= ZHAVELOCK;
+ rw_enter(&sdzp->z_name_lock, RW_READER);
+ }
+
+ if (cmp < 0) {
+ serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp,
+ ZEXISTS | zflg, NULL, NULL);
+ terr = zfs_dirent_lock(&tdl,
+ tdzp, tnm, &tzp, ZRENAMING | zflg, NULL, NULL);
+ } else {
+ terr = zfs_dirent_lock(&tdl,
+ tdzp, tnm, &tzp, zflg, NULL, NULL);
+ serr = zfs_dirent_lock(&sdl,
+ sdzp, snm, &szp, ZEXISTS | ZRENAMING | zflg,
+ NULL, NULL);
+ }
+
+ if (serr) {
+ /*
+ * Source entry invalid or not there.
+ */
+ if (!terr) {
+ zfs_dirent_unlock(tdl);
+ if (tzp)
+ zrele(tzp);
+ }
+
+ if (sdzp == tdzp)
+ rw_exit(&sdzp->z_name_lock);
+
+ if (strcmp(snm, "..") == 0)
+ serr = EINVAL;
+ ZFS_EXIT(zfsvfs);
+ return (serr);
+ }
+ if (terr) {
+ zfs_dirent_unlock(sdl);
+ zrele(szp);
+
+ if (sdzp == tdzp)
+ rw_exit(&sdzp->z_name_lock);
+
+ if (strcmp(tnm, "..") == 0)
+ terr = EINVAL;
+ ZFS_EXIT(zfsvfs);
+ return (terr);
+ }
+
+ /*
+	 * If we are using project inheritance, meaning the directory has
+	 * ZFS_PROJINHERIT set, then its descendant directories inherit not
+	 * only the project ID, but also the ZFS_PROJINHERIT flag.  In that
+	 * case, we only allow renames into our tree when the project IDs
+	 * are the same.
+ */
+ if (tdzp->z_pflags & ZFS_PROJINHERIT &&
+ tdzp->z_projid != szp->z_projid) {
+ error = SET_ERROR(EXDEV);
+ goto out;
+ }
+
+ /*
+ * Must have write access at the source to remove the old entry
+ * and write access at the target to create the new entry.
+ * Note that if target and source are the same, this can be
+ * done in a single check.
+ */
+
+ if ((error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr)))
+ goto out;
+
+ if (S_ISDIR(ZTOI(szp)->i_mode)) {
+ /*
+ * Check to make sure rename is valid.
+ * Can't do a move like this: /usr/a/b to /usr/a/b/c/d
+ */
+ if ((error = zfs_rename_lock(szp, tdzp, sdzp, &zl)))
+ goto out;
+ }
+
+ /*
+ * Does target exist?
+ */
+ if (tzp) {
+ /*
+ * Source and target must be the same type.
+ */
+ if (S_ISDIR(ZTOI(szp)->i_mode)) {
+ if (!S_ISDIR(ZTOI(tzp)->i_mode)) {
+ error = SET_ERROR(ENOTDIR);
+ goto out;
+ }
+ } else {
+ if (S_ISDIR(ZTOI(tzp)->i_mode)) {
+ error = SET_ERROR(EISDIR);
+ goto out;
+ }
+ }
+ /*
+ * POSIX dictates that when the source and target
+ * entries refer to the same file object, rename
+ * must do nothing and exit without error.
+ */
+ if (szp->z_id == tzp->z_id) {
+ error = 0;
+ goto out;
+ }
+ }
+
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
+ dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE);
+ dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm);
+ dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
+ if (sdzp != tdzp) {
+ dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE);
+ zfs_sa_upgrade_txholds(tx, tdzp);
+ }
+ if (tzp) {
+ dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE);
+ zfs_sa_upgrade_txholds(tx, tzp);
+ }
+
+ zfs_sa_upgrade_txholds(tx, szp);
+ dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
+ error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
+ if (error) {
+ if (zl != NULL)
+ zfs_rename_unlock(&zl);
+ zfs_dirent_unlock(sdl);
+ zfs_dirent_unlock(tdl);
+
+ if (sdzp == tdzp)
+ rw_exit(&sdzp->z_name_lock);
+
+ if (error == ERESTART) {
+ waited = B_TRUE;
+ dmu_tx_wait(tx);
+ dmu_tx_abort(tx);
+ zrele(szp);
+ if (tzp)
+ zrele(tzp);
+ goto top;
+ }
+ dmu_tx_abort(tx);
+ zrele(szp);
+ if (tzp)
+ zrele(tzp);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ if (tzp) /* Attempt to remove the existing target */
+ error = zfs_link_destroy(tdl, tzp, tx, zflg, NULL);
+
+ if (error == 0) {
+ error = zfs_link_create(tdl, szp, tx, ZRENAMING);
+ if (error == 0) {
+ szp->z_pflags |= ZFS_AV_MODIFIED;
+ if (tdzp->z_pflags & ZFS_PROJINHERIT)
+ szp->z_pflags |= ZFS_PROJINHERIT;
+
+ error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs),
+ (void *)&szp->z_pflags, sizeof (uint64_t), tx);
+ ASSERT0(error);
+
+ error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL);
+ if (error == 0) {
+ zfs_log_rename(zilog, tx, TX_RENAME |
+ (flags & FIGNORECASE ? TX_CI : 0), sdzp,
+ sdl->dl_name, tdzp, tdl->dl_name, szp);
+ } else {
+ /*
+ * At this point, we have successfully created
+ * the target name, but have failed to remove
+ * the source name. Since the create was done
+ * with the ZRENAMING flag, there are
+ * complications; for one, the link count is
+ * wrong. The easiest way to deal with this
+ * is to remove the newly created target, and
+ * return the original error. This must
+ * succeed; fortunately, it is very unlikely to
+ * fail, since we just created it.
+ */
+ VERIFY3U(zfs_link_destroy(tdl, szp, tx,
+ ZRENAMING, NULL), ==, 0);
+ }
+ } else {
+ /*
+			 * If we had removed the existing target, the
+			 * subsequent call to zfs_link_create() to add back
+			 * the same entry, but with the new dnode (szp),
+			 * should not fail.
+ */
+ ASSERT(tzp == NULL);
+ }
+ }
+
+ dmu_tx_commit(tx);
+out:
+ if (zl != NULL)
+ zfs_rename_unlock(&zl);
+
+ zfs_dirent_unlock(sdl);
+ zfs_dirent_unlock(tdl);
+
+ zfs_znode_update_vfs(sdzp);
+ if (sdzp == tdzp)
+ rw_exit(&sdzp->z_name_lock);
+
+ if (sdzp != tdzp)
+ zfs_znode_update_vfs(tdzp);
+
+ zfs_znode_update_vfs(szp);
+ zrele(szp);
+ if (tzp) {
+ zfs_znode_update_vfs(tzp);
+ zrele(tzp);
+ }
+
+ if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ zil_commit(zilog, 0);
+
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+/*
+ * Insert the indicated symbolic reference entry into the directory.
+ *
+ * IN: dzp - Directory to contain new symbolic link.
+ *	name	- Name of directory entry in dzp.
+ * vap - Attributes of new entry.
+ * link - Name for new symlink entry.
+ * cr - credentials of caller.
+ * flags - case flags
+ *
+ * OUT: zpp - Znode for new symbolic link.
+ *
+ * RETURN: 0 on success, error code on failure.
+ *
+ * Timestamps:
+ *	dzp - ctime|mtime updated
+ */
+/*ARGSUSED*/
+int
+zfs_symlink(znode_t *dzp, char *name, vattr_t *vap, char *link,
+ znode_t **zpp, cred_t *cr, int flags)
+{
+ znode_t *zp;
+ zfs_dirlock_t *dl;
+ dmu_tx_t *tx;
+ zfsvfs_t *zfsvfs = ZTOZSB(dzp);
+ zilog_t *zilog;
+ uint64_t len = strlen(link);
+ int error;
+ int zflg = ZNEW;
+ zfs_acl_ids_t acl_ids;
+ boolean_t fuid_dirtied;
+ uint64_t txtype = TX_SYMLINK;
+ boolean_t waited = B_FALSE;
+
+ ASSERT(S_ISLNK(vap->va_mode));
+
+ if (name == NULL)
+ return (SET_ERROR(EINVAL));
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(dzp);
+ zilog = zfsvfs->z_log;
+
+ if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
+ NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EILSEQ));
+ }
+ if (flags & FIGNORECASE)
+ zflg |= ZCILOOK;
+
+ if (len > MAXPATHLEN) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(ENAMETOOLONG));
+ }
+
+ if ((error = zfs_acl_ids_create(dzp, 0,
+ vap, cr, NULL, &acl_ids)) != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+top:
+ *zpp = NULL;
+
+ /*
+ * Attempt to lock directory; fail if entry already exists.
+ */
+ error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL);
+ if (error) {
+ zfs_acl_ids_free(&acl_ids);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) {
+ zfs_acl_ids_free(&acl_ids);
+ zfs_dirent_unlock(dl);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, ZFS_DEFAULT_PROJID)) {
+ zfs_acl_ids_free(&acl_ids);
+ zfs_dirent_unlock(dl);
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EDQUOT));
+ }
+ tx = dmu_tx_create(zfsvfs->z_os);
+ fuid_dirtied = zfsvfs->z_fuid_dirty;
+ dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
+ dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
+ dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
+ ZFS_SA_BASE_ATTR_SIZE + len);
+ dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
+ if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
+ dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
+ acl_ids.z_aclp->z_acl_bytes);
+ }
+ if (fuid_dirtied)
+ zfs_fuid_txhold(zfsvfs, tx);
+ error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
+ if (error) {
+ zfs_dirent_unlock(dl);
+ if (error == ERESTART) {
+ waited = B_TRUE;
+ dmu_tx_wait(tx);
+ dmu_tx_abort(tx);
+ goto top;
+ }
+ zfs_acl_ids_free(&acl_ids);
+ dmu_tx_abort(tx);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ /*
+ * Create a new object for the symlink.
+	 * For version 4 ZPL datasets the symlink will be an SA attribute.
+ */
+ zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
+
+ if (fuid_dirtied)
+ zfs_fuid_sync(zfsvfs, tx);
+
+ mutex_enter(&zp->z_lock);
+ if (zp->z_is_sa)
+ error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs),
+ link, len, tx);
+ else
+ zfs_sa_symlink(zp, link, len, tx);
+ mutex_exit(&zp->z_lock);
+
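+	/*
+	 * The size of a symlink is the length of its target path; record
+	 * it both in the in-core znode and in the SA.
+	 */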
+ zp->z_size = len;
+ (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
+ &zp->z_size, sizeof (zp->z_size), tx);
+ /*
+ * Insert the new object into the directory.
+ */
+ error = zfs_link_create(dl, zp, tx, ZNEW);
+ if (error != 0) {
+ zfs_znode_delete(zp, tx);
+ remove_inode_hash(ZTOI(zp));
+ } else {
+ if (flags & FIGNORECASE)
+ txtype |= TX_CI;
+ zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
+
+ zfs_znode_update_vfs(dzp);
+ zfs_znode_update_vfs(zp);
+ }
+
+ zfs_acl_ids_free(&acl_ids);
+
+ dmu_tx_commit(tx);
+
+ zfs_dirent_unlock(dl);
+
+ if (error == 0) {
+ *zpp = zp;
+
+ if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ zil_commit(zilog, 0);
+ } else {
+ zrele(zp);
+ }
+
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+/*
+ * Return, in the buffer contained in the provided uio structure,
+ * the symbolic path referred to by ip.
+ *
+ * IN: ip - inode of symbolic link
+ * uio - structure to contain the link path.
+ * cr - credentials of caller.
+ *
+ * RETURN: 0 if success
+ * error code if failure
+ *
+ * Timestamps:
+ * ip - atime updated
+ */
+/* ARGSUSED */
+int
+zfs_readlink(struct inode *ip, zfs_uio_t *uio, cred_t *cr)
+{
+ znode_t *zp = ITOZ(ip);
+ zfsvfs_t *zfsvfs = ITOZSB(ip);
+ int error;
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ mutex_enter(&zp->z_lock);
+ if (zp->z_is_sa)
+ error = sa_lookup_uio(zp->z_sa_hdl,
+ SA_ZPL_SYMLINK(zfsvfs), uio);
+ else
+ error = zfs_sa_readlink(zp, uio);
+ mutex_exit(&zp->z_lock);
+
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+/*
+ * Insert a new entry into directory tdzp referencing szp.
+ *
+ * IN: tdzp - Directory to contain new entry.
+ * szp - znode of new entry.
+ * name - name of new entry.
+ * cr - credentials of caller.
+ * flags - case flags.
+ *
+ * RETURN: 0 if success
+ * error code if failure
+ *
+ * Timestamps:
+ * tdzp - ctime|mtime updated
+ * szp - ctime updated
+ */
+/* ARGSUSED */
+int
+zfs_link(znode_t *tdzp, znode_t *szp, char *name, cred_t *cr,
+ int flags)
+{
+ struct inode *sip = ZTOI(szp);
+ znode_t *tzp;
+ zfsvfs_t *zfsvfs = ZTOZSB(tdzp);
+ zilog_t *zilog;
+ zfs_dirlock_t *dl;
+ dmu_tx_t *tx;
+ int error;
+ int zf = ZNEW;
+ uint64_t parent;
+ uid_t owner;
+ boolean_t waited = B_FALSE;
+ boolean_t is_tmpfile = 0;
+ uint64_t txg;
+#ifdef HAVE_TMPFILE
+ is_tmpfile = (sip->i_nlink == 0 && (sip->i_state & I_LINKABLE));
+#endif
+ ASSERT(S_ISDIR(ZTOI(tdzp)->i_mode));
+
+ if (name == NULL)
+ return (SET_ERROR(EINVAL));
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(tdzp);
+ zilog = zfsvfs->z_log;
+
+ /*
+ * POSIX dictates that we return EPERM here.
+ * Better choices include ENOTSUP or EISDIR.
+ */
+ if (S_ISDIR(sip->i_mode)) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EPERM));
+ }
+
+ ZFS_VERIFY_ZP(szp);
+
+ /*
+	 * If we are using project inheritance, meaning the directory has
+	 * ZFS_PROJINHERIT set, then its descendant directories inherit not
+	 * only the project ID, but also the ZFS_PROJINHERIT flag.  In that
+	 * case, we only allow hard link creation in our tree when the
+	 * project IDs are the same.
+ */
+ if (tdzp->z_pflags & ZFS_PROJINHERIT &&
+ tdzp->z_projid != szp->z_projid) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EXDEV));
+ }
+
+ /*
+ * We check i_sb because snapshots and the ctldir must have different
+ * super blocks.
+ */
+ if (sip->i_sb != ZTOI(tdzp)->i_sb || zfsctl_is_node(sip)) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EXDEV));
+ }
+
+ /* Prevent links to .zfs/shares files */
+
+ if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
+ &parent, sizeof (uint64_t))) != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+ if (parent == zfsvfs->z_shares_dir) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EPERM));
+ }
+
+ if (zfsvfs->z_utf8 && u8_validate(name,
+ strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EILSEQ));
+ }
+ if (flags & FIGNORECASE)
+ zf |= ZCILOOK;
+
+ /*
+ * We do not support links between attributes and non-attributes
+ * because of the potential security risk of creating links
+ * into "normal" file space in order to circumvent restrictions
+ * imposed in attribute space.
+ */
+ if ((szp->z_pflags & ZFS_XATTR) != (tdzp->z_pflags & ZFS_XATTR)) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EINVAL));
+ }
+
+ owner = zfs_fuid_map_id(zfsvfs, KUID_TO_SUID(sip->i_uid),
+ cr, ZFS_OWNER);
+ if (owner != crgetuid(cr) && secpolicy_basic_link(cr) != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EPERM));
+ }
+
+ if ((error = zfs_zaccess(tdzp, ACE_ADD_FILE, 0, B_FALSE, cr))) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+top:
+ /*
+ * Attempt to lock directory; fail if entry already exists.
+ */
+ error = zfs_dirent_lock(&dl, tdzp, name, &tzp, zf, NULL, NULL);
+ if (error) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
+ dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, name);
+ if (is_tmpfile)
+ dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
+
+ zfs_sa_upgrade_txholds(tx, szp);
+ zfs_sa_upgrade_txholds(tx, tdzp);
+ error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
+ if (error) {
+ zfs_dirent_unlock(dl);
+ if (error == ERESTART) {
+ waited = B_TRUE;
+ dmu_tx_wait(tx);
+ dmu_tx_abort(tx);
+ goto top;
+ }
+ dmu_tx_abort(tx);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+ /* unmark z_unlinked so zfs_link_create will not reject */
+ if (is_tmpfile)
+ szp->z_unlinked = B_FALSE;
+ error = zfs_link_create(dl, szp, tx, 0);
+
+ if (error == 0) {
+ uint64_t txtype = TX_LINK;
+ /*
+		 * A tmpfile is created to be in z_unlinkedobj, so remove it
+		 * from there.  Also, we don't log to the ZIL, because all
+		 * previous file operations on the tmpfile are ignored by the
+		 * ZIL.  Instead we always wait for the txg to sync to make
+		 * sure all previous operations are sync safe.
+ */
+ if (is_tmpfile) {
+ VERIFY(zap_remove_int(zfsvfs->z_os,
+ zfsvfs->z_unlinkedobj, szp->z_id, tx) == 0);
+ } else {
+ if (flags & FIGNORECASE)
+ txtype |= TX_CI;
+ zfs_log_link(zilog, tx, txtype, tdzp, szp, name);
+ }
+ } else if (is_tmpfile) {
+		/* restore z_unlinked since linking failed */
+ szp->z_unlinked = B_TRUE;
+ }
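+	/*
+	 * Remember the txg of this transaction; for tmpfiles we wait below
+	 * for that txg to sync instead of logging to the ZIL.
+	 */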
+ txg = dmu_tx_get_txg(tx);
+ dmu_tx_commit(tx);
+
+ zfs_dirent_unlock(dl);
+
+ if (!is_tmpfile && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ zil_commit(zilog, 0);
+
+ if (is_tmpfile && zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED)
+ txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), txg);
+
+ zfs_znode_update_vfs(tdzp);
+ zfs_znode_update_vfs(szp);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+static void
+zfs_putpage_commit_cb(void *arg)
+{
+ struct page *pp = arg;
+
+ ClearPageError(pp);
+ end_page_writeback(pp);
+}
+
+/*
+ * Push a page out to disk.  Once the page is on stable storage the
+ * registered commit callback will be run as notification of completion.
+ *
+ * IN:	ip	- inode the page belongs to.
+ * pp - page to push (page is locked)
+ * wbc - writeback control data
+ *
+ * RETURN: 0 if success
+ * error code if failure
+ *
+ * Timestamps:
+ * ip - ctime|mtime updated
+ */
+/* ARGSUSED */
+int
+zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc)
+{
+ znode_t *zp = ITOZ(ip);
+ zfsvfs_t *zfsvfs = ITOZSB(ip);
+ loff_t offset;
+ loff_t pgoff;
+ unsigned int pglen;
+ dmu_tx_t *tx;
+ caddr_t va;
+ int err = 0;
+ uint64_t mtime[2], ctime[2];
+ sa_bulk_attr_t bulk[3];
+ int cnt = 0;
+ struct address_space *mapping;
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ ASSERT(PageLocked(pp));
+
+ pgoff = page_offset(pp); /* Page byte-offset in file */
+ offset = i_size_read(ip); /* File length in bytes */
+ pglen = MIN(PAGE_SIZE, /* Page length in bytes */
+ P2ROUNDUP(offset, PAGE_SIZE)-pgoff);
+
+ /* Page is beyond end of file */
+ if (pgoff >= offset) {
+ unlock_page(pp);
+ ZFS_EXIT(zfsvfs);
+ return (0);
+ }
+
+ /* Truncate page length to end of file */
+ if (pgoff + pglen > offset)
+ pglen = offset - pgoff;
+
+#if 0
+ /*
+	 * FIXME: mmap writes are currently allowed to exceed the quota.
+	 * The correct fix is to register a page_mkwrite() handler to count
+	 * the page against its quota when it is about to be dirtied.
+ */
+ if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT,
+ KUID_TO_SUID(ip->i_uid)) ||
+ zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT,
+ KGID_TO_SGID(ip->i_gid)) ||
+ (zp->z_projid != ZFS_DEFAULT_PROJID &&
+ zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT,
+ zp->z_projid))) {
+ err = EDQUOT;
+ }
+#endif
+
+ /*
+ * The ordering here is critical and must adhere to the following
+ * rules in order to avoid deadlocking in either zfs_read() or
+ * zfs_free_range() due to a lock inversion.
+ *
+ * 1) The page must be unlocked prior to acquiring the range lock.
+ * This is critical because zfs_read() calls find_lock_page()
+ * which may block on the page lock while holding the range lock.
+ *
+ * 2) Before setting or clearing write back on a page the range lock
+ * must be held in order to prevent a lock inversion with the
+ * zfs_free_range() function.
+ *
+ * This presents a problem because upon entering this function the
+ * page lock is already held. To safely acquire the range lock the
+ * page lock must be dropped. This creates a window where another
+ * process could truncate, invalidate, dirty, or write out the page.
+ *
+ * Therefore, after successfully reacquiring the range and page locks
+ * the current page state is checked. In the common case everything
+ * will be as is expected and it can be written out. However, if
+ * the page state has changed it must be handled accordingly.
+ */
+ mapping = pp->mapping;
+ redirty_page_for_writepage(wbc, pp);
+ unlock_page(pp);
+
+ zfs_locked_range_t *lr = zfs_rangelock_enter(&zp->z_rangelock,
+ pgoff, pglen, RL_WRITER);
+ lock_page(pp);
+
+ /* Page mapping changed or it was no longer dirty, we're done */
+ if (unlikely((mapping != pp->mapping) || !PageDirty(pp))) {
+ unlock_page(pp);
+ zfs_rangelock_exit(lr);
+ ZFS_EXIT(zfsvfs);
+ return (0);
+ }
+
+ /* Another process started writeback; block if required */
+ if (PageWriteback(pp)) {
+ unlock_page(pp);
+ zfs_rangelock_exit(lr);
+
+ if (wbc->sync_mode != WB_SYNC_NONE) {
+ if (PageWriteback(pp))
+ wait_on_page_bit(pp, PG_writeback);
+ }
+
+ ZFS_EXIT(zfsvfs);
+ return (0);
+ }
+
+ /* Clear the dirty flag now that the required locks are held */
+ if (!clear_page_dirty_for_io(pp)) {
+ unlock_page(pp);
+ zfs_rangelock_exit(lr);
+ ZFS_EXIT(zfsvfs);
+ return (0);
+ }
+
+ /*
+ * Counterpart for redirty_page_for_writepage() above. This page
+ * was in fact not skipped and should not be counted as if it were.
+ */
+ wbc->pages_skipped--;
+ set_page_writeback(pp);
+ unlock_page(pp);
+
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_write(tx, zp->z_id, pgoff, pglen);
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+ zfs_sa_upgrade_txholds(tx, zp);
+
+ err = dmu_tx_assign(tx, TXG_NOWAIT);
+ if (err != 0) {
+ if (err == ERESTART)
+ dmu_tx_wait(tx);
+
+ dmu_tx_abort(tx);
+ __set_page_dirty_nobuffers(pp);
+ ClearPageError(pp);
+ end_page_writeback(pp);
+ zfs_rangelock_exit(lr);
+ ZFS_EXIT(zfsvfs);
+ return (err);
+ }
+
+ va = kmap(pp);
+ ASSERT3U(pglen, <=, PAGE_SIZE);
+ dmu_write(zfsvfs->z_os, zp->z_id, pgoff, pglen, va, tx);
+ kunmap(pp);
+
+ SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
+ SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
+ SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_FLAGS(zfsvfs), NULL,
+ &zp->z_pflags, 8);
+
+ /* Preserve the mtime and ctime provided by the inode */
+ ZFS_TIME_ENCODE(&ip->i_mtime, mtime);
+ ZFS_TIME_ENCODE(&ip->i_ctime, ctime);
+ zp->z_atime_dirty = B_FALSE;
+ zp->z_seq++;
+
+ err = sa_bulk_update(zp->z_sa_hdl, bulk, cnt, tx);
+
+ zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, pgoff, pglen, 0,
+ zfs_putpage_commit_cb, pp);
+ dmu_tx_commit(tx);
+
+ zfs_rangelock_exit(lr);
+
+ if (wbc->sync_mode != WB_SYNC_NONE) {
+ /*
+ * Note that this is rarely called under writepages(), because
+ * writepages() normally handles the entire commit for
+ * performance reasons.
+ */
+ zil_commit(zfsvfs->z_log, zp->z_id);
+ }
+
+ ZFS_EXIT(zfsvfs);
+ return (err);
+}
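+
+/*
+ * Illustrative only (not part of the zpl_* glue in this tree): a minimal
+ * sketch of how a Linux ->writepage style callback could drive the
+ * function above. The wrapper name is hypothetical.
+ *
+ *	static int example_writepage(struct page *pp,
+ *	    struct writeback_control *wbc)
+ *	{
+ *		struct inode *ip = pp->mapping->host;
+ *
+ *		return (zfs_putpage(ip, pp, wbc));
+ *	}
+ */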
+
+/*
+ * Update the system attributes when the inode has been dirtied. For the
+ * moment we only update the mode, atime, mtime, and ctime.
+ */
+int
+zfs_dirty_inode(struct inode *ip, int flags)
+{
+ znode_t *zp = ITOZ(ip);
+ zfsvfs_t *zfsvfs = ITOZSB(ip);
+ dmu_tx_t *tx;
+ uint64_t mode, atime[2], mtime[2], ctime[2];
+ sa_bulk_attr_t bulk[4];
+ int error = 0;
+ int cnt = 0;
+
+ if (zfs_is_readonly(zfsvfs) || dmu_objset_is_snapshot(zfsvfs->z_os))
+ return (0);
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+#ifdef I_DIRTY_TIME
+ /*
+ * This is the lazytime semantic introduced in Linux 4.0.
+ * This flag is only passed from update_time() when lazytime is set.
+ * (Note, I_DIRTY_SYNC will also be set if lazytime is not enabled.)
+ * Fortunately mtime and ctime are managed within ZFS itself, so we
+ * only need to dirty atime.
+ */
+ if (flags == I_DIRTY_TIME) {
+ zp->z_atime_dirty = B_TRUE;
+ goto out;
+ }
+#endif
+
+ tx = dmu_tx_create(zfsvfs->z_os);
+
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+ zfs_sa_upgrade_txholds(tx, zp);
+
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ dmu_tx_abort(tx);
+ goto out;
+ }
+
+ mutex_enter(&zp->z_lock);
+ zp->z_atime_dirty = B_FALSE;
+
+ SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8);
+ SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16);
+ SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
+ SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
+
+ /* Preserve the mode, mtime and ctime provided by the inode */
+ ZFS_TIME_ENCODE(&ip->i_atime, atime);
+ ZFS_TIME_ENCODE(&ip->i_mtime, mtime);
+ ZFS_TIME_ENCODE(&ip->i_ctime, ctime);
+ mode = ip->i_mode;
+
+ zp->z_mode = mode;
+
+ error = sa_bulk_update(zp->z_sa_hdl, bulk, cnt, tx);
+ mutex_exit(&zp->z_lock);
+
+ dmu_tx_commit(tx);
+out:
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
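+
+/*
+ * Illustrative only: a hypothetical super_operations ->dirty_inode hook
+ * would simply forward to the function above, e.g.
+ *
+ *	static void example_dirty_inode(struct inode *ip, int flags)
+ *	{
+ *		(void) zfs_dirty_inode(ip, flags);
+ *	}
+ */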
+
+/*ARGSUSED*/
+void
+zfs_inactive(struct inode *ip)
+{
+ znode_t *zp = ITOZ(ip);
+ zfsvfs_t *zfsvfs = ITOZSB(ip);
+ uint64_t atime[2];
+ int error;
+ int need_unlock = 0;
+
+ /* Only read lock if we haven't already write locked, e.g. rollback */
+ if (!RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock)) {
+ need_unlock = 1;
+ rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
+ }
+ if (zp->z_sa_hdl == NULL) {
+ if (need_unlock)
+ rw_exit(&zfsvfs->z_teardown_inactive_lock);
+ return;
+ }
+
+ if (zp->z_atime_dirty && zp->z_unlinked == B_FALSE) {
+ dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
+
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+ zfs_sa_upgrade_txholds(tx, zp);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ dmu_tx_abort(tx);
+ } else {
+ ZFS_TIME_ENCODE(&ip->i_atime, atime);
+ mutex_enter(&zp->z_lock);
+ (void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs),
+ (void *)&atime, sizeof (atime), tx);
+ zp->z_atime_dirty = B_FALSE;
+ mutex_exit(&zp->z_lock);
+ dmu_tx_commit(tx);
+ }
+ }
+
+ zfs_zinactive(zp);
+ if (need_unlock)
+ rw_exit(&zfsvfs->z_teardown_inactive_lock);
+}
+
+/*
+ * Fill pages with data from the disk.
+ */
+static int
+zfs_fillpage(struct inode *ip, struct page *pl[], int nr_pages)
+{
+ znode_t *zp = ITOZ(ip);
+ zfsvfs_t *zfsvfs = ITOZSB(ip);
+ objset_t *os;
+ struct page *cur_pp;
+ u_offset_t io_off, total;
+ size_t io_len;
+ loff_t i_size;
+ unsigned page_idx;
+ int err;
+
+ os = zfsvfs->z_os;
+ io_len = nr_pages << PAGE_SHIFT;
+ i_size = i_size_read(ip);
+ io_off = page_offset(pl[0]);
+
+ if (io_off + io_len > i_size)
+ io_len = i_size - io_off;
+
+ /*
+ * Iterate over list of pages and read each page individually.
+ */
+ page_idx = 0;
+ for (total = io_off + io_len; io_off < total; io_off += PAGESIZE) {
+ caddr_t va;
+
+ cur_pp = pl[page_idx++];
+ va = kmap(cur_pp);
+ err = dmu_read(os, zp->z_id, io_off, PAGESIZE, va,
+ DMU_READ_PREFETCH);
+ kunmap(cur_pp);
+ if (err) {
+ /* convert checksum errors into IO errors */
+ if (err == ECKSUM)
+ err = SET_ERROR(EIO);
+ return (err);
+ }
+ }
+
+ return (0);
+}
+
+/*
+ * Uses zfs_fillpage to read data from the file and fill the pages.
+ *
+ * IN: ip - inode of file to get data from.
+ * pl - list of pages to read
+ * nr_pages - number of pages to read
+ *
+ * RETURN: 0 on success, error code on failure.
+ *
+ * Timestamps:
+ * vp - atime updated
+ */
+/* ARGSUSED */
+int
+zfs_getpage(struct inode *ip, struct page *pl[], int nr_pages)
+{
+ znode_t *zp = ITOZ(ip);
+ zfsvfs_t *zfsvfs = ITOZSB(ip);
+ int err;
+
+ if (pl == NULL)
+ return (0);
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ err = zfs_fillpage(ip, pl, nr_pages);
+
+ ZFS_EXIT(zfsvfs);
+ return (err);
+}
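+
+/*
+ * Illustrative only: a hypothetical ->readpage style wrapper showing how a
+ * single page can be handed to zfs_getpage() as a one-entry array.
+ *
+ *	static int example_readpage(struct inode *ip, struct page *pp)
+ *	{
+ *		struct page *pl[1] = { pp };
+ *
+ *		return (zfs_getpage(ip, pl, 1));
+ *	}
+ */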
+
+/*
+ * Check ZFS specific permissions to memory map a section of a file.
+ *
+ * IN: ip - inode of the file to mmap
+ * off - file offset
+ * addrp - start address in memory region
+ * len - length of memory region
+ * vm_flags- address flags
+ *
+ * RETURN: 0 if success
+ * error code if failure
+ */
+/*ARGSUSED*/
+int
+zfs_map(struct inode *ip, offset_t off, caddr_t *addrp, size_t len,
+ unsigned long vm_flags)
+{
+ znode_t *zp = ITOZ(ip);
+ zfsvfs_t *zfsvfs = ITOZSB(ip);
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ if ((vm_flags & VM_WRITE) && (zp->z_pflags &
+ (ZFS_IMMUTABLE | ZFS_READONLY | ZFS_APPENDONLY))) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EPERM));
+ }
+
+ if ((vm_flags & (VM_READ | VM_EXEC)) &&
+ (zp->z_pflags & ZFS_AV_QUARANTINED)) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EACCES));
+ }
+
+ if (off < 0 || len > MAXOFFSET_T - off) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(ENXIO));
+ }
+
+ ZFS_EXIT(zfsvfs);
+ return (0);
+}
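+
+/*
+ * Illustrative only: a hypothetical mmap hook would translate the VMA into
+ * the (offset, address, length, flags) tuple expected by zfs_map() before
+ * falling through to the generic mmap path, e.g.
+ *
+ *	caddr_t addr = (caddr_t)vma->vm_start;
+ *	error = zfs_map(ip, vma->vm_pgoff << PAGE_SHIFT, &addr,
+ *	    vma->vm_end - vma->vm_start, vma->vm_flags);
+ */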
+
+/*
+ * Free or allocate space in a file. Currently, this function only
+ * supports the `F_FREESP' command. However, this command is somewhat
+ * misnamed, as its functionality includes the ability to allocate as
+ * well as free space.
+ *
+ * IN: zp - znode of file to free data in.
+ * cmd - action to take (only F_FREESP supported).
+ * bfp - section of file to free/alloc.
+ * flag - current file open mode flags.
+ * offset - current file offset.
+ * cr - credentials of caller.
+ *
+ * RETURN: 0 on success, error code on failure.
+ *
+ * Timestamps:
+ * zp - ctime|mtime updated
+ */
+/* ARGSUSED */
+int
+zfs_space(znode_t *zp, int cmd, flock64_t *bfp, int flag,
+ offset_t offset, cred_t *cr)
+{
+ zfsvfs_t *zfsvfs = ZTOZSB(zp);
+ uint64_t off, len;
+ int error;
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ if (cmd != F_FREESP) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EINVAL));
+ }
+
+ /*
+ * Callers might not be able to detect properly that we are read-only,
+ * so check it explicitly here.
+ */
+ if (zfs_is_readonly(zfsvfs)) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EROFS));
+ }
+
+ if (bfp->l_len < 0) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EINVAL));
+ }
+
+ /*
+ * Permissions aren't checked on Solaris because on this OS
+ * zfs_space() can only be called with an opened file handle.
+ * On Linux we can get here through truncate_range() which
+ * operates directly on inodes, so we need to check access rights.
+ */
+ if ((error = zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr))) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ off = bfp->l_start;
+ len = bfp->l_len; /* 0 means from off to end of file */
+
+ error = zfs_freesp(zp, off, len, flag, TRUE);
+
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+/*ARGSUSED*/
+int
+zfs_fid(struct inode *ip, fid_t *fidp)
+{
+ znode_t *zp = ITOZ(ip);
+ zfsvfs_t *zfsvfs = ITOZSB(ip);
+ uint32_t gen;
+ uint64_t gen64;
+ uint64_t object = zp->z_id;
+ zfid_short_t *zfid;
+ int size, i, error;
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs),
+ &gen64, sizeof (uint64_t))) != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ gen = (uint32_t)gen64;
+
+ size = SHORT_FID_LEN;
+
+ zfid = (zfid_short_t *)fidp;
+
+ zfid->zf_len = size;
+
+ for (i = 0; i < sizeof (zfid->zf_object); i++)
+ zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
+
+ /* Must have a non-zero generation number to distinguish from .zfs */
+ if (gen == 0)
+ gen = 1;
+ for (i = 0; i < sizeof (zfid->zf_gen); i++)
+ zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i));
+
+ ZFS_EXIT(zfsvfs);
+ return (0);
+}
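+
+/*
+ * Worked example of the encoding above (illustrative): for object number
+ * 0x1234 and generation 5 the loops store the values little-endian, so
+ * zf_object[] begins {0x34, 0x12, 0x00, ...} and zf_gen[] begins
+ * {0x05, 0x00, ...}; a generation of 0 would have been bumped to 1 first.
+ */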
+
+#if defined(_KERNEL)
+EXPORT_SYMBOL(zfs_open);
+EXPORT_SYMBOL(zfs_close);
+EXPORT_SYMBOL(zfs_lookup);
+EXPORT_SYMBOL(zfs_create);
+EXPORT_SYMBOL(zfs_tmpfile);
+EXPORT_SYMBOL(zfs_remove);
+EXPORT_SYMBOL(zfs_mkdir);
+EXPORT_SYMBOL(zfs_rmdir);
+EXPORT_SYMBOL(zfs_readdir);
+EXPORT_SYMBOL(zfs_getattr_fast);
+EXPORT_SYMBOL(zfs_setattr);
+EXPORT_SYMBOL(zfs_rename);
+EXPORT_SYMBOL(zfs_symlink);
+EXPORT_SYMBOL(zfs_readlink);
+EXPORT_SYMBOL(zfs_link);
+EXPORT_SYMBOL(zfs_inactive);
+EXPORT_SYMBOL(zfs_space);
+EXPORT_SYMBOL(zfs_fid);
+EXPORT_SYMBOL(zfs_getpage);
+EXPORT_SYMBOL(zfs_putpage);
+EXPORT_SYMBOL(zfs_dirty_inode);
+EXPORT_SYMBOL(zfs_map);
+
+/* BEGIN CSTYLED */
+module_param(zfs_delete_blocks, ulong, 0644);
+MODULE_PARM_DESC(zfs_delete_blocks, "Delete files larger than N blocks async");
+/* END CSTYLED */
+
+#endif
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_znode.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_znode.c
new file mode 100644
index 000000000000..d59c1bb0716a
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_znode.c
@@ -0,0 +1,2244 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
+ */
+
+/* Portions Copyright 2007 Jeremy Teo */
+
+#ifdef _KERNEL
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/time.h>
+#include <sys/sysmacros.h>
+#include <sys/mntent.h>
+#include <sys/u8_textprep.h>
+#include <sys/dsl_dataset.h>
+#include <sys/vfs.h>
+#include <sys/vnode.h>
+#include <sys/file.h>
+#include <sys/kmem.h>
+#include <sys/errno.h>
+#include <sys/atomic.h>
+#include <sys/zfs_dir.h>
+#include <sys/zfs_acl.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zfs_rlock.h>
+#include <sys/zfs_fuid.h>
+#include <sys/zfs_vnops.h>
+#include <sys/zfs_ctldir.h>
+#include <sys/dnode.h>
+#include <sys/fs/zfs.h>
+#include <sys/zpl.h>
+#endif /* _KERNEL */
+
+#include <sys/dmu.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_tx.h>
+#include <sys/zfs_refcount.h>
+#include <sys/stat.h>
+#include <sys/zap.h>
+#include <sys/zfs_znode.h>
+#include <sys/sa.h>
+#include <sys/zfs_sa.h>
+#include <sys/zfs_stat.h>
+
+#include "zfs_prop.h"
+#include "zfs_comutil.h"
+
+/*
+ * Functions needed for userland (i.e. libzpool) are not put under
+ * #ifdef _KERNEL; the rest of the functions have dependencies
+ * (such as VFS logic) that will not compile easily in userland.
+ */
+#ifdef _KERNEL
+
+static kmem_cache_t *znode_cache = NULL;
+static kmem_cache_t *znode_hold_cache = NULL;
+unsigned int zfs_object_mutex_size = ZFS_OBJ_MTX_SZ;
+
+/*
+ * This is used by the test suite so that it can delay znodes from being
+ * freed in order to inspect the unlinked set.
+ */
+int zfs_unlink_suspend_progress = 0;
+
+/*
+ * This callback is invoked when acquiring a RL_WRITER or RL_APPEND lock on
+ * z_rangelock. It will modify the offset and length of the lock to reflect
+ * znode-specific information, and convert RL_APPEND to RL_WRITER. This is
+ * called with the rangelock_t's rl_lock held, which avoids races.
+ */
+static void
+zfs_rangelock_cb(zfs_locked_range_t *new, void *arg)
+{
+ znode_t *zp = arg;
+
+ /*
+ * If in append mode, convert to writer and lock starting at the
+ * current end of file.
+ */
+ if (new->lr_type == RL_APPEND) {
+ new->lr_offset = zp->z_size;
+ new->lr_type = RL_WRITER;
+ }
+
+ /*
+ * If we need to grow the block size then lock the whole file range.
+ */
+ uint64_t end_size = MAX(zp->z_size, new->lr_offset + new->lr_length);
+ if (end_size > zp->z_blksz && (!ISP2(zp->z_blksz) ||
+ zp->z_blksz < ZTOZSB(zp)->z_max_blksz)) {
+ new->lr_offset = 0;
+ new->lr_length = UINT64_MAX;
+ }
+}
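+
+/*
+ * Example of the callback above (illustrative): an RL_APPEND request on a
+ * file whose z_size is 4096 becomes an RL_WRITER lock starting at offset
+ * 4096; if the resulting end-of-range would also require growing the block
+ * size (e.g. the file currently uses a single 512-byte block), the lock is
+ * widened to cover the entire file (offset 0, length UINT64_MAX).
+ */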
+
+/*ARGSUSED*/
+static int
+zfs_znode_cache_constructor(void *buf, void *arg, int kmflags)
+{
+ znode_t *zp = buf;
+
+ inode_init_once(ZTOI(zp));
+ list_link_init(&zp->z_link_node);
+
+ mutex_init(&zp->z_lock, NULL, MUTEX_DEFAULT, NULL);
+ rw_init(&zp->z_parent_lock, NULL, RW_DEFAULT, NULL);
+ rw_init(&zp->z_name_lock, NULL, RW_NOLOCKDEP, NULL);
+ mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL);
+ rw_init(&zp->z_xattr_lock, NULL, RW_DEFAULT, NULL);
+
+ zfs_rangelock_init(&zp->z_rangelock, zfs_rangelock_cb, zp);
+
+ zp->z_dirlocks = NULL;
+ zp->z_acl_cached = NULL;
+ zp->z_xattr_cached = NULL;
+ zp->z_xattr_parent = 0;
+ return (0);
+}
+
+/*ARGSUSED*/
+static void
+zfs_znode_cache_destructor(void *buf, void *arg)
+{
+ znode_t *zp = buf;
+
+ ASSERT(!list_link_active(&zp->z_link_node));
+ mutex_destroy(&zp->z_lock);
+ rw_destroy(&zp->z_parent_lock);
+ rw_destroy(&zp->z_name_lock);
+ mutex_destroy(&zp->z_acl_lock);
+ rw_destroy(&zp->z_xattr_lock);
+ zfs_rangelock_fini(&zp->z_rangelock);
+
+ ASSERT(zp->z_dirlocks == NULL);
+ ASSERT(zp->z_acl_cached == NULL);
+ ASSERT(zp->z_xattr_cached == NULL);
+}
+
+static int
+zfs_znode_hold_cache_constructor(void *buf, void *arg, int kmflags)
+{
+ znode_hold_t *zh = buf;
+
+ mutex_init(&zh->zh_lock, NULL, MUTEX_DEFAULT, NULL);
+ zfs_refcount_create(&zh->zh_refcount);
+ zh->zh_obj = ZFS_NO_OBJECT;
+
+ return (0);
+}
+
+static void
+zfs_znode_hold_cache_destructor(void *buf, void *arg)
+{
+ znode_hold_t *zh = buf;
+
+ mutex_destroy(&zh->zh_lock);
+ zfs_refcount_destroy(&zh->zh_refcount);
+}
+
+void
+zfs_znode_init(void)
+{
+ /*
+ * Initialize zcache. The KMC_SLAB hint is used so that the cache is
+ * backed by kmalloc() when on the Linux slab, which ensures that any
+ * wait_on_bit() operations on the related inode operate properly.
+ */
+ ASSERT(znode_cache == NULL);
+ znode_cache = kmem_cache_create("zfs_znode_cache",
+ sizeof (znode_t), 0, zfs_znode_cache_constructor,
+ zfs_znode_cache_destructor, NULL, NULL, NULL, KMC_SLAB);
+
+ ASSERT(znode_hold_cache == NULL);
+ znode_hold_cache = kmem_cache_create("zfs_znode_hold_cache",
+ sizeof (znode_hold_t), 0, zfs_znode_hold_cache_constructor,
+ zfs_znode_hold_cache_destructor, NULL, NULL, NULL, 0);
+}
+
+void
+zfs_znode_fini(void)
+{
+ /*
+ * Cleanup zcache
+ */
+ if (znode_cache)
+ kmem_cache_destroy(znode_cache);
+ znode_cache = NULL;
+
+ if (znode_hold_cache)
+ kmem_cache_destroy(znode_hold_cache);
+ znode_hold_cache = NULL;
+}
+
+/*
+ * The zfs_znode_hold_enter() / zfs_znode_hold_exit() functions are used to
+ * serialize access to a znode and its SA buffer while the object is being
+ * created or destroyed. This kind of locking would normally reside in the
+ * znode itself but in this case that's impossible because the znode and SA
+ * buffer may not yet exist. Therefore the locking is handled externally
+ * with an array of mutexes and AVL trees which contain per-object locks.
+ *
+ * In zfs_znode_hold_enter() a per-object lock is created as needed, inserted
+ * into the correct AVL tree and finally the per-object lock is held. In
+ * zfs_znode_hold_exit() the process is reversed. The per-object lock is
+ * released, removed from the AVL tree and destroyed if there are no waiters.
+ *
+ * This scheme has two important properties:
+ *
+ * 1) No memory allocations are performed while holding one of the z_hold_locks.
+ * This ensures evict(), which can be called from direct memory reclaim, will
+ * never block waiting on a z_hold_locks which just happens to have hashed
+ * to the same index.
+ *
+ * 2) All locks used to serialize access to an object are per-object and never
+ * shared. This minimizes lock contention without creating a large number
+ * of dedicated locks.
+ *
+ * On the downside it does require znode_hold_t structures to be frequently
+ * allocated and freed. However, because these are backed by a kmem cache
+ * and are very short lived, this cost is minimal.
+ */
+int
+zfs_znode_hold_compare(const void *a, const void *b)
+{
+ const znode_hold_t *zh_a = (const znode_hold_t *)a;
+ const znode_hold_t *zh_b = (const znode_hold_t *)b;
+
+ return (TREE_CMP(zh_a->zh_obj, zh_b->zh_obj));
+}
+
+static boolean_t __maybe_unused
+zfs_znode_held(zfsvfs_t *zfsvfs, uint64_t obj)
+{
+ znode_hold_t *zh, search;
+ int i = ZFS_OBJ_HASH(zfsvfs, obj);
+ boolean_t held;
+
+ search.zh_obj = obj;
+
+ mutex_enter(&zfsvfs->z_hold_locks[i]);
+ zh = avl_find(&zfsvfs->z_hold_trees[i], &search, NULL);
+ held = (zh && MUTEX_HELD(&zh->zh_lock)) ? B_TRUE : B_FALSE;
+ mutex_exit(&zfsvfs->z_hold_locks[i]);
+
+ return (held);
+}
+
+static znode_hold_t *
+zfs_znode_hold_enter(zfsvfs_t *zfsvfs, uint64_t obj)
+{
+ znode_hold_t *zh, *zh_new, search;
+ int i = ZFS_OBJ_HASH(zfsvfs, obj);
+ boolean_t found = B_FALSE;
+
+ zh_new = kmem_cache_alloc(znode_hold_cache, KM_SLEEP);
+ zh_new->zh_obj = obj;
+ search.zh_obj = obj;
+
+ mutex_enter(&zfsvfs->z_hold_locks[i]);
+ zh = avl_find(&zfsvfs->z_hold_trees[i], &search, NULL);
+ if (likely(zh == NULL)) {
+ zh = zh_new;
+ avl_add(&zfsvfs->z_hold_trees[i], zh);
+ } else {
+ ASSERT3U(zh->zh_obj, ==, obj);
+ found = B_TRUE;
+ }
+ zfs_refcount_add(&zh->zh_refcount, NULL);
+ mutex_exit(&zfsvfs->z_hold_locks[i]);
+
+ if (found == B_TRUE)
+ kmem_cache_free(znode_hold_cache, zh_new);
+
+ ASSERT(MUTEX_NOT_HELD(&zh->zh_lock));
+ ASSERT3S(zfs_refcount_count(&zh->zh_refcount), >, 0);
+ mutex_enter(&zh->zh_lock);
+
+ return (zh);
+}
+
+static void
+zfs_znode_hold_exit(zfsvfs_t *zfsvfs, znode_hold_t *zh)
+{
+ int i = ZFS_OBJ_HASH(zfsvfs, zh->zh_obj);
+ boolean_t remove = B_FALSE;
+
+ ASSERT(zfs_znode_held(zfsvfs, zh->zh_obj));
+ ASSERT3S(zfs_refcount_count(&zh->zh_refcount), >, 0);
+ mutex_exit(&zh->zh_lock);
+
+ mutex_enter(&zfsvfs->z_hold_locks[i]);
+ if (zfs_refcount_remove(&zh->zh_refcount, NULL) == 0) {
+ avl_remove(&zfsvfs->z_hold_trees[i], zh);
+ remove = B_TRUE;
+ }
+ mutex_exit(&zfsvfs->z_hold_locks[i]);
+
+ if (remove == B_TRUE)
+ kmem_cache_free(znode_hold_cache, zh);
+}
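+
+/*
+ * Typical usage of the hold functions above (a sketch, mirroring the
+ * pattern used by zfs_mknode() and zfs_zget() below):
+ *
+ *	znode_hold_t *zh = zfs_znode_hold_enter(zfsvfs, obj);
+ *	... create, look up, or tear down the znode and its SA handle ...
+ *	zfs_znode_hold_exit(zfsvfs, zh);
+ */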
+
+dev_t
+zfs_cmpldev(uint64_t dev)
+{
+ return (dev);
+}
+
+static void
+zfs_znode_sa_init(zfsvfs_t *zfsvfs, znode_t *zp,
+ dmu_buf_t *db, dmu_object_type_t obj_type, sa_handle_t *sa_hdl)
+{
+ ASSERT(zfs_znode_held(zfsvfs, zp->z_id));
+
+ mutex_enter(&zp->z_lock);
+
+ ASSERT(zp->z_sa_hdl == NULL);
+ ASSERT(zp->z_acl_cached == NULL);
+ if (sa_hdl == NULL) {
+ VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, zp,
+ SA_HDL_SHARED, &zp->z_sa_hdl));
+ } else {
+ zp->z_sa_hdl = sa_hdl;
+ sa_set_userp(sa_hdl, zp);
+ }
+
+ zp->z_is_sa = (obj_type == DMU_OT_SA) ? B_TRUE : B_FALSE;
+
+ mutex_exit(&zp->z_lock);
+}
+
+void
+zfs_znode_dmu_fini(znode_t *zp)
+{
+ ASSERT(zfs_znode_held(ZTOZSB(zp), zp->z_id) || zp->z_unlinked ||
+ RW_WRITE_HELD(&ZTOZSB(zp)->z_teardown_inactive_lock));
+
+ sa_handle_destroy(zp->z_sa_hdl);
+ zp->z_sa_hdl = NULL;
+}
+
+/*
+ * Called by new_inode() to allocate a new inode.
+ */
+int
+zfs_inode_alloc(struct super_block *sb, struct inode **ip)
+{
+ znode_t *zp;
+
+ zp = kmem_cache_alloc(znode_cache, KM_SLEEP);
+ *ip = ZTOI(zp);
+
+ return (0);
+}
+
+/*
+ * Called in multiple places when an inode should be destroyed.
+ */
+void
+zfs_inode_destroy(struct inode *ip)
+{
+ znode_t *zp = ITOZ(ip);
+ zfsvfs_t *zfsvfs = ZTOZSB(zp);
+
+ mutex_enter(&zfsvfs->z_znodes_lock);
+ if (list_link_active(&zp->z_link_node)) {
+ list_remove(&zfsvfs->z_all_znodes, zp);
+ zfsvfs->z_nr_znodes--;
+ }
+ mutex_exit(&zfsvfs->z_znodes_lock);
+
+ if (zp->z_acl_cached) {
+ zfs_acl_free(zp->z_acl_cached);
+ zp->z_acl_cached = NULL;
+ }
+
+ if (zp->z_xattr_cached) {
+ nvlist_free(zp->z_xattr_cached);
+ zp->z_xattr_cached = NULL;
+ }
+
+ kmem_cache_free(znode_cache, zp);
+}
+
+static void
+zfs_inode_set_ops(zfsvfs_t *zfsvfs, struct inode *ip)
+{
+ uint64_t rdev = 0;
+
+ switch (ip->i_mode & S_IFMT) {
+ case S_IFREG:
+ ip->i_op = &zpl_inode_operations;
+ ip->i_fop = &zpl_file_operations;
+ ip->i_mapping->a_ops = &zpl_address_space_operations;
+ break;
+
+ case S_IFDIR:
+ ip->i_op = &zpl_dir_inode_operations;
+ ip->i_fop = &zpl_dir_file_operations;
+ ITOZ(ip)->z_zn_prefetch = B_TRUE;
+ break;
+
+ case S_IFLNK:
+ ip->i_op = &zpl_symlink_inode_operations;
+ break;
+
+ /*
+ * rdev is only stored in a SA for device files.
+ */
+ case S_IFCHR:
+ case S_IFBLK:
+ (void) sa_lookup(ITOZ(ip)->z_sa_hdl, SA_ZPL_RDEV(zfsvfs), &rdev,
+ sizeof (rdev));
+ /*FALLTHROUGH*/
+ case S_IFIFO:
+ case S_IFSOCK:
+ init_special_inode(ip, ip->i_mode, rdev);
+ ip->i_op = &zpl_special_inode_operations;
+ break;
+
+ default:
+ zfs_panic_recover("inode %llu has invalid mode: 0x%x\n",
+ (u_longlong_t)ip->i_ino, ip->i_mode);
+
+ /* Assume the inode is a file and attempt to continue */
+ ip->i_mode = S_IFREG | 0644;
+ ip->i_op = &zpl_inode_operations;
+ ip->i_fop = &zpl_file_operations;
+ ip->i_mapping->a_ops = &zpl_address_space_operations;
+ break;
+ }
+}
+
+static void
+zfs_set_inode_flags(znode_t *zp, struct inode *ip)
+{
+ /*
+ * Linux and Solaris have different sets of file attributes, so we
+ * restrict this conversion to the intersection of the two.
+ */
+#ifdef HAVE_INODE_SET_FLAGS
+ unsigned int flags = 0;
+ if (zp->z_pflags & ZFS_IMMUTABLE)
+ flags |= S_IMMUTABLE;
+ if (zp->z_pflags & ZFS_APPENDONLY)
+ flags |= S_APPEND;
+
+ inode_set_flags(ip, flags, S_IMMUTABLE|S_APPEND);
+#else
+ if (zp->z_pflags & ZFS_IMMUTABLE)
+ ip->i_flags |= S_IMMUTABLE;
+ else
+ ip->i_flags &= ~S_IMMUTABLE;
+
+ if (zp->z_pflags & ZFS_APPENDONLY)
+ ip->i_flags |= S_APPEND;
+ else
+ ip->i_flags &= ~S_APPEND;
+#endif
+}
+
+/*
+ * Update the embedded inode given the znode.
+ */
+void
+zfs_znode_update_vfs(znode_t *zp)
+{
+ zfsvfs_t *zfsvfs;
+ struct inode *ip;
+ uint32_t blksize;
+ u_longlong_t i_blocks;
+
+ ASSERT(zp != NULL);
+ zfsvfs = ZTOZSB(zp);
+ ip = ZTOI(zp);
+
+ /* Skip .zfs control nodes which do not exist on disk. */
+ if (zfsctl_is_node(ip))
+ return;
+
+ dmu_object_size_from_db(sa_get_db(zp->z_sa_hdl), &blksize, &i_blocks);
+
+ spin_lock(&ip->i_lock);
+ ip->i_mode = zp->z_mode;
+ ip->i_blocks = i_blocks;
+ i_size_write(ip, zp->z_size);
+ spin_unlock(&ip->i_lock);
+}
+
+
+/*
+ * Construct a znode+inode and initialize.
+ *
+ * This does not call dmu_set_user(); that is
+ * up to the caller to do, in case you don't want to
+ * return the znode.
+ */
+static znode_t *
+zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz,
+ dmu_object_type_t obj_type, sa_handle_t *hdl)
+{
+ znode_t *zp;
+ struct inode *ip;
+ uint64_t mode;
+ uint64_t parent;
+ uint64_t tmp_gen;
+ uint64_t links;
+ uint64_t z_uid, z_gid;
+ uint64_t atime[2], mtime[2], ctime[2];
+ uint64_t projid = ZFS_DEFAULT_PROJID;
+ sa_bulk_attr_t bulk[11];
+ int count = 0;
+
+ ASSERT(zfsvfs != NULL);
+
+ ip = new_inode(zfsvfs->z_sb);
+ if (ip == NULL)
+ return (NULL);
+
+ zp = ITOZ(ip);
+ ASSERT(zp->z_dirlocks == NULL);
+ ASSERT3P(zp->z_acl_cached, ==, NULL);
+ ASSERT3P(zp->z_xattr_cached, ==, NULL);
+ zp->z_unlinked = B_FALSE;
+ zp->z_atime_dirty = B_FALSE;
+ zp->z_is_mapped = B_FALSE;
+ zp->z_is_ctldir = B_FALSE;
+ zp->z_is_stale = B_FALSE;
+ zp->z_suspended = B_FALSE;
+ zp->z_sa_hdl = NULL;
+ zp->z_mapcnt = 0;
+ zp->z_id = db->db_object;
+ zp->z_blksz = blksz;
+ zp->z_seq = 0x7A4653;
+ zp->z_sync_cnt = 0;
+
+ zfs_znode_sa_init(zfsvfs, zp, db, obj_type, hdl);
+
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL, &tmp_gen, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
+ &zp->z_size, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, &links, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+ &zp->z_pflags, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL,
+ &parent, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, &z_uid, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, &z_gid, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
+
+ if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count) != 0 || tmp_gen == 0 ||
+ (dmu_objset_projectquota_enabled(zfsvfs->z_os) &&
+ (zp->z_pflags & ZFS_PROJID) &&
+ sa_lookup(zp->z_sa_hdl, SA_ZPL_PROJID(zfsvfs), &projid, 8) != 0)) {
+ if (hdl == NULL)
+ sa_handle_destroy(zp->z_sa_hdl);
+ zp->z_sa_hdl = NULL;
+ goto error;
+ }
+
+ zp->z_projid = projid;
+ zp->z_mode = ip->i_mode = mode;
+ ip->i_generation = (uint32_t)tmp_gen;
+ ip->i_blkbits = SPA_MINBLOCKSHIFT;
+ set_nlink(ip, (uint32_t)links);
+ zfs_uid_write(ip, z_uid);
+ zfs_gid_write(ip, z_gid);
+ zfs_set_inode_flags(zp, ip);
+
+ /* Cache the xattr parent id */
+ if (zp->z_pflags & ZFS_XATTR)
+ zp->z_xattr_parent = parent;
+
+ ZFS_TIME_DECODE(&ip->i_atime, atime);
+ ZFS_TIME_DECODE(&ip->i_mtime, mtime);
+ ZFS_TIME_DECODE(&ip->i_ctime, ctime);
+
+ ip->i_ino = zp->z_id;
+ zfs_znode_update_vfs(zp);
+ zfs_inode_set_ops(zfsvfs, ip);
+
+ /*
+ * The only way insert_inode_locked() can fail is if the ip->i_ino
+ * number is already hashed for this super block. This can never
+ * happen because the inode numbers map 1:1 with the object numbers.
+ *
+ * The one exception is rolling back a mounted file system, but in
+ * this case all of the active inodes are unhashed during the rollback.
+ */
+ VERIFY3S(insert_inode_locked(ip), ==, 0);
+
+ mutex_enter(&zfsvfs->z_znodes_lock);
+ list_insert_tail(&zfsvfs->z_all_znodes, zp);
+ zfsvfs->z_nr_znodes++;
+ mutex_exit(&zfsvfs->z_znodes_lock);
+
+ unlock_new_inode(ip);
+ return (zp);
+
+error:
+ iput(ip);
+ return (NULL);
+}
+
+/*
+ * Safely mark an inode dirty. Inodes which are part of a read-only
+ * file system or snapshot may not be dirtied.
+ */
+void
+zfs_mark_inode_dirty(struct inode *ip)
+{
+ zfsvfs_t *zfsvfs = ITOZSB(ip);
+
+ if (zfs_is_readonly(zfsvfs) || dmu_objset_is_snapshot(zfsvfs->z_os))
+ return;
+
+ mark_inode_dirty(ip);
+}
+
+static uint64_t empty_xattr;
+static uint64_t pad[4];
+static zfs_acl_phys_t acl_phys;
+/*
+ * Create a new DMU object to hold a zfs znode.
+ *
+ * IN: dzp - parent directory for new znode
+ * vap - file attributes for new znode
+ * tx - dmu transaction id for zap operations
+ * cr - credentials of caller
+ * flag - flags:
+ * IS_ROOT_NODE - new object will be root
+ * IS_TMPFILE - new object is of O_TMPFILE
+ * IS_XATTR - new object is an attribute
+ * acl_ids - ACL related attributes
+ *
+ * OUT: zpp - allocated znode (set to dzp if IS_ROOT_NODE)
+ *
+ */
+void
+zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
+ uint_t flag, znode_t **zpp, zfs_acl_ids_t *acl_ids)
+{
+ uint64_t crtime[2], atime[2], mtime[2], ctime[2];
+ uint64_t mode, size, links, parent, pflags;
+ uint64_t projid = ZFS_DEFAULT_PROJID;
+ uint64_t rdev = 0;
+ zfsvfs_t *zfsvfs = ZTOZSB(dzp);
+ dmu_buf_t *db;
+ inode_timespec_t now;
+ uint64_t gen, obj;
+ int bonuslen;
+ int dnodesize;
+ sa_handle_t *sa_hdl;
+ dmu_object_type_t obj_type;
+ sa_bulk_attr_t *sa_attrs;
+ int cnt = 0;
+ zfs_acl_locator_cb_t locate = { 0 };
+ znode_hold_t *zh;
+
+ if (zfsvfs->z_replay) {
+ obj = vap->va_nodeid;
+ now = vap->va_ctime; /* see zfs_replay_create() */
+ gen = vap->va_nblocks; /* ditto */
+ dnodesize = vap->va_fsid; /* ditto */
+ } else {
+ obj = 0;
+ gethrestime(&now);
+ gen = dmu_tx_get_txg(tx);
+ dnodesize = dmu_objset_dnodesize(zfsvfs->z_os);
+ }
+
+ if (dnodesize == 0)
+ dnodesize = DNODE_MIN_SIZE;
+
+ obj_type = zfsvfs->z_use_sa ? DMU_OT_SA : DMU_OT_ZNODE;
+
+ bonuslen = (obj_type == DMU_OT_SA) ?
+ DN_BONUS_SIZE(dnodesize) : ZFS_OLD_ZNODE_PHYS_SIZE;
+
+ /*
+ * Create a new DMU object.
+ *
+ * There's currently no mechanism for pre-reading the blocks that will
+ * be needed to allocate a new object, so we accept the small chance
+ * that there will be an i/o error and we will fail one of the
+ * assertions below.
+ */
+ if (S_ISDIR(vap->va_mode)) {
+ if (zfsvfs->z_replay) {
+ VERIFY0(zap_create_claim_norm_dnsize(zfsvfs->z_os, obj,
+ zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS,
+ obj_type, bonuslen, dnodesize, tx));
+ } else {
+ obj = zap_create_norm_dnsize(zfsvfs->z_os,
+ zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS,
+ obj_type, bonuslen, dnodesize, tx);
+ }
+ } else {
+ if (zfsvfs->z_replay) {
+ VERIFY0(dmu_object_claim_dnsize(zfsvfs->z_os, obj,
+ DMU_OT_PLAIN_FILE_CONTENTS, 0,
+ obj_type, bonuslen, dnodesize, tx));
+ } else {
+ obj = dmu_object_alloc_dnsize(zfsvfs->z_os,
+ DMU_OT_PLAIN_FILE_CONTENTS, 0,
+ obj_type, bonuslen, dnodesize, tx);
+ }
+ }
+
+ zh = zfs_znode_hold_enter(zfsvfs, obj);
+ VERIFY0(sa_buf_hold(zfsvfs->z_os, obj, NULL, &db));
+
+ /*
+ * If this is the root, fix up the half-initialized parent pointer
+ * to reference the just-allocated physical data area.
+ */
+ if (flag & IS_ROOT_NODE) {
+ dzp->z_id = obj;
+ }
+
+ /*
+ * If parent is an xattr, so am I.
+ */
+ if (dzp->z_pflags & ZFS_XATTR) {
+ flag |= IS_XATTR;
+ }
+
+ if (zfsvfs->z_use_fuids)
+ pflags = ZFS_ARCHIVE | ZFS_AV_MODIFIED;
+ else
+ pflags = 0;
+
+ if (S_ISDIR(vap->va_mode)) {
+ size = 2; /* contents ("." and "..") */
+ links = 2;
+ } else {
+ size = 0;
+ links = (flag & IS_TMPFILE) ? 0 : 1;
+ }
+
+ if (S_ISBLK(vap->va_mode) || S_ISCHR(vap->va_mode))
+ rdev = vap->va_rdev;
+
+ parent = dzp->z_id;
+ mode = acl_ids->z_mode;
+ if (flag & IS_XATTR)
+ pflags |= ZFS_XATTR;
+
+ if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode)) {
+ /*
+ * With the ZFS_PROJID flag we can easily tell whether a
+ * project ID is stored on disk. See zfs_space_delta_cb().
+ */
+ if (obj_type != DMU_OT_ZNODE &&
+ dmu_objset_projectquota_enabled(zfsvfs->z_os))
+ pflags |= ZFS_PROJID;
+
+ /*
+ * Inherit project ID from parent if required.
+ */
+ projid = zfs_inherit_projid(dzp);
+ if (dzp->z_pflags & ZFS_PROJINHERIT)
+ pflags |= ZFS_PROJINHERIT;
+ }
+
+ /*
+ * Whether "no execs denied" applies will be determined when
+ * zfs_mode_compute() is called.
+ */
+ pflags |= acl_ids->z_aclp->z_hints &
+ (ZFS_ACL_TRIVIAL|ZFS_INHERIT_ACE|ZFS_ACL_AUTO_INHERIT|
+ ZFS_ACL_DEFAULTED|ZFS_ACL_PROTECTED);
+
+ ZFS_TIME_ENCODE(&now, crtime);
+ ZFS_TIME_ENCODE(&now, ctime);
+
+ if (vap->va_mask & ATTR_ATIME) {
+ ZFS_TIME_ENCODE(&vap->va_atime, atime);
+ } else {
+ ZFS_TIME_ENCODE(&now, atime);
+ }
+
+ if (vap->va_mask & ATTR_MTIME) {
+ ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
+ } else {
+ ZFS_TIME_ENCODE(&now, mtime);
+ }
+
+ /* Now add in all of the "SA" attributes */
+ VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, NULL, SA_HDL_SHARED,
+ &sa_hdl));
+
+ /*
+ * Set up the array of attributes to be replaced/set on the new file.
+ *
+ * The order for DMU_OT_ZNODE is critical since it needs to be constructed
+ * in the old znode_phys_t format. Don't change this ordering.
+ */
+ sa_attrs = kmem_alloc(sizeof (sa_bulk_attr_t) * ZPL_END, KM_SLEEP);
+
+ if (obj_type == DMU_OT_ZNODE) {
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs),
+ NULL, &atime, 16);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs),
+ NULL, &mtime, 16);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs),
+ NULL, &ctime, 16);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs),
+ NULL, &crtime, 16);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs),
+ NULL, &gen, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs),
+ NULL, &mode, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs),
+ NULL, &size, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs),
+ NULL, &parent, 8);
+ } else {
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs),
+ NULL, &mode, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs),
+ NULL, &size, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs),
+ NULL, &gen, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs),
+ NULL, &acl_ids->z_fuid, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs),
+ NULL, &acl_ids->z_fgid, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs),
+ NULL, &parent, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs),
+ NULL, &pflags, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs),
+ NULL, &atime, 16);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs),
+ NULL, &mtime, 16);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs),
+ NULL, &ctime, 16);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs),
+ NULL, &crtime, 16);
+ }
+
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_LINKS(zfsvfs), NULL, &links, 8);
+
+ if (obj_type == DMU_OT_ZNODE) {
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_XATTR(zfsvfs), NULL,
+ &empty_xattr, 8);
+ } else if (dmu_objset_projectquota_enabled(zfsvfs->z_os) &&
+ pflags & ZFS_PROJID) {
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PROJID(zfsvfs),
+ NULL, &projid, 8);
+ }
+ if (obj_type == DMU_OT_ZNODE ||
+ (S_ISBLK(vap->va_mode) || S_ISCHR(vap->va_mode))) {
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_RDEV(zfsvfs),
+ NULL, &rdev, 8);
+ }
+ if (obj_type == DMU_OT_ZNODE) {
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs),
+ NULL, &pflags, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs), NULL,
+ &acl_ids->z_fuid, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs), NULL,
+ &acl_ids->z_fgid, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PAD(zfsvfs), NULL, pad,
+ sizeof (uint64_t) * 4);
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ZNODE_ACL(zfsvfs), NULL,
+ &acl_phys, sizeof (zfs_acl_phys_t));
+ } else if (acl_ids->z_aclp->z_version >= ZFS_ACL_VERSION_FUID) {
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_COUNT(zfsvfs), NULL,
+ &acl_ids->z_aclp->z_acl_count, 8);
+ locate.cb_aclp = acl_ids->z_aclp;
+ SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_ACES(zfsvfs),
+ zfs_acl_data_locator, &locate,
+ acl_ids->z_aclp->z_acl_bytes);
+ mode = zfs_mode_compute(mode, acl_ids->z_aclp, &pflags,
+ acl_ids->z_fuid, acl_ids->z_fgid);
+ }
+
+ VERIFY(sa_replace_all_by_template(sa_hdl, sa_attrs, cnt, tx) == 0);
+
+ if (!(flag & IS_ROOT_NODE)) {
+ /*
+ * The call to zfs_znode_alloc() may fail if memory is low
+ * via the call path: alloc_inode() -> inode_init_always() ->
+ * security_inode_alloc() -> inode_alloc_security(). Since
+ * the existing code is written such that zfs_mknode() cannot
+ * fail, retry until sufficient memory has been reclaimed.
+ */
+ do {
+ *zpp = zfs_znode_alloc(zfsvfs, db, 0, obj_type, sa_hdl);
+ } while (*zpp == NULL);
+
+ VERIFY(*zpp != NULL);
+ VERIFY(dzp != NULL);
+ } else {
+ /*
+ * If we are creating the root node, the "parent" we
+ * passed in is the znode for the root.
+ */
+ *zpp = dzp;
+
+ (*zpp)->z_sa_hdl = sa_hdl;
+ }
+
+ (*zpp)->z_pflags = pflags;
+ (*zpp)->z_mode = ZTOI(*zpp)->i_mode = mode;
+ (*zpp)->z_dnodesize = dnodesize;
+ (*zpp)->z_projid = projid;
+
+ if (obj_type == DMU_OT_ZNODE ||
+ acl_ids->z_aclp->z_version < ZFS_ACL_VERSION_FUID) {
+ VERIFY0(zfs_aclset_common(*zpp, acl_ids->z_aclp, cr, tx));
+ }
+ kmem_free(sa_attrs, sizeof (sa_bulk_attr_t) * ZPL_END);
+ zfs_znode_hold_exit(zfsvfs, zh);
+}
+
+/*
+ * Update in-core attributes. It is assumed the caller will be doing an
+ * sa_bulk_update to push the changes out.
+ */
+void
+zfs_xvattr_set(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx)
+{
+ xoptattr_t *xoap;
+ boolean_t update_inode = B_FALSE;
+
+ xoap = xva_getxoptattr(xvap);
+ ASSERT(xoap);
+
+ if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) {
+ uint64_t times[2];
+ ZFS_TIME_ENCODE(&xoap->xoa_createtime, times);
+ (void) sa_update(zp->z_sa_hdl, SA_ZPL_CRTIME(ZTOZSB(zp)),
+ &times, sizeof (times), tx);
+ XVA_SET_RTN(xvap, XAT_CREATETIME);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
+ ZFS_ATTR_SET(zp, ZFS_READONLY, xoap->xoa_readonly,
+ zp->z_pflags, tx);
+ XVA_SET_RTN(xvap, XAT_READONLY);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
+ ZFS_ATTR_SET(zp, ZFS_HIDDEN, xoap->xoa_hidden,
+ zp->z_pflags, tx);
+ XVA_SET_RTN(xvap, XAT_HIDDEN);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
+ ZFS_ATTR_SET(zp, ZFS_SYSTEM, xoap->xoa_system,
+ zp->z_pflags, tx);
+ XVA_SET_RTN(xvap, XAT_SYSTEM);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
+ ZFS_ATTR_SET(zp, ZFS_ARCHIVE, xoap->xoa_archive,
+ zp->z_pflags, tx);
+ XVA_SET_RTN(xvap, XAT_ARCHIVE);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
+ ZFS_ATTR_SET(zp, ZFS_IMMUTABLE, xoap->xoa_immutable,
+ zp->z_pflags, tx);
+ XVA_SET_RTN(xvap, XAT_IMMUTABLE);
+
+ update_inode = B_TRUE;
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
+ ZFS_ATTR_SET(zp, ZFS_NOUNLINK, xoap->xoa_nounlink,
+ zp->z_pflags, tx);
+ XVA_SET_RTN(xvap, XAT_NOUNLINK);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
+ ZFS_ATTR_SET(zp, ZFS_APPENDONLY, xoap->xoa_appendonly,
+ zp->z_pflags, tx);
+ XVA_SET_RTN(xvap, XAT_APPENDONLY);
+
+ update_inode = B_TRUE;
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
+ ZFS_ATTR_SET(zp, ZFS_NODUMP, xoap->xoa_nodump,
+ zp->z_pflags, tx);
+ XVA_SET_RTN(xvap, XAT_NODUMP);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
+ ZFS_ATTR_SET(zp, ZFS_OPAQUE, xoap->xoa_opaque,
+ zp->z_pflags, tx);
+ XVA_SET_RTN(xvap, XAT_OPAQUE);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
+ ZFS_ATTR_SET(zp, ZFS_AV_QUARANTINED,
+ xoap->xoa_av_quarantined, zp->z_pflags, tx);
+ XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
+ ZFS_ATTR_SET(zp, ZFS_AV_MODIFIED, xoap->xoa_av_modified,
+ zp->z_pflags, tx);
+ XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) {
+ zfs_sa_set_scanstamp(zp, xvap, tx);
+ XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
+ ZFS_ATTR_SET(zp, ZFS_REPARSE, xoap->xoa_reparse,
+ zp->z_pflags, tx);
+ XVA_SET_RTN(xvap, XAT_REPARSE);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) {
+ ZFS_ATTR_SET(zp, ZFS_OFFLINE, xoap->xoa_offline,
+ zp->z_pflags, tx);
+ XVA_SET_RTN(xvap, XAT_OFFLINE);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) {
+ ZFS_ATTR_SET(zp, ZFS_SPARSE, xoap->xoa_sparse,
+ zp->z_pflags, tx);
+ XVA_SET_RTN(xvap, XAT_SPARSE);
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) {
+ ZFS_ATTR_SET(zp, ZFS_PROJINHERIT, xoap->xoa_projinherit,
+ zp->z_pflags, tx);
+ XVA_SET_RTN(xvap, XAT_PROJINHERIT);
+ }
+
+ if (update_inode)
+ zfs_set_inode_flags(zp, ZTOI(zp));
+}
+
+int
+zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp)
+{
+ dmu_object_info_t doi;
+ dmu_buf_t *db;
+ znode_t *zp;
+ znode_hold_t *zh;
+ int err;
+ sa_handle_t *hdl;
+
+ *zpp = NULL;
+
+again:
+ zh = zfs_znode_hold_enter(zfsvfs, obj_num);
+
+ err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db);
+ if (err) {
+ zfs_znode_hold_exit(zfsvfs, zh);
+ return (err);
+ }
+
+ dmu_object_info_from_db(db, &doi);
+ if (doi.doi_bonus_type != DMU_OT_SA &&
+ (doi.doi_bonus_type != DMU_OT_ZNODE ||
+ (doi.doi_bonus_type == DMU_OT_ZNODE &&
+ doi.doi_bonus_size < sizeof (znode_phys_t)))) {
+ sa_buf_rele(db, NULL);
+ zfs_znode_hold_exit(zfsvfs, zh);
+ return (SET_ERROR(EINVAL));
+ }
+
+ hdl = dmu_buf_get_user(db);
+ if (hdl != NULL) {
+ zp = sa_get_userdata(hdl);
+
+ /*
+ * Since "SA" does immediate eviction we
+ * should never find a sa handle that doesn't
+ * know about the znode.
+ */
+
+ ASSERT3P(zp, !=, NULL);
+
+ mutex_enter(&zp->z_lock);
+ ASSERT3U(zp->z_id, ==, obj_num);
+ /*
+ * If zp->z_unlinked is set, the znode is already marked
+ * for deletion and should not be discovered. Check this
+ * after checking igrab() due to fsetxattr() & O_TMPFILE.
+ *
+ * If igrab() returns NULL the VFS has independently
+ * determined the inode should be evicted and has
+ * called iput_final() to start the eviction process.
+ * The SA handle is still valid but because the VFS
+ * requires that the eviction succeed we must drop
+ * our locks and references to allow the eviction to
+ * complete. The zfs_zget() may then be retried.
+ *
+ * This unlikely case could be optimized by registering
+ * a sops->drop_inode() callback. The callback would
+ * need to detect the active SA hold thereby informing
+ * the VFS that this inode should not be evicted.
+ */
+ if (igrab(ZTOI(zp)) == NULL) {
+ if (zp->z_unlinked)
+ err = SET_ERROR(ENOENT);
+ else
+ err = SET_ERROR(EAGAIN);
+ } else {
+ *zpp = zp;
+ err = 0;
+ }
+
+ mutex_exit(&zp->z_lock);
+ sa_buf_rele(db, NULL);
+ zfs_znode_hold_exit(zfsvfs, zh);
+
+ if (err == EAGAIN) {
+ /* inode might need this to finish evict */
+ cond_resched();
+ goto again;
+ }
+ return (err);
+ }
+
+ /*
+ * Not found; create a new znode/vnode, but only if the file exists.
+ *
+ * There is a small window where zfs_vget() could
+ * find this object while a file create is still in
+ * progress. This is checked for in zfs_znode_alloc().
+ *
+ * If zfs_znode_alloc() fails it will drop the hold on the
+ * bonus buffer.
+ */
+ zp = zfs_znode_alloc(zfsvfs, db, doi.doi_data_block_size,
+ doi.doi_bonus_type, NULL);
+ if (zp == NULL) {
+ err = SET_ERROR(ENOENT);
+ } else {
+ *zpp = zp;
+ }
+ zfs_znode_hold_exit(zfsvfs, zh);
+ return (err);
+}
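+
+/*
+ * Sketch of the expected zfs_zget() caller pattern (illustrative):
+ *
+ *	znode_t *zp;
+ *	error = zfs_zget(zfsvfs, obj_num, &zp);
+ *	if (error == 0) {
+ *		... use zp ...
+ *		iput(ZTOI(zp));	 (drop the reference taken above)
+ *	}
+ */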
+
+int
+zfs_rezget(znode_t *zp)
+{
+ zfsvfs_t *zfsvfs = ZTOZSB(zp);
+ dmu_object_info_t doi;
+ dmu_buf_t *db;
+ uint64_t obj_num = zp->z_id;
+ uint64_t mode;
+ uint64_t links;
+ sa_bulk_attr_t bulk[10];
+ int err;
+ int count = 0;
+ uint64_t gen;
+ uint64_t z_uid, z_gid;
+ uint64_t atime[2], mtime[2], ctime[2];
+ uint64_t projid = ZFS_DEFAULT_PROJID;
+ znode_hold_t *zh;
+
+ /*
+ * Skip ctldir znodes, otherwise they will always get invalidated. This
+ * causes odd behaviour for mounted snapdirs. In particular, for
+ * Linux >= 3.18, d_invalidate will detach the mountpoint and prevent
+ * anyone from automounting it again as long as someone is still using
+ * the detached mount.
+ */
+ if (zp->z_is_ctldir)
+ return (0);
+
+ zh = zfs_znode_hold_enter(zfsvfs, obj_num);
+
+ mutex_enter(&zp->z_acl_lock);
+ if (zp->z_acl_cached) {
+ zfs_acl_free(zp->z_acl_cached);
+ zp->z_acl_cached = NULL;
+ }
+ mutex_exit(&zp->z_acl_lock);
+
+ rw_enter(&zp->z_xattr_lock, RW_WRITER);
+ if (zp->z_xattr_cached) {
+ nvlist_free(zp->z_xattr_cached);
+ zp->z_xattr_cached = NULL;
+ }
+ rw_exit(&zp->z_xattr_lock);
+
+ ASSERT(zp->z_sa_hdl == NULL);
+ err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db);
+ if (err) {
+ zfs_znode_hold_exit(zfsvfs, zh);
+ return (err);
+ }
+
+ dmu_object_info_from_db(db, &doi);
+ if (doi.doi_bonus_type != DMU_OT_SA &&
+ (doi.doi_bonus_type != DMU_OT_ZNODE ||
+ (doi.doi_bonus_type == DMU_OT_ZNODE &&
+ doi.doi_bonus_size < sizeof (znode_phys_t)))) {
+ sa_buf_rele(db, NULL);
+ zfs_znode_hold_exit(zfsvfs, zh);
+ return (SET_ERROR(EINVAL));
+ }
+
+ zfs_znode_sa_init(zfsvfs, zp, db, doi.doi_bonus_type, NULL);
+
+ /* reload cached values */
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL,
+ &gen, sizeof (gen));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
+ &zp->z_size, sizeof (zp->z_size));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL,
+ &links, sizeof (links));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+ &zp->z_pflags, sizeof (zp->z_pflags));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
+ &z_uid, sizeof (z_uid));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL,
+ &z_gid, sizeof (z_gid));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
+ &mode, sizeof (mode));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
+ &atime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
+ &mtime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
+ &ctime, 16);
+
+ if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) {
+ zfs_znode_dmu_fini(zp);
+ zfs_znode_hold_exit(zfsvfs, zh);
+ return (SET_ERROR(EIO));
+ }
+
+ if (dmu_objset_projectquota_enabled(zfsvfs->z_os)) {
+ err = sa_lookup(zp->z_sa_hdl, SA_ZPL_PROJID(zfsvfs),
+ &projid, 8);
+ if (err != 0 && err != ENOENT) {
+ zfs_znode_dmu_fini(zp);
+ zfs_znode_hold_exit(zfsvfs, zh);
+ return (SET_ERROR(err));
+ }
+ }
+
+ zp->z_projid = projid;
+ zp->z_mode = ZTOI(zp)->i_mode = mode;
+ zfs_uid_write(ZTOI(zp), z_uid);
+ zfs_gid_write(ZTOI(zp), z_gid);
+
+ ZFS_TIME_DECODE(&ZTOI(zp)->i_atime, atime);
+ ZFS_TIME_DECODE(&ZTOI(zp)->i_mtime, mtime);
+ ZFS_TIME_DECODE(&ZTOI(zp)->i_ctime, ctime);
+
+ if ((uint32_t)gen != ZTOI(zp)->i_generation) {
+ zfs_znode_dmu_fini(zp);
+ zfs_znode_hold_exit(zfsvfs, zh);
+ return (SET_ERROR(EIO));
+ }
+
+ set_nlink(ZTOI(zp), (uint32_t)links);
+ zfs_set_inode_flags(zp, ZTOI(zp));
+
+ zp->z_blksz = doi.doi_data_block_size;
+ zp->z_atime_dirty = B_FALSE;
+ zfs_znode_update_vfs(zp);
+
+ /*
+ * If the file has zero links, then it has been unlinked on the send
+ * side and it must be in the received unlinked set.
+ * We call zfs_znode_dmu_fini() now to prevent any accesses to the
+ * stale data and to prevent automatic removal of the file in
+ * zfs_zinactive(). The file will be removed either when it is removed
+ * on the send side and the next incremental stream is received or
+ * when the unlinked set gets processed.
+ */
+ zp->z_unlinked = (ZTOI(zp)->i_nlink == 0);
+ if (zp->z_unlinked)
+ zfs_znode_dmu_fini(zp);
+
+ zfs_znode_hold_exit(zfsvfs, zh);
+
+ return (0);
+}
+
+void
+zfs_znode_delete(znode_t *zp, dmu_tx_t *tx)
+{
+ zfsvfs_t *zfsvfs = ZTOZSB(zp);
+ objset_t *os = zfsvfs->z_os;
+ uint64_t obj = zp->z_id;
+ uint64_t acl_obj = zfs_external_acl(zp);
+ znode_hold_t *zh;
+
+ zh = zfs_znode_hold_enter(zfsvfs, obj);
+ if (acl_obj) {
+ VERIFY(!zp->z_is_sa);
+ VERIFY(0 == dmu_object_free(os, acl_obj, tx));
+ }
+ VERIFY(0 == dmu_object_free(os, obj, tx));
+ zfs_znode_dmu_fini(zp);
+ zfs_znode_hold_exit(zfsvfs, zh);
+}
+
+void
+zfs_zinactive(znode_t *zp)
+{
+ zfsvfs_t *zfsvfs = ZTOZSB(zp);
+ uint64_t z_id = zp->z_id;
+ znode_hold_t *zh;
+
+ ASSERT(zp->z_sa_hdl);
+
+ /*
+ * Don't allow a zfs_zget() while we're trying to release this znode.
+ */
+ zh = zfs_znode_hold_enter(zfsvfs, z_id);
+
+ mutex_enter(&zp->z_lock);
+
+ /*
+ * If this was the last reference to a file with no links, remove
+ * the file from the file system unless the file system is mounted
+ * read-only. That can happen, for example, if the file system was
+ * originally read-write, the file was opened, then unlinked and
+ * the file system was made read-only before the file was finally
+ * closed. The file will remain in the unlinked set.
+ */
+ if (zp->z_unlinked) {
+ ASSERT(!zfsvfs->z_issnap);
+ if (!zfs_is_readonly(zfsvfs) && !zfs_unlink_suspend_progress) {
+ mutex_exit(&zp->z_lock);
+ zfs_znode_hold_exit(zfsvfs, zh);
+ zfs_rmnode(zp);
+ return;
+ }
+ }
+
+ mutex_exit(&zp->z_lock);
+ zfs_znode_dmu_fini(zp);
+
+ zfs_znode_hold_exit(zfsvfs, zh);
+}
+
+#if defined(HAVE_INODE_TIMESPEC64_TIMES)
+#define zfs_compare_timespec timespec64_compare
+#else
+#define zfs_compare_timespec timespec_compare
+#endif
+
+/*
+ * Determine whether the znode's atime must be updated. The logic mostly
+ * duplicates the Linux kernel's relatime_need_update() functionality.
+ * This function is only called if the underlying filesystem actually has
+ * atime updates enabled.
+ */
+boolean_t
+zfs_relatime_need_update(const struct inode *ip)
+{
+ inode_timespec_t now;
+
+ gethrestime(&now);
+ /*
+ * In relatime mode, only update the atime if the previous atime
+ * is earlier than either the ctime or mtime or if at least a day
+ * has passed since the last update of atime.
+ */
+ if (zfs_compare_timespec(&ip->i_mtime, &ip->i_atime) >= 0)
+ return (B_TRUE);
+
+ if (zfs_compare_timespec(&ip->i_ctime, &ip->i_atime) >= 0)
+ return (B_TRUE);
+
+ if ((hrtime_t)now.tv_sec - (hrtime_t)ip->i_atime.tv_sec >= 24*60*60)
+ return (B_TRUE);
+
+ return (B_FALSE);
+}
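+
+/*
+ * Example (illustrative): if the atime is older than either the mtime or
+ * the ctime, or is more than 24 hours old, the function above returns
+ * B_TRUE and the caller refreshes the atime; otherwise it is left alone.
+ */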
+
+/*
+ * Prepare to update znode time stamps.
+ *
+ * IN: zp - znode requiring timestamp update
+ * flag - ATTR_MTIME, ATTR_CTIME flags
+ *
+ * OUT: zp - z_seq
+ * mtime - new mtime
+ * ctime - new ctime
+ *
+ * Note: We don't update atime here, because we rely on Linux VFS to do
+ * atime updating.
+ */
+void
+zfs_tstamp_update_setup(znode_t *zp, uint_t flag, uint64_t mtime[2],
+ uint64_t ctime[2])
+{
+ inode_timespec_t now;
+
+ gethrestime(&now);
+
+ zp->z_seq++;
+
+ if (flag & ATTR_MTIME) {
+ ZFS_TIME_ENCODE(&now, mtime);
+ ZFS_TIME_DECODE(&(ZTOI(zp)->i_mtime), mtime);
+ if (ZTOZSB(zp)->z_use_fuids) {
+ zp->z_pflags |= (ZFS_ARCHIVE |
+ ZFS_AV_MODIFIED);
+ }
+ }
+
+ if (flag & ATTR_CTIME) {
+ ZFS_TIME_ENCODE(&now, ctime);
+ ZFS_TIME_DECODE(&(ZTOI(zp)->i_ctime), ctime);
+ if (ZTOZSB(zp)->z_use_fuids)
+ zp->z_pflags |= ZFS_ARCHIVE;
+ }
+}
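+
+/*
+ * Sketch of how callers typically consume the function above (illustrative;
+ * transaction setup and error handling omitted):
+ *
+ *	uint64_t mtime[2], ctime[2];
+ *	sa_bulk_attr_t bulk[2];
+ *	int count = 0;
+ *
+ *	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, 16);
+ *	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, ctime, 16);
+ *	zfs_tstamp_update_setup(zp, ATTR_MTIME | ATTR_CTIME, mtime, ctime);
+ *	error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
+ */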
+
+/*
+ * Grow the block size for a file.
+ *
+ * IN: zp - znode of file to free data in.
+ * size - requested block size
+ * tx - open transaction.
+ *
+ * NOTE: this function assumes that the znode is write locked.
+ */
+void
+zfs_grow_blocksize(znode_t *zp, uint64_t size, dmu_tx_t *tx)
+{
+ int error;
+ u_longlong_t dummy;
+
+ if (size <= zp->z_blksz)
+ return;
+ /*
+ * If the file size is already greater than the current blocksize,
+ * we will not grow. If there is more than one block in a file,
+ * the blocksize cannot change.
+ */
+ if (zp->z_blksz && zp->z_size > zp->z_blksz)
+ return;
+
+ error = dmu_object_set_blocksize(ZTOZSB(zp)->z_os, zp->z_id,
+ size, 0, tx);
+
+ if (error == ENOTSUP)
+ return;
+ ASSERT0(error);
+
+ /* What blocksize did we actually get? */
+ dmu_object_size_from_db(sa_get_db(zp->z_sa_hdl), &zp->z_blksz, &dummy);
+}
+
+/*
+ * Increase the file length
+ *
+ * IN: zp - znode of file to free data in.
+ * end - new end-of-file
+ *
+ * RETURN: 0 on success, error code on failure
+ */
+static int
+zfs_extend(znode_t *zp, uint64_t end)
+{
+ zfsvfs_t *zfsvfs = ZTOZSB(zp);
+ dmu_tx_t *tx;
+ zfs_locked_range_t *lr;
+ uint64_t newblksz;
+ int error;
+
+ /*
+ * We will change zp_size; lock the whole file.
+ */
+ lr = zfs_rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER);
+
+ /*
+ * Nothing to do if file already at desired length.
+ */
+ if (end <= zp->z_size) {
+ zfs_rangelock_exit(lr);
+ return (0);
+ }
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+ zfs_sa_upgrade_txholds(tx, zp);
+ if (end > zp->z_blksz &&
+ (!ISP2(zp->z_blksz) || zp->z_blksz < zfsvfs->z_max_blksz)) {
+ /*
+ * We are growing the file past the current block size.
+ */
+ if (zp->z_blksz > ZTOZSB(zp)->z_max_blksz) {
+ /*
+ * File's blocksize is already larger than the
+ * "recordsize" property. Only let it grow to
+ * the next power of 2.
+ */
+ ASSERT(!ISP2(zp->z_blksz));
+ newblksz = MIN(end, 1 << highbit64(zp->z_blksz));
+ } else {
+ newblksz = MIN(end, ZTOZSB(zp)->z_max_blksz);
+ }
+ dmu_tx_hold_write(tx, zp->z_id, 0, newblksz);
+ } else {
+ newblksz = 0;
+ }
+
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ dmu_tx_abort(tx);
+ zfs_rangelock_exit(lr);
+ return (error);
+ }
+
+ if (newblksz)
+ zfs_grow_blocksize(zp, newblksz, tx);
+
+ zp->z_size = end;
+
+ VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(ZTOZSB(zp)),
+ &zp->z_size, sizeof (zp->z_size), tx));
+
+ zfs_rangelock_exit(lr);
+
+ dmu_tx_commit(tx);
+
+ return (0);
+}
+
+/*
+ * zfs_zero_partial_page - Modeled after update_pages() but
+ * with different arguments and semantics for use by zfs_freesp().
+ *
+ * Zeroes a piece of a single page cache entry for zp at offset
+ * start and length len.
+ *
+ * Caller must acquire a range lock on the file for the region
+ * being zeroed in order that the ARC and page cache stay in sync.
+ */
+static void
+zfs_zero_partial_page(znode_t *zp, uint64_t start, uint64_t len)
+{
+ struct address_space *mp = ZTOI(zp)->i_mapping;
+ struct page *pp;
+ int64_t off;
+ void *pb;
+
+ ASSERT((start & PAGE_MASK) == ((start + len - 1) & PAGE_MASK));
+
+ off = start & (PAGE_SIZE - 1);
+ start &= PAGE_MASK;
+
+ pp = find_lock_page(mp, start >> PAGE_SHIFT);
+ if (pp) {
+ if (mapping_writably_mapped(mp))
+ flush_dcache_page(pp);
+
+ pb = kmap(pp);
+ bzero(pb + off, len);
+ kunmap(pp);
+
+ if (mapping_writably_mapped(mp))
+ flush_dcache_page(pp);
+
+ mark_page_accessed(pp);
+ SetPageUptodate(pp);
+ ClearPageError(pp);
+ unlock_page(pp);
+ put_page(pp);
+ }
+}
+
+/*
+ * Free space in a file.
+ *
+ * IN: zp - znode of file to free data in.
+ * off - start of section to free.
+ * len - length of section to free.
+ *
+ * RETURN: 0 on success, error code on failure
+ */
+static int
+zfs_free_range(znode_t *zp, uint64_t off, uint64_t len)
+{
+ zfsvfs_t *zfsvfs = ZTOZSB(zp);
+ zfs_locked_range_t *lr;
+ int error;
+
+ /*
+ * Lock the range being freed.
+ */
+ lr = zfs_rangelock_enter(&zp->z_rangelock, off, len, RL_WRITER);
+
+ /*
+ * Nothing to do if file already at desired length.
+ */
+ if (off >= zp->z_size) {
+ zfs_rangelock_exit(lr);
+ return (0);
+ }
+
+ if (off + len > zp->z_size)
+ len = zp->z_size - off;
+
+ error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, off, len);
+
+ /*
+ * Zero partial page cache entries. This must be done under a
+ * range lock in order to keep the ARC and page cache in sync.
+ */
+ if (zp->z_is_mapped) {
+ loff_t first_page, last_page, page_len;
+ loff_t first_page_offset, last_page_offset;
+
+ /* first possible full page in hole */
+ first_page = (off + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ /* last page of hole */
+ last_page = (off + len) >> PAGE_SHIFT;
+
+ /* offset of first_page */
+ first_page_offset = first_page << PAGE_SHIFT;
+ /* offset of last_page */
+ last_page_offset = last_page << PAGE_SHIFT;
+
+ /* truncate whole pages */
+ if (last_page_offset > first_page_offset) {
+ truncate_inode_pages_range(ZTOI(zp)->i_mapping,
+ first_page_offset, last_page_offset - 1);
+ }
+
+ /* truncate sub-page ranges */
+ if (first_page > last_page) {
+ /* entire punched area within a single page */
+ zfs_zero_partial_page(zp, off, len);
+ } else {
+ /* beginning of punched area at the end of a page */
+ page_len = first_page_offset - off;
+ if (page_len > 0)
+ zfs_zero_partial_page(zp, off, page_len);
+
+ /* end of punched area at the beginning of a page */
+ page_len = off + len - last_page_offset;
+ if (page_len > 0)
+ zfs_zero_partial_page(zp, last_page_offset,
+ page_len);
+ }
+ }
+ zfs_rangelock_exit(lr);
+
+ return (error);
+}
+
+/*
+ * Truncate a file
+ *
+ * IN: zp - znode of file to free data in.
+ * end - new end-of-file.
+ *
+ * RETURN: 0 on success, error code on failure
+ */
+static int
+zfs_trunc(znode_t *zp, uint64_t end)
+{
+ zfsvfs_t *zfsvfs = ZTOZSB(zp);
+ dmu_tx_t *tx;
+ zfs_locked_range_t *lr;
+ int error;
+ sa_bulk_attr_t bulk[2];
+ int count = 0;
+
+ /*
+ * We will change zp_size, lock the whole file.
+ */
+ lr = zfs_rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER);
+
+ /*
+ * Nothing to do if file already at desired length.
+ */
+ if (end >= zp->z_size) {
+ zfs_rangelock_exit(lr);
+ return (0);
+ }
+
+ error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, end,
+ DMU_OBJECT_END);
+ if (error) {
+ zfs_rangelock_exit(lr);
+ return (error);
+ }
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+ zfs_sa_upgrade_txholds(tx, zp);
+ dmu_tx_mark_netfree(tx);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ dmu_tx_abort(tx);
+ zfs_rangelock_exit(lr);
+ return (error);
+ }
+
+ zp->z_size = end;
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs),
+ NULL, &zp->z_size, sizeof (zp->z_size));
+
+ if (end == 0) {
+ zp->z_pflags &= ~ZFS_SPARSE;
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
+ NULL, &zp->z_pflags, 8);
+ }
+ VERIFY(sa_bulk_update(zp->z_sa_hdl, bulk, count, tx) == 0);
+
+ dmu_tx_commit(tx);
+ zfs_rangelock_exit(lr);
+
+ return (0);
+}
+
+/*
+ * Free space in a file
+ *
+ * IN: zp - znode of file to free data in.
+ * off - start of range
+ * len - length of range to free (0 => free to EOF)
+ * flag - current file open mode flags.
+ * log - TRUE if this action should be logged
+ *
+ * RETURN: 0 on success, error code on failure
+ */
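+/*
+ * In short: a len of 0 truncates the file to "off" bytes via zfs_trunc(),
+ * while a non-zero len punches a hole of "len" bytes at "off" via
+ * zfs_free_range(), extending the file to off + len if that lies beyond
+ * the current EOF. When "off" alone is already past EOF the call reduces
+ * to zfs_extend(zp, off + len).
+ */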
+int
+zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log)
+{
+ dmu_tx_t *tx;
+ zfsvfs_t *zfsvfs = ZTOZSB(zp);
+ zilog_t *zilog = zfsvfs->z_log;
+ uint64_t mode;
+ uint64_t mtime[2], ctime[2];
+ sa_bulk_attr_t bulk[3];
+ int count = 0;
+ int error;
+
+ if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), &mode,
+ sizeof (mode))) != 0)
+ return (error);
+
+ if (off > zp->z_size) {
+ error = zfs_extend(zp, off+len);
+ if (error == 0 && log)
+ goto log;
+ goto out;
+ }
+
+ if (len == 0) {
+ error = zfs_trunc(zp, off);
+ } else {
+ if ((error = zfs_free_range(zp, off, len)) == 0 &&
+ off + len > zp->z_size)
+ error = zfs_extend(zp, off+len);
+ }
+ if (error || !log)
+ goto out;
+log:
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+ zfs_sa_upgrade_txholds(tx, zp);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ dmu_tx_abort(tx);
+ goto out;
+ }
+
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, ctime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
+ NULL, &zp->z_pflags, 8);
+ zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime);
+ error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
+ ASSERT(error == 0);
+
+ zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len);
+
+ dmu_tx_commit(tx);
+
+ zfs_znode_update_vfs(zp);
+ error = 0;
+
+out:
+ /*
+ * Truncate the page cache - for whole-file truncations use the
+ * purpose-built truncate_setsize() API. For hole punches the page
+ * cache is handled under a range lock in zfs_free_range.
+ */
+ if (len == 0)
+ truncate_setsize(ZTOI(zp), off);
+ return (error);
+}
+
+void
+zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
+{
+ struct super_block *sb;
+ zfsvfs_t *zfsvfs;
+ uint64_t moid, obj, sa_obj, version;
+ uint64_t sense = ZFS_CASE_SENSITIVE;
+ uint64_t norm = 0;
+ nvpair_t *elem;
+ int size;
+ int error;
+ int i;
+ znode_t *rootzp = NULL;
+ vattr_t vattr;
+ znode_t *zp;
+ zfs_acl_ids_t acl_ids;
+
+ /*
+ * First attempt to create master node.
+ */
+ /*
+ * In an empty objset, there are no blocks to read and thus
+ * there can be no i/o errors (which we assert below).
+ */
+ moid = MASTER_NODE_OBJ;
+ error = zap_create_claim(os, moid, DMU_OT_MASTER_NODE,
+ DMU_OT_NONE, 0, tx);
+ ASSERT(error == 0);
+
+ /*
+ * Set starting attributes.
+ */
+ version = zfs_zpl_version_map(spa_version(dmu_objset_spa(os)));
+ elem = NULL;
+ while ((elem = nvlist_next_nvpair(zplprops, elem)) != NULL) {
+ /* For the moment we expect all zpl props to be uint64_ts */
+ uint64_t val;
+ char *name;
+
+ ASSERT(nvpair_type(elem) == DATA_TYPE_UINT64);
+ VERIFY(nvpair_value_uint64(elem, &val) == 0);
+ name = nvpair_name(elem);
+ if (strcmp(name, zfs_prop_to_name(ZFS_PROP_VERSION)) == 0) {
+ if (val < version)
+ version = val;
+ } else {
+ error = zap_update(os, moid, name, 8, 1, &val, tx);
+ }
+ ASSERT(error == 0);
+ if (strcmp(name, zfs_prop_to_name(ZFS_PROP_NORMALIZE)) == 0)
+ norm = val;
+ else if (strcmp(name, zfs_prop_to_name(ZFS_PROP_CASE)) == 0)
+ sense = val;
+ }
+ ASSERT(version != 0);
+ error = zap_update(os, moid, ZPL_VERSION_STR, 8, 1, &version, tx);
+
+ /*
+ * Create zap object used for SA attribute registration
+ */
+
+ if (version >= ZPL_VERSION_SA) {
+ sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE,
+ DMU_OT_NONE, 0, tx);
+ error = zap_add(os, moid, ZFS_SA_ATTRS, 8, 1, &sa_obj, tx);
+ ASSERT(error == 0);
+ } else {
+ sa_obj = 0;
+ }
+ /*
+ * Create a delete queue.
+ */
+ obj = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx);
+
+ error = zap_add(os, moid, ZFS_UNLINKED_SET, 8, 1, &obj, tx);
+ ASSERT(error == 0);
+
+ /*
+ * Create root znode. Create minimal znode/inode/zfsvfs/sb
+ * to allow zfs_mknode to work.
+ */
+ vattr.va_mask = ATTR_MODE|ATTR_UID|ATTR_GID;
+ vattr.va_mode = S_IFDIR|0755;
+ vattr.va_uid = crgetuid(cr);
+ vattr.va_gid = crgetgid(cr);
+
+ rootzp = kmem_cache_alloc(znode_cache, KM_SLEEP);
+ rootzp->z_unlinked = B_FALSE;
+ rootzp->z_atime_dirty = B_FALSE;
+ rootzp->z_is_sa = USE_SA(version, os);
+ rootzp->z_pflags = 0;
+
+ zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
+ zfsvfs->z_os = os;
+ zfsvfs->z_parent = zfsvfs;
+ zfsvfs->z_version = version;
+ zfsvfs->z_use_fuids = USE_FUIDS(version, os);
+ zfsvfs->z_use_sa = USE_SA(version, os);
+ zfsvfs->z_norm = norm;
+
+ sb = kmem_zalloc(sizeof (struct super_block), KM_SLEEP);
+ sb->s_fs_info = zfsvfs;
+
+ ZTOI(rootzp)->i_sb = sb;
+
+ error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END,
+ &zfsvfs->z_attr_table);
+
+ ASSERT(error == 0);
+
+ /*
+ * Fold case on file systems that are always or sometimes case
+ * insensitive.
+ */
+ if (sense == ZFS_CASE_INSENSITIVE || sense == ZFS_CASE_MIXED)
+ zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER;
+
+ mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
+ list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
+ offsetof(znode_t, z_link_node));
+
+ size = MIN(1 << (highbit64(zfs_object_mutex_size)-1), ZFS_OBJ_MTX_MAX);
+ zfsvfs->z_hold_size = size;
+ zfsvfs->z_hold_trees = vmem_zalloc(sizeof (avl_tree_t) * size,
+ KM_SLEEP);
+ zfsvfs->z_hold_locks = vmem_zalloc(sizeof (kmutex_t) * size, KM_SLEEP);
+ for (i = 0; i != size; i++) {
+ avl_create(&zfsvfs->z_hold_trees[i], zfs_znode_hold_compare,
+ sizeof (znode_hold_t), offsetof(znode_hold_t, zh_node));
+ mutex_init(&zfsvfs->z_hold_locks[i], NULL, MUTEX_DEFAULT, NULL);
+ }
+
+ VERIFY(0 == zfs_acl_ids_create(rootzp, IS_ROOT_NODE, &vattr,
+ cr, NULL, &acl_ids));
+ zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, &acl_ids);
+ ASSERT3P(zp, ==, rootzp);
+ error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &rootzp->z_id, tx);
+ ASSERT(error == 0);
+ zfs_acl_ids_free(&acl_ids);
+
+ atomic_set(&ZTOI(rootzp)->i_count, 0);
+ sa_handle_destroy(rootzp->z_sa_hdl);
+ kmem_cache_free(znode_cache, rootzp);
+
+ for (i = 0; i != size; i++) {
+ avl_destroy(&zfsvfs->z_hold_trees[i]);
+ mutex_destroy(&zfsvfs->z_hold_locks[i]);
+ }
+
+ mutex_destroy(&zfsvfs->z_znodes_lock);
+
+ vmem_free(zfsvfs->z_hold_trees, sizeof (avl_tree_t) * size);
+ vmem_free(zfsvfs->z_hold_locks, sizeof (kmutex_t) * size);
+ kmem_free(sb, sizeof (struct super_block));
+ kmem_free(zfsvfs, sizeof (zfsvfs_t));
+}
+#endif /* _KERNEL */
+
+static int
+zfs_sa_setup(objset_t *osp, sa_attr_type_t **sa_table)
+{
+ uint64_t sa_obj = 0;
+ int error;
+
+ error = zap_lookup(osp, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, &sa_obj);
+ if (error != 0 && error != ENOENT)
+ return (error);
+
+ error = sa_setup(osp, sa_obj, zfs_attr_table, ZPL_END, sa_table);
+ return (error);
+}
+
+static int
+zfs_grab_sa_handle(objset_t *osp, uint64_t obj, sa_handle_t **hdlp,
+ dmu_buf_t **db, void *tag)
+{
+ dmu_object_info_t doi;
+ int error;
+
+ if ((error = sa_buf_hold(osp, obj, tag, db)) != 0)
+ return (error);
+
+ dmu_object_info_from_db(*db, &doi);
+ if ((doi.doi_bonus_type != DMU_OT_SA &&
+ doi.doi_bonus_type != DMU_OT_ZNODE) ||
+ (doi.doi_bonus_type == DMU_OT_ZNODE &&
+ doi.doi_bonus_size < sizeof (znode_phys_t))) {
+ sa_buf_rele(*db, tag);
+ return (SET_ERROR(ENOTSUP));
+ }
+
+ error = sa_handle_get(osp, obj, NULL, SA_HDL_PRIVATE, hdlp);
+ if (error != 0) {
+ sa_buf_rele(*db, tag);
+ return (error);
+ }
+
+ return (0);
+}
+
+static void
+zfs_release_sa_handle(sa_handle_t *hdl, dmu_buf_t *db, void *tag)
+{
+ sa_handle_destroy(hdl);
+ sa_buf_rele(db, tag);
+}
+
+/*
+ * Given an object number, return its parent object number and whether
+ * or not the object is an extended attribute directory.
+ */
+static int
+zfs_obj_to_pobj(objset_t *osp, sa_handle_t *hdl, sa_attr_type_t *sa_table,
+ uint64_t *pobjp, int *is_xattrdir)
+{
+ uint64_t parent;
+ uint64_t pflags;
+ uint64_t mode;
+ uint64_t parent_mode;
+ sa_bulk_attr_t bulk[3];
+ sa_handle_t *sa_hdl;
+ dmu_buf_t *sa_db;
+ int count = 0;
+ int error;
+
+ SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_PARENT], NULL,
+ &parent, sizeof (parent));
+ SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_FLAGS], NULL,
+ &pflags, sizeof (pflags));
+ SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL,
+ &mode, sizeof (mode));
+
+ if ((error = sa_bulk_lookup(hdl, bulk, count)) != 0)
+ return (error);
+
+ /*
+ * When a link is removed its parent pointer is not changed and will
+ * be invalid. There are two cases where a link is removed but the
+ * file stays around: when it goes to the delete queue and when there
+ * are additional links.
+ */
+ error = zfs_grab_sa_handle(osp, parent, &sa_hdl, &sa_db, FTAG);
+ if (error != 0)
+ return (error);
+
+ error = sa_lookup(sa_hdl, ZPL_MODE, &parent_mode, sizeof (parent_mode));
+ zfs_release_sa_handle(sa_hdl, sa_db, FTAG);
+ if (error != 0)
+ return (error);
+
+ *is_xattrdir = ((pflags & ZFS_XATTR) != 0) && S_ISDIR(mode);
+
+ /*
+ * Extended attributes can be applied to files, directories, etc.
+ * Otherwise the parent must be a directory.
+ */
+ if (!*is_xattrdir && !S_ISDIR(parent_mode))
+ return (SET_ERROR(EINVAL));
+
+ *pobjp = parent;
+
+ return (0);
+}
+
+/*
+ * Given an object number, return some zpl level statistics
+ */
+static int
+zfs_obj_to_stats_impl(sa_handle_t *hdl, sa_attr_type_t *sa_table,
+ zfs_stat_t *sb)
+{
+ sa_bulk_attr_t bulk[4];
+ int count = 0;
+
+ SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL,
+ &sb->zs_mode, sizeof (sb->zs_mode));
+ SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_GEN], NULL,
+ &sb->zs_gen, sizeof (sb->zs_gen));
+ SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_LINKS], NULL,
+ &sb->zs_links, sizeof (sb->zs_links));
+ SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_CTIME], NULL,
+ &sb->zs_ctime, sizeof (sb->zs_ctime));
+
+ return (sa_bulk_lookup(hdl, bulk, count));
+}
+
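+/*
+ * Given an object number, construct its path by walking the parent
+ * pointers back to the root. Components are prepended at the tail end
+ * of "buf" and the finished string is moved to the front of the buffer
+ * before returning. Objects found on the delete queue return ESTALE.
+ */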
+static int
+zfs_obj_to_path_impl(objset_t *osp, uint64_t obj, sa_handle_t *hdl,
+ sa_attr_type_t *sa_table, char *buf, int len)
+{
+ sa_handle_t *sa_hdl;
+ sa_handle_t *prevhdl = NULL;
+ dmu_buf_t *prevdb = NULL;
+ dmu_buf_t *sa_db = NULL;
+ char *path = buf + len - 1;
+ int error;
+
+ *path = '\0';
+ sa_hdl = hdl;
+
+ uint64_t deleteq_obj;
+ VERIFY0(zap_lookup(osp, MASTER_NODE_OBJ,
+ ZFS_UNLINKED_SET, sizeof (uint64_t), 1, &deleteq_obj));
+ error = zap_lookup_int(osp, deleteq_obj, obj);
+ if (error == 0) {
+ return (ESTALE);
+ } else if (error != ENOENT) {
+ return (error);
+ }
+ error = 0;
+
+ for (;;) {
+ uint64_t pobj = 0;
+ char component[MAXNAMELEN + 2];
+ size_t complen;
+ int is_xattrdir = 0;
+
+ if (prevdb) {
+ ASSERT(prevhdl != NULL);
+ zfs_release_sa_handle(prevhdl, prevdb, FTAG);
+ }
+
+ if ((error = zfs_obj_to_pobj(osp, sa_hdl, sa_table, &pobj,
+ &is_xattrdir)) != 0)
+ break;
+
+ if (pobj == obj) {
+ if (path[0] != '/')
+ *--path = '/';
+ break;
+ }
+
+ component[0] = '/';
+ if (is_xattrdir) {
+ (void) sprintf(component + 1, "<xattrdir>");
+ } else {
+ error = zap_value_search(osp, pobj, obj,
+ ZFS_DIRENT_OBJ(-1ULL), component + 1);
+ if (error != 0)
+ break;
+ }
+
+ complen = strlen(component);
+ path -= complen;
+ ASSERT(path >= buf);
+ bcopy(component, path, complen);
+ obj = pobj;
+
+ if (sa_hdl != hdl) {
+ prevhdl = sa_hdl;
+ prevdb = sa_db;
+ }
+ error = zfs_grab_sa_handle(osp, obj, &sa_hdl, &sa_db, FTAG);
+ if (error != 0) {
+ sa_hdl = prevhdl;
+ sa_db = prevdb;
+ break;
+ }
+ }
+
+ if (sa_hdl != NULL && sa_hdl != hdl) {
+ ASSERT(sa_db != NULL);
+ zfs_release_sa_handle(sa_hdl, sa_db, FTAG);
+ }
+
+ if (error == 0)
+ (void) memmove(buf, path, buf + len - path);
+
+ return (error);
+}
+
+int
+zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len)
+{
+ sa_attr_type_t *sa_table;
+ sa_handle_t *hdl;
+ dmu_buf_t *db;
+ int error;
+
+ error = zfs_sa_setup(osp, &sa_table);
+ if (error != 0)
+ return (error);
+
+ error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG);
+ if (error != 0)
+ return (error);
+
+ error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table, buf, len);
+
+ zfs_release_sa_handle(hdl, db, FTAG);
+ return (error);
+}
+
+int
+zfs_obj_to_stats(objset_t *osp, uint64_t obj, zfs_stat_t *sb,
+ char *buf, int len)
+{
+ char *path = buf + len - 1;
+ sa_attr_type_t *sa_table;
+ sa_handle_t *hdl;
+ dmu_buf_t *db;
+ int error;
+
+ *path = '\0';
+
+ error = zfs_sa_setup(osp, &sa_table);
+ if (error != 0)
+ return (error);
+
+ error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG);
+ if (error != 0)
+ return (error);
+
+ error = zfs_obj_to_stats_impl(hdl, sa_table, sb);
+ if (error != 0) {
+ zfs_release_sa_handle(hdl, db, FTAG);
+ return (error);
+ }
+
+ error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table, buf, len);
+
+ zfs_release_sa_handle(hdl, db, FTAG);
+ return (error);
+}
+
+#if defined(_KERNEL)
+EXPORT_SYMBOL(zfs_create_fs);
+EXPORT_SYMBOL(zfs_obj_to_path);
+
+/* CSTYLED */
+module_param(zfs_object_mutex_size, uint, 0644);
+MODULE_PARM_DESC(zfs_object_mutex_size, "Size of znode hold array");
+module_param(zfs_unlink_suspend_progress, int, 0644);
+MODULE_PARM_DESC(zfs_unlink_suspend_progress, "Set to prevent async unlinks "
+"(debug - leaks space into the unlinked set)");
+#endif
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zio_crypt.c b/sys/contrib/openzfs/module/os/linux/zfs/zio_crypt.c
new file mode 100644
index 000000000000..284ca706ede5
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zio_crypt.c
@@ -0,0 +1,2049 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2017, Datto, Inc. All rights reserved.
+ */
+
+#include <sys/zio_crypt.h>
+#include <sys/dmu.h>
+#include <sys/dmu_objset.h>
+#include <sys/dnode.h>
+#include <sys/fs/zfs.h>
+#include <sys/zio.h>
+#include <sys/zil.h>
+#include <sys/sha2.h>
+#include <sys/hkdf.h>
+#include <sys/qat.h>
+
+/*
+ * This file is responsible for handling all of the details of generating
+ * encryption parameters and performing encryption and authentication.
+ *
+ * BLOCK ENCRYPTION PARAMETERS:
+ * Encryption / Authentication Algorithm Suite (crypt):
+ * The encryption algorithm, mode, and key length we are going to use. We
+ * currently support AES in either GCM or CCM modes with 128, 192, and 256 bit
+ * keys. All authentication is currently done with SHA512-HMAC.
+ *
+ * Plaintext:
+ * The unencrypted data that we want to encrypt.
+ *
+ * Initialization Vector (IV):
+ * An initialization vector for the encryption algorithms. This is used to
+ * "tweak" the encryption algorithms so that two blocks of the same data are
+ * encrypted into different ciphertext outputs, thus obfuscating block patterns.
+ * The supported encryption modes (AES-GCM and AES-CCM) require that an IV is
+ * never reused with the same encryption key. This value is stored unencrypted
+ * and must simply be provided to the decryption function. We use a 96 bit IV
+ * (as recommended by NIST) for all block encryption. For non-dedup blocks we
+ * derive the IV randomly. The first 64 bits of the IV are stored in the second
+ * word of DVA[2] and the remaining 32 bits are stored in the upper 32 bits of
+ * blk_fill. This is safe because encrypted blocks can't use the upper 32 bits
+ * of blk_fill. We only encrypt level 0 blocks, which normally have a fill count
+ * of 1. The only exception is for DMU_OT_DNODE objects, where the fill count of
+ * level 0 blocks is the number of allocated dnodes in that block. The on-disk
+ * format supports at most 2^15 slots per L0 dnode block, because the maximum
+ * block size is 16MB (2^24). In either case, for level 0 blocks this number
+ * will still be smaller than UINT32_MAX so it is safe to store the IV in the
+ * top 32 bits of blk_fill, while leaving the bottom 32 bits of the fill count
+ * for the dnode code.
+ *
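+ * Laid out on disk, the salt and IV of an encrypted non-dedup block end
+ * up as (see zio_crypt_encode_params_bp()):
+ *
+ *	blk_dva[2].dva_word[0]      - 64 bit salt
+ *	blk_dva[2].dva_word[1]      - IV bytes 0-7
+ *	upper 32 bits of blk_fill   - IV bytes 8-11 ("IV2")
+ *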
+ * Master key:
+ * This is the most important secret data of an encrypted dataset. It is used
+ * along with the salt to generate the actual encryption keys via HKDF. We
+ * do not use the master key to directly encrypt any data because there are
+ * theoretical limits on how much data can actually be safely encrypted with
+ * any encryption mode. The master key is stored encrypted on disk with the
+ * user's wrapping key. Its length is determined by the encryption algorithm.
+ * For details on how this is stored see the block comment in dsl_crypt.c
+ *
+ * Salt:
+ * Used as an input to the HKDF function, along with the master key. We use a
+ * 64 bit salt, stored unencrypted in the first word of DVA[2]. Any given salt
+ * can be used for encrypting many blocks, so we cache the current salt and the
+ * associated derived key in zio_crypt_t so we do not need to derive it again
+ * needlessly.
+ *
+ * Encryption Key:
+ * A secret binary key, generated from an HKDF function used to encrypt and
+ * decrypt data.
+ *
+ * Message Authentication Code (MAC)
+ * The MAC is an output of authenticated encryption modes such as AES-GCM and
+ * AES-CCM. Its purpose is to ensure that an attacker cannot modify encrypted
+ * data on disk and return garbage to the application. Effectively, it is a
+ * checksum that can not be reproduced by an attacker. We store the MAC in the
+ * second 128 bits of blk_cksum, leaving the first 128 bits for a truncated
+ * regular checksum of the ciphertext which can be used for scrubbing.
+ *
+ * OBJECT AUTHENTICATION:
+ * Some object types, such as DMU_OT_MASTER_NODE cannot be encrypted because
+ * they contain some info that always needs to be readable. To prevent this
+ * data from being altered, we authenticate this data using SHA512-HMAC. This
+ * will produce a MAC (similar to the one produced via encryption) which can
+ * be used to verify the object was not modified. HMACs do not require key
+ * rotation or IVs, so we can keep up to the full 3 copies of authenticated
+ * data.
+ *
+ * ZIL ENCRYPTION:
+ * ZIL blocks have their bp written to disk ahead of the associated data, so we
+ * cannot store the MAC there as we normally do. For these blocks the MAC is
+ * stored in the embedded checksum within the zil_chain_t header. The salt and
+ * IV are generated for the block on bp allocation instead of at encryption
+ * time. In addition, ZIL blocks have some pieces that must be left in plaintext
+ * for claiming even though all of the sensitive user data still needs to be
+ * encrypted. The function zio_crypt_init_uios_zil() handles parsing which
+ * pieces of the block need to be encrypted. All data that is not encrypted is
+ * authenticated using the AAD mechanisms that the supported encryption modes
+ * provide for. In order to preserve the semantics of the ZIL for encrypted
+ * datasets, the ZIL is not protected at the objset level as described below.
+ *
+ * DNODE ENCRYPTION:
+ * Similarly to ZIL blocks, the core part of each dnode_phys_t needs to be left
+ * in plaintext for scrubbing and claiming, but the bonus buffers might contain
+ * sensitive user data. The function zio_crypt_init_uios_dnode() handles parsing
+ * which pieces of the block need to be encrypted. For more details about
+ * dnode authentication and encryption, see zio_crypt_init_uios_dnode().
+ *
+ * OBJECT SET AUTHENTICATION:
+ * Up to this point, everything we have encrypted and authenticated has been
+ * at level 0 (or -2 for the ZIL). If we did not do any further work the
+ * on-disk format would be susceptible to attacks that deleted or rearranged
+ * the order of level 0 blocks. Ideally, the cleanest solution would be to
+ * maintain a tree of authentication MACs going up the bp tree. However, this
+ * presents a problem for raw sends. Send files do not send information about
+ * indirect blocks so there would be no convenient way to transfer the MACs and
+ * they cannot be recalculated on the receive side without the master key which
+ * would defeat one of the purposes of raw sends in the first place. Instead,
+ * for the indirect levels of the bp tree, we use a regular SHA512 of the MACs
+ * from the level below. We also include some portable fields from blk_prop such
+ * as the lsize and compression algorithm to prevent the data from being
+ * misinterpreted.
+ *
+ * At the objset level, we maintain 2 separate 256 bit MACs in the
+ * objset_phys_t. The first one is "portable" and is the logical root of the
+ * MAC tree maintained in the metadnode's bps. The second is "local" and is
+ * used as the root MAC for the user accounting objects, which are also not
+ * transferred via "zfs send". The portable MAC is sent in the DRR_BEGIN payload
+ * of the send file. The useraccounting code ensures that the useraccounting
+ * info is not present upon a receive, so the local MAC can simply be cleared
+ * out at that time. For more info about objset_phys_t authentication, see
+ * zio_crypt_do_objset_hmacs().
+ *
+ * CONSIDERATIONS FOR DEDUP:
+ * In order for dedup to work, blocks that we want to dedup with one another
+ * need to use the same IV and encryption key, so that they will have the same
+ * ciphertext. Normally, one should never reuse an IV with the same encryption
+ * key or else AES-GCM and AES-CCM can both actually leak the plaintext of both
+ * blocks. In this case, however, since we are using the same plaintext as
+ * well, all that we end up with is a duplicate of the original ciphertext we
+ * already had. As a result, an attacker with read access to the raw disk will
+ * be able to tell which blocks are the same but this information is given away
+ * by dedup anyway. In order to get the same IVs and encryption keys for
+ * equivalent blocks of data we use an HMAC of the plaintext. We use an HMAC
+ * here so that a reproducible checksum of the plaintext is never available to
+ * the attacker. The HMAC key is kept alongside the master key, encrypted on
+ * disk. The first 64 bits of the HMAC are used in place of the random salt, and
+ * the next 96 bits are used as the IV. As a result of this mechanism, dedup
+ * will only work within a clone family since encrypted dedup requires use of
+ * the same master and HMAC keys.
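+ *
+ * Concretely, the 64 byte HMAC-SHA512 of the plaintext is split as
+ * bytes 0-7 -> salt and bytes 8-19 -> IV (see
+ * zio_crypt_generate_iv_salt_dedup()), so two identical plaintexts
+ * encrypted under the same master and HMAC keys always produce
+ * identical ciphertexts.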
+ */
+
+/*
+ * After encrypting many blocks with the same key we may start to run up
+ * against the theoretical limits of how much data can securely be encrypted
+ * with a single key using the supported encryption modes. The most obvious
+ * limitation is that our risk of generating 2 equivalent 96 bit IVs increases
+ * the more IVs we generate (which both GCM and CCM modes strictly forbid).
+ * This risk actually grows surprisingly quickly over time according to the
+ * Birthday Problem. With a total IV space of 2^(96 bits), and assuming we have
+ * generated n IVs with a cryptographically secure RNG, the approximate
+ * probability p(n) of a collision is given as:
+ *
+ * p(n) ~= 1 - e^(-n*(n-1)/(2*(2^96)))
+ *
+ * [http://www.math.cornell.edu/~mec/2008-2009/TianyiZheng/Birthday.html]
+ *
+ * Assuming that we want to ensure that p(n) never goes over 1 / 1 trillion
+ * we must not write more than 398,065,730 blocks with the same encryption key.
+ * Therefore, we rotate our keys after 400,000,000 blocks have been written by
+ * generating a new random 64 bit salt for our HKDF encryption key generation
+ * function.
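+ *
+ * As a worked check with these numbers: n*(n-1)/(2*(2^96)) is roughly
+ * 1.0e-12 for n = 398,065,730, so 1 - e^(-1.0e-12) keeps the collision
+ * probability at about one in a trillion.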
+ */
+#define ZFS_KEY_MAX_SALT_USES_DEFAULT 400000000
+#define ZFS_CURRENT_MAX_SALT_USES \
+ (MIN(zfs_key_max_salt_uses, ZFS_KEY_MAX_SALT_USES_DEFAULT))
+unsigned long zfs_key_max_salt_uses = ZFS_KEY_MAX_SALT_USES_DEFAULT;
+
+typedef struct blkptr_auth_buf {
+ uint64_t bab_prop; /* blk_prop - portable mask */
+ uint8_t bab_mac[ZIO_DATA_MAC_LEN]; /* MAC from blk_cksum */
+ uint64_t bab_pad; /* reserved for future use */
+} blkptr_auth_buf_t;
+
+zio_crypt_info_t zio_crypt_table[ZIO_CRYPT_FUNCTIONS] = {
+ {"", ZC_TYPE_NONE, 0, "inherit"},
+ {"", ZC_TYPE_NONE, 0, "on"},
+ {"", ZC_TYPE_NONE, 0, "off"},
+ {SUN_CKM_AES_CCM, ZC_TYPE_CCM, 16, "aes-128-ccm"},
+ {SUN_CKM_AES_CCM, ZC_TYPE_CCM, 24, "aes-192-ccm"},
+ {SUN_CKM_AES_CCM, ZC_TYPE_CCM, 32, "aes-256-ccm"},
+ {SUN_CKM_AES_GCM, ZC_TYPE_GCM, 16, "aes-128-gcm"},
+ {SUN_CKM_AES_GCM, ZC_TYPE_GCM, 24, "aes-192-gcm"},
+ {SUN_CKM_AES_GCM, ZC_TYPE_GCM, 32, "aes-256-gcm"}
+};
+
+void
+zio_crypt_key_destroy(zio_crypt_key_t *key)
+{
+ rw_destroy(&key->zk_salt_lock);
+
+ /* free crypto templates */
+ crypto_destroy_ctx_template(key->zk_current_tmpl);
+ crypto_destroy_ctx_template(key->zk_hmac_tmpl);
+
+ /* zero out sensitive data */
+ bzero(key, sizeof (zio_crypt_key_t));
+}
+
+int
+zio_crypt_key_init(uint64_t crypt, zio_crypt_key_t *key)
+{
+ int ret;
+ crypto_mechanism_t mech;
+ uint_t keydata_len;
+
+ ASSERT(key != NULL);
+ ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS);
+
+ keydata_len = zio_crypt_table[crypt].ci_keylen;
+ bzero(key, sizeof (zio_crypt_key_t));
+
+ /* fill keydata buffers and salt with random data */
+ ret = random_get_bytes((uint8_t *)&key->zk_guid, sizeof (uint64_t));
+ if (ret != 0)
+ goto error;
+
+ ret = random_get_bytes(key->zk_master_keydata, keydata_len);
+ if (ret != 0)
+ goto error;
+
+ ret = random_get_bytes(key->zk_hmac_keydata, SHA512_HMAC_KEYLEN);
+ if (ret != 0)
+ goto error;
+
+ ret = random_get_bytes(key->zk_salt, ZIO_DATA_SALT_LEN);
+ if (ret != 0)
+ goto error;
+
+ /* derive the current key from the master key */
+ ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0,
+ key->zk_salt, ZIO_DATA_SALT_LEN, key->zk_current_keydata,
+ keydata_len);
+ if (ret != 0)
+ goto error;
+
+ /* initialize keys for the ICP */
+ key->zk_current_key.ck_format = CRYPTO_KEY_RAW;
+ key->zk_current_key.ck_data = key->zk_current_keydata;
+ key->zk_current_key.ck_length = CRYPTO_BYTES2BITS(keydata_len);
+
+ key->zk_hmac_key.ck_format = CRYPTO_KEY_RAW;
+ key->zk_hmac_key.ck_data = key->zk_hmac_keydata;
+ key->zk_hmac_key.ck_length = CRYPTO_BYTES2BITS(SHA512_HMAC_KEYLEN);
+
+ /*
+ * Initialize the crypto templates. It's ok if this fails because
+ * this is just an optimization.
+ */
+ mech.cm_type = crypto_mech2id(zio_crypt_table[crypt].ci_mechname);
+ ret = crypto_create_ctx_template(&mech, &key->zk_current_key,
+ &key->zk_current_tmpl, KM_SLEEP);
+ if (ret != CRYPTO_SUCCESS)
+ key->zk_current_tmpl = NULL;
+
+ mech.cm_type = crypto_mech2id(SUN_CKM_SHA512_HMAC);
+ ret = crypto_create_ctx_template(&mech, &key->zk_hmac_key,
+ &key->zk_hmac_tmpl, KM_SLEEP);
+ if (ret != CRYPTO_SUCCESS)
+ key->zk_hmac_tmpl = NULL;
+
+ key->zk_crypt = crypt;
+ key->zk_version = ZIO_CRYPT_KEY_CURRENT_VERSION;
+ key->zk_salt_count = 0;
+ rw_init(&key->zk_salt_lock, NULL, RW_DEFAULT, NULL);
+
+ return (0);
+
+error:
+ zio_crypt_key_destroy(key);
+ return (ret);
+}
+
+static int
+zio_crypt_key_change_salt(zio_crypt_key_t *key)
+{
+ int ret = 0;
+ uint8_t salt[ZIO_DATA_SALT_LEN];
+ crypto_mechanism_t mech;
+ uint_t keydata_len = zio_crypt_table[key->zk_crypt].ci_keylen;
+
+ /* generate a new salt */
+ ret = random_get_bytes(salt, ZIO_DATA_SALT_LEN);
+ if (ret != 0)
+ goto error;
+
+ rw_enter(&key->zk_salt_lock, RW_WRITER);
+
+ /* someone beat us to the salt rotation, just unlock and return */
+ if (key->zk_salt_count < ZFS_CURRENT_MAX_SALT_USES)
+ goto out_unlock;
+
+ /* derive the current key from the master key and the new salt */
+ ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0,
+ salt, ZIO_DATA_SALT_LEN, key->zk_current_keydata, keydata_len);
+ if (ret != 0)
+ goto out_unlock;
+
+ /* assign the salt and reset the usage count */
+ bcopy(salt, key->zk_salt, ZIO_DATA_SALT_LEN);
+ key->zk_salt_count = 0;
+
+ /* destroy the old context template and create the new one */
+ crypto_destroy_ctx_template(key->zk_current_tmpl);
+ ret = crypto_create_ctx_template(&mech, &key->zk_current_key,
+ &key->zk_current_tmpl, KM_SLEEP);
+ if (ret != CRYPTO_SUCCESS)
+ key->zk_current_tmpl = NULL;
+
+ rw_exit(&key->zk_salt_lock);
+
+ return (0);
+
+out_unlock:
+ rw_exit(&key->zk_salt_lock);
+error:
+ return (ret);
+}
+
+/* See comment above zfs_key_max_salt_uses definition for details */
+int
+zio_crypt_key_get_salt(zio_crypt_key_t *key, uint8_t *salt)
+{
+ int ret;
+ boolean_t salt_change;
+
+ rw_enter(&key->zk_salt_lock, RW_READER);
+
+ bcopy(key->zk_salt, salt, ZIO_DATA_SALT_LEN);
+ salt_change = (atomic_inc_64_nv(&key->zk_salt_count) >=
+ ZFS_CURRENT_MAX_SALT_USES);
+
+ rw_exit(&key->zk_salt_lock);
+
+ if (salt_change) {
+ ret = zio_crypt_key_change_salt(key);
+ if (ret != 0)
+ goto error;
+ }
+
+ return (0);
+
+error:
+ return (ret);
+}
+
+/*
+ * This function handles all encryption and decryption in zfs. When
+ * encrypting it expects puio to reference the plaintext and cuio to
+ * reference the ciphertext. cuio must have enough space for the
+ * ciphertext + room for a MAC. datalen should be the length of the
+ * plaintext / ciphertext alone.
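+ *
+ * A minimal layout sketch for a single ciphertext buffer (variable
+ * names here are illustrative only):
+ *
+ *	iovec_t cv[2];
+ *	cv[0].iov_base = cipherbuf;
+ *	cv[0].iov_len = datalen;
+ *	cv[1].iov_base = macbuf;
+ *	cv[1].iov_len = ZIO_DATA_MAC_LEN;
+ *	cuio->uio_iov = cv;
+ *	cuio->uio_iovcnt = 2;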
+ */
+static int
+zio_do_crypt_uio(boolean_t encrypt, uint64_t crypt, crypto_key_t *key,
+ crypto_ctx_template_t tmpl, uint8_t *ivbuf, uint_t datalen,
+ zfs_uio_t *puio, zfs_uio_t *cuio, uint8_t *authbuf, uint_t auth_len)
+{
+ int ret;
+ crypto_data_t plaindata, cipherdata;
+ CK_AES_CCM_PARAMS ccmp;
+ CK_AES_GCM_PARAMS gcmp;
+ crypto_mechanism_t mech;
+ zio_crypt_info_t crypt_info;
+ uint_t plain_full_len, maclen;
+
+ ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS);
+ ASSERT3U(key->ck_format, ==, CRYPTO_KEY_RAW);
+
+ /* lookup the encryption info */
+ crypt_info = zio_crypt_table[crypt];
+
+ /* the mac will always be the last iovec_t in the cipher uio */
+ maclen = cuio->uio_iov[cuio->uio_iovcnt - 1].iov_len;
+
+ ASSERT(maclen <= ZIO_DATA_MAC_LEN);
+
+ /* setup encryption mechanism (same as crypt) */
+ mech.cm_type = crypto_mech2id(crypt_info.ci_mechname);
+
+ /*
+ * Strangely, the ICP requires that plain_full_len include
+ * the MAC length when decrypting, even though the UIO does not
+ * need to have the extra space allocated.
+ */
+ if (encrypt) {
+ plain_full_len = datalen;
+ } else {
+ plain_full_len = datalen + maclen;
+ }
+
+ /*
+ * setup encryption params (currently only AES CCM and AES GCM
+ * are supported)
+ */
+ if (crypt_info.ci_crypt_type == ZC_TYPE_CCM) {
+ ccmp.ulNonceSize = ZIO_DATA_IV_LEN;
+ ccmp.ulAuthDataSize = auth_len;
+ ccmp.authData = authbuf;
+ ccmp.ulMACSize = maclen;
+ ccmp.nonce = ivbuf;
+ ccmp.ulDataSize = plain_full_len;
+
+ mech.cm_param = (char *)(&ccmp);
+ mech.cm_param_len = sizeof (CK_AES_CCM_PARAMS);
+ } else {
+ gcmp.ulIvLen = ZIO_DATA_IV_LEN;
+ gcmp.ulIvBits = CRYPTO_BYTES2BITS(ZIO_DATA_IV_LEN);
+ gcmp.ulAADLen = auth_len;
+ gcmp.pAAD = authbuf;
+ gcmp.ulTagBits = CRYPTO_BYTES2BITS(maclen);
+ gcmp.pIv = ivbuf;
+
+ mech.cm_param = (char *)(&gcmp);
+ mech.cm_param_len = sizeof (CK_AES_GCM_PARAMS);
+ }
+
+ /* populate the cipher and plain data structs. */
+ plaindata.cd_format = CRYPTO_DATA_UIO;
+ plaindata.cd_offset = 0;
+ plaindata.cd_uio = puio;
+ plaindata.cd_miscdata = NULL;
+ plaindata.cd_length = plain_full_len;
+
+ cipherdata.cd_format = CRYPTO_DATA_UIO;
+ cipherdata.cd_offset = 0;
+ cipherdata.cd_uio = cuio;
+ cipherdata.cd_miscdata = NULL;
+ cipherdata.cd_length = datalen + maclen;
+
+ /* perform the actual encryption */
+ if (encrypt) {
+ ret = crypto_encrypt(&mech, &plaindata, key, tmpl, &cipherdata,
+ NULL);
+ if (ret != CRYPTO_SUCCESS) {
+ ret = SET_ERROR(EIO);
+ goto error;
+ }
+ } else {
+ ret = crypto_decrypt(&mech, &cipherdata, key, tmpl, &plaindata,
+ NULL);
+ if (ret != CRYPTO_SUCCESS) {
+ ASSERT3U(ret, ==, CRYPTO_INVALID_MAC);
+ ret = SET_ERROR(ECKSUM);
+ goto error;
+ }
+ }
+
+ return (0);
+
+error:
+ return (ret);
+}
+
+int
+zio_crypt_key_wrap(crypto_key_t *cwkey, zio_crypt_key_t *key, uint8_t *iv,
+ uint8_t *mac, uint8_t *keydata_out, uint8_t *hmac_keydata_out)
+{
+ int ret;
+ zfs_uio_t puio, cuio;
+ uint64_t aad[3];
+ iovec_t plain_iovecs[2], cipher_iovecs[3];
+ uint64_t crypt = key->zk_crypt;
+ uint_t enc_len, keydata_len, aad_len;
+
+ ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS);
+ ASSERT3U(cwkey->ck_format, ==, CRYPTO_KEY_RAW);
+
+ keydata_len = zio_crypt_table[crypt].ci_keylen;
+
+ /* generate iv for wrapping the master and hmac key */
+ ret = random_get_pseudo_bytes(iv, WRAPPING_IV_LEN);
+ if (ret != 0)
+ goto error;
+
+ /* initialize zfs_uio_ts */
+ plain_iovecs[0].iov_base = key->zk_master_keydata;
+ plain_iovecs[0].iov_len = keydata_len;
+ plain_iovecs[1].iov_base = key->zk_hmac_keydata;
+ plain_iovecs[1].iov_len = SHA512_HMAC_KEYLEN;
+
+ cipher_iovecs[0].iov_base = keydata_out;
+ cipher_iovecs[0].iov_len = keydata_len;
+ cipher_iovecs[1].iov_base = hmac_keydata_out;
+ cipher_iovecs[1].iov_len = SHA512_HMAC_KEYLEN;
+ cipher_iovecs[2].iov_base = mac;
+ cipher_iovecs[2].iov_len = WRAPPING_MAC_LEN;
+
+ /*
+ * Although we don't support writing to the old format, we do
+ * support rewrapping the key so that the user can move and
+ * quarantine datasets on the old format.
+ */
+ if (key->zk_version == 0) {
+ aad_len = sizeof (uint64_t);
+ aad[0] = LE_64(key->zk_guid);
+ } else {
+ ASSERT3U(key->zk_version, ==, ZIO_CRYPT_KEY_CURRENT_VERSION);
+ aad_len = sizeof (uint64_t) * 3;
+ aad[0] = LE_64(key->zk_guid);
+ aad[1] = LE_64(crypt);
+ aad[2] = LE_64(key->zk_version);
+ }
+
+ enc_len = zio_crypt_table[crypt].ci_keylen + SHA512_HMAC_KEYLEN;
+ puio.uio_iov = plain_iovecs;
+ puio.uio_iovcnt = 2;
+ puio.uio_segflg = UIO_SYSSPACE;
+ cuio.uio_iov = cipher_iovecs;
+ cuio.uio_iovcnt = 3;
+ cuio.uio_segflg = UIO_SYSSPACE;
+
+ /* encrypt the keys and store the resulting ciphertext and mac */
+ ret = zio_do_crypt_uio(B_TRUE, crypt, cwkey, NULL, iv, enc_len,
+ &puio, &cuio, (uint8_t *)aad, aad_len);
+ if (ret != 0)
+ goto error;
+
+ return (0);
+
+error:
+ return (ret);
+}
+
+int
+zio_crypt_key_unwrap(crypto_key_t *cwkey, uint64_t crypt, uint64_t version,
+ uint64_t guid, uint8_t *keydata, uint8_t *hmac_keydata, uint8_t *iv,
+ uint8_t *mac, zio_crypt_key_t *key)
+{
+ crypto_mechanism_t mech;
+ zfs_uio_t puio, cuio;
+ uint64_t aad[3];
+ iovec_t plain_iovecs[2], cipher_iovecs[3];
+ uint_t enc_len, keydata_len, aad_len;
+ int ret;
+
+ ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS);
+ ASSERT3U(cwkey->ck_format, ==, CRYPTO_KEY_RAW);
+
+ rw_init(&key->zk_salt_lock, NULL, RW_DEFAULT, NULL);
+
+ keydata_len = zio_crypt_table[crypt].ci_keylen;
+
+ /* initialize zfs_uio_ts */
+ plain_iovecs[0].iov_base = key->zk_master_keydata;
+ plain_iovecs[0].iov_len = keydata_len;
+ plain_iovecs[1].iov_base = key->zk_hmac_keydata;
+ plain_iovecs[1].iov_len = SHA512_HMAC_KEYLEN;
+
+ cipher_iovecs[0].iov_base = keydata;
+ cipher_iovecs[0].iov_len = keydata_len;
+ cipher_iovecs[1].iov_base = hmac_keydata;
+ cipher_iovecs[1].iov_len = SHA512_HMAC_KEYLEN;
+ cipher_iovecs[2].iov_base = mac;
+ cipher_iovecs[2].iov_len = WRAPPING_MAC_LEN;
+
+ if (version == 0) {
+ aad_len = sizeof (uint64_t);
+ aad[0] = LE_64(guid);
+ } else {
+ ASSERT3U(version, ==, ZIO_CRYPT_KEY_CURRENT_VERSION);
+ aad_len = sizeof (uint64_t) * 3;
+ aad[0] = LE_64(guid);
+ aad[1] = LE_64(crypt);
+ aad[2] = LE_64(version);
+ }
+
+ enc_len = keydata_len + SHA512_HMAC_KEYLEN;
+ puio.uio_iov = plain_iovecs;
+ puio.uio_segflg = UIO_SYSSPACE;
+ puio.uio_iovcnt = 2;
+ cuio.uio_iov = cipher_iovecs;
+ cuio.uio_iovcnt = 3;
+ cuio.uio_segflg = UIO_SYSSPACE;
+
+ /* decrypt the keys and store the result in the output buffers */
+ ret = zio_do_crypt_uio(B_FALSE, crypt, cwkey, NULL, iv, enc_len,
+ &puio, &cuio, (uint8_t *)aad, aad_len);
+ if (ret != 0)
+ goto error;
+
+ /* generate a fresh salt */
+ ret = random_get_bytes(key->zk_salt, ZIO_DATA_SALT_LEN);
+ if (ret != 0)
+ goto error;
+
+ /* derive the current key from the master key */
+ ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0,
+ key->zk_salt, ZIO_DATA_SALT_LEN, key->zk_current_keydata,
+ keydata_len);
+ if (ret != 0)
+ goto error;
+
+ /* initialize keys for ICP */
+ key->zk_current_key.ck_format = CRYPTO_KEY_RAW;
+ key->zk_current_key.ck_data = key->zk_current_keydata;
+ key->zk_current_key.ck_length = CRYPTO_BYTES2BITS(keydata_len);
+
+ key->zk_hmac_key.ck_format = CRYPTO_KEY_RAW;
+ key->zk_hmac_key.ck_data = key->zk_hmac_keydata;
+ key->zk_hmac_key.ck_length = CRYPTO_BYTES2BITS(SHA512_HMAC_KEYLEN);
+
+ /*
+ * Initialize the crypto templates. It's ok if this fails because
+ * this is just an optimization.
+ */
+ mech.cm_type = crypto_mech2id(zio_crypt_table[crypt].ci_mechname);
+ ret = crypto_create_ctx_template(&mech, &key->zk_current_key,
+ &key->zk_current_tmpl, KM_SLEEP);
+ if (ret != CRYPTO_SUCCESS)
+ key->zk_current_tmpl = NULL;
+
+ mech.cm_type = crypto_mech2id(SUN_CKM_SHA512_HMAC);
+ ret = crypto_create_ctx_template(&mech, &key->zk_hmac_key,
+ &key->zk_hmac_tmpl, KM_SLEEP);
+ if (ret != CRYPTO_SUCCESS)
+ key->zk_hmac_tmpl = NULL;
+
+ key->zk_crypt = crypt;
+ key->zk_version = version;
+ key->zk_guid = guid;
+ key->zk_salt_count = 0;
+
+ return (0);
+
+error:
+ zio_crypt_key_destroy(key);
+ return (ret);
+}
+
+int
+zio_crypt_generate_iv(uint8_t *ivbuf)
+{
+ int ret;
+
+ /* randomly generate the IV */
+ ret = random_get_pseudo_bytes(ivbuf, ZIO_DATA_IV_LEN);
+ if (ret != 0)
+ goto error;
+
+ return (0);
+
+error:
+ bzero(ivbuf, ZIO_DATA_IV_LEN);
+ return (ret);
+}
+
+int
+zio_crypt_do_hmac(zio_crypt_key_t *key, uint8_t *data, uint_t datalen,
+ uint8_t *digestbuf, uint_t digestlen)
+{
+ int ret;
+ crypto_mechanism_t mech;
+ crypto_data_t in_data, digest_data;
+ uint8_t raw_digestbuf[SHA512_DIGEST_LENGTH];
+
+ ASSERT3U(digestlen, <=, SHA512_DIGEST_LENGTH);
+
+ /* initialize sha512-hmac mechanism and crypto data */
+ mech.cm_type = crypto_mech2id(SUN_CKM_SHA512_HMAC);
+ mech.cm_param = NULL;
+ mech.cm_param_len = 0;
+
+ /* initialize the crypto data */
+ in_data.cd_format = CRYPTO_DATA_RAW;
+ in_data.cd_offset = 0;
+ in_data.cd_length = datalen;
+ in_data.cd_raw.iov_base = (char *)data;
+ in_data.cd_raw.iov_len = in_data.cd_length;
+
+ digest_data.cd_format = CRYPTO_DATA_RAW;
+ digest_data.cd_offset = 0;
+ digest_data.cd_length = SHA512_DIGEST_LENGTH;
+ digest_data.cd_raw.iov_base = (char *)raw_digestbuf;
+ digest_data.cd_raw.iov_len = digest_data.cd_length;
+
+ /* generate the hmac */
+ ret = crypto_mac(&mech, &in_data, &key->zk_hmac_key, key->zk_hmac_tmpl,
+ &digest_data, NULL);
+ if (ret != CRYPTO_SUCCESS) {
+ ret = SET_ERROR(EIO);
+ goto error;
+ }
+
+ bcopy(raw_digestbuf, digestbuf, digestlen);
+
+ return (0);
+
+error:
+ bzero(digestbuf, digestlen);
+ return (ret);
+}
+
+int
+zio_crypt_generate_iv_salt_dedup(zio_crypt_key_t *key, uint8_t *data,
+ uint_t datalen, uint8_t *ivbuf, uint8_t *salt)
+{
+ int ret;
+ uint8_t digestbuf[SHA512_DIGEST_LENGTH];
+
+ ret = zio_crypt_do_hmac(key, data, datalen,
+ digestbuf, SHA512_DIGEST_LENGTH);
+ if (ret != 0)
+ return (ret);
+
+ bcopy(digestbuf, salt, ZIO_DATA_SALT_LEN);
+ bcopy(digestbuf + ZIO_DATA_SALT_LEN, ivbuf, ZIO_DATA_IV_LEN);
+
+ return (0);
+}
+
+/*
+ * The following functions are used to encode and decode encryption parameters
+ * into blkptr_t and zil_header_t. The ICP wants to use these parameters as
+ * byte strings, which normally means that these strings would not need to deal
+ * with byteswapping at all. However, both blkptr_t and zil_header_t may be
+ * byteswapped by lower layers and so we must "undo" that byteswap here upon
+ * decoding and encoding in a non-native byteorder. These functions require
+ * that the byteorder bit is correct before being called.
+ */
+void
+zio_crypt_encode_params_bp(blkptr_t *bp, uint8_t *salt, uint8_t *iv)
+{
+ uint64_t val64;
+ uint32_t val32;
+
+ ASSERT(BP_IS_ENCRYPTED(bp));
+
+ if (!BP_SHOULD_BYTESWAP(bp)) {
+ bcopy(salt, &bp->blk_dva[2].dva_word[0], sizeof (uint64_t));
+ bcopy(iv, &bp->blk_dva[2].dva_word[1], sizeof (uint64_t));
+ bcopy(iv + sizeof (uint64_t), &val32, sizeof (uint32_t));
+ BP_SET_IV2(bp, val32);
+ } else {
+ bcopy(salt, &val64, sizeof (uint64_t));
+ bp->blk_dva[2].dva_word[0] = BSWAP_64(val64);
+
+ bcopy(iv, &val64, sizeof (uint64_t));
+ bp->blk_dva[2].dva_word[1] = BSWAP_64(val64);
+
+ bcopy(iv + sizeof (uint64_t), &val32, sizeof (uint32_t));
+ BP_SET_IV2(bp, BSWAP_32(val32));
+ }
+}
+
+void
+zio_crypt_decode_params_bp(const blkptr_t *bp, uint8_t *salt, uint8_t *iv)
+{
+ uint64_t val64;
+ uint32_t val32;
+
+ ASSERT(BP_IS_PROTECTED(bp));
+
+ /* for convenience, so callers don't need to check */
+ if (BP_IS_AUTHENTICATED(bp)) {
+ bzero(salt, ZIO_DATA_SALT_LEN);
+ bzero(iv, ZIO_DATA_IV_LEN);
+ return;
+ }
+
+ if (!BP_SHOULD_BYTESWAP(bp)) {
+ bcopy(&bp->blk_dva[2].dva_word[0], salt, sizeof (uint64_t));
+ bcopy(&bp->blk_dva[2].dva_word[1], iv, sizeof (uint64_t));
+
+ val32 = (uint32_t)BP_GET_IV2(bp);
+ bcopy(&val32, iv + sizeof (uint64_t), sizeof (uint32_t));
+ } else {
+ val64 = BSWAP_64(bp->blk_dva[2].dva_word[0]);
+ bcopy(&val64, salt, sizeof (uint64_t));
+
+ val64 = BSWAP_64(bp->blk_dva[2].dva_word[1]);
+ bcopy(&val64, iv, sizeof (uint64_t));
+
+ val32 = BSWAP_32((uint32_t)BP_GET_IV2(bp));
+ bcopy(&val32, iv + sizeof (uint64_t), sizeof (uint32_t));
+ }
+}
+
+void
+zio_crypt_encode_mac_bp(blkptr_t *bp, uint8_t *mac)
+{
+ uint64_t val64;
+
+ ASSERT(BP_USES_CRYPT(bp));
+ ASSERT3U(BP_GET_TYPE(bp), !=, DMU_OT_OBJSET);
+
+ if (!BP_SHOULD_BYTESWAP(bp)) {
+ bcopy(mac, &bp->blk_cksum.zc_word[2], sizeof (uint64_t));
+ bcopy(mac + sizeof (uint64_t), &bp->blk_cksum.zc_word[3],
+ sizeof (uint64_t));
+ } else {
+ bcopy(mac, &val64, sizeof (uint64_t));
+ bp->blk_cksum.zc_word[2] = BSWAP_64(val64);
+
+ bcopy(mac + sizeof (uint64_t), &val64, sizeof (uint64_t));
+ bp->blk_cksum.zc_word[3] = BSWAP_64(val64);
+ }
+}
+
+void
+zio_crypt_decode_mac_bp(const blkptr_t *bp, uint8_t *mac)
+{
+ uint64_t val64;
+
+ ASSERT(BP_USES_CRYPT(bp) || BP_IS_HOLE(bp));
+
+ /* for convenience, so callers don't need to check */
+ if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
+ bzero(mac, ZIO_DATA_MAC_LEN);
+ return;
+ }
+
+ if (!BP_SHOULD_BYTESWAP(bp)) {
+ bcopy(&bp->blk_cksum.zc_word[2], mac, sizeof (uint64_t));
+ bcopy(&bp->blk_cksum.zc_word[3], mac + sizeof (uint64_t),
+ sizeof (uint64_t));
+ } else {
+ val64 = BSWAP_64(bp->blk_cksum.zc_word[2]);
+ bcopy(&val64, mac, sizeof (uint64_t));
+
+ val64 = BSWAP_64(bp->blk_cksum.zc_word[3]);
+ bcopy(&val64, mac + sizeof (uint64_t), sizeof (uint64_t));
+ }
+}
+
+void
+zio_crypt_encode_mac_zil(void *data, uint8_t *mac)
+{
+ zil_chain_t *zilc = data;
+
+ bcopy(mac, &zilc->zc_eck.zec_cksum.zc_word[2], sizeof (uint64_t));
+ bcopy(mac + sizeof (uint64_t), &zilc->zc_eck.zec_cksum.zc_word[3],
+ sizeof (uint64_t));
+}
+
+void
+zio_crypt_decode_mac_zil(const void *data, uint8_t *mac)
+{
+ /*
+ * The ZIL MAC is embedded in the block it protects, which will
+ * not have been byteswapped by the time this function has been called.
+ * As a result, we don't need to worry about byteswapping the MAC.
+ */
+ const zil_chain_t *zilc = data;
+
+ bcopy(&zilc->zc_eck.zec_cksum.zc_word[2], mac, sizeof (uint64_t));
+ bcopy(&zilc->zc_eck.zec_cksum.zc_word[3], mac + sizeof (uint64_t),
+ sizeof (uint64_t));
+}
+
+/*
+ * This routine takes a block of dnodes (src_abd) and copies only the bonus
+ * buffers to the same offsets in the dst buffer. datalen should be the size
+ * of both the src_abd and the dst buffer (not just the length of the bonus
+ * buffers).
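+ *
+ * The block is walked in dnode slots: the loop index advances by
+ * dn_extra_slots + 1, so e.g. a 1K dnode occupying two 512 byte slots
+ * is visited once and its extra slot is skipped.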
+ */
+void
+zio_crypt_copy_dnode_bonus(abd_t *src_abd, uint8_t *dst, uint_t datalen)
+{
+ uint_t i, max_dnp = datalen >> DNODE_SHIFT;
+ uint8_t *src;
+ dnode_phys_t *dnp, *sdnp, *ddnp;
+
+ src = abd_borrow_buf_copy(src_abd, datalen);
+
+ sdnp = (dnode_phys_t *)src;
+ ddnp = (dnode_phys_t *)dst;
+
+ for (i = 0; i < max_dnp; i += sdnp[i].dn_extra_slots + 1) {
+ dnp = &sdnp[i];
+ if (dnp->dn_type != DMU_OT_NONE &&
+ DMU_OT_IS_ENCRYPTED(dnp->dn_bonustype) &&
+ dnp->dn_bonuslen != 0) {
+ bcopy(DN_BONUS(dnp), DN_BONUS(&ddnp[i]),
+ DN_MAX_BONUS_LEN(dnp));
+ }
+ }
+
+ abd_return_buf(src_abd, src, datalen);
+}
+
+/*
+ * This function decides which fields from blk_prop are included in
+ * the various on-disk MACs by zeroing out the non-portable fields.
+ */
+static void
+zio_crypt_bp_zero_nonportable_blkprop(blkptr_t *bp, uint64_t version)
+{
+ /*
+ * Version 0 did not properly zero out all non-portable fields
+ * as it should have done. We maintain this code so that we can
+ * do read-only imports of pools on this version.
+ */
+ if (version == 0) {
+ BP_SET_DEDUP(bp, 0);
+ BP_SET_CHECKSUM(bp, 0);
+ BP_SET_PSIZE(bp, SPA_MINBLOCKSIZE);
+ return;
+ }
+
+ ASSERT3U(version, ==, ZIO_CRYPT_KEY_CURRENT_VERSION);
+
+ /*
+ * The hole_birth feature might set these fields even if this bp
+ * is a hole. We zero them out here to guarantee that raw sends
+ * will function with or without the feature.
+ */
+ if (BP_IS_HOLE(bp)) {
+ bp->blk_prop = 0ULL;
+ return;
+ }
+
+ /*
+ * At L0 we want to verify these fields to ensure that data blocks
+ * can not be reinterpreted. For instance, we do not want an attacker
+ * to trick us into returning raw lz4 compressed data to the user
+ * by modifying the compression bits. At higher levels, we cannot
+ * enforce this policy since raw sends do not convey any information
+ * about indirect blocks, so these values might be different on the
+ * receive side. Fortunately, this does not open any new attack
+ * vectors, since any alterations that can be made to a higher level
+ * bp must still verify the correct order of the layer below it.
+ */
+ if (BP_GET_LEVEL(bp) != 0) {
+ BP_SET_BYTEORDER(bp, 0);
+ BP_SET_COMPRESS(bp, 0);
+
+ /*
+ * psize cannot be set to zero or it will trigger
+ * asserts, but the value doesn't really matter as
+ * long as it is constant.
+ */
+ BP_SET_PSIZE(bp, SPA_MINBLOCKSIZE);
+ }
+
+ BP_SET_DEDUP(bp, 0);
+ BP_SET_CHECKSUM(bp, 0);
+}
+
+static void
+zio_crypt_bp_auth_init(uint64_t version, boolean_t should_bswap, blkptr_t *bp,
+ blkptr_auth_buf_t *bab, uint_t *bab_len)
+{
+ blkptr_t tmpbp = *bp;
+
+ if (should_bswap)
+ byteswap_uint64_array(&tmpbp, sizeof (blkptr_t));
+
+ ASSERT(BP_USES_CRYPT(&tmpbp) || BP_IS_HOLE(&tmpbp));
+ ASSERT0(BP_IS_EMBEDDED(&tmpbp));
+
+ zio_crypt_decode_mac_bp(&tmpbp, bab->bab_mac);
+
+ /*
+ * We always MAC blk_prop in LE to ensure portability. This
+ * must be done after decoding the MAC, since decoding checks
+ * the byteorder bit of blk_prop, which may be zeroed out below.
+ */
+ zio_crypt_bp_zero_nonportable_blkprop(&tmpbp, version);
+ bab->bab_prop = LE_64(tmpbp.blk_prop);
+ bab->bab_pad = 0ULL;
+
+ /* version 0 did not include the padding */
+ *bab_len = sizeof (blkptr_auth_buf_t);
+ if (version == 0)
+ *bab_len -= sizeof (uint64_t);
+}
+
+static int
+zio_crypt_bp_do_hmac_updates(crypto_context_t ctx, uint64_t version,
+ boolean_t should_bswap, blkptr_t *bp)
+{
+ int ret;
+ uint_t bab_len;
+ blkptr_auth_buf_t bab;
+ crypto_data_t cd;
+
+ zio_crypt_bp_auth_init(version, should_bswap, bp, &bab, &bab_len);
+ cd.cd_format = CRYPTO_DATA_RAW;
+ cd.cd_offset = 0;
+ cd.cd_length = bab_len;
+ cd.cd_raw.iov_base = (char *)&bab;
+ cd.cd_raw.iov_len = cd.cd_length;
+
+ ret = crypto_mac_update(ctx, &cd, NULL);
+ if (ret != CRYPTO_SUCCESS) {
+ ret = SET_ERROR(EIO);
+ goto error;
+ }
+
+ return (0);
+
+error:
+ return (ret);
+}
+
+static void
+zio_crypt_bp_do_indrect_checksum_updates(SHA2_CTX *ctx, uint64_t version,
+ boolean_t should_bswap, blkptr_t *bp)
+{
+ uint_t bab_len;
+ blkptr_auth_buf_t bab;
+
+ zio_crypt_bp_auth_init(version, should_bswap, bp, &bab, &bab_len);
+ SHA2Update(ctx, &bab, bab_len);
+}
+
+static void
+zio_crypt_bp_do_aad_updates(uint8_t **aadp, uint_t *aad_len, uint64_t version,
+ boolean_t should_bswap, blkptr_t *bp)
+{
+ uint_t bab_len;
+ blkptr_auth_buf_t bab;
+
+ zio_crypt_bp_auth_init(version, should_bswap, bp, &bab, &bab_len);
+ bcopy(&bab, *aadp, bab_len);
+ *aadp += bab_len;
+ *aad_len += bab_len;
+}
+
+static int
+zio_crypt_do_dnode_hmac_updates(crypto_context_t ctx, uint64_t version,
+ boolean_t should_bswap, dnode_phys_t *dnp)
+{
+ int ret, i;
+ dnode_phys_t *adnp;
+ boolean_t le_bswap = (should_bswap == ZFS_HOST_BYTEORDER);
+ crypto_data_t cd;
+ uint8_t tmp_dncore[offsetof(dnode_phys_t, dn_blkptr)];
+
+ cd.cd_format = CRYPTO_DATA_RAW;
+ cd.cd_offset = 0;
+
+ /* authenticate the core dnode (masking out non-portable bits) */
+ bcopy(dnp, tmp_dncore, sizeof (tmp_dncore));
+ adnp = (dnode_phys_t *)tmp_dncore;
+ if (le_bswap) {
+ adnp->dn_datablkszsec = BSWAP_16(adnp->dn_datablkszsec);
+ adnp->dn_bonuslen = BSWAP_16(adnp->dn_bonuslen);
+ adnp->dn_maxblkid = BSWAP_64(adnp->dn_maxblkid);
+ adnp->dn_used = BSWAP_64(adnp->dn_used);
+ }
+ adnp->dn_flags &= DNODE_CRYPT_PORTABLE_FLAGS_MASK;
+ adnp->dn_used = 0;
+
+ cd.cd_length = sizeof (tmp_dncore);
+ cd.cd_raw.iov_base = (char *)adnp;
+ cd.cd_raw.iov_len = cd.cd_length;
+
+ ret = crypto_mac_update(ctx, &cd, NULL);
+ if (ret != CRYPTO_SUCCESS) {
+ ret = SET_ERROR(EIO);
+ goto error;
+ }
+
+ for (i = 0; i < dnp->dn_nblkptr; i++) {
+ ret = zio_crypt_bp_do_hmac_updates(ctx, version,
+ should_bswap, &dnp->dn_blkptr[i]);
+ if (ret != 0)
+ goto error;
+ }
+
+ if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
+ ret = zio_crypt_bp_do_hmac_updates(ctx, version,
+ should_bswap, DN_SPILL_BLKPTR(dnp));
+ if (ret != 0)
+ goto error;
+ }
+
+ return (0);
+
+error:
+ return (ret);
+}
+
+/*
+ * objset_phys_t blocks introduce a number of exceptions to the normal
+ * authentication process. objset_phys_t's contain 2 separate HMACs for
+ * protecting the integrity of their data. The portable_mac protects the
+ * metadnode. This MAC can be sent with a raw send and protects against
+ * reordering of data within the metadnode. The local_mac protects the user
+ * accounting objects which are not sent from one system to another.
+ *
+ * In addition, objset blocks are the only blocks that can be modified and
+ * written to disk without the key loaded under certain circumstances. During
+ * zil_claim() we need to be able to update the zil_header_t to complete
+ * claiming log blocks and during raw receives we need to write out the
+ * portable_mac from the send file. Both of these actions are possible
+ * because these fields are not protected by either MAC so neither one will
+ * need to modify the MACs without the key. However, when the modified blocks
+ * are written out they will be byteswapped into the host machine's native
+ * endianness which will modify fields protected by the MAC. As a result, MAC
+ * calculation for objset blocks works slightly differently from other block
+ * types. Where other block types MAC the data in whatever endianness it is
+ * written to disk in, objset blocks always MAC the little endian version of
+ * their values. In the code, should_bswap is the value from BP_SHOULD_BYTESWAP()
+ * and le_bswap indicates whether a byteswap is needed to get this block
+ * into little endian format.
+ */
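+/*
+ * Concretely, the portable MAC below covers os_type, the portable bits
+ * of os_flags and the metadnode, while the local MAC covers the
+ * non-portable bits of os_flags plus the userused, groupused and (when
+ * present) projectused dnodes.
+ */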
+int
+zio_crypt_do_objset_hmacs(zio_crypt_key_t *key, void *data, uint_t datalen,
+ boolean_t should_bswap, uint8_t *portable_mac, uint8_t *local_mac)
+{
+ int ret;
+ crypto_mechanism_t mech;
+ crypto_context_t ctx;
+ crypto_data_t cd;
+ objset_phys_t *osp = data;
+ uint64_t intval;
+ boolean_t le_bswap = (should_bswap == ZFS_HOST_BYTEORDER);
+ uint8_t raw_portable_mac[SHA512_DIGEST_LENGTH];
+ uint8_t raw_local_mac[SHA512_DIGEST_LENGTH];
+
+ /* initialize HMAC mechanism */
+ mech.cm_type = crypto_mech2id(SUN_CKM_SHA512_HMAC);
+ mech.cm_param = NULL;
+ mech.cm_param_len = 0;
+
+ cd.cd_format = CRYPTO_DATA_RAW;
+ cd.cd_offset = 0;
+
+ /* calculate the portable MAC from the portable fields and metadnode */
+ ret = crypto_mac_init(&mech, &key->zk_hmac_key, NULL, &ctx, NULL);
+ if (ret != CRYPTO_SUCCESS) {
+ ret = SET_ERROR(EIO);
+ goto error;
+ }
+
+ /* add in the os_type */
+ intval = (le_bswap) ? osp->os_type : BSWAP_64(osp->os_type);
+ cd.cd_length = sizeof (uint64_t);
+ cd.cd_raw.iov_base = (char *)&intval;
+ cd.cd_raw.iov_len = cd.cd_length;
+
+ ret = crypto_mac_update(ctx, &cd, NULL);
+ if (ret != CRYPTO_SUCCESS) {
+ ret = SET_ERROR(EIO);
+ goto error;
+ }
+
+ /* add in the portable os_flags */
+ intval = osp->os_flags;
+ if (should_bswap)
+ intval = BSWAP_64(intval);
+ intval &= OBJSET_CRYPT_PORTABLE_FLAGS_MASK;
+ if (!ZFS_HOST_BYTEORDER)
+ intval = BSWAP_64(intval);
+
+ cd.cd_length = sizeof (uint64_t);
+ cd.cd_raw.iov_base = (char *)&intval;
+ cd.cd_raw.iov_len = cd.cd_length;
+
+ ret = crypto_mac_update(ctx, &cd, NULL);
+ if (ret != CRYPTO_SUCCESS) {
+ ret = SET_ERROR(EIO);
+ goto error;
+ }
+
+ /* add in fields from the metadnode */
+ ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version,
+ should_bswap, &osp->os_meta_dnode);
+ if (ret)
+ goto error;
+
+ /* store the final digest in a temporary buffer and copy what we need */
+ cd.cd_length = SHA512_DIGEST_LENGTH;
+ cd.cd_raw.iov_base = (char *)raw_portable_mac;
+ cd.cd_raw.iov_len = cd.cd_length;
+
+ ret = crypto_mac_final(ctx, &cd, NULL);
+ if (ret != CRYPTO_SUCCESS) {
+ ret = SET_ERROR(EIO);
+ goto error;
+ }
+
+ bcopy(raw_portable_mac, portable_mac, ZIO_OBJSET_MAC_LEN);
+
+ /*
+ * This is necessary here as we check next whether
+ * OBJSET_FLAG_USERACCOUNTING_COMPLETE or
+ * OBJSET_FLAG_USEROBJACCOUNTING_COMPLETE are set in order to
+ * decide if the local_mac should be zeroed out.
+ */
+ intval = osp->os_flags;
+ if (should_bswap)
+ intval = BSWAP_64(intval);
+
+ /*
+ * The local MAC protects the user, group and project accounting.
+ * If these objects are not present, the local MAC is zeroed out.
+ */
+ if ((datalen >= OBJSET_PHYS_SIZE_V3 &&
+ osp->os_userused_dnode.dn_type == DMU_OT_NONE &&
+ osp->os_groupused_dnode.dn_type == DMU_OT_NONE &&
+ osp->os_projectused_dnode.dn_type == DMU_OT_NONE) ||
+ (datalen >= OBJSET_PHYS_SIZE_V2 &&
+ osp->os_userused_dnode.dn_type == DMU_OT_NONE &&
+ osp->os_groupused_dnode.dn_type == DMU_OT_NONE) ||
+ (datalen <= OBJSET_PHYS_SIZE_V1) ||
+ (((intval & OBJSET_FLAG_USERACCOUNTING_COMPLETE) == 0 ||
+ (intval & OBJSET_FLAG_USEROBJACCOUNTING_COMPLETE) == 0) &&
+ key->zk_version > 0)) {
+ bzero(local_mac, ZIO_OBJSET_MAC_LEN);
+ return (0);
+ }
+
+ /* calculate the local MAC from the userused and groupused dnodes */
+ ret = crypto_mac_init(&mech, &key->zk_hmac_key, NULL, &ctx, NULL);
+ if (ret != CRYPTO_SUCCESS) {
+ ret = SET_ERROR(EIO);
+ goto error;
+ }
+
+ /* add in the non-portable os_flags */
+ intval = osp->os_flags;
+ if (should_bswap)
+ intval = BSWAP_64(intval);
+ intval &= ~OBJSET_CRYPT_PORTABLE_FLAGS_MASK;
+ if (!ZFS_HOST_BYTEORDER)
+ intval = BSWAP_64(intval);
+
+ cd.cd_length = sizeof (uint64_t);
+ cd.cd_raw.iov_base = (char *)&intval;
+ cd.cd_raw.iov_len = cd.cd_length;
+
+ ret = crypto_mac_update(ctx, &cd, NULL);
+ if (ret != CRYPTO_SUCCESS) {
+ ret = SET_ERROR(EIO);
+ goto error;
+ }
+
+ /* add in fields from the user accounting dnodes */
+ if (osp->os_userused_dnode.dn_type != DMU_OT_NONE) {
+ ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version,
+ should_bswap, &osp->os_userused_dnode);
+ if (ret)
+ goto error;
+ }
+
+ if (osp->os_groupused_dnode.dn_type != DMU_OT_NONE) {
+ ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version,
+ should_bswap, &osp->os_groupused_dnode);
+ if (ret)
+ goto error;
+ }
+
+ if (osp->os_projectused_dnode.dn_type != DMU_OT_NONE &&
+ datalen >= OBJSET_PHYS_SIZE_V3) {
+ ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version,
+ should_bswap, &osp->os_projectused_dnode);
+ if (ret)
+ goto error;
+ }
+
+ /* store the final digest in a temporary buffer and copy what we need */
+ cd.cd_length = SHA512_DIGEST_LENGTH;
+ cd.cd_raw.iov_base = (char *)raw_local_mac;
+ cd.cd_raw.iov_len = cd.cd_length;
+
+ ret = crypto_mac_final(ctx, &cd, NULL);
+ if (ret != CRYPTO_SUCCESS) {
+ ret = SET_ERROR(EIO);
+ goto error;
+ }
+
+ bcopy(raw_local_mac, local_mac, ZIO_OBJSET_MAC_LEN);
+
+ return (0);
+
+error:
+ bzero(portable_mac, ZIO_OBJSET_MAC_LEN);
+ bzero(local_mac, ZIO_OBJSET_MAC_LEN);
+ return (ret);
+}
+
+static void
+zio_crypt_destroy_uio(zfs_uio_t *uio)
+{
+ if (uio->uio_iov)
+ kmem_free(uio->uio_iov, uio->uio_iovcnt * sizeof (iovec_t));
+}
+
+/*
+ * This function parses an uncompressed indirect block and returns a checksum
+ * of all the portable fields from all of the contained bps. The portable
+ * fields are the MAC and all of the fields from blk_prop except for the dedup,
+ * checksum, and psize bits. For an explanation of the purpose of this, see
+ * the comment block on object set authentication.
+ */
+static int
+zio_crypt_do_indirect_mac_checksum_impl(boolean_t generate, void *buf,
+ uint_t datalen, uint64_t version, boolean_t byteswap, uint8_t *cksum)
+{
+ blkptr_t *bp;
+ int i, epb = datalen >> SPA_BLKPTRSHIFT;
+ SHA2_CTX ctx;
+ uint8_t digestbuf[SHA512_DIGEST_LENGTH];
+
+ /* checksum all of the MACs from the layer below */
+ SHA2Init(SHA512, &ctx);
+ for (i = 0, bp = buf; i < epb; i++, bp++) {
+ zio_crypt_bp_do_indrect_checksum_updates(&ctx, version,
+ byteswap, bp);
+ }
+ SHA2Final(digestbuf, &ctx);
+
+ if (generate) {
+ bcopy(digestbuf, cksum, ZIO_DATA_MAC_LEN);
+ return (0);
+ }
+
+ if (bcmp(digestbuf, cksum, ZIO_DATA_MAC_LEN) != 0)
+ return (SET_ERROR(ECKSUM));
+
+ return (0);
+}
+
+int
+zio_crypt_do_indirect_mac_checksum(boolean_t generate, void *buf,
+ uint_t datalen, boolean_t byteswap, uint8_t *cksum)
+{
+ int ret;
+
+ /*
+ * Unfortunately, callers of this function will not always have
+ * easy access to the on-disk format version. This info is
+ * normally found in the DSL Crypto Key, but the checksum-of-MACs
+ * is expected to be verifiable even when the key isn't loaded.
+ * Here, instead of doing a ZAP lookup for the version for each
+ * zio, we simply try both existing formats.
+ */
+ ret = zio_crypt_do_indirect_mac_checksum_impl(generate, buf,
+ datalen, ZIO_CRYPT_KEY_CURRENT_VERSION, byteswap, cksum);
+ if (ret == ECKSUM) {
+ ASSERT(!generate);
+ ret = zio_crypt_do_indirect_mac_checksum_impl(generate,
+ buf, datalen, 0, byteswap, cksum);
+ }
+
+ return (ret);
+}
+
+int
+zio_crypt_do_indirect_mac_checksum_abd(boolean_t generate, abd_t *abd,
+ uint_t datalen, boolean_t byteswap, uint8_t *cksum)
+{
+ int ret;
+ void *buf;
+
+ buf = abd_borrow_buf_copy(abd, datalen);
+ ret = zio_crypt_do_indirect_mac_checksum(generate, buf, datalen,
+ byteswap, cksum);
+ abd_return_buf(abd, buf, datalen);
+
+ return (ret);
+}
+
+/*
+ * Special case handling routine for encrypting / decrypting ZIL blocks.
+ * We do not check for the older ZIL chain because the encryption feature
+ * was not available before the newer ZIL chain was introduced. The goal
+ * here is to encrypt everything except the blkptr_t of a lr_write_t and
+ * the zil_chain_t header. Everything that is not encrypted is authenticated.
+ */
+static int
+zio_crypt_init_uios_zil(boolean_t encrypt, uint8_t *plainbuf,
+ uint8_t *cipherbuf, uint_t datalen, boolean_t byteswap, zfs_uio_t *puio,
+ zfs_uio_t *cuio, uint_t *enc_len, uint8_t **authbuf, uint_t *auth_len,
+ boolean_t *no_crypt)
+{
+ int ret;
+ uint64_t txtype, lr_len;
+ uint_t nr_src, nr_dst, crypt_len;
+ uint_t aad_len = 0, nr_iovecs = 0, total_len = 0;
+ iovec_t *src_iovecs = NULL, *dst_iovecs = NULL;
+ uint8_t *src, *dst, *slrp, *dlrp, *blkend, *aadp;
+ zil_chain_t *zilc;
+ lr_t *lr;
+ uint8_t *aadbuf = zio_buf_alloc(datalen);
+
+ /* cipherbuf always needs an extra iovec for the MAC */
+ if (encrypt) {
+ src = plainbuf;
+ dst = cipherbuf;
+ nr_src = 0;
+ nr_dst = 1;
+ } else {
+ src = cipherbuf;
+ dst = plainbuf;
+ nr_src = 1;
+ nr_dst = 0;
+ }
+
+ /* find the start and end record of the log block */
+ zilc = (zil_chain_t *)src;
+ slrp = src + sizeof (zil_chain_t);
+ aadp = aadbuf;
+ blkend = src + ((byteswap) ? BSWAP_64(zilc->zc_nused) : zilc->zc_nused);
+
+ /* calculate the number of encrypted iovecs we will need */
+ for (; slrp < blkend; slrp += lr_len) {
+ lr = (lr_t *)slrp;
+
+ if (!byteswap) {
+ txtype = lr->lrc_txtype;
+ lr_len = lr->lrc_reclen;
+ } else {
+ txtype = BSWAP_64(lr->lrc_txtype);
+ lr_len = BSWAP_64(lr->lrc_reclen);
+ }
+
+ nr_iovecs++;
+ if (txtype == TX_WRITE && lr_len != sizeof (lr_write_t))
+ nr_iovecs++;
+ }
+
+ nr_src += nr_iovecs;
+ nr_dst += nr_iovecs;
+
+ /* allocate the iovec arrays */
+ if (nr_src != 0) {
+ src_iovecs = kmem_alloc(nr_src * sizeof (iovec_t), KM_SLEEP);
+ if (src_iovecs == NULL) {
+ ret = SET_ERROR(ENOMEM);
+ goto error;
+ }
+ }
+
+ if (nr_dst != 0) {
+ dst_iovecs = kmem_alloc(nr_dst * sizeof (iovec_t), KM_SLEEP);
+ if (dst_iovecs == NULL) {
+ ret = SET_ERROR(ENOMEM);
+ goto error;
+ }
+ }
+
+ /*
+ * Copy the plain zil header over and authenticate everything except
+ * the checksum that will store our MAC. If we are writing the data
+ * the embedded checksum will not have been calculated yet, so we don't
+ * authenticate that.
+ */
+ bcopy(src, dst, sizeof (zil_chain_t));
+ bcopy(src, aadp, sizeof (zil_chain_t) - sizeof (zio_eck_t));
+ aadp += sizeof (zil_chain_t) - sizeof (zio_eck_t);
+ aad_len += sizeof (zil_chain_t) - sizeof (zio_eck_t);
+
+ /* loop over records again, filling in iovecs */
+ nr_iovecs = 0;
+ slrp = src + sizeof (zil_chain_t);
+ dlrp = dst + sizeof (zil_chain_t);
+
+ for (; slrp < blkend; slrp += lr_len, dlrp += lr_len) {
+ lr = (lr_t *)slrp;
+
+ if (!byteswap) {
+ txtype = lr->lrc_txtype;
+ lr_len = lr->lrc_reclen;
+ } else {
+ txtype = BSWAP_64(lr->lrc_txtype);
+ lr_len = BSWAP_64(lr->lrc_reclen);
+ }
+
+ /* copy the common lr_t */
+ bcopy(slrp, dlrp, sizeof (lr_t));
+ bcopy(slrp, aadp, sizeof (lr_t));
+ aadp += sizeof (lr_t);
+ aad_len += sizeof (lr_t);
+
+ ASSERT3P(src_iovecs, !=, NULL);
+ ASSERT3P(dst_iovecs, !=, NULL);
+
+ /*
+ * If this is a TX_WRITE record we want to encrypt everything
+ * except the bp, if it exists. If the bp does exist we want to
+ * authenticate it.
+ */
+ if (txtype == TX_WRITE) {
+ crypt_len = sizeof (lr_write_t) -
+ sizeof (lr_t) - sizeof (blkptr_t);
+ src_iovecs[nr_iovecs].iov_base = slrp + sizeof (lr_t);
+ src_iovecs[nr_iovecs].iov_len = crypt_len;
+ dst_iovecs[nr_iovecs].iov_base = dlrp + sizeof (lr_t);
+ dst_iovecs[nr_iovecs].iov_len = crypt_len;
+
+ /* copy the bp now since it will not be encrypted */
+ bcopy(slrp + sizeof (lr_write_t) - sizeof (blkptr_t),
+ dlrp + sizeof (lr_write_t) - sizeof (blkptr_t),
+ sizeof (blkptr_t));
+ bcopy(slrp + sizeof (lr_write_t) - sizeof (blkptr_t),
+ aadp, sizeof (blkptr_t));
+ aadp += sizeof (blkptr_t);
+ aad_len += sizeof (blkptr_t);
+ nr_iovecs++;
+ total_len += crypt_len;
+
+ if (lr_len != sizeof (lr_write_t)) {
+ crypt_len = lr_len - sizeof (lr_write_t);
+ src_iovecs[nr_iovecs].iov_base =
+ slrp + sizeof (lr_write_t);
+ src_iovecs[nr_iovecs].iov_len = crypt_len;
+ dst_iovecs[nr_iovecs].iov_base =
+ dlrp + sizeof (lr_write_t);
+ dst_iovecs[nr_iovecs].iov_len = crypt_len;
+ nr_iovecs++;
+ total_len += crypt_len;
+ }
+ } else {
+ crypt_len = lr_len - sizeof (lr_t);
+ src_iovecs[nr_iovecs].iov_base = slrp + sizeof (lr_t);
+ src_iovecs[nr_iovecs].iov_len = crypt_len;
+ dst_iovecs[nr_iovecs].iov_base = dlrp + sizeof (lr_t);
+ dst_iovecs[nr_iovecs].iov_len = crypt_len;
+ nr_iovecs++;
+ total_len += crypt_len;
+ }
+ }
+
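+ /* with no log records there is nothing to encrypt, only authenticate */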
+ *no_crypt = (nr_iovecs == 0);
+ *enc_len = total_len;
+ *authbuf = aadbuf;
+ *auth_len = aad_len;
+
+ if (encrypt) {
+ puio->uio_iov = src_iovecs;
+ puio->uio_iovcnt = nr_src;
+ cuio->uio_iov = dst_iovecs;
+ cuio->uio_iovcnt = nr_dst;
+ } else {
+ puio->uio_iov = dst_iovecs;
+ puio->uio_iovcnt = nr_dst;
+ cuio->uio_iov = src_iovecs;
+ cuio->uio_iovcnt = nr_src;
+ }
+
+ return (0);
+
+error:
+ zio_buf_free(aadbuf, datalen);
+ if (src_iovecs != NULL)
+ kmem_free(src_iovecs, nr_src * sizeof (iovec_t));
+ if (dst_iovecs != NULL)
+ kmem_free(dst_iovecs, nr_dst * sizeof (iovec_t));
+
+ *enc_len = 0;
+ *authbuf = NULL;
+ *auth_len = 0;
+ *no_crypt = B_FALSE;
+ puio->uio_iov = NULL;
+ puio->uio_iovcnt = 0;
+ cuio->uio_iov = NULL;
+ cuio->uio_iovcnt = 0;
+ return (ret);
+}
+
+/*
+ * Special case handling routine for encrypting / decrypting dnode blocks.
+ */
+static int
+zio_crypt_init_uios_dnode(boolean_t encrypt, uint64_t version,
+ uint8_t *plainbuf, uint8_t *cipherbuf, uint_t datalen, boolean_t byteswap,
+ zfs_uio_t *puio, zfs_uio_t *cuio, uint_t *enc_len, uint8_t **authbuf,
+ uint_t *auth_len, boolean_t *no_crypt)
+{
+ int ret;
+ uint_t nr_src, nr_dst, crypt_len;
+ uint_t aad_len = 0, nr_iovecs = 0, total_len = 0;
+ uint_t i, j, max_dnp = datalen >> DNODE_SHIFT;
+ iovec_t *src_iovecs = NULL, *dst_iovecs = NULL;
+ uint8_t *src, *dst, *aadp;
+ dnode_phys_t *dnp, *adnp, *sdnp, *ddnp;
+ uint8_t *aadbuf = zio_buf_alloc(datalen);
+
+ if (encrypt) {
+ src = plainbuf;
+ dst = cipherbuf;
+ nr_src = 0;
+ nr_dst = 1;
+ } else {
+ src = cipherbuf;
+ dst = plainbuf;
+ nr_src = 1;
+ nr_dst = 0;
+ }
+
+ sdnp = (dnode_phys_t *)src;
+ ddnp = (dnode_phys_t *)dst;
+ aadp = aadbuf;
+
+ /*
+ * Count the number of iovecs we will need to do the encryption by
+ * counting the number of bonus buffers that need to be encrypted.
+ */
+ for (i = 0; i < max_dnp; i += sdnp[i].dn_extra_slots + 1) {
+ /*
+ * This block may still be byteswapped. However, all of the
+ * values we use are either uint8_t's (for which byteswapping
+ * is a noop) or a "!= 0" check, which will work regardless
+ * of whether or not we byteswap.
+ */
+ if (sdnp[i].dn_type != DMU_OT_NONE &&
+ DMU_OT_IS_ENCRYPTED(sdnp[i].dn_bonustype) &&
+ sdnp[i].dn_bonuslen != 0) {
+ nr_iovecs++;
+ }
+ }
+
+ nr_src += nr_iovecs;
+ nr_dst += nr_iovecs;
+
+ if (nr_src != 0) {
+ src_iovecs = kmem_alloc(nr_src * sizeof (iovec_t), KM_SLEEP);
+ if (src_iovecs == NULL) {
+ ret = SET_ERROR(ENOMEM);
+ goto error;
+ }
+ }
+
+ if (nr_dst != 0) {
+ dst_iovecs = kmem_alloc(nr_dst * sizeof (iovec_t), KM_SLEEP);
+ if (dst_iovecs == NULL) {
+ ret = SET_ERROR(ENOMEM);
+ goto error;
+ }
+ }
+
+ nr_iovecs = 0;
+
+ /*
+ * Iterate through the dnodes again, this time filling in the uios
+ * we allocated earlier. We also concatenate any data we want to
+ * authenticate onto aadbuf.
+ */
+ for (i = 0; i < max_dnp; i += sdnp[i].dn_extra_slots + 1) {
+ dnp = &sdnp[i];
+
+ /* copy over the core fields and blkptrs (kept as plaintext) */
+ bcopy(dnp, &ddnp[i], (uint8_t *)DN_BONUS(dnp) - (uint8_t *)dnp);
+
+ if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
+ bcopy(DN_SPILL_BLKPTR(dnp), DN_SPILL_BLKPTR(&ddnp[i]),
+ sizeof (blkptr_t));
+ }
+
+ /*
+ * Handle authenticated data. We authenticate everything in
+ * the dnode that can be brought over when we do a raw send.
+ * This includes all of the core fields as well as the MACs
+ * stored in the bp checksums and all of the portable bits
+ * from blk_prop. We include the dnode padding here in case it
+ * ever gets used in the future. Some dn_flags and dn_used are
+ * not portable so we mask those out values out of the
+ * authenticated data.
+ */
+ crypt_len = offsetof(dnode_phys_t, dn_blkptr);
+ bcopy(dnp, aadp, crypt_len);
+ adnp = (dnode_phys_t *)aadp;
+ adnp->dn_flags &= DNODE_CRYPT_PORTABLE_FLAGS_MASK;
+ adnp->dn_used = 0;
+ aadp += crypt_len;
+ aad_len += crypt_len;
+
+ for (j = 0; j < dnp->dn_nblkptr; j++) {
+ zio_crypt_bp_do_aad_updates(&aadp, &aad_len,
+ version, byteswap, &dnp->dn_blkptr[j]);
+ }
+
+ if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
+ zio_crypt_bp_do_aad_updates(&aadp, &aad_len,
+ version, byteswap, DN_SPILL_BLKPTR(dnp));
+ }
+
+ /*
+ * If this bonus buffer needs to be encrypted, we prepare an
+ * iovec_t. The encryption / decryption functions will fill
+ * this in for us with the encrypted or decrypted data.
+ * Otherwise we add the bonus buffer to the authenticated
+ * data buffer and copy it over to the destination. The
+ * encrypted iovec extends to DN_MAX_BONUS_LEN(dnp) so that
+ * we can guarantee alignment with the AES block size
+ * (128 bits).
+ */
+ crypt_len = DN_MAX_BONUS_LEN(dnp);
+ if (dnp->dn_type != DMU_OT_NONE &&
+ DMU_OT_IS_ENCRYPTED(dnp->dn_bonustype) &&
+ dnp->dn_bonuslen != 0) {
+ ASSERT3U(nr_iovecs, <, nr_src);
+ ASSERT3U(nr_iovecs, <, nr_dst);
+ ASSERT3P(src_iovecs, !=, NULL);
+ ASSERT3P(dst_iovecs, !=, NULL);
+ src_iovecs[nr_iovecs].iov_base = DN_BONUS(dnp);
+ src_iovecs[nr_iovecs].iov_len = crypt_len;
+ dst_iovecs[nr_iovecs].iov_base = DN_BONUS(&ddnp[i]);
+ dst_iovecs[nr_iovecs].iov_len = crypt_len;
+
+ nr_iovecs++;
+ total_len += crypt_len;
+ } else {
+ bcopy(DN_BONUS(dnp), DN_BONUS(&ddnp[i]), crypt_len);
+ bcopy(DN_BONUS(dnp), aadp, crypt_len);
+ aadp += crypt_len;
+ aad_len += crypt_len;
+ }
+ }
+
+ *no_crypt = (nr_iovecs == 0);
+ *enc_len = total_len;
+ *authbuf = aadbuf;
+ *auth_len = aad_len;
+
+ if (encrypt) {
+ puio->uio_iov = src_iovecs;
+ puio->uio_iovcnt = nr_src;
+ cuio->uio_iov = dst_iovecs;
+ cuio->uio_iovcnt = nr_dst;
+ } else {
+ puio->uio_iov = dst_iovecs;
+ puio->uio_iovcnt = nr_dst;
+ cuio->uio_iov = src_iovecs;
+ cuio->uio_iovcnt = nr_src;
+ }
+
+ return (0);
+
+error:
+ zio_buf_free(aadbuf, datalen);
+ if (src_iovecs != NULL)
+ kmem_free(src_iovecs, nr_src * sizeof (iovec_t));
+ if (dst_iovecs != NULL)
+ kmem_free(dst_iovecs, nr_dst * sizeof (iovec_t));
+
+ *enc_len = 0;
+ *authbuf = NULL;
+ *auth_len = 0;
+ *no_crypt = B_FALSE;
+ puio->uio_iov = NULL;
+ puio->uio_iovcnt = 0;
+ cuio->uio_iov = NULL;
+ cuio->uio_iovcnt = 0;
+ return (ret);
+}
+
+static int
+zio_crypt_init_uios_normal(boolean_t encrypt, uint8_t *plainbuf,
+ uint8_t *cipherbuf, uint_t datalen, zfs_uio_t *puio, zfs_uio_t *cuio,
+ uint_t *enc_len)
+{
+ int ret;
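+ /* the second cipher iovec is reserved for the MAC */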
+ uint_t nr_plain = 1, nr_cipher = 2;
+ iovec_t *plain_iovecs = NULL, *cipher_iovecs = NULL;
+
+ /* allocate the iovecs for the plain and cipher data */
+ plain_iovecs = kmem_alloc(nr_plain * sizeof (iovec_t),
+ KM_SLEEP);
+ if (!plain_iovecs) {
+ ret = SET_ERROR(ENOMEM);
+ goto error;
+ }
+
+ cipher_iovecs = kmem_alloc(nr_cipher * sizeof (iovec_t),
+ KM_SLEEP);
+ if (!cipher_iovecs) {
+ ret = SET_ERROR(ENOMEM);
+ goto error;
+ }
+
+ plain_iovecs[0].iov_base = plainbuf;
+ plain_iovecs[0].iov_len = datalen;
+ cipher_iovecs[0].iov_base = cipherbuf;
+ cipher_iovecs[0].iov_len = datalen;
+
+ *enc_len = datalen;
+ puio->uio_iov = plain_iovecs;
+ puio->uio_iovcnt = nr_plain;
+ cuio->uio_iov = cipher_iovecs;
+ cuio->uio_iovcnt = nr_cipher;
+
+ return (0);
+
+error:
+ if (plain_iovecs != NULL)
+ kmem_free(plain_iovecs, nr_plain * sizeof (iovec_t));
+ if (cipher_iovecs != NULL)
+ kmem_free(cipher_iovecs, nr_cipher * sizeof (iovec_t));
+
+ *enc_len = 0;
+ puio->uio_iov = NULL;
+ puio->uio_iovcnt = 0;
+ cuio->uio_iov = NULL;
+ cuio->uio_iovcnt = 0;
+ return (ret);
+}
+
+/*
+ * This function builds up the plaintext (puio) and ciphertext (cuio) uios so
+ * that they can be used for encryption and decryption by zio_do_crypt_uio().
+ * Most blocks will use zio_crypt_init_uios_normal(), with ZIL and dnode blocks
+ * requiring special handling to parse out pieces that are to be encrypted. The
+ * authbuf is used by these special cases to store additional authenticated
+ * data (AAD) for the encryption modes.
+ */
+static int
+zio_crypt_init_uios(boolean_t encrypt, uint64_t version, dmu_object_type_t ot,
+ uint8_t *plainbuf, uint8_t *cipherbuf, uint_t datalen, boolean_t byteswap,
+ uint8_t *mac, zfs_uio_t *puio, zfs_uio_t *cuio, uint_t *enc_len,
+ uint8_t **authbuf, uint_t *auth_len, boolean_t *no_crypt)
+{
+ int ret;
+ iovec_t *mac_iov;
+
+ ASSERT(DMU_OT_IS_ENCRYPTED(ot) || ot == DMU_OT_NONE);
+
+ /* route to handler */
+ switch (ot) {
+ case DMU_OT_INTENT_LOG:
+ ret = zio_crypt_init_uios_zil(encrypt, plainbuf, cipherbuf,
+ datalen, byteswap, puio, cuio, enc_len, authbuf, auth_len,
+ no_crypt);
+ break;
+ case DMU_OT_DNODE:
+ ret = zio_crypt_init_uios_dnode(encrypt, version, plainbuf,
+ cipherbuf, datalen, byteswap, puio, cuio, enc_len, authbuf,
+ auth_len, no_crypt);
+ break;
+ default:
+ ret = zio_crypt_init_uios_normal(encrypt, plainbuf, cipherbuf,
+ datalen, puio, cuio, enc_len);
+ *authbuf = NULL;
+ *auth_len = 0;
+ *no_crypt = B_FALSE;
+ break;
+ }
+
+ if (ret != 0)
+ goto error;
+
+ /* populate the uios */
+ puio->uio_segflg = UIO_SYSSPACE;
+ cuio->uio_segflg = UIO_SYSSPACE;
+
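+ /* the final cipher iovec is reserved for the per-block MAC */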
+ mac_iov = ((iovec_t *)&cuio->uio_iov[cuio->uio_iovcnt - 1]);
+ mac_iov->iov_base = mac;
+ mac_iov->iov_len = ZIO_DATA_MAC_LEN;
+
+ return (0);
+
+error:
+ return (ret);
+}
+
+/*
+ * Primary encryption / decryption entrypoint for zio data.
+ */
+int
+zio_do_crypt_data(boolean_t encrypt, zio_crypt_key_t *key,
+ dmu_object_type_t ot, boolean_t byteswap, uint8_t *salt, uint8_t *iv,
+ uint8_t *mac, uint_t datalen, uint8_t *plainbuf, uint8_t *cipherbuf,
+ boolean_t *no_crypt)
+{
+ int ret;
+ boolean_t locked = B_FALSE;
+ uint64_t crypt = key->zk_crypt;
+ uint_t keydata_len = zio_crypt_table[crypt].ci_keylen;
+ uint_t enc_len, auth_len;
+ zfs_uio_t puio, cuio;
+ uint8_t enc_keydata[MASTER_KEY_MAX_LEN];
+ crypto_key_t tmp_ckey, *ckey = NULL;
+ crypto_ctx_template_t tmpl;
+ uint8_t *authbuf = NULL;
+
+ /*
+ * If the needed key is the current one, just use it. Otherwise we
+ * need to generate a temporary one from the given salt + master key.
+ * If we are encrypting, we must return a copy of the current salt
+ * so that it can be stored in the blkptr_t.
+ */
+ rw_enter(&key->zk_salt_lock, RW_READER);
+ locked = B_TRUE;
+
+ if (bcmp(salt, key->zk_salt, ZIO_DATA_SALT_LEN) == 0) {
+ ckey = &key->zk_current_key;
+ tmpl = key->zk_current_tmpl;
+ } else {
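+ /* a non-current salt means a one-off derived key; the current key is unused */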
+ rw_exit(&key->zk_salt_lock);
+ locked = B_FALSE;
+
+ ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0,
+ salt, ZIO_DATA_SALT_LEN, enc_keydata, keydata_len);
+ if (ret != 0)
+ goto error;
+
+ tmp_ckey.ck_format = CRYPTO_KEY_RAW;
+ tmp_ckey.ck_data = enc_keydata;
+ tmp_ckey.ck_length = CRYPTO_BYTES2BITS(keydata_len);
+
+ ckey = &tmp_ckey;
+ tmpl = NULL;
+ }
+
+ /*
+ * Attempt to use QAT acceleration if we can. We currently don't
+ * do this for metadnode and ZIL blocks, since they have a much
+ * more involved buffer layout and the qat_crypt() function only
+ * works in-place.
+ */
+ if (qat_crypt_use_accel(datalen) &&
+ ot != DMU_OT_INTENT_LOG && ot != DMU_OT_DNODE) {
+ uint8_t *srcbuf, *dstbuf;
+
+ if (encrypt) {
+ srcbuf = plainbuf;
+ dstbuf = cipherbuf;
+ } else {
+ srcbuf = cipherbuf;
+ dstbuf = plainbuf;
+ }
+
+ ret = qat_crypt((encrypt) ? QAT_ENCRYPT : QAT_DECRYPT, srcbuf,
+ dstbuf, NULL, 0, iv, mac, ckey, key->zk_crypt, datalen);
+ if (ret == CPA_STATUS_SUCCESS) {
+ if (locked) {
+ rw_exit(&key->zk_salt_lock);
+ locked = B_FALSE;
+ }
+
+ return (0);
+ }
+ /* If the hardware implementation fails, fall back to software */
+ }
+
+ bzero(&puio, sizeof (zfs_uio_t));
+ bzero(&cuio, sizeof (zfs_uio_t));
+
+ /* create uios for encryption */
+ ret = zio_crypt_init_uios(encrypt, key->zk_version, ot, plainbuf,
+ cipherbuf, datalen, byteswap, mac, &puio, &cuio, &enc_len,
+ &authbuf, &auth_len, no_crypt);
+ if (ret != 0)
+ goto error;
+
+ /* perform the encryption / decryption in software */
+ ret = zio_do_crypt_uio(encrypt, key->zk_crypt, ckey, tmpl, iv, enc_len,
+ &puio, &cuio, authbuf, auth_len);
+ if (ret != 0)
+ goto error;
+
+ if (locked) {
+ rw_exit(&key->zk_salt_lock);
+ locked = B_FALSE;
+ }
+
+ if (authbuf != NULL)
+ zio_buf_free(authbuf, datalen);
+ if (ckey == &tmp_ckey)
+ bzero(enc_keydata, keydata_len);
+ zio_crypt_destroy_uio(&puio);
+ zio_crypt_destroy_uio(&cuio);
+
+ return (0);
+
+error:
+ if (locked)
+ rw_exit(&key->zk_salt_lock);
+ if (authbuf != NULL)
+ zio_buf_free(authbuf, datalen);
+ if (ckey == &tmp_ckey)
+ bzero(enc_keydata, keydata_len);
+ zio_crypt_destroy_uio(&puio);
+ zio_crypt_destroy_uio(&cuio);
+
+ return (ret);
+}
+
+/*
+ * Simple wrapper around zio_do_crypt_data() to work with abd's instead of
+ * linear buffers.
+ */
+int
+zio_do_crypt_abd(boolean_t encrypt, zio_crypt_key_t *key, dmu_object_type_t ot,
+ boolean_t byteswap, uint8_t *salt, uint8_t *iv, uint8_t *mac,
+ uint_t datalen, abd_t *pabd, abd_t *cabd, boolean_t *no_crypt)
+{
+ int ret;
+ void *ptmp, *ctmp;
+
+ if (encrypt) {
+ ptmp = abd_borrow_buf_copy(pabd, datalen);
+ ctmp = abd_borrow_buf(cabd, datalen);
+ } else {
+ ptmp = abd_borrow_buf(pabd, datalen);
+ ctmp = abd_borrow_buf_copy(cabd, datalen);
+ }
+
+ ret = zio_do_crypt_data(encrypt, key, ot, byteswap, salt, iv, mac,
+ datalen, ptmp, ctmp, no_crypt);
+ if (ret != 0)
+ goto error;
+
+ if (encrypt) {
+ abd_return_buf(pabd, ptmp, datalen);
+ abd_return_buf_copy(cabd, ctmp, datalen);
+ } else {
+ abd_return_buf_copy(pabd, ptmp, datalen);
+ abd_return_buf(cabd, ctmp, datalen);
+ }
+
+ return (0);
+
+error:
+ if (encrypt) {
+ abd_return_buf(pabd, ptmp, datalen);
+ abd_return_buf_copy(cabd, ctmp, datalen);
+ } else {
+ abd_return_buf_copy(pabd, ptmp, datalen);
+ abd_return_buf(cabd, ctmp, datalen);
+ }
+
+ return (ret);
+}
+
+#if defined(_KERNEL)
+/* BEGIN CSTYLED */
+module_param(zfs_key_max_salt_uses, ulong, 0644);
+MODULE_PARM_DESC(zfs_key_max_salt_uses, "Max number of times a salt value "
+ "can be used for generating encryption keys before it is rotated");
+/* END CSTYLED */
+#endif
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_ctldir.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_ctldir.c
new file mode 100644
index 000000000000..e6420f19ed87
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_ctldir.c
@@ -0,0 +1,552 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (C) 2011 Lawrence Livermore National Security, LLC.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * LLNL-CODE-403049.
+ * Rewritten for Linux by:
+ * Rohan Puri <rohan.puri15@gmail.com>
+ * Brian Behlendorf <behlendorf1@llnl.gov>
+ */
+
+#include <sys/zfs_znode.h>
+#include <sys/zfs_vfsops.h>
+#include <sys/zfs_vnops.h>
+#include <sys/zfs_ctldir.h>
+#include <sys/zpl.h>
+
+/*
+ * Common open routine. Disallow any write access.
+ */
+/* ARGSUSED */
+static int
+zpl_common_open(struct inode *ip, struct file *filp)
+{
+ if (filp->f_mode & FMODE_WRITE)
+ return (-EACCES);
+
+ return (generic_file_open(ip, filp));
+}
+
+/*
+ * Get root directory contents.
+ */
+static int
+zpl_root_iterate(struct file *filp, zpl_dir_context_t *ctx)
+{
+ zfsvfs_t *zfsvfs = ITOZSB(file_inode(filp));
+ int error = 0;
+
+ ZPL_ENTER(zfsvfs);
+
+ if (!zpl_dir_emit_dots(filp, ctx))
+ goto out;
+
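+ /* positions 2 and 3 are the snapshot and shares directories */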
+ if (ctx->pos == 2) {
+ if (!zpl_dir_emit(ctx, ZFS_SNAPDIR_NAME,
+ strlen(ZFS_SNAPDIR_NAME), ZFSCTL_INO_SNAPDIR, DT_DIR))
+ goto out;
+
+ ctx->pos++;
+ }
+
+ if (ctx->pos == 3) {
+ if (!zpl_dir_emit(ctx, ZFS_SHAREDIR_NAME,
+ strlen(ZFS_SHAREDIR_NAME), ZFSCTL_INO_SHARES, DT_DIR))
+ goto out;
+
+ ctx->pos++;
+ }
+out:
+ ZPL_EXIT(zfsvfs);
+
+ return (error);
+}
+
+#if !defined(HAVE_VFS_ITERATE) && !defined(HAVE_VFS_ITERATE_SHARED)
+static int
+zpl_root_readdir(struct file *filp, void *dirent, filldir_t filldir)
+{
+ zpl_dir_context_t ctx =
+ ZPL_DIR_CONTEXT_INIT(dirent, filldir, filp->f_pos);
+ int error;
+
+ error = zpl_root_iterate(filp, &ctx);
+ filp->f_pos = ctx.pos;
+
+ return (error);
+}
+#endif /* !HAVE_VFS_ITERATE && !HAVE_VFS_ITERATE_SHARED */
+
+/*
+ * Get root directory attributes.
+ */
+/* ARGSUSED */
+static int
+zpl_root_getattr_impl(const struct path *path, struct kstat *stat,
+ u32 request_mask, unsigned int query_flags)
+{
+ struct inode *ip = path->dentry->d_inode;
+
+ generic_fillattr(ip, stat);
+ stat->atime = current_time(ip);
+
+ return (0);
+}
+ZPL_GETATTR_WRAPPER(zpl_root_getattr);
+
+static struct dentry *
+zpl_root_lookup(struct inode *dip, struct dentry *dentry, unsigned int flags)
+{
+ cred_t *cr = CRED();
+ struct inode *ip;
+ int error;
+
+ crhold(cr);
+ error = -zfsctl_root_lookup(dip, dname(dentry), &ip, 0, cr, NULL, NULL);
+ ASSERT3S(error, <=, 0);
+ crfree(cr);
+
+ if (error) {
+ if (error == -ENOENT)
+ return (d_splice_alias(NULL, dentry));
+ else
+ return (ERR_PTR(error));
+ }
+
+ return (d_splice_alias(ip, dentry));
+}
+
+/*
+ * The '.zfs' control directory file and inode operations.
+ */
+const struct file_operations zpl_fops_root = {
+ .open = zpl_common_open,
+ .llseek = generic_file_llseek,
+ .read = generic_read_dir,
+#ifdef HAVE_VFS_ITERATE_SHARED
+ .iterate_shared = zpl_root_iterate,
+#elif defined(HAVE_VFS_ITERATE)
+ .iterate = zpl_root_iterate,
+#else
+ .readdir = zpl_root_readdir,
+#endif
+};
+
+const struct inode_operations zpl_ops_root = {
+ .lookup = zpl_root_lookup,
+ .getattr = zpl_root_getattr,
+};
+
+static struct vfsmount *
+zpl_snapdir_automount(struct path *path)
+{
+ int error;
+
+ error = -zfsctl_snapshot_mount(path, 0);
+ if (error)
+ return (ERR_PTR(error));
+
+ /*
+ * Rather than returning the new vfsmount for the snapshot we must
+ * return NULL to indicate a mount collision. This is done because
+ * the user space mount calls do_add_mount() which adds the vfsmount
+ * to the name space. If we returned the new mount here it would be
+ * added again to the vfsmount list resulting in list corruption.
+ */
+ return (NULL);
+}
+
+/*
+ * Negative dentries must always be revalidated so newly created snapshots
+ * can be detected and automounted. Normal dentries should be kept because
+ * as of the 3.18 kernel revalidating the mountpoint dentry will result in
+ * the snapshot being immediately unmounted.
+ */
+static int
+#ifdef HAVE_D_REVALIDATE_NAMEIDATA
+zpl_snapdir_revalidate(struct dentry *dentry, struct nameidata *i)
+#else
+zpl_snapdir_revalidate(struct dentry *dentry, unsigned int flags)
+#endif
+{
+ return (!!dentry->d_inode);
+}
+
+dentry_operations_t zpl_dops_snapdirs = {
+/*
+ * Auto mounting of snapshots is only supported for 2.6.37 and
+ * newer kernels. Prior to this kernel the ops->follow_link()
+ * callback was used as a hack to trigger the mount. The
+ * resulting vfsmount was then explicitly grafted in to the
+ * name space. While it might be possible to add compatibility
+ * code to accomplish this it would require considerable care.
+ */
+ .d_automount = zpl_snapdir_automount,
+ .d_revalidate = zpl_snapdir_revalidate,
+};
+
+static struct dentry *
+zpl_snapdir_lookup(struct inode *dip, struct dentry *dentry,
+ unsigned int flags)
+{
+ fstrans_cookie_t cookie;
+ cred_t *cr = CRED();
+ struct inode *ip = NULL;
+ int error;
+
+ crhold(cr);
+ cookie = spl_fstrans_mark();
+ error = -zfsctl_snapdir_lookup(dip, dname(dentry), &ip,
+ 0, cr, NULL, NULL);
+ ASSERT3S(error, <=, 0);
+ spl_fstrans_unmark(cookie);
+ crfree(cr);
+
+ if (error && error != -ENOENT)
+ return (ERR_PTR(error));
+
+ ASSERT(error == 0 || ip == NULL);
+ d_clear_d_op(dentry);
+ d_set_d_op(dentry, &zpl_dops_snapdirs);
+ dentry->d_flags |= DCACHE_NEED_AUTOMOUNT;
+
+ return (d_splice_alias(ip, dentry));
+}
+
+static int
+zpl_snapdir_iterate(struct file *filp, zpl_dir_context_t *ctx)
+{
+ zfsvfs_t *zfsvfs = ITOZSB(file_inode(filp));
+ fstrans_cookie_t cookie;
+ char snapname[MAXNAMELEN];
+ boolean_t case_conflict;
+ uint64_t id, pos;
+ int error = 0;
+
+ ZPL_ENTER(zfsvfs);
+ cookie = spl_fstrans_mark();
+
+ if (!zpl_dir_emit_dots(filp, ctx))
+ goto out;
+
+ /* Start the position at 0 if "." and ".." have already been emitted */
+ pos = (ctx->pos == 2 ? 0 : ctx->pos);
+ while (error == 0) {
+ dsl_pool_config_enter(dmu_objset_pool(zfsvfs->z_os), FTAG);
+ error = -dmu_snapshot_list_next(zfsvfs->z_os, MAXNAMELEN,
+ snapname, &id, &pos, &case_conflict);
+ dsl_pool_config_exit(dmu_objset_pool(zfsvfs->z_os), FTAG);
+ if (error)
+ goto out;
+
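+ /* present each snapshot with a synthetic inode number derived from its id */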
+ if (!zpl_dir_emit(ctx, snapname, strlen(snapname),
+ ZFSCTL_INO_SHARES - id, DT_DIR))
+ goto out;
+
+ ctx->pos = pos;
+ }
+out:
+ spl_fstrans_unmark(cookie);
+ ZPL_EXIT(zfsvfs);
+
+ if (error == -ENOENT)
+ return (0);
+
+ return (error);
+}
+
+#if !defined(HAVE_VFS_ITERATE) && !defined(HAVE_VFS_ITERATE_SHARED)
+static int
+zpl_snapdir_readdir(struct file *filp, void *dirent, filldir_t filldir)
+{
+ zpl_dir_context_t ctx =
+ ZPL_DIR_CONTEXT_INIT(dirent, filldir, filp->f_pos);
+ int error;
+
+ error = zpl_snapdir_iterate(filp, &ctx);
+ filp->f_pos = ctx.pos;
+
+ return (error);
+}
+#endif /* !HAVE_VFS_ITERATE && !HAVE_VFS_ITERATE_SHARED */
+
+static int
+zpl_snapdir_rename2(struct inode *sdip, struct dentry *sdentry,
+ struct inode *tdip, struct dentry *tdentry, unsigned int flags)
+{
+ cred_t *cr = CRED();
+ int error;
+
+ /* We probably don't want to support renameat2(2) in ctldir */
+ if (flags)
+ return (-EINVAL);
+
+ crhold(cr);
+ error = -zfsctl_snapdir_rename(sdip, dname(sdentry),
+ tdip, dname(tdentry), cr, 0);
+ ASSERT3S(error, <=, 0);
+ crfree(cr);
+
+ return (error);
+}
+
+#ifndef HAVE_RENAME_WANTS_FLAGS
+static int
+zpl_snapdir_rename(struct inode *sdip, struct dentry *sdentry,
+ struct inode *tdip, struct dentry *tdentry)
+{
+ return (zpl_snapdir_rename2(sdip, sdentry, tdip, tdentry, 0));
+}
+#endif
+
+static int
+zpl_snapdir_rmdir(struct inode *dip, struct dentry *dentry)
+{
+ cred_t *cr = CRED();
+ int error;
+
+ crhold(cr);
+ error = -zfsctl_snapdir_remove(dip, dname(dentry), cr, 0);
+ ASSERT3S(error, <=, 0);
+ crfree(cr);
+
+ return (error);
+}
+
+static int
+zpl_snapdir_mkdir(struct inode *dip, struct dentry *dentry, umode_t mode)
+{
+ cred_t *cr = CRED();
+ vattr_t *vap;
+ struct inode *ip;
+ int error;
+
+ crhold(cr);
+ vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP);
+ zpl_vap_init(vap, dip, mode | S_IFDIR, cr);
+
+ error = -zfsctl_snapdir_mkdir(dip, dname(dentry), vap, &ip, cr, 0);
+ if (error == 0) {
+ d_clear_d_op(dentry);
+ d_set_d_op(dentry, &zpl_dops_snapdirs);
+ d_instantiate(dentry, ip);
+ }
+
+ kmem_free(vap, sizeof (vattr_t));
+ ASSERT3S(error, <=, 0);
+ crfree(cr);
+
+ return (error);
+}
+
+/*
+ * Get snapshot directory attributes.
+ */
+/* ARGSUSED */
+static int
+zpl_snapdir_getattr_impl(const struct path *path, struct kstat *stat,
+ u32 request_mask, unsigned int query_flags)
+{
+ struct inode *ip = path->dentry->d_inode;
+ zfsvfs_t *zfsvfs = ITOZSB(ip);
+
+ ZPL_ENTER(zfsvfs);
+ generic_fillattr(ip, stat);
+
+ stat->nlink = stat->size = 2;
+ stat->ctime = stat->mtime = dmu_objset_snap_cmtime(zfsvfs->z_os);
+ stat->atime = current_time(ip);
+ ZPL_EXIT(zfsvfs);
+
+ return (0);
+}
+ZPL_GETATTR_WRAPPER(zpl_snapdir_getattr);
+
+/*
+ * The '.zfs/snapshot' directory file operations. These mainly control
+ * generating the list of available snapshots when doing an 'ls' in the
+ * directory. See zpl_snapdir_readdir().
+ */
+const struct file_operations zpl_fops_snapdir = {
+ .open = zpl_common_open,
+ .llseek = generic_file_llseek,
+ .read = generic_read_dir,
+#ifdef HAVE_VFS_ITERATE_SHARED
+ .iterate_shared = zpl_snapdir_iterate,
+#elif defined(HAVE_VFS_ITERATE)
+ .iterate = zpl_snapdir_iterate,
+#else
+ .readdir = zpl_snapdir_readdir,
+#endif
+
+};
+
+/*
+ * The '.zfs/snapshot' directory inode operations. These mainly control
+ * creating an inode for a snapshot directory and initializing the needed
+ * infrastructure to automount the snapshot. See zpl_snapdir_lookup().
+ */
+const struct inode_operations zpl_ops_snapdir = {
+ .lookup = zpl_snapdir_lookup,
+ .getattr = zpl_snapdir_getattr,
+#ifdef HAVE_RENAME_WANTS_FLAGS
+ .rename = zpl_snapdir_rename2,
+#else
+ .rename = zpl_snapdir_rename,
+#endif
+ .rmdir = zpl_snapdir_rmdir,
+ .mkdir = zpl_snapdir_mkdir,
+};
+
+static struct dentry *
+zpl_shares_lookup(struct inode *dip, struct dentry *dentry,
+ unsigned int flags)
+{
+ fstrans_cookie_t cookie;
+ cred_t *cr = CRED();
+ struct inode *ip = NULL;
+ int error;
+
+ crhold(cr);
+ cookie = spl_fstrans_mark();
+ error = -zfsctl_shares_lookup(dip, dname(dentry), &ip,
+ 0, cr, NULL, NULL);
+ ASSERT3S(error, <=, 0);
+ spl_fstrans_unmark(cookie);
+ crfree(cr);
+
+ if (error) {
+ if (error == -ENOENT)
+ return (d_splice_alias(NULL, dentry));
+ else
+ return (ERR_PTR(error));
+ }
+
+ return (d_splice_alias(ip, dentry));
+}
+
+static int
+zpl_shares_iterate(struct file *filp, zpl_dir_context_t *ctx)
+{
+ fstrans_cookie_t cookie;
+ cred_t *cr = CRED();
+ zfsvfs_t *zfsvfs = ITOZSB(file_inode(filp));
+ znode_t *dzp;
+ int error = 0;
+
+ ZPL_ENTER(zfsvfs);
+ cookie = spl_fstrans_mark();
+
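+ /* if the shares directory has not been created, only "." and ".." exist */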
+ if (zfsvfs->z_shares_dir == 0) {
+ zpl_dir_emit_dots(filp, ctx);
+ goto out;
+ }
+
+ error = -zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp);
+ if (error)
+ goto out;
+
+ crhold(cr);
+ error = -zfs_readdir(ZTOI(dzp), ctx, cr);
+ crfree(cr);
+
+ iput(ZTOI(dzp));
+out:
+ spl_fstrans_unmark(cookie);
+ ZPL_EXIT(zfsvfs);
+ ASSERT3S(error, <=, 0);
+
+ return (error);
+}
+
+#if !defined(HAVE_VFS_ITERATE) && !defined(HAVE_VFS_ITERATE_SHARED)
+static int
+zpl_shares_readdir(struct file *filp, void *dirent, filldir_t filldir)
+{
+ zpl_dir_context_t ctx =
+ ZPL_DIR_CONTEXT_INIT(dirent, filldir, filp->f_pos);
+ int error;
+
+ error = zpl_shares_iterate(filp, &ctx);
+ filp->f_pos = ctx.pos;
+
+ return (error);
+}
+#endif /* !HAVE_VFS_ITERATE && !HAVE_VFS_ITERATE_SHARED */
+
+/* ARGSUSED */
+static int
+zpl_shares_getattr_impl(const struct path *path, struct kstat *stat,
+ u32 request_mask, unsigned int query_flags)
+{
+ struct inode *ip = path->dentry->d_inode;
+ zfsvfs_t *zfsvfs = ITOZSB(ip);
+ znode_t *dzp;
+ int error;
+
+ ZPL_ENTER(zfsvfs);
+
+ if (zfsvfs->z_shares_dir == 0) {
+ generic_fillattr(path->dentry->d_inode, stat);
+ stat->nlink = stat->size = 2;
+ stat->atime = current_time(ip);
+ ZPL_EXIT(zfsvfs);
+ return (0);
+ }
+
+ error = -zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp);
+ if (error == 0) {
+ error = -zfs_getattr_fast(ZTOI(dzp), stat);
+ iput(ZTOI(dzp));
+ }
+
+ ZPL_EXIT(zfsvfs);
+ ASSERT3S(error, <=, 0);
+
+ return (error);
+}
+ZPL_GETATTR_WRAPPER(zpl_shares_getattr);
+
+/*
+ * The '.zfs/shares' directory file operations.
+ */
+const struct file_operations zpl_fops_shares = {
+ .open = zpl_common_open,
+ .llseek = generic_file_llseek,
+ .read = generic_read_dir,
+#ifdef HAVE_VFS_ITERATE_SHARED
+ .iterate_shared = zpl_shares_iterate,
+#elif defined(HAVE_VFS_ITERATE)
+ .iterate = zpl_shares_iterate,
+#else
+ .readdir = zpl_shares_readdir,
+#endif
+
+};
+
+/*
+ * The '.zfs/shares' directory inode operations.
+ */
+const struct inode_operations zpl_ops_shares = {
+ .lookup = zpl_shares_lookup,
+ .getattr = zpl_shares_getattr,
+};
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_export.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_export.c
new file mode 100644
index 000000000000..eaf048c38db1
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_export.c
@@ -0,0 +1,154 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2011 Gunnar Beutner
+ * Copyright (c) 2012 Cyril Plisko. All rights reserved.
+ */
+
+
+#include <sys/zfs_znode.h>
+#include <sys/zfs_vnops.h>
+#include <sys/zfs_ctldir.h>
+#include <sys/zpl.h>
+
+
+static int
+#ifdef HAVE_ENCODE_FH_WITH_INODE
+zpl_encode_fh(struct inode *ip, __u32 *fh, int *max_len, struct inode *parent)
+{
+#else
+zpl_encode_fh(struct dentry *dentry, __u32 *fh, int *max_len, int connectable)
+{
+ /* CSTYLED */
+ struct inode *ip = dentry->d_inode;
+#endif /* HAVE_ENCODE_FH_WITH_INODE */
+ fstrans_cookie_t cookie;
+ fid_t *fid = (fid_t *)fh;
+ int len_bytes, rc;
+
+ len_bytes = *max_len * sizeof (__u32);
+
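+ /* refuse buffers too small to hold even an empty fid (255 is FILEID_INVALID) */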
+ if (len_bytes < offsetof(fid_t, fid_data))
+ return (255);
+
+ fid->fid_len = len_bytes - offsetof(fid_t, fid_data);
+ cookie = spl_fstrans_mark();
+
+ if (zfsctl_is_node(ip))
+ rc = zfsctl_fid(ip, fid);
+ else
+ rc = zfs_fid(ip, fid);
+
+ spl_fstrans_unmark(cookie);
+ len_bytes = offsetof(fid_t, fid_data) + fid->fid_len;
+ *max_len = roundup(len_bytes, sizeof (__u32)) / sizeof (__u32);
+
+ return (rc == 0 ? FILEID_INO32_GEN : 255);
+}
+
+static struct dentry *
+zpl_fh_to_dentry(struct super_block *sb, struct fid *fh,
+ int fh_len, int fh_type)
+{
+ fid_t *fid = (fid_t *)fh;
+ fstrans_cookie_t cookie;
+ struct inode *ip;
+ int len_bytes, rc;
+
+ len_bytes = fh_len * sizeof (__u32);
+
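+ /* reject handles of the wrong type or with an inconsistent length */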
+ if (fh_type != FILEID_INO32_GEN ||
+ len_bytes < offsetof(fid_t, fid_data) ||
+ len_bytes < offsetof(fid_t, fid_data) + fid->fid_len)
+ return (ERR_PTR(-EINVAL));
+
+ cookie = spl_fstrans_mark();
+ rc = zfs_vget(sb, &ip, fid);
+ spl_fstrans_unmark(cookie);
+
+ if (rc) {
+ /*
+ * If we see ENOENT it might mean that an NFSv4 client
+ * is using a cached inode value in a file handle and
+ * that the sought after file has had its inode changed
+ * by a third party. So change the error to ESTALE
+ * which will trigger a full lookup by the client and
+ * will find the new filename/inode pair if it still
+ * exists.
+ */
+ if (rc == ENOENT)
+ rc = ESTALE;
+
+ return (ERR_PTR(-rc));
+ }
+
+ ASSERT((ip != NULL) && !IS_ERR(ip));
+
+ return (d_obtain_alias(ip));
+}
+
+static struct dentry *
+zpl_get_parent(struct dentry *child)
+{
+ cred_t *cr = CRED();
+ fstrans_cookie_t cookie;
+ znode_t *zp;
+ int error;
+
+ crhold(cr);
+ cookie = spl_fstrans_mark();
+ error = -zfs_lookup(ITOZ(child->d_inode), "..", &zp, 0, cr, NULL, NULL);
+ spl_fstrans_unmark(cookie);
+ crfree(cr);
+ ASSERT3S(error, <=, 0);
+
+ if (error)
+ return (ERR_PTR(error));
+
+ return (d_obtain_alias(ZTOI(zp)));
+}
+
+static int
+zpl_commit_metadata(struct inode *inode)
+{
+ cred_t *cr = CRED();
+ fstrans_cookie_t cookie;
+ int error;
+
+ if (zfsctl_is_node(inode))
+ return (0);
+
+ crhold(cr);
+ cookie = spl_fstrans_mark();
+ error = -zfs_fsync(ITOZ(inode), 0, cr);
+ spl_fstrans_unmark(cookie);
+ crfree(cr);
+ ASSERT3S(error, <=, 0);
+
+ return (error);
+}
+
+const struct export_operations zpl_export_operations = {
+ .encode_fh = zpl_encode_fh,
+ .fh_to_dentry = zpl_fh_to_dentry,
+ .get_parent = zpl_get_parent,
+ .commit_metadata = zpl_commit_metadata,
+};
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c
new file mode 100644
index 000000000000..970db4a8b73a
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_file.c
@@ -0,0 +1,1069 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2011, Lawrence Livermore National Security, LLC.
+ * Copyright (c) 2015 by Chunwei Chen. All rights reserved.
+ */
+
+
+#ifdef CONFIG_COMPAT
+#include <linux/compat.h>
+#endif
+#include <sys/file.h>
+#include <sys/dmu_objset.h>
+#include <sys/zfs_znode.h>
+#include <sys/zfs_vfsops.h>
+#include <sys/zfs_vnops.h>
+#include <sys/zfs_project.h>
+
+/*
+ * When using fallocate(2) to preallocate space, inflate the requested
+ * capacity check by 10% to account for the required metadata blocks.
+ */
+unsigned int zfs_fallocate_reserve_percent = 110;
+
+static int
+zpl_open(struct inode *ip, struct file *filp)
+{
+ cred_t *cr = CRED();
+ int error;
+ fstrans_cookie_t cookie;
+
+ error = generic_file_open(ip, filp);
+ if (error)
+ return (error);
+
+ crhold(cr);
+ cookie = spl_fstrans_mark();
+ error = -zfs_open(ip, filp->f_mode, filp->f_flags, cr);
+ spl_fstrans_unmark(cookie);
+ crfree(cr);
+ ASSERT3S(error, <=, 0);
+
+ return (error);
+}
+
+static int
+zpl_release(struct inode *ip, struct file *filp)
+{
+ cred_t *cr = CRED();
+ int error;
+ fstrans_cookie_t cookie;
+
+ cookie = spl_fstrans_mark();
+ if (ITOZ(ip)->z_atime_dirty)
+ zfs_mark_inode_dirty(ip);
+
+ crhold(cr);
+ error = -zfs_close(ip, filp->f_flags, cr);
+ spl_fstrans_unmark(cookie);
+ crfree(cr);
+ ASSERT3S(error, <=, 0);
+
+ return (error);
+}
+
+static int
+zpl_iterate(struct file *filp, zpl_dir_context_t *ctx)
+{
+ cred_t *cr = CRED();
+ int error;
+ fstrans_cookie_t cookie;
+
+ crhold(cr);
+ cookie = spl_fstrans_mark();
+ error = -zfs_readdir(file_inode(filp), ctx, cr);
+ spl_fstrans_unmark(cookie);
+ crfree(cr);
+ ASSERT3S(error, <=, 0);
+
+ return (error);
+}
+
+#if !defined(HAVE_VFS_ITERATE) && !defined(HAVE_VFS_ITERATE_SHARED)
+static int
+zpl_readdir(struct file *filp, void *dirent, filldir_t filldir)
+{
+ zpl_dir_context_t ctx =
+ ZPL_DIR_CONTEXT_INIT(dirent, filldir, filp->f_pos);
+ int error;
+
+ error = zpl_iterate(filp, &ctx);
+ filp->f_pos = ctx.pos;
+
+ return (error);
+}
+#endif /* !HAVE_VFS_ITERATE && !HAVE_VFS_ITERATE_SHARED */
+
+#if defined(HAVE_FSYNC_WITHOUT_DENTRY)
+/*
+ * Linux 2.6.35 - 3.0 API,
+ * As of 2.6.35 the dentry argument to the fops->fsync() hook was deemed
+ * redundant. The dentry is still accessible via filp->f_path.dentry,
+ * and we are guaranteed that filp will never be NULL.
+ */
+static int
+zpl_fsync(struct file *filp, int datasync)
+{
+ struct inode *inode = filp->f_mapping->host;
+ cred_t *cr = CRED();
+ int error;
+ fstrans_cookie_t cookie;
+
+ crhold(cr);
+ cookie = spl_fstrans_mark();
+ error = -zfs_fsync(ITOZ(inode), datasync, cr);
+ spl_fstrans_unmark(cookie);
+ crfree(cr);
+ ASSERT3S(error, <=, 0);
+
+ return (error);
+}
+
+#ifdef HAVE_FILE_AIO_FSYNC
+static int
+zpl_aio_fsync(struct kiocb *kiocb, int datasync)
+{
+ return (zpl_fsync(kiocb->ki_filp, datasync));
+}
+#endif
+
+#elif defined(HAVE_FSYNC_RANGE)
+/*
+ * Linux 3.1 - 3.x API,
+ * As of 3.1 the responsibility to call filemap_write_and_wait_range() has
+ * been pushed down in to the .fsync() vfs hook. Additionally, the i_mutex
+ * lock is no longer held by the caller, for zfs we don't require the lock
+ * to be held so we don't acquire it.
+ */
+static int
+zpl_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
+{
+ struct inode *inode = filp->f_mapping->host;
+ cred_t *cr = CRED();
+ int error;
+ fstrans_cookie_t cookie;
+
+ error = filemap_write_and_wait_range(inode->i_mapping, start, end);
+ if (error)
+ return (error);
+
+ crhold(cr);
+ cookie = spl_fstrans_mark();
+ error = -zfs_fsync(ITOZ(inode), datasync, cr);
+ spl_fstrans_unmark(cookie);
+ crfree(cr);
+ ASSERT3S(error, <=, 0);
+
+ return (error);
+}
+
+#ifdef HAVE_FILE_AIO_FSYNC
+static int
+zpl_aio_fsync(struct kiocb *kiocb, int datasync)
+{
+ return (zpl_fsync(kiocb->ki_filp, kiocb->ki_pos, -1, datasync));
+}
+#endif
+
+#else
+#error "Unsupported fops->fsync() implementation"
+#endif
+
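+/*
+ * Map the I/O flags carried on the kiocb to the equivalent O_* flags
+ * expected by the common zfs_read() / zfs_write() code.
+ */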
+static inline int
+zfs_io_flags(struct kiocb *kiocb)
+{
+ int flags = 0;
+
+#if defined(IOCB_DSYNC)
+ if (kiocb->ki_flags & IOCB_DSYNC)
+ flags |= O_DSYNC;
+#endif
+#if defined(IOCB_SYNC)
+ if (kiocb->ki_flags & IOCB_SYNC)
+ flags |= O_SYNC;
+#endif
+#if defined(IOCB_APPEND)
+ if (kiocb->ki_flags & IOCB_APPEND)
+ flags |= O_APPEND;
+#endif
+#if defined(IOCB_DIRECT)
+ if (kiocb->ki_flags & IOCB_DIRECT)
+ flags |= O_DIRECT;
+#endif
+ return (flags);
+}
+
+/*
+ * If relatime is enabled, call file_accessed() if zfs_relatime_need_update()
+ * is true. This is needed since datasets with an inherited "relatime"
+ * property aren't necessarily mounted with the MNT_RELATIME flag (e.g.
+ * after `zfs set relatime=...`), which is what the VFS relatime check in
+ * relatime_need_update() is based on.
+ */
+static inline void
+zpl_file_accessed(struct file *filp)
+{
+ struct inode *ip = filp->f_mapping->host;
+
+ if (!IS_NOATIME(ip) && ITOZSB(ip)->z_relatime) {
+ if (zfs_relatime_need_update(ip))
+ file_accessed(filp);
+ } else {
+ file_accessed(filp);
+ }
+}
+
+#if defined(HAVE_VFS_RW_ITERATE)
+
+/*
+ * When HAVE_VFS_IOV_ITER is defined the iov_iter structure supports
+ * iovecs, kvecs, bvecs and pipes, plus all the required interfaces to
+ * manipulate the iov_iter are available. In which case the full iov_iter
+ * can be attached to the uio and correctly handled in the lower layers.
+ * Otherwise, for older kernels extract the iovec and pass it instead.
+ */
+static void
+zpl_uio_init(zfs_uio_t *uio, struct kiocb *kiocb, struct iov_iter *to,
+ loff_t pos, ssize_t count, size_t skip)
+{
+#if defined(HAVE_VFS_IOV_ITER)
+ zfs_uio_iov_iter_init(uio, to, pos, count, skip);
+#else
+ zfs_uio_iovec_init(uio, to->iov, to->nr_segs, pos,
+ to->type & ITER_KVEC ? UIO_SYSSPACE : UIO_USERSPACE,
+ count, skip);
+#endif
+}
+
+static ssize_t
+zpl_iter_read(struct kiocb *kiocb, struct iov_iter *to)
+{
+ cred_t *cr = CRED();
+ fstrans_cookie_t cookie;
+ struct file *filp = kiocb->ki_filp;
+ ssize_t count = iov_iter_count(to);
+ zfs_uio_t uio;
+
+ zpl_uio_init(&uio, kiocb, to, kiocb->ki_pos, count, 0);
+
+ crhold(cr);
+ cookie = spl_fstrans_mark();
+
+ int error = -zfs_read(ITOZ(filp->f_mapping->host), &uio,
+ filp->f_flags | zfs_io_flags(kiocb), cr);
+
+ spl_fstrans_unmark(cookie);
+ crfree(cr);
+
+ if (error < 0)
+ return (error);
+
+ ssize_t read = count - uio.uio_resid;
+ kiocb->ki_pos += read;
+
+ zpl_file_accessed(filp);
+
+ return (read);
+}
+
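+/*
+ * Perform the standard VFS write checks, storing the number of bytes
+ * which may be written in *countp; handles both the kiocb-based and
+ * file-based generic_write_checks() interfaces.
+ */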
+static inline ssize_t
+zpl_generic_write_checks(struct kiocb *kiocb, struct iov_iter *from,
+ size_t *countp)
+{
+#ifdef HAVE_GENERIC_WRITE_CHECKS_KIOCB
+ ssize_t ret = generic_write_checks(kiocb, from);
+ if (ret <= 0)
+ return (ret);
+
+ *countp = ret;
+#else
+ struct file *file = kiocb->ki_filp;
+ struct address_space *mapping = file->f_mapping;
+ struct inode *ip = mapping->host;
+ int isblk = S_ISBLK(ip->i_mode);
+
+ *countp = iov_iter_count(from);
+ ssize_t ret = generic_write_checks(file, &kiocb->ki_pos, countp, isblk);
+ if (ret)
+ return (ret);
+#endif
+
+ return (0);
+}
+
+static ssize_t
+zpl_iter_write(struct kiocb *kiocb, struct iov_iter *from)
+{
+ cred_t *cr = CRED();
+ fstrans_cookie_t cookie;
+ struct file *filp = kiocb->ki_filp;
+ struct inode *ip = filp->f_mapping->host;
+ zfs_uio_t uio;
+ size_t count = 0;
+ ssize_t ret;
+
+ ret = zpl_generic_write_checks(kiocb, from, &count);
+ if (ret)
+ return (ret);
+
+ zpl_uio_init(&uio, kiocb, from, kiocb->ki_pos, count, from->iov_offset);
+
+ crhold(cr);
+ cookie = spl_fstrans_mark();
+
+ int error = -zfs_write(ITOZ(ip), &uio,
+ filp->f_flags | zfs_io_flags(kiocb), cr);
+
+ spl_fstrans_unmark(cookie);
+ crfree(cr);
+
+ if (error < 0)
+ return (error);
+
+ ssize_t wrote = count - uio.uio_resid;
+ kiocb->ki_pos += wrote;
+
+ if (wrote > 0)
+ iov_iter_advance(from, wrote);
+
+ return (wrote);
+}
+
+#else /* !HAVE_VFS_RW_ITERATE */
+
+static ssize_t
+zpl_aio_read(struct kiocb *kiocb, const struct iovec *iov,
+ unsigned long nr_segs, loff_t pos)
+{
+ cred_t *cr = CRED();
+ fstrans_cookie_t cookie;
+ struct file *filp = kiocb->ki_filp;
+ size_t count;
+ ssize_t ret;
+
+ ret = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE);
+ if (ret)
+ return (ret);
+
+ zfs_uio_t uio;
+ zfs_uio_iovec_init(&uio, iov, nr_segs, kiocb->ki_pos, UIO_USERSPACE,
+ count, 0);
+
+ crhold(cr);
+ cookie = spl_fstrans_mark();
+
+ int error = -zfs_read(ITOZ(filp->f_mapping->host), &uio,
+ filp->f_flags | zfs_io_flags(kiocb), cr);
+
+ spl_fstrans_unmark(cookie);
+ crfree(cr);
+
+ if (error < 0)
+ return (error);
+
+ ssize_t read = count - uio.uio_resid;
+ kiocb->ki_pos += read;
+
+ zpl_file_accessed(filp);
+
+ return (read);
+}
+
+static ssize_t
+zpl_aio_write(struct kiocb *kiocb, const struct iovec *iov,
+ unsigned long nr_segs, loff_t pos)
+{
+ cred_t *cr = CRED();
+ fstrans_cookie_t cookie;
+ struct file *filp = kiocb->ki_filp;
+ struct inode *ip = filp->f_mapping->host;
+ size_t count;
+ ssize_t ret;
+
+ ret = generic_segment_checks(iov, &nr_segs, &count, VERIFY_READ);
+ if (ret)
+ return (ret);
+
+ ret = generic_write_checks(filp, &pos, &count, S_ISBLK(ip->i_mode));
+ if (ret)
+ return (ret);
+
+ zfs_uio_t uio;
+ zfs_uio_iovec_init(&uio, iov, nr_segs, kiocb->ki_pos, UIO_USERSPACE,
+ count, 0);
+
+ crhold(cr);
+ cookie = spl_fstrans_mark();
+
+ int error = -zfs_write(ITOZ(ip), &uio,
+ filp->f_flags | zfs_io_flags(kiocb), cr);
+
+ spl_fstrans_unmark(cookie);
+ crfree(cr);
+
+ if (error < 0)
+ return (error);
+
+ ssize_t wrote = count - uio.uio_resid;
+ kiocb->ki_pos += wrote;
+
+ return (wrote);
+}
+#endif /* HAVE_VFS_RW_ITERATE */
+
+#if defined(HAVE_VFS_RW_ITERATE)
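+/*
+ * Direct I/O requests are serviced through the regular ARC-backed
+ * read and write paths rather than bypassing the cache.
+ */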
+static ssize_t
+zpl_direct_IO_impl(int rw, struct kiocb *kiocb, struct iov_iter *iter)
+{
+ if (rw == WRITE)
+ return (zpl_iter_write(kiocb, iter));
+ else
+ return (zpl_iter_read(kiocb, iter));
+}
+#if defined(HAVE_VFS_DIRECT_IO_ITER)
+static ssize_t
+zpl_direct_IO(struct kiocb *kiocb, struct iov_iter *iter)
+{
+ return (zpl_direct_IO_impl(iov_iter_rw(iter), kiocb, iter));
+}
+#elif defined(HAVE_VFS_DIRECT_IO_ITER_OFFSET)
+static ssize_t
+zpl_direct_IO(struct kiocb *kiocb, struct iov_iter *iter, loff_t pos)
+{
+ ASSERT3S(pos, ==, kiocb->ki_pos);
+ return (zpl_direct_IO_impl(iov_iter_rw(iter), kiocb, iter));
+}
+#elif defined(HAVE_VFS_DIRECT_IO_ITER_RW_OFFSET)
+static ssize_t
+zpl_direct_IO(int rw, struct kiocb *kiocb, struct iov_iter *iter, loff_t pos)
+{
+ ASSERT3S(pos, ==, kiocb->ki_pos);
+ return (zpl_direct_IO_impl(rw, kiocb, iter));
+}
+#else
+#error "Unknown direct IO interface"
+#endif
+
+#else /* HAVE_VFS_RW_ITERATE */
+
+#if defined(HAVE_VFS_DIRECT_IO_IOVEC)
+static ssize_t
+zpl_direct_IO(int rw, struct kiocb *kiocb, const struct iovec *iov,
+ loff_t pos, unsigned long nr_segs)
+{
+ if (rw == WRITE)
+ return (zpl_aio_write(kiocb, iov, nr_segs, pos));
+ else
+ return (zpl_aio_read(kiocb, iov, nr_segs, pos));
+}
+#elif defined(HAVE_VFS_DIRECT_IO_ITER_RW_OFFSET)
+static ssize_t
+zpl_direct_IO(int rw, struct kiocb *kiocb, struct iov_iter *iter, loff_t pos)
+{
+ const struct iovec *iovp = iov_iter_iovec(iter);
+ unsigned long nr_segs = iter->nr_segs;
+
+ ASSERT3S(pos, ==, kiocb->ki_pos);
+ if (rw == WRITE)
+ return (zpl_aio_write(kiocb, iovp, nr_segs, pos));
+ else
+ return (zpl_aio_read(kiocb, iovp, nr_segs, pos));
+}
+#else
+#error "Unknown direct IO interface"
+#endif
+
+#endif /* HAVE_VFS_RW_ITERATE */
+
+static loff_t
+zpl_llseek(struct file *filp, loff_t offset, int whence)
+{
+#if defined(SEEK_HOLE) && defined(SEEK_DATA)
+ fstrans_cookie_t cookie;
+
+ if (whence == SEEK_DATA || whence == SEEK_HOLE) {
+ struct inode *ip = filp->f_mapping->host;
+ loff_t maxbytes = ip->i_sb->s_maxbytes;
+ loff_t error;
+
+ spl_inode_lock_shared(ip);
+ cookie = spl_fstrans_mark();
+ error = -zfs_holey(ITOZ(ip), whence, &offset);
+ spl_fstrans_unmark(cookie);
+ if (error == 0)
+ error = lseek_execute(filp, ip, offset, maxbytes);
+ spl_inode_unlock_shared(ip);
+
+ return (error);
+ }
+#endif /* SEEK_HOLE && SEEK_DATA */
+
+ return (generic_file_llseek(filp, offset, whence));
+}
+
+/*
+ * It's worth taking a moment to describe how mmap is implemented
+ * for zfs because it differs considerably from other Linux filesystems.
+ * However, this issue is handled the same way under OpenSolaris.
+ *
+ * The issue is that by design zfs bypasses the Linux page cache and
+ * leaves all caching up to the ARC. This has been shown to work
+ * well for the common read(2)/write(2) case. However, mmap(2)
+ * is a problem because it relies on being tightly integrated with the
+ * page cache. To handle this we cache mmap'ed files twice, once in
+ * the ARC and a second time in the page cache. The code is careful
+ * to keep both copies synchronized.
+ *
+ * When a file with an mmap'ed region is written to using write(2)
+ * both the data in the ARC and existing pages in the page cache
+ * are updated. For a read(2) data will be read first from the page
+ * cache then the ARC if needed. Neither a write(2) nor a read(2)
+ * will ever result in new pages being added to the page cache.
+ *
+ * New pages are added to the page cache only via .readpage() which
+ * is called when the vfs needs to read a page off disk to back the
+ * virtual memory region. These pages may be modified without
+ * notifying the ARC and will be written out periodically via
+ * .writepage(). This will occur due to either a sync or the usual
+ * page aging behavior. Note that because a read(2) of an mmap'ed file
+ * will always check the page cache first, correct data will still be
+ * returned even when the ARC is out of date.
+ *
+ * While this implementation ensures correct behavior it does have
+ * some drawbacks. The most obvious is that it increases the required
+ * memory footprint when accessing mmap'ed files. It also adds
+ * additional complexity to the code keeping both caches synchronized.
+ *
+ * Longer term it may be possible to cleanly resolve this wart by
+ * mapping page cache pages directly on to the ARC buffers. The
+ * Linux address space operations are flexible enough to allow
+ * selection of which pages back a particular index. The trick
+ * would be working out the details of which subsystem is in
+ * charge, the ARC, the page cache, or both. It may also prove
+ * helpful to move the ARC buffers to scatter-gather lists
+ * rather than a vmalloc'ed region.
+ */
+static int
+zpl_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+ struct inode *ip = filp->f_mapping->host;
+ znode_t *zp = ITOZ(ip);
+ int error;
+ fstrans_cookie_t cookie;
+
+ cookie = spl_fstrans_mark();
+ error = -zfs_map(ip, vma->vm_pgoff, (caddr_t *)vma->vm_start,
+ (size_t)(vma->vm_end - vma->vm_start), vma->vm_flags);
+ spl_fstrans_unmark(cookie);
+ if (error)
+ return (error);
+
+ error = generic_file_mmap(filp, vma);
+ if (error)
+ return (error);
+
+ mutex_enter(&zp->z_lock);
+ zp->z_is_mapped = B_TRUE;
+ mutex_exit(&zp->z_lock);
+
+ return (error);
+}
+
+/*
+ * Populate a page with data for the Linux page cache. This function is
+ * only used to support mmap(2). There will be an identical copy of the
+ * data in the ARC which is kept up to date via .write() and .writepage().
+ */
+static int
+zpl_readpage(struct file *filp, struct page *pp)
+{
+ struct inode *ip;
+ struct page *pl[1];
+ int error = 0;
+ fstrans_cookie_t cookie;
+
+ ASSERT(PageLocked(pp));
+ ip = pp->mapping->host;
+ pl[0] = pp;
+
+ cookie = spl_fstrans_mark();
+ error = -zfs_getpage(ip, pl, 1);
+ spl_fstrans_unmark(cookie);
+
+ if (error) {
+ SetPageError(pp);
+ ClearPageUptodate(pp);
+ } else {
+ ClearPageError(pp);
+ SetPageUptodate(pp);
+ flush_dcache_page(pp);
+ }
+
+ unlock_page(pp);
+ return (error);
+}
+
+/*
+ * Populate a set of pages with data for the Linux page cache. This
+ * function will only be called for read ahead and never for demand
+ * paging. For simplicity, the code relies on read_cache_pages() to
+ * correctly lock each page for IO and call zpl_readpage().
+ */
+static int
+zpl_readpages(struct file *filp, struct address_space *mapping,
+ struct list_head *pages, unsigned nr_pages)
+{
+ return (read_cache_pages(mapping, pages,
+ (filler_t *)zpl_readpage, filp));
+}
+
+static int
+zpl_putpage(struct page *pp, struct writeback_control *wbc, void *data)
+{
+ struct address_space *mapping = data;
+ fstrans_cookie_t cookie;
+
+ ASSERT(PageLocked(pp));
+ ASSERT(!PageWriteback(pp));
+
+ cookie = spl_fstrans_mark();
+ (void) zfs_putpage(mapping->host, pp, wbc);
+ spl_fstrans_unmark(cookie);
+
+ return (0);
+}
+
+static int
+zpl_writepages(struct address_space *mapping, struct writeback_control *wbc)
+{
+ znode_t *zp = ITOZ(mapping->host);
+ zfsvfs_t *zfsvfs = ITOZSB(mapping->host);
+ enum writeback_sync_modes sync_mode;
+ int result;
+
+ ZPL_ENTER(zfsvfs);
+ if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ wbc->sync_mode = WB_SYNC_ALL;
+ ZPL_EXIT(zfsvfs);
+ sync_mode = wbc->sync_mode;
+
+ /*
+ * We don't want to run write_cache_pages() in SYNC mode here, because
+ * that would make putpage() wait for a single page to be committed to
+ * disk every single time, resulting in atrocious performance. Instead
+ * we run it once in non-SYNC mode so that the ZIL gets all the data,
+ * and then we commit it all in one go.
+ */
+ wbc->sync_mode = WB_SYNC_NONE;
+ result = write_cache_pages(mapping, wbc, zpl_putpage, mapping);
+ if (sync_mode != wbc->sync_mode) {
+ ZPL_ENTER(zfsvfs);
+ ZPL_VERIFY_ZP(zp);
+ if (zfsvfs->z_log != NULL)
+ zil_commit(zfsvfs->z_log, zp->z_id);
+ ZPL_EXIT(zfsvfs);
+
+ /*
+ * We need to call write_cache_pages() again (we can't just
+ * return after the commit) because the previous call in
+ * non-SYNC mode does not guarantee that we got all the dirty
+ * pages (see the implementation of write_cache_pages() for
+ * details). That being said, this is a no-op in most cases.
+ */
+ wbc->sync_mode = sync_mode;
+ result = write_cache_pages(mapping, wbc, zpl_putpage, mapping);
+ }
+ return (result);
+}
+
+/*
+ * Write out dirty pages to the ARC; this function is only required to
+ * support mmap(2). Mapped pages may be dirtied by memory operations
+ * which never call .write(). These dirty pages are kept in sync with
+ * the ARC buffers via this hook.
+ */
+static int
+zpl_writepage(struct page *pp, struct writeback_control *wbc)
+{
+ if (ITOZSB(pp->mapping->host)->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ wbc->sync_mode = WB_SYNC_ALL;
+
+ return (zpl_putpage(pp, wbc, pp->mapping));
+}
+
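
The reverse direction, a store through the mapping that never goes through .write(), can be sketched the same way; the path below is again hypothetical and the program is not part of this patch. The dirtied page is visible to a following read(2) because the page cache is checked first, and it eventually reaches the ARC through the writeback hooks above.

/* Illustrative sketch: memory stores dirty the page cache, not the ARC, first. */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int
main(void)
{
	char buf[5];
	int fd = open("/tank/fs/mapped.dat", O_RDWR);	/* >= 1 page in size */

	if (fd < 0)
		return (1);

	char *map = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (map == MAP_FAILED)
		return (1);

	/* Dirty the page purely via memory operations, never calling write(2). */
	memcpy(map, "dirty", 5);

	/* read(2) checks the page cache first, so the new data is returned. */
	if (pread(fd, buf, 5, 0) == 5)
		printf("%.5s\n", buf);		/* prints "dirty" */

	/* Explicitly request writeback of the dirtied range. */
	(void) msync(map, 4096, MS_SYNC);

	munmap(map, 4096);
	close(fd);
	return (0);
}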
+/*
+ * The flag combination which matches the behavior of zfs_space() is
+ * FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE. The FALLOC_FL_PUNCH_HOLE
+ * flag was introduced in the 2.6.38 kernel.
+ *
+ * The original mode=0 (allocate space) behavior can be reasonably emulated
+ * by checking if enough space exists and creating a sparse file, as real
+ * persistent space reservation is not possible due to COW, snapshots, etc.
+ */
+static long
+zpl_fallocate_common(struct inode *ip, int mode, loff_t offset, loff_t len)
+{
+ cred_t *cr = CRED();
+ loff_t olen;
+ fstrans_cookie_t cookie;
+ int error = 0;
+
+ if ((mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) != 0)
+ return (-EOPNOTSUPP);
+
+ if (offset < 0 || len <= 0)
+ return (-EINVAL);
+
+ spl_inode_lock(ip);
+ olen = i_size_read(ip);
+
+ crhold(cr);
+ cookie = spl_fstrans_mark();
+ if (mode & FALLOC_FL_PUNCH_HOLE) {
+ flock64_t bf;
+
+ if (offset > olen)
+ goto out_unmark;
+
+ if (offset + len > olen)
+ len = olen - offset;
+ bf.l_type = F_WRLCK;
+ bf.l_whence = SEEK_SET;
+ bf.l_start = offset;
+ bf.l_len = len;
+ bf.l_pid = 0;
+
+ error = -zfs_space(ITOZ(ip), F_FREESP, &bf, O_RDWR, offset, cr);
+ } else if ((mode & ~FALLOC_FL_KEEP_SIZE) == 0) {
+ unsigned int percent = zfs_fallocate_reserve_percent;
+ struct kstatfs statfs;
+
+ /* Legacy mode, disable fallocate compatibility. */
+ if (percent == 0) {
+ error = -EOPNOTSUPP;
+ goto out_unmark;
+ }
+
+ /*
+ * Use zfs_statvfs() instead of dmu_objset_space() since it
+ * also checks project quota limits, which are relevant here.
+ */
+ error = zfs_statvfs(ip, &statfs);
+ if (error)
+ goto out_unmark;
+
+ /*
+ * Shrink available space a bit to account for overhead/races.
+ * We know the product previously fit into availbytes from
+ * dmu_objset_space(), so the smaller product will also fit.
+ */
+ if (len > statfs.f_bavail * (statfs.f_bsize * 100 / percent)) {
+ error = -ENOSPC;
+ goto out_unmark;
+ }
+ if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > olen)
+ error = zfs_freesp(ITOZ(ip), offset + len, 0, 0, FALSE);
+ }
+out_unmark:
+ spl_fstrans_unmark(cookie);
+ spl_inode_unlock(ip);
+
+ crfree(cr);
+
+ return (error);
+}
+
+static long
+zpl_fallocate(struct file *filp, int mode, loff_t offset, loff_t len)
+{
+ return zpl_fallocate_common(file_inode(filp),
+ mode, offset, len);
+}
+
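
A hedged userspace sketch of the two request types this handler accepts (hypothetical path, not part of the patch): a mode 0 preallocation, which the code above emulates with the zfs_fallocate_reserve_percent capacity check, and a hole punch, which must combine FALLOC_FL_PUNCH_HOLE with FALLOC_FL_KEEP_SIZE.

/* Illustrative sketch of the fallocate(2) requests serviced above. */
#define _GNU_SOURCE
#include <fcntl.h>
#include <unistd.h>

int
main(void)
{
	/* Hypothetical file on a ZFS mount. */
	int fd = open("/tank/fs/sparse.dat", O_RDWR | O_CREAT, 0644);

	if (fd < 0)
		return (1);

	/*
	 * mode 0: "allocate" 1 MiB. The handler above emulates this with
	 * the zfs_fallocate_reserve_percent capacity check and a size
	 * update, since COW makes a true reservation impossible.
	 */
	if (fallocate(fd, 0, 0, 1 << 20) != 0)
		return (1);

	/*
	 * Punch a 64 KiB hole. FALLOC_FL_PUNCH_HOLE must be paired with
	 * FALLOC_FL_KEEP_SIZE, matching the flag check above.
	 */
	if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
	    0, 64 << 10) != 0)
		return (1);

	close(fd);
	return (0);
}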
+#define ZFS_FL_USER_VISIBLE (FS_FL_USER_VISIBLE | ZFS_PROJINHERIT_FL)
+#define ZFS_FL_USER_MODIFIABLE (FS_FL_USER_MODIFIABLE | ZFS_PROJINHERIT_FL)
+
+static uint32_t
+__zpl_ioctl_getflags(struct inode *ip)
+{
+ uint64_t zfs_flags = ITOZ(ip)->z_pflags;
+ uint32_t ioctl_flags = 0;
+
+ if (zfs_flags & ZFS_IMMUTABLE)
+ ioctl_flags |= FS_IMMUTABLE_FL;
+
+ if (zfs_flags & ZFS_APPENDONLY)
+ ioctl_flags |= FS_APPEND_FL;
+
+ if (zfs_flags & ZFS_NODUMP)
+ ioctl_flags |= FS_NODUMP_FL;
+
+ if (zfs_flags & ZFS_PROJINHERIT)
+ ioctl_flags |= ZFS_PROJINHERIT_FL;
+
+ return (ioctl_flags & ZFS_FL_USER_VISIBLE);
+}
+
+/*
+ * Map zfs file z_pflags (xvattr_t) to linux file attributes. Only file
+ * attributes common to both Linux and Solaris are mapped.
+ */
+static int
+zpl_ioctl_getflags(struct file *filp, void __user *arg)
+{
+ uint32_t flags;
+ int err;
+
+ flags = __zpl_ioctl_getflags(file_inode(filp));
+ err = copy_to_user(arg, &flags, sizeof (flags));
+
+ return (err);
+}
+
+/*
+ * fchange() is a helper macro to detect if we have been asked to change a
+ * flag. This is ugly, but the requirement that we do this is a consequence of
+ * how the Linux file attribute interface was designed. Another consequence is
+ * that concurrent modification of files suffers from a TOCTOU race. Neither
+ * are things we can fix without modifying the kernel-userland interface, which
+ * is outside of our jurisdiction.
+ */
+
+#define fchange(f0, f1, b0, b1) (!((f0) & (b0)) != !((f1) & (b1)))
+
+static int
+__zpl_ioctl_setflags(struct inode *ip, uint32_t ioctl_flags, xvattr_t *xva)
+{
+ uint64_t zfs_flags = ITOZ(ip)->z_pflags;
+ xoptattr_t *xoap;
+
+ if (ioctl_flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | FS_NODUMP_FL |
+ ZFS_PROJINHERIT_FL))
+ return (-EOPNOTSUPP);
+
+ if (ioctl_flags & ~ZFS_FL_USER_MODIFIABLE)
+ return (-EACCES);
+
+ if ((fchange(ioctl_flags, zfs_flags, FS_IMMUTABLE_FL, ZFS_IMMUTABLE) ||
+ fchange(ioctl_flags, zfs_flags, FS_APPEND_FL, ZFS_APPENDONLY)) &&
+ !capable(CAP_LINUX_IMMUTABLE))
+ return (-EACCES);
+
+ if (!inode_owner_or_capable(ip))
+ return (-EACCES);
+
+ xva_init(xva);
+ xoap = xva_getxoptattr(xva);
+
+ XVA_SET_REQ(xva, XAT_IMMUTABLE);
+ if (ioctl_flags & FS_IMMUTABLE_FL)
+ xoap->xoa_immutable = B_TRUE;
+
+ XVA_SET_REQ(xva, XAT_APPENDONLY);
+ if (ioctl_flags & FS_APPEND_FL)
+ xoap->xoa_appendonly = B_TRUE;
+
+ XVA_SET_REQ(xva, XAT_NODUMP);
+ if (ioctl_flags & FS_NODUMP_FL)
+ xoap->xoa_nodump = B_TRUE;
+
+ XVA_SET_REQ(xva, XAT_PROJINHERIT);
+ if (ioctl_flags & ZFS_PROJINHERIT_FL)
+ xoap->xoa_projinherit = B_TRUE;
+
+ return (0);
+}
+
+static int
+zpl_ioctl_setflags(struct file *filp, void __user *arg)
+{
+ struct inode *ip = file_inode(filp);
+ uint32_t flags;
+ cred_t *cr = CRED();
+ xvattr_t xva;
+ int err;
+ fstrans_cookie_t cookie;
+
+ if (copy_from_user(&flags, arg, sizeof (flags)))
+ return (-EFAULT);
+
+ err = __zpl_ioctl_setflags(ip, flags, &xva);
+ if (err)
+ return (err);
+
+ crhold(cr);
+ cookie = spl_fstrans_mark();
+ err = -zfs_setattr(ITOZ(ip), (vattr_t *)&xva, 0, cr);
+ spl_fstrans_unmark(cookie);
+ crfree(cr);
+
+ return (err);
+}
+
+static int
+zpl_ioctl_getxattr(struct file *filp, void __user *arg)
+{
+ zfsxattr_t fsx = { 0 };
+ struct inode *ip = file_inode(filp);
+ int err;
+
+ fsx.fsx_xflags = __zpl_ioctl_getflags(ip);
+ fsx.fsx_projid = ITOZ(ip)->z_projid;
+ err = copy_to_user(arg, &fsx, sizeof (fsx));
+
+ return (err);
+}
+
+static int
+zpl_ioctl_setxattr(struct file *filp, void __user *arg)
+{
+ struct inode *ip = file_inode(filp);
+ zfsxattr_t fsx;
+ cred_t *cr = CRED();
+ xvattr_t xva;
+ xoptattr_t *xoap;
+ int err;
+ fstrans_cookie_t cookie;
+
+ if (copy_from_user(&fsx, arg, sizeof (fsx)))
+ return (-EFAULT);
+
+ if (!zpl_is_valid_projid(fsx.fsx_projid))
+ return (-EINVAL);
+
+ err = __zpl_ioctl_setflags(ip, fsx.fsx_xflags, &xva);
+ if (err)
+ return (err);
+
+ xoap = xva_getxoptattr(&xva);
+ XVA_SET_REQ(&xva, XAT_PROJID);
+ xoap->xoa_projid = fsx.fsx_projid;
+
+ crhold(cr);
+ cookie = spl_fstrans_mark();
+ err = -zfs_setattr(ITOZ(ip), (vattr_t *)&xva, 0, cr);
+ spl_fstrans_unmark(cookie);
+ crfree(cr);
+
+ return (err);
+}
+
+static long
+zpl_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+{
+ switch (cmd) {
+ case FS_IOC_GETFLAGS:
+ return (zpl_ioctl_getflags(filp, (void *)arg));
+ case FS_IOC_SETFLAGS:
+ return (zpl_ioctl_setflags(filp, (void *)arg));
+ case ZFS_IOC_FSGETXATTR:
+ return (zpl_ioctl_getxattr(filp, (void *)arg));
+ case ZFS_IOC_FSSETXATTR:
+ return (zpl_ioctl_setxattr(filp, (void *)arg));
+ default:
+ return (-ENOTTY);
+ }
+}
+
+#ifdef CONFIG_COMPAT
+static long
+zpl_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+{
+ switch (cmd) {
+ case FS_IOC32_GETFLAGS:
+ cmd = FS_IOC_GETFLAGS;
+ break;
+ case FS_IOC32_SETFLAGS:
+ cmd = FS_IOC_SETFLAGS;
+ break;
+ default:
+ return (-ENOTTY);
+ }
+ return (zpl_ioctl(filp, cmd, (unsigned long)compat_ptr(arg)));
+}
+#endif /* CONFIG_COMPAT */
+
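These are the same ioctls issued by lsattr(1) and chattr(1). The following sketch is illustrative only (hypothetical path, not part of the patch); toggling the append-only flag requires CAP_LINUX_IMMUTABLE, matching the capability check in __zpl_ioctl_setflags().

/* Illustrative sketch of the FS_IOC_GETFLAGS/SETFLAGS ioctls handled above. */
#include <fcntl.h>
#include <linux/fs.h>		/* FS_IOC_GETFLAGS, FS_APPEND_FL */
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>

int
main(void)
{
	unsigned int flags;
	int fd = open("/tank/fs/log.txt", O_RDONLY);	/* hypothetical path */

	if (fd < 0)
		return (1);

	if (ioctl(fd, FS_IOC_GETFLAGS, &flags) != 0)
		return (1);

	/* Set append-only; this maps to ZFS_APPENDONLY in the handler above. */
	flags |= FS_APPEND_FL;
	if (ioctl(fd, FS_IOC_SETFLAGS, &flags) != 0)
		return (1);

	printf("append-only set\n");
	close(fd);
	return (0);
}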
+
+const struct address_space_operations zpl_address_space_operations = {
+ .readpages = zpl_readpages,
+ .readpage = zpl_readpage,
+ .writepage = zpl_writepage,
+ .writepages = zpl_writepages,
+ .direct_IO = zpl_direct_IO,
+};
+
+const struct file_operations zpl_file_operations = {
+ .open = zpl_open,
+ .release = zpl_release,
+ .llseek = zpl_llseek,
+#ifdef HAVE_VFS_RW_ITERATE
+#ifdef HAVE_NEW_SYNC_READ
+ .read = new_sync_read,
+ .write = new_sync_write,
+#endif
+ .read_iter = zpl_iter_read,
+ .write_iter = zpl_iter_write,
+#ifdef HAVE_VFS_IOV_ITER
+ .splice_read = generic_file_splice_read,
+ .splice_write = iter_file_splice_write,
+#endif
+#else
+ .read = do_sync_read,
+ .write = do_sync_write,
+ .aio_read = zpl_aio_read,
+ .aio_write = zpl_aio_write,
+#endif
+ .mmap = zpl_mmap,
+ .fsync = zpl_fsync,
+#ifdef HAVE_FILE_AIO_FSYNC
+ .aio_fsync = zpl_aio_fsync,
+#endif
+ .fallocate = zpl_fallocate,
+ .unlocked_ioctl = zpl_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = zpl_compat_ioctl,
+#endif
+};
+
+const struct file_operations zpl_dir_file_operations = {
+ .llseek = generic_file_llseek,
+ .read = generic_read_dir,
+#if defined(HAVE_VFS_ITERATE_SHARED)
+ .iterate_shared = zpl_iterate,
+#elif defined(HAVE_VFS_ITERATE)
+ .iterate = zpl_iterate,
+#else
+ .readdir = zpl_readdir,
+#endif
+ .fsync = zpl_fsync,
+ .unlocked_ioctl = zpl_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = zpl_compat_ioctl,
+#endif
+};
+
+/* BEGIN CSTYLED */
+module_param(zfs_fallocate_reserve_percent, uint, 0644);
+MODULE_PARM_DESC(zfs_fallocate_reserve_percent,
+ "Percentage of length to use for the available capacity check");
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_inode.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_inode.c
new file mode 100644
index 000000000000..e79d334edc9b
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_inode.c
@@ -0,0 +1,745 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2011, Lawrence Livermore National Security, LLC.
+ * Copyright (c) 2015 by Chunwei Chen. All rights reserved.
+ */
+
+
+#include <sys/zfs_ctldir.h>
+#include <sys/zfs_vfsops.h>
+#include <sys/zfs_vnops.h>
+#include <sys/zfs_znode.h>
+#include <sys/dmu_objset.h>
+#include <sys/vfs.h>
+#include <sys/zpl.h>
+#include <sys/file.h>
+
+
+static struct dentry *
+zpl_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
+{
+ cred_t *cr = CRED();
+ struct inode *ip;
+ znode_t *zp;
+ int error;
+ fstrans_cookie_t cookie;
+ pathname_t *ppn = NULL;
+ pathname_t pn;
+ int zfs_flags = 0;
+ zfsvfs_t *zfsvfs = dentry->d_sb->s_fs_info;
+
+ if (dlen(dentry) >= ZAP_MAXNAMELEN)
+ return (ERR_PTR(-ENAMETOOLONG));
+
+ crhold(cr);
+ cookie = spl_fstrans_mark();
+
+ /* If we are a case insensitive fs, we need the real name */
+ if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) {
+ zfs_flags = FIGNORECASE;
+ pn_alloc(&pn);
+ ppn = &pn;
+ }
+
+ error = -zfs_lookup(ITOZ(dir), dname(dentry), &zp,
+ zfs_flags, cr, NULL, ppn);
+ spl_fstrans_unmark(cookie);
+ ASSERT3S(error, <=, 0);
+ crfree(cr);
+
+ spin_lock(&dentry->d_lock);
+ dentry->d_time = jiffies;
+ spin_unlock(&dentry->d_lock);
+
+ if (error) {
+ /*
+ * If we have a case sensitive fs, we do not want to
+ * insert negative entries, so return NULL for ENOENT.
+ * Fall through if the error is not ENOENT. Also free memory.
+ */
+ if (ppn) {
+ pn_free(ppn);
+ if (error == -ENOENT)
+ return (NULL);
+ }
+
+ if (error == -ENOENT)
+ return (d_splice_alias(NULL, dentry));
+ else
+ return (ERR_PTR(error));
+ }
+ ip = ZTOI(zp);
+
+ /*
+ * If we are case insensitive, call the correct function
+ * to install the name.
+ */
+ if (ppn) {
+ struct dentry *new_dentry;
+ struct qstr ci_name;
+
+ if (strcmp(dname(dentry), pn.pn_buf) == 0) {
+ new_dentry = d_splice_alias(ip, dentry);
+ } else {
+ ci_name.name = pn.pn_buf;
+ ci_name.len = strlen(pn.pn_buf);
+ new_dentry = d_add_ci(dentry, ip, &ci_name);
+ }
+ pn_free(ppn);
+ return (new_dentry);
+ } else {
+ return (d_splice_alias(ip, dentry));
+ }
+}
+
+void
+zpl_vap_init(vattr_t *vap, struct inode *dir, umode_t mode, cred_t *cr)
+{
+ vap->va_mask = ATTR_MODE;
+ vap->va_mode = mode;
+ vap->va_uid = crgetfsuid(cr);
+
+ if (dir && dir->i_mode & S_ISGID) {
+ vap->va_gid = KGID_TO_SGID(dir->i_gid);
+ if (S_ISDIR(mode))
+ vap->va_mode |= S_ISGID;
+ } else {
+ vap->va_gid = crgetfsgid(cr);
+ }
+}
+
+static int
+zpl_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool flag)
+{
+ cred_t *cr = CRED();
+ znode_t *zp;
+ vattr_t *vap;
+ int error;
+ fstrans_cookie_t cookie;
+
+ crhold(cr);
+ vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP);
+ zpl_vap_init(vap, dir, mode, cr);
+
+ cookie = spl_fstrans_mark();
+ error = -zfs_create(ITOZ(dir), dname(dentry), vap, 0,
+ mode, &zp, cr, 0, NULL);
+ if (error == 0) {
+ d_instantiate(dentry, ZTOI(zp));
+
+ error = zpl_xattr_security_init(ZTOI(zp), dir, &dentry->d_name);
+ if (error == 0)
+ error = zpl_init_acl(ZTOI(zp), dir);
+
+ if (error)
+ (void) zfs_remove(ITOZ(dir), dname(dentry), cr, 0);
+ }
+
+ spl_fstrans_unmark(cookie);
+ kmem_free(vap, sizeof (vattr_t));
+ crfree(cr);
+ ASSERT3S(error, <=, 0);
+
+ return (error);
+}
+
+static int
+zpl_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
+ dev_t rdev)
+{
+ cred_t *cr = CRED();
+ znode_t *zp;
+ vattr_t *vap;
+ int error;
+ fstrans_cookie_t cookie;
+
+ /*
+ * We currently expect Linux to supply rdev=0 for all sockets
+ * and fifos, but we want to know if this behavior ever changes.
+ */
+ if (S_ISSOCK(mode) || S_ISFIFO(mode))
+ ASSERT(rdev == 0);
+
+ crhold(cr);
+ vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP);
+ zpl_vap_init(vap, dir, mode, cr);
+ vap->va_rdev = rdev;
+
+ cookie = spl_fstrans_mark();
+ error = -zfs_create(ITOZ(dir), dname(dentry), vap, 0,
+ mode, &zp, cr, 0, NULL);
+ if (error == 0) {
+ d_instantiate(dentry, ZTOI(zp));
+
+ error = zpl_xattr_security_init(ZTOI(zp), dir, &dentry->d_name);
+ if (error == 0)
+ error = zpl_init_acl(ZTOI(zp), dir);
+
+ if (error)
+ (void) zfs_remove(ITOZ(dir), dname(dentry), cr, 0);
+ }
+
+ spl_fstrans_unmark(cookie);
+ kmem_free(vap, sizeof (vattr_t));
+ crfree(cr);
+ ASSERT3S(error, <=, 0);
+
+ return (error);
+}
+
+#ifdef HAVE_TMPFILE
+static int
+zpl_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+ cred_t *cr = CRED();
+ struct inode *ip;
+ vattr_t *vap;
+ int error;
+ fstrans_cookie_t cookie;
+
+ crhold(cr);
+ vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP);
+ /*
+ * The VFS does not apply the umask, therefore it is applied here
+ * when POSIX ACLs are not enabled.
+ */
+ if (!IS_POSIXACL(dir))
+ mode &= ~current_umask();
+ zpl_vap_init(vap, dir, mode, cr);
+
+ cookie = spl_fstrans_mark();
+ error = -zfs_tmpfile(dir, vap, 0, mode, &ip, cr, 0, NULL);
+ if (error == 0) {
+ /* d_tmpfile will do drop_nlink, so we should set it first */
+ set_nlink(ip, 1);
+ d_tmpfile(dentry, ip);
+
+ error = zpl_xattr_security_init(ip, dir, &dentry->d_name);
+ if (error == 0)
+ error = zpl_init_acl(ip, dir);
+ /*
+ * don't need to handle error here, file is already in
+ * unlinked set.
+ */
+ }
+
+ spl_fstrans_unmark(cookie);
+ kmem_free(vap, sizeof (vattr_t));
+ crfree(cr);
+ ASSERT3S(error, <=, 0);
+
+ return (error);
+}
+#endif
+
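The .tmpfile hook above is what backs O_TMPFILE opens on a ZFS directory. A minimal sketch under stated assumptions: the directory /tank/fs is hypothetical, and the linkat(2) step follows the generic recipe from the open(2) manual page rather than anything ZFS specific.

/* Illustrative sketch: O_TMPFILE reaches zpl_tmpfile(), linkat(2) names it. */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	char path[64];
	/* Hypothetical directory on a ZFS mount. */
	int fd = open("/tank/fs", O_TMPFILE | O_RDWR, 0600);

	if (fd < 0)
		return (1);

	/* The inode lives only in the unlinked set until it is linked below. */
	if (write(fd, "scratch", 7) != 7)
		return (1);

	/* Generic recipe from open(2): give the anonymous file a name. */
	snprintf(path, sizeof (path), "/proc/self/fd/%d", fd);
	if (linkat(AT_FDCWD, path, AT_FDCWD, "/tank/fs/scratch.dat",
	    AT_SYMLINK_FOLLOW) != 0)
		return (1);

	close(fd);
	return (0);
}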
+static int
+zpl_unlink(struct inode *dir, struct dentry *dentry)
+{
+ cred_t *cr = CRED();
+ int error;
+ fstrans_cookie_t cookie;
+ zfsvfs_t *zfsvfs = dentry->d_sb->s_fs_info;
+
+ crhold(cr);
+ cookie = spl_fstrans_mark();
+ error = -zfs_remove(ITOZ(dir), dname(dentry), cr, 0);
+
+ /*
+ * For a CI FS we must invalidate the dentry to prevent the
+ * creation of negative entries.
+ */
+ if (error == 0 && zfsvfs->z_case == ZFS_CASE_INSENSITIVE)
+ d_invalidate(dentry);
+
+ spl_fstrans_unmark(cookie);
+ crfree(cr);
+ ASSERT3S(error, <=, 0);
+
+ return (error);
+}
+
+static int
+zpl_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+ cred_t *cr = CRED();
+ vattr_t *vap;
+ znode_t *zp;
+ int error;
+ fstrans_cookie_t cookie;
+
+ crhold(cr);
+ vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP);
+ zpl_vap_init(vap, dir, mode | S_IFDIR, cr);
+
+ cookie = spl_fstrans_mark();
+ error = -zfs_mkdir(ITOZ(dir), dname(dentry), vap, &zp, cr, 0, NULL);
+ if (error == 0) {
+ d_instantiate(dentry, ZTOI(zp));
+
+ error = zpl_xattr_security_init(ZTOI(zp), dir, &dentry->d_name);
+ if (error == 0)
+ error = zpl_init_acl(ZTOI(zp), dir);
+
+ if (error)
+ (void) zfs_rmdir(ITOZ(dir), dname(dentry), NULL, cr, 0);
+ }
+
+ spl_fstrans_unmark(cookie);
+ kmem_free(vap, sizeof (vattr_t));
+ crfree(cr);
+ ASSERT3S(error, <=, 0);
+
+ return (error);
+}
+
+static int
+zpl_rmdir(struct inode *dir, struct dentry *dentry)
+{
+ cred_t *cr = CRED();
+ int error;
+ fstrans_cookie_t cookie;
+ zfsvfs_t *zfsvfs = dentry->d_sb->s_fs_info;
+
+ crhold(cr);
+ cookie = spl_fstrans_mark();
+ error = -zfs_rmdir(ITOZ(dir), dname(dentry), NULL, cr, 0);
+
+ /*
+ * For a CI FS we must invalidate the dentry to prevent the
+ * creation of negative entries.
+ */
+ if (error == 0 && zfsvfs->z_case == ZFS_CASE_INSENSITIVE)
+ d_invalidate(dentry);
+
+ spl_fstrans_unmark(cookie);
+ crfree(cr);
+ ASSERT3S(error, <=, 0);
+
+ return (error);
+}
+
+static int
+zpl_getattr_impl(const struct path *path, struct kstat *stat, u32 request_mask,
+ unsigned int query_flags)
+{
+ int error;
+ fstrans_cookie_t cookie;
+
+ cookie = spl_fstrans_mark();
+
+ /*
+ * XXX request_mask and query_flags currently ignored.
+ */
+
+ error = -zfs_getattr_fast(path->dentry->d_inode, stat);
+ spl_fstrans_unmark(cookie);
+ ASSERT3S(error, <=, 0);
+
+ return (error);
+}
+ZPL_GETATTR_WRAPPER(zpl_getattr);
+
+static int
+zpl_setattr(struct dentry *dentry, struct iattr *ia)
+{
+ struct inode *ip = dentry->d_inode;
+ cred_t *cr = CRED();
+ vattr_t *vap;
+ int error;
+ fstrans_cookie_t cookie;
+
+ error = setattr_prepare(dentry, ia);
+ if (error)
+ return (error);
+
+ crhold(cr);
+ vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP);
+ vap->va_mask = ia->ia_valid & ATTR_IATTR_MASK;
+ vap->va_mode = ia->ia_mode;
+ vap->va_uid = KUID_TO_SUID(ia->ia_uid);
+ vap->va_gid = KGID_TO_SGID(ia->ia_gid);
+ vap->va_size = ia->ia_size;
+ vap->va_atime = ia->ia_atime;
+ vap->va_mtime = ia->ia_mtime;
+ vap->va_ctime = ia->ia_ctime;
+
+ if (vap->va_mask & ATTR_ATIME)
+ ip->i_atime = zpl_inode_timestamp_truncate(ia->ia_atime, ip);
+
+ cookie = spl_fstrans_mark();
+ error = -zfs_setattr(ITOZ(ip), vap, 0, cr);
+ if (!error && (ia->ia_valid & ATTR_MODE))
+ error = zpl_chmod_acl(ip);
+
+ spl_fstrans_unmark(cookie);
+ kmem_free(vap, sizeof (vattr_t));
+ crfree(cr);
+ ASSERT3S(error, <=, 0);
+
+ return (error);
+}
+
+static int
+zpl_rename2(struct inode *sdip, struct dentry *sdentry,
+ struct inode *tdip, struct dentry *tdentry, unsigned int flags)
+{
+ cred_t *cr = CRED();
+ int error;
+ fstrans_cookie_t cookie;
+
+ /* We don't have renameat2(2) support */
+ if (flags)
+ return (-EINVAL);
+
+ crhold(cr);
+ cookie = spl_fstrans_mark();
+ error = -zfs_rename(ITOZ(sdip), dname(sdentry), ITOZ(tdip),
+ dname(tdentry), cr, 0);
+ spl_fstrans_unmark(cookie);
+ crfree(cr);
+ ASSERT3S(error, <=, 0);
+
+ return (error);
+}
+
+#ifndef HAVE_RENAME_WANTS_FLAGS
+static int
+zpl_rename(struct inode *sdip, struct dentry *sdentry,
+ struct inode *tdip, struct dentry *tdentry)
+{
+ return (zpl_rename2(sdip, sdentry, tdip, tdentry, 0));
+}
+#endif
+
+static int
+zpl_symlink(struct inode *dir, struct dentry *dentry, const char *name)
+{
+ cred_t *cr = CRED();
+ vattr_t *vap;
+ znode_t *zp;
+ int error;
+ fstrans_cookie_t cookie;
+
+ crhold(cr);
+ vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP);
+ zpl_vap_init(vap, dir, S_IFLNK | S_IRWXUGO, cr);
+
+ cookie = spl_fstrans_mark();
+ error = -zfs_symlink(ITOZ(dir), dname(dentry), vap,
+ (char *)name, &zp, cr, 0);
+ if (error == 0) {
+ d_instantiate(dentry, ZTOI(zp));
+
+ error = zpl_xattr_security_init(ZTOI(zp), dir, &dentry->d_name);
+ if (error)
+ (void) zfs_remove(ITOZ(dir), dname(dentry), cr, 0);
+ }
+
+ spl_fstrans_unmark(cookie);
+ kmem_free(vap, sizeof (vattr_t));
+ crfree(cr);
+ ASSERT3S(error, <=, 0);
+
+ return (error);
+}
+
+#if defined(HAVE_PUT_LINK_COOKIE)
+static void
+zpl_put_link(struct inode *unused, void *cookie)
+{
+ kmem_free(cookie, MAXPATHLEN);
+}
+#elif defined(HAVE_PUT_LINK_NAMEIDATA)
+static void
+zpl_put_link(struct dentry *dentry, struct nameidata *nd, void *ptr)
+{
+ const char *link = nd_get_link(nd);
+
+ if (!IS_ERR(link))
+ kmem_free(link, MAXPATHLEN);
+}
+#elif defined(HAVE_PUT_LINK_DELAYED)
+static void
+zpl_put_link(void *ptr)
+{
+ kmem_free(ptr, MAXPATHLEN);
+}
+#endif
+
+static int
+zpl_get_link_common(struct dentry *dentry, struct inode *ip, char **link)
+{
+ fstrans_cookie_t cookie;
+ cred_t *cr = CRED();
+ int error;
+
+ crhold(cr);
+ *link = NULL;
+
+ struct iovec iov;
+ iov.iov_len = MAXPATHLEN;
+ iov.iov_base = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
+
+ zfs_uio_t uio;
+ zfs_uio_iovec_init(&uio, &iov, 1, 0, UIO_SYSSPACE, MAXPATHLEN - 1, 0);
+
+ cookie = spl_fstrans_mark();
+ error = -zfs_readlink(ip, &uio, cr);
+ spl_fstrans_unmark(cookie);
+ crfree(cr);
+
+ if (error)
+ kmem_free(iov.iov_base, MAXPATHLEN);
+ else
+ *link = iov.iov_base;
+
+ return (error);
+}
+
+#if defined(HAVE_GET_LINK_DELAYED)
+static const char *
+zpl_get_link(struct dentry *dentry, struct inode *inode,
+ struct delayed_call *done)
+{
+ char *link = NULL;
+ int error;
+
+ if (!dentry)
+ return (ERR_PTR(-ECHILD));
+
+ error = zpl_get_link_common(dentry, inode, &link);
+ if (error)
+ return (ERR_PTR(error));
+
+ set_delayed_call(done, zpl_put_link, link);
+
+ return (link);
+}
+#elif defined(HAVE_GET_LINK_COOKIE)
+static const char *
+zpl_get_link(struct dentry *dentry, struct inode *inode, void **cookie)
+{
+ char *link = NULL;
+ int error;
+
+ if (!dentry)
+ return (ERR_PTR(-ECHILD));
+
+ error = zpl_get_link_common(dentry, inode, &link);
+ if (error)
+ return (ERR_PTR(error));
+
+ return (*cookie = link);
+}
+#elif defined(HAVE_FOLLOW_LINK_COOKIE)
+static const char *
+zpl_follow_link(struct dentry *dentry, void **cookie)
+{
+ char *link = NULL;
+ int error;
+
+ error = zpl_get_link_common(dentry, dentry->d_inode, &link);
+ if (error)
+ return (ERR_PTR(error));
+
+ return (*cookie = link);
+}
+#elif defined(HAVE_FOLLOW_LINK_NAMEIDATA)
+static void *
+zpl_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+ char *link = NULL;
+ int error;
+
+ error = zpl_get_link_common(dentry, dentry->d_inode, &link);
+ if (error)
+ nd_set_link(nd, ERR_PTR(error));
+ else
+ nd_set_link(nd, link);
+
+ return (NULL);
+}
+#endif
+
+static int
+zpl_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
+{
+ cred_t *cr = CRED();
+ struct inode *ip = old_dentry->d_inode;
+ int error;
+ fstrans_cookie_t cookie;
+
+ if (ip->i_nlink >= ZFS_LINK_MAX)
+ return (-EMLINK);
+
+ crhold(cr);
+ ip->i_ctime = current_time(ip);
+ igrab(ip); /* Use ihold() if available */
+
+ cookie = spl_fstrans_mark();
+ error = -zfs_link(ITOZ(dir), ITOZ(ip), dname(dentry), cr, 0);
+ if (error) {
+ iput(ip);
+ goto out;
+ }
+
+ d_instantiate(dentry, ip);
+out:
+ spl_fstrans_unmark(cookie);
+ crfree(cr);
+ ASSERT3S(error, <=, 0);
+
+ return (error);
+}
+
+static int
+#ifdef HAVE_D_REVALIDATE_NAMEIDATA
+zpl_revalidate(struct dentry *dentry, struct nameidata *nd)
+{
+ unsigned int flags = (nd ? nd->flags : 0);
+#else
+zpl_revalidate(struct dentry *dentry, unsigned int flags)
+{
+#endif /* HAVE_D_REVALIDATE_NAMEIDATA */
+ /* CSTYLED */
+ zfsvfs_t *zfsvfs = dentry->d_sb->s_fs_info;
+ int error;
+
+ if (flags & LOOKUP_RCU)
+ return (-ECHILD);
+
+ /*
+ * After a rollback negative dentries created before the rollback
+ * time must be invalidated. Otherwise they can obscure files which
+ * are only present in the rolled back dataset.
+ */
+ if (dentry->d_inode == NULL) {
+ spin_lock(&dentry->d_lock);
+ error = time_before(dentry->d_time, zfsvfs->z_rollback_time);
+ spin_unlock(&dentry->d_lock);
+
+ if (error)
+ return (0);
+ }
+
+ /*
+ * The dentry may reference a stale inode if a mounted file system
+ * was rolled back to a point in time where the object didn't exist.
+ */
+ if (dentry->d_inode && ITOZ(dentry->d_inode)->z_is_stale)
+ return (0);
+
+ return (1);
+}
+
+const struct inode_operations zpl_inode_operations = {
+ .setattr = zpl_setattr,
+ .getattr = zpl_getattr,
+#ifdef HAVE_GENERIC_SETXATTR
+ .setxattr = generic_setxattr,
+ .getxattr = generic_getxattr,
+ .removexattr = generic_removexattr,
+#endif
+ .listxattr = zpl_xattr_list,
+#if defined(CONFIG_FS_POSIX_ACL)
+#if defined(HAVE_SET_ACL)
+ .set_acl = zpl_set_acl,
+#endif /* HAVE_SET_ACL */
+ .get_acl = zpl_get_acl,
+#endif /* CONFIG_FS_POSIX_ACL */
+};
+
+const struct inode_operations zpl_dir_inode_operations = {
+ .create = zpl_create,
+ .lookup = zpl_lookup,
+ .link = zpl_link,
+ .unlink = zpl_unlink,
+ .symlink = zpl_symlink,
+ .mkdir = zpl_mkdir,
+ .rmdir = zpl_rmdir,
+ .mknod = zpl_mknod,
+#ifdef HAVE_RENAME_WANTS_FLAGS
+ .rename = zpl_rename2,
+#else
+ .rename = zpl_rename,
+#endif
+#ifdef HAVE_TMPFILE
+ .tmpfile = zpl_tmpfile,
+#endif
+ .setattr = zpl_setattr,
+ .getattr = zpl_getattr,
+#ifdef HAVE_GENERIC_SETXATTR
+ .setxattr = generic_setxattr,
+ .getxattr = generic_getxattr,
+ .removexattr = generic_removexattr,
+#endif
+ .listxattr = zpl_xattr_list,
+#if defined(CONFIG_FS_POSIX_ACL)
+#if defined(HAVE_SET_ACL)
+ .set_acl = zpl_set_acl,
+#endif /* HAVE_SET_ACL */
+ .get_acl = zpl_get_acl,
+#endif /* CONFIG_FS_POSIX_ACL */
+};
+
+const struct inode_operations zpl_symlink_inode_operations = {
+#ifdef HAVE_GENERIC_READLINK
+ .readlink = generic_readlink,
+#endif
+#if defined(HAVE_GET_LINK_DELAYED) || defined(HAVE_GET_LINK_COOKIE)
+ .get_link = zpl_get_link,
+#elif defined(HAVE_FOLLOW_LINK_COOKIE) || defined(HAVE_FOLLOW_LINK_NAMEIDATA)
+ .follow_link = zpl_follow_link,
+#endif
+#if defined(HAVE_PUT_LINK_COOKIE) || defined(HAVE_PUT_LINK_NAMEIDATA)
+ .put_link = zpl_put_link,
+#endif
+ .setattr = zpl_setattr,
+ .getattr = zpl_getattr,
+#ifdef HAVE_GENERIC_SETXATTR
+ .setxattr = generic_setxattr,
+ .getxattr = generic_getxattr,
+ .removexattr = generic_removexattr,
+#endif
+ .listxattr = zpl_xattr_list,
+};
+
+const struct inode_operations zpl_special_inode_operations = {
+ .setattr = zpl_setattr,
+ .getattr = zpl_getattr,
+#ifdef HAVE_GENERIC_SETXATTR
+ .setxattr = generic_setxattr,
+ .getxattr = generic_getxattr,
+ .removexattr = generic_removexattr,
+#endif
+ .listxattr = zpl_xattr_list,
+#if defined(CONFIG_FS_POSIX_ACL)
+#if defined(HAVE_SET_ACL)
+ .set_acl = zpl_set_acl,
+#endif /* HAVE_SET_ACL */
+ .get_acl = zpl_get_acl,
+#endif /* CONFIG_FS_POSIX_ACL */
+};
+
+dentry_operations_t zpl_dentry_operations = {
+ .d_revalidate = zpl_revalidate,
+};
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c
new file mode 100644
index 000000000000..c2fd3fee1401
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_super.c
@@ -0,0 +1,365 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2011, Lawrence Livermore National Security, LLC.
+ */
+
+
+#include <sys/zfs_znode.h>
+#include <sys/zfs_vfsops.h>
+#include <sys/zfs_vnops.h>
+#include <sys/zfs_ctldir.h>
+#include <sys/zpl.h>
+
+
+static struct inode *
+zpl_inode_alloc(struct super_block *sb)
+{
+ struct inode *ip;
+
+ VERIFY3S(zfs_inode_alloc(sb, &ip), ==, 0);
+ inode_set_iversion(ip, 1);
+
+ return (ip);
+}
+
+static void
+zpl_inode_destroy(struct inode *ip)
+{
+ ASSERT(atomic_read(&ip->i_count) == 0);
+ zfs_inode_destroy(ip);
+}
+
+/*
+ * Called from __mark_inode_dirty() to reflect that something in the
+ * inode has changed. We use it to ensure the znode system attributes
+ * are always strictly up to date with respect to the inode.
+ */
+#ifdef HAVE_DIRTY_INODE_WITH_FLAGS
+static void
+zpl_dirty_inode(struct inode *ip, int flags)
+{
+ fstrans_cookie_t cookie;
+
+ cookie = spl_fstrans_mark();
+ zfs_dirty_inode(ip, flags);
+ spl_fstrans_unmark(cookie);
+}
+#else
+static void
+zpl_dirty_inode(struct inode *ip)
+{
+ fstrans_cookie_t cookie;
+
+ cookie = spl_fstrans_mark();
+ zfs_dirty_inode(ip, 0);
+ spl_fstrans_unmark(cookie);
+}
+#endif /* HAVE_DIRTY_INODE_WITH_FLAGS */
+
+/*
+ * When ->drop_inode() is called its return value indicates if the
+ * inode should be evicted from the inode cache. If the inode is
+ * unhashed and has no links the default policy is to evict it
+ * immediately.
+ *
+ * The ->evict_inode() callback must minimally truncate the inode pages,
+ * and call clear_inode(). For 2.6.35 and later kernels this will
+ * simply update the inode state, with the sync occurring before the
+ * truncate in evict(). For earlier kernels clear_inode() maps to
+ * end_writeback() which is responsible for completing all outstanding
+ * write back. In either case, once this is done it is safe to clean
+ * up any remaining inode specific data via zfs_inactive().
+ */
+static void
+zpl_evict_inode(struct inode *ip)
+{
+ fstrans_cookie_t cookie;
+
+ cookie = spl_fstrans_mark();
+ truncate_setsize(ip, 0);
+ clear_inode(ip);
+ zfs_inactive(ip);
+ spl_fstrans_unmark(cookie);
+}
+
+static void
+zpl_put_super(struct super_block *sb)
+{
+ fstrans_cookie_t cookie;
+ int error;
+
+ cookie = spl_fstrans_mark();
+ error = -zfs_umount(sb);
+ spl_fstrans_unmark(cookie);
+ ASSERT3S(error, <=, 0);
+}
+
+static int
+zpl_sync_fs(struct super_block *sb, int wait)
+{
+ fstrans_cookie_t cookie;
+ cred_t *cr = CRED();
+ int error;
+
+ crhold(cr);
+ cookie = spl_fstrans_mark();
+ error = -zfs_sync(sb, wait, cr);
+ spl_fstrans_unmark(cookie);
+ crfree(cr);
+ ASSERT3S(error, <=, 0);
+
+ return (error);
+}
+
+static int
+zpl_statfs(struct dentry *dentry, struct kstatfs *statp)
+{
+ fstrans_cookie_t cookie;
+ int error;
+
+ cookie = spl_fstrans_mark();
+ error = -zfs_statvfs(dentry->d_inode, statp);
+ spl_fstrans_unmark(cookie);
+ ASSERT3S(error, <=, 0);
+
+ /*
+ * If required by a 32-bit system call, dynamically scale the
+ * block size up to 16MiB and decrease the block counts. This
+ * allows for a maximum size of 64EiB to be reported. The file
+ * counts must be artificially capped at 2^32-1.
+ */
+ if (unlikely(zpl_is_32bit_api())) {
+ while (statp->f_blocks > UINT32_MAX &&
+ statp->f_bsize < SPA_MAXBLOCKSIZE) {
+ statp->f_frsize <<= 1;
+ statp->f_bsize <<= 1;
+
+ statp->f_blocks >>= 1;
+ statp->f_bfree >>= 1;
+ statp->f_bavail >>= 1;
+ }
+
+ uint64_t usedobjs = statp->f_files - statp->f_ffree;
+ statp->f_ffree = MIN(statp->f_ffree, UINT32_MAX - usedobjs);
+ statp->f_files = statp->f_ffree + usedobjs;
+ }
+
+ return (error);
+}
+
+static int
+zpl_remount_fs(struct super_block *sb, int *flags, char *data)
+{
+ zfs_mnt_t zm = { .mnt_osname = NULL, .mnt_data = data };
+ fstrans_cookie_t cookie;
+ int error;
+
+ cookie = spl_fstrans_mark();
+ error = -zfs_remount(sb, flags, &zm);
+ spl_fstrans_unmark(cookie);
+ ASSERT3S(error, <=, 0);
+
+ return (error);
+}
+
+static int
+__zpl_show_devname(struct seq_file *seq, zfsvfs_t *zfsvfs)
+{
+ ZPL_ENTER(zfsvfs);
+
+ char *fsname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
+ dmu_objset_name(zfsvfs->z_os, fsname);
+
+ for (int i = 0; fsname[i] != 0; i++) {
+ /*
+ * Spaces in the dataset name must be converted to their
+ * octal escape sequence for getmntent(3) to correctly
+		 * parse the fsname portion of /proc/self/mounts.
+ */
+ if (fsname[i] == ' ') {
+ seq_puts(seq, "\\040");
+ } else {
+ seq_putc(seq, fsname[i]);
+ }
+ }
+
+ kmem_free(fsname, ZFS_MAX_DATASET_NAME_LEN);
+
+ ZPL_EXIT(zfsvfs);
+
+ return (0);
+}
+
+static int
+zpl_show_devname(struct seq_file *seq, struct dentry *root)
+{
+ return (__zpl_show_devname(seq, root->d_sb->s_fs_info));
+}
+
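As a concrete illustration of the escaping above (the dataset name is assumed): a dataset called "tank/my data" is emitted as "tank/my\040data", and getmntent(3) decodes it back to a space. A small sketch, not part of the patch:

/* Illustrative sketch: getmntent(3) decoding the escaped dataset name. */
#include <mntent.h>
#include <stdio.h>
#include <string.h>

int
main(void)
{
	FILE *fp = setmntent("/proc/self/mounts", "r");
	struct mntent *m;

	if (fp == NULL)
		return (1);

	/* "tank/my\040data" in /proc/self/mounts comes back as "tank/my data". */
	while ((m = getmntent(fp)) != NULL) {
		if (strcmp(m->mnt_type, "zfs") == 0)
			printf("%s mounted at %s\n", m->mnt_fsname, m->mnt_dir);
	}

	endmntent(fp);
	return (0);
}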
+static int
+__zpl_show_options(struct seq_file *seq, zfsvfs_t *zfsvfs)
+{
+ seq_printf(seq, ",%s",
+ zfsvfs->z_flags & ZSB_XATTR ? "xattr" : "noxattr");
+
+#ifdef CONFIG_FS_POSIX_ACL
+ switch (zfsvfs->z_acl_type) {
+ case ZFS_ACLTYPE_POSIX:
+ seq_puts(seq, ",posixacl");
+ break;
+ default:
+ seq_puts(seq, ",noacl");
+ break;
+ }
+#endif /* CONFIG_FS_POSIX_ACL */
+
+ return (0);
+}
+
+static int
+zpl_show_options(struct seq_file *seq, struct dentry *root)
+{
+ return (__zpl_show_options(seq, root->d_sb->s_fs_info));
+}
+
+static int
+zpl_fill_super(struct super_block *sb, void *data, int silent)
+{
+ zfs_mnt_t *zm = (zfs_mnt_t *)data;
+ fstrans_cookie_t cookie;
+ int error;
+
+ cookie = spl_fstrans_mark();
+ error = -zfs_domount(sb, zm, silent);
+ spl_fstrans_unmark(cookie);
+ ASSERT3S(error, <=, 0);
+
+ return (error);
+}
+
+static int
+zpl_test_super(struct super_block *s, void *data)
+{
+ zfsvfs_t *zfsvfs = s->s_fs_info;
+ objset_t *os = data;
+
+ if (zfsvfs == NULL)
+ return (0);
+
+ return (os == zfsvfs->z_os);
+}
+
+static struct super_block *
+zpl_mount_impl(struct file_system_type *fs_type, int flags, zfs_mnt_t *zm)
+{
+ struct super_block *s;
+ objset_t *os;
+ int err;
+
+ err = dmu_objset_hold(zm->mnt_osname, FTAG, &os);
+ if (err)
+ return (ERR_PTR(-err));
+
+ /*
+ * The dsl pool lock must be released prior to calling sget().
+ * It is possible sget() may block on the lock in grab_super()
+ * while deactivate_super() holds that same lock and waits for
+ * a txg sync. If the dsl_pool lock is held over sget()
+ * this can prevent the pool sync and cause a deadlock.
+ */
+ dsl_dataset_long_hold(dmu_objset_ds(os), FTAG);
+ dsl_pool_rele(dmu_objset_pool(os), FTAG);
+
+ s = sget(fs_type, zpl_test_super, set_anon_super, flags, os);
+
+ dsl_dataset_long_rele(dmu_objset_ds(os), FTAG);
+ dsl_dataset_rele(dmu_objset_ds(os), FTAG);
+
+ if (IS_ERR(s))
+ return (ERR_CAST(s));
+
+ if (s->s_root == NULL) {
+ err = zpl_fill_super(s, zm, flags & SB_SILENT ? 1 : 0);
+ if (err) {
+ deactivate_locked_super(s);
+ return (ERR_PTR(err));
+ }
+ s->s_flags |= SB_ACTIVE;
+ } else if ((flags ^ s->s_flags) & SB_RDONLY) {
+ deactivate_locked_super(s);
+ return (ERR_PTR(-EBUSY));
+ }
+
+ return (s);
+}
+
+static struct dentry *
+zpl_mount(struct file_system_type *fs_type, int flags,
+ const char *osname, void *data)
+{
+ zfs_mnt_t zm = { .mnt_osname = osname, .mnt_data = data };
+
+ struct super_block *sb = zpl_mount_impl(fs_type, flags, &zm);
+ if (IS_ERR(sb))
+ return (ERR_CAST(sb));
+
+ return (dget(sb->s_root));
+}
+
+static void
+zpl_kill_sb(struct super_block *sb)
+{
+ zfs_preumount(sb);
+ kill_anon_super(sb);
+}
+
+void
+zpl_prune_sb(int64_t nr_to_scan, void *arg)
+{
+ struct super_block *sb = (struct super_block *)arg;
+ int objects = 0;
+
+ (void) -zfs_prune(sb, nr_to_scan, &objects);
+}
+
+const struct super_operations zpl_super_operations = {
+ .alloc_inode = zpl_inode_alloc,
+ .destroy_inode = zpl_inode_destroy,
+ .dirty_inode = zpl_dirty_inode,
+ .write_inode = NULL,
+ .evict_inode = zpl_evict_inode,
+ .put_super = zpl_put_super,
+ .sync_fs = zpl_sync_fs,
+ .statfs = zpl_statfs,
+ .remount_fs = zpl_remount_fs,
+ .show_devname = zpl_show_devname,
+ .show_options = zpl_show_options,
+ .show_stats = NULL,
+};
+
+struct file_system_type zpl_fs_type = {
+ .owner = THIS_MODULE,
+ .name = ZFS_DRIVER,
+ .mount = zpl_mount,
+ .kill_sb = zpl_kill_sb,
+};
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_xattr.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_xattr.c
new file mode 100644
index 000000000000..83812f2dcba8
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_xattr.c
@@ -0,0 +1,1486 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2011, Lawrence Livermore National Security, LLC.
+ *
+ * Extended attributes (xattr) on Solaris are implemented as files
+ * which exist in a hidden xattr directory. These extended attributes
+ * can be accessed using the attropen() system call which opens
+ * the extended attribute. It can then be manipulated just like
+ * a standard file descriptor. This has a couple of advantages such
+ * as practically no size limit on the file, and the extended
+ * attributes' permissions may differ from those of the parent file.
+ * This interface is really quite clever, but it's also completely
+ * different from what is supported on Linux. It also comes with a
+ * steep performance penalty when accessing small xattrs because they
+ * are not stored with the parent file.
+ *
+ * Under Linux extended attributes are manipulated by the system
+ * calls getxattr(2), setxattr(2), and listxattr(2). They consider
+ * extended attributes to be name/value pairs where the name is a
+ * NULL terminated string. The name must also include one of the
+ * following namespace prefixes:
+ *
+ * user - No restrictions and is available to user applications.
+ * trusted - Restricted to kernel and root (CAP_SYS_ADMIN) use.
+ * system - Used for access control lists (system.nfs4_acl, etc).
+ * security - Used by SELinux to store a file's security context.
+ *
+ * The value under Linux is limited to 65536 bytes of binary data.
+ * In practice, individual xattrs tend to be much smaller than this
+ * and are typically less than 100 bytes. A good example of this
+ * are the security.selinux xattrs which are less than 100 bytes and
+ * exist for every file when xattr labeling is enabled.
+ *
+ * The Linux xattr implementation has been written to take advantage of
+ * this typical usage. When the dataset property 'xattr=sa' is set,
+ * then xattrs will be preferentially stored as System Attributes (SA).
+ * This allows tiny xattrs (~100 bytes) to be stored with the dnode and
+ * up to 64k of xattrs to be stored in the spill block. If additional
+ * xattr space is required, which is unlikely under Linux, they will
+ * be stored using the traditional directory approach.
+ *
+ * This optimization results in roughly a 3x performance improvement
+ * when accessing xattrs because it avoids the need to perform a seek
+ * for every xattr value. When multiple xattrs are stored per-file
+ * the performance improvements are even greater because all of the
+ * xattrs stored in the spill block will be cached.
+ *
+ * However, by default SA based xattrs are disabled in the Linux port
+ * to maximize compatibility with other implementations. If you do
+ * enable SA based xattrs then they will not be visible on platforms
+ * which do not support this feature.
+ *
+ * NOTE: One additional consequence of the xattr directory implementation
+ * is that when an extended attribute is manipulated an inode is created.
+ * This inode will exist in the Linux inode cache but there will be no
+ * associated entry in the dentry cache which references it. This is
+ * safe but it may result in some confusion. Enabling SA based xattrs
+ * largely avoids the issue except in the overflow case.
+ */
+
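A hedged sketch of the Linux interface described above (the path is hypothetical and the program is not part of this patch): setting, reading, and listing a small user.* attribute, exactly the kind of ~100 byte value that xattr=sa stores as a System Attribute.

/* Illustrative sketch of the xattr syscalls served by this file. */
#include <stdio.h>
#include <string.h>
#include <sys/types.h>
#include <sys/xattr.h>

int
main(void)
{
	const char *path = "/tank/fs/file.txt";	/* hypothetical path */
	char value[64], list[256];
	ssize_t len;

	/* XATTR_CREATE fails with EEXIST if the attribute already exists. */
	if (setxattr(path, "user.mime_type", "text/plain", 10,
	    XATTR_CREATE) != 0)
		return (1);

	len = getxattr(path, "user.mime_type", value, sizeof (value));
	if (len < 0)
		return (1);
	printf("user.mime_type = %.*s\n", (int)len, value);

	/* listxattr(2) returns a buffer of NUL-terminated attribute names. */
	len = listxattr(path, list, sizeof (list));
	for (ssize_t off = 0; off < len; off += strlen(list + off) + 1)
		printf("xattr: %s\n", list + off);

	return (0);
}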
+#include <sys/zfs_znode.h>
+#include <sys/zfs_vfsops.h>
+#include <sys/zfs_vnops.h>
+#include <sys/zap.h>
+#include <sys/vfs.h>
+#include <sys/zpl.h>
+
+typedef struct xattr_filldir {
+ size_t size;
+ size_t offset;
+ char *buf;
+ struct dentry *dentry;
+} xattr_filldir_t;
+
+static const struct xattr_handler *zpl_xattr_handler(const char *);
+
+static int
+zpl_xattr_permission(xattr_filldir_t *xf, const char *name, int name_len)
+{
+ static const struct xattr_handler *handler;
+ struct dentry *d = xf->dentry;
+
+ handler = zpl_xattr_handler(name);
+ if (!handler)
+ return (0);
+
+ if (handler->list) {
+#if defined(HAVE_XATTR_LIST_SIMPLE)
+ if (!handler->list(d))
+ return (0);
+#elif defined(HAVE_XATTR_LIST_DENTRY)
+ if (!handler->list(d, NULL, 0, name, name_len, 0))
+ return (0);
+#elif defined(HAVE_XATTR_LIST_HANDLER)
+ if (!handler->list(handler, d, NULL, 0, name, name_len))
+ return (0);
+#endif
+ }
+
+ return (1);
+}
+
+/*
+ * Determine if a given xattr name should be visible and, if so, copy it
+ * into the provided buffer (xf->buf).
+ */
+static int
+zpl_xattr_filldir(xattr_filldir_t *xf, const char *name, int name_len)
+{
+ /* Check permissions using the per-namespace list xattr handler. */
+ if (!zpl_xattr_permission(xf, name, name_len))
+ return (0);
+
+ /* When xf->buf is NULL only calculate the required size. */
+ if (xf->buf) {
+ if (xf->offset + name_len + 1 > xf->size)
+ return (-ERANGE);
+
+ memcpy(xf->buf + xf->offset, name, name_len);
+ xf->buf[xf->offset + name_len] = '\0';
+ }
+
+ xf->offset += (name_len + 1);
+
+ return (0);
+}
+
+/*
+ * Read as many directory entry names as will fit into the provided buffer,
+ * or when no buffer is provided calculate the required buffer size.
+ */
+static int
+zpl_xattr_readdir(struct inode *dxip, xattr_filldir_t *xf)
+{
+ zap_cursor_t zc;
+ zap_attribute_t zap;
+ int error;
+
+ zap_cursor_init(&zc, ITOZSB(dxip)->z_os, ITOZ(dxip)->z_id);
+
+ while ((error = -zap_cursor_retrieve(&zc, &zap)) == 0) {
+
+ if (zap.za_integer_length != 8 || zap.za_num_integers != 1) {
+ error = -ENXIO;
+ break;
+ }
+
+ error = zpl_xattr_filldir(xf, zap.za_name, strlen(zap.za_name));
+ if (error)
+ break;
+
+ zap_cursor_advance(&zc);
+ }
+
+ zap_cursor_fini(&zc);
+
+ if (error == -ENOENT)
+ error = 0;
+
+ return (error);
+}
+
+static ssize_t
+zpl_xattr_list_dir(xattr_filldir_t *xf, cred_t *cr)
+{
+ struct inode *ip = xf->dentry->d_inode;
+ struct inode *dxip = NULL;
+ znode_t *dxzp;
+ int error;
+
+ /* Lookup the xattr directory */
+ error = -zfs_lookup(ITOZ(ip), NULL, &dxzp, LOOKUP_XATTR,
+ cr, NULL, NULL);
+ if (error) {
+ if (error == -ENOENT)
+ error = 0;
+
+ return (error);
+ }
+
+ dxip = ZTOI(dxzp);
+ error = zpl_xattr_readdir(dxip, xf);
+ iput(dxip);
+
+ return (error);
+}
+
+static ssize_t
+zpl_xattr_list_sa(xattr_filldir_t *xf)
+{
+ znode_t *zp = ITOZ(xf->dentry->d_inode);
+ nvpair_t *nvp = NULL;
+ int error = 0;
+
+ mutex_enter(&zp->z_lock);
+ if (zp->z_xattr_cached == NULL)
+ error = -zfs_sa_get_xattr(zp);
+ mutex_exit(&zp->z_lock);
+
+ if (error)
+ return (error);
+
+ ASSERT(zp->z_xattr_cached);
+
+ while ((nvp = nvlist_next_nvpair(zp->z_xattr_cached, nvp)) != NULL) {
+ ASSERT3U(nvpair_type(nvp), ==, DATA_TYPE_BYTE_ARRAY);
+
+ error = zpl_xattr_filldir(xf, nvpair_name(nvp),
+ strlen(nvpair_name(nvp)));
+ if (error)
+ return (error);
+ }
+
+ return (0);
+}
+
+ssize_t
+zpl_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size)
+{
+ znode_t *zp = ITOZ(dentry->d_inode);
+ zfsvfs_t *zfsvfs = ZTOZSB(zp);
+ xattr_filldir_t xf = { buffer_size, 0, buffer, dentry };
+ cred_t *cr = CRED();
+ fstrans_cookie_t cookie;
+ int error = 0;
+
+ crhold(cr);
+ cookie = spl_fstrans_mark();
+ ZPL_ENTER(zfsvfs);
+ ZPL_VERIFY_ZP(zp);
+ rw_enter(&zp->z_xattr_lock, RW_READER);
+
+ if (zfsvfs->z_use_sa && zp->z_is_sa) {
+ error = zpl_xattr_list_sa(&xf);
+ if (error)
+ goto out;
+ }
+
+ error = zpl_xattr_list_dir(&xf, cr);
+ if (error)
+ goto out;
+
+ error = xf.offset;
+out:
+
+ rw_exit(&zp->z_xattr_lock);
+ ZPL_EXIT(zfsvfs);
+ spl_fstrans_unmark(cookie);
+ crfree(cr);
+
+ return (error);
+}
+
+static int
+zpl_xattr_get_dir(struct inode *ip, const char *name, void *value,
+ size_t size, cred_t *cr)
+{
+ fstrans_cookie_t cookie;
+ struct inode *xip = NULL;
+ znode_t *dxzp = NULL;
+ znode_t *xzp = NULL;
+ int error;
+
+ /* Lookup the xattr directory */
+ error = -zfs_lookup(ITOZ(ip), NULL, &dxzp, LOOKUP_XATTR,
+ cr, NULL, NULL);
+ if (error)
+ goto out;
+
+ /* Lookup a specific xattr name in the directory */
+ error = -zfs_lookup(dxzp, (char *)name, &xzp, 0, cr, NULL, NULL);
+ if (error)
+ goto out;
+
+ xip = ZTOI(xzp);
+ if (!size) {
+ error = i_size_read(xip);
+ goto out;
+ }
+
+ if (size < i_size_read(xip)) {
+ error = -ERANGE;
+ goto out;
+ }
+
+ struct iovec iov;
+ iov.iov_base = (void *)value;
+ iov.iov_len = size;
+
+ zfs_uio_t uio;
+ zfs_uio_iovec_init(&uio, &iov, 1, 0, UIO_SYSSPACE, size, 0);
+
+ cookie = spl_fstrans_mark();
+ error = -zfs_read(ITOZ(xip), &uio, 0, cr);
+ spl_fstrans_unmark(cookie);
+
+ if (error == 0)
+ error = size - zfs_uio_resid(&uio);
+out:
+ if (xzp)
+ zrele(xzp);
+
+ if (dxzp)
+ zrele(dxzp);
+
+ return (error);
+}
+
+static int
+zpl_xattr_get_sa(struct inode *ip, const char *name, void *value, size_t size)
+{
+ znode_t *zp = ITOZ(ip);
+ uchar_t *nv_value;
+ uint_t nv_size;
+ int error = 0;
+
+ ASSERT(RW_LOCK_HELD(&zp->z_xattr_lock));
+
+ mutex_enter(&zp->z_lock);
+ if (zp->z_xattr_cached == NULL)
+ error = -zfs_sa_get_xattr(zp);
+ mutex_exit(&zp->z_lock);
+
+ if (error)
+ return (error);
+
+ ASSERT(zp->z_xattr_cached);
+ error = -nvlist_lookup_byte_array(zp->z_xattr_cached, name,
+ &nv_value, &nv_size);
+ if (error)
+ return (error);
+
+ if (size == 0 || value == NULL)
+ return (nv_size);
+
+ if (size < nv_size)
+ return (-ERANGE);
+
+ memcpy(value, nv_value, nv_size);
+
+ return (nv_size);
+}
+
+static int
+__zpl_xattr_get(struct inode *ip, const char *name, void *value, size_t size,
+ cred_t *cr)
+{
+ znode_t *zp = ITOZ(ip);
+ zfsvfs_t *zfsvfs = ZTOZSB(zp);
+ int error;
+
+ ASSERT(RW_LOCK_HELD(&zp->z_xattr_lock));
+
+ if (zfsvfs->z_use_sa && zp->z_is_sa) {
+ error = zpl_xattr_get_sa(ip, name, value, size);
+ if (error != -ENOENT)
+ goto out;
+ }
+
+ error = zpl_xattr_get_dir(ip, name, value, size, cr);
+out:
+ if (error == -ENOENT)
+ error = -ENODATA;
+
+ return (error);
+}
+
+#define XATTR_NOENT 0x0
+#define XATTR_IN_SA 0x1
+#define XATTR_IN_DIR 0x2
+/* check where the xattr resides */
+static int
+__zpl_xattr_where(struct inode *ip, const char *name, int *where, cred_t *cr)
+{
+ znode_t *zp = ITOZ(ip);
+ zfsvfs_t *zfsvfs = ZTOZSB(zp);
+ int error;
+
+ ASSERT(where);
+ ASSERT(RW_LOCK_HELD(&zp->z_xattr_lock));
+
+ *where = XATTR_NOENT;
+ if (zfsvfs->z_use_sa && zp->z_is_sa) {
+ error = zpl_xattr_get_sa(ip, name, NULL, 0);
+ if (error >= 0)
+ *where |= XATTR_IN_SA;
+ else if (error != -ENOENT)
+ return (error);
+ }
+
+ error = zpl_xattr_get_dir(ip, name, NULL, 0, cr);
+ if (error >= 0)
+ *where |= XATTR_IN_DIR;
+ else if (error != -ENOENT)
+ return (error);
+
+ if (*where == (XATTR_IN_SA|XATTR_IN_DIR))
+ cmn_err(CE_WARN, "ZFS: inode %p has xattr \"%s\""
+ " in both SA and dir", ip, name);
+ if (*where == XATTR_NOENT)
+ error = -ENODATA;
+ else
+ error = 0;
+ return (error);
+}
+
+static int
+zpl_xattr_get(struct inode *ip, const char *name, void *value, size_t size)
+{
+ znode_t *zp = ITOZ(ip);
+ zfsvfs_t *zfsvfs = ZTOZSB(zp);
+ cred_t *cr = CRED();
+ fstrans_cookie_t cookie;
+ int error;
+
+ crhold(cr);
+ cookie = spl_fstrans_mark();
+ ZPL_ENTER(zfsvfs);
+ ZPL_VERIFY_ZP(zp);
+ rw_enter(&zp->z_xattr_lock, RW_READER);
+ error = __zpl_xattr_get(ip, name, value, size, cr);
+ rw_exit(&zp->z_xattr_lock);
+ ZPL_EXIT(zfsvfs);
+ spl_fstrans_unmark(cookie);
+ crfree(cr);
+
+ return (error);
+}
+
+static int
+zpl_xattr_set_dir(struct inode *ip, const char *name, const void *value,
+ size_t size, int flags, cred_t *cr)
+{
+ znode_t *dxzp = NULL;
+ znode_t *xzp = NULL;
+ vattr_t *vap = NULL;
+ int lookup_flags, error;
+ const int xattr_mode = S_IFREG | 0644;
+ loff_t pos = 0;
+
+ /*
+ * Lookup the xattr directory. When we're adding an entry pass
+ * CREATE_XATTR_DIR to ensure the xattr directory is created.
+ * When removing an entry this flag is not passed to avoid
+ * unnecessarily creating a new xattr directory.
+ */
+ lookup_flags = LOOKUP_XATTR;
+ if (value != NULL)
+ lookup_flags |= CREATE_XATTR_DIR;
+
+ error = -zfs_lookup(ITOZ(ip), NULL, &dxzp, lookup_flags,
+ cr, NULL, NULL);
+ if (error)
+ goto out;
+
+ /* Lookup a specific xattr name in the directory */
+ error = -zfs_lookup(dxzp, (char *)name, &xzp, 0, cr, NULL, NULL);
+ if (error && (error != -ENOENT))
+ goto out;
+
+ error = 0;
+
+	/* Remove the named xattr when value is set to NULL. */
+ if (value == NULL) {
+ if (xzp)
+ error = -zfs_remove(dxzp, (char *)name, cr, 0);
+
+ goto out;
+ }
+
+	/* Lookup failed, create a new xattr. */
+ if (xzp == NULL) {
+ vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP);
+ vap->va_mode = xattr_mode;
+ vap->va_mask = ATTR_MODE;
+ vap->va_uid = crgetfsuid(cr);
+ vap->va_gid = crgetfsgid(cr);
+
+ error = -zfs_create(dxzp, (char *)name, vap, 0, 0644, &xzp,
+ cr, 0, NULL);
+ if (error)
+ goto out;
+ }
+
+ ASSERT(xzp != NULL);
+
+ error = -zfs_freesp(xzp, 0, 0, xattr_mode, TRUE);
+ if (error)
+ goto out;
+
+ error = -zfs_write_simple(xzp, value, size, pos, NULL);
+out:
+ if (error == 0) {
+ ip->i_ctime = current_time(ip);
+ zfs_mark_inode_dirty(ip);
+ }
+
+ if (vap)
+ kmem_free(vap, sizeof (vattr_t));
+
+ if (xzp)
+ zrele(xzp);
+
+ if (dxzp)
+ zrele(dxzp);
+
+ if (error == -ENOENT)
+ error = -ENODATA;
+
+ ASSERT3S(error, <=, 0);
+
+ return (error);
+}
+
+static int
+zpl_xattr_set_sa(struct inode *ip, const char *name, const void *value,
+ size_t size, int flags, cred_t *cr)
+{
+ znode_t *zp = ITOZ(ip);
+ nvlist_t *nvl;
+ size_t sa_size;
+ int error = 0;
+
+ mutex_enter(&zp->z_lock);
+ if (zp->z_xattr_cached == NULL)
+ error = -zfs_sa_get_xattr(zp);
+ mutex_exit(&zp->z_lock);
+
+ if (error)
+ return (error);
+
+ ASSERT(zp->z_xattr_cached);
+ nvl = zp->z_xattr_cached;
+
+ if (value == NULL) {
+ error = -nvlist_remove(nvl, name, DATA_TYPE_BYTE_ARRAY);
+ if (error == -ENOENT)
+ error = zpl_xattr_set_dir(ip, name, NULL, 0, flags, cr);
+ } else {
+ /* Limited to 32k to keep nvpair memory allocations small */
+ if (size > DXATTR_MAX_ENTRY_SIZE)
+ return (-EFBIG);
+
+ /* Prevent the DXATTR SA from consuming the entire SA region */
+ error = -nvlist_size(nvl, &sa_size, NV_ENCODE_XDR);
+ if (error)
+ return (error);
+
+ if (sa_size > DXATTR_MAX_SA_SIZE)
+ return (-EFBIG);
+
+ error = -nvlist_add_byte_array(nvl, name,
+ (uchar_t *)value, size);
+ }
+
+ /*
+ * Update the SA for additions, modifications, and removals. On
+	 * error drop the inconsistent cached version of the nvlist; it
+ * will be reconstructed from the ARC when next accessed.
+ */
+ if (error == 0)
+ error = -zfs_sa_set_xattr(zp);
+
+ if (error) {
+ nvlist_free(nvl);
+ zp->z_xattr_cached = NULL;
+ }
+
+ ASSERT3S(error, <=, 0);
+
+ return (error);
+}
+
+static int
+zpl_xattr_set(struct inode *ip, const char *name, const void *value,
+ size_t size, int flags)
+{
+ znode_t *zp = ITOZ(ip);
+ zfsvfs_t *zfsvfs = ZTOZSB(zp);
+ cred_t *cr = CRED();
+ fstrans_cookie_t cookie;
+ int where;
+ int error;
+
+ crhold(cr);
+ cookie = spl_fstrans_mark();
+ ZPL_ENTER(zfsvfs);
+ ZPL_VERIFY_ZP(zp);
+ rw_enter(&ITOZ(ip)->z_xattr_lock, RW_WRITER);
+
+ /*
+ * Before setting the xattr check to see if it already exists.
+ * This is done to ensure the following optional flags are honored.
+ *
+ * XATTR_CREATE: fail if xattr already exists
+ * XATTR_REPLACE: fail if xattr does not exist
+ *
+ * We also want to know if it resides in sa or dir, so we can make
+	 * sure we don't end up with a duplicate in both places.
+ */
+ error = __zpl_xattr_where(ip, name, &where, cr);
+ if (error < 0) {
+ if (error != -ENODATA)
+ goto out;
+ if (flags & XATTR_REPLACE)
+ goto out;
+
+ /* The xattr to be removed already doesn't exist */
+ error = 0;
+ if (value == NULL)
+ goto out;
+ } else {
+ error = -EEXIST;
+ if (flags & XATTR_CREATE)
+ goto out;
+ }
+
+ /* Preferentially store the xattr as a SA for better performance */
+ if (zfsvfs->z_use_sa && zp->z_is_sa &&
+ (zfsvfs->z_xattr_sa || (value == NULL && where & XATTR_IN_SA))) {
+ error = zpl_xattr_set_sa(ip, name, value, size, flags, cr);
+ if (error == 0) {
+ /*
+ * Successfully put into SA, we need to clear the one
+ * in dir.
+ */
+ if (where & XATTR_IN_DIR)
+ zpl_xattr_set_dir(ip, name, NULL, 0, 0, cr);
+ goto out;
+ }
+ }
+
+ error = zpl_xattr_set_dir(ip, name, value, size, flags, cr);
+ /*
+ * Successfully put into dir, we need to clear the one in SA.
+ */
+ if (error == 0 && (where & XATTR_IN_SA))
+ zpl_xattr_set_sa(ip, name, NULL, 0, 0, cr);
+out:
+ rw_exit(&ITOZ(ip)->z_xattr_lock);
+ ZPL_EXIT(zfsvfs);
+ spl_fstrans_unmark(cookie);
+ crfree(cr);
+ ASSERT3S(error, <=, 0);
+
+ return (error);
+}
+
+/*
+ * Extended user attributes
+ *
+ * "Extended user attributes may be assigned to files and directories for
+ * storing arbitrary additional information such as the mime type,
+ * character set or encoding of a file. The access permissions for user
+ * attributes are defined by the file permission bits: read permission
+ * is required to retrieve the attribute value, and write permission is
+ * required to change it.
+ *
+ * The file permission bits of regular files and directories are
+ * interpreted differently from the file permission bits of special
+ * files and symbolic links. For regular files and directories the file
+ * permission bits define access to the file's contents, while for
+ * device special files they define access to the device described by
+ * the special file. The file permissions of symbolic links are not
+ * used in access checks. These differences would allow users to
+ * consume filesystem resources in a way not controllable by disk quotas
+ * for group or world writable special files and directories.
+ *
+ * For this reason, extended user attributes are allowed only for
+ * regular files and directories, and access to extended user attributes
+ * is restricted to the owner and to users with appropriate capabilities
+ * for directories with the sticky bit set (see the chmod(1) manual page
+ * for an explanation of the sticky bit)." - xattr(7)
+ *
+ * ZFS allows extended user attributes to be disabled administratively
+ * by setting the 'xattr=off' property on the dataset.
+ */
+static int
+__zpl_xattr_user_list(struct inode *ip, char *list, size_t list_size,
+ const char *name, size_t name_len)
+{
+ return (ITOZSB(ip)->z_flags & ZSB_XATTR);
+}
+ZPL_XATTR_LIST_WRAPPER(zpl_xattr_user_list);
+
+static int
+__zpl_xattr_user_get(struct inode *ip, const char *name,
+ void *value, size_t size)
+{
+ char *xattr_name;
+ int error;
+ /* xattr_resolve_name will do this for us if this is defined */
+#ifndef HAVE_XATTR_HANDLER_NAME
+ if (strcmp(name, "") == 0)
+ return (-EINVAL);
+#endif
+ if (!(ITOZSB(ip)->z_flags & ZSB_XATTR))
+ return (-EOPNOTSUPP);
+
+ xattr_name = kmem_asprintf("%s%s", XATTR_USER_PREFIX, name);
+ error = zpl_xattr_get(ip, xattr_name, value, size);
+ kmem_strfree(xattr_name);
+
+ return (error);
+}
+ZPL_XATTR_GET_WRAPPER(zpl_xattr_user_get);
+
+static int
+__zpl_xattr_user_set(struct inode *ip, const char *name,
+ const void *value, size_t size, int flags)
+{
+ char *xattr_name;
+ int error;
+ /* xattr_resolve_name will do this for us if this is defined */
+#ifndef HAVE_XATTR_HANDLER_NAME
+ if (strcmp(name, "") == 0)
+ return (-EINVAL);
+#endif
+ if (!(ITOZSB(ip)->z_flags & ZSB_XATTR))
+ return (-EOPNOTSUPP);
+
+ xattr_name = kmem_asprintf("%s%s", XATTR_USER_PREFIX, name);
+ error = zpl_xattr_set(ip, xattr_name, value, size, flags);
+ kmem_strfree(xattr_name);
+
+ return (error);
+}
+ZPL_XATTR_SET_WRAPPER(zpl_xattr_user_set);
+
+xattr_handler_t zpl_xattr_user_handler =
+{
+ .prefix = XATTR_USER_PREFIX,
+ .list = zpl_xattr_user_list,
+ .get = zpl_xattr_user_get,
+ .set = zpl_xattr_user_set,
+};
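+
+/*
+ * For illustration, a minimal userspace sketch: with xattr=on, a "user."
+ * attribute set from userspace is routed through zpl_xattr_user_set() and
+ * zpl_xattr_user_get() above. The path is hypothetical; per xattr(7), read
+ * permission on the file is needed to retrieve the value and write
+ * permission to change it.
+ *
+ *	#include <stdio.h>
+ *	#include <string.h>
+ *	#include <sys/types.h>
+ *	#include <sys/xattr.h>
+ *
+ *	int
+ *	main(void)
+ *	{
+ *		const char *path = "/tank/fs/file";	// hypothetical
+ *		const char *mime = "text/plain";
+ *		char buf[64];
+ *		ssize_t len;
+ *
+ *		if (setxattr(path, "user.mime_type", mime, strlen(mime), 0))
+ *			return (1);
+ *
+ *		len = getxattr(path, "user.mime_type", buf, sizeof (buf));
+ *		if (len > 0)
+ *			printf("%.*s\n", (int)len, buf);
+ *		return (0);
+ *	}
+ */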
+
+/*
+ * Trusted extended attributes
+ *
+ * "Trusted extended attributes are visible and accessible only to
+ * processes that have the CAP_SYS_ADMIN capability. Attributes in this
+ * class are used to implement mechanisms in user space (i.e., outside
+ * the kernel) which keep information in extended attributes to which
+ * ordinary processes should not have access." - xattr(7)
+ */
+static int
+__zpl_xattr_trusted_list(struct inode *ip, char *list, size_t list_size,
+ const char *name, size_t name_len)
+{
+ return (capable(CAP_SYS_ADMIN));
+}
+ZPL_XATTR_LIST_WRAPPER(zpl_xattr_trusted_list);
+
+static int
+__zpl_xattr_trusted_get(struct inode *ip, const char *name,
+ void *value, size_t size)
+{
+ char *xattr_name;
+ int error;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return (-EACCES);
+ /* xattr_resolve_name will do this for us if this is defined */
+#ifndef HAVE_XATTR_HANDLER_NAME
+ if (strcmp(name, "") == 0)
+ return (-EINVAL);
+#endif
+ xattr_name = kmem_asprintf("%s%s", XATTR_TRUSTED_PREFIX, name);
+ error = zpl_xattr_get(ip, xattr_name, value, size);
+ kmem_strfree(xattr_name);
+
+ return (error);
+}
+ZPL_XATTR_GET_WRAPPER(zpl_xattr_trusted_get);
+
+static int
+__zpl_xattr_trusted_set(struct inode *ip, const char *name,
+ const void *value, size_t size, int flags)
+{
+ char *xattr_name;
+ int error;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return (-EACCES);
+ /* xattr_resolve_name will do this for us if this is defined */
+#ifndef HAVE_XATTR_HANDLER_NAME
+ if (strcmp(name, "") == 0)
+ return (-EINVAL);
+#endif
+ xattr_name = kmem_asprintf("%s%s", XATTR_TRUSTED_PREFIX, name);
+ error = zpl_xattr_set(ip, xattr_name, value, size, flags);
+ kmem_strfree(xattr_name);
+
+ return (error);
+}
+ZPL_XATTR_SET_WRAPPER(zpl_xattr_trusted_set);
+
+xattr_handler_t zpl_xattr_trusted_handler =
+{
+ .prefix = XATTR_TRUSTED_PREFIX,
+ .list = zpl_xattr_trusted_list,
+ .get = zpl_xattr_trusted_get,
+ .set = zpl_xattr_trusted_set,
+};
+
+/*
+ * Extended security attributes
+ *
+ * "The security attribute namespace is used by kernel security modules,
+ * such as Security Enhanced Linux, and also to implement file
+ * capabilities (see capabilities(7)). Read and write access
+ * permissions to security attributes depend on the policy implemented
+ * for each security attribute by the security module. When no security
+ * module is loaded, all processes have read access to extended security
+ * attributes, and write access is limited to processes that have the
+ * CAP_SYS_ADMIN capability." - xattr(7)
+ */
+static int
+__zpl_xattr_security_list(struct inode *ip, char *list, size_t list_size,
+ const char *name, size_t name_len)
+{
+ return (1);
+}
+ZPL_XATTR_LIST_WRAPPER(zpl_xattr_security_list);
+
+static int
+__zpl_xattr_security_get(struct inode *ip, const char *name,
+ void *value, size_t size)
+{
+ char *xattr_name;
+ int error;
+ /* xattr_resolve_name will do this for us if this is defined */
+#ifndef HAVE_XATTR_HANDLER_NAME
+ if (strcmp(name, "") == 0)
+ return (-EINVAL);
+#endif
+ xattr_name = kmem_asprintf("%s%s", XATTR_SECURITY_PREFIX, name);
+ error = zpl_xattr_get(ip, xattr_name, value, size);
+ kmem_strfree(xattr_name);
+
+ return (error);
+}
+ZPL_XATTR_GET_WRAPPER(zpl_xattr_security_get);
+
+static int
+__zpl_xattr_security_set(struct inode *ip, const char *name,
+ const void *value, size_t size, int flags)
+{
+ char *xattr_name;
+ int error;
+ /* xattr_resolve_name will do this for us if this is defined */
+#ifndef HAVE_XATTR_HANDLER_NAME
+ if (strcmp(name, "") == 0)
+ return (-EINVAL);
+#endif
+ xattr_name = kmem_asprintf("%s%s", XATTR_SECURITY_PREFIX, name);
+ error = zpl_xattr_set(ip, xattr_name, value, size, flags);
+ kmem_strfree(xattr_name);
+
+ return (error);
+}
+ZPL_XATTR_SET_WRAPPER(zpl_xattr_security_set);
+
+static int
+zpl_xattr_security_init_impl(struct inode *ip, const struct xattr *xattrs,
+ void *fs_info)
+{
+ const struct xattr *xattr;
+ int error = 0;
+
+ for (xattr = xattrs; xattr->name != NULL; xattr++) {
+ error = __zpl_xattr_security_set(ip,
+ xattr->name, xattr->value, xattr->value_len, 0);
+
+ if (error < 0)
+ break;
+ }
+
+ return (error);
+}
+
+int
+zpl_xattr_security_init(struct inode *ip, struct inode *dip,
+ const struct qstr *qstr)
+{
+ return security_inode_init_security(ip, dip, qstr,
+ &zpl_xattr_security_init_impl, NULL);
+}
+
+/*
+ * Security xattr namespace handlers.
+ */
+xattr_handler_t zpl_xattr_security_handler = {
+ .prefix = XATTR_SECURITY_PREFIX,
+ .list = zpl_xattr_security_list,
+ .get = zpl_xattr_security_get,
+ .set = zpl_xattr_security_set,
+};
+
+/*
+ * Extended system attributes
+ *
+ * "Extended system attributes are used by the kernel to store system
+ * objects such as Access Control Lists. Read and write access permissions
+ * to system attributes depend on the policy implemented for each system
+ * attribute implemented by filesystems in the kernel." - xattr(7)
+ */
+#ifdef CONFIG_FS_POSIX_ACL
+#ifndef HAVE_SET_ACL
+static
+#endif
+int
+zpl_set_acl(struct inode *ip, struct posix_acl *acl, int type)
+{
+ char *name, *value = NULL;
+ int error = 0;
+ size_t size = 0;
+
+ if (S_ISLNK(ip->i_mode))
+ return (-EOPNOTSUPP);
+
+ switch (type) {
+ case ACL_TYPE_ACCESS:
+ name = XATTR_NAME_POSIX_ACL_ACCESS;
+ if (acl) {
+ umode_t mode = ip->i_mode;
+ error = posix_acl_equiv_mode(acl, &mode);
+ if (error < 0) {
+ return (error);
+ } else {
+ /*
+ * The mode bits will have been set by
+ * ->zfs_setattr()->zfs_acl_chmod_setattr()
+ * using the ZFS ACL conversion. If they
+ * differ from the Posix ACL conversion dirty
+ * the inode to write the Posix mode bits.
+ */
+ if (ip->i_mode != mode) {
+ ip->i_mode = mode;
+ ip->i_ctime = current_time(ip);
+ zfs_mark_inode_dirty(ip);
+ }
+
+ if (error == 0)
+ acl = NULL;
+ }
+ }
+ break;
+
+ case ACL_TYPE_DEFAULT:
+ name = XATTR_NAME_POSIX_ACL_DEFAULT;
+ if (!S_ISDIR(ip->i_mode))
+ return (acl ? -EACCES : 0);
+ break;
+
+ default:
+ return (-EINVAL);
+ }
+
+ if (acl) {
+ size = posix_acl_xattr_size(acl->a_count);
+ value = kmem_alloc(size, KM_SLEEP);
+
+ error = zpl_acl_to_xattr(acl, value, size);
+ if (error < 0) {
+ kmem_free(value, size);
+ return (error);
+ }
+ }
+
+ error = zpl_xattr_set(ip, name, value, size, 0);
+ if (value)
+ kmem_free(value, size);
+
+ if (!error) {
+ if (acl)
+ zpl_set_cached_acl(ip, type, acl);
+ else
+ zpl_forget_cached_acl(ip, type);
+ }
+
+ return (error);
+}
+
+struct posix_acl *
+zpl_get_acl(struct inode *ip, int type)
+{
+ struct posix_acl *acl;
+ void *value = NULL;
+ char *name;
+ int size;
+
+ /*
+ * As of Linux 3.14, the kernel get_acl will check this for us.
+	 * Also, as of Linux 4.7, comparing against ACL_NOT_CACHED is wrong
+	 * as the kernel get_acl will set it to a temporary sentinel value.
+ */
+#ifndef HAVE_KERNEL_GET_ACL_HANDLE_CACHE
+ acl = get_cached_acl(ip, type);
+ if (acl != ACL_NOT_CACHED)
+ return (acl);
+#endif
+
+ switch (type) {
+ case ACL_TYPE_ACCESS:
+ name = XATTR_NAME_POSIX_ACL_ACCESS;
+ break;
+ case ACL_TYPE_DEFAULT:
+ name = XATTR_NAME_POSIX_ACL_DEFAULT;
+ break;
+ default:
+ return (ERR_PTR(-EINVAL));
+ }
+
+ size = zpl_xattr_get(ip, name, NULL, 0);
+ if (size > 0) {
+ value = kmem_alloc(size, KM_SLEEP);
+ size = zpl_xattr_get(ip, name, value, size);
+ }
+
+ if (size > 0) {
+ acl = zpl_acl_from_xattr(value, size);
+ } else if (size == -ENODATA || size == -ENOSYS) {
+ acl = NULL;
+ } else {
+ acl = ERR_PTR(-EIO);
+ }
+
+ if (size > 0)
+ kmem_free(value, size);
+
+ /* As of Linux 4.7, the kernel get_acl will set this for us */
+#ifndef HAVE_KERNEL_GET_ACL_HANDLE_CACHE
+ if (!IS_ERR(acl))
+ zpl_set_cached_acl(ip, type, acl);
+#endif
+
+ return (acl);
+}
+
+int
+zpl_init_acl(struct inode *ip, struct inode *dir)
+{
+ struct posix_acl *acl = NULL;
+ int error = 0;
+
+ if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIX)
+ return (0);
+
+ if (!S_ISLNK(ip->i_mode)) {
+ acl = zpl_get_acl(dir, ACL_TYPE_DEFAULT);
+ if (IS_ERR(acl))
+ return (PTR_ERR(acl));
+ if (!acl) {
+ ip->i_mode &= ~current_umask();
+ ip->i_ctime = current_time(ip);
+ zfs_mark_inode_dirty(ip);
+ return (0);
+ }
+ }
+
+ if (acl) {
+ umode_t mode;
+
+ if (S_ISDIR(ip->i_mode)) {
+ error = zpl_set_acl(ip, acl, ACL_TYPE_DEFAULT);
+ if (error)
+ goto out;
+ }
+
+ mode = ip->i_mode;
+ error = __posix_acl_create(&acl, GFP_KERNEL, &mode);
+ if (error >= 0) {
+ ip->i_mode = mode;
+ zfs_mark_inode_dirty(ip);
+ if (error > 0)
+ error = zpl_set_acl(ip, acl, ACL_TYPE_ACCESS);
+ }
+ }
+out:
+ zpl_posix_acl_release(acl);
+
+ return (error);
+}
+
+int
+zpl_chmod_acl(struct inode *ip)
+{
+ struct posix_acl *acl;
+ int error;
+
+ if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIX)
+ return (0);
+
+ if (S_ISLNK(ip->i_mode))
+ return (-EOPNOTSUPP);
+
+ acl = zpl_get_acl(ip, ACL_TYPE_ACCESS);
+ if (IS_ERR(acl) || !acl)
+ return (PTR_ERR(acl));
+
+ error = __posix_acl_chmod(&acl, GFP_KERNEL, ip->i_mode);
+ if (!error)
+ error = zpl_set_acl(ip, acl, ACL_TYPE_ACCESS);
+
+ zpl_posix_acl_release(acl);
+
+ return (error);
+}
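+
+/*
+ * For illustration, a minimal userspace sketch, assuming acltype=posixacl:
+ * an ACL set through libacl is delivered as a "system.posix_acl_access"
+ * xattr and applied by zpl_set_acl() above via the handlers below.
+ * set_simple_acl() is a hypothetical helper; link with -lacl.
+ *
+ *	#include <sys/types.h>
+ *	#include <sys/acl.h>
+ *
+ *	static int
+ *	set_simple_acl(const char *path)
+ *	{
+ *		// owner rw-, group r--, other r--, uid 1001 rw-, plus mask
+ *		acl_t acl = acl_from_text(
+ *		    "u::rw-,g::r--,o::r--,u:1001:rw-,m::rw-");
+ *		int err;
+ *
+ *		if (acl == NULL)
+ *			return (-1);
+ *		err = acl_set_file(path, ACL_TYPE_ACCESS, acl);
+ *		acl_free(acl);
+ *		return (err);
+ *	}
+ */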
+
+static int
+__zpl_xattr_acl_list_access(struct inode *ip, char *list, size_t list_size,
+ const char *name, size_t name_len)
+{
+ char *xattr_name = XATTR_NAME_POSIX_ACL_ACCESS;
+ size_t xattr_size = sizeof (XATTR_NAME_POSIX_ACL_ACCESS);
+
+ if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIX)
+ return (0);
+
+ if (list && xattr_size <= list_size)
+ memcpy(list, xattr_name, xattr_size);
+
+ return (xattr_size);
+}
+ZPL_XATTR_LIST_WRAPPER(zpl_xattr_acl_list_access);
+
+static int
+__zpl_xattr_acl_list_default(struct inode *ip, char *list, size_t list_size,
+ const char *name, size_t name_len)
+{
+ char *xattr_name = XATTR_NAME_POSIX_ACL_DEFAULT;
+ size_t xattr_size = sizeof (XATTR_NAME_POSIX_ACL_DEFAULT);
+
+ if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIX)
+ return (0);
+
+ if (list && xattr_size <= list_size)
+ memcpy(list, xattr_name, xattr_size);
+
+ return (xattr_size);
+}
+ZPL_XATTR_LIST_WRAPPER(zpl_xattr_acl_list_default);
+
+static int
+__zpl_xattr_acl_get_access(struct inode *ip, const char *name,
+ void *buffer, size_t size)
+{
+ struct posix_acl *acl;
+ int type = ACL_TYPE_ACCESS;
+ int error;
+ /* xattr_resolve_name will do this for us if this is defined */
+#ifndef HAVE_XATTR_HANDLER_NAME
+ if (strcmp(name, "") != 0)
+ return (-EINVAL);
+#endif
+ if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIX)
+ return (-EOPNOTSUPP);
+
+ acl = zpl_get_acl(ip, type);
+ if (IS_ERR(acl))
+ return (PTR_ERR(acl));
+ if (acl == NULL)
+ return (-ENODATA);
+
+ error = zpl_acl_to_xattr(acl, buffer, size);
+ zpl_posix_acl_release(acl);
+
+ return (error);
+}
+ZPL_XATTR_GET_WRAPPER(zpl_xattr_acl_get_access);
+
+static int
+__zpl_xattr_acl_get_default(struct inode *ip, const char *name,
+ void *buffer, size_t size)
+{
+ struct posix_acl *acl;
+ int type = ACL_TYPE_DEFAULT;
+ int error;
+ /* xattr_resolve_name will do this for us if this is defined */
+#ifndef HAVE_XATTR_HANDLER_NAME
+ if (strcmp(name, "") != 0)
+ return (-EINVAL);
+#endif
+ if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIX)
+ return (-EOPNOTSUPP);
+
+ acl = zpl_get_acl(ip, type);
+ if (IS_ERR(acl))
+ return (PTR_ERR(acl));
+ if (acl == NULL)
+ return (-ENODATA);
+
+ error = zpl_acl_to_xattr(acl, buffer, size);
+ zpl_posix_acl_release(acl);
+
+ return (error);
+}
+ZPL_XATTR_GET_WRAPPER(zpl_xattr_acl_get_default);
+
+static int
+__zpl_xattr_acl_set_access(struct inode *ip, const char *name,
+ const void *value, size_t size, int flags)
+{
+ struct posix_acl *acl;
+ int type = ACL_TYPE_ACCESS;
+ int error = 0;
+ /* xattr_resolve_name will do this for us if this is defined */
+#ifndef HAVE_XATTR_HANDLER_NAME
+ if (strcmp(name, "") != 0)
+ return (-EINVAL);
+#endif
+ if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIX)
+ return (-EOPNOTSUPP);
+
+ if (!inode_owner_or_capable(ip))
+ return (-EPERM);
+
+ if (value) {
+ acl = zpl_acl_from_xattr(value, size);
+ if (IS_ERR(acl))
+ return (PTR_ERR(acl));
+ else if (acl) {
+ error = zpl_posix_acl_valid(ip, acl);
+ if (error) {
+ zpl_posix_acl_release(acl);
+ return (error);
+ }
+ }
+ } else {
+ acl = NULL;
+ }
+
+ error = zpl_set_acl(ip, acl, type);
+ zpl_posix_acl_release(acl);
+
+ return (error);
+}
+ZPL_XATTR_SET_WRAPPER(zpl_xattr_acl_set_access);
+
+static int
+__zpl_xattr_acl_set_default(struct inode *ip, const char *name,
+ const void *value, size_t size, int flags)
+{
+ struct posix_acl *acl;
+ int type = ACL_TYPE_DEFAULT;
+ int error = 0;
+ /* xattr_resolve_name will do this for us if this is defined */
+#ifndef HAVE_XATTR_HANDLER_NAME
+ if (strcmp(name, "") != 0)
+ return (-EINVAL);
+#endif
+ if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIX)
+ return (-EOPNOTSUPP);
+
+ if (!inode_owner_or_capable(ip))
+ return (-EPERM);
+
+ if (value) {
+ acl = zpl_acl_from_xattr(value, size);
+ if (IS_ERR(acl))
+ return (PTR_ERR(acl));
+ else if (acl) {
+ error = zpl_posix_acl_valid(ip, acl);
+ if (error) {
+ zpl_posix_acl_release(acl);
+ return (error);
+ }
+ }
+ } else {
+ acl = NULL;
+ }
+
+ error = zpl_set_acl(ip, acl, type);
+ zpl_posix_acl_release(acl);
+
+ return (error);
+}
+ZPL_XATTR_SET_WRAPPER(zpl_xattr_acl_set_default);
+
+/*
+ * ACL access xattr namespace handlers.
+ *
+ * Use .name instead of .prefix when available. xattr_resolve_name will match
+ * whole name and reject anything that has .name only as prefix.
+ */
+xattr_handler_t zpl_xattr_acl_access_handler =
+{
+#ifdef HAVE_XATTR_HANDLER_NAME
+ .name = XATTR_NAME_POSIX_ACL_ACCESS,
+#else
+ .prefix = XATTR_NAME_POSIX_ACL_ACCESS,
+#endif
+ .list = zpl_xattr_acl_list_access,
+ .get = zpl_xattr_acl_get_access,
+ .set = zpl_xattr_acl_set_access,
+#if defined(HAVE_XATTR_LIST_SIMPLE) || \
+ defined(HAVE_XATTR_LIST_DENTRY) || \
+ defined(HAVE_XATTR_LIST_HANDLER)
+ .flags = ACL_TYPE_ACCESS,
+#endif
+};
+
+/*
+ * ACL default xattr namespace handlers.
+ *
+ * Use .name instead of .prefix when available. xattr_resolve_name will match
+ * whole name and reject anything that has .name only as prefix.
+ */
+xattr_handler_t zpl_xattr_acl_default_handler =
+{
+#ifdef HAVE_XATTR_HANDLER_NAME
+ .name = XATTR_NAME_POSIX_ACL_DEFAULT,
+#else
+ .prefix = XATTR_NAME_POSIX_ACL_DEFAULT,
+#endif
+ .list = zpl_xattr_acl_list_default,
+ .get = zpl_xattr_acl_get_default,
+ .set = zpl_xattr_acl_set_default,
+#if defined(HAVE_XATTR_LIST_SIMPLE) || \
+ defined(HAVE_XATTR_LIST_DENTRY) || \
+ defined(HAVE_XATTR_LIST_HANDLER)
+ .flags = ACL_TYPE_DEFAULT,
+#endif
+};
+
+#endif /* CONFIG_FS_POSIX_ACL */
+
+xattr_handler_t *zpl_xattr_handlers[] = {
+ &zpl_xattr_security_handler,
+ &zpl_xattr_trusted_handler,
+ &zpl_xattr_user_handler,
+#ifdef CONFIG_FS_POSIX_ACL
+ &zpl_xattr_acl_access_handler,
+ &zpl_xattr_acl_default_handler,
+#endif /* CONFIG_FS_POSIX_ACL */
+ NULL
+};
+
+static const struct xattr_handler *
+zpl_xattr_handler(const char *name)
+{
+ if (strncmp(name, XATTR_USER_PREFIX,
+ XATTR_USER_PREFIX_LEN) == 0)
+ return (&zpl_xattr_user_handler);
+
+ if (strncmp(name, XATTR_TRUSTED_PREFIX,
+ XATTR_TRUSTED_PREFIX_LEN) == 0)
+ return (&zpl_xattr_trusted_handler);
+
+ if (strncmp(name, XATTR_SECURITY_PREFIX,
+ XATTR_SECURITY_PREFIX_LEN) == 0)
+ return (&zpl_xattr_security_handler);
+
+#ifdef CONFIG_FS_POSIX_ACL
+ if (strncmp(name, XATTR_NAME_POSIX_ACL_ACCESS,
+ sizeof (XATTR_NAME_POSIX_ACL_ACCESS)) == 0)
+ return (&zpl_xattr_acl_access_handler);
+
+ if (strncmp(name, XATTR_NAME_POSIX_ACL_DEFAULT,
+ sizeof (XATTR_NAME_POSIX_ACL_DEFAULT)) == 0)
+ return (&zpl_xattr_acl_default_handler);
+#endif /* CONFIG_FS_POSIX_ACL */
+
+ return (NULL);
+}
+
+#if !defined(HAVE_POSIX_ACL_RELEASE) || defined(HAVE_POSIX_ACL_RELEASE_GPL_ONLY)
+struct acl_rel_struct {
+ struct acl_rel_struct *next;
+ struct posix_acl *acl;
+ clock_t time;
+};
+
+#define ACL_REL_GRACE (60*HZ)
+#define ACL_REL_WINDOW (1*HZ)
+#define ACL_REL_SCHED (ACL_REL_GRACE+ACL_REL_WINDOW)
+
+/*
+ * Lockless multi-producer single-consumer fifo list.
+ * Nodes are added to tail and removed from head. Tail pointer is our
+ * synchronization point. It always points to the next pointer of the last
+ * node, or head if list is empty.
+ */
+static struct acl_rel_struct *acl_rel_head = NULL;
+static struct acl_rel_struct **acl_rel_tail = &acl_rel_head;
+
+static void
+zpl_posix_acl_free(void *arg)
+{
+ struct acl_rel_struct *freelist = NULL;
+ struct acl_rel_struct *a;
+ clock_t new_time;
+ boolean_t refire = B_FALSE;
+
+ ASSERT3P(acl_rel_head, !=, NULL);
+ while (acl_rel_head) {
+ a = acl_rel_head;
+ if (ddi_get_lbolt() - a->time >= ACL_REL_GRACE) {
+ /*
+ * If a is the last node we need to reset tail, but we
+ * need to use cmpxchg to make sure it is still the
+ * last node.
+ */
+ if (acl_rel_tail == &a->next) {
+ acl_rel_head = NULL;
+ if (cmpxchg(&acl_rel_tail, &a->next,
+ &acl_rel_head) == &a->next) {
+ ASSERT3P(a->next, ==, NULL);
+ a->next = freelist;
+ freelist = a;
+ break;
+ }
+ }
+			/*
+			 * a is not the last node; make sure its next pointer
+			 * has been set by the adder, then advance the head.
+			 */
+ while (READ_ONCE(a->next) == NULL)
+ cpu_relax();
+ acl_rel_head = a->next;
+ a->next = freelist;
+ freelist = a;
+ } else {
+			/*
+			 * a is still within its grace period. We are
+			 * responsible for rescheduling the free task, since
+			 * the adder only does so when the list is empty.
+			 */
+ new_time = a->time + ACL_REL_SCHED;
+ refire = B_TRUE;
+ break;
+ }
+ }
+
+ if (refire)
+ taskq_dispatch_delay(system_delay_taskq, zpl_posix_acl_free,
+ NULL, TQ_SLEEP, new_time);
+
+ while (freelist) {
+ a = freelist;
+ freelist = a->next;
+ kfree(a->acl);
+ kmem_free(a, sizeof (struct acl_rel_struct));
+ }
+}
+
+void
+zpl_posix_acl_release_impl(struct posix_acl *acl)
+{
+ struct acl_rel_struct *a, **prev;
+
+ a = kmem_alloc(sizeof (struct acl_rel_struct), KM_SLEEP);
+ a->next = NULL;
+ a->acl = acl;
+ a->time = ddi_get_lbolt();
+	/* atomically point tail at us and get the previous tail */
+ prev = xchg(&acl_rel_tail, &a->next);
+ ASSERT3P(*prev, ==, NULL);
+ *prev = a;
+ /* if it was empty before, schedule the free task */
+ if (prev == &acl_rel_head)
+ taskq_dispatch_delay(system_delay_taskq, zpl_posix_acl_free,
+ NULL, TQ_SLEEP, ddi_get_lbolt() + ACL_REL_SCHED);
+}
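+
+/*
+ * For illustration, a minimal userspace sketch of the same lockless
+ * multi-producer single-consumer protocol using C11 atomics: producers
+ * atomically swing the tail to their node's next pointer and then link the
+ * previous tail; only the single consumer uses compare-and-swap, and only
+ * when removing what it believes is the last node. push() and pop() are
+ * hypothetical names.
+ *
+ *	#include <stdatomic.h>
+ *	#include <stddef.h>
+ *
+ *	struct node { _Atomic(struct node *) next; };
+ *
+ *	static _Atomic(struct node *) head;
+ *	static _Atomic(_Atomic(struct node *) *) tail = &head;
+ *
+ *	static void
+ *	push(struct node *n)			// any number of producers
+ *	{
+ *		_Atomic(struct node *) *prev;
+ *
+ *		atomic_store(&n->next, NULL);
+ *		prev = atomic_exchange(&tail, &n->next);
+ *		atomic_store(prev, n);		// publish the node
+ *	}
+ *
+ *	static struct node *
+ *	pop(void)				// single consumer only
+ *	{
+ *		struct node *n = atomic_load(&head);
+ *		_Atomic(struct node *) *expect;
+ *		struct node *next;
+ *
+ *		if (n == NULL)
+ *			return (NULL);
+ *		expect = &n->next;
+ *		atomic_store(&head, NULL);
+ *		// if n is still the last node, point tail back at head
+ *		if (atomic_compare_exchange_strong(&tail, &expect, &head))
+ *			return (n);
+ *		// otherwise wait for the producer to publish n->next
+ *		while ((next = atomic_load(&n->next)) == NULL)
+ *			;
+ *		atomic_store(&head, next);
+ *		return (n);
+ *	}
+ */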
+#endif
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c b/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c
new file mode 100644
index 000000000000..0caf31307718
--- /dev/null
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c
@@ -0,0 +1,1098 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
+ */
+
+#include <sys/dataset_kstats.h>
+#include <sys/dbuf.h>
+#include <sys/dmu_traverse.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_prop.h>
+#include <sys/dsl_dir.h>
+#include <sys/zap.h>
+#include <sys/zfeature.h>
+#include <sys/zil_impl.h>
+#include <sys/dmu_tx.h>
+#include <sys/zio.h>
+#include <sys/zfs_rlock.h>
+#include <sys/spa_impl.h>
+#include <sys/zvol.h>
+#include <sys/zvol_impl.h>
+
+#include <linux/blkdev_compat.h>
+#include <linux/task_io_accounting_ops.h>
+
+unsigned int zvol_major = ZVOL_MAJOR;
+unsigned int zvol_request_sync = 0;
+unsigned int zvol_prefetch_bytes = (128 * 1024);
+unsigned long zvol_max_discard_blocks = 16384;
+unsigned int zvol_threads = 32;
+
+struct zvol_state_os {
+ struct gendisk *zvo_disk; /* generic disk */
+ struct request_queue *zvo_queue; /* request queue */
+ dev_t zvo_dev; /* device id */
+};
+
+taskq_t *zvol_taskq;
+static struct ida zvol_ida;
+
+typedef struct zv_request {
+ zvol_state_t *zv;
+ struct bio *bio;
+ taskq_ent_t ent;
+} zv_request_t;
+
+/*
+ * Given a path, return TRUE if path is a ZVOL.
+ */
+static boolean_t
+zvol_is_zvol_impl(const char *path)
+{
+ dev_t dev = 0;
+
+ if (vdev_lookup_bdev(path, &dev) != 0)
+ return (B_FALSE);
+
+ if (MAJOR(dev) == zvol_major)
+ return (B_TRUE);
+
+ return (B_FALSE);
+}
+
+static void
+zvol_write(void *arg)
+{
+ zv_request_t *zvr = arg;
+ struct bio *bio = zvr->bio;
+ int error = 0;
+ zfs_uio_t uio;
+
+ zfs_uio_bvec_init(&uio, bio);
+
+ zvol_state_t *zv = zvr->zv;
+ ASSERT3P(zv, !=, NULL);
+ ASSERT3U(zv->zv_open_count, >, 0);
+ ASSERT3P(zv->zv_zilog, !=, NULL);
+
+	/* A bio marked as FLUSH needs to flush before the write */
+ if (bio_is_flush(bio))
+ zil_commit(zv->zv_zilog, ZVOL_OBJ);
+
+ /* Some requests are just for flush and nothing else. */
+ if (uio.uio_resid == 0) {
+ rw_exit(&zv->zv_suspend_lock);
+ BIO_END_IO(bio, 0);
+ kmem_free(zvr, sizeof (zv_request_t));
+ return;
+ }
+
+ struct request_queue *q = zv->zv_zso->zvo_queue;
+ struct gendisk *disk = zv->zv_zso->zvo_disk;
+ ssize_t start_resid = uio.uio_resid;
+ unsigned long start_time;
+
+ boolean_t acct = blk_queue_io_stat(q);
+ if (acct)
+ start_time = blk_generic_start_io_acct(q, disk, WRITE, bio);
+
+ boolean_t sync =
+ bio_is_fua(bio) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;
+
+ zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock,
+ uio.uio_loffset, uio.uio_resid, RL_WRITER);
+
+ uint64_t volsize = zv->zv_volsize;
+ while (uio.uio_resid > 0 && uio.uio_loffset < volsize) {
+ uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1);
+ uint64_t off = uio.uio_loffset;
+ dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
+
+ if (bytes > volsize - off) /* don't write past the end */
+ bytes = volsize - off;
+
+ dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, bytes);
+
+ /* This will only fail for ENOSPC */
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ dmu_tx_abort(tx);
+ break;
+ }
+ error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx);
+ if (error == 0) {
+ zvol_log_write(zv, tx, off, bytes, sync);
+ }
+ dmu_tx_commit(tx);
+
+ if (error)
+ break;
+ }
+ zfs_rangelock_exit(lr);
+
+ int64_t nwritten = start_resid - uio.uio_resid;
+ dataset_kstats_update_write_kstats(&zv->zv_kstat, nwritten);
+ task_io_account_write(nwritten);
+
+ if (sync)
+ zil_commit(zv->zv_zilog, ZVOL_OBJ);
+
+ rw_exit(&zv->zv_suspend_lock);
+
+ if (acct)
+ blk_generic_end_io_acct(q, disk, WRITE, bio, start_time);
+
+ BIO_END_IO(bio, -error);
+ kmem_free(zvr, sizeof (zv_request_t));
+}
+
+static void
+zvol_discard(void *arg)
+{
+ zv_request_t *zvr = arg;
+ struct bio *bio = zvr->bio;
+ zvol_state_t *zv = zvr->zv;
+ uint64_t start = BIO_BI_SECTOR(bio) << 9;
+ uint64_t size = BIO_BI_SIZE(bio);
+ uint64_t end = start + size;
+ boolean_t sync;
+ int error = 0;
+ dmu_tx_t *tx;
+
+ ASSERT3P(zv, !=, NULL);
+ ASSERT3U(zv->zv_open_count, >, 0);
+ ASSERT3P(zv->zv_zilog, !=, NULL);
+
+ struct request_queue *q = zv->zv_zso->zvo_queue;
+ struct gendisk *disk = zv->zv_zso->zvo_disk;
+ unsigned long start_time;
+
+ boolean_t acct = blk_queue_io_stat(q);
+ if (acct)
+ start_time = blk_generic_start_io_acct(q, disk, WRITE, bio);
+
+ sync = bio_is_fua(bio) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;
+
+ if (end > zv->zv_volsize) {
+ error = SET_ERROR(EIO);
+ goto unlock;
+ }
+
+ /*
+ * Align the request to volume block boundaries when a secure erase is
+	 * not required. This prevents dnode_free_range() from zeroing out
+	 * the unaligned parts, which is slow (read-modify-write) and useless
+	 * since we are not freeing any space by doing so.
+ */
+ if (!bio_is_secure_erase(bio)) {
+ start = P2ROUNDUP(start, zv->zv_volblocksize);
+ end = P2ALIGN(end, zv->zv_volblocksize);
+ size = end - start;
+ }
+
+ if (start >= end)
+ goto unlock;
+
+ zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock,
+ start, size, RL_WRITER);
+
+ tx = dmu_tx_create(zv->zv_objset);
+ dmu_tx_mark_netfree(tx);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error != 0) {
+ dmu_tx_abort(tx);
+ } else {
+ zvol_log_truncate(zv, tx, start, size, B_TRUE);
+ dmu_tx_commit(tx);
+ error = dmu_free_long_range(zv->zv_objset,
+ ZVOL_OBJ, start, size);
+ }
+ zfs_rangelock_exit(lr);
+
+ if (error == 0 && sync)
+ zil_commit(zv->zv_zilog, ZVOL_OBJ);
+
+unlock:
+ rw_exit(&zv->zv_suspend_lock);
+
+ if (acct)
+ blk_generic_end_io_acct(q, disk, WRITE, bio, start_time);
+
+ BIO_END_IO(bio, -error);
+ kmem_free(zvr, sizeof (zv_request_t));
+}
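+
+/*
+ * As a worked example of the alignment above, assuming zv_volblocksize is
+ * 8192: a non-secure-erase discard of the byte range [4096, 20480) becomes
+ *
+ *	start = P2ROUNDUP(4096, 8192);		// 8192
+ *	end = P2ALIGN(20480, 8192);		// 16384
+ *	size = end - start;			// 8192, one full block
+ *
+ * so only the single fully-covered volume block is freed and the partial
+ * blocks at either edge are left untouched (no read-modify-write).
+ */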
+
+static void
+zvol_read(void *arg)
+{
+ zv_request_t *zvr = arg;
+ struct bio *bio = zvr->bio;
+ int error = 0;
+ zfs_uio_t uio;
+
+ zfs_uio_bvec_init(&uio, bio);
+
+ zvol_state_t *zv = zvr->zv;
+ ASSERT3P(zv, !=, NULL);
+ ASSERT3U(zv->zv_open_count, >, 0);
+
+ struct request_queue *q = zv->zv_zso->zvo_queue;
+ struct gendisk *disk = zv->zv_zso->zvo_disk;
+ ssize_t start_resid = uio.uio_resid;
+ unsigned long start_time;
+
+ boolean_t acct = blk_queue_io_stat(q);
+ if (acct)
+ start_time = blk_generic_start_io_acct(q, disk, READ, bio);
+
+ zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock,
+ uio.uio_loffset, uio.uio_resid, RL_READER);
+
+ uint64_t volsize = zv->zv_volsize;
+ while (uio.uio_resid > 0 && uio.uio_loffset < volsize) {
+ uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1);
+
+ /* don't read past the end */
+ if (bytes > volsize - uio.uio_loffset)
+ bytes = volsize - uio.uio_loffset;
+
+ error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes);
+ if (error) {
+ /* convert checksum errors into IO errors */
+ if (error == ECKSUM)
+ error = SET_ERROR(EIO);
+ break;
+ }
+ }
+ zfs_rangelock_exit(lr);
+
+ int64_t nread = start_resid - uio.uio_resid;
+ dataset_kstats_update_read_kstats(&zv->zv_kstat, nread);
+ task_io_account_read(nread);
+
+ rw_exit(&zv->zv_suspend_lock);
+
+ if (acct)
+ blk_generic_end_io_acct(q, disk, READ, bio, start_time);
+
+ BIO_END_IO(bio, -error);
+ kmem_free(zvr, sizeof (zv_request_t));
+}
+
+#ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS
+static blk_qc_t
+zvol_submit_bio(struct bio *bio)
+#else
+static MAKE_REQUEST_FN_RET
+zvol_request(struct request_queue *q, struct bio *bio)
+#endif
+{
+#ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS
+ struct request_queue *q = bio->bi_disk->queue;
+#endif
+ zvol_state_t *zv = q->queuedata;
+ fstrans_cookie_t cookie = spl_fstrans_mark();
+ uint64_t offset = BIO_BI_SECTOR(bio) << 9;
+ uint64_t size = BIO_BI_SIZE(bio);
+ int rw = bio_data_dir(bio);
+ zv_request_t *zvr;
+
+ if (bio_has_data(bio) && offset + size > zv->zv_volsize) {
+ printk(KERN_INFO
+ "%s: bad access: offset=%llu, size=%lu\n",
+ zv->zv_zso->zvo_disk->disk_name,
+ (long long unsigned)offset,
+ (long unsigned)size);
+
+ BIO_END_IO(bio, -SET_ERROR(EIO));
+ goto out;
+ }
+
+ if (rw == WRITE) {
+ if (unlikely(zv->zv_flags & ZVOL_RDONLY)) {
+ BIO_END_IO(bio, -SET_ERROR(EROFS));
+ goto out;
+ }
+
+ /*
+ * Prevents the zvol from being suspended, or the ZIL being
+ * concurrently opened. Will be released after the i/o
+ * completes.
+ */
+ rw_enter(&zv->zv_suspend_lock, RW_READER);
+
+ /*
+ * Open a ZIL if this is the first time we have written to this
+ * zvol. We protect zv->zv_zilog with zv_suspend_lock rather
+ * than zv_state_lock so that we don't need to acquire an
+ * additional lock in this path.
+ */
+ if (zv->zv_zilog == NULL) {
+ rw_exit(&zv->zv_suspend_lock);
+ rw_enter(&zv->zv_suspend_lock, RW_WRITER);
+ if (zv->zv_zilog == NULL) {
+ zv->zv_zilog = zil_open(zv->zv_objset,
+ zvol_get_data);
+ zv->zv_flags |= ZVOL_WRITTEN_TO;
+ }
+ rw_downgrade(&zv->zv_suspend_lock);
+ }
+
+ zvr = kmem_alloc(sizeof (zv_request_t), KM_SLEEP);
+ zvr->zv = zv;
+ zvr->bio = bio;
+ taskq_init_ent(&zvr->ent);
+
+ /*
+ * We don't want this thread to be blocked waiting for i/o to
+ * complete, so we instead wait from a taskq callback. The
+ * i/o may be a ZIL write (via zil_commit()), or a read of an
+ * indirect block, or a read of a data block (if this is a
+ * partial-block write). We will indicate that the i/o is
+ * complete by calling BIO_END_IO() from the taskq callback.
+ *
+ * This design allows the calling thread to continue and
+ * initiate more concurrent operations by calling
+ * zvol_request() again. There are typically only a small
+ * number of threads available to call zvol_request() (e.g.
+ * one per iSCSI target), so keeping the latency of
+ * zvol_request() low is important for performance.
+ *
+ * The zvol_request_sync module parameter allows this
+ * behavior to be altered, for performance evaluation
+ * purposes. If the callback blocks, setting
+ * zvol_request_sync=1 will result in much worse performance.
+ *
+ * We can have up to zvol_threads concurrent i/o's being
+ * processed for all zvols on the system. This is typically
+ * a vast improvement over the zvol_request_sync=1 behavior
+ * of one i/o at a time per zvol. However, an even better
+ * design would be for zvol_request() to initiate the zio
+ * directly, and then be notified by the zio_done callback,
+ * which would call BIO_END_IO(). Unfortunately, the DMU/ZIL
+ * interfaces lack this functionality (they block waiting for
+ * the i/o to complete).
+ */
+ if (bio_is_discard(bio) || bio_is_secure_erase(bio)) {
+ if (zvol_request_sync) {
+ zvol_discard(zvr);
+ } else {
+ taskq_dispatch_ent(zvol_taskq,
+ zvol_discard, zvr, 0, &zvr->ent);
+ }
+ } else {
+ if (zvol_request_sync) {
+ zvol_write(zvr);
+ } else {
+ taskq_dispatch_ent(zvol_taskq,
+ zvol_write, zvr, 0, &zvr->ent);
+ }
+ }
+ } else {
+ /*
+ * The SCST driver, and possibly others, may issue READ I/Os
+ * with a length of zero bytes. These empty I/Os contain no
+ * data and require no additional handling.
+ */
+ if (size == 0) {
+ BIO_END_IO(bio, 0);
+ goto out;
+ }
+
+ zvr = kmem_alloc(sizeof (zv_request_t), KM_SLEEP);
+ zvr->zv = zv;
+ zvr->bio = bio;
+ taskq_init_ent(&zvr->ent);
+
+ rw_enter(&zv->zv_suspend_lock, RW_READER);
+
+ /* See comment in WRITE case above. */
+ if (zvol_request_sync) {
+ zvol_read(zvr);
+ } else {
+ taskq_dispatch_ent(zvol_taskq,
+ zvol_read, zvr, 0, &zvr->ent);
+ }
+ }
+
+out:
+ spl_fstrans_unmark(cookie);
+#if defined(HAVE_MAKE_REQUEST_FN_RET_QC) || \
+ defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS)
+ return (BLK_QC_T_NONE);
+#endif
+}
+
+static int
+zvol_open(struct block_device *bdev, fmode_t flag)
+{
+ zvol_state_t *zv;
+ int error = 0;
+ boolean_t drop_suspend = B_TRUE;
+
+ rw_enter(&zvol_state_lock, RW_READER);
+ /*
+ * Obtain a copy of private_data under the zvol_state_lock to make
+ * sure that either the result of zvol free code path setting
+ * bdev->bd_disk->private_data to NULL is observed, or zvol_free()
+ * is not called on this zv because of the positive zv_open_count.
+ */
+ zv = bdev->bd_disk->private_data;
+ if (zv == NULL) {
+ rw_exit(&zvol_state_lock);
+ return (SET_ERROR(-ENXIO));
+ }
+
+ mutex_enter(&zv->zv_state_lock);
+ /*
+ * make sure zvol is not suspended during first open
+ * (hold zv_suspend_lock) and respect proper lock acquisition
+ * ordering - zv_suspend_lock before zv_state_lock
+ */
+ if (zv->zv_open_count == 0) {
+ if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) {
+ mutex_exit(&zv->zv_state_lock);
+ rw_enter(&zv->zv_suspend_lock, RW_READER);
+ mutex_enter(&zv->zv_state_lock);
+ /* check to see if zv_suspend_lock is needed */
+ if (zv->zv_open_count != 0) {
+ rw_exit(&zv->zv_suspend_lock);
+ drop_suspend = B_FALSE;
+ }
+ }
+ } else {
+ drop_suspend = B_FALSE;
+ }
+ rw_exit(&zvol_state_lock);
+
+ ASSERT(MUTEX_HELD(&zv->zv_state_lock));
+
+ if (zv->zv_open_count == 0) {
+ ASSERT(RW_READ_HELD(&zv->zv_suspend_lock));
+ error = -zvol_first_open(zv, !(flag & FMODE_WRITE));
+ if (error)
+ goto out_mutex;
+ }
+
+ if ((flag & FMODE_WRITE) && (zv->zv_flags & ZVOL_RDONLY)) {
+ error = -EROFS;
+ goto out_open_count;
+ }
+
+ zv->zv_open_count++;
+
+ mutex_exit(&zv->zv_state_lock);
+ if (drop_suspend)
+ rw_exit(&zv->zv_suspend_lock);
+
+ zfs_check_media_change(bdev);
+
+ return (0);
+
+out_open_count:
+ if (zv->zv_open_count == 0)
+ zvol_last_close(zv);
+
+out_mutex:
+ mutex_exit(&zv->zv_state_lock);
+ if (drop_suspend)
+ rw_exit(&zv->zv_suspend_lock);
+ if (error == -EINTR) {
+ error = -ERESTARTSYS;
+ schedule();
+ }
+ return (SET_ERROR(error));
+}
+
+static void
+zvol_release(struct gendisk *disk, fmode_t mode)
+{
+ zvol_state_t *zv;
+ boolean_t drop_suspend = B_TRUE;
+
+ rw_enter(&zvol_state_lock, RW_READER);
+ zv = disk->private_data;
+
+ mutex_enter(&zv->zv_state_lock);
+ ASSERT3U(zv->zv_open_count, >, 0);
+ /*
+ * make sure zvol is not suspended during last close
+ * (hold zv_suspend_lock) and respect proper lock acquisition
+ * ordering - zv_suspend_lock before zv_state_lock
+ */
+ if (zv->zv_open_count == 1) {
+ if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) {
+ mutex_exit(&zv->zv_state_lock);
+ rw_enter(&zv->zv_suspend_lock, RW_READER);
+ mutex_enter(&zv->zv_state_lock);
+ /* check to see if zv_suspend_lock is needed */
+ if (zv->zv_open_count != 1) {
+ rw_exit(&zv->zv_suspend_lock);
+ drop_suspend = B_FALSE;
+ }
+ }
+ } else {
+ drop_suspend = B_FALSE;
+ }
+ rw_exit(&zvol_state_lock);
+
+ ASSERT(MUTEX_HELD(&zv->zv_state_lock));
+
+ zv->zv_open_count--;
+ if (zv->zv_open_count == 0) {
+ ASSERT(RW_READ_HELD(&zv->zv_suspend_lock));
+ zvol_last_close(zv);
+ }
+
+ mutex_exit(&zv->zv_state_lock);
+
+ if (drop_suspend)
+ rw_exit(&zv->zv_suspend_lock);
+}
+
+static int
+zvol_ioctl(struct block_device *bdev, fmode_t mode,
+ unsigned int cmd, unsigned long arg)
+{
+ zvol_state_t *zv = bdev->bd_disk->private_data;
+ int error = 0;
+
+ ASSERT3U(zv->zv_open_count, >, 0);
+
+ switch (cmd) {
+ case BLKFLSBUF:
+ fsync_bdev(bdev);
+ invalidate_bdev(bdev);
+ rw_enter(&zv->zv_suspend_lock, RW_READER);
+
+ if (!(zv->zv_flags & ZVOL_RDONLY))
+ txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);
+
+ rw_exit(&zv->zv_suspend_lock);
+ break;
+
+ case BLKZNAME:
+ mutex_enter(&zv->zv_state_lock);
+ error = copy_to_user((void *)arg, zv->zv_name, MAXNAMELEN);
+ mutex_exit(&zv->zv_state_lock);
+ break;
+
+ default:
+ error = -ENOTTY;
+ break;
+ }
+
+ return (SET_ERROR(error));
+}
+
+#ifdef CONFIG_COMPAT
+static int
+zvol_compat_ioctl(struct block_device *bdev, fmode_t mode,
+ unsigned cmd, unsigned long arg)
+{
+ return (zvol_ioctl(bdev, mode, cmd, arg));
+}
+#else
+#define zvol_compat_ioctl NULL
+#endif
+
+static unsigned int
+zvol_check_events(struct gendisk *disk, unsigned int clearing)
+{
+ unsigned int mask = 0;
+
+ rw_enter(&zvol_state_lock, RW_READER);
+
+ zvol_state_t *zv = disk->private_data;
+ if (zv != NULL) {
+ mutex_enter(&zv->zv_state_lock);
+ mask = zv->zv_changed ? DISK_EVENT_MEDIA_CHANGE : 0;
+ zv->zv_changed = 0;
+ mutex_exit(&zv->zv_state_lock);
+ }
+
+ rw_exit(&zvol_state_lock);
+
+ return (mask);
+}
+
+static int
+zvol_revalidate_disk(struct gendisk *disk)
+{
+ rw_enter(&zvol_state_lock, RW_READER);
+
+ zvol_state_t *zv = disk->private_data;
+ if (zv != NULL) {
+ mutex_enter(&zv->zv_state_lock);
+ set_capacity(zv->zv_zso->zvo_disk,
+ zv->zv_volsize >> SECTOR_BITS);
+ mutex_exit(&zv->zv_state_lock);
+ }
+
+ rw_exit(&zvol_state_lock);
+
+ return (0);
+}
+
+static int
+zvol_update_volsize(zvol_state_t *zv, uint64_t volsize)
+{
+ struct gendisk *disk = zv->zv_zso->zvo_disk;
+
+#if defined(HAVE_REVALIDATE_DISK_SIZE)
+ revalidate_disk_size(disk, zvol_revalidate_disk(disk) == 0);
+#elif defined(HAVE_REVALIDATE_DISK)
+ revalidate_disk(disk);
+#else
+ zvol_revalidate_disk(disk);
+#endif
+ return (0);
+}
+
+static void
+zvol_clear_private(zvol_state_t *zv)
+{
+ /*
+ * Cleared while holding zvol_state_lock as a writer
+ * which will prevent zvol_open() from opening it.
+ */
+ zv->zv_zso->zvo_disk->private_data = NULL;
+}
+
+/*
+ * Provide a simple virtual geometry for legacy compatibility. For devices
+ * smaller than 1 MiB a small head and sector count is used to allow very
+ * tiny devices. For devices over 1 MiB a standard head and sector count
+ * is used to keep the cylinders count reasonable.
+ */
+static int
+zvol_getgeo(struct block_device *bdev, struct hd_geometry *geo)
+{
+ zvol_state_t *zv = bdev->bd_disk->private_data;
+ sector_t sectors;
+
+ ASSERT3U(zv->zv_open_count, >, 0);
+
+ sectors = get_capacity(zv->zv_zso->zvo_disk);
+
+ if (sectors > 2048) {
+ geo->heads = 16;
+ geo->sectors = 63;
+ } else {
+ geo->heads = 2;
+ geo->sectors = 4;
+ }
+
+ geo->start = 0;
+ geo->cylinders = sectors / (geo->heads * geo->sectors);
+
+ return (0);
+}
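+
+/*
+ * As a worked example of the geometry above: a 100 MiB zvol spans 204800
+ * 512-byte sectors (more than 2048), so it reports heads = 16, sectors = 63
+ * and
+ *
+ *	cylinders = 204800 / (16 * 63) = 203
+ *
+ * (integer division), keeping the cylinder count reasonable for legacy
+ * HDIO_GETGEO consumers.
+ */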
+
+static struct block_device_operations zvol_ops = {
+ .open = zvol_open,
+ .release = zvol_release,
+ .ioctl = zvol_ioctl,
+ .compat_ioctl = zvol_compat_ioctl,
+ .check_events = zvol_check_events,
+ .revalidate_disk = zvol_revalidate_disk,
+ .getgeo = zvol_getgeo,
+ .owner = THIS_MODULE,
+#ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS
+ .submit_bio = zvol_submit_bio,
+#endif
+};
+
+/*
+ * Allocate memory for a new zvol_state_t and setup the required
+ * request queue and generic disk structures for the block device.
+ */
+static zvol_state_t *
+zvol_alloc(dev_t dev, const char *name)
+{
+ zvol_state_t *zv;
+ struct zvol_state_os *zso;
+ uint64_t volmode;
+
+ if (dsl_prop_get_integer(name, "volmode", &volmode, NULL) != 0)
+ return (NULL);
+
+ if (volmode == ZFS_VOLMODE_DEFAULT)
+ volmode = zvol_volmode;
+
+ if (volmode == ZFS_VOLMODE_NONE)
+ return (NULL);
+
+ zv = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP);
+ zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP);
+ zv->zv_zso = zso;
+ zv->zv_volmode = volmode;
+
+ list_link_init(&zv->zv_next);
+ mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL);
+
+#ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS
+ zso->zvo_queue = blk_alloc_queue(NUMA_NO_NODE);
+#else
+ zso->zvo_queue = blk_generic_alloc_queue(zvol_request, NUMA_NO_NODE);
+#endif
+ if (zso->zvo_queue == NULL)
+ goto out_kmem;
+
+ blk_queue_set_write_cache(zso->zvo_queue, B_TRUE, B_TRUE);
+
+ /* Limit read-ahead to a single page to prevent over-prefetching. */
+ blk_queue_set_read_ahead(zso->zvo_queue, 1);
+
+ /* Disable write merging in favor of the ZIO pipeline. */
+ blk_queue_flag_set(QUEUE_FLAG_NOMERGES, zso->zvo_queue);
+
+ zso->zvo_disk = alloc_disk(ZVOL_MINORS);
+ if (zso->zvo_disk == NULL)
+ goto out_queue;
+
+ zso->zvo_queue->queuedata = zv;
+ zso->zvo_dev = dev;
+ zv->zv_open_count = 0;
+ strlcpy(zv->zv_name, name, MAXNAMELEN);
+
+ zfs_rangelock_init(&zv->zv_rangelock, NULL, NULL);
+ rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL);
+
+ zso->zvo_disk->major = zvol_major;
+ zso->zvo_disk->events = DISK_EVENT_MEDIA_CHANGE;
+
+ if (volmode == ZFS_VOLMODE_DEV) {
+		/*
+		 * ZFS_VOLMODE_DEV disables partitioning on ZVOL devices: set
+		 * gendisk->minors = 1 as noted in include/linux/genhd.h.
+		 * Also disable extended partition numbers (GENHD_FL_EXT_DEVT)
+		 * and suppress partition scanning (GENHD_FL_NO_PART_SCAN)
+		 * by setting gendisk->flags accordingly.
+		 */
+ zso->zvo_disk->minors = 1;
+#if defined(GENHD_FL_EXT_DEVT)
+ zso->zvo_disk->flags &= ~GENHD_FL_EXT_DEVT;
+#endif
+#if defined(GENHD_FL_NO_PART_SCAN)
+ zso->zvo_disk->flags |= GENHD_FL_NO_PART_SCAN;
+#endif
+ }
+ zso->zvo_disk->first_minor = (dev & MINORMASK);
+ zso->zvo_disk->fops = &zvol_ops;
+ zso->zvo_disk->private_data = zv;
+ zso->zvo_disk->queue = zso->zvo_queue;
+ snprintf(zso->zvo_disk->disk_name, DISK_NAME_LEN, "%s%d",
+ ZVOL_DEV_NAME, (dev & MINORMASK));
+
+ return (zv);
+
+out_queue:
+ blk_cleanup_queue(zso->zvo_queue);
+out_kmem:
+ kmem_free(zso, sizeof (struct zvol_state_os));
+ kmem_free(zv, sizeof (zvol_state_t));
+ return (NULL);
+}
+
+/*
+ * Cleanup then free a zvol_state_t which was created by zvol_alloc().
+ * At this time, the structure is not opened by anyone, is taken off
+ * the zvol_state_list, and has its private data set to NULL.
+ * The zvol_state_lock is dropped.
+ *
+ * This function may take many milliseconds to complete (e.g. we've seen
+ * it take over 256ms), due to the calls to "blk_cleanup_queue" and
+ * "del_gendisk". Thus, consumers need to be careful to account for this
+ * latency when calling this function.
+ */
+static void
+zvol_free(zvol_state_t *zv)
+{
+
+ ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
+ ASSERT(!MUTEX_HELD(&zv->zv_state_lock));
+ ASSERT0(zv->zv_open_count);
+ ASSERT3P(zv->zv_zso->zvo_disk->private_data, ==, NULL);
+
+ rw_destroy(&zv->zv_suspend_lock);
+ zfs_rangelock_fini(&zv->zv_rangelock);
+
+ del_gendisk(zv->zv_zso->zvo_disk);
+ blk_cleanup_queue(zv->zv_zso->zvo_queue);
+ put_disk(zv->zv_zso->zvo_disk);
+
+ ida_simple_remove(&zvol_ida,
+ MINOR(zv->zv_zso->zvo_dev) >> ZVOL_MINOR_BITS);
+
+ mutex_destroy(&zv->zv_state_lock);
+ dataset_kstats_destroy(&zv->zv_kstat);
+
+ kmem_free(zv->zv_zso, sizeof (struct zvol_state_os));
+ kmem_free(zv, sizeof (zvol_state_t));
+}
+
+void
+zvol_wait_close(zvol_state_t *zv)
+{
+}
+
+/*
+ * Create a block device minor node and setup the linkage between it
+ * and the specified volume. Once this function returns the block
+ * device is live and ready for use.
+ */
+static int
+zvol_os_create_minor(const char *name)
+{
+ zvol_state_t *zv;
+ objset_t *os;
+ dmu_object_info_t *doi;
+ uint64_t volsize;
+ uint64_t len;
+ unsigned minor = 0;
+ int error = 0;
+ int idx;
+ uint64_t hash = zvol_name_hash(name);
+
+ if (zvol_inhibit_dev)
+ return (0);
+
+ idx = ida_simple_get(&zvol_ida, 0, 0, kmem_flags_convert(KM_SLEEP));
+ if (idx < 0)
+ return (SET_ERROR(-idx));
+ minor = idx << ZVOL_MINOR_BITS;
+
+ zv = zvol_find_by_name_hash(name, hash, RW_NONE);
+ if (zv) {
+ ASSERT(MUTEX_HELD(&zv->zv_state_lock));
+ mutex_exit(&zv->zv_state_lock);
+ ida_simple_remove(&zvol_ida, idx);
+ return (SET_ERROR(EEXIST));
+ }
+
+ doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP);
+
+ error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os);
+ if (error)
+ goto out_doi;
+
+ error = dmu_object_info(os, ZVOL_OBJ, doi);
+ if (error)
+ goto out_dmu_objset_disown;
+
+ error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
+ if (error)
+ goto out_dmu_objset_disown;
+
+ zv = zvol_alloc(MKDEV(zvol_major, minor), name);
+ if (zv == NULL) {
+ error = SET_ERROR(EAGAIN);
+ goto out_dmu_objset_disown;
+ }
+ zv->zv_hash = hash;
+
+ if (dmu_objset_is_snapshot(os))
+ zv->zv_flags |= ZVOL_RDONLY;
+
+ zv->zv_volblocksize = doi->doi_data_block_size;
+ zv->zv_volsize = volsize;
+ zv->zv_objset = os;
+
+ set_capacity(zv->zv_zso->zvo_disk, zv->zv_volsize >> 9);
+
+ blk_queue_max_hw_sectors(zv->zv_zso->zvo_queue,
+ (DMU_MAX_ACCESS / 4) >> 9);
+ blk_queue_max_segments(zv->zv_zso->zvo_queue, UINT16_MAX);
+ blk_queue_max_segment_size(zv->zv_zso->zvo_queue, UINT_MAX);
+ blk_queue_physical_block_size(zv->zv_zso->zvo_queue,
+ zv->zv_volblocksize);
+ blk_queue_io_opt(zv->zv_zso->zvo_queue, zv->zv_volblocksize);
+ blk_queue_max_discard_sectors(zv->zv_zso->zvo_queue,
+ (zvol_max_discard_blocks * zv->zv_volblocksize) >> 9);
+ blk_queue_discard_granularity(zv->zv_zso->zvo_queue,
+ zv->zv_volblocksize);
+ blk_queue_flag_set(QUEUE_FLAG_DISCARD, zv->zv_zso->zvo_queue);
+#ifdef QUEUE_FLAG_NONROT
+ blk_queue_flag_set(QUEUE_FLAG_NONROT, zv->zv_zso->zvo_queue);
+#endif
+#ifdef QUEUE_FLAG_ADD_RANDOM
+ blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, zv->zv_zso->zvo_queue);
+#endif
+ /* This flag was introduced in kernel version 4.12. */
+#ifdef QUEUE_FLAG_SCSI_PASSTHROUGH
+ blk_queue_flag_set(QUEUE_FLAG_SCSI_PASSTHROUGH, zv->zv_zso->zvo_queue);
+#endif
+
+ if (spa_writeable(dmu_objset_spa(os))) {
+ if (zil_replay_disable)
+ zil_destroy(dmu_objset_zil(os), B_FALSE);
+ else
+ zil_replay(os, zv, zvol_replay_vector);
+ }
+ ASSERT3P(zv->zv_kstat.dk_kstats, ==, NULL);
+ dataset_kstats_create(&zv->zv_kstat, zv->zv_objset);
+
+ /*
+ * When udev detects the addition of the device it will immediately
+ * invoke blkid(8) to determine the type of content on the device.
+ * Prefetching the blocks commonly scanned by blkid(8) will speed
+ * up this process.
+ */
+ len = MIN(MAX(zvol_prefetch_bytes, 0), SPA_MAXBLOCKSIZE);
+ if (len > 0) {
+ dmu_prefetch(os, ZVOL_OBJ, 0, 0, len, ZIO_PRIORITY_SYNC_READ);
+ dmu_prefetch(os, ZVOL_OBJ, 0, volsize - len, len,
+ ZIO_PRIORITY_SYNC_READ);
+ }
+
+ zv->zv_objset = NULL;
+out_dmu_objset_disown:
+ dmu_objset_disown(os, B_TRUE, FTAG);
+out_doi:
+ kmem_free(doi, sizeof (dmu_object_info_t));
+
+ /*
+ * Keep in mind that once add_disk() is called, the zvol is
+ * announced to the world, and zvol_open()/zvol_release() can
+ * be called at any time. Incidentally, add_disk() itself calls
+ * zvol_open()->zvol_first_open() and zvol_release()->zvol_last_close()
+ * directly as well.
+ */
+ if (error == 0) {
+ rw_enter(&zvol_state_lock, RW_WRITER);
+ zvol_insert(zv);
+ rw_exit(&zvol_state_lock);
+ add_disk(zv->zv_zso->zvo_disk);
+ } else {
+ ida_simple_remove(&zvol_ida, idx);
+ }
+
+ return (error);
+}
+
+static void
+zvol_rename_minor(zvol_state_t *zv, const char *newname)
+{
+ int readonly = get_disk_ro(zv->zv_zso->zvo_disk);
+
+ ASSERT(RW_LOCK_HELD(&zvol_state_lock));
+ ASSERT(MUTEX_HELD(&zv->zv_state_lock));
+
+ strlcpy(zv->zv_name, newname, sizeof (zv->zv_name));
+
+ /* move to new hashtable entry */
+ zv->zv_hash = zvol_name_hash(zv->zv_name);
+ hlist_del(&zv->zv_hlink);
+ hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash));
+
+ /*
+ * The block device's read-only state is briefly changed causing
+ * a KOBJ_CHANGE uevent to be issued. This ensures udev detects
+ * the name change and fixes the symlinks. This does not change
+ * ZVOL_RDONLY in zv->zv_flags so the actual read-only state never
+ * changes. This would normally be done using kobject_uevent() but
+ * that is a GPL-only symbol which is why we need this workaround.
+ */
+ set_disk_ro(zv->zv_zso->zvo_disk, !readonly);
+ set_disk_ro(zv->zv_zso->zvo_disk, readonly);
+}
+
+static void
+zvol_set_disk_ro_impl(zvol_state_t *zv, int flags)
+{
+
+ set_disk_ro(zv->zv_zso->zvo_disk, flags);
+}
+
+static void
+zvol_set_capacity_impl(zvol_state_t *zv, uint64_t capacity)
+{
+
+ set_capacity(zv->zv_zso->zvo_disk, capacity);
+}
+
+const static zvol_platform_ops_t zvol_linux_ops = {
+ .zv_free = zvol_free,
+ .zv_rename_minor = zvol_rename_minor,
+ .zv_create_minor = zvol_os_create_minor,
+ .zv_update_volsize = zvol_update_volsize,
+ .zv_clear_private = zvol_clear_private,
+ .zv_is_zvol = zvol_is_zvol_impl,
+ .zv_set_disk_ro = zvol_set_disk_ro_impl,
+ .zv_set_capacity = zvol_set_capacity_impl,
+};
+
+int
+zvol_init(void)
+{
+ int error;
+ int threads = MIN(MAX(zvol_threads, 1), 1024);
+
+ error = register_blkdev(zvol_major, ZVOL_DRIVER);
+ if (error) {
+ printk(KERN_INFO "ZFS: register_blkdev() failed %d\n", error);
+ return (error);
+ }
+ zvol_taskq = taskq_create(ZVOL_DRIVER, threads, maxclsyspri,
+ threads * 2, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
+ if (zvol_taskq == NULL) {
+ unregister_blkdev(zvol_major, ZVOL_DRIVER);
+ return (-ENOMEM);
+ }
+ zvol_init_impl();
+ ida_init(&zvol_ida);
+ zvol_register_ops(&zvol_linux_ops);
+ return (0);
+}
+
+void
+zvol_fini(void)
+{
+ zvol_fini_impl();
+ unregister_blkdev(zvol_major, ZVOL_DRIVER);
+ taskq_destroy(zvol_taskq);
+ ida_destroy(&zvol_ida);
+}
+
+/* BEGIN CSTYLED */
+module_param(zvol_inhibit_dev, uint, 0644);
+MODULE_PARM_DESC(zvol_inhibit_dev, "Do not create zvol device nodes");
+
+module_param(zvol_major, uint, 0444);
+MODULE_PARM_DESC(zvol_major, "Major number for zvol device");
+
+module_param(zvol_threads, uint, 0444);
+MODULE_PARM_DESC(zvol_threads, "Max number of threads to handle I/O requests");
+
+module_param(zvol_request_sync, uint, 0644);
+MODULE_PARM_DESC(zvol_request_sync, "Synchronously handle bio requests");
+
+module_param(zvol_max_discard_blocks, ulong, 0444);
+MODULE_PARM_DESC(zvol_max_discard_blocks, "Max number of blocks to discard");
+
+module_param(zvol_prefetch_bytes, uint, 0644);
+MODULE_PARM_DESC(zvol_prefetch_bytes, "Prefetch N bytes at zvol start+end");
+
+module_param(zvol_volmode, uint, 0644);
+MODULE_PARM_DESC(zvol_volmode, "Default volmode property value");
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/spl/Makefile.in b/sys/contrib/openzfs/module/spl/Makefile.in
new file mode 100644
index 000000000000..cedbfe92b58a
--- /dev/null
+++ b/sys/contrib/openzfs/module/spl/Makefile.in
@@ -0,0 +1,13 @@
+ifneq ($(KBUILD_EXTMOD),)
+src = @abs_srcdir@
+obj = @abs_builddir@
+mfdir = $(obj)
+else
+mfdir = $(srctree)/$(src)
+endif
+
+MODULE := spl
+
+obj-$(CONFIG_ZFS) := $(MODULE).o
+
+include $(mfdir)/../os/linux/spl/Makefile
diff --git a/sys/contrib/openzfs/module/unicode/Makefile.in b/sys/contrib/openzfs/module/unicode/Makefile.in
new file mode 100644
index 000000000000..59c07c4555b7
--- /dev/null
+++ b/sys/contrib/openzfs/module/unicode/Makefile.in
@@ -0,0 +1,11 @@
+ifneq ($(KBUILD_EXTMOD),)
+src = @abs_srcdir@
+obj = @abs_builddir@
+endif
+
+MODULE := zunicode
+
+obj-$(CONFIG_ZFS) := $(MODULE).o
+
+$(MODULE)-objs += u8_textprep.o
+$(MODULE)-objs += uconv.o
diff --git a/sys/contrib/openzfs/module/unicode/u8_textprep.c b/sys/contrib/openzfs/module/unicode/u8_textprep.c
new file mode 100644
index 000000000000..be816d728359
--- /dev/null
+++ b/sys/contrib/openzfs/module/unicode/u8_textprep.c
@@ -0,0 +1,2151 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+
+
+
+/*
+ * UTF-8 text preparation functions (PSARC/2007/149, PSARC/2007/458).
+ *
+ * Man pages: u8_textprep_open(9F), u8_textprep_buf(9F), u8_textprep_close(9F),
+ * u8_textprep_str(9F), u8_strcmp(9F), and u8_validate(9F). See also
+ * the section 3C man pages.
+ * Interface stability: Committed.
+ */
+
+#include <sys/types.h>
+#include <sys/strings.h>
+#include <sys/param.h>
+#include <sys/sysmacros.h>
+#include <sys/debug.h>
+#include <sys/kmem.h>
+#include <sys/sunddi.h>
+#include <sys/u8_textprep.h>
+#include <sys/byteorder.h>
+#include <sys/errno.h>
+#include <sys/u8_textprep_data.h>
+#include <sys/mod.h>
+
+/* The maximum possible number of bytes in a UTF-8 character. */
+#define U8_MB_CUR_MAX (4)
+
+/*
+ * The maximum number of bytes needed for a UTF-8 character to cover
+ * U+0000 - U+FFFF, i.e., the coding space of now deprecated UCS-2.
+ */
+#define U8_MAX_BYTES_UCS2 (3)
+
+/* The maximum possible number of bytes in a Stream-Safe Text. */
+#define U8_STREAM_SAFE_TEXT_MAX (128)
+
+/*
+ * The maximum number of characters in a combining/conjoining sequence and
+ * the actual upper bound limit of a combining/conjoining sequence.
+ */
+#define U8_MAX_CHARS_A_SEQ (32)
+#define U8_UPPER_LIMIT_IN_A_SEQ (31)
+
+/* The combining class value for Starter. */
+#define U8_COMBINING_CLASS_STARTER (0)
+
+/*
+ * Some Hangul related macros follow.
+ *
+ * The first and the last of Hangul syllables, Hangul Jamo Leading consonants,
+ * Vowels, and optional Trailing consonants in Unicode scalar values.
+ *
+ * Note that U8_HANGUL_JAMO_T_FIRST below is 0x11A7, not the actual U+11A8.
+ * Because the trailing consonant is optional, we pre-calculate by
+ * subtracting one.
+ *
+ * Each of the 19 modern leading consonants yields 588 possible syllables,
+ * since Hangul has 21 modern vowels and 27 modern trailing consonants plus
+ * 1 for the no-trailing-consonant case, i.e., 21 x 28 = 588.
+ *
+ * A number of other Hangul related macros are also defined below. Bear in
+ * mind that U8_HANGUL_JAMO_1ST_BYTE can be used to check whether a byte
+ * could start a Hangul Jamo, but it does not guarantee that it does; it
+ * only makes it likely.
+ */
+#define U8_HANGUL_SYL_FIRST (0xAC00U)
+#define U8_HANGUL_SYL_LAST (0xD7A3U)
+
+#define U8_HANGUL_JAMO_L_FIRST (0x1100U)
+#define U8_HANGUL_JAMO_L_LAST (0x1112U)
+#define U8_HANGUL_JAMO_V_FIRST (0x1161U)
+#define U8_HANGUL_JAMO_V_LAST (0x1175U)
+#define U8_HANGUL_JAMO_T_FIRST (0x11A7U)
+#define U8_HANGUL_JAMO_T_LAST (0x11C2U)
+
+#define U8_HANGUL_V_COUNT (21)
+#define U8_HANGUL_VT_COUNT (588)
+#define U8_HANGUL_T_COUNT (28)
+
+#define U8_HANGUL_JAMO_1ST_BYTE (0xE1U)
+
+#define U8_SAVE_HANGUL_AS_UTF8(s, i, j, k, b) \
+ (s)[(i)] = (uchar_t)(0xE0U | ((uint32_t)(b) & 0xF000U) >> 12); \
+ (s)[(j)] = (uchar_t)(0x80U | ((uint32_t)(b) & 0x0FC0U) >> 6); \
+ (s)[(k)] = (uchar_t)(0x80U | ((uint32_t)(b) & 0x003FU));
+
+#define U8_HANGUL_JAMO_L(u) \
+ ((u) >= U8_HANGUL_JAMO_L_FIRST && (u) <= U8_HANGUL_JAMO_L_LAST)
+
+#define U8_HANGUL_JAMO_V(u) \
+ ((u) >= U8_HANGUL_JAMO_V_FIRST && (u) <= U8_HANGUL_JAMO_V_LAST)
+
+#define U8_HANGUL_JAMO_T(u) \
+ ((u) > U8_HANGUL_JAMO_T_FIRST && (u) <= U8_HANGUL_JAMO_T_LAST)
+
+#define U8_HANGUL_JAMO(u) \
+ ((u) >= U8_HANGUL_JAMO_L_FIRST && (u) <= U8_HANGUL_JAMO_T_LAST)
+
+#define U8_HANGUL_SYLLABLE(u) \
+ ((u) >= U8_HANGUL_SYL_FIRST && (u) <= U8_HANGUL_SYL_LAST)
+
+#define U8_HANGUL_COMPOSABLE_L_V(s, u) \
+ ((s) == U8_STATE_HANGUL_L && U8_HANGUL_JAMO_V((u)))
+
+#define U8_HANGUL_COMPOSABLE_LV_T(s, u) \
+ ((s) == U8_STATE_HANGUL_LV && U8_HANGUL_JAMO_T((u)))
+
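+/*
+ * As a worked example of the constants above: composing the Jamo sequence
+ * U+1100 (leading consonant index 0), U+1161 (vowel index 0) and U+11A8
+ * (trailing consonant index 1, relative to U8_HANGUL_JAMO_T_FIRST) yields
+ * the precomposed syllable
+ *
+ *	0xAC00 + (0 * U8_HANGUL_V_COUNT + 0) * U8_HANGUL_T_COUNT + 1 = 0xAC01
+ *
+ * i.e. U+AC01, consistent with U8_HANGUL_VT_COUNT = 21 * 28 = 588 possible
+ * syllables per leading consonant.
+ */
+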
+/* The types of decomposition mappings. */
+#define U8_DECOMP_BOTH (0xF5U)
+#define U8_DECOMP_CANONICAL (0xF6U)
+
+/* The indicator for 16-bit table. */
+#define U8_16BIT_TABLE_INDICATOR (0x8000U)
+
+/* The following are some convenience macros. */
+#define U8_PUT_3BYTES_INTO_UTF32(u, b1, b2, b3) \
+ (u) = ((((uint32_t)(b1) & 0x0F) << 12) | \
+ (((uint32_t)(b2) & 0x3F) << 6) | \
+ ((uint32_t)(b3) & 0x3F));
+
+#define U8_SIMPLE_SWAP(a, b, t) \
+ (t) = (a); \
+ (a) = (b); \
+ (b) = (t);
+
+#define U8_ASCII_TOUPPER(c) \
+ (((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 'A' : (c))
+
+#define U8_ASCII_TOLOWER(c) \
+ (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' + 'a' : (c))
+
+#define U8_ISASCII(c) (((uchar_t)(c)) < 0x80U)
+/*
+ * The following macro assumes that the two characters that are to be
+ * swapped are adjacent to each other and 'a' comes before 'b'.
+ *
+ * If these assumptions are not met, the macro will fail.
+ */
+#define U8_SWAP_COMB_MARKS(a, b) \
+ for (k = 0; k < disp[(a)]; k++) \
+ u8t[k] = u8s[start[(a)] + k]; \
+ for (k = 0; k < disp[(b)]; k++) \
+ u8s[start[(a)] + k] = u8s[start[(b)] + k]; \
+ start[(b)] = start[(a)] + disp[(b)]; \
+ for (k = 0; k < disp[(a)]; k++) \
+ u8s[start[(b)] + k] = u8t[k]; \
+ U8_SIMPLE_SWAP(comb_class[(a)], comb_class[(b)], tc); \
+ U8_SIMPLE_SWAP(disp[(a)], disp[(b)], tc);
+
+/* The possible states during normalization. */
+typedef enum {
+ U8_STATE_START = 0,
+ U8_STATE_HANGUL_L = 1,
+ U8_STATE_HANGUL_LV = 2,
+ U8_STATE_HANGUL_LVT = 3,
+ U8_STATE_HANGUL_V = 4,
+ U8_STATE_HANGUL_T = 5,
+ U8_STATE_COMBINING_MARK = 6
+} u8_normalization_states_t;
+
+/*
+ * The three vectors below are used to check that the bytes of a given UTF-8
+ * character are valid and do not contain any malformed byte values.
+ *
+ * UTF-8 originally allowed a rather relaxed binary representation, but
+ * security-related issues led the Unicode Consortium to define and announce
+ * the UTF-8 Corrigendum in Unicode 3.1 and to refine it once more in
+ * Unicode 3.2. The following three tables are based on that.
+ */
+
+#define U8_ILLEGAL_NEXT_BYTE_COMMON(c) ((c) < 0x80 || (c) > 0xBF)
+
+#define I_ U8_ILLEGAL_CHAR
+#define O_ U8_OUT_OF_RANGE_CHAR
+
+const int8_t u8_number_of_bytes[0x100] = {
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+
+/* 80 81 82 83 84 85 86 87 88 89 8A 8B 8C 8D 8E 8F */
+ I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_,
+
+/* 90 91 92 93 94 95 96 97 98 99 9A 9B 9C 9D 9E 9F */
+ I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_,
+
+/* A0 A1 A2 A3 A4 A5 A6 A7 A8 A9 AA AB AC AD AE AF */
+ I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_,
+
+/* B0 B1 B2 B3 B4 B5 B6 B7 B8 B9 BA BB BC BD BE BF */
+ I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_,
+
+/* C0 C1 C2 C3 C4 C5 C6 C7 C8 C9 CA CB CC CD CE CF */
+ I_, I_, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+
+/* D0 D1 D2 D3 D4 D5 D6 D7 D8 D9 DA DB DC DD DE DF */
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+
+/* E0 E1 E2 E3 E4 E5 E6 E7 E8 E9 EA EB EC ED EE EF */
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+
+/* F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 FA FB FC FD FE FF */
+ 4, 4, 4, 4, 4, O_, O_, O_, O_, O_, O_, O_, O_, O_, O_, O_,
+};
+
+#undef I_
+#undef O_
+
+const uint8_t u8_valid_min_2nd_byte[0x100] = {
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+/* C0 C1 C2 C3 C4 C5 C6 C7 */
+ 0, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+/* C8 C9 CA CB CC CD CE CF */
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+/* D0 D1 D2 D3 D4 D5 D6 D7 */
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+/* D8 D9 DA DB DC DD DE DF */
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+/* E0 E1 E2 E3 E4 E5 E6 E7 */
+ 0xa0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+/* E8 E9 EA EB EC ED EE EF */
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+/* F0 F1 F2 F3 F4 F5 F6 F7 */
+ 0x90, 0x80, 0x80, 0x80, 0x80, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+};
+
+const uint8_t u8_valid_max_2nd_byte[0x100] = {
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+/* C0 C1 C2 C3 C4 C5 C6 C7 */
+ 0, 0, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
+/* C8 C9 CA CB CC CD CE CF */
+ 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
+/* D0 D1 D2 D3 D4 D5 D6 D7 */
+ 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
+/* D8 D9 DA DB DC DD DE DF */
+ 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
+/* E0 E1 E2 E3 E4 E5 E6 E7 */
+ 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
+/* E8 E9 EA EB EC ED EE EF */
+ 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0x9f, 0xbf, 0xbf,
+/* F0 F1 F2 F3 F4 F5 F6 F7 */
+ 0xbf, 0xbf, 0xbf, 0xbf, 0x8f, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+};
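+
+/*
+ * For example, for the 2-byte sequence 0xC3 0xA9 (U+00E9),
+ * u8_number_of_bytes[0xC3] is 2 and 0xA9 lies within the [0x80, 0xBF]
+ * range allowed for a 0xC3 lead byte. The tighter bounds reject well-known
+ * bad forms: a 0xE0 lead byte requires at least 0xA0 and a 0xF0 lead byte
+ * at least 0x90 (overlong encodings), a 0xED lead byte allows at most 0x9F
+ * (UTF-16 surrogate code points), and a 0xF4 lead byte allows at most 0x8F
+ * (code points beyond U+10FFFF).
+ */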
+
+
+/*
+ * The u8_validate() function validates the given UTF-8 character string and
+ * calculates its byte length. It is quite similar to mblen(3C) except that
+ * it will also validate against a list of characters if requested and is
+ * specific to UTF-8 and Unicode.
+ */
+int
+u8_validate(const char *u8str, size_t n, char **list, int flag, int *errnum)
+{
+ uchar_t *ib;
+ uchar_t *ibtail;
+ uchar_t **p;
+ uchar_t *s1;
+ uchar_t *s2;
+ uchar_t f;
+ int sz;
+ size_t i;
+ int ret_val;
+ boolean_t second;
+ boolean_t no_need_to_validate_entire;
+ boolean_t check_additional;
+ boolean_t validate_ucs2_range_only;
+
+ if (! u8str)
+ return (0);
+
+ ib = (uchar_t *)u8str;
+ ibtail = ib + n;
+
+ ret_val = 0;
+
+ no_need_to_validate_entire = ! (flag & U8_VALIDATE_ENTIRE);
+ check_additional = flag & U8_VALIDATE_CHECK_ADDITIONAL;
+ validate_ucs2_range_only = flag & U8_VALIDATE_UCS2_RANGE;
+
+ while (ib < ibtail) {
+ /*
+ * The first byte of a UTF-8 character tells how many
+ * bytes will follow for the character. If the first byte
+ * is an illegal byte value or out of range value, we just
+ * return -1 with an appropriate error number.
+ */
+ sz = u8_number_of_bytes[*ib];
+ if (sz == U8_ILLEGAL_CHAR) {
+ *errnum = EILSEQ;
+ return (-1);
+ }
+
+ if (sz == U8_OUT_OF_RANGE_CHAR ||
+ (validate_ucs2_range_only && sz > U8_MAX_BYTES_UCS2)) {
+ *errnum = ERANGE;
+ return (-1);
+ }
+
+ /*
+ * If we don't have enough bytes to check on, that's also
+ * an error. As you can see, we give illegal byte sequence
+		 * checking higher priority than EINVAL cases.
+ */
+ if ((ibtail - ib) < sz) {
+ *errnum = EINVAL;
+ return (-1);
+ }
+
+ if (sz == 1) {
+ ib++;
+ ret_val++;
+ } else {
+ /*
+ * Check on the multi-byte UTF-8 character. For more
+ * details on this, see comment added for the used
+ * data structures at the beginning of the file.
+ */
+ f = *ib++;
+ ret_val++;
+ second = B_TRUE;
+ for (i = 1; i < sz; i++) {
+ if (second) {
+ if (*ib < u8_valid_min_2nd_byte[f] ||
+ *ib > u8_valid_max_2nd_byte[f]) {
+ *errnum = EILSEQ;
+ return (-1);
+ }
+ second = B_FALSE;
+ } else if (U8_ILLEGAL_NEXT_BYTE_COMMON(*ib)) {
+ *errnum = EILSEQ;
+ return (-1);
+ }
+ ib++;
+ ret_val++;
+ }
+ }
+
+ if (check_additional) {
+ for (p = (uchar_t **)list, i = 0; p[i]; i++) {
+ s1 = ib - sz;
+ s2 = p[i];
+ while (s1 < ib) {
+ if (*s1 != *s2 || *s2 == '\0')
+ break;
+ s1++;
+ s2++;
+ }
+
+ if (s1 >= ib && *s2 == '\0') {
+ *errnum = EBADF;
+ return (-1);
+ }
+ }
+ }
+
+ if (no_need_to_validate_entire)
+ break;
+ }
+
+ return (ret_val);
+}
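+
+/*
+ * A minimal usage sketch (the buffer content is illustrative): validating
+ * the 5-byte UTF-8 string "caf\xC3\xA9" in its entirety with
+ *
+ *	int err;
+ *	int len = u8_validate("caf\xC3\xA9", 5, NULL, U8_VALIDATE_ENTIRE,
+ *	    &err);
+ *
+ * yields len == 5. Without U8_VALIDATE_ENTIRE only the first character is
+ * checked and 1 is returned. NULL is acceptable for the list since
+ * U8_VALIDATE_CHECK_ADDITIONAL is not requested.
+ */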
+
+/*
+ * The do_case_conv() function looks at the mapping tables and returns the
+ * mapped bytes if any are found. If not found, the input bytes are returned.
+ * The function always terminates the returned bytes with a null character,
+ * assuming that there is plenty of room to do so.
+ *
+ * The case conversions are simple case conversions mapping a character to
+ * another character as specified in the Unicode data. The byte size of
+ * the mapped character could be different from that of the input character.
+ *
+ * The return value is the byte length of the returned character excluding
+ * the terminating null byte.
+ */
+static size_t
+do_case_conv(int uv, uchar_t *u8s, uchar_t *s, int sz, boolean_t is_it_toupper)
+{
+ size_t i;
+ uint16_t b1 = 0;
+ uint16_t b2 = 0;
+ uint16_t b3 = 0;
+ uint16_t b3_tbl;
+ uint16_t b3_base;
+ uint16_t b4 = 0;
+ size_t start_id;
+ size_t end_id;
+
+ /*
+ * At this point, the only possible values for sz are 2, 3, and 4.
+ * The u8s should point to a vector that is well beyond the size of
+ * 5 bytes.
+ */
+ if (sz == 2) {
+ b3 = u8s[0] = s[0];
+ b4 = u8s[1] = s[1];
+ } else if (sz == 3) {
+ b2 = u8s[0] = s[0];
+ b3 = u8s[1] = s[1];
+ b4 = u8s[2] = s[2];
+ } else if (sz == 4) {
+ b1 = u8s[0] = s[0];
+ b2 = u8s[1] = s[1];
+ b3 = u8s[2] = s[2];
+ b4 = u8s[3] = s[3];
+ } else {
+ /* This is not possible but just in case as a fallback. */
+ if (is_it_toupper)
+ *u8s = U8_ASCII_TOUPPER(*s);
+ else
+ *u8s = U8_ASCII_TOLOWER(*s);
+ u8s[1] = '\0';
+
+ return (1);
+ }
+ u8s[sz] = '\0';
+
+ /*
+ * Let's find out if we have a corresponding character.
+ */
+ b1 = u8_common_b1_tbl[uv][b1];
+ if (b1 == U8_TBL_ELEMENT_NOT_DEF)
+ return ((size_t)sz);
+
+ b2 = u8_case_common_b2_tbl[uv][b1][b2];
+ if (b2 == U8_TBL_ELEMENT_NOT_DEF)
+ return ((size_t)sz);
+
+ if (is_it_toupper) {
+ b3_tbl = u8_toupper_b3_tbl[uv][b2][b3].tbl_id;
+ if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF)
+ return ((size_t)sz);
+
+ start_id = u8_toupper_b4_tbl[uv][b3_tbl][b4];
+ end_id = u8_toupper_b4_tbl[uv][b3_tbl][b4 + 1];
+
+ /* Either there is no match or an error at the table. */
+ if (start_id >= end_id || (end_id - start_id) > U8_MB_CUR_MAX)
+ return ((size_t)sz);
+
+ b3_base = u8_toupper_b3_tbl[uv][b2][b3].base;
+
+ for (i = 0; start_id < end_id; start_id++)
+ u8s[i++] = u8_toupper_final_tbl[uv][b3_base + start_id];
+ } else {
+ b3_tbl = u8_tolower_b3_tbl[uv][b2][b3].tbl_id;
+ if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF)
+ return ((size_t)sz);
+
+ start_id = u8_tolower_b4_tbl[uv][b3_tbl][b4];
+ end_id = u8_tolower_b4_tbl[uv][b3_tbl][b4 + 1];
+
+ if (start_id >= end_id || (end_id - start_id) > U8_MB_CUR_MAX)
+ return ((size_t)sz);
+
+ b3_base = u8_tolower_b3_tbl[uv][b2][b3].base;
+
+ for (i = 0; start_id < end_id; start_id++)
+ u8s[i++] = u8_tolower_final_tbl[uv][b3_base + start_id];
+ }
+
+ /*
+ * If i is still zero, that means there is no corresponding character.
+ */
+ if (i == 0)
+ return ((size_t)sz);
+
+ u8s[i] = '\0';
+
+ return (i);
+}
+
+/*
+ * The do_case_compare() function compares the two input strings, s1 and s2,
+ * one character at a time, doing case conversions if applicable, and returns
+ * the comparison result like strcmp() does.
+ *
+ * Since, empirically, most text data consists of 7-bit ASCII characters,
+ * we treat 7-bit ASCII characters as a special case to yield faster
+ * processing time.
+ */
+static int
+do_case_compare(size_t uv, uchar_t *s1, uchar_t *s2, size_t n1,
+ size_t n2, boolean_t is_it_toupper, int *errnum)
+{
+ int f;
+ int sz1;
+ int sz2;
+ size_t j;
+ size_t i1;
+ size_t i2;
+ uchar_t u8s1[U8_MB_CUR_MAX + 1];
+ uchar_t u8s2[U8_MB_CUR_MAX + 1];
+
+ i1 = i2 = 0;
+ while (i1 < n1 && i2 < n2) {
+ /*
+		 * Find out what the byte length would be for this UTF-8
+		 * character in string s1, and also find out whether this is
+		 * an illegal start byte; if so, issue a proper
+		 * error number yet still treat this byte as a character.
+ */
+ sz1 = u8_number_of_bytes[*s1];
+ if (sz1 < 0) {
+ *errnum = EILSEQ;
+ sz1 = 1;
+ }
+
+ /*
+ * For 7-bit ASCII characters mainly, we do a quick case
+		 * conversion right here.
+ *
+ * If we don't have enough bytes for this character, issue
+ * an EINVAL error and use what are available.
+ *
+ * If we have enough bytes, find out if there is
+ * a corresponding uppercase character and if so, copy over
+ * the bytes for a comparison later. If there is no
+ * corresponding uppercase character, then, use what we have
+ * for the comparison.
+ */
+ if (sz1 == 1) {
+ if (is_it_toupper)
+ u8s1[0] = U8_ASCII_TOUPPER(*s1);
+ else
+ u8s1[0] = U8_ASCII_TOLOWER(*s1);
+ s1++;
+ u8s1[1] = '\0';
+ } else if ((i1 + sz1) > n1) {
+ *errnum = EINVAL;
+ for (j = 0; (i1 + j) < n1; )
+ u8s1[j++] = *s1++;
+ u8s1[j] = '\0';
+ } else {
+ (void) do_case_conv(uv, u8s1, s1, sz1, is_it_toupper);
+ s1 += sz1;
+ }
+
+ /* Do the same for the string s2. */
+ sz2 = u8_number_of_bytes[*s2];
+ if (sz2 < 0) {
+ *errnum = EILSEQ;
+ sz2 = 1;
+ }
+
+ if (sz2 == 1) {
+ if (is_it_toupper)
+ u8s2[0] = U8_ASCII_TOUPPER(*s2);
+ else
+ u8s2[0] = U8_ASCII_TOLOWER(*s2);
+ s2++;
+ u8s2[1] = '\0';
+ } else if ((i2 + sz2) > n2) {
+ *errnum = EINVAL;
+ for (j = 0; (i2 + j) < n2; )
+ u8s2[j++] = *s2++;
+ u8s2[j] = '\0';
+ } else {
+ (void) do_case_conv(uv, u8s2, s2, sz2, is_it_toupper);
+ s2 += sz2;
+ }
+
+ /* Now compare the two characters. */
+ if (sz1 == 1 && sz2 == 1) {
+ if (*u8s1 > *u8s2)
+ return (1);
+ if (*u8s1 < *u8s2)
+ return (-1);
+ } else {
+ f = strcmp((const char *)u8s1, (const char *)u8s2);
+ if (f != 0)
+ return (f);
+ }
+
+ /*
+ * They were the same. Let's move on to the next
+ * characters then.
+ */
+ i1 += sz1;
+ i2 += sz2;
+ }
+
+ /*
+ * We compared until the end of either or both strings.
+ *
+	 * If we reached or went past the end of both, that means
+	 * they are the same.
+	 *
+	 * If we reached only one of the two ends, that means the other string
+	 * still has something left, which can be used to determine
+	 * the return value.
+ */
+ if (i1 >= n1) {
+ if (i2 >= n2)
+ return (0);
+ return (-1);
+ }
+ return (1);
+}
+
+/*
+ * The combining_class() function checks the given bytes and finds the
+ * corresponding Unicode combining class value. The return value 0 means
+ * it is a Starter. Any illegal UTF-8 character will also be treated as
+ * a Starter.
+ */
+static uchar_t
+combining_class(size_t uv, uchar_t *s, size_t sz)
+{
+ uint16_t b1 = 0;
+ uint16_t b2 = 0;
+ uint16_t b3 = 0;
+ uint16_t b4 = 0;
+
+ if (sz == 1 || sz > 4)
+ return (0);
+
+ if (sz == 2) {
+ b3 = s[0];
+ b4 = s[1];
+ } else if (sz == 3) {
+ b2 = s[0];
+ b3 = s[1];
+ b4 = s[2];
+ } else if (sz == 4) {
+ b1 = s[0];
+ b2 = s[1];
+ b3 = s[2];
+ b4 = s[3];
+ }
+
+ b1 = u8_common_b1_tbl[uv][b1];
+ if (b1 == U8_TBL_ELEMENT_NOT_DEF)
+ return (0);
+
+ b2 = u8_combining_class_b2_tbl[uv][b1][b2];
+ if (b2 == U8_TBL_ELEMENT_NOT_DEF)
+ return (0);
+
+ b3 = u8_combining_class_b3_tbl[uv][b2][b3];
+ if (b3 == U8_TBL_ELEMENT_NOT_DEF)
+ return (0);
+
+ return (u8_combining_class_b4_tbl[uv][b3][b4]);
+}
+
+/*
+ * The do_decomp() function finds a matching decomposition, if any,
+ * and returns it. If there is no match, the input bytes are copied and
+ * returned. The function also checks whether this is a Hangul syllable,
+ * decomposes it if necessary, and returns.
+ *
+ * To save time, a single byte 7-bit ASCII character should be handled by
+ * the caller.
+ *
+ * The function returns the number of bytes returned, excluding the
+ * always-appended terminating null byte. It also returns a state that tells
+ * whether a Hangul character was decomposed, which is then used by the
+ * caller.
+ */
+static size_t
+do_decomp(size_t uv, uchar_t *u8s, uchar_t *s, int sz,
+ boolean_t canonical_decomposition, u8_normalization_states_t *state)
+{
+ uint16_t b1 = 0;
+ uint16_t b2 = 0;
+ uint16_t b3 = 0;
+ uint16_t b3_tbl;
+ uint16_t b3_base;
+ uint16_t b4 = 0;
+ size_t start_id;
+ size_t end_id;
+ size_t i;
+ uint32_t u1;
+
+ if (sz == 2) {
+ b3 = u8s[0] = s[0];
+ b4 = u8s[1] = s[1];
+ u8s[2] = '\0';
+ } else if (sz == 3) {
+ /* Convert it to a Unicode scalar value. */
+ U8_PUT_3BYTES_INTO_UTF32(u1, s[0], s[1], s[2]);
+
+ /*
+ * If this is a Hangul syllable, we decompose it into
+ * a leading consonant, a vowel, and an optional trailing
+ * consonant and then return.
+ */
+ if (U8_HANGUL_SYLLABLE(u1)) {
+ u1 -= U8_HANGUL_SYL_FIRST;
+
+ b1 = U8_HANGUL_JAMO_L_FIRST + u1 / U8_HANGUL_VT_COUNT;
+ b2 = U8_HANGUL_JAMO_V_FIRST + (u1 % U8_HANGUL_VT_COUNT)
+ / U8_HANGUL_T_COUNT;
+ b3 = u1 % U8_HANGUL_T_COUNT;
+
+ U8_SAVE_HANGUL_AS_UTF8(u8s, 0, 1, 2, b1);
+ U8_SAVE_HANGUL_AS_UTF8(u8s, 3, 4, 5, b2);
+ if (b3) {
+ b3 += U8_HANGUL_JAMO_T_FIRST;
+ U8_SAVE_HANGUL_AS_UTF8(u8s, 6, 7, 8, b3);
+
+ u8s[9] = '\0';
+ *state = U8_STATE_HANGUL_LVT;
+ return (9);
+ }
+
+ u8s[6] = '\0';
+ *state = U8_STATE_HANGUL_LV;
+ return (6);
+ }
+
+ b2 = u8s[0] = s[0];
+ b3 = u8s[1] = s[1];
+ b4 = u8s[2] = s[2];
+ u8s[3] = '\0';
+
+ /*
+ * If this is a Hangul Jamo, we know there is nothing
+ * further that we can decompose.
+ */
+ if (U8_HANGUL_JAMO_L(u1)) {
+ *state = U8_STATE_HANGUL_L;
+ return (3);
+ }
+
+ if (U8_HANGUL_JAMO_V(u1)) {
+ if (*state == U8_STATE_HANGUL_L)
+ *state = U8_STATE_HANGUL_LV;
+ else
+ *state = U8_STATE_HANGUL_V;
+ return (3);
+ }
+
+ if (U8_HANGUL_JAMO_T(u1)) {
+ if (*state == U8_STATE_HANGUL_LV)
+ *state = U8_STATE_HANGUL_LVT;
+ else
+ *state = U8_STATE_HANGUL_T;
+ return (3);
+ }
+ } else if (sz == 4) {
+ b1 = u8s[0] = s[0];
+ b2 = u8s[1] = s[1];
+ b3 = u8s[2] = s[2];
+ b4 = u8s[3] = s[3];
+ u8s[4] = '\0';
+ } else {
+ /*
+ * This is a fallback and should not happen if the function
+ * was called properly.
+ */
+ u8s[0] = s[0];
+ u8s[1] = '\0';
+ *state = U8_STATE_START;
+ return (1);
+ }
+
+ /*
+ * At this point, this routine does not know what it would get.
+ * The caller should sort it out if the state isn't a Hangul one.
+ */
+ *state = U8_STATE_START;
+
+ /* Try to find matching decomposition mapping byte sequence. */
+ b1 = u8_common_b1_tbl[uv][b1];
+ if (b1 == U8_TBL_ELEMENT_NOT_DEF)
+ return ((size_t)sz);
+
+ b2 = u8_decomp_b2_tbl[uv][b1][b2];
+ if (b2 == U8_TBL_ELEMENT_NOT_DEF)
+ return ((size_t)sz);
+
+ b3_tbl = u8_decomp_b3_tbl[uv][b2][b3].tbl_id;
+ if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF)
+ return ((size_t)sz);
+
+ /*
+ * If b3_tbl is bigger than or equal to U8_16BIT_TABLE_INDICATOR
+ * which is 0x8000, this means we couldn't fit the mappings into
+	 * the cardinality of an unsigned byte.
+ */
+ if (b3_tbl >= U8_16BIT_TABLE_INDICATOR) {
+ b3_tbl -= U8_16BIT_TABLE_INDICATOR;
+ start_id = u8_decomp_b4_16bit_tbl[uv][b3_tbl][b4];
+ end_id = u8_decomp_b4_16bit_tbl[uv][b3_tbl][b4 + 1];
+ } else {
+ start_id = u8_decomp_b4_tbl[uv][b3_tbl][b4];
+ end_id = u8_decomp_b4_tbl[uv][b3_tbl][b4 + 1];
+ }
+
+ /* This also means there wasn't any matching decomposition. */
+ if (start_id >= end_id)
+ return ((size_t)sz);
+
+ /*
+ * The final table for decomposition mappings has three types of
+ * byte sequences depending on whether a mapping is for compatibility
+ * decomposition, canonical decomposition, or both like the following:
+ *
+ * (1) Compatibility decomposition mappings:
+ *
+ * +---+---+-...-+---+
+ * | B0| B1| ... | Bm|
+ * +---+---+-...-+---+
+ *
+	 *	The first byte, B0, is always less than 0xF5 (U8_DECOMP_BOTH).
+ *
+ * (2) Canonical decomposition mappings:
+ *
+ * +---+---+---+-...-+---+
+ * | T | b0| b1| ... | bn|
+ * +---+---+---+-...-+---+
+ *
+ * where the first byte, T, is 0xF6 (U8_DECOMP_CANONICAL).
+ *
+ * (3) Both mappings:
+ *
+ * +---+---+---+---+-...-+---+---+---+-...-+---+
+ * | T | D | b0| b1| ... | bn| B0| B1| ... | Bm|
+ * +---+---+---+---+-...-+---+---+---+-...-+---+
+ *
+ * where T is 0xF5 (U8_DECOMP_BOTH) and D is a displacement
+ * byte, b0 to bn are canonical mapping bytes and B0 to Bm are
+ * compatibility mapping bytes.
+ *
+ * Note that compatibility decomposition means doing recursive
+ * decompositions using both compatibility decomposition mappings and
+ * canonical decomposition mappings. On the other hand, canonical
+ * decomposition means doing recursive decompositions using only
+ * canonical decomposition mappings. Since the table we have has gone
+ * through the recursions already, we do not need to do so during
+ * runtime, i.e., the table has been completely flattened out
+ * already.
+ */
+
+ b3_base = u8_decomp_b3_tbl[uv][b2][b3].base;
+
+ /* Get the type, T, of the byte sequence. */
+ b1 = u8_decomp_final_tbl[uv][b3_base + start_id];
+
+ /*
+ * If necessary, adjust start_id, end_id, or both. Note that if
+ * this is compatibility decomposition mapping, there is no
+ * adjustment.
+ */
+ if (canonical_decomposition) {
+ /* Is the mapping only for compatibility decomposition? */
+ if (b1 < U8_DECOMP_BOTH)
+ return ((size_t)sz);
+
+ start_id++;
+
+ if (b1 == U8_DECOMP_BOTH) {
+ end_id = start_id +
+ u8_decomp_final_tbl[uv][b3_base + start_id];
+ start_id++;
+ }
+ } else {
+ /*
+ * Unless this is a compatibility decomposition mapping,
+ * we adjust the start_id.
+ */
+ if (b1 == U8_DECOMP_BOTH) {
+ start_id++;
+ start_id += u8_decomp_final_tbl[uv][b3_base + start_id];
+ } else if (b1 == U8_DECOMP_CANONICAL) {
+ start_id++;
+ }
+ }
+
+ for (i = 0; start_id < end_id; start_id++)
+ u8s[i++] = u8_decomp_final_tbl[uv][b3_base + start_id];
+ u8s[i] = '\0';
+
+ return (i);
+}
+
+/*
+ * The find_composition_start() function uses the given character bytes to
+ * find the matching composition mappings, if any, and returns the address
+ * of the composition mappings as explained in do_composition().
+ */
+static uchar_t *
+find_composition_start(size_t uv, uchar_t *s, size_t sz)
+{
+ uint16_t b1 = 0;
+ uint16_t b2 = 0;
+ uint16_t b3 = 0;
+ uint16_t b3_tbl;
+ uint16_t b3_base;
+ uint16_t b4 = 0;
+ size_t start_id;
+ size_t end_id;
+
+ if (sz == 1) {
+ b4 = s[0];
+ } else if (sz == 2) {
+ b3 = s[0];
+ b4 = s[1];
+ } else if (sz == 3) {
+ b2 = s[0];
+ b3 = s[1];
+ b4 = s[2];
+ } else if (sz == 4) {
+ b1 = s[0];
+ b2 = s[1];
+ b3 = s[2];
+ b4 = s[3];
+ } else {
+ /*
+ * This is a fallback and should not happen if the function
+ * was called properly.
+ */
+ return (NULL);
+ }
+
+ b1 = u8_composition_b1_tbl[uv][b1];
+ if (b1 == U8_TBL_ELEMENT_NOT_DEF)
+ return (NULL);
+
+ b2 = u8_composition_b2_tbl[uv][b1][b2];
+ if (b2 == U8_TBL_ELEMENT_NOT_DEF)
+ return (NULL);
+
+ b3_tbl = u8_composition_b3_tbl[uv][b2][b3].tbl_id;
+ if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF)
+ return (NULL);
+
+ if (b3_tbl >= U8_16BIT_TABLE_INDICATOR) {
+ b3_tbl -= U8_16BIT_TABLE_INDICATOR;
+ start_id = u8_composition_b4_16bit_tbl[uv][b3_tbl][b4];
+ end_id = u8_composition_b4_16bit_tbl[uv][b3_tbl][b4 + 1];
+ } else {
+ start_id = u8_composition_b4_tbl[uv][b3_tbl][b4];
+ end_id = u8_composition_b4_tbl[uv][b3_tbl][b4 + 1];
+ }
+
+ if (start_id >= end_id)
+ return (NULL);
+
+ b3_base = u8_composition_b3_tbl[uv][b2][b3].base;
+
+ return ((uchar_t *)&(u8_composition_final_tbl[uv][b3_base + start_id]));
+}
+
+/*
+ * The blocked() function checks the combining class values of the previous
+ * characters in this sequence and returns whether it is blocked or not.
+ */
+static boolean_t
+blocked(uchar_t *comb_class, size_t last)
+{
+ uchar_t my_comb_class;
+ size_t i;
+
+ my_comb_class = comb_class[last];
+ for (i = 1; i < last; i++)
+ if (comb_class[i] >= my_comb_class ||
+ comb_class[i] == U8_COMBINING_CLASS_STARTER)
+ return (B_TRUE);
+
+ return (B_FALSE);
+}
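+
+/*
+ * For example, in a collected sequence <Starter, U+0301, U+0301> both marks
+ * have combining class 230, so blocked(comb_class, 2) returns B_TRUE: the
+ * second mark is blocked from the Starter by the first one and may not be
+ * composed with it.
+ */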
+
+/*
+ * The do_composition() function reads the character string pointed to by
+ * 's', does the necessary canonical composition, and then copies the result
+ * back to 's'.
+ *
+ * The input argument 's' cannot contain more than 32 characters.
+ */
+static size_t
+do_composition(size_t uv, uchar_t *s, uchar_t *comb_class, uchar_t *start,
+ uchar_t *disp, size_t last, uchar_t **os, uchar_t *oslast)
+{
+ uchar_t t[U8_STREAM_SAFE_TEXT_MAX + 1];
+ uchar_t tc[U8_MB_CUR_MAX] = { '\0' };
+ uint8_t saved_marks[U8_MAX_CHARS_A_SEQ];
+ size_t saved_marks_count;
+ uchar_t *p;
+ uchar_t *saved_p;
+ uchar_t *q;
+ size_t i;
+ size_t saved_i;
+ size_t j;
+ size_t k;
+ size_t l;
+ size_t C;
+ size_t saved_l;
+ size_t size;
+ uint32_t u1;
+ uint32_t u2;
+ boolean_t match_not_found = B_TRUE;
+
+ /*
+ * This should never happen unless the callers are doing some strange
+ * and unexpected things.
+ *
+	 * The "last" is the index of the last character, not last + 1.
+ */
+ if (last >= U8_MAX_CHARS_A_SEQ)
+ last = U8_UPPER_LIMIT_IN_A_SEQ;
+
+ for (i = l = 0; i <= last; i++) {
+ /*
+		 * For the last character, or for any non-Starters at the
+		 * beginning, there is no chance to do composition, so we just
+		 * copy them to the temporary buffer.
+ */
+ if (i >= last || comb_class[i] != U8_COMBINING_CLASS_STARTER) {
+SAVE_THE_CHAR:
+ p = s + start[i];
+ size = disp[i];
+ for (k = 0; k < size; k++)
+ t[l++] = *p++;
+ continue;
+ }
+
+ /*
+ * If this could be a start of Hangul Jamos, then, we try to
+ * conjoin them.
+ */
+ if (s[start[i]] == U8_HANGUL_JAMO_1ST_BYTE) {
+ U8_PUT_3BYTES_INTO_UTF32(u1, s[start[i]],
+ s[start[i] + 1], s[start[i] + 2]);
+ U8_PUT_3BYTES_INTO_UTF32(u2, s[start[i] + 3],
+ s[start[i] + 4], s[start[i] + 5]);
+
+ if (U8_HANGUL_JAMO_L(u1) && U8_HANGUL_JAMO_V(u2)) {
+ u1 -= U8_HANGUL_JAMO_L_FIRST;
+ u2 -= U8_HANGUL_JAMO_V_FIRST;
+ u1 = U8_HANGUL_SYL_FIRST +
+ (u1 * U8_HANGUL_V_COUNT + u2) *
+ U8_HANGUL_T_COUNT;
+
+ i += 2;
+ if (i <= last) {
+ U8_PUT_3BYTES_INTO_UTF32(u2,
+ s[start[i]], s[start[i] + 1],
+ s[start[i] + 2]);
+
+ if (U8_HANGUL_JAMO_T(u2)) {
+ u1 += u2 -
+ U8_HANGUL_JAMO_T_FIRST;
+ i++;
+ }
+ }
+
+ U8_SAVE_HANGUL_AS_UTF8(t + l, 0, 1, 2, u1);
+ i--;
+ l += 3;
+ continue;
+ }
+ }
+
+ /*
+ * Let's then find out if this Starter has composition
+ * mapping.
+ */
+ p = find_composition_start(uv, s + start[i], disp[i]);
+ if (p == NULL)
+ goto SAVE_THE_CHAR;
+
+ /*
+ * We have a Starter with composition mapping and the next
+ * character is a non-Starter. Let's try to find out if
+ * we can do composition.
+ */
+
+ saved_p = p;
+ saved_i = i;
+ saved_l = l;
+ saved_marks_count = 0;
+
+TRY_THE_NEXT_MARK:
+ q = s + start[++i];
+ size = disp[i];
+
+ /*
+ * The next for() loop compares the non-Starter pointed by
+ * 'q' with the possible (joinable) characters pointed by 'p'.
+ *
+ * The composition final table entry pointed by the 'p'
+ * looks like the following:
+ *
+ * +---+---+---+-...-+---+---+---+---+-...-+---+---+
+		 * | C | b0| b1| ... | bn| F | B0| B1| ... | Bm| F |
+ * +---+---+---+-...-+---+---+---+---+-...-+---+---+
+ *
+ * where C is the count byte indicating the number of
+		 * mapping pairs where each pair would look like
+ * (b0-bn F, B0-Bm F). The b0-bn are the bytes of the second
+ * character of a canonical decomposition and the B0-Bm are
+ * the bytes of a matching composite character. The F is
+ * a filler byte after each character as the separator.
+ */
+
+ match_not_found = B_TRUE;
+
+ for (C = *p++; C > 0; C--) {
+ for (k = 0; k < size; p++, k++)
+ if (*p != q[k])
+ break;
+
+ /* Have we found it? */
+ if (k >= size && *p == U8_TBL_ELEMENT_FILLER) {
+ match_not_found = B_FALSE;
+
+ l = saved_l;
+
+ while (*++p != U8_TBL_ELEMENT_FILLER)
+ t[l++] = *p;
+
+ break;
+ }
+
+ /* We didn't find; skip to the next pair. */
+ if (*p != U8_TBL_ELEMENT_FILLER)
+ while (*++p != U8_TBL_ELEMENT_FILLER)
+ ;
+ while (*++p != U8_TBL_ELEMENT_FILLER)
+ ;
+ p++;
+ }
+
+ /*
+ * If there was no match, we will need to save the combining
+ * mark for later appending. After that, if the next one
+ * is a non-Starter and not blocked, then, we try once
+ * again to do composition with the next non-Starter.
+ *
+ * If there was no match and this was a Starter, then,
+ * this is a new start.
+ *
+ * If there was a match and a composition done and we have
+ * more to check on, then, we retrieve a new composition final
+ * table entry for the composite and then try to do the
+ * composition again.
+ */
+
+ if (match_not_found) {
+ if (comb_class[i] == U8_COMBINING_CLASS_STARTER) {
+ i--;
+ goto SAVE_THE_CHAR;
+ }
+
+ saved_marks[saved_marks_count++] = i;
+ }
+
+ if (saved_l == l) {
+ while (i < last) {
+ if (blocked(comb_class, i + 1))
+ saved_marks[saved_marks_count++] = ++i;
+ else
+ break;
+ }
+ if (i < last) {
+ p = saved_p;
+ goto TRY_THE_NEXT_MARK;
+ }
+ } else if (i < last) {
+ p = find_composition_start(uv, t + saved_l,
+ l - saved_l);
+ if (p != NULL) {
+ saved_p = p;
+ goto TRY_THE_NEXT_MARK;
+ }
+ }
+
+ /*
+ * There is no more composition possible.
+ *
+		 * If there was no composition whatsoever, then we copy
+		 * over the original Starter and then sequentially append any
+		 * remaining non-Starters to the target string after that.
+ */
+
+ if (saved_l == l) {
+ p = s + start[saved_i];
+ size = disp[saved_i];
+ for (j = 0; j < size; j++)
+ t[l++] = *p++;
+ }
+
+ for (k = 0; k < saved_marks_count; k++) {
+ p = s + start[saved_marks[k]];
+ size = disp[saved_marks[k]];
+ for (j = 0; j < size; j++)
+ t[l++] = *p++;
+ }
+ }
+
+ /*
+ * If the last character is a Starter and if we have a character
+ * (possibly another Starter) that can be turned into a composite,
+	 * we do so, and we keep doing so until no more composition is
+	 * possible.
+ */
+ if (comb_class[last] == U8_COMBINING_CLASS_STARTER) {
+ p = *os;
+ saved_l = l - disp[last];
+
+ while (p < oslast) {
+ size = u8_number_of_bytes[*p];
+ if (size <= 1 || (p + size) > oslast)
+ break;
+
+ saved_p = p;
+
+ for (i = 0; i < size; i++)
+ tc[i] = *p++;
+
+ q = find_composition_start(uv, t + saved_l,
+ l - saved_l);
+ if (q == NULL) {
+ p = saved_p;
+ break;
+ }
+
+ match_not_found = B_TRUE;
+
+ for (C = *q++; C > 0; C--) {
+ for (k = 0; k < size; q++, k++)
+ if (*q != tc[k])
+ break;
+
+ if (k >= size && *q == U8_TBL_ELEMENT_FILLER) {
+ match_not_found = B_FALSE;
+
+ l = saved_l;
+
+ while (*++q != U8_TBL_ELEMENT_FILLER) {
+ /*
+ * This is practically
+ * impossible but we don't
+ * want to take any chances.
+ */
+ if (l >=
+ U8_STREAM_SAFE_TEXT_MAX) {
+ p = saved_p;
+ goto SAFE_RETURN;
+ }
+ t[l++] = *q;
+ }
+
+ break;
+ }
+
+ if (*q != U8_TBL_ELEMENT_FILLER)
+ while (*++q != U8_TBL_ELEMENT_FILLER)
+ ;
+ while (*++q != U8_TBL_ELEMENT_FILLER)
+ ;
+ q++;
+ }
+
+ if (match_not_found) {
+ p = saved_p;
+ break;
+ }
+ }
+SAFE_RETURN:
+ *os = p;
+ }
+
+ /*
+ * Now we copy over the temporary string to the target string.
+	 * Since composition always reduces the number of characters or
+	 * keeps it the same, we don't need to worry about
+	 * buffer overflow here.
+ */
+ for (i = 0; i < l; i++)
+ s[i] = t[i];
+ s[l] = '\0';
+
+ return (l);
+}
+
+/*
+ * The collect_a_seq() function checks the given string s, collects
+ * a sequence of characters into u8s, and returns the sequence. While it
+ * collects a sequence, it also applies case conversion, canonical or
+ * compatibility decomposition, and canonical composition (some or all of
+ * them, in that order).
+ *
+ * The collected sequence cannot be bigger than 32 characters since, if
+ * it has more than 31 characters, the sequence will be terminated
+ * with a U+034F COMBINING GRAPHEME JOINER (CGJ) character and turned into
+ * Stream-Safe Text. The collected sequence is always terminated with
+ * a null byte, and the return value is the byte length of the sequence,
+ * which may be 0. The return value does not include the terminating
+ * null byte.
+ */
+static size_t
+collect_a_seq(size_t uv, uchar_t *u8s, uchar_t **source, uchar_t *slast,
+ boolean_t is_it_toupper, boolean_t is_it_tolower,
+ boolean_t canonical_decomposition, boolean_t compatibility_decomposition,
+ boolean_t canonical_composition,
+ int *errnum, u8_normalization_states_t *state)
+{
+ uchar_t *s;
+ int sz;
+ int saved_sz;
+ size_t i;
+ size_t j;
+ size_t k;
+ size_t l;
+ uchar_t comb_class[U8_MAX_CHARS_A_SEQ];
+ uchar_t disp[U8_MAX_CHARS_A_SEQ];
+ uchar_t start[U8_MAX_CHARS_A_SEQ];
+ uchar_t u8t[U8_MB_CUR_MAX] = { '\0' };
+ uchar_t uts[U8_STREAM_SAFE_TEXT_MAX + 1];
+ uchar_t tc;
+ size_t last;
+ size_t saved_last;
+ uint32_t u1;
+
+ /*
+	 * Save the source string pointer; we will return a changed
+	 * pointer through it if we do any processing.
+ */
+ s = *source;
+
+ /*
+	 * The following is a fallback just in case callers are not
+	 * checking the string boundaries before calling.
+ */
+ if (s >= slast) {
+ u8s[0] = '\0';
+
+ return (0);
+ }
+
+ /*
+ * As the first thing, let's collect a character and do case
+ * conversion if necessary.
+ */
+
+ sz = u8_number_of_bytes[*s];
+
+ if (sz < 0) {
+ *errnum = EILSEQ;
+
+ u8s[0] = *s++;
+ u8s[1] = '\0';
+
+ *source = s;
+
+ return (1);
+ }
+
+ if (sz == 1) {
+ if (is_it_toupper)
+ u8s[0] = U8_ASCII_TOUPPER(*s);
+ else if (is_it_tolower)
+ u8s[0] = U8_ASCII_TOLOWER(*s);
+ else
+ u8s[0] = *s;
+ s++;
+ u8s[1] = '\0';
+ } else if ((s + sz) > slast) {
+ *errnum = EINVAL;
+
+ for (i = 0; s < slast; )
+ u8s[i++] = *s++;
+ u8s[i] = '\0';
+
+ *source = s;
+
+ return (i);
+ } else {
+ if (is_it_toupper || is_it_tolower) {
+ i = do_case_conv(uv, u8s, s, sz, is_it_toupper);
+ s += sz;
+ sz = i;
+ } else {
+ for (i = 0; i < sz; )
+ u8s[i++] = *s++;
+ u8s[i] = '\0';
+ }
+ }
+
+ /*
+ * And then canonical/compatibility decomposition followed by
+	 * an optional canonical composition. Please note that
+ * canonical composition is done only when a decomposition is
+ * done.
+ */
+ if (canonical_decomposition || compatibility_decomposition) {
+ if (sz == 1) {
+ *state = U8_STATE_START;
+
+ saved_sz = 1;
+
+ comb_class[0] = 0;
+ start[0] = 0;
+ disp[0] = 1;
+
+ last = 1;
+ } else {
+ saved_sz = do_decomp(uv, u8s, u8s, sz,
+ canonical_decomposition, state);
+
+ last = 0;
+
+ for (i = 0; i < saved_sz; ) {
+ sz = u8_number_of_bytes[u8s[i]];
+
+ comb_class[last] = combining_class(uv,
+ u8s + i, sz);
+ start[last] = i;
+ disp[last] = sz;
+
+ last++;
+ i += sz;
+ }
+
+ /*
+			 * Decomposition yields various Hangul-related
+			 * states but not for combining marks. We need to
+			 * find that out here by checking the last
+			 * character.
+ */
+ if (*state == U8_STATE_START) {
+ if (comb_class[last - 1])
+ *state = U8_STATE_COMBINING_MARK;
+ }
+ }
+
+ saved_last = last;
+
+ while (s < slast) {
+ sz = u8_number_of_bytes[*s];
+
+ /*
+ * If this is an illegal character, an incomplete
+			 * character, or a 7-bit ASCII Starter character,
+			 * then we have collected a sequence; break and let
+			 * the next call deal with those cases.
+ *
+ * Note that this is okay only if you are using this
+ * function with a fixed length string, not on
+ * a buffer with multiple calls of one chunk at a time.
+ */
+ if (sz <= 1) {
+ break;
+ } else if ((s + sz) > slast) {
+ break;
+ } else {
+ /*
+ * If the previous character was a Hangul Jamo
+ * and this character is a Hangul Jamo that
+ * can be conjoined, we collect the Jamo.
+ */
+ if (*s == U8_HANGUL_JAMO_1ST_BYTE) {
+ U8_PUT_3BYTES_INTO_UTF32(u1,
+ *s, *(s + 1), *(s + 2));
+
+ if (U8_HANGUL_COMPOSABLE_L_V(*state,
+ u1)) {
+ i = 0;
+ *state = U8_STATE_HANGUL_LV;
+ goto COLLECT_A_HANGUL;
+ }
+
+ if (U8_HANGUL_COMPOSABLE_LV_T(*state,
+ u1)) {
+ i = 0;
+ *state = U8_STATE_HANGUL_LVT;
+ goto COLLECT_A_HANGUL;
+ }
+ }
+
+ /*
+ * Regardless of whatever it was, if this is
+ * a Starter, we don't collect the character
+ * since that's a new start and we will deal
+ * with it at the next time.
+ */
+ i = combining_class(uv, s, sz);
+ if (i == U8_COMBINING_CLASS_STARTER)
+ break;
+
+ /*
+ * We know the current character is a combining
+ * mark. If the previous character wasn't
+ * a Starter (not Hangul) or a combining mark,
+ * then, we don't collect this combining mark.
+ */
+ if (*state != U8_STATE_START &&
+ *state != U8_STATE_COMBINING_MARK)
+ break;
+
+ *state = U8_STATE_COMBINING_MARK;
+COLLECT_A_HANGUL:
+ /*
+ * If we collected a Starter and combining
+ * marks up to 30, i.e., total 31 characters,
+ * then, we terminate this degenerately long
+ * combining sequence with a U+034F COMBINING
+ * GRAPHEME JOINER (CGJ) which is 0xCD 0x8F in
+ * UTF-8 and turn this into a Stream-Safe
+ * Text. This will be extremely rare but
+ * possible.
+ *
+ * The following will also guarantee that
+ * we are not writing more than 32 characters
+ * plus a NULL at u8s[].
+ */
+ if (last >= U8_UPPER_LIMIT_IN_A_SEQ) {
+TURN_STREAM_SAFE:
+ *state = U8_STATE_START;
+ comb_class[last] = 0;
+ start[last] = saved_sz;
+ disp[last] = 2;
+ last++;
+
+ u8s[saved_sz++] = 0xCD;
+ u8s[saved_sz++] = 0x8F;
+
+ break;
+ }
+
+ /*
+ * Some combining marks also do decompose into
+ * another combining mark or marks.
+ */
+ if (*state == U8_STATE_COMBINING_MARK) {
+ k = last;
+ l = sz;
+ i = do_decomp(uv, uts, s, sz,
+ canonical_decomposition, state);
+ for (j = 0; j < i; ) {
+ sz = u8_number_of_bytes[uts[j]];
+
+ comb_class[last] =
+ combining_class(uv,
+ uts + j, sz);
+ start[last] = saved_sz + j;
+ disp[last] = sz;
+
+ last++;
+ if (last >=
+ U8_UPPER_LIMIT_IN_A_SEQ) {
+ last = k;
+ goto TURN_STREAM_SAFE;
+ }
+ j += sz;
+ }
+
+ *state = U8_STATE_COMBINING_MARK;
+ sz = i;
+ s += l;
+
+ for (i = 0; i < sz; i++)
+ u8s[saved_sz++] = uts[i];
+ } else {
+ comb_class[last] = i;
+ start[last] = saved_sz;
+ disp[last] = sz;
+ last++;
+
+ for (i = 0; i < sz; i++)
+ u8s[saved_sz++] = *s++;
+ }
+
+ /*
+ * If this is U+0345 COMBINING GREEK
+ * YPOGEGRAMMENI (0xCD 0x85 in UTF-8), a.k.a.,
+				 * iota subscript, and it needs to be converted
+				 * to uppercase, we convert it to U+0399 GREEK
+ * CAPITAL LETTER IOTA (0xCE 0x99 in UTF-8),
+ * i.e., convert to capital adscript form as
+ * specified in the Unicode standard.
+ *
+ * This is the only special case of (ambiguous)
+ * case conversion at combining marks and
+ * probably the standard will never have
+				 * anything similar to this in the future.
+ */
+ if (is_it_toupper && sz >= 2 &&
+ u8s[saved_sz - 2] == 0xCD &&
+ u8s[saved_sz - 1] == 0x85) {
+ u8s[saved_sz - 2] = 0xCE;
+ u8s[saved_sz - 1] = 0x99;
+ }
+ }
+ }
+
+ /*
+ * Let's try to ensure a canonical ordering for the collected
+ * combining marks. We do this only if we have collected
+ * at least one more non-Starter. (The decomposition mapping
+ * data tables have fully (and recursively) expanded and
+ * canonically ordered decompositions.)
+ *
+ * The U8_SWAP_COMB_MARKS() convenience macro has some
+ * assumptions and we are meeting the assumptions.
+ */
+ last--;
+ if (last >= saved_last) {
+ for (i = 0; i < last; i++)
+ for (j = last; j > i; j--)
+ if (comb_class[j] &&
+ comb_class[j - 1] > comb_class[j]) {
+ U8_SWAP_COMB_MARKS(j - 1, j);
+ }
+ }
+
+ *source = s;
+
+ if (! canonical_composition) {
+ u8s[saved_sz] = '\0';
+ return (saved_sz);
+ }
+
+ /*
+ * Now do the canonical composition. Note that we do this
+ * only after a canonical or compatibility decomposition to
+ * finish up NFC or NFKC.
+ */
+ sz = do_composition(uv, u8s, comb_class, start, disp, last,
+ &s, slast);
+ }
+
+ *source = s;
+
+ return ((size_t)sz);
+}
+
+/*
+ * The do_norm_compare() function does string comparison based on Unicode
+ * simple case mappings and Unicode Normalization definitions.
+ *
+ * It does so by collecting one sequence of characters at a time and
+ * comparing the collected sequences from the two strings.
+ *
+ * The meanings of the return values are the same as for the usual strcmp().
+ */
+static int
+do_norm_compare(size_t uv, uchar_t *s1, uchar_t *s2, size_t n1, size_t n2,
+ int flag, int *errnum)
+{
+ int result;
+ size_t sz1;
+ size_t sz2;
+ uchar_t u8s1[U8_STREAM_SAFE_TEXT_MAX + 1];
+ uchar_t u8s2[U8_STREAM_SAFE_TEXT_MAX + 1];
+ uchar_t *s1last;
+ uchar_t *s2last;
+ boolean_t is_it_toupper;
+ boolean_t is_it_tolower;
+ boolean_t canonical_decomposition;
+ boolean_t compatibility_decomposition;
+ boolean_t canonical_composition;
+ u8_normalization_states_t state;
+
+ s1last = s1 + n1;
+ s2last = s2 + n2;
+
+ is_it_toupper = flag & U8_TEXTPREP_TOUPPER;
+ is_it_tolower = flag & U8_TEXTPREP_TOLOWER;
+ canonical_decomposition = flag & U8_CANON_DECOMP;
+ compatibility_decomposition = flag & U8_COMPAT_DECOMP;
+ canonical_composition = flag & U8_CANON_COMP;
+
+ while (s1 < s1last && s2 < s2last) {
+ /*
+		 * If the current character is 7-bit ASCII and is the last
+		 * character, or if the current character and the next
+		 * character are both 7-bit ASCII characters, then
+		 * we treat the current character as a sequence of its own.
+		 *
+		 * In any other case, we need to call collect_a_seq().
+ */
+
+ if (U8_ISASCII(*s1) && ((s1 + 1) >= s1last ||
+ ((s1 + 1) < s1last && U8_ISASCII(*(s1 + 1))))) {
+ if (is_it_toupper)
+ u8s1[0] = U8_ASCII_TOUPPER(*s1);
+ else if (is_it_tolower)
+ u8s1[0] = U8_ASCII_TOLOWER(*s1);
+ else
+ u8s1[0] = *s1;
+ u8s1[1] = '\0';
+ sz1 = 1;
+ s1++;
+ } else {
+ state = U8_STATE_START;
+ sz1 = collect_a_seq(uv, u8s1, &s1, s1last,
+ is_it_toupper, is_it_tolower,
+ canonical_decomposition,
+ compatibility_decomposition,
+ canonical_composition, errnum, &state);
+ }
+
+ if (U8_ISASCII(*s2) && ((s2 + 1) >= s2last ||
+ ((s2 + 1) < s2last && U8_ISASCII(*(s2 + 1))))) {
+ if (is_it_toupper)
+ u8s2[0] = U8_ASCII_TOUPPER(*s2);
+ else if (is_it_tolower)
+ u8s2[0] = U8_ASCII_TOLOWER(*s2);
+ else
+ u8s2[0] = *s2;
+ u8s2[1] = '\0';
+ sz2 = 1;
+ s2++;
+ } else {
+ state = U8_STATE_START;
+ sz2 = collect_a_seq(uv, u8s2, &s2, s2last,
+ is_it_toupper, is_it_tolower,
+ canonical_decomposition,
+ compatibility_decomposition,
+ canonical_composition, errnum, &state);
+ }
+
+ /*
+ * Now compare the two characters. If they are the same,
+ * we move on to the next character sequences.
+ */
+ if (sz1 == 1 && sz2 == 1) {
+ if (*u8s1 > *u8s2)
+ return (1);
+ if (*u8s1 < *u8s2)
+ return (-1);
+ } else {
+ result = strcmp((const char *)u8s1, (const char *)u8s2);
+ if (result != 0)
+ return (result);
+ }
+ }
+
+ /*
+ * We compared until the end of either or both strings.
+ *
+	 * If we reached or went past the end of both, that means
+	 * they are the same.
+	 *
+	 * If we reached only one end, that means the other string still has
+	 * something left, which can be used to determine the return value.
+ */
+ if (s1 >= s1last) {
+ if (s2 >= s2last)
+ return (0);
+ return (-1);
+ }
+ return (1);
+}
+
+/*
+ * The u8_strcmp() function compares two UTF-8 strings quite similarly to
+ * strcmp(). For the comparison, however, equivalency based on Unicode
+ * Normalization and on Unicode simple case conversion mappings
+ * can be requested and checked against.
+ */
+int
+u8_strcmp(const char *s1, const char *s2, size_t n, int flag, size_t uv,
+ int *errnum)
+{
+ int f;
+ size_t n1;
+ size_t n2;
+
+ *errnum = 0;
+
+ /*
+ * Check on the requested Unicode version, case conversion, and
+ * normalization flag values.
+ */
+
+ if (uv > U8_UNICODE_LATEST) {
+ *errnum = ERANGE;
+ uv = U8_UNICODE_LATEST;
+ }
+
+ if (flag == 0) {
+ flag = U8_STRCMP_CS;
+ } else {
+ f = flag & (U8_STRCMP_CS | U8_STRCMP_CI_UPPER |
+ U8_STRCMP_CI_LOWER);
+ if (f == 0) {
+ flag |= U8_STRCMP_CS;
+ } else if (f != U8_STRCMP_CS && f != U8_STRCMP_CI_UPPER &&
+ f != U8_STRCMP_CI_LOWER) {
+ *errnum = EBADF;
+ flag = U8_STRCMP_CS;
+ }
+
+ f = flag & (U8_CANON_DECOMP | U8_COMPAT_DECOMP | U8_CANON_COMP);
+ if (f && f != U8_STRCMP_NFD && f != U8_STRCMP_NFC &&
+ f != U8_STRCMP_NFKD && f != U8_STRCMP_NFKC) {
+ *errnum = EBADF;
+ flag = U8_STRCMP_CS;
+ }
+ }
+
+ if (flag == U8_STRCMP_CS) {
+ return (n == 0 ? strcmp(s1, s2) : strncmp(s1, s2, n));
+ }
+
+ n1 = strlen(s1);
+ n2 = strlen(s2);
+ if (n != 0) {
+ if (n < n1)
+ n1 = n;
+ if (n < n2)
+ n2 = n;
+ }
+
+ /*
+ * Simple case conversion can be done much faster and so we do
+ * them separately here.
+ */
+ if (flag == U8_STRCMP_CI_UPPER) {
+ return (do_case_compare(uv, (uchar_t *)s1, (uchar_t *)s2,
+ n1, n2, B_TRUE, errnum));
+ } else if (flag == U8_STRCMP_CI_LOWER) {
+ return (do_case_compare(uv, (uchar_t *)s1, (uchar_t *)s2,
+ n1, n2, B_FALSE, errnum));
+ }
+
+ return (do_norm_compare(uv, (uchar_t *)s1, (uchar_t *)s2, n1, n2,
+ flag, errnum));
+}
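+
+/*
+ * A minimal usage sketch (string contents are illustrative): comparing the
+ * precomposed "\xC3\xA9" (U+00E9) with the decomposed "e\xCC\x81"
+ * (U+0065 U+0301) as in
+ *
+ *	int err;
+ *	int r = u8_strcmp("\xC3\xA9", "e\xCC\x81", 0, U8_STRCMP_NFC,
+ *	    U8_UNICODE_LATEST, &err);
+ *
+ * yields r == 0 since the two strings are canonically equivalent, whereas
+ * the plain U8_STRCMP_CS flag would report them as different.
+ */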
+
+size_t
+u8_textprep_str(char *inarray, size_t *inlen, char *outarray, size_t *outlen,
+ int flag, size_t unicode_version, int *errnum)
+{
+ int f;
+ int sz;
+ uchar_t *ib;
+ uchar_t *ibtail;
+ uchar_t *ob;
+ uchar_t *obtail;
+ boolean_t do_not_ignore_null;
+ boolean_t do_not_ignore_invalid;
+ boolean_t is_it_toupper;
+ boolean_t is_it_tolower;
+ boolean_t canonical_decomposition;
+ boolean_t compatibility_decomposition;
+ boolean_t canonical_composition;
+ size_t ret_val;
+ size_t i;
+ size_t j;
+ uchar_t u8s[U8_STREAM_SAFE_TEXT_MAX + 1];
+ u8_normalization_states_t state;
+
+ if (unicode_version > U8_UNICODE_LATEST) {
+ *errnum = ERANGE;
+ return ((size_t)-1);
+ }
+
+ f = flag & (U8_TEXTPREP_TOUPPER | U8_TEXTPREP_TOLOWER);
+ if (f == (U8_TEXTPREP_TOUPPER | U8_TEXTPREP_TOLOWER)) {
+ *errnum = EBADF;
+ return ((size_t)-1);
+ }
+
+ f = flag & (U8_CANON_DECOMP | U8_COMPAT_DECOMP | U8_CANON_COMP);
+ if (f && f != U8_TEXTPREP_NFD && f != U8_TEXTPREP_NFC &&
+ f != U8_TEXTPREP_NFKD && f != U8_TEXTPREP_NFKC) {
+ *errnum = EBADF;
+ return ((size_t)-1);
+ }
+
+ if (inarray == NULL || *inlen == 0)
+ return (0);
+
+ if (outarray == NULL) {
+ *errnum = E2BIG;
+ return ((size_t)-1);
+ }
+
+ ib = (uchar_t *)inarray;
+ ob = (uchar_t *)outarray;
+ ibtail = ib + *inlen;
+ obtail = ob + *outlen;
+
+ do_not_ignore_null = !(flag & U8_TEXTPREP_IGNORE_NULL);
+ do_not_ignore_invalid = !(flag & U8_TEXTPREP_IGNORE_INVALID);
+ is_it_toupper = flag & U8_TEXTPREP_TOUPPER;
+ is_it_tolower = flag & U8_TEXTPREP_TOLOWER;
+
+ ret_val = 0;
+
+ /*
+	 * If we don't have a normalization flag set, we do the simple
+	 * case-conversion-based text preparation separately below. Text
+	 * preparation involving Normalization will be done in the else
+	 * branch, again separately, since it will take much more time and
+	 * resources than doing simple case conversions.
+ */
+ if (f == 0) {
+ while (ib < ibtail) {
+ if (*ib == '\0' && do_not_ignore_null)
+ break;
+
+ sz = u8_number_of_bytes[*ib];
+
+ if (sz < 0) {
+ if (do_not_ignore_invalid) {
+ *errnum = EILSEQ;
+ ret_val = (size_t)-1;
+ break;
+ }
+
+ sz = 1;
+ ret_val++;
+ }
+
+ if (sz == 1) {
+ if (ob >= obtail) {
+ *errnum = E2BIG;
+ ret_val = (size_t)-1;
+ break;
+ }
+
+ if (is_it_toupper)
+ *ob = U8_ASCII_TOUPPER(*ib);
+ else if (is_it_tolower)
+ *ob = U8_ASCII_TOLOWER(*ib);
+ else
+ *ob = *ib;
+ ib++;
+ ob++;
+ } else if ((ib + sz) > ibtail) {
+ if (do_not_ignore_invalid) {
+ *errnum = EINVAL;
+ ret_val = (size_t)-1;
+ break;
+ }
+
+ if ((obtail - ob) < (ibtail - ib)) {
+ *errnum = E2BIG;
+ ret_val = (size_t)-1;
+ break;
+ }
+
+ /*
+ * We treat the remaining incomplete character
+ * bytes as a character.
+ */
+ ret_val++;
+
+ while (ib < ibtail)
+ *ob++ = *ib++;
+ } else {
+ if (is_it_toupper || is_it_tolower) {
+ i = do_case_conv(unicode_version, u8s,
+ ib, sz, is_it_toupper);
+
+ if ((obtail - ob) < i) {
+ *errnum = E2BIG;
+ ret_val = (size_t)-1;
+ break;
+ }
+
+ ib += sz;
+
+ for (sz = 0; sz < i; sz++)
+ *ob++ = u8s[sz];
+ } else {
+ if ((obtail - ob) < sz) {
+ *errnum = E2BIG;
+ ret_val = (size_t)-1;
+ break;
+ }
+
+ for (i = 0; i < sz; i++)
+ *ob++ = *ib++;
+ }
+ }
+ }
+ } else {
+ canonical_decomposition = flag & U8_CANON_DECOMP;
+ compatibility_decomposition = flag & U8_COMPAT_DECOMP;
+ canonical_composition = flag & U8_CANON_COMP;
+
+ while (ib < ibtail) {
+ if (*ib == '\0' && do_not_ignore_null)
+ break;
+
+ /*
+ * If the current character is a 7-bit ASCII
+ * character and it is the last character, or,
+ * if the current character is a 7-bit ASCII
+ * character and the next character is also a 7-bit
+			 * ASCII character, then we copy over this
+			 * character without going through collect_a_seq().
+			 *
+			 * In any other case, we need to look further with
+ * the collect_a_seq() function.
+ */
+ if (U8_ISASCII(*ib) && ((ib + 1) >= ibtail ||
+ ((ib + 1) < ibtail && U8_ISASCII(*(ib + 1))))) {
+ if (ob >= obtail) {
+ *errnum = E2BIG;
+ ret_val = (size_t)-1;
+ break;
+ }
+
+ if (is_it_toupper)
+ *ob = U8_ASCII_TOUPPER(*ib);
+ else if (is_it_tolower)
+ *ob = U8_ASCII_TOLOWER(*ib);
+ else
+ *ob = *ib;
+ ib++;
+ ob++;
+ } else {
+ *errnum = 0;
+ state = U8_STATE_START;
+
+ j = collect_a_seq(unicode_version, u8s,
+ &ib, ibtail,
+ is_it_toupper,
+ is_it_tolower,
+ canonical_decomposition,
+ compatibility_decomposition,
+ canonical_composition,
+ errnum, &state);
+
+ if (*errnum && do_not_ignore_invalid) {
+ ret_val = (size_t)-1;
+ break;
+ }
+
+ if ((obtail - ob) < j) {
+ *errnum = E2BIG;
+ ret_val = (size_t)-1;
+ break;
+ }
+
+ for (i = 0; i < j; i++)
+ *ob++ = u8s[i];
+ }
+ }
+ }
+
+ *inlen = ibtail - ib;
+ *outlen = obtail - ob;
+
+ return (ret_val);
+}
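+
+/*
+ * A minimal usage sketch (buffer names and sizes are illustrative):
+ * normalizing the decomposed "e\xCC\x81" (U+0065 U+0301) to NFC with
+ *
+ *	char in[] = "e\xCC\x81", out[16];
+ *	size_t il = 3, ol = sizeof (out);
+ *	int err;
+ *	(void) u8_textprep_str(in, &il, out, &ol, U8_TEXTPREP_NFC,
+ *	    U8_UNICODE_LATEST, &err);
+ *
+ * leaves the 2-byte composed form 0xC3 0xA9 (U+00E9) in out[], with
+ * *inlen and *outlen updated to the unconsumed input and unused output
+ * byte counts, respectively.
+ */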
+
+#if defined(_KERNEL)
+static int __init
+unicode_init(void)
+{
+ return (0);
+}
+
+static void __exit
+unicode_fini(void)
+{
+}
+
+module_init(unicode_init);
+module_exit(unicode_fini);
+#endif
+
+ZFS_MODULE_DESCRIPTION("Unicode implementation");
+ZFS_MODULE_AUTHOR(ZFS_META_AUTHOR);
+ZFS_MODULE_LICENSE(ZFS_META_LICENSE);
+ZFS_MODULE_VERSION(ZFS_META_VERSION "-" ZFS_META_RELEASE);
+
+EXPORT_SYMBOL(u8_validate);
+EXPORT_SYMBOL(u8_strcmp);
+EXPORT_SYMBOL(u8_textprep_str);
diff --git a/sys/contrib/openzfs/module/unicode/uconv.c b/sys/contrib/openzfs/module/unicode/uconv.c
new file mode 100644
index 000000000000..fe84979d08b2
--- /dev/null
+++ b/sys/contrib/openzfs/module/unicode/uconv.c
@@ -0,0 +1,863 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+
+
+/*
+ * Unicode encoding conversion functions among UTF-8, UTF-16, and UTF-32.
+ * (PSARC/2005/446, PSARC/2007/038, PSARC/2007/517)
+ * Man pages: uconv_u16tou32(9F), uconv_u16tou8(9F), uconv_u32tou16(9F),
+ * uconv_u32tou8(9F), uconv_u8tou16(9F), and uconv_u8tou32(9F). See also
+ * the section 3C man pages.
+ * Interface stability: Committed
+ */
+
+#include <sys/types.h>
+#ifdef _KERNEL
+#include <sys/param.h>
+#include <sys/sysmacros.h>
+#include <sys/debug.h>
+#include <sys/kmem.h>
+#include <sys/sunddi.h>
+#else
+#include <sys/u8_textprep.h>
+#endif /* _KERNEL */
+#include <sys/byteorder.h>
+#include <sys/errno.h>
+
+
+/*
+ * The max and min values of high and low surrogate pairs of UTF-16,
+ * UTF-16 bit shift value, bit mask, and starting value outside of BMP.
+ */
+#define UCONV_U16_HI_MIN (0xd800U)
+#define UCONV_U16_HI_MAX (0xdbffU)
+#define UCONV_U16_LO_MIN (0xdc00U)
+#define UCONV_U16_LO_MAX (0xdfffU)
+#define UCONV_U16_BIT_SHIFT (0x0400U)
+#define UCONV_U16_BIT_MASK (0x0fffffU)
+#define UCONV_U16_START (0x010000U)
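+
+/*
+ * For example, U+10437 is encoded in UTF-16 as the surrogate pair
+ * 0xD801 0xDC37: (0xD801 - 0xD800) * 0x0400 + (0xDC37 - 0xDC00) = 0x0437,
+ * and 0x0437 + 0x010000 = 0x10437. uconv_u16tou32() below reassembles such
+ * a pair using exactly this arithmetic.
+ */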
+
+/* The maximum value of Unicode coding space and ASCII coding space. */
+#define UCONV_UNICODE_MAX (0x10ffffU)
+#define UCONV_ASCII_MAX (0x7fU)
+
+/* The mask values for input and output endians. */
+#define UCONV_IN_ENDIAN_MASKS (UCONV_IN_BIG_ENDIAN | UCONV_IN_LITTLE_ENDIAN)
+#define UCONV_OUT_ENDIAN_MASKS (UCONV_OUT_BIG_ENDIAN | UCONV_OUT_LITTLE_ENDIAN)
+
+/* Native and reversed endian macros. */
+#ifdef _ZFS_BIG_ENDIAN
+#define UCONV_IN_NAT_ENDIAN UCONV_IN_BIG_ENDIAN
+#define UCONV_IN_REV_ENDIAN UCONV_IN_LITTLE_ENDIAN
+#define UCONV_OUT_NAT_ENDIAN UCONV_OUT_BIG_ENDIAN
+#define UCONV_OUT_REV_ENDIAN UCONV_OUT_LITTLE_ENDIAN
+#else
+#define UCONV_IN_NAT_ENDIAN UCONV_IN_LITTLE_ENDIAN
+#define UCONV_IN_REV_ENDIAN UCONV_IN_BIG_ENDIAN
+#define UCONV_OUT_NAT_ENDIAN UCONV_OUT_LITTLE_ENDIAN
+#define UCONV_OUT_REV_ENDIAN UCONV_OUT_BIG_ENDIAN
+#endif /* _BIG_ENDIAN */
+
+/* The Byte Order Mark (BOM) character in normal and reversed byte orderings. */
+#define UCONV_BOM_NORMAL (0xfeffU)
+#define UCONV_BOM_SWAPPED (0xfffeU)
+#define UCONV_BOM_SWAPPED_32 (0xfffe0000U)
+
+/* UTF-32 boundaries based on UTF-8 character byte lengths. */
+#define UCONV_U8_ONE_BYTE (0x7fU)
+#define UCONV_U8_TWO_BYTES (0x7ffU)
+#define UCONV_U8_THREE_BYTES (0xffffU)
+#define UCONV_U8_FOUR_BYTES (0x10ffffU)
+
+/* The common minimum and maximum values at the UTF-8 character bytes. */
+#define UCONV_U8_BYTE_MIN (0x80U)
+#define UCONV_U8_BYTE_MAX (0xbfU)
+
+/*
+ * The following "6" and "0x3f" come from the "10xx xxxx" bit representation
+ * of UTF-8 continuation bytes.
+ */
+#define UCONV_U8_BIT_SHIFT 6
+#define UCONV_U8_BIT_MASK 0x3f
+
+/*
+ * The following vector shows the number of remaining bytes in a UTF-8
+ * character. The index is the first byte of the character.
+ */
+static const uchar_t remaining_bytes_tbl[0x100] = {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+
+/* C0 C1 C2 C3 C4 C5 C6 C7 C8 C9 CA CB CC CD CE CF */
+ 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+
+/* D0 D1 D2 D3 D4 D5 D6 D7 D8 D9 DA DB DC DD DE DF */
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+
+/* E0 E1 E2 E3 E4 E5 E6 E7 E8 E9 EA EB EC ED EE EF */
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+
+/* F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 FA FB FC FD FE FF */
+ 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+};
+
+/*
+ * The following is a vector of bit-masks to get the used bits in
+ * the first byte of a UTF-8 character. The index is the number of remaining
+ * bytes from the table above.
+ */
+#ifdef _KERNEL
+const uchar_t u8_masks_tbl[6] = { 0x00, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
+#else
+static const uchar_t u8_masks_tbl[6] = { 0x00, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
+#endif /* _KERNEL */
+
+/*
+ * The following two vectors provide valid minimum and
+ * maximum values for the 2nd byte of a multibyte UTF-8 character for
+ * better illegal sequence checking. The index value must be the value of
+ * the first byte of the UTF-8 character.
+ */
+static const uchar_t valid_min_2nd_byte[0x100] = {
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+
+/* C0 C1 C2 C3 C4 C5 C6 C7 */
+ 0, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+
+/* C8 C9 CA CB CC CD CE CF */
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+
+/* D0 D1 D2 D3 D4 D5 D6 D7 */
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+
+/* D8 D9 DA DB DC DD DE DF */
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+
+/* E0 E1 E2 E3 E4 E5 E6 E7 */
+ 0xa0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+
+/* E8 E9 EA EB EC ED EE EF */
+ 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+
+/* F0 F1 F2 F3 F4 F5 F6 F7 */
+ 0x90, 0x80, 0x80, 0x80, 0x80, 0, 0, 0,
+
+ 0, 0, 0, 0, 0, 0, 0, 0
+};
+
+static const uchar_t valid_max_2nd_byte[0x100] = {
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+
+/* C0 C1 C2 C3 C4 C5 C6 C7 */
+ 0, 0, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
+
+/* C8 C9 CA CB CC CD CE CF */
+ 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
+
+/* D0 D1 D2 D3 D4 D5 D6 D7 */
+ 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
+
+/* D8 D9 DA DB DC DD DE DF */
+ 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
+
+/* E0 E1 E2 E3 E4 E5 E6 E7 */
+ 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
+
+/* E8 E9 EA EB EC ED EE EF */
+ 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0x9f, 0xbf, 0xbf,
+
+/* F0 F1 F2 F3 F4 F5 F6 F7 */
+ 0xbf, 0xbf, 0xbf, 0xbf, 0x8f, 0, 0, 0,
+
+ 0, 0, 0, 0, 0, 0, 0, 0
+};
+
+
+static int
+check_endian(int flag, int *in, int *out)
+{
+ *in = flag & UCONV_IN_ENDIAN_MASKS;
+
+ /* You cannot have both. */
+ if (*in == UCONV_IN_ENDIAN_MASKS)
+ return (EBADF);
+
+ if (*in == 0)
+ *in = UCONV_IN_NAT_ENDIAN;
+
+ *out = flag & UCONV_OUT_ENDIAN_MASKS;
+
+ /* You cannot have both. */
+ if (*out == UCONV_OUT_ENDIAN_MASKS)
+ return (EBADF);
+
+ if (*out == 0)
+ *out = UCONV_OUT_NAT_ENDIAN;
+
+ return (0);
+}
+
+static boolean_t
+check_bom16(const uint16_t *u16s, size_t u16l, int *in)
+{
+ if (u16l > 0) {
+ if (*u16s == UCONV_BOM_NORMAL) {
+ *in = UCONV_IN_NAT_ENDIAN;
+ return (B_TRUE);
+ }
+ if (*u16s == UCONV_BOM_SWAPPED) {
+ *in = UCONV_IN_REV_ENDIAN;
+ return (B_TRUE);
+ }
+ }
+
+ return (B_FALSE);
+}
+
+static boolean_t
+check_bom32(const uint32_t *u32s, size_t u32l, int *in)
+{
+ if (u32l > 0) {
+ if (*u32s == UCONV_BOM_NORMAL) {
+ *in = UCONV_IN_NAT_ENDIAN;
+ return (B_TRUE);
+ }
+ if (*u32s == UCONV_BOM_SWAPPED_32) {
+ *in = UCONV_IN_REV_ENDIAN;
+ return (B_TRUE);
+ }
+ }
+
+ return (B_FALSE);
+}
+
+int
+uconv_u16tou32(const uint16_t *u16s, size_t *utf16len,
+ uint32_t *u32s, size_t *utf32len, int flag)
+{
+ int inendian;
+ int outendian;
+ size_t u16l;
+ size_t u32l;
+ uint32_t hi;
+ uint32_t lo;
+ boolean_t do_not_ignore_null;
+
+ /*
+	 * Do preliminary validity checks on the parameters and collect
+	 * endianness information.
+ */
+ if (u16s == NULL || utf16len == NULL)
+ return (EILSEQ);
+
+ if (u32s == NULL || utf32len == NULL)
+ return (E2BIG);
+
+ if (check_endian(flag, &inendian, &outendian) != 0)
+ return (EBADF);
+
+ /*
+ * Initialize input and output parameter buffer indices and
+ * temporary variables.
+ */
+ u16l = u32l = 0;
+ hi = 0;
+ do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
+
+ /*
+	 * Check for a BOM at the beginning of the input buffer if requested
+	 * and, if there is one, process it.
+ */
+ if ((flag & UCONV_IN_ACCEPT_BOM) &&
+ check_bom16(u16s, *utf16len, &inendian))
+ u16l++;
+
+ /*
+	 * Reset inendian and outendian so that from this point on they can
+	 * be used as boolean condition values.
+ */
+ inendian &= UCONV_IN_NAT_ENDIAN;
+ outendian &= UCONV_OUT_NAT_ENDIAN;
+
+ /*
+	 * If there is anything in the input buffer, room in the output
+	 * buffer, and a BOM was requested, emit the BOM into the output
+	 * buffer.
+ */
+ if (*utf16len > 0 && *utf32len > 0 && (flag & UCONV_OUT_EMIT_BOM))
+ u32s[u32l++] = (outendian) ? UCONV_BOM_NORMAL :
+ UCONV_BOM_SWAPPED_32;
+
+ /*
+	 * Do the conversion; when we encounter a surrogate pair, assemble
+	 * the high and low halves into a single UTF-32 character. If half
+	 * of a pair appears on its own, it is either an illegal (EILSEQ)
+	 * or an incomplete (EINVAL) sequence.
+ */
+ for (; u16l < *utf16len; u16l++) {
+ if (u16s[u16l] == 0 && do_not_ignore_null)
+ break;
+
+ lo = (uint32_t)((inendian) ? u16s[u16l] : BSWAP_16(u16s[u16l]));
+
+ if (lo >= UCONV_U16_HI_MIN && lo <= UCONV_U16_HI_MAX) {
+ if (hi)
+ return (EILSEQ);
+ hi = lo;
+ continue;
+ } else if (lo >= UCONV_U16_LO_MIN && lo <= UCONV_U16_LO_MAX) {
+ if (! hi)
+ return (EILSEQ);
+ lo = (((hi - UCONV_U16_HI_MIN) * UCONV_U16_BIT_SHIFT +
+ lo - UCONV_U16_LO_MIN) & UCONV_U16_BIT_MASK)
+ + UCONV_U16_START;
+ hi = 0;
+ } else if (hi) {
+ return (EILSEQ);
+ }
+
+ if (u32l >= *utf32len)
+ return (E2BIG);
+
+ u32s[u32l++] = (outendian) ? lo : BSWAP_32(lo);
+ }
+
+ /*
+	 * If a high surrogate was never followed by a low surrogate, the
+	 * input is most likely incomplete.
+ */
+ if (hi)
+ return (EINVAL);
+
+ /*
+	 * Save the number of consumed and emitted units. Neither count
+	 * includes the terminating NULL character (U+0000) at the end of
+ * the input buffer (even when UCONV_IGNORE_NULL isn't specified and
+ * the input buffer length is big enough to include the terminating
+ * NULL character).
+ */
+ *utf16len = u16l;
+ *utf32len = u32l;
+
+ return (0);
+}
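+
+/*
+ * Minimal usage sketch for uconv_u16tou32(); the function below is a
+ * hypothetical example, and the buffer sizes and flag choice are arbitrary.
+ * The leading BOM is consumed and the surrogate pair 0xd83d 0xde00 is
+ * assembled into the single UTF-32 value 0x1f600.
+ */
+static int
+uconv_u16tou32_example(void)
+{
+	uint16_t in[3] = { 0xfeff, 0xd83d, 0xde00 };
+	uint32_t out[4];
+	size_t inlen = 3;
+	size_t outlen = 4;
+	int err;
+
+	err = uconv_u16tou32(in, &inlen, out, &outlen, UCONV_IN_ACCEPT_BOM);
+	/* On success, outlen is 1 and out[0] is 0x1f600. */
+	return (err);
+}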
+
+int
+uconv_u16tou8(const uint16_t *u16s, size_t *utf16len,
+ uchar_t *u8s, size_t *utf8len, int flag)
+{
+ int inendian;
+ int outendian;
+ size_t u16l;
+ size_t u8l;
+ uint32_t hi;
+ uint32_t lo;
+ boolean_t do_not_ignore_null;
+
+ if (u16s == NULL || utf16len == NULL)
+ return (EILSEQ);
+
+ if (u8s == NULL || utf8len == NULL)
+ return (E2BIG);
+
+ if (check_endian(flag, &inendian, &outendian) != 0)
+ return (EBADF);
+
+ u16l = u8l = 0;
+ hi = 0;
+ do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
+
+ if ((flag & UCONV_IN_ACCEPT_BOM) &&
+ check_bom16(u16s, *utf16len, &inendian))
+ u16l++;
+
+ inendian &= UCONV_IN_NAT_ENDIAN;
+
+ for (; u16l < *utf16len; u16l++) {
+ if (u16s[u16l] == 0 && do_not_ignore_null)
+ break;
+
+ lo = (uint32_t)((inendian) ? u16s[u16l] : BSWAP_16(u16s[u16l]));
+
+ if (lo >= UCONV_U16_HI_MIN && lo <= UCONV_U16_HI_MAX) {
+ if (hi)
+ return (EILSEQ);
+ hi = lo;
+ continue;
+ } else if (lo >= UCONV_U16_LO_MIN && lo <= UCONV_U16_LO_MAX) {
+ if (! hi)
+ return (EILSEQ);
+ lo = (((hi - UCONV_U16_HI_MIN) * UCONV_U16_BIT_SHIFT +
+ lo - UCONV_U16_LO_MIN) & UCONV_U16_BIT_MASK)
+ + UCONV_U16_START;
+ hi = 0;
+ } else if (hi) {
+ return (EILSEQ);
+ }
+
+ /*
+ * Now we convert a UTF-32 character into a UTF-8 character.
+ * Unicode coding space is between U+0000 and U+10FFFF;
+ * anything bigger is an illegal character.
+ */
+ if (lo <= UCONV_U8_ONE_BYTE) {
+ if (u8l >= *utf8len)
+ return (E2BIG);
+ u8s[u8l++] = (uchar_t)lo;
+ } else if (lo <= UCONV_U8_TWO_BYTES) {
+ if ((u8l + 1) >= *utf8len)
+ return (E2BIG);
+ u8s[u8l++] = (uchar_t)(0xc0 | ((lo & 0x07c0) >> 6));
+ u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x003f));
+ } else if (lo <= UCONV_U8_THREE_BYTES) {
+ if ((u8l + 2) >= *utf8len)
+ return (E2BIG);
+ u8s[u8l++] = (uchar_t)(0xe0 | ((lo & 0x0f000) >> 12));
+ u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x00fc0) >> 6));
+ u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x0003f));
+ } else if (lo <= UCONV_U8_FOUR_BYTES) {
+ if ((u8l + 3) >= *utf8len)
+ return (E2BIG);
+ u8s[u8l++] = (uchar_t)(0xf0 | ((lo & 0x01c0000) >> 18));
+ u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x003f000) >> 12));
+ u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x0000fc0) >> 6));
+ u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x000003f));
+ } else {
+ return (EILSEQ);
+ }
+ }
+
+ if (hi)
+ return (EINVAL);
+
+ *utf16len = u16l;
+ *utf8len = u8l;
+
+ return (0);
+}
+
+int
+uconv_u32tou16(const uint32_t *u32s, size_t *utf32len,
+ uint16_t *u16s, size_t *utf16len, int flag)
+{
+ int inendian;
+ int outendian;
+ size_t u16l;
+ size_t u32l;
+ uint32_t hi;
+ uint32_t lo;
+ boolean_t do_not_ignore_null;
+
+ if (u32s == NULL || utf32len == NULL)
+ return (EILSEQ);
+
+ if (u16s == NULL || utf16len == NULL)
+ return (E2BIG);
+
+ if (check_endian(flag, &inendian, &outendian) != 0)
+ return (EBADF);
+
+ u16l = u32l = 0;
+ do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
+
+ if ((flag & UCONV_IN_ACCEPT_BOM) &&
+ check_bom32(u32s, *utf32len, &inendian))
+ u32l++;
+
+ inendian &= UCONV_IN_NAT_ENDIAN;
+ outendian &= UCONV_OUT_NAT_ENDIAN;
+
+ if (*utf32len > 0 && *utf16len > 0 && (flag & UCONV_OUT_EMIT_BOM))
+ u16s[u16l++] = (outendian) ? UCONV_BOM_NORMAL :
+ UCONV_BOM_SWAPPED;
+
+ for (; u32l < *utf32len; u32l++) {
+ if (u32s[u32l] == 0 && do_not_ignore_null)
+ break;
+
+ hi = (inendian) ? u32s[u32l] : BSWAP_32(u32s[u32l]);
+
+ /*
+ * Anything bigger than the Unicode coding space, i.e.,
+ * Unicode scalar value bigger than U+10FFFF, is an illegal
+ * character.
+ */
+ if (hi > UCONV_UNICODE_MAX)
+ return (EILSEQ);
+
+ /*
+ * Anything bigger than U+FFFF must be converted into
+ * a surrogate pair in UTF-16.
+ */
+ if (hi >= UCONV_U16_START) {
+ lo = ((hi - UCONV_U16_START) % UCONV_U16_BIT_SHIFT) +
+ UCONV_U16_LO_MIN;
+ hi = ((hi - UCONV_U16_START) / UCONV_U16_BIT_SHIFT) +
+ UCONV_U16_HI_MIN;
+
+ if ((u16l + 1) >= *utf16len)
+ return (E2BIG);
+
+ if (outendian) {
+ u16s[u16l++] = (uint16_t)hi;
+ u16s[u16l++] = (uint16_t)lo;
+ } else {
+ u16s[u16l++] = BSWAP_16(((uint16_t)hi));
+ u16s[u16l++] = BSWAP_16(((uint16_t)lo));
+ }
+ } else {
+ if (u16l >= *utf16len)
+ return (E2BIG);
+ u16s[u16l++] = (outendian) ? (uint16_t)hi :
+ BSWAP_16(((uint16_t)hi));
+ }
+ }
+
+ *utf16len = u16l;
+ *utf32len = u32l;
+
+ return (0);
+}
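+
+/*
+ * Worked example of the surrogate split above (an illustrative sketch, not
+ * used elsewhere): 0x1f600 - 0x10000 = 0xf600, 0xf600 / 0x400 = 0x3d and
+ * 0xf600 % 0x400 = 0x200, so the high surrogate is 0xd800 + 0x3d = 0xd83d
+ * and the low surrogate is 0xdc00 + 0x200 = 0xde00.
+ */
+static void
+uconv_u32tou16_example(void)
+{
+	uint32_t in[1] = { 0x1f600 };
+	uint16_t out[2];
+	size_t inlen = 1;
+	size_t outlen = 2;
+
+	/* On success, out[] holds 0xd83d followed by 0xde00. */
+	(void) uconv_u32tou16(in, &inlen, out, &outlen, 0);
+}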
+
+int
+uconv_u32tou8(const uint32_t *u32s, size_t *utf32len,
+ uchar_t *u8s, size_t *utf8len, int flag)
+{
+ int inendian;
+ int outendian;
+ size_t u32l;
+ size_t u8l;
+ uint32_t lo;
+ boolean_t do_not_ignore_null;
+
+ if (u32s == NULL || utf32len == NULL)
+ return (EILSEQ);
+
+ if (u8s == NULL || utf8len == NULL)
+ return (E2BIG);
+
+ if (check_endian(flag, &inendian, &outendian) != 0)
+ return (EBADF);
+
+ u32l = u8l = 0;
+ do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
+
+ if ((flag & UCONV_IN_ACCEPT_BOM) &&
+ check_bom32(u32s, *utf32len, &inendian))
+ u32l++;
+
+ inendian &= UCONV_IN_NAT_ENDIAN;
+
+ for (; u32l < *utf32len; u32l++) {
+ if (u32s[u32l] == 0 && do_not_ignore_null)
+ break;
+
+ lo = (inendian) ? u32s[u32l] : BSWAP_32(u32s[u32l]);
+
+ if (lo <= UCONV_U8_ONE_BYTE) {
+ if (u8l >= *utf8len)
+ return (E2BIG);
+ u8s[u8l++] = (uchar_t)lo;
+ } else if (lo <= UCONV_U8_TWO_BYTES) {
+ if ((u8l + 1) >= *utf8len)
+ return (E2BIG);
+ u8s[u8l++] = (uchar_t)(0xc0 | ((lo & 0x07c0) >> 6));
+ u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x003f));
+ } else if (lo <= UCONV_U8_THREE_BYTES) {
+ if ((u8l + 2) >= *utf8len)
+ return (E2BIG);
+ u8s[u8l++] = (uchar_t)(0xe0 | ((lo & 0x0f000) >> 12));
+ u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x00fc0) >> 6));
+ u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x0003f));
+ } else if (lo <= UCONV_U8_FOUR_BYTES) {
+ if ((u8l + 3) >= *utf8len)
+ return (E2BIG);
+ u8s[u8l++] = (uchar_t)(0xf0 | ((lo & 0x01c0000) >> 18));
+ u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x003f000) >> 12));
+ u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x0000fc0) >> 6));
+ u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x000003f));
+ } else {
+ return (EILSEQ);
+ }
+ }
+
+ *utf32len = u32l;
+ *utf8len = u8l;
+
+ return (0);
+}
+
+int
+uconv_u8tou16(const uchar_t *u8s, size_t *utf8len,
+ uint16_t *u16s, size_t *utf16len, int flag)
+{
+ int inendian;
+ int outendian;
+ size_t u16l;
+ size_t u8l;
+ uint32_t hi;
+ uint32_t lo;
+ int remaining_bytes;
+ int first_b;
+ boolean_t do_not_ignore_null;
+
+ if (u8s == NULL || utf8len == NULL)
+ return (EILSEQ);
+
+ if (u16s == NULL || utf16len == NULL)
+ return (E2BIG);
+
+ if (check_endian(flag, &inendian, &outendian) != 0)
+ return (EBADF);
+
+ u16l = u8l = 0;
+ do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
+
+ outendian &= UCONV_OUT_NAT_ENDIAN;
+
+ if (*utf8len > 0 && *utf16len > 0 && (flag & UCONV_OUT_EMIT_BOM))
+ u16s[u16l++] = (outendian) ? UCONV_BOM_NORMAL :
+ UCONV_BOM_SWAPPED;
+
+ for (; u8l < *utf8len; ) {
+ if (u8s[u8l] == 0 && do_not_ignore_null)
+ break;
+
+ /*
+ * Collect a UTF-8 character and convert it to a UTF-32
+ * character. In doing so, we screen out illegally formed
+		 * UTF-8 sequences and treat them as illegal characters.
+		 * The algorithm below also screens out anything beyond
+		 * U+10FFFF.
+ *
+ * See Unicode 3.1 UTF-8 Corrigendum and Unicode 3.2 for
+ * more details on the illegal values of UTF-8 character
+ * bytes.
+ */
+ hi = (uint32_t)u8s[u8l++];
+
+ if (hi > UCONV_ASCII_MAX) {
+ if ((remaining_bytes = remaining_bytes_tbl[hi]) == 0)
+ return (EILSEQ);
+
+ first_b = hi;
+ hi = hi & u8_masks_tbl[remaining_bytes];
+
+ for (; remaining_bytes > 0; remaining_bytes--) {
+ /*
+ * If we have no more bytes, the current
+ * UTF-8 character is incomplete.
+ */
+ if (u8l >= *utf8len)
+ return (EINVAL);
+
+ lo = (uint32_t)u8s[u8l++];
+
+ if (first_b) {
+ if (lo < valid_min_2nd_byte[first_b] ||
+ lo > valid_max_2nd_byte[first_b])
+ return (EILSEQ);
+ first_b = 0;
+ } else if (lo < UCONV_U8_BYTE_MIN ||
+ lo > UCONV_U8_BYTE_MAX) {
+ return (EILSEQ);
+ }
+ hi = (hi << UCONV_U8_BIT_SHIFT) |
+ (lo & UCONV_U8_BIT_MASK);
+ }
+ }
+
+ if (hi >= UCONV_U16_START) {
+ lo = ((hi - UCONV_U16_START) % UCONV_U16_BIT_SHIFT) +
+ UCONV_U16_LO_MIN;
+ hi = ((hi - UCONV_U16_START) / UCONV_U16_BIT_SHIFT) +
+ UCONV_U16_HI_MIN;
+
+ if ((u16l + 1) >= *utf16len)
+ return (E2BIG);
+
+ if (outendian) {
+ u16s[u16l++] = (uint16_t)hi;
+ u16s[u16l++] = (uint16_t)lo;
+ } else {
+ u16s[u16l++] = BSWAP_16(((uint16_t)hi));
+ u16s[u16l++] = BSWAP_16(((uint16_t)lo));
+ }
+ } else {
+ if (u16l >= *utf16len)
+ return (E2BIG);
+
+ u16s[u16l++] = (outendian) ? (uint16_t)hi :
+ BSWAP_16(((uint16_t)hi));
+ }
+ }
+
+ *utf16len = u16l;
+ *utf8len = u8l;
+
+ return (0);
+}
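+
+/*
+ * Usage sketch for uconv_u8tou16() (illustration only; this helper is not
+ * used elsewhere): the four-byte UTF-8 sequence f0 9f 98 80 encodes U+1F600
+ * and decodes to the UTF-16 surrogate pair 0xd83d 0xde00.
+ */
+static void
+uconv_u8tou16_example(void)
+{
+	uchar_t in[4] = { 0xf0, 0x9f, 0x98, 0x80 };
+	uint16_t out[2];
+	size_t inlen = 4;
+	size_t outlen = 2;
+
+	(void) uconv_u8tou16(in, &inlen, out, &outlen, 0);
+}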
+
+int
+uconv_u8tou32(const uchar_t *u8s, size_t *utf8len,
+ uint32_t *u32s, size_t *utf32len, int flag)
+{
+ int inendian;
+ int outendian;
+ size_t u32l;
+ size_t u8l;
+ uint32_t hi;
+ uint32_t c;
+ int remaining_bytes;
+ int first_b;
+ boolean_t do_not_ignore_null;
+
+ if (u8s == NULL || utf8len == NULL)
+ return (EILSEQ);
+
+ if (u32s == NULL || utf32len == NULL)
+ return (E2BIG);
+
+ if (check_endian(flag, &inendian, &outendian) != 0)
+ return (EBADF);
+
+ u32l = u8l = 0;
+ do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
+
+ outendian &= UCONV_OUT_NAT_ENDIAN;
+
+ if (*utf8len > 0 && *utf32len > 0 && (flag & UCONV_OUT_EMIT_BOM))
+ u32s[u32l++] = (outendian) ? UCONV_BOM_NORMAL :
+ UCONV_BOM_SWAPPED_32;
+
+ for (; u8l < *utf8len; ) {
+ if (u8s[u8l] == 0 && do_not_ignore_null)
+ break;
+
+ hi = (uint32_t)u8s[u8l++];
+
+ if (hi > UCONV_ASCII_MAX) {
+ if ((remaining_bytes = remaining_bytes_tbl[hi]) == 0)
+ return (EILSEQ);
+
+ first_b = hi;
+ hi = hi & u8_masks_tbl[remaining_bytes];
+
+ for (; remaining_bytes > 0; remaining_bytes--) {
+ if (u8l >= *utf8len)
+ return (EINVAL);
+
+ c = (uint32_t)u8s[u8l++];
+
+ if (first_b) {
+ if (c < valid_min_2nd_byte[first_b] ||
+ c > valid_max_2nd_byte[first_b])
+ return (EILSEQ);
+ first_b = 0;
+ } else if (c < UCONV_U8_BYTE_MIN ||
+ c > UCONV_U8_BYTE_MAX) {
+ return (EILSEQ);
+ }
+ hi = (hi << UCONV_U8_BIT_SHIFT) |
+ (c & UCONV_U8_BIT_MASK);
+ }
+ }
+
+ if (u32l >= *utf32len)
+ return (E2BIG);
+
+ u32s[u32l++] = (outendian) ? hi : BSWAP_32(hi);
+ }
+
+ *utf32len = u32l;
+ *utf8len = u8l;
+
+ return (0);
+}
+
+#if defined(_KERNEL)
+EXPORT_SYMBOL(uconv_u16tou32);
+EXPORT_SYMBOL(uconv_u16tou8);
+EXPORT_SYMBOL(uconv_u32tou16);
+EXPORT_SYMBOL(uconv_u32tou8);
+EXPORT_SYMBOL(uconv_u8tou16);
+EXPORT_SYMBOL(uconv_u8tou32);
+#endif
diff --git a/sys/contrib/openzfs/module/zcommon/Makefile.in b/sys/contrib/openzfs/module/zcommon/Makefile.in
new file mode 100644
index 000000000000..ebc538440445
--- /dev/null
+++ b/sys/contrib/openzfs/module/zcommon/Makefile.in
@@ -0,0 +1,28 @@
+ifneq ($(KBUILD_EXTMOD),)
+src = @abs_srcdir@
+obj = @abs_builddir@
+endif
+
+MODULE := zcommon
+
+obj-$(CONFIG_ZFS) := $(MODULE).o
+
+# Suppress unused-value warnings in sparc64 architecture headers
+ccflags-$(CONFIG_SPARC64) += -Wno-unused-value
+
+$(MODULE)-objs += cityhash.o
+$(MODULE)-objs += zfeature_common.o
+$(MODULE)-objs += zfs_comutil.o
+$(MODULE)-objs += zfs_deleg.o
+$(MODULE)-objs += zfs_fletcher.o
+$(MODULE)-objs += zfs_fletcher_superscalar.o
+$(MODULE)-objs += zfs_fletcher_superscalar4.o
+$(MODULE)-objs += zfs_namecheck.o
+$(MODULE)-objs += zfs_prop.o
+$(MODULE)-objs += zpool_prop.o
+$(MODULE)-objs += zprop_common.o
+
+$(MODULE)-$(CONFIG_X86) += zfs_fletcher_intel.o
+$(MODULE)-$(CONFIG_X86) += zfs_fletcher_sse.o
+$(MODULE)-$(CONFIG_X86) += zfs_fletcher_avx512.o
+$(MODULE)-$(CONFIG_ARM64) += zfs_fletcher_aarch64_neon.o
diff --git a/sys/contrib/openzfs/module/zcommon/cityhash.c b/sys/contrib/openzfs/module/zcommon/cityhash.c
new file mode 100644
index 000000000000..413a96df2cda
--- /dev/null
+++ b/sys/contrib/openzfs/module/zcommon/cityhash.c
@@ -0,0 +1,67 @@
+// Copyright (c) 2011 Google, Inc.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+/*
+ * Copyright (c) 2017 by Delphix. All rights reserved.
+ */
+
+#include <cityhash.h>
+
+#define HASH_K1 0xb492b66fbe98f273ULL
+#define HASH_K2 0x9ae16a3b2f90404fULL
+
+/*
+ * Bitwise right rotate. Normally this will compile to a single
+ * instruction.
+ */
+static inline uint64_t
+rotate(uint64_t val, int shift)
+{
+ // Avoid shifting by 64: doing so yields an undefined result.
+ return (shift == 0 ? val : (val >> shift) | (val << (64 - shift)));
+}
+
+static inline uint64_t
+cityhash_helper(uint64_t u, uint64_t v, uint64_t mul)
+{
+ uint64_t a = (u ^ v) * mul;
+ a ^= (a >> 47);
+ uint64_t b = (v ^ a) * mul;
+ b ^= (b >> 47);
+ b *= mul;
+ return (b);
+}
+
+uint64_t
+cityhash4(uint64_t w1, uint64_t w2, uint64_t w3, uint64_t w4)
+{
+ uint64_t mul = HASH_K2 + 64;
+ uint64_t a = w1 * HASH_K1;
+ uint64_t b = w2;
+ uint64_t c = w4 * mul;
+ uint64_t d = w3 * HASH_K2;
+ return (cityhash_helper(rotate(a + b, 43) + rotate(c, 30) + d,
+ a + rotate(b + HASH_K2, 18) + c, mul));
+}
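+
+/*
+ * Usage sketch (illustrative only): cityhash4() hashes four 64-bit inputs,
+ * for example the fields of a tuple, into a single 64-bit value. The inputs
+ * below are arbitrary.
+ */
+static uint64_t
+cityhash4_example(void)
+{
+	return (cityhash4(1ULL, 2ULL, 3ULL, 4ULL));
+}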
+
+#if defined(_KERNEL)
+EXPORT_SYMBOL(cityhash4);
+#endif
diff --git a/sys/contrib/openzfs/module/zcommon/zfeature_common.c b/sys/contrib/openzfs/module/zcommon/zfeature_common.c
new file mode 100644
index 000000000000..e95a85e89ba2
--- /dev/null
+++ b/sys/contrib/openzfs/module/zcommon/zfeature_common.c
@@ -0,0 +1,609 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2014, Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2017, Intel Corporation.
+ * Copyright (c) 2019, Klara Inc.
+ * Copyright (c) 2019, Allan Jude
+ */
+
+#ifndef _KERNEL
+#include <errno.h>
+#include <string.h>
+#include <sys/stat.h>
+#endif
+#include <sys/debug.h>
+#include <sys/fs/zfs.h>
+#include <sys/inttypes.h>
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/zfs_sysfs.h>
+#include "zfeature_common.h"
+
+/*
+ * Set to disable all feature checks while opening pools, allowing pools with
+ * unsupported features to be opened. Set for testing only.
+ */
+boolean_t zfeature_checks_disable = B_FALSE;
+
+zfeature_info_t spa_feature_table[SPA_FEATURES];
+
+/*
+ * Valid characters for feature guids. This list is mainly for aesthetic
+ * purposes and could be expanded in the future. Different characters are
+ * allowed in the guid's reverse-DNS portion (before the colon) and in its
+ * short name (after the colon).
+ */
+static int
+valid_char(char c, boolean_t after_colon)
+{
+ return ((c >= 'a' && c <= 'z') ||
+ (c >= '0' && c <= '9') ||
+ (after_colon && c == '_') ||
+ (!after_colon && (c == '.' || c == '-')));
+}
+
+/*
+ * Every feature guid must contain exactly one colon which separates a reverse
+ * dns organization name from the feature's "short" name (e.g.
+ * "com.company:feature_name").
+ */
+boolean_t
+zfeature_is_valid_guid(const char *name)
+{
+ int i;
+ boolean_t has_colon = B_FALSE;
+
+ i = 0;
+ while (name[i] != '\0') {
+ char c = name[i++];
+ if (c == ':') {
+ if (has_colon)
+ return (B_FALSE);
+ has_colon = B_TRUE;
+ continue;
+ }
+ if (!valid_char(c, has_colon))
+ return (B_FALSE);
+ }
+
+ return (has_colon);
+}
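+
+/*
+ * Examples of the rules above (an illustrative sketch; the function below is
+ * hypothetical and not used elsewhere): exactly one colon, with lower case
+ * letters, digits, '.' and '-' allowed before the colon and '_' after it.
+ */
+static void
+zfeature_guid_examples(void)
+{
+	ASSERT(zfeature_is_valid_guid("com.delphix:async_destroy"));
+	ASSERT(!zfeature_is_valid_guid("com.delphix"));		/* no colon */
+	ASSERT(!zfeature_is_valid_guid("com.delphix:a:b"));	/* two colons */
+	ASSERT(!zfeature_is_valid_guid("com.delphix:Async"));	/* upper case */
+}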
+
+boolean_t
+zfeature_is_supported(const char *guid)
+{
+ if (zfeature_checks_disable)
+ return (B_TRUE);
+
+ for (spa_feature_t i = 0; i < SPA_FEATURES; i++) {
+ zfeature_info_t *feature = &spa_feature_table[i];
+ if (strcmp(guid, feature->fi_guid) == 0)
+ return (B_TRUE);
+ }
+ return (B_FALSE);
+}
+
+int
+zfeature_lookup_guid(const char *guid, spa_feature_t *res)
+{
+ for (spa_feature_t i = 0; i < SPA_FEATURES; i++) {
+ zfeature_info_t *feature = &spa_feature_table[i];
+ if (!feature->fi_zfs_mod_supported)
+ continue;
+ if (strcmp(guid, feature->fi_guid) == 0) {
+ if (res != NULL)
+ *res = i;
+ return (0);
+ }
+ }
+
+ return (ENOENT);
+}
+
+int
+zfeature_lookup_name(const char *name, spa_feature_t *res)
+{
+ for (spa_feature_t i = 0; i < SPA_FEATURES; i++) {
+ zfeature_info_t *feature = &spa_feature_table[i];
+ if (!feature->fi_zfs_mod_supported)
+ continue;
+ if (strcmp(name, feature->fi_uname) == 0) {
+ if (res != NULL)
+ *res = i;
+ return (0);
+ }
+ }
+
+ return (ENOENT);
+}
+
+boolean_t
+zfeature_depends_on(spa_feature_t fid, spa_feature_t check)
+{
+ zfeature_info_t *feature = &spa_feature_table[fid];
+
+ for (int i = 0; feature->fi_depends[i] != SPA_FEATURE_NONE; i++) {
+ if (feature->fi_depends[i] == check)
+ return (B_TRUE);
+ }
+ return (B_FALSE);
+}
+
+static boolean_t
+deps_contains_feature(const spa_feature_t *deps, const spa_feature_t feature)
+{
+ for (int i = 0; deps[i] != SPA_FEATURE_NONE; i++)
+ if (deps[i] == feature)
+ return (B_TRUE);
+
+ return (B_FALSE);
+}
+
+#if !defined(_KERNEL) && !defined(LIB_ZPOOL_BUILD)
+static boolean_t
+zfs_mod_supported_impl(const char *scope, const char *name, const char *sysfs)
+{
+ boolean_t supported = B_FALSE;
+ char *path;
+
+ int len = asprintf(&path, "%s%s%s%s%s", sysfs,
+ scope == NULL ? "" : "/", scope == NULL ? "" : scope,
+ name == NULL ? "" : "/", name == NULL ? "" : name);
+ if (len > 0) {
+ struct stat64 statbuf;
+ supported = !!(stat64(path, &statbuf) == 0);
+ free(path);
+ }
+
+ return (supported);
+}
+
+boolean_t
+zfs_mod_supported(const char *scope, const char *name)
+{
+ boolean_t supported;
+
+ /*
+ * Check both the primary and alternate sysfs locations to determine
+ * if the required functionality is supported.
+ */
+ supported = (zfs_mod_supported_impl(scope, name, ZFS_SYSFS_DIR) ||
+ zfs_mod_supported_impl(scope, name, ZFS_SYSFS_ALT_DIR));
+
+ /*
+ * For backwards compatibility with kernel modules that predate
+	 * supported feature/property checking, report the feature/property
+ * as supported if the kernel module is loaded but the requested
+ * scope directory does not exist.
+ */
+ if (supported == B_FALSE) {
+ struct stat64 statbuf;
+ if ((stat64(ZFS_SYSFS_DIR, &statbuf) == 0) &&
+ !zfs_mod_supported_impl(scope, NULL, ZFS_SYSFS_DIR) &&
+ !zfs_mod_supported_impl(scope, NULL, ZFS_SYSFS_ALT_DIR)) {
+ supported = B_TRUE;
+ }
+ }
+
+ return (supported);
+}
+#endif
+
+static boolean_t
+zfs_mod_supported_feature(const char *name)
+{
+ /*
+ * The zfs module spa_feature_table[], whether in-kernel or in
+ * libzpool, always supports all the features. libzfs needs to
+ * query the running module, via sysfs, to determine which
+ * features are supported.
+ *
+ * The equivalent _can_ be done on FreeBSD by way of the sysctl
+ * tree, but this has not been done yet. Therefore, we return
+ * that all features except edonr are supported.
+ */
+#if defined(__FreeBSD__)
+ if (strcmp(name, "org.illumos:edonr") == 0)
+ return (B_FALSE);
+ else
+ return (B_TRUE);
+#elif defined(_KERNEL) || defined(LIB_ZPOOL_BUILD)
+ return (B_TRUE);
+#else
+ return (zfs_mod_supported(ZFS_SYSFS_POOL_FEATURES, name));
+#endif
+}
+
+static void
+zfeature_register(spa_feature_t fid, const char *guid, const char *name,
+ const char *desc, zfeature_flags_t flags, zfeature_type_t type,
+ const spa_feature_t *deps)
+{
+ zfeature_info_t *feature = &spa_feature_table[fid];
+ static spa_feature_t nodeps[] = { SPA_FEATURE_NONE };
+
+ ASSERT(name != NULL);
+ ASSERT(desc != NULL);
+ ASSERT((flags & ZFEATURE_FLAG_READONLY_COMPAT) == 0 ||
+ (flags & ZFEATURE_FLAG_MOS) == 0);
+ ASSERT3U(fid, <, SPA_FEATURES);
+ ASSERT(zfeature_is_valid_guid(guid));
+
+ if (deps == NULL)
+ deps = nodeps;
+
+ VERIFY(((flags & ZFEATURE_FLAG_PER_DATASET) == 0) ||
+ (deps_contains_feature(deps, SPA_FEATURE_EXTENSIBLE_DATASET)));
+
+ feature->fi_feature = fid;
+ feature->fi_guid = guid;
+ feature->fi_uname = name;
+ feature->fi_desc = desc;
+ feature->fi_flags = flags;
+ feature->fi_type = type;
+ feature->fi_depends = deps;
+ feature->fi_zfs_mod_supported = zfs_mod_supported_feature(guid);
+}
+
+/*
+ * Every feature has a GUID of the form com.example:feature_name. The
+ * reversed DNS name ensures that the feature's GUID is unique across all ZFS
+ * implementations. This allows companies to independently develop and
+ * release features. Examples include org.delphix and org.datto. Previously,
+ * features developed on one implementation have used that implementation's
+ * domain name (e.g. org.illumos and org.zfsonlinux). Use of the org.openzfs
+ * domain name is recommended for new features which are developed by the
+ * OpenZFS community and its platforms. This domain may optionally be used by
+ * companies developing features for initial release through an OpenZFS
+ * implementation. Use of the org.openzfs domain requires reserving the
+ * feature name in advance with the OpenZFS project.
+ */
+void
+zpool_feature_init(void)
+{
+ zfeature_register(SPA_FEATURE_ASYNC_DESTROY,
+ "com.delphix:async_destroy", "async_destroy",
+ "Destroy filesystems asynchronously.",
+ ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, NULL);
+
+ zfeature_register(SPA_FEATURE_EMPTY_BPOBJ,
+ "com.delphix:empty_bpobj", "empty_bpobj",
+ "Snapshots use less space.",
+ ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, NULL);
+
+ zfeature_register(SPA_FEATURE_LZ4_COMPRESS,
+ "org.illumos:lz4_compress", "lz4_compress",
+ "LZ4 compression algorithm support.",
+ ZFEATURE_FLAG_ACTIVATE_ON_ENABLE, ZFEATURE_TYPE_BOOLEAN, NULL);
+
+ zfeature_register(SPA_FEATURE_MULTI_VDEV_CRASH_DUMP,
+ "com.joyent:multi_vdev_crash_dump", "multi_vdev_crash_dump",
+ "Crash dumps to multiple vdev pools.",
+ 0, ZFEATURE_TYPE_BOOLEAN, NULL);
+
+ zfeature_register(SPA_FEATURE_SPACEMAP_HISTOGRAM,
+ "com.delphix:spacemap_histogram", "spacemap_histogram",
+ "Spacemaps maintain space histograms.",
+ ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, NULL);
+
+ zfeature_register(SPA_FEATURE_ENABLED_TXG,
+ "com.delphix:enabled_txg", "enabled_txg",
+ "Record txg at which a feature is enabled",
+ ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, NULL);
+
+ {
+ static const spa_feature_t hole_birth_deps[] = {
+ SPA_FEATURE_ENABLED_TXG,
+ SPA_FEATURE_NONE
+ };
+ zfeature_register(SPA_FEATURE_HOLE_BIRTH,
+ "com.delphix:hole_birth", "hole_birth",
+ "Retain hole birth txg for more precise zfs send",
+ ZFEATURE_FLAG_MOS | ZFEATURE_FLAG_ACTIVATE_ON_ENABLE,
+ ZFEATURE_TYPE_BOOLEAN, hole_birth_deps);
+ }
+
+ zfeature_register(SPA_FEATURE_POOL_CHECKPOINT,
+ "com.delphix:zpool_checkpoint", "zpool_checkpoint",
+ "Pool state can be checkpointed, allowing rewind later.",
+ ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, NULL);
+
+ zfeature_register(SPA_FEATURE_SPACEMAP_V2,
+ "com.delphix:spacemap_v2", "spacemap_v2",
+ "Space maps representing large segments are more efficient.",
+ ZFEATURE_FLAG_READONLY_COMPAT | ZFEATURE_FLAG_ACTIVATE_ON_ENABLE,
+ ZFEATURE_TYPE_BOOLEAN, NULL);
+
+ zfeature_register(SPA_FEATURE_EXTENSIBLE_DATASET,
+ "com.delphix:extensible_dataset", "extensible_dataset",
+ "Enhanced dataset functionality, used by other features.",
+ 0, ZFEATURE_TYPE_BOOLEAN, NULL);
+
+ {
+ static const spa_feature_t bookmarks_deps[] = {
+ SPA_FEATURE_EXTENSIBLE_DATASET,
+ SPA_FEATURE_NONE
+ };
+
+ zfeature_register(SPA_FEATURE_BOOKMARKS,
+ "com.delphix:bookmarks", "bookmarks",
+ "\"zfs bookmark\" command",
+ ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN,
+ bookmarks_deps);
+ }
+
+ {
+ static const spa_feature_t filesystem_limits_deps[] = {
+ SPA_FEATURE_EXTENSIBLE_DATASET,
+ SPA_FEATURE_NONE
+ };
+ zfeature_register(SPA_FEATURE_FS_SS_LIMIT,
+ "com.joyent:filesystem_limits", "filesystem_limits",
+ "Filesystem and snapshot limits.",
+ ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN,
+ filesystem_limits_deps);
+ }
+
+ zfeature_register(SPA_FEATURE_EMBEDDED_DATA,
+ "com.delphix:embedded_data", "embedded_data",
+ "Blocks which compress very well use even less space.",
+ ZFEATURE_FLAG_MOS | ZFEATURE_FLAG_ACTIVATE_ON_ENABLE,
+ ZFEATURE_TYPE_BOOLEAN, NULL);
+
+ {
+ static const spa_feature_t livelist_deps[] = {
+ SPA_FEATURE_EXTENSIBLE_DATASET,
+ SPA_FEATURE_NONE
+ };
+ zfeature_register(SPA_FEATURE_LIVELIST,
+ "com.delphix:livelist", "livelist",
+ "Improved clone deletion performance.",
+ ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN,
+ livelist_deps);
+ }
+
+ {
+ static const spa_feature_t log_spacemap_deps[] = {
+ SPA_FEATURE_SPACEMAP_V2,
+ SPA_FEATURE_NONE
+ };
+ zfeature_register(SPA_FEATURE_LOG_SPACEMAP,
+ "com.delphix:log_spacemap", "log_spacemap",
+ "Log metaslab changes on a single spacemap and "
+ "flush them periodically.",
+ ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN,
+ log_spacemap_deps);
+ }
+
+ {
+ static const spa_feature_t large_blocks_deps[] = {
+ SPA_FEATURE_EXTENSIBLE_DATASET,
+ SPA_FEATURE_NONE
+ };
+ zfeature_register(SPA_FEATURE_LARGE_BLOCKS,
+ "org.open-zfs:large_blocks", "large_blocks",
+ "Support for blocks larger than 128KB.",
+ ZFEATURE_FLAG_PER_DATASET, ZFEATURE_TYPE_BOOLEAN,
+ large_blocks_deps);
+ }
+
+ {
+ static const spa_feature_t large_dnode_deps[] = {
+ SPA_FEATURE_EXTENSIBLE_DATASET,
+ SPA_FEATURE_NONE
+ };
+ zfeature_register(SPA_FEATURE_LARGE_DNODE,
+ "org.zfsonlinux:large_dnode", "large_dnode",
+ "Variable on-disk size of dnodes.",
+ ZFEATURE_FLAG_PER_DATASET, ZFEATURE_TYPE_BOOLEAN,
+ large_dnode_deps);
+ }
+
+ {
+ static const spa_feature_t sha512_deps[] = {
+ SPA_FEATURE_EXTENSIBLE_DATASET,
+ SPA_FEATURE_NONE
+ };
+ zfeature_register(SPA_FEATURE_SHA512,
+ "org.illumos:sha512", "sha512",
+ "SHA-512/256 hash algorithm.",
+ ZFEATURE_FLAG_PER_DATASET, ZFEATURE_TYPE_BOOLEAN,
+ sha512_deps);
+ }
+
+ {
+ static const spa_feature_t skein_deps[] = {
+ SPA_FEATURE_EXTENSIBLE_DATASET,
+ SPA_FEATURE_NONE
+ };
+ zfeature_register(SPA_FEATURE_SKEIN,
+ "org.illumos:skein", "skein",
+ "Skein hash algorithm.",
+ ZFEATURE_FLAG_PER_DATASET, ZFEATURE_TYPE_BOOLEAN,
+ skein_deps);
+ }
+
+ {
+ static const spa_feature_t edonr_deps[] = {
+ SPA_FEATURE_EXTENSIBLE_DATASET,
+ SPA_FEATURE_NONE
+ };
+ zfeature_register(SPA_FEATURE_EDONR,
+ "org.illumos:edonr", "edonr",
+ "Edon-R hash algorithm.",
+ ZFEATURE_FLAG_PER_DATASET, ZFEATURE_TYPE_BOOLEAN,
+ edonr_deps);
+ }
+
+ {
+ static const spa_feature_t redact_books_deps[] = {
+ SPA_FEATURE_BOOKMARK_V2,
+ SPA_FEATURE_EXTENSIBLE_DATASET,
+ SPA_FEATURE_BOOKMARKS,
+ SPA_FEATURE_NONE
+ };
+ zfeature_register(SPA_FEATURE_REDACTION_BOOKMARKS,
+ "com.delphix:redaction_bookmarks", "redaction_bookmarks",
+ "Support for bookmarks which store redaction lists for zfs "
+ "redacted send/recv.", 0, ZFEATURE_TYPE_BOOLEAN,
+ redact_books_deps);
+ }
+
+ {
+ static const spa_feature_t redact_datasets_deps[] = {
+ SPA_FEATURE_EXTENSIBLE_DATASET,
+ SPA_FEATURE_NONE
+ };
+ zfeature_register(SPA_FEATURE_REDACTED_DATASETS,
+ "com.delphix:redacted_datasets", "redacted_datasets", "Support for "
+ "redacted datasets, produced by receiving a redacted zfs send "
+ "stream.", ZFEATURE_FLAG_PER_DATASET, ZFEATURE_TYPE_UINT64_ARRAY,
+ redact_datasets_deps);
+ }
+
+ {
+ static const spa_feature_t bookmark_written_deps[] = {
+ SPA_FEATURE_BOOKMARK_V2,
+ SPA_FEATURE_EXTENSIBLE_DATASET,
+ SPA_FEATURE_BOOKMARKS,
+ SPA_FEATURE_NONE
+ };
+ zfeature_register(SPA_FEATURE_BOOKMARK_WRITTEN,
+ "com.delphix:bookmark_written", "bookmark_written",
+	    "Additional accounting, enabling the written#<bookmark> property "
+ "(space written since a bookmark), and estimates of send stream "
+ "sizes for incrementals from bookmarks.",
+ 0, ZFEATURE_TYPE_BOOLEAN, bookmark_written_deps);
+ }
+
+ zfeature_register(SPA_FEATURE_DEVICE_REMOVAL,
+ "com.delphix:device_removal", "device_removal",
+ "Top-level vdevs can be removed, reducing logical pool size.",
+ ZFEATURE_FLAG_MOS, ZFEATURE_TYPE_BOOLEAN, NULL);
+
+ {
+ static const spa_feature_t obsolete_counts_deps[] = {
+ SPA_FEATURE_EXTENSIBLE_DATASET,
+ SPA_FEATURE_DEVICE_REMOVAL,
+ SPA_FEATURE_NONE
+ };
+ zfeature_register(SPA_FEATURE_OBSOLETE_COUNTS,
+ "com.delphix:obsolete_counts", "obsolete_counts",
+ "Reduce memory used by removed devices when their blocks are "
+ "freed or remapped.",
+ ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN,
+ obsolete_counts_deps);
+ }
+
+ {
+ static const spa_feature_t userobj_accounting_deps[] = {
+ SPA_FEATURE_EXTENSIBLE_DATASET,
+ SPA_FEATURE_NONE
+ };
+ zfeature_register(SPA_FEATURE_USEROBJ_ACCOUNTING,
+ "org.zfsonlinux:userobj_accounting", "userobj_accounting",
+ "User/Group object accounting.",
+ ZFEATURE_FLAG_READONLY_COMPAT | ZFEATURE_FLAG_PER_DATASET,
+ ZFEATURE_TYPE_BOOLEAN, userobj_accounting_deps);
+ }
+
+ {
+ static const spa_feature_t bookmark_v2_deps[] = {
+ SPA_FEATURE_EXTENSIBLE_DATASET,
+ SPA_FEATURE_BOOKMARKS,
+ SPA_FEATURE_NONE
+ };
+ zfeature_register(SPA_FEATURE_BOOKMARK_V2,
+ "com.datto:bookmark_v2", "bookmark_v2",
+ "Support for larger bookmarks",
+ 0, ZFEATURE_TYPE_BOOLEAN, bookmark_v2_deps);
+ }
+
+ {
+ static const spa_feature_t encryption_deps[] = {
+ SPA_FEATURE_EXTENSIBLE_DATASET,
+ SPA_FEATURE_BOOKMARK_V2,
+ SPA_FEATURE_NONE
+ };
+ zfeature_register(SPA_FEATURE_ENCRYPTION,
+ "com.datto:encryption", "encryption",
+ "Support for dataset level encryption",
+ ZFEATURE_FLAG_PER_DATASET, ZFEATURE_TYPE_BOOLEAN,
+ encryption_deps);
+ }
+
+ {
+ static const spa_feature_t project_quota_deps[] = {
+ SPA_FEATURE_EXTENSIBLE_DATASET,
+ SPA_FEATURE_NONE
+ };
+ zfeature_register(SPA_FEATURE_PROJECT_QUOTA,
+ "org.zfsonlinux:project_quota", "project_quota",
+ "space/object accounting based on project ID.",
+ ZFEATURE_FLAG_READONLY_COMPAT | ZFEATURE_FLAG_PER_DATASET,
+ ZFEATURE_TYPE_BOOLEAN, project_quota_deps);
+ }
+
+ zfeature_register(SPA_FEATURE_ALLOCATION_CLASSES,
+ "org.zfsonlinux:allocation_classes", "allocation_classes",
+ "Support for separate allocation classes.",
+ ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, NULL);
+
+ zfeature_register(SPA_FEATURE_RESILVER_DEFER,
+ "com.datto:resilver_defer", "resilver_defer",
+ "Support for deferring new resilvers when one is already running.",
+ ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, NULL);
+
+ zfeature_register(SPA_FEATURE_DEVICE_REBUILD,
+ "org.openzfs:device_rebuild", "device_rebuild",
+ "Support for sequential mirror/dRAID device rebuilds",
+ ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, NULL);
+
+ {
+ static const spa_feature_t zstd_deps[] = {
+ SPA_FEATURE_EXTENSIBLE_DATASET,
+ SPA_FEATURE_NONE
+ };
+ zfeature_register(SPA_FEATURE_ZSTD_COMPRESS,
+ "org.freebsd:zstd_compress", "zstd_compress",
+ "zstd compression algorithm support.",
+ ZFEATURE_FLAG_PER_DATASET, ZFEATURE_TYPE_BOOLEAN, zstd_deps);
+ }
+
+ zfeature_register(SPA_FEATURE_DRAID,
+ "org.openzfs:draid", "draid", "Support for distributed spare RAID",
+ ZFEATURE_FLAG_MOS, ZFEATURE_TYPE_BOOLEAN, NULL);
+}
+
+#if defined(_KERNEL)
+EXPORT_SYMBOL(zfeature_lookup_guid);
+EXPORT_SYMBOL(zfeature_lookup_name);
+EXPORT_SYMBOL(zfeature_is_supported);
+EXPORT_SYMBOL(zfeature_is_valid_guid);
+EXPORT_SYMBOL(zfeature_depends_on);
+EXPORT_SYMBOL(zpool_feature_init);
+EXPORT_SYMBOL(spa_feature_table);
+#endif
diff --git a/sys/contrib/openzfs/module/zcommon/zfs_comutil.c b/sys/contrib/openzfs/module/zcommon/zfs_comutil.c
new file mode 100644
index 000000000000..1cec60ac1d67
--- /dev/null
+++ b/sys/contrib/openzfs/module/zcommon/zfs_comutil.c
@@ -0,0 +1,263 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
+ */
+
+/*
+ * This file is intended for functions that ought to be common between
+ * userland (libzfs) and the kernel. When many common routines need to be
+ * shared, a separate file should be created.
+ */
+
+#if !defined(_KERNEL)
+#include <string.h>
+#endif
+
+#include <sys/types.h>
+#include <sys/fs/zfs.h>
+#include <sys/nvpair.h>
+#include "zfs_comutil.h"
+#include <sys/zfs_ratelimit.h>
+
+/*
+ * Are there allocatable vdevs?
+ */
+boolean_t
+zfs_allocatable_devs(nvlist_t *nv)
+{
+ uint64_t is_log;
+ uint_t c;
+ nvlist_t **child;
+ uint_t children;
+
+ if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
+ &child, &children) != 0) {
+ return (B_FALSE);
+ }
+ for (c = 0; c < children; c++) {
+ is_log = 0;
+ (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG,
+ &is_log);
+ if (!is_log)
+ return (B_TRUE);
+ }
+ return (B_FALSE);
+}
+
+/*
+ * Are there special vdevs?
+ */
+boolean_t
+zfs_special_devs(nvlist_t *nv, char *type)
+{
+ char *bias;
+ uint_t c;
+ nvlist_t **child;
+ uint_t children;
+
+ if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
+ &child, &children) != 0) {
+ return (B_FALSE);
+ }
+ for (c = 0; c < children; c++) {
+ if (nvlist_lookup_string(child[c], ZPOOL_CONFIG_ALLOCATION_BIAS,
+ &bias) == 0) {
+ if (strcmp(bias, VDEV_ALLOC_BIAS_SPECIAL) == 0 ||
+ strcmp(bias, VDEV_ALLOC_BIAS_DEDUP) == 0) {
+ if (type != NULL && strcmp(bias, type) == 0) {
+ return (B_TRUE);
+ } else if (type == NULL) {
+ return (B_TRUE);
+ }
+ }
+ }
+ }
+ return (B_FALSE);
+}
+
+void
+zpool_get_load_policy(nvlist_t *nvl, zpool_load_policy_t *zlpp)
+{
+ nvlist_t *policy;
+ nvpair_t *elem;
+ char *nm;
+
+ /* Defaults */
+ zlpp->zlp_rewind = ZPOOL_NO_REWIND;
+ zlpp->zlp_maxmeta = 0;
+ zlpp->zlp_maxdata = UINT64_MAX;
+ zlpp->zlp_txg = UINT64_MAX;
+
+ if (nvl == NULL)
+ return;
+
+ elem = NULL;
+ while ((elem = nvlist_next_nvpair(nvl, elem)) != NULL) {
+ nm = nvpair_name(elem);
+ if (strcmp(nm, ZPOOL_LOAD_POLICY) == 0) {
+ if (nvpair_value_nvlist(elem, &policy) == 0)
+ zpool_get_load_policy(policy, zlpp);
+ return;
+ } else if (strcmp(nm, ZPOOL_LOAD_REWIND_POLICY) == 0) {
+ if (nvpair_value_uint32(elem, &zlpp->zlp_rewind) == 0)
+ if (zlpp->zlp_rewind & ~ZPOOL_REWIND_POLICIES)
+ zlpp->zlp_rewind = ZPOOL_NO_REWIND;
+ } else if (strcmp(nm, ZPOOL_LOAD_REQUEST_TXG) == 0) {
+ (void) nvpair_value_uint64(elem, &zlpp->zlp_txg);
+ } else if (strcmp(nm, ZPOOL_LOAD_META_THRESH) == 0) {
+ (void) nvpair_value_uint64(elem, &zlpp->zlp_maxmeta);
+ } else if (strcmp(nm, ZPOOL_LOAD_DATA_THRESH) == 0) {
+ (void) nvpair_value_uint64(elem, &zlpp->zlp_maxdata);
+ }
+ }
+ if (zlpp->zlp_rewind == 0)
+ zlpp->zlp_rewind = ZPOOL_NO_REWIND;
+}
+
+typedef struct zfs_version_spa_map {
+ int version_zpl;
+ int version_spa;
+} zfs_version_spa_map_t;
+
+/*
+ * Keep this table in monotonically increasing version number order.
+ */
+static zfs_version_spa_map_t zfs_version_table[] = {
+ {ZPL_VERSION_INITIAL, SPA_VERSION_INITIAL},
+ {ZPL_VERSION_DIRENT_TYPE, SPA_VERSION_INITIAL},
+ {ZPL_VERSION_FUID, SPA_VERSION_FUID},
+ {ZPL_VERSION_USERSPACE, SPA_VERSION_USERSPACE},
+ {ZPL_VERSION_SA, SPA_VERSION_SA},
+ {0, 0}
+};
+
+/*
+ * Return the max zpl version for a corresponding spa version.
+ * -1 is returned if no mapping exists.
+ */
+int
+zfs_zpl_version_map(int spa_version)
+{
+ int i;
+ int version = -1;
+
+ for (i = 0; zfs_version_table[i].version_spa; i++) {
+ if (spa_version >= zfs_version_table[i].version_spa)
+ version = zfs_version_table[i].version_zpl;
+ }
+
+ return (version);
+}
+
+/*
+ * Return the min spa version for a corresponding zpl version.
+ * -1 is returned if no mapping exists.
+ */
+int
+zfs_spa_version_map(int zpl_version)
+{
+ int i;
+ int version = -1;
+
+ for (i = 0; zfs_version_table[i].version_zpl; i++) {
+ if (zfs_version_table[i].version_zpl >= zpl_version)
+ return (zfs_version_table[i].version_spa);
+ }
+
+ return (version);
+}
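+
+/*
+ * Illustration of the two mappings above (a sketch, not used elsewhere):
+ * with the table as defined, a pool at SPA_VERSION_FUID supports ZPL
+ * versions up to ZPL_VERSION_FUID, and a filesystem at ZPL_VERSION_FUID
+ * requires a pool of at least SPA_VERSION_FUID; -1 means no mapping exists.
+ */
+static void
+zfs_version_map_example(void)
+{
+	int zpl = zfs_zpl_version_map(SPA_VERSION_FUID);
+	int spa = zfs_spa_version_map(ZPL_VERSION_FUID);
+
+	(void) zpl;
+	(void) spa;
+}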
+
+/*
+ * This is the table of legacy internal event names; it should not be modified.
+ * The internal events are now stored in the history log as strings.
+ */
+const char *zfs_history_event_names[ZFS_NUM_LEGACY_HISTORY_EVENTS] = {
+ "invalid event",
+ "pool create",
+ "vdev add",
+ "pool remove",
+ "pool destroy",
+ "pool export",
+ "pool import",
+ "vdev attach",
+ "vdev replace",
+ "vdev detach",
+ "vdev online",
+ "vdev offline",
+ "vdev upgrade",
+ "pool clear",
+ "pool scrub",
+ "pool property set",
+ "create",
+ "clone",
+ "destroy",
+ "destroy_begin_sync",
+ "inherit",
+ "property set",
+ "quota set",
+ "permission update",
+ "permission remove",
+ "permission who remove",
+ "promote",
+ "receive",
+ "rename",
+ "reservation set",
+ "replay_inc_sync",
+ "replay_full_sync",
+ "rollback",
+ "snapshot",
+ "filesystem version upgrade",
+ "refquota set",
+ "refreservation set",
+ "pool scrub done",
+ "user hold",
+ "user release",
+ "pool split",
+};
+
+boolean_t
+zfs_dataset_name_hidden(const char *name)
+{
+ /*
+ * Skip over datasets that are not visible in this zone,
+ * internal datasets (which have a $ in their name), and
+ * temporary datasets (which have a % in their name).
+ */
+ if (strchr(name, '$') != NULL)
+ return (B_TRUE);
+ if (strchr(name, '%') != NULL)
+ return (B_TRUE);
+ if (!INGLOBALZONE(curproc) && !zone_dataset_visible(name, NULL))
+ return (B_TRUE);
+ return (B_FALSE);
+}
+
+#if defined(_KERNEL)
+EXPORT_SYMBOL(zfs_allocatable_devs);
+EXPORT_SYMBOL(zfs_special_devs);
+EXPORT_SYMBOL(zpool_get_load_policy);
+EXPORT_SYMBOL(zfs_zpl_version_map);
+EXPORT_SYMBOL(zfs_spa_version_map);
+EXPORT_SYMBOL(zfs_history_event_names);
+EXPORT_SYMBOL(zfs_dataset_name_hidden);
+#endif
diff --git a/sys/contrib/openzfs/module/zcommon/zfs_deleg.c b/sys/contrib/openzfs/module/zcommon/zfs_deleg.c
new file mode 100644
index 000000000000..e1f5a353b7a4
--- /dev/null
+++ b/sys/contrib/openzfs/module/zcommon/zfs_deleg.c
@@ -0,0 +1,249 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2010 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2013, 2015 by Delphix. All rights reserved.
+ * Copyright 2016 Igor Kozhukhov <ikozhukhov@gmail.com>
+ */
+
+#include <sys/zfs_context.h>
+
+#if defined(_KERNEL)
+#include <sys/sunddi.h>
+#include <sys/ctype.h>
+#else
+#include <stdio.h>
+#include <unistd.h>
+#include <libnvpair.h>
+#include <ctype.h>
+#endif
+#include <sys/strings.h>
+#include <sys/dsl_deleg.h>
+#include "zfs_prop.h"
+#include "zfs_deleg.h"
+#include "zfs_namecheck.h"
+
+zfs_deleg_perm_tab_t zfs_deleg_perm_tab[] = {
+ {ZFS_DELEG_PERM_ALLOW},
+ {ZFS_DELEG_PERM_BOOKMARK},
+ {ZFS_DELEG_PERM_CLONE},
+ {ZFS_DELEG_PERM_CREATE},
+ {ZFS_DELEG_PERM_DESTROY},
+ {ZFS_DELEG_PERM_DIFF},
+ {ZFS_DELEG_PERM_MOUNT},
+ {ZFS_DELEG_PERM_PROMOTE},
+ {ZFS_DELEG_PERM_RECEIVE},
+ {ZFS_DELEG_PERM_RENAME},
+ {ZFS_DELEG_PERM_ROLLBACK},
+ {ZFS_DELEG_PERM_SNAPSHOT},
+ {ZFS_DELEG_PERM_SHARE},
+ {ZFS_DELEG_PERM_SEND},
+ {ZFS_DELEG_PERM_USERPROP},
+ {ZFS_DELEG_PERM_USERQUOTA},
+ {ZFS_DELEG_PERM_GROUPQUOTA},
+ {ZFS_DELEG_PERM_USERUSED},
+ {ZFS_DELEG_PERM_GROUPUSED},
+ {ZFS_DELEG_PERM_USEROBJQUOTA},
+ {ZFS_DELEG_PERM_GROUPOBJQUOTA},
+ {ZFS_DELEG_PERM_USEROBJUSED},
+ {ZFS_DELEG_PERM_GROUPOBJUSED},
+ {ZFS_DELEG_PERM_HOLD},
+ {ZFS_DELEG_PERM_RELEASE},
+ {ZFS_DELEG_PERM_LOAD_KEY},
+ {ZFS_DELEG_PERM_CHANGE_KEY},
+ {ZFS_DELEG_PERM_PROJECTUSED},
+ {ZFS_DELEG_PERM_PROJECTQUOTA},
+ {ZFS_DELEG_PERM_PROJECTOBJUSED},
+ {ZFS_DELEG_PERM_PROJECTOBJQUOTA},
+ {NULL}
+};
+
+static int
+zfs_valid_permission_name(const char *perm)
+{
+ if (zfs_deleg_canonicalize_perm(perm))
+ return (0);
+
+ return (permset_namecheck(perm, NULL, NULL));
+}
+
+const char *
+zfs_deleg_canonicalize_perm(const char *perm)
+{
+ int i;
+ zfs_prop_t prop;
+
+ for (i = 0; zfs_deleg_perm_tab[i].z_perm != NULL; i++) {
+ if (strcmp(perm, zfs_deleg_perm_tab[i].z_perm) == 0)
+ return (perm);
+ }
+
+ prop = zfs_name_to_prop(perm);
+ if (prop != ZPROP_INVAL && zfs_prop_delegatable(prop))
+ return (zfs_prop_to_name(prop));
+ return (NULL);
+
+}
+
+static int
+zfs_validate_who(char *who)
+{
+ char *p;
+
+ if (who[2] != ZFS_DELEG_FIELD_SEP_CHR)
+ return (-1);
+
+ switch (who[0]) {
+ case ZFS_DELEG_USER:
+ case ZFS_DELEG_GROUP:
+ case ZFS_DELEG_USER_SETS:
+ case ZFS_DELEG_GROUP_SETS:
+ if (who[1] != ZFS_DELEG_LOCAL && who[1] != ZFS_DELEG_DESCENDENT)
+ return (-1);
+ for (p = &who[3]; *p; p++)
+ if (!isdigit(*p))
+ return (-1);
+ break;
+
+ case ZFS_DELEG_NAMED_SET:
+ case ZFS_DELEG_NAMED_SET_SETS:
+ if (who[1] != ZFS_DELEG_NA)
+ return (-1);
+ return (permset_namecheck(&who[3], NULL, NULL));
+
+ case ZFS_DELEG_CREATE:
+ case ZFS_DELEG_CREATE_SETS:
+ if (who[1] != ZFS_DELEG_NA)
+ return (-1);
+ if (who[3] != '\0')
+ return (-1);
+ break;
+
+ case ZFS_DELEG_EVERYONE:
+ case ZFS_DELEG_EVERYONE_SETS:
+ if (who[1] != ZFS_DELEG_LOCAL && who[1] != ZFS_DELEG_DESCENDENT)
+ return (-1);
+ if (who[3] != '\0')
+ return (-1);
+ break;
+
+ default:
+ return (-1);
+ }
+
+ return (0);
+}
+
+int
+zfs_deleg_verify_nvlist(nvlist_t *nvp)
+{
+ nvpair_t *who, *perm_name;
+ nvlist_t *perms;
+ int error;
+
+ if (nvp == NULL)
+ return (-1);
+
+ who = nvlist_next_nvpair(nvp, NULL);
+ if (who == NULL)
+ return (-1);
+
+ do {
+ if (zfs_validate_who(nvpair_name(who)))
+ return (-1);
+
+ error = nvlist_lookup_nvlist(nvp, nvpair_name(who), &perms);
+
+ if (error && error != ENOENT)
+ return (-1);
+ if (error == ENOENT)
+ continue;
+
+ perm_name = nvlist_next_nvpair(perms, NULL);
+ if (perm_name == NULL) {
+ return (-1);
+ }
+ do {
+ error = zfs_valid_permission_name(
+ nvpair_name(perm_name));
+ if (error)
+ return (-1);
+ } while ((perm_name = nvlist_next_nvpair(perms, perm_name))
+ != NULL);
+ } while ((who = nvlist_next_nvpair(nvp, who)) != NULL);
+ return (0);
+}
+
+/*
+ * Construct the base attribute name. The base attribute names
+ * are the "key" used to locate the jump objects which contain the actual
+ * permissions. The base attribute names are encoded based on the
+ * type of entry and on whether it is a local or descendent permission.
+ *
+ * Arguments:
+ * attr - attribute name return string; the buffer is assumed to be
+ *        ZFS_MAX_DELEG_NAME bytes long.
+ * type - type of entry to construct
+ * inheritchr - inheritance type (local, descendent, or NA for create and
+ *        permission set definitions)
+ * data - either a permission set name or a 64-bit uid/gid.
+ */
+void
+zfs_deleg_whokey(char *attr, zfs_deleg_who_type_t type,
+ char inheritchr, void *data)
+{
+ int len = ZFS_MAX_DELEG_NAME;
+ uint64_t *id = data;
+
+ switch (type) {
+ case ZFS_DELEG_USER:
+ case ZFS_DELEG_GROUP:
+ case ZFS_DELEG_USER_SETS:
+ case ZFS_DELEG_GROUP_SETS:
+ (void) snprintf(attr, len, "%c%c%c%lld", type, inheritchr,
+ ZFS_DELEG_FIELD_SEP_CHR, (longlong_t)*id);
+ break;
+ case ZFS_DELEG_NAMED_SET_SETS:
+ case ZFS_DELEG_NAMED_SET:
+ (void) snprintf(attr, len, "%c-%c%s", type,
+ ZFS_DELEG_FIELD_SEP_CHR, (char *)data);
+ break;
+ case ZFS_DELEG_CREATE:
+ case ZFS_DELEG_CREATE_SETS:
+ (void) snprintf(attr, len, "%c-%c", type,
+ ZFS_DELEG_FIELD_SEP_CHR);
+ break;
+ case ZFS_DELEG_EVERYONE:
+ case ZFS_DELEG_EVERYONE_SETS:
+ (void) snprintf(attr, len, "%c%c%c", type, inheritchr,
+ ZFS_DELEG_FIELD_SEP_CHR);
+ break;
+ default:
+ ASSERT(!"bad zfs_deleg_who_type_t");
+ }
+}
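+
+/*
+ * Usage sketch (illustration only; this helper is hypothetical): a local
+ * user entry for uid 1000 is encoded as the type character, the inheritance
+ * character, the ZFS_DELEG_FIELD_SEP_CHR separator, and the decimal uid.
+ */
+static void
+zfs_deleg_whokey_example(void)
+{
+	char attr[ZFS_MAX_DELEG_NAME];
+	uint64_t uid = 1000;
+
+	zfs_deleg_whokey(attr, ZFS_DELEG_USER, ZFS_DELEG_LOCAL, &uid);
+}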
+
+#if defined(_KERNEL)
+EXPORT_SYMBOL(zfs_deleg_verify_nvlist);
+EXPORT_SYMBOL(zfs_deleg_whokey);
+EXPORT_SYMBOL(zfs_deleg_canonicalize_perm);
+#endif
diff --git a/sys/contrib/openzfs/module/zcommon/zfs_fletcher.c b/sys/contrib/openzfs/module/zcommon/zfs_fletcher.c
new file mode 100644
index 000000000000..7a9de4a4309d
--- /dev/null
+++ b/sys/contrib/openzfs/module/zcommon/zfs_fletcher.c
@@ -0,0 +1,991 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ * Copyright (C) 2016 Gvozden Nešković. All rights reserved.
+ */
+/*
+ * Copyright 2013 Saso Kiselkov. All rights reserved.
+ */
+
+/*
+ * Copyright (c) 2016 by Delphix. All rights reserved.
+ */
+
+/*
+ * Fletcher Checksums
+ * ------------------
+ *
+ * ZFS's 2nd and 4th order Fletcher checksums are defined by the following
+ * recurrence relations:
+ *
+ * a = a + f
+ * i i-1 i-1
+ *
+ * b = b + a
+ * i i-1 i
+ *
+ * c = c + b (fletcher-4 only)
+ * i i-1 i
+ *
+ * d = d + c (fletcher-4 only)
+ * i i-1 i
+ *
+ * Where
+ * a_0 = b_0 = c_0 = d_0 = 0
+ * and
+ * f_0 .. f_(n-1) are the input data.
+ *
+ * Using standard techniques, these translate into the following series:
+ *
+ * __n_ __n_
+ * \ | \ |
+ * a = > f b = > i * f
+ * n /___| n - i n /___| n - i
+ * i = 1 i = 1
+ *
+ *
+ * __n_ __n_
+ * \ | i*(i+1) \ | i*(i+1)*(i+2)
+ * c = > ------- f d = > ------------- f
+ * n /___| 2 n - i n /___| 6 n - i
+ * i = 1 i = 1
+ *
+ * For fletcher-2, the f_is are 64-bit, and [ab]_i are 64-bit accumulators.
+ * Since the additions are done mod (2^64), errors in the high bits may not
+ * be noticed. For this reason, fletcher-2 is deprecated.
+ *
+ * For fletcher-4, the f_is are 32-bit, and [abcd]_i are 64-bit accumulators.
+ * A conservative bound on how big the buffer can get before we overflow
+ * can be computed using f_i = 0xffffffff for all i:
+ *
+ * % bc
+ * f=2^32-1;d=0; for (i = 1; d<2^64; i++) { d += f*i*(i+1)*(i+2)/6 }; (i-1)*4
+ * 2264
+ * quit
+ * %
+ *
+ * So blocks of up to 2k will not overflow. Our largest block size is
+ * 128k, which has 32k 4-byte words, so we can compute the largest possible
+ * accumulators, then divide by 2^64 to figure the max amount of overflow:
+ *
+ * % bc
+ * a=b=c=d=0; f=2^32-1; for (i=1; i<=32*1024; i++) { a+=f; b+=a; c+=b; d+=c }
+ * a/2^64;b/2^64;c/2^64;d/2^64
+ * 0
+ * 0
+ * 1365
+ * 11186858
+ * quit
+ * %
+ *
+ * So a and b cannot overflow. To make sure each bit of input has some
+ * effect on the contents of c and d, we can look at what the factors of
+ * the coefficients in the equations for c_n and d_n are. The number of 2s
+ * in the factors determines the lowest set bit in the multiplier. Running
+ * through the cases for n*(n+1)/2 reveals that the highest power of 2 is
+ * 2^14, and for n*(n+1)*(n+2)/6 it is 2^15. So while some data may overflow
+ * the 64-bit accumulators, every bit of every f_i affects every accumulator,
+ * even for 128k blocks.
+ *
+ * If we wanted to make a stronger version of fletcher4 (fletcher4c?),
+ * we could do our calculations mod (2^32 - 1) by adding in the carries
+ * periodically, and store the number of carries in the top 32-bits.
+ *
+ * --------------------
+ * Checksum Performance
+ * --------------------
+ *
+ * There are two interesting components to checksum performance: cached and
+ * uncached performance. With cached data, fletcher-2 is about four times
+ * faster than fletcher-4. With uncached data, the performance difference is
+ * negligible, since the cost of a cache fill dominates the processing time.
+ * Even though fletcher-4 is slower than fletcher-2, it is still a pretty
+ * efficient pass over the data.
+ *
+ * In normal operation, the data which is being checksummed is in a buffer
+ * which has been filled either by:
+ *
+ * 1. a compression step, which will be mostly cached, or
+ * 2. a bcopy() or copyin(), which will be uncached (because the
+ * copy is cache-bypassing).
+ *
+ * For both cached and uncached data, both fletcher checksums are much faster
+ * than sha-256, and slower than 'off', which doesn't touch the data at all.
+ */
+
+#include <sys/types.h>
+#include <sys/sysmacros.h>
+#include <sys/byteorder.h>
+#include <sys/spa.h>
+#include <sys/simd.h>
+#include <sys/zio_checksum.h>
+#include <sys/zfs_context.h>
+#include <zfs_fletcher.h>
+
+#define FLETCHER_MIN_SIMD_SIZE 64
+
+static void fletcher_4_scalar_init(fletcher_4_ctx_t *ctx);
+static void fletcher_4_scalar_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp);
+static void fletcher_4_scalar_native(fletcher_4_ctx_t *ctx,
+ const void *buf, uint64_t size);
+static void fletcher_4_scalar_byteswap(fletcher_4_ctx_t *ctx,
+ const void *buf, uint64_t size);
+static boolean_t fletcher_4_scalar_valid(void);
+
+static const fletcher_4_ops_t fletcher_4_scalar_ops = {
+ .init_native = fletcher_4_scalar_init,
+ .fini_native = fletcher_4_scalar_fini,
+ .compute_native = fletcher_4_scalar_native,
+ .init_byteswap = fletcher_4_scalar_init,
+ .fini_byteswap = fletcher_4_scalar_fini,
+ .compute_byteswap = fletcher_4_scalar_byteswap,
+ .valid = fletcher_4_scalar_valid,
+ .name = "scalar"
+};
+
+static fletcher_4_ops_t fletcher_4_fastest_impl = {
+ .name = "fastest",
+ .valid = fletcher_4_scalar_valid
+};
+
+static const fletcher_4_ops_t *fletcher_4_impls[] = {
+ &fletcher_4_scalar_ops,
+ &fletcher_4_superscalar_ops,
+ &fletcher_4_superscalar4_ops,
+#if defined(HAVE_SSE2)
+ &fletcher_4_sse2_ops,
+#endif
+#if defined(HAVE_SSE2) && defined(HAVE_SSSE3)
+ &fletcher_4_ssse3_ops,
+#endif
+#if defined(HAVE_AVX) && defined(HAVE_AVX2)
+ &fletcher_4_avx2_ops,
+#endif
+#if defined(__x86_64) && defined(HAVE_AVX512F)
+ &fletcher_4_avx512f_ops,
+#endif
+#if defined(__x86_64) && defined(HAVE_AVX512BW)
+ &fletcher_4_avx512bw_ops,
+#endif
+#if defined(__aarch64__) && !defined(__FreeBSD__)
+ &fletcher_4_aarch64_neon_ops,
+#endif
+};
+
+/* Hold all supported implementations */
+static uint32_t fletcher_4_supp_impls_cnt = 0;
+static fletcher_4_ops_t *fletcher_4_supp_impls[ARRAY_SIZE(fletcher_4_impls)];
+
+/* Select fletcher4 implementation */
+#define IMPL_FASTEST (UINT32_MAX)
+#define IMPL_CYCLE (UINT32_MAX - 1)
+#define IMPL_SCALAR (0)
+
+static uint32_t fletcher_4_impl_chosen = IMPL_FASTEST;
+
+#define IMPL_READ(i) (*(volatile uint32_t *) &(i))
+
+static struct fletcher_4_impl_selector {
+ const char *fis_name;
+ uint32_t fis_sel;
+} fletcher_4_impl_selectors[] = {
+ { "cycle", IMPL_CYCLE },
+ { "fastest", IMPL_FASTEST },
+ { "scalar", IMPL_SCALAR }
+};
+
+#if defined(_KERNEL)
+static kstat_t *fletcher_4_kstat;
+
+static struct fletcher_4_kstat {
+ uint64_t native;
+ uint64_t byteswap;
+} fletcher_4_stat_data[ARRAY_SIZE(fletcher_4_impls) + 1];
+#endif
+
+/* Indicate that benchmark has been completed */
+static boolean_t fletcher_4_initialized = B_FALSE;
+
+/*ARGSUSED*/
+void
+fletcher_init(zio_cksum_t *zcp)
+{
+ ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
+}
+
+int
+fletcher_2_incremental_native(void *buf, size_t size, void *data)
+{
+ zio_cksum_t *zcp = data;
+
+ const uint64_t *ip = buf;
+ const uint64_t *ipend = ip + (size / sizeof (uint64_t));
+ uint64_t a0, b0, a1, b1;
+
+ a0 = zcp->zc_word[0];
+ a1 = zcp->zc_word[1];
+ b0 = zcp->zc_word[2];
+ b1 = zcp->zc_word[3];
+
+ for (; ip < ipend; ip += 2) {
+ a0 += ip[0];
+ a1 += ip[1];
+ b0 += a0;
+ b1 += a1;
+ }
+
+ ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1);
+ return (0);
+}
+
+/*ARGSUSED*/
+void
+fletcher_2_native(const void *buf, uint64_t size,
+ const void *ctx_template, zio_cksum_t *zcp)
+{
+ fletcher_init(zcp);
+ (void) fletcher_2_incremental_native((void *) buf, size, zcp);
+}
+
+int
+fletcher_2_incremental_byteswap(void *buf, size_t size, void *data)
+{
+ zio_cksum_t *zcp = data;
+
+ const uint64_t *ip = buf;
+ const uint64_t *ipend = ip + (size / sizeof (uint64_t));
+ uint64_t a0, b0, a1, b1;
+
+ a0 = zcp->zc_word[0];
+ a1 = zcp->zc_word[1];
+ b0 = zcp->zc_word[2];
+ b1 = zcp->zc_word[3];
+
+ for (; ip < ipend; ip += 2) {
+ a0 += BSWAP_64(ip[0]);
+ a1 += BSWAP_64(ip[1]);
+ b0 += a0;
+ b1 += a1;
+ }
+
+ ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1);
+ return (0);
+}
+
+/*ARGSUSED*/
+void
+fletcher_2_byteswap(const void *buf, uint64_t size,
+ const void *ctx_template, zio_cksum_t *zcp)
+{
+ fletcher_init(zcp);
+ (void) fletcher_2_incremental_byteswap((void *) buf, size, zcp);
+}
+
+static void
+fletcher_4_scalar_init(fletcher_4_ctx_t *ctx)
+{
+ ZIO_SET_CHECKSUM(&ctx->scalar, 0, 0, 0, 0);
+}
+
+static void
+fletcher_4_scalar_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp)
+{
+ memcpy(zcp, &ctx->scalar, sizeof (zio_cksum_t));
+}
+
+static void
+fletcher_4_scalar_native(fletcher_4_ctx_t *ctx, const void *buf,
+ uint64_t size)
+{
+ const uint32_t *ip = buf;
+ const uint32_t *ipend = ip + (size / sizeof (uint32_t));
+ uint64_t a, b, c, d;
+
+ a = ctx->scalar.zc_word[0];
+ b = ctx->scalar.zc_word[1];
+ c = ctx->scalar.zc_word[2];
+ d = ctx->scalar.zc_word[3];
+
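+ /*
+ * a, b, c and d are the first- through fourth-order running sums of
+ * the 32-bit input words, accumulated modulo 2^64.
+ */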
+ for (; ip < ipend; ip++) {
+ a += ip[0];
+ b += a;
+ c += b;
+ d += c;
+ }
+
+ ZIO_SET_CHECKSUM(&ctx->scalar, a, b, c, d);
+}
+
+static void
+fletcher_4_scalar_byteswap(fletcher_4_ctx_t *ctx, const void *buf,
+ uint64_t size)
+{
+ const uint32_t *ip = buf;
+ const uint32_t *ipend = ip + (size / sizeof (uint32_t));
+ uint64_t a, b, c, d;
+
+ a = ctx->scalar.zc_word[0];
+ b = ctx->scalar.zc_word[1];
+ c = ctx->scalar.zc_word[2];
+ d = ctx->scalar.zc_word[3];
+
+ for (; ip < ipend; ip++) {
+ a += BSWAP_32(ip[0]);
+ b += a;
+ c += b;
+ d += c;
+ }
+
+ ZIO_SET_CHECKSUM(&ctx->scalar, a, b, c, d);
+}
+
+static boolean_t
+fletcher_4_scalar_valid(void)
+{
+ return (B_TRUE);
+}
+
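+/*
+ * Select the fletcher 4 implementation by name.  Accepted values are the
+ * selector names ("cycle", "fastest", "scalar") and, once the benchmark
+ * has run, the name of any supported implementation (e.g. "sse2" or
+ * "avx2" on x86).
+ */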
+int
+fletcher_4_impl_set(const char *val)
+{
+ int err = -EINVAL;
+ uint32_t impl = IMPL_READ(fletcher_4_impl_chosen);
+ size_t i, val_len;
+
+ val_len = strlen(val);
+ while ((val_len > 0) && !!isspace(val[val_len-1])) /* trim '\n' */
+ val_len--;
+
+ /* check mandatory implementations */
+ for (i = 0; i < ARRAY_SIZE(fletcher_4_impl_selectors); i++) {
+ const char *name = fletcher_4_impl_selectors[i].fis_name;
+
+ if (val_len == strlen(name) &&
+ strncmp(val, name, val_len) == 0) {
+ impl = fletcher_4_impl_selectors[i].fis_sel;
+ err = 0;
+ break;
+ }
+ }
+
+ if (err != 0 && fletcher_4_initialized) {
+ /* check all supported implementations */
+ for (i = 0; i < fletcher_4_supp_impls_cnt; i++) {
+ const char *name = fletcher_4_supp_impls[i]->name;
+
+ if (val_len == strlen(name) &&
+ strncmp(val, name, val_len) == 0) {
+ impl = i;
+ err = 0;
+ break;
+ }
+ }
+ }
+
+ if (err == 0) {
+ atomic_swap_32(&fletcher_4_impl_chosen, impl);
+ membar_producer();
+ }
+
+ return (err);
+}
+
+/*
+ * Returns the Fletcher 4 operations for checksums.  When a SIMD
+ * implementation is not allowed in the current context, fall back to
+ * the fastest generic implementation.
+ */
+static inline const fletcher_4_ops_t *
+fletcher_4_impl_get(void)
+{
+ if (!kfpu_allowed())
+ return (&fletcher_4_superscalar4_ops);
+
+ const fletcher_4_ops_t *ops = NULL;
+ uint32_t impl = IMPL_READ(fletcher_4_impl_chosen);
+
+ switch (impl) {
+ case IMPL_FASTEST:
+ ASSERT(fletcher_4_initialized);
+ ops = &fletcher_4_fastest_impl;
+ break;
+ case IMPL_CYCLE:
+ /* Cycle through supported implementations */
+ ASSERT(fletcher_4_initialized);
+ ASSERT3U(fletcher_4_supp_impls_cnt, >, 0);
+ static uint32_t cycle_count = 0;
+ uint32_t idx = (++cycle_count) % fletcher_4_supp_impls_cnt;
+ ops = fletcher_4_supp_impls[idx];
+ break;
+ default:
+ ASSERT3U(fletcher_4_supp_impls_cnt, >, 0);
+ ASSERT3U(impl, <, fletcher_4_supp_impls_cnt);
+ ops = fletcher_4_supp_impls[impl];
+ break;
+ }
+
+ ASSERT3P(ops, !=, NULL);
+
+ return (ops);
+}
+
+static inline void
+fletcher_4_native_impl(const void *buf, uint64_t size, zio_cksum_t *zcp)
+{
+ fletcher_4_ctx_t ctx;
+ const fletcher_4_ops_t *ops = fletcher_4_impl_get();
+
+ ops->init_native(&ctx);
+ ops->compute_native(&ctx, buf, size);
+ ops->fini_native(&ctx, zcp);
+}
+
+/*ARGSUSED*/
+void
+fletcher_4_native(const void *buf, uint64_t size,
+ const void *ctx_template, zio_cksum_t *zcp)
+{
+ const uint64_t p2size = P2ALIGN(size, FLETCHER_MIN_SIMD_SIZE);
+
+ ASSERT(IS_P2ALIGNED(size, sizeof (uint32_t)));
+
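+ /*
+ * Checksum the largest prefix whose size is a multiple of
+ * FLETCHER_MIN_SIMD_SIZE with the selected (possibly SIMD)
+ * implementation, then finish the short tail with the scalar code,
+ * accumulating directly into *zcp.
+ */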
+ if (size == 0 || p2size == 0) {
+ ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
+
+ if (size > 0)
+ fletcher_4_scalar_native((fletcher_4_ctx_t *)zcp,
+ buf, size);
+ } else {
+ fletcher_4_native_impl(buf, p2size, zcp);
+
+ if (p2size < size)
+ fletcher_4_scalar_native((fletcher_4_ctx_t *)zcp,
+ (char *)buf + p2size, size - p2size);
+ }
+}
+
+void
+fletcher_4_native_varsize(const void *buf, uint64_t size, zio_cksum_t *zcp)
+{
+ ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
+ fletcher_4_scalar_native((fletcher_4_ctx_t *)zcp, buf, size);
+}
+
+static inline void
+fletcher_4_byteswap_impl(const void *buf, uint64_t size, zio_cksum_t *zcp)
+{
+ fletcher_4_ctx_t ctx;
+ const fletcher_4_ops_t *ops = fletcher_4_impl_get();
+
+ ops->init_byteswap(&ctx);
+ ops->compute_byteswap(&ctx, buf, size);
+ ops->fini_byteswap(&ctx, zcp);
+}
+
+/*ARGSUSED*/
+void
+fletcher_4_byteswap(const void *buf, uint64_t size,
+ const void *ctx_template, zio_cksum_t *zcp)
+{
+ const uint64_t p2size = P2ALIGN(size, FLETCHER_MIN_SIMD_SIZE);
+
+ ASSERT(IS_P2ALIGNED(size, sizeof (uint32_t)));
+
+ if (size == 0 || p2size == 0) {
+ ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
+
+ if (size > 0)
+ fletcher_4_scalar_byteswap((fletcher_4_ctx_t *)zcp,
+ buf, size);
+ } else {
+ fletcher_4_byteswap_impl(buf, p2size, zcp);
+
+ if (p2size < size)
+ fletcher_4_scalar_byteswap((fletcher_4_ctx_t *)zcp,
+ (char *)buf + p2size, size - p2size);
+ }
+}
+
+/* Incremental Fletcher 4 */
+
+#define ZFS_FLETCHER_4_INC_MAX_SIZE (8ULL << 20)
+
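+/*
+ * Merging two partial Fletcher-4 states: if (a, b, c, d) is the state for a
+ * prefix and (a', b', c', d') the state for the following 'size' bytes
+ * (n = size / 4 words), the state of the concatenation is
+ *
+ *	a'' = a + a'
+ *	b'' = b + b' + n*a
+ *	c'' = c + c' + n*b + n(n+1)/2 * a
+ *	d'' = d + d' + n*c + n(n+1)/2 * b + n(n+1)(n+2)/6 * a
+ *
+ * which is what the c1, c2 and c3 coefficients below implement.
+ */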
+static inline void
+fletcher_4_incremental_combine(zio_cksum_t *zcp, const uint64_t size,
+ const zio_cksum_t *nzcp)
+{
+ const uint64_t c1 = size / sizeof (uint32_t);
+ const uint64_t c2 = c1 * (c1 + 1) / 2;
+ const uint64_t c3 = c2 * (c1 + 2) / 3;
+
+ /*
+ * The value of 'c3' overflows for buffer sizes close to 16 MiB. For
+ * that reason, incremental fletcher4 computation of large buffers is
+ * split into steps of at most ZFS_FLETCHER_4_INC_MAX_SIZE bytes.
+ */
+ ASSERT3U(size, <=, ZFS_FLETCHER_4_INC_MAX_SIZE);
+
+ zcp->zc_word[3] += nzcp->zc_word[3] + c1 * zcp->zc_word[2] +
+ c2 * zcp->zc_word[1] + c3 * zcp->zc_word[0];
+ zcp->zc_word[2] += nzcp->zc_word[2] + c1 * zcp->zc_word[1] +
+ c2 * zcp->zc_word[0];
+ zcp->zc_word[1] += nzcp->zc_word[1] + c1 * zcp->zc_word[0];
+ zcp->zc_word[0] += nzcp->zc_word[0];
+}
+
+static inline void
+fletcher_4_incremental_impl(boolean_t native, const void *buf, uint64_t size,
+ zio_cksum_t *zcp)
+{
+ while (size > 0) {
+ zio_cksum_t nzc;
+ uint64_t len = MIN(size, ZFS_FLETCHER_4_INC_MAX_SIZE);
+
+ if (native)
+ fletcher_4_native(buf, len, NULL, &nzc);
+ else
+ fletcher_4_byteswap(buf, len, NULL, &nzc);
+
+ fletcher_4_incremental_combine(zcp, len, &nzc);
+
+ size -= len;
+ buf += len;
+ }
+}
+
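+/*
+ * Feeding a buffer to the incremental functions in several pieces yields
+ * the same checksum as a single fletcher_4_native() call over the whole
+ * buffer, provided each piece is a multiple of 4 bytes.  A minimal usage
+ * sketch (the caller and 'buf', 'len1', 'len2' are hypothetical):
+ *
+ *	zio_cksum_t zc;
+ *	fletcher_init(&zc);
+ *	(void) fletcher_4_incremental_native(buf, len1, &zc);
+ *	(void) fletcher_4_incremental_native((char *)buf + len1, len2, &zc);
+ */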
+int
+fletcher_4_incremental_native(void *buf, size_t size, void *data)
+{
+ zio_cksum_t *zcp = data;
+ /* Use scalar impl to directly update cksum of small blocks */
+ if (size < SPA_MINBLOCKSIZE)
+ fletcher_4_scalar_native((fletcher_4_ctx_t *)zcp, buf, size);
+ else
+ fletcher_4_incremental_impl(B_TRUE, buf, size, zcp);
+ return (0);
+}
+
+int
+fletcher_4_incremental_byteswap(void *buf, size_t size, void *data)
+{
+ zio_cksum_t *zcp = data;
+ /* Use scalar impl to directly update cksum of small blocks */
+ if (size < SPA_MINBLOCKSIZE)
+ fletcher_4_scalar_byteswap((fletcher_4_ctx_t *)zcp, buf, size);
+ else
+ fletcher_4_incremental_impl(B_FALSE, buf, size, zcp);
+ return (0);
+}
+
+#if defined(_KERNEL)
+/*
+ * Fletcher 4 kstats
+ */
+static int
+fletcher_4_kstat_headers(char *buf, size_t size)
+{
+ ssize_t off = 0;
+
+ off += snprintf(buf + off, size, "%-17s", "implementation");
+ off += snprintf(buf + off, size - off, "%-15s", "native");
+ (void) snprintf(buf + off, size - off, "%-15s\n", "byteswap");
+
+ return (0);
+}
+
+static int
+fletcher_4_kstat_data(char *buf, size_t size, void *data)
+{
+ struct fletcher_4_kstat *fastest_stat =
+ &fletcher_4_stat_data[fletcher_4_supp_impls_cnt];
+ struct fletcher_4_kstat *curr_stat = (struct fletcher_4_kstat *)data;
+ ssize_t off = 0;
+
+ if (curr_stat == fastest_stat) {
+ off += snprintf(buf + off, size - off, "%-17s", "fastest");
+ off += snprintf(buf + off, size - off, "%-15s",
+ fletcher_4_supp_impls[fastest_stat->native]->name);
+ off += snprintf(buf + off, size - off, "%-15s\n",
+ fletcher_4_supp_impls[fastest_stat->byteswap]->name);
+ } else {
+ ptrdiff_t id = curr_stat - fletcher_4_stat_data;
+
+ off += snprintf(buf + off, size - off, "%-17s",
+ fletcher_4_supp_impls[id]->name);
+ off += snprintf(buf + off, size - off, "%-15llu",
+ (u_longlong_t)curr_stat->native);
+ off += snprintf(buf + off, size - off, "%-15llu\n",
+ (u_longlong_t)curr_stat->byteswap);
+ }
+
+ return (0);
+}
+
+static void *
+fletcher_4_kstat_addr(kstat_t *ksp, loff_t n)
+{
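+ /* Row 'fletcher_4_supp_impls_cnt' is the synthetic "fastest" entry. */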
+ if (n <= fletcher_4_supp_impls_cnt)
+ ksp->ks_private = (void *) (fletcher_4_stat_data + n);
+ else
+ ksp->ks_private = NULL;
+
+ return (ksp->ks_private);
+}
+#endif
+
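+/*
+ * Copy the function pointers of the benchmark winner into
+ * fletcher_4_fastest_impl, so the IMPL_FASTEST selection can dispatch
+ * through it directly.
+ */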
+#define FLETCHER_4_FASTEST_FN_COPY(type, src) \
+{ \
+ fletcher_4_fastest_impl.init_ ## type = src->init_ ## type; \
+ fletcher_4_fastest_impl.fini_ ## type = src->fini_ ## type; \
+ fletcher_4_fastest_impl.compute_ ## type = src->compute_ ## type; \
+}
+
+#define FLETCHER_4_BENCH_NS (MSEC2NSEC(1)) /* 1ms */
+
+typedef void fletcher_checksum_func_t(const void *, uint64_t, const void *,
+ zio_cksum_t *);
+
+#if defined(_KERNEL)
+static void
+fletcher_4_benchmark_impl(boolean_t native, char *data, uint64_t data_size)
+{
+
+ struct fletcher_4_kstat *fastest_stat =
+ &fletcher_4_stat_data[fletcher_4_supp_impls_cnt];
+ hrtime_t start;
+ uint64_t run_bw, run_time_ns, best_run = 0;
+ zio_cksum_t zc;
+ uint32_t i, l, sel_save = IMPL_READ(fletcher_4_impl_chosen);
+
+ fletcher_checksum_func_t *fletcher_4_test = native ?
+ fletcher_4_native : fletcher_4_byteswap;
+
+ for (i = 0; i < fletcher_4_supp_impls_cnt; i++) {
+ struct fletcher_4_kstat *stat = &fletcher_4_stat_data[i];
+ uint64_t run_count = 0;
+
+ /* temporarily set an implementation */
+ fletcher_4_impl_chosen = i;
+
+ kpreempt_disable();
+ start = gethrtime();
+ do {
+ for (l = 0; l < 32; l++, run_count++)
+ fletcher_4_test(data, data_size, NULL, &zc);
+
+ run_time_ns = gethrtime() - start;
+ } while (run_time_ns < FLETCHER_4_BENCH_NS);
+ kpreempt_enable();
+
+ run_bw = data_size * run_count * NANOSEC;
+ run_bw /= run_time_ns; /* B/s */
+
+ if (native)
+ stat->native = run_bw;
+ else
+ stat->byteswap = run_bw;
+
+ if (run_bw > best_run) {
+ best_run = run_bw;
+
+ if (native) {
+ fastest_stat->native = i;
+ FLETCHER_4_FASTEST_FN_COPY(native,
+ fletcher_4_supp_impls[i]);
+ } else {
+ fastest_stat->byteswap = i;
+ FLETCHER_4_FASTEST_FN_COPY(byteswap,
+ fletcher_4_supp_impls[i]);
+ }
+ }
+ }
+
+ /* restore original selection */
+ atomic_swap_32(&fletcher_4_impl_chosen, sel_save);
+}
+#endif /* _KERNEL */
+
+/*
+ * Initialize and benchmark all supported implementations.
+ */
+static void
+fletcher_4_benchmark(void)
+{
+ fletcher_4_ops_t *curr_impl;
+ int i, c;
+
+ /* Move supported implementations into fletcher_4_supp_impls */
+ for (i = 0, c = 0; i < ARRAY_SIZE(fletcher_4_impls); i++) {
+ curr_impl = (fletcher_4_ops_t *)fletcher_4_impls[i];
+
+ if (curr_impl->valid && curr_impl->valid())
+ fletcher_4_supp_impls[c++] = curr_impl;
+ }
+ membar_producer(); /* complete fletcher_4_supp_impls[] init */
+ fletcher_4_supp_impls_cnt = c; /* number of supported impl */
+
+#if defined(_KERNEL)
+ static const size_t data_size = 1 << SPA_OLD_MAXBLOCKSHIFT; /* 128kiB */
+ char *databuf = vmem_alloc(data_size, KM_SLEEP);
+
+ for (i = 0; i < data_size / sizeof (uint64_t); i++)
+ ((uint64_t *)databuf)[i] = (uintptr_t)(databuf+i); /* warm-up */
+
+ fletcher_4_benchmark_impl(B_FALSE, databuf, data_size);
+ fletcher_4_benchmark_impl(B_TRUE, databuf, data_size);
+
+ vmem_free(databuf, data_size);
+#else
+ /*
+ * Skip the benchmark in user space to avoid impacting libzpool
+ * consumers (zdb, zhack, zinject, ztest). The last implementation
+ * is assumed to be the fastest and is used by default.
+ */
+ memcpy(&fletcher_4_fastest_impl,
+ fletcher_4_supp_impls[fletcher_4_supp_impls_cnt - 1],
+ sizeof (fletcher_4_fastest_impl));
+ fletcher_4_fastest_impl.name = "fastest";
+ membar_producer();
+#endif /* _KERNEL */
+}
+
+void
+fletcher_4_init(void)
+{
+ /* Determine the fastest available implementation. */
+ fletcher_4_benchmark();
+
+#if defined(_KERNEL)
+ /* Install kstats for all implementations */
+ fletcher_4_kstat = kstat_create("zfs", 0, "fletcher_4_bench", "misc",
+ KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL);
+ if (fletcher_4_kstat != NULL) {
+ fletcher_4_kstat->ks_data = NULL;
+ fletcher_4_kstat->ks_ndata = UINT32_MAX;
+ kstat_set_raw_ops(fletcher_4_kstat,
+ fletcher_4_kstat_headers,
+ fletcher_4_kstat_data,
+ fletcher_4_kstat_addr);
+ kstat_install(fletcher_4_kstat);
+ }
+#endif
+
+ /* Finish initialization */
+ fletcher_4_initialized = B_TRUE;
+}
+
+void
+fletcher_4_fini(void)
+{
+#if defined(_KERNEL)
+ if (fletcher_4_kstat != NULL) {
+ kstat_delete(fletcher_4_kstat);
+ fletcher_4_kstat = NULL;
+ }
+#endif
+}
+
+/* ABD adapters */
+
+static void
+abd_fletcher_4_init(zio_abd_checksum_data_t *cdp)
+{
+ const fletcher_4_ops_t *ops = fletcher_4_impl_get();
+ cdp->acd_private = (void *) ops;
+
+ if (cdp->acd_byteorder == ZIO_CHECKSUM_NATIVE)
+ ops->init_native(cdp->acd_ctx);
+ else
+ ops->init_byteswap(cdp->acd_ctx);
+}
+
+static void
+abd_fletcher_4_fini(zio_abd_checksum_data_t *cdp)
+{
+ fletcher_4_ops_t *ops = (fletcher_4_ops_t *)cdp->acd_private;
+
+ ASSERT(ops);
+
+ if (cdp->acd_byteorder == ZIO_CHECKSUM_NATIVE)
+ ops->fini_native(cdp->acd_ctx, cdp->acd_zcp);
+ else
+ ops->fini_byteswap(cdp->acd_ctx, cdp->acd_zcp);
+}
+
+static void
+abd_fletcher_4_simd2scalar(boolean_t native, void *data, size_t size,
+ zio_abd_checksum_data_t *cdp)
+{
+ zio_cksum_t *zcp = cdp->acd_zcp;
+
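+ /*
+ * Fold the SIMD state into *zcp and continue with the scalar
+ * incremental code, which accumulates directly into the zio_cksum_t.
+ */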
+ ASSERT3U(size, <, FLETCHER_MIN_SIMD_SIZE);
+
+ abd_fletcher_4_fini(cdp);
+ cdp->acd_private = (void *)&fletcher_4_scalar_ops;
+
+ if (native)
+ fletcher_4_incremental_native(data, size, zcp);
+ else
+ fletcher_4_incremental_byteswap(data, size, zcp);
+}
+
+static int
+abd_fletcher_4_iter(void *data, size_t size, void *private)
+{
+ zio_abd_checksum_data_t *cdp = (zio_abd_checksum_data_t *)private;
+ fletcher_4_ctx_t *ctx = cdp->acd_ctx;
+ fletcher_4_ops_t *ops = (fletcher_4_ops_t *)cdp->acd_private;
+ boolean_t native = cdp->acd_byteorder == ZIO_CHECKSUM_NATIVE;
+ uint64_t asize = P2ALIGN(size, FLETCHER_MIN_SIMD_SIZE);
+
+ ASSERT(IS_P2ALIGNED(size, sizeof (uint32_t)));
+
+ if (asize > 0) {
+ if (native)
+ ops->compute_native(ctx, data, asize);
+ else
+ ops->compute_byteswap(ctx, data, asize);
+
+ size -= asize;
+ data = (char *)data + asize;
+ }
+
+ if (size > 0) {
+ ASSERT3U(size, <, FLETCHER_MIN_SIMD_SIZE);
+ /* At this point we have to switch to scalar impl */
+ abd_fletcher_4_simd2scalar(native, data, size, cdp);
+ }
+
+ return (0);
+}
+
+zio_abd_checksum_func_t fletcher_4_abd_ops = {
+ .acf_init = abd_fletcher_4_init,
+ .acf_fini = abd_fletcher_4_fini,
+ .acf_iter = abd_fletcher_4_iter
+};
+
+#if defined(_KERNEL)
+
+#define IMPL_FMT(impl, i) (((impl) == (i)) ? "[%s] " : "%s ")
+
+#if defined(__linux__)
+
+static int
+fletcher_4_param_get(char *buffer, zfs_kernel_param_t *unused)
+{
+ const uint32_t impl = IMPL_READ(fletcher_4_impl_chosen);
+ char *fmt;
+ int cnt = 0;
+
+ /* list fastest */
+ fmt = IMPL_FMT(impl, IMPL_FASTEST);
+ cnt += sprintf(buffer + cnt, fmt, "fastest");
+
+ /* list all supported implementations */
+ for (uint32_t i = 0; i < fletcher_4_supp_impls_cnt; ++i) {
+ fmt = IMPL_FMT(impl, i);
+ cnt += sprintf(buffer + cnt, fmt,
+ fletcher_4_supp_impls[i]->name);
+ }
+
+ return (cnt);
+}
+
+static int
+fletcher_4_param_set(const char *val, zfs_kernel_param_t *unused)
+{
+ return (fletcher_4_impl_set(val));
+}
+
+#else
+
+#include <sys/sbuf.h>
+
+static int
+fletcher_4_param(ZFS_MODULE_PARAM_ARGS)
+{
+ int err;
+
+ if (req->newptr == NULL) {
+ const uint32_t impl = IMPL_READ(fletcher_4_impl_chosen);
+ const int init_buflen = 64;
+ const char *fmt;
+ struct sbuf *s;
+
+ s = sbuf_new_for_sysctl(NULL, NULL, init_buflen, req);
+
+ /* list fastest */
+ fmt = IMPL_FMT(impl, IMPL_FASTEST);
+ (void) sbuf_printf(s, fmt, "fastest");
+
+ /* list all supported implementations */
+ for (uint32_t i = 0; i < fletcher_4_supp_impls_cnt; ++i) {
+ fmt = IMPL_FMT(impl, i);
+ (void) sbuf_printf(s, fmt,
+ fletcher_4_supp_impls[i]->name);
+ }
+
+ err = sbuf_finish(s);
+ sbuf_delete(s);
+
+ return (err);
+ }
+
+ char buf[16];
+
+ err = sysctl_handle_string(oidp, buf, sizeof (buf), req);
+ if (err)
+ return (err);
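+ /*
+ * fletcher_4_impl_set() returns 0 or -EINVAL; sysctl expects a
+ * positive errno, hence the negation.
+ */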
+ return (-fletcher_4_impl_set(buf));
+}
+
+#endif
+
+#undef IMPL_FMT
+
+/*
+ * Choose a fletcher 4 implementation in ZFS.
+ * Users can choose "cycle" to exercise all implementations, but this is
+ * for testing purposes only, therefore it can only be set in user space.
+ */
+/* BEGIN CSTYLED */
+ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs, zfs_, fletcher_4_impl,
+ fletcher_4_param_set, fletcher_4_param_get, ZMOD_RW,
+ "Select fletcher 4 implementation.");
+/* END CSTYLED */
+
+EXPORT_SYMBOL(fletcher_init);
+EXPORT_SYMBOL(fletcher_2_incremental_native);
+EXPORT_SYMBOL(fletcher_2_incremental_byteswap);
+EXPORT_SYMBOL(fletcher_4_init);
+EXPORT_SYMBOL(fletcher_4_fini);
+EXPORT_SYMBOL(fletcher_2_native);
+EXPORT_SYMBOL(fletcher_2_byteswap);
+EXPORT_SYMBOL(fletcher_4_native);
+EXPORT_SYMBOL(fletcher_4_native_varsize);
+EXPORT_SYMBOL(fletcher_4_byteswap);
+EXPORT_SYMBOL(fletcher_4_incremental_native);
+EXPORT_SYMBOL(fletcher_4_incremental_byteswap);
+EXPORT_SYMBOL(fletcher_4_abd_ops);
+#endif
diff --git a/sys/contrib/openzfs/module/zcommon/zfs_fletcher_aarch64_neon.c b/sys/contrib/openzfs/module/zcommon/zfs_fletcher_aarch64_neon.c
new file mode 100644
index 000000000000..c95a71681584
--- /dev/null
+++ b/sys/contrib/openzfs/module/zcommon/zfs_fletcher_aarch64_neon.c
@@ -0,0 +1,215 @@
+/*
+ * Implement fast Fletcher4 with NEON instructions. (aarch64)
+ *
+ * Use the 128-bit NEON SIMD instructions and registers to compute
+ * Fletcher4 in two incremental 64-bit parallel accumulator streams,
+ * and then combine the streams to form the final four checksum words.
+ * This implementation is a derivative of the AVX SIMD implementation by
+ * James Guilford and Jinshan Xiong from Intel (see zfs_fletcher_intel.c).
+ *
+ * Copyright (C) 2016 Romain Dolbeau.
+ *
+ * Authors:
+ * Romain Dolbeau <romain.dolbeau@atos.net>
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(__aarch64__)
+
+#include <sys/simd.h>
+#include <sys/spa_checksum.h>
+#include <sys/strings.h>
+#include <zfs_fletcher.h>
+
+static void
+fletcher_4_aarch64_neon_init(fletcher_4_ctx_t *ctx)
+{
+ bzero(ctx->aarch64_neon, 4 * sizeof (zfs_fletcher_aarch64_neon_t));
+}
+
+static void
+fletcher_4_aarch64_neon_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp)
+{
+ uint64_t A, B, C, D;
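+ /*
+ * Combine the two 64-bit lanes of each accumulator; the weights are
+ * the same two-stream recombination used by the SSE2 implementation
+ * (see the mixing-matrix comment in zfs_fletcher_sse.c).
+ */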
+ A = ctx->aarch64_neon[0].v[0] + ctx->aarch64_neon[0].v[1];
+ B = 2 * ctx->aarch64_neon[1].v[0] + 2 * ctx->aarch64_neon[1].v[1] -
+ ctx->aarch64_neon[0].v[1];
+ C = 4 * ctx->aarch64_neon[2].v[0] - ctx->aarch64_neon[1].v[0] +
+ 4 * ctx->aarch64_neon[2].v[1] - 3 * ctx->aarch64_neon[1].v[1];
+ D = 8 * ctx->aarch64_neon[3].v[0] - 4 * ctx->aarch64_neon[2].v[0] +
+ 8 * ctx->aarch64_neon[3].v[1] - 8 * ctx->aarch64_neon[2].v[1] +
+ ctx->aarch64_neon[1].v[1];
+ ZIO_SET_CHECKSUM(zcp, A, B, C, D);
+}
+
+#define NEON_INIT_LOOP() \
+ asm("eor %[ZERO].16b,%[ZERO].16b,%[ZERO].16b\n" \
+ "ld1 { %[ACC0].4s }, %[CTX0]\n" \
+ "ld1 { %[ACC1].4s }, %[CTX1]\n" \
+ "ld1 { %[ACC2].4s }, %[CTX2]\n" \
+ "ld1 { %[ACC3].4s }, %[CTX3]\n" \
+ : [ZERO] "=w" (ZERO), \
+ [ACC0] "=w" (ACC0), [ACC1] "=w" (ACC1), \
+ [ACC2] "=w" (ACC2), [ACC3] "=w" (ACC3) \
+ : [CTX0] "Q" (ctx->aarch64_neon[0]), \
+ [CTX1] "Q" (ctx->aarch64_neon[1]), \
+ [CTX2] "Q" (ctx->aarch64_neon[2]), \
+ [CTX3] "Q" (ctx->aarch64_neon[3]))
+
+#define NEON_DO_REVERSE "rev32 %[SRC].16b, %[SRC].16b\n"
+
+#define NEON_DONT_REVERSE ""
+
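+/*
+ * Load four 32-bit words, optionally byteswap them, zero-extend them with
+ * zip1/zip2 against the ZERO register into two pairs of 64-bit lanes, and
+ * fold each pair into the four running-sum accumulators.
+ */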
+#define NEON_MAIN_LOOP(REVERSE) \
+ asm("ld1 { %[SRC].4s }, %[IP]\n" \
+ REVERSE \
+ "zip1 %[TMP1].4s, %[SRC].4s, %[ZERO].4s\n" \
+ "zip2 %[TMP2].4s, %[SRC].4s, %[ZERO].4s\n" \
+ "add %[ACC0].2d, %[ACC0].2d, %[TMP1].2d\n" \
+ "add %[ACC1].2d, %[ACC1].2d, %[ACC0].2d\n" \
+ "add %[ACC2].2d, %[ACC2].2d, %[ACC1].2d\n" \
+ "add %[ACC3].2d, %[ACC3].2d, %[ACC2].2d\n" \
+ "add %[ACC0].2d, %[ACC0].2d, %[TMP2].2d\n" \
+ "add %[ACC1].2d, %[ACC1].2d, %[ACC0].2d\n" \
+ "add %[ACC2].2d, %[ACC2].2d, %[ACC1].2d\n" \
+ "add %[ACC3].2d, %[ACC3].2d, %[ACC2].2d\n" \
+ : [SRC] "=&w" (SRC), \
+ [TMP1] "=&w" (TMP1), [TMP2] "=&w" (TMP2), \
+ [ACC0] "+w" (ACC0), [ACC1] "+w" (ACC1), \
+ [ACC2] "+w" (ACC2), [ACC3] "+w" (ACC3) \
+ : [ZERO] "w" (ZERO), [IP] "Q" (*ip))
+
+#define NEON_FINI_LOOP() \
+ asm("st1 { %[ACC0].4s },%[DST0]\n" \
+ "st1 { %[ACC1].4s },%[DST1]\n" \
+ "st1 { %[ACC2].4s },%[DST2]\n" \
+ "st1 { %[ACC3].4s },%[DST3]\n" \
+ : [DST0] "=Q" (ctx->aarch64_neon[0]), \
+ [DST1] "=Q" (ctx->aarch64_neon[1]), \
+ [DST2] "=Q" (ctx->aarch64_neon[2]), \
+ [DST3] "=Q" (ctx->aarch64_neon[3]) \
+ : [ACC0] "w" (ACC0), [ACC1] "w" (ACC1), \
+ [ACC2] "w" (ACC2), [ACC3] "w" (ACC3))
+
+static void
+fletcher_4_aarch64_neon_native(fletcher_4_ctx_t *ctx,
+ const void *buf, uint64_t size)
+{
+ const uint64_t *ip = buf;
+ const uint64_t *ipend = (uint64_t *)((uint8_t *)ip + size);
+#if defined(_KERNEL)
+register unsigned char ZERO asm("v0") __attribute__((vector_size(16)));
+register unsigned char ACC0 asm("v1") __attribute__((vector_size(16)));
+register unsigned char ACC1 asm("v2") __attribute__((vector_size(16)));
+register unsigned char ACC2 asm("v3") __attribute__((vector_size(16)));
+register unsigned char ACC3 asm("v4") __attribute__((vector_size(16)));
+register unsigned char TMP1 asm("v5") __attribute__((vector_size(16)));
+register unsigned char TMP2 asm("v6") __attribute__((vector_size(16)));
+register unsigned char SRC asm("v7") __attribute__((vector_size(16)));
+#else
+unsigned char ZERO __attribute__((vector_size(16)));
+unsigned char ACC0 __attribute__((vector_size(16)));
+unsigned char ACC1 __attribute__((vector_size(16)));
+unsigned char ACC2 __attribute__((vector_size(16)));
+unsigned char ACC3 __attribute__((vector_size(16)));
+unsigned char TMP1 __attribute__((vector_size(16)));
+unsigned char TMP2 __attribute__((vector_size(16)));
+unsigned char SRC __attribute__((vector_size(16)));
+#endif
+
+ kfpu_begin();
+
+ NEON_INIT_LOOP();
+
+ for (; ip < ipend; ip += 2) {
+ NEON_MAIN_LOOP(NEON_DONT_REVERSE);
+ }
+
+ NEON_FINI_LOOP();
+
+ kfpu_end();
+}
+
+static void
+fletcher_4_aarch64_neon_byteswap(fletcher_4_ctx_t *ctx,
+ const void *buf, uint64_t size)
+{
+ const uint64_t *ip = buf;
+ const uint64_t *ipend = (uint64_t *)((uint8_t *)ip + size);
+#if defined(_KERNEL)
+register unsigned char ZERO asm("v0") __attribute__((vector_size(16)));
+register unsigned char ACC0 asm("v1") __attribute__((vector_size(16)));
+register unsigned char ACC1 asm("v2") __attribute__((vector_size(16)));
+register unsigned char ACC2 asm("v3") __attribute__((vector_size(16)));
+register unsigned char ACC3 asm("v4") __attribute__((vector_size(16)));
+register unsigned char TMP1 asm("v5") __attribute__((vector_size(16)));
+register unsigned char TMP2 asm("v6") __attribute__((vector_size(16)));
+register unsigned char SRC asm("v7") __attribute__((vector_size(16)));
+#else
+unsigned char ZERO __attribute__((vector_size(16)));
+unsigned char ACC0 __attribute__((vector_size(16)));
+unsigned char ACC1 __attribute__((vector_size(16)));
+unsigned char ACC2 __attribute__((vector_size(16)));
+unsigned char ACC3 __attribute__((vector_size(16)));
+unsigned char TMP1 __attribute__((vector_size(16)));
+unsigned char TMP2 __attribute__((vector_size(16)));
+unsigned char SRC __attribute__((vector_size(16)));
+#endif
+
+ kfpu_begin();
+
+ NEON_INIT_LOOP();
+
+ for (; ip < ipend; ip += 2) {
+ NEON_MAIN_LOOP(NEON_DO_REVERSE);
+ }
+
+ NEON_FINI_LOOP();
+
+ kfpu_end();
+}
+
+static boolean_t fletcher_4_aarch64_neon_valid(void)
+{
+ return (kfpu_allowed());
+}
+
+const fletcher_4_ops_t fletcher_4_aarch64_neon_ops = {
+ .init_native = fletcher_4_aarch64_neon_init,
+ .compute_native = fletcher_4_aarch64_neon_native,
+ .fini_native = fletcher_4_aarch64_neon_fini,
+ .init_byteswap = fletcher_4_aarch64_neon_init,
+ .compute_byteswap = fletcher_4_aarch64_neon_byteswap,
+ .fini_byteswap = fletcher_4_aarch64_neon_fini,
+ .valid = fletcher_4_aarch64_neon_valid,
+ .name = "aarch64_neon"
+};
+
+#endif /* defined(__aarch64__) */
diff --git a/sys/contrib/openzfs/module/zcommon/zfs_fletcher_avx512.c b/sys/contrib/openzfs/module/zcommon/zfs_fletcher_avx512.c
new file mode 100644
index 000000000000..300ec4c1fb69
--- /dev/null
+++ b/sys/contrib/openzfs/module/zcommon/zfs_fletcher_avx512.c
@@ -0,0 +1,225 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (C) 2016 Gvozden Nešković. All rights reserved.
+ */
+
+#if defined(__x86_64) && defined(HAVE_AVX512F)
+
+#include <sys/byteorder.h>
+#include <sys/frame.h>
+#include <sys/spa_checksum.h>
+#include <sys/strings.h>
+#include <sys/simd.h>
+#include <zfs_fletcher.h>
+
+#ifdef __linux__
+#define __asm __asm__ __volatile__
+#endif
+
+static void
+fletcher_4_avx512f_init(fletcher_4_ctx_t *ctx)
+{
+ bzero(ctx->avx512, 4 * sizeof (zfs_fletcher_avx512_t));
+}
+
+static void
+fletcher_4_avx512f_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp)
+{
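+ /*
+ * With eight parallel 64-bit lanes per accumulator, each lane's
+ * contribution has to be recombined with weights that depend on the
+ * lane index; CcA/CcB and DcA/DcB/DcC below hold those per-lane
+ * coefficients for the C and D words.
+ */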
+ static const uint64_t
+ CcA[] = { 0, 0, 1, 3, 6, 10, 15, 21 },
+ CcB[] = { 28, 36, 44, 52, 60, 68, 76, 84 },
+ DcA[] = { 0, 0, 0, 1, 4, 10, 20, 35 },
+ DcB[] = { 56, 84, 120, 164, 216, 276, 344, 420 },
+ DcC[] = { 448, 512, 576, 640, 704, 768, 832, 896 };
+
+ uint64_t A, B, C, D;
+ uint64_t i;
+
+ A = ctx->avx512[0].v[0];
+ B = 8 * ctx->avx512[1].v[0];
+ C = 64 * ctx->avx512[2].v[0] - CcB[0] * ctx->avx512[1].v[0];
+ D = 512 * ctx->avx512[3].v[0] - DcC[0] * ctx->avx512[2].v[0] +
+ DcB[0] * ctx->avx512[1].v[0];
+
+ for (i = 1; i < 8; i++) {
+ A += ctx->avx512[0].v[i];
+ B += 8 * ctx->avx512[1].v[i] - i * ctx->avx512[0].v[i];
+ C += 64 * ctx->avx512[2].v[i] - CcB[i] * ctx->avx512[1].v[i] +
+ CcA[i] * ctx->avx512[0].v[i];
+ D += 512 * ctx->avx512[3].v[i] - DcC[i] * ctx->avx512[2].v[i] +
+ DcB[i] * ctx->avx512[1].v[i] - DcA[i] * ctx->avx512[0].v[i];
+ }
+
+ ZIO_SET_CHECKSUM(zcp, A, B, C, D);
+}
+
+#define FLETCHER_4_AVX512_RESTORE_CTX(ctx) \
+{ \
+ __asm("vmovdqu64 %0, %%zmm0" :: "m" ((ctx)->avx512[0])); \
+ __asm("vmovdqu64 %0, %%zmm1" :: "m" ((ctx)->avx512[1])); \
+ __asm("vmovdqu64 %0, %%zmm2" :: "m" ((ctx)->avx512[2])); \
+ __asm("vmovdqu64 %0, %%zmm3" :: "m" ((ctx)->avx512[3])); \
+}
+
+#define FLETCHER_4_AVX512_SAVE_CTX(ctx) \
+{ \
+ __asm("vmovdqu64 %%zmm0, %0" : "=m" ((ctx)->avx512[0])); \
+ __asm("vmovdqu64 %%zmm1, %0" : "=m" ((ctx)->avx512[1])); \
+ __asm("vmovdqu64 %%zmm2, %0" : "=m" ((ctx)->avx512[2])); \
+ __asm("vmovdqu64 %%zmm3, %0" : "=m" ((ctx)->avx512[3])); \
+}
+
+static void
+fletcher_4_avx512f_native(fletcher_4_ctx_t *ctx, const void *buf, uint64_t size)
+{
+ const uint32_t *ip = buf;
+ const uint32_t *ipend = (uint32_t *)((uint8_t *)ip + size);
+
+ kfpu_begin();
+
+ FLETCHER_4_AVX512_RESTORE_CTX(ctx);
+
+ for (; ip < ipend; ip += 8) {
+ __asm("vpmovzxdq %0, %%zmm4"::"m" (*ip));
+ __asm("vpaddq %zmm4, %zmm0, %zmm0");
+ __asm("vpaddq %zmm0, %zmm1, %zmm1");
+ __asm("vpaddq %zmm1, %zmm2, %zmm2");
+ __asm("vpaddq %zmm2, %zmm3, %zmm3");
+ }
+
+ FLETCHER_4_AVX512_SAVE_CTX(ctx);
+
+ kfpu_end();
+}
+STACK_FRAME_NON_STANDARD(fletcher_4_avx512f_native);
+
+static void
+fletcher_4_avx512f_byteswap(fletcher_4_ctx_t *ctx, const void *buf,
+ uint64_t size)
+{
+ static const uint64_t byteswap_mask = 0xFFULL;
+ const uint32_t *ip = buf;
+ const uint32_t *ipend = (uint32_t *)((uint8_t *)ip + size);
+
+ kfpu_begin();
+
+ FLETCHER_4_AVX512_RESTORE_CTX(ctx);
+
+ __asm("vpbroadcastq %0, %%zmm8" :: "r" (byteswap_mask));
+ __asm("vpsllq $8, %zmm8, %zmm9");
+ __asm("vpsllq $16, %zmm8, %zmm10");
+ __asm("vpsllq $24, %zmm8, %zmm11");
+
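+ /*
+ * vpshufb on 512-bit registers requires AVX512BW, so each
+ * zero-extended 32-bit word is byteswapped by shifting it and masking
+ * out one byte at bit offsets 0, 8, 16 and 24, then OR-ing the pieces
+ * back together.
+ */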
+ for (; ip < ipend; ip += 8) {
+ __asm("vpmovzxdq %0, %%zmm5"::"m" (*ip));
+
+ __asm("vpsrlq $24, %zmm5, %zmm6");
+ __asm("vpandd %zmm8, %zmm6, %zmm6");
+ __asm("vpsrlq $8, %zmm5, %zmm7");
+ __asm("vpandd %zmm9, %zmm7, %zmm7");
+ __asm("vpord %zmm6, %zmm7, %zmm4");
+ __asm("vpsllq $8, %zmm5, %zmm6");
+ __asm("vpandd %zmm10, %zmm6, %zmm6");
+ __asm("vpord %zmm6, %zmm4, %zmm4");
+ __asm("vpsllq $24, %zmm5, %zmm5");
+ __asm("vpandd %zmm11, %zmm5, %zmm5");
+ __asm("vpord %zmm5, %zmm4, %zmm4");
+
+ __asm("vpaddq %zmm4, %zmm0, %zmm0");
+ __asm("vpaddq %zmm0, %zmm1, %zmm1");
+ __asm("vpaddq %zmm1, %zmm2, %zmm2");
+ __asm("vpaddq %zmm2, %zmm3, %zmm3");
+ }
+
+ FLETCHER_4_AVX512_SAVE_CTX(ctx)
+
+ kfpu_end();
+}
+STACK_FRAME_NON_STANDARD(fletcher_4_avx512f_byteswap);
+
+static boolean_t
+fletcher_4_avx512f_valid(void)
+{
+ return (kfpu_allowed() && zfs_avx512f_available());
+}
+
+const fletcher_4_ops_t fletcher_4_avx512f_ops = {
+ .init_native = fletcher_4_avx512f_init,
+ .fini_native = fletcher_4_avx512f_fini,
+ .compute_native = fletcher_4_avx512f_native,
+ .init_byteswap = fletcher_4_avx512f_init,
+ .fini_byteswap = fletcher_4_avx512f_fini,
+ .compute_byteswap = fletcher_4_avx512f_byteswap,
+ .valid = fletcher_4_avx512f_valid,
+ .name = "avx512f"
+};
+
+#if defined(HAVE_AVX512BW)
+static void
+fletcher_4_avx512bw_byteswap(fletcher_4_ctx_t *ctx, const void *buf,
+ uint64_t size)
+{
+ static const zfs_fletcher_avx512_t mask = {
+ .v = { 0xFFFFFFFF00010203, 0xFFFFFFFF08090A0B,
+ 0xFFFFFFFF00010203, 0xFFFFFFFF08090A0B,
+ 0xFFFFFFFF00010203, 0xFFFFFFFF08090A0B,
+ 0xFFFFFFFF00010203, 0xFFFFFFFF08090A0B }
+ };
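+ /*
+ * After vpmovzxdq each 64-bit lane holds one zero-extended 32-bit
+ * word; the shuffle mask above reverses the low four bytes of every
+ * lane and leaves the upper four bytes zero (indices with the high bit
+ * set produce zero), which is exactly the required byteswap.
+ */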
+ const uint32_t *ip = buf;
+ const uint32_t *ipend = (uint32_t *)((uint8_t *)ip + size);
+
+ kfpu_begin();
+
+ FLETCHER_4_AVX512_RESTORE_CTX(ctx);
+
+ __asm("vmovdqu64 %0, %%zmm5" :: "m" (mask));
+
+ for (; ip < ipend; ip += 8) {
+ __asm("vpmovzxdq %0, %%zmm4"::"m" (*ip));
+
+ __asm("vpshufb %zmm5, %zmm4, %zmm4");
+
+ __asm("vpaddq %zmm4, %zmm0, %zmm0");
+ __asm("vpaddq %zmm0, %zmm1, %zmm1");
+ __asm("vpaddq %zmm1, %zmm2, %zmm2");
+ __asm("vpaddq %zmm2, %zmm3, %zmm3");
+ }
+
+ FLETCHER_4_AVX512_SAVE_CTX(ctx)
+
+ kfpu_end();
+}
+STACK_FRAME_NON_STANDARD(fletcher_4_avx512bw_byteswap);
+
+const fletcher_4_ops_t fletcher_4_avx512bw_ops = {
+ .init_native = fletcher_4_avx512f_init,
+ .fini_native = fletcher_4_avx512f_fini,
+ .compute_native = fletcher_4_avx512f_native,
+ .init_byteswap = fletcher_4_avx512f_init,
+ .fini_byteswap = fletcher_4_avx512f_fini,
+ .compute_byteswap = fletcher_4_avx512bw_byteswap,
+ .valid = fletcher_4_avx512f_valid,
+ .name = "avx512bw"
+};
+#endif
+
+#endif /* defined(__x86_64) && defined(HAVE_AVX512F) */
diff --git a/sys/contrib/openzfs/module/zcommon/zfs_fletcher_intel.c b/sys/contrib/openzfs/module/zcommon/zfs_fletcher_intel.c
new file mode 100644
index 000000000000..5136a01eca51
--- /dev/null
+++ b/sys/contrib/openzfs/module/zcommon/zfs_fletcher_intel.c
@@ -0,0 +1,173 @@
+/*
+ * Implement fast Fletcher4 with AVX2 instructions. (x86_64)
+ *
+ * Use the 256-bit AVX2 SIMD instructions and registers to compute
+ * Fletcher4 in four incremental 64-bit parallel accumulator streams,
+ * and then combine the streams to form the final four checksum words.
+ *
+ * Copyright (C) 2015 Intel Corporation.
+ *
+ * Authors:
+ * James Guilford <james.guilford@intel.com>
+ * Jinshan Xiong <jinshan.xiong@intel.com>
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(HAVE_AVX) && defined(HAVE_AVX2)
+
+#include <sys/spa_checksum.h>
+#include <sys/simd.h>
+#include <sys/strings.h>
+#include <zfs_fletcher.h>
+
+static void
+fletcher_4_avx2_init(fletcher_4_ctx_t *ctx)
+{
+ bzero(ctx->avx, 4 * sizeof (zfs_fletcher_avx_t));
+}
+
+static void
+fletcher_4_avx2_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp)
+{
+ uint64_t A, B, C, D;
+
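+ /*
+ * Fold the four 64-bit lanes of each order back into a single
+ * Fletcher-4 state; the per-lane weights account for the position of
+ * the words each lane consumed (the same recombination used by the
+ * superscalar4 variant).
+ */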
+ A = ctx->avx[0].v[0] + ctx->avx[0].v[1] +
+ ctx->avx[0].v[2] + ctx->avx[0].v[3];
+ B = 0 - ctx->avx[0].v[1] - 2 * ctx->avx[0].v[2] - 3 * ctx->avx[0].v[3] +
+ 4 * ctx->avx[1].v[0] + 4 * ctx->avx[1].v[1] + 4 * ctx->avx[1].v[2] +
+ 4 * ctx->avx[1].v[3];
+
+ C = ctx->avx[0].v[2] + 3 * ctx->avx[0].v[3] - 6 * ctx->avx[1].v[0] -
+ 10 * ctx->avx[1].v[1] - 14 * ctx->avx[1].v[2] -
+ 18 * ctx->avx[1].v[3] + 16 * ctx->avx[2].v[0] +
+ 16 * ctx->avx[2].v[1] + 16 * ctx->avx[2].v[2] +
+ 16 * ctx->avx[2].v[3];
+
+ D = 0 - ctx->avx[0].v[3] + 4 * ctx->avx[1].v[0] +
+ 10 * ctx->avx[1].v[1] + 20 * ctx->avx[1].v[2] +
+ 34 * ctx->avx[1].v[3] - 48 * ctx->avx[2].v[0] -
+ 64 * ctx->avx[2].v[1] - 80 * ctx->avx[2].v[2] -
+ 96 * ctx->avx[2].v[3] + 64 * ctx->avx[3].v[0] +
+ 64 * ctx->avx[3].v[1] + 64 * ctx->avx[3].v[2] +
+ 64 * ctx->avx[3].v[3];
+
+ ZIO_SET_CHECKSUM(zcp, A, B, C, D);
+}
+
+#define FLETCHER_4_AVX2_RESTORE_CTX(ctx) \
+{ \
+ asm volatile("vmovdqu %0, %%ymm0" :: "m" ((ctx)->avx[0])); \
+ asm volatile("vmovdqu %0, %%ymm1" :: "m" ((ctx)->avx[1])); \
+ asm volatile("vmovdqu %0, %%ymm2" :: "m" ((ctx)->avx[2])); \
+ asm volatile("vmovdqu %0, %%ymm3" :: "m" ((ctx)->avx[3])); \
+}
+
+#define FLETCHER_4_AVX2_SAVE_CTX(ctx) \
+{ \
+ asm volatile("vmovdqu %%ymm0, %0" : "=m" ((ctx)->avx[0])); \
+ asm volatile("vmovdqu %%ymm1, %0" : "=m" ((ctx)->avx[1])); \
+ asm volatile("vmovdqu %%ymm2, %0" : "=m" ((ctx)->avx[2])); \
+ asm volatile("vmovdqu %%ymm3, %0" : "=m" ((ctx)->avx[3])); \
+}
+
+
+static void
+fletcher_4_avx2_native(fletcher_4_ctx_t *ctx, const void *buf, uint64_t size)
+{
+ const uint64_t *ip = buf;
+ const uint64_t *ipend = (uint64_t *)((uint8_t *)ip + size);
+
+ kfpu_begin();
+
+ FLETCHER_4_AVX2_RESTORE_CTX(ctx);
+
+ for (; ip < ipend; ip += 2) {
+ asm volatile("vpmovzxdq %0, %%ymm4"::"m" (*ip));
+ asm volatile("vpaddq %ymm4, %ymm0, %ymm0");
+ asm volatile("vpaddq %ymm0, %ymm1, %ymm1");
+ asm volatile("vpaddq %ymm1, %ymm2, %ymm2");
+ asm volatile("vpaddq %ymm2, %ymm3, %ymm3");
+ }
+
+ FLETCHER_4_AVX2_SAVE_CTX(ctx);
+ asm volatile("vzeroupper");
+
+ kfpu_end();
+}
+
+static void
+fletcher_4_avx2_byteswap(fletcher_4_ctx_t *ctx, const void *buf, uint64_t size)
+{
+ static const zfs_fletcher_avx_t mask = {
+ .v = { 0xFFFFFFFF00010203, 0xFFFFFFFF08090A0B,
+ 0xFFFFFFFF00010203, 0xFFFFFFFF08090A0B }
+ };
+ const uint64_t *ip = buf;
+ const uint64_t *ipend = (uint64_t *)((uint8_t *)ip + size);
+
+ kfpu_begin();
+
+ FLETCHER_4_AVX2_RESTORE_CTX(ctx);
+
+ asm volatile("vmovdqu %0, %%ymm5" :: "m" (mask));
+
+ for (; ip < ipend; ip += 2) {
+ asm volatile("vpmovzxdq %0, %%ymm4"::"m" (*ip));
+ asm volatile("vpshufb %ymm5, %ymm4, %ymm4");
+
+ asm volatile("vpaddq %ymm4, %ymm0, %ymm0");
+ asm volatile("vpaddq %ymm0, %ymm1, %ymm1");
+ asm volatile("vpaddq %ymm1, %ymm2, %ymm2");
+ asm volatile("vpaddq %ymm2, %ymm3, %ymm3");
+ }
+
+ FLETCHER_4_AVX2_SAVE_CTX(ctx);
+ asm volatile("vzeroupper");
+
+ kfpu_end();
+}
+
+static boolean_t fletcher_4_avx2_valid(void)
+{
+ return (kfpu_allowed() && zfs_avx_available() && zfs_avx2_available());
+}
+
+const fletcher_4_ops_t fletcher_4_avx2_ops = {
+ .init_native = fletcher_4_avx2_init,
+ .fini_native = fletcher_4_avx2_fini,
+ .compute_native = fletcher_4_avx2_native,
+ .init_byteswap = fletcher_4_avx2_init,
+ .fini_byteswap = fletcher_4_avx2_fini,
+ .compute_byteswap = fletcher_4_avx2_byteswap,
+ .valid = fletcher_4_avx2_valid,
+ .name = "avx2"
+};
+
+#endif /* defined(HAVE_AVX) && defined(HAVE_AVX2) */
diff --git a/sys/contrib/openzfs/module/zcommon/zfs_fletcher_sse.c b/sys/contrib/openzfs/module/zcommon/zfs_fletcher_sse.c
new file mode 100644
index 000000000000..15ce9b07ffbe
--- /dev/null
+++ b/sys/contrib/openzfs/module/zcommon/zfs_fletcher_sse.c
@@ -0,0 +1,232 @@
+/*
+ * Implement fast Fletcher4 with SSE2/SSSE3 instructions. (x86)
+ *
+ * Use the 128-bit SSE2/SSSE3 SIMD instructions and registers to compute
+ * Fletcher4 in two incremental 64-bit parallel accumulator streams,
+ * and then combine the streams to form the final four checksum words.
+ * This implementation is a derivative of the AVX SIMD implementation by
+ * James Guilford and Jinshan Xiong from Intel (see zfs_fletcher_intel.c).
+ *
+ * Copyright (C) 2016 Tyler J. Stachecki.
+ *
+ * Authors:
+ * Tyler J. Stachecki <stachecki.tyler@gmail.com>
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(HAVE_SSE2)
+
+#include <sys/simd.h>
+#include <sys/spa_checksum.h>
+#include <sys/byteorder.h>
+#include <sys/strings.h>
+#include <zfs_fletcher.h>
+
+static void
+fletcher_4_sse2_init(fletcher_4_ctx_t *ctx)
+{
+ bzero(ctx->sse, 4 * sizeof (zfs_fletcher_sse_t));
+}
+
+static void
+fletcher_4_sse2_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp)
+{
+ uint64_t A, B, C, D;
+
+ /*
+ * The mixing matrix for checksum calculation is:
+ * a = a0 + a1
+ * b = 2b0 + 2b1 - a1
+ * c = 4c0 - b0 + 4c1 - 3b1
+ * d = 8d0 - 4c0 + 8d1 - 8c1 + b1
+ *
+ * c and d are multiplied by 4 and 8, respectively,
+ * before spilling the vectors out to memory.
+ */
+ A = ctx->sse[0].v[0] + ctx->sse[0].v[1];
+ B = 2 * ctx->sse[1].v[0] + 2 * ctx->sse[1].v[1] - ctx->sse[0].v[1];
+ C = 4 * ctx->sse[2].v[0] - ctx->sse[1].v[0] + 4 * ctx->sse[2].v[1] -
+ 3 * ctx->sse[1].v[1];
+ D = 8 * ctx->sse[3].v[0] - 4 * ctx->sse[2].v[0] + 8 * ctx->sse[3].v[1] -
+ 8 * ctx->sse[2].v[1] + ctx->sse[1].v[1];
+
+ ZIO_SET_CHECKSUM(zcp, A, B, C, D);
+}
+
+#define FLETCHER_4_SSE_RESTORE_CTX(ctx) \
+{ \
+ asm volatile("movdqu %0, %%xmm0" :: "m" ((ctx)->sse[0])); \
+ asm volatile("movdqu %0, %%xmm1" :: "m" ((ctx)->sse[1])); \
+ asm volatile("movdqu %0, %%xmm2" :: "m" ((ctx)->sse[2])); \
+ asm volatile("movdqu %0, %%xmm3" :: "m" ((ctx)->sse[3])); \
+}
+
+#define FLETCHER_4_SSE_SAVE_CTX(ctx) \
+{ \
+ asm volatile("movdqu %%xmm0, %0" : "=m" ((ctx)->sse[0])); \
+ asm volatile("movdqu %%xmm1, %0" : "=m" ((ctx)->sse[1])); \
+ asm volatile("movdqu %%xmm2, %0" : "=m" ((ctx)->sse[2])); \
+ asm volatile("movdqu %%xmm3, %0" : "=m" ((ctx)->sse[3])); \
+}
+
+static void
+fletcher_4_sse2_native(fletcher_4_ctx_t *ctx, const void *buf, uint64_t size)
+{
+ const uint64_t *ip = buf;
+ const uint64_t *ipend = (uint64_t *)((uint8_t *)ip + size);
+
+ kfpu_begin();
+
+ FLETCHER_4_SSE_RESTORE_CTX(ctx);
+
+ asm volatile("pxor %xmm4, %xmm4");
+
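+ /*
+ * %xmm4 stays zero; punpckldq/punpckhdq interleave it with the four
+ * loaded 32-bit words, zero-extending them into two registers of two
+ * 64-bit lanes each before they are folded into the accumulators.
+ */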
+ for (; ip < ipend; ip += 2) {
+ asm volatile("movdqu %0, %%xmm5" :: "m"(*ip));
+ asm volatile("movdqa %xmm5, %xmm6");
+ asm volatile("punpckldq %xmm4, %xmm5");
+ asm volatile("punpckhdq %xmm4, %xmm6");
+ asm volatile("paddq %xmm5, %xmm0");
+ asm volatile("paddq %xmm0, %xmm1");
+ asm volatile("paddq %xmm1, %xmm2");
+ asm volatile("paddq %xmm2, %xmm3");
+ asm volatile("paddq %xmm6, %xmm0");
+ asm volatile("paddq %xmm0, %xmm1");
+ asm volatile("paddq %xmm1, %xmm2");
+ asm volatile("paddq %xmm2, %xmm3");
+ }
+
+ FLETCHER_4_SSE_SAVE_CTX(ctx);
+
+ kfpu_end();
+}
+
+static void
+fletcher_4_sse2_byteswap(fletcher_4_ctx_t *ctx, const void *buf, uint64_t size)
+{
+ const uint32_t *ip = buf;
+ const uint32_t *ipend = (uint32_t *)((uint8_t *)ip + size);
+
+ kfpu_begin();
+
+ FLETCHER_4_SSE_RESTORE_CTX(ctx);
+
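+ /*
+ * SSE2 has no byte shuffle, so the words are byteswapped with
+ * BSWAP_32() in scalar code and only the 64-bit accumulation is done
+ * in the vector unit.
+ */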
+ for (; ip < ipend; ip += 2) {
+ uint32_t scratch1 = BSWAP_32(ip[0]);
+ uint32_t scratch2 = BSWAP_32(ip[1]);
+ asm volatile("movd %0, %%xmm5" :: "r"(scratch1));
+ asm volatile("movd %0, %%xmm6" :: "r"(scratch2));
+ asm volatile("punpcklqdq %xmm6, %xmm5");
+ asm volatile("paddq %xmm5, %xmm0");
+ asm volatile("paddq %xmm0, %xmm1");
+ asm volatile("paddq %xmm1, %xmm2");
+ asm volatile("paddq %xmm2, %xmm3");
+ }
+
+ FLETCHER_4_SSE_SAVE_CTX(ctx);
+
+ kfpu_end();
+}
+
+static boolean_t fletcher_4_sse2_valid(void)
+{
+ return (kfpu_allowed() && zfs_sse2_available());
+}
+
+const fletcher_4_ops_t fletcher_4_sse2_ops = {
+ .init_native = fletcher_4_sse2_init,
+ .fini_native = fletcher_4_sse2_fini,
+ .compute_native = fletcher_4_sse2_native,
+ .init_byteswap = fletcher_4_sse2_init,
+ .fini_byteswap = fletcher_4_sse2_fini,
+ .compute_byteswap = fletcher_4_sse2_byteswap,
+ .valid = fletcher_4_sse2_valid,
+ .name = "sse2"
+};
+
+#endif /* defined(HAVE_SSE2) */
+
+#if defined(HAVE_SSE2) && defined(HAVE_SSSE3)
+static void
+fletcher_4_ssse3_byteswap(fletcher_4_ctx_t *ctx, const void *buf, uint64_t size)
+{
+ static const zfs_fletcher_sse_t mask = {
+ .v = { 0x0405060700010203, 0x0C0D0E0F08090A0B }
+ };
+
+ const uint64_t *ip = buf;
+ const uint64_t *ipend = (uint64_t *)((uint8_t *)ip + size);
+
+ kfpu_begin();
+
+ FLETCHER_4_SSE_RESTORE_CTX(ctx);
+
+ asm volatile("movdqu %0, %%xmm7"::"m" (mask));
+ asm volatile("pxor %xmm4, %xmm4");
+
+ for (; ip < ipend; ip += 2) {
+ asm volatile("movdqu %0, %%xmm5"::"m" (*ip));
+ asm volatile("pshufb %xmm7, %xmm5");
+ asm volatile("movdqa %xmm5, %xmm6");
+ asm volatile("punpckldq %xmm4, %xmm5");
+ asm volatile("punpckhdq %xmm4, %xmm6");
+ asm volatile("paddq %xmm5, %xmm0");
+ asm volatile("paddq %xmm0, %xmm1");
+ asm volatile("paddq %xmm1, %xmm2");
+ asm volatile("paddq %xmm2, %xmm3");
+ asm volatile("paddq %xmm6, %xmm0");
+ asm volatile("paddq %xmm0, %xmm1");
+ asm volatile("paddq %xmm1, %xmm2");
+ asm volatile("paddq %xmm2, %xmm3");
+ }
+
+ FLETCHER_4_SSE_SAVE_CTX(ctx);
+
+ kfpu_end();
+}
+
+static boolean_t fletcher_4_ssse3_valid(void)
+{
+ return (kfpu_allowed() && zfs_sse2_available() &&
+ zfs_ssse3_available());
+}
+
+const fletcher_4_ops_t fletcher_4_ssse3_ops = {
+ .init_native = fletcher_4_sse2_init,
+ .fini_native = fletcher_4_sse2_fini,
+ .compute_native = fletcher_4_sse2_native,
+ .init_byteswap = fletcher_4_sse2_init,
+ .fini_byteswap = fletcher_4_sse2_fini,
+ .compute_byteswap = fletcher_4_ssse3_byteswap,
+ .valid = fletcher_4_ssse3_valid,
+ .name = "ssse3"
+};
+
+#endif /* defined(HAVE_SSE2) && defined(HAVE_SSSE3) */
diff --git a/sys/contrib/openzfs/module/zcommon/zfs_fletcher_superscalar.c b/sys/contrib/openzfs/module/zcommon/zfs_fletcher_superscalar.c
new file mode 100644
index 000000000000..153f5c7d75e3
--- /dev/null
+++ b/sys/contrib/openzfs/module/zcommon/zfs_fletcher_superscalar.c
@@ -0,0 +1,163 @@
+/*
+ * Implement fast Fletcher4 using superscalar pipelines.
+ *
+ * Use regular C code to compute
+ * Fletcher4 in two incremental 64-bit parallel accumulator streams,
+ * and then combine the streams to form the final four checksum words.
+ * This implementation is a derivative of the AVX SIMD implementation by
+ * James Guilford and Jinshan Xiong from Intel (see zfs_fletcher_intel.c).
+ *
+ * Copyright (C) 2016 Romain Dolbeau.
+ *
+ * Authors:
+ * Romain Dolbeau <romain.dolbeau@atos.net>
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <sys/param.h>
+#include <sys/byteorder.h>
+#include <sys/spa_checksum.h>
+#include <sys/strings.h>
+#include <zfs_fletcher.h>
+
+static void
+fletcher_4_superscalar_init(fletcher_4_ctx_t *ctx)
+{
+ bzero(ctx->superscalar, 4 * sizeof (zfs_fletcher_superscalar_t));
+}
+
+static void
+fletcher_4_superscalar_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp)
+{
+ uint64_t A, B, C, D;
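+ /*
+ * Same two-stream recombination as the 128-bit SIMD variants (see the
+ * mixing-matrix comment in zfs_fletcher_sse.c).
+ */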
+ A = ctx->superscalar[0].v[0] + ctx->superscalar[0].v[1];
+ B = 2 * ctx->superscalar[1].v[0] + 2 * ctx->superscalar[1].v[1] -
+ ctx->superscalar[0].v[1];
+ C = 4 * ctx->superscalar[2].v[0] - ctx->superscalar[1].v[0] +
+ 4 * ctx->superscalar[2].v[1] - 3 * ctx->superscalar[1].v[1];
+ D = 8 * ctx->superscalar[3].v[0] - 4 * ctx->superscalar[2].v[0] +
+ 8 * ctx->superscalar[3].v[1] - 8 * ctx->superscalar[2].v[1] +
+ ctx->superscalar[1].v[1];
+ ZIO_SET_CHECKSUM(zcp, A, B, C, D);
+}
+
+static void
+fletcher_4_superscalar_native(fletcher_4_ctx_t *ctx,
+ const void *buf, uint64_t size)
+{
+ const uint32_t *ip = buf;
+ const uint32_t *ipend = ip + (size / sizeof (uint32_t));
+ uint64_t a, b, c, d;
+ uint64_t a2, b2, c2, d2;
+
+ a = ctx->superscalar[0].v[0];
+ b = ctx->superscalar[1].v[0];
+ c = ctx->superscalar[2].v[0];
+ d = ctx->superscalar[3].v[0];
+ a2 = ctx->superscalar[0].v[1];
+ b2 = ctx->superscalar[1].v[1];
+ c2 = ctx->superscalar[2].v[1];
+ d2 = ctx->superscalar[3].v[1];
+
+ for (; ip < ipend; ip += 2) {
+ a += ip[0];
+ a2 += ip[1];
+ b += a;
+ b2 += a2;
+ c += b;
+ c2 += b2;
+ d += c;
+ d2 += c2;
+ }
+
+ ctx->superscalar[0].v[0] = a;
+ ctx->superscalar[1].v[0] = b;
+ ctx->superscalar[2].v[0] = c;
+ ctx->superscalar[3].v[0] = d;
+ ctx->superscalar[0].v[1] = a2;
+ ctx->superscalar[1].v[1] = b2;
+ ctx->superscalar[2].v[1] = c2;
+ ctx->superscalar[3].v[1] = d2;
+}
+
+static void
+fletcher_4_superscalar_byteswap(fletcher_4_ctx_t *ctx,
+ const void *buf, uint64_t size)
+{
+ const uint32_t *ip = buf;
+ const uint32_t *ipend = ip + (size / sizeof (uint32_t));
+ uint64_t a, b, c, d;
+ uint64_t a2, b2, c2, d2;
+
+ a = ctx->superscalar[0].v[0];
+ b = ctx->superscalar[1].v[0];
+ c = ctx->superscalar[2].v[0];
+ d = ctx->superscalar[3].v[0];
+ a2 = ctx->superscalar[0].v[1];
+ b2 = ctx->superscalar[1].v[1];
+ c2 = ctx->superscalar[2].v[1];
+ d2 = ctx->superscalar[3].v[1];
+
+ for (; ip < ipend; ip += 2) {
+ a += BSWAP_32(ip[0]);
+ a2 += BSWAP_32(ip[1]);
+ b += a;
+ b2 += a2;
+ c += b;
+ c2 += b2;
+ d += c;
+ d2 += c2;
+ }
+
+ ctx->superscalar[0].v[0] = a;
+ ctx->superscalar[1].v[0] = b;
+ ctx->superscalar[2].v[0] = c;
+ ctx->superscalar[3].v[0] = d;
+ ctx->superscalar[0].v[1] = a2;
+ ctx->superscalar[1].v[1] = b2;
+ ctx->superscalar[2].v[1] = c2;
+ ctx->superscalar[3].v[1] = d2;
+}
+
+static boolean_t fletcher_4_superscalar_valid(void)
+{
+ return (B_TRUE);
+}
+
+const fletcher_4_ops_t fletcher_4_superscalar_ops = {
+ .init_native = fletcher_4_superscalar_init,
+ .compute_native = fletcher_4_superscalar_native,
+ .fini_native = fletcher_4_superscalar_fini,
+ .init_byteswap = fletcher_4_superscalar_init,
+ .compute_byteswap = fletcher_4_superscalar_byteswap,
+ .fini_byteswap = fletcher_4_superscalar_fini,
+ .valid = fletcher_4_superscalar_valid,
+ .name = "superscalar"
+};
diff --git a/sys/contrib/openzfs/module/zcommon/zfs_fletcher_superscalar4.c b/sys/contrib/openzfs/module/zcommon/zfs_fletcher_superscalar4.c
new file mode 100644
index 000000000000..75e6a3baf980
--- /dev/null
+++ b/sys/contrib/openzfs/module/zcommon/zfs_fletcher_superscalar4.c
@@ -0,0 +1,229 @@
+/*
+ * Implement fast Fletcher4 using superscalar pipelines.
+ *
+ * Use regular C code to compute
+ * Fletcher4 in four incremental 64-bit parallel accumulator streams,
+ * and then combine the streams to form the final four checksum words.
+ * This implementation is a derivative of the AVX SIMD implementation by
+ * James Guilford and Jinshan Xiong from Intel (see zfs_fletcher_intel.c).
+ *
+ * Copyright (C) 2016 Romain Dolbeau.
+ *
+ * Authors:
+ * Romain Dolbeau <romain.dolbeau@atos.net>
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <sys/param.h>
+#include <sys/byteorder.h>
+#include <sys/spa_checksum.h>
+#include <sys/strings.h>
+#include <zfs_fletcher.h>
+
+static void
+fletcher_4_superscalar4_init(fletcher_4_ctx_t *ctx)
+{
+ bzero(ctx->superscalar, 4 * sizeof (zfs_fletcher_superscalar_t));
+}
+
+static void
+fletcher_4_superscalar4_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp)
+{
+ uint64_t A, B, C, D;
+
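+ /*
+ * Recombine the four interleaved streams; the weights match the AVX2
+ * implementation, which also keeps four parallel accumulator streams.
+ */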
+ A = ctx->superscalar[0].v[0] + ctx->superscalar[0].v[1] +
+ ctx->superscalar[0].v[2] + ctx->superscalar[0].v[3];
+ B = 0 - ctx->superscalar[0].v[1] - 2 * ctx->superscalar[0].v[2] -
+ 3 * ctx->superscalar[0].v[3] + 4 * ctx->superscalar[1].v[0] +
+ 4 * ctx->superscalar[1].v[1] + 4 * ctx->superscalar[1].v[2] +
+ 4 * ctx->superscalar[1].v[3];
+
+ C = ctx->superscalar[0].v[2] + 3 * ctx->superscalar[0].v[3] -
+ 6 * ctx->superscalar[1].v[0] - 10 * ctx->superscalar[1].v[1] -
+ 14 * ctx->superscalar[1].v[2] - 18 * ctx->superscalar[1].v[3] +
+ 16 * ctx->superscalar[2].v[0] + 16 * ctx->superscalar[2].v[1] +
+ 16 * ctx->superscalar[2].v[2] + 16 * ctx->superscalar[2].v[3];
+
+ D = 0 - ctx->superscalar[0].v[3] + 4 * ctx->superscalar[1].v[0] +
+ 10 * ctx->superscalar[1].v[1] + 20 * ctx->superscalar[1].v[2] +
+ 34 * ctx->superscalar[1].v[3] - 48 * ctx->superscalar[2].v[0] -
+ 64 * ctx->superscalar[2].v[1] - 80 * ctx->superscalar[2].v[2] -
+ 96 * ctx->superscalar[2].v[3] + 64 * ctx->superscalar[3].v[0] +
+ 64 * ctx->superscalar[3].v[1] + 64 * ctx->superscalar[3].v[2] +
+ 64 * ctx->superscalar[3].v[3];
+
+ ZIO_SET_CHECKSUM(zcp, A, B, C, D);
+}
+
+static void
+fletcher_4_superscalar4_native(fletcher_4_ctx_t *ctx,
+ const void *buf, uint64_t size)
+{
+ const uint32_t *ip = buf;
+ const uint32_t *ipend = ip + (size / sizeof (uint32_t));
+ uint64_t a, b, c, d;
+ uint64_t a2, b2, c2, d2;
+ uint64_t a3, b3, c3, d3;
+ uint64_t a4, b4, c4, d4;
+
+ a = ctx->superscalar[0].v[0];
+ b = ctx->superscalar[1].v[0];
+ c = ctx->superscalar[2].v[0];
+ d = ctx->superscalar[3].v[0];
+ a2 = ctx->superscalar[0].v[1];
+ b2 = ctx->superscalar[1].v[1];
+ c2 = ctx->superscalar[2].v[1];
+ d2 = ctx->superscalar[3].v[1];
+ a3 = ctx->superscalar[0].v[2];
+ b3 = ctx->superscalar[1].v[2];
+ c3 = ctx->superscalar[2].v[2];
+ d3 = ctx->superscalar[3].v[2];
+ a4 = ctx->superscalar[0].v[3];
+ b4 = ctx->superscalar[1].v[3];
+ c4 = ctx->superscalar[2].v[3];
+ d4 = ctx->superscalar[3].v[3];
+
+ for (; ip < ipend; ip += 4) {
+ a += ip[0];
+ a2 += ip[1];
+ a3 += ip[2];
+ a4 += ip[3];
+ b += a;
+ b2 += a2;
+ b3 += a3;
+ b4 += a4;
+ c += b;
+ c2 += b2;
+ c3 += b3;
+ c4 += b4;
+ d += c;
+ d2 += c2;
+ d3 += c3;
+ d4 += c4;
+ }
+
+ ctx->superscalar[0].v[0] = a;
+ ctx->superscalar[1].v[0] = b;
+ ctx->superscalar[2].v[0] = c;
+ ctx->superscalar[3].v[0] = d;
+ ctx->superscalar[0].v[1] = a2;
+ ctx->superscalar[1].v[1] = b2;
+ ctx->superscalar[2].v[1] = c2;
+ ctx->superscalar[3].v[1] = d2;
+ ctx->superscalar[0].v[2] = a3;
+ ctx->superscalar[1].v[2] = b3;
+ ctx->superscalar[2].v[2] = c3;
+ ctx->superscalar[3].v[2] = d3;
+ ctx->superscalar[0].v[3] = a4;
+ ctx->superscalar[1].v[3] = b4;
+ ctx->superscalar[2].v[3] = c4;
+ ctx->superscalar[3].v[3] = d4;
+}
+
+static void
+fletcher_4_superscalar4_byteswap(fletcher_4_ctx_t *ctx,
+ const void *buf, uint64_t size)
+{
+ const uint32_t *ip = buf;
+ const uint32_t *ipend = ip + (size / sizeof (uint32_t));
+ uint64_t a, b, c, d;
+ uint64_t a2, b2, c2, d2;
+ uint64_t a3, b3, c3, d3;
+ uint64_t a4, b4, c4, d4;
+
+ a = ctx->superscalar[0].v[0];
+ b = ctx->superscalar[1].v[0];
+ c = ctx->superscalar[2].v[0];
+ d = ctx->superscalar[3].v[0];
+ a2 = ctx->superscalar[0].v[1];
+ b2 = ctx->superscalar[1].v[1];
+ c2 = ctx->superscalar[2].v[1];
+ d2 = ctx->superscalar[3].v[1];
+ a3 = ctx->superscalar[0].v[2];
+ b3 = ctx->superscalar[1].v[2];
+ c3 = ctx->superscalar[2].v[2];
+ d3 = ctx->superscalar[3].v[2];
+ a4 = ctx->superscalar[0].v[3];
+ b4 = ctx->superscalar[1].v[3];
+ c4 = ctx->superscalar[2].v[3];
+ d4 = ctx->superscalar[3].v[3];
+
+ for (; ip < ipend; ip += 4) {
+ a += BSWAP_32(ip[0]);
+ a2 += BSWAP_32(ip[1]);
+ a3 += BSWAP_32(ip[2]);
+ a4 += BSWAP_32(ip[3]);
+ b += a;
+ b2 += a2;
+ b3 += a3;
+ b4 += a4;
+ c += b;
+ c2 += b2;
+ c3 += b3;
+ c4 += b4;
+ d += c;
+ d2 += c2;
+ d3 += c3;
+ d4 += c4;
+ }
+
+ ctx->superscalar[0].v[0] = a;
+ ctx->superscalar[1].v[0] = b;
+ ctx->superscalar[2].v[0] = c;
+ ctx->superscalar[3].v[0] = d;
+ ctx->superscalar[0].v[1] = a2;
+ ctx->superscalar[1].v[1] = b2;
+ ctx->superscalar[2].v[1] = c2;
+ ctx->superscalar[3].v[1] = d2;
+ ctx->superscalar[0].v[2] = a3;
+ ctx->superscalar[1].v[2] = b3;
+ ctx->superscalar[2].v[2] = c3;
+ ctx->superscalar[3].v[2] = d3;
+ ctx->superscalar[0].v[3] = a4;
+ ctx->superscalar[1].v[3] = b4;
+ ctx->superscalar[2].v[3] = c4;
+ ctx->superscalar[3].v[3] = d4;
+}
+
+static boolean_t
+fletcher_4_superscalar4_valid(void)
+{
+ return (B_TRUE);
+}
+
+const fletcher_4_ops_t fletcher_4_superscalar4_ops = {
+ .init_native = fletcher_4_superscalar4_init,
+ .compute_native = fletcher_4_superscalar4_native,
+ .fini_native = fletcher_4_superscalar4_fini,
+ .init_byteswap = fletcher_4_superscalar4_init,
+ .compute_byteswap = fletcher_4_superscalar4_byteswap,
+ .fini_byteswap = fletcher_4_superscalar4_fini,
+ .valid = fletcher_4_superscalar4_valid,
+ .name = "superscalar4"
+};
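Editor's note, not part of the patch: the constants in fletcher_4_superscalar4_fini() fold the four striped lanes back into the result a plain serial Fletcher-4 pass would produce. Below is a minimal self-check sketch under that assumption, using only the types and macros this file already includes; the helper names are illustrative, and the buffer size is assumed to be a multiple of 16 bytes (the 4-way stride).

static void
fletcher_4_serial_reference(const void *buf, uint64_t size, zio_cksum_t *zcp)
{
	const uint32_t *ip = buf;
	const uint32_t *ipend = ip + (size / sizeof (uint32_t));
	uint64_t a = 0, b = 0, c = 0, d = 0;

	/* Classic serial Fletcher-4 over 32-bit native-endian words. */
	for (; ip < ipend; ip++) {
		a += ip[0];
		b += a;
		c += b;
		d += c;
	}
	ZIO_SET_CHECKSUM(zcp, a, b, c, d);
}

static boolean_t
fletcher_4_superscalar4_selfcheck(const void *buf, uint64_t size)
{
	fletcher_4_ctx_t ctx;
	zio_cksum_t got, want;

	/* Drive the 4-way implementation the way the framework would. */
	fletcher_4_superscalar4_init(&ctx);
	fletcher_4_superscalar4_native(&ctx, buf, size);
	fletcher_4_superscalar4_fini(&ctx, &got);

	fletcher_4_serial_reference(buf, size, &want);

	return (got.zc_word[0] == want.zc_word[0] &&
	    got.zc_word[1] == want.zc_word[1] &&
	    got.zc_word[2] == want.zc_word[2] &&
	    got.zc_word[3] == want.zc_word[3]);
}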
diff --git a/sys/contrib/openzfs/module/zcommon/zfs_namecheck.c b/sys/contrib/openzfs/module/zcommon/zfs_namecheck.c
new file mode 100644
index 000000000000..0011a971cacb
--- /dev/null
+++ b/sys/contrib/openzfs/module/zcommon/zfs_namecheck.c
@@ -0,0 +1,473 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * Copyright (c) 2013, 2016 by Delphix. All rights reserved.
+ */
+
+/*
+ * Common name validation routines for ZFS. These routines are shared by the
+ * userland code as well as the ioctl() layer to ensure that we don't
+ * inadvertently expose a hole through direct ioctl()s that never gets tested.
+ * In userland, however, we want significantly more information about _why_ the
+ * name is invalid. In the kernel, we only care whether it's valid or not.
+ * Each routine therefore takes a 'namecheck_err_t' which describes exactly why
+ * the name failed to validate.
+ */
+
+#if !defined(_KERNEL)
+#include <string.h>
+#endif
+
+#include <sys/dsl_dir.h>
+#include <sys/param.h>
+#include <sys/nvpair.h>
+#include "zfs_namecheck.h"
+#include "zfs_deleg.h"
+
+/*
+ * Deeply nested datasets can overflow the stack, so we put a limit
+ * in the amount of nesting a path can have. zfs_max_dataset_nesting
+ * can be tuned temporarily to fix existing datasets that exceed our
+ * predefined limit.
+ */
+int zfs_max_dataset_nesting = 50;
+
+static int
+valid_char(char c)
+{
+ return ((c >= 'a' && c <= 'z') ||
+ (c >= 'A' && c <= 'Z') ||
+ (c >= '0' && c <= '9') ||
+ c == '-' || c == '_' || c == '.' || c == ':' || c == ' ');
+}
+
+/*
+ * Looks at a path and returns its level of nesting (depth).
+ */
+int
+get_dataset_depth(const char *path)
+{
+ const char *loc = path;
+ int nesting = 0;
+
+ /*
+ * Keep track of nesting until you hit the end of the
+	 * path or find the snapshot/bookmark separator.
+ */
+ for (int i = 0; loc[i] != '\0' &&
+ loc[i] != '@' &&
+ loc[i] != '#'; i++) {
+ if (loc[i] == '/')
+ nesting++;
+ }
+
+ return (nesting);
+}
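Editor's note, not part of the patch: a few illustrative values, derived from the loop above (depth is simply the number of '/' separators before any '@' or '#').

/*
 * Illustrative results for get_dataset_depth():
 *   "tank"           -> 0
 *   "tank/a/b"       -> 2
 *   "tank/a/b@snap"  -> 2   ('@' ends the scan)
 *   "tank/a#bmark"   -> 1   ('#' ends the scan)
 */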
+
+/*
+ * Snapshot names must be made up of alphanumeric characters plus the following
+ * characters:
+ *
+ * [-_.: ]
+ *
+ * Returns 0 on success, -1 on error.
+ */
+int
+zfs_component_namecheck(const char *path, namecheck_err_t *why, char *what)
+{
+ const char *loc;
+
+ if (strlen(path) >= ZFS_MAX_DATASET_NAME_LEN) {
+ if (why)
+ *why = NAME_ERR_TOOLONG;
+ return (-1);
+ }
+
+ if (path[0] == '\0') {
+ if (why)
+ *why = NAME_ERR_EMPTY_COMPONENT;
+ return (-1);
+ }
+
+ for (loc = path; *loc; loc++) {
+ if (!valid_char(*loc)) {
+ if (why) {
+ *why = NAME_ERR_INVALCHAR;
+ *what = *loc;
+ }
+ return (-1);
+ }
+ }
+ return (0);
+}
+
+
+/*
+ * Permissions set name must start with the letter '@' followed by the
+ * same character restrictions as snapshot names, except that the name
+ * cannot exceed 64 characters.
+ *
+ * Returns 0 on success, -1 on error.
+ */
+int
+permset_namecheck(const char *path, namecheck_err_t *why, char *what)
+{
+ if (strlen(path) >= ZFS_PERMSET_MAXLEN) {
+ if (why)
+ *why = NAME_ERR_TOOLONG;
+ return (-1);
+ }
+
+ if (path[0] != '@') {
+ if (why) {
+ *why = NAME_ERR_NO_AT;
+ *what = path[0];
+ }
+ return (-1);
+ }
+
+ return (zfs_component_namecheck(&path[1], why, what));
+}
+
+/*
+ * Dataset paths should not be deeper than zfs_max_dataset_nesting
+ * in terms of nesting.
+ *
+ * Returns 0 on success, -1 on error.
+ */
+int
+dataset_nestcheck(const char *path)
+{
+ return ((get_dataset_depth(path) < zfs_max_dataset_nesting) ? 0 : -1);
+}
+
+/*
+ * Entity names must be of the following form:
+ *
+ * [component/]*[component][(@|#)component]?
+ *
+ * Where each component is made up of alphanumeric characters plus the following
+ * characters:
+ *
+ * [-_.: %]
+ *
+ * We allow '%' here as we use that character internally to create unique
+ * names for temporary clones (for online recv).
+ *
+ * Returns 0 on success, -1 on error.
+ */
+int
+entity_namecheck(const char *path, namecheck_err_t *why, char *what)
+{
+ const char *end;
+
+ EQUIV(why == NULL, what == NULL);
+
+ /*
+ * Make sure the name is not too long.
+ */
+ if (strlen(path) >= ZFS_MAX_DATASET_NAME_LEN) {
+ if (why)
+ *why = NAME_ERR_TOOLONG;
+ return (-1);
+ }
+
+ /* Explicitly check for a leading slash. */
+ if (path[0] == '/') {
+ if (why)
+ *why = NAME_ERR_LEADING_SLASH;
+ return (-1);
+ }
+
+ if (path[0] == '\0') {
+ if (why)
+ *why = NAME_ERR_EMPTY_COMPONENT;
+ return (-1);
+ }
+
+ const char *start = path;
+ boolean_t found_delim = B_FALSE;
+ for (;;) {
+ /* Find the end of this component */
+ end = start;
+ while (*end != '/' && *end != '@' && *end != '#' &&
+ *end != '\0')
+ end++;
+
+ if (*end == '\0' && end[-1] == '/') {
+ /* trailing slashes are not allowed */
+ if (why)
+ *why = NAME_ERR_TRAILING_SLASH;
+ return (-1);
+ }
+
+ /* Validate the contents of this component */
+ for (const char *loc = start; loc != end; loc++) {
+ if (!valid_char(*loc) && *loc != '%') {
+ if (why) {
+ *why = NAME_ERR_INVALCHAR;
+ *what = *loc;
+ }
+ return (-1);
+ }
+ }
+
+ if (*end == '\0' || *end == '/') {
+ int component_length = end - start;
+			/* Validate that this component is not '.' */
+ if (component_length == 1) {
+ if (start[0] == '.') {
+ if (why)
+ *why = NAME_ERR_SELF_REF;
+ return (-1);
+ }
+ }
+
+			/* Validate that this component is not '..' */
+ if (component_length == 2) {
+ if (start[0] == '.' && start[1] == '.') {
+ if (why)
+ *why = NAME_ERR_PARENT_REF;
+ return (-1);
+ }
+ }
+ }
+
+ /* Snapshot or bookmark delimiter found */
+ if (*end == '@' || *end == '#') {
+ /* Multiple delimiters are not allowed */
+ if (found_delim != 0) {
+ if (why)
+ *why = NAME_ERR_MULTIPLE_DELIMITERS;
+ return (-1);
+ }
+
+ found_delim = B_TRUE;
+ }
+
+ /* Zero-length components are not allowed */
+ if (start == end) {
+ if (why)
+ *why = NAME_ERR_EMPTY_COMPONENT;
+ return (-1);
+ }
+
+ /* If we've reached the end of the string, we're OK */
+ if (*end == '\0')
+ return (0);
+
+ /*
+ * If there is a '/' in a snapshot or bookmark name
+ * then report an error
+ */
+ if (*end == '/' && found_delim != 0) {
+ if (why)
+ *why = NAME_ERR_TRAILING_SLASH;
+ return (-1);
+ }
+
+ /* Update to the next component */
+ start = end + 1;
+ }
+}
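Editor's note, not part of the patch: a hedged sketch of how callers interrogate the 'why'/'what' out-parameters of entity_namecheck(); the wrapper name is hypothetical and the expected results follow from the checks above.

static void
entity_namecheck_example(void)
{
	namecheck_err_t why;
	char what;

	/* Accepted: two components and a single snapshot delimiter. */
	(void) entity_namecheck("tank/fs@snap", &why, &what); /* returns 0 */

	/* Rejected: empty component between the slashes. */
	if (entity_namecheck("tank//fs", &why, &what) != 0) {
		/* why == NAME_ERR_EMPTY_COMPONENT here */
	}

	/* Rejected: at most one '@' or '#' delimiter is allowed. */
	if (entity_namecheck("tank/fs@a@b", &why, &what) != 0) {
		/* why == NAME_ERR_MULTIPLE_DELIMITERS here */
	}
}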
+
+/*
+ * A dataset is any entity except a bookmark.
+ */
+int
+dataset_namecheck(const char *path, namecheck_err_t *why, char *what)
+{
+ int ret = entity_namecheck(path, why, what);
+
+ if (ret == 0 && strchr(path, '#') != NULL) {
+ if (why != NULL) {
+ *why = NAME_ERR_INVALCHAR;
+ *what = '#';
+ }
+ return (-1);
+ }
+
+ return (ret);
+}
+
+/*
+ * Assert path is a valid bookmark name
+ */
+int
+bookmark_namecheck(const char *path, namecheck_err_t *why, char *what)
+{
+ int ret = entity_namecheck(path, why, what);
+
+ if (ret == 0 && strchr(path, '#') == NULL) {
+ if (why != NULL) {
+ *why = NAME_ERR_NO_POUND;
+ *what = '#';
+ }
+ return (-1);
+ }
+
+ return (ret);
+}
+
+/*
+ * Assert path is a valid snapshot name
+ */
+int
+snapshot_namecheck(const char *path, namecheck_err_t *why, char *what)
+{
+ int ret = entity_namecheck(path, why, what);
+
+ if (ret == 0 && strchr(path, '@') == NULL) {
+ if (why != NULL) {
+ *why = NAME_ERR_NO_AT;
+ *what = '@';
+ }
+ return (-1);
+ }
+
+ return (ret);
+}
+
+/*
+ * Mountpoint names must be of the following form:
+ *
+ * /[component][/]*[component][/]
+ *
+ * Returns 0 on success, -1 on error.
+ */
+int
+mountpoint_namecheck(const char *path, namecheck_err_t *why)
+{
+ const char *start, *end;
+
+ /*
+ * Make sure none of the mountpoint component names are too long.
+ * If a component name is too long then the mkdir of the mountpoint
+ * will fail but then the mountpoint property will be set to a value
+ * that can never be mounted. Better to fail before setting the prop.
+ * Extra slashes are OK, they will be tossed by the mountpoint mkdir.
+ */
+
+ if (path == NULL || *path != '/') {
+ if (why)
+ *why = NAME_ERR_LEADING_SLASH;
+ return (-1);
+ }
+
+ /* Skip leading slash */
+ start = &path[1];
+ do {
+ end = start;
+ while (*end != '/' && *end != '\0')
+ end++;
+
+ if (end - start >= ZFS_MAX_DATASET_NAME_LEN) {
+ if (why)
+ *why = NAME_ERR_TOOLONG;
+ return (-1);
+ }
+ start = end + 1;
+
+ } while (*end != '\0');
+
+ return (0);
+}
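Editor's note, not part of the patch: only the leading slash and per-component length are enforced here, so for example:

/*
 * Illustrative results for mountpoint_namecheck():
 *   "/data/projects"   -> 0
 *   "/data//projects/" -> 0   (extra slashes are tolerated)
 *   "data/projects"    -> -1  (*why = NAME_ERR_LEADING_SLASH)
 *   a path with any component of ZFS_MAX_DATASET_NAME_LEN or more chars
 *                      -> -1  (*why = NAME_ERR_TOOLONG)
 */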
+
+/*
+ * For pool names, we have the same set of valid characters as for dataset
+ * names, with the additional restriction that the pool name must begin with
+ * a letter. The pool names 'mirror', 'raidz', and 'draid' are reserved and
+ * cannot be used, nor can names that look like disk devices (e.g. 'c0t0d0').
+ *
+ * Returns 0 on success, -1 on error.
+ */
+int
+pool_namecheck(const char *pool, namecheck_err_t *why, char *what)
+{
+ const char *c;
+
+ /*
+ * Make sure the name is not too long.
+	 * If we're creating a pool with version >= SPA_VERSION_DSL_SCRUB (v11),
+	 * we need to account for the additional space needed by the origin
+	 * dataset, which will also be snapshotted:
+	 * "poolname"+"/"+"$ORIGIN"+"@"+"$ORIGIN".
+	 * Play it safe and enforce this limit even if the pool version is < 11,
+	 * so it can be upgraded without issues.
+ */
+ if (strlen(pool) >= (ZFS_MAX_DATASET_NAME_LEN - 2 -
+ strlen(ORIGIN_DIR_NAME) * 2)) {
+ if (why)
+ *why = NAME_ERR_TOOLONG;
+ return (-1);
+ }
+
+ c = pool;
+ while (*c != '\0') {
+ if (!valid_char(*c)) {
+ if (why) {
+ *why = NAME_ERR_INVALCHAR;
+ *what = *c;
+ }
+ return (-1);
+ }
+ c++;
+ }
+
+ if (!(*pool >= 'a' && *pool <= 'z') &&
+ !(*pool >= 'A' && *pool <= 'Z')) {
+ if (why)
+ *why = NAME_ERR_NOLETTER;
+ return (-1);
+ }
+
+ if (strcmp(pool, "mirror") == 0 ||
+ strcmp(pool, "raidz") == 0 ||
+ strcmp(pool, "draid") == 0) {
+ if (why)
+ *why = NAME_ERR_RESERVED;
+ return (-1);
+ }
+
+ if (pool[0] == 'c' && (pool[1] >= '0' && pool[1] <= '9')) {
+ if (why)
+ *why = NAME_ERR_DISKLIKE;
+ return (-1);
+ }
+
+ return (0);
+}
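Editor's note, not part of the patch: a few illustrative inputs and results, derived from the checks in pool_namecheck() above.

/*
 * Illustrative results for pool_namecheck():
 *   "tank"    -> 0
 *   "2tank"   -> -1  (*why = NAME_ERR_NOLETTER, must start with a letter)
 *   "mirror"  -> -1  (*why = NAME_ERR_RESERVED)
 *   "c0t0d0"  -> -1  (*why = NAME_ERR_DISKLIKE)
 */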
+
+EXPORT_SYMBOL(entity_namecheck);
+EXPORT_SYMBOL(pool_namecheck);
+EXPORT_SYMBOL(dataset_namecheck);
+EXPORT_SYMBOL(bookmark_namecheck);
+EXPORT_SYMBOL(snapshot_namecheck);
+EXPORT_SYMBOL(zfs_component_namecheck);
+EXPORT_SYMBOL(dataset_nestcheck);
+EXPORT_SYMBOL(get_dataset_depth);
+EXPORT_SYMBOL(zfs_max_dataset_nesting);
+
+ZFS_MODULE_PARAM(zfs, zfs_, max_dataset_nesting, INT, ZMOD_RW,
+ "Limit to the amount of nesting a path can have. Defaults to 50.");
diff --git a/sys/contrib/openzfs/module/zcommon/zfs_prop.c b/sys/contrib/openzfs/module/zcommon/zfs_prop.c
new file mode 100644
index 000000000000..b78331187e13
--- /dev/null
+++ b/sys/contrib/openzfs/module/zcommon/zfs_prop.c
@@ -0,0 +1,1052 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
+ * Copyright 2016, Joyent, Inc.
+ * Copyright (c) 2019, Klara Inc.
+ * Copyright (c) 2019, Allan Jude
+ */
+
+/* Portions Copyright 2010 Robert Milkowski */
+
+#include <sys/zio.h>
+#include <sys/spa.h>
+#include <sys/u8_textprep.h>
+#include <sys/zfs_acl.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zfs_znode.h>
+#include <sys/dsl_crypt.h>
+
+#include "zfs_prop.h"
+#include "zfs_deleg.h"
+#include "zfs_fletcher.h"
+
+#if !defined(_KERNEL)
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+#endif
+
+static zprop_desc_t zfs_prop_table[ZFS_NUM_PROPS];
+
+/* Note: this is indexed by zfs_userquota_prop_t; keep the order the same. */
+const char *zfs_userquota_prop_prefixes[] = {
+ "userused@",
+ "userquota@",
+ "groupused@",
+ "groupquota@",
+ "userobjused@",
+ "userobjquota@",
+ "groupobjused@",
+ "groupobjquota@",
+ "projectused@",
+ "projectquota@",
+ "projectobjused@",
+ "projectobjquota@"
+};
+
+zprop_desc_t *
+zfs_prop_get_table(void)
+{
+ return (zfs_prop_table);
+}
+
+void
+zfs_prop_init(void)
+{
+ static zprop_index_t checksum_table[] = {
+ { "on", ZIO_CHECKSUM_ON },
+ { "off", ZIO_CHECKSUM_OFF },
+ { "fletcher2", ZIO_CHECKSUM_FLETCHER_2 },
+ { "fletcher4", ZIO_CHECKSUM_FLETCHER_4 },
+ { "sha256", ZIO_CHECKSUM_SHA256 },
+ { "noparity", ZIO_CHECKSUM_NOPARITY },
+ { "sha512", ZIO_CHECKSUM_SHA512 },
+ { "skein", ZIO_CHECKSUM_SKEIN },
+#if !defined(__FreeBSD__)
+ { "edonr", ZIO_CHECKSUM_EDONR },
+#endif
+ { NULL }
+ };
+
+ static zprop_index_t dedup_table[] = {
+ { "on", ZIO_CHECKSUM_ON },
+ { "off", ZIO_CHECKSUM_OFF },
+ { "verify", ZIO_CHECKSUM_ON | ZIO_CHECKSUM_VERIFY },
+ { "sha256", ZIO_CHECKSUM_SHA256 },
+ { "sha256,verify",
+ ZIO_CHECKSUM_SHA256 | ZIO_CHECKSUM_VERIFY },
+ { "sha512", ZIO_CHECKSUM_SHA512 },
+ { "sha512,verify",
+ ZIO_CHECKSUM_SHA512 | ZIO_CHECKSUM_VERIFY },
+ { "skein", ZIO_CHECKSUM_SKEIN },
+ { "skein,verify",
+ ZIO_CHECKSUM_SKEIN | ZIO_CHECKSUM_VERIFY },
+#if !defined(__FreeBSD__)
+ { "edonr,verify",
+ ZIO_CHECKSUM_EDONR | ZIO_CHECKSUM_VERIFY },
+#endif
+ { NULL }
+ };
+
+ static zprop_index_t compress_table[] = {
+ { "on", ZIO_COMPRESS_ON },
+ { "off", ZIO_COMPRESS_OFF },
+ { "lzjb", ZIO_COMPRESS_LZJB },
+ { "gzip", ZIO_COMPRESS_GZIP_6 }, /* gzip default */
+ { "gzip-1", ZIO_COMPRESS_GZIP_1 },
+ { "gzip-2", ZIO_COMPRESS_GZIP_2 },
+ { "gzip-3", ZIO_COMPRESS_GZIP_3 },
+ { "gzip-4", ZIO_COMPRESS_GZIP_4 },
+ { "gzip-5", ZIO_COMPRESS_GZIP_5 },
+ { "gzip-6", ZIO_COMPRESS_GZIP_6 },
+ { "gzip-7", ZIO_COMPRESS_GZIP_7 },
+ { "gzip-8", ZIO_COMPRESS_GZIP_8 },
+ { "gzip-9", ZIO_COMPRESS_GZIP_9 },
+ { "zle", ZIO_COMPRESS_ZLE },
+ { "lz4", ZIO_COMPRESS_LZ4 },
+ { "zstd", ZIO_COMPRESS_ZSTD },
+ { "zstd-fast",
+ ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_DEFAULT) },
+
+ /*
+ * ZSTD 1-19 are synthetic. We store the compression level in a
+ * separate hidden property to avoid wasting a large amount of
+ * space in the ZIO_COMPRESS enum.
+ *
+ * The compression level is also stored within the header of the
+ * compressed block since we may need it for later recompression
+ * to avoid checksum errors (L2ARC).
+ *
+		 * Note that the level here is encoded as a bit-shifted value
+		 * layered on top of the compression method.
+ */
+ { "zstd-1", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_1) },
+ { "zstd-2", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_2) },
+ { "zstd-3", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_3) },
+ { "zstd-4", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_4) },
+ { "zstd-5", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_5) },
+ { "zstd-6", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_6) },
+ { "zstd-7", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_7) },
+ { "zstd-8", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_8) },
+ { "zstd-9", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_9) },
+ { "zstd-10", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_10) },
+ { "zstd-11", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_11) },
+ { "zstd-12", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_12) },
+ { "zstd-13", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_13) },
+ { "zstd-14", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_14) },
+ { "zstd-15", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_15) },
+ { "zstd-16", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_16) },
+ { "zstd-17", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_17) },
+ { "zstd-18", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_18) },
+ { "zstd-19", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_19) },
+
+ /*
+ * The ZSTD-Fast levels are also synthetic.
+ */
+ { "zstd-fast-1",
+ ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_1) },
+ { "zstd-fast-2",
+ ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_2) },
+ { "zstd-fast-3",
+ ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_3) },
+ { "zstd-fast-4",
+ ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_4) },
+ { "zstd-fast-5",
+ ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_5) },
+ { "zstd-fast-6",
+ ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_6) },
+ { "zstd-fast-7",
+ ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_7) },
+ { "zstd-fast-8",
+ ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_8) },
+ { "zstd-fast-9",
+ ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_9) },
+ { "zstd-fast-10",
+ ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_10) },
+ { "zstd-fast-20",
+ ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_20) },
+ { "zstd-fast-30",
+ ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_30) },
+ { "zstd-fast-40",
+ ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_40) },
+ { "zstd-fast-50",
+ ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_50) },
+ { "zstd-fast-60",
+ ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_60) },
+ { "zstd-fast-70",
+ ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_70) },
+ { "zstd-fast-80",
+ ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_80) },
+ { "zstd-fast-90",
+ ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_90) },
+ { "zstd-fast-100",
+ ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_100) },
+ { "zstd-fast-500",
+ ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_500) },
+ { "zstd-fast-1000",
+ ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_1000) },
+ { NULL }
+ };
+
+ static zprop_index_t crypto_table[] = {
+ { "on", ZIO_CRYPT_ON },
+ { "off", ZIO_CRYPT_OFF },
+ { "aes-128-ccm", ZIO_CRYPT_AES_128_CCM },
+ { "aes-192-ccm", ZIO_CRYPT_AES_192_CCM },
+ { "aes-256-ccm", ZIO_CRYPT_AES_256_CCM },
+ { "aes-128-gcm", ZIO_CRYPT_AES_128_GCM },
+ { "aes-192-gcm", ZIO_CRYPT_AES_192_GCM },
+ { "aes-256-gcm", ZIO_CRYPT_AES_256_GCM },
+ { NULL }
+ };
+
+ static zprop_index_t keyformat_table[] = {
+ { "none", ZFS_KEYFORMAT_NONE },
+ { "raw", ZFS_KEYFORMAT_RAW },
+ { "hex", ZFS_KEYFORMAT_HEX },
+ { "passphrase", ZFS_KEYFORMAT_PASSPHRASE },
+ { NULL }
+ };
+
+ static zprop_index_t snapdir_table[] = {
+ { "hidden", ZFS_SNAPDIR_HIDDEN },
+ { "visible", ZFS_SNAPDIR_VISIBLE },
+ { NULL }
+ };
+
+ static zprop_index_t snapdev_table[] = {
+ { "hidden", ZFS_SNAPDEV_HIDDEN },
+ { "visible", ZFS_SNAPDEV_VISIBLE },
+ { NULL }
+ };
+
+ static zprop_index_t acl_mode_table[] = {
+ { "discard", ZFS_ACL_DISCARD },
+ { "groupmask", ZFS_ACL_GROUPMASK },
+ { "passthrough", ZFS_ACL_PASSTHROUGH },
+ { "restricted", ZFS_ACL_RESTRICTED },
+ { NULL }
+ };
+
+ static zprop_index_t acltype_table[] = {
+ { "off", ZFS_ACLTYPE_OFF },
+ { "posix", ZFS_ACLTYPE_POSIX },
+ { "nfsv4", ZFS_ACLTYPE_NFSV4 },
+ { "disabled", ZFS_ACLTYPE_OFF }, /* bkwrd compatibility */
+ { "noacl", ZFS_ACLTYPE_OFF }, /* bkwrd compatibility */
+ { "posixacl", ZFS_ACLTYPE_POSIX }, /* bkwrd compatibility */
+ { NULL }
+ };
+
+ static zprop_index_t acl_inherit_table[] = {
+ { "discard", ZFS_ACL_DISCARD },
+ { "noallow", ZFS_ACL_NOALLOW },
+ { "restricted", ZFS_ACL_RESTRICTED },
+ { "passthrough", ZFS_ACL_PASSTHROUGH },
+ { "secure", ZFS_ACL_RESTRICTED }, /* bkwrd compatibility */
+ { "passthrough-x", ZFS_ACL_PASSTHROUGH_X },
+ { NULL }
+ };
+
+ static zprop_index_t case_table[] = {
+ { "sensitive", ZFS_CASE_SENSITIVE },
+ { "insensitive", ZFS_CASE_INSENSITIVE },
+ { "mixed", ZFS_CASE_MIXED },
+ { NULL }
+ };
+
+ static zprop_index_t copies_table[] = {
+ { "1", 1 },
+ { "2", 2 },
+ { "3", 3 },
+ { NULL }
+ };
+
+ /*
+ * Use the unique flags we have to send to u8_strcmp() and/or
+ * u8_textprep() to represent the various normalization property
+ * values.
+ */
+ static zprop_index_t normalize_table[] = {
+ { "none", 0 },
+ { "formD", U8_TEXTPREP_NFD },
+ { "formKC", U8_TEXTPREP_NFKC },
+ { "formC", U8_TEXTPREP_NFC },
+ { "formKD", U8_TEXTPREP_NFKD },
+ { NULL }
+ };
+
+ static zprop_index_t version_table[] = {
+ { "1", 1 },
+ { "2", 2 },
+ { "3", 3 },
+ { "4", 4 },
+ { "5", 5 },
+ { "current", ZPL_VERSION },
+ { NULL }
+ };
+
+ static zprop_index_t boolean_table[] = {
+ { "off", 0 },
+ { "on", 1 },
+ { NULL }
+ };
+
+ static zprop_index_t keystatus_table[] = {
+ { "none", ZFS_KEYSTATUS_NONE},
+ { "unavailable", ZFS_KEYSTATUS_UNAVAILABLE},
+ { "available", ZFS_KEYSTATUS_AVAILABLE},
+ { NULL }
+ };
+
+ static zprop_index_t logbias_table[] = {
+ { "latency", ZFS_LOGBIAS_LATENCY },
+ { "throughput", ZFS_LOGBIAS_THROUGHPUT },
+ { NULL }
+ };
+
+ static zprop_index_t canmount_table[] = {
+ { "off", ZFS_CANMOUNT_OFF },
+ { "on", ZFS_CANMOUNT_ON },
+ { "noauto", ZFS_CANMOUNT_NOAUTO },
+ { NULL }
+ };
+
+ static zprop_index_t cache_table[] = {
+ { "none", ZFS_CACHE_NONE },
+ { "metadata", ZFS_CACHE_METADATA },
+ { "all", ZFS_CACHE_ALL },
+ { NULL }
+ };
+
+ static zprop_index_t sync_table[] = {
+ { "standard", ZFS_SYNC_STANDARD },
+ { "always", ZFS_SYNC_ALWAYS },
+ { "disabled", ZFS_SYNC_DISABLED },
+ { NULL }
+ };
+
+ static zprop_index_t xattr_table[] = {
+ { "off", ZFS_XATTR_OFF },
+ { "on", ZFS_XATTR_DIR },
+ { "sa", ZFS_XATTR_SA },
+ { "dir", ZFS_XATTR_DIR },
+ { NULL }
+ };
+
+ static zprop_index_t dnsize_table[] = {
+ { "legacy", ZFS_DNSIZE_LEGACY },
+ { "auto", ZFS_DNSIZE_AUTO },
+ { "1k", ZFS_DNSIZE_1K },
+ { "2k", ZFS_DNSIZE_2K },
+ { "4k", ZFS_DNSIZE_4K },
+ { "8k", ZFS_DNSIZE_8K },
+ { "16k", ZFS_DNSIZE_16K },
+ { NULL }
+ };
+
+ static zprop_index_t redundant_metadata_table[] = {
+ { "all", ZFS_REDUNDANT_METADATA_ALL },
+ { "most", ZFS_REDUNDANT_METADATA_MOST },
+ { NULL }
+ };
+
+ static zprop_index_t volmode_table[] = {
+ { "default", ZFS_VOLMODE_DEFAULT },
+ { "full", ZFS_VOLMODE_GEOM },
+ { "geom", ZFS_VOLMODE_GEOM },
+ { "dev", ZFS_VOLMODE_DEV },
+ { "none", ZFS_VOLMODE_NONE },
+ { NULL }
+ };
+
+ /* inherit index properties */
+ zprop_register_index(ZFS_PROP_REDUNDANT_METADATA, "redundant_metadata",
+ ZFS_REDUNDANT_METADATA_ALL,
+ PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
+ "all | most", "REDUND_MD",
+ redundant_metadata_table);
+ zprop_register_index(ZFS_PROP_SYNC, "sync", ZFS_SYNC_STANDARD,
+ PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
+ "standard | always | disabled", "SYNC",
+ sync_table);
+ zprop_register_index(ZFS_PROP_CHECKSUM, "checksum",
+ ZIO_CHECKSUM_DEFAULT, PROP_INHERIT, ZFS_TYPE_FILESYSTEM |
+ ZFS_TYPE_VOLUME,
+#if !defined(__FreeBSD__)
+ "on | off | fletcher2 | fletcher4 | sha256 | sha512 | skein"
+ " | edonr",
+#else
+ "on | off | fletcher2 | fletcher4 | sha256 | sha512 | skein",
+#endif
+ "CHECKSUM", checksum_table);
+ zprop_register_index(ZFS_PROP_DEDUP, "dedup", ZIO_CHECKSUM_OFF,
+ PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
+ "on | off | verify | sha256[,verify] | sha512[,verify] | "
+#if !defined(__FreeBSD__)
+ "skein[,verify] | edonr,verify",
+#else
+ "skein[,verify]",
+#endif
+ "DEDUP", dedup_table);
+ zprop_register_index(ZFS_PROP_COMPRESSION, "compression",
+ ZIO_COMPRESS_DEFAULT, PROP_INHERIT,
+ ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
+ "on | off | lzjb | gzip | gzip-[1-9] | zle | lz4 | "
+ "zstd | zstd-[1-19] | "
+ "zstd-fast-[1-10,20,30,40,50,60,70,80,90,100,500,1000]",
+ "COMPRESS", compress_table);
+ zprop_register_index(ZFS_PROP_SNAPDIR, "snapdir", ZFS_SNAPDIR_HIDDEN,
+ PROP_INHERIT, ZFS_TYPE_FILESYSTEM,
+ "hidden | visible", "SNAPDIR", snapdir_table);
+ zprop_register_index(ZFS_PROP_SNAPDEV, "snapdev", ZFS_SNAPDEV_HIDDEN,
+ PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
+ "hidden | visible", "SNAPDEV", snapdev_table);
+ zprop_register_index(ZFS_PROP_ACLMODE, "aclmode", ZFS_ACL_DISCARD,
+ PROP_INHERIT, ZFS_TYPE_FILESYSTEM,
+ "discard | groupmask | passthrough | restricted", "ACLMODE",
+ acl_mode_table);
+ zprop_register_index(ZFS_PROP_ACLTYPE, "acltype",
+#ifdef __linux__
+ /* Linux doesn't natively support ZFS's NFSv4-style ACLs. */
+ ZFS_ACLTYPE_OFF,
+#else
+ ZFS_ACLTYPE_NFSV4,
+#endif
+ PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT,
+ "off | nfsv4 | posix", "ACLTYPE", acltype_table);
+ zprop_register_index(ZFS_PROP_ACLINHERIT, "aclinherit",
+ ZFS_ACL_RESTRICTED, PROP_INHERIT, ZFS_TYPE_FILESYSTEM,
+ "discard | noallow | restricted | passthrough | passthrough-x",
+ "ACLINHERIT", acl_inherit_table);
+ zprop_register_index(ZFS_PROP_COPIES, "copies", 1, PROP_INHERIT,
+ ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
+ "1 | 2 | 3", "COPIES", copies_table);
+ zprop_register_index(ZFS_PROP_PRIMARYCACHE, "primarycache",
+ ZFS_CACHE_ALL, PROP_INHERIT,
+ ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT | ZFS_TYPE_VOLUME,
+ "all | none | metadata", "PRIMARYCACHE", cache_table);
+ zprop_register_index(ZFS_PROP_SECONDARYCACHE, "secondarycache",
+ ZFS_CACHE_ALL, PROP_INHERIT,
+ ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT | ZFS_TYPE_VOLUME,
+ "all | none | metadata", "SECONDARYCACHE", cache_table);
+ zprop_register_index(ZFS_PROP_LOGBIAS, "logbias", ZFS_LOGBIAS_LATENCY,
+ PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
+ "latency | throughput", "LOGBIAS", logbias_table);
+ zprop_register_index(ZFS_PROP_XATTR, "xattr", ZFS_XATTR_DIR,
+ PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT,
+ "on | off | dir | sa", "XATTR", xattr_table);
+ zprop_register_index(ZFS_PROP_DNODESIZE, "dnodesize",
+ ZFS_DNSIZE_LEGACY, PROP_INHERIT, ZFS_TYPE_FILESYSTEM,
+ "legacy | auto | 1k | 2k | 4k | 8k | 16k", "DNSIZE", dnsize_table);
+ zprop_register_index(ZFS_PROP_VOLMODE, "volmode",
+ ZFS_VOLMODE_DEFAULT, PROP_INHERIT,
+ ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
+ "default | full | geom | dev | none", "VOLMODE", volmode_table);
+
+ /* inherit index (boolean) properties */
+ zprop_register_index(ZFS_PROP_ATIME, "atime", 1, PROP_INHERIT,
+ ZFS_TYPE_FILESYSTEM, "on | off", "ATIME", boolean_table);
+ zprop_register_index(ZFS_PROP_RELATIME, "relatime", 0, PROP_INHERIT,
+ ZFS_TYPE_FILESYSTEM, "on | off", "RELATIME", boolean_table);
+ zprop_register_index(ZFS_PROP_DEVICES, "devices", 1, PROP_INHERIT,
+ ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "DEVICES",
+ boolean_table);
+ zprop_register_index(ZFS_PROP_EXEC, "exec", 1, PROP_INHERIT,
+ ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "EXEC",
+ boolean_table);
+ zprop_register_index(ZFS_PROP_SETUID, "setuid", 1, PROP_INHERIT,
+ ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "SETUID",
+ boolean_table);
+ zprop_register_index(ZFS_PROP_READONLY, "readonly", 0, PROP_INHERIT,
+ ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "on | off", "RDONLY",
+ boolean_table);
+#ifdef __FreeBSD__
+ zprop_register_index(ZFS_PROP_ZONED, "jailed", 0, PROP_INHERIT,
+ ZFS_TYPE_FILESYSTEM, "on | off", "JAILED", boolean_table);
+#else
+ zprop_register_index(ZFS_PROP_ZONED, "zoned", 0, PROP_INHERIT,
+ ZFS_TYPE_FILESYSTEM, "on | off", "ZONED", boolean_table);
+#endif
+ zprop_register_index(ZFS_PROP_VSCAN, "vscan", 0, PROP_INHERIT,
+ ZFS_TYPE_FILESYSTEM, "on | off", "VSCAN", boolean_table);
+ zprop_register_index(ZFS_PROP_NBMAND, "nbmand", 0, PROP_INHERIT,
+ ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "NBMAND",
+ boolean_table);
+ zprop_register_index(ZFS_PROP_OVERLAY, "overlay", 1, PROP_INHERIT,
+ ZFS_TYPE_FILESYSTEM, "on | off", "OVERLAY", boolean_table);
+
+ /* default index properties */
+ zprop_register_index(ZFS_PROP_VERSION, "version", 0, PROP_DEFAULT,
+ ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT,
+ "1 | 2 | 3 | 4 | 5 | current", "VERSION", version_table);
+ zprop_register_index(ZFS_PROP_CANMOUNT, "canmount", ZFS_CANMOUNT_ON,
+ PROP_DEFAULT, ZFS_TYPE_FILESYSTEM, "on | off | noauto",
+ "CANMOUNT", canmount_table);
+
+ /* readonly index properties */
+ zprop_register_index(ZFS_PROP_MOUNTED, "mounted", 0, PROP_READONLY,
+ ZFS_TYPE_FILESYSTEM, "yes | no", "MOUNTED", boolean_table);
+ zprop_register_index(ZFS_PROP_DEFER_DESTROY, "defer_destroy", 0,
+ PROP_READONLY, ZFS_TYPE_SNAPSHOT, "yes | no", "DEFER_DESTROY",
+ boolean_table);
+ zprop_register_index(ZFS_PROP_KEYSTATUS, "keystatus",
+ ZFS_KEYSTATUS_NONE, PROP_READONLY, ZFS_TYPE_DATASET,
+ "none | unavailable | available",
+ "KEYSTATUS", keystatus_table);
+
+ /* set once index properties */
+ zprop_register_index(ZFS_PROP_NORMALIZE, "normalization", 0,
+ PROP_ONETIME, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT,
+ "none | formC | formD | formKC | formKD", "NORMALIZATION",
+ normalize_table);
+ zprop_register_index(ZFS_PROP_CASE, "casesensitivity",
+ ZFS_CASE_SENSITIVE, PROP_ONETIME, ZFS_TYPE_FILESYSTEM |
+ ZFS_TYPE_SNAPSHOT,
+ "sensitive | insensitive | mixed", "CASE", case_table);
+ zprop_register_index(ZFS_PROP_KEYFORMAT, "keyformat",
+ ZFS_KEYFORMAT_NONE, PROP_ONETIME_DEFAULT,
+ ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
+ "none | raw | hex | passphrase", "KEYFORMAT", keyformat_table);
+ zprop_register_index(ZFS_PROP_ENCRYPTION, "encryption",
+ ZIO_CRYPT_DEFAULT, PROP_ONETIME, ZFS_TYPE_DATASET,
+ "on | off | aes-128-ccm | aes-192-ccm | aes-256-ccm | "
+ "aes-128-gcm | aes-192-gcm | aes-256-gcm", "ENCRYPTION",
+ crypto_table);
+
+ /* set once index (boolean) properties */
+ zprop_register_index(ZFS_PROP_UTF8ONLY, "utf8only", 0, PROP_ONETIME,
+ ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT,
+ "on | off", "UTF8ONLY", boolean_table);
+
+ /* string properties */
+ zprop_register_string(ZFS_PROP_ORIGIN, "origin", NULL, PROP_READONLY,
+ ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<snapshot>", "ORIGIN");
+ zprop_register_string(ZFS_PROP_CLONES, "clones", NULL, PROP_READONLY,
+ ZFS_TYPE_SNAPSHOT, "<dataset>[,...]", "CLONES");
+ zprop_register_string(ZFS_PROP_MOUNTPOINT, "mountpoint", "/",
+ PROP_INHERIT, ZFS_TYPE_FILESYSTEM, "<path> | legacy | none",
+ "MOUNTPOINT");
+ zprop_register_string(ZFS_PROP_SHARENFS, "sharenfs", "off",
+ PROP_INHERIT, ZFS_TYPE_FILESYSTEM, "on | off | NFS share options",
+ "SHARENFS");
+ zprop_register_string(ZFS_PROP_TYPE, "type", NULL, PROP_READONLY,
+ ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK,
+ "filesystem | volume | snapshot | bookmark", "TYPE");
+ zprop_register_string(ZFS_PROP_SHARESMB, "sharesmb", "off",
+ PROP_INHERIT, ZFS_TYPE_FILESYSTEM,
+ "on | off | SMB share options", "SHARESMB");
+ zprop_register_string(ZFS_PROP_MLSLABEL, "mlslabel",
+ ZFS_MLSLABEL_DEFAULT, PROP_INHERIT, ZFS_TYPE_DATASET,
+ "<sensitivity label>", "MLSLABEL");
+ zprop_register_string(ZFS_PROP_SELINUX_CONTEXT, "context",
+ "none", PROP_DEFAULT, ZFS_TYPE_DATASET, "<selinux context>",
+ "CONTEXT");
+ zprop_register_string(ZFS_PROP_SELINUX_FSCONTEXT, "fscontext",
+ "none", PROP_DEFAULT, ZFS_TYPE_DATASET, "<selinux fscontext>",
+ "FSCONTEXT");
+ zprop_register_string(ZFS_PROP_SELINUX_DEFCONTEXT, "defcontext",
+ "none", PROP_DEFAULT, ZFS_TYPE_DATASET, "<selinux defcontext>",
+ "DEFCONTEXT");
+ zprop_register_string(ZFS_PROP_SELINUX_ROOTCONTEXT, "rootcontext",
+ "none", PROP_DEFAULT, ZFS_TYPE_DATASET, "<selinux rootcontext>",
+ "ROOTCONTEXT");
+ zprop_register_string(ZFS_PROP_RECEIVE_RESUME_TOKEN,
+ "receive_resume_token",
+ NULL, PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
+ "<string token>", "RESUMETOK");
+ zprop_register_string(ZFS_PROP_ENCRYPTION_ROOT, "encryptionroot", NULL,
+ PROP_READONLY, ZFS_TYPE_DATASET, "<filesystem | volume>",
+ "ENCROOT");
+ zprop_register_string(ZFS_PROP_KEYLOCATION, "keylocation",
+ "none", PROP_DEFAULT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
+ "prompt | <file URI>", "KEYLOCATION");
+ zprop_register_string(ZFS_PROP_REDACT_SNAPS,
+ "redact_snaps", NULL, PROP_READONLY,
+ ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK, "<snapshot>[,...]",
+ "RSNAPS");
+
+ /* readonly number properties */
+ zprop_register_number(ZFS_PROP_USED, "used", 0, PROP_READONLY,
+ ZFS_TYPE_DATASET, "<size>", "USED");
+ zprop_register_number(ZFS_PROP_AVAILABLE, "available", 0, PROP_READONLY,
+ ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<size>", "AVAIL");
+ zprop_register_number(ZFS_PROP_REFERENCED, "referenced", 0,
+ PROP_READONLY, ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK, "<size>",
+ "REFER");
+ zprop_register_number(ZFS_PROP_COMPRESSRATIO, "compressratio", 0,
+ PROP_READONLY, ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK,
+ "<1.00x or higher if compressed>", "RATIO");
+ zprop_register_number(ZFS_PROP_REFRATIO, "refcompressratio", 0,
+ PROP_READONLY, ZFS_TYPE_DATASET,
+ "<1.00x or higher if compressed>", "REFRATIO");
+ zprop_register_number(ZFS_PROP_VOLBLOCKSIZE, "volblocksize",
+ ZVOL_DEFAULT_BLOCKSIZE, PROP_ONETIME,
+ ZFS_TYPE_VOLUME, "512 to 128k, power of 2", "VOLBLOCK");
+ zprop_register_number(ZFS_PROP_USEDSNAP, "usedbysnapshots", 0,
+ PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<size>",
+ "USEDSNAP");
+ zprop_register_number(ZFS_PROP_USEDDS, "usedbydataset", 0,
+ PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<size>",
+ "USEDDS");
+ zprop_register_number(ZFS_PROP_USEDCHILD, "usedbychildren", 0,
+ PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<size>",
+ "USEDCHILD");
+ zprop_register_number(ZFS_PROP_USEDREFRESERV, "usedbyrefreservation", 0,
+ PROP_READONLY,
+ ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<size>", "USEDREFRESERV");
+ zprop_register_number(ZFS_PROP_USERREFS, "userrefs", 0, PROP_READONLY,
+ ZFS_TYPE_SNAPSHOT, "<count>", "USERREFS");
+ zprop_register_number(ZFS_PROP_WRITTEN, "written", 0, PROP_READONLY,
+ ZFS_TYPE_DATASET, "<size>", "WRITTEN");
+ zprop_register_number(ZFS_PROP_LOGICALUSED, "logicalused", 0,
+ PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "<size>",
+ "LUSED");
+ zprop_register_number(ZFS_PROP_LOGICALREFERENCED, "logicalreferenced",
+ 0, PROP_READONLY, ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK, "<size>",
+ "LREFER");
+ zprop_register_number(ZFS_PROP_FILESYSTEM_COUNT, "filesystem_count",
+ UINT64_MAX, PROP_READONLY, ZFS_TYPE_FILESYSTEM,
+ "<count>", "FSCOUNT");
+ zprop_register_number(ZFS_PROP_SNAPSHOT_COUNT, "snapshot_count",
+ UINT64_MAX, PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
+ "<count>", "SSCOUNT");
+ zprop_register_number(ZFS_PROP_GUID, "guid", 0, PROP_READONLY,
+ ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK, "<uint64>", "GUID");
+ zprop_register_number(ZFS_PROP_CREATETXG, "createtxg", 0, PROP_READONLY,
+ ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK, "<uint64>", "CREATETXG");
+ zprop_register_number(ZFS_PROP_PBKDF2_ITERS, "pbkdf2iters",
+ 0, PROP_ONETIME_DEFAULT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
+ "<iters>", "PBKDF2ITERS");
+ zprop_register_number(ZFS_PROP_OBJSETID, "objsetid", 0,
+ PROP_READONLY, ZFS_TYPE_DATASET, "<uint64>", "OBJSETID");
+
+ /* default number properties */
+ zprop_register_number(ZFS_PROP_QUOTA, "quota", 0, PROP_DEFAULT,
+ ZFS_TYPE_FILESYSTEM, "<size> | none", "QUOTA");
+ zprop_register_number(ZFS_PROP_RESERVATION, "reservation", 0,
+ PROP_DEFAULT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
+ "<size> | none", "RESERV");
+ zprop_register_number(ZFS_PROP_VOLSIZE, "volsize", 0, PROP_DEFAULT,
+ ZFS_TYPE_SNAPSHOT | ZFS_TYPE_VOLUME, "<size>", "VOLSIZE");
+ zprop_register_number(ZFS_PROP_REFQUOTA, "refquota", 0, PROP_DEFAULT,
+ ZFS_TYPE_FILESYSTEM, "<size> | none", "REFQUOTA");
+ zprop_register_number(ZFS_PROP_REFRESERVATION, "refreservation", 0,
+ PROP_DEFAULT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
+ "<size> | none", "REFRESERV");
+ zprop_register_number(ZFS_PROP_FILESYSTEM_LIMIT, "filesystem_limit",
+ UINT64_MAX, PROP_DEFAULT, ZFS_TYPE_FILESYSTEM,
+ "<count> | none", "FSLIMIT");
+ zprop_register_number(ZFS_PROP_SNAPSHOT_LIMIT, "snapshot_limit",
+ UINT64_MAX, PROP_DEFAULT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
+ "<count> | none", "SSLIMIT");
+
+ /* inherit number properties */
+ zprop_register_number(ZFS_PROP_RECORDSIZE, "recordsize",
+ SPA_OLD_MAXBLOCKSIZE, PROP_INHERIT,
+ ZFS_TYPE_FILESYSTEM, "512 to 1M, power of 2", "RECSIZE");
+ zprop_register_number(ZFS_PROP_SPECIAL_SMALL_BLOCKS,
+ "special_small_blocks", 0, PROP_INHERIT, ZFS_TYPE_FILESYSTEM,
+ "zero or 512 to 1M, power of 2", "SPECIAL_SMALL_BLOCKS");
+
+ /* hidden properties */
+ zprop_register_hidden(ZFS_PROP_NUMCLONES, "numclones", PROP_TYPE_NUMBER,
+ PROP_READONLY, ZFS_TYPE_SNAPSHOT, "NUMCLONES");
+ zprop_register_hidden(ZFS_PROP_NAME, "name", PROP_TYPE_STRING,
+ PROP_READONLY, ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK, "NAME");
+ zprop_register_hidden(ZFS_PROP_ISCSIOPTIONS, "iscsioptions",
+ PROP_TYPE_STRING, PROP_INHERIT, ZFS_TYPE_VOLUME, "ISCSIOPTIONS");
+ zprop_register_hidden(ZFS_PROP_STMF_SHAREINFO, "stmf_sbd_lu",
+ PROP_TYPE_STRING, PROP_INHERIT, ZFS_TYPE_VOLUME,
+ "STMF_SBD_LU");
+ zprop_register_hidden(ZFS_PROP_USERACCOUNTING, "useraccounting",
+ PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_DATASET,
+ "USERACCOUNTING");
+ zprop_register_hidden(ZFS_PROP_UNIQUE, "unique", PROP_TYPE_NUMBER,
+ PROP_READONLY, ZFS_TYPE_DATASET, "UNIQUE");
+ zprop_register_hidden(ZFS_PROP_INCONSISTENT, "inconsistent",
+ PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_DATASET, "INCONSISTENT");
+ zprop_register_hidden(ZFS_PROP_IVSET_GUID, "ivsetguid",
+ PROP_TYPE_NUMBER, PROP_READONLY,
+ ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK, "IVSETGUID");
+ zprop_register_hidden(ZFS_PROP_PREV_SNAP, "prevsnap", PROP_TYPE_STRING,
+ PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "PREVSNAP");
+ zprop_register_hidden(ZFS_PROP_PBKDF2_SALT, "pbkdf2salt",
+ PROP_TYPE_NUMBER, PROP_ONETIME_DEFAULT,
+ ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "PBKDF2SALT");
+ zprop_register_hidden(ZFS_PROP_KEY_GUID, "keyguid", PROP_TYPE_NUMBER,
+ PROP_READONLY, ZFS_TYPE_DATASET, "KEYGUID");
+ zprop_register_hidden(ZFS_PROP_REDACTED, "redacted", PROP_TYPE_NUMBER,
+ PROP_READONLY, ZFS_TYPE_DATASET, "REDACTED");
+
+ /*
+ * Properties that are obsolete and not used. These are retained so
+ * that we don't have to change the values of the zfs_prop_t enum, or
+ * have NULL pointers in the zfs_prop_table[].
+ */
+ zprop_register_hidden(ZFS_PROP_REMAPTXG, "remaptxg", PROP_TYPE_NUMBER,
+ PROP_READONLY, ZFS_TYPE_DATASET, "REMAPTXG");
+
+ /* oddball properties */
+ zprop_register_impl(ZFS_PROP_CREATION, "creation", PROP_TYPE_NUMBER, 0,
+ NULL, PROP_READONLY, ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK,
+ "<date>", "CREATION", B_FALSE, B_TRUE, NULL);
+}
+
+boolean_t
+zfs_prop_delegatable(zfs_prop_t prop)
+{
+ zprop_desc_t *pd = &zfs_prop_table[prop];
+
+ /* The mlslabel property is never delegatable. */
+ if (prop == ZFS_PROP_MLSLABEL)
+ return (B_FALSE);
+
+ return (pd->pd_attr != PROP_READONLY);
+}
+
+/*
+ * Given a zfs dataset property name, returns the corresponding property ID.
+ */
+zfs_prop_t
+zfs_name_to_prop(const char *propname)
+{
+ return (zprop_name_to_prop(propname, ZFS_TYPE_DATASET));
+}
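Editor's note, not part of the patch: an illustrative round trip between property names and IDs, assuming zfs_prop_init() has populated the table and the usual ZPROP_INVAL sentinel for unknown names.

/*
 * Illustrative results:
 *   zfs_name_to_prop("compression")        == ZFS_PROP_COMPRESSION
 *   zfs_prop_to_name(ZFS_PROP_COMPRESSION) returns "compression"
 *   zfs_name_to_prop("no-such-property")   == ZPROP_INVAL
 */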
+
+/*
+ * For user property names, we allow all lowercase alphanumeric characters, plus
+ * a few useful punctuation characters.
+ */
+static int
+valid_char(char c)
+{
+ return ((c >= 'a' && c <= 'z') ||
+ (c >= '0' && c <= '9') ||
+ c == '-' || c == '_' || c == '.' || c == ':');
+}
+
+/*
+ * Returns true if this is a valid user-defined property (one with a ':').
+ */
+boolean_t
+zfs_prop_user(const char *name)
+{
+ int i;
+ char c;
+ boolean_t foundsep = B_FALSE;
+
+ for (i = 0; i < strlen(name); i++) {
+ c = name[i];
+ if (!valid_char(c))
+ return (B_FALSE);
+ if (c == ':')
+ foundsep = B_TRUE;
+ }
+
+ if (!foundsep)
+ return (B_FALSE);
+
+ return (B_TRUE);
+}
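Editor's note, not part of the patch: a few illustrative inputs and results, derived from the character set and ':' requirement above.

/*
 * Illustrative results for zfs_prop_user():
 *   "com.example:backup-policy"  -> B_TRUE   (lowercase, contains ':')
 *   "compression"                -> B_FALSE  (no ':' separator)
 *   "Com.Example:backup"         -> B_FALSE  (uppercase is rejected)
 */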
+
+/*
+ * Returns true if this is a valid userspace-type property (one with a '@').
+ * Note that after the '@', any character is valid (e.g. another '@' for a
+ * SID such as user@domain).
+ */
+boolean_t
+zfs_prop_userquota(const char *name)
+{
+ zfs_userquota_prop_t prop;
+
+ for (prop = 0; prop < ZFS_NUM_USERQUOTA_PROPS; prop++) {
+ if (strncmp(name, zfs_userquota_prop_prefixes[prop],
+ strlen(zfs_userquota_prop_prefixes[prop])) == 0) {
+ return (B_TRUE);
+ }
+ }
+
+ return (B_FALSE);
+}
+
+/*
+ * Returns true if this is a valid written@ property.
+ * Note that after the '@', any character is valid (e.g. another '@', as in
+ * written@pool/fs@origin).
+ */
+boolean_t
+zfs_prop_written(const char *name)
+{
+ static const char *prop_prefix = "written@";
+ static const char *book_prefix = "written#";
+ return (strncmp(name, prop_prefix, strlen(prop_prefix)) == 0 ||
+ strncmp(name, book_prefix, strlen(book_prefix)) == 0);
+}
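Editor's note, not part of the patch: a few illustrative inputs for the two prefix-based checks above.

/*
 * Illustrative results:
 *   zfs_prop_userquota("userquota@alice")         -> B_TRUE
 *   zfs_prop_userquota("projectobjused@100")      -> B_TRUE
 *   zfs_prop_userquota("quota")                   -> B_FALSE
 *   zfs_prop_written("written@pool/fs@yesterday") -> B_TRUE
 *   zfs_prop_written("written")                   -> B_FALSE
 */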
+
+/*
+ * Tables of index types, plus functions to convert between the user view
+ * (strings) and internal representation (uint64_t).
+ */
+int
+zfs_prop_string_to_index(zfs_prop_t prop, const char *string, uint64_t *index)
+{
+ return (zprop_string_to_index(prop, string, index, ZFS_TYPE_DATASET));
+}
+
+int
+zfs_prop_index_to_string(zfs_prop_t prop, uint64_t index, const char **string)
+{
+ return (zprop_index_to_string(prop, index, string, ZFS_TYPE_DATASET));
+}
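Editor's note, not part of the patch: a hedged sketch of the string/index conversion pair in use, assuming zfs_prop_init() has already run; the wrapper name and variables are illustrative only.

static void
zfs_prop_index_example(void)
{
	uint64_t ival;
	const char *sval;

	/* "lz4" maps to its index value for the compression property. */
	if (zfs_prop_string_to_index(ZFS_PROP_COMPRESSION, "lz4",
	    &ival) == 0) {
		/* ... and the index maps back to the same string. */
		(void) zfs_prop_index_to_string(ZFS_PROP_COMPRESSION,
		    ival, &sval);
	}
}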
+
+uint64_t
+zfs_prop_random_value(zfs_prop_t prop, uint64_t seed)
+{
+ return (zprop_random_value(prop, seed, ZFS_TYPE_DATASET));
+}
+
+/*
+ * Returns TRUE if the property applies to any of the given dataset types.
+ */
+boolean_t
+zfs_prop_valid_for_type(int prop, zfs_type_t types, boolean_t headcheck)
+{
+ return (zprop_valid_for_type(prop, types, headcheck));
+}
+
+zprop_type_t
+zfs_prop_get_type(zfs_prop_t prop)
+{
+ return (zfs_prop_table[prop].pd_proptype);
+}
+
+/*
+ * Returns TRUE if the property is readonly.
+ */
+boolean_t
+zfs_prop_readonly(zfs_prop_t prop)
+{
+ return (zfs_prop_table[prop].pd_attr == PROP_READONLY ||
+ zfs_prop_table[prop].pd_attr == PROP_ONETIME ||
+ zfs_prop_table[prop].pd_attr == PROP_ONETIME_DEFAULT);
+}
+
+/*
+ * Returns TRUE if the property is visible (not hidden).
+ */
+boolean_t
+zfs_prop_visible(zfs_prop_t prop)
+{
+ return (zfs_prop_table[prop].pd_visible &&
+ zfs_prop_table[prop].pd_zfs_mod_supported);
+}
+
+/*
+ * Returns TRUE if the property is only allowed to be set once.
+ */
+boolean_t
+zfs_prop_setonce(zfs_prop_t prop)
+{
+ return (zfs_prop_table[prop].pd_attr == PROP_ONETIME ||
+ zfs_prop_table[prop].pd_attr == PROP_ONETIME_DEFAULT);
+}
+
+const char *
+zfs_prop_default_string(zfs_prop_t prop)
+{
+ return (zfs_prop_table[prop].pd_strdefault);
+}
+
+uint64_t
+zfs_prop_default_numeric(zfs_prop_t prop)
+{
+ return (zfs_prop_table[prop].pd_numdefault);
+}
+
+/*
+ * Given a dataset property ID, returns the corresponding name.
+ * The property ID is assumed to be valid.
+ */
+const char *
+zfs_prop_to_name(zfs_prop_t prop)
+{
+ return (zfs_prop_table[prop].pd_name);
+}
+
+/*
+ * Returns TRUE if the property is inheritable.
+ */
+boolean_t
+zfs_prop_inheritable(zfs_prop_t prop)
+{
+ return (zfs_prop_table[prop].pd_attr == PROP_INHERIT ||
+ zfs_prop_table[prop].pd_attr == PROP_ONETIME);
+}
+
+/*
+ * Returns TRUE if property is one of the encryption properties that requires
+ * a loaded encryption key to modify.
+ */
+boolean_t
+zfs_prop_encryption_key_param(zfs_prop_t prop)
+{
+ /*
+ * keylocation does not count as an encryption property. It can be
+ * changed at will without needing the master keys.
+ */
+ return (prop == ZFS_PROP_PBKDF2_SALT || prop == ZFS_PROP_PBKDF2_ITERS ||
+ prop == ZFS_PROP_KEYFORMAT);
+}
+
+/*
+ * Helper function used by both kernelspace and userspace to check the
+ * keylocation property. If encrypted is set, the keylocation must be valid
+ * for an encrypted dataset.
+ */
+boolean_t
+zfs_prop_valid_keylocation(const char *str, boolean_t encrypted)
+{
+ if (strcmp("none", str) == 0)
+ return (!encrypted);
+ else if (strcmp("prompt", str) == 0)
+ return (B_TRUE);
+ else if (strlen(str) > 8 && strncmp("file:///", str, 8) == 0)
+ return (B_TRUE);
+
+ return (B_FALSE);
+}
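Editor's note, not part of the patch: a few illustrative (str, encrypted) inputs and results, derived from the branches above.

/*
 * Illustrative results for zfs_prop_valid_keylocation():
 *   ("prompt", B_TRUE)                  -> B_TRUE
 *   ("file:///etc/zfs/key", B_TRUE)     -> B_TRUE
 *   ("none", B_TRUE)                    -> B_FALSE (encrypted needs a key)
 *   ("none", B_FALSE)                   -> B_TRUE
 *   ("https://example.com/key", B_TRUE) -> B_FALSE (only file:/// URIs here)
 */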
+
+
+#ifndef _KERNEL
+#include <libzfs.h>
+
+/*
+ * Returns a string describing the set of acceptable values for the given
+ * zfs property, or NULL if it cannot be set.
+ */
+const char *
+zfs_prop_values(zfs_prop_t prop)
+{
+ return (zfs_prop_table[prop].pd_values);
+}
+
+/*
+ * Returns TRUE if this property is a string type. Note that index types
+ * (compression, checksum) are treated as strings in userland, even though they
+ * are stored numerically on disk.
+ */
+int
+zfs_prop_is_string(zfs_prop_t prop)
+{
+ return (zfs_prop_table[prop].pd_proptype == PROP_TYPE_STRING ||
+ zfs_prop_table[prop].pd_proptype == PROP_TYPE_INDEX);
+}
+
+/*
+ * Returns the column header for the given property. Used only in
+ * 'zfs list -o', but centralized here with the other property information.
+ */
+const char *
+zfs_prop_column_name(zfs_prop_t prop)
+{
+ return (zfs_prop_table[prop].pd_colname);
+}
+
+/*
+ * Returns whether the given property should be displayed right-justified for
+ * 'zfs list'.
+ */
+boolean_t
+zfs_prop_align_right(zfs_prop_t prop)
+{
+ return (zfs_prop_table[prop].pd_rightalign);
+}
+
+#endif
+
+#if defined(_KERNEL)
+
+#include <sys/simd.h>
+
+#if defined(HAVE_KERNEL_FPU_INTERNAL)
+union fpregs_state **zfs_kfpu_fpregs;
+EXPORT_SYMBOL(zfs_kfpu_fpregs);
+#endif /* HAVE_KERNEL_FPU_INTERNAL */
+
+static int __init
+zcommon_init(void)
+{
+ int error = kfpu_init();
+ if (error)
+ return (error);
+
+ fletcher_4_init();
+
+ return (0);
+}
+
+static void __exit
+zcommon_fini(void)
+{
+ fletcher_4_fini();
+ kfpu_fini();
+}
+
+module_init_early(zcommon_init);
+module_exit(zcommon_fini);
+
+#endif
+
+ZFS_MODULE_DESCRIPTION("Generic ZFS support");
+ZFS_MODULE_AUTHOR(ZFS_META_AUTHOR);
+ZFS_MODULE_LICENSE(ZFS_META_LICENSE);
+ZFS_MODULE_VERSION(ZFS_META_VERSION "-" ZFS_META_RELEASE);
+
+/* zfs dataset property functions */
+EXPORT_SYMBOL(zfs_userquota_prop_prefixes);
+EXPORT_SYMBOL(zfs_prop_init);
+EXPORT_SYMBOL(zfs_prop_get_type);
+EXPORT_SYMBOL(zfs_prop_get_table);
+EXPORT_SYMBOL(zfs_prop_delegatable);
+EXPORT_SYMBOL(zfs_prop_visible);
+
+/* Dataset property functions shared between libzfs and kernel. */
+EXPORT_SYMBOL(zfs_prop_default_string);
+EXPORT_SYMBOL(zfs_prop_default_numeric);
+EXPORT_SYMBOL(zfs_prop_readonly);
+EXPORT_SYMBOL(zfs_prop_inheritable);
+EXPORT_SYMBOL(zfs_prop_encryption_key_param);
+EXPORT_SYMBOL(zfs_prop_valid_keylocation);
+EXPORT_SYMBOL(zfs_prop_setonce);
+EXPORT_SYMBOL(zfs_prop_to_name);
+EXPORT_SYMBOL(zfs_name_to_prop);
+EXPORT_SYMBOL(zfs_prop_user);
+EXPORT_SYMBOL(zfs_prop_userquota);
+EXPORT_SYMBOL(zfs_prop_index_to_string);
+EXPORT_SYMBOL(zfs_prop_string_to_index);
+EXPORT_SYMBOL(zfs_prop_valid_for_type);
+EXPORT_SYMBOL(zfs_prop_written);
diff --git a/sys/contrib/openzfs/module/zcommon/zpool_prop.c b/sys/contrib/openzfs/module/zcommon/zpool_prop.c
new file mode 100644
index 000000000000..6299d371f25d
--- /dev/null
+++ b/sys/contrib/openzfs/module/zcommon/zpool_prop.c
@@ -0,0 +1,279 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2021, Colm Buckley <colm@tuatha.org>
+ */
+
+#include <sys/zio.h>
+#include <sys/spa.h>
+#include <sys/zfs_acl.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/fs/zfs.h>
+
+#include "zfs_prop.h"
+
+#if !defined(_KERNEL)
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+#endif
+
+static zprop_desc_t zpool_prop_table[ZPOOL_NUM_PROPS];
+
+zprop_desc_t *
+zpool_prop_get_table(void)
+{
+ return (zpool_prop_table);
+}
+
+void
+zpool_prop_init(void)
+{
+ static zprop_index_t boolean_table[] = {
+ { "off", 0},
+ { "on", 1},
+ { NULL }
+ };
+
+ static zprop_index_t failuremode_table[] = {
+ { "wait", ZIO_FAILURE_MODE_WAIT },
+ { "continue", ZIO_FAILURE_MODE_CONTINUE },
+ { "panic", ZIO_FAILURE_MODE_PANIC },
+ { NULL }
+ };
+
+ /* string properties */
+ zprop_register_string(ZPOOL_PROP_ALTROOT, "altroot", NULL, PROP_DEFAULT,
+ ZFS_TYPE_POOL, "<path>", "ALTROOT");
+ zprop_register_string(ZPOOL_PROP_BOOTFS, "bootfs", NULL, PROP_DEFAULT,
+ ZFS_TYPE_POOL, "<filesystem>", "BOOTFS");
+ zprop_register_string(ZPOOL_PROP_CACHEFILE, "cachefile", NULL,
+ PROP_DEFAULT, ZFS_TYPE_POOL, "<file> | none", "CACHEFILE");
+ zprop_register_string(ZPOOL_PROP_COMMENT, "comment", NULL,
+ PROP_DEFAULT, ZFS_TYPE_POOL, "<comment-string>", "COMMENT");
+ zprop_register_string(ZPOOL_PROP_COMPATIBILITY, "compatibility",
+ "off", PROP_DEFAULT, ZFS_TYPE_POOL,
+ "<file[,file...]> | off | legacy", "COMPATIBILITY");
+
+ /* readonly number properties */
+ zprop_register_number(ZPOOL_PROP_SIZE, "size", 0, PROP_READONLY,
+ ZFS_TYPE_POOL, "<size>", "SIZE");
+ zprop_register_number(ZPOOL_PROP_FREE, "free", 0, PROP_READONLY,
+ ZFS_TYPE_POOL, "<size>", "FREE");
+ zprop_register_number(ZPOOL_PROP_FREEING, "freeing", 0, PROP_READONLY,
+ ZFS_TYPE_POOL, "<size>", "FREEING");
+ zprop_register_number(ZPOOL_PROP_CHECKPOINT, "checkpoint", 0,
+ PROP_READONLY, ZFS_TYPE_POOL, "<size>", "CKPOINT");
+ zprop_register_number(ZPOOL_PROP_LEAKED, "leaked", 0, PROP_READONLY,
+ ZFS_TYPE_POOL, "<size>", "LEAKED");
+ zprop_register_number(ZPOOL_PROP_ALLOCATED, "allocated", 0,
+ PROP_READONLY, ZFS_TYPE_POOL, "<size>", "ALLOC");
+ zprop_register_number(ZPOOL_PROP_EXPANDSZ, "expandsize", 0,
+ PROP_READONLY, ZFS_TYPE_POOL, "<size>", "EXPANDSZ");
+ zprop_register_number(ZPOOL_PROP_FRAGMENTATION, "fragmentation", 0,
+ PROP_READONLY, ZFS_TYPE_POOL, "<percent>", "FRAG");
+ zprop_register_number(ZPOOL_PROP_CAPACITY, "capacity", 0, PROP_READONLY,
+ ZFS_TYPE_POOL, "<size>", "CAP");
+ zprop_register_number(ZPOOL_PROP_GUID, "guid", 0, PROP_READONLY,
+ ZFS_TYPE_POOL, "<guid>", "GUID");
+ zprop_register_number(ZPOOL_PROP_LOAD_GUID, "load_guid", 0,
+ PROP_READONLY, ZFS_TYPE_POOL, "<load_guid>", "LOAD_GUID");
+ zprop_register_number(ZPOOL_PROP_HEALTH, "health", 0, PROP_READONLY,
+ ZFS_TYPE_POOL, "<state>", "HEALTH");
+ zprop_register_number(ZPOOL_PROP_DEDUPRATIO, "dedupratio", 0,
+ PROP_READONLY, ZFS_TYPE_POOL, "<1.00x or higher if deduped>",
+ "DEDUP");
+
+ /* default number properties */
+ zprop_register_number(ZPOOL_PROP_VERSION, "version", SPA_VERSION,
+ PROP_DEFAULT, ZFS_TYPE_POOL, "<version>", "VERSION");
+ zprop_register_number(ZPOOL_PROP_ASHIFT, "ashift", 0, PROP_DEFAULT,
+ ZFS_TYPE_POOL, "<ashift, 9-16, or 0=default>", "ASHIFT");
+
+ /* default index (boolean) properties */
+ zprop_register_index(ZPOOL_PROP_DELEGATION, "delegation", 1,
+ PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", "DELEGATION",
+ boolean_table);
+ zprop_register_index(ZPOOL_PROP_AUTOREPLACE, "autoreplace", 0,
+ PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", "REPLACE", boolean_table);
+ zprop_register_index(ZPOOL_PROP_LISTSNAPS, "listsnapshots", 0,
+ PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", "LISTSNAPS",
+ boolean_table);
+ zprop_register_index(ZPOOL_PROP_AUTOEXPAND, "autoexpand", 0,
+ PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", "EXPAND", boolean_table);
+ zprop_register_index(ZPOOL_PROP_READONLY, "readonly", 0,
+ PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", "RDONLY", boolean_table);
+ zprop_register_index(ZPOOL_PROP_MULTIHOST, "multihost", 0,
+ PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", "MULTIHOST",
+ boolean_table);
+
+ /* default index properties */
+ zprop_register_index(ZPOOL_PROP_FAILUREMODE, "failmode",
+ ZIO_FAILURE_MODE_WAIT, PROP_DEFAULT, ZFS_TYPE_POOL,
+ "wait | continue | panic", "FAILMODE", failuremode_table);
+ zprop_register_index(ZPOOL_PROP_AUTOTRIM, "autotrim",
+ SPA_AUTOTRIM_DEFAULT, PROP_DEFAULT, ZFS_TYPE_POOL,
+ "on | off", "AUTOTRIM", boolean_table);
+
+ /* hidden properties */
+ zprop_register_hidden(ZPOOL_PROP_NAME, "name", PROP_TYPE_STRING,
+ PROP_READONLY, ZFS_TYPE_POOL, "NAME");
+ zprop_register_hidden(ZPOOL_PROP_MAXBLOCKSIZE, "maxblocksize",
+ PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_POOL, "MAXBLOCKSIZE");
+ zprop_register_hidden(ZPOOL_PROP_TNAME, "tname", PROP_TYPE_STRING,
+ PROP_ONETIME, ZFS_TYPE_POOL, "TNAME");
+ zprop_register_hidden(ZPOOL_PROP_MAXDNODESIZE, "maxdnodesize",
+ PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_POOL, "MAXDNODESIZE");
+ zprop_register_hidden(ZPOOL_PROP_DEDUPDITTO, "dedupditto",
+ PROP_TYPE_NUMBER, PROP_DEFAULT, ZFS_TYPE_POOL, "DEDUPDITTO");
+}
+
+/*
+ * Given a pool property name, returns the corresponding property ID.
+ */
+zpool_prop_t
+zpool_name_to_prop(const char *propname)
+{
+ return (zprop_name_to_prop(propname, ZFS_TYPE_POOL));
+}
+
+/*
+ * Given a pool property ID, returns the corresponding name.
+ * The property ID is assumed to be valid.
+ */
+const char *
+zpool_prop_to_name(zpool_prop_t prop)
+{
+ return (zpool_prop_table[prop].pd_name);
+}
+
+zprop_type_t
+zpool_prop_get_type(zpool_prop_t prop)
+{
+ return (zpool_prop_table[prop].pd_proptype);
+}
+
+boolean_t
+zpool_prop_readonly(zpool_prop_t prop)
+{
+ return (zpool_prop_table[prop].pd_attr == PROP_READONLY);
+}
+
+boolean_t
+zpool_prop_setonce(zpool_prop_t prop)
+{
+ return (zpool_prop_table[prop].pd_attr == PROP_ONETIME);
+}
+
+const char *
+zpool_prop_default_string(zpool_prop_t prop)
+{
+ return (zpool_prop_table[prop].pd_strdefault);
+}
+
+uint64_t
+zpool_prop_default_numeric(zpool_prop_t prop)
+{
+ return (zpool_prop_table[prop].pd_numdefault);
+}
+
+/*
+ * Returns true if this is a valid feature@ property.
+ */
+boolean_t
+zpool_prop_feature(const char *name)
+{
+ static const char *prefix = "feature@";
+ return (strncmp(name, prefix, strlen(prefix)) == 0);
+}
+
+/*
+ * Returns true if this is a valid unsupported@ property.
+ */
+boolean_t
+zpool_prop_unsupported(const char *name)
+{
+ static const char *prefix = "unsupported@";
+ return (strncmp(name, prefix, strlen(prefix)) == 0);
+}
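Editor's note, not part of the patch: a few illustrative inputs for the two prefix checks above; "async_destroy" is used only as an example feature name.

/*
 * Illustrative results:
 *   zpool_prop_feature("feature@async_destroy")       -> B_TRUE
 *   zpool_prop_unsupported("unsupported@com.foo:bar") -> B_TRUE
 *   zpool_prop_feature("ashift")                      -> B_FALSE
 */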
+
+int
+zpool_prop_string_to_index(zpool_prop_t prop, const char *string,
+ uint64_t *index)
+{
+ return (zprop_string_to_index(prop, string, index, ZFS_TYPE_POOL));
+}
+
+int
+zpool_prop_index_to_string(zpool_prop_t prop, uint64_t index,
+ const char **string)
+{
+ return (zprop_index_to_string(prop, index, string, ZFS_TYPE_POOL));
+}
+
+uint64_t
+zpool_prop_random_value(zpool_prop_t prop, uint64_t seed)
+{
+ return (zprop_random_value(prop, seed, ZFS_TYPE_POOL));
+}
+
+#ifndef _KERNEL
+#include <libzfs.h>
+
+const char *
+zpool_prop_values(zpool_prop_t prop)
+{
+ return (zpool_prop_table[prop].pd_values);
+}
+
+const char *
+zpool_prop_column_name(zpool_prop_t prop)
+{
+ return (zpool_prop_table[prop].pd_colname);
+}
+
+boolean_t
+zpool_prop_align_right(zpool_prop_t prop)
+{
+ return (zpool_prop_table[prop].pd_rightalign);
+}
+#endif
+
+#if defined(_KERNEL)
+/* zpool property functions */
+EXPORT_SYMBOL(zpool_prop_init);
+EXPORT_SYMBOL(zpool_prop_get_type);
+EXPORT_SYMBOL(zpool_prop_get_table);
+
+/* Pool property functions shared between libzfs and kernel. */
+EXPORT_SYMBOL(zpool_name_to_prop);
+EXPORT_SYMBOL(zpool_prop_to_name);
+EXPORT_SYMBOL(zpool_prop_default_string);
+EXPORT_SYMBOL(zpool_prop_default_numeric);
+EXPORT_SYMBOL(zpool_prop_readonly);
+EXPORT_SYMBOL(zpool_prop_feature);
+EXPORT_SYMBOL(zpool_prop_unsupported);
+EXPORT_SYMBOL(zpool_prop_index_to_string);
+EXPORT_SYMBOL(zpool_prop_string_to_index);
+#endif
diff --git a/sys/contrib/openzfs/module/zcommon/zprop_common.c b/sys/contrib/openzfs/module/zcommon/zprop_common.c
new file mode 100644
index 000000000000..faab9d9a74fd
--- /dev/null
+++ b/sys/contrib/openzfs/module/zcommon/zprop_common.c
@@ -0,0 +1,480 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * Copyright (c) 2012 by Delphix. All rights reserved.
+ */
+
+/*
+ * Common routines used by zfs and zpool property management.
+ */
+
+#include <sys/zio.h>
+#include <sys/spa.h>
+#include <sys/zfs_acl.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zfs_sysfs.h>
+#include <sys/zfs_znode.h>
+#include <sys/fs/zfs.h>
+
+#include "zfs_prop.h"
+#include "zfs_deleg.h"
+
+#if !defined(_KERNEL)
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+#include <sys/stat.h>
+#endif
+
+static zprop_desc_t *
+zprop_get_proptable(zfs_type_t type)
+{
+ if (type == ZFS_TYPE_POOL)
+ return (zpool_prop_get_table());
+ else
+ return (zfs_prop_get_table());
+}
+
+static int
+zprop_get_numprops(zfs_type_t type)
+{
+ if (type == ZFS_TYPE_POOL)
+ return (ZPOOL_NUM_PROPS);
+ else
+ return (ZFS_NUM_PROPS);
+}
+
+static boolean_t
+zfs_mod_supported_prop(const char *name, zfs_type_t type)
+{
+/*
+ * The zfs module spa_feature_table[], whether in-kernel or in libzpool,
+ * always supports all the properties. libzfs needs to query the running
+ * module, via sysfs, to determine which properties are supported.
+ *
+ * The equivalent _can_ be done on FreeBSD by way of the sysctl
+ * tree, but this has not been done yet.
+ */
+#if defined(_KERNEL) || defined(LIB_ZPOOL_BUILD) || defined(__FreeBSD__)
+ return (B_TRUE);
+#else
+ return (zfs_mod_supported(type == ZFS_TYPE_POOL ?
+ ZFS_SYSFS_POOL_PROPERTIES : ZFS_SYSFS_DATASET_PROPERTIES, name));
+#endif
+}
+
+void
+zprop_register_impl(int prop, const char *name, zprop_type_t type,
+ uint64_t numdefault, const char *strdefault, zprop_attr_t attr,
+ int objset_types, const char *values, const char *colname,
+ boolean_t rightalign, boolean_t visible, const zprop_index_t *idx_tbl)
+{
+ zprop_desc_t *prop_tbl = zprop_get_proptable(objset_types);
+ zprop_desc_t *pd;
+
+ pd = &prop_tbl[prop];
+
+ ASSERT(pd->pd_name == NULL || pd->pd_name == name);
+ ASSERT(name != NULL);
+ ASSERT(colname != NULL);
+
+ pd->pd_name = name;
+ pd->pd_propnum = prop;
+ pd->pd_proptype = type;
+ pd->pd_numdefault = numdefault;
+ pd->pd_strdefault = strdefault;
+ pd->pd_attr = attr;
+ pd->pd_types = objset_types;
+ pd->pd_values = values;
+ pd->pd_colname = colname;
+ pd->pd_rightalign = rightalign;
+ pd->pd_visible = visible;
+ pd->pd_zfs_mod_supported = zfs_mod_supported_prop(name, objset_types);
+ pd->pd_table = idx_tbl;
+ pd->pd_table_size = 0;
+ while (idx_tbl && (idx_tbl++)->pi_name != NULL)
+ pd->pd_table_size++;
+}
+
+void
+zprop_register_string(int prop, const char *name, const char *def,
+ zprop_attr_t attr, int objset_types, const char *values,
+ const char *colname)
+{
+ zprop_register_impl(prop, name, PROP_TYPE_STRING, 0, def, attr,
+ objset_types, values, colname, B_FALSE, B_TRUE, NULL);
+
+}
+
+void
+zprop_register_number(int prop, const char *name, uint64_t def,
+ zprop_attr_t attr, int objset_types, const char *values,
+ const char *colname)
+{
+ zprop_register_impl(prop, name, PROP_TYPE_NUMBER, def, NULL, attr,
+ objset_types, values, colname, B_TRUE, B_TRUE, NULL);
+}
+
+void
+zprop_register_index(int prop, const char *name, uint64_t def,
+ zprop_attr_t attr, int objset_types, const char *values,
+ const char *colname, const zprop_index_t *idx_tbl)
+{
+ zprop_register_impl(prop, name, PROP_TYPE_INDEX, def, NULL, attr,
+ objset_types, values, colname, B_FALSE, B_TRUE, idx_tbl);
+}
+
+void
+zprop_register_hidden(int prop, const char *name, zprop_type_t type,
+ zprop_attr_t attr, int objset_types, const char *colname)
+{
+ zprop_register_impl(prop, name, type, 0, NULL, attr,
+ objset_types, NULL, colname,
+ type == PROP_TYPE_NUMBER, B_FALSE, NULL);
+}
+
+
+/*
+ * A comparison function we can use to order indexes into property tables.
+ */
+static int
+zprop_compare(const void *arg1, const void *arg2)
+{
+ const zprop_desc_t *p1 = *((zprop_desc_t **)arg1);
+ const zprop_desc_t *p2 = *((zprop_desc_t **)arg2);
+ boolean_t p1ro, p2ro;
+
+ p1ro = (p1->pd_attr == PROP_READONLY);
+ p2ro = (p2->pd_attr == PROP_READONLY);
+
+ if (p1ro == p2ro)
+ return (strcmp(p1->pd_name, p2->pd_name));
+
+ return (p1ro ? -1 : 1);
+}
+
+/*
+ * Iterate over all properties in the given property table, calling back
+ * into the specified function for each property. We will continue to
+ * iterate until we either reach the end or the callback function returns
+ * something other than ZPROP_CONT.
+ */
+int
+zprop_iter_common(zprop_func func, void *cb, boolean_t show_all,
+ boolean_t ordered, zfs_type_t type)
+{
+ int i, num_props, size, prop;
+ zprop_desc_t *prop_tbl;
+ zprop_desc_t **order;
+
+ prop_tbl = zprop_get_proptable(type);
+ num_props = zprop_get_numprops(type);
+ size = num_props * sizeof (zprop_desc_t *);
+
+#if defined(_KERNEL)
+ order = kmem_alloc(size, KM_SLEEP);
+#else
+ if ((order = malloc(size)) == NULL)
+ return (ZPROP_CONT);
+#endif
+
+ for (int j = 0; j < num_props; j++)
+ order[j] = &prop_tbl[j];
+
+ if (ordered) {
+ qsort((void *)order, num_props, sizeof (zprop_desc_t *),
+ zprop_compare);
+ }
+
+ prop = ZPROP_CONT;
+ for (i = 0; i < num_props; i++) {
+ if ((order[i]->pd_visible || show_all) &&
+ order[i]->pd_zfs_mod_supported &&
+ (func(order[i]->pd_propnum, cb) != ZPROP_CONT)) {
+ prop = order[i]->pd_propnum;
+ break;
+ }
+ }
+
+#if defined(_KERNEL)
+ kmem_free(order, size);
+#else
+ free(order);
+#endif
+ return (prop);
+}
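+
+/*
+ * A hypothetical callback illustrating the protocol above: returning
+ * anything other than ZPROP_CONT stops the walk, and zprop_iter_common()
+ * then returns the current property number.
+ *
+ *	static int
+ *	first_readonly_cb(int prop, void *cb)
+ *	{
+ *		zprop_desc_t *tbl = cb;
+ *		return (tbl[prop].pd_attr == PROP_READONLY ?
+ *		    prop : ZPROP_CONT);
+ *	}
+ */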
+
+static boolean_t
+propname_match(const char *p, size_t len, zprop_desc_t *prop_entry)
+{
+ const char *propname = prop_entry->pd_name;
+#ifndef _KERNEL
+ const char *colname = prop_entry->pd_colname;
+ int c;
+#endif
+
+ if (len == strlen(propname) &&
+ strncmp(p, propname, len) == 0)
+ return (B_TRUE);
+
+#ifndef _KERNEL
+ if (colname == NULL || len != strlen(colname))
+ return (B_FALSE);
+
+ for (c = 0; c < len; c++)
+ if (p[c] != tolower(colname[c]))
+ break;
+
+ return (colname[c] == '\0');
+#else
+ return (B_FALSE);
+#endif
+}
+
+typedef struct name_to_prop_cb {
+ const char *propname;
+ zprop_desc_t *prop_tbl;
+} name_to_prop_cb_t;
+
+static int
+zprop_name_to_prop_cb(int prop, void *cb_data)
+{
+ name_to_prop_cb_t *data = cb_data;
+
+ if (propname_match(data->propname, strlen(data->propname),
+ &data->prop_tbl[prop]))
+ return (prop);
+
+ return (ZPROP_CONT);
+}
+
+int
+zprop_name_to_prop(const char *propname, zfs_type_t type)
+{
+ int prop;
+ name_to_prop_cb_t cb_data;
+
+ cb_data.propname = propname;
+ cb_data.prop_tbl = zprop_get_proptable(type);
+
+ prop = zprop_iter_common(zprop_name_to_prop_cb, &cb_data,
+ B_TRUE, B_FALSE, type);
+
+ return (prop == ZPROP_CONT ? ZPROP_INVAL : prop);
+}
+
+int
+zprop_string_to_index(int prop, const char *string, uint64_t *index,
+ zfs_type_t type)
+{
+ zprop_desc_t *prop_tbl;
+ const zprop_index_t *idx_tbl;
+ int i;
+
+ if (prop == ZPROP_INVAL || prop == ZPROP_CONT)
+ return (-1);
+
+ ASSERT(prop < zprop_get_numprops(type));
+ prop_tbl = zprop_get_proptable(type);
+ if ((idx_tbl = prop_tbl[prop].pd_table) == NULL)
+ return (-1);
+
+ for (i = 0; idx_tbl[i].pi_name != NULL; i++) {
+ if (strcmp(string, idx_tbl[i].pi_name) == 0) {
+ *index = idx_tbl[i].pi_value;
+ return (0);
+ }
+ }
+
+ return (-1);
+}
+
+int
+zprop_index_to_string(int prop, uint64_t index, const char **string,
+ zfs_type_t type)
+{
+ zprop_desc_t *prop_tbl;
+ const zprop_index_t *idx_tbl;
+ int i;
+
+ if (prop == ZPROP_INVAL || prop == ZPROP_CONT)
+ return (-1);
+
+ ASSERT(prop < zprop_get_numprops(type));
+ prop_tbl = zprop_get_proptable(type);
+ if ((idx_tbl = prop_tbl[prop].pd_table) == NULL)
+ return (-1);
+
+ for (i = 0; idx_tbl[i].pi_name != NULL; i++) {
+ if (idx_tbl[i].pi_value == index) {
+ *string = idx_tbl[i].pi_name;
+ return (0);
+ }
+ }
+
+ return (-1);
+}
+
+/*
+ * Return a random valid property value. Used by ztest.
+ */
+uint64_t
+zprop_random_value(int prop, uint64_t seed, zfs_type_t type)
+{
+ zprop_desc_t *prop_tbl;
+ const zprop_index_t *idx_tbl;
+
+ ASSERT((uint_t)prop < zprop_get_numprops(type));
+ prop_tbl = zprop_get_proptable(type);
+ idx_tbl = prop_tbl[prop].pd_table;
+
+ if (idx_tbl == NULL)
+ return (seed);
+
+ return (idx_tbl[seed % prop_tbl[prop].pd_table_size].pi_value);
+}
+
+const char *
+zprop_values(int prop, zfs_type_t type)
+{
+ zprop_desc_t *prop_tbl;
+
+ ASSERT(prop != ZPROP_INVAL && prop != ZPROP_CONT);
+ ASSERT(prop < zprop_get_numprops(type));
+
+ prop_tbl = zprop_get_proptable(type);
+
+ return (prop_tbl[prop].pd_values);
+}
+
+/*
+ * Returns TRUE if the property applies to any of the given dataset types.
+ *
+ * If headcheck is set, the check is made against the head dataset type
+ * of a snapshot, which requires returning B_TRUE when the property is
+ * valid only for snapshots.
+ */
+boolean_t
+zprop_valid_for_type(int prop, zfs_type_t type, boolean_t headcheck)
+{
+ zprop_desc_t *prop_tbl;
+
+ if (prop == ZPROP_INVAL || prop == ZPROP_CONT)
+ return (B_FALSE);
+
+ ASSERT(prop < zprop_get_numprops(type));
+ prop_tbl = zprop_get_proptable(type);
+ if (headcheck && prop_tbl[prop].pd_types == ZFS_TYPE_SNAPSHOT)
+ return (B_TRUE);
+ return ((prop_tbl[prop].pd_types & type) != 0);
+}
+
+#ifndef _KERNEL
+
+/*
+ * Determines the minimum width for the column, and indicates whether it's fixed
+ * or not. Only string columns are non-fixed.
+ */
+size_t
+zprop_width(int prop, boolean_t *fixed, zfs_type_t type)
+{
+ zprop_desc_t *prop_tbl, *pd;
+ const zprop_index_t *idx;
+ size_t ret;
+ int i;
+
+ ASSERT(prop != ZPROP_INVAL && prop != ZPROP_CONT);
+ ASSERT(prop < zprop_get_numprops(type));
+
+ prop_tbl = zprop_get_proptable(type);
+ pd = &prop_tbl[prop];
+
+ *fixed = B_TRUE;
+
+ /*
+ * Start with the width of the column name.
+ */
+ ret = strlen(pd->pd_colname);
+
+ /*
+ * For fixed-width values, make sure the width is large enough to hold
+ * any possible value.
+ */
+ switch (pd->pd_proptype) {
+ case PROP_TYPE_NUMBER:
+ /*
+ * The maximum length of a human-readable number is 5 characters
+ * ("20.4M", for example).
+ */
+ if (ret < 5)
+ ret = 5;
+ /*
+ * 'creation' is handled specially because it's a number
+ * internally, but displayed as a date string.
+ */
+ if (prop == ZFS_PROP_CREATION)
+ *fixed = B_FALSE;
+ /*
+ * 'health' is handled specially because it's a number
+ * internally, but displayed as a fixed 8 character string.
+ */
+ if (prop == ZPOOL_PROP_HEALTH)
+ ret = 8;
+ break;
+ case PROP_TYPE_INDEX:
+ idx = prop_tbl[prop].pd_table;
+ for (i = 0; idx[i].pi_name != NULL; i++) {
+ if (strlen(idx[i].pi_name) > ret)
+ ret = strlen(idx[i].pi_name);
+ }
+ break;
+
+ case PROP_TYPE_STRING:
+ *fixed = B_FALSE;
+ break;
+ }
+
+ return (ret);
+}
+
+#endif
+
+#if defined(_KERNEL)
+/* Common routines to initialize property tables */
+EXPORT_SYMBOL(zprop_register_impl);
+EXPORT_SYMBOL(zprop_register_string);
+EXPORT_SYMBOL(zprop_register_number);
+EXPORT_SYMBOL(zprop_register_index);
+EXPORT_SYMBOL(zprop_register_hidden);
+
+/* Common routines for zfs and zpool property management */
+EXPORT_SYMBOL(zprop_iter_common);
+EXPORT_SYMBOL(zprop_name_to_prop);
+EXPORT_SYMBOL(zprop_string_to_index);
+EXPORT_SYMBOL(zprop_index_to_string);
+EXPORT_SYMBOL(zprop_random_value);
+EXPORT_SYMBOL(zprop_values);
+EXPORT_SYMBOL(zprop_valid_for_type);
+#endif
diff --git a/sys/contrib/openzfs/module/zfs/Makefile.in b/sys/contrib/openzfs/module/zfs/Makefile.in
new file mode 100644
index 000000000000..653ea0da9bcc
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/Makefile.in
@@ -0,0 +1,157 @@
+ifneq ($(KBUILD_EXTMOD),)
+src = @abs_srcdir@
+obj = @abs_builddir@
+mfdir = $(obj)
+else
+mfdir = $(srctree)/$(src)
+endif
+
+MODULE := zfs
+
+obj-$(CONFIG_ZFS) := $(MODULE).o
+
+# Suppress unused-value warnings in sparc64 architecture headers
+ccflags-$(CONFIG_SPARC64) += -Wno-unused-value
+
+$(MODULE)-objs += abd.o
+$(MODULE)-objs += aggsum.o
+$(MODULE)-objs += arc.o
+$(MODULE)-objs += blkptr.o
+$(MODULE)-objs += bplist.o
+$(MODULE)-objs += bpobj.o
+$(MODULE)-objs += bptree.o
+$(MODULE)-objs += btree.o
+$(MODULE)-objs += bqueue.o
+$(MODULE)-objs += dataset_kstats.o
+$(MODULE)-objs += dbuf.o
+$(MODULE)-objs += dbuf_stats.o
+$(MODULE)-objs += ddt.o
+$(MODULE)-objs += ddt_zap.o
+$(MODULE)-objs += dmu.o
+$(MODULE)-objs += dmu_diff.o
+$(MODULE)-objs += dmu_object.o
+$(MODULE)-objs += dmu_objset.o
+$(MODULE)-objs += dmu_recv.o
+$(MODULE)-objs += dmu_redact.o
+$(MODULE)-objs += dmu_send.o
+$(MODULE)-objs += dmu_traverse.o
+$(MODULE)-objs += dmu_tx.o
+$(MODULE)-objs += dmu_zfetch.o
+$(MODULE)-objs += dnode.o
+$(MODULE)-objs += dnode_sync.o
+$(MODULE)-objs += dsl_bookmark.o
+$(MODULE)-objs += dsl_crypt.o
+$(MODULE)-objs += dsl_dataset.o
+$(MODULE)-objs += dsl_deadlist.o
+$(MODULE)-objs += dsl_deleg.o
+$(MODULE)-objs += dsl_destroy.o
+$(MODULE)-objs += dsl_dir.o
+$(MODULE)-objs += dsl_pool.o
+$(MODULE)-objs += dsl_prop.o
+$(MODULE)-objs += dsl_scan.o
+$(MODULE)-objs += dsl_synctask.o
+$(MODULE)-objs += dsl_userhold.o
+$(MODULE)-objs += edonr_zfs.o
+$(MODULE)-objs += fm.o
+$(MODULE)-objs += gzip.o
+$(MODULE)-objs += hkdf.o
+$(MODULE)-objs += lz4.o
+$(MODULE)-objs += lzjb.o
+$(MODULE)-objs += metaslab.o
+$(MODULE)-objs += mmp.o
+$(MODULE)-objs += multilist.o
+$(MODULE)-objs += objlist.o
+$(MODULE)-objs += pathname.o
+$(MODULE)-objs += range_tree.o
+$(MODULE)-objs += refcount.o
+$(MODULE)-objs += rrwlock.o
+$(MODULE)-objs += sa.o
+$(MODULE)-objs += sha256.o
+$(MODULE)-objs += skein_zfs.o
+$(MODULE)-objs += spa.o
+$(MODULE)-objs += spa_boot.o
+$(MODULE)-objs += spa_checkpoint.o
+$(MODULE)-objs += spa_config.o
+$(MODULE)-objs += spa_errlog.o
+$(MODULE)-objs += spa_history.o
+$(MODULE)-objs += spa_log_spacemap.o
+$(MODULE)-objs += spa_misc.o
+$(MODULE)-objs += spa_stats.o
+$(MODULE)-objs += space_map.o
+$(MODULE)-objs += space_reftree.o
+$(MODULE)-objs += txg.o
+$(MODULE)-objs += uberblock.o
+$(MODULE)-objs += unique.o
+$(MODULE)-objs += vdev.o
+$(MODULE)-objs += vdev_cache.o
+$(MODULE)-objs += vdev_draid.o
+$(MODULE)-objs += vdev_draid_rand.o
+$(MODULE)-objs += vdev_indirect.o
+$(MODULE)-objs += vdev_indirect_births.o
+$(MODULE)-objs += vdev_indirect_mapping.o
+$(MODULE)-objs += vdev_initialize.o
+$(MODULE)-objs += vdev_label.o
+$(MODULE)-objs += vdev_mirror.o
+$(MODULE)-objs += vdev_missing.o
+$(MODULE)-objs += vdev_queue.o
+$(MODULE)-objs += vdev_raidz.o
+$(MODULE)-objs += vdev_raidz_math.o
+$(MODULE)-objs += vdev_raidz_math_scalar.o
+$(MODULE)-objs += vdev_rebuild.o
+$(MODULE)-objs += vdev_removal.o
+$(MODULE)-objs += vdev_root.o
+$(MODULE)-objs += vdev_trim.o
+$(MODULE)-objs += zap.o
+$(MODULE)-objs += zap_leaf.o
+$(MODULE)-objs += zap_micro.o
+$(MODULE)-objs += zcp.o
+$(MODULE)-objs += zcp_get.o
+$(MODULE)-objs += zcp_global.o
+$(MODULE)-objs += zcp_iter.o
+$(MODULE)-objs += zcp_set.o
+$(MODULE)-objs += zcp_synctask.o
+$(MODULE)-objs += zfeature.o
+$(MODULE)-objs += zfs_byteswap.o
+$(MODULE)-objs += zfs_fm.o
+$(MODULE)-objs += zfs_fuid.o
+$(MODULE)-objs += zfs_ioctl.o
+$(MODULE)-objs += zfs_log.o
+$(MODULE)-objs += zfs_onexit.o
+$(MODULE)-objs += zfs_quota.o
+$(MODULE)-objs += zfs_ratelimit.o
+$(MODULE)-objs += zfs_replay.o
+$(MODULE)-objs += zfs_rlock.o
+$(MODULE)-objs += zfs_sa.o
+$(MODULE)-objs += zfs_vnops.o
+$(MODULE)-objs += zil.o
+$(MODULE)-objs += zio.o
+$(MODULE)-objs += zio_checksum.o
+$(MODULE)-objs += zio_compress.o
+$(MODULE)-objs += zio_inject.o
+$(MODULE)-objs += zle.o
+$(MODULE)-objs += zrlock.o
+$(MODULE)-objs += zthr.o
+$(MODULE)-objs += zvol.o
+
+# Suppress incorrect warnings from versions of objtool which are not
+# aware of x86 EVEX prefix instructions used for AVX512.
+OBJECT_FILES_NON_STANDARD_vdev_raidz_math_avx512bw.o := y
+OBJECT_FILES_NON_STANDARD_vdev_raidz_math_avx512f.o := y
+
+$(MODULE)-$(CONFIG_X86) += vdev_raidz_math_sse2.o
+$(MODULE)-$(CONFIG_X86) += vdev_raidz_math_ssse3.o
+$(MODULE)-$(CONFIG_X86) += vdev_raidz_math_avx2.o
+$(MODULE)-$(CONFIG_X86) += vdev_raidz_math_avx512f.o
+$(MODULE)-$(CONFIG_X86) += vdev_raidz_math_avx512bw.o
+
+$(MODULE)-$(CONFIG_ARM64) += vdev_raidz_math_aarch64_neon.o
+$(MODULE)-$(CONFIG_ARM64) += vdev_raidz_math_aarch64_neonx2.o
+
+$(MODULE)-$(CONFIG_PPC) += vdev_raidz_math_powerpc_altivec.o
+$(MODULE)-$(CONFIG_PPC64) += vdev_raidz_math_powerpc_altivec.o
+
+ifeq ($(CONFIG_ALTIVEC),y)
+$(obj)/vdev_raidz_math_powerpc_altivec.o: c_flags += -maltivec
+endif
+
+include $(mfdir)/../os/linux/zfs/Makefile
diff --git a/sys/contrib/openzfs/module/zfs/THIRDPARTYLICENSE.cityhash b/sys/contrib/openzfs/module/zfs/THIRDPARTYLICENSE.cityhash
new file mode 100644
index 000000000000..e558b2a50358
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/THIRDPARTYLICENSE.cityhash
@@ -0,0 +1,19 @@
+Copyright (c) 2011 Google, Inc.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
diff --git a/sys/contrib/openzfs/module/zfs/THIRDPARTYLICENSE.cityhash.descrip b/sys/contrib/openzfs/module/zfs/THIRDPARTYLICENSE.cityhash.descrip
new file mode 100644
index 000000000000..f98cb76dfc91
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/THIRDPARTYLICENSE.cityhash.descrip
@@ -0,0 +1 @@
+CITYHASH CHECKSUM FUNCTIONALITY IN ZFS
diff --git a/sys/contrib/openzfs/module/zfs/abd.c b/sys/contrib/openzfs/module/zfs/abd.c
new file mode 100644
index 000000000000..7d3a2f6d69e2
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/abd.c
@@ -0,0 +1,1212 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2014 by Chunwei Chen. All rights reserved.
+ * Copyright (c) 2019 by Delphix. All rights reserved.
+ */
+
+/*
+ * ARC buffer data (ABD).
+ *
+ * ABDs are an abstract data structure for the ARC which can use two
+ * different ways of storing the underlying data:
+ *
+ * (a) Linear buffer. In this case, all the data in the ABD is stored in one
+ * contiguous buffer in memory (from a zio_[data_]buf_* kmem cache).
+ *
+ * +-------------------+
+ * | ABD (linear) |
+ * | abd_flags = ... |
+ * | abd_size = ... | +--------------------------------+
+ * | abd_buf ------------->| raw buffer of size abd_size |
+ * +-------------------+ +--------------------------------+
+ * no abd_chunks
+ *
+ * (b) Scattered buffer. In this case, the data in the ABD is split into
+ * equal-sized chunks (from the abd_chunk_cache kmem_cache), with pointers
+ * to the chunks recorded in an array at the end of the ABD structure.
+ *
+ * +-------------------+
+ * | ABD (scattered) |
+ * | abd_flags = ... |
+ * | abd_size = ... |
+ * | abd_offset = 0 | +-----------+
+ * | abd_chunks[0] ----------------------------->| chunk 0 |
+ * | abd_chunks[1] ---------------------+ +-----------+
+ * | ... | | +-----------+
+ * | abd_chunks[N-1] ---------+ +------->| chunk 1 |
+ * +-------------------+ | +-----------+
+ * | ...
+ * | +-----------+
+ * +----------------->| chunk N-1 |
+ * +-----------+
+ *
+ * In addition to directly allocating a linear or scattered ABD, it is also
+ * possible to create an ABD by requesting the "sub-ABD" starting at an offset
+ * within an existing ABD. In linear buffers this is simple (set abd_buf of
+ * the new ABD to the starting point within the original raw buffer), but
+ * scattered ABDs are a little more complex. The new ABD makes a copy of the
+ * relevant abd_chunks pointers (but not the underlying data). However, to
+ * provide arbitrary rather than only chunk-aligned starting offsets, it also
+ * tracks an abd_offset field which represents the starting point of the data
+ * within the first chunk in abd_chunks. For both linear and scattered ABDs,
+ * creating an offset ABD marks the original ABD as the offset's parent, and the
+ * original ABD's abd_children refcount is incremented. This data allows us to
+ * ensure the root ABD isn't deleted before its children.
+ *
+ * Most consumers should never need to know what type of ABD they're using --
+ * the ABD public API ensures that it's possible to transparently switch from
+ * using a linear ABD to a scattered one when doing so would be beneficial.
+ *
+ * If you need to use the data within an ABD directly, if you know it's linear
+ * (because you allocated it) you can use abd_to_buf() to access the underlying
+ * raw buffer. Otherwise, you should use one of the abd_borrow_buf* functions
+ * which will allocate a raw buffer if necessary. Use the abd_return_buf*
+ * functions to return any raw buffers that are no longer necessary when you're
+ * done using them.
+ *
+ * There are a variety of ABD APIs that implement basic buffer operations:
+ * compare, copy, read, write, and fill with zeroes. If you need a custom
+ * function which progressively accesses the whole ABD, use the abd_iterate_*
+ * functions.
+ *
+ * As an additional feature, linear and scatter ABDs can be stitched together
+ * by using the gang ABD type (abd_alloc_gang()). This allows for
+ * multiple ABDs to be viewed as a singular ABD.
+ *
+ * It is possible to make all ABDs linear by setting zfs_abd_scatter_enabled to
+ * B_FALSE.
+ */
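+
+/*
+ * A minimal usage sketch of the interfaces described above. A consumer that
+ * needs a temporary contiguous view of an ABD of unknown layout might do:
+ *
+ *	abd_t *abd = abd_alloc(SPA_MINBLOCKSIZE, B_FALSE);
+ *	void *buf = abd_borrow_buf_copy(abd, abd->abd_size);
+ *	... operate on the contiguous buffer ...
+ *	abd_return_buf_copy(abd, buf, abd->abd_size);
+ *	abd_free(abd);
+ */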
+
+#include <sys/abd_impl.h>
+#include <sys/param.h>
+#include <sys/zio.h>
+#include <sys/zfs_context.h>
+#include <sys/zfs_znode.h>
+
+/* see block comment above for description */
+int zfs_abd_scatter_enabled = B_TRUE;
+
+void
+abd_verify(abd_t *abd)
+{
+ ASSERT3U(abd->abd_size, >, 0);
+ ASSERT3U(abd->abd_size, <=, SPA_MAXBLOCKSIZE);
+ ASSERT3U(abd->abd_flags, ==, abd->abd_flags & (ABD_FLAG_LINEAR |
+ ABD_FLAG_OWNER | ABD_FLAG_META | ABD_FLAG_MULTI_ZONE |
+ ABD_FLAG_MULTI_CHUNK | ABD_FLAG_LINEAR_PAGE | ABD_FLAG_GANG |
+ ABD_FLAG_GANG_FREE | ABD_FLAG_ZEROS | ABD_FLAG_ALLOCD));
+#ifdef ZFS_DEBUG
+ IMPLY(abd->abd_parent != NULL, !(abd->abd_flags & ABD_FLAG_OWNER));
+#endif
+ IMPLY(abd->abd_flags & ABD_FLAG_META, abd->abd_flags & ABD_FLAG_OWNER);
+ if (abd_is_linear(abd)) {
+ ASSERT3P(ABD_LINEAR_BUF(abd), !=, NULL);
+ } else if (abd_is_gang(abd)) {
+ uint_t child_sizes = 0;
+ for (abd_t *cabd = list_head(&ABD_GANG(abd).abd_gang_chain);
+ cabd != NULL;
+ cabd = list_next(&ABD_GANG(abd).abd_gang_chain, cabd)) {
+ ASSERT(list_link_active(&cabd->abd_gang_link));
+ child_sizes += cabd->abd_size;
+ abd_verify(cabd);
+ }
+ ASSERT3U(abd->abd_size, ==, child_sizes);
+ } else {
+ abd_verify_scatter(abd);
+ }
+}
+
+static void
+abd_init_struct(abd_t *abd)
+{
+ list_link_init(&abd->abd_gang_link);
+ mutex_init(&abd->abd_mtx, NULL, MUTEX_DEFAULT, NULL);
+ abd->abd_flags = 0;
+#ifdef ZFS_DEBUG
+ zfs_refcount_create(&abd->abd_children);
+ abd->abd_parent = NULL;
+#endif
+ abd->abd_size = 0;
+}
+
+static void
+abd_fini_struct(abd_t *abd)
+{
+ mutex_destroy(&abd->abd_mtx);
+ ASSERT(!list_link_active(&abd->abd_gang_link));
+#ifdef ZFS_DEBUG
+ zfs_refcount_destroy(&abd->abd_children);
+#endif
+}
+
+abd_t *
+abd_alloc_struct(size_t size)
+{
+ abd_t *abd = abd_alloc_struct_impl(size);
+ abd_init_struct(abd);
+ abd->abd_flags |= ABD_FLAG_ALLOCD;
+ return (abd);
+}
+
+void
+abd_free_struct(abd_t *abd)
+{
+ abd_fini_struct(abd);
+ abd_free_struct_impl(abd);
+}
+
+/*
+ * Allocate an ABD, along with its own underlying data buffers. Use this if you
+ * don't care whether the ABD is linear or not.
+ */
+abd_t *
+abd_alloc(size_t size, boolean_t is_metadata)
+{
+ if (!zfs_abd_scatter_enabled || abd_size_alloc_linear(size))
+ return (abd_alloc_linear(size, is_metadata));
+
+ VERIFY3U(size, <=, SPA_MAXBLOCKSIZE);
+
+ abd_t *abd = abd_alloc_struct(size);
+ abd->abd_flags |= ABD_FLAG_OWNER;
+ abd->abd_u.abd_scatter.abd_offset = 0;
+ abd_alloc_chunks(abd, size);
+
+ if (is_metadata) {
+ abd->abd_flags |= ABD_FLAG_META;
+ }
+ abd->abd_size = size;
+
+ abd_update_scatter_stats(abd, ABDSTAT_INCR);
+
+ return (abd);
+}
+
+/*
+ * Allocate an ABD that must be linear, along with its own underlying data
+ * buffer. Only use this when it would be very annoying to write your ABD
+ * consumer with a scattered ABD.
+ */
+abd_t *
+abd_alloc_linear(size_t size, boolean_t is_metadata)
+{
+ abd_t *abd = abd_alloc_struct(0);
+
+ VERIFY3U(size, <=, SPA_MAXBLOCKSIZE);
+
+ abd->abd_flags |= ABD_FLAG_LINEAR | ABD_FLAG_OWNER;
+ if (is_metadata) {
+ abd->abd_flags |= ABD_FLAG_META;
+ }
+ abd->abd_size = size;
+
+ if (is_metadata) {
+ ABD_LINEAR_BUF(abd) = zio_buf_alloc(size);
+ } else {
+ ABD_LINEAR_BUF(abd) = zio_data_buf_alloc(size);
+ }
+
+ abd_update_linear_stats(abd, ABDSTAT_INCR);
+
+ return (abd);
+}
+
+static void
+abd_free_linear(abd_t *abd)
+{
+ if (abd_is_linear_page(abd)) {
+ abd_free_linear_page(abd);
+ return;
+ }
+ if (abd->abd_flags & ABD_FLAG_META) {
+ zio_buf_free(ABD_LINEAR_BUF(abd), abd->abd_size);
+ } else {
+ zio_data_buf_free(ABD_LINEAR_BUF(abd), abd->abd_size);
+ }
+
+ abd_update_linear_stats(abd, ABDSTAT_DECR);
+}
+
+static void
+abd_free_gang(abd_t *abd)
+{
+ ASSERT(abd_is_gang(abd));
+ abd_t *cabd;
+
+ while ((cabd = list_head(&ABD_GANG(abd).abd_gang_chain)) != NULL) {
+ /*
+ * We must acquire the child ABD's mutex to ensure that, if it
+ * is being added to another gang ABD, the link is marked
+ * inactive when it is removed from this gang ABD and before
+ * it is added to the other gang ABD.
+ */
+ mutex_enter(&cabd->abd_mtx);
+ ASSERT(list_link_active(&cabd->abd_gang_link));
+ list_remove(&ABD_GANG(abd).abd_gang_chain, cabd);
+ mutex_exit(&cabd->abd_mtx);
+ if (cabd->abd_flags & ABD_FLAG_GANG_FREE)
+ abd_free(cabd);
+ }
+ list_destroy(&ABD_GANG(abd).abd_gang_chain);
+}
+
+static void
+abd_free_scatter(abd_t *abd)
+{
+ abd_free_chunks(abd);
+ abd_update_scatter_stats(abd, ABDSTAT_DECR);
+}
+
+/*
+ * Free an ABD. Use with any kind of abd: those created with abd_alloc_*()
+ * and abd_get_*(), including abd_get_offset_struct().
+ *
+ * If the ABD was created with abd_alloc_*(), the underlying data
+ * (scatterlist or linear buffer) will also be freed. (Subject to ownership
+ * changes via abd_*_ownership_of_buf().)
+ *
+ * Unless the ABD was created with abd_get_offset_struct(), the abd_t will
+ * also be freed.
+ */
+void
+abd_free(abd_t *abd)
+{
+ if (abd == NULL)
+ return;
+
+ abd_verify(abd);
+#ifdef ZFS_DEBUG
+ IMPLY(abd->abd_flags & ABD_FLAG_OWNER, abd->abd_parent == NULL);
+#endif
+
+ if (abd_is_gang(abd)) {
+ abd_free_gang(abd);
+ } else if (abd_is_linear(abd)) {
+ if (abd->abd_flags & ABD_FLAG_OWNER)
+ abd_free_linear(abd);
+ } else {
+ if (abd->abd_flags & ABD_FLAG_OWNER)
+ abd_free_scatter(abd);
+ }
+
+#ifdef ZFS_DEBUG
+ if (abd->abd_parent != NULL) {
+ (void) zfs_refcount_remove_many(&abd->abd_parent->abd_children,
+ abd->abd_size, abd);
+ }
+#endif
+
+ abd_fini_struct(abd);
+ if (abd->abd_flags & ABD_FLAG_ALLOCD)
+ abd_free_struct_impl(abd);
+}
+
+/*
+ * Allocate an ABD of the same format (same metadata flag, same scatterize
+ * setting) as another ABD.
+ */
+abd_t *
+abd_alloc_sametype(abd_t *sabd, size_t size)
+{
+ boolean_t is_metadata = (sabd->abd_flags & ABD_FLAG_META) != 0;
+ if (abd_is_linear(sabd) &&
+ !abd_is_linear_page(sabd)) {
+ return (abd_alloc_linear(size, is_metadata));
+ } else {
+ return (abd_alloc(size, is_metadata));
+ }
+}
+
+/*
+ * Create gang ABD that will be the head of a list of ABD's. This is used
+ * to "chain" scatter/gather lists together when constructing aggregated
+ * IO's. To free this abd, abd_free() must be called.
+ */
+abd_t *
+abd_alloc_gang(void)
+{
+ abd_t *abd = abd_alloc_struct(0);
+ abd->abd_flags |= ABD_FLAG_GANG | ABD_FLAG_OWNER;
+ list_create(&ABD_GANG(abd).abd_gang_chain,
+ sizeof (abd_t), offsetof(abd_t, abd_gang_link));
+ return (abd);
+}
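+
+/*
+ * For illustration, aggregating two existing ABDs into one logical view
+ * (with the caller retaining ownership of both children) might look like:
+ *
+ *	abd_t *gabd = abd_alloc_gang();
+ *	abd_gang_add(gabd, abd1, B_FALSE);
+ *	abd_gang_add(gabd, abd2, B_FALSE);
+ *	... gabd now behaves as a single ABD of abd1's plus abd2's size ...
+ *	abd_free(gabd);
+ */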
+
+/*
+ * Add a child gang ABD to a parent gang ABD's chained list.
+ */
+static void
+abd_gang_add_gang(abd_t *pabd, abd_t *cabd, boolean_t free_on_free)
+{
+ ASSERT(abd_is_gang(pabd));
+ ASSERT(abd_is_gang(cabd));
+
+ if (free_on_free) {
+ /*
+ * If the parent is responsible for freeing the child gang
+ * ABD, we simply splice the child's chained list onto the
+ * parent's list and immediately free the child gang ABD
+ * struct. The children taken from the child gang ABD keep
+ * their free_on_free settings after being added to the
+ * parent's list.
+ */
+ pabd->abd_size += cabd->abd_size;
+ list_move_tail(&ABD_GANG(pabd).abd_gang_chain,
+ &ABD_GANG(cabd).abd_gang_chain);
+ ASSERT(list_is_empty(&ABD_GANG(cabd).abd_gang_chain));
+ abd_verify(pabd);
+ abd_free(cabd);
+ } else {
+ for (abd_t *child = list_head(&ABD_GANG(cabd).abd_gang_chain);
+ child != NULL;
+ child = list_next(&ABD_GANG(cabd).abd_gang_chain, child)) {
+ /*
+ * We always pass B_FALSE for free_on_free, as it is the
+ * original child gang ABD's responsibility to determine
+ * whether any of its child ABDs should be freed on the
+ * call to abd_free().
+ */
+ abd_gang_add(pabd, child, B_FALSE);
+ }
+ abd_verify(pabd);
+ }
+}
+
+/*
+ * Add a child ABD to a gang ABD's chained list.
+ */
+void
+abd_gang_add(abd_t *pabd, abd_t *cabd, boolean_t free_on_free)
+{
+ ASSERT(abd_is_gang(pabd));
+ abd_t *child_abd = NULL;
+
+ /*
+ * If the child being added is a gang ABD, we will add the
+ * child's ABDs to the parent gang ABD. This allows us to account
+ * for the offset correctly in the parent gang ABD.
+ */
+ if (abd_is_gang(cabd)) {
+ ASSERT(!list_link_active(&cabd->abd_gang_link));
+ ASSERT(!list_is_empty(&ABD_GANG(cabd).abd_gang_chain));
+ return (abd_gang_add_gang(pabd, cabd, free_on_free));
+ }
+ ASSERT(!abd_is_gang(cabd));
+
+ /*
+ * In order to verify that an ABD is not already part of
+ * another gang ABD, we must lock the child ABD's abd_mtx
+ * to check its abd_gang_link status. We unlock the abd_mtx
+ * only after it has been added to a gang ABD, which
+ * will update the abd_gang_link's status. See comment below
+ * for how an ABD can be in multiple gang ABD's simultaneously.
+ */
+ mutex_enter(&cabd->abd_mtx);
+ if (list_link_active(&cabd->abd_gang_link)) {
+ /*
+ * If the child ABD is already part of another
+ * gang ABD then we must allocate a new
+ * ABD to use a separate link. We mark the newly
+ * allocated ABD with ABD_FLAG_GANG_FREE, before
+ * adding it to the gang ABD's list, to make the
+ * gang ABD aware that it is responsible to call
+ * abd_free(). We use abd_get_offset() in order
+ * to just allocate a new ABD but avoid copying the
+ * data over into the newly allocated ABD.
+ *
+ * An ABD may become part of multiple gang ABD's. For
+ * example, when writing ditto blocks, the same ABD
+ * is used to write 2 or 3 locations with 2 or 3
+ * zio_t's. Each of the zio's may be aggregated with
+ * different adjacent zio's. zio aggregation uses gang
+ * zio's, so the single ABD can become part of multiple
+ * gang zio's.
+ *
+ * The ASSERT below is to make sure that if
+ * free_on_free is passed as B_TRUE, the ABD can
+ * not be in multiple gang ABD's. The gang ABD
+ * can not be responsible for cleaning up the child
+ * ABD memory allocation if the ABD can be in
+ * multiple gang ABD's at one time.
+ */
+ ASSERT3B(free_on_free, ==, B_FALSE);
+ child_abd = abd_get_offset(cabd, 0);
+ child_abd->abd_flags |= ABD_FLAG_GANG_FREE;
+ } else {
+ child_abd = cabd;
+ if (free_on_free)
+ child_abd->abd_flags |= ABD_FLAG_GANG_FREE;
+ }
+ ASSERT3P(child_abd, !=, NULL);
+
+ list_insert_tail(&ABD_GANG(pabd).abd_gang_chain, child_abd);
+ mutex_exit(&cabd->abd_mtx);
+ pabd->abd_size += child_abd->abd_size;
+}
+
+/*
+ * Locate the ABD for the supplied offset in the gang ABD.
+ * Return a new offset relative to the returned ABD.
+ */
+abd_t *
+abd_gang_get_offset(abd_t *abd, size_t *off)
+{
+ abd_t *cabd;
+
+ ASSERT(abd_is_gang(abd));
+ ASSERT3U(*off, <, abd->abd_size);
+ for (cabd = list_head(&ABD_GANG(abd).abd_gang_chain); cabd != NULL;
+ cabd = list_next(&ABD_GANG(abd).abd_gang_chain, cabd)) {
+ if (*off >= cabd->abd_size)
+ *off -= cabd->abd_size;
+ else
+ return (cabd);
+ }
+ VERIFY3P(cabd, !=, NULL);
+ return (cabd);
+}
+
+/*
+ * Allocate a new ABD, using the provided struct (if non-NULL, and if
+ * circumstances allow - otherwise allocate the struct). The returned ABD will
+ * point to offset off of sabd. It shares the underlying buffer data with sabd.
+ * Use abd_free() to free. sabd must not be freed while any derived ABDs exist.
+ */
+static abd_t *
+abd_get_offset_impl(abd_t *abd, abd_t *sabd, size_t off, size_t size)
+{
+ abd_verify(sabd);
+ ASSERT3U(off + size, <=, sabd->abd_size);
+
+ if (abd_is_linear(sabd)) {
+ if (abd == NULL)
+ abd = abd_alloc_struct(0);
+ /*
+ * Even if this buf is filesystem metadata, we only track that
+ * if we own the underlying data buffer, which is not true in
+ * this case. Therefore, we don't ever use ABD_FLAG_META here.
+ */
+ abd->abd_flags |= ABD_FLAG_LINEAR;
+
+ ABD_LINEAR_BUF(abd) = (char *)ABD_LINEAR_BUF(sabd) + off;
+ } else if (abd_is_gang(sabd)) {
+ size_t left = size;
+ if (abd == NULL) {
+ abd = abd_alloc_gang();
+ } else {
+ abd->abd_flags |= ABD_FLAG_GANG;
+ list_create(&ABD_GANG(abd).abd_gang_chain,
+ sizeof (abd_t), offsetof(abd_t, abd_gang_link));
+ }
+
+ abd->abd_flags &= ~ABD_FLAG_OWNER;
+ for (abd_t *cabd = abd_gang_get_offset(sabd, &off);
+ cabd != NULL && left > 0;
+ cabd = list_next(&ABD_GANG(sabd).abd_gang_chain, cabd)) {
+ int csize = MIN(left, cabd->abd_size - off);
+
+ abd_t *nabd = abd_get_offset_size(cabd, off, csize);
+ abd_gang_add(abd, nabd, B_TRUE);
+ left -= csize;
+ off = 0;
+ }
+ ASSERT3U(left, ==, 0);
+ } else {
+ abd = abd_get_offset_scatter(abd, sabd, off);
+ }
+
+ ASSERT3P(abd, !=, NULL);
+ abd->abd_size = size;
+#ifdef ZFS_DEBUG
+ abd->abd_parent = sabd;
+ (void) zfs_refcount_add_many(&sabd->abd_children, abd->abd_size, abd);
+#endif
+ return (abd);
+}
+
+/*
+ * Like abd_get_offset_size(), but memory for the abd_t is provided by the
+ * caller. Using this routine can improve performance by avoiding the cost
+ * of allocating memory for the abd_t struct, and updating the abd stats.
+ * Usually, the provided abd is returned, but in some circumstances (FreeBSD,
+ * if sabd is scatter and size is more than 2 pages) a new abd_t may need to
+ * be allocated. Therefore callers should be careful to use the returned
+ * abd_t*.
+ */
+abd_t *
+abd_get_offset_struct(abd_t *abd, abd_t *sabd, size_t off, size_t size)
+{
+ abd_init_struct(abd);
+ return (abd_get_offset_impl(abd, sabd, off, size));
+}
+
+abd_t *
+abd_get_offset(abd_t *sabd, size_t off)
+{
+ size_t size = sabd->abd_size > off ? sabd->abd_size - off : 0;
+ VERIFY3U(size, >, 0);
+ return (abd_get_offset_impl(NULL, sabd, off, size));
+}
+
+abd_t *
+abd_get_offset_size(abd_t *sabd, size_t off, size_t size)
+{
+ ASSERT3U(off + size, <=, sabd->abd_size);
+ return (abd_get_offset_impl(NULL, sabd, off, size));
+}
+
+/*
+ * Return a scatter ABD of the given size containing only zeros.
+ */
+abd_t *
+abd_get_zeros(size_t size)
+{
+ ASSERT3P(abd_zero_scatter, !=, NULL);
+ ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
+ return (abd_get_offset_size(abd_zero_scatter, 0, size));
+}
+
+/*
+ * Allocate a linear ABD structure for buf.
+ */
+abd_t *
+abd_get_from_buf(void *buf, size_t size)
+{
+ abd_t *abd = abd_alloc_struct(0);
+
+ VERIFY3U(size, <=, SPA_MAXBLOCKSIZE);
+
+ /*
+ * Even if this buf is filesystem metadata, we only track that if we
+ * own the underlying data buffer, which is not true in this case.
+ * Therefore, we don't ever use ABD_FLAG_META here.
+ */
+ abd->abd_flags |= ABD_FLAG_LINEAR;
+ abd->abd_size = size;
+
+ ABD_LINEAR_BUF(abd) = buf;
+
+ return (abd);
+}
+
+/*
+ * Get the raw buffer associated with a linear ABD.
+ */
+void *
+abd_to_buf(abd_t *abd)
+{
+ ASSERT(abd_is_linear(abd));
+ abd_verify(abd);
+ return (ABD_LINEAR_BUF(abd));
+}
+
+/*
+ * Borrow a raw buffer from an ABD without copying the contents of the ABD
+ * into the buffer. If the ABD is scattered, this will allocate a raw buffer
+ * whose contents are undefined. To copy over the existing data in the ABD, use
+ * abd_borrow_buf_copy() instead.
+ */
+void *
+abd_borrow_buf(abd_t *abd, size_t n)
+{
+ void *buf;
+ abd_verify(abd);
+ ASSERT3U(abd->abd_size, >=, n);
+ if (abd_is_linear(abd)) {
+ buf = abd_to_buf(abd);
+ } else {
+ buf = zio_buf_alloc(n);
+ }
+#ifdef ZFS_DEBUG
+ (void) zfs_refcount_add_many(&abd->abd_children, n, buf);
+#endif
+ return (buf);
+}
+
+void *
+abd_borrow_buf_copy(abd_t *abd, size_t n)
+{
+ void *buf = abd_borrow_buf(abd, n);
+ if (!abd_is_linear(abd)) {
+ abd_copy_to_buf(buf, abd, n);
+ }
+ return (buf);
+}
+
+/*
+ * Return a borrowed raw buffer to an ABD. If the ABD is scattered, this will
+ * not change the contents of the ABD and will ASSERT that you didn't modify
+ * the buffer since it was borrowed. If you want any changes you made to buf to
+ * be copied back to abd, use abd_return_buf_copy() instead.
+ */
+void
+abd_return_buf(abd_t *abd, void *buf, size_t n)
+{
+ abd_verify(abd);
+ ASSERT3U(abd->abd_size, >=, n);
+ if (abd_is_linear(abd)) {
+ ASSERT3P(buf, ==, abd_to_buf(abd));
+ } else {
+ ASSERT0(abd_cmp_buf(abd, buf, n));
+ zio_buf_free(buf, n);
+ }
+#ifdef ZFS_DEBUG
+ (void) zfs_refcount_remove_many(&abd->abd_children, n, buf);
+#endif
+}
+
+void
+abd_return_buf_copy(abd_t *abd, void *buf, size_t n)
+{
+ if (!abd_is_linear(abd)) {
+ abd_copy_from_buf(abd, buf, n);
+ }
+ abd_return_buf(abd, buf, n);
+}
+
+void
+abd_release_ownership_of_buf(abd_t *abd)
+{
+ ASSERT(abd_is_linear(abd));
+ ASSERT(abd->abd_flags & ABD_FLAG_OWNER);
+
+ /*
+ * abd_free() needs to handle LINEAR_PAGE ABD's specially.
+ * Since that flag does not survive the
+ * abd_release_ownership_of_buf() -> abd_get_from_buf() ->
+ * abd_take_ownership_of_buf() sequence, we don't allow releasing
+ * these "linear but not zio_[data_]buf_alloc()'ed" ABD's.
+ */
+ ASSERT(!abd_is_linear_page(abd));
+
+ abd_verify(abd);
+
+ abd->abd_flags &= ~ABD_FLAG_OWNER;
+ /* Disable this flag since we no longer own the data buffer */
+ abd->abd_flags &= ~ABD_FLAG_META;
+
+ abd_update_linear_stats(abd, ABDSTAT_DECR);
+}
+
+
+/*
+ * Give this ABD ownership of the buffer that it's storing. Can only be used on
+ * linear ABDs which were allocated via abd_get_from_buf(), or ones allocated
+ * with abd_alloc_linear() which subsequently released ownership of their buf
+ * with abd_release_ownership_of_buf().
+ */
+void
+abd_take_ownership_of_buf(abd_t *abd, boolean_t is_metadata)
+{
+ ASSERT(abd_is_linear(abd));
+ ASSERT(!(abd->abd_flags & ABD_FLAG_OWNER));
+ abd_verify(abd);
+
+ abd->abd_flags |= ABD_FLAG_OWNER;
+ if (is_metadata) {
+ abd->abd_flags |= ABD_FLAG_META;
+ }
+
+ abd_update_linear_stats(abd, ABDSTAT_INCR);
+}
+
+/*
+ * Initializes an abd_iter based on whether the abd is a gang ABD
+ * or just a single ABD.
+ */
+static inline abd_t *
+abd_init_abd_iter(abd_t *abd, struct abd_iter *aiter, size_t off)
+{
+ abd_t *cabd = NULL;
+
+ if (abd_is_gang(abd)) {
+ cabd = abd_gang_get_offset(abd, &off);
+ if (cabd) {
+ abd_iter_init(aiter, cabd);
+ abd_iter_advance(aiter, off);
+ }
+ } else {
+ abd_iter_init(aiter, abd);
+ abd_iter_advance(aiter, off);
+ }
+ return (cabd);
+}
+
+/*
+ * Advances an abd_iter. We have to be careful with gang ABD as
+ * advancing could mean that we are at the end of a particular ABD and
+ * must grab the next ABD in the gang ABD's list.
+ */
+static inline abd_t *
+abd_advance_abd_iter(abd_t *abd, abd_t *cabd, struct abd_iter *aiter,
+ size_t len)
+{
+ abd_iter_advance(aiter, len);
+ if (abd_is_gang(abd) && abd_iter_at_end(aiter)) {
+ ASSERT3P(cabd, !=, NULL);
+ cabd = list_next(&ABD_GANG(abd).abd_gang_chain, cabd);
+ if (cabd) {
+ abd_iter_init(aiter, cabd);
+ abd_iter_advance(aiter, 0);
+ }
+ }
+ return (cabd);
+}
+
+int
+abd_iterate_func(abd_t *abd, size_t off, size_t size,
+ abd_iter_func_t *func, void *private)
+{
+ struct abd_iter aiter;
+ int ret = 0;
+
+ if (size == 0)
+ return (0);
+
+ abd_verify(abd);
+ ASSERT3U(off + size, <=, abd->abd_size);
+
+ boolean_t gang = abd_is_gang(abd);
+ abd_t *c_abd = abd_init_abd_iter(abd, &aiter, off);
+
+ while (size > 0) {
+ /* If we are at the end of the gang ABD we are done */
+ if (gang && !c_abd)
+ break;
+
+ abd_iter_map(&aiter);
+
+ size_t len = MIN(aiter.iter_mapsize, size);
+ ASSERT3U(len, >, 0);
+
+ ret = func(aiter.iter_mapaddr, len, private);
+
+ abd_iter_unmap(&aiter);
+
+ if (ret != 0)
+ break;
+
+ size -= len;
+ c_abd = abd_advance_abd_iter(abd, c_abd, &aiter, len);
+ }
+
+ return (ret);
+}
+
+struct buf_arg {
+ void *arg_buf;
+};
+
+static int
+abd_copy_to_buf_off_cb(void *buf, size_t size, void *private)
+{
+ struct buf_arg *ba_ptr = private;
+
+ (void) memcpy(ba_ptr->arg_buf, buf, size);
+ ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size;
+
+ return (0);
+}
+
+/*
+ * Copy abd to buf. (off is the offset in abd.)
+ */
+void
+abd_copy_to_buf_off(void *buf, abd_t *abd, size_t off, size_t size)
+{
+ struct buf_arg ba_ptr = { buf };
+
+ (void) abd_iterate_func(abd, off, size, abd_copy_to_buf_off_cb,
+ &ba_ptr);
+}
+
+static int
+abd_cmp_buf_off_cb(void *buf, size_t size, void *private)
+{
+ int ret;
+ struct buf_arg *ba_ptr = private;
+
+ ret = memcmp(buf, ba_ptr->arg_buf, size);
+ ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size;
+
+ return (ret);
+}
+
+/*
+ * Compare the contents of abd to buf. (off is the offset in abd.)
+ */
+int
+abd_cmp_buf_off(abd_t *abd, const void *buf, size_t off, size_t size)
+{
+ struct buf_arg ba_ptr = { (void *) buf };
+
+ return (abd_iterate_func(abd, off, size, abd_cmp_buf_off_cb, &ba_ptr));
+}
+
+static int
+abd_copy_from_buf_off_cb(void *buf, size_t size, void *private)
+{
+ struct buf_arg *ba_ptr = private;
+
+ (void) memcpy(buf, ba_ptr->arg_buf, size);
+ ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size;
+
+ return (0);
+}
+
+/*
+ * Copy from buf to abd. (off is the offset in abd.)
+ */
+void
+abd_copy_from_buf_off(abd_t *abd, const void *buf, size_t off, size_t size)
+{
+ struct buf_arg ba_ptr = { (void *) buf };
+
+ (void) abd_iterate_func(abd, off, size, abd_copy_from_buf_off_cb,
+ &ba_ptr);
+}
+
+/*ARGSUSED*/
+static int
+abd_zero_off_cb(void *buf, size_t size, void *private)
+{
+ (void) memset(buf, 0, size);
+ return (0);
+}
+
+/*
+ * Zero out the abd from a particular offset to the end.
+ */
+void
+abd_zero_off(abd_t *abd, size_t off, size_t size)
+{
+ (void) abd_iterate_func(abd, off, size, abd_zero_off_cb, NULL);
+}
+
+/*
+ * Iterate over two ABDs and call func incrementally on the two ABDs' data in
+ * equal-sized chunks (passed to func as raw buffers). func could be called many
+ * times during this iteration.
+ */
+int
+abd_iterate_func2(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff,
+ size_t size, abd_iter_func2_t *func, void *private)
+{
+ int ret = 0;
+ struct abd_iter daiter, saiter;
+ boolean_t dabd_is_gang_abd, sabd_is_gang_abd;
+ abd_t *c_dabd, *c_sabd;
+
+ if (size == 0)
+ return (0);
+
+ abd_verify(dabd);
+ abd_verify(sabd);
+
+ ASSERT3U(doff + size, <=, dabd->abd_size);
+ ASSERT3U(soff + size, <=, sabd->abd_size);
+
+ dabd_is_gang_abd = abd_is_gang(dabd);
+ sabd_is_gang_abd = abd_is_gang(sabd);
+ c_dabd = abd_init_abd_iter(dabd, &daiter, doff);
+ c_sabd = abd_init_abd_iter(sabd, &saiter, soff);
+
+ while (size > 0) {
+ /* if we are at the end of the gang ABD we are done */
+ if ((dabd_is_gang_abd && !c_dabd) ||
+ (sabd_is_gang_abd && !c_sabd))
+ break;
+
+ abd_iter_map(&daiter);
+ abd_iter_map(&saiter);
+
+ size_t dlen = MIN(daiter.iter_mapsize, size);
+ size_t slen = MIN(saiter.iter_mapsize, size);
+ size_t len = MIN(dlen, slen);
+ ASSERT(dlen > 0 || slen > 0);
+
+ ret = func(daiter.iter_mapaddr, saiter.iter_mapaddr, len,
+ private);
+
+ abd_iter_unmap(&saiter);
+ abd_iter_unmap(&daiter);
+
+ if (ret != 0)
+ break;
+
+ size -= len;
+ c_dabd =
+ abd_advance_abd_iter(dabd, c_dabd, &daiter, len);
+ c_sabd =
+ abd_advance_abd_iter(sabd, c_sabd, &saiter, len);
+ }
+
+ return (ret);
+}
+
+/*ARGSUSED*/
+static int
+abd_copy_off_cb(void *dbuf, void *sbuf, size_t size, void *private)
+{
+ (void) memcpy(dbuf, sbuf, size);
+ return (0);
+}
+
+/*
+ * Copy from sabd to dabd starting from soff and doff.
+ */
+void
+abd_copy_off(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff, size_t size)
+{
+ (void) abd_iterate_func2(dabd, sabd, doff, soff, size,
+ abd_copy_off_cb, NULL);
+}
+
+/*ARGSUSED*/
+static int
+abd_cmp_cb(void *bufa, void *bufb, size_t size, void *private)
+{
+ return (memcmp(bufa, bufb, size));
+}
+
+/*
+ * Compares the contents of two ABDs.
+ */
+int
+abd_cmp(abd_t *dabd, abd_t *sabd)
+{
+ ASSERT3U(dabd->abd_size, ==, sabd->abd_size);
+ return (abd_iterate_func2(dabd, sabd, 0, 0, dabd->abd_size,
+ abd_cmp_cb, NULL));
+}
+
+/*
+ * Iterate over code ABDs and a data ABD and call @func_raidz_gen.
+ *
+ * @cabds parity ABDs, must have equal size
+ * @dabd data ABD. Can be NULL (in this case @dsize = 0)
+ * @func_raidz_gen should be implemented so that its behaviour
+ * is the same whether it is given linear or scatter buffers
+ */
+void
+abd_raidz_gen_iterate(abd_t **cabds, abd_t *dabd,
+ ssize_t csize, ssize_t dsize, const unsigned parity,
+ void (*func_raidz_gen)(void **, const void *, size_t, size_t))
+{
+ int i;
+ ssize_t len, dlen;
+ struct abd_iter caiters[3];
+ struct abd_iter daiter = {0};
+ void *caddrs[3];
+ unsigned long flags __maybe_unused = 0;
+ abd_t *c_cabds[3];
+ abd_t *c_dabd = NULL;
+ boolean_t cabds_is_gang_abd[3];
+ boolean_t dabd_is_gang_abd = B_FALSE;
+
+ ASSERT3U(parity, <=, 3);
+
+ for (i = 0; i < parity; i++) {
+ cabds_is_gang_abd[i] = abd_is_gang(cabds[i]);
+ c_cabds[i] = abd_init_abd_iter(cabds[i], &caiters[i], 0);
+ }
+
+ if (dabd) {
+ dabd_is_gang_abd = abd_is_gang(dabd);
+ c_dabd = abd_init_abd_iter(dabd, &daiter, 0);
+ }
+
+ ASSERT3S(dsize, >=, 0);
+
+ abd_enter_critical(flags);
+ while (csize > 0) {
+ /* if we are at the end of the gang ABD we are done */
+ if (dabd_is_gang_abd && !c_dabd)
+ break;
+
+ for (i = 0; i < parity; i++) {
+ /*
+ * If we are at the end of the gang ABD we are
+ * done.
+ */
+ if (cabds_is_gang_abd[i] && !c_cabds[i])
+ break;
+ abd_iter_map(&caiters[i]);
+ caddrs[i] = caiters[i].iter_mapaddr;
+ }
+
+ len = csize;
+
+ if (dabd && dsize > 0)
+ abd_iter_map(&daiter);
+
+ switch (parity) {
+ case 3:
+ len = MIN(caiters[2].iter_mapsize, len);
+ /* falls through */
+ case 2:
+ len = MIN(caiters[1].iter_mapsize, len);
+ /* falls through */
+ case 1:
+ len = MIN(caiters[0].iter_mapsize, len);
+ }
+
+ /* must be progressive */
+ ASSERT3S(len, >, 0);
+
+ if (dabd && dsize > 0) {
+ /* this needs precise iter.length */
+ len = MIN(daiter.iter_mapsize, len);
+ dlen = len;
+ } else
+ dlen = 0;
+
+ /* must be progressive */
+ ASSERT3S(len, >, 0);
+ /*
+ * The iterated function likely will not do well if each
+ * segment except the last one is not a multiple of 512 (raidz).
+ */
+ ASSERT3U(((uint64_t)len & 511ULL), ==, 0);
+
+ func_raidz_gen(caddrs, daiter.iter_mapaddr, len, dlen);
+
+ for (i = parity-1; i >= 0; i--) {
+ abd_iter_unmap(&caiters[i]);
+ c_cabds[i] =
+ abd_advance_abd_iter(cabds[i], c_cabds[i],
+ &caiters[i], len);
+ }
+
+ if (dabd && dsize > 0) {
+ abd_iter_unmap(&daiter);
+ c_dabd =
+ abd_advance_abd_iter(dabd, c_dabd, &daiter,
+ dlen);
+ dsize -= dlen;
+ }
+
+ csize -= len;
+
+ ASSERT3S(dsize, >=, 0);
+ ASSERT3S(csize, >=, 0);
+ }
+ abd_exit_critical(flags);
+}
+
+/*
+ * Iterate over code ABDs and data reconstruction target ABDs and call
+ * @func_raidz_rec. Function maps at most 6 pages atomically.
+ *
+ * @cabds parity ABDs, must have equal size
+ * @tabds rec target ABDs, at most 3
+ * @tsize size of data target columns
+ * @func_raidz_rec expects syndrome data in target columns. Function
+ * reconstructs data and overwrites target columns.
+ */
+void
+abd_raidz_rec_iterate(abd_t **cabds, abd_t **tabds,
+ ssize_t tsize, const unsigned parity,
+ void (*func_raidz_rec)(void **t, const size_t tsize, void **c,
+ const unsigned *mul),
+ const unsigned *mul)
+{
+ int i;
+ ssize_t len;
+ struct abd_iter citers[3];
+ struct abd_iter xiters[3];
+ void *caddrs[3], *xaddrs[3];
+ unsigned long flags __maybe_unused = 0;
+ boolean_t cabds_is_gang_abd[3];
+ boolean_t tabds_is_gang_abd[3];
+ abd_t *c_cabds[3];
+ abd_t *c_tabds[3];
+
+ ASSERT3U(parity, <=, 3);
+
+ for (i = 0; i < parity; i++) {
+ cabds_is_gang_abd[i] = abd_is_gang(cabds[i]);
+ tabds_is_gang_abd[i] = abd_is_gang(tabds[i]);
+ c_cabds[i] =
+ abd_init_abd_iter(cabds[i], &citers[i], 0);
+ c_tabds[i] =
+ abd_init_abd_iter(tabds[i], &xiters[i], 0);
+ }
+
+ abd_enter_critical(flags);
+ while (tsize > 0) {
+
+ for (i = 0; i < parity; i++) {
+ /*
+ * If we are at the end of the gang ABD we
+ * are done.
+ */
+ if (cabds_is_gang_abd[i] && !c_cabds[i])
+ break;
+ if (tabds_is_gang_abd[i] && !c_tabds[i])
+ break;
+ abd_iter_map(&citers[i]);
+ abd_iter_map(&xiters[i]);
+ caddrs[i] = citers[i].iter_mapaddr;
+ xaddrs[i] = xiters[i].iter_mapaddr;
+ }
+
+ len = tsize;
+ switch (parity) {
+ case 3:
+ len = MIN(xiters[2].iter_mapsize, len);
+ len = MIN(citers[2].iter_mapsize, len);
+ /* falls through */
+ case 2:
+ len = MIN(xiters[1].iter_mapsize, len);
+ len = MIN(citers[1].iter_mapsize, len);
+ /* falls through */
+ case 1:
+ len = MIN(xiters[0].iter_mapsize, len);
+ len = MIN(citers[0].iter_mapsize, len);
+ }
+ /* must be progressive */
+ ASSERT3S(len, >, 0);
+ /*
+ * The iterated function likely will not do well if each
+ * segment except the last one is not a multiple of 512 (raidz).
+ */
+ ASSERT3U(((uint64_t)len & 511ULL), ==, 0);
+
+ func_raidz_rec(xaddrs, len, caddrs, mul);
+
+ for (i = parity-1; i >= 0; i--) {
+ abd_iter_unmap(&xiters[i]);
+ abd_iter_unmap(&citers[i]);
+ c_tabds[i] =
+ abd_advance_abd_iter(tabds[i], c_tabds[i],
+ &xiters[i], len);
+ c_cabds[i] =
+ abd_advance_abd_iter(cabds[i], c_cabds[i],
+ &citers[i], len);
+ }
+
+ tsize -= len;
+ ASSERT3S(tsize, >=, 0);
+ }
+ abd_exit_critical(flags);
+}
diff --git a/sys/contrib/openzfs/module/zfs/aggsum.c b/sys/contrib/openzfs/module/zfs/aggsum.c
new file mode 100644
index 000000000000..e46da95f676c
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/aggsum.c
@@ -0,0 +1,240 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2017, 2018 by Delphix. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/aggsum.h>
+
+/*
+ * Aggregate-sum counters are a form of fanned-out counter, used when atomic
+ * instructions on a single field cause enough CPU cache line contention to
+ * slow system performance. Due to their increased overhead and the expense
+ * involved with precisely reading from them, they should only be used in cases
+ * where the write rate (increment/decrement) is much higher than the read rate
+ * (get value).
+ *
+ * Aggregate sum counters are comprised of two basic parts, the core and the
+ * buckets. The core counter contains a lock for the entire counter, as well
+ * as the current upper and lower bounds on the value of the counter. The
+ * aggsum_bucket structure contains a per-bucket lock to protect the contents of
+ * the bucket, the current amount that this bucket has changed from the global
+ * counter (called the delta), and the amount of increment and decrement we have
+ * "borrowed" from the core counter.
+ *
+ * The basic operation of an aggsum is simple. Threads that wish to modify the
+ * counter will modify one bucket's counter (determined by their current CPU, to
+ * help minimize lock and cache contention). If the bucket already has
+ * sufficient capacity borrowed from the core structure to handle their request,
+ * they simply modify the delta and return. If the bucket does not, we clear
+ * the bucket's current state (to prevent the borrowed amounts from getting too
+ * large), and borrow more from the core counter. Borrowing is done by adding to
+ * the upper bound (or subtracting from the lower bound) of the core counter,
+ * and setting the borrow value for the bucket to the amount added (or
+ * subtracted). Clearing the bucket is the opposite; we add the current delta
+ * to both the lower and upper bounds of the core counter, subtract the borrowed
+ * increment from the upper bound, and add the borrowed decrement to the
+ * lower bound. Note that only borrowing and clearing require access to the
+ * core counter; since all other operations access CPU-local resources,
+ * performance can be much higher than a traditional counter.
+ *
+ * Threads that wish to read from the counter have a slightly more challenging
+ * task. It is fast to determine the upper and lower bounds of the aggsum; this
+ * does not require grabbing any locks. This suffices for cases where an
+ * approximation of the aggsum's value is acceptable. However, if one needs to
+ * know whether some specific value is above or below the current value in the
+ * aggsum, they invoke aggsum_compare(). This function operates by repeatedly
+ * comparing the target value to the upper and lower bounds of the aggsum, and
+ * then clearing a bucket. This proceeds until the target is outside of the
+ * upper and lower bounds and we return a response, or the last bucket has been
+ * cleared and we know that the target is equal to the aggsum's value. Finally,
+ * the most expensive operation is determining the precise value of the aggsum.
+ * To do this, we clear every bucket and then return the upper bound (which must
+ * be equal to the lower bound). What makes aggsum_compare() and aggsum_value()
+ * expensive is clearing buckets. This involves grabbing the global lock
+ * (serializing against themselves and borrow operations), grabbing a bucket's
+ * lock (preventing threads on those CPUs from modifying their delta), and
+ * zeroing out the borrowed value (forcing that thread to borrow on its next
+ * request, which will also be expensive). This is what makes aggsums well
+ * suited for write-many read-rarely operations.
+ *
+ * Note that the aggsums do not expand if more CPUs are hot-added. In that
+ * case, we will have less fanout than boot_ncpus, but we don't want to always
+ * reserve the RAM necessary to create the extra slots for additional CPUs up
+ * front, and dynamically adding them is a complex task.
+ */
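As a minimal usage sketch of the interface described above (assuming a kernel context with <sys/aggsum.h> available; the counter and helper names here are hypothetical):

#include <sys/zfs_context.h>
#include <sys/aggsum.h>

/* Hypothetical fanned-out counter of in-use bytes. */
static aggsum_t example_bytes_inuse;

static void
example_setup(void)
{
        /* One bucket per CPU (boot_ncpus), starting value 0. */
        aggsum_init(&example_bytes_inuse, 0);
}

static void
example_track(int64_t delta)
{
        /* Hot path: normally touches only the current CPU's bucket. */
        aggsum_add(&example_bytes_inuse, delta);
}

static uint64_t
example_report(void)
{
        /* Slow path: flushes every bucket to obtain the exact value. */
        return (aggsum_value(&example_bytes_inuse));
}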
+
+/*
+ * We will borrow aggsum_borrow_multiplier times the current request, so we will
+ * have to get the as_lock approximately every aggsum_borrow_multiplier calls to
+ * aggsum_add().
+ */
+static uint_t aggsum_borrow_multiplier = 10;
+
+void
+aggsum_init(aggsum_t *as, uint64_t value)
+{
+ bzero(as, sizeof (*as));
+ as->as_lower_bound = as->as_upper_bound = value;
+ mutex_init(&as->as_lock, NULL, MUTEX_DEFAULT, NULL);
+ as->as_numbuckets = boot_ncpus;
+ as->as_buckets = kmem_zalloc(boot_ncpus * sizeof (aggsum_bucket_t),
+ KM_SLEEP);
+ for (int i = 0; i < as->as_numbuckets; i++) {
+ mutex_init(&as->as_buckets[i].asc_lock,
+ NULL, MUTEX_DEFAULT, NULL);
+ }
+}
+
+void
+aggsum_fini(aggsum_t *as)
+{
+ for (int i = 0; i < as->as_numbuckets; i++)
+ mutex_destroy(&as->as_buckets[i].asc_lock);
+ kmem_free(as->as_buckets, as->as_numbuckets * sizeof (aggsum_bucket_t));
+ mutex_destroy(&as->as_lock);
+}
+
+int64_t
+aggsum_lower_bound(aggsum_t *as)
+{
+ return (as->as_lower_bound);
+}
+
+int64_t
+aggsum_upper_bound(aggsum_t *as)
+{
+ return (as->as_upper_bound);
+}
+
+static void
+aggsum_flush_bucket(aggsum_t *as, struct aggsum_bucket *asb)
+{
+ ASSERT(MUTEX_HELD(&as->as_lock));
+ ASSERT(MUTEX_HELD(&asb->asc_lock));
+
+ /*
+ * We use atomic instructions for this because we read the upper and
+ * lower bounds without the lock, so we need stores to be atomic.
+ */
+ atomic_add_64((volatile uint64_t *)&as->as_lower_bound,
+ asb->asc_delta + asb->asc_borrowed);
+ atomic_add_64((volatile uint64_t *)&as->as_upper_bound,
+ asb->asc_delta - asb->asc_borrowed);
+ asb->asc_delta = 0;
+ asb->asc_borrowed = 0;
+}
+
+uint64_t
+aggsum_value(aggsum_t *as)
+{
+ int64_t rv;
+
+ mutex_enter(&as->as_lock);
+ if (as->as_lower_bound == as->as_upper_bound) {
+ rv = as->as_lower_bound;
+ for (int i = 0; i < as->as_numbuckets; i++) {
+ ASSERT0(as->as_buckets[i].asc_delta);
+ ASSERT0(as->as_buckets[i].asc_borrowed);
+ }
+ mutex_exit(&as->as_lock);
+ return (rv);
+ }
+ for (int i = 0; i < as->as_numbuckets; i++) {
+ struct aggsum_bucket *asb = &as->as_buckets[i];
+ mutex_enter(&asb->asc_lock);
+ aggsum_flush_bucket(as, asb);
+ mutex_exit(&asb->asc_lock);
+ }
+ VERIFY3U(as->as_lower_bound, ==, as->as_upper_bound);
+ rv = as->as_lower_bound;
+ mutex_exit(&as->as_lock);
+
+ return (rv);
+}
+
+void
+aggsum_add(aggsum_t *as, int64_t delta)
+{
+ struct aggsum_bucket *asb;
+ int64_t borrow;
+
+ asb = &as->as_buckets[CPU_SEQID_UNSTABLE % as->as_numbuckets];
+
+ /* Try fast path if we already borrowed enough before. */
+ mutex_enter(&asb->asc_lock);
+ if (asb->asc_delta + delta <= (int64_t)asb->asc_borrowed &&
+ asb->asc_delta + delta >= -(int64_t)asb->asc_borrowed) {
+ asb->asc_delta += delta;
+ mutex_exit(&asb->asc_lock);
+ return;
+ }
+ mutex_exit(&asb->asc_lock);
+
+ /*
+ * We haven't borrowed enough. Take the global lock and borrow
+ * considering what is requested now and what we borrowed before.
+ */
+ borrow = (delta < 0 ? -delta : delta) * aggsum_borrow_multiplier;
+ mutex_enter(&as->as_lock);
+ mutex_enter(&asb->asc_lock);
+ delta += asb->asc_delta;
+ asb->asc_delta = 0;
+ if (borrow >= asb->asc_borrowed)
+ borrow -= asb->asc_borrowed;
+ else
+ borrow = (borrow - (int64_t)asb->asc_borrowed) / 4;
+ asb->asc_borrowed += borrow;
+ atomic_add_64((volatile uint64_t *)&as->as_lower_bound,
+ delta - borrow);
+ atomic_add_64((volatile uint64_t *)&as->as_upper_bound,
+ delta + borrow);
+ mutex_exit(&asb->asc_lock);
+ mutex_exit(&as->as_lock);
+}
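To make the slow path above concrete (using the default aggsum_borrow_multiplier of 10): a first aggsum_add() of +5 on a fresh bucket finds asc_delta = 0 and asc_borrowed = 0, so the fast path fails. The slow path computes borrow = 5 * 10 = 50, sets asc_borrowed = 50, and adjusts the core bounds by delta - borrow = -45 (lower) and delta + borrow = +55 (upper). Subsequent adds on that CPU stay on the fast path as long as the bucket's running delta remains within ±50.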
+
+/*
+ * Compare the aggsum value to target efficiently. Returns -1 if the value
+ * represented by the aggsum is less than target, 1 if it's greater, and 0 if
+ * they are equal.
+ */
+int
+aggsum_compare(aggsum_t *as, uint64_t target)
+{
+ if (as->as_upper_bound < target)
+ return (-1);
+ if (as->as_lower_bound > target)
+ return (1);
+ mutex_enter(&as->as_lock);
+ for (int i = 0; i < as->as_numbuckets; i++) {
+ struct aggsum_bucket *asb = &as->as_buckets[i];
+ mutex_enter(&asb->asc_lock);
+ aggsum_flush_bucket(as, asb);
+ mutex_exit(&asb->asc_lock);
+ if (as->as_upper_bound < target) {
+ mutex_exit(&as->as_lock);
+ return (-1);
+ }
+ if (as->as_lower_bound > target) {
+ mutex_exit(&as->as_lock);
+ return (1);
+ }
+ }
+ VERIFY3U(as->as_lower_bound, ==, as->as_upper_bound);
+ ASSERT3U(as->as_lower_bound, ==, target);
+ mutex_exit(&as->as_lock);
+ return (0);
+}
diff --git a/sys/contrib/openzfs/module/zfs/arc.c b/sys/contrib/openzfs/module/zfs/arc.c
new file mode 100644
index 000000000000..b4f0c8a85b64
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/arc.c
@@ -0,0 +1,10768 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2018, Joyent, Inc.
+ * Copyright (c) 2011, 2020, Delphix. All rights reserved.
+ * Copyright (c) 2014, Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2017, Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2019, loli10K <ezomori.nozomu@gmail.com>. All rights reserved.
+ * Copyright (c) 2020, George Amanakis. All rights reserved.
+ * Copyright (c) 2019, Klara Inc.
+ * Copyright (c) 2019, Allan Jude
+ * Copyright (c) 2020, The FreeBSD Foundation [1]
+ *
+ * [1] Portions of this software were developed by Allan Jude
+ * under sponsorship from the FreeBSD Foundation.
+ */
+
+/*
+ * DVA-based Adjustable Replacement Cache
+ *
+ * While much of the theory of operation used here is
+ * based on the self-tuning, low overhead replacement cache
+ * presented by Megiddo and Modha at FAST 2003, there are some
+ * significant differences:
+ *
+ * 1. The Megiddo and Modha model assumes any page is evictable.
+ * Pages in its cache cannot be "locked" into memory. This makes
+ * the eviction algorithm simple: evict the last page in the list.
+ * This also makes the performance characteristics easy to reason
+ * about. Our cache is not so simple. At any given moment, some
+ * subset of the blocks in the cache are un-evictable because we
+ * have handed out a reference to them. Blocks are only evictable
+ * when there are no external references active. This makes
+ * eviction far more problematic: we choose to evict the evictable
+ * blocks that are the "lowest" in the list.
+ *
+ * There are times when it is not possible to evict the requested
+ * space. In these circumstances we are unable to adjust the cache
+ * size. To prevent the cache growing unbounded at these times we
+ * implement a "cache throttle" that slows the flow of new data
+ * into the cache until we can make space available.
+ *
+ * 2. The Megiddo and Modha model assumes a fixed cache size.
+ * Pages are evicted when the cache is full and there is a cache
+ * miss. Our model has a variable sized cache. It grows with
+ * high use, but also tries to react to memory pressure from the
+ * operating system: decreasing its size when system memory is
+ * tight.
+ *
+ * 3. The Megiddo and Modha model assumes a fixed page size. All
+ * elements of the cache are therefore exactly the same size. So
+ * when adjusting the cache size following a cache miss, it's simply
+ * a matter of choosing a single page to evict. In our model, we
+ * have variable sized cache blocks (ranging from 512 bytes to
+ * 128K bytes). We therefore choose a set of blocks to evict to make
+ * space for a cache miss that approximates as closely as possible
+ * the space used by the new block.
+ *
+ * See also: "ARC: A Self-Tuning, Low Overhead Replacement Cache"
+ * by N. Megiddo & D. Modha, FAST 2003
+ */
+
+/*
+ * The locking model:
+ *
+ * A new reference to a cache buffer can be obtained in two
+ * ways: 1) via a hash table lookup using the DVA as a key,
+ * or 2) via one of the ARC lists. The arc_read() interface
+ * uses method 1, while the internal ARC algorithms for
+ * adjusting the cache use method 2. We therefore provide two
+ * types of locks: 1) the hash table lock array, and 2) the
+ * ARC list locks.
+ *
+ * Buffers do not have their own mutexes, rather they rely on the
+ * hash table mutexes for the bulk of their protection (i.e. most
+ * fields in the arc_buf_hdr_t are protected by these mutexes).
+ *
+ * buf_hash_find() returns the appropriate mutex (held) when it
+ * locates the requested buffer in the hash table. It returns
+ * NULL for the mutex if the buffer was not in the table.
+ *
+ * buf_hash_remove() expects the appropriate hash mutex to be
+ * already held before it is invoked.
+ *
+ * Each ARC state also has a mutex which is used to protect the
+ * buffer list associated with the state. When attempting to
+ * obtain a hash table lock while holding an ARC list lock you
+ * must use: mutex_tryenter() to avoid deadlock. Also note that
+ * the active state mutex must be held before the ghost state mutex.
+ *
+ * It is also possible to register a callback which is run when the
+ * arc_meta_limit is reached and no buffers can be safely evicted. In
+ * this case the arc user should drop a reference on some arc buffers so
+ * they can be reclaimed and the arc_meta_limit honored. For example,
+ * when using the ZPL each dentry holds a reference on a znode. These
+ * dentries must be pruned before the arc buffer holding the znode can
+ * be safely evicted.
+ *
+ * Note that the majority of the performance stats are manipulated
+ * with atomic operations.
+ *
+ * The L2ARC uses the l2ad_mtx on each vdev for the following:
+ *
+ * - L2ARC buflist creation
+ * - L2ARC buflist eviction
+ * - L2ARC write completion, which walks L2ARC buflists
+ * - ARC header destruction, as it removes from L2ARC buflists
+ * - ARC header release, as it removes from L2ARC buflists
+ */
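A small sketch of the hash-lock discipline described above (the helper is hypothetical; buf_hash_find() is the static routine defined further down in this file):

/*
 * Hypothetical lookup helper: on a hit, buf_hash_find() returns with the
 * appropriate hash lock held, so the caller is responsible for dropping it.
 * No ARC list lock is held here, so plain mutex_exit() ordering suffices.
 */
static arc_buf_hdr_t *
example_lookup(uint64_t spa_guid, const blkptr_t *bp)
{
        kmutex_t *hash_lock;
        arc_buf_hdr_t *hdr;

        hdr = buf_hash_find(spa_guid, bp, &hash_lock);
        if (hdr != NULL) {
                /* ... examine hdr while the hash lock protects it ... */
                mutex_exit(hash_lock);
        }
        return (hdr);
}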
+
+/*
+ * ARC operation:
+ *
+ * Every block that is in the ARC is tracked by an arc_buf_hdr_t structure.
+ * This structure can point either to a block that is still in the cache or to
+ * one that is only accessible in an L2 ARC device, or it can provide
+ * information about a block that was recently evicted. If a block is
+ * only accessible in the L2ARC, then the arc_buf_hdr_t only has enough
+ * information to retrieve it from the L2ARC device. This information is
+ * stored in the l2arc_buf_hdr_t sub-structure of the arc_buf_hdr_t. A block
+ * that is in this state cannot access the data directly.
+ *
+ * Blocks that are actively being referenced or have not been evicted
+ * are cached in the L1ARC. The L1ARC (l1arc_buf_hdr_t) is a structure within
+ * the arc_buf_hdr_t that will point to the data block in memory. A block can
+ * only be read by a consumer if it has an l1arc_buf_hdr_t. The L1ARC
+ * caches data in two ways -- in a list of ARC buffers (arc_buf_t) and
+ * also in the arc_buf_hdr_t's private physical data block pointer (b_pabd).
+ *
+ * The L1ARC's data pointer may or may not be uncompressed. The ARC has the
+ * ability to store the physical data (b_pabd) associated with the DVA of the
+ * arc_buf_hdr_t. Since the b_pabd is a copy of the on-disk physical block,
+ * it will match its on-disk compression characteristics. This behavior can be
+ * disabled by setting 'zfs_compressed_arc_enabled' to B_FALSE. When the
+ * compressed ARC functionality is disabled, the b_pabd will point to an
+ * uncompressed version of the on-disk data.
+ *
+ * Data in the L1ARC is not accessed by consumers of the ARC directly. Each
+ * arc_buf_hdr_t can have multiple ARC buffers (arc_buf_t) which reference it.
+ * Each ARC buffer (arc_buf_t) is being actively accessed by a specific ARC
+ * consumer. The ARC will provide references to this data and will keep it
+ * cached until it is no longer in use. The ARC caches only the L1ARC's physical
+ * data block and will evict any arc_buf_t that is no longer referenced. The
+ * amount of memory consumed by the arc_buf_ts' data buffers can be seen via the
+ * "overhead_size" kstat.
+ *
+ * Depending on the consumer, an arc_buf_t can be requested in uncompressed or
+ * compressed form. The typical case is that consumers will want uncompressed
+ * data, and when that happens a new data buffer is allocated where the data is
+ * decompressed for them to use. Currently the only consumer who wants
+ * compressed arc_buf_t's is "zfs send", when it streams data exactly as it
+ * exists on disk. When this happens, the arc_buf_t's data buffer is shared
+ * with the arc_buf_hdr_t.
+ *
+ * Here is a diagram showing an arc_buf_hdr_t referenced by two arc_buf_t's. The
+ * first one is owned by a compressed send consumer (and therefore references
+ * the same compressed data buffer as the arc_buf_hdr_t) and the second could be
+ * used by any other consumer (and has its own uncompressed copy of the data
+ * buffer).
+ *
+ * arc_buf_hdr_t
+ * +-----------+
+ * | fields |
+ * | common to |
+ * | L1- and |
+ * | L2ARC |
+ * +-----------+
+ * | l2arc_buf_hdr_t
+ * | |
+ * +-----------+
+ * | l1arc_buf_hdr_t
+ * | | arc_buf_t
+ * | b_buf +------------>+-----------+ arc_buf_t
+ * | b_pabd +-+ |b_next +---->+-----------+
+ * +-----------+ | |-----------| |b_next +-->NULL
+ * | |b_comp = T | +-----------+
+ * | |b_data +-+ |b_comp = F |
+ * | +-----------+ | |b_data +-+
+ * +->+------+ | +-----------+ |
+ * compressed | | | |
+ * data | |<--------------+ | uncompressed
+ * +------+ compressed, | data
+ * shared +-->+------+
+ * data | |
+ * | |
+ * +------+
+ *
+ * When a consumer reads a block, the ARC must first look to see if the
+ * arc_buf_hdr_t is cached. If the hdr is cached then the ARC allocates a new
+ * arc_buf_t and either copies uncompressed data into a new data buffer from an
+ * existing uncompressed arc_buf_t, decompresses the hdr's b_pabd buffer into a
+ * new data buffer, or shares the hdr's b_pabd buffer, depending on whether the
+ * hdr is compressed and the desired compression characteristics of the
+ * arc_buf_t consumer. If the arc_buf_t ends up sharing data with the
+ * arc_buf_hdr_t and both of them are uncompressed then the arc_buf_t must be
+ * the last buffer in the hdr's b_buf list, however a shared compressed buf can
+ * be anywhere in the hdr's list.
+ *
+ * The diagram below shows an example of an uncompressed ARC hdr that is
+ * sharing its data with an arc_buf_t (note that the shared uncompressed buf is
+ * the last element in the buf list):
+ *
+ * arc_buf_hdr_t
+ * +-----------+
+ * | |
+ * | |
+ * | |
+ * +-----------+
+ * l2arc_buf_hdr_t| |
+ * | |
+ * +-----------+
+ * l1arc_buf_hdr_t| |
+ * | | arc_buf_t (shared)
+ * | b_buf +------------>+---------+ arc_buf_t
+ * | | |b_next +---->+---------+
+ * | b_pabd +-+ |---------| |b_next +-->NULL
+ * +-----------+ | | | +---------+
+ * | |b_data +-+ | |
+ * | +---------+ | |b_data +-+
+ * +->+------+ | +---------+ |
+ * | | | |
+ * uncompressed | | | |
+ * data +------+ | |
+ * ^ +->+------+ |
+ * | uncompressed | | |
+ * | data | | |
+ * | +------+ |
+ * +---------------------------------+
+ *
+ * Writing to the ARC requires that the ARC first discard the hdr's b_pabd
+ * since the physical block is about to be rewritten. The new data contents
+ * will be contained in the arc_buf_t. As the I/O pipeline performs the write,
+ * it may compress the data before writing it to disk. The ARC will be called
+ * with the transformed data and will bcopy the transformed on-disk block into
+ * a newly allocated b_pabd. Writes are always done into buffers which have
+ * either been loaned (and hence are new and don't have other readers) or
+ * buffers which have been released (and hence have their own hdr, if there
+ * were originally other readers of the buf's original hdr). This ensures that
+ * the ARC only needs to update a single buf and its hdr after a write occurs.
+ *
+ * When the L2ARC is in use, it will also take advantage of the b_pabd. The
+ * L2ARC will always write the contents of b_pabd to the L2ARC. This means
+ * that when compressed ARC is enabled that the L2ARC blocks are identical
+ * to the on-disk block in the main data pool. This provides a significant
+ * advantage since the ARC can leverage the bp's checksum when reading from the
+ * L2ARC to determine if the contents are valid. However, if the compressed
+ * ARC is disabled, then the L2ARC's block must be transformed to look
+ * like the physical block in the main data pool before comparing the
+ * checksum and determining its validity.
+ *
+ * The L1ARC has a slightly different system for storing encrypted data.
+ * Raw (encrypted + possibly compressed) data has a few subtle differences from
+ * data that is just compressed. The biggest difference is that it is not
+ * possible to decrypt encrypted data (or vice-versa) if the keys aren't loaded.
+ * The other difference is that encryption cannot be treated as a suggestion.
+ * If a caller would prefer compressed data, but they actually wind up with
+ * uncompressed data the worst thing that could happen is there might be a
+ * performance hit. If the caller requests encrypted data, however, we must be
+ * sure they actually get it or else secret information could be leaked. Raw
+ * data is stored in hdr->b_crypt_hdr.b_rabd. An encrypted header, therefore,
+ * may have both an encrypted version and a decrypted version of its data at
+ * once. When a caller needs a raw arc_buf_t, it is allocated and the data is
+ * copied out of this header. To avoid complications with b_pabd, raw buffers
+ * cannot be shared.
+ */
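A brief sketch of how a consumer can tell which form of data it was handed, using accessors defined later in this file (the helper itself is hypothetical):

/* Hypothetical helper: report whether a buf holds compressed or logical data. */
static void
example_describe_buf(arc_buf_t *buf)
{
        uint64_t size = arc_buf_size(buf);      /* psize if compressed */
        uint64_t lsize = arc_buf_lsize(buf);    /* always the logical size */

        if (arc_get_compression(buf) != ZIO_COMPRESS_OFF) {
                /* e.g. a buf handed to a compressed "zfs send" stream */
                ASSERT3U(size, <=, lsize);
        } else {
                /* an uncompressed copy decompressed for the consumer */
                ASSERT3U(size, ==, lsize);
        }
}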
+
+#include <sys/spa.h>
+#include <sys/zio.h>
+#include <sys/spa_impl.h>
+#include <sys/zio_compress.h>
+#include <sys/zio_checksum.h>
+#include <sys/zfs_context.h>
+#include <sys/arc.h>
+#include <sys/zfs_refcount.h>
+#include <sys/vdev.h>
+#include <sys/vdev_impl.h>
+#include <sys/dsl_pool.h>
+#include <sys/zio_checksum.h>
+#include <sys/multilist.h>
+#include <sys/abd.h>
+#include <sys/zil.h>
+#include <sys/fm/fs/zfs.h>
+#include <sys/callb.h>
+#include <sys/kstat.h>
+#include <sys/zthr.h>
+#include <zfs_fletcher.h>
+#include <sys/arc_impl.h>
+#include <sys/trace_zfs.h>
+#include <sys/aggsum.h>
+#include <cityhash.h>
+#include <sys/vdev_trim.h>
+#include <sys/zstd/zstd.h>
+
+#ifndef _KERNEL
+/* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */
+boolean_t arc_watch = B_FALSE;
+#endif
+
+/*
+ * This thread's job is to keep enough free memory in the system, by
+ * calling arc_kmem_reap_soon() plus arc_reduce_target_size(), which improves
+ * arc_available_memory().
+ */
+static zthr_t *arc_reap_zthr;
+
+/*
+ * This thread's job is to keep arc_size under arc_c, by calling
+ * arc_evict(), which improves arc_is_overflowing().
+ */
+static zthr_t *arc_evict_zthr;
+
+static kmutex_t arc_evict_lock;
+static boolean_t arc_evict_needed = B_FALSE;
+
+/*
+ * Count of bytes evicted since boot.
+ */
+static uint64_t arc_evict_count;
+
+/*
+ * List of arc_evict_waiter_t's, representing threads waiting for the
+ * arc_evict_count to reach specific values.
+ */
+static list_t arc_evict_waiters;
+
+/*
+ * When arc_is_overflowing(), arc_get_data_impl() waits for this percent of
+ * the requested amount of data to be evicted. For example, by default for
+ * every 2KB that's evicted, 1KB of it may be "reused" by a new allocation.
+ * Since this is above 100%, it ensures that progress is made towards getting
+ * arc_size under arc_c. Since this is finite, it ensures that allocations
+ * can still happen, even during the potentially long time that arc_size is
+ * more than arc_c.
+ */
+int zfs_arc_eviction_pct = 200;
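As a rough worked example of the default (200): a thread that needs 64 KB while the ARC is overflowing waits for 64 KB * 200 / 100 = 128 KB to be evicted, so at most half of the freed space can be consumed by its own allocation.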
+
+/*
+ * The number of headers to evict in arc_evict_state_impl() before
+ * dropping the sublist lock and evicting from another sublist. A lower
+ * value means we're more likely to evict the "correct" header (i.e. the
+ * oldest header in the arc state), but comes with higher overhead
+ * (i.e. more invocations of arc_evict_state_impl()).
+ */
+int zfs_arc_evict_batch_limit = 10;
+
+/* number of seconds before growing cache again */
+int arc_grow_retry = 5;
+
+/*
+ * Minimum time between calls to arc_kmem_reap_soon().
+ */
+int arc_kmem_cache_reap_retry_ms = 1000;
+
+/* shift of arc_c for calculating overflow limit in arc_get_data_impl */
+int zfs_arc_overflow_shift = 8;
+
+/* shift of arc_c for calculating both min and max arc_p */
+int arc_p_min_shift = 4;
+
+/* log2(fraction of arc to reclaim) */
+int arc_shrink_shift = 7;
+
+/* percent of pagecache to reclaim arc to */
+#ifdef _KERNEL
+uint_t zfs_arc_pc_percent = 0;
+#endif
+
+/*
+ * log2(fraction of ARC which must be free to allow growing).
+ * I.e., if there is less than arc_c >> arc_no_grow_shift free memory,
+ * when reading a new block into the ARC, we will evict an equal-sized block
+ * from the ARC.
+ *
+ * This must be less than arc_shrink_shift, so that when we shrink the ARC,
+ * we will still not allow it to grow.
+ */
+int arc_no_grow_shift = 5;
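For a concrete sense of scale, assume arc_c is 4 GB: growth is blocked unless at least arc_c >> 5 = 128 MB is free, while one shrink step reclaims arc_c >> 7 = 32 MB; because 32 MB is well below 128 MB, a single shrink cannot by itself create enough free memory to re-enable growth.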
+
+
+/*
+ * minimum lifespan of a prefetch block in clock ticks
+ * (initialized in arc_init())
+ */
+static int arc_min_prefetch_ms;
+static int arc_min_prescient_prefetch_ms;
+
+/*
+ * If this percent of memory is free, don't throttle.
+ */
+int arc_lotsfree_percent = 10;
+
+/*
+ * The arc has filled available memory and has now warmed up.
+ */
+boolean_t arc_warm;
+
+/*
+ * These tunables are for performance analysis.
+ */
+unsigned long zfs_arc_max = 0;
+unsigned long zfs_arc_min = 0;
+unsigned long zfs_arc_meta_limit = 0;
+unsigned long zfs_arc_meta_min = 0;
+unsigned long zfs_arc_dnode_limit = 0;
+unsigned long zfs_arc_dnode_reduce_percent = 10;
+int zfs_arc_grow_retry = 0;
+int zfs_arc_shrink_shift = 0;
+int zfs_arc_p_min_shift = 0;
+int zfs_arc_average_blocksize = 8 * 1024; /* 8KB */
+
+/*
+ * ARC dirty data constraints for arc_tempreserve_space() throttle.
+ */
+unsigned long zfs_arc_dirty_limit_percent = 50; /* total dirty data limit */
+unsigned long zfs_arc_anon_limit_percent = 25; /* anon block dirty limit */
+unsigned long zfs_arc_pool_dirty_percent = 20; /* each pool's anon allowance */
+
+/*
+ * Enable or disable compressed arc buffers.
+ */
+int zfs_compressed_arc_enabled = B_TRUE;
+
+/*
+ * ARC will evict meta buffers that exceed arc_meta_limit. This
+ * tunable makes arc_meta_limit adjustable for different workloads.
+ */
+unsigned long zfs_arc_meta_limit_percent = 75;
+
+/*
+ * Percentage that can be consumed by dnodes of ARC meta buffers.
+ */
+unsigned long zfs_arc_dnode_limit_percent = 10;
+
+/*
+ * These tunables are Linux specific
+ */
+unsigned long zfs_arc_sys_free = 0;
+int zfs_arc_min_prefetch_ms = 0;
+int zfs_arc_min_prescient_prefetch_ms = 0;
+int zfs_arc_p_dampener_disable = 1;
+int zfs_arc_meta_prune = 10000;
+int zfs_arc_meta_strategy = ARC_STRATEGY_META_BALANCED;
+int zfs_arc_meta_adjust_restarts = 4096;
+int zfs_arc_lotsfree_percent = 10;
+
+/* The 6 states: */
+arc_state_t ARC_anon;
+arc_state_t ARC_mru;
+arc_state_t ARC_mru_ghost;
+arc_state_t ARC_mfu;
+arc_state_t ARC_mfu_ghost;
+arc_state_t ARC_l2c_only;
+
+arc_stats_t arc_stats = {
+ { "hits", KSTAT_DATA_UINT64 },
+ { "misses", KSTAT_DATA_UINT64 },
+ { "demand_data_hits", KSTAT_DATA_UINT64 },
+ { "demand_data_misses", KSTAT_DATA_UINT64 },
+ { "demand_metadata_hits", KSTAT_DATA_UINT64 },
+ { "demand_metadata_misses", KSTAT_DATA_UINT64 },
+ { "prefetch_data_hits", KSTAT_DATA_UINT64 },
+ { "prefetch_data_misses", KSTAT_DATA_UINT64 },
+ { "prefetch_metadata_hits", KSTAT_DATA_UINT64 },
+ { "prefetch_metadata_misses", KSTAT_DATA_UINT64 },
+ { "mru_hits", KSTAT_DATA_UINT64 },
+ { "mru_ghost_hits", KSTAT_DATA_UINT64 },
+ { "mfu_hits", KSTAT_DATA_UINT64 },
+ { "mfu_ghost_hits", KSTAT_DATA_UINT64 },
+ { "deleted", KSTAT_DATA_UINT64 },
+ { "mutex_miss", KSTAT_DATA_UINT64 },
+ { "access_skip", KSTAT_DATA_UINT64 },
+ { "evict_skip", KSTAT_DATA_UINT64 },
+ { "evict_not_enough", KSTAT_DATA_UINT64 },
+ { "evict_l2_cached", KSTAT_DATA_UINT64 },
+ { "evict_l2_eligible", KSTAT_DATA_UINT64 },
+ { "evict_l2_eligible_mfu", KSTAT_DATA_UINT64 },
+ { "evict_l2_eligible_mru", KSTAT_DATA_UINT64 },
+ { "evict_l2_ineligible", KSTAT_DATA_UINT64 },
+ { "evict_l2_skip", KSTAT_DATA_UINT64 },
+ { "hash_elements", KSTAT_DATA_UINT64 },
+ { "hash_elements_max", KSTAT_DATA_UINT64 },
+ { "hash_collisions", KSTAT_DATA_UINT64 },
+ { "hash_chains", KSTAT_DATA_UINT64 },
+ { "hash_chain_max", KSTAT_DATA_UINT64 },
+ { "p", KSTAT_DATA_UINT64 },
+ { "c", KSTAT_DATA_UINT64 },
+ { "c_min", KSTAT_DATA_UINT64 },
+ { "c_max", KSTAT_DATA_UINT64 },
+ { "size", KSTAT_DATA_UINT64 },
+ { "compressed_size", KSTAT_DATA_UINT64 },
+ { "uncompressed_size", KSTAT_DATA_UINT64 },
+ { "overhead_size", KSTAT_DATA_UINT64 },
+ { "hdr_size", KSTAT_DATA_UINT64 },
+ { "data_size", KSTAT_DATA_UINT64 },
+ { "metadata_size", KSTAT_DATA_UINT64 },
+ { "dbuf_size", KSTAT_DATA_UINT64 },
+ { "dnode_size", KSTAT_DATA_UINT64 },
+ { "bonus_size", KSTAT_DATA_UINT64 },
+#if defined(COMPAT_FREEBSD11)
+ { "other_size", KSTAT_DATA_UINT64 },
+#endif
+ { "anon_size", KSTAT_DATA_UINT64 },
+ { "anon_evictable_data", KSTAT_DATA_UINT64 },
+ { "anon_evictable_metadata", KSTAT_DATA_UINT64 },
+ { "mru_size", KSTAT_DATA_UINT64 },
+ { "mru_evictable_data", KSTAT_DATA_UINT64 },
+ { "mru_evictable_metadata", KSTAT_DATA_UINT64 },
+ { "mru_ghost_size", KSTAT_DATA_UINT64 },
+ { "mru_ghost_evictable_data", KSTAT_DATA_UINT64 },
+ { "mru_ghost_evictable_metadata", KSTAT_DATA_UINT64 },
+ { "mfu_size", KSTAT_DATA_UINT64 },
+ { "mfu_evictable_data", KSTAT_DATA_UINT64 },
+ { "mfu_evictable_metadata", KSTAT_DATA_UINT64 },
+ { "mfu_ghost_size", KSTAT_DATA_UINT64 },
+ { "mfu_ghost_evictable_data", KSTAT_DATA_UINT64 },
+ { "mfu_ghost_evictable_metadata", KSTAT_DATA_UINT64 },
+ { "l2_hits", KSTAT_DATA_UINT64 },
+ { "l2_misses", KSTAT_DATA_UINT64 },
+ { "l2_prefetch_asize", KSTAT_DATA_UINT64 },
+ { "l2_mru_asize", KSTAT_DATA_UINT64 },
+ { "l2_mfu_asize", KSTAT_DATA_UINT64 },
+ { "l2_bufc_data_asize", KSTAT_DATA_UINT64 },
+ { "l2_bufc_metadata_asize", KSTAT_DATA_UINT64 },
+ { "l2_feeds", KSTAT_DATA_UINT64 },
+ { "l2_rw_clash", KSTAT_DATA_UINT64 },
+ { "l2_read_bytes", KSTAT_DATA_UINT64 },
+ { "l2_write_bytes", KSTAT_DATA_UINT64 },
+ { "l2_writes_sent", KSTAT_DATA_UINT64 },
+ { "l2_writes_done", KSTAT_DATA_UINT64 },
+ { "l2_writes_error", KSTAT_DATA_UINT64 },
+ { "l2_writes_lock_retry", KSTAT_DATA_UINT64 },
+ { "l2_evict_lock_retry", KSTAT_DATA_UINT64 },
+ { "l2_evict_reading", KSTAT_DATA_UINT64 },
+ { "l2_evict_l1cached", KSTAT_DATA_UINT64 },
+ { "l2_free_on_write", KSTAT_DATA_UINT64 },
+ { "l2_abort_lowmem", KSTAT_DATA_UINT64 },
+ { "l2_cksum_bad", KSTAT_DATA_UINT64 },
+ { "l2_io_error", KSTAT_DATA_UINT64 },
+ { "l2_size", KSTAT_DATA_UINT64 },
+ { "l2_asize", KSTAT_DATA_UINT64 },
+ { "l2_hdr_size", KSTAT_DATA_UINT64 },
+ { "l2_log_blk_writes", KSTAT_DATA_UINT64 },
+ { "l2_log_blk_avg_asize", KSTAT_DATA_UINT64 },
+ { "l2_log_blk_asize", KSTAT_DATA_UINT64 },
+ { "l2_log_blk_count", KSTAT_DATA_UINT64 },
+ { "l2_data_to_meta_ratio", KSTAT_DATA_UINT64 },
+ { "l2_rebuild_success", KSTAT_DATA_UINT64 },
+ { "l2_rebuild_unsupported", KSTAT_DATA_UINT64 },
+ { "l2_rebuild_io_errors", KSTAT_DATA_UINT64 },
+ { "l2_rebuild_dh_errors", KSTAT_DATA_UINT64 },
+ { "l2_rebuild_cksum_lb_errors", KSTAT_DATA_UINT64 },
+ { "l2_rebuild_lowmem", KSTAT_DATA_UINT64 },
+ { "l2_rebuild_size", KSTAT_DATA_UINT64 },
+ { "l2_rebuild_asize", KSTAT_DATA_UINT64 },
+ { "l2_rebuild_bufs", KSTAT_DATA_UINT64 },
+ { "l2_rebuild_bufs_precached", KSTAT_DATA_UINT64 },
+ { "l2_rebuild_log_blks", KSTAT_DATA_UINT64 },
+ { "memory_throttle_count", KSTAT_DATA_UINT64 },
+ { "memory_direct_count", KSTAT_DATA_UINT64 },
+ { "memory_indirect_count", KSTAT_DATA_UINT64 },
+ { "memory_all_bytes", KSTAT_DATA_UINT64 },
+ { "memory_free_bytes", KSTAT_DATA_UINT64 },
+ { "memory_available_bytes", KSTAT_DATA_INT64 },
+ { "arc_no_grow", KSTAT_DATA_UINT64 },
+ { "arc_tempreserve", KSTAT_DATA_UINT64 },
+ { "arc_loaned_bytes", KSTAT_DATA_UINT64 },
+ { "arc_prune", KSTAT_DATA_UINT64 },
+ { "arc_meta_used", KSTAT_DATA_UINT64 },
+ { "arc_meta_limit", KSTAT_DATA_UINT64 },
+ { "arc_dnode_limit", KSTAT_DATA_UINT64 },
+ { "arc_meta_max", KSTAT_DATA_UINT64 },
+ { "arc_meta_min", KSTAT_DATA_UINT64 },
+ { "async_upgrade_sync", KSTAT_DATA_UINT64 },
+ { "demand_hit_predictive_prefetch", KSTAT_DATA_UINT64 },
+ { "demand_hit_prescient_prefetch", KSTAT_DATA_UINT64 },
+ { "arc_need_free", KSTAT_DATA_UINT64 },
+ { "arc_sys_free", KSTAT_DATA_UINT64 },
+ { "arc_raw_size", KSTAT_DATA_UINT64 },
+ { "cached_only_in_progress", KSTAT_DATA_UINT64 },
+ { "abd_chunk_waste_size", KSTAT_DATA_UINT64 },
+};
+
+#define ARCSTAT_MAX(stat, val) { \
+ uint64_t m; \
+ while ((val) > (m = arc_stats.stat.value.ui64) && \
+ (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val)))) \
+ continue; \
+}
+
+#define ARCSTAT_MAXSTAT(stat) \
+ ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64)
+
+/*
+ * We define a macro to allow ARC hits/misses to be easily broken down by
+ * two separate conditions, giving a total of four different subtypes for
+ * each of hits and misses (so eight statistics total).
+ */
+#define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
+ if (cond1) { \
+ if (cond2) { \
+ ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \
+ } else { \
+ ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
+ } \
+ } else { \
+ if (cond2) { \
+ ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
+ } else { \
+ ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
+ } \
+ }
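As an illustration of the macro above (a hypothetical invocation; the real call sites appear later in this file):

/* Hypothetical helper: classify a hit as demand/prefetch x data/metadata. */
static void
example_count_hit(arc_buf_hdr_t *hdr)
{
        /*
         * Expands to bump exactly one of arcstat_demand_data_hits,
         * arcstat_demand_metadata_hits, arcstat_prefetch_data_hits or
         * arcstat_prefetch_metadata_hits.
         */
        ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), demand, prefetch,
            HDR_ISTYPE_METADATA(hdr), metadata, data, hits);
}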
+
+/*
+ * This macro allows us to use kstats as floating averages. Each time we
+ * update this kstat, we first factor it and the update value by
+ * ARCSTAT_AVG_FACTOR to shrink the new value's contribution to the overall
+ * average. This macro assumes that integer loads and stores are atomic, but
+ * is not safe for multiple writers updating the kstat in parallel (only the
+ * last writer's update will remain).
+ */
+#define ARCSTAT_F_AVG_FACTOR 3
+#define ARCSTAT_F_AVG(stat, value) \
+ do { \
+ uint64_t x = ARCSTAT(stat); \
+ x = x - x / ARCSTAT_F_AVG_FACTOR + \
+ (value) / ARCSTAT_F_AVG_FACTOR; \
+ ARCSTAT(stat) = x; \
+ _NOTE(CONSTCOND) \
+ } while (0)
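A worked update with ARCSTAT_F_AVG_FACTOR of 3: if the stored average is 900 and the new sample is 300, the macro computes 900 - 900/3 + 300/3 = 700, i.e. an exponential moving average that weights each new sample by 1/3.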
+
+kstat_t *arc_ksp;
+static arc_state_t *arc_anon;
+static arc_state_t *arc_mru_ghost;
+static arc_state_t *arc_mfu_ghost;
+static arc_state_t *arc_l2c_only;
+
+arc_state_t *arc_mru;
+arc_state_t *arc_mfu;
+
+/*
+ * There are several ARC variables that are critical to export as kstats --
+ * but we don't want to have to grovel around in the kstat whenever we wish to
+ * manipulate them. For these variables, we therefore define them to be in
+ * terms of the statistic variable. This assures that we are not introducing
+ * the possibility of inconsistency by having shadow copies of the variables,
+ * while still allowing the code to be readable.
+ */
+#define arc_tempreserve ARCSTAT(arcstat_tempreserve)
+#define arc_loaned_bytes ARCSTAT(arcstat_loaned_bytes)
+#define arc_meta_limit ARCSTAT(arcstat_meta_limit) /* max size for metadata */
+/* max size for dnodes */
+#define arc_dnode_size_limit ARCSTAT(arcstat_dnode_limit)
+#define arc_meta_min ARCSTAT(arcstat_meta_min) /* min size for metadata */
+#define arc_meta_max ARCSTAT(arcstat_meta_max) /* max size of metadata */
+#define arc_need_free ARCSTAT(arcstat_need_free) /* waiting to be evicted */
+
+/* size of all b_rabd's in entire arc */
+#define arc_raw_size ARCSTAT(arcstat_raw_size)
+/* compressed size of entire arc */
+#define arc_compressed_size ARCSTAT(arcstat_compressed_size)
+/* uncompressed size of entire arc */
+#define arc_uncompressed_size ARCSTAT(arcstat_uncompressed_size)
+/* number of bytes in the arc from arc_buf_t's */
+#define arc_overhead_size ARCSTAT(arcstat_overhead_size)
+
+/*
+ * There are also some ARC variables that we want to export, but that are
+ * updated so often that having the canonical representation be the statistic
+ * variable causes a performance bottleneck. We want to use aggsum_t's for these
+ * instead, but still be able to export the kstat in the same way as before.
+ * The solution is to always use the aggsum version, except in the kstat update
+ * callback.
+ */
+aggsum_t arc_size;
+aggsum_t arc_meta_used;
+aggsum_t astat_data_size;
+aggsum_t astat_metadata_size;
+aggsum_t astat_dbuf_size;
+aggsum_t astat_dnode_size;
+aggsum_t astat_bonus_size;
+aggsum_t astat_hdr_size;
+aggsum_t astat_l2_hdr_size;
+aggsum_t astat_abd_chunk_waste_size;
+
+hrtime_t arc_growtime;
+list_t arc_prune_list;
+kmutex_t arc_prune_mtx;
+taskq_t *arc_prune_taskq;
+
+#define GHOST_STATE(state) \
+ ((state) == arc_mru_ghost || (state) == arc_mfu_ghost || \
+ (state) == arc_l2c_only)
+
+#define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_FLAG_IN_HASH_TABLE)
+#define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS)
+#define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_FLAG_IO_ERROR)
+#define HDR_PREFETCH(hdr) ((hdr)->b_flags & ARC_FLAG_PREFETCH)
+#define HDR_PRESCIENT_PREFETCH(hdr) \
+ ((hdr)->b_flags & ARC_FLAG_PRESCIENT_PREFETCH)
+#define HDR_COMPRESSION_ENABLED(hdr) \
+ ((hdr)->b_flags & ARC_FLAG_COMPRESSED_ARC)
+
+#define HDR_L2CACHE(hdr) ((hdr)->b_flags & ARC_FLAG_L2CACHE)
+#define HDR_L2_READING(hdr) \
+ (((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) && \
+ ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR))
+#define HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITING)
+#define HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_FLAG_L2_EVICTED)
+#define HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITE_HEAD)
+#define HDR_PROTECTED(hdr) ((hdr)->b_flags & ARC_FLAG_PROTECTED)
+#define HDR_NOAUTH(hdr) ((hdr)->b_flags & ARC_FLAG_NOAUTH)
+#define HDR_SHARED_DATA(hdr) ((hdr)->b_flags & ARC_FLAG_SHARED_DATA)
+
+#define HDR_ISTYPE_METADATA(hdr) \
+ ((hdr)->b_flags & ARC_FLAG_BUFC_METADATA)
+#define HDR_ISTYPE_DATA(hdr) (!HDR_ISTYPE_METADATA(hdr))
+
+#define HDR_HAS_L1HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L1HDR)
+#define HDR_HAS_L2HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR)
+#define HDR_HAS_RABD(hdr) \
+ (HDR_HAS_L1HDR(hdr) && HDR_PROTECTED(hdr) && \
+ (hdr)->b_crypt_hdr.b_rabd != NULL)
+#define HDR_ENCRYPTED(hdr) \
+ (HDR_PROTECTED(hdr) && DMU_OT_IS_ENCRYPTED((hdr)->b_crypt_hdr.b_ot))
+#define HDR_AUTHENTICATED(hdr) \
+ (HDR_PROTECTED(hdr) && !DMU_OT_IS_ENCRYPTED((hdr)->b_crypt_hdr.b_ot))
+
+/* For storing compression mode in b_flags */
+#define HDR_COMPRESS_OFFSET (highbit64(ARC_FLAG_COMPRESS_0) - 1)
+
+#define HDR_GET_COMPRESS(hdr) ((enum zio_compress)BF32_GET((hdr)->b_flags, \
+ HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS))
+#define HDR_SET_COMPRESS(hdr, cmp) BF32_SET((hdr)->b_flags, \
+ HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS, (cmp));
+
+#define ARC_BUF_LAST(buf) ((buf)->b_next == NULL)
+#define ARC_BUF_SHARED(buf) ((buf)->b_flags & ARC_BUF_FLAG_SHARED)
+#define ARC_BUF_COMPRESSED(buf) ((buf)->b_flags & ARC_BUF_FLAG_COMPRESSED)
+#define ARC_BUF_ENCRYPTED(buf) ((buf)->b_flags & ARC_BUF_FLAG_ENCRYPTED)
+
+/*
+ * Other sizes
+ */
+
+#define HDR_FULL_CRYPT_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
+#define HDR_FULL_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_crypt_hdr))
+#define HDR_L2ONLY_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_l1hdr))
+
+/*
+ * Hash table routines
+ */
+
+#define HT_LOCK_ALIGN 64
+#define HT_LOCK_PAD (P2NPHASE(sizeof (kmutex_t), (HT_LOCK_ALIGN)))
+
+struct ht_lock {
+ kmutex_t ht_lock;
+#ifdef _KERNEL
+ unsigned char pad[HT_LOCK_PAD];
+#endif
+};
+
+#define BUF_LOCKS 8192
+typedef struct buf_hash_table {
+ uint64_t ht_mask;
+ arc_buf_hdr_t **ht_table;
+ struct ht_lock ht_locks[BUF_LOCKS];
+} buf_hash_table_t;
+
+static buf_hash_table_t buf_hash_table;
+
+#define BUF_HASH_INDEX(spa, dva, birth) \
+ (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
+#define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)])
+#define BUF_HASH_LOCK(idx) (&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
+#define HDR_LOCK(hdr) \
+ (BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth)))
+
+uint64_t zfs_crc64_table[256];
+
+/*
+ * Level 2 ARC
+ */
+
+#define L2ARC_WRITE_SIZE (8 * 1024 * 1024) /* initial write max */
+#define L2ARC_HEADROOM 2 /* num of writes */
+
+/*
+ * If we discover any compressed buffers during an ARC scan, we boost
+ * our headroom for the next scanning cycle by this percentage multiple.
+ */
+#define L2ARC_HEADROOM_BOOST 200
+#define L2ARC_FEED_SECS 1 /* caching interval secs */
+#define L2ARC_FEED_MIN_MS 200 /* min caching interval ms */
+
+/*
+ * We can feed L2ARC from two states of ARC buffers, mru and mfu,
+ * and each of these states has two types: data and metadata.
+ */
+#define L2ARC_FEED_TYPES 4
+
+#define l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent)
+#define l2arc_writes_done ARCSTAT(arcstat_l2_writes_done)
+
+/* L2ARC Performance Tunables */
+unsigned long l2arc_write_max = L2ARC_WRITE_SIZE; /* def max write size */
+unsigned long l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra warmup write */
+unsigned long l2arc_headroom = L2ARC_HEADROOM; /* # of dev writes */
+unsigned long l2arc_headroom_boost = L2ARC_HEADROOM_BOOST;
+unsigned long l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */
+unsigned long l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval msecs */
+int l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */
+int l2arc_feed_again = B_TRUE; /* turbo warmup */
+int l2arc_norw = B_FALSE; /* no reads during writes */
+int l2arc_meta_percent = 33; /* limit on headers size */
+
+/*
+ * L2ARC Internals
+ */
+static list_t L2ARC_dev_list; /* device list */
+static list_t *l2arc_dev_list; /* device list pointer */
+static kmutex_t l2arc_dev_mtx; /* device list mutex */
+static l2arc_dev_t *l2arc_dev_last; /* last device used */
+static list_t L2ARC_free_on_write; /* free after write buf list */
+static list_t *l2arc_free_on_write; /* free after write list ptr */
+static kmutex_t l2arc_free_on_write_mtx; /* mutex for list */
+static uint64_t l2arc_ndev; /* number of devices */
+
+typedef struct l2arc_read_callback {
+ arc_buf_hdr_t *l2rcb_hdr; /* read header */
+ blkptr_t l2rcb_bp; /* original blkptr */
+ zbookmark_phys_t l2rcb_zb; /* original bookmark */
+ int l2rcb_flags; /* original flags */
+ abd_t *l2rcb_abd; /* temporary buffer */
+} l2arc_read_callback_t;
+
+typedef struct l2arc_data_free {
+ /* protected by l2arc_free_on_write_mtx */
+ abd_t *l2df_abd;
+ size_t l2df_size;
+ arc_buf_contents_t l2df_type;
+ list_node_t l2df_list_node;
+} l2arc_data_free_t;
+
+typedef enum arc_fill_flags {
+ ARC_FILL_LOCKED = 1 << 0, /* hdr lock is held */
+ ARC_FILL_COMPRESSED = 1 << 1, /* fill with compressed data */
+ ARC_FILL_ENCRYPTED = 1 << 2, /* fill with encrypted data */
+ ARC_FILL_NOAUTH = 1 << 3, /* don't attempt to authenticate */
+ ARC_FILL_IN_PLACE = 1 << 4 /* fill in place (special case) */
+} arc_fill_flags_t;
+
+static kmutex_t l2arc_feed_thr_lock;
+static kcondvar_t l2arc_feed_thr_cv;
+static uint8_t l2arc_thread_exit;
+
+static kmutex_t l2arc_rebuild_thr_lock;
+static kcondvar_t l2arc_rebuild_thr_cv;
+
+enum arc_hdr_alloc_flags {
+ ARC_HDR_ALLOC_RDATA = 0x1,
+ ARC_HDR_DO_ADAPT = 0x2,
+};
+
+
+static abd_t *arc_get_data_abd(arc_buf_hdr_t *, uint64_t, void *, boolean_t);
+static void *arc_get_data_buf(arc_buf_hdr_t *, uint64_t, void *);
+static void arc_get_data_impl(arc_buf_hdr_t *, uint64_t, void *, boolean_t);
+static void arc_free_data_abd(arc_buf_hdr_t *, abd_t *, uint64_t, void *);
+static void arc_free_data_buf(arc_buf_hdr_t *, void *, uint64_t, void *);
+static void arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag);
+static void arc_hdr_free_abd(arc_buf_hdr_t *, boolean_t);
+static void arc_hdr_alloc_abd(arc_buf_hdr_t *, int);
+static void arc_access(arc_buf_hdr_t *, kmutex_t *);
+static void arc_buf_watch(arc_buf_t *);
+
+static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *);
+static uint32_t arc_bufc_to_flags(arc_buf_contents_t);
+static inline void arc_hdr_set_flags(arc_buf_hdr_t *hdr, arc_flags_t flags);
+static inline void arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags);
+
+static boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *);
+static void l2arc_read_done(zio_t *);
+static void l2arc_do_free_on_write(void);
+static void l2arc_hdr_arcstats_update(arc_buf_hdr_t *hdr, boolean_t incr,
+ boolean_t state_only);
+
+#define l2arc_hdr_arcstats_increment(hdr) \
+ l2arc_hdr_arcstats_update((hdr), B_TRUE, B_FALSE)
+#define l2arc_hdr_arcstats_decrement(hdr) \
+ l2arc_hdr_arcstats_update((hdr), B_FALSE, B_FALSE)
+#define l2arc_hdr_arcstats_increment_state(hdr) \
+ l2arc_hdr_arcstats_update((hdr), B_TRUE, B_TRUE)
+#define l2arc_hdr_arcstats_decrement_state(hdr) \
+ l2arc_hdr_arcstats_update((hdr), B_FALSE, B_TRUE)
+
+/*
+ * l2arc_mfuonly : A ZFS module parameter that controls whether only MFU
+ * metadata and data are cached from ARC into L2ARC.
+ */
+int l2arc_mfuonly = 0;
+
+/*
+ * L2ARC TRIM
+ * l2arc_trim_ahead : A ZFS module parameter that controls how much ahead of
+ * the current write size (l2arc_write_max) we should TRIM if we
+ * have filled the device. It is defined as a percentage of the
+ * write size. If set to 100 we trim twice the space required to
+ * accommodate upcoming writes. A minimum of 64MB will be trimmed.
+ * It also enables TRIM of the whole L2ARC device upon creation or
+ * addition to an existing pool or if the header of the device is
+ * invalid upon importing a pool or onlining a cache device. The
+ * default is 0, which disables TRIM on L2ARC altogether as it can
+ * put significant stress on the underlying storage devices. This
+ * will vary depending on how well the specific device handles
+ * these commands.
+ */
+unsigned long l2arc_trim_ahead = 0;
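As a rough worked example of the description above: with the default l2arc_write_max of 8 MB and l2arc_trim_ahead set to 100, a filled device would be trimmed ahead by twice the write size, 16 MB, which is then raised to the 64 MB minimum; only much larger write sizes or percentages push the trimmed region past that floor.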
+
+/*
+ * Performance tuning of L2ARC persistence:
+ *
+ * l2arc_rebuild_enabled : A ZFS module parameter that controls whether adding
+ * an L2ARC device (either at pool import or later) will attempt
+ * to rebuild L2ARC buffer contents.
+ * l2arc_rebuild_blocks_min_l2size : A ZFS module parameter that controls
+ * whether log blocks are written to the L2ARC device. If the L2ARC
+ * device is less than 1GB, the amount of data l2arc_evict()
+ * evicts is significant compared to the amount of restored L2ARC
+ * data. In this case do not write log blocks in L2ARC in order
+ * not to waste space.
+ */
+int l2arc_rebuild_enabled = B_TRUE;
+unsigned long l2arc_rebuild_blocks_min_l2size = 1024 * 1024 * 1024;
+
+/* L2ARC persistence rebuild control routines. */
+void l2arc_rebuild_vdev(vdev_t *vd, boolean_t reopen);
+static void l2arc_dev_rebuild_thread(void *arg);
+static int l2arc_rebuild(l2arc_dev_t *dev);
+
+/* L2ARC persistence read I/O routines. */
+static int l2arc_dev_hdr_read(l2arc_dev_t *dev);
+static int l2arc_log_blk_read(l2arc_dev_t *dev,
+ const l2arc_log_blkptr_t *this_lp, const l2arc_log_blkptr_t *next_lp,
+ l2arc_log_blk_phys_t *this_lb, l2arc_log_blk_phys_t *next_lb,
+ zio_t *this_io, zio_t **next_io);
+static zio_t *l2arc_log_blk_fetch(vdev_t *vd,
+ const l2arc_log_blkptr_t *lp, l2arc_log_blk_phys_t *lb);
+static void l2arc_log_blk_fetch_abort(zio_t *zio);
+
+/* L2ARC persistence block restoration routines. */
+static void l2arc_log_blk_restore(l2arc_dev_t *dev,
+ const l2arc_log_blk_phys_t *lb, uint64_t lb_asize);
+static void l2arc_hdr_restore(const l2arc_log_ent_phys_t *le,
+ l2arc_dev_t *dev);
+
+/* L2ARC persistence write I/O routines. */
+static void l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio,
+ l2arc_write_callback_t *cb);
+
+/* L2ARC persistence auxiliary routines. */
+boolean_t l2arc_log_blkptr_valid(l2arc_dev_t *dev,
+ const l2arc_log_blkptr_t *lbp);
+static boolean_t l2arc_log_blk_insert(l2arc_dev_t *dev,
+ const arc_buf_hdr_t *ab);
+boolean_t l2arc_range_check_overlap(uint64_t bottom,
+ uint64_t top, uint64_t check);
+static void l2arc_blk_fetch_done(zio_t *zio);
+static inline uint64_t
+ l2arc_log_blk_overhead(uint64_t write_sz, l2arc_dev_t *dev);
+
+/*
+ * We use Cityhash for this. It's fast, and has good hash properties without
+ * requiring any large static buffers.
+ */
+static uint64_t
+buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
+{
+ return (cityhash4(spa, dva->dva_word[0], dva->dva_word[1], birth));
+}
+
+#define HDR_EMPTY(hdr) \
+ ((hdr)->b_dva.dva_word[0] == 0 && \
+ (hdr)->b_dva.dva_word[1] == 0)
+
+#define HDR_EMPTY_OR_LOCKED(hdr) \
+ (HDR_EMPTY(hdr) || MUTEX_HELD(HDR_LOCK(hdr)))
+
+#define HDR_EQUAL(spa, dva, birth, hdr) \
+ ((hdr)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \
+ ((hdr)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \
+ ((hdr)->b_birth == birth) && ((hdr)->b_spa == spa)
+
+static void
+buf_discard_identity(arc_buf_hdr_t *hdr)
+{
+ hdr->b_dva.dva_word[0] = 0;
+ hdr->b_dva.dva_word[1] = 0;
+ hdr->b_birth = 0;
+}
+
+static arc_buf_hdr_t *
+buf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp)
+{
+ const dva_t *dva = BP_IDENTITY(bp);
+ uint64_t birth = BP_PHYSICAL_BIRTH(bp);
+ uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
+ kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
+ arc_buf_hdr_t *hdr;
+
+ mutex_enter(hash_lock);
+ for (hdr = buf_hash_table.ht_table[idx]; hdr != NULL;
+ hdr = hdr->b_hash_next) {
+ if (HDR_EQUAL(spa, dva, birth, hdr)) {
+ *lockp = hash_lock;
+ return (hdr);
+ }
+ }
+ mutex_exit(hash_lock);
+ *lockp = NULL;
+ return (NULL);
+}
+
+/*
+ * Insert an entry into the hash table. If there is already an element
+ * equal to elem in the hash table, then the already existing element
+ * will be returned and the new element will not be inserted.
+ * Otherwise returns NULL.
+ * If lockp == NULL, the caller is assumed to already hold the hash lock.
+ */
+static arc_buf_hdr_t *
+buf_hash_insert(arc_buf_hdr_t *hdr, kmutex_t **lockp)
+{
+ uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth);
+ kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
+ arc_buf_hdr_t *fhdr;
+ uint32_t i;
+
+ ASSERT(!DVA_IS_EMPTY(&hdr->b_dva));
+ ASSERT(hdr->b_birth != 0);
+ ASSERT(!HDR_IN_HASH_TABLE(hdr));
+
+ if (lockp != NULL) {
+ *lockp = hash_lock;
+ mutex_enter(hash_lock);
+ } else {
+ ASSERT(MUTEX_HELD(hash_lock));
+ }
+
+ for (fhdr = buf_hash_table.ht_table[idx], i = 0; fhdr != NULL;
+ fhdr = fhdr->b_hash_next, i++) {
+ if (HDR_EQUAL(hdr->b_spa, &hdr->b_dva, hdr->b_birth, fhdr))
+ return (fhdr);
+ }
+
+ hdr->b_hash_next = buf_hash_table.ht_table[idx];
+ buf_hash_table.ht_table[idx] = hdr;
+ arc_hdr_set_flags(hdr, ARC_FLAG_IN_HASH_TABLE);
+
+ /* collect some hash table performance data */
+ if (i > 0) {
+ ARCSTAT_BUMP(arcstat_hash_collisions);
+ if (i == 1)
+ ARCSTAT_BUMP(arcstat_hash_chains);
+
+ ARCSTAT_MAX(arcstat_hash_chain_max, i);
+ }
+
+ ARCSTAT_BUMP(arcstat_hash_elements);
+ ARCSTAT_MAXSTAT(arcstat_hash_elements);
+
+ return (NULL);
+}
+
+static void
+buf_hash_remove(arc_buf_hdr_t *hdr)
+{
+ arc_buf_hdr_t *fhdr, **hdrp;
+ uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth);
+
+ ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx)));
+ ASSERT(HDR_IN_HASH_TABLE(hdr));
+
+ hdrp = &buf_hash_table.ht_table[idx];
+ while ((fhdr = *hdrp) != hdr) {
+ ASSERT3P(fhdr, !=, NULL);
+ hdrp = &fhdr->b_hash_next;
+ }
+ *hdrp = hdr->b_hash_next;
+ hdr->b_hash_next = NULL;
+ arc_hdr_clear_flags(hdr, ARC_FLAG_IN_HASH_TABLE);
+
+ /* collect some hash table performance data */
+ ARCSTAT_BUMPDOWN(arcstat_hash_elements);
+
+ if (buf_hash_table.ht_table[idx] &&
+ buf_hash_table.ht_table[idx]->b_hash_next == NULL)
+ ARCSTAT_BUMPDOWN(arcstat_hash_chains);
+}
+
+/*
+ * Global data structures and functions for the buf kmem cache.
+ */
+
+static kmem_cache_t *hdr_full_cache;
+static kmem_cache_t *hdr_full_crypt_cache;
+static kmem_cache_t *hdr_l2only_cache;
+static kmem_cache_t *buf_cache;
+
+static void
+buf_fini(void)
+{
+ int i;
+
+#if defined(_KERNEL)
+ /*
+ * Large allocations which do not require contiguous pages
+ * should be using vmem_free() in the linux kernel.
+ */
+ vmem_free(buf_hash_table.ht_table,
+ (buf_hash_table.ht_mask + 1) * sizeof (void *));
+#else
+ kmem_free(buf_hash_table.ht_table,
+ (buf_hash_table.ht_mask + 1) * sizeof (void *));
+#endif
+ for (i = 0; i < BUF_LOCKS; i++)
+ mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
+ kmem_cache_destroy(hdr_full_cache);
+ kmem_cache_destroy(hdr_full_crypt_cache);
+ kmem_cache_destroy(hdr_l2only_cache);
+ kmem_cache_destroy(buf_cache);
+}
+
+/*
+ * Constructor callback - called when the cache is empty
+ * and a new buf is requested.
+ */
+/* ARGSUSED */
+static int
+hdr_full_cons(void *vbuf, void *unused, int kmflag)
+{
+ arc_buf_hdr_t *hdr = vbuf;
+
+ bzero(hdr, HDR_FULL_SIZE);
+ hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS;
+ cv_init(&hdr->b_l1hdr.b_cv, NULL, CV_DEFAULT, NULL);
+ zfs_refcount_create(&hdr->b_l1hdr.b_refcnt);
+ mutex_init(&hdr->b_l1hdr.b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
+ list_link_init(&hdr->b_l1hdr.b_arc_node);
+ list_link_init(&hdr->b_l2hdr.b_l2node);
+ multilist_link_init(&hdr->b_l1hdr.b_arc_node);
+ arc_space_consume(HDR_FULL_SIZE, ARC_SPACE_HDRS);
+
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+hdr_full_crypt_cons(void *vbuf, void *unused, int kmflag)
+{
+ arc_buf_hdr_t *hdr = vbuf;
+
+ hdr_full_cons(vbuf, unused, kmflag);
+ bzero(&hdr->b_crypt_hdr, sizeof (hdr->b_crypt_hdr));
+ arc_space_consume(sizeof (hdr->b_crypt_hdr), ARC_SPACE_HDRS);
+
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+hdr_l2only_cons(void *vbuf, void *unused, int kmflag)
+{
+ arc_buf_hdr_t *hdr = vbuf;
+
+ bzero(hdr, HDR_L2ONLY_SIZE);
+ arc_space_consume(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS);
+
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+buf_cons(void *vbuf, void *unused, int kmflag)
+{
+ arc_buf_t *buf = vbuf;
+
+ bzero(buf, sizeof (arc_buf_t));
+ mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL);
+ arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS);
+
+ return (0);
+}
+
+/*
+ * Destructor callback - called when a cached buf is
+ * no longer required.
+ */
+/* ARGSUSED */
+static void
+hdr_full_dest(void *vbuf, void *unused)
+{
+ arc_buf_hdr_t *hdr = vbuf;
+
+ ASSERT(HDR_EMPTY(hdr));
+ cv_destroy(&hdr->b_l1hdr.b_cv);
+ zfs_refcount_destroy(&hdr->b_l1hdr.b_refcnt);
+ mutex_destroy(&hdr->b_l1hdr.b_freeze_lock);
+ ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
+ arc_space_return(HDR_FULL_SIZE, ARC_SPACE_HDRS);
+}
+
+/* ARGSUSED */
+static void
+hdr_full_crypt_dest(void *vbuf, void *unused)
+{
+ arc_buf_hdr_t *hdr = vbuf;
+
+ hdr_full_dest(vbuf, unused);
+ arc_space_return(sizeof (hdr->b_crypt_hdr), ARC_SPACE_HDRS);
+}
+
+/* ARGSUSED */
+static void
+hdr_l2only_dest(void *vbuf, void *unused)
+{
+ arc_buf_hdr_t *hdr __maybe_unused = vbuf;
+
+ ASSERT(HDR_EMPTY(hdr));
+ arc_space_return(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS);
+}
+
+/* ARGSUSED */
+static void
+buf_dest(void *vbuf, void *unused)
+{
+ arc_buf_t *buf = vbuf;
+
+ mutex_destroy(&buf->b_evict_lock);
+ arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS);
+}
+
+static void
+buf_init(void)
+{
+ uint64_t *ct = NULL;
+ uint64_t hsize = 1ULL << 12;
+ int i, j;
+
+ /*
+ * The hash table is big enough to fill all of physical memory
+ * with an average block size of zfs_arc_average_blocksize (default 8K).
+ * By default, the table will take up
+ * totalmem * sizeof(void*) / 8K (1MB per GB with 8-byte pointers).
+ */
+ while (hsize * zfs_arc_average_blocksize < arc_all_memory())
+ hsize <<= 1;
+retry:
+ buf_hash_table.ht_mask = hsize - 1;
+#if defined(_KERNEL)
+ /*
+ * Large allocations which do not require contiguous pages
+ * should be using vmem_alloc() in the linux kernel
+ */
+ buf_hash_table.ht_table =
+ vmem_zalloc(hsize * sizeof (void*), KM_SLEEP);
+#else
+ buf_hash_table.ht_table =
+ kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP);
+#endif
+ if (buf_hash_table.ht_table == NULL) {
+ ASSERT(hsize > (1ULL << 8));
+ hsize >>= 1;
+ goto retry;
+ }
+
+ hdr_full_cache = kmem_cache_create("arc_buf_hdr_t_full", HDR_FULL_SIZE,
+ 0, hdr_full_cons, hdr_full_dest, NULL, NULL, NULL, 0);
+ hdr_full_crypt_cache = kmem_cache_create("arc_buf_hdr_t_full_crypt",
+ HDR_FULL_CRYPT_SIZE, 0, hdr_full_crypt_cons, hdr_full_crypt_dest,
+ NULL, NULL, NULL, 0);
+ hdr_l2only_cache = kmem_cache_create("arc_buf_hdr_t_l2only",
+ HDR_L2ONLY_SIZE, 0, hdr_l2only_cons, hdr_l2only_dest, NULL,
+ NULL, NULL, 0);
+ buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
+ 0, buf_cons, buf_dest, NULL, NULL, NULL, 0);
+
+ for (i = 0; i < 256; i++)
+ for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
+ *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
+
+ for (i = 0; i < BUF_LOCKS; i++) {
+ mutex_init(&buf_hash_table.ht_locks[i].ht_lock,
+ NULL, MUTEX_DEFAULT, NULL);
+ }
+}
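A worked example of the sizing loop in buf_init() above: with 64 GB of physical memory and the default 8 KB zfs_arc_average_blocksize, hsize doubles from 2^12 until hsize * 8 KB covers 64 GB, i.e. 2^23 buckets; at 8 bytes per pointer the table occupies 64 MB, matching the "1MB per GB" note in the comment, and the BUF_LOCKS (8192) hash locks are striped across those buckets.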
+
+#define ARC_MINTIME (hz>>4) /* 62 ms */
+
+/*
+ * This is the size that the buf occupies in memory. If the buf is compressed,
+ * it will correspond to the compressed size. You should use this method of
+ * getting the buf size unless you explicitly need the logical size.
+ */
+uint64_t
+arc_buf_size(arc_buf_t *buf)
+{
+ return (ARC_BUF_COMPRESSED(buf) ?
+ HDR_GET_PSIZE(buf->b_hdr) : HDR_GET_LSIZE(buf->b_hdr));
+}
+
+uint64_t
+arc_buf_lsize(arc_buf_t *buf)
+{
+ return (HDR_GET_LSIZE(buf->b_hdr));
+}
+
+/*
+ * This function will return B_TRUE if the buffer is encrypted in memory.
+ * This buffer can be decrypted by calling arc_untransform().
+ */
+boolean_t
+arc_is_encrypted(arc_buf_t *buf)
+{
+ return (ARC_BUF_ENCRYPTED(buf) != 0);
+}
+
+/*
+ * Returns B_TRUE if the buffer represents data that has not had its MAC
+ * verified yet.
+ */
+boolean_t
+arc_is_unauthenticated(arc_buf_t *buf)
+{
+ return (HDR_NOAUTH(buf->b_hdr) != 0);
+}
+
+void
+arc_get_raw_params(arc_buf_t *buf, boolean_t *byteorder, uint8_t *salt,
+ uint8_t *iv, uint8_t *mac)
+{
+ arc_buf_hdr_t *hdr = buf->b_hdr;
+
+ ASSERT(HDR_PROTECTED(hdr));
+
+ bcopy(hdr->b_crypt_hdr.b_salt, salt, ZIO_DATA_SALT_LEN);
+ bcopy(hdr->b_crypt_hdr.b_iv, iv, ZIO_DATA_IV_LEN);
+ bcopy(hdr->b_crypt_hdr.b_mac, mac, ZIO_DATA_MAC_LEN);
+ *byteorder = (hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS) ?
+ ZFS_HOST_BYTEORDER : !ZFS_HOST_BYTEORDER;
+}
+
+/*
+ * Indicates how this buffer is compressed in memory. If it is not compressed
+ * the value will be ZIO_COMPRESS_OFF. It can be made normally readable with
+ * arc_untransform() as long as it is also unencrypted.
+ */
+enum zio_compress
+arc_get_compression(arc_buf_t *buf)
+{
+ return (ARC_BUF_COMPRESSED(buf) ?
+ HDR_GET_COMPRESS(buf->b_hdr) : ZIO_COMPRESS_OFF);
+}
+
+/*
+ * Return the compression algorithm used to store this data in the ARC. If ARC
+ * compression is enabled or this is an encrypted block, this will be the same
+ * as what's used to store it on-disk. Otherwise, this will be ZIO_COMPRESS_OFF.
+ */
+static inline enum zio_compress
+arc_hdr_get_compress(arc_buf_hdr_t *hdr)
+{
+ return (HDR_COMPRESSION_ENABLED(hdr) ?
+ HDR_GET_COMPRESS(hdr) : ZIO_COMPRESS_OFF);
+}
+
+uint8_t
+arc_get_complevel(arc_buf_t *buf)
+{
+ return (buf->b_hdr->b_complevel);
+}
+
+static inline boolean_t
+arc_buf_is_shared(arc_buf_t *buf)
+{
+ boolean_t shared = (buf->b_data != NULL &&
+ buf->b_hdr->b_l1hdr.b_pabd != NULL &&
+ abd_is_linear(buf->b_hdr->b_l1hdr.b_pabd) &&
+ buf->b_data == abd_to_buf(buf->b_hdr->b_l1hdr.b_pabd));
+ IMPLY(shared, HDR_SHARED_DATA(buf->b_hdr));
+ IMPLY(shared, ARC_BUF_SHARED(buf));
+ IMPLY(shared, ARC_BUF_COMPRESSED(buf) || ARC_BUF_LAST(buf));
+
+ /*
+ * It would be nice to assert arc_can_share() too, but the "hdr isn't
+ * already being shared" requirement prevents us from doing that.
+ */
+
+ return (shared);
+}
+
+/*
+ * Free the checksum associated with this header. If there is no checksum, this
+ * is a no-op.
+ */
+static inline void
+arc_cksum_free(arc_buf_hdr_t *hdr)
+{
+ ASSERT(HDR_HAS_L1HDR(hdr));
+
+ mutex_enter(&hdr->b_l1hdr.b_freeze_lock);
+ if (hdr->b_l1hdr.b_freeze_cksum != NULL) {
+ kmem_free(hdr->b_l1hdr.b_freeze_cksum, sizeof (zio_cksum_t));
+ hdr->b_l1hdr.b_freeze_cksum = NULL;
+ }
+ mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
+}
+
+/*
+ * Return true iff at least one of the bufs on hdr is not compressed.
+ * Encrypted buffers count as compressed.
+ */
+static boolean_t
+arc_hdr_has_uncompressed_buf(arc_buf_hdr_t *hdr)
+{
+ ASSERT(hdr->b_l1hdr.b_state == arc_anon || HDR_EMPTY_OR_LOCKED(hdr));
+
+ for (arc_buf_t *b = hdr->b_l1hdr.b_buf; b != NULL; b = b->b_next) {
+ if (!ARC_BUF_COMPRESSED(b)) {
+ return (B_TRUE);
+ }
+ }
+ return (B_FALSE);
+}
+
+/*
+ * If we've turned on the ZFS_DEBUG_MODIFY flag, verify that the buf's data
+ * matches the checksum that is stored in the hdr. If there is no checksum,
+ * or if the buf is compressed, this is a no-op.
+ */
+static void
+arc_cksum_verify(arc_buf_t *buf)
+{
+ arc_buf_hdr_t *hdr = buf->b_hdr;
+ zio_cksum_t zc;
+
+ if (!(zfs_flags & ZFS_DEBUG_MODIFY))
+ return;
+
+ if (ARC_BUF_COMPRESSED(buf))
+ return;
+
+ ASSERT(HDR_HAS_L1HDR(hdr));
+
+ mutex_enter(&hdr->b_l1hdr.b_freeze_lock);
+
+ if (hdr->b_l1hdr.b_freeze_cksum == NULL || HDR_IO_ERROR(hdr)) {
+ mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
+ return;
+ }
+
+ fletcher_2_native(buf->b_data, arc_buf_size(buf), NULL, &zc);
+ if (!ZIO_CHECKSUM_EQUAL(*hdr->b_l1hdr.b_freeze_cksum, zc))
+ panic("buffer modified while frozen!");
+ mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
+}
+
+/*
+ * This function makes the assumption that data stored in the L2ARC
+ * will be transformed exactly as it is in the main pool. Because of
+ * this we can verify the checksum against the reading process's bp.
+ */
+static boolean_t
+arc_cksum_is_equal(arc_buf_hdr_t *hdr, zio_t *zio)
+{
+ ASSERT(!BP_IS_EMBEDDED(zio->io_bp));
+ VERIFY3U(BP_GET_PSIZE(zio->io_bp), ==, HDR_GET_PSIZE(hdr));
+
+ /*
+ * Block pointers always store the checksum for the logical data.
+ * If the block pointer has the gang bit set, then the checksum
+ * it represents is for the reconstituted data and not for an
+ * individual gang member. The zio pipeline, however, must be able to
+ * determine the checksum of each of the gang constituents so it
+ * treats the checksum comparison differently than what we need
+ * for l2arc blocks. This prevents us from using the
+ * zio_checksum_error() interface directly. Instead we must call
+ * zio_checksum_error_impl() so that we can ensure the checksum is
+ * generated using the correct checksum algorithm and accounts for the
+ * logical I/O size and not just a gang fragment.
+ */
+ return (zio_checksum_error_impl(zio->io_spa, zio->io_bp,
+ BP_GET_CHECKSUM(zio->io_bp), zio->io_abd, zio->io_size,
+ zio->io_offset, NULL) == 0);
+}
+
+/*
+ * Given a buf full of data, if ZFS_DEBUG_MODIFY is enabled this computes a
+ * checksum and attaches it to the buf's hdr so that we can ensure that the buf
+ * isn't modified later on. If buf is compressed or there is already a checksum
+ * on the hdr, this is a no-op (we only checksum uncompressed bufs).
+ */
+static void
+arc_cksum_compute(arc_buf_t *buf)
+{
+ arc_buf_hdr_t *hdr = buf->b_hdr;
+
+ if (!(zfs_flags & ZFS_DEBUG_MODIFY))
+ return;
+
+ ASSERT(HDR_HAS_L1HDR(hdr));
+
+ mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock);
+ if (hdr->b_l1hdr.b_freeze_cksum != NULL || ARC_BUF_COMPRESSED(buf)) {
+ mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
+ return;
+ }
+
+ ASSERT(!ARC_BUF_ENCRYPTED(buf));
+ ASSERT(!ARC_BUF_COMPRESSED(buf));
+ hdr->b_l1hdr.b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t),
+ KM_SLEEP);
+ fletcher_2_native(buf->b_data, arc_buf_size(buf), NULL,
+ hdr->b_l1hdr.b_freeze_cksum);
+ mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
+ arc_buf_watch(buf);
+}
+
+#ifndef _KERNEL
+void
+arc_buf_sigsegv(int sig, siginfo_t *si, void *unused)
+{
+ panic("Got SIGSEGV at address: 0x%lx\n", (long)si->si_addr);
+}
+#endif
+
+/* ARGSUSED */
+static void
+arc_buf_unwatch(arc_buf_t *buf)
+{
+#ifndef _KERNEL
+ if (arc_watch) {
+ ASSERT0(mprotect(buf->b_data, arc_buf_size(buf),
+ PROT_READ | PROT_WRITE));
+ }
+#endif
+}
+
+/* ARGSUSED */
+static void
+arc_buf_watch(arc_buf_t *buf)
+{
+#ifndef _KERNEL
+ if (arc_watch)
+ ASSERT0(mprotect(buf->b_data, arc_buf_size(buf),
+ PROT_READ));
+#endif
+}
+
+static arc_buf_contents_t
+arc_buf_type(arc_buf_hdr_t *hdr)
+{
+ arc_buf_contents_t type;
+ if (HDR_ISTYPE_METADATA(hdr)) {
+ type = ARC_BUFC_METADATA;
+ } else {
+ type = ARC_BUFC_DATA;
+ }
+ VERIFY3U(hdr->b_type, ==, type);
+ return (type);
+}
+
+boolean_t
+arc_is_metadata(arc_buf_t *buf)
+{
+ return (HDR_ISTYPE_METADATA(buf->b_hdr) != 0);
+}
+
+static uint32_t
+arc_bufc_to_flags(arc_buf_contents_t type)
+{
+ switch (type) {
+ case ARC_BUFC_DATA:
+ /* metadata field is 0 if buffer contains normal data */
+ return (0);
+ case ARC_BUFC_METADATA:
+ return (ARC_FLAG_BUFC_METADATA);
+ default:
+ break;
+ }
+ panic("undefined ARC buffer type!");
+ return ((uint32_t)-1);
+}
+
+void
+arc_buf_thaw(arc_buf_t *buf)
+{
+ arc_buf_hdr_t *hdr = buf->b_hdr;
+
+ ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
+ ASSERT(!HDR_IO_IN_PROGRESS(hdr));
+
+ arc_cksum_verify(buf);
+
+ /*
+ * Compressed buffers do not manipulate the b_freeze_cksum.
+ */
+ if (ARC_BUF_COMPRESSED(buf))
+ return;
+
+ ASSERT(HDR_HAS_L1HDR(hdr));
+ arc_cksum_free(hdr);
+ arc_buf_unwatch(buf);
+}
+
+void
+arc_buf_freeze(arc_buf_t *buf)
+{
+ if (!(zfs_flags & ZFS_DEBUG_MODIFY))
+ return;
+
+ if (ARC_BUF_COMPRESSED(buf))
+ return;
+
+ ASSERT(HDR_HAS_L1HDR(buf->b_hdr));
+ arc_cksum_compute(buf);
+}
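+
+/*
+ * Example (a minimal sketch): the thaw/freeze protocol as a consumer might
+ * use it around modifying an anonymous buffer it has already handed to the
+ * ARC. The modify_fn callback is illustrative only.
+ */
+#if 0
+static void
+example_modify_anon_buf(arc_buf_t *buf, void (*modify_fn)(void *, uint64_t))
+{
+ /* verify and drop the frozen checksum so the change is legal */
+ arc_buf_thaw(buf);
+
+ modify_fn(buf->b_data, arc_buf_size(buf));
+
+ /* recompute the ZFS_DEBUG_MODIFY checksum over the new contents */
+ arc_buf_freeze(buf);
+}
+#endif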
+
+/*
+ * The arc_buf_hdr_t's b_flags should never be modified directly. Instead,
+ * the following functions should be used to ensure that the flags are
+ * updated in a thread-safe way. When manipulating the flags either
+ * the hash_lock must be held or the hdr must be undiscoverable. This
+ * ensures that we're not racing with any other threads when updating
+ * the flags.
+ */
+static inline void
+arc_hdr_set_flags(arc_buf_hdr_t *hdr, arc_flags_t flags)
+{
+ ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
+ hdr->b_flags |= flags;
+}
+
+static inline void
+arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags)
+{
+ ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
+ hdr->b_flags &= ~flags;
+}
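+
+/*
+ * Example (a minimal sketch): callers of these (file-local) helpers either
+ * hold the header's hash lock, as below, or operate on a header that is
+ * still undiscoverable. The choice of ARC_FLAG_L2CACHE is illustrative only.
+ */
+#if 0
+static void
+example_mark_l2cacheable(arc_buf_hdr_t *hdr)
+{
+ kmutex_t *hash_lock = HDR_LOCK(hdr);
+
+ mutex_enter(hash_lock);
+ arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE);
+ mutex_exit(hash_lock);
+}
+#endif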
+
+/*
+ * Setting the compression bits in the arc_buf_hdr_t's b_flags is
+ * done in a special way since we have to clear and set bits
+ * at the same time. Consumers that wish to set the compression bits
+ * must use this function to ensure that the flags are updated in
+ * thread-safe manner.
+ */
+static void
+arc_hdr_set_compress(arc_buf_hdr_t *hdr, enum zio_compress cmp)
+{
+ ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
+
+ /*
+ * Holes and embedded blocks will always have a psize = 0, so
+ * we ignore the compression of the blkptr and mark them as
+ * uncompressed.
+ */
+ if (!zfs_compressed_arc_enabled || HDR_GET_PSIZE(hdr) == 0) {
+ arc_hdr_clear_flags(hdr, ARC_FLAG_COMPRESSED_ARC);
+ ASSERT(!HDR_COMPRESSION_ENABLED(hdr));
+ } else {
+ arc_hdr_set_flags(hdr, ARC_FLAG_COMPRESSED_ARC);
+ ASSERT(HDR_COMPRESSION_ENABLED(hdr));
+ }
+
+ HDR_SET_COMPRESS(hdr, cmp);
+ ASSERT3U(HDR_GET_COMPRESS(hdr), ==, cmp);
+}
+
+/*
+ * Looks for another buf on the same hdr which has the data decompressed, copies
+ * from it, and returns true. If no such buf exists, returns false.
+ */
+static boolean_t
+arc_buf_try_copy_decompressed_data(arc_buf_t *buf)
+{
+ arc_buf_hdr_t *hdr = buf->b_hdr;
+ boolean_t copied = B_FALSE;
+
+ ASSERT(HDR_HAS_L1HDR(hdr));
+ ASSERT3P(buf->b_data, !=, NULL);
+ ASSERT(!ARC_BUF_COMPRESSED(buf));
+
+ for (arc_buf_t *from = hdr->b_l1hdr.b_buf; from != NULL;
+ from = from->b_next) {
+ /* can't use our own data buffer */
+ if (from == buf) {
+ continue;
+ }
+
+ if (!ARC_BUF_COMPRESSED(from)) {
+ bcopy(from->b_data, buf->b_data, arc_buf_size(buf));
+ copied = B_TRUE;
+ break;
+ }
+ }
+
+ /*
+ * There were no decompressed bufs, so there should not be a
+ * checksum on the hdr either.
+ */
+ if (zfs_flags & ZFS_DEBUG_MODIFY)
+ EQUIV(!copied, hdr->b_l1hdr.b_freeze_cksum == NULL);
+
+ return (copied);
+}
+
+/*
+ * Allocates an ARC buf header that's in an evicted & L2-cached state.
+ * This is used during l2arc reconstruction to make empty ARC buffers
+ * which circumvent the regular disk->arc->l2arc path and instead come
+ * into being in the reverse order, i.e. l2arc->arc.
+ */
+static arc_buf_hdr_t *
+arc_buf_alloc_l2only(size_t size, arc_buf_contents_t type, l2arc_dev_t *dev,
+ dva_t dva, uint64_t daddr, int32_t psize, uint64_t birth,
+ enum zio_compress compress, uint8_t complevel, boolean_t protected,
+ boolean_t prefetch, arc_state_type_t arcs_state)
+{
+ arc_buf_hdr_t *hdr;
+
+ ASSERT(size != 0);
+ hdr = kmem_cache_alloc(hdr_l2only_cache, KM_SLEEP);
+ hdr->b_birth = birth;
+ hdr->b_type = type;
+ hdr->b_flags = 0;
+ arc_hdr_set_flags(hdr, arc_bufc_to_flags(type) | ARC_FLAG_HAS_L2HDR);
+ HDR_SET_LSIZE(hdr, size);
+ HDR_SET_PSIZE(hdr, psize);
+ arc_hdr_set_compress(hdr, compress);
+ hdr->b_complevel = complevel;
+ if (protected)
+ arc_hdr_set_flags(hdr, ARC_FLAG_PROTECTED);
+ if (prefetch)
+ arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH);
+ hdr->b_spa = spa_load_guid(dev->l2ad_vdev->vdev_spa);
+
+ hdr->b_dva = dva;
+
+ hdr->b_l2hdr.b_dev = dev;
+ hdr->b_l2hdr.b_daddr = daddr;
+ hdr->b_l2hdr.b_arcs_state = arcs_state;
+
+ return (hdr);
+}
+
+/*
+ * Return the size of the block, b_pabd, that is stored in the arc_buf_hdr_t.
+ */
+static uint64_t
+arc_hdr_size(arc_buf_hdr_t *hdr)
+{
+ uint64_t size;
+
+ if (arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF &&
+ HDR_GET_PSIZE(hdr) > 0) {
+ size = HDR_GET_PSIZE(hdr);
+ } else {
+ ASSERT3U(HDR_GET_LSIZE(hdr), !=, 0);
+ size = HDR_GET_LSIZE(hdr);
+ }
+ return (size);
+}
+
+static int
+arc_hdr_authenticate(arc_buf_hdr_t *hdr, spa_t *spa, uint64_t dsobj)
+{
+ int ret;
+ uint64_t csize;
+ uint64_t lsize = HDR_GET_LSIZE(hdr);
+ uint64_t psize = HDR_GET_PSIZE(hdr);
+ void *tmpbuf = NULL;
+ abd_t *abd = hdr->b_l1hdr.b_pabd;
+
+ ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
+ ASSERT(HDR_AUTHENTICATED(hdr));
+ ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
+
+ /*
+ * The MAC is calculated on the compressed data that is stored on disk.
+ * However, if compressed arc is disabled we will only have the
+ * decompressed data available to us now. Compress it into a temporary
+ * abd so we can verify the MAC. The performance overhead of this will
+ * be relatively low, since most objects in an encrypted objset will
+ * be encrypted (instead of authenticated) anyway.
+ */
+ if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF &&
+ !HDR_COMPRESSION_ENABLED(hdr)) {
+ tmpbuf = zio_buf_alloc(lsize);
+ abd = abd_get_from_buf(tmpbuf, lsize);
+ abd_take_ownership_of_buf(abd, B_TRUE);
+ csize = zio_compress_data(HDR_GET_COMPRESS(hdr),
+ hdr->b_l1hdr.b_pabd, tmpbuf, lsize, hdr->b_complevel);
+ ASSERT3U(csize, <=, psize);
+ abd_zero_off(abd, csize, psize - csize);
+ }
+
+ /*
+ * Authentication is best effort. We authenticate whenever the key is
+ * available. If we succeed we clear ARC_FLAG_NOAUTH.
+ */
+ if (hdr->b_crypt_hdr.b_ot == DMU_OT_OBJSET) {
+ ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF);
+ ASSERT3U(lsize, ==, psize);
+ ret = spa_do_crypt_objset_mac_abd(B_FALSE, spa, dsobj, abd,
+ psize, hdr->b_l1hdr.b_byteswap != DMU_BSWAP_NUMFUNCS);
+ } else {
+ ret = spa_do_crypt_mac_abd(B_FALSE, spa, dsobj, abd, psize,
+ hdr->b_crypt_hdr.b_mac);
+ }
+
+ if (ret == 0)
+ arc_hdr_clear_flags(hdr, ARC_FLAG_NOAUTH);
+ else if (ret != ENOENT)
+ goto error;
+
+ if (tmpbuf != NULL)
+ abd_free(abd);
+
+ return (0);
+
+error:
+ if (tmpbuf != NULL)
+ abd_free(abd);
+
+ return (ret);
+}
+
+/*
+ * This function will take a header that only has raw encrypted data in
+ * b_crypt_hdr.b_rabd and decrypt it into a new buffer which is stored in
+ * b_l1hdr.b_pabd. If designated in the header flags, this function will
+ * also decompress the data.
+ */
+static int
+arc_hdr_decrypt(arc_buf_hdr_t *hdr, spa_t *spa, const zbookmark_phys_t *zb)
+{
+ int ret;
+ abd_t *cabd = NULL;
+ void *tmp = NULL;
+ boolean_t no_crypt = B_FALSE;
+ boolean_t bswap = (hdr->b_l1hdr.b_byteswap != DMU_BSWAP_NUMFUNCS);
+
+ ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
+ ASSERT(HDR_ENCRYPTED(hdr));
+
+ arc_hdr_alloc_abd(hdr, ARC_HDR_DO_ADAPT);
+
+ ret = spa_do_crypt_abd(B_FALSE, spa, zb, hdr->b_crypt_hdr.b_ot,
+ B_FALSE, bswap, hdr->b_crypt_hdr.b_salt, hdr->b_crypt_hdr.b_iv,
+ hdr->b_crypt_hdr.b_mac, HDR_GET_PSIZE(hdr), hdr->b_l1hdr.b_pabd,
+ hdr->b_crypt_hdr.b_rabd, &no_crypt);
+ if (ret != 0)
+ goto error;
+
+ if (no_crypt) {
+ abd_copy(hdr->b_l1hdr.b_pabd, hdr->b_crypt_hdr.b_rabd,
+ HDR_GET_PSIZE(hdr));
+ }
+
+ /*
+ * If this header has disabled arc compression but the b_pabd is
+ * compressed after decrypting it, we need to decompress the newly
+ * decrypted data.
+ */
+ if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF &&
+ !HDR_COMPRESSION_ENABLED(hdr)) {
+ /*
+ * We want to make sure that we are correctly honoring the
+ * zfs_abd_scatter_enabled setting, so we allocate an abd here
+ * and then loan a buffer from it, rather than allocating a
+ * linear buffer and wrapping it in an abd later.
+ */
+ cabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr, B_TRUE);
+ tmp = abd_borrow_buf(cabd, arc_hdr_size(hdr));
+
+ ret = zio_decompress_data(HDR_GET_COMPRESS(hdr),
+ hdr->b_l1hdr.b_pabd, tmp, HDR_GET_PSIZE(hdr),
+ HDR_GET_LSIZE(hdr), &hdr->b_complevel);
+ if (ret != 0) {
+ abd_return_buf(cabd, tmp, arc_hdr_size(hdr));
+ goto error;
+ }
+
+ abd_return_buf_copy(cabd, tmp, arc_hdr_size(hdr));
+ arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd,
+ arc_hdr_size(hdr), hdr);
+ hdr->b_l1hdr.b_pabd = cabd;
+ }
+
+ return (0);
+
+error:
+ arc_hdr_free_abd(hdr, B_FALSE);
+ if (cabd != NULL)
+ arc_free_data_buf(hdr, cabd, arc_hdr_size(hdr), hdr);
+
+ return (ret);
+}
+
+/*
+ * This function is called during arc_buf_fill() to prepare the header's
+ * abd plaintext pointer for use. This involves authenticating protected
+ * data and decrypting encrypted data into the plaintext abd.
+ */
+static int
+arc_fill_hdr_crypt(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, spa_t *spa,
+ const zbookmark_phys_t *zb, boolean_t noauth)
+{
+ int ret;
+
+ ASSERT(HDR_PROTECTED(hdr));
+
+ if (hash_lock != NULL)
+ mutex_enter(hash_lock);
+
+ if (HDR_NOAUTH(hdr) && !noauth) {
+ /*
+ * The caller requested authenticated data but our data has
+ * not been authenticated yet. Verify the MAC now if we can.
+ */
+ ret = arc_hdr_authenticate(hdr, spa, zb->zb_objset);
+ if (ret != 0)
+ goto error;
+ } else if (HDR_HAS_RABD(hdr) && hdr->b_l1hdr.b_pabd == NULL) {
+ /*
+ * If we only have the encrypted version of the data, but the
+ * unencrypted version was requested we take this opportunity
+ * to store the decrypted version in the header for future use.
+ */
+ ret = arc_hdr_decrypt(hdr, spa, zb);
+ if (ret != 0)
+ goto error;
+ }
+
+ ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
+
+ if (hash_lock != NULL)
+ mutex_exit(hash_lock);
+
+ return (0);
+
+error:
+ if (hash_lock != NULL)
+ mutex_exit(hash_lock);
+
+ return (ret);
+}
+
+/*
+ * This function is used by the dbuf code to decrypt bonus buffers in place.
+ * The dbuf code itself doesn't have any locking for decrypting a shared dnode
+ * block, so we use the hash lock here to protect against concurrent calls to
+ * arc_buf_fill().
+ */
+static void
+arc_buf_untransform_in_place(arc_buf_t *buf, kmutex_t *hash_lock)
+{
+ arc_buf_hdr_t *hdr = buf->b_hdr;
+
+ ASSERT(HDR_ENCRYPTED(hdr));
+ ASSERT3U(hdr->b_crypt_hdr.b_ot, ==, DMU_OT_DNODE);
+ ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
+ ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
+
+ zio_crypt_copy_dnode_bonus(hdr->b_l1hdr.b_pabd, buf->b_data,
+ arc_buf_size(buf));
+ buf->b_flags &= ~ARC_BUF_FLAG_ENCRYPTED;
+ buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED;
+ hdr->b_crypt_hdr.b_ebufcnt -= 1;
+}
+
+/*
+ * Given a buf that has a data buffer attached to it, this function will
+ * efficiently fill the buf with data of the specified compression setting from
+ * the hdr and update the hdr's b_freeze_cksum if necessary. If the buf and hdr
+ * are already sharing a data buf, no copy is performed.
+ *
+ * If the buf is marked as compressed but uncompressed data was requested, this
+ * will allocate a new data buffer for the buf, remove that flag, and fill the
+ * buf with uncompressed data. You can't request a compressed buf on a hdr with
+ * uncompressed data, and (since we haven't added support for it yet) if you
+ * want compressed data your buf must already be marked as compressed and have
+ * the correct-sized data buffer.
+ */
+static int
+arc_buf_fill(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb,
+ arc_fill_flags_t flags)
+{
+ int error = 0;
+ arc_buf_hdr_t *hdr = buf->b_hdr;
+ boolean_t hdr_compressed =
+ (arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF);
+ boolean_t compressed = (flags & ARC_FILL_COMPRESSED) != 0;
+ boolean_t encrypted = (flags & ARC_FILL_ENCRYPTED) != 0;
+ dmu_object_byteswap_t bswap = hdr->b_l1hdr.b_byteswap;
+ kmutex_t *hash_lock = (flags & ARC_FILL_LOCKED) ? NULL : HDR_LOCK(hdr);
+
+ ASSERT3P(buf->b_data, !=, NULL);
+ IMPLY(compressed, hdr_compressed || ARC_BUF_ENCRYPTED(buf));
+ IMPLY(compressed, ARC_BUF_COMPRESSED(buf));
+ IMPLY(encrypted, HDR_ENCRYPTED(hdr));
+ IMPLY(encrypted, ARC_BUF_ENCRYPTED(buf));
+ IMPLY(encrypted, ARC_BUF_COMPRESSED(buf));
+ IMPLY(encrypted, !ARC_BUF_SHARED(buf));
+
+ /*
+ * If the caller wanted encrypted data we just need to copy it from
+ * b_rabd and potentially byteswap it. We won't be able to do any
+ * further transforms on it.
+ */
+ if (encrypted) {
+ ASSERT(HDR_HAS_RABD(hdr));
+ abd_copy_to_buf(buf->b_data, hdr->b_crypt_hdr.b_rabd,
+ HDR_GET_PSIZE(hdr));
+ goto byteswap;
+ }
+
+ /*
+ * Adjust encrypted and authenticated headers to accommodate
+ * the request if needed. Dnode blocks (ARC_FILL_IN_PLACE) are
+ * allowed to fail decryption due to keys not being loaded
+ * without being marked as an IO error.
+ */
+ if (HDR_PROTECTED(hdr)) {
+ error = arc_fill_hdr_crypt(hdr, hash_lock, spa,
+ zb, !!(flags & ARC_FILL_NOAUTH));
+ if (error == EACCES && (flags & ARC_FILL_IN_PLACE) != 0) {
+ return (error);
+ } else if (error != 0) {
+ if (hash_lock != NULL)
+ mutex_enter(hash_lock);
+ arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR);
+ if (hash_lock != NULL)
+ mutex_exit(hash_lock);
+ return (error);
+ }
+ }
+
+ /*
+ * There is a special case here for dnode blocks which are
+ * decrypting their bonus buffers. These blocks may request to
+ * be decrypted in-place. This is necessary because there may
+ * be many dnodes pointing into this buffer and there is
+ * currently no method to synchronize replacing the backing
+ * b_data buffer and updating all of the pointers. Here we use
+ * the hash lock to ensure there are no races. If the need
+ * arises for other types to be decrypted in-place, they must
+ * add handling here as well.
+ */
+ if ((flags & ARC_FILL_IN_PLACE) != 0) {
+ ASSERT(!hdr_compressed);
+ ASSERT(!compressed);
+ ASSERT(!encrypted);
+
+ if (HDR_ENCRYPTED(hdr) && ARC_BUF_ENCRYPTED(buf)) {
+ ASSERT3U(hdr->b_crypt_hdr.b_ot, ==, DMU_OT_DNODE);
+
+ if (hash_lock != NULL)
+ mutex_enter(hash_lock);
+ arc_buf_untransform_in_place(buf, hash_lock);
+ if (hash_lock != NULL)
+ mutex_exit(hash_lock);
+
+ /* Compute the hdr's checksum if necessary */
+ arc_cksum_compute(buf);
+ }
+
+ return (0);
+ }
+
+ if (hdr_compressed == compressed) {
+ if (!arc_buf_is_shared(buf)) {
+ abd_copy_to_buf(buf->b_data, hdr->b_l1hdr.b_pabd,
+ arc_buf_size(buf));
+ }
+ } else {
+ ASSERT(hdr_compressed);
+ ASSERT(!compressed);
+ ASSERT3U(HDR_GET_LSIZE(hdr), !=, HDR_GET_PSIZE(hdr));
+
+ /*
+ * If the buf is sharing its data with the hdr, unlink it and
+ * allocate a new data buffer for the buf.
+ */
+ if (arc_buf_is_shared(buf)) {
+ ASSERT(ARC_BUF_COMPRESSED(buf));
+
+ /* We need to give the buf its own b_data */
+ buf->b_flags &= ~ARC_BUF_FLAG_SHARED;
+ buf->b_data =
+ arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf);
+ arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA);
+
+ /* Previously overhead was 0; just add new overhead */
+ ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr));
+ } else if (ARC_BUF_COMPRESSED(buf)) {
+ /* We need to reallocate the buf's b_data */
+ arc_free_data_buf(hdr, buf->b_data, HDR_GET_PSIZE(hdr),
+ buf);
+ buf->b_data =
+ arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf);
+
+ /* We increased the size of b_data; update overhead */
+ ARCSTAT_INCR(arcstat_overhead_size,
+ HDR_GET_LSIZE(hdr) - HDR_GET_PSIZE(hdr));
+ }
+
+ /*
+ * Regardless of the buf's previous compression settings, it
+ * should not be compressed at the end of this function.
+ */
+ buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED;
+
+ /*
+ * Try copying the data from another buf which already has a
+ * decompressed version. If that's not possible, it's time to
+ * bite the bullet and decompress the data from the hdr.
+ */
+ if (arc_buf_try_copy_decompressed_data(buf)) {
+ /* Skip byteswapping and checksumming (already done) */
+ return (0);
+ } else {
+ error = zio_decompress_data(HDR_GET_COMPRESS(hdr),
+ hdr->b_l1hdr.b_pabd, buf->b_data,
+ HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr),
+ &hdr->b_complevel);
+
+ /*
+ * Absent hardware errors or software bugs, this should
+ * be impossible, but log it anyway so we can debug it.
+ */
+ if (error != 0) {
+ zfs_dbgmsg(
+ "hdr %px, compress %d, psize %d, lsize %d",
+ hdr, arc_hdr_get_compress(hdr),
+ HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr));
+ if (hash_lock != NULL)
+ mutex_enter(hash_lock);
+ arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR);
+ if (hash_lock != NULL)
+ mutex_exit(hash_lock);
+ return (SET_ERROR(EIO));
+ }
+ }
+ }
+
+byteswap:
+ /* Byteswap the buf's data if necessary */
+ if (bswap != DMU_BSWAP_NUMFUNCS) {
+ ASSERT(!HDR_SHARED_DATA(hdr));
+ ASSERT3U(bswap, <, DMU_BSWAP_NUMFUNCS);
+ dmu_ot_byteswap[bswap].ob_func(buf->b_data, HDR_GET_LSIZE(hdr));
+ }
+
+ /* Compute the hdr's checksum if necessary */
+ arc_cksum_compute(buf);
+
+ return (0);
+}
+
+/*
+ * If this function is being called to decrypt an encrypted buffer or verify an
+ * authenticated one, the key must be loaded and a mapping must be made
+ * available in the keystore via spa_keystore_create_mapping() or one of its
+ * callers.
+ */
+int
+arc_untransform(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb,
+ boolean_t in_place)
+{
+ int ret;
+ arc_fill_flags_t flags = 0;
+
+ if (in_place)
+ flags |= ARC_FILL_IN_PLACE;
+
+ ret = arc_buf_fill(buf, spa, zb, flags);
+ if (ret == ECKSUM) {
+ /*
+ * Convert authentication and decryption errors to EIO
+ * (and generate an ereport) before leaving the ARC.
+ */
+ ret = SET_ERROR(EIO);
+ spa_log_error(spa, zb);
+ (void) zfs_ereport_post(FM_EREPORT_ZFS_AUTHENTICATION,
+ spa, NULL, zb, NULL, 0);
+ }
+
+ return (ret);
+}
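+
+/*
+ * Example (a minimal sketch): a consumer that finds an in-memory buffer
+ * still encrypted can ask the ARC to untransform it, provided a keystore
+ * mapping for the dataset already exists (see spa_keystore_create_mapping()
+ * as noted above). Passing B_FALSE requests the normal, not in-place, fill.
+ * The helper name and error handling are illustrative only.
+ */
+#if 0
+static int
+example_decrypt_buf(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb)
+{
+ if (!arc_is_encrypted(buf))
+ return (0);
+
+ return (arc_untransform(buf, spa, zb, B_FALSE));
+}
+#endif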
+
+/*
+ * Increment the amount of evictable space in the arc_state_t's refcount.
+ * We account for the space used by the hdr and the arc buf individually
+ * so that we can add and remove them from the refcount individually.
+ */
+static void
+arc_evictable_space_increment(arc_buf_hdr_t *hdr, arc_state_t *state)
+{
+ arc_buf_contents_t type = arc_buf_type(hdr);
+
+ ASSERT(HDR_HAS_L1HDR(hdr));
+
+ if (GHOST_STATE(state)) {
+ ASSERT0(hdr->b_l1hdr.b_bufcnt);
+ ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
+ ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
+ ASSERT(!HDR_HAS_RABD(hdr));
+ (void) zfs_refcount_add_many(&state->arcs_esize[type],
+ HDR_GET_LSIZE(hdr), hdr);
+ return;
+ }
+
+ ASSERT(!GHOST_STATE(state));
+ if (hdr->b_l1hdr.b_pabd != NULL) {
+ (void) zfs_refcount_add_many(&state->arcs_esize[type],
+ arc_hdr_size(hdr), hdr);
+ }
+ if (HDR_HAS_RABD(hdr)) {
+ (void) zfs_refcount_add_many(&state->arcs_esize[type],
+ HDR_GET_PSIZE(hdr), hdr);
+ }
+
+ for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
+ buf = buf->b_next) {
+ if (arc_buf_is_shared(buf))
+ continue;
+ (void) zfs_refcount_add_many(&state->arcs_esize[type],
+ arc_buf_size(buf), buf);
+ }
+}
+
+/*
+ * Decrement the amount of evictable space in the arc_state_t's refcount.
+ * We account for the space used by the hdr and the arc buf individually
+ * so that we can add and remove them from the refcount individually.
+ */
+static void
+arc_evictable_space_decrement(arc_buf_hdr_t *hdr, arc_state_t *state)
+{
+ arc_buf_contents_t type = arc_buf_type(hdr);
+
+ ASSERT(HDR_HAS_L1HDR(hdr));
+
+ if (GHOST_STATE(state)) {
+ ASSERT0(hdr->b_l1hdr.b_bufcnt);
+ ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
+ ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
+ ASSERT(!HDR_HAS_RABD(hdr));
+ (void) zfs_refcount_remove_many(&state->arcs_esize[type],
+ HDR_GET_LSIZE(hdr), hdr);
+ return;
+ }
+
+ ASSERT(!GHOST_STATE(state));
+ if (hdr->b_l1hdr.b_pabd != NULL) {
+ (void) zfs_refcount_remove_many(&state->arcs_esize[type],
+ arc_hdr_size(hdr), hdr);
+ }
+ if (HDR_HAS_RABD(hdr)) {
+ (void) zfs_refcount_remove_many(&state->arcs_esize[type],
+ HDR_GET_PSIZE(hdr), hdr);
+ }
+
+ for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
+ buf = buf->b_next) {
+ if (arc_buf_is_shared(buf))
+ continue;
+ (void) zfs_refcount_remove_many(&state->arcs_esize[type],
+ arc_buf_size(buf), buf);
+ }
+}
+
+/*
+ * Add a reference to this hdr indicating that someone is actively
+ * referencing that memory. When the refcount transitions from 0 to 1,
+ * we remove it from the respective arc_state_t list to indicate that
+ * it is not evictable.
+ */
+static void
+add_reference(arc_buf_hdr_t *hdr, void *tag)
+{
+ arc_state_t *state;
+
+ ASSERT(HDR_HAS_L1HDR(hdr));
+ if (!HDR_EMPTY(hdr) && !MUTEX_HELD(HDR_LOCK(hdr))) {
+ ASSERT(hdr->b_l1hdr.b_state == arc_anon);
+ ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
+ ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
+ }
+
+ state = hdr->b_l1hdr.b_state;
+
+ if ((zfs_refcount_add(&hdr->b_l1hdr.b_refcnt, tag) == 1) &&
+ (state != arc_anon)) {
+ /* We don't use the L2-only state list. */
+ if (state != arc_l2c_only) {
+ multilist_remove(state->arcs_list[arc_buf_type(hdr)],
+ hdr);
+ arc_evictable_space_decrement(hdr, state);
+ }
+ /* remove the prefetch flag if we get a reference */
+ if (HDR_HAS_L2HDR(hdr))
+ l2arc_hdr_arcstats_decrement_state(hdr);
+ arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH);
+ if (HDR_HAS_L2HDR(hdr))
+ l2arc_hdr_arcstats_increment_state(hdr);
+ }
+}
+
+/*
+ * Remove a reference from this hdr. When the reference transitions from
+ * 1 to 0 and we're not anonymous, then we add this hdr to the arc_state_t's
+ * list making it eligible for eviction.
+ */
+static int
+remove_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag)
+{
+ int cnt;
+ arc_state_t *state = hdr->b_l1hdr.b_state;
+
+ ASSERT(HDR_HAS_L1HDR(hdr));
+ ASSERT(state == arc_anon || MUTEX_HELD(hash_lock));
+ ASSERT(!GHOST_STATE(state));
+
+ /*
+ * arc_l2c_only counts as a ghost state so we don't need to explicitly
+ * check to prevent usage of the arc_l2c_only list.
+ */
+ if (((cnt = zfs_refcount_remove(&hdr->b_l1hdr.b_refcnt, tag)) == 0) &&
+ (state != arc_anon)) {
+ multilist_insert(state->arcs_list[arc_buf_type(hdr)], hdr);
+ ASSERT3U(hdr->b_l1hdr.b_bufcnt, >, 0);
+ arc_evictable_space_increment(hdr, state);
+ }
+ return (cnt);
+}
+
+/*
+ * Returns detailed information about a specific arc buffer. When the
+ * state_index argument is set the function will calculate the arc header
+ * list position for its arc state. Since this requires a linear traversal
+ * callers are strongly encourage not to do this. However, it can be helpful
+ * for targeted analysis so the functionality is provided.
+ */
+void
+arc_buf_info(arc_buf_t *ab, arc_buf_info_t *abi, int state_index)
+{
+ arc_buf_hdr_t *hdr = ab->b_hdr;
+ l1arc_buf_hdr_t *l1hdr = NULL;
+ l2arc_buf_hdr_t *l2hdr = NULL;
+ arc_state_t *state = NULL;
+
+ memset(abi, 0, sizeof (arc_buf_info_t));
+
+ if (hdr == NULL)
+ return;
+
+ abi->abi_flags = hdr->b_flags;
+
+ if (HDR_HAS_L1HDR(hdr)) {
+ l1hdr = &hdr->b_l1hdr;
+ state = l1hdr->b_state;
+ }
+ if (HDR_HAS_L2HDR(hdr))
+ l2hdr = &hdr->b_l2hdr;
+
+ if (l1hdr) {
+ abi->abi_bufcnt = l1hdr->b_bufcnt;
+ abi->abi_access = l1hdr->b_arc_access;
+ abi->abi_mru_hits = l1hdr->b_mru_hits;
+ abi->abi_mru_ghost_hits = l1hdr->b_mru_ghost_hits;
+ abi->abi_mfu_hits = l1hdr->b_mfu_hits;
+ abi->abi_mfu_ghost_hits = l1hdr->b_mfu_ghost_hits;
+ abi->abi_holds = zfs_refcount_count(&l1hdr->b_refcnt);
+ }
+
+ if (l2hdr) {
+ abi->abi_l2arc_dattr = l2hdr->b_daddr;
+ abi->abi_l2arc_hits = l2hdr->b_hits;
+ }
+
+ abi->abi_state_type = state ? state->arcs_state : ARC_STATE_ANON;
+ abi->abi_state_contents = arc_buf_type(hdr);
+ abi->abi_size = arc_hdr_size(hdr);
+}
+
+/*
+ * Move the supplied buffer to the indicated state. The hash lock
+ * for the buffer must be held by the caller.
+ */
+static void
+arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr,
+ kmutex_t *hash_lock)
+{
+ arc_state_t *old_state;
+ int64_t refcnt;
+ uint32_t bufcnt;
+ boolean_t update_old, update_new;
+ arc_buf_contents_t buftype = arc_buf_type(hdr);
+
+ /*
+ * We almost always have an L1 hdr here, since we call arc_hdr_realloc()
+ * in arc_read() when bringing a buffer out of the L2ARC. However, the
+ * L1 hdr doesn't always exist when we change state to arc_anon before
+ * destroying a header, in which case reallocating to add the L1 hdr is
+ * pointless.
+ */
+ if (HDR_HAS_L1HDR(hdr)) {
+ old_state = hdr->b_l1hdr.b_state;
+ refcnt = zfs_refcount_count(&hdr->b_l1hdr.b_refcnt);
+ bufcnt = hdr->b_l1hdr.b_bufcnt;
+ update_old = (bufcnt > 0 || hdr->b_l1hdr.b_pabd != NULL ||
+ HDR_HAS_RABD(hdr));
+ } else {
+ old_state = arc_l2c_only;
+ refcnt = 0;
+ bufcnt = 0;
+ update_old = B_FALSE;
+ }
+ update_new = update_old;
+
+ ASSERT(MUTEX_HELD(hash_lock));
+ ASSERT3P(new_state, !=, old_state);
+ ASSERT(!GHOST_STATE(new_state) || bufcnt == 0);
+ ASSERT(old_state != arc_anon || bufcnt <= 1);
+
+ /*
+ * If this buffer is evictable, transfer it from the
+ * old state list to the new state list.
+ */
+ if (refcnt == 0) {
+ if (old_state != arc_anon && old_state != arc_l2c_only) {
+ ASSERT(HDR_HAS_L1HDR(hdr));
+ multilist_remove(old_state->arcs_list[buftype], hdr);
+
+ if (GHOST_STATE(old_state)) {
+ ASSERT0(bufcnt);
+ ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
+ update_old = B_TRUE;
+ }
+ arc_evictable_space_decrement(hdr, old_state);
+ }
+ if (new_state != arc_anon && new_state != arc_l2c_only) {
+ /*
+ * An L1 header always exists here, since if we're
+ * moving to some L1-cached state (i.e. not l2c_only or
+ * anonymous), we realloc the header to add an L1hdr
+ * beforehand.
+ */
+ ASSERT(HDR_HAS_L1HDR(hdr));
+ multilist_insert(new_state->arcs_list[buftype], hdr);
+
+ if (GHOST_STATE(new_state)) {
+ ASSERT0(bufcnt);
+ ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
+ update_new = B_TRUE;
+ }
+ arc_evictable_space_increment(hdr, new_state);
+ }
+ }
+
+ ASSERT(!HDR_EMPTY(hdr));
+ if (new_state == arc_anon && HDR_IN_HASH_TABLE(hdr))
+ buf_hash_remove(hdr);
+
+ /* adjust state sizes (ignore arc_l2c_only) */
+
+ if (update_new && new_state != arc_l2c_only) {
+ ASSERT(HDR_HAS_L1HDR(hdr));
+ if (GHOST_STATE(new_state)) {
+ ASSERT0(bufcnt);
+
+ /*
+ * When moving a header to a ghost state, we first
+ * remove all arc buffers. Thus, we'll have a
+ * bufcnt of zero, and no arc buffer to use for
+ * the reference. As a result, we use the arc
+ * header pointer for the reference.
+ */
+ (void) zfs_refcount_add_many(&new_state->arcs_size,
+ HDR_GET_LSIZE(hdr), hdr);
+ ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
+ ASSERT(!HDR_HAS_RABD(hdr));
+ } else {
+ uint32_t buffers = 0;
+
+ /*
+ * Each individual buffer holds a unique reference,
+ * thus we must remove each of these references one
+ * at a time.
+ */
+ for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
+ buf = buf->b_next) {
+ ASSERT3U(bufcnt, !=, 0);
+ buffers++;
+
+ /*
+ * When the arc_buf_t is sharing the data
+ * block with the hdr, the owner of the
+ * reference belongs to the hdr. Only
+ * add to the refcount if the arc_buf_t is
+ * not shared.
+ */
+ if (arc_buf_is_shared(buf))
+ continue;
+
+ (void) zfs_refcount_add_many(
+ &new_state->arcs_size,
+ arc_buf_size(buf), buf);
+ }
+ ASSERT3U(bufcnt, ==, buffers);
+
+ if (hdr->b_l1hdr.b_pabd != NULL) {
+ (void) zfs_refcount_add_many(
+ &new_state->arcs_size,
+ arc_hdr_size(hdr), hdr);
+ }
+
+ if (HDR_HAS_RABD(hdr)) {
+ (void) zfs_refcount_add_many(
+ &new_state->arcs_size,
+ HDR_GET_PSIZE(hdr), hdr);
+ }
+ }
+ }
+
+ if (update_old && old_state != arc_l2c_only) {
+ ASSERT(HDR_HAS_L1HDR(hdr));
+ if (GHOST_STATE(old_state)) {
+ ASSERT0(bufcnt);
+ ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
+ ASSERT(!HDR_HAS_RABD(hdr));
+
+ /*
+ * When moving a header off of a ghost state,
+ * the header will not contain any arc buffers.
+ * We use the arc header pointer for the reference
+ * which is exactly what we did when we put the
+ * header on the ghost state.
+ */
+
+ (void) zfs_refcount_remove_many(&old_state->arcs_size,
+ HDR_GET_LSIZE(hdr), hdr);
+ } else {
+ uint32_t buffers = 0;
+
+ /*
+ * Each individual buffer holds a unique reference,
+ * thus we must remove each of these references one
+ * at a time.
+ */
+ for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
+ buf = buf->b_next) {
+ ASSERT3U(bufcnt, !=, 0);
+ buffers++;
+
+ /*
+ * When the arc_buf_t is sharing the data
+ * block with the hdr, the owner of the
+ * reference belongs to the hdr. Only
+ * add to the refcount if the arc_buf_t is
+ * not shared.
+ */
+ if (arc_buf_is_shared(buf))
+ continue;
+
+ (void) zfs_refcount_remove_many(
+ &old_state->arcs_size, arc_buf_size(buf),
+ buf);
+ }
+ ASSERT3U(bufcnt, ==, buffers);
+ ASSERT(hdr->b_l1hdr.b_pabd != NULL ||
+ HDR_HAS_RABD(hdr));
+
+ if (hdr->b_l1hdr.b_pabd != NULL) {
+ (void) zfs_refcount_remove_many(
+ &old_state->arcs_size, arc_hdr_size(hdr),
+ hdr);
+ }
+
+ if (HDR_HAS_RABD(hdr)) {
+ (void) zfs_refcount_remove_many(
+ &old_state->arcs_size, HDR_GET_PSIZE(hdr),
+ hdr);
+ }
+ }
+ }
+
+ if (HDR_HAS_L1HDR(hdr)) {
+ hdr->b_l1hdr.b_state = new_state;
+
+ if (HDR_HAS_L2HDR(hdr) && new_state != arc_l2c_only) {
+ l2arc_hdr_arcstats_decrement_state(hdr);
+ hdr->b_l2hdr.b_arcs_state = new_state->arcs_state;
+ l2arc_hdr_arcstats_increment_state(hdr);
+ }
+ }
+
+ /*
+ * L2 headers should never be on the L2 state list since they don't
+ * have L1 headers allocated.
+ */
+ ASSERT(multilist_is_empty(arc_l2c_only->arcs_list[ARC_BUFC_DATA]) &&
+ multilist_is_empty(arc_l2c_only->arcs_list[ARC_BUFC_METADATA]));
+}
+
+void
+arc_space_consume(uint64_t space, arc_space_type_t type)
+{
+ ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
+
+ switch (type) {
+ default:
+ break;
+ case ARC_SPACE_DATA:
+ aggsum_add(&astat_data_size, space);
+ break;
+ case ARC_SPACE_META:
+ aggsum_add(&astat_metadata_size, space);
+ break;
+ case ARC_SPACE_BONUS:
+ aggsum_add(&astat_bonus_size, space);
+ break;
+ case ARC_SPACE_DNODE:
+ aggsum_add(&astat_dnode_size, space);
+ break;
+ case ARC_SPACE_DBUF:
+ aggsum_add(&astat_dbuf_size, space);
+ break;
+ case ARC_SPACE_HDRS:
+ aggsum_add(&astat_hdr_size, space);
+ break;
+ case ARC_SPACE_L2HDRS:
+ aggsum_add(&astat_l2_hdr_size, space);
+ break;
+ case ARC_SPACE_ABD_CHUNK_WASTE:
+ /*
+ * Note: this includes space wasted by all scatter ABD's, not
+ * just those allocated by the ARC. But the vast majority of
+ * scatter ABD's come from the ARC, because other users are
+ * very short-lived.
+ */
+ aggsum_add(&astat_abd_chunk_waste_size, space);
+ break;
+ }
+
+ if (type != ARC_SPACE_DATA && type != ARC_SPACE_ABD_CHUNK_WASTE)
+ aggsum_add(&arc_meta_used, space);
+
+ aggsum_add(&arc_size, space);
+}
+
+void
+arc_space_return(uint64_t space, arc_space_type_t type)
+{
+ ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
+
+ switch (type) {
+ default:
+ break;
+ case ARC_SPACE_DATA:
+ aggsum_add(&astat_data_size, -space);
+ break;
+ case ARC_SPACE_META:
+ aggsum_add(&astat_metadata_size, -space);
+ break;
+ case ARC_SPACE_BONUS:
+ aggsum_add(&astat_bonus_size, -space);
+ break;
+ case ARC_SPACE_DNODE:
+ aggsum_add(&astat_dnode_size, -space);
+ break;
+ case ARC_SPACE_DBUF:
+ aggsum_add(&astat_dbuf_size, -space);
+ break;
+ case ARC_SPACE_HDRS:
+ aggsum_add(&astat_hdr_size, -space);
+ break;
+ case ARC_SPACE_L2HDRS:
+ aggsum_add(&astat_l2_hdr_size, -space);
+ break;
+ case ARC_SPACE_ABD_CHUNK_WASTE:
+ aggsum_add(&astat_abd_chunk_waste_size, -space);
+ break;
+ }
+
+ if (type != ARC_SPACE_DATA && type != ARC_SPACE_ABD_CHUNK_WASTE) {
+ ASSERT(aggsum_compare(&arc_meta_used, space) >= 0);
+ /*
+ * We use the upper bound here rather than the precise value
+ * because the arc_meta_max value doesn't need to be
+ * precise. It's only consumed by humans via arcstats.
+ */
+ if (arc_meta_max < aggsum_upper_bound(&arc_meta_used))
+ arc_meta_max = aggsum_upper_bound(&arc_meta_used);
+ aggsum_add(&arc_meta_used, -space);
+ }
+
+ ASSERT(aggsum_compare(&arc_size, space) >= 0);
+ aggsum_add(&arc_size, -space);
+}
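+
+/*
+ * Example (a minimal sketch): arc_space_consume() and arc_space_return() are
+ * meant to be called in balanced pairs with the same type around the lifetime
+ * of an externally accounted allocation. The 512-byte size and the
+ * ARC_SPACE_DNODE type below are illustrative only.
+ */
+#if 0
+ arc_space_consume(512, ARC_SPACE_DNODE);
+ /* ... the accounted object lives here ... */
+ arc_space_return(512, ARC_SPACE_DNODE);
+#endif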
+
+/*
+ * Given a hdr and a buf, returns whether that buf can share its b_data buffer
+ * with the hdr's b_pabd.
+ */
+static boolean_t
+arc_can_share(arc_buf_hdr_t *hdr, arc_buf_t *buf)
+{
+ /*
+ * The criteria for sharing a hdr's data are:
+ * 1. the buffer is not encrypted
+ * 2. the hdr's compression matches the buf's compression
+ * 3. the hdr doesn't need to be byteswapped
+ * 4. the hdr isn't already being shared
+ * 5. the buf is either compressed or it is the last buf in the hdr list
+ *
+ * Criterion #5 maintains the invariant that shared uncompressed
+ * bufs must be the final buf in the hdr's b_buf list. Reading this, you
+ * might ask, "if a compressed buf is allocated first, won't that be the
+ * last thing in the list?", but in that case it's impossible to create
+ * a shared uncompressed buf anyway (because the hdr must be compressed
+ * to have the compressed buf). You might also think that #3 is
+ * sufficient to make this guarantee, however it's possible
+ * (specifically in the rare L2ARC write race mentioned in
+ * arc_buf_alloc_impl()) there will be an existing uncompressed buf that
+ * is shareable, but wasn't at the time of its allocation. Rather than
+ * allow a new shared uncompressed buf to be created and then shuffle
+ * the list around to make it the last element, this simply disallows
+ * sharing if the new buf isn't the first to be added.
+ */
+ ASSERT3P(buf->b_hdr, ==, hdr);
+ boolean_t hdr_compressed =
+ arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF;
+ boolean_t buf_compressed = ARC_BUF_COMPRESSED(buf) != 0;
+ return (!ARC_BUF_ENCRYPTED(buf) &&
+ buf_compressed == hdr_compressed &&
+ hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS &&
+ !HDR_SHARED_DATA(hdr) &&
+ (ARC_BUF_LAST(buf) || ARC_BUF_COMPRESSED(buf)));
+}
+
+/*
+ * Allocate a buf for this hdr. If you care about the data that's in the hdr,
+ * or if you want a compressed buffer, pass those flags in. Returns 0 if the
+ * copy was made successfully, or an error code otherwise.
+ */
+static int
+arc_buf_alloc_impl(arc_buf_hdr_t *hdr, spa_t *spa, const zbookmark_phys_t *zb,
+ void *tag, boolean_t encrypted, boolean_t compressed, boolean_t noauth,
+ boolean_t fill, arc_buf_t **ret)
+{
+ arc_buf_t *buf;
+ arc_fill_flags_t flags = ARC_FILL_LOCKED;
+
+ ASSERT(HDR_HAS_L1HDR(hdr));
+ ASSERT3U(HDR_GET_LSIZE(hdr), >, 0);
+ VERIFY(hdr->b_type == ARC_BUFC_DATA ||
+ hdr->b_type == ARC_BUFC_METADATA);
+ ASSERT3P(ret, !=, NULL);
+ ASSERT3P(*ret, ==, NULL);
+ IMPLY(encrypted, compressed);
+
+ hdr->b_l1hdr.b_mru_hits = 0;
+ hdr->b_l1hdr.b_mru_ghost_hits = 0;
+ hdr->b_l1hdr.b_mfu_hits = 0;
+ hdr->b_l1hdr.b_mfu_ghost_hits = 0;
+ hdr->b_l1hdr.b_l2_hits = 0;
+
+ buf = *ret = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
+ buf->b_hdr = hdr;
+ buf->b_data = NULL;
+ buf->b_next = hdr->b_l1hdr.b_buf;
+ buf->b_flags = 0;
+
+ add_reference(hdr, tag);
+
+ /*
+ * We're about to change the hdr's b_flags. We must either
+ * hold the hash_lock or be undiscoverable.
+ */
+ ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
+
+ /*
+ * Only honor requests for compressed bufs if the hdr is actually
+ * compressed. This must be overridden if the buffer is encrypted since
+ * encrypted buffers cannot be decompressed.
+ */
+ if (encrypted) {
+ buf->b_flags |= ARC_BUF_FLAG_COMPRESSED;
+ buf->b_flags |= ARC_BUF_FLAG_ENCRYPTED;
+ flags |= ARC_FILL_COMPRESSED | ARC_FILL_ENCRYPTED;
+ } else if (compressed &&
+ arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF) {
+ buf->b_flags |= ARC_BUF_FLAG_COMPRESSED;
+ flags |= ARC_FILL_COMPRESSED;
+ }
+
+ if (noauth) {
+ ASSERT0(encrypted);
+ flags |= ARC_FILL_NOAUTH;
+ }
+
+ /*
+ * If the hdr's data can be shared then we share the data buffer and
+ * set the appropriate bit in the hdr's b_flags to indicate the hdr is
+ * sharing its b_pabd with the arc_buf_t. Otherwise, we allocate a new
+ * buffer to store the buf's data.
+ *
+ * There are two additional restrictions here because we're sharing
+ * hdr -> buf instead of the usual buf -> hdr. First, the hdr can't be
+ * actively involved in an L2ARC write, because if this buf is used by
+ * an arc_write() then the hdr's data buffer will be released when the
+ * write completes, even though the L2ARC write might still be using it.
+ * Second, the hdr's ABD must be linear so that the buf's user doesn't
+ * need to be ABD-aware. It must be allocated via
+ * zio_[data_]buf_alloc(), not as a page, because we need to be able
+ * to abd_release_ownership_of_buf(), which isn't allowed on "linear
+ * page" buffers because the ABD code needs to handle freeing them
+ * specially.
+ */
+ boolean_t can_share = arc_can_share(hdr, buf) &&
+ !HDR_L2_WRITING(hdr) &&
+ hdr->b_l1hdr.b_pabd != NULL &&
+ abd_is_linear(hdr->b_l1hdr.b_pabd) &&
+ !abd_is_linear_page(hdr->b_l1hdr.b_pabd);
+
+ /* Set up b_data and sharing */
+ if (can_share) {
+ buf->b_data = abd_to_buf(hdr->b_l1hdr.b_pabd);
+ buf->b_flags |= ARC_BUF_FLAG_SHARED;
+ arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA);
+ } else {
+ buf->b_data =
+ arc_get_data_buf(hdr, arc_buf_size(buf), buf);
+ ARCSTAT_INCR(arcstat_overhead_size, arc_buf_size(buf));
+ }
+ VERIFY3P(buf->b_data, !=, NULL);
+
+ hdr->b_l1hdr.b_buf = buf;
+ hdr->b_l1hdr.b_bufcnt += 1;
+ if (encrypted)
+ hdr->b_crypt_hdr.b_ebufcnt += 1;
+
+ /*
+ * If the user wants the data from the hdr, we need to either copy or
+ * decompress the data.
+ */
+ if (fill) {
+ ASSERT3P(zb, !=, NULL);
+ return (arc_buf_fill(buf, spa, zb, flags));
+ }
+
+ return (0);
+}
+
+static char *arc_onloan_tag = "onloan";
+
+static inline void
+arc_loaned_bytes_update(int64_t delta)
+{
+ atomic_add_64(&arc_loaned_bytes, delta);
+
+ /* assert that it did not wrap around */
+ ASSERT3S(atomic_add_64_nv(&arc_loaned_bytes, 0), >=, 0);
+}
+
+/*
+ * Loan out an anonymous arc buffer. Loaned buffers are not counted as in
+ * flight data by arc_tempreserve_space() until they are "returned". Loaned
+ * buffers must be returned to the arc before they can be used by the DMU or
+ * freed.
+ */
+arc_buf_t *
+arc_loan_buf(spa_t *spa, boolean_t is_metadata, int size)
+{
+ arc_buf_t *buf = arc_alloc_buf(spa, arc_onloan_tag,
+ is_metadata ? ARC_BUFC_METADATA : ARC_BUFC_DATA, size);
+
+ arc_loaned_bytes_update(arc_buf_size(buf));
+
+ return (buf);
+}
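+
+/*
+ * Example (a minimal sketch): the loan protocol described above. The buffer
+ * is filled by the caller and then either handed back via arc_return_buf()
+ * (shown below, transferring the hold from the loan tag to the caller's tag)
+ * or attached to a dbuf. The 16K size and tag usage are illustrative only.
+ */
+#if 0
+static arc_buf_t *
+example_loan_fill_return(spa_t *spa, void *tag)
+{
+ arc_buf_t *buf = arc_loan_buf(spa, B_FALSE, 16 * 1024);
+
+ /* ... fill buf->b_data with up to arc_buf_size(buf) bytes ... */
+
+ arc_return_buf(buf, tag);
+ return (buf);
+}
+#endif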
+
+arc_buf_t *
+arc_loan_compressed_buf(spa_t *spa, uint64_t psize, uint64_t lsize,
+ enum zio_compress compression_type, uint8_t complevel)
+{
+ arc_buf_t *buf = arc_alloc_compressed_buf(spa, arc_onloan_tag,
+ psize, lsize, compression_type, complevel);
+
+ arc_loaned_bytes_update(arc_buf_size(buf));
+
+ return (buf);
+}
+
+arc_buf_t *
+arc_loan_raw_buf(spa_t *spa, uint64_t dsobj, boolean_t byteorder,
+ const uint8_t *salt, const uint8_t *iv, const uint8_t *mac,
+ dmu_object_type_t ot, uint64_t psize, uint64_t lsize,
+ enum zio_compress compression_type, uint8_t complevel)
+{
+ arc_buf_t *buf = arc_alloc_raw_buf(spa, arc_onloan_tag, dsobj,
+ byteorder, salt, iv, mac, ot, psize, lsize, compression_type,
+ complevel);
+
+ atomic_add_64(&arc_loaned_bytes, psize);
+ return (buf);
+}
+
+/*
+ * Return a loaned arc buffer to the arc.
+ */
+void
+arc_return_buf(arc_buf_t *buf, void *tag)
+{
+ arc_buf_hdr_t *hdr = buf->b_hdr;
+
+ ASSERT3P(buf->b_data, !=, NULL);
+ ASSERT(HDR_HAS_L1HDR(hdr));
+ (void) zfs_refcount_add(&hdr->b_l1hdr.b_refcnt, tag);
+ (void) zfs_refcount_remove(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag);
+
+ arc_loaned_bytes_update(-arc_buf_size(buf));
+}
+
+/* Detach an arc_buf from a dbuf (tag) */
+void
+arc_loan_inuse_buf(arc_buf_t *buf, void *tag)
+{
+ arc_buf_hdr_t *hdr = buf->b_hdr;
+
+ ASSERT3P(buf->b_data, !=, NULL);
+ ASSERT(HDR_HAS_L1HDR(hdr));
+ (void) zfs_refcount_add(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag);
+ (void) zfs_refcount_remove(&hdr->b_l1hdr.b_refcnt, tag);
+
+ arc_loaned_bytes_update(arc_buf_size(buf));
+}
+
+static void
+l2arc_free_abd_on_write(abd_t *abd, size_t size, arc_buf_contents_t type)
+{
+ l2arc_data_free_t *df = kmem_alloc(sizeof (*df), KM_SLEEP);
+
+ df->l2df_abd = abd;
+ df->l2df_size = size;
+ df->l2df_type = type;
+ mutex_enter(&l2arc_free_on_write_mtx);
+ list_insert_head(l2arc_free_on_write, df);
+ mutex_exit(&l2arc_free_on_write_mtx);
+}
+
+static void
+arc_hdr_free_on_write(arc_buf_hdr_t *hdr, boolean_t free_rdata)
+{
+ arc_state_t *state = hdr->b_l1hdr.b_state;
+ arc_buf_contents_t type = arc_buf_type(hdr);
+ uint64_t size = (free_rdata) ? HDR_GET_PSIZE(hdr) : arc_hdr_size(hdr);
+
+ /* protected by hash lock, if in the hash table */
+ if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) {
+ ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
+ ASSERT(state != arc_anon && state != arc_l2c_only);
+
+ (void) zfs_refcount_remove_many(&state->arcs_esize[type],
+ size, hdr);
+ }
+ (void) zfs_refcount_remove_many(&state->arcs_size, size, hdr);
+ if (type == ARC_BUFC_METADATA) {
+ arc_space_return(size, ARC_SPACE_META);
+ } else {
+ ASSERT(type == ARC_BUFC_DATA);
+ arc_space_return(size, ARC_SPACE_DATA);
+ }
+
+ if (free_rdata) {
+ l2arc_free_abd_on_write(hdr->b_crypt_hdr.b_rabd, size, type);
+ } else {
+ l2arc_free_abd_on_write(hdr->b_l1hdr.b_pabd, size, type);
+ }
+}
+
+/*
+ * Share the arc_buf_t's data with the hdr. Whenever we are sharing the
+ * data buffer, we transfer the refcount ownership to the hdr and update
+ * the appropriate kstats.
+ */
+static void
+arc_share_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf)
+{
+ ASSERT(arc_can_share(hdr, buf));
+ ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
+ ASSERT(!ARC_BUF_ENCRYPTED(buf));
+ ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
+
+ /*
+ * Start sharing the data buffer. We transfer the
+ * refcount ownership to the hdr since it always owns
+ * the refcount whenever an arc_buf_t is shared.
+ */
+ zfs_refcount_transfer_ownership_many(&hdr->b_l1hdr.b_state->arcs_size,
+ arc_hdr_size(hdr), buf, hdr);
+ hdr->b_l1hdr.b_pabd = abd_get_from_buf(buf->b_data, arc_buf_size(buf));
+ abd_take_ownership_of_buf(hdr->b_l1hdr.b_pabd,
+ HDR_ISTYPE_METADATA(hdr));
+ arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA);
+ buf->b_flags |= ARC_BUF_FLAG_SHARED;
+
+ /*
+ * Since we've transferred ownership to the hdr we need
+ * to increment its compressed and uncompressed kstats and
+ * decrement the overhead size.
+ */
+ ARCSTAT_INCR(arcstat_compressed_size, arc_hdr_size(hdr));
+ ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr));
+ ARCSTAT_INCR(arcstat_overhead_size, -arc_buf_size(buf));
+}
+
+static void
+arc_unshare_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf)
+{
+ ASSERT(arc_buf_is_shared(buf));
+ ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
+ ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
+
+ /*
+ * We are no longer sharing this buffer so we need
+ * to transfer its ownership to the rightful owner.
+ */
+ zfs_refcount_transfer_ownership_many(&hdr->b_l1hdr.b_state->arcs_size,
+ arc_hdr_size(hdr), hdr, buf);
+ arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA);
+ abd_release_ownership_of_buf(hdr->b_l1hdr.b_pabd);
+ abd_free(hdr->b_l1hdr.b_pabd);
+ hdr->b_l1hdr.b_pabd = NULL;
+ buf->b_flags &= ~ARC_BUF_FLAG_SHARED;
+
+ /*
+ * Since the buffer is no longer shared between
+ * the arc buf and the hdr, count it as overhead.
+ */
+ ARCSTAT_INCR(arcstat_compressed_size, -arc_hdr_size(hdr));
+ ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr));
+ ARCSTAT_INCR(arcstat_overhead_size, arc_buf_size(buf));
+}
+
+/*
+ * Remove an arc_buf_t from the hdr's buf list and return the last
+ * arc_buf_t on the list. If no buffers remain on the list then return
+ * NULL.
+ */
+static arc_buf_t *
+arc_buf_remove(arc_buf_hdr_t *hdr, arc_buf_t *buf)
+{
+ ASSERT(HDR_HAS_L1HDR(hdr));
+ ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
+
+ arc_buf_t **bufp = &hdr->b_l1hdr.b_buf;
+ arc_buf_t *lastbuf = NULL;
+
+ /*
+ * Remove the buf from the hdr list and locate the last
+ * remaining buffer on the list.
+ */
+ while (*bufp != NULL) {
+ if (*bufp == buf)
+ *bufp = buf->b_next;
+
+ /*
+ * If we've removed a buffer in the middle of
+ * the list then update lastbuf and advance
+ * bufp.
+ */
+ if (*bufp != NULL) {
+ lastbuf = *bufp;
+ bufp = &(*bufp)->b_next;
+ }
+ }
+ buf->b_next = NULL;
+ ASSERT3P(lastbuf, !=, buf);
+ IMPLY(hdr->b_l1hdr.b_bufcnt > 0, lastbuf != NULL);
+ IMPLY(hdr->b_l1hdr.b_bufcnt > 0, hdr->b_l1hdr.b_buf != NULL);
+ IMPLY(lastbuf != NULL, ARC_BUF_LAST(lastbuf));
+
+ return (lastbuf);
+}
+
+/*
+ * Free up buf->b_data and pull the arc_buf_t off of the arc_buf_hdr_t's
+ * list and free it.
+ */
+static void
+arc_buf_destroy_impl(arc_buf_t *buf)
+{
+ arc_buf_hdr_t *hdr = buf->b_hdr;
+
+ /*
+ * Free up the data associated with the buf but only if we're not
+ * sharing this with the hdr. If we are sharing it with the hdr, the
+ * hdr is responsible for doing the free.
+ */
+ if (buf->b_data != NULL) {
+ /*
+ * We're about to change the hdr's b_flags. We must either
+ * hold the hash_lock or be undiscoverable.
+ */
+ ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
+
+ arc_cksum_verify(buf);
+ arc_buf_unwatch(buf);
+
+ if (arc_buf_is_shared(buf)) {
+ arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA);
+ } else {
+ uint64_t size = arc_buf_size(buf);
+ arc_free_data_buf(hdr, buf->b_data, size, buf);
+ ARCSTAT_INCR(arcstat_overhead_size, -size);
+ }
+ buf->b_data = NULL;
+
+ ASSERT(hdr->b_l1hdr.b_bufcnt > 0);
+ hdr->b_l1hdr.b_bufcnt -= 1;
+
+ if (ARC_BUF_ENCRYPTED(buf)) {
+ hdr->b_crypt_hdr.b_ebufcnt -= 1;
+
+ /*
+ * If we have no more encrypted buffers and we've
+ * already gotten a copy of the decrypted data we can
+ * free b_rabd to save some space.
+ */
+ if (hdr->b_crypt_hdr.b_ebufcnt == 0 &&
+ HDR_HAS_RABD(hdr) && hdr->b_l1hdr.b_pabd != NULL &&
+ !HDR_IO_IN_PROGRESS(hdr)) {
+ arc_hdr_free_abd(hdr, B_TRUE);
+ }
+ }
+ }
+
+ arc_buf_t *lastbuf = arc_buf_remove(hdr, buf);
+
+ if (ARC_BUF_SHARED(buf) && !ARC_BUF_COMPRESSED(buf)) {
+ /*
+ * If the current arc_buf_t is sharing its data buffer with the
+ * hdr, then reassign the hdr's b_pabd to share it with the new
+ * buffer at the end of the list. The shared buffer is always
+ * the last one on the hdr's buffer list.
+ *
+ * There is an equivalent case for compressed bufs, but since
+ * they aren't guaranteed to be the last buf in the list and
+ * that is an exceedingly rare case, we just allow that space to be
+ * wasted temporarily. We must also be careful not to share
+ * encrypted buffers, since they cannot be shared.
+ */
+ if (lastbuf != NULL && !ARC_BUF_ENCRYPTED(lastbuf)) {
+ /* Only one buf can be shared at once */
+ VERIFY(!arc_buf_is_shared(lastbuf));
+ /* hdr is uncompressed so can't have compressed buf */
+ VERIFY(!ARC_BUF_COMPRESSED(lastbuf));
+
+ ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
+ arc_hdr_free_abd(hdr, B_FALSE);
+
+ /*
+ * We must setup a new shared block between the
+ * last buffer and the hdr. The data would have
+ * been allocated by the arc buf so we need to transfer
+ * ownership to the hdr since it's now being shared.
+ */
+ arc_share_buf(hdr, lastbuf);
+ }
+ } else if (HDR_SHARED_DATA(hdr)) {
+ /*
+ * Uncompressed shared buffers are always at the end
+ * of the list. Compressed buffers don't have the
+ * same requirements. This makes it hard to
+ * simply assert that the lastbuf is shared so
+ * we rely on the hdr's compression flags to determine
+ * if we have a compressed, shared buffer.
+ */
+ ASSERT3P(lastbuf, !=, NULL);
+ ASSERT(arc_buf_is_shared(lastbuf) ||
+ arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF);
+ }
+
+ /*
+ * Free the checksum if we're removing the last uncompressed buf from
+ * this hdr.
+ */
+ if (!arc_hdr_has_uncompressed_buf(hdr)) {
+ arc_cksum_free(hdr);
+ }
+
+ /* clean up the buf */
+ buf->b_hdr = NULL;
+ kmem_cache_free(buf_cache, buf);
+}
+
+static void
+arc_hdr_alloc_abd(arc_buf_hdr_t *hdr, int alloc_flags)
+{
+ uint64_t size;
+ boolean_t alloc_rdata = ((alloc_flags & ARC_HDR_ALLOC_RDATA) != 0);
+ boolean_t do_adapt = ((alloc_flags & ARC_HDR_DO_ADAPT) != 0);
+
+ ASSERT3U(HDR_GET_LSIZE(hdr), >, 0);
+ ASSERT(HDR_HAS_L1HDR(hdr));
+ ASSERT(!HDR_SHARED_DATA(hdr) || alloc_rdata);
+ IMPLY(alloc_rdata, HDR_PROTECTED(hdr));
+
+ if (alloc_rdata) {
+ size = HDR_GET_PSIZE(hdr);
+ ASSERT3P(hdr->b_crypt_hdr.b_rabd, ==, NULL);
+ hdr->b_crypt_hdr.b_rabd = arc_get_data_abd(hdr, size, hdr,
+ do_adapt);
+ ASSERT3P(hdr->b_crypt_hdr.b_rabd, !=, NULL);
+ ARCSTAT_INCR(arcstat_raw_size, size);
+ } else {
+ size = arc_hdr_size(hdr);
+ ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
+ hdr->b_l1hdr.b_pabd = arc_get_data_abd(hdr, size, hdr,
+ do_adapt);
+ ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
+ }
+
+ ARCSTAT_INCR(arcstat_compressed_size, size);
+ ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr));
+}
+
+static void
+arc_hdr_free_abd(arc_buf_hdr_t *hdr, boolean_t free_rdata)
+{
+ uint64_t size = (free_rdata) ? HDR_GET_PSIZE(hdr) : arc_hdr_size(hdr);
+
+ ASSERT(HDR_HAS_L1HDR(hdr));
+ ASSERT(hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr));
+ IMPLY(free_rdata, HDR_HAS_RABD(hdr));
+
+ /*
+ * If the hdr is currently being written to the l2arc then
+ * we defer freeing the data by adding it to the l2arc_free_on_write
+ * list. The l2arc will free the data once it's finished
+ * writing it to the l2arc device.
+ */
+ if (HDR_L2_WRITING(hdr)) {
+ arc_hdr_free_on_write(hdr, free_rdata);
+ ARCSTAT_BUMP(arcstat_l2_free_on_write);
+ } else if (free_rdata) {
+ arc_free_data_abd(hdr, hdr->b_crypt_hdr.b_rabd, size, hdr);
+ } else {
+ arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd, size, hdr);
+ }
+
+ if (free_rdata) {
+ hdr->b_crypt_hdr.b_rabd = NULL;
+ ARCSTAT_INCR(arcstat_raw_size, -size);
+ } else {
+ hdr->b_l1hdr.b_pabd = NULL;
+ }
+
+ if (hdr->b_l1hdr.b_pabd == NULL && !HDR_HAS_RABD(hdr))
+ hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS;
+
+ ARCSTAT_INCR(arcstat_compressed_size, -size);
+ ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr));
+}
+
+static arc_buf_hdr_t *
+arc_hdr_alloc(uint64_t spa, int32_t psize, int32_t lsize,
+ boolean_t protected, enum zio_compress compression_type, uint8_t complevel,
+ arc_buf_contents_t type, boolean_t alloc_rdata)
+{
+ arc_buf_hdr_t *hdr;
+ int flags = ARC_HDR_DO_ADAPT;
+
+ VERIFY(type == ARC_BUFC_DATA || type == ARC_BUFC_METADATA);
+ if (protected) {
+ hdr = kmem_cache_alloc(hdr_full_crypt_cache, KM_PUSHPAGE);
+ } else {
+ hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE);
+ }
+ flags |= alloc_rdata ? ARC_HDR_ALLOC_RDATA : 0;
+
+ ASSERT(HDR_EMPTY(hdr));
+ ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);
+ HDR_SET_PSIZE(hdr, psize);
+ HDR_SET_LSIZE(hdr, lsize);
+ hdr->b_spa = spa;
+ hdr->b_type = type;
+ hdr->b_flags = 0;
+ arc_hdr_set_flags(hdr, arc_bufc_to_flags(type) | ARC_FLAG_HAS_L1HDR);
+ arc_hdr_set_compress(hdr, compression_type);
+ hdr->b_complevel = complevel;
+ if (protected)
+ arc_hdr_set_flags(hdr, ARC_FLAG_PROTECTED);
+
+ hdr->b_l1hdr.b_state = arc_anon;
+ hdr->b_l1hdr.b_arc_access = 0;
+ hdr->b_l1hdr.b_bufcnt = 0;
+ hdr->b_l1hdr.b_buf = NULL;
+
+ /*
+ * Allocate the hdr's buffer. This will contain either
+ * the compressed or uncompressed data depending on the block
+ * it references and compressed arc enablement.
+ */
+ arc_hdr_alloc_abd(hdr, flags);
+ ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
+
+ return (hdr);
+}
+
+/*
+ * Transition between the two allocation states for the arc_buf_hdr struct.
+ * The arc_buf_hdr struct can be allocated with (hdr_full_cache) or without
+ * (hdr_l2only_cache) the fields necessary for the L1 cache - the smaller
+ * version is used when a cache buffer is only in the L2ARC in order to reduce
+ * memory usage.
+ */
+static arc_buf_hdr_t *
+arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new)
+{
+ ASSERT(HDR_HAS_L2HDR(hdr));
+
+ arc_buf_hdr_t *nhdr;
+ l2arc_dev_t *dev = hdr->b_l2hdr.b_dev;
+
+ ASSERT((old == hdr_full_cache && new == hdr_l2only_cache) ||
+ (old == hdr_l2only_cache && new == hdr_full_cache));
+
+ /*
+ * If the caller wanted a new full header and the header is to be
+ * encrypted we will actually allocate the header from the full crypt
+ * cache instead. The same applies to freeing from the old cache.
+ */
+ if (HDR_PROTECTED(hdr) && new == hdr_full_cache)
+ new = hdr_full_crypt_cache;
+ if (HDR_PROTECTED(hdr) && old == hdr_full_cache)
+ old = hdr_full_crypt_cache;
+
+ nhdr = kmem_cache_alloc(new, KM_PUSHPAGE);
+
+ ASSERT(MUTEX_HELD(HDR_LOCK(hdr)));
+ buf_hash_remove(hdr);
+
+ bcopy(hdr, nhdr, HDR_L2ONLY_SIZE);
+
+ if (new == hdr_full_cache || new == hdr_full_crypt_cache) {
+ arc_hdr_set_flags(nhdr, ARC_FLAG_HAS_L1HDR);
+ /*
+ * arc_access and arc_change_state need to be aware that a
+ * header has just come out of L2ARC, so we set its state to
+ * l2c_only even though it's about to change.
+ */
+ nhdr->b_l1hdr.b_state = arc_l2c_only;
+
+		/* Verify previous threads set these to NULL before freeing */
+ ASSERT3P(nhdr->b_l1hdr.b_pabd, ==, NULL);
+ ASSERT(!HDR_HAS_RABD(hdr));
+ } else {
+ ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
+ ASSERT0(hdr->b_l1hdr.b_bufcnt);
+ ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);
+
+ /*
+		 * If we've reached here, we must have been called from
+ * arc_evict_hdr(), as such we should have already been
+ * removed from any ghost list we were previously on
+ * (which protects us from racing with arc_evict_state),
+ * thus no locking is needed during this check.
+ */
+ ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
+
+ /*
+ * A buffer must not be moved into the arc_l2c_only
+		 * state if it hasn't finished being written out to the
+		 * l2arc device. Otherwise, the b_l1hdr.b_pabd field
+		 * might be accessed even though it has been removed.
+ */
+ VERIFY(!HDR_L2_WRITING(hdr));
+ VERIFY3P(hdr->b_l1hdr.b_pabd, ==, NULL);
+ ASSERT(!HDR_HAS_RABD(hdr));
+
+ arc_hdr_clear_flags(nhdr, ARC_FLAG_HAS_L1HDR);
+ }
+ /*
+ * The header has been reallocated so we need to re-insert it into any
+ * lists it was on.
+ */
+ (void) buf_hash_insert(nhdr, NULL);
+
+ ASSERT(list_link_active(&hdr->b_l2hdr.b_l2node));
+
+ mutex_enter(&dev->l2ad_mtx);
+
+ /*
+ * We must place the realloc'ed header back into the list at
+ * the same spot. Otherwise, if it's placed earlier in the list,
+ * l2arc_write_buffers() could find it during the function's
+ * write phase, and try to write it out to the l2arc.
+ */
+ list_insert_after(&dev->l2ad_buflist, hdr, nhdr);
+ list_remove(&dev->l2ad_buflist, hdr);
+
+ mutex_exit(&dev->l2ad_mtx);
+
+ /*
+ * Since we're using the pointer address as the tag when
+ * incrementing and decrementing the l2ad_alloc refcount, we
+ * must remove the old pointer (that we're about to destroy) and
+ * add the new pointer to the refcount. Otherwise we'd remove
+ * the wrong pointer address when calling arc_hdr_destroy() later.
+ */
+
+ (void) zfs_refcount_remove_many(&dev->l2ad_alloc,
+ arc_hdr_size(hdr), hdr);
+ (void) zfs_refcount_add_many(&dev->l2ad_alloc,
+ arc_hdr_size(nhdr), nhdr);
+
+ buf_discard_identity(hdr);
+ kmem_cache_free(old, hdr);
+
+ return (nhdr);
+}
+
+/*
+ * This function allows an L1 header to be reallocated as a crypt
+ * header and vice versa. If we are going to a crypt header, the
+ * new fields will be zeroed out.
+ */
+static arc_buf_hdr_t *
+arc_hdr_realloc_crypt(arc_buf_hdr_t *hdr, boolean_t need_crypt)
+{
+ arc_buf_hdr_t *nhdr;
+ arc_buf_t *buf;
+ kmem_cache_t *ncache, *ocache;
+ unsigned nsize, osize;
+
+ /*
+ * This function requires that hdr is in the arc_anon state.
+ * Therefore it won't have any L2ARC data for us to worry
+ * about copying.
+ */
+ ASSERT(HDR_HAS_L1HDR(hdr));
+ ASSERT(!HDR_HAS_L2HDR(hdr));
+ ASSERT3U(!!HDR_PROTECTED(hdr), !=, need_crypt);
+ ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
+ ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
+ ASSERT(!list_link_active(&hdr->b_l2hdr.b_l2node));
+ ASSERT3P(hdr->b_hash_next, ==, NULL);
+
+ if (need_crypt) {
+ ncache = hdr_full_crypt_cache;
+ nsize = sizeof (hdr->b_crypt_hdr);
+ ocache = hdr_full_cache;
+ osize = HDR_FULL_SIZE;
+ } else {
+ ncache = hdr_full_cache;
+ nsize = HDR_FULL_SIZE;
+ ocache = hdr_full_crypt_cache;
+ osize = sizeof (hdr->b_crypt_hdr);
+ }
+
+ nhdr = kmem_cache_alloc(ncache, KM_PUSHPAGE);
+
+ /*
+ * Copy all members that aren't locks or condvars to the new header.
+ * No lists are pointing to us (as we asserted above), so we don't
+ * need to worry about the list nodes.
+ */
+ nhdr->b_dva = hdr->b_dva;
+ nhdr->b_birth = hdr->b_birth;
+ nhdr->b_type = hdr->b_type;
+ nhdr->b_flags = hdr->b_flags;
+ nhdr->b_psize = hdr->b_psize;
+ nhdr->b_lsize = hdr->b_lsize;
+ nhdr->b_spa = hdr->b_spa;
+ nhdr->b_l1hdr.b_freeze_cksum = hdr->b_l1hdr.b_freeze_cksum;
+ nhdr->b_l1hdr.b_bufcnt = hdr->b_l1hdr.b_bufcnt;
+ nhdr->b_l1hdr.b_byteswap = hdr->b_l1hdr.b_byteswap;
+ nhdr->b_l1hdr.b_state = hdr->b_l1hdr.b_state;
+ nhdr->b_l1hdr.b_arc_access = hdr->b_l1hdr.b_arc_access;
+ nhdr->b_l1hdr.b_mru_hits = hdr->b_l1hdr.b_mru_hits;
+ nhdr->b_l1hdr.b_mru_ghost_hits = hdr->b_l1hdr.b_mru_ghost_hits;
+ nhdr->b_l1hdr.b_mfu_hits = hdr->b_l1hdr.b_mfu_hits;
+ nhdr->b_l1hdr.b_mfu_ghost_hits = hdr->b_l1hdr.b_mfu_ghost_hits;
+ nhdr->b_l1hdr.b_l2_hits = hdr->b_l1hdr.b_l2_hits;
+ nhdr->b_l1hdr.b_acb = hdr->b_l1hdr.b_acb;
+ nhdr->b_l1hdr.b_pabd = hdr->b_l1hdr.b_pabd;
+
+ /*
+ * This zfs_refcount_add() exists only to ensure that the individual
+ * arc buffers always point to a header that is referenced, avoiding
+ * a small race condition that could trigger ASSERTs.
+ */
+ (void) zfs_refcount_add(&nhdr->b_l1hdr.b_refcnt, FTAG);
+ nhdr->b_l1hdr.b_buf = hdr->b_l1hdr.b_buf;
+ for (buf = nhdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next) {
+ mutex_enter(&buf->b_evict_lock);
+ buf->b_hdr = nhdr;
+ mutex_exit(&buf->b_evict_lock);
+ }
+
+ zfs_refcount_transfer(&nhdr->b_l1hdr.b_refcnt, &hdr->b_l1hdr.b_refcnt);
+ (void) zfs_refcount_remove(&nhdr->b_l1hdr.b_refcnt, FTAG);
+ ASSERT0(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt));
+
+ if (need_crypt) {
+ arc_hdr_set_flags(nhdr, ARC_FLAG_PROTECTED);
+ } else {
+ arc_hdr_clear_flags(nhdr, ARC_FLAG_PROTECTED);
+ }
+
+ /* unset all members of the original hdr */
+ bzero(&hdr->b_dva, sizeof (dva_t));
+ hdr->b_birth = 0;
+ hdr->b_type = ARC_BUFC_INVALID;
+ hdr->b_flags = 0;
+ hdr->b_psize = 0;
+ hdr->b_lsize = 0;
+ hdr->b_spa = 0;
+ hdr->b_l1hdr.b_freeze_cksum = NULL;
+ hdr->b_l1hdr.b_buf = NULL;
+ hdr->b_l1hdr.b_bufcnt = 0;
+ hdr->b_l1hdr.b_byteswap = 0;
+ hdr->b_l1hdr.b_state = NULL;
+ hdr->b_l1hdr.b_arc_access = 0;
+ hdr->b_l1hdr.b_mru_hits = 0;
+ hdr->b_l1hdr.b_mru_ghost_hits = 0;
+ hdr->b_l1hdr.b_mfu_hits = 0;
+ hdr->b_l1hdr.b_mfu_ghost_hits = 0;
+ hdr->b_l1hdr.b_l2_hits = 0;
+ hdr->b_l1hdr.b_acb = NULL;
+ hdr->b_l1hdr.b_pabd = NULL;
+
+ if (ocache == hdr_full_crypt_cache) {
+ ASSERT(!HDR_HAS_RABD(hdr));
+ hdr->b_crypt_hdr.b_ot = DMU_OT_NONE;
+ hdr->b_crypt_hdr.b_ebufcnt = 0;
+ hdr->b_crypt_hdr.b_dsobj = 0;
+ bzero(hdr->b_crypt_hdr.b_salt, ZIO_DATA_SALT_LEN);
+ bzero(hdr->b_crypt_hdr.b_iv, ZIO_DATA_IV_LEN);
+ bzero(hdr->b_crypt_hdr.b_mac, ZIO_DATA_MAC_LEN);
+ }
+
+ buf_discard_identity(hdr);
+ kmem_cache_free(ocache, hdr);
+
+ return (nhdr);
+}
+
+/*
+ * This function is used by the send / receive code to convert a newly
+ * allocated arc_buf_t to one that is suitable for a raw encrypted write. It
+ * is also used to allow the root objset block to be updated without altering
+ * its embedded MACs. Both block types will always be uncompressed so we do not
+ * have to worry about compression type or psize.
+ */
+void
+arc_convert_to_raw(arc_buf_t *buf, uint64_t dsobj, boolean_t byteorder,
+ dmu_object_type_t ot, const uint8_t *salt, const uint8_t *iv,
+ const uint8_t *mac)
+{
+ arc_buf_hdr_t *hdr = buf->b_hdr;
+
+ ASSERT(ot == DMU_OT_DNODE || ot == DMU_OT_OBJSET);
+ ASSERT(HDR_HAS_L1HDR(hdr));
+ ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
+
+ buf->b_flags |= (ARC_BUF_FLAG_COMPRESSED | ARC_BUF_FLAG_ENCRYPTED);
+ if (!HDR_PROTECTED(hdr))
+ hdr = arc_hdr_realloc_crypt(hdr, B_TRUE);
+ hdr->b_crypt_hdr.b_dsobj = dsobj;
+ hdr->b_crypt_hdr.b_ot = ot;
+ hdr->b_l1hdr.b_byteswap = (byteorder == ZFS_HOST_BYTEORDER) ?
+ DMU_BSWAP_NUMFUNCS : DMU_OT_BYTESWAP(ot);
+ if (!arc_hdr_has_uncompressed_buf(hdr))
+ arc_cksum_free(hdr);
+
+ if (salt != NULL)
+ bcopy(salt, hdr->b_crypt_hdr.b_salt, ZIO_DATA_SALT_LEN);
+ if (iv != NULL)
+ bcopy(iv, hdr->b_crypt_hdr.b_iv, ZIO_DATA_IV_LEN);
+ if (mac != NULL)
+ bcopy(mac, hdr->b_crypt_hdr.b_mac, ZIO_DATA_MAC_LEN);
+}
+
+/*
+ * Allocate a new arc_buf_hdr_t and arc_buf_t and return the buf to the caller.
+ * The buf is returned thawed since we expect the consumer to modify it.
+ */
+arc_buf_t *
+arc_alloc_buf(spa_t *spa, void *tag, arc_buf_contents_t type, int32_t size)
+{
+ arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), size, size,
+ B_FALSE, ZIO_COMPRESS_OFF, 0, type, B_FALSE);
+
+ arc_buf_t *buf = NULL;
+ VERIFY0(arc_buf_alloc_impl(hdr, spa, NULL, tag, B_FALSE, B_FALSE,
+ B_FALSE, B_FALSE, &buf));
+ arc_buf_thaw(buf);
+
+ return (buf);
+}
+
+/*
+ * Allocate a compressed buf in the same manner as arc_alloc_buf. Don't use this
+ * for bufs containing metadata.
+ */
+arc_buf_t *
+arc_alloc_compressed_buf(spa_t *spa, void *tag, uint64_t psize, uint64_t lsize,
+ enum zio_compress compression_type, uint8_t complevel)
+{
+ ASSERT3U(lsize, >, 0);
+ ASSERT3U(lsize, >=, psize);
+ ASSERT3U(compression_type, >, ZIO_COMPRESS_OFF);
+ ASSERT3U(compression_type, <, ZIO_COMPRESS_FUNCTIONS);
+
+ arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize,
+ B_FALSE, compression_type, complevel, ARC_BUFC_DATA, B_FALSE);
+
+ arc_buf_t *buf = NULL;
+ VERIFY0(arc_buf_alloc_impl(hdr, spa, NULL, tag, B_FALSE,
+ B_TRUE, B_FALSE, B_FALSE, &buf));
+ arc_buf_thaw(buf);
+ ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);
+
+ if (!arc_buf_is_shared(buf)) {
+ /*
+ * To ensure that the hdr has the correct data in it if we call
+ * arc_untransform() on this buf before it's been written to
+ * disk, it's easiest if we just set up sharing between the
+ * buf and the hdr.
+ */
+ arc_hdr_free_abd(hdr, B_FALSE);
+ arc_share_buf(hdr, buf);
+ }
+
+ return (buf);
+}
+
+arc_buf_t *
+arc_alloc_raw_buf(spa_t *spa, void *tag, uint64_t dsobj, boolean_t byteorder,
+ const uint8_t *salt, const uint8_t *iv, const uint8_t *mac,
+ dmu_object_type_t ot, uint64_t psize, uint64_t lsize,
+ enum zio_compress compression_type, uint8_t complevel)
+{
+ arc_buf_hdr_t *hdr;
+ arc_buf_t *buf;
+ arc_buf_contents_t type = DMU_OT_IS_METADATA(ot) ?
+ ARC_BUFC_METADATA : ARC_BUFC_DATA;
+
+ ASSERT3U(lsize, >, 0);
+ ASSERT3U(lsize, >=, psize);
+ ASSERT3U(compression_type, >=, ZIO_COMPRESS_OFF);
+ ASSERT3U(compression_type, <, ZIO_COMPRESS_FUNCTIONS);
+
+ hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize, B_TRUE,
+ compression_type, complevel, type, B_TRUE);
+
+ hdr->b_crypt_hdr.b_dsobj = dsobj;
+ hdr->b_crypt_hdr.b_ot = ot;
+ hdr->b_l1hdr.b_byteswap = (byteorder == ZFS_HOST_BYTEORDER) ?
+ DMU_BSWAP_NUMFUNCS : DMU_OT_BYTESWAP(ot);
+ bcopy(salt, hdr->b_crypt_hdr.b_salt, ZIO_DATA_SALT_LEN);
+ bcopy(iv, hdr->b_crypt_hdr.b_iv, ZIO_DATA_IV_LEN);
+ bcopy(mac, hdr->b_crypt_hdr.b_mac, ZIO_DATA_MAC_LEN);
+
+ /*
+ * This buffer will be considered encrypted even if the ot is not an
+ * encrypted type. It will become authenticated instead in
+ * arc_write_ready().
+ */
+ buf = NULL;
+ VERIFY0(arc_buf_alloc_impl(hdr, spa, NULL, tag, B_TRUE, B_TRUE,
+ B_FALSE, B_FALSE, &buf));
+ arc_buf_thaw(buf);
+ ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);
+
+ return (buf);
+}
+
+static void
+l2arc_hdr_arcstats_update(arc_buf_hdr_t *hdr, boolean_t incr,
+ boolean_t state_only)
+{
+ l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr;
+ l2arc_dev_t *dev = l2hdr->b_dev;
+ uint64_t lsize = HDR_GET_LSIZE(hdr);
+ uint64_t psize = HDR_GET_PSIZE(hdr);
+ uint64_t asize = vdev_psize_to_asize(dev->l2ad_vdev, psize);
+ arc_buf_contents_t type = hdr->b_type;
+ int64_t lsize_s;
+ int64_t psize_s;
+ int64_t asize_s;
+
+ if (incr) {
+ lsize_s = lsize;
+ psize_s = psize;
+ asize_s = asize;
+ } else {
+ lsize_s = -lsize;
+ psize_s = -psize;
+ asize_s = -asize;
+ }
+
+ /* If the buffer is a prefetch, count it as such. */
+ if (HDR_PREFETCH(hdr)) {
+ ARCSTAT_INCR(arcstat_l2_prefetch_asize, asize_s);
+ } else {
+ /*
+ * We use the value stored in the L2 header upon initial
+ * caching in L2ARC. This value will be updated in case
+ * an MRU/MRU_ghost buffer transitions to MFU but the L2ARC
+ * metadata (log entry) cannot currently be updated. Having
+ * the ARC state in the L2 header solves the problem of a
+ * possibly absent L1 header (apparent in buffers restored
+ * from persistent L2ARC).
+ */
+ switch (hdr->b_l2hdr.b_arcs_state) {
+ case ARC_STATE_MRU_GHOST:
+ case ARC_STATE_MRU:
+ ARCSTAT_INCR(arcstat_l2_mru_asize, asize_s);
+ break;
+ case ARC_STATE_MFU_GHOST:
+ case ARC_STATE_MFU:
+ ARCSTAT_INCR(arcstat_l2_mfu_asize, asize_s);
+ break;
+ default:
+ break;
+ }
+ }
+
+ if (state_only)
+ return;
+
+ ARCSTAT_INCR(arcstat_l2_psize, psize_s);
+ ARCSTAT_INCR(arcstat_l2_lsize, lsize_s);
+
+ switch (type) {
+ case ARC_BUFC_DATA:
+ ARCSTAT_INCR(arcstat_l2_bufc_data_asize, asize_s);
+ break;
+ case ARC_BUFC_METADATA:
+ ARCSTAT_INCR(arcstat_l2_bufc_metadata_asize, asize_s);
+ break;
+ default:
+ break;
+ }
+}
+
+static void
+arc_hdr_l2hdr_destroy(arc_buf_hdr_t *hdr)
+{
+ l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr;
+ l2arc_dev_t *dev = l2hdr->b_dev;
+ uint64_t psize = HDR_GET_PSIZE(hdr);
+ uint64_t asize = vdev_psize_to_asize(dev->l2ad_vdev, psize);
+
+ ASSERT(MUTEX_HELD(&dev->l2ad_mtx));
+ ASSERT(HDR_HAS_L2HDR(hdr));
+
+ list_remove(&dev->l2ad_buflist, hdr);
+
+ l2arc_hdr_arcstats_decrement(hdr);
+ vdev_space_update(dev->l2ad_vdev, -asize, 0, 0);
+
+ (void) zfs_refcount_remove_many(&dev->l2ad_alloc, arc_hdr_size(hdr),
+ hdr);
+ arc_hdr_clear_flags(hdr, ARC_FLAG_HAS_L2HDR);
+}
+
+static void
+arc_hdr_destroy(arc_buf_hdr_t *hdr)
+{
+ if (HDR_HAS_L1HDR(hdr)) {
+ ASSERT(hdr->b_l1hdr.b_buf == NULL ||
+ hdr->b_l1hdr.b_bufcnt > 0);
+ ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
+ ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
+ }
+ ASSERT(!HDR_IO_IN_PROGRESS(hdr));
+ ASSERT(!HDR_IN_HASH_TABLE(hdr));
+
+ if (HDR_HAS_L2HDR(hdr)) {
+ l2arc_dev_t *dev = hdr->b_l2hdr.b_dev;
+ boolean_t buflist_held = MUTEX_HELD(&dev->l2ad_mtx);
+
+ if (!buflist_held)
+ mutex_enter(&dev->l2ad_mtx);
+
+ /*
+ * Even though we checked this conditional above, we
+ * need to check this again now that we have the
+ * l2ad_mtx. This is because we could be racing with
+ * another thread calling l2arc_evict() which might have
+ * destroyed this header's L2 portion as we were waiting
+ * to acquire the l2ad_mtx. If that happens, we don't
+ * want to re-destroy the header's L2 portion.
+ */
+ if (HDR_HAS_L2HDR(hdr))
+ arc_hdr_l2hdr_destroy(hdr);
+
+ if (!buflist_held)
+ mutex_exit(&dev->l2ad_mtx);
+ }
+
+ /*
+	 * The header's identity can only be safely discarded once it is no
+	 * longer discoverable. This requires removing it from the hash table
+	 * and the l2arc header list. After this point the hash lock can no
+	 * longer be used to protect the header.
+ */
+ if (!HDR_EMPTY(hdr))
+ buf_discard_identity(hdr);
+
+ if (HDR_HAS_L1HDR(hdr)) {
+ arc_cksum_free(hdr);
+
+ while (hdr->b_l1hdr.b_buf != NULL)
+ arc_buf_destroy_impl(hdr->b_l1hdr.b_buf);
+
+ if (hdr->b_l1hdr.b_pabd != NULL)
+ arc_hdr_free_abd(hdr, B_FALSE);
+
+ if (HDR_HAS_RABD(hdr))
+ arc_hdr_free_abd(hdr, B_TRUE);
+ }
+
+ ASSERT3P(hdr->b_hash_next, ==, NULL);
+ if (HDR_HAS_L1HDR(hdr)) {
+ ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
+ ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);
+
+ if (!HDR_PROTECTED(hdr)) {
+ kmem_cache_free(hdr_full_cache, hdr);
+ } else {
+ kmem_cache_free(hdr_full_crypt_cache, hdr);
+ }
+ } else {
+ kmem_cache_free(hdr_l2only_cache, hdr);
+ }
+}
+
+void
+arc_buf_destroy(arc_buf_t *buf, void* tag)
+{
+ arc_buf_hdr_t *hdr = buf->b_hdr;
+
+ if (hdr->b_l1hdr.b_state == arc_anon) {
+ ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1);
+ ASSERT(!HDR_IO_IN_PROGRESS(hdr));
+ VERIFY0(remove_reference(hdr, NULL, tag));
+ arc_hdr_destroy(hdr);
+ return;
+ }
+
+ kmutex_t *hash_lock = HDR_LOCK(hdr);
+ mutex_enter(hash_lock);
+
+ ASSERT3P(hdr, ==, buf->b_hdr);
+ ASSERT(hdr->b_l1hdr.b_bufcnt > 0);
+ ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
+ ASSERT3P(hdr->b_l1hdr.b_state, !=, arc_anon);
+ ASSERT3P(buf->b_data, !=, NULL);
+
+ (void) remove_reference(hdr, hash_lock, tag);
+ arc_buf_destroy_impl(buf);
+ mutex_exit(hash_lock);
+}
+
+/*
+ * Evict the arc_buf_hdr that is provided as a parameter. The resultant
+ * state of the header is dependent on its state prior to entering this
+ * function. The following transitions are possible:
+ *
+ * - arc_mru -> arc_mru_ghost
+ * - arc_mfu -> arc_mfu_ghost
+ * - arc_mru_ghost -> arc_l2c_only
+ * - arc_mru_ghost -> deleted
+ * - arc_mfu_ghost -> arc_l2c_only
+ * - arc_mfu_ghost -> deleted
+ */
+static int64_t
+arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
+{
+ arc_state_t *evicted_state, *state;
+ int64_t bytes_evicted = 0;
+ int min_lifetime = HDR_PRESCIENT_PREFETCH(hdr) ?
+ arc_min_prescient_prefetch_ms : arc_min_prefetch_ms;
+
+ ASSERT(MUTEX_HELD(hash_lock));
+ ASSERT(HDR_HAS_L1HDR(hdr));
+
+ state = hdr->b_l1hdr.b_state;
+ if (GHOST_STATE(state)) {
+ ASSERT(!HDR_IO_IN_PROGRESS(hdr));
+ ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
+
+ /*
+ * l2arc_write_buffers() relies on a header's L1 portion
+		 * (i.e. its b_pabd field) during its write phase.
+ * Thus, we cannot push a header onto the arc_l2c_only
+ * state (removing its L1 piece) until the header is
+ * done being written to the l2arc.
+ */
+ if (HDR_HAS_L2HDR(hdr) && HDR_L2_WRITING(hdr)) {
+ ARCSTAT_BUMP(arcstat_evict_l2_skip);
+ return (bytes_evicted);
+ }
+
+ ARCSTAT_BUMP(arcstat_deleted);
+ bytes_evicted += HDR_GET_LSIZE(hdr);
+
+ DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, hdr);
+
+ if (HDR_HAS_L2HDR(hdr)) {
+ ASSERT(hdr->b_l1hdr.b_pabd == NULL);
+ ASSERT(!HDR_HAS_RABD(hdr));
+ /*
+ * This buffer is cached on the 2nd Level ARC;
+ * don't destroy the header.
+ */
+ arc_change_state(arc_l2c_only, hdr, hash_lock);
+ /*
+ * dropping from L1+L2 cached to L2-only,
+ * realloc to remove the L1 header.
+ */
+ hdr = arc_hdr_realloc(hdr, hdr_full_cache,
+ hdr_l2only_cache);
+ } else {
+ arc_change_state(arc_anon, hdr, hash_lock);
+ arc_hdr_destroy(hdr);
+ }
+ return (bytes_evicted);
+ }
+
+ ASSERT(state == arc_mru || state == arc_mfu);
+ evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
+
+ /* prefetch buffers have a minimum lifespan */
+ if (HDR_IO_IN_PROGRESS(hdr) ||
+ ((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) &&
+ ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access <
+ MSEC_TO_TICK(min_lifetime))) {
+ ARCSTAT_BUMP(arcstat_evict_skip);
+ return (bytes_evicted);
+ }
+
+ ASSERT0(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt));
+ while (hdr->b_l1hdr.b_buf) {
+ arc_buf_t *buf = hdr->b_l1hdr.b_buf;
+ if (!mutex_tryenter(&buf->b_evict_lock)) {
+ ARCSTAT_BUMP(arcstat_mutex_miss);
+ break;
+ }
+ if (buf->b_data != NULL)
+ bytes_evicted += HDR_GET_LSIZE(hdr);
+ mutex_exit(&buf->b_evict_lock);
+ arc_buf_destroy_impl(buf);
+ }
+
+ if (HDR_HAS_L2HDR(hdr)) {
+ ARCSTAT_INCR(arcstat_evict_l2_cached, HDR_GET_LSIZE(hdr));
+ } else {
+ if (l2arc_write_eligible(hdr->b_spa, hdr)) {
+ ARCSTAT_INCR(arcstat_evict_l2_eligible,
+ HDR_GET_LSIZE(hdr));
+
+ switch (state->arcs_state) {
+ case ARC_STATE_MRU:
+ ARCSTAT_INCR(
+ arcstat_evict_l2_eligible_mru,
+ HDR_GET_LSIZE(hdr));
+ break;
+ case ARC_STATE_MFU:
+ ARCSTAT_INCR(
+ arcstat_evict_l2_eligible_mfu,
+ HDR_GET_LSIZE(hdr));
+ break;
+ default:
+ break;
+ }
+ } else {
+ ARCSTAT_INCR(arcstat_evict_l2_ineligible,
+ HDR_GET_LSIZE(hdr));
+ }
+ }
+
+ if (hdr->b_l1hdr.b_bufcnt == 0) {
+ arc_cksum_free(hdr);
+
+ bytes_evicted += arc_hdr_size(hdr);
+
+ /*
+ * If this hdr is being evicted and has a compressed
+ * buffer then we discard it here before we change states.
+ * This ensures that the accounting is updated correctly
+ * in arc_free_data_impl().
+ */
+ if (hdr->b_l1hdr.b_pabd != NULL)
+ arc_hdr_free_abd(hdr, B_FALSE);
+
+ if (HDR_HAS_RABD(hdr))
+ arc_hdr_free_abd(hdr, B_TRUE);
+
+ arc_change_state(evicted_state, hdr, hash_lock);
+ ASSERT(HDR_IN_HASH_TABLE(hdr));
+ arc_hdr_set_flags(hdr, ARC_FLAG_IN_HASH_TABLE);
+ DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, hdr);
+ }
+
+ return (bytes_evicted);
+}
+
+static void
+arc_set_need_free(void)
+{
+ ASSERT(MUTEX_HELD(&arc_evict_lock));
+ int64_t remaining = arc_free_memory() - arc_sys_free / 2;
+ arc_evict_waiter_t *aw = list_tail(&arc_evict_waiters);
+ if (aw == NULL) {
+ arc_need_free = MAX(-remaining, 0);
+ } else {
+ arc_need_free =
+ MAX(-remaining, (int64_t)(aw->aew_count - arc_evict_count));
+ }
+}
+
+static uint64_t
+arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,
+ uint64_t spa, int64_t bytes)
+{
+ multilist_sublist_t *mls;
+ uint64_t bytes_evicted = 0;
+ arc_buf_hdr_t *hdr;
+ kmutex_t *hash_lock;
+ int evict_count = 0;
+
+ ASSERT3P(marker, !=, NULL);
+ IMPLY(bytes < 0, bytes == ARC_EVICT_ALL);
+
+ mls = multilist_sublist_lock(ml, idx);
+
+ for (hdr = multilist_sublist_prev(mls, marker); hdr != NULL;
+ hdr = multilist_sublist_prev(mls, marker)) {
+ if ((bytes != ARC_EVICT_ALL && bytes_evicted >= bytes) ||
+ (evict_count >= zfs_arc_evict_batch_limit))
+ break;
+
+ /*
+ * To keep our iteration location, move the marker
+ * forward. Since we're not holding hdr's hash lock, we
+ * must be very careful and not remove 'hdr' from the
+ * sublist. Otherwise, other consumers might mistake the
+ * 'hdr' as not being on a sublist when they call the
+ * multilist_link_active() function (they all rely on
+ * the hash lock protecting concurrent insertions and
+ * removals). multilist_sublist_move_forward() was
+ * specifically implemented to ensure this is the case
+ * (only 'marker' will be removed and re-inserted).
+ */
+ multilist_sublist_move_forward(mls, marker);
+
+ /*
+ * The only case where the b_spa field should ever be
+ * zero, is the marker headers inserted by
+ * arc_evict_state(). It's possible for multiple threads
+ * to be calling arc_evict_state() concurrently (e.g.
+ * dsl_pool_close() and zio_inject_fault()), so we must
+ * skip any markers we see from these other threads.
+ */
+ if (hdr->b_spa == 0)
+ continue;
+
+ /* we're only interested in evicting buffers of a certain spa */
+ if (spa != 0 && hdr->b_spa != spa) {
+ ARCSTAT_BUMP(arcstat_evict_skip);
+ continue;
+ }
+
+ hash_lock = HDR_LOCK(hdr);
+
+ /*
+ * We aren't calling this function from any code path
+ * that would already be holding a hash lock, so we're
+ * asserting on this assumption to be defensive in case
+ * this ever changes. Without this check, it would be
+ * possible to incorrectly increment arcstat_mutex_miss
+ * below (e.g. if the code changed such that we called
+ * this function with a hash lock held).
+ */
+ ASSERT(!MUTEX_HELD(hash_lock));
+
+ if (mutex_tryenter(hash_lock)) {
+ uint64_t evicted = arc_evict_hdr(hdr, hash_lock);
+ mutex_exit(hash_lock);
+
+ bytes_evicted += evicted;
+
+ /*
+ * If evicted is zero, arc_evict_hdr() must have
+ * decided to skip this header, don't increment
+ * evict_count in this case.
+ */
+ if (evicted != 0)
+ evict_count++;
+
+ } else {
+ ARCSTAT_BUMP(arcstat_mutex_miss);
+ }
+ }
+
+ multilist_sublist_unlock(mls);
+
+ /*
+ * Increment the count of evicted bytes, and wake up any threads that
+ * are waiting for the count to reach this value. Since the list is
+ * ordered by ascending aew_count, we pop off the beginning of the
+ * list until we reach the end, or a waiter that's past the current
+ * "count". Doing this outside the loop reduces the number of times
+ * we need to acquire the global arc_evict_lock.
+ *
+ * Only wake when there's sufficient free memory in the system
+ * (specifically, arc_sys_free/2, which by default is a bit more than
+ * 1/64th of RAM). See the comments in arc_wait_for_eviction().
+ */
+ mutex_enter(&arc_evict_lock);
+ arc_evict_count += bytes_evicted;
+
+ if (arc_free_memory() > arc_sys_free / 2) {
+ arc_evict_waiter_t *aw;
+ while ((aw = list_head(&arc_evict_waiters)) != NULL &&
+ aw->aew_count <= arc_evict_count) {
+ list_remove(&arc_evict_waiters, aw);
+ cv_broadcast(&aw->aew_cv);
+ }
+ }
+ arc_set_need_free();
+ mutex_exit(&arc_evict_lock);
+
+ /*
+ * If the ARC size is reduced from arc_c_max to arc_c_min (especially
+ * if the average cached block is small), eviction can be on-CPU for
+ * many seconds. To ensure that other threads that may be bound to
+ * this CPU are able to make progress, make a voluntary preemption
+ * call here.
+ */
+ cond_resched();
+
+ return (bytes_evicted);
+}
+
+/*
+ * Evict buffers from the given arc state, until we've removed the
+ * specified number of bytes. Move the removed buffers to the
+ * appropriate evict state.
+ *
+ * This function makes a "best effort". It skips over any buffers
+ * it can't get a hash_lock on, and so, may not catch all candidates.
+ * It may also return without evicting as much space as requested.
+ *
+ * If bytes is specified using the special value ARC_EVICT_ALL, this
+ * will evict all available (i.e. unlocked and evictable) buffers from
+ * the given arc state; which is used by arc_flush().
+ */
+static uint64_t
+arc_evict_state(arc_state_t *state, uint64_t spa, int64_t bytes,
+ arc_buf_contents_t type)
+{
+ uint64_t total_evicted = 0;
+ multilist_t *ml = state->arcs_list[type];
+ int num_sublists;
+ arc_buf_hdr_t **markers;
+
+ IMPLY(bytes < 0, bytes == ARC_EVICT_ALL);
+
+ num_sublists = multilist_get_num_sublists(ml);
+
+ /*
+ * If we've tried to evict from each sublist, made some
+ * progress, but still have not hit the target number of bytes
+ * to evict, we want to keep trying. The markers allow us to
+ * pick up where we left off for each individual sublist, rather
+ * than starting from the tail each time.
+ */
+ markers = kmem_zalloc(sizeof (*markers) * num_sublists, KM_SLEEP);
+ for (int i = 0; i < num_sublists; i++) {
+ multilist_sublist_t *mls;
+
+ markers[i] = kmem_cache_alloc(hdr_full_cache, KM_SLEEP);
+
+ /*
+ * A b_spa of 0 is used to indicate that this header is
+ * a marker. This fact is used in arc_evict_type() and
+ * arc_evict_state_impl().
+ */
+ markers[i]->b_spa = 0;
+
+ mls = multilist_sublist_lock(ml, i);
+ multilist_sublist_insert_tail(mls, markers[i]);
+ multilist_sublist_unlock(mls);
+ }
+
+ /*
+ * While we haven't hit our target number of bytes to evict, or
+ * we're evicting all available buffers.
+ */
+ while (total_evicted < bytes || bytes == ARC_EVICT_ALL) {
+ int sublist_idx = multilist_get_random_index(ml);
+ uint64_t scan_evicted = 0;
+
+ /*
+ * Try to reduce pinned dnodes with a floor of arc_dnode_limit.
+ * Request that 10% of the LRUs be scanned by the superblock
+ * shrinker.
+ */
+ if (type == ARC_BUFC_DATA && aggsum_compare(&astat_dnode_size,
+ arc_dnode_size_limit) > 0) {
+ arc_prune_async((aggsum_upper_bound(&astat_dnode_size) -
+ arc_dnode_size_limit) / sizeof (dnode_t) /
+ zfs_arc_dnode_reduce_percent);
+ }
+
+ /*
+ * Start eviction using a randomly selected sublist,
+ * this is to try and evenly balance eviction across all
+ * sublists. Always starting at the same sublist
+ * (e.g. index 0) would cause evictions to favor certain
+ * sublists over others.
+ */
+ for (int i = 0; i < num_sublists; i++) {
+ uint64_t bytes_remaining;
+ uint64_t bytes_evicted;
+
+ if (bytes == ARC_EVICT_ALL)
+ bytes_remaining = ARC_EVICT_ALL;
+ else if (total_evicted < bytes)
+ bytes_remaining = bytes - total_evicted;
+ else
+ break;
+
+ bytes_evicted = arc_evict_state_impl(ml, sublist_idx,
+ markers[sublist_idx], spa, bytes_remaining);
+
+ scan_evicted += bytes_evicted;
+ total_evicted += bytes_evicted;
+
+ /* we've reached the end, wrap to the beginning */
+ if (++sublist_idx >= num_sublists)
+ sublist_idx = 0;
+ }
+
+ /*
+ * If we didn't evict anything during this scan, we have
+ * no reason to believe we'll evict more during another
+ * scan, so break the loop.
+ */
+ if (scan_evicted == 0) {
+ /* This isn't possible, let's make that obvious */
+ ASSERT3S(bytes, !=, 0);
+
+ /*
+ * When bytes is ARC_EVICT_ALL, the only way to
+ * break the loop is when scan_evicted is zero.
+ * In that case, we actually have evicted enough,
+ * so we don't want to increment the kstat.
+ */
+ if (bytes != ARC_EVICT_ALL) {
+ ASSERT3S(total_evicted, <, bytes);
+ ARCSTAT_BUMP(arcstat_evict_not_enough);
+ }
+
+ break;
+ }
+ }
+
+ for (int i = 0; i < num_sublists; i++) {
+ multilist_sublist_t *mls = multilist_sublist_lock(ml, i);
+ multilist_sublist_remove(mls, markers[i]);
+ multilist_sublist_unlock(mls);
+
+ kmem_cache_free(hdr_full_cache, markers[i]);
+ }
+ kmem_free(markers, sizeof (*markers) * num_sublists);
+
+ return (total_evicted);
+}
+
+/*
+ * Flush all "evictable" data of the given type from the arc state
+ * specified. This will not evict any "active" buffers (i.e. referenced).
+ *
+ * When 'retry' is set to B_FALSE, the function will make a single pass
+ * over the state and evict any buffers that it can. Since it doesn't
+ * continually retry the eviction, it might end up leaving some buffers
+ * in the ARC due to lock misses.
+ *
+ * When 'retry' is set to B_TRUE, the function will continually retry the
+ * eviction until *all* evictable buffers have been removed from the
+ * state. As a result, if concurrent insertions into the state are
+ * allowed (e.g. if the ARC isn't shutting down), this function might
+ * wind up in an infinite loop, continually trying to evict buffers.
+ */
+static uint64_t
+arc_flush_state(arc_state_t *state, uint64_t spa, arc_buf_contents_t type,
+ boolean_t retry)
+{
+ uint64_t evicted = 0;
+
+ while (zfs_refcount_count(&state->arcs_esize[type]) != 0) {
+ evicted += arc_evict_state(state, spa, ARC_EVICT_ALL, type);
+
+ if (!retry)
+ break;
+ }
+
+ return (evicted);
+}
+
+/*
+ * Evict the specified number of bytes from the state specified,
+ * restricting eviction to the spa and type given. This function
+ * prevents us from trying to evict more from a state's list than
+ * is "evictable", and to skip evicting altogether when passed a
+ * negative value for "bytes". In contrast, arc_evict_state() will
+ * evict everything it can, when passed a negative value for "bytes".
+ */
+static uint64_t
+arc_evict_impl(arc_state_t *state, uint64_t spa, int64_t bytes,
+ arc_buf_contents_t type)
+{
+ int64_t delta;
+
+ if (bytes > 0 && zfs_refcount_count(&state->arcs_esize[type]) > 0) {
+ delta = MIN(zfs_refcount_count(&state->arcs_esize[type]),
+ bytes);
+ return (arc_evict_state(state, spa, delta, type));
+ }
+
+ return (0);
+}
+
+/*
+ * The goal of this function is to evict enough meta data buffers from the
+ * ARC in order to enforce the arc_meta_limit. Achieving this is slightly
+ * more complicated than it appears because it is common for data buffers
+ * to have holds on meta data buffers. In addition, dnode meta data buffers
+ * will be held by the dnodes in the block preventing them from being freed.
+ * This means we can't simply traverse the ARC and expect to always find
+ * enough unheld meta data buffer to release.
+ *
+ * Therefore, this function has been updated to make alternating passes
+ * over the ARC releasing data buffers and then newly unheld meta data
+ * buffers. This ensures forward progress is maintained and meta_used
+ * will decrease. Normally this is sufficient, but if required the ARC
+ * will call the registered prune callbacks causing dentry and inodes to
+ * be dropped from the VFS cache. This will make dnode meta data buffers
+ * available for reclaim.
+ */
+static uint64_t
+arc_evict_meta_balanced(uint64_t meta_used)
+{
+ int64_t delta, prune = 0, adjustmnt;
+ uint64_t total_evicted = 0;
+ arc_buf_contents_t type = ARC_BUFC_DATA;
+ int restarts = MAX(zfs_arc_meta_adjust_restarts, 0);
+
+restart:
+ /*
+	 * This slightly differs from the way we evict from the mru in
+ * arc_evict because we don't have a "target" value (i.e. no
+ * "meta" arc_p). As a result, I think we can completely
+ * cannibalize the metadata in the MRU before we evict the
+ * metadata from the MFU. I think we probably need to implement a
+ * "metadata arc_p" value to do this properly.
+ */
+ adjustmnt = meta_used - arc_meta_limit;
+
+ if (adjustmnt > 0 &&
+ zfs_refcount_count(&arc_mru->arcs_esize[type]) > 0) {
+ delta = MIN(zfs_refcount_count(&arc_mru->arcs_esize[type]),
+ adjustmnt);
+ total_evicted += arc_evict_impl(arc_mru, 0, delta, type);
+ adjustmnt -= delta;
+ }
+
+ /*
+ * We can't afford to recalculate adjustmnt here. If we do,
+ * new metadata buffers can sneak into the MRU or ANON lists,
+	 * thus penalizing the MFU metadata. Although the fudge factor is
+ * small, it has been empirically shown to be significant for
+ * certain workloads (e.g. creating many empty directories). As
+ * such, we use the original calculation for adjustmnt, and
+ * simply decrement the amount of data evicted from the MRU.
+ */
+
+ if (adjustmnt > 0 &&
+ zfs_refcount_count(&arc_mfu->arcs_esize[type]) > 0) {
+ delta = MIN(zfs_refcount_count(&arc_mfu->arcs_esize[type]),
+ adjustmnt);
+ total_evicted += arc_evict_impl(arc_mfu, 0, delta, type);
+ }
+
+ adjustmnt = meta_used - arc_meta_limit;
+
+ if (adjustmnt > 0 &&
+ zfs_refcount_count(&arc_mru_ghost->arcs_esize[type]) > 0) {
+ delta = MIN(adjustmnt,
+ zfs_refcount_count(&arc_mru_ghost->arcs_esize[type]));
+ total_evicted += arc_evict_impl(arc_mru_ghost, 0, delta, type);
+ adjustmnt -= delta;
+ }
+
+ if (adjustmnt > 0 &&
+ zfs_refcount_count(&arc_mfu_ghost->arcs_esize[type]) > 0) {
+ delta = MIN(adjustmnt,
+ zfs_refcount_count(&arc_mfu_ghost->arcs_esize[type]));
+ total_evicted += arc_evict_impl(arc_mfu_ghost, 0, delta, type);
+ }
+
+ /*
+ * If after attempting to make the requested adjustment to the ARC
+ * the meta limit is still being exceeded then request that the
+ * higher layers drop some cached objects which have holds on ARC
+ * meta buffers. Requests to the upper layers will be made with
+ * increasingly large scan sizes until the ARC is below the limit.
+ */
+ if (meta_used > arc_meta_limit) {
+ if (type == ARC_BUFC_DATA) {
+ type = ARC_BUFC_METADATA;
+ } else {
+ type = ARC_BUFC_DATA;
+
+ if (zfs_arc_meta_prune) {
+ prune += zfs_arc_meta_prune;
+ arc_prune_async(prune);
+ }
+ }
+
+ if (restarts > 0) {
+ restarts--;
+ goto restart;
+ }
+ }
+ return (total_evicted);
+}
+
+/*
+ * Evict metadata buffers from the cache, such that arc_meta_used is
+ * capped by the arc_meta_limit tunable.
+ */
+static uint64_t
+arc_evict_meta_only(uint64_t meta_used)
+{
+ uint64_t total_evicted = 0;
+ int64_t target;
+
+ /*
+ * If we're over the meta limit, we want to evict enough
+ * metadata to get back under the meta limit. We don't want to
+ * evict so much that we drop the MRU below arc_p, though. If
+ * we're over the meta limit more than we're over arc_p, we
+ * evict some from the MRU here, and some from the MFU below.
+ */
+ target = MIN((int64_t)(meta_used - arc_meta_limit),
+ (int64_t)(zfs_refcount_count(&arc_anon->arcs_size) +
+ zfs_refcount_count(&arc_mru->arcs_size) - arc_p));
+
+ total_evicted += arc_evict_impl(arc_mru, 0, target, ARC_BUFC_METADATA);
+
+ /*
+ * Similar to the above, we want to evict enough bytes to get us
+ * below the meta limit, but not so much as to drop us below the
+ * space allotted to the MFU (which is defined as arc_c - arc_p).
+ */
+ target = MIN((int64_t)(meta_used - arc_meta_limit),
+ (int64_t)(zfs_refcount_count(&arc_mfu->arcs_size) -
+ (arc_c - arc_p)));
+
+ total_evicted += arc_evict_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
+
+ return (total_evicted);
+}
+
+static uint64_t
+arc_evict_meta(uint64_t meta_used)
+{
+ if (zfs_arc_meta_strategy == ARC_STRATEGY_META_ONLY)
+ return (arc_evict_meta_only(meta_used));
+ else
+ return (arc_evict_meta_balanced(meta_used));
+}
+
+/*
+ * Return the type of the oldest buffer in the given arc state
+ *
+ * This function will select a random sublist of type ARC_BUFC_DATA and
+ * a random sublist of type ARC_BUFC_METADATA. The tail of each sublist
+ * is compared, and the type which contains the "older" buffer will be
+ * returned.
+ */
+static arc_buf_contents_t
+arc_evict_type(arc_state_t *state)
+{
+ multilist_t *data_ml = state->arcs_list[ARC_BUFC_DATA];
+ multilist_t *meta_ml = state->arcs_list[ARC_BUFC_METADATA];
+ int data_idx = multilist_get_random_index(data_ml);
+ int meta_idx = multilist_get_random_index(meta_ml);
+ multilist_sublist_t *data_mls;
+ multilist_sublist_t *meta_mls;
+ arc_buf_contents_t type;
+ arc_buf_hdr_t *data_hdr;
+ arc_buf_hdr_t *meta_hdr;
+
+ /*
+ * We keep the sublist lock until we're finished, to prevent
+ * the headers from being destroyed via arc_evict_state().
+ */
+ data_mls = multilist_sublist_lock(data_ml, data_idx);
+ meta_mls = multilist_sublist_lock(meta_ml, meta_idx);
+
+ /*
+ * These two loops are to ensure we skip any markers that
+ * might be at the tail of the lists due to arc_evict_state().
+ */
+
+ for (data_hdr = multilist_sublist_tail(data_mls); data_hdr != NULL;
+ data_hdr = multilist_sublist_prev(data_mls, data_hdr)) {
+ if (data_hdr->b_spa != 0)
+ break;
+ }
+
+ for (meta_hdr = multilist_sublist_tail(meta_mls); meta_hdr != NULL;
+ meta_hdr = multilist_sublist_prev(meta_mls, meta_hdr)) {
+ if (meta_hdr->b_spa != 0)
+ break;
+ }
+
+ if (data_hdr == NULL && meta_hdr == NULL) {
+ type = ARC_BUFC_DATA;
+ } else if (data_hdr == NULL) {
+ ASSERT3P(meta_hdr, !=, NULL);
+ type = ARC_BUFC_METADATA;
+ } else if (meta_hdr == NULL) {
+ ASSERT3P(data_hdr, !=, NULL);
+ type = ARC_BUFC_DATA;
+ } else {
+ ASSERT3P(data_hdr, !=, NULL);
+ ASSERT3P(meta_hdr, !=, NULL);
+
+ /* The headers can't be on the sublist without an L1 header */
+ ASSERT(HDR_HAS_L1HDR(data_hdr));
+ ASSERT(HDR_HAS_L1HDR(meta_hdr));
+
+ if (data_hdr->b_l1hdr.b_arc_access <
+ meta_hdr->b_l1hdr.b_arc_access) {
+ type = ARC_BUFC_DATA;
+ } else {
+ type = ARC_BUFC_METADATA;
+ }
+ }
+
+ multilist_sublist_unlock(meta_mls);
+ multilist_sublist_unlock(data_mls);
+
+ return (type);
+}
+
+/*
+ * Evict buffers from the cache, such that arc_size is capped by arc_c.
+ */
+static uint64_t
+arc_evict(void)
+{
+ uint64_t total_evicted = 0;
+ uint64_t bytes;
+ int64_t target;
+ uint64_t asize = aggsum_value(&arc_size);
+ uint64_t ameta = aggsum_value(&arc_meta_used);
+
+ /*
+ * If we're over arc_meta_limit, we want to correct that before
+ * potentially evicting data buffers below.
+ */
+ total_evicted += arc_evict_meta(ameta);
+
+ /*
+ * Adjust MRU size
+ *
+ * If we're over the target cache size, we want to evict enough
+ * from the list to get back to our target size. We don't want
+ * to evict too much from the MRU, such that it drops below
+ * arc_p. So, if we're over our target cache size more than
+ * the MRU is over arc_p, we'll evict enough to get back to
+ * arc_p here, and then evict more from the MFU below.
+ */
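+	/*
+	 * Illustration with hypothetical numbers: if asize = 10 GiB,
+	 * arc_c = 8 GiB, anon + mru + ameta = 5 GiB and arc_p = 4 GiB,
+	 * then target = MIN(2 GiB, 1 GiB) = 1 GiB, so up to 1 GiB is
+	 * evicted from the MRU here and the remaining excess is
+	 * handled by the MFU pass below.
+	 */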
+ target = MIN((int64_t)(asize - arc_c),
+ (int64_t)(zfs_refcount_count(&arc_anon->arcs_size) +
+ zfs_refcount_count(&arc_mru->arcs_size) + ameta - arc_p));
+
+ /*
+ * If we're below arc_meta_min, always prefer to evict data.
+ * Otherwise, try to satisfy the requested number of bytes to
+ * evict from the type which contains older buffers; in an
+ * effort to keep newer buffers in the cache regardless of their
+ * type. If we cannot satisfy the number of bytes from this
+ * type, spill over into the next type.
+ */
+ if (arc_evict_type(arc_mru) == ARC_BUFC_METADATA &&
+ ameta > arc_meta_min) {
+ bytes = arc_evict_impl(arc_mru, 0, target, ARC_BUFC_METADATA);
+ total_evicted += bytes;
+
+ /*
+ * If we couldn't evict our target number of bytes from
+ * metadata, we try to get the rest from data.
+ */
+ target -= bytes;
+
+ total_evicted +=
+ arc_evict_impl(arc_mru, 0, target, ARC_BUFC_DATA);
+ } else {
+ bytes = arc_evict_impl(arc_mru, 0, target, ARC_BUFC_DATA);
+ total_evicted += bytes;
+
+ /*
+ * If we couldn't evict our target number of bytes from
+ * data, we try to get the rest from metadata.
+ */
+ target -= bytes;
+
+ total_evicted +=
+ arc_evict_impl(arc_mru, 0, target, ARC_BUFC_METADATA);
+ }
+
+ /*
+ * Re-sum ARC stats after the first round of evictions.
+ */
+ asize = aggsum_value(&arc_size);
+ ameta = aggsum_value(&arc_meta_used);
+
+ /*
+ * Adjust MFU size
+ *
+ * Now that we've tried to evict enough from the MRU to get its
+ * size back to arc_p, if we're still above the target cache
+ * size, we evict the rest from the MFU.
+ */
+ target = asize - arc_c;
+
+ if (arc_evict_type(arc_mfu) == ARC_BUFC_METADATA &&
+ ameta > arc_meta_min) {
+ bytes = arc_evict_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
+ total_evicted += bytes;
+
+ /*
+ * If we couldn't evict our target number of bytes from
+ * metadata, we try to get the rest from data.
+ */
+ target -= bytes;
+
+ total_evicted +=
+ arc_evict_impl(arc_mfu, 0, target, ARC_BUFC_DATA);
+ } else {
+ bytes = arc_evict_impl(arc_mfu, 0, target, ARC_BUFC_DATA);
+ total_evicted += bytes;
+
+ /*
+ * If we couldn't evict our target number of bytes from
+		 * data, we try to get the rest from metadata.
+ */
+ target -= bytes;
+
+ total_evicted +=
+ arc_evict_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
+ }
+
+ /*
+ * Adjust ghost lists
+ *
+ * In addition to the above, the ARC also defines target values
+ * for the ghost lists. The sum of the mru list and mru ghost
+ * list should never exceed the target size of the cache, and
+ * the sum of the mru list, mfu list, mru ghost list, and mfu
+ * ghost list should never exceed twice the target size of the
+ * cache. The following logic enforces these limits on the ghost
+ * caches, and evicts from them as needed.
+ */
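+	/*
+	 * Hypothetical example: with arc_c = 8 GiB, mru = 6 GiB and
+	 * mru_ghost = 4 GiB, the target below is 6 + 4 - 8 = 2 GiB,
+	 * so up to 2 GiB is evicted from the mru ghost list (data
+	 * first, then metadata).
+	 */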
+ target = zfs_refcount_count(&arc_mru->arcs_size) +
+ zfs_refcount_count(&arc_mru_ghost->arcs_size) - arc_c;
+
+ bytes = arc_evict_impl(arc_mru_ghost, 0, target, ARC_BUFC_DATA);
+ total_evicted += bytes;
+
+ target -= bytes;
+
+ total_evicted +=
+ arc_evict_impl(arc_mru_ghost, 0, target, ARC_BUFC_METADATA);
+
+ /*
+ * We assume the sum of the mru list and mfu list is less than
+ * or equal to arc_c (we enforced this above), which means we
+ * can use the simpler of the two equations below:
+ *
+ * mru + mfu + mru ghost + mfu ghost <= 2 * arc_c
+ * mru ghost + mfu ghost <= arc_c
+ */
+ target = zfs_refcount_count(&arc_mru_ghost->arcs_size) +
+ zfs_refcount_count(&arc_mfu_ghost->arcs_size) - arc_c;
+
+ bytes = arc_evict_impl(arc_mfu_ghost, 0, target, ARC_BUFC_DATA);
+ total_evicted += bytes;
+
+ target -= bytes;
+
+ total_evicted +=
+ arc_evict_impl(arc_mfu_ghost, 0, target, ARC_BUFC_METADATA);
+
+ return (total_evicted);
+}
+
+void
+arc_flush(spa_t *spa, boolean_t retry)
+{
+ uint64_t guid = 0;
+
+ /*
+ * If retry is B_TRUE, a spa must not be specified since we have
+ * no good way to determine if all of a spa's buffers have been
+ * evicted from an arc state.
+ */
+ ASSERT(!retry || spa == 0);
+
+ if (spa != NULL)
+ guid = spa_load_guid(spa);
+
+ (void) arc_flush_state(arc_mru, guid, ARC_BUFC_DATA, retry);
+ (void) arc_flush_state(arc_mru, guid, ARC_BUFC_METADATA, retry);
+
+ (void) arc_flush_state(arc_mfu, guid, ARC_BUFC_DATA, retry);
+ (void) arc_flush_state(arc_mfu, guid, ARC_BUFC_METADATA, retry);
+
+ (void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_DATA, retry);
+ (void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_METADATA, retry);
+
+ (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_DATA, retry);
+ (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_METADATA, retry);
+}
+
+void
+arc_reduce_target_size(int64_t to_free)
+{
+ uint64_t asize = aggsum_value(&arc_size);
+
+ /*
+ * All callers want the ARC to actually evict (at least) this much
+ * memory. Therefore we reduce from the lower of the current size and
+ * the target size. This way, even if arc_c is much higher than
+	 * arc_size (as can be the case after many calls to arc_freed()), we will
+ * immediately have arc_c < arc_size and therefore the arc_evict_zthr
+ * will evict.
+ */
+ uint64_t c = MIN(arc_c, asize);
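+	/*
+	 * For example (hypothetical numbers): with arc_c = 8 GiB,
+	 * arc_size = 6 GiB and to_free = 1 GiB, we reduce from
+	 * c = 6 GiB and the new target becomes 5 GiB, provided that is
+	 * still above arc_c_min; otherwise arc_c is clamped to
+	 * arc_c_min.
+	 */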
+
+ if (c > to_free && c - to_free > arc_c_min) {
+ arc_c = c - to_free;
+ atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift));
+ if (arc_p > arc_c)
+ arc_p = (arc_c >> 1);
+ ASSERT(arc_c >= arc_c_min);
+ ASSERT((int64_t)arc_p >= 0);
+ } else {
+ arc_c = arc_c_min;
+ }
+
+ if (asize > arc_c) {
+ /* See comment in arc_evict_cb_check() on why lock+flag */
+ mutex_enter(&arc_evict_lock);
+ arc_evict_needed = B_TRUE;
+ mutex_exit(&arc_evict_lock);
+ zthr_wakeup(arc_evict_zthr);
+ }
+}
+
+/*
+ * Determine if the system is under memory pressure and is asking
+ * to reclaim memory. A return value of B_TRUE indicates that the system
+ * is under memory pressure and that the arc should adjust accordingly.
+ */
+boolean_t
+arc_reclaim_needed(void)
+{
+ return (arc_available_memory() < 0);
+}
+
+void
+arc_kmem_reap_soon(void)
+{
+ size_t i;
+ kmem_cache_t *prev_cache = NULL;
+ kmem_cache_t *prev_data_cache = NULL;
+ extern kmem_cache_t *zio_buf_cache[];
+ extern kmem_cache_t *zio_data_buf_cache[];
+
+#ifdef _KERNEL
+ if ((aggsum_compare(&arc_meta_used, arc_meta_limit) >= 0) &&
+ zfs_arc_meta_prune) {
+ /*
+ * We are exceeding our meta-data cache limit.
+ * Prune some entries to release holds on meta-data.
+ */
+ arc_prune_async(zfs_arc_meta_prune);
+ }
+#if defined(_ILP32)
+ /*
+ * Reclaim unused memory from all kmem caches.
+ */
+ kmem_reap();
+#endif
+#endif
+
+ for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
+#if defined(_ILP32)
+ /* reach upper limit of cache size on 32-bit */
+ if (zio_buf_cache[i] == NULL)
+ break;
+#endif
+ if (zio_buf_cache[i] != prev_cache) {
+ prev_cache = zio_buf_cache[i];
+ kmem_cache_reap_now(zio_buf_cache[i]);
+ }
+ if (zio_data_buf_cache[i] != prev_data_cache) {
+ prev_data_cache = zio_data_buf_cache[i];
+ kmem_cache_reap_now(zio_data_buf_cache[i]);
+ }
+ }
+ kmem_cache_reap_now(buf_cache);
+ kmem_cache_reap_now(hdr_full_cache);
+ kmem_cache_reap_now(hdr_l2only_cache);
+ kmem_cache_reap_now(zfs_btree_leaf_cache);
+ abd_cache_reap_now();
+}
+
+/* ARGSUSED */
+static boolean_t
+arc_evict_cb_check(void *arg, zthr_t *zthr)
+{
+#ifdef ZFS_DEBUG
+ /*
+ * This is necessary in order to keep the kstat information
+ * up to date for tools that display kstat data such as the
+ * mdb ::arc dcmd and the Linux crash utility. These tools
+ * typically do not call kstat's update function, but simply
+ * dump out stats from the most recent update. Without
+ * this call, these commands may show stale stats for the
+ * anon, mru, mru_ghost, mfu, and mfu_ghost lists. Even
+ * with this call, the data might be out of date if the
+ * evict thread hasn't been woken recently; but that should
+ * suffice. The arc_state_t structures can be queried
+ * directly if more accurate information is needed.
+ */
+ if (arc_ksp != NULL)
+ arc_ksp->ks_update(arc_ksp, KSTAT_READ);
+#endif
+
+ /*
+ * We have to rely on arc_wait_for_eviction() to tell us when to
+ * evict, rather than checking if we are overflowing here, so that we
+ * are sure to not leave arc_wait_for_eviction() waiting on aew_cv.
+ * If we have become "not overflowing" since arc_wait_for_eviction()
+ * checked, we need to wake it up. We could broadcast the CV here,
+ * but arc_wait_for_eviction() may have not yet gone to sleep. We
+ * would need to use a mutex to ensure that this function doesn't
+ * broadcast until arc_wait_for_eviction() has gone to sleep (e.g.
+ * the arc_evict_lock). However, the lock ordering of such a lock
+ * would necessarily be incorrect with respect to the zthr_lock,
+ * which is held before this function is called, and is held by
+ * arc_wait_for_eviction() when it calls zthr_wakeup().
+ */
+ return (arc_evict_needed);
+}
+
+/*
+ * Keep arc_size under arc_c by running arc_evict which evicts data
+ * from the ARC.
+ */
+/* ARGSUSED */
+static void
+arc_evict_cb(void *arg, zthr_t *zthr)
+{
+ uint64_t evicted = 0;
+ fstrans_cookie_t cookie = spl_fstrans_mark();
+
+ /* Evict from cache */
+ evicted = arc_evict();
+
+ /*
+ * If evicted is zero, we couldn't evict anything
+ * via arc_evict(). This could be due to hash lock
+ * collisions, but more likely due to the majority of
+ * arc buffers being unevictable. Therefore, even if
+ * arc_size is above arc_c, another pass is unlikely to
+ * be helpful and could potentially cause us to enter an
+ * infinite loop. Additionally, zthr_iscancelled() is
+ * checked here so that if the arc is shutting down, the
+ * broadcast will wake any remaining arc evict waiters.
+ */
+ mutex_enter(&arc_evict_lock);
+ arc_evict_needed = !zthr_iscancelled(arc_evict_zthr) &&
+ evicted > 0 && aggsum_compare(&arc_size, arc_c) > 0;
+ if (!arc_evict_needed) {
+ /*
+ * We're either no longer overflowing, or we
+ * can't evict anything more, so we should wake
+ * arc_get_data_impl() sooner.
+ */
+ arc_evict_waiter_t *aw;
+ while ((aw = list_remove_head(&arc_evict_waiters)) != NULL) {
+ cv_broadcast(&aw->aew_cv);
+ }
+ arc_set_need_free();
+ }
+ mutex_exit(&arc_evict_lock);
+ spl_fstrans_unmark(cookie);
+}
+
+/* ARGSUSED */
+static boolean_t
+arc_reap_cb_check(void *arg, zthr_t *zthr)
+{
+ int64_t free_memory = arc_available_memory();
+ static int reap_cb_check_counter = 0;
+
+ /*
+ * If a kmem reap is already active, don't schedule more. We must
+ * check for this because kmem_cache_reap_soon() won't actually
+ * block on the cache being reaped (this is to prevent callers from
+ * becoming implicitly blocked by a system-wide kmem reap -- which,
+ * on a system with many, many full magazines, can take minutes).
+ */
+ if (!kmem_cache_reap_active() && free_memory < 0) {
+
+ arc_no_grow = B_TRUE;
+ arc_warm = B_TRUE;
+ /*
+ * Wait at least zfs_grow_retry (default 5) seconds
+ * before considering growing.
+ */
+ arc_growtime = gethrtime() + SEC2NSEC(arc_grow_retry);
+ return (B_TRUE);
+ } else if (free_memory < arc_c >> arc_no_grow_shift) {
+ arc_no_grow = B_TRUE;
+ } else if (gethrtime() >= arc_growtime) {
+ arc_no_grow = B_FALSE;
+ }
+
+ /*
+ * Called unconditionally every 60 seconds to reclaim unused
+ * zstd compression and decompression context. This is done
+ * here to avoid the need for an independent thread.
+ */
+ if (!((reap_cb_check_counter++) % 60))
+ zfs_zstd_cache_reap_now();
+
+ return (B_FALSE);
+}
+
+/*
+ * Keep enough free memory in the system by reaping the ARC's kmem
+ * caches. To cause more slabs to be reapable, we may reduce the
+ * target size of the cache (arc_c), causing the arc_evict_cb()
+ * to free more buffers.
+ */
+/* ARGSUSED */
+static void
+arc_reap_cb(void *arg, zthr_t *zthr)
+{
+ int64_t free_memory;
+ fstrans_cookie_t cookie = spl_fstrans_mark();
+
+ /*
+ * Kick off asynchronous kmem_reap()'s of all our caches.
+ */
+ arc_kmem_reap_soon();
+
+ /*
+ * Wait at least arc_kmem_cache_reap_retry_ms between
+ * arc_kmem_reap_soon() calls. Without this check it is possible to
+ * end up in a situation where we spend lots of time reaping
+ * caches, while we're near arc_c_min. Waiting here also gives the
+ * subsequent free memory check a chance of finding that the
+ * asynchronous reap has already freed enough memory, and we don't
+ * need to call arc_reduce_target_size().
+ */
+ delay((hz * arc_kmem_cache_reap_retry_ms + 999) / 1000);
+
+ /*
+ * Reduce the target size as needed to maintain the amount of free
+ * memory in the system at a fraction of the arc_size (1/128th by
+ * default). If oversubscribed (free_memory < 0) then reduce the
+ * target arc_size by the deficit amount plus the fractional
+	 * amount. If free memory is positive but less than the fractional
+ * amount, reduce by what is needed to hit the fractional amount.
+ */
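+	/*
+	 * Worked example (assuming the default arc_shrink_shift of 7,
+	 * i.e. 1/128th): with arc_c = 8 GiB the fractional amount is
+	 * 64 MiB; if free_memory is 16 MiB, to_free below is 48 MiB.
+	 */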
+ free_memory = arc_available_memory();
+
+ int64_t to_free =
+ (arc_c >> arc_shrink_shift) - free_memory;
+ if (to_free > 0) {
+ arc_reduce_target_size(to_free);
+ }
+ spl_fstrans_unmark(cookie);
+}
+
+#ifdef _KERNEL
+/*
+ * Determine the amount of memory eligible for eviction contained in the
+ * ARC. All clean data reported by the ghost lists can always be safely
+ * evicted. Due to arc_c_min, the same does not hold for all clean data
+ * contained by the regular mru and mfu lists.
+ *
+ * In the case of the regular mru and mfu lists, we need to report as
+ * much clean data as possible, such that evicting that same reported
+ * data will not bring arc_size below arc_c_min. Thus, in certain
+ * circumstances, the total amount of clean data in the mru and mfu
+ * lists might not actually be evictable.
+ *
+ * The following two distinct cases are accounted for:
+ *
+ * 1. The sum of the amount of dirty data contained by both the mru and
+ * mfu lists, plus the ARC's other accounting (e.g. the anon list),
+ * is greater than or equal to arc_c_min.
+ * (i.e. amount of dirty data >= arc_c_min)
+ *
+ * This is the easy case; all clean data contained by the mru and mfu
+ * lists is evictable. Evicting all clean data can only drop arc_size
+ * to the amount of dirty data, which is greater than arc_c_min.
+ *
+ * 2. The sum of the amount of dirty data contained by both the mru and
+ * mfu lists, plus the ARC's other accounting (e.g. the anon list),
+ * is less than arc_c_min.
+ * (i.e. arc_c_min > amount of dirty data)
+ *
+ * 2.1. arc_size is greater than or equal to arc_c_min.
+ * (i.e. arc_size >= arc_c_min > amount of dirty data)
+ *
+ * In this case, not all clean data from the regular mru and mfu
+ * lists is actually evictable; we must leave enough clean data
+ * to keep arc_size above arc_c_min. Thus, the maximum amount of
+ * evictable data from the two lists combined, is exactly the
+ * difference between arc_size and arc_c_min.
+ *
+ * 2.2. arc_size is less than arc_c_min
+ * (i.e. arc_c_min > arc_size > amount of dirty data)
+ *
+ * In this case, none of the data contained in the mru and mfu
+ * lists is evictable, even if it's clean. Since arc_size is
+ * already below arc_c_min, evicting any more would only
+ * increase this negative difference.
+ */
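+
+/*
+ * A brief numeric sketch of case 2.1 above (hypothetical values): with
+ * arc_c_min = 2 GiB, 1 GiB of dirty data and arc_size = 5 GiB, at most
+ * arc_size - arc_c_min = 3 GiB of the clean data may be reported as
+ * evictable, even if more than 3 GiB of it is actually clean.
+ */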
+
+#endif /* _KERNEL */
+
+/*
+ * Adapt arc info given the number of bytes we are trying to add and
+ * the state that we are coming from. This function is only called
+ * when we are adding new content to the cache.
+ */
+static void
+arc_adapt(int bytes, arc_state_t *state)
+{
+ int mult;
+ uint64_t arc_p_min = (arc_c >> arc_p_min_shift);
+ int64_t mrug_size = zfs_refcount_count(&arc_mru_ghost->arcs_size);
+ int64_t mfug_size = zfs_refcount_count(&arc_mfu_ghost->arcs_size);
+
+ ASSERT(bytes > 0);
+ /*
+ * Adapt the target size of the MRU list:
+ * - if we just hit in the MRU ghost list, then increase
+ * the target size of the MRU list.
+ * - if we just hit in the MFU ghost list, then increase
+ * the target size of the MFU list by decreasing the
+ * target size of the MRU list.
+ */
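+	/*
+	 * Example (hypothetical sizes): on an MRU-ghost hit with
+	 * mrug_size = 1 GiB and mfug_size = 3 GiB, mult = 3, so arc_p
+	 * grows by 3 * bytes (capped by the dampener and by
+	 * arc_c - arc_p_min).
+	 */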
+ if (state == arc_mru_ghost) {
+ mult = (mrug_size >= mfug_size) ? 1 : (mfug_size / mrug_size);
+ if (!zfs_arc_p_dampener_disable)
+ mult = MIN(mult, 10); /* avoid wild arc_p adjustment */
+
+ arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult);
+ } else if (state == arc_mfu_ghost) {
+ uint64_t delta;
+
+ mult = (mfug_size >= mrug_size) ? 1 : (mrug_size / mfug_size);
+ if (!zfs_arc_p_dampener_disable)
+ mult = MIN(mult, 10);
+
+ delta = MIN(bytes * mult, arc_p);
+ arc_p = MAX(arc_p_min, arc_p - delta);
+ }
+ ASSERT((int64_t)arc_p >= 0);
+
+ /*
+ * Wake reap thread if we do not have any available memory
+ */
+ if (arc_reclaim_needed()) {
+ zthr_wakeup(arc_reap_zthr);
+ return;
+ }
+
+ if (arc_no_grow)
+ return;
+
+ if (arc_c >= arc_c_max)
+ return;
+
+ /*
+ * If we're within (2 * maxblocksize) bytes of the target
+ * cache size, increment the target cache size
+ */
+ ASSERT3U(arc_c, >=, 2ULL << SPA_MAXBLOCKSHIFT);
+ if (aggsum_upper_bound(&arc_size) >=
+ arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) {
+ atomic_add_64(&arc_c, (int64_t)bytes);
+ if (arc_c > arc_c_max)
+ arc_c = arc_c_max;
+ else if (state == arc_anon)
+ atomic_add_64(&arc_p, (int64_t)bytes);
+ if (arc_p > arc_c)
+ arc_p = arc_c;
+ }
+ ASSERT((int64_t)arc_p >= 0);
+}
+
+/*
+ * Check if arc_size has grown past our upper threshold, determined by
+ * zfs_arc_overflow_shift.
+ */
+boolean_t
+arc_is_overflowing(void)
+{
+ /* Always allow at least one block of overflow */
+ int64_t overflow = MAX(SPA_MAXBLOCKSIZE,
+ arc_c >> zfs_arc_overflow_shift);
+
+ /*
+ * We just compare the lower bound here for performance reasons. Our
+ * primary goals are to make sure that the arc never grows without
+ * bound, and that it can reach its maximum size. This check
+ * accomplishes both goals. The maximum amount we could run over by is
+ * 2 * aggsum_borrow_multiplier * NUM_CPUS * the average size of a block
+ * in the ARC. In practice, that's in the tens of MB, which is low
+ * enough to be safe.
+ */
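+	/*
+	 * Hypothetical example: with arc_c = 4 GiB and
+	 * zfs_arc_overflow_shift at its usual default of 8, overflow is
+	 * MAX(16 MiB, 16 MiB) = 16 MiB, so the ARC is considered
+	 * overflowing once its lower-bound size exceeds ~4 GiB + 16 MiB.
+	 */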
+ return (aggsum_lower_bound(&arc_size) >= (int64_t)arc_c + overflow);
+}
+
+static abd_t *
+arc_get_data_abd(arc_buf_hdr_t *hdr, uint64_t size, void *tag,
+ boolean_t do_adapt)
+{
+ arc_buf_contents_t type = arc_buf_type(hdr);
+
+ arc_get_data_impl(hdr, size, tag, do_adapt);
+ if (type == ARC_BUFC_METADATA) {
+ return (abd_alloc(size, B_TRUE));
+ } else {
+ ASSERT(type == ARC_BUFC_DATA);
+ return (abd_alloc(size, B_FALSE));
+ }
+}
+
+static void *
+arc_get_data_buf(arc_buf_hdr_t *hdr, uint64_t size, void *tag)
+{
+ arc_buf_contents_t type = arc_buf_type(hdr);
+
+ arc_get_data_impl(hdr, size, tag, B_TRUE);
+ if (type == ARC_BUFC_METADATA) {
+ return (zio_buf_alloc(size));
+ } else {
+ ASSERT(type == ARC_BUFC_DATA);
+ return (zio_data_buf_alloc(size));
+ }
+}
+
+/*
+ * Wait for the specified amount of data (in bytes) to be evicted from the
+ * ARC, and for there to be sufficient free memory in the system. Waiting for
+ * eviction ensures that the memory used by the ARC decreases. Waiting for
+ * free memory ensures that the system won't run out of free pages, regardless
+ * of ARC behavior and settings. See arc_lowmem_init().
+ */
+void
+arc_wait_for_eviction(uint64_t amount)
+{
+ mutex_enter(&arc_evict_lock);
+ if (arc_is_overflowing()) {
+ arc_evict_needed = B_TRUE;
+ zthr_wakeup(arc_evict_zthr);
+
+ if (amount != 0) {
+ arc_evict_waiter_t aw;
+ list_link_init(&aw.aew_node);
+ cv_init(&aw.aew_cv, NULL, CV_DEFAULT, NULL);
+
+ uint64_t last_count = 0;
+ if (!list_is_empty(&arc_evict_waiters)) {
+ arc_evict_waiter_t *last =
+ list_tail(&arc_evict_waiters);
+ last_count = last->aew_count;
+ }
+ /*
+ * Note, the last waiter's count may be less than
+ * arc_evict_count if we are low on memory in which
+ * case arc_evict_state_impl() may have deferred
+ * wakeups (but still incremented arc_evict_count).
+ */
+ aw.aew_count =
+ MAX(last_count, arc_evict_count) + amount;
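+ /*
+ * Example with made-up numbers: if the most recently queued
+ * waiter is waiting for arc_evict_count to reach 10000000 and
+ * we were asked to wait for another 131072 bytes (128 KiB),
+ * we queue ourselves at 10131072, so waiters are released in
+ * FIFO order as eviction progresses.
+ */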
+
+ list_insert_tail(&arc_evict_waiters, &aw);
+
+ arc_set_need_free();
+
+ DTRACE_PROBE3(arc__wait__for__eviction,
+ uint64_t, amount,
+ uint64_t, arc_evict_count,
+ uint64_t, aw.aew_count);
+
+ /*
+ * We will be woken up either when arc_evict_count
+ * reaches aew_count, or when the ARC is no longer
+ * overflowing and eviction completes.
+ */
+ cv_wait(&aw.aew_cv, &arc_evict_lock);
+
+ /*
+ * In case of "false" wakeup, we will still be on the
+ * list.
+ */
+ if (list_link_active(&aw.aew_node))
+ list_remove(&arc_evict_waiters, &aw);
+
+ cv_destroy(&aw.aew_cv);
+ }
+ }
+ mutex_exit(&arc_evict_lock);
+}
+
+/*
+ * Allocate a block and return it to the caller. If we are hitting the
+ * hard limit for the cache size, we must sleep, waiting for the eviction
+ * thread to catch up. If we're past the target size but below the hard
+ * limit, we'll only signal the reclaim thread and continue on.
+ */
+static void
+arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag,
+ boolean_t do_adapt)
+{
+ arc_state_t *state = hdr->b_l1hdr.b_state;
+ arc_buf_contents_t type = arc_buf_type(hdr);
+
+ if (do_adapt)
+ arc_adapt(size, state);
+
+ /*
+ * If arc_size is currently overflowing, we must be adding data
+ * faster than we are evicting. To ensure we don't compound the
+ * problem by adding more data and forcing arc_size to grow even
+ * further past its target size, we wait for the eviction thread to
+ * make some progress. We also wait for there to be sufficient free
+ * memory in the system, as measured by arc_free_memory().
+ *
+ * Specifically, we wait for zfs_arc_eviction_pct percent of the
+ * requested size to be evicted. This should be more than 100%, to
+ * ensure that progress is also made towards getting arc_size
+ * under arc_c. See the comment above zfs_arc_eviction_pct.
+ *
+ * We do the overflowing check without holding the arc_evict_lock to
+ * reduce lock contention in this hot path. Note that
+ * arc_wait_for_eviction() will acquire the lock and check again to
+ * ensure we are truly overflowing before blocking.
+ */
+ if (arc_is_overflowing()) {
+ arc_wait_for_eviction(size *
+ zfs_arc_eviction_pct / 100);
+ }
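+ /*
+ * For instance, with zfs_arc_eviction_pct at its usual default of
+ * 200, a 128 KiB allocation that finds the ARC overflowing blocks
+ * until roughly 256 KiB has been evicted (or until eviction
+ * catches up and the ARC is no longer overflowing).
+ */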
+
+ VERIFY3U(hdr->b_type, ==, type);
+ if (type == ARC_BUFC_METADATA) {
+ arc_space_consume(size, ARC_SPACE_META);
+ } else {
+ arc_space_consume(size, ARC_SPACE_DATA);
+ }
+
+ /*
+ * Update the state size. Note that ghost states have a
+ * "ghost size" and so don't need to be updated.
+ */
+ if (!GHOST_STATE(state)) {
+
+ (void) zfs_refcount_add_many(&state->arcs_size, size, tag);
+
+ /*
+ * If this is reached via arc_read, the link is
+ * protected by the hash lock. If reached via
+ * arc_buf_alloc, the header should not be accessed by
+ * any other thread. And, if reached via arc_read_done,
+ * the hash lock will protect it if it's found in the
+ * hash table; otherwise no other thread should be
+ * trying to [add|remove]_reference it.
+ */
+ if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) {
+ ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
+ (void) zfs_refcount_add_many(&state->arcs_esize[type],
+ size, tag);
+ }
+
+ /*
+ * If we are growing the cache, and we are adding anonymous
+ * data, and we have outgrown arc_p, update arc_p
+ */
+ if (aggsum_upper_bound(&arc_size) < arc_c &&
+ hdr->b_l1hdr.b_state == arc_anon &&
+ (zfs_refcount_count(&arc_anon->arcs_size) +
+ zfs_refcount_count(&arc_mru->arcs_size) > arc_p))
+ arc_p = MIN(arc_c, arc_p + size);
+ }
+}
+
+static void
+arc_free_data_abd(arc_buf_hdr_t *hdr, abd_t *abd, uint64_t size, void *tag)
+{
+ arc_free_data_impl(hdr, size, tag);
+ abd_free(abd);
+}
+
+static void
+arc_free_data_buf(arc_buf_hdr_t *hdr, void *buf, uint64_t size, void *tag)
+{
+ arc_buf_contents_t type = arc_buf_type(hdr);
+
+ arc_free_data_impl(hdr, size, tag);
+ if (type == ARC_BUFC_METADATA) {
+ zio_buf_free(buf, size);
+ } else {
+ ASSERT(type == ARC_BUFC_DATA);
+ zio_data_buf_free(buf, size);
+ }
+}
+
+/*
+ * Free the arc data buffer.
+ */
+static void
+arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag)
+{
+ arc_state_t *state = hdr->b_l1hdr.b_state;
+ arc_buf_contents_t type = arc_buf_type(hdr);
+
+ /* protected by hash lock, if in the hash table */
+ if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) {
+ ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
+ ASSERT(state != arc_anon && state != arc_l2c_only);
+
+ (void) zfs_refcount_remove_many(&state->arcs_esize[type],
+ size, tag);
+ }
+ (void) zfs_refcount_remove_many(&state->arcs_size, size, tag);
+
+ VERIFY3U(hdr->b_type, ==, type);
+ if (type == ARC_BUFC_METADATA) {
+ arc_space_return(size, ARC_SPACE_META);
+ } else {
+ ASSERT(type == ARC_BUFC_DATA);
+ arc_space_return(size, ARC_SPACE_DATA);
+ }
+}
+
+/*
+ * This routine is called whenever a buffer is accessed.
+ * NOTE: the hash lock is dropped in this function.
+ */
+static void
+arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
+{
+ clock_t now;
+
+ ASSERT(MUTEX_HELD(hash_lock));
+ ASSERT(HDR_HAS_L1HDR(hdr));
+
+ if (hdr->b_l1hdr.b_state == arc_anon) {
+ /*
+ * This buffer is not in the cache, and does not
+ * appear in our "ghost" list. Add the new buffer
+ * to the MRU state.
+ */
+
+ ASSERT0(hdr->b_l1hdr.b_arc_access);
+ hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
+ DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr);
+ arc_change_state(arc_mru, hdr, hash_lock);
+
+ } else if (hdr->b_l1hdr.b_state == arc_mru) {
+ now = ddi_get_lbolt();
+
+ /*
+ * If this buffer is here because of a prefetch, then either:
+ * - clear the flag if this is a "referencing" read
+ * (any subsequent access will bump this into the MFU state).
+ * or
+ * - move the buffer to the head of the list if this is
+ * another prefetch (to make it less likely to be evicted).
+ */
+ if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) {
+ if (zfs_refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) {
+ /* link protected by hash lock */
+ ASSERT(multilist_link_active(
+ &hdr->b_l1hdr.b_arc_node));
+ } else {
+ if (HDR_HAS_L2HDR(hdr))
+ l2arc_hdr_arcstats_decrement_state(hdr);
+ arc_hdr_clear_flags(hdr,
+ ARC_FLAG_PREFETCH |
+ ARC_FLAG_PRESCIENT_PREFETCH);
+ atomic_inc_32(&hdr->b_l1hdr.b_mru_hits);
+ ARCSTAT_BUMP(arcstat_mru_hits);
+ if (HDR_HAS_L2HDR(hdr))
+ l2arc_hdr_arcstats_increment_state(hdr);
+ }
+ hdr->b_l1hdr.b_arc_access = now;
+ return;
+ }
+
+ /*
+ * This buffer has been "accessed" only once so far,
+ * but it is still in the cache. Move it to the MFU
+ * state.
+ */
+ if (ddi_time_after(now, hdr->b_l1hdr.b_arc_access +
+ ARC_MINTIME)) {
+ /*
+ * More than 125ms have passed since we
+ * instantiated this buffer. Move it to the
+ * most frequently used state.
+ */
+ hdr->b_l1hdr.b_arc_access = now;
+ DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
+ arc_change_state(arc_mfu, hdr, hash_lock);
+ }
+ atomic_inc_32(&hdr->b_l1hdr.b_mru_hits);
+ ARCSTAT_BUMP(arcstat_mru_hits);
+ } else if (hdr->b_l1hdr.b_state == arc_mru_ghost) {
+ arc_state_t *new_state;
+ /*
+ * This buffer has been "accessed" recently, but
+ * was evicted from the cache. Move it to the
+ * MFU state.
+ */
+ if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) {
+ new_state = arc_mru;
+ if (zfs_refcount_count(&hdr->b_l1hdr.b_refcnt) > 0) {
+ if (HDR_HAS_L2HDR(hdr))
+ l2arc_hdr_arcstats_decrement_state(hdr);
+ arc_hdr_clear_flags(hdr,
+ ARC_FLAG_PREFETCH |
+ ARC_FLAG_PRESCIENT_PREFETCH);
+ if (HDR_HAS_L2HDR(hdr))
+ l2arc_hdr_arcstats_increment_state(hdr);
+ }
+ DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr);
+ } else {
+ new_state = arc_mfu;
+ DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
+ }
+
+ hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
+ arc_change_state(new_state, hdr, hash_lock);
+
+ atomic_inc_32(&hdr->b_l1hdr.b_mru_ghost_hits);
+ ARCSTAT_BUMP(arcstat_mru_ghost_hits);
+ } else if (hdr->b_l1hdr.b_state == arc_mfu) {
+ /*
+ * This buffer has been accessed more than once and is
+ * still in the cache. Keep it in the MFU state.
+ *
+ * NOTE: an add_reference() that occurred when we did
+ * the arc_read() will have kicked this off the list.
+ * If it was a prefetch, we will explicitly move it to
+ * the head of the list now.
+ */
+
+ atomic_inc_32(&hdr->b_l1hdr.b_mfu_hits);
+ ARCSTAT_BUMP(arcstat_mfu_hits);
+ hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
+ } else if (hdr->b_l1hdr.b_state == arc_mfu_ghost) {
+ arc_state_t *new_state = arc_mfu;
+ /*
+ * This buffer has been accessed more than once but has
+ * been evicted from the cache. Move it back to the
+ * MFU state.
+ */
+
+ if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) {
+ /*
+ * This is a prefetch access...
+ * move this block back to the MRU state.
+ */
+ new_state = arc_mru;
+ }
+
+ hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
+ DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
+ arc_change_state(new_state, hdr, hash_lock);
+
+ atomic_inc_32(&hdr->b_l1hdr.b_mfu_ghost_hits);
+ ARCSTAT_BUMP(arcstat_mfu_ghost_hits);
+ } else if (hdr->b_l1hdr.b_state == arc_l2c_only) {
+ /*
+ * This buffer is on the 2nd Level ARC.
+ */
+
+ hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
+ DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
+ arc_change_state(arc_mfu, hdr, hash_lock);
+ } else {
+ cmn_err(CE_PANIC, "invalid arc state 0x%p",
+ hdr->b_l1hdr.b_state);
+ }
+}
+
+/*
+ * This routine is called by dbuf_hold() to update the arc_access() state
+ * which otherwise would be skipped for entries in the dbuf cache.
+ */
+void
+arc_buf_access(arc_buf_t *buf)
+{
+ mutex_enter(&buf->b_evict_lock);
+ arc_buf_hdr_t *hdr = buf->b_hdr;
+
+ /*
+ * Avoid taking the hash_lock when possible as an optimization.
+ * The header must be checked again under the hash_lock in order
+ * to handle the case where it is concurrently being released.
+ */
+ if (hdr->b_l1hdr.b_state == arc_anon || HDR_EMPTY(hdr)) {
+ mutex_exit(&buf->b_evict_lock);
+ return;
+ }
+
+ kmutex_t *hash_lock = HDR_LOCK(hdr);
+ mutex_enter(hash_lock);
+
+ if (hdr->b_l1hdr.b_state == arc_anon || HDR_EMPTY(hdr)) {
+ mutex_exit(hash_lock);
+ mutex_exit(&buf->b_evict_lock);
+ ARCSTAT_BUMP(arcstat_access_skip);
+ return;
+ }
+
+ mutex_exit(&buf->b_evict_lock);
+
+ ASSERT(hdr->b_l1hdr.b_state == arc_mru ||
+ hdr->b_l1hdr.b_state == arc_mfu);
+
+ DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
+ arc_access(hdr, hash_lock);
+ mutex_exit(hash_lock);
+
+ ARCSTAT_BUMP(arcstat_hits);
+ ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr) && !HDR_PRESCIENT_PREFETCH(hdr),
+ demand, prefetch, !HDR_ISTYPE_METADATA(hdr), data, metadata, hits);
+}
+
+/* a generic arc_read_done_func_t which you can use */
+/* ARGSUSED */
+void
+arc_bcopy_func(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
+ arc_buf_t *buf, void *arg)
+{
+ if (buf == NULL)
+ return;
+
+ bcopy(buf->b_data, arg, arc_buf_size(buf));
+ arc_buf_destroy(buf, arg);
+}
+
+/* a generic arc_read_done_func_t */
+/* ARGSUSED */
+void
+arc_getbuf_func(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
+ arc_buf_t *buf, void *arg)
+{
+ arc_buf_t **bufp = arg;
+
+ if (buf == NULL) {
+ ASSERT(zio == NULL || zio->io_error != 0);
+ *bufp = NULL;
+ } else {
+ ASSERT(zio == NULL || zio->io_error == 0);
+ *bufp = buf;
+ ASSERT(buf->b_data != NULL);
+ }
+}
+
+static void
+arc_hdr_verify(arc_buf_hdr_t *hdr, blkptr_t *bp)
+{
+ if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) {
+ ASSERT3U(HDR_GET_PSIZE(hdr), ==, 0);
+ ASSERT3U(arc_hdr_get_compress(hdr), ==, ZIO_COMPRESS_OFF);
+ } else {
+ if (HDR_COMPRESSION_ENABLED(hdr)) {
+ ASSERT3U(arc_hdr_get_compress(hdr), ==,
+ BP_GET_COMPRESS(bp));
+ }
+ ASSERT3U(HDR_GET_LSIZE(hdr), ==, BP_GET_LSIZE(bp));
+ ASSERT3U(HDR_GET_PSIZE(hdr), ==, BP_GET_PSIZE(bp));
+ ASSERT3U(!!HDR_PROTECTED(hdr), ==, BP_IS_PROTECTED(bp));
+ }
+}
+
+static void
+arc_read_done(zio_t *zio)
+{
+ blkptr_t *bp = zio->io_bp;
+ arc_buf_hdr_t *hdr = zio->io_private;
+ kmutex_t *hash_lock = NULL;
+ arc_callback_t *callback_list;
+ arc_callback_t *acb;
+ boolean_t freeable = B_FALSE;
+
+ /*
+ * The hdr was inserted into hash-table and removed from lists
+ * prior to starting I/O. We should find this header, since
+ * it's in the hash table, and it should be legit since it's
+ * not possible to evict it during the I/O. The only possible
+ * reason for it not to be found is if we were freed during the
+ * read.
+ */
+ if (HDR_IN_HASH_TABLE(hdr)) {
+ arc_buf_hdr_t *found;
+
+ ASSERT3U(hdr->b_birth, ==, BP_PHYSICAL_BIRTH(zio->io_bp));
+ ASSERT3U(hdr->b_dva.dva_word[0], ==,
+ BP_IDENTITY(zio->io_bp)->dva_word[0]);
+ ASSERT3U(hdr->b_dva.dva_word[1], ==,
+ BP_IDENTITY(zio->io_bp)->dva_word[1]);
+
+ found = buf_hash_find(hdr->b_spa, zio->io_bp, &hash_lock);
+
+ ASSERT((found == hdr &&
+ DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) ||
+ (found == hdr && HDR_L2_READING(hdr)));
+ ASSERT3P(hash_lock, !=, NULL);
+ }
+
+ if (BP_IS_PROTECTED(bp)) {
+ hdr->b_crypt_hdr.b_ot = BP_GET_TYPE(bp);
+ hdr->b_crypt_hdr.b_dsobj = zio->io_bookmark.zb_objset;
+ zio_crypt_decode_params_bp(bp, hdr->b_crypt_hdr.b_salt,
+ hdr->b_crypt_hdr.b_iv);
+
+ if (BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG) {
+ void *tmpbuf;
+
+ tmpbuf = abd_borrow_buf_copy(zio->io_abd,
+ sizeof (zil_chain_t));
+ zio_crypt_decode_mac_zil(tmpbuf,
+ hdr->b_crypt_hdr.b_mac);
+ abd_return_buf(zio->io_abd, tmpbuf,
+ sizeof (zil_chain_t));
+ } else {
+ zio_crypt_decode_mac_bp(bp, hdr->b_crypt_hdr.b_mac);
+ }
+ }
+
+ if (zio->io_error == 0) {
+ /* byteswap if necessary */
+ if (BP_SHOULD_BYTESWAP(zio->io_bp)) {
+ if (BP_GET_LEVEL(zio->io_bp) > 0) {
+ hdr->b_l1hdr.b_byteswap = DMU_BSWAP_UINT64;
+ } else {
+ hdr->b_l1hdr.b_byteswap =
+ DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp));
+ }
+ } else {
+ hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS;
+ }
+ if (!HDR_L2_READING(hdr)) {
+ hdr->b_complevel = zio->io_prop.zp_complevel;
+ }
+ }
+
+ arc_hdr_clear_flags(hdr, ARC_FLAG_L2_EVICTED);
+ if (l2arc_noprefetch && HDR_PREFETCH(hdr))
+ arc_hdr_clear_flags(hdr, ARC_FLAG_L2CACHE);
+
+ callback_list = hdr->b_l1hdr.b_acb;
+ ASSERT3P(callback_list, !=, NULL);
+
+ if (hash_lock && zio->io_error == 0 &&
+ hdr->b_l1hdr.b_state == arc_anon) {
+ /*
+ * Only call arc_access on anonymous buffers. This is because
+ * if we've issued an I/O for an evicted buffer, we've already
+ * called arc_access (to prevent any simultaneous readers from
+ * getting confused).
+ */
+ arc_access(hdr, hash_lock);
+ }
+
+ /*
+ * If a read request has a callback (i.e. acb_done is not NULL), then we
+ * make a buf containing the data according to the parameters which were
+ * passed in. The implementation of arc_buf_alloc_impl() ensures that we
+ * aren't needlessly decompressing the data multiple times.
+ */
+ int callback_cnt = 0;
+ for (acb = callback_list; acb != NULL; acb = acb->acb_next) {
+ if (!acb->acb_done || acb->acb_nobuf)
+ continue;
+
+ callback_cnt++;
+
+ if (zio->io_error != 0)
+ continue;
+
+ int error = arc_buf_alloc_impl(hdr, zio->io_spa,
+ &acb->acb_zb, acb->acb_private, acb->acb_encrypted,
+ acb->acb_compressed, acb->acb_noauth, B_TRUE,
+ &acb->acb_buf);
+
+ /*
+ * Assert non-speculative zios didn't fail because an
+ * encryption key wasn't loaded
+ */
+ ASSERT((zio->io_flags & ZIO_FLAG_SPECULATIVE) ||
+ error != EACCES);
+
+ /*
+ * If we failed to decrypt, report an error now (as the zio
+ * layer would have done if it had done the transforms).
+ */
+ if (error == ECKSUM) {
+ ASSERT(BP_IS_PROTECTED(bp));
+ error = SET_ERROR(EIO);
+ if ((zio->io_flags & ZIO_FLAG_SPECULATIVE) == 0) {
+ spa_log_error(zio->io_spa, &acb->acb_zb);
+ (void) zfs_ereport_post(
+ FM_EREPORT_ZFS_AUTHENTICATION,
+ zio->io_spa, NULL, &acb->acb_zb, zio, 0);
+ }
+ }
+
+ if (error != 0) {
+ /*
+ * Decompression or decryption failed. Set
+ * io_error so that when we call acb_done
+ * (below), we will indicate that the read
+ * failed. Note that in the unusual case
+ * where one callback is compressed and another
+ * uncompressed, we will mark all of them
+ * as failed, even though the uncompressed
+ * one can't actually fail. In this case,
+ * the hdr will not be anonymous, because
+ * if there are multiple callbacks, it's
+ * because multiple threads found the same
+ * arc buf in the hash table.
+ */
+ zio->io_error = error;
+ }
+ }
+
+ /*
+ * If there are multiple callbacks, we must have the hash lock,
+ * because the only way for multiple threads to find this hdr is
+ * in the hash table. This ensures that if there are multiple
+ * callbacks, the hdr is not anonymous. If it were anonymous,
+ * we couldn't use arc_buf_destroy() in the error case below.
+ */
+ ASSERT(callback_cnt < 2 || hash_lock != NULL);
+
+ hdr->b_l1hdr.b_acb = NULL;
+ arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
+ if (callback_cnt == 0)
+ ASSERT(hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr));
+
+ ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt) ||
+ callback_list != NULL);
+
+ if (zio->io_error == 0) {
+ arc_hdr_verify(hdr, zio->io_bp);
+ } else {
+ arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR);
+ if (hdr->b_l1hdr.b_state != arc_anon)
+ arc_change_state(arc_anon, hdr, hash_lock);
+ if (HDR_IN_HASH_TABLE(hdr))
+ buf_hash_remove(hdr);
+ freeable = zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt);
+ }
+
+ /*
+ * Broadcast before we drop the hash_lock to avoid the possibility
+ * that the hdr (and hence the cv) might be freed before we get to
+ * the cv_broadcast().
+ */
+ cv_broadcast(&hdr->b_l1hdr.b_cv);
+
+ if (hash_lock != NULL) {
+ mutex_exit(hash_lock);
+ } else {
+ /*
+ * This block was freed while we waited for the read to
+ * complete. It has been removed from the hash table and
+ * moved to the anonymous state (so that it won't show up
+ * in the cache).
+ */
+ ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
+ freeable = zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt);
+ }
+
+ /* execute each callback and free its structure */
+ while ((acb = callback_list) != NULL) {
+ if (acb->acb_done != NULL) {
+ if (zio->io_error != 0 && acb->acb_buf != NULL) {
+ /*
+ * If arc_buf_alloc_impl() fails during
+ * decompression, the buf will still be
+ * allocated, and needs to be freed here.
+ */
+ arc_buf_destroy(acb->acb_buf,
+ acb->acb_private);
+ acb->acb_buf = NULL;
+ }
+ acb->acb_done(zio, &zio->io_bookmark, zio->io_bp,
+ acb->acb_buf, acb->acb_private);
+ }
+
+ if (acb->acb_zio_dummy != NULL) {
+ acb->acb_zio_dummy->io_error = zio->io_error;
+ zio_nowait(acb->acb_zio_dummy);
+ }
+
+ callback_list = acb->acb_next;
+ kmem_free(acb, sizeof (arc_callback_t));
+ }
+
+ if (freeable)
+ arc_hdr_destroy(hdr);
+}
+
+/*
+ * "Read" the block at the specified DVA (in bp) via the
+ * cache. If the block is found in the cache, invoke the provided
+ * callback immediately and return. Note that the `zio' parameter
+ * in the callback will be NULL in this case, since no IO was
+ * required. If the block is not in the cache pass the read request
+ * on to the spa with a substitute callback function, so that the
+ * requested block will be added to the cache.
+ *
+ * If a read request arrives for a block that has a read in-progress,
+ * either wait for the in-progress read to complete (and return the
+ * results); or, if this is a read with a "done" func, add a record
+ * to the read to invoke the "done" func when the read completes,
+ * and return; or just return.
+ *
+ * arc_read_done() will invoke all the requested "done" functions
+ * for readers of this block.
+ */
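+ /*
+ * A minimal usage sketch (hypothetical caller; spa, bp and zb are
+ * assumed to exist in the caller): read one block synchronously and
+ * receive it through arc_getbuf_func(), which stores the resulting
+ * arc_buf_t:
+ *
+ * arc_buf_t *abuf = NULL;
+ * arc_flags_t aflags = ARC_FLAG_WAIT;
+ * int err = arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
+ *     ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, zb);
+ * if (err == 0 && abuf != NULL) {
+ *         ... use abuf->b_data ...
+ *         arc_buf_destroy(abuf, &abuf);
+ * }
+ */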
+int
+arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
+ arc_read_done_func_t *done, void *private, zio_priority_t priority,
+ int zio_flags, arc_flags_t *arc_flags, const zbookmark_phys_t *zb)
+{
+ arc_buf_hdr_t *hdr = NULL;
+ kmutex_t *hash_lock = NULL;
+ zio_t *rzio;
+ uint64_t guid = spa_load_guid(spa);
+ boolean_t compressed_read = (zio_flags & ZIO_FLAG_RAW_COMPRESS) != 0;
+ boolean_t encrypted_read = BP_IS_ENCRYPTED(bp) &&
+ (zio_flags & ZIO_FLAG_RAW_ENCRYPT) != 0;
+ boolean_t noauth_read = BP_IS_AUTHENTICATED(bp) &&
+ (zio_flags & ZIO_FLAG_RAW_ENCRYPT) != 0;
+ boolean_t embedded_bp = !!BP_IS_EMBEDDED(bp);
+ boolean_t no_buf = *arc_flags & ARC_FLAG_NO_BUF;
+ int rc = 0;
+
+ ASSERT(!embedded_bp ||
+ BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA);
+ ASSERT(!BP_IS_HOLE(bp));
+ ASSERT(!BP_IS_REDACTED(bp));
+
+ /*
+ * Normally SPL_FSTRANS will already be set since kernel threads which
+ * expect to call the DMU interfaces will set it when created. System
+ * calls are similarly handled by setting/cleaning the bit in the
+ * registered callback (module/os/.../zfs/zpl_*).
+ *
+ * External consumers such as Lustre which call the exported DMU
+ * interfaces may not have set SPL_FSTRANS. To avoid a deadlock
+ * on the hash_lock, always set and clear the bit.
+ */
+ fstrans_cookie_t cookie = spl_fstrans_mark();
+top:
+ if (!embedded_bp) {
+ /*
+ * Embedded BP's have no DVA and require no I/O to "read".
+ * Create an anonymous arc buf to back it.
+ */
+ hdr = buf_hash_find(guid, bp, &hash_lock);
+ }
+
+ /*
+ * Determine if we have an L1 cache hit or a cache miss. For simplicity
+ * we maintain encrypted data separately from compressed / uncompressed
+ * data. If the user is requesting raw encrypted data and we don't have
+ * that in the header we will read from disk to guarantee that we can
+ * get it even if the encryption keys aren't loaded.
+ */
+ if (hdr != NULL && HDR_HAS_L1HDR(hdr) && (HDR_HAS_RABD(hdr) ||
+ (hdr->b_l1hdr.b_pabd != NULL && !encrypted_read))) {
+ arc_buf_t *buf = NULL;
+ *arc_flags |= ARC_FLAG_CACHED;
+
+ if (HDR_IO_IN_PROGRESS(hdr)) {
+ zio_t *head_zio = hdr->b_l1hdr.b_acb->acb_zio_head;
+
+ if (*arc_flags & ARC_FLAG_CACHED_ONLY) {
+ mutex_exit(hash_lock);
+ ARCSTAT_BUMP(arcstat_cached_only_in_progress);
+ rc = SET_ERROR(ENOENT);
+ goto out;
+ }
+
+ ASSERT3P(head_zio, !=, NULL);
+ if ((hdr->b_flags & ARC_FLAG_PRIO_ASYNC_READ) &&
+ priority == ZIO_PRIORITY_SYNC_READ) {
+ /*
+ * This is a sync read that needs to wait for
+ * an in-flight async read. Request that the
+ * zio have its priority upgraded.
+ */
+ zio_change_priority(head_zio, priority);
+ DTRACE_PROBE1(arc__async__upgrade__sync,
+ arc_buf_hdr_t *, hdr);
+ ARCSTAT_BUMP(arcstat_async_upgrade_sync);
+ }
+ if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) {
+ arc_hdr_clear_flags(hdr,
+ ARC_FLAG_PREDICTIVE_PREFETCH);
+ }
+
+ if (*arc_flags & ARC_FLAG_WAIT) {
+ cv_wait(&hdr->b_l1hdr.b_cv, hash_lock);
+ mutex_exit(hash_lock);
+ goto top;
+ }
+ ASSERT(*arc_flags & ARC_FLAG_NOWAIT);
+
+ if (done) {
+ arc_callback_t *acb = NULL;
+
+ acb = kmem_zalloc(sizeof (arc_callback_t),
+ KM_SLEEP);
+ acb->acb_done = done;
+ acb->acb_private = private;
+ acb->acb_compressed = compressed_read;
+ acb->acb_encrypted = encrypted_read;
+ acb->acb_noauth = noauth_read;
+ acb->acb_nobuf = no_buf;
+ acb->acb_zb = *zb;
+ if (pio != NULL)
+ acb->acb_zio_dummy = zio_null(pio,
+ spa, NULL, NULL, NULL, zio_flags);
+
+ ASSERT3P(acb->acb_done, !=, NULL);
+ acb->acb_zio_head = head_zio;
+ acb->acb_next = hdr->b_l1hdr.b_acb;
+ hdr->b_l1hdr.b_acb = acb;
+ }
+ mutex_exit(hash_lock);
+ goto out;
+ }
+
+ ASSERT(hdr->b_l1hdr.b_state == arc_mru ||
+ hdr->b_l1hdr.b_state == arc_mfu);
+
+ if (done && !no_buf) {
+ if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) {
+ /*
+ * This is a demand read which does not have to
+ * wait for i/o because we did a predictive
+ * prefetch i/o for it, which has completed.
+ */
+ DTRACE_PROBE1(
+ arc__demand__hit__predictive__prefetch,
+ arc_buf_hdr_t *, hdr);
+ ARCSTAT_BUMP(
+ arcstat_demand_hit_predictive_prefetch);
+ arc_hdr_clear_flags(hdr,
+ ARC_FLAG_PREDICTIVE_PREFETCH);
+ }
+
+ if (hdr->b_flags & ARC_FLAG_PRESCIENT_PREFETCH) {
+ ARCSTAT_BUMP(
+ arcstat_demand_hit_prescient_prefetch);
+ arc_hdr_clear_flags(hdr,
+ ARC_FLAG_PRESCIENT_PREFETCH);
+ }
+
+ ASSERT(!embedded_bp || !BP_IS_HOLE(bp));
+
+ /* Get a buf with the desired data in it. */
+ rc = arc_buf_alloc_impl(hdr, spa, zb, private,
+ encrypted_read, compressed_read, noauth_read,
+ B_TRUE, &buf);
+ if (rc == ECKSUM) {
+ /*
+ * Convert authentication and decryption errors
+ * to EIO (and generate an ereport if needed)
+ * before leaving the ARC.
+ */
+ rc = SET_ERROR(EIO);
+ if ((zio_flags & ZIO_FLAG_SPECULATIVE) == 0) {
+ spa_log_error(spa, zb);
+ (void) zfs_ereport_post(
+ FM_EREPORT_ZFS_AUTHENTICATION,
+ spa, NULL, zb, NULL, 0);
+ }
+ }
+ if (rc != 0) {
+ (void) remove_reference(hdr, hash_lock,
+ private);
+ arc_buf_destroy_impl(buf);
+ buf = NULL;
+ }
+
+ /* assert any errors weren't due to unloaded keys */
+ ASSERT((zio_flags & ZIO_FLAG_SPECULATIVE) ||
+ rc != EACCES);
+ } else if (*arc_flags & ARC_FLAG_PREFETCH &&
+ zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) {
+ if (HDR_HAS_L2HDR(hdr))
+ l2arc_hdr_arcstats_decrement_state(hdr);
+ arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH);
+ if (HDR_HAS_L2HDR(hdr))
+ l2arc_hdr_arcstats_increment_state(hdr);
+ }
+ DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
+ arc_access(hdr, hash_lock);
+ if (*arc_flags & ARC_FLAG_PRESCIENT_PREFETCH)
+ arc_hdr_set_flags(hdr, ARC_FLAG_PRESCIENT_PREFETCH);
+ if (*arc_flags & ARC_FLAG_L2CACHE)
+ arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE);
+ mutex_exit(hash_lock);
+ ARCSTAT_BUMP(arcstat_hits);
+ ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr),
+ demand, prefetch, !HDR_ISTYPE_METADATA(hdr),
+ data, metadata, hits);
+
+ if (done)
+ done(NULL, zb, bp, buf, private);
+ } else {
+ uint64_t lsize = BP_GET_LSIZE(bp);
+ uint64_t psize = BP_GET_PSIZE(bp);
+ arc_callback_t *acb;
+ vdev_t *vd = NULL;
+ uint64_t addr = 0;
+ boolean_t devw = B_FALSE;
+ uint64_t size;
+ abd_t *hdr_abd;
+ int alloc_flags = encrypted_read ? ARC_HDR_ALLOC_RDATA : 0;
+
+ if (*arc_flags & ARC_FLAG_CACHED_ONLY) {
+ rc = SET_ERROR(ENOENT);
+ if (hash_lock != NULL)
+ mutex_exit(hash_lock);
+ goto out;
+ }
+
+ /*
+ * Gracefully handle a damaged logical block size as a
+ * checksum error.
+ */
+ if (lsize > spa_maxblocksize(spa)) {
+ rc = SET_ERROR(ECKSUM);
+ if (hash_lock != NULL)
+ mutex_exit(hash_lock);
+ goto out;
+ }
+
+ if (hdr == NULL) {
+ /*
+ * This block is not in the cache or it has
+ * embedded data.
+ */
+ arc_buf_hdr_t *exists = NULL;
+ arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp);
+ hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize,
+ BP_IS_PROTECTED(bp), BP_GET_COMPRESS(bp), 0, type,
+ encrypted_read);
+
+ if (!embedded_bp) {
+ hdr->b_dva = *BP_IDENTITY(bp);
+ hdr->b_birth = BP_PHYSICAL_BIRTH(bp);
+ exists = buf_hash_insert(hdr, &hash_lock);
+ }
+ if (exists != NULL) {
+ /* somebody beat us to the hash insert */
+ mutex_exit(hash_lock);
+ buf_discard_identity(hdr);
+ arc_hdr_destroy(hdr);
+ goto top; /* restart the IO request */
+ }
+ } else {
+ /*
+ * This block is in the ghost cache or encrypted data
+ * was requested and we didn't have it. If it was
+ * L2-only (and thus didn't have an L1 hdr),
+ * we realloc the header to add an L1 hdr.
+ */
+ if (!HDR_HAS_L1HDR(hdr)) {
+ hdr = arc_hdr_realloc(hdr, hdr_l2only_cache,
+ hdr_full_cache);
+ }
+
+ if (GHOST_STATE(hdr->b_l1hdr.b_state)) {
+ ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
+ ASSERT(!HDR_HAS_RABD(hdr));
+ ASSERT(!HDR_IO_IN_PROGRESS(hdr));
+ ASSERT0(zfs_refcount_count(
+ &hdr->b_l1hdr.b_refcnt));
+ ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
+ ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);
+ } else if (HDR_IO_IN_PROGRESS(hdr)) {
+ /*
+ * If this header already had an IO in progress
+ * and we are performing another IO to fetch
+ * encrypted data we must wait until the first
+ * IO completes so as not to confuse
+ * arc_read_done(). This should be very rare
+ * and so the performance impact shouldn't
+ * matter.
+ */
+ cv_wait(&hdr->b_l1hdr.b_cv, hash_lock);
+ mutex_exit(hash_lock);
+ goto top;
+ }
+
+ /*
+ * This is a delicate dance that we play here.
+ * This hdr might be in the ghost list so we access
+ * it to move it out of the ghost list before we
+ * initiate the read. If it's a prefetch then
+ * it won't have a callback so we'll remove the
+ * reference that arc_buf_alloc_impl() created. We
+ * do this after we've called arc_access() to
+ * avoid hitting an assert in remove_reference().
+ */
+ arc_adapt(arc_hdr_size(hdr), hdr->b_l1hdr.b_state);
+ arc_access(hdr, hash_lock);
+ arc_hdr_alloc_abd(hdr, alloc_flags);
+ }
+
+ if (encrypted_read) {
+ ASSERT(HDR_HAS_RABD(hdr));
+ size = HDR_GET_PSIZE(hdr);
+ hdr_abd = hdr->b_crypt_hdr.b_rabd;
+ zio_flags |= ZIO_FLAG_RAW;
+ } else {
+ ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
+ size = arc_hdr_size(hdr);
+ hdr_abd = hdr->b_l1hdr.b_pabd;
+
+ if (arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF) {
+ zio_flags |= ZIO_FLAG_RAW_COMPRESS;
+ }
+
+ /*
+ * For authenticated bp's, we do not ask the ZIO layer
+ * to authenticate them since this will cause the entire
+ * IO to fail if the key isn't loaded. Instead, we
+ * defer authentication until arc_buf_fill(), which will
+ * verify the data when the key is available.
+ */
+ if (BP_IS_AUTHENTICATED(bp))
+ zio_flags |= ZIO_FLAG_RAW_ENCRYPT;
+ }
+
+ if (*arc_flags & ARC_FLAG_PREFETCH &&
+ zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) {
+ if (HDR_HAS_L2HDR(hdr))
+ l2arc_hdr_arcstats_decrement_state(hdr);
+ arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH);
+ if (HDR_HAS_L2HDR(hdr))
+ l2arc_hdr_arcstats_increment_state(hdr);
+ }
+ if (*arc_flags & ARC_FLAG_PRESCIENT_PREFETCH)
+ arc_hdr_set_flags(hdr, ARC_FLAG_PRESCIENT_PREFETCH);
+ if (*arc_flags & ARC_FLAG_L2CACHE)
+ arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE);
+ if (BP_IS_AUTHENTICATED(bp))
+ arc_hdr_set_flags(hdr, ARC_FLAG_NOAUTH);
+ if (BP_GET_LEVEL(bp) > 0)
+ arc_hdr_set_flags(hdr, ARC_FLAG_INDIRECT);
+ if (*arc_flags & ARC_FLAG_PREDICTIVE_PREFETCH)
+ arc_hdr_set_flags(hdr, ARC_FLAG_PREDICTIVE_PREFETCH);
+ ASSERT(!GHOST_STATE(hdr->b_l1hdr.b_state));
+
+ acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
+ acb->acb_done = done;
+ acb->acb_private = private;
+ acb->acb_compressed = compressed_read;
+ acb->acb_encrypted = encrypted_read;
+ acb->acb_noauth = noauth_read;
+ acb->acb_zb = *zb;
+
+ ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);
+ hdr->b_l1hdr.b_acb = acb;
+ arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
+
+ if (HDR_HAS_L2HDR(hdr) &&
+ (vd = hdr->b_l2hdr.b_dev->l2ad_vdev) != NULL) {
+ devw = hdr->b_l2hdr.b_dev->l2ad_writing;
+ addr = hdr->b_l2hdr.b_daddr;
+ /*
+ * Lock out L2ARC device removal.
+ */
+ if (vdev_is_dead(vd) ||
+ !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER))
+ vd = NULL;
+ }
+
+ /*
+ * We count both async reads and scrub IOs as asynchronous so
+ * that both can be upgraded in the event of a cache hit while
+ * the read IO is still in-flight.
+ */
+ if (priority == ZIO_PRIORITY_ASYNC_READ ||
+ priority == ZIO_PRIORITY_SCRUB)
+ arc_hdr_set_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ);
+ else
+ arc_hdr_clear_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ);
+
+ /*
+ * At this point, we have a level 1 cache miss or a blkptr
+ * with embedded data. Try again in L2ARC if possible.
+ */
+ ASSERT3U(HDR_GET_LSIZE(hdr), ==, lsize);
+
+ /*
+ * Skip ARC stat bump for block pointers with embedded
+ * data. The data are read from the blkptr itself via
+ * decode_embedded_bp_compressed().
+ */
+ if (!embedded_bp) {
+ DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr,
+ blkptr_t *, bp, uint64_t, lsize,
+ zbookmark_phys_t *, zb);
+ ARCSTAT_BUMP(arcstat_misses);
+ ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr),
+ demand, prefetch, !HDR_ISTYPE_METADATA(hdr), data,
+ metadata, misses);
+ }
+
+ /* Check if the spa even has l2 configured */
+ const boolean_t spa_has_l2 = l2arc_ndev != 0 &&
+ spa->spa_l2cache.sav_count > 0;
+
+ if (vd != NULL && spa_has_l2 && !(l2arc_norw && devw)) {
+ /*
+ * Read from the L2ARC if the following are true:
+ * 1. The L2ARC vdev was previously cached.
+ * 2. This buffer still has L2ARC metadata.
+ * 3. This buffer isn't currently writing to the L2ARC.
+ * 4. The L2ARC entry wasn't evicted, which may
+ * also have invalidated the vdev.
+ * 5. This isn't prefetch or l2arc_noprefetch is 0.
+ */
+ if (HDR_HAS_L2HDR(hdr) &&
+ !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) &&
+ !(l2arc_noprefetch && HDR_PREFETCH(hdr))) {
+ l2arc_read_callback_t *cb;
+ abd_t *abd;
+ uint64_t asize;
+
+ DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr);
+ ARCSTAT_BUMP(arcstat_l2_hits);
+ atomic_inc_32(&hdr->b_l2hdr.b_hits);
+
+ cb = kmem_zalloc(sizeof (l2arc_read_callback_t),
+ KM_SLEEP);
+ cb->l2rcb_hdr = hdr;
+ cb->l2rcb_bp = *bp;
+ cb->l2rcb_zb = *zb;
+ cb->l2rcb_flags = zio_flags;
+
+ /*
+ * When Compressed ARC is disabled, but the
+ * L2ARC block is compressed, arc_hdr_size()
+ * will have returned LSIZE rather than PSIZE.
+ */
+ if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF &&
+ !HDR_COMPRESSION_ENABLED(hdr) &&
+ HDR_GET_PSIZE(hdr) != 0) {
+ size = HDR_GET_PSIZE(hdr);
+ }
+
+ asize = vdev_psize_to_asize(vd, size);
+ if (asize != size) {
+ abd = abd_alloc_for_io(asize,
+ HDR_ISTYPE_METADATA(hdr));
+ cb->l2rcb_abd = abd;
+ } else {
+ abd = hdr_abd;
+ }
+
+ ASSERT(addr >= VDEV_LABEL_START_SIZE &&
+ addr + asize <= vd->vdev_psize -
+ VDEV_LABEL_END_SIZE);
+
+ /*
+ * l2arc read. The SCL_L2ARC lock will be
+ * released by l2arc_read_done().
+ * Issue a null zio if the underlying buffer
+ * was squashed to zero size by compression.
+ */
+ ASSERT3U(arc_hdr_get_compress(hdr), !=,
+ ZIO_COMPRESS_EMPTY);
+ rzio = zio_read_phys(pio, vd, addr,
+ asize, abd,
+ ZIO_CHECKSUM_OFF,
+ l2arc_read_done, cb, priority,
+ zio_flags | ZIO_FLAG_DONT_CACHE |
+ ZIO_FLAG_CANFAIL |
+ ZIO_FLAG_DONT_PROPAGATE |
+ ZIO_FLAG_DONT_RETRY, B_FALSE);
+ acb->acb_zio_head = rzio;
+
+ if (hash_lock != NULL)
+ mutex_exit(hash_lock);
+
+ DTRACE_PROBE2(l2arc__read, vdev_t *, vd,
+ zio_t *, rzio);
+ ARCSTAT_INCR(arcstat_l2_read_bytes,
+ HDR_GET_PSIZE(hdr));
+
+ if (*arc_flags & ARC_FLAG_NOWAIT) {
+ zio_nowait(rzio);
+ goto out;
+ }
+
+ ASSERT(*arc_flags & ARC_FLAG_WAIT);
+ if (zio_wait(rzio) == 0)
+ goto out;
+
+ /* l2arc read error; goto zio_read() */
+ if (hash_lock != NULL)
+ mutex_enter(hash_lock);
+ } else {
+ DTRACE_PROBE1(l2arc__miss,
+ arc_buf_hdr_t *, hdr);
+ ARCSTAT_BUMP(arcstat_l2_misses);
+ if (HDR_L2_WRITING(hdr))
+ ARCSTAT_BUMP(arcstat_l2_rw_clash);
+ spa_config_exit(spa, SCL_L2ARC, vd);
+ }
+ } else {
+ if (vd != NULL)
+ spa_config_exit(spa, SCL_L2ARC, vd);
+
+ /*
+ * Only a spa with l2 should contribute to l2
+ * miss stats. (Including the case of having a
+ * faulted cache device - that's also a miss.)
+ */
+ if (spa_has_l2) {
+ /*
+ * Skip ARC stat bump for block pointers with
+ * embedded data. The data are read from the
+ * blkptr itself via
+ * decode_embedded_bp_compressed().
+ */
+ if (!embedded_bp) {
+ DTRACE_PROBE1(l2arc__miss,
+ arc_buf_hdr_t *, hdr);
+ ARCSTAT_BUMP(arcstat_l2_misses);
+ }
+ }
+ }
+
+ rzio = zio_read(pio, spa, bp, hdr_abd, size,
+ arc_read_done, hdr, priority, zio_flags, zb);
+ acb->acb_zio_head = rzio;
+
+ if (hash_lock != NULL)
+ mutex_exit(hash_lock);
+
+ if (*arc_flags & ARC_FLAG_WAIT) {
+ rc = zio_wait(rzio);
+ goto out;
+ }
+
+ ASSERT(*arc_flags & ARC_FLAG_NOWAIT);
+ zio_nowait(rzio);
+ }
+
+out:
+ /* embedded bps don't actually go to disk */
+ if (!embedded_bp)
+ spa_read_history_add(spa, zb, *arc_flags);
+ spl_fstrans_unmark(cookie);
+ return (rc);
+}
+
+arc_prune_t *
+arc_add_prune_callback(arc_prune_func_t *func, void *private)
+{
+ arc_prune_t *p;
+
+ p = kmem_alloc(sizeof (*p), KM_SLEEP);
+ p->p_pfunc = func;
+ p->p_private = private;
+ list_link_init(&p->p_node);
+ zfs_refcount_create(&p->p_refcnt);
+
+ mutex_enter(&arc_prune_mtx);
+ zfs_refcount_add(&p->p_refcnt, &arc_prune_list);
+ list_insert_head(&arc_prune_list, p);
+ mutex_exit(&arc_prune_mtx);
+
+ return (p);
+}
+
+void
+arc_remove_prune_callback(arc_prune_t *p)
+{
+ boolean_t wait = B_FALSE;
+ mutex_enter(&arc_prune_mtx);
+ list_remove(&arc_prune_list, p);
+ if (zfs_refcount_remove(&p->p_refcnt, &arc_prune_list) > 0)
+ wait = B_TRUE;
+ mutex_exit(&arc_prune_mtx);
+
+ /* wait for arc_prune_task to finish */
+ if (wait)
+ taskq_wait_outstanding(arc_prune_taskq, 0);
+ ASSERT0(zfs_refcount_count(&p->p_refcnt));
+ zfs_refcount_destroy(&p->p_refcnt);
+ kmem_free(p, sizeof (*p));
+}
+
+/*
+ * Notify the arc that a block was freed, and thus will never be used again.
+ */
+void
+arc_freed(spa_t *spa, const blkptr_t *bp)
+{
+ arc_buf_hdr_t *hdr;
+ kmutex_t *hash_lock;
+ uint64_t guid = spa_load_guid(spa);
+
+ ASSERT(!BP_IS_EMBEDDED(bp));
+
+ hdr = buf_hash_find(guid, bp, &hash_lock);
+ if (hdr == NULL)
+ return;
+
+ /*
+ * We might be trying to free a block that is still doing I/O
+ * (i.e. prefetch) or has a reference (i.e. a dedup-ed,
+ * dmu_sync-ed block). If this block is being prefetched, then it
+ * would still have the ARC_FLAG_IO_IN_PROGRESS flag set on the hdr
+ * until the I/O completes. A block may also have a reference if it is
+ * part of a dedup-ed, dmu_synced write. The dmu_sync() function would
+ * have written the new block to its final resting place on disk but
+ * without the dedup flag set. This would have left the hdr in the MRU
+ * state and discoverable. When the txg finally syncs it detects that
+ * the block was overridden in open context and issues an override I/O.
+ * Since this is a dedup block, the override I/O will determine if the
+ * block is already in the DDT. If so, then it will replace the io_bp
+ * with the bp from the DDT and allow the I/O to finish. When the I/O
+ * reaches the done callback, dbuf_write_override_done, it will
+ * check to see if the io_bp and io_bp_override are identical.
+ * If they are not, then it indicates that the bp was replaced with
+ * the bp in the DDT and the override bp is freed. This allows
+ * us to arrive here with a reference on a block that is being
+ * freed. So if we have an I/O in progress, or a reference to
+ * this hdr, then we don't destroy the hdr.
+ */
+ if (!HDR_HAS_L1HDR(hdr) || (!HDR_IO_IN_PROGRESS(hdr) &&
+ zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt))) {
+ arc_change_state(arc_anon, hdr, hash_lock);
+ arc_hdr_destroy(hdr);
+ mutex_exit(hash_lock);
+ } else {
+ mutex_exit(hash_lock);
+ }
+
+}
+
+/*
+ * Release this buffer from the cache, making it an anonymous buffer. This
+ * must be done after a read and prior to modifying the buffer contents.
+ * If the buffer has more than one reference, we must make
+ * a new hdr for the buffer.
+ */
+void
+arc_release(arc_buf_t *buf, void *tag)
+{
+ arc_buf_hdr_t *hdr = buf->b_hdr;
+
+ /*
+ * It would be nice to assert that if it's DMU metadata (level >
+ * 0 || it's the dnode file), then it must be syncing context.
+ * But we don't know that information at this level.
+ */
+
+ mutex_enter(&buf->b_evict_lock);
+
+ ASSERT(HDR_HAS_L1HDR(hdr));
+
+ /*
+ * We don't grab the hash lock prior to this check, because if
+ * the buffer's header is in the arc_anon state, it won't be
+ * linked into the hash table.
+ */
+ if (hdr->b_l1hdr.b_state == arc_anon) {
+ mutex_exit(&buf->b_evict_lock);
+ ASSERT(!HDR_IO_IN_PROGRESS(hdr));
+ ASSERT(!HDR_IN_HASH_TABLE(hdr));
+ ASSERT(!HDR_HAS_L2HDR(hdr));
+ ASSERT(HDR_EMPTY(hdr));
+
+ ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1);
+ ASSERT3S(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt), ==, 1);
+ ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node));
+
+ hdr->b_l1hdr.b_arc_access = 0;
+
+ /*
+ * If the buf is being overridden then it may already
+ * have a hdr that is not empty.
+ */
+ buf_discard_identity(hdr);
+ arc_buf_thaw(buf);
+
+ return;
+ }
+
+ kmutex_t *hash_lock = HDR_LOCK(hdr);
+ mutex_enter(hash_lock);
+
+ /*
+ * This assignment is only valid as long as the hash_lock is
+ * held, we must be careful not to reference state or the
+ * b_state field after dropping the lock.
+ */
+ arc_state_t *state = hdr->b_l1hdr.b_state;
+ ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
+ ASSERT3P(state, !=, arc_anon);
+
+ /* this buffer is not on any list */
+ ASSERT3S(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt), >, 0);
+
+ if (HDR_HAS_L2HDR(hdr)) {
+ mutex_enter(&hdr->b_l2hdr.b_dev->l2ad_mtx);
+
+ /*
+ * We have to recheck this conditional again now that
+ * we're holding the l2ad_mtx to prevent a race with
+ * another thread which might be concurrently calling
+ * l2arc_evict(). In that case, l2arc_evict() might have
+ * destroyed the header's L2 portion as we were waiting
+ * to acquire the l2ad_mtx.
+ */
+ if (HDR_HAS_L2HDR(hdr))
+ arc_hdr_l2hdr_destroy(hdr);
+
+ mutex_exit(&hdr->b_l2hdr.b_dev->l2ad_mtx);
+ }
+
+ /*
+ * Do we have more than one buf?
+ */
+ if (hdr->b_l1hdr.b_bufcnt > 1) {
+ arc_buf_hdr_t *nhdr;
+ uint64_t spa = hdr->b_spa;
+ uint64_t psize = HDR_GET_PSIZE(hdr);
+ uint64_t lsize = HDR_GET_LSIZE(hdr);
+ boolean_t protected = HDR_PROTECTED(hdr);
+ enum zio_compress compress = arc_hdr_get_compress(hdr);
+ arc_buf_contents_t type = arc_buf_type(hdr);
+ VERIFY3U(hdr->b_type, ==, type);
+
+ ASSERT(hdr->b_l1hdr.b_buf != buf || buf->b_next != NULL);
+ (void) remove_reference(hdr, hash_lock, tag);
+
+ if (arc_buf_is_shared(buf) && !ARC_BUF_COMPRESSED(buf)) {
+ ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf);
+ ASSERT(ARC_BUF_LAST(buf));
+ }
+
+ /*
+ * Pull the data off of this hdr and attach it to
+ * a new anonymous hdr. Also find the last buffer
+ * in the hdr's buffer list.
+ */
+ arc_buf_t *lastbuf = arc_buf_remove(hdr, buf);
+ ASSERT3P(lastbuf, !=, NULL);
+
+ /*
+ * If the current arc_buf_t and the hdr are sharing their data
+ * buffer, then we must stop sharing that block.
+ */
+ if (arc_buf_is_shared(buf)) {
+ ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf);
+ VERIFY(!arc_buf_is_shared(lastbuf));
+
+ /*
+ * First, sever the block sharing relationship between
+ * buf and the arc_buf_hdr_t.
+ */
+ arc_unshare_buf(hdr, buf);
+
+ /*
+ * Now we need to recreate the hdr's b_pabd. Since we
+ * have lastbuf handy, we try to share with it, but if
+ * we can't then we allocate a new b_pabd and copy the
+ * data from buf into it.
+ */
+ if (arc_can_share(hdr, lastbuf)) {
+ arc_share_buf(hdr, lastbuf);
+ } else {
+ arc_hdr_alloc_abd(hdr, ARC_HDR_DO_ADAPT);
+ abd_copy_from_buf(hdr->b_l1hdr.b_pabd,
+ buf->b_data, psize);
+ }
+ VERIFY3P(lastbuf->b_data, !=, NULL);
+ } else if (HDR_SHARED_DATA(hdr)) {
+ /*
+ * Uncompressed shared buffers are always at the end
+ * of the list. Compressed buffers don't have the
+ * same requirements. This makes it hard to
+ * simply assert that the lastbuf is shared so
+ * we rely on the hdr's compression flags to determine
+ * if we have a compressed, shared buffer.
+ */
+ ASSERT(arc_buf_is_shared(lastbuf) ||
+ arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF);
+ ASSERT(!ARC_BUF_SHARED(buf));
+ }
+
+ ASSERT(hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr));
+ ASSERT3P(state, !=, arc_l2c_only);
+
+ (void) zfs_refcount_remove_many(&state->arcs_size,
+ arc_buf_size(buf), buf);
+
+ if (zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) {
+ ASSERT3P(state, !=, arc_l2c_only);
+ (void) zfs_refcount_remove_many(
+ &state->arcs_esize[type],
+ arc_buf_size(buf), buf);
+ }
+
+ hdr->b_l1hdr.b_bufcnt -= 1;
+ if (ARC_BUF_ENCRYPTED(buf))
+ hdr->b_crypt_hdr.b_ebufcnt -= 1;
+
+ arc_cksum_verify(buf);
+ arc_buf_unwatch(buf);
+
+ /* if this is the last uncompressed buf free the checksum */
+ if (!arc_hdr_has_uncompressed_buf(hdr))
+ arc_cksum_free(hdr);
+
+ mutex_exit(hash_lock);
+
+ /*
+ * Allocate a new hdr. The new hdr will contain a b_pabd
+ * buffer which will be freed in arc_write().
+ */
+ nhdr = arc_hdr_alloc(spa, psize, lsize, protected,
+ compress, hdr->b_complevel, type, HDR_HAS_RABD(hdr));
+ ASSERT3P(nhdr->b_l1hdr.b_buf, ==, NULL);
+ ASSERT0(nhdr->b_l1hdr.b_bufcnt);
+ ASSERT0(zfs_refcount_count(&nhdr->b_l1hdr.b_refcnt));
+ VERIFY3U(nhdr->b_type, ==, type);
+ ASSERT(!HDR_SHARED_DATA(nhdr));
+
+ nhdr->b_l1hdr.b_buf = buf;
+ nhdr->b_l1hdr.b_bufcnt = 1;
+ if (ARC_BUF_ENCRYPTED(buf))
+ nhdr->b_crypt_hdr.b_ebufcnt = 1;
+ nhdr->b_l1hdr.b_mru_hits = 0;
+ nhdr->b_l1hdr.b_mru_ghost_hits = 0;
+ nhdr->b_l1hdr.b_mfu_hits = 0;
+ nhdr->b_l1hdr.b_mfu_ghost_hits = 0;
+ nhdr->b_l1hdr.b_l2_hits = 0;
+ (void) zfs_refcount_add(&nhdr->b_l1hdr.b_refcnt, tag);
+ buf->b_hdr = nhdr;
+
+ mutex_exit(&buf->b_evict_lock);
+ (void) zfs_refcount_add_many(&arc_anon->arcs_size,
+ arc_buf_size(buf), buf);
+ } else {
+ mutex_exit(&buf->b_evict_lock);
+ ASSERT(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt) == 1);
+ /* protected by hash lock, or hdr is on arc_anon */
+ ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
+ ASSERT(!HDR_IO_IN_PROGRESS(hdr));
+ hdr->b_l1hdr.b_mru_hits = 0;
+ hdr->b_l1hdr.b_mru_ghost_hits = 0;
+ hdr->b_l1hdr.b_mfu_hits = 0;
+ hdr->b_l1hdr.b_mfu_ghost_hits = 0;
+ hdr->b_l1hdr.b_l2_hits = 0;
+ arc_change_state(arc_anon, hdr, hash_lock);
+ hdr->b_l1hdr.b_arc_access = 0;
+
+ mutex_exit(hash_lock);
+ buf_discard_identity(hdr);
+ arc_buf_thaw(buf);
+ }
+}
+
+int
+arc_released(arc_buf_t *buf)
+{
+ int released;
+
+ mutex_enter(&buf->b_evict_lock);
+ released = (buf->b_data != NULL &&
+ buf->b_hdr->b_l1hdr.b_state == arc_anon);
+ mutex_exit(&buf->b_evict_lock);
+ return (released);
+}
+
+#ifdef ZFS_DEBUG
+int
+arc_referenced(arc_buf_t *buf)
+{
+ int referenced;
+
+ mutex_enter(&buf->b_evict_lock);
+ referenced = (zfs_refcount_count(&buf->b_hdr->b_l1hdr.b_refcnt));
+ mutex_exit(&buf->b_evict_lock);
+ return (referenced);
+}
+#endif
+
+static void
+arc_write_ready(zio_t *zio)
+{
+ arc_write_callback_t *callback = zio->io_private;
+ arc_buf_t *buf = callback->awcb_buf;
+ arc_buf_hdr_t *hdr = buf->b_hdr;
+ blkptr_t *bp = zio->io_bp;
+ uint64_t psize = BP_IS_HOLE(bp) ? 0 : BP_GET_PSIZE(bp);
+ fstrans_cookie_t cookie = spl_fstrans_mark();
+
+ ASSERT(HDR_HAS_L1HDR(hdr));
+ ASSERT(!zfs_refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt));
+ ASSERT(hdr->b_l1hdr.b_bufcnt > 0);
+
+ /*
+ * If we're reexecuting this zio because the pool suspended, then
+ * cleanup any state that was previously set the first time the
+ * callback was invoked.
+ */
+ if (zio->io_flags & ZIO_FLAG_REEXECUTED) {
+ arc_cksum_free(hdr);
+ arc_buf_unwatch(buf);
+ if (hdr->b_l1hdr.b_pabd != NULL) {
+ if (arc_buf_is_shared(buf)) {
+ arc_unshare_buf(hdr, buf);
+ } else {
+ arc_hdr_free_abd(hdr, B_FALSE);
+ }
+ }
+
+ if (HDR_HAS_RABD(hdr))
+ arc_hdr_free_abd(hdr, B_TRUE);
+ }
+ ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
+ ASSERT(!HDR_HAS_RABD(hdr));
+ ASSERT(!HDR_SHARED_DATA(hdr));
+ ASSERT(!arc_buf_is_shared(buf));
+
+ callback->awcb_ready(zio, buf, callback->awcb_private);
+
+ if (HDR_IO_IN_PROGRESS(hdr))
+ ASSERT(zio->io_flags & ZIO_FLAG_REEXECUTED);
+
+ arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
+
+ if (BP_IS_PROTECTED(bp) != !!HDR_PROTECTED(hdr))
+ hdr = arc_hdr_realloc_crypt(hdr, BP_IS_PROTECTED(bp));
+
+ if (BP_IS_PROTECTED(bp)) {
+ /* ZIL blocks are written through zio_rewrite */
+ ASSERT3U(BP_GET_TYPE(bp), !=, DMU_OT_INTENT_LOG);
+ ASSERT(HDR_PROTECTED(hdr));
+
+ if (BP_SHOULD_BYTESWAP(bp)) {
+ if (BP_GET_LEVEL(bp) > 0) {
+ hdr->b_l1hdr.b_byteswap = DMU_BSWAP_UINT64;
+ } else {
+ hdr->b_l1hdr.b_byteswap =
+ DMU_OT_BYTESWAP(BP_GET_TYPE(bp));
+ }
+ } else {
+ hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS;
+ }
+
+ hdr->b_crypt_hdr.b_ot = BP_GET_TYPE(bp);
+ hdr->b_crypt_hdr.b_dsobj = zio->io_bookmark.zb_objset;
+ zio_crypt_decode_params_bp(bp, hdr->b_crypt_hdr.b_salt,
+ hdr->b_crypt_hdr.b_iv);
+ zio_crypt_decode_mac_bp(bp, hdr->b_crypt_hdr.b_mac);
+ }
+
+ /*
+ * If this block was written for raw encryption but the zio layer
+ * ended up only authenticating it, adjust the buffer flags now.
+ */
+ if (BP_IS_AUTHENTICATED(bp) && ARC_BUF_ENCRYPTED(buf)) {
+ arc_hdr_set_flags(hdr, ARC_FLAG_NOAUTH);
+ buf->b_flags &= ~ARC_BUF_FLAG_ENCRYPTED;
+ if (BP_GET_COMPRESS(bp) == ZIO_COMPRESS_OFF)
+ buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED;
+ } else if (BP_IS_HOLE(bp) && ARC_BUF_ENCRYPTED(buf)) {
+ buf->b_flags &= ~ARC_BUF_FLAG_ENCRYPTED;
+ buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED;
+ }
+
+ /* this must be done after the buffer flags are adjusted */
+ arc_cksum_compute(buf);
+
+ enum zio_compress compress;
+ if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) {
+ compress = ZIO_COMPRESS_OFF;
+ } else {
+ ASSERT3U(HDR_GET_LSIZE(hdr), ==, BP_GET_LSIZE(bp));
+ compress = BP_GET_COMPRESS(bp);
+ }
+ HDR_SET_PSIZE(hdr, psize);
+ arc_hdr_set_compress(hdr, compress);
+ hdr->b_complevel = zio->io_prop.zp_complevel;
+
+ if (zio->io_error != 0 || psize == 0)
+ goto out;
+
+ /*
+ * Fill the hdr with data. If the buffer is encrypted we have no choice
+ * but to copy the data into b_rabd. If the hdr is compressed, the data
+ * we want is available from the zio, otherwise we can take it from
+ * the buf.
+ *
+ * We might be able to share the buf's data with the hdr here. However,
+ * doing so would cause the ARC to be full of linear ABDs if we write a
+ * lot of shareable data. As a compromise, we check whether scattered
+ * ABDs are allowed, and assume that if they are then the user wants
+ * the ARC to be primarily filled with them regardless of the data being
+ * written. Therefore, if they're allowed then we allocate one and copy
+ * the data into it; otherwise, we share the data directly if we can.
+ */
+ if (ARC_BUF_ENCRYPTED(buf)) {
+ ASSERT3U(psize, >, 0);
+ ASSERT(ARC_BUF_COMPRESSED(buf));
+ arc_hdr_alloc_abd(hdr, ARC_HDR_DO_ADAPT|ARC_HDR_ALLOC_RDATA);
+ abd_copy(hdr->b_crypt_hdr.b_rabd, zio->io_abd, psize);
+ } else if (zfs_abd_scatter_enabled || !arc_can_share(hdr, buf)) {
+ /*
+ * Ideally, we would always copy the io_abd into b_pabd, but the
+ * user may have disabled compressed ARC, thus we must check the
+ * hdr's compression setting rather than the io_bp's.
+ */
+ if (BP_IS_ENCRYPTED(bp)) {
+ ASSERT3U(psize, >, 0);
+ arc_hdr_alloc_abd(hdr,
+ ARC_HDR_DO_ADAPT|ARC_HDR_ALLOC_RDATA);
+ abd_copy(hdr->b_crypt_hdr.b_rabd, zio->io_abd, psize);
+ } else if (arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF &&
+ !ARC_BUF_COMPRESSED(buf)) {
+ ASSERT3U(psize, >, 0);
+ arc_hdr_alloc_abd(hdr, ARC_HDR_DO_ADAPT);
+ abd_copy(hdr->b_l1hdr.b_pabd, zio->io_abd, psize);
+ } else {
+ ASSERT3U(zio->io_orig_size, ==, arc_hdr_size(hdr));
+ arc_hdr_alloc_abd(hdr, ARC_HDR_DO_ADAPT);
+ abd_copy_from_buf(hdr->b_l1hdr.b_pabd, buf->b_data,
+ arc_buf_size(buf));
+ }
+ } else {
+ ASSERT3P(buf->b_data, ==, abd_to_buf(zio->io_orig_abd));
+ ASSERT3U(zio->io_orig_size, ==, arc_buf_size(buf));
+ ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1);
+
+ arc_share_buf(hdr, buf);
+ }
+
+out:
+ arc_hdr_verify(hdr, bp);
+ spl_fstrans_unmark(cookie);
+}
+
+static void
+arc_write_children_ready(zio_t *zio)
+{
+ arc_write_callback_t *callback = zio->io_private;
+ arc_buf_t *buf = callback->awcb_buf;
+
+ callback->awcb_children_ready(zio, buf, callback->awcb_private);
+}
+
+/*
+ * The SPA calls this callback for each physical write that happens on behalf
+ * of a logical write. See the comment in dbuf_write_physdone() for details.
+ */
+static void
+arc_write_physdone(zio_t *zio)
+{
+ arc_write_callback_t *cb = zio->io_private;
+ if (cb->awcb_physdone != NULL)
+ cb->awcb_physdone(zio, cb->awcb_buf, cb->awcb_private);
+}
+
+static void
+arc_write_done(zio_t *zio)
+{
+ arc_write_callback_t *callback = zio->io_private;
+ arc_buf_t *buf = callback->awcb_buf;
+ arc_buf_hdr_t *hdr = buf->b_hdr;
+
+ ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);
+
+ if (zio->io_error == 0) {
+ arc_hdr_verify(hdr, zio->io_bp);
+
+ if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) {
+ buf_discard_identity(hdr);
+ } else {
+ hdr->b_dva = *BP_IDENTITY(zio->io_bp);
+ hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp);
+ }
+ } else {
+ ASSERT(HDR_EMPTY(hdr));
+ }
+
+ /*
+ * If the block to be written was all-zero or compressed enough to be
+ * embedded in the BP, no write was performed so there will be no
+ * dva/birth/checksum. The buffer must therefore remain anonymous
+ * (and uncached).
+ */
+ if (!HDR_EMPTY(hdr)) {
+ arc_buf_hdr_t *exists;
+ kmutex_t *hash_lock;
+
+ ASSERT3U(zio->io_error, ==, 0);
+
+ arc_cksum_verify(buf);
+
+ exists = buf_hash_insert(hdr, &hash_lock);
+ if (exists != NULL) {
+ /*
+ * This can only happen if we overwrite for
+ * sync-to-convergence, because we remove
+ * buffers from the hash table when we arc_free().
+ */
+ if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
+ if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
+ panic("bad overwrite, hdr=%p exists=%p",
+ (void *)hdr, (void *)exists);
+ ASSERT(zfs_refcount_is_zero(
+ &exists->b_l1hdr.b_refcnt));
+ arc_change_state(arc_anon, exists, hash_lock);
+ arc_hdr_destroy(exists);
+ mutex_exit(hash_lock);
+ exists = buf_hash_insert(hdr, &hash_lock);
+ ASSERT3P(exists, ==, NULL);
+ } else if (zio->io_flags & ZIO_FLAG_NOPWRITE) {
+ /* nopwrite */
+ ASSERT(zio->io_prop.zp_nopwrite);
+ if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
+ panic("bad nopwrite, hdr=%p exists=%p",
+ (void *)hdr, (void *)exists);
+ } else {
+ /* Dedup */
+ ASSERT(hdr->b_l1hdr.b_bufcnt == 1);
+ ASSERT(hdr->b_l1hdr.b_state == arc_anon);
+ ASSERT(BP_GET_DEDUP(zio->io_bp));
+ ASSERT(BP_GET_LEVEL(zio->io_bp) == 0);
+ }
+ }
+ arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
+ /* if it's not anon, we are doing a scrub */
+ if (exists == NULL && hdr->b_l1hdr.b_state == arc_anon)
+ arc_access(hdr, hash_lock);
+ mutex_exit(hash_lock);
+ } else {
+ arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
+ }
+
+ ASSERT(!zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
+ callback->awcb_done(zio, buf, callback->awcb_private);
+
+ abd_free(zio->io_abd);
+ kmem_free(callback, sizeof (arc_write_callback_t));
+}
+
+zio_t *
+arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
+ blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc,
+ const zio_prop_t *zp, arc_write_done_func_t *ready,
+ arc_write_done_func_t *children_ready, arc_write_done_func_t *physdone,
+ arc_write_done_func_t *done, void *private, zio_priority_t priority,
+ int zio_flags, const zbookmark_phys_t *zb)
+{
+ arc_buf_hdr_t *hdr = buf->b_hdr;
+ arc_write_callback_t *callback;
+ zio_t *zio;
+ zio_prop_t localprop = *zp;
+
+ ASSERT3P(ready, !=, NULL);
+ ASSERT3P(done, !=, NULL);
+ ASSERT(!HDR_IO_ERROR(hdr));
+ ASSERT(!HDR_IO_IN_PROGRESS(hdr));
+ ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);
+ ASSERT3U(hdr->b_l1hdr.b_bufcnt, >, 0);
+ if (l2arc)
+ arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE);
+
+ if (ARC_BUF_ENCRYPTED(buf)) {
+ ASSERT(ARC_BUF_COMPRESSED(buf));
+ localprop.zp_encrypt = B_TRUE;
+ localprop.zp_compress = HDR_GET_COMPRESS(hdr);
+ localprop.zp_complevel = hdr->b_complevel;
+ localprop.zp_byteorder =
+ (hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS) ?
+ ZFS_HOST_BYTEORDER : !ZFS_HOST_BYTEORDER;
+ bcopy(hdr->b_crypt_hdr.b_salt, localprop.zp_salt,
+ ZIO_DATA_SALT_LEN);
+ bcopy(hdr->b_crypt_hdr.b_iv, localprop.zp_iv,
+ ZIO_DATA_IV_LEN);
+ bcopy(hdr->b_crypt_hdr.b_mac, localprop.zp_mac,
+ ZIO_DATA_MAC_LEN);
+ if (DMU_OT_IS_ENCRYPTED(localprop.zp_type)) {
+ localprop.zp_nopwrite = B_FALSE;
+ localprop.zp_copies =
+ MIN(localprop.zp_copies, SPA_DVAS_PER_BP - 1);
+ }
+ zio_flags |= ZIO_FLAG_RAW;
+ } else if (ARC_BUF_COMPRESSED(buf)) {
+ ASSERT3U(HDR_GET_LSIZE(hdr), !=, arc_buf_size(buf));
+ localprop.zp_compress = HDR_GET_COMPRESS(hdr);
+ localprop.zp_complevel = hdr->b_complevel;
+ zio_flags |= ZIO_FLAG_RAW_COMPRESS;
+ }
+ callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
+ callback->awcb_ready = ready;
+ callback->awcb_children_ready = children_ready;
+ callback->awcb_physdone = physdone;
+ callback->awcb_done = done;
+ callback->awcb_private = private;
+ callback->awcb_buf = buf;
+
+ /*
+ * The hdr's b_pabd is now stale, free it now. A new data block
+ * will be allocated when the zio pipeline calls arc_write_ready().
+ */
+ if (hdr->b_l1hdr.b_pabd != NULL) {
+ /*
+ * If the buf is currently sharing the data block with
+ * the hdr then we need to break that relationship here.
+ * The hdr will remain with a NULL data pointer and the
+ * buf will take sole ownership of the block.
+ */
+ if (arc_buf_is_shared(buf)) {
+ arc_unshare_buf(hdr, buf);
+ } else {
+ arc_hdr_free_abd(hdr, B_FALSE);
+ }
+ VERIFY3P(buf->b_data, !=, NULL);
+ }
+
+ if (HDR_HAS_RABD(hdr))
+ arc_hdr_free_abd(hdr, B_TRUE);
+
+ if (!(zio_flags & ZIO_FLAG_RAW))
+ arc_hdr_set_compress(hdr, ZIO_COMPRESS_OFF);
+
+ ASSERT(!arc_buf_is_shared(buf));
+ ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
+
+ zio = zio_write(pio, spa, txg, bp,
+ abd_get_from_buf(buf->b_data, HDR_GET_LSIZE(hdr)),
+ HDR_GET_LSIZE(hdr), arc_buf_size(buf), &localprop, arc_write_ready,
+ (children_ready != NULL) ? arc_write_children_ready : NULL,
+ arc_write_physdone, arc_write_done, callback,
+ priority, zio_flags, zb);
+
+ return (zio);
+}
+
+void
+arc_tempreserve_clear(uint64_t reserve)
+{
+ atomic_add_64(&arc_tempreserve, -reserve);
+ ASSERT((int64_t)arc_tempreserve >= 0);
+}
+
+int
+arc_tempreserve_space(spa_t *spa, uint64_t reserve, uint64_t txg)
+{
+ int error;
+ uint64_t anon_size;
+
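+	/*
+	 * If the ARC is allowed to grow and this reservation is large
+	 * relative to the current target size, opportunistically raise
+	 * arc_c toward four times the reservation (capped at arc_c_max)
+	 * so the throttle check below does not fail needlessly.
+	 */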
+ if (!arc_no_grow &&
+ reserve > arc_c/4 &&
+ reserve * 4 > (2ULL << SPA_MAXBLOCKSHIFT))
+ arc_c = MIN(arc_c_max, reserve * 4);
+
+ /*
+ * Throttle when the calculated memory footprint for the TXG
+ * exceeds the target ARC size.
+ */
+ if (reserve > arc_c) {
+ DMU_TX_STAT_BUMP(dmu_tx_memory_reserve);
+ return (SET_ERROR(ERESTART));
+ }
+
+ /*
+ * Don't count loaned bufs as in flight dirty data to prevent long
+ * network delays from blocking transactions that are ready to be
+ * assigned to a txg.
+ */
+
+ /* assert that it has not wrapped around */
+ ASSERT3S(atomic_add_64_nv(&arc_loaned_bytes, 0), >=, 0);
+
+ anon_size = MAX((int64_t)(zfs_refcount_count(&arc_anon->arcs_size) -
+ arc_loaned_bytes), 0);
+
+ /*
+ * Writes will, almost always, require additional memory allocations
+ * in order to compress/encrypt/etc the data. We therefore need to
+ * make sure that there is sufficient available memory for this.
+ */
+ error = arc_memory_throttle(spa, reserve, txg);
+ if (error != 0)
+ return (error);
+
+ /*
+ * Throttle writes when the amount of dirty data in the cache
+ * gets too large. We try to keep the cache less than half full
+ * of dirty blocks so that our sync times don't grow too large.
+ *
+ * In the case of one pool being built on another pool, we want
+ * to make sure we don't end up throttling the lower (backing)
+ * pool when the upper pool is the majority contributor to dirty
+	 * data. To ensure we make forward progress during throttling, we
+ * also check the current pool's net dirty data and only throttle
+ * if it exceeds zfs_arc_pool_dirty_percent of the anonymous dirty
+ * data in the cache.
+ *
+ * Note: if two requests come in concurrently, we might let them
+ * both succeed, when one of them should fail. Not a huge deal.
+ */
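+	/*
+	 * Illustrative example (tunable values here are hypothetical, not
+	 * defaults): with rarc_c = 4 GiB, a dirty limit of 50% and an anon
+	 * limit of 25%, we throttle only if total_dirty exceeds 2 GiB,
+	 * anon_size exceeds 1 GiB, and this pool's own dirty data exceeds
+	 * the configured percentage of anon_size.
+	 */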
+ uint64_t total_dirty = reserve + arc_tempreserve + anon_size;
+ uint64_t spa_dirty_anon = spa_dirty_data(spa);
+ uint64_t rarc_c = arc_warm ? arc_c : arc_c_max;
+ if (total_dirty > rarc_c * zfs_arc_dirty_limit_percent / 100 &&
+ anon_size > rarc_c * zfs_arc_anon_limit_percent / 100 &&
+ spa_dirty_anon > anon_size * zfs_arc_pool_dirty_percent / 100) {
+#ifdef ZFS_DEBUG
+ uint64_t meta_esize = zfs_refcount_count(
+ &arc_anon->arcs_esize[ARC_BUFC_METADATA]);
+ uint64_t data_esize =
+ zfs_refcount_count(&arc_anon->arcs_esize[ARC_BUFC_DATA]);
+ dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK "
+ "anon_data=%lluK tempreserve=%lluK rarc_c=%lluK\n",
+ arc_tempreserve >> 10, meta_esize >> 10,
+ data_esize >> 10, reserve >> 10, rarc_c >> 10);
+#endif
+ DMU_TX_STAT_BUMP(dmu_tx_dirty_throttle);
+ return (SET_ERROR(ERESTART));
+ }
+ atomic_add_64(&arc_tempreserve, reserve);
+ return (0);
+}
+
+static void
+arc_kstat_update_state(arc_state_t *state, kstat_named_t *size,
+ kstat_named_t *evict_data, kstat_named_t *evict_metadata)
+{
+ size->value.ui64 = zfs_refcount_count(&state->arcs_size);
+ evict_data->value.ui64 =
+ zfs_refcount_count(&state->arcs_esize[ARC_BUFC_DATA]);
+ evict_metadata->value.ui64 =
+ zfs_refcount_count(&state->arcs_esize[ARC_BUFC_METADATA]);
+}
+
+static int
+arc_kstat_update(kstat_t *ksp, int rw)
+{
+ arc_stats_t *as = ksp->ks_data;
+
+ if (rw == KSTAT_WRITE) {
+ return (SET_ERROR(EACCES));
+ } else {
+ arc_kstat_update_state(arc_anon,
+ &as->arcstat_anon_size,
+ &as->arcstat_anon_evictable_data,
+ &as->arcstat_anon_evictable_metadata);
+ arc_kstat_update_state(arc_mru,
+ &as->arcstat_mru_size,
+ &as->arcstat_mru_evictable_data,
+ &as->arcstat_mru_evictable_metadata);
+ arc_kstat_update_state(arc_mru_ghost,
+ &as->arcstat_mru_ghost_size,
+ &as->arcstat_mru_ghost_evictable_data,
+ &as->arcstat_mru_ghost_evictable_metadata);
+ arc_kstat_update_state(arc_mfu,
+ &as->arcstat_mfu_size,
+ &as->arcstat_mfu_evictable_data,
+ &as->arcstat_mfu_evictable_metadata);
+ arc_kstat_update_state(arc_mfu_ghost,
+ &as->arcstat_mfu_ghost_size,
+ &as->arcstat_mfu_ghost_evictable_data,
+ &as->arcstat_mfu_ghost_evictable_metadata);
+
+ ARCSTAT(arcstat_size) = aggsum_value(&arc_size);
+ ARCSTAT(arcstat_meta_used) = aggsum_value(&arc_meta_used);
+ ARCSTAT(arcstat_data_size) = aggsum_value(&astat_data_size);
+ ARCSTAT(arcstat_metadata_size) =
+ aggsum_value(&astat_metadata_size);
+ ARCSTAT(arcstat_hdr_size) = aggsum_value(&astat_hdr_size);
+ ARCSTAT(arcstat_l2_hdr_size) = aggsum_value(&astat_l2_hdr_size);
+ ARCSTAT(arcstat_dbuf_size) = aggsum_value(&astat_dbuf_size);
+#if defined(COMPAT_FREEBSD11)
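+		/*
+		 * Legacy FreeBSD 11 consumers expect a single "other_size"
+		 * statistic; approximate it as the sum of the bonus, dnode
+		 * and dbuf aggsums.
+		 */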
+ ARCSTAT(arcstat_other_size) = aggsum_value(&astat_bonus_size) +
+ aggsum_value(&astat_dnode_size) +
+ aggsum_value(&astat_dbuf_size);
+#endif
+ ARCSTAT(arcstat_dnode_size) = aggsum_value(&astat_dnode_size);
+ ARCSTAT(arcstat_bonus_size) = aggsum_value(&astat_bonus_size);
+ ARCSTAT(arcstat_abd_chunk_waste_size) =
+ aggsum_value(&astat_abd_chunk_waste_size);
+
+ as->arcstat_memory_all_bytes.value.ui64 =
+ arc_all_memory();
+ as->arcstat_memory_free_bytes.value.ui64 =
+ arc_free_memory();
+ as->arcstat_memory_available_bytes.value.i64 =
+ arc_available_memory();
+ }
+
+ return (0);
+}
+
+/*
+ * This function *must* return indices evenly distributed between all
+ * sublists of the multilist. This is needed due to how the ARC eviction
+ * code is laid out; arc_evict_state() assumes ARC buffers are evenly
+ * distributed between all sublists and uses this assumption when
+ * deciding which sublist to evict from and how much to evict from it.
+ */
+static unsigned int
+arc_state_multilist_index_func(multilist_t *ml, void *obj)
+{
+ arc_buf_hdr_t *hdr = obj;
+
+ /*
+ * We rely on b_dva to generate evenly distributed index
+ * numbers using buf_hash below. So, as an added precaution,
+ * let's make sure we never add empty buffers to the arc lists.
+ */
+ ASSERT(!HDR_EMPTY(hdr));
+
+ /*
+	 * The assumption here is that the hash value for a given
+ * arc_buf_hdr_t will remain constant throughout its lifetime
+ * (i.e. its b_spa, b_dva, and b_birth fields don't change).
+ * Thus, we don't need to store the header's sublist index
+ * on insertion, as this index can be recalculated on removal.
+ *
+ * Also, the low order bits of the hash value are thought to be
+ * distributed evenly. Otherwise, in the case that the multilist
+	 * has a power of two number of sublists, each sublist's usage
+ * would not be evenly distributed.
+ */
+ return (buf_hash(hdr->b_spa, &hdr->b_dva, hdr->b_birth) %
+ multilist_get_num_sublists(ml));
+}
+
+#define WARN_IF_TUNING_IGNORED(tuning, value, do_warn) do { \
+ if ((do_warn) && (tuning) && ((tuning) != (value))) { \
+ cmn_err(CE_WARN, \
+ "ignoring tunable %s (using %llu instead)", \
+ (#tuning), (value)); \
+ } \
+} while (0)
+
+/*
+ * Called during module initialization and periodically thereafter to
+ * apply reasonable changes to the exposed performance tunings. Can also be
+ * called explicitly by param_set_arc_*() functions when ARC tunables are
+ * updated manually. Non-zero zfs_* values which differ from the currently set
+ * values will be applied.
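+ *
+ * For example (illustrative): on Linux these values typically arrive as
+ * zfs module parameters such as zfs_arc_max, while on FreeBSD they are
+ * set through vfs.zfs.* sysctls; either path is applied here.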
+ */
+void
+arc_tuning_update(boolean_t verbose)
+{
+ uint64_t allmem = arc_all_memory();
+ unsigned long limit;
+
+ /* Valid range: 32M - <arc_c_max> */
+ if ((zfs_arc_min) && (zfs_arc_min != arc_c_min) &&
+ (zfs_arc_min >= 2ULL << SPA_MAXBLOCKSHIFT) &&
+ (zfs_arc_min <= arc_c_max)) {
+ arc_c_min = zfs_arc_min;
+ arc_c = MAX(arc_c, arc_c_min);
+ }
+ WARN_IF_TUNING_IGNORED(zfs_arc_min, arc_c_min, verbose);
+
+ /* Valid range: 64M - <all physical memory> */
+ if ((zfs_arc_max) && (zfs_arc_max != arc_c_max) &&
+ (zfs_arc_max >= 64 << 20) && (zfs_arc_max < allmem) &&
+ (zfs_arc_max > arc_c_min)) {
+ arc_c_max = zfs_arc_max;
+ arc_c = MIN(arc_c, arc_c_max);
+ arc_p = (arc_c >> 1);
+ if (arc_meta_limit > arc_c_max)
+ arc_meta_limit = arc_c_max;
+ if (arc_dnode_size_limit > arc_meta_limit)
+ arc_dnode_size_limit = arc_meta_limit;
+ }
+ WARN_IF_TUNING_IGNORED(zfs_arc_max, arc_c_max, verbose);
+
+ /* Valid range: 16M - <arc_c_max> */
+ if ((zfs_arc_meta_min) && (zfs_arc_meta_min != arc_meta_min) &&
+ (zfs_arc_meta_min >= 1ULL << SPA_MAXBLOCKSHIFT) &&
+ (zfs_arc_meta_min <= arc_c_max)) {
+ arc_meta_min = zfs_arc_meta_min;
+ if (arc_meta_limit < arc_meta_min)
+ arc_meta_limit = arc_meta_min;
+ if (arc_dnode_size_limit < arc_meta_min)
+ arc_dnode_size_limit = arc_meta_min;
+ }
+ WARN_IF_TUNING_IGNORED(zfs_arc_meta_min, arc_meta_min, verbose);
+
+ /* Valid range: <arc_meta_min> - <arc_c_max> */
+ limit = zfs_arc_meta_limit ? zfs_arc_meta_limit :
+ MIN(zfs_arc_meta_limit_percent, 100) * arc_c_max / 100;
+ if ((limit != arc_meta_limit) &&
+ (limit >= arc_meta_min) &&
+ (limit <= arc_c_max))
+ arc_meta_limit = limit;
+ WARN_IF_TUNING_IGNORED(zfs_arc_meta_limit, arc_meta_limit, verbose);
+
+ /* Valid range: <arc_meta_min> - <arc_meta_limit> */
+ limit = zfs_arc_dnode_limit ? zfs_arc_dnode_limit :
+ MIN(zfs_arc_dnode_limit_percent, 100) * arc_meta_limit / 100;
+ if ((limit != arc_dnode_size_limit) &&
+ (limit >= arc_meta_min) &&
+ (limit <= arc_meta_limit))
+ arc_dnode_size_limit = limit;
+ WARN_IF_TUNING_IGNORED(zfs_arc_dnode_limit, arc_dnode_size_limit,
+ verbose);
+
+ /* Valid range: 1 - N */
+ if (zfs_arc_grow_retry)
+ arc_grow_retry = zfs_arc_grow_retry;
+
+ /* Valid range: 1 - N */
+ if (zfs_arc_shrink_shift) {
+ arc_shrink_shift = zfs_arc_shrink_shift;
+ arc_no_grow_shift = MIN(arc_no_grow_shift, arc_shrink_shift -1);
+ }
+
+ /* Valid range: 1 - N */
+ if (zfs_arc_p_min_shift)
+ arc_p_min_shift = zfs_arc_p_min_shift;
+
+ /* Valid range: 1 - N ms */
+ if (zfs_arc_min_prefetch_ms)
+ arc_min_prefetch_ms = zfs_arc_min_prefetch_ms;
+
+ /* Valid range: 1 - N ms */
+ if (zfs_arc_min_prescient_prefetch_ms) {
+ arc_min_prescient_prefetch_ms =
+ zfs_arc_min_prescient_prefetch_ms;
+ }
+
+ /* Valid range: 0 - 100 */
+ if ((zfs_arc_lotsfree_percent >= 0) &&
+ (zfs_arc_lotsfree_percent <= 100))
+ arc_lotsfree_percent = zfs_arc_lotsfree_percent;
+ WARN_IF_TUNING_IGNORED(zfs_arc_lotsfree_percent, arc_lotsfree_percent,
+ verbose);
+
+ /* Valid range: 0 - <all physical memory> */
+ if ((zfs_arc_sys_free) && (zfs_arc_sys_free != arc_sys_free))
+ arc_sys_free = MIN(MAX(zfs_arc_sys_free, 0), allmem);
+ WARN_IF_TUNING_IGNORED(zfs_arc_sys_free, arc_sys_free, verbose);
+}
+
+static void
+arc_state_init(void)
+{
+ arc_anon = &ARC_anon;
+ arc_mru = &ARC_mru;
+ arc_mru_ghost = &ARC_mru_ghost;
+ arc_mfu = &ARC_mfu;
+ arc_mfu_ghost = &ARC_mfu_ghost;
+ arc_l2c_only = &ARC_l2c_only;
+
+ arc_mru->arcs_list[ARC_BUFC_METADATA] =
+ multilist_create(sizeof (arc_buf_hdr_t),
+ offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
+ arc_state_multilist_index_func);
+ arc_mru->arcs_list[ARC_BUFC_DATA] =
+ multilist_create(sizeof (arc_buf_hdr_t),
+ offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
+ arc_state_multilist_index_func);
+ arc_mru_ghost->arcs_list[ARC_BUFC_METADATA] =
+ multilist_create(sizeof (arc_buf_hdr_t),
+ offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
+ arc_state_multilist_index_func);
+ arc_mru_ghost->arcs_list[ARC_BUFC_DATA] =
+ multilist_create(sizeof (arc_buf_hdr_t),
+ offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
+ arc_state_multilist_index_func);
+ arc_mfu->arcs_list[ARC_BUFC_METADATA] =
+ multilist_create(sizeof (arc_buf_hdr_t),
+ offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
+ arc_state_multilist_index_func);
+ arc_mfu->arcs_list[ARC_BUFC_DATA] =
+ multilist_create(sizeof (arc_buf_hdr_t),
+ offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
+ arc_state_multilist_index_func);
+ arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA] =
+ multilist_create(sizeof (arc_buf_hdr_t),
+ offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
+ arc_state_multilist_index_func);
+ arc_mfu_ghost->arcs_list[ARC_BUFC_DATA] =
+ multilist_create(sizeof (arc_buf_hdr_t),
+ offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
+ arc_state_multilist_index_func);
+ arc_l2c_only->arcs_list[ARC_BUFC_METADATA] =
+ multilist_create(sizeof (arc_buf_hdr_t),
+ offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
+ arc_state_multilist_index_func);
+ arc_l2c_only->arcs_list[ARC_BUFC_DATA] =
+ multilist_create(sizeof (arc_buf_hdr_t),
+ offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
+ arc_state_multilist_index_func);
+
+ zfs_refcount_create(&arc_anon->arcs_esize[ARC_BUFC_METADATA]);
+ zfs_refcount_create(&arc_anon->arcs_esize[ARC_BUFC_DATA]);
+ zfs_refcount_create(&arc_mru->arcs_esize[ARC_BUFC_METADATA]);
+ zfs_refcount_create(&arc_mru->arcs_esize[ARC_BUFC_DATA]);
+ zfs_refcount_create(&arc_mru_ghost->arcs_esize[ARC_BUFC_METADATA]);
+ zfs_refcount_create(&arc_mru_ghost->arcs_esize[ARC_BUFC_DATA]);
+ zfs_refcount_create(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]);
+ zfs_refcount_create(&arc_mfu->arcs_esize[ARC_BUFC_DATA]);
+ zfs_refcount_create(&arc_mfu_ghost->arcs_esize[ARC_BUFC_METADATA]);
+ zfs_refcount_create(&arc_mfu_ghost->arcs_esize[ARC_BUFC_DATA]);
+ zfs_refcount_create(&arc_l2c_only->arcs_esize[ARC_BUFC_METADATA]);
+ zfs_refcount_create(&arc_l2c_only->arcs_esize[ARC_BUFC_DATA]);
+
+ zfs_refcount_create(&arc_anon->arcs_size);
+ zfs_refcount_create(&arc_mru->arcs_size);
+ zfs_refcount_create(&arc_mru_ghost->arcs_size);
+ zfs_refcount_create(&arc_mfu->arcs_size);
+ zfs_refcount_create(&arc_mfu_ghost->arcs_size);
+ zfs_refcount_create(&arc_l2c_only->arcs_size);
+
+ aggsum_init(&arc_meta_used, 0);
+ aggsum_init(&arc_size, 0);
+ aggsum_init(&astat_data_size, 0);
+ aggsum_init(&astat_metadata_size, 0);
+ aggsum_init(&astat_hdr_size, 0);
+ aggsum_init(&astat_l2_hdr_size, 0);
+ aggsum_init(&astat_bonus_size, 0);
+ aggsum_init(&astat_dnode_size, 0);
+ aggsum_init(&astat_dbuf_size, 0);
+ aggsum_init(&astat_abd_chunk_waste_size, 0);
+
+ arc_anon->arcs_state = ARC_STATE_ANON;
+ arc_mru->arcs_state = ARC_STATE_MRU;
+ arc_mru_ghost->arcs_state = ARC_STATE_MRU_GHOST;
+ arc_mfu->arcs_state = ARC_STATE_MFU;
+ arc_mfu_ghost->arcs_state = ARC_STATE_MFU_GHOST;
+ arc_l2c_only->arcs_state = ARC_STATE_L2C_ONLY;
+}
+
+static void
+arc_state_fini(void)
+{
+ zfs_refcount_destroy(&arc_anon->arcs_esize[ARC_BUFC_METADATA]);
+ zfs_refcount_destroy(&arc_anon->arcs_esize[ARC_BUFC_DATA]);
+ zfs_refcount_destroy(&arc_mru->arcs_esize[ARC_BUFC_METADATA]);
+ zfs_refcount_destroy(&arc_mru->arcs_esize[ARC_BUFC_DATA]);
+ zfs_refcount_destroy(&arc_mru_ghost->arcs_esize[ARC_BUFC_METADATA]);
+ zfs_refcount_destroy(&arc_mru_ghost->arcs_esize[ARC_BUFC_DATA]);
+ zfs_refcount_destroy(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]);
+ zfs_refcount_destroy(&arc_mfu->arcs_esize[ARC_BUFC_DATA]);
+ zfs_refcount_destroy(&arc_mfu_ghost->arcs_esize[ARC_BUFC_METADATA]);
+ zfs_refcount_destroy(&arc_mfu_ghost->arcs_esize[ARC_BUFC_DATA]);
+ zfs_refcount_destroy(&arc_l2c_only->arcs_esize[ARC_BUFC_METADATA]);
+ zfs_refcount_destroy(&arc_l2c_only->arcs_esize[ARC_BUFC_DATA]);
+
+ zfs_refcount_destroy(&arc_anon->arcs_size);
+ zfs_refcount_destroy(&arc_mru->arcs_size);
+ zfs_refcount_destroy(&arc_mru_ghost->arcs_size);
+ zfs_refcount_destroy(&arc_mfu->arcs_size);
+ zfs_refcount_destroy(&arc_mfu_ghost->arcs_size);
+ zfs_refcount_destroy(&arc_l2c_only->arcs_size);
+
+ multilist_destroy(arc_mru->arcs_list[ARC_BUFC_METADATA]);
+ multilist_destroy(arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]);
+ multilist_destroy(arc_mfu->arcs_list[ARC_BUFC_METADATA]);
+ multilist_destroy(arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]);
+ multilist_destroy(arc_mru->arcs_list[ARC_BUFC_DATA]);
+ multilist_destroy(arc_mru_ghost->arcs_list[ARC_BUFC_DATA]);
+ multilist_destroy(arc_mfu->arcs_list[ARC_BUFC_DATA]);
+ multilist_destroy(arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]);
+ multilist_destroy(arc_l2c_only->arcs_list[ARC_BUFC_METADATA]);
+ multilist_destroy(arc_l2c_only->arcs_list[ARC_BUFC_DATA]);
+
+ aggsum_fini(&arc_meta_used);
+ aggsum_fini(&arc_size);
+ aggsum_fini(&astat_data_size);
+ aggsum_fini(&astat_metadata_size);
+ aggsum_fini(&astat_hdr_size);
+ aggsum_fini(&astat_l2_hdr_size);
+ aggsum_fini(&astat_bonus_size);
+ aggsum_fini(&astat_dnode_size);
+ aggsum_fini(&astat_dbuf_size);
+ aggsum_fini(&astat_abd_chunk_waste_size);
+}
+
+uint64_t
+arc_target_bytes(void)
+{
+ return (arc_c);
+}
+
+void
+arc_set_limits(uint64_t allmem)
+{
+ /* Set min cache to 1/32 of all memory, or 32MB, whichever is more. */
+ arc_c_min = MAX(allmem / 32, 2ULL << SPA_MAXBLOCKSHIFT);
+
+ /* How to set default max varies by platform. */
+ arc_c_max = arc_default_max(arc_c_min, allmem);
+}
+
+void
+arc_init(void)
+{
+ uint64_t percent, allmem = arc_all_memory();
+ mutex_init(&arc_evict_lock, NULL, MUTEX_DEFAULT, NULL);
+ list_create(&arc_evict_waiters, sizeof (arc_evict_waiter_t),
+ offsetof(arc_evict_waiter_t, aew_node));
+
+ arc_min_prefetch_ms = 1000;
+ arc_min_prescient_prefetch_ms = 6000;
+
+#if defined(_KERNEL)
+ arc_lowmem_init();
+#endif
+
+ arc_set_limits(allmem);
+
+#ifndef _KERNEL
+ /*
+ * In userland, there's only the memory pressure that we artificially
+ * create (see arc_available_memory()). Don't let arc_c get too
+ * small, because it can cause transactions to be larger than
+ * arc_c, causing arc_tempreserve_space() to fail.
+ */
+ arc_c_min = MAX(arc_c_max / 2, 2ULL << SPA_MAXBLOCKSHIFT);
+#endif
+
+ arc_c = arc_c_min;
+ arc_p = (arc_c >> 1);
+
+ /* Set min to 1/2 of arc_c_min */
+ arc_meta_min = 1ULL << SPA_MAXBLOCKSHIFT;
+ /* Initialize maximum observed usage to zero */
+ arc_meta_max = 0;
+ /*
+ * Set arc_meta_limit to a percent of arc_c_max with a floor of
+ * arc_meta_min, and a ceiling of arc_c_max.
+ */
+ percent = MIN(zfs_arc_meta_limit_percent, 100);
+ arc_meta_limit = MAX(arc_meta_min, (percent * arc_c_max) / 100);
+ percent = MIN(zfs_arc_dnode_limit_percent, 100);
+ arc_dnode_size_limit = (percent * arc_meta_limit) / 100;
+
+ /* Apply user specified tunings */
+ arc_tuning_update(B_TRUE);
+
+	/* if kmem_flags are set, let's try to use less memory */
+ if (kmem_debugging())
+ arc_c = arc_c / 2;
+ if (arc_c < arc_c_min)
+ arc_c = arc_c_min;
+
+ arc_register_hotplug();
+
+ arc_state_init();
+
+ buf_init();
+
+ list_create(&arc_prune_list, sizeof (arc_prune_t),
+ offsetof(arc_prune_t, p_node));
+ mutex_init(&arc_prune_mtx, NULL, MUTEX_DEFAULT, NULL);
+
+ arc_prune_taskq = taskq_create("arc_prune", 100, defclsyspri,
+ boot_ncpus, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC |
+ TASKQ_THREADS_CPU_PCT);
+
+ arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
+ sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
+
+ if (arc_ksp != NULL) {
+ arc_ksp->ks_data = &arc_stats;
+ arc_ksp->ks_update = arc_kstat_update;
+ kstat_install(arc_ksp);
+ }
+
+ arc_evict_zthr = zthr_create("arc_evict",
+ arc_evict_cb_check, arc_evict_cb, NULL);
+ arc_reap_zthr = zthr_create_timer("arc_reap",
+ arc_reap_cb_check, arc_reap_cb, NULL, SEC2NSEC(1));
+
+ arc_warm = B_FALSE;
+
+ /*
+ * Calculate maximum amount of dirty data per pool.
+ *
+ * If it has been set by a module parameter, take that.
+ * Otherwise, use a percentage of physical memory defined by
+ * zfs_dirty_data_max_percent (default 10%) with a cap at
+ * zfs_dirty_data_max_max (default 4G or 25% of physical memory).
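+	 *
+	 * As an illustration, a 64 GiB system using the 10% default would
+	 * suggest roughly 6.4 GiB, which the 4 GiB cap (on 64-bit kernels)
+	 * then reduces to 4 GiB.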
+ */
+#ifdef __LP64__
+ if (zfs_dirty_data_max_max == 0)
+ zfs_dirty_data_max_max = MIN(4ULL * 1024 * 1024 * 1024,
+ allmem * zfs_dirty_data_max_max_percent / 100);
+#else
+ if (zfs_dirty_data_max_max == 0)
+ zfs_dirty_data_max_max = MIN(1ULL * 1024 * 1024 * 1024,
+ allmem * zfs_dirty_data_max_max_percent / 100);
+#endif
+
+ if (zfs_dirty_data_max == 0) {
+ zfs_dirty_data_max = allmem *
+ zfs_dirty_data_max_percent / 100;
+ zfs_dirty_data_max = MIN(zfs_dirty_data_max,
+ zfs_dirty_data_max_max);
+ }
+}
+
+void
+arc_fini(void)
+{
+ arc_prune_t *p;
+
+#ifdef _KERNEL
+ arc_lowmem_fini();
+#endif /* _KERNEL */
+
+ /* Use B_TRUE to ensure *all* buffers are evicted */
+ arc_flush(NULL, B_TRUE);
+
+ if (arc_ksp != NULL) {
+ kstat_delete(arc_ksp);
+ arc_ksp = NULL;
+ }
+
+ taskq_wait(arc_prune_taskq);
+ taskq_destroy(arc_prune_taskq);
+
+ mutex_enter(&arc_prune_mtx);
+ while ((p = list_head(&arc_prune_list)) != NULL) {
+ list_remove(&arc_prune_list, p);
+ zfs_refcount_remove(&p->p_refcnt, &arc_prune_list);
+ zfs_refcount_destroy(&p->p_refcnt);
+ kmem_free(p, sizeof (*p));
+ }
+ mutex_exit(&arc_prune_mtx);
+
+ list_destroy(&arc_prune_list);
+ mutex_destroy(&arc_prune_mtx);
+
+ (void) zthr_cancel(arc_evict_zthr);
+ (void) zthr_cancel(arc_reap_zthr);
+
+ mutex_destroy(&arc_evict_lock);
+ list_destroy(&arc_evict_waiters);
+
+ /*
+ * Free any buffers that were tagged for destruction. This needs
+ * to occur before arc_state_fini() runs and destroys the aggsum
+ * values which are updated when freeing scatter ABDs.
+ */
+ l2arc_do_free_on_write();
+
+ /*
+	 * buf_fini() must precede arc_state_fini() because buf_fini() may
+	 * trigger the release of kmem magazines, which can call back to
+	 * arc_space_return(), which accesses aggsums freed in
+	 * arc_state_fini().
+ */
+ buf_fini();
+ arc_state_fini();
+
+ arc_unregister_hotplug();
+
+ /*
+ * We destroy the zthrs after all the ARC state has been
+ * torn down to avoid the case of them receiving any
+ * wakeup() signals after they are destroyed.
+ */
+ zthr_destroy(arc_evict_zthr);
+ zthr_destroy(arc_reap_zthr);
+
+ ASSERT0(arc_loaned_bytes);
+}
+
+/*
+ * Level 2 ARC
+ *
+ * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk.
+ * It uses dedicated storage devices to hold cached data, which are populated
+ * using large infrequent writes. The main role of this cache is to boost
+ * the performance of random read workloads. The intended L2ARC devices
+ * include short-stroked disks, solid state disks, and other media with
+ * substantially faster read latency than disk.
+ *
+ * +-----------------------+
+ * | ARC |
+ * +-----------------------+
+ * | ^ ^
+ * | | |
+ * l2arc_feed_thread() arc_read()
+ * | | |
+ * | l2arc read |
+ * V | |
+ * +---------------+ |
+ * | L2ARC | |
+ * +---------------+ |
+ * | ^ |
+ * l2arc_write() | |
+ * | | |
+ * V | |
+ * +-------+ +-------+
+ * | vdev | | vdev |
+ * | cache | | cache |
+ * +-------+ +-------+
+ * +=========+ .-----.
+ * : L2ARC : |-_____-|
+ * : devices : | Disks |
+ * +=========+ `-_____-'
+ *
+ * Read requests are satisfied from the following sources, in order:
+ *
+ * 1) ARC
+ * 2) vdev cache of L2ARC devices
+ * 3) L2ARC devices
+ * 4) vdev cache of disks
+ * 5) disks
+ *
+ * Some L2ARC device types exhibit extremely slow write performance.
+ * To accommodate this, there are some significant differences between
+ * the L2ARC and traditional cache design:
+ *
+ * 1. There is no eviction path from the ARC to the L2ARC. Evictions from
+ * the ARC behave as usual, freeing buffers and placing headers on ghost
+ * lists. The ARC does not send buffers to the L2ARC during eviction as
+ * this would add inflated write latencies for all ARC memory pressure.
+ *
+ * 2. The L2ARC attempts to cache data from the ARC before it is evicted.
+ * It does this by periodically scanning buffers from the eviction-end of
+ * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are
+ * not already there. It scans until a headroom of buffers is satisfied,
+ * which itself is a buffer for ARC eviction. If a compressible buffer is
+ * found during scanning and selected for writing to an L2ARC device, we
+ * temporarily boost scanning headroom during the next scan cycle to make
+ * sure we adapt to compression effects (which might significantly reduce
+ * the data volume we write to L2ARC). The thread that does this is
+ * l2arc_feed_thread(), illustrated below; example sizes are included to
+ * provide a better sense of ratio than this diagram:
+ *
+ * head --> tail
+ * +---------------------+----------+
+ * ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->. # already on L2ARC
+ * +---------------------+----------+ | o L2ARC eligible
+ * ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->| : ARC buffer
+ * +---------------------+----------+ |
+ * 15.9 Gbytes ^ 32 Mbytes |
+ * headroom |
+ * l2arc_feed_thread()
+ * |
+ * l2arc write hand <--[oooo]--'
+ * | 8 Mbyte
+ * | write max
+ * V
+ * +==============================+
+ * L2ARC dev |####|#|###|###| |####| ... |
+ * +==============================+
+ * 32 Gbytes
+ *
+ * 3. If an ARC buffer is copied to the L2ARC but then hit instead of
+ * evicted, then the L2ARC has cached a buffer much sooner than it probably
+ * needed to, potentially wasting L2ARC device bandwidth and storage. It is
+ * safe to say that this is an uncommon case, since buffers at the end of
+ * the ARC lists have moved there due to inactivity.
+ *
+ * 4. If the ARC evicts faster than the L2ARC can maintain a headroom,
+ * then the L2ARC simply misses copying some buffers. This serves as a
+ * pressure valve to prevent heavy read workloads from both stalling the ARC
+ * with waits and clogging the L2ARC with writes. This also helps prevent
+ * the potential for the L2ARC to churn if it attempts to cache content too
+ * quickly, such as during backups of the entire pool.
+ *
+ * 5. After system boot and before the ARC has filled main memory, there are
+ * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru
+ * lists can remain mostly static. Instead of searching from the tail of these
+ * lists as pictured, the l2arc_feed_thread() will search from the list heads
+ * for eligible buffers, greatly increasing its chance of finding them.
+ *
+ * The L2ARC device write speed is also boosted during this time so that
+ * the L2ARC warms up faster. Since there have been no ARC evictions yet,
+ * there are no L2ARC reads, and no fear of degrading read performance
+ * through increased writes.
+ *
+ * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that
+ * the vdev queue can aggregate them into larger and fewer writes. Each
+ * device is written to in a rotor fashion, sweeping writes through
+ * available space then repeating.
+ *
+ * 7. The L2ARC does not store dirty content. It never needs to flush
+ * write buffers back to disk based storage.
+ *
+ * 8. If an ARC buffer is written (and dirtied) which also exists in the
+ * L2ARC, the now stale L2ARC buffer is immediately dropped.
+ *
+ * The performance of the L2ARC can be tweaked by a number of tunables, which
+ * may be necessary for different workloads:
+ *
+ * l2arc_write_max max write bytes per interval
+ * l2arc_write_boost extra write bytes during device warmup
+ * l2arc_noprefetch skip caching prefetched buffers
+ * l2arc_headroom number of max device writes to precache
+ * l2arc_headroom_boost when we find compressed buffers during ARC
+ * scanning, we multiply headroom by this
+ * percentage factor for the next scan cycle,
+ * since more compressed buffers are likely to
+ * be present
+ * l2arc_feed_secs seconds between L2ARC writing
+ *
+ * Tunables may be removed or added as future performance improvements are
+ * integrated, and also may become zpool properties.
+ *
+ * There are three key functions that control how the L2ARC warms up:
+ *
+ * l2arc_write_eligible() check if a buffer is eligible to cache
+ * l2arc_write_size() calculate how much to write
+ * l2arc_write_interval() calculate sleep delay between writes
+ *
+ * These three functions determine what to write, how much, and how quickly
+ * to send writes.
+ *
+ * L2ARC persistence:
+ *
+ * When writing buffers to L2ARC, we periodically add some metadata to
+ * make sure we can pick them up after reboot, thus dramatically reducing
+ * the impact that any downtime has on the performance of storage systems
+ * with large caches.
+ *
+ * The implementation works fairly simply by integrating the following two
+ * modifications:
+ *
+ * *) When writing to the L2ARC, we occasionally write a "l2arc log block",
+ * which is an additional piece of metadata which describes what's been
+ * written. This allows us to rebuild the arc_buf_hdr_t structures of the
+ * main ARC buffers. There are 2 linked-lists of log blocks headed by
+ * dh_start_lbps[2]. We alternate which chain we append to, so they are
+ * time-wise and offset-wise interleaved, but that is an optimization rather
+ * than for correctness. The log block also includes a pointer to the
+ * previous block in its chain.
+ *
+ * *) We reserve SPA_MINBLOCKSIZE of space at the start of each L2ARC device
+ * for our header bookkeeping purposes. This contains a device header,
+ * which contains our top-level reference structures. We update it each
+ * time we write a new log block, so that we're able to locate it in the
+ * L2ARC device. If this write results in an inconsistent device header
+ * (e.g. due to power failure), we detect this by verifying the header's
+ * checksum and simply fail to reconstruct the L2ARC after reboot.
+ *
+ * Implementation diagram:
+ *
+ * +=== L2ARC device (not to scale) ======================================+
+ * | ___two newest log block pointers__.__________ |
+ * | / \dh_start_lbps[1] |
+ * | / \ \dh_start_lbps[0]|
+ * |.___/__. V V |
+ * ||L2 dev|....|lb |bufs |lb |bufs |lb |bufs |lb |bufs |lb |---(empty)---|
+ * || hdr| ^ /^ /^ / / |
+ * |+------+ ...--\-------/ \-----/--\------/ / |
+ * | \--------------/ \--------------/ |
+ * +======================================================================+
+ *
+ * As can be seen in the diagram, rather than using a simple linked list,
+ * we use a pair of linked lists with alternating elements. This is a
+ * performance enhancement: we only learn the address of the next log
+ * block once the current block has been completely read in, so a single
+ * chain would keep the device's I/O queue only one operation deep and
+ * incur a large amount of I/O round-trip latency. Having two lists
+ * allows us to fetch two log blocks ahead of where we are currently
+ * rebuilding L2ARC buffers.
+ *
+ * On-device data structures:
+ *
+ * L2ARC device header: l2arc_dev_hdr_phys_t
+ * L2ARC log block: l2arc_log_blk_phys_t
+ *
+ * L2ARC reconstruction:
+ *
+ * When writing data, we simply write in the standard rotary fashion,
+ * evicting buffers as we go and simply writing new data over them (writing
+ * a new log block every now and then). This obviously means that once we
+ * loop around the end of the device, we will start cutting into an already
+ * committed log block (and its referenced data buffers), like so:
+ *
+ * current write head__ __old tail
+ * \ /
+ * V V
+ * <--|bufs |lb |bufs |lb | |bufs |lb |bufs |lb |-->
+ * ^ ^^^^^^^^^___________________________________
+ * | \
+ * <<nextwrite>> may overwrite this blk and/or its bufs --'
+ *
+ * When importing the pool, we detect this situation and use it to stop
+ * our scanning process (see l2arc_rebuild).
+ *
+ * There is one significant caveat to consider when rebuilding ARC contents
+ * from an L2ARC device: what about invalidated buffers? Given the above
+ * construction, we cannot update blocks which we've already written to amend
+ * them to remove buffers which were invalidated. Thus, during reconstruction,
+ * we might be populating the cache with buffers for data that's not on the
+ * main pool anymore, or may have been overwritten!
+ *
+ * As it turns out, this isn't a problem. Every arc_read request includes
+ * both the DVA and, crucially, the birth TXG of the BP the caller is
+ * looking for. So even if the cache were populated by completely rotten
+ * blocks for data that had been long deleted and/or overwritten, we'll
+ * never actually return bad data from the cache, since the DVA together
+ * with the birth TXG uniquely identifies a block in space and time - once
+ * created, a block is immutable on disk. The worst we will have done is
+ * waste some time and memory at l2arc rebuild reconstructing outdated ARC
+ * entries that will get dropped from the l2arc as it is being updated
+ * with new blocks.
+ *
+ * L2ARC buffers that have been evicted by l2arc_evict() ahead of the write
+ * hand are not restored. This is done by saving the offset (in bytes)
+ * l2arc_evict() has evicted to in the L2ARC device header and taking it
+ * into account when restoring buffers.
+ */
+
+static boolean_t
+l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *hdr)
+{
+ /*
+ * A buffer is *not* eligible for the L2ARC if it:
+ * 1. belongs to a different spa.
+ * 2. is already cached on the L2ARC.
+ * 3. has an I/O in progress (it may be an incomplete read).
+ * 4. is flagged not eligible (zfs property).
+ */
+ if (hdr->b_spa != spa_guid || HDR_HAS_L2HDR(hdr) ||
+ HDR_IO_IN_PROGRESS(hdr) || !HDR_L2CACHE(hdr))
+ return (B_FALSE);
+
+ return (B_TRUE);
+}
+
+static uint64_t
+l2arc_write_size(l2arc_dev_t *dev)
+{
+ uint64_t size, dev_size, tsize;
+
+ /*
+ * Make sure our globals have meaningful values in case the user
+ * altered them.
+ */
+ size = l2arc_write_max;
+ if (size == 0) {
+ cmn_err(CE_NOTE, "Bad value for l2arc_write_max, value must "
+ "be greater than zero, resetting it to the default (%d)",
+ L2ARC_WRITE_SIZE);
+ size = l2arc_write_max = L2ARC_WRITE_SIZE;
+ }
+
+ if (arc_warm == B_FALSE)
+ size += l2arc_write_boost;
+
+ /*
+ * Make sure the write size does not exceed the size of the cache
+	 * device. This is important in l2arc_evict(); otherwise infinite
+ * iteration can occur.
+ */
+ dev_size = dev->l2ad_end - dev->l2ad_start;
+ tsize = size + l2arc_log_blk_overhead(size, dev);
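+	/*
+	 * If the device supports TRIM and trim-ahead is enabled, also
+	 * reserve room for it: at least 64 MiB, or l2arc_trim_ahead
+	 * percent of the projected footprint, whichever is larger (e.g.
+	 * a hypothetical l2arc_trim_ahead of 100 would at least double
+	 * tsize).
+	 */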
+ if (dev->l2ad_vdev->vdev_has_trim && l2arc_trim_ahead > 0)
+ tsize += MAX(64 * 1024 * 1024,
+ (tsize * l2arc_trim_ahead) / 100);
+
+ if (tsize >= dev_size) {
+ cmn_err(CE_NOTE, "l2arc_write_max or l2arc_write_boost "
+ "plus the overhead of log blocks (persistent L2ARC, "
+ "%llu bytes) exceeds the size of the cache device "
+ "(guid %llu), resetting them to the default (%d)",
+ l2arc_log_blk_overhead(size, dev),
+ dev->l2ad_vdev->vdev_guid, L2ARC_WRITE_SIZE);
+ size = l2arc_write_max = l2arc_write_boost = L2ARC_WRITE_SIZE;
+
+ if (arc_warm == B_FALSE)
+ size += l2arc_write_boost;
+ }
+
+ return (size);
+}
+
+static clock_t
+l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote)
+{
+ clock_t interval, next, now;
+
+ /*
+ * If the ARC lists are busy, increase our write rate; if the
+ * lists are stale, idle back. This is achieved by checking
+ * how much we previously wrote - if it was more than half of
+ * what we wanted, schedule the next write much sooner.
+ */
+ if (l2arc_feed_again && wrote > (wanted / 2))
+ interval = (hz * l2arc_feed_min_ms) / 1000;
+ else
+ interval = hz * l2arc_feed_secs;
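+	/*
+	 * Illustrative example (hypothetical values): with hz = 1000,
+	 * l2arc_feed_min_ms = 200 and l2arc_feed_secs = 1, a productive
+	 * pass (wrote more than half of wanted) schedules the next write
+	 * 200 ticks out instead of 1000.
+	 */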
+
+ now = ddi_get_lbolt();
+ next = MAX(now, MIN(now + interval, began + interval));
+
+ return (next);
+}
+
+/*
+ * Cycle through L2ARC devices. This is how L2ARC load balances.
+ * If a device is returned, this also returns holding the spa config lock.
+ */
+static l2arc_dev_t *
+l2arc_dev_get_next(void)
+{
+ l2arc_dev_t *first, *next = NULL;
+
+ /*
+ * Lock out the removal of spas (spa_namespace_lock), then removal
+ * of cache devices (l2arc_dev_mtx). Once a device has been selected,
+ * both locks will be dropped and a spa config lock held instead.
+ */
+ mutex_enter(&spa_namespace_lock);
+ mutex_enter(&l2arc_dev_mtx);
+
+ /* if there are no vdevs, there is nothing to do */
+ if (l2arc_ndev == 0)
+ goto out;
+
+ first = NULL;
+ next = l2arc_dev_last;
+ do {
+ /* loop around the list looking for a non-faulted vdev */
+ if (next == NULL) {
+ next = list_head(l2arc_dev_list);
+ } else {
+ next = list_next(l2arc_dev_list, next);
+ if (next == NULL)
+ next = list_head(l2arc_dev_list);
+ }
+
+ /* if we have come back to the start, bail out */
+ if (first == NULL)
+ first = next;
+ else if (next == first)
+ break;
+
+ } while (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuild ||
+ next->l2ad_trim_all);
+
+ /* if we were unable to find any usable vdevs, return NULL */
+ if (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuild ||
+ next->l2ad_trim_all)
+ next = NULL;
+
+ l2arc_dev_last = next;
+
+out:
+ mutex_exit(&l2arc_dev_mtx);
+
+ /*
+ * Grab the config lock to prevent the 'next' device from being
+ * removed while we are writing to it.
+ */
+ if (next != NULL)
+ spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER);
+ mutex_exit(&spa_namespace_lock);
+
+ return (next);
+}
+
+/*
+ * Free buffers that were tagged for destruction.
+ */
+static void
+l2arc_do_free_on_write(void)
+{
+ list_t *buflist;
+ l2arc_data_free_t *df, *df_prev;
+
+ mutex_enter(&l2arc_free_on_write_mtx);
+ buflist = l2arc_free_on_write;
+
+ for (df = list_tail(buflist); df; df = df_prev) {
+ df_prev = list_prev(buflist, df);
+ ASSERT3P(df->l2df_abd, !=, NULL);
+ abd_free(df->l2df_abd);
+ list_remove(buflist, df);
+ kmem_free(df, sizeof (l2arc_data_free_t));
+ }
+
+ mutex_exit(&l2arc_free_on_write_mtx);
+}
+
+/*
+ * A write to a cache device has completed. Update all headers to allow
+ * reads from these buffers to begin.
+ */
+static void
+l2arc_write_done(zio_t *zio)
+{
+ l2arc_write_callback_t *cb;
+ l2arc_lb_abd_buf_t *abd_buf;
+ l2arc_lb_ptr_buf_t *lb_ptr_buf;
+ l2arc_dev_t *dev;
+ l2arc_dev_hdr_phys_t *l2dhdr;
+ list_t *buflist;
+ arc_buf_hdr_t *head, *hdr, *hdr_prev;
+ kmutex_t *hash_lock;
+ int64_t bytes_dropped = 0;
+
+ cb = zio->io_private;
+ ASSERT3P(cb, !=, NULL);
+ dev = cb->l2wcb_dev;
+ l2dhdr = dev->l2ad_dev_hdr;
+ ASSERT3P(dev, !=, NULL);
+ head = cb->l2wcb_head;
+ ASSERT3P(head, !=, NULL);
+ buflist = &dev->l2ad_buflist;
+ ASSERT3P(buflist, !=, NULL);
+ DTRACE_PROBE2(l2arc__iodone, zio_t *, zio,
+ l2arc_write_callback_t *, cb);
+
+ /*
+ * All writes completed, or an error was hit.
+ */
+top:
+ mutex_enter(&dev->l2ad_mtx);
+ for (hdr = list_prev(buflist, head); hdr; hdr = hdr_prev) {
+ hdr_prev = list_prev(buflist, hdr);
+
+ hash_lock = HDR_LOCK(hdr);
+
+ /*
+ * We cannot use mutex_enter or else we can deadlock
+ * with l2arc_write_buffers (due to swapping the order
+ * the hash lock and l2ad_mtx are taken).
+ */
+ if (!mutex_tryenter(hash_lock)) {
+ /*
+ * Missed the hash lock. We must retry so we
+ * don't leave the ARC_FLAG_L2_WRITING bit set.
+ */
+ ARCSTAT_BUMP(arcstat_l2_writes_lock_retry);
+
+ /*
+ * We don't want to rescan the headers we've
+ * already marked as having been written out, so
+ * we reinsert the head node so we can pick up
+ * where we left off.
+ */
+ list_remove(buflist, head);
+ list_insert_after(buflist, hdr, head);
+
+ mutex_exit(&dev->l2ad_mtx);
+
+ /*
+ * We wait for the hash lock to become available
+ * to try and prevent busy waiting, and increase
+ * the chance we'll be able to acquire the lock
+ * the next time around.
+ */
+ mutex_enter(hash_lock);
+ mutex_exit(hash_lock);
+ goto top;
+ }
+
+ /*
+ * We could not have been moved into the arc_l2c_only
+ * state while in-flight due to our ARC_FLAG_L2_WRITING
+ * bit being set. Let's just ensure that's being enforced.
+ */
+ ASSERT(HDR_HAS_L1HDR(hdr));
+
+ /*
+ * Skipped - drop L2ARC entry and mark the header as no
+		 * longer L2 eligible.
+ */
+ if (zio->io_error != 0) {
+ /*
+ * Error - drop L2ARC entry.
+ */
+ list_remove(buflist, hdr);
+ arc_hdr_clear_flags(hdr, ARC_FLAG_HAS_L2HDR);
+
+ uint64_t psize = HDR_GET_PSIZE(hdr);
+ l2arc_hdr_arcstats_decrement(hdr);
+
+ bytes_dropped +=
+ vdev_psize_to_asize(dev->l2ad_vdev, psize);
+ (void) zfs_refcount_remove_many(&dev->l2ad_alloc,
+ arc_hdr_size(hdr), hdr);
+ }
+
+ /*
+ * Allow ARC to begin reads and ghost list evictions to
+ * this L2ARC entry.
+ */
+ arc_hdr_clear_flags(hdr, ARC_FLAG_L2_WRITING);
+
+ mutex_exit(hash_lock);
+ }
+
+ /*
+ * Free the allocated abd buffers for writing the log blocks.
+	 * If the zio failed, reclaim the allocated space and remove the
+ * pointers to these log blocks from the log block pointer list
+ * of the L2ARC device.
+ */
+ while ((abd_buf = list_remove_tail(&cb->l2wcb_abd_list)) != NULL) {
+ abd_free(abd_buf->abd);
+ zio_buf_free(abd_buf, sizeof (*abd_buf));
+ if (zio->io_error != 0) {
+ lb_ptr_buf = list_remove_head(&dev->l2ad_lbptr_list);
+ /*
+ * L2BLK_GET_PSIZE returns aligned size for log
+ * blocks.
+ */
+ uint64_t asize =
+ L2BLK_GET_PSIZE((lb_ptr_buf->lb_ptr)->lbp_prop);
+ bytes_dropped += asize;
+ ARCSTAT_INCR(arcstat_l2_log_blk_asize, -asize);
+ ARCSTAT_BUMPDOWN(arcstat_l2_log_blk_count);
+ zfs_refcount_remove_many(&dev->l2ad_lb_asize, asize,
+ lb_ptr_buf);
+ zfs_refcount_remove(&dev->l2ad_lb_count, lb_ptr_buf);
+ kmem_free(lb_ptr_buf->lb_ptr,
+ sizeof (l2arc_log_blkptr_t));
+ kmem_free(lb_ptr_buf, sizeof (l2arc_lb_ptr_buf_t));
+ }
+ }
+ list_destroy(&cb->l2wcb_abd_list);
+
+ if (zio->io_error != 0) {
+ ARCSTAT_BUMP(arcstat_l2_writes_error);
+
+ /*
+ * Restore the lbps array in the header to its previous state.
+ * If the list of log block pointers is empty, zero out the
+ * log block pointers in the device header.
+ */
+ lb_ptr_buf = list_head(&dev->l2ad_lbptr_list);
+ for (int i = 0; i < 2; i++) {
+ if (lb_ptr_buf == NULL) {
+ /*
+ * If the list is empty zero out the device
+ * header. Otherwise zero out the second log
+ * block pointer in the header.
+ */
+ if (i == 0) {
+ bzero(l2dhdr, dev->l2ad_dev_hdr_asize);
+ } else {
+ bzero(&l2dhdr->dh_start_lbps[i],
+ sizeof (l2arc_log_blkptr_t));
+ }
+ break;
+ }
+ bcopy(lb_ptr_buf->lb_ptr, &l2dhdr->dh_start_lbps[i],
+ sizeof (l2arc_log_blkptr_t));
+ lb_ptr_buf = list_next(&dev->l2ad_lbptr_list,
+ lb_ptr_buf);
+ }
+ }
+
+ atomic_inc_64(&l2arc_writes_done);
+ list_remove(buflist, head);
+ ASSERT(!HDR_HAS_L1HDR(head));
+ kmem_cache_free(hdr_l2only_cache, head);
+ mutex_exit(&dev->l2ad_mtx);
+
+ ASSERT(dev->l2ad_vdev != NULL);
+ vdev_space_update(dev->l2ad_vdev, -bytes_dropped, 0, 0);
+
+ l2arc_do_free_on_write();
+
+ kmem_free(cb, sizeof (l2arc_write_callback_t));
+}
+
+static int
+l2arc_untransform(zio_t *zio, l2arc_read_callback_t *cb)
+{
+ int ret;
+ spa_t *spa = zio->io_spa;
+ arc_buf_hdr_t *hdr = cb->l2rcb_hdr;
+ blkptr_t *bp = zio->io_bp;
+ uint8_t salt[ZIO_DATA_SALT_LEN];
+ uint8_t iv[ZIO_DATA_IV_LEN];
+ uint8_t mac[ZIO_DATA_MAC_LEN];
+ boolean_t no_crypt = B_FALSE;
+
+ /*
+	 * ZIL data is never written to the L2ARC, so we don't need
+ * special handling for its unique MAC storage.
+ */
+ ASSERT3U(BP_GET_TYPE(bp), !=, DMU_OT_INTENT_LOG);
+ ASSERT(MUTEX_HELD(HDR_LOCK(hdr)));
+ ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
+
+ /*
+ * If the data was encrypted, decrypt it now. Note that
+ * we must check the bp here and not the hdr, since the
+ * hdr does not have its encryption parameters updated
+ * until arc_read_done().
+ */
+ if (BP_IS_ENCRYPTED(bp)) {
+ abd_t *eabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr,
+ B_TRUE);
+
+ zio_crypt_decode_params_bp(bp, salt, iv);
+ zio_crypt_decode_mac_bp(bp, mac);
+
+ ret = spa_do_crypt_abd(B_FALSE, spa, &cb->l2rcb_zb,
+ BP_GET_TYPE(bp), BP_GET_DEDUP(bp), BP_SHOULD_BYTESWAP(bp),
+ salt, iv, mac, HDR_GET_PSIZE(hdr), eabd,
+ hdr->b_l1hdr.b_pabd, &no_crypt);
+ if (ret != 0) {
+ arc_free_data_abd(hdr, eabd, arc_hdr_size(hdr), hdr);
+ goto error;
+ }
+
+ /*
+ * If we actually performed decryption, replace b_pabd
+ * with the decrypted data. Otherwise we can just throw
+ * our decryption buffer away.
+ */
+ if (!no_crypt) {
+ arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd,
+ arc_hdr_size(hdr), hdr);
+ hdr->b_l1hdr.b_pabd = eabd;
+ zio->io_abd = eabd;
+ } else {
+ arc_free_data_abd(hdr, eabd, arc_hdr_size(hdr), hdr);
+ }
+ }
+
+ /*
+ * If the L2ARC block was compressed, but ARC compression
+	 * is disabled, we decompress the data into a new buffer and
+ * replace the existing data.
+ */
+ if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF &&
+ !HDR_COMPRESSION_ENABLED(hdr)) {
+ abd_t *cabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr,
+ B_TRUE);
+ void *tmp = abd_borrow_buf(cabd, arc_hdr_size(hdr));
+
+ ret = zio_decompress_data(HDR_GET_COMPRESS(hdr),
+ hdr->b_l1hdr.b_pabd, tmp, HDR_GET_PSIZE(hdr),
+ HDR_GET_LSIZE(hdr), &hdr->b_complevel);
+ if (ret != 0) {
+ abd_return_buf_copy(cabd, tmp, arc_hdr_size(hdr));
+ arc_free_data_abd(hdr, cabd, arc_hdr_size(hdr), hdr);
+ goto error;
+ }
+
+ abd_return_buf_copy(cabd, tmp, arc_hdr_size(hdr));
+ arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd,
+ arc_hdr_size(hdr), hdr);
+ hdr->b_l1hdr.b_pabd = cabd;
+ zio->io_abd = cabd;
+ zio->io_size = HDR_GET_LSIZE(hdr);
+ }
+
+ return (0);
+
+error:
+ return (ret);
+}
+
+/*
+ * A read to a cache device completed. Validate buffer contents before
+ * handing over to the regular ARC routines.
+ */
+static void
+l2arc_read_done(zio_t *zio)
+{
+ int tfm_error = 0;
+ l2arc_read_callback_t *cb = zio->io_private;
+ arc_buf_hdr_t *hdr;
+ kmutex_t *hash_lock;
+ boolean_t valid_cksum;
+ boolean_t using_rdata = (BP_IS_ENCRYPTED(&cb->l2rcb_bp) &&
+ (cb->l2rcb_flags & ZIO_FLAG_RAW_ENCRYPT));
+
+ ASSERT3P(zio->io_vd, !=, NULL);
+ ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE);
+
+ spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd);
+
+ ASSERT3P(cb, !=, NULL);
+ hdr = cb->l2rcb_hdr;
+ ASSERT3P(hdr, !=, NULL);
+
+ hash_lock = HDR_LOCK(hdr);
+ mutex_enter(hash_lock);
+ ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
+
+ /*
+ * If the data was read into a temporary buffer,
+ * move it and free the buffer.
+ */
+ if (cb->l2rcb_abd != NULL) {
+ ASSERT3U(arc_hdr_size(hdr), <, zio->io_size);
+ if (zio->io_error == 0) {
+ if (using_rdata) {
+ abd_copy(hdr->b_crypt_hdr.b_rabd,
+ cb->l2rcb_abd, arc_hdr_size(hdr));
+ } else {
+ abd_copy(hdr->b_l1hdr.b_pabd,
+ cb->l2rcb_abd, arc_hdr_size(hdr));
+ }
+ }
+
+ /*
+ * The following must be done regardless of whether
+ * there was an error:
+ * - free the temporary buffer
+ * - point zio to the real ARC buffer
+ * - set zio size accordingly
+ * These are required because zio is either re-used for
+ * an I/O of the block in the case of the error
+ * or the zio is passed to arc_read_done() and it
+ * needs real data.
+ */
+ abd_free(cb->l2rcb_abd);
+ zio->io_size = zio->io_orig_size = arc_hdr_size(hdr);
+
+ if (using_rdata) {
+ ASSERT(HDR_HAS_RABD(hdr));
+ zio->io_abd = zio->io_orig_abd =
+ hdr->b_crypt_hdr.b_rabd;
+ } else {
+ ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
+ zio->io_abd = zio->io_orig_abd = hdr->b_l1hdr.b_pabd;
+ }
+ }
+
+ ASSERT3P(zio->io_abd, !=, NULL);
+
+ /*
+ * Check this survived the L2ARC journey.
+ */
+ ASSERT(zio->io_abd == hdr->b_l1hdr.b_pabd ||
+ (HDR_HAS_RABD(hdr) && zio->io_abd == hdr->b_crypt_hdr.b_rabd));
+ zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */
+ zio->io_bp = &zio->io_bp_copy; /* XXX fix in L2ARC 2.0 */
+ zio->io_prop.zp_complevel = hdr->b_complevel;
+
+ valid_cksum = arc_cksum_is_equal(hdr, zio);
+
+ /*
+ * b_rabd will always match the data as it exists on disk if it is
+ * being used. Therefore if we are reading into b_rabd we do not
+ * attempt to untransform the data.
+ */
+ if (valid_cksum && !using_rdata)
+ tfm_error = l2arc_untransform(zio, cb);
+
+ if (valid_cksum && tfm_error == 0 && zio->io_error == 0 &&
+ !HDR_L2_EVICTED(hdr)) {
+ mutex_exit(hash_lock);
+ zio->io_private = hdr;
+ arc_read_done(zio);
+ } else {
+ /*
+ * Buffer didn't survive caching. Increment stats and
+ * reissue to the original storage device.
+ */
+ if (zio->io_error != 0) {
+ ARCSTAT_BUMP(arcstat_l2_io_error);
+ } else {
+ zio->io_error = SET_ERROR(EIO);
+ }
+ if (!valid_cksum || tfm_error != 0)
+ ARCSTAT_BUMP(arcstat_l2_cksum_bad);
+
+ /*
+ * If there's no waiter, issue an async i/o to the primary
+ * storage now. If there *is* a waiter, the caller must
+ * issue the i/o in a context where it's OK to block.
+ */
+ if (zio->io_waiter == NULL) {
+ zio_t *pio = zio_unique_parent(zio);
+ void *abd = (using_rdata) ?
+ hdr->b_crypt_hdr.b_rabd : hdr->b_l1hdr.b_pabd;
+
+ ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL);
+
+ zio = zio_read(pio, zio->io_spa, zio->io_bp,
+ abd, zio->io_size, arc_read_done,
+ hdr, zio->io_priority, cb->l2rcb_flags,
+ &cb->l2rcb_zb);
+
+ /*
+ * Original ZIO will be freed, so we need to update
+ * ARC header with the new ZIO pointer to be used
+ * by zio_change_priority() in arc_read().
+ */
+ for (struct arc_callback *acb = hdr->b_l1hdr.b_acb;
+ acb != NULL; acb = acb->acb_next)
+ acb->acb_zio_head = zio;
+
+ mutex_exit(hash_lock);
+ zio_nowait(zio);
+ } else {
+ mutex_exit(hash_lock);
+ }
+ }
+
+ kmem_free(cb, sizeof (l2arc_read_callback_t));
+}
+
+/*
+ * This is the list priority from which the L2ARC will search for pages to
+ * cache. This is used within loops (0..3) to cycle through lists in the
+ * desired order. This order can have a significant effect on cache
+ * performance.
+ *
+ * Currently the metadata lists are hit first, MFU then MRU, followed by
+ * the data lists. This function returns a locked list, and also returns
+ * the lock pointer.
+ */
+static multilist_sublist_t *
+l2arc_sublist_lock(int list_num)
+{
+ multilist_t *ml = NULL;
+ unsigned int idx;
+
+ ASSERT(list_num >= 0 && list_num < L2ARC_FEED_TYPES);
+
+ switch (list_num) {
+ case 0:
+ ml = arc_mfu->arcs_list[ARC_BUFC_METADATA];
+ break;
+ case 1:
+ ml = arc_mru->arcs_list[ARC_BUFC_METADATA];
+ break;
+ case 2:
+ ml = arc_mfu->arcs_list[ARC_BUFC_DATA];
+ break;
+ case 3:
+ ml = arc_mru->arcs_list[ARC_BUFC_DATA];
+ break;
+ default:
+ return (NULL);
+ }
+
+ /*
+ * Return a randomly-selected sublist. This is acceptable
+ * because the caller feeds only a little bit of data for each
+ * call (8MB). Subsequent calls will result in different
+ * sublists being selected.
+ */
+ idx = multilist_get_random_index(ml);
+ return (multilist_sublist_lock(ml, idx));
+}
+
+/*
+ * Calculates the maximum overhead of L2ARC metadata log blocks for a given
+ * L2ARC write size. l2arc_evict and l2arc_write_size need to include this
+ * overhead in processing to make sure there is enough headroom available
+ * when writing buffers.
+ */
+static inline uint64_t
+l2arc_log_blk_overhead(uint64_t write_sz, l2arc_dev_t *dev)
+{
+ if (dev->l2ad_log_entries == 0) {
+ return (0);
+ } else {
+ uint64_t log_entries = write_sz >> SPA_MINBLOCKSHIFT;
+
+ uint64_t log_blocks = (log_entries +
+ dev->l2ad_log_entries - 1) /
+ dev->l2ad_log_entries;
+
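+		/*
+		 * Illustrative example (hypothetical entry count): with
+		 * 1022 entries per log block, an 8 MiB write (16384
+		 * 512-byte units) would round up to 17 log blocks of
+		 * overhead.
+		 */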
+ return (vdev_psize_to_asize(dev->l2ad_vdev,
+ sizeof (l2arc_log_blk_phys_t)) * log_blocks);
+ }
+}
+
+/*
+ * Evict buffers from the device write hand to the distance specified in
+ * bytes. This distance may span populated buffers, or it may span nothing.
+ * This is clearing a region on the L2ARC device ready for writing.
+ * If the 'all' boolean is set, every buffer is evicted.
+ */
+static void
+l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
+{
+ list_t *buflist;
+ arc_buf_hdr_t *hdr, *hdr_prev;
+ kmutex_t *hash_lock;
+ uint64_t taddr;
+ l2arc_lb_ptr_buf_t *lb_ptr_buf, *lb_ptr_buf_prev;
+ vdev_t *vd = dev->l2ad_vdev;
+ boolean_t rerun;
+
+ buflist = &dev->l2ad_buflist;
+
+ /*
+ * We need to add in the worst case scenario of log block overhead.
+ */
+ distance += l2arc_log_blk_overhead(distance, dev);
+ if (vd->vdev_has_trim && l2arc_trim_ahead > 0) {
+ /*
+ * Trim ahead of the write size 64MB or (l2arc_trim_ahead/100)
+ * times the write size, whichever is greater.
+ */
+ distance += MAX(64 * 1024 * 1024,
+ (distance * l2arc_trim_ahead) / 100);
+ }
+
+top:
+ rerun = B_FALSE;
+ if (dev->l2ad_hand >= (dev->l2ad_end - distance)) {
+ /*
+ * When there is no space to accommodate upcoming writes,
+ * evict to the end. Then bump the write and evict hands
+ * to the start and iterate. This iteration does not
+ * happen indefinitely as we make sure in
+ * l2arc_write_size() that when the write hand is reset,
+ * the write size does not exceed the end of the device.
+ */
+ rerun = B_TRUE;
+ taddr = dev->l2ad_end;
+ } else {
+ taddr = dev->l2ad_hand + distance;
+ }
+ DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist,
+ uint64_t, taddr, boolean_t, all);
+
+ if (!all) {
+ /*
+ * This check has to be placed after deciding whether to
+ * iterate (rerun).
+ */
+ if (dev->l2ad_first) {
+ /*
+ * This is the first sweep through the device. There is
+			 * nothing to evict. We have already trimmed the
+ * whole device.
+ */
+ goto out;
+ } else {
+ /*
+ * Trim the space to be evicted.
+ */
+ if (vd->vdev_has_trim && dev->l2ad_evict < taddr &&
+ l2arc_trim_ahead > 0) {
+ /*
+ * We have to drop the spa_config lock because
+				 * vdev_trim_simple() will acquire it.
+				 * l2ad_evict already accounts for the label
+				 * size. To prevent the trim from adding it
+				 * again, we subtract it from
+ * l2ad_evict.
+ */
+ spa_config_exit(dev->l2ad_spa, SCL_L2ARC, dev);
+ vdev_trim_simple(vd,
+ dev->l2ad_evict - VDEV_LABEL_START_SIZE,
+ taddr - dev->l2ad_evict);
+ spa_config_enter(dev->l2ad_spa, SCL_L2ARC, dev,
+ RW_READER);
+ }
+
+ /*
+ * When rebuilding L2ARC we retrieve the evict hand
+ * from the header of the device. Of note, l2arc_evict()
+ * does not actually delete buffers from the cache
+ * device, but trimming may do so depending on the
+ * hardware implementation. Thus keeping track of the
+ * evict hand is useful.
+ */
+ dev->l2ad_evict = MAX(dev->l2ad_evict, taddr);
+ }
+ }
+
+retry:
+ mutex_enter(&dev->l2ad_mtx);
+ /*
+ * We have to account for evicted log blocks. Run vdev_space_update()
+ * on log blocks whose offset (in bytes) is before the evicted offset
+ * (in bytes) by searching in the list of pointers to log blocks
+ * present in the L2ARC device.
+ */
+ for (lb_ptr_buf = list_tail(&dev->l2ad_lbptr_list); lb_ptr_buf;
+ lb_ptr_buf = lb_ptr_buf_prev) {
+
+ lb_ptr_buf_prev = list_prev(&dev->l2ad_lbptr_list, lb_ptr_buf);
+
+ /* L2BLK_GET_PSIZE returns aligned size for log blocks */
+ uint64_t asize = L2BLK_GET_PSIZE(
+ (lb_ptr_buf->lb_ptr)->lbp_prop);
+
+ /*
+		 * We don't worry about log blocks left behind (i.e.
+ * lbp_payload_start < l2ad_hand) because l2arc_write_buffers()
+ * will never write more than l2arc_evict() evicts.
+ */
+ if (!all && l2arc_log_blkptr_valid(dev, lb_ptr_buf->lb_ptr)) {
+ break;
+ } else {
+ vdev_space_update(vd, -asize, 0, 0);
+ ARCSTAT_INCR(arcstat_l2_log_blk_asize, -asize);
+ ARCSTAT_BUMPDOWN(arcstat_l2_log_blk_count);
+ zfs_refcount_remove_many(&dev->l2ad_lb_asize, asize,
+ lb_ptr_buf);
+ zfs_refcount_remove(&dev->l2ad_lb_count, lb_ptr_buf);
+ list_remove(&dev->l2ad_lbptr_list, lb_ptr_buf);
+ kmem_free(lb_ptr_buf->lb_ptr,
+ sizeof (l2arc_log_blkptr_t));
+ kmem_free(lb_ptr_buf, sizeof (l2arc_lb_ptr_buf_t));
+ }
+ }
+
+ for (hdr = list_tail(buflist); hdr; hdr = hdr_prev) {
+ hdr_prev = list_prev(buflist, hdr);
+
+ ASSERT(!HDR_EMPTY(hdr));
+ hash_lock = HDR_LOCK(hdr);
+
+ /*
+ * We cannot use mutex_enter or else we can deadlock
+ * with l2arc_write_buffers (due to swapping the order
+ * the hash lock and l2ad_mtx are taken).
+ */
+ if (!mutex_tryenter(hash_lock)) {
+ /*
+ * Missed the hash lock. Retry.
+ */
+ ARCSTAT_BUMP(arcstat_l2_evict_lock_retry);
+ mutex_exit(&dev->l2ad_mtx);
+ mutex_enter(hash_lock);
+ mutex_exit(hash_lock);
+ goto retry;
+ }
+
+ /*
+		 * A header can't be on this list if it doesn't have an
+		 * L2 header.
+ */
+ ASSERT(HDR_HAS_L2HDR(hdr));
+
+ /* Ensure this header has finished being written. */
+ ASSERT(!HDR_L2_WRITING(hdr));
+ ASSERT(!HDR_L2_WRITE_HEAD(hdr));
+
+ if (!all && (hdr->b_l2hdr.b_daddr >= dev->l2ad_evict ||
+ hdr->b_l2hdr.b_daddr < dev->l2ad_hand)) {
+ /*
+ * We've evicted to the target address,
+ * or the end of the device.
+ */
+ mutex_exit(hash_lock);
+ break;
+ }
+
+ if (!HDR_HAS_L1HDR(hdr)) {
+ ASSERT(!HDR_L2_READING(hdr));
+ /*
+ * This doesn't exist in the ARC. Destroy.
+ * arc_hdr_destroy() will call list_remove()
+ * and decrement arcstat_l2_lsize.
+ */
+ arc_change_state(arc_anon, hdr, hash_lock);
+ arc_hdr_destroy(hdr);
+ } else {
+ ASSERT(hdr->b_l1hdr.b_state != arc_l2c_only);
+ ARCSTAT_BUMP(arcstat_l2_evict_l1cached);
+ /*
+ * Invalidate issued or about to be issued
+ * reads, since we may be about to write
+ * over this location.
+ */
+ if (HDR_L2_READING(hdr)) {
+ ARCSTAT_BUMP(arcstat_l2_evict_reading);
+ arc_hdr_set_flags(hdr, ARC_FLAG_L2_EVICTED);
+ }
+
+ arc_hdr_l2hdr_destroy(hdr);
+ }
+ mutex_exit(hash_lock);
+ }
+ mutex_exit(&dev->l2ad_mtx);
+
+out:
+ /*
+	 * We need to check whether we are evicting all buffers; otherwise
+	 * we may iterate unnecessarily.
+ */
+ if (!all && rerun) {
+ /*
+ * Bump device hand to the device start if it is approaching the
+ * end. l2arc_evict() has already evicted ahead for this case.
+ */
+ dev->l2ad_hand = dev->l2ad_start;
+ dev->l2ad_evict = dev->l2ad_start;
+ dev->l2ad_first = B_FALSE;
+ goto top;
+ }
+
+ if (!all) {
+ /*
+ * In case of cache device removal (all) the following
+ * assertions may be violated without functional consequences
+ * as the device is about to be removed.
+ */
+ ASSERT3U(dev->l2ad_hand + distance, <, dev->l2ad_end);
+ if (!dev->l2ad_first)
+ ASSERT3U(dev->l2ad_hand, <, dev->l2ad_evict);
+ }
+}
+
+/*
+ * Handle any abd transforms that might be required for writing to the L2ARC.
+ * If successful, this function will always return an abd with the data
+ * transformed as it is on disk in a new abd of asize bytes.
+ */
+static int
+l2arc_apply_transforms(spa_t *spa, arc_buf_hdr_t *hdr, uint64_t asize,
+ abd_t **abd_out)
+{
+ int ret;
+ void *tmp = NULL;
+ abd_t *cabd = NULL, *eabd = NULL, *to_write = hdr->b_l1hdr.b_pabd;
+ enum zio_compress compress = HDR_GET_COMPRESS(hdr);
+ uint64_t psize = HDR_GET_PSIZE(hdr);
+ uint64_t size = arc_hdr_size(hdr);
+ boolean_t ismd = HDR_ISTYPE_METADATA(hdr);
+ boolean_t bswap = (hdr->b_l1hdr.b_byteswap != DMU_BSWAP_NUMFUNCS);
+ dsl_crypto_key_t *dck = NULL;
+ uint8_t mac[ZIO_DATA_MAC_LEN] = { 0 };
+ boolean_t no_crypt = B_FALSE;
+
+ ASSERT((HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF &&
+ !HDR_COMPRESSION_ENABLED(hdr)) ||
+ HDR_ENCRYPTED(hdr) || HDR_SHARED_DATA(hdr) || psize != asize);
+ ASSERT3U(psize, <=, asize);
+
+ /*
+	 * If this data simply needs its own buffer, we allocate it and copy
+	 * the data. This may be done to eliminate a dependency on a
+ * shared buffer or to reallocate the buffer to match asize.
+ */
+ if (HDR_HAS_RABD(hdr) && asize != psize) {
+ ASSERT3U(asize, >=, psize);
+ to_write = abd_alloc_for_io(asize, ismd);
+ abd_copy(to_write, hdr->b_crypt_hdr.b_rabd, psize);
+ if (psize != asize)
+ abd_zero_off(to_write, psize, asize - psize);
+ goto out;
+ }
+
+ if ((compress == ZIO_COMPRESS_OFF || HDR_COMPRESSION_ENABLED(hdr)) &&
+ !HDR_ENCRYPTED(hdr)) {
+ ASSERT3U(size, ==, psize);
+ to_write = abd_alloc_for_io(asize, ismd);
+ abd_copy(to_write, hdr->b_l1hdr.b_pabd, size);
+ if (size != asize)
+ abd_zero_off(to_write, size, asize - size);
+ goto out;
+ }
+
+ if (compress != ZIO_COMPRESS_OFF && !HDR_COMPRESSION_ENABLED(hdr)) {
+ cabd = abd_alloc_for_io(asize, ismd);
+ tmp = abd_borrow_buf(cabd, asize);
+
+ psize = zio_compress_data(compress, to_write, tmp, size,
+ hdr->b_complevel);
+
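+		/*
+		 * If compression did not shrink the buffer, store it
+		 * uncompressed instead and clear the compression flag on
+		 * the header.
+		 */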
+ if (psize >= size) {
+ abd_return_buf(cabd, tmp, asize);
+ HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_OFF);
+ to_write = cabd;
+ abd_copy(to_write, hdr->b_l1hdr.b_pabd, size);
+ if (size != asize)
+ abd_zero_off(to_write, size, asize - size);
+ goto encrypt;
+ }
+ ASSERT3U(psize, <=, HDR_GET_PSIZE(hdr));
+ if (psize < asize)
+ bzero((char *)tmp + psize, asize - psize);
+ psize = HDR_GET_PSIZE(hdr);
+ abd_return_buf_copy(cabd, tmp, asize);
+ to_write = cabd;
+ }
+
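+	/*
+	 * Re-encrypt the data with the key, salt and IV saved in the
+	 * header so that the copy written to the L2ARC matches the
+	 * encrypted data on the main pool.
+	 */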
+encrypt:
+ if (HDR_ENCRYPTED(hdr)) {
+ eabd = abd_alloc_for_io(asize, ismd);
+
+ /*
+ * If the dataset was disowned before the buffer
+ * made it to this point, the key to re-encrypt
+ * it won't be available. In this case we simply
+ * won't write the buffer to the L2ARC.
+ */
+ ret = spa_keystore_lookup_key(spa, hdr->b_crypt_hdr.b_dsobj,
+ FTAG, &dck);
+ if (ret != 0)
+ goto error;
+
+ ret = zio_do_crypt_abd(B_TRUE, &dck->dck_key,
+ hdr->b_crypt_hdr.b_ot, bswap, hdr->b_crypt_hdr.b_salt,
+ hdr->b_crypt_hdr.b_iv, mac, psize, to_write, eabd,
+ &no_crypt);
+ if (ret != 0)
+ goto error;
+
+ if (no_crypt)
+ abd_copy(eabd, to_write, psize);
+
+ if (psize != asize)
+ abd_zero_off(eabd, psize, asize - psize);
+
+ /* assert that the MAC we got here matches the one we saved */
+ ASSERT0(bcmp(mac, hdr->b_crypt_hdr.b_mac, ZIO_DATA_MAC_LEN));
+ spa_keystore_dsl_key_rele(spa, dck, FTAG);
+
+ if (to_write == cabd)
+ abd_free(cabd);
+
+ to_write = eabd;
+ }
+
+out:
+ ASSERT3P(to_write, !=, hdr->b_l1hdr.b_pabd);
+ *abd_out = to_write;
+ return (0);
+
+error:
+ if (dck != NULL)
+ spa_keystore_dsl_key_rele(spa, dck, FTAG);
+ if (cabd != NULL)
+ abd_free(cabd);
+ if (eabd != NULL)
+ abd_free(eabd);
+
+ *abd_out = NULL;
+ return (ret);
+}
+
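+/*
+ * Completion callback for the log block fetch zio issued by
+ * l2arc_log_blk_fetch(); frees the temporary abd and the read callback.
+ */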
+static void
+l2arc_blk_fetch_done(zio_t *zio)
+{
+ l2arc_read_callback_t *cb;
+
+ cb = zio->io_private;
+ if (cb->l2rcb_abd != NULL)
+ abd_free(cb->l2rcb_abd);
+ kmem_free(cb, sizeof (l2arc_read_callback_t));
+}
+
+/*
+ * Find and write ARC buffers to the L2ARC device.
+ *
+ * An ARC_FLAG_L2_WRITING flag is set so that the L2ARC buffers are not valid
+ * for reading until they have completed writing.
+ * The amount of ARC data searched on each pass is limited to a headroom
+ * scaled by l2arc_headroom and, when compressed ARC is enabled, by
+ * l2arc_headroom_boost.
+ *
+ * Returns the number of bytes actually written (which may be smaller than
+ * the delta by which the device hand has changed due to alignment and the
+ * writing of log blocks).
+ */
+static uint64_t
+l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
+{
+ arc_buf_hdr_t *hdr, *hdr_prev, *head;
+ uint64_t write_asize, write_psize, write_lsize, headroom;
+ boolean_t full;
+ l2arc_write_callback_t *cb = NULL;
+ zio_t *pio, *wzio;
+ uint64_t guid = spa_load_guid(spa);
+ l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr;
+
+ ASSERT3P(dev->l2ad_vdev, !=, NULL);
+
+ pio = NULL;
+ write_lsize = write_asize = write_psize = 0;
+ full = B_FALSE;
+ head = kmem_cache_alloc(hdr_l2only_cache, KM_PUSHPAGE);
+ arc_hdr_set_flags(head, ARC_FLAG_L2_WRITE_HEAD | ARC_FLAG_HAS_L2HDR);
+
+ /*
+ * Copy buffers for L2ARC writing.
+ */
+ for (int pass = 0; pass < L2ARC_FEED_TYPES; pass++) {
+		/*
+		 * Passes 0-3 iterate over the MFU metadata, MRU metadata,
+		 * MFU data and MRU data lists respectively (see
+		 * l2arc_sublist_lock()). When caching MFU content only,
+		 * skip the MRU passes (1 and 3).
+		 */
+ if (l2arc_mfuonly) {
+ if (pass == 1 || pass == 3)
+ continue;
+ }
+
+ multilist_sublist_t *mls = l2arc_sublist_lock(pass);
+ uint64_t passed_sz = 0;
+
+ VERIFY3P(mls, !=, NULL);
+
+ /*
+ * L2ARC fast warmup.
+ *
+ * Until the ARC is warm and starts to evict, read from the
+ * head of the ARC lists rather than the tail.
+ */
+ if (arc_warm == B_FALSE)
+ hdr = multilist_sublist_head(mls);
+ else
+ hdr = multilist_sublist_tail(mls);
+
+ headroom = target_sz * l2arc_headroom;
+ if (zfs_compressed_arc_enabled)
+ headroom = (headroom * l2arc_headroom_boost) / 100;
+
+ for (; hdr; hdr = hdr_prev) {
+ kmutex_t *hash_lock;
+ abd_t *to_write = NULL;
+
+ if (arc_warm == B_FALSE)
+ hdr_prev = multilist_sublist_next(mls, hdr);
+ else
+ hdr_prev = multilist_sublist_prev(mls, hdr);
+
+ hash_lock = HDR_LOCK(hdr);
+ if (!mutex_tryenter(hash_lock)) {
+ /*
+ * Skip this buffer rather than waiting.
+ */
+ continue;
+ }
+
+ passed_sz += HDR_GET_LSIZE(hdr);
+ if (l2arc_headroom != 0 && passed_sz > headroom) {
+ /*
+ * Searched too far.
+ */
+ mutex_exit(hash_lock);
+ break;
+ }
+
+ if (!l2arc_write_eligible(guid, hdr)) {
+ mutex_exit(hash_lock);
+ continue;
+ }
+
+ /*
+ * We rely on the L1 portion of the header below, so
+ * it's invalid for this header to have been evicted out
+ * of the ghost cache, prior to being written out. The
+ * ARC_FLAG_L2_WRITING bit ensures this won't happen.
+ */
+ ASSERT(HDR_HAS_L1HDR(hdr));
+
+ ASSERT3U(HDR_GET_PSIZE(hdr), >, 0);
+ ASSERT3U(arc_hdr_size(hdr), >, 0);
+ ASSERT(hdr->b_l1hdr.b_pabd != NULL ||
+ HDR_HAS_RABD(hdr));
+ uint64_t psize = HDR_GET_PSIZE(hdr);
+ uint64_t asize = vdev_psize_to_asize(dev->l2ad_vdev,
+ psize);
+
+ if ((write_asize + asize) > target_sz) {
+ full = B_TRUE;
+ mutex_exit(hash_lock);
+ break;
+ }
+
+ /*
+ * We rely on the L1 portion of the header below, so
+ * it's invalid for this header to have been evicted out
+ * of the ghost cache, prior to being written out. The
+ * ARC_FLAG_L2_WRITING bit ensures this won't happen.
+ */
+ arc_hdr_set_flags(hdr, ARC_FLAG_L2_WRITING);
+ ASSERT(HDR_HAS_L1HDR(hdr));
+
+ ASSERT3U(HDR_GET_PSIZE(hdr), >, 0);
+ ASSERT(hdr->b_l1hdr.b_pabd != NULL ||
+ HDR_HAS_RABD(hdr));
+ ASSERT3U(arc_hdr_size(hdr), >, 0);
+
+ /*
+ * If this header has b_rabd, we can use this since it
+ * must always match the data exactly as it exists on
+ * disk. Otherwise, the L2ARC can normally use the
+ * hdr's data, but if we're sharing data between the
+ * hdr and one of its bufs, L2ARC needs its own copy of
+ * the data so that the ZIO below can't race with the
+ * buf consumer. To ensure that this copy will be
+ * available for the lifetime of the ZIO and be cleaned
+ * up afterwards, we add it to the l2arc_free_on_write
+ * queue. If we need to apply any transforms to the
+ * data (compression, encryption) we will also need the
+ * extra buffer.
+ */
+ if (HDR_HAS_RABD(hdr) && psize == asize) {
+ to_write = hdr->b_crypt_hdr.b_rabd;
+ } else if ((HDR_COMPRESSION_ENABLED(hdr) ||
+ HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_OFF) &&
+ !HDR_ENCRYPTED(hdr) && !HDR_SHARED_DATA(hdr) &&
+ psize == asize) {
+ to_write = hdr->b_l1hdr.b_pabd;
+ } else {
+ int ret;
+ arc_buf_contents_t type = arc_buf_type(hdr);
+
+ ret = l2arc_apply_transforms(spa, hdr, asize,
+ &to_write);
+ if (ret != 0) {
+ arc_hdr_clear_flags(hdr,
+ ARC_FLAG_L2_WRITING);
+ mutex_exit(hash_lock);
+ continue;
+ }
+
+ l2arc_free_abd_on_write(to_write, asize, type);
+ }
+
+ if (pio == NULL) {
+ /*
+ * Insert a dummy header on the buflist so
+ * l2arc_write_done() can find where the
+ * write buffers begin without searching.
+ */
+ mutex_enter(&dev->l2ad_mtx);
+ list_insert_head(&dev->l2ad_buflist, head);
+ mutex_exit(&dev->l2ad_mtx);
+
+ cb = kmem_alloc(
+ sizeof (l2arc_write_callback_t), KM_SLEEP);
+ cb->l2wcb_dev = dev;
+ cb->l2wcb_head = head;
+ /*
+ * Create a list to save allocated abd buffers
+ * for l2arc_log_blk_commit().
+ */
+ list_create(&cb->l2wcb_abd_list,
+ sizeof (l2arc_lb_abd_buf_t),
+ offsetof(l2arc_lb_abd_buf_t, node));
+ pio = zio_root(spa, l2arc_write_done, cb,
+ ZIO_FLAG_CANFAIL);
+ }
+
+ hdr->b_l2hdr.b_dev = dev;
+ hdr->b_l2hdr.b_hits = 0;
+
+ hdr->b_l2hdr.b_daddr = dev->l2ad_hand;
+ hdr->b_l2hdr.b_arcs_state =
+ hdr->b_l1hdr.b_state->arcs_state;
+ arc_hdr_set_flags(hdr, ARC_FLAG_HAS_L2HDR);
+
+ mutex_enter(&dev->l2ad_mtx);
+ list_insert_head(&dev->l2ad_buflist, hdr);
+ mutex_exit(&dev->l2ad_mtx);
+
+ (void) zfs_refcount_add_many(&dev->l2ad_alloc,
+ arc_hdr_size(hdr), hdr);
+
+ wzio = zio_write_phys(pio, dev->l2ad_vdev,
+ hdr->b_l2hdr.b_daddr, asize, to_write,
+ ZIO_CHECKSUM_OFF, NULL, hdr,
+ ZIO_PRIORITY_ASYNC_WRITE,
+ ZIO_FLAG_CANFAIL, B_FALSE);
+
+ write_lsize += HDR_GET_LSIZE(hdr);
+ DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
+ zio_t *, wzio);
+
+ write_psize += psize;
+ write_asize += asize;
+ dev->l2ad_hand += asize;
+ l2arc_hdr_arcstats_increment(hdr);
+ vdev_space_update(dev->l2ad_vdev, asize, 0, 0);
+
+ mutex_exit(hash_lock);
+
+ /*
+ * Append buf info to current log and commit if full.
+ * arcstat_l2_{size,asize} kstats are updated
+ * internally.
+ */
+ if (l2arc_log_blk_insert(dev, hdr))
+ l2arc_log_blk_commit(dev, pio, cb);
+
+ zio_nowait(wzio);
+ }
+
+ multilist_sublist_unlock(mls);
+
+ if (full == B_TRUE)
+ break;
+ }
+
+ /* No buffers selected for writing? */
+ if (pio == NULL) {
+ ASSERT0(write_lsize);
+ ASSERT(!HDR_HAS_L1HDR(head));
+ kmem_cache_free(hdr_l2only_cache, head);
+
+ /*
+		 * Although we did not write any buffers, l2ad_evict may
+ * have advanced.
+ */
+ if (dev->l2ad_evict != l2dhdr->dh_evict)
+ l2arc_dev_hdr_update(dev);
+
+ return (0);
+ }
+
+ if (!dev->l2ad_first)
+ ASSERT3U(dev->l2ad_hand, <=, dev->l2ad_evict);
+
+ ASSERT3U(write_asize, <=, target_sz);
+ ARCSTAT_BUMP(arcstat_l2_writes_sent);
+ ARCSTAT_INCR(arcstat_l2_write_bytes, write_psize);
+
+ dev->l2ad_writing = B_TRUE;
+ (void) zio_wait(pio);
+ dev->l2ad_writing = B_FALSE;
+
+ /*
+ * Update the device header after the zio completes as
+ * l2arc_write_done() may have updated the memory holding the log block
+ * pointers in the device header.
+ */
+ l2arc_dev_hdr_update(dev);
+
+ return (write_asize);
+}
+
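+/*
+ * Returns B_TRUE when the memory consumed by L2ARC-only headers grows large
+ * enough (relative to arc_meta_limit and l2arc_meta_percent) that feeding or
+ * rebuilding the L2ARC should back off to avoid adding to memory pressure.
+ */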
+static boolean_t
+l2arc_hdr_limit_reached(void)
+{
+ int64_t s = aggsum_upper_bound(&astat_l2_hdr_size);
+
+ return (arc_reclaim_needed() || (s > arc_meta_limit * 3 / 4) ||
+ (s > (arc_warm ? arc_c : arc_c_max) * l2arc_meta_percent / 100));
+}
+
+/*
+ * This thread feeds the L2ARC at regular intervals. This is the beating
+ * heart of the L2ARC.
+ */
+/* ARGSUSED */
+static void
+l2arc_feed_thread(void *unused)
+{
+ callb_cpr_t cpr;
+ l2arc_dev_t *dev;
+ spa_t *spa;
+ uint64_t size, wrote;
+ clock_t begin, next = ddi_get_lbolt();
+ fstrans_cookie_t cookie;
+
+ CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG);
+
+ mutex_enter(&l2arc_feed_thr_lock);
+
+ cookie = spl_fstrans_mark();
+ while (l2arc_thread_exit == 0) {
+ CALLB_CPR_SAFE_BEGIN(&cpr);
+ (void) cv_timedwait_idle(&l2arc_feed_thr_cv,
+ &l2arc_feed_thr_lock, next);
+ CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock);
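+		/*
+		 * By default wake up again in one second; this is
+		 * overridden below, e.g. by l2arc_write_interval() once a
+		 * feed cycle completes.
+		 */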
+ next = ddi_get_lbolt() + hz;
+
+ /*
+ * Quick check for L2ARC devices.
+ */
+ mutex_enter(&l2arc_dev_mtx);
+ if (l2arc_ndev == 0) {
+ mutex_exit(&l2arc_dev_mtx);
+ continue;
+ }
+ mutex_exit(&l2arc_dev_mtx);
+ begin = ddi_get_lbolt();
+
+ /*
+ * This selects the next l2arc device to write to, and in
+ * doing so the next spa to feed from: dev->l2ad_spa. This
+ * will return NULL if there are now no l2arc devices or if
+ * they are all faulted.
+ *
+ * If a device is returned, its spa's config lock is also
+ * held to prevent device removal. l2arc_dev_get_next()
+ * will grab and release l2arc_dev_mtx.
+ */
+ if ((dev = l2arc_dev_get_next()) == NULL)
+ continue;
+
+ spa = dev->l2ad_spa;
+ ASSERT3P(spa, !=, NULL);
+
+ /*
+ * If the pool is read-only then force the feed thread to
+ * sleep a little longer.
+ */
+ if (!spa_writeable(spa)) {
+ next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz;
+ spa_config_exit(spa, SCL_L2ARC, dev);
+ continue;
+ }
+
+ /*
+ * Avoid contributing to memory pressure.
+ */
+ if (l2arc_hdr_limit_reached()) {
+ ARCSTAT_BUMP(arcstat_l2_abort_lowmem);
+ spa_config_exit(spa, SCL_L2ARC, dev);
+ continue;
+ }
+
+ ARCSTAT_BUMP(arcstat_l2_feeds);
+
+ size = l2arc_write_size(dev);
+
+ /*
+ * Evict L2ARC buffers that will be overwritten.
+ */
+ l2arc_evict(dev, size, B_FALSE);
+
+ /*
+ * Write ARC buffers.
+ */
+ wrote = l2arc_write_buffers(spa, dev, size);
+
+ /*
+ * Calculate interval between writes.
+ */
+ next = l2arc_write_interval(begin, size, wrote);
+ spa_config_exit(spa, SCL_L2ARC, dev);
+ }
+ spl_fstrans_unmark(cookie);
+
+ l2arc_thread_exit = 0;
+ cv_broadcast(&l2arc_feed_thr_cv);
+ CALLB_CPR_EXIT(&cpr); /* drops l2arc_feed_thr_lock */
+ thread_exit();
+}
+
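+/*
+ * Returns B_TRUE if the given vdev is currently registered as an L2ARC
+ * device.
+ */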
+boolean_t
+l2arc_vdev_present(vdev_t *vd)
+{
+ return (l2arc_vdev_get(vd) != NULL);
+}
+
+/*
+ * Returns the l2arc_dev_t associated with a particular vdev_t or NULL if
+ * the vdev_t isn't an L2ARC device.
+ */
+l2arc_dev_t *
+l2arc_vdev_get(vdev_t *vd)
+{
+ l2arc_dev_t *dev;
+
+ mutex_enter(&l2arc_dev_mtx);
+ for (dev = list_head(l2arc_dev_list); dev != NULL;
+ dev = list_next(l2arc_dev_list, dev)) {
+ if (dev->l2ad_vdev == vd)
+ break;
+ }
+ mutex_exit(&l2arc_dev_mtx);
+
+ return (dev);
+}
+
+/*
+ * Add a vdev for use by the L2ARC. By this point the spa has already
+ * validated the vdev and opened it.
+ */
+void
+l2arc_add_vdev(spa_t *spa, vdev_t *vd)
+{
+ l2arc_dev_t *adddev;
+ uint64_t l2dhdr_asize;
+
+ ASSERT(!l2arc_vdev_present(vd));
+
+ /*
+ * Create a new l2arc device entry.
+ */
+ adddev = vmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP);
+ adddev->l2ad_spa = spa;
+ adddev->l2ad_vdev = vd;
+ /* leave extra size for an l2arc device header */
+ l2dhdr_asize = adddev->l2ad_dev_hdr_asize =
+ MAX(sizeof (*adddev->l2ad_dev_hdr), 1 << vd->vdev_ashift);
+ adddev->l2ad_start = VDEV_LABEL_START_SIZE + l2dhdr_asize;
+ adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd);
+ ASSERT3U(adddev->l2ad_start, <, adddev->l2ad_end);
+ adddev->l2ad_hand = adddev->l2ad_start;
+ adddev->l2ad_evict = adddev->l2ad_start;
+ adddev->l2ad_first = B_TRUE;
+ adddev->l2ad_writing = B_FALSE;
+ adddev->l2ad_trim_all = B_FALSE;
+ list_link_init(&adddev->l2ad_node);
+ adddev->l2ad_dev_hdr = kmem_zalloc(l2dhdr_asize, KM_SLEEP);
+
+ mutex_init(&adddev->l2ad_mtx, NULL, MUTEX_DEFAULT, NULL);
+ /*
+ * This is a list of all ARC buffers that are still valid on the
+ * device.
+ */
+ list_create(&adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
+ offsetof(arc_buf_hdr_t, b_l2hdr.b_l2node));
+
+ /*
+ * This is a list of pointers to log blocks that are still present
+ * on the device.
+ */
+ list_create(&adddev->l2ad_lbptr_list, sizeof (l2arc_lb_ptr_buf_t),
+ offsetof(l2arc_lb_ptr_buf_t, node));
+
+ vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand);
+ zfs_refcount_create(&adddev->l2ad_alloc);
+ zfs_refcount_create(&adddev->l2ad_lb_asize);
+ zfs_refcount_create(&adddev->l2ad_lb_count);
+
+ /*
+ * Add device to global list
+ */
+ mutex_enter(&l2arc_dev_mtx);
+ list_insert_head(l2arc_dev_list, adddev);
+ atomic_inc_64(&l2arc_ndev);
+ mutex_exit(&l2arc_dev_mtx);
+
+ /*
+ * Decide if vdev is eligible for L2ARC rebuild
+ */
+ l2arc_rebuild_vdev(adddev->l2ad_vdev, B_FALSE);
+}
+
+void
+l2arc_rebuild_vdev(vdev_t *vd, boolean_t reopen)
+{
+ l2arc_dev_t *dev = NULL;
+ l2arc_dev_hdr_phys_t *l2dhdr;
+ uint64_t l2dhdr_asize;
+ spa_t *spa;
+
+ dev = l2arc_vdev_get(vd);
+ ASSERT3P(dev, !=, NULL);
+ spa = dev->l2ad_spa;
+ l2dhdr = dev->l2ad_dev_hdr;
+ l2dhdr_asize = dev->l2ad_dev_hdr_asize;
+
+ /*
+ * The L2ARC has to hold at least the payload of one log block for
+ * them to be restored (persistent L2ARC). The payload of a log block
+	 * depends on the number of its log entries. We always write log blocks
+ * with 1022 entries. How many of them are committed or restored depends
+ * on the size of the L2ARC device. Thus the maximum payload of
+ * one log block is 1022 * SPA_MAXBLOCKSIZE = 16GB. If the L2ARC device
+ * is less than that, we reduce the amount of committed and restored
+ * log entries per block so as to enable persistence.
+ */
+ if (dev->l2ad_end < l2arc_rebuild_blocks_min_l2size) {
+ dev->l2ad_log_entries = 0;
+ } else {
+ dev->l2ad_log_entries = MIN((dev->l2ad_end -
+ dev->l2ad_start) >> SPA_MAXBLOCKSHIFT,
+ L2ARC_LOG_BLK_MAX_ENTRIES);
+ }
+
+ /*
+	 * Read the device header; if an error is returned, do not rebuild L2ARC.
+ */
+ if (l2arc_dev_hdr_read(dev) == 0 && dev->l2ad_log_entries > 0) {
+ /*
+ * If we are onlining a cache device (vdev_reopen) that was
+ * still present (l2arc_vdev_present()) and rebuild is enabled,
+ * we should evict all ARC buffers and pointers to log blocks
+ * and reclaim their space before restoring its contents to
+ * L2ARC.
+ */
+ if (reopen) {
+ if (!l2arc_rebuild_enabled) {
+ return;
+ } else {
+ l2arc_evict(dev, 0, B_TRUE);
+ /* start a new log block */
+ dev->l2ad_log_ent_idx = 0;
+ dev->l2ad_log_blk_payload_asize = 0;
+ dev->l2ad_log_blk_payload_start = 0;
+ }
+ }
+ /*
+ * Just mark the device as pending for a rebuild. We won't
+ * be starting a rebuild in line here as it would block pool
+ * import. Instead spa_load_impl will hand that off to an
+ * async task which will call l2arc_spa_rebuild_start.
+ */
+ dev->l2ad_rebuild = B_TRUE;
+ } else if (spa_writeable(spa)) {
+ /*
+ * In this case TRIM the whole device if l2arc_trim_ahead > 0,
+ * otherwise create a new header. We zero out the memory holding
+ * the header to reset dh_start_lbps. If we TRIM the whole
+ * device the new header will be written by
+ * vdev_trim_l2arc_thread() at the end of the TRIM to update the
+ * trim_state in the header too. When reading the header, if
+ * trim_state is not VDEV_TRIM_COMPLETE and l2arc_trim_ahead > 0
+ * we opt to TRIM the whole device again.
+ */
+ if (l2arc_trim_ahead > 0) {
+ dev->l2ad_trim_all = B_TRUE;
+ } else {
+ bzero(l2dhdr, l2dhdr_asize);
+ l2arc_dev_hdr_update(dev);
+ }
+ }
+}
+
+/*
+ * Remove a vdev from the L2ARC.
+ */
+void
+l2arc_remove_vdev(vdev_t *vd)
+{
+ l2arc_dev_t *remdev = NULL;
+
+ /*
+ * Find the device by vdev
+ */
+ remdev = l2arc_vdev_get(vd);
+ ASSERT3P(remdev, !=, NULL);
+
+ /*
+ * Cancel any ongoing or scheduled rebuild.
+ */
+ mutex_enter(&l2arc_rebuild_thr_lock);
+ if (remdev->l2ad_rebuild_began == B_TRUE) {
+ remdev->l2ad_rebuild_cancel = B_TRUE;
+ while (remdev->l2ad_rebuild == B_TRUE)
+ cv_wait(&l2arc_rebuild_thr_cv, &l2arc_rebuild_thr_lock);
+ }
+ mutex_exit(&l2arc_rebuild_thr_lock);
+
+ /*
+ * Remove device from global list
+ */
+ mutex_enter(&l2arc_dev_mtx);
+ list_remove(l2arc_dev_list, remdev);
+ l2arc_dev_last = NULL; /* may have been invalidated */
+ atomic_dec_64(&l2arc_ndev);
+ mutex_exit(&l2arc_dev_mtx);
+
+ /*
+ * Clear all buflists and ARC references. L2ARC device flush.
+ */
+ l2arc_evict(remdev, 0, B_TRUE);
+ list_destroy(&remdev->l2ad_buflist);
+ ASSERT(list_is_empty(&remdev->l2ad_lbptr_list));
+ list_destroy(&remdev->l2ad_lbptr_list);
+ mutex_destroy(&remdev->l2ad_mtx);
+ zfs_refcount_destroy(&remdev->l2ad_alloc);
+ zfs_refcount_destroy(&remdev->l2ad_lb_asize);
+ zfs_refcount_destroy(&remdev->l2ad_lb_count);
+ kmem_free(remdev->l2ad_dev_hdr, remdev->l2ad_dev_hdr_asize);
+ vmem_free(remdev, sizeof (l2arc_dev_t));
+}
+
+void
+l2arc_init(void)
+{
+ l2arc_thread_exit = 0;
+ l2arc_ndev = 0;
+ l2arc_writes_sent = 0;
+ l2arc_writes_done = 0;
+
+ mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL);
+ mutex_init(&l2arc_rebuild_thr_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&l2arc_rebuild_thr_cv, NULL, CV_DEFAULT, NULL);
+ mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL);
+
+ l2arc_dev_list = &L2ARC_dev_list;
+ l2arc_free_on_write = &L2ARC_free_on_write;
+ list_create(l2arc_dev_list, sizeof (l2arc_dev_t),
+ offsetof(l2arc_dev_t, l2ad_node));
+ list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t),
+ offsetof(l2arc_data_free_t, l2df_list_node));
+}
+
+void
+l2arc_fini(void)
+{
+ mutex_destroy(&l2arc_feed_thr_lock);
+ cv_destroy(&l2arc_feed_thr_cv);
+ mutex_destroy(&l2arc_rebuild_thr_lock);
+ cv_destroy(&l2arc_rebuild_thr_cv);
+ mutex_destroy(&l2arc_dev_mtx);
+ mutex_destroy(&l2arc_free_on_write_mtx);
+
+ list_destroy(l2arc_dev_list);
+ list_destroy(l2arc_free_on_write);
+}
+
+void
+l2arc_start(void)
+{
+ if (!(spa_mode_global & SPA_MODE_WRITE))
+ return;
+
+ (void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0,
+ TS_RUN, defclsyspri);
+}
+
+void
+l2arc_stop(void)
+{
+ if (!(spa_mode_global & SPA_MODE_WRITE))
+ return;
+
+ mutex_enter(&l2arc_feed_thr_lock);
+ cv_signal(&l2arc_feed_thr_cv); /* kick thread out of startup */
+ l2arc_thread_exit = 1;
+ while (l2arc_thread_exit != 0)
+ cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock);
+ mutex_exit(&l2arc_feed_thr_lock);
+}
+
+/*
+ * Punches out rebuild threads for the L2ARC devices in a spa. This should
+ * be called after pool import from the spa async thread, since starting
+ * these threads directly from spa_import() will make them part of the
+ * "zpool import" context and delay process exit (and thus pool import).
+ */
+void
+l2arc_spa_rebuild_start(spa_t *spa)
+{
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+ /*
+ * Locate the spa's l2arc devices and kick off rebuild threads.
+ */
+ for (int i = 0; i < spa->spa_l2cache.sav_count; i++) {
+ l2arc_dev_t *dev =
+ l2arc_vdev_get(spa->spa_l2cache.sav_vdevs[i]);
+ if (dev == NULL) {
+ /* Don't attempt a rebuild if the vdev is UNAVAIL */
+ continue;
+ }
+ mutex_enter(&l2arc_rebuild_thr_lock);
+ if (dev->l2ad_rebuild && !dev->l2ad_rebuild_cancel) {
+ dev->l2ad_rebuild_began = B_TRUE;
+ (void) thread_create(NULL, 0, l2arc_dev_rebuild_thread,
+ dev, 0, &p0, TS_RUN, minclsyspri);
+ }
+ mutex_exit(&l2arc_rebuild_thr_lock);
+ }
+}
+
+/*
+ * Main entry point for L2ARC rebuilding.
+ */
+static void
+l2arc_dev_rebuild_thread(void *arg)
+{
+ l2arc_dev_t *dev = arg;
+
+ VERIFY(!dev->l2ad_rebuild_cancel);
+ VERIFY(dev->l2ad_rebuild);
+ (void) l2arc_rebuild(dev);
+ mutex_enter(&l2arc_rebuild_thr_lock);
+ dev->l2ad_rebuild_began = B_FALSE;
+ dev->l2ad_rebuild = B_FALSE;
+ mutex_exit(&l2arc_rebuild_thr_lock);
+
+ thread_exit();
+}
+
+/*
+ * This function implements the actual L2ARC metadata rebuild. It:
+ * starts reading the log block chain and restores each block's contents
+ * to memory (reconstructing arc_buf_hdr_t's).
+ *
+ * Operation stops under any of the following conditions:
+ *
+ * 1) We reach the end of the log block chain.
+ * 2) We encounter *any* error condition (cksum errors, io errors)
+ */
+static int
+l2arc_rebuild(l2arc_dev_t *dev)
+{
+ vdev_t *vd = dev->l2ad_vdev;
+ spa_t *spa = vd->vdev_spa;
+ int err = 0;
+ l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr;
+ l2arc_log_blk_phys_t *this_lb, *next_lb;
+ zio_t *this_io = NULL, *next_io = NULL;
+ l2arc_log_blkptr_t lbps[2];
+ l2arc_lb_ptr_buf_t *lb_ptr_buf;
+ boolean_t lock_held;
+
+ this_lb = vmem_zalloc(sizeof (*this_lb), KM_SLEEP);
+ next_lb = vmem_zalloc(sizeof (*next_lb), KM_SLEEP);
+
+ /*
+	 * We prevent device removal while issuing reads to the device.
+	 * During the rebuilding phases we drop this lock again so that a
+	 * spa_unload or a device removal can be initiated; this is safe
+	 * because the spa will signal us to stop before removing our device
+	 * and will wait for us to stop.
+ */
+ spa_config_enter(spa, SCL_L2ARC, vd, RW_READER);
+ lock_held = B_TRUE;
+
+ /*
+ * Retrieve the persistent L2ARC device state.
+ * L2BLK_GET_PSIZE returns aligned size for log blocks.
+ */
+ dev->l2ad_evict = MAX(l2dhdr->dh_evict, dev->l2ad_start);
+ dev->l2ad_hand = MAX(l2dhdr->dh_start_lbps[0].lbp_daddr +
+ L2BLK_GET_PSIZE((&l2dhdr->dh_start_lbps[0])->lbp_prop),
+ dev->l2ad_start);
+ dev->l2ad_first = !!(l2dhdr->dh_flags & L2ARC_DEV_HDR_EVICT_FIRST);
+
+ vd->vdev_trim_action_time = l2dhdr->dh_trim_action_time;
+ vd->vdev_trim_state = l2dhdr->dh_trim_state;
+
+ /*
+ * In case the zfs module parameter l2arc_rebuild_enabled is false
+ * we do not start the rebuild process.
+ */
+ if (!l2arc_rebuild_enabled)
+ goto out;
+
+ /* Prepare the rebuild process */
+ bcopy(l2dhdr->dh_start_lbps, lbps, sizeof (lbps));
+
+ /* Start the rebuild process */
+ for (;;) {
+ if (!l2arc_log_blkptr_valid(dev, &lbps[0]))
+ break;
+
+ if ((err = l2arc_log_blk_read(dev, &lbps[0], &lbps[1],
+ this_lb, next_lb, this_io, &next_io)) != 0)
+ goto out;
+
+ /*
+ * Our memory pressure valve. If the system is running low
+ * on memory, rather than swamping memory with new ARC buf
+ * hdrs, we opt not to rebuild the L2ARC. At this point,
+ * however, we have already set up our L2ARC dev to chain in
+ * new metadata log blocks, so the user may choose to offline/
+ * online the L2ARC dev at a later time (or re-import the pool)
+ * to reconstruct it (when there's less memory pressure).
+ */
+ if (l2arc_hdr_limit_reached()) {
+ ARCSTAT_BUMP(arcstat_l2_rebuild_abort_lowmem);
+ cmn_err(CE_NOTE, "System running low on memory, "
+ "aborting L2ARC rebuild.");
+ err = SET_ERROR(ENOMEM);
+ goto out;
+ }
+
+ spa_config_exit(spa, SCL_L2ARC, vd);
+ lock_held = B_FALSE;
+
+ /*
+		 * Now that this log block has been read and validated, we
+		 * can start restoring its contents to the ARC.
+ * L2BLK_GET_PSIZE returns aligned size for log blocks.
+ */
+ uint64_t asize = L2BLK_GET_PSIZE((&lbps[0])->lbp_prop);
+ l2arc_log_blk_restore(dev, this_lb, asize);
+
+ /*
+		 * Log block restored; include its pointer in the list of
+ * pointers to log blocks present in the L2ARC device.
+ */
+ lb_ptr_buf = kmem_zalloc(sizeof (l2arc_lb_ptr_buf_t), KM_SLEEP);
+ lb_ptr_buf->lb_ptr = kmem_zalloc(sizeof (l2arc_log_blkptr_t),
+ KM_SLEEP);
+ bcopy(&lbps[0], lb_ptr_buf->lb_ptr,
+ sizeof (l2arc_log_blkptr_t));
+ mutex_enter(&dev->l2ad_mtx);
+ list_insert_tail(&dev->l2ad_lbptr_list, lb_ptr_buf);
+ ARCSTAT_INCR(arcstat_l2_log_blk_asize, asize);
+ ARCSTAT_BUMP(arcstat_l2_log_blk_count);
+ zfs_refcount_add_many(&dev->l2ad_lb_asize, asize, lb_ptr_buf);
+ zfs_refcount_add(&dev->l2ad_lb_count, lb_ptr_buf);
+ mutex_exit(&dev->l2ad_mtx);
+ vdev_space_update(vd, asize, 0, 0);
+
+ /*
+ * Protection against loops of log blocks:
+ *
+ * l2ad_hand l2ad_evict
+ * V V
+ * l2ad_start |=======================================| l2ad_end
+ * -----|||----|||---|||----|||
+ * (3) (2) (1) (0)
+ * ---|||---|||----|||---|||
+ * (7) (6) (5) (4)
+ *
+ * In this situation the pointer of log block (4) passes
+ * l2arc_log_blkptr_valid() but the log block should not be
+ * restored as it is overwritten by the payload of log block
+ * (0). Only log blocks (0)-(3) should be restored. We check
+ * whether l2ad_evict lies in between the payload starting
+ * offset of the next log block (lbps[1].lbp_payload_start)
+ * and the payload starting offset of the present log block
+ * (lbps[0].lbp_payload_start). If true and this isn't the
+ * first pass, we are looping from the beginning and we should
+ * stop.
+ */
+ if (l2arc_range_check_overlap(lbps[1].lbp_payload_start,
+ lbps[0].lbp_payload_start, dev->l2ad_evict) &&
+ !dev->l2ad_first)
+ goto out;
+
+ cond_resched();
+ for (;;) {
+ mutex_enter(&l2arc_rebuild_thr_lock);
+ if (dev->l2ad_rebuild_cancel) {
+ dev->l2ad_rebuild = B_FALSE;
+ cv_signal(&l2arc_rebuild_thr_cv);
+ mutex_exit(&l2arc_rebuild_thr_lock);
+ err = SET_ERROR(ECANCELED);
+ goto out;
+ }
+ mutex_exit(&l2arc_rebuild_thr_lock);
+ if (spa_config_tryenter(spa, SCL_L2ARC, vd,
+ RW_READER)) {
+ lock_held = B_TRUE;
+ break;
+ }
+ /*
+			 * The L2ARC config lock is held by somebody as
+			 * writer, possibly because they are trying to remove
+			 * us. They likely want us to shut down, so after a
+			 * little delay we check l2ad_rebuild_cancel and
+			 * retry the lock.
+ */
+ delay(1);
+ }
+
+ /*
+ * Continue with the next log block.
+ */
+ lbps[0] = lbps[1];
+ lbps[1] = this_lb->lb_prev_lbp;
+ PTR_SWAP(this_lb, next_lb);
+ this_io = next_io;
+ next_io = NULL;
+ }
+
+ if (this_io != NULL)
+ l2arc_log_blk_fetch_abort(this_io);
+out:
+ if (next_io != NULL)
+ l2arc_log_blk_fetch_abort(next_io);
+ vmem_free(this_lb, sizeof (*this_lb));
+ vmem_free(next_lb, sizeof (*next_lb));
+
+ if (!l2arc_rebuild_enabled) {
+ spa_history_log_internal(spa, "L2ARC rebuild", NULL,
+ "disabled");
+ } else if (err == 0 && zfs_refcount_count(&dev->l2ad_lb_count) > 0) {
+ ARCSTAT_BUMP(arcstat_l2_rebuild_success);
+ spa_history_log_internal(spa, "L2ARC rebuild", NULL,
+ "successful, restored %llu blocks",
+ (u_longlong_t)zfs_refcount_count(&dev->l2ad_lb_count));
+ } else if (err == 0 && zfs_refcount_count(&dev->l2ad_lb_count) == 0) {
+ /*
+ * No error but also nothing restored, meaning the lbps array
+ * in the device header points to invalid/non-present log
+ * blocks. Reset the header.
+ */
+ spa_history_log_internal(spa, "L2ARC rebuild", NULL,
+ "no valid log blocks");
+ bzero(l2dhdr, dev->l2ad_dev_hdr_asize);
+ l2arc_dev_hdr_update(dev);
+ } else if (err == ECANCELED) {
+ /*
+ * In case the rebuild was canceled do not log to spa history
+ * log as the pool may be in the process of being removed.
+ */
+ zfs_dbgmsg("L2ARC rebuild aborted, restored %llu blocks",
+ zfs_refcount_count(&dev->l2ad_lb_count));
+ } else if (err != 0) {
+ spa_history_log_internal(spa, "L2ARC rebuild", NULL,
+ "aborted, restored %llu blocks",
+ (u_longlong_t)zfs_refcount_count(&dev->l2ad_lb_count));
+ }
+
+ if (lock_held)
+ spa_config_exit(spa, SCL_L2ARC, vd);
+
+ return (err);
+}
+
+/*
+ * Attempts to read the device header of the provided L2ARC device and writes
+ * it to dev->l2ad_dev_hdr. On success this function returns 0; otherwise the
+ * appropriate error code is returned.
+ */
+static int
+l2arc_dev_hdr_read(l2arc_dev_t *dev)
+{
+ int err;
+ uint64_t guid;
+ l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr;
+ const uint64_t l2dhdr_asize = dev->l2ad_dev_hdr_asize;
+ abd_t *abd;
+
+ guid = spa_guid(dev->l2ad_vdev->vdev_spa);
+
+ abd = abd_get_from_buf(l2dhdr, l2dhdr_asize);
+
+ err = zio_wait(zio_read_phys(NULL, dev->l2ad_vdev,
+ VDEV_LABEL_START_SIZE, l2dhdr_asize, abd,
+ ZIO_CHECKSUM_LABEL, NULL, NULL, ZIO_PRIORITY_SYNC_READ,
+ ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL |
+ ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY |
+ ZIO_FLAG_SPECULATIVE, B_FALSE));
+
+ abd_free(abd);
+
+ if (err != 0) {
+ ARCSTAT_BUMP(arcstat_l2_rebuild_abort_dh_errors);
+ zfs_dbgmsg("L2ARC IO error (%d) while reading device header, "
+ "vdev guid: %llu", err, dev->l2ad_vdev->vdev_guid);
+ return (err);
+ }
+
+ if (l2dhdr->dh_magic == BSWAP_64(L2ARC_DEV_HDR_MAGIC))
+ byteswap_uint64_array(l2dhdr, sizeof (*l2dhdr));
+
+ if (l2dhdr->dh_magic != L2ARC_DEV_HDR_MAGIC ||
+ l2dhdr->dh_spa_guid != guid ||
+ l2dhdr->dh_vdev_guid != dev->l2ad_vdev->vdev_guid ||
+ l2dhdr->dh_version != L2ARC_PERSISTENT_VERSION ||
+ l2dhdr->dh_log_entries != dev->l2ad_log_entries ||
+ l2dhdr->dh_end != dev->l2ad_end ||
+ !l2arc_range_check_overlap(dev->l2ad_start, dev->l2ad_end,
+ l2dhdr->dh_evict) ||
+ (l2dhdr->dh_trim_state != VDEV_TRIM_COMPLETE &&
+ l2arc_trim_ahead > 0)) {
+ /*
+ * Attempt to rebuild a device containing no actual dev hdr
+ * or containing a header from some other pool or from another
+ * version of persistent L2ARC.
+ */
+ ARCSTAT_BUMP(arcstat_l2_rebuild_abort_unsupported);
+ return (SET_ERROR(ENOTSUP));
+ }
+
+ return (0);
+}
+
+/*
+ * Reads L2ARC log blocks from storage and validates their contents.
+ *
+ * This function implements a simple fetcher to make sure that while
+ * we're processing one buffer the L2ARC is already fetching the next
+ * one in the chain.
+ *
+ * The arguments this_lbp and next_lbp point to the current and next log block
+ * address in the block chain. Similarly, this_lb and next_lb hold the
+ * l2arc_log_blk_phys_t's of the current and next L2ARC block.
+ *
+ * The `this_io' and `next_io' arguments are used for block fetching.
+ * When issuing the first blk IO during rebuild, you should pass NULL for
+ * `this_io'. This function will then issue a sync IO to read the block and
+ * also issue an async IO to fetch the next block in the block chain. The
+ * fetched IO is returned in `next_io'. On subsequent calls to this
+ * function, pass the value returned in `next_io' from the previous call
+ * as `this_io' and a fresh `next_io' pointer to hold the next fetch IO.
+ * Prior to the call, you should initialize your `next_io' pointer to be
+ * NULL. If no fetch IO was issued, the pointer is left set at NULL.
+ *
+ * On success, this function returns 0, otherwise it returns an appropriate
+ * error code. On error the fetching IO is aborted and cleared before
+ * returning from this function. Therefore, if we return `success', the
+ * caller can assume that we have taken care of cleanup of fetch IOs.
+ */
+static int
+l2arc_log_blk_read(l2arc_dev_t *dev,
+ const l2arc_log_blkptr_t *this_lbp, const l2arc_log_blkptr_t *next_lbp,
+ l2arc_log_blk_phys_t *this_lb, l2arc_log_blk_phys_t *next_lb,
+ zio_t *this_io, zio_t **next_io)
+{
+ int err = 0;
+ zio_cksum_t cksum;
+ abd_t *abd = NULL;
+ uint64_t asize;
+
+ ASSERT(this_lbp != NULL && next_lbp != NULL);
+ ASSERT(this_lb != NULL && next_lb != NULL);
+ ASSERT(next_io != NULL && *next_io == NULL);
+ ASSERT(l2arc_log_blkptr_valid(dev, this_lbp));
+
+ /*
+ * Check to see if we have issued the IO for this log block in a
+ * previous run. If not, this is the first call, so issue it now.
+ */
+ if (this_io == NULL) {
+ this_io = l2arc_log_blk_fetch(dev->l2ad_vdev, this_lbp,
+ this_lb);
+ }
+
+ /*
+ * Peek to see if we can start issuing the next IO immediately.
+ */
+ if (l2arc_log_blkptr_valid(dev, next_lbp)) {
+ /*
+ * Start issuing IO for the next log block early - this
+ * should help keep the L2ARC device busy while we
+ * decompress and restore this log block.
+ */
+ *next_io = l2arc_log_blk_fetch(dev->l2ad_vdev, next_lbp,
+ next_lb);
+ }
+
+ /* Wait for the IO to read this log block to complete */
+ if ((err = zio_wait(this_io)) != 0) {
+ ARCSTAT_BUMP(arcstat_l2_rebuild_abort_io_errors);
+ zfs_dbgmsg("L2ARC IO error (%d) while reading log block, "
+ "offset: %llu, vdev guid: %llu", err, this_lbp->lbp_daddr,
+ dev->l2ad_vdev->vdev_guid);
+ goto cleanup;
+ }
+
+ /*
+ * Make sure the buffer checks out.
+ * L2BLK_GET_PSIZE returns aligned size for log blocks.
+ */
+ asize = L2BLK_GET_PSIZE((this_lbp)->lbp_prop);
+ fletcher_4_native(this_lb, asize, NULL, &cksum);
+ if (!ZIO_CHECKSUM_EQUAL(cksum, this_lbp->lbp_cksum)) {
+ ARCSTAT_BUMP(arcstat_l2_rebuild_abort_cksum_lb_errors);
+ zfs_dbgmsg("L2ARC log block cksum failed, offset: %llu, "
+ "vdev guid: %llu, l2ad_hand: %llu, l2ad_evict: %llu",
+ this_lbp->lbp_daddr, dev->l2ad_vdev->vdev_guid,
+ dev->l2ad_hand, dev->l2ad_evict);
+ err = SET_ERROR(ECKSUM);
+ goto cleanup;
+ }
+
+ /* Now we can take our time decoding this buffer */
+ switch (L2BLK_GET_COMPRESS((this_lbp)->lbp_prop)) {
+ case ZIO_COMPRESS_OFF:
+ break;
+ case ZIO_COMPRESS_LZ4:
+ abd = abd_alloc_for_io(asize, B_TRUE);
+ abd_copy_from_buf_off(abd, this_lb, 0, asize);
+ if ((err = zio_decompress_data(
+ L2BLK_GET_COMPRESS((this_lbp)->lbp_prop),
+ abd, this_lb, asize, sizeof (*this_lb), NULL)) != 0) {
+ err = SET_ERROR(EINVAL);
+ goto cleanup;
+ }
+ break;
+ default:
+ err = SET_ERROR(EINVAL);
+ goto cleanup;
+ }
+ if (this_lb->lb_magic == BSWAP_64(L2ARC_LOG_BLK_MAGIC))
+ byteswap_uint64_array(this_lb, sizeof (*this_lb));
+ if (this_lb->lb_magic != L2ARC_LOG_BLK_MAGIC) {
+ err = SET_ERROR(EINVAL);
+ goto cleanup;
+ }
+cleanup:
+ /* Abort an in-flight fetch I/O in case of error */
+ if (err != 0 && *next_io != NULL) {
+ l2arc_log_blk_fetch_abort(*next_io);
+ *next_io = NULL;
+ }
+ if (abd != NULL)
+ abd_free(abd);
+ return (err);
+}
+
+/*
+ * Restores the payload of a log block to ARC. This creates empty ARC hdr
+ * entries which only contain an l2arc hdr, essentially restoring the
+ * buffers to their L2ARC evicted state. This function also updates space
+ * usage on the L2ARC vdev to make sure it tracks restored buffers.
+ */
+static void
+l2arc_log_blk_restore(l2arc_dev_t *dev, const l2arc_log_blk_phys_t *lb,
+ uint64_t lb_asize)
+{
+ uint64_t size = 0, asize = 0;
+ uint64_t log_entries = dev->l2ad_log_entries;
+
+ /*
+ * Usually arc_adapt() is called only for data, not headers, but
+	 * since we may allocate a significant amount of memory here, let ARC
+ * grow its arc_c.
+ */
+ arc_adapt(log_entries * HDR_L2ONLY_SIZE, arc_l2c_only);
+
+ for (int i = log_entries - 1; i >= 0; i--) {
+ /*
+ * Restore goes in the reverse temporal direction to preserve
+ * correct temporal ordering of buffers in the l2ad_buflist.
+ * l2arc_hdr_restore also does a list_insert_tail instead of
+ * list_insert_head on the l2ad_buflist:
+ *
+ * LIST l2ad_buflist LIST
+ * HEAD <------ (time) ------ TAIL
+ * direction +-----+-----+-----+-----+-----+ direction
+ * of l2arc <== | buf | buf | buf | buf | buf | ===> of rebuild
+ * fill +-----+-----+-----+-----+-----+
+ * ^ ^
+ * | |
+ * | |
+ * l2arc_feed_thread l2arc_rebuild
+ * will place new bufs here restores bufs here
+ *
+ * During l2arc_rebuild() the device is not used by
+ * l2arc_feed_thread() as dev->l2ad_rebuild is set to true.
+ */
+ size += L2BLK_GET_LSIZE((&lb->lb_entries[i])->le_prop);
+ asize += vdev_psize_to_asize(dev->l2ad_vdev,
+ L2BLK_GET_PSIZE((&lb->lb_entries[i])->le_prop));
+ l2arc_hdr_restore(&lb->lb_entries[i], dev);
+ }
+
+ /*
+ * Record rebuild stats:
+ * size Logical size of restored buffers in the L2ARC
+ * asize Aligned size of restored buffers in the L2ARC
+ */
+ ARCSTAT_INCR(arcstat_l2_rebuild_size, size);
+ ARCSTAT_INCR(arcstat_l2_rebuild_asize, asize);
+ ARCSTAT_INCR(arcstat_l2_rebuild_bufs, log_entries);
+ ARCSTAT_F_AVG(arcstat_l2_log_blk_avg_asize, lb_asize);
+ ARCSTAT_F_AVG(arcstat_l2_data_to_meta_ratio, asize / lb_asize);
+ ARCSTAT_BUMP(arcstat_l2_rebuild_log_blks);
+}
+
+/*
+ * Restores a single ARC buf hdr from a log entry. The ARC buffer is put
+ * into a state indicating that it has been evicted to L2ARC.
+ */
+static void
+l2arc_hdr_restore(const l2arc_log_ent_phys_t *le, l2arc_dev_t *dev)
+{
+ arc_buf_hdr_t *hdr, *exists;
+ kmutex_t *hash_lock;
+ arc_buf_contents_t type = L2BLK_GET_TYPE((le)->le_prop);
+ uint64_t asize;
+
+ /*
+	 * Do all the allocation before grabbing any locks; this lets us
+	 * sleep if memory is full, and we don't have to deal with failed
+ * allocations.
+ */
+ hdr = arc_buf_alloc_l2only(L2BLK_GET_LSIZE((le)->le_prop), type,
+ dev, le->le_dva, le->le_daddr,
+ L2BLK_GET_PSIZE((le)->le_prop), le->le_birth,
+ L2BLK_GET_COMPRESS((le)->le_prop), le->le_complevel,
+ L2BLK_GET_PROTECTED((le)->le_prop),
+ L2BLK_GET_PREFETCH((le)->le_prop),
+ L2BLK_GET_STATE((le)->le_prop));
+ asize = vdev_psize_to_asize(dev->l2ad_vdev,
+ L2BLK_GET_PSIZE((le)->le_prop));
+
+ /*
+ * vdev_space_update() has to be called before arc_hdr_destroy() to
+ * avoid underflow since the latter also calls vdev_space_update().
+ */
+ l2arc_hdr_arcstats_increment(hdr);
+ vdev_space_update(dev->l2ad_vdev, asize, 0, 0);
+
+ mutex_enter(&dev->l2ad_mtx);
+ list_insert_tail(&dev->l2ad_buflist, hdr);
+ (void) zfs_refcount_add_many(&dev->l2ad_alloc, arc_hdr_size(hdr), hdr);
+ mutex_exit(&dev->l2ad_mtx);
+
+ exists = buf_hash_insert(hdr, &hash_lock);
+ if (exists) {
+ /* Buffer was already cached, no need to restore it. */
+ arc_hdr_destroy(hdr);
+ /*
+ * If the buffer is already cached, check whether it has
+		 * L2ARC metadata. If not, fill it in and update the flag.
+		 * This is important in case of onlining a cache device, since
+ * we previously evicted all L2ARC metadata from ARC.
+ */
+ if (!HDR_HAS_L2HDR(exists)) {
+ arc_hdr_set_flags(exists, ARC_FLAG_HAS_L2HDR);
+ exists->b_l2hdr.b_dev = dev;
+ exists->b_l2hdr.b_daddr = le->le_daddr;
+ exists->b_l2hdr.b_arcs_state =
+ L2BLK_GET_STATE((le)->le_prop);
+ mutex_enter(&dev->l2ad_mtx);
+ list_insert_tail(&dev->l2ad_buflist, exists);
+ (void) zfs_refcount_add_many(&dev->l2ad_alloc,
+ arc_hdr_size(exists), exists);
+ mutex_exit(&dev->l2ad_mtx);
+ l2arc_hdr_arcstats_increment(exists);
+ vdev_space_update(dev->l2ad_vdev, asize, 0, 0);
+ }
+ ARCSTAT_BUMP(arcstat_l2_rebuild_bufs_precached);
+ }
+
+ mutex_exit(hash_lock);
+}
+
+/*
+ * Starts an asynchronous read IO to read a log block. This is used in log
+ * block reconstruction to start reading the next block before we are done
+ * decoding and reconstructing the current block, to keep the l2arc device
+ * nice and hot with read IO to process.
+ * The returned zio will contain a newly allocated memory buffer for the IO
+ * data, which should then be freed by the caller once the zio is no longer
+ * needed (i.e. due to it having completed). If you wish to abort this
+ * zio, you should do so using l2arc_log_blk_fetch_abort, which takes
+ * care of disposing of the allocated buffers correctly.
+ */
+static zio_t *
+l2arc_log_blk_fetch(vdev_t *vd, const l2arc_log_blkptr_t *lbp,
+ l2arc_log_blk_phys_t *lb)
+{
+ uint32_t asize;
+ zio_t *pio;
+ l2arc_read_callback_t *cb;
+
+ /* L2BLK_GET_PSIZE returns aligned size for log blocks */
+ asize = L2BLK_GET_PSIZE((lbp)->lbp_prop);
+ ASSERT(asize <= sizeof (l2arc_log_blk_phys_t));
+
+ cb = kmem_zalloc(sizeof (l2arc_read_callback_t), KM_SLEEP);
+ cb->l2rcb_abd = abd_get_from_buf(lb, asize);
+ pio = zio_root(vd->vdev_spa, l2arc_blk_fetch_done, cb,
+ ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE |
+ ZIO_FLAG_DONT_RETRY);
+ (void) zio_nowait(zio_read_phys(pio, vd, lbp->lbp_daddr, asize,
+ cb->l2rcb_abd, ZIO_CHECKSUM_OFF, NULL, NULL,
+ ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL |
+ ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY, B_FALSE));
+
+ return (pio);
+}
+
+/*
+ * Aborts a zio returned from l2arc_log_blk_fetch and frees the data
+ * buffers allocated for it.
+ */
+static void
+l2arc_log_blk_fetch_abort(zio_t *zio)
+{
+ (void) zio_wait(zio);
+}
+
+/*
+ * Creates a zio to update the device header on an l2arc device.
+ */
+void
+l2arc_dev_hdr_update(l2arc_dev_t *dev)
+{
+ l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr;
+ const uint64_t l2dhdr_asize = dev->l2ad_dev_hdr_asize;
+ abd_t *abd;
+ int err;
+
+ VERIFY(spa_config_held(dev->l2ad_spa, SCL_STATE_ALL, RW_READER));
+
+ l2dhdr->dh_magic = L2ARC_DEV_HDR_MAGIC;
+ l2dhdr->dh_version = L2ARC_PERSISTENT_VERSION;
+ l2dhdr->dh_spa_guid = spa_guid(dev->l2ad_vdev->vdev_spa);
+ l2dhdr->dh_vdev_guid = dev->l2ad_vdev->vdev_guid;
+ l2dhdr->dh_log_entries = dev->l2ad_log_entries;
+ l2dhdr->dh_evict = dev->l2ad_evict;
+ l2dhdr->dh_start = dev->l2ad_start;
+ l2dhdr->dh_end = dev->l2ad_end;
+ l2dhdr->dh_lb_asize = zfs_refcount_count(&dev->l2ad_lb_asize);
+ l2dhdr->dh_lb_count = zfs_refcount_count(&dev->l2ad_lb_count);
+ l2dhdr->dh_flags = 0;
+ l2dhdr->dh_trim_action_time = dev->l2ad_vdev->vdev_trim_action_time;
+ l2dhdr->dh_trim_state = dev->l2ad_vdev->vdev_trim_state;
+ if (dev->l2ad_first)
+ l2dhdr->dh_flags |= L2ARC_DEV_HDR_EVICT_FIRST;
+
+ abd = abd_get_from_buf(l2dhdr, l2dhdr_asize);
+
+ err = zio_wait(zio_write_phys(NULL, dev->l2ad_vdev,
+ VDEV_LABEL_START_SIZE, l2dhdr_asize, abd, ZIO_CHECKSUM_LABEL, NULL,
+ NULL, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE));
+
+ abd_free(abd);
+
+ if (err != 0) {
+ zfs_dbgmsg("L2ARC IO error (%d) while writing device header, "
+ "vdev guid: %llu", err, dev->l2ad_vdev->vdev_guid);
+ }
+}
+
+/*
+ * Commits a log block to the L2ARC device. This routine is invoked from
+ * l2arc_write_buffers when the log block fills up.
+ * This function allocates some memory to temporarily hold the serialized
+ * buffer to be written. This is then released in l2arc_write_done.
+ */
+static void
+l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb)
+{
+ l2arc_log_blk_phys_t *lb = &dev->l2ad_log_blk;
+ l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr;
+ uint64_t psize, asize;
+ zio_t *wzio;
+ l2arc_lb_abd_buf_t *abd_buf;
+ uint8_t *tmpbuf;
+ l2arc_lb_ptr_buf_t *lb_ptr_buf;
+
+ VERIFY3S(dev->l2ad_log_ent_idx, ==, dev->l2ad_log_entries);
+
+ tmpbuf = zio_buf_alloc(sizeof (*lb));
+ abd_buf = zio_buf_alloc(sizeof (*abd_buf));
+ abd_buf->abd = abd_get_from_buf(lb, sizeof (*lb));
+ lb_ptr_buf = kmem_zalloc(sizeof (l2arc_lb_ptr_buf_t), KM_SLEEP);
+ lb_ptr_buf->lb_ptr = kmem_zalloc(sizeof (l2arc_log_blkptr_t), KM_SLEEP);
+
+ /* link the buffer into the block chain */
+ lb->lb_prev_lbp = l2dhdr->dh_start_lbps[1];
+ lb->lb_magic = L2ARC_LOG_BLK_MAGIC;
+
+ /*
+ * l2arc_log_blk_commit() may be called multiple times during a single
+ * l2arc_write_buffers() call. Save the allocated abd buffers in a list
+ * so we can free them in l2arc_write_done() later on.
+ */
+ list_insert_tail(&cb->l2wcb_abd_list, abd_buf);
+
+ /* try to compress the buffer */
+ psize = zio_compress_data(ZIO_COMPRESS_LZ4,
+ abd_buf->abd, tmpbuf, sizeof (*lb), 0);
+
+ /* a log block is never entirely zero */
+ ASSERT(psize != 0);
+ asize = vdev_psize_to_asize(dev->l2ad_vdev, psize);
+ ASSERT(asize <= sizeof (*lb));
+
+ /*
+ * Update the start log block pointer in the device header to point
+ * to the log block we're about to write.
+ */
+ l2dhdr->dh_start_lbps[1] = l2dhdr->dh_start_lbps[0];
+ l2dhdr->dh_start_lbps[0].lbp_daddr = dev->l2ad_hand;
+ l2dhdr->dh_start_lbps[0].lbp_payload_asize =
+ dev->l2ad_log_blk_payload_asize;
+ l2dhdr->dh_start_lbps[0].lbp_payload_start =
+ dev->l2ad_log_blk_payload_start;
+ _NOTE(CONSTCOND)
+ L2BLK_SET_LSIZE(
+ (&l2dhdr->dh_start_lbps[0])->lbp_prop, sizeof (*lb));
+ L2BLK_SET_PSIZE(
+ (&l2dhdr->dh_start_lbps[0])->lbp_prop, asize);
+ L2BLK_SET_CHECKSUM(
+ (&l2dhdr->dh_start_lbps[0])->lbp_prop,
+ ZIO_CHECKSUM_FLETCHER_4);
+ if (asize < sizeof (*lb)) {
+ /* compression succeeded */
+ bzero(tmpbuf + psize, asize - psize);
+ L2BLK_SET_COMPRESS(
+ (&l2dhdr->dh_start_lbps[0])->lbp_prop,
+ ZIO_COMPRESS_LZ4);
+ } else {
+ /* compression failed */
+ bcopy(lb, tmpbuf, sizeof (*lb));
+ L2BLK_SET_COMPRESS(
+ (&l2dhdr->dh_start_lbps[0])->lbp_prop,
+ ZIO_COMPRESS_OFF);
+ }
+
+ /* checksum what we're about to write */
+ fletcher_4_native(tmpbuf, asize, NULL,
+ &l2dhdr->dh_start_lbps[0].lbp_cksum);
+
+ abd_free(abd_buf->abd);
+
+ /* perform the write itself */
+ abd_buf->abd = abd_get_from_buf(tmpbuf, sizeof (*lb));
+ abd_take_ownership_of_buf(abd_buf->abd, B_TRUE);
+ wzio = zio_write_phys(pio, dev->l2ad_vdev, dev->l2ad_hand,
+ asize, abd_buf->abd, ZIO_CHECKSUM_OFF, NULL, NULL,
+ ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE);
+ DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev, zio_t *, wzio);
+ (void) zio_nowait(wzio);
+
+ dev->l2ad_hand += asize;
+ /*
+ * Include the committed log block's pointer in the list of pointers
+ * to log blocks present in the L2ARC device.
+ */
+ bcopy(&l2dhdr->dh_start_lbps[0], lb_ptr_buf->lb_ptr,
+ sizeof (l2arc_log_blkptr_t));
+ mutex_enter(&dev->l2ad_mtx);
+ list_insert_head(&dev->l2ad_lbptr_list, lb_ptr_buf);
+ ARCSTAT_INCR(arcstat_l2_log_blk_asize, asize);
+ ARCSTAT_BUMP(arcstat_l2_log_blk_count);
+ zfs_refcount_add_many(&dev->l2ad_lb_asize, asize, lb_ptr_buf);
+ zfs_refcount_add(&dev->l2ad_lb_count, lb_ptr_buf);
+ mutex_exit(&dev->l2ad_mtx);
+ vdev_space_update(dev->l2ad_vdev, asize, 0, 0);
+
+ /* bump the kstats */
+ ARCSTAT_INCR(arcstat_l2_write_bytes, asize);
+ ARCSTAT_BUMP(arcstat_l2_log_blk_writes);
+ ARCSTAT_F_AVG(arcstat_l2_log_blk_avg_asize, asize);
+ ARCSTAT_F_AVG(arcstat_l2_data_to_meta_ratio,
+ dev->l2ad_log_blk_payload_asize / asize);
+
+ /* start a new log block */
+ dev->l2ad_log_ent_idx = 0;
+ dev->l2ad_log_blk_payload_asize = 0;
+ dev->l2ad_log_blk_payload_start = 0;
+}
+
+/*
+ * Validates an L2ARC log block address to make sure that it can be read
+ * from the provided L2ARC device.
+ */
+boolean_t
+l2arc_log_blkptr_valid(l2arc_dev_t *dev, const l2arc_log_blkptr_t *lbp)
+{
+ /* L2BLK_GET_PSIZE returns aligned size for log blocks */
+ uint64_t asize = L2BLK_GET_PSIZE((lbp)->lbp_prop);
+ uint64_t end = lbp->lbp_daddr + asize - 1;
+ uint64_t start = lbp->lbp_payload_start;
+ boolean_t evicted = B_FALSE;
+
+ /*
+ * A log block is valid if all of the following conditions are true:
+ * - it fits entirely (including its payload) between l2ad_start and
+ * l2ad_end
+ * - it has a valid size
+ * - neither the log block itself nor part of its payload was evicted
+ * by l2arc_evict():
+ *
+ * l2ad_hand l2ad_evict
+ * | | lbp_daddr
+ * | start | | end
+ * | | | | |
+ * V V V V V
+ * l2ad_start ============================================ l2ad_end
+ * --------------------------||||
+ * ^ ^
+ * | log block
+ * payload
+ */
+
+ evicted =
+ l2arc_range_check_overlap(start, end, dev->l2ad_hand) ||
+ l2arc_range_check_overlap(start, end, dev->l2ad_evict) ||
+ l2arc_range_check_overlap(dev->l2ad_hand, dev->l2ad_evict, start) ||
+ l2arc_range_check_overlap(dev->l2ad_hand, dev->l2ad_evict, end);
+
+ return (start >= dev->l2ad_start && end <= dev->l2ad_end &&
+ asize > 0 && asize <= sizeof (l2arc_log_blk_phys_t) &&
+ (!evicted || dev->l2ad_first));
+}
+
+/*
+ * Inserts ARC buffer header `hdr' into the current L2ARC log block on
+ * the device. The buffer being inserted must be present in L2ARC.
+ * Returns B_TRUE if the L2ARC log block is full and needs to be committed
+ * to L2ARC, or B_FALSE if it still has room for more ARC buffers.
+ */
+static boolean_t
+l2arc_log_blk_insert(l2arc_dev_t *dev, const arc_buf_hdr_t *hdr)
+{
+ l2arc_log_blk_phys_t *lb = &dev->l2ad_log_blk;
+ l2arc_log_ent_phys_t *le;
+
+ if (dev->l2ad_log_entries == 0)
+ return (B_FALSE);
+
+ int index = dev->l2ad_log_ent_idx++;
+
+ ASSERT3S(index, <, dev->l2ad_log_entries);
+ ASSERT(HDR_HAS_L2HDR(hdr));
+
+ le = &lb->lb_entries[index];
+ bzero(le, sizeof (*le));
+ le->le_dva = hdr->b_dva;
+ le->le_birth = hdr->b_birth;
+ le->le_daddr = hdr->b_l2hdr.b_daddr;
+ if (index == 0)
+ dev->l2ad_log_blk_payload_start = le->le_daddr;
+ L2BLK_SET_LSIZE((le)->le_prop, HDR_GET_LSIZE(hdr));
+ L2BLK_SET_PSIZE((le)->le_prop, HDR_GET_PSIZE(hdr));
+ L2BLK_SET_COMPRESS((le)->le_prop, HDR_GET_COMPRESS(hdr));
+ le->le_complevel = hdr->b_complevel;
+ L2BLK_SET_TYPE((le)->le_prop, hdr->b_type);
+ L2BLK_SET_PROTECTED((le)->le_prop, !!(HDR_PROTECTED(hdr)));
+ L2BLK_SET_PREFETCH((le)->le_prop, !!(HDR_PREFETCH(hdr)));
+ L2BLK_SET_STATE((le)->le_prop, hdr->b_l1hdr.b_state->arcs_state);
+
+ dev->l2ad_log_blk_payload_asize += vdev_psize_to_asize(dev->l2ad_vdev,
+ HDR_GET_PSIZE(hdr));
+
+ return (dev->l2ad_log_ent_idx == dev->l2ad_log_entries);
+}
+
+/*
+ * Checks whether a given L2ARC device address sits in a time-sequential
+ * range. The trick here is that the L2ARC is a rotary buffer, so we can't
+ * just do a range comparison, we need to handle the situation in which the
+ * range wraps around the end of the L2ARC device. Arguments:
+ * bottom -- Lower end of the range to check (written to earlier).
+ * top -- Upper end of the range to check (written to later).
+ * check -- The address for which we want to determine if it sits in
+ * between the top and bottom.
+ *
+ * The 3-way conditional below represents the following cases:
+ *
+ * bottom < top : Sequentially ordered case:
+ * <check>--------+-------------------+
+ * | (overlap here?) |
+ * L2ARC dev V V
+ * |---------------<bottom>============<top>--------------|
+ *
+ * bottom > top: Looped-around case:
+ * <check>--------+------------------+
+ * | (overlap here?) |
+ * L2ARC dev V V
+ * |===============<top>---------------<bottom>===========|
+ * ^ ^
+ * | (or here?) |
+ * +---------------+---------<check>
+ *
+ * top == bottom : Just a single address comparison.
+ */
+boolean_t
+l2arc_range_check_overlap(uint64_t bottom, uint64_t top, uint64_t check)
+{
+ if (bottom < top)
+ return (bottom <= check && check <= top);
+ else if (bottom > top)
+ return (check <= top || bottom <= check);
+ else
+ return (check == top);
+}
+
+EXPORT_SYMBOL(arc_buf_size);
+EXPORT_SYMBOL(arc_write);
+EXPORT_SYMBOL(arc_read);
+EXPORT_SYMBOL(arc_buf_info);
+EXPORT_SYMBOL(arc_getbuf_func);
+EXPORT_SYMBOL(arc_add_prune_callback);
+EXPORT_SYMBOL(arc_remove_prune_callback);
+
+/* BEGIN CSTYLED */
+ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, min, param_set_arc_long,
+ param_get_long, ZMOD_RW, "Min arc size");
+
+ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, max, param_set_arc_long,
+ param_get_long, ZMOD_RW, "Max arc size");
+
+ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, meta_limit, param_set_arc_long,
+ param_get_long, ZMOD_RW, "Metadata limit for arc size");
+
+ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, meta_limit_percent,
+ param_set_arc_long, param_get_long, ZMOD_RW,
+ "Percent of arc size for arc meta limit");
+
+ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, meta_min, param_set_arc_long,
+ param_get_long, ZMOD_RW, "Min arc metadata");
+
+ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, meta_prune, INT, ZMOD_RW,
+ "Meta objects to scan for prune");
+
+ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, meta_adjust_restarts, INT, ZMOD_RW,
+ "Limit number of restarts in arc_evict_meta");
+
+ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, meta_strategy, INT, ZMOD_RW,
+ "Meta reclaim strategy");
+
+ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, grow_retry, param_set_arc_int,
+ param_get_int, ZMOD_RW, "Seconds before growing arc size");
+
+ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, p_dampener_disable, INT, ZMOD_RW,
+ "Disable arc_p adapt dampener");
+
+ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, shrink_shift, param_set_arc_int,
+ param_get_int, ZMOD_RW, "log2(fraction of arc to reclaim)");
+
+ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, pc_percent, UINT, ZMOD_RW,
+ "Percent of pagecache to reclaim arc to");
+
+ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, p_min_shift, param_set_arc_int,
+ param_get_int, ZMOD_RW, "arc_c shift to calc min/max arc_p");
+
+ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, average_blocksize, INT, ZMOD_RD,
+ "Target average block size");
+
+ZFS_MODULE_PARAM(zfs, zfs_, compressed_arc_enabled, INT, ZMOD_RW,
+	"Enable compressed ARC buffers (set to 0 to disable)");
+
+ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, min_prefetch_ms, param_set_arc_int,
+ param_get_int, ZMOD_RW, "Min life of prefetch block in ms");
+
+ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, min_prescient_prefetch_ms,
+ param_set_arc_int, param_get_int, ZMOD_RW,
+ "Min life of prescient prefetched block in ms");
+
+ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, write_max, ULONG, ZMOD_RW,
+ "Max write bytes per interval");
+
+ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, write_boost, ULONG, ZMOD_RW,
+ "Extra write bytes during device warmup");
+
+ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, headroom, ULONG, ZMOD_RW,
+ "Number of max device writes to precache");
+
+ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, headroom_boost, ULONG, ZMOD_RW,
+ "Compressed l2arc_headroom multiplier");
+
+ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, trim_ahead, ULONG, ZMOD_RW,
+ "TRIM ahead L2ARC write size multiplier");
+
+ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, feed_secs, ULONG, ZMOD_RW,
+ "Seconds between L2ARC writing");
+
+ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, feed_min_ms, ULONG, ZMOD_RW,
+ "Min feed interval in milliseconds");
+
+ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, noprefetch, INT, ZMOD_RW,
+ "Skip caching prefetched buffers");
+
+ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, feed_again, INT, ZMOD_RW,
+ "Turbo L2ARC warmup");
+
+ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, norw, INT, ZMOD_RW,
+ "No reads during writes");
+
+ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, meta_percent, INT, ZMOD_RW,
+ "Percent of ARC size allowed for L2ARC-only headers");
+
+ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, rebuild_enabled, INT, ZMOD_RW,
+ "Rebuild the L2ARC when importing a pool");
+
+ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, rebuild_blocks_min_l2size, ULONG, ZMOD_RW,
+ "Min size in bytes to write rebuild log blocks in L2ARC");
+
+ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, mfuonly, INT, ZMOD_RW,
+ "Cache only MFU data from ARC into L2ARC");
+
+ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, lotsfree_percent, param_set_arc_int,
+	param_get_int, ZMOD_RW, "System free memory I/O throttle as a percent");
+
+ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, sys_free, param_set_arc_long,
+ param_get_long, ZMOD_RW, "System free memory target size in bytes");
+
+ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, dnode_limit, param_set_arc_long,
+ param_get_long, ZMOD_RW, "Minimum bytes of dnodes in arc");
+
+ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, dnode_limit_percent,
+ param_set_arc_long, param_get_long, ZMOD_RW,
+ "Percent of ARC meta buffers for dnodes");
+
+ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, dnode_reduce_percent, ULONG, ZMOD_RW,
+ "Percentage of excess dnodes to try to unpin");
+
+ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, eviction_pct, INT, ZMOD_RW,
+ "When full, ARC allocation waits for eviction of this % of alloc size");
+
+ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, evict_batch_limit, INT, ZMOD_RW,
+ "The number of headers to evict per sublist before moving to the next");
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/zfs/blkptr.c b/sys/contrib/openzfs/module/zfs/blkptr.c
new file mode 100644
index 000000000000..aa09ded8dba3
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/blkptr.c
@@ -0,0 +1,153 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2013, 2016 by Delphix. All rights reserved.
+ */
+
+#include <sys/blkptr.h>
+#include <sys/zfs_context.h>
+#include <sys/zio.h>
+#include <sys/zio_compress.h>
+
+/*
+ * Embedded-data Block Pointers
+ *
+ * Normally, block pointers point (via their DVAs) to a block which holds data.
+ * If the data that we need to store is very small, this is an inefficient
+ * use of space, because a block must be at minimum 1 sector (typically 512
+ * bytes or 4KB). Additionally, reading these small blocks tends to generate
+ * more random reads.
+ *
+ * Embedded-data Block Pointers allow small pieces of data (the "payload",
+ * up to 112 bytes) to be stored in the block pointer itself, instead of
+ * being pointed to. The "Pointer" part of this name is a bit of a
+ * misnomer, as nothing is pointed to.
+ *
+ * BP_EMBEDDED_TYPE_DATA block pointers allow highly-compressible data to
+ * be embedded in the block pointer. The logic for this is handled in
+ * the SPA, by the zio pipeline. Therefore most code outside the zio
+ * pipeline doesn't need special-cases to handle these block pointers.
+ *
+ * See spa.h for details on the exact layout of embedded block pointers.
+ */
+
+void
+encode_embedded_bp_compressed(blkptr_t *bp, void *data,
+ enum zio_compress comp, int uncompressed_size, int compressed_size)
+{
+ uint64_t *bp64 = (uint64_t *)bp;
+ uint64_t w = 0;
+ uint8_t *data8 = data;
+
+ ASSERT3U(compressed_size, <=, BPE_PAYLOAD_SIZE);
+ ASSERT(uncompressed_size == compressed_size ||
+ comp != ZIO_COMPRESS_OFF);
+ ASSERT3U(comp, >=, ZIO_COMPRESS_OFF);
+ ASSERT3U(comp, <, ZIO_COMPRESS_FUNCTIONS);
+
+ bzero(bp, sizeof (*bp));
+ BP_SET_EMBEDDED(bp, B_TRUE);
+ BP_SET_COMPRESS(bp, comp);
+ BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
+ BPE_SET_LSIZE(bp, uncompressed_size);
+ BPE_SET_PSIZE(bp, compressed_size);
+
+ /*
+ * Encode the byte array into the words of the block pointer.
+ * First byte goes into low bits of first word (little endian).
+ */
+ for (int i = 0; i < compressed_size; i++) {
+ BF64_SET(w, (i % sizeof (w)) * NBBY, NBBY, data8[i]);
+ if (i % sizeof (w) == sizeof (w) - 1) {
+ /* we've reached the end of a word */
+ ASSERT3P(bp64, <, bp + 1);
+ *bp64 = w;
+ bp64++;
+ if (!BPE_IS_PAYLOADWORD(bp, bp64))
+ bp64++;
+ w = 0;
+ }
+ }
+ /* write last partial word */
+ if (bp64 < (uint64_t *)(bp + 1))
+ *bp64 = w;
+}
+
+/*
+ * buf must be at least BPE_GET_PSIZE(bp) bytes long (which will never be
+ * more than BPE_PAYLOAD_SIZE bytes).
+ */
+void
+decode_embedded_bp_compressed(const blkptr_t *bp, void *buf)
+{
+ int psize;
+ uint8_t *buf8 = buf;
+ uint64_t w = 0;
+ const uint64_t *bp64 = (const uint64_t *)bp;
+
+ ASSERT(BP_IS_EMBEDDED(bp));
+
+ psize = BPE_GET_PSIZE(bp);
+
+ /*
+ * Decode the words of the block pointer into the byte array.
+ * Low bits of first word are the first byte (little endian).
+ */
+ for (int i = 0; i < psize; i++) {
+ if (i % sizeof (w) == 0) {
+ /* beginning of a word */
+ ASSERT3P(bp64, <, bp + 1);
+ w = *bp64;
+ bp64++;
+ if (!BPE_IS_PAYLOADWORD(bp, bp64))
+ bp64++;
+ }
+ buf8[i] = BF64_GET(w, (i % sizeof (w)) * NBBY, NBBY);
+ }
+}
+
+/*
+ * Fill in the buffer with the (decompressed) payload of the embedded
+ * blkptr_t. Takes into account compression and byteorder (the payload is
+ * treated as a stream of bytes).
+ * Return 0 on success, or ENOSPC if it won't fit in the buffer.
+ */
+int
+decode_embedded_bp(const blkptr_t *bp, void *buf, int buflen)
+{
+ int lsize, psize;
+
+ ASSERT(BP_IS_EMBEDDED(bp));
+
+ lsize = BPE_GET_LSIZE(bp);
+ psize = BPE_GET_PSIZE(bp);
+
+ if (lsize > buflen)
+ return (SET_ERROR(ENOSPC));
+ ASSERT3U(lsize, ==, buflen);
+
+ if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF) {
+ uint8_t dstbuf[BPE_PAYLOAD_SIZE];
+ decode_embedded_bp_compressed(bp, dstbuf);
+ VERIFY0(zio_decompress_data_buf(BP_GET_COMPRESS(bp),
+ dstbuf, buf, psize, buflen, NULL));
+ } else {
+ ASSERT3U(lsize, ==, psize);
+ decode_embedded_bp_compressed(bp, buf);
+ }
+
+ return (0);
+}
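To make the encode/decode interface above concrete, here is a minimal, hypothetical sketch (illustration only, not part of the patch): it stores a short uncompressed payload in an embedded block pointer and reads it back. The function name and payload are invented; a real caller, such as the ZIO write pipeline, would additionally set the BP type, level, embedded type, and birth time.

/*
 * Hypothetical round trip through an embedded block pointer.  Not part of
 * the patch; for illustration of encode/decode only.
 */
static int
embedded_bp_roundtrip_example(void)
{
        blkptr_t bp;
        char payload[] = "small embedded payload"; /* well under BPE_PAYLOAD_SIZE */
        char out[sizeof (payload)];

        /* Store the bytes uncompressed, so lsize == psize. */
        encode_embedded_bp_compressed(&bp, payload, ZIO_COMPRESS_OFF,
            sizeof (payload), sizeof (payload));

        /* Recover them; returns 0, or ENOSPC if "out" were too small. */
        return (decode_embedded_bp(&bp, out, sizeof (out)));
}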
diff --git a/sys/contrib/openzfs/module/zfs/bplist.c b/sys/contrib/openzfs/module/zfs/bplist.c
new file mode 100644
index 000000000000..47ea364ef26f
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/bplist.c
@@ -0,0 +1,91 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
+ */
+
+#include <sys/bplist.h>
+#include <sys/zfs_context.h>
+
+
+void
+bplist_create(bplist_t *bpl)
+{
+ mutex_init(&bpl->bpl_lock, NULL, MUTEX_DEFAULT, NULL);
+ list_create(&bpl->bpl_list, sizeof (bplist_entry_t),
+ offsetof(bplist_entry_t, bpe_node));
+}
+
+void
+bplist_destroy(bplist_t *bpl)
+{
+ list_destroy(&bpl->bpl_list);
+ mutex_destroy(&bpl->bpl_lock);
+}
+
+void
+bplist_append(bplist_t *bpl, const blkptr_t *bp)
+{
+ bplist_entry_t *bpe = kmem_alloc(sizeof (*bpe), KM_SLEEP);
+
+ mutex_enter(&bpl->bpl_lock);
+ bpe->bpe_blk = *bp;
+ list_insert_tail(&bpl->bpl_list, bpe);
+ mutex_exit(&bpl->bpl_lock);
+}
+
+/*
+ * To aid debugging, we keep the most recently removed entry. This way if
+ * we are in the callback, we can easily locate the entry.
+ */
+static bplist_entry_t *bplist_iterate_last_removed;
+
+void
+bplist_iterate(bplist_t *bpl, bplist_itor_t *func, void *arg, dmu_tx_t *tx)
+{
+ bplist_entry_t *bpe;
+
+ mutex_enter(&bpl->bpl_lock);
+ while ((bpe = list_head(&bpl->bpl_list))) {
+ bplist_iterate_last_removed = bpe;
+ list_remove(&bpl->bpl_list, bpe);
+ mutex_exit(&bpl->bpl_lock);
+ func(arg, &bpe->bpe_blk, tx);
+ kmem_free(bpe, sizeof (*bpe));
+ mutex_enter(&bpl->bpl_lock);
+ }
+ mutex_exit(&bpl->bpl_lock);
+}
+
+void
+bplist_clear(bplist_t *bpl)
+{
+ bplist_entry_t *bpe;
+
+ mutex_enter(&bpl->bpl_lock);
+ while ((bpe = list_head(&bpl->bpl_list))) {
+ bplist_iterate_last_removed = bpe;
+ list_remove(&bpl->bpl_list, bpe);
+ kmem_free(bpe, sizeof (*bpe));
+ }
+ mutex_exit(&bpl->bpl_lock);
+}
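bplist is the simplest structure in this group: an in-memory, mutex-protected list of block pointers drained through a caller-supplied callback, with the lock dropped around each call. Below is a minimal, hypothetical usage sketch (not part of the patch); the callback and counter are invented, and it assumes the bplist_itor_t callback shape from bplist.h, i.e. (void *arg, const blkptr_t *bp, dmu_tx_t *tx) returning int.

/* Hypothetical consumer that counts the block pointers queued on a bplist. */
static int
count_bp_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
{
        uint64_t *countp = arg;

        (*countp)++;
        return (0);
}

static uint64_t
bplist_count_example(const blkptr_t *bp)
{
        bplist_t bpl;
        uint64_t count = 0;

        bplist_create(&bpl);
        bplist_append(&bpl, bp);
        /* This callback frees nothing, so no transaction is required. */
        bplist_iterate(&bpl, count_bp_cb, &count, NULL);
        bplist_destroy(&bpl);
        return (count);
}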
diff --git a/sys/contrib/openzfs/module/zfs/bpobj.c b/sys/contrib/openzfs/module/zfs/bpobj.c
new file mode 100644
index 000000000000..e75ba5cccde6
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/bpobj.c
@@ -0,0 +1,943 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2017 Datto Inc.
+ */
+
+#include <sys/bpobj.h>
+#include <sys/zfs_context.h>
+#include <sys/zfs_refcount.h>
+#include <sys/dsl_pool.h>
+#include <sys/zfeature.h>
+#include <sys/zap.h>
+
+/*
+ * Return an empty bpobj, preferably the empty dummy one (dp_empty_bpobj).
+ */
+uint64_t
+bpobj_alloc_empty(objset_t *os, int blocksize, dmu_tx_t *tx)
+{
+ spa_t *spa = dmu_objset_spa(os);
+ dsl_pool_t *dp = dmu_objset_pool(os);
+
+ if (spa_feature_is_enabled(spa, SPA_FEATURE_EMPTY_BPOBJ)) {
+ if (!spa_feature_is_active(spa, SPA_FEATURE_EMPTY_BPOBJ)) {
+ ASSERT0(dp->dp_empty_bpobj);
+ dp->dp_empty_bpobj =
+ bpobj_alloc(os, SPA_OLD_MAXBLOCKSIZE, tx);
+ VERIFY(zap_add(os,
+ DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_EMPTY_BPOBJ, sizeof (uint64_t), 1,
+ &dp->dp_empty_bpobj, tx) == 0);
+ }
+ spa_feature_incr(spa, SPA_FEATURE_EMPTY_BPOBJ, tx);
+ ASSERT(dp->dp_empty_bpobj != 0);
+ return (dp->dp_empty_bpobj);
+ } else {
+ return (bpobj_alloc(os, blocksize, tx));
+ }
+}
+
+void
+bpobj_decr_empty(objset_t *os, dmu_tx_t *tx)
+{
+ dsl_pool_t *dp = dmu_objset_pool(os);
+
+ spa_feature_decr(dmu_objset_spa(os), SPA_FEATURE_EMPTY_BPOBJ, tx);
+ if (!spa_feature_is_active(dmu_objset_spa(os),
+ SPA_FEATURE_EMPTY_BPOBJ)) {
+ VERIFY3U(0, ==, zap_remove(dp->dp_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_EMPTY_BPOBJ, tx));
+ VERIFY3U(0, ==, dmu_object_free(os, dp->dp_empty_bpobj, tx));
+ dp->dp_empty_bpobj = 0;
+ }
+}
+
+uint64_t
+bpobj_alloc(objset_t *os, int blocksize, dmu_tx_t *tx)
+{
+ int size;
+
+ if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_BPOBJ_ACCOUNT)
+ size = BPOBJ_SIZE_V0;
+ else if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_DEADLISTS)
+ size = BPOBJ_SIZE_V1;
+ else if (!spa_feature_is_active(dmu_objset_spa(os),
+ SPA_FEATURE_LIVELIST))
+ size = BPOBJ_SIZE_V2;
+ else
+ size = sizeof (bpobj_phys_t);
+
+ return (dmu_object_alloc(os, DMU_OT_BPOBJ, blocksize,
+ DMU_OT_BPOBJ_HDR, size, tx));
+}
+
+void
+bpobj_free(objset_t *os, uint64_t obj, dmu_tx_t *tx)
+{
+ int64_t i;
+ bpobj_t bpo;
+ dmu_object_info_t doi;
+ int epb;
+ dmu_buf_t *dbuf = NULL;
+
+ ASSERT(obj != dmu_objset_pool(os)->dp_empty_bpobj);
+ VERIFY3U(0, ==, bpobj_open(&bpo, os, obj));
+
+ mutex_enter(&bpo.bpo_lock);
+
+ if (!bpo.bpo_havesubobj || bpo.bpo_phys->bpo_subobjs == 0)
+ goto out;
+
+ VERIFY3U(0, ==, dmu_object_info(os, bpo.bpo_phys->bpo_subobjs, &doi));
+ epb = doi.doi_data_block_size / sizeof (uint64_t);
+
+ for (i = bpo.bpo_phys->bpo_num_subobjs - 1; i >= 0; i--) {
+ uint64_t *objarray;
+ uint64_t offset, blkoff;
+
+ offset = i * sizeof (uint64_t);
+ blkoff = P2PHASE(i, epb);
+
+ if (dbuf == NULL || dbuf->db_offset > offset) {
+ if (dbuf)
+ dmu_buf_rele(dbuf, FTAG);
+ VERIFY3U(0, ==, dmu_buf_hold(os,
+ bpo.bpo_phys->bpo_subobjs, offset, FTAG, &dbuf, 0));
+ }
+
+ ASSERT3U(offset, >=, dbuf->db_offset);
+ ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size);
+
+ objarray = dbuf->db_data;
+ bpobj_free(os, objarray[blkoff], tx);
+ }
+ if (dbuf) {
+ dmu_buf_rele(dbuf, FTAG);
+ dbuf = NULL;
+ }
+ VERIFY3U(0, ==, dmu_object_free(os, bpo.bpo_phys->bpo_subobjs, tx));
+
+out:
+ mutex_exit(&bpo.bpo_lock);
+ bpobj_close(&bpo);
+
+ VERIFY3U(0, ==, dmu_object_free(os, obj, tx));
+}
+
+int
+bpobj_open(bpobj_t *bpo, objset_t *os, uint64_t object)
+{
+ dmu_object_info_t doi;
+ int err;
+
+ err = dmu_object_info(os, object, &doi);
+ if (err)
+ return (err);
+
+ bzero(bpo, sizeof (*bpo));
+ mutex_init(&bpo->bpo_lock, NULL, MUTEX_DEFAULT, NULL);
+
+ ASSERT(bpo->bpo_dbuf == NULL);
+ ASSERT(bpo->bpo_phys == NULL);
+ ASSERT(object != 0);
+ ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ);
+ ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_BPOBJ_HDR);
+
+ err = dmu_bonus_hold(os, object, bpo, &bpo->bpo_dbuf);
+ if (err)
+ return (err);
+
+ bpo->bpo_os = os;
+ bpo->bpo_object = object;
+ bpo->bpo_epb = doi.doi_data_block_size >> SPA_BLKPTRSHIFT;
+ bpo->bpo_havecomp = (doi.doi_bonus_size > BPOBJ_SIZE_V0);
+ bpo->bpo_havesubobj = (doi.doi_bonus_size > BPOBJ_SIZE_V1);
+ bpo->bpo_havefreed = (doi.doi_bonus_size > BPOBJ_SIZE_V2);
+ bpo->bpo_phys = bpo->bpo_dbuf->db_data;
+ return (0);
+}
+
+boolean_t
+bpobj_is_open(const bpobj_t *bpo)
+{
+ return (bpo->bpo_object != 0);
+}
+
+void
+bpobj_close(bpobj_t *bpo)
+{
+ /* Lame workaround for closing a bpobj that was never opened. */
+ if (bpo->bpo_object == 0)
+ return;
+
+ dmu_buf_rele(bpo->bpo_dbuf, bpo);
+ if (bpo->bpo_cached_dbuf != NULL)
+ dmu_buf_rele(bpo->bpo_cached_dbuf, bpo);
+ bpo->bpo_dbuf = NULL;
+ bpo->bpo_phys = NULL;
+ bpo->bpo_cached_dbuf = NULL;
+ bpo->bpo_object = 0;
+
+ mutex_destroy(&bpo->bpo_lock);
+}
+
+static boolean_t
+bpobj_is_empty_impl(bpobj_t *bpo)
+{
+ ASSERT(MUTEX_HELD(&bpo->bpo_lock));
+ return (bpo->bpo_phys->bpo_num_blkptrs == 0 &&
+ (!bpo->bpo_havesubobj || bpo->bpo_phys->bpo_num_subobjs == 0));
+}
+
+boolean_t
+bpobj_is_empty(bpobj_t *bpo)
+{
+ mutex_enter(&bpo->bpo_lock);
+ boolean_t is_empty = bpobj_is_empty_impl(bpo);
+ mutex_exit(&bpo->bpo_lock);
+ return (is_empty);
+}
+
+/*
+ * A recursive iteration of the bpobjs would be nice here but we run the risk
+ * of overflowing function stack space. Instead, find each subobj and add it
+ * to the head of our list so it can be scanned for subobjs. Like a
+ * recursive implementation, the "deepest" subobjs will be freed first.
+ * When a subobj is found to have no additional subobjs, free it.
+ */
+typedef struct bpobj_info {
+ bpobj_t *bpi_bpo;
+ /*
+ * This object is a subobj of bpi_parent,
+ * at bpi_index in its subobj array.
+ */
+ struct bpobj_info *bpi_parent;
+ uint64_t bpi_index;
+ /* How many of our subobj's are left to process. */
+ uint64_t bpi_unprocessed_subobjs;
+ /* True after having visited this bpo's directly referenced BPs. */
+ boolean_t bpi_visited;
+ list_node_t bpi_node;
+} bpobj_info_t;
+
+static bpobj_info_t *
+bpi_alloc(bpobj_t *bpo, bpobj_info_t *parent, uint64_t index)
+{
+ bpobj_info_t *bpi = kmem_zalloc(sizeof (bpobj_info_t), KM_SLEEP);
+ bpi->bpi_bpo = bpo;
+ bpi->bpi_parent = parent;
+ bpi->bpi_index = index;
+ if (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_subobjs != 0) {
+ bpi->bpi_unprocessed_subobjs = bpo->bpo_phys->bpo_num_subobjs;
+ }
+ return (bpi);
+}
+
+/*
+ * Update bpobj and all of its parents with new space accounting.
+ */
+static void
+propagate_space_reduction(bpobj_info_t *bpi, int64_t freed,
+ int64_t comp_freed, int64_t uncomp_freed, dmu_tx_t *tx)
+{
+ for (; bpi != NULL; bpi = bpi->bpi_parent) {
+ bpobj_t *p = bpi->bpi_bpo;
+ ASSERT(dmu_buf_is_dirty(p->bpo_dbuf, tx));
+ p->bpo_phys->bpo_bytes -= freed;
+ ASSERT3S(p->bpo_phys->bpo_bytes, >=, 0);
+ if (p->bpo_havecomp) {
+ p->bpo_phys->bpo_comp -= comp_freed;
+ p->bpo_phys->bpo_uncomp -= uncomp_freed;
+ }
+ }
+}
+
+static int
+bpobj_iterate_blkptrs(bpobj_info_t *bpi, bpobj_itor_t func, void *arg,
+ int64_t start, dmu_tx_t *tx, boolean_t free)
+{
+ int err = 0;
+ int64_t freed = 0, comp_freed = 0, uncomp_freed = 0;
+ dmu_buf_t *dbuf = NULL;
+ bpobj_t *bpo = bpi->bpi_bpo;
+
+ for (int64_t i = bpo->bpo_phys->bpo_num_blkptrs - 1; i >= start; i--) {
+ uint64_t offset = i * sizeof (blkptr_t);
+ uint64_t blkoff = P2PHASE(i, bpo->bpo_epb);
+
+ if (dbuf == NULL || dbuf->db_offset > offset) {
+ if (dbuf)
+ dmu_buf_rele(dbuf, FTAG);
+ err = dmu_buf_hold(bpo->bpo_os, bpo->bpo_object,
+ offset, FTAG, &dbuf, 0);
+ if (err)
+ break;
+ }
+
+ ASSERT3U(offset, >=, dbuf->db_offset);
+ ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size);
+
+ blkptr_t *bparray = dbuf->db_data;
+ blkptr_t *bp = &bparray[blkoff];
+
+ boolean_t bp_freed = BP_GET_FREE(bp);
+ err = func(arg, bp, bp_freed, tx);
+ if (err)
+ break;
+
+ if (free) {
+ int sign = bp_freed ? -1 : +1;
+ spa_t *spa = dmu_objset_spa(bpo->bpo_os);
+ freed += sign * bp_get_dsize_sync(spa, bp);
+ comp_freed += sign * BP_GET_PSIZE(bp);
+ uncomp_freed += sign * BP_GET_UCSIZE(bp);
+ ASSERT(dmu_buf_is_dirty(bpo->bpo_dbuf, tx));
+ bpo->bpo_phys->bpo_num_blkptrs--;
+ ASSERT3S(bpo->bpo_phys->bpo_num_blkptrs, >=, 0);
+ if (bp_freed) {
+ ASSERT(bpo->bpo_havefreed);
+ bpo->bpo_phys->bpo_num_freed--;
+ ASSERT3S(bpo->bpo_phys->bpo_num_freed, >=, 0);
+ }
+ }
+ }
+ if (free) {
+ propagate_space_reduction(bpi, freed, comp_freed,
+ uncomp_freed, tx);
+ VERIFY0(dmu_free_range(bpo->bpo_os,
+ bpo->bpo_object,
+ bpo->bpo_phys->bpo_num_blkptrs * sizeof (blkptr_t),
+ DMU_OBJECT_END, tx));
+ }
+ if (dbuf) {
+ dmu_buf_rele(dbuf, FTAG);
+ dbuf = NULL;
+ }
+ return (err);
+}
+
+/*
+ * Given an initial bpo, start by freeing the BPs that are directly referenced
+ * by that bpo. If the bpo has subobjs, read in its last subobj and push the
+ * subobj to our stack. By popping items off our stack, eventually we will
+ * encounter a bpo that has no subobjs. We can free its bpobj_info_t, and if
+ * requested also free the now-empty bpo from disk and decrement
+ * its parent's subobj count. We continue popping each subobj from our stack,
+ * visiting its last subobj until it too has no more subobjs, and so on.
+ */
+static int
+bpobj_iterate_impl(bpobj_t *initial_bpo, bpobj_itor_t func, void *arg,
+ dmu_tx_t *tx, boolean_t free, uint64_t *bpobj_size)
+{
+ list_t stack;
+ bpobj_info_t *bpi;
+ int err = 0;
+
+ /*
+ * Create a "stack" for us to work with without worrying about
+ * stack overflows. Initialize it with the initial_bpo.
+ */
+ list_create(&stack, sizeof (bpobj_info_t),
+ offsetof(bpobj_info_t, bpi_node));
+ mutex_enter(&initial_bpo->bpo_lock);
+
+ if (bpobj_size != NULL)
+ *bpobj_size = initial_bpo->bpo_phys->bpo_num_blkptrs;
+
+ list_insert_head(&stack, bpi_alloc(initial_bpo, NULL, 0));
+
+ while ((bpi = list_head(&stack)) != NULL) {
+ bpobj_t *bpo = bpi->bpi_bpo;
+
+ ASSERT3P(bpo, !=, NULL);
+ ASSERT(MUTEX_HELD(&bpo->bpo_lock));
+ ASSERT(bpobj_is_open(bpo));
+
+ if (free)
+ dmu_buf_will_dirty(bpo->bpo_dbuf, tx);
+
+ if (bpi->bpi_visited == B_FALSE) {
+ err = bpobj_iterate_blkptrs(bpi, func, arg, 0, tx,
+ free);
+ bpi->bpi_visited = B_TRUE;
+ if (err != 0)
+ break;
+ }
+ /*
+ * We've finished with this bpo's directly-referenced BP's and
+ * it has no more unprocessed subobjs. We can free its
+ * bpobj_info_t (unless it is the topmost, initial_bpo).
+ * If we are freeing from disk, we can also do that.
+ */
+ if (bpi->bpi_unprocessed_subobjs == 0) {
+ /*
+ * If there are no entries, there should
+ * be no bytes.
+ */
+ if (bpobj_is_empty_impl(bpo)) {
+ ASSERT0(bpo->bpo_phys->bpo_bytes);
+ ASSERT0(bpo->bpo_phys->bpo_comp);
+ ASSERT0(bpo->bpo_phys->bpo_uncomp);
+ }
+
+ /* The initial_bpo has no parent and is not closed. */
+ if (bpi->bpi_parent != NULL) {
+ if (free) {
+ bpobj_t *p = bpi->bpi_parent->bpi_bpo;
+
+ ASSERT0(bpo->bpo_phys->bpo_num_blkptrs);
+ ASSERT3U(p->bpo_phys->bpo_num_subobjs,
+ >, 0);
+ ASSERT3U(bpi->bpi_index, ==,
+ p->bpo_phys->bpo_num_subobjs - 1);
+ ASSERT(dmu_buf_is_dirty(bpo->bpo_dbuf,
+ tx));
+
+ p->bpo_phys->bpo_num_subobjs--;
+
+ VERIFY0(dmu_free_range(p->bpo_os,
+ p->bpo_phys->bpo_subobjs,
+ bpi->bpi_index * sizeof (uint64_t),
+ sizeof (uint64_t), tx));
+
+ /* eliminate the empty subobj list */
+ if (bpo->bpo_havesubobj &&
+ bpo->bpo_phys->bpo_subobjs != 0) {
+ ASSERT0(bpo->bpo_phys->
+ bpo_num_subobjs);
+ err = dmu_object_free(
+ bpo->bpo_os,
+ bpo->bpo_phys->bpo_subobjs,
+ tx);
+ if (err)
+ break;
+ bpo->bpo_phys->bpo_subobjs = 0;
+ }
+ err = dmu_object_free(p->bpo_os,
+ bpo->bpo_object, tx);
+ if (err)
+ break;
+ }
+
+ mutex_exit(&bpo->bpo_lock);
+ bpobj_close(bpo);
+ kmem_free(bpo, sizeof (bpobj_t));
+ } else {
+ mutex_exit(&bpo->bpo_lock);
+ }
+
+ /*
+ * Finished processing this bpo. Unlock, and free
+ * our "stack" info.
+ */
+ list_remove_head(&stack);
+ kmem_free(bpi, sizeof (bpobj_info_t));
+ } else {
+ /*
+ * We have unprocessed subobjs. Process the next one.
+ */
+ ASSERT(bpo->bpo_havecomp);
+ ASSERT3P(bpobj_size, ==, NULL);
+
+ /* Add the last subobj to stack. */
+ int64_t i = bpi->bpi_unprocessed_subobjs - 1;
+ uint64_t offset = i * sizeof (uint64_t);
+
+ uint64_t obj_from_sublist;
+ err = dmu_read(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
+ offset, sizeof (uint64_t), &obj_from_sublist,
+ DMU_READ_PREFETCH);
+ if (err)
+ break;
+ bpobj_t *sublist = kmem_alloc(sizeof (bpobj_t),
+ KM_SLEEP);
+
+ err = bpobj_open(sublist, bpo->bpo_os,
+ obj_from_sublist);
+ if (err)
+ break;
+
+ list_insert_head(&stack, bpi_alloc(sublist, bpi, i));
+ mutex_enter(&sublist->bpo_lock);
+ bpi->bpi_unprocessed_subobjs--;
+ }
+ }
+ /*
+ * Cleanup anything left on the "stack" after we left the loop.
+ * Every bpo on the stack is locked so we must remember to undo
+ * that now (in LIFO order).
+ */
+ while ((bpi = list_remove_head(&stack)) != NULL) {
+ bpobj_t *bpo = bpi->bpi_bpo;
+ ASSERT(err != 0);
+ ASSERT3P(bpo, !=, NULL);
+
+ mutex_exit(&bpo->bpo_lock);
+
+ /* do not free the initial_bpo */
+ if (bpi->bpi_parent != NULL) {
+ bpobj_close(bpi->bpi_bpo);
+ kmem_free(bpi->bpi_bpo, sizeof (bpobj_t));
+ }
+ kmem_free(bpi, sizeof (bpobj_info_t));
+ }
+
+ list_destroy(&stack);
+
+ return (err);
+}
+
+/*
+ * Iterate and remove the entries. If func returns nonzero, iteration
+ * will stop and that entry will not be removed.
+ */
+int
+bpobj_iterate(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx)
+{
+ return (bpobj_iterate_impl(bpo, func, arg, tx, B_TRUE, NULL));
+}
+
+/*
+ * Iterate the entries. If func returns nonzero, iteration will stop.
+ *
+ * If there are no subobjs:
+ *
+ * *bpobj_size can be used to return the number of block pointers in the
+ * bpobj. Note that this may be different from the number of block pointers
+ * that are iterated over, if iteration is terminated early (e.g. by the func
+ * returning nonzero).
+ *
+ * If there are concurrent (or subsequent) modifications to the bpobj then the
+ * returned *bpobj_size can be passed as "start" to
+ * livelist_bpobj_iterate_from_nofree() to iterate the newly added entries.
+ */
+int
+bpobj_iterate_nofree(bpobj_t *bpo, bpobj_itor_t func, void *arg,
+ uint64_t *bpobj_size)
+{
+ return (bpobj_iterate_impl(bpo, func, arg, NULL, B_FALSE, bpobj_size));
+}
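As a concrete illustration of the read-only interface just described, the hypothetical callback below (not part of the patch; all names invented) counts the entries born after a cutoff txg in a bpobj with no subobjs, such as a livelist, and captures the entry count at the start of iteration so a later pass could resume from that index via livelist_bpobj_iterate_from_nofree(). space_range_cb() further down in this file is the in-tree callback following the same pattern.

/* Hypothetical read-only iteration: count entries born after a cutoff txg. */
typedef struct count_after_arg {
        uint64_t caa_cutoff_txg;
        uint64_t caa_count;
} count_after_arg_t;

static int
count_after_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx)
{
        count_after_arg_t *caa = arg;

        if (!bp_freed && bp->blk_birth > caa->caa_cutoff_txg)
                caa->caa_count++;
        return (0);
}

static int
bpobj_count_after_example(bpobj_t *bpo, uint64_t cutoff_txg,
    uint64_t *countp, uint64_t *size_at_startp)
{
        count_after_arg_t caa = { .caa_cutoff_txg = cutoff_txg };
        int err;

        err = bpobj_iterate_nofree(bpo, count_after_cb, &caa, size_at_startp);
        *countp = caa.caa_count;
        return (err);
}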
+
+/*
+ * Iterate over the blkptrs in the bpobj beginning at index start. If func
+ * returns nonzero, iteration will stop. This is a livelist specific function
+ * since it assumes that there are no subobjs present.
+ */
+int
+livelist_bpobj_iterate_from_nofree(bpobj_t *bpo, bpobj_itor_t func, void *arg,
+ int64_t start)
+{
+ if (bpo->bpo_havesubobj)
+ VERIFY0(bpo->bpo_phys->bpo_subobjs);
+ bpobj_info_t *bpi = bpi_alloc(bpo, NULL, 0);
+ int err = bpobj_iterate_blkptrs(bpi, func, arg, start, NULL, B_FALSE);
+ kmem_free(bpi, sizeof (bpobj_info_t));
+ return (err);
+}
+
+/*
+ * Logically add subobj's contents to the parent bpobj.
+ *
+ * In the most general case, this is accomplished in constant time by adding
+ * a reference to subobj. This case is used when enqueuing a large subobj:
+ * +--------------+ +--------------+
+ * | bpobj |----------------------->| subobj list |
+ * +----+----+----+----+----+ +-----+-----+--+--+
+ * | bp | bp | bp | bp | bp | | obj | obj | obj |
+ * +----+----+----+----+----+ +-----+-----+-----+
+ *
+ * +--------------+ +--------------+
+ * | sub-bpobj |----------------------> | subsubobj |
+ * +----+----+----+----+---------+----+ +-----+-----+--+--------+-----+
+ * | bp | bp | bp | bp | ... | bp | | obj | obj | ... | obj |
+ * +----+----+----+----+---------+----+ +-----+-----+-----------+-----+
+ *
+ * Result: sub-bpobj added to parent's subobj list.
+ * +--------------+ +--------------+
+ * | bpobj |----------------------->| subobj list |
+ * +----+----+----+----+----+ +-----+-----+--+--+-----+
+ * | bp | bp | bp | bp | bp | | obj | obj | obj | OBJ |
+ * +----+----+----+----+----+ +-----+-----+-----+--|--+
+ * |
+ * /-----------------------------------------------------/
+ * v
+ * +--------------+ +--------------+
+ * | sub-bpobj |----------------------> | subsubobj |
+ * +----+----+----+----+---------+----+ +-----+-----+--+--------+-----+
+ * | bp | bp | bp | bp | ... | bp | | obj | obj | ... | obj |
+ * +----+----+----+----+---------+----+ +-----+-----+-----------+-----+
+ *
+ *
+ * In a common case, the subobj is small: its bp's and its list of subobj's
+ * are each stored in a single block. In this case we copy the subobj's
+ * contents to the parent:
+ * +--------------+ +--------------+
+ * | bpobj |----------------------->| subobj list |
+ * +----+----+----+----+----+ +-----+-----+--+--+
+ * | bp | bp | bp | bp | bp | | obj | obj | obj |
+ * +----+----+----+----+----+ +-----+-----+-----+
+ * ^ ^
+ * +--------------+ | +--------------+ |
+ * | sub-bpobj |---------^------------> | subsubobj | ^
+ * +----+----+----+ | +-----+-----+--+ |
+ * | BP | BP |-->-->-->-->-/ | OBJ | OBJ |-->-/
+ * +----+----+ +-----+-----+
+ *
+ * Result: subobj destroyed, contents copied to parent:
+ * +--------------+ +--------------+
+ * | bpobj |----------------------->| subobj list |
+ * +----+----+----+----+----+----+----+ +-----+-----+--+--+-----+-----+
+ * | bp | bp | bp | bp | bp | BP | BP | | obj | obj | obj | OBJ | OBJ |
+ * +----+----+----+----+----+----+----+ +-----+-----+-----+-----+-----+
+ *
+ *
+ * If the subobj has many BP's but few subobj's, we can copy the sub-subobj's
+ * but retain the sub-bpobj:
+ * +--------------+ +--------------+
+ * | bpobj |----------------------->| subobj list |
+ * +----+----+----+----+----+ +-----+-----+--+--+
+ * | bp | bp | bp | bp | bp | | obj | obj | obj |
+ * +----+----+----+----+----+ +-----+-----+-----+
+ * ^
+ * +--------------+ +--------------+ |
+ * | sub-bpobj |----------------------> | subsubobj | ^
+ * +----+----+----+----+---------+----+ +-----+-----+--+ |
+ * | bp | bp | bp | bp | ... | bp | | OBJ | OBJ |-->-/
+ * +----+----+----+----+---------+----+ +-----+-----+
+ *
+ * Result: sub-sub-bpobjs and subobj added to parent's subobj list.
+ * +--------------+ +--------------+
+ * | bpobj |-------------------->| subobj list |
+ * +----+----+----+----+----+ +-----+-----+--+--+-----+-----+------+
+ * | bp | bp | bp | bp | bp | | obj | obj | obj | OBJ | OBJ | OBJ* |
+ * +----+----+----+----+----+ +-----+-----+-----+-----+-----+--|---+
+ * |
+ * /--------------------------------------------------------------/
+ * v
+ * +--------------+
+ * | sub-bpobj |
+ * +----+----+----+----+---------+----+
+ * | bp | bp | bp | bp | ... | bp |
+ * +----+----+----+----+---------+----+
+ */
+void
+bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx)
+{
+ bpobj_t subbpo;
+ uint64_t used, comp, uncomp, subsubobjs;
+ boolean_t copy_subsub = B_TRUE;
+ boolean_t copy_bps = B_TRUE;
+
+ ASSERT(bpobj_is_open(bpo));
+ ASSERT(subobj != 0);
+ ASSERT(bpo->bpo_havesubobj);
+ ASSERT(bpo->bpo_havecomp);
+ ASSERT(bpo->bpo_object != dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj);
+
+ if (subobj == dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj) {
+ bpobj_decr_empty(bpo->bpo_os, tx);
+ return;
+ }
+
+ VERIFY3U(0, ==, bpobj_open(&subbpo, bpo->bpo_os, subobj));
+ VERIFY3U(0, ==, bpobj_space(&subbpo, &used, &comp, &uncomp));
+
+ if (bpobj_is_empty(&subbpo)) {
+ /* No point in having an empty subobj. */
+ bpobj_close(&subbpo);
+ bpobj_free(bpo->bpo_os, subobj, tx);
+ return;
+ }
+
+ mutex_enter(&bpo->bpo_lock);
+ dmu_buf_will_dirty(bpo->bpo_dbuf, tx);
+
+ dmu_object_info_t doi;
+
+ if (bpo->bpo_phys->bpo_subobjs != 0) {
+ ASSERT0(dmu_object_info(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
+ &doi));
+ ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ_SUBOBJ);
+ }
+
+ /*
+ * If subobj has only one block of subobjs, then move subobj's
+ * subobjs to bpo's subobj list directly. This reduces recursion in
+ * bpobj_iterate due to nested subobjs.
+ */
+ subsubobjs = subbpo.bpo_phys->bpo_subobjs;
+ if (subsubobjs != 0) {
+ VERIFY0(dmu_object_info(bpo->bpo_os, subsubobjs, &doi));
+ if (doi.doi_max_offset > doi.doi_data_block_size) {
+ copy_subsub = B_FALSE;
+ }
+ }
+
+ /*
+ * If, in addition to having only one block of subobj's, subobj has
+ * only one block of bp's, then move subobj's bp's to bpo's bp list
+ * directly. This reduces recursion in bpobj_iterate due to nested
+ * subobjs.
+ */
+ VERIFY3U(0, ==, dmu_object_info(bpo->bpo_os, subobj, &doi));
+ if (doi.doi_max_offset > doi.doi_data_block_size || !copy_subsub) {
+ copy_bps = B_FALSE;
+ }
+
+ if (copy_subsub && subsubobjs != 0) {
+ dmu_buf_t *subdb;
+ uint64_t numsubsub = subbpo.bpo_phys->bpo_num_subobjs;
+
+ VERIFY0(dmu_buf_hold(bpo->bpo_os, subsubobjs,
+ 0, FTAG, &subdb, 0));
+ /*
+ * Make sure that we are not asking dmu_write()
+ * to write more data than we have in our buffer.
+ */
+ VERIFY3U(subdb->db_size, >=,
+ numsubsub * sizeof (subobj));
+ if (bpo->bpo_phys->bpo_subobjs == 0) {
+ bpo->bpo_phys->bpo_subobjs =
+ dmu_object_alloc(bpo->bpo_os,
+ DMU_OT_BPOBJ_SUBOBJ, SPA_OLD_MAXBLOCKSIZE,
+ DMU_OT_NONE, 0, tx);
+ }
+ dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
+ bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj),
+ numsubsub * sizeof (subobj), subdb->db_data, tx);
+ dmu_buf_rele(subdb, FTAG);
+ bpo->bpo_phys->bpo_num_subobjs += numsubsub;
+
+ dmu_buf_will_dirty(subbpo.bpo_dbuf, tx);
+ subbpo.bpo_phys->bpo_subobjs = 0;
+ VERIFY0(dmu_object_free(bpo->bpo_os, subsubobjs, tx));
+ }
+
+ if (copy_bps) {
+ dmu_buf_t *bps;
+ uint64_t numbps = subbpo.bpo_phys->bpo_num_blkptrs;
+
+ ASSERT(copy_subsub);
+ VERIFY0(dmu_buf_hold(bpo->bpo_os, subobj,
+ 0, FTAG, &bps, 0));
+
+ /*
+ * Make sure that we are not asking dmu_write()
+ * to write more data than we have in our buffer.
+ */
+ VERIFY3U(bps->db_size, >=, numbps * sizeof (blkptr_t));
+ dmu_write(bpo->bpo_os, bpo->bpo_object,
+ bpo->bpo_phys->bpo_num_blkptrs * sizeof (blkptr_t),
+ numbps * sizeof (blkptr_t),
+ bps->db_data, tx);
+ dmu_buf_rele(bps, FTAG);
+ bpo->bpo_phys->bpo_num_blkptrs += numbps;
+
+ bpobj_close(&subbpo);
+ VERIFY0(dmu_object_free(bpo->bpo_os, subobj, tx));
+ } else {
+ bpobj_close(&subbpo);
+ if (bpo->bpo_phys->bpo_subobjs == 0) {
+ bpo->bpo_phys->bpo_subobjs =
+ dmu_object_alloc(bpo->bpo_os,
+ DMU_OT_BPOBJ_SUBOBJ, SPA_OLD_MAXBLOCKSIZE,
+ DMU_OT_NONE, 0, tx);
+ }
+
+ dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
+ bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj),
+ sizeof (subobj), &subobj, tx);
+ bpo->bpo_phys->bpo_num_subobjs++;
+ }
+
+ bpo->bpo_phys->bpo_bytes += used;
+ bpo->bpo_phys->bpo_comp += comp;
+ bpo->bpo_phys->bpo_uncomp += uncomp;
+ mutex_exit(&bpo->bpo_lock);
+}
+
+void
+bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, boolean_t bp_freed,
+ dmu_tx_t *tx)
+{
+ blkptr_t stored_bp = *bp;
+ uint64_t offset;
+ int blkoff;
+ blkptr_t *bparray;
+
+ ASSERT(bpobj_is_open(bpo));
+ ASSERT(!BP_IS_HOLE(bp));
+ ASSERT(bpo->bpo_object != dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj);
+
+ if (BP_IS_EMBEDDED(bp)) {
+ /*
+ * The bpobj will compress better without the payload.
+ *
+ * Note that we store EMBEDDED bp's because they have an
+ * uncompressed size, which must be accounted for. An
+ * alternative would be to add their size to bpo_uncomp
+ * without storing the bp, but that would create additional
+ * complications: bpo_uncomp would be inconsistent with the
+ * set of BP's stored, and bpobj_iterate() wouldn't visit
+ * all the space accounted for in the bpobj.
+ */
+ bzero(&stored_bp, sizeof (stored_bp));
+ stored_bp.blk_prop = bp->blk_prop;
+ stored_bp.blk_birth = bp->blk_birth;
+ } else if (!BP_GET_DEDUP(bp)) {
+ /* The bpobj will compress better without the checksum */
+ bzero(&stored_bp.blk_cksum, sizeof (stored_bp.blk_cksum));
+ }
+
+ stored_bp.blk_fill = 0;
+ BP_SET_FREE(&stored_bp, bp_freed);
+
+ mutex_enter(&bpo->bpo_lock);
+
+ offset = bpo->bpo_phys->bpo_num_blkptrs * sizeof (stored_bp);
+ blkoff = P2PHASE(bpo->bpo_phys->bpo_num_blkptrs, bpo->bpo_epb);
+
+ if (bpo->bpo_cached_dbuf == NULL ||
+ offset < bpo->bpo_cached_dbuf->db_offset ||
+ offset >= bpo->bpo_cached_dbuf->db_offset +
+ bpo->bpo_cached_dbuf->db_size) {
+ if (bpo->bpo_cached_dbuf)
+ dmu_buf_rele(bpo->bpo_cached_dbuf, bpo);
+ VERIFY3U(0, ==, dmu_buf_hold(bpo->bpo_os, bpo->bpo_object,
+ offset, bpo, &bpo->bpo_cached_dbuf, 0));
+ }
+
+ dmu_buf_will_dirty(bpo->bpo_cached_dbuf, tx);
+ bparray = bpo->bpo_cached_dbuf->db_data;
+ bparray[blkoff] = stored_bp;
+
+ dmu_buf_will_dirty(bpo->bpo_dbuf, tx);
+ bpo->bpo_phys->bpo_num_blkptrs++;
+ int sign = bp_freed ? -1 : +1;
+ bpo->bpo_phys->bpo_bytes += sign *
+ bp_get_dsize_sync(dmu_objset_spa(bpo->bpo_os), bp);
+ if (bpo->bpo_havecomp) {
+ bpo->bpo_phys->bpo_comp += sign * BP_GET_PSIZE(bp);
+ bpo->bpo_phys->bpo_uncomp += sign * BP_GET_UCSIZE(bp);
+ }
+ if (bp_freed) {
+ ASSERT(bpo->bpo_havefreed);
+ bpo->bpo_phys->bpo_num_freed++;
+ }
+ mutex_exit(&bpo->bpo_lock);
+}
+
+struct space_range_arg {
+ spa_t *spa;
+ uint64_t mintxg;
+ uint64_t maxtxg;
+ uint64_t used;
+ uint64_t comp;
+ uint64_t uncomp;
+};
+
+/* ARGSUSED */
+static int
+space_range_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx)
+{
+ struct space_range_arg *sra = arg;
+
+ if (bp->blk_birth > sra->mintxg && bp->blk_birth <= sra->maxtxg) {
+ if (dsl_pool_sync_context(spa_get_dsl(sra->spa)))
+ sra->used += bp_get_dsize_sync(sra->spa, bp);
+ else
+ sra->used += bp_get_dsize(sra->spa, bp);
+ sra->comp += BP_GET_PSIZE(bp);
+ sra->uncomp += BP_GET_UCSIZE(bp);
+ }
+ return (0);
+}
+
+int
+bpobj_space(bpobj_t *bpo, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
+{
+ ASSERT(bpobj_is_open(bpo));
+ mutex_enter(&bpo->bpo_lock);
+
+ *usedp = bpo->bpo_phys->bpo_bytes;
+ if (bpo->bpo_havecomp) {
+ *compp = bpo->bpo_phys->bpo_comp;
+ *uncompp = bpo->bpo_phys->bpo_uncomp;
+ mutex_exit(&bpo->bpo_lock);
+ return (0);
+ } else {
+ mutex_exit(&bpo->bpo_lock);
+ return (bpobj_space_range(bpo, 0, UINT64_MAX,
+ usedp, compp, uncompp));
+ }
+}
+
+/*
+ * Return the amount of space in the bpobj which is:
+ * mintxg < blk_birth <= maxtxg
+ */
+int
+bpobj_space_range(bpobj_t *bpo, uint64_t mintxg, uint64_t maxtxg,
+ uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
+{
+ struct space_range_arg sra = { 0 };
+ int err;
+
+ ASSERT(bpobj_is_open(bpo));
+
+ /*
+ * As an optimization, if they want the whole txg range, just
+ * get bpo_bytes rather than iterating over the bps.
+ */
+ if (mintxg < TXG_INITIAL && maxtxg == UINT64_MAX && bpo->bpo_havecomp)
+ return (bpobj_space(bpo, usedp, compp, uncompp));
+
+ sra.spa = dmu_objset_spa(bpo->bpo_os);
+ sra.mintxg = mintxg;
+ sra.maxtxg = maxtxg;
+
+ err = bpobj_iterate_nofree(bpo, space_range_cb, &sra, NULL);
+ *usedp = sra.used;
+ *compp = sra.comp;
+ *uncompp = sra.uncomp;
+ return (err);
+}
+
+/*
+ * A bpobj_itor_t to append blkptrs to a bplist. Note that while blkptrs in a
+ * bpobj are designated as free or allocated that information is not preserved
+ * in bplists.
+ */
+/* ARGSUSED */
+int
+bplist_append_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
+ dmu_tx_t *tx)
+{
+ bplist_t *bpl = arg;
+ bplist_append(bpl, bp);
+ return (0);
+}
diff --git a/sys/contrib/openzfs/module/zfs/bptree.c b/sys/contrib/openzfs/module/zfs/bptree.c
new file mode 100644
index 000000000000..1827a3c4e326
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/bptree.c
@@ -0,0 +1,303 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
+ */
+
+#include <sys/arc.h>
+#include <sys/bptree.h>
+#include <sys/dmu.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_tx.h>
+#include <sys/dmu_traverse.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_pool.h>
+#include <sys/dnode.h>
+#include <sys/spa.h>
+
+/*
+ * A bptree is a queue of root block pointers from destroyed datasets. When a
+ * dataset is destroyed its root block pointer is put on the end of the pool's
+ * bptree queue so the dataset's blocks can be freed asynchronously by
+ * dsl_scan_sync. This allows the delete operation to finish without traversing
+ * all the dataset's blocks.
+ *
+ * Note that while bt_begin and bt_end are only ever incremented in this code,
+ * they are effectively reset to 0 every time the entire bptree is freed because
+ * the bptree's object is destroyed and re-created.
+ */
+
+struct bptree_args {
+ bptree_phys_t *ba_phys; /* data in bonus buffer, dirtied if freeing */
+ boolean_t ba_free; /* true if freeing during traversal */
+
+ bptree_itor_t *ba_func; /* function to call for each blockpointer */
+ void *ba_arg; /* caller supplied argument to ba_func */
+ dmu_tx_t *ba_tx; /* caller supplied tx, NULL if not freeing */
+} bptree_args_t;
+
+uint64_t
+bptree_alloc(objset_t *os, dmu_tx_t *tx)
+{
+ uint64_t obj;
+ dmu_buf_t *db;
+ bptree_phys_t *bt;
+
+ obj = dmu_object_alloc(os, DMU_OTN_UINT64_METADATA,
+ SPA_OLD_MAXBLOCKSIZE, DMU_OTN_UINT64_METADATA,
+ sizeof (bptree_phys_t), tx);
+
+ /*
+ * Bonus buffer contents are already initialized to 0, but for
+ * readability we make it explicit.
+ */
+ VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db));
+ dmu_buf_will_dirty(db, tx);
+ bt = db->db_data;
+ bt->bt_begin = 0;
+ bt->bt_end = 0;
+ bt->bt_bytes = 0;
+ bt->bt_comp = 0;
+ bt->bt_uncomp = 0;
+ dmu_buf_rele(db, FTAG);
+
+ return (obj);
+}
+
+int
+bptree_free(objset_t *os, uint64_t obj, dmu_tx_t *tx)
+{
+ dmu_buf_t *db;
+ bptree_phys_t *bt;
+
+ VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db));
+ bt = db->db_data;
+ ASSERT3U(bt->bt_begin, ==, bt->bt_end);
+ ASSERT0(bt->bt_bytes);
+ ASSERT0(bt->bt_comp);
+ ASSERT0(bt->bt_uncomp);
+ dmu_buf_rele(db, FTAG);
+
+ return (dmu_object_free(os, obj, tx));
+}
+
+boolean_t
+bptree_is_empty(objset_t *os, uint64_t obj)
+{
+ dmu_buf_t *db;
+ bptree_phys_t *bt;
+ boolean_t rv;
+
+ VERIFY0(dmu_bonus_hold(os, obj, FTAG, &db));
+ bt = db->db_data;
+ rv = (bt->bt_begin == bt->bt_end);
+ dmu_buf_rele(db, FTAG);
+ return (rv);
+}
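Because bt_begin only catches up with bt_end once every queued root has been fully traversed and freed, a caller that wants to reclaim the object can gate the free on emptiness. A minimal, hypothetical helper (not part of the patch; the name is invented) built from the two routines above:

/* Hypothetical helper: destroy a bptree object only once it is drained. */
static int
bptree_free_if_empty_example(objset_t *os, uint64_t obj, dmu_tx_t *tx)
{
        if (!bptree_is_empty(os, obj))
                return (SET_ERROR(EBUSY));

        /* bptree_free() re-asserts begin == end and zero byte counts. */
        return (bptree_free(os, obj, tx));
}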
+
+void
+bptree_add(objset_t *os, uint64_t obj, blkptr_t *bp, uint64_t birth_txg,
+ uint64_t bytes, uint64_t comp, uint64_t uncomp, dmu_tx_t *tx)
+{
+ dmu_buf_t *db;
+ bptree_phys_t *bt;
+ bptree_entry_phys_t *bte;
+
+ /*
+ * bptree objects are in the pool mos, therefore they can only be
+ * modified in syncing context. Furthermore, this is only modified
+ * by the sync thread, so no locking is necessary.
+ */
+ ASSERT(dmu_tx_is_syncing(tx));
+
+ VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db));
+ bt = db->db_data;
+
+ bte = kmem_zalloc(sizeof (*bte), KM_SLEEP);
+ bte->be_birth_txg = birth_txg;
+ bte->be_bp = *bp;
+ dmu_write(os, obj, bt->bt_end * sizeof (*bte), sizeof (*bte), bte, tx);
+ kmem_free(bte, sizeof (*bte));
+
+ dmu_buf_will_dirty(db, tx);
+ bt->bt_end++;
+ bt->bt_bytes += bytes;
+ bt->bt_comp += comp;
+ bt->bt_uncomp += uncomp;
+ dmu_buf_rele(db, FTAG);
+}
+
+/* ARGSUSED */
+static int
+bptree_visit_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
+ const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
+{
+ int err;
+ struct bptree_args *ba = arg;
+
+ if (zb->zb_level == ZB_DNODE_LEVEL || BP_IS_HOLE(bp) ||
+ BP_IS_REDACTED(bp))
+ return (0);
+
+ err = ba->ba_func(ba->ba_arg, bp, ba->ba_tx);
+ if (err == 0 && ba->ba_free) {
+ ba->ba_phys->bt_bytes -= bp_get_dsize_sync(spa, bp);
+ ba->ba_phys->bt_comp -= BP_GET_PSIZE(bp);
+ ba->ba_phys->bt_uncomp -= BP_GET_UCSIZE(bp);
+ }
+ return (err);
+}
+
+/*
+ * If "free" is set:
+ * - It is assumed that "func" will be freeing the block pointers.
+ * - If "func" returns nonzero, the bookmark will be remembered and
+ * iteration will be restarted from this point on next invocation.
+ * - If an i/o error is encountered (e.g. "func" returns EIO or ECKSUM),
+ * bptree_iterate will remember the bookmark, continue traversing
+ * any additional entries, and return 0.
+ *
+ * If "free" is not set, traversal will stop and return an error if
+ * an i/o error is encountered.
+ *
+ * In either case, if zfs_free_leak_on_eio is set, i/o errors will be
+ * ignored and traversal will continue (i.e. TRAVERSE_HARD will be passed to
+ * traverse_dataset_destroyed()).
+ */
+int
+bptree_iterate(objset_t *os, uint64_t obj, boolean_t free, bptree_itor_t func,
+ void *arg, dmu_tx_t *tx)
+{
+ boolean_t ioerr = B_FALSE;
+ int err;
+ uint64_t i;
+ dmu_buf_t *db;
+ struct bptree_args ba;
+
+ ASSERT(!free || dmu_tx_is_syncing(tx));
+
+ err = dmu_bonus_hold(os, obj, FTAG, &db);
+ if (err != 0)
+ return (err);
+
+ if (free)
+ dmu_buf_will_dirty(db, tx);
+
+ ba.ba_phys = db->db_data;
+ ba.ba_free = free;
+ ba.ba_func = func;
+ ba.ba_arg = arg;
+ ba.ba_tx = tx;
+
+ err = 0;
+ for (i = ba.ba_phys->bt_begin; i < ba.ba_phys->bt_end; i++) {
+ bptree_entry_phys_t bte;
+ int flags = TRAVERSE_PREFETCH_METADATA | TRAVERSE_POST |
+ TRAVERSE_NO_DECRYPT;
+
+ err = dmu_read(os, obj, i * sizeof (bte), sizeof (bte),
+ &bte, DMU_READ_NO_PREFETCH);
+ if (err != 0)
+ break;
+
+ if (zfs_free_leak_on_eio)
+ flags |= TRAVERSE_HARD;
+ zfs_dbgmsg("bptree index %lld: traversing from min_txg=%lld "
+ "bookmark %lld/%lld/%lld/%lld",
+ (longlong_t)i,
+ (longlong_t)bte.be_birth_txg,
+ (longlong_t)bte.be_zb.zb_objset,
+ (longlong_t)bte.be_zb.zb_object,
+ (longlong_t)bte.be_zb.zb_level,
+ (longlong_t)bte.be_zb.zb_blkid);
+ err = traverse_dataset_destroyed(os->os_spa, &bte.be_bp,
+ bte.be_birth_txg, &bte.be_zb, flags,
+ bptree_visit_cb, &ba);
+ if (free) {
+ /*
+ * The callback has freed the visited block pointers.
+ * Record our traversal progress on disk, either by
+ * updating this record's bookmark, or by logically
+ * removing this record by advancing bt_begin.
+ */
+ if (err != 0) {
+ /* save bookmark for future resume */
+ ASSERT3U(bte.be_zb.zb_objset, ==,
+ ZB_DESTROYED_OBJSET);
+ ASSERT0(bte.be_zb.zb_level);
+ dmu_write(os, obj, i * sizeof (bte),
+ sizeof (bte), &bte, tx);
+ if (err == EIO || err == ECKSUM ||
+ err == ENXIO) {
+ /*
+ * Skip the rest of this tree and
+ * continue on to the next entry.
+ */
+ err = 0;
+ ioerr = B_TRUE;
+ } else {
+ break;
+ }
+ } else if (ioerr) {
+ /*
+ * This entry is finished, but there were
+ * i/o errors on previous entries, so we
+ * can't adjust bt_begin. Set this entry's
+ * be_birth_txg such that it will be
+ * treated as a no-op in future traversals.
+ */
+ bte.be_birth_txg = UINT64_MAX;
+ dmu_write(os, obj, i * sizeof (bte),
+ sizeof (bte), &bte, tx);
+ }
+
+ if (!ioerr) {
+ ba.ba_phys->bt_begin++;
+ (void) dmu_free_range(os, obj,
+ i * sizeof (bte), sizeof (bte), tx);
+ }
+ } else if (err != 0) {
+ break;
+ }
+ }
+
+ ASSERT(!free || err != 0 || ioerr ||
+ ba.ba_phys->bt_begin == ba.ba_phys->bt_end);
+
+ /* if all blocks are free there should be no used space */
+ if (ba.ba_phys->bt_begin == ba.ba_phys->bt_end) {
+ if (zfs_free_leak_on_eio) {
+ ba.ba_phys->bt_bytes = 0;
+ ba.ba_phys->bt_comp = 0;
+ ba.ba_phys->bt_uncomp = 0;
+ }
+
+ ASSERT0(ba.ba_phys->bt_bytes);
+ ASSERT0(ba.ba_phys->bt_comp);
+ ASSERT0(ba.ba_phys->bt_uncomp);
+ }
+
+ dmu_buf_rele(db, FTAG);
+
+ return (err);
+}
diff --git a/sys/contrib/openzfs/module/zfs/bqueue.c b/sys/contrib/openzfs/module/zfs/bqueue.c
new file mode 100644
index 000000000000..22539efc4e23
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/bqueue.c
@@ -0,0 +1,155 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2014, 2018 by Delphix. All rights reserved.
+ */
+
+#include <sys/bqueue.h>
+#include <sys/zfs_context.h>
+
+static inline bqueue_node_t *
+obj2node(bqueue_t *q, void *data)
+{
+ return ((bqueue_node_t *)((char *)data + q->bq_node_offset));
+}
+
+/*
+ * Initialize a blocking queue. The maximum capacity of the queue is set to
+ * size. Types that are stored in a bqueue must contain a bqueue_node_t,
+ * and node_offset must be its offset from the start of the struct.
+ * fill_fraction is a performance tuning value; when the queue is full, any
+ * threads attempting to enqueue records will block. They will block until
+ * they're signaled, which will occur when the queue is at least 1/fill_fraction
+ * empty. Similar behavior occurs on dequeue; if the queue is empty, threads
+ * block. They will be signaled when the queue is 1/fill_fraction full, or
+ * when bqueue_flush is called. As a result, you must call bqueue_flush when
+ * you enqueue your final record on a thread, in case the dequeueing threads are
+ * currently blocked and that enqueue does not cause them to be awoken.
+ * Alternatively, this behavior can be disabled (causing signaling to happen
+ * immediately) by setting fill_fraction to any value larger than size.
+ * Return 0 on success, or -1 on failure.
+ */
+int
+bqueue_init(bqueue_t *q, uint64_t fill_fraction, uint64_t size,
+ size_t node_offset)
+{
+ if (fill_fraction == 0) {
+ return (-1);
+ }
+ list_create(&q->bq_list, node_offset + sizeof (bqueue_node_t),
+ node_offset + offsetof(bqueue_node_t, bqn_node));
+ cv_init(&q->bq_add_cv, NULL, CV_DEFAULT, NULL);
+ cv_init(&q->bq_pop_cv, NULL, CV_DEFAULT, NULL);
+ mutex_init(&q->bq_lock, NULL, MUTEX_DEFAULT, NULL);
+ q->bq_node_offset = node_offset;
+ q->bq_size = 0;
+ q->bq_maxsize = size;
+ q->bq_fill_fraction = fill_fraction;
+ return (0);
+}
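A minimal, hypothetical producer/consumer sketch of the API documented above (not part of the patch; the record type, sizes, and fill fraction are invented). Each queued type embeds a bqueue_node_t, and the producer's final record is pushed with bqueue_enqueue_flush() so a blocked consumer is always woken:

/* Hypothetical record type carried through a bqueue. */
typedef struct my_record {
        bqueue_node_t   mr_node;        /* embedded queue linkage */
        uint64_t        mr_payload;
        boolean_t       mr_eos;         /* end-of-stream marker */
} my_record_t;

static void
bqueue_usage_example(void)
{
        bqueue_t q;
        my_record_t *rec;

        /* Up to 16 MB of queued records; wake consumers at 1/4 full. */
        VERIFY0(bqueue_init(&q, 4, 16 * 1024 * 1024,
            offsetof(my_record_t, mr_node)));

        rec = kmem_zalloc(sizeof (*rec), KM_SLEEP);
        rec->mr_payload = 42;
        bqueue_enqueue(&q, rec, sizeof (*rec));

        rec = kmem_zalloc(sizeof (*rec), KM_SLEEP);
        rec->mr_eos = B_TRUE;
        bqueue_enqueue_flush(&q, rec, sizeof (*rec));   /* final record */

        /* Consumer side: pop until the end-of-stream record appears. */
        for (;;) {
                my_record_t *r = bqueue_dequeue(&q);
                boolean_t done = r->mr_eos;
                kmem_free(r, sizeof (*r));
                if (done)
                        break;
        }
        bqueue_destroy(&q);
}

The ZFS send/receive code uses the same end-of-stream pattern, which is why enqueue-and-flush exists as a single call rather than a separate flush operation.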
+
+/*
+ * Destroy a blocking queue. This function asserts that there are no
+ * elements in the queue, and no one is blocked on the condition
+ * variables.
+ */
+void
+bqueue_destroy(bqueue_t *q)
+{
+ mutex_enter(&q->bq_lock);
+ ASSERT0(q->bq_size);
+ cv_destroy(&q->bq_add_cv);
+ cv_destroy(&q->bq_pop_cv);
+ list_destroy(&q->bq_list);
+ mutex_exit(&q->bq_lock);
+ mutex_destroy(&q->bq_lock);
+}
+
+static void
+bqueue_enqueue_impl(bqueue_t *q, void *data, uint64_t item_size,
+ boolean_t flush)
+{
+ ASSERT3U(item_size, >, 0);
+ ASSERT3U(item_size, <=, q->bq_maxsize);
+ mutex_enter(&q->bq_lock);
+ obj2node(q, data)->bqn_size = item_size;
+ while (q->bq_size + item_size > q->bq_maxsize) {
+ cv_wait_sig(&q->bq_add_cv, &q->bq_lock);
+ }
+ q->bq_size += item_size;
+ list_insert_tail(&q->bq_list, data);
+ if (q->bq_size >= q->bq_maxsize / q->bq_fill_fraction)
+ cv_signal(&q->bq_pop_cv);
+ if (flush)
+ cv_broadcast(&q->bq_pop_cv);
+ mutex_exit(&q->bq_lock);
+}
+
+/*
+ * Add data to q, consuming size units of capacity. If there is insufficient
+ * capacity to consume size units, block until capacity exists. Asserts size is
+ * > 0.
+ */
+void
+bqueue_enqueue(bqueue_t *q, void *data, uint64_t item_size)
+{
+ bqueue_enqueue_impl(q, data, item_size, B_FALSE);
+}
+
+/*
+ * Enqueue an entry, and then flush the queue. This forces the popping threads
+ * to wake up, even if we're below the fill fraction. We have this in a single
+ * function, rather than having a separate call, because it prevents race
+ * conditions between the enqueuing thread and the dequeueing thread, in
+ * which the enqueuing thread wakes up the dequeueing thread, and that thread
+ * then destroys the condvar before the enqueuing thread is done.
+ */
+void
+bqueue_enqueue_flush(bqueue_t *q, void *data, uint64_t item_size)
+{
+ bqueue_enqueue_impl(q, data, item_size, B_TRUE);
+}
+
+/*
+ * Take the first element off of q. If there are no elements on the queue, wait
+ * until one is put there. Return the removed element.
+ */
+void *
+bqueue_dequeue(bqueue_t *q)
+{
+ void *ret = NULL;
+ uint64_t item_size;
+ mutex_enter(&q->bq_lock);
+ while (q->bq_size == 0) {
+ cv_wait_sig(&q->bq_pop_cv, &q->bq_lock);
+ }
+ ret = list_remove_head(&q->bq_list);
+ ASSERT3P(ret, !=, NULL);
+ item_size = obj2node(q, ret)->bqn_size;
+ q->bq_size -= item_size;
+ if (q->bq_size <= q->bq_maxsize - (q->bq_maxsize / q->bq_fill_fraction))
+ cv_signal(&q->bq_add_cv);
+ mutex_exit(&q->bq_lock);
+ return (ret);
+}
+
+/*
+ * Returns true if the space used is 0.
+ */
+boolean_t
+bqueue_empty(bqueue_t *q)
+{
+ return (q->bq_size == 0);
+}
diff --git a/sys/contrib/openzfs/module/zfs/btree.c b/sys/contrib/openzfs/module/zfs/btree.c
new file mode 100644
index 000000000000..57b9dbbb2b50
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/btree.c
@@ -0,0 +1,2124 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2019 by Delphix. All rights reserved.
+ */
+
+#include <sys/btree.h>
+#include <sys/bitops.h>
+#include <sys/zfs_context.h>
+
+kmem_cache_t *zfs_btree_leaf_cache;
+
+/*
+ * Control the extent of the verification that occurs when zfs_btree_verify is
+ * called. Primarily used for debugging when extending the btree logic and
+ * functionality. As the intensity is increased, new verification steps are
+ * added. These steps are cumulative; intensity = 3 includes the intensity = 1
+ * and intensity = 2 steps as well.
+ *
+ * Intensity 1: Verify that the tree's height is consistent throughout.
+ * Intensity 2: Verify that a core node's children's parent pointers point
+ * to the core node.
+ * Intensity 3: Verify that the total number of elements in the tree matches the
+ * sum of the number of elements in each node. Also verifies that each node's
+ * count obeys the invariants (less than or equal to maximum value, greater than
+ * or equal to half the maximum minus one).
+ * Intensity 4: Verify that each element compares less than the element
+ * immediately after it and greater than the one immediately before it using the
+ * comparator function. For core nodes, also checks that each element is greater
+ * than the last element in the first of the two nodes it separates, and less
+ * than the first element in the second of the two nodes.
+ * Intensity 5: Verifies, if ZFS_DEBUG is defined, that all unused memory inside
+ * of each node is poisoned appropriately. Note that poisoning always occurs if
+ * ZFS_DEBUG is set, so it is safe to set the intensity to 5 during normal
+ * operation.
+ *
+ * Intensity 4 and 5 are particularly expensive to perform; the previous levels
+ * are a few memory operations per node, while these levels require multiple
+ * operations per element. In addition, when creating large btrees, these
+ * operations are called at every step, resulting in extremely slow operation
+ * (while the asymptotic complexity of the other steps is the same, the
+ * importance of the constant factors cannot be denied).
+ */
+int zfs_btree_verify_intensity = 0;
+
+/*
+ * A convenience function to silence warnings from memmove's return value and
+ * change argument order to src, dest.
+ */
+static void
+bmov(const void *src, void *dest, size_t size)
+{
+ (void) memmove(dest, src, size);
+}
+
+#ifdef _ILP32
+#define BTREE_POISON 0xabadb10c
+#else
+#define BTREE_POISON 0xabadb10cdeadbeef
+#endif
+
+static void
+zfs_btree_poison_node(zfs_btree_t *tree, zfs_btree_hdr_t *hdr)
+{
+#ifdef ZFS_DEBUG
+ size_t size = tree->bt_elem_size;
+ if (!hdr->bth_core) {
+ zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr;
+ (void) memset(leaf->btl_elems + hdr->bth_count * size, 0x0f,
+ BTREE_LEAF_SIZE - sizeof (zfs_btree_hdr_t) -
+ hdr->bth_count * size);
+ } else {
+ zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
+ for (int i = hdr->bth_count + 1; i <= BTREE_CORE_ELEMS; i++) {
+ node->btc_children[i] =
+ (zfs_btree_hdr_t *)BTREE_POISON;
+ }
+ (void) memset(node->btc_elems + hdr->bth_count * size, 0x0f,
+ (BTREE_CORE_ELEMS - hdr->bth_count) * size);
+ }
+#endif
+}
+
+static inline void
+zfs_btree_poison_node_at(zfs_btree_t *tree, zfs_btree_hdr_t *hdr,
+ uint64_t offset)
+{
+#ifdef ZFS_DEBUG
+ size_t size = tree->bt_elem_size;
+ ASSERT3U(offset, >=, hdr->bth_count);
+ if (!hdr->bth_core) {
+ zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr;
+ (void) memset(leaf->btl_elems + offset * size, 0x0f, size);
+ } else {
+ zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
+ node->btc_children[offset + 1] =
+ (zfs_btree_hdr_t *)BTREE_POISON;
+ (void) memset(node->btc_elems + offset * size, 0x0f, size);
+ }
+#endif
+}
+
+static inline void
+zfs_btree_verify_poison_at(zfs_btree_t *tree, zfs_btree_hdr_t *hdr,
+ uint64_t offset)
+{
+#ifdef ZFS_DEBUG
+ size_t size = tree->bt_elem_size;
+ uint8_t eval = 0x0f;
+ if (hdr->bth_core) {
+ zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
+ zfs_btree_hdr_t *cval = (zfs_btree_hdr_t *)BTREE_POISON;
+ VERIFY3P(node->btc_children[offset + 1], ==, cval);
+ for (int i = 0; i < size; i++)
+ VERIFY3U(node->btc_elems[offset * size + i], ==, eval);
+ } else {
+ zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr;
+ for (int i = 0; i < size; i++)
+ VERIFY3U(leaf->btl_elems[offset * size + i], ==, eval);
+ }
+#endif
+}
+
+void
+zfs_btree_init(void)
+{
+ zfs_btree_leaf_cache = kmem_cache_create("zfs_btree_leaf_cache",
+ BTREE_LEAF_SIZE, 0, NULL, NULL, NULL, NULL,
+ NULL, 0);
+}
+
+void
+zfs_btree_fini(void)
+{
+ kmem_cache_destroy(zfs_btree_leaf_cache);
+}
+
+void
+zfs_btree_create(zfs_btree_t *tree, int (*compar) (const void *, const void *),
+ size_t size)
+{
+ /*
+ * We need a minimum of 4 elements so that when we split a node we
+ * always have at least two elements in each node. This simplifies the
+ * logic in zfs_btree_bulk_finish, since it means the last leaf will
+ * always have a left sibling to share with (unless it's the root).
+ */
+ ASSERT3U(size, <=, (BTREE_LEAF_SIZE - sizeof (zfs_btree_hdr_t)) / 4);
+
+ bzero(tree, sizeof (*tree));
+ tree->bt_compar = compar;
+ tree->bt_elem_size = size;
+ tree->bt_height = -1;
+ tree->bt_bulk = NULL;
+}
+
+/*
+ * Find value in the array of elements provided. Uses a simple binary search.
+ */
+static void *
+zfs_btree_find_in_buf(zfs_btree_t *tree, uint8_t *buf, uint64_t nelems,
+ const void *value, zfs_btree_index_t *where)
+{
+ uint64_t max = nelems;
+ uint64_t min = 0;
+ while (max > min) {
+ uint64_t idx = (min + max) / 2;
+ uint8_t *cur = buf + idx * tree->bt_elem_size;
+ int comp = tree->bt_compar(cur, value);
+ if (comp == -1) {
+ min = idx + 1;
+ } else if (comp == 1) {
+ max = idx;
+ } else {
+ ASSERT0(comp);
+ where->bti_offset = idx;
+ where->bti_before = B_FALSE;
+ return (cur);
+ }
+ }
+
+ where->bti_offset = max;
+ where->bti_before = B_TRUE;
+ return (NULL);
+}
+
+/*
+ * Find the given value in the tree. where may be passed as null to use as a
+ * membership test or if the btree is being used as a map.
+ */
+void *
+zfs_btree_find(zfs_btree_t *tree, const void *value, zfs_btree_index_t *where)
+{
+ if (tree->bt_height == -1) {
+ if (where != NULL) {
+ where->bti_node = NULL;
+ where->bti_offset = 0;
+ }
+ ASSERT0(tree->bt_num_elems);
+ return (NULL);
+ }
+
+ /*
+ * If we're in bulk-insert mode, we check the last spot in the tree
+ * and the last leaf in the tree before doing the normal search,
+ * because for most workloads the vast majority of finds in
+ * bulk-insert mode are to insert new elements.
+ */
+ zfs_btree_index_t idx;
+ if (tree->bt_bulk != NULL) {
+ zfs_btree_leaf_t *last_leaf = tree->bt_bulk;
+ int compar = tree->bt_compar(last_leaf->btl_elems +
+ ((last_leaf->btl_hdr.bth_count - 1) * tree->bt_elem_size),
+ value);
+ if (compar < 0) {
+ /*
+ * If what they're looking for is after the last
+ * element, it's not in the tree.
+ */
+ if (where != NULL) {
+ where->bti_node = (zfs_btree_hdr_t *)last_leaf;
+ where->bti_offset =
+ last_leaf->btl_hdr.bth_count;
+ where->bti_before = B_TRUE;
+ }
+ return (NULL);
+ } else if (compar == 0) {
+ if (where != NULL) {
+ where->bti_node = (zfs_btree_hdr_t *)last_leaf;
+ where->bti_offset =
+ last_leaf->btl_hdr.bth_count - 1;
+ where->bti_before = B_FALSE;
+ }
+ return (last_leaf->btl_elems +
+ ((last_leaf->btl_hdr.bth_count - 1) *
+ tree->bt_elem_size));
+ }
+ if (tree->bt_compar(last_leaf->btl_elems, value) <= 0) {
+ /*
+ * If what they're looking for is after the first
+ * element in the last leaf, it's in the last leaf or
+ * it's not in the tree.
+ */
+ void *d = zfs_btree_find_in_buf(tree,
+ last_leaf->btl_elems, last_leaf->btl_hdr.bth_count,
+ value, &idx);
+
+ if (where != NULL) {
+ idx.bti_node = (zfs_btree_hdr_t *)last_leaf;
+ *where = idx;
+ }
+ return (d);
+ }
+ }
+
+ zfs_btree_core_t *node = NULL;
+ uint64_t child = 0;
+ uint64_t depth = 0;
+
+ /*
+ * Iterate down the tree, finding which child the value should be in
+ * by comparing with the separators.
+ */
+ for (node = (zfs_btree_core_t *)tree->bt_root; depth < tree->bt_height;
+ node = (zfs_btree_core_t *)node->btc_children[child], depth++) {
+ ASSERT3P(node, !=, NULL);
+ void *d = zfs_btree_find_in_buf(tree, node->btc_elems,
+ node->btc_hdr.bth_count, value, &idx);
+ EQUIV(d != NULL, !idx.bti_before);
+ if (d != NULL) {
+ if (where != NULL) {
+ idx.bti_node = (zfs_btree_hdr_t *)node;
+ *where = idx;
+ }
+ return (d);
+ }
+ ASSERT(idx.bti_before);
+ child = idx.bti_offset;
+ }
+
+ /*
+ * The value is in this leaf, or it would be if it were in the
+ * tree. Find its proper location and return it.
+ */
+ zfs_btree_leaf_t *leaf = (depth == 0 ?
+ (zfs_btree_leaf_t *)tree->bt_root : (zfs_btree_leaf_t *)node);
+ void *d = zfs_btree_find_in_buf(tree, leaf->btl_elems,
+ leaf->btl_hdr.bth_count, value, &idx);
+
+ if (where != NULL) {
+ idx.bti_node = (zfs_btree_hdr_t *)leaf;
+ *where = idx;
+ }
+
+ return (d);
+}
+
+/*
+ * To explain the following functions, it is useful to understand the four
+ * kinds of shifts used in btree operation. First, a shift is a movement of
+ * elements within a node. It is used to create gaps for inserting new
+ * elements and children, or cover gaps created when things are removed. A
+ * shift has two fundamental properties, each of which can be one of two
+ * values, making four types of shifts. There is the direction of the shift
+ * (left or right) and the shape of the shift (parallelogram or isosceles
+ * trapezoid (shortened to trapezoid hereafter)). The shape distinction only
+ * applies to shifts of core nodes.
+ *
+ * The names derive from the following imagining of the layout of a node:
+ *
+ * Elements: * * * * * * * ... * * *
+ * Children: * * * * * * * * ... * * *
+ *
+ * This layout follows from the fact that the elements act as separators
+ * between pairs of children, and that children root subtrees "below" the
+ * current node. A left and right shift are fairly self-explanatory; a left
+ * shift moves things to the left, while a right shift moves things to the
+ * right. A parallelogram shift is a shift with the same number of elements
+ * and children being moved, while a trapezoid shift is a shift that moves one
+ * more child than elements. An example follows:
+ *
+ * A parallelogram shift could contain the following:
+ * _______________
+ * \* * * * \ * * * ... * * *
+ * * \ * * * *\ * * * ... * * *
+ * ---------------
+ * A trapezoid shift could contain the following:
+ * ___________
+ * * / * * * \ * * * ... * * *
+ * * / * * * *\ * * * ... * * *
+ * ---------------
+ *
+ * Note that a parallelogram shift is always shaped like a "left-leaning"
+ * parallelogram, where the starting index of the children being moved is
+ * always one higher than the starting index of the elements being moved. No
+ * "right-leaning" parallelogram shifts are needed (shifts where the starting
+ * element index and starting child index being moved are the same) to achieve
+ * any btree operations, so we ignore them.
+ */
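+
+/*
+ * A worked example with hypothetical indices, in terms of bt_shift_core()
+ * below: for a left shift with idx = 5, count = 3, and off = 1, elements
+ * 5..7 move to slots 4..6 under either shape. A trapezoid shift also moves
+ * children 5..8 to slots 4..7 (count + 1 children, starting at idx), while a
+ * parallelogram shift moves children 6..8 to slots 5..7 (count children,
+ * starting one past idx).
+ */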
+
+enum bt_shift_shape {
+ BSS_TRAPEZOID,
+ BSS_PARALLELOGRAM
+};
+
+enum bt_shift_direction {
+ BSD_LEFT,
+ BSD_RIGHT
+};
+
+/*
+ * Shift elements and children in the provided core node by off spots. The
+ * first element moved is idx, and count elements are moved. The shape of the
+ * shift is determined by shape. The direction is determined by dir.
+ */
+static inline void
+bt_shift_core(zfs_btree_t *tree, zfs_btree_core_t *node, uint64_t idx,
+ uint64_t count, uint64_t off, enum bt_shift_shape shape,
+ enum bt_shift_direction dir)
+{
+ size_t size = tree->bt_elem_size;
+ ASSERT(node->btc_hdr.bth_core);
+
+ uint8_t *e_start = node->btc_elems + idx * size;
+ int sign = (dir == BSD_LEFT ? -1 : +1);
+ uint8_t *e_out = e_start + sign * off * size;
+ uint64_t e_count = count;
+ bmov(e_start, e_out, e_count * size);
+
+ zfs_btree_hdr_t **c_start = node->btc_children + idx +
+ (shape == BSS_TRAPEZOID ? 0 : 1);
+ zfs_btree_hdr_t **c_out = (dir == BSD_LEFT ? c_start - off :
+ c_start + off);
+ uint64_t c_count = count + (shape == BSS_TRAPEZOID ? 1 : 0);
+ bmov(c_start, c_out, c_count * sizeof (*c_start));
+}
+
+/*
+ * Shift elements and children in the provided core node left by one spot.
+ * The first element moved is idx, and count elements are moved. The shape of
+ * the shift is determined by shape.
+ */
+static inline void
+bt_shift_core_left(zfs_btree_t *tree, zfs_btree_core_t *node, uint64_t idx,
+ uint64_t count, enum bt_shift_shape shape)
+{
+ bt_shift_core(tree, node, idx, count, 1, shape, BSD_LEFT);
+}
+
+/*
+ * Shift elements and children in the provided core node right by one spot.
+ * The first element moved is idx, and count elements are moved. The shape of
+ * the shift is determined by shape.
+ */
+static inline void
+bt_shift_core_right(zfs_btree_t *tree, zfs_btree_core_t *node, uint64_t idx,
+ uint64_t count, enum bt_shift_shape shape)
+{
+ bt_shift_core(tree, node, idx, count, 1, shape, BSD_RIGHT);
+}
+
+/*
+ * Shift elements in the provided leaf node by off spots. The first element
+ * moved is idx, and count elements are moved. The direction is determined
+ * by dir.
+ */
+static inline void
+bt_shift_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *node, uint64_t idx,
+ uint64_t count, uint64_t off, enum bt_shift_direction dir)
+{
+ size_t size = tree->bt_elem_size;
+ ASSERT(!node->btl_hdr.bth_core);
+
+ uint8_t *start = node->btl_elems + idx * size;
+ int sign = (dir == BSD_LEFT ? -1 : +1);
+ uint8_t *out = start + sign * off * size;
+ bmov(start, out, count * size);
+}
+
+static inline void
+bt_shift_leaf_right(zfs_btree_t *tree, zfs_btree_leaf_t *leaf, uint64_t idx,
+ uint64_t count)
+{
+ bt_shift_leaf(tree, leaf, idx, count, 1, BSD_RIGHT);
+}
+
+static inline void
+bt_shift_leaf_left(zfs_btree_t *tree, zfs_btree_leaf_t *leaf, uint64_t idx,
+ uint64_t count)
+{
+ bt_shift_leaf(tree, leaf, idx, count, 1, BSD_LEFT);
+}
+
+/*
+ * Move children and elements from one core node to another. The shape
+ * parameter behaves the same as it does in the shift logic.
+ */
+static inline void
+bt_transfer_core(zfs_btree_t *tree, zfs_btree_core_t *source, uint64_t sidx,
+ uint64_t count, zfs_btree_core_t *dest, uint64_t didx,
+ enum bt_shift_shape shape)
+{
+ size_t size = tree->bt_elem_size;
+ ASSERT(source->btc_hdr.bth_core);
+ ASSERT(dest->btc_hdr.bth_core);
+
+ bmov(source->btc_elems + sidx * size, dest->btc_elems + didx * size,
+ count * size);
+
+ uint64_t c_count = count + (shape == BSS_TRAPEZOID ? 1 : 0);
+ bmov(source->btc_children + sidx + (shape == BSS_TRAPEZOID ? 0 : 1),
+ dest->btc_children + didx + (shape == BSS_TRAPEZOID ? 0 : 1),
+ c_count * sizeof (*source->btc_children));
+}
+
+static inline void
+bt_transfer_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *source, uint64_t sidx,
+ uint64_t count, zfs_btree_leaf_t *dest, uint64_t didx)
+{
+ size_t size = tree->bt_elem_size;
+ ASSERT(!source->btl_hdr.bth_core);
+ ASSERT(!dest->btl_hdr.bth_core);
+
+ bmov(source->btl_elems + sidx * size, dest->btl_elems + didx * size,
+ count * size);
+}
+
+/*
+ * Find the first element in the subtree rooted at hdr, return its value and
+ * put its location in where if non-null.
+ */
+static void *
+zfs_btree_first_helper(zfs_btree_hdr_t *hdr, zfs_btree_index_t *where)
+{
+ zfs_btree_hdr_t *node;
+
+ for (node = hdr; node->bth_core; node =
+ ((zfs_btree_core_t *)node)->btc_children[0])
+ ;
+
+ ASSERT(!node->bth_core);
+ zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)node;
+ if (where != NULL) {
+ where->bti_node = node;
+ where->bti_offset = 0;
+ where->bti_before = B_FALSE;
+ }
+ return (&leaf->btl_elems[0]);
+}
+
+/* Insert an element and a child into a core node at the given offset. */
+static void
+zfs_btree_insert_core_impl(zfs_btree_t *tree, zfs_btree_core_t *parent,
+ uint64_t offset, zfs_btree_hdr_t *new_node, void *buf)
+{
+ uint64_t size = tree->bt_elem_size;
+ zfs_btree_hdr_t *par_hdr = &parent->btc_hdr;
+ ASSERT3P(par_hdr, ==, new_node->bth_parent);
+ ASSERT3U(par_hdr->bth_count, <, BTREE_CORE_ELEMS);
+
+ if (zfs_btree_verify_intensity >= 5) {
+ zfs_btree_verify_poison_at(tree, par_hdr,
+ par_hdr->bth_count);
+ }
+ /* Shift existing elements and children */
+ uint64_t count = par_hdr->bth_count - offset;
+ bt_shift_core_right(tree, parent, offset, count,
+ BSS_PARALLELOGRAM);
+
+ /* Insert new values */
+ parent->btc_children[offset + 1] = new_node;
+ bmov(buf, parent->btc_elems + offset * size, size);
+ par_hdr->bth_count++;
+}
+
+/*
+ * Insert new_node into the parent of old_node directly after old_node, with
+ * buf as the dividing element between the two.
+ */
+static void
+zfs_btree_insert_into_parent(zfs_btree_t *tree, zfs_btree_hdr_t *old_node,
+ zfs_btree_hdr_t *new_node, void *buf)
+{
+ ASSERT3P(old_node->bth_parent, ==, new_node->bth_parent);
+ uint64_t size = tree->bt_elem_size;
+ zfs_btree_core_t *parent = old_node->bth_parent;
+ zfs_btree_hdr_t *par_hdr = &parent->btc_hdr;
+
+ /*
+ * If this is the root node we were splitting, we create a new root
+ * and increase the height of the tree.
+ */
+ if (parent == NULL) {
+ ASSERT3P(old_node, ==, tree->bt_root);
+ tree->bt_num_nodes++;
+ zfs_btree_core_t *new_root =
+ kmem_alloc(sizeof (zfs_btree_core_t) + BTREE_CORE_ELEMS *
+ size, KM_SLEEP);
+ zfs_btree_hdr_t *new_root_hdr = &new_root->btc_hdr;
+ new_root_hdr->bth_parent = NULL;
+ new_root_hdr->bth_core = B_TRUE;
+ new_root_hdr->bth_count = 1;
+
+ old_node->bth_parent = new_node->bth_parent = new_root;
+ new_root->btc_children[0] = old_node;
+ new_root->btc_children[1] = new_node;
+ bmov(buf, new_root->btc_elems, size);
+
+ tree->bt_height++;
+ tree->bt_root = new_root_hdr;
+ zfs_btree_poison_node(tree, new_root_hdr);
+ return;
+ }
+
+ /*
+ * Since we have the new separator, binary search for where to put
+ * new_node.
+ */
+ zfs_btree_index_t idx;
+ ASSERT(par_hdr->bth_core);
+ VERIFY3P(zfs_btree_find_in_buf(tree, parent->btc_elems,
+ par_hdr->bth_count, buf, &idx), ==, NULL);
+ ASSERT(idx.bti_before);
+ uint64_t offset = idx.bti_offset;
+ ASSERT3U(offset, <=, par_hdr->bth_count);
+ ASSERT3P(parent->btc_children[offset], ==, old_node);
+
+ /*
+ * If the parent isn't full, shift things to accommodate our insertions
+ * and return.
+ */
+ if (par_hdr->bth_count != BTREE_CORE_ELEMS) {
+ zfs_btree_insert_core_impl(tree, parent, offset, new_node, buf);
+ return;
+ }
+
+ /*
+ * We need to split this core node into two. Currently there are
+ * BTREE_CORE_ELEMS + 1 child nodes, and we are adding one for
+ * BTREE_CORE_ELEMS + 2. Some of the children will be part of the
+ * current node, and the others will be moved to the new core node.
+ * There are BTREE_CORE_ELEMS + 1 elements including the new one. One
+ * will be used as the new separator in our parent, and the others
+ * will be split among the two core nodes.
+ *
+ * Usually we will split the node in half evenly, with
+ * BTREE_CORE_ELEMS/2 elements in each node. If we're bulk loading, we
+ * instead move only about a quarter of the elements (and children) to
+ * the new node. Since the average state after a long time is a 3/4
+ * full node, shortcutting directly to that state improves efficiency.
+ *
+ * We do this in two stages: first we split into two nodes, and then we
+ * reuse our existing logic to insert the new element and child.
+ */
+ uint64_t move_count = MAX((BTREE_CORE_ELEMS / (tree->bt_bulk == NULL ?
+ 2 : 4)) - 1, 2);
+ uint64_t keep_count = BTREE_CORE_ELEMS - move_count - 1;
+ ASSERT3U(BTREE_CORE_ELEMS - move_count, >=, 2);
+ tree->bt_num_nodes++;
+ zfs_btree_core_t *new_parent = kmem_alloc(sizeof (zfs_btree_core_t) +
+ BTREE_CORE_ELEMS * size, KM_SLEEP);
+ zfs_btree_hdr_t *new_par_hdr = &new_parent->btc_hdr;
+ new_par_hdr->bth_parent = par_hdr->bth_parent;
+ new_par_hdr->bth_core = B_TRUE;
+ new_par_hdr->bth_count = move_count;
+ zfs_btree_poison_node(tree, new_par_hdr);
+
+ par_hdr->bth_count = keep_count;
+
+ bt_transfer_core(tree, parent, keep_count + 1, move_count, new_parent,
+ 0, BSS_TRAPEZOID);
+
+ /* Store the new separator in a buffer. */
+ uint8_t *tmp_buf = kmem_alloc(size, KM_SLEEP);
+ bmov(parent->btc_elems + keep_count * size, tmp_buf,
+ size);
+ zfs_btree_poison_node(tree, par_hdr);
+
+ if (offset < keep_count) {
+ /* Insert the new node into the left half */
+ zfs_btree_insert_core_impl(tree, parent, offset, new_node,
+ buf);
+
+ /*
+ * Move the new separator to the existing buffer.
+ */
+ bmov(tmp_buf, buf, size);
+ } else if (offset > keep_count) {
+ /* Insert the new node into the right half */
+ new_node->bth_parent = new_parent;
+ zfs_btree_insert_core_impl(tree, new_parent,
+ offset - keep_count - 1, new_node, buf);
+
+ /*
+ * Move the new separator to the existing buffer.
+ */
+ bmov(tmp_buf, buf, size);
+ } else {
+ /*
+ * Move the new separator into the right half, and replace it
+ * with buf. We also need to shift back the elements in the
+ * right half to accommodate new_node.
+ */
+ bt_shift_core_right(tree, new_parent, 0, move_count,
+ BSS_TRAPEZOID);
+ new_parent->btc_children[0] = new_node;
+ bmov(tmp_buf, new_parent->btc_elems, size);
+ new_par_hdr->bth_count++;
+ }
+ kmem_free(tmp_buf, size);
+ zfs_btree_poison_node(tree, par_hdr);
+
+ for (int i = 0; i <= new_parent->btc_hdr.bth_count; i++)
+ new_parent->btc_children[i]->bth_parent = new_parent;
+
+ for (int i = 0; i <= parent->btc_hdr.bth_count; i++)
+ ASSERT3P(parent->btc_children[i]->bth_parent, ==, parent);
+
+ /*
+ * Now that the node is split, we need to insert the new node into its
+ * parent. This may cause further splitting.
+ */
+ zfs_btree_insert_into_parent(tree, &parent->btc_hdr,
+ &new_parent->btc_hdr, buf);
+}
+
+/* Insert an element into a leaf node at the given offset. */
+static void
+zfs_btree_insert_leaf_impl(zfs_btree_t *tree, zfs_btree_leaf_t *leaf,
+ uint64_t idx, const void *value)
+{
+ uint64_t size = tree->bt_elem_size;
+ uint8_t *start = leaf->btl_elems + (idx * size);
+ zfs_btree_hdr_t *hdr = &leaf->btl_hdr;
+ uint64_t capacity __maybe_unused = P2ALIGN((BTREE_LEAF_SIZE -
+ sizeof (zfs_btree_hdr_t)) / size, 2);
+ uint64_t count = leaf->btl_hdr.bth_count - idx;
+ ASSERT3U(leaf->btl_hdr.bth_count, <, capacity);
+
+ if (zfs_btree_verify_intensity >= 5) {
+ zfs_btree_verify_poison_at(tree, &leaf->btl_hdr,
+ leaf->btl_hdr.bth_count);
+ }
+
+ bt_shift_leaf_right(tree, leaf, idx, count);
+ bmov(value, start, size);
+ hdr->bth_count++;
+}
+
+/* Helper function for inserting a new value into leaf at the given index. */
+static void
+zfs_btree_insert_into_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *leaf,
+ const void *value, uint64_t idx)
+{
+ uint64_t size = tree->bt_elem_size;
+ uint64_t capacity = P2ALIGN((BTREE_LEAF_SIZE -
+ sizeof (zfs_btree_hdr_t)) / size, 2);
+
+ /*
+ * If the leaf isn't full, shift the elements after idx and insert
+ * value.
+ */
+ if (leaf->btl_hdr.bth_count != capacity) {
+ zfs_btree_insert_leaf_impl(tree, leaf, idx, value);
+ return;
+ }
+
+ /*
+ * Otherwise, we split the leaf node into two nodes. If we're not bulk
+ * inserting, each is of size (capacity / 2). If we are bulk
+ * inserting, we move a quarter of the elements to the new node so
+ * inserts into the old node don't cause immediate splitting but the
+ * tree stays relatively dense. Since the average state after a long
+ * time is a 3/4 full node, shortcutting directly to that state
+ * improves efficiency. At the end of the bulk insertion process
+ * we'll need to go through and fix up any nodes (the last leaf and
+ * its ancestors, potentially) that are below the minimum.
+ *
+ * In either case, we're left with one extra element. The leftover
+ * element will become the new dividing element between the two nodes.
+ */
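+	/*
+	 * As a worked example with a hypothetical capacity of 100: a normal
+	 * split below uses move_count = 100/2 - 1 = 49 and keep_count = 50,
+	 * so both halves end up about half full. A bulk-insert split uses
+	 * move_count = 100/4 - 1 = 24 and keep_count = 75, leaving this leaf
+	 * about 3/4 full and the new last leaf (which becomes bt_bulk) about
+	 * 1/4 full.
+	 */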
+ uint64_t move_count = MAX(capacity / (tree->bt_bulk == NULL ? 2 : 4) -
+ 1, 2);
+ uint64_t keep_count = capacity - move_count - 1;
+ ASSERT3U(capacity - move_count, >=, 2);
+ tree->bt_num_nodes++;
+ zfs_btree_leaf_t *new_leaf = kmem_cache_alloc(zfs_btree_leaf_cache,
+ KM_SLEEP);
+ zfs_btree_hdr_t *new_hdr = &new_leaf->btl_hdr;
+ new_hdr->bth_parent = leaf->btl_hdr.bth_parent;
+ new_hdr->bth_core = B_FALSE;
+ new_hdr->bth_count = move_count;
+ zfs_btree_poison_node(tree, new_hdr);
+
+ leaf->btl_hdr.bth_count = keep_count;
+
+ if (tree->bt_bulk != NULL && leaf == tree->bt_bulk)
+ tree->bt_bulk = new_leaf;
+
+ /* Copy the back part to the new leaf. */
+ bt_transfer_leaf(tree, leaf, keep_count + 1, move_count, new_leaf,
+ 0);
+
+ /* We store the new separator in a buffer we control for simplicity. */
+ uint8_t *buf = kmem_alloc(size, KM_SLEEP);
+ bmov(leaf->btl_elems + (keep_count * size), buf, size);
+ zfs_btree_poison_node(tree, &leaf->btl_hdr);
+
+ if (idx < keep_count) {
+ /* Insert into the existing leaf. */
+ zfs_btree_insert_leaf_impl(tree, leaf, idx, value);
+ } else if (idx > keep_count) {
+ /* Insert into the new leaf. */
+ zfs_btree_insert_leaf_impl(tree, new_leaf, idx - keep_count -
+ 1, value);
+ } else {
+ /*
+ * Shift the elements in the new leaf to make room for the
+ * separator, and use the new value as the new separator.
+ */
+ bt_shift_leaf_right(tree, new_leaf, 0, move_count);
+ bmov(buf, new_leaf->btl_elems, size);
+ bmov(value, buf, size);
+ new_hdr->bth_count++;
+ }
+
+ /*
+ * Now that the node is split, we need to insert the new node into its
+	 * parent. This may cause further splitting, but only of core nodes.
+ */
+ zfs_btree_insert_into_parent(tree, &leaf->btl_hdr, &new_leaf->btl_hdr,
+ buf);
+ kmem_free(buf, size);
+}
+
+static uint64_t
+zfs_btree_find_parent_idx(zfs_btree_t *tree, zfs_btree_hdr_t *hdr)
+{
+ void *buf;
+ if (hdr->bth_core) {
+ buf = ((zfs_btree_core_t *)hdr)->btc_elems;
+ } else {
+ buf = ((zfs_btree_leaf_t *)hdr)->btl_elems;
+ }
+ zfs_btree_index_t idx;
+ zfs_btree_core_t *parent = hdr->bth_parent;
+ VERIFY3P(zfs_btree_find_in_buf(tree, parent->btc_elems,
+ parent->btc_hdr.bth_count, buf, &idx), ==, NULL);
+ ASSERT(idx.bti_before);
+ ASSERT3U(idx.bti_offset, <=, parent->btc_hdr.bth_count);
+ ASSERT3P(parent->btc_children[idx.bti_offset], ==, hdr);
+ return (idx.bti_offset);
+}
+
+/*
+ * Take the b-tree out of bulk insert mode. During bulk-insert mode, some
+ * nodes may violate the invariant that non-root nodes must be at least half
+ * full. Any node violating this invariant will be the last node at its
+ * level. To restore the invariant, we take values from its left neighbor
+ * until it is half full. Such a node must have a left neighbor at its level,
+ * because the last node at a level is never also the first node unless it is
+ * the root.
+ */
+static void
+zfs_btree_bulk_finish(zfs_btree_t *tree)
+{
+ ASSERT3P(tree->bt_bulk, !=, NULL);
+ ASSERT3P(tree->bt_root, !=, NULL);
+ zfs_btree_leaf_t *leaf = tree->bt_bulk;
+ zfs_btree_hdr_t *hdr = &leaf->btl_hdr;
+ zfs_btree_core_t *parent = hdr->bth_parent;
+ uint64_t size = tree->bt_elem_size;
+ uint64_t capacity = P2ALIGN((BTREE_LEAF_SIZE -
+ sizeof (zfs_btree_hdr_t)) / size, 2);
+
+ /*
+ * The invariant doesn't apply to the root node, if that's the only
+ * node in the tree we're done.
+ */
+ if (parent == NULL) {
+ tree->bt_bulk = NULL;
+ return;
+ }
+
+ /* First, take elements to rebalance the leaf node. */
+ if (hdr->bth_count < capacity / 2) {
+ /*
+ * First, find the left neighbor. The simplest way to do this
+ * is to call zfs_btree_prev twice; the first time finds some
+ * ancestor of this node, and the second time finds the left
+ * neighbor. The ancestor found is the lowest common ancestor
+ * of leaf and the neighbor.
+ */
+ zfs_btree_index_t idx = {
+ .bti_node = hdr,
+ .bti_offset = 0
+ };
+ VERIFY3P(zfs_btree_prev(tree, &idx, &idx), !=, NULL);
+ ASSERT(idx.bti_node->bth_core);
+ zfs_btree_core_t *common = (zfs_btree_core_t *)idx.bti_node;
+ uint64_t common_idx = idx.bti_offset;
+
+ VERIFY3P(zfs_btree_prev(tree, &idx, &idx), !=, NULL);
+ ASSERT(!idx.bti_node->bth_core);
+ zfs_btree_leaf_t *l_neighbor = (zfs_btree_leaf_t *)idx.bti_node;
+ zfs_btree_hdr_t *l_hdr = idx.bti_node;
+ uint64_t move_count = (capacity / 2) - hdr->bth_count;
+ ASSERT3U(l_neighbor->btl_hdr.bth_count - move_count, >=,
+ capacity / 2);
+
+ if (zfs_btree_verify_intensity >= 5) {
+ for (int i = 0; i < move_count; i++) {
+ zfs_btree_verify_poison_at(tree, hdr,
+ leaf->btl_hdr.bth_count + i);
+ }
+ }
+
+ /* First, shift elements in leaf back. */
+ bt_shift_leaf(tree, leaf, 0, hdr->bth_count, move_count,
+ BSD_RIGHT);
+
+ /* Next, move the separator from the common ancestor to leaf. */
+ uint8_t *separator = common->btc_elems + (common_idx * size);
+ uint8_t *out = leaf->btl_elems + ((move_count - 1) * size);
+ bmov(separator, out, size);
+ move_count--;
+
+ /*
+ * Now we move elements from the tail of the left neighbor to
+ * fill the remaining spots in leaf.
+ */
+ bt_transfer_leaf(tree, l_neighbor, l_hdr->bth_count -
+ move_count, move_count, leaf, 0);
+
+ /*
+ * Finally, move the new last element in the left neighbor to
+ * the separator.
+ */
+ bmov(l_neighbor->btl_elems + (l_hdr->bth_count -
+ move_count - 1) * size, separator, size);
+
+ /* Adjust the node's counts, and we're done. */
+ l_hdr->bth_count -= move_count + 1;
+ hdr->bth_count += move_count + 1;
+
+ ASSERT3U(l_hdr->bth_count, >=, capacity / 2);
+ ASSERT3U(hdr->bth_count, >=, capacity / 2);
+ zfs_btree_poison_node(tree, l_hdr);
+ }
+
+ /*
+ * Now we have to rebalance any ancestors of leaf that may also
+ * violate the invariant.
+ */
+ capacity = BTREE_CORE_ELEMS;
+ while (parent->btc_hdr.bth_parent != NULL) {
+ zfs_btree_core_t *cur = parent;
+ zfs_btree_hdr_t *hdr = &cur->btc_hdr;
+ parent = hdr->bth_parent;
+ /*
+ * If the invariant isn't violated, move on to the next
+ * ancestor.
+ */
+ if (hdr->bth_count >= capacity / 2)
+ continue;
+
+ /*
+ * Because the smallest number of nodes we can move when
+ * splitting is 2, we never need to worry about not having a
+ * left sibling (a sibling is a neighbor with the same parent).
+ */
+ uint64_t parent_idx = zfs_btree_find_parent_idx(tree, hdr);
+ ASSERT3U(parent_idx, >, 0);
+ zfs_btree_core_t *l_neighbor =
+ (zfs_btree_core_t *)parent->btc_children[parent_idx - 1];
+ uint64_t move_count = (capacity / 2) - hdr->bth_count;
+ ASSERT3U(l_neighbor->btc_hdr.bth_count - move_count, >=,
+ capacity / 2);
+
+ if (zfs_btree_verify_intensity >= 5) {
+ for (int i = 0; i < move_count; i++) {
+ zfs_btree_verify_poison_at(tree, hdr,
+ hdr->bth_count + i);
+ }
+ }
+ /* First, shift things in the right node back. */
+ bt_shift_core(tree, cur, 0, hdr->bth_count, move_count,
+ BSS_TRAPEZOID, BSD_RIGHT);
+
+ /* Next, move the separator to the right node. */
+ uint8_t *separator = parent->btc_elems + ((parent_idx - 1) *
+ size);
+ uint8_t *e_out = cur->btc_elems + ((move_count - 1) * size);
+ bmov(separator, e_out, size);
+
+ /*
+ * Now, move elements and children from the left node to the
+ * right. We move one more child than elements.
+ */
+ move_count--;
+ uint64_t move_idx = l_neighbor->btc_hdr.bth_count - move_count;
+ bt_transfer_core(tree, l_neighbor, move_idx, move_count, cur, 0,
+ BSS_TRAPEZOID);
+
+ /*
+ * Finally, move the last element in the left node to the
+ * separator's position.
+ */
+ move_idx--;
+ bmov(l_neighbor->btc_elems + move_idx * size, separator, size);
+
+ l_neighbor->btc_hdr.bth_count -= move_count + 1;
+ hdr->bth_count += move_count + 1;
+
+ ASSERT3U(l_neighbor->btc_hdr.bth_count, >=, capacity / 2);
+ ASSERT3U(hdr->bth_count, >=, capacity / 2);
+
+ zfs_btree_poison_node(tree, &l_neighbor->btc_hdr);
+
+ for (int i = 0; i <= hdr->bth_count; i++)
+ cur->btc_children[i]->bth_parent = cur;
+ }
+
+ tree->bt_bulk = NULL;
+}
+
+/*
+ * Insert value into tree at the location specified by where.
+ */
+void
+zfs_btree_add_idx(zfs_btree_t *tree, const void *value,
+ const zfs_btree_index_t *where)
+{
+ zfs_btree_index_t idx = {0};
+
+ /* If we're not inserting in the last leaf, end bulk insert mode. */
+ if (tree->bt_bulk != NULL) {
+ if (where->bti_node != &tree->bt_bulk->btl_hdr) {
+ zfs_btree_bulk_finish(tree);
+ VERIFY3P(zfs_btree_find(tree, value, &idx), ==, NULL);
+ where = &idx;
+ }
+ }
+
+ tree->bt_num_elems++;
+ /*
+ * If this is the first element in the tree, create a leaf root node
+ * and add the value to it.
+ */
+ if (where->bti_node == NULL) {
+ ASSERT3U(tree->bt_num_elems, ==, 1);
+ ASSERT3S(tree->bt_height, ==, -1);
+ ASSERT3P(tree->bt_root, ==, NULL);
+ ASSERT0(where->bti_offset);
+
+ tree->bt_num_nodes++;
+ zfs_btree_leaf_t *leaf = kmem_cache_alloc(zfs_btree_leaf_cache,
+ KM_SLEEP);
+ tree->bt_root = &leaf->btl_hdr;
+ tree->bt_height++;
+
+ zfs_btree_hdr_t *hdr = &leaf->btl_hdr;
+ hdr->bth_parent = NULL;
+ hdr->bth_core = B_FALSE;
+ hdr->bth_count = 0;
+ zfs_btree_poison_node(tree, hdr);
+
+ zfs_btree_insert_into_leaf(tree, leaf, value, 0);
+ tree->bt_bulk = leaf;
+ } else if (!where->bti_node->bth_core) {
+ /*
+ * If we're inserting into a leaf, go directly to the helper
+ * function.
+ */
+ zfs_btree_insert_into_leaf(tree,
+ (zfs_btree_leaf_t *)where->bti_node, value,
+ where->bti_offset);
+ } else {
+ /*
+ * If we're inserting into a core node, we can't just shift
+ * the existing element in that slot in the same node without
+ * breaking our ordering invariants. Instead we place the new
+ * value in the node at that spot and then insert the old
+ * separator into the first slot in the subtree to the right.
+ */
+ ASSERT(where->bti_node->bth_core);
+ zfs_btree_core_t *node = (zfs_btree_core_t *)where->bti_node;
+
+ /*
+ * We can ignore bti_before, because either way the value
+ * should end up in bti_offset.
+ */
+ uint64_t off = where->bti_offset;
+ zfs_btree_hdr_t *subtree = node->btc_children[off + 1];
+ size_t size = tree->bt_elem_size;
+ uint8_t *buf = kmem_alloc(size, KM_SLEEP);
+ bmov(node->btc_elems + off * size, buf, size);
+ bmov(value, node->btc_elems + off * size, size);
+
+ /*
+ * Find the first slot in the subtree to the right, insert
+ * there.
+ */
+ zfs_btree_index_t new_idx;
+ VERIFY3P(zfs_btree_first_helper(subtree, &new_idx), !=, NULL);
+ ASSERT0(new_idx.bti_offset);
+ ASSERT(!new_idx.bti_node->bth_core);
+ zfs_btree_insert_into_leaf(tree,
+ (zfs_btree_leaf_t *)new_idx.bti_node, buf, 0);
+ kmem_free(buf, size);
+ }
+ zfs_btree_verify(tree);
+}
+
+/*
+ * Return the first element in the tree, and put its location in where if
+ * non-null.
+ */
+void *
+zfs_btree_first(zfs_btree_t *tree, zfs_btree_index_t *where)
+{
+ if (tree->bt_height == -1) {
+ ASSERT0(tree->bt_num_elems);
+ return (NULL);
+ }
+ return (zfs_btree_first_helper(tree->bt_root, where));
+}
+
+/*
+ * Find the last element in the subtree rooted at hdr, return its value and
+ * put its location in where if non-null.
+ */
+static void *
+zfs_btree_last_helper(zfs_btree_t *btree, zfs_btree_hdr_t *hdr,
+ zfs_btree_index_t *where)
+{
+ zfs_btree_hdr_t *node;
+
+ for (node = hdr; node->bth_core; node =
+ ((zfs_btree_core_t *)node)->btc_children[node->bth_count])
+ ;
+
+ zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)node;
+ if (where != NULL) {
+ where->bti_node = node;
+ where->bti_offset = node->bth_count - 1;
+ where->bti_before = B_FALSE;
+ }
+ return (leaf->btl_elems + (node->bth_count - 1) * btree->bt_elem_size);
+}
+
+/*
+ * Return the last element in the tree, and put its location in where if
+ * non-null.
+ */
+void *
+zfs_btree_last(zfs_btree_t *tree, zfs_btree_index_t *where)
+{
+ if (tree->bt_height == -1) {
+ ASSERT0(tree->bt_num_elems);
+ return (NULL);
+ }
+ return (zfs_btree_last_helper(tree, tree->bt_root, where));
+}
+
+/*
+ * This function contains the logic to find the next element in the tree. A
+ * helper function is used because there are multiple internal consumers of
+ * this logic. The done_func is used by zfs_btree_destroy_nodes to clean up each
+ * node after we've finished with it.
+ */
+static void *
+zfs_btree_next_helper(zfs_btree_t *tree, const zfs_btree_index_t *idx,
+ zfs_btree_index_t *out_idx,
+ void (*done_func)(zfs_btree_t *, zfs_btree_hdr_t *))
+{
+ if (idx->bti_node == NULL) {
+ ASSERT3S(tree->bt_height, ==, -1);
+ return (NULL);
+ }
+
+ uint64_t offset = idx->bti_offset;
+ if (!idx->bti_node->bth_core) {
+ /*
+ * When finding the next element of an element in a leaf,
+		 * there are two cases. If the element isn't the last one in
+		 * the leaf, we just return the next element in the leaf.
+		 * Otherwise, we need to traverse up our parents
+ * until we find one where our ancestor isn't the last child
+ * of its parent. Once we do, the next element is the
+ * separator after our ancestor in its parent.
+ */
+ zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)idx->bti_node;
+ uint64_t new_off = offset + (idx->bti_before ? 0 : 1);
+ if (leaf->btl_hdr.bth_count > new_off) {
+ out_idx->bti_node = &leaf->btl_hdr;
+ out_idx->bti_offset = new_off;
+ out_idx->bti_before = B_FALSE;
+ return (leaf->btl_elems + new_off * tree->bt_elem_size);
+ }
+
+ zfs_btree_hdr_t *prev = &leaf->btl_hdr;
+ for (zfs_btree_core_t *node = leaf->btl_hdr.bth_parent;
+ node != NULL; node = node->btc_hdr.bth_parent) {
+ zfs_btree_hdr_t *hdr = &node->btc_hdr;
+ ASSERT(hdr->bth_core);
+ uint64_t i = zfs_btree_find_parent_idx(tree, prev);
+ if (done_func != NULL)
+ done_func(tree, prev);
+ if (i == hdr->bth_count) {
+ prev = hdr;
+ continue;
+ }
+ out_idx->bti_node = hdr;
+ out_idx->bti_offset = i;
+ out_idx->bti_before = B_FALSE;
+ return (node->btc_elems + i * tree->bt_elem_size);
+ }
+ if (done_func != NULL)
+ done_func(tree, prev);
+ /*
+ * We've traversed all the way up and been at the end of the
+ * node every time, so this was the last element in the tree.
+ */
+ return (NULL);
+ }
+
+ /* If we were before an element in a core node, return that element. */
+ ASSERT(idx->bti_node->bth_core);
+ zfs_btree_core_t *node = (zfs_btree_core_t *)idx->bti_node;
+ if (idx->bti_before) {
+ out_idx->bti_before = B_FALSE;
+ return (node->btc_elems + offset * tree->bt_elem_size);
+ }
+
+ /*
+ * The next element from one in a core node is the first element in
+ * the subtree just to the right of the separator.
+ */
+ zfs_btree_hdr_t *child = node->btc_children[offset + 1];
+ return (zfs_btree_first_helper(child, out_idx));
+}
+
+/*
+ * Return the next element in the tree. The same address can be safely
+ * passed for idx and out_idx.
+ */
+void *
+zfs_btree_next(zfs_btree_t *tree, const zfs_btree_index_t *idx,
+ zfs_btree_index_t *out_idx)
+{
+ return (zfs_btree_next_helper(tree, idx, out_idx, NULL));
+}
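+
+/*
+ * An illustrative iteration sketch (my_tree, my_elem_t, and do_something are
+ * hypothetical); the same index can be reused for each step:
+ *
+ *	zfs_btree_index_t where;
+ *	my_elem_t *e;
+ *
+ *	for (e = zfs_btree_first(&my_tree, &where); e != NULL;
+ *	    e = zfs_btree_next(&my_tree, &where, &where))
+ *		do_something(e);
+ */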
+
+/*
+ * Return the previous element in the tree. The same address can be safely
+ * passed for idx and out_idx.
+ */
+void *
+zfs_btree_prev(zfs_btree_t *tree, const zfs_btree_index_t *idx,
+ zfs_btree_index_t *out_idx)
+{
+ if (idx->bti_node == NULL) {
+ ASSERT3S(tree->bt_height, ==, -1);
+ return (NULL);
+ }
+
+ uint64_t offset = idx->bti_offset;
+ if (!idx->bti_node->bth_core) {
+ /*
+ * When finding the previous element of an element in a leaf,
+		 * there are two cases. If the element isn't the first one in
+		 * the leaf, we just return the previous element in the leaf.
+		 * Otherwise, we need to traverse up our parents until we find
+		 * an ancestor that isn't the first child of its parent. Once
+		 * we do, the previous element is the separator immediately to
+		 * the left of that ancestor in its parent.
+ */
+ zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)idx->bti_node;
+ if (offset != 0) {
+ out_idx->bti_node = &leaf->btl_hdr;
+ out_idx->bti_offset = offset - 1;
+ out_idx->bti_before = B_FALSE;
+ return (leaf->btl_elems + (offset - 1) *
+ tree->bt_elem_size);
+ }
+ zfs_btree_hdr_t *prev = &leaf->btl_hdr;
+ for (zfs_btree_core_t *node = leaf->btl_hdr.bth_parent;
+ node != NULL; node = node->btc_hdr.bth_parent) {
+ zfs_btree_hdr_t *hdr = &node->btc_hdr;
+ ASSERT(hdr->bth_core);
+ uint64_t i = zfs_btree_find_parent_idx(tree, prev);
+ if (i == 0) {
+ prev = hdr;
+ continue;
+ }
+ out_idx->bti_node = hdr;
+ out_idx->bti_offset = i - 1;
+ out_idx->bti_before = B_FALSE;
+ return (node->btc_elems + (i - 1) * tree->bt_elem_size);
+ }
+ /*
+ * We've traversed all the way up and been at the start of the
+ * node every time, so this was the first node in the tree.
+ */
+ return (NULL);
+ }
+
+ /*
+ * The previous element from one in a core node is the last element in
+ * the subtree just to the left of the separator.
+ */
+ ASSERT(idx->bti_node->bth_core);
+ zfs_btree_core_t *node = (zfs_btree_core_t *)idx->bti_node;
+ zfs_btree_hdr_t *child = node->btc_children[offset];
+ return (zfs_btree_last_helper(tree, child, out_idx));
+}
+
+/*
+ * Get the value at the provided index in the tree.
+ *
+ * Note that the value returned from this function can be mutated, but only
+ * if it will not change the ordering of the element with respect to any other
+ * elements that could be in the tree.
+ */
+void *
+zfs_btree_get(zfs_btree_t *tree, zfs_btree_index_t *idx)
+{
+ ASSERT(!idx->bti_before);
+ if (!idx->bti_node->bth_core) {
+ zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)idx->bti_node;
+ return (leaf->btl_elems + idx->bti_offset * tree->bt_elem_size);
+ }
+ ASSERT(idx->bti_node->bth_core);
+ zfs_btree_core_t *node = (zfs_btree_core_t *)idx->bti_node;
+ return (node->btc_elems + idx->bti_offset * tree->bt_elem_size);
+}
+
+/* Add the given value to the tree. Must not already be in the tree. */
+void
+zfs_btree_add(zfs_btree_t *tree, const void *node)
+{
+ zfs_btree_index_t where = {0};
+ VERIFY3P(zfs_btree_find(tree, node, &where), ==, NULL);
+ zfs_btree_add_idx(tree, node, &where);
+}
+
+/* Helper function to free a tree node. */
+static void
+zfs_btree_node_destroy(zfs_btree_t *tree, zfs_btree_hdr_t *node)
+{
+ tree->bt_num_nodes--;
+ if (!node->bth_core) {
+ kmem_cache_free(zfs_btree_leaf_cache, node);
+ } else {
+ kmem_free(node, sizeof (zfs_btree_core_t) +
+ BTREE_CORE_ELEMS * tree->bt_elem_size);
+ }
+}
+
+/*
+ * Remove the rm_hdr and the separator to its left from the parent node. The
+ * buffer that rm_hdr was stored in may already be freed, so its contents
+ * cannot be accessed.
+ */
+static void
+zfs_btree_remove_from_node(zfs_btree_t *tree, zfs_btree_core_t *node,
+ zfs_btree_hdr_t *rm_hdr)
+{
+ size_t size = tree->bt_elem_size;
+ uint64_t min_count = (BTREE_CORE_ELEMS / 2) - 1;
+ zfs_btree_hdr_t *hdr = &node->btc_hdr;
+ /*
+ * If the node is the root node and rm_hdr is one of two children,
+ * promote the other child to the root.
+ */
+ if (hdr->bth_parent == NULL && hdr->bth_count <= 1) {
+ ASSERT3U(hdr->bth_count, ==, 1);
+ ASSERT3P(tree->bt_root, ==, node);
+ ASSERT3P(node->btc_children[1], ==, rm_hdr);
+ tree->bt_root = node->btc_children[0];
+ node->btc_children[0]->bth_parent = NULL;
+ zfs_btree_node_destroy(tree, hdr);
+ tree->bt_height--;
+ return;
+ }
+
+ uint64_t idx;
+ for (idx = 0; idx <= hdr->bth_count; idx++) {
+ if (node->btc_children[idx] == rm_hdr)
+ break;
+ }
+ ASSERT3U(idx, <=, hdr->bth_count);
+
+ /*
+ * If the node is the root or it has more than the minimum number of
+ * children, just remove the child and separator, and return.
+ */
+ if (hdr->bth_parent == NULL ||
+ hdr->bth_count > min_count) {
+ /*
+ * Shift the element and children to the right of rm_hdr to
+ * the left by one spot.
+ */
+ bt_shift_core_left(tree, node, idx, hdr->bth_count - idx,
+ BSS_PARALLELOGRAM);
+ hdr->bth_count--;
+ zfs_btree_poison_node_at(tree, hdr, hdr->bth_count);
+ return;
+ }
+
+ ASSERT3U(hdr->bth_count, ==, min_count);
+
+ /*
+ * Now we try to take a node from a neighbor. We check left, then
+ * right. If the neighbor exists and has more than the minimum number
+ * of elements, we move the separator between us and them to our
+ * node, move their closest element (last for left, first for right)
+ * to the separator, and move their closest child to our node. Along
+ * the way we need to collapse the gap made by idx, and (for our right
+ * neighbor) the gap made by removing their first element and child.
+ *
+ * Note: this logic currently doesn't support taking from a neighbor
+ * that isn't a sibling (i.e. a neighbor with a different
+ * parent). This isn't critical functionality, but may be worth
+ * implementing in the future for completeness' sake.
+ */
+ zfs_btree_core_t *parent = hdr->bth_parent;
+ uint64_t parent_idx = zfs_btree_find_parent_idx(tree, hdr);
+
+ zfs_btree_hdr_t *l_hdr = (parent_idx == 0 ? NULL :
+ parent->btc_children[parent_idx - 1]);
+ if (l_hdr != NULL && l_hdr->bth_count > min_count) {
+ /* We can take a node from the left neighbor. */
+ ASSERT(l_hdr->bth_core);
+ zfs_btree_core_t *neighbor = (zfs_btree_core_t *)l_hdr;
+
+ /*
+ * Start by shifting the elements and children in the current
+ * node to the right by one spot.
+ */
+ bt_shift_core_right(tree, node, 0, idx - 1, BSS_TRAPEZOID);
+
+ /*
+ * Move the separator between node and neighbor to the first
+ * element slot in the current node.
+ */
+ uint8_t *separator = parent->btc_elems + (parent_idx - 1) *
+ size;
+ bmov(separator, node->btc_elems, size);
+
+ /* Move the last child of neighbor to our first child slot. */
+ zfs_btree_hdr_t **take_child = neighbor->btc_children +
+ l_hdr->bth_count;
+ bmov(take_child, node->btc_children, sizeof (*take_child));
+ node->btc_children[0]->bth_parent = node;
+
+ /* Move the last element of neighbor to the separator spot. */
+ uint8_t *take_elem = neighbor->btc_elems +
+ (l_hdr->bth_count - 1) * size;
+ bmov(take_elem, separator, size);
+ l_hdr->bth_count--;
+ zfs_btree_poison_node_at(tree, l_hdr, l_hdr->bth_count);
+ return;
+ }
+
+ zfs_btree_hdr_t *r_hdr = (parent_idx == parent->btc_hdr.bth_count ?
+ NULL : parent->btc_children[parent_idx + 1]);
+ if (r_hdr != NULL && r_hdr->bth_count > min_count) {
+ /* We can take a node from the right neighbor. */
+ ASSERT(r_hdr->bth_core);
+ zfs_btree_core_t *neighbor = (zfs_btree_core_t *)r_hdr;
+
+ /*
+ * Shift elements in node left by one spot to overwrite rm_hdr
+ * and the separator before it.
+ */
+ bt_shift_core_left(tree, node, idx, hdr->bth_count - idx,
+ BSS_PARALLELOGRAM);
+
+ /*
+ * Move the separator between node and neighbor to the last
+ * element spot in node.
+ */
+ uint8_t *separator = parent->btc_elems + parent_idx * size;
+ bmov(separator, node->btc_elems + (hdr->bth_count - 1) * size,
+ size);
+
+ /*
+ * Move the first child of neighbor to the last child spot in
+ * node.
+ */
+ zfs_btree_hdr_t **take_child = neighbor->btc_children;
+ bmov(take_child, node->btc_children + hdr->bth_count,
+ sizeof (*take_child));
+ node->btc_children[hdr->bth_count]->bth_parent = node;
+
+ /* Move the first element of neighbor to the separator spot. */
+ uint8_t *take_elem = neighbor->btc_elems;
+ bmov(take_elem, separator, size);
+ r_hdr->bth_count--;
+
+ /*
+ * Shift the elements and children of neighbor to cover the
+ * stolen elements.
+ */
+ bt_shift_core_left(tree, neighbor, 1, r_hdr->bth_count,
+ BSS_TRAPEZOID);
+ zfs_btree_poison_node_at(tree, r_hdr, r_hdr->bth_count);
+ return;
+ }
+
+ /*
+ * In this case, neither of our neighbors can spare an element, so we
+ * need to merge with one of them. We prefer the left one,
+ * arbitrarily. Move the separator into the leftmost merging node
+ * (which may be us or the left neighbor), and then move the right
+ * merging node's elements. Once that's done, we go back and delete
+ * the element we're removing. Finally, go into the parent and delete
+ * the right merging node and the separator. This may cause further
+ * merging.
+ */
+ zfs_btree_hdr_t *new_rm_hdr, *keep_hdr;
+ uint64_t new_idx = idx;
+ if (l_hdr != NULL) {
+ keep_hdr = l_hdr;
+ new_rm_hdr = hdr;
+ new_idx += keep_hdr->bth_count + 1;
+ } else {
+ ASSERT3P(r_hdr, !=, NULL);
+ keep_hdr = hdr;
+ new_rm_hdr = r_hdr;
+ parent_idx++;
+ }
+
+ ASSERT(keep_hdr->bth_core);
+ ASSERT(new_rm_hdr->bth_core);
+
+ zfs_btree_core_t *keep = (zfs_btree_core_t *)keep_hdr;
+ zfs_btree_core_t *rm = (zfs_btree_core_t *)new_rm_hdr;
+
+ if (zfs_btree_verify_intensity >= 5) {
+ for (int i = 0; i < new_rm_hdr->bth_count + 1; i++) {
+ zfs_btree_verify_poison_at(tree, keep_hdr,
+ keep_hdr->bth_count + i);
+ }
+ }
+
+ /* Move the separator into the left node. */
+ uint8_t *e_out = keep->btc_elems + keep_hdr->bth_count * size;
+ uint8_t *separator = parent->btc_elems + (parent_idx - 1) *
+ size;
+ bmov(separator, e_out, size);
+ keep_hdr->bth_count++;
+
+ /* Move all our elements and children into the left node. */
+ bt_transfer_core(tree, rm, 0, new_rm_hdr->bth_count, keep,
+ keep_hdr->bth_count, BSS_TRAPEZOID);
+
+ uint64_t old_count = keep_hdr->bth_count;
+
+ /* Update bookkeeping */
+ keep_hdr->bth_count += new_rm_hdr->bth_count;
+ ASSERT3U(keep_hdr->bth_count, ==, (min_count * 2) + 1);
+
+ /*
+ * Shift the element and children to the right of rm_hdr to
+ * the left by one spot.
+ */
+ ASSERT3P(keep->btc_children[new_idx], ==, rm_hdr);
+ bt_shift_core_left(tree, keep, new_idx, keep_hdr->bth_count - new_idx,
+ BSS_PARALLELOGRAM);
+ keep_hdr->bth_count--;
+
+ /* Reparent all our children to point to the left node. */
+ zfs_btree_hdr_t **new_start = keep->btc_children +
+ old_count - 1;
+ for (int i = 0; i < new_rm_hdr->bth_count + 1; i++)
+ new_start[i]->bth_parent = keep;
+ for (int i = 0; i <= keep_hdr->bth_count; i++) {
+ ASSERT3P(keep->btc_children[i]->bth_parent, ==, keep);
+ ASSERT3P(keep->btc_children[i], !=, rm_hdr);
+ }
+ zfs_btree_poison_node_at(tree, keep_hdr, keep_hdr->bth_count);
+
+ new_rm_hdr->bth_count = 0;
+ zfs_btree_node_destroy(tree, new_rm_hdr);
+ zfs_btree_remove_from_node(tree, parent, new_rm_hdr);
+}
+
+/* Remove the element at the specific location. */
+void
+zfs_btree_remove_idx(zfs_btree_t *tree, zfs_btree_index_t *where)
+{
+ size_t size = tree->bt_elem_size;
+ zfs_btree_hdr_t *hdr = where->bti_node;
+ uint64_t idx = where->bti_offset;
+ uint64_t capacity = P2ALIGN((BTREE_LEAF_SIZE -
+ sizeof (zfs_btree_hdr_t)) / size, 2);
+
+ ASSERT(!where->bti_before);
+ if (tree->bt_bulk != NULL) {
+ /*
+ * Leave bulk insert mode. Note that our index would be
+ * invalid after we correct the tree, so we copy the value
+ * we're planning to remove and find it again after
+ * bulk_finish.
+ */
+ uint8_t *value = zfs_btree_get(tree, where);
+ uint8_t *tmp = kmem_alloc(size, KM_SLEEP);
+ bmov(value, tmp, size);
+ zfs_btree_bulk_finish(tree);
+ VERIFY3P(zfs_btree_find(tree, tmp, where), !=, NULL);
+ kmem_free(tmp, size);
+ hdr = where->bti_node;
+ idx = where->bti_offset;
+ }
+
+ tree->bt_num_elems--;
+ /*
+ * If the element happens to be in a core node, we move a leaf node's
+	 * element into its place and then remove the leaf node element. This
+	 * way the rebalance logic only needs to work upwards from a leaf,
+	 * rather than recursing both upwards and downwards.
+ */
+ if (hdr->bth_core) {
+ zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
+ zfs_btree_hdr_t *left_subtree = node->btc_children[idx];
+ void *new_value = zfs_btree_last_helper(tree, left_subtree,
+ where);
+ ASSERT3P(new_value, !=, NULL);
+
+ bmov(new_value, node->btc_elems + idx * size, size);
+
+ hdr = where->bti_node;
+ idx = where->bti_offset;
+ ASSERT(!where->bti_before);
+ }
+
+ /*
+ * First, we'll update the leaf's metadata. Then, we shift any
+ * elements after the idx to the left. After that, we rebalance if
+ * needed.
+ */
+ ASSERT(!hdr->bth_core);
+ zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr;
+ ASSERT3U(hdr->bth_count, >, 0);
+
+ uint64_t min_count = (capacity / 2) - 1;
+
+ /*
+ * If we're over the minimum size or this is the root, just overwrite
+ * the value and return.
+ */
+ if (hdr->bth_count > min_count || hdr->bth_parent == NULL) {
+ hdr->bth_count--;
+ bt_shift_leaf_left(tree, leaf, idx + 1, hdr->bth_count - idx);
+ if (hdr->bth_parent == NULL) {
+ ASSERT0(tree->bt_height);
+ if (hdr->bth_count == 0) {
+ tree->bt_root = NULL;
+ tree->bt_height--;
+ zfs_btree_node_destroy(tree, &leaf->btl_hdr);
+ }
+ }
+ if (tree->bt_root != NULL)
+ zfs_btree_poison_node_at(tree, hdr, hdr->bth_count);
+ zfs_btree_verify(tree);
+ return;
+ }
+ ASSERT3U(hdr->bth_count, ==, min_count);
+
+ /*
+ * Now we try to take a node from a sibling. We check left, then
+ * right. If they exist and have more than the minimum number of
+ * elements, we move the separator between us and them to our node
+ * and move their closest element (last for left, first for right) to
+ * the separator. Along the way we need to collapse the gap made by
+ * idx, and (for our right neighbor) the gap made by removing their
+ * first element.
+ *
+ * Note: this logic currently doesn't support taking from a neighbor
+ * that isn't a sibling. This isn't critical functionality, but may be
+ * worth implementing in the future for completeness' sake.
+ */
+ zfs_btree_core_t *parent = hdr->bth_parent;
+ uint64_t parent_idx = zfs_btree_find_parent_idx(tree, hdr);
+
+ zfs_btree_hdr_t *l_hdr = (parent_idx == 0 ? NULL :
+ parent->btc_children[parent_idx - 1]);
+ if (l_hdr != NULL && l_hdr->bth_count > min_count) {
+ /* We can take a node from the left neighbor. */
+ ASSERT(!l_hdr->bth_core);
+
+ /*
+ * Move our elements back by one spot to make room for the
+ * stolen element and overwrite the element being removed.
+ */
+ bt_shift_leaf_right(tree, leaf, 0, idx);
+ uint8_t *separator = parent->btc_elems + (parent_idx - 1) *
+ size;
+ uint8_t *take_elem = ((zfs_btree_leaf_t *)l_hdr)->btl_elems +
+ (l_hdr->bth_count - 1) * size;
+ /* Move the separator to our first spot. */
+ bmov(separator, leaf->btl_elems, size);
+
+ /* Move our neighbor's last element to the separator. */
+ bmov(take_elem, separator, size);
+
+ /* Update the bookkeeping. */
+ l_hdr->bth_count--;
+ zfs_btree_poison_node_at(tree, l_hdr, l_hdr->bth_count);
+
+ zfs_btree_verify(tree);
+ return;
+ }
+
+ zfs_btree_hdr_t *r_hdr = (parent_idx == parent->btc_hdr.bth_count ?
+ NULL : parent->btc_children[parent_idx + 1]);
+ if (r_hdr != NULL && r_hdr->bth_count > min_count) {
+ /* We can take a node from the right neighbor. */
+ ASSERT(!r_hdr->bth_core);
+ zfs_btree_leaf_t *neighbor = (zfs_btree_leaf_t *)r_hdr;
+
+ /*
+ * Move our elements after the element being removed forwards
+ * by one spot to make room for the stolen element and
+ * overwrite the element being removed.
+ */
+ bt_shift_leaf_left(tree, leaf, idx + 1, hdr->bth_count - idx -
+ 1);
+
+ uint8_t *separator = parent->btc_elems + parent_idx * size;
+ uint8_t *take_elem = ((zfs_btree_leaf_t *)r_hdr)->btl_elems;
+		/* Move the separator to our last spot. */
+ bmov(separator, leaf->btl_elems + (hdr->bth_count - 1) * size,
+ size);
+
+ /* Move our neighbor's first element to the separator. */
+ bmov(take_elem, separator, size);
+
+ /* Update the bookkeeping. */
+ r_hdr->bth_count--;
+
+ /*
+		 * Move our neighbor's elements forwards to overwrite the
+ * stolen element.
+ */
+ bt_shift_leaf_left(tree, neighbor, 1, r_hdr->bth_count);
+ zfs_btree_poison_node_at(tree, r_hdr, r_hdr->bth_count);
+ zfs_btree_verify(tree);
+ return;
+ }
+
+ /*
+ * In this case, neither of our neighbors can spare an element, so we
+ * need to merge with one of them. We prefer the left one,
+ * arbitrarily. Move the separator into the leftmost merging node
+ * (which may be us or the left neighbor), and then move the right
+ * merging node's elements. Once that's done, we go back and delete
+ * the element we're removing. Finally, go into the parent and delete
+ * the right merging node and the separator. This may cause further
+ * merging.
+ */
+ zfs_btree_hdr_t *rm_hdr, *keep_hdr;
+ uint64_t new_idx = idx;
+ if (l_hdr != NULL) {
+ keep_hdr = l_hdr;
+ rm_hdr = hdr;
+		new_idx += keep_hdr->bth_count + 1;
+ } else {
+ ASSERT3P(r_hdr, !=, NULL);
+ keep_hdr = hdr;
+ rm_hdr = r_hdr;
+ parent_idx++;
+ }
+
+ ASSERT(!keep_hdr->bth_core);
+ ASSERT(!rm_hdr->bth_core);
+ ASSERT3U(keep_hdr->bth_count, ==, min_count);
+ ASSERT3U(rm_hdr->bth_count, ==, min_count);
+
+ zfs_btree_leaf_t *keep = (zfs_btree_leaf_t *)keep_hdr;
+ zfs_btree_leaf_t *rm = (zfs_btree_leaf_t *)rm_hdr;
+
+ if (zfs_btree_verify_intensity >= 5) {
+ for (int i = 0; i < rm_hdr->bth_count + 1; i++) {
+ zfs_btree_verify_poison_at(tree, keep_hdr,
+ keep_hdr->bth_count + i);
+ }
+ }
+ /*
+ * Move the separator into the first open spot in the left
+ * neighbor.
+ */
+ uint8_t *out = keep->btl_elems + keep_hdr->bth_count * size;
+ uint8_t *separator = parent->btc_elems + (parent_idx - 1) *
+ size;
+ bmov(separator, out, size);
+ keep_hdr->bth_count++;
+
+ /* Move our elements to the left neighbor. */
+ bt_transfer_leaf(tree, rm, 0, rm_hdr->bth_count, keep,
+ keep_hdr->bth_count);
+
+ /* Update the bookkeeping. */
+ keep_hdr->bth_count += rm_hdr->bth_count;
+ ASSERT3U(keep_hdr->bth_count, ==, min_count * 2 + 1);
+
+ /* Remove the value from the node */
+ keep_hdr->bth_count--;
+ bt_shift_leaf_left(tree, keep, new_idx + 1, keep_hdr->bth_count -
+ new_idx);
+ zfs_btree_poison_node_at(tree, keep_hdr, keep_hdr->bth_count);
+
+ rm_hdr->bth_count = 0;
+ zfs_btree_node_destroy(tree, rm_hdr);
+ /* Remove the emptied node from the parent. */
+ zfs_btree_remove_from_node(tree, parent, rm_hdr);
+ zfs_btree_verify(tree);
+}
+
+/* Remove the given value from the tree. */
+void
+zfs_btree_remove(zfs_btree_t *tree, const void *value)
+{
+ zfs_btree_index_t where = {0};
+ VERIFY3P(zfs_btree_find(tree, value, &where), !=, NULL);
+ zfs_btree_remove_idx(tree, &where);
+}
+
+/* Return the number of elements in the tree. */
+ulong_t
+zfs_btree_numnodes(zfs_btree_t *tree)
+{
+ return (tree->bt_num_elems);
+}
+
+/*
+ * This function is used to visit all the elements in the tree before
+ * destroying the tree. This allows the calling code to perform any cleanup it
+ * needs to do. This is more efficient than just removing the first element
+ * over and over, because it avoids all rebalancing. Once the destroy_nodes()
+ * function has been called, no other btree operations are valid until it
+ * returns NULL, at which point the only valid operation is
+ * zfs_btree_destroy().
+ *
+ * example:
+ *
+ * zfs_btree_index_t *cookie = NULL;
+ * my_data_t *node;
+ *
+ * while ((node = zfs_btree_destroy_nodes(tree, &cookie)) != NULL)
+ * free(node->ptr);
+ * zfs_btree_destroy(tree);
+ *
+ */
+void *
+zfs_btree_destroy_nodes(zfs_btree_t *tree, zfs_btree_index_t **cookie)
+{
+ if (*cookie == NULL) {
+ if (tree->bt_height == -1)
+ return (NULL);
+ *cookie = kmem_alloc(sizeof (**cookie), KM_SLEEP);
+ return (zfs_btree_first(tree, *cookie));
+ }
+
+ void *rval = zfs_btree_next_helper(tree, *cookie, *cookie,
+ zfs_btree_node_destroy);
+ if (rval == NULL) {
+ tree->bt_root = NULL;
+ tree->bt_height = -1;
+ tree->bt_num_elems = 0;
+ kmem_free(*cookie, sizeof (**cookie));
+ tree->bt_bulk = NULL;
+ }
+ return (rval);
+}
+
+static void
+zfs_btree_clear_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr)
+{
+ if (hdr->bth_core) {
+ zfs_btree_core_t *btc = (zfs_btree_core_t *)hdr;
+ for (int i = 0; i <= hdr->bth_count; i++) {
+ zfs_btree_clear_helper(tree, btc->btc_children[i]);
+ }
+ }
+
+ zfs_btree_node_destroy(tree, hdr);
+}
+
+void
+zfs_btree_clear(zfs_btree_t *tree)
+{
+ if (tree->bt_root == NULL) {
+ ASSERT0(tree->bt_num_elems);
+ return;
+ }
+
+ zfs_btree_clear_helper(tree, tree->bt_root);
+ tree->bt_num_elems = 0;
+ tree->bt_root = NULL;
+ tree->bt_num_nodes = 0;
+ tree->bt_height = -1;
+ tree->bt_bulk = NULL;
+}
+
+void
+zfs_btree_destroy(zfs_btree_t *tree)
+{
+ ASSERT0(tree->bt_num_elems);
+ ASSERT3P(tree->bt_root, ==, NULL);
+}
+
+/* Verify that every child of this node has the correct parent pointer. */
+static void
+zfs_btree_verify_pointers_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr)
+{
+ if (!hdr->bth_core)
+ return;
+
+ zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
+ for (int i = 0; i <= hdr->bth_count; i++) {
+ VERIFY3P(node->btc_children[i]->bth_parent, ==, hdr);
+ zfs_btree_verify_pointers_helper(tree, node->btc_children[i]);
+ }
+}
+
+/* Verify that every node has the correct parent pointer. */
+static void
+zfs_btree_verify_pointers(zfs_btree_t *tree)
+{
+ if (tree->bt_height == -1) {
+ VERIFY3P(tree->bt_root, ==, NULL);
+ return;
+ }
+ VERIFY3P(tree->bt_root->bth_parent, ==, NULL);
+ zfs_btree_verify_pointers_helper(tree, tree->bt_root);
+}
+
+/*
+ * Verify that the current node and all of its children satisfy the count
+ * invariants, and return the total count in the subtree rooted in this node.
+ */
+static uint64_t
+zfs_btree_verify_counts_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr)
+{
+ if (!hdr->bth_core) {
+ if (tree->bt_root != hdr && hdr != &tree->bt_bulk->btl_hdr) {
+ uint64_t capacity = P2ALIGN((BTREE_LEAF_SIZE -
+ sizeof (zfs_btree_hdr_t)) / tree->bt_elem_size, 2);
+ VERIFY3U(hdr->bth_count, >=, (capacity / 2) - 1);
+ }
+
+ return (hdr->bth_count);
+ } else {
+
+ zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
+ uint64_t ret = hdr->bth_count;
+ if (tree->bt_root != hdr && tree->bt_bulk == NULL)
+ VERIFY3P(hdr->bth_count, >=, BTREE_CORE_ELEMS / 2 - 1);
+ for (int i = 0; i <= hdr->bth_count; i++) {
+ ret += zfs_btree_verify_counts_helper(tree,
+ node->btc_children[i]);
+ }
+
+ return (ret);
+ }
+}
+
+/*
+ * Verify that all nodes satisfy the invariants and that the total number of
+ * elements is correct.
+ */
+static void
+zfs_btree_verify_counts(zfs_btree_t *tree)
+{
+ EQUIV(tree->bt_num_elems == 0, tree->bt_height == -1);
+ if (tree->bt_height == -1) {
+ return;
+ }
+ VERIFY3P(zfs_btree_verify_counts_helper(tree, tree->bt_root), ==,
+ tree->bt_num_elems);
+}
+
+/*
+ * Check that the subtree rooted at this node has a uniform height. Returns
+ * the number of nodes under this node, to help verify bt_num_nodes.
+ */
+static uint64_t
+zfs_btree_verify_height_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr,
+ int64_t height)
+{
+ if (!hdr->bth_core) {
+ VERIFY0(height);
+ return (1);
+ }
+
+ VERIFY(hdr->bth_core);
+ zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
+ uint64_t ret = 1;
+ for (int i = 0; i <= hdr->bth_count; i++) {
+ ret += zfs_btree_verify_height_helper(tree,
+ node->btc_children[i], height - 1);
+ }
+ return (ret);
+}
+
+/*
+ * Check that the entire tree has a uniform height, and that the
+ * bt_height in the tree is correct.
+ */
+static void
+zfs_btree_verify_height(zfs_btree_t *tree)
+{
+ EQUIV(tree->bt_height == -1, tree->bt_root == NULL);
+ if (tree->bt_height == -1) {
+ return;
+ }
+
+ VERIFY3U(zfs_btree_verify_height_helper(tree, tree->bt_root,
+ tree->bt_height), ==, tree->bt_num_nodes);
+}
+
+/*
+ * Check that the elements in this node are sorted, and that if this is a core
+ * node, the separators are properly between the subtrees they separate and
+ * that the children also satisfy this requirement.
+ */
+static void
+zfs_btree_verify_order_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr)
+{
+ size_t size = tree->bt_elem_size;
+ if (!hdr->bth_core) {
+ zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr;
+ for (int i = 1; i < hdr->bth_count; i++) {
+ VERIFY3S(tree->bt_compar(leaf->btl_elems + (i - 1) *
+ size, leaf->btl_elems + i * size), ==, -1);
+ }
+ return;
+ }
+
+ zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
+ for (int i = 1; i < hdr->bth_count; i++) {
+ VERIFY3S(tree->bt_compar(node->btc_elems + (i - 1) * size,
+ node->btc_elems + i * size), ==, -1);
+ }
+ for (int i = 0; i < hdr->bth_count; i++) {
+ uint8_t *left_child_last = NULL;
+ zfs_btree_hdr_t *left_child_hdr = node->btc_children[i];
+ if (left_child_hdr->bth_core) {
+ zfs_btree_core_t *left_child =
+ (zfs_btree_core_t *)left_child_hdr;
+ left_child_last = left_child->btc_elems +
+ (left_child_hdr->bth_count - 1) * size;
+ } else {
+ zfs_btree_leaf_t *left_child =
+ (zfs_btree_leaf_t *)left_child_hdr;
+ left_child_last = left_child->btl_elems +
+ (left_child_hdr->bth_count - 1) * size;
+ }
+ if (tree->bt_compar(node->btc_elems + i * size,
+ left_child_last) != 1) {
+ panic("btree: compar returned %d (expected 1) at "
+ "%px %d: compar(%px, %px)", tree->bt_compar(
+ node->btc_elems + i * size, left_child_last),
+ (void *)node, i, (void *)(node->btc_elems + i *
+ size), (void *)left_child_last);
+ }
+
+ uint8_t *right_child_first = NULL;
+ zfs_btree_hdr_t *right_child_hdr = node->btc_children[i + 1];
+ if (right_child_hdr->bth_core) {
+ zfs_btree_core_t *right_child =
+ (zfs_btree_core_t *)right_child_hdr;
+ right_child_first = right_child->btc_elems;
+ } else {
+ zfs_btree_leaf_t *right_child =
+ (zfs_btree_leaf_t *)right_child_hdr;
+ right_child_first = right_child->btl_elems;
+ }
+ if (tree->bt_compar(node->btc_elems + i * size,
+ right_child_first) != -1) {
+ panic("btree: compar returned %d (expected -1) at "
+ "%px %d: compar(%px, %px)", tree->bt_compar(
+ node->btc_elems + i * size, right_child_first),
+ (void *)node, i, (void *)(node->btc_elems + i *
+ size), (void *)right_child_first);
+ }
+ }
+ for (int i = 0; i <= hdr->bth_count; i++) {
+ zfs_btree_verify_order_helper(tree, node->btc_children[i]);
+ }
+}
+
+/* Check that all elements in the tree are in sorted order. */
+static void
+zfs_btree_verify_order(zfs_btree_t *tree)
+{
+ EQUIV(tree->bt_height == -1, tree->bt_root == NULL);
+ if (tree->bt_height == -1) {
+ return;
+ }
+
+ zfs_btree_verify_order_helper(tree, tree->bt_root);
+}
+
+#ifdef ZFS_DEBUG
+/* Check that all unused memory is poisoned correctly. */
+static void
+zfs_btree_verify_poison_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr)
+{
+ size_t size = tree->bt_elem_size;
+ if (!hdr->bth_core) {
+ zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr;
+ uint8_t val = 0x0f;
+ for (int i = hdr->bth_count * size; i < BTREE_LEAF_SIZE -
+ sizeof (zfs_btree_hdr_t); i++) {
+ VERIFY3U(leaf->btl_elems[i], ==, val);
+ }
+ } else {
+ zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
+ uint8_t val = 0x0f;
+ for (int i = hdr->bth_count * size; i < BTREE_CORE_ELEMS * size;
+ i++) {
+ VERIFY3U(node->btc_elems[i], ==, val);
+ }
+
+ for (int i = hdr->bth_count + 1; i <= BTREE_CORE_ELEMS; i++) {
+ VERIFY3P(node->btc_children[i], ==,
+ (zfs_btree_hdr_t *)BTREE_POISON);
+ }
+
+ for (int i = 0; i <= hdr->bth_count; i++) {
+ zfs_btree_verify_poison_helper(tree,
+ node->btc_children[i]);
+ }
+ }
+}
+#endif
+
+/* Check that unused memory in the tree is still poisoned. */
+static void
+zfs_btree_verify_poison(zfs_btree_t *tree)
+{
+#ifdef ZFS_DEBUG
+ if (tree->bt_height == -1)
+ return;
+ zfs_btree_verify_poison_helper(tree, tree->bt_root);
+#endif
+}
+
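+/*
+ * Run the verification passes selected by zfs_btree_verify_intensity. The
+ * checks are cumulative: intensity 1 verifies the tree height, 2 adds
+ * parent/child pointer checks, 3 adds the element-count invariants, 4 adds
+ * element ordering, and 5 (and above) also verifies that unused memory is
+ * still poisoned.
+ */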
+void
+zfs_btree_verify(zfs_btree_t *tree)
+{
+ if (zfs_btree_verify_intensity == 0)
+ return;
+ zfs_btree_verify_height(tree);
+ if (zfs_btree_verify_intensity == 1)
+ return;
+ zfs_btree_verify_pointers(tree);
+ if (zfs_btree_verify_intensity == 2)
+ return;
+ zfs_btree_verify_counts(tree);
+ if (zfs_btree_verify_intensity == 3)
+ return;
+ zfs_btree_verify_order(tree);
+
+ if (zfs_btree_verify_intensity == 4)
+ return;
+ zfs_btree_verify_poison(tree);
+}
diff --git a/sys/contrib/openzfs/module/zfs/dataset_kstats.c b/sys/contrib/openzfs/module/zfs/dataset_kstats.c
new file mode 100644
index 000000000000..e46a0926d557
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/dataset_kstats.c
@@ -0,0 +1,215 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2018 Datto Inc.
+ */
+
+#include <sys/dataset_kstats.h>
+#include <sys/dmu_objset.h>
+#include <sys/dsl_dataset.h>
+#include <sys/spa.h>
+
+static dataset_kstat_values_t empty_dataset_kstats = {
+ { "dataset_name", KSTAT_DATA_STRING },
+ { "writes", KSTAT_DATA_UINT64 },
+ { "nwritten", KSTAT_DATA_UINT64 },
+ { "reads", KSTAT_DATA_UINT64 },
+ { "nread", KSTAT_DATA_UINT64 },
+ { "nunlinks", KSTAT_DATA_UINT64 },
+ { "nunlinked", KSTAT_DATA_UINT64 },
+};
+
+static int
+dataset_kstats_update(kstat_t *ksp, int rw)
+{
+ dataset_kstats_t *dk = ksp->ks_private;
+ ASSERT3P(dk->dk_kstats->ks_data, ==, ksp->ks_data);
+
+ if (rw == KSTAT_WRITE)
+ return (EACCES);
+
+ dataset_kstat_values_t *dkv = dk->dk_kstats->ks_data;
+ dkv->dkv_writes.value.ui64 =
+ aggsum_value(&dk->dk_aggsums.das_writes);
+ dkv->dkv_nwritten.value.ui64 =
+ aggsum_value(&dk->dk_aggsums.das_nwritten);
+ dkv->dkv_reads.value.ui64 =
+ aggsum_value(&dk->dk_aggsums.das_reads);
+ dkv->dkv_nread.value.ui64 =
+ aggsum_value(&dk->dk_aggsums.das_nread);
+ dkv->dkv_nunlinks.value.ui64 =
+ aggsum_value(&dk->dk_aggsums.das_nunlinks);
+ dkv->dkv_nunlinked.value.ui64 =
+ aggsum_value(&dk->dk_aggsums.das_nunlinked);
+
+ return (0);
+}
+
+void
+dataset_kstats_create(dataset_kstats_t *dk, objset_t *objset)
+{
+ /*
+	 * There should not be anything wrong with having kstats for
+	 * snapshots. However, since we are not sure how useful they would
+	 * be, nor how much their memory overhead would matter in a
+	 * filesystem with many snapshots, we skip them for now.
+ */
+ if (dmu_objset_is_snapshot(objset))
+ return;
+
+ /*
+ * At the time of this writing, KSTAT_STRLEN is 255 in Linux,
+ * and the spa_name can theoretically be up to 256 characters.
+	 * In reality, though, the spa_name can be at most 240 characters
+	 * [see the origin directory name check in pool_namecheck()]. Thus,
+	 * the naming scheme for the module name below should not cause
+	 * any truncations. In the event that a truncation does happen,
+	 * due to some future change, we skip creating the kstat and log
+	 * the event.
+ */
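+	/*
+	 * For example (names chosen for illustration only): a dataset in a
+	 * pool called "tank" whose objset id is 0x36 is published under the
+	 * kstat module "zfs/tank" with kstat name "objset-0x36", which on
+	 * Linux typically shows up as /proc/spl/kstat/zfs/tank/objset-0x36.
+	 */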
+ char kstat_module_name[KSTAT_STRLEN];
+ int n = snprintf(kstat_module_name, sizeof (kstat_module_name),
+ "zfs/%s", spa_name(dmu_objset_spa(objset)));
+ if (n < 0) {
+ zfs_dbgmsg("failed to create dataset kstat for objset %lld: "
+ " snprintf() for kstat module name returned %d",
+ (unsigned long long)dmu_objset_id(objset), n);
+ return;
+ } else if (n >= KSTAT_STRLEN) {
+ zfs_dbgmsg("failed to create dataset kstat for objset %lld: "
+ "kstat module name length (%d) exceeds limit (%d)",
+ (unsigned long long)dmu_objset_id(objset),
+ n, KSTAT_STRLEN);
+ return;
+ }
+
+ char kstat_name[KSTAT_STRLEN];
+ n = snprintf(kstat_name, sizeof (kstat_name), "objset-0x%llx",
+ (unsigned long long)dmu_objset_id(objset));
+ if (n < 0) {
+ zfs_dbgmsg("failed to create dataset kstat for objset %lld: "
+ " snprintf() for kstat name returned %d",
+ (unsigned long long)dmu_objset_id(objset), n);
+ return;
+ }
+ ASSERT3U(n, <, KSTAT_STRLEN);
+
+ kstat_t *kstat = kstat_create(kstat_module_name, 0, kstat_name,
+ "dataset", KSTAT_TYPE_NAMED,
+ sizeof (empty_dataset_kstats) / sizeof (kstat_named_t),
+ KSTAT_FLAG_VIRTUAL);
+ if (kstat == NULL)
+ return;
+
+ dataset_kstat_values_t *dk_kstats =
+ kmem_alloc(sizeof (empty_dataset_kstats), KM_SLEEP);
+ bcopy(&empty_dataset_kstats, dk_kstats,
+ sizeof (empty_dataset_kstats));
+
+ char *ds_name = kmem_zalloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
+ dsl_dataset_name(objset->os_dsl_dataset, ds_name);
+ KSTAT_NAMED_STR_PTR(&dk_kstats->dkv_ds_name) = ds_name;
+ KSTAT_NAMED_STR_BUFLEN(&dk_kstats->dkv_ds_name) =
+ ZFS_MAX_DATASET_NAME_LEN;
+
+ kstat->ks_data = dk_kstats;
+ kstat->ks_update = dataset_kstats_update;
+ kstat->ks_private = dk;
+ kstat->ks_data_size += ZFS_MAX_DATASET_NAME_LEN;
+
+ kstat_install(kstat);
+ dk->dk_kstats = kstat;
+
+ aggsum_init(&dk->dk_aggsums.das_writes, 0);
+ aggsum_init(&dk->dk_aggsums.das_nwritten, 0);
+ aggsum_init(&dk->dk_aggsums.das_reads, 0);
+ aggsum_init(&dk->dk_aggsums.das_nread, 0);
+ aggsum_init(&dk->dk_aggsums.das_nunlinks, 0);
+ aggsum_init(&dk->dk_aggsums.das_nunlinked, 0);
+}
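+
+/*
+ * Typical usage (a minimal sketch; "dk" is assumed to be embedded in some
+ * per-dataset structure, and the lifecycle hooks shown are illustrative):
+ *
+ *	dataset_kstats_create(&dk, os);			at objset setup
+ *	dataset_kstats_update_write_kstats(&dk, nwritten);	per write
+ *	dataset_kstats_update_read_kstats(&dk, nread);		per read
+ *	dataset_kstats_destroy(&dk);			at objset teardown
+ */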
+
+void
+dataset_kstats_destroy(dataset_kstats_t *dk)
+{
+ if (dk->dk_kstats == NULL)
+ return;
+
+ dataset_kstat_values_t *dkv = dk->dk_kstats->ks_data;
+ kmem_free(KSTAT_NAMED_STR_PTR(&dkv->dkv_ds_name),
+ KSTAT_NAMED_STR_BUFLEN(&dkv->dkv_ds_name));
+ kmem_free(dkv, sizeof (empty_dataset_kstats));
+
+ kstat_delete(dk->dk_kstats);
+ dk->dk_kstats = NULL;
+
+ aggsum_fini(&dk->dk_aggsums.das_writes);
+ aggsum_fini(&dk->dk_aggsums.das_nwritten);
+ aggsum_fini(&dk->dk_aggsums.das_reads);
+ aggsum_fini(&dk->dk_aggsums.das_nread);
+ aggsum_fini(&dk->dk_aggsums.das_nunlinks);
+ aggsum_fini(&dk->dk_aggsums.das_nunlinked);
+}
+
+void
+dataset_kstats_update_write_kstats(dataset_kstats_t *dk,
+ int64_t nwritten)
+{
+ ASSERT3S(nwritten, >=, 0);
+
+ if (dk->dk_kstats == NULL)
+ return;
+
+ aggsum_add(&dk->dk_aggsums.das_writes, 1);
+ aggsum_add(&dk->dk_aggsums.das_nwritten, nwritten);
+}
+
+void
+dataset_kstats_update_read_kstats(dataset_kstats_t *dk,
+ int64_t nread)
+{
+ ASSERT3S(nread, >=, 0);
+
+ if (dk->dk_kstats == NULL)
+ return;
+
+ aggsum_add(&dk->dk_aggsums.das_reads, 1);
+ aggsum_add(&dk->dk_aggsums.das_nread, nread);
+}
+
+void
+dataset_kstats_update_nunlinks_kstat(dataset_kstats_t *dk, int64_t delta)
+{
+ if (dk->dk_kstats == NULL)
+ return;
+
+ aggsum_add(&dk->dk_aggsums.das_nunlinks, delta);
+}
+
+void
+dataset_kstats_update_nunlinked_kstat(dataset_kstats_t *dk, int64_t delta)
+{
+ if (dk->dk_kstats == NULL)
+ return;
+
+ aggsum_add(&dk->dk_aggsums.das_nunlinked, delta);
+}
diff --git a/sys/contrib/openzfs/module/zfs/dbuf.c b/sys/contrib/openzfs/module/zfs/dbuf.c
new file mode 100644
index 000000000000..a6cdc017cd21
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/dbuf.c
@@ -0,0 +1,4958 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
+ * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright (c) 2019, Klara Inc.
+ * Copyright (c) 2019, Allan Jude
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/arc.h>
+#include <sys/dmu.h>
+#include <sys/dmu_send.h>
+#include <sys/dmu_impl.h>
+#include <sys/dbuf.h>
+#include <sys/dmu_objset.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dmu_tx.h>
+#include <sys/spa.h>
+#include <sys/zio.h>
+#include <sys/dmu_zfetch.h>
+#include <sys/sa.h>
+#include <sys/sa_impl.h>
+#include <sys/zfeature.h>
+#include <sys/blkptr.h>
+#include <sys/range_tree.h>
+#include <sys/trace_zfs.h>
+#include <sys/callb.h>
+#include <sys/abd.h>
+#include <sys/vdev.h>
+#include <cityhash.h>
+#include <sys/spa_impl.h>
+
+kstat_t *dbuf_ksp;
+
+typedef struct dbuf_stats {
+ /*
+ * Various statistics about the size of the dbuf cache.
+ */
+ kstat_named_t cache_count;
+ kstat_named_t cache_size_bytes;
+ kstat_named_t cache_size_bytes_max;
+ /*
+ * Statistics regarding the bounds on the dbuf cache size.
+ */
+ kstat_named_t cache_target_bytes;
+ kstat_named_t cache_lowater_bytes;
+ kstat_named_t cache_hiwater_bytes;
+ /*
+ * Total number of dbuf cache evictions that have occurred.
+ */
+ kstat_named_t cache_total_evicts;
+ /*
+ * The distribution of dbuf levels in the dbuf cache and
+ * the total size of all dbufs at each level.
+ */
+ kstat_named_t cache_levels[DN_MAX_LEVELS];
+ kstat_named_t cache_levels_bytes[DN_MAX_LEVELS];
+ /*
+ * Statistics about the dbuf hash table.
+ */
+ kstat_named_t hash_hits;
+ kstat_named_t hash_misses;
+ kstat_named_t hash_collisions;
+ kstat_named_t hash_elements;
+ kstat_named_t hash_elements_max;
+ /*
+ * Number of sublists containing more than one dbuf in the dbuf
+ * hash table. Keep track of the longest hash chain.
+ */
+ kstat_named_t hash_chains;
+ kstat_named_t hash_chain_max;
+ /*
+ * Number of times a dbuf_create() discovers that a dbuf was
+ * already created and in the dbuf hash table.
+ */
+ kstat_named_t hash_insert_race;
+ /*
+ * Statistics about the size of the metadata dbuf cache.
+ */
+ kstat_named_t metadata_cache_count;
+ kstat_named_t metadata_cache_size_bytes;
+ kstat_named_t metadata_cache_size_bytes_max;
+ /*
+ * For diagnostic purposes, this is incremented whenever we can't add
+ * something to the metadata cache because it's full, and instead put
+ * the data in the regular dbuf cache.
+ */
+ kstat_named_t metadata_cache_overflow;
+} dbuf_stats_t;
+
+dbuf_stats_t dbuf_stats = {
+ { "cache_count", KSTAT_DATA_UINT64 },
+ { "cache_size_bytes", KSTAT_DATA_UINT64 },
+ { "cache_size_bytes_max", KSTAT_DATA_UINT64 },
+ { "cache_target_bytes", KSTAT_DATA_UINT64 },
+ { "cache_lowater_bytes", KSTAT_DATA_UINT64 },
+ { "cache_hiwater_bytes", KSTAT_DATA_UINT64 },
+ { "cache_total_evicts", KSTAT_DATA_UINT64 },
+ { { "cache_levels_N", KSTAT_DATA_UINT64 } },
+ { { "cache_levels_bytes_N", KSTAT_DATA_UINT64 } },
+ { "hash_hits", KSTAT_DATA_UINT64 },
+ { "hash_misses", KSTAT_DATA_UINT64 },
+ { "hash_collisions", KSTAT_DATA_UINT64 },
+ { "hash_elements", KSTAT_DATA_UINT64 },
+ { "hash_elements_max", KSTAT_DATA_UINT64 },
+ { "hash_chains", KSTAT_DATA_UINT64 },
+ { "hash_chain_max", KSTAT_DATA_UINT64 },
+ { "hash_insert_race", KSTAT_DATA_UINT64 },
+ { "metadata_cache_count", KSTAT_DATA_UINT64 },
+ { "metadata_cache_size_bytes", KSTAT_DATA_UINT64 },
+ { "metadata_cache_size_bytes_max", KSTAT_DATA_UINT64 },
+ { "metadata_cache_overflow", KSTAT_DATA_UINT64 }
+};
+
+#define DBUF_STAT_INCR(stat, val) \
+ atomic_add_64(&dbuf_stats.stat.value.ui64, (val));
+#define DBUF_STAT_DECR(stat, val) \
+ DBUF_STAT_INCR(stat, -(val));
+#define DBUF_STAT_BUMP(stat) \
+ DBUF_STAT_INCR(stat, 1);
+#define DBUF_STAT_BUMPDOWN(stat) \
+ DBUF_STAT_INCR(stat, -1);
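+/*
+ * Lock-free maximum: keep retrying a compare-and-swap of v into the stat
+ * until either the current value is already at least v or the swap succeeds.
+ */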
+#define DBUF_STAT_MAX(stat, v) { \
+ uint64_t _m; \
+ while ((v) > (_m = dbuf_stats.stat.value.ui64) && \
+ (_m != atomic_cas_64(&dbuf_stats.stat.value.ui64, _m, (v))))\
+ continue; \
+}
+
+static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
+static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx);
+static void dbuf_sync_leaf_verify_bonus_dnode(dbuf_dirty_record_t *dr);
+static int dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, uint32_t flags);
+
+extern inline void dmu_buf_init_user(dmu_buf_user_t *dbu,
+ dmu_buf_evict_func_t *evict_func_sync,
+ dmu_buf_evict_func_t *evict_func_async,
+ dmu_buf_t **clear_on_evict_dbufp);
+
+/*
+ * Global data structures and functions for the dbuf cache.
+ */
+static kmem_cache_t *dbuf_kmem_cache;
+static taskq_t *dbu_evict_taskq;
+
+static kthread_t *dbuf_cache_evict_thread;
+static kmutex_t dbuf_evict_lock;
+static kcondvar_t dbuf_evict_cv;
+static boolean_t dbuf_evict_thread_exit;
+
+/*
+ * There are two dbuf caches; each dbuf can only be in one of them at a time.
+ *
+ * 1. Cache of metadata dbufs, to help make read-heavy administrative commands
+ * from /sbin/zfs run faster. The "metadata cache" specifically stores dbufs
+ * that represent the metadata that describes filesystems/snapshots/
+ * bookmarks/properties/etc. We only evict from this cache when we export a
+ * pool, to short-circuit as much I/O as possible for all administrative
+ * commands that need the metadata. There is no eviction policy for this
+ * cache, because we try to only include types in it which would occupy a
+ * very small amount of space per object but create a large impact on the
+ * performance of these commands. Instead, after it reaches a maximum size
+ * (which should only happen on very small memory systems with a very large
+ * number of filesystem objects), we stop taking new dbufs into the
+ * metadata cache, instead putting them in the normal dbuf cache.
+ *
+ * 2. LRU cache of dbufs. The dbuf cache maintains a list of dbufs that
+ * are not currently held but have been recently released. These dbufs
+ * are not eligible for arc eviction until they are aged out of the cache.
+ * Dbufs that are aged out of the cache will be immediately destroyed and
+ * become eligible for arc eviction.
+ *
+ * Dbufs are added to these caches once the last hold is released. If a dbuf is
+ * later accessed and still exists in the dbuf cache, then it will be removed
+ * from the cache and later re-added to the head of the cache.
+ *
+ * If a given dbuf meets the requirements for the metadata cache, it will go
+ * there, otherwise it will be considered for the generic LRU dbuf cache. The
+ * caches and the refcounts tracking their sizes are stored in an array indexed
+ * by those caches' matching enum values (from dbuf_cached_state_t).
+ */
+typedef struct dbuf_cache {
+ multilist_t *cache;
+ zfs_refcount_t size;
+} dbuf_cache_t;
+dbuf_cache_t dbuf_caches[DB_CACHE_MAX];
+
+/* Size limits for the caches */
+unsigned long dbuf_cache_max_bytes = ULONG_MAX;
+unsigned long dbuf_metadata_cache_max_bytes = ULONG_MAX;
+
+/* Set the default sizes of the caches to log2 fraction of arc size */
+int dbuf_cache_shift = 5;
+int dbuf_metadata_cache_shift = 6;
+
+static unsigned long dbuf_cache_target_bytes(void);
+static unsigned long dbuf_metadata_cache_target_bytes(void);
+
+/*
+ * The LRU dbuf cache uses a three-stage eviction policy:
+ * - A low water marker designates when the dbuf eviction thread
+ * should stop evicting from the dbuf cache.
+ * - When we reach the maximum size (aka mid water mark), we
+ * signal the eviction thread to run.
+ * - The high water mark indicates when the eviction thread
+ * is unable to keep up with the incoming load and eviction must
+ * happen in the context of the calling thread.
+ *
+ * The dbuf cache:
+ * (max size)
+ * low water mid water hi water
+ * +----------------------------------------+----------+----------+
+ * | | | |
+ * | | | |
+ * | | | |
+ * | | | |
+ * +----------------------------------------+----------+----------+
+ * stop signal evict
+ * evicting eviction directly
+ * thread
+ *
+ * The high and low water marks indicate the operating range for the eviction
+ * thread. The low water mark is, by default, 90% of the total size of the
+ * cache and the high water mark is at 110% (both of these percentages can be
+ * changed by setting dbuf_cache_lowater_pct and dbuf_cache_hiwater_pct,
+ * respectively). The eviction thread will try to ensure that the cache remains
+ * within this range by waking up every second and checking if the cache is
+ * above the low water mark. The thread can also be woken up by callers adding
+ * elements into the cache if the cache is larger than the mid water (i.e., max
+ * cache size). Once the eviction thread is woken up and eviction is required,
+ * it will continue evicting buffers until it's able to reduce the cache size
+ * to the low water mark. If the cache size continues to grow and hits the high
+ * water mark, then callers adding elements to the cache will begin to evict
+ * directly from the cache until the cache is no longer above the high water
+ * mark.
+ */
+
+/*
+ * The percentage above and below the maximum cache size.
+ */
+uint_t dbuf_cache_hiwater_pct = 10;
+uint_t dbuf_cache_lowater_pct = 10;
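+
+/*
+ * For example, with a (hypothetical) 100 MB cache target and the default
+ * 10% margins above, callers signal the evict thread once the cache grows
+ * past 100 MB, begin evicting directly above the 110 MB high water mark,
+ * and the evict thread keeps evicting until the cache drops back below the
+ * 90 MB low water mark.
+ */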
+
+/* ARGSUSED */
+static int
+dbuf_cons(void *vdb, void *unused, int kmflag)
+{
+ dmu_buf_impl_t *db = vdb;
+ bzero(db, sizeof (dmu_buf_impl_t));
+
+ mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL);
+ rw_init(&db->db_rwlock, NULL, RW_DEFAULT, NULL);
+ cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL);
+ multilist_link_init(&db->db_cache_link);
+ zfs_refcount_create(&db->db_holds);
+
+ return (0);
+}
+
+/* ARGSUSED */
+static void
+dbuf_dest(void *vdb, void *unused)
+{
+ dmu_buf_impl_t *db = vdb;
+ mutex_destroy(&db->db_mtx);
+ rw_destroy(&db->db_rwlock);
+ cv_destroy(&db->db_changed);
+ ASSERT(!multilist_link_active(&db->db_cache_link));
+ zfs_refcount_destroy(&db->db_holds);
+}
+
+/*
+ * dbuf hash table routines
+ */
+static dbuf_hash_table_t dbuf_hash_table;
+
+static uint64_t dbuf_hash_count;
+
+/*
+ * We use Cityhash for this. It's fast, and has good hash properties without
+ * requiring any large static buffers.
+ */
+static uint64_t
+dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid)
+{
+ return (cityhash4((uintptr_t)os, obj, (uint64_t)lvl, blkid));
+}
+
+#define DTRACE_SET_STATE(db, why) \
+ DTRACE_PROBE2(dbuf__state_change, dmu_buf_impl_t *, db, \
+ const char *, why)
+
+#define DBUF_EQUAL(dbuf, os, obj, level, blkid) \
+ ((dbuf)->db.db_object == (obj) && \
+ (dbuf)->db_objset == (os) && \
+ (dbuf)->db_level == (level) && \
+ (dbuf)->db_blkid == (blkid))
+
+dmu_buf_impl_t *
+dbuf_find(objset_t *os, uint64_t obj, uint8_t level, uint64_t blkid)
+{
+ dbuf_hash_table_t *h = &dbuf_hash_table;
+ uint64_t hv;
+ uint64_t idx;
+ dmu_buf_impl_t *db;
+
+ hv = dbuf_hash(os, obj, level, blkid);
+ idx = hv & h->hash_table_mask;
+
+ mutex_enter(DBUF_HASH_MUTEX(h, idx));
+ for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) {
+ if (DBUF_EQUAL(db, os, obj, level, blkid)) {
+ mutex_enter(&db->db_mtx);
+ if (db->db_state != DB_EVICTING) {
+ mutex_exit(DBUF_HASH_MUTEX(h, idx));
+ return (db);
+ }
+ mutex_exit(&db->db_mtx);
+ }
+ }
+ mutex_exit(DBUF_HASH_MUTEX(h, idx));
+ return (NULL);
+}
+
+static dmu_buf_impl_t *
+dbuf_find_bonus(objset_t *os, uint64_t object)
+{
+ dnode_t *dn;
+ dmu_buf_impl_t *db = NULL;
+
+ if (dnode_hold(os, object, FTAG, &dn) == 0) {
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ if (dn->dn_bonus != NULL) {
+ db = dn->dn_bonus;
+ mutex_enter(&db->db_mtx);
+ }
+ rw_exit(&dn->dn_struct_rwlock);
+ dnode_rele(dn, FTAG);
+ }
+ return (db);
+}
+
+/*
+ * Insert an entry into the hash table. If there is already an element
+ * equal to elem in the hash table, then the already existing element
+ * will be returned and the new element will not be inserted.
+ * Otherwise returns NULL.
+ */
+static dmu_buf_impl_t *
+dbuf_hash_insert(dmu_buf_impl_t *db)
+{
+ dbuf_hash_table_t *h = &dbuf_hash_table;
+ objset_t *os = db->db_objset;
+ uint64_t obj = db->db.db_object;
+ int level = db->db_level;
+ uint64_t blkid, hv, idx;
+ dmu_buf_impl_t *dbf;
+ uint32_t i;
+
+ blkid = db->db_blkid;
+ hv = dbuf_hash(os, obj, level, blkid);
+ idx = hv & h->hash_table_mask;
+
+ mutex_enter(DBUF_HASH_MUTEX(h, idx));
+ for (dbf = h->hash_table[idx], i = 0; dbf != NULL;
+ dbf = dbf->db_hash_next, i++) {
+ if (DBUF_EQUAL(dbf, os, obj, level, blkid)) {
+ mutex_enter(&dbf->db_mtx);
+ if (dbf->db_state != DB_EVICTING) {
+ mutex_exit(DBUF_HASH_MUTEX(h, idx));
+ return (dbf);
+ }
+ mutex_exit(&dbf->db_mtx);
+ }
+ }
+
+ if (i > 0) {
+ DBUF_STAT_BUMP(hash_collisions);
+ if (i == 1)
+ DBUF_STAT_BUMP(hash_chains);
+
+ DBUF_STAT_MAX(hash_chain_max, i);
+ }
+
+ mutex_enter(&db->db_mtx);
+ db->db_hash_next = h->hash_table[idx];
+ h->hash_table[idx] = db;
+ mutex_exit(DBUF_HASH_MUTEX(h, idx));
+ atomic_inc_64(&dbuf_hash_count);
+ DBUF_STAT_MAX(hash_elements_max, dbuf_hash_count);
+
+ return (NULL);
+}
+
+/*
+ * This returns whether this dbuf should be stored in the metadata cache, which
+ * is based on whether it's from one of the dnode types that store data related
+ * to traversing dataset hierarchies.
+ */
+static boolean_t
+dbuf_include_in_metadata_cache(dmu_buf_impl_t *db)
+{
+ DB_DNODE_ENTER(db);
+ dmu_object_type_t type = DB_DNODE(db)->dn_type;
+ DB_DNODE_EXIT(db);
+
+ /* Check if this dbuf is one of the types we care about */
+ if (DMU_OT_IS_METADATA_CACHED(type)) {
+ /* If we hit this, then we set something up wrong in dmu_ot */
+ ASSERT(DMU_OT_IS_METADATA(type));
+
+ /*
+ * Sanity check for small-memory systems: don't allocate too
+ * much memory for this purpose.
+ */
+ if (zfs_refcount_count(
+ &dbuf_caches[DB_DBUF_METADATA_CACHE].size) >
+ dbuf_metadata_cache_target_bytes()) {
+ DBUF_STAT_BUMP(metadata_cache_overflow);
+ return (B_FALSE);
+ }
+
+ return (B_TRUE);
+ }
+
+ return (B_FALSE);
+}
+
+/*
+ * Remove an entry from the hash table. It must be in the EVICTING state.
+ */
+static void
+dbuf_hash_remove(dmu_buf_impl_t *db)
+{
+ dbuf_hash_table_t *h = &dbuf_hash_table;
+ uint64_t hv, idx;
+ dmu_buf_impl_t *dbf, **dbp;
+
+ hv = dbuf_hash(db->db_objset, db->db.db_object,
+ db->db_level, db->db_blkid);
+ idx = hv & h->hash_table_mask;
+
+ /*
+ * We mustn't hold db_mtx to maintain lock ordering:
+ * DBUF_HASH_MUTEX > db_mtx.
+ */
+ ASSERT(zfs_refcount_is_zero(&db->db_holds));
+ ASSERT(db->db_state == DB_EVICTING);
+ ASSERT(!MUTEX_HELD(&db->db_mtx));
+
+ mutex_enter(DBUF_HASH_MUTEX(h, idx));
+ dbp = &h->hash_table[idx];
+ while ((dbf = *dbp) != db) {
+ dbp = &dbf->db_hash_next;
+ ASSERT(dbf != NULL);
+ }
+ *dbp = db->db_hash_next;
+ db->db_hash_next = NULL;
+ if (h->hash_table[idx] &&
+ h->hash_table[idx]->db_hash_next == NULL)
+ DBUF_STAT_BUMPDOWN(hash_chains);
+ mutex_exit(DBUF_HASH_MUTEX(h, idx));
+ atomic_dec_64(&dbuf_hash_count);
+}
+
+typedef enum {
+ DBVU_EVICTING,
+ DBVU_NOT_EVICTING
+} dbvu_verify_type_t;
+
+static void
+dbuf_verify_user(dmu_buf_impl_t *db, dbvu_verify_type_t verify_type)
+{
+#ifdef ZFS_DEBUG
+ int64_t holds;
+
+ if (db->db_user == NULL)
+ return;
+
+ /* Only data blocks support the attachment of user data. */
+ ASSERT(db->db_level == 0);
+
+ /* Clients must resolve a dbuf before attaching user data. */
+ ASSERT(db->db.db_data != NULL);
+ ASSERT3U(db->db_state, ==, DB_CACHED);
+
+ holds = zfs_refcount_count(&db->db_holds);
+ if (verify_type == DBVU_EVICTING) {
+ /*
+ * Immediate eviction occurs when holds == dirtycnt.
+ * For normal eviction buffers, holds is zero on
+ * eviction, except when dbuf_fix_old_data() calls
+ * dbuf_clear_data(). However, the hold count can grow
+ * during eviction even though db_mtx is held (see
+ * dmu_bonus_hold() for an example), so we can only
+ * test the generic invariant that holds >= dirtycnt.
+ */
+ ASSERT3U(holds, >=, db->db_dirtycnt);
+ } else {
+ if (db->db_user_immediate_evict == TRUE)
+ ASSERT3U(holds, >=, db->db_dirtycnt);
+ else
+ ASSERT3U(holds, >, 0);
+ }
+#endif
+}
+
+static void
+dbuf_evict_user(dmu_buf_impl_t *db)
+{
+ dmu_buf_user_t *dbu = db->db_user;
+
+ ASSERT(MUTEX_HELD(&db->db_mtx));
+
+ if (dbu == NULL)
+ return;
+
+ dbuf_verify_user(db, DBVU_EVICTING);
+ db->db_user = NULL;
+
+#ifdef ZFS_DEBUG
+ if (dbu->dbu_clear_on_evict_dbufp != NULL)
+ *dbu->dbu_clear_on_evict_dbufp = NULL;
+#endif
+
+ /*
+ * There are two eviction callbacks - one that we call synchronously
+ * and one that we invoke via a taskq. The async one is useful for
+ * avoiding lock order reversals and limiting stack depth.
+ *
+ * Note that if we have a sync callback but no async callback,
+ * it's likely that the sync callback will free the structure
+ * containing the dbu. In that case we need to take care to not
+ * dereference dbu after calling the sync evict func.
+ */
+ boolean_t has_async = (dbu->dbu_evict_func_async != NULL);
+
+ if (dbu->dbu_evict_func_sync != NULL)
+ dbu->dbu_evict_func_sync(dbu);
+
+ if (has_async) {
+ taskq_dispatch_ent(dbu_evict_taskq, dbu->dbu_evict_func_async,
+ dbu, 0, &dbu->dbu_tqent);
+ }
+}
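+
+/*
+ * A minimal sketch of how a consumer attaches user data with the sync/async
+ * eviction callbacks handled above (the struct, callback names, and the
+ * dmu_buf_set_user() attach call from dmu.h are shown for illustration):
+ *
+ *	typedef struct my_user {
+ *		dmu_buf_user_t mu_dbu;
+ *		void *mu_state;
+ *	} my_user_t;
+ *
+ *	dmu_buf_init_user(&mu->mu_dbu, my_evict_sync, my_evict_async, NULL);
+ *	(void) dmu_buf_set_user(&db->db, &mu->mu_dbu);
+ */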
+
+boolean_t
+dbuf_is_metadata(dmu_buf_impl_t *db)
+{
+ /*
+ * Consider indirect blocks and spill blocks to be meta data.
+ */
+ if (db->db_level > 0 || db->db_blkid == DMU_SPILL_BLKID) {
+ return (B_TRUE);
+ } else {
+ boolean_t is_metadata;
+
+ DB_DNODE_ENTER(db);
+ is_metadata = DMU_OT_IS_METADATA(DB_DNODE(db)->dn_type);
+ DB_DNODE_EXIT(db);
+
+ return (is_metadata);
+ }
+}
+
+
+/*
+ * This function *must* return indices evenly distributed between all
+ * sublists of the multilist. This is needed due to how the dbuf eviction
+ * code is laid out; dbuf_evict_thread() assumes dbufs are evenly
+ * distributed between all sublists and uses this assumption when
+ * deciding which sublist to evict from and how much to evict from it.
+ */
+static unsigned int
+dbuf_cache_multilist_index_func(multilist_t *ml, void *obj)
+{
+ dmu_buf_impl_t *db = obj;
+
+ /*
+	 * The assumption here is that the hash value for a given
+	 * dmu_buf_impl_t will remain constant throughout its lifetime
+	 * (i.e. its objset, object, level and blkid fields don't change).
+	 * Thus, we don't need to store the dbuf's sublist index
+	 * on insertion, as this index can be recalculated on removal.
+	 *
+	 * Also, the low order bits of the hash value are thought to be
+	 * distributed evenly. Otherwise, in the case that the multilist
+	 * has a power-of-two number of sublists, each sublist's usage
+	 * would not be evenly distributed.
+ */
+ return (dbuf_hash(db->db_objset, db->db.db_object,
+ db->db_level, db->db_blkid) %
+ multilist_get_num_sublists(ml));
+}
+
+/*
+ * The target size of the dbuf cache can grow with the ARC target,
+ * unless limited by the tunable dbuf_cache_max_bytes.
+ */
+static inline unsigned long
+dbuf_cache_target_bytes(void)
+{
+ return (MIN(dbuf_cache_max_bytes,
+ arc_target_bytes() >> dbuf_cache_shift));
+}
+
+/*
+ * The target size of the dbuf metadata cache can grow with the ARC target,
+ * unless limited by the tunable dbuf_metadata_cache_max_bytes.
+ */
+static inline unsigned long
+dbuf_metadata_cache_target_bytes(void)
+{
+ return (MIN(dbuf_metadata_cache_max_bytes,
+ arc_target_bytes() >> dbuf_metadata_cache_shift));
+}
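+
+/*
+ * For example, with an ARC target of 4 GiB (an illustrative value), the two
+ * target functions above yield a dbuf cache target of 4 GiB >> 5 = 128 MiB
+ * and a metadata cache target of 4 GiB >> 6 = 64 MiB, unless the
+ * corresponding *_max_bytes tunables impose a smaller cap.
+ */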
+
+static inline uint64_t
+dbuf_cache_hiwater_bytes(void)
+{
+ uint64_t dbuf_cache_target = dbuf_cache_target_bytes();
+ return (dbuf_cache_target +
+ (dbuf_cache_target * dbuf_cache_hiwater_pct) / 100);
+}
+
+static inline uint64_t
+dbuf_cache_lowater_bytes(void)
+{
+ uint64_t dbuf_cache_target = dbuf_cache_target_bytes();
+ return (dbuf_cache_target -
+ (dbuf_cache_target * dbuf_cache_lowater_pct) / 100);
+}
+
+static inline boolean_t
+dbuf_cache_above_lowater(void)
+{
+ return (zfs_refcount_count(&dbuf_caches[DB_DBUF_CACHE].size) >
+ dbuf_cache_lowater_bytes());
+}
+
+/*
+ * Evict the oldest eligible dbuf from the dbuf cache.
+ */
+static void
+dbuf_evict_one(void)
+{
+ int idx = multilist_get_random_index(dbuf_caches[DB_DBUF_CACHE].cache);
+ multilist_sublist_t *mls = multilist_sublist_lock(
+ dbuf_caches[DB_DBUF_CACHE].cache, idx);
+
+ ASSERT(!MUTEX_HELD(&dbuf_evict_lock));
+
+ dmu_buf_impl_t *db = multilist_sublist_tail(mls);
+ while (db != NULL && mutex_tryenter(&db->db_mtx) == 0) {
+ db = multilist_sublist_prev(mls, db);
+ }
+
+ DTRACE_PROBE2(dbuf__evict__one, dmu_buf_impl_t *, db,
+ multilist_sublist_t *, mls);
+
+ if (db != NULL) {
+ multilist_sublist_remove(mls, db);
+ multilist_sublist_unlock(mls);
+ (void) zfs_refcount_remove_many(
+ &dbuf_caches[DB_DBUF_CACHE].size, db->db.db_size, db);
+ DBUF_STAT_BUMPDOWN(cache_levels[db->db_level]);
+ DBUF_STAT_BUMPDOWN(cache_count);
+ DBUF_STAT_DECR(cache_levels_bytes[db->db_level],
+ db->db.db_size);
+ ASSERT3U(db->db_caching_status, ==, DB_DBUF_CACHE);
+ db->db_caching_status = DB_NO_CACHE;
+ dbuf_destroy(db);
+ DBUF_STAT_BUMP(cache_total_evicts);
+ } else {
+ multilist_sublist_unlock(mls);
+ }
+}
+
+/*
+ * The dbuf evict thread is responsible for aging out dbufs from the
+ * cache. Once the cache has reached its maximum size, dbufs are removed
+ * and destroyed. The eviction thread will continue running until the size
+ * of the dbuf cache is at or below the maximum size. Once a dbuf is aged
+ * out of the cache, it is destroyed and becomes eligible for arc eviction.
+ */
+/* ARGSUSED */
+static void
+dbuf_evict_thread(void *unused)
+{
+ callb_cpr_t cpr;
+
+ CALLB_CPR_INIT(&cpr, &dbuf_evict_lock, callb_generic_cpr, FTAG);
+
+ mutex_enter(&dbuf_evict_lock);
+ while (!dbuf_evict_thread_exit) {
+ while (!dbuf_cache_above_lowater() && !dbuf_evict_thread_exit) {
+ CALLB_CPR_SAFE_BEGIN(&cpr);
+ (void) cv_timedwait_idle_hires(&dbuf_evict_cv,
+ &dbuf_evict_lock, SEC2NSEC(1), MSEC2NSEC(1), 0);
+ CALLB_CPR_SAFE_END(&cpr, &dbuf_evict_lock);
+ }
+ mutex_exit(&dbuf_evict_lock);
+
+ /*
+ * Keep evicting as long as we're above the low water mark
+ * for the cache. We do this without holding the locks to
+ * minimize lock contention.
+ */
+ while (dbuf_cache_above_lowater() && !dbuf_evict_thread_exit) {
+ dbuf_evict_one();
+ }
+
+ mutex_enter(&dbuf_evict_lock);
+ }
+
+ dbuf_evict_thread_exit = B_FALSE;
+ cv_broadcast(&dbuf_evict_cv);
+ CALLB_CPR_EXIT(&cpr); /* drops dbuf_evict_lock */
+ thread_exit();
+}
+
+/*
+ * Wake up the dbuf eviction thread if the dbuf cache is at its max size.
+ * If the dbuf cache is at its high water mark, then evict a dbuf from the
+ * dbuf cache using the callers context.
+ */
+static void
+dbuf_evict_notify(uint64_t size)
+{
+ /*
+ * We check if we should evict without holding the dbuf_evict_lock,
+ * because it's OK to occasionally make the wrong decision here,
+ * and grabbing the lock results in massive lock contention.
+ */
+ if (size > dbuf_cache_target_bytes()) {
+ if (size > dbuf_cache_hiwater_bytes())
+ dbuf_evict_one();
+ cv_signal(&dbuf_evict_cv);
+ }
+}
+
+static int
+dbuf_kstat_update(kstat_t *ksp, int rw)
+{
+ dbuf_stats_t *ds = ksp->ks_data;
+
+ if (rw == KSTAT_WRITE) {
+ return (SET_ERROR(EACCES));
+ } else {
+ ds->metadata_cache_size_bytes.value.ui64 = zfs_refcount_count(
+ &dbuf_caches[DB_DBUF_METADATA_CACHE].size);
+ ds->cache_size_bytes.value.ui64 =
+ zfs_refcount_count(&dbuf_caches[DB_DBUF_CACHE].size);
+ ds->cache_target_bytes.value.ui64 = dbuf_cache_target_bytes();
+ ds->cache_hiwater_bytes.value.ui64 = dbuf_cache_hiwater_bytes();
+ ds->cache_lowater_bytes.value.ui64 = dbuf_cache_lowater_bytes();
+ ds->hash_elements.value.ui64 = dbuf_hash_count;
+ }
+
+ return (0);
+}
+
+void
+dbuf_init(void)
+{
+ uint64_t hsize = 1ULL << 16;
+ dbuf_hash_table_t *h = &dbuf_hash_table;
+ int i;
+
+ /*
+ * The hash table is big enough to fill all of physical memory
+ * with an average block size of zfs_arc_average_blocksize (default 8K).
+ * By default, the table will take up
+ * totalmem * sizeof(void*) / 8K (1MB per GB with 8-byte pointers).
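+	 * For example, a machine with 16 GiB of memory and the default 8K
+	 * average block size ends up with 2^21 hash buckets, i.e. 16 MiB of
+	 * pointers.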
+ */
+ while (hsize * zfs_arc_average_blocksize < physmem * PAGESIZE)
+ hsize <<= 1;
+
+retry:
+ h->hash_table_mask = hsize - 1;
+#if defined(_KERNEL)
+ /*
+ * Large allocations which do not require contiguous pages
+ * should be using vmem_alloc() in the linux kernel
+ */
+ h->hash_table = vmem_zalloc(hsize * sizeof (void *), KM_SLEEP);
+#else
+ h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP);
+#endif
+ if (h->hash_table == NULL) {
+ /* XXX - we should really return an error instead of assert */
+ ASSERT(hsize > (1ULL << 10));
+ hsize >>= 1;
+ goto retry;
+ }
+
+ dbuf_kmem_cache = kmem_cache_create("dmu_buf_impl_t",
+ sizeof (dmu_buf_impl_t),
+ 0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0);
+
+ for (i = 0; i < DBUF_MUTEXES; i++)
+ mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL);
+
+ dbuf_stats_init(h);
+
+ /*
+ * All entries are queued via taskq_dispatch_ent(), so min/maxalloc
+ * configuration is not required.
+ */
+ dbu_evict_taskq = taskq_create("dbu_evict", 1, defclsyspri, 0, 0, 0);
+
+ for (dbuf_cached_state_t dcs = 0; dcs < DB_CACHE_MAX; dcs++) {
+ dbuf_caches[dcs].cache =
+ multilist_create(sizeof (dmu_buf_impl_t),
+ offsetof(dmu_buf_impl_t, db_cache_link),
+ dbuf_cache_multilist_index_func);
+ zfs_refcount_create(&dbuf_caches[dcs].size);
+ }
+
+ dbuf_evict_thread_exit = B_FALSE;
+ mutex_init(&dbuf_evict_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&dbuf_evict_cv, NULL, CV_DEFAULT, NULL);
+ dbuf_cache_evict_thread = thread_create(NULL, 0, dbuf_evict_thread,
+ NULL, 0, &p0, TS_RUN, minclsyspri);
+
+ dbuf_ksp = kstat_create("zfs", 0, "dbufstats", "misc",
+ KSTAT_TYPE_NAMED, sizeof (dbuf_stats) / sizeof (kstat_named_t),
+ KSTAT_FLAG_VIRTUAL);
+ if (dbuf_ksp != NULL) {
+ for (i = 0; i < DN_MAX_LEVELS; i++) {
+ snprintf(dbuf_stats.cache_levels[i].name,
+ KSTAT_STRLEN, "cache_level_%d", i);
+ dbuf_stats.cache_levels[i].data_type =
+ KSTAT_DATA_UINT64;
+ snprintf(dbuf_stats.cache_levels_bytes[i].name,
+ KSTAT_STRLEN, "cache_level_%d_bytes", i);
+ dbuf_stats.cache_levels_bytes[i].data_type =
+ KSTAT_DATA_UINT64;
+ }
+ dbuf_ksp->ks_data = &dbuf_stats;
+ dbuf_ksp->ks_update = dbuf_kstat_update;
+ kstat_install(dbuf_ksp);
+ }
+}
+
+void
+dbuf_fini(void)
+{
+ dbuf_hash_table_t *h = &dbuf_hash_table;
+ int i;
+
+ dbuf_stats_destroy();
+
+ for (i = 0; i < DBUF_MUTEXES; i++)
+ mutex_destroy(&h->hash_mutexes[i]);
+#if defined(_KERNEL)
+ /*
+ * Large allocations which do not require contiguous pages
+ * should be using vmem_free() in the linux kernel
+ */
+ vmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *));
+#else
+ kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *));
+#endif
+ kmem_cache_destroy(dbuf_kmem_cache);
+ taskq_destroy(dbu_evict_taskq);
+
+ mutex_enter(&dbuf_evict_lock);
+ dbuf_evict_thread_exit = B_TRUE;
+ while (dbuf_evict_thread_exit) {
+ cv_signal(&dbuf_evict_cv);
+ cv_wait(&dbuf_evict_cv, &dbuf_evict_lock);
+ }
+ mutex_exit(&dbuf_evict_lock);
+
+ mutex_destroy(&dbuf_evict_lock);
+ cv_destroy(&dbuf_evict_cv);
+
+ for (dbuf_cached_state_t dcs = 0; dcs < DB_CACHE_MAX; dcs++) {
+ zfs_refcount_destroy(&dbuf_caches[dcs].size);
+ multilist_destroy(dbuf_caches[dcs].cache);
+ }
+
+ if (dbuf_ksp != NULL) {
+ kstat_delete(dbuf_ksp);
+ dbuf_ksp = NULL;
+ }
+}
+
+/*
+ * Other stuff.
+ */
+
+#ifdef ZFS_DEBUG
+static void
+dbuf_verify(dmu_buf_impl_t *db)
+{
+ dnode_t *dn;
+ dbuf_dirty_record_t *dr;
+ uint32_t txg_prev;
+
+ ASSERT(MUTEX_HELD(&db->db_mtx));
+
+ if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY))
+ return;
+
+ ASSERT(db->db_objset != NULL);
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ if (dn == NULL) {
+ ASSERT(db->db_parent == NULL);
+ ASSERT(db->db_blkptr == NULL);
+ } else {
+ ASSERT3U(db->db.db_object, ==, dn->dn_object);
+ ASSERT3P(db->db_objset, ==, dn->dn_objset);
+ ASSERT3U(db->db_level, <, dn->dn_nlevels);
+ ASSERT(db->db_blkid == DMU_BONUS_BLKID ||
+ db->db_blkid == DMU_SPILL_BLKID ||
+ !avl_is_empty(&dn->dn_dbufs));
+ }
+ if (db->db_blkid == DMU_BONUS_BLKID) {
+ ASSERT(dn != NULL);
+ ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
+ ASSERT3U(db->db.db_offset, ==, DMU_BONUS_BLKID);
+ } else if (db->db_blkid == DMU_SPILL_BLKID) {
+ ASSERT(dn != NULL);
+ ASSERT0(db->db.db_offset);
+ } else {
+ ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size);
+ }
+
+ if ((dr = list_head(&db->db_dirty_records)) != NULL) {
+ ASSERT(dr->dr_dbuf == db);
+ txg_prev = dr->dr_txg;
+ for (dr = list_next(&db->db_dirty_records, dr); dr != NULL;
+ dr = list_next(&db->db_dirty_records, dr)) {
+ ASSERT(dr->dr_dbuf == db);
+ ASSERT(txg_prev > dr->dr_txg);
+ txg_prev = dr->dr_txg;
+ }
+ }
+
+ /*
+ * We can't assert that db_size matches dn_datablksz because it
+ * can be momentarily different when another thread is doing
+ * dnode_set_blksz().
+ */
+ if (db->db_level == 0 && db->db.db_object == DMU_META_DNODE_OBJECT) {
+ dr = db->db_data_pending;
+ /*
+ * It should only be modified in syncing context, so
+ * make sure we only have one copy of the data.
+ */
+ ASSERT(dr == NULL || dr->dt.dl.dr_data == db->db_buf);
+ }
+
+ /* verify db->db_blkptr */
+ if (db->db_blkptr) {
+ if (db->db_parent == dn->dn_dbuf) {
+ /* db is pointed to by the dnode */
+ /* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */
+ if (DMU_OBJECT_IS_SPECIAL(db->db.db_object))
+ ASSERT(db->db_parent == NULL);
+ else
+ ASSERT(db->db_parent != NULL);
+ if (db->db_blkid != DMU_SPILL_BLKID)
+ ASSERT3P(db->db_blkptr, ==,
+ &dn->dn_phys->dn_blkptr[db->db_blkid]);
+ } else {
+ /* db is pointed to by an indirect block */
+ int epb __maybe_unused = db->db_parent->db.db_size >>
+ SPA_BLKPTRSHIFT;
+ ASSERT3U(db->db_parent->db_level, ==, db->db_level+1);
+ ASSERT3U(db->db_parent->db.db_object, ==,
+ db->db.db_object);
+ /*
+ * dnode_grow_indblksz() can make this fail if we don't
+ * have the parent's rwlock. XXX indblksz no longer
+ * grows. safe to do this now?
+ */
+ if (RW_LOCK_HELD(&db->db_parent->db_rwlock)) {
+ ASSERT3P(db->db_blkptr, ==,
+ ((blkptr_t *)db->db_parent->db.db_data +
+ db->db_blkid % epb));
+ }
+ }
+ }
+ if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) &&
+ (db->db_buf == NULL || db->db_buf->b_data) &&
+ db->db.db_data && db->db_blkid != DMU_BONUS_BLKID &&
+ db->db_state != DB_FILL && !dn->dn_free_txg) {
+ /*
+		 * If the blkptr isn't set but the dbuf has nonzero data,
+		 * it had better be dirty; otherwise we'll lose that
+ * data when we evict this buffer.
+ *
+ * There is an exception to this rule for indirect blocks; in
+ * this case, if the indirect block is a hole, we fill in a few
+ * fields on each of the child blocks (importantly, birth time)
+ * to prevent hole birth times from being lost when you
+ * partially fill in a hole.
+ */
+ if (db->db_dirtycnt == 0) {
+ if (db->db_level == 0) {
+ uint64_t *buf = db->db.db_data;
+ int i;
+
+ for (i = 0; i < db->db.db_size >> 3; i++) {
+ ASSERT(buf[i] == 0);
+ }
+ } else {
+ blkptr_t *bps = db->db.db_data;
+ ASSERT3U(1 << DB_DNODE(db)->dn_indblkshift, ==,
+ db->db.db_size);
+ /*
+ * We want to verify that all the blkptrs in the
+ * indirect block are holes, but we may have
+ * automatically set up a few fields for them.
+ * We iterate through each blkptr and verify
+ * they only have those fields set.
+ */
+ for (int i = 0;
+ i < db->db.db_size / sizeof (blkptr_t);
+ i++) {
+ blkptr_t *bp = &bps[i];
+ ASSERT(ZIO_CHECKSUM_IS_ZERO(
+ &bp->blk_cksum));
+ ASSERT(
+ DVA_IS_EMPTY(&bp->blk_dva[0]) &&
+ DVA_IS_EMPTY(&bp->blk_dva[1]) &&
+ DVA_IS_EMPTY(&bp->blk_dva[2]));
+ ASSERT0(bp->blk_fill);
+ ASSERT0(bp->blk_pad[0]);
+ ASSERT0(bp->blk_pad[1]);
+ ASSERT(!BP_IS_EMBEDDED(bp));
+ ASSERT(BP_IS_HOLE(bp));
+ ASSERT0(bp->blk_phys_birth);
+ }
+ }
+ }
+ }
+ DB_DNODE_EXIT(db);
+}
+#endif
+
+static void
+dbuf_clear_data(dmu_buf_impl_t *db)
+{
+ ASSERT(MUTEX_HELD(&db->db_mtx));
+ dbuf_evict_user(db);
+ ASSERT3P(db->db_buf, ==, NULL);
+ db->db.db_data = NULL;
+ if (db->db_state != DB_NOFILL) {
+ db->db_state = DB_UNCACHED;
+ DTRACE_SET_STATE(db, "clear data");
+ }
+}
+
+static void
+dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf)
+{
+ ASSERT(MUTEX_HELD(&db->db_mtx));
+ ASSERT(buf != NULL);
+
+ db->db_buf = buf;
+ ASSERT(buf->b_data != NULL);
+ db->db.db_data = buf->b_data;
+}
+
+static arc_buf_t *
+dbuf_alloc_arcbuf_from_arcbuf(dmu_buf_impl_t *db, arc_buf_t *data)
+{
+ objset_t *os = db->db_objset;
+ spa_t *spa = os->os_spa;
+ arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
+ enum zio_compress compress_type;
+ uint8_t complevel;
+ int psize, lsize;
+
+ psize = arc_buf_size(data);
+ lsize = arc_buf_lsize(data);
+ compress_type = arc_get_compression(data);
+ complevel = arc_get_complevel(data);
+
+ if (arc_is_encrypted(data)) {
+ boolean_t byteorder;
+ uint8_t salt[ZIO_DATA_SALT_LEN];
+ uint8_t iv[ZIO_DATA_IV_LEN];
+ uint8_t mac[ZIO_DATA_MAC_LEN];
+ dnode_t *dn = DB_DNODE(db);
+
+ arc_get_raw_params(data, &byteorder, salt, iv, mac);
+ data = arc_alloc_raw_buf(spa, db, dmu_objset_id(os),
+ byteorder, salt, iv, mac, dn->dn_type, psize, lsize,
+ compress_type, complevel);
+ } else if (compress_type != ZIO_COMPRESS_OFF) {
+ ASSERT3U(type, ==, ARC_BUFC_DATA);
+ data = arc_alloc_compressed_buf(spa, db,
+ psize, lsize, compress_type, complevel);
+ } else {
+ data = arc_alloc_buf(spa, db, type, psize);
+ }
+ return (data);
+}
+
+static arc_buf_t *
+dbuf_alloc_arcbuf(dmu_buf_impl_t *db)
+{
+ spa_t *spa = db->db_objset->os_spa;
+
+ return (arc_alloc_buf(spa, db, DBUF_GET_BUFC_TYPE(db), db->db.db_size));
+}
+
+/*
+ * Loan out an arc_buf for read. Return the loaned arc_buf.
+ */
+arc_buf_t *
+dbuf_loan_arcbuf(dmu_buf_impl_t *db)
+{
+ arc_buf_t *abuf;
+
+ ASSERT(db->db_blkid != DMU_BONUS_BLKID);
+ mutex_enter(&db->db_mtx);
+ if (arc_released(db->db_buf) || zfs_refcount_count(&db->db_holds) > 1) {
+ int blksz = db->db.db_size;
+ spa_t *spa = db->db_objset->os_spa;
+
+ mutex_exit(&db->db_mtx);
+ abuf = arc_loan_buf(spa, B_FALSE, blksz);
+ bcopy(db->db.db_data, abuf->b_data, blksz);
+ } else {
+ abuf = db->db_buf;
+ arc_loan_inuse_buf(abuf, db);
+ db->db_buf = NULL;
+ dbuf_clear_data(db);
+ mutex_exit(&db->db_mtx);
+ }
+ return (abuf);
+}
+
+/*
+ * Calculate which level n block references the data at the level 0 offset
+ * provided.
+ */
+uint64_t
+dbuf_whichblock(const dnode_t *dn, const int64_t level, const uint64_t offset)
+{
+ if (dn->dn_datablkshift != 0 && dn->dn_indblkshift != 0) {
+ /*
+ * The level n blkid is equal to the level 0 blkid divided by
+ * the number of level 0s in a level n block.
+ *
+ * The level 0 blkid is offset >> datablkshift =
+ * offset / 2^datablkshift.
+ *
+ * The number of level 0s in a level n is the number of block
+ * pointers in an indirect block, raised to the power of level.
+ * This is 2^(indblkshift - SPA_BLKPTRSHIFT)^level =
+ * 2^(level*(indblkshift - SPA_BLKPTRSHIFT)).
+ *
+ * Thus, the level n blkid is: offset /
+ * ((2^datablkshift)*(2^(level*(indblkshift-SPA_BLKPTRSHIFT))))
+ * = offset / 2^(datablkshift + level *
+ * (indblkshift - SPA_BLKPTRSHIFT))
+ * = offset >> (datablkshift + level *
+ * (indblkshift - SPA_BLKPTRSHIFT))
+ */
+
+ const unsigned exp = dn->dn_datablkshift +
+ level * (dn->dn_indblkshift - SPA_BLKPTRSHIFT);
+
+ if (exp >= 8 * sizeof (offset)) {
+ /* This only happens on the highest indirection level */
+ ASSERT3U(level, ==, dn->dn_nlevels - 1);
+ return (0);
+ }
+
+ ASSERT3U(exp, <, 8 * sizeof (offset));
+
+ return (offset >> exp);
+ } else {
+ ASSERT3U(offset, <, dn->dn_datablksz);
+ return (0);
+ }
+}
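+
+/*
+ * As a worked example of the math above: with 128K data blocks
+ * (datablkshift = 17) and 128K indirect blocks (indblkshift = 17), each
+ * indirect block holds 2^(17 - 7) = 1024 block pointers, so the level 1
+ * blkid for an offset is offset >> (17 + 10), i.e. each level 1 block
+ * covers 128 MiB of file data.
+ */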
+
+/*
+ * This function is used to lock the parent of the provided dbuf. This should be
+ * used when modifying or reading db_blkptr.
+ */
+db_lock_type_t
+dmu_buf_lock_parent(dmu_buf_impl_t *db, krw_t rw, void *tag)
+{
+ enum db_lock_type ret = DLT_NONE;
+ if (db->db_parent != NULL) {
+ rw_enter(&db->db_parent->db_rwlock, rw);
+ ret = DLT_PARENT;
+ } else if (dmu_objset_ds(db->db_objset) != NULL) {
+ rrw_enter(&dmu_objset_ds(db->db_objset)->ds_bp_rwlock, rw,
+ tag);
+ ret = DLT_OBJSET;
+ }
+ /*
+ * We only return a DLT_NONE lock when it's the top-most indirect block
+ * of the meta-dnode of the MOS.
+ */
+ return (ret);
+}
+
+/*
+ * We need to pass the lock type in because it's possible that the block will
+ * move from being the topmost indirect block in a dnode (and thus, have no
+ * parent) to no longer being the topmost via an indirection increase. This
+ * would cause a panic if we didn't pass the lock type in.
+ */
+void
+dmu_buf_unlock_parent(dmu_buf_impl_t *db, db_lock_type_t type, void *tag)
+{
+ if (type == DLT_PARENT)
+ rw_exit(&db->db_parent->db_rwlock);
+ else if (type == DLT_OBJSET)
+ rrw_exit(&dmu_objset_ds(db->db_objset)->ds_bp_rwlock, tag);
+}
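+
+/*
+ * A minimal sketch of the locking pattern above (mirroring how dbuf_read()
+ * uses it later in this file):
+ *
+ *	db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER, FTAG);
+ *	... read or update db->db_blkptr ...
+ *	dmu_buf_unlock_parent(db, dblt, FTAG);
+ */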
+
+static void
+dbuf_read_done(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
+ arc_buf_t *buf, void *vdb)
+{
+ dmu_buf_impl_t *db = vdb;
+
+ mutex_enter(&db->db_mtx);
+ ASSERT3U(db->db_state, ==, DB_READ);
+ /*
+ * All reads are synchronous, so we must have a hold on the dbuf
+ */
+ ASSERT(zfs_refcount_count(&db->db_holds) > 0);
+ ASSERT(db->db_buf == NULL);
+ ASSERT(db->db.db_data == NULL);
+ if (buf == NULL) {
+ /* i/o error */
+ ASSERT(zio == NULL || zio->io_error != 0);
+ ASSERT(db->db_blkid != DMU_BONUS_BLKID);
+ ASSERT3P(db->db_buf, ==, NULL);
+ db->db_state = DB_UNCACHED;
+ DTRACE_SET_STATE(db, "i/o error");
+ } else if (db->db_level == 0 && db->db_freed_in_flight) {
+ /* freed in flight */
+ ASSERT(zio == NULL || zio->io_error == 0);
+ arc_release(buf, db);
+ bzero(buf->b_data, db->db.db_size);
+ arc_buf_freeze(buf);
+ db->db_freed_in_flight = FALSE;
+ dbuf_set_data(db, buf);
+ db->db_state = DB_CACHED;
+ DTRACE_SET_STATE(db, "freed in flight");
+ } else {
+ /* success */
+ ASSERT(zio == NULL || zio->io_error == 0);
+ dbuf_set_data(db, buf);
+ db->db_state = DB_CACHED;
+ DTRACE_SET_STATE(db, "successful read");
+ }
+ cv_broadcast(&db->db_changed);
+ dbuf_rele_and_unlock(db, NULL, B_FALSE);
+}
+
+/*
+ * Shortcut for performing reads on bonus dbufs. Returns
+ * an error if we fail to verify the dnode associated with
+ * a decrypted block. Otherwise success.
+ */
+static int
+dbuf_read_bonus(dmu_buf_impl_t *db, dnode_t *dn, uint32_t flags)
+{
+ int bonuslen, max_bonuslen, err;
+
+ err = dbuf_read_verify_dnode_crypt(db, flags);
+ if (err)
+ return (err);
+
+ bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen);
+ max_bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots);
+ ASSERT(MUTEX_HELD(&db->db_mtx));
+ ASSERT(DB_DNODE_HELD(db));
+ ASSERT3U(bonuslen, <=, db->db.db_size);
+ db->db.db_data = kmem_alloc(max_bonuslen, KM_SLEEP);
+ arc_space_consume(max_bonuslen, ARC_SPACE_BONUS);
+ if (bonuslen < max_bonuslen)
+ bzero(db->db.db_data, max_bonuslen);
+ if (bonuslen)
+ bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen);
+ db->db_state = DB_CACHED;
+ DTRACE_SET_STATE(db, "bonus buffer filled");
+ return (0);
+}
+
+static void
+dbuf_handle_indirect_hole(dmu_buf_impl_t *db, dnode_t *dn)
+{
+ blkptr_t *bps = db->db.db_data;
+ uint32_t indbs = 1ULL << dn->dn_indblkshift;
+ int n_bps = indbs >> SPA_BLKPTRSHIFT;
+
+ for (int i = 0; i < n_bps; i++) {
+ blkptr_t *bp = &bps[i];
+
+ ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==, indbs);
+ BP_SET_LSIZE(bp, BP_GET_LEVEL(db->db_blkptr) == 1 ?
+ dn->dn_datablksz : BP_GET_LSIZE(db->db_blkptr));
+ BP_SET_TYPE(bp, BP_GET_TYPE(db->db_blkptr));
+ BP_SET_LEVEL(bp, BP_GET_LEVEL(db->db_blkptr) - 1);
+ BP_SET_BIRTH(bp, db->db_blkptr->blk_birth, 0);
+ }
+}
+
+/*
+ * Handle reads on dbufs that are holes, if necessary. This function
+ * requires that the dbuf's mutex is held. Returns success (0) if action
+ * was taken, ENOENT if no action was taken.
+ */
+static int
+dbuf_read_hole(dmu_buf_impl_t *db, dnode_t *dn, uint32_t flags)
+{
+ ASSERT(MUTEX_HELD(&db->db_mtx));
+
+ int is_hole = db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr);
+ /*
+ * For level 0 blocks only, if the above check fails:
+ * Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync()
+ * processes the delete record and clears the bp while we are waiting
+ * for the dn_mtx (resulting in a "no" from block_freed).
+ */
+ if (!is_hole && db->db_level == 0) {
+ is_hole = dnode_block_freed(dn, db->db_blkid) ||
+ BP_IS_HOLE(db->db_blkptr);
+ }
+
+ if (is_hole) {
+ dbuf_set_data(db, dbuf_alloc_arcbuf(db));
+ bzero(db->db.db_data, db->db.db_size);
+
+ if (db->db_blkptr != NULL && db->db_level > 0 &&
+ BP_IS_HOLE(db->db_blkptr) &&
+ db->db_blkptr->blk_birth != 0) {
+ dbuf_handle_indirect_hole(db, dn);
+ }
+ db->db_state = DB_CACHED;
+ DTRACE_SET_STATE(db, "hole read satisfied");
+ return (0);
+ }
+ return (ENOENT);
+}
+
+/*
+ * This function ensures that, when doing a decrypting read of a block,
+ * we make sure we have decrypted the dnode associated with it. We must do
+ * this so that we ensure we are fully authenticating the checksum-of-MACs
+ * tree from the root of the objset down to this block. Indirect blocks are
+ * always verified against their secure checksum-of-MACs assuming that the
+ * dnode containing them is correct. Now that we are doing a decrypting read,
+ * we can be sure that the key is loaded and verify that assumption. This is
+ * especially important considering that we always read encrypted dnode
+ * blocks as raw data (without verifying their MACs) to start, and
+ * decrypt / authenticate them when we need to read an encrypted bonus buffer.
+ */
+static int
+dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, uint32_t flags)
+{
+ int err = 0;
+ objset_t *os = db->db_objset;
+ arc_buf_t *dnode_abuf;
+ dnode_t *dn;
+ zbookmark_phys_t zb;
+
+ ASSERT(MUTEX_HELD(&db->db_mtx));
+
+ if (!os->os_encrypted || os->os_raw_receive ||
+ (flags & DB_RF_NO_DECRYPT) != 0)
+ return (0);
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ dnode_abuf = (dn->dn_dbuf != NULL) ? dn->dn_dbuf->db_buf : NULL;
+
+ if (dnode_abuf == NULL || !arc_is_encrypted(dnode_abuf)) {
+ DB_DNODE_EXIT(db);
+ return (0);
+ }
+
+ SET_BOOKMARK(&zb, dmu_objset_id(os),
+ DMU_META_DNODE_OBJECT, 0, dn->dn_dbuf->db_blkid);
+ err = arc_untransform(dnode_abuf, os->os_spa, &zb, B_TRUE);
+
+ /*
+ * An error code of EACCES tells us that the key is still not
+ * available. This is ok if we are only reading authenticated
+ * (and therefore non-encrypted) blocks.
+ */
+ if (err == EACCES && ((db->db_blkid != DMU_BONUS_BLKID &&
+ !DMU_OT_IS_ENCRYPTED(dn->dn_type)) ||
+ (db->db_blkid == DMU_BONUS_BLKID &&
+ !DMU_OT_IS_ENCRYPTED(dn->dn_bonustype))))
+ err = 0;
+
+ DB_DNODE_EXIT(db);
+
+ return (err);
+}
+
+/*
+ * Drops db_mtx and the parent lock specified by dblt and tag before
+ * returning.
+ */
+static int
+dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags,
+ db_lock_type_t dblt, void *tag)
+{
+ dnode_t *dn;
+ zbookmark_phys_t zb;
+ uint32_t aflags = ARC_FLAG_NOWAIT;
+ int err, zio_flags;
+ boolean_t bonus_read;
+
+ err = zio_flags = 0;
+ bonus_read = B_FALSE;
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ ASSERT(!zfs_refcount_is_zero(&db->db_holds));
+ ASSERT(MUTEX_HELD(&db->db_mtx));
+ ASSERT(db->db_state == DB_UNCACHED);
+ ASSERT(db->db_buf == NULL);
+ ASSERT(db->db_parent == NULL ||
+ RW_LOCK_HELD(&db->db_parent->db_rwlock));
+
+ if (db->db_blkid == DMU_BONUS_BLKID) {
+ err = dbuf_read_bonus(db, dn, flags);
+ goto early_unlock;
+ }
+
+ err = dbuf_read_hole(db, dn, flags);
+ if (err == 0)
+ goto early_unlock;
+
+ /*
+ * Any attempt to read a redacted block should result in an error. This
+ * will never happen under normal conditions, but can be useful for
+ * debugging purposes.
+ */
+ if (BP_IS_REDACTED(db->db_blkptr)) {
+ ASSERT(dsl_dataset_feature_is_active(
+ db->db_objset->os_dsl_dataset,
+ SPA_FEATURE_REDACTED_DATASETS));
+ err = SET_ERROR(EIO);
+ goto early_unlock;
+ }
+
+ SET_BOOKMARK(&zb, dmu_objset_id(db->db_objset),
+ db->db.db_object, db->db_level, db->db_blkid);
+
+ /*
+ * All bps of an encrypted os should have the encryption bit set.
+ * If this is not true it indicates tampering and we report an error.
+ */
+ if (db->db_objset->os_encrypted && !BP_USES_CRYPT(db->db_blkptr)) {
+ spa_log_error(db->db_objset->os_spa, &zb);
+ zfs_panic_recover("unencrypted block in encrypted "
+ "object set %llu", dmu_objset_id(db->db_objset));
+ err = SET_ERROR(EIO);
+ goto early_unlock;
+ }
+
+ err = dbuf_read_verify_dnode_crypt(db, flags);
+ if (err != 0)
+ goto early_unlock;
+
+ DB_DNODE_EXIT(db);
+
+ db->db_state = DB_READ;
+ DTRACE_SET_STATE(db, "read issued");
+ mutex_exit(&db->db_mtx);
+
+ if (DBUF_IS_L2CACHEABLE(db))
+ aflags |= ARC_FLAG_L2CACHE;
+
+ dbuf_add_ref(db, NULL);
+
+ zio_flags = (flags & DB_RF_CANFAIL) ?
+ ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED;
+
+ if ((flags & DB_RF_NO_DECRYPT) && BP_IS_PROTECTED(db->db_blkptr))
+ zio_flags |= ZIO_FLAG_RAW;
+ /*
+	 * The zio layer will copy the provided blkptr later, but we need to
+	 * copy it now so that we can release the parent's rwlock before
+	 * issuing the read. Otherwise, if dbuf_read_done were called
+	 * synchronously (on an l1 cache hit), we could acquire the db_mtx
+	 * while holding the parent's rwlock, which would be a lock ordering
+	 * violation.
+ */
+ blkptr_t bp = *db->db_blkptr;
+ dmu_buf_unlock_parent(db, dblt, tag);
+ (void) arc_read(zio, db->db_objset->os_spa, &bp,
+ dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ, zio_flags,
+ &aflags, &zb);
+ return (err);
+early_unlock:
+ DB_DNODE_EXIT(db);
+ mutex_exit(&db->db_mtx);
+ dmu_buf_unlock_parent(db, dblt, tag);
+ return (err);
+}
+
+/*
+ * This is our just-in-time copy function. It makes a copy of buffers that
+ * have been modified in a previous transaction group before we access them in
+ * the current active group.
+ *
+ * This function is used in three places: when we are dirtying a buffer for the
+ * first time in a txg, when we are freeing a range in a dnode that includes
+ * this buffer, and when we are accessing a buffer which was received compressed
+ * and later referenced in a WRITE_BYREF record.
+ *
+ * Note that when we are called from dbuf_free_range() we do not put a hold on
+ * the buffer, we just traverse the active dbuf list for the dnode.
+ */
+static void
+dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
+{
+ dbuf_dirty_record_t *dr = list_head(&db->db_dirty_records);
+
+ ASSERT(MUTEX_HELD(&db->db_mtx));
+ ASSERT(db->db.db_data != NULL);
+ ASSERT(db->db_level == 0);
+ ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT);
+
+ if (dr == NULL ||
+ (dr->dt.dl.dr_data !=
+ ((db->db_blkid == DMU_BONUS_BLKID) ? db->db.db_data : db->db_buf)))
+ return;
+
+ /*
+ * If the last dirty record for this dbuf has not yet synced
+ * and it's referencing the dbuf data, either:
+ * reset the reference to point to a new copy,
+ * or (if there are no active holders)
+ * just null out the current db_data pointer.
+ */
+ ASSERT3U(dr->dr_txg, >=, txg - 2);
+ if (db->db_blkid == DMU_BONUS_BLKID) {
+ dnode_t *dn = DB_DNODE(db);
+ int bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots);
+ dr->dt.dl.dr_data = kmem_alloc(bonuslen, KM_SLEEP);
+ arc_space_consume(bonuslen, ARC_SPACE_BONUS);
+ bcopy(db->db.db_data, dr->dt.dl.dr_data, bonuslen);
+ } else if (zfs_refcount_count(&db->db_holds) > db->db_dirtycnt) {
+ arc_buf_t *buf = dbuf_alloc_arcbuf_from_arcbuf(db, db->db_buf);
+ dr->dt.dl.dr_data = buf;
+ bcopy(db->db.db_data, buf->b_data, arc_buf_size(buf));
+ } else {
+ db->db_buf = NULL;
+ dbuf_clear_data(db);
+ }
+}
+
+int
+dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
+{
+ int err = 0;
+ boolean_t prefetch;
+ dnode_t *dn;
+
+ /*
+ * We don't have to hold the mutex to check db_state because it
+ * can't be freed while we have a hold on the buffer.
+ */
+ ASSERT(!zfs_refcount_is_zero(&db->db_holds));
+
+ if (db->db_state == DB_NOFILL)
+ return (SET_ERROR(EIO));
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+
+ prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
+ (flags & DB_RF_NOPREFETCH) == 0 && dn != NULL &&
+ DBUF_IS_CACHEABLE(db);
+
+ mutex_enter(&db->db_mtx);
+ if (db->db_state == DB_CACHED) {
+ spa_t *spa = dn->dn_objset->os_spa;
+
+ /*
+ * Ensure that this block's dnode has been decrypted if
+ * the caller has requested decrypted data.
+ */
+ err = dbuf_read_verify_dnode_crypt(db, flags);
+
+ /*
+ * If the arc buf is compressed or encrypted and the caller
+ * requested uncompressed data, we need to untransform it
+ * before returning. We also call arc_untransform() on any
+ * unauthenticated blocks, which will verify their MAC if
+ * the key is now available.
+ */
+ if (err == 0 && db->db_buf != NULL &&
+ (flags & DB_RF_NO_DECRYPT) == 0 &&
+ (arc_is_encrypted(db->db_buf) ||
+ arc_is_unauthenticated(db->db_buf) ||
+ arc_get_compression(db->db_buf) != ZIO_COMPRESS_OFF)) {
+ zbookmark_phys_t zb;
+
+ SET_BOOKMARK(&zb, dmu_objset_id(db->db_objset),
+ db->db.db_object, db->db_level, db->db_blkid);
+ dbuf_fix_old_data(db, spa_syncing_txg(spa));
+ err = arc_untransform(db->db_buf, spa, &zb, B_FALSE);
+ dbuf_set_data(db, db->db_buf);
+ }
+ mutex_exit(&db->db_mtx);
+ if (err == 0 && prefetch) {
+ dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE,
+ flags & DB_RF_HAVESTRUCT);
+ }
+ DB_DNODE_EXIT(db);
+ DBUF_STAT_BUMP(hash_hits);
+ } else if (db->db_state == DB_UNCACHED) {
+ spa_t *spa = dn->dn_objset->os_spa;
+ boolean_t need_wait = B_FALSE;
+
+ db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER, FTAG);
+
+ if (zio == NULL &&
+ db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)) {
+ zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
+ need_wait = B_TRUE;
+ }
+ err = dbuf_read_impl(db, zio, flags, dblt, FTAG);
+ /*
+ * dbuf_read_impl has dropped db_mtx and our parent's rwlock
+ * for us
+ */
+ if (!err && prefetch) {
+ dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE,
+ flags & DB_RF_HAVESTRUCT);
+ }
+
+ DB_DNODE_EXIT(db);
+ DBUF_STAT_BUMP(hash_misses);
+
+ /*
+ * If we created a zio_root we must execute it to avoid
+ * leaking it, even if it isn't attached to any work due
+ * to an error in dbuf_read_impl().
+ */
+ if (need_wait) {
+ if (err == 0)
+ err = zio_wait(zio);
+ else
+ VERIFY0(zio_wait(zio));
+ }
+ } else {
+ /*
+ * Another reader came in while the dbuf was in flight
+ * between UNCACHED and CACHED. Either a writer will finish
+ * writing the buffer (sending the dbuf to CACHED) or the
+ * first reader's request will reach the read_done callback
+ * and send the dbuf to CACHED. Otherwise, a failure
+ * occurred and the dbuf went to UNCACHED.
+ */
+ mutex_exit(&db->db_mtx);
+ if (prefetch) {
+ dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE,
+ flags & DB_RF_HAVESTRUCT);
+ }
+ DB_DNODE_EXIT(db);
+ DBUF_STAT_BUMP(hash_misses);
+
+ /* Wait for the I/O unless the caller passed DB_RF_NEVERWAIT. */
+ if ((flags & DB_RF_NEVERWAIT) == 0) {
+ mutex_enter(&db->db_mtx);
+ while (db->db_state == DB_READ ||
+ db->db_state == DB_FILL) {
+ ASSERT(db->db_state == DB_READ ||
+ (flags & DB_RF_HAVESTRUCT) == 0);
+ DTRACE_PROBE2(blocked__read, dmu_buf_impl_t *,
+ db, zio_t *, zio);
+ cv_wait(&db->db_changed, &db->db_mtx);
+ }
+ if (db->db_state == DB_UNCACHED)
+ err = SET_ERROR(EIO);
+ mutex_exit(&db->db_mtx);
+ }
+ }
+
+ return (err);
+}
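+
+/*
+ * Example (illustrative, assuming 'db' is a held dbuf): a typical caller
+ * reads with DB_RF_CANFAIL so that I/O errors are returned rather than
+ * treated as fatal, and DB_RF_NOPREFETCH to suppress speculative reads:
+ *
+ * err = dbuf_read(db, NULL, DB_RF_CANFAIL | DB_RF_NOPREFETCH);
+ * if (err != 0)
+ * return (err);
+ */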
+
+static void
+dbuf_noread(dmu_buf_impl_t *db)
+{
+ ASSERT(!zfs_refcount_is_zero(&db->db_holds));
+ ASSERT(db->db_blkid != DMU_BONUS_BLKID);
+ mutex_enter(&db->db_mtx);
+ while (db->db_state == DB_READ || db->db_state == DB_FILL)
+ cv_wait(&db->db_changed, &db->db_mtx);
+ if (db->db_state == DB_UNCACHED) {
+ ASSERT(db->db_buf == NULL);
+ ASSERT(db->db.db_data == NULL);
+ dbuf_set_data(db, dbuf_alloc_arcbuf(db));
+ db->db_state = DB_FILL;
+ DTRACE_SET_STATE(db, "assigning filled buffer");
+ } else if (db->db_state == DB_NOFILL) {
+ dbuf_clear_data(db);
+ } else {
+ ASSERT3U(db->db_state, ==, DB_CACHED);
+ }
+ mutex_exit(&db->db_mtx);
+}
+
+void
+dbuf_unoverride(dbuf_dirty_record_t *dr)
+{
+ dmu_buf_impl_t *db = dr->dr_dbuf;
+ blkptr_t *bp = &dr->dt.dl.dr_overridden_by;
+ uint64_t txg = dr->dr_txg;
+
+ ASSERT(MUTEX_HELD(&db->db_mtx));
+ /*
+ * This assert is valid because dmu_sync() expects to be called by
+ * a zilog's get_data while holding a range lock. This call only
+ * comes from dbuf_dirty() callers who must also hold a range lock.
+ */
+ ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC);
+ ASSERT(db->db_level == 0);
+
+ if (db->db_blkid == DMU_BONUS_BLKID ||
+ dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN)
+ return;
+
+ ASSERT(db->db_data_pending != dr);
+
+ /* free this block */
+ if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite)
+ zio_free(db->db_objset->os_spa, txg, bp);
+
+ dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
+ dr->dt.dl.dr_nopwrite = B_FALSE;
+ dr->dt.dl.dr_has_raw_params = B_FALSE;
+
+ /*
+ * Release the already-written buffer, so we leave it in
+ * a consistent dirty state. Note that all callers are
+ * modifying the buffer, so they will immediately do
+ * another (redundant) arc_release(). Therefore, leave
+ * the buf thawed to save the effort of freezing &
+ * immediately re-thawing it.
+ */
+ arc_release(dr->dt.dl.dr_data, db);
+}
+
+/*
+ * Evict (if it's unreferenced) or clear (if it's referenced) any level-0
+ * data blocks in the free range, so that any future readers will find
+ * empty blocks.
+ */
+void
+dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid,
+ dmu_tx_t *tx)
+{
+ dmu_buf_impl_t *db_search;
+ dmu_buf_impl_t *db, *db_next;
+ uint64_t txg = tx->tx_txg;
+ avl_index_t where;
+ dbuf_dirty_record_t *dr;
+
+ if (end_blkid > dn->dn_maxblkid &&
+ !(start_blkid == DMU_SPILL_BLKID || end_blkid == DMU_SPILL_BLKID))
+ end_blkid = dn->dn_maxblkid;
+ dprintf_dnode(dn, "start=%llu end=%llu\n", start_blkid, end_blkid);
+
+ db_search = kmem_alloc(sizeof (dmu_buf_impl_t), KM_SLEEP);
+ db_search->db_level = 0;
+ db_search->db_blkid = start_blkid;
+ db_search->db_state = DB_SEARCH;
+
+ mutex_enter(&dn->dn_dbufs_mtx);
+ db = avl_find(&dn->dn_dbufs, db_search, &where);
+ ASSERT3P(db, ==, NULL);
+
+ db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);
+
+ for (; db != NULL; db = db_next) {
+ db_next = AVL_NEXT(&dn->dn_dbufs, db);
+ ASSERT(db->db_blkid != DMU_BONUS_BLKID);
+
+ if (db->db_level != 0 || db->db_blkid > end_blkid) {
+ break;
+ }
+ ASSERT3U(db->db_blkid, >=, start_blkid);
+
+ /* found a level 0 buffer in the range */
+ mutex_enter(&db->db_mtx);
+ if (dbuf_undirty(db, tx)) {
+ /* mutex has been dropped and dbuf destroyed */
+ continue;
+ }
+
+ if (db->db_state == DB_UNCACHED ||
+ db->db_state == DB_NOFILL ||
+ db->db_state == DB_EVICTING) {
+ ASSERT(db->db.db_data == NULL);
+ mutex_exit(&db->db_mtx);
+ continue;
+ }
+ if (db->db_state == DB_READ || db->db_state == DB_FILL) {
+ /* will be handled in dbuf_read_done or dbuf_rele */
+ db->db_freed_in_flight = TRUE;
+ mutex_exit(&db->db_mtx);
+ continue;
+ }
+ if (zfs_refcount_count(&db->db_holds) == 0) {
+ ASSERT(db->db_buf);
+ dbuf_destroy(db);
+ continue;
+ }
+ /* The dbuf is referenced */
+
+ dr = list_head(&db->db_dirty_records);
+ if (dr != NULL) {
+ if (dr->dr_txg == txg) {
+ /*
+ * This buffer is "in-use"; re-adjust the file
+ * size to reflect that this buffer may
+ * contain new data when we sync.
+ */
+ if (db->db_blkid != DMU_SPILL_BLKID &&
+ db->db_blkid > dn->dn_maxblkid)
+ dn->dn_maxblkid = db->db_blkid;
+ dbuf_unoverride(dr);
+ } else {
+ /*
+ * This dbuf is not dirty in the open context.
+ * Either uncache it (if it's not referenced in
+ * the open context) or reset its contents to
+ * empty.
+ */
+ dbuf_fix_old_data(db, txg);
+ }
+ }
+ /* clear the contents if it's cached */
+ if (db->db_state == DB_CACHED) {
+ ASSERT(db->db.db_data != NULL);
+ arc_release(db->db_buf, db);
+ rw_enter(&db->db_rwlock, RW_WRITER);
+ bzero(db->db.db_data, db->db.db_size);
+ rw_exit(&db->db_rwlock);
+ arc_buf_freeze(db->db_buf);
+ }
+
+ mutex_exit(&db->db_mtx);
+ }
+
+ kmem_free(db_search, sizeof (dmu_buf_impl_t));
+ mutex_exit(&dn->dn_dbufs_mtx);
+}
+
+void
+dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
+{
+ arc_buf_t *buf, *old_buf;
+ dbuf_dirty_record_t *dr;
+ int osize = db->db.db_size;
+ arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
+ dnode_t *dn;
+
+ ASSERT(db->db_blkid != DMU_BONUS_BLKID);
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+
+ /*
+ * XXX we should be doing a dbuf_read, checking the return
+ * value and returning that up to our callers
+ */
+ dmu_buf_will_dirty(&db->db, tx);
+
+ /* create the data buffer for the new block */
+ buf = arc_alloc_buf(dn->dn_objset->os_spa, db, type, size);
+
+ /* copy old block data to the new block */
+ old_buf = db->db_buf;
+ bcopy(old_buf->b_data, buf->b_data, MIN(osize, size));
+ /* zero the remainder */
+ if (size > osize)
+ bzero((uint8_t *)buf->b_data + osize, size - osize);
+
+ mutex_enter(&db->db_mtx);
+ dbuf_set_data(db, buf);
+ arc_buf_destroy(old_buf, db);
+ db->db.db_size = size;
+
+ dr = list_head(&db->db_dirty_records);
+ /* dirty record added by dmu_buf_will_dirty() */
+ VERIFY(dr != NULL);
+ if (db->db_level == 0)
+ dr->dt.dl.dr_data = buf;
+ ASSERT3U(dr->dr_txg, ==, tx->tx_txg);
+ ASSERT3U(dr->dr_accounted, ==, osize);
+ dr->dr_accounted = size;
+ mutex_exit(&db->db_mtx);
+
+ dmu_objset_willuse_space(dn->dn_objset, size - osize, tx);
+ DB_DNODE_EXIT(db);
+}
+
+void
+dbuf_release_bp(dmu_buf_impl_t *db)
+{
+ objset_t *os __maybe_unused = db->db_objset;
+
+ ASSERT(dsl_pool_sync_context(dmu_objset_pool(os)));
+ ASSERT(arc_released(os->os_phys_buf) ||
+ list_link_active(&os->os_dsl_dataset->ds_synced_link));
+ ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf));
+
+ (void) arc_release(db->db_buf, db);
+}
+
+/*
+ * We already have a dirty record for this TXG, and we are being
+ * dirtied again.
+ */
+static void
+dbuf_redirty(dbuf_dirty_record_t *dr)
+{
+ dmu_buf_impl_t *db = dr->dr_dbuf;
+
+ ASSERT(MUTEX_HELD(&db->db_mtx));
+
+ if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID) {
+ /*
+ * If this buffer has already been written out,
+ * we now need to reset its state.
+ */
+ dbuf_unoverride(dr);
+ if (db->db.db_object != DMU_META_DNODE_OBJECT &&
+ db->db_state != DB_NOFILL) {
+ /* Already released on initial dirty, so just thaw. */
+ ASSERT(arc_released(db->db_buf));
+ arc_buf_thaw(db->db_buf);
+ }
+ }
+}
+
+dbuf_dirty_record_t *
+dbuf_dirty_lightweight(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx)
+{
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ IMPLY(dn->dn_objset->os_raw_receive, dn->dn_maxblkid >= blkid);
+ dnode_new_blkid(dn, blkid, tx, B_TRUE, B_FALSE);
+ ASSERT(dn->dn_maxblkid >= blkid);
+
+ dbuf_dirty_record_t *dr = kmem_zalloc(sizeof (*dr), KM_SLEEP);
+ list_link_init(&dr->dr_dirty_node);
+ list_link_init(&dr->dr_dbuf_node);
+ dr->dr_dnode = dn;
+ dr->dr_txg = tx->tx_txg;
+ dr->dt.dll.dr_blkid = blkid;
+ dr->dr_accounted = dn->dn_datablksz;
+
+ /*
+ * There should not be any dbuf for the block that we're dirtying.
+ * Otherwise the buffer contents could be inconsistent between the
+ * dbuf and the lightweight dirty record.
+ */
+ ASSERT3P(NULL, ==, dbuf_find(dn->dn_objset, dn->dn_object, 0, blkid));
+
+ mutex_enter(&dn->dn_mtx);
+ int txgoff = tx->tx_txg & TXG_MASK;
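+ /*
+ * Illustrative arithmetic for the mask above: TXG_MASK is
+ * TXG_SIZE - 1 (i.e. 3), so txgoff selects one of the four
+ * in-flight txg slots; e.g. txg 1001 maps to slot 1.
+ */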
+ if (dn->dn_free_ranges[txgoff] != NULL) {
+ range_tree_clear(dn->dn_free_ranges[txgoff], blkid, 1);
+ }
+
+ if (dn->dn_nlevels == 1) {
+ ASSERT3U(blkid, <, dn->dn_nblkptr);
+ list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
+ mutex_exit(&dn->dn_mtx);
+ rw_exit(&dn->dn_struct_rwlock);
+ dnode_setdirty(dn, tx);
+ } else {
+ mutex_exit(&dn->dn_mtx);
+
+ int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+ dmu_buf_impl_t *parent_db = dbuf_hold_level(dn,
+ 1, blkid >> epbs, FTAG);
+ rw_exit(&dn->dn_struct_rwlock);
+ if (parent_db == NULL) {
+ kmem_free(dr, sizeof (*dr));
+ return (NULL);
+ }
+ int err = dbuf_read(parent_db, NULL,
+ (DB_RF_NOPREFETCH | DB_RF_CANFAIL));
+ if (err != 0) {
+ dbuf_rele(parent_db, FTAG);
+ kmem_free(dr, sizeof (*dr));
+ return (NULL);
+ }
+
+ dbuf_dirty_record_t *parent_dr = dbuf_dirty(parent_db, tx);
+ dbuf_rele(parent_db, FTAG);
+ mutex_enter(&parent_dr->dt.di.dr_mtx);
+ ASSERT3U(parent_dr->dr_txg, ==, tx->tx_txg);
+ list_insert_tail(&parent_dr->dt.di.dr_children, dr);
+ mutex_exit(&parent_dr->dt.di.dr_mtx);
+ dr->dr_parent = parent_dr;
+ }
+
+ dmu_objset_willuse_space(dn->dn_objset, dr->dr_accounted, tx);
+
+ return (dr);
+}
+
+dbuf_dirty_record_t *
+dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
+{
+ dnode_t *dn;
+ objset_t *os;
+ dbuf_dirty_record_t *dr, *dr_next, *dr_head;
+ int txgoff = tx->tx_txg & TXG_MASK;
+ boolean_t drop_struct_rwlock = B_FALSE;
+
+ ASSERT(tx->tx_txg != 0);
+ ASSERT(!zfs_refcount_is_zero(&db->db_holds));
+ DMU_TX_DIRTY_BUF(tx, db);
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ /*
+ * Shouldn't dirty a regular buffer in syncing context. Private
+ * objects may be dirtied in syncing context, but only if they
+ * were already pre-dirtied in open context.
+ */
+#ifdef ZFS_DEBUG
+ if (dn->dn_objset->os_dsl_dataset != NULL) {
+ rrw_enter(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock,
+ RW_READER, FTAG);
+ }
+ ASSERT(!dmu_tx_is_syncing(tx) ||
+ BP_IS_HOLE(dn->dn_objset->os_rootbp) ||
+ DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
+ dn->dn_objset->os_dsl_dataset == NULL);
+ if (dn->dn_objset->os_dsl_dataset != NULL)
+ rrw_exit(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock, FTAG);
+#endif
+ /*
+ * We make this assert for private objects as well, but after we
+ * check if we're already dirty. They are allowed to re-dirty
+ * in syncing context.
+ */
+ ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
+ dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
+ (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
+
+ mutex_enter(&db->db_mtx);
+ /*
+ * XXX make this true for indirects too? The problem is that
+ * transactions created with dmu_tx_create_assigned() from
+ * syncing context don't bother holding ahead.
+ */
+ ASSERT(db->db_level != 0 ||
+ db->db_state == DB_CACHED || db->db_state == DB_FILL ||
+ db->db_state == DB_NOFILL);
+
+ mutex_enter(&dn->dn_mtx);
+ dnode_set_dirtyctx(dn, tx, db);
+ if (tx->tx_txg > dn->dn_dirty_txg)
+ dn->dn_dirty_txg = tx->tx_txg;
+ mutex_exit(&dn->dn_mtx);
+
+ if (db->db_blkid == DMU_SPILL_BLKID)
+ dn->dn_have_spill = B_TRUE;
+
+ /*
+ * If this buffer is already dirty, we're done.
+ */
+ dr_head = list_head(&db->db_dirty_records);
+ ASSERT(dr_head == NULL || dr_head->dr_txg <= tx->tx_txg ||
+ db->db.db_object == DMU_META_DNODE_OBJECT);
+ dr_next = dbuf_find_dirty_lte(db, tx->tx_txg);
+ if (dr_next && dr_next->dr_txg == tx->tx_txg) {
+ DB_DNODE_EXIT(db);
+
+ dbuf_redirty(dr_next);
+ mutex_exit(&db->db_mtx);
+ return (dr_next);
+ }
+
+ /*
+ * Only valid if not already dirty.
+ */
+ ASSERT(dn->dn_object == 0 ||
+ dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
+ (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
+
+ ASSERT3U(dn->dn_nlevels, >, db->db_level);
+
+ /*
+ * We should only be dirtying in syncing context if it's the
+ * mos or we're initializing the os or it's a special object.
+ * However, we are allowed to dirty in syncing context provided
+ * we already dirtied it in open context. Hence we must make
+ * this assertion only if we're not already dirty.
+ */
+ os = dn->dn_objset;
+ VERIFY3U(tx->tx_txg, <=, spa_final_dirty_txg(os->os_spa));
+#ifdef ZFS_DEBUG
+ if (dn->dn_objset->os_dsl_dataset != NULL)
+ rrw_enter(&os->os_dsl_dataset->ds_bp_rwlock, RW_READER, FTAG);
+ ASSERT(!dmu_tx_is_syncing(tx) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
+ os->os_dsl_dataset == NULL || BP_IS_HOLE(os->os_rootbp));
+ if (dn->dn_objset->os_dsl_dataset != NULL)
+ rrw_exit(&os->os_dsl_dataset->ds_bp_rwlock, FTAG);
+#endif
+ ASSERT(db->db.db_size != 0);
+
+ dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
+
+ if (db->db_blkid != DMU_BONUS_BLKID) {
+ dmu_objset_willuse_space(os, db->db.db_size, tx);
+ }
+
+ /*
+ * If this buffer is dirty in an old transaction group we need
+ * to make a copy of it so that the changes we make in this
+ * transaction group won't leak out when we sync the older txg.
+ */
+ dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP);
+ list_link_init(&dr->dr_dirty_node);
+ list_link_init(&dr->dr_dbuf_node);
+ dr->dr_dnode = dn;
+ if (db->db_level == 0) {
+ void *data_old = db->db_buf;
+
+ if (db->db_state != DB_NOFILL) {
+ if (db->db_blkid == DMU_BONUS_BLKID) {
+ dbuf_fix_old_data(db, tx->tx_txg);
+ data_old = db->db.db_data;
+ } else if (db->db.db_object != DMU_META_DNODE_OBJECT) {
+ /*
+ * Release the data buffer from the cache so
+ * that we can modify it without impacting
+ * possible other users of this cached data
+ * block. Note that indirect blocks and
+ * private objects are not released until the
+ * syncing state (since they are only modified
+ * then).
+ */
+ arc_release(db->db_buf, db);
+ dbuf_fix_old_data(db, tx->tx_txg);
+ data_old = db->db_buf;
+ }
+ ASSERT(data_old != NULL);
+ }
+ dr->dt.dl.dr_data = data_old;
+ } else {
+ mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_NOLOCKDEP, NULL);
+ list_create(&dr->dt.di.dr_children,
+ sizeof (dbuf_dirty_record_t),
+ offsetof(dbuf_dirty_record_t, dr_dirty_node));
+ }
+ if (db->db_blkid != DMU_BONUS_BLKID)
+ dr->dr_accounted = db->db.db_size;
+ dr->dr_dbuf = db;
+ dr->dr_txg = tx->tx_txg;
+ list_insert_before(&db->db_dirty_records, dr_next, dr);
+
+ /*
+ * We could have been freed_in_flight between the dbuf_noread
+ * and dbuf_dirty. We win, as though the dbuf_noread() had
+ * happened after the free.
+ */
+ if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
+ db->db_blkid != DMU_SPILL_BLKID) {
+ mutex_enter(&dn->dn_mtx);
+ if (dn->dn_free_ranges[txgoff] != NULL) {
+ range_tree_clear(dn->dn_free_ranges[txgoff],
+ db->db_blkid, 1);
+ }
+ mutex_exit(&dn->dn_mtx);
+ db->db_freed_in_flight = FALSE;
+ }
+
+ /*
+ * This buffer is now part of this txg
+ */
+ dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg);
+ db->db_dirtycnt += 1;
+ ASSERT3U(db->db_dirtycnt, <=, 3);
+
+ mutex_exit(&db->db_mtx);
+
+ if (db->db_blkid == DMU_BONUS_BLKID ||
+ db->db_blkid == DMU_SPILL_BLKID) {
+ mutex_enter(&dn->dn_mtx);
+ ASSERT(!list_link_active(&dr->dr_dirty_node));
+ list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
+ mutex_exit(&dn->dn_mtx);
+ dnode_setdirty(dn, tx);
+ DB_DNODE_EXIT(db);
+ return (dr);
+ }
+
+ if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ drop_struct_rwlock = B_TRUE;
+ }
+
+ /*
+ * If we are overwriting a dedup BP, then unless it is snapshotted,
+ * when we get to syncing context we will need to decrement its
+ * refcount in the DDT. Prefetch the relevant DDT block so that
+ * syncing context won't have to wait for the i/o.
+ */
+ if (db->db_blkptr != NULL) {
+ db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER, FTAG);
+ ddt_prefetch(os->os_spa, db->db_blkptr);
+ dmu_buf_unlock_parent(db, dblt, FTAG);
+ }
+
+ /*
+ * We need to hold the dn_struct_rwlock to make this assertion,
+ * because it protects dn_phys / dn_next_nlevels from changing.
+ */
+ ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) ||
+ dn->dn_phys->dn_nlevels > db->db_level ||
+ dn->dn_next_nlevels[txgoff] > db->db_level ||
+ dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level ||
+ dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level);
+
+ if (db->db_level == 0) {
+ ASSERT(!db->db_objset->os_raw_receive ||
+ dn->dn_maxblkid >= db->db_blkid);
+ dnode_new_blkid(dn, db->db_blkid, tx,
+ drop_struct_rwlock, B_FALSE);
+ ASSERT(dn->dn_maxblkid >= db->db_blkid);
+ }
+
+ if (db->db_level+1 < dn->dn_nlevels) {
+ dmu_buf_impl_t *parent = db->db_parent;
+ dbuf_dirty_record_t *di;
+ int parent_held = FALSE;
+
+ if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) {
+ int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+ parent = dbuf_hold_level(dn, db->db_level + 1,
+ db->db_blkid >> epbs, FTAG);
+ ASSERT(parent != NULL);
+ parent_held = TRUE;
+ }
+ if (drop_struct_rwlock)
+ rw_exit(&dn->dn_struct_rwlock);
+ ASSERT3U(db->db_level + 1, ==, parent->db_level);
+ di = dbuf_dirty(parent, tx);
+ if (parent_held)
+ dbuf_rele(parent, FTAG);
+
+ mutex_enter(&db->db_mtx);
+ /*
+ * Since we've dropped the mutex, it's possible that
+ * dbuf_undirty() might have changed this out from under us.
+ */
+ if (list_head(&db->db_dirty_records) == dr ||
+ dn->dn_object == DMU_META_DNODE_OBJECT) {
+ mutex_enter(&di->dt.di.dr_mtx);
+ ASSERT3U(di->dr_txg, ==, tx->tx_txg);
+ ASSERT(!list_link_active(&dr->dr_dirty_node));
+ list_insert_tail(&di->dt.di.dr_children, dr);
+ mutex_exit(&di->dt.di.dr_mtx);
+ dr->dr_parent = di;
+ }
+ mutex_exit(&db->db_mtx);
+ } else {
+ ASSERT(db->db_level + 1 == dn->dn_nlevels);
+ ASSERT(db->db_blkid < dn->dn_nblkptr);
+ ASSERT(db->db_parent == NULL || db->db_parent == dn->dn_dbuf);
+ mutex_enter(&dn->dn_mtx);
+ ASSERT(!list_link_active(&dr->dr_dirty_node));
+ list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
+ mutex_exit(&dn->dn_mtx);
+ if (drop_struct_rwlock)
+ rw_exit(&dn->dn_struct_rwlock);
+ }
+
+ dnode_setdirty(dn, tx);
+ DB_DNODE_EXIT(db);
+ return (dr);
+}
+
+static void
+dbuf_undirty_bonus(dbuf_dirty_record_t *dr)
+{
+ dmu_buf_impl_t *db = dr->dr_dbuf;
+
+ if (dr->dt.dl.dr_data != db->db.db_data) {
+ struct dnode *dn = dr->dr_dnode;
+ int max_bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots);
+
+ kmem_free(dr->dt.dl.dr_data, max_bonuslen);
+ arc_space_return(max_bonuslen, ARC_SPACE_BONUS);
+ }
+ db->db_data_pending = NULL;
+ ASSERT(list_next(&db->db_dirty_records, dr) == NULL);
+ list_remove(&db->db_dirty_records, dr);
+ if (dr->dr_dbuf->db_level != 0) {
+ mutex_destroy(&dr->dt.di.dr_mtx);
+ list_destroy(&dr->dt.di.dr_children);
+ }
+ kmem_free(dr, sizeof (dbuf_dirty_record_t));
+ ASSERT3U(db->db_dirtycnt, >, 0);
+ db->db_dirtycnt -= 1;
+}
+
+/*
+ * Undirty a buffer in the transaction group referenced by the given
+ * transaction. Return whether this evicted the dbuf.
+ */
+static boolean_t
+dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
+{
+ uint64_t txg = tx->tx_txg;
+
+ ASSERT(txg != 0);
+
+ /*
+ * Due to our use of dn_nlevels below, this can only be called
+ * in open context, unless we are operating on the MOS.
+ * From syncing context, dn_nlevels may be different from the
+ * dn_nlevels used when dbuf was dirtied.
+ */
+ ASSERT(db->db_objset ==
+ dmu_objset_pool(db->db_objset)->dp_meta_objset ||
+ txg != spa_syncing_txg(dmu_objset_spa(db->db_objset)));
+ ASSERT(db->db_blkid != DMU_BONUS_BLKID);
+ ASSERT0(db->db_level);
+ ASSERT(MUTEX_HELD(&db->db_mtx));
+
+ /*
+ * If this buffer is not dirty, we're done.
+ */
+ dbuf_dirty_record_t *dr = dbuf_find_dirty_eq(db, txg);
+ if (dr == NULL)
+ return (B_FALSE);
+ ASSERT(dr->dr_dbuf == db);
+
+ dnode_t *dn = dr->dr_dnode;
+
+ dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
+
+ ASSERT(db->db.db_size != 0);
+
+ dsl_pool_undirty_space(dmu_objset_pool(dn->dn_objset),
+ dr->dr_accounted, txg);
+
+ list_remove(&db->db_dirty_records, dr);
+
+ /*
+ * Note that there are three places in dbuf_dirty()
+ * where this dirty record may be put on a list.
+ * Make sure to do a list_remove corresponding to
+ * every one of those list_insert calls.
+ */
+ if (dr->dr_parent) {
+ mutex_enter(&dr->dr_parent->dt.di.dr_mtx);
+ list_remove(&dr->dr_parent->dt.di.dr_children, dr);
+ mutex_exit(&dr->dr_parent->dt.di.dr_mtx);
+ } else if (db->db_blkid == DMU_SPILL_BLKID ||
+ db->db_level + 1 == dn->dn_nlevels) {
+ ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf);
+ mutex_enter(&dn->dn_mtx);
+ list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr);
+ mutex_exit(&dn->dn_mtx);
+ }
+
+ if (db->db_state != DB_NOFILL) {
+ dbuf_unoverride(dr);
+
+ ASSERT(db->db_buf != NULL);
+ ASSERT(dr->dt.dl.dr_data != NULL);
+ if (dr->dt.dl.dr_data != db->db_buf)
+ arc_buf_destroy(dr->dt.dl.dr_data, db);
+ }
+
+ kmem_free(dr, sizeof (dbuf_dirty_record_t));
+
+ ASSERT(db->db_dirtycnt > 0);
+ db->db_dirtycnt -= 1;
+
+ if (zfs_refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) {
+ ASSERT(db->db_state == DB_NOFILL || arc_released(db->db_buf));
+ dbuf_destroy(db);
+ return (B_TRUE);
+ }
+
+ return (B_FALSE);
+}
+
+static void
+dmu_buf_will_dirty_impl(dmu_buf_t *db_fake, int flags, dmu_tx_t *tx)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+
+ ASSERT(tx->tx_txg != 0);
+ ASSERT(!zfs_refcount_is_zero(&db->db_holds));
+
+ /*
+ * Quick check for dirtiness. For already dirty blocks, this
+ * reduces runtime of this function by >90%, and overall performance
+ * by 50% for some workloads (e.g. file deletion with indirect blocks
+ * cached).
+ */
+ mutex_enter(&db->db_mtx);
+
+ if (db->db_state == DB_CACHED) {
+ dbuf_dirty_record_t *dr = dbuf_find_dirty_eq(db, tx->tx_txg);
+ /*
+ * It's possible that it is already dirty but not cached,
+ * because there are some calls to dbuf_dirty() that don't
+ * go through dmu_buf_will_dirty().
+ */
+ if (dr != NULL) {
+ /* This dbuf is already dirty and cached. */
+ dbuf_redirty(dr);
+ mutex_exit(&db->db_mtx);
+ return;
+ }
+ }
+ mutex_exit(&db->db_mtx);
+
+ DB_DNODE_ENTER(db);
+ if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock))
+ flags |= DB_RF_HAVESTRUCT;
+ DB_DNODE_EXIT(db);
+ (void) dbuf_read(db, NULL, flags);
+ (void) dbuf_dirty(db, tx);
+}
+
+void
+dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)
+{
+ dmu_buf_will_dirty_impl(db_fake,
+ DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH, tx);
+}
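+
+/*
+ * Typical caller pattern (illustrative sketch, assuming 'db' is a held
+ * dmu_buf_t and 'tx' an assigned transaction): dirty the buffer in open
+ * context before modifying its contents.
+ *
+ * dmu_buf_will_dirty(db, tx);
+ * bcopy(src, db->db_data, db->db_size);
+ */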
+
+boolean_t
+dmu_buf_is_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+ dbuf_dirty_record_t *dr;
+
+ mutex_enter(&db->db_mtx);
+ dr = dbuf_find_dirty_eq(db, tx->tx_txg);
+ mutex_exit(&db->db_mtx);
+ return (dr != NULL);
+}
+
+void
+dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+
+ db->db_state = DB_NOFILL;
+ DTRACE_SET_STATE(db, "allocating NOFILL buffer");
+ dmu_buf_will_fill(db_fake, tx);
+}
+
+void
+dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+
+ ASSERT(db->db_blkid != DMU_BONUS_BLKID);
+ ASSERT(tx->tx_txg != 0);
+ ASSERT(db->db_level == 0);
+ ASSERT(!zfs_refcount_is_zero(&db->db_holds));
+
+ ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT ||
+ dmu_tx_private_ok(tx));
+
+ dbuf_noread(db);
+ (void) dbuf_dirty(db, tx);
+}
+
+/*
+ * This function is effectively the same as dmu_buf_will_dirty(), but
+ * indicates the caller expects raw encrypted data in the db, and provides
+ * the crypt params (byteorder, salt, iv, mac) which should be stored in the
+ * blkptr_t when this dbuf is written. This is only used for blocks of
+ * dnodes, during raw receive.
+ */
+void
+dmu_buf_set_crypt_params(dmu_buf_t *db_fake, boolean_t byteorder,
+ const uint8_t *salt, const uint8_t *iv, const uint8_t *mac, dmu_tx_t *tx)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+ dbuf_dirty_record_t *dr;
+
+ /*
+ * dr_has_raw_params is only processed for blocks of dnodes
+ * (see dbuf_sync_dnode_leaf_crypt()).
+ */
+ ASSERT3U(db->db.db_object, ==, DMU_META_DNODE_OBJECT);
+ ASSERT3U(db->db_level, ==, 0);
+ ASSERT(db->db_objset->os_raw_receive);
+
+ dmu_buf_will_dirty_impl(db_fake,
+ DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH | DB_RF_NO_DECRYPT, tx);
+
+ dr = dbuf_find_dirty_eq(db, tx->tx_txg);
+
+ ASSERT3P(dr, !=, NULL);
+
+ dr->dt.dl.dr_has_raw_params = B_TRUE;
+ dr->dt.dl.dr_byteorder = byteorder;
+ bcopy(salt, dr->dt.dl.dr_salt, ZIO_DATA_SALT_LEN);
+ bcopy(iv, dr->dt.dl.dr_iv, ZIO_DATA_IV_LEN);
+ bcopy(mac, dr->dt.dl.dr_mac, ZIO_DATA_MAC_LEN);
+}
+
+static void
+dbuf_override_impl(dmu_buf_impl_t *db, const blkptr_t *bp, dmu_tx_t *tx)
+{
+ struct dirty_leaf *dl;
+ dbuf_dirty_record_t *dr;
+
+ dr = list_head(&db->db_dirty_records);
+ ASSERT3U(dr->dr_txg, ==, tx->tx_txg);
+ dl = &dr->dt.dl;
+ dl->dr_overridden_by = *bp;
+ dl->dr_override_state = DR_OVERRIDDEN;
+ dl->dr_overridden_by.blk_birth = dr->dr_txg;
+}
+
+/* ARGSUSED */
+void
+dmu_buf_fill_done(dmu_buf_t *dbuf, dmu_tx_t *tx)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
+ dbuf_states_t old_state;
+ mutex_enter(&db->db_mtx);
+ DBUF_VERIFY(db);
+
+ old_state = db->db_state;
+ db->db_state = DB_CACHED;
+ if (old_state == DB_FILL) {
+ if (db->db_level == 0 && db->db_freed_in_flight) {
+ ASSERT(db->db_blkid != DMU_BONUS_BLKID);
+ /* we were freed while filling */
+ /* XXX dbuf_undirty? */
+ bzero(db->db.db_data, db->db.db_size);
+ db->db_freed_in_flight = FALSE;
+ DTRACE_SET_STATE(db,
+ "fill done handling freed in flight");
+ } else {
+ DTRACE_SET_STATE(db, "fill done");
+ }
+ cv_broadcast(&db->db_changed);
+ }
+ mutex_exit(&db->db_mtx);
+}
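+
+/*
+ * Illustrative sketch of the fill protocol: a caller that will overwrite an
+ * entire block (and so has no need of its old contents) brackets the update
+ * with dmu_buf_will_fill() and dmu_buf_fill_done():
+ *
+ * dmu_buf_will_fill(db, tx);
+ * ... overwrite all of db->db_data ...
+ * dmu_buf_fill_done(db, tx);
+ */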
+
+void
+dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data,
+ bp_embedded_type_t etype, enum zio_compress comp,
+ int uncompressed_size, int compressed_size, int byteorder,
+ dmu_tx_t *tx)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
+ struct dirty_leaf *dl;
+ dmu_object_type_t type;
+ dbuf_dirty_record_t *dr;
+
+ if (etype == BP_EMBEDDED_TYPE_DATA) {
+ ASSERT(spa_feature_is_active(dmu_objset_spa(db->db_objset),
+ SPA_FEATURE_EMBEDDED_DATA));
+ }
+
+ DB_DNODE_ENTER(db);
+ type = DB_DNODE(db)->dn_type;
+ DB_DNODE_EXIT(db);
+
+ ASSERT0(db->db_level);
+ ASSERT(db->db_blkid != DMU_BONUS_BLKID);
+
+ dmu_buf_will_not_fill(dbuf, tx);
+
+ dr = list_head(&db->db_dirty_records);
+ ASSERT3U(dr->dr_txg, ==, tx->tx_txg);
+ dl = &dr->dt.dl;
+ encode_embedded_bp_compressed(&dl->dr_overridden_by,
+ data, comp, uncompressed_size, compressed_size);
+ BPE_SET_ETYPE(&dl->dr_overridden_by, etype);
+ BP_SET_TYPE(&dl->dr_overridden_by, type);
+ BP_SET_LEVEL(&dl->dr_overridden_by, 0);
+ BP_SET_BYTEORDER(&dl->dr_overridden_by, byteorder);
+
+ dl->dr_override_state = DR_OVERRIDDEN;
+ dl->dr_overridden_by.blk_birth = dr->dr_txg;
+}
+
+void
+dmu_buf_redact(dmu_buf_t *dbuf, dmu_tx_t *tx)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
+ dmu_object_type_t type;
+ ASSERT(dsl_dataset_feature_is_active(db->db_objset->os_dsl_dataset,
+ SPA_FEATURE_REDACTED_DATASETS));
+
+ DB_DNODE_ENTER(db);
+ type = DB_DNODE(db)->dn_type;
+ DB_DNODE_EXIT(db);
+
+ ASSERT0(db->db_level);
+ dmu_buf_will_not_fill(dbuf, tx);
+
+ blkptr_t bp = { { { {0} } } };
+ BP_SET_TYPE(&bp, type);
+ BP_SET_LEVEL(&bp, 0);
+ BP_SET_BIRTH(&bp, tx->tx_txg, 0);
+ BP_SET_REDACTED(&bp);
+ BPE_SET_LSIZE(&bp, dbuf->db_size);
+
+ dbuf_override_impl(db, &bp, tx);
+}
+
+/*
+ * Directly assign a provided arc buf to a given dbuf if it's not referenced
+ * by anybody except our caller. Otherwise copy arcbuf's contents to dbuf.
+ */
+void
+dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
+{
+ ASSERT(!zfs_refcount_is_zero(&db->db_holds));
+ ASSERT(db->db_blkid != DMU_BONUS_BLKID);
+ ASSERT(db->db_level == 0);
+ ASSERT3U(dbuf_is_metadata(db), ==, arc_is_metadata(buf));
+ ASSERT(buf != NULL);
+ ASSERT3U(arc_buf_lsize(buf), ==, db->db.db_size);
+ ASSERT(tx->tx_txg != 0);
+
+ arc_return_buf(buf, db);
+ ASSERT(arc_released(buf));
+
+ mutex_enter(&db->db_mtx);
+
+ while (db->db_state == DB_READ || db->db_state == DB_FILL)
+ cv_wait(&db->db_changed, &db->db_mtx);
+
+ ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED);
+
+ if (db->db_state == DB_CACHED &&
+ zfs_refcount_count(&db->db_holds) - 1 > db->db_dirtycnt) {
+ /*
+ * In practice, we will never have a case where we have an
+ * encrypted arc buffer while additional holds exist on the
+ * dbuf. We don't handle this here so we simply assert that
+ * fact instead.
+ */
+ ASSERT(!arc_is_encrypted(buf));
+ mutex_exit(&db->db_mtx);
+ (void) dbuf_dirty(db, tx);
+ bcopy(buf->b_data, db->db.db_data, db->db.db_size);
+ arc_buf_destroy(buf, db);
+ return;
+ }
+
+ if (db->db_state == DB_CACHED) {
+ dbuf_dirty_record_t *dr = list_head(&db->db_dirty_records);
+
+ ASSERT(db->db_buf != NULL);
+ if (dr != NULL && dr->dr_txg == tx->tx_txg) {
+ ASSERT(dr->dt.dl.dr_data == db->db_buf);
+
+ if (!arc_released(db->db_buf)) {
+ ASSERT(dr->dt.dl.dr_override_state ==
+ DR_OVERRIDDEN);
+ arc_release(db->db_buf, db);
+ }
+ dr->dt.dl.dr_data = buf;
+ arc_buf_destroy(db->db_buf, db);
+ } else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) {
+ arc_release(db->db_buf, db);
+ arc_buf_destroy(db->db_buf, db);
+ }
+ db->db_buf = NULL;
+ }
+ ASSERT(db->db_buf == NULL);
+ dbuf_set_data(db, buf);
+ db->db_state = DB_FILL;
+ DTRACE_SET_STATE(db, "filling assigned arcbuf");
+ mutex_exit(&db->db_mtx);
+ (void) dbuf_dirty(db, tx);
+ dmu_buf_fill_done(&db->db, tx);
+}
+
+void
+dbuf_destroy(dmu_buf_impl_t *db)
+{
+ dnode_t *dn;
+ dmu_buf_impl_t *parent = db->db_parent;
+ dmu_buf_impl_t *dndb;
+
+ ASSERT(MUTEX_HELD(&db->db_mtx));
+ ASSERT(zfs_refcount_is_zero(&db->db_holds));
+
+ if (db->db_buf != NULL) {
+ arc_buf_destroy(db->db_buf, db);
+ db->db_buf = NULL;
+ }
+
+ if (db->db_blkid == DMU_BONUS_BLKID) {
+ int slots = DB_DNODE(db)->dn_num_slots;
+ int bonuslen = DN_SLOTS_TO_BONUSLEN(slots);
+ if (db->db.db_data != NULL) {
+ kmem_free(db->db.db_data, bonuslen);
+ arc_space_return(bonuslen, ARC_SPACE_BONUS);
+ db->db_state = DB_UNCACHED;
+ DTRACE_SET_STATE(db, "buffer cleared");
+ }
+ }
+
+ dbuf_clear_data(db);
+
+ if (multilist_link_active(&db->db_cache_link)) {
+ ASSERT(db->db_caching_status == DB_DBUF_CACHE ||
+ db->db_caching_status == DB_DBUF_METADATA_CACHE);
+
+ multilist_remove(dbuf_caches[db->db_caching_status].cache, db);
+ (void) zfs_refcount_remove_many(
+ &dbuf_caches[db->db_caching_status].size,
+ db->db.db_size, db);
+
+ if (db->db_caching_status == DB_DBUF_METADATA_CACHE) {
+ DBUF_STAT_BUMPDOWN(metadata_cache_count);
+ } else {
+ DBUF_STAT_BUMPDOWN(cache_levels[db->db_level]);
+ DBUF_STAT_BUMPDOWN(cache_count);
+ DBUF_STAT_DECR(cache_levels_bytes[db->db_level],
+ db->db.db_size);
+ }
+ db->db_caching_status = DB_NO_CACHE;
+ }
+
+ ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL);
+ ASSERT(db->db_data_pending == NULL);
+ ASSERT(list_is_empty(&db->db_dirty_records));
+
+ db->db_state = DB_EVICTING;
+ DTRACE_SET_STATE(db, "buffer eviction started");
+ db->db_blkptr = NULL;
+
+ /*
+ * Now that db_state is DB_EVICTING, nobody else can find this via
+ * the hash table. We can now drop db_mtx, which allows us to
+ * acquire the dn_dbufs_mtx.
+ */
+ mutex_exit(&db->db_mtx);
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ dndb = dn->dn_dbuf;
+ if (db->db_blkid != DMU_BONUS_BLKID) {
+ boolean_t needlock = !MUTEX_HELD(&dn->dn_dbufs_mtx);
+ if (needlock)
+ mutex_enter_nested(&dn->dn_dbufs_mtx,
+ NESTED_SINGLE);
+ avl_remove(&dn->dn_dbufs, db);
+ membar_producer();
+ DB_DNODE_EXIT(db);
+ if (needlock)
+ mutex_exit(&dn->dn_dbufs_mtx);
+ /*
+ * Decrementing the dbuf count means that the hold corresponding
+ * to the removed dbuf is no longer discounted in dnode_move(),
+ * so the dnode cannot be moved until after we release the hold.
+ * The membar_producer() ensures visibility of the decremented
+ * value in dnode_move(), since DB_DNODE_EXIT doesn't actually
+ * release any lock.
+ */
+ mutex_enter(&dn->dn_mtx);
+ dnode_rele_and_unlock(dn, db, B_TRUE);
+ db->db_dnode_handle = NULL;
+
+ dbuf_hash_remove(db);
+ } else {
+ DB_DNODE_EXIT(db);
+ }
+
+ ASSERT(zfs_refcount_is_zero(&db->db_holds));
+
+ db->db_parent = NULL;
+
+ ASSERT(db->db_buf == NULL);
+ ASSERT(db->db.db_data == NULL);
+ ASSERT(db->db_hash_next == NULL);
+ ASSERT(db->db_blkptr == NULL);
+ ASSERT(db->db_data_pending == NULL);
+ ASSERT3U(db->db_caching_status, ==, DB_NO_CACHE);
+ ASSERT(!multilist_link_active(&db->db_cache_link));
+
+ kmem_cache_free(dbuf_kmem_cache, db);
+ arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_DBUF);
+
+ /*
+ * If this dbuf is referenced from an indirect dbuf,
+ * decrement the ref count on the indirect dbuf.
+ */
+ if (parent && parent != dndb) {
+ mutex_enter(&parent->db_mtx);
+ dbuf_rele_and_unlock(parent, db, B_TRUE);
+ }
+}
+
+/*
+ * Note: While bpp will always be updated if the function returns success,
+ * parentp will not be updated if the dnode does not have dn_dbuf filled in;
+ * this happens when the dnode is the meta-dnode, or {user|group|project}used
+ * object.
+ */
+__attribute__((always_inline))
+static inline int
+dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
+ dmu_buf_impl_t **parentp, blkptr_t **bpp)
+{
+ *parentp = NULL;
+ *bpp = NULL;
+
+ ASSERT(blkid != DMU_BONUS_BLKID);
+
+ if (blkid == DMU_SPILL_BLKID) {
+ mutex_enter(&dn->dn_mtx);
+ if (dn->dn_have_spill &&
+ (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR))
+ *bpp = DN_SPILL_BLKPTR(dn->dn_phys);
+ else
+ *bpp = NULL;
+ dbuf_add_ref(dn->dn_dbuf, NULL);
+ *parentp = dn->dn_dbuf;
+ mutex_exit(&dn->dn_mtx);
+ return (0);
+ }
+
+ int nlevels =
+ (dn->dn_phys->dn_nlevels == 0) ? 1 : dn->dn_phys->dn_nlevels;
+ int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+
+ ASSERT3U(level * epbs, <, 64);
+ ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
+ /*
+ * This assertion shouldn't trip as long as the max indirect block size
+ * is less than 1M. The reason for this is that up to that point,
+ * the number of levels required to address an entire object with blocks
+ * of size SPA_MINBLOCKSIZE satisfies nlevels * epbs + 1 <= 64. In
+ * other words, if N * epbs + 1 > 64, then if (N-1) * epbs + 1 > 55
+ * (i.e. we can address the entire object), objects will all use at most
+ * N-1 levels and the assertion won't overflow. However, once epbs is
+ * 13, 4 * 13 + 1 = 53, but 5 * 13 + 1 = 66. Then, 4 levels will not be
+ * enough to address an entire object, so objects will have 5 levels,
+ * but then this assertion will overflow.
+ *
+ * All this is to say that if we ever increase DN_MAX_INDBLKSHIFT, we
+ * need to redo this logic to handle overflows.
+ */
+ ASSERT(level >= nlevels ||
+ ((nlevels - level - 1) * epbs) +
+ highbit64(dn->dn_phys->dn_nblkptr) <= 64);
+ if (level >= nlevels ||
+ blkid >= ((uint64_t)dn->dn_phys->dn_nblkptr <<
+ ((nlevels - level - 1) * epbs)) ||
+ (fail_sparse &&
+ blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) {
+ /* the buffer has no parent yet */
+ return (SET_ERROR(ENOENT));
+ } else if (level < nlevels-1) {
+ /* this block is referenced from an indirect block */
+ int err;
+
+ err = dbuf_hold_impl(dn, level + 1,
+ blkid >> epbs, fail_sparse, FALSE, NULL, parentp);
+
+ if (err)
+ return (err);
+ err = dbuf_read(*parentp, NULL,
+ (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH | DB_RF_CANFAIL));
+ if (err) {
+ dbuf_rele(*parentp, NULL);
+ *parentp = NULL;
+ return (err);
+ }
+ rw_enter(&(*parentp)->db_rwlock, RW_READER);
+ *bpp = ((blkptr_t *)(*parentp)->db.db_data) +
+ (blkid & ((1ULL << epbs) - 1));
+ if (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))
+ ASSERT(BP_IS_HOLE(*bpp));
+ rw_exit(&(*parentp)->db_rwlock);
+ return (0);
+ } else {
+ /* the block is referenced from the dnode */
+ ASSERT3U(level, ==, nlevels-1);
+ ASSERT(dn->dn_phys->dn_nblkptr == 0 ||
+ blkid < dn->dn_phys->dn_nblkptr);
+ if (dn->dn_dbuf) {
+ dbuf_add_ref(dn->dn_dbuf, NULL);
+ *parentp = dn->dn_dbuf;
+ }
+ *bpp = &dn->dn_phys->dn_blkptr[blkid];
+ return (0);
+ }
+}
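+
+/*
+ * Worked example of the addressing arithmetic above (for illustration): with
+ * 128K indirect blocks, epbs = 17 - SPA_BLKPTRSHIFT (7) = 10, so each
+ * indirect block holds 1024 block pointers. Level-0 blkid 5000 is then found
+ * in the level-1 block at blkid 5000 >> 10 = 4, at slot 5000 & 1023 = 904.
+ */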
+
+static dmu_buf_impl_t *
+dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
+ dmu_buf_impl_t *parent, blkptr_t *blkptr)
+{
+ objset_t *os = dn->dn_objset;
+ dmu_buf_impl_t *db, *odb;
+
+ ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
+ ASSERT(dn->dn_type != DMU_OT_NONE);
+
+ db = kmem_cache_alloc(dbuf_kmem_cache, KM_SLEEP);
+
+ list_create(&db->db_dirty_records, sizeof (dbuf_dirty_record_t),
+ offsetof(dbuf_dirty_record_t, dr_dbuf_node));
+
+ db->db_objset = os;
+ db->db.db_object = dn->dn_object;
+ db->db_level = level;
+ db->db_blkid = blkid;
+ db->db_dirtycnt = 0;
+ db->db_dnode_handle = dn->dn_handle;
+ db->db_parent = parent;
+ db->db_blkptr = blkptr;
+
+ db->db_user = NULL;
+ db->db_user_immediate_evict = FALSE;
+ db->db_freed_in_flight = FALSE;
+ db->db_pending_evict = FALSE;
+
+ if (blkid == DMU_BONUS_BLKID) {
+ ASSERT3P(parent, ==, dn->dn_dbuf);
+ db->db.db_size = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots) -
+ (dn->dn_nblkptr-1) * sizeof (blkptr_t);
+ ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
+ db->db.db_offset = DMU_BONUS_BLKID;
+ db->db_state = DB_UNCACHED;
+ DTRACE_SET_STATE(db, "bonus buffer created");
+ db->db_caching_status = DB_NO_CACHE;
+ /* the bonus dbuf is not placed in the hash table */
+ arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_DBUF);
+ return (db);
+ } else if (blkid == DMU_SPILL_BLKID) {
+ db->db.db_size = (blkptr != NULL) ?
+ BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE;
+ db->db.db_offset = 0;
+ } else {
+ int blocksize =
+ db->db_level ? 1 << dn->dn_indblkshift : dn->dn_datablksz;
+ db->db.db_size = blocksize;
+ db->db.db_offset = db->db_blkid * blocksize;
+ }
+
+ /*
+ * Hold the dn_dbufs_mtx while we insert the new dbuf
+ * into the hash table *and* add it to the dn_dbufs list.
+ * This prevents a possible deadlock with someone
+ * trying to look up this dbuf before it's added to the
+ * dn_dbufs list.
+ */
+ mutex_enter(&dn->dn_dbufs_mtx);
+ db->db_state = DB_EVICTING; /* not worth logging this state change */
+ if ((odb = dbuf_hash_insert(db)) != NULL) {
+ /* someone else inserted it first */
+ kmem_cache_free(dbuf_kmem_cache, db);
+ mutex_exit(&dn->dn_dbufs_mtx);
+ DBUF_STAT_BUMP(hash_insert_race);
+ return (odb);
+ }
+ avl_add(&dn->dn_dbufs, db);
+
+ db->db_state = DB_UNCACHED;
+ DTRACE_SET_STATE(db, "regular buffer created");
+ db->db_caching_status = DB_NO_CACHE;
+ mutex_exit(&dn->dn_dbufs_mtx);
+ arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_DBUF);
+
+ if (parent && parent != dn->dn_dbuf)
+ dbuf_add_ref(parent, db);
+
+ ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
+ zfs_refcount_count(&dn->dn_holds) > 0);
+ (void) zfs_refcount_add(&dn->dn_holds, db);
+
+ dprintf_dbuf(db, "db=%p\n", db);
+
+ return (db);
+}
+
+/*
+ * This function returns a block pointer and information about the object,
+ * given a dnode and a block. This is a publicly accessible version of
+ * dbuf_findbp that only returns some information, rather than the
+ * dbuf. Note that the dnode passed in must be held, and the dn_struct_rwlock
+ * should be locked as (at least) a reader.
+ */
+int
+dbuf_dnode_findbp(dnode_t *dn, uint64_t level, uint64_t blkid,
+ blkptr_t *bp, uint16_t *datablkszsec, uint8_t *indblkshift)
+{
+ dmu_buf_impl_t *dbp = NULL;
+ blkptr_t *bp2;
+ int err = 0;
+ ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
+
+ err = dbuf_findbp(dn, level, blkid, B_FALSE, &dbp, &bp2);
+ if (err == 0) {
+ *bp = *bp2;
+ if (dbp != NULL)
+ dbuf_rele(dbp, NULL);
+ if (datablkszsec != NULL)
+ *datablkszsec = dn->dn_phys->dn_datablkszsec;
+ if (indblkshift != NULL)
+ *indblkshift = dn->dn_phys->dn_indblkshift;
+ }
+
+ return (err);
+}
+
+typedef struct dbuf_prefetch_arg {
+ spa_t *dpa_spa; /* The spa to issue the prefetch in. */
+ zbookmark_phys_t dpa_zb; /* The target block to prefetch. */
+ int dpa_epbs; /* Entries (blkptr_t's) Per Block Shift. */
+ int dpa_curlevel; /* The current level that we're reading */
+ dnode_t *dpa_dnode; /* The dnode associated with the prefetch */
+ zio_priority_t dpa_prio; /* The priority I/Os should be issued at. */
+ zio_t *dpa_zio; /* The parent zio_t for all prefetches. */
+ arc_flags_t dpa_aflags; /* Flags to pass to the final prefetch. */
+ dbuf_prefetch_fn dpa_cb; /* prefetch completion callback */
+ void *dpa_arg; /* prefetch completion arg */
+} dbuf_prefetch_arg_t;
+
+static void
+dbuf_prefetch_fini(dbuf_prefetch_arg_t *dpa, boolean_t io_done)
+{
+ if (dpa->dpa_cb != NULL)
+ dpa->dpa_cb(dpa->dpa_arg, io_done);
+ kmem_free(dpa, sizeof (*dpa));
+}
+
+static void
+dbuf_issue_final_prefetch_done(zio_t *zio, const zbookmark_phys_t *zb,
+ const blkptr_t *iobp, arc_buf_t *abuf, void *private)
+{
+ dbuf_prefetch_arg_t *dpa = private;
+
+ dbuf_prefetch_fini(dpa, B_TRUE);
+ if (abuf != NULL)
+ arc_buf_destroy(abuf, private);
+}
+
+/*
+ * Actually issue the prefetch read for the block given.
+ */
+static void
+dbuf_issue_final_prefetch(dbuf_prefetch_arg_t *dpa, blkptr_t *bp)
+{
+ ASSERT(!BP_IS_REDACTED(bp) ||
+ dsl_dataset_feature_is_active(
+ dpa->dpa_dnode->dn_objset->os_dsl_dataset,
+ SPA_FEATURE_REDACTED_DATASETS));
+
+ if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp) || BP_IS_REDACTED(bp))
+ return (dbuf_prefetch_fini(dpa, B_FALSE));
+
+ int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE;
+ arc_flags_t aflags =
+ dpa->dpa_aflags | ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH |
+ ARC_FLAG_NO_BUF;
+
+ /* dnodes are always read as raw and then converted later */
+ if (BP_GET_TYPE(bp) == DMU_OT_DNODE && BP_IS_PROTECTED(bp) &&
+ dpa->dpa_curlevel == 0)
+ zio_flags |= ZIO_FLAG_RAW;
+
+ ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp));
+ ASSERT3U(dpa->dpa_curlevel, ==, dpa->dpa_zb.zb_level);
+ ASSERT(dpa->dpa_zio != NULL);
+ (void) arc_read(dpa->dpa_zio, dpa->dpa_spa, bp,
+ dbuf_issue_final_prefetch_done, dpa,
+ dpa->dpa_prio, zio_flags, &aflags, &dpa->dpa_zb);
+}
+
+/*
+ * Called when an indirect block above our prefetch target is read in. This
+ * will either read in the next indirect block down the tree or issue the actual
+ * prefetch if the next block down is our target.
+ */
+static void
+dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb,
+ const blkptr_t *iobp, arc_buf_t *abuf, void *private)
+{
+ dbuf_prefetch_arg_t *dpa = private;
+
+ ASSERT3S(dpa->dpa_zb.zb_level, <, dpa->dpa_curlevel);
+ ASSERT3S(dpa->dpa_curlevel, >, 0);
+
+ if (abuf == NULL) {
+ ASSERT(zio == NULL || zio->io_error != 0);
+ return (dbuf_prefetch_fini(dpa, B_TRUE));
+ }
+ ASSERT(zio == NULL || zio->io_error == 0);
+
+ /*
+ * The dpa_dnode is only valid if we are called with a NULL
+ * zio. This indicates that the arc_read() returned without
+ * first calling zio_read() to issue a physical read. Once
+ * a physical read is made the dpa_dnode must be invalidated
+ * as the locks guarding it may have been dropped. If the
+ * dpa_dnode is still valid, then we want to add it to the dbuf
+ * cache. To do so, we must hold the dbuf associated with the block
+ * we just prefetched, read its contents so that we associate it
+ * with an arc_buf_t, and then release it.
+ */
+ if (zio != NULL) {
+ ASSERT3S(BP_GET_LEVEL(zio->io_bp), ==, dpa->dpa_curlevel);
+ if (zio->io_flags & ZIO_FLAG_RAW_COMPRESS) {
+ ASSERT3U(BP_GET_PSIZE(zio->io_bp), ==, zio->io_size);
+ } else {
+ ASSERT3U(BP_GET_LSIZE(zio->io_bp), ==, zio->io_size);
+ }
+ ASSERT3P(zio->io_spa, ==, dpa->dpa_spa);
+
+ dpa->dpa_dnode = NULL;
+ } else if (dpa->dpa_dnode != NULL) {
+ uint64_t curblkid = dpa->dpa_zb.zb_blkid >>
+ (dpa->dpa_epbs * (dpa->dpa_curlevel -
+ dpa->dpa_zb.zb_level));
+ dmu_buf_impl_t *db = dbuf_hold_level(dpa->dpa_dnode,
+ dpa->dpa_curlevel, curblkid, FTAG);
+ if (db == NULL) {
+ arc_buf_destroy(abuf, private);
+ return (dbuf_prefetch_fini(dpa, B_TRUE));
+ }
+ (void) dbuf_read(db, NULL,
+ DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH | DB_RF_HAVESTRUCT);
+ dbuf_rele(db, FTAG);
+ }
+
+ dpa->dpa_curlevel--;
+ uint64_t nextblkid = dpa->dpa_zb.zb_blkid >>
+ (dpa->dpa_epbs * (dpa->dpa_curlevel - dpa->dpa_zb.zb_level));
+ blkptr_t *bp = ((blkptr_t *)abuf->b_data) +
+ P2PHASE(nextblkid, 1ULL << dpa->dpa_epbs);
+
+ ASSERT(!BP_IS_REDACTED(bp) ||
+ dsl_dataset_feature_is_active(
+ dpa->dpa_dnode->dn_objset->os_dsl_dataset,
+ SPA_FEATURE_REDACTED_DATASETS));
+ if (BP_IS_HOLE(bp) || BP_IS_REDACTED(bp)) {
+ dbuf_prefetch_fini(dpa, B_TRUE);
+ } else if (dpa->dpa_curlevel == dpa->dpa_zb.zb_level) {
+ ASSERT3U(nextblkid, ==, dpa->dpa_zb.zb_blkid);
+ dbuf_issue_final_prefetch(dpa, bp);
+ } else {
+ arc_flags_t iter_aflags = ARC_FLAG_NOWAIT;
+ zbookmark_phys_t zb;
+
+ /* flag if L2ARC eligible, l2arc_noprefetch then decides */
+ if (dpa->dpa_aflags & ARC_FLAG_L2CACHE)
+ iter_aflags |= ARC_FLAG_L2CACHE;
+
+ ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp));
+
+ SET_BOOKMARK(&zb, dpa->dpa_zb.zb_objset,
+ dpa->dpa_zb.zb_object, dpa->dpa_curlevel, nextblkid);
+
+ (void) arc_read(dpa->dpa_zio, dpa->dpa_spa,
+ bp, dbuf_prefetch_indirect_done, dpa, dpa->dpa_prio,
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
+ &iter_aflags, &zb);
+ }
+
+ arc_buf_destroy(abuf, private);
+}
+
+/*
+ * Issue prefetch reads for the given block on the given level. If the indirect
+ * blocks above that block are not in memory, we will read them in
+ * asynchronously. As a result, this call never blocks waiting for a read to
+ * complete. Note that the prefetch might fail if the dataset is encrypted and
+ * the encryption key is unmapped before the IO completes.
+ */
+int
+dbuf_prefetch_impl(dnode_t *dn, int64_t level, uint64_t blkid,
+ zio_priority_t prio, arc_flags_t aflags, dbuf_prefetch_fn cb,
+ void *arg)
+{
+ blkptr_t bp;
+ int epbs, nlevels, curlevel;
+ uint64_t curblkid;
+
+ ASSERT(blkid != DMU_BONUS_BLKID);
+ ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
+
+ if (blkid > dn->dn_maxblkid)
+ goto no_issue;
+
+ if (level == 0 && dnode_block_freed(dn, blkid))
+ goto no_issue;
+
+ /*
+ * This dnode hasn't been written to disk yet, so there's nothing to
+ * prefetch.
+ */
+ nlevels = dn->dn_phys->dn_nlevels;
+ if (level >= nlevels || dn->dn_phys->dn_nblkptr == 0)
+ goto no_issue;
+
+ epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
+ if (dn->dn_phys->dn_maxblkid < blkid << (epbs * level))
+ goto no_issue;
+
+ dmu_buf_impl_t *db = dbuf_find(dn->dn_objset, dn->dn_object,
+ level, blkid);
+ if (db != NULL) {
+ mutex_exit(&db->db_mtx);
+ /*
+ * This dbuf already exists. It is either CACHED, or
+ * (we assume) about to be read or filled.
+ */
+ goto no_issue;
+ }
+
+ /*
+ * Find the closest ancestor (indirect block) of the target block
+ * that is present in the cache. In this indirect block, we will
+ * find the bp that is at curlevel, curblkid.
+ */
+ curlevel = level;
+ curblkid = blkid;
+ while (curlevel < nlevels - 1) {
+ int parent_level = curlevel + 1;
+ uint64_t parent_blkid = curblkid >> epbs;
+ dmu_buf_impl_t *db;
+
+ if (dbuf_hold_impl(dn, parent_level, parent_blkid,
+ FALSE, TRUE, FTAG, &db) == 0) {
+ blkptr_t *bpp = db->db_buf->b_data;
+ bp = bpp[P2PHASE(curblkid, 1 << epbs)];
+ dbuf_rele(db, FTAG);
+ break;
+ }
+
+ curlevel = parent_level;
+ curblkid = parent_blkid;
+ }
+
+ if (curlevel == nlevels - 1) {
+ /* No cached indirect blocks found. */
+ ASSERT3U(curblkid, <, dn->dn_phys->dn_nblkptr);
+ bp = dn->dn_phys->dn_blkptr[curblkid];
+ }
+ ASSERT(!BP_IS_REDACTED(&bp) ||
+ dsl_dataset_feature_is_active(dn->dn_objset->os_dsl_dataset,
+ SPA_FEATURE_REDACTED_DATASETS));
+ if (BP_IS_HOLE(&bp) || BP_IS_REDACTED(&bp))
+ goto no_issue;
+
+ ASSERT3U(curlevel, ==, BP_GET_LEVEL(&bp));
+
+ zio_t *pio = zio_root(dmu_objset_spa(dn->dn_objset), NULL, NULL,
+ ZIO_FLAG_CANFAIL);
+
+ dbuf_prefetch_arg_t *dpa = kmem_zalloc(sizeof (*dpa), KM_SLEEP);
+ dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
+ SET_BOOKMARK(&dpa->dpa_zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET,
+ dn->dn_object, level, blkid);
+ dpa->dpa_curlevel = curlevel;
+ dpa->dpa_prio = prio;
+ dpa->dpa_aflags = aflags;
+ dpa->dpa_spa = dn->dn_objset->os_spa;
+ dpa->dpa_dnode = dn;
+ dpa->dpa_epbs = epbs;
+ dpa->dpa_zio = pio;
+ dpa->dpa_cb = cb;
+ dpa->dpa_arg = arg;
+
+ /* flag if L2ARC eligible, l2arc_noprefetch then decides */
+ if (DNODE_LEVEL_IS_L2CACHEABLE(dn, level))
+ dpa->dpa_aflags |= ARC_FLAG_L2CACHE;
+
+ /*
+ * If we have the indirect just above us, no need to do the asynchronous
+ * prefetch chain; we'll just run the last step ourselves. If we're at
+ * a higher level, though, we want to issue the prefetches for all the
+ * indirect blocks asynchronously, so we can go on with whatever we were
+ * doing.
+ */
+ if (curlevel == level) {
+ ASSERT3U(curblkid, ==, blkid);
+ dbuf_issue_final_prefetch(dpa, &bp);
+ } else {
+ arc_flags_t iter_aflags = ARC_FLAG_NOWAIT;
+ zbookmark_phys_t zb;
+
+ /* flag if L2ARC eligible, l2arc_noprefetch then decides */
+ if (DNODE_LEVEL_IS_L2CACHEABLE(dn, level))
+ iter_aflags |= ARC_FLAG_L2CACHE;
+
+ SET_BOOKMARK(&zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET,
+ dn->dn_object, curlevel, curblkid);
+ (void) arc_read(dpa->dpa_zio, dpa->dpa_spa,
+ &bp, dbuf_prefetch_indirect_done, dpa, prio,
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
+ &iter_aflags, &zb);
+ }
+ /*
+ * We use pio here instead of dpa_zio since it's possible that
+ * dpa may have already been freed.
+ */
+ zio_nowait(pio);
+ return (1);
+no_issue:
+ if (cb != NULL)
+ cb(arg, B_FALSE);
+ return (0);
+}
+
+int
+dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio,
+ arc_flags_t aflags)
+{
+
+ return (dbuf_prefetch_impl(dn, level, blkid, prio, aflags, NULL, NULL));
+}
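+
+/*
+ * Example (illustrative, assuming 'dn' is a held dnode and dn_struct_rwlock
+ * is held): start an asynchronous prefetch of level-0 block 'blkid' and
+ * return without waiting for the I/O.
+ *
+ * (void) dbuf_prefetch(dn, 0, blkid, ZIO_PRIORITY_ASYNC_READ, 0);
+ */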
+
+/*
+ * Helper function for dbuf_hold_impl() to copy a buffer. Handles
+ * the case of encrypted, compressed and uncompressed buffers by
+ * allocating the new buffer, respectively, with arc_alloc_raw_buf(),
+ * arc_alloc_compressed_buf() or arc_alloc_buf().
+ *
+ * NOTE: Declared noinline to avoid stack bloat in dbuf_hold_impl().
+ */
+noinline static void
+dbuf_hold_copy(dnode_t *dn, dmu_buf_impl_t *db)
+{
+ dbuf_dirty_record_t *dr = db->db_data_pending;
+ arc_buf_t *newdata, *data = dr->dt.dl.dr_data;
+
+ newdata = dbuf_alloc_arcbuf_from_arcbuf(db, data);
+ dbuf_set_data(db, newdata);
+ rw_enter(&db->db_rwlock, RW_WRITER);
+ bcopy(data->b_data, db->db.db_data, arc_buf_size(data));
+ rw_exit(&db->db_rwlock);
+}
+
+/*
+ * Returns with db_holds incremented, and db_mtx not held.
+ * Note: dn_struct_rwlock must be held.
+ */
+int
+dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid,
+ boolean_t fail_sparse, boolean_t fail_uncached,
+ void *tag, dmu_buf_impl_t **dbp)
+{
+ dmu_buf_impl_t *db, *parent = NULL;
+
+ /* If the pool has been created, verify the tx_sync_lock is not held */
+ spa_t *spa = dn->dn_objset->os_spa;
+ dsl_pool_t *dp = spa->spa_dsl_pool;
+ if (dp != NULL) {
+ ASSERT(!MUTEX_HELD(&dp->dp_tx.tx_sync_lock));
+ }
+
+ ASSERT(blkid != DMU_BONUS_BLKID);
+ ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
+ ASSERT3U(dn->dn_nlevels, >, level);
+
+ *dbp = NULL;
+
+ /* dbuf_find() returns with db_mtx held */
+ db = dbuf_find(dn->dn_objset, dn->dn_object, level, blkid);
+
+ if (db == NULL) {
+ blkptr_t *bp = NULL;
+ int err;
+
+ if (fail_uncached)
+ return (SET_ERROR(ENOENT));
+
+ ASSERT3P(parent, ==, NULL);
+ err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp);
+ if (fail_sparse) {
+ if (err == 0 && bp && BP_IS_HOLE(bp))
+ err = SET_ERROR(ENOENT);
+ if (err) {
+ if (parent)
+ dbuf_rele(parent, NULL);
+ return (err);
+ }
+ }
+ if (err && err != ENOENT)
+ return (err);
+ db = dbuf_create(dn, level, blkid, parent, bp);
+ }
+
+ if (fail_uncached && db->db_state != DB_CACHED) {
+ mutex_exit(&db->db_mtx);
+ return (SET_ERROR(ENOENT));
+ }
+
+ if (db->db_buf != NULL) {
+ arc_buf_access(db->db_buf);
+ ASSERT3P(db->db.db_data, ==, db->db_buf->b_data);
+ }
+
+ ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf));
+
+ /*
+ * If this buffer is currently syncing out, and we are
+ * still referencing it from db_data, we need to make a copy
+ * of it in case we decide we want to dirty it again in this txg.
+ */
+ if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
+ dn->dn_object != DMU_META_DNODE_OBJECT &&
+ db->db_state == DB_CACHED && db->db_data_pending) {
+ dbuf_dirty_record_t *dr = db->db_data_pending;
+ if (dr->dt.dl.dr_data == db->db_buf)
+ dbuf_hold_copy(dn, db);
+ }
+
+ if (multilist_link_active(&db->db_cache_link)) {
+ ASSERT(zfs_refcount_is_zero(&db->db_holds));
+ ASSERT(db->db_caching_status == DB_DBUF_CACHE ||
+ db->db_caching_status == DB_DBUF_METADATA_CACHE);
+
+ multilist_remove(dbuf_caches[db->db_caching_status].cache, db);
+ (void) zfs_refcount_remove_many(
+ &dbuf_caches[db->db_caching_status].size,
+ db->db.db_size, db);
+
+ if (db->db_caching_status == DB_DBUF_METADATA_CACHE) {
+ DBUF_STAT_BUMPDOWN(metadata_cache_count);
+ } else {
+ DBUF_STAT_BUMPDOWN(cache_levels[db->db_level]);
+ DBUF_STAT_BUMPDOWN(cache_count);
+ DBUF_STAT_DECR(cache_levels_bytes[db->db_level],
+ db->db.db_size);
+ }
+ db->db_caching_status = DB_NO_CACHE;
+ }
+ (void) zfs_refcount_add(&db->db_holds, tag);
+ DBUF_VERIFY(db);
+ mutex_exit(&db->db_mtx);
+
+ /* NOTE: we can't rele the parent until after we drop the db_mtx */
+ if (parent)
+ dbuf_rele(parent, NULL);
+
+ ASSERT3P(DB_DNODE(db), ==, dn);
+ ASSERT3U(db->db_blkid, ==, blkid);
+ ASSERT3U(db->db_level, ==, level);
+ *dbp = db;
+
+ return (0);
+}
+
+dmu_buf_impl_t *
+dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag)
+{
+ return (dbuf_hold_level(dn, 0, blkid, tag));
+}
+
+dmu_buf_impl_t *
+dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag)
+{
+ dmu_buf_impl_t *db;
+ int err = dbuf_hold_impl(dn, level, blkid, FALSE, FALSE, tag, &db);
+ return (err ? NULL : db);
+}
+
+void
+dbuf_create_bonus(dnode_t *dn)
+{
+ ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
+
+ ASSERT(dn->dn_bonus == NULL);
+ dn->dn_bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL);
+}
+
+int
+dbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+
+ if (db->db_blkid != DMU_SPILL_BLKID)
+ return (SET_ERROR(ENOTSUP));
+ if (blksz == 0)
+ blksz = SPA_MINBLOCKSIZE;
+ ASSERT3U(blksz, <=, spa_maxblocksize(dmu_objset_spa(db->db_objset)));
+ blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE);
+
+ dbuf_new_size(db, blksz, tx);
+
+ return (0);
+}
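+
+/*
+ * Example (illustrative sketch, not part of this file): the spill block
+ * size handling above falls back to SPA_MINBLOCKSIZE (512 bytes) for a
+ * zero size and rounds everything else up to the next multiple of it.
+ * The two macros below are local stand-ins assumed to mirror the usual
+ * illumos/OpenZFS definitions.
+ */
+#include <stdio.h>
+#include <stdint.h>
+
+#define	EX_SPA_MINBLOCKSIZE	512ULL
+#define	EX_P2ROUNDUP(x, align)	(-(-(x) & -(align)))
+
+int
+main(void)
+{
+	uint64_t sizes[] = { 0, 1, 512, 1000, 4096 };
+
+	for (int i = 0; i < 5; i++) {
+		uint64_t blksz = sizes[i];
+
+		if (blksz == 0)
+			blksz = EX_SPA_MINBLOCKSIZE;
+		blksz = EX_P2ROUNDUP(blksz, EX_SPA_MINBLOCKSIZE);
+		/* prints 0->512, 1->512, 512->512, 1000->1024, 4096->4096 */
+		printf("%llu -> %llu\n", (unsigned long long)sizes[i],
+		    (unsigned long long)blksz);
+	}
+	return (0);
+}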
+
+void
+dbuf_rm_spill(dnode_t *dn, dmu_tx_t *tx)
+{
+ dbuf_free_range(dn, DMU_SPILL_BLKID, DMU_SPILL_BLKID, tx);
+}
+
+#pragma weak dmu_buf_add_ref = dbuf_add_ref
+void
+dbuf_add_ref(dmu_buf_impl_t *db, void *tag)
+{
+ int64_t holds = zfs_refcount_add(&db->db_holds, tag);
+ VERIFY3S(holds, >, 1);
+}
+
+#pragma weak dmu_buf_try_add_ref = dbuf_try_add_ref
+boolean_t
+dbuf_try_add_ref(dmu_buf_t *db_fake, objset_t *os, uint64_t obj, uint64_t blkid,
+ void *tag)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+ dmu_buf_impl_t *found_db;
+ boolean_t result = B_FALSE;
+
+ if (blkid == DMU_BONUS_BLKID)
+ found_db = dbuf_find_bonus(os, obj);
+ else
+ found_db = dbuf_find(os, obj, 0, blkid);
+
+ if (found_db != NULL) {
+ if (db == found_db && dbuf_refcount(db) > db->db_dirtycnt) {
+ (void) zfs_refcount_add(&db->db_holds, tag);
+ result = B_TRUE;
+ }
+ mutex_exit(&found_db->db_mtx);
+ }
+ return (result);
+}
+
+/*
+ * If you call dbuf_rele() you had better not be referencing the dnode handle
+ * unless you have some other direct or indirect hold on the dnode. (An indirect
+ * hold is a hold on one of the dnode's dbufs, including the bonus buffer.)
+ * Without that, the dbuf_rele() could lead to a dnode_rele() followed by the
+ * dnode's parent dbuf evicting its dnode handles.
+ */
+void
+dbuf_rele(dmu_buf_impl_t *db, void *tag)
+{
+ mutex_enter(&db->db_mtx);
+ dbuf_rele_and_unlock(db, tag, B_FALSE);
+}
+
+void
+dmu_buf_rele(dmu_buf_t *db, void *tag)
+{
+ dbuf_rele((dmu_buf_impl_t *)db, tag);
+}
+
+/*
+ * dbuf_rele() for an already-locked dbuf. This is necessary to allow
+ * db_dirtycnt and db_holds to be updated atomically. The 'evicting'
+ * argument should be set if we are already in the dbuf-evicting code
+ * path, in which case we don't want to recursively evict. This allows us to
+ * avoid deeply nested stacks that would have a call flow similar to this:
+ *
+ * dbuf_rele()-->dbuf_rele_and_unlock()-->dbuf_evict_notify()
+ * ^ |
+ * | |
+ * +-----dbuf_destroy()<--dbuf_evict_one()<--------+
+ *
+ */
+void
+dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag, boolean_t evicting)
+{
+ int64_t holds;
+ uint64_t size;
+
+ ASSERT(MUTEX_HELD(&db->db_mtx));
+ DBUF_VERIFY(db);
+
+ /*
+ * Remove the reference to the dbuf before removing its hold on the
+ * dnode so we can guarantee in dnode_move() that a referenced bonus
+ * buffer has a corresponding dnode hold.
+ */
+ holds = zfs_refcount_remove(&db->db_holds, tag);
+ ASSERT(holds >= 0);
+
+ /*
+ * We can't freeze indirects if there is a possibility that they
+ * may be modified in the current syncing context.
+ */
+ if (db->db_buf != NULL &&
+ holds == (db->db_level == 0 ? db->db_dirtycnt : 0)) {
+ arc_buf_freeze(db->db_buf);
+ }
+
+ if (holds == db->db_dirtycnt &&
+ db->db_level == 0 && db->db_user_immediate_evict)
+ dbuf_evict_user(db);
+
+ if (holds == 0) {
+ if (db->db_blkid == DMU_BONUS_BLKID) {
+ dnode_t *dn;
+ boolean_t evict_dbuf = db->db_pending_evict;
+
+ /*
+ * If the dnode moves here, we cannot cross this
+ * barrier until the move completes.
+ */
+ DB_DNODE_ENTER(db);
+
+ dn = DB_DNODE(db);
+ atomic_dec_32(&dn->dn_dbufs_count);
+
+ /*
+ * Decrementing the dbuf count means that the bonus
+ * buffer's dnode hold is no longer discounted in
+ * dnode_move(). The dnode cannot move until after
+ * the dnode_rele() below.
+ */
+ DB_DNODE_EXIT(db);
+
+ /*
+ * Do not reference db after its lock is dropped.
+ * Another thread may evict it.
+ */
+ mutex_exit(&db->db_mtx);
+
+ if (evict_dbuf)
+ dnode_evict_bonus(dn);
+
+ dnode_rele(dn, db);
+ } else if (db->db_buf == NULL) {
+ /*
+ * This is a special case: we never associated this
+ * dbuf with any data allocated from the ARC.
+ */
+ ASSERT(db->db_state == DB_UNCACHED ||
+ db->db_state == DB_NOFILL);
+ dbuf_destroy(db);
+ } else if (arc_released(db->db_buf)) {
+ /*
+ * This dbuf has anonymous data associated with it.
+ */
+ dbuf_destroy(db);
+ } else {
+ boolean_t do_arc_evict = B_FALSE;
+ blkptr_t bp;
+ spa_t *spa = dmu_objset_spa(db->db_objset);
+
+ if (!DBUF_IS_CACHEABLE(db) &&
+ db->db_blkptr != NULL &&
+ !BP_IS_HOLE(db->db_blkptr) &&
+ !BP_IS_EMBEDDED(db->db_blkptr)) {
+ do_arc_evict = B_TRUE;
+ bp = *db->db_blkptr;
+ }
+
+ if (!DBUF_IS_CACHEABLE(db) ||
+ db->db_pending_evict) {
+ dbuf_destroy(db);
+ } else if (!multilist_link_active(&db->db_cache_link)) {
+ ASSERT3U(db->db_caching_status, ==,
+ DB_NO_CACHE);
+
+ dbuf_cached_state_t dcs =
+ dbuf_include_in_metadata_cache(db) ?
+ DB_DBUF_METADATA_CACHE : DB_DBUF_CACHE;
+ db->db_caching_status = dcs;
+
+ multilist_insert(dbuf_caches[dcs].cache, db);
+ size = zfs_refcount_add_many(
+ &dbuf_caches[dcs].size,
+ db->db.db_size, db);
+
+ if (dcs == DB_DBUF_METADATA_CACHE) {
+ DBUF_STAT_BUMP(metadata_cache_count);
+ DBUF_STAT_MAX(
+ metadata_cache_size_bytes_max,
+ size);
+ } else {
+ DBUF_STAT_BUMP(
+ cache_levels[db->db_level]);
+ DBUF_STAT_BUMP(cache_count);
+ DBUF_STAT_INCR(
+ cache_levels_bytes[db->db_level],
+ db->db.db_size);
+ DBUF_STAT_MAX(cache_size_bytes_max,
+ size);
+ }
+ mutex_exit(&db->db_mtx);
+
+ if (dcs == DB_DBUF_CACHE && !evicting)
+ dbuf_evict_notify(size);
+ }
+
+ if (do_arc_evict)
+ arc_freed(spa, &bp);
+ }
+ } else {
+ mutex_exit(&db->db_mtx);
+ }
+}
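+
+/*
+ * Example (illustrative sketch, not part of this file): a tiny user-space
+ * model of the 'evicting' guard described above.  Dropping the last hold
+ * may trigger eviction, and eviction itself drops a hold; passing
+ * evicting=true on that inner release keeps the call chain in the ASCII
+ * diagram from recursing.  All names below are hypothetical stand-ins.
+ */
+#include <stdbool.h>
+#include <stdio.h>
+
+static int nbufs = 3;			/* pretend dbuf cache population */
+
+static void release_hold(bool evicting);
+
+static void
+evict_one(void)
+{
+	nbufs--;
+	printf("evicted one buffer, %d left\n", nbufs);
+	/* destroying the evicted buffer drops its last hold ... */
+	release_hold(true);		/* ... but must not evict again */
+}
+
+static void
+release_hold(bool evicting)
+{
+	/* only kick off eviction when not already on the eviction path */
+	if (!evicting && nbufs > 0)
+		evict_one();
+}
+
+int
+main(void)
+{
+	release_hold(false);	/* evicts exactly one buffer, no recursion */
+	return (0);
+}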
+
+#pragma weak dmu_buf_refcount = dbuf_refcount
+uint64_t
+dbuf_refcount(dmu_buf_impl_t *db)
+{
+ return (zfs_refcount_count(&db->db_holds));
+}
+
+uint64_t
+dmu_buf_user_refcount(dmu_buf_t *db_fake)
+{
+ uint64_t holds;
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+
+ mutex_enter(&db->db_mtx);
+ ASSERT3U(zfs_refcount_count(&db->db_holds), >=, db->db_dirtycnt);
+ holds = zfs_refcount_count(&db->db_holds) - db->db_dirtycnt;
+ mutex_exit(&db->db_mtx);
+
+ return (holds);
+}
+
+void *
+dmu_buf_replace_user(dmu_buf_t *db_fake, dmu_buf_user_t *old_user,
+ dmu_buf_user_t *new_user)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+
+ mutex_enter(&db->db_mtx);
+ dbuf_verify_user(db, DBVU_NOT_EVICTING);
+ if (db->db_user == old_user)
+ db->db_user = new_user;
+ else
+ old_user = db->db_user;
+ dbuf_verify_user(db, DBVU_NOT_EVICTING);
+ mutex_exit(&db->db_mtx);
+
+ return (old_user);
+}
+
+void *
+dmu_buf_set_user(dmu_buf_t *db_fake, dmu_buf_user_t *user)
+{
+ return (dmu_buf_replace_user(db_fake, NULL, user));
+}
+
+void *
+dmu_buf_set_user_ie(dmu_buf_t *db_fake, dmu_buf_user_t *user)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+
+ db->db_user_immediate_evict = TRUE;
+ return (dmu_buf_set_user(db_fake, user));
+}
+
+void *
+dmu_buf_remove_user(dmu_buf_t *db_fake, dmu_buf_user_t *user)
+{
+ return (dmu_buf_replace_user(db_fake, user, NULL));
+}
+
+void *
+dmu_buf_get_user(dmu_buf_t *db_fake)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+
+ dbuf_verify_user(db, DBVU_NOT_EVICTING);
+ return (db->db_user);
+}
+
+void
+dmu_buf_user_evict_wait()
+{
+ taskq_wait(dbu_evict_taskq);
+}
+
+blkptr_t *
+dmu_buf_get_blkptr(dmu_buf_t *db)
+{
+ dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
+ return (dbi->db_blkptr);
+}
+
+objset_t *
+dmu_buf_get_objset(dmu_buf_t *db)
+{
+ dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
+ return (dbi->db_objset);
+}
+
+dnode_t *
+dmu_buf_dnode_enter(dmu_buf_t *db)
+{
+ dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
+ DB_DNODE_ENTER(dbi);
+ return (DB_DNODE(dbi));
+}
+
+void
+dmu_buf_dnode_exit(dmu_buf_t *db)
+{
+ dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
+ DB_DNODE_EXIT(dbi);
+}
+
+static void
+dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db)
+{
+	/* ASSERT(dmu_tx_is_syncing(tx)) */
+ ASSERT(MUTEX_HELD(&db->db_mtx));
+
+ if (db->db_blkptr != NULL)
+ return;
+
+ if (db->db_blkid == DMU_SPILL_BLKID) {
+ db->db_blkptr = DN_SPILL_BLKPTR(dn->dn_phys);
+ BP_ZERO(db->db_blkptr);
+ return;
+ }
+ if (db->db_level == dn->dn_phys->dn_nlevels-1) {
+ /*
+		 * This buffer was allocated at a time when there were
+		 * no blkptrs available from the dnode, or it was
+ * inappropriate to hook it in (i.e., nlevels mismatch).
+ */
+ ASSERT(db->db_blkid < dn->dn_phys->dn_nblkptr);
+ ASSERT(db->db_parent == NULL);
+ db->db_parent = dn->dn_dbuf;
+ db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid];
+ DBUF_VERIFY(db);
+ } else {
+ dmu_buf_impl_t *parent = db->db_parent;
+ int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
+
+ ASSERT(dn->dn_phys->dn_nlevels > 1);
+ if (parent == NULL) {
+ mutex_exit(&db->db_mtx);
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ parent = dbuf_hold_level(dn, db->db_level + 1,
+ db->db_blkid >> epbs, db);
+ rw_exit(&dn->dn_struct_rwlock);
+ mutex_enter(&db->db_mtx);
+ db->db_parent = parent;
+ }
+ db->db_blkptr = (blkptr_t *)parent->db.db_data +
+ (db->db_blkid & ((1ULL << epbs) - 1));
+ DBUF_VERIFY(db);
+ }
+}
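+
+/*
+ * Example (illustrative sketch, not part of this file): the epbs shift
+ * arithmetic used above to locate a child within its parent indirect
+ * block.  SPA_BLKPTRSHIFT is 7 (128-byte block pointers); the 128K
+ * indirect block size (indblkshift == 17) is only the typical value and
+ * is assumed here for the worked numbers.
+ */
+#include <stdio.h>
+#include <stdint.h>
+
+int
+main(void)
+{
+	int indblkshift = 17;		/* 128K indirect blocks (typical) */
+	int spa_blkptrshift = 7;	/* sizeof (blkptr_t) == 128 */
+	int epbs = indblkshift - spa_blkptrshift;
+
+	uint64_t blkid = 123456;	/* arbitrary level-0 block id */
+	uint64_t parent_blkid = blkid >> epbs;
+	uint64_t slot = blkid & ((1ULL << epbs) - 1);
+
+	/* prints: parent blkid 120, slot 576 of 1024 */
+	printf("parent blkid %llu, slot %llu of %llu\n",
+	    (unsigned long long)parent_blkid, (unsigned long long)slot,
+	    (unsigned long long)(1ULL << epbs));
+	return (0);
+}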
+
+static void
+dbuf_sync_bonus(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
+{
+ dmu_buf_impl_t *db = dr->dr_dbuf;
+ void *data = dr->dt.dl.dr_data;
+
+ ASSERT0(db->db_level);
+ ASSERT(MUTEX_HELD(&db->db_mtx));
+ ASSERT(db->db_blkid == DMU_BONUS_BLKID);
+ ASSERT(data != NULL);
+
+ dnode_t *dn = dr->dr_dnode;
+ ASSERT3U(DN_MAX_BONUS_LEN(dn->dn_phys), <=,
+ DN_SLOTS_TO_BONUSLEN(dn->dn_phys->dn_extra_slots + 1));
+ bcopy(data, DN_BONUS(dn->dn_phys), DN_MAX_BONUS_LEN(dn->dn_phys));
+
+ dbuf_sync_leaf_verify_bonus_dnode(dr);
+
+ dbuf_undirty_bonus(dr);
+ dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg, B_FALSE);
+}
+
+/*
+ * When syncing out a block of dnodes, adjust the block to deal with
+ * encryption. Normally, we make sure the block is decrypted before writing
+ * it. If we have crypt params, then we are writing a raw (encrypted) block,
+ * from a raw receive. In this case, set the ARC buf's crypt params so
+ * that the BP will be filled with the correct byteorder, salt, iv, and mac.
+ */
+static void
+dbuf_prepare_encrypted_dnode_leaf(dbuf_dirty_record_t *dr)
+{
+ int err;
+ dmu_buf_impl_t *db = dr->dr_dbuf;
+
+ ASSERT(MUTEX_HELD(&db->db_mtx));
+ ASSERT3U(db->db.db_object, ==, DMU_META_DNODE_OBJECT);
+ ASSERT3U(db->db_level, ==, 0);
+
+ if (!db->db_objset->os_raw_receive && arc_is_encrypted(db->db_buf)) {
+ zbookmark_phys_t zb;
+
+ /*
+ * Unfortunately, there is currently no mechanism for
+ * syncing context to handle decryption errors. An error
+ * here is only possible if an attacker maliciously
+ * changed a dnode block and updated the associated
+ * checksums going up the block tree.
+ */
+ SET_BOOKMARK(&zb, dmu_objset_id(db->db_objset),
+ db->db.db_object, db->db_level, db->db_blkid);
+ err = arc_untransform(db->db_buf, db->db_objset->os_spa,
+ &zb, B_TRUE);
+ if (err)
+ panic("Invalid dnode block MAC");
+ } else if (dr->dt.dl.dr_has_raw_params) {
+ (void) arc_release(dr->dt.dl.dr_data, db);
+ arc_convert_to_raw(dr->dt.dl.dr_data,
+ dmu_objset_id(db->db_objset),
+ dr->dt.dl.dr_byteorder, DMU_OT_DNODE,
+ dr->dt.dl.dr_salt, dr->dt.dl.dr_iv, dr->dt.dl.dr_mac);
+ }
+}
+
+/*
+ * dbuf_sync_indirect() is called recursively from dbuf_sync_list() so it
+ * is critical that we not allow the compiler to inline this function into
+ * dbuf_sync_list(), which would drastically bloat the stack usage.
+ */
+noinline static void
+dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
+{
+ dmu_buf_impl_t *db = dr->dr_dbuf;
+ dnode_t *dn = dr->dr_dnode;
+
+ ASSERT(dmu_tx_is_syncing(tx));
+
+ dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);
+
+ mutex_enter(&db->db_mtx);
+
+ ASSERT(db->db_level > 0);
+ DBUF_VERIFY(db);
+
+ /* Read the block if it hasn't been read yet. */
+ if (db->db_buf == NULL) {
+ mutex_exit(&db->db_mtx);
+ (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
+ mutex_enter(&db->db_mtx);
+ }
+ ASSERT3U(db->db_state, ==, DB_CACHED);
+ ASSERT(db->db_buf != NULL);
+
+ /* Indirect block size must match what the dnode thinks it is. */
+ ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
+ dbuf_check_blkptr(dn, db);
+
+ /* Provide the pending dirty record to child dbufs */
+ db->db_data_pending = dr;
+
+ mutex_exit(&db->db_mtx);
+
+ dbuf_write(dr, db->db_buf, tx);
+
+ zio_t *zio = dr->dr_zio;
+ mutex_enter(&dr->dt.di.dr_mtx);
+ dbuf_sync_list(&dr->dt.di.dr_children, db->db_level - 1, tx);
+ ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
+ mutex_exit(&dr->dt.di.dr_mtx);
+ zio_nowait(zio);
+}
+
+/*
+ * Verify that the size of the data in our bonus buffer does not exceed
+ * its recorded size.
+ *
+ * The purpose of this verification is to catch any cases in development
+ * where the size of a phys structure (e.g., space_map_phys_t) grows and,
+ * due to incorrect feature management, older pools expect to read more
+ * data even though they didn't actually write it to begin with.
+ *
+ * For example, this would catch an error in the feature logic where we
+ * open an older pool and we expect to write the space map histogram of
+ * a space map with size SPACE_MAP_SIZE_V0.
+ */
+static void
+dbuf_sync_leaf_verify_bonus_dnode(dbuf_dirty_record_t *dr)
+{
+#ifdef ZFS_DEBUG
+ dnode_t *dn = dr->dr_dnode;
+
+ /*
+ * Encrypted bonus buffers can have data past their bonuslen.
+ * Skip the verification of these blocks.
+ */
+ if (DMU_OT_IS_ENCRYPTED(dn->dn_bonustype))
+ return;
+
+ uint16_t bonuslen = dn->dn_phys->dn_bonuslen;
+ uint16_t maxbonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots);
+ ASSERT3U(bonuslen, <=, maxbonuslen);
+
+ arc_buf_t *datap = dr->dt.dl.dr_data;
+ char *datap_end = ((char *)datap) + bonuslen;
+ char *datap_max = ((char *)datap) + maxbonuslen;
+
+ /* ensure that everything is zero after our data */
+ for (; datap_end < datap_max; datap_end++)
+ ASSERT(*datap_end == 0);
+#endif
+}
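+
+/*
+ * Example (illustrative sketch, not part of this file): the debug check
+ * above only insists that every byte between the recorded bonus length
+ * and the maximum bonus length is zero.  The 320-byte maximum used below
+ * is the single-slot dnode case and is assumed for illustration.
+ */
+#include <assert.h>
+#include <string.h>
+
+static void
+verify_bonus_tail(const char *buf, int bonuslen, int maxbonuslen)
+{
+	for (int i = bonuslen; i < maxbonuslen; i++)
+		assert(buf[i] == 0);	/* nothing written past bonuslen */
+}
+
+int
+main(void)
+{
+	char bonus[320];
+
+	memset(bonus, 0, sizeof (bonus));
+	memset(bonus, 0xab, 200);	/* 200 bytes of real bonus data */
+	verify_bonus_tail(bonus, 200, 320);
+	return (0);
+}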
+
+static blkptr_t *
+dbuf_lightweight_bp(dbuf_dirty_record_t *dr)
+{
+ /* This must be a lightweight dirty record. */
+ ASSERT3P(dr->dr_dbuf, ==, NULL);
+ dnode_t *dn = dr->dr_dnode;
+
+ if (dn->dn_phys->dn_nlevels == 1) {
+ VERIFY3U(dr->dt.dll.dr_blkid, <, dn->dn_phys->dn_nblkptr);
+ return (&dn->dn_phys->dn_blkptr[dr->dt.dll.dr_blkid]);
+ } else {
+ dmu_buf_impl_t *parent_db = dr->dr_parent->dr_dbuf;
+ int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+ VERIFY3U(parent_db->db_level, ==, 1);
+ VERIFY3P(parent_db->db_dnode_handle->dnh_dnode, ==, dn);
+ VERIFY3U(dr->dt.dll.dr_blkid >> epbs, ==, parent_db->db_blkid);
+ blkptr_t *bp = parent_db->db.db_data;
+ return (&bp[dr->dt.dll.dr_blkid & ((1 << epbs) - 1)]);
+ }
+}
+
+static void
+dbuf_lightweight_ready(zio_t *zio)
+{
+ dbuf_dirty_record_t *dr = zio->io_private;
+ blkptr_t *bp = zio->io_bp;
+
+ if (zio->io_error != 0)
+ return;
+
+ dnode_t *dn = dr->dr_dnode;
+
+ blkptr_t *bp_orig = dbuf_lightweight_bp(dr);
+ spa_t *spa = dmu_objset_spa(dn->dn_objset);
+ int64_t delta = bp_get_dsize_sync(spa, bp) -
+ bp_get_dsize_sync(spa, bp_orig);
+ dnode_diduse_space(dn, delta);
+
+ uint64_t blkid = dr->dt.dll.dr_blkid;
+ mutex_enter(&dn->dn_mtx);
+ if (blkid > dn->dn_phys->dn_maxblkid) {
+ ASSERT0(dn->dn_objset->os_raw_receive);
+ dn->dn_phys->dn_maxblkid = blkid;
+ }
+ mutex_exit(&dn->dn_mtx);
+
+ if (!BP_IS_EMBEDDED(bp)) {
+ uint64_t fill = BP_IS_HOLE(bp) ? 0 : 1;
+ BP_SET_FILL(bp, fill);
+ }
+
+ dmu_buf_impl_t *parent_db;
+ EQUIV(dr->dr_parent == NULL, dn->dn_phys->dn_nlevels == 1);
+ if (dr->dr_parent == NULL) {
+ parent_db = dn->dn_dbuf;
+ } else {
+ parent_db = dr->dr_parent->dr_dbuf;
+ }
+ rw_enter(&parent_db->db_rwlock, RW_WRITER);
+ *bp_orig = *bp;
+ rw_exit(&parent_db->db_rwlock);
+}
+
+static void
+dbuf_lightweight_physdone(zio_t *zio)
+{
+ dbuf_dirty_record_t *dr = zio->io_private;
+ dsl_pool_t *dp = spa_get_dsl(zio->io_spa);
+ ASSERT3U(dr->dr_txg, ==, zio->io_txg);
+
+ /*
+ * The callback will be called io_phys_children times. Retire one
+ * portion of our dirty space each time we are called. Any rounding
+ * error will be cleaned up by dbuf_lightweight_done().
+ */
+ int delta = dr->dr_accounted / zio->io_phys_children;
+ dsl_pool_undirty_space(dp, delta, zio->io_txg);
+}
+
+static void
+dbuf_lightweight_done(zio_t *zio)
+{
+ dbuf_dirty_record_t *dr = zio->io_private;
+
+ VERIFY0(zio->io_error);
+
+ objset_t *os = dr->dr_dnode->dn_objset;
+ dmu_tx_t *tx = os->os_synctx;
+
+ if (zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)) {
+ ASSERT(BP_EQUAL(zio->io_bp, &zio->io_bp_orig));
+ } else {
+ dsl_dataset_t *ds = os->os_dsl_dataset;
+ (void) dsl_dataset_block_kill(ds, &zio->io_bp_orig, tx, B_TRUE);
+ dsl_dataset_block_born(ds, zio->io_bp, tx);
+ }
+
+ /*
+ * See comment in dbuf_write_done().
+ */
+ if (zio->io_phys_children == 0) {
+ dsl_pool_undirty_space(dmu_objset_pool(os),
+ dr->dr_accounted, zio->io_txg);
+ } else {
+ dsl_pool_undirty_space(dmu_objset_pool(os),
+ dr->dr_accounted % zio->io_phys_children, zio->io_txg);
+ }
+
+ abd_free(dr->dt.dll.dr_abd);
+ kmem_free(dr, sizeof (*dr));
+}
+
+noinline static void
+dbuf_sync_lightweight(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
+{
+ dnode_t *dn = dr->dr_dnode;
+ zio_t *pio;
+ if (dn->dn_phys->dn_nlevels == 1) {
+ pio = dn->dn_zio;
+ } else {
+ pio = dr->dr_parent->dr_zio;
+ }
+
+ zbookmark_phys_t zb = {
+ .zb_objset = dmu_objset_id(dn->dn_objset),
+ .zb_object = dn->dn_object,
+ .zb_level = 0,
+ .zb_blkid = dr->dt.dll.dr_blkid,
+ };
+
+ /*
+ * See comment in dbuf_write(). This is so that zio->io_bp_orig
+ * will have the old BP in dbuf_lightweight_done().
+ */
+ dr->dr_bp_copy = *dbuf_lightweight_bp(dr);
+
+ dr->dr_zio = zio_write(pio, dmu_objset_spa(dn->dn_objset),
+ dmu_tx_get_txg(tx), &dr->dr_bp_copy, dr->dt.dll.dr_abd,
+ dn->dn_datablksz, abd_get_size(dr->dt.dll.dr_abd),
+ &dr->dt.dll.dr_props, dbuf_lightweight_ready, NULL,
+ dbuf_lightweight_physdone, dbuf_lightweight_done, dr,
+ ZIO_PRIORITY_ASYNC_WRITE,
+ ZIO_FLAG_MUSTSUCCEED | dr->dt.dll.dr_flags, &zb);
+
+ zio_nowait(dr->dr_zio);
+}
+
+/*
+ * dbuf_sync_leaf() is called recursively from dbuf_sync_list() so it is
+ * critical that we not allow the compiler to inline this function into
+ * dbuf_sync_list(), which would drastically bloat the stack usage.
+ */
+noinline static void
+dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
+{
+ arc_buf_t **datap = &dr->dt.dl.dr_data;
+ dmu_buf_impl_t *db = dr->dr_dbuf;
+ dnode_t *dn = dr->dr_dnode;
+ objset_t *os;
+ uint64_t txg = tx->tx_txg;
+
+ ASSERT(dmu_tx_is_syncing(tx));
+
+ dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);
+
+ mutex_enter(&db->db_mtx);
+ /*
+	 * To be synced, the dbuf must be dirty. But it
+	 * might have been freed after it was dirtied.
+ */
+ if (db->db_state == DB_UNCACHED) {
+ /* This buffer has been freed since it was dirtied */
+ ASSERT(db->db.db_data == NULL);
+ } else if (db->db_state == DB_FILL) {
+ /* This buffer was freed and is now being re-filled */
+ ASSERT(db->db.db_data != dr->dt.dl.dr_data);
+ } else {
+ ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL);
+ }
+ DBUF_VERIFY(db);
+
+ if (db->db_blkid == DMU_SPILL_BLKID) {
+ mutex_enter(&dn->dn_mtx);
+ if (!(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) {
+ /*
+ * In the previous transaction group, the bonus buffer
+ * was entirely used to store the attributes for the
+ * dnode which overrode the dn_spill field. However,
+ * when adding more attributes to the file a spill
+ * block was required to hold the extra attributes.
+ *
+ * Make sure to clear the garbage left in the dn_spill
+ * field from the previous attributes in the bonus
+ * buffer. Otherwise, after writing out the spill
+			 * block to the newly allocated dva, it will free
+ * the old block pointed to by the invalid dn_spill.
+ */
+ db->db_blkptr = NULL;
+ }
+ dn->dn_phys->dn_flags |= DNODE_FLAG_SPILL_BLKPTR;
+ mutex_exit(&dn->dn_mtx);
+ }
+
+ /*
+ * If this is a bonus buffer, simply copy the bonus data into the
+ * dnode. It will be written out when the dnode is synced (and it
+ * will be synced, since it must have been dirty for dbuf_sync to
+ * be called).
+ */
+ if (db->db_blkid == DMU_BONUS_BLKID) {
+ ASSERT(dr->dr_dbuf == db);
+ dbuf_sync_bonus(dr, tx);
+ return;
+ }
+
+ os = dn->dn_objset;
+
+ /*
+ * This function may have dropped the db_mtx lock allowing a dmu_sync
+ * operation to sneak in. As a result, we need to ensure that we
+ * don't check the dr_override_state until we have returned from
+ * dbuf_check_blkptr.
+ */
+ dbuf_check_blkptr(dn, db);
+
+ /*
+ * If this buffer is in the middle of an immediate write,
+ * wait for the synchronous IO to complete.
+ */
+ while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) {
+ ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
+ cv_wait(&db->db_changed, &db->db_mtx);
+ ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN);
+ }
+
+ /*
+ * If this is a dnode block, ensure it is appropriately encrypted
+ * or decrypted, depending on what we are writing to it this txg.
+ */
+ if (os->os_encrypted && dn->dn_object == DMU_META_DNODE_OBJECT)
+ dbuf_prepare_encrypted_dnode_leaf(dr);
+
+ if (db->db_state != DB_NOFILL &&
+ dn->dn_object != DMU_META_DNODE_OBJECT &&
+ zfs_refcount_count(&db->db_holds) > 1 &&
+ dr->dt.dl.dr_override_state != DR_OVERRIDDEN &&
+ *datap == db->db_buf) {
+ /*
+ * If this buffer is currently "in use" (i.e., there
+ * are active holds and db_data still references it),
+ * then make a copy before we start the write so that
+ * any modifications from the open txg will not leak
+ * into this write.
+ *
+ * NOTE: this copy does not need to be made for
+ * objects only modified in the syncing context (e.g.
+		 * DMU_OT_DNODE blocks).
+ */
+ *datap = dbuf_alloc_arcbuf_from_arcbuf(db, db->db_buf);
+ bcopy(db->db.db_data, (*datap)->b_data, arc_buf_size(*datap));
+ }
+ db->db_data_pending = dr;
+
+ mutex_exit(&db->db_mtx);
+
+ dbuf_write(dr, *datap, tx);
+
+ ASSERT(!list_link_active(&dr->dr_dirty_node));
+ if (dn->dn_object == DMU_META_DNODE_OBJECT) {
+ list_insert_tail(&dn->dn_dirty_records[txg & TXG_MASK], dr);
+ } else {
+ zio_nowait(dr->dr_zio);
+ }
+}
+
+void
+dbuf_sync_list(list_t *list, int level, dmu_tx_t *tx)
+{
+ dbuf_dirty_record_t *dr;
+
+ while ((dr = list_head(list))) {
+ if (dr->dr_zio != NULL) {
+ /*
+ * If we find an already initialized zio then we
+ * are processing the meta-dnode, and we have finished.
+ * The dbufs for all dnodes are put back on the list
+ * during processing, so that we can zio_wait()
+ * these IOs after initiating all child IOs.
+ */
+ ASSERT3U(dr->dr_dbuf->db.db_object, ==,
+ DMU_META_DNODE_OBJECT);
+ break;
+ }
+ list_remove(list, dr);
+ if (dr->dr_dbuf == NULL) {
+ dbuf_sync_lightweight(dr, tx);
+ } else {
+ if (dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID &&
+ dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) {
+ VERIFY3U(dr->dr_dbuf->db_level, ==, level);
+ }
+ if (dr->dr_dbuf->db_level > 0)
+ dbuf_sync_indirect(dr, tx);
+ else
+ dbuf_sync_leaf(dr, tx);
+ }
+ }
+}
+
+/* ARGSUSED */
+static void
+dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
+{
+ dmu_buf_impl_t *db = vdb;
+ dnode_t *dn;
+ blkptr_t *bp = zio->io_bp;
+ blkptr_t *bp_orig = &zio->io_bp_orig;
+ spa_t *spa = zio->io_spa;
+ int64_t delta;
+ uint64_t fill = 0;
+ int i;
+
+ ASSERT3P(db->db_blkptr, !=, NULL);
+ ASSERT3P(&db->db_data_pending->dr_bp_copy, ==, bp);
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ delta = bp_get_dsize_sync(spa, bp) - bp_get_dsize_sync(spa, bp_orig);
+ dnode_diduse_space(dn, delta - zio->io_prev_space_delta);
+ zio->io_prev_space_delta = delta;
+
+ if (bp->blk_birth != 0) {
+ ASSERT((db->db_blkid != DMU_SPILL_BLKID &&
+ BP_GET_TYPE(bp) == dn->dn_type) ||
+ (db->db_blkid == DMU_SPILL_BLKID &&
+ BP_GET_TYPE(bp) == dn->dn_bonustype) ||
+ BP_IS_EMBEDDED(bp));
+ ASSERT(BP_GET_LEVEL(bp) == db->db_level);
+ }
+
+ mutex_enter(&db->db_mtx);
+
+#ifdef ZFS_DEBUG
+ if (db->db_blkid == DMU_SPILL_BLKID) {
+ ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
+ ASSERT(!(BP_IS_HOLE(bp)) &&
+ db->db_blkptr == DN_SPILL_BLKPTR(dn->dn_phys));
+ }
+#endif
+
+ if (db->db_level == 0) {
+ mutex_enter(&dn->dn_mtx);
+ if (db->db_blkid > dn->dn_phys->dn_maxblkid &&
+ db->db_blkid != DMU_SPILL_BLKID) {
+ ASSERT0(db->db_objset->os_raw_receive);
+ dn->dn_phys->dn_maxblkid = db->db_blkid;
+ }
+ mutex_exit(&dn->dn_mtx);
+
+ if (dn->dn_type == DMU_OT_DNODE) {
+ i = 0;
+ while (i < db->db.db_size) {
+ dnode_phys_t *dnp =
+ (void *)(((char *)db->db.db_data) + i);
+
+ i += DNODE_MIN_SIZE;
+ if (dnp->dn_type != DMU_OT_NONE) {
+ fill++;
+ i += dnp->dn_extra_slots *
+ DNODE_MIN_SIZE;
+ }
+ }
+ } else {
+ if (BP_IS_HOLE(bp)) {
+ fill = 0;
+ } else {
+ fill = 1;
+ }
+ }
+ } else {
+ blkptr_t *ibp = db->db.db_data;
+ ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
+ for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) {
+ if (BP_IS_HOLE(ibp))
+ continue;
+ fill += BP_GET_FILL(ibp);
+ }
+ }
+ DB_DNODE_EXIT(db);
+
+ if (!BP_IS_EMBEDDED(bp))
+ BP_SET_FILL(bp, fill);
+
+ mutex_exit(&db->db_mtx);
+
+ db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_WRITER, FTAG);
+ *db->db_blkptr = *bp;
+ dmu_buf_unlock_parent(db, dblt, FTAG);
+}
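+
+/*
+ * Example (illustrative sketch, not part of this file): how the fill
+ * count above is derived for a block of dnodes.  Dnodes are walked in
+ * DNODE_MIN_SIZE (512-byte) slots; an allocated dnode counts once and
+ * its dn_extra_slots spill-over slots are skipped.  The toy block
+ * layout and type values below are made up.
+ */
+#include <stdio.h>
+
+struct toy_dnode {
+	int dn_type;		/* 0 plays the role of DMU_OT_NONE */
+	int dn_extra_slots;
+};
+
+int
+main(void)
+{
+	/* a 4K dnode block holds eight 512-byte slots */
+	struct toy_dnode slots[8] = {
+		{ 1, 0 },	/* allocated, one slot */
+		{ 0, 0 },	/* free slot */
+		{ 2, 1 },	/* allocated, spans two slots */
+		{ 0, 0 },	/* covered by the extra slot above */
+		{ 0, 0 },
+		{ 3, 0 },	/* allocated */
+		{ 0, 0 },
+		{ 0, 0 },
+	};
+	int fill = 0;
+
+	for (int i = 0; i < 8; ) {
+		struct toy_dnode *dnp = &slots[i];
+
+		i += 1;				/* one DNODE_MIN_SIZE step */
+		if (dnp->dn_type != 0) {
+			fill++;
+			i += dnp->dn_extra_slots;	/* skip extra slots */
+		}
+	}
+	printf("fill = %d\n", fill);	/* prints fill = 3 */
+	return (0);
+}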
+
+/* ARGSUSED */
+/*
+ * This function gets called just prior to running through the compression
+ * stage of the zio pipeline. If we're an indirect block comprised of only
+ * holes, then we want this indirect to be compressed away to a hole. In
+ * order to do that we must zero out any information about the holes that
+ * this indirect points to before we try to compress it.
+ */
+static void
+dbuf_write_children_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
+{
+ dmu_buf_impl_t *db = vdb;
+ dnode_t *dn;
+ blkptr_t *bp;
+ unsigned int epbs, i;
+
+ ASSERT3U(db->db_level, >, 0);
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
+ ASSERT3U(epbs, <, 31);
+
+ /* Determine if all our children are holes */
+ for (i = 0, bp = db->db.db_data; i < 1ULL << epbs; i++, bp++) {
+ if (!BP_IS_HOLE(bp))
+ break;
+ }
+
+ /*
+ * If all the children are holes, then zero them all out so that
+ * we may get compressed away.
+ */
+ if (i == 1ULL << epbs) {
+ /*
+ * We only found holes. Grab the rwlock to prevent
+ * anybody from reading the blocks we're about to
+ * zero out.
+ */
+ rw_enter(&db->db_rwlock, RW_WRITER);
+ bzero(db->db.db_data, db->db.db_size);
+ rw_exit(&db->db_rwlock);
+ }
+ DB_DNODE_EXIT(db);
+}
+
+/*
+ * The SPA will call this callback several times for each zio - once
+ * for every physical child i/o (zio->io_phys_children times). This
+ * allows the DMU to monitor the progress of each logical i/o. For example,
+ * there may be 2 copies of an indirect block, or many fragments of a RAID-Z
+ * block. There may be a long delay before all copies/fragments are completed,
+ * so this callback allows us to retire dirty space gradually, as the physical
+ * i/os complete.
+ */
+/* ARGSUSED */
+static void
+dbuf_write_physdone(zio_t *zio, arc_buf_t *buf, void *arg)
+{
+ dmu_buf_impl_t *db = arg;
+ objset_t *os = db->db_objset;
+ dsl_pool_t *dp = dmu_objset_pool(os);
+ dbuf_dirty_record_t *dr;
+ int delta = 0;
+
+ dr = db->db_data_pending;
+ ASSERT3U(dr->dr_txg, ==, zio->io_txg);
+
+ /*
+ * The callback will be called io_phys_children times. Retire one
+ * portion of our dirty space each time we are called. Any rounding
+ * error will be cleaned up by dbuf_write_done().
+ */
+ delta = dr->dr_accounted / zio->io_phys_children;
+ dsl_pool_undirty_space(dp, delta, zio->io_txg);
+}
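+
+/*
+ * Example (illustrative sketch, not part of this file): the accounting
+ * split described above.  Every physical child retires
+ * dr_accounted / io_phys_children bytes, and the integer-division
+ * remainder is retired once by the done callback, so the pieces always
+ * add back up to dr_accounted.  The numbers below are made up.
+ */
+#include <stdio.h>
+#include <stdint.h>
+
+int
+main(void)
+{
+	uint64_t accounted = 131077;	/* deliberately not divisible */
+	int phys_children = 3;
+	uint64_t retired = 0;
+
+	for (int i = 0; i < phys_children; i++)
+		retired += accounted / phys_children;	/* physdone calls */
+	retired += accounted % phys_children;		/* done callback */
+
+	/* prints: retired 131077 of 131077 */
+	printf("retired %llu of %llu\n", (unsigned long long)retired,
+	    (unsigned long long)accounted);
+	return (0);
+}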
+
+/* ARGSUSED */
+static void
+dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
+{
+ dmu_buf_impl_t *db = vdb;
+ blkptr_t *bp_orig = &zio->io_bp_orig;
+ blkptr_t *bp = db->db_blkptr;
+ objset_t *os = db->db_objset;
+ dmu_tx_t *tx = os->os_synctx;
+
+ ASSERT0(zio->io_error);
+ ASSERT(db->db_blkptr == bp);
+
+ /*
+ * For nopwrites and rewrites we ensure that the bp matches our
+ * original and bypass all the accounting.
+ */
+ if (zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)) {
+ ASSERT(BP_EQUAL(bp, bp_orig));
+ } else {
+ dsl_dataset_t *ds = os->os_dsl_dataset;
+ (void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE);
+ dsl_dataset_block_born(ds, bp, tx);
+ }
+
+ mutex_enter(&db->db_mtx);
+
+ DBUF_VERIFY(db);
+
+ dbuf_dirty_record_t *dr = db->db_data_pending;
+ dnode_t *dn = dr->dr_dnode;
+ ASSERT(!list_link_active(&dr->dr_dirty_node));
+ ASSERT(dr->dr_dbuf == db);
+ ASSERT(list_next(&db->db_dirty_records, dr) == NULL);
+ list_remove(&db->db_dirty_records, dr);
+
+#ifdef ZFS_DEBUG
+ if (db->db_blkid == DMU_SPILL_BLKID) {
+ ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
+ ASSERT(!(BP_IS_HOLE(db->db_blkptr)) &&
+ db->db_blkptr == DN_SPILL_BLKPTR(dn->dn_phys));
+ }
+#endif
+
+ if (db->db_level == 0) {
+ ASSERT(db->db_blkid != DMU_BONUS_BLKID);
+ ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
+ if (db->db_state != DB_NOFILL) {
+ if (dr->dt.dl.dr_data != db->db_buf)
+ arc_buf_destroy(dr->dt.dl.dr_data, db);
+ }
+ } else {
+ ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
+ ASSERT3U(db->db.db_size, ==, 1 << dn->dn_phys->dn_indblkshift);
+ if (!BP_IS_HOLE(db->db_blkptr)) {
+ int epbs __maybe_unused = dn->dn_phys->dn_indblkshift -
+ SPA_BLKPTRSHIFT;
+ ASSERT3U(db->db_blkid, <=,
+ dn->dn_phys->dn_maxblkid >> (db->db_level * epbs));
+ ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
+ db->db.db_size);
+ }
+ mutex_destroy(&dr->dt.di.dr_mtx);
+ list_destroy(&dr->dt.di.dr_children);
+ }
+
+ cv_broadcast(&db->db_changed);
+ ASSERT(db->db_dirtycnt > 0);
+ db->db_dirtycnt -= 1;
+ db->db_data_pending = NULL;
+ dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg, B_FALSE);
+
+ /*
+ * If we didn't do a physical write in this ZIO and we
+ * still ended up here, it means that the space of the
+ * dbuf that we just released (and undirtied) above hasn't
+ * been marked as undirtied in the pool's accounting.
+ *
+ * Thus, we undirty that space in the pool's view of the
+ * world here. For physical writes this type of update
+ * happens in dbuf_write_physdone().
+ *
+ * If we did a physical write, cleanup any rounding errors
+ * that came up due to writing multiple copies of a block
+ * on disk [see dbuf_write_physdone()].
+ */
+ if (zio->io_phys_children == 0) {
+ dsl_pool_undirty_space(dmu_objset_pool(os),
+ dr->dr_accounted, zio->io_txg);
+ } else {
+ dsl_pool_undirty_space(dmu_objset_pool(os),
+ dr->dr_accounted % zio->io_phys_children, zio->io_txg);
+ }
+
+ kmem_free(dr, sizeof (dbuf_dirty_record_t));
+}
+
+static void
+dbuf_write_nofill_ready(zio_t *zio)
+{
+ dbuf_write_ready(zio, NULL, zio->io_private);
+}
+
+static void
+dbuf_write_nofill_done(zio_t *zio)
+{
+ dbuf_write_done(zio, NULL, zio->io_private);
+}
+
+static void
+dbuf_write_override_ready(zio_t *zio)
+{
+ dbuf_dirty_record_t *dr = zio->io_private;
+ dmu_buf_impl_t *db = dr->dr_dbuf;
+
+ dbuf_write_ready(zio, NULL, db);
+}
+
+static void
+dbuf_write_override_done(zio_t *zio)
+{
+ dbuf_dirty_record_t *dr = zio->io_private;
+ dmu_buf_impl_t *db = dr->dr_dbuf;
+ blkptr_t *obp = &dr->dt.dl.dr_overridden_by;
+
+ mutex_enter(&db->db_mtx);
+ if (!BP_EQUAL(zio->io_bp, obp)) {
+ if (!BP_IS_HOLE(obp))
+ dsl_free(spa_get_dsl(zio->io_spa), zio->io_txg, obp);
+ arc_release(dr->dt.dl.dr_data, db);
+ }
+ mutex_exit(&db->db_mtx);
+
+ dbuf_write_done(zio, NULL, db);
+
+ if (zio->io_abd != NULL)
+ abd_free(zio->io_abd);
+}
+
+typedef struct dbuf_remap_impl_callback_arg {
+ objset_t *drica_os;
+ uint64_t drica_blk_birth;
+ dmu_tx_t *drica_tx;
+} dbuf_remap_impl_callback_arg_t;
+
+static void
+dbuf_remap_impl_callback(uint64_t vdev, uint64_t offset, uint64_t size,
+ void *arg)
+{
+ dbuf_remap_impl_callback_arg_t *drica = arg;
+ objset_t *os = drica->drica_os;
+ spa_t *spa = dmu_objset_spa(os);
+ dmu_tx_t *tx = drica->drica_tx;
+
+ ASSERT(dsl_pool_sync_context(spa_get_dsl(spa)));
+
+ if (os == spa_meta_objset(spa)) {
+ spa_vdev_indirect_mark_obsolete(spa, vdev, offset, size, tx);
+ } else {
+ dsl_dataset_block_remapped(dmu_objset_ds(os), vdev, offset,
+ size, drica->drica_blk_birth, tx);
+ }
+}
+
+static void
+dbuf_remap_impl(dnode_t *dn, blkptr_t *bp, krwlock_t *rw, dmu_tx_t *tx)
+{
+ blkptr_t bp_copy = *bp;
+ spa_t *spa = dmu_objset_spa(dn->dn_objset);
+ dbuf_remap_impl_callback_arg_t drica;
+
+ ASSERT(dsl_pool_sync_context(spa_get_dsl(spa)));
+
+ drica.drica_os = dn->dn_objset;
+ drica.drica_blk_birth = bp->blk_birth;
+ drica.drica_tx = tx;
+ if (spa_remap_blkptr(spa, &bp_copy, dbuf_remap_impl_callback,
+ &drica)) {
+ /*
+ * If the blkptr being remapped is tracked by a livelist,
+ * then we need to make sure the livelist reflects the update.
+ * First, cancel out the old blkptr by appending a 'FREE'
+ * entry. Next, add an 'ALLOC' to track the new version. This
+ * way we avoid trying to free an inaccurate blkptr at delete.
+ * Note that embedded blkptrs are not tracked in livelists.
+ */
+ if (dn->dn_objset != spa_meta_objset(spa)) {
+ dsl_dataset_t *ds = dmu_objset_ds(dn->dn_objset);
+ if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist) &&
+ bp->blk_birth > ds->ds_dir->dd_origin_txg) {
+ ASSERT(!BP_IS_EMBEDDED(bp));
+ ASSERT(dsl_dir_is_clone(ds->ds_dir));
+ ASSERT(spa_feature_is_enabled(spa,
+ SPA_FEATURE_LIVELIST));
+ bplist_append(&ds->ds_dir->dd_pending_frees,
+ bp);
+ bplist_append(&ds->ds_dir->dd_pending_allocs,
+ &bp_copy);
+ }
+ }
+
+ /*
+ * The db_rwlock prevents dbuf_read_impl() from
+ * dereferencing the BP while we are changing it. To
+ * avoid lock contention, only grab it when we are actually
+ * changing the BP.
+ */
+ if (rw != NULL)
+ rw_enter(rw, RW_WRITER);
+ *bp = bp_copy;
+ if (rw != NULL)
+ rw_exit(rw);
+ }
+}
+
+/*
+ * Remap any existing BP's to concrete vdevs, if possible.
+ */
+static void
+dbuf_remap(dnode_t *dn, dmu_buf_impl_t *db, dmu_tx_t *tx)
+{
+ spa_t *spa = dmu_objset_spa(db->db_objset);
+ ASSERT(dsl_pool_sync_context(spa_get_dsl(spa)));
+
+ if (!spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REMOVAL))
+ return;
+
+ if (db->db_level > 0) {
+ blkptr_t *bp = db->db.db_data;
+ for (int i = 0; i < db->db.db_size >> SPA_BLKPTRSHIFT; i++) {
+ dbuf_remap_impl(dn, &bp[i], &db->db_rwlock, tx);
+ }
+ } else if (db->db.db_object == DMU_META_DNODE_OBJECT) {
+ dnode_phys_t *dnp = db->db.db_data;
+ ASSERT3U(db->db_dnode_handle->dnh_dnode->dn_type, ==,
+ DMU_OT_DNODE);
+ for (int i = 0; i < db->db.db_size >> DNODE_SHIFT;
+ i += dnp[i].dn_extra_slots + 1) {
+ for (int j = 0; j < dnp[i].dn_nblkptr; j++) {
+ krwlock_t *lock = (dn->dn_dbuf == NULL ? NULL :
+ &dn->dn_dbuf->db_rwlock);
+ dbuf_remap_impl(dn, &dnp[i].dn_blkptr[j], lock,
+ tx);
+ }
+ }
+ }
+}
+
+/* Issue I/O to commit a dirty buffer to disk. */
+static void
+dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
+{
+ dmu_buf_impl_t *db = dr->dr_dbuf;
+ dnode_t *dn = dr->dr_dnode;
+ objset_t *os;
+ dmu_buf_impl_t *parent = db->db_parent;
+ uint64_t txg = tx->tx_txg;
+ zbookmark_phys_t zb;
+ zio_prop_t zp;
+ zio_t *pio; /* parent I/O */
+ int wp_flag = 0;
+
+ ASSERT(dmu_tx_is_syncing(tx));
+
+ os = dn->dn_objset;
+
+ if (db->db_state != DB_NOFILL) {
+ if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) {
+ /*
+ * Private object buffers are released here rather
+ * than in dbuf_dirty() since they are only modified
+ * in the syncing context and we don't want the
+ * overhead of making multiple copies of the data.
+ */
+ if (BP_IS_HOLE(db->db_blkptr)) {
+ arc_buf_thaw(data);
+ } else {
+ dbuf_release_bp(db);
+ }
+ dbuf_remap(dn, db, tx);
+ }
+ }
+
+ if (parent != dn->dn_dbuf) {
+ /* Our parent is an indirect block. */
+ /* We have a dirty parent that has been scheduled for write. */
+ ASSERT(parent && parent->db_data_pending);
+ /* Our parent's buffer is one level closer to the dnode. */
+ ASSERT(db->db_level == parent->db_level-1);
+ /*
+ * We're about to modify our parent's db_data by modifying
+ * our block pointer, so the parent must be released.
+ */
+ ASSERT(arc_released(parent->db_buf));
+ pio = parent->db_data_pending->dr_zio;
+ } else {
+ /* Our parent is the dnode itself. */
+ ASSERT((db->db_level == dn->dn_phys->dn_nlevels-1 &&
+ db->db_blkid != DMU_SPILL_BLKID) ||
+ (db->db_blkid == DMU_SPILL_BLKID && db->db_level == 0));
+ if (db->db_blkid != DMU_SPILL_BLKID)
+ ASSERT3P(db->db_blkptr, ==,
+ &dn->dn_phys->dn_blkptr[db->db_blkid]);
+ pio = dn->dn_zio;
+ }
+
+ ASSERT(db->db_level == 0 || data == db->db_buf);
+ ASSERT3U(db->db_blkptr->blk_birth, <=, txg);
+ ASSERT(pio);
+
+ SET_BOOKMARK(&zb, os->os_dsl_dataset ?
+ os->os_dsl_dataset->ds_object : DMU_META_OBJSET,
+ db->db.db_object, db->db_level, db->db_blkid);
+
+ if (db->db_blkid == DMU_SPILL_BLKID)
+ wp_flag = WP_SPILL;
+ wp_flag |= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0;
+
+ dmu_write_policy(os, dn, db->db_level, wp_flag, &zp);
+
+ /*
+ * We copy the blkptr now (rather than when we instantiate the dirty
+ * record), because its value can change between open context and
+ * syncing context. We do not need to hold dn_struct_rwlock to read
+ * db_blkptr because we are in syncing context.
+ */
+ dr->dr_bp_copy = *db->db_blkptr;
+
+ if (db->db_level == 0 &&
+ dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
+ /*
+ * The BP for this block has been provided by open context
+ * (by dmu_sync() or dmu_buf_write_embedded()).
+ */
+ abd_t *contents = (data != NULL) ?
+ abd_get_from_buf(data->b_data, arc_buf_size(data)) : NULL;
+
+ dr->dr_zio = zio_write(pio, os->os_spa, txg, &dr->dr_bp_copy,
+ contents, db->db.db_size, db->db.db_size, &zp,
+ dbuf_write_override_ready, NULL, NULL,
+ dbuf_write_override_done,
+ dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
+ mutex_enter(&db->db_mtx);
+ dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
+ zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by,
+ dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite);
+ mutex_exit(&db->db_mtx);
+ } else if (db->db_state == DB_NOFILL) {
+ ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF ||
+ zp.zp_checksum == ZIO_CHECKSUM_NOPARITY);
+ dr->dr_zio = zio_write(pio, os->os_spa, txg,
+ &dr->dr_bp_copy, NULL, db->db.db_size, db->db.db_size, &zp,
+ dbuf_write_nofill_ready, NULL, NULL,
+ dbuf_write_nofill_done, db,
+ ZIO_PRIORITY_ASYNC_WRITE,
+ ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb);
+ } else {
+ ASSERT(arc_released(data));
+
+ /*
+ * For indirect blocks, we want to setup the children
+ * ready callback so that we can properly handle an indirect
+ * block that only contains holes.
+ */
+ arc_write_done_func_t *children_ready_cb = NULL;
+ if (db->db_level != 0)
+ children_ready_cb = dbuf_write_children_ready;
+
+ dr->dr_zio = arc_write(pio, os->os_spa, txg,
+ &dr->dr_bp_copy, data, DBUF_IS_L2CACHEABLE(db),
+ &zp, dbuf_write_ready,
+ children_ready_cb, dbuf_write_physdone,
+ dbuf_write_done, db, ZIO_PRIORITY_ASYNC_WRITE,
+ ZIO_FLAG_MUSTSUCCEED, &zb);
+ }
+}
+
+EXPORT_SYMBOL(dbuf_find);
+EXPORT_SYMBOL(dbuf_is_metadata);
+EXPORT_SYMBOL(dbuf_destroy);
+EXPORT_SYMBOL(dbuf_loan_arcbuf);
+EXPORT_SYMBOL(dbuf_whichblock);
+EXPORT_SYMBOL(dbuf_read);
+EXPORT_SYMBOL(dbuf_unoverride);
+EXPORT_SYMBOL(dbuf_free_range);
+EXPORT_SYMBOL(dbuf_new_size);
+EXPORT_SYMBOL(dbuf_release_bp);
+EXPORT_SYMBOL(dbuf_dirty);
+EXPORT_SYMBOL(dmu_buf_set_crypt_params);
+EXPORT_SYMBOL(dmu_buf_will_dirty);
+EXPORT_SYMBOL(dmu_buf_is_dirty);
+EXPORT_SYMBOL(dmu_buf_will_not_fill);
+EXPORT_SYMBOL(dmu_buf_will_fill);
+EXPORT_SYMBOL(dmu_buf_fill_done);
+EXPORT_SYMBOL(dmu_buf_rele);
+EXPORT_SYMBOL(dbuf_assign_arcbuf);
+EXPORT_SYMBOL(dbuf_prefetch);
+EXPORT_SYMBOL(dbuf_hold_impl);
+EXPORT_SYMBOL(dbuf_hold);
+EXPORT_SYMBOL(dbuf_hold_level);
+EXPORT_SYMBOL(dbuf_create_bonus);
+EXPORT_SYMBOL(dbuf_spill_set_blksz);
+EXPORT_SYMBOL(dbuf_rm_spill);
+EXPORT_SYMBOL(dbuf_add_ref);
+EXPORT_SYMBOL(dbuf_rele);
+EXPORT_SYMBOL(dbuf_rele_and_unlock);
+EXPORT_SYMBOL(dbuf_refcount);
+EXPORT_SYMBOL(dbuf_sync_list);
+EXPORT_SYMBOL(dmu_buf_set_user);
+EXPORT_SYMBOL(dmu_buf_set_user_ie);
+EXPORT_SYMBOL(dmu_buf_get_user);
+EXPORT_SYMBOL(dmu_buf_get_blkptr);
+
+/* BEGIN CSTYLED */
+ZFS_MODULE_PARAM(zfs_dbuf_cache, dbuf_cache_, max_bytes, ULONG, ZMOD_RW,
+ "Maximum size in bytes of the dbuf cache.");
+
+ZFS_MODULE_PARAM(zfs_dbuf_cache, dbuf_cache_, hiwater_pct, UINT, ZMOD_RW,
+ "Percentage over dbuf_cache_max_bytes when dbufs must be evicted "
+ "directly.");
+
+ZFS_MODULE_PARAM(zfs_dbuf_cache, dbuf_cache_, lowater_pct, UINT, ZMOD_RW,
+ "Percentage below dbuf_cache_max_bytes when the evict thread stops "
+ "evicting dbufs.");
+
+ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, metadata_cache_max_bytes, ULONG, ZMOD_RW,
+ "Maximum size in bytes of the dbuf metadata cache.");
+
+ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, cache_shift, INT, ZMOD_RW,
+ "Set the size of the dbuf cache to a log2 fraction of arc size.");
+
+ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, metadata_cache_shift, INT, ZMOD_RW,
+ "Set the size of the dbuf metadata cache to a log2 fraction of arc "
+ "size.");
+/* END CSTYLED */
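+
+/*
+ * Example (illustrative sketch, not part of this file): how the hiwater
+ * and lowater percentages above translate into byte thresholds around
+ * dbuf_cache_max_bytes.  The 10% values and the 64M cache size are
+ * assumptions used only for the worked numbers.
+ */
+#include <stdio.h>
+#include <stdint.h>
+
+int
+main(void)
+{
+	uint64_t max_bytes = 64ULL << 20;	/* pretend 64M dbuf cache */
+	unsigned int hiwater_pct = 10;		/* assumed setting */
+	unsigned int lowater_pct = 10;		/* assumed setting */
+
+	uint64_t hiwater = max_bytes + (max_bytes * hiwater_pct) / 100;
+	uint64_t lowater = max_bytes - (max_bytes * lowater_pct) / 100;
+
+	/* above ~70.4M callers evict directly; eviction stops below ~57.6M */
+	printf("hiwater %llu, lowater %llu\n", (unsigned long long)hiwater,
+	    (unsigned long long)lowater);
+	return (0);
+}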
diff --git a/sys/contrib/openzfs/module/zfs/dbuf_stats.c b/sys/contrib/openzfs/module/zfs/dbuf_stats.c
new file mode 100644
index 000000000000..12bb568a08cc
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/dbuf_stats.c
@@ -0,0 +1,232 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/dbuf.h>
+#include <sys/dmu_objset.h>
+
+/*
+ * Calculate the index of the arc header for the state, disabled by default.
+ */
+int zfs_dbuf_state_index = 0;
+
+/*
+ * ==========================================================================
+ * Dbuf Hash Read Routines
+ * ==========================================================================
+ */
+typedef struct dbuf_stats_t {
+ kmutex_t lock;
+ kstat_t *kstat;
+ dbuf_hash_table_t *hash;
+ int idx;
+} dbuf_stats_t;
+
+static dbuf_stats_t dbuf_stats_hash_table;
+
+static int
+dbuf_stats_hash_table_headers(char *buf, size_t size)
+{
+ (void) snprintf(buf, size,
+ "%-96s | %-119s | %s\n"
+ "%-16s %-8s %-8s %-8s %-8s %-10s %-8s %-5s %-5s %-7s %3s | "
+ "%-5s %-5s %-9s %-6s %-8s %-12s "
+ "%-6s %-6s %-6s %-6s %-6s %-8s %-8s %-8s %-6s | "
+ "%-6s %-6s %-8s %-8s %-6s %-6s %-6s %-8s %-8s\n",
+ "dbuf", "arcbuf", "dnode", "pool", "objset", "object", "level",
+ "blkid", "offset", "dbsize", "meta", "state", "dbholds", "dbc",
+ "list", "atype", "flags", "count", "asize", "access",
+ "mru", "gmru", "mfu", "gmfu", "l2", "l2_dattr", "l2_asize",
+ "l2_comp", "aholds", "dtype", "btype", "data_bs", "meta_bs",
+ "bsize", "lvls", "dholds", "blocks", "dsize");
+
+ return (0);
+}
+
+static int
+__dbuf_stats_hash_table_data(char *buf, size_t size, dmu_buf_impl_t *db)
+{
+ arc_buf_info_t abi = { 0 };
+ dmu_object_info_t doi = { 0 };
+ dnode_t *dn = DB_DNODE(db);
+ size_t nwritten;
+
+ if (db->db_buf)
+ arc_buf_info(db->db_buf, &abi, zfs_dbuf_state_index);
+
+ __dmu_object_info_from_dnode(dn, &doi);
+
+ nwritten = snprintf(buf, size,
+ "%-16s %-8llu %-8lld %-8lld %-8lld %-10llu %-8llu %-5d %-5d "
+ "%-7lu %-3d | %-5d %-5d 0x%-7x %-6lu %-8llu %-12llu "
+ "%-6lu %-6lu %-6lu %-6lu %-6lu %-8llu %-8llu %-8d %-6lu | "
+ "%-6d %-6d %-8lu %-8lu %-6llu %-6lu %-6lu %-8llu %-8llu\n",
+ /* dmu_buf_impl_t */
+ spa_name(dn->dn_objset->os_spa),
+ (u_longlong_t)dmu_objset_id(db->db_objset),
+ (longlong_t)db->db.db_object,
+ (longlong_t)db->db_level,
+ (longlong_t)db->db_blkid,
+ (u_longlong_t)db->db.db_offset,
+ (u_longlong_t)db->db.db_size,
+ !!dbuf_is_metadata(db),
+ db->db_state,
+ (ulong_t)zfs_refcount_count(&db->db_holds),
+ multilist_link_active(&db->db_cache_link),
+ /* arc_buf_info_t */
+ abi.abi_state_type,
+ abi.abi_state_contents,
+ abi.abi_flags,
+ (ulong_t)abi.abi_bufcnt,
+ (u_longlong_t)abi.abi_size,
+ (u_longlong_t)abi.abi_access,
+ (ulong_t)abi.abi_mru_hits,
+ (ulong_t)abi.abi_mru_ghost_hits,
+ (ulong_t)abi.abi_mfu_hits,
+ (ulong_t)abi.abi_mfu_ghost_hits,
+ (ulong_t)abi.abi_l2arc_hits,
+ (u_longlong_t)abi.abi_l2arc_dattr,
+ (u_longlong_t)abi.abi_l2arc_asize,
+ abi.abi_l2arc_compress,
+ (ulong_t)abi.abi_holds,
+ /* dmu_object_info_t */
+ doi.doi_type,
+ doi.doi_bonus_type,
+ (ulong_t)doi.doi_data_block_size,
+ (ulong_t)doi.doi_metadata_block_size,
+ (u_longlong_t)doi.doi_bonus_size,
+ (ulong_t)doi.doi_indirection,
+ (ulong_t)zfs_refcount_count(&dn->dn_holds),
+ (u_longlong_t)doi.doi_fill_count,
+ (u_longlong_t)doi.doi_max_offset);
+
+ if (nwritten >= size)
+ return (size);
+
+ return (nwritten + 1);
+}
+
+static int
+dbuf_stats_hash_table_data(char *buf, size_t size, void *data)
+{
+ dbuf_stats_t *dsh = (dbuf_stats_t *)data;
+ dbuf_hash_table_t *h = dsh->hash;
+ dmu_buf_impl_t *db;
+ int length, error = 0;
+
+ ASSERT3S(dsh->idx, >=, 0);
+ ASSERT3S(dsh->idx, <=, h->hash_table_mask);
+ if (size)
+ buf[0] = 0;
+
+ mutex_enter(DBUF_HASH_MUTEX(h, dsh->idx));
+ for (db = h->hash_table[dsh->idx]; db != NULL; db = db->db_hash_next) {
+ /*
+ * Returning ENOMEM will cause the data and header functions
+		 * to be called again with a larger scratch buffer.
+ */
+ if (size < 512) {
+ error = SET_ERROR(ENOMEM);
+ break;
+ }
+
+ mutex_enter(&db->db_mtx);
+
+ if (db->db_state != DB_EVICTING) {
+ length = __dbuf_stats_hash_table_data(buf, size, db);
+ buf += length;
+ size -= length;
+ }
+
+ mutex_exit(&db->db_mtx);
+ }
+ mutex_exit(DBUF_HASH_MUTEX(h, dsh->idx));
+
+ return (error);
+}
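+
+/*
+ * Example (illustrative sketch, not part of this file): a user-space
+ * model of the contract noted above -- when the formatter runs out of
+ * room it returns ENOMEM and the caller retries with a larger scratch
+ * buffer.  The 512-byte floor mirrors the check in the loop above; the
+ * helper and its output are made up.
+ */
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+static int
+format_bucket(char *buf, size_t size)
+{
+	if (size < 512)
+		return (ENOMEM);	/* too small for even one record */
+	snprintf(buf, size, "pretend dbuf record\n");
+	return (0);
+}
+
+int
+main(void)
+{
+	size_t size = 128;
+	char *buf = malloc(size);
+
+	while (format_bucket(buf, size) == ENOMEM) {
+		size *= 2;		/* retry with a larger buffer */
+		buf = realloc(buf, size);
+	}
+	printf("formatted with a %zu-byte buffer:\n%s", size, buf);
+	free(buf);
+	return (0);
+}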
+
+static void *
+dbuf_stats_hash_table_addr(kstat_t *ksp, loff_t n)
+{
+ dbuf_stats_t *dsh = ksp->ks_private;
+
+ ASSERT(MUTEX_HELD(&dsh->lock));
+
+ if (n <= dsh->hash->hash_table_mask) {
+ dsh->idx = n;
+ return (dsh);
+ }
+
+ return (NULL);
+}
+
+static void
+dbuf_stats_hash_table_init(dbuf_hash_table_t *hash)
+{
+ dbuf_stats_t *dsh = &dbuf_stats_hash_table;
+ kstat_t *ksp;
+
+ mutex_init(&dsh->lock, NULL, MUTEX_DEFAULT, NULL);
+ dsh->hash = hash;
+
+ ksp = kstat_create("zfs", 0, "dbufs", "misc",
+ KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL);
+ dsh->kstat = ksp;
+
+ if (ksp) {
+ ksp->ks_lock = &dsh->lock;
+ ksp->ks_ndata = UINT32_MAX;
+ ksp->ks_private = dsh;
+ kstat_set_raw_ops(ksp, dbuf_stats_hash_table_headers,
+ dbuf_stats_hash_table_data, dbuf_stats_hash_table_addr);
+ kstat_install(ksp);
+ }
+}
+
+static void
+dbuf_stats_hash_table_destroy(void)
+{
+ dbuf_stats_t *dsh = &dbuf_stats_hash_table;
+ kstat_t *ksp;
+
+ ksp = dsh->kstat;
+ if (ksp)
+ kstat_delete(ksp);
+
+ mutex_destroy(&dsh->lock);
+}
+
+void
+dbuf_stats_init(dbuf_hash_table_t *hash)
+{
+ dbuf_stats_hash_table_init(hash);
+}
+
+void
+dbuf_stats_destroy(void)
+{
+ dbuf_stats_hash_table_destroy();
+}
+
+/* BEGIN CSTYLED */
+ZFS_MODULE_PARAM(zfs, zfs_, dbuf_state_index, INT, ZMOD_RW,
+ "Calculate arc header index");
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/zfs/ddt.c b/sys/contrib/openzfs/module/zfs/ddt.c
new file mode 100644
index 000000000000..b94a9f54ece3
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/ddt.c
@@ -0,0 +1,1187 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/zio.h>
+#include <sys/ddt.h>
+#include <sys/zap.h>
+#include <sys/dmu_tx.h>
+#include <sys/arc.h>
+#include <sys/dsl_pool.h>
+#include <sys/zio_checksum.h>
+#include <sys/zio_compress.h>
+#include <sys/dsl_scan.h>
+#include <sys/abd.h>
+
+static kmem_cache_t *ddt_cache;
+static kmem_cache_t *ddt_entry_cache;
+
+/*
+ * Enable/disable prefetching of dedup-ed blocks which are going to be freed.
+ */
+int zfs_dedup_prefetch = 0;
+
+static const ddt_ops_t *ddt_ops[DDT_TYPES] = {
+ &ddt_zap_ops,
+};
+
+static const char *ddt_class_name[DDT_CLASSES] = {
+ "ditto",
+ "duplicate",
+ "unique",
+};
+
+static void
+ddt_object_create(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+ dmu_tx_t *tx)
+{
+ spa_t *spa = ddt->ddt_spa;
+ objset_t *os = ddt->ddt_os;
+ uint64_t *objectp = &ddt->ddt_object[type][class];
+ boolean_t prehash = zio_checksum_table[ddt->ddt_checksum].ci_flags &
+ ZCHECKSUM_FLAG_DEDUP;
+ char name[DDT_NAMELEN];
+
+ ddt_object_name(ddt, type, class, name);
+
+ ASSERT(*objectp == 0);
+ VERIFY(ddt_ops[type]->ddt_op_create(os, objectp, tx, prehash) == 0);
+ ASSERT(*objectp != 0);
+
+ VERIFY(zap_add(os, DMU_POOL_DIRECTORY_OBJECT, name,
+ sizeof (uint64_t), 1, objectp, tx) == 0);
+
+ VERIFY(zap_add(os, spa->spa_ddt_stat_object, name,
+ sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t),
+ &ddt->ddt_histogram[type][class], tx) == 0);
+}
+
+static void
+ddt_object_destroy(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+ dmu_tx_t *tx)
+{
+ spa_t *spa = ddt->ddt_spa;
+ objset_t *os = ddt->ddt_os;
+ uint64_t *objectp = &ddt->ddt_object[type][class];
+ uint64_t count;
+ char name[DDT_NAMELEN];
+
+ ddt_object_name(ddt, type, class, name);
+
+ ASSERT(*objectp != 0);
+ ASSERT(ddt_histogram_empty(&ddt->ddt_histogram[type][class]));
+ VERIFY(ddt_object_count(ddt, type, class, &count) == 0 && count == 0);
+ VERIFY(zap_remove(os, DMU_POOL_DIRECTORY_OBJECT, name, tx) == 0);
+ VERIFY(zap_remove(os, spa->spa_ddt_stat_object, name, tx) == 0);
+ VERIFY(ddt_ops[type]->ddt_op_destroy(os, *objectp, tx) == 0);
+ bzero(&ddt->ddt_object_stats[type][class], sizeof (ddt_object_t));
+
+ *objectp = 0;
+}
+
+static int
+ddt_object_load(ddt_t *ddt, enum ddt_type type, enum ddt_class class)
+{
+ ddt_object_t *ddo = &ddt->ddt_object_stats[type][class];
+ dmu_object_info_t doi;
+ uint64_t count;
+ char name[DDT_NAMELEN];
+ int error;
+
+ ddt_object_name(ddt, type, class, name);
+
+ error = zap_lookup(ddt->ddt_os, DMU_POOL_DIRECTORY_OBJECT, name,
+ sizeof (uint64_t), 1, &ddt->ddt_object[type][class]);
+ if (error != 0)
+ return (error);
+
+ error = zap_lookup(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name,
+ sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t),
+ &ddt->ddt_histogram[type][class]);
+ if (error != 0)
+ return (error);
+
+ /*
+ * Seed the cached statistics.
+ */
+ error = ddt_object_info(ddt, type, class, &doi);
+ if (error)
+ return (error);
+
+ error = ddt_object_count(ddt, type, class, &count);
+ if (error)
+ return (error);
+
+ ddo->ddo_count = count;
+ ddo->ddo_dspace = doi.doi_physical_blocks_512 << 9;
+ ddo->ddo_mspace = doi.doi_fill_count * doi.doi_data_block_size;
+
+ return (0);
+}
+
+static void
+ddt_object_sync(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+ dmu_tx_t *tx)
+{
+ ddt_object_t *ddo = &ddt->ddt_object_stats[type][class];
+ dmu_object_info_t doi;
+ uint64_t count;
+ char name[DDT_NAMELEN];
+
+ ddt_object_name(ddt, type, class, name);
+
+ VERIFY(zap_update(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name,
+ sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t),
+ &ddt->ddt_histogram[type][class], tx) == 0);
+
+ /*
+ * Cache DDT statistics; this is the only time they'll change.
+ */
+ VERIFY(ddt_object_info(ddt, type, class, &doi) == 0);
+ VERIFY(ddt_object_count(ddt, type, class, &count) == 0);
+
+ ddo->ddo_count = count;
+ ddo->ddo_dspace = doi.doi_physical_blocks_512 << 9;
+ ddo->ddo_mspace = doi.doi_fill_count * doi.doi_data_block_size;
+}
+
+static int
+ddt_object_lookup(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+ ddt_entry_t *dde)
+{
+ if (!ddt_object_exists(ddt, type, class))
+ return (SET_ERROR(ENOENT));
+
+ return (ddt_ops[type]->ddt_op_lookup(ddt->ddt_os,
+ ddt->ddt_object[type][class], dde));
+}
+
+static void
+ddt_object_prefetch(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+ ddt_entry_t *dde)
+{
+ if (!ddt_object_exists(ddt, type, class))
+ return;
+
+ ddt_ops[type]->ddt_op_prefetch(ddt->ddt_os,
+ ddt->ddt_object[type][class], dde);
+}
+
+int
+ddt_object_update(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+ ddt_entry_t *dde, dmu_tx_t *tx)
+{
+ ASSERT(ddt_object_exists(ddt, type, class));
+
+ return (ddt_ops[type]->ddt_op_update(ddt->ddt_os,
+ ddt->ddt_object[type][class], dde, tx));
+}
+
+static int
+ddt_object_remove(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+ ddt_entry_t *dde, dmu_tx_t *tx)
+{
+ ASSERT(ddt_object_exists(ddt, type, class));
+
+ return (ddt_ops[type]->ddt_op_remove(ddt->ddt_os,
+ ddt->ddt_object[type][class], dde, tx));
+}
+
+int
+ddt_object_walk(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+ uint64_t *walk, ddt_entry_t *dde)
+{
+ ASSERT(ddt_object_exists(ddt, type, class));
+
+ return (ddt_ops[type]->ddt_op_walk(ddt->ddt_os,
+ ddt->ddt_object[type][class], dde, walk));
+}
+
+int
+ddt_object_count(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+ uint64_t *count)
+{
+ ASSERT(ddt_object_exists(ddt, type, class));
+
+ return (ddt_ops[type]->ddt_op_count(ddt->ddt_os,
+ ddt->ddt_object[type][class], count));
+}
+
+int
+ddt_object_info(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+ dmu_object_info_t *doi)
+{
+ if (!ddt_object_exists(ddt, type, class))
+ return (SET_ERROR(ENOENT));
+
+ return (dmu_object_info(ddt->ddt_os, ddt->ddt_object[type][class],
+ doi));
+}
+
+boolean_t
+ddt_object_exists(ddt_t *ddt, enum ddt_type type, enum ddt_class class)
+{
+ return (!!ddt->ddt_object[type][class]);
+}
+
+void
+ddt_object_name(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+ char *name)
+{
+ (void) snprintf(name, DDT_NAMELEN, DMU_POOL_DDT,
+ zio_checksum_table[ddt->ddt_checksum].ci_name,
+ ddt_ops[type]->ddt_op_name, ddt_class_name[class]);
+}
+
+void
+ddt_bp_fill(const ddt_phys_t *ddp, blkptr_t *bp, uint64_t txg)
+{
+ ASSERT(txg != 0);
+
+ for (int d = 0; d < SPA_DVAS_PER_BP; d++)
+ bp->blk_dva[d] = ddp->ddp_dva[d];
+ BP_SET_BIRTH(bp, txg, ddp->ddp_phys_birth);
+}
+
+/*
+ * The bp created via this function may be used for repairs and scrub, but it
+ * will be missing the salt / IV required to do a full decrypting read.
+ */
+void
+ddt_bp_create(enum zio_checksum checksum,
+ const ddt_key_t *ddk, const ddt_phys_t *ddp, blkptr_t *bp)
+{
+ BP_ZERO(bp);
+
+ if (ddp != NULL)
+ ddt_bp_fill(ddp, bp, ddp->ddp_phys_birth);
+
+ bp->blk_cksum = ddk->ddk_cksum;
+
+ BP_SET_LSIZE(bp, DDK_GET_LSIZE(ddk));
+ BP_SET_PSIZE(bp, DDK_GET_PSIZE(ddk));
+ BP_SET_COMPRESS(bp, DDK_GET_COMPRESS(ddk));
+ BP_SET_CRYPT(bp, DDK_GET_CRYPT(ddk));
+ BP_SET_FILL(bp, 1);
+ BP_SET_CHECKSUM(bp, checksum);
+ BP_SET_TYPE(bp, DMU_OT_DEDUP);
+ BP_SET_LEVEL(bp, 0);
+ BP_SET_DEDUP(bp, 1);
+ BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
+}
+
+void
+ddt_key_fill(ddt_key_t *ddk, const blkptr_t *bp)
+{
+ ddk->ddk_cksum = bp->blk_cksum;
+ ddk->ddk_prop = 0;
+
+ ASSERT(BP_IS_ENCRYPTED(bp) || !BP_USES_CRYPT(bp));
+
+ DDK_SET_LSIZE(ddk, BP_GET_LSIZE(bp));
+ DDK_SET_PSIZE(ddk, BP_GET_PSIZE(bp));
+ DDK_SET_COMPRESS(ddk, BP_GET_COMPRESS(bp));
+ DDK_SET_CRYPT(ddk, BP_USES_CRYPT(bp));
+}
+
+void
+ddt_phys_fill(ddt_phys_t *ddp, const blkptr_t *bp)
+{
+ ASSERT(ddp->ddp_phys_birth == 0);
+
+ for (int d = 0; d < SPA_DVAS_PER_BP; d++)
+ ddp->ddp_dva[d] = bp->blk_dva[d];
+ ddp->ddp_phys_birth = BP_PHYSICAL_BIRTH(bp);
+}
+
+void
+ddt_phys_clear(ddt_phys_t *ddp)
+{
+ bzero(ddp, sizeof (*ddp));
+}
+
+void
+ddt_phys_addref(ddt_phys_t *ddp)
+{
+ ddp->ddp_refcnt++;
+}
+
+void
+ddt_phys_decref(ddt_phys_t *ddp)
+{
+ if (ddp) {
+ ASSERT(ddp->ddp_refcnt > 0);
+ ddp->ddp_refcnt--;
+ }
+}
+
+void
+ddt_phys_free(ddt_t *ddt, ddt_key_t *ddk, ddt_phys_t *ddp, uint64_t txg)
+{
+ blkptr_t blk;
+
+ ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk);
+
+ /*
+ * We clear the dedup bit so that zio_free() will actually free the
+ * space, rather than just decrementing the refcount in the DDT.
+ */
+ BP_SET_DEDUP(&blk, 0);
+
+ ddt_phys_clear(ddp);
+ zio_free(ddt->ddt_spa, txg, &blk);
+}
+
+ddt_phys_t *
+ddt_phys_select(const ddt_entry_t *dde, const blkptr_t *bp)
+{
+ ddt_phys_t *ddp = (ddt_phys_t *)dde->dde_phys;
+
+ for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
+ if (DVA_EQUAL(BP_IDENTITY(bp), &ddp->ddp_dva[0]) &&
+ BP_PHYSICAL_BIRTH(bp) == ddp->ddp_phys_birth)
+ return (ddp);
+ }
+ return (NULL);
+}
+
+uint64_t
+ddt_phys_total_refcnt(const ddt_entry_t *dde)
+{
+ uint64_t refcnt = 0;
+
+ for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++)
+ refcnt += dde->dde_phys[p].ddp_refcnt;
+
+ return (refcnt);
+}
+
+static void
+ddt_stat_generate(ddt_t *ddt, ddt_entry_t *dde, ddt_stat_t *dds)
+{
+ spa_t *spa = ddt->ddt_spa;
+ ddt_phys_t *ddp = dde->dde_phys;
+ ddt_key_t *ddk = &dde->dde_key;
+ uint64_t lsize = DDK_GET_LSIZE(ddk);
+ uint64_t psize = DDK_GET_PSIZE(ddk);
+
+ bzero(dds, sizeof (*dds));
+
+ for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
+ uint64_t dsize = 0;
+ uint64_t refcnt = ddp->ddp_refcnt;
+
+ if (ddp->ddp_phys_birth == 0)
+ continue;
+
+ for (int d = 0; d < DDE_GET_NDVAS(dde); d++)
+ dsize += dva_get_dsize_sync(spa, &ddp->ddp_dva[d]);
+
+ dds->dds_blocks += 1;
+ dds->dds_lsize += lsize;
+ dds->dds_psize += psize;
+ dds->dds_dsize += dsize;
+
+ dds->dds_ref_blocks += refcnt;
+ dds->dds_ref_lsize += lsize * refcnt;
+ dds->dds_ref_psize += psize * refcnt;
+ dds->dds_ref_dsize += dsize * refcnt;
+ }
+}
+
+void
+ddt_stat_add(ddt_stat_t *dst, const ddt_stat_t *src, uint64_t neg)
+{
+ const uint64_t *s = (const uint64_t *)src;
+ uint64_t *d = (uint64_t *)dst;
+ uint64_t *d_end = (uint64_t *)(dst + 1);
+
+ ASSERT(neg == 0 || neg == -1ULL); /* add or subtract */
+
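+	/*
+	 * When neg == -1ULL, (x ^ neg) - neg == ~x + 1 == -x in two's
+	 * complement, so the loop below subtracts; with neg == 0 it adds.
+	 */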
+ while (d < d_end)
+ *d++ += (*s++ ^ neg) - neg;
+}
+
+static void
+ddt_stat_update(ddt_t *ddt, ddt_entry_t *dde, uint64_t neg)
+{
+ ddt_stat_t dds;
+ ddt_histogram_t *ddh;
+ int bucket;
+
+ ddt_stat_generate(ddt, dde, &dds);
+
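+	/* Histogram buckets are indexed by floor(log2(reference count)). */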
+ bucket = highbit64(dds.dds_ref_blocks) - 1;
+ ASSERT(bucket >= 0);
+
+ ddh = &ddt->ddt_histogram[dde->dde_type][dde->dde_class];
+
+ ddt_stat_add(&ddh->ddh_stat[bucket], &dds, neg);
+}
+
+void
+ddt_histogram_add(ddt_histogram_t *dst, const ddt_histogram_t *src)
+{
+ for (int h = 0; h < 64; h++)
+ ddt_stat_add(&dst->ddh_stat[h], &src->ddh_stat[h], 0);
+}
+
+void
+ddt_histogram_stat(ddt_stat_t *dds, const ddt_histogram_t *ddh)
+{
+ bzero(dds, sizeof (*dds));
+
+ for (int h = 0; h < 64; h++)
+ ddt_stat_add(dds, &ddh->ddh_stat[h], 0);
+}
+
+boolean_t
+ddt_histogram_empty(const ddt_histogram_t *ddh)
+{
+ const uint64_t *s = (const uint64_t *)ddh;
+ const uint64_t *s_end = (const uint64_t *)(ddh + 1);
+
+ while (s < s_end)
+ if (*s++ != 0)
+ return (B_FALSE);
+
+ return (B_TRUE);
+}
+
+void
+ddt_get_dedup_object_stats(spa_t *spa, ddt_object_t *ddo_total)
+{
+ /* Sum the statistics we cached in ddt_object_sync(). */
+ for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
+ ddt_t *ddt = spa->spa_ddt[c];
+ for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
+ for (enum ddt_class class = 0; class < DDT_CLASSES;
+ class++) {
+ ddt_object_t *ddo =
+ &ddt->ddt_object_stats[type][class];
+ ddo_total->ddo_count += ddo->ddo_count;
+ ddo_total->ddo_dspace += ddo->ddo_dspace;
+ ddo_total->ddo_mspace += ddo->ddo_mspace;
+ }
+ }
+ }
+
+ /* ... and compute the averages. */
+ if (ddo_total->ddo_count != 0) {
+ ddo_total->ddo_dspace /= ddo_total->ddo_count;
+ ddo_total->ddo_mspace /= ddo_total->ddo_count;
+ }
+}
+
+void
+ddt_get_dedup_histogram(spa_t *spa, ddt_histogram_t *ddh)
+{
+ for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
+ ddt_t *ddt = spa->spa_ddt[c];
+ for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
+ for (enum ddt_class class = 0; class < DDT_CLASSES;
+ class++) {
+ ddt_histogram_add(ddh,
+ &ddt->ddt_histogram_cache[type][class]);
+ }
+ }
+ }
+}
+
+void
+ddt_get_dedup_stats(spa_t *spa, ddt_stat_t *dds_total)
+{
+ ddt_histogram_t *ddh_total;
+
+ ddh_total = kmem_zalloc(sizeof (ddt_histogram_t), KM_SLEEP);
+ ddt_get_dedup_histogram(spa, ddh_total);
+ ddt_histogram_stat(dds_total, ddh_total);
+ kmem_free(ddh_total, sizeof (ddt_histogram_t));
+}
+
+uint64_t
+ddt_get_dedup_dspace(spa_t *spa)
+{
+ ddt_stat_t dds_total;
+
+ if (spa->spa_dedup_dspace != ~0ULL)
+ return (spa->spa_dedup_dspace);
+
+ bzero(&dds_total, sizeof (ddt_stat_t));
+
+ /* Calculate and cache the stats */
+ ddt_get_dedup_stats(spa, &dds_total);
+ spa->spa_dedup_dspace = dds_total.dds_ref_dsize - dds_total.dds_dsize;
+ return (spa->spa_dedup_dspace);
+}
+
+uint64_t
+ddt_get_pool_dedup_ratio(spa_t *spa)
+{
+ ddt_stat_t dds_total = { 0 };
+
+ ddt_get_dedup_stats(spa, &dds_total);
+ if (dds_total.dds_dsize == 0)
+ return (100);
+
+ return (dds_total.dds_ref_dsize * 100 / dds_total.dds_dsize);
+}
+
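+/*
+ * The image written below is a single header byte (the compression function,
+ * with DDT_COMPRESS_BYTEORDER_MASK recording the writer's byte order)
+ * followed by the possibly-compressed dde_phys payload; ddt_decompress()
+ * reverses this.
+ */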
+size_t
+ddt_compress(void *src, uchar_t *dst, size_t s_len, size_t d_len)
+{
+ uchar_t *version = dst++;
+ int cpfunc = ZIO_COMPRESS_ZLE;
+ zio_compress_info_t *ci = &zio_compress_table[cpfunc];
+ size_t c_len;
+
+ ASSERT(d_len >= s_len + 1); /* no compression plus version byte */
+
+ c_len = ci->ci_compress(src, dst, s_len, d_len - 1, ci->ci_level);
+
+ if (c_len == s_len) {
+ cpfunc = ZIO_COMPRESS_OFF;
+ bcopy(src, dst, s_len);
+ }
+
+ *version = cpfunc;
+ /* CONSTCOND */
+ if (ZFS_HOST_BYTEORDER)
+ *version |= DDT_COMPRESS_BYTEORDER_MASK;
+
+ return (c_len + 1);
+}
+
+void
+ddt_decompress(uchar_t *src, void *dst, size_t s_len, size_t d_len)
+{
+ uchar_t version = *src++;
+ int cpfunc = version & DDT_COMPRESS_FUNCTION_MASK;
+ zio_compress_info_t *ci = &zio_compress_table[cpfunc];
+
+ if (ci->ci_decompress != NULL)
+ (void) ci->ci_decompress(src, dst, s_len, d_len, ci->ci_level);
+ else
+ bcopy(src, dst, d_len);
+
+ if (((version & DDT_COMPRESS_BYTEORDER_MASK) != 0) !=
+ (ZFS_HOST_BYTEORDER != 0))
+ byteswap_uint64_array(dst, d_len);
+}
+
+ddt_t *
+ddt_select(spa_t *spa, const blkptr_t *bp)
+{
+ return (spa->spa_ddt[BP_GET_CHECKSUM(bp)]);
+}
+
+void
+ddt_enter(ddt_t *ddt)
+{
+ mutex_enter(&ddt->ddt_lock);
+}
+
+void
+ddt_exit(ddt_t *ddt)
+{
+ mutex_exit(&ddt->ddt_lock);
+}
+
+void
+ddt_init(void)
+{
+ ddt_cache = kmem_cache_create("ddt_cache",
+ sizeof (ddt_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
+ ddt_entry_cache = kmem_cache_create("ddt_entry_cache",
+ sizeof (ddt_entry_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
+}
+
+void
+ddt_fini(void)
+{
+ kmem_cache_destroy(ddt_entry_cache);
+ kmem_cache_destroy(ddt_cache);
+}
+
+static ddt_entry_t *
+ddt_alloc(const ddt_key_t *ddk)
+{
+ ddt_entry_t *dde;
+
+ dde = kmem_cache_alloc(ddt_entry_cache, KM_SLEEP);
+ bzero(dde, sizeof (ddt_entry_t));
+ cv_init(&dde->dde_cv, NULL, CV_DEFAULT, NULL);
+
+ dde->dde_key = *ddk;
+
+ return (dde);
+}
+
+static void
+ddt_free(ddt_entry_t *dde)
+{
+ ASSERT(!dde->dde_loading);
+
+ for (int p = 0; p < DDT_PHYS_TYPES; p++)
+ ASSERT(dde->dde_lead_zio[p] == NULL);
+
+ if (dde->dde_repair_abd != NULL)
+ abd_free(dde->dde_repair_abd);
+
+ cv_destroy(&dde->dde_cv);
+ kmem_cache_free(ddt_entry_cache, dde);
+}
+
+void
+ddt_remove(ddt_t *ddt, ddt_entry_t *dde)
+{
+ ASSERT(MUTEX_HELD(&ddt->ddt_lock));
+
+ avl_remove(&ddt->ddt_tree, dde);
+ ddt_free(dde);
+}
+
+ddt_entry_t *
+ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add)
+{
+ ddt_entry_t *dde, dde_search;
+ enum ddt_type type;
+ enum ddt_class class;
+ avl_index_t where;
+ int error;
+
+ ASSERT(MUTEX_HELD(&ddt->ddt_lock));
+
+ ddt_key_fill(&dde_search.dde_key, bp);
+
+ dde = avl_find(&ddt->ddt_tree, &dde_search, &where);
+ if (dde == NULL) {
+ if (!add)
+ return (NULL);
+ dde = ddt_alloc(&dde_search.dde_key);
+ avl_insert(&ddt->ddt_tree, dde, where);
+ }
+
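+	/*
+	 * Only one thread loads a given entry from the on-disk tables;
+	 * anyone else who finds it with dde_loading set waits on dde_cv.
+	 */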
+ while (dde->dde_loading)
+ cv_wait(&dde->dde_cv, &ddt->ddt_lock);
+
+ if (dde->dde_loaded)
+ return (dde);
+
+ dde->dde_loading = B_TRUE;
+
+ ddt_exit(ddt);
+
+ error = ENOENT;
+
+ for (type = 0; type < DDT_TYPES; type++) {
+ for (class = 0; class < DDT_CLASSES; class++) {
+ error = ddt_object_lookup(ddt, type, class, dde);
+ if (error != ENOENT) {
+ ASSERT0(error);
+ break;
+ }
+ }
+ if (error != ENOENT)
+ break;
+ }
+
+ ddt_enter(ddt);
+
+ ASSERT(dde->dde_loaded == B_FALSE);
+ ASSERT(dde->dde_loading == B_TRUE);
+
+ dde->dde_type = type; /* will be DDT_TYPES if no entry found */
+ dde->dde_class = class; /* will be DDT_CLASSES if no entry found */
+ dde->dde_loaded = B_TRUE;
+ dde->dde_loading = B_FALSE;
+
+ if (error == 0)
+ ddt_stat_update(ddt, dde, -1ULL);
+
+ cv_broadcast(&dde->dde_cv);
+
+ return (dde);
+}
+
+void
+ddt_prefetch(spa_t *spa, const blkptr_t *bp)
+{
+ ddt_t *ddt;
+ ddt_entry_t dde;
+
+ if (!zfs_dedup_prefetch || bp == NULL || !BP_GET_DEDUP(bp))
+ return;
+
+ /*
+ * We only remove the DDT once all tables are empty and only
+ * prefetch dedup blocks when there are entries in the DDT.
+ * Thus no locking is required as the DDT can't disappear on us.
+ */
+ ddt = ddt_select(spa, bp);
+ ddt_key_fill(&dde.dde_key, bp);
+
+ for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
+ for (enum ddt_class class = 0; class < DDT_CLASSES; class++) {
+ ddt_object_prefetch(ddt, type, class, &dde);
+ }
+ }
+}
+
+/*
+ * Opaque struct used for ddt_key comparison
+ */
+#define DDT_KEY_CMP_LEN (sizeof (ddt_key_t) / sizeof (uint16_t))
+
+typedef struct ddt_key_cmp {
+ uint16_t u16[DDT_KEY_CMP_LEN];
+} ddt_key_cmp_t;
+
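+/*
+ * Comparing the key 16 bits at a time keeps each int32_t difference exact
+ * (no overflow); any consistent total order is sufficient for the AVL trees.
+ */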
+int
+ddt_entry_compare(const void *x1, const void *x2)
+{
+ const ddt_entry_t *dde1 = x1;
+ const ddt_entry_t *dde2 = x2;
+ const ddt_key_cmp_t *k1 = (const ddt_key_cmp_t *)&dde1->dde_key;
+ const ddt_key_cmp_t *k2 = (const ddt_key_cmp_t *)&dde2->dde_key;
+ int32_t cmp = 0;
+
+ for (int i = 0; i < DDT_KEY_CMP_LEN; i++) {
+ cmp = (int32_t)k1->u16[i] - (int32_t)k2->u16[i];
+ if (likely(cmp))
+ break;
+ }
+
+ return (TREE_ISIGN(cmp));
+}
+
+static ddt_t *
+ddt_table_alloc(spa_t *spa, enum zio_checksum c)
+{
+ ddt_t *ddt;
+
+ ddt = kmem_cache_alloc(ddt_cache, KM_SLEEP);
+ bzero(ddt, sizeof (ddt_t));
+
+ mutex_init(&ddt->ddt_lock, NULL, MUTEX_DEFAULT, NULL);
+ avl_create(&ddt->ddt_tree, ddt_entry_compare,
+ sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node));
+ avl_create(&ddt->ddt_repair_tree, ddt_entry_compare,
+ sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node));
+ ddt->ddt_checksum = c;
+ ddt->ddt_spa = spa;
+ ddt->ddt_os = spa->spa_meta_objset;
+
+ return (ddt);
+}
+
+static void
+ddt_table_free(ddt_t *ddt)
+{
+ ASSERT(avl_numnodes(&ddt->ddt_tree) == 0);
+ ASSERT(avl_numnodes(&ddt->ddt_repair_tree) == 0);
+ avl_destroy(&ddt->ddt_tree);
+ avl_destroy(&ddt->ddt_repair_tree);
+ mutex_destroy(&ddt->ddt_lock);
+ kmem_cache_free(ddt_cache, ddt);
+}
+
+void
+ddt_create(spa_t *spa)
+{
+ spa->spa_dedup_checksum = ZIO_DEDUPCHECKSUM;
+
+ for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++)
+ spa->spa_ddt[c] = ddt_table_alloc(spa, c);
+}
+
+int
+ddt_load(spa_t *spa)
+{
+ int error;
+
+ ddt_create(spa);
+
+ error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_DDT_STATS, sizeof (uint64_t), 1,
+ &spa->spa_ddt_stat_object);
+
+ if (error)
+ return (error == ENOENT ? 0 : error);
+
+ for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
+ ddt_t *ddt = spa->spa_ddt[c];
+ for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
+ for (enum ddt_class class = 0; class < DDT_CLASSES;
+ class++) {
+ error = ddt_object_load(ddt, type, class);
+ if (error != 0 && error != ENOENT)
+ return (error);
+ }
+ }
+
+ /*
+ * Seed the cached histograms.
+ */
+ bcopy(ddt->ddt_histogram, &ddt->ddt_histogram_cache,
+ sizeof (ddt->ddt_histogram));
+ spa->spa_dedup_dspace = ~0ULL;
+ }
+
+ return (0);
+}
+
+void
+ddt_unload(spa_t *spa)
+{
+ for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
+ if (spa->spa_ddt[c]) {
+ ddt_table_free(spa->spa_ddt[c]);
+ spa->spa_ddt[c] = NULL;
+ }
+ }
+}
+
+boolean_t
+ddt_class_contains(spa_t *spa, enum ddt_class max_class, const blkptr_t *bp)
+{
+ ddt_t *ddt;
+ ddt_entry_t *dde;
+
+ if (!BP_GET_DEDUP(bp))
+ return (B_FALSE);
+
+ if (max_class == DDT_CLASS_UNIQUE)
+ return (B_TRUE);
+
+ ddt = spa->spa_ddt[BP_GET_CHECKSUM(bp)];
+ dde = kmem_cache_alloc(ddt_entry_cache, KM_SLEEP);
+
+ ddt_key_fill(&(dde->dde_key), bp);
+
+ for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
+ for (enum ddt_class class = 0; class <= max_class; class++) {
+ if (ddt_object_lookup(ddt, type, class, dde) == 0) {
+ kmem_cache_free(ddt_entry_cache, dde);
+ return (B_TRUE);
+ }
+ }
+ }
+
+ kmem_cache_free(ddt_entry_cache, dde);
+ return (B_FALSE);
+}
+
+ddt_entry_t *
+ddt_repair_start(ddt_t *ddt, const blkptr_t *bp)
+{
+ ddt_key_t ddk;
+ ddt_entry_t *dde;
+
+ ddt_key_fill(&ddk, bp);
+
+ dde = ddt_alloc(&ddk);
+
+ for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
+ for (enum ddt_class class = 0; class < DDT_CLASSES; class++) {
+ /*
+ * We can only do repair if there are multiple copies
+ * of the block. For anything in the UNIQUE class,
+ * there's definitely only one copy, so don't even try.
+ */
+ if (class != DDT_CLASS_UNIQUE &&
+ ddt_object_lookup(ddt, type, class, dde) == 0)
+ return (dde);
+ }
+ }
+
+ bzero(dde->dde_phys, sizeof (dde->dde_phys));
+
+ return (dde);
+}
+
+void
+ddt_repair_done(ddt_t *ddt, ddt_entry_t *dde)
+{
+ avl_index_t where;
+
+ ddt_enter(ddt);
+
+ if (dde->dde_repair_abd != NULL && spa_writeable(ddt->ddt_spa) &&
+ avl_find(&ddt->ddt_repair_tree, dde, &where) == NULL)
+ avl_insert(&ddt->ddt_repair_tree, dde, where);
+ else
+ ddt_free(dde);
+
+ ddt_exit(ddt);
+}
+
+static void
+ddt_repair_entry_done(zio_t *zio)
+{
+ ddt_entry_t *rdde = zio->io_private;
+
+ ddt_free(rdde);
+}
+
+static void
+ddt_repair_entry(ddt_t *ddt, ddt_entry_t *dde, ddt_entry_t *rdde, zio_t *rio)
+{
+ ddt_phys_t *ddp = dde->dde_phys;
+ ddt_phys_t *rddp = rdde->dde_phys;
+ ddt_key_t *ddk = &dde->dde_key;
+ ddt_key_t *rddk = &rdde->dde_key;
+ zio_t *zio;
+ blkptr_t blk;
+
+ zio = zio_null(rio, rio->io_spa, NULL,
+ ddt_repair_entry_done, rdde, rio->io_flags);
+
+ for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++, rddp++) {
+ if (ddp->ddp_phys_birth == 0 ||
+ ddp->ddp_phys_birth != rddp->ddp_phys_birth ||
+ bcmp(ddp->ddp_dva, rddp->ddp_dva, sizeof (ddp->ddp_dva)))
+ continue;
+ ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk);
+ zio_nowait(zio_rewrite(zio, zio->io_spa, 0, &blk,
+ rdde->dde_repair_abd, DDK_GET_PSIZE(rddk), NULL, NULL,
+ ZIO_PRIORITY_SYNC_WRITE, ZIO_DDT_CHILD_FLAGS(zio), NULL));
+ }
+
+ zio_nowait(zio);
+}
+
+static void
+ddt_repair_table(ddt_t *ddt, zio_t *rio)
+{
+ spa_t *spa = ddt->ddt_spa;
+ ddt_entry_t *dde, *rdde_next, *rdde;
+ avl_tree_t *t = &ddt->ddt_repair_tree;
+ blkptr_t blk;
+
+ if (spa_sync_pass(spa) > 1)
+ return;
+
+ ddt_enter(ddt);
+ for (rdde = avl_first(t); rdde != NULL; rdde = rdde_next) {
+ rdde_next = AVL_NEXT(t, rdde);
+ avl_remove(&ddt->ddt_repair_tree, rdde);
+ ddt_exit(ddt);
+ ddt_bp_create(ddt->ddt_checksum, &rdde->dde_key, NULL, &blk);
+ dde = ddt_repair_start(ddt, &blk);
+ ddt_repair_entry(ddt, dde, rdde, rio);
+ ddt_repair_done(ddt, dde);
+ ddt_enter(ddt);
+ }
+ ddt_exit(ddt);
+}
+
+static void
+ddt_sync_entry(ddt_t *ddt, ddt_entry_t *dde, dmu_tx_t *tx, uint64_t txg)
+{
+ dsl_pool_t *dp = ddt->ddt_spa->spa_dsl_pool;
+ ddt_phys_t *ddp = dde->dde_phys;
+ ddt_key_t *ddk = &dde->dde_key;
+ enum ddt_type otype = dde->dde_type;
+ enum ddt_type ntype = DDT_TYPE_CURRENT;
+ enum ddt_class oclass = dde->dde_class;
+ enum ddt_class nclass;
+ uint64_t total_refcnt = 0;
+
+ ASSERT(dde->dde_loaded);
+ ASSERT(!dde->dde_loading);
+
+ for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
+ ASSERT(dde->dde_lead_zio[p] == NULL);
+ if (ddp->ddp_phys_birth == 0) {
+ ASSERT(ddp->ddp_refcnt == 0);
+ continue;
+ }
+ if (p == DDT_PHYS_DITTO) {
+ /*
+ * Note, we no longer create DDT-DITTO blocks, but we
+ * don't want to leak any written by older software.
+ */
+ ddt_phys_free(ddt, ddk, ddp, txg);
+ continue;
+ }
+ if (ddp->ddp_refcnt == 0)
+ ddt_phys_free(ddt, ddk, ddp, txg);
+ total_refcnt += ddp->ddp_refcnt;
+ }
+
+ /* We do not create new DDT-DITTO blocks. */
+ ASSERT0(dde->dde_phys[DDT_PHYS_DITTO].ddp_phys_birth);
+ if (total_refcnt > 1)
+ nclass = DDT_CLASS_DUPLICATE;
+ else
+ nclass = DDT_CLASS_UNIQUE;
+
+ if (otype != DDT_TYPES &&
+ (otype != ntype || oclass != nclass || total_refcnt == 0)) {
+ VERIFY(ddt_object_remove(ddt, otype, oclass, dde, tx) == 0);
+ ASSERT(ddt_object_lookup(ddt, otype, oclass, dde) == ENOENT);
+ }
+
+ if (total_refcnt != 0) {
+ dde->dde_type = ntype;
+ dde->dde_class = nclass;
+ ddt_stat_update(ddt, dde, 0);
+ if (!ddt_object_exists(ddt, ntype, nclass))
+ ddt_object_create(ddt, ntype, nclass, tx);
+ VERIFY(ddt_object_update(ddt, ntype, nclass, dde, tx) == 0);
+
+ /*
+ * If the class changes, the order that we scan this bp
+ * changes. If it decreases, we could miss it, so
+ * scan it right now. (This covers both class changing
+ * while we are doing ddt_walk(), and when we are
+ * traversing.)
+ */
+ if (nclass < oclass) {
+ dsl_scan_ddt_entry(dp->dp_scan,
+ ddt->ddt_checksum, dde, tx);
+ }
+ }
+}
+
+static void
+ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx, uint64_t txg)
+{
+ spa_t *spa = ddt->ddt_spa;
+ ddt_entry_t *dde;
+ void *cookie = NULL;
+
+ if (avl_numnodes(&ddt->ddt_tree) == 0)
+ return;
+
+ ASSERT(spa->spa_uberblock.ub_version >= SPA_VERSION_DEDUP);
+
+ if (spa->spa_ddt_stat_object == 0) {
+ spa->spa_ddt_stat_object = zap_create_link(ddt->ddt_os,
+ DMU_OT_DDT_STATS, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_DDT_STATS, tx);
+ }
+
+ while ((dde = avl_destroy_nodes(&ddt->ddt_tree, &cookie)) != NULL) {
+ ddt_sync_entry(ddt, dde, tx, txg);
+ ddt_free(dde);
+ }
+
+ for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
+ uint64_t add, count = 0;
+ for (enum ddt_class class = 0; class < DDT_CLASSES; class++) {
+ if (ddt_object_exists(ddt, type, class)) {
+ ddt_object_sync(ddt, type, class, tx);
+ VERIFY(ddt_object_count(ddt, type, class,
+ &add) == 0);
+ count += add;
+ }
+ }
+ for (enum ddt_class class = 0; class < DDT_CLASSES; class++) {
+ if (count == 0 && ddt_object_exists(ddt, type, class))
+ ddt_object_destroy(ddt, type, class, tx);
+ }
+ }
+
+ bcopy(ddt->ddt_histogram, &ddt->ddt_histogram_cache,
+ sizeof (ddt->ddt_histogram));
+ spa->spa_dedup_dspace = ~0ULL;
+}
+
+void
+ddt_sync(spa_t *spa, uint64_t txg)
+{
+ dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;
+ dmu_tx_t *tx;
+ zio_t *rio;
+
+ ASSERT(spa_syncing_txg(spa) == txg);
+
+ tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
+
+ rio = zio_root(spa, NULL, NULL,
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SELF_HEAL);
+
+ /*
+ * This function may cause an immediate scan of ddt blocks (see
+ * the comment above dsl_scan_ddt() for details). We set the
+ * scan's root zio here so that we can wait for any scan IOs in
+ * addition to the regular ddt IOs.
+ */
+ ASSERT3P(scn->scn_zio_root, ==, NULL);
+ scn->scn_zio_root = rio;
+
+ for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
+ ddt_t *ddt = spa->spa_ddt[c];
+ if (ddt == NULL)
+ continue;
+ ddt_sync_table(ddt, tx, txg);
+ ddt_repair_table(ddt, rio);
+ }
+
+ (void) zio_wait(rio);
+ scn->scn_zio_root = NULL;
+
+ dmu_tx_commit(tx);
+}
+
+int
+ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_entry_t *dde)
+{
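+	/*
+	 * Resume from the bookmark: the innermost loop advances the checksum,
+	 * then the type, then the class, so every DDT object is visited once.
+	 */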
+ do {
+ do {
+ do {
+ ddt_t *ddt = spa->spa_ddt[ddb->ddb_checksum];
+ int error = ENOENT;
+ if (ddt_object_exists(ddt, ddb->ddb_type,
+ ddb->ddb_class)) {
+ error = ddt_object_walk(ddt,
+ ddb->ddb_type, ddb->ddb_class,
+ &ddb->ddb_cursor, dde);
+ }
+ dde->dde_type = ddb->ddb_type;
+ dde->dde_class = ddb->ddb_class;
+ if (error == 0)
+ return (0);
+ if (error != ENOENT)
+ return (error);
+ ddb->ddb_cursor = 0;
+ } while (++ddb->ddb_checksum < ZIO_CHECKSUM_FUNCTIONS);
+ ddb->ddb_checksum = 0;
+ } while (++ddb->ddb_type < DDT_TYPES);
+ ddb->ddb_type = 0;
+ } while (++ddb->ddb_class < DDT_CLASSES);
+
+ return (SET_ERROR(ENOENT));
+}
+
+/* BEGIN CSTYLED */
+ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, prefetch, INT, ZMOD_RW,
+ "Enable prefetching dedup-ed blks");
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/zfs/ddt_zap.c b/sys/contrib/openzfs/module/zfs/ddt_zap.c
new file mode 100644
index 000000000000..c5c9eda0b2d0
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/ddt_zap.c
@@ -0,0 +1,168 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2018 by Delphix. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/zio.h>
+#include <sys/ddt.h>
+#include <sys/zap.h>
+#include <sys/dmu_tx.h>
+
+int ddt_zap_leaf_blockshift = 12;
+int ddt_zap_indirect_blockshift = 12;
+
+static int
+ddt_zap_create(objset_t *os, uint64_t *objectp, dmu_tx_t *tx, boolean_t prehash)
+{
+ zap_flags_t flags = ZAP_FLAG_HASH64 | ZAP_FLAG_UINT64_KEY;
+
+ if (prehash)
+ flags |= ZAP_FLAG_PRE_HASHED_KEY;
+
+ *objectp = zap_create_flags(os, 0, flags, DMU_OT_DDT_ZAP,
+ ddt_zap_leaf_blockshift, ddt_zap_indirect_blockshift,
+ DMU_OT_NONE, 0, tx);
+
+ return (*objectp == 0 ? SET_ERROR(ENOTSUP) : 0);
+}
+
+static int
+ddt_zap_destroy(objset_t *os, uint64_t object, dmu_tx_t *tx)
+{
+ return (zap_destroy(os, object, tx));
+}
+
+static int
+ddt_zap_lookup(objset_t *os, uint64_t object, ddt_entry_t *dde)
+{
+ uchar_t *cbuf;
+ uint64_t one, csize;
+ int error;
+
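+	/* One extra byte for the header prepended by ddt_compress(). */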
+ cbuf = kmem_alloc(sizeof (dde->dde_phys) + 1, KM_SLEEP);
+
+ error = zap_length_uint64(os, object, (uint64_t *)&dde->dde_key,
+ DDT_KEY_WORDS, &one, &csize);
+ if (error)
+ goto out;
+
+ ASSERT(one == 1);
+ ASSERT(csize <= (sizeof (dde->dde_phys) + 1));
+
+ error = zap_lookup_uint64(os, object, (uint64_t *)&dde->dde_key,
+ DDT_KEY_WORDS, 1, csize, cbuf);
+ if (error)
+ goto out;
+
+ ddt_decompress(cbuf, dde->dde_phys, csize, sizeof (dde->dde_phys));
+out:
+ kmem_free(cbuf, sizeof (dde->dde_phys) + 1);
+
+ return (error);
+}
+
+static void
+ddt_zap_prefetch(objset_t *os, uint64_t object, ddt_entry_t *dde)
+{
+ (void) zap_prefetch_uint64(os, object, (uint64_t *)&dde->dde_key,
+ DDT_KEY_WORDS);
+}
+
+static int
+ddt_zap_update(objset_t *os, uint64_t object, ddt_entry_t *dde, dmu_tx_t *tx)
+{
+ uchar_t cbuf[sizeof (dde->dde_phys) + 1];
+ uint64_t csize;
+
+ csize = ddt_compress(dde->dde_phys, cbuf,
+ sizeof (dde->dde_phys), sizeof (cbuf));
+
+ return (zap_update_uint64(os, object, (uint64_t *)&dde->dde_key,
+ DDT_KEY_WORDS, 1, csize, cbuf, tx));
+}
+
+static int
+ddt_zap_remove(objset_t *os, uint64_t object, ddt_entry_t *dde, dmu_tx_t *tx)
+{
+ return (zap_remove_uint64(os, object, (uint64_t *)&dde->dde_key,
+ DDT_KEY_WORDS, tx));
+}
+
+static int
+ddt_zap_walk(objset_t *os, uint64_t object, ddt_entry_t *dde, uint64_t *walk)
+{
+ zap_cursor_t zc;
+ zap_attribute_t za;
+ int error;
+
+ if (*walk == 0) {
+ /*
+ * We don't want to prefetch the entire ZAP object, because
+ * it can be enormous. Also the primary use of DDT iteration
+ * is for scrubbing, in which case we will be issuing many
+ * scrub I/Os for each ZAP block that we read in, so
+ * reading the ZAP is unlikely to be the bottleneck.
+ */
+ zap_cursor_init_noprefetch(&zc, os, object);
+ } else {
+ zap_cursor_init_serialized(&zc, os, object, *walk);
+ }
+ if ((error = zap_cursor_retrieve(&zc, &za)) == 0) {
+ uchar_t cbuf[sizeof (dde->dde_phys) + 1];
+ uint64_t csize = za.za_num_integers;
+ ASSERT(za.za_integer_length == 1);
+ error = zap_lookup_uint64(os, object, (uint64_t *)za.za_name,
+ DDT_KEY_WORDS, 1, csize, cbuf);
+ ASSERT(error == 0);
+ if (error == 0) {
+ ddt_decompress(cbuf, dde->dde_phys, csize,
+ sizeof (dde->dde_phys));
+ dde->dde_key = *(ddt_key_t *)za.za_name;
+ }
+ zap_cursor_advance(&zc);
+ *walk = zap_cursor_serialize(&zc);
+ }
+ zap_cursor_fini(&zc);
+ return (error);
+}
+
+static int
+ddt_zap_count(objset_t *os, uint64_t object, uint64_t *count)
+{
+ return (zap_count(os, object, count));
+}
+
+const ddt_ops_t ddt_zap_ops = {
+ "zap",
+ ddt_zap_create,
+ ddt_zap_destroy,
+ ddt_zap_lookup,
+ ddt_zap_prefetch,
+ ddt_zap_update,
+ ddt_zap_remove,
+ ddt_zap_walk,
+ ddt_zap_count,
+};
diff --git a/sys/contrib/openzfs/module/zfs/dmu.c b/sys/contrib/openzfs/module/zfs/dmu.c
new file mode 100644
index 000000000000..ed345f0b6ec3
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/dmu.c
@@ -0,0 +1,2333 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2020 by Delphix. All rights reserved.
+ * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2016, Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2015 by Chunwei Chen. All rights reserved.
+ * Copyright (c) 2019 Datto Inc.
+ * Copyright (c) 2019, Klara Inc.
+ * Copyright (c) 2019, Allan Jude
+ */
+
+#include <sys/dmu.h>
+#include <sys/dmu_impl.h>
+#include <sys/dmu_tx.h>
+#include <sys/dbuf.h>
+#include <sys/dnode.h>
+#include <sys/zfs_context.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_traverse.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_synctask.h>
+#include <sys/dsl_prop.h>
+#include <sys/dmu_zfetch.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zap.h>
+#include <sys/zio_checksum.h>
+#include <sys/zio_compress.h>
+#include <sys/sa.h>
+#include <sys/zfeature.h>
+#include <sys/abd.h>
+#include <sys/trace_zfs.h>
+#include <sys/zfs_rlock.h>
+#ifdef _KERNEL
+#include <sys/vmsystm.h>
+#include <sys/zfs_znode.h>
+#endif
+
+/*
+ * Enable/disable nopwrite feature.
+ */
+int zfs_nopwrite_enabled = 1;
+
+/*
+ * Tunable to control percentage of dirtied L1 blocks from frees allowed into
+ * one TXG. After this threshold is crossed, additional dirty blocks from frees
+ * will wait until the next TXG.
+ * A value of zero will disable this throttle.
+ */
+unsigned long zfs_per_txg_dirty_frees_percent = 5;
+
+/*
+ * Enable/disable forcing txg sync when dirty in dmu_offset_next.
+ */
+int zfs_dmu_offset_next_sync = 0;
+
+/*
+ * Limit the amount we can prefetch with one call to this amount. This
+ * helps to limit the amount of memory that can be used by prefetching.
+ * Larger objects should be prefetched a bit at a time.
+ */
+int dmu_prefetch_max = 8 * SPA_MAXBLOCKSIZE;
+
+const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
+ {DMU_BSWAP_UINT8, TRUE, FALSE, FALSE, "unallocated" },
+ {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "object directory" },
+ {DMU_BSWAP_UINT64, TRUE, TRUE, FALSE, "object array" },
+ {DMU_BSWAP_UINT8, TRUE, FALSE, FALSE, "packed nvlist" },
+ {DMU_BSWAP_UINT64, TRUE, FALSE, FALSE, "packed nvlist size" },
+ {DMU_BSWAP_UINT64, TRUE, FALSE, FALSE, "bpobj" },
+ {DMU_BSWAP_UINT64, TRUE, FALSE, FALSE, "bpobj header" },
+ {DMU_BSWAP_UINT64, TRUE, FALSE, FALSE, "SPA space map header" },
+ {DMU_BSWAP_UINT64, TRUE, FALSE, FALSE, "SPA space map" },
+ {DMU_BSWAP_UINT64, TRUE, FALSE, TRUE, "ZIL intent log" },
+ {DMU_BSWAP_DNODE, TRUE, FALSE, TRUE, "DMU dnode" },
+ {DMU_BSWAP_OBJSET, TRUE, TRUE, FALSE, "DMU objset" },
+ {DMU_BSWAP_UINT64, TRUE, TRUE, FALSE, "DSL directory" },
+ {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "DSL directory child map"},
+ {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "DSL dataset snap map" },
+ {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "DSL props" },
+ {DMU_BSWAP_UINT64, TRUE, TRUE, FALSE, "DSL dataset" },
+ {DMU_BSWAP_ZNODE, TRUE, FALSE, FALSE, "ZFS znode" },
+ {DMU_BSWAP_OLDACL, TRUE, FALSE, TRUE, "ZFS V0 ACL" },
+ {DMU_BSWAP_UINT8, FALSE, FALSE, TRUE, "ZFS plain file" },
+ {DMU_BSWAP_ZAP, TRUE, FALSE, TRUE, "ZFS directory" },
+ {DMU_BSWAP_ZAP, TRUE, FALSE, FALSE, "ZFS master node" },
+ {DMU_BSWAP_ZAP, TRUE, FALSE, TRUE, "ZFS delete queue" },
+ {DMU_BSWAP_UINT8, FALSE, FALSE, TRUE, "zvol object" },
+ {DMU_BSWAP_ZAP, TRUE, FALSE, FALSE, "zvol prop" },
+ {DMU_BSWAP_UINT8, FALSE, FALSE, TRUE, "other uint8[]" },
+ {DMU_BSWAP_UINT64, FALSE, FALSE, TRUE, "other uint64[]" },
+ {DMU_BSWAP_ZAP, TRUE, FALSE, FALSE, "other ZAP" },
+ {DMU_BSWAP_ZAP, TRUE, FALSE, FALSE, "persistent error log" },
+ {DMU_BSWAP_UINT8, TRUE, FALSE, FALSE, "SPA history" },
+ {DMU_BSWAP_UINT64, TRUE, FALSE, FALSE, "SPA history offsets" },
+ {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "Pool properties" },
+ {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "DSL permissions" },
+ {DMU_BSWAP_ACL, TRUE, FALSE, TRUE, "ZFS ACL" },
+ {DMU_BSWAP_UINT8, TRUE, FALSE, TRUE, "ZFS SYSACL" },
+ {DMU_BSWAP_UINT8, TRUE, FALSE, TRUE, "FUID table" },
+ {DMU_BSWAP_UINT64, TRUE, FALSE, FALSE, "FUID table size" },
+ {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "DSL dataset next clones"},
+ {DMU_BSWAP_ZAP, TRUE, FALSE, FALSE, "scan work queue" },
+ {DMU_BSWAP_ZAP, TRUE, FALSE, TRUE, "ZFS user/group/project used" },
+ {DMU_BSWAP_ZAP, TRUE, FALSE, TRUE, "ZFS user/group/project quota"},
+ {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "snapshot refcount tags"},
+ {DMU_BSWAP_ZAP, TRUE, FALSE, FALSE, "DDT ZAP algorithm" },
+ {DMU_BSWAP_ZAP, TRUE, FALSE, FALSE, "DDT statistics" },
+ {DMU_BSWAP_UINT8, TRUE, FALSE, TRUE, "System attributes" },
+ {DMU_BSWAP_ZAP, TRUE, FALSE, TRUE, "SA master node" },
+ {DMU_BSWAP_ZAP, TRUE, FALSE, TRUE, "SA attr registration" },
+ {DMU_BSWAP_ZAP, TRUE, FALSE, TRUE, "SA attr layouts" },
+ {DMU_BSWAP_ZAP, TRUE, FALSE, FALSE, "scan translations" },
+ {DMU_BSWAP_UINT8, FALSE, FALSE, TRUE, "deduplicated block" },
+ {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "DSL deadlist map" },
+ {DMU_BSWAP_UINT64, TRUE, TRUE, FALSE, "DSL deadlist map hdr" },
+ {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "DSL dir clones" },
+ {DMU_BSWAP_UINT64, TRUE, FALSE, FALSE, "bpobj subobj" }
+};
+
+const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS] = {
+ { byteswap_uint8_array, "uint8" },
+ { byteswap_uint16_array, "uint16" },
+ { byteswap_uint32_array, "uint32" },
+ { byteswap_uint64_array, "uint64" },
+ { zap_byteswap, "zap" },
+ { dnode_buf_byteswap, "dnode" },
+ { dmu_objset_byteswap, "objset" },
+ { zfs_znode_byteswap, "znode" },
+ { zfs_oldacl_byteswap, "oldacl" },
+ { zfs_acl_byteswap, "acl" }
+};
+
+static int
+dmu_buf_hold_noread_by_dnode(dnode_t *dn, uint64_t offset,
+ void *tag, dmu_buf_t **dbp)
+{
+ uint64_t blkid;
+ dmu_buf_impl_t *db;
+
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ blkid = dbuf_whichblock(dn, 0, offset);
+ db = dbuf_hold(dn, blkid, tag);
+ rw_exit(&dn->dn_struct_rwlock);
+
+ if (db == NULL) {
+ *dbp = NULL;
+ return (SET_ERROR(EIO));
+ }
+
+ *dbp = &db->db;
+ return (0);
+}
+int
+dmu_buf_hold_noread(objset_t *os, uint64_t object, uint64_t offset,
+ void *tag, dmu_buf_t **dbp)
+{
+ dnode_t *dn;
+ uint64_t blkid;
+ dmu_buf_impl_t *db;
+ int err;
+
+ err = dnode_hold(os, object, FTAG, &dn);
+ if (err)
+ return (err);
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ blkid = dbuf_whichblock(dn, 0, offset);
+ db = dbuf_hold(dn, blkid, tag);
+ rw_exit(&dn->dn_struct_rwlock);
+ dnode_rele(dn, FTAG);
+
+ if (db == NULL) {
+ *dbp = NULL;
+ return (SET_ERROR(EIO));
+ }
+
+ *dbp = &db->db;
+ return (err);
+}
+
+int
+dmu_buf_hold_by_dnode(dnode_t *dn, uint64_t offset,
+ void *tag, dmu_buf_t **dbp, int flags)
+{
+ int err;
+ int db_flags = DB_RF_CANFAIL;
+
+ if (flags & DMU_READ_NO_PREFETCH)
+ db_flags |= DB_RF_NOPREFETCH;
+ if (flags & DMU_READ_NO_DECRYPT)
+ db_flags |= DB_RF_NO_DECRYPT;
+
+ err = dmu_buf_hold_noread_by_dnode(dn, offset, tag, dbp);
+ if (err == 0) {
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)(*dbp);
+ err = dbuf_read(db, NULL, db_flags);
+ if (err != 0) {
+ dbuf_rele(db, tag);
+ *dbp = NULL;
+ }
+ }
+
+ return (err);
+}
+
+int
+dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
+ void *tag, dmu_buf_t **dbp, int flags)
+{
+ int err;
+ int db_flags = DB_RF_CANFAIL;
+
+ if (flags & DMU_READ_NO_PREFETCH)
+ db_flags |= DB_RF_NOPREFETCH;
+ if (flags & DMU_READ_NO_DECRYPT)
+ db_flags |= DB_RF_NO_DECRYPT;
+
+ err = dmu_buf_hold_noread(os, object, offset, tag, dbp);
+ if (err == 0) {
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)(*dbp);
+ err = dbuf_read(db, NULL, db_flags);
+ if (err != 0) {
+ dbuf_rele(db, tag);
+ *dbp = NULL;
+ }
+ }
+
+ return (err);
+}
+
+int
+dmu_bonus_max(void)
+{
+ return (DN_OLD_MAX_BONUSLEN);
+}
+
+int
+dmu_set_bonus(dmu_buf_t *db_fake, int newsize, dmu_tx_t *tx)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+ dnode_t *dn;
+ int error;
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+
+ if (dn->dn_bonus != db) {
+ error = SET_ERROR(EINVAL);
+ } else if (newsize < 0 || newsize > db_fake->db_size) {
+ error = SET_ERROR(EINVAL);
+ } else {
+ dnode_setbonuslen(dn, newsize, tx);
+ error = 0;
+ }
+
+ DB_DNODE_EXIT(db);
+ return (error);
+}
+
+int
+dmu_set_bonustype(dmu_buf_t *db_fake, dmu_object_type_t type, dmu_tx_t *tx)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+ dnode_t *dn;
+ int error;
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+
+ if (!DMU_OT_IS_VALID(type)) {
+ error = SET_ERROR(EINVAL);
+ } else if (dn->dn_bonus != db) {
+ error = SET_ERROR(EINVAL);
+ } else {
+ dnode_setbonus_type(dn, type, tx);
+ error = 0;
+ }
+
+ DB_DNODE_EXIT(db);
+ return (error);
+}
+
+dmu_object_type_t
+dmu_get_bonustype(dmu_buf_t *db_fake)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+ dnode_t *dn;
+ dmu_object_type_t type;
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ type = dn->dn_bonustype;
+ DB_DNODE_EXIT(db);
+
+ return (type);
+}
+
+int
+dmu_rm_spill(objset_t *os, uint64_t object, dmu_tx_t *tx)
+{
+ dnode_t *dn;
+ int error;
+
+ error = dnode_hold(os, object, FTAG, &dn);
+ dbuf_rm_spill(dn, tx);
+ rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+ dnode_rm_spill(dn, tx);
+ rw_exit(&dn->dn_struct_rwlock);
+ dnode_rele(dn, FTAG);
+ return (error);
+}
+
+/*
+ * Lookup and hold the bonus buffer for the provided dnode. If the dnode
+ * has not yet been allocated a bonus dbuf, a new one will be allocated.
+ * Returns ENOENT, EIO, or 0.
+ */
+int dmu_bonus_hold_by_dnode(dnode_t *dn, void *tag, dmu_buf_t **dbp,
+ uint32_t flags)
+{
+ dmu_buf_impl_t *db;
+ int error;
+ uint32_t db_flags = DB_RF_MUST_SUCCEED;
+
+ if (flags & DMU_READ_NO_PREFETCH)
+ db_flags |= DB_RF_NOPREFETCH;
+ if (flags & DMU_READ_NO_DECRYPT)
+ db_flags |= DB_RF_NO_DECRYPT;
+
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ if (dn->dn_bonus == NULL) {
+ rw_exit(&dn->dn_struct_rwlock);
+ rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+ if (dn->dn_bonus == NULL)
+ dbuf_create_bonus(dn);
+ }
+ db = dn->dn_bonus;
+
+ /* as long as the bonus buf is held, the dnode will be held */
+ if (zfs_refcount_add(&db->db_holds, tag) == 1) {
+ VERIFY(dnode_add_ref(dn, db));
+ atomic_inc_32(&dn->dn_dbufs_count);
+ }
+
+ /*
+ * Wait to drop dn_struct_rwlock until after adding the bonus dbuf's
+ * hold and incrementing the dbuf count to ensure that dnode_move() sees
+ * a dnode hold for every dbuf.
+ */
+ rw_exit(&dn->dn_struct_rwlock);
+
+ error = dbuf_read(db, NULL, db_flags);
+ if (error) {
+ dnode_evict_bonus(dn);
+ dbuf_rele(db, tag);
+ *dbp = NULL;
+ return (error);
+ }
+
+ *dbp = &db->db;
+ return (0);
+}
+
+int
+dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **dbp)
+{
+ dnode_t *dn;
+ int error;
+
+ error = dnode_hold(os, object, FTAG, &dn);
+ if (error)
+ return (error);
+
+ error = dmu_bonus_hold_by_dnode(dn, tag, dbp, DMU_READ_NO_PREFETCH);
+ dnode_rele(dn, FTAG);
+
+ return (error);
+}
+
+/*
+ * Returns ENOENT, EIO, or 0.
+ *
+ * This interface will allocate a blank spill dbuf when a spill block
+ * doesn't already exist on the dnode.
+ *
+ * If you only want to find an already existing spill dbuf, then
+ * dmu_spill_hold_existing() should be used.
+ */
+int
+dmu_spill_hold_by_dnode(dnode_t *dn, uint32_t flags, void *tag, dmu_buf_t **dbp)
+{
+ dmu_buf_impl_t *db = NULL;
+ int err;
+
+ if ((flags & DB_RF_HAVESTRUCT) == 0)
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+
+ db = dbuf_hold(dn, DMU_SPILL_BLKID, tag);
+
+ if ((flags & DB_RF_HAVESTRUCT) == 0)
+ rw_exit(&dn->dn_struct_rwlock);
+
+ if (db == NULL) {
+ *dbp = NULL;
+ return (SET_ERROR(EIO));
+ }
+ err = dbuf_read(db, NULL, flags);
+ if (err == 0)
+ *dbp = &db->db;
+ else {
+ dbuf_rele(db, tag);
+ *dbp = NULL;
+ }
+ return (err);
+}
+
+int
+dmu_spill_hold_existing(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus;
+ dnode_t *dn;
+ int err;
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+
+ if (spa_version(dn->dn_objset->os_spa) < SPA_VERSION_SA) {
+ err = SET_ERROR(EINVAL);
+ } else {
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+
+ if (!dn->dn_have_spill) {
+ err = SET_ERROR(ENOENT);
+ } else {
+ err = dmu_spill_hold_by_dnode(dn,
+ DB_RF_HAVESTRUCT | DB_RF_CANFAIL, tag, dbp);
+ }
+
+ rw_exit(&dn->dn_struct_rwlock);
+ }
+
+ DB_DNODE_EXIT(db);
+ return (err);
+}
+
+int
+dmu_spill_hold_by_bonus(dmu_buf_t *bonus, uint32_t flags, void *tag,
+ dmu_buf_t **dbp)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus;
+ dnode_t *dn;
+ int err;
+ uint32_t db_flags = DB_RF_CANFAIL;
+
+ if (flags & DMU_READ_NO_DECRYPT)
+ db_flags |= DB_RF_NO_DECRYPT;
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ err = dmu_spill_hold_by_dnode(dn, db_flags, tag, dbp);
+ DB_DNODE_EXIT(db);
+
+ return (err);
+}
+
+/*
+ * Note: longer-term, we should modify all of the dmu_buf_*() interfaces
+ * to take a held dnode rather than <os, object> -- the lookup is wasteful,
+ * and can induce severe lock contention when writing to several files
+ * whose dnodes are in the same block.
+ */
+int
+dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
+ boolean_t read, void *tag, int *numbufsp, dmu_buf_t ***dbpp, uint32_t flags)
+{
+ dmu_buf_t **dbp;
+ uint64_t blkid, nblks, i;
+ uint32_t dbuf_flags;
+ int err;
+ zio_t *zio = NULL;
+
+ ASSERT(length <= DMU_MAX_ACCESS);
+
+ /*
+ * Note: We directly notify the prefetch code of this read, so that
+ * we can tell it about the multi-block read. dbuf_read() only knows
+ * about the one block it is accessing.
+ */
+ dbuf_flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT | DB_RF_HAVESTRUCT |
+ DB_RF_NOPREFETCH;
+
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ if (dn->dn_datablkshift) {
+ int blkshift = dn->dn_datablkshift;
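+		/*
+		 * Round the end up and the start down to block boundaries to
+		 * count every block the range touches.
+		 */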
+ nblks = (P2ROUNDUP(offset + length, 1ULL << blkshift) -
+ P2ALIGN(offset, 1ULL << blkshift)) >> blkshift;
+ } else {
+ if (offset + length > dn->dn_datablksz) {
+ zfs_panic_recover("zfs: accessing past end of object "
+ "%llx/%llx (size=%u access=%llu+%llu)",
+ (longlong_t)dn->dn_objset->
+ os_dsl_dataset->ds_object,
+ (longlong_t)dn->dn_object, dn->dn_datablksz,
+ (longlong_t)offset, (longlong_t)length);
+ rw_exit(&dn->dn_struct_rwlock);
+ return (SET_ERROR(EIO));
+ }
+ nblks = 1;
+ }
+ dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP);
+
+ if (read)
+ zio = zio_root(dn->dn_objset->os_spa, NULL, NULL,
+ ZIO_FLAG_CANFAIL);
+ blkid = dbuf_whichblock(dn, 0, offset);
+ for (i = 0; i < nblks; i++) {
+ dmu_buf_impl_t *db = dbuf_hold(dn, blkid + i, tag);
+ if (db == NULL) {
+ rw_exit(&dn->dn_struct_rwlock);
+ dmu_buf_rele_array(dbp, nblks, tag);
+ if (read)
+ zio_nowait(zio);
+ return (SET_ERROR(EIO));
+ }
+
+ /* initiate async i/o */
+ if (read)
+ (void) dbuf_read(db, zio, dbuf_flags);
+ dbp[i] = &db->db;
+ }
+
+ if ((flags & DMU_READ_NO_PREFETCH) == 0 &&
+ DNODE_META_IS_CACHEABLE(dn) && length <= zfetch_array_rd_sz) {
+ dmu_zfetch(&dn->dn_zfetch, blkid, nblks,
+ read && DNODE_IS_CACHEABLE(dn), B_TRUE);
+ }
+ rw_exit(&dn->dn_struct_rwlock);
+
+ if (read) {
+ /* wait for async read i/o */
+ err = zio_wait(zio);
+ if (err) {
+ dmu_buf_rele_array(dbp, nblks, tag);
+ return (err);
+ }
+
+ /* wait for other io to complete */
+ for (i = 0; i < nblks; i++) {
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp[i];
+ mutex_enter(&db->db_mtx);
+ while (db->db_state == DB_READ ||
+ db->db_state == DB_FILL)
+ cv_wait(&db->db_changed, &db->db_mtx);
+ if (db->db_state == DB_UNCACHED)
+ err = SET_ERROR(EIO);
+ mutex_exit(&db->db_mtx);
+ if (err) {
+ dmu_buf_rele_array(dbp, nblks, tag);
+ return (err);
+ }
+ }
+ }
+
+ *numbufsp = nblks;
+ *dbpp = dbp;
+ return (0);
+}
+
+static int
+dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset,
+ uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp)
+{
+ dnode_t *dn;
+ int err;
+
+ err = dnode_hold(os, object, FTAG, &dn);
+ if (err)
+ return (err);
+
+ err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag,
+ numbufsp, dbpp, DMU_READ_PREFETCH);
+
+ dnode_rele(dn, FTAG);
+
+ return (err);
+}
+
+int
+dmu_buf_hold_array_by_bonus(dmu_buf_t *db_fake, uint64_t offset,
+ uint64_t length, boolean_t read, void *tag, int *numbufsp,
+ dmu_buf_t ***dbpp)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+ dnode_t *dn;
+ int err;
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag,
+ numbufsp, dbpp, DMU_READ_PREFETCH);
+ DB_DNODE_EXIT(db);
+
+ return (err);
+}
+
+void
+dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, void *tag)
+{
+ int i;
+ dmu_buf_impl_t **dbp = (dmu_buf_impl_t **)dbp_fake;
+
+ if (numbufs == 0)
+ return;
+
+ for (i = 0; i < numbufs; i++) {
+ if (dbp[i])
+ dbuf_rele(dbp[i], tag);
+ }
+
+ kmem_free(dbp, sizeof (dmu_buf_t *) * numbufs);
+}
+
+/*
+ * Issue prefetch i/os for the given blocks. If level is greater than 0, the
+ * indirect blocks prefetched will be those that point to the blocks containing
+ * the data starting at offset, and continuing to offset + len.
+ *
+ * Note that if the indirect blocks above the blocks being prefetched are not
+ * in cache, they will be asynchronously read in.
+ */
+void
+dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset,
+ uint64_t len, zio_priority_t pri)
+{
+ dnode_t *dn;
+ uint64_t blkid;
+ int nblks, err;
+
+ if (len == 0) { /* they're interested in the bonus buffer */
+ dn = DMU_META_DNODE(os);
+
+ if (object == 0 || object >= DN_MAX_OBJECT)
+ return;
+
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ blkid = dbuf_whichblock(dn, level,
+ object * sizeof (dnode_phys_t));
+ dbuf_prefetch(dn, level, blkid, pri, 0);
+ rw_exit(&dn->dn_struct_rwlock);
+ return;
+ }
+
+ /*
+ * See comment before the definition of dmu_prefetch_max.
+ */
+ len = MIN(len, dmu_prefetch_max);
+
+ /*
+ * XXX - Note, if the dnode for the requested object is not
+ * already cached, we will do a *synchronous* read in the
+ * dnode_hold() call. The same is true for any indirects.
+ */
+ err = dnode_hold(os, object, FTAG, &dn);
+ if (err != 0)
+ return;
+
+ /*
+ * offset + len - 1 is the last byte we want to prefetch for, and offset
+ * is the first. Then dbuf_whichblock(dn, level, offset + len - 1) is the
+ * last block we want to prefetch, and dbuf_whichblock(dn, level,
+ * offset) is the first. Then the number we need to prefetch is the
+ * last - first + 1.
+ */
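+	/*
+	 * For example (hypothetical values): with a 128K data block size
+	 * (datablkshift of 17), offset 100000 and len 300000 at level 0 give
+	 * first block 0 and last block 3, so nblks = 3 - 0 + 1 = 4.
+	 */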
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ if (level > 0 || dn->dn_datablkshift != 0) {
+ nblks = dbuf_whichblock(dn, level, offset + len - 1) -
+ dbuf_whichblock(dn, level, offset) + 1;
+ } else {
+ nblks = (offset < dn->dn_datablksz);
+ }
+
+ if (nblks != 0) {
+ blkid = dbuf_whichblock(dn, level, offset);
+ for (int i = 0; i < nblks; i++)
+ dbuf_prefetch(dn, level, blkid + i, pri, 0);
+ }
+ rw_exit(&dn->dn_struct_rwlock);
+
+ dnode_rele(dn, FTAG);
+}
+
+/*
+ * Get the next "chunk" of file data to free. We traverse the file from
+ * the end so that the file gets shorter over time (if we crash in the
+ * middle, this will leave us in a better state). We find allocated file
+ * data by simply searching the allocated level 1 indirects.
+ *
+ * On input, *start should be the first offset that does not need to be
+ * freed (e.g. "offset + length"). On return, *start will be the first
+ * offset that should be freed and l1blks is set to the number of level 1
+ * indirect blocks found within the chunk.
+ */
+static int
+get_next_chunk(dnode_t *dn, uint64_t *start, uint64_t minimum, uint64_t *l1blks)
+{
+ uint64_t blks;
+ uint64_t maxblks = DMU_MAX_ACCESS >> (dn->dn_indblkshift + 1);
+ /* bytes of data covered by a level-1 indirect block */
+ uint64_t iblkrange = (uint64_t)dn->dn_datablksz *
+ EPB(dn->dn_indblkshift, SPA_BLKPTRSHIFT);
+
+ ASSERT3U(minimum, <=, *start);
+
+ /*
+ * Check if we can free the entire range assuming that all of the
+ * L1 blocks in this range have data. If we can, we use this
+ * worst case value as an estimate so we can avoid having to look
+ * at the object's actual data.
+ */
+ uint64_t total_l1blks =
+ (roundup(*start, iblkrange) - (minimum / iblkrange * iblkrange)) /
+ iblkrange;
+ if (total_l1blks <= maxblks) {
+ *l1blks = total_l1blks;
+ *start = minimum;
+ return (0);
+ }
+ ASSERT(ISP2(iblkrange));
+
+ for (blks = 0; *start > minimum && blks < maxblks; blks++) {
+ int err;
+
+ /*
+ * dnode_next_offset(BACKWARDS) will find an allocated L1
+ * indirect block at or before the input offset. We must
+ * decrement *start so that it is at the end of the region
+ * to search.
+ */
+ (*start)--;
+
+ err = dnode_next_offset(dn,
+ DNODE_FIND_BACKWARDS, start, 2, 1, 0);
+
+ /* if there are no indirect blocks before start, we are done */
+ if (err == ESRCH) {
+ *start = minimum;
+ break;
+ } else if (err != 0) {
+ *l1blks = blks;
+ return (err);
+ }
+
+ /* set start to the beginning of this L1 indirect */
+ *start = P2ALIGN(*start, iblkrange);
+ }
+ if (*start < minimum)
+ *start = minimum;
+ *l1blks = blks;
+
+ return (0);
+}
+
+/*
+ * If this objset is of type DMU_OST_ZFS, return true if the vfs's unmounted
+ * flag is set; otherwise return false.
+ * Used below in dmu_free_long_range_impl() to allow aborting the free when
+ * the filesystem is being unmounted.
+ */
+/*ARGSUSED*/
+static boolean_t
+dmu_objset_zfs_unmounting(objset_t *os)
+{
+#ifdef _KERNEL
+ if (dmu_objset_type(os) == DMU_OST_ZFS)
+ return (zfs_get_vfs_flag_unmounted(os));
+#endif
+ return (B_FALSE);
+}
+
+static int
+dmu_free_long_range_impl(objset_t *os, dnode_t *dn, uint64_t offset,
+ uint64_t length)
+{
+ uint64_t object_size;
+ int err;
+ uint64_t dirty_frees_threshold;
+ dsl_pool_t *dp = dmu_objset_pool(os);
+
+ if (dn == NULL)
+ return (SET_ERROR(EINVAL));
+
+ object_size = (dn->dn_maxblkid + 1) * dn->dn_datablksz;
+ if (offset >= object_size)
+ return (0);
+
+ if (zfs_per_txg_dirty_frees_percent <= 100)
+ dirty_frees_threshold =
+ zfs_per_txg_dirty_frees_percent * zfs_dirty_data_max / 100;
+ else
+ dirty_frees_threshold = zfs_dirty_data_max / 20;
+
+ if (length == DMU_OBJECT_END || offset + length > object_size)
+ length = object_size - offset;
+
+ while (length != 0) {
+ uint64_t chunk_end, chunk_begin, chunk_len;
+ uint64_t l1blks;
+ dmu_tx_t *tx;
+
+ if (dmu_objset_zfs_unmounting(dn->dn_objset))
+ return (SET_ERROR(EINTR));
+
+ chunk_end = chunk_begin = offset + length;
+
+ /* move chunk_begin backwards to the beginning of this chunk */
+ err = get_next_chunk(dn, &chunk_begin, offset, &l1blks);
+ if (err)
+ return (err);
+ ASSERT3U(chunk_begin, >=, offset);
+ ASSERT3U(chunk_begin, <=, chunk_end);
+
+ chunk_len = chunk_end - chunk_begin;
+
+ tx = dmu_tx_create(os);
+ dmu_tx_hold_free(tx, dn->dn_object, chunk_begin, chunk_len);
+
+ /*
+ * Mark this transaction as typically resulting in a net
+ * reduction in space used.
+ */
+ dmu_tx_mark_netfree(tx);
+ err = dmu_tx_assign(tx, TXG_WAIT);
+ if (err) {
+ dmu_tx_abort(tx);
+ return (err);
+ }
+
+ uint64_t txg = dmu_tx_get_txg(tx);
+
+ mutex_enter(&dp->dp_lock);
+ uint64_t long_free_dirty =
+ dp->dp_long_free_dirty_pertxg[txg & TXG_MASK];
+ mutex_exit(&dp->dp_lock);
+
+ /*
+ * To avoid filling up a TXG with just frees, wait for
+ * the next TXG to open before freeing more chunks if
+ * we have reached the threshold of frees.
+ */
+ if (dirty_frees_threshold != 0 &&
+ long_free_dirty >= dirty_frees_threshold) {
+ DMU_TX_STAT_BUMP(dmu_tx_dirty_frees_delay);
+ dmu_tx_commit(tx);
+ txg_wait_open(dp, 0, B_TRUE);
+ continue;
+ }
+
+ /*
+ * In order to prevent unnecessary write throttling, for each
+ * TXG, we track the cumulative size of L1 blocks being dirtied
+ * in dnode_free_range() below. We compare this number to a
+ * tunable threshold, past which we prevent new L1 dirty freeing
+ * blocks from being added into the open TXG. See
+ * dmu_free_long_range_impl() for details. The threshold
+ * prevents write throttle activation due to dirty freeing L1
+ * blocks taking up a large percentage of zfs_dirty_data_max.
+ */
+ mutex_enter(&dp->dp_lock);
+ dp->dp_long_free_dirty_pertxg[txg & TXG_MASK] +=
+ l1blks << dn->dn_indblkshift;
+ mutex_exit(&dp->dp_lock);
+ DTRACE_PROBE3(free__long__range,
+ uint64_t, long_free_dirty, uint64_t, chunk_len,
+ uint64_t, txg);
+ dnode_free_range(dn, chunk_begin, chunk_len, tx);
+
+ dmu_tx_commit(tx);
+
+ length -= chunk_len;
+ }
+ return (0);
+}
+
+int
+dmu_free_long_range(objset_t *os, uint64_t object,
+ uint64_t offset, uint64_t length)
+{
+ dnode_t *dn;
+ int err;
+
+ err = dnode_hold(os, object, FTAG, &dn);
+ if (err != 0)
+ return (err);
+ err = dmu_free_long_range_impl(os, dn, offset, length);
+
+ /*
+ * It is important to zero out the maxblkid when freeing the entire
+ * file, so that (a) subsequent calls to dmu_free_long_range_impl()
+ * will take the fast path, and (b) dnode_reallocate() can verify
+ * that the entire file has been freed.
+ */
+ if (err == 0 && offset == 0 && length == DMU_OBJECT_END)
+ dn->dn_maxblkid = 0;
+
+ dnode_rele(dn, FTAG);
+ return (err);
+}
+
+int
+dmu_free_long_object(objset_t *os, uint64_t object)
+{
+ dmu_tx_t *tx;
+ int err;
+
+ err = dmu_free_long_range(os, object, 0, DMU_OBJECT_END);
+ if (err != 0)
+ return (err);
+
+ tx = dmu_tx_create(os);
+ dmu_tx_hold_bonus(tx, object);
+ dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END);
+ dmu_tx_mark_netfree(tx);
+ err = dmu_tx_assign(tx, TXG_WAIT);
+ if (err == 0) {
+		err = dmu_object_free(os, object, tx);
+ dmu_tx_commit(tx);
+ } else {
+ dmu_tx_abort(tx);
+ }
+
+ return (err);
+}
+
+int
+dmu_free_range(objset_t *os, uint64_t object, uint64_t offset,
+ uint64_t size, dmu_tx_t *tx)
+{
+ dnode_t *dn;
+ int err = dnode_hold(os, object, FTAG, &dn);
+ if (err)
+ return (err);
+ ASSERT(offset < UINT64_MAX);
+ ASSERT(size == DMU_OBJECT_END || size <= UINT64_MAX - offset);
+ dnode_free_range(dn, offset, size, tx);
+ dnode_rele(dn, FTAG);
+ return (0);
+}
+
+static int
+dmu_read_impl(dnode_t *dn, uint64_t offset, uint64_t size,
+ void *buf, uint32_t flags)
+{
+ dmu_buf_t **dbp;
+ int numbufs, err = 0;
+
+ /*
+ * Deal with odd block sizes, where there can't be data past the first
+ * block. If we ever do the tail block optimization, we will need to
+ * handle that here as well.
+ */
+ if (dn->dn_maxblkid == 0) {
+ uint64_t newsz = offset > dn->dn_datablksz ? 0 :
+ MIN(size, dn->dn_datablksz - offset);
+ bzero((char *)buf + newsz, size - newsz);
+ size = newsz;
+ }
+
+ while (size > 0) {
+ uint64_t mylen = MIN(size, DMU_MAX_ACCESS / 2);
+ int i;
+
+ /*
+ * NB: we could do this block-at-a-time, but it's nice
+ * to be reading in parallel.
+ */
+ err = dmu_buf_hold_array_by_dnode(dn, offset, mylen,
+ TRUE, FTAG, &numbufs, &dbp, flags);
+ if (err)
+ break;
+
+ for (i = 0; i < numbufs; i++) {
+ uint64_t tocpy;
+ int64_t bufoff;
+ dmu_buf_t *db = dbp[i];
+
+ ASSERT(size > 0);
+
+ bufoff = offset - db->db_offset;
+ tocpy = MIN(db->db_size - bufoff, size);
+
+ (void) memcpy(buf, (char *)db->db_data + bufoff, tocpy);
+
+ offset += tocpy;
+ size -= tocpy;
+ buf = (char *)buf + tocpy;
+ }
+ dmu_buf_rele_array(dbp, numbufs, FTAG);
+ }
+ return (err);
+}
+
+int
+dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
+ void *buf, uint32_t flags)
+{
+ dnode_t *dn;
+ int err;
+
+ err = dnode_hold(os, object, FTAG, &dn);
+ if (err != 0)
+ return (err);
+
+ err = dmu_read_impl(dn, offset, size, buf, flags);
+ dnode_rele(dn, FTAG);
+ return (err);
+}
+
+int
+dmu_read_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, void *buf,
+ uint32_t flags)
+{
+ return (dmu_read_impl(dn, offset, size, buf, flags));
+}
+
+static void
+dmu_write_impl(dmu_buf_t **dbp, int numbufs, uint64_t offset, uint64_t size,
+ const void *buf, dmu_tx_t *tx)
+{
+ int i;
+
+ for (i = 0; i < numbufs; i++) {
+ uint64_t tocpy;
+ int64_t bufoff;
+ dmu_buf_t *db = dbp[i];
+
+ ASSERT(size > 0);
+
+ bufoff = offset - db->db_offset;
+ tocpy = MIN(db->db_size - bufoff, size);
+
+ ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
+
+ if (tocpy == db->db_size)
+ dmu_buf_will_fill(db, tx);
+ else
+ dmu_buf_will_dirty(db, tx);
+
+ (void) memcpy((char *)db->db_data + bufoff, buf, tocpy);
+
+ if (tocpy == db->db_size)
+ dmu_buf_fill_done(db, tx);
+
+ offset += tocpy;
+ size -= tocpy;
+ buf = (char *)buf + tocpy;
+ }
+}
+
+void
+dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
+ const void *buf, dmu_tx_t *tx)
+{
+ dmu_buf_t **dbp;
+ int numbufs;
+
+ if (size == 0)
+ return;
+
+ VERIFY0(dmu_buf_hold_array(os, object, offset, size,
+ FALSE, FTAG, &numbufs, &dbp));
+ dmu_write_impl(dbp, numbufs, offset, size, buf, tx);
+ dmu_buf_rele_array(dbp, numbufs, FTAG);
+}
+
+/*
+ * Note: Lustre is an external consumer of this interface.
+ */
+void
+dmu_write_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size,
+ const void *buf, dmu_tx_t *tx)
+{
+ dmu_buf_t **dbp;
+ int numbufs;
+
+ if (size == 0)
+ return;
+
+ VERIFY0(dmu_buf_hold_array_by_dnode(dn, offset, size,
+ FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH));
+ dmu_write_impl(dbp, numbufs, offset, size, buf, tx);
+ dmu_buf_rele_array(dbp, numbufs, FTAG);
+}
+
+void
+dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
+ dmu_tx_t *tx)
+{
+ dmu_buf_t **dbp;
+ int numbufs, i;
+
+ if (size == 0)
+ return;
+
+ VERIFY(0 == dmu_buf_hold_array(os, object, offset, size,
+ FALSE, FTAG, &numbufs, &dbp));
+
+ for (i = 0; i < numbufs; i++) {
+ dmu_buf_t *db = dbp[i];
+
+ dmu_buf_will_not_fill(db, tx);
+ }
+ dmu_buf_rele_array(dbp, numbufs, FTAG);
+}
+
+void
+dmu_write_embedded(objset_t *os, uint64_t object, uint64_t offset,
+ void *data, uint8_t etype, uint8_t comp, int uncompressed_size,
+ int compressed_size, int byteorder, dmu_tx_t *tx)
+{
+ dmu_buf_t *db;
+
+ ASSERT3U(etype, <, NUM_BP_EMBEDDED_TYPES);
+ ASSERT3U(comp, <, ZIO_COMPRESS_FUNCTIONS);
+ VERIFY0(dmu_buf_hold_noread(os, object, offset,
+ FTAG, &db));
+
+ dmu_buf_write_embedded(db,
+ data, (bp_embedded_type_t)etype, (enum zio_compress)comp,
+ uncompressed_size, compressed_size, byteorder, tx);
+
+ dmu_buf_rele(db, FTAG);
+}
+
+void
+dmu_redact(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
+ dmu_tx_t *tx)
+{
+ int numbufs, i;
+ dmu_buf_t **dbp;
+
+ VERIFY0(dmu_buf_hold_array(os, object, offset, size, FALSE, FTAG,
+ &numbufs, &dbp));
+ for (i = 0; i < numbufs; i++)
+ dmu_buf_redact(dbp[i], tx);
+ dmu_buf_rele_array(dbp, numbufs, FTAG);
+}
+
+#ifdef _KERNEL
+int
+dmu_read_uio_dnode(dnode_t *dn, zfs_uio_t *uio, uint64_t size)
+{
+ dmu_buf_t **dbp;
+ int numbufs, i, err;
+
+ /*
+ * NB: we could do this block-at-a-time, but it's nice
+ * to be reading in parallel.
+ */
+ err = dmu_buf_hold_array_by_dnode(dn, zfs_uio_offset(uio), size,
+ TRUE, FTAG, &numbufs, &dbp, 0);
+ if (err)
+ return (err);
+
+ for (i = 0; i < numbufs; i++) {
+ uint64_t tocpy;
+ int64_t bufoff;
+ dmu_buf_t *db = dbp[i];
+
+ ASSERT(size > 0);
+
+ bufoff = zfs_uio_offset(uio) - db->db_offset;
+ tocpy = MIN(db->db_size - bufoff, size);
+
+ err = zfs_uio_fault_move((char *)db->db_data + bufoff, tocpy,
+ UIO_READ, uio);
+
+ if (err)
+ break;
+
+ size -= tocpy;
+ }
+ dmu_buf_rele_array(dbp, numbufs, FTAG);
+
+ return (err);
+}
+
+/*
+ * Read 'size' bytes into the uio buffer.
+ * From object zdb->db_object.
+ * Starting at zfs_uio_offset(uio).
+ *
+ * If the caller already has a dbuf in the target object
+ * (e.g. its bonus buffer), this routine is faster than dmu_read_uio(),
+ * because we don't have to find the dnode_t for the object.
+ */
+int
+dmu_read_uio_dbuf(dmu_buf_t *zdb, zfs_uio_t *uio, uint64_t size)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb;
+ dnode_t *dn;
+ int err;
+
+ if (size == 0)
+ return (0);
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ err = dmu_read_uio_dnode(dn, uio, size);
+ DB_DNODE_EXIT(db);
+
+ return (err);
+}
+
+/*
+ * Read 'size' bytes into the uio buffer.
+ * From the specified object
+ * Starting at offset zfs_uio_offset(uio).
+ */
+int
+dmu_read_uio(objset_t *os, uint64_t object, zfs_uio_t *uio, uint64_t size)
+{
+ dnode_t *dn;
+ int err;
+
+ if (size == 0)
+ return (0);
+
+ err = dnode_hold(os, object, FTAG, &dn);
+ if (err)
+ return (err);
+
+ err = dmu_read_uio_dnode(dn, uio, size);
+
+ dnode_rele(dn, FTAG);
+
+ return (err);
+}
+
+int
+dmu_write_uio_dnode(dnode_t *dn, zfs_uio_t *uio, uint64_t size, dmu_tx_t *tx)
+{
+ dmu_buf_t **dbp;
+ int numbufs;
+ int err = 0;
+ int i;
+
+ err = dmu_buf_hold_array_by_dnode(dn, zfs_uio_offset(uio), size,
+ FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH);
+ if (err)
+ return (err);
+
+ for (i = 0; i < numbufs; i++) {
+ uint64_t tocpy;
+ int64_t bufoff;
+ dmu_buf_t *db = dbp[i];
+
+ ASSERT(size > 0);
+
+ bufoff = zfs_uio_offset(uio) - db->db_offset;
+ tocpy = MIN(db->db_size - bufoff, size);
+
+ ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
+
+ if (tocpy == db->db_size)
+ dmu_buf_will_fill(db, tx);
+ else
+ dmu_buf_will_dirty(db, tx);
+
+ /*
+		 * XXX zfs_uiomove could block forever (e.g. nfs-backed
+ * pages). There needs to be a uiolockdown() function
+ * to lock the pages in memory, so that zfs_uiomove won't
+ * block.
+ */
+ err = zfs_uio_fault_move((char *)db->db_data + bufoff,
+ tocpy, UIO_WRITE, uio);
+
+ if (tocpy == db->db_size)
+ dmu_buf_fill_done(db, tx);
+
+ if (err)
+ break;
+
+ size -= tocpy;
+ }
+
+ dmu_buf_rele_array(dbp, numbufs, FTAG);
+ return (err);
+}
+
+/*
+ * Write 'size' bytes from the uio buffer.
+ * To object zdb->db_object.
+ * Starting at offset zfs_uio_offset(uio).
+ *
+ * If the caller already has a dbuf in the target object
+ * (e.g. its bonus buffer), this routine is faster than dmu_write_uio(),
+ * because we don't have to find the dnode_t for the object.
+ */
+int
+dmu_write_uio_dbuf(dmu_buf_t *zdb, zfs_uio_t *uio, uint64_t size,
+ dmu_tx_t *tx)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb;
+ dnode_t *dn;
+ int err;
+
+ if (size == 0)
+ return (0);
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ err = dmu_write_uio_dnode(dn, uio, size, tx);
+ DB_DNODE_EXIT(db);
+
+ return (err);
+}
+
+/*
+ * Write 'size' bytes from the uio buffer.
+ * To the specified object.
+ * Starting at offset zfs_uio_offset(uio).
+ */
+int
+dmu_write_uio(objset_t *os, uint64_t object, zfs_uio_t *uio, uint64_t size,
+ dmu_tx_t *tx)
+{
+ dnode_t *dn;
+ int err;
+
+ if (size == 0)
+ return (0);
+
+ err = dnode_hold(os, object, FTAG, &dn);
+ if (err)
+ return (err);
+
+ err = dmu_write_uio_dnode(dn, uio, size, tx);
+
+ dnode_rele(dn, FTAG);
+
+ return (err);
+}
+#endif /* _KERNEL */
+
+/*
+ * Allocate a loaned anonymous arc buffer.
+ */
+arc_buf_t *
+dmu_request_arcbuf(dmu_buf_t *handle, int size)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)handle;
+
+ return (arc_loan_buf(db->db_objset->os_spa, B_FALSE, size));
+}
+
+/*
+ * Free a loaned arc buffer.
+ */
+void
+dmu_return_arcbuf(arc_buf_t *buf)
+{
+ arc_return_buf(buf, FTAG);
+ arc_buf_destroy(buf, FTAG);
+}
+
+/*
+ * A "lightweight" write is faster than a regular write (e.g.
+ * dmu_write_by_dnode() or dmu_assign_arcbuf_by_dnode()), because it avoids the
+ * CPU cost of creating a dmu_buf_impl_t and arc_buf_[hdr_]_t. However, the
+ * data can not be read or overwritten until the transaction's txg has been
+ * synced. This makes it appropriate for workloads that are known to be
+ * (temporarily) write-only, like "zfs receive".
+ *
+ * A single block is written, starting at the specified offset in bytes. If
+ * the call is successful, it returns 0 and the provided abd has been
+ * consumed (the caller should not free it).
+ */
+int
+dmu_lightweight_write_by_dnode(dnode_t *dn, uint64_t offset, abd_t *abd,
+ const zio_prop_t *zp, enum zio_flag flags, dmu_tx_t *tx)
+{
+ dbuf_dirty_record_t *dr =
+ dbuf_dirty_lightweight(dn, dbuf_whichblock(dn, 0, offset), tx);
+ if (dr == NULL)
+ return (SET_ERROR(EIO));
+ dr->dt.dll.dr_abd = abd;
+ dr->dt.dll.dr_props = *zp;
+ dr->dt.dll.dr_flags = flags;
+ return (0);
+}
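A hedged sketch of a receive-style consumer of this call (illustrative helper; it assumes the tx already covers the target block and that 'data' stays valid until the txg syncs, because abd_get_from_buf() wraps the buffer without copying):

static int
example_lightweight_write(dnode_t *dn, uint64_t off, void *data, uint64_t len,
    dmu_tx_t *tx)
{
	zio_prop_t zp;
	abd_t *abd = abd_get_from_buf(data, len);
	int err;

	/* Resolve the dataset's write policy for a plain level-0 block. */
	dmu_write_policy(dn->dn_objset, dn, 0, 0, &zp);

	err = dmu_lightweight_write_by_dnode(dn, off, abd, &zp, 0, tx);
	if (err != 0)
		abd_free(abd);	/* the abd is only consumed on success */
	return (err);
}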
+
+/*
+ * When possible directly assign passed loaned arc buffer to a dbuf.
+ * If this is not possible copy the contents of passed arc buf via
+ * dmu_write().
+ */
+int
+dmu_assign_arcbuf_by_dnode(dnode_t *dn, uint64_t offset, arc_buf_t *buf,
+ dmu_tx_t *tx)
+{
+ dmu_buf_impl_t *db;
+ objset_t *os = dn->dn_objset;
+ uint64_t object = dn->dn_object;
+ uint32_t blksz = (uint32_t)arc_buf_lsize(buf);
+ uint64_t blkid;
+
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ blkid = dbuf_whichblock(dn, 0, offset);
+ db = dbuf_hold(dn, blkid, FTAG);
+ if (db == NULL)
+ return (SET_ERROR(EIO));
+ rw_exit(&dn->dn_struct_rwlock);
+
+ /*
+ * We can only assign if the offset is aligned and the arc buf is the
+ * same size as the dbuf.
+ */
+ if (offset == db->db.db_offset && blksz == db->db.db_size) {
+ dbuf_assign_arcbuf(db, buf, tx);
+ dbuf_rele(db, FTAG);
+ } else {
+ /* compressed bufs must always be assignable to their dbuf */
+ ASSERT3U(arc_get_compression(buf), ==, ZIO_COMPRESS_OFF);
+ ASSERT(!(buf->b_flags & ARC_BUF_FLAG_COMPRESSED));
+
+ dbuf_rele(db, FTAG);
+ dmu_write(os, object, offset, blksz, buf->b_data, tx);
+ dmu_return_arcbuf(buf);
+ }
+
+ return (0);
+}
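The loaned-buffer path in sketch form (illustrative helper; 'db' is assumed to be a dbuf already held in the target object, and blksz should equal the object's block size for the zero-copy assignment to apply):

static int
example_arcbuf_write(dmu_buf_t *db, uint64_t off, const void *src, int blksz,
    dmu_tx_t *tx)
{
	arc_buf_t *abuf = dmu_request_arcbuf(db, blksz);
	int err;

	memcpy(abuf->b_data, src, blksz);
	err = dmu_assign_arcbuf_by_dbuf(db, off, abuf, tx);
	if (err != 0)
		dmu_return_arcbuf(abuf);	/* still ours on failure */
	return (err);
}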
+
+int
+dmu_assign_arcbuf_by_dbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf,
+ dmu_tx_t *tx)
+{
+ int err;
+ dmu_buf_impl_t *dbuf = (dmu_buf_impl_t *)handle;
+
+ DB_DNODE_ENTER(dbuf);
+ err = dmu_assign_arcbuf_by_dnode(DB_DNODE(dbuf), offset, buf, tx);
+ DB_DNODE_EXIT(dbuf);
+
+ return (err);
+}
+
+typedef struct {
+ dbuf_dirty_record_t *dsa_dr;
+ dmu_sync_cb_t *dsa_done;
+ zgd_t *dsa_zgd;
+ dmu_tx_t *dsa_tx;
+} dmu_sync_arg_t;
+
+/* ARGSUSED */
+static void
+dmu_sync_ready(zio_t *zio, arc_buf_t *buf, void *varg)
+{
+ dmu_sync_arg_t *dsa = varg;
+ dmu_buf_t *db = dsa->dsa_zgd->zgd_db;
+ blkptr_t *bp = zio->io_bp;
+
+ if (zio->io_error == 0) {
+ if (BP_IS_HOLE(bp)) {
+ /*
+ * A block of zeros may compress to a hole, but the
+ * block size still needs to be known for replay.
+ */
+ BP_SET_LSIZE(bp, db->db_size);
+ } else if (!BP_IS_EMBEDDED(bp)) {
+ ASSERT(BP_GET_LEVEL(bp) == 0);
+ BP_SET_FILL(bp, 1);
+ }
+ }
+}
+
+static void
+dmu_sync_late_arrival_ready(zio_t *zio)
+{
+ dmu_sync_ready(zio, NULL, zio->io_private);
+}
+
+/* ARGSUSED */
+static void
+dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg)
+{
+ dmu_sync_arg_t *dsa = varg;
+ dbuf_dirty_record_t *dr = dsa->dsa_dr;
+ dmu_buf_impl_t *db = dr->dr_dbuf;
+ zgd_t *zgd = dsa->dsa_zgd;
+
+ /*
+ * Record the vdev(s) backing this blkptr so they can be flushed after
+ * the writes for the lwb have completed.
+ */
+ if (zio->io_error == 0) {
+ zil_lwb_add_block(zgd->zgd_lwb, zgd->zgd_bp);
+ }
+
+ mutex_enter(&db->db_mtx);
+ ASSERT(dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC);
+ if (zio->io_error == 0) {
+ dr->dt.dl.dr_nopwrite = !!(zio->io_flags & ZIO_FLAG_NOPWRITE);
+ if (dr->dt.dl.dr_nopwrite) {
+ blkptr_t *bp = zio->io_bp;
+ blkptr_t *bp_orig = &zio->io_bp_orig;
+ uint8_t chksum = BP_GET_CHECKSUM(bp_orig);
+
+ ASSERT(BP_EQUAL(bp, bp_orig));
+ VERIFY(BP_EQUAL(bp, db->db_blkptr));
+ ASSERT(zio->io_prop.zp_compress != ZIO_COMPRESS_OFF);
+ VERIFY(zio_checksum_table[chksum].ci_flags &
+ ZCHECKSUM_FLAG_NOPWRITE);
+ }
+ dr->dt.dl.dr_overridden_by = *zio->io_bp;
+ dr->dt.dl.dr_override_state = DR_OVERRIDDEN;
+ dr->dt.dl.dr_copies = zio->io_prop.zp_copies;
+
+ /*
+ * Old style holes are filled with all zeros, whereas
+ * new-style holes maintain their lsize, type, level,
+ * and birth time (see zio_write_compress). While we
+ * need to reset the BP_SET_LSIZE() call that happened
+ * in dmu_sync_ready for old style holes, we do *not*
+ * want to wipe out the information contained in new
+ * style holes. Thus, only zero out the block pointer if
+ * it's an old style hole.
+ */
+ if (BP_IS_HOLE(&dr->dt.dl.dr_overridden_by) &&
+ dr->dt.dl.dr_overridden_by.blk_birth == 0)
+ BP_ZERO(&dr->dt.dl.dr_overridden_by);
+ } else {
+ dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
+ }
+ cv_broadcast(&db->db_changed);
+ mutex_exit(&db->db_mtx);
+
+ dsa->dsa_done(dsa->dsa_zgd, zio->io_error);
+
+ kmem_free(dsa, sizeof (*dsa));
+}
+
+static void
+dmu_sync_late_arrival_done(zio_t *zio)
+{
+ blkptr_t *bp = zio->io_bp;
+ dmu_sync_arg_t *dsa = zio->io_private;
+ zgd_t *zgd = dsa->dsa_zgd;
+
+ if (zio->io_error == 0) {
+ /*
+ * Record the vdev(s) backing this blkptr so they can be
+ * flushed after the writes for the lwb have completed.
+ */
+ zil_lwb_add_block(zgd->zgd_lwb, zgd->zgd_bp);
+
+ if (!BP_IS_HOLE(bp)) {
+ blkptr_t *bp_orig __maybe_unused = &zio->io_bp_orig;
+ ASSERT(!(zio->io_flags & ZIO_FLAG_NOPWRITE));
+ ASSERT(BP_IS_HOLE(bp_orig) || !BP_EQUAL(bp, bp_orig));
+ ASSERT(zio->io_bp->blk_birth == zio->io_txg);
+ ASSERT(zio->io_txg > spa_syncing_txg(zio->io_spa));
+ zio_free(zio->io_spa, zio->io_txg, zio->io_bp);
+ }
+ }
+
+ dmu_tx_commit(dsa->dsa_tx);
+
+ dsa->dsa_done(dsa->dsa_zgd, zio->io_error);
+
+ abd_free(zio->io_abd);
+ kmem_free(dsa, sizeof (*dsa));
+}
+
+static int
+dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd,
+ zio_prop_t *zp, zbookmark_phys_t *zb)
+{
+ dmu_sync_arg_t *dsa;
+ dmu_tx_t *tx;
+
+ tx = dmu_tx_create(os);
+ dmu_tx_hold_space(tx, zgd->zgd_db->db_size);
+ if (dmu_tx_assign(tx, TXG_WAIT) != 0) {
+ dmu_tx_abort(tx);
+		/* Make zl_get_data do txg_wait_synced() */
+ return (SET_ERROR(EIO));
+ }
+
+ /*
+ * In order to prevent the zgd's lwb from being free'd prior to
+ * dmu_sync_late_arrival_done() being called, we have to ensure
+ * the lwb's "max txg" takes this tx's txg into account.
+ */
+ zil_lwb_add_txg(zgd->zgd_lwb, dmu_tx_get_txg(tx));
+
+ dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP);
+ dsa->dsa_dr = NULL;
+ dsa->dsa_done = done;
+ dsa->dsa_zgd = zgd;
+ dsa->dsa_tx = tx;
+
+ /*
+ * Since we are currently syncing this txg, it's nontrivial to
+ * determine what BP to nopwrite against, so we disable nopwrite.
+ *
+ * When syncing, the db_blkptr is initially the BP of the previous
+ * txg. We can not nopwrite against it because it will be changed
+ * (this is similar to the non-late-arrival case where the dbuf is
+ * dirty in a future txg).
+ *
+	 * Then dbuf_write_ready() sets db_blkptr to the location we will write.
+ * We can not nopwrite against it because although the BP will not
+ * (typically) be changed, the data has not yet been persisted to this
+ * location.
+ *
+ * Finally, when dbuf_write_done() is called, it is theoretically
+ * possible to always nopwrite, because the data that was written in
+ * this txg is the same data that we are trying to write. However we
+ * would need to check that this dbuf is not dirty in any future
+ * txg's (as we do in the normal dmu_sync() path). For simplicity, we
+ * don't nopwrite in this case.
+ */
+ zp->zp_nopwrite = B_FALSE;
+
+ zio_nowait(zio_write(pio, os->os_spa, dmu_tx_get_txg(tx), zgd->zgd_bp,
+ abd_get_from_buf(zgd->zgd_db->db_data, zgd->zgd_db->db_size),
+ zgd->zgd_db->db_size, zgd->zgd_db->db_size, zp,
+ dmu_sync_late_arrival_ready, NULL, NULL, dmu_sync_late_arrival_done,
+ dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, zb));
+
+ return (0);
+}
+
+/*
+ * Intent log support: sync the block associated with db to disk.
+ * N.B. and XXX: the caller is responsible for making sure that the
+ * data isn't changing while dmu_sync() is writing it.
+ *
+ * Return values:
+ *
+ * EEXIST: this txg has already been synced, so there's nothing to do.
+ * The caller should not log the write.
+ *
+ * ENOENT: the block was dbuf_free_range()'d, so there's nothing to do.
+ * The caller should not log the write.
+ *
+ * EALREADY: this block is already in the process of being synced.
+ * The caller should track its progress (somehow).
+ *
+ * EIO: could not do the I/O.
+ * The caller should do a txg_wait_synced().
+ *
+ * 0: the I/O has been initiated.
+ * The caller should log this blkptr in the done callback.
+ * It is possible that the I/O will fail, in which case
+ * the error will be reported to the done callback and
+ * propagated to pio from zio_done().
+ */
+int
+dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)zgd->zgd_db;
+ objset_t *os = db->db_objset;
+ dsl_dataset_t *ds = os->os_dsl_dataset;
+ dbuf_dirty_record_t *dr, *dr_next;
+ dmu_sync_arg_t *dsa;
+ zbookmark_phys_t zb;
+ zio_prop_t zp;
+ dnode_t *dn;
+
+ ASSERT(pio != NULL);
+ ASSERT(txg != 0);
+
+ SET_BOOKMARK(&zb, ds->ds_object,
+ db->db.db_object, db->db_level, db->db_blkid);
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ dmu_write_policy(os, dn, db->db_level, WP_DMU_SYNC, &zp);
+ DB_DNODE_EXIT(db);
+
+ /*
+ * If we're frozen (running ziltest), we always need to generate a bp.
+ */
+ if (txg > spa_freeze_txg(os->os_spa))
+ return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb));
+
+ /*
+ * Grabbing db_mtx now provides a barrier between dbuf_sync_leaf()
+ * and us. If we determine that this txg is not yet syncing,
+ * but it begins to sync a moment later, that's OK because the
+ * sync thread will block in dbuf_sync_leaf() until we drop db_mtx.
+ */
+ mutex_enter(&db->db_mtx);
+
+ if (txg <= spa_last_synced_txg(os->os_spa)) {
+ /*
+ * This txg has already synced. There's nothing to do.
+ */
+ mutex_exit(&db->db_mtx);
+ return (SET_ERROR(EEXIST));
+ }
+
+ if (txg <= spa_syncing_txg(os->os_spa)) {
+ /*
+ * This txg is currently syncing, so we can't mess with
+ * the dirty record anymore; just write a new log block.
+ */
+ mutex_exit(&db->db_mtx);
+ return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb));
+ }
+
+ dr = dbuf_find_dirty_eq(db, txg);
+
+ if (dr == NULL) {
+ /*
+ * There's no dr for this dbuf, so it must have been freed.
+ * There's no need to log writes to freed blocks, so we're done.
+ */
+ mutex_exit(&db->db_mtx);
+ return (SET_ERROR(ENOENT));
+ }
+
+ dr_next = list_next(&db->db_dirty_records, dr);
+ ASSERT(dr_next == NULL || dr_next->dr_txg < txg);
+
+ if (db->db_blkptr != NULL) {
+ /*
+ * We need to fill in zgd_bp with the current blkptr so that
+ * the nopwrite code can check if we're writing the same
+ * data that's already on disk. We can only nopwrite if we
+ * are sure that after making the copy, db_blkptr will not
+ * change until our i/o completes. We ensure this by
+ * holding the db_mtx, and only allowing nopwrite if the
+ * block is not already dirty (see below). This is verified
+ * by dmu_sync_done(), which VERIFYs that the db_blkptr has
+ * not changed.
+ */
+ *zgd->zgd_bp = *db->db_blkptr;
+ }
+
+ /*
+ * Assume the on-disk data is X, the current syncing data (in
+ * txg - 1) is Y, and the current in-memory data is Z (currently
+ * in dmu_sync).
+ *
+ * We usually want to perform a nopwrite if X and Z are the
+ * same. However, if Y is different (i.e. the BP is going to
+ * change before this write takes effect), then a nopwrite will
+ * be incorrect - we would override with X, which could have
+ * been freed when Y was written.
+ *
+ * (Note that this is not a concern when we are nop-writing from
+ * syncing context, because X and Y must be identical, because
+ * all previous txgs have been synced.)
+ *
+ * Therefore, we disable nopwrite if the current BP could change
+ * before this TXG. There are two ways it could change: by
+ * being dirty (dr_next is non-NULL), or by being freed
+ * (dnode_block_freed()). This behavior is verified by
+ * zio_done(), which VERIFYs that the override BP is identical
+ * to the on-disk BP.
+ */
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ if (dr_next != NULL || dnode_block_freed(dn, db->db_blkid))
+ zp.zp_nopwrite = B_FALSE;
+ DB_DNODE_EXIT(db);
+
+ ASSERT(dr->dr_txg == txg);
+ if (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC ||
+ dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
+ /*
+ * We have already issued a sync write for this buffer,
+ * or this buffer has already been synced. It could not
+ * have been dirtied since, or we would have cleared the state.
+ */
+ mutex_exit(&db->db_mtx);
+ return (SET_ERROR(EALREADY));
+ }
+
+ ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
+ dr->dt.dl.dr_override_state = DR_IN_DMU_SYNC;
+ mutex_exit(&db->db_mtx);
+
+ dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP);
+ dsa->dsa_dr = dr;
+ dsa->dsa_done = done;
+ dsa->dsa_zgd = zgd;
+ dsa->dsa_tx = NULL;
+
+ zio_nowait(arc_write(pio, os->os_spa, txg,
+ zgd->zgd_bp, dr->dt.dl.dr_data, DBUF_IS_L2CACHEABLE(db),
+ &zp, dmu_sync_ready, NULL, NULL, dmu_sync_done, dsa,
+ ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb));
+
+ return (0);
+}
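How a ZIL get-data callback might act on the return values documented above (sketch only; the zgd/lwb setup is elided and the helper name is made up):

static void
example_handle_dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done_cb,
    zgd_t *zgd, objset_t *os)
{
	switch (dmu_sync(pio, txg, done_cb, zgd)) {
	case 0:		/* I/O issued; done_cb will log the blkptr */
	case EEXIST:	/* txg already synced; nothing to log */
	case ENOENT:	/* block was freed; nothing to log */
	case EALREADY:	/* an earlier dmu_sync() of this block is in flight */
		break;
	case EIO:
	default:
		/* per the comment above, fall back to waiting for the txg */
		txg_wait_synced(dmu_objset_pool(os), txg);
		break;
	}
}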
+
+int
+dmu_object_set_nlevels(objset_t *os, uint64_t object, int nlevels, dmu_tx_t *tx)
+{
+ dnode_t *dn;
+ int err;
+
+ err = dnode_hold(os, object, FTAG, &dn);
+ if (err)
+ return (err);
+ err = dnode_set_nlevels(dn, nlevels, tx);
+ dnode_rele(dn, FTAG);
+ return (err);
+}
+
+int
+dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, int ibs,
+ dmu_tx_t *tx)
+{
+ dnode_t *dn;
+ int err;
+
+ err = dnode_hold(os, object, FTAG, &dn);
+ if (err)
+ return (err);
+ err = dnode_set_blksz(dn, size, ibs, tx);
+ dnode_rele(dn, FTAG);
+ return (err);
+}
+
+int
+dmu_object_set_maxblkid(objset_t *os, uint64_t object, uint64_t maxblkid,
+ dmu_tx_t *tx)
+{
+ dnode_t *dn;
+ int err;
+
+ err = dnode_hold(os, object, FTAG, &dn);
+ if (err)
+ return (err);
+ rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+ dnode_new_blkid(dn, maxblkid, tx, B_FALSE, B_TRUE);
+ rw_exit(&dn->dn_struct_rwlock);
+ dnode_rele(dn, FTAG);
+ return (0);
+}
+
+void
+dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum,
+ dmu_tx_t *tx)
+{
+ dnode_t *dn;
+
+ /*
+ * Send streams include each object's checksum function. This
+ * check ensures that the receiving system can understand the
+ * checksum function transmitted.
+ */
+ ASSERT3U(checksum, <, ZIO_CHECKSUM_LEGACY_FUNCTIONS);
+
+ VERIFY0(dnode_hold(os, object, FTAG, &dn));
+ ASSERT3U(checksum, <, ZIO_CHECKSUM_FUNCTIONS);
+ dn->dn_checksum = checksum;
+ dnode_setdirty(dn, tx);
+ dnode_rele(dn, FTAG);
+}
+
+void
+dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress,
+ dmu_tx_t *tx)
+{
+ dnode_t *dn;
+
+ /*
+ * Send streams include each object's compression function. This
+ * check ensures that the receiving system can understand the
+ * compression function transmitted.
+ */
+ ASSERT3U(compress, <, ZIO_COMPRESS_LEGACY_FUNCTIONS);
+
+ VERIFY0(dnode_hold(os, object, FTAG, &dn));
+ dn->dn_compress = compress;
+ dnode_setdirty(dn, tx);
+ dnode_rele(dn, FTAG);
+}
+
+/*
+ * When the "redundant_metadata" property is set to "most", only indirect
+ * blocks of this level and higher will have an additional ditto block.
+ */
+int zfs_redundant_metadata_most_ditto_level = 2;
+
+void
+dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
+{
+ dmu_object_type_t type = dn ? dn->dn_type : DMU_OT_OBJSET;
+ boolean_t ismd = (level > 0 || DMU_OT_IS_METADATA(type) ||
+ (wp & WP_SPILL));
+ enum zio_checksum checksum = os->os_checksum;
+ enum zio_compress compress = os->os_compress;
+ uint8_t complevel = os->os_complevel;
+ enum zio_checksum dedup_checksum = os->os_dedup_checksum;
+ boolean_t dedup = B_FALSE;
+ boolean_t nopwrite = B_FALSE;
+ boolean_t dedup_verify = os->os_dedup_verify;
+ boolean_t encrypt = B_FALSE;
+ int copies = os->os_copies;
+
+ /*
+ * We maintain different write policies for each of the following
+ * types of data:
+ * 1. metadata
+ * 2. preallocated blocks (i.e. level-0 blocks of a dump device)
+ * 3. all other level 0 blocks
+ */
+ if (ismd) {
+ /*
+ * XXX -- we should design a compression algorithm
+ * that specializes in arrays of bps.
+ */
+ compress = zio_compress_select(os->os_spa,
+ ZIO_COMPRESS_ON, ZIO_COMPRESS_ON);
+
+ /*
+ * Metadata always gets checksummed. If the data
+ * checksum is multi-bit correctable, and it's not a
+ * ZBT-style checksum, then it's suitable for metadata
+ * as well. Otherwise, the metadata checksum defaults
+ * to fletcher4.
+ */
+ if (!(zio_checksum_table[checksum].ci_flags &
+ ZCHECKSUM_FLAG_METADATA) ||
+ (zio_checksum_table[checksum].ci_flags &
+ ZCHECKSUM_FLAG_EMBEDDED))
+ checksum = ZIO_CHECKSUM_FLETCHER_4;
+
+ if (os->os_redundant_metadata == ZFS_REDUNDANT_METADATA_ALL ||
+ (os->os_redundant_metadata ==
+ ZFS_REDUNDANT_METADATA_MOST &&
+ (level >= zfs_redundant_metadata_most_ditto_level ||
+ DMU_OT_IS_METADATA(type) || (wp & WP_SPILL))))
+ copies++;
+ } else if (wp & WP_NOFILL) {
+ ASSERT(level == 0);
+
+ /*
+ * If we're writing preallocated blocks, we aren't actually
+ * writing them so don't set any policy properties. These
+ * blocks are currently only used by an external subsystem
+ * outside of zfs (i.e. dump) and not written by the zio
+ * pipeline.
+ */
+ compress = ZIO_COMPRESS_OFF;
+ checksum = ZIO_CHECKSUM_OFF;
+ } else {
+ compress = zio_compress_select(os->os_spa, dn->dn_compress,
+ compress);
+ complevel = zio_complevel_select(os->os_spa, compress,
+ complevel, complevel);
+
+ checksum = (dedup_checksum == ZIO_CHECKSUM_OFF) ?
+ zio_checksum_select(dn->dn_checksum, checksum) :
+ dedup_checksum;
+
+ /*
+ * Determine dedup setting. If we are in dmu_sync(),
+ * we won't actually dedup now because that's all
+ * done in syncing context; but we do want to use the
+ * dedup checksum. If the checksum is not strong
+ * enough to ensure unique signatures, force
+ * dedup_verify.
+ */
+ if (dedup_checksum != ZIO_CHECKSUM_OFF) {
+ dedup = (wp & WP_DMU_SYNC) ? B_FALSE : B_TRUE;
+ if (!(zio_checksum_table[checksum].ci_flags &
+ ZCHECKSUM_FLAG_DEDUP))
+ dedup_verify = B_TRUE;
+ }
+
+ /*
+		 * Enable nopwrite if we have a secure enough checksum
+ * algorithm (see comment in zio_nop_write) and
+ * compression is enabled. We don't enable nopwrite if
+ * dedup is enabled as the two features are mutually
+ * exclusive.
+ */
+ nopwrite = (!dedup && (zio_checksum_table[checksum].ci_flags &
+ ZCHECKSUM_FLAG_NOPWRITE) &&
+ compress != ZIO_COMPRESS_OFF && zfs_nopwrite_enabled);
+ }
+
+ /*
+ * All objects in an encrypted objset are protected from modification
+ * via a MAC. Encrypted objects store their IV and salt in the last DVA
+ * in the bp, so we cannot use all copies. Encrypted objects are also
+ * not subject to nopwrite since writing the same data will still
+ * result in a new ciphertext. Only encrypted blocks can be dedup'd
+ * to avoid ambiguity in the dedup code since the DDT does not store
+ * object types.
+ */
+ if (os->os_encrypted && (wp & WP_NOFILL) == 0) {
+ encrypt = B_TRUE;
+
+ if (DMU_OT_IS_ENCRYPTED(type)) {
+ copies = MIN(copies, SPA_DVAS_PER_BP - 1);
+ nopwrite = B_FALSE;
+ } else {
+ dedup = B_FALSE;
+ }
+
+ if (level <= 0 &&
+ (type == DMU_OT_DNODE || type == DMU_OT_OBJSET)) {
+ compress = ZIO_COMPRESS_EMPTY;
+ }
+ }
+
+ zp->zp_compress = compress;
+ zp->zp_complevel = complevel;
+ zp->zp_checksum = checksum;
+ zp->zp_type = (wp & WP_SPILL) ? dn->dn_bonustype : type;
+ zp->zp_level = level;
+ zp->zp_copies = MIN(copies, spa_max_replication(os->os_spa));
+ zp->zp_dedup = dedup;
+ zp->zp_dedup_verify = dedup && dedup_verify;
+ zp->zp_nopwrite = nopwrite;
+ zp->zp_encrypt = encrypt;
+ zp->zp_byteorder = ZFS_HOST_BYTEORDER;
+ bzero(zp->zp_salt, ZIO_DATA_SALT_LEN);
+ bzero(zp->zp_iv, ZIO_DATA_IV_LEN);
+ bzero(zp->zp_mac, ZIO_DATA_MAC_LEN);
+ zp->zp_zpl_smallblk = DMU_OT_IS_FILE(zp->zp_type) ?
+ os->os_zpl_special_smallblock : 0;
+
+ ASSERT3U(zp->zp_compress, !=, ZIO_COMPRESS_INHERIT);
+}
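For orientation, roughly how a dbuf_write()-style caller consumes this (fragment; 'os' and 'dn' are assumed held, and level 0 with no WP_* flags describes an ordinary data write):

	zio_prop_t zp;

	dmu_write_policy(os, dn, 0 /* level */, 0 /* wp flags */, &zp);
	/*
	 * zp now carries the resolved policy, e.g.:
	 *   zp.zp_checksum - never ZIO_CHECKSUM_INHERIT
	 *   zp.zp_compress - ZIO_COMPRESS_OFF only for WP_NOFILL writes
	 *   zp.zp_copies   - capped at spa_max_replication()
	 *   zp.zp_nopwrite - set only when checksum and compression allow it
	 */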
+
+/*
+ * This function is only called from zfs_holey_common() for zpl_llseek()
+ * in order to determine the location of holes. In order to accurately
+ * report holes all dirty data must be synced to disk. This causes extremely
+ * poor performance when seeking for holes in a dirty file. As a compromise,
+ * only provide hole data when the dnode is clean. When a dnode is dirty
+ * report the dnode as having no holes which is always a safe thing to do.
+ */
+int
+dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off)
+{
+ dnode_t *dn;
+ int i, err;
+ boolean_t clean = B_TRUE;
+
+ err = dnode_hold(os, object, FTAG, &dn);
+ if (err)
+ return (err);
+
+ /*
+ * Check if dnode is dirty
+ */
+ for (i = 0; i < TXG_SIZE; i++) {
+ if (multilist_link_active(&dn->dn_dirty_link[i])) {
+ clean = B_FALSE;
+ break;
+ }
+ }
+
+ /*
+ * If compatibility option is on, sync any current changes before
+ * we go trundling through the block pointers.
+ */
+ if (!clean && zfs_dmu_offset_next_sync) {
+ clean = B_TRUE;
+ dnode_rele(dn, FTAG);
+ txg_wait_synced(dmu_objset_pool(os), 0);
+ err = dnode_hold(os, object, FTAG, &dn);
+ if (err)
+ return (err);
+ }
+
+ if (clean)
+ err = dnode_next_offset(dn,
+ (hole ? DNODE_FIND_HOLE : 0), off, 1, 1, 0);
+ else
+ err = SET_ERROR(EBUSY);
+
+ dnode_rele(dn, FTAG);
+
+ return (err);
+}
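A sketch of the llseek(SEEK_HOLE)-style caller this serves (fragment; 'os', 'object' and 'start' are assumed set up, and the EBUSY fallback described is what hole-seeking callers typically do):

	uint64_t off = start;	/* search from here */
	int err = dmu_offset_next(os, object, B_TRUE /* hole */, &off);

	if (err == 0) {
		/* 'off' is now the start of the next hole */
	} else if (err == EBUSY) {
		/* dnode is dirty and the sync tunable is off; callers */
		/* typically report a single hole at end-of-file instead */
	} else if (err == ESRCH) {
		/* no hole found past 'start' */
	}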
+
+void
+__dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi)
+{
+ dnode_phys_t *dnp = dn->dn_phys;
+
+ doi->doi_data_block_size = dn->dn_datablksz;
+ doi->doi_metadata_block_size = dn->dn_indblkshift ?
+ 1ULL << dn->dn_indblkshift : 0;
+ doi->doi_type = dn->dn_type;
+ doi->doi_bonus_type = dn->dn_bonustype;
+ doi->doi_bonus_size = dn->dn_bonuslen;
+ doi->doi_dnodesize = dn->dn_num_slots << DNODE_SHIFT;
+ doi->doi_indirection = dn->dn_nlevels;
+ doi->doi_checksum = dn->dn_checksum;
+ doi->doi_compress = dn->dn_compress;
+ doi->doi_nblkptr = dn->dn_nblkptr;
+ doi->doi_physical_blocks_512 = (DN_USED_BYTES(dnp) + 256) >> 9;
+ doi->doi_max_offset = (dn->dn_maxblkid + 1) * dn->dn_datablksz;
+ doi->doi_fill_count = 0;
+ for (int i = 0; i < dnp->dn_nblkptr; i++)
+ doi->doi_fill_count += BP_GET_FILL(&dnp->dn_blkptr[i]);
+}
+
+void
+dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi)
+{
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ mutex_enter(&dn->dn_mtx);
+
+ __dmu_object_info_from_dnode(dn, doi);
+
+ mutex_exit(&dn->dn_mtx);
+ rw_exit(&dn->dn_struct_rwlock);
+}
+
+/*
+ * Get information on a DMU object.
+ * If doi is NULL, just indicates whether the object exists.
+ */
+int
+dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi)
+{
+ dnode_t *dn;
+ int err = dnode_hold(os, object, FTAG, &dn);
+
+ if (err)
+ return (err);
+
+ if (doi != NULL)
+ dmu_object_info_from_dnode(dn, doi);
+
+ dnode_rele(dn, FTAG);
+ return (0);
+}
+
+/*
+ * As above, but faster; can be used when you have a held dbuf in hand.
+ */
+void
+dmu_object_info_from_db(dmu_buf_t *db_fake, dmu_object_info_t *doi)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+
+ DB_DNODE_ENTER(db);
+ dmu_object_info_from_dnode(DB_DNODE(db), doi);
+ DB_DNODE_EXIT(db);
+}
+
+/*
+ * Faster still when you only care about the size.
+ */
+void
+dmu_object_size_from_db(dmu_buf_t *db_fake, uint32_t *blksize,
+ u_longlong_t *nblk512)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+ dnode_t *dn;
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+
+ *blksize = dn->dn_datablksz;
+ /* add in number of slots used for the dnode itself */
+ *nblk512 = ((DN_USED_BYTES(dn->dn_phys) + SPA_MINBLOCKSIZE/2) >>
+ SPA_MINBLOCKSHIFT) + dn->dn_num_slots;
+ DB_DNODE_EXIT(db);
+}
+
+void
+dmu_object_dnsize_from_db(dmu_buf_t *db_fake, int *dnsize)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+ dnode_t *dn;
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ *dnsize = dn->dn_num_slots << DNODE_SHIFT;
+ DB_DNODE_EXIT(db);
+}
+
+void
+byteswap_uint64_array(void *vbuf, size_t size)
+{
+ uint64_t *buf = vbuf;
+ size_t count = size >> 3;
+ int i;
+
+ ASSERT((size & 7) == 0);
+
+ for (i = 0; i < count; i++)
+ buf[i] = BSWAP_64(buf[i]);
+}
+
+void
+byteswap_uint32_array(void *vbuf, size_t size)
+{
+ uint32_t *buf = vbuf;
+ size_t count = size >> 2;
+ int i;
+
+ ASSERT((size & 3) == 0);
+
+ for (i = 0; i < count; i++)
+ buf[i] = BSWAP_32(buf[i]);
+}
+
+void
+byteswap_uint16_array(void *vbuf, size_t size)
+{
+ uint16_t *buf = vbuf;
+ size_t count = size >> 1;
+ int i;
+
+ ASSERT((size & 1) == 0);
+
+ for (i = 0; i < count; i++)
+ buf[i] = BSWAP_16(buf[i]);
+}
+
+/* ARGSUSED */
+void
+byteswap_uint8_array(void *vbuf, size_t size)
+{
+}
+
+void
+dmu_init(void)
+{
+ abd_init();
+ zfs_dbgmsg_init();
+ sa_cache_init();
+ dmu_objset_init();
+ dnode_init();
+ zfetch_init();
+ dmu_tx_init();
+ l2arc_init();
+ arc_init();
+ dbuf_init();
+}
+
+void
+dmu_fini(void)
+{
+ arc_fini(); /* arc depends on l2arc, so arc must go first */
+ l2arc_fini();
+ dmu_tx_fini();
+ zfetch_fini();
+ dbuf_fini();
+ dnode_fini();
+ dmu_objset_fini();
+ sa_cache_fini();
+ zfs_dbgmsg_fini();
+ abd_fini();
+}
+
+EXPORT_SYMBOL(dmu_bonus_hold);
+EXPORT_SYMBOL(dmu_bonus_hold_by_dnode);
+EXPORT_SYMBOL(dmu_buf_hold_array_by_bonus);
+EXPORT_SYMBOL(dmu_buf_rele_array);
+EXPORT_SYMBOL(dmu_prefetch);
+EXPORT_SYMBOL(dmu_free_range);
+EXPORT_SYMBOL(dmu_free_long_range);
+EXPORT_SYMBOL(dmu_free_long_object);
+EXPORT_SYMBOL(dmu_read);
+EXPORT_SYMBOL(dmu_read_by_dnode);
+EXPORT_SYMBOL(dmu_write);
+EXPORT_SYMBOL(dmu_write_by_dnode);
+EXPORT_SYMBOL(dmu_prealloc);
+EXPORT_SYMBOL(dmu_object_info);
+EXPORT_SYMBOL(dmu_object_info_from_dnode);
+EXPORT_SYMBOL(dmu_object_info_from_db);
+EXPORT_SYMBOL(dmu_object_size_from_db);
+EXPORT_SYMBOL(dmu_object_dnsize_from_db);
+EXPORT_SYMBOL(dmu_object_set_nlevels);
+EXPORT_SYMBOL(dmu_object_set_blocksize);
+EXPORT_SYMBOL(dmu_object_set_maxblkid);
+EXPORT_SYMBOL(dmu_object_set_checksum);
+EXPORT_SYMBOL(dmu_object_set_compress);
+EXPORT_SYMBOL(dmu_offset_next);
+EXPORT_SYMBOL(dmu_write_policy);
+EXPORT_SYMBOL(dmu_sync);
+EXPORT_SYMBOL(dmu_request_arcbuf);
+EXPORT_SYMBOL(dmu_return_arcbuf);
+EXPORT_SYMBOL(dmu_assign_arcbuf_by_dnode);
+EXPORT_SYMBOL(dmu_assign_arcbuf_by_dbuf);
+EXPORT_SYMBOL(dmu_buf_hold);
+EXPORT_SYMBOL(dmu_ot);
+
+/* BEGIN CSTYLED */
+ZFS_MODULE_PARAM(zfs, zfs_, nopwrite_enabled, INT, ZMOD_RW,
+ "Enable NOP writes");
+
+ZFS_MODULE_PARAM(zfs, zfs_, per_txg_dirty_frees_percent, ULONG, ZMOD_RW,
+ "Percentage of dirtied blocks from frees in one TXG");
+
+ZFS_MODULE_PARAM(zfs, zfs_, dmu_offset_next_sync, INT, ZMOD_RW,
+ "Enable forcing txg sync to find holes");
+
+ZFS_MODULE_PARAM(zfs, , dmu_prefetch_max, INT, ZMOD_RW,
+ "Limit one prefetch call to this size");
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/zfs/dmu_diff.c b/sys/contrib/openzfs/module/zfs/dmu_diff.c
new file mode 100644
index 000000000000..a573a2e1bd41
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/dmu_diff.c
@@ -0,0 +1,240 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2019, loli10K <ezomori.nozomu@gmail.com>. All rights reserved.
+ */
+
+#include <sys/dmu.h>
+#include <sys/dmu_impl.h>
+#include <sys/dmu_tx.h>
+#include <sys/dbuf.h>
+#include <sys/dnode.h>
+#include <sys/zfs_context.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_traverse.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_synctask.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zap.h>
+#include <sys/zio_checksum.h>
+#include <sys/zfs_znode.h>
+#include <sys/zfs_file.h>
+
+
+typedef struct dmu_diffarg {
+ zfs_file_t *da_fp; /* file to which we are reporting */
+ offset_t *da_offp;
+ int da_err; /* error that stopped diff search */
+ dmu_diff_record_t da_ddr;
+} dmu_diffarg_t;
+
+static int
+write_record(dmu_diffarg_t *da)
+{
+ zfs_file_t *fp;
+ ssize_t resid;
+
+ if (da->da_ddr.ddr_type == DDR_NONE) {
+ da->da_err = 0;
+ return (0);
+ }
+
+ fp = da->da_fp;
+ da->da_err = zfs_file_write(fp, (caddr_t)&da->da_ddr,
+ sizeof (da->da_ddr), &resid);
+ *da->da_offp += sizeof (da->da_ddr);
+ return (da->da_err);
+}
+
+static int
+report_free_dnode_range(dmu_diffarg_t *da, uint64_t first, uint64_t last)
+{
+ ASSERT(first <= last);
+ if (da->da_ddr.ddr_type != DDR_FREE ||
+ first != da->da_ddr.ddr_last + 1) {
+ if (write_record(da) != 0)
+ return (da->da_err);
+ da->da_ddr.ddr_type = DDR_FREE;
+ da->da_ddr.ddr_first = first;
+ da->da_ddr.ddr_last = last;
+ return (0);
+ }
+ da->da_ddr.ddr_last = last;
+ return (0);
+}
+
+static int
+report_dnode(dmu_diffarg_t *da, uint64_t object, dnode_phys_t *dnp)
+{
+ ASSERT(dnp != NULL);
+ if (dnp->dn_type == DMU_OT_NONE)
+ return (report_free_dnode_range(da, object, object));
+
+ if (da->da_ddr.ddr_type != DDR_INUSE ||
+ object != da->da_ddr.ddr_last + 1) {
+ if (write_record(da) != 0)
+ return (da->da_err);
+ da->da_ddr.ddr_type = DDR_INUSE;
+ da->da_ddr.ddr_first = da->da_ddr.ddr_last = object;
+ return (0);
+ }
+ da->da_ddr.ddr_last = object;
+ return (0);
+}
+
+#define DBP_SPAN(dnp, level) \
+ (((uint64_t)dnp->dn_datablkszsec) << (SPA_MINBLOCKSHIFT + \
+ (level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT)))
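Worked numbers for DBP_SPAN, assuming the common meta-dnode geometry (16K dnode blocks, so dn_datablkszsec = 32, and 128K indirect blocks, so dn_indblkshift = 17; SPA_MINBLOCKSHIFT is 9 and SPA_BLKPTRSHIFT is 7):

/*
 *   level 0: 32 << 9              = 16 KiB  ->    32 dnodes per blkid
 *   level 1: 32 << (9 + (17 - 7)) = 16 MiB  -> 32768 dnodes per blkid
 *
 * diff_cb() below shifts the span right by DNODE_SHIFT to turn a hole at
 * (level, blkid) into the range of freed object numbers it covers.
 */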
+
+/* ARGSUSED */
+static int
+diff_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
+ const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
+{
+ dmu_diffarg_t *da = arg;
+ int err = 0;
+
+ if (issig(JUSTLOOKING) && issig(FORREAL))
+ return (SET_ERROR(EINTR));
+
+ if (zb->zb_level == ZB_DNODE_LEVEL ||
+ zb->zb_object != DMU_META_DNODE_OBJECT)
+ return (0);
+
+ if (BP_IS_HOLE(bp)) {
+ uint64_t span = DBP_SPAN(dnp, zb->zb_level);
+ uint64_t dnobj = (zb->zb_blkid * span) >> DNODE_SHIFT;
+
+ err = report_free_dnode_range(da, dnobj,
+ dnobj + (span >> DNODE_SHIFT) - 1);
+ if (err)
+ return (err);
+ } else if (zb->zb_level == 0) {
+ dnode_phys_t *blk;
+ arc_buf_t *abuf;
+ arc_flags_t aflags = ARC_FLAG_WAIT;
+ int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
+ int zio_flags = ZIO_FLAG_CANFAIL;
+ int i;
+
+ if (BP_IS_PROTECTED(bp))
+ zio_flags |= ZIO_FLAG_RAW;
+
+ if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
+ ZIO_PRIORITY_ASYNC_READ, zio_flags, &aflags, zb) != 0)
+ return (SET_ERROR(EIO));
+
+ blk = abuf->b_data;
+ for (i = 0; i < epb; i += blk[i].dn_extra_slots + 1) {
+ uint64_t dnobj = (zb->zb_blkid <<
+ (DNODE_BLOCK_SHIFT - DNODE_SHIFT)) + i;
+ err = report_dnode(da, dnobj, blk+i);
+ if (err)
+ break;
+ }
+ arc_buf_destroy(abuf, &abuf);
+ if (err)
+ return (err);
+ /* Don't care about the data blocks */
+ return (TRAVERSE_VISIT_NO_CHILDREN);
+ }
+ return (0);
+}
+
+int
+dmu_diff(const char *tosnap_name, const char *fromsnap_name,
+ zfs_file_t *fp, offset_t *offp)
+{
+ dmu_diffarg_t da;
+ dsl_dataset_t *fromsnap;
+ dsl_dataset_t *tosnap;
+ dsl_pool_t *dp;
+ int error;
+ uint64_t fromtxg;
+
+ if (strchr(tosnap_name, '@') == NULL ||
+ strchr(fromsnap_name, '@') == NULL)
+ return (SET_ERROR(EINVAL));
+
+ error = dsl_pool_hold(tosnap_name, FTAG, &dp);
+ if (error != 0)
+ return (error);
+
+ error = dsl_dataset_hold(dp, tosnap_name, FTAG, &tosnap);
+ if (error != 0) {
+ dsl_pool_rele(dp, FTAG);
+ return (error);
+ }
+
+ error = dsl_dataset_hold(dp, fromsnap_name, FTAG, &fromsnap);
+ if (error != 0) {
+ dsl_dataset_rele(tosnap, FTAG);
+ dsl_pool_rele(dp, FTAG);
+ return (error);
+ }
+
+ if (!dsl_dataset_is_before(tosnap, fromsnap, 0)) {
+ dsl_dataset_rele(fromsnap, FTAG);
+ dsl_dataset_rele(tosnap, FTAG);
+ dsl_pool_rele(dp, FTAG);
+ return (SET_ERROR(EXDEV));
+ }
+
+ fromtxg = dsl_dataset_phys(fromsnap)->ds_creation_txg;
+ dsl_dataset_rele(fromsnap, FTAG);
+
+ dsl_dataset_long_hold(tosnap, FTAG);
+ dsl_pool_rele(dp, FTAG);
+
+ da.da_fp = fp;
+ da.da_offp = offp;
+ da.da_ddr.ddr_type = DDR_NONE;
+ da.da_ddr.ddr_first = da.da_ddr.ddr_last = 0;
+ da.da_err = 0;
+
+ /*
+ * Since zfs diff only looks at dnodes which are stored in plaintext
+ * (other than bonus buffers), we don't technically need to decrypt
+ * the dataset to perform this operation. However, the command line
+ * utility will still fail if the keys are not loaded because the
+ * dataset isn't mounted and because it will fail when it attempts to
+ * call the ZFS_IOC_OBJ_TO_STATS ioctl.
+ */
+ error = traverse_dataset(tosnap, fromtxg,
+ TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA | TRAVERSE_NO_DECRYPT,
+ diff_cb, &da);
+
+ if (error != 0) {
+ da.da_err = error;
+ } else {
+ /* we set the da.da_err we return as side-effect */
+ (void) write_record(&da);
+ }
+
+ dsl_dataset_long_rele(tosnap, FTAG);
+ dsl_dataset_rele(tosnap, FTAG);
+
+ return (da.da_err);
+}
diff --git a/sys/contrib/openzfs/module/zfs/dmu_object.c b/sys/contrib/openzfs/module/zfs/dmu_object.c
new file mode 100644
index 000000000000..12cdbd68b104
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/dmu_object.c
@@ -0,0 +1,523 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013, 2017 by Delphix. All rights reserved.
+ * Copyright 2014 HybridCluster. All rights reserved.
+ */
+
+#include <sys/dbuf.h>
+#include <sys/dmu.h>
+#include <sys/dmu_impl.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_tx.h>
+#include <sys/dnode.h>
+#include <sys/zap.h>
+#include <sys/zfeature.h>
+#include <sys/dsl_dataset.h>
+
+/*
+ * Each of the concurrent object allocators will grab
+ * 2^dmu_object_alloc_chunk_shift dnode slots at a time. The default is to
+ * grab 128 slots, which is 4 blocks worth. This was experimentally
+ * determined to be the lowest value that eliminates the measurable effect
+ * of lock contention from this code path.
+ */
+int dmu_object_alloc_chunk_shift = 7;
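Spelling out the arithmetic behind the comment above (values assume the usual 16K dnode block and 512-byte dnode, i.e. DNODES_PER_BLOCK = 32):

/*
 *   chunk size = 1 << dmu_object_alloc_chunk_shift = 1 << 7 = 128 slots
 *   128 slots / 32 slots per dnode block           = 4 blocks per chunk
 */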
+
+static uint64_t
+dmu_object_alloc_impl(objset_t *os, dmu_object_type_t ot, int blocksize,
+ int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen,
+ int dnodesize, dnode_t **allocated_dnode, void *tag, dmu_tx_t *tx)
+{
+ uint64_t object;
+ uint64_t L1_dnode_count = DNODES_PER_BLOCK <<
+ (DMU_META_DNODE(os)->dn_indblkshift - SPA_BLKPTRSHIFT);
+ dnode_t *dn = NULL;
+ int dn_slots = dnodesize >> DNODE_SHIFT;
+ boolean_t restarted = B_FALSE;
+ uint64_t *cpuobj = NULL;
+ int dnodes_per_chunk = 1 << dmu_object_alloc_chunk_shift;
+ int error;
+
+ cpuobj = &os->os_obj_next_percpu[CPU_SEQID_UNSTABLE %
+ os->os_obj_next_percpu_len];
+
+ if (dn_slots == 0) {
+ dn_slots = DNODE_MIN_SLOTS;
+ } else {
+ ASSERT3S(dn_slots, >=, DNODE_MIN_SLOTS);
+ ASSERT3S(dn_slots, <=, DNODE_MAX_SLOTS);
+ }
+
+ /*
+ * The "chunk" of dnodes that is assigned to a CPU-specific
+ * allocator needs to be at least one block's worth, to avoid
+ * lock contention on the dbuf. It can be at most one L1 block's
+ * worth, so that the "rescan after polishing off a L1's worth"
+ * logic below will be sure to kick in.
+ */
+ if (dnodes_per_chunk < DNODES_PER_BLOCK)
+ dnodes_per_chunk = DNODES_PER_BLOCK;
+ if (dnodes_per_chunk > L1_dnode_count)
+ dnodes_per_chunk = L1_dnode_count;
+
+ /*
+ * The caller requested the dnode be returned as a performance
+ * optimization in order to avoid releasing the hold only to
+	 * immediately reacquire it. Since the caller is responsible
+	 * for releasing the hold, they must provide the tag.
+ */
+ if (allocated_dnode != NULL) {
+ ASSERT3P(tag, !=, NULL);
+ } else {
+ ASSERT3P(tag, ==, NULL);
+ tag = FTAG;
+ }
+
+ object = *cpuobj;
+ for (;;) {
+ /*
+ * If we finished a chunk of dnodes, get a new one from
+ * the global allocator.
+ */
+ if ((P2PHASE(object, dnodes_per_chunk) == 0) ||
+ (P2PHASE(object + dn_slots - 1, dnodes_per_chunk) <
+ dn_slots)) {
+ DNODE_STAT_BUMP(dnode_alloc_next_chunk);
+ mutex_enter(&os->os_obj_lock);
+ ASSERT0(P2PHASE(os->os_obj_next_chunk,
+ dnodes_per_chunk));
+ object = os->os_obj_next_chunk;
+
+ /*
+ * Each time we polish off a L1 bp worth of dnodes
+ * (2^12 objects), move to another L1 bp that's
+ * still reasonably sparse (at most 1/4 full). Look
+ * from the beginning at most once per txg. If we
+ * still can't allocate from that L1 block, search
+ * for an empty L0 block, which will quickly skip
+ * to the end of the metadnode if no nearby L0
+ * blocks are empty. This fallback avoids a
+ * pathology where full dnode blocks containing
+ * large dnodes appear sparse because they have a
+ * low blk_fill, leading to many failed allocation
+ * attempts. In the long term a better mechanism to
+ * search for sparse metadnode regions, such as
+ * spacemaps, could be implemented.
+ *
+ * os_scan_dnodes is set during txg sync if enough
+ * objects have been freed since the previous
+ * rescan to justify backfilling again.
+ *
+ * Note that dmu_traverse depends on the behavior
+ * that we use multiple blocks of the dnode object
+ * before going back to reuse objects. Any change
+ * to this algorithm should preserve that property
+ * or find another solution to the issues described
+ * in traverse_visitbp.
+ */
+ if (P2PHASE(object, L1_dnode_count) == 0) {
+ uint64_t offset;
+ uint64_t blkfill;
+ int minlvl;
+ if (os->os_rescan_dnodes) {
+ offset = 0;
+ os->os_rescan_dnodes = B_FALSE;
+ } else {
+ offset = object << DNODE_SHIFT;
+ }
+ blkfill = restarted ? 1 : DNODES_PER_BLOCK >> 2;
+ minlvl = restarted ? 1 : 2;
+ restarted = B_TRUE;
+ error = dnode_next_offset(DMU_META_DNODE(os),
+ DNODE_FIND_HOLE, &offset, minlvl,
+ blkfill, 0);
+ if (error == 0) {
+ object = offset >> DNODE_SHIFT;
+ }
+ }
+ /*
+ * Note: if "restarted", we may find a L0 that
+ * is not suitably aligned.
+ */
+ os->os_obj_next_chunk =
+ P2ALIGN(object, dnodes_per_chunk) +
+ dnodes_per_chunk;
+ (void) atomic_swap_64(cpuobj, object);
+ mutex_exit(&os->os_obj_lock);
+ }
+
+ /*
+ * The value of (*cpuobj) before adding dn_slots is the object
+ * ID assigned to us. The value afterwards is the object ID
+ * assigned to whoever wants to do an allocation next.
+ */
+ object = atomic_add_64_nv(cpuobj, dn_slots) - dn_slots;
+
+ /*
+ * XXX We should check for an i/o error here and return
+ * up to our caller. Actually we should pre-read it in
+ * dmu_tx_assign(), but there is currently no mechanism
+ * to do so.
+ */
+ error = dnode_hold_impl(os, object, DNODE_MUST_BE_FREE,
+ dn_slots, tag, &dn);
+ if (error == 0) {
+ rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+ /*
+ * Another thread could have allocated it; check
+ * again now that we have the struct lock.
+ */
+ if (dn->dn_type == DMU_OT_NONE) {
+ dnode_allocate(dn, ot, blocksize,
+ indirect_blockshift, bonustype,
+ bonuslen, dn_slots, tx);
+ rw_exit(&dn->dn_struct_rwlock);
+ dmu_tx_add_new_object(tx, dn);
+
+ /*
+ * Caller requested the allocated dnode be
+ * returned and is responsible for the hold.
+ */
+ if (allocated_dnode != NULL)
+ *allocated_dnode = dn;
+ else
+ dnode_rele(dn, tag);
+
+ return (object);
+ }
+ rw_exit(&dn->dn_struct_rwlock);
+ dnode_rele(dn, tag);
+ DNODE_STAT_BUMP(dnode_alloc_race);
+ }
+
+ /*
+ * Skip to next known valid starting point on error. This
+ * is the start of the next block of dnodes.
+ */
+ if (dmu_object_next(os, &object, B_TRUE, 0) != 0) {
+ object = P2ROUNDUP(object + 1, DNODES_PER_BLOCK);
+ DNODE_STAT_BUMP(dnode_alloc_next_block);
+ }
+ (void) atomic_swap_64(cpuobj, object);
+ }
+}
+
+uint64_t
+dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize,
+ dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
+{
+ return dmu_object_alloc_impl(os, ot, blocksize, 0, bonustype,
+ bonuslen, 0, NULL, NULL, tx);
+}
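A sketch of object allocation from open context (illustrative helper; DMU_NEW_OBJECT lets the tx hold cover an object that does not exist yet, and the object and bonus types chosen here are arbitrary):

static int
example_alloc_object(objset_t *os, uint64_t *objp)
{
	dmu_tx_t *tx = dmu_tx_create(os);
	int err;

	dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
	err = dmu_tx_assign(tx, TXG_WAIT);
	if (err != 0) {
		dmu_tx_abort(tx);
		return (err);
	}
	*objp = dmu_object_alloc(os, DMU_OT_UINT64_OTHER,
	    0 /* default block size */, DMU_OT_NONE, 0, tx);
	dmu_tx_commit(tx);
	return (0);
}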
+
+uint64_t
+dmu_object_alloc_ibs(objset_t *os, dmu_object_type_t ot, int blocksize,
+ int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen,
+ dmu_tx_t *tx)
+{
+ return dmu_object_alloc_impl(os, ot, blocksize, indirect_blockshift,
+ bonustype, bonuslen, 0, NULL, NULL, tx);
+}
+
+uint64_t
+dmu_object_alloc_dnsize(objset_t *os, dmu_object_type_t ot, int blocksize,
+ dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
+{
+ return (dmu_object_alloc_impl(os, ot, blocksize, 0, bonustype,
+ bonuslen, dnodesize, NULL, NULL, tx));
+}
+
+/*
+ * Allocate a new object and return a pointer to the newly allocated dnode
+ * via the allocated_dnode argument. The returned dnode will be held and
+ * the caller is responsible for releasing the hold by calling dnode_rele().
+ */
+uint64_t
+dmu_object_alloc_hold(objset_t *os, dmu_object_type_t ot, int blocksize,
+ int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen,
+ int dnodesize, dnode_t **allocated_dnode, void *tag, dmu_tx_t *tx)
+{
+ return (dmu_object_alloc_impl(os, ot, blocksize, indirect_blockshift,
+ bonustype, bonuslen, dnodesize, allocated_dnode, tag, tx));
+}
+
+int
+dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot,
+ int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
+{
+ return (dmu_object_claim_dnsize(os, object, ot, blocksize, bonustype,
+ bonuslen, 0, tx));
+}
+
+int
+dmu_object_claim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot,
+ int blocksize, dmu_object_type_t bonustype, int bonuslen,
+ int dnodesize, dmu_tx_t *tx)
+{
+ dnode_t *dn;
+ int dn_slots = dnodesize >> DNODE_SHIFT;
+ int err;
+
+ if (dn_slots == 0)
+ dn_slots = DNODE_MIN_SLOTS;
+ ASSERT3S(dn_slots, >=, DNODE_MIN_SLOTS);
+ ASSERT3S(dn_slots, <=, DNODE_MAX_SLOTS);
+
+ if (object == DMU_META_DNODE_OBJECT && !dmu_tx_private_ok(tx))
+ return (SET_ERROR(EBADF));
+
+ err = dnode_hold_impl(os, object, DNODE_MUST_BE_FREE, dn_slots,
+ FTAG, &dn);
+ if (err)
+ return (err);
+
+ dnode_allocate(dn, ot, blocksize, 0, bonustype, bonuslen, dn_slots, tx);
+ dmu_tx_add_new_object(tx, dn);
+
+ dnode_rele(dn, FTAG);
+
+ return (0);
+}
+
+int
+dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot,
+ int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
+{
+ return (dmu_object_reclaim_dnsize(os, object, ot, blocksize, bonustype,
+ bonuslen, DNODE_MIN_SIZE, B_FALSE, tx));
+}
+
+int
+dmu_object_reclaim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot,
+ int blocksize, dmu_object_type_t bonustype, int bonuslen, int dnodesize,
+ boolean_t keep_spill, dmu_tx_t *tx)
+{
+ dnode_t *dn;
+ int dn_slots = dnodesize >> DNODE_SHIFT;
+ int err;
+
+ if (dn_slots == 0)
+ dn_slots = DNODE_MIN_SLOTS;
+
+ if (object == DMU_META_DNODE_OBJECT)
+ return (SET_ERROR(EBADF));
+
+ err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0,
+ FTAG, &dn);
+ if (err)
+ return (err);
+
+ dnode_reallocate(dn, ot, blocksize, bonustype, bonuslen, dn_slots,
+ keep_spill, tx);
+
+ dnode_rele(dn, FTAG);
+ return (err);
+}
+
+int
+dmu_object_rm_spill(objset_t *os, uint64_t object, dmu_tx_t *tx)
+{
+ dnode_t *dn;
+ int err;
+
+ err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0,
+ FTAG, &dn);
+ if (err)
+ return (err);
+
+ rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+ if (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
+ dbuf_rm_spill(dn, tx);
+ dnode_rm_spill(dn, tx);
+ }
+ rw_exit(&dn->dn_struct_rwlock);
+
+ dnode_rele(dn, FTAG);
+ return (err);
+}
+
+int
+dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx)
+{
+ dnode_t *dn;
+ int err;
+
+ ASSERT(object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx));
+
+ err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0,
+ FTAG, &dn);
+ if (err)
+ return (err);
+
+ ASSERT(dn->dn_type != DMU_OT_NONE);
+ /*
+ * If we don't create this free range, we'll leak indirect blocks when
+ * we get to freeing the dnode in syncing context.
+ */
+ dnode_free_range(dn, 0, DMU_OBJECT_END, tx);
+ dnode_free(dn, tx);
+ dnode_rele(dn, FTAG);
+
+ return (0);
+}
+
+/*
+ * Return (in *objectp) the next object which is allocated (or a hole)
+ * after *object, taking into account only objects that may have been modified
+ * after the specified txg.
+ */
+int
+dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole, uint64_t txg)
+{
+ uint64_t offset;
+ uint64_t start_obj;
+ struct dsl_dataset *ds = os->os_dsl_dataset;
+ int error;
+
+ if (*objectp == 0) {
+ start_obj = 1;
+ } else if (ds && dsl_dataset_feature_is_active(ds,
+ SPA_FEATURE_LARGE_DNODE)) {
+ uint64_t i = *objectp + 1;
+ uint64_t last_obj = *objectp | (DNODES_PER_BLOCK - 1);
+ dmu_object_info_t doi;
+
+ /*
+ * Scan through the remaining meta dnode block. The contents
+ * of each slot in the block are known so it can be quickly
+ * checked. If the block is exhausted without a match then
+ * hand off to dnode_next_offset() for further scanning.
+ */
+ while (i <= last_obj) {
+ error = dmu_object_info(os, i, &doi);
+ if (error == ENOENT) {
+ if (hole) {
+ *objectp = i;
+ return (0);
+ } else {
+ i++;
+ }
+ } else if (error == EEXIST) {
+ i++;
+ } else if (error == 0) {
+ if (hole) {
+ i += doi.doi_dnodesize >> DNODE_SHIFT;
+ } else {
+ *objectp = i;
+ return (0);
+ }
+ } else {
+ return (error);
+ }
+ }
+
+ start_obj = i;
+ } else {
+ start_obj = *objectp + 1;
+ }
+
+ offset = start_obj << DNODE_SHIFT;
+
+ error = dnode_next_offset(DMU_META_DNODE(os),
+ (hole ? DNODE_FIND_HOLE : 0), &offset, 0, DNODES_PER_BLOCK, txg);
+
+ *objectp = offset >> DNODE_SHIFT;
+
+ return (error);
+}
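An illustrative walk over every allocated object using this interface (helper name made up; the walk is assumed to end with ESRCH from dnode_next_offset() once the metadnode is exhausted):

static int
example_walk_objects(objset_t *os)
{
	uint64_t obj = 0;
	int err;

	while ((err = dmu_object_next(os, &obj, B_FALSE, 0)) == 0) {
		/* 'obj' is the next allocated object; examine it here */
	}
	return (err == ESRCH ? 0 : err);
}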
+
+/*
+ * Turn this object from old_type into DMU_OTN_ZAP_METADATA, and bump the
+ * refcount on SPA_FEATURE_EXTENSIBLE_DATASET.
+ *
+ * Only for use from syncing context, on MOS objects.
+ */
+void
+dmu_object_zapify(objset_t *mos, uint64_t object, dmu_object_type_t old_type,
+ dmu_tx_t *tx)
+{
+ dnode_t *dn;
+
+ ASSERT(dmu_tx_is_syncing(tx));
+
+ VERIFY0(dnode_hold(mos, object, FTAG, &dn));
+ if (dn->dn_type == DMU_OTN_ZAP_METADATA) {
+ dnode_rele(dn, FTAG);
+ return;
+ }
+ ASSERT3U(dn->dn_type, ==, old_type);
+ ASSERT0(dn->dn_maxblkid);
+
+ /*
+ * We must initialize the ZAP data before changing the type,
+ * so that concurrent calls to *_is_zapified() can determine if
+ * the object has been completely zapified by checking the type.
+ */
+ mzap_create_impl(dn, 0, 0, tx);
+
+ dn->dn_next_type[tx->tx_txg & TXG_MASK] = dn->dn_type =
+ DMU_OTN_ZAP_METADATA;
+ dnode_setdirty(dn, tx);
+ dnode_rele(dn, FTAG);
+
+ spa_feature_incr(dmu_objset_spa(mos),
+ SPA_FEATURE_EXTENSIBLE_DATASET, tx);
+}
+
+void
+dmu_object_free_zapified(objset_t *mos, uint64_t object, dmu_tx_t *tx)
+{
+ dnode_t *dn;
+ dmu_object_type_t t;
+
+ ASSERT(dmu_tx_is_syncing(tx));
+
+ VERIFY0(dnode_hold(mos, object, FTAG, &dn));
+ t = dn->dn_type;
+ dnode_rele(dn, FTAG);
+
+ if (t == DMU_OTN_ZAP_METADATA) {
+ spa_feature_decr(dmu_objset_spa(mos),
+ SPA_FEATURE_EXTENSIBLE_DATASET, tx);
+ }
+ VERIFY0(dmu_object_free(mos, object, tx));
+}
+
+EXPORT_SYMBOL(dmu_object_alloc);
+EXPORT_SYMBOL(dmu_object_alloc_ibs);
+EXPORT_SYMBOL(dmu_object_alloc_dnsize);
+EXPORT_SYMBOL(dmu_object_alloc_hold);
+EXPORT_SYMBOL(dmu_object_claim);
+EXPORT_SYMBOL(dmu_object_claim_dnsize);
+EXPORT_SYMBOL(dmu_object_reclaim);
+EXPORT_SYMBOL(dmu_object_reclaim_dnsize);
+EXPORT_SYMBOL(dmu_object_rm_spill);
+EXPORT_SYMBOL(dmu_object_free);
+EXPORT_SYMBOL(dmu_object_next);
+EXPORT_SYMBOL(dmu_object_zapify);
+EXPORT_SYMBOL(dmu_object_free_zapified);
+
+/* BEGIN CSTYLED */
+ZFS_MODULE_PARAM(zfs, , dmu_object_alloc_chunk_shift, INT, ZMOD_RW,
+ "CPU-specific allocator grabs 2^N objects at once");
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/zfs/dmu_objset.c b/sys/contrib/openzfs/module/zfs/dmu_objset.c
new file mode 100644
index 000000000000..bfb4adf262d5
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/dmu_objset.c
@@ -0,0 +1,3044 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
+ * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright (c) 2015, STRATO AG, Inc. All rights reserved.
+ * Copyright (c) 2016 Actifio, Inc. All rights reserved.
+ * Copyright 2017 Nexenta Systems, Inc.
+ * Copyright (c) 2017 Open-E, Inc. All Rights Reserved.
+ * Copyright (c) 2018, loli10K <ezomori.nozomu@gmail.com>. All rights reserved.
+ * Copyright (c) 2019, Klara Inc.
+ * Copyright (c) 2019, Allan Jude
+ */
+
+/* Portions Copyright 2010 Robert Milkowski */
+
+#include <sys/cred.h>
+#include <sys/zfs_context.h>
+#include <sys/dmu_objset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_prop.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_synctask.h>
+#include <sys/dsl_deleg.h>
+#include <sys/dnode.h>
+#include <sys/dbuf.h>
+#include <sys/zvol.h>
+#include <sys/dmu_tx.h>
+#include <sys/zap.h>
+#include <sys/zil.h>
+#include <sys/dmu_impl.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/sa.h>
+#include <sys/zfs_onexit.h>
+#include <sys/dsl_destroy.h>
+#include <sys/vdev.h>
+#include <sys/zfeature.h>
+#include <sys/policy.h>
+#include <sys/spa_impl.h>
+#include <sys/dmu_recv.h>
+#include <sys/zfs_project.h>
+#include "zfs_namecheck.h"
+
+/*
+ * Needed to close a window in dnode_move() that allows the objset to be freed
+ * before it can be safely accessed.
+ */
+krwlock_t os_lock;
+
+/*
+ * Tunable to overwrite the maximum number of threads for the parallelization
+ * of dmu_objset_find_dp, needed to speed up the import of pools with many
+ * datasets.
+ * Default is 4 times the number of leaf vdevs.
+ */
+int dmu_find_threads = 0;
+
+/*
+ * Backfill lower metadnode objects after this many have been freed.
+ * Backfilling negatively impacts object creation rates, so only do it
+ * if there are enough holes to fill.
+ */
+int dmu_rescan_dnode_threshold = 1 << DN_MAX_INDBLKSHIFT;
+
+static char *upgrade_tag = "upgrade_tag";
+
+static void dmu_objset_find_dp_cb(void *arg);
+
+static void dmu_objset_upgrade(objset_t *os, dmu_objset_upgrade_cb_t cb);
+static void dmu_objset_upgrade_stop(objset_t *os);
+
+void
+dmu_objset_init(void)
+{
+ rw_init(&os_lock, NULL, RW_DEFAULT, NULL);
+}
+
+void
+dmu_objset_fini(void)
+{
+ rw_destroy(&os_lock);
+}
+
+spa_t *
+dmu_objset_spa(objset_t *os)
+{
+ return (os->os_spa);
+}
+
+zilog_t *
+dmu_objset_zil(objset_t *os)
+{
+ return (os->os_zil);
+}
+
+dsl_pool_t *
+dmu_objset_pool(objset_t *os)
+{
+ dsl_dataset_t *ds;
+
+ if ((ds = os->os_dsl_dataset) != NULL && ds->ds_dir)
+ return (ds->ds_dir->dd_pool);
+ else
+ return (spa_get_dsl(os->os_spa));
+}
+
+dsl_dataset_t *
+dmu_objset_ds(objset_t *os)
+{
+ return (os->os_dsl_dataset);
+}
+
+dmu_objset_type_t
+dmu_objset_type(objset_t *os)
+{
+ return (os->os_phys->os_type);
+}
+
+void
+dmu_objset_name(objset_t *os, char *buf)
+{
+ dsl_dataset_name(os->os_dsl_dataset, buf);
+}
+
+uint64_t
+dmu_objset_id(objset_t *os)
+{
+ dsl_dataset_t *ds = os->os_dsl_dataset;
+
+ return (ds ? ds->ds_object : 0);
+}
+
+uint64_t
+dmu_objset_dnodesize(objset_t *os)
+{
+ return (os->os_dnodesize);
+}
+
+zfs_sync_type_t
+dmu_objset_syncprop(objset_t *os)
+{
+ return (os->os_sync);
+}
+
+zfs_logbias_op_t
+dmu_objset_logbias(objset_t *os)
+{
+ return (os->os_logbias);
+}
+
+static void
+checksum_changed_cb(void *arg, uint64_t newval)
+{
+ objset_t *os = arg;
+
+ /*
+ * Inheritance should have been done by now.
+ */
+ ASSERT(newval != ZIO_CHECKSUM_INHERIT);
+
+ os->os_checksum = zio_checksum_select(newval, ZIO_CHECKSUM_ON_VALUE);
+}
+
+static void
+compression_changed_cb(void *arg, uint64_t newval)
+{
+ objset_t *os = arg;
+
+ /*
+ * Inheritance and range checking should have been done by now.
+ */
+ ASSERT(newval != ZIO_COMPRESS_INHERIT);
+
+ os->os_compress = zio_compress_select(os->os_spa,
+ ZIO_COMPRESS_ALGO(newval), ZIO_COMPRESS_ON);
+ os->os_complevel = zio_complevel_select(os->os_spa, os->os_compress,
+ ZIO_COMPRESS_LEVEL(newval), ZIO_COMPLEVEL_DEFAULT);
+}
+
+static void
+copies_changed_cb(void *arg, uint64_t newval)
+{
+ objset_t *os = arg;
+
+ /*
+ * Inheritance and range checking should have been done by now.
+ */
+ ASSERT(newval > 0);
+ ASSERT(newval <= spa_max_replication(os->os_spa));
+
+ os->os_copies = newval;
+}
+
+static void
+dedup_changed_cb(void *arg, uint64_t newval)
+{
+ objset_t *os = arg;
+ spa_t *spa = os->os_spa;
+ enum zio_checksum checksum;
+
+ /*
+ * Inheritance should have been done by now.
+ */
+ ASSERT(newval != ZIO_CHECKSUM_INHERIT);
+
+ checksum = zio_checksum_dedup_select(spa, newval, ZIO_CHECKSUM_OFF);
+
+ os->os_dedup_checksum = checksum & ZIO_CHECKSUM_MASK;
+ os->os_dedup_verify = !!(checksum & ZIO_CHECKSUM_VERIFY);
+}
+
+static void
+primary_cache_changed_cb(void *arg, uint64_t newval)
+{
+ objset_t *os = arg;
+
+ /*
+ * Inheritance and range checking should have been done by now.
+ */
+ ASSERT(newval == ZFS_CACHE_ALL || newval == ZFS_CACHE_NONE ||
+ newval == ZFS_CACHE_METADATA);
+
+ os->os_primary_cache = newval;
+}
+
+static void
+secondary_cache_changed_cb(void *arg, uint64_t newval)
+{
+ objset_t *os = arg;
+
+ /*
+ * Inheritance and range checking should have been done by now.
+ */
+ ASSERT(newval == ZFS_CACHE_ALL || newval == ZFS_CACHE_NONE ||
+ newval == ZFS_CACHE_METADATA);
+
+ os->os_secondary_cache = newval;
+}
+
+static void
+sync_changed_cb(void *arg, uint64_t newval)
+{
+ objset_t *os = arg;
+
+ /*
+ * Inheritance and range checking should have been done by now.
+ */
+ ASSERT(newval == ZFS_SYNC_STANDARD || newval == ZFS_SYNC_ALWAYS ||
+ newval == ZFS_SYNC_DISABLED);
+
+ os->os_sync = newval;
+ if (os->os_zil)
+ zil_set_sync(os->os_zil, newval);
+}
+
+static void
+redundant_metadata_changed_cb(void *arg, uint64_t newval)
+{
+ objset_t *os = arg;
+
+ /*
+ * Inheritance and range checking should have been done by now.
+ */
+ ASSERT(newval == ZFS_REDUNDANT_METADATA_ALL ||
+ newval == ZFS_REDUNDANT_METADATA_MOST);
+
+ os->os_redundant_metadata = newval;
+}
+
+static void
+dnodesize_changed_cb(void *arg, uint64_t newval)
+{
+ objset_t *os = arg;
+
+ switch (newval) {
+ case ZFS_DNSIZE_LEGACY:
+ os->os_dnodesize = DNODE_MIN_SIZE;
+ break;
+ case ZFS_DNSIZE_AUTO:
+ /*
+ * Choose a dnode size that will work well for most
+ * workloads if the user specified "auto". Future code
+ * improvements could dynamically select a dnode size
+ * based on observed workload patterns.
+ */
+ os->os_dnodesize = DNODE_MIN_SIZE * 2;
+ break;
+ case ZFS_DNSIZE_1K:
+ case ZFS_DNSIZE_2K:
+ case ZFS_DNSIZE_4K:
+ case ZFS_DNSIZE_8K:
+ case ZFS_DNSIZE_16K:
+ os->os_dnodesize = newval;
+ break;
+ }
+}
+
+static void
+smallblk_changed_cb(void *arg, uint64_t newval)
+{
+ objset_t *os = arg;
+
+ /*
+ * Inheritance and range checking should have been done by now.
+ */
+ ASSERT(newval <= SPA_MAXBLOCKSIZE);
+ ASSERT(ISP2(newval));
+
+ os->os_zpl_special_smallblock = newval;
+}
+
+static void
+logbias_changed_cb(void *arg, uint64_t newval)
+{
+ objset_t *os = arg;
+
+ ASSERT(newval == ZFS_LOGBIAS_LATENCY ||
+ newval == ZFS_LOGBIAS_THROUGHPUT);
+ os->os_logbias = newval;
+ if (os->os_zil)
+ zil_set_logbias(os->os_zil, newval);
+}
+
+static void
+recordsize_changed_cb(void *arg, uint64_t newval)
+{
+ objset_t *os = arg;
+
+ os->os_recordsize = newval;
+}
+
+void
+dmu_objset_byteswap(void *buf, size_t size)
+{
+ objset_phys_t *osp = buf;
+
+ ASSERT(size == OBJSET_PHYS_SIZE_V1 || size == OBJSET_PHYS_SIZE_V2 ||
+ size == sizeof (objset_phys_t));
+ dnode_byteswap(&osp->os_meta_dnode);
+ byteswap_uint64_array(&osp->os_zil_header, sizeof (zil_header_t));
+ osp->os_type = BSWAP_64(osp->os_type);
+ osp->os_flags = BSWAP_64(osp->os_flags);
+ if (size >= OBJSET_PHYS_SIZE_V2) {
+ dnode_byteswap(&osp->os_userused_dnode);
+ dnode_byteswap(&osp->os_groupused_dnode);
+ if (size >= sizeof (objset_phys_t))
+ dnode_byteswap(&osp->os_projectused_dnode);
+ }
+}
+
+/*
+ * The hash is a CRC-based hash of the objset_t pointer and the object number.
+ */
+static uint64_t
+dnode_hash(const objset_t *os, uint64_t obj)
+{
+ uintptr_t osv = (uintptr_t)os;
+ uint64_t crc = -1ULL;
+
+ ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
+ /*
+ * The low 6 bits of the pointer don't have much entropy, because
+	 * the objset_t is more than 2^6 bytes long.
+ */
+ crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF];
+ crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF];
+ crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF];
+ crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 16)) & 0xFF];
+
+ crc ^= (osv>>14) ^ (obj>>24);
+
+ return (crc);
+}
+
+static unsigned int
+dnode_multilist_index_func(multilist_t *ml, void *obj)
+{
+ dnode_t *dn = obj;
+ return (dnode_hash(dn->dn_objset, dn->dn_object) %
+ multilist_get_num_sublists(ml));
+}
+
+/*
+ * Instantiates the objset_t in-memory structure corresponding to the
+ * objset_phys_t that's pointed to by the specified blkptr_t.
+ */
+int
+dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
+ objset_t **osp)
+{
+ objset_t *os;
+ int i, err;
+
+ ASSERT(ds == NULL || MUTEX_HELD(&ds->ds_opening_lock));
+ ASSERT(!BP_IS_REDACTED(bp));
+
+ /*
+ * We need the pool config lock to get properties.
+ */
+ ASSERT(ds == NULL || dsl_pool_config_held(ds->ds_dir->dd_pool));
+
+ /*
+ * The $ORIGIN dataset (if it exists) doesn't have an associated
+ * objset, so there's no reason to open it. The $ORIGIN dataset
+ * will not exist on pools older than SPA_VERSION_ORIGIN.
+ */
+ if (ds != NULL && spa_get_dsl(spa) != NULL &&
+ spa_get_dsl(spa)->dp_origin_snap != NULL) {
+ ASSERT3P(ds->ds_dir, !=,
+ spa_get_dsl(spa)->dp_origin_snap->ds_dir);
+ }
+
+ os = kmem_zalloc(sizeof (objset_t), KM_SLEEP);
+ os->os_dsl_dataset = ds;
+ os->os_spa = spa;
+ os->os_rootbp = bp;
+ if (!BP_IS_HOLE(os->os_rootbp)) {
+ arc_flags_t aflags = ARC_FLAG_WAIT;
+ zbookmark_phys_t zb;
+ int size;
+ enum zio_flag zio_flags = ZIO_FLAG_CANFAIL;
+ SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
+ ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
+
+ if (DMU_OS_IS_L2CACHEABLE(os))
+ aflags |= ARC_FLAG_L2CACHE;
+
+ if (ds != NULL && ds->ds_dir->dd_crypto_obj != 0) {
+ ASSERT3U(BP_GET_COMPRESS(bp), ==, ZIO_COMPRESS_OFF);
+ ASSERT(BP_IS_AUTHENTICATED(bp));
+ zio_flags |= ZIO_FLAG_RAW;
+ }
+
+ dprintf_bp(os->os_rootbp, "reading %s", "");
+ err = arc_read(NULL, spa, os->os_rootbp,
+ arc_getbuf_func, &os->os_phys_buf,
+ ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb);
+ if (err != 0) {
+ kmem_free(os, sizeof (objset_t));
+ /* convert checksum errors into IO errors */
+ if (err == ECKSUM)
+ err = SET_ERROR(EIO);
+ return (err);
+ }
+
+ if (spa_version(spa) < SPA_VERSION_USERSPACE)
+ size = OBJSET_PHYS_SIZE_V1;
+ else if (!spa_feature_is_enabled(spa,
+ SPA_FEATURE_PROJECT_QUOTA))
+ size = OBJSET_PHYS_SIZE_V2;
+ else
+ size = sizeof (objset_phys_t);
+
+ /* Increase the blocksize if we are permitted. */
+ if (arc_buf_size(os->os_phys_buf) < size) {
+ arc_buf_t *buf = arc_alloc_buf(spa, &os->os_phys_buf,
+ ARC_BUFC_METADATA, size);
+ bzero(buf->b_data, size);
+ bcopy(os->os_phys_buf->b_data, buf->b_data,
+ arc_buf_size(os->os_phys_buf));
+ arc_buf_destroy(os->os_phys_buf, &os->os_phys_buf);
+ os->os_phys_buf = buf;
+ }
+
+ os->os_phys = os->os_phys_buf->b_data;
+ os->os_flags = os->os_phys->os_flags;
+ } else {
+ int size = spa_version(spa) >= SPA_VERSION_USERSPACE ?
+ sizeof (objset_phys_t) : OBJSET_PHYS_SIZE_V1;
+ os->os_phys_buf = arc_alloc_buf(spa, &os->os_phys_buf,
+ ARC_BUFC_METADATA, size);
+ os->os_phys = os->os_phys_buf->b_data;
+ bzero(os->os_phys, size);
+ }
+ /*
+ * These properties will be filled in by the logic in zfs_get_zplprop()
+ * when they are queried for the first time.
+ */
+ os->os_version = OBJSET_PROP_UNINITIALIZED;
+ os->os_normalization = OBJSET_PROP_UNINITIALIZED;
+ os->os_utf8only = OBJSET_PROP_UNINITIALIZED;
+ os->os_casesensitivity = OBJSET_PROP_UNINITIALIZED;
+
+ /*
+ * Note: the changed_cb will be called once before the register
+ * func returns, thus changing the checksum/compression from the
+ * default (fletcher2/off). Snapshots don't need to know about
+ * checksum/compression/copies.
+ */
+ if (ds != NULL) {
+ os->os_encrypted = (ds->ds_dir->dd_crypto_obj != 0);
+
+ err = dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_PRIMARYCACHE),
+ primary_cache_changed_cb, os);
+ if (err == 0) {
+ err = dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_SECONDARYCACHE),
+ secondary_cache_changed_cb, os);
+ }
+ if (!ds->ds_is_snapshot) {
+ if (err == 0) {
+ err = dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_CHECKSUM),
+ checksum_changed_cb, os);
+ }
+ if (err == 0) {
+ err = dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_COMPRESSION),
+ compression_changed_cb, os);
+ }
+ if (err == 0) {
+ err = dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_COPIES),
+ copies_changed_cb, os);
+ }
+ if (err == 0) {
+ err = dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_DEDUP),
+ dedup_changed_cb, os);
+ }
+ if (err == 0) {
+ err = dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_LOGBIAS),
+ logbias_changed_cb, os);
+ }
+ if (err == 0) {
+ err = dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_SYNC),
+ sync_changed_cb, os);
+ }
+ if (err == 0) {
+ err = dsl_prop_register(ds,
+ zfs_prop_to_name(
+ ZFS_PROP_REDUNDANT_METADATA),
+ redundant_metadata_changed_cb, os);
+ }
+ if (err == 0) {
+ err = dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_RECORDSIZE),
+ recordsize_changed_cb, os);
+ }
+ if (err == 0) {
+ err = dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_DNODESIZE),
+ dnodesize_changed_cb, os);
+ }
+ if (err == 0) {
+ err = dsl_prop_register(ds,
+ zfs_prop_to_name(
+ ZFS_PROP_SPECIAL_SMALL_BLOCKS),
+ smallblk_changed_cb, os);
+ }
+ }
+ if (err != 0) {
+ arc_buf_destroy(os->os_phys_buf, &os->os_phys_buf);
+ kmem_free(os, sizeof (objset_t));
+ return (err);
+ }
+ } else {
+ /* It's the meta-objset. */
+ os->os_checksum = ZIO_CHECKSUM_FLETCHER_4;
+ os->os_compress = ZIO_COMPRESS_ON;
+ os->os_complevel = ZIO_COMPLEVEL_DEFAULT;
+ os->os_encrypted = B_FALSE;
+ os->os_copies = spa_max_replication(spa);
+ os->os_dedup_checksum = ZIO_CHECKSUM_OFF;
+ os->os_dedup_verify = B_FALSE;
+ os->os_logbias = ZFS_LOGBIAS_LATENCY;
+ os->os_sync = ZFS_SYNC_STANDARD;
+ os->os_primary_cache = ZFS_CACHE_ALL;
+ os->os_secondary_cache = ZFS_CACHE_ALL;
+ os->os_dnodesize = DNODE_MIN_SIZE;
+ }
+
+ if (ds == NULL || !ds->ds_is_snapshot)
+ os->os_zil_header = os->os_phys->os_zil_header;
+ os->os_zil = zil_alloc(os, &os->os_zil_header);
+
+ for (i = 0; i < TXG_SIZE; i++) {
+ os->os_dirty_dnodes[i] = multilist_create(sizeof (dnode_t),
+ offsetof(dnode_t, dn_dirty_link[i]),
+ dnode_multilist_index_func);
+ }
+ list_create(&os->os_dnodes, sizeof (dnode_t),
+ offsetof(dnode_t, dn_link));
+ list_create(&os->os_downgraded_dbufs, sizeof (dmu_buf_impl_t),
+ offsetof(dmu_buf_impl_t, db_link));
+
+ list_link_init(&os->os_evicting_node);
+
+ mutex_init(&os->os_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&os->os_userused_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&os->os_obj_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&os->os_user_ptr_lock, NULL, MUTEX_DEFAULT, NULL);
+ os->os_obj_next_percpu_len = boot_ncpus;
+ os->os_obj_next_percpu = kmem_zalloc(os->os_obj_next_percpu_len *
+ sizeof (os->os_obj_next_percpu[0]), KM_SLEEP);
+
+ dnode_special_open(os, &os->os_phys->os_meta_dnode,
+ DMU_META_DNODE_OBJECT, &os->os_meta_dnode);
+ if (OBJSET_BUF_HAS_USERUSED(os->os_phys_buf)) {
+ dnode_special_open(os, &os->os_phys->os_userused_dnode,
+ DMU_USERUSED_OBJECT, &os->os_userused_dnode);
+ dnode_special_open(os, &os->os_phys->os_groupused_dnode,
+ DMU_GROUPUSED_OBJECT, &os->os_groupused_dnode);
+ if (OBJSET_BUF_HAS_PROJECTUSED(os->os_phys_buf))
+ dnode_special_open(os,
+ &os->os_phys->os_projectused_dnode,
+ DMU_PROJECTUSED_OBJECT, &os->os_projectused_dnode);
+ }
+
+ mutex_init(&os->os_upgrade_lock, NULL, MUTEX_DEFAULT, NULL);
+
+ *osp = os;
+ return (0);
+}
+
+int
+dmu_objset_from_ds(dsl_dataset_t *ds, objset_t **osp)
+{
+ int err = 0;
+
+ /*
+ * We need the pool_config lock to manipulate the dsl_dataset_t.
+ * Even if the dataset is long-held, we need the pool_config lock
+ * to open the objset, as it needs to get properties.
+ */
+ ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool));
+
+ mutex_enter(&ds->ds_opening_lock);
+ if (ds->ds_objset == NULL) {
+ objset_t *os;
+ rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
+ err = dmu_objset_open_impl(dsl_dataset_get_spa(ds),
+ ds, dsl_dataset_get_blkptr(ds), &os);
+ rrw_exit(&ds->ds_bp_rwlock, FTAG);
+
+ if (err == 0) {
+ mutex_enter(&ds->ds_lock);
+ ASSERT(ds->ds_objset == NULL);
+ ds->ds_objset = os;
+ mutex_exit(&ds->ds_lock);
+ }
+ }
+ *osp = ds->ds_objset;
+ mutex_exit(&ds->ds_opening_lock);
+ return (err);
+}
+
+/*
+ * Holds the pool while the objset is held. Therefore only one objset
+ * can be held at a time.
+ */
+int
+dmu_objset_hold_flags(const char *name, boolean_t decrypt, void *tag,
+ objset_t **osp)
+{
+ dsl_pool_t *dp;
+ dsl_dataset_t *ds;
+ int err;
+ ds_hold_flags_t flags;
+
+ flags = (decrypt) ? DS_HOLD_FLAG_DECRYPT : DS_HOLD_FLAG_NONE;
+ err = dsl_pool_hold(name, tag, &dp);
+ if (err != 0)
+ return (err);
+ err = dsl_dataset_hold_flags(dp, name, flags, tag, &ds);
+ if (err != 0) {
+ dsl_pool_rele(dp, tag);
+ return (err);
+ }
+
+ err = dmu_objset_from_ds(ds, osp);
+ if (err != 0) {
+ dsl_dataset_rele(ds, tag);
+ dsl_pool_rele(dp, tag);
+ }
+
+ return (err);
+}
+
+int
+dmu_objset_hold(const char *name, void *tag, objset_t **osp)
+{
+ return (dmu_objset_hold_flags(name, B_FALSE, tag, osp));
+}
+
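+/*
+ * Example (illustrative only; the dataset name below is made up): a
+ * typical read-only access pattern pairs dmu_objset_hold() with
+ * dmu_objset_rele():
+ *
+ *	objset_t *os;
+ *	int err = dmu_objset_hold("tank/fs", FTAG, &os);
+ *	if (err == 0) {
+ *		(void) dmu_objset_type(os);
+ *		dmu_objset_rele(os, FTAG);
+ *	}
+ *
+ * As noted above, the hold also holds the pool, so a caller may only
+ * hold one objset at a time this way.
+ */
+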
+static int
+dmu_objset_own_impl(dsl_dataset_t *ds, dmu_objset_type_t type,
+ boolean_t readonly, boolean_t decrypt, void *tag, objset_t **osp)
+{
+ int err;
+
+ err = dmu_objset_from_ds(ds, osp);
+ if (err != 0) {
+ return (err);
+ } else if (type != DMU_OST_ANY && type != (*osp)->os_phys->os_type) {
+ return (SET_ERROR(EINVAL));
+ } else if (!readonly && dsl_dataset_is_snapshot(ds)) {
+ return (SET_ERROR(EROFS));
+ } else if (!readonly && decrypt &&
+ dsl_dir_incompatible_encryption_version(ds->ds_dir)) {
+ return (SET_ERROR(EROFS));
+ }
+
+ /* if we are decrypting, we can now check MACs in os->os_phys_buf */
+ if (decrypt && arc_is_unauthenticated((*osp)->os_phys_buf)) {
+ zbookmark_phys_t zb;
+
+ SET_BOOKMARK(&zb, ds->ds_object, ZB_ROOT_OBJECT,
+ ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
+ err = arc_untransform((*osp)->os_phys_buf, (*osp)->os_spa,
+ &zb, B_FALSE);
+ if (err != 0)
+ return (err);
+
+ ASSERT0(arc_is_unauthenticated((*osp)->os_phys_buf));
+ }
+
+ return (0);
+}
+
+/*
+ * dsl_pool must not be held when this is called.
+ * Upon successful return, there will be a longhold on the dataset,
+ * and the dsl_pool will not be held.
+ */
+int
+dmu_objset_own(const char *name, dmu_objset_type_t type,
+ boolean_t readonly, boolean_t decrypt, void *tag, objset_t **osp)
+{
+ dsl_pool_t *dp;
+ dsl_dataset_t *ds;
+ int err;
+ ds_hold_flags_t flags;
+
+ flags = (decrypt) ? DS_HOLD_FLAG_DECRYPT : DS_HOLD_FLAG_NONE;
+ err = dsl_pool_hold(name, FTAG, &dp);
+ if (err != 0)
+ return (err);
+ err = dsl_dataset_own(dp, name, flags, tag, &ds);
+ if (err != 0) {
+ dsl_pool_rele(dp, FTAG);
+ return (err);
+ }
+ err = dmu_objset_own_impl(ds, type, readonly, decrypt, tag, osp);
+ if (err != 0) {
+ dsl_dataset_disown(ds, flags, tag);
+ dsl_pool_rele(dp, FTAG);
+ return (err);
+ }
+
+ /*
+ * User accounting requires the dataset to be decrypted and rw.
+ * We also don't begin user accounting during claiming to help
+ * speed up pool import times and to keep this txg reserved
+ * completely for recovery work.
+ */
+ if (!readonly && !dp->dp_spa->spa_claiming &&
+ (ds->ds_dir->dd_crypto_obj == 0 || decrypt)) {
+ if (dmu_objset_userobjspace_upgradable(*osp) ||
+ dmu_objset_projectquota_upgradable(*osp)) {
+ dmu_objset_id_quota_upgrade(*osp);
+ } else if (dmu_objset_userused_enabled(*osp)) {
+ dmu_objset_userspace_upgrade(*osp);
+ }
+ }
+
+ dsl_pool_rele(dp, FTAG);
+ return (0);
+}
+
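+/*
+ * Example (illustrative only; the dataset name is hypothetical):
+ * ownership is the long-lived counterpart of a hold and must be
+ * released with dmu_objset_disown() using the same tag and decrypt
+ * setting:
+ *
+ *	objset_t *os;
+ *	int err = dmu_objset_own("tank/fs", DMU_OST_ZFS,
+ *	    B_TRUE /* readonly */, B_FALSE /* decrypt */, FTAG, &os);
+ *	if (err == 0) {
+ *		...
+ *		dmu_objset_disown(os, B_FALSE, FTAG);
+ *	}
+ */
+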
+int
+dmu_objset_own_obj(dsl_pool_t *dp, uint64_t obj, dmu_objset_type_t type,
+ boolean_t readonly, boolean_t decrypt, void *tag, objset_t **osp)
+{
+ dsl_dataset_t *ds;
+ int err;
+ ds_hold_flags_t flags;
+
+ flags = (decrypt) ? DS_HOLD_FLAG_DECRYPT : DS_HOLD_FLAG_NONE;
+ err = dsl_dataset_own_obj(dp, obj, flags, tag, &ds);
+ if (err != 0)
+ return (err);
+
+ err = dmu_objset_own_impl(ds, type, readonly, decrypt, tag, osp);
+ if (err != 0) {
+ dsl_dataset_disown(ds, flags, tag);
+ return (err);
+ }
+
+ return (0);
+}
+
+void
+dmu_objset_rele_flags(objset_t *os, boolean_t decrypt, void *tag)
+{
+ ds_hold_flags_t flags;
+ dsl_pool_t *dp = dmu_objset_pool(os);
+
+ flags = (decrypt) ? DS_HOLD_FLAG_DECRYPT : DS_HOLD_FLAG_NONE;
+ dsl_dataset_rele_flags(os->os_dsl_dataset, flags, tag);
+ dsl_pool_rele(dp, tag);
+}
+
+void
+dmu_objset_rele(objset_t *os, void *tag)
+{
+ dmu_objset_rele_flags(os, B_FALSE, tag);
+}
+
+/*
+ * When we are called, os MUST refer to an objset associated with a dataset
+ * that is owned by 'tag'; that is, is held and long held by 'tag' and ds_owner
+ * == tag. We will then release and reacquire ownership of the dataset while
+ * holding the pool config_rwlock so that no intervening namespace or
+ * ownership changes can occur.
+ *
+ * This exists solely to accommodate zfs_ioc_userspace_upgrade()'s desire to
+ * release the hold on its dataset and acquire a new one on the dataset of the
+ * same name so that it can be partially torn down and reconstructed.
+ */
+void
+dmu_objset_refresh_ownership(dsl_dataset_t *ds, dsl_dataset_t **newds,
+ boolean_t decrypt, void *tag)
+{
+ dsl_pool_t *dp;
+ char name[ZFS_MAX_DATASET_NAME_LEN];
+ ds_hold_flags_t flags;
+
+ flags = (decrypt) ? DS_HOLD_FLAG_DECRYPT : DS_HOLD_FLAG_NONE;
+ VERIFY3P(ds, !=, NULL);
+ VERIFY3P(ds->ds_owner, ==, tag);
+ VERIFY(dsl_dataset_long_held(ds));
+
+ dsl_dataset_name(ds, name);
+ dp = ds->ds_dir->dd_pool;
+ dsl_pool_config_enter(dp, FTAG);
+ dsl_dataset_disown(ds, flags, tag);
+ VERIFY0(dsl_dataset_own(dp, name, flags, tag, newds));
+ dsl_pool_config_exit(dp, FTAG);
+}
+
+void
+dmu_objset_disown(objset_t *os, boolean_t decrypt, void *tag)
+{
+ ds_hold_flags_t flags;
+
+ flags = (decrypt) ? DS_HOLD_FLAG_DECRYPT : DS_HOLD_FLAG_NONE;
+ /*
+ * Stop upgrading thread
+ */
+ dmu_objset_upgrade_stop(os);
+ dsl_dataset_disown(os->os_dsl_dataset, flags, tag);
+}
+
+void
+dmu_objset_evict_dbufs(objset_t *os)
+{
+ dnode_t *dn_marker;
+ dnode_t *dn;
+
+ dn_marker = kmem_alloc(sizeof (dnode_t), KM_SLEEP);
+
+ mutex_enter(&os->os_lock);
+ dn = list_head(&os->os_dnodes);
+ while (dn != NULL) {
+ /*
+ * Skip dnodes without holds. We have to do this dance
+ * because dnode_add_ref() only works if there is already a
+ * hold. If the dnode has no holds, then it has no dbufs.
+ */
+ if (dnode_add_ref(dn, FTAG)) {
+ list_insert_after(&os->os_dnodes, dn, dn_marker);
+ mutex_exit(&os->os_lock);
+
+ dnode_evict_dbufs(dn);
+ dnode_rele(dn, FTAG);
+
+ mutex_enter(&os->os_lock);
+ dn = list_next(&os->os_dnodes, dn_marker);
+ list_remove(&os->os_dnodes, dn_marker);
+ } else {
+ dn = list_next(&os->os_dnodes, dn);
+ }
+ }
+ mutex_exit(&os->os_lock);
+
+ kmem_free(dn_marker, sizeof (dnode_t));
+
+ if (DMU_USERUSED_DNODE(os) != NULL) {
+ if (DMU_PROJECTUSED_DNODE(os) != NULL)
+ dnode_evict_dbufs(DMU_PROJECTUSED_DNODE(os));
+ dnode_evict_dbufs(DMU_GROUPUSED_DNODE(os));
+ dnode_evict_dbufs(DMU_USERUSED_DNODE(os));
+ }
+ dnode_evict_dbufs(DMU_META_DNODE(os));
+}
+
+/*
+ * Objset eviction processing is split into two pieces.
+ * The first marks the objset as evicting, evicts any dbufs that
+ * have a refcount of zero, and then queues up the objset for the
+ * second phase of eviction. Once os->os_dnodes has been cleared by
+ * dnode_buf_pageout()->dnode_destroy(), the second phase is executed.
+ * The second phase closes the special dnodes, dequeues the objset from
+ * the list of those undergoing eviction, and finally frees the objset.
+ *
+ * NOTE: Due to asynchronous eviction processing (invocation of
+ * dnode_buf_pageout()), it is possible for the meta dnode for the
+ * objset to have no holds even though os->os_dnodes is not empty.
+ */
+void
+dmu_objset_evict(objset_t *os)
+{
+ dsl_dataset_t *ds = os->os_dsl_dataset;
+
+ for (int t = 0; t < TXG_SIZE; t++)
+ ASSERT(!dmu_objset_is_dirty(os, t));
+
+ if (ds)
+ dsl_prop_unregister_all(ds, os);
+
+ if (os->os_sa)
+ sa_tear_down(os);
+
+ dmu_objset_evict_dbufs(os);
+
+ mutex_enter(&os->os_lock);
+ spa_evicting_os_register(os->os_spa, os);
+ if (list_is_empty(&os->os_dnodes)) {
+ mutex_exit(&os->os_lock);
+ dmu_objset_evict_done(os);
+ } else {
+ mutex_exit(&os->os_lock);
+ }
+}
+
+void
+dmu_objset_evict_done(objset_t *os)
+{
+ ASSERT3P(list_head(&os->os_dnodes), ==, NULL);
+
+ dnode_special_close(&os->os_meta_dnode);
+ if (DMU_USERUSED_DNODE(os)) {
+ if (DMU_PROJECTUSED_DNODE(os))
+ dnode_special_close(&os->os_projectused_dnode);
+ dnode_special_close(&os->os_userused_dnode);
+ dnode_special_close(&os->os_groupused_dnode);
+ }
+ zil_free(os->os_zil);
+
+ arc_buf_destroy(os->os_phys_buf, &os->os_phys_buf);
+
+ /*
+ * This is a barrier to prevent the objset from going away in
+ * dnode_move() until we can safely ensure that the objset is still in
+ * use. We consider the objset valid before the barrier and invalid
+ * after the barrier.
+ */
+ rw_enter(&os_lock, RW_READER);
+ rw_exit(&os_lock);
+
+ kmem_free(os->os_obj_next_percpu,
+ os->os_obj_next_percpu_len * sizeof (os->os_obj_next_percpu[0]));
+
+ mutex_destroy(&os->os_lock);
+ mutex_destroy(&os->os_userused_lock);
+ mutex_destroy(&os->os_obj_lock);
+ mutex_destroy(&os->os_user_ptr_lock);
+ mutex_destroy(&os->os_upgrade_lock);
+ for (int i = 0; i < TXG_SIZE; i++) {
+ multilist_destroy(os->os_dirty_dnodes[i]);
+ }
+ spa_evicting_os_deregister(os->os_spa, os);
+ kmem_free(os, sizeof (objset_t));
+}
+
+inode_timespec_t
+dmu_objset_snap_cmtime(objset_t *os)
+{
+ return (dsl_dir_snap_cmtime(os->os_dsl_dataset->ds_dir));
+}
+
+objset_t *
+dmu_objset_create_impl_dnstats(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
+ dmu_objset_type_t type, int levels, int blksz, int ibs, dmu_tx_t *tx)
+{
+ objset_t *os;
+ dnode_t *mdn;
+
+ ASSERT(dmu_tx_is_syncing(tx));
+
+ if (blksz == 0)
+ blksz = DNODE_BLOCK_SIZE;
+ if (ibs == 0)
+ ibs = DN_MAX_INDBLKSHIFT;
+
+ if (ds != NULL)
+ VERIFY0(dmu_objset_from_ds(ds, &os));
+ else
+ VERIFY0(dmu_objset_open_impl(spa, NULL, bp, &os));
+
+ mdn = DMU_META_DNODE(os);
+
+ dnode_allocate(mdn, DMU_OT_DNODE, blksz, ibs, DMU_OT_NONE, 0,
+ DNODE_MIN_SLOTS, tx);
+
+ /*
+ * We don't want to have to increase the meta-dnode's nlevels
+ * later, because then we could do it in quiescing context while
+ * we are also accessing it in open context.
+ *
+ * This precaution is not necessary for the MOS (ds == NULL),
+ * because the MOS is only updated in syncing context.
+ * This is most fortunate: the MOS is the only objset that
+ * needs to be synced multiple times as spa_sync() iterates
+ * to convergence, so minimizing its dn_nlevels matters.
+ */
+ if (ds != NULL) {
+ if (levels == 0) {
+ levels = 1;
+
+ /*
+ * Determine the number of levels necessary for the
+ * meta-dnode to contain DN_MAX_OBJECT dnodes. Note
+ * that in order to ensure that we do not overflow
+ * 64 bits, there has to be a nlevels that gives us a
+ * number of blocks > DN_MAX_OBJECT but < 2^64.
+ * Therefore, (mdn->dn_indblkshift - SPA_BLKPTRSHIFT)
+ * (10) must be less than (64 - log2(DN_MAX_OBJECT))
+ * (16).
+ */
+ while ((uint64_t)mdn->dn_nblkptr <<
+ (mdn->dn_datablkshift - DNODE_SHIFT + (levels - 1) *
+ (mdn->dn_indblkshift - SPA_BLKPTRSHIFT)) <
+ DN_MAX_OBJECT)
+ levels++;
+ }
+
+ mdn->dn_next_nlevels[tx->tx_txg & TXG_MASK] =
+ mdn->dn_nlevels = levels;
+ }
+
+ ASSERT(type != DMU_OST_NONE);
+ ASSERT(type != DMU_OST_ANY);
+ ASSERT(type < DMU_OST_NUMTYPES);
+ os->os_phys->os_type = type;
+
+ /*
+	 * Mark user accounting as complete if it is enabled and this is
+	 * not an encrypted receive.
+ */
+ if (dmu_objset_userused_enabled(os) &&
+ (!os->os_encrypted || !dmu_objset_is_receiving(os))) {
+ os->os_phys->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE;
+ if (dmu_objset_userobjused_enabled(os)) {
+ ds->ds_feature_activation[
+ SPA_FEATURE_USEROBJ_ACCOUNTING] = (void *)B_TRUE;
+ os->os_phys->os_flags |=
+ OBJSET_FLAG_USEROBJACCOUNTING_COMPLETE;
+ }
+ if (dmu_objset_projectquota_enabled(os)) {
+ ds->ds_feature_activation[
+ SPA_FEATURE_PROJECT_QUOTA] = (void *)B_TRUE;
+ os->os_phys->os_flags |=
+ OBJSET_FLAG_PROJECTQUOTA_COMPLETE;
+ }
+ os->os_flags = os->os_phys->os_flags;
+ }
+
+ dsl_dataset_dirty(ds, tx);
+
+ return (os);
+}
+
+/* called from dsl for meta-objset */
+objset_t *
+dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
+ dmu_objset_type_t type, dmu_tx_t *tx)
+{
+ return (dmu_objset_create_impl_dnstats(spa, ds, bp, type, 0, 0, 0, tx));
+}
+
+typedef struct dmu_objset_create_arg {
+ const char *doca_name;
+ cred_t *doca_cred;
+ proc_t *doca_proc;
+ void (*doca_userfunc)(objset_t *os, void *arg,
+ cred_t *cr, dmu_tx_t *tx);
+ void *doca_userarg;
+ dmu_objset_type_t doca_type;
+ uint64_t doca_flags;
+ dsl_crypto_params_t *doca_dcp;
+} dmu_objset_create_arg_t;
+
+/*ARGSUSED*/
+static int
+dmu_objset_create_check(void *arg, dmu_tx_t *tx)
+{
+ dmu_objset_create_arg_t *doca = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ dsl_dir_t *pdd;
+ dsl_dataset_t *parentds;
+ objset_t *parentos;
+ const char *tail;
+ int error;
+
+ if (strchr(doca->doca_name, '@') != NULL)
+ return (SET_ERROR(EINVAL));
+
+ if (strlen(doca->doca_name) >= ZFS_MAX_DATASET_NAME_LEN)
+ return (SET_ERROR(ENAMETOOLONG));
+
+ if (dataset_nestcheck(doca->doca_name) != 0)
+ return (SET_ERROR(ENAMETOOLONG));
+
+ error = dsl_dir_hold(dp, doca->doca_name, FTAG, &pdd, &tail);
+ if (error != 0)
+ return (error);
+ if (tail == NULL) {
+ dsl_dir_rele(pdd, FTAG);
+ return (SET_ERROR(EEXIST));
+ }
+
+ error = dmu_objset_create_crypt_check(pdd, doca->doca_dcp, NULL);
+ if (error != 0) {
+ dsl_dir_rele(pdd, FTAG);
+ return (error);
+ }
+
+ error = dsl_fs_ss_limit_check(pdd, 1, ZFS_PROP_FILESYSTEM_LIMIT, NULL,
+ doca->doca_cred, doca->doca_proc);
+ if (error != 0) {
+ dsl_dir_rele(pdd, FTAG);
+ return (error);
+ }
+
+	/* can't create below anything but filesystems (e.g. no ZVOLs) */
+ error = dsl_dataset_hold_obj(pdd->dd_pool,
+ dsl_dir_phys(pdd)->dd_head_dataset_obj, FTAG, &parentds);
+ if (error != 0) {
+ dsl_dir_rele(pdd, FTAG);
+ return (error);
+ }
+ error = dmu_objset_from_ds(parentds, &parentos);
+ if (error != 0) {
+ dsl_dataset_rele(parentds, FTAG);
+ dsl_dir_rele(pdd, FTAG);
+ return (error);
+ }
+ if (dmu_objset_type(parentos) != DMU_OST_ZFS) {
+ dsl_dataset_rele(parentds, FTAG);
+ dsl_dir_rele(pdd, FTAG);
+ return (SET_ERROR(ZFS_ERR_WRONG_PARENT));
+ }
+ dsl_dataset_rele(parentds, FTAG);
+ dsl_dir_rele(pdd, FTAG);
+
+ return (error);
+}
+
+static void
+dmu_objset_create_sync(void *arg, dmu_tx_t *tx)
+{
+ dmu_objset_create_arg_t *doca = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ spa_t *spa = dp->dp_spa;
+ dsl_dir_t *pdd;
+ const char *tail;
+ dsl_dataset_t *ds;
+ uint64_t obj;
+ blkptr_t *bp;
+ objset_t *os;
+ zio_t *rzio;
+
+ VERIFY0(dsl_dir_hold(dp, doca->doca_name, FTAG, &pdd, &tail));
+
+ obj = dsl_dataset_create_sync(pdd, tail, NULL, doca->doca_flags,
+ doca->doca_cred, doca->doca_dcp, tx);
+
+ VERIFY0(dsl_dataset_hold_obj_flags(pdd->dd_pool, obj,
+ DS_HOLD_FLAG_DECRYPT, FTAG, &ds));
+ rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
+ bp = dsl_dataset_get_blkptr(ds);
+ os = dmu_objset_create_impl(spa, ds, bp, doca->doca_type, tx);
+ rrw_exit(&ds->ds_bp_rwlock, FTAG);
+
+ if (doca->doca_userfunc != NULL) {
+ doca->doca_userfunc(os, doca->doca_userarg,
+ doca->doca_cred, tx);
+ }
+
+ /*
+ * The doca_userfunc() may write out some data that needs to be
+ * encrypted if the dataset is encrypted (specifically the root
+ * directory). This data must be written out before the encryption
+ * key mapping is removed by dsl_dataset_rele_flags(). Force the
+ * I/O to occur immediately by invoking the relevant sections of
+ * dsl_pool_sync().
+ */
+ if (os->os_encrypted) {
+ dsl_dataset_t *tmpds = NULL;
+ boolean_t need_sync_done = B_FALSE;
+
+ mutex_enter(&ds->ds_lock);
+ ds->ds_owner = FTAG;
+ mutex_exit(&ds->ds_lock);
+
+ rzio = zio_root(spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
+ tmpds = txg_list_remove_this(&dp->dp_dirty_datasets, ds,
+ tx->tx_txg);
+ if (tmpds != NULL) {
+ dsl_dataset_sync(ds, rzio, tx);
+ need_sync_done = B_TRUE;
+ }
+ VERIFY0(zio_wait(rzio));
+
+ dmu_objset_sync_done(os, tx);
+ taskq_wait(dp->dp_sync_taskq);
+ if (txg_list_member(&dp->dp_dirty_datasets, ds, tx->tx_txg)) {
+ ASSERT3P(ds->ds_key_mapping, !=, NULL);
+ key_mapping_rele(spa, ds->ds_key_mapping, ds);
+ }
+
+ rzio = zio_root(spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
+ tmpds = txg_list_remove_this(&dp->dp_dirty_datasets, ds,
+ tx->tx_txg);
+ if (tmpds != NULL) {
+ dmu_buf_rele(ds->ds_dbuf, ds);
+ dsl_dataset_sync(ds, rzio, tx);
+ }
+ VERIFY0(zio_wait(rzio));
+
+ if (need_sync_done) {
+ ASSERT3P(ds->ds_key_mapping, !=, NULL);
+ key_mapping_rele(spa, ds->ds_key_mapping, ds);
+ dsl_dataset_sync_done(ds, tx);
+ }
+
+ mutex_enter(&ds->ds_lock);
+ ds->ds_owner = NULL;
+ mutex_exit(&ds->ds_lock);
+ }
+
+ spa_history_log_internal_ds(ds, "create", tx, " ");
+
+ dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG);
+ dsl_dir_rele(pdd, FTAG);
+}
+
+int
+dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags,
+ dsl_crypto_params_t *dcp, dmu_objset_create_sync_func_t func, void *arg)
+{
+ dmu_objset_create_arg_t doca;
+ dsl_crypto_params_t tmp_dcp = { 0 };
+
+ doca.doca_name = name;
+ doca.doca_cred = CRED();
+ doca.doca_proc = curproc;
+ doca.doca_flags = flags;
+ doca.doca_userfunc = func;
+ doca.doca_userarg = arg;
+ doca.doca_type = type;
+
+ /*
+ * Some callers (mostly for testing) do not provide a dcp on their
+ * own but various code inside the sync task will require it to be
+ * allocated. Rather than adding NULL checks throughout this code
+ * or adding dummy dcp's to all of the callers we simply create a
+ * dummy one here and use that. This zero dcp will have the same
+ * effect as asking for inheritance of all encryption params.
+ */
+ doca.doca_dcp = (dcp != NULL) ? dcp : &tmp_dcp;
+
+ int rv = dsl_sync_task(name,
+ dmu_objset_create_check, dmu_objset_create_sync, &doca,
+ 6, ZFS_SPACE_CHECK_NORMAL);
+
+ if (rv == 0)
+ zvol_create_minor(name);
+ return (rv);
+}
+
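+/*
+ * Example (illustrative only): creating a plain ZFS objset with no
+ * crypto params and no setup callback; NULL dcp and func are accepted
+ * as described above:
+ *
+ *	int err = dmu_objset_create("tank/newfs", DMU_OST_ZFS, 0,
+ *	    NULL, NULL, NULL);
+ */
+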
+typedef struct dmu_objset_clone_arg {
+ const char *doca_clone;
+ const char *doca_origin;
+ cred_t *doca_cred;
+ proc_t *doca_proc;
+} dmu_objset_clone_arg_t;
+
+/*ARGSUSED*/
+static int
+dmu_objset_clone_check(void *arg, dmu_tx_t *tx)
+{
+ dmu_objset_clone_arg_t *doca = arg;
+ dsl_dir_t *pdd;
+ const char *tail;
+ int error;
+ dsl_dataset_t *origin;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+
+ if (strchr(doca->doca_clone, '@') != NULL)
+ return (SET_ERROR(EINVAL));
+
+ if (strlen(doca->doca_clone) >= ZFS_MAX_DATASET_NAME_LEN)
+ return (SET_ERROR(ENAMETOOLONG));
+
+ error = dsl_dir_hold(dp, doca->doca_clone, FTAG, &pdd, &tail);
+ if (error != 0)
+ return (error);
+ if (tail == NULL) {
+ dsl_dir_rele(pdd, FTAG);
+ return (SET_ERROR(EEXIST));
+ }
+
+ error = dsl_fs_ss_limit_check(pdd, 1, ZFS_PROP_FILESYSTEM_LIMIT, NULL,
+ doca->doca_cred, doca->doca_proc);
+ if (error != 0) {
+ dsl_dir_rele(pdd, FTAG);
+ return (SET_ERROR(EDQUOT));
+ }
+
+ error = dsl_dataset_hold(dp, doca->doca_origin, FTAG, &origin);
+ if (error != 0) {
+ dsl_dir_rele(pdd, FTAG);
+ return (error);
+ }
+
+ /* You can only clone snapshots, not the head datasets. */
+ if (!origin->ds_is_snapshot) {
+ dsl_dataset_rele(origin, FTAG);
+ dsl_dir_rele(pdd, FTAG);
+ return (SET_ERROR(EINVAL));
+ }
+
+ dsl_dataset_rele(origin, FTAG);
+ dsl_dir_rele(pdd, FTAG);
+
+ return (0);
+}
+
+static void
+dmu_objset_clone_sync(void *arg, dmu_tx_t *tx)
+{
+ dmu_objset_clone_arg_t *doca = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ dsl_dir_t *pdd;
+ const char *tail;
+ dsl_dataset_t *origin, *ds;
+ uint64_t obj;
+ char namebuf[ZFS_MAX_DATASET_NAME_LEN];
+
+ VERIFY0(dsl_dir_hold(dp, doca->doca_clone, FTAG, &pdd, &tail));
+ VERIFY0(dsl_dataset_hold(dp, doca->doca_origin, FTAG, &origin));
+
+ obj = dsl_dataset_create_sync(pdd, tail, origin, 0,
+ doca->doca_cred, NULL, tx);
+
+ VERIFY0(dsl_dataset_hold_obj(pdd->dd_pool, obj, FTAG, &ds));
+ dsl_dataset_name(origin, namebuf);
+ spa_history_log_internal_ds(ds, "clone", tx,
+ "origin=%s (%llu)", namebuf, (u_longlong_t)origin->ds_object);
+ dsl_dataset_rele(ds, FTAG);
+ dsl_dataset_rele(origin, FTAG);
+ dsl_dir_rele(pdd, FTAG);
+}
+
+int
+dmu_objset_clone(const char *clone, const char *origin)
+{
+ dmu_objset_clone_arg_t doca;
+
+ doca.doca_clone = clone;
+ doca.doca_origin = origin;
+ doca.doca_cred = CRED();
+ doca.doca_proc = curproc;
+
+ int rv = dsl_sync_task(clone,
+ dmu_objset_clone_check, dmu_objset_clone_sync, &doca,
+ 6, ZFS_SPACE_CHECK_NORMAL);
+
+ if (rv == 0)
+ zvol_create_minor(clone);
+
+ return (rv);
+}
+
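+/*
+ * Example (illustrative only): the origin must be a snapshot, per
+ * dmu_objset_clone_check():
+ *
+ *	int err = dmu_objset_clone("tank/clone", "tank/fs@snap");
+ */
+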
+int
+dmu_objset_snapshot_one(const char *fsname, const char *snapname)
+{
+ int err;
+ char *longsnap = kmem_asprintf("%s@%s", fsname, snapname);
+ nvlist_t *snaps = fnvlist_alloc();
+
+ fnvlist_add_boolean(snaps, longsnap);
+ kmem_strfree(longsnap);
+ err = dsl_dataset_snapshot(snaps, NULL, NULL);
+ fnvlist_free(snaps);
+ return (err);
+}
+
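+/*
+ * Example (illustrative only): dmu_objset_snapshot_one() is a thin
+ * wrapper that builds the single-entry "fs@snap" nvlist shown above:
+ *
+ *	int err = dmu_objset_snapshot_one("tank/fs", "today");
+ */
+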
+static void
+dmu_objset_upgrade_task_cb(void *data)
+{
+ objset_t *os = data;
+
+ mutex_enter(&os->os_upgrade_lock);
+ os->os_upgrade_status = EINTR;
+ if (!os->os_upgrade_exit) {
+ int status;
+
+ mutex_exit(&os->os_upgrade_lock);
+
+ status = os->os_upgrade_cb(os);
+
+ mutex_enter(&os->os_upgrade_lock);
+
+ os->os_upgrade_status = status;
+ }
+ os->os_upgrade_exit = B_TRUE;
+ os->os_upgrade_id = 0;
+ mutex_exit(&os->os_upgrade_lock);
+ dsl_dataset_long_rele(dmu_objset_ds(os), upgrade_tag);
+}
+
+static void
+dmu_objset_upgrade(objset_t *os, dmu_objset_upgrade_cb_t cb)
+{
+ if (os->os_upgrade_id != 0)
+ return;
+
+ ASSERT(dsl_pool_config_held(dmu_objset_pool(os)));
+ dsl_dataset_long_hold(dmu_objset_ds(os), upgrade_tag);
+
+ mutex_enter(&os->os_upgrade_lock);
+ if (os->os_upgrade_id == 0 && os->os_upgrade_status == 0) {
+ os->os_upgrade_exit = B_FALSE;
+ os->os_upgrade_cb = cb;
+ os->os_upgrade_id = taskq_dispatch(
+ os->os_spa->spa_upgrade_taskq,
+ dmu_objset_upgrade_task_cb, os, TQ_SLEEP);
+ if (os->os_upgrade_id == TASKQID_INVALID) {
+ dsl_dataset_long_rele(dmu_objset_ds(os), upgrade_tag);
+ os->os_upgrade_status = ENOMEM;
+ }
+ } else {
+ dsl_dataset_long_rele(dmu_objset_ds(os), upgrade_tag);
+ }
+ mutex_exit(&os->os_upgrade_lock);
+}
+
+static void
+dmu_objset_upgrade_stop(objset_t *os)
+{
+ mutex_enter(&os->os_upgrade_lock);
+ os->os_upgrade_exit = B_TRUE;
+ if (os->os_upgrade_id != 0) {
+ taskqid_t id = os->os_upgrade_id;
+
+ os->os_upgrade_id = 0;
+ mutex_exit(&os->os_upgrade_lock);
+
+ if ((taskq_cancel_id(os->os_spa->spa_upgrade_taskq, id)) == 0) {
+ dsl_dataset_long_rele(dmu_objset_ds(os), upgrade_tag);
+ }
+ txg_wait_synced(os->os_spa->spa_dsl_pool, 0);
+ } else {
+ mutex_exit(&os->os_upgrade_lock);
+ }
+}
+
+static void
+dmu_objset_sync_dnodes(multilist_sublist_t *list, dmu_tx_t *tx)
+{
+ dnode_t *dn;
+
+ while ((dn = multilist_sublist_head(list)) != NULL) {
+ ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
+ ASSERT(dn->dn_dbuf->db_data_pending);
+ /*
+ * Initialize dn_zio outside dnode_sync() because the
+ * meta-dnode needs to set it outside dnode_sync().
+ */
+ dn->dn_zio = dn->dn_dbuf->db_data_pending->dr_zio;
+ ASSERT(dn->dn_zio);
+
+ ASSERT3U(dn->dn_nlevels, <=, DN_MAX_LEVELS);
+ multilist_sublist_remove(list, dn);
+
+ /*
+ * See the comment above dnode_rele_task() for an explanation
+ * of why this dnode hold is always needed (even when not
+ * doing user accounting).
+ */
+ multilist_t *newlist = dn->dn_objset->os_synced_dnodes;
+ (void) dnode_add_ref(dn, newlist);
+ multilist_insert(newlist, dn);
+
+ dnode_sync(dn, tx);
+ }
+}
+
+/* ARGSUSED */
+static void
+dmu_objset_write_ready(zio_t *zio, arc_buf_t *abuf, void *arg)
+{
+ blkptr_t *bp = zio->io_bp;
+ objset_t *os = arg;
+ dnode_phys_t *dnp = &os->os_phys->os_meta_dnode;
+ uint64_t fill = 0;
+
+ ASSERT(!BP_IS_EMBEDDED(bp));
+ ASSERT3U(BP_GET_TYPE(bp), ==, DMU_OT_OBJSET);
+ ASSERT0(BP_GET_LEVEL(bp));
+
+ /*
+ * Update rootbp fill count: it should be the number of objects
+ * allocated in the object set (not counting the "special"
+ * objects that are stored in the objset_phys_t -- the meta
+ * dnode and user/group/project accounting objects).
+ */
+ for (int i = 0; i < dnp->dn_nblkptr; i++)
+ fill += BP_GET_FILL(&dnp->dn_blkptr[i]);
+
+ BP_SET_FILL(bp, fill);
+
+ if (os->os_dsl_dataset != NULL)
+ rrw_enter(&os->os_dsl_dataset->ds_bp_rwlock, RW_WRITER, FTAG);
+ *os->os_rootbp = *bp;
+ if (os->os_dsl_dataset != NULL)
+ rrw_exit(&os->os_dsl_dataset->ds_bp_rwlock, FTAG);
+}
+
+/* ARGSUSED */
+static void
+dmu_objset_write_done(zio_t *zio, arc_buf_t *abuf, void *arg)
+{
+ blkptr_t *bp = zio->io_bp;
+ blkptr_t *bp_orig = &zio->io_bp_orig;
+ objset_t *os = arg;
+
+ if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
+ ASSERT(BP_EQUAL(bp, bp_orig));
+ } else {
+ dsl_dataset_t *ds = os->os_dsl_dataset;
+ dmu_tx_t *tx = os->os_synctx;
+
+ (void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE);
+ dsl_dataset_block_born(ds, bp, tx);
+ }
+ kmem_free(bp, sizeof (*bp));
+}
+
+typedef struct sync_dnodes_arg {
+ multilist_t *sda_list;
+ int sda_sublist_idx;
+ multilist_t *sda_newlist;
+ dmu_tx_t *sda_tx;
+} sync_dnodes_arg_t;
+
+static void
+sync_dnodes_task(void *arg)
+{
+ sync_dnodes_arg_t *sda = arg;
+
+ multilist_sublist_t *ms =
+ multilist_sublist_lock(sda->sda_list, sda->sda_sublist_idx);
+
+ dmu_objset_sync_dnodes(ms, sda->sda_tx);
+
+ multilist_sublist_unlock(ms);
+
+ kmem_free(sda, sizeof (*sda));
+}
+
+
+/* called from dsl */
+void
+dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx)
+{
+ int txgoff;
+ zbookmark_phys_t zb;
+ zio_prop_t zp;
+ zio_t *zio;
+ list_t *list;
+ dbuf_dirty_record_t *dr;
+ int num_sublists;
+ multilist_t *ml;
+ blkptr_t *blkptr_copy = kmem_alloc(sizeof (*os->os_rootbp), KM_SLEEP);
+ *blkptr_copy = *os->os_rootbp;
+
+ dprintf_ds(os->os_dsl_dataset, "txg=%llu\n", tx->tx_txg);
+
+ ASSERT(dmu_tx_is_syncing(tx));
+ /* XXX the write_done callback should really give us the tx... */
+ os->os_synctx = tx;
+
+ if (os->os_dsl_dataset == NULL) {
+ /*
+ * This is the MOS. If we have upgraded,
+ * spa_max_replication() could change, so reset
+ * os_copies here.
+ */
+ os->os_copies = spa_max_replication(os->os_spa);
+ }
+
+ /*
+ * Create the root block IO
+ */
+ SET_BOOKMARK(&zb, os->os_dsl_dataset ?
+ os->os_dsl_dataset->ds_object : DMU_META_OBJSET,
+ ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
+ arc_release(os->os_phys_buf, &os->os_phys_buf);
+
+ dmu_write_policy(os, NULL, 0, 0, &zp);
+
+ /*
+ * If we are either claiming the ZIL or doing a raw receive, write
+	 * out the os_phys_buf raw. Neither of these actions will affect the
+ * MAC at this point.
+ */
+ if (os->os_raw_receive ||
+ os->os_next_write_raw[tx->tx_txg & TXG_MASK]) {
+ ASSERT(os->os_encrypted);
+ arc_convert_to_raw(os->os_phys_buf,
+ os->os_dsl_dataset->ds_object, ZFS_HOST_BYTEORDER,
+ DMU_OT_OBJSET, NULL, NULL, NULL);
+ }
+
+ zio = arc_write(pio, os->os_spa, tx->tx_txg,
+ blkptr_copy, os->os_phys_buf, DMU_OS_IS_L2CACHEABLE(os),
+ &zp, dmu_objset_write_ready, NULL, NULL, dmu_objset_write_done,
+ os, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
+
+ /*
+ * Sync special dnodes - the parent IO for the sync is the root block
+ */
+ DMU_META_DNODE(os)->dn_zio = zio;
+ dnode_sync(DMU_META_DNODE(os), tx);
+
+ os->os_phys->os_flags = os->os_flags;
+
+ if (DMU_USERUSED_DNODE(os) &&
+ DMU_USERUSED_DNODE(os)->dn_type != DMU_OT_NONE) {
+ DMU_USERUSED_DNODE(os)->dn_zio = zio;
+ dnode_sync(DMU_USERUSED_DNODE(os), tx);
+ DMU_GROUPUSED_DNODE(os)->dn_zio = zio;
+ dnode_sync(DMU_GROUPUSED_DNODE(os), tx);
+ }
+
+ if (DMU_PROJECTUSED_DNODE(os) &&
+ DMU_PROJECTUSED_DNODE(os)->dn_type != DMU_OT_NONE) {
+ DMU_PROJECTUSED_DNODE(os)->dn_zio = zio;
+ dnode_sync(DMU_PROJECTUSED_DNODE(os), tx);
+ }
+
+ txgoff = tx->tx_txg & TXG_MASK;
+
+ /*
+ * We must create the list here because it uses the
+ * dn_dirty_link[] of this txg. But it may already
+ * exist because we call dsl_dataset_sync() twice per txg.
+ */
+ if (os->os_synced_dnodes == NULL) {
+ os->os_synced_dnodes =
+ multilist_create(sizeof (dnode_t),
+ offsetof(dnode_t, dn_dirty_link[txgoff]),
+ dnode_multilist_index_func);
+ } else {
+ ASSERT3U(os->os_synced_dnodes->ml_offset, ==,
+ offsetof(dnode_t, dn_dirty_link[txgoff]));
+ }
+
+ ml = os->os_dirty_dnodes[txgoff];
+ num_sublists = multilist_get_num_sublists(ml);
+ for (int i = 0; i < num_sublists; i++) {
+ if (multilist_sublist_is_empty_idx(ml, i))
+ continue;
+ sync_dnodes_arg_t *sda = kmem_alloc(sizeof (*sda), KM_SLEEP);
+ sda->sda_list = ml;
+ sda->sda_sublist_idx = i;
+ sda->sda_tx = tx;
+ (void) taskq_dispatch(dmu_objset_pool(os)->dp_sync_taskq,
+ sync_dnodes_task, sda, 0);
+ /* callback frees sda */
+ }
+ taskq_wait(dmu_objset_pool(os)->dp_sync_taskq);
+
+ list = &DMU_META_DNODE(os)->dn_dirty_records[txgoff];
+ while ((dr = list_head(list)) != NULL) {
+ ASSERT0(dr->dr_dbuf->db_level);
+ list_remove(list, dr);
+ zio_nowait(dr->dr_zio);
+ }
+
+ /* Enable dnode backfill if enough objects have been freed. */
+ if (os->os_freed_dnodes >= dmu_rescan_dnode_threshold) {
+ os->os_rescan_dnodes = B_TRUE;
+ os->os_freed_dnodes = 0;
+ }
+
+ /*
+ * Free intent log blocks up to this tx.
+ */
+ zil_sync(os->os_zil, tx);
+ os->os_phys->os_zil_header = os->os_zil_header;
+ zio_nowait(zio);
+}
+
+boolean_t
+dmu_objset_is_dirty(objset_t *os, uint64_t txg)
+{
+ return (!multilist_is_empty(os->os_dirty_dnodes[txg & TXG_MASK]));
+}
+
+static file_info_cb_t *file_cbs[DMU_OST_NUMTYPES];
+
+void
+dmu_objset_register_type(dmu_objset_type_t ost, file_info_cb_t *cb)
+{
+ file_cbs[ost] = cb;
+}
+
+int
+dmu_get_file_info(objset_t *os, dmu_object_type_t bonustype, const void *data,
+ zfs_file_info_t *zfi)
+{
+ file_info_cb_t *cb = file_cbs[os->os_phys->os_type];
+ if (cb == NULL)
+ return (EINVAL);
+ return (cb(bonustype, data, zfi));
+}
+
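+/*
+ * Example (illustrative only; my_get_file_info is a hypothetical
+ * callback, not an actual ZPL function): consumers register a
+ * file_info_cb_t per objset type so that dmu_get_file_info() can map
+ * bonus/SA data to a zfs_file_info_t:
+ *
+ *	dmu_objset_register_type(DMU_OST_ZFS, my_get_file_info);
+ */
+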
+boolean_t
+dmu_objset_userused_enabled(objset_t *os)
+{
+ return (spa_version(os->os_spa) >= SPA_VERSION_USERSPACE &&
+ file_cbs[os->os_phys->os_type] != NULL &&
+ DMU_USERUSED_DNODE(os) != NULL);
+}
+
+boolean_t
+dmu_objset_userobjused_enabled(objset_t *os)
+{
+ return (dmu_objset_userused_enabled(os) &&
+ spa_feature_is_enabled(os->os_spa, SPA_FEATURE_USEROBJ_ACCOUNTING));
+}
+
+boolean_t
+dmu_objset_projectquota_enabled(objset_t *os)
+{
+ return (file_cbs[os->os_phys->os_type] != NULL &&
+ DMU_PROJECTUSED_DNODE(os) != NULL &&
+ spa_feature_is_enabled(os->os_spa, SPA_FEATURE_PROJECT_QUOTA));
+}
+
+typedef struct userquota_node {
+	/* must be the first field, see userquota_update_cache() */
+ char uqn_id[20 + DMU_OBJACCT_PREFIX_LEN];
+ int64_t uqn_delta;
+ avl_node_t uqn_node;
+} userquota_node_t;
+
+typedef struct userquota_cache {
+ avl_tree_t uqc_user_deltas;
+ avl_tree_t uqc_group_deltas;
+ avl_tree_t uqc_project_deltas;
+} userquota_cache_t;
+
+static int
+userquota_compare(const void *l, const void *r)
+{
+ const userquota_node_t *luqn = l;
+ const userquota_node_t *ruqn = r;
+ int rv;
+
+ /*
+ * NB: can only access uqn_id because userquota_update_cache() doesn't
+ * pass in an entire userquota_node_t.
+ */
+ rv = strcmp(luqn->uqn_id, ruqn->uqn_id);
+
+ return (TREE_ISIGN(rv));
+}
+
+static void
+do_userquota_cacheflush(objset_t *os, userquota_cache_t *cache, dmu_tx_t *tx)
+{
+ void *cookie;
+ userquota_node_t *uqn;
+
+ ASSERT(dmu_tx_is_syncing(tx));
+
+ cookie = NULL;
+ while ((uqn = avl_destroy_nodes(&cache->uqc_user_deltas,
+ &cookie)) != NULL) {
+ /*
+ * os_userused_lock protects against concurrent calls to
+		 * zap_increment(). It's needed because zap_increment()
+ * is not thread-safe (i.e. not atomic).
+ */
+ mutex_enter(&os->os_userused_lock);
+ VERIFY0(zap_increment(os, DMU_USERUSED_OBJECT,
+ uqn->uqn_id, uqn->uqn_delta, tx));
+ mutex_exit(&os->os_userused_lock);
+ kmem_free(uqn, sizeof (*uqn));
+ }
+ avl_destroy(&cache->uqc_user_deltas);
+
+ cookie = NULL;
+ while ((uqn = avl_destroy_nodes(&cache->uqc_group_deltas,
+ &cookie)) != NULL) {
+ mutex_enter(&os->os_userused_lock);
+ VERIFY0(zap_increment(os, DMU_GROUPUSED_OBJECT,
+ uqn->uqn_id, uqn->uqn_delta, tx));
+ mutex_exit(&os->os_userused_lock);
+ kmem_free(uqn, sizeof (*uqn));
+ }
+ avl_destroy(&cache->uqc_group_deltas);
+
+ if (dmu_objset_projectquota_enabled(os)) {
+ cookie = NULL;
+ while ((uqn = avl_destroy_nodes(&cache->uqc_project_deltas,
+ &cookie)) != NULL) {
+ mutex_enter(&os->os_userused_lock);
+ VERIFY0(zap_increment(os, DMU_PROJECTUSED_OBJECT,
+ uqn->uqn_id, uqn->uqn_delta, tx));
+ mutex_exit(&os->os_userused_lock);
+ kmem_free(uqn, sizeof (*uqn));
+ }
+ avl_destroy(&cache->uqc_project_deltas);
+ }
+}
+
+static void
+userquota_update_cache(avl_tree_t *avl, const char *id, int64_t delta)
+{
+ userquota_node_t *uqn;
+ avl_index_t idx;
+
+ ASSERT(strlen(id) < sizeof (uqn->uqn_id));
+ /*
+ * Use id directly for searching because uqn_id is the first field of
+ * userquota_node_t and fields after uqn_id won't be accessed in
+ * avl_find().
+ */
+ uqn = avl_find(avl, (const void *)id, &idx);
+ if (uqn == NULL) {
+ uqn = kmem_zalloc(sizeof (*uqn), KM_SLEEP);
+ strlcpy(uqn->uqn_id, id, sizeof (uqn->uqn_id));
+ avl_insert(avl, uqn, idx);
+ }
+ uqn->uqn_delta += delta;
+}
+
+static void
+do_userquota_update(objset_t *os, userquota_cache_t *cache, uint64_t used,
+ uint64_t flags, uint64_t user, uint64_t group, uint64_t project,
+ boolean_t subtract)
+{
+ if (flags & DNODE_FLAG_USERUSED_ACCOUNTED) {
+ int64_t delta = DNODE_MIN_SIZE + used;
+ char name[20];
+
+ if (subtract)
+ delta = -delta;
+
+ (void) snprintf(name, sizeof (name), "%llx", (longlong_t)user);
+ userquota_update_cache(&cache->uqc_user_deltas, name, delta);
+
+ (void) snprintf(name, sizeof (name), "%llx", (longlong_t)group);
+ userquota_update_cache(&cache->uqc_group_deltas, name, delta);
+
+ if (dmu_objset_projectquota_enabled(os)) {
+ (void) snprintf(name, sizeof (name), "%llx",
+ (longlong_t)project);
+ userquota_update_cache(&cache->uqc_project_deltas,
+ name, delta);
+ }
+ }
+}
+
+static void
+do_userobjquota_update(objset_t *os, userquota_cache_t *cache, uint64_t flags,
+ uint64_t user, uint64_t group, uint64_t project, boolean_t subtract)
+{
+ if (flags & DNODE_FLAG_USEROBJUSED_ACCOUNTED) {
+ char name[20 + DMU_OBJACCT_PREFIX_LEN];
+ int delta = subtract ? -1 : 1;
+
+ (void) snprintf(name, sizeof (name), DMU_OBJACCT_PREFIX "%llx",
+ (longlong_t)user);
+ userquota_update_cache(&cache->uqc_user_deltas, name, delta);
+
+ (void) snprintf(name, sizeof (name), DMU_OBJACCT_PREFIX "%llx",
+ (longlong_t)group);
+ userquota_update_cache(&cache->uqc_group_deltas, name, delta);
+
+ if (dmu_objset_projectquota_enabled(os)) {
+ (void) snprintf(name, sizeof (name),
+ DMU_OBJACCT_PREFIX "%llx", (longlong_t)project);
+ userquota_update_cache(&cache->uqc_project_deltas,
+ name, delta);
+ }
+ }
+}
+
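+/*
+ * Illustrative note on the cache keys built above: ids are formatted
+ * with "%llx", so (for example) uid 1000 is accumulated under the ZAP
+ * name "3e8", while object counts use the same hex id prefixed with
+ * DMU_OBJACCT_PREFIX. The deltas are flushed to the USERUSED/
+ * GROUPUSED/PROJECTUSED objects in do_userquota_cacheflush().
+ */
+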
+typedef struct userquota_updates_arg {
+ objset_t *uua_os;
+ int uua_sublist_idx;
+ dmu_tx_t *uua_tx;
+} userquota_updates_arg_t;
+
+static void
+userquota_updates_task(void *arg)
+{
+ userquota_updates_arg_t *uua = arg;
+ objset_t *os = uua->uua_os;
+ dmu_tx_t *tx = uua->uua_tx;
+ dnode_t *dn;
+ userquota_cache_t cache = { { 0 } };
+
+ multilist_sublist_t *list =
+ multilist_sublist_lock(os->os_synced_dnodes, uua->uua_sublist_idx);
+
+ ASSERT(multilist_sublist_head(list) == NULL ||
+ dmu_objset_userused_enabled(os));
+ avl_create(&cache.uqc_user_deltas, userquota_compare,
+ sizeof (userquota_node_t), offsetof(userquota_node_t, uqn_node));
+ avl_create(&cache.uqc_group_deltas, userquota_compare,
+ sizeof (userquota_node_t), offsetof(userquota_node_t, uqn_node));
+ if (dmu_objset_projectquota_enabled(os))
+ avl_create(&cache.uqc_project_deltas, userquota_compare,
+ sizeof (userquota_node_t), offsetof(userquota_node_t,
+ uqn_node));
+
+ while ((dn = multilist_sublist_head(list)) != NULL) {
+ int flags;
+ ASSERT(!DMU_OBJECT_IS_SPECIAL(dn->dn_object));
+ ASSERT(dn->dn_phys->dn_type == DMU_OT_NONE ||
+ dn->dn_phys->dn_flags &
+ DNODE_FLAG_USERUSED_ACCOUNTED);
+
+ flags = dn->dn_id_flags;
+ ASSERT(flags);
+ if (flags & DN_ID_OLD_EXIST) {
+ do_userquota_update(os, &cache, dn->dn_oldused,
+ dn->dn_oldflags, dn->dn_olduid, dn->dn_oldgid,
+ dn->dn_oldprojid, B_TRUE);
+ do_userobjquota_update(os, &cache, dn->dn_oldflags,
+ dn->dn_olduid, dn->dn_oldgid,
+ dn->dn_oldprojid, B_TRUE);
+ }
+ if (flags & DN_ID_NEW_EXIST) {
+ do_userquota_update(os, &cache,
+ DN_USED_BYTES(dn->dn_phys), dn->dn_phys->dn_flags,
+ dn->dn_newuid, dn->dn_newgid,
+ dn->dn_newprojid, B_FALSE);
+ do_userobjquota_update(os, &cache,
+ dn->dn_phys->dn_flags, dn->dn_newuid, dn->dn_newgid,
+ dn->dn_newprojid, B_FALSE);
+ }
+
+ mutex_enter(&dn->dn_mtx);
+ dn->dn_oldused = 0;
+ dn->dn_oldflags = 0;
+ if (dn->dn_id_flags & DN_ID_NEW_EXIST) {
+ dn->dn_olduid = dn->dn_newuid;
+ dn->dn_oldgid = dn->dn_newgid;
+ dn->dn_oldprojid = dn->dn_newprojid;
+ dn->dn_id_flags |= DN_ID_OLD_EXIST;
+ if (dn->dn_bonuslen == 0)
+ dn->dn_id_flags |= DN_ID_CHKED_SPILL;
+ else
+ dn->dn_id_flags |= DN_ID_CHKED_BONUS;
+ }
+ dn->dn_id_flags &= ~(DN_ID_NEW_EXIST);
+ mutex_exit(&dn->dn_mtx);
+
+ multilist_sublist_remove(list, dn);
+ dnode_rele(dn, os->os_synced_dnodes);
+ }
+ do_userquota_cacheflush(os, &cache, tx);
+ multilist_sublist_unlock(list);
+ kmem_free(uua, sizeof (*uua));
+}
+
+/*
+ * Release dnode holds from dmu_objset_sync_dnodes(). When the dnode is being
+ * synced (i.e. we have issued the zio's for blocks in the dnode), it can't be
+ * evicted because the block containing the dnode can't be evicted until it is
+ * written out. However, this hold is necessary to prevent the dnode_t from
+ * being moved (via dnode_move()) while it's still referenced by
+ * dbuf_dirty_record_t:dr_dnode. And dr_dnode is needed for
+ * dirty_lightweight_leaf-type dirty records.
+ *
+ * If we are doing user-object accounting, the dnode_rele() happens from
+ * userquota_updates_task() instead.
+ */
+static void
+dnode_rele_task(void *arg)
+{
+ userquota_updates_arg_t *uua = arg;
+ objset_t *os = uua->uua_os;
+
+ multilist_sublist_t *list =
+ multilist_sublist_lock(os->os_synced_dnodes, uua->uua_sublist_idx);
+
+ dnode_t *dn;
+ while ((dn = multilist_sublist_head(list)) != NULL) {
+ multilist_sublist_remove(list, dn);
+ dnode_rele(dn, os->os_synced_dnodes);
+ }
+ multilist_sublist_unlock(list);
+ kmem_free(uua, sizeof (*uua));
+}
+
+/*
+ * Return TRUE if userquota updates are needed.
+ */
+static boolean_t
+dmu_objset_do_userquota_updates_prep(objset_t *os, dmu_tx_t *tx)
+{
+ if (!dmu_objset_userused_enabled(os))
+ return (B_FALSE);
+
+ /*
+ * If this is a raw receive just return and handle accounting
+ * later when we have the keys loaded. We also don't do user
+ * accounting during claiming since the datasets are not owned
+ * for the duration of claiming and this txg should only be
+ * used for recovery.
+ */
+ if (os->os_encrypted && dmu_objset_is_receiving(os))
+ return (B_FALSE);
+
+ if (tx->tx_txg <= os->os_spa->spa_claim_max_txg)
+ return (B_FALSE);
+
+ /* Allocate the user/group/project used objects if necessary. */
+ if (DMU_USERUSED_DNODE(os)->dn_type == DMU_OT_NONE) {
+ VERIFY0(zap_create_claim(os,
+ DMU_USERUSED_OBJECT,
+ DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx));
+ VERIFY0(zap_create_claim(os,
+ DMU_GROUPUSED_OBJECT,
+ DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx));
+ }
+
+ if (dmu_objset_projectquota_enabled(os) &&
+ DMU_PROJECTUSED_DNODE(os)->dn_type == DMU_OT_NONE) {
+ VERIFY0(zap_create_claim(os, DMU_PROJECTUSED_OBJECT,
+ DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx));
+ }
+ return (B_TRUE);
+}
+
+/*
+ * Dispatch taskq tasks to dp_sync_taskq to update the user accounting, and
+ * also release the holds on the dnodes from dmu_objset_sync_dnodes().
+ * The caller must taskq_wait(dp_sync_taskq).
+ */
+void
+dmu_objset_sync_done(objset_t *os, dmu_tx_t *tx)
+{
+ boolean_t need_userquota = dmu_objset_do_userquota_updates_prep(os, tx);
+
+ int num_sublists = multilist_get_num_sublists(os->os_synced_dnodes);
+ for (int i = 0; i < num_sublists; i++) {
+ userquota_updates_arg_t *uua =
+ kmem_alloc(sizeof (*uua), KM_SLEEP);
+ uua->uua_os = os;
+ uua->uua_sublist_idx = i;
+ uua->uua_tx = tx;
+
+ /*
+ * If we don't need to update userquotas, use
+ * dnode_rele_task() to call dnode_rele()
+ */
+ (void) taskq_dispatch(dmu_objset_pool(os)->dp_sync_taskq,
+ need_userquota ? userquota_updates_task : dnode_rele_task,
+ uua, 0);
+ /* callback frees uua */
+ }
+}
+
+
+/*
+ * Returns a pointer to the data from which to determine the uid/gid.
+ *
+ * If a dirty record for the transaction group that is syncing can't
+ * be found, then NULL is returned. In the NULL case it is assumed
+ * the uid/gid aren't changing.
+ */
+static void *
+dmu_objset_userquota_find_data(dmu_buf_impl_t *db, dmu_tx_t *tx)
+{
+ dbuf_dirty_record_t *dr;
+ void *data;
+
+ if (db->db_dirtycnt == 0)
+ return (db->db.db_data); /* Nothing is changing */
+
+ dr = dbuf_find_dirty_eq(db, tx->tx_txg);
+
+ if (dr == NULL) {
+ data = NULL;
+ } else {
+ if (dr->dr_dnode->dn_bonuslen == 0 &&
+ dr->dr_dbuf->db_blkid == DMU_SPILL_BLKID)
+ data = dr->dt.dl.dr_data->b_data;
+ else
+ data = dr->dt.dl.dr_data;
+ }
+
+ return (data);
+}
+
+void
+dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before, dmu_tx_t *tx)
+{
+ objset_t *os = dn->dn_objset;
+ void *data = NULL;
+ dmu_buf_impl_t *db = NULL;
+ int flags = dn->dn_id_flags;
+ int error;
+ boolean_t have_spill = B_FALSE;
+
+ if (!dmu_objset_userused_enabled(dn->dn_objset))
+ return;
+
+ /*
+ * Raw receives introduce a problem with user accounting. Raw
+ * receives cannot update the user accounting info because the
+ * user ids and the sizes are encrypted. To guarantee that we
+ * never end up with bad user accounting, we simply disable it
+ * during raw receives. We also disable this for normal receives
+ * so that an incremental raw receive may be done on top of an
+ * existing non-raw receive.
+ */
+ if (os->os_encrypted && dmu_objset_is_receiving(os))
+ return;
+
+ if (before && (flags & (DN_ID_CHKED_BONUS|DN_ID_OLD_EXIST|
+ DN_ID_CHKED_SPILL)))
+ return;
+
+ if (before && dn->dn_bonuslen != 0)
+ data = DN_BONUS(dn->dn_phys);
+ else if (!before && dn->dn_bonuslen != 0) {
+ if (dn->dn_bonus) {
+ db = dn->dn_bonus;
+ mutex_enter(&db->db_mtx);
+ data = dmu_objset_userquota_find_data(db, tx);
+ } else {
+ data = DN_BONUS(dn->dn_phys);
+ }
+ } else if (dn->dn_bonuslen == 0 && dn->dn_bonustype == DMU_OT_SA) {
+ int rf = 0;
+
+ if (RW_WRITE_HELD(&dn->dn_struct_rwlock))
+ rf |= DB_RF_HAVESTRUCT;
+ error = dmu_spill_hold_by_dnode(dn,
+ rf | DB_RF_MUST_SUCCEED,
+ FTAG, (dmu_buf_t **)&db);
+ ASSERT(error == 0);
+ mutex_enter(&db->db_mtx);
+ data = (before) ? db->db.db_data :
+ dmu_objset_userquota_find_data(db, tx);
+ have_spill = B_TRUE;
+ } else {
+ mutex_enter(&dn->dn_mtx);
+ dn->dn_id_flags |= DN_ID_CHKED_BONUS;
+ mutex_exit(&dn->dn_mtx);
+ return;
+ }
+
+	/*
+	 * We must always call the callback in case the object type has
+	 * changed and the new type isn't an object type we track.
+	 */
+ zfs_file_info_t zfi;
+ error = file_cbs[os->os_phys->os_type](dn->dn_bonustype, data, &zfi);
+
+ if (before) {
+ ASSERT(data);
+ dn->dn_olduid = zfi.zfi_user;
+ dn->dn_oldgid = zfi.zfi_group;
+ dn->dn_oldprojid = zfi.zfi_project;
+ } else if (data) {
+ dn->dn_newuid = zfi.zfi_user;
+ dn->dn_newgid = zfi.zfi_group;
+ dn->dn_newprojid = zfi.zfi_project;
+ }
+
+ /*
+ * Preserve existing uid/gid when the callback can't determine
+ * what the new uid/gid are and the callback returned EEXIST.
+ * The EEXIST error tells us to just use the existing uid/gid.
+ * If we don't know what the old values are then just assign
+ * them to 0, since that is a new file being created.
+ */
+ if (!before && data == NULL && error == EEXIST) {
+ if (flags & DN_ID_OLD_EXIST) {
+ dn->dn_newuid = dn->dn_olduid;
+ dn->dn_newgid = dn->dn_oldgid;
+ dn->dn_newprojid = dn->dn_oldprojid;
+ } else {
+ dn->dn_newuid = 0;
+ dn->dn_newgid = 0;
+ dn->dn_newprojid = ZFS_DEFAULT_PROJID;
+ }
+ error = 0;
+ }
+
+ if (db)
+ mutex_exit(&db->db_mtx);
+
+ mutex_enter(&dn->dn_mtx);
+ if (error == 0 && before)
+ dn->dn_id_flags |= DN_ID_OLD_EXIST;
+ if (error == 0 && !before)
+ dn->dn_id_flags |= DN_ID_NEW_EXIST;
+
+ if (have_spill) {
+ dn->dn_id_flags |= DN_ID_CHKED_SPILL;
+ } else {
+ dn->dn_id_flags |= DN_ID_CHKED_BONUS;
+ }
+ mutex_exit(&dn->dn_mtx);
+ if (have_spill)
+ dmu_buf_rele((dmu_buf_t *)db, FTAG);
+}
+
+boolean_t
+dmu_objset_userspace_present(objset_t *os)
+{
+ return (os->os_phys->os_flags &
+ OBJSET_FLAG_USERACCOUNTING_COMPLETE);
+}
+
+boolean_t
+dmu_objset_userobjspace_present(objset_t *os)
+{
+ return (os->os_phys->os_flags &
+ OBJSET_FLAG_USEROBJACCOUNTING_COMPLETE);
+}
+
+boolean_t
+dmu_objset_projectquota_present(objset_t *os)
+{
+ return (os->os_phys->os_flags &
+ OBJSET_FLAG_PROJECTQUOTA_COMPLETE);
+}
+
+static int
+dmu_objset_space_upgrade(objset_t *os)
+{
+ uint64_t obj;
+ int err = 0;
+
+	/*
+	 * We simply need to mark every object dirty so that it will be
+	 * synced out and accounted for.  If this is called concurrently,
+	 * or if we already did some work before crashing, that's fine,
+	 * since we track each object's accounted state independently.
+	 */
+
+ for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE, 0)) {
+ dmu_tx_t *tx;
+ dmu_buf_t *db;
+ int objerr;
+
+ mutex_enter(&os->os_upgrade_lock);
+ if (os->os_upgrade_exit)
+ err = SET_ERROR(EINTR);
+ mutex_exit(&os->os_upgrade_lock);
+ if (err != 0)
+ return (err);
+
+ if (issig(JUSTLOOKING) && issig(FORREAL))
+ return (SET_ERROR(EINTR));
+
+ objerr = dmu_bonus_hold(os, obj, FTAG, &db);
+ if (objerr != 0)
+ continue;
+ tx = dmu_tx_create(os);
+ dmu_tx_hold_bonus(tx, obj);
+ objerr = dmu_tx_assign(tx, TXG_WAIT);
+ if (objerr != 0) {
+ dmu_buf_rele(db, FTAG);
+ dmu_tx_abort(tx);
+ continue;
+ }
+ dmu_buf_will_dirty(db, tx);
+ dmu_buf_rele(db, FTAG);
+ dmu_tx_commit(tx);
+ }
+ return (0);
+}
+
+static int
+dmu_objset_userspace_upgrade_cb(objset_t *os)
+{
+ int err = 0;
+
+ if (dmu_objset_userspace_present(os))
+ return (0);
+ if (dmu_objset_is_snapshot(os))
+ return (SET_ERROR(EINVAL));
+ if (!dmu_objset_userused_enabled(os))
+ return (SET_ERROR(ENOTSUP));
+
+ err = dmu_objset_space_upgrade(os);
+ if (err)
+ return (err);
+
+ os->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE;
+ txg_wait_synced(dmu_objset_pool(os), 0);
+ return (0);
+}
+
+void
+dmu_objset_userspace_upgrade(objset_t *os)
+{
+ dmu_objset_upgrade(os, dmu_objset_userspace_upgrade_cb);
+}
+
+static int
+dmu_objset_id_quota_upgrade_cb(objset_t *os)
+{
+ int err = 0;
+
+ if (dmu_objset_userobjspace_present(os) &&
+ dmu_objset_projectquota_present(os))
+ return (0);
+ if (dmu_objset_is_snapshot(os))
+ return (SET_ERROR(EINVAL));
+ if (!dmu_objset_userused_enabled(os))
+ return (SET_ERROR(ENOTSUP));
+ if (!dmu_objset_projectquota_enabled(os) &&
+ dmu_objset_userobjspace_present(os))
+ return (SET_ERROR(ENOTSUP));
+
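+	/*
+	 * Flag the relevant features for activation; the activation
+	 * itself is performed when the dataset is next synced.
+	 */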
+ if (dmu_objset_userobjused_enabled(os))
+ dmu_objset_ds(os)->ds_feature_activation[
+ SPA_FEATURE_USEROBJ_ACCOUNTING] = (void *)B_TRUE;
+ if (dmu_objset_projectquota_enabled(os))
+ dmu_objset_ds(os)->ds_feature_activation[
+ SPA_FEATURE_PROJECT_QUOTA] = (void *)B_TRUE;
+
+ err = dmu_objset_space_upgrade(os);
+ if (err)
+ return (err);
+
+ os->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE;
+ if (dmu_objset_userobjused_enabled(os))
+ os->os_flags |= OBJSET_FLAG_USEROBJACCOUNTING_COMPLETE;
+ if (dmu_objset_projectquota_enabled(os))
+ os->os_flags |= OBJSET_FLAG_PROJECTQUOTA_COMPLETE;
+
+ txg_wait_synced(dmu_objset_pool(os), 0);
+ return (0);
+}
+
+void
+dmu_objset_id_quota_upgrade(objset_t *os)
+{
+ dmu_objset_upgrade(os, dmu_objset_id_quota_upgrade_cb);
+}
+
+boolean_t
+dmu_objset_userobjspace_upgradable(objset_t *os)
+{
+ return (dmu_objset_type(os) == DMU_OST_ZFS &&
+ !dmu_objset_is_snapshot(os) &&
+ dmu_objset_userobjused_enabled(os) &&
+ !dmu_objset_userobjspace_present(os) &&
+ spa_writeable(dmu_objset_spa(os)));
+}
+
+boolean_t
+dmu_objset_projectquota_upgradable(objset_t *os)
+{
+ return (dmu_objset_type(os) == DMU_OST_ZFS &&
+ !dmu_objset_is_snapshot(os) &&
+ dmu_objset_projectquota_enabled(os) &&
+ !dmu_objset_projectquota_present(os) &&
+ spa_writeable(dmu_objset_spa(os)));
+}
+
+void
+dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp,
+ uint64_t *usedobjsp, uint64_t *availobjsp)
+{
+ dsl_dataset_space(os->os_dsl_dataset, refdbytesp, availbytesp,
+ usedobjsp, availobjsp);
+}
+
+uint64_t
+dmu_objset_fsid_guid(objset_t *os)
+{
+ return (dsl_dataset_fsid_guid(os->os_dsl_dataset));
+}
+
+void
+dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat)
+{
+ stat->dds_type = os->os_phys->os_type;
+ if (os->os_dsl_dataset)
+ dsl_dataset_fast_stat(os->os_dsl_dataset, stat);
+}
+
+void
+dmu_objset_stats(objset_t *os, nvlist_t *nv)
+{
+ ASSERT(os->os_dsl_dataset ||
+ os->os_phys->os_type == DMU_OST_META);
+
+ if (os->os_dsl_dataset != NULL)
+ dsl_dataset_stats(os->os_dsl_dataset, nv);
+
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_TYPE,
+ os->os_phys->os_type);
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERACCOUNTING,
+ dmu_objset_userspace_present(os));
+}
+
+int
+dmu_objset_is_snapshot(objset_t *os)
+{
+ if (os->os_dsl_dataset != NULL)
+ return (os->os_dsl_dataset->ds_is_snapshot);
+ else
+ return (B_FALSE);
+}
+
+int
+dmu_snapshot_realname(objset_t *os, const char *name, char *real, int maxlen,
+ boolean_t *conflict)
+{
+ dsl_dataset_t *ds = os->os_dsl_dataset;
+ uint64_t ignored;
+
+ if (dsl_dataset_phys(ds)->ds_snapnames_zapobj == 0)
+ return (SET_ERROR(ENOENT));
+
+ return (zap_lookup_norm(ds->ds_dir->dd_pool->dp_meta_objset,
+ dsl_dataset_phys(ds)->ds_snapnames_zapobj, name, 8, 1, &ignored,
+ MT_NORMALIZE, real, maxlen, conflict));
+}
+
+int
+dmu_snapshot_list_next(objset_t *os, int namelen, char *name,
+ uint64_t *idp, uint64_t *offp, boolean_t *case_conflict)
+{
+ dsl_dataset_t *ds = os->os_dsl_dataset;
+ zap_cursor_t cursor;
+ zap_attribute_t attr;
+
+ ASSERT(dsl_pool_config_held(dmu_objset_pool(os)));
+
+ if (dsl_dataset_phys(ds)->ds_snapnames_zapobj == 0)
+ return (SET_ERROR(ENOENT));
+
+ zap_cursor_init_serialized(&cursor,
+ ds->ds_dir->dd_pool->dp_meta_objset,
+ dsl_dataset_phys(ds)->ds_snapnames_zapobj, *offp);
+
+ if (zap_cursor_retrieve(&cursor, &attr) != 0) {
+ zap_cursor_fini(&cursor);
+ return (SET_ERROR(ENOENT));
+ }
+
+ if (strlen(attr.za_name) + 1 > namelen) {
+ zap_cursor_fini(&cursor);
+ return (SET_ERROR(ENAMETOOLONG));
+ }
+
+ (void) strlcpy(name, attr.za_name, namelen);
+ if (idp)
+ *idp = attr.za_first_integer;
+ if (case_conflict)
+ *case_conflict = attr.za_normalization_conflict;
+ zap_cursor_advance(&cursor);
+ *offp = zap_cursor_serialize(&cursor);
+ zap_cursor_fini(&cursor);
+
+ return (0);
+}
+
+int
+dmu_snapshot_lookup(objset_t *os, const char *name, uint64_t *value)
+{
+ return (dsl_dataset_snap_lookup(os->os_dsl_dataset, name, value));
+}
+
+int
+dmu_dir_list_next(objset_t *os, int namelen, char *name,
+ uint64_t *idp, uint64_t *offp)
+{
+ dsl_dir_t *dd = os->os_dsl_dataset->ds_dir;
+ zap_cursor_t cursor;
+ zap_attribute_t attr;
+
+ /* there is no next dir on a snapshot! */
+ if (os->os_dsl_dataset->ds_object !=
+ dsl_dir_phys(dd)->dd_head_dataset_obj)
+ return (SET_ERROR(ENOENT));
+
+ zap_cursor_init_serialized(&cursor,
+ dd->dd_pool->dp_meta_objset,
+ dsl_dir_phys(dd)->dd_child_dir_zapobj, *offp);
+
+ if (zap_cursor_retrieve(&cursor, &attr) != 0) {
+ zap_cursor_fini(&cursor);
+ return (SET_ERROR(ENOENT));
+ }
+
+ if (strlen(attr.za_name) + 1 > namelen) {
+ zap_cursor_fini(&cursor);
+ return (SET_ERROR(ENAMETOOLONG));
+ }
+
+ (void) strlcpy(name, attr.za_name, namelen);
+ if (idp)
+ *idp = attr.za_first_integer;
+ zap_cursor_advance(&cursor);
+ *offp = zap_cursor_serialize(&cursor);
+ zap_cursor_fini(&cursor);
+
+ return (0);
+}
+
+typedef struct dmu_objset_find_ctx {
+ taskq_t *dc_tq;
+ dsl_pool_t *dc_dp;
+ uint64_t dc_ddobj;
+ char *dc_ddname; /* last component of ddobj's name */
+ int (*dc_func)(dsl_pool_t *, dsl_dataset_t *, void *);
+ void *dc_arg;
+ int dc_flags;
+ kmutex_t *dc_error_lock;
+ int *dc_error;
+} dmu_objset_find_ctx_t;
+
+static void
+dmu_objset_find_dp_impl(dmu_objset_find_ctx_t *dcp)
+{
+ dsl_pool_t *dp = dcp->dc_dp;
+ dsl_dir_t *dd;
+ dsl_dataset_t *ds;
+ zap_cursor_t zc;
+ zap_attribute_t *attr;
+ uint64_t thisobj;
+ int err = 0;
+
+ /* don't process if there already was an error */
+ if (*dcp->dc_error != 0)
+ goto out;
+
+ /*
+ * Note: passing the name (dc_ddname) here is optional, but it
+ * improves performance because we don't need to call
+ * zap_value_search() to determine the name.
+ */
+ err = dsl_dir_hold_obj(dp, dcp->dc_ddobj, dcp->dc_ddname, FTAG, &dd);
+ if (err != 0)
+ goto out;
+
+ /* Don't visit hidden ($MOS & $ORIGIN) objsets. */
+ if (dd->dd_myname[0] == '$') {
+ dsl_dir_rele(dd, FTAG);
+ goto out;
+ }
+
+ thisobj = dsl_dir_phys(dd)->dd_head_dataset_obj;
+ attr = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
+
+ /*
+ * Iterate over all children.
+ */
+ if (dcp->dc_flags & DS_FIND_CHILDREN) {
+ for (zap_cursor_init(&zc, dp->dp_meta_objset,
+ dsl_dir_phys(dd)->dd_child_dir_zapobj);
+ zap_cursor_retrieve(&zc, attr) == 0;
+ (void) zap_cursor_advance(&zc)) {
+ ASSERT3U(attr->za_integer_length, ==,
+ sizeof (uint64_t));
+ ASSERT3U(attr->za_num_integers, ==, 1);
+
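+			/*
+			 * Each child dir gets its own context, dispatched to
+			 * the shared taskq when one exists, otherwise
+			 * processed recursively in this thread.
+			 */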
+ dmu_objset_find_ctx_t *child_dcp =
+ kmem_alloc(sizeof (*child_dcp), KM_SLEEP);
+ *child_dcp = *dcp;
+ child_dcp->dc_ddobj = attr->za_first_integer;
+ child_dcp->dc_ddname = spa_strdup(attr->za_name);
+ if (dcp->dc_tq != NULL)
+ (void) taskq_dispatch(dcp->dc_tq,
+ dmu_objset_find_dp_cb, child_dcp, TQ_SLEEP);
+ else
+ dmu_objset_find_dp_impl(child_dcp);
+ }
+ zap_cursor_fini(&zc);
+ }
+
+ /*
+ * Iterate over all snapshots.
+ */
+ if (dcp->dc_flags & DS_FIND_SNAPSHOTS) {
+ dsl_dataset_t *ds;
+ err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds);
+
+ if (err == 0) {
+ uint64_t snapobj;
+
+ snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj;
+ dsl_dataset_rele(ds, FTAG);
+
+ for (zap_cursor_init(&zc, dp->dp_meta_objset, snapobj);
+ zap_cursor_retrieve(&zc, attr) == 0;
+ (void) zap_cursor_advance(&zc)) {
+ ASSERT3U(attr->za_integer_length, ==,
+ sizeof (uint64_t));
+ ASSERT3U(attr->za_num_integers, ==, 1);
+
+ err = dsl_dataset_hold_obj(dp,
+ attr->za_first_integer, FTAG, &ds);
+ if (err != 0)
+ break;
+ err = dcp->dc_func(dp, ds, dcp->dc_arg);
+ dsl_dataset_rele(ds, FTAG);
+ if (err != 0)
+ break;
+ }
+ zap_cursor_fini(&zc);
+ }
+ }
+
+ kmem_free(attr, sizeof (zap_attribute_t));
+
+ if (err != 0) {
+ dsl_dir_rele(dd, FTAG);
+ goto out;
+ }
+
+ /*
+ * Apply to self.
+ */
+ err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds);
+
+ /*
+ * Note: we hold the dir while calling dsl_dataset_hold_obj() so
+ * that the dir will remain cached, and we won't have to re-instantiate
+ * it (which could be expensive due to finding its name via
+ * zap_value_search()).
+ */
+ dsl_dir_rele(dd, FTAG);
+ if (err != 0)
+ goto out;
+ err = dcp->dc_func(dp, ds, dcp->dc_arg);
+ dsl_dataset_rele(ds, FTAG);
+
+out:
+ if (err != 0) {
+ mutex_enter(dcp->dc_error_lock);
+ /* only keep first error */
+ if (*dcp->dc_error == 0)
+ *dcp->dc_error = err;
+ mutex_exit(dcp->dc_error_lock);
+ }
+
+ if (dcp->dc_ddname != NULL)
+ spa_strfree(dcp->dc_ddname);
+ kmem_free(dcp, sizeof (*dcp));
+}
+
+static void
+dmu_objset_find_dp_cb(void *arg)
+{
+ dmu_objset_find_ctx_t *dcp = arg;
+ dsl_pool_t *dp = dcp->dc_dp;
+
+ /*
+ * We need to get a pool_config_lock here, as there are several
+ * assert(pool_config_held) down the stack. Getting a lock via
+ * dsl_pool_config_enter is risky, as it might be stalled by a
+ * pending writer. This would deadlock, as the write lock can
+ * only be granted when our parent thread gives up the lock.
+ * The _prio interface gives us priority over a pending writer.
+ */
+ dsl_pool_config_enter_prio(dp, FTAG);
+
+ dmu_objset_find_dp_impl(dcp);
+
+ dsl_pool_config_exit(dp, FTAG);
+}
+
+/*
+ * Find objsets under and including ddobj, call func(ds) on each.
+ * The order for the enumeration is completely undefined.
+ * func is called with dsl_pool_config held.
+ */
+int
+dmu_objset_find_dp(dsl_pool_t *dp, uint64_t ddobj,
+ int func(dsl_pool_t *, dsl_dataset_t *, void *), void *arg, int flags)
+{
+ int error = 0;
+ taskq_t *tq = NULL;
+ int ntasks;
+ dmu_objset_find_ctx_t *dcp;
+ kmutex_t err_lock;
+
+ mutex_init(&err_lock, NULL, MUTEX_DEFAULT, NULL);
+ dcp = kmem_alloc(sizeof (*dcp), KM_SLEEP);
+ dcp->dc_tq = NULL;
+ dcp->dc_dp = dp;
+ dcp->dc_ddobj = ddobj;
+ dcp->dc_ddname = NULL;
+ dcp->dc_func = func;
+ dcp->dc_arg = arg;
+ dcp->dc_flags = flags;
+ dcp->dc_error_lock = &err_lock;
+ dcp->dc_error = &error;
+
+ if ((flags & DS_FIND_SERIALIZE) || dsl_pool_config_held_writer(dp)) {
+ /*
+ * In case a write lock is held we can't make use of
+ * parallelism, as down the stack of the worker threads
+ * the lock is asserted via dsl_pool_config_held.
+ * In case of a read lock this is solved by getting a read
+ * lock in each worker thread, which isn't possible in case
+ * of a writer lock. So we fall back to the synchronous path
+ * here.
+		 * In the future, dsl_pool_config_held could be made to return
+		 * true for the worker threads so that a single lock held by
+		 * this thread suffices.  For now, stay single-threaded.
+ */
+ dmu_objset_find_dp_impl(dcp);
+ mutex_destroy(&err_lock);
+
+ return (error);
+ }
+
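+	/* Default to four worker tasks per leaf vdev. */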
+ ntasks = dmu_find_threads;
+ if (ntasks == 0)
+ ntasks = vdev_count_leaves(dp->dp_spa) * 4;
+ tq = taskq_create("dmu_objset_find", ntasks, maxclsyspri, ntasks,
+ INT_MAX, 0);
+ if (tq == NULL) {
+ kmem_free(dcp, sizeof (*dcp));
+ mutex_destroy(&err_lock);
+
+ return (SET_ERROR(ENOMEM));
+ }
+ dcp->dc_tq = tq;
+
+ /* dcp will be freed by task */
+ (void) taskq_dispatch(tq, dmu_objset_find_dp_cb, dcp, TQ_SLEEP);
+
+ /*
+ * PORTING: this code relies on the property of taskq_wait to wait
+ * until no more tasks are queued and no more tasks are active. As
+ * we always queue new tasks from within other tasks, task_wait
+ * reliably waits for the full recursion to finish, even though we
+ * enqueue new tasks after taskq_wait has been called.
+ * On platforms other than illumos, taskq_wait may not have this
+ * property.
+ */
+ taskq_wait(tq);
+ taskq_destroy(tq);
+ mutex_destroy(&err_lock);
+
+ return (error);
+}
+
+/*
+ * Find all objsets under name, and for each, call 'func(child_name, arg)'.
+ * The dp_config_rwlock must not be held when this is called, and it
+ * will not be held when the callback is called.
+ * Therefore this function should only be used when the pool is not changing
+ * (e.g. in syncing context), or the callback can deal with the possible races.
+ */
+static int
+dmu_objset_find_impl(spa_t *spa, const char *name,
+ int func(const char *, void *), void *arg, int flags)
+{
+ dsl_dir_t *dd;
+ dsl_pool_t *dp = spa_get_dsl(spa);
+ dsl_dataset_t *ds;
+ zap_cursor_t zc;
+ zap_attribute_t *attr;
+ char *child;
+ uint64_t thisobj;
+ int err;
+
+ dsl_pool_config_enter(dp, FTAG);
+
+ err = dsl_dir_hold(dp, name, FTAG, &dd, NULL);
+ if (err != 0) {
+ dsl_pool_config_exit(dp, FTAG);
+ return (err);
+ }
+
+ /* Don't visit hidden ($MOS & $ORIGIN) objsets. */
+ if (dd->dd_myname[0] == '$') {
+ dsl_dir_rele(dd, FTAG);
+ dsl_pool_config_exit(dp, FTAG);
+ return (0);
+ }
+
+ thisobj = dsl_dir_phys(dd)->dd_head_dataset_obj;
+ attr = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
+
+ /*
+ * Iterate over all children.
+ */
+ if (flags & DS_FIND_CHILDREN) {
+ for (zap_cursor_init(&zc, dp->dp_meta_objset,
+ dsl_dir_phys(dd)->dd_child_dir_zapobj);
+ zap_cursor_retrieve(&zc, attr) == 0;
+ (void) zap_cursor_advance(&zc)) {
+ ASSERT3U(attr->za_integer_length, ==,
+ sizeof (uint64_t));
+ ASSERT3U(attr->za_num_integers, ==, 1);
+
+ child = kmem_asprintf("%s/%s", name, attr->za_name);
+ dsl_pool_config_exit(dp, FTAG);
+ err = dmu_objset_find_impl(spa, child,
+ func, arg, flags);
+ dsl_pool_config_enter(dp, FTAG);
+ kmem_strfree(child);
+ if (err != 0)
+ break;
+ }
+ zap_cursor_fini(&zc);
+
+ if (err != 0) {
+ dsl_dir_rele(dd, FTAG);
+ dsl_pool_config_exit(dp, FTAG);
+ kmem_free(attr, sizeof (zap_attribute_t));
+ return (err);
+ }
+ }
+
+ /*
+ * Iterate over all snapshots.
+ */
+ if (flags & DS_FIND_SNAPSHOTS) {
+ err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds);
+
+ if (err == 0) {
+ uint64_t snapobj;
+
+ snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj;
+ dsl_dataset_rele(ds, FTAG);
+
+ for (zap_cursor_init(&zc, dp->dp_meta_objset, snapobj);
+ zap_cursor_retrieve(&zc, attr) == 0;
+ (void) zap_cursor_advance(&zc)) {
+ ASSERT3U(attr->za_integer_length, ==,
+ sizeof (uint64_t));
+ ASSERT3U(attr->za_num_integers, ==, 1);
+
+ child = kmem_asprintf("%s@%s",
+ name, attr->za_name);
+ dsl_pool_config_exit(dp, FTAG);
+ err = func(child, arg);
+ dsl_pool_config_enter(dp, FTAG);
+ kmem_strfree(child);
+ if (err != 0)
+ break;
+ }
+ zap_cursor_fini(&zc);
+ }
+ }
+
+ dsl_dir_rele(dd, FTAG);
+ kmem_free(attr, sizeof (zap_attribute_t));
+ dsl_pool_config_exit(dp, FTAG);
+
+ if (err != 0)
+ return (err);
+
+ /* Apply to self. */
+ return (func(name, arg));
+}
+
+/*
+ * See comment above dmu_objset_find_impl().
+ */
+int
+dmu_objset_find(const char *name, int func(const char *, void *), void *arg,
+ int flags)
+{
+ spa_t *spa;
+ int error;
+
+ error = spa_open(name, &spa, FTAG);
+ if (error != 0)
+ return (error);
+ error = dmu_objset_find_impl(spa, name, func, arg, flags);
+ spa_close(spa, FTAG);
+ return (error);
+}
+
+boolean_t
+dmu_objset_incompatible_encryption_version(objset_t *os)
+{
+ return (dsl_dir_incompatible_encryption_version(
+ os->os_dsl_dataset->ds_dir));
+}
+
+void
+dmu_objset_set_user(objset_t *os, void *user_ptr)
+{
+ ASSERT(MUTEX_HELD(&os->os_user_ptr_lock));
+ os->os_user_ptr = user_ptr;
+}
+
+void *
+dmu_objset_get_user(objset_t *os)
+{
+ ASSERT(MUTEX_HELD(&os->os_user_ptr_lock));
+ return (os->os_user_ptr);
+}
+
+/*
+ * Determine name of filesystem, given name of snapshot.
+ * buf must be at least ZFS_MAX_DATASET_NAME_LEN bytes
+ */
+int
+dmu_fsname(const char *snapname, char *buf)
+{
+ char *atp = strchr(snapname, '@');
+ if (atp == NULL)
+ return (SET_ERROR(EINVAL));
+ if (atp - snapname >= ZFS_MAX_DATASET_NAME_LEN)
+ return (SET_ERROR(ENAMETOOLONG));
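+	/* Copy everything before the '@'; strlcpy() NUL-terminates. */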
+ (void) strlcpy(buf, snapname, atp - snapname + 1);
+ return (0);
+}
+
+/*
+ * Call when we think we're going to write/free space in open context
+ * to track the amount of dirty data in the open txg, which is also the
+ * amount of memory that cannot be evicted until this txg syncs.
+ *
+ * Note that there are two conditions where this can be called from
+ * syncing context:
+ *
+ * [1] When we just created the dataset, in which case we go on with
+ * updating any accounting of dirty data as usual.
+ * [2] When we are dirtying MOS data, in which case we only update the
+ * pool's accounting of dirty data.
+ */
+void
+dmu_objset_willuse_space(objset_t *os, int64_t space, dmu_tx_t *tx)
+{
+ dsl_dataset_t *ds = os->os_dsl_dataset;
+ int64_t aspace = spa_get_worst_case_asize(os->os_spa, space);
+
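+	/*
+	 * Charge the dataset's dsl_dir with the worst-case allocated
+	 * size, but track the uninflated size in the pool-wide dirty
+	 * data accounting.
+	 */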
+ if (ds != NULL) {
+ dsl_dir_willuse_space(ds->ds_dir, aspace, tx);
+ }
+
+ dsl_pool_dirty_space(dmu_tx_pool(tx), space, tx);
+}
+
+#if defined(_KERNEL)
+EXPORT_SYMBOL(dmu_objset_zil);
+EXPORT_SYMBOL(dmu_objset_pool);
+EXPORT_SYMBOL(dmu_objset_ds);
+EXPORT_SYMBOL(dmu_objset_type);
+EXPORT_SYMBOL(dmu_objset_name);
+EXPORT_SYMBOL(dmu_objset_hold);
+EXPORT_SYMBOL(dmu_objset_hold_flags);
+EXPORT_SYMBOL(dmu_objset_own);
+EXPORT_SYMBOL(dmu_objset_rele);
+EXPORT_SYMBOL(dmu_objset_rele_flags);
+EXPORT_SYMBOL(dmu_objset_disown);
+EXPORT_SYMBOL(dmu_objset_from_ds);
+EXPORT_SYMBOL(dmu_objset_create);
+EXPORT_SYMBOL(dmu_objset_clone);
+EXPORT_SYMBOL(dmu_objset_stats);
+EXPORT_SYMBOL(dmu_objset_fast_stat);
+EXPORT_SYMBOL(dmu_objset_spa);
+EXPORT_SYMBOL(dmu_objset_space);
+EXPORT_SYMBOL(dmu_objset_fsid_guid);
+EXPORT_SYMBOL(dmu_objset_find);
+EXPORT_SYMBOL(dmu_objset_byteswap);
+EXPORT_SYMBOL(dmu_objset_evict_dbufs);
+EXPORT_SYMBOL(dmu_objset_snap_cmtime);
+EXPORT_SYMBOL(dmu_objset_dnodesize);
+
+EXPORT_SYMBOL(dmu_objset_sync);
+EXPORT_SYMBOL(dmu_objset_is_dirty);
+EXPORT_SYMBOL(dmu_objset_create_impl_dnstats);
+EXPORT_SYMBOL(dmu_objset_create_impl);
+EXPORT_SYMBOL(dmu_objset_open_impl);
+EXPORT_SYMBOL(dmu_objset_evict);
+EXPORT_SYMBOL(dmu_objset_register_type);
+EXPORT_SYMBOL(dmu_objset_sync_done);
+EXPORT_SYMBOL(dmu_objset_userquota_get_ids);
+EXPORT_SYMBOL(dmu_objset_userused_enabled);
+EXPORT_SYMBOL(dmu_objset_userspace_upgrade);
+EXPORT_SYMBOL(dmu_objset_userspace_present);
+EXPORT_SYMBOL(dmu_objset_userobjused_enabled);
+EXPORT_SYMBOL(dmu_objset_userobjspace_upgradable);
+EXPORT_SYMBOL(dmu_objset_userobjspace_present);
+EXPORT_SYMBOL(dmu_objset_projectquota_enabled);
+EXPORT_SYMBOL(dmu_objset_projectquota_present);
+EXPORT_SYMBOL(dmu_objset_projectquota_upgradable);
+EXPORT_SYMBOL(dmu_objset_id_quota_upgrade);
+#endif
diff --git a/sys/contrib/openzfs/module/zfs/dmu_recv.c b/sys/contrib/openzfs/module/zfs/dmu_recv.c
new file mode 100644
index 000000000000..a0fd157ebc5f
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/dmu_recv.c
@@ -0,0 +1,3390 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2011, 2020 by Delphix. All rights reserved.
+ * Copyright (c) 2014, Joyent, Inc. All rights reserved.
+ * Copyright 2014 HybridCluster. All rights reserved.
+ * Copyright (c) 2018, loli10K <ezomori.nozomu@gmail.com>. All rights reserved.
+ * Copyright (c) 2019, Klara Inc.
+ * Copyright (c) 2019, Allan Jude
+ */
+
+#include <sys/dmu.h>
+#include <sys/dmu_impl.h>
+#include <sys/dmu_send.h>
+#include <sys/dmu_recv.h>
+#include <sys/dmu_tx.h>
+#include <sys/dbuf.h>
+#include <sys/dnode.h>
+#include <sys/zfs_context.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_traverse.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_prop.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_synctask.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zap.h>
+#include <sys/zvol.h>
+#include <sys/zio_checksum.h>
+#include <sys/zfs_znode.h>
+#include <zfs_fletcher.h>
+#include <sys/avl.h>
+#include <sys/ddt.h>
+#include <sys/zfs_onexit.h>
+#include <sys/dmu_send.h>
+#include <sys/dsl_destroy.h>
+#include <sys/blkptr.h>
+#include <sys/dsl_bookmark.h>
+#include <sys/zfeature.h>
+#include <sys/bqueue.h>
+#include <sys/objlist.h>
+#ifdef _KERNEL
+#include <sys/zfs_vfsops.h>
+#endif
+#include <sys/zfs_file.h>
+
+int zfs_recv_queue_length = SPA_MAXBLOCKSIZE;
+int zfs_recv_queue_ff = 20;
+int zfs_recv_write_batch_size = 1024 * 1024;
+
+static char *dmu_recv_tag = "dmu_recv_tag";
+const char *recv_clone_name = "%recv";
+
+static int receive_read_payload_and_next_header(dmu_recv_cookie_t *ra, int len,
+ void *buf);
+
+struct receive_record_arg {
+ dmu_replay_record_t header;
+ void *payload; /* Pointer to a buffer containing the payload */
+ /*
+ * If the record is a WRITE or SPILL, pointer to the abd containing the
+ * payload.
+ */
+ abd_t *abd;
+ int payload_size;
+ uint64_t bytes_read; /* bytes read from stream when record created */
+ boolean_t eos_marker; /* Marks the end of the stream */
+ bqueue_node_t node;
+};
+
+struct receive_writer_arg {
+ objset_t *os;
+ boolean_t byteswap;
+ bqueue_t q;
+
+ /*
+ * These three members are used to signal to the main thread when
+ * we're done.
+ */
+ kmutex_t mutex;
+ kcondvar_t cv;
+ boolean_t done;
+
+ int err;
+ boolean_t resumable;
+ boolean_t raw; /* DMU_BACKUP_FEATURE_RAW set */
+ boolean_t spill; /* DRR_FLAG_SPILL_BLOCK set */
+ boolean_t full; /* this is a full send stream */
+ uint64_t last_object;
+ uint64_t last_offset;
+ uint64_t max_object; /* highest object ID referenced in stream */
+ uint64_t bytes_read; /* bytes read when current record created */
+
+ list_t write_batch;
+
+ /* Encryption parameters for the last received DRR_OBJECT_RANGE */
+ boolean_t or_crypt_params_present;
+ uint64_t or_firstobj;
+ uint64_t or_numslots;
+ uint8_t or_salt[ZIO_DATA_SALT_LEN];
+ uint8_t or_iv[ZIO_DATA_IV_LEN];
+ uint8_t or_mac[ZIO_DATA_MAC_LEN];
+ boolean_t or_byteorder;
+};
+
+typedef struct dmu_recv_begin_arg {
+ const char *drba_origin;
+ dmu_recv_cookie_t *drba_cookie;
+ cred_t *drba_cred;
+ proc_t *drba_proc;
+ dsl_crypto_params_t *drba_dcp;
+} dmu_recv_begin_arg_t;
+
+static void
+byteswap_record(dmu_replay_record_t *drr)
+{
+#define DO64(X) (drr->drr_u.X = BSWAP_64(drr->drr_u.X))
+#define DO32(X) (drr->drr_u.X = BSWAP_32(drr->drr_u.X))
+ drr->drr_type = BSWAP_32(drr->drr_type);
+ drr->drr_payloadlen = BSWAP_32(drr->drr_payloadlen);
+
+ switch (drr->drr_type) {
+ case DRR_BEGIN:
+ DO64(drr_begin.drr_magic);
+ DO64(drr_begin.drr_versioninfo);
+ DO64(drr_begin.drr_creation_time);
+ DO32(drr_begin.drr_type);
+ DO32(drr_begin.drr_flags);
+ DO64(drr_begin.drr_toguid);
+ DO64(drr_begin.drr_fromguid);
+ break;
+ case DRR_OBJECT:
+ DO64(drr_object.drr_object);
+ DO32(drr_object.drr_type);
+ DO32(drr_object.drr_bonustype);
+ DO32(drr_object.drr_blksz);
+ DO32(drr_object.drr_bonuslen);
+ DO32(drr_object.drr_raw_bonuslen);
+ DO64(drr_object.drr_toguid);
+ DO64(drr_object.drr_maxblkid);
+ break;
+ case DRR_FREEOBJECTS:
+ DO64(drr_freeobjects.drr_firstobj);
+ DO64(drr_freeobjects.drr_numobjs);
+ DO64(drr_freeobjects.drr_toguid);
+ break;
+ case DRR_WRITE:
+ DO64(drr_write.drr_object);
+ DO32(drr_write.drr_type);
+ DO64(drr_write.drr_offset);
+ DO64(drr_write.drr_logical_size);
+ DO64(drr_write.drr_toguid);
+ ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_write.drr_key.ddk_cksum);
+ DO64(drr_write.drr_key.ddk_prop);
+ DO64(drr_write.drr_compressed_size);
+ break;
+ case DRR_WRITE_EMBEDDED:
+ DO64(drr_write_embedded.drr_object);
+ DO64(drr_write_embedded.drr_offset);
+ DO64(drr_write_embedded.drr_length);
+ DO64(drr_write_embedded.drr_toguid);
+ DO32(drr_write_embedded.drr_lsize);
+ DO32(drr_write_embedded.drr_psize);
+ break;
+ case DRR_FREE:
+ DO64(drr_free.drr_object);
+ DO64(drr_free.drr_offset);
+ DO64(drr_free.drr_length);
+ DO64(drr_free.drr_toguid);
+ break;
+ case DRR_SPILL:
+ DO64(drr_spill.drr_object);
+ DO64(drr_spill.drr_length);
+ DO64(drr_spill.drr_toguid);
+ DO64(drr_spill.drr_compressed_size);
+ DO32(drr_spill.drr_type);
+ break;
+ case DRR_OBJECT_RANGE:
+ DO64(drr_object_range.drr_firstobj);
+ DO64(drr_object_range.drr_numslots);
+ DO64(drr_object_range.drr_toguid);
+ break;
+ case DRR_REDACT:
+ DO64(drr_redact.drr_object);
+ DO64(drr_redact.drr_offset);
+ DO64(drr_redact.drr_length);
+ DO64(drr_redact.drr_toguid);
+ break;
+ case DRR_END:
+ DO64(drr_end.drr_toguid);
+ ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_end.drr_checksum);
+ break;
+ default:
+ break;
+ }
+
+ if (drr->drr_type != DRR_BEGIN) {
+ ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_checksum.drr_checksum);
+ }
+
+#undef DO64
+#undef DO32
+}
+
+static boolean_t
+redact_snaps_contains(uint64_t *snaps, uint64_t num_snaps, uint64_t guid)
+{
+ for (int i = 0; i < num_snaps; i++) {
+ if (snaps[i] == guid)
+ return (B_TRUE);
+ }
+ return (B_FALSE);
+}
+
+/*
+ * Check that the new stream we're trying to receive is redacted with respect to
+ * a subset of the snapshots that the origin was redacted with respect to. For
+ * the reasons behind this, see the man page on redacted zfs sends and receives.
+ */
+static boolean_t
+compatible_redact_snaps(uint64_t *origin_snaps, uint64_t origin_num_snaps,
+ uint64_t *redact_snaps, uint64_t num_redact_snaps)
+{
+ /*
+ * Short circuit the comparison; if we are redacted with respect to
+ * more snapshots than the origin, we can't be redacted with respect
+ * to a subset.
+ */
+ if (num_redact_snaps > origin_num_snaps) {
+ return (B_FALSE);
+ }
+
+ for (int i = 0; i < num_redact_snaps; i++) {
+ if (!redact_snaps_contains(origin_snaps, origin_num_snaps,
+ redact_snaps[i])) {
+ return (B_FALSE);
+ }
+ }
+ return (B_TRUE);
+}
+
+static boolean_t
+redact_check(dmu_recv_begin_arg_t *drba, dsl_dataset_t *origin)
+{
+ uint64_t *origin_snaps;
+ uint64_t origin_num_snaps;
+ dmu_recv_cookie_t *drc = drba->drba_cookie;
+ struct drr_begin *drrb = drc->drc_drrb;
+ int featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo);
+ int err = 0;
+ boolean_t ret = B_TRUE;
+ uint64_t *redact_snaps;
+ uint_t numredactsnaps;
+
+ /*
+ * If this is a full send stream, we're safe no matter what.
+ */
+ if (drrb->drr_fromguid == 0)
+ return (ret);
+
+ VERIFY(dsl_dataset_get_uint64_array_feature(origin,
+ SPA_FEATURE_REDACTED_DATASETS, &origin_num_snaps, &origin_snaps));
+
+ if (nvlist_lookup_uint64_array(drc->drc_begin_nvl,
+ BEGINNV_REDACT_FROM_SNAPS, &redact_snaps, &numredactsnaps) ==
+ 0) {
+		/*
+		 * If the send stream was sent from the redaction bookmark or
+		 * the redacted version of the dataset, then we're safe.  Verify
+		 * that this is from a compatible redaction bookmark or
+		 * redacted dataset.
+		 */
+ if (!compatible_redact_snaps(origin_snaps, origin_num_snaps,
+ redact_snaps, numredactsnaps)) {
+ err = EINVAL;
+ }
+ } else if (featureflags & DMU_BACKUP_FEATURE_REDACTED) {
+ /*
+ * If the stream is redacted, it must be redacted with respect
+ * to a subset of what the origin is redacted with respect to.
+ * See case number 2 in the zfs man page section on redacted zfs
+ * send.
+ */
+ err = nvlist_lookup_uint64_array(drc->drc_begin_nvl,
+ BEGINNV_REDACT_SNAPS, &redact_snaps, &numredactsnaps);
+
+ if (err != 0 || !compatible_redact_snaps(origin_snaps,
+ origin_num_snaps, redact_snaps, numredactsnaps)) {
+ err = EINVAL;
+ }
+ } else if (!redact_snaps_contains(origin_snaps, origin_num_snaps,
+ drrb->drr_toguid)) {
+ /*
+ * If the stream isn't redacted but the origin is, this must be
+ * one of the snapshots the origin is redacted with respect to.
+ * See case number 1 in the zfs man page section on redacted zfs
+ * send.
+ */
+ err = EINVAL;
+ }
+
+ if (err != 0)
+ ret = B_FALSE;
+ return (ret);
+}
+
+/*
+ * If we previously received a stream with --large-block, we don't support
+ * receiving an incremental on top of it without --large-block. This avoids
+ * forcing a read-modify-write or trying to re-aggregate a string of WRITE
+ * records.
+ */
+static int
+recv_check_large_blocks(dsl_dataset_t *ds, uint64_t featureflags)
+{
+ if (dsl_dataset_feature_is_active(ds, SPA_FEATURE_LARGE_BLOCKS) &&
+ !(featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS))
+ return (SET_ERROR(ZFS_ERR_STREAM_LARGE_BLOCK_MISMATCH));
+ return (0);
+}
+
+static int
+recv_begin_check_existing_impl(dmu_recv_begin_arg_t *drba, dsl_dataset_t *ds,
+ uint64_t fromguid, uint64_t featureflags)
+{
+ uint64_t val;
+ uint64_t children;
+ int error;
+ dsl_pool_t *dp = ds->ds_dir->dd_pool;
+ boolean_t encrypted = ds->ds_dir->dd_crypto_obj != 0;
+ boolean_t raw = (featureflags & DMU_BACKUP_FEATURE_RAW) != 0;
+ boolean_t embed = (featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) != 0;
+
+ /* Temporary clone name must not exist. */
+ error = zap_lookup(dp->dp_meta_objset,
+ dsl_dir_phys(ds->ds_dir)->dd_child_dir_zapobj, recv_clone_name,
+ 8, 1, &val);
+ if (error != ENOENT)
+ return (error == 0 ? SET_ERROR(EBUSY) : error);
+
+ /* Resume state must not be set. */
+ if (dsl_dataset_has_resume_receive_state(ds))
+ return (SET_ERROR(EBUSY));
+
+ /* New snapshot name must not exist. */
+ error = zap_lookup(dp->dp_meta_objset,
+ dsl_dataset_phys(ds)->ds_snapnames_zapobj,
+ drba->drba_cookie->drc_tosnap, 8, 1, &val);
+ if (error != ENOENT)
+ return (error == 0 ? SET_ERROR(EEXIST) : error);
+
+ /* Must not have children if receiving a ZVOL. */
+ error = zap_count(dp->dp_meta_objset,
+ dsl_dir_phys(ds->ds_dir)->dd_child_dir_zapobj, &children);
+ if (error != 0)
+ return (error);
+ if (drba->drba_cookie->drc_drrb->drr_type != DMU_OST_ZFS &&
+ children > 0)
+ return (SET_ERROR(ZFS_ERR_WRONG_PARENT));
+
+ /*
+ * Check snapshot limit before receiving. We'll recheck again at the
+ * end, but might as well abort before receiving if we're already over
+ * the limit.
+ *
+ * Note that we do not check the file system limit with
+ * dsl_dir_fscount_check because the temporary %clones don't count
+ * against that limit.
+ */
+ error = dsl_fs_ss_limit_check(ds->ds_dir, 1, ZFS_PROP_SNAPSHOT_LIMIT,
+ NULL, drba->drba_cred, drba->drba_proc);
+ if (error != 0)
+ return (error);
+
+ if (fromguid != 0) {
+ dsl_dataset_t *snap;
+ uint64_t obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
+
+ /* Can't perform a raw receive on top of a non-raw receive */
+ if (!encrypted && raw)
+ return (SET_ERROR(EINVAL));
+
+ /* Encryption is incompatible with embedded data */
+ if (encrypted && embed)
+ return (SET_ERROR(EINVAL));
+
+ /* Find snapshot in this dir that matches fromguid. */
+ while (obj != 0) {
+ error = dsl_dataset_hold_obj(dp, obj, FTAG,
+ &snap);
+ if (error != 0)
+ return (SET_ERROR(ENODEV));
+ if (snap->ds_dir != ds->ds_dir) {
+ dsl_dataset_rele(snap, FTAG);
+ return (SET_ERROR(ENODEV));
+ }
+ if (dsl_dataset_phys(snap)->ds_guid == fromguid)
+ break;
+ obj = dsl_dataset_phys(snap)->ds_prev_snap_obj;
+ dsl_dataset_rele(snap, FTAG);
+ }
+ if (obj == 0)
+ return (SET_ERROR(ENODEV));
+
+ if (drba->drba_cookie->drc_force) {
+ drba->drba_cookie->drc_fromsnapobj = obj;
+ } else {
+ /*
+ * If we are not forcing, there must be no
+ * changes since fromsnap. Raw sends have an
+ * additional constraint that requires that
+ * no "noop" snapshots exist between fromsnap
+ * and tosnap for the IVset checking code to
+ * work properly.
+ */
+ if (dsl_dataset_modified_since_snap(ds, snap) ||
+ (raw &&
+ dsl_dataset_phys(ds)->ds_prev_snap_obj !=
+ snap->ds_object)) {
+ dsl_dataset_rele(snap, FTAG);
+ return (SET_ERROR(ETXTBSY));
+ }
+ drba->drba_cookie->drc_fromsnapobj =
+ ds->ds_prev->ds_object;
+ }
+
+ if (dsl_dataset_feature_is_active(snap,
+ SPA_FEATURE_REDACTED_DATASETS) && !redact_check(drba,
+ snap)) {
+ dsl_dataset_rele(snap, FTAG);
+ return (SET_ERROR(EINVAL));
+ }
+
+ error = recv_check_large_blocks(snap, featureflags);
+ if (error != 0) {
+ dsl_dataset_rele(snap, FTAG);
+ return (error);
+ }
+
+ dsl_dataset_rele(snap, FTAG);
+ } else {
+ /* if full, then must be forced */
+ if (!drba->drba_cookie->drc_force)
+ return (SET_ERROR(EEXIST));
+
+ /*
+ * We don't support using zfs recv -F to blow away
+ * encrypted filesystems. This would require the
+ * dsl dir to point to the old encryption key and
+ * the new one at the same time during the receive.
+ */
+ if ((!encrypted && raw) || encrypted)
+ return (SET_ERROR(EINVAL));
+
+ /*
+ * Perform the same encryption checks we would if
+ * we were creating a new dataset from scratch.
+ */
+ if (!raw) {
+ boolean_t will_encrypt;
+
+ error = dmu_objset_create_crypt_check(
+ ds->ds_dir->dd_parent, drba->drba_dcp,
+ &will_encrypt);
+ if (error != 0)
+ return (error);
+
+ if (will_encrypt && embed)
+ return (SET_ERROR(EINVAL));
+ }
+ }
+
+ return (0);
+}
+
+/*
+ * Check that any feature flags used in the data stream we're receiving are
+ * supported by the pool we are receiving into.
+ *
+ * Note that some of the features we explicitly check here have additional
+ * (implicit) features they depend on, but those dependencies are enforced
+ * through the zfeature_register() calls declaring the features that we
+ * explicitly check.
+ */
+static int
+recv_begin_check_feature_flags_impl(uint64_t featureflags, spa_t *spa)
+{
+ /*
+ * Check if there are any unsupported feature flags.
+ */
+ if (!DMU_STREAM_SUPPORTED(featureflags)) {
+ return (SET_ERROR(ZFS_ERR_UNKNOWN_SEND_STREAM_FEATURE));
+ }
+
+ /* Verify pool version supports SA if SA_SPILL feature set */
+ if ((featureflags & DMU_BACKUP_FEATURE_SA_SPILL) &&
+ spa_version(spa) < SPA_VERSION_SA)
+ return (SET_ERROR(ENOTSUP));
+
+ /*
+ * LZ4 compressed, ZSTD compressed, embedded, mooched, large blocks,
+ * and large_dnodes in the stream can only be used if those pool
+ * features are enabled because we don't attempt to decompress /
+ * un-embed / un-mooch / split up the blocks / dnodes during the
+ * receive process.
+ */
+ if ((featureflags & DMU_BACKUP_FEATURE_LZ4) &&
+ !spa_feature_is_enabled(spa, SPA_FEATURE_LZ4_COMPRESS))
+ return (SET_ERROR(ENOTSUP));
+ if ((featureflags & DMU_BACKUP_FEATURE_ZSTD) &&
+ !spa_feature_is_enabled(spa, SPA_FEATURE_ZSTD_COMPRESS))
+ return (SET_ERROR(ENOTSUP));
+ if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) &&
+ !spa_feature_is_enabled(spa, SPA_FEATURE_EMBEDDED_DATA))
+ return (SET_ERROR(ENOTSUP));
+ if ((featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) &&
+ !spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS))
+ return (SET_ERROR(ENOTSUP));
+ if ((featureflags & DMU_BACKUP_FEATURE_LARGE_DNODE) &&
+ !spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_DNODE))
+ return (SET_ERROR(ENOTSUP));
+
+ /*
+ * Receiving redacted streams requires that redacted datasets are
+ * enabled.
+ */
+ if ((featureflags & DMU_BACKUP_FEATURE_REDACTED) &&
+ !spa_feature_is_enabled(spa, SPA_FEATURE_REDACTED_DATASETS))
+ return (SET_ERROR(ENOTSUP));
+
+ return (0);
+}
+
+static int
+dmu_recv_begin_check(void *arg, dmu_tx_t *tx)
+{
+ dmu_recv_begin_arg_t *drba = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ struct drr_begin *drrb = drba->drba_cookie->drc_drrb;
+ uint64_t fromguid = drrb->drr_fromguid;
+ int flags = drrb->drr_flags;
+ ds_hold_flags_t dsflags = DS_HOLD_FLAG_NONE;
+ int error;
+ uint64_t featureflags = drba->drba_cookie->drc_featureflags;
+ dsl_dataset_t *ds;
+ const char *tofs = drba->drba_cookie->drc_tofs;
+
+ /* already checked */
+ ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC);
+ ASSERT(!(featureflags & DMU_BACKUP_FEATURE_RESUMING));
+
+ if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) ==
+ DMU_COMPOUNDSTREAM ||
+ drrb->drr_type >= DMU_OST_NUMTYPES ||
+ ((flags & DRR_FLAG_CLONE) && drba->drba_origin == NULL))
+ return (SET_ERROR(EINVAL));
+
+ error = recv_begin_check_feature_flags_impl(featureflags, dp->dp_spa);
+ if (error != 0)
+ return (error);
+
+ /* Resumable receives require extensible datasets */
+ if (drba->drba_cookie->drc_resumable &&
+ !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_EXTENSIBLE_DATASET))
+ return (SET_ERROR(ENOTSUP));
+
+ if (featureflags & DMU_BACKUP_FEATURE_RAW) {
+ /* raw receives require the encryption feature */
+ if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_ENCRYPTION))
+ return (SET_ERROR(ENOTSUP));
+
+ /* embedded data is incompatible with encryption and raw recv */
+ if (featureflags & DMU_BACKUP_FEATURE_EMBED_DATA)
+ return (SET_ERROR(EINVAL));
+
+ /* raw receives require spill block allocation flag */
+ if (!(flags & DRR_FLAG_SPILL_BLOCK))
+ return (SET_ERROR(ZFS_ERR_SPILL_BLOCK_FLAG_MISSING));
+ } else {
+ dsflags |= DS_HOLD_FLAG_DECRYPT;
+ }
+
+ error = dsl_dataset_hold_flags(dp, tofs, dsflags, FTAG, &ds);
+ if (error == 0) {
+ /* target fs already exists; recv into temp clone */
+
+ /* Can't recv a clone into an existing fs */
+ if (flags & DRR_FLAG_CLONE || drba->drba_origin) {
+ dsl_dataset_rele_flags(ds, dsflags, FTAG);
+ return (SET_ERROR(EINVAL));
+ }
+
+ error = recv_begin_check_existing_impl(drba, ds, fromguid,
+ featureflags);
+ dsl_dataset_rele_flags(ds, dsflags, FTAG);
+ } else if (error == ENOENT) {
+ /* target fs does not exist; must be a full backup or clone */
+ char buf[ZFS_MAX_DATASET_NAME_LEN];
+ objset_t *os;
+
+ /*
+ * If it's a non-clone incremental, we are missing the
+ * target fs, so fail the recv.
+ */
+ if (fromguid != 0 && !((flags & DRR_FLAG_CLONE) ||
+ drba->drba_origin))
+ return (SET_ERROR(ENOENT));
+
+ /*
+ * If we're receiving a full send as a clone, and it doesn't
+ * contain all the necessary free records and freeobject
+ * records, reject it.
+ */
+ if (fromguid == 0 && drba->drba_origin != NULL &&
+ !(flags & DRR_FLAG_FREERECORDS))
+ return (SET_ERROR(EINVAL));
+
+ /* Open the parent of tofs */
+ ASSERT3U(strlen(tofs), <, sizeof (buf));
+ (void) strlcpy(buf, tofs, strrchr(tofs, '/') - tofs + 1);
+ error = dsl_dataset_hold(dp, buf, FTAG, &ds);
+ if (error != 0)
+ return (error);
+
+ if ((featureflags & DMU_BACKUP_FEATURE_RAW) == 0 &&
+ drba->drba_origin == NULL) {
+ boolean_t will_encrypt;
+
+ /*
+ * Check that we aren't breaking any encryption rules
+ * and that we have all the parameters we need to
+ * create an encrypted dataset if necessary. If we are
+ * making an encrypted dataset the stream can't have
+ * embedded data.
+ */
+ error = dmu_objset_create_crypt_check(ds->ds_dir,
+ drba->drba_dcp, &will_encrypt);
+ if (error != 0) {
+ dsl_dataset_rele(ds, FTAG);
+ return (error);
+ }
+
+ if (will_encrypt &&
+ (featureflags & DMU_BACKUP_FEATURE_EMBED_DATA)) {
+ dsl_dataset_rele(ds, FTAG);
+ return (SET_ERROR(EINVAL));
+ }
+ }
+
+ /*
+ * Check filesystem and snapshot limits before receiving. We'll
+ * recheck snapshot limits again at the end (we create the
+ * filesystems and increment those counts during begin_sync).
+ */
+ error = dsl_fs_ss_limit_check(ds->ds_dir, 1,
+ ZFS_PROP_FILESYSTEM_LIMIT, NULL,
+ drba->drba_cred, drba->drba_proc);
+ if (error != 0) {
+ dsl_dataset_rele(ds, FTAG);
+ return (error);
+ }
+
+ error = dsl_fs_ss_limit_check(ds->ds_dir, 1,
+ ZFS_PROP_SNAPSHOT_LIMIT, NULL,
+ drba->drba_cred, drba->drba_proc);
+ if (error != 0) {
+ dsl_dataset_rele(ds, FTAG);
+ return (error);
+ }
+
+ /* can't recv below anything but filesystems (eg. no ZVOLs) */
+ error = dmu_objset_from_ds(ds, &os);
+ if (error != 0) {
+ dsl_dataset_rele(ds, FTAG);
+ return (error);
+ }
+ if (dmu_objset_type(os) != DMU_OST_ZFS) {
+ dsl_dataset_rele(ds, FTAG);
+ return (SET_ERROR(ZFS_ERR_WRONG_PARENT));
+ }
+
+ if (drba->drba_origin != NULL) {
+ dsl_dataset_t *origin;
+ error = dsl_dataset_hold_flags(dp, drba->drba_origin,
+ dsflags, FTAG, &origin);
+ if (error != 0) {
+ dsl_dataset_rele(ds, FTAG);
+ return (error);
+ }
+ if (!origin->ds_is_snapshot) {
+ dsl_dataset_rele_flags(origin, dsflags, FTAG);
+ dsl_dataset_rele(ds, FTAG);
+ return (SET_ERROR(EINVAL));
+ }
+ if (dsl_dataset_phys(origin)->ds_guid != fromguid &&
+ fromguid != 0) {
+ dsl_dataset_rele_flags(origin, dsflags, FTAG);
+ dsl_dataset_rele(ds, FTAG);
+ return (SET_ERROR(ENODEV));
+ }
+
+ if (origin->ds_dir->dd_crypto_obj != 0 &&
+ (featureflags & DMU_BACKUP_FEATURE_EMBED_DATA)) {
+ dsl_dataset_rele_flags(origin, dsflags, FTAG);
+ dsl_dataset_rele(ds, FTAG);
+ return (SET_ERROR(EINVAL));
+ }
+
+ /*
+ * If the origin is redacted we need to verify that this
+ * send stream can safely be received on top of the
+ * origin.
+ */
+ if (dsl_dataset_feature_is_active(origin,
+ SPA_FEATURE_REDACTED_DATASETS)) {
+ if (!redact_check(drba, origin)) {
+ dsl_dataset_rele_flags(origin, dsflags,
+ FTAG);
+ dsl_dataset_rele_flags(ds, dsflags,
+ FTAG);
+ return (SET_ERROR(EINVAL));
+ }
+ }
+
+ error = recv_check_large_blocks(ds, featureflags);
+ if (error != 0) {
+ dsl_dataset_rele_flags(origin, dsflags, FTAG);
+ dsl_dataset_rele_flags(ds, dsflags, FTAG);
+ return (error);
+ }
+
+ dsl_dataset_rele_flags(origin, dsflags, FTAG);
+ }
+
+ dsl_dataset_rele(ds, FTAG);
+ error = 0;
+ }
+ return (error);
+}
+
+static void
+dmu_recv_begin_sync(void *arg, dmu_tx_t *tx)
+{
+ dmu_recv_begin_arg_t *drba = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ objset_t *mos = dp->dp_meta_objset;
+ dmu_recv_cookie_t *drc = drba->drba_cookie;
+ struct drr_begin *drrb = drc->drc_drrb;
+ const char *tofs = drc->drc_tofs;
+ uint64_t featureflags = drc->drc_featureflags;
+ dsl_dataset_t *ds, *newds;
+ objset_t *os;
+ uint64_t dsobj;
+ ds_hold_flags_t dsflags = DS_HOLD_FLAG_NONE;
+ int error;
+ uint64_t crflags = 0;
+ dsl_crypto_params_t dummy_dcp = { 0 };
+ dsl_crypto_params_t *dcp = drba->drba_dcp;
+
+ if (drrb->drr_flags & DRR_FLAG_CI_DATA)
+ crflags |= DS_FLAG_CI_DATASET;
+
+ if ((featureflags & DMU_BACKUP_FEATURE_RAW) == 0)
+ dsflags |= DS_HOLD_FLAG_DECRYPT;
+
+ /*
+ * Raw, non-incremental recvs always use a dummy dcp with
+ * the raw cmd set. Raw incremental recvs do not use a dcp
+ * since the encryption parameters are already set in stone.
+ */
+ if (dcp == NULL && drrb->drr_fromguid == 0 &&
+ drba->drba_origin == NULL) {
+ ASSERT3P(dcp, ==, NULL);
+ dcp = &dummy_dcp;
+
+ if (featureflags & DMU_BACKUP_FEATURE_RAW)
+ dcp->cp_cmd = DCP_CMD_RAW_RECV;
+ }
+
+ error = dsl_dataset_hold_flags(dp, tofs, dsflags, FTAG, &ds);
+ if (error == 0) {
+ /* create temporary clone */
+ dsl_dataset_t *snap = NULL;
+
+ if (drba->drba_cookie->drc_fromsnapobj != 0) {
+ VERIFY0(dsl_dataset_hold_obj(dp,
+ drba->drba_cookie->drc_fromsnapobj, FTAG, &snap));
+ ASSERT3P(dcp, ==, NULL);
+ }
+ dsobj = dsl_dataset_create_sync(ds->ds_dir, recv_clone_name,
+ snap, crflags, drba->drba_cred, dcp, tx);
+ if (drba->drba_cookie->drc_fromsnapobj != 0)
+ dsl_dataset_rele(snap, FTAG);
+ dsl_dataset_rele_flags(ds, dsflags, FTAG);
+ } else {
+ dsl_dir_t *dd;
+ const char *tail;
+ dsl_dataset_t *origin = NULL;
+
+ VERIFY0(dsl_dir_hold(dp, tofs, FTAG, &dd, &tail));
+
+ if (drba->drba_origin != NULL) {
+ VERIFY0(dsl_dataset_hold(dp, drba->drba_origin,
+ FTAG, &origin));
+ ASSERT3P(dcp, ==, NULL);
+ }
+
+ /* Create new dataset. */
+ dsobj = dsl_dataset_create_sync(dd, strrchr(tofs, '/') + 1,
+ origin, crflags, drba->drba_cred, dcp, tx);
+ if (origin != NULL)
+ dsl_dataset_rele(origin, FTAG);
+ dsl_dir_rele(dd, FTAG);
+ drc->drc_newfs = B_TRUE;
+ }
+ VERIFY0(dsl_dataset_own_obj_force(dp, dsobj, dsflags, dmu_recv_tag,
+ &newds));
+ if (dsl_dataset_feature_is_active(newds,
+ SPA_FEATURE_REDACTED_DATASETS)) {
+ /*
+ * If the origin dataset is redacted, the child will be redacted
+ * when we create it. We clear the new dataset's
+ * redaction info; if it should be redacted, we'll fill
+ * in its information later.
+ */
+ dsl_dataset_deactivate_feature(newds,
+ SPA_FEATURE_REDACTED_DATASETS, tx);
+ }
+ VERIFY0(dmu_objset_from_ds(newds, &os));
+
+ if (drc->drc_resumable) {
+ dsl_dataset_zapify(newds, tx);
+ if (drrb->drr_fromguid != 0) {
+ VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_FROMGUID,
+ 8, 1, &drrb->drr_fromguid, tx));
+ }
+ VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_TOGUID,
+ 8, 1, &drrb->drr_toguid, tx));
+ VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_TONAME,
+ 1, strlen(drrb->drr_toname) + 1, drrb->drr_toname, tx));
+ uint64_t one = 1;
+ uint64_t zero = 0;
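+		/*
+		 * Initialize the resume cursor: start at object 1, offset 0,
+		 * with no bytes received yet.
+		 */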
+ VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_OBJECT,
+ 8, 1, &one, tx));
+ VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_OFFSET,
+ 8, 1, &zero, tx));
+ VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_BYTES,
+ 8, 1, &zero, tx));
+ if (featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) {
+ VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_LARGEBLOCK,
+ 8, 1, &one, tx));
+ }
+ if (featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) {
+ VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_EMBEDOK,
+ 8, 1, &one, tx));
+ }
+ if (featureflags & DMU_BACKUP_FEATURE_COMPRESSED) {
+ VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_COMPRESSOK,
+ 8, 1, &one, tx));
+ }
+ if (featureflags & DMU_BACKUP_FEATURE_RAW) {
+ VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_RAWOK,
+ 8, 1, &one, tx));
+ }
+
+ uint64_t *redact_snaps;
+ uint_t numredactsnaps;
+ if (nvlist_lookup_uint64_array(drc->drc_begin_nvl,
+ BEGINNV_REDACT_FROM_SNAPS, &redact_snaps,
+ &numredactsnaps) == 0) {
+ VERIFY0(zap_add(mos, dsobj,
+ DS_FIELD_RESUME_REDACT_BOOKMARK_SNAPS,
+ sizeof (*redact_snaps), numredactsnaps,
+ redact_snaps, tx));
+ }
+ }
+
+ /*
+ * Usually the os->os_encrypted value is tied to the presence of a
+ * DSL Crypto Key object in the dd. However, that will not be received
+ * until dmu_recv_stream(), so we set the value manually for now.
+ */
+ if (featureflags & DMU_BACKUP_FEATURE_RAW) {
+ os->os_encrypted = B_TRUE;
+ drba->drba_cookie->drc_raw = B_TRUE;
+ }
+
+ if (featureflags & DMU_BACKUP_FEATURE_REDACTED) {
+ uint64_t *redact_snaps;
+ uint_t numredactsnaps;
+ VERIFY0(nvlist_lookup_uint64_array(drc->drc_begin_nvl,
+ BEGINNV_REDACT_SNAPS, &redact_snaps, &numredactsnaps));
+ dsl_dataset_activate_redaction(newds, redact_snaps,
+ numredactsnaps, tx);
+ }
+
+ dmu_buf_will_dirty(newds->ds_dbuf, tx);
+ dsl_dataset_phys(newds)->ds_flags |= DS_FLAG_INCONSISTENT;
+
+ /*
+ * If we actually created a non-clone, we need to create the objset
+ * in our new dataset. If this is a raw send we postpone this until
+ * dmu_recv_stream() so that we can allocate the metadnode with the
+ * properties from the DRR_BEGIN payload.
+ */
+ rrw_enter(&newds->ds_bp_rwlock, RW_READER, FTAG);
+ if (BP_IS_HOLE(dsl_dataset_get_blkptr(newds)) &&
+ (featureflags & DMU_BACKUP_FEATURE_RAW) == 0) {
+ (void) dmu_objset_create_impl(dp->dp_spa,
+ newds, dsl_dataset_get_blkptr(newds), drrb->drr_type, tx);
+ }
+ rrw_exit(&newds->ds_bp_rwlock, FTAG);
+
+ drba->drba_cookie->drc_ds = newds;
+ drba->drba_cookie->drc_os = os;
+
+ spa_history_log_internal_ds(newds, "receive", tx, " ");
+}
+
+static int
+dmu_recv_resume_begin_check(void *arg, dmu_tx_t *tx)
+{
+ dmu_recv_begin_arg_t *drba = arg;
+ dmu_recv_cookie_t *drc = drba->drba_cookie;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ struct drr_begin *drrb = drc->drc_drrb;
+ int error;
+ ds_hold_flags_t dsflags = DS_HOLD_FLAG_NONE;
+ dsl_dataset_t *ds;
+ const char *tofs = drc->drc_tofs;
+
+ /* already checked */
+ ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC);
+ ASSERT(drc->drc_featureflags & DMU_BACKUP_FEATURE_RESUMING);
+
+ if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) ==
+ DMU_COMPOUNDSTREAM ||
+ drrb->drr_type >= DMU_OST_NUMTYPES)
+ return (SET_ERROR(EINVAL));
+
+ /*
+ * This is mostly a sanity check since we should have already done these
+ * checks during a previous attempt to receive the data.
+ */
+ error = recv_begin_check_feature_flags_impl(drc->drc_featureflags,
+ dp->dp_spa);
+ if (error != 0)
+ return (error);
+
+ /* 6 extra bytes for /%recv */
+ char recvname[ZFS_MAX_DATASET_NAME_LEN + 6];
+
+ (void) snprintf(recvname, sizeof (recvname), "%s/%s",
+ tofs, recv_clone_name);
+
+ if (drc->drc_featureflags & DMU_BACKUP_FEATURE_RAW) {
+ /* raw receives require spill block allocation flag */
+ if (!(drrb->drr_flags & DRR_FLAG_SPILL_BLOCK))
+ return (SET_ERROR(ZFS_ERR_SPILL_BLOCK_FLAG_MISSING));
+ } else {
+ dsflags |= DS_HOLD_FLAG_DECRYPT;
+ }
+
+ if (dsl_dataset_hold_flags(dp, recvname, dsflags, FTAG, &ds) != 0) {
+ /* %recv does not exist; continue in tofs */
+ error = dsl_dataset_hold_flags(dp, tofs, dsflags, FTAG, &ds);
+ if (error != 0)
+ return (error);
+ }
+
+ /* check that ds is marked inconsistent */
+ if (!DS_IS_INCONSISTENT(ds)) {
+ dsl_dataset_rele_flags(ds, dsflags, FTAG);
+ return (SET_ERROR(EINVAL));
+ }
+
+ /* check that there is resuming data, and that the toguid matches */
+ if (!dsl_dataset_is_zapified(ds)) {
+ dsl_dataset_rele_flags(ds, dsflags, FTAG);
+ return (SET_ERROR(EINVAL));
+ }
+ uint64_t val;
+ error = zap_lookup(dp->dp_meta_objset, ds->ds_object,
+ DS_FIELD_RESUME_TOGUID, sizeof (val), 1, &val);
+ if (error != 0 || drrb->drr_toguid != val) {
+ dsl_dataset_rele_flags(ds, dsflags, FTAG);
+ return (SET_ERROR(EINVAL));
+ }
+
+ /*
+ * Check if the receive is still running. If so, it will be owned.
+ * Note that nothing else can own the dataset (e.g. after the receive
+ * fails) because it will be marked inconsistent.
+ */
+ if (dsl_dataset_has_owner(ds)) {
+ dsl_dataset_rele_flags(ds, dsflags, FTAG);
+ return (SET_ERROR(EBUSY));
+ }
+
+ /* There should not be any snapshots of this fs yet. */
+ if (ds->ds_prev != NULL && ds->ds_prev->ds_dir == ds->ds_dir) {
+ dsl_dataset_rele_flags(ds, dsflags, FTAG);
+ return (SET_ERROR(EINVAL));
+ }
+
+ /*
+ * Note: resume point will be checked when we process the first WRITE
+ * record.
+ */
+
+ /* check that the origin matches */
+ val = 0;
+ (void) zap_lookup(dp->dp_meta_objset, ds->ds_object,
+ DS_FIELD_RESUME_FROMGUID, sizeof (val), 1, &val);
+ if (drrb->drr_fromguid != val) {
+ dsl_dataset_rele_flags(ds, dsflags, FTAG);
+ return (SET_ERROR(EINVAL));
+ }
+
+ if (ds->ds_prev != NULL && drrb->drr_fromguid != 0)
+ drc->drc_fromsnapobj = ds->ds_prev->ds_object;
+
+ /*
+ * If we're resuming, and the send is redacted, then the original send
+ * must have been redacted, and must have been redacted with respect to
+ * the same snapshots.
+ */
+ if (drc->drc_featureflags & DMU_BACKUP_FEATURE_REDACTED) {
+ uint64_t num_ds_redact_snaps;
+ uint64_t *ds_redact_snaps;
+
+ uint_t num_stream_redact_snaps;
+ uint64_t *stream_redact_snaps;
+
+ if (nvlist_lookup_uint64_array(drc->drc_begin_nvl,
+ BEGINNV_REDACT_SNAPS, &stream_redact_snaps,
+ &num_stream_redact_snaps) != 0) {
+ dsl_dataset_rele_flags(ds, dsflags, FTAG);
+ return (SET_ERROR(EINVAL));
+ }
+
+ if (!dsl_dataset_get_uint64_array_feature(ds,
+ SPA_FEATURE_REDACTED_DATASETS, &num_ds_redact_snaps,
+ &ds_redact_snaps)) {
+ dsl_dataset_rele_flags(ds, dsflags, FTAG);
+ return (SET_ERROR(EINVAL));
+ }
+
+ for (int i = 0; i < num_ds_redact_snaps; i++) {
+ if (!redact_snaps_contains(ds_redact_snaps,
+ num_ds_redact_snaps, stream_redact_snaps[i])) {
+ dsl_dataset_rele_flags(ds, dsflags, FTAG);
+ return (SET_ERROR(EINVAL));
+ }
+ }
+ }
+
+ error = recv_check_large_blocks(ds, drc->drc_featureflags);
+ if (error != 0) {
+ dsl_dataset_rele_flags(ds, dsflags, FTAG);
+ return (error);
+ }
+
+ dsl_dataset_rele_flags(ds, dsflags, FTAG);
+ return (0);
+}
+
+static void
+dmu_recv_resume_begin_sync(void *arg, dmu_tx_t *tx)
+{
+ dmu_recv_begin_arg_t *drba = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ const char *tofs = drba->drba_cookie->drc_tofs;
+ uint64_t featureflags = drba->drba_cookie->drc_featureflags;
+ dsl_dataset_t *ds;
+ ds_hold_flags_t dsflags = DS_HOLD_FLAG_NONE;
+ /* 6 extra bytes for /%recv */
+ char recvname[ZFS_MAX_DATASET_NAME_LEN + 6];
+
+ (void) snprintf(recvname, sizeof (recvname), "%s/%s", tofs,
+ recv_clone_name);
+
+ if (featureflags & DMU_BACKUP_FEATURE_RAW) {
+ drba->drba_cookie->drc_raw = B_TRUE;
+ } else {
+ dsflags |= DS_HOLD_FLAG_DECRYPT;
+ }
+
+ if (dsl_dataset_own_force(dp, recvname, dsflags, dmu_recv_tag, &ds)
+ != 0) {
+ /* %recv does not exist; continue in tofs */
+ VERIFY0(dsl_dataset_own_force(dp, tofs, dsflags, dmu_recv_tag,
+ &ds));
+ drba->drba_cookie->drc_newfs = B_TRUE;
+ }
+
+ ASSERT(DS_IS_INCONSISTENT(ds));
+ rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
+ ASSERT(!BP_IS_HOLE(dsl_dataset_get_blkptr(ds)) ||
+ drba->drba_cookie->drc_raw);
+ rrw_exit(&ds->ds_bp_rwlock, FTAG);
+
+ drba->drba_cookie->drc_ds = ds;
+ VERIFY0(dmu_objset_from_ds(ds, &drba->drba_cookie->drc_os));
+ drba->drba_cookie->drc_should_save = B_TRUE;
+
+ spa_history_log_internal_ds(ds, "resume receive", tx, " ");
+}
+
+/*
+ * NB: callers *MUST* call dmu_recv_stream() if dmu_recv_begin()
+ * succeeds; otherwise we will leak the holds on the datasets.
+ */
+int
+dmu_recv_begin(char *tofs, char *tosnap, dmu_replay_record_t *drr_begin,
+ boolean_t force, boolean_t resumable, nvlist_t *localprops,
+ nvlist_t *hidden_args, char *origin, dmu_recv_cookie_t *drc,
+ zfs_file_t *fp, offset_t *voffp)
+{
+ dmu_recv_begin_arg_t drba = { 0 };
+ int err;
+
+ bzero(drc, sizeof (dmu_recv_cookie_t));
+ drc->drc_drr_begin = drr_begin;
+ drc->drc_drrb = &drr_begin->drr_u.drr_begin;
+ drc->drc_tosnap = tosnap;
+ drc->drc_tofs = tofs;
+ drc->drc_force = force;
+ drc->drc_resumable = resumable;
+ drc->drc_cred = CRED();
+ drc->drc_proc = curproc;
+ drc->drc_clone = (origin != NULL);
+
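+ /*
+ * Use the BEGIN record's magic to detect whether the stream was
+ * written in the opposite byte order, and start the running checksum.
+ */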
+ if (drc->drc_drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) {
+ drc->drc_byteswap = B_TRUE;
+ (void) fletcher_4_incremental_byteswap(drr_begin,
+ sizeof (dmu_replay_record_t), &drc->drc_cksum);
+ byteswap_record(drr_begin);
+ } else if (drc->drc_drrb->drr_magic == DMU_BACKUP_MAGIC) {
+ (void) fletcher_4_incremental_native(drr_begin,
+ sizeof (dmu_replay_record_t), &drc->drc_cksum);
+ } else {
+ return (SET_ERROR(EINVAL));
+ }
+
+ drc->drc_fp = fp;
+ drc->drc_voff = *voffp;
+ drc->drc_featureflags =
+ DMU_GET_FEATUREFLAGS(drc->drc_drrb->drr_versioninfo);
+
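+ /*
+ * Read the BEGIN record's payload (a packed nvlist, if present) and
+ * the header of the record that follows it.
+ */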
+ uint32_t payloadlen = drc->drc_drr_begin->drr_payloadlen;
+ void *payload = NULL;
+ if (payloadlen != 0)
+ payload = kmem_alloc(payloadlen, KM_SLEEP);
+
+ err = receive_read_payload_and_next_header(drc, payloadlen,
+ payload);
+ if (err != 0) {
+ kmem_free(payload, payloadlen);
+ return (err);
+ }
+ if (payloadlen != 0) {
+ err = nvlist_unpack(payload, payloadlen, &drc->drc_begin_nvl,
+ KM_SLEEP);
+ kmem_free(payload, payloadlen);
+ if (err != 0) {
+ kmem_free(drc->drc_next_rrd,
+ sizeof (*drc->drc_next_rrd));
+ return (err);
+ }
+ }
+
+ if (drc->drc_drrb->drr_flags & DRR_FLAG_SPILL_BLOCK)
+ drc->drc_spill = B_TRUE;
+
+ drba.drba_origin = origin;
+ drba.drba_cookie = drc;
+ drba.drba_cred = CRED();
+ drba.drba_proc = curproc;
+
+ if (drc->drc_featureflags & DMU_BACKUP_FEATURE_RESUMING) {
+ err = dsl_sync_task(tofs,
+ dmu_recv_resume_begin_check, dmu_recv_resume_begin_sync,
+ &drba, 5, ZFS_SPACE_CHECK_NORMAL);
+ } else {
+ /*
+ * For non-raw, non-incremental, non-resuming receives the
+ * user can specify encryption parameters on the command line
+ * with "zfs recv -o". For these receives we create a dcp and
+ * pass it to the sync task. Creating the dcp will implicitly
+ * remove the encryption params from the localprops nvlist,
+ * which avoids errors when trying to set these normally
+ * read-only properties. Any other kind of receive that
+ * attempts to set these properties will fail as a result.
+ */
+ if ((DMU_GET_FEATUREFLAGS(drc->drc_drrb->drr_versioninfo) &
+ DMU_BACKUP_FEATURE_RAW) == 0 &&
+ origin == NULL && drc->drc_drrb->drr_fromguid == 0) {
+ err = dsl_crypto_params_create_nvlist(DCP_CMD_NONE,
+ localprops, hidden_args, &drba.drba_dcp);
+ }
+
+ if (err == 0) {
+ err = dsl_sync_task(tofs,
+ dmu_recv_begin_check, dmu_recv_begin_sync,
+ &drba, 5, ZFS_SPACE_CHECK_NORMAL);
+ dsl_crypto_params_free(drba.drba_dcp, !!err);
+ }
+ }
+
+ if (err != 0) {
+ kmem_free(drc->drc_next_rrd, sizeof (*drc->drc_next_rrd));
+ nvlist_free(drc->drc_begin_nvl);
+ }
+ return (err);
+}
+
+static int
+receive_read(dmu_recv_cookie_t *drc, int len, void *buf)
+{
+ int done = 0;
+
+ /*
+ * The code doesn't rely on this (lengths being multiples of 8). See
+ * comment in dump_bytes.
+ */
+ ASSERT(len % 8 == 0 ||
+ (drc->drc_featureflags & DMU_BACKUP_FEATURE_RAW) != 0);
+
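+ /*
+ * Loop until the full length is read; a single zfs_file_read() call
+ * may return less than requested.
+ */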
+ while (done < len) {
+ ssize_t resid;
+ zfs_file_t *fp = drc->drc_fp;
+ int err = zfs_file_read(fp, (char *)buf + done,
+ len - done, &resid);
+ if (resid == len - done) {
+ /*
+ * Note: ECKSUM or ZFS_ERR_STREAM_TRUNCATED indicates
+ * that the receive was interrupted and can
+ * potentially be resumed.
+ */
+ err = SET_ERROR(ZFS_ERR_STREAM_TRUNCATED);
+ }
+ drc->drc_voff += len - done - resid;
+ done = len - resid;
+ if (err != 0)
+ return (err);
+ }
+
+ drc->drc_bytes_read += len;
+
+ ASSERT3U(done, ==, len);
+ return (0);
+}
+
+static inline uint8_t
+deduce_nblkptr(dmu_object_type_t bonus_type, uint64_t bonus_size)
+{
+ if (bonus_type == DMU_OT_SA) {
+ return (1);
+ } else {
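+ /*
+ * One block pointer, plus however many fit in the bonus space
+ * left unused relative to the old (fixed-size) dnode layout.
+ */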
+ return (1 +
+ ((DN_OLD_MAX_BONUSLEN -
+ MIN(DN_OLD_MAX_BONUSLEN, bonus_size)) >> SPA_BLKPTRSHIFT));
+ }
+}
+
+static void
+save_resume_state(struct receive_writer_arg *rwa,
+ uint64_t object, uint64_t offset, dmu_tx_t *tx)
+{
+ int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
+
+ if (!rwa->resumable)
+ return;
+
+ /*
+ * We use ds_resume_bytes[] != 0 to indicate that we need to
+ * update this on disk, so it must not be 0.
+ */
+ ASSERT(rwa->bytes_read != 0);
+
+ /*
+ * We only resume from write records, which have a valid
+ * (non-meta-dnode) object number.
+ */
+ ASSERT(object != 0);
+
+ /*
+ * For resuming to work correctly, we must receive records in order,
+ * sorted by object,offset. This is checked by the callers, but
+ * assert it here for good measure.
+ */
+ ASSERT3U(object, >=, rwa->os->os_dsl_dataset->ds_resume_object[txgoff]);
+ ASSERT(object != rwa->os->os_dsl_dataset->ds_resume_object[txgoff] ||
+ offset >= rwa->os->os_dsl_dataset->ds_resume_offset[txgoff]);
+ ASSERT3U(rwa->bytes_read, >=,
+ rwa->os->os_dsl_dataset->ds_resume_bytes[txgoff]);
+
+ rwa->os->os_dsl_dataset->ds_resume_object[txgoff] = object;
+ rwa->os->os_dsl_dataset->ds_resume_offset[txgoff] = offset;
+ rwa->os->os_dsl_dataset->ds_resume_bytes[txgoff] = rwa->bytes_read;
+}
+
+static int
+receive_object_is_same_generation(objset_t *os, uint64_t object,
+ dmu_object_type_t old_bonus_type, dmu_object_type_t new_bonus_type,
+ const void *new_bonus, boolean_t *samegenp)
+{
+ zfs_file_info_t zoi;
+ int err;
+
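+ /*
+ * Compare the ZPL generation numbers recorded in the existing
+ * on-disk bonus buffer and the one from the send stream.
+ */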
+ dmu_buf_t *old_bonus_dbuf;
+ err = dmu_bonus_hold(os, object, FTAG, &old_bonus_dbuf);
+ if (err != 0)
+ return (err);
+ err = dmu_get_file_info(os, old_bonus_type, old_bonus_dbuf->db_data,
+ &zoi);
+ dmu_buf_rele(old_bonus_dbuf, FTAG);
+ if (err != 0)
+ return (err);
+ uint64_t old_gen = zoi.zfi_generation;
+
+ err = dmu_get_file_info(os, new_bonus_type, new_bonus, &zoi);
+ if (err != 0)
+ return (err);
+ uint64_t new_gen = zoi.zfi_generation;
+
+ *samegenp = (old_gen == new_gen);
+ return (0);
+}
+
+static int
+receive_handle_existing_object(const struct receive_writer_arg *rwa,
+ const struct drr_object *drro, const dmu_object_info_t *doi,
+ const void *bonus_data,
+ uint64_t *object_to_hold, uint32_t *new_blksz)
+{
+ uint32_t indblksz = drro->drr_indblkshift ?
+ 1ULL << drro->drr_indblkshift : 0;
+ int nblkptr = deduce_nblkptr(drro->drr_bonustype,
+ drro->drr_bonuslen);
+ uint8_t dn_slots = drro->drr_dn_slots != 0 ?
+ drro->drr_dn_slots : DNODE_MIN_SLOTS;
+ boolean_t do_free_range = B_FALSE;
+ int err;
+
+ *object_to_hold = drro->drr_object;
+
+ /* nblkptr should be bounded by the bonus size and type */
+ if (rwa->raw && nblkptr != drro->drr_nblkptr)
+ return (SET_ERROR(EINVAL));
+
+ /*
+ * After the previous send stream, the sending system may
+ * have freed this object, and then happened to re-allocate
+ * this object number in a later txg. In this case, we are
+ * receiving a different logical file, and the block size may
+ * appear to be different. i.e. we may have a different
+ * block size for this object than what the send stream says.
+ * In this case we need to remove the object's contents,
+ * so that its structure can be changed and then its contents
+ * entirely replaced by subsequent WRITE records.
+ *
+ * If this is a -L (--large-block) incremental stream, and
+ * the previous stream was not -L, the block size may appear
+ * to increase. i.e. we may have a smaller block size for
+ * this object than what the send stream says. In this case
+ * we need to keep the object's contents and block size
+ * intact, so that we don't lose parts of the object's
+ * contents that are not changed by this incremental send
+ * stream.
+ *
+ * We can distinguish between the two above cases by using
+ * the ZPL's generation number (see
+ * receive_object_is_same_generation()). However, we only
+ * want to rely on the generation number when absolutely
+ * necessary, because with raw receives, the generation is
+ * encrypted. We also want to minimize dependence on the
+ * ZPL, so that other types of datasets can also be received
+ * (e.g. ZVOLs, although note that ZVOLS currently do not
+ * reallocate their objects or change their structure).
+ * Therefore, we check a number of different cases where we
+ * know it is safe to discard the object's contents, before
+ * using the ZPL's generation number to make the above
+ * distinction.
+ */
+ if (drro->drr_blksz != doi->doi_data_block_size) {
+ if (rwa->raw) {
+ /*
+ * RAW streams always have large blocks, so
+ * we are sure that the data is not needed
+ * due to changing --large-block to be on.
+ * Which is fortunate since the bonus buffer
+ * (which contains the ZPL generation) is
+ * encrypted, and the key might not be
+ * loaded.
+ */
+ do_free_range = B_TRUE;
+ } else if (rwa->full) {
+ /*
+ * This is a full send stream, so it always
+ * replaces what we have. Even if the
+ * generation numbers happen to match, this
+ * can not actually be the same logical file.
+ * This is relevant when receiving a full
+ * send as a clone.
+ */
+ do_free_range = B_TRUE;
+ } else if (drro->drr_type !=
+ DMU_OT_PLAIN_FILE_CONTENTS ||
+ doi->doi_type != DMU_OT_PLAIN_FILE_CONTENTS) {
+ /*
+ * PLAIN_FILE_CONTENTS are the only type of
+ * objects that have ever been stored with
+ * large blocks, so we don't need the special
+ * logic below. ZAP blocks can shrink (when
+ * there's only one block), so we don't want
+ * to hit the error below about block size
+ * only increasing.
+ */
+ do_free_range = B_TRUE;
+ } else if (doi->doi_max_offset <=
+ doi->doi_data_block_size) {
+ /*
+ * There is only one block. We can free it,
+ * because its contents will be replaced by a
+ * WRITE record. This can not be the no-L ->
+ * -L case, because the no-L case would have
+ * resulted in multiple blocks. If we
+ * supported -L -> no-L, it would not be safe
+ * to free the file's contents. Fortunately,
+ * that is not allowed (see
+ * recv_check_large_blocks()).
+ */
+ do_free_range = B_TRUE;
+ } else {
+ boolean_t is_same_gen;
+ err = receive_object_is_same_generation(rwa->os,
+ drro->drr_object, doi->doi_bonus_type,
+ drro->drr_bonustype, bonus_data, &is_same_gen);
+ if (err != 0)
+ return (SET_ERROR(EINVAL));
+
+ if (is_same_gen) {
+ /*
+ * This is the same logical file, and
+ * the block size must be increasing.
+ * It could only decrease if
+ * --large-block was changed to be
+ * off, which is checked in
+ * recv_check_large_blocks().
+ */
+ if (drro->drr_blksz <=
+ doi->doi_data_block_size)
+ return (SET_ERROR(EINVAL));
+ /*
+ * We keep the existing blocksize and
+ * contents.
+ */
+ *new_blksz =
+ doi->doi_data_block_size;
+ } else {
+ do_free_range = B_TRUE;
+ }
+ }
+ }
+
+ /* nblkptr can only decrease if the object was reallocated */
+ if (nblkptr < doi->doi_nblkptr)
+ do_free_range = B_TRUE;
+
+ /* number of slots can only change on reallocation */
+ if (dn_slots != doi->doi_dnodesize >> DNODE_SHIFT)
+ do_free_range = B_TRUE;
+
+ /*
+ * For raw sends we also check a few other fields to
+ * ensure we are preserving the objset structure exactly
+ * as it was on the send side:
+ * - A changed indirect block size
+ * - A smaller nlevels
+ */
+ if (rwa->raw) {
+ if (indblksz != doi->doi_metadata_block_size)
+ do_free_range = B_TRUE;
+ if (drro->drr_nlevels < doi->doi_indirection)
+ do_free_range = B_TRUE;
+ }
+
+ if (do_free_range) {
+ err = dmu_free_long_range(rwa->os, drro->drr_object,
+ 0, DMU_OBJECT_END);
+ if (err != 0)
+ return (SET_ERROR(EINVAL));
+ }
+
+ /*
+ * The dmu does not currently support decreasing nlevels
+ * or changing the number of dnode slots on an object. For
+ * non-raw sends, this does not matter and the new object
+ * can just use the previous one's nlevels. For raw sends,
+ * however, the structure of the received dnode (including
+ * nlevels and dnode slots) must match that of the send
+ * side. Therefore, instead of using dmu_object_reclaim(),
+ * we must free the object completely and call
+ * dmu_object_claim_dnsize() instead.
+ */
+ if ((rwa->raw && drro->drr_nlevels < doi->doi_indirection) ||
+ dn_slots != doi->doi_dnodesize >> DNODE_SHIFT) {
+ err = dmu_free_long_object(rwa->os, drro->drr_object);
+ if (err != 0)
+ return (SET_ERROR(EINVAL));
+
+ txg_wait_synced(dmu_objset_pool(rwa->os), 0);
+ *object_to_hold = DMU_NEW_OBJECT;
+ }
+
+ /*
+ * For raw receives, free everything beyond the new incoming
+ * maxblkid. Normally this would be done with a DRR_FREE
+ * record that would come after this DRR_OBJECT record is
+ * processed. However, for raw receives we manually set the
+ * maxblkid from the drr_maxblkid and so we must first free
+ * everything above that blkid to ensure the DMU is always
+ * consistent with itself. We will never free the first block
+ * of the object here because a maxblkid of 0 could indicate
+ * an object with a single block or one with no blocks. This
+ * free may be skipped when dmu_free_long_range() was called
+ * above since it covers the entire object's contents.
+ */
+ if (rwa->raw && *object_to_hold != DMU_NEW_OBJECT && !do_free_range) {
+ err = dmu_free_long_range(rwa->os, drro->drr_object,
+ (drro->drr_maxblkid + 1) * doi->doi_data_block_size,
+ DMU_OBJECT_END);
+ if (err != 0)
+ return (SET_ERROR(EINVAL));
+ }
+ return (0);
+}
+
+noinline static int
+receive_object(struct receive_writer_arg *rwa, struct drr_object *drro,
+ void *data)
+{
+ dmu_object_info_t doi;
+ dmu_tx_t *tx;
+ int err;
+ uint32_t new_blksz = drro->drr_blksz;
+ uint8_t dn_slots = drro->drr_dn_slots != 0 ?
+ drro->drr_dn_slots : DNODE_MIN_SLOTS;
+
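+ /* Validate the DRR_OBJECT fields against pool and dnode limits. */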
+ if (drro->drr_type == DMU_OT_NONE ||
+ !DMU_OT_IS_VALID(drro->drr_type) ||
+ !DMU_OT_IS_VALID(drro->drr_bonustype) ||
+ drro->drr_checksumtype >= ZIO_CHECKSUM_FUNCTIONS ||
+ drro->drr_compress >= ZIO_COMPRESS_FUNCTIONS ||
+ P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) ||
+ drro->drr_blksz < SPA_MINBLOCKSIZE ||
+ drro->drr_blksz > spa_maxblocksize(dmu_objset_spa(rwa->os)) ||
+ drro->drr_bonuslen >
+ DN_BONUS_SIZE(spa_maxdnodesize(dmu_objset_spa(rwa->os))) ||
+ dn_slots >
+ (spa_maxdnodesize(dmu_objset_spa(rwa->os)) >> DNODE_SHIFT)) {
+ return (SET_ERROR(EINVAL));
+ }
+
+ if (rwa->raw) {
+ /*
+ * We should have received a DRR_OBJECT_RANGE record
+ * containing this block and stored it in rwa.
+ */
+ if (drro->drr_object < rwa->or_firstobj ||
+ drro->drr_object >= rwa->or_firstobj + rwa->or_numslots ||
+ drro->drr_raw_bonuslen < drro->drr_bonuslen ||
+ drro->drr_indblkshift > SPA_MAXBLOCKSHIFT ||
+ drro->drr_nlevels > DN_MAX_LEVELS ||
+ drro->drr_nblkptr > DN_MAX_NBLKPTR ||
+ DN_SLOTS_TO_BONUSLEN(dn_slots) <
+ drro->drr_raw_bonuslen)
+ return (SET_ERROR(EINVAL));
+ } else {
+ /*
+ * The DRR_OBJECT_SPILL flag is valid when the DRR_BEGIN
+ * record indicates this by setting DRR_FLAG_SPILL_BLOCK.
+ */
+ if (((drro->drr_flags & ~(DRR_OBJECT_SPILL))) ||
+ (!rwa->spill && DRR_OBJECT_HAS_SPILL(drro->drr_flags))) {
+ return (SET_ERROR(EINVAL));
+ }
+
+ if (drro->drr_raw_bonuslen != 0 || drro->drr_nblkptr != 0 ||
+ drro->drr_indblkshift != 0 || drro->drr_nlevels != 0) {
+ return (SET_ERROR(EINVAL));
+ }
+ }
+
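+ /*
+ * Look up the object as it currently exists; ENOENT means the slot is
+ * free, EEXIST means it is an interior slot of a multi-slot dnode.
+ */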
+ err = dmu_object_info(rwa->os, drro->drr_object, &doi);
+
+ if (err != 0 && err != ENOENT && err != EEXIST)
+ return (SET_ERROR(EINVAL));
+
+ if (drro->drr_object > rwa->max_object)
+ rwa->max_object = drro->drr_object;
+
+ /*
+ * If we are losing blkptrs or changing the block size this must
+ * be a new file instance. We must clear out the previous file
+ * contents before we can change this type of metadata in the dnode.
+ * Raw receives will also check that the indirect structure of the
+ * dnode hasn't changed.
+ */
+ uint64_t object_to_hold;
+ if (err == 0) {
+ err = receive_handle_existing_object(rwa, drro, &doi, data,
+ &object_to_hold, &new_blksz);
+ } else if (err == EEXIST) {
+ /*
+ * The object requested is currently an interior slot of a
+ * multi-slot dnode. This will be resolved when the next txg
+ * is synced out, since the send stream will have told us
+ * to free this slot when we freed the associated dnode
+ * earlier in the stream.
+ */
+ txg_wait_synced(dmu_objset_pool(rwa->os), 0);
+
+ if (dmu_object_info(rwa->os, drro->drr_object, NULL) != ENOENT)
+ return (SET_ERROR(EINVAL));
+
+ /* object was freed and we are about to allocate a new one */
+ object_to_hold = DMU_NEW_OBJECT;
+ } else {
+ /* object is free and we are about to allocate a new one */
+ object_to_hold = DMU_NEW_OBJECT;
+ }
+
+ /*
+ * If this is a multi-slot dnode there is a chance that this
+ * object will expand into a slot that is already used by
+ * another object from the previous snapshot. We must free
+ * these objects before we attempt to allocate the new dnode.
+ */
+ if (dn_slots > 1) {
+ boolean_t need_sync = B_FALSE;
+
+ for (uint64_t slot = drro->drr_object + 1;
+ slot < drro->drr_object + dn_slots;
+ slot++) {
+ dmu_object_info_t slot_doi;
+
+ err = dmu_object_info(rwa->os, slot, &slot_doi);
+ if (err == ENOENT || err == EEXIST)
+ continue;
+ else if (err != 0)
+ return (err);
+
+ err = dmu_free_long_object(rwa->os, slot);
+ if (err != 0)
+ return (err);
+
+ need_sync = B_TRUE;
+ }
+
+ if (need_sync)
+ txg_wait_synced(dmu_objset_pool(rwa->os), 0);
+ }
+
+ tx = dmu_tx_create(rwa->os);
+ dmu_tx_hold_bonus(tx, object_to_hold);
+ dmu_tx_hold_write(tx, object_to_hold, 0, 0);
+ err = dmu_tx_assign(tx, TXG_WAIT);
+ if (err != 0) {
+ dmu_tx_abort(tx);
+ return (err);
+ }
+
+ if (object_to_hold == DMU_NEW_OBJECT) {
+ /* Currently free, wants to be allocated */
+ err = dmu_object_claim_dnsize(rwa->os, drro->drr_object,
+ drro->drr_type, new_blksz,
+ drro->drr_bonustype, drro->drr_bonuslen,
+ dn_slots << DNODE_SHIFT, tx);
+ } else if (drro->drr_type != doi.doi_type ||
+ new_blksz != doi.doi_data_block_size ||
+ drro->drr_bonustype != doi.doi_bonus_type ||
+ drro->drr_bonuslen != doi.doi_bonus_size) {
+ /* Currently allocated, but with different properties */
+ err = dmu_object_reclaim_dnsize(rwa->os, drro->drr_object,
+ drro->drr_type, new_blksz,
+ drro->drr_bonustype, drro->drr_bonuslen,
+ dn_slots << DNODE_SHIFT, rwa->spill ?
+ DRR_OBJECT_HAS_SPILL(drro->drr_flags) : B_FALSE, tx);
+ } else if (rwa->spill && !DRR_OBJECT_HAS_SPILL(drro->drr_flags)) {
+ /*
+ * Currently allocated, the existing version of this object
+ * may reference a spill block that is no longer allocated
+ * at the source and needs to be freed.
+ */
+ err = dmu_object_rm_spill(rwa->os, drro->drr_object, tx);
+ }
+
+ if (err != 0) {
+ dmu_tx_commit(tx);
+ return (SET_ERROR(EINVAL));
+ }
+
+ if (rwa->or_crypt_params_present) {
+ /*
+ * Set the crypt params for the buffer associated with this
+ * range of dnodes. This causes the blkptr_t to have the
+ * same crypt params (byteorder, salt, iv, mac) as on the
+ * sending side.
+ *
+ * Since we are committing this tx now, it is possible for
+ * the dnode block to end up on-disk with the incorrect MAC,
+ * if subsequent objects in this block are received in a
+ * different txg. However, since the dataset is marked as
+ * inconsistent, no code paths will do a non-raw read (or
+ * decrypt the block / verify the MAC). The receive code and
+ * scrub code can safely do raw reads and verify the
+ * checksum. They don't need to verify the MAC.
+ */
+ dmu_buf_t *db = NULL;
+ uint64_t offset = rwa->or_firstobj * DNODE_MIN_SIZE;
+
+ err = dmu_buf_hold_by_dnode(DMU_META_DNODE(rwa->os),
+ offset, FTAG, &db, DMU_READ_PREFETCH | DMU_READ_NO_DECRYPT);
+ if (err != 0) {
+ dmu_tx_commit(tx);
+ return (SET_ERROR(EINVAL));
+ }
+
+ dmu_buf_set_crypt_params(db, rwa->or_byteorder,
+ rwa->or_salt, rwa->or_iv, rwa->or_mac, tx);
+
+ dmu_buf_rele(db, FTAG);
+
+ rwa->or_crypt_params_present = B_FALSE;
+ }
+
+ dmu_object_set_checksum(rwa->os, drro->drr_object,
+ drro->drr_checksumtype, tx);
+ dmu_object_set_compress(rwa->os, drro->drr_object,
+ drro->drr_compress, tx);
+
+ /* handle more restrictive dnode structuring for raw recvs */
+ if (rwa->raw) {
+ /*
+ * Set the block size, indirect block shift, and nlevels.
+ * This will not fail because we ensured all of the
+ * blocks were freed earlier if this is a new object.
+ * For non-new objects block size and indirect block
+ * shift cannot change and nlevels can only increase.
+ */
+ ASSERT3U(new_blksz, ==, drro->drr_blksz);
+ VERIFY0(dmu_object_set_blocksize(rwa->os, drro->drr_object,
+ drro->drr_blksz, drro->drr_indblkshift, tx));
+ VERIFY0(dmu_object_set_nlevels(rwa->os, drro->drr_object,
+ drro->drr_nlevels, tx));
+
+ /*
+ * Set the maxblkid. This will always succeed because
+ * we freed all blocks beyond the new maxblkid above.
+ */
+ VERIFY0(dmu_object_set_maxblkid(rwa->os, drro->drr_object,
+ drro->drr_maxblkid, tx));
+ }
+
+ if (data != NULL) {
+ dmu_buf_t *db;
+ dnode_t *dn;
+ uint32_t flags = DMU_READ_NO_PREFETCH;
+
+ if (rwa->raw)
+ flags |= DMU_READ_NO_DECRYPT;
+
+ VERIFY0(dnode_hold(rwa->os, drro->drr_object, FTAG, &dn));
+ VERIFY0(dmu_bonus_hold_by_dnode(dn, FTAG, &db, flags));
+
+ dmu_buf_will_dirty(db, tx);
+
+ ASSERT3U(db->db_size, >=, drro->drr_bonuslen);
+ bcopy(data, db->db_data, DRR_OBJECT_PAYLOAD_SIZE(drro));
+
+ /*
+ * Raw bonus buffers have their byteorder determined by the
+ * DRR_OBJECT_RANGE record.
+ */
+ if (rwa->byteswap && !rwa->raw) {
+ dmu_object_byteswap_t byteswap =
+ DMU_OT_BYTESWAP(drro->drr_bonustype);
+ dmu_ot_byteswap[byteswap].ob_func(db->db_data,
+ DRR_OBJECT_PAYLOAD_SIZE(drro));
+ }
+ dmu_buf_rele(db, FTAG);
+ dnode_rele(dn, FTAG);
+ }
+ dmu_tx_commit(tx);
+
+ return (0);
+}
+
+/* ARGSUSED */
+noinline static int
+receive_freeobjects(struct receive_writer_arg *rwa,
+ struct drr_freeobjects *drrfo)
+{
+ uint64_t obj;
+ int next_err = 0;
+
+ if (drrfo->drr_firstobj + drrfo->drr_numobjs < drrfo->drr_firstobj)
+ return (SET_ERROR(EINVAL));
+
+ for (obj = drrfo->drr_firstobj == 0 ? 1 : drrfo->drr_firstobj;
+ obj < drrfo->drr_firstobj + drrfo->drr_numobjs &&
+ obj < DN_MAX_OBJECT && next_err == 0;
+ next_err = dmu_object_next(rwa->os, &obj, FALSE, 0)) {
+ dmu_object_info_t doi;
+ int err;
+
+ err = dmu_object_info(rwa->os, obj, &doi);
+ if (err == ENOENT)
+ continue;
+ else if (err != 0)
+ return (err);
+
+ err = dmu_free_long_object(rwa->os, obj);
+
+ if (err != 0)
+ return (err);
+ }
+ if (next_err != ESRCH)
+ return (next_err);
+ return (0);
+}
+
+/*
+ * Note: if this fails, the caller will clean up any records left on the
+ * rwa->write_batch list.
+ */
+static int
+flush_write_batch_impl(struct receive_writer_arg *rwa)
+{
+ dnode_t *dn;
+ int err;
+
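+ /*
+ * Every record in the batch targets the same object
+ * (rwa->last_object), so one dnode hold and one tx cover
+ * the whole batch.
+ */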
+ if (dnode_hold(rwa->os, rwa->last_object, FTAG, &dn) != 0)
+ return (SET_ERROR(EINVAL));
+
+ struct receive_record_arg *last_rrd = list_tail(&rwa->write_batch);
+ struct drr_write *last_drrw = &last_rrd->header.drr_u.drr_write;
+
+ struct receive_record_arg *first_rrd = list_head(&rwa->write_batch);
+ struct drr_write *first_drrw = &first_rrd->header.drr_u.drr_write;
+
+ ASSERT3U(rwa->last_object, ==, last_drrw->drr_object);
+ ASSERT3U(rwa->last_offset, ==, last_drrw->drr_offset);
+
+ dmu_tx_t *tx = dmu_tx_create(rwa->os);
+ dmu_tx_hold_write_by_dnode(tx, dn, first_drrw->drr_offset,
+ last_drrw->drr_offset - first_drrw->drr_offset +
+ last_drrw->drr_logical_size);
+ err = dmu_tx_assign(tx, TXG_WAIT);
+ if (err != 0) {
+ dmu_tx_abort(tx);
+ dnode_rele(dn, FTAG);
+ return (err);
+ }
+
+ struct receive_record_arg *rrd;
+ while ((rrd = list_head(&rwa->write_batch)) != NULL) {
+ struct drr_write *drrw = &rrd->header.drr_u.drr_write;
+ abd_t *abd = rrd->abd;
+
+ ASSERT3U(drrw->drr_object, ==, rwa->last_object);
+
+ if (drrw->drr_logical_size != dn->dn_datablksz) {
+ /*
+ * The WRITE record is larger than the object's block
+ * size. We must be receiving an incremental
+ * large-block stream into a dataset that previously did
+ * a non-large-block receive. Lightweight writes must
+ * be exactly one block, so we need to decompress the
+ * data (if compressed) and do a normal dmu_write().
+ */
+ ASSERT3U(drrw->drr_logical_size, >, dn->dn_datablksz);
+ if (DRR_WRITE_COMPRESSED(drrw)) {
+ abd_t *decomp_abd =
+ abd_alloc_linear(drrw->drr_logical_size,
+ B_FALSE);
+
+ err = zio_decompress_data(
+ drrw->drr_compressiontype,
+ abd, abd_to_buf(decomp_abd),
+ abd_get_size(abd),
+ abd_get_size(decomp_abd), NULL);
+
+ if (err == 0) {
+ dmu_write_by_dnode(dn,
+ drrw->drr_offset,
+ drrw->drr_logical_size,
+ abd_to_buf(decomp_abd), tx);
+ }
+ abd_free(decomp_abd);
+ } else {
+ dmu_write_by_dnode(dn,
+ drrw->drr_offset,
+ drrw->drr_logical_size,
+ abd_to_buf(abd), tx);
+ }
+ if (err == 0)
+ abd_free(abd);
+ } else {
+ zio_prop_t zp;
+ dmu_write_policy(rwa->os, dn, 0, 0, &zp);
+
+ enum zio_flag zio_flags = 0;
+
+ if (rwa->raw) {
+ zp.zp_encrypt = B_TRUE;
+ zp.zp_compress = drrw->drr_compressiontype;
+ zp.zp_byteorder = ZFS_HOST_BYTEORDER ^
+ !!DRR_IS_RAW_BYTESWAPPED(drrw->drr_flags) ^
+ rwa->byteswap;
+ bcopy(drrw->drr_salt, zp.zp_salt,
+ ZIO_DATA_SALT_LEN);
+ bcopy(drrw->drr_iv, zp.zp_iv,
+ ZIO_DATA_IV_LEN);
+ bcopy(drrw->drr_mac, zp.zp_mac,
+ ZIO_DATA_MAC_LEN);
+ if (DMU_OT_IS_ENCRYPTED(zp.zp_type)) {
+ zp.zp_nopwrite = B_FALSE;
+ zp.zp_copies = MIN(zp.zp_copies,
+ SPA_DVAS_PER_BP - 1);
+ }
+ zio_flags |= ZIO_FLAG_RAW;
+ } else if (DRR_WRITE_COMPRESSED(drrw)) {
+ ASSERT3U(drrw->drr_compressed_size, >, 0);
+ ASSERT3U(drrw->drr_logical_size, >=,
+ drrw->drr_compressed_size);
+ zp.zp_compress = drrw->drr_compressiontype;
+ zio_flags |= ZIO_FLAG_RAW_COMPRESS;
+ } else if (rwa->byteswap) {
+ /*
+ * Note: compressed blocks never need to be
+ * byteswapped, because WRITE records for
+ * metadata blocks are never compressed. The
+ * exception is raw streams, which are written
+ * in the original byteorder, and the byteorder
+ * bit is preserved in the BP by setting
+ * zp_byteorder above.
+ */
+ dmu_object_byteswap_t byteswap =
+ DMU_OT_BYTESWAP(drrw->drr_type);
+ dmu_ot_byteswap[byteswap].ob_func(
+ abd_to_buf(abd),
+ DRR_WRITE_PAYLOAD_SIZE(drrw));
+ }
+
+ /*
+ * Since this data can't be read until the receive
+ * completes, we can do a "lightweight" write for
+ * improved performance.
+ */
+ err = dmu_lightweight_write_by_dnode(dn,
+ drrw->drr_offset, abd, &zp, zio_flags, tx);
+ }
+
+ if (err != 0) {
+ /*
+ * This rrd is left on the list, so the caller will
+ * free it (and the abd).
+ */
+ break;
+ }
+
+ /*
+ * Note: If the receive fails, we want the resume stream to
+ * start with the same record that we last successfully
+ * received (as opposed to the next record), so that we can
+ * verify that we are resuming from the correct location.
+ */
+ save_resume_state(rwa, drrw->drr_object, drrw->drr_offset, tx);
+
+ list_remove(&rwa->write_batch, rrd);
+ kmem_free(rrd, sizeof (*rrd));
+ }
+
+ dmu_tx_commit(tx);
+ dnode_rele(dn, FTAG);
+ return (err);
+}
+
+noinline static int
+flush_write_batch(struct receive_writer_arg *rwa)
+{
+ if (list_is_empty(&rwa->write_batch))
+ return (0);
+ int err = rwa->err;
+ if (err == 0)
+ err = flush_write_batch_impl(rwa);
+ if (err != 0) {
+ struct receive_record_arg *rrd;
+ while ((rrd = list_remove_head(&rwa->write_batch)) != NULL) {
+ abd_free(rrd->abd);
+ kmem_free(rrd, sizeof (*rrd));
+ }
+ }
+ ASSERT(list_is_empty(&rwa->write_batch));
+ return (err);
+}
+
+noinline static int
+receive_process_write_record(struct receive_writer_arg *rwa,
+ struct receive_record_arg *rrd)
+{
+ int err = 0;
+
+ ASSERT3U(rrd->header.drr_type, ==, DRR_WRITE);
+ struct drr_write *drrw = &rrd->header.drr_u.drr_write;
+
+ if (drrw->drr_offset + drrw->drr_logical_size < drrw->drr_offset ||
+ !DMU_OT_IS_VALID(drrw->drr_type))
+ return (SET_ERROR(EINVAL));
+
+ /*
+ * For resuming to work, records must be in increasing order
+ * by (object, offset).
+ */
+ if (drrw->drr_object < rwa->last_object ||
+ (drrw->drr_object == rwa->last_object &&
+ drrw->drr_offset < rwa->last_offset)) {
+ return (SET_ERROR(EINVAL));
+ }
+
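+ /*
+ * Flush the pending batch if this write is for a different object or
+ * falls outside the batch-size window of the first batched write.
+ */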
+ struct receive_record_arg *first_rrd = list_head(&rwa->write_batch);
+ struct drr_write *first_drrw = &first_rrd->header.drr_u.drr_write;
+ uint64_t batch_size =
+ MIN(zfs_recv_write_batch_size, DMU_MAX_ACCESS / 2);
+ if (first_rrd != NULL &&
+ (drrw->drr_object != first_drrw->drr_object ||
+ drrw->drr_offset >= first_drrw->drr_offset + batch_size)) {
+ err = flush_write_batch(rwa);
+ if (err != 0)
+ return (err);
+ }
+
+ rwa->last_object = drrw->drr_object;
+ rwa->last_offset = drrw->drr_offset;
+
+ if (rwa->last_object > rwa->max_object)
+ rwa->max_object = rwa->last_object;
+
+ list_insert_tail(&rwa->write_batch, rrd);
+ /*
+ * Return EAGAIN to indicate that we will use this rrd again,
+ * so the caller should not free it.
+ */
+ return (EAGAIN);
+}
+
+static int
+receive_write_embedded(struct receive_writer_arg *rwa,
+ struct drr_write_embedded *drrwe, void *data)
+{
+ dmu_tx_t *tx;
+ int err;
+
+ if (drrwe->drr_offset + drrwe->drr_length < drrwe->drr_offset)
+ return (SET_ERROR(EINVAL));
+
+ if (drrwe->drr_psize > BPE_PAYLOAD_SIZE)
+ return (SET_ERROR(EINVAL));
+
+ if (drrwe->drr_etype >= NUM_BP_EMBEDDED_TYPES)
+ return (SET_ERROR(EINVAL));
+ if (drrwe->drr_compression >= ZIO_COMPRESS_FUNCTIONS)
+ return (SET_ERROR(EINVAL));
+ if (rwa->raw)
+ return (SET_ERROR(EINVAL));
+
+ if (drrwe->drr_object > rwa->max_object)
+ rwa->max_object = drrwe->drr_object;
+
+ tx = dmu_tx_create(rwa->os);
+
+ dmu_tx_hold_write(tx, drrwe->drr_object,
+ drrwe->drr_offset, drrwe->drr_length);
+ err = dmu_tx_assign(tx, TXG_WAIT);
+ if (err != 0) {
+ dmu_tx_abort(tx);
+ return (err);
+ }
+
+ dmu_write_embedded(rwa->os, drrwe->drr_object,
+ drrwe->drr_offset, data, drrwe->drr_etype,
+ drrwe->drr_compression, drrwe->drr_lsize, drrwe->drr_psize,
+ rwa->byteswap ^ ZFS_HOST_BYTEORDER, tx);
+
+ /* See comment in restore_write. */
+ save_resume_state(rwa, drrwe->drr_object, drrwe->drr_offset, tx);
+ dmu_tx_commit(tx);
+ return (0);
+}
+
+static int
+receive_spill(struct receive_writer_arg *rwa, struct drr_spill *drrs,
+ abd_t *abd)
+{
+ dmu_buf_t *db, *db_spill;
+ int err;
+
+ if (drrs->drr_length < SPA_MINBLOCKSIZE ||
+ drrs->drr_length > spa_maxblocksize(dmu_objset_spa(rwa->os)))
+ return (SET_ERROR(EINVAL));
+
+ /*
+ * This is an unmodified spill block which was added to the stream
+ * to resolve an issue with incorrectly removing spill blocks. It
+ * should be ignored by current versions of the code which support
+ * the DRR_FLAG_SPILL_BLOCK flag.
+ */
+ if (rwa->spill && DRR_SPILL_IS_UNMODIFIED(drrs->drr_flags)) {
+ abd_free(abd);
+ return (0);
+ }
+
+ if (rwa->raw) {
+ if (!DMU_OT_IS_VALID(drrs->drr_type) ||
+ drrs->drr_compressiontype >= ZIO_COMPRESS_FUNCTIONS ||
+ drrs->drr_compressed_size == 0)
+ return (SET_ERROR(EINVAL));
+ }
+
+ if (dmu_object_info(rwa->os, drrs->drr_object, NULL) != 0)
+ return (SET_ERROR(EINVAL));
+
+ if (drrs->drr_object > rwa->max_object)
+ rwa->max_object = drrs->drr_object;
+
+ VERIFY0(dmu_bonus_hold(rwa->os, drrs->drr_object, FTAG, &db));
+ if ((err = dmu_spill_hold_by_bonus(db, DMU_READ_NO_DECRYPT, FTAG,
+ &db_spill)) != 0) {
+ dmu_buf_rele(db, FTAG);
+ return (err);
+ }
+
+ dmu_tx_t *tx = dmu_tx_create(rwa->os);
+
+ dmu_tx_hold_spill(tx, db->db_object);
+
+ err = dmu_tx_assign(tx, TXG_WAIT);
+ if (err != 0) {
+ dmu_buf_rele(db, FTAG);
+ dmu_buf_rele(db_spill, FTAG);
+ dmu_tx_abort(tx);
+ return (err);
+ }
+
+ /*
+ * Spill blocks may both grow and shrink. When a change in size
+ * occurs any existing dbuf must be updated to match the logical
+ * size of the provided arc_buf_t.
+ */
+ if (db_spill->db_size != drrs->drr_length) {
+ dmu_buf_will_fill(db_spill, tx);
+ VERIFY0(dbuf_spill_set_blksz(db_spill,
+ drrs->drr_length, tx));
+ }
+
+ arc_buf_t *abuf;
+ if (rwa->raw) {
+ boolean_t byteorder = ZFS_HOST_BYTEORDER ^
+ !!DRR_IS_RAW_BYTESWAPPED(drrs->drr_flags) ^
+ rwa->byteswap;
+
+ abuf = arc_loan_raw_buf(dmu_objset_spa(rwa->os),
+ drrs->drr_object, byteorder, drrs->drr_salt,
+ drrs->drr_iv, drrs->drr_mac, drrs->drr_type,
+ drrs->drr_compressed_size, drrs->drr_length,
+ drrs->drr_compressiontype, 0);
+ } else {
+ abuf = arc_loan_buf(dmu_objset_spa(rwa->os),
+ DMU_OT_IS_METADATA(drrs->drr_type),
+ drrs->drr_length);
+ if (rwa->byteswap) {
+ dmu_object_byteswap_t byteswap =
+ DMU_OT_BYTESWAP(drrs->drr_type);
+ dmu_ot_byteswap[byteswap].ob_func(abd_to_buf(abd),
+ DRR_SPILL_PAYLOAD_SIZE(drrs));
+ }
+ }
+
+ bcopy(abd_to_buf(abd), abuf->b_data, DRR_SPILL_PAYLOAD_SIZE(drrs));
+ abd_free(abd);
+ dbuf_assign_arcbuf((dmu_buf_impl_t *)db_spill, abuf, tx);
+
+ dmu_buf_rele(db, FTAG);
+ dmu_buf_rele(db_spill, FTAG);
+
+ dmu_tx_commit(tx);
+ return (0);
+}
+
+/* ARGSUSED */
+noinline static int
+receive_free(struct receive_writer_arg *rwa, struct drr_free *drrf)
+{
+ int err;
+
+ if (drrf->drr_length != -1ULL &&
+ drrf->drr_offset + drrf->drr_length < drrf->drr_offset)
+ return (SET_ERROR(EINVAL));
+
+ if (dmu_object_info(rwa->os, drrf->drr_object, NULL) != 0)
+ return (SET_ERROR(EINVAL));
+
+ if (drrf->drr_object > rwa->max_object)
+ rwa->max_object = drrf->drr_object;
+
+ err = dmu_free_long_range(rwa->os, drrf->drr_object,
+ drrf->drr_offset, drrf->drr_length);
+
+ return (err);
+}
+
+static int
+receive_object_range(struct receive_writer_arg *rwa,
+ struct drr_object_range *drror)
+{
+ /*
+ * By default, we assume this block is in our native format
+ * (ZFS_HOST_BYTEORDER). We then take into account whether
+ * the send stream is byteswapped (rwa->byteswap). Finally,
+ * we need to byteswap again if this particular block was
+ * in non-native format on the send side.
+ */
+ boolean_t byteorder = ZFS_HOST_BYTEORDER ^ rwa->byteswap ^
+ !!DRR_IS_RAW_BYTESWAPPED(drror->drr_flags);
+
+ /*
+ * Since dnode block sizes are constant, we should not need to worry
+ * about making sure that the dnode block size is the same on the
+ * sending and receiving sides for the time being. For non-raw sends,
+ * this does not matter (and in fact we do not send a DRR_OBJECT_RANGE
+ * record at all). Raw sends require this record type because the
+ * encryption parameters are used to protect an entire block of bonus
+ * buffers. If the size of dnode blocks ever becomes variable,
+ * handling will need to be added to ensure that dnode block sizes
+ * match on the sending and receiving side.
+ */
+ if (drror->drr_numslots != DNODES_PER_BLOCK ||
+ P2PHASE(drror->drr_firstobj, DNODES_PER_BLOCK) != 0 ||
+ !rwa->raw)
+ return (SET_ERROR(EINVAL));
+
+ if (drror->drr_firstobj > rwa->max_object)
+ rwa->max_object = drror->drr_firstobj;
+
+ /*
+ * The DRR_OBJECT_RANGE handling must be deferred to receive_object()
+ * so that the block of dnodes is not written out when it's empty,
+ * and converted to a HOLE BP.
+ */
+ rwa->or_crypt_params_present = B_TRUE;
+ rwa->or_firstobj = drror->drr_firstobj;
+ rwa->or_numslots = drror->drr_numslots;
+ bcopy(drror->drr_salt, rwa->or_salt, ZIO_DATA_SALT_LEN);
+ bcopy(drror->drr_iv, rwa->or_iv, ZIO_DATA_IV_LEN);
+ bcopy(drror->drr_mac, rwa->or_mac, ZIO_DATA_MAC_LEN);
+ rwa->or_byteorder = byteorder;
+
+ return (0);
+}
+
+/*
+ * Until we have the ability to redact large ranges of data efficiently, we
+ * process these records as frees.
+ */
+/* ARGSUSED */
+noinline static int
+receive_redact(struct receive_writer_arg *rwa, struct drr_redact *drrr)
+{
+ struct drr_free drrf = {0};
+ drrf.drr_length = drrr->drr_length;
+ drrf.drr_object = drrr->drr_object;
+ drrf.drr_offset = drrr->drr_offset;
+ drrf.drr_toguid = drrr->drr_toguid;
+ return (receive_free(rwa, &drrf));
+}
+
+/* used to destroy the drc_ds on error */
+static void
+dmu_recv_cleanup_ds(dmu_recv_cookie_t *drc)
+{
+ dsl_dataset_t *ds = drc->drc_ds;
+ ds_hold_flags_t dsflags;
+
+ dsflags = (drc->drc_raw) ? DS_HOLD_FLAG_NONE : DS_HOLD_FLAG_DECRYPT;
+ /*
+ * Wait for the txg sync before cleaning up the receive. For
+ * resumable receives, this ensures that our resume state has
+ * been written out to disk. For raw receives, this ensures
+ * that the user accounting code will not attempt to do anything
+ * after we stopped receiving the dataset.
+ */
+ txg_wait_synced(ds->ds_dir->dd_pool, 0);
+ ds->ds_objset->os_raw_receive = B_FALSE;
+
+ rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
+ if (drc->drc_resumable && drc->drc_should_save &&
+ !BP_IS_HOLE(dsl_dataset_get_blkptr(ds))) {
+ rrw_exit(&ds->ds_bp_rwlock, FTAG);
+ dsl_dataset_disown(ds, dsflags, dmu_recv_tag);
+ } else {
+ char name[ZFS_MAX_DATASET_NAME_LEN];
+ rrw_exit(&ds->ds_bp_rwlock, FTAG);
+ dsl_dataset_name(ds, name);
+ dsl_dataset_disown(ds, dsflags, dmu_recv_tag);
+ (void) dsl_destroy_head(name);
+ }
+}
+
+static void
+receive_cksum(dmu_recv_cookie_t *drc, int len, void *buf)
+{
+ if (drc->drc_byteswap) {
+ (void) fletcher_4_incremental_byteswap(buf, len,
+ &drc->drc_cksum);
+ } else {
+ (void) fletcher_4_incremental_native(buf, len, &drc->drc_cksum);
+ }
+}
+
+/*
+ * Read the payload into a buffer of size len, and update the current record's
+ * payload field.
+ * Allocate drc->drc_next_rrd and read the next record's header into
+ * drc->drc_next_rrd->header.
+ * Verify checksum of payload and next record.
+ */
+static int
+receive_read_payload_and_next_header(dmu_recv_cookie_t *drc, int len, void *buf)
+{
+ int err;
+
+ if (len != 0) {
+ ASSERT3U(len, <=, SPA_MAXBLOCKSIZE);
+ err = receive_read(drc, len, buf);
+ if (err != 0)
+ return (err);
+ receive_cksum(drc, len, buf);
+
+ /* note: rrd is NULL when reading the begin record's payload */
+ if (drc->drc_rrd != NULL) {
+ drc->drc_rrd->payload = buf;
+ drc->drc_rrd->payload_size = len;
+ drc->drc_rrd->bytes_read = drc->drc_bytes_read;
+ }
+ } else {
+ ASSERT3P(buf, ==, NULL);
+ }
+
+ drc->drc_prev_cksum = drc->drc_cksum;
+
+ drc->drc_next_rrd = kmem_zalloc(sizeof (*drc->drc_next_rrd), KM_SLEEP);
+ err = receive_read(drc, sizeof (drc->drc_next_rrd->header),
+ &drc->drc_next_rrd->header);
+ drc->drc_next_rrd->bytes_read = drc->drc_bytes_read;
+
+ if (err != 0) {
+ kmem_free(drc->drc_next_rrd, sizeof (*drc->drc_next_rrd));
+ drc->drc_next_rrd = NULL;
+ return (err);
+ }
+ if (drc->drc_next_rrd->header.drr_type == DRR_BEGIN) {
+ kmem_free(drc->drc_next_rrd, sizeof (*drc->drc_next_rrd));
+ drc->drc_next_rrd = NULL;
+ return (SET_ERROR(EINVAL));
+ }
+
+ /*
+ * Note: checksum is of everything up to but not including the
+ * checksum itself.
+ */
+ ASSERT3U(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum),
+ ==, sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t));
+ receive_cksum(drc,
+ offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum),
+ &drc->drc_next_rrd->header);
+
+ zio_cksum_t cksum_orig =
+ drc->drc_next_rrd->header.drr_u.drr_checksum.drr_checksum;
+ zio_cksum_t *cksump =
+ &drc->drc_next_rrd->header.drr_u.drr_checksum.drr_checksum;
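+ /*
+ * cksum_orig preserves the on-the-wire checksum bytes so they can be
+ * folded into the running checksum after verification below.
+ */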
+
+ if (drc->drc_byteswap)
+ byteswap_record(&drc->drc_next_rrd->header);
+
+ if ((!ZIO_CHECKSUM_IS_ZERO(cksump)) &&
+ !ZIO_CHECKSUM_EQUAL(drc->drc_cksum, *cksump)) {
+ kmem_free(drc->drc_next_rrd, sizeof (*drc->drc_next_rrd));
+ drc->drc_next_rrd = NULL;
+ return (SET_ERROR(ECKSUM));
+ }
+
+ receive_cksum(drc, sizeof (cksum_orig), &cksum_orig);
+
+ return (0);
+}
+
+/*
+ * Issue the prefetch reads for any necessary indirect blocks.
+ *
+ * We use the object ignore list to tell us whether or not to issue prefetches
+ * for a given object. We do this for both correctness (in case the blocksize
+ * of an object has changed) and performance (if the object doesn't exist, don't
+ * needlessly try to issue prefetches). We also trim the list as we go through
+ * the stream to prevent it from growing to an unbounded size.
+ *
+ * The object numbers within will always be in sorted order, and any write
+ * records we see will also be in sorted order, but they're not sorted with
+ * respect to each other (i.e. we can get several object records before
+ * receiving each object's write records). As a result, once we've reached a
+ * given object number, we can safely remove any reference to lower object
+ * numbers in the ignore list. In practice, we receive up to 32 object records
+ * before receiving write records, so the list can have up to 32 nodes in it.
+ */
+/* ARGSUSED */
+static void
+receive_read_prefetch(dmu_recv_cookie_t *drc, uint64_t object, uint64_t offset,
+ uint64_t length)
+{
+ if (!objlist_exists(drc->drc_ignore_objlist, object)) {
+ dmu_prefetch(drc->drc_os, object, 1, offset, length,
+ ZIO_PRIORITY_SYNC_READ);
+ }
+}
+
+/*
+ * Read records off the stream, issuing any necessary prefetches.
+ */
+static int
+receive_read_record(dmu_recv_cookie_t *drc)
+{
+ int err;
+
+ switch (drc->drc_rrd->header.drr_type) {
+ case DRR_OBJECT:
+ {
+ struct drr_object *drro =
+ &drc->drc_rrd->header.drr_u.drr_object;
+ uint32_t size = DRR_OBJECT_PAYLOAD_SIZE(drro);
+ void *buf = NULL;
+ dmu_object_info_t doi;
+
+ if (size != 0)
+ buf = kmem_zalloc(size, KM_SLEEP);
+
+ err = receive_read_payload_and_next_header(drc, size, buf);
+ if (err != 0) {
+ kmem_free(buf, size);
+ return (err);
+ }
+ err = dmu_object_info(drc->drc_os, drro->drr_object, &doi);
+ /*
+ * See receive_read_prefetch for an explanation why we're
+ * storing this object in the ignore_obj_list.
+ */
+ if (err == ENOENT || err == EEXIST ||
+ (err == 0 && doi.doi_data_block_size != drro->drr_blksz)) {
+ objlist_insert(drc->drc_ignore_objlist,
+ drro->drr_object);
+ err = 0;
+ }
+ return (err);
+ }
+ case DRR_FREEOBJECTS:
+ {
+ err = receive_read_payload_and_next_header(drc, 0, NULL);
+ return (err);
+ }
+ case DRR_WRITE:
+ {
+ struct drr_write *drrw = &drc->drc_rrd->header.drr_u.drr_write;
+ int size = DRR_WRITE_PAYLOAD_SIZE(drrw);
+ abd_t *abd = abd_alloc_linear(size, B_FALSE);
+ err = receive_read_payload_and_next_header(drc, size,
+ abd_to_buf(abd));
+ if (err != 0) {
+ abd_free(abd);
+ return (err);
+ }
+ drc->drc_rrd->abd = abd;
+ receive_read_prefetch(drc, drrw->drr_object, drrw->drr_offset,
+ drrw->drr_logical_size);
+ return (err);
+ }
+ case DRR_WRITE_EMBEDDED:
+ {
+ struct drr_write_embedded *drrwe =
+ &drc->drc_rrd->header.drr_u.drr_write_embedded;
+ uint32_t size = P2ROUNDUP(drrwe->drr_psize, 8);
+ void *buf = kmem_zalloc(size, KM_SLEEP);
+
+ err = receive_read_payload_and_next_header(drc, size, buf);
+ if (err != 0) {
+ kmem_free(buf, size);
+ return (err);
+ }
+
+ receive_read_prefetch(drc, drrwe->drr_object, drrwe->drr_offset,
+ drrwe->drr_length);
+ return (err);
+ }
+ case DRR_FREE:
+ case DRR_REDACT:
+ {
+ /*
+ * It might be beneficial to prefetch indirect blocks here, but
+ * we don't really have the data to decide for sure.
+ */
+ err = receive_read_payload_and_next_header(drc, 0, NULL);
+ return (err);
+ }
+ case DRR_END:
+ {
+ struct drr_end *drre = &drc->drc_rrd->header.drr_u.drr_end;
+ if (!ZIO_CHECKSUM_EQUAL(drc->drc_prev_cksum,
+ drre->drr_checksum))
+ return (SET_ERROR(ECKSUM));
+ return (0);
+ }
+ case DRR_SPILL:
+ {
+ struct drr_spill *drrs = &drc->drc_rrd->header.drr_u.drr_spill;
+ int size = DRR_SPILL_PAYLOAD_SIZE(drrs);
+ abd_t *abd = abd_alloc_linear(size, B_FALSE);
+ err = receive_read_payload_and_next_header(drc, size,
+ abd_to_buf(abd));
+ if (err != 0)
+ abd_free(abd);
+ else
+ drc->drc_rrd->abd = abd;
+ return (err);
+ }
+ case DRR_OBJECT_RANGE:
+ {
+ err = receive_read_payload_and_next_header(drc, 0, NULL);
+ return (err);
+ }
+ default:
+ return (SET_ERROR(EINVAL));
+ }
+}
+
+static void
+dprintf_drr(struct receive_record_arg *rrd, int err)
+{
+#ifdef ZFS_DEBUG
+ switch (rrd->header.drr_type) {
+ case DRR_OBJECT:
+ {
+ struct drr_object *drro = &rrd->header.drr_u.drr_object;
+ dprintf("drr_type = OBJECT obj = %llu type = %u "
+ "bonustype = %u blksz = %u bonuslen = %u cksumtype = %u "
+ "compress = %u dn_slots = %u err = %d\n",
+ drro->drr_object, drro->drr_type, drro->drr_bonustype,
+ drro->drr_blksz, drro->drr_bonuslen,
+ drro->drr_checksumtype, drro->drr_compress,
+ drro->drr_dn_slots, err);
+ break;
+ }
+ case DRR_FREEOBJECTS:
+ {
+ struct drr_freeobjects *drrfo =
+ &rrd->header.drr_u.drr_freeobjects;
+ dprintf("drr_type = FREEOBJECTS firstobj = %llu "
+ "numobjs = %llu err = %d\n",
+ drrfo->drr_firstobj, drrfo->drr_numobjs, err);
+ break;
+ }
+ case DRR_WRITE:
+ {
+ struct drr_write *drrw = &rrd->header.drr_u.drr_write;
+ dprintf("drr_type = WRITE obj = %llu type = %u offset = %llu "
+ "lsize = %llu cksumtype = %u flags = %u "
+ "compress = %u psize = %llu err = %d\n",
+ drrw->drr_object, drrw->drr_type, drrw->drr_offset,
+ drrw->drr_logical_size, drrw->drr_checksumtype,
+ drrw->drr_flags, drrw->drr_compressiontype,
+ drrw->drr_compressed_size, err);
+ break;
+ }
+ case DRR_WRITE_BYREF:
+ {
+ struct drr_write_byref *drrwbr =
+ &rrd->header.drr_u.drr_write_byref;
+ dprintf("drr_type = WRITE_BYREF obj = %llu offset = %llu "
+ "length = %llu toguid = %llx refguid = %llx "
+ "refobject = %llu refoffset = %llu cksumtype = %u "
+ "flags = %u err = %d\n",
+ drrwbr->drr_object, drrwbr->drr_offset,
+ drrwbr->drr_length, drrwbr->drr_toguid,
+ drrwbr->drr_refguid, drrwbr->drr_refobject,
+ drrwbr->drr_refoffset, drrwbr->drr_checksumtype,
+ drrwbr->drr_flags, err);
+ break;
+ }
+ case DRR_WRITE_EMBEDDED:
+ {
+ struct drr_write_embedded *drrwe =
+ &rrd->header.drr_u.drr_write_embedded;
+ dprintf("drr_type = WRITE_EMBEDDED obj = %llu offset = %llu "
+ "length = %llu compress = %u etype = %u lsize = %u "
+ "psize = %u err = %d\n",
+ drrwe->drr_object, drrwe->drr_offset, drrwe->drr_length,
+ drrwe->drr_compression, drrwe->drr_etype,
+ drrwe->drr_lsize, drrwe->drr_psize, err);
+ break;
+ }
+ case DRR_FREE:
+ {
+ struct drr_free *drrf = &rrd->header.drr_u.drr_free;
+ dprintf("drr_type = FREE obj = %llu offset = %llu "
+ "length = %lld err = %d\n",
+ drrf->drr_object, drrf->drr_offset, drrf->drr_length,
+ err);
+ break;
+ }
+ case DRR_SPILL:
+ {
+ struct drr_spill *drrs = &rrd->header.drr_u.drr_spill;
+ dprintf("drr_type = SPILL obj = %llu length = %llu "
+ "err = %d\n", drrs->drr_object, drrs->drr_length, err);
+ break;
+ }
+ case DRR_OBJECT_RANGE:
+ {
+ struct drr_object_range *drror =
+ &rrd->header.drr_u.drr_object_range;
+ dprintf("drr_type = OBJECT_RANGE firstobj = %llu "
+ "numslots = %llu flags = %u err = %d\n",
+ drror->drr_firstobj, drror->drr_numslots,
+ drror->drr_flags, err);
+ break;
+ }
+ default:
+ return;
+ }
+#endif
+}
+
+/*
+ * Commit the records to the pool.
+ */
+static int
+receive_process_record(struct receive_writer_arg *rwa,
+ struct receive_record_arg *rrd)
+{
+ int err;
+
+ /* Processing in order, therefore bytes_read should be increasing. */
+ ASSERT3U(rrd->bytes_read, >=, rwa->bytes_read);
+ rwa->bytes_read = rrd->bytes_read;
+
+ if (rrd->header.drr_type != DRR_WRITE) {
+ err = flush_write_batch(rwa);
+ if (err != 0) {
+ if (rrd->abd != NULL) {
+ abd_free(rrd->abd);
+ rrd->abd = NULL;
+ rrd->payload = NULL;
+ } else if (rrd->payload != NULL) {
+ kmem_free(rrd->payload, rrd->payload_size);
+ rrd->payload = NULL;
+ }
+
+ return (err);
+ }
+ }
+
+ switch (rrd->header.drr_type) {
+ case DRR_OBJECT:
+ {
+ struct drr_object *drro = &rrd->header.drr_u.drr_object;
+ err = receive_object(rwa, drro, rrd->payload);
+ kmem_free(rrd->payload, rrd->payload_size);
+ rrd->payload = NULL;
+ break;
+ }
+ case DRR_FREEOBJECTS:
+ {
+ struct drr_freeobjects *drrfo =
+ &rrd->header.drr_u.drr_freeobjects;
+ err = receive_freeobjects(rwa, drrfo);
+ break;
+ }
+ case DRR_WRITE:
+ {
+ err = receive_process_write_record(rwa, rrd);
+ if (err != EAGAIN) {
+ /*
+ * On success, receive_process_write_record() returns
+ * EAGAIN to indicate that we do not want to free
+ * the rrd or arc_buf.
+ */
+ ASSERT(err != 0);
+ abd_free(rrd->abd);
+ rrd->abd = NULL;
+ }
+ break;
+ }
+ case DRR_WRITE_EMBEDDED:
+ {
+ struct drr_write_embedded *drrwe =
+ &rrd->header.drr_u.drr_write_embedded;
+ err = receive_write_embedded(rwa, drrwe, rrd->payload);
+ kmem_free(rrd->payload, rrd->payload_size);
+ rrd->payload = NULL;
+ break;
+ }
+ case DRR_FREE:
+ {
+ struct drr_free *drrf = &rrd->header.drr_u.drr_free;
+ err = receive_free(rwa, drrf);
+ break;
+ }
+ case DRR_SPILL:
+ {
+ struct drr_spill *drrs = &rrd->header.drr_u.drr_spill;
+ err = receive_spill(rwa, drrs, rrd->abd);
+ if (err != 0)
+ abd_free(rrd->abd);
+ rrd->abd = NULL;
+ rrd->payload = NULL;
+ break;
+ }
+ case DRR_OBJECT_RANGE:
+ {
+ struct drr_object_range *drror =
+ &rrd->header.drr_u.drr_object_range;
+ err = receive_object_range(rwa, drror);
+ break;
+ }
+ case DRR_REDACT:
+ {
+ struct drr_redact *drrr = &rrd->header.drr_u.drr_redact;
+ err = receive_redact(rwa, drrr);
+ break;
+ }
+ default:
+ err = (SET_ERROR(EINVAL));
+ }
+
+ if (err != 0)
+ dprintf_drr(rrd, err);
+
+ return (err);
+}
+
+/*
+ * dmu_recv_stream's worker thread; pull records off the queue, and then call
+ * receive_process_record. When we're done, signal the main thread and exit.
+ */
+static void
+receive_writer_thread(void *arg)
+{
+ struct receive_writer_arg *rwa = arg;
+ struct receive_record_arg *rrd;
+ fstrans_cookie_t cookie = spl_fstrans_mark();
+
+ for (rrd = bqueue_dequeue(&rwa->q); !rrd->eos_marker;
+ rrd = bqueue_dequeue(&rwa->q)) {
+ /*
+ * If there's an error, the main thread will stop putting things
+ * on the queue, but we need to clear everything in it before we
+ * can exit.
+ */
+ int err = 0;
+ if (rwa->err == 0) {
+ err = receive_process_record(rwa, rrd);
+ } else if (rrd->abd != NULL) {
+ abd_free(rrd->abd);
+ rrd->abd = NULL;
+ rrd->payload = NULL;
+ } else if (rrd->payload != NULL) {
+ kmem_free(rrd->payload, rrd->payload_size);
+ rrd->payload = NULL;
+ }
+ /*
+ * EAGAIN indicates that this record has been saved (on
+ * rwa->write_batch), and will be used again, so we don't
+ * free it.
+ */
+ if (err != EAGAIN) {
+ if (rwa->err == 0)
+ rwa->err = err;
+ kmem_free(rrd, sizeof (*rrd));
+ }
+ }
+ kmem_free(rrd, sizeof (*rrd));
+
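+ /* Flush any writes still batched when the end-of-stream marker arrived. */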
+ int err = flush_write_batch(rwa);
+ if (rwa->err == 0)
+ rwa->err = err;
+
+ mutex_enter(&rwa->mutex);
+ rwa->done = B_TRUE;
+ cv_signal(&rwa->cv);
+ mutex_exit(&rwa->mutex);
+ spl_fstrans_unmark(cookie);
+ thread_exit();
+}
+
+static int
+resume_check(dmu_recv_cookie_t *drc, nvlist_t *begin_nvl)
+{
+ uint64_t val;
+ objset_t *mos = dmu_objset_pool(drc->drc_os)->dp_meta_objset;
+ uint64_t dsobj = dmu_objset_id(drc->drc_os);
+ uint64_t resume_obj, resume_off;
+
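+ /*
+ * The resume (object, offset) carried in the stream's BEGIN nvlist must
+ * match the values last recorded in the dataset's resume ZAP entries.
+ */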
+ if (nvlist_lookup_uint64(begin_nvl,
+ "resume_object", &resume_obj) != 0 ||
+ nvlist_lookup_uint64(begin_nvl,
+ "resume_offset", &resume_off) != 0) {
+ return (SET_ERROR(EINVAL));
+ }
+ VERIFY0(zap_lookup(mos, dsobj,
+ DS_FIELD_RESUME_OBJECT, sizeof (val), 1, &val));
+ if (resume_obj != val)
+ return (SET_ERROR(EINVAL));
+ VERIFY0(zap_lookup(mos, dsobj,
+ DS_FIELD_RESUME_OFFSET, sizeof (val), 1, &val));
+ if (resume_off != val)
+ return (SET_ERROR(EINVAL));
+
+ return (0);
+}
+
+/*
+ * Read in the stream's records, one by one, and apply them to the pool. There
+ * are two threads involved; the thread that calls this function will spin up a
+ * worker thread, read the records off the stream one by one, and issue
+ * prefetches for any necessary indirect blocks. It will then push the records
+ * onto an internal blocking queue. The worker thread will pull the records off
+ * the queue, and actually write the data into the DMU. This way, the worker
+ * thread doesn't have to wait for reads to complete, since everything it needs
+ * (the indirect blocks) will be prefetched.
+ *
+ * NB: callers *must* call dmu_recv_end() if this succeeds.
+ */
+int
+dmu_recv_stream(dmu_recv_cookie_t *drc, offset_t *voffp)
+{
+ int err = 0;
+ struct receive_writer_arg *rwa = kmem_zalloc(sizeof (*rwa), KM_SLEEP);
+
+ if (dsl_dataset_is_zapified(drc->drc_ds)) {
+ uint64_t bytes;
+ (void) zap_lookup(drc->drc_ds->ds_dir->dd_pool->dp_meta_objset,
+ drc->drc_ds->ds_object, DS_FIELD_RESUME_BYTES,
+ sizeof (bytes), 1, &bytes);
+ drc->drc_bytes_read += bytes;
+ }
+
+ drc->drc_ignore_objlist = objlist_create();
+
+ /* these were verified in dmu_recv_begin */
+ ASSERT3U(DMU_GET_STREAM_HDRTYPE(drc->drc_drrb->drr_versioninfo), ==,
+ DMU_SUBSTREAM);
+ ASSERT3U(drc->drc_drrb->drr_type, <, DMU_OST_NUMTYPES);
+
+ ASSERT(dsl_dataset_phys(drc->drc_ds)->ds_flags & DS_FLAG_INCONSISTENT);
+ ASSERT0(drc->drc_os->os_encrypted &&
+ (drc->drc_featureflags & DMU_BACKUP_FEATURE_EMBED_DATA));
+
+ /* handle DSL encryption key payload */
+ if (drc->drc_featureflags & DMU_BACKUP_FEATURE_RAW) {
+ nvlist_t *keynvl = NULL;
+
+ ASSERT(drc->drc_os->os_encrypted);
+ ASSERT(drc->drc_raw);
+
+ err = nvlist_lookup_nvlist(drc->drc_begin_nvl, "crypt_keydata",
+ &keynvl);
+ if (err != 0)
+ goto out;
+
+ /*
+ * If this is a new dataset we set the key immediately.
+ * Otherwise we don't want to change the key until we
+ * are sure the rest of the receive succeeded so we stash
+ * the keynvl away until then.
+ */
+ err = dsl_crypto_recv_raw(spa_name(drc->drc_os->os_spa),
+ drc->drc_ds->ds_object, drc->drc_fromsnapobj,
+ drc->drc_drrb->drr_type, keynvl, drc->drc_newfs);
+ if (err != 0)
+ goto out;
+
+ /* see comment in dmu_recv_end_sync() */
+ drc->drc_ivset_guid = 0;
+ (void) nvlist_lookup_uint64(keynvl, "to_ivset_guid",
+ &drc->drc_ivset_guid);
+
+ if (!drc->drc_newfs)
+ drc->drc_keynvl = fnvlist_dup(keynvl);
+ }
+
+ if (drc->drc_featureflags & DMU_BACKUP_FEATURE_RESUMING) {
+ err = resume_check(drc, drc->drc_begin_nvl);
+ if (err != 0)
+ goto out;
+ }
+
+ /*
+ * If we failed before this point we will clean up any new resume
+ * state that was created. Now that we've gotten past the initial
+ * checks we are ok to retain that resume state.
+ */
+ drc->drc_should_save = B_TRUE;
+
+ (void) bqueue_init(&rwa->q, zfs_recv_queue_ff,
+ MAX(zfs_recv_queue_length, 2 * zfs_max_recordsize),
+ offsetof(struct receive_record_arg, node));
+ cv_init(&rwa->cv, NULL, CV_DEFAULT, NULL);
+ mutex_init(&rwa->mutex, NULL, MUTEX_DEFAULT, NULL);
+ rwa->os = drc->drc_os;
+ rwa->byteswap = drc->drc_byteswap;
+ rwa->resumable = drc->drc_resumable;
+ rwa->raw = drc->drc_raw;
+ rwa->spill = drc->drc_spill;
+ rwa->full = (drc->drc_drr_begin->drr_u.drr_begin.drr_fromguid == 0);
+ rwa->os->os_raw_receive = drc->drc_raw;
+ list_create(&rwa->write_batch, sizeof (struct receive_record_arg),
+ offsetof(struct receive_record_arg, node.bqn_node));
+
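+ /* Start the writer thread that drains rwa->q and applies the records. */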
+ (void) thread_create(NULL, 0, receive_writer_thread, rwa, 0, curproc,
+ TS_RUN, minclsyspri);
+ /*
+ * We're reading rwa->err without locks, which is safe since we are the
+ * only reader, and the worker thread is the only writer. It's ok if we
+ * miss a write for an iteration or two of the loop, since the writer
+ * thread will keep freeing records we send it until we send it an eos
+ * marker.
+ *
+ * We can leave this loop in 3 ways: First, if rwa->err is
+ * non-zero. In that case, the writer thread will free the rrd we just
+ * pushed. Second, if we're interrupted; in that case, either it's the
+ * first loop and drc->drc_rrd was never allocated, or it's later, and
+ * drc->drc_rrd has been handed off to the writer thread who will free
+ * it. Finally, if receive_read_record fails or we're at the end of the
+ * stream, then we free drc->drc_rrd and exit.
+ */
+ while (rwa->err == 0) {
+ if (issig(JUSTLOOKING) && issig(FORREAL)) {
+ err = SET_ERROR(EINTR);
+ break;
+ }
+
+ ASSERT3P(drc->drc_rrd, ==, NULL);
+ drc->drc_rrd = drc->drc_next_rrd;
+ drc->drc_next_rrd = NULL;
+ /* Allocates and loads header into drc->drc_next_rrd */
+ err = receive_read_record(drc);
+
+ if (drc->drc_rrd->header.drr_type == DRR_END || err != 0) {
+ kmem_free(drc->drc_rrd, sizeof (*drc->drc_rrd));
+ drc->drc_rrd = NULL;
+ break;
+ }
+
+ bqueue_enqueue(&rwa->q, drc->drc_rrd,
+ sizeof (struct receive_record_arg) +
+ drc->drc_rrd->payload_size);
+ drc->drc_rrd = NULL;
+ }
+
+ ASSERT3P(drc->drc_rrd, ==, NULL);
+ drc->drc_rrd = kmem_zalloc(sizeof (*drc->drc_rrd), KM_SLEEP);
+ drc->drc_rrd->eos_marker = B_TRUE;
+ bqueue_enqueue_flush(&rwa->q, drc->drc_rrd, 1);
+
+ mutex_enter(&rwa->mutex);
+ while (!rwa->done) {
+ /*
+ * We need to use cv_wait_sig() so that any process that may
+ * be sleeping here can still fork.
+ */
+ (void) cv_wait_sig(&rwa->cv, &rwa->mutex);
+ }
+ mutex_exit(&rwa->mutex);
+
+ /*
+ * If we are receiving a full stream as a clone, all object IDs which
+ * are greater than the maximum ID referenced in the stream are
+ * by definition unused and must be freed.
+ */
+ if (drc->drc_clone && drc->drc_drrb->drr_fromguid == 0) {
+ uint64_t obj = rwa->max_object + 1;
+ int free_err = 0;
+ int next_err = 0;
+
+ while (next_err == 0) {
+ free_err = dmu_free_long_object(rwa->os, obj);
+ if (free_err != 0 && free_err != ENOENT)
+ break;
+
+ next_err = dmu_object_next(rwa->os, &obj, FALSE, 0);
+ }
+
+ if (err == 0) {
+ if (free_err != 0 && free_err != ENOENT)
+ err = free_err;
+ else if (next_err != ESRCH)
+ err = next_err;
+ }
+ }
+
+ cv_destroy(&rwa->cv);
+ mutex_destroy(&rwa->mutex);
+ bqueue_destroy(&rwa->q);
+ list_destroy(&rwa->write_batch);
+ if (err == 0)
+ err = rwa->err;
+
+out:
+ /*
+ * If we hit an error before we started the receive_writer_thread
+ * we need to clean up the next_rrd we created by processing the
+ * DRR_BEGIN record.
+ */
+ if (drc->drc_next_rrd != NULL)
+ kmem_free(drc->drc_next_rrd, sizeof (*drc->drc_next_rrd));
+
+ /*
+ * The objset will be invalidated by dmu_recv_end() when we do
+ * dsl_dataset_clone_swap_sync_impl().
+ */
+ drc->drc_os = NULL;
+
+ kmem_free(rwa, sizeof (*rwa));
+ nvlist_free(drc->drc_begin_nvl);
+
+ if (err != 0) {
+ /*
+ * Clean up references. If receive is not resumable,
+ * destroy what we created, so we don't leave it in
+ * an inconsistent state.
+ */
+ dmu_recv_cleanup_ds(drc);
+ nvlist_free(drc->drc_keynvl);
+ }
+
+ objlist_destroy(drc->drc_ignore_objlist);
+ drc->drc_ignore_objlist = NULL;
+ *voffp = drc->drc_voff;
+ return (err);
+}
+
+static int
+dmu_recv_end_check(void *arg, dmu_tx_t *tx)
+{
+ dmu_recv_cookie_t *drc = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ int error;
+
+ ASSERT3P(drc->drc_ds->ds_owner, ==, dmu_recv_tag);
+
+ if (!drc->drc_newfs) {
+ dsl_dataset_t *origin_head;
+
+ error = dsl_dataset_hold(dp, drc->drc_tofs, FTAG, &origin_head);
+ if (error != 0)
+ return (error);
+ if (drc->drc_force) {
+ /*
+ * We will destroy any snapshots in tofs (i.e. before
+ * origin_head) that are after the origin (which is
+ * the snap before drc_ds, because drc_ds can not
+ * have any snaps of its own).
+ */
+ uint64_t obj;
+
+ obj = dsl_dataset_phys(origin_head)->ds_prev_snap_obj;
+ while (obj !=
+ dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj) {
+ dsl_dataset_t *snap;
+ error = dsl_dataset_hold_obj(dp, obj, FTAG,
+ &snap);
+ if (error != 0)
+ break;
+ if (snap->ds_dir != origin_head->ds_dir)
+ error = SET_ERROR(EINVAL);
+ if (error == 0) {
+ error = dsl_destroy_snapshot_check_impl(
+ snap, B_FALSE);
+ }
+ obj = dsl_dataset_phys(snap)->ds_prev_snap_obj;
+ dsl_dataset_rele(snap, FTAG);
+ if (error != 0)
+ break;
+ }
+ if (error != 0) {
+ dsl_dataset_rele(origin_head, FTAG);
+ return (error);
+ }
+ }
+ if (drc->drc_keynvl != NULL) {
+ error = dsl_crypto_recv_raw_key_check(drc->drc_ds,
+ drc->drc_keynvl, tx);
+ if (error != 0) {
+ dsl_dataset_rele(origin_head, FTAG);
+ return (error);
+ }
+ }
+
+ error = dsl_dataset_clone_swap_check_impl(drc->drc_ds,
+ origin_head, drc->drc_force, drc->drc_owner, tx);
+ if (error != 0) {
+ dsl_dataset_rele(origin_head, FTAG);
+ return (error);
+ }
+ error = dsl_dataset_snapshot_check_impl(origin_head,
+ drc->drc_tosnap, tx, B_TRUE, 1,
+ drc->drc_cred, drc->drc_proc);
+ dsl_dataset_rele(origin_head, FTAG);
+ if (error != 0)
+ return (error);
+
+ error = dsl_destroy_head_check_impl(drc->drc_ds, 1);
+ } else {
+ error = dsl_dataset_snapshot_check_impl(drc->drc_ds,
+ drc->drc_tosnap, tx, B_TRUE, 1,
+ drc->drc_cred, drc->drc_proc);
+ }
+ return (error);
+}
+
+static void
+dmu_recv_end_sync(void *arg, dmu_tx_t *tx)
+{
+ dmu_recv_cookie_t *drc = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ boolean_t encrypted = drc->drc_ds->ds_dir->dd_crypto_obj != 0;
+ uint64_t newsnapobj;
+
+ spa_history_log_internal_ds(drc->drc_ds, "finish receiving",
+ tx, "snap=%s", drc->drc_tosnap);
+ drc->drc_ds->ds_objset->os_raw_receive = B_FALSE;
+
+ if (!drc->drc_newfs) {
+ dsl_dataset_t *origin_head;
+
+ VERIFY0(dsl_dataset_hold(dp, drc->drc_tofs, FTAG,
+ &origin_head));
+
+ if (drc->drc_force) {
+ /*
+ * Destroy any snapshots of drc_tofs (origin_head)
+ * after the origin (the snap before drc_ds).
+ */
+ uint64_t obj;
+
+ obj = dsl_dataset_phys(origin_head)->ds_prev_snap_obj;
+ while (obj !=
+ dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj) {
+ dsl_dataset_t *snap;
+ VERIFY0(dsl_dataset_hold_obj(dp, obj, FTAG,
+ &snap));
+ ASSERT3P(snap->ds_dir, ==, origin_head->ds_dir);
+ obj = dsl_dataset_phys(snap)->ds_prev_snap_obj;
+ dsl_destroy_snapshot_sync_impl(snap,
+ B_FALSE, tx);
+ dsl_dataset_rele(snap, FTAG);
+ }
+ }
+ if (drc->drc_keynvl != NULL) {
+ dsl_crypto_recv_raw_key_sync(drc->drc_ds,
+ drc->drc_keynvl, tx);
+ nvlist_free(drc->drc_keynvl);
+ drc->drc_keynvl = NULL;
+ }
+
+ VERIFY3P(drc->drc_ds->ds_prev, ==,
+ origin_head->ds_prev);
+
+ dsl_dataset_clone_swap_sync_impl(drc->drc_ds,
+ origin_head, tx);
+ /*
+ * The objset was evicted by dsl_dataset_clone_swap_sync_impl,
+ * so drc_os is no longer valid.
+ */
+ drc->drc_os = NULL;
+
+ dsl_dataset_snapshot_sync_impl(origin_head,
+ drc->drc_tosnap, tx);
+
+ /* set snapshot's creation time and guid */
+ dmu_buf_will_dirty(origin_head->ds_prev->ds_dbuf, tx);
+ dsl_dataset_phys(origin_head->ds_prev)->ds_creation_time =
+ drc->drc_drrb->drr_creation_time;
+ dsl_dataset_phys(origin_head->ds_prev)->ds_guid =
+ drc->drc_drrb->drr_toguid;
+ dsl_dataset_phys(origin_head->ds_prev)->ds_flags &=
+ ~DS_FLAG_INCONSISTENT;
+
+ dmu_buf_will_dirty(origin_head->ds_dbuf, tx);
+ dsl_dataset_phys(origin_head)->ds_flags &=
+ ~DS_FLAG_INCONSISTENT;
+
+ newsnapobj =
+ dsl_dataset_phys(origin_head)->ds_prev_snap_obj;
+
+ dsl_dataset_rele(origin_head, FTAG);
+ dsl_destroy_head_sync_impl(drc->drc_ds, tx);
+
+ if (drc->drc_owner != NULL)
+ VERIFY3P(origin_head->ds_owner, ==, drc->drc_owner);
+ } else {
+ dsl_dataset_t *ds = drc->drc_ds;
+
+ dsl_dataset_snapshot_sync_impl(ds, drc->drc_tosnap, tx);
+
+ /* set snapshot's creation time and guid */
+ dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
+ dsl_dataset_phys(ds->ds_prev)->ds_creation_time =
+ drc->drc_drrb->drr_creation_time;
+ dsl_dataset_phys(ds->ds_prev)->ds_guid =
+ drc->drc_drrb->drr_toguid;
+ dsl_dataset_phys(ds->ds_prev)->ds_flags &=
+ ~DS_FLAG_INCONSISTENT;
+
+ dmu_buf_will_dirty(ds->ds_dbuf, tx);
+ dsl_dataset_phys(ds)->ds_flags &= ~DS_FLAG_INCONSISTENT;
+ if (dsl_dataset_has_resume_receive_state(ds)) {
+ (void) zap_remove(dp->dp_meta_objset, ds->ds_object,
+ DS_FIELD_RESUME_FROMGUID, tx);
+ (void) zap_remove(dp->dp_meta_objset, ds->ds_object,
+ DS_FIELD_RESUME_OBJECT, tx);
+ (void) zap_remove(dp->dp_meta_objset, ds->ds_object,
+ DS_FIELD_RESUME_OFFSET, tx);
+ (void) zap_remove(dp->dp_meta_objset, ds->ds_object,
+ DS_FIELD_RESUME_BYTES, tx);
+ (void) zap_remove(dp->dp_meta_objset, ds->ds_object,
+ DS_FIELD_RESUME_TOGUID, tx);
+ (void) zap_remove(dp->dp_meta_objset, ds->ds_object,
+ DS_FIELD_RESUME_TONAME, tx);
+ (void) zap_remove(dp->dp_meta_objset, ds->ds_object,
+ DS_FIELD_RESUME_REDACT_BOOKMARK_SNAPS, tx);
+ }
+ newsnapobj =
+ dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj;
+ }
+
+ /*
+ * If this is a raw receive, the crypt_keydata nvlist will include
+ * a to_ivset_guid for us to set on the new snapshot. This value
+ * will override the value generated by the snapshot code. However,
+	 * this value may not be present, because older implementations of
+	 * the raw send code did not include it. Such streams can still be
+	 * received if the zfs_disable_ivset_guid_check tunable is set, in
+	 * which case we keep the newly-generated value.
+ */
+ if (drc->drc_raw && drc->drc_ivset_guid != 0) {
+ dmu_object_zapify(dp->dp_meta_objset, newsnapobj,
+ DMU_OT_DSL_DATASET, tx);
+ VERIFY0(zap_update(dp->dp_meta_objset, newsnapobj,
+ DS_FIELD_IVSET_GUID, sizeof (uint64_t), 1,
+ &drc->drc_ivset_guid, tx));
+ }
+
+ /*
+ * Release the hold from dmu_recv_begin. This must be done before
+ * we return to open context, so that when we free the dataset's dnode
+ * we can evict its bonus buffer. Since the dataset may be destroyed
+ * at this point (and therefore won't have a valid pointer to the spa)
+ * we release the key mapping manually here while we do have a valid
+ * pointer, if it exists.
+ */
+ if (!drc->drc_raw && encrypted) {
+ (void) spa_keystore_remove_mapping(dmu_tx_pool(tx)->dp_spa,
+ drc->drc_ds->ds_object, drc->drc_ds);
+ }
+ dsl_dataset_disown(drc->drc_ds, 0, dmu_recv_tag);
+ drc->drc_ds = NULL;
+}
+
+static int dmu_recv_end_modified_blocks = 3;
+
+static int
+dmu_recv_existing_end(dmu_recv_cookie_t *drc)
+{
+#ifdef _KERNEL
+ /*
+ * We will be destroying the ds; make sure its origin is unmounted if
+ * necessary.
+ */
+ char name[ZFS_MAX_DATASET_NAME_LEN];
+ dsl_dataset_name(drc->drc_ds, name);
+ zfs_destroy_unmount_origin(name);
+#endif
+
+ return (dsl_sync_task(drc->drc_tofs,
+ dmu_recv_end_check, dmu_recv_end_sync, drc,
+ dmu_recv_end_modified_blocks, ZFS_SPACE_CHECK_NORMAL));
+}
+
+static int
+dmu_recv_new_end(dmu_recv_cookie_t *drc)
+{
+ return (dsl_sync_task(drc->drc_tofs,
+ dmu_recv_end_check, dmu_recv_end_sync, drc,
+ dmu_recv_end_modified_blocks, ZFS_SPACE_CHECK_NORMAL));
+}
+
+int
+dmu_recv_end(dmu_recv_cookie_t *drc, void *owner)
+{
+ int error;
+
+ drc->drc_owner = owner;
+
+ if (drc->drc_newfs)
+ error = dmu_recv_new_end(drc);
+ else
+ error = dmu_recv_existing_end(drc);
+
+ if (error != 0) {
+ dmu_recv_cleanup_ds(drc);
+ nvlist_free(drc->drc_keynvl);
+ } else {
+ if (drc->drc_newfs) {
+ zvol_create_minor(drc->drc_tofs);
+ }
+ char *snapname = kmem_asprintf("%s@%s",
+ drc->drc_tofs, drc->drc_tosnap);
+ zvol_create_minor(snapname);
+ kmem_strfree(snapname);
+ }
+ return (error);
+}
+
+/*
+ * Return TRUE if this objset is currently being received into.
+ */
+boolean_t
+dmu_objset_is_receiving(objset_t *os)
+{
+ return (os->os_dsl_dataset != NULL &&
+ os->os_dsl_dataset->ds_owner == dmu_recv_tag);
+}
+
+/* BEGIN CSTYLED */
+ZFS_MODULE_PARAM(zfs_recv, zfs_recv_, queue_length, INT, ZMOD_RW,
+ "Maximum receive queue length");
+
+ZFS_MODULE_PARAM(zfs_recv, zfs_recv_, queue_ff, INT, ZMOD_RW,
+ "Receive queue fill fraction");
+
+ZFS_MODULE_PARAM(zfs_recv, zfs_recv_, write_batch_size, INT, ZMOD_RW,
+	"Maximum number of writes to batch into one transaction");
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/zfs/dmu_redact.c b/sys/contrib/openzfs/module/zfs/dmu_redact.c
new file mode 100644
index 000000000000..62c7d01d4bd2
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/dmu_redact.c
@@ -0,0 +1,1199 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2017, 2018 by Delphix. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/txg.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_traverse.h>
+#include <sys/dmu_redact.h>
+#include <sys/bqueue.h>
+#include <sys/objlist.h>
+#include <sys/dmu_tx.h>
+#ifdef _KERNEL
+#include <sys/zfs_vfsops.h>
+#include <sys/zap.h>
+#include <sys/zfs_znode.h>
+#endif
+
+/*
+ * This controls the number of entries in the buffer the redaction_list_update
+ * synctask uses to buffer writes to the redaction list.
+ */
+int redact_sync_bufsize = 1024;
+
+/*
+ * Controls how often to update the redaction list when creating a redaction
+ * list.
+ */
+uint64_t redaction_list_update_interval_ns = 1000 * 1000 * 1000ULL; /* NS */
+
+/*
+ * This tunable controls the length of the queues that zfs redact worker threads
+ * use to communicate. If the dmu_redact_snap thread is blocking on these
+ * queues, this variable may need to be increased. If there is a significant
+ * slowdown at the start of a redact operation as these threads consume all the
+ * available IO resources, or the queues are consuming too much memory, this
+ * variable may need to be decreased.
+ */
+int zfs_redact_queue_length = 1024 * 1024;
+
+/*
+ * This tunable controls the fill fraction of the queues used by zfs redact.
+ * The fill fraction controls the frequency with which threads have to be
+ * cv_signaled. If a lot of cpu time is being spent on cv_signal, then this
+ * should be tuned down. If the queues empty before the signaled thread can
+ * catch up, then this should be tuned up.
+ */
+uint64_t zfs_redact_queue_ff = 20;
+
+struct redact_record {
+ bqueue_node_t ln;
+ boolean_t eos_marker; /* Marks the end of the stream */
+ uint64_t start_object;
+ uint64_t start_blkid;
+ uint64_t end_object;
+ uint64_t end_blkid;
+ uint8_t indblkshift;
+ uint32_t datablksz;
+};
+
+struct redact_thread_arg {
+ bqueue_t q;
+ objset_t *os; /* Objset to traverse */
+ dsl_dataset_t *ds; /* Dataset to traverse */
+ struct redact_record *current_record;
+ int error_code;
+ boolean_t cancel;
+ zbookmark_phys_t resume;
+ objlist_t *deleted_objs;
+ uint64_t *num_blocks_visited;
+ uint64_t ignore_object; /* ignore further callbacks on this */
+ uint64_t txg; /* txg to traverse since */
+};
+
+/*
+ * The redaction node is a wrapper around the redaction record that is used
+ * by the redaction merging thread to sort the records and determine overlaps.
+ *
+ * It contains two nodes; one sorts the records by their start_zb, and the other
+ * sorts the records by their end_zb.
+ */
+struct redact_node {
+ avl_node_t avl_node_start;
+ avl_node_t avl_node_end;
+ struct redact_record *record;
+ struct redact_thread_arg *rt_arg;
+ uint32_t thread_num;
+};
+
+struct merge_data {
+ list_t md_redact_block_pending;
+ redact_block_phys_t md_coalesce_block;
+ uint64_t md_last_time;
+ redact_block_phys_t md_furthest[TXG_SIZE];
+ /* Lists of struct redact_block_list_node. */
+ list_t md_blocks[TXG_SIZE];
+ boolean_t md_synctask_txg[TXG_SIZE];
+ uint64_t md_latest_synctask_txg;
+ redaction_list_t *md_redaction_list;
+};
+
+/*
+ * A wrapper around struct redact_block so it can be stored in a list_t.
+ */
+struct redact_block_list_node {
+ redact_block_phys_t block;
+ list_node_t node;
+};
+
+/*
+ * We've found a new redaction candidate. In order to improve performance, we
+ * coalesce these blocks when they're adjacent to each other. This function
+ * handles that. If the new candidate block range is immediately after the
+ * range we're building, coalesce it into the range we're building. Otherwise,
+ * put the record we're building on the queue, and update the build pointer to
+ * point to the new record.
+ */
+static void
+record_merge_enqueue(bqueue_t *q, struct redact_record **build,
+ struct redact_record *new)
+{
+ if (new->eos_marker) {
+ if (*build != NULL)
+			bqueue_enqueue(q, *build, sizeof (**build));
+ bqueue_enqueue_flush(q, new, sizeof (*new));
+ return;
+ }
+ if (*build == NULL) {
+ *build = new;
+ return;
+ }
+ struct redact_record *curbuild = *build;
+ if ((curbuild->end_object == new->start_object &&
+ curbuild->end_blkid + 1 == new->start_blkid &&
+ curbuild->end_blkid != UINT64_MAX) ||
+ (curbuild->end_object + 1 == new->start_object &&
+ curbuild->end_blkid == UINT64_MAX && new->start_blkid == 0)) {
+ curbuild->end_object = new->end_object;
+ curbuild->end_blkid = new->end_blkid;
+ kmem_free(new, sizeof (*new));
+ } else {
+ bqueue_enqueue(q, curbuild, sizeof (*curbuild));
+ *build = new;
+ }
+}
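+
+/*
+ * A minimal illustrative sketch of the adjacency test above; the example_*
+ * name is hypothetical and is not used elsewhere in this file. Two ranges
+ * coalesce when the new one begins exactly one block after the current one
+ * ends, including the case where the current range ends at the last block
+ * of an object and the new one begins at block 0 of the next object.
+ */
+static inline boolean_t
+example_ranges_adjacent(const struct redact_record *cur,
+    const struct redact_record *next)
+{
+	return ((cur->end_object == next->start_object &&
+	    cur->end_blkid + 1 == next->start_blkid &&
+	    cur->end_blkid != UINT64_MAX) ||
+	    (cur->end_object + 1 == next->start_object &&
+	    cur->end_blkid == UINT64_MAX && next->start_blkid == 0));
+}
+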
+#ifdef _KERNEL
+struct objnode {
+ avl_node_t node;
+ uint64_t obj;
+};
+
+static int
+objnode_compare(const void *o1, const void *o2)
+{
+ const struct objnode *obj1 = o1;
+ const struct objnode *obj2 = o2;
+ if (obj1->obj < obj2->obj)
+ return (-1);
+ if (obj1->obj > obj2->obj)
+ return (1);
+ return (0);
+}
+
+
+static objlist_t *
+zfs_get_deleteq(objset_t *os)
+{
+ objlist_t *deleteq_objlist = objlist_create();
+ uint64_t deleteq_obj;
+ zap_cursor_t zc;
+ zap_attribute_t za;
+ dmu_object_info_t doi;
+
+ ASSERT3U(os->os_phys->os_type, ==, DMU_OST_ZFS);
+ VERIFY0(dmu_object_info(os, MASTER_NODE_OBJ, &doi));
+ ASSERT3U(doi.doi_type, ==, DMU_OT_MASTER_NODE);
+
+ VERIFY0(zap_lookup(os, MASTER_NODE_OBJ,
+ ZFS_UNLINKED_SET, sizeof (uint64_t), 1, &deleteq_obj));
+
+ /*
+ * In order to insert objects into the objlist, they must be in sorted
+ * order. We don't know what order we'll get them out of the ZAP in, so
+ * we insert them into and remove them from an avl_tree_t to sort them.
+ */
+ avl_tree_t at;
+ avl_create(&at, objnode_compare, sizeof (struct objnode),
+ offsetof(struct objnode, node));
+
+ for (zap_cursor_init(&zc, os, deleteq_obj);
+ zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) {
+ struct objnode *obj = kmem_zalloc(sizeof (*obj), KM_SLEEP);
+ obj->obj = za.za_first_integer;
+ avl_add(&at, obj);
+ }
+ zap_cursor_fini(&zc);
+
+ struct objnode *next, *found = avl_first(&at);
+ while (found != NULL) {
+ next = AVL_NEXT(&at, found);
+ objlist_insert(deleteq_objlist, found->obj);
+ found = next;
+ }
+
+ void *cookie = NULL;
+ while ((found = avl_destroy_nodes(&at, &cookie)) != NULL)
+ kmem_free(found, sizeof (*found));
+ avl_destroy(&at);
+ return (deleteq_objlist);
+}
+#endif
+
+/*
+ * This is the callback function to traverse_dataset for the redaction threads
+ * for dmu_redact_snap. This thread is responsible for creating redaction
+ * records for all the data that is modified by the snapshots we're redacting
+ * with respect to. Redaction records represent ranges of data that have been
+ * modified by one of the redaction snapshots, and are stored in the
+ * redact_record struct. We need to create redaction records for three
+ * cases:
+ *
+ * First, if there's a normal write, we need to create a redaction record for
+ * that block.
+ *
+ * Second, if there's a hole, we need to create a redaction record that covers
+ * the whole range of the hole. If the hole is in the meta-dnode, it must cover
+ * every block in all of the objects in the hole.
+ *
+ * Third, if there is a deleted object, we need to create a redaction record for
+ * all of the blocks in that object.
+ */
+/*ARGSUSED*/
+static int
+redact_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
+ const zbookmark_phys_t *zb, const struct dnode_phys *dnp, void *arg)
+{
+ struct redact_thread_arg *rta = arg;
+ struct redact_record *record;
+
+ ASSERT(zb->zb_object == DMU_META_DNODE_OBJECT ||
+ zb->zb_object >= rta->resume.zb_object);
+
+ if (rta->cancel)
+ return (SET_ERROR(EINTR));
+
+ if (rta->ignore_object == zb->zb_object)
+ return (0);
+
+ /*
+ * If we're visiting a dnode, we need to handle the case where the
+ * object has been deleted.
+ */
+ if (zb->zb_level == ZB_DNODE_LEVEL) {
+ ASSERT3U(zb->zb_level, ==, ZB_DNODE_LEVEL);
+
+ if (zb->zb_object == 0)
+ return (0);
+
+ /*
+ * If the object has been deleted, redact all of the blocks in
+ * it.
+ */
+ if (dnp->dn_type == DMU_OT_NONE ||
+ objlist_exists(rta->deleted_objs, zb->zb_object)) {
+ rta->ignore_object = zb->zb_object;
+ record = kmem_zalloc(sizeof (struct redact_record),
+ KM_SLEEP);
+
+ record->eos_marker = B_FALSE;
+ record->start_object = record->end_object =
+ zb->zb_object;
+ record->start_blkid = 0;
+ record->end_blkid = UINT64_MAX;
+ record_merge_enqueue(&rta->q,
+ &rta->current_record, record);
+ }
+ return (0);
+ } else if (zb->zb_level < 0) {
+ return (0);
+ } else if (zb->zb_level > 0 && !BP_IS_HOLE(bp)) {
+ /*
+ * If this is an indirect block, but not a hole, it doesn't
+ * provide any useful information for redaction, so ignore it.
+ */
+ return (0);
+ }
+
+ /*
+ * At this point, there are two options left for the type of block we're
+ * looking at. Either this is a hole (which could be in the dnode or
+ * the meta-dnode), or it's a level 0 block of some sort. If it's a
+ * hole, we create a redaction record that covers the whole range. If
+ * the hole is in a dnode, we need to redact all the blocks in that
+ * hole. If the hole is in the meta-dnode, we instead need to redact
+ * all blocks in every object covered by that hole. If it's a level 0
+ * block, we only need to redact that single block.
+ */
+ record = kmem_zalloc(sizeof (struct redact_record), KM_SLEEP);
+ record->eos_marker = B_FALSE;
+
+ record->start_object = record->end_object = zb->zb_object;
+ if (BP_IS_HOLE(bp)) {
+ record->start_blkid = zb->zb_blkid *
+ bp_span_in_blocks(dnp->dn_indblkshift, zb->zb_level);
+
+ record->end_blkid = ((zb->zb_blkid + 1) *
+ bp_span_in_blocks(dnp->dn_indblkshift, zb->zb_level)) - 1;
+
+ if (zb->zb_object == DMU_META_DNODE_OBJECT) {
+ record->start_object = record->start_blkid *
+ ((SPA_MINBLOCKSIZE * dnp->dn_datablkszsec) /
+ sizeof (dnode_phys_t));
+ record->start_blkid = 0;
+ record->end_object = ((record->end_blkid +
+ 1) * ((SPA_MINBLOCKSIZE * dnp->dn_datablkszsec) /
+ sizeof (dnode_phys_t))) - 1;
+ record->end_blkid = UINT64_MAX;
+ }
+ } else if (zb->zb_level != 0 ||
+ zb->zb_object == DMU_META_DNODE_OBJECT) {
+ kmem_free(record, sizeof (*record));
+ return (0);
+ } else {
+ record->start_blkid = record->end_blkid = zb->zb_blkid;
+ }
+ record->indblkshift = dnp->dn_indblkshift;
+ record->datablksz = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT;
+ record_merge_enqueue(&rta->q, &rta->current_record, record);
+
+ return (0);
+}
+
+static void
+redact_traverse_thread(void *arg)
+{
+ struct redact_thread_arg *rt_arg = arg;
+ int err;
+ struct redact_record *data;
+#ifdef _KERNEL
+ if (rt_arg->os->os_phys->os_type == DMU_OST_ZFS)
+ rt_arg->deleted_objs = zfs_get_deleteq(rt_arg->os);
+ else
+ rt_arg->deleted_objs = objlist_create();
+#else
+ rt_arg->deleted_objs = objlist_create();
+#endif
+
+ err = traverse_dataset_resume(rt_arg->ds, rt_arg->txg,
+ &rt_arg->resume, TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA,
+ redact_cb, rt_arg);
+
+ if (err != EINTR)
+ rt_arg->error_code = err;
+ objlist_destroy(rt_arg->deleted_objs);
+ data = kmem_zalloc(sizeof (*data), KM_SLEEP);
+ data->eos_marker = B_TRUE;
+ record_merge_enqueue(&rt_arg->q, &rt_arg->current_record, data);
+ thread_exit();
+}
+
+static inline void
+create_zbookmark_from_obj_off(zbookmark_phys_t *zb, uint64_t object,
+ uint64_t blkid)
+{
+ zb->zb_object = object;
+ zb->zb_level = 0;
+ zb->zb_blkid = blkid;
+}
+
+/*
+ * This is a utility function that performs the comparison for either the
+ * starts or the ends of the ranges in two redact_records.
+ */
+static int
+redact_range_compare(uint64_t obj1, uint64_t off1, uint32_t dbss1,
+ uint64_t obj2, uint64_t off2, uint32_t dbss2)
+{
+ zbookmark_phys_t z1, z2;
+ create_zbookmark_from_obj_off(&z1, obj1, off1);
+ create_zbookmark_from_obj_off(&z2, obj2, off2);
+
+ return (zbookmark_compare(dbss1 >> SPA_MINBLOCKSHIFT, 0,
+ dbss2 >> SPA_MINBLOCKSHIFT, 0, &z1, &z2));
+}
+
+/*
+ * Compare two redaction records by their range's start location. Also makes
+ * eos records always compare last. We use the thread number in the redact_node
+ * to ensure that records do not compare equal (which is not allowed in our avl
+ * trees).
+ */
+static int
+redact_node_compare_start(const void *arg1, const void *arg2)
+{
+ const struct redact_node *rn1 = arg1;
+ const struct redact_node *rn2 = arg2;
+ const struct redact_record *rr1 = rn1->record;
+ const struct redact_record *rr2 = rn2->record;
+ if (rr1->eos_marker)
+ return (1);
+ if (rr2->eos_marker)
+ return (-1);
+
+ int cmp = redact_range_compare(rr1->start_object, rr1->start_blkid,
+ rr1->datablksz, rr2->start_object, rr2->start_blkid,
+ rr2->datablksz);
+ if (cmp == 0)
+ cmp = (rn1->thread_num < rn2->thread_num ? -1 : 1);
+ return (cmp);
+}
+
+/*
+ * Compare two redaction records by their range's end location. Also makes
+ * eos records always compare last. We use the thread number in the redact_node
+ * to ensure that records do not compare equal (which is not allowed in our avl
+ * trees).
+ */
+static int
+redact_node_compare_end(const void *arg1, const void *arg2)
+{
+ const struct redact_node *rn1 = arg1;
+ const struct redact_node *rn2 = arg2;
+ const struct redact_record *srr1 = rn1->record;
+ const struct redact_record *srr2 = rn2->record;
+ if (srr1->eos_marker)
+ return (1);
+ if (srr2->eos_marker)
+ return (-1);
+
+ int cmp = redact_range_compare(srr1->end_object, srr1->end_blkid,
+ srr1->datablksz, srr2->end_object, srr2->end_blkid,
+ srr2->datablksz);
+ if (cmp == 0)
+ cmp = (rn1->thread_num < rn2->thread_num ? -1 : 1);
+ return (cmp);
+}
+
+/*
+ * Utility function that compares two redaction records to determine if any part
+ * of the "from" record is before any part of the "to" record. Also causes End
+ * of Stream redaction records to compare after all others, so that the
+ * redaction merging logic can stay simple.
+ */
+static boolean_t
+redact_record_before(const struct redact_record *from,
+ const struct redact_record *to)
+{
+ if (from->eos_marker == B_TRUE)
+ return (B_FALSE);
+ else if (to->eos_marker == B_TRUE)
+ return (B_TRUE);
+ return (redact_range_compare(from->start_object, from->start_blkid,
+ from->datablksz, to->end_object, to->end_blkid,
+ to->datablksz) <= 0);
+}
+
+/*
+ * Pop a new redaction record off the queue, check that the records are in the
+ * right order, and free the old data.
+ */
+static struct redact_record *
+get_next_redact_record(bqueue_t *bq, struct redact_record *prev)
+{
+ struct redact_record *next = bqueue_dequeue(bq);
+ ASSERT(redact_record_before(prev, next));
+ kmem_free(prev, sizeof (*prev));
+ return (next);
+}
+
+/*
+ * Remove the given redaction node from both trees, pull a new redaction record
+ * off the queue, free the old redaction record, update the redaction node, and
+ * reinsert the node into the trees.
+ */
+static int
+update_avl_trees(avl_tree_t *start_tree, avl_tree_t *end_tree,
+ struct redact_node *redact_node)
+{
+ avl_remove(start_tree, redact_node);
+ avl_remove(end_tree, redact_node);
+ redact_node->record = get_next_redact_record(&redact_node->rt_arg->q,
+ redact_node->record);
+ avl_add(end_tree, redact_node);
+ avl_add(start_tree, redact_node);
+ return (redact_node->rt_arg->error_code);
+}
+
+/*
+ * Synctask for updating redaction lists. We first take this txg's list of
+ * redacted blocks and append those to the redaction list. We then update the
+ * redaction list's bonus buffer. We store the furthest blocks we visited and
+ * the list of snapshots that we're redacting with respect to. We need these so
+ * that redacted sends and receives can be correctly resumed.
+ */
+static void
+redaction_list_update_sync(void *arg, dmu_tx_t *tx)
+{
+ struct merge_data *md = arg;
+ uint64_t txg = dmu_tx_get_txg(tx);
+ list_t *list = &md->md_blocks[txg & TXG_MASK];
+ redact_block_phys_t *furthest_visited =
+ &md->md_furthest[txg & TXG_MASK];
+ objset_t *mos = tx->tx_pool->dp_meta_objset;
+ redaction_list_t *rl = md->md_redaction_list;
+ int bufsize = redact_sync_bufsize;
+ redact_block_phys_t *buf = kmem_alloc(bufsize * sizeof (*buf),
+ KM_SLEEP);
+ int index = 0;
+
+ dmu_buf_will_dirty(rl->rl_dbuf, tx);
+
+ for (struct redact_block_list_node *rbln = list_remove_head(list);
+ rbln != NULL; rbln = list_remove_head(list)) {
+ ASSERT3U(rbln->block.rbp_object, <=,
+ furthest_visited->rbp_object);
+ ASSERT(rbln->block.rbp_object < furthest_visited->rbp_object ||
+ rbln->block.rbp_blkid <= furthest_visited->rbp_blkid);
+ buf[index] = rbln->block;
+ index++;
+ if (index == bufsize) {
+ dmu_write(mos, rl->rl_object,
+ rl->rl_phys->rlp_num_entries * sizeof (*buf),
+ bufsize * sizeof (*buf), buf, tx);
+ rl->rl_phys->rlp_num_entries += bufsize;
+ index = 0;
+ }
+ kmem_free(rbln, sizeof (*rbln));
+ }
+ if (index > 0) {
+ dmu_write(mos, rl->rl_object, rl->rl_phys->rlp_num_entries *
+ sizeof (*buf), index * sizeof (*buf), buf, tx);
+ rl->rl_phys->rlp_num_entries += index;
+ }
+ kmem_free(buf, bufsize * sizeof (*buf));
+
+ md->md_synctask_txg[txg & TXG_MASK] = B_FALSE;
+ rl->rl_phys->rlp_last_object = furthest_visited->rbp_object;
+ rl->rl_phys->rlp_last_blkid = furthest_visited->rbp_blkid;
+}
+
+static void
+commit_rl_updates(objset_t *os, struct merge_data *md, uint64_t object,
+ uint64_t blkid)
+{
+ dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(os->os_spa)->dp_mos_dir);
+ dmu_tx_hold_space(tx, sizeof (struct redact_block_list_node));
+ VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
+ uint64_t txg = dmu_tx_get_txg(tx);
+ if (!md->md_synctask_txg[txg & TXG_MASK]) {
+ dsl_sync_task_nowait(dmu_tx_pool(tx),
+ redaction_list_update_sync, md, tx);
+ md->md_synctask_txg[txg & TXG_MASK] = B_TRUE;
+ md->md_latest_synctask_txg = txg;
+ }
+ md->md_furthest[txg & TXG_MASK].rbp_object = object;
+ md->md_furthest[txg & TXG_MASK].rbp_blkid = blkid;
+ list_move_tail(&md->md_blocks[txg & TXG_MASK],
+ &md->md_redact_block_pending);
+ dmu_tx_commit(tx);
+ md->md_last_time = gethrtime();
+}
+
+/*
+ * We want to store the list of blocks that we're redacting in the bookmark's
+ * redaction list. However, this list is stored in the MOS, which means it can
+ * only be written to in syncing context. To get around this, we create a
+ * synctask that will write to the MOS for us. We tell it what to write by
+ * keeping a linked list of blocks for each transaction group; every time we
+ * decide to redact a block, we append it to the list for the transaction
+ * group that is currently in open context. We also update some progress
+ * information that the synctask will store to enable resumable redacted
+ * sends.
+ */
+static void
+update_redaction_list(struct merge_data *md, objset_t *os,
+ uint64_t object, uint64_t blkid, uint64_t endblkid, uint32_t blksz)
+{
+ boolean_t enqueue = B_FALSE;
+ redact_block_phys_t cur = {0};
+ uint64_t count = endblkid - blkid + 1;
+ while (count > REDACT_BLOCK_MAX_COUNT) {
+ update_redaction_list(md, os, object, blkid,
+ blkid + REDACT_BLOCK_MAX_COUNT - 1, blksz);
+ blkid += REDACT_BLOCK_MAX_COUNT;
+ count -= REDACT_BLOCK_MAX_COUNT;
+ }
+ redact_block_phys_t *coalesce = &md->md_coalesce_block;
+ boolean_t new;
+ if (coalesce->rbp_size_count == 0) {
+ new = B_TRUE;
+ enqueue = B_FALSE;
+ } else {
+ uint64_t old_count = redact_block_get_count(coalesce);
+ if (coalesce->rbp_object == object &&
+ coalesce->rbp_blkid + old_count == blkid &&
+ old_count + count <= REDACT_BLOCK_MAX_COUNT) {
+ ASSERT3U(redact_block_get_size(coalesce), ==, blksz);
+ redact_block_set_count(coalesce, old_count + count);
+ new = B_FALSE;
+ enqueue = B_FALSE;
+ } else {
+ new = B_TRUE;
+ enqueue = B_TRUE;
+ }
+ }
+
+ if (new) {
+ cur = *coalesce;
+ coalesce->rbp_blkid = blkid;
+ coalesce->rbp_object = object;
+
+ redact_block_set_count(coalesce, count);
+ redact_block_set_size(coalesce, blksz);
+ }
+
+ if (enqueue && redact_block_get_size(&cur) != 0) {
+ struct redact_block_list_node *rbln =
+ kmem_alloc(sizeof (struct redact_block_list_node),
+ KM_SLEEP);
+ rbln->block = cur;
+ list_insert_tail(&md->md_redact_block_pending, rbln);
+ }
+
+ if (gethrtime() > md->md_last_time +
+ redaction_list_update_interval_ns) {
+ commit_rl_updates(os, md, object, blkid);
+ }
+}
+
+/*
+ * This thread merges all the redaction records provided by the worker threads,
+ * and determines which blocks are redacted by all the snapshots. The algorithm
+ * for doing so is similar to performing a merge in mergesort with n sub-lists
+ * instead of 2, with some added complexity due to the fact that the entries are
+ * ranges, not just single blocks. This algorithm relies on the fact that the
+ * queues are sorted, which is ensured by the fact that traverse_dataset
+ * traverses the dataset in a consistent order. We pull one entry off the front
+ * of the queues of each secure dataset traversal thread. Then we repeat the
+ * following: each record represents a range of blocks modified by one of the
+ * redaction snapshots, and each block in that range may need to be redacted in
+ * the send stream. Find the record with the latest start of its range, and the
+ * record with the earliest end of its range. If the last start is before the
+ * first end, then we know that the blocks in the range [last_start, first_end]
+ * are covered by all of the ranges at the front of the queues, which means
+ * every thread redacts that whole range. For example, let's say the ranges on
+ * each queue look like this:
+ *
+ * Block Id 1 2 3 4 5 6 7 8 9 10 11
+ * Thread 1 | [====================]
+ * Thread 2 | [========]
+ * Thread 3 | [=================]
+ *
+ * Thread 3 has the last start (5), and thread 2 has the first end (6). All
+ * three threads modified the range [5,6], so that data should not be sent over
+ * the wire. After we've determined whether or not to redact anything, we take
+ * the record with the first end. We discard that record, and pull a new one
+ * off the front of the queue it came from. In the above example, we would
+ * discard Thread 2's record, and pull a new one. Let's say the next record we
+ * pulled from Thread 2 covered range [10,11]. The new layout would look like
+ * this:
+ *
+ * Block Id 1 2 3 4 5 6 7 8 9 10 11
+ * Thread 1 | [====================]
+ * Thread 2 | [==]
+ * Thread 3 | [=================]
+ *
+ * When we compare the last start (10, from Thread 2) and the first end (9, from
+ * Thread 1), we see that the last start is greater than the first end.
+ * Therefore, we do not redact anything from these records. We'll iterate by
+ * replacing the record from Thread 1.
+ *
+ * We iterate by replacing the record with the lowest end because we know
+ * that the record with the lowest end has helped us as much as it can. All the
+ * ranges before it that we will ever redact have been redacted. In addition,
+ * by replacing the one with the lowest end, we guarantee we catch all ranges
+ * that need to be redacted. For example, if in the case above we had replaced
+ * the record from Thread 1 instead, we might have ended up with the following:
+ *
+ * Block Id 1 2 3 4 5 6 7 8 9 10 11 12
+ * Thread 1 | [==]
+ * Thread 2 | [========]
+ * Thread 3 | [=================]
+ *
+ * If the next record from Thread 2 had been [8,10], for example, we should have
+ * redacted part of that range, but because we updated Thread 1's record, we
+ * missed it.
+ *
+ * We implement this algorithm by using two trees. The first sorts the
+ * redaction records by their start_zb, and the second sorts them by their
+ * end_zb. We use these to find the record with the last start and the record
+ * with the first end. We create a record with that start and end, and send it
+ * on. The overall runtime of this implementation is O(n log m), where n is the
+ * total number of redaction records from all the different redaction snapshots,
+ * and m is the number of redaction snapshots.
+ *
+ * If we redact with respect to zero snapshots, we create a single redaction
+ * record that starts at the first redactable object and block and ends at
+ * object and blkid UINT64_MAX. This results in every block being redacted.
+ */
+static int
+perform_thread_merge(bqueue_t *q, uint32_t num_threads,
+ struct redact_thread_arg *thread_args, boolean_t *cancel)
+{
+ struct redact_node *redact_nodes = NULL;
+ avl_tree_t start_tree, end_tree;
+ struct redact_record *record;
+ struct redact_record *current_record = NULL;
+ int err = 0;
+ struct merge_data md = { {0} };
+ list_create(&md.md_redact_block_pending,
+ sizeof (struct redact_block_list_node),
+ offsetof(struct redact_block_list_node, node));
+
+ /*
+ * If we're redacting with respect to zero snapshots, then no data is
+	 * permitted to be sent. We enqueue a record that redacts all
+	 * blocks; the caller (redact_merge_thread) enqueues the eos marker.
+ */
+ if (num_threads == 0) {
+ record = kmem_zalloc(sizeof (struct redact_record),
+ KM_SLEEP);
+		/* We can't redact object 0, so don't try. */
+ record->start_object = 1;
+ record->start_blkid = 0;
+ record->end_object = record->end_blkid = UINT64_MAX;
+ bqueue_enqueue(q, record, sizeof (*record));
+ return (0);
+ }
+ if (num_threads > 0) {
+ redact_nodes = kmem_zalloc(num_threads *
+ sizeof (*redact_nodes), KM_SLEEP);
+ }
+
+ avl_create(&start_tree, redact_node_compare_start,
+ sizeof (struct redact_node),
+ offsetof(struct redact_node, avl_node_start));
+ avl_create(&end_tree, redact_node_compare_end,
+ sizeof (struct redact_node),
+ offsetof(struct redact_node, avl_node_end));
+
+ for (int i = 0; i < num_threads; i++) {
+ struct redact_node *node = &redact_nodes[i];
+ struct redact_thread_arg *targ = &thread_args[i];
+ node->record = bqueue_dequeue(&targ->q);
+ node->rt_arg = targ;
+ node->thread_num = i;
+ avl_add(&start_tree, node);
+ avl_add(&end_tree, node);
+ }
+
+ /*
+ * Once the first record in the end tree has returned EOS, every record
+ * must be an EOS record, so we should stop.
+ */
+ while (err == 0 && !((struct redact_node *)avl_first(&end_tree))->
+ record->eos_marker) {
+ if (*cancel) {
+ err = EINTR;
+ break;
+ }
+ struct redact_node *last_start = avl_last(&start_tree);
+ struct redact_node *first_end = avl_first(&end_tree);
+
+ /*
+ * If the last start record is before the first end record,
+ * then we have blocks that are redacted by all threads.
+ * Therefore, we should redact them. Copy the record, and send
+ * it to the main thread.
+ */
+ if (redact_record_before(last_start->record,
+ first_end->record)) {
+ record = kmem_zalloc(sizeof (struct redact_record),
+ KM_SLEEP);
+ *record = *first_end->record;
+ record->start_object = last_start->record->start_object;
+ record->start_blkid = last_start->record->start_blkid;
+ record_merge_enqueue(q, &current_record,
+ record);
+ }
+ err = update_avl_trees(&start_tree, &end_tree, first_end);
+ }
+
+ /*
+ * We're done; if we were cancelled, we need to cancel our workers and
+ * clear out their queues. Either way, we need to remove every thread's
+ * redact_node struct from the avl trees.
+ */
+ for (int i = 0; i < num_threads; i++) {
+ if (err != 0) {
+ thread_args[i].cancel = B_TRUE;
+ while (!redact_nodes[i].record->eos_marker) {
+ (void) update_avl_trees(&start_tree, &end_tree,
+ &redact_nodes[i]);
+ }
+ }
+ avl_remove(&start_tree, &redact_nodes[i]);
+ avl_remove(&end_tree, &redact_nodes[i]);
+ kmem_free(redact_nodes[i].record,
+ sizeof (struct redact_record));
+ }
+
+ avl_destroy(&start_tree);
+ avl_destroy(&end_tree);
+ kmem_free(redact_nodes, num_threads * sizeof (*redact_nodes));
+ if (current_record != NULL)
+		bqueue_enqueue(q, current_record, sizeof (*current_record));
+ return (err);
+}
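+
+/*
+ * A minimal sketch of the "last start vs. first end" test described in the
+ * block comment above, using its example ranges [2,9], [3,6] and [5,11].
+ * The example_* name is hypothetical and the values are hard-coded purely
+ * for illustration; the real code tracks these bounds with the two AVL
+ * trees instead of a linear scan.
+ */
+static inline boolean_t
+example_merge_step(void)
+{
+	const uint64_t starts[3] = { 2, 3, 5 };
+	const uint64_t ends[3] = { 9, 6, 11 };
+	uint64_t last_start = 0, first_end = UINT64_MAX;
+
+	for (int i = 0; i < 3; i++) {
+		if (starts[i] > last_start)
+			last_start = starts[i];
+		if (ends[i] < first_end)
+			first_end = ends[i];
+	}
+	/*
+	 * last_start is 5 and first_end is 6, so the blocks in [5,6] were
+	 * modified by every redaction snapshot and would be redacted.
+	 */
+	return (last_start <= first_end);
+}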
+
+struct redact_merge_thread_arg {
+ bqueue_t q;
+ spa_t *spa;
+ int numsnaps;
+ struct redact_thread_arg *thr_args;
+ boolean_t cancel;
+ int error_code;
+};
+
+static void
+redact_merge_thread(void *arg)
+{
+ struct redact_merge_thread_arg *rmta = arg;
+ rmta->error_code = perform_thread_merge(&rmta->q,
+ rmta->numsnaps, rmta->thr_args, &rmta->cancel);
+ struct redact_record *rec = kmem_zalloc(sizeof (*rec), KM_SLEEP);
+ rec->eos_marker = B_TRUE;
+ bqueue_enqueue_flush(&rmta->q, rec, 1);
+ thread_exit();
+}
+
+/*
+ * Find the next object in or after the redaction range passed in, and hold
+ * its dnode with the provided tag. Also update *object to contain the new
+ * object number.
+ */
+static int
+hold_next_object(objset_t *os, struct redact_record *rec, void *tag,
+ uint64_t *object, dnode_t **dn)
+{
+ int err = 0;
+ if (*dn != NULL)
+ dnode_rele(*dn, tag);
+ *dn = NULL;
+ if (*object < rec->start_object) {
+ *object = rec->start_object - 1;
+ }
+ err = dmu_object_next(os, object, B_FALSE, 0);
+ if (err != 0)
+ return (err);
+
+ err = dnode_hold(os, *object, tag, dn);
+ while (err == 0 && (*object < rec->start_object ||
+ DMU_OT_IS_METADATA((*dn)->dn_type))) {
+ dnode_rele(*dn, tag);
+ *dn = NULL;
+ err = dmu_object_next(os, object, B_FALSE, 0);
+ if (err != 0)
+ break;
+ err = dnode_hold(os, *object, tag, dn);
+ }
+ return (err);
+}
+
+static int
+perform_redaction(objset_t *os, redaction_list_t *rl,
+ struct redact_merge_thread_arg *rmta)
+{
+ int err = 0;
+ bqueue_t *q = &rmta->q;
+ struct redact_record *rec = NULL;
+ struct merge_data md = { {0} };
+
+ list_create(&md.md_redact_block_pending,
+ sizeof (struct redact_block_list_node),
+ offsetof(struct redact_block_list_node, node));
+ md.md_redaction_list = rl;
+
+ for (int i = 0; i < TXG_SIZE; i++) {
+ list_create(&md.md_blocks[i],
+ sizeof (struct redact_block_list_node),
+ offsetof(struct redact_block_list_node, node));
+ }
+ dnode_t *dn = NULL;
+ uint64_t prev_obj = 0;
+ for (rec = bqueue_dequeue(q); !rec->eos_marker && err == 0;
+ rec = get_next_redact_record(q, rec)) {
+ ASSERT3U(rec->start_object, !=, 0);
+ uint64_t object;
+ if (prev_obj != rec->start_object) {
+ object = rec->start_object - 1;
+ err = hold_next_object(os, rec, FTAG, &object, &dn);
+ } else {
+ object = prev_obj;
+ }
+ while (err == 0 && object <= rec->end_object) {
+ if (issig(JUSTLOOKING) && issig(FORREAL)) {
+ err = EINTR;
+ break;
+ }
+ /*
+ * Part of the current object is contained somewhere in
+ * the range covered by rec.
+ */
+ uint64_t startblkid;
+ uint64_t endblkid;
+ uint64_t maxblkid = dn->dn_phys->dn_maxblkid;
+
+ if (rec->start_object < object)
+ startblkid = 0;
+ else if (rec->start_blkid > maxblkid)
+ break;
+ else
+ startblkid = rec->start_blkid;
+
+ if (rec->end_object > object || rec->end_blkid >
+ maxblkid) {
+ endblkid = maxblkid;
+ } else {
+ endblkid = rec->end_blkid;
+ }
+ update_redaction_list(&md, os, object, startblkid,
+ endblkid, dn->dn_datablksz);
+
+ if (object == rec->end_object)
+ break;
+ err = hold_next_object(os, rec, FTAG, &object, &dn);
+ }
+ if (err == ESRCH)
+ err = 0;
+ if (dn != NULL)
+ prev_obj = object;
+ }
+ if (err == 0 && dn != NULL)
+ dnode_rele(dn, FTAG);
+
+ if (err == ESRCH)
+ err = 0;
+ rmta->cancel = B_TRUE;
+ while (!rec->eos_marker)
+ rec = get_next_redact_record(q, rec);
+ kmem_free(rec, sizeof (*rec));
+
+ /*
+	 * There may be a block that's still being coalesced; sync it out
+	 * before we return.
+ */
+ if (err == 0 && md.md_coalesce_block.rbp_size_count != 0) {
+ struct redact_block_list_node *rbln =
+ kmem_alloc(sizeof (struct redact_block_list_node),
+ KM_SLEEP);
+ rbln->block = md.md_coalesce_block;
+ list_insert_tail(&md.md_redact_block_pending, rbln);
+ }
+ commit_rl_updates(os, &md, UINT64_MAX, UINT64_MAX);
+
+ /*
+ * Wait for all the redaction info to sync out before we return, so that
+ * anyone who attempts to resume this redaction will have all the data
+ * they need.
+ */
+ dsl_pool_t *dp = spa_get_dsl(os->os_spa);
+ if (md.md_latest_synctask_txg != 0)
+ txg_wait_synced(dp, md.md_latest_synctask_txg);
+ for (int i = 0; i < TXG_SIZE; i++)
+ list_destroy(&md.md_blocks[i]);
+ return (err);
+}
+
+static boolean_t
+redact_snaps_contains(uint64_t *snaps, uint64_t num_snaps, uint64_t guid)
+{
+ for (int i = 0; i < num_snaps; i++) {
+ if (snaps[i] == guid)
+ return (B_TRUE);
+ }
+ return (B_FALSE);
+}
+
+int
+dmu_redact_snap(const char *snapname, nvlist_t *redactnvl,
+ const char *redactbook)
+{
+ int err = 0;
+ dsl_pool_t *dp = NULL;
+ dsl_dataset_t *ds = NULL;
+ int numsnaps = 0;
+ objset_t *os;
+ struct redact_thread_arg *args = NULL;
+ redaction_list_t *new_rl = NULL;
+ char *newredactbook;
+
+ if ((err = dsl_pool_hold(snapname, FTAG, &dp)) != 0)
+ return (err);
+
+ newredactbook = kmem_zalloc(sizeof (char) * ZFS_MAX_DATASET_NAME_LEN,
+ KM_SLEEP);
+
+ if ((err = dsl_dataset_hold_flags(dp, snapname, DS_HOLD_FLAG_DECRYPT,
+ FTAG, &ds)) != 0) {
+ goto out;
+ }
+ dsl_dataset_long_hold(ds, FTAG);
+ if (!ds->ds_is_snapshot || dmu_objset_from_ds(ds, &os) != 0) {
+ err = EINVAL;
+ goto out;
+ }
+ if (dsl_dataset_feature_is_active(ds, SPA_FEATURE_REDACTED_DATASETS)) {
+ err = EALREADY;
+ goto out;
+ }
+
+ numsnaps = fnvlist_num_pairs(redactnvl);
+ if (numsnaps > 0)
+ args = kmem_zalloc(numsnaps * sizeof (*args), KM_SLEEP);
+
+ nvpair_t *pair = NULL;
+ for (int i = 0; i < numsnaps; i++) {
+ pair = nvlist_next_nvpair(redactnvl, pair);
+ const char *name = nvpair_name(pair);
+ struct redact_thread_arg *rta = &args[i];
+ err = dsl_dataset_hold_flags(dp, name, DS_HOLD_FLAG_DECRYPT,
+ FTAG, &rta->ds);
+ if (err != 0)
+ break;
+ /*
+		 * We want to take the long hold before any other error can
+		 * occur, because the cleanup code will release the long
+		 * hold if rta->ds is filled in.
+ */
+ dsl_dataset_long_hold(rta->ds, FTAG);
+
+ err = dmu_objset_from_ds(rta->ds, &rta->os);
+ if (err != 0)
+ break;
+ if (!dsl_dataset_is_before(rta->ds, ds, 0)) {
+ err = EINVAL;
+ break;
+ }
+ if (dsl_dataset_feature_is_active(rta->ds,
+ SPA_FEATURE_REDACTED_DATASETS)) {
+ err = EALREADY;
+ break;
+ }
+ }
+ if (err != 0)
+ goto out;
+ VERIFY3P(nvlist_next_nvpair(redactnvl, pair), ==, NULL);
+
+ boolean_t resuming = B_FALSE;
+ zfs_bookmark_phys_t bookmark;
+
+ (void) strlcpy(newredactbook, snapname, ZFS_MAX_DATASET_NAME_LEN);
+ char *c = strchr(newredactbook, '@');
+ ASSERT3P(c, !=, NULL);
+ int n = snprintf(c, ZFS_MAX_DATASET_NAME_LEN - (c - newredactbook),
+ "#%s", redactbook);
+ if (n >= ZFS_MAX_DATASET_NAME_LEN - (c - newredactbook)) {
+ dsl_pool_rele(dp, FTAG);
+ kmem_free(newredactbook,
+ sizeof (char) * ZFS_MAX_DATASET_NAME_LEN);
+ if (args != NULL)
+ kmem_free(args, numsnaps * sizeof (*args));
+ return (SET_ERROR(ENAMETOOLONG));
+ }
+ err = dsl_bookmark_lookup(dp, newredactbook, NULL, &bookmark);
+ if (err == 0) {
+ resuming = B_TRUE;
+ if (bookmark.zbm_redaction_obj == 0) {
+ err = EEXIST;
+ goto out;
+ }
+ err = dsl_redaction_list_hold_obj(dp,
+ bookmark.zbm_redaction_obj, FTAG, &new_rl);
+ if (err != 0) {
+ err = EIO;
+ goto out;
+ }
+ dsl_redaction_list_long_hold(dp, new_rl, FTAG);
+ if (new_rl->rl_phys->rlp_num_snaps != numsnaps) {
+ err = ESRCH;
+ goto out;
+ }
+ for (int i = 0; i < numsnaps; i++) {
+ struct redact_thread_arg *rta = &args[i];
+ if (!redact_snaps_contains(new_rl->rl_phys->rlp_snaps,
+ new_rl->rl_phys->rlp_num_snaps,
+ dsl_dataset_phys(rta->ds)->ds_guid)) {
+ err = ESRCH;
+ goto out;
+ }
+ }
+ if (new_rl->rl_phys->rlp_last_blkid == UINT64_MAX &&
+ new_rl->rl_phys->rlp_last_object == UINT64_MAX) {
+ err = EEXIST;
+ goto out;
+ }
+ dsl_pool_rele(dp, FTAG);
+ dp = NULL;
+ } else {
+ uint64_t *guids = NULL;
+ if (numsnaps > 0) {
+ guids = kmem_zalloc(numsnaps * sizeof (uint64_t),
+ KM_SLEEP);
+ }
+ for (int i = 0; i < numsnaps; i++) {
+ struct redact_thread_arg *rta = &args[i];
+ guids[i] = dsl_dataset_phys(rta->ds)->ds_guid;
+ }
+
+ dsl_pool_rele(dp, FTAG);
+ dp = NULL;
+ err = dsl_bookmark_create_redacted(newredactbook, snapname,
+ numsnaps, guids, FTAG, &new_rl);
+ kmem_free(guids, numsnaps * sizeof (uint64_t));
+ if (err != 0) {
+ goto out;
+ }
+ }
+
+ for (int i = 0; i < numsnaps; i++) {
+ struct redact_thread_arg *rta = &args[i];
+ (void) bqueue_init(&rta->q, zfs_redact_queue_ff,
+ zfs_redact_queue_length,
+ offsetof(struct redact_record, ln));
+ if (resuming) {
+ rta->resume.zb_blkid =
+ new_rl->rl_phys->rlp_last_blkid;
+ rta->resume.zb_object =
+ new_rl->rl_phys->rlp_last_object;
+ }
+ rta->txg = dsl_dataset_phys(ds)->ds_creation_txg;
+ (void) thread_create(NULL, 0, redact_traverse_thread, rta,
+ 0, curproc, TS_RUN, minclsyspri);
+ }
+
+ struct redact_merge_thread_arg *rmta;
+ rmta = kmem_zalloc(sizeof (struct redact_merge_thread_arg), KM_SLEEP);
+
+ (void) bqueue_init(&rmta->q, zfs_redact_queue_ff,
+ zfs_redact_queue_length, offsetof(struct redact_record, ln));
+ rmta->numsnaps = numsnaps;
+ rmta->spa = os->os_spa;
+ rmta->thr_args = args;
+ (void) thread_create(NULL, 0, redact_merge_thread, rmta, 0, curproc,
+ TS_RUN, minclsyspri);
+ err = perform_redaction(os, new_rl, rmta);
+ kmem_free(rmta, sizeof (struct redact_merge_thread_arg));
+
+out:
+ kmem_free(newredactbook, sizeof (char) * ZFS_MAX_DATASET_NAME_LEN);
+
+ if (new_rl != NULL) {
+ dsl_redaction_list_long_rele(new_rl, FTAG);
+ dsl_redaction_list_rele(new_rl, FTAG);
+ }
+ for (int i = 0; i < numsnaps; i++) {
+ struct redact_thread_arg *rta = &args[i];
+ /*
+ * rta->ds may be NULL if we got an error while filling
+ * it in.
+ */
+ if (rta->ds != NULL) {
+ dsl_dataset_long_rele(rta->ds, FTAG);
+ dsl_dataset_rele_flags(rta->ds,
+ DS_HOLD_FLAG_DECRYPT, FTAG);
+ }
+ }
+
+ if (args != NULL)
+ kmem_free(args, numsnaps * sizeof (*args));
+ if (dp != NULL)
+ dsl_pool_rele(dp, FTAG);
+ if (ds != NULL) {
+ dsl_dataset_long_rele(ds, FTAG);
+ dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG);
+ }
+ return (SET_ERROR(err));
+}
diff --git a/sys/contrib/openzfs/module/zfs/dmu_send.c b/sys/contrib/openzfs/module/zfs/dmu_send.c
new file mode 100644
index 000000000000..d654382237c0
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/dmu_send.c
@@ -0,0 +1,3094 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2014, Joyent, Inc. All rights reserved.
+ * Copyright 2014 HybridCluster. All rights reserved.
+ * Copyright 2016 RackTop Systems.
+ * Copyright (c) 2016 Actifio, Inc. All rights reserved.
+ * Copyright (c) 2019, Klara Inc.
+ * Copyright (c) 2019, Allan Jude
+ */
+
+#include <sys/dmu.h>
+#include <sys/dmu_impl.h>
+#include <sys/dmu_tx.h>
+#include <sys/dbuf.h>
+#include <sys/dnode.h>
+#include <sys/zfs_context.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_traverse.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_prop.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_synctask.h>
+#include <sys/spa_impl.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zap.h>
+#include <sys/zio_checksum.h>
+#include <sys/zfs_znode.h>
+#include <zfs_fletcher.h>
+#include <sys/avl.h>
+#include <sys/ddt.h>
+#include <sys/zfs_onexit.h>
+#include <sys/dmu_send.h>
+#include <sys/dmu_recv.h>
+#include <sys/dsl_destroy.h>
+#include <sys/blkptr.h>
+#include <sys/dsl_bookmark.h>
+#include <sys/zfeature.h>
+#include <sys/bqueue.h>
+#include <sys/zvol.h>
+#include <sys/policy.h>
+#include <sys/objlist.h>
+#ifdef _KERNEL
+#include <sys/zfs_vfsops.h>
+#endif
+
+/* Set this tunable to TRUE to replace corrupt data with 0x2f5baddb10c */
+int zfs_send_corrupt_data = B_FALSE;
+/*
+ * This tunable controls the amount of data (measured in bytes) that will be
+ * prefetched by zfs send. If the main thread is blocking on reads that haven't
+ * completed, this variable might need to be increased. If instead the main
+ * thread is issuing new reads because the prefetches have fallen out of the
+ * cache, this may need to be decreased.
+ */
+int zfs_send_queue_length = SPA_MAXBLOCKSIZE;
+/*
+ * This tunable controls the length of the queues that zfs send worker threads
+ * use to communicate. If the send_main_thread is blocking on these queues,
+ * this variable may need to be increased. If there is a significant slowdown
+ * at the start of a send as these threads consume all the available IO
+ * resources, this variable may need to be decreased.
+ */
+int zfs_send_no_prefetch_queue_length = 1024 * 1024;
+/*
+ * These tunables control the fill fraction of the queues used by zfs send.
+ * The fill fraction controls the frequency with which threads have to be
+ * cv_signaled. If a lot of cpu time is being spent on cv_signal, then these
+ * should be tuned down. If the queues empty before the signaled thread can
+ * catch up, then these should be tuned up.
+ */
+int zfs_send_queue_ff = 20;
+int zfs_send_no_prefetch_queue_ff = 20;
+
+/*
+ * Use this to override the recordsize calculation for fast zfs send estimates.
+ */
+int zfs_override_estimate_recordsize = 0;
+
+/* Set this tunable to FALSE to disable setting of DRR_FLAG_FREERECORDS */
+int zfs_send_set_freerecords_bit = B_TRUE;
+
+/* Set this tunable to FALSE to disable sending unmodified spill blocks. */
+int zfs_send_unmodified_spill_blocks = B_TRUE;
+
+static inline boolean_t
+overflow_multiply(uint64_t a, uint64_t b, uint64_t *c)
+{
+ uint64_t temp = a * b;
+ if (b != 0 && temp / b != a)
+ return (B_FALSE);
+ *c = temp;
+ return (B_TRUE);
+}
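+
+/*
+ * A minimal usage sketch for overflow_multiply(); the example_* name and
+ * parameters are hypothetical. Callers must check the return value, since
+ * on overflow the product is not stored.
+ */
+static inline boolean_t
+example_blocks_to_bytes(uint64_t nblocks, uint64_t blksz, uint64_t *bytes)
+{
+	/* Fails (returns B_FALSE) instead of silently wrapping. */
+	return (overflow_multiply(nblocks, blksz, bytes));
+}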
+
+struct send_thread_arg {
+ bqueue_t q;
+ objset_t *os; /* Objset to traverse */
+ uint64_t fromtxg; /* Traverse from this txg */
+ int flags; /* flags to pass to traverse_dataset */
+ int error_code;
+ boolean_t cancel;
+ zbookmark_phys_t resume;
+ uint64_t *num_blocks_visited;
+};
+
+struct redact_list_thread_arg {
+ boolean_t cancel;
+ bqueue_t q;
+ zbookmark_phys_t resume;
+ redaction_list_t *rl;
+ boolean_t mark_redact;
+ int error_code;
+ uint64_t *num_blocks_visited;
+};
+
+struct send_merge_thread_arg {
+ bqueue_t q;
+ objset_t *os;
+ struct redact_list_thread_arg *from_arg;
+ struct send_thread_arg *to_arg;
+ struct redact_list_thread_arg *redact_arg;
+ int error;
+ boolean_t cancel;
+};
+
+struct send_range {
+ boolean_t eos_marker; /* Marks the end of the stream */
+ uint64_t object;
+ uint64_t start_blkid;
+ uint64_t end_blkid;
+ bqueue_node_t ln;
+ enum type {DATA, HOLE, OBJECT, OBJECT_RANGE, REDACT,
+ PREVIOUSLY_REDACTED} type;
+ union {
+ struct srd {
+ dmu_object_type_t obj_type;
+ uint32_t datablksz; // logical size
+ uint32_t datasz; // payload size
+ blkptr_t bp;
+ arc_buf_t *abuf;
+ abd_t *abd;
+ kmutex_t lock;
+ kcondvar_t cv;
+ boolean_t io_outstanding;
+ int io_err;
+ } data;
+ struct srh {
+ uint32_t datablksz;
+ } hole;
+ struct sro {
+ /*
+ * This is a pointer because embedding it in the
+ * struct causes these structures to be massively larger
+ * for all range types; this makes the code much less
+ * memory efficient.
+ */
+ dnode_phys_t *dnp;
+ blkptr_t bp;
+ } object;
+ struct srr {
+ uint32_t datablksz;
+ } redact;
+ struct sror {
+ blkptr_t bp;
+ } object_range;
+ } sru;
+};
+
+/*
+ * The list of data whose inclusion in a send stream can be pending from
+ * one call to backup_cb to another. Multiple calls to dump_free(),
+ * dump_freeobjects(), and dump_redact() can be aggregated into a single
+ * DRR_FREE, DRR_FREEOBJECTS, or DRR_REDACT replay record.
+ */
+typedef enum {
+ PENDING_NONE,
+ PENDING_FREE,
+ PENDING_FREEOBJECTS,
+ PENDING_REDACT
+} dmu_pendop_t;
+
+typedef struct dmu_send_cookie {
+ dmu_replay_record_t *dsc_drr;
+ dmu_send_outparams_t *dsc_dso;
+ offset_t *dsc_off;
+ objset_t *dsc_os;
+ zio_cksum_t dsc_zc;
+ uint64_t dsc_toguid;
+ uint64_t dsc_fromtxg;
+ int dsc_err;
+ dmu_pendop_t dsc_pending_op;
+ uint64_t dsc_featureflags;
+ uint64_t dsc_last_data_object;
+ uint64_t dsc_last_data_offset;
+ uint64_t dsc_resume_object;
+ uint64_t dsc_resume_offset;
+ boolean_t dsc_sent_begin;
+ boolean_t dsc_sent_end;
+} dmu_send_cookie_t;
+
+static int do_dump(dmu_send_cookie_t *dscp, struct send_range *range);
+
+static void
+range_free(struct send_range *range)
+{
+ if (range->type == OBJECT) {
+ size_t size = sizeof (dnode_phys_t) *
+ (range->sru.object.dnp->dn_extra_slots + 1);
+ kmem_free(range->sru.object.dnp, size);
+ } else if (range->type == DATA) {
+ mutex_enter(&range->sru.data.lock);
+ while (range->sru.data.io_outstanding)
+ cv_wait(&range->sru.data.cv, &range->sru.data.lock);
+ if (range->sru.data.abd != NULL)
+ abd_free(range->sru.data.abd);
+ if (range->sru.data.abuf != NULL) {
+ arc_buf_destroy(range->sru.data.abuf,
+ &range->sru.data.abuf);
+ }
+ mutex_exit(&range->sru.data.lock);
+
+ cv_destroy(&range->sru.data.cv);
+ mutex_destroy(&range->sru.data.lock);
+ }
+ kmem_free(range, sizeof (*range));
+}
+
+/*
+ * For all record types except BEGIN, fill in the checksum (overlaid in
+ * drr_u.drr_checksum.drr_checksum). The checksum verifies everything
+ * up to the start of the checksum itself.
+ */
+static int
+dump_record(dmu_send_cookie_t *dscp, void *payload, int payload_len)
+{
+ dmu_send_outparams_t *dso = dscp->dsc_dso;
+ ASSERT3U(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum),
+ ==, sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t));
+ (void) fletcher_4_incremental_native(dscp->dsc_drr,
+ offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum),
+ &dscp->dsc_zc);
+ if (dscp->dsc_drr->drr_type == DRR_BEGIN) {
+ dscp->dsc_sent_begin = B_TRUE;
+ } else {
+ ASSERT(ZIO_CHECKSUM_IS_ZERO(&dscp->dsc_drr->drr_u.
+ drr_checksum.drr_checksum));
+ dscp->dsc_drr->drr_u.drr_checksum.drr_checksum = dscp->dsc_zc;
+ }
+ if (dscp->dsc_drr->drr_type == DRR_END) {
+ dscp->dsc_sent_end = B_TRUE;
+ }
+ (void) fletcher_4_incremental_native(&dscp->dsc_drr->
+ drr_u.drr_checksum.drr_checksum,
+ sizeof (zio_cksum_t), &dscp->dsc_zc);
+ *dscp->dsc_off += sizeof (dmu_replay_record_t);
+ dscp->dsc_err = dso->dso_outfunc(dscp->dsc_os, dscp->dsc_drr,
+ sizeof (dmu_replay_record_t), dso->dso_arg);
+ if (dscp->dsc_err != 0)
+ return (SET_ERROR(EINTR));
+ if (payload_len != 0) {
+ *dscp->dsc_off += payload_len;
+ /*
+ * payload is null when dso_dryrun == B_TRUE (i.e. when we're
+ * doing a send size calculation)
+ */
+ if (payload != NULL) {
+ (void) fletcher_4_incremental_native(
+ payload, payload_len, &dscp->dsc_zc);
+ }
+
+ /*
+ * The code does not rely on this (len being a multiple of 8).
+ * We keep this assertion because of the corresponding assertion
+ * in receive_read(). Keeping this assertion ensures that we do
+ * not inadvertently break backwards compatibility (causing the
+ * assertion in receive_read() to trigger on old software).
+ *
+ * Raw sends cannot be received on old software, and so can
+ * bypass this assertion.
+ */
+
+ ASSERT((payload_len % 8 == 0) ||
+ (dscp->dsc_featureflags & DMU_BACKUP_FEATURE_RAW));
+
+ dscp->dsc_err = dso->dso_outfunc(dscp->dsc_os, payload,
+ payload_len, dso->dso_arg);
+ if (dscp->dsc_err != 0)
+ return (SET_ERROR(EINTR));
+ }
+ return (0);
+}
+
+/*
+ * Fill in the drr_free struct, or perform aggregation if the previous record is
+ * also a free record, and the two are adjacent.
+ *
+ * Note that we send free records even for a full send, because we want to be
+ * able to receive a full send as a clone, which requires a list of all the free
+ * and freeobject records that were generated on the source.
+ */
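+/*
+ * For example, if the pending FREE record covers (object 5, offset 0,
+ * length 131072) and the next call frees (object 5, offset 131072,
+ * length 131072), the pending record is simply extended to length 262144.
+ * A free of a different object, or at a non-adjacent offset, pushes the
+ * pending record out to the stream first.
+ */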
+static int
+dump_free(dmu_send_cookie_t *dscp, uint64_t object, uint64_t offset,
+ uint64_t length)
+{
+ struct drr_free *drrf = &(dscp->dsc_drr->drr_u.drr_free);
+
+ /*
+ * When we receive a free record, dbuf_free_range() assumes
+ * that the receiving system doesn't have any dbufs in the range
+ * being freed. This is always true because there is a one-record
+ * constraint: we only send one WRITE record for any given
+ * object,offset. We know that the one-record constraint is
+ * true because we always send data in increasing order by
+ * object,offset.
+ *
+ * If the increasing-order constraint ever changes, we should find
+ * another way to assert that the one-record constraint is still
+ * satisfied.
+ */
+ ASSERT(object > dscp->dsc_last_data_object ||
+ (object == dscp->dsc_last_data_object &&
+ offset > dscp->dsc_last_data_offset));
+
+ /*
+ * If there is a pending op, but it's not PENDING_FREE, push it out,
+ * since free block aggregation can only be done for blocks of the
+ * same type (i.e., DRR_FREE records can only be aggregated with
+ * other DRR_FREE records. DRR_FREEOBJECTS records can only be
+ * aggregated with other DRR_FREEOBJECTS records).
+ */
+ if (dscp->dsc_pending_op != PENDING_NONE &&
+ dscp->dsc_pending_op != PENDING_FREE) {
+ if (dump_record(dscp, NULL, 0) != 0)
+ return (SET_ERROR(EINTR));
+ dscp->dsc_pending_op = PENDING_NONE;
+ }
+
+ if (dscp->dsc_pending_op == PENDING_FREE) {
+ /*
+ * Check to see whether this free block can be aggregated
+ * with the pending one.
+ */
+ if (drrf->drr_object == object && drrf->drr_offset +
+ drrf->drr_length == offset) {
+ if (offset + length < offset || length == UINT64_MAX)
+ drrf->drr_length = UINT64_MAX;
+ else
+ drrf->drr_length += length;
+ return (0);
+ } else {
+ /* not a continuation. Push out pending record */
+ if (dump_record(dscp, NULL, 0) != 0)
+ return (SET_ERROR(EINTR));
+ dscp->dsc_pending_op = PENDING_NONE;
+ }
+ }
+ /* create a FREE record and make it pending */
+ bzero(dscp->dsc_drr, sizeof (dmu_replay_record_t));
+ dscp->dsc_drr->drr_type = DRR_FREE;
+ drrf->drr_object = object;
+ drrf->drr_offset = offset;
+ if (offset + length < offset)
+ drrf->drr_length = DMU_OBJECT_END;
+ else
+ drrf->drr_length = length;
+ drrf->drr_toguid = dscp->dsc_toguid;
+ if (length == DMU_OBJECT_END) {
+ if (dump_record(dscp, NULL, 0) != 0)
+ return (SET_ERROR(EINTR));
+ } else {
+ dscp->dsc_pending_op = PENDING_FREE;
+ }
+
+ return (0);
+}
+
+/*
+ * Fill in the drr_redact struct, or perform aggregation if the previous record
+ * is also a redaction record, and the two are adjacent.
+ */
+static int
+dump_redact(dmu_send_cookie_t *dscp, uint64_t object, uint64_t offset,
+ uint64_t length)
+{
+ struct drr_redact *drrr = &dscp->dsc_drr->drr_u.drr_redact;
+
+ /*
+ * If there is a pending op, but it's not PENDING_REDACT, push it out,
+ * since free block aggregation can only be done for blocks of the
+ * same type (i.e., DRR_REDACT records can only be aggregated with
+ * other DRR_REDACT records).
+ */
+ if (dscp->dsc_pending_op != PENDING_NONE &&
+ dscp->dsc_pending_op != PENDING_REDACT) {
+ if (dump_record(dscp, NULL, 0) != 0)
+ return (SET_ERROR(EINTR));
+ dscp->dsc_pending_op = PENDING_NONE;
+ }
+
+ if (dscp->dsc_pending_op == PENDING_REDACT) {
+ /*
+ * Check to see whether this redacted block can be aggregated
+ * with the pending one.
+ */
+ if (drrr->drr_object == object && drrr->drr_offset +
+ drrr->drr_length == offset) {
+ drrr->drr_length += length;
+ return (0);
+ } else {
+ /* not a continuation. Push out pending record */
+ if (dump_record(dscp, NULL, 0) != 0)
+ return (SET_ERROR(EINTR));
+ dscp->dsc_pending_op = PENDING_NONE;
+ }
+ }
+ /* create a REDACT record and make it pending */
+ bzero(dscp->dsc_drr, sizeof (dmu_replay_record_t));
+ dscp->dsc_drr->drr_type = DRR_REDACT;
+ drrr->drr_object = object;
+ drrr->drr_offset = offset;
+ drrr->drr_length = length;
+ drrr->drr_toguid = dscp->dsc_toguid;
+ dscp->dsc_pending_op = PENDING_REDACT;
+
+ return (0);
+}
+
+static int
+dmu_dump_write(dmu_send_cookie_t *dscp, dmu_object_type_t type, uint64_t object,
+ uint64_t offset, int lsize, int psize, const blkptr_t *bp, void *data)
+{
+ uint64_t payload_size;
+ boolean_t raw = (dscp->dsc_featureflags & DMU_BACKUP_FEATURE_RAW);
+ struct drr_write *drrw = &(dscp->dsc_drr->drr_u.drr_write);
+
+ /*
+ * We send data in increasing object, offset order.
+ * See comment in dump_free() for details.
+ */
+ ASSERT(object > dscp->dsc_last_data_object ||
+ (object == dscp->dsc_last_data_object &&
+ offset > dscp->dsc_last_data_offset));
+ dscp->dsc_last_data_object = object;
+ dscp->dsc_last_data_offset = offset + lsize - 1;
+
+ /*
+ * If there is any kind of pending aggregation (currently either
+ * a grouping of free objects or free blocks), push it out to
+ * the stream, since aggregation can't be done across operations
+ * of different types.
+ */
+ if (dscp->dsc_pending_op != PENDING_NONE) {
+ if (dump_record(dscp, NULL, 0) != 0)
+ return (SET_ERROR(EINTR));
+ dscp->dsc_pending_op = PENDING_NONE;
+ }
+ /* write a WRITE record */
+ bzero(dscp->dsc_drr, sizeof (dmu_replay_record_t));
+ dscp->dsc_drr->drr_type = DRR_WRITE;
+ drrw->drr_object = object;
+ drrw->drr_type = type;
+ drrw->drr_offset = offset;
+ drrw->drr_toguid = dscp->dsc_toguid;
+ drrw->drr_logical_size = lsize;
+
+ /* only set the compression fields if the buf is compressed or raw */
+ if (raw || lsize != psize) {
+ ASSERT(raw || dscp->dsc_featureflags &
+ DMU_BACKUP_FEATURE_COMPRESSED);
+ ASSERT(!BP_IS_EMBEDDED(bp));
+ ASSERT3S(psize, >, 0);
+
+ if (raw) {
+ ASSERT(BP_IS_PROTECTED(bp));
+
+ /*
+ * This is a raw protected block so we need to pass
+ * along everything the receiving side will need to
+ * interpret this block, including the byteswap, salt,
+ * IV, and MAC.
+ */
+ if (BP_SHOULD_BYTESWAP(bp))
+ drrw->drr_flags |= DRR_RAW_BYTESWAP;
+ zio_crypt_decode_params_bp(bp, drrw->drr_salt,
+ drrw->drr_iv);
+ zio_crypt_decode_mac_bp(bp, drrw->drr_mac);
+ } else {
+ /* this is a compressed block */
+ ASSERT(dscp->dsc_featureflags &
+ DMU_BACKUP_FEATURE_COMPRESSED);
+ ASSERT(!BP_SHOULD_BYTESWAP(bp));
+ ASSERT(!DMU_OT_IS_METADATA(BP_GET_TYPE(bp)));
+ ASSERT3U(BP_GET_COMPRESS(bp), !=, ZIO_COMPRESS_OFF);
+ ASSERT3S(lsize, >=, psize);
+ }
+
+ /* set fields common to compressed and raw sends */
+ drrw->drr_compressiontype = BP_GET_COMPRESS(bp);
+ drrw->drr_compressed_size = psize;
+ payload_size = drrw->drr_compressed_size;
+ } else {
+ payload_size = drrw->drr_logical_size;
+ }
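+ /*
+ * For example, a 128K block stored compressed (say, with LZ4) in 16K on
+ * disk is sent with drr_logical_size == 131072, drr_compressed_size ==
+ * 16384, and a 16384-byte payload; otherwise payload_size is the full
+ * 131072-byte logical size.
+ */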
+
+ if (bp == NULL || BP_IS_EMBEDDED(bp) || (BP_IS_PROTECTED(bp) && !raw)) {
+ /*
+ * There's no pre-computed checksum for partial-block writes,
+ * embedded BP's, or encrypted BP's that are being sent as
+ * plaintext, so (like fletcher4-checksummed blocks) userland
+ * will have to compute a dedup-capable checksum itself.
+ */
+ drrw->drr_checksumtype = ZIO_CHECKSUM_OFF;
+ } else {
+ drrw->drr_checksumtype = BP_GET_CHECKSUM(bp);
+ if (zio_checksum_table[drrw->drr_checksumtype].ci_flags &
+ ZCHECKSUM_FLAG_DEDUP)
+ drrw->drr_flags |= DRR_CHECKSUM_DEDUP;
+ DDK_SET_LSIZE(&drrw->drr_key, BP_GET_LSIZE(bp));
+ DDK_SET_PSIZE(&drrw->drr_key, BP_GET_PSIZE(bp));
+ DDK_SET_COMPRESS(&drrw->drr_key, BP_GET_COMPRESS(bp));
+ DDK_SET_CRYPT(&drrw->drr_key, BP_IS_PROTECTED(bp));
+ drrw->drr_key.ddk_cksum = bp->blk_cksum;
+ }
+
+ if (dump_record(dscp, data, payload_size) != 0)
+ return (SET_ERROR(EINTR));
+ return (0);
+}
+
+static int
+dump_write_embedded(dmu_send_cookie_t *dscp, uint64_t object, uint64_t offset,
+ int blksz, const blkptr_t *bp)
+{
+ char buf[BPE_PAYLOAD_SIZE];
+ struct drr_write_embedded *drrw =
+ &(dscp->dsc_drr->drr_u.drr_write_embedded);
+
+ if (dscp->dsc_pending_op != PENDING_NONE) {
+ if (dump_record(dscp, NULL, 0) != 0)
+ return (SET_ERROR(EINTR));
+ dscp->dsc_pending_op = PENDING_NONE;
+ }
+
+ ASSERT(BP_IS_EMBEDDED(bp));
+
+ bzero(dscp->dsc_drr, sizeof (dmu_replay_record_t));
+ dscp->dsc_drr->drr_type = DRR_WRITE_EMBEDDED;
+ drrw->drr_object = object;
+ drrw->drr_offset = offset;
+ drrw->drr_length = blksz;
+ drrw->drr_toguid = dscp->dsc_toguid;
+ drrw->drr_compression = BP_GET_COMPRESS(bp);
+ drrw->drr_etype = BPE_GET_ETYPE(bp);
+ drrw->drr_lsize = BPE_GET_LSIZE(bp);
+ drrw->drr_psize = BPE_GET_PSIZE(bp);
+
+ decode_embedded_bp_compressed(bp, buf);
+
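+ /*
+ * The payload is rounded up to an 8-byte boundary (e.g. a 100-byte
+ * embedded payload is sent as 104 bytes) to satisfy the alignment
+ * assertion in dump_record().
+ */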
+ if (dump_record(dscp, buf, P2ROUNDUP(drrw->drr_psize, 8)) != 0)
+ return (SET_ERROR(EINTR));
+ return (0);
+}
+
+static int
+dump_spill(dmu_send_cookie_t *dscp, const blkptr_t *bp, uint64_t object,
+ void *data)
+{
+ struct drr_spill *drrs = &(dscp->dsc_drr->drr_u.drr_spill);
+ uint64_t blksz = BP_GET_LSIZE(bp);
+ uint64_t payload_size = blksz;
+
+ if (dscp->dsc_pending_op != PENDING_NONE) {
+ if (dump_record(dscp, NULL, 0) != 0)
+ return (SET_ERROR(EINTR));
+ dscp->dsc_pending_op = PENDING_NONE;
+ }
+
+ /* write a SPILL record */
+ bzero(dscp->dsc_drr, sizeof (dmu_replay_record_t));
+ dscp->dsc_drr->drr_type = DRR_SPILL;
+ drrs->drr_object = object;
+ drrs->drr_length = blksz;
+ drrs->drr_toguid = dscp->dsc_toguid;
+
+ /* See comment in dump_dnode() for full details */
+ if (zfs_send_unmodified_spill_blocks &&
+ (bp->blk_birth <= dscp->dsc_fromtxg)) {
+ drrs->drr_flags |= DRR_SPILL_UNMODIFIED;
+ }
+
+ /* handle raw send fields */
+ if (dscp->dsc_featureflags & DMU_BACKUP_FEATURE_RAW) {
+ ASSERT(BP_IS_PROTECTED(bp));
+
+ if (BP_SHOULD_BYTESWAP(bp))
+ drrs->drr_flags |= DRR_RAW_BYTESWAP;
+ drrs->drr_compressiontype = BP_GET_COMPRESS(bp);
+ drrs->drr_compressed_size = BP_GET_PSIZE(bp);
+ zio_crypt_decode_params_bp(bp, drrs->drr_salt, drrs->drr_iv);
+ zio_crypt_decode_mac_bp(bp, drrs->drr_mac);
+ payload_size = drrs->drr_compressed_size;
+ }
+
+ if (dump_record(dscp, data, payload_size) != 0)
+ return (SET_ERROR(EINTR));
+ return (0);
+}
+
+static int
+dump_freeobjects(dmu_send_cookie_t *dscp, uint64_t firstobj, uint64_t numobjs)
+{
+ struct drr_freeobjects *drrfo = &(dscp->dsc_drr->drr_u.drr_freeobjects);
+ uint64_t maxobj = DNODES_PER_BLOCK *
+ (DMU_META_DNODE(dscp->dsc_os)->dn_maxblkid + 1);
+
+ /*
+ * ZoL < 0.7 does not handle large FREEOBJECTS records correctly,
+ * leading to zfs recv never completing. To avoid this issue, don't
+ * send FREEOBJECTS records for object IDs which cannot exist on the
+ * receiving side.
+ */
+ if (maxobj > 0) {
+ if (maxobj <= firstobj)
+ return (0);
+
+ if (maxobj < firstobj + numobjs)
+ numobjs = maxobj - firstobj;
+ }
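+ /*
+ * For example, if maxobj is 1024 and we are asked to free objects
+ * [1000, 2000), the record is clamped to 24 objects; a request that
+ * starts at or beyond object 1024 is dropped entirely.
+ */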
+
+ /*
+ * If there is a pending op, but it's not PENDING_FREEOBJECTS,
+ * push it out, since free block aggregation can only be done for
+ * blocks of the same type (i.e., DRR_FREE records can only be
+ * aggregated with other DRR_FREE records. DRR_FREEOBJECTS records
+ * can only be aggregated with other DRR_FREEOBJECTS records).
+ */
+ if (dscp->dsc_pending_op != PENDING_NONE &&
+ dscp->dsc_pending_op != PENDING_FREEOBJECTS) {
+ if (dump_record(dscp, NULL, 0) != 0)
+ return (SET_ERROR(EINTR));
+ dscp->dsc_pending_op = PENDING_NONE;
+ }
+
+ if (dscp->dsc_pending_op == PENDING_FREEOBJECTS) {
+ /*
+ * See whether this free object array can be aggregated
+ * with the pending one
+ */
+ if (drrfo->drr_firstobj + drrfo->drr_numobjs == firstobj) {
+ drrfo->drr_numobjs += numobjs;
+ return (0);
+ } else {
+ /* can't be aggregated. Push out pending record */
+ if (dump_record(dscp, NULL, 0) != 0)
+ return (SET_ERROR(EINTR));
+ dscp->dsc_pending_op = PENDING_NONE;
+ }
+ }
+
+ /* write a FREEOBJECTS record */
+ bzero(dscp->dsc_drr, sizeof (dmu_replay_record_t));
+ dscp->dsc_drr->drr_type = DRR_FREEOBJECTS;
+ drrfo->drr_firstobj = firstobj;
+ drrfo->drr_numobjs = numobjs;
+ drrfo->drr_toguid = dscp->dsc_toguid;
+
+ dscp->dsc_pending_op = PENDING_FREEOBJECTS;
+
+ return (0);
+}
+
+static int
+dump_dnode(dmu_send_cookie_t *dscp, const blkptr_t *bp, uint64_t object,
+ dnode_phys_t *dnp)
+{
+ struct drr_object *drro = &(dscp->dsc_drr->drr_u.drr_object);
+ int bonuslen;
+
+ if (object < dscp->dsc_resume_object) {
+ /*
+ * Note: when resuming, we will visit all the dnodes in
+ * the block of dnodes that we are resuming from. In
+ * this case it's unnecessary to send the dnodes prior to
+ * the one we are resuming from. We should be at most one
+ * block's worth of dnodes behind the resume point.
+ */
+ ASSERT3U(dscp->dsc_resume_object - object, <,
+ 1 << (DNODE_BLOCK_SHIFT - DNODE_SHIFT));
+ return (0);
+ }
+
+ if (dnp == NULL || dnp->dn_type == DMU_OT_NONE)
+ return (dump_freeobjects(dscp, object, 1));
+
+ if (dscp->dsc_pending_op != PENDING_NONE) {
+ if (dump_record(dscp, NULL, 0) != 0)
+ return (SET_ERROR(EINTR));
+ dscp->dsc_pending_op = PENDING_NONE;
+ }
+
+ /* write an OBJECT record */
+ bzero(dscp->dsc_drr, sizeof (dmu_replay_record_t));
+ dscp->dsc_drr->drr_type = DRR_OBJECT;
+ drro->drr_object = object;
+ drro->drr_type = dnp->dn_type;
+ drro->drr_bonustype = dnp->dn_bonustype;
+ drro->drr_blksz = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT;
+ drro->drr_bonuslen = dnp->dn_bonuslen;
+ drro->drr_dn_slots = dnp->dn_extra_slots + 1;
+ drro->drr_checksumtype = dnp->dn_checksum;
+ drro->drr_compress = dnp->dn_compress;
+ drro->drr_toguid = dscp->dsc_toguid;
+
+ if (!(dscp->dsc_featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) &&
+ drro->drr_blksz > SPA_OLD_MAXBLOCKSIZE)
+ drro->drr_blksz = SPA_OLD_MAXBLOCKSIZE;
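+ /*
+ * For example, without DMU_BACKUP_FEATURE_LARGE_BLOCKS a dnode with 1M
+ * data blocks is advertised with a 128K (SPA_OLD_MAXBLOCKSIZE) block
+ * size here, and do_dump() splits each of its blocks into 128K WRITE
+ * records to match.
+ */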
+
+ bonuslen = P2ROUNDUP(dnp->dn_bonuslen, 8);
+
+ if ((dscp->dsc_featureflags & DMU_BACKUP_FEATURE_RAW)) {
+ ASSERT(BP_IS_ENCRYPTED(bp));
+
+ if (BP_SHOULD_BYTESWAP(bp))
+ drro->drr_flags |= DRR_RAW_BYTESWAP;
+
+ /* needed for reconstructing dnp on recv side */
+ drro->drr_maxblkid = dnp->dn_maxblkid;
+ drro->drr_indblkshift = dnp->dn_indblkshift;
+ drro->drr_nlevels = dnp->dn_nlevels;
+ drro->drr_nblkptr = dnp->dn_nblkptr;
+
+ /*
+ * Since we encrypt the entire bonus area, the (raw) part
+ * beyond the bonuslen is actually nonzero, so we need
+ * to send it.
+ */
+ if (bonuslen != 0) {
+ drro->drr_raw_bonuslen = DN_MAX_BONUS_LEN(dnp);
+ bonuslen = drro->drr_raw_bonuslen;
+ }
+ }
+
+ /*
+ * DRR_OBJECT_SPILL is set for every dnode which references a
+ * spill block. This allows the receiving pool to definitively
+ * determine when a spill block should be kept or freed.
+ */
+ if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)
+ drro->drr_flags |= DRR_OBJECT_SPILL;
+
+ if (dump_record(dscp, DN_BONUS(dnp), bonuslen) != 0)
+ return (SET_ERROR(EINTR));
+
+ /* Free anything past the end of the file. */
+ if (dump_free(dscp, object, (dnp->dn_maxblkid + 1) *
+ (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), DMU_OBJECT_END) != 0)
+ return (SET_ERROR(EINTR));
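+ /*
+ * For example, a dnode with 128K data blocks and dn_maxblkid == 9
+ * generates a FREE record from offset 10 * 131072 == 1310720 to
+ * DMU_OBJECT_END.
+ */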
+
+ /*
+ * Send DRR_SPILL records for unmodified spill blocks. This is useful
+ * because changing certain attributes of the object (e.g. blocksize)
+ * can cause old versions of ZFS to incorrectly remove a spill block.
+ * Including these records in the stream forces an up to date version
+ * to always be written ensuring they're never lost. Current versions
+ * of the code which understand the DRR_FLAG_SPILL_BLOCK feature can
+ * ignore these unmodified spill blocks.
+ */
+ if (zfs_send_unmodified_spill_blocks &&
+ (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) &&
+ (DN_SPILL_BLKPTR(dnp)->blk_birth <= dscp->dsc_fromtxg)) {
+ struct send_range record;
+ blkptr_t *bp = DN_SPILL_BLKPTR(dnp);
+
+ bzero(&record, sizeof (struct send_range));
+ record.type = DATA;
+ record.object = object;
+ record.eos_marker = B_FALSE;
+ record.start_blkid = DMU_SPILL_BLKID;
+ record.end_blkid = record.start_blkid + 1;
+ record.sru.data.bp = *bp;
+ record.sru.data.obj_type = dnp->dn_type;
+ record.sru.data.datablksz = BP_GET_LSIZE(bp);
+
+ if (do_dump(dscp, &record) != 0)
+ return (SET_ERROR(EINTR));
+ }
+
+ if (dscp->dsc_err != 0)
+ return (SET_ERROR(EINTR));
+
+ return (0);
+}
+
+static int
+dump_object_range(dmu_send_cookie_t *dscp, const blkptr_t *bp,
+ uint64_t firstobj, uint64_t numslots)
+{
+ struct drr_object_range *drror =
+ &(dscp->dsc_drr->drr_u.drr_object_range);
+
+ /* we only use this record type for raw sends */
+ ASSERT(BP_IS_PROTECTED(bp));
+ ASSERT(dscp->dsc_featureflags & DMU_BACKUP_FEATURE_RAW);
+ ASSERT3U(BP_GET_COMPRESS(bp), ==, ZIO_COMPRESS_OFF);
+ ASSERT3U(BP_GET_TYPE(bp), ==, DMU_OT_DNODE);
+ ASSERT0(BP_GET_LEVEL(bp));
+
+ if (dscp->dsc_pending_op != PENDING_NONE) {
+ if (dump_record(dscp, NULL, 0) != 0)
+ return (SET_ERROR(EINTR));
+ dscp->dsc_pending_op = PENDING_NONE;
+ }
+
+ bzero(dscp->dsc_drr, sizeof (dmu_replay_record_t));
+ dscp->dsc_drr->drr_type = DRR_OBJECT_RANGE;
+ drror->drr_firstobj = firstobj;
+ drror->drr_numslots = numslots;
+ drror->drr_toguid = dscp->dsc_toguid;
+ if (BP_SHOULD_BYTESWAP(bp))
+ drror->drr_flags |= DRR_RAW_BYTESWAP;
+ zio_crypt_decode_params_bp(bp, drror->drr_salt, drror->drr_iv);
+ zio_crypt_decode_mac_bp(bp, drror->drr_mac);
+
+ if (dump_record(dscp, NULL, 0) != 0)
+ return (SET_ERROR(EINTR));
+ return (0);
+}
+
+static boolean_t
+send_do_embed(const blkptr_t *bp, uint64_t featureflags)
+{
+ if (!BP_IS_EMBEDDED(bp))
+ return (B_FALSE);
+
+ /*
+ * Compression function must be legacy, or explicitly enabled.
+ */
+ if ((BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_LEGACY_FUNCTIONS &&
+ !(featureflags & DMU_BACKUP_FEATURE_LZ4)))
+ return (B_FALSE);
+
+ /*
+ * If we have not set the ZSTD feature flag, we can't send ZSTD
+ * compressed embedded blocks, as the receiver may not support them.
+ */
+ if ((BP_GET_COMPRESS(bp) == ZIO_COMPRESS_ZSTD &&
+ !(featureflags & DMU_BACKUP_FEATURE_ZSTD)))
+ return (B_FALSE);
+
+ /*
+ * Embed type must be explicitly enabled.
+ */
+ switch (BPE_GET_ETYPE(bp)) {
+ case BP_EMBEDDED_TYPE_DATA:
+ if (featureflags & DMU_BACKUP_FEATURE_EMBED_DATA)
+ return (B_TRUE);
+ break;
+ default:
+ return (B_FALSE);
+ }
+ return (B_FALSE);
+}
+
+/*
+ * This function actually handles figuring out what kind of record needs to be
+ * dumped, and calling the appropriate helper function. In most cases,
+ * the data has already been read by send_reader_thread().
+ */
+static int
+do_dump(dmu_send_cookie_t *dscp, struct send_range *range)
+{
+ int err = 0;
+ switch (range->type) {
+ case OBJECT:
+ err = dump_dnode(dscp, &range->sru.object.bp, range->object,
+ range->sru.object.dnp);
+ return (err);
+ case OBJECT_RANGE: {
+ ASSERT3U(range->start_blkid + 1, ==, range->end_blkid);
+ if (!(dscp->dsc_featureflags & DMU_BACKUP_FEATURE_RAW)) {
+ return (0);
+ }
+ uint64_t epb = BP_GET_LSIZE(&range->sru.object_range.bp) >>
+ DNODE_SHIFT;
+ uint64_t firstobj = range->start_blkid * epb;
+ err = dump_object_range(dscp, &range->sru.object_range.bp,
+ firstobj, epb);
+ break;
+ }
+ case REDACT: {
+ struct srr *srrp = &range->sru.redact;
+ err = dump_redact(dscp, range->object, range->start_blkid *
+ srrp->datablksz, (range->end_blkid - range->start_blkid) *
+ srrp->datablksz);
+ return (err);
+ }
+ case DATA: {
+ struct srd *srdp = &range->sru.data;
+ blkptr_t *bp = &srdp->bp;
+ spa_t *spa =
+ dmu_objset_spa(dscp->dsc_os);
+
+ ASSERT3U(srdp->datablksz, ==, BP_GET_LSIZE(bp));
+ ASSERT3U(range->start_blkid + 1, ==, range->end_blkid);
+ if (BP_GET_TYPE(bp) == DMU_OT_SA) {
+ arc_flags_t aflags = ARC_FLAG_WAIT;
+ enum zio_flag zioflags = ZIO_FLAG_CANFAIL;
+
+ if (dscp->dsc_featureflags & DMU_BACKUP_FEATURE_RAW) {
+ ASSERT(BP_IS_PROTECTED(bp));
+ zioflags |= ZIO_FLAG_RAW;
+ }
+
+ zbookmark_phys_t zb;
+ ASSERT3U(range->start_blkid, ==, DMU_SPILL_BLKID);
+ zb.zb_objset = dmu_objset_id(dscp->dsc_os);
+ zb.zb_object = range->object;
+ zb.zb_level = 0;
+ zb.zb_blkid = range->start_blkid;
+
+ arc_buf_t *abuf = NULL;
+ if (!dscp->dsc_dso->dso_dryrun && arc_read(NULL, spa,
+ bp, arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ,
+ zioflags, &aflags, &zb) != 0)
+ return (SET_ERROR(EIO));
+
+ err = dump_spill(dscp, bp, zb.zb_object,
+ (abuf == NULL ? NULL : abuf->b_data));
+ if (abuf != NULL)
+ arc_buf_destroy(abuf, &abuf);
+ return (err);
+ }
+ if (send_do_embed(bp, dscp->dsc_featureflags)) {
+ err = dump_write_embedded(dscp, range->object,
+ range->start_blkid * srdp->datablksz,
+ srdp->datablksz, bp);
+ return (err);
+ }
+ ASSERT(range->object > dscp->dsc_resume_object ||
+ (range->object == dscp->dsc_resume_object &&
+ range->start_blkid * srdp->datablksz >=
+ dscp->dsc_resume_offset));
+ /* it's a level-0 block of a regular object */
+
+ mutex_enter(&srdp->lock);
+ while (srdp->io_outstanding)
+ cv_wait(&srdp->cv, &srdp->lock);
+ err = srdp->io_err;
+ mutex_exit(&srdp->lock);
+
+ if (err != 0) {
+ if (zfs_send_corrupt_data &&
+ !dscp->dsc_dso->dso_dryrun) {
+ /*
+ * Send a block filled with 0x"zfs badd bloc"
+ */
+ srdp->abuf = arc_alloc_buf(spa, &srdp->abuf,
+ ARC_BUFC_DATA, srdp->datablksz);
+ uint64_t *ptr;
+ for (ptr = srdp->abuf->b_data;
+ (char *)ptr < (char *)srdp->abuf->b_data +
+ srdp->datablksz; ptr++)
+ *ptr = 0x2f5baddb10cULL;
+ } else {
+ return (SET_ERROR(EIO));
+ }
+ }
+
+ ASSERT(dscp->dsc_dso->dso_dryrun ||
+ srdp->abuf != NULL || srdp->abd != NULL);
+
+ uint64_t offset = range->start_blkid * srdp->datablksz;
+
+ char *data = NULL;
+ if (srdp->abd != NULL) {
+ data = abd_to_buf(srdp->abd);
+ ASSERT3P(srdp->abuf, ==, NULL);
+ } else if (srdp->abuf != NULL) {
+ data = srdp->abuf->b_data;
+ }
+
+ /*
+ * If we have large blocks stored on disk but the send flags
+ * don't allow us to send large blocks, we split the data from
+ * the arc buf into chunks.
+ */
+ if (srdp->datablksz > SPA_OLD_MAXBLOCKSIZE &&
+ !(dscp->dsc_featureflags &
+ DMU_BACKUP_FEATURE_LARGE_BLOCKS)) {
+ while (srdp->datablksz > 0 && err == 0) {
+ int n = MIN(srdp->datablksz,
+ SPA_OLD_MAXBLOCKSIZE);
+ err = dmu_dump_write(dscp, srdp->obj_type,
+ range->object, offset, n, n, NULL, data);
+ offset += n;
+ /*
+ * When doing dry run, data==NULL is used as a
+ * sentinel value by
+ * dmu_dump_write()->dump_record().
+ */
+ if (data != NULL)
+ data += n;
+ srdp->datablksz -= n;
+ }
+ } else {
+ err = dmu_dump_write(dscp, srdp->obj_type,
+ range->object, offset,
+ srdp->datablksz, srdp->datasz, bp, data);
+ }
+ return (err);
+ }
+ case HOLE: {
+ struct srh *srhp = &range->sru.hole;
+ if (range->object == DMU_META_DNODE_OBJECT) {
+ uint32_t span = srhp->datablksz >> DNODE_SHIFT;
+ uint64_t first_obj = range->start_blkid * span;
+ uint64_t numobj = range->end_blkid * span - first_obj;
+ return (dump_freeobjects(dscp, first_obj, numobj));
+ }
+ uint64_t offset = 0;
+
+ /*
+ * If this multiply overflows, we don't need to send this block.
+ * Even if it has a birth time, it can never be anything but a hole,
+ * so we don't need to send records for it.
+ */
+ if (!overflow_multiply(range->start_blkid, srhp->datablksz,
+ &offset)) {
+ return (0);
+ }
+ uint64_t len = 0;
+
+ if (!overflow_multiply(range->end_blkid, srhp->datablksz, &len))
+ len = UINT64_MAX;
+ len = len - offset;
+ return (dump_free(dscp, range->object, offset, len));
+ }
+ default:
+ panic("Invalid range type in do_dump: %d", range->type);
+ }
+ return (err);
+}
+
+static struct send_range *
+range_alloc(enum type type, uint64_t object, uint64_t start_blkid,
+ uint64_t end_blkid, boolean_t eos)
+{
+ struct send_range *range = kmem_alloc(sizeof (*range), KM_SLEEP);
+ range->type = type;
+ range->object = object;
+ range->start_blkid = start_blkid;
+ range->end_blkid = end_blkid;
+ range->eos_marker = eos;
+ if (type == DATA) {
+ range->sru.data.abd = NULL;
+ range->sru.data.abuf = NULL;
+ mutex_init(&range->sru.data.lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&range->sru.data.cv, NULL, CV_DEFAULT, NULL);
+ range->sru.data.io_outstanding = 0;
+ range->sru.data.io_err = 0;
+ }
+ return (range);
+}
+
+/*
+ * This is the callback function to traverse_dataset that acts as a worker
+ * thread for dmu_send_impl.
+ */
+/*ARGSUSED*/
+static int
+send_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
+ const zbookmark_phys_t *zb, const struct dnode_phys *dnp, void *arg)
+{
+ struct send_thread_arg *sta = arg;
+ struct send_range *record;
+
+ ASSERT(zb->zb_object == DMU_META_DNODE_OBJECT ||
+ zb->zb_object >= sta->resume.zb_object);
+
+ /*
+ * All bps of an encrypted os should have the encryption bit set.
+ * If this is not true it indicates tampering and we report an error.
+ */
+ if (sta->os->os_encrypted &&
+ !BP_IS_HOLE(bp) && !BP_USES_CRYPT(bp)) {
+ spa_log_error(spa, zb);
+ zfs_panic_recover("unencrypted block in encrypted "
+ "object set %llu", dmu_objset_id(sta->os));
+ return (SET_ERROR(EIO));
+ }
+
+ if (sta->cancel)
+ return (SET_ERROR(EINTR));
+ if (zb->zb_object != DMU_META_DNODE_OBJECT &&
+ DMU_OBJECT_IS_SPECIAL(zb->zb_object))
+ return (0);
+ atomic_inc_64(sta->num_blocks_visited);
+
+ if (zb->zb_level == ZB_DNODE_LEVEL) {
+ if (zb->zb_object == DMU_META_DNODE_OBJECT)
+ return (0);
+ record = range_alloc(OBJECT, zb->zb_object, 0, 0, B_FALSE);
+ record->sru.object.bp = *bp;
+ size_t size = sizeof (*dnp) * (dnp->dn_extra_slots + 1);
+ record->sru.object.dnp = kmem_alloc(size, KM_SLEEP);
+ bcopy(dnp, record->sru.object.dnp, size);
+ bqueue_enqueue(&sta->q, record, sizeof (*record));
+ return (0);
+ }
+ if (zb->zb_level == 0 && zb->zb_object == DMU_META_DNODE_OBJECT &&
+ !BP_IS_HOLE(bp)) {
+ record = range_alloc(OBJECT_RANGE, 0, zb->zb_blkid,
+ zb->zb_blkid + 1, B_FALSE);
+ record->sru.object_range.bp = *bp;
+ bqueue_enqueue(&sta->q, record, sizeof (*record));
+ return (0);
+ }
+ if (zb->zb_level < 0 || (zb->zb_level > 0 && !BP_IS_HOLE(bp)))
+ return (0);
+ if (zb->zb_object == DMU_META_DNODE_OBJECT && !BP_IS_HOLE(bp))
+ return (0);
+
+ uint64_t span = bp_span_in_blocks(dnp->dn_indblkshift, zb->zb_level);
+ uint64_t start;
+
+ /*
+ * If this multiply overflows, we don't need to send this block.
+ * Even if it has a birth time, it can never be anything but a hole,
+ * so we don't need to send records for it.
+ */
+ if (!overflow_multiply(span, zb->zb_blkid, &start) || (!(zb->zb_blkid ==
+ DMU_SPILL_BLKID || DMU_OT_IS_METADATA(dnp->dn_type)) &&
+ span * zb->zb_blkid > dnp->dn_maxblkid)) {
+ ASSERT(BP_IS_HOLE(bp));
+ return (0);
+ }
+
+ if (zb->zb_blkid == DMU_SPILL_BLKID)
+ ASSERT3U(BP_GET_TYPE(bp), ==, DMU_OT_SA);
+
+ enum type record_type = DATA;
+ if (BP_IS_HOLE(bp))
+ record_type = HOLE;
+ else if (BP_IS_REDACTED(bp))
+ record_type = REDACT;
+ else
+ record_type = DATA;
+
+ record = range_alloc(record_type, zb->zb_object, start,
+ (start + span < start ? 0 : start + span), B_FALSE);
+
+ uint64_t datablksz = (zb->zb_blkid == DMU_SPILL_BLKID ?
+ BP_GET_LSIZE(bp) : dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
+
+ if (BP_IS_HOLE(bp)) {
+ record->sru.hole.datablksz = datablksz;
+ } else if (BP_IS_REDACTED(bp)) {
+ record->sru.redact.datablksz = datablksz;
+ } else {
+ record->sru.data.datablksz = datablksz;
+ record->sru.data.obj_type = dnp->dn_type;
+ record->sru.data.bp = *bp;
+ }
+
+ bqueue_enqueue(&sta->q, record, sizeof (*record));
+ return (0);
+}
+
+struct redact_list_cb_arg {
+ uint64_t *num_blocks_visited;
+ bqueue_t *q;
+ boolean_t *cancel;
+ boolean_t mark_redact;
+};
+
+static int
+redact_list_cb(redact_block_phys_t *rb, void *arg)
+{
+ struct redact_list_cb_arg *rlcap = arg;
+
+ atomic_inc_64(rlcap->num_blocks_visited);
+ if (*rlcap->cancel)
+ return (-1);
+
+ struct send_range *data = range_alloc(REDACT, rb->rbp_object,
+ rb->rbp_blkid, rb->rbp_blkid + redact_block_get_count(rb), B_FALSE);
+ ASSERT3U(data->end_blkid, >, rb->rbp_blkid);
+ if (rlcap->mark_redact) {
+ data->type = REDACT;
+ data->sru.redact.datablksz = redact_block_get_size(rb);
+ } else {
+ data->type = PREVIOUSLY_REDACTED;
+ }
+ bqueue_enqueue(rlcap->q, data, sizeof (*data));
+
+ return (0);
+}
+
+/*
+ * This function kicks off the traverse_dataset. It also handles setting the
+ * error code of the thread in case something goes wrong, and pushes the End of
+ * Stream record when the traverse_dataset call has finished.
+ */
+static void
+send_traverse_thread(void *arg)
+{
+ struct send_thread_arg *st_arg = arg;
+ int err = 0;
+ struct send_range *data;
+ fstrans_cookie_t cookie = spl_fstrans_mark();
+
+ err = traverse_dataset_resume(st_arg->os->os_dsl_dataset,
+ st_arg->fromtxg, &st_arg->resume,
+ st_arg->flags, send_cb, st_arg);
+
+ if (err != EINTR)
+ st_arg->error_code = err;
+ data = range_alloc(DATA, 0, 0, 0, B_TRUE);
+ bqueue_enqueue_flush(&st_arg->q, data, sizeof (*data));
+ spl_fstrans_unmark(cookie);
+ thread_exit();
+}
+
+/*
+ * Utility function that causes End of Stream records to compare after all
+ * others, so that other threads' comparison logic can stay simple.
+ */
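+/*
+ * For example, with the default 16K dnode blocks and 512-byte dnodes
+ * (DNODES_PER_BLOCK_SHIFT is 5), a meta-dnode range covering blocks [3, 4)
+ * maps to object numbers [96, 128), so it compares after a range for
+ * object 90 and before one for object 130.
+ */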
+static int __attribute__((unused))
+send_range_after(const struct send_range *from, const struct send_range *to)
+{
+ if (from->eos_marker == B_TRUE)
+ return (1);
+ if (to->eos_marker == B_TRUE)
+ return (-1);
+
+ uint64_t from_obj = from->object;
+ uint64_t from_end_obj = from->object + 1;
+ uint64_t to_obj = to->object;
+ uint64_t to_end_obj = to->object + 1;
+ if (from_obj == 0) {
+ ASSERT(from->type == HOLE || from->type == OBJECT_RANGE);
+ from_obj = from->start_blkid << DNODES_PER_BLOCK_SHIFT;
+ from_end_obj = from->end_blkid << DNODES_PER_BLOCK_SHIFT;
+ }
+ if (to_obj == 0) {
+ ASSERT(to->type == HOLE || to->type == OBJECT_RANGE);
+ to_obj = to->start_blkid << DNODES_PER_BLOCK_SHIFT;
+ to_end_obj = to->end_blkid << DNODES_PER_BLOCK_SHIFT;
+ }
+
+ if (from_end_obj <= to_obj)
+ return (-1);
+ if (from_obj >= to_end_obj)
+ return (1);
+ int64_t cmp = TREE_CMP(to->type == OBJECT_RANGE, from->type ==
+ OBJECT_RANGE);
+ if (unlikely(cmp))
+ return (cmp);
+ cmp = TREE_CMP(to->type == OBJECT, from->type == OBJECT);
+ if (unlikely(cmp))
+ return (cmp);
+ if (from->end_blkid <= to->start_blkid)
+ return (-1);
+ if (from->start_blkid >= to->end_blkid)
+ return (1);
+ return (0);
+}
+
+/*
+ * Pop the new data off the queue, check that the records we receive are in
+ * the right order, but do not free the old data. This is used so that the
+ * records can be sent on to the main thread without copying the data.
+ */
+static struct send_range *
+get_next_range_nofree(bqueue_t *bq, struct send_range *prev)
+{
+ struct send_range *next = bqueue_dequeue(bq);
+ ASSERT3S(send_range_after(prev, next), ==, -1);
+ return (next);
+}
+
+/*
+ * Pop the new data off the queue, check that the records we receive are in
+ * the right order, and free the old data.
+ */
+static struct send_range *
+get_next_range(bqueue_t *bq, struct send_range *prev)
+{
+ struct send_range *next = get_next_range_nofree(bq, prev);
+ range_free(prev);
+ return (next);
+}
+
+static void
+redact_list_thread(void *arg)
+{
+ struct redact_list_thread_arg *rlt_arg = arg;
+ struct send_range *record;
+ fstrans_cookie_t cookie = spl_fstrans_mark();
+ if (rlt_arg->rl != NULL) {
+ struct redact_list_cb_arg rlcba = {0};
+ rlcba.cancel = &rlt_arg->cancel;
+ rlcba.q = &rlt_arg->q;
+ rlcba.num_blocks_visited = rlt_arg->num_blocks_visited;
+ rlcba.mark_redact = rlt_arg->mark_redact;
+ int err = dsl_redaction_list_traverse(rlt_arg->rl,
+ &rlt_arg->resume, redact_list_cb, &rlcba);
+ if (err != EINTR)
+ rlt_arg->error_code = err;
+ }
+ record = range_alloc(DATA, 0, 0, 0, B_TRUE);
+ bqueue_enqueue_flush(&rlt_arg->q, record, sizeof (*record));
+ spl_fstrans_unmark(cookie);
+
+ thread_exit();
+}
+
+/*
+ * Compare the start point of the two provided ranges. End of stream ranges
+ * compare last; objects compare before any data or hole inside that object,
+ * and before multi-object holes that start at the same object.
+ */
+static int
+send_range_start_compare(struct send_range *r1, struct send_range *r2)
+{
+ uint64_t r1_objequiv = r1->object;
+ uint64_t r1_l0equiv = r1->start_blkid;
+ uint64_t r2_objequiv = r2->object;
+ uint64_t r2_l0equiv = r2->start_blkid;
+ int64_t cmp = TREE_CMP(r1->eos_marker, r2->eos_marker);
+ if (unlikely(cmp))
+ return (cmp);
+ if (r1->object == 0) {
+ r1_objequiv = r1->start_blkid * DNODES_PER_BLOCK;
+ r1_l0equiv = 0;
+ }
+ if (r2->object == 0) {
+ r2_objequiv = r2->start_blkid * DNODES_PER_BLOCK;
+ r2_l0equiv = 0;
+ }
+
+ cmp = TREE_CMP(r1_objequiv, r2_objequiv);
+ if (likely(cmp))
+ return (cmp);
+ cmp = TREE_CMP(r2->type == OBJECT_RANGE, r1->type == OBJECT_RANGE);
+ if (unlikely(cmp))
+ return (cmp);
+ cmp = TREE_CMP(r2->type == OBJECT, r1->type == OBJECT);
+ if (unlikely(cmp))
+ return (cmp);
+
+ return (TREE_CMP(r1_l0equiv, r2_l0equiv));
+}
+
+enum q_idx {
+ REDACT_IDX = 0,
+ TO_IDX,
+ FROM_IDX,
+ NUM_THREADS
+};
+
+/*
+ * This function returns the next range the send_merge_thread should operate on.
+ * The inputs are two arrays; the first one stores the range at the front of the
+ * queues stored in the second one. The ranges are sorted in descending
+ * priority order; the metadata from earlier ranges overrules metadata from
+ * later ranges. out_mask is used to return which threads the ranges came from;
+ * bit i is set if ranges[i] started at the same place as the returned range.
+ *
+ * This code is not hardcoded to compare a specific number of threads; it could
+ * be used with any number, just by changing the q_idx enum.
+ *
+ * The "next range" is the one with the earliest start; if two starts are equal,
+ * the highest-priority range is the next to operate on. If a higher-priority
+ * range starts in the middle of the first range, then the first range will be
+ * truncated to end where the higher-priority range starts, and we will operate
+ * on that one next time. In this way, we make sure that each block covered by
+ * some range gets covered by a returned range, and each block covered is
+ * returned using the metadata of the highest-priority range it appears in.
+ *
+ * For example, if the three ranges at the front of the queues were [2,4),
+ * [3,5), and [1,3), then the ranges returned would be [1,2) with the metadata
+ * from the third range, [2,4) with the metadata from the first range, and then
+ * [4,5) with the metadata from the second.
+ */
+static struct send_range *
+find_next_range(struct send_range **ranges, bqueue_t **qs, uint64_t *out_mask)
+{
+ int idx = 0; // index of the range with the earliest start
+ int i;
+ uint64_t bmask = 0;
+ for (i = 1; i < NUM_THREADS; i++) {
+ if (send_range_start_compare(ranges[i], ranges[idx]) < 0)
+ idx = i;
+ }
+ if (ranges[idx]->eos_marker) {
+ struct send_range *ret = range_alloc(DATA, 0, 0, 0, B_TRUE);
+ *out_mask = 0;
+ return (ret);
+ }
+ /*
+ * Find all the ranges that start at that same point.
+ */
+ for (i = 0; i < NUM_THREADS; i++) {
+ if (send_range_start_compare(ranges[i], ranges[idx]) == 0)
+ bmask |= 1 << i;
+ }
+ *out_mask = bmask;
+ /*
+ * OBJECT_RANGE records only come from the TO thread, and should always
+ * be treated as overlapping with nothing and sent on immediately. They
+ * are only used in raw sends, and are never redacted.
+ */
+ if (ranges[idx]->type == OBJECT_RANGE) {
+ ASSERT3U(idx, ==, TO_IDX);
+ ASSERT3U(*out_mask, ==, 1 << TO_IDX);
+ struct send_range *ret = ranges[idx];
+ ranges[idx] = get_next_range_nofree(qs[idx], ranges[idx]);
+ return (ret);
+ }
+ /*
+ * Find the first start or end point after the start of the first range.
+ */
+ uint64_t first_change = ranges[idx]->end_blkid;
+ for (i = 0; i < NUM_THREADS; i++) {
+ if (i == idx || ranges[i]->eos_marker ||
+ ranges[i]->object > ranges[idx]->object ||
+ ranges[i]->object == DMU_META_DNODE_OBJECT)
+ continue;
+ ASSERT3U(ranges[i]->object, ==, ranges[idx]->object);
+ if (first_change > ranges[i]->start_blkid &&
+ (bmask & (1 << i)) == 0)
+ first_change = ranges[i]->start_blkid;
+ else if (first_change > ranges[i]->end_blkid)
+ first_change = ranges[i]->end_blkid;
+ }
+ /*
+ * Update all ranges to no longer overlap with the range we're
+ * returning. All such ranges must start at the same place as the range
+ * being returned, and end at or after first_change. Thus we update
+ * their start to first_change. If that makes them size 0, then free
+ * them and pull a new range from that thread.
+ */
+ for (i = 0; i < NUM_THREADS; i++) {
+ if (i == idx || (bmask & (1 << i)) == 0)
+ continue;
+ ASSERT3U(first_change, >, ranges[i]->start_blkid);
+ ranges[i]->start_blkid = first_change;
+ ASSERT3U(ranges[i]->start_blkid, <=, ranges[i]->end_blkid);
+ if (ranges[i]->start_blkid == ranges[i]->end_blkid)
+ ranges[i] = get_next_range(qs[i], ranges[i]);
+ }
+ /*
+ * Short-circuit the simple case; if the range doesn't overlap with
+ * anything else, or it only overlaps with things that start at the same
+ * place and are longer, send it on.
+ */
+ if (first_change == ranges[idx]->end_blkid) {
+ struct send_range *ret = ranges[idx];
+ ranges[idx] = get_next_range_nofree(qs[idx], ranges[idx]);
+ return (ret);
+ }
+
+ /*
+ * Otherwise, return a truncated copy of ranges[idx] and move the start
+ * of ranges[idx] back to first_change.
+ */
+ struct send_range *ret = kmem_alloc(sizeof (*ret), KM_SLEEP);
+ *ret = *ranges[idx];
+ ret->end_blkid = first_change;
+ ranges[idx]->start_blkid = first_change;
+ return (ret);
+}
+
+#define FROM_AND_REDACT_BITS ((1 << REDACT_IDX) | (1 << FROM_IDX))
+
+/*
+ * Merge the results from the from thread and the to thread, and then hand the
+ * records off to send_prefetch_thread to prefetch them. If this is not a
+ * send from a redaction bookmark, the from thread will push an end of stream
+ * record and stop, and we'll just send everything that was changed in the
+ * to_ds since the ancestor's creation txg. If it is, then since
+ * traverse_dataset has a canonical order, we can compare the changes as
+ * they're pulled off the queues. That will give us a stream that is
+ * appropriately sorted, and covers all records. In addition, we pull the
+ * data from the redact_list_thread and use that to determine which blocks
+ * should be redacted.
+ */
+static void
+send_merge_thread(void *arg)
+{
+ struct send_merge_thread_arg *smt_arg = arg;
+ struct send_range *front_ranges[NUM_THREADS];
+ bqueue_t *queues[NUM_THREADS];
+ int err = 0;
+ fstrans_cookie_t cookie = spl_fstrans_mark();
+
+ if (smt_arg->redact_arg == NULL) {
+ front_ranges[REDACT_IDX] =
+ kmem_zalloc(sizeof (struct send_range), KM_SLEEP);
+ front_ranges[REDACT_IDX]->eos_marker = B_TRUE;
+ front_ranges[REDACT_IDX]->type = REDACT;
+ queues[REDACT_IDX] = NULL;
+ } else {
+ front_ranges[REDACT_IDX] =
+ bqueue_dequeue(&smt_arg->redact_arg->q);
+ queues[REDACT_IDX] = &smt_arg->redact_arg->q;
+ }
+ front_ranges[TO_IDX] = bqueue_dequeue(&smt_arg->to_arg->q);
+ queues[TO_IDX] = &smt_arg->to_arg->q;
+ front_ranges[FROM_IDX] = bqueue_dequeue(&smt_arg->from_arg->q);
+ queues[FROM_IDX] = &smt_arg->from_arg->q;
+ uint64_t mask = 0;
+ struct send_range *range;
+ for (range = find_next_range(front_ranges, queues, &mask);
+ !range->eos_marker && err == 0 && !smt_arg->cancel;
+ range = find_next_range(front_ranges, queues, &mask)) {
+ /*
+ * If the range in question was in both the from redact bookmark
+ * and the bookmark we're using to redact, then don't send it.
+ * It's already redacted on the receiving system, so a redaction
+ * record would be redundant.
+ */
+ if ((mask & FROM_AND_REDACT_BITS) == FROM_AND_REDACT_BITS) {
+ ASSERT3U(range->type, ==, REDACT);
+ range_free(range);
+ continue;
+ }
+ bqueue_enqueue(&smt_arg->q, range, sizeof (*range));
+
+ if (smt_arg->to_arg->error_code != 0) {
+ err = smt_arg->to_arg->error_code;
+ } else if (smt_arg->from_arg->error_code != 0) {
+ err = smt_arg->from_arg->error_code;
+ } else if (smt_arg->redact_arg != NULL &&
+ smt_arg->redact_arg->error_code != 0) {
+ err = smt_arg->redact_arg->error_code;
+ }
+ }
+ if (smt_arg->cancel && err == 0)
+ err = SET_ERROR(EINTR);
+ smt_arg->error = err;
+ if (smt_arg->error != 0) {
+ smt_arg->to_arg->cancel = B_TRUE;
+ smt_arg->from_arg->cancel = B_TRUE;
+ if (smt_arg->redact_arg != NULL)
+ smt_arg->redact_arg->cancel = B_TRUE;
+ }
+ for (int i = 0; i < NUM_THREADS; i++) {
+ while (!front_ranges[i]->eos_marker) {
+ front_ranges[i] = get_next_range(queues[i],
+ front_ranges[i]);
+ }
+ range_free(front_ranges[i]);
+ }
+ if (range == NULL)
+ range = kmem_zalloc(sizeof (*range), KM_SLEEP);
+ range->eos_marker = B_TRUE;
+ bqueue_enqueue_flush(&smt_arg->q, range, 1);
+ spl_fstrans_unmark(cookie);
+ thread_exit();
+}
+
+struct send_reader_thread_arg {
+ struct send_merge_thread_arg *smta;
+ bqueue_t q;
+ boolean_t cancel;
+ boolean_t issue_reads;
+ uint64_t featureflags;
+ int error;
+};
+
+static void
+dmu_send_read_done(zio_t *zio)
+{
+ struct send_range *range = zio->io_private;
+
+ mutex_enter(&range->sru.data.lock);
+ if (zio->io_error != 0) {
+ abd_free(range->sru.data.abd);
+ range->sru.data.abd = NULL;
+ range->sru.data.io_err = zio->io_error;
+ }
+
+ ASSERT(range->sru.data.io_outstanding);
+ range->sru.data.io_outstanding = B_FALSE;
+ cv_broadcast(&range->sru.data.cv);
+ mutex_exit(&range->sru.data.lock);
+}
+
+static void
+issue_data_read(struct send_reader_thread_arg *srta, struct send_range *range)
+{
+ struct srd *srdp = &range->sru.data;
+ blkptr_t *bp = &srdp->bp;
+ objset_t *os = srta->smta->os;
+
+ ASSERT3U(range->type, ==, DATA);
+ ASSERT3U(range->start_blkid + 1, ==, range->end_blkid);
+ /*
+ * If we have large blocks stored on disk but
+ * the send flags don't allow us to send large
+ * blocks, we split the data from the arc buf
+ * into chunks.
+ */
+ boolean_t split_large_blocks =
+ srdp->datablksz > SPA_OLD_MAXBLOCKSIZE &&
+ !(srta->featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS);
+ /*
+ * We should only request compressed data from the ARC if all
+ * the following are true:
+ * - stream compression was requested
+ * - we aren't splitting large blocks into smaller chunks
+ * - the data won't need to be byteswapped before sending
+ * - this isn't an embedded block
+ * - this isn't metadata (if receiving on a different endian
+ * system it can be byteswapped more easily)
+ */
+ boolean_t request_compressed =
+ (srta->featureflags & DMU_BACKUP_FEATURE_COMPRESSED) &&
+ !split_large_blocks && !BP_SHOULD_BYTESWAP(bp) &&
+ !BP_IS_EMBEDDED(bp) && !DMU_OT_IS_METADATA(BP_GET_TYPE(bp));
+
+ enum zio_flag zioflags = ZIO_FLAG_CANFAIL;
+
+ if (srta->featureflags & DMU_BACKUP_FEATURE_RAW)
+ zioflags |= ZIO_FLAG_RAW;
+ else if (request_compressed)
+ zioflags |= ZIO_FLAG_RAW_COMPRESS;
+
+ srdp->datasz = (zioflags & ZIO_FLAG_RAW_COMPRESS) ?
+ BP_GET_PSIZE(bp) : BP_GET_LSIZE(bp);
+
+ if (!srta->issue_reads)
+ return;
+ if (BP_IS_REDACTED(bp))
+ return;
+ if (send_do_embed(bp, srta->featureflags))
+ return;
+
+ zbookmark_phys_t zb = {
+ .zb_objset = dmu_objset_id(os),
+ .zb_object = range->object,
+ .zb_level = 0,
+ .zb_blkid = range->start_blkid,
+ };
+
+ arc_flags_t aflags = ARC_FLAG_CACHED_ONLY;
+
+ int arc_err = arc_read(NULL, os->os_spa, bp,
+ arc_getbuf_func, &srdp->abuf, ZIO_PRIORITY_ASYNC_READ,
+ zioflags, &aflags, &zb);
+ /*
+ * If the data is not already cached in the ARC, we read directly
+ * from zio. This avoids the performance overhead of adding a new
+ * entry to the ARC, and we also avoid polluting the ARC cache with
+ * data that is not likely to be used in the future.
+ */
+ if (arc_err != 0) {
+ srdp->abd = abd_alloc_linear(srdp->datasz, B_FALSE);
+ srdp->io_outstanding = B_TRUE;
+ zio_nowait(zio_read(NULL, os->os_spa, bp, srdp->abd,
+ srdp->datasz, dmu_send_read_done, range,
+ ZIO_PRIORITY_ASYNC_READ, zioflags, &zb));
+ }
+}
+
+/*
+ * Create a new record with the given values.
+ */
+static void
+enqueue_range(struct send_reader_thread_arg *srta, bqueue_t *q, dnode_t *dn,
+ uint64_t blkid, uint64_t count, const blkptr_t *bp, uint32_t datablksz)
+{
+ enum type range_type = (bp == NULL || BP_IS_HOLE(bp) ? HOLE :
+ (BP_IS_REDACTED(bp) ? REDACT : DATA));
+
+ struct send_range *range = range_alloc(range_type, dn->dn_object,
+ blkid, blkid + count, B_FALSE);
+
+ if (blkid == DMU_SPILL_BLKID)
+ ASSERT3U(BP_GET_TYPE(bp), ==, DMU_OT_SA);
+
+ switch (range_type) {
+ case HOLE:
+ range->sru.hole.datablksz = datablksz;
+ break;
+ case DATA:
+ ASSERT3U(count, ==, 1);
+ range->sru.data.datablksz = datablksz;
+ range->sru.data.obj_type = dn->dn_type;
+ range->sru.data.bp = *bp;
+ issue_data_read(srta, range);
+ break;
+ case REDACT:
+ range->sru.redact.datablksz = datablksz;
+ break;
+ default:
+ break;
+ }
+ bqueue_enqueue(q, range, datablksz);
+}
+
+/*
+ * This thread is responsible for two things: First, it retrieves the correct
+ * blkptr in the to ds if we need to send the data because of something from
+ * the from thread. As a result of this, we're the first ones to discover that
+ * some indirect blocks can be discarded because they're not holes. Second,
+ * it issues prefetches for the data we need to send.
+ */
+static void
+send_reader_thread(void *arg)
+{
+ struct send_reader_thread_arg *srta = arg;
+ struct send_merge_thread_arg *smta = srta->smta;
+ bqueue_t *inq = &smta->q;
+ bqueue_t *outq = &srta->q;
+ objset_t *os = smta->os;
+ fstrans_cookie_t cookie = spl_fstrans_mark();
+ struct send_range *range = bqueue_dequeue(inq);
+ int err = 0;
+
+ /*
+ * If the record we're analyzing is from a redaction bookmark from the
+ * fromds, then we need to know whether or not it exists in the tods so
+ * we know whether to create records for it or not. If it does, we need
+ * the datablksz so we can generate an appropriate record for it.
+ * Finally, if it isn't redacted, we need the blkptr so that we can send
+ * a WRITE record containing the actual data.
+ */
+ uint64_t last_obj = UINT64_MAX;
+ uint64_t last_obj_exists = B_TRUE;
+ while (!range->eos_marker && !srta->cancel && smta->error == 0 &&
+ err == 0) {
+ switch (range->type) {
+ case DATA:
+ issue_data_read(srta, range);
+ bqueue_enqueue(outq, range, range->sru.data.datablksz);
+ range = get_next_range_nofree(inq, range);
+ break;
+ case HOLE:
+ case OBJECT:
+ case OBJECT_RANGE:
+ case REDACT: // Redacted blocks must exist
+ bqueue_enqueue(outq, range, sizeof (*range));
+ range = get_next_range_nofree(inq, range);
+ break;
+ case PREVIOUSLY_REDACTED: {
+ /*
+ * This entry came from the "from bookmark" when
+ * sending from a bookmark that has a redaction
+ * list. We need to check if this object/blkid
+ * exists in the target ("to") dataset, and if
+ * not then we drop this entry. We also need
+ * to fill in the block pointer so that we know
+ * what to prefetch.
+ *
+ * To accomplish the above, we first cache whether or
+ * not the last object we examined exists. If it
+ * doesn't, we can drop this record. If it does, we hold
+ * the dnode and use it to call dbuf_dnode_findbp. We do
+ * this instead of dbuf_bookmark_findbp because we will
+ * often operate on large ranges, and holding the dnode
+ * once is more efficient.
+ */
+ boolean_t object_exists = B_TRUE;
+ /*
+ * If the data is redacted, we only care if it exists,
+ * so that we don't send records for objects that have
+ * been deleted.
+ */
+ dnode_t *dn;
+ if (range->object == last_obj && !last_obj_exists) {
+ /*
+ * If we're still examining the same object as
+ * previously, and it doesn't exist, we don't
+ * need to call dbuf_bookmark_findbp.
+ */
+ object_exists = B_FALSE;
+ } else {
+ err = dnode_hold(os, range->object, FTAG, &dn);
+ if (err == ENOENT) {
+ object_exists = B_FALSE;
+ err = 0;
+ }
+ last_obj = range->object;
+ last_obj_exists = object_exists;
+ }
+
+ if (err != 0) {
+ break;
+ } else if (!object_exists) {
+ /*
+ * The block was modified, but doesn't
+ * exist in the to dataset; if it was
+ * deleted in the to dataset, then we'll
+ * visit the hole bp for it at some point.
+ */
+ range = get_next_range(inq, range);
+ continue;
+ }
+ uint64_t file_max =
+ (dn->dn_maxblkid < range->end_blkid ?
+ dn->dn_maxblkid : range->end_blkid);
+ /*
+ * The object exists, so we need to try to find the
+ * blkptr for each block in the range we're processing.
+ */
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ for (uint64_t blkid = range->start_blkid;
+ blkid < file_max; blkid++) {
+ blkptr_t bp;
+ uint32_t datablksz =
+ dn->dn_phys->dn_datablkszsec <<
+ SPA_MINBLOCKSHIFT;
+ uint64_t offset = blkid * datablksz;
+ /*
+ * This call finds the next non-hole block in
+ * the object. This is to prevent a
+ * performance problem where we're unredacting
+ * a large hole. Using dnode_next_offset to
+ * skip over the large hole avoids iterating
+ * over every block in it.
+ */
+ err = dnode_next_offset(dn, DNODE_FIND_HAVELOCK,
+ &offset, 1, 1, 0);
+ if (err == ESRCH) {
+ offset = UINT64_MAX;
+ err = 0;
+ } else if (err != 0) {
+ break;
+ }
+ if (offset != blkid * datablksz) {
+ /*
+ * There is a hole from here (blkid)
+ * up to offset.
+ */
+ offset = MIN(offset, file_max *
+ datablksz);
+ uint64_t nblks = (offset / datablksz) -
+ blkid;
+ enqueue_range(srta, outq, dn, blkid,
+ nblks, NULL, datablksz);
+ blkid += nblks;
+ }
+ if (blkid >= file_max)
+ break;
+ err = dbuf_dnode_findbp(dn, 0, blkid, &bp,
+ NULL, NULL);
+ if (err != 0)
+ break;
+ ASSERT(!BP_IS_HOLE(&bp));
+ enqueue_range(srta, outq, dn, blkid, 1, &bp,
+ datablksz);
+ }
+ rw_exit(&dn->dn_struct_rwlock);
+ dnode_rele(dn, FTAG);
+ range = get_next_range(inq, range);
+ }
+ }
+ }
+ if (srta->cancel || err != 0) {
+ smta->cancel = B_TRUE;
+ srta->error = err;
+ } else if (smta->error != 0) {
+ srta->error = smta->error;
+ }
+ while (!range->eos_marker)
+ range = get_next_range(inq, range);
+
+ bqueue_enqueue_flush(outq, range, 1);
+ spl_fstrans_unmark(cookie);
+ thread_exit();
+}
+
+#define NUM_SNAPS_NOT_REDACTED UINT64_MAX
+
+struct dmu_send_params {
+ /* Pool args */
+ void *tag; // Tag that dp was held with, will be used to release dp.
+ dsl_pool_t *dp;
+ /* To snapshot args */
+ const char *tosnap;
+ dsl_dataset_t *to_ds;
+ /* From snapshot args */
+ zfs_bookmark_phys_t ancestor_zb;
+ uint64_t *fromredactsnaps;
+ /* NUM_SNAPS_NOT_REDACTED if not sending from redaction bookmark */
+ uint64_t numfromredactsnaps;
+ /* Stream params */
+ boolean_t is_clone;
+ boolean_t embedok;
+ boolean_t large_block_ok;
+ boolean_t compressok;
+ boolean_t rawok;
+ boolean_t savedok;
+ uint64_t resumeobj;
+ uint64_t resumeoff;
+ uint64_t saved_guid;
+ zfs_bookmark_phys_t *redactbook;
+ /* Stream output params */
+ dmu_send_outparams_t *dso;
+
+ /* Stream progress params */
+ offset_t *off;
+ int outfd;
+ char saved_toname[MAXNAMELEN];
+};
+
+static int
+setup_featureflags(struct dmu_send_params *dspp, objset_t *os,
+ uint64_t *featureflags)
+{
+ dsl_dataset_t *to_ds = dspp->to_ds;
+ dsl_pool_t *dp = dspp->dp;
+#ifdef _KERNEL
+ if (dmu_objset_type(os) == DMU_OST_ZFS) {
+ uint64_t version;
+ if (zfs_get_zplprop(os, ZFS_PROP_VERSION, &version) != 0)
+ return (SET_ERROR(EINVAL));
+
+ if (version >= ZPL_VERSION_SA)
+ *featureflags |= DMU_BACKUP_FEATURE_SA_SPILL;
+ }
+#endif
+
+ /* raw sends imply large_block_ok */
+ if ((dspp->rawok || dspp->large_block_ok) &&
+ dsl_dataset_feature_is_active(to_ds, SPA_FEATURE_LARGE_BLOCKS)) {
+ *featureflags |= DMU_BACKUP_FEATURE_LARGE_BLOCKS;
+ }
+
+ /* encrypted datasets will not have embedded blocks */
+ if ((dspp->embedok || dspp->rawok) && !os->os_encrypted &&
+ spa_feature_is_active(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA)) {
+ *featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA;
+ }
+
+ /* raw send implies compressok */
+ if (dspp->compressok || dspp->rawok)
+ *featureflags |= DMU_BACKUP_FEATURE_COMPRESSED;
+
+ if (dspp->rawok && os->os_encrypted)
+ *featureflags |= DMU_BACKUP_FEATURE_RAW;
+
+ if ((*featureflags &
+ (DMU_BACKUP_FEATURE_EMBED_DATA | DMU_BACKUP_FEATURE_COMPRESSED |
+ DMU_BACKUP_FEATURE_RAW)) != 0 &&
+ spa_feature_is_active(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS)) {
+ *featureflags |= DMU_BACKUP_FEATURE_LZ4;
+ }
+
+ /*
+ * We specifically do not include DMU_BACKUP_FEATURE_EMBED_DATA here to
+ * allow sending ZSTD compressed datasets to a receiver that does not
+ * support ZSTD
+ */
+ if ((*featureflags &
+ (DMU_BACKUP_FEATURE_COMPRESSED | DMU_BACKUP_FEATURE_RAW)) != 0 &&
+ dsl_dataset_feature_is_active(to_ds, SPA_FEATURE_ZSTD_COMPRESS)) {
+ *featureflags |= DMU_BACKUP_FEATURE_ZSTD;
+ }
+
+ if (dspp->resumeobj != 0 || dspp->resumeoff != 0) {
+ *featureflags |= DMU_BACKUP_FEATURE_RESUMING;
+ }
+
+ if (dspp->redactbook != NULL) {
+ *featureflags |= DMU_BACKUP_FEATURE_REDACTED;
+ }
+
+ if (dsl_dataset_feature_is_active(to_ds, SPA_FEATURE_LARGE_DNODE)) {
+ *featureflags |= DMU_BACKUP_FEATURE_LARGE_DNODE;
+ }
+ return (0);
+}
+
+static dmu_replay_record_t *
+create_begin_record(struct dmu_send_params *dspp, objset_t *os,
+ uint64_t featureflags)
+{
+ dmu_replay_record_t *drr = kmem_zalloc(sizeof (dmu_replay_record_t),
+ KM_SLEEP);
+ drr->drr_type = DRR_BEGIN;
+
+ struct drr_begin *drrb = &drr->drr_u.drr_begin;
+ dsl_dataset_t *to_ds = dspp->to_ds;
+
+ drrb->drr_magic = DMU_BACKUP_MAGIC;
+ drrb->drr_creation_time = dsl_dataset_phys(to_ds)->ds_creation_time;
+ drrb->drr_type = dmu_objset_type(os);
+ drrb->drr_toguid = dsl_dataset_phys(to_ds)->ds_guid;
+ drrb->drr_fromguid = dspp->ancestor_zb.zbm_guid;
+
+ DMU_SET_STREAM_HDRTYPE(drrb->drr_versioninfo, DMU_SUBSTREAM);
+ DMU_SET_FEATUREFLAGS(drrb->drr_versioninfo, featureflags);
+
+ if (dspp->is_clone)
+ drrb->drr_flags |= DRR_FLAG_CLONE;
+ if (dsl_dataset_phys(dspp->to_ds)->ds_flags & DS_FLAG_CI_DATASET)
+ drrb->drr_flags |= DRR_FLAG_CI_DATA;
+ if (zfs_send_set_freerecords_bit)
+ drrb->drr_flags |= DRR_FLAG_FREERECORDS;
+ drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_SPILL_BLOCK;
+
+ if (dspp->savedok) {
+ drrb->drr_toguid = dspp->saved_guid;
+ strlcpy(drrb->drr_toname, dspp->saved_toname,
+ sizeof (drrb->drr_toname));
+ } else {
+ dsl_dataset_name(to_ds, drrb->drr_toname);
+ if (!to_ds->ds_is_snapshot) {
+ (void) strlcat(drrb->drr_toname, "@--head--",
+ sizeof (drrb->drr_toname));
+ }
+ }
+ return (drr);
+}
+
+static void
+setup_to_thread(struct send_thread_arg *to_arg, objset_t *to_os,
+ dmu_sendstatus_t *dssp, uint64_t fromtxg, boolean_t rawok)
+{
+ VERIFY0(bqueue_init(&to_arg->q, zfs_send_no_prefetch_queue_ff,
+ MAX(zfs_send_no_prefetch_queue_length, 2 * zfs_max_recordsize),
+ offsetof(struct send_range, ln)));
+ to_arg->error_code = 0;
+ to_arg->cancel = B_FALSE;
+ to_arg->os = to_os;
+ to_arg->fromtxg = fromtxg;
+ to_arg->flags = TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA;
+ if (rawok)
+ to_arg->flags |= TRAVERSE_NO_DECRYPT;
+ to_arg->num_blocks_visited = &dssp->dss_blocks;
+ (void) thread_create(NULL, 0, send_traverse_thread, to_arg, 0,
+ curproc, TS_RUN, minclsyspri);
+}
+
+static void
+setup_from_thread(struct redact_list_thread_arg *from_arg,
+ redaction_list_t *from_rl, dmu_sendstatus_t *dssp)
+{
+ VERIFY0(bqueue_init(&from_arg->q, zfs_send_no_prefetch_queue_ff,
+ MAX(zfs_send_no_prefetch_queue_length, 2 * zfs_max_recordsize),
+ offsetof(struct send_range, ln)));
+ from_arg->error_code = 0;
+ from_arg->cancel = B_FALSE;
+ from_arg->rl = from_rl;
+ from_arg->mark_redact = B_FALSE;
+ from_arg->num_blocks_visited = &dssp->dss_blocks;
+ /*
+ * If from_rl is NULL, redact_list_thread just enqueues an eos marker
+ * and exits.
+ */
+ (void) thread_create(NULL, 0, redact_list_thread, from_arg, 0,
+ curproc, TS_RUN, minclsyspri);
+}
+
+static void
+setup_redact_list_thread(struct redact_list_thread_arg *rlt_arg,
+ struct dmu_send_params *dspp, redaction_list_t *rl, dmu_sendstatus_t *dssp)
+{
+ if (dspp->redactbook == NULL)
+ return;
+
+ rlt_arg->cancel = B_FALSE;
+ VERIFY0(bqueue_init(&rlt_arg->q, zfs_send_no_prefetch_queue_ff,
+ MAX(zfs_send_no_prefetch_queue_length, 2 * zfs_max_recordsize),
+ offsetof(struct send_range, ln)));
+ rlt_arg->error_code = 0;
+ rlt_arg->mark_redact = B_TRUE;
+ rlt_arg->rl = rl;
+ rlt_arg->num_blocks_visited = &dssp->dss_blocks;
+
+ (void) thread_create(NULL, 0, redact_list_thread, rlt_arg, 0,
+ curproc, TS_RUN, minclsyspri);
+}
+
+static void
+setup_merge_thread(struct send_merge_thread_arg *smt_arg,
+ struct dmu_send_params *dspp, struct redact_list_thread_arg *from_arg,
+ struct send_thread_arg *to_arg, struct redact_list_thread_arg *rlt_arg,
+ objset_t *os)
+{
+ VERIFY0(bqueue_init(&smt_arg->q, zfs_send_no_prefetch_queue_ff,
+ MAX(zfs_send_no_prefetch_queue_length, 2 * zfs_max_recordsize),
+ offsetof(struct send_range, ln)));
+ smt_arg->cancel = B_FALSE;
+ smt_arg->error = 0;
+ smt_arg->from_arg = from_arg;
+ smt_arg->to_arg = to_arg;
+ if (dspp->redactbook != NULL)
+ smt_arg->redact_arg = rlt_arg;
+
+ smt_arg->os = os;
+ (void) thread_create(NULL, 0, send_merge_thread, smt_arg, 0, curproc,
+ TS_RUN, minclsyspri);
+}
+
+static void
+setup_reader_thread(struct send_reader_thread_arg *srt_arg,
+ struct dmu_send_params *dspp, struct send_merge_thread_arg *smt_arg,
+ uint64_t featureflags)
+{
+ VERIFY0(bqueue_init(&srt_arg->q, zfs_send_queue_ff,
+ MAX(zfs_send_queue_length, 2 * zfs_max_recordsize),
+ offsetof(struct send_range, ln)));
+ srt_arg->smta = smt_arg;
+ srt_arg->issue_reads = !dspp->dso->dso_dryrun;
+ srt_arg->featureflags = featureflags;
+ (void) thread_create(NULL, 0, send_reader_thread, srt_arg, 0,
+ curproc, TS_RUN, minclsyspri);
+}
+
+static int
+setup_resume_points(struct dmu_send_params *dspp,
+ struct send_thread_arg *to_arg, struct redact_list_thread_arg *from_arg,
+ struct redact_list_thread_arg *rlt_arg,
+ struct send_merge_thread_arg *smt_arg, boolean_t resuming, objset_t *os,
+ redaction_list_t *redact_rl, nvlist_t *nvl)
+{
+ dsl_dataset_t *to_ds = dspp->to_ds;
+ int err = 0;
+
+ uint64_t obj = 0;
+ uint64_t blkid = 0;
+ if (resuming) {
+ obj = dspp->resumeobj;
+ dmu_object_info_t to_doi;
+ err = dmu_object_info(os, obj, &to_doi);
+ if (err != 0)
+ return (err);
+
+ blkid = dspp->resumeoff / to_doi.doi_data_block_size;
+ }
+ /*
+ * If we're resuming a redacted send, we can skip to the appropriate
+ * point in the redaction bookmark by binary searching through it.
+ */
+ if (redact_rl != NULL) {
+ SET_BOOKMARK(&rlt_arg->resume, to_ds->ds_object, obj, 0, blkid);
+ }
+
+ SET_BOOKMARK(&to_arg->resume, to_ds->ds_object, obj, 0, blkid);
+ if (nvlist_exists(nvl, BEGINNV_REDACT_FROM_SNAPS)) {
+ uint64_t objset = dspp->ancestor_zb.zbm_redaction_obj;
+ /*
+ * Note: If the resume point is in an object whose
+ * blocksize is different in the from vs to snapshots,
+ * we will have divided by the "wrong" blocksize.
+ * However, in this case fromsnap's send_cb() will
+ * detect that the blocksize has changed and therefore
+ * ignore this object.
+ *
+ * If we're resuming a send from a redaction bookmark,
+ * we still cannot accidentally suggest blocks behind
+ * the to_ds. In addition, we know that any blocks in
+ * the object in the to_ds will have to be sent, since
+ * the size changed. Therefore, we can't cause any harm
+ * this way either.
+ */
+ SET_BOOKMARK(&from_arg->resume, objset, obj, 0, blkid);
+ }
+ if (resuming) {
+ fnvlist_add_uint64(nvl, BEGINNV_RESUME_OBJECT, dspp->resumeobj);
+ fnvlist_add_uint64(nvl, BEGINNV_RESUME_OFFSET, dspp->resumeoff);
+ }
+ return (0);
+}
+
+static dmu_sendstatus_t *
+setup_send_progress(struct dmu_send_params *dspp)
+{
+ dmu_sendstatus_t *dssp = kmem_zalloc(sizeof (*dssp), KM_SLEEP);
+ dssp->dss_outfd = dspp->outfd;
+ dssp->dss_off = dspp->off;
+ dssp->dss_proc = curproc;
+ mutex_enter(&dspp->to_ds->ds_sendstream_lock);
+ list_insert_head(&dspp->to_ds->ds_sendstreams, dssp);
+ mutex_exit(&dspp->to_ds->ds_sendstream_lock);
+ return (dssp);
+}
+
+/*
+ * Actually do the bulk of the work in a zfs send.
+ *
+ * The idea is that we want to do a send from ancestor_zb to to_ds. We also
+ * want to not send any data that has been modified by all the datasets in
+ * redactsnaparr, and store the list of blocks that are redacted in this way in
+ * a bookmark named redactbook, created on the to_ds. We do this by creating
+ * several worker threads, whose function is described below.
+ *
+ * There are three cases.
+ * The first case is a redacted zfs send. In this case there are 5 threads.
+ * The first thread is the to_ds traversal thread: it calls dataset_traverse on
+ * the to_ds and finds all the blocks that have changed since ancestor_zb (if
+ * it's a full send, that's all blocks in the dataset). It then sends those
+ * blocks on to the send merge thread. The redact list thread takes the data
+ * from the redaction bookmark and sends those blocks on to the send merge
+ * thread. The send merge thread takes the data from the to_ds traversal
+ * thread, and combines it with the redaction records from the redact list
+ * thread. If a block appears in both the to_ds's data and the redaction data,
+ * the send merge thread will mark it as redacted and send it on to the prefetch
+ * thread. Otherwise, the send merge thread will send the block on to the
+ * prefetch thread unchanged. The prefetch thread will issue prefetch reads for
+ * any data that isn't redacted, and then send the data on to the main thread.
+ * The main thread behaves the same as in a normal send case, issuing demand
+ * reads for data blocks and sending out records over the network.
+ *
+ * The graphic below diagrams the flow of data in the case of a redacted zfs
+ * send. Each box represents a thread, and each line represents the flow of
+ * data.
+ *
+ * Records from the |
+ * redaction bookmark |
+ * +--------------------+ | +---------------------------+
+ * | | v | Send Merge Thread |
+ * | Redact List Thread +----------> Apply redaction marks to |
+ * | | | records as specified by |
+ * +--------------------+ | redaction ranges |
+ * +----^---------------+------+
+ * | | Merged data
+ * | |
+ * | +------------v--------+
+ * | | Prefetch Thread |
+ * +--------------------+ | | Issues prefetch |
+ * | to_ds Traversal | | | reads of data blocks|
+ * | Thread (finds +---------------+ +------------+--------+
+ * | candidate blocks) | Blocks modified | Prefetched data
+ * +--------------------+ by to_ds since |
+ * ancestor_zb +------------v----+
+ * | Main Thread | File Descriptor
+ * | Sends data over +->(to zfs receive)
+ * | wire |
+ * +-----------------+
+ *
+ * The second case is an incremental send from a redaction bookmark. The to_ds
+ * traversal thread and the main thread behave the same as in the redacted
+ * send case. The new thread is the from bookmark traversal thread. It
+ * iterates over the redaction list in the redaction bookmark, and enqueues
+ * records for each block that was redacted in the original send. The send
+ * merge thread now has to merge the data from the two threads. For details
+ * about that process, see the header comment of send_merge_thread(). Any data
+ * it decides to send on will be prefetched by the prefetch thread. Note that
+ * you can perform a redacted send from a redaction bookmark; in that case,
+ * the data flow behaves very similarly to the flow in the redacted send case,
+ * except with the addition of the bookmark traversal thread iterating over the
+ * redaction bookmark. The send_merge_thread also has to take on the
+ * responsibility of merging the redact list thread's records, the bookmark
+ * traversal thread's records, and the to_ds records.
+ *
+ * +---------------------+
+ * | |
+ * | Redact List Thread +--------------+
+ * | | |
+ * +---------------------+ |
+ * Blocks in redaction list | Ranges modified by every secure snap
+ *             of from bookmark           |    (or EOS if not redacted)
+ * |
+ * +---------------------+ | +----v----------------------+
+ * | bookmark Traversal | v | Send Merge Thread |
+ * | Thread (finds +---------> Merges bookmark, rlt, and |
+ * | candidate blocks) | | to_ds send records |
+ * +---------------------+ +----^---------------+------+
+ * | | Merged data
+ * | +------------v--------+
+ * | | Prefetch Thread |
+ * +--------------------+ | | Issues prefetch |
+ * | to_ds Traversal | | | reads of data blocks|
+ * | Thread (finds +---------------+ +------------+--------+
+ * | candidate blocks) | Blocks modified | Prefetched data
+ * +--------------------+ by to_ds since +------------v----+
+ * ancestor_zb | Main Thread | File Descriptor
+ * | Sends data over +->(to zfs receive)
+ * | wire |
+ * +-----------------+
+ *
+ * The final case is a simple zfs full or incremental send. The to_ds traversal
+ * thread behaves the same as always. The redact list thread is never started.
+ * The send merge thread takes all the blocks that the to_ds traversal thread
+ * sends it, prefetches the data, and sends the blocks on to the main thread.
+ * The main thread sends the data over the wire.
+ *
+ * To keep performance acceptable, we want to prefetch the data in the worker
+ * threads. While the to_ds thread could simply use the TRAVERSE_PREFETCH
+ * feature built into traverse_dataset, the combining and deletion of records
+ * due to redaction and sends from redaction bookmarks mean that we could
+ * issue many unnecessary prefetches. As a result, we only prefetch data
+ * after we've determined that the record is not going to be redacted. To
+ * prevent the prefetching from getting too far ahead of the main thread, the
+ * blocking queues that are used for communication are capped not by the
+ * number of entries in the queue, but by the sum of the size of the
+ * prefetches associated with them. The limit on the amount of data that the
+ * thread can prefetch beyond what the main thread has reached is controlled
+ * by the global variable zfs_send_queue_length. In addition, to prevent poor
+ * performance at the beginning of a send, we also limit how far ahead the
+ * traversal threads can get. That distance is controlled by the
+ * zfs_send_no_prefetch_queue_length tunable.
+ *
+ * Note: Releases dp using the specified tag.
+ */
+static int
+dmu_send_impl(struct dmu_send_params *dspp)
+{
+ objset_t *os;
+ dmu_replay_record_t *drr;
+ dmu_sendstatus_t *dssp;
+ dmu_send_cookie_t dsc = {0};
+ int err;
+ uint64_t fromtxg = dspp->ancestor_zb.zbm_creation_txg;
+ uint64_t featureflags = 0;
+ struct redact_list_thread_arg *from_arg;
+ struct send_thread_arg *to_arg;
+ struct redact_list_thread_arg *rlt_arg;
+ struct send_merge_thread_arg *smt_arg;
+ struct send_reader_thread_arg *srt_arg;
+ struct send_range *range;
+ redaction_list_t *from_rl = NULL;
+ redaction_list_t *redact_rl = NULL;
+ boolean_t resuming = (dspp->resumeobj != 0 || dspp->resumeoff != 0);
+ boolean_t book_resuming = resuming;
+
+ dsl_dataset_t *to_ds = dspp->to_ds;
+ zfs_bookmark_phys_t *ancestor_zb = &dspp->ancestor_zb;
+ dsl_pool_t *dp = dspp->dp;
+ void *tag = dspp->tag;
+
+ err = dmu_objset_from_ds(to_ds, &os);
+ if (err != 0) {
+ dsl_pool_rele(dp, tag);
+ return (err);
+ }
+
+ /*
+ * If this is a non-raw send of an encrypted ds, we can ensure that
+ * the objset_phys_t is authenticated. This is safe because this is
+ * either a snapshot or we have owned the dataset, ensuring that
+ * it can't be modified.
+ */
+ if (!dspp->rawok && os->os_encrypted &&
+ arc_is_unauthenticated(os->os_phys_buf)) {
+ zbookmark_phys_t zb;
+
+ SET_BOOKMARK(&zb, to_ds->ds_object, ZB_ROOT_OBJECT,
+ ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
+ err = arc_untransform(os->os_phys_buf, os->os_spa,
+ &zb, B_FALSE);
+ if (err != 0) {
+ dsl_pool_rele(dp, tag);
+ return (err);
+ }
+
+ ASSERT0(arc_is_unauthenticated(os->os_phys_buf));
+ }
+
+ if ((err = setup_featureflags(dspp, os, &featureflags)) != 0) {
+ dsl_pool_rele(dp, tag);
+ return (err);
+ }
+
+ /*
+ * If we're doing a redacted send, hold the bookmark's redaction list.
+ */
+ if (dspp->redactbook != NULL) {
+ err = dsl_redaction_list_hold_obj(dp,
+ dspp->redactbook->zbm_redaction_obj, FTAG,
+ &redact_rl);
+ if (err != 0) {
+ dsl_pool_rele(dp, tag);
+ return (SET_ERROR(EINVAL));
+ }
+ dsl_redaction_list_long_hold(dp, redact_rl, FTAG);
+ }
+
+ /*
+ * If we're sending from a redaction bookmark, hold the redaction list
+ * so that we can consider sending the redacted blocks.
+ */
+ if (ancestor_zb->zbm_redaction_obj != 0) {
+ err = dsl_redaction_list_hold_obj(dp,
+ ancestor_zb->zbm_redaction_obj, FTAG, &from_rl);
+ if (err != 0) {
+ if (redact_rl != NULL) {
+ dsl_redaction_list_long_rele(redact_rl, FTAG);
+ dsl_redaction_list_rele(redact_rl, FTAG);
+ }
+ dsl_pool_rele(dp, tag);
+ return (SET_ERROR(EINVAL));
+ }
+ dsl_redaction_list_long_hold(dp, from_rl, FTAG);
+ }
+
+ dsl_dataset_long_hold(to_ds, FTAG);
+
+ from_arg = kmem_zalloc(sizeof (*from_arg), KM_SLEEP);
+ to_arg = kmem_zalloc(sizeof (*to_arg), KM_SLEEP);
+ rlt_arg = kmem_zalloc(sizeof (*rlt_arg), KM_SLEEP);
+ smt_arg = kmem_zalloc(sizeof (*smt_arg), KM_SLEEP);
+ srt_arg = kmem_zalloc(sizeof (*srt_arg), KM_SLEEP);
+
+ drr = create_begin_record(dspp, os, featureflags);
+ dssp = setup_send_progress(dspp);
+
+ dsc.dsc_drr = drr;
+ dsc.dsc_dso = dspp->dso;
+ dsc.dsc_os = os;
+ dsc.dsc_off = dspp->off;
+ dsc.dsc_toguid = dsl_dataset_phys(to_ds)->ds_guid;
+ dsc.dsc_fromtxg = fromtxg;
+ dsc.dsc_pending_op = PENDING_NONE;
+ dsc.dsc_featureflags = featureflags;
+ dsc.dsc_resume_object = dspp->resumeobj;
+ dsc.dsc_resume_offset = dspp->resumeoff;
+
+ dsl_pool_rele(dp, tag);
+
+ void *payload = NULL;
+ size_t payload_len = 0;
+ nvlist_t *nvl = fnvlist_alloc();
+
+ /*
+	 * If we're doing a redacted send, include the snapshots the send is
+	 * redacted with respect to, so that the target system knows which
+	 * send streams can be correctly received on top of this dataset. If
+	 * we're instead sending a redacted dataset, include the snapshots
+	 * that the dataset was created with respect to.
+ */
+ if (dspp->redactbook != NULL) {
+ fnvlist_add_uint64_array(nvl, BEGINNV_REDACT_SNAPS,
+ redact_rl->rl_phys->rlp_snaps,
+ redact_rl->rl_phys->rlp_num_snaps);
+ } else if (dsl_dataset_feature_is_active(to_ds,
+ SPA_FEATURE_REDACTED_DATASETS)) {
+ uint64_t *tods_guids;
+ uint64_t length;
+ VERIFY(dsl_dataset_get_uint64_array_feature(to_ds,
+ SPA_FEATURE_REDACTED_DATASETS, &length, &tods_guids));
+ fnvlist_add_uint64_array(nvl, BEGINNV_REDACT_SNAPS, tods_guids,
+ length);
+ }
+
+ /*
+ * If we're sending from a redaction bookmark, then we should retrieve
+ * the guids of that bookmark so we can send them over the wire.
+ */
+ if (from_rl != NULL) {
+ fnvlist_add_uint64_array(nvl, BEGINNV_REDACT_FROM_SNAPS,
+ from_rl->rl_phys->rlp_snaps,
+ from_rl->rl_phys->rlp_num_snaps);
+ }
+
+ /*
+ * If the snapshot we're sending from is redacted, include the redaction
+ * list in the stream.
+ */
+ if (dspp->numfromredactsnaps != NUM_SNAPS_NOT_REDACTED) {
+ ASSERT3P(from_rl, ==, NULL);
+ fnvlist_add_uint64_array(nvl, BEGINNV_REDACT_FROM_SNAPS,
+ dspp->fromredactsnaps, (uint_t)dspp->numfromredactsnaps);
+ if (dspp->numfromredactsnaps > 0) {
+ kmem_free(dspp->fromredactsnaps,
+ dspp->numfromredactsnaps * sizeof (uint64_t));
+ dspp->fromredactsnaps = NULL;
+ }
+ }
+
+ if (resuming || book_resuming) {
+ err = setup_resume_points(dspp, to_arg, from_arg,
+ rlt_arg, smt_arg, resuming, os, redact_rl, nvl);
+ if (err != 0)
+ goto out;
+ }
+
+ if (featureflags & DMU_BACKUP_FEATURE_RAW) {
+ uint64_t ivset_guid = (ancestor_zb != NULL) ?
+ ancestor_zb->zbm_ivset_guid : 0;
+ nvlist_t *keynvl = NULL;
+ ASSERT(os->os_encrypted);
+
+ err = dsl_crypto_populate_key_nvlist(os, ivset_guid,
+ &keynvl);
+ if (err != 0) {
+ fnvlist_free(nvl);
+ goto out;
+ }
+
+ fnvlist_add_nvlist(nvl, "crypt_keydata", keynvl);
+ fnvlist_free(keynvl);
+ }
+
+ if (!nvlist_empty(nvl)) {
+ payload = fnvlist_pack(nvl, &payload_len);
+ drr->drr_payloadlen = payload_len;
+ }
+
+ fnvlist_free(nvl);
+ err = dump_record(&dsc, payload, payload_len);
+ fnvlist_pack_free(payload, payload_len);
+ if (err != 0) {
+ err = dsc.dsc_err;
+ goto out;
+ }
+
+ setup_to_thread(to_arg, os, dssp, fromtxg, dspp->rawok);
+ setup_from_thread(from_arg, from_rl, dssp);
+ setup_redact_list_thread(rlt_arg, dspp, redact_rl, dssp);
+ setup_merge_thread(smt_arg, dspp, from_arg, to_arg, rlt_arg, os);
+ setup_reader_thread(srt_arg, dspp, smt_arg, featureflags);
+
+ range = bqueue_dequeue(&srt_arg->q);
+ while (err == 0 && !range->eos_marker) {
+ err = do_dump(&dsc, range);
+ range = get_next_range(&srt_arg->q, range);
+ if (issig(JUSTLOOKING) && issig(FORREAL))
+ err = SET_ERROR(EINTR);
+ }
+
+ /*
+ * If we hit an error or are interrupted, cancel our worker threads and
+ * clear the queue of any pending records. The threads will pass the
+ * cancel up the tree of worker threads, and each one will clean up any
+ * pending records before exiting.
+ */
+ if (err != 0) {
+ srt_arg->cancel = B_TRUE;
+ while (!range->eos_marker) {
+ range = get_next_range(&srt_arg->q, range);
+ }
+ }
+ range_free(range);
+
+ bqueue_destroy(&srt_arg->q);
+ bqueue_destroy(&smt_arg->q);
+ if (dspp->redactbook != NULL)
+ bqueue_destroy(&rlt_arg->q);
+ bqueue_destroy(&to_arg->q);
+ bqueue_destroy(&from_arg->q);
+
+ if (err == 0 && srt_arg->error != 0)
+ err = srt_arg->error;
+
+ if (err != 0)
+ goto out;
+
+ if (dsc.dsc_pending_op != PENDING_NONE)
+ if (dump_record(&dsc, NULL, 0) != 0)
+ err = SET_ERROR(EINTR);
+
+ if (err != 0) {
+ if (err == EINTR && dsc.dsc_err != 0)
+ err = dsc.dsc_err;
+ goto out;
+ }
+
+ /*
+ * Send the DRR_END record if this is not a saved stream.
+ * Otherwise, the omitted DRR_END record will signal to
+ * the receive side that the stream is incomplete.
+ */
+ if (!dspp->savedok) {
+ bzero(drr, sizeof (dmu_replay_record_t));
+ drr->drr_type = DRR_END;
+ drr->drr_u.drr_end.drr_checksum = dsc.dsc_zc;
+ drr->drr_u.drr_end.drr_toguid = dsc.dsc_toguid;
+
+ if (dump_record(&dsc, NULL, 0) != 0)
+ err = dsc.dsc_err;
+ }
+out:
+ mutex_enter(&to_ds->ds_sendstream_lock);
+ list_remove(&to_ds->ds_sendstreams, dssp);
+ mutex_exit(&to_ds->ds_sendstream_lock);
+
+ VERIFY(err != 0 || (dsc.dsc_sent_begin &&
+ (dsc.dsc_sent_end || dspp->savedok)));
+
+ kmem_free(drr, sizeof (dmu_replay_record_t));
+ kmem_free(dssp, sizeof (dmu_sendstatus_t));
+ kmem_free(from_arg, sizeof (*from_arg));
+ kmem_free(to_arg, sizeof (*to_arg));
+ kmem_free(rlt_arg, sizeof (*rlt_arg));
+ kmem_free(smt_arg, sizeof (*smt_arg));
+ kmem_free(srt_arg, sizeof (*srt_arg));
+
+ dsl_dataset_long_rele(to_ds, FTAG);
+ if (from_rl != NULL) {
+ dsl_redaction_list_long_rele(from_rl, FTAG);
+ dsl_redaction_list_rele(from_rl, FTAG);
+ }
+ if (redact_rl != NULL) {
+ dsl_redaction_list_long_rele(redact_rl, FTAG);
+ dsl_redaction_list_rele(redact_rl, FTAG);
+ }
+
+ return (err);
+}
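+
+/*
+ * Editor's note: the sketch below is not part of OpenZFS.  It is a minimal,
+ * self-contained userspace analogy (POSIX threads, made-up names) of the
+ * pipeline described above dmu_send_impl(): worker threads hand records to
+ * each other through queues whose capacity is measured in bytes rather than
+ * entries, and an end-of-stream marker tears the pipeline down, much as
+ * bqueue_enqueue()/bqueue_dequeue() and the eos_marker range do here.
+ */
+#if 0	/* illustrative sketch only; never compiled */
+#include <pthread.h>
+#include <stdlib.h>
+
+struct item {
+	struct item *next;
+	size_t size;			/* bytes this entry "costs" */
+	int eos;			/* end-of-stream marker */
+};
+
+struct byte_queue {
+	pthread_mutex_t lock;
+	pthread_cond_t cv;
+	struct item *head, *tail;
+	size_t bytes, max_bytes;	/* cap is on bytes, not entries */
+};
+
+static void
+bq_enqueue(struct byte_queue *q, struct item *it)
+{
+	pthread_mutex_lock(&q->lock);
+	while (q->bytes >= q->max_bytes)	/* throttle the producer */
+		pthread_cond_wait(&q->cv, &q->lock);
+	it->next = NULL;
+	if (q->tail != NULL)
+		q->tail->next = it;
+	else
+		q->head = it;
+	q->tail = it;
+	q->bytes += it->size;
+	pthread_cond_broadcast(&q->cv);
+	pthread_mutex_unlock(&q->lock);
+}
+
+static struct item *
+bq_dequeue(struct byte_queue *q)
+{
+	pthread_mutex_lock(&q->lock);
+	while (q->head == NULL)			/* wait for the producer */
+		pthread_cond_wait(&q->cv, &q->lock);
+	struct item *it = q->head;
+	q->head = it->next;
+	if (q->head == NULL)
+		q->tail = NULL;
+	q->bytes -= it->size;
+	pthread_cond_broadcast(&q->cv);
+	pthread_mutex_unlock(&q->lock);
+	return (it);
+}
+
+/* Consumer loop, analogous to the do_dump() loop in dmu_send_impl(). */
+static void *
+consumer(void *arg)
+{
+	struct byte_queue *q = arg;
+	for (;;) {
+		struct item *it = bq_dequeue(q);
+		int done = it->eos;
+		/* ... process the record here ... */
+		free(it);
+		if (done)
+			break;
+	}
+	return (NULL);
+}
+#endif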
+
+int
+dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap,
+ boolean_t embedok, boolean_t large_block_ok, boolean_t compressok,
+ boolean_t rawok, boolean_t savedok, int outfd, offset_t *off,
+ dmu_send_outparams_t *dsop)
+{
+ int err;
+ dsl_dataset_t *fromds;
+ ds_hold_flags_t dsflags;
+ struct dmu_send_params dspp = {0};
+ dspp.embedok = embedok;
+ dspp.large_block_ok = large_block_ok;
+ dspp.compressok = compressok;
+ dspp.outfd = outfd;
+ dspp.off = off;
+ dspp.dso = dsop;
+ dspp.tag = FTAG;
+ dspp.rawok = rawok;
+ dspp.savedok = savedok;
+
+ dsflags = (rawok) ? DS_HOLD_FLAG_NONE : DS_HOLD_FLAG_DECRYPT;
+ err = dsl_pool_hold(pool, FTAG, &dspp.dp);
+ if (err != 0)
+ return (err);
+
+ err = dsl_dataset_hold_obj_flags(dspp.dp, tosnap, dsflags, FTAG,
+ &dspp.to_ds);
+ if (err != 0) {
+ dsl_pool_rele(dspp.dp, FTAG);
+ return (err);
+ }
+
+ if (fromsnap != 0) {
+ err = dsl_dataset_hold_obj_flags(dspp.dp, fromsnap, dsflags,
+ FTAG, &fromds);
+ if (err != 0) {
+ dsl_dataset_rele_flags(dspp.to_ds, dsflags, FTAG);
+ dsl_pool_rele(dspp.dp, FTAG);
+ return (err);
+ }
+ dspp.ancestor_zb.zbm_guid = dsl_dataset_phys(fromds)->ds_guid;
+ dspp.ancestor_zb.zbm_creation_txg =
+ dsl_dataset_phys(fromds)->ds_creation_txg;
+ dspp.ancestor_zb.zbm_creation_time =
+ dsl_dataset_phys(fromds)->ds_creation_time;
+
+ if (dsl_dataset_is_zapified(fromds)) {
+ (void) zap_lookup(dspp.dp->dp_meta_objset,
+ fromds->ds_object, DS_FIELD_IVSET_GUID, 8, 1,
+ &dspp.ancestor_zb.zbm_ivset_guid);
+ }
+
+ /* See dmu_send for the reasons behind this. */
+ uint64_t *fromredact;
+
+ if (!dsl_dataset_get_uint64_array_feature(fromds,
+ SPA_FEATURE_REDACTED_DATASETS,
+ &dspp.numfromredactsnaps,
+ &fromredact)) {
+ dspp.numfromredactsnaps = NUM_SNAPS_NOT_REDACTED;
+ } else if (dspp.numfromredactsnaps > 0) {
+ uint64_t size = dspp.numfromredactsnaps *
+ sizeof (uint64_t);
+ dspp.fromredactsnaps = kmem_zalloc(size, KM_SLEEP);
+ bcopy(fromredact, dspp.fromredactsnaps, size);
+ }
+
+ boolean_t is_before =
+ dsl_dataset_is_before(dspp.to_ds, fromds, 0);
+ dspp.is_clone = (dspp.to_ds->ds_dir !=
+ fromds->ds_dir);
+ dsl_dataset_rele(fromds, FTAG);
+ if (!is_before) {
+ dsl_pool_rele(dspp.dp, FTAG);
+ err = SET_ERROR(EXDEV);
+ } else {
+ err = dmu_send_impl(&dspp);
+ }
+ } else {
+ dspp.numfromredactsnaps = NUM_SNAPS_NOT_REDACTED;
+ err = dmu_send_impl(&dspp);
+ }
+ dsl_dataset_rele(dspp.to_ds, FTAG);
+ return (err);
+}
+
+int
+dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok,
+ boolean_t large_block_ok, boolean_t compressok, boolean_t rawok,
+ boolean_t savedok, uint64_t resumeobj, uint64_t resumeoff,
+ const char *redactbook, int outfd, offset_t *off,
+ dmu_send_outparams_t *dsop)
+{
+ int err = 0;
+ ds_hold_flags_t dsflags;
+ boolean_t owned = B_FALSE;
+ dsl_dataset_t *fromds = NULL;
+ zfs_bookmark_phys_t book = {0};
+ struct dmu_send_params dspp = {0};
+
+ dsflags = (rawok) ? DS_HOLD_FLAG_NONE : DS_HOLD_FLAG_DECRYPT;
+ dspp.tosnap = tosnap;
+ dspp.embedok = embedok;
+ dspp.large_block_ok = large_block_ok;
+ dspp.compressok = compressok;
+ dspp.outfd = outfd;
+ dspp.off = off;
+ dspp.dso = dsop;
+ dspp.tag = FTAG;
+ dspp.resumeobj = resumeobj;
+ dspp.resumeoff = resumeoff;
+ dspp.rawok = rawok;
+ dspp.savedok = savedok;
+
+ if (fromsnap != NULL && strpbrk(fromsnap, "@#") == NULL)
+ return (SET_ERROR(EINVAL));
+
+ err = dsl_pool_hold(tosnap, FTAG, &dspp.dp);
+ if (err != 0)
+ return (err);
+
+ if (strchr(tosnap, '@') == NULL && spa_writeable(dspp.dp->dp_spa)) {
+ /*
+ * We are sending a filesystem or volume. Ensure
+ * that it doesn't change by owning the dataset.
+ */
+
+ if (savedok) {
+ /*
+ * We are looking for the dataset that represents the
+ * partially received send stream. If this stream was
+ * received as a new snapshot of an existing dataset,
+ * this will be saved in a hidden clone named
+ * "<pool>/<dataset>/%recv". Otherwise, the stream
+ * will be saved in the live dataset itself. In
+ * either case we need to use dsl_dataset_own_force()
+ * because the stream is marked as inconsistent,
+ * which would normally make it unavailable to be
+ * owned.
+ */
+ char *name = kmem_asprintf("%s/%s", tosnap,
+ recv_clone_name);
+ err = dsl_dataset_own_force(dspp.dp, name, dsflags,
+ FTAG, &dspp.to_ds);
+ if (err == ENOENT) {
+ err = dsl_dataset_own_force(dspp.dp, tosnap,
+ dsflags, FTAG, &dspp.to_ds);
+ }
+
+ if (err == 0) {
+ err = zap_lookup(dspp.dp->dp_meta_objset,
+ dspp.to_ds->ds_object,
+ DS_FIELD_RESUME_TOGUID, 8, 1,
+ &dspp.saved_guid);
+ }
+
+ if (err == 0) {
+ err = zap_lookup(dspp.dp->dp_meta_objset,
+ dspp.to_ds->ds_object,
+ DS_FIELD_RESUME_TONAME, 1,
+ sizeof (dspp.saved_toname),
+ dspp.saved_toname);
+ }
+ if (err != 0)
+ dsl_dataset_disown(dspp.to_ds, dsflags, FTAG);
+
+ kmem_strfree(name);
+ } else {
+ err = dsl_dataset_own(dspp.dp, tosnap, dsflags,
+ FTAG, &dspp.to_ds);
+ }
+ owned = B_TRUE;
+ } else {
+ err = dsl_dataset_hold_flags(dspp.dp, tosnap, dsflags, FTAG,
+ &dspp.to_ds);
+ }
+
+ if (err != 0) {
+ dsl_pool_rele(dspp.dp, FTAG);
+ return (err);
+ }
+
+ if (redactbook != NULL) {
+ char path[ZFS_MAX_DATASET_NAME_LEN];
+ (void) strlcpy(path, tosnap, sizeof (path));
+ char *at = strchr(path, '@');
+ if (at == NULL) {
+ err = EINVAL;
+ } else {
+ (void) snprintf(at, sizeof (path) - (at - path), "#%s",
+ redactbook);
+ err = dsl_bookmark_lookup(dspp.dp, path,
+ NULL, &book);
+ dspp.redactbook = &book;
+ }
+ }
+
+ if (err != 0) {
+ dsl_pool_rele(dspp.dp, FTAG);
+ if (owned)
+ dsl_dataset_disown(dspp.to_ds, dsflags, FTAG);
+ else
+ dsl_dataset_rele_flags(dspp.to_ds, dsflags, FTAG);
+ return (err);
+ }
+
+ if (fromsnap != NULL) {
+ zfs_bookmark_phys_t *zb = &dspp.ancestor_zb;
+ int fsnamelen;
+ if (strpbrk(tosnap, "@#") != NULL)
+ fsnamelen = strpbrk(tosnap, "@#") - tosnap;
+ else
+ fsnamelen = strlen(tosnap);
+
+ /*
+ * If the fromsnap is in a different filesystem, then
+ * mark the send stream as a clone.
+ */
+ if (strncmp(tosnap, fromsnap, fsnamelen) != 0 ||
+ (fromsnap[fsnamelen] != '@' &&
+ fromsnap[fsnamelen] != '#')) {
+ dspp.is_clone = B_TRUE;
+ }
+
+ if (strchr(fromsnap, '@') != NULL) {
+ err = dsl_dataset_hold(dspp.dp, fromsnap, FTAG,
+ &fromds);
+
+ if (err != 0) {
+ ASSERT3P(fromds, ==, NULL);
+ } else {
+ /*
+ * We need to make a deep copy of the redact
+ * snapshots of the from snapshot, because the
+ * array will be freed when we evict from_ds.
+ */
+ uint64_t *fromredact;
+ if (!dsl_dataset_get_uint64_array_feature(
+ fromds, SPA_FEATURE_REDACTED_DATASETS,
+ &dspp.numfromredactsnaps,
+ &fromredact)) {
+ dspp.numfromredactsnaps =
+ NUM_SNAPS_NOT_REDACTED;
+ } else if (dspp.numfromredactsnaps > 0) {
+ uint64_t size =
+ dspp.numfromredactsnaps *
+ sizeof (uint64_t);
+ dspp.fromredactsnaps = kmem_zalloc(size,
+ KM_SLEEP);
+ bcopy(fromredact, dspp.fromredactsnaps,
+ size);
+ }
+ if (!dsl_dataset_is_before(dspp.to_ds, fromds,
+ 0)) {
+ err = SET_ERROR(EXDEV);
+ } else {
+ zb->zbm_creation_txg =
+ dsl_dataset_phys(fromds)->
+ ds_creation_txg;
+ zb->zbm_creation_time =
+ dsl_dataset_phys(fromds)->
+ ds_creation_time;
+ zb->zbm_guid =
+ dsl_dataset_phys(fromds)->ds_guid;
+ zb->zbm_redaction_obj = 0;
+
+ if (dsl_dataset_is_zapified(fromds)) {
+ (void) zap_lookup(
+ dspp.dp->dp_meta_objset,
+ fromds->ds_object,
+ DS_FIELD_IVSET_GUID, 8, 1,
+ &zb->zbm_ivset_guid);
+ }
+ }
+ dsl_dataset_rele(fromds, FTAG);
+ }
+ } else {
+ dspp.numfromredactsnaps = NUM_SNAPS_NOT_REDACTED;
+ err = dsl_bookmark_lookup(dspp.dp, fromsnap, dspp.to_ds,
+ zb);
+ if (err == EXDEV && zb->zbm_redaction_obj != 0 &&
+ zb->zbm_guid ==
+ dsl_dataset_phys(dspp.to_ds)->ds_guid)
+ err = 0;
+ }
+
+ if (err == 0) {
+ /* dmu_send_impl will call dsl_pool_rele for us. */
+ err = dmu_send_impl(&dspp);
+ } else {
+ dsl_pool_rele(dspp.dp, FTAG);
+ }
+ } else {
+ dspp.numfromredactsnaps = NUM_SNAPS_NOT_REDACTED;
+ err = dmu_send_impl(&dspp);
+ }
+ if (owned)
+ dsl_dataset_disown(dspp.to_ds, dsflags, FTAG);
+ else
+ dsl_dataset_rele_flags(dspp.to_ds, dsflags, FTAG);
+ return (err);
+}
+
+static int
+dmu_adjust_send_estimate_for_indirects(dsl_dataset_t *ds, uint64_t uncompressed,
+ uint64_t compressed, boolean_t stream_compressed, uint64_t *sizep)
+{
+ int err = 0;
+ uint64_t size;
+ /*
+ * Assume that space (both on-disk and in-stream) is dominated by
+ * data. We will adjust for indirect blocks and the copies property,
+ * but ignore per-object space used (eg, dnodes and DRR_OBJECT records).
+ */
+
+ uint64_t recordsize;
+ uint64_t record_count;
+ objset_t *os;
+ VERIFY0(dmu_objset_from_ds(ds, &os));
+
+ /* Assume all (uncompressed) blocks are recordsize. */
+ if (zfs_override_estimate_recordsize != 0) {
+ recordsize = zfs_override_estimate_recordsize;
+ } else if (os->os_phys->os_type == DMU_OST_ZVOL) {
+ err = dsl_prop_get_int_ds(ds,
+ zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &recordsize);
+ } else {
+ err = dsl_prop_get_int_ds(ds,
+ zfs_prop_to_name(ZFS_PROP_RECORDSIZE), &recordsize);
+ }
+ if (err != 0)
+ return (err);
+ record_count = uncompressed / recordsize;
+
+ /*
+ * If we're estimating a send size for a compressed stream, use the
+ * compressed data size to estimate the stream size. Otherwise, use the
+ * uncompressed data size.
+ */
+ size = stream_compressed ? compressed : uncompressed;
+
+ /*
+ * Subtract out approximate space used by indirect blocks.
+ * Assume most space is used by data blocks (non-indirect, non-dnode).
+ * Assume no ditto blocks or internal fragmentation.
+ *
+ * Therefore, space used by indirect blocks is sizeof(blkptr_t) per
+ * block.
+ */
+ size -= record_count * sizeof (blkptr_t);
+
+ /* Add in the space for the record associated with each block. */
+ size += record_count * sizeof (dmu_replay_record_t);
+
+ *sizep = size;
+
+ return (0);
+}
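+
+/*
+ * Editor's note (illustrative, not part of OpenZFS): a rough worked example
+ * of the adjustment above.  Suppose recordsize is 128 KiB and the dataset
+ * holds 8 GiB of uncompressed data, sent uncompressed.  Then
+ *
+ *	record_count = 8 GiB / 128 KiB = 65536
+ *	size = 8 GiB
+ *	    - 65536 * sizeof (blkptr_t)			(indirect overhead)
+ *	    + 65536 * sizeof (dmu_replay_record_t)	(per-record header)
+ *
+ * With 128-byte block pointers the subtraction removes 8 MiB, and the
+ * per-record headers add a few hundred bytes per block, so the fast
+ * estimate stays within a fraction of a percent of the logical data size.
+ */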
+
+int
+dmu_send_estimate_fast(dsl_dataset_t *origds, dsl_dataset_t *fromds,
+ zfs_bookmark_phys_t *frombook, boolean_t stream_compressed,
+ boolean_t saved, uint64_t *sizep)
+{
+ int err;
+ dsl_dataset_t *ds = origds;
+ uint64_t uncomp, comp;
+
+ ASSERT(dsl_pool_config_held(origds->ds_dir->dd_pool));
+ ASSERT(fromds == NULL || frombook == NULL);
+
+ /*
+ * If this is a saved send we may actually be sending
+ * from the %recv clone used for resuming.
+ */
+ if (saved) {
+ objset_t *mos = origds->ds_dir->dd_pool->dp_meta_objset;
+ uint64_t guid;
+ char dsname[ZFS_MAX_DATASET_NAME_LEN + 6];
+
+ dsl_dataset_name(origds, dsname);
+ (void) strcat(dsname, "/");
+ (void) strcat(dsname, recv_clone_name);
+
+ err = dsl_dataset_hold(origds->ds_dir->dd_pool,
+ dsname, FTAG, &ds);
+ if (err != ENOENT && err != 0) {
+ return (err);
+ } else if (err == ENOENT) {
+ ds = origds;
+ }
+
+ /* check that this dataset has partially received data */
+ err = zap_lookup(mos, ds->ds_object,
+ DS_FIELD_RESUME_TOGUID, 8, 1, &guid);
+ if (err != 0) {
+ err = SET_ERROR(err == ENOENT ? EINVAL : err);
+ goto out;
+ }
+
+ err = zap_lookup(mos, ds->ds_object,
+ DS_FIELD_RESUME_TONAME, 1, sizeof (dsname), dsname);
+ if (err != 0) {
+ err = SET_ERROR(err == ENOENT ? EINVAL : err);
+ goto out;
+ }
+ }
+
+ /* tosnap must be a snapshot or the target of a saved send */
+ if (!ds->ds_is_snapshot && ds == origds)
+ return (SET_ERROR(EINVAL));
+
+ if (fromds != NULL) {
+ uint64_t used;
+ if (!fromds->ds_is_snapshot) {
+ err = SET_ERROR(EINVAL);
+ goto out;
+ }
+
+ if (!dsl_dataset_is_before(ds, fromds, 0)) {
+ err = SET_ERROR(EXDEV);
+ goto out;
+ }
+
+ err = dsl_dataset_space_written(fromds, ds, &used, &comp,
+ &uncomp);
+ if (err != 0)
+ goto out;
+ } else if (frombook != NULL) {
+ uint64_t used;
+ err = dsl_dataset_space_written_bookmark(frombook, ds, &used,
+ &comp, &uncomp);
+ if (err != 0)
+ goto out;
+ } else {
+ uncomp = dsl_dataset_phys(ds)->ds_uncompressed_bytes;
+ comp = dsl_dataset_phys(ds)->ds_compressed_bytes;
+ }
+
+ err = dmu_adjust_send_estimate_for_indirects(ds, uncomp, comp,
+ stream_compressed, sizep);
+ /*
+ * Add the size of the BEGIN and END records to the estimate.
+ */
+ *sizep += 2 * sizeof (dmu_replay_record_t);
+
+out:
+ if (ds != origds)
+ dsl_dataset_rele(ds, FTAG);
+ return (err);
+}
+
+/* BEGIN CSTYLED */
+ZFS_MODULE_PARAM(zfs_send, zfs_send_, corrupt_data, INT, ZMOD_RW,
+ "Allow sending corrupt data");
+
+ZFS_MODULE_PARAM(zfs_send, zfs_send_, queue_length, INT, ZMOD_RW,
+ "Maximum send queue length");
+
+ZFS_MODULE_PARAM(zfs_send, zfs_send_, unmodified_spill_blocks, INT, ZMOD_RW,
+ "Send unmodified spill blocks");
+
+ZFS_MODULE_PARAM(zfs_send, zfs_send_, no_prefetch_queue_length, INT, ZMOD_RW,
+ "Maximum send queue length for non-prefetch queues");
+
+ZFS_MODULE_PARAM(zfs_send, zfs_send_, queue_ff, INT, ZMOD_RW,
+ "Send queue fill fraction");
+
+ZFS_MODULE_PARAM(zfs_send, zfs_send_, no_prefetch_queue_ff, INT, ZMOD_RW,
+ "Send queue fill fraction for non-prefetch queues");
+
+ZFS_MODULE_PARAM(zfs_send, zfs_, override_estimate_recordsize, INT, ZMOD_RW,
+ "Override block size estimate with fixed size");
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/zfs/dmu_traverse.c b/sys/contrib/openzfs/module/zfs/dmu_traverse.c
new file mode 100644
index 000000000000..31db49dae68c
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/dmu_traverse.c
@@ -0,0 +1,788 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_traverse.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_pool.h>
+#include <sys/dnode.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/zio.h>
+#include <sys/dmu_impl.h>
+#include <sys/sa.h>
+#include <sys/sa_impl.h>
+#include <sys/callb.h>
+#include <sys/zfeature.h>
+
+int32_t zfs_pd_bytes_max = 50 * 1024 * 1024; /* 50MB */
+int32_t send_holes_without_birth_time = 1;
+
+typedef struct prefetch_data {
+ kmutex_t pd_mtx;
+ kcondvar_t pd_cv;
+ int32_t pd_bytes_fetched;
+ int pd_flags;
+ boolean_t pd_cancel;
+ boolean_t pd_exited;
+ zbookmark_phys_t pd_resume;
+} prefetch_data_t;
+
+typedef struct traverse_data {
+ spa_t *td_spa;
+ uint64_t td_objset;
+ blkptr_t *td_rootbp;
+ uint64_t td_min_txg;
+ zbookmark_phys_t *td_resume;
+ int td_flags;
+ prefetch_data_t *td_pfd;
+ boolean_t td_paused;
+ uint64_t td_hole_birth_enabled_txg;
+ blkptr_cb_t *td_func;
+ void *td_arg;
+ boolean_t td_realloc_possible;
+} traverse_data_t;
+
+static int traverse_dnode(traverse_data_t *td, const blkptr_t *bp,
+ const dnode_phys_t *dnp, uint64_t objset, uint64_t object);
+static void prefetch_dnode_metadata(traverse_data_t *td, const dnode_phys_t *,
+ uint64_t objset, uint64_t object);
+
+static int
+traverse_zil_block(zilog_t *zilog, const blkptr_t *bp, void *arg,
+ uint64_t claim_txg)
+{
+ traverse_data_t *td = arg;
+ zbookmark_phys_t zb;
+
+ if (BP_IS_HOLE(bp))
+ return (0);
+
+ if (claim_txg == 0 && bp->blk_birth >= spa_min_claim_txg(td->td_spa))
+ return (-1);
+
+ SET_BOOKMARK(&zb, td->td_objset, ZB_ZIL_OBJECT, ZB_ZIL_LEVEL,
+ bp->blk_cksum.zc_word[ZIL_ZC_SEQ]);
+
+ (void) td->td_func(td->td_spa, zilog, bp, &zb, NULL, td->td_arg);
+
+ return (0);
+}
+
+static int
+traverse_zil_record(zilog_t *zilog, const lr_t *lrc, void *arg,
+ uint64_t claim_txg)
+{
+ traverse_data_t *td = arg;
+
+ if (lrc->lrc_txtype == TX_WRITE) {
+ lr_write_t *lr = (lr_write_t *)lrc;
+ blkptr_t *bp = &lr->lr_blkptr;
+ zbookmark_phys_t zb;
+
+ if (BP_IS_HOLE(bp))
+ return (0);
+
+ if (claim_txg == 0 || bp->blk_birth < claim_txg)
+ return (0);
+
+ SET_BOOKMARK(&zb, td->td_objset, lr->lr_foid,
+ ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp));
+
+ (void) td->td_func(td->td_spa, zilog, bp, &zb, NULL,
+ td->td_arg);
+ }
+ return (0);
+}
+
+static void
+traverse_zil(traverse_data_t *td, zil_header_t *zh)
+{
+ uint64_t claim_txg = zh->zh_claim_txg;
+
+ /*
+ * We only want to visit blocks that have been claimed but not yet
+ * replayed; plus blocks that are already stable in read-only mode.
+ */
+ if (claim_txg == 0 && spa_writeable(td->td_spa))
+ return;
+
+ zilog_t *zilog = zil_alloc(spa_get_dsl(td->td_spa)->dp_meta_objset, zh);
+ (void) zil_parse(zilog, traverse_zil_block, traverse_zil_record, td,
+ claim_txg, !(td->td_flags & TRAVERSE_NO_DECRYPT));
+ zil_free(zilog);
+}
+
+typedef enum resume_skip {
+ RESUME_SKIP_ALL,
+ RESUME_SKIP_NONE,
+ RESUME_SKIP_CHILDREN
+} resume_skip_t;
+
+/*
+ * Returns RESUME_SKIP_ALL if td indicates that we are resuming a traversal and
+ * the block indicated by zb does not need to be visited at all. Returns
+ * RESUME_SKIP_CHILDREN if we are resuming a post traversal and we reach the
+ * resume point. This indicates that this block should be visited but not its
+ * children (since they must have been visited in a previous traversal).
+ * Otherwise returns RESUME_SKIP_NONE.
+ */
+static resume_skip_t
+resume_skip_check(traverse_data_t *td, const dnode_phys_t *dnp,
+ const zbookmark_phys_t *zb)
+{
+ if (td->td_resume != NULL && !ZB_IS_ZERO(td->td_resume)) {
+ /*
+ * If we already visited this bp & everything below,
+ * don't bother doing it again.
+ */
+ if (zbookmark_subtree_completed(dnp, zb, td->td_resume))
+ return (RESUME_SKIP_ALL);
+
+ /*
+ * If we found the block we're trying to resume from, zero
+ * the bookmark out to indicate that we have resumed.
+ */
+ if (bcmp(zb, td->td_resume, sizeof (*zb)) == 0) {
+ bzero(td->td_resume, sizeof (*zb));
+ if (td->td_flags & TRAVERSE_POST)
+ return (RESUME_SKIP_CHILDREN);
+ }
+ }
+ return (RESUME_SKIP_NONE);
+}
+
+static void
+traverse_prefetch_metadata(traverse_data_t *td,
+ const blkptr_t *bp, const zbookmark_phys_t *zb)
+{
+ arc_flags_t flags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH;
+ int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE;
+
+ if (!(td->td_flags & TRAVERSE_PREFETCH_METADATA))
+ return;
+ /*
+ * If we are in the process of resuming, don't prefetch, because
+ * some children will not be needed (and in fact may have already
+ * been freed).
+ */
+ if (td->td_resume != NULL && !ZB_IS_ZERO(td->td_resume))
+ return;
+ if (BP_IS_HOLE(bp) || bp->blk_birth <= td->td_min_txg)
+ return;
+ if (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE)
+ return;
+ ASSERT(!BP_IS_REDACTED(bp));
+
+ if ((td->td_flags & TRAVERSE_NO_DECRYPT) && BP_IS_PROTECTED(bp))
+ zio_flags |= ZIO_FLAG_RAW;
+
+ (void) arc_read(NULL, td->td_spa, bp, NULL, NULL,
+ ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
+}
+
+static boolean_t
+prefetch_needed(prefetch_data_t *pfd, const blkptr_t *bp)
+{
+ ASSERT(pfd->pd_flags & TRAVERSE_PREFETCH_DATA);
+ if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp) ||
+ BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG || BP_IS_REDACTED(bp))
+ return (B_FALSE);
+ return (B_TRUE);
+}
+
+static int
+traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
+ const blkptr_t *bp, const zbookmark_phys_t *zb)
+{
+ int err = 0;
+ arc_buf_t *buf = NULL;
+ prefetch_data_t *pd = td->td_pfd;
+
+ switch (resume_skip_check(td, dnp, zb)) {
+ case RESUME_SKIP_ALL:
+ return (0);
+ case RESUME_SKIP_CHILDREN:
+ goto post;
+ case RESUME_SKIP_NONE:
+ break;
+ default:
+ ASSERT(0);
+ }
+
+ if (bp->blk_birth == 0) {
+ /*
+ * Since this block has a birth time of 0 it must be one of
+ * two things: a hole created before the
+ * SPA_FEATURE_HOLE_BIRTH feature was enabled, or a hole
+ * which has always been a hole in an object.
+ *
+ * If a file is written sparsely, then the unwritten parts of
+ * the file were "always holes" -- that is, they have been
+ * holes since this object was allocated. However, we (and
+ * our callers) can not necessarily tell when an object was
+ * allocated. Therefore, if it's possible that this object
+ * was freed and then its object number reused, we need to
+ * visit all the holes with birth==0.
+ *
+ * If it isn't possible that the object number was reused,
+ * then if SPA_FEATURE_HOLE_BIRTH was enabled before we wrote
+ * all the blocks we will visit as part of this traversal,
+ * then this hole must have always existed, so we can skip
+ * it. We visit blocks born after (exclusive) td_min_txg.
+ *
+ * Note that the meta-dnode cannot be reallocated.
+ */
+ if (!send_holes_without_birth_time &&
+ (!td->td_realloc_possible ||
+ zb->zb_object == DMU_META_DNODE_OBJECT) &&
+ td->td_hole_birth_enabled_txg <= td->td_min_txg)
+ return (0);
+ } else if (bp->blk_birth <= td->td_min_txg) {
+ return (0);
+ }
+
+ if (pd != NULL && !pd->pd_exited && prefetch_needed(pd, bp)) {
+ uint64_t size = BP_GET_LSIZE(bp);
+ mutex_enter(&pd->pd_mtx);
+ ASSERT(pd->pd_bytes_fetched >= 0);
+ while (pd->pd_bytes_fetched < size && !pd->pd_exited)
+ cv_wait_sig(&pd->pd_cv, &pd->pd_mtx);
+ pd->pd_bytes_fetched -= size;
+ cv_broadcast(&pd->pd_cv);
+ mutex_exit(&pd->pd_mtx);
+ }
+
+ if (BP_IS_HOLE(bp) || BP_IS_REDACTED(bp)) {
+ err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg);
+ if (err != 0)
+ goto post;
+ return (0);
+ }
+
+ if (td->td_flags & TRAVERSE_PRE) {
+ err = td->td_func(td->td_spa, NULL, bp, zb, dnp,
+ td->td_arg);
+ if (err == TRAVERSE_VISIT_NO_CHILDREN)
+ return (0);
+ if (err != 0)
+ goto post;
+ }
+
+ if (BP_GET_LEVEL(bp) > 0) {
+ uint32_t flags = ARC_FLAG_WAIT;
+ int32_t i;
+ int32_t epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
+ zbookmark_phys_t *czb;
+
+ ASSERT(!BP_IS_PROTECTED(bp));
+
+ err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf,
+ ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
+ if (err != 0)
+ goto post;
+
+ czb = kmem_alloc(sizeof (zbookmark_phys_t), KM_SLEEP);
+
+ for (i = 0; i < epb; i++) {
+ SET_BOOKMARK(czb, zb->zb_objset, zb->zb_object,
+ zb->zb_level - 1,
+ zb->zb_blkid * epb + i);
+ traverse_prefetch_metadata(td,
+ &((blkptr_t *)buf->b_data)[i], czb);
+ }
+
+ /* recursively visitbp() blocks below this */
+ for (i = 0; i < epb; i++) {
+ SET_BOOKMARK(czb, zb->zb_objset, zb->zb_object,
+ zb->zb_level - 1,
+ zb->zb_blkid * epb + i);
+ err = traverse_visitbp(td, dnp,
+ &((blkptr_t *)buf->b_data)[i], czb);
+ if (err != 0)
+ break;
+ }
+
+ kmem_free(czb, sizeof (zbookmark_phys_t));
+
+ } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
+ uint32_t flags = ARC_FLAG_WAIT;
+ uint32_t zio_flags = ZIO_FLAG_CANFAIL;
+ int32_t i;
+ int32_t epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
+ dnode_phys_t *child_dnp;
+
+ /*
+ * dnode blocks might have their bonus buffers encrypted, so
+ * we must be careful to honor TRAVERSE_NO_DECRYPT
+ */
+ if ((td->td_flags & TRAVERSE_NO_DECRYPT) && BP_IS_PROTECTED(bp))
+ zio_flags |= ZIO_FLAG_RAW;
+
+ err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf,
+ ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
+ if (err != 0)
+ goto post;
+
+ child_dnp = buf->b_data;
+
+ for (i = 0; i < epb; i += child_dnp[i].dn_extra_slots + 1) {
+ prefetch_dnode_metadata(td, &child_dnp[i],
+ zb->zb_objset, zb->zb_blkid * epb + i);
+ }
+
+ /* recursively visitbp() blocks below this */
+ for (i = 0; i < epb; i += child_dnp[i].dn_extra_slots + 1) {
+ err = traverse_dnode(td, bp, &child_dnp[i],
+ zb->zb_objset, zb->zb_blkid * epb + i);
+ if (err != 0)
+ break;
+ }
+ } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
+ uint32_t zio_flags = ZIO_FLAG_CANFAIL;
+ arc_flags_t flags = ARC_FLAG_WAIT;
+ objset_phys_t *osp;
+
+ if ((td->td_flags & TRAVERSE_NO_DECRYPT) && BP_IS_PROTECTED(bp))
+ zio_flags |= ZIO_FLAG_RAW;
+
+ err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf,
+ ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
+ if (err != 0)
+ goto post;
+
+ osp = buf->b_data;
+ prefetch_dnode_metadata(td, &osp->os_meta_dnode, zb->zb_objset,
+ DMU_META_DNODE_OBJECT);
+ /*
+ * See the block comment above for the goal of this variable.
+ * If the maxblkid of the meta-dnode is 0, then we know that
+ * we've never had more than DNODES_PER_BLOCK objects in the
+ * dataset, which means we can't have reused any object ids.
+ */
+ if (osp->os_meta_dnode.dn_maxblkid == 0)
+ td->td_realloc_possible = B_FALSE;
+
+ if (OBJSET_BUF_HAS_USERUSED(buf)) {
+ if (OBJSET_BUF_HAS_PROJECTUSED(buf))
+ prefetch_dnode_metadata(td,
+ &osp->os_projectused_dnode,
+ zb->zb_objset, DMU_PROJECTUSED_OBJECT);
+ prefetch_dnode_metadata(td, &osp->os_groupused_dnode,
+ zb->zb_objset, DMU_GROUPUSED_OBJECT);
+ prefetch_dnode_metadata(td, &osp->os_userused_dnode,
+ zb->zb_objset, DMU_USERUSED_OBJECT);
+ }
+
+ err = traverse_dnode(td, bp, &osp->os_meta_dnode, zb->zb_objset,
+ DMU_META_DNODE_OBJECT);
+ if (err == 0 && OBJSET_BUF_HAS_USERUSED(buf)) {
+ if (OBJSET_BUF_HAS_PROJECTUSED(buf))
+ err = traverse_dnode(td, bp,
+ &osp->os_projectused_dnode, zb->zb_objset,
+ DMU_PROJECTUSED_OBJECT);
+ if (err == 0)
+ err = traverse_dnode(td, bp,
+ &osp->os_groupused_dnode, zb->zb_objset,
+ DMU_GROUPUSED_OBJECT);
+ if (err == 0)
+ err = traverse_dnode(td, bp,
+ &osp->os_userused_dnode, zb->zb_objset,
+ DMU_USERUSED_OBJECT);
+ }
+ }
+
+ if (buf)
+ arc_buf_destroy(buf, &buf);
+
+post:
+ if (err == 0 && (td->td_flags & TRAVERSE_POST))
+ err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg);
+
+ if ((td->td_flags & TRAVERSE_HARD) && (err == EIO || err == ECKSUM)) {
+ /*
+ * Ignore this disk error as requested by the HARD flag,
+ * and continue traversal.
+ */
+ err = 0;
+ }
+
+ /*
+ * If we are stopping here, set td_resume.
+ */
+ if (td->td_resume != NULL && err != 0 && !td->td_paused) {
+ td->td_resume->zb_objset = zb->zb_objset;
+ td->td_resume->zb_object = zb->zb_object;
+ td->td_resume->zb_level = 0;
+ /*
+ * If we have stopped on an indirect block (e.g. due to
+ * i/o error), we have not visited anything below it.
+ * Set the bookmark to the first level-0 block that we need
+ * to visit. This way, the resuming code does not need to
+ * deal with resuming from indirect blocks.
+ *
+ * Note, if zb_level <= 0, dnp may be NULL, so we don't want
+ * to dereference it.
+ */
+ td->td_resume->zb_blkid = zb->zb_blkid;
+ if (zb->zb_level > 0) {
+ td->td_resume->zb_blkid <<= zb->zb_level *
+ (dnp->dn_indblkshift - SPA_BLKPTRSHIFT);
+ }
+ td->td_paused = B_TRUE;
+ }
+
+ return (err);
+}
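+
+/*
+ * Editor's note (illustrative, not part of OpenZFS): an example of the
+ * resume-point conversion above.  With 128 KiB indirect blocks,
+ * dn_indblkshift == 17 and SPA_BLKPTRSHIFT == 7, so each indirect block
+ * covers 2^(17 - 7) == 1024 level-0 block pointers.  Pausing at
+ * zb_level == 1, zb_blkid == 3 therefore records a resume point of
+ * level-0 blkid 3 << 10 == 3072, the first data block under that
+ * indirect block.
+ */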
+
+static void
+prefetch_dnode_metadata(traverse_data_t *td, const dnode_phys_t *dnp,
+ uint64_t objset, uint64_t object)
+{
+ int j;
+ zbookmark_phys_t czb;
+
+ for (j = 0; j < dnp->dn_nblkptr; j++) {
+ SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j);
+ traverse_prefetch_metadata(td, &dnp->dn_blkptr[j], &czb);
+ }
+
+ if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
+ SET_BOOKMARK(&czb, objset, object, 0, DMU_SPILL_BLKID);
+ traverse_prefetch_metadata(td, DN_SPILL_BLKPTR(dnp), &czb);
+ }
+}
+
+static int
+traverse_dnode(traverse_data_t *td, const blkptr_t *bp, const dnode_phys_t *dnp,
+ uint64_t objset, uint64_t object)
+{
+ int j, err = 0;
+ zbookmark_phys_t czb;
+
+ if (object != DMU_META_DNODE_OBJECT && td->td_resume != NULL &&
+ object < td->td_resume->zb_object)
+ return (0);
+
+ if (td->td_flags & TRAVERSE_PRE) {
+ SET_BOOKMARK(&czb, objset, object, ZB_DNODE_LEVEL,
+ ZB_DNODE_BLKID);
+ err = td->td_func(td->td_spa, NULL, bp, &czb, dnp,
+ td->td_arg);
+ if (err == TRAVERSE_VISIT_NO_CHILDREN)
+ return (0);
+ if (err != 0)
+ return (err);
+ }
+
+ for (j = 0; j < dnp->dn_nblkptr; j++) {
+ SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j);
+ err = traverse_visitbp(td, dnp, &dnp->dn_blkptr[j], &czb);
+ if (err != 0)
+ break;
+ }
+
+ if (err == 0 && (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) {
+ SET_BOOKMARK(&czb, objset, object, 0, DMU_SPILL_BLKID);
+ err = traverse_visitbp(td, dnp, DN_SPILL_BLKPTR(dnp), &czb);
+ }
+
+ if (err == 0 && (td->td_flags & TRAVERSE_POST)) {
+ SET_BOOKMARK(&czb, objset, object, ZB_DNODE_LEVEL,
+ ZB_DNODE_BLKID);
+ err = td->td_func(td->td_spa, NULL, bp, &czb, dnp,
+ td->td_arg);
+ if (err == TRAVERSE_VISIT_NO_CHILDREN)
+ return (0);
+ if (err != 0)
+ return (err);
+ }
+ return (err);
+}
+
+/* ARGSUSED */
+static int
+traverse_prefetcher(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
+ const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
+{
+ prefetch_data_t *pfd = arg;
+ int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE;
+ arc_flags_t aflags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH |
+ ARC_FLAG_PRESCIENT_PREFETCH;
+
+ ASSERT(pfd->pd_bytes_fetched >= 0);
+ if (zb->zb_level == ZB_DNODE_LEVEL)
+ return (0);
+ if (pfd->pd_cancel)
+ return (SET_ERROR(EINTR));
+
+ if (!prefetch_needed(pfd, bp))
+ return (0);
+
+ mutex_enter(&pfd->pd_mtx);
+ while (!pfd->pd_cancel && pfd->pd_bytes_fetched >= zfs_pd_bytes_max)
+ cv_wait_sig(&pfd->pd_cv, &pfd->pd_mtx);
+ pfd->pd_bytes_fetched += BP_GET_LSIZE(bp);
+ cv_broadcast(&pfd->pd_cv);
+ mutex_exit(&pfd->pd_mtx);
+
+ if ((pfd->pd_flags & TRAVERSE_NO_DECRYPT) && BP_IS_PROTECTED(bp))
+ zio_flags |= ZIO_FLAG_RAW;
+
+ (void) arc_read(NULL, spa, bp, NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
+ zio_flags, &aflags, zb);
+
+ return (0);
+}
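+
+/*
+ * Editor's note (illustrative, not part of OpenZFS): the throttle above
+ * keeps the prefetcher a bounded number of bytes ahead of the traversal.
+ * For example, with the default zfs_pd_bytes_max of 50 MiB and 128 KiB
+ * data blocks, the prefetch thread can run at most ~400 blocks ahead
+ * before it sleeps on pd_cv, waiting for traverse_visitbp() to consume
+ * part of its byte budget.
+ */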
+
+static void
+traverse_prefetch_thread(void *arg)
+{
+ traverse_data_t *td_main = arg;
+ traverse_data_t td = *td_main;
+ zbookmark_phys_t czb;
+ fstrans_cookie_t cookie = spl_fstrans_mark();
+
+ td.td_func = traverse_prefetcher;
+ td.td_arg = td_main->td_pfd;
+ td.td_pfd = NULL;
+ td.td_resume = &td_main->td_pfd->pd_resume;
+
+ SET_BOOKMARK(&czb, td.td_objset,
+ ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
+ (void) traverse_visitbp(&td, NULL, td.td_rootbp, &czb);
+
+ mutex_enter(&td_main->td_pfd->pd_mtx);
+ td_main->td_pfd->pd_exited = B_TRUE;
+ cv_broadcast(&td_main->td_pfd->pd_cv);
+ mutex_exit(&td_main->td_pfd->pd_mtx);
+ spl_fstrans_unmark(cookie);
+}
+
+/*
+ * NB: dataset must not be changing on-disk (eg, is a snapshot or we are
+ * in syncing context).
+ */
+static int
+traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp,
+ uint64_t txg_start, zbookmark_phys_t *resume, int flags,
+ blkptr_cb_t func, void *arg)
+{
+ traverse_data_t *td;
+ prefetch_data_t *pd;
+ zbookmark_phys_t *czb;
+ int err;
+
+ ASSERT(ds == NULL || objset == ds->ds_object);
+ ASSERT(!(flags & TRAVERSE_PRE) || !(flags & TRAVERSE_POST));
+
+ td = kmem_alloc(sizeof (traverse_data_t), KM_SLEEP);
+ pd = kmem_zalloc(sizeof (prefetch_data_t), KM_SLEEP);
+ czb = kmem_alloc(sizeof (zbookmark_phys_t), KM_SLEEP);
+
+ td->td_spa = spa;
+ td->td_objset = objset;
+ td->td_rootbp = rootbp;
+ td->td_min_txg = txg_start;
+ td->td_resume = resume;
+ td->td_func = func;
+ td->td_arg = arg;
+ td->td_pfd = pd;
+ td->td_flags = flags;
+ td->td_paused = B_FALSE;
+ td->td_realloc_possible = (txg_start == 0 ? B_FALSE : B_TRUE);
+
+ if (spa_feature_is_active(spa, SPA_FEATURE_HOLE_BIRTH)) {
+ VERIFY(spa_feature_enabled_txg(spa,
+ SPA_FEATURE_HOLE_BIRTH, &td->td_hole_birth_enabled_txg));
+ } else {
+ td->td_hole_birth_enabled_txg = UINT64_MAX;
+ }
+
+ pd->pd_flags = flags;
+ if (resume != NULL)
+ pd->pd_resume = *resume;
+ mutex_init(&pd->pd_mtx, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&pd->pd_cv, NULL, CV_DEFAULT, NULL);
+
+ SET_BOOKMARK(czb, td->td_objset,
+ ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
+
+ /* See comment on ZIL traversal in dsl_scan_visitds. */
+ if (ds != NULL && !ds->ds_is_snapshot && !BP_IS_HOLE(rootbp)) {
+ enum zio_flag zio_flags = ZIO_FLAG_CANFAIL;
+ uint32_t flags = ARC_FLAG_WAIT;
+ objset_phys_t *osp;
+ arc_buf_t *buf;
+ ASSERT(!BP_IS_REDACTED(rootbp));
+
+ if ((td->td_flags & TRAVERSE_NO_DECRYPT) &&
+ BP_IS_PROTECTED(rootbp))
+ zio_flags |= ZIO_FLAG_RAW;
+
+ err = arc_read(NULL, td->td_spa, rootbp, arc_getbuf_func,
+ &buf, ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, czb);
+ if (err != 0) {
+ /*
+ * If both TRAVERSE_HARD and TRAVERSE_PRE are set,
+ * continue to visitbp so that td_func can be called
+ * in pre stage, and err will reset to zero.
+ */
+ if (!(td->td_flags & TRAVERSE_HARD) ||
+ !(td->td_flags & TRAVERSE_PRE))
+ goto out;
+ } else {
+ osp = buf->b_data;
+ traverse_zil(td, &osp->os_zil_header);
+ arc_buf_destroy(buf, &buf);
+ }
+ }
+
+ if (!(flags & TRAVERSE_PREFETCH_DATA) ||
+ taskq_dispatch(spa->spa_prefetch_taskq, traverse_prefetch_thread,
+ td, TQ_NOQUEUE) == TASKQID_INVALID)
+ pd->pd_exited = B_TRUE;
+
+ err = traverse_visitbp(td, NULL, rootbp, czb);
+
+ mutex_enter(&pd->pd_mtx);
+ pd->pd_cancel = B_TRUE;
+ cv_broadcast(&pd->pd_cv);
+ while (!pd->pd_exited)
+ cv_wait_sig(&pd->pd_cv, &pd->pd_mtx);
+ mutex_exit(&pd->pd_mtx);
+out:
+ mutex_destroy(&pd->pd_mtx);
+ cv_destroy(&pd->pd_cv);
+
+ kmem_free(czb, sizeof (zbookmark_phys_t));
+ kmem_free(pd, sizeof (struct prefetch_data));
+ kmem_free(td, sizeof (struct traverse_data));
+
+ return (err);
+}
+
+/*
+ * NB: dataset must not be changing on-disk (eg, is a snapshot or we are
+ * in syncing context).
+ */
+int
+traverse_dataset_resume(dsl_dataset_t *ds, uint64_t txg_start,
+ zbookmark_phys_t *resume,
+ int flags, blkptr_cb_t func, void *arg)
+{
+ return (traverse_impl(ds->ds_dir->dd_pool->dp_spa, ds, ds->ds_object,
+ &dsl_dataset_phys(ds)->ds_bp, txg_start, resume, flags, func, arg));
+}
+
+int
+traverse_dataset(dsl_dataset_t *ds, uint64_t txg_start,
+ int flags, blkptr_cb_t func, void *arg)
+{
+ return (traverse_dataset_resume(ds, txg_start, NULL, flags, func, arg));
+}
+
+int
+traverse_dataset_destroyed(spa_t *spa, blkptr_t *blkptr,
+ uint64_t txg_start, zbookmark_phys_t *resume, int flags,
+ blkptr_cb_t func, void *arg)
+{
+ return (traverse_impl(spa, NULL, ZB_DESTROYED_OBJSET,
+ blkptr, txg_start, resume, flags, func, arg));
+}
+
+/*
+ * NB: pool must not be changing on-disk (eg, from zdb or sync context).
+ */
+int
+traverse_pool(spa_t *spa, uint64_t txg_start, int flags,
+ blkptr_cb_t func, void *arg)
+{
+ int err;
+ dsl_pool_t *dp = spa_get_dsl(spa);
+ objset_t *mos = dp->dp_meta_objset;
+ boolean_t hard = (flags & TRAVERSE_HARD);
+
+ /* visit the MOS */
+ err = traverse_impl(spa, NULL, 0, spa_get_rootblkptr(spa),
+ txg_start, NULL, flags, func, arg);
+ if (err != 0)
+ return (err);
+
+ /* visit each dataset */
+ for (uint64_t obj = 1; err == 0;
+ err = dmu_object_next(mos, &obj, B_FALSE, txg_start)) {
+ dmu_object_info_t doi;
+
+ err = dmu_object_info(mos, obj, &doi);
+ if (err != 0) {
+ if (hard)
+ continue;
+ break;
+ }
+
+ if (doi.doi_bonus_type == DMU_OT_DSL_DATASET) {
+ dsl_dataset_t *ds;
+ uint64_t txg = txg_start;
+
+ dsl_pool_config_enter(dp, FTAG);
+ err = dsl_dataset_hold_obj(dp, obj, FTAG, &ds);
+ dsl_pool_config_exit(dp, FTAG);
+ if (err != 0) {
+ if (hard)
+ continue;
+ break;
+ }
+ if (dsl_dataset_phys(ds)->ds_prev_snap_txg > txg)
+ txg = dsl_dataset_phys(ds)->ds_prev_snap_txg;
+ err = traverse_dataset(ds, txg, flags, func, arg);
+ dsl_dataset_rele(ds, FTAG);
+ if (err != 0)
+ break;
+ }
+ }
+ if (err == ESRCH)
+ err = 0;
+ return (err);
+}
+
+EXPORT_SYMBOL(traverse_dataset);
+EXPORT_SYMBOL(traverse_pool);
+
+/* BEGIN CSTYLED */
+ZFS_MODULE_PARAM(zfs, zfs_, pd_bytes_max, INT, ZMOD_RW,
+ "Max number of bytes to prefetch");
+
+#if defined(_KERNEL)
+module_param_named(ignore_hole_birth, send_holes_without_birth_time, int, 0644);
+MODULE_PARM_DESC(ignore_hole_birth,
+ "Alias for send_holes_without_birth_time");
+#endif
+
+ZFS_MODULE_PARAM(zfs, , send_holes_without_birth_time, INT, ZMOD_RW,
+ "Ignore hole_birth txg for zfs send");
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/zfs/dmu_tx.c b/sys/contrib/openzfs/module/zfs/dmu_tx.c
new file mode 100644
index 000000000000..73667915df0f
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/dmu_tx.c
@@ -0,0 +1,1417 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
+ */
+
+#include <sys/dmu.h>
+#include <sys/dmu_impl.h>
+#include <sys/dbuf.h>
+#include <sys/dmu_tx.h>
+#include <sys/dmu_objset.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_pool.h>
+#include <sys/zap_impl.h>
+#include <sys/spa.h>
+#include <sys/sa.h>
+#include <sys/sa_impl.h>
+#include <sys/zfs_context.h>
+#include <sys/trace_zfs.h>
+
+typedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn,
+ uint64_t arg1, uint64_t arg2);
+
+dmu_tx_stats_t dmu_tx_stats = {
+ { "dmu_tx_assigned", KSTAT_DATA_UINT64 },
+ { "dmu_tx_delay", KSTAT_DATA_UINT64 },
+ { "dmu_tx_error", KSTAT_DATA_UINT64 },
+ { "dmu_tx_suspended", KSTAT_DATA_UINT64 },
+ { "dmu_tx_group", KSTAT_DATA_UINT64 },
+ { "dmu_tx_memory_reserve", KSTAT_DATA_UINT64 },
+ { "dmu_tx_memory_reclaim", KSTAT_DATA_UINT64 },
+ { "dmu_tx_dirty_throttle", KSTAT_DATA_UINT64 },
+ { "dmu_tx_dirty_delay", KSTAT_DATA_UINT64 },
+ { "dmu_tx_dirty_over_max", KSTAT_DATA_UINT64 },
+ { "dmu_tx_dirty_frees_delay", KSTAT_DATA_UINT64 },
+ { "dmu_tx_quota", KSTAT_DATA_UINT64 },
+};
+
+static kstat_t *dmu_tx_ksp;
+
+dmu_tx_t *
+dmu_tx_create_dd(dsl_dir_t *dd)
+{
+ dmu_tx_t *tx = kmem_zalloc(sizeof (dmu_tx_t), KM_SLEEP);
+ tx->tx_dir = dd;
+ if (dd != NULL)
+ tx->tx_pool = dd->dd_pool;
+ list_create(&tx->tx_holds, sizeof (dmu_tx_hold_t),
+ offsetof(dmu_tx_hold_t, txh_node));
+ list_create(&tx->tx_callbacks, sizeof (dmu_tx_callback_t),
+ offsetof(dmu_tx_callback_t, dcb_node));
+ tx->tx_start = gethrtime();
+ return (tx);
+}
+
+dmu_tx_t *
+dmu_tx_create(objset_t *os)
+{
+ dmu_tx_t *tx = dmu_tx_create_dd(os->os_dsl_dataset->ds_dir);
+ tx->tx_objset = os;
+ return (tx);
+}
+
+dmu_tx_t *
+dmu_tx_create_assigned(struct dsl_pool *dp, uint64_t txg)
+{
+ dmu_tx_t *tx = dmu_tx_create_dd(NULL);
+
+ TXG_VERIFY(dp->dp_spa, txg);
+ tx->tx_pool = dp;
+ tx->tx_txg = txg;
+ tx->tx_anyobj = TRUE;
+
+ return (tx);
+}
+
+int
+dmu_tx_is_syncing(dmu_tx_t *tx)
+{
+ return (tx->tx_anyobj);
+}
+
+int
+dmu_tx_private_ok(dmu_tx_t *tx)
+{
+ return (tx->tx_anyobj);
+}
+
+static dmu_tx_hold_t *
+dmu_tx_hold_dnode_impl(dmu_tx_t *tx, dnode_t *dn, enum dmu_tx_hold_type type,
+ uint64_t arg1, uint64_t arg2)
+{
+ dmu_tx_hold_t *txh;
+
+ if (dn != NULL) {
+ (void) zfs_refcount_add(&dn->dn_holds, tx);
+ if (tx->tx_txg != 0) {
+ mutex_enter(&dn->dn_mtx);
+ /*
+ * dn->dn_assigned_txg == tx->tx_txg doesn't pose a
+ * problem, but there's no way for it to happen (for
+ * now, at least).
+ */
+ ASSERT(dn->dn_assigned_txg == 0);
+ dn->dn_assigned_txg = tx->tx_txg;
+ (void) zfs_refcount_add(&dn->dn_tx_holds, tx);
+ mutex_exit(&dn->dn_mtx);
+ }
+ }
+
+ txh = kmem_zalloc(sizeof (dmu_tx_hold_t), KM_SLEEP);
+ txh->txh_tx = tx;
+ txh->txh_dnode = dn;
+ zfs_refcount_create(&txh->txh_space_towrite);
+ zfs_refcount_create(&txh->txh_memory_tohold);
+ txh->txh_type = type;
+ txh->txh_arg1 = arg1;
+ txh->txh_arg2 = arg2;
+ list_insert_tail(&tx->tx_holds, txh);
+
+ return (txh);
+}
+
+static dmu_tx_hold_t *
+dmu_tx_hold_object_impl(dmu_tx_t *tx, objset_t *os, uint64_t object,
+ enum dmu_tx_hold_type type, uint64_t arg1, uint64_t arg2)
+{
+ dnode_t *dn = NULL;
+ dmu_tx_hold_t *txh;
+ int err;
+
+ if (object != DMU_NEW_OBJECT) {
+ err = dnode_hold(os, object, FTAG, &dn);
+ if (err != 0) {
+ tx->tx_err = err;
+ return (NULL);
+ }
+ }
+ txh = dmu_tx_hold_dnode_impl(tx, dn, type, arg1, arg2);
+ if (dn != NULL)
+ dnode_rele(dn, FTAG);
+ return (txh);
+}
+
+void
+dmu_tx_add_new_object(dmu_tx_t *tx, dnode_t *dn)
+{
+ /*
+ * If we're syncing, they can manipulate any object anyhow, and
+ * the hold on the dnode_t can cause problems.
+ */
+ if (!dmu_tx_is_syncing(tx))
+ (void) dmu_tx_hold_dnode_impl(tx, dn, THT_NEWOBJECT, 0, 0);
+}
+
+/*
+ * This function reads specified data from disk. The specified data will
+ * be needed to perform the transaction -- i.e., it will be read after
+ * we do dmu_tx_assign(). There are two reasons that we read the data now
+ * (before dmu_tx_assign()):
+ *
+ * 1. Reading it now has potentially better performance. The transaction
+ * has not yet been assigned, so the TXG is not held open, and also the
+ * caller typically has fewer locks held when calling dmu_tx_hold_*() than
+ * after the transaction has been assigned. This reduces the lock (and txg)
+ * hold times, thus reducing lock contention.
+ *
+ * 2. It is easier for callers (primarily the ZPL) to handle i/o errors
+ * that are detected before they start making changes to the DMU state
+ * (i.e. now). Once the transaction has been assigned, and some DMU
+ * state has been changed, it can be difficult to recover from an i/o
+ * error (e.g. to undo the changes already made in memory at the DMU
+ * layer). Typically code to do so does not exist in the caller -- it
+ * assumes that the data has already been cached and thus i/o errors are
+ * not possible.
+ *
+ * It has been observed that the i/o initiated here can be a performance
+ * problem, and it appears to be optional, because we don't look at the
+ * data which is read. However, removing this read would only serve to
+ * move the work elsewhere (after the dmu_tx_assign()), where it may
+ * have a greater impact on performance (in addition to the impact on
+ * fault tolerance noted above).
+ */
+static int
+dmu_tx_check_ioerr(zio_t *zio, dnode_t *dn, int level, uint64_t blkid)
+{
+ int err;
+ dmu_buf_impl_t *db;
+
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ db = dbuf_hold_level(dn, level, blkid, FTAG);
+ rw_exit(&dn->dn_struct_rwlock);
+ if (db == NULL)
+ return (SET_ERROR(EIO));
+ err = dbuf_read(db, zio, DB_RF_CANFAIL | DB_RF_NOPREFETCH);
+ dbuf_rele(db, FTAG);
+ return (err);
+}
+
+/* ARGSUSED */
+static void
+dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
+{
+ dnode_t *dn = txh->txh_dnode;
+ int err = 0;
+
+ if (len == 0)
+ return;
+
+ (void) zfs_refcount_add_many(&txh->txh_space_towrite, len, FTAG);
+
+ if (dn == NULL)
+ return;
+
+ /*
+ * For i/o error checking, read the blocks that will be needed
+ * to perform the write: the first and last level-0 blocks (if
+ * they are not aligned, i.e. if they are partial-block writes),
+ * and all the level-1 blocks.
+ */
+ if (dn->dn_maxblkid == 0) {
+ if (off < dn->dn_datablksz &&
+ (off > 0 || len < dn->dn_datablksz)) {
+ err = dmu_tx_check_ioerr(NULL, dn, 0, 0);
+ if (err != 0) {
+ txh->txh_tx->tx_err = err;
+ }
+ }
+ } else {
+ zio_t *zio = zio_root(dn->dn_objset->os_spa,
+ NULL, NULL, ZIO_FLAG_CANFAIL);
+
+ /* first level-0 block */
+ uint64_t start = off >> dn->dn_datablkshift;
+ if (P2PHASE(off, dn->dn_datablksz) || len < dn->dn_datablksz) {
+ err = dmu_tx_check_ioerr(zio, dn, 0, start);
+ if (err != 0) {
+ txh->txh_tx->tx_err = err;
+ }
+ }
+
+ /* last level-0 block */
+ uint64_t end = (off + len - 1) >> dn->dn_datablkshift;
+ if (end != start && end <= dn->dn_maxblkid &&
+ P2PHASE(off + len, dn->dn_datablksz)) {
+ err = dmu_tx_check_ioerr(zio, dn, 0, end);
+ if (err != 0) {
+ txh->txh_tx->tx_err = err;
+ }
+ }
+
+ /* level-1 blocks */
+ if (dn->dn_nlevels > 1) {
+ int shft = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+ for (uint64_t i = (start >> shft) + 1;
+ i < end >> shft; i++) {
+ err = dmu_tx_check_ioerr(zio, dn, 1, i);
+ if (err != 0) {
+ txh->txh_tx->tx_err = err;
+ }
+ }
+ }
+
+ err = zio_wait(zio);
+ if (err != 0) {
+ txh->txh_tx->tx_err = err;
+ }
+ }
+}
+
+static void
+dmu_tx_count_dnode(dmu_tx_hold_t *txh)
+{
+ (void) zfs_refcount_add_many(&txh->txh_space_towrite,
+ DNODE_MIN_SIZE, FTAG);
+}
+
+void
+dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len)
+{
+ dmu_tx_hold_t *txh;
+
+ ASSERT0(tx->tx_txg);
+ ASSERT3U(len, <=, DMU_MAX_ACCESS);
+ ASSERT(len == 0 || UINT64_MAX - off >= len - 1);
+
+ txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
+ object, THT_WRITE, off, len);
+ if (txh != NULL) {
+ dmu_tx_count_write(txh, off, len);
+ dmu_tx_count_dnode(txh);
+ }
+}
+
+void
+dmu_tx_hold_write_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, int len)
+{
+ dmu_tx_hold_t *txh;
+
+ ASSERT0(tx->tx_txg);
+ ASSERT3U(len, <=, DMU_MAX_ACCESS);
+ ASSERT(len == 0 || UINT64_MAX - off >= len - 1);
+
+ txh = dmu_tx_hold_dnode_impl(tx, dn, THT_WRITE, off, len);
+ if (txh != NULL) {
+ dmu_tx_count_write(txh, off, len);
+ dmu_tx_count_dnode(txh);
+ }
+}
+
+/*
+ * This function marks the transaction as being a "net free". The end
+ * result is that refquotas will be disabled for this transaction, and
+ * this transaction will be able to use half of the pool space overhead
+ * (see dsl_pool_adjustedsize()). Therefore this function should only
+ * be called for transactions that we expect will not cause a net increase
+ * in the amount of space used (but it's OK if that is occasionally not true).
+ */
+void
+dmu_tx_mark_netfree(dmu_tx_t *tx)
+{
+ tx->tx_netfree = B_TRUE;
+}
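+
+/*
+ * Illustrative sketch (assumed caller code, not part of this file): a
+ * transaction that only frees data, e.g. punching a hole in or truncating
+ * an object, typically looks like:
+ *
+ *	tx = dmu_tx_create(os);
+ *	dmu_tx_hold_free(tx, object, off, len);
+ *	dmu_tx_mark_netfree(tx);
+ *	error = dmu_tx_assign(tx, TXG_WAIT);
+ *	if (error != 0) {
+ *		dmu_tx_abort(tx);
+ *		return (error);
+ *	}
+ *	... perform the frees under this tx ...
+ *	dmu_tx_commit(tx);
+ *
+ * "os", "object", "off" and "len" are placeholders for the caller's objset,
+ * object number and range, not names defined in this file.
+ */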
+
+static void
+dmu_tx_hold_free_impl(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
+{
+ dmu_tx_t *tx = txh->txh_tx;
+ dnode_t *dn = txh->txh_dnode;
+ int err;
+
+ ASSERT(tx->tx_txg == 0);
+
+ dmu_tx_count_dnode(txh);
+
+ if (off >= (dn->dn_maxblkid + 1) * dn->dn_datablksz)
+ return;
+ if (len == DMU_OBJECT_END)
+ len = (dn->dn_maxblkid + 1) * dn->dn_datablksz - off;
+
+ /*
+ * For i/o error checking, we read the first and last level-0
+ * blocks if they are not aligned, and all the level-1 blocks.
+ *
+ * Note: dbuf_free_range() assumes that we have not instantiated
+ * any level-0 dbufs that will be completely freed. Therefore we must
+ * exercise care to not read or count the first and last blocks
+ * if they are blocksize-aligned.
+ */
+ if (dn->dn_datablkshift == 0) {
+ if (off != 0 || len < dn->dn_datablksz)
+ dmu_tx_count_write(txh, 0, dn->dn_datablksz);
+ } else {
+ /* first block will be modified if it is not aligned */
+ if (!IS_P2ALIGNED(off, 1 << dn->dn_datablkshift))
+ dmu_tx_count_write(txh, off, 1);
+ /* last block will be modified if it is not aligned */
+ if (!IS_P2ALIGNED(off + len, 1 << dn->dn_datablkshift))
+ dmu_tx_count_write(txh, off + len, 1);
+ }
+
+ /*
+ * Check level-1 blocks.
+ */
+ if (dn->dn_nlevels > 1) {
+ int shift = dn->dn_datablkshift + dn->dn_indblkshift -
+ SPA_BLKPTRSHIFT;
+ uint64_t start = off >> shift;
+ uint64_t end = (off + len) >> shift;
+
+ ASSERT(dn->dn_indblkshift != 0);
+
+ /*
+ * dnode_reallocate() can result in an object with indirect
+ * blocks having an odd data block size. In this case,
+ * just check the single block.
+ */
+ if (dn->dn_datablkshift == 0)
+ start = end = 0;
+
+ zio_t *zio = zio_root(tx->tx_pool->dp_spa,
+ NULL, NULL, ZIO_FLAG_CANFAIL);
+ for (uint64_t i = start; i <= end; i++) {
+ uint64_t ibyte = i << shift;
+ err = dnode_next_offset(dn, 0, &ibyte, 2, 1, 0);
+ i = ibyte >> shift;
+ if (err == ESRCH || i > end)
+ break;
+ if (err != 0) {
+ tx->tx_err = err;
+ (void) zio_wait(zio);
+ return;
+ }
+
+ (void) zfs_refcount_add_many(&txh->txh_memory_tohold,
+ 1 << dn->dn_indblkshift, FTAG);
+
+ err = dmu_tx_check_ioerr(zio, dn, 1, i);
+ if (err != 0) {
+ tx->tx_err = err;
+ (void) zio_wait(zio);
+ return;
+ }
+ }
+ err = zio_wait(zio);
+ if (err != 0) {
+ tx->tx_err = err;
+ return;
+ }
+ }
+}
+
+void
+dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len)
+{
+ dmu_tx_hold_t *txh;
+
+ txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
+ object, THT_FREE, off, len);
+ if (txh != NULL)
+ (void) dmu_tx_hold_free_impl(txh, off, len);
+}
+
+void
+dmu_tx_hold_free_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, uint64_t len)
+{
+ dmu_tx_hold_t *txh;
+
+ txh = dmu_tx_hold_dnode_impl(tx, dn, THT_FREE, off, len);
+ if (txh != NULL)
+ (void) dmu_tx_hold_free_impl(txh, off, len);
+}
+
+static void
+dmu_tx_hold_zap_impl(dmu_tx_hold_t *txh, const char *name)
+{
+ dmu_tx_t *tx = txh->txh_tx;
+ dnode_t *dn = txh->txh_dnode;
+ int err;
+
+ ASSERT(tx->tx_txg == 0);
+
+ dmu_tx_count_dnode(txh);
+
+ /*
+	 * Modifying an almost-full microzap is around the worst case (128KB).
+ *
+ * If it is a fat zap, the worst case would be 7*16KB=112KB:
+ * - 3 blocks overwritten: target leaf, ptrtbl block, header block
+ * - 4 new blocks written if adding:
+ * - 2 blocks for possibly split leaves,
+ * - 2 grown ptrtbl blocks
+ */
+ (void) zfs_refcount_add_many(&txh->txh_space_towrite,
+ MZAP_MAX_BLKSZ, FTAG);
+
+ if (dn == NULL)
+ return;
+
+ ASSERT3U(DMU_OT_BYTESWAP(dn->dn_type), ==, DMU_BSWAP_ZAP);
+
+ if (dn->dn_maxblkid == 0 || name == NULL) {
+ /*
+ * This is a microzap (only one block), or we don't know
+ * the name. Check the first block for i/o errors.
+ */
+ err = dmu_tx_check_ioerr(NULL, dn, 0, 0);
+ if (err != 0) {
+ tx->tx_err = err;
+ }
+ } else {
+ /*
+ * Access the name so that we'll check for i/o errors to
+ * the leaf blocks, etc. We ignore ENOENT, as this name
+ * may not yet exist.
+ */
+ err = zap_lookup_by_dnode(dn, name, 8, 0, NULL);
+ if (err == EIO || err == ECKSUM || err == ENXIO) {
+ tx->tx_err = err;
+ }
+ }
+}
+
+void
+dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name)
+{
+ dmu_tx_hold_t *txh;
+
+ ASSERT0(tx->tx_txg);
+
+ txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
+ object, THT_ZAP, add, (uintptr_t)name);
+ if (txh != NULL)
+ dmu_tx_hold_zap_impl(txh, name);
+}
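+
+/*
+ * Illustrative sketch (assumed ZPL-style caller, not part of this file):
+ * adding an entry to a directory modifies the directory's ZAP object, so
+ * the caller holds that ZAP by object number before assigning the tx:
+ *
+ *	tx = dmu_tx_create(os);
+ *	dmu_tx_hold_zap(tx, dir_object, B_TRUE, name);
+ *	dmu_tx_hold_bonus(tx, file_object);
+ *	error = dmu_tx_assign(tx, TXG_WAIT);
+ *
+ * "dir_object" and "file_object" are placeholders for the caller's own
+ * object numbers.  Passing the entry name (rather than NULL) lets
+ * dmu_tx_hold_zap_impl() above preread the relevant leaf blocks for i/o
+ * error checking.
+ */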
+
+void
+dmu_tx_hold_zap_by_dnode(dmu_tx_t *tx, dnode_t *dn, int add, const char *name)
+{
+ dmu_tx_hold_t *txh;
+
+ ASSERT0(tx->tx_txg);
+ ASSERT(dn != NULL);
+
+ txh = dmu_tx_hold_dnode_impl(tx, dn, THT_ZAP, add, (uintptr_t)name);
+ if (txh != NULL)
+ dmu_tx_hold_zap_impl(txh, name);
+}
+
+void
+dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object)
+{
+ dmu_tx_hold_t *txh;
+
+ ASSERT(tx->tx_txg == 0);
+
+ txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
+ object, THT_BONUS, 0, 0);
+ if (txh)
+ dmu_tx_count_dnode(txh);
+}
+
+void
+dmu_tx_hold_bonus_by_dnode(dmu_tx_t *tx, dnode_t *dn)
+{
+ dmu_tx_hold_t *txh;
+
+ ASSERT0(tx->tx_txg);
+
+ txh = dmu_tx_hold_dnode_impl(tx, dn, THT_BONUS, 0, 0);
+ if (txh)
+ dmu_tx_count_dnode(txh);
+}
+
+void
+dmu_tx_hold_space(dmu_tx_t *tx, uint64_t space)
+{
+ dmu_tx_hold_t *txh;
+
+ ASSERT(tx->tx_txg == 0);
+
+ txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
+ DMU_NEW_OBJECT, THT_SPACE, space, 0);
+ if (txh) {
+ (void) zfs_refcount_add_many(
+ &txh->txh_space_towrite, space, FTAG);
+ }
+}
+
+#ifdef ZFS_DEBUG
+void
+dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db)
+{
+ boolean_t match_object = B_FALSE;
+ boolean_t match_offset = B_FALSE;
+
+ DB_DNODE_ENTER(db);
+ dnode_t *dn = DB_DNODE(db);
+ ASSERT(tx->tx_txg != 0);
+ ASSERT(tx->tx_objset == NULL || dn->dn_objset == tx->tx_objset);
+ ASSERT3U(dn->dn_object, ==, db->db.db_object);
+
+ if (tx->tx_anyobj) {
+ DB_DNODE_EXIT(db);
+ return;
+ }
+
+ /* XXX No checking on the meta dnode for now */
+ if (db->db.db_object == DMU_META_DNODE_OBJECT) {
+ DB_DNODE_EXIT(db);
+ return;
+ }
+
+ for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); txh != NULL;
+ txh = list_next(&tx->tx_holds, txh)) {
+ ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
+ if (txh->txh_dnode == dn && txh->txh_type != THT_NEWOBJECT)
+ match_object = TRUE;
+ if (txh->txh_dnode == NULL || txh->txh_dnode == dn) {
+ int datablkshift = dn->dn_datablkshift ?
+ dn->dn_datablkshift : SPA_MAXBLOCKSHIFT;
+ int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+ int shift = datablkshift + epbs * db->db_level;
+ uint64_t beginblk = shift >= 64 ? 0 :
+ (txh->txh_arg1 >> shift);
+ uint64_t endblk = shift >= 64 ? 0 :
+ ((txh->txh_arg1 + txh->txh_arg2 - 1) >> shift);
+ uint64_t blkid = db->db_blkid;
+
+ /* XXX txh_arg2 better not be zero... */
+
+ dprintf("found txh type %x beginblk=%llx endblk=%llx\n",
+ txh->txh_type, beginblk, endblk);
+
+ switch (txh->txh_type) {
+ case THT_WRITE:
+ if (blkid >= beginblk && blkid <= endblk)
+ match_offset = TRUE;
+ /*
+ * We will let this hold work for the bonus
+ * or spill buffer so that we don't need to
+ * hold it when creating a new object.
+ */
+ if (blkid == DMU_BONUS_BLKID ||
+ blkid == DMU_SPILL_BLKID)
+ match_offset = TRUE;
+ /*
+ * They might have to increase nlevels,
+			 * thus dirtying the new TLIBs.  Or they
+			 * might have to change the block size,
+			 * thus dirtying the new lvl=0 blk=0.
+ */
+ if (blkid == 0)
+ match_offset = TRUE;
+ break;
+ case THT_FREE:
+ /*
+ * We will dirty all the level 1 blocks in
+ * the free range and perhaps the first and
+ * last level 0 block.
+ */
+ if (blkid >= beginblk && (blkid <= endblk ||
+ txh->txh_arg2 == DMU_OBJECT_END))
+ match_offset = TRUE;
+ break;
+ case THT_SPILL:
+ if (blkid == DMU_SPILL_BLKID)
+ match_offset = TRUE;
+ break;
+ case THT_BONUS:
+ if (blkid == DMU_BONUS_BLKID)
+ match_offset = TRUE;
+ break;
+ case THT_ZAP:
+ match_offset = TRUE;
+ break;
+ case THT_NEWOBJECT:
+ match_object = TRUE;
+ break;
+ default:
+ cmn_err(CE_PANIC, "bad txh_type %d",
+ txh->txh_type);
+ }
+ }
+ if (match_object && match_offset) {
+ DB_DNODE_EXIT(db);
+ return;
+ }
+ }
+ DB_DNODE_EXIT(db);
+ panic("dirtying dbuf obj=%llx lvl=%u blkid=%llx but not tx_held\n",
+ (u_longlong_t)db->db.db_object, db->db_level,
+ (u_longlong_t)db->db_blkid);
+}
+#endif
+
+/*
+ * If we can't do 10 iops, something is wrong. Let us go ahead
+ * and hit zfs_dirty_data_max.
+ */
+hrtime_t zfs_delay_max_ns = 100 * MICROSEC; /* 100 milliseconds */
+int zfs_delay_resolution_ns = 100 * 1000; /* 100 microseconds */
+
+/*
+ * We delay transactions when we've determined that the backend storage
+ * isn't able to accommodate the rate of incoming writes.
+ *
+ * If there is already a transaction waiting, we delay relative to when
+ * that transaction finishes waiting. This way the calculated min_time
+ * is independent of the number of threads concurrently executing
+ * transactions.
+ *
+ * If we are the only waiter, wait relative to when the transaction
+ * started, rather than the current time. This credits the transaction for
+ * "time already served", e.g. reading indirect blocks.
+ *
+ * The minimum time for a transaction to take is calculated as:
+ * min_time = scale * (dirty - min) / (max - dirty)
+ * min_time is then capped at zfs_delay_max_ns.
+ *
+ * The delay has two degrees of freedom that can be adjusted via tunables.
+ * The percentage of dirty data at which we start to delay is defined by
+ * zfs_delay_min_dirty_percent. This should typically be at or above
+ * zfs_vdev_async_write_active_max_dirty_percent so that we only start to
+ * delay after writing at full speed has failed to keep up with the incoming
+ * write rate. The scale of the curve is defined by zfs_delay_scale. Roughly
+ * speaking, this variable determines the amount of delay at the midpoint of
+ * the curve.
+ *
+ * delay
+ * 10ms +-------------------------------------------------------------*+
+ * | *|
+ * 9ms + *+
+ * | *|
+ * 8ms + *+
+ * | * |
+ * 7ms + * +
+ * | * |
+ * 6ms + * +
+ * | * |
+ * 5ms + * +
+ * | * |
+ * 4ms + * +
+ * | * |
+ * 3ms + * +
+ * | * |
+ * 2ms + (midpoint) * +
+ * | | ** |
+ * 1ms + v *** +
+ * | zfs_delay_scale ----------> ******** |
+ * 0 +-------------------------------------*********----------------+
+ * 0% <- zfs_dirty_data_max -> 100%
+ *
+ * Note that since the delay is added to the outstanding time remaining on the
+ * most recent transaction, the delay is effectively the inverse of IOPS.
+ * Here the midpoint of 500us translates to 2000 IOPS. The shape of the curve
+ * was chosen such that small changes in the amount of accumulated dirty data
+ * in the first 3/4 of the curve yield relatively small differences in the
+ * amount of delay.
+ *
+ * The effects can be easier to understand when the amount of delay is
+ * represented on a log scale:
+ *
+ * delay
+ * 100ms +-------------------------------------------------------------++
+ * + +
+ * | |
+ * + *+
+ * 10ms + *+
+ * + ** +
+ * | (midpoint) ** |
+ * + | ** +
+ * 1ms + v **** +
+ * + zfs_delay_scale ----------> ***** +
+ * | **** |
+ * + **** +
+ * 100us + ** +
+ * + * +
+ * | * |
+ * + * +
+ * 10us + * +
+ * + +
+ * | |
+ * + +
+ * +--------------------------------------------------------------+
+ * 0% <- zfs_dirty_data_max -> 100%
+ *
+ * Note here that only as the amount of dirty data approaches its limit does
+ * the delay start to increase rapidly. The goal of a properly tuned system
+ * should be to keep the amount of dirty data out of that range by first
+ * ensuring that the appropriate limits are set for the I/O scheduler to reach
+ * optimal throughput on the backend storage, and then by changing the value
+ * of zfs_delay_scale to increase the steepness of the curve.
+ */
+static void
+dmu_tx_delay(dmu_tx_t *tx, uint64_t dirty)
+{
+ dsl_pool_t *dp = tx->tx_pool;
+ uint64_t delay_min_bytes =
+ zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100;
+ hrtime_t wakeup, min_tx_time, now;
+
+ if (dirty <= delay_min_bytes)
+ return;
+
+ /*
+ * The caller has already waited until we are under the max.
+ * We make them pass us the amount of dirty data so we don't
+ * have to handle the case of it being >= the max, which could
+ * cause a divide-by-zero if it's == the max.
+ */
+ ASSERT3U(dirty, <, zfs_dirty_data_max);
+
+ now = gethrtime();
+ min_tx_time = zfs_delay_scale *
+ (dirty - delay_min_bytes) / (zfs_dirty_data_max - dirty);
+ min_tx_time = MIN(min_tx_time, zfs_delay_max_ns);
+ if (now > tx->tx_start + min_tx_time)
+ return;
+
+ DTRACE_PROBE3(delay__mintime, dmu_tx_t *, tx, uint64_t, dirty,
+ uint64_t, min_tx_time);
+
+ mutex_enter(&dp->dp_lock);
+ wakeup = MAX(tx->tx_start + min_tx_time,
+ dp->dp_last_wakeup + min_tx_time);
+ dp->dp_last_wakeup = wakeup;
+ mutex_exit(&dp->dp_lock);
+
+ zfs_sleep_until(wakeup);
+}
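+
+/*
+ * Worked example (illustrative only, not part of the original comment):
+ * assume zfs_dirty_data_max = 4 GB and the usual defaults
+ * zfs_delay_min_dirty_percent = 60 and zfs_delay_scale = 500,000 ns, so
+ * delay_min_bytes = 2.4 GB.
+ *
+ *	dirty = 3.2 GB (midway between 60% and 100%):
+ *	    min_tx_time = 500,000 * (3.2 - 2.4) / (4.0 - 3.2) = 500 us,
+ *	    i.e. roughly 2000 IOPS, the "midpoint" mentioned above.
+ *
+ *	dirty = 3.8 GB (95% of the limit):
+ *	    min_tx_time = 500,000 * (3.8 - 2.4) / (4.0 - 3.8) = 3.5 ms.
+ *
+ * Both values are still well under the zfs_delay_max_ns cap (100 ms),
+ * which is only reached as dirty data closely approaches the limit.
+ */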
+
+/*
+ * This routine attempts to assign the transaction to a transaction group.
+ * To do so, we must determine if there is sufficient free space on disk.
+ *
+ * If this is a "netfree" transaction (i.e. we called dmu_tx_mark_netfree()
+ * on it), then it is assumed that there is sufficient free space,
+ * unless there's insufficient slop space in the pool (see the comment
+ * above spa_slop_shift in spa_misc.c).
+ *
+ * If it is not a "netfree" transaction, then if the data already on disk
+ * is over the allowed usage (e.g. quota), this will fail with EDQUOT or
+ * ENOSPC. Otherwise, if the current rough estimate of pending changes,
+ * plus the rough estimate of this transaction's changes, may exceed the
+ * allowed usage, then this will fail with ERESTART, which will cause the
+ * caller to wait for the pending changes to be written to disk (by waiting
+ * for the next TXG to open), and then check the space usage again.
+ *
+ * The rough estimate of pending changes is comprised of the sum of:
+ *
+ * - this transaction's holds' txh_space_towrite
+ *
+ * - dd_tempreserved[], which is the sum of in-flight transactions'
+ * holds' txh_space_towrite (i.e. those transactions that have called
+ * dmu_tx_assign() but not yet called dmu_tx_commit()).
+ *
+ * - dd_space_towrite[], which is the amount of dirtied dbufs.
+ *
+ * Note that all of these values are inflated by spa_get_worst_case_asize(),
+ * which means that we may get ERESTART well before we are actually in danger
+ * of running out of space, but this also mitigates any small inaccuracies
+ * in the rough estimate (e.g. txh_space_towrite doesn't take into account
+ * indirect blocks, and dd_space_towrite[] doesn't take into account changes
+ * to the MOS).
+ *
+ * Note that due to this algorithm, it is possible to exceed the allowed
+ * usage by one transaction. Also, as we approach the allowed usage,
+ * we will allow a very limited amount of changes into each TXG, thus
+ * decreasing performance.
+ */
+static int
+dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how)
+{
+ spa_t *spa = tx->tx_pool->dp_spa;
+
+ ASSERT0(tx->tx_txg);
+
+ if (tx->tx_err) {
+ DMU_TX_STAT_BUMP(dmu_tx_error);
+ return (tx->tx_err);
+ }
+
+ if (spa_suspended(spa)) {
+ DMU_TX_STAT_BUMP(dmu_tx_suspended);
+
+ /*
+ * If the user has indicated a blocking failure mode
+ * then return ERESTART which will block in dmu_tx_wait().
+ * Otherwise, return EIO so that an error can get
+ * propagated back to the VOP calls.
+ *
+ * Note that we always honor the txg_how flag regardless
+ * of the failuremode setting.
+ */
+ if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE &&
+ !(txg_how & TXG_WAIT))
+ return (SET_ERROR(EIO));
+
+ return (SET_ERROR(ERESTART));
+ }
+
+ if (!tx->tx_dirty_delayed &&
+ dsl_pool_need_dirty_delay(tx->tx_pool)) {
+ tx->tx_wait_dirty = B_TRUE;
+ DMU_TX_STAT_BUMP(dmu_tx_dirty_delay);
+ return (SET_ERROR(ERESTART));
+ }
+
+ tx->tx_txg = txg_hold_open(tx->tx_pool, &tx->tx_txgh);
+ tx->tx_needassign_txh = NULL;
+
+ /*
+ * NB: No error returns are allowed after txg_hold_open, but
+ * before processing the dnode holds, due to the
+ * dmu_tx_unassign() logic.
+ */
+
+ uint64_t towrite = 0;
+ uint64_t tohold = 0;
+ for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); txh != NULL;
+ txh = list_next(&tx->tx_holds, txh)) {
+ dnode_t *dn = txh->txh_dnode;
+ if (dn != NULL) {
+ /*
+ * This thread can't hold the dn_struct_rwlock
+ * while assigning the tx, because this can lead to
+ * deadlock. Specifically, if this dnode is already
+ * assigned to an earlier txg, this thread may need
+ * to wait for that txg to sync (the ERESTART case
+ * below). The other thread that has assigned this
+ * dnode to an earlier txg prevents this txg from
+ * syncing until its tx can complete (calling
+ * dmu_tx_commit()), but it may need to acquire the
+ * dn_struct_rwlock to do so (e.g. via
+ * dmu_buf_hold*()).
+ *
+ * Note that this thread can't hold the lock for
+ * read either, but the rwlock doesn't record
+ * enough information to make that assertion.
+ */
+ ASSERT(!RW_WRITE_HELD(&dn->dn_struct_rwlock));
+
+ mutex_enter(&dn->dn_mtx);
+ if (dn->dn_assigned_txg == tx->tx_txg - 1) {
+ mutex_exit(&dn->dn_mtx);
+ tx->tx_needassign_txh = txh;
+ DMU_TX_STAT_BUMP(dmu_tx_group);
+ return (SET_ERROR(ERESTART));
+ }
+ if (dn->dn_assigned_txg == 0)
+ dn->dn_assigned_txg = tx->tx_txg;
+ ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
+ (void) zfs_refcount_add(&dn->dn_tx_holds, tx);
+ mutex_exit(&dn->dn_mtx);
+ }
+ towrite += zfs_refcount_count(&txh->txh_space_towrite);
+ tohold += zfs_refcount_count(&txh->txh_memory_tohold);
+ }
+
+ /* needed allocation: worst-case estimate of write space */
+ uint64_t asize = spa_get_worst_case_asize(tx->tx_pool->dp_spa, towrite);
+ /* calculate memory footprint estimate */
+ uint64_t memory = towrite + tohold;
+
+ if (tx->tx_dir != NULL && asize != 0) {
+ int err = dsl_dir_tempreserve_space(tx->tx_dir, memory,
+ asize, tx->tx_netfree, &tx->tx_tempreserve_cookie, tx);
+ if (err != 0)
+ return (err);
+ }
+
+ DMU_TX_STAT_BUMP(dmu_tx_assigned);
+
+ return (0);
+}
+
+static void
+dmu_tx_unassign(dmu_tx_t *tx)
+{
+ if (tx->tx_txg == 0)
+ return;
+
+ txg_rele_to_quiesce(&tx->tx_txgh);
+
+ /*
+ * Walk the transaction's hold list, removing the hold on the
+ * associated dnode, and notifying waiters if the refcount drops to 0.
+ */
+ for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds);
+ txh && txh != tx->tx_needassign_txh;
+ txh = list_next(&tx->tx_holds, txh)) {
+ dnode_t *dn = txh->txh_dnode;
+
+ if (dn == NULL)
+ continue;
+ mutex_enter(&dn->dn_mtx);
+ ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
+
+ if (zfs_refcount_remove(&dn->dn_tx_holds, tx) == 0) {
+ dn->dn_assigned_txg = 0;
+ cv_broadcast(&dn->dn_notxholds);
+ }
+ mutex_exit(&dn->dn_mtx);
+ }
+
+ txg_rele_to_sync(&tx->tx_txgh);
+
+ tx->tx_lasttried_txg = tx->tx_txg;
+ tx->tx_txg = 0;
+}
+
+/*
+ * Assign tx to a transaction group; txg_how is a bitmask:
+ *
+ * If TXG_WAIT is set and the currently open txg is full, this function
+ * will wait until there's a new txg. This should be used when no locks
+ * are being held. With this bit set, this function will only fail if
+ * we're truly out of space (or over quota).
+ *
+ * If TXG_WAIT is *not* set and we can't assign into the currently open
+ * txg without blocking, this function will return immediately with
+ * ERESTART. This should be used whenever locks are being held. On an
+ * ERESTART error, the caller should drop all locks, call dmu_tx_wait(),
+ * and try again.
+ *
+ * If TXG_NOTHROTTLE is set, this indicates that this tx should not be
+ * delayed due to the ZFS Write Throttle (see comments in dsl_pool.c for
+ * details on the throttle). This is used by the VFS operations, after
+ * they have already called dmu_tx_wait() (though most likely on a
+ * different tx).
+ *
+ * It is guaranteed that subsequent successful calls to dmu_tx_assign()
+ * will assign the tx to monotonically increasing txgs. Of course this is
+ * not strong monotonicity, because the same txg can be returned multiple
+ * times in a row. This guarantee holds both for subsequent calls from
+ * one thread and for multiple threads. For example, it is impossible to
+ * observe the following sequence of events:
+ *
+ * Thread 1 Thread 2
+ *
+ * dmu_tx_assign(T1, ...)
+ * 1 <- dmu_tx_get_txg(T1)
+ * dmu_tx_assign(T2, ...)
+ * 2 <- dmu_tx_get_txg(T2)
+ * dmu_tx_assign(T3, ...)
+ * 1 <- dmu_tx_get_txg(T3)
+ */
+int
+dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how)
+{
+ int err;
+
+ ASSERT(tx->tx_txg == 0);
+ ASSERT0(txg_how & ~(TXG_WAIT | TXG_NOTHROTTLE));
+ ASSERT(!dsl_pool_sync_context(tx->tx_pool));
+
+ /* If we might wait, we must not hold the config lock. */
+ IMPLY((txg_how & TXG_WAIT), !dsl_pool_config_held(tx->tx_pool));
+
+ if ((txg_how & TXG_NOTHROTTLE))
+ tx->tx_dirty_delayed = B_TRUE;
+
+ while ((err = dmu_tx_try_assign(tx, txg_how)) != 0) {
+ dmu_tx_unassign(tx);
+
+ if (err != ERESTART || !(txg_how & TXG_WAIT))
+ return (err);
+
+ dmu_tx_wait(tx);
+ }
+
+ txg_rele_to_quiesce(&tx->tx_txgh);
+
+ return (0);
+}
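+
+/*
+ * Illustrative sketch (assumed ZPL-style caller, not part of this file) of
+ * the retry pattern described above; "waited" and the "top" label are
+ * placeholders for the caller's own state and restart point:
+ *
+ *	top:
+ *		tx = dmu_tx_create(os);
+ *		dmu_tx_hold_write(tx, object, off, len);
+ *		error = dmu_tx_assign(tx,
+ *		    (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
+ *		if (error != 0) {
+ *			if (error == ERESTART) {
+ *				waited = B_TRUE;
+ *				... drop the caller's locks ...
+ *				dmu_tx_wait(tx);
+ *				dmu_tx_abort(tx);
+ *				goto top;
+ *			}
+ *			dmu_tx_abort(tx);
+ *			return (error);
+ *		}
+ *		... dirty the held buffers ...
+ *		dmu_tx_commit(tx);
+ *
+ * A caller that holds no locks can instead pass TXG_WAIT and skip the
+ * ERESTART handling entirely.
+ */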
+
+void
+dmu_tx_wait(dmu_tx_t *tx)
+{
+ spa_t *spa = tx->tx_pool->dp_spa;
+ dsl_pool_t *dp = tx->tx_pool;
+ hrtime_t before;
+
+ ASSERT(tx->tx_txg == 0);
+ ASSERT(!dsl_pool_config_held(tx->tx_pool));
+
+ before = gethrtime();
+
+ if (tx->tx_wait_dirty) {
+ uint64_t dirty;
+
+ /*
+ * dmu_tx_try_assign() has determined that we need to wait
+ * because we've consumed much or all of the dirty buffer
+ * space.
+ */
+ mutex_enter(&dp->dp_lock);
+ if (dp->dp_dirty_total >= zfs_dirty_data_max)
+ DMU_TX_STAT_BUMP(dmu_tx_dirty_over_max);
+ while (dp->dp_dirty_total >= zfs_dirty_data_max)
+ cv_wait(&dp->dp_spaceavail_cv, &dp->dp_lock);
+ dirty = dp->dp_dirty_total;
+ mutex_exit(&dp->dp_lock);
+
+ dmu_tx_delay(tx, dirty);
+
+ tx->tx_wait_dirty = B_FALSE;
+
+ /*
+ * Note: setting tx_dirty_delayed only has effect if the
+		 * caller used TXG_WAIT.  Otherwise they are going to
+		 * destroy this tx and try again.  The common case,
+		 * zfs_write(), uses TXG_WAIT.
+ */
+ tx->tx_dirty_delayed = B_TRUE;
+ } else if (spa_suspended(spa) || tx->tx_lasttried_txg == 0) {
+ /*
+ * If the pool is suspended we need to wait until it
+ * is resumed. Note that it's possible that the pool
+ * has become active after this thread has tried to
+ * obtain a tx. If that's the case then tx_lasttried_txg
+ * would not have been set.
+ */
+ txg_wait_synced(dp, spa_last_synced_txg(spa) + 1);
+ } else if (tx->tx_needassign_txh) {
+ dnode_t *dn = tx->tx_needassign_txh->txh_dnode;
+
+ mutex_enter(&dn->dn_mtx);
+ while (dn->dn_assigned_txg == tx->tx_lasttried_txg - 1)
+ cv_wait(&dn->dn_notxholds, &dn->dn_mtx);
+ mutex_exit(&dn->dn_mtx);
+ tx->tx_needassign_txh = NULL;
+ } else {
+ /*
+ * If we have a lot of dirty data just wait until we sync
+ * out a TXG at which point we'll hopefully have synced
+ * a portion of the changes.
+ */
+ txg_wait_synced(dp, spa_last_synced_txg(spa) + 1);
+ }
+
+ spa_tx_assign_add_nsecs(spa, gethrtime() - before);
+}
+
+static void
+dmu_tx_destroy(dmu_tx_t *tx)
+{
+ dmu_tx_hold_t *txh;
+
+ while ((txh = list_head(&tx->tx_holds)) != NULL) {
+ dnode_t *dn = txh->txh_dnode;
+
+ list_remove(&tx->tx_holds, txh);
+ zfs_refcount_destroy_many(&txh->txh_space_towrite,
+ zfs_refcount_count(&txh->txh_space_towrite));
+ zfs_refcount_destroy_many(&txh->txh_memory_tohold,
+ zfs_refcount_count(&txh->txh_memory_tohold));
+ kmem_free(txh, sizeof (dmu_tx_hold_t));
+ if (dn != NULL)
+ dnode_rele(dn, tx);
+ }
+
+ list_destroy(&tx->tx_callbacks);
+ list_destroy(&tx->tx_holds);
+ kmem_free(tx, sizeof (dmu_tx_t));
+}
+
+void
+dmu_tx_commit(dmu_tx_t *tx)
+{
+ ASSERT(tx->tx_txg != 0);
+
+ /*
+ * Go through the transaction's hold list and remove holds on
+ * associated dnodes, notifying waiters if no holds remain.
+ */
+ for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); txh != NULL;
+ txh = list_next(&tx->tx_holds, txh)) {
+ dnode_t *dn = txh->txh_dnode;
+
+ if (dn == NULL)
+ continue;
+
+ mutex_enter(&dn->dn_mtx);
+ ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
+
+ if (zfs_refcount_remove(&dn->dn_tx_holds, tx) == 0) {
+ dn->dn_assigned_txg = 0;
+ cv_broadcast(&dn->dn_notxholds);
+ }
+ mutex_exit(&dn->dn_mtx);
+ }
+
+ if (tx->tx_tempreserve_cookie)
+ dsl_dir_tempreserve_clear(tx->tx_tempreserve_cookie, tx);
+
+ if (!list_is_empty(&tx->tx_callbacks))
+ txg_register_callbacks(&tx->tx_txgh, &tx->tx_callbacks);
+
+ if (tx->tx_anyobj == FALSE)
+ txg_rele_to_sync(&tx->tx_txgh);
+
+ dmu_tx_destroy(tx);
+}
+
+void
+dmu_tx_abort(dmu_tx_t *tx)
+{
+ ASSERT(tx->tx_txg == 0);
+
+ /*
+ * Call any registered callbacks with an error code.
+ */
+ if (!list_is_empty(&tx->tx_callbacks))
+ dmu_tx_do_callbacks(&tx->tx_callbacks, SET_ERROR(ECANCELED));
+
+ dmu_tx_destroy(tx);
+}
+
+uint64_t
+dmu_tx_get_txg(dmu_tx_t *tx)
+{
+ ASSERT(tx->tx_txg != 0);
+ return (tx->tx_txg);
+}
+
+dsl_pool_t *
+dmu_tx_pool(dmu_tx_t *tx)
+{
+ ASSERT(tx->tx_pool != NULL);
+ return (tx->tx_pool);
+}
+
+void
+dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *func, void *data)
+{
+ dmu_tx_callback_t *dcb;
+
+ dcb = kmem_alloc(sizeof (dmu_tx_callback_t), KM_SLEEP);
+
+ dcb->dcb_func = func;
+ dcb->dcb_data = data;
+
+ list_insert_tail(&tx->tx_callbacks, dcb);
+}
+
+/*
+ * Call all the commit callbacks on a list, with a given error code.
+ */
+void
+dmu_tx_do_callbacks(list_t *cb_list, int error)
+{
+ dmu_tx_callback_t *dcb;
+
+ while ((dcb = list_tail(cb_list)) != NULL) {
+ list_remove(cb_list, dcb);
+ dcb->dcb_func(dcb->dcb_data, error);
+ kmem_free(dcb, sizeof (dmu_tx_callback_t));
+ }
+}
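+
+/*
+ * Illustrative sketch (hypothetical callback, not part of this file): a
+ * caller that needs to know when its change has reached stable storage can
+ * register a commit callback before committing the tx:
+ *
+ *	static void
+ *	my_commit_cb(void *arg, int error)
+ *	{
+ *		if (error == 0) {
+ *			... the txg containing the change has synced ...
+ *		} else {
+ *			... the tx was aborted (e.g. error == ECANCELED) ...
+ *		}
+ *	}
+ *
+ *	dmu_tx_callback_register(tx, my_commit_cb, arg);
+ *
+ * The callback is invoked with error 0 once the assigned txg has synced,
+ * or with an error code such as ECANCELED if the tx is aborted instead of
+ * committed (see dmu_tx_abort() above).
+ */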
+
+/*
+ * Interface to hold a bunch of attributes; used for creating new files.
+ * attrsize is the total size of all attributes to be added during object
+ * creation.
+ *
+ * For updating/adding a single attribute, dmu_tx_hold_sa() should be used.
+ */
+
+/*
+ * Hold the ZAP needed to register any attribute names that are not yet
+ * registered.  This should be a very rare case; if it does happen, it would
+ * only happen on the first write to the file system.
+ */
+static void
+dmu_tx_sa_registration_hold(sa_os_t *sa, dmu_tx_t *tx)
+{
+ if (!sa->sa_need_attr_registration)
+ return;
+
+ for (int i = 0; i != sa->sa_num_attrs; i++) {
+ if (!sa->sa_attr_table[i].sa_registered) {
+ if (sa->sa_reg_attr_obj)
+ dmu_tx_hold_zap(tx, sa->sa_reg_attr_obj,
+ B_TRUE, sa->sa_attr_table[i].sa_name);
+ else
+ dmu_tx_hold_zap(tx, DMU_NEW_OBJECT,
+ B_TRUE, sa->sa_attr_table[i].sa_name);
+ }
+ }
+}
+
+void
+dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object)
+{
+ dmu_tx_hold_t *txh;
+
+ txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, object,
+ THT_SPILL, 0, 0);
+ if (txh != NULL)
+ (void) zfs_refcount_add_many(&txh->txh_space_towrite,
+ SPA_OLD_MAXBLOCKSIZE, FTAG);
+}
+
+void
+dmu_tx_hold_sa_create(dmu_tx_t *tx, int attrsize)
+{
+ sa_os_t *sa = tx->tx_objset->os_sa;
+
+ dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
+
+ if (tx->tx_objset->os_sa->sa_master_obj == 0)
+ return;
+
+ if (tx->tx_objset->os_sa->sa_layout_attr_obj) {
+ dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL);
+ } else {
+ dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS);
+ dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY);
+ dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
+ dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
+ }
+
+ dmu_tx_sa_registration_hold(sa, tx);
+
+ if (attrsize <= DN_OLD_MAX_BONUSLEN && !sa->sa_force_spill)
+ return;
+
+ (void) dmu_tx_hold_object_impl(tx, tx->tx_objset, DMU_NEW_OBJECT,
+ THT_SPILL, 0, 0);
+}
+
+/*
+ * Hold SA attributes for an existing object.
+ *
+ * dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *hdl, boolean_t may_grow)
+ *
+ * may_grow indicates whether the variable-sized portion of the attributes
+ * may grow (or new attributes may be added), in which case the spill block
+ * and, if present, the SA layout ZAP are also held.
+ */
+void
+dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *hdl, boolean_t may_grow)
+{
+ uint64_t object;
+ sa_os_t *sa = tx->tx_objset->os_sa;
+
+ ASSERT(hdl != NULL);
+
+ object = sa_handle_object(hdl);
+
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)hdl->sa_bonus;
+ DB_DNODE_ENTER(db);
+ dmu_tx_hold_bonus_by_dnode(tx, DB_DNODE(db));
+ DB_DNODE_EXIT(db);
+
+ if (tx->tx_objset->os_sa->sa_master_obj == 0)
+ return;
+
+ if (tx->tx_objset->os_sa->sa_reg_attr_obj == 0 ||
+ tx->tx_objset->os_sa->sa_layout_attr_obj == 0) {
+ dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS);
+ dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY);
+ dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
+ dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
+ }
+
+ dmu_tx_sa_registration_hold(sa, tx);
+
+ if (may_grow && tx->tx_objset->os_sa->sa_layout_attr_obj)
+ dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL);
+
+ if (sa->sa_force_spill || may_grow || hdl->sa_spill) {
+ ASSERT(tx->tx_txg == 0);
+ dmu_tx_hold_spill(tx, object);
+ } else {
+ dnode_t *dn;
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ if (dn->dn_have_spill) {
+ ASSERT(tx->tx_txg == 0);
+ dmu_tx_hold_spill(tx, object);
+ }
+ DB_DNODE_EXIT(db);
+ }
+}
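+
+/*
+ * Illustrative sketch (assumed callers, not part of this file): updating
+ * existing fixed-size attributes (e.g. timestamps) on an object whose SA
+ * handle is "hdl" only needs
+ *
+ *	dmu_tx_hold_sa(tx, hdl, B_FALSE);
+ *
+ * while a caller that may add attributes or enlarge variable-sized ones
+ * passes may_grow = B_TRUE, so that the spill block (and, if present, the
+ * layout ZAP) is held as well.
+ */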
+
+void
+dmu_tx_init(void)
+{
+ dmu_tx_ksp = kstat_create("zfs", 0, "dmu_tx", "misc",
+ KSTAT_TYPE_NAMED, sizeof (dmu_tx_stats) / sizeof (kstat_named_t),
+ KSTAT_FLAG_VIRTUAL);
+
+ if (dmu_tx_ksp != NULL) {
+ dmu_tx_ksp->ks_data = &dmu_tx_stats;
+ kstat_install(dmu_tx_ksp);
+ }
+}
+
+void
+dmu_tx_fini(void)
+{
+ if (dmu_tx_ksp != NULL) {
+ kstat_delete(dmu_tx_ksp);
+ dmu_tx_ksp = NULL;
+ }
+}
+
+#if defined(_KERNEL)
+EXPORT_SYMBOL(dmu_tx_create);
+EXPORT_SYMBOL(dmu_tx_hold_write);
+EXPORT_SYMBOL(dmu_tx_hold_write_by_dnode);
+EXPORT_SYMBOL(dmu_tx_hold_free);
+EXPORT_SYMBOL(dmu_tx_hold_free_by_dnode);
+EXPORT_SYMBOL(dmu_tx_hold_zap);
+EXPORT_SYMBOL(dmu_tx_hold_zap_by_dnode);
+EXPORT_SYMBOL(dmu_tx_hold_bonus);
+EXPORT_SYMBOL(dmu_tx_hold_bonus_by_dnode);
+EXPORT_SYMBOL(dmu_tx_abort);
+EXPORT_SYMBOL(dmu_tx_assign);
+EXPORT_SYMBOL(dmu_tx_wait);
+EXPORT_SYMBOL(dmu_tx_commit);
+EXPORT_SYMBOL(dmu_tx_mark_netfree);
+EXPORT_SYMBOL(dmu_tx_get_txg);
+EXPORT_SYMBOL(dmu_tx_callback_register);
+EXPORT_SYMBOL(dmu_tx_do_callbacks);
+EXPORT_SYMBOL(dmu_tx_hold_spill);
+EXPORT_SYMBOL(dmu_tx_hold_sa_create);
+EXPORT_SYMBOL(dmu_tx_hold_sa);
+#endif
diff --git a/sys/contrib/openzfs/module/zfs/dmu_zfetch.c b/sys/contrib/openzfs/module/zfs/dmu_zfetch.c
new file mode 100644
index 000000000000..5d061fe3813e
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/dmu_zfetch.c
@@ -0,0 +1,471 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * Copyright (c) 2013, 2017 by Delphix. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/dnode.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_zfetch.h>
+#include <sys/dmu.h>
+#include <sys/dbuf.h>
+#include <sys/kstat.h>
+
+/*
+ * This tunable disables predictive prefetch. Note that it leaves "prescient"
+ * prefetch (e.g. prefetch for zfs send) intact. Unlike predictive prefetch,
+ * prescient prefetch never issues i/os that end up not being needed,
+ * so it can't hurt performance.
+ */
+
+int zfs_prefetch_disable = B_FALSE;
+
+/* max # of streams per zfetch */
+unsigned int zfetch_max_streams = 8;
+/* min time before stream reclaim */
+unsigned int zfetch_min_sec_reap = 2;
+/* max bytes to prefetch per stream (default 8MB) */
+unsigned int zfetch_max_distance = 8 * 1024 * 1024;
+/* max bytes to prefetch indirects for per stream (default 64MB) */
+unsigned int zfetch_max_idistance = 64 * 1024 * 1024;
+/* max number of bytes in an array_read in which we allow prefetching (1MB) */
+unsigned long zfetch_array_rd_sz = 1024 * 1024;
+
+typedef struct zfetch_stats {
+ kstat_named_t zfetchstat_hits;
+ kstat_named_t zfetchstat_misses;
+ kstat_named_t zfetchstat_max_streams;
+ kstat_named_t zfetchstat_max_completion_us;
+ kstat_named_t zfetchstat_last_completion_us;
+ kstat_named_t zfetchstat_io_issued;
+} zfetch_stats_t;
+
+static zfetch_stats_t zfetch_stats = {
+ { "hits", KSTAT_DATA_UINT64 },
+ { "misses", KSTAT_DATA_UINT64 },
+ { "max_streams", KSTAT_DATA_UINT64 },
+ { "max_completion_us", KSTAT_DATA_UINT64 },
+ { "last_completion_us", KSTAT_DATA_UINT64 },
+ { "io_issued", KSTAT_DATA_UINT64 },
+};
+
+#define ZFETCHSTAT_BUMP(stat) \
+ atomic_inc_64(&zfetch_stats.stat.value.ui64)
+#define ZFETCHSTAT_ADD(stat, val) \
+ atomic_add_64(&zfetch_stats.stat.value.ui64, val)
+#define ZFETCHSTAT_SET(stat, val) \
+ zfetch_stats.stat.value.ui64 = val
+#define ZFETCHSTAT_GET(stat) \
+ zfetch_stats.stat.value.ui64
+
+
+kstat_t *zfetch_ksp;
+
+void
+zfetch_init(void)
+{
+ zfetch_ksp = kstat_create("zfs", 0, "zfetchstats", "misc",
+ KSTAT_TYPE_NAMED, sizeof (zfetch_stats) / sizeof (kstat_named_t),
+ KSTAT_FLAG_VIRTUAL);
+
+ if (zfetch_ksp != NULL) {
+ zfetch_ksp->ks_data = &zfetch_stats;
+ kstat_install(zfetch_ksp);
+ }
+}
+
+void
+zfetch_fini(void)
+{
+ if (zfetch_ksp != NULL) {
+ kstat_delete(zfetch_ksp);
+ zfetch_ksp = NULL;
+ }
+}
+
+/*
+ * This takes a pointer to a zfetch structure and a dnode. It performs the
+ * necessary setup for the zfetch structure, grokking data from the
+ * associated dnode.
+ */
+void
+dmu_zfetch_init(zfetch_t *zf, dnode_t *dno)
+{
+ if (zf == NULL)
+ return;
+ zf->zf_dnode = dno;
+ zf->zf_numstreams = 0;
+
+ list_create(&zf->zf_stream, sizeof (zstream_t),
+ offsetof(zstream_t, zs_node));
+
+ mutex_init(&zf->zf_lock, NULL, MUTEX_DEFAULT, NULL);
+}
+
+static void
+dmu_zfetch_stream_fini(zstream_t *zs)
+{
+ mutex_destroy(&zs->zs_lock);
+ kmem_free(zs, sizeof (*zs));
+}
+
+static void
+dmu_zfetch_stream_remove(zfetch_t *zf, zstream_t *zs)
+{
+ ASSERT(MUTEX_HELD(&zf->zf_lock));
+ list_remove(&zf->zf_stream, zs);
+ dmu_zfetch_stream_fini(zs);
+ zf->zf_numstreams--;
+}
+
+static void
+dmu_zfetch_stream_orphan(zfetch_t *zf, zstream_t *zs)
+{
+ ASSERT(MUTEX_HELD(&zf->zf_lock));
+ list_remove(&zf->zf_stream, zs);
+ zs->zs_fetch = NULL;
+ zf->zf_numstreams--;
+}
+
+/*
+ * Clean-up state associated with a zfetch structure (e.g. destroy the
+ * streams). This doesn't free the zfetch_t itself; that's left to the caller.
+ */
+void
+dmu_zfetch_fini(zfetch_t *zf)
+{
+ zstream_t *zs;
+
+ mutex_enter(&zf->zf_lock);
+ while ((zs = list_head(&zf->zf_stream)) != NULL) {
+ if (zfs_refcount_count(&zs->zs_blocks) != 0)
+ dmu_zfetch_stream_orphan(zf, zs);
+ else
+ dmu_zfetch_stream_remove(zf, zs);
+ }
+ mutex_exit(&zf->zf_lock);
+ list_destroy(&zf->zf_stream);
+ mutex_destroy(&zf->zf_lock);
+
+ zf->zf_dnode = NULL;
+}
+
+/*
+ * If there aren't too many streams already, create a new stream.
+ * The "blkid" argument is the next block that we expect this stream to access.
+ * While we're here, clean up old streams (which haven't been
+ * accessed for at least zfetch_min_sec_reap seconds).
+ */
+static void
+dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid)
+{
+ zstream_t *zs_next;
+ hrtime_t now = gethrtime();
+
+ ASSERT(MUTEX_HELD(&zf->zf_lock));
+
+ /*
+ * Clean up old streams.
+ */
+ for (zstream_t *zs = list_head(&zf->zf_stream);
+ zs != NULL; zs = zs_next) {
+ zs_next = list_next(&zf->zf_stream, zs);
+ /*
+		 * Do not reclaim streams that still have outstanding
+		 * references (prefetch i/os in flight).
+ */
+ if (zfs_refcount_count(&zs->zs_blocks) != 0)
+ continue;
+ if (((now - zs->zs_atime) / NANOSEC) >
+ zfetch_min_sec_reap)
+ dmu_zfetch_stream_remove(zf, zs);
+ }
+
+ /*
+ * The maximum number of streams is normally zfetch_max_streams,
+ * but for small files we lower it such that it's at least possible
+ * for all the streams to be non-overlapping.
+ *
+ * If we are already at the maximum number of streams for this file,
+ * even after removing old streams, then don't create this stream.
+ */
+ uint32_t max_streams = MAX(1, MIN(zfetch_max_streams,
+ zf->zf_dnode->dn_maxblkid * zf->zf_dnode->dn_datablksz /
+ zfetch_max_distance));
+ if (zf->zf_numstreams >= max_streams) {
+ ZFETCHSTAT_BUMP(zfetchstat_max_streams);
+ return;
+ }
+
+ zstream_t *zs = kmem_zalloc(sizeof (*zs), KM_SLEEP);
+ zs->zs_blkid = blkid;
+ zs->zs_pf_blkid = blkid;
+ zs->zs_ipf_blkid = blkid;
+ zs->zs_atime = now;
+ zs->zs_fetch = zf;
+ zfs_refcount_create(&zs->zs_blocks);
+ mutex_init(&zs->zs_lock, NULL, MUTEX_DEFAULT, NULL);
+ zf->zf_numstreams++;
+ list_insert_head(&zf->zf_stream, zs);
+}
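+
+/*
+ * Illustrative arithmetic (not part of the original source), using the
+ * defaults above (zfetch_max_streams = 8, zfetch_max_distance = 8 MB) and
+ * a hypothetical 128 KB record size:
+ *
+ *	16 MB file: dn_maxblkid ~= 127, and 127 * 128 KB / 8 MB ~= 1,
+ *	    so max_streams = MAX(1, MIN(8, 1)) = 1;
+ *	1 GB file:  dn_maxblkid ~= 8191, and 8191 * 128 KB / 8 MB ~= 127,
+ *	    so max_streams = MIN(8, 127) = 8 (the normal cap).
+ */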
+
+static void
+dmu_zfetch_stream_done(void *arg, boolean_t io_issued)
+{
+ zstream_t *zs = arg;
+
+ if (zs->zs_start_time && io_issued) {
+ hrtime_t now = gethrtime();
+ hrtime_t delta = NSEC2USEC(now - zs->zs_start_time);
+
+ zs->zs_start_time = 0;
+ ZFETCHSTAT_SET(zfetchstat_last_completion_us, delta);
+ if (delta > ZFETCHSTAT_GET(zfetchstat_max_completion_us))
+ ZFETCHSTAT_SET(zfetchstat_max_completion_us, delta);
+ }
+
+ if (zfs_refcount_remove(&zs->zs_blocks, NULL) != 0)
+ return;
+
+ /*
+ * The parent fetch structure has gone away
+ */
+ if (zs->zs_fetch == NULL)
+ dmu_zfetch_stream_fini(zs);
+}
+
+/*
+ * This is the predictive prefetch entry point. It associates the dnode
+ * access specified by the blkid and nblks arguments with a prefetch stream,
+ * predicts further accesses based on that stream's stats, and initiates
+ * speculative prefetch. The fetch_data argument specifies whether actual
+ * data blocks should be fetched:
+ * FALSE -- prefetch only indirect blocks for predicted data blocks;
+ * TRUE -- prefetch predicted data blocks plus following indirect blocks.
+ */
+void
+dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data,
+ boolean_t have_lock)
+{
+ zstream_t *zs;
+ int64_t pf_start, ipf_start, ipf_istart, ipf_iend;
+ int64_t pf_ahead_blks, max_blks;
+ int epbs, max_dist_blks, pf_nblks, ipf_nblks, issued;
+ uint64_t end_of_access_blkid;
+ end_of_access_blkid = blkid + nblks;
+ spa_t *spa = zf->zf_dnode->dn_objset->os_spa;
+
+ if (zfs_prefetch_disable)
+ return;
+ /*
+ * If we haven't yet loaded the indirect vdevs' mappings, we
+ * can only read from blocks that we carefully ensure are on
+ * concrete vdevs (or previously-loaded indirect vdevs). So we
+ * can't allow the predictive prefetcher to attempt reads of other
+ * blocks (e.g. of the MOS's dnode object).
+ */
+ if (!spa_indirect_vdevs_loaded(spa))
+ return;
+
+ /*
+ * As a fast path for small (single-block) files, ignore access
+ * to the first block.
+ */
+ if (!have_lock && blkid == 0)
+ return;
+
+ if (!have_lock)
+ rw_enter(&zf->zf_dnode->dn_struct_rwlock, RW_READER);
+
+ /*
+ * A fast path for small files for which no prefetch will
+ * happen.
+ */
+ if (zf->zf_dnode->dn_maxblkid < 2) {
+ if (!have_lock)
+ rw_exit(&zf->zf_dnode->dn_struct_rwlock);
+ return;
+ }
+ mutex_enter(&zf->zf_lock);
+
+ /*
+ * Find matching prefetch stream. Depending on whether the accesses
+ * are block-aligned, first block of the new access may either follow
+ * the last block of the previous access, or be equal to it.
+ */
+ for (zs = list_head(&zf->zf_stream); zs != NULL;
+ zs = list_next(&zf->zf_stream, zs)) {
+ if (blkid == zs->zs_blkid || blkid + 1 == zs->zs_blkid) {
+ mutex_enter(&zs->zs_lock);
+ /*
+			 * zs_blkid could have changed before we
+			 * acquired zs_lock; re-check it here.
+ */
+ if (blkid == zs->zs_blkid) {
+ break;
+ } else if (blkid + 1 == zs->zs_blkid) {
+ blkid++;
+ nblks--;
+ if (nblks == 0) {
+ /* Already prefetched this before. */
+ mutex_exit(&zs->zs_lock);
+ mutex_exit(&zf->zf_lock);
+ if (!have_lock) {
+ rw_exit(&zf->zf_dnode->
+ dn_struct_rwlock);
+ }
+ return;
+ }
+ break;
+ }
+ mutex_exit(&zs->zs_lock);
+ }
+ }
+
+ if (zs == NULL) {
+ /*
+ * This access is not part of any existing stream. Create
+ * a new stream for it.
+ */
+ ZFETCHSTAT_BUMP(zfetchstat_misses);
+
+ dmu_zfetch_stream_create(zf, end_of_access_blkid);
+ mutex_exit(&zf->zf_lock);
+ if (!have_lock)
+ rw_exit(&zf->zf_dnode->dn_struct_rwlock);
+ return;
+ }
+
+ /*
+ * This access was to a block that we issued a prefetch for on
+ * behalf of this stream. Issue further prefetches for this stream.
+ *
+ * Normally, we start prefetching where we stopped
+ * prefetching last (zs_pf_blkid). But when we get our first
+ * hit on this stream, zs_pf_blkid == zs_blkid, we don't
+ * want to prefetch the block we just accessed. In this case,
+ * start just after the block we just accessed.
+ */
+ pf_start = MAX(zs->zs_pf_blkid, end_of_access_blkid);
+
+ /*
+ * Double our amount of prefetched data, but don't let the
+ * prefetch get further ahead than zfetch_max_distance.
+ */
+ if (fetch_data) {
+ max_dist_blks =
+ zfetch_max_distance >> zf->zf_dnode->dn_datablkshift;
+ /*
+ * Previously, we were (zs_pf_blkid - blkid) ahead. We
+ * want to now be double that, so read that amount again,
+ * plus the amount we are catching up by (i.e. the amount
+ * read just now).
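+		 *
+		 * Illustrative numbers (hypothetical): if the reader was at
+		 * blkid 100, we had prefetched through zs_pf_blkid = 110
+		 * (10 blocks ahead), and this access read nblks = 4, then
+		 * pf_ahead_blks = 110 - 100 + 4 = 14.  Issuing 14 more
+		 * blocks from pf_start = 110 moves zs_pf_blkid to 124,
+		 * i.e. 20 blocks ahead of the new position (104) -- double
+		 * the previous lead (ignoring the zfetch_max_distance cap).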
+ */
+ pf_ahead_blks = zs->zs_pf_blkid - blkid + nblks;
+ max_blks = max_dist_blks - (pf_start - end_of_access_blkid);
+ pf_nblks = MIN(pf_ahead_blks, max_blks);
+ } else {
+ pf_nblks = 0;
+ }
+
+ zs->zs_pf_blkid = pf_start + pf_nblks;
+
+ /*
+ * Do the same for indirects, starting from where we stopped last,
+ * or where we will stop reading data blocks (and the indirects
+ * that point to them).
+ */
+ ipf_start = MAX(zs->zs_ipf_blkid, zs->zs_pf_blkid);
+ max_dist_blks = zfetch_max_idistance >> zf->zf_dnode->dn_datablkshift;
+ /*
+ * We want to double our distance ahead of the data prefetch
+ * (or reader, if we are not prefetching data). Previously, we
+ * were (zs_ipf_blkid - blkid) ahead. To double that, we read
+ * that amount again, plus the amount we are catching up by
+ * (i.e. the amount read now + the amount of data prefetched now).
+ */
+ pf_ahead_blks = zs->zs_ipf_blkid - blkid + nblks + pf_nblks;
+ max_blks = max_dist_blks - (ipf_start - end_of_access_blkid);
+ ipf_nblks = MIN(pf_ahead_blks, max_blks);
+ zs->zs_ipf_blkid = ipf_start + ipf_nblks;
+
+ epbs = zf->zf_dnode->dn_indblkshift - SPA_BLKPTRSHIFT;
+ ipf_istart = P2ROUNDUP(ipf_start, 1 << epbs) >> epbs;
+ ipf_iend = P2ROUNDUP(zs->zs_ipf_blkid, 1 << epbs) >> epbs;
+
+ zs->zs_atime = gethrtime();
+ /* no prior reads in progress */
+ if (zfs_refcount_count(&zs->zs_blocks) == 0)
+ zs->zs_start_time = zs->zs_atime;
+ zs->zs_blkid = end_of_access_blkid;
+ zfs_refcount_add_many(&zs->zs_blocks, pf_nblks + ipf_iend - ipf_istart,
+ NULL);
+ mutex_exit(&zs->zs_lock);
+ mutex_exit(&zf->zf_lock);
+ issued = 0;
+
+ /*
+ * dbuf_prefetch() is asynchronous (even when it needs to read
+ * indirect blocks), but we still prefer to drop our locks before
+ * calling it to reduce the time we hold them.
+ */
+
+ for (int i = 0; i < pf_nblks; i++) {
+ issued += dbuf_prefetch_impl(zf->zf_dnode, 0, pf_start + i,
+ ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH,
+ dmu_zfetch_stream_done, zs);
+ }
+ for (int64_t iblk = ipf_istart; iblk < ipf_iend; iblk++) {
+ issued += dbuf_prefetch_impl(zf->zf_dnode, 1, iblk,
+ ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH,
+ dmu_zfetch_stream_done, zs);
+ }
+ if (!have_lock)
+ rw_exit(&zf->zf_dnode->dn_struct_rwlock);
+ ZFETCHSTAT_BUMP(zfetchstat_hits);
+
+ if (issued)
+ ZFETCHSTAT_ADD(zfetchstat_io_issued, issued);
+}
+
+/* BEGIN CSTYLED */
+ZFS_MODULE_PARAM(zfs_prefetch, zfs_prefetch_, disable, INT, ZMOD_RW,
+ "Disable all ZFS prefetching");
+
+ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_streams, UINT, ZMOD_RW,
+ "Max number of streams per zfetch");
+
+ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, min_sec_reap, UINT, ZMOD_RW,
+ "Min time before stream reclaim");
+
+ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_distance, UINT, ZMOD_RW,
+ "Max bytes to prefetch per stream");
+
+ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_idistance, UINT, ZMOD_RW,
+ "Max bytes to prefetch indirects for per stream");
+
+ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, array_rd_sz, ULONG, ZMOD_RW,
+	"Number of bytes in an array_read");
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/zfs/dnode.c b/sys/contrib/openzfs/module/zfs/dnode.c
new file mode 100644
index 000000000000..eaba9c0c0e7f
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/dnode.c
@@ -0,0 +1,2583 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/dbuf.h>
+#include <sys/dnode.h>
+#include <sys/dmu.h>
+#include <sys/dmu_impl.h>
+#include <sys/dmu_tx.h>
+#include <sys/dmu_objset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_dataset.h>
+#include <sys/spa.h>
+#include <sys/zio.h>
+#include <sys/dmu_zfetch.h>
+#include <sys/range_tree.h>
+#include <sys/trace_zfs.h>
+#include <sys/zfs_project.h>
+
+dnode_stats_t dnode_stats = {
+ { "dnode_hold_dbuf_hold", KSTAT_DATA_UINT64 },
+ { "dnode_hold_dbuf_read", KSTAT_DATA_UINT64 },
+ { "dnode_hold_alloc_hits", KSTAT_DATA_UINT64 },
+ { "dnode_hold_alloc_misses", KSTAT_DATA_UINT64 },
+ { "dnode_hold_alloc_interior", KSTAT_DATA_UINT64 },
+ { "dnode_hold_alloc_lock_retry", KSTAT_DATA_UINT64 },
+ { "dnode_hold_alloc_lock_misses", KSTAT_DATA_UINT64 },
+ { "dnode_hold_alloc_type_none", KSTAT_DATA_UINT64 },
+ { "dnode_hold_free_hits", KSTAT_DATA_UINT64 },
+ { "dnode_hold_free_misses", KSTAT_DATA_UINT64 },
+ { "dnode_hold_free_lock_misses", KSTAT_DATA_UINT64 },
+ { "dnode_hold_free_lock_retry", KSTAT_DATA_UINT64 },
+ { "dnode_hold_free_overflow", KSTAT_DATA_UINT64 },
+ { "dnode_hold_free_refcount", KSTAT_DATA_UINT64 },
+ { "dnode_free_interior_lock_retry", KSTAT_DATA_UINT64 },
+ { "dnode_allocate", KSTAT_DATA_UINT64 },
+ { "dnode_reallocate", KSTAT_DATA_UINT64 },
+ { "dnode_buf_evict", KSTAT_DATA_UINT64 },
+ { "dnode_alloc_next_chunk", KSTAT_DATA_UINT64 },
+ { "dnode_alloc_race", KSTAT_DATA_UINT64 },
+ { "dnode_alloc_next_block", KSTAT_DATA_UINT64 },
+ { "dnode_move_invalid", KSTAT_DATA_UINT64 },
+ { "dnode_move_recheck1", KSTAT_DATA_UINT64 },
+ { "dnode_move_recheck2", KSTAT_DATA_UINT64 },
+ { "dnode_move_special", KSTAT_DATA_UINT64 },
+ { "dnode_move_handle", KSTAT_DATA_UINT64 },
+ { "dnode_move_rwlock", KSTAT_DATA_UINT64 },
+ { "dnode_move_active", KSTAT_DATA_UINT64 },
+};
+
+static kstat_t *dnode_ksp;
+static kmem_cache_t *dnode_cache;
+
+static dnode_phys_t dnode_phys_zero __maybe_unused;
+
+int zfs_default_bs = SPA_MINBLOCKSHIFT;
+int zfs_default_ibs = DN_MAX_INDBLKSHIFT;
+
+#ifdef _KERNEL
+static kmem_cbrc_t dnode_move(void *, void *, size_t, void *);
+#endif /* _KERNEL */
+
+static int
+dbuf_compare(const void *x1, const void *x2)
+{
+ const dmu_buf_impl_t *d1 = x1;
+ const dmu_buf_impl_t *d2 = x2;
+
+ int cmp = TREE_CMP(d1->db_level, d2->db_level);
+ if (likely(cmp))
+ return (cmp);
+
+ cmp = TREE_CMP(d1->db_blkid, d2->db_blkid);
+ if (likely(cmp))
+ return (cmp);
+
+ if (d1->db_state == DB_SEARCH) {
+ ASSERT3S(d2->db_state, !=, DB_SEARCH);
+ return (-1);
+ } else if (d2->db_state == DB_SEARCH) {
+ ASSERT3S(d1->db_state, !=, DB_SEARCH);
+ return (1);
+ }
+
+ return (TREE_PCMP(d1, d2));
+}
+
+/* ARGSUSED */
+static int
+dnode_cons(void *arg, void *unused, int kmflag)
+{
+ dnode_t *dn = arg;
+ int i;
+
+ rw_init(&dn->dn_struct_rwlock, NULL, RW_NOLOCKDEP, NULL);
+ mutex_init(&dn->dn_mtx, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&dn->dn_dbufs_mtx, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&dn->dn_notxholds, NULL, CV_DEFAULT, NULL);
+ cv_init(&dn->dn_nodnholds, NULL, CV_DEFAULT, NULL);
+
+ /*
+ * Every dbuf has a reference, and dropping a tracked reference is
+ * O(number of references), so don't track dn_holds.
+ */
+ zfs_refcount_create_untracked(&dn->dn_holds);
+ zfs_refcount_create(&dn->dn_tx_holds);
+ list_link_init(&dn->dn_link);
+
+ bzero(&dn->dn_next_nblkptr[0], sizeof (dn->dn_next_nblkptr));
+ bzero(&dn->dn_next_nlevels[0], sizeof (dn->dn_next_nlevels));
+ bzero(&dn->dn_next_indblkshift[0], sizeof (dn->dn_next_indblkshift));
+ bzero(&dn->dn_next_bonustype[0], sizeof (dn->dn_next_bonustype));
+ bzero(&dn->dn_rm_spillblk[0], sizeof (dn->dn_rm_spillblk));
+ bzero(&dn->dn_next_bonuslen[0], sizeof (dn->dn_next_bonuslen));
+ bzero(&dn->dn_next_blksz[0], sizeof (dn->dn_next_blksz));
+ bzero(&dn->dn_next_maxblkid[0], sizeof (dn->dn_next_maxblkid));
+
+ for (i = 0; i < TXG_SIZE; i++) {
+ multilist_link_init(&dn->dn_dirty_link[i]);
+ dn->dn_free_ranges[i] = NULL;
+ list_create(&dn->dn_dirty_records[i],
+ sizeof (dbuf_dirty_record_t),
+ offsetof(dbuf_dirty_record_t, dr_dirty_node));
+ }
+
+ dn->dn_allocated_txg = 0;
+ dn->dn_free_txg = 0;
+ dn->dn_assigned_txg = 0;
+ dn->dn_dirty_txg = 0;
+ dn->dn_dirtyctx = 0;
+ dn->dn_dirtyctx_firstset = NULL;
+ dn->dn_bonus = NULL;
+ dn->dn_have_spill = B_FALSE;
+ dn->dn_zio = NULL;
+ dn->dn_oldused = 0;
+ dn->dn_oldflags = 0;
+ dn->dn_olduid = 0;
+ dn->dn_oldgid = 0;
+ dn->dn_oldprojid = ZFS_DEFAULT_PROJID;
+ dn->dn_newuid = 0;
+ dn->dn_newgid = 0;
+ dn->dn_newprojid = ZFS_DEFAULT_PROJID;
+ dn->dn_id_flags = 0;
+
+ dn->dn_dbufs_count = 0;
+ avl_create(&dn->dn_dbufs, dbuf_compare, sizeof (dmu_buf_impl_t),
+ offsetof(dmu_buf_impl_t, db_link));
+
+ dn->dn_moved = 0;
+ return (0);
+}
+
+/* ARGSUSED */
+static void
+dnode_dest(void *arg, void *unused)
+{
+ int i;
+ dnode_t *dn = arg;
+
+ rw_destroy(&dn->dn_struct_rwlock);
+ mutex_destroy(&dn->dn_mtx);
+ mutex_destroy(&dn->dn_dbufs_mtx);
+ cv_destroy(&dn->dn_notxholds);
+ cv_destroy(&dn->dn_nodnholds);
+ zfs_refcount_destroy(&dn->dn_holds);
+ zfs_refcount_destroy(&dn->dn_tx_holds);
+ ASSERT(!list_link_active(&dn->dn_link));
+
+ for (i = 0; i < TXG_SIZE; i++) {
+ ASSERT(!multilist_link_active(&dn->dn_dirty_link[i]));
+ ASSERT3P(dn->dn_free_ranges[i], ==, NULL);
+ list_destroy(&dn->dn_dirty_records[i]);
+ ASSERT0(dn->dn_next_nblkptr[i]);
+ ASSERT0(dn->dn_next_nlevels[i]);
+ ASSERT0(dn->dn_next_indblkshift[i]);
+ ASSERT0(dn->dn_next_bonustype[i]);
+ ASSERT0(dn->dn_rm_spillblk[i]);
+ ASSERT0(dn->dn_next_bonuslen[i]);
+ ASSERT0(dn->dn_next_blksz[i]);
+ ASSERT0(dn->dn_next_maxblkid[i]);
+ }
+
+ ASSERT0(dn->dn_allocated_txg);
+ ASSERT0(dn->dn_free_txg);
+ ASSERT0(dn->dn_assigned_txg);
+ ASSERT0(dn->dn_dirty_txg);
+ ASSERT0(dn->dn_dirtyctx);
+ ASSERT3P(dn->dn_dirtyctx_firstset, ==, NULL);
+ ASSERT3P(dn->dn_bonus, ==, NULL);
+ ASSERT(!dn->dn_have_spill);
+ ASSERT3P(dn->dn_zio, ==, NULL);
+ ASSERT0(dn->dn_oldused);
+ ASSERT0(dn->dn_oldflags);
+ ASSERT0(dn->dn_olduid);
+ ASSERT0(dn->dn_oldgid);
+ ASSERT0(dn->dn_oldprojid);
+ ASSERT0(dn->dn_newuid);
+ ASSERT0(dn->dn_newgid);
+ ASSERT0(dn->dn_newprojid);
+ ASSERT0(dn->dn_id_flags);
+
+ ASSERT0(dn->dn_dbufs_count);
+ avl_destroy(&dn->dn_dbufs);
+}
+
+void
+dnode_init(void)
+{
+ ASSERT(dnode_cache == NULL);
+ dnode_cache = kmem_cache_create("dnode_t", sizeof (dnode_t),
+ 0, dnode_cons, dnode_dest, NULL, NULL, NULL, 0);
+ kmem_cache_set_move(dnode_cache, dnode_move);
+
+ dnode_ksp = kstat_create("zfs", 0, "dnodestats", "misc",
+ KSTAT_TYPE_NAMED, sizeof (dnode_stats) / sizeof (kstat_named_t),
+ KSTAT_FLAG_VIRTUAL);
+ if (dnode_ksp != NULL) {
+ dnode_ksp->ks_data = &dnode_stats;
+ kstat_install(dnode_ksp);
+ }
+}
+
+void
+dnode_fini(void)
+{
+ if (dnode_ksp != NULL) {
+ kstat_delete(dnode_ksp);
+ dnode_ksp = NULL;
+ }
+
+ kmem_cache_destroy(dnode_cache);
+ dnode_cache = NULL;
+}
+
+
+#ifdef ZFS_DEBUG
+void
+dnode_verify(dnode_t *dn)
+{
+ int drop_struct_lock = FALSE;
+
+ ASSERT(dn->dn_phys);
+ ASSERT(dn->dn_objset);
+ ASSERT(dn->dn_handle->dnh_dnode == dn);
+
+ ASSERT(DMU_OT_IS_VALID(dn->dn_phys->dn_type));
+
+ if (!(zfs_flags & ZFS_DEBUG_DNODE_VERIFY))
+ return;
+
+ if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ drop_struct_lock = TRUE;
+ }
+ if (dn->dn_phys->dn_type != DMU_OT_NONE || dn->dn_allocated_txg != 0) {
+ int i;
+ int max_bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots);
+ ASSERT3U(dn->dn_indblkshift, <=, SPA_MAXBLOCKSHIFT);
+ if (dn->dn_datablkshift) {
+ ASSERT3U(dn->dn_datablkshift, >=, SPA_MINBLOCKSHIFT);
+ ASSERT3U(dn->dn_datablkshift, <=, SPA_MAXBLOCKSHIFT);
+ ASSERT3U(1<<dn->dn_datablkshift, ==, dn->dn_datablksz);
+ }
+ ASSERT3U(dn->dn_nlevels, <=, 30);
+ ASSERT(DMU_OT_IS_VALID(dn->dn_type));
+ ASSERT3U(dn->dn_nblkptr, >=, 1);
+ ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR);
+ ASSERT3U(dn->dn_bonuslen, <=, max_bonuslen);
+ ASSERT3U(dn->dn_datablksz, ==,
+ dn->dn_datablkszsec << SPA_MINBLOCKSHIFT);
+ ASSERT3U(ISP2(dn->dn_datablksz), ==, dn->dn_datablkshift != 0);
+ ASSERT3U((dn->dn_nblkptr - 1) * sizeof (blkptr_t) +
+ dn->dn_bonuslen, <=, max_bonuslen);
+ for (i = 0; i < TXG_SIZE; i++) {
+ ASSERT3U(dn->dn_next_nlevels[i], <=, dn->dn_nlevels);
+ }
+ }
+ if (dn->dn_phys->dn_type != DMU_OT_NONE)
+ ASSERT3U(dn->dn_phys->dn_nlevels, <=, dn->dn_nlevels);
+ ASSERT(DMU_OBJECT_IS_SPECIAL(dn->dn_object) || dn->dn_dbuf != NULL);
+ if (dn->dn_dbuf != NULL) {
+ ASSERT3P(dn->dn_phys, ==,
+ (dnode_phys_t *)dn->dn_dbuf->db.db_data +
+ (dn->dn_object % (dn->dn_dbuf->db.db_size >> DNODE_SHIFT)));
+ }
+ if (drop_struct_lock)
+ rw_exit(&dn->dn_struct_rwlock);
+}
+#endif
+
+void
+dnode_byteswap(dnode_phys_t *dnp)
+{
+ uint64_t *buf64 = (void*)&dnp->dn_blkptr;
+ int i;
+
+ if (dnp->dn_type == DMU_OT_NONE) {
+ bzero(dnp, sizeof (dnode_phys_t));
+ return;
+ }
+
+ dnp->dn_datablkszsec = BSWAP_16(dnp->dn_datablkszsec);
+ dnp->dn_bonuslen = BSWAP_16(dnp->dn_bonuslen);
+ dnp->dn_extra_slots = BSWAP_8(dnp->dn_extra_slots);
+ dnp->dn_maxblkid = BSWAP_64(dnp->dn_maxblkid);
+ dnp->dn_used = BSWAP_64(dnp->dn_used);
+
+ /*
+ * dn_nblkptr is only one byte, so it's OK to read it in either
+	 * byte order. We can't read dn_bonuslen.
+ */
+ ASSERT(dnp->dn_indblkshift <= SPA_MAXBLOCKSHIFT);
+ ASSERT(dnp->dn_nblkptr <= DN_MAX_NBLKPTR);
+ for (i = 0; i < dnp->dn_nblkptr * sizeof (blkptr_t)/8; i++)
+ buf64[i] = BSWAP_64(buf64[i]);
+
+ /*
+ * OK to check dn_bonuslen for zero, because it won't matter if
+ * we have the wrong byte order. This is necessary because the
+ * dnode dnode is smaller than a regular dnode.
+ */
+ if (dnp->dn_bonuslen != 0) {
+ /*
+ * Note that the bonus length calculated here may be
+ * longer than the actual bonus buffer. This is because
+ * we always put the bonus buffer after the last block
+ * pointer (instead of packing it against the end of the
+ * dnode buffer).
+ */
+ int off = (dnp->dn_nblkptr-1) * sizeof (blkptr_t);
+ int slots = dnp->dn_extra_slots + 1;
+ size_t len = DN_SLOTS_TO_BONUSLEN(slots) - off;
+ dmu_object_byteswap_t byteswap;
+ ASSERT(DMU_OT_IS_VALID(dnp->dn_bonustype));
+ byteswap = DMU_OT_BYTESWAP(dnp->dn_bonustype);
+ dmu_ot_byteswap[byteswap].ob_func(dnp->dn_bonus + off, len);
+ }
+
+ /* Swap SPILL block if we have one */
+ if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)
+ byteswap_uint64_array(DN_SPILL_BLKPTR(dnp), sizeof (blkptr_t));
+}
+
+void
+dnode_buf_byteswap(void *vbuf, size_t size)
+{
+ int i = 0;
+
+ ASSERT3U(sizeof (dnode_phys_t), ==, (1<<DNODE_SHIFT));
+ ASSERT((size & (sizeof (dnode_phys_t)-1)) == 0);
+
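+	/*
+	 * Byteswap the first slot of each dnode, then skip any extra
+	 * slots consumed by a large dnode; dnode_byteswap() already
+	 * handles the extended bonus buffer stored in those slots.
+	 */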
+ while (i < size) {
+ dnode_phys_t *dnp = (void *)(((char *)vbuf) + i);
+ dnode_byteswap(dnp);
+
+ i += DNODE_MIN_SIZE;
+ if (dnp->dn_type != DMU_OT_NONE)
+ i += dnp->dn_extra_slots * DNODE_MIN_SIZE;
+ }
+}
+
+void
+dnode_setbonuslen(dnode_t *dn, int newsize, dmu_tx_t *tx)
+{
+ ASSERT3U(zfs_refcount_count(&dn->dn_holds), >=, 1);
+
+ dnode_setdirty(dn, tx);
+ rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+ ASSERT3U(newsize, <=, DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots) -
+ (dn->dn_nblkptr-1) * sizeof (blkptr_t));
+
+ if (newsize < dn->dn_bonuslen) {
+ /* clear any data after the end of the new size */
+ size_t diff = dn->dn_bonuslen - newsize;
+ char *data_end = ((char *)dn->dn_bonus->db.db_data) + newsize;
+ bzero(data_end, diff);
+ }
+
+ dn->dn_bonuslen = newsize;
+ if (newsize == 0)
+ dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = DN_ZERO_BONUSLEN;
+ else
+ dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = dn->dn_bonuslen;
+ rw_exit(&dn->dn_struct_rwlock);
+}
+
+void
+dnode_setbonus_type(dnode_t *dn, dmu_object_type_t newtype, dmu_tx_t *tx)
+{
+ ASSERT3U(zfs_refcount_count(&dn->dn_holds), >=, 1);
+ dnode_setdirty(dn, tx);
+ rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+ dn->dn_bonustype = newtype;
+ dn->dn_next_bonustype[tx->tx_txg & TXG_MASK] = dn->dn_bonustype;
+ rw_exit(&dn->dn_struct_rwlock);
+}
+
+void
+dnode_rm_spill(dnode_t *dn, dmu_tx_t *tx)
+{
+ ASSERT3U(zfs_refcount_count(&dn->dn_holds), >=, 1);
+ ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
+ dnode_setdirty(dn, tx);
+ dn->dn_rm_spillblk[tx->tx_txg & TXG_MASK] = DN_KILL_SPILLBLK;
+ dn->dn_have_spill = B_FALSE;
+}
+
+static void
+dnode_setdblksz(dnode_t *dn, int size)
+{
+ ASSERT0(P2PHASE(size, SPA_MINBLOCKSIZE));
+ ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
+ ASSERT3U(size, >=, SPA_MINBLOCKSIZE);
+ ASSERT3U(size >> SPA_MINBLOCKSHIFT, <,
+ 1<<(sizeof (dn->dn_phys->dn_datablkszsec) * 8));
+ dn->dn_datablksz = size;
+ dn->dn_datablkszsec = size >> SPA_MINBLOCKSHIFT;
+ dn->dn_datablkshift = ISP2(size) ? highbit64(size - 1) : 0;
+}
+
+static dnode_t *
+dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db,
+ uint64_t object, dnode_handle_t *dnh)
+{
+ dnode_t *dn;
+
+ dn = kmem_cache_alloc(dnode_cache, KM_SLEEP);
+ dn->dn_moved = 0;
+
+ /*
+ * Defer setting dn_objset until the dnode is ready to be a candidate
+ * for the dnode_move() callback.
+ */
+ dn->dn_object = object;
+ dn->dn_dbuf = db;
+ dn->dn_handle = dnh;
+ dn->dn_phys = dnp;
+
+ if (dnp->dn_datablkszsec) {
+ dnode_setdblksz(dn, dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
+ } else {
+ dn->dn_datablksz = 0;
+ dn->dn_datablkszsec = 0;
+ dn->dn_datablkshift = 0;
+ }
+ dn->dn_indblkshift = dnp->dn_indblkshift;
+ dn->dn_nlevels = dnp->dn_nlevels;
+ dn->dn_type = dnp->dn_type;
+ dn->dn_nblkptr = dnp->dn_nblkptr;
+ dn->dn_checksum = dnp->dn_checksum;
+ dn->dn_compress = dnp->dn_compress;
+ dn->dn_bonustype = dnp->dn_bonustype;
+ dn->dn_bonuslen = dnp->dn_bonuslen;
+ dn->dn_num_slots = dnp->dn_extra_slots + 1;
+ dn->dn_maxblkid = dnp->dn_maxblkid;
+ dn->dn_have_spill = ((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) != 0);
+ dn->dn_id_flags = 0;
+
+ dmu_zfetch_init(&dn->dn_zfetch, dn);
+
+ ASSERT(DMU_OT_IS_VALID(dn->dn_phys->dn_type));
+ ASSERT(zrl_is_locked(&dnh->dnh_zrlock));
+ ASSERT(!DN_SLOT_IS_PTR(dnh->dnh_dnode));
+
+ mutex_enter(&os->os_lock);
+
+ /*
+ * Exclude special dnodes from os_dnodes so an empty os_dnodes
+ * signifies that the special dnodes have no references from
+ * their children (the entries in os_dnodes). This allows
+ * dnode_destroy() to easily determine if the last child has
+ * been removed and then complete eviction of the objset.
+ */
+ if (!DMU_OBJECT_IS_SPECIAL(object))
+ list_insert_head(&os->os_dnodes, dn);
+ membar_producer();
+
+ /*
+ * Everything else must be valid before assigning dn_objset
+ * makes the dnode eligible for dnode_move().
+ */
+ dn->dn_objset = os;
+
+ dnh->dnh_dnode = dn;
+ mutex_exit(&os->os_lock);
+
+ arc_space_consume(sizeof (dnode_t), ARC_SPACE_DNODE);
+
+ return (dn);
+}
+
+/*
+ * Caller must be holding the dnode handle, which is released upon return.
+ */
+static void
+dnode_destroy(dnode_t *dn)
+{
+ objset_t *os = dn->dn_objset;
+ boolean_t complete_os_eviction = B_FALSE;
+
+ ASSERT((dn->dn_id_flags & DN_ID_NEW_EXIST) == 0);
+
+ mutex_enter(&os->os_lock);
+ POINTER_INVALIDATE(&dn->dn_objset);
+ if (!DMU_OBJECT_IS_SPECIAL(dn->dn_object)) {
+ list_remove(&os->os_dnodes, dn);
+ complete_os_eviction =
+ list_is_empty(&os->os_dnodes) &&
+ list_link_active(&os->os_evicting_node);
+ }
+ mutex_exit(&os->os_lock);
+
+ /* the dnode can no longer move, so we can release the handle */
+ if (!zrl_is_locked(&dn->dn_handle->dnh_zrlock))
+ zrl_remove(&dn->dn_handle->dnh_zrlock);
+
+ dn->dn_allocated_txg = 0;
+ dn->dn_free_txg = 0;
+ dn->dn_assigned_txg = 0;
+ dn->dn_dirty_txg = 0;
+
+ dn->dn_dirtyctx = 0;
+ dn->dn_dirtyctx_firstset = NULL;
+ if (dn->dn_bonus != NULL) {
+ mutex_enter(&dn->dn_bonus->db_mtx);
+ dbuf_destroy(dn->dn_bonus);
+ dn->dn_bonus = NULL;
+ }
+ dn->dn_zio = NULL;
+
+ dn->dn_have_spill = B_FALSE;
+ dn->dn_oldused = 0;
+ dn->dn_oldflags = 0;
+ dn->dn_olduid = 0;
+ dn->dn_oldgid = 0;
+ dn->dn_oldprojid = ZFS_DEFAULT_PROJID;
+ dn->dn_newuid = 0;
+ dn->dn_newgid = 0;
+ dn->dn_newprojid = ZFS_DEFAULT_PROJID;
+ dn->dn_id_flags = 0;
+
+ dmu_zfetch_fini(&dn->dn_zfetch);
+ kmem_cache_free(dnode_cache, dn);
+ arc_space_return(sizeof (dnode_t), ARC_SPACE_DNODE);
+
+ if (complete_os_eviction)
+ dmu_objset_evict_done(os);
+}
+
+void
+dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
+ dmu_object_type_t bonustype, int bonuslen, int dn_slots, dmu_tx_t *tx)
+{
+ int i;
+
+ ASSERT3U(dn_slots, >, 0);
+ ASSERT3U(dn_slots << DNODE_SHIFT, <=,
+ spa_maxdnodesize(dmu_objset_spa(dn->dn_objset)));
+ ASSERT3U(blocksize, <=,
+ spa_maxblocksize(dmu_objset_spa(dn->dn_objset)));
+ if (blocksize == 0)
+ blocksize = 1 << zfs_default_bs;
+ else
+ blocksize = P2ROUNDUP(blocksize, SPA_MINBLOCKSIZE);
+
+ if (ibs == 0)
+ ibs = zfs_default_ibs;
+
+ ibs = MIN(MAX(ibs, DN_MIN_INDBLKSHIFT), DN_MAX_INDBLKSHIFT);
+
+ dprintf("os=%p obj=%llu txg=%llu blocksize=%d ibs=%d dn_slots=%d\n",
+ dn->dn_objset, dn->dn_object, tx->tx_txg, blocksize, ibs, dn_slots);
+ DNODE_STAT_BUMP(dnode_allocate);
+
+ ASSERT(dn->dn_type == DMU_OT_NONE);
+ ASSERT(bcmp(dn->dn_phys, &dnode_phys_zero, sizeof (dnode_phys_t)) == 0);
+ ASSERT(dn->dn_phys->dn_type == DMU_OT_NONE);
+ ASSERT(ot != DMU_OT_NONE);
+ ASSERT(DMU_OT_IS_VALID(ot));
+ ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) ||
+ (bonustype == DMU_OT_SA && bonuslen == 0) ||
+ (bonustype != DMU_OT_NONE && bonuslen != 0));
+ ASSERT(DMU_OT_IS_VALID(bonustype));
+ ASSERT3U(bonuslen, <=, DN_SLOTS_TO_BONUSLEN(dn_slots));
+ ASSERT(dn->dn_type == DMU_OT_NONE);
+ ASSERT0(dn->dn_maxblkid);
+ ASSERT0(dn->dn_allocated_txg);
+ ASSERT0(dn->dn_assigned_txg);
+ ASSERT(zfs_refcount_is_zero(&dn->dn_tx_holds));
+ ASSERT3U(zfs_refcount_count(&dn->dn_holds), <=, 1);
+ ASSERT(avl_is_empty(&dn->dn_dbufs));
+
+ for (i = 0; i < TXG_SIZE; i++) {
+ ASSERT0(dn->dn_next_nblkptr[i]);
+ ASSERT0(dn->dn_next_nlevels[i]);
+ ASSERT0(dn->dn_next_indblkshift[i]);
+ ASSERT0(dn->dn_next_bonuslen[i]);
+ ASSERT0(dn->dn_next_bonustype[i]);
+ ASSERT0(dn->dn_rm_spillblk[i]);
+ ASSERT0(dn->dn_next_blksz[i]);
+ ASSERT0(dn->dn_next_maxblkid[i]);
+ ASSERT(!multilist_link_active(&dn->dn_dirty_link[i]));
+ ASSERT3P(list_head(&dn->dn_dirty_records[i]), ==, NULL);
+ ASSERT3P(dn->dn_free_ranges[i], ==, NULL);
+ }
+
+ dn->dn_type = ot;
+ dnode_setdblksz(dn, blocksize);
+ dn->dn_indblkshift = ibs;
+ dn->dn_nlevels = 1;
+ dn->dn_num_slots = dn_slots;
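+	/*
+	 * Fit as many block pointers as possible into the space left
+	 * over after the bonus buffer, up to DN_MAX_NBLKPTR. An SA
+	 * bonus keeps a single block pointer so the bonus area is as
+	 * large as possible.
+	 */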
+ if (bonustype == DMU_OT_SA) /* Maximize bonus space for SA */
+ dn->dn_nblkptr = 1;
+ else {
+ dn->dn_nblkptr = MIN(DN_MAX_NBLKPTR,
+ 1 + ((DN_SLOTS_TO_BONUSLEN(dn_slots) - bonuslen) >>
+ SPA_BLKPTRSHIFT));
+ }
+
+ dn->dn_bonustype = bonustype;
+ dn->dn_bonuslen = bonuslen;
+ dn->dn_checksum = ZIO_CHECKSUM_INHERIT;
+ dn->dn_compress = ZIO_COMPRESS_INHERIT;
+ dn->dn_dirtyctx = 0;
+
+ dn->dn_free_txg = 0;
+ dn->dn_dirtyctx_firstset = NULL;
+ dn->dn_dirty_txg = 0;
+
+ dn->dn_allocated_txg = tx->tx_txg;
+ dn->dn_id_flags = 0;
+
+ dnode_setdirty(dn, tx);
+ dn->dn_next_indblkshift[tx->tx_txg & TXG_MASK] = ibs;
+ dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = dn->dn_bonuslen;
+ dn->dn_next_bonustype[tx->tx_txg & TXG_MASK] = dn->dn_bonustype;
+ dn->dn_next_blksz[tx->tx_txg & TXG_MASK] = dn->dn_datablksz;
+}
+
+void
+dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
+ dmu_object_type_t bonustype, int bonuslen, int dn_slots,
+ boolean_t keep_spill, dmu_tx_t *tx)
+{
+ int nblkptr;
+
+ ASSERT3U(blocksize, >=, SPA_MINBLOCKSIZE);
+ ASSERT3U(blocksize, <=,
+ spa_maxblocksize(dmu_objset_spa(dn->dn_objset)));
+ ASSERT0(blocksize % SPA_MINBLOCKSIZE);
+ ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx));
+ ASSERT(tx->tx_txg != 0);
+ ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) ||
+ (bonustype != DMU_OT_NONE && bonuslen != 0) ||
+ (bonustype == DMU_OT_SA && bonuslen == 0));
+ ASSERT(DMU_OT_IS_VALID(bonustype));
+ ASSERT3U(bonuslen, <=,
+ DN_BONUS_SIZE(spa_maxdnodesize(dmu_objset_spa(dn->dn_objset))));
+ ASSERT3U(bonuslen, <=, DN_BONUS_SIZE(dn_slots << DNODE_SHIFT));
+
+ dnode_free_interior_slots(dn);
+ DNODE_STAT_BUMP(dnode_reallocate);
+
+ /* clean up any unreferenced dbufs */
+ dnode_evict_dbufs(dn);
+
+ dn->dn_id_flags = 0;
+
+ rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+ dnode_setdirty(dn, tx);
+ if (dn->dn_datablksz != blocksize) {
+ /* change blocksize */
+ ASSERT0(dn->dn_maxblkid);
+ ASSERT(BP_IS_HOLE(&dn->dn_phys->dn_blkptr[0]) ||
+ dnode_block_freed(dn, 0));
+
+ dnode_setdblksz(dn, blocksize);
+ dn->dn_next_blksz[tx->tx_txg & TXG_MASK] = blocksize;
+ }
+ if (dn->dn_bonuslen != bonuslen)
+ dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = bonuslen;
+
+ if (bonustype == DMU_OT_SA) /* Maximize bonus space for SA */
+ nblkptr = 1;
+ else
+ nblkptr = MIN(DN_MAX_NBLKPTR,
+ 1 + ((DN_SLOTS_TO_BONUSLEN(dn_slots) - bonuslen) >>
+ SPA_BLKPTRSHIFT));
+ if (dn->dn_bonustype != bonustype)
+ dn->dn_next_bonustype[tx->tx_txg & TXG_MASK] = bonustype;
+ if (dn->dn_nblkptr != nblkptr)
+ dn->dn_next_nblkptr[tx->tx_txg & TXG_MASK] = nblkptr;
+ if (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR && !keep_spill) {
+ dbuf_rm_spill(dn, tx);
+ dnode_rm_spill(dn, tx);
+ }
+
+ rw_exit(&dn->dn_struct_rwlock);
+
+ /* change type */
+ dn->dn_type = ot;
+
+ /* change bonus size and type */
+ mutex_enter(&dn->dn_mtx);
+ dn->dn_bonustype = bonustype;
+ dn->dn_bonuslen = bonuslen;
+ dn->dn_num_slots = dn_slots;
+ dn->dn_nblkptr = nblkptr;
+ dn->dn_checksum = ZIO_CHECKSUM_INHERIT;
+ dn->dn_compress = ZIO_COMPRESS_INHERIT;
+ ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR);
+
+ /* fix up the bonus db_size */
+ if (dn->dn_bonus) {
+ dn->dn_bonus->db.db_size =
+ DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots) -
+ (dn->dn_nblkptr-1) * sizeof (blkptr_t);
+ ASSERT(dn->dn_bonuslen <= dn->dn_bonus->db.db_size);
+ }
+
+ dn->dn_allocated_txg = tx->tx_txg;
+ mutex_exit(&dn->dn_mtx);
+}
+
+#ifdef _KERNEL
+static void
+dnode_move_impl(dnode_t *odn, dnode_t *ndn)
+{
+ int i;
+
+ ASSERT(!RW_LOCK_HELD(&odn->dn_struct_rwlock));
+ ASSERT(MUTEX_NOT_HELD(&odn->dn_mtx));
+ ASSERT(MUTEX_NOT_HELD(&odn->dn_dbufs_mtx));
+ ASSERT(!MUTEX_HELD(&odn->dn_zfetch.zf_lock));
+
+ /* Copy fields. */
+ ndn->dn_objset = odn->dn_objset;
+ ndn->dn_object = odn->dn_object;
+ ndn->dn_dbuf = odn->dn_dbuf;
+ ndn->dn_handle = odn->dn_handle;
+ ndn->dn_phys = odn->dn_phys;
+ ndn->dn_type = odn->dn_type;
+ ndn->dn_bonuslen = odn->dn_bonuslen;
+ ndn->dn_bonustype = odn->dn_bonustype;
+ ndn->dn_nblkptr = odn->dn_nblkptr;
+ ndn->dn_checksum = odn->dn_checksum;
+ ndn->dn_compress = odn->dn_compress;
+ ndn->dn_nlevels = odn->dn_nlevels;
+ ndn->dn_indblkshift = odn->dn_indblkshift;
+ ndn->dn_datablkshift = odn->dn_datablkshift;
+ ndn->dn_datablkszsec = odn->dn_datablkszsec;
+ ndn->dn_datablksz = odn->dn_datablksz;
+ ndn->dn_maxblkid = odn->dn_maxblkid;
+ ndn->dn_num_slots = odn->dn_num_slots;
+ bcopy(&odn->dn_next_type[0], &ndn->dn_next_type[0],
+ sizeof (odn->dn_next_type));
+ bcopy(&odn->dn_next_nblkptr[0], &ndn->dn_next_nblkptr[0],
+ sizeof (odn->dn_next_nblkptr));
+ bcopy(&odn->dn_next_nlevels[0], &ndn->dn_next_nlevels[0],
+ sizeof (odn->dn_next_nlevels));
+ bcopy(&odn->dn_next_indblkshift[0], &ndn->dn_next_indblkshift[0],
+ sizeof (odn->dn_next_indblkshift));
+ bcopy(&odn->dn_next_bonustype[0], &ndn->dn_next_bonustype[0],
+ sizeof (odn->dn_next_bonustype));
+ bcopy(&odn->dn_rm_spillblk[0], &ndn->dn_rm_spillblk[0],
+ sizeof (odn->dn_rm_spillblk));
+ bcopy(&odn->dn_next_bonuslen[0], &ndn->dn_next_bonuslen[0],
+ sizeof (odn->dn_next_bonuslen));
+ bcopy(&odn->dn_next_blksz[0], &ndn->dn_next_blksz[0],
+ sizeof (odn->dn_next_blksz));
+ bcopy(&odn->dn_next_maxblkid[0], &ndn->dn_next_maxblkid[0],
+ sizeof (odn->dn_next_maxblkid));
+ for (i = 0; i < TXG_SIZE; i++) {
+ list_move_tail(&ndn->dn_dirty_records[i],
+ &odn->dn_dirty_records[i]);
+ }
+ bcopy(&odn->dn_free_ranges[0], &ndn->dn_free_ranges[0],
+ sizeof (odn->dn_free_ranges));
+ ndn->dn_allocated_txg = odn->dn_allocated_txg;
+ ndn->dn_free_txg = odn->dn_free_txg;
+ ndn->dn_assigned_txg = odn->dn_assigned_txg;
+ ndn->dn_dirty_txg = odn->dn_dirty_txg;
+ ndn->dn_dirtyctx = odn->dn_dirtyctx;
+ ndn->dn_dirtyctx_firstset = odn->dn_dirtyctx_firstset;
+ ASSERT(zfs_refcount_count(&odn->dn_tx_holds) == 0);
+ zfs_refcount_transfer(&ndn->dn_holds, &odn->dn_holds);
+ ASSERT(avl_is_empty(&ndn->dn_dbufs));
+ avl_swap(&ndn->dn_dbufs, &odn->dn_dbufs);
+ ndn->dn_dbufs_count = odn->dn_dbufs_count;
+ ndn->dn_bonus = odn->dn_bonus;
+ ndn->dn_have_spill = odn->dn_have_spill;
+ ndn->dn_zio = odn->dn_zio;
+ ndn->dn_oldused = odn->dn_oldused;
+ ndn->dn_oldflags = odn->dn_oldflags;
+ ndn->dn_olduid = odn->dn_olduid;
+ ndn->dn_oldgid = odn->dn_oldgid;
+ ndn->dn_oldprojid = odn->dn_oldprojid;
+ ndn->dn_newuid = odn->dn_newuid;
+ ndn->dn_newgid = odn->dn_newgid;
+ ndn->dn_newprojid = odn->dn_newprojid;
+ ndn->dn_id_flags = odn->dn_id_flags;
+ dmu_zfetch_init(&ndn->dn_zfetch, NULL);
+ list_move_tail(&ndn->dn_zfetch.zf_stream, &odn->dn_zfetch.zf_stream);
+ ndn->dn_zfetch.zf_dnode = odn->dn_zfetch.zf_dnode;
+
+ /*
+ * Update back pointers. Updating the handle fixes the back pointer of
+ * every descendant dbuf as well as the bonus dbuf.
+ */
+ ASSERT(ndn->dn_handle->dnh_dnode == odn);
+ ndn->dn_handle->dnh_dnode = ndn;
+ if (ndn->dn_zfetch.zf_dnode == odn) {
+ ndn->dn_zfetch.zf_dnode = ndn;
+ }
+
+ /*
+ * Invalidate the original dnode by clearing all of its back pointers.
+ */
+ odn->dn_dbuf = NULL;
+ odn->dn_handle = NULL;
+ avl_create(&odn->dn_dbufs, dbuf_compare, sizeof (dmu_buf_impl_t),
+ offsetof(dmu_buf_impl_t, db_link));
+ odn->dn_dbufs_count = 0;
+ odn->dn_bonus = NULL;
+ dmu_zfetch_fini(&odn->dn_zfetch);
+
+ /*
+ * Set the low bit of the objset pointer to ensure that dnode_move()
+ * recognizes the dnode as invalid in any subsequent callback.
+ */
+ POINTER_INVALIDATE(&odn->dn_objset);
+
+ /*
+ * Satisfy the destructor.
+ */
+ for (i = 0; i < TXG_SIZE; i++) {
+ list_create(&odn->dn_dirty_records[i],
+ sizeof (dbuf_dirty_record_t),
+ offsetof(dbuf_dirty_record_t, dr_dirty_node));
+ odn->dn_free_ranges[i] = NULL;
+ odn->dn_next_nlevels[i] = 0;
+ odn->dn_next_indblkshift[i] = 0;
+ odn->dn_next_bonustype[i] = 0;
+ odn->dn_rm_spillblk[i] = 0;
+ odn->dn_next_bonuslen[i] = 0;
+ odn->dn_next_blksz[i] = 0;
+ }
+ odn->dn_allocated_txg = 0;
+ odn->dn_free_txg = 0;
+ odn->dn_assigned_txg = 0;
+ odn->dn_dirty_txg = 0;
+ odn->dn_dirtyctx = 0;
+ odn->dn_dirtyctx_firstset = NULL;
+ odn->dn_have_spill = B_FALSE;
+ odn->dn_zio = NULL;
+ odn->dn_oldused = 0;
+ odn->dn_oldflags = 0;
+ odn->dn_olduid = 0;
+ odn->dn_oldgid = 0;
+ odn->dn_oldprojid = ZFS_DEFAULT_PROJID;
+ odn->dn_newuid = 0;
+ odn->dn_newgid = 0;
+ odn->dn_newprojid = ZFS_DEFAULT_PROJID;
+ odn->dn_id_flags = 0;
+
+ /*
+ * Mark the dnode.
+ */
+ ndn->dn_moved = 1;
+ odn->dn_moved = (uint8_t)-1;
+}
+
+/*ARGSUSED*/
+static kmem_cbrc_t
+dnode_move(void *buf, void *newbuf, size_t size, void *arg)
+{
+ dnode_t *odn = buf, *ndn = newbuf;
+ objset_t *os;
+ int64_t refcount;
+ uint32_t dbufs;
+
+ /*
+ * The dnode is on the objset's list of known dnodes if the objset
+ * pointer is valid. We set the low bit of the objset pointer when
+ * freeing the dnode to invalidate it, and the memory patterns written
+ * by kmem (baddcafe and deadbeef) set at least one of the two low bits.
+ * A newly created dnode sets the objset pointer last of all to indicate
+ * that the dnode is known and in a valid state to be moved by this
+ * function.
+ */
+ os = odn->dn_objset;
+ if (!POINTER_IS_VALID(os)) {
+ DNODE_STAT_BUMP(dnode_move_invalid);
+ return (KMEM_CBRC_DONT_KNOW);
+ }
+
+ /*
+ * Ensure that the objset does not go away during the move.
+ */
+ rw_enter(&os_lock, RW_WRITER);
+ if (os != odn->dn_objset) {
+ rw_exit(&os_lock);
+ DNODE_STAT_BUMP(dnode_move_recheck1);
+ return (KMEM_CBRC_DONT_KNOW);
+ }
+
+ /*
+ * If the dnode is still valid, then so is the objset. We know that no
+ * valid objset can be freed while we hold os_lock, so we can safely
+ * ensure that the objset remains in use.
+ */
+ mutex_enter(&os->os_lock);
+
+ /*
+ * Recheck the objset pointer in case the dnode was removed just before
+ * acquiring the lock.
+ */
+ if (os != odn->dn_objset) {
+ mutex_exit(&os->os_lock);
+ rw_exit(&os_lock);
+ DNODE_STAT_BUMP(dnode_move_recheck2);
+ return (KMEM_CBRC_DONT_KNOW);
+ }
+
+ /*
+ * At this point we know that as long as we hold os->os_lock, the dnode
+ * cannot be freed and fields within the dnode can be safely accessed.
+ * The objset listing this dnode cannot go away as long as this dnode is
+ * on its list.
+ */
+ rw_exit(&os_lock);
+ if (DMU_OBJECT_IS_SPECIAL(odn->dn_object)) {
+ mutex_exit(&os->os_lock);
+ DNODE_STAT_BUMP(dnode_move_special);
+ return (KMEM_CBRC_NO);
+ }
+ ASSERT(odn->dn_dbuf != NULL); /* only "special" dnodes have no parent */
+
+ /*
+ * Lock the dnode handle to prevent the dnode from obtaining any new
+ * holds. This also prevents the descendant dbufs and the bonus dbuf
+ * from accessing the dnode, so that we can discount their holds. The
+ * handle is safe to access because we know that while the dnode cannot
+ * go away, neither can its handle. Once we hold dnh_zrlock, we can
+ * safely move any dnode referenced only by dbufs.
+ */
+ if (!zrl_tryenter(&odn->dn_handle->dnh_zrlock)) {
+ mutex_exit(&os->os_lock);
+ DNODE_STAT_BUMP(dnode_move_handle);
+ return (KMEM_CBRC_LATER);
+ }
+
+ /*
+ * Ensure a consistent view of the dnode's holds and the dnode's dbufs.
+ * We need to guarantee that there is a hold for every dbuf in order to
+ * determine whether the dnode is actively referenced. Falsely matching
+ * a dbuf to an active hold would lead to an unsafe move. It's possible
+ * that a thread already having an active dnode hold is about to add a
+ * dbuf, and we can't compare hold and dbuf counts while the add is in
+ * progress.
+ */
+ if (!rw_tryenter(&odn->dn_struct_rwlock, RW_WRITER)) {
+ zrl_exit(&odn->dn_handle->dnh_zrlock);
+ mutex_exit(&os->os_lock);
+ DNODE_STAT_BUMP(dnode_move_rwlock);
+ return (KMEM_CBRC_LATER);
+ }
+
+ /*
+ * A dbuf may be removed (evicted) without an active dnode hold. In that
+ * case, the dbuf count is decremented under the handle lock before the
+ * dbuf's hold is released. This order ensures that if we count the hold
+ * after the dbuf is removed but before its hold is released, we will
+ * treat the unmatched hold as active and exit safely. If we count the
+ * hold before the dbuf is removed, the hold is discounted, and the
+ * removal is blocked until the move completes.
+ */
+ refcount = zfs_refcount_count(&odn->dn_holds);
+ ASSERT(refcount >= 0);
+ dbufs = DN_DBUFS_COUNT(odn);
+
+ /* We can't have more dbufs than dnode holds. */
+ ASSERT3U(dbufs, <=, refcount);
+ DTRACE_PROBE3(dnode__move, dnode_t *, odn, int64_t, refcount,
+ uint32_t, dbufs);
+
+ if (refcount > dbufs) {
+ rw_exit(&odn->dn_struct_rwlock);
+ zrl_exit(&odn->dn_handle->dnh_zrlock);
+ mutex_exit(&os->os_lock);
+ DNODE_STAT_BUMP(dnode_move_active);
+ return (KMEM_CBRC_LATER);
+ }
+
+ rw_exit(&odn->dn_struct_rwlock);
+
+ /*
+ * At this point we know that anyone with a hold on the dnode is not
+ * actively referencing it. The dnode is known and in a valid state to
+ * move. We're holding the locks needed to execute the critical section.
+ */
+ dnode_move_impl(odn, ndn);
+
+ list_link_replace(&odn->dn_link, &ndn->dn_link);
+ /* If the dnode was safe to move, the refcount cannot have changed. */
+ ASSERT(refcount == zfs_refcount_count(&ndn->dn_holds));
+ ASSERT(dbufs == DN_DBUFS_COUNT(ndn));
+ zrl_exit(&ndn->dn_handle->dnh_zrlock); /* handle has moved */
+ mutex_exit(&os->os_lock);
+
+ return (KMEM_CBRC_YES);
+}
+#endif /* _KERNEL */
+
+static void
+dnode_slots_hold(dnode_children_t *children, int idx, int slots)
+{
+ ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
+
+ for (int i = idx; i < idx + slots; i++) {
+ dnode_handle_t *dnh = &children->dnc_children[i];
+ zrl_add(&dnh->dnh_zrlock);
+ }
+}
+
+static void
+dnode_slots_rele(dnode_children_t *children, int idx, int slots)
+{
+ ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
+
+ for (int i = idx; i < idx + slots; i++) {
+ dnode_handle_t *dnh = &children->dnc_children[i];
+
+ if (zrl_is_locked(&dnh->dnh_zrlock))
+ zrl_exit(&dnh->dnh_zrlock);
+ else
+ zrl_remove(&dnh->dnh_zrlock);
+ }
+}
+
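+/*
+ * Try to take the zrlock on every handle in [idx, idx + slots). If any
+ * slot is already locked, back out the locks taken so far and return 0;
+ * return 1 once all of the requested slots are held.
+ */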
+static int
+dnode_slots_tryenter(dnode_children_t *children, int idx, int slots)
+{
+ ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
+
+ for (int i = idx; i < idx + slots; i++) {
+ dnode_handle_t *dnh = &children->dnc_children[i];
+
+ if (!zrl_tryenter(&dnh->dnh_zrlock)) {
+ for (int j = idx; j < i; j++) {
+ dnh = &children->dnc_children[j];
+ zrl_exit(&dnh->dnh_zrlock);
+ }
+
+ return (0);
+ }
+ }
+
+ return (1);
+}
+
+static void
+dnode_set_slots(dnode_children_t *children, int idx, int slots, void *ptr)
+{
+ ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
+
+ for (int i = idx; i < idx + slots; i++) {
+ dnode_handle_t *dnh = &children->dnc_children[i];
+ dnh->dnh_dnode = ptr;
+ }
+}
+
+static boolean_t
+dnode_check_slots_free(dnode_children_t *children, int idx, int slots)
+{
+ ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
+
+ /*
+ * If all dnode slots are either already free or
+	 * evictable, return B_TRUE.
+ */
+ for (int i = idx; i < idx + slots; i++) {
+ dnode_handle_t *dnh = &children->dnc_children[i];
+ dnode_t *dn = dnh->dnh_dnode;
+
+ if (dn == DN_SLOT_FREE) {
+ continue;
+ } else if (DN_SLOT_IS_PTR(dn)) {
+ mutex_enter(&dn->dn_mtx);
+ boolean_t can_free = (dn->dn_type == DMU_OT_NONE &&
+ zfs_refcount_is_zero(&dn->dn_holds) &&
+ !DNODE_IS_DIRTY(dn));
+ mutex_exit(&dn->dn_mtx);
+
+ if (!can_free)
+ return (B_FALSE);
+ else
+ continue;
+ } else {
+ return (B_FALSE);
+ }
+ }
+
+ return (B_TRUE);
+}
+
+static void
+dnode_reclaim_slots(dnode_children_t *children, int idx, int slots)
+{
+ ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
+
+ for (int i = idx; i < idx + slots; i++) {
+ dnode_handle_t *dnh = &children->dnc_children[i];
+
+ ASSERT(zrl_is_locked(&dnh->dnh_zrlock));
+
+ if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
+ ASSERT3S(dnh->dnh_dnode->dn_type, ==, DMU_OT_NONE);
+ dnode_destroy(dnh->dnh_dnode);
+ dnh->dnh_dnode = DN_SLOT_FREE;
+ }
+ }
+}
+
+void
+dnode_free_interior_slots(dnode_t *dn)
+{
+ dnode_children_t *children = dmu_buf_get_user(&dn->dn_dbuf->db);
+ int epb = dn->dn_dbuf->db.db_size >> DNODE_SHIFT;
+ int idx = (dn->dn_object & (epb - 1)) + 1;
+ int slots = dn->dn_num_slots - 1;
+
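+	/*
+	 * idx is the slot immediately after this dnode's own slot and
+	 * slots is the number of extra (interior) slots it consumes; a
+	 * single-slot dnode has no interior slots to release.
+	 */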
+ if (slots == 0)
+ return;
+
+ ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
+
+ while (!dnode_slots_tryenter(children, idx, slots)) {
+ DNODE_STAT_BUMP(dnode_free_interior_lock_retry);
+ cond_resched();
+ }
+
+ dnode_set_slots(children, idx, slots, DN_SLOT_FREE);
+ dnode_slots_rele(children, idx, slots);
+}
+
+void
+dnode_special_close(dnode_handle_t *dnh)
+{
+ dnode_t *dn = dnh->dnh_dnode;
+
+ /*
+	 * Ensure dnode_rele_and_unlock() has released dn_mtx after the
+	 * final zfs_refcount_remove().
+ */
+ mutex_enter(&dn->dn_mtx);
+ if (zfs_refcount_count(&dn->dn_holds) > 0)
+ cv_wait(&dn->dn_nodnholds, &dn->dn_mtx);
+ mutex_exit(&dn->dn_mtx);
+ ASSERT3U(zfs_refcount_count(&dn->dn_holds), ==, 0);
+
+ ASSERT(dn->dn_dbuf == NULL ||
+ dmu_buf_get_user(&dn->dn_dbuf->db) == NULL);
+ zrl_add(&dnh->dnh_zrlock);
+ dnode_destroy(dn); /* implicit zrl_remove() */
+ zrl_destroy(&dnh->dnh_zrlock);
+ dnh->dnh_dnode = NULL;
+}
+
+void
+dnode_special_open(objset_t *os, dnode_phys_t *dnp, uint64_t object,
+ dnode_handle_t *dnh)
+{
+ dnode_t *dn;
+
+ zrl_init(&dnh->dnh_zrlock);
+ VERIFY3U(1, ==, zrl_tryenter(&dnh->dnh_zrlock));
+
+ dn = dnode_create(os, dnp, NULL, object, dnh);
+ DNODE_VERIFY(dn);
+
+ zrl_exit(&dnh->dnh_zrlock);
+}
+
+static void
+dnode_buf_evict_async(void *dbu)
+{
+ dnode_children_t *dnc = dbu;
+
+ DNODE_STAT_BUMP(dnode_buf_evict);
+
+ for (int i = 0; i < dnc->dnc_count; i++) {
+ dnode_handle_t *dnh = &dnc->dnc_children[i];
+ dnode_t *dn;
+
+ /*
+ * The dnode handle lock guards against the dnode moving to
+ * another valid address, so there is no need here to guard
+ * against changes to or from NULL.
+ */
+ if (!DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
+ zrl_destroy(&dnh->dnh_zrlock);
+ dnh->dnh_dnode = DN_SLOT_UNINIT;
+ continue;
+ }
+
+ zrl_add(&dnh->dnh_zrlock);
+ dn = dnh->dnh_dnode;
+ /*
+ * If there are holds on this dnode, then there should
+ * be holds on the dnode's containing dbuf as well; thus
+ * it wouldn't be eligible for eviction and this function
+ * would not have been called.
+ */
+ ASSERT(zfs_refcount_is_zero(&dn->dn_holds));
+ ASSERT(zfs_refcount_is_zero(&dn->dn_tx_holds));
+
+ dnode_destroy(dn); /* implicit zrl_remove() for first slot */
+ zrl_destroy(&dnh->dnh_zrlock);
+ dnh->dnh_dnode = DN_SLOT_UNINIT;
+ }
+ kmem_free(dnc, sizeof (dnode_children_t) +
+ dnc->dnc_count * sizeof (dnode_handle_t));
+}
+
+/*
+ * When the DNODE_MUST_BE_FREE flag is set, the "slots" parameter is used
+ * to ensure the hole at the specified object offset is large enough to
+ * hold the dnode being created. The slots parameter is also used to ensure
+ * a dnode does not span multiple dnode blocks. In both of these cases, if
+ * a failure occurs, ENOSPC is returned. Keep in mind, these failure cases
+ * are only possible when using DNODE_MUST_BE_FREE.
+ *
+ * If the DNODE_MUST_BE_ALLOCATED flag is set, "slots" must be 0.
+ * dnode_hold_impl() will check if the requested dnode is already consumed
+ * as an extra dnode slot by a large dnode, in which case it returns
+ * ENOENT.
+ *
+ * If the DNODE_DRY_RUN flag is set, we don't actually hold the dnode, just
+ * return whether the hold would succeed or not. tag and dnp should be set to
+ * NULL in this case.
+ *
+ * errors:
+ * EINVAL - Invalid object number or flags.
+ * ENOSPC - Hole too small to fulfill "slots" request (DNODE_MUST_BE_FREE)
+ * EEXIST - Refers to an allocated dnode (DNODE_MUST_BE_FREE)
+ * - Refers to a freeing dnode (DNODE_MUST_BE_FREE)
+ * - Refers to an interior dnode slot (DNODE_MUST_BE_ALLOCATED)
+ * ENOENT - The requested dnode is not allocated (DNODE_MUST_BE_ALLOCATED)
+ * - The requested dnode is being freed (DNODE_MUST_BE_ALLOCATED)
+ * EIO - I/O error when reading the meta dnode dbuf.
+ *
+ * Note that, unlike dnode_hold(), dnode_hold_impl() can succeed even
+ * for free dnodes (when DNODE_MUST_BE_FREE is set).
+ */
+int
+dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
+ void *tag, dnode_t **dnp)
+{
+ int epb, idx, err;
+ int drop_struct_lock = FALSE;
+ int type;
+ uint64_t blk;
+ dnode_t *mdn, *dn;
+ dmu_buf_impl_t *db;
+ dnode_children_t *dnc;
+ dnode_phys_t *dn_block;
+ dnode_handle_t *dnh;
+
+ ASSERT(!(flag & DNODE_MUST_BE_ALLOCATED) || (slots == 0));
+ ASSERT(!(flag & DNODE_MUST_BE_FREE) || (slots > 0));
+ IMPLY(flag & DNODE_DRY_RUN, (tag == NULL) && (dnp == NULL));
+
+ /*
+ * If you are holding the spa config lock as writer, you shouldn't
+ * be asking the DMU to do *anything* unless it's the root pool
+ * which may require us to read from the root filesystem while
+ * holding some (not all) of the locks as writer.
+ */
+ ASSERT(spa_config_held(os->os_spa, SCL_ALL, RW_WRITER) == 0 ||
+ (spa_is_root(os->os_spa) &&
+ spa_config_held(os->os_spa, SCL_STATE, RW_WRITER)));
+
+ ASSERT((flag & DNODE_MUST_BE_ALLOCATED) || (flag & DNODE_MUST_BE_FREE));
+
+ if (object == DMU_USERUSED_OBJECT || object == DMU_GROUPUSED_OBJECT ||
+ object == DMU_PROJECTUSED_OBJECT) {
+ if (object == DMU_USERUSED_OBJECT)
+ dn = DMU_USERUSED_DNODE(os);
+ else if (object == DMU_GROUPUSED_OBJECT)
+ dn = DMU_GROUPUSED_DNODE(os);
+ else
+ dn = DMU_PROJECTUSED_DNODE(os);
+ if (dn == NULL)
+ return (SET_ERROR(ENOENT));
+ type = dn->dn_type;
+ if ((flag & DNODE_MUST_BE_ALLOCATED) && type == DMU_OT_NONE)
+ return (SET_ERROR(ENOENT));
+ if ((flag & DNODE_MUST_BE_FREE) && type != DMU_OT_NONE)
+ return (SET_ERROR(EEXIST));
+ DNODE_VERIFY(dn);
+ /* Don't actually hold if dry run, just return 0 */
+ if (!(flag & DNODE_DRY_RUN)) {
+ (void) zfs_refcount_add(&dn->dn_holds, tag);
+ *dnp = dn;
+ }
+ return (0);
+ }
+
+ if (object == 0 || object >= DN_MAX_OBJECT)
+ return (SET_ERROR(EINVAL));
+
+ mdn = DMU_META_DNODE(os);
+ ASSERT(mdn->dn_object == DMU_META_DNODE_OBJECT);
+
+ DNODE_VERIFY(mdn);
+
+ if (!RW_WRITE_HELD(&mdn->dn_struct_rwlock)) {
+ rw_enter(&mdn->dn_struct_rwlock, RW_READER);
+ drop_struct_lock = TRUE;
+ }
+
+ blk = dbuf_whichblock(mdn, 0, object * sizeof (dnode_phys_t));
+ db = dbuf_hold(mdn, blk, FTAG);
+ if (drop_struct_lock)
+ rw_exit(&mdn->dn_struct_rwlock);
+ if (db == NULL) {
+ DNODE_STAT_BUMP(dnode_hold_dbuf_hold);
+ return (SET_ERROR(EIO));
+ }
+
+ /*
+ * We do not need to decrypt to read the dnode so it doesn't matter
+ * if we get the encrypted or decrypted version.
+ */
+ err = dbuf_read(db, NULL, DB_RF_CANFAIL |
+ DB_RF_NO_DECRYPT | DB_RF_NOPREFETCH);
+ if (err) {
+ DNODE_STAT_BUMP(dnode_hold_dbuf_read);
+ dbuf_rele(db, FTAG);
+ return (err);
+ }
+
+ ASSERT3U(db->db.db_size, >=, 1<<DNODE_SHIFT);
+ epb = db->db.db_size >> DNODE_SHIFT;
+
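+	/*
+	 * epb is the number of dnode slots per dnode block; idx is this
+	 * object's slot within that block.
+	 */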
+ idx = object & (epb - 1);
+ dn_block = (dnode_phys_t *)db->db.db_data;
+
+ ASSERT(DB_DNODE(db)->dn_type == DMU_OT_DNODE);
+ dnc = dmu_buf_get_user(&db->db);
+ dnh = NULL;
+ if (dnc == NULL) {
+ dnode_children_t *winner;
+ int skip = 0;
+
+ dnc = kmem_zalloc(sizeof (dnode_children_t) +
+ epb * sizeof (dnode_handle_t), KM_SLEEP);
+ dnc->dnc_count = epb;
+ dnh = &dnc->dnc_children[0];
+
+ /* Initialize dnode slot status from dnode_phys_t */
+ for (int i = 0; i < epb; i++) {
+ zrl_init(&dnh[i].dnh_zrlock);
+
+ if (skip) {
+ skip--;
+ continue;
+ }
+
+ if (dn_block[i].dn_type != DMU_OT_NONE) {
+ int interior = dn_block[i].dn_extra_slots;
+
+ dnode_set_slots(dnc, i, 1, DN_SLOT_ALLOCATED);
+ dnode_set_slots(dnc, i + 1, interior,
+ DN_SLOT_INTERIOR);
+ skip = interior;
+ } else {
+ dnh[i].dnh_dnode = DN_SLOT_FREE;
+ skip = 0;
+ }
+ }
+
+ dmu_buf_init_user(&dnc->dnc_dbu, NULL,
+ dnode_buf_evict_async, NULL);
+ winner = dmu_buf_set_user(&db->db, &dnc->dnc_dbu);
+ if (winner != NULL) {
+
+ for (int i = 0; i < epb; i++)
+ zrl_destroy(&dnh[i].dnh_zrlock);
+
+ kmem_free(dnc, sizeof (dnode_children_t) +
+ epb * sizeof (dnode_handle_t));
+ dnc = winner;
+ }
+ }
+
+ ASSERT(dnc->dnc_count == epb);
+
+ if (flag & DNODE_MUST_BE_ALLOCATED) {
+ slots = 1;
+
+ dnode_slots_hold(dnc, idx, slots);
+ dnh = &dnc->dnc_children[idx];
+
+ if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
+ dn = dnh->dnh_dnode;
+ } else if (dnh->dnh_dnode == DN_SLOT_INTERIOR) {
+ DNODE_STAT_BUMP(dnode_hold_alloc_interior);
+ dnode_slots_rele(dnc, idx, slots);
+ dbuf_rele(db, FTAG);
+ return (SET_ERROR(EEXIST));
+ } else if (dnh->dnh_dnode != DN_SLOT_ALLOCATED) {
+ DNODE_STAT_BUMP(dnode_hold_alloc_misses);
+ dnode_slots_rele(dnc, idx, slots);
+ dbuf_rele(db, FTAG);
+ return (SET_ERROR(ENOENT));
+ } else {
+ dnode_slots_rele(dnc, idx, slots);
+ while (!dnode_slots_tryenter(dnc, idx, slots)) {
+ DNODE_STAT_BUMP(dnode_hold_alloc_lock_retry);
+ cond_resched();
+ }
+
+ /*
+ * Someone else won the race and called dnode_create()
+ * after we checked DN_SLOT_IS_PTR() above but before
+ * we acquired the lock.
+ */
+ if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
+ DNODE_STAT_BUMP(dnode_hold_alloc_lock_misses);
+ dn = dnh->dnh_dnode;
+ } else {
+ dn = dnode_create(os, dn_block + idx, db,
+ object, dnh);
+ }
+ }
+
+ mutex_enter(&dn->dn_mtx);
+ if (dn->dn_type == DMU_OT_NONE || dn->dn_free_txg != 0) {
+ DNODE_STAT_BUMP(dnode_hold_alloc_type_none);
+ mutex_exit(&dn->dn_mtx);
+ dnode_slots_rele(dnc, idx, slots);
+ dbuf_rele(db, FTAG);
+ return (SET_ERROR(ENOENT));
+ }
+
+ /* Don't actually hold if dry run, just return 0 */
+ if (flag & DNODE_DRY_RUN) {
+ mutex_exit(&dn->dn_mtx);
+ dnode_slots_rele(dnc, idx, slots);
+ dbuf_rele(db, FTAG);
+ return (0);
+ }
+
+ DNODE_STAT_BUMP(dnode_hold_alloc_hits);
+ } else if (flag & DNODE_MUST_BE_FREE) {
+
+ if (idx + slots - 1 >= DNODES_PER_BLOCK) {
+ DNODE_STAT_BUMP(dnode_hold_free_overflow);
+ dbuf_rele(db, FTAG);
+ return (SET_ERROR(ENOSPC));
+ }
+
+ dnode_slots_hold(dnc, idx, slots);
+
+ if (!dnode_check_slots_free(dnc, idx, slots)) {
+ DNODE_STAT_BUMP(dnode_hold_free_misses);
+ dnode_slots_rele(dnc, idx, slots);
+ dbuf_rele(db, FTAG);
+ return (SET_ERROR(ENOSPC));
+ }
+
+ dnode_slots_rele(dnc, idx, slots);
+ while (!dnode_slots_tryenter(dnc, idx, slots)) {
+ DNODE_STAT_BUMP(dnode_hold_free_lock_retry);
+ cond_resched();
+ }
+
+ if (!dnode_check_slots_free(dnc, idx, slots)) {
+ DNODE_STAT_BUMP(dnode_hold_free_lock_misses);
+ dnode_slots_rele(dnc, idx, slots);
+ dbuf_rele(db, FTAG);
+ return (SET_ERROR(ENOSPC));
+ }
+
+ /*
+ * Allocated but otherwise free dnodes which would
+		 * be in the interior of a multi-slot dnode need
+ * to be freed. Single slot dnodes can be safely
+ * re-purposed as a performance optimization.
+ */
+ if (slots > 1)
+ dnode_reclaim_slots(dnc, idx + 1, slots - 1);
+
+ dnh = &dnc->dnc_children[idx];
+ if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
+ dn = dnh->dnh_dnode;
+ } else {
+ dn = dnode_create(os, dn_block + idx, db,
+ object, dnh);
+ }
+
+ mutex_enter(&dn->dn_mtx);
+ if (!zfs_refcount_is_zero(&dn->dn_holds) || dn->dn_free_txg) {
+ DNODE_STAT_BUMP(dnode_hold_free_refcount);
+ mutex_exit(&dn->dn_mtx);
+ dnode_slots_rele(dnc, idx, slots);
+ dbuf_rele(db, FTAG);
+ return (SET_ERROR(EEXIST));
+ }
+
+ /* Don't actually hold if dry run, just return 0 */
+ if (flag & DNODE_DRY_RUN) {
+ mutex_exit(&dn->dn_mtx);
+ dnode_slots_rele(dnc, idx, slots);
+ dbuf_rele(db, FTAG);
+ return (0);
+ }
+
+ dnode_set_slots(dnc, idx + 1, slots - 1, DN_SLOT_INTERIOR);
+ DNODE_STAT_BUMP(dnode_hold_free_hits);
+ } else {
+ dbuf_rele(db, FTAG);
+ return (SET_ERROR(EINVAL));
+ }
+
+ ASSERT0(dn->dn_free_txg);
+
+ if (zfs_refcount_add(&dn->dn_holds, tag) == 1)
+ dbuf_add_ref(db, dnh);
+
+ mutex_exit(&dn->dn_mtx);
+
+ /* Now we can rely on the hold to prevent the dnode from moving. */
+ dnode_slots_rele(dnc, idx, slots);
+
+ DNODE_VERIFY(dn);
+ ASSERT3P(dnp, !=, NULL);
+ ASSERT3P(dn->dn_dbuf, ==, db);
+ ASSERT3U(dn->dn_object, ==, object);
+ dbuf_rele(db, FTAG);
+
+ *dnp = dn;
+ return (0);
+}
+
+/*
+ * Return held dnode if the object is allocated, NULL if not.
+ */
+int
+dnode_hold(objset_t *os, uint64_t object, void *tag, dnode_t **dnp)
+{
+ return (dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0, tag,
+ dnp));
+}
+
+/*
+ * Can only add a reference if there is already at least one
+ * reference on the dnode. Returns FALSE if unable to add a
+ * new reference.
+ */
+boolean_t
+dnode_add_ref(dnode_t *dn, void *tag)
+{
+ mutex_enter(&dn->dn_mtx);
+ if (zfs_refcount_is_zero(&dn->dn_holds)) {
+ mutex_exit(&dn->dn_mtx);
+ return (FALSE);
+ }
+ VERIFY(1 < zfs_refcount_add(&dn->dn_holds, tag));
+ mutex_exit(&dn->dn_mtx);
+ return (TRUE);
+}
+
+void
+dnode_rele(dnode_t *dn, void *tag)
+{
+ mutex_enter(&dn->dn_mtx);
+ dnode_rele_and_unlock(dn, tag, B_FALSE);
+}
+
+void
+dnode_rele_and_unlock(dnode_t *dn, void *tag, boolean_t evicting)
+{
+ uint64_t refs;
+	/* Get these while the hold still prevents the dnode from moving. */
+ dmu_buf_impl_t *db = dn->dn_dbuf;
+ dnode_handle_t *dnh = dn->dn_handle;
+
+ refs = zfs_refcount_remove(&dn->dn_holds, tag);
+ if (refs == 0)
+ cv_broadcast(&dn->dn_nodnholds);
+ mutex_exit(&dn->dn_mtx);
+ /* dnode could get destroyed at this point, so don't use it anymore */
+
+ /*
+ * It's unsafe to release the last hold on a dnode by dnode_rele() or
+ * indirectly by dbuf_rele() while relying on the dnode handle to
+ * prevent the dnode from moving, since releasing the last hold could
+ * result in the dnode's parent dbuf evicting its dnode handles. For
+ * that reason anyone calling dnode_rele() or dbuf_rele() without some
+ * other direct or indirect hold on the dnode must first drop the dnode
+ * handle.
+ */
+ ASSERT(refs > 0 || dnh->dnh_zrlock.zr_owner != curthread);
+
+ /* NOTE: the DNODE_DNODE does not have a dn_dbuf */
+ if (refs == 0 && db != NULL) {
+ /*
+ * Another thread could add a hold to the dnode handle in
+ * dnode_hold_impl() while holding the parent dbuf. Since the
+ * hold on the parent dbuf prevents the handle from being
+ * destroyed, the hold on the handle is OK. We can't yet assert
+ * that the handle has zero references, but that will be
+ * asserted anyway when the handle gets destroyed.
+ */
+ mutex_enter(&db->db_mtx);
+ dbuf_rele_and_unlock(db, dnh, evicting);
+ }
+}
+
+/*
+ * Test whether we can create a dnode at the specified location.
+ */
+int
+dnode_try_claim(objset_t *os, uint64_t object, int slots)
+{
+ return (dnode_hold_impl(os, object, DNODE_MUST_BE_FREE | DNODE_DRY_RUN,
+ slots, NULL, NULL));
+}
+
+void
+dnode_setdirty(dnode_t *dn, dmu_tx_t *tx)
+{
+ objset_t *os = dn->dn_objset;
+ uint64_t txg = tx->tx_txg;
+
+ if (DMU_OBJECT_IS_SPECIAL(dn->dn_object)) {
+ dsl_dataset_dirty(os->os_dsl_dataset, tx);
+ return;
+ }
+
+ DNODE_VERIFY(dn);
+
+#ifdef ZFS_DEBUG
+ mutex_enter(&dn->dn_mtx);
+ ASSERT(dn->dn_phys->dn_type || dn->dn_allocated_txg);
+ ASSERT(dn->dn_free_txg == 0 || dn->dn_free_txg >= txg);
+ mutex_exit(&dn->dn_mtx);
+#endif
+
+ /*
+ * Determine old uid/gid when necessary
+ */
+ dmu_objset_userquota_get_ids(dn, B_TRUE, tx);
+
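+	/*
+	 * Each txg has its own multilist of dirty dnodes; lock the
+	 * sublist this dnode hashes to before checking for and
+	 * inserting it.
+	 */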
+ multilist_t *dirtylist = os->os_dirty_dnodes[txg & TXG_MASK];
+ multilist_sublist_t *mls = multilist_sublist_lock_obj(dirtylist, dn);
+
+ /*
+ * If we are already marked dirty, we're done.
+ */
+ if (multilist_link_active(&dn->dn_dirty_link[txg & TXG_MASK])) {
+ multilist_sublist_unlock(mls);
+ return;
+ }
+
+ ASSERT(!zfs_refcount_is_zero(&dn->dn_holds) ||
+ !avl_is_empty(&dn->dn_dbufs));
+ ASSERT(dn->dn_datablksz != 0);
+ ASSERT0(dn->dn_next_bonuslen[txg & TXG_MASK]);
+ ASSERT0(dn->dn_next_blksz[txg & TXG_MASK]);
+ ASSERT0(dn->dn_next_bonustype[txg & TXG_MASK]);
+
+ dprintf_ds(os->os_dsl_dataset, "obj=%llu txg=%llu\n",
+ dn->dn_object, txg);
+
+ multilist_sublist_insert_head(mls, dn);
+
+ multilist_sublist_unlock(mls);
+
+ /*
+ * The dnode maintains a hold on its containing dbuf as
+ * long as there are holds on it. Each instantiated child
+ * dbuf maintains a hold on the dnode. When the last child
+ * drops its hold, the dnode will drop its hold on the
+ * containing dbuf. We add a "dirty hold" here so that the
+ * dnode will hang around after we finish processing its
+ * children.
+ */
+ VERIFY(dnode_add_ref(dn, (void *)(uintptr_t)tx->tx_txg));
+
+ (void) dbuf_dirty(dn->dn_dbuf, tx);
+
+ dsl_dataset_dirty(os->os_dsl_dataset, tx);
+}
+
+void
+dnode_free(dnode_t *dn, dmu_tx_t *tx)
+{
+ mutex_enter(&dn->dn_mtx);
+ if (dn->dn_type == DMU_OT_NONE || dn->dn_free_txg) {
+ mutex_exit(&dn->dn_mtx);
+ return;
+ }
+ dn->dn_free_txg = tx->tx_txg;
+ mutex_exit(&dn->dn_mtx);
+
+ dnode_setdirty(dn, tx);
+}
+
+/*
+ * Try to change the block size for the indicated dnode. This can only
+ * succeed if there are no blocks allocated or dirty beyond the first block.
+ */
+int
+dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx)
+{
+ dmu_buf_impl_t *db;
+ int err;
+
+ ASSERT3U(size, <=, spa_maxblocksize(dmu_objset_spa(dn->dn_objset)));
+ if (size == 0)
+ size = SPA_MINBLOCKSIZE;
+ else
+ size = P2ROUNDUP(size, SPA_MINBLOCKSIZE);
+
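+	/*
+	 * A requested ibs equal to the current indirect block shift is
+	 * a no-op; ibs == 0 below means leave dn_indblkshift unchanged.
+	 */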
+ if (ibs == dn->dn_indblkshift)
+ ibs = 0;
+
+ if (size >> SPA_MINBLOCKSHIFT == dn->dn_datablkszsec && ibs == 0)
+ return (0);
+
+ rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+
+ /* Check for any allocated blocks beyond the first */
+ if (dn->dn_maxblkid != 0)
+ goto fail;
+
+ mutex_enter(&dn->dn_dbufs_mtx);
+ for (db = avl_first(&dn->dn_dbufs); db != NULL;
+ db = AVL_NEXT(&dn->dn_dbufs, db)) {
+ if (db->db_blkid != 0 && db->db_blkid != DMU_BONUS_BLKID &&
+ db->db_blkid != DMU_SPILL_BLKID) {
+ mutex_exit(&dn->dn_dbufs_mtx);
+ goto fail;
+ }
+ }
+ mutex_exit(&dn->dn_dbufs_mtx);
+
+ if (ibs && dn->dn_nlevels != 1)
+ goto fail;
+
+ /* resize the old block */
+ err = dbuf_hold_impl(dn, 0, 0, TRUE, FALSE, FTAG, &db);
+ if (err == 0) {
+ dbuf_new_size(db, size, tx);
+ } else if (err != ENOENT) {
+ goto fail;
+ }
+
+ dnode_setdblksz(dn, size);
+ dnode_setdirty(dn, tx);
+ dn->dn_next_blksz[tx->tx_txg&TXG_MASK] = size;
+ if (ibs) {
+ dn->dn_indblkshift = ibs;
+ dn->dn_next_indblkshift[tx->tx_txg&TXG_MASK] = ibs;
+ }
+ /* release after we have fixed the blocksize in the dnode */
+ if (db)
+ dbuf_rele(db, FTAG);
+
+ rw_exit(&dn->dn_struct_rwlock);
+ return (0);
+
+fail:
+ rw_exit(&dn->dn_struct_rwlock);
+ return (SET_ERROR(ENOTSUP));
+}
+
+static void
+dnode_set_nlevels_impl(dnode_t *dn, int new_nlevels, dmu_tx_t *tx)
+{
+ uint64_t txgoff = tx->tx_txg & TXG_MASK;
+ int old_nlevels = dn->dn_nlevels;
+ dmu_buf_impl_t *db;
+ list_t *list;
+ dbuf_dirty_record_t *new, *dr, *dr_next;
+
+ ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
+
+ ASSERT3U(new_nlevels, >, dn->dn_nlevels);
+ dn->dn_nlevels = new_nlevels;
+
+ ASSERT3U(new_nlevels, >, dn->dn_next_nlevels[txgoff]);
+ dn->dn_next_nlevels[txgoff] = new_nlevels;
+
+ /* dirty the left indirects */
+ db = dbuf_hold_level(dn, old_nlevels, 0, FTAG);
+ ASSERT(db != NULL);
+ new = dbuf_dirty(db, tx);
+ dbuf_rele(db, FTAG);
+
+ /* transfer the dirty records to the new indirect */
+ mutex_enter(&dn->dn_mtx);
+ mutex_enter(&new->dt.di.dr_mtx);
+ list = &dn->dn_dirty_records[txgoff];
+ for (dr = list_head(list); dr; dr = dr_next) {
+ dr_next = list_next(&dn->dn_dirty_records[txgoff], dr);
+
+ IMPLY(dr->dr_dbuf == NULL, old_nlevels == 1);
+ if (dr->dr_dbuf == NULL ||
+ (dr->dr_dbuf->db_level == old_nlevels - 1 &&
+ dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID &&
+ dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID)) {
+ list_remove(&dn->dn_dirty_records[txgoff], dr);
+ list_insert_tail(&new->dt.di.dr_children, dr);
+ dr->dr_parent = new;
+ }
+ }
+ mutex_exit(&new->dt.di.dr_mtx);
+ mutex_exit(&dn->dn_mtx);
+}
+
+int
+dnode_set_nlevels(dnode_t *dn, int nlevels, dmu_tx_t *tx)
+{
+ int ret = 0;
+
+ rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+
+ if (dn->dn_nlevels == nlevels) {
+ ret = 0;
+ goto out;
+ } else if (nlevels < dn->dn_nlevels) {
+ ret = SET_ERROR(EINVAL);
+ goto out;
+ }
+
+ dnode_set_nlevels_impl(dn, nlevels, tx);
+
+out:
+ rw_exit(&dn->dn_struct_rwlock);
+ return (ret);
+}
+
+/* read-holding callers must not rely on the lock being continuously held */
+void
+dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t have_read,
+ boolean_t force)
+{
+ int epbs, new_nlevels;
+ uint64_t sz;
+
+ ASSERT(blkid != DMU_BONUS_BLKID);
+
+ ASSERT(have_read ?
+ RW_READ_HELD(&dn->dn_struct_rwlock) :
+ RW_WRITE_HELD(&dn->dn_struct_rwlock));
+
+ /*
+ * if we have a read-lock, check to see if we need to do any work
+ * before upgrading to a write-lock.
+ */
+ if (have_read) {
+ if (blkid <= dn->dn_maxblkid)
+ return;
+
+ if (!rw_tryupgrade(&dn->dn_struct_rwlock)) {
+ rw_exit(&dn->dn_struct_rwlock);
+ rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+ }
+ }
+
+ /*
+ * Raw sends (indicated by the force flag) require that we take the
+ * given blkid even if the value is lower than the current value.
+ */
+ if (!force && blkid <= dn->dn_maxblkid)
+ goto out;
+
+ /*
+ * We use the (otherwise unused) top bit of dn_next_maxblkid[txgoff]
+ * to indicate that this field is set. This allows us to set the
+ * maxblkid to 0 on an existing object in dnode_sync().
+ */
+ dn->dn_maxblkid = blkid;
+ dn->dn_next_maxblkid[tx->tx_txg & TXG_MASK] =
+ blkid | DMU_NEXT_MAXBLKID_SET;
+
+ /*
+ * Compute the number of levels necessary to support the new maxblkid.
+ * Raw sends will ensure nlevels is set correctly for us.
+ */
+ new_nlevels = 1;
+ epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
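+	/*
+	 * epbs is log2 of the number of block pointers per indirect
+	 * block; each additional level multiplies the addressable
+	 * range by 2^epbs.
+	 */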
+ for (sz = dn->dn_nblkptr;
+ sz <= blkid && sz >= dn->dn_nblkptr; sz <<= epbs)
+ new_nlevels++;
+
+ ASSERT3U(new_nlevels, <=, DN_MAX_LEVELS);
+
+ if (!force) {
+ if (new_nlevels > dn->dn_nlevels)
+ dnode_set_nlevels_impl(dn, new_nlevels, tx);
+ } else {
+ ASSERT3U(dn->dn_nlevels, >=, new_nlevels);
+ }
+
+out:
+ if (have_read)
+ rw_downgrade(&dn->dn_struct_rwlock);
+}
+
+static void
+dnode_dirty_l1(dnode_t *dn, uint64_t l1blkid, dmu_tx_t *tx)
+{
+ dmu_buf_impl_t *db = dbuf_hold_level(dn, 1, l1blkid, FTAG);
+ if (db != NULL) {
+ dmu_buf_will_dirty(&db->db, tx);
+ dbuf_rele(db, FTAG);
+ }
+}
+
+/*
+ * Dirty all the in-core level-1 dbufs in the range specified by start_blkid
+ * and end_blkid.
+ */
+static void
+dnode_dirty_l1range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid,
+ dmu_tx_t *tx)
+{
+ dmu_buf_impl_t *db_search;
+ dmu_buf_impl_t *db;
+ avl_index_t where;
+
+ db_search = kmem_zalloc(sizeof (dmu_buf_impl_t), KM_SLEEP);
+
+ mutex_enter(&dn->dn_dbufs_mtx);
+
+ db_search->db_level = 1;
+ db_search->db_blkid = start_blkid + 1;
+ db_search->db_state = DB_SEARCH;
+ for (;;) {
+
+ db = avl_find(&dn->dn_dbufs, db_search, &where);
+ if (db == NULL)
+ db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);
+
+ if (db == NULL || db->db_level != 1 ||
+ db->db_blkid >= end_blkid) {
+ break;
+ }
+
+ /*
+		 * Set up the next blkid we want to search for.
+ */
+ db_search->db_blkid = db->db_blkid + 1;
+ ASSERT3U(db->db_blkid, >=, start_blkid);
+
+ /*
+ * If the dbuf transitions to DB_EVICTING while we're trying
+ * to dirty it, then we will be unable to discover it in
+ * the dbuf hash table. This will result in a call to
+ * dbuf_create() which needs to acquire the dn_dbufs_mtx
+ * lock. To avoid a deadlock, we drop the lock before
+ * dirtying the level-1 dbuf.
+ */
+ mutex_exit(&dn->dn_dbufs_mtx);
+ dnode_dirty_l1(dn, db->db_blkid, tx);
+ mutex_enter(&dn->dn_dbufs_mtx);
+ }
+
+#ifdef ZFS_DEBUG
+ /*
+ * Walk all the in-core level-1 dbufs and verify they have been dirtied.
+ */
+ db_search->db_level = 1;
+ db_search->db_blkid = start_blkid + 1;
+ db_search->db_state = DB_SEARCH;
+ db = avl_find(&dn->dn_dbufs, db_search, &where);
+ if (db == NULL)
+ db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);
+ for (; db != NULL; db = AVL_NEXT(&dn->dn_dbufs, db)) {
+ if (db->db_level != 1 || db->db_blkid >= end_blkid)
+ break;
+ if (db->db_state != DB_EVICTING)
+ ASSERT(db->db_dirtycnt > 0);
+ }
+#endif
+ kmem_free(db_search, sizeof (dmu_buf_impl_t));
+ mutex_exit(&dn->dn_dbufs_mtx);
+}
+
+void
+dnode_set_dirtyctx(dnode_t *dn, dmu_tx_t *tx, void *tag)
+{
+ /*
+ * Don't set dirtyctx to SYNC if we're just modifying this as we
+ * initialize the objset.
+ */
+ if (dn->dn_dirtyctx == DN_UNDIRTIED) {
+ dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
+
+ if (ds != NULL) {
+ rrw_enter(&ds->ds_bp_rwlock, RW_READER, tag);
+ }
+ if (!BP_IS_HOLE(dn->dn_objset->os_rootbp)) {
+ if (dmu_tx_is_syncing(tx))
+ dn->dn_dirtyctx = DN_DIRTY_SYNC;
+ else
+ dn->dn_dirtyctx = DN_DIRTY_OPEN;
+ dn->dn_dirtyctx_firstset = tag;
+ }
+ if (ds != NULL) {
+ rrw_exit(&ds->ds_bp_rwlock, tag);
+ }
+ }
+}
+
+void
+dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
+{
+ dmu_buf_impl_t *db;
+ uint64_t blkoff, blkid, nblks;
+ int blksz, blkshift, head, tail;
+ int trunc = FALSE;
+ int epbs;
+
+ blksz = dn->dn_datablksz;
+ blkshift = dn->dn_datablkshift;
+ epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+
+ if (len == DMU_OBJECT_END) {
+ len = UINT64_MAX - off;
+ trunc = TRUE;
+ }
+
+ /*
+ * First, block align the region to free:
+ */
+ if (ISP2(blksz)) {
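+		/*
+		 * head is the number of bytes from off up to the next
+		 * block boundary; blkoff is the offset of off within
+		 * its block.
+		 */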
+ head = P2NPHASE(off, blksz);
+ blkoff = P2PHASE(off, blksz);
+ if ((off >> blkshift) > dn->dn_maxblkid)
+ return;
+ } else {
+ ASSERT(dn->dn_maxblkid == 0);
+ if (off == 0 && len >= blksz) {
+ /*
+ * Freeing the whole block; fast-track this request.
+ */
+ blkid = 0;
+ nblks = 1;
+ if (dn->dn_nlevels > 1) {
+ rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+ dnode_dirty_l1(dn, 0, tx);
+ rw_exit(&dn->dn_struct_rwlock);
+ }
+ goto done;
+ } else if (off >= blksz) {
+ /* Freeing past end-of-data */
+ return;
+ } else {
+ /* Freeing part of the block. */
+ head = blksz - off;
+ ASSERT3U(head, >, 0);
+ }
+ blkoff = off;
+ }
+ /* zero out any partial block data at the start of the range */
+ if (head) {
+ int res;
+ ASSERT3U(blkoff + head, ==, blksz);
+ if (len < head)
+ head = len;
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ res = dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, 0, off),
+ TRUE, FALSE, FTAG, &db);
+ rw_exit(&dn->dn_struct_rwlock);
+ if (res == 0) {
+ caddr_t data;
+ boolean_t dirty;
+
+ db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER,
+ FTAG);
+ /* don't dirty if it isn't on disk and isn't dirty */
+ dirty = !list_is_empty(&db->db_dirty_records) ||
+ (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr));
+ dmu_buf_unlock_parent(db, dblt, FTAG);
+ if (dirty) {
+ dmu_buf_will_dirty(&db->db, tx);
+ data = db->db.db_data;
+ bzero(data + blkoff, head);
+ }
+ dbuf_rele(db, FTAG);
+ }
+ off += head;
+ len -= head;
+ }
+
+ /* If the range was less than one block, we're done */
+ if (len == 0)
+ return;
+
+ /* If the remaining range is past end of file, we're done */
+ if ((off >> blkshift) > dn->dn_maxblkid)
+ return;
+
+ ASSERT(ISP2(blksz));
+ if (trunc)
+ tail = 0;
+ else
+ tail = P2PHASE(len, blksz);
+
+ ASSERT0(P2PHASE(off, blksz));
+ /* zero out any partial block data at the end of the range */
+ if (tail) {
+ int res;
+ if (len < tail)
+ tail = len;
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ res = dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, 0, off+len),
+ TRUE, FALSE, FTAG, &db);
+ rw_exit(&dn->dn_struct_rwlock);
+ if (res == 0) {
+ boolean_t dirty;
+ /* don't dirty if not on disk and not dirty */
+ db_lock_type_t type = dmu_buf_lock_parent(db, RW_READER,
+ FTAG);
+ dirty = !list_is_empty(&db->db_dirty_records) ||
+ (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr));
+ dmu_buf_unlock_parent(db, type, FTAG);
+ if (dirty) {
+ dmu_buf_will_dirty(&db->db, tx);
+ bzero(db->db.db_data, tail);
+ }
+ dbuf_rele(db, FTAG);
+ }
+ len -= tail;
+ }
+
+ /* If the range did not include a full block, we are done */
+ if (len == 0)
+ return;
+
+ ASSERT(IS_P2ALIGNED(off, blksz));
+ ASSERT(trunc || IS_P2ALIGNED(len, blksz));
+ blkid = off >> blkshift;
+ nblks = len >> blkshift;
+ if (trunc)
+ nblks += 1;
+
+ /*
+ * Dirty all the indirect blocks in this range. Note that only
+ * the first and last indirect blocks can actually be written
+ * (if they were partially freed) -- they must be dirtied, even if
+ * they do not exist on disk yet. The interior blocks will
+ * be freed by free_children(), so they will not actually be written.
+ * Even though these interior blocks will not be written, we
+ * dirty them for two reasons:
+ *
+ * - It ensures that the indirect blocks remain in memory until
+ * syncing context. (They have already been prefetched by
+ * dmu_tx_hold_free(), so we don't have to worry about reading
+ * them serially here.)
+ *
+ * - The dirty space accounting will put pressure on the txg sync
+ * mechanism to begin syncing, and to delay transactions if there
+ * is a large amount of freeing. Even though these indirect
+ * blocks will not be written, we could need to write the same
+ * amount of space if we copy the freed BPs into deadlists.
+ */
+ if (dn->dn_nlevels > 1) {
+ rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+ uint64_t first, last;
+
+ first = blkid >> epbs;
+ dnode_dirty_l1(dn, first, tx);
+ if (trunc)
+ last = dn->dn_maxblkid >> epbs;
+ else
+ last = (blkid + nblks - 1) >> epbs;
+ if (last != first)
+ dnode_dirty_l1(dn, last, tx);
+
+ dnode_dirty_l1range(dn, first, last, tx);
+
+ int shift = dn->dn_datablkshift + dn->dn_indblkshift -
+ SPA_BLKPTRSHIFT;
+ for (uint64_t i = first + 1; i < last; i++) {
+ /*
+ * Set i to the blockid of the next non-hole
+ * level-1 indirect block at or after i. Note
+ * that dnode_next_offset() operates in terms of
+ * level-0-equivalent bytes.
+ */
+ uint64_t ibyte = i << shift;
+ int err = dnode_next_offset(dn, DNODE_FIND_HAVELOCK,
+ &ibyte, 2, 1, 0);
+ i = ibyte >> shift;
+ if (i >= last)
+ break;
+
+ /*
+ * Normally we should not see an error, either
+ * from dnode_next_offset() or dbuf_hold_level()
+ * (except for ESRCH from dnode_next_offset).
+ * If there is an i/o error, then when we read
+ * this block in syncing context, it will use
+ * ZIO_FLAG_MUSTSUCCEED, and thus hang/panic according
+ * to the "failmode" property. dnode_next_offset()
+ * doesn't have a flag to indicate MUSTSUCCEED.
+ */
+ if (err != 0)
+ break;
+
+ dnode_dirty_l1(dn, i, tx);
+ }
+ rw_exit(&dn->dn_struct_rwlock);
+ }
+
+done:
+ /*
+ * Add this range to the dnode range list.
+ * We will finish up this free operation in the syncing phase.
+ */
+ mutex_enter(&dn->dn_mtx);
+ {
+ int txgoff = tx->tx_txg & TXG_MASK;
+ if (dn->dn_free_ranges[txgoff] == NULL) {
+ dn->dn_free_ranges[txgoff] = range_tree_create(NULL,
+ RANGE_SEG64, NULL, 0, 0);
+ }
+ range_tree_clear(dn->dn_free_ranges[txgoff], blkid, nblks);
+ range_tree_add(dn->dn_free_ranges[txgoff], blkid, nblks);
+ }
+ dprintf_dnode(dn, "blkid=%llu nblks=%llu txg=%llu\n",
+ blkid, nblks, tx->tx_txg);
+ mutex_exit(&dn->dn_mtx);
+
+ dbuf_free_range(dn, blkid, blkid + nblks - 1, tx);
+ dnode_setdirty(dn, tx);
+}
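+
+/*
+ * Illustrative sketch, not part of the upstream change: the head/tail
+ * arithmetic dnode_free_range() uses above for a power-of-2 block size,
+ * ignoring the truncation and single-block special cases the function
+ * also handles.  "head" is the partial data zeroed at the front of the
+ * range and "tail" the partial data zeroed at the back; only the whole
+ * blocks in between are recorded in the free range.  The function name
+ * is hypothetical.
+ */
+static inline void
+example_partial_blocks(uint64_t off, uint64_t len, uint64_t blksz,
+ uint64_t *head, uint64_t *tail)
+{
+ *head = P2PHASE(off, blksz) == 0 ? 0 : blksz - P2PHASE(off, blksz);
+ if (*head > len)
+ *head = len;
+ *tail = P2PHASE(off + len, blksz);
+}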
+
+static boolean_t
+dnode_spill_freed(dnode_t *dn)
+{
+ int i;
+
+ mutex_enter(&dn->dn_mtx);
+ for (i = 0; i < TXG_SIZE; i++) {
+ if (dn->dn_rm_spillblk[i] == DN_KILL_SPILLBLK)
+ break;
+ }
+ mutex_exit(&dn->dn_mtx);
+ return (i < TXG_SIZE);
+}
+
+/* return TRUE if this blkid was freed in a recent txg, or FALSE if it wasn't */
+uint64_t
+dnode_block_freed(dnode_t *dn, uint64_t blkid)
+{
+ void *dp = spa_get_dsl(dn->dn_objset->os_spa);
+ int i;
+
+ if (blkid == DMU_BONUS_BLKID)
+ return (FALSE);
+
+ /*
+ * If we're in the process of opening the pool, dp will not be
+ * set yet, but there shouldn't be anything dirty.
+ */
+ if (dp == NULL)
+ return (FALSE);
+
+ if (dn->dn_free_txg)
+ return (TRUE);
+
+ if (blkid == DMU_SPILL_BLKID)
+ return (dnode_spill_freed(dn));
+
+ mutex_enter(&dn->dn_mtx);
+ for (i = 0; i < TXG_SIZE; i++) {
+ if (dn->dn_free_ranges[i] != NULL &&
+ range_tree_contains(dn->dn_free_ranges[i], blkid, 1))
+ break;
+ }
+ mutex_exit(&dn->dn_mtx);
+ return (i < TXG_SIZE);
+}
+
+/* call from syncing context when we actually write/free space for this dnode */
+void
+dnode_diduse_space(dnode_t *dn, int64_t delta)
+{
+ uint64_t space;
+ dprintf_dnode(dn, "dn=%p dnp=%p used=%llu delta=%lld\n",
+ dn, dn->dn_phys,
+ (u_longlong_t)dn->dn_phys->dn_used,
+ (longlong_t)delta);
+
+ mutex_enter(&dn->dn_mtx);
+ space = DN_USED_BYTES(dn->dn_phys);
+ if (delta > 0) {
+ ASSERT3U(space + delta, >=, space); /* no overflow */
+ } else {
+ ASSERT3U(space, >=, -delta); /* no underflow */
+ }
+ space += delta;
+ if (spa_version(dn->dn_objset->os_spa) < SPA_VERSION_DNODE_BYTES) {
+ ASSERT((dn->dn_phys->dn_flags & DNODE_FLAG_USED_BYTES) == 0);
+ ASSERT0(P2PHASE(space, 1<<DEV_BSHIFT));
+ dn->dn_phys->dn_used = space >> DEV_BSHIFT;
+ } else {
+ dn->dn_phys->dn_used = space;
+ dn->dn_phys->dn_flags |= DNODE_FLAG_USED_BYTES;
+ }
+ mutex_exit(&dn->dn_mtx);
+}
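+
+/*
+ * Illustrative sketch, not part of the upstream change: how a reader of
+ * dn_used interprets the two encodings written by dnode_diduse_space()
+ * above.  Pools older than SPA_VERSION_DNODE_BYTES store the value in
+ * 512-byte units; newer pools store bytes and set DNODE_FLAG_USED_BYTES.
+ * This mirrors roughly what the DN_USED_BYTES() macro does; the function
+ * name is hypothetical.
+ */
+static inline uint64_t
+example_dnode_used_bytes(const dnode_phys_t *dnp)
+{
+ if (dnp->dn_flags & DNODE_FLAG_USED_BYTES)
+ return (dnp->dn_used);
+ return (dnp->dn_used << DEV_BSHIFT);
+}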
+
+/*
+ * Scans a block at the indicated "level" looking for a hole or data,
+ * depending on 'flags'.
+ *
+ * If level > 0, then we are scanning an indirect block looking at its
+ * pointers. If level == 0, then we are looking at a block of dnodes.
+ *
+ * If we don't find what we are looking for in the block, we return ESRCH.
+ * Otherwise, return with *offset pointing to the beginning (if searching
+ * forwards) or end (if searching backwards) of the range covered by the
+ * block pointer we matched on (or dnode).
+ *
+ * The basic search algorithm used below by dnode_next_offset() is to
+ * use this function to search up the block tree (widen the search) until
+ * we find something (i.e., we don't return ESRCH) and then search back
+ * down the tree (narrow the search) until we reach our original search
+ * level.
+ */
+static int
+dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
+ int lvl, uint64_t blkfill, uint64_t txg)
+{
+ dmu_buf_impl_t *db = NULL;
+ void *data = NULL;
+ uint64_t epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
+ uint64_t epb = 1ULL << epbs;
+ uint64_t minfill, maxfill;
+ boolean_t hole;
+ int i, inc, error, span;
+
+ ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
+
+ hole = ((flags & DNODE_FIND_HOLE) != 0);
+ inc = (flags & DNODE_FIND_BACKWARDS) ? -1 : 1;
+ ASSERT(txg == 0 || !hole);
+
+ if (lvl == dn->dn_phys->dn_nlevels) {
+ error = 0;
+ epb = dn->dn_phys->dn_nblkptr;
+ data = dn->dn_phys->dn_blkptr;
+ } else {
+ uint64_t blkid = dbuf_whichblock(dn, lvl, *offset);
+ error = dbuf_hold_impl(dn, lvl, blkid, TRUE, FALSE, FTAG, &db);
+ if (error) {
+ if (error != ENOENT)
+ return (error);
+ if (hole)
+ return (0);
+ /*
+ * This can only happen when we are searching up
+ * the block tree for data. We don't really need to
+ * adjust the offset, as we will just end up looking
+ * at the pointer to this block in its parent, and it's
+ * going to be unallocated, so we will skip over it.
+ */
+ return (SET_ERROR(ESRCH));
+ }
+ error = dbuf_read(db, NULL,
+ DB_RF_CANFAIL | DB_RF_HAVESTRUCT |
+ DB_RF_NO_DECRYPT | DB_RF_NOPREFETCH);
+ if (error) {
+ dbuf_rele(db, FTAG);
+ return (error);
+ }
+ data = db->db.db_data;
+ rw_enter(&db->db_rwlock, RW_READER);
+ }
+
+ if (db != NULL && txg != 0 && (db->db_blkptr == NULL ||
+ db->db_blkptr->blk_birth <= txg ||
+ BP_IS_HOLE(db->db_blkptr))) {
+ /*
+ * This can only happen when we are searching up the tree
+ * and these conditions mean that we need to keep climbing.
+ */
+ error = SET_ERROR(ESRCH);
+ } else if (lvl == 0) {
+ dnode_phys_t *dnp = data;
+
+ ASSERT(dn->dn_type == DMU_OT_DNODE);
+ ASSERT(!(flags & DNODE_FIND_BACKWARDS));
+
+ for (i = (*offset >> DNODE_SHIFT) & (blkfill - 1);
+ i < blkfill; i += dnp[i].dn_extra_slots + 1) {
+ if ((dnp[i].dn_type == DMU_OT_NONE) == hole)
+ break;
+ }
+
+ if (i == blkfill)
+ error = SET_ERROR(ESRCH);
+
+ *offset = (*offset & ~(DNODE_BLOCK_SIZE - 1)) +
+ (i << DNODE_SHIFT);
+ } else {
+ blkptr_t *bp = data;
+ uint64_t start = *offset;
+ span = (lvl - 1) * epbs + dn->dn_datablkshift;
+ minfill = 0;
+ maxfill = blkfill << ((lvl - 1) * epbs);
+
+ if (hole)
+ maxfill--;
+ else
+ minfill++;
+
+ if (span >= 8 * sizeof (*offset)) {
+ /* This only happens on the highest indirection level */
+ ASSERT3U((lvl - 1), ==, dn->dn_phys->dn_nlevels - 1);
+ *offset = 0;
+ } else {
+ *offset = *offset >> span;
+ }
+
+ for (i = BF64_GET(*offset, 0, epbs);
+ i >= 0 && i < epb; i += inc) {
+ if (BP_GET_FILL(&bp[i]) >= minfill &&
+ BP_GET_FILL(&bp[i]) <= maxfill &&
+ (hole || bp[i].blk_birth > txg))
+ break;
+ if (inc > 0 || *offset > 0)
+ *offset += inc;
+ }
+
+ if (span >= 8 * sizeof (*offset)) {
+ *offset = start;
+ } else {
+ *offset = *offset << span;
+ }
+
+ if (inc < 0) {
+ /* traversing backwards; position offset at the end */
+ ASSERT3U(*offset, <=, start);
+ *offset = MIN(*offset + (1ULL << span) - 1, start);
+ } else if (*offset < start) {
+ *offset = start;
+ }
+ if (i < 0 || i >= epb)
+ error = SET_ERROR(ESRCH);
+ }
+
+ if (db != NULL) {
+ rw_exit(&db->db_rwlock);
+ dbuf_rele(db, FTAG);
+ }
+
+ return (error);
+}
+
+/*
+ * Find the next hole, data, or sparse region at or after *offset.
+ * The value 'blkfill' tells us how many items we expect to find
+ * in an L0 data block; this value is 1 for normal objects,
+ * DNODES_PER_BLOCK for the meta dnode, and some fraction of
+ * DNODES_PER_BLOCK when searching for sparse regions thereof.
+ *
+ * Examples:
+ *
+ * dnode_next_offset(dn, flags, offset, 1, 1, 0);
+ * Finds the next/previous hole/data in a file.
+ * Used in dmu_offset_next().
+ *
+ * dnode_next_offset(mdn, flags, offset, 0, DNODES_PER_BLOCK, txg);
+ * Finds the next free/allocated dnode in an objset's meta-dnode.
+ * Only finds objects that have new contents since txg (i.e.
+ * bonus buffer changes and content removal are ignored).
+ * Used in dmu_object_next().
+ *
+ * dnode_next_offset(mdn, DNODE_FIND_HOLE, offset, 2, DNODES_PER_BLOCK >> 2, 0);
+ * Finds the next L2 meta-dnode bp that's at most 1/4 full.
+ * Used in dmu_object_alloc().
+ */
+int
+dnode_next_offset(dnode_t *dn, int flags, uint64_t *offset,
+ int minlvl, uint64_t blkfill, uint64_t txg)
+{
+ uint64_t initial_offset = *offset;
+ int lvl, maxlvl;
+ int error = 0;
+
+ if (!(flags & DNODE_FIND_HAVELOCK))
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+
+ if (dn->dn_phys->dn_nlevels == 0) {
+ error = SET_ERROR(ESRCH);
+ goto out;
+ }
+
+ if (dn->dn_datablkshift == 0) {
+ if (*offset < dn->dn_datablksz) {
+ if (flags & DNODE_FIND_HOLE)
+ *offset = dn->dn_datablksz;
+ } else {
+ error = SET_ERROR(ESRCH);
+ }
+ goto out;
+ }
+
+ maxlvl = dn->dn_phys->dn_nlevels;
+
+ for (lvl = minlvl; lvl <= maxlvl; lvl++) {
+ error = dnode_next_offset_level(dn,
+ flags, offset, lvl, blkfill, txg);
+ if (error != ESRCH)
+ break;
+ }
+
+ while (error == 0 && --lvl >= minlvl) {
+ error = dnode_next_offset_level(dn,
+ flags, offset, lvl, blkfill, txg);
+ }
+
+ /*
+ * There's always a "virtual hole" at the end of the object, even
+ * if all BP's which physically exist are non-holes.
+ */
+ if ((flags & DNODE_FIND_HOLE) && error == ESRCH && txg == 0 &&
+ minlvl == 1 && blkfill == 1 && !(flags & DNODE_FIND_BACKWARDS)) {
+ error = 0;
+ }
+
+ if (error == 0 && (flags & DNODE_FIND_BACKWARDS ?
+ initial_offset < *offset : initial_offset > *offset))
+ error = SET_ERROR(ESRCH);
+out:
+ if (!(flags & DNODE_FIND_HAVELOCK))
+ rw_exit(&dn->dn_struct_rwlock);
+
+ return (error);
+}
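+
+/*
+ * Illustrative sketch, not part of the upstream change: how a caller in
+ * the style of dmu_offset_next() uses dnode_next_offset() to find the
+ * next hole or data region of a plain file object, matching the first
+ * example in the comment above (minlvl = 1, blkfill = 1, txg = 0).  The
+ * function name is hypothetical.
+ */
+static inline int
+example_next_region(dnode_t *dn, boolean_t hole, uint64_t *off)
+{
+ int flags = hole ? DNODE_FIND_HOLE : 0;
+
+ return (dnode_next_offset(dn, flags, off, 1, 1, 0));
+}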
+
+#if defined(_KERNEL)
+EXPORT_SYMBOL(dnode_hold);
+EXPORT_SYMBOL(dnode_rele);
+EXPORT_SYMBOL(dnode_set_nlevels);
+EXPORT_SYMBOL(dnode_set_blksz);
+EXPORT_SYMBOL(dnode_free_range);
+EXPORT_SYMBOL(dnode_evict_dbufs);
+EXPORT_SYMBOL(dnode_evict_bonus);
+#endif
diff --git a/sys/contrib/openzfs/module/zfs/dnode_sync.c b/sys/contrib/openzfs/module/zfs/dnode_sync.c
new file mode 100644
index 000000000000..66e48a1e17d4
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/dnode_sync.c
@@ -0,0 +1,858 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright 2020 Oxide Computer Company
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/dbuf.h>
+#include <sys/dnode.h>
+#include <sys/dmu.h>
+#include <sys/dmu_tx.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_recv.h>
+#include <sys/dsl_dataset.h>
+#include <sys/spa.h>
+#include <sys/range_tree.h>
+#include <sys/zfeature.h>
+
+static void
+dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx)
+{
+ dmu_buf_impl_t *db;
+ int txgoff = tx->tx_txg & TXG_MASK;
+ int nblkptr = dn->dn_phys->dn_nblkptr;
+ int old_toplvl = dn->dn_phys->dn_nlevels - 1;
+ int new_level = dn->dn_next_nlevels[txgoff];
+ int i;
+
+ rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+
+ /* this dnode can't be paged out because it's dirty */
+ ASSERT(dn->dn_phys->dn_type != DMU_OT_NONE);
+ ASSERT(new_level > 1 && dn->dn_phys->dn_nlevels > 0);
+
+ db = dbuf_hold_level(dn, dn->dn_phys->dn_nlevels, 0, FTAG);
+ ASSERT(db != NULL);
+
+ dn->dn_phys->dn_nlevels = new_level;
+ dprintf("os=%p obj=%llu, increase to %d\n", dn->dn_objset,
+ dn->dn_object, dn->dn_phys->dn_nlevels);
+
+ /*
+ * Lock ordering requires that we hold the children's db_mutexes (by
+ * calling dbuf_find()) before holding the parent's db_rwlock. The lock
+ * order is imposed by dbuf_read's steps of "grab the lock to protect
+ * db_parent, get db_parent, hold db_parent's db_rwlock".
+ */
+ dmu_buf_impl_t *children[DN_MAX_NBLKPTR];
+ ASSERT3U(nblkptr, <=, DN_MAX_NBLKPTR);
+ for (i = 0; i < nblkptr; i++) {
+ children[i] =
+ dbuf_find(dn->dn_objset, dn->dn_object, old_toplvl, i);
+ }
+
+ /* transfer dnode's block pointers to new indirect block */
+ (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED|DB_RF_HAVESTRUCT);
+ if (dn->dn_dbuf != NULL)
+ rw_enter(&dn->dn_dbuf->db_rwlock, RW_WRITER);
+ rw_enter(&db->db_rwlock, RW_WRITER);
+ ASSERT(db->db.db_data);
+ ASSERT(arc_released(db->db_buf));
+ ASSERT3U(sizeof (blkptr_t) * nblkptr, <=, db->db.db_size);
+ bcopy(dn->dn_phys->dn_blkptr, db->db.db_data,
+ sizeof (blkptr_t) * nblkptr);
+ arc_buf_freeze(db->db_buf);
+
+ /* set dbuf's parent pointers to new indirect buf */
+ for (i = 0; i < nblkptr; i++) {
+ dmu_buf_impl_t *child = children[i];
+
+ if (child == NULL)
+ continue;
+#ifdef ZFS_DEBUG
+ DB_DNODE_ENTER(child);
+ ASSERT3P(DB_DNODE(child), ==, dn);
+ DB_DNODE_EXIT(child);
+#endif /* DEBUG */
+ if (child->db_parent && child->db_parent != dn->dn_dbuf) {
+ ASSERT(child->db_parent->db_level == db->db_level);
+ ASSERT(child->db_blkptr !=
+ &dn->dn_phys->dn_blkptr[child->db_blkid]);
+ mutex_exit(&child->db_mtx);
+ continue;
+ }
+ ASSERT(child->db_parent == NULL ||
+ child->db_parent == dn->dn_dbuf);
+
+ child->db_parent = db;
+ dbuf_add_ref(db, child);
+ if (db->db.db_data)
+ child->db_blkptr = (blkptr_t *)db->db.db_data + i;
+ else
+ child->db_blkptr = NULL;
+ dprintf_dbuf_bp(child, child->db_blkptr,
+ "changed db_blkptr to new indirect %s", "");
+
+ mutex_exit(&child->db_mtx);
+ }
+
+ bzero(dn->dn_phys->dn_blkptr, sizeof (blkptr_t) * nblkptr);
+
+ rw_exit(&db->db_rwlock);
+ if (dn->dn_dbuf != NULL)
+ rw_exit(&dn->dn_dbuf->db_rwlock);
+
+ dbuf_rele(db, FTAG);
+
+ rw_exit(&dn->dn_struct_rwlock);
+}
+
+static void
+free_blocks(dnode_t *dn, blkptr_t *bp, int num, dmu_tx_t *tx)
+{
+ dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
+ uint64_t bytesfreed = 0;
+
+ dprintf("ds=%p obj=%llx num=%d\n", ds, dn->dn_object, num);
+
+ for (int i = 0; i < num; i++, bp++) {
+ if (BP_IS_HOLE(bp))
+ continue;
+
+ bytesfreed += dsl_dataset_block_kill(ds, bp, tx, B_FALSE);
+ ASSERT3U(bytesfreed, <=, DN_USED_BYTES(dn->dn_phys));
+
+ /*
+ * Save some useful information on the holes being
+ * punched, including logical size, type, and indirection
+ * level. Retaining birth time enables detection of when
+ * holes are punched for reducing the number of free
+ * records transmitted during a zfs send.
+ */
+
+ uint64_t lsize = BP_GET_LSIZE(bp);
+ dmu_object_type_t type = BP_GET_TYPE(bp);
+ uint64_t lvl = BP_GET_LEVEL(bp);
+
+ bzero(bp, sizeof (blkptr_t));
+
+ if (spa_feature_is_active(dn->dn_objset->os_spa,
+ SPA_FEATURE_HOLE_BIRTH)) {
+ BP_SET_LSIZE(bp, lsize);
+ BP_SET_TYPE(bp, type);
+ BP_SET_LEVEL(bp, lvl);
+ BP_SET_BIRTH(bp, dmu_tx_get_txg(tx), 0);
+ }
+ }
+ dnode_diduse_space(dn, -bytesfreed);
+}
+
+#ifdef ZFS_DEBUG
+static void
+free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx)
+{
+ int off, num;
+ int i, err, epbs;
+ uint64_t txg = tx->tx_txg;
+ dnode_t *dn;
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
+ off = start - (db->db_blkid * 1<<epbs);
+ num = end - start + 1;
+
+ ASSERT3U(off, >=, 0);
+ ASSERT3U(num, >=, 0);
+ ASSERT3U(db->db_level, >, 0);
+ ASSERT3U(db->db.db_size, ==, 1 << dn->dn_phys->dn_indblkshift);
+ ASSERT3U(off+num, <=, db->db.db_size >> SPA_BLKPTRSHIFT);
+ ASSERT(db->db_blkptr != NULL);
+
+ for (i = off; i < off+num; i++) {
+ uint64_t *buf;
+ dmu_buf_impl_t *child;
+ dbuf_dirty_record_t *dr;
+ int j;
+
+ ASSERT(db->db_level == 1);
+
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ err = dbuf_hold_impl(dn, db->db_level - 1,
+ (db->db_blkid << epbs) + i, TRUE, FALSE, FTAG, &child);
+ rw_exit(&dn->dn_struct_rwlock);
+ if (err == ENOENT)
+ continue;
+ ASSERT(err == 0);
+ ASSERT(child->db_level == 0);
+ dr = dbuf_find_dirty_eq(child, txg);
+
+ /* data_old better be zeroed */
+ if (dr) {
+ buf = dr->dt.dl.dr_data->b_data;
+ for (j = 0; j < child->db.db_size >> 3; j++) {
+ if (buf[j] != 0) {
+ panic("freed data not zero: "
+ "child=%p i=%d off=%d num=%d\n",
+ (void *)child, i, off, num);
+ }
+ }
+ }
+
+ /*
+ * db_data better be zeroed unless it's dirty in a
+ * future txg.
+ */
+ mutex_enter(&child->db_mtx);
+ buf = child->db.db_data;
+ if (buf != NULL && child->db_state != DB_FILL &&
+ list_is_empty(&child->db_dirty_records)) {
+ for (j = 0; j < child->db.db_size >> 3; j++) {
+ if (buf[j] != 0) {
+ panic("freed data not zero: "
+ "child=%p i=%d off=%d num=%d\n",
+ (void *)child, i, off, num);
+ }
+ }
+ }
+ mutex_exit(&child->db_mtx);
+
+ dbuf_rele(child, FTAG);
+ }
+ DB_DNODE_EXIT(db);
+}
+#endif
+
+/*
+ * We don't usually free the indirect blocks here. If in one txg we have a
+ * free_range and a write to the same indirect block, it's important that we
+ * preserve the hole's birth times. Therefore, we don't free any indirect
+ * blocks in free_children(). If an indirect block happens to turn into all
+ * holes, it will be freed by dbuf_write_children_ready, which happens at a
+ * point in the syncing process where we know for certain the contents of the
+ * indirect block.
+ *
+ * However, if we're freeing a dnode, its space accounting must go to zero
+ * before we actually try to free the dnode, or we will trip an assertion. In
+ * addition, we know the case described above cannot occur, because the dnode is
+ * being freed. Therefore, we free the indirect blocks immediately in that
+ * case.
+ */
+static void
+free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks,
+ boolean_t free_indirects, dmu_tx_t *tx)
+{
+ dnode_t *dn;
+ blkptr_t *bp;
+ dmu_buf_impl_t *subdb;
+ uint64_t start, end, dbstart, dbend;
+ unsigned int epbs, shift, i;
+
+ /*
+ * There is a small possibility that this block will not be cached:
+ * 1 - if level > 1 and there are no children with level <= 1
+ * 2 - if this block was evicted since we read it from
+ * dmu_tx_hold_free().
+ */
+ if (db->db_state != DB_CACHED)
+ (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
+
+ /*
+ * If we modify this indirect block, and we are not freeing the
+ * dnode (!free_indirects), then this indirect block needs to get
+ * written to disk by dbuf_write(). If it is dirty, we know it will
+ * be written (otherwise, we would have incorrect on-disk state
+ * because the space would be freed but still referenced by the BP
+ * in this indirect block). Therefore we VERIFY that it is
+ * dirty.
+ *
+ * Our VERIFY covers some cases that do not actually have to be
+ * dirty, but that the open-context code happens to dirty. E.g. if the
+ * blocks we are freeing are all holes, because in that case, we
+ * are only freeing part of this indirect block, so it is an
+ * ancestor of the first or last block to be freed. The first and
+ * last L1 indirect blocks are always dirtied by dnode_free_range().
+ */
+ db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER, FTAG);
+ VERIFY(BP_GET_FILL(db->db_blkptr) == 0 || db->db_dirtycnt > 0);
+ dmu_buf_unlock_parent(db, dblt, FTAG);
+
+ dbuf_release_bp(db);
+ bp = db->db.db_data;
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
+ ASSERT3U(epbs, <, 31);
+ shift = (db->db_level - 1) * epbs;
+ dbstart = db->db_blkid << epbs;
+ start = blkid >> shift;
+ if (dbstart < start) {
+ bp += start - dbstart;
+ } else {
+ start = dbstart;
+ }
+ dbend = ((db->db_blkid + 1) << epbs) - 1;
+ end = (blkid + nblks - 1) >> shift;
+ if (dbend <= end)
+ end = dbend;
+
+ ASSERT3U(start, <=, end);
+
+ if (db->db_level == 1) {
+ FREE_VERIFY(db, start, end, tx);
+ rw_enter(&db->db_rwlock, RW_WRITER);
+ free_blocks(dn, bp, end - start + 1, tx);
+ rw_exit(&db->db_rwlock);
+ } else {
+ for (uint64_t id = start; id <= end; id++, bp++) {
+ if (BP_IS_HOLE(bp))
+ continue;
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ VERIFY0(dbuf_hold_impl(dn, db->db_level - 1,
+ id, TRUE, FALSE, FTAG, &subdb));
+ rw_exit(&dn->dn_struct_rwlock);
+ ASSERT3P(bp, ==, subdb->db_blkptr);
+
+ free_children(subdb, blkid, nblks, free_indirects, tx);
+ dbuf_rele(subdb, FTAG);
+ }
+ }
+
+ if (free_indirects) {
+ rw_enter(&db->db_rwlock, RW_WRITER);
+ for (i = 0, bp = db->db.db_data; i < 1 << epbs; i++, bp++)
+ ASSERT(BP_IS_HOLE(bp));
+ bzero(db->db.db_data, db->db.db_size);
+ free_blocks(dn, db->db_blkptr, 1, tx);
+ rw_exit(&db->db_rwlock);
+ }
+
+ DB_DNODE_EXIT(db);
+ arc_buf_freeze(db->db_buf);
+}
+
+/*
+ * Traverse the indicated range of the provided file
+ * and "free" all the blocks contained there.
+ */
+static void
+dnode_sync_free_range_impl(dnode_t *dn, uint64_t blkid, uint64_t nblks,
+ boolean_t free_indirects, dmu_tx_t *tx)
+{
+ blkptr_t *bp = dn->dn_phys->dn_blkptr;
+ int dnlevel = dn->dn_phys->dn_nlevels;
+ boolean_t trunc = B_FALSE;
+
+ if (blkid > dn->dn_phys->dn_maxblkid)
+ return;
+
+ ASSERT(dn->dn_phys->dn_maxblkid < UINT64_MAX);
+ if (blkid + nblks > dn->dn_phys->dn_maxblkid) {
+ nblks = dn->dn_phys->dn_maxblkid - blkid + 1;
+ trunc = B_TRUE;
+ }
+
+ /* There are no indirect blocks in the object */
+ if (dnlevel == 1) {
+ if (blkid >= dn->dn_phys->dn_nblkptr) {
+ /* this range was never made persistent */
+ return;
+ }
+ ASSERT3U(blkid + nblks, <=, dn->dn_phys->dn_nblkptr);
+ free_blocks(dn, bp + blkid, nblks, tx);
+ } else {
+ int shift = (dnlevel - 1) *
+ (dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT);
+ int start = blkid >> shift;
+ int end = (blkid + nblks - 1) >> shift;
+ dmu_buf_impl_t *db;
+
+ ASSERT(start < dn->dn_phys->dn_nblkptr);
+ bp += start;
+ for (int i = start; i <= end; i++, bp++) {
+ if (BP_IS_HOLE(bp))
+ continue;
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ VERIFY0(dbuf_hold_impl(dn, dnlevel - 1, i,
+ TRUE, FALSE, FTAG, &db));
+ rw_exit(&dn->dn_struct_rwlock);
+ free_children(db, blkid, nblks, free_indirects, tx);
+ dbuf_rele(db, FTAG);
+ }
+ }
+
+ /*
+ * Do not truncate the maxblkid if we are performing a raw
+ * receive. The raw receive sets the maxblkid manually and
+ * must not be overridden. Usually, the last DRR_FREE record
+ * will be at the maxblkid, because the source system sets
+ * the maxblkid when truncating. However, if the last block
+ * was freed by overwriting with zeros and being compressed
+ * away to a hole, the source system will generate a DRR_FREE
+ * record while leaving the maxblkid after the end of that
+ * record. In this case we need to leave the maxblkid as
+ * indicated in the DRR_OBJECT record, so that it matches the
+ * source system, ensuring that the cryptographic hashes will
+ * match.
+ */
+ if (trunc && !dn->dn_objset->os_raw_receive) {
+ uint64_t off __maybe_unused;
+ dn->dn_phys->dn_maxblkid = blkid == 0 ? 0 : blkid - 1;
+
+ off = (dn->dn_phys->dn_maxblkid + 1) *
+ (dn->dn_phys->dn_datablkszsec << SPA_MINBLOCKSHIFT);
+ ASSERT(off < dn->dn_phys->dn_maxblkid ||
+ dn->dn_phys->dn_maxblkid == 0 ||
+ dnode_next_offset(dn, 0, &off, 1, 1, 0) != 0);
+ }
+}
+
+typedef struct dnode_sync_free_range_arg {
+ dnode_t *dsfra_dnode;
+ dmu_tx_t *dsfra_tx;
+ boolean_t dsfra_free_indirects;
+} dnode_sync_free_range_arg_t;
+
+static void
+dnode_sync_free_range(void *arg, uint64_t blkid, uint64_t nblks)
+{
+ dnode_sync_free_range_arg_t *dsfra = arg;
+ dnode_t *dn = dsfra->dsfra_dnode;
+
+ mutex_exit(&dn->dn_mtx);
+ dnode_sync_free_range_impl(dn, blkid, nblks,
+ dsfra->dsfra_free_indirects, dsfra->dsfra_tx);
+ mutex_enter(&dn->dn_mtx);
+}
+
+/*
+ * Try to kick all the dnode's dbufs out of the cache...
+ */
+void
+dnode_evict_dbufs(dnode_t *dn)
+{
+ dmu_buf_impl_t *db_marker;
+ dmu_buf_impl_t *db, *db_next;
+
+ db_marker = kmem_alloc(sizeof (dmu_buf_impl_t), KM_SLEEP);
+
+ mutex_enter(&dn->dn_dbufs_mtx);
+ for (db = avl_first(&dn->dn_dbufs); db != NULL; db = db_next) {
+
+#ifdef ZFS_DEBUG
+ DB_DNODE_ENTER(db);
+ ASSERT3P(DB_DNODE(db), ==, dn);
+ DB_DNODE_EXIT(db);
+#endif /* DEBUG */
+
+ mutex_enter(&db->db_mtx);
+ if (db->db_state != DB_EVICTING &&
+ zfs_refcount_is_zero(&db->db_holds)) {
+ db_marker->db_level = db->db_level;
+ db_marker->db_blkid = db->db_blkid;
+ db_marker->db_state = DB_SEARCH;
+ avl_insert_here(&dn->dn_dbufs, db_marker, db,
+ AVL_BEFORE);
+
+ /*
+ * We need to use the "marker" dbuf rather than
+ * simply getting the next dbuf, because
+ * dbuf_destroy() may actually remove multiple dbufs.
+ * It can call itself recursively on the parent dbuf,
+ * which may also be removed from dn_dbufs. The code
+ * flow would look like:
+ *
+ * dbuf_destroy():
+ * dnode_rele_and_unlock(parent_dbuf, evicting=TRUE):
+ * if (!cacheable || pending_evict)
+ * dbuf_destroy()
+ */
+ dbuf_destroy(db);
+
+ db_next = AVL_NEXT(&dn->dn_dbufs, db_marker);
+ avl_remove(&dn->dn_dbufs, db_marker);
+ } else {
+ db->db_pending_evict = TRUE;
+ mutex_exit(&db->db_mtx);
+ db_next = AVL_NEXT(&dn->dn_dbufs, db);
+ }
+ }
+ mutex_exit(&dn->dn_dbufs_mtx);
+
+ kmem_free(db_marker, sizeof (dmu_buf_impl_t));
+
+ dnode_evict_bonus(dn);
+}
+
+void
+dnode_evict_bonus(dnode_t *dn)
+{
+ rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+ if (dn->dn_bonus != NULL) {
+ if (zfs_refcount_is_zero(&dn->dn_bonus->db_holds)) {
+ mutex_enter(&dn->dn_bonus->db_mtx);
+ dbuf_destroy(dn->dn_bonus);
+ dn->dn_bonus = NULL;
+ } else {
+ dn->dn_bonus->db_pending_evict = TRUE;
+ }
+ }
+ rw_exit(&dn->dn_struct_rwlock);
+}
+
+static void
+dnode_undirty_dbufs(list_t *list)
+{
+ dbuf_dirty_record_t *dr;
+
+ while ((dr = list_head(list))) {
+ dmu_buf_impl_t *db = dr->dr_dbuf;
+ uint64_t txg = dr->dr_txg;
+
+ if (db->db_level != 0)
+ dnode_undirty_dbufs(&dr->dt.di.dr_children);
+
+ mutex_enter(&db->db_mtx);
+ /* XXX - use dbuf_undirty()? */
+ list_remove(list, dr);
+ ASSERT(list_head(&db->db_dirty_records) == dr);
+ list_remove_head(&db->db_dirty_records);
+ ASSERT(list_is_empty(&db->db_dirty_records));
+ db->db_dirtycnt -= 1;
+ if (db->db_level == 0) {
+ ASSERT(db->db_blkid == DMU_BONUS_BLKID ||
+ dr->dt.dl.dr_data == db->db_buf);
+ dbuf_unoverride(dr);
+ } else {
+ mutex_destroy(&dr->dt.di.dr_mtx);
+ list_destroy(&dr->dt.di.dr_children);
+ }
+ kmem_free(dr, sizeof (dbuf_dirty_record_t));
+ dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg, B_FALSE);
+ }
+}
+
+static void
+dnode_sync_free(dnode_t *dn, dmu_tx_t *tx)
+{
+ int txgoff = tx->tx_txg & TXG_MASK;
+
+ ASSERT(dmu_tx_is_syncing(tx));
+
+ /*
+ * Our contents should have been freed in dnode_sync() by the
+ * free range record inserted by the caller of dnode_free().
+ */
+ ASSERT0(DN_USED_BYTES(dn->dn_phys));
+ ASSERT(BP_IS_HOLE(dn->dn_phys->dn_blkptr));
+
+ dnode_undirty_dbufs(&dn->dn_dirty_records[txgoff]);
+ dnode_evict_dbufs(dn);
+
+ /*
+ * XXX - It would be nice to assert this, but we may still
+ * have residual holds from async evictions from the arc...
+ *
+ * zfs_obj_to_path() also depends on this being
+ * commented out.
+ *
+ * ASSERT3U(zfs_refcount_count(&dn->dn_holds), ==, 1);
+ */
+
+ /* Undirty next bits */
+ dn->dn_next_nlevels[txgoff] = 0;
+ dn->dn_next_indblkshift[txgoff] = 0;
+ dn->dn_next_blksz[txgoff] = 0;
+ dn->dn_next_maxblkid[txgoff] = 0;
+
+ /* ASSERT(blkptrs are zero); */
+ ASSERT(dn->dn_phys->dn_type != DMU_OT_NONE);
+ ASSERT(dn->dn_type != DMU_OT_NONE);
+
+ ASSERT(dn->dn_free_txg > 0);
+ if (dn->dn_allocated_txg != dn->dn_free_txg)
+ dmu_buf_will_dirty(&dn->dn_dbuf->db, tx);
+ bzero(dn->dn_phys, sizeof (dnode_phys_t) * dn->dn_num_slots);
+ dnode_free_interior_slots(dn);
+
+ mutex_enter(&dn->dn_mtx);
+ dn->dn_type = DMU_OT_NONE;
+ dn->dn_maxblkid = 0;
+ dn->dn_allocated_txg = 0;
+ dn->dn_free_txg = 0;
+ dn->dn_have_spill = B_FALSE;
+ dn->dn_num_slots = 1;
+ mutex_exit(&dn->dn_mtx);
+
+ ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
+
+ dnode_rele(dn, (void *)(uintptr_t)tx->tx_txg);
+ /*
+ * Now that we've released our hold, the dnode may
+ * be evicted, so we mustn't access it.
+ */
+}
+
+/*
+ * Write out the dnode's dirty buffers.
+ */
+void
+dnode_sync(dnode_t *dn, dmu_tx_t *tx)
+{
+ objset_t *os = dn->dn_objset;
+ dnode_phys_t *dnp = dn->dn_phys;
+ int txgoff = tx->tx_txg & TXG_MASK;
+ list_t *list = &dn->dn_dirty_records[txgoff];
+ static const dnode_phys_t zerodn __maybe_unused = { 0 };
+ boolean_t kill_spill = B_FALSE;
+
+ ASSERT(dmu_tx_is_syncing(tx));
+ ASSERT(dnp->dn_type != DMU_OT_NONE || dn->dn_allocated_txg);
+ ASSERT(dnp->dn_type != DMU_OT_NONE ||
+ bcmp(dnp, &zerodn, DNODE_MIN_SIZE) == 0);
+ DNODE_VERIFY(dn);
+
+ ASSERT(dn->dn_dbuf == NULL || arc_released(dn->dn_dbuf->db_buf));
+
+ /*
+ * Do user accounting if it is enabled and this is not
+ * an encrypted receive.
+ */
+ if (dmu_objset_userused_enabled(os) &&
+ !DMU_OBJECT_IS_SPECIAL(dn->dn_object) &&
+ (!os->os_encrypted || !dmu_objset_is_receiving(os))) {
+ mutex_enter(&dn->dn_mtx);
+ dn->dn_oldused = DN_USED_BYTES(dn->dn_phys);
+ dn->dn_oldflags = dn->dn_phys->dn_flags;
+ dn->dn_phys->dn_flags |= DNODE_FLAG_USERUSED_ACCOUNTED;
+ if (dmu_objset_userobjused_enabled(dn->dn_objset))
+ dn->dn_phys->dn_flags |=
+ DNODE_FLAG_USEROBJUSED_ACCOUNTED;
+ mutex_exit(&dn->dn_mtx);
+ dmu_objset_userquota_get_ids(dn, B_FALSE, tx);
+ } else {
+ /* Once we account for it, we should always account for it */
+ ASSERT(!(dn->dn_phys->dn_flags &
+ DNODE_FLAG_USERUSED_ACCOUNTED));
+ ASSERT(!(dn->dn_phys->dn_flags &
+ DNODE_FLAG_USEROBJUSED_ACCOUNTED));
+ }
+
+ mutex_enter(&dn->dn_mtx);
+ if (dn->dn_allocated_txg == tx->tx_txg) {
+ /* The dnode is newly allocated or reallocated */
+ if (dnp->dn_type == DMU_OT_NONE) {
+ /* this is a first alloc, not a realloc */
+ dnp->dn_nlevels = 1;
+ dnp->dn_nblkptr = dn->dn_nblkptr;
+ }
+
+ dnp->dn_type = dn->dn_type;
+ dnp->dn_bonustype = dn->dn_bonustype;
+ dnp->dn_bonuslen = dn->dn_bonuslen;
+ }
+
+ dnp->dn_extra_slots = dn->dn_num_slots - 1;
+
+ ASSERT(dnp->dn_nlevels > 1 ||
+ BP_IS_HOLE(&dnp->dn_blkptr[0]) ||
+ BP_IS_EMBEDDED(&dnp->dn_blkptr[0]) ||
+ BP_GET_LSIZE(&dnp->dn_blkptr[0]) ==
+ dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
+ ASSERT(dnp->dn_nlevels < 2 ||
+ BP_IS_HOLE(&dnp->dn_blkptr[0]) ||
+ BP_GET_LSIZE(&dnp->dn_blkptr[0]) == 1 << dnp->dn_indblkshift);
+
+ if (dn->dn_next_type[txgoff] != 0) {
+ dnp->dn_type = dn->dn_type;
+ dn->dn_next_type[txgoff] = 0;
+ }
+
+ if (dn->dn_next_blksz[txgoff] != 0) {
+ ASSERT(P2PHASE(dn->dn_next_blksz[txgoff],
+ SPA_MINBLOCKSIZE) == 0);
+ ASSERT(BP_IS_HOLE(&dnp->dn_blkptr[0]) ||
+ dn->dn_maxblkid == 0 || list_head(list) != NULL ||
+ dn->dn_next_blksz[txgoff] >> SPA_MINBLOCKSHIFT ==
+ dnp->dn_datablkszsec ||
+ !range_tree_is_empty(dn->dn_free_ranges[txgoff]));
+ dnp->dn_datablkszsec =
+ dn->dn_next_blksz[txgoff] >> SPA_MINBLOCKSHIFT;
+ dn->dn_next_blksz[txgoff] = 0;
+ }
+
+ if (dn->dn_next_bonuslen[txgoff] != 0) {
+ if (dn->dn_next_bonuslen[txgoff] == DN_ZERO_BONUSLEN)
+ dnp->dn_bonuslen = 0;
+ else
+ dnp->dn_bonuslen = dn->dn_next_bonuslen[txgoff];
+ ASSERT(dnp->dn_bonuslen <=
+ DN_SLOTS_TO_BONUSLEN(dnp->dn_extra_slots + 1));
+ dn->dn_next_bonuslen[txgoff] = 0;
+ }
+
+ if (dn->dn_next_bonustype[txgoff] != 0) {
+ ASSERT(DMU_OT_IS_VALID(dn->dn_next_bonustype[txgoff]));
+ dnp->dn_bonustype = dn->dn_next_bonustype[txgoff];
+ dn->dn_next_bonustype[txgoff] = 0;
+ }
+
+ boolean_t freeing_dnode = dn->dn_free_txg > 0 &&
+ dn->dn_free_txg <= tx->tx_txg;
+
+ /*
+ * Remove the spill block if we have been explicitly asked to
+ * remove it, or if the object is being removed.
+ */
+ if (dn->dn_rm_spillblk[txgoff] || freeing_dnode) {
+ if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)
+ kill_spill = B_TRUE;
+ dn->dn_rm_spillblk[txgoff] = 0;
+ }
+
+ if (dn->dn_next_indblkshift[txgoff] != 0) {
+ ASSERT(dnp->dn_nlevels == 1);
+ dnp->dn_indblkshift = dn->dn_next_indblkshift[txgoff];
+ dn->dn_next_indblkshift[txgoff] = 0;
+ }
+
+ /*
+ * Just take the live (open-context) values for checksum and compress.
+ * Strictly speaking it's a future leak, but nothing bad happens if we
+ * start using the new checksum or compress algorithm a little early.
+ */
+ dnp->dn_checksum = dn->dn_checksum;
+ dnp->dn_compress = dn->dn_compress;
+
+ mutex_exit(&dn->dn_mtx);
+
+ if (kill_spill) {
+ free_blocks(dn, DN_SPILL_BLKPTR(dn->dn_phys), 1, tx);
+ mutex_enter(&dn->dn_mtx);
+ dnp->dn_flags &= ~DNODE_FLAG_SPILL_BLKPTR;
+ mutex_exit(&dn->dn_mtx);
+ }
+
+ /* process all the "freed" ranges in the file */
+ if (dn->dn_free_ranges[txgoff] != NULL) {
+ dnode_sync_free_range_arg_t dsfra;
+ dsfra.dsfra_dnode = dn;
+ dsfra.dsfra_tx = tx;
+ dsfra.dsfra_free_indirects = freeing_dnode;
+ mutex_enter(&dn->dn_mtx);
+ if (freeing_dnode) {
+ ASSERT(range_tree_contains(dn->dn_free_ranges[txgoff],
+ 0, dn->dn_maxblkid + 1));
+ }
+ /*
+ * Because dnode_sync_free_range() must drop dn_mtx during its
+ * processing, using it as a callback to range_tree_vacate() is
+ * not safe. No other operations (besides destroy) are allowed
+ * once range_tree_vacate() has begun, and dropping dn_mtx
+ * would leave a window open for another thread to observe that
+ * invalid (and unsafe) state.
+ */
+ range_tree_walk(dn->dn_free_ranges[txgoff],
+ dnode_sync_free_range, &dsfra);
+ range_tree_vacate(dn->dn_free_ranges[txgoff], NULL, NULL);
+ range_tree_destroy(dn->dn_free_ranges[txgoff]);
+ dn->dn_free_ranges[txgoff] = NULL;
+ mutex_exit(&dn->dn_mtx);
+ }
+
+ if (freeing_dnode) {
+ dn->dn_objset->os_freed_dnodes++;
+ dnode_sync_free(dn, tx);
+ return;
+ }
+
+ if (dn->dn_num_slots > DNODE_MIN_SLOTS) {
+ dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
+ mutex_enter(&ds->ds_lock);
+ ds->ds_feature_activation[SPA_FEATURE_LARGE_DNODE] =
+ (void *)B_TRUE;
+ mutex_exit(&ds->ds_lock);
+ }
+
+ if (dn->dn_next_nlevels[txgoff]) {
+ dnode_increase_indirection(dn, tx);
+ dn->dn_next_nlevels[txgoff] = 0;
+ }
+
+ /*
+ * This must be done after dnode_sync_free_range()
+ * and dnode_increase_indirection(). See dnode_new_blkid()
+ * for an explanation of the high bit being set.
+ */
+ if (dn->dn_next_maxblkid[txgoff]) {
+ mutex_enter(&dn->dn_mtx);
+ dnp->dn_maxblkid =
+ dn->dn_next_maxblkid[txgoff] & ~DMU_NEXT_MAXBLKID_SET;
+ dn->dn_next_maxblkid[txgoff] = 0;
+ mutex_exit(&dn->dn_mtx);
+ }
+
+ if (dn->dn_next_nblkptr[txgoff]) {
+ /* this should only happen on a realloc */
+ ASSERT(dn->dn_allocated_txg == tx->tx_txg);
+ if (dn->dn_next_nblkptr[txgoff] > dnp->dn_nblkptr) {
+ /* zero the new blkptrs we are gaining */
+ bzero(dnp->dn_blkptr + dnp->dn_nblkptr,
+ sizeof (blkptr_t) *
+ (dn->dn_next_nblkptr[txgoff] - dnp->dn_nblkptr));
+#ifdef ZFS_DEBUG
+ } else {
+ int i;
+ ASSERT(dn->dn_next_nblkptr[txgoff] < dnp->dn_nblkptr);
+ /* the blkptrs we are losing better be unallocated */
+ for (i = 0; i < dnp->dn_nblkptr; i++) {
+ if (i >= dn->dn_next_nblkptr[txgoff])
+ ASSERT(BP_IS_HOLE(&dnp->dn_blkptr[i]));
+ }
+#endif
+ }
+ mutex_enter(&dn->dn_mtx);
+ dnp->dn_nblkptr = dn->dn_next_nblkptr[txgoff];
+ dn->dn_next_nblkptr[txgoff] = 0;
+ mutex_exit(&dn->dn_mtx);
+ }
+
+ dbuf_sync_list(list, dn->dn_phys->dn_nlevels - 1, tx);
+
+ if (!DMU_OBJECT_IS_SPECIAL(dn->dn_object)) {
+ ASSERT3P(list_head(list), ==, NULL);
+ dnode_rele(dn, (void *)(uintptr_t)tx->tx_txg);
+ }
+
+ /*
+ * Although we have dropped our reference to the dnode, it
+ * can't be evicted until it's written, and we haven't yet
+ * initiated the IO for the dnode's dbuf. Additionally, the caller
+ * has already added a reference to the dnode because it's on the
+ * os_synced_dnodes list.
+ */
+}
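+
+/*
+ * Illustrative sketch, not part of the upstream change: the per-txg
+ * indexing used throughout dnode_sync().  Dirty state (dn_next_*,
+ * dn_free_ranges, dn_dirty_records) lives in TXG_SIZE-entry arrays
+ * indexed by the transaction group masked with TXG_MASK, so a slot is
+ * recycled once its txg has synced.  The function name is hypothetical.
+ */
+static inline int
+example_txg_slot(uint64_t txg)
+{
+ return (txg & TXG_MASK);
+}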
diff --git a/sys/contrib/openzfs/module/zfs/dsl_bookmark.c b/sys/contrib/openzfs/module/zfs/dsl_bookmark.c
new file mode 100644
index 000000000000..2faf1af52991
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/dsl_bookmark.c
@@ -0,0 +1,1734 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2013, 2018 by Delphix. All rights reserved.
+ * Copyright 2017 Nexenta Systems, Inc.
+ * Copyright 2019, 2020 by Christian Schwarz. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_prop.h>
+#include <sys/dsl_synctask.h>
+#include <sys/dsl_destroy.h>
+#include <sys/dmu_impl.h>
+#include <sys/dmu_tx.h>
+#include <sys/arc.h>
+#include <sys/zap.h>
+#include <sys/zfeature.h>
+#include <sys/spa.h>
+#include <sys/dsl_bookmark.h>
+#include <zfs_namecheck.h>
+#include <sys/dmu_send.h>
+
+static int
+dsl_bookmark_hold_ds(dsl_pool_t *dp, const char *fullname,
+ dsl_dataset_t **dsp, void *tag, char **shortnamep)
+{
+ char buf[ZFS_MAX_DATASET_NAME_LEN];
+ char *hashp;
+
+ if (strlen(fullname) >= ZFS_MAX_DATASET_NAME_LEN)
+ return (SET_ERROR(ENAMETOOLONG));
+ hashp = strchr(fullname, '#');
+ if (hashp == NULL)
+ return (SET_ERROR(EINVAL));
+
+ *shortnamep = hashp + 1;
+ if (zfs_component_namecheck(*shortnamep, NULL, NULL))
+ return (SET_ERROR(EINVAL));
+ (void) strlcpy(buf, fullname, hashp - fullname + 1);
+ return (dsl_dataset_hold(dp, buf, tag, dsp));
+}
+
+/*
+ * When reading BOOKMARK_V1 bookmarks, the BOOKMARK_V2 fields are guaranteed
+ * to be zeroed.
+ *
+ * Returns ESRCH if bookmark is not found.
+ * Note, we need to use the ZAP rather than the AVL to look up bookmarks
+ * by name, because only the ZAP honors the case sensitivity setting.
+ */
+int
+dsl_bookmark_lookup_impl(dsl_dataset_t *ds, const char *shortname,
+ zfs_bookmark_phys_t *bmark_phys)
+{
+ objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
+ uint64_t bmark_zapobj = ds->ds_bookmarks_obj;
+ matchtype_t mt = 0;
+ int err;
+
+ if (bmark_zapobj == 0)
+ return (SET_ERROR(ESRCH));
+
+ if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET)
+ mt = MT_NORMALIZE;
+
+ /*
+ * Zero out the bookmark in case the one stored on disk
+ * is in an older, shorter format.
+ */
+ bzero(bmark_phys, sizeof (*bmark_phys));
+
+ err = zap_lookup_norm(mos, bmark_zapobj, shortname, sizeof (uint64_t),
+ sizeof (*bmark_phys) / sizeof (uint64_t), bmark_phys, mt, NULL, 0,
+ NULL);
+
+ return (err == ENOENT ? SET_ERROR(ESRCH) : err);
+}
+
+/*
+ * If later_ds is non-NULL, this will return EXDEV if the specified bookmark
+ * does not represent an earlier point in later_ds's timeline. However,
+ * bmp will still be filled in if we return EXDEV.
+ *
+ * Returns ENOENT if the dataset containing the bookmark does not exist.
+ * Returns ESRCH if the dataset exists but the bookmark was not found in it.
+ */
+int
+dsl_bookmark_lookup(dsl_pool_t *dp, const char *fullname,
+ dsl_dataset_t *later_ds, zfs_bookmark_phys_t *bmp)
+{
+ char *shortname;
+ dsl_dataset_t *ds;
+ int error;
+
+ error = dsl_bookmark_hold_ds(dp, fullname, &ds, FTAG, &shortname);
+ if (error != 0)
+ return (error);
+
+ error = dsl_bookmark_lookup_impl(ds, shortname, bmp);
+ if (error == 0 && later_ds != NULL) {
+ if (!dsl_dataset_is_before(later_ds, ds, bmp->zbm_creation_txg))
+ error = SET_ERROR(EXDEV);
+ }
+ dsl_dataset_rele(ds, FTAG);
+ return (error);
+}
+
+/*
+ * Validates that
+ * - bmark is a full dataset path of a bookmark (bookmark_namecheck)
+ * - source is a full path of a snapshot or bookmark
+ * ({bookmark,snapshot}_namecheck)
+ *
+ * Returns 0 if valid, -1 otherwise.
+ */
+static int
+dsl_bookmark_create_nvl_validate_pair(const char *bmark, const char *source)
+{
+ if (bookmark_namecheck(bmark, NULL, NULL) != 0)
+ return (-1);
+
+ int is_bmark, is_snap;
+ is_bmark = bookmark_namecheck(source, NULL, NULL) == 0;
+ is_snap = snapshot_namecheck(source, NULL, NULL) == 0;
+ if (!is_bmark && !is_snap)
+ return (-1);
+
+ return (0);
+}
+
+/*
+ * Check that the given nvlist corresponds to the following schema:
+ * { newbookmark -> source, ... }
+ * where
+ * - each pair passes dsl_bookmark_create_nvl_validate_pair
+ * - all newbookmarks are in the same pool
+ * - all newbookmarks have unique names
+ *
+ * Note that this function only validates the above schema. Callers must ensure
+ * that the bookmarks can be created, e.g. that sources exist.
+ *
+ * Returns 0 if the nvlist adheres to above schema.
+ * Returns -1 if it doesn't.
+ */
+int
+dsl_bookmark_create_nvl_validate(nvlist_t *bmarks)
+{
+ char *first;
+ size_t first_len;
+
+ first = NULL;
+ for (nvpair_t *pair = nvlist_next_nvpair(bmarks, NULL);
+ pair != NULL; pair = nvlist_next_nvpair(bmarks, pair)) {
+
+ char *bmark = nvpair_name(pair);
+ char *source;
+
+ /* list structure: values must be snapshots XOR bookmarks */
+ if (nvpair_value_string(pair, &source) != 0)
+ return (-1);
+ if (dsl_bookmark_create_nvl_validate_pair(bmark, source) != 0)
+ return (-1);
+
+ /* same pool check */
+ if (first == NULL) {
+ char *cp = strpbrk(bmark, "/#");
+ if (cp == NULL)
+ return (-1);
+ first = bmark;
+ first_len = cp - bmark;
+ }
+ if (strncmp(first, bmark, first_len) != 0)
+ return (-1);
+ switch (*(bmark + first_len)) {
+ case '/': /* fallthrough */
+ case '#':
+ break;
+ default:
+ return (-1);
+ }
+
+ /* unique newbookmark names; todo: O(n^2) */
+ for (nvpair_t *pair2 = nvlist_next_nvpair(bmarks, pair);
+ pair2 != NULL; pair2 = nvlist_next_nvpair(bmarks, pair2)) {
+ if (strcmp(nvpair_name(pair), nvpair_name(pair2)) == 0)
+ return (-1);
+ }
+
+ }
+ return (0);
+}
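+
+/*
+ * Illustrative sketch, not part of the upstream change: the nvlist shape
+ * accepted by dsl_bookmark_create_nvl_validate() above -- new bookmark
+ * names mapped to snapshot or bookmark sources, all in one pool.  The
+ * function name and the "pool/fs" names are hypothetical.
+ */
+static inline int
+example_validate_bookmark_nvl(void)
+{
+ nvlist_t *bmarks = fnvlist_alloc();
+ int err;
+
+ fnvlist_add_string(bmarks, "pool/fs#newbm", "pool/fs@snap");
+ fnvlist_add_string(bmarks, "pool/fs#copy", "pool/fs#oldbm");
+ err = dsl_bookmark_create_nvl_validate(bmarks);
+ fnvlist_free(bmarks);
+ return (err);
+}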
+
+/*
+ * expects that newbm and source have been validated using
+ * dsl_bookmark_create_nvl_validate_pair
+ */
+static int
+dsl_bookmark_create_check_impl(dsl_pool_t *dp,
+ const char *newbm, const char *source)
+{
+ ASSERT0(dsl_bookmark_create_nvl_validate_pair(newbm, source));
+ /* defer source namecheck until we know it's a snapshot or bookmark */
+
+ int error;
+ dsl_dataset_t *newbm_ds;
+ char *newbm_short;
+ zfs_bookmark_phys_t bmark_phys;
+
+ error = dsl_bookmark_hold_ds(dp, newbm, &newbm_ds, FTAG, &newbm_short);
+ if (error != 0)
+ return (error);
+
+ /* Verify that the new bookmark does not already exist */
+ error = dsl_bookmark_lookup_impl(newbm_ds, newbm_short, &bmark_phys);
+ switch (error) {
+ case ESRCH:
+ /* happy path: new bmark doesn't exist, proceed after switch */
+ error = 0;
+ break;
+ case 0:
+ error = SET_ERROR(EEXIST);
+ goto eholdnewbmds;
+ default:
+ /* dsl_bookmark_lookup_impl already did SET_ERROR */
+ goto eholdnewbmds;
+ }
+
+ /* error is retval of the following if-cascade */
+ if (strchr(source, '@') != NULL) {
+ dsl_dataset_t *source_snap_ds;
+ ASSERT3S(snapshot_namecheck(source, NULL, NULL), ==, 0);
+ error = dsl_dataset_hold(dp, source, FTAG, &source_snap_ds);
+ if (error == 0) {
+ VERIFY(source_snap_ds->ds_is_snapshot);
+ /*
+ * Verify that source snapshot is an earlier point in
+ * newbm_ds's timeline (source may be newbm_ds's origin)
+ */
+ if (!dsl_dataset_is_before(newbm_ds, source_snap_ds, 0))
+ error = SET_ERROR(
+ ZFS_ERR_BOOKMARK_SOURCE_NOT_ANCESTOR);
+ dsl_dataset_rele(source_snap_ds, FTAG);
+ }
+ } else if (strchr(source, '#') != NULL) {
+ zfs_bookmark_phys_t source_phys;
+ ASSERT3S(bookmark_namecheck(source, NULL, NULL), ==, 0);
+ /*
+ * Source must exist and be an earlier point in newbm_ds's
+ * timeline (newbm_ds's origin may be a snap of source's ds)
+ */
+ error = dsl_bookmark_lookup(dp, source, newbm_ds, &source_phys);
+ switch (error) {
+ case 0:
+ break; /* happy path */
+ case EXDEV:
+ error = SET_ERROR(ZFS_ERR_BOOKMARK_SOURCE_NOT_ANCESTOR);
+ break;
+ default:
+ /* dsl_bookmark_lookup already did SET_ERROR */
+ break;
+ }
+ } else {
+ /*
+ * dsl_bookmark_create_nvl_validate validates that source is
+ * either snapshot or bookmark
+ */
+ panic("unreachable code: %s", source);
+ }
+
+eholdnewbmds:
+ dsl_dataset_rele(newbm_ds, FTAG);
+ return (error);
+}
+
+int
+dsl_bookmark_create_check(void *arg, dmu_tx_t *tx)
+{
+ dsl_bookmark_create_arg_t *dbca = arg;
+ int rv = 0;
+ int schema_err = 0;
+ ASSERT3P(dbca, !=, NULL);
+ ASSERT3P(dbca->dbca_bmarks, !=, NULL);
+ /* dbca->dbca_errors is allowed to be NULL */
+
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+
+ if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_BOOKMARKS))
+ return (SET_ERROR(ENOTSUP));
+
+ if (dsl_bookmark_create_nvl_validate(dbca->dbca_bmarks) != 0)
+ rv = schema_err = SET_ERROR(EINVAL);
+
+ for (nvpair_t *pair = nvlist_next_nvpair(dbca->dbca_bmarks, NULL);
+ pair != NULL; pair = nvlist_next_nvpair(dbca->dbca_bmarks, pair)) {
+ char *new = nvpair_name(pair);
+
+ int error = schema_err;
+ if (error == 0) {
+ char *source = fnvpair_value_string(pair);
+ error = dsl_bookmark_create_check_impl(dp, new, source);
+ if (error != 0)
+ error = SET_ERROR(error);
+ }
+
+ if (error != 0) {
+ rv = error;
+ if (dbca->dbca_errors != NULL)
+ fnvlist_add_int32(dbca->dbca_errors,
+ new, error);
+ }
+ }
+
+ return (rv);
+}
+
+static dsl_bookmark_node_t *
+dsl_bookmark_node_alloc(char *shortname)
+{
+ dsl_bookmark_node_t *dbn = kmem_alloc(sizeof (*dbn), KM_SLEEP);
+ dbn->dbn_name = spa_strdup(shortname);
+ dbn->dbn_dirty = B_FALSE;
+ mutex_init(&dbn->dbn_lock, NULL, MUTEX_DEFAULT, NULL);
+ return (dbn);
+}
+
+/*
+ * Set the fields in the zfs_bookmark_phys_t based on the specified snapshot.
+ */
+static void
+dsl_bookmark_set_phys(zfs_bookmark_phys_t *zbm, dsl_dataset_t *snap)
+{
+ spa_t *spa = dsl_dataset_get_spa(snap);
+ objset_t *mos = spa_get_dsl(spa)->dp_meta_objset;
+ dsl_dataset_phys_t *dsp = dsl_dataset_phys(snap);
+ zbm->zbm_guid = dsp->ds_guid;
+ zbm->zbm_creation_txg = dsp->ds_creation_txg;
+ zbm->zbm_creation_time = dsp->ds_creation_time;
+ zbm->zbm_redaction_obj = 0;
+
+ /*
+ * If the dataset is encrypted create a larger bookmark to
+ * accommodate the IVset guid. The IVset guid was added
+ * after the encryption feature to prevent a problem with
+ * raw sends. If we encounter an encrypted dataset without
+ * an IVset guid we fall back to a normal bookmark.
+ */
+ if (snap->ds_dir->dd_crypto_obj != 0 &&
+ spa_feature_is_enabled(spa, SPA_FEATURE_BOOKMARK_V2)) {
+ (void) zap_lookup(mos, snap->ds_object,
+ DS_FIELD_IVSET_GUID, sizeof (uint64_t), 1,
+ &zbm->zbm_ivset_guid);
+ }
+
+ if (spa_feature_is_enabled(spa, SPA_FEATURE_BOOKMARK_WRITTEN)) {
+ zbm->zbm_flags = ZBM_FLAG_SNAPSHOT_EXISTS | ZBM_FLAG_HAS_FBN;
+ zbm->zbm_referenced_bytes_refd = dsp->ds_referenced_bytes;
+ zbm->zbm_compressed_bytes_refd = dsp->ds_compressed_bytes;
+ zbm->zbm_uncompressed_bytes_refd = dsp->ds_uncompressed_bytes;
+
+ dsl_dataset_t *nextds;
+ VERIFY0(dsl_dataset_hold_obj(snap->ds_dir->dd_pool,
+ dsp->ds_next_snap_obj, FTAG, &nextds));
+ dsl_deadlist_space(&nextds->ds_deadlist,
+ &zbm->zbm_referenced_freed_before_next_snap,
+ &zbm->zbm_compressed_freed_before_next_snap,
+ &zbm->zbm_uncompressed_freed_before_next_snap);
+ dsl_dataset_rele(nextds, FTAG);
+ } else {
+ bzero(&zbm->zbm_flags,
+ sizeof (zfs_bookmark_phys_t) -
+ offsetof(zfs_bookmark_phys_t, zbm_flags));
+ }
+}
+
+/*
+ * Add dsl_bookmark_node_t `dbn` to the given dataset and increment appropriate
+ * SPA feature counters.
+ */
+void
+dsl_bookmark_node_add(dsl_dataset_t *hds, dsl_bookmark_node_t *dbn,
+ dmu_tx_t *tx)
+{
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ objset_t *mos = dp->dp_meta_objset;
+
+ if (hds->ds_bookmarks_obj == 0) {
+ hds->ds_bookmarks_obj = zap_create_norm(mos,
+ U8_TEXTPREP_TOUPPER, DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0,
+ tx);
+ spa_feature_incr(dp->dp_spa, SPA_FEATURE_BOOKMARKS, tx);
+
+ dsl_dataset_zapify(hds, tx);
+ VERIFY0(zap_add(mos, hds->ds_object,
+ DS_FIELD_BOOKMARK_NAMES,
+ sizeof (hds->ds_bookmarks_obj), 1,
+ &hds->ds_bookmarks_obj, tx));
+ }
+
+ avl_add(&hds->ds_bookmarks, dbn);
+
+ /*
+ * To maintain backwards compatibility with software that doesn't
+ * understand SPA_FEATURE_BOOKMARK_V2, we need to use the smallest
+ * possible bookmark size.
+ */
+ uint64_t bookmark_phys_size = BOOKMARK_PHYS_SIZE_V1;
+ if (spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_BOOKMARK_V2) &&
+ (dbn->dbn_phys.zbm_ivset_guid != 0 || dbn->dbn_phys.zbm_flags &
+ ZBM_FLAG_HAS_FBN || dbn->dbn_phys.zbm_redaction_obj != 0)) {
+ bookmark_phys_size = BOOKMARK_PHYS_SIZE_V2;
+ spa_feature_incr(dp->dp_spa, SPA_FEATURE_BOOKMARK_V2, tx);
+ }
+
+ __attribute__((unused)) zfs_bookmark_phys_t zero_phys = { 0 };
+ ASSERT0(bcmp(((char *)&dbn->dbn_phys) + bookmark_phys_size,
+ &zero_phys, sizeof (zfs_bookmark_phys_t) - bookmark_phys_size));
+
+ VERIFY0(zap_add(mos, hds->ds_bookmarks_obj, dbn->dbn_name,
+ sizeof (uint64_t), bookmark_phys_size / sizeof (uint64_t),
+ &dbn->dbn_phys, tx));
+}
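+
+/*
+ * Illustrative sketch, not part of the upstream change: the on-disk size
+ * chosen by dsl_bookmark_node_add() above.  A V2-sized record is written
+ * only when SPA_FEATURE_BOOKMARK_V2 is enabled and at least one V2-only
+ * field is actually in use; otherwise the smaller V1 size keeps the ZAP
+ * entry readable by older software.  The function name is hypothetical.
+ */
+static inline uint64_t
+example_bookmark_phys_size(spa_t *spa, const zfs_bookmark_phys_t *zbm)
+{
+ if (spa_feature_is_enabled(spa, SPA_FEATURE_BOOKMARK_V2) &&
+ (zbm->zbm_ivset_guid != 0 ||
+ (zbm->zbm_flags & ZBM_FLAG_HAS_FBN) ||
+ zbm->zbm_redaction_obj != 0))
+ return (BOOKMARK_PHYS_SIZE_V2);
+ return (BOOKMARK_PHYS_SIZE_V1);
+}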
+
+/*
+ * If redaction_list is non-null, we create a redacted bookmark and redaction
+ * list, and store the object number of the redaction list in redact_obj.
+ */
+static void
+dsl_bookmark_create_sync_impl_snap(const char *bookmark, const char *snapshot,
+ dmu_tx_t *tx, uint64_t num_redact_snaps, uint64_t *redact_snaps, void *tag,
+ redaction_list_t **redaction_list)
+{
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ objset_t *mos = dp->dp_meta_objset;
+ dsl_dataset_t *snapds, *bmark_fs;
+ char *shortname;
+ boolean_t bookmark_redacted;
+ uint64_t *dsredactsnaps;
+ uint64_t dsnumsnaps;
+
+ VERIFY0(dsl_dataset_hold(dp, snapshot, FTAG, &snapds));
+ VERIFY0(dsl_bookmark_hold_ds(dp, bookmark, &bmark_fs, FTAG,
+ &shortname));
+
+ dsl_bookmark_node_t *dbn = dsl_bookmark_node_alloc(shortname);
+ dsl_bookmark_set_phys(&dbn->dbn_phys, snapds);
+
+ bookmark_redacted = dsl_dataset_get_uint64_array_feature(snapds,
+ SPA_FEATURE_REDACTED_DATASETS, &dsnumsnaps, &dsredactsnaps);
+ if (redaction_list != NULL || bookmark_redacted) {
+ redaction_list_t *local_rl;
+ if (bookmark_redacted) {
+ redact_snaps = dsredactsnaps;
+ num_redact_snaps = dsnumsnaps;
+ }
+ dbn->dbn_phys.zbm_redaction_obj = dmu_object_alloc(mos,
+ DMU_OTN_UINT64_METADATA, SPA_OLD_MAXBLOCKSIZE,
+ DMU_OTN_UINT64_METADATA, sizeof (redaction_list_phys_t) +
+ num_redact_snaps * sizeof (uint64_t), tx);
+ spa_feature_incr(dp->dp_spa,
+ SPA_FEATURE_REDACTION_BOOKMARKS, tx);
+
+ VERIFY0(dsl_redaction_list_hold_obj(dp,
+ dbn->dbn_phys.zbm_redaction_obj, tag, &local_rl));
+ dsl_redaction_list_long_hold(dp, local_rl, tag);
+
+ ASSERT3U((local_rl)->rl_dbuf->db_size, >=,
+ sizeof (redaction_list_phys_t) + num_redact_snaps *
+ sizeof (uint64_t));
+ dmu_buf_will_dirty(local_rl->rl_dbuf, tx);
+ bcopy(redact_snaps, local_rl->rl_phys->rlp_snaps,
+ sizeof (uint64_t) * num_redact_snaps);
+ local_rl->rl_phys->rlp_num_snaps = num_redact_snaps;
+ if (bookmark_redacted) {
+ ASSERT3P(redaction_list, ==, NULL);
+ local_rl->rl_phys->rlp_last_blkid = UINT64_MAX;
+ local_rl->rl_phys->rlp_last_object = UINT64_MAX;
+ dsl_redaction_list_long_rele(local_rl, tag);
+ dsl_redaction_list_rele(local_rl, tag);
+ } else {
+ *redaction_list = local_rl;
+ }
+ }
+
+ if (dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN) {
+ spa_feature_incr(dp->dp_spa,
+ SPA_FEATURE_BOOKMARK_WRITTEN, tx);
+ }
+
+ dsl_bookmark_node_add(bmark_fs, dbn, tx);
+
+ spa_history_log_internal_ds(bmark_fs, "bookmark", tx,
+ "name=%s creation_txg=%llu target_snap=%llu redact_obj=%llu",
+ shortname, (longlong_t)dbn->dbn_phys.zbm_creation_txg,
+ (longlong_t)snapds->ds_object,
+ (longlong_t)dbn->dbn_phys.zbm_redaction_obj);
+
+ dsl_dataset_rele(bmark_fs, FTAG);
+ dsl_dataset_rele(snapds, FTAG);
+}
+
+
+static void
+dsl_bookmark_create_sync_impl_book(
+ const char *new_name, const char *source_name, dmu_tx_t *tx)
+{
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ dsl_dataset_t *bmark_fs_source, *bmark_fs_new;
+ char *source_shortname, *new_shortname;
+ zfs_bookmark_phys_t source_phys;
+
+ VERIFY0(dsl_bookmark_hold_ds(dp, source_name, &bmark_fs_source, FTAG,
+ &source_shortname));
+ VERIFY0(dsl_bookmark_hold_ds(dp, new_name, &bmark_fs_new, FTAG,
+ &new_shortname));
+
+ /*
+ * create a copy of the source bookmark by copying most of its members
+ *
+ * Caveat: bookmarking a redaction bookmark yields a normal bookmark
+ * -----------------------------------------------------------------
+ * Reasoning:
+ * - The zbm_redaction_obj would be referred to by both source and new
+ * bookmark, but would be destroyed once either source or new is
+ * destroyed, resulting in use-after-free of the referred object.
+ * - User expectation when issuing the `zfs bookmark` command is that
+ * a normal bookmark of the source is created
+ *
+ * Design Alternatives For Full Redaction Bookmark Copying:
+ * - reference-count the redaction object => would require on-disk
+ * format change for existing redaction objects
+ * - Copy the redaction object => cannot be done in syncing context
+ * because the redaction object might be too large
+ */
+
+ VERIFY0(dsl_bookmark_lookup_impl(bmark_fs_source, source_shortname,
+ &source_phys));
+ dsl_bookmark_node_t *new_dbn = dsl_bookmark_node_alloc(new_shortname);
+
+ memcpy(&new_dbn->dbn_phys, &source_phys, sizeof (source_phys));
+ new_dbn->dbn_phys.zbm_redaction_obj = 0;
+
+ /* update feature counters */
+ if (new_dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN) {
+ spa_feature_incr(dp->dp_spa,
+ SPA_FEATURE_BOOKMARK_WRITTEN, tx);
+ }
+ /* no need for redaction bookmark counter; nulled zbm_redaction_obj */
+ /* dsl_bookmark_node_add bumps bookmarks and v2-bookmarks counter */
+
+ /*
+ * write new bookmark
+ *
+ * Note that dsl_bookmark_lookup_impl guarantees that, if source is a
+ * v1 bookmark, the v2-only fields are zeroed.
+ * And dsl_bookmark_node_add writes back a v1-sized bookmark if
+ * v2 bookmarks are disabled and/or v2-only fields are zeroed.
+ * => bookmark copying works on pre-bookmark-v2 pools
+ */
+ dsl_bookmark_node_add(bmark_fs_new, new_dbn, tx);
+
+ spa_history_log_internal_ds(bmark_fs_source, "bookmark", tx,
+ "name=%s creation_txg=%llu source_guid=%llu",
+ new_shortname, (longlong_t)new_dbn->dbn_phys.zbm_creation_txg,
+ (longlong_t)source_phys.zbm_guid);
+
+ dsl_dataset_rele(bmark_fs_source, FTAG);
+ dsl_dataset_rele(bmark_fs_new, FTAG);
+}
+
+void
+dsl_bookmark_create_sync(void *arg, dmu_tx_t *tx)
+{
+ dsl_bookmark_create_arg_t *dbca = arg;
+
+ ASSERT(spa_feature_is_enabled(dmu_tx_pool(tx)->dp_spa,
+ SPA_FEATURE_BOOKMARKS));
+
+ for (nvpair_t *pair = nvlist_next_nvpair(dbca->dbca_bmarks, NULL);
+ pair != NULL; pair = nvlist_next_nvpair(dbca->dbca_bmarks, pair)) {
+
+ char *new = nvpair_name(pair);
+ char *source = fnvpair_value_string(pair);
+
+ if (strchr(source, '@') != NULL) {
+ dsl_bookmark_create_sync_impl_snap(new, source, tx,
+ 0, NULL, NULL, NULL);
+ } else if (strchr(source, '#') != NULL) {
+ dsl_bookmark_create_sync_impl_book(new, source, tx);
+ } else {
+ panic("unreachable code");
+ }
+
+ }
+}
+
+/*
+ * The bookmarks must all be in the same pool.
+ */
+int
+dsl_bookmark_create(nvlist_t *bmarks, nvlist_t *errors)
+{
+ nvpair_t *pair;
+ dsl_bookmark_create_arg_t dbca;
+
+ pair = nvlist_next_nvpair(bmarks, NULL);
+ if (pair == NULL)
+ return (0);
+
+ dbca.dbca_bmarks = bmarks;
+ dbca.dbca_errors = errors;
+
+ return (dsl_sync_task(nvpair_name(pair), dsl_bookmark_create_check,
+ dsl_bookmark_create_sync, &dbca,
+ fnvlist_num_pairs(bmarks), ZFS_SPACE_CHECK_NORMAL));
+}
+
+static int
+dsl_bookmark_create_redacted_check(void *arg, dmu_tx_t *tx)
+{
+ dsl_bookmark_create_redacted_arg_t *dbcra = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ int rv = 0;
+
+ if (!spa_feature_is_enabled(dp->dp_spa,
+ SPA_FEATURE_REDACTION_BOOKMARKS))
+ return (SET_ERROR(ENOTSUP));
+ /*
+ * If the list of redact snaps will not fit in the bonus buffer with
+ * the furthest reached object and offset, fail.
+ */
+ if (dbcra->dbcra_numsnaps > (dmu_bonus_max() -
+ sizeof (redaction_list_phys_t)) / sizeof (uint64_t))
+ return (SET_ERROR(E2BIG));
+
+ if (dsl_bookmark_create_nvl_validate_pair(
+ dbcra->dbcra_bmark, dbcra->dbcra_snap) != 0)
+ return (SET_ERROR(EINVAL));
+
+ rv = dsl_bookmark_create_check_impl(dp,
+ dbcra->dbcra_bmark, dbcra->dbcra_snap);
+ return (rv);
+}
+
+static void
+dsl_bookmark_create_redacted_sync(void *arg, dmu_tx_t *tx)
+{
+ dsl_bookmark_create_redacted_arg_t *dbcra = arg;
+ dsl_bookmark_create_sync_impl_snap(dbcra->dbcra_bmark,
+ dbcra->dbcra_snap, tx, dbcra->dbcra_numsnaps, dbcra->dbcra_snaps,
+ dbcra->dbcra_tag, dbcra->dbcra_rl);
+}
+
+int
+dsl_bookmark_create_redacted(const char *bookmark, const char *snapshot,
+ uint64_t numsnaps, uint64_t *snapguids, void *tag, redaction_list_t **rl)
+{
+ dsl_bookmark_create_redacted_arg_t dbcra;
+
+ dbcra.dbcra_bmark = bookmark;
+ dbcra.dbcra_snap = snapshot;
+ dbcra.dbcra_rl = rl;
+ dbcra.dbcra_numsnaps = numsnaps;
+ dbcra.dbcra_snaps = snapguids;
+ dbcra.dbcra_tag = tag;
+
+ return (dsl_sync_task(bookmark, dsl_bookmark_create_redacted_check,
+ dsl_bookmark_create_redacted_sync, &dbcra, 5,
+ ZFS_SPACE_CHECK_NORMAL));
+}
+
+/*
+ * Retrieve the list of properties given in the 'props' nvlist for a bookmark.
+ * If 'props' is NULL, retrieves all properties.
+ */
+static void
+dsl_bookmark_fetch_props(dsl_pool_t *dp, zfs_bookmark_phys_t *bmark_phys,
+ nvlist_t *props, nvlist_t *out_props)
+{
+ ASSERT3P(dp, !=, NULL);
+ ASSERT3P(bmark_phys, !=, NULL);
+ ASSERT3P(out_props, !=, NULL);
+ ASSERT(RRW_LOCK_HELD(&dp->dp_config_rwlock));
+
+ if (props == NULL || nvlist_exists(props,
+ zfs_prop_to_name(ZFS_PROP_GUID))) {
+ dsl_prop_nvlist_add_uint64(out_props,
+ ZFS_PROP_GUID, bmark_phys->zbm_guid);
+ }
+ if (props == NULL || nvlist_exists(props,
+ zfs_prop_to_name(ZFS_PROP_CREATETXG))) {
+ dsl_prop_nvlist_add_uint64(out_props,
+ ZFS_PROP_CREATETXG, bmark_phys->zbm_creation_txg);
+ }
+ if (props == NULL || nvlist_exists(props,
+ zfs_prop_to_name(ZFS_PROP_CREATION))) {
+ dsl_prop_nvlist_add_uint64(out_props,
+ ZFS_PROP_CREATION, bmark_phys->zbm_creation_time);
+ }
+ if (props == NULL || nvlist_exists(props,
+ zfs_prop_to_name(ZFS_PROP_IVSET_GUID))) {
+ dsl_prop_nvlist_add_uint64(out_props,
+ ZFS_PROP_IVSET_GUID, bmark_phys->zbm_ivset_guid);
+ }
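+	/*
+	 * The referenced-space properties below are only recorded for
+	 * bookmarks that were created with ZBM_FLAG_HAS_FBN set.
+	 */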
+ if (bmark_phys->zbm_flags & ZBM_FLAG_HAS_FBN) {
+ if (props == NULL || nvlist_exists(props,
+ zfs_prop_to_name(ZFS_PROP_REFERENCED))) {
+ dsl_prop_nvlist_add_uint64(out_props,
+ ZFS_PROP_REFERENCED,
+ bmark_phys->zbm_referenced_bytes_refd);
+ }
+ if (props == NULL || nvlist_exists(props,
+ zfs_prop_to_name(ZFS_PROP_LOGICALREFERENCED))) {
+ dsl_prop_nvlist_add_uint64(out_props,
+ ZFS_PROP_LOGICALREFERENCED,
+ bmark_phys->zbm_uncompressed_bytes_refd);
+ }
+ if (props == NULL || nvlist_exists(props,
+ zfs_prop_to_name(ZFS_PROP_REFRATIO))) {
+ uint64_t ratio =
+ bmark_phys->zbm_compressed_bytes_refd == 0 ? 100 :
+ bmark_phys->zbm_uncompressed_bytes_refd * 100 /
+ bmark_phys->zbm_compressed_bytes_refd;
+ dsl_prop_nvlist_add_uint64(out_props,
+ ZFS_PROP_REFRATIO, ratio);
+ }
+ }
+
+ if ((props == NULL || nvlist_exists(props, "redact_snaps") ||
+ nvlist_exists(props, "redact_complete")) &&
+ bmark_phys->zbm_redaction_obj != 0) {
+ redaction_list_t *rl;
+ int err = dsl_redaction_list_hold_obj(dp,
+ bmark_phys->zbm_redaction_obj, FTAG, &rl);
+ if (err == 0) {
+ if (nvlist_exists(props, "redact_snaps")) {
+ nvlist_t *nvl;
+ nvl = fnvlist_alloc();
+ fnvlist_add_uint64_array(nvl, ZPROP_VALUE,
+ rl->rl_phys->rlp_snaps,
+ rl->rl_phys->rlp_num_snaps);
+ fnvlist_add_nvlist(out_props, "redact_snaps",
+ nvl);
+ nvlist_free(nvl);
+ }
+ if (nvlist_exists(props, "redact_complete")) {
+ nvlist_t *nvl;
+ nvl = fnvlist_alloc();
+ fnvlist_add_boolean_value(nvl, ZPROP_VALUE,
+ rl->rl_phys->rlp_last_blkid == UINT64_MAX &&
+ rl->rl_phys->rlp_last_object == UINT64_MAX);
+ fnvlist_add_nvlist(out_props, "redact_complete",
+ nvl);
+ nvlist_free(nvl);
+ }
+ dsl_redaction_list_rele(rl, FTAG);
+ }
+ }
+}
+
+int
+dsl_get_bookmarks_impl(dsl_dataset_t *ds, nvlist_t *props, nvlist_t *outnvl)
+{
+ dsl_pool_t *dp = ds->ds_dir->dd_pool;
+
+ ASSERT(dsl_pool_config_held(dp));
+
+ if (dsl_dataset_is_snapshot(ds))
+ return (SET_ERROR(EINVAL));
+
+ for (dsl_bookmark_node_t *dbn = avl_first(&ds->ds_bookmarks);
+ dbn != NULL; dbn = AVL_NEXT(&ds->ds_bookmarks, dbn)) {
+ nvlist_t *out_props = fnvlist_alloc();
+
+ dsl_bookmark_fetch_props(dp, &dbn->dbn_phys, props, out_props);
+
+ fnvlist_add_nvlist(outnvl, dbn->dbn_name, out_props);
+ fnvlist_free(out_props);
+ }
+ return (0);
+}
+
+/*
+ * Comparison func for the ds_bookmarks AVL tree. We sort the bookmarks by
+ * their TXG, then by their FBN-ness. The "FBN-ness" component ensures
+ * that all bookmarks at the same TXG that have HAS_FBN set are adjacent,
+ * which dsl_bookmark_destroy_sync_impl() depends on. Note that there may
+ * be multiple bookmarks at the same TXG (with the same FBN-ness); in that
+ * case we differentiate them by an arbitrary metric: their names.
+ */
+static int
+dsl_bookmark_compare(const void *l, const void *r)
+{
+ const dsl_bookmark_node_t *ldbn = l;
+ const dsl_bookmark_node_t *rdbn = r;
+
+ int64_t cmp = TREE_CMP(ldbn->dbn_phys.zbm_creation_txg,
+ rdbn->dbn_phys.zbm_creation_txg);
+ if (likely(cmp))
+ return (cmp);
+ cmp = TREE_CMP((ldbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN),
+ (rdbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN));
+ if (likely(cmp))
+ return (cmp);
+ cmp = strcmp(ldbn->dbn_name, rdbn->dbn_name);
+ return (TREE_ISIGN(cmp));
+}
+
+/*
+ * Cache this (head) dataset's bookmarks in the ds_bookmarks AVL tree.
+ */
+int
+dsl_bookmark_init_ds(dsl_dataset_t *ds)
+{
+ dsl_pool_t *dp = ds->ds_dir->dd_pool;
+ objset_t *mos = dp->dp_meta_objset;
+
+ ASSERT(!ds->ds_is_snapshot);
+
+ avl_create(&ds->ds_bookmarks, dsl_bookmark_compare,
+ sizeof (dsl_bookmark_node_t),
+ offsetof(dsl_bookmark_node_t, dbn_node));
+
+ if (!dsl_dataset_is_zapified(ds))
+ return (0);
+
+ int zaperr = zap_lookup(mos, ds->ds_object, DS_FIELD_BOOKMARK_NAMES,
+ sizeof (ds->ds_bookmarks_obj), 1, &ds->ds_bookmarks_obj);
+ if (zaperr == ENOENT)
+ return (0);
+ if (zaperr != 0)
+ return (zaperr);
+
+ if (ds->ds_bookmarks_obj == 0)
+ return (0);
+
+ int err = 0;
+ zap_cursor_t zc;
+ zap_attribute_t attr;
+
+ for (zap_cursor_init(&zc, mos, ds->ds_bookmarks_obj);
+ (err = zap_cursor_retrieve(&zc, &attr)) == 0;
+ zap_cursor_advance(&zc)) {
+ dsl_bookmark_node_t *dbn =
+ dsl_bookmark_node_alloc(attr.za_name);
+
+ err = dsl_bookmark_lookup_impl(ds,
+ dbn->dbn_name, &dbn->dbn_phys);
+ ASSERT3U(err, !=, ENOENT);
+ if (err != 0) {
+ kmem_free(dbn, sizeof (*dbn));
+ break;
+ }
+ avl_add(&ds->ds_bookmarks, dbn);
+ }
+ zap_cursor_fini(&zc);
+ if (err == ENOENT)
+ err = 0;
+ return (err);
+}
+
+void
+dsl_bookmark_fini_ds(dsl_dataset_t *ds)
+{
+ void *cookie = NULL;
+ dsl_bookmark_node_t *dbn;
+
+ if (ds->ds_is_snapshot)
+ return;
+
+ while ((dbn = avl_destroy_nodes(&ds->ds_bookmarks, &cookie)) != NULL) {
+ spa_strfree(dbn->dbn_name);
+ mutex_destroy(&dbn->dbn_lock);
+ kmem_free(dbn, sizeof (*dbn));
+ }
+ avl_destroy(&ds->ds_bookmarks);
+}
+
+/*
+ * Retrieve the bookmarks that exist in the specified dataset, and the
+ * requested properties of each bookmark.
+ *
+ * The "props" nvlist specifies which properties are requested.
+ * See lzc_get_bookmarks() for the list of valid properties.
+ */
+int
+dsl_get_bookmarks(const char *dsname, nvlist_t *props, nvlist_t *outnvl)
+{
+ dsl_pool_t *dp;
+ dsl_dataset_t *ds;
+ int err;
+
+ err = dsl_pool_hold(dsname, FTAG, &dp);
+ if (err != 0)
+ return (err);
+ err = dsl_dataset_hold(dp, dsname, FTAG, &ds);
+ if (err != 0) {
+ dsl_pool_rele(dp, FTAG);
+ return (err);
+ }
+
+ err = dsl_get_bookmarks_impl(ds, props, outnvl);
+
+ dsl_dataset_rele(ds, FTAG);
+ dsl_pool_rele(dp, FTAG);
+ return (err);
+}
+
+/*
+ * Retrieve all properties for a single bookmark in the given dataset.
+ */
+int
+dsl_get_bookmark_props(const char *dsname, const char *bmname, nvlist_t *props)
+{
+ dsl_pool_t *dp;
+ dsl_dataset_t *ds;
+ zfs_bookmark_phys_t bmark_phys = { 0 };
+ int err;
+
+ err = dsl_pool_hold(dsname, FTAG, &dp);
+ if (err != 0)
+ return (err);
+ err = dsl_dataset_hold(dp, dsname, FTAG, &ds);
+ if (err != 0) {
+ dsl_pool_rele(dp, FTAG);
+ return (err);
+ }
+
+ err = dsl_bookmark_lookup_impl(ds, bmname, &bmark_phys);
+ if (err != 0)
+ goto out;
+
+ dsl_bookmark_fetch_props(dp, &bmark_phys, NULL, props);
+out:
+ dsl_dataset_rele(ds, FTAG);
+ dsl_pool_rele(dp, FTAG);
+ return (err);
+}
+
+typedef struct dsl_bookmark_destroy_arg {
+ nvlist_t *dbda_bmarks;
+ nvlist_t *dbda_success;
+ nvlist_t *dbda_errors;
+} dsl_bookmark_destroy_arg_t;
+
+static void
+dsl_bookmark_destroy_sync_impl(dsl_dataset_t *ds, const char *name,
+ dmu_tx_t *tx)
+{
+ objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
+ uint64_t bmark_zapobj = ds->ds_bookmarks_obj;
+ matchtype_t mt = 0;
+ uint64_t int_size, num_ints;
+ /*
+	 * 'search' must be zeroed so that dbn_phys.zbm_flags (which is used
+	 * in dsl_bookmark_compare()) will be zeroed even if the on-disk
+	 * (in ZAP) bookmark is shorter than
+	 * offsetof(zfs_bookmark_phys_t, zbm_flags).
+ */
+ dsl_bookmark_node_t search = { 0 };
+ char realname[ZFS_MAX_DATASET_NAME_LEN];
+
+ /*
+ * Find the real name of this bookmark, which may be different
+ * from the given name if the dataset is case-insensitive. Then
+ * use the real name to find the node in the ds_bookmarks AVL tree.
+ */
+
+ if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET)
+ mt = MT_NORMALIZE;
+
+ VERIFY0(zap_length(mos, bmark_zapobj, name, &int_size, &num_ints));
+
+ ASSERT3U(int_size, ==, sizeof (uint64_t));
+
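+	/*
+	 * A bookmark stored with more than the v1 payload counted against
+	 * the BOOKMARK_V2 feature when it was created, so release that
+	 * feature reference here.
+	 */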
+ if (num_ints * int_size > BOOKMARK_PHYS_SIZE_V1) {
+ spa_feature_decr(dmu_objset_spa(mos),
+ SPA_FEATURE_BOOKMARK_V2, tx);
+ }
+ VERIFY0(zap_lookup_norm(mos, bmark_zapobj, name, sizeof (uint64_t),
+ num_ints, &search.dbn_phys, mt, realname, sizeof (realname), NULL));
+
+ search.dbn_name = realname;
+ dsl_bookmark_node_t *dbn = avl_find(&ds->ds_bookmarks, &search, NULL);
+ ASSERT(dbn != NULL);
+
+ if (dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN) {
+ /*
+ * If this bookmark HAS_FBN, and it is before the most
+ * recent snapshot, then its TXG is a key in the head's
+ * deadlist (and all clones' heads' deadlists). If this is
+ * the last thing keeping the key (i.e. there are no more
+ * bookmarks with HAS_FBN at this TXG, and there is no
+ * snapshot at this TXG), then remove the key.
+ *
+ * Note that this algorithm depends on ds_bookmarks being
+ * sorted such that all bookmarks at the same TXG with
+ * HAS_FBN are adjacent (with no non-HAS_FBN bookmarks
+ * at the same TXG in between them). If this were not
+ * the case, we would need to examine *all* bookmarks
+ * at this TXG, rather than just the adjacent ones.
+ */
+
+ dsl_bookmark_node_t *dbn_prev =
+ AVL_PREV(&ds->ds_bookmarks, dbn);
+ dsl_bookmark_node_t *dbn_next =
+ AVL_NEXT(&ds->ds_bookmarks, dbn);
+
+ boolean_t more_bookmarks_at_this_txg =
+ (dbn_prev != NULL && dbn_prev->dbn_phys.zbm_creation_txg ==
+ dbn->dbn_phys.zbm_creation_txg &&
+ (dbn_prev->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN)) ||
+ (dbn_next != NULL && dbn_next->dbn_phys.zbm_creation_txg ==
+ dbn->dbn_phys.zbm_creation_txg &&
+ (dbn_next->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN));
+
+ if (!(dbn->dbn_phys.zbm_flags & ZBM_FLAG_SNAPSHOT_EXISTS) &&
+ !more_bookmarks_at_this_txg &&
+ dbn->dbn_phys.zbm_creation_txg <
+ dsl_dataset_phys(ds)->ds_prev_snap_txg) {
+ dsl_dir_remove_clones_key(ds->ds_dir,
+ dbn->dbn_phys.zbm_creation_txg, tx);
+ dsl_deadlist_remove_key(&ds->ds_deadlist,
+ dbn->dbn_phys.zbm_creation_txg, tx);
+ }
+
+ spa_feature_decr(dmu_objset_spa(mos),
+ SPA_FEATURE_BOOKMARK_WRITTEN, tx);
+ }
+
+ if (dbn->dbn_phys.zbm_redaction_obj != 0) {
+ VERIFY0(dmu_object_free(mos,
+ dbn->dbn_phys.zbm_redaction_obj, tx));
+ spa_feature_decr(dmu_objset_spa(mos),
+ SPA_FEATURE_REDACTION_BOOKMARKS, tx);
+ }
+
+ avl_remove(&ds->ds_bookmarks, dbn);
+ spa_strfree(dbn->dbn_name);
+ mutex_destroy(&dbn->dbn_lock);
+ kmem_free(dbn, sizeof (*dbn));
+
+ VERIFY0(zap_remove_norm(mos, bmark_zapobj, name, mt, tx));
+}
+
+static int
+dsl_bookmark_destroy_check(void *arg, dmu_tx_t *tx)
+{
+ dsl_bookmark_destroy_arg_t *dbda = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ int rv = 0;
+
+ ASSERT(nvlist_empty(dbda->dbda_success));
+ ASSERT(nvlist_empty(dbda->dbda_errors));
+
+ if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_BOOKMARKS))
+ return (0);
+
+ for (nvpair_t *pair = nvlist_next_nvpair(dbda->dbda_bmarks, NULL);
+ pair != NULL; pair = nvlist_next_nvpair(dbda->dbda_bmarks, pair)) {
+ const char *fullname = nvpair_name(pair);
+ dsl_dataset_t *ds;
+ zfs_bookmark_phys_t bm;
+ int error;
+ char *shortname;
+
+ error = dsl_bookmark_hold_ds(dp, fullname, &ds,
+ FTAG, &shortname);
+ if (error == ENOENT) {
+ /* ignore it; the bookmark is "already destroyed" */
+ continue;
+ }
+ if (error == 0) {
+ error = dsl_bookmark_lookup_impl(ds, shortname, &bm);
+ dsl_dataset_rele(ds, FTAG);
+ if (error == ESRCH) {
+ /*
+ * ignore it; the bookmark is
+ * "already destroyed"
+ */
+ continue;
+ }
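+			/*
+			 * A redaction bookmark cannot be destroyed while its
+			 * redaction list has long holds (e.g. while it is
+			 * being used by an in-progress redacted send).
+			 */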
+ if (error == 0 && bm.zbm_redaction_obj != 0) {
+ redaction_list_t *rl = NULL;
+ error = dsl_redaction_list_hold_obj(tx->tx_pool,
+ bm.zbm_redaction_obj, FTAG, &rl);
+ if (error == ENOENT) {
+ error = 0;
+ } else if (error == 0 &&
+ dsl_redaction_list_long_held(rl)) {
+ error = SET_ERROR(EBUSY);
+ }
+ if (rl != NULL) {
+ dsl_redaction_list_rele(rl, FTAG);
+ }
+ }
+ }
+ if (error == 0) {
+ if (dmu_tx_is_syncing(tx)) {
+ fnvlist_add_boolean(dbda->dbda_success,
+ fullname);
+ }
+ } else {
+ fnvlist_add_int32(dbda->dbda_errors, fullname, error);
+ rv = error;
+ }
+ }
+ return (rv);
+}
+
+static void
+dsl_bookmark_destroy_sync(void *arg, dmu_tx_t *tx)
+{
+ dsl_bookmark_destroy_arg_t *dbda = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ objset_t *mos = dp->dp_meta_objset;
+
+ for (nvpair_t *pair = nvlist_next_nvpair(dbda->dbda_success, NULL);
+ pair != NULL; pair = nvlist_next_nvpair(dbda->dbda_success, pair)) {
+ dsl_dataset_t *ds;
+ char *shortname;
+ uint64_t zap_cnt;
+
+ VERIFY0(dsl_bookmark_hold_ds(dp, nvpair_name(pair),
+ &ds, FTAG, &shortname));
+ dsl_bookmark_destroy_sync_impl(ds, shortname, tx);
+
+ /*
+ * If all of this dataset's bookmarks have been destroyed,
+ * free the zap object and decrement the feature's use count.
+ */
+ VERIFY0(zap_count(mos, ds->ds_bookmarks_obj, &zap_cnt));
+ if (zap_cnt == 0) {
+ dmu_buf_will_dirty(ds->ds_dbuf, tx);
+ VERIFY0(zap_destroy(mos, ds->ds_bookmarks_obj, tx));
+ ds->ds_bookmarks_obj = 0;
+ spa_feature_decr(dp->dp_spa, SPA_FEATURE_BOOKMARKS, tx);
+ VERIFY0(zap_remove(mos, ds->ds_object,
+ DS_FIELD_BOOKMARK_NAMES, tx));
+ }
+
+ spa_history_log_internal_ds(ds, "remove bookmark", tx,
+ "name=%s", shortname);
+
+ dsl_dataset_rele(ds, FTAG);
+ }
+}
+
+/*
+ * The bookmarks must all be in the same pool.
+ */
+int
+dsl_bookmark_destroy(nvlist_t *bmarks, nvlist_t *errors)
+{
+ int rv;
+ dsl_bookmark_destroy_arg_t dbda;
+ nvpair_t *pair = nvlist_next_nvpair(bmarks, NULL);
+ if (pair == NULL)
+ return (0);
+
+ dbda.dbda_bmarks = bmarks;
+ dbda.dbda_errors = errors;
+ dbda.dbda_success = fnvlist_alloc();
+
+ rv = dsl_sync_task(nvpair_name(pair), dsl_bookmark_destroy_check,
+ dsl_bookmark_destroy_sync, &dbda, fnvlist_num_pairs(bmarks),
+ ZFS_SPACE_CHECK_RESERVED);
+ fnvlist_free(dbda.dbda_success);
+ return (rv);
+}
+
+/* Return B_TRUE if there are any long holds on this redaction list. */
+boolean_t
+dsl_redaction_list_long_held(redaction_list_t *rl)
+{
+ return (!zfs_refcount_is_zero(&rl->rl_longholds));
+}
+
+void
+dsl_redaction_list_long_hold(dsl_pool_t *dp, redaction_list_t *rl, void *tag)
+{
+ ASSERT(dsl_pool_config_held(dp));
+ (void) zfs_refcount_add(&rl->rl_longholds, tag);
+}
+
+void
+dsl_redaction_list_long_rele(redaction_list_t *rl, void *tag)
+{
+ (void) zfs_refcount_remove(&rl->rl_longholds, tag);
+}
+
+/* ARGSUSED */
+static void
+redaction_list_evict_sync(void *rlu)
+{
+ redaction_list_t *rl = rlu;
+ zfs_refcount_destroy(&rl->rl_longholds);
+
+ kmem_free(rl, sizeof (redaction_list_t));
+}
+
+void
+dsl_redaction_list_rele(redaction_list_t *rl, void *tag)
+{
+ dmu_buf_rele(rl->rl_dbuf, tag);
+}
+
+int
+dsl_redaction_list_hold_obj(dsl_pool_t *dp, uint64_t rlobj, void *tag,
+ redaction_list_t **rlp)
+{
+ objset_t *mos = dp->dp_meta_objset;
+ dmu_buf_t *dbuf;
+ redaction_list_t *rl;
+ int err;
+
+ ASSERT(dsl_pool_config_held(dp));
+
+ err = dmu_bonus_hold(mos, rlobj, tag, &dbuf);
+ if (err != 0)
+ return (err);
+
+ rl = dmu_buf_get_user(dbuf);
+ if (rl == NULL) {
+ redaction_list_t *winner = NULL;
+
+ rl = kmem_zalloc(sizeof (redaction_list_t), KM_SLEEP);
+ rl->rl_dbuf = dbuf;
+ rl->rl_object = rlobj;
+ rl->rl_phys = dbuf->db_data;
+ rl->rl_mos = dp->dp_meta_objset;
+ zfs_refcount_create(&rl->rl_longholds);
+ dmu_buf_init_user(&rl->rl_dbu, redaction_list_evict_sync, NULL,
+ &rl->rl_dbuf);
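+		/*
+		 * Attach our in-core state to the bonus buffer. If another
+		 * thread raced us and attached one first, free ours and use
+		 * the winner's.
+		 */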
+ if ((winner = dmu_buf_set_user_ie(dbuf, &rl->rl_dbu)) != NULL) {
+ kmem_free(rl, sizeof (*rl));
+ rl = winner;
+ }
+ }
+ *rlp = rl;
+ return (0);
+}
+
+/*
+ * Snapshot ds is being destroyed.
+ *
+ * Adjust the "freed_before_next" of any bookmarks between this snap
+ * and the previous snapshot, because their "next snapshot" is changing.
+ *
+ * If there are any bookmarks with HAS_FBN at this snapshot, remove
+ * their HAS_SNAP flag (note: there can be at most one snapshot of
+ * each filesystem at a given txg), and return B_TRUE. In this case
+ * the caller can not remove the key in the deadlist at this TXG, because
+ * the HAS_FBN bookmarks require the key be there.
+ *
+ * Returns B_FALSE if there are no bookmarks with HAS_FBN at this
+ * snapshot's TXG. In this case the caller can remove the key in the
+ * deadlist at this TXG.
+ */
+boolean_t
+dsl_bookmark_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx)
+{
+ dsl_pool_t *dp = ds->ds_dir->dd_pool;
+
+ dsl_dataset_t *head, *next;
+ VERIFY0(dsl_dataset_hold_obj(dp,
+ dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj, FTAG, &head));
+ VERIFY0(dsl_dataset_hold_obj(dp,
+ dsl_dataset_phys(ds)->ds_next_snap_obj, FTAG, &next));
+
+ /*
+ * Find the first bookmark that HAS_FBN at or after the
+ * previous snapshot.
+ */
+ dsl_bookmark_node_t search = { 0 };
+ avl_index_t idx;
+ search.dbn_phys.zbm_creation_txg =
+ dsl_dataset_phys(ds)->ds_prev_snap_txg;
+ search.dbn_phys.zbm_flags = ZBM_FLAG_HAS_FBN;
+ /*
+ * The empty-string name can't be in the AVL, and it compares
+ * before any entries with this TXG.
+ */
+ search.dbn_name = "";
+ VERIFY3P(avl_find(&head->ds_bookmarks, &search, &idx), ==, NULL);
+ dsl_bookmark_node_t *dbn =
+ avl_nearest(&head->ds_bookmarks, idx, AVL_AFTER);
+
+ /*
+ * Iterate over all bookmarks that are at or after the previous
+ * snapshot, and before this (being deleted) snapshot. Adjust
+ * their FBN based on their new next snapshot.
+ */
+ for (; dbn != NULL && dbn->dbn_phys.zbm_creation_txg <
+ dsl_dataset_phys(ds)->ds_creation_txg;
+ dbn = AVL_NEXT(&head->ds_bookmarks, dbn)) {
+ if (!(dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN))
+ continue;
+ /*
+ * Increase our FBN by the amount of space that was live
+ * (referenced) at the time of this bookmark (i.e.
+ * birth <= zbm_creation_txg), and killed between this
+ * (being deleted) snapshot and the next snapshot (i.e.
+		 * on the next snapshot's deadlist). (Space killed before
+		 * this (being deleted) snapshot is already on our FBN.)
+ */
+ uint64_t referenced, compressed, uncompressed;
+ dsl_deadlist_space_range(&next->ds_deadlist,
+ 0, dbn->dbn_phys.zbm_creation_txg,
+ &referenced, &compressed, &uncompressed);
+ dbn->dbn_phys.zbm_referenced_freed_before_next_snap +=
+ referenced;
+ dbn->dbn_phys.zbm_compressed_freed_before_next_snap +=
+ compressed;
+ dbn->dbn_phys.zbm_uncompressed_freed_before_next_snap +=
+ uncompressed;
+ VERIFY0(zap_update(dp->dp_meta_objset, head->ds_bookmarks_obj,
+ dbn->dbn_name, sizeof (uint64_t),
+ sizeof (zfs_bookmark_phys_t) / sizeof (uint64_t),
+ &dbn->dbn_phys, tx));
+ }
+ dsl_dataset_rele(next, FTAG);
+
+ /*
+ * There may be several bookmarks at this txg (the TXG of the
+ * snapshot being deleted). We need to clear the SNAPSHOT_EXISTS
+ * flag on all of them, and return TRUE if there is at least 1
+ * bookmark here with HAS_FBN (thus preventing the deadlist
+ * key from being removed).
+ */
+ boolean_t rv = B_FALSE;
+ for (; dbn != NULL && dbn->dbn_phys.zbm_creation_txg ==
+ dsl_dataset_phys(ds)->ds_creation_txg;
+ dbn = AVL_NEXT(&head->ds_bookmarks, dbn)) {
+ if (!(dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN)) {
+ ASSERT(!(dbn->dbn_phys.zbm_flags &
+ ZBM_FLAG_SNAPSHOT_EXISTS));
+ continue;
+ }
+ ASSERT(dbn->dbn_phys.zbm_flags & ZBM_FLAG_SNAPSHOT_EXISTS);
+ dbn->dbn_phys.zbm_flags &= ~ZBM_FLAG_SNAPSHOT_EXISTS;
+ VERIFY0(zap_update(dp->dp_meta_objset, head->ds_bookmarks_obj,
+ dbn->dbn_name, sizeof (uint64_t),
+ sizeof (zfs_bookmark_phys_t) / sizeof (uint64_t),
+ &dbn->dbn_phys, tx));
+ rv = B_TRUE;
+ }
+ dsl_dataset_rele(head, FTAG);
+ return (rv);
+}
+
+/*
+ * A snapshot is being created of this (head) dataset.
+ *
+ * We don't keep keys in the deadlist for the most recent snapshot, or any
+ * bookmarks at or after it, because there can't be any blocks on the
+ * deadlist in this range. Now that the most recent snapshot is after
+ * all bookmarks, we need to add these keys. Note that the caller always
+ * adds a key at the previous snapshot, so we only add keys for bookmarks
+ * after that.
+ */
+void
+dsl_bookmark_snapshotted(dsl_dataset_t *ds, dmu_tx_t *tx)
+{
+ uint64_t last_key_added = UINT64_MAX;
+ for (dsl_bookmark_node_t *dbn = avl_last(&ds->ds_bookmarks);
+ dbn != NULL && dbn->dbn_phys.zbm_creation_txg >
+ dsl_dataset_phys(ds)->ds_prev_snap_txg;
+ dbn = AVL_PREV(&ds->ds_bookmarks, dbn)) {
+ uint64_t creation_txg = dbn->dbn_phys.zbm_creation_txg;
+ ASSERT3U(creation_txg, <=, last_key_added);
+ /*
+ * Note, there may be multiple bookmarks at this TXG,
+ * and we only want to add the key for this TXG once.
+ * The ds_bookmarks AVL is sorted by TXG, so we will visit
+ * these bookmarks in sequence.
+ */
+ if ((dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN) &&
+ creation_txg != last_key_added) {
+ dsl_deadlist_add_key(&ds->ds_deadlist,
+ creation_txg, tx);
+ last_key_added = creation_txg;
+ }
+ }
+}
+
+/*
+ * The next snapshot of the origin dataset has changed, due to
+ * promote or clone swap. If there are any bookmarks at this dataset,
+ * we need to update their zbm_*_freed_before_next_snap to reflect this.
+ * The head dataset has the relevant bookmarks in ds_bookmarks.
+ */
+void
+dsl_bookmark_next_changed(dsl_dataset_t *head, dsl_dataset_t *origin,
+ dmu_tx_t *tx)
+{
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+
+ /*
+ * Find the first bookmark that HAS_FBN at the origin snapshot.
+ */
+ dsl_bookmark_node_t search = { 0 };
+ avl_index_t idx;
+ search.dbn_phys.zbm_creation_txg =
+ dsl_dataset_phys(origin)->ds_creation_txg;
+ search.dbn_phys.zbm_flags = ZBM_FLAG_HAS_FBN;
+ /*
+ * The empty-string name can't be in the AVL, and it compares
+ * before any entries with this TXG.
+ */
+ search.dbn_name = "";
+ VERIFY3P(avl_find(&head->ds_bookmarks, &search, &idx), ==, NULL);
+ dsl_bookmark_node_t *dbn =
+ avl_nearest(&head->ds_bookmarks, idx, AVL_AFTER);
+
+ /*
+ * Iterate over all bookmarks that are at the origin txg.
+ * Adjust their FBN based on their new next snapshot.
+ */
+ for (; dbn != NULL && dbn->dbn_phys.zbm_creation_txg ==
+ dsl_dataset_phys(origin)->ds_creation_txg &&
+ (dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN);
+ dbn = AVL_NEXT(&head->ds_bookmarks, dbn)) {
+
+ /*
+ * Bookmark is at the origin, therefore its
+ * "next dataset" is changing, so we need
+ * to reset its FBN by recomputing it in
+ * dsl_bookmark_set_phys().
+ */
+ ASSERT3U(dbn->dbn_phys.zbm_guid, ==,
+ dsl_dataset_phys(origin)->ds_guid);
+ ASSERT3U(dbn->dbn_phys.zbm_referenced_bytes_refd, ==,
+ dsl_dataset_phys(origin)->ds_referenced_bytes);
+ ASSERT(dbn->dbn_phys.zbm_flags &
+ ZBM_FLAG_SNAPSHOT_EXISTS);
+ /*
+ * Save and restore the zbm_redaction_obj, which
+ * is zeroed by dsl_bookmark_set_phys().
+ */
+ uint64_t redaction_obj =
+ dbn->dbn_phys.zbm_redaction_obj;
+ dsl_bookmark_set_phys(&dbn->dbn_phys, origin);
+ dbn->dbn_phys.zbm_redaction_obj = redaction_obj;
+
+ VERIFY0(zap_update(dp->dp_meta_objset, head->ds_bookmarks_obj,
+ dbn->dbn_name, sizeof (uint64_t),
+ sizeof (zfs_bookmark_phys_t) / sizeof (uint64_t),
+ &dbn->dbn_phys, tx));
+ }
+}
+
+/*
+ * This block is no longer referenced by this (head) dataset.
+ *
+ * Adjust the FBN of any bookmarks that reference this block, whose "next"
+ * is the head dataset.
+ */
+/* ARGSUSED */
+void
+dsl_bookmark_block_killed(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx)
+{
+ /*
+ * Iterate over bookmarks whose "next" is the head dataset.
+ */
+ for (dsl_bookmark_node_t *dbn = avl_last(&ds->ds_bookmarks);
+ dbn != NULL && dbn->dbn_phys.zbm_creation_txg >=
+ dsl_dataset_phys(ds)->ds_prev_snap_txg;
+ dbn = AVL_PREV(&ds->ds_bookmarks, dbn)) {
+ /*
+ * If the block was live (referenced) at the time of this
+ * bookmark, add its space to the bookmark's FBN.
+ */
+ if (bp->blk_birth <= dbn->dbn_phys.zbm_creation_txg &&
+ (dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN)) {
+ mutex_enter(&dbn->dbn_lock);
+ dbn->dbn_phys.zbm_referenced_freed_before_next_snap +=
+ bp_get_dsize_sync(dsl_dataset_get_spa(ds), bp);
+ dbn->dbn_phys.zbm_compressed_freed_before_next_snap +=
+ BP_GET_PSIZE(bp);
+ dbn->dbn_phys.zbm_uncompressed_freed_before_next_snap +=
+ BP_GET_UCSIZE(bp);
+ /*
+ * Changing the ZAP object here would be too
+ * expensive. Also, we may be called from the zio
+ * interrupt thread, which can't block on i/o.
+ * Therefore, we mark this bookmark as dirty and
+ * modify the ZAP once per txg, in
+ * dsl_bookmark_sync_done().
+ */
+ dbn->dbn_dirty = B_TRUE;
+ mutex_exit(&dbn->dbn_lock);
+ }
+ }
+}
+
+void
+dsl_bookmark_sync_done(dsl_dataset_t *ds, dmu_tx_t *tx)
+{
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+
+ if (dsl_dataset_is_snapshot(ds))
+ return;
+
+ /*
+ * We only dirty bookmarks that are at or after the most recent
+ * snapshot. We can't create snapshots between
+ * dsl_bookmark_block_killed() and dsl_bookmark_sync_done(), so we
+ * don't need to look at any bookmarks before ds_prev_snap_txg.
+ */
+ for (dsl_bookmark_node_t *dbn = avl_last(&ds->ds_bookmarks);
+ dbn != NULL && dbn->dbn_phys.zbm_creation_txg >=
+ dsl_dataset_phys(ds)->ds_prev_snap_txg;
+ dbn = AVL_PREV(&ds->ds_bookmarks, dbn)) {
+ if (dbn->dbn_dirty) {
+ /*
+ * We only dirty nodes with HAS_FBN, therefore
+ * we can always use the current bookmark struct size.
+ */
+ ASSERT(dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN);
+ VERIFY0(zap_update(dp->dp_meta_objset,
+ ds->ds_bookmarks_obj,
+ dbn->dbn_name, sizeof (uint64_t),
+ sizeof (zfs_bookmark_phys_t) / sizeof (uint64_t),
+ &dbn->dbn_phys, tx));
+ dbn->dbn_dirty = B_FALSE;
+ }
+ }
+#ifdef ZFS_DEBUG
+ for (dsl_bookmark_node_t *dbn = avl_first(&ds->ds_bookmarks);
+ dbn != NULL; dbn = AVL_NEXT(&ds->ds_bookmarks, dbn)) {
+ ASSERT(!dbn->dbn_dirty);
+ }
+#endif
+}
+
+/*
+ * Return the TXG of the most recent bookmark (or 0 if there are no bookmarks).
+ */
+uint64_t
+dsl_bookmark_latest_txg(dsl_dataset_t *ds)
+{
+ ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool));
+ dsl_bookmark_node_t *dbn = avl_last(&ds->ds_bookmarks);
+ if (dbn == NULL)
+ return (0);
+ return (dbn->dbn_phys.zbm_creation_txg);
+}
+
+/*
+ * Compare the redact_block_phys_t to the bookmark. If the last block in the
+ * redact_block_phys_t is before the bookmark, return -1. If the first block in
+ * the redact_block_phys_t is after the bookmark, return 1. Otherwise, the
+ * bookmark is inside the range of the redact_block_phys_t, and we return 0.
+ */
+static int
+redact_block_zb_compare(redact_block_phys_t *first,
+ zbookmark_phys_t *second)
+{
+ /*
+ * If the block_phys is for a previous object, or the last block in the
+ * block_phys is strictly before the block in the bookmark, the
+ * block_phys is earlier.
+ */
+ if (first->rbp_object < second->zb_object ||
+ (first->rbp_object == second->zb_object &&
+ first->rbp_blkid + (redact_block_get_count(first) - 1) <
+ second->zb_blkid)) {
+ return (-1);
+ }
+
+ /*
+ * If the bookmark is for a previous object, or the block in the
+ * bookmark is strictly before the first block in the block_phys, the
+ * bookmark is earlier.
+ */
+ if (first->rbp_object > second->zb_object ||
+ (first->rbp_object == second->zb_object &&
+ first->rbp_blkid > second->zb_blkid)) {
+ return (1);
+ }
+
+ return (0);
+}
+
+/*
+ * Traverse the redaction list in the provided object, and call the callback for
+ * each entry we find. Don't call the callback for any records before resume.
+ */
+int
+dsl_redaction_list_traverse(redaction_list_t *rl, zbookmark_phys_t *resume,
+ rl_traverse_callback_t cb, void *arg)
+{
+ objset_t *mos = rl->rl_mos;
+ int err = 0;
+
+ if (rl->rl_phys->rlp_last_object != UINT64_MAX ||
+ rl->rl_phys->rlp_last_blkid != UINT64_MAX) {
+ /*
+ * When we finish a send, we update the last object and offset
+ * to UINT64_MAX. If a send fails partway through, the last
+ * object and offset will have some other value, indicating how
+ * far the send got. The redaction list must be complete before
+ * it can be traversed, so return EINVAL if the last object and
+ * blkid are not set to UINT64_MAX.
+ */
+ return (SET_ERROR(EINVAL));
+ }
+
+ /*
+ * This allows us to skip the binary search and resume checking logic
+ * below, if we're not resuming a redacted send.
+ */
+ if (ZB_IS_ZERO(resume))
+ resume = NULL;
+
+ /*
+ * Binary search for the point to resume from.
+ */
+ uint64_t maxidx = rl->rl_phys->rlp_num_entries - 1;
+ uint64_t minidx = 0;
+ while (resume != NULL && maxidx > minidx) {
+ redact_block_phys_t rbp = { 0 };
+ ASSERT3U(maxidx, >, minidx);
+ uint64_t mididx = minidx + ((maxidx - minidx) / 2);
+ err = dmu_read(mos, rl->rl_object, mididx * sizeof (rbp),
+ sizeof (rbp), &rbp, DMU_READ_NO_PREFETCH);
+ if (err != 0)
+ break;
+
+ int cmp = redact_block_zb_compare(&rbp, resume);
+
+ if (cmp == 0) {
+ minidx = mididx;
+ break;
+ } else if (cmp > 0) {
+ maxidx =
+ (mididx == minidx ? minidx : mididx - 1);
+ } else {
+ minidx = mididx + 1;
+ }
+ }
+
+ unsigned int bufsize = SPA_OLD_MAXBLOCKSIZE;
+ redact_block_phys_t *buf = zio_data_buf_alloc(bufsize);
+
+ unsigned int entries_per_buf = bufsize / sizeof (redact_block_phys_t);
+ uint64_t start_block = minidx / entries_per_buf;
+ err = dmu_read(mos, rl->rl_object, start_block * bufsize, bufsize, buf,
+ DMU_READ_PREFETCH);
+
+ for (uint64_t curidx = minidx;
+ err == 0 && curidx < rl->rl_phys->rlp_num_entries;
+ curidx++) {
+ /*
+ * We read in the redaction list one block at a time. Once we
+ * finish with all the entries in a given block, we read in a
+ * new one. The predictive prefetcher will take care of any
+ * prefetching, and this code shouldn't be the bottleneck, so we
+ * don't need to do manual prefetching.
+ */
+ if (curidx % entries_per_buf == 0) {
+ err = dmu_read(mos, rl->rl_object, curidx *
+ sizeof (*buf), bufsize, buf,
+ DMU_READ_PREFETCH);
+ if (err != 0)
+ break;
+ }
+ redact_block_phys_t *rb = &buf[curidx % entries_per_buf];
+ /*
+ * If resume is non-null, we should either not send the data, or
+ * null out resume so we don't have to keep doing these
+ * comparisons.
+ */
+ if (resume != NULL) {
+ /*
+			 * It is possible that after the binary search we got
+			 * a record before the resume point. There are two
+			 * cases where this can occur. If the record is the
+			 * last redaction record, and the resume point is
+			 * after the end of the redacted data, curidx will be
+			 * the last redaction record and the loop will end
+			 * after this iteration. The second case is when the
+			 * resume point lies between two redaction records;
+			 * the binary search can return either the record
+			 * before or after the resume point, and the next
+			 * iteration will be past the resume point.
+ */
+ if (redact_block_zb_compare(rb, resume) < 0) {
+ ASSERT3U(curidx, ==, minidx);
+ continue;
+ } else {
+ /*
+ * If the place to resume is in the middle of
+ * the range described by this
+ * redact_block_phys, then modify the
+ * redact_block_phys in memory so we generate
+ * the right records.
+ */
+ if (resume->zb_object == rb->rbp_object &&
+ resume->zb_blkid > rb->rbp_blkid) {
+ uint64_t diff = resume->zb_blkid -
+ rb->rbp_blkid;
+ rb->rbp_blkid = resume->zb_blkid;
+ redact_block_set_count(rb,
+ redact_block_get_count(rb) - diff);
+ }
+ resume = NULL;
+ }
+ }
+
+ if (cb(rb, arg) != 0) {
+ err = EINTR;
+ break;
+ }
+ }
+
+ zio_data_buf_free(buf, bufsize);
+ return (err);
+}
diff --git a/sys/contrib/openzfs/module/zfs/dsl_crypt.c b/sys/contrib/openzfs/module/zfs/dsl_crypt.c
new file mode 100644
index 000000000000..e38ec0cae827
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/dsl_crypt.c
@@ -0,0 +1,2863 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2017, Datto, Inc. All rights reserved.
+ * Copyright (c) 2018 by Delphix. All rights reserved.
+ */
+
+#include <sys/dsl_crypt.h>
+#include <sys/dsl_pool.h>
+#include <sys/zap.h>
+#include <sys/zil.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_prop.h>
+#include <sys/spa_impl.h>
+#include <sys/dmu_objset.h>
+#include <sys/zvol.h>
+
+/*
+ * This file's primary purpose is for managing master encryption keys in
+ * memory and on disk. For more info on how these keys are used, see the
+ * block comment in zio_crypt.c.
+ *
+ * All master keys are stored encrypted on disk in the form of the DSL
+ * Crypto Key ZAP object. The binary key data in this object is always
+ * randomly generated and is encrypted with the user's wrapping key. This
+ * layer of indirection allows the user to change their key without
+ * needing to re-encrypt the entire dataset. The ZAP also holds on to the
+ * (non-encrypted) encryption algorithm identifier, IV, and MAC needed to
+ * safely decrypt the master key. For more info on the user's key see the
+ * block comment in libzfs_crypto.c
+ *
+ * In-memory encryption keys are managed through the spa_keystore. The
+ * keystore consists of 3 AVL trees, which are as follows:
+ *
+ * The Wrapping Key Tree:
+ * The wrapping key (wkey) tree stores the user's keys that are fed into the
+ * kernel through 'zfs load-key' and related commands. Datasets inherit their
+ * parent's wkey by default, so these structures are refcounted. The wrapping
+ * keys remain in memory until they are explicitly unloaded (with
+ * "zfs unload-key"). Unloading is only possible when no datasets are using
+ * them (refcount=0).
+ *
+ * The DSL Crypto Key Tree:
+ * The DSL Crypto Keys (DCK) are the in-memory representation of decrypted
+ * master keys. They are used by the functions in zio_crypt.c to perform
+ * encryption, decryption, and authentication. Snapshots and clones of a given
+ * dataset will share a DSL Crypto Key, so they are also refcounted. Once the
+ * refcount on a key hits zero, it is immediately zeroed out and freed.
+ *
+ * The Crypto Key Mapping Tree:
+ * The zio layer needs to look up master keys by their dataset object id. Since
+ * the DSL Crypto Keys can belong to multiple datasets, we maintain a tree of
+ * dsl_key_mapping_t's which essentially just map the dataset object id to its
+ * appropriate DSL Crypto Key. The management for creating and destroying these
+ * mappings hooks into the code for owning and disowning datasets. Usually,
+ * there will only be one active dataset owner, but there are times
+ * (particularly during dataset creation and destruction) when this may not be
+ * true or the dataset may not be initialized enough to own. As a result, this
+ * object is also refcounted.
+ */
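+
+/*
+ * A rough sketch of the relationships described above (field lists are
+ * abbreviated, not complete definitions):
+ *
+ *	spa_keystore
+ *	    sk_wkeys        AVL of dsl_wrapping_key_t, keyed by the
+ *	                    encryption root's ddobj (wk_ddobj)
+ *	    sk_dsl_keys     AVL of dsl_crypto_key_t, keyed by the DSL
+ *	                    Crypto Key ZAP object (dck_obj)
+ *	    sk_key_mappings AVL of dsl_key_mapping_t, keyed by dataset
+ *	                    object id (km_dsobj)
+ *
+ * Each dsl_crypto_key_t holds a reference on its dsl_wrapping_key_t, and
+ * each dsl_key_mapping_t holds a reference on its dsl_crypto_key_t.
+ */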
+
+/*
+ * This tunable allows datasets to be raw received even if the stream does
+ * not include IVset guids or if the guids don't match. This is used as part
+ * of the resolution for ZPOOL_ERRATA_ZOL_8308_ENCRYPTION.
+ */
+int zfs_disable_ivset_guid_check = 0;
+
+static void
+dsl_wrapping_key_hold(dsl_wrapping_key_t *wkey, void *tag)
+{
+ (void) zfs_refcount_add(&wkey->wk_refcnt, tag);
+}
+
+static void
+dsl_wrapping_key_rele(dsl_wrapping_key_t *wkey, void *tag)
+{
+ (void) zfs_refcount_remove(&wkey->wk_refcnt, tag);
+}
+
+static void
+dsl_wrapping_key_free(dsl_wrapping_key_t *wkey)
+{
+ ASSERT0(zfs_refcount_count(&wkey->wk_refcnt));
+
+ if (wkey->wk_key.ck_data) {
+ bzero(wkey->wk_key.ck_data,
+ CRYPTO_BITS2BYTES(wkey->wk_key.ck_length));
+ kmem_free(wkey->wk_key.ck_data,
+ CRYPTO_BITS2BYTES(wkey->wk_key.ck_length));
+ }
+
+ zfs_refcount_destroy(&wkey->wk_refcnt);
+ kmem_free(wkey, sizeof (dsl_wrapping_key_t));
+}
+
+static void
+dsl_wrapping_key_create(uint8_t *wkeydata, zfs_keyformat_t keyformat,
+ uint64_t salt, uint64_t iters, dsl_wrapping_key_t **wkey_out)
+{
+ dsl_wrapping_key_t *wkey;
+
+ /* allocate the wrapping key */
+ wkey = kmem_alloc(sizeof (dsl_wrapping_key_t), KM_SLEEP);
+
+ /* allocate and initialize the underlying crypto key */
+ wkey->wk_key.ck_data = kmem_alloc(WRAPPING_KEY_LEN, KM_SLEEP);
+
+ wkey->wk_key.ck_format = CRYPTO_KEY_RAW;
+ wkey->wk_key.ck_length = CRYPTO_BYTES2BITS(WRAPPING_KEY_LEN);
+ bcopy(wkeydata, wkey->wk_key.ck_data, WRAPPING_KEY_LEN);
+
+ /* initialize the rest of the struct */
+ zfs_refcount_create(&wkey->wk_refcnt);
+ wkey->wk_keyformat = keyformat;
+ wkey->wk_salt = salt;
+ wkey->wk_iters = iters;
+
+ *wkey_out = wkey;
+}
+
+int
+dsl_crypto_params_create_nvlist(dcp_cmd_t cmd, nvlist_t *props,
+ nvlist_t *crypto_args, dsl_crypto_params_t **dcp_out)
+{
+ int ret;
+ uint64_t crypt = ZIO_CRYPT_INHERIT;
+ uint64_t keyformat = ZFS_KEYFORMAT_NONE;
+ uint64_t salt = 0, iters = 0;
+ dsl_crypto_params_t *dcp = NULL;
+ dsl_wrapping_key_t *wkey = NULL;
+ uint8_t *wkeydata = NULL;
+ uint_t wkeydata_len = 0;
+ char *keylocation = NULL;
+
+ dcp = kmem_zalloc(sizeof (dsl_crypto_params_t), KM_SLEEP);
+ dcp->cp_cmd = cmd;
+
+ /* get relevant arguments from the nvlists */
+ if (props != NULL) {
+ (void) nvlist_lookup_uint64(props,
+ zfs_prop_to_name(ZFS_PROP_ENCRYPTION), &crypt);
+ (void) nvlist_lookup_uint64(props,
+ zfs_prop_to_name(ZFS_PROP_KEYFORMAT), &keyformat);
+ (void) nvlist_lookup_string(props,
+ zfs_prop_to_name(ZFS_PROP_KEYLOCATION), &keylocation);
+ (void) nvlist_lookup_uint64(props,
+ zfs_prop_to_name(ZFS_PROP_PBKDF2_SALT), &salt);
+ (void) nvlist_lookup_uint64(props,
+ zfs_prop_to_name(ZFS_PROP_PBKDF2_ITERS), &iters);
+
+ dcp->cp_crypt = crypt;
+ }
+
+ if (crypto_args != NULL) {
+ (void) nvlist_lookup_uint8_array(crypto_args, "wkeydata",
+ &wkeydata, &wkeydata_len);
+ }
+
+ /* check for valid command */
+ if (dcp->cp_cmd >= DCP_CMD_MAX) {
+ ret = SET_ERROR(EINVAL);
+ goto error;
+ } else {
+ dcp->cp_cmd = cmd;
+ }
+
+ /* check for valid crypt */
+ if (dcp->cp_crypt >= ZIO_CRYPT_FUNCTIONS) {
+ ret = SET_ERROR(EINVAL);
+ goto error;
+ } else {
+ dcp->cp_crypt = crypt;
+ }
+
+ /* check for valid keyformat */
+ if (keyformat >= ZFS_KEYFORMAT_FORMATS) {
+ ret = SET_ERROR(EINVAL);
+ goto error;
+ }
+
+ /* check for a valid keylocation (of any kind) and copy it in */
+ if (keylocation != NULL) {
+ if (!zfs_prop_valid_keylocation(keylocation, B_FALSE)) {
+ ret = SET_ERROR(EINVAL);
+ goto error;
+ }
+
+ dcp->cp_keylocation = spa_strdup(keylocation);
+ }
+
+ /* check wrapping key length, if given */
+ if (wkeydata != NULL && wkeydata_len != WRAPPING_KEY_LEN) {
+ ret = SET_ERROR(EINVAL);
+ goto error;
+ }
+
+ /* if the user asked for the default crypt, determine that now */
+ if (dcp->cp_crypt == ZIO_CRYPT_ON)
+ dcp->cp_crypt = ZIO_CRYPT_ON_VALUE;
+
+ /* create the wrapping key from the raw data */
+ if (wkeydata != NULL) {
+ /* create the wrapping key with the verified parameters */
+ dsl_wrapping_key_create(wkeydata, keyformat, salt,
+ iters, &wkey);
+ dcp->cp_wkey = wkey;
+ }
+
+ /*
+ * Remove the encryption properties from the nvlist since they are not
+ * maintained through the DSL.
+ */
+ (void) nvlist_remove_all(props, zfs_prop_to_name(ZFS_PROP_ENCRYPTION));
+ (void) nvlist_remove_all(props, zfs_prop_to_name(ZFS_PROP_KEYFORMAT));
+ (void) nvlist_remove_all(props, zfs_prop_to_name(ZFS_PROP_PBKDF2_SALT));
+ (void) nvlist_remove_all(props,
+ zfs_prop_to_name(ZFS_PROP_PBKDF2_ITERS));
+
+ *dcp_out = dcp;
+
+ return (0);
+
+error:
+ kmem_free(dcp, sizeof (dsl_crypto_params_t));
+ *dcp_out = NULL;
+ return (ret);
+}
+
+void
+dsl_crypto_params_free(dsl_crypto_params_t *dcp, boolean_t unload)
+{
+ if (dcp == NULL)
+ return;
+
+ if (dcp->cp_keylocation != NULL)
+ spa_strfree(dcp->cp_keylocation);
+ if (unload && dcp->cp_wkey != NULL)
+ dsl_wrapping_key_free(dcp->cp_wkey);
+
+ kmem_free(dcp, sizeof (dsl_crypto_params_t));
+}
+
+static int
+spa_crypto_key_compare(const void *a, const void *b)
+{
+ const dsl_crypto_key_t *dcka = a;
+ const dsl_crypto_key_t *dckb = b;
+
+ if (dcka->dck_obj < dckb->dck_obj)
+ return (-1);
+ if (dcka->dck_obj > dckb->dck_obj)
+ return (1);
+ return (0);
+}
+
+static int
+spa_key_mapping_compare(const void *a, const void *b)
+{
+ const dsl_key_mapping_t *kma = a;
+ const dsl_key_mapping_t *kmb = b;
+
+ if (kma->km_dsobj < kmb->km_dsobj)
+ return (-1);
+ if (kma->km_dsobj > kmb->km_dsobj)
+ return (1);
+ return (0);
+}
+
+static int
+spa_wkey_compare(const void *a, const void *b)
+{
+ const dsl_wrapping_key_t *wka = a;
+ const dsl_wrapping_key_t *wkb = b;
+
+ if (wka->wk_ddobj < wkb->wk_ddobj)
+ return (-1);
+ if (wka->wk_ddobj > wkb->wk_ddobj)
+ return (1);
+ return (0);
+}
+
+void
+spa_keystore_init(spa_keystore_t *sk)
+{
+ rw_init(&sk->sk_dk_lock, NULL, RW_DEFAULT, NULL);
+ rw_init(&sk->sk_km_lock, NULL, RW_DEFAULT, NULL);
+ rw_init(&sk->sk_wkeys_lock, NULL, RW_DEFAULT, NULL);
+ avl_create(&sk->sk_dsl_keys, spa_crypto_key_compare,
+ sizeof (dsl_crypto_key_t),
+ offsetof(dsl_crypto_key_t, dck_avl_link));
+ avl_create(&sk->sk_key_mappings, spa_key_mapping_compare,
+ sizeof (dsl_key_mapping_t),
+ offsetof(dsl_key_mapping_t, km_avl_link));
+ avl_create(&sk->sk_wkeys, spa_wkey_compare, sizeof (dsl_wrapping_key_t),
+ offsetof(dsl_wrapping_key_t, wk_avl_link));
+}
+
+void
+spa_keystore_fini(spa_keystore_t *sk)
+{
+ dsl_wrapping_key_t *wkey;
+ void *cookie = NULL;
+
+ ASSERT(avl_is_empty(&sk->sk_dsl_keys));
+ ASSERT(avl_is_empty(&sk->sk_key_mappings));
+
+ while ((wkey = avl_destroy_nodes(&sk->sk_wkeys, &cookie)) != NULL)
+ dsl_wrapping_key_free(wkey);
+
+ avl_destroy(&sk->sk_wkeys);
+ avl_destroy(&sk->sk_key_mappings);
+ avl_destroy(&sk->sk_dsl_keys);
+ rw_destroy(&sk->sk_wkeys_lock);
+ rw_destroy(&sk->sk_km_lock);
+ rw_destroy(&sk->sk_dk_lock);
+}
+
+static int
+dsl_dir_get_encryption_root_ddobj(dsl_dir_t *dd, uint64_t *rddobj)
+{
+ if (dd->dd_crypto_obj == 0)
+ return (SET_ERROR(ENOENT));
+
+ return (zap_lookup(dd->dd_pool->dp_meta_objset, dd->dd_crypto_obj,
+ DSL_CRYPTO_KEY_ROOT_DDOBJ, 8, 1, rddobj));
+}
+
+static int
+dsl_dir_get_encryption_version(dsl_dir_t *dd, uint64_t *version)
+{
+ *version = 0;
+
+ if (dd->dd_crypto_obj == 0)
+ return (SET_ERROR(ENOENT));
+
+ /* version 0 is implied by ENOENT */
+ (void) zap_lookup(dd->dd_pool->dp_meta_objset, dd->dd_crypto_obj,
+ DSL_CRYPTO_KEY_VERSION, 8, 1, version);
+
+ return (0);
+}
+
+boolean_t
+dsl_dir_incompatible_encryption_version(dsl_dir_t *dd)
+{
+ int ret;
+ uint64_t version = 0;
+
+ ret = dsl_dir_get_encryption_version(dd, &version);
+ if (ret != 0)
+ return (B_FALSE);
+
+ return (version != ZIO_CRYPT_KEY_CURRENT_VERSION);
+}
+
+static int
+spa_keystore_wkey_hold_ddobj_impl(spa_t *spa, uint64_t ddobj,
+ void *tag, dsl_wrapping_key_t **wkey_out)
+{
+ int ret;
+ dsl_wrapping_key_t search_wkey;
+ dsl_wrapping_key_t *found_wkey;
+
+ ASSERT(RW_LOCK_HELD(&spa->spa_keystore.sk_wkeys_lock));
+
+ /* init the search wrapping key */
+ search_wkey.wk_ddobj = ddobj;
+
+ /* lookup the wrapping key */
+ found_wkey = avl_find(&spa->spa_keystore.sk_wkeys, &search_wkey, NULL);
+ if (!found_wkey) {
+ ret = SET_ERROR(ENOENT);
+ goto error;
+ }
+
+ /* increment the refcount */
+ dsl_wrapping_key_hold(found_wkey, tag);
+
+ *wkey_out = found_wkey;
+ return (0);
+
+error:
+ *wkey_out = NULL;
+ return (ret);
+}
+
+static int
+spa_keystore_wkey_hold_dd(spa_t *spa, dsl_dir_t *dd, void *tag,
+ dsl_wrapping_key_t **wkey_out)
+{
+ int ret;
+ dsl_wrapping_key_t *wkey;
+ uint64_t rddobj;
+ boolean_t locked = B_FALSE;
+
+ if (!RW_WRITE_HELD(&spa->spa_keystore.sk_wkeys_lock)) {
+ rw_enter(&spa->spa_keystore.sk_wkeys_lock, RW_READER);
+ locked = B_TRUE;
+ }
+
+ /* get the ddobj that the keylocation property was inherited from */
+ ret = dsl_dir_get_encryption_root_ddobj(dd, &rddobj);
+ if (ret != 0)
+ goto error;
+
+ /* lookup the wkey in the avl tree */
+ ret = spa_keystore_wkey_hold_ddobj_impl(spa, rddobj, tag, &wkey);
+ if (ret != 0)
+ goto error;
+
+ /* unlock the wkey tree if we locked it */
+ if (locked)
+ rw_exit(&spa->spa_keystore.sk_wkeys_lock);
+
+ *wkey_out = wkey;
+ return (0);
+
+error:
+ if (locked)
+ rw_exit(&spa->spa_keystore.sk_wkeys_lock);
+
+ *wkey_out = NULL;
+ return (ret);
+}
+
+int
+dsl_crypto_can_set_keylocation(const char *dsname, const char *keylocation)
+{
+ int ret = 0;
+ dsl_dir_t *dd = NULL;
+ dsl_pool_t *dp = NULL;
+ uint64_t rddobj;
+
+ /* hold the dsl dir */
+ ret = dsl_pool_hold(dsname, FTAG, &dp);
+ if (ret != 0)
+ goto out;
+
+ ret = dsl_dir_hold(dp, dsname, FTAG, &dd, NULL);
+ if (ret != 0) {
+ dd = NULL;
+ goto out;
+ }
+
+ /* if dd is not encrypted, the value may only be "none" */
+ if (dd->dd_crypto_obj == 0) {
+ if (strcmp(keylocation, "none") != 0) {
+ ret = SET_ERROR(EACCES);
+ goto out;
+ }
+
+ ret = 0;
+ goto out;
+ }
+
+ /* check for a valid keylocation for encrypted datasets */
+ if (!zfs_prop_valid_keylocation(keylocation, B_TRUE)) {
+ ret = SET_ERROR(EINVAL);
+ goto out;
+ }
+
+ /* check that this is an encryption root */
+ ret = dsl_dir_get_encryption_root_ddobj(dd, &rddobj);
+ if (ret != 0)
+ goto out;
+
+ if (rddobj != dd->dd_object) {
+ ret = SET_ERROR(EACCES);
+ goto out;
+ }
+
+ dsl_dir_rele(dd, FTAG);
+ dsl_pool_rele(dp, FTAG);
+
+ return (0);
+
+out:
+ if (dd != NULL)
+ dsl_dir_rele(dd, FTAG);
+ if (dp != NULL)
+ dsl_pool_rele(dp, FTAG);
+
+ return (ret);
+}
+
+static void
+dsl_crypto_key_free(dsl_crypto_key_t *dck)
+{
+ ASSERT(zfs_refcount_count(&dck->dck_holds) == 0);
+
+ /* destroy the zio_crypt_key_t */
+ zio_crypt_key_destroy(&dck->dck_key);
+
+ /* free the refcount, wrapping key, and lock */
+ zfs_refcount_destroy(&dck->dck_holds);
+ if (dck->dck_wkey)
+ dsl_wrapping_key_rele(dck->dck_wkey, dck);
+
+ /* free the key */
+ kmem_free(dck, sizeof (dsl_crypto_key_t));
+}
+
+static void
+dsl_crypto_key_rele(dsl_crypto_key_t *dck, void *tag)
+{
+ if (zfs_refcount_remove(&dck->dck_holds, tag) == 0)
+ dsl_crypto_key_free(dck);
+}
+
+static int
+dsl_crypto_key_open(objset_t *mos, dsl_wrapping_key_t *wkey,
+ uint64_t dckobj, void *tag, dsl_crypto_key_t **dck_out)
+{
+ int ret;
+ uint64_t crypt = 0, guid = 0, version = 0;
+ uint8_t raw_keydata[MASTER_KEY_MAX_LEN];
+ uint8_t raw_hmac_keydata[SHA512_HMAC_KEYLEN];
+ uint8_t iv[WRAPPING_IV_LEN];
+ uint8_t mac[WRAPPING_MAC_LEN];
+ dsl_crypto_key_t *dck;
+
+ /* allocate and initialize the key */
+ dck = kmem_zalloc(sizeof (dsl_crypto_key_t), KM_SLEEP);
+
+ /* fetch all of the values we need from the ZAP */
+ ret = zap_lookup(mos, dckobj, DSL_CRYPTO_KEY_CRYPTO_SUITE, 8, 1,
+ &crypt);
+ if (ret != 0)
+ goto error;
+
+ ret = zap_lookup(mos, dckobj, DSL_CRYPTO_KEY_GUID, 8, 1, &guid);
+ if (ret != 0)
+ goto error;
+
+ ret = zap_lookup(mos, dckobj, DSL_CRYPTO_KEY_MASTER_KEY, 1,
+ MASTER_KEY_MAX_LEN, raw_keydata);
+ if (ret != 0)
+ goto error;
+
+ ret = zap_lookup(mos, dckobj, DSL_CRYPTO_KEY_HMAC_KEY, 1,
+ SHA512_HMAC_KEYLEN, raw_hmac_keydata);
+ if (ret != 0)
+ goto error;
+
+ ret = zap_lookup(mos, dckobj, DSL_CRYPTO_KEY_IV, 1, WRAPPING_IV_LEN,
+ iv);
+ if (ret != 0)
+ goto error;
+
+ ret = zap_lookup(mos, dckobj, DSL_CRYPTO_KEY_MAC, 1, WRAPPING_MAC_LEN,
+ mac);
+ if (ret != 0)
+ goto error;
+
+ /* the initial on-disk format for encryption did not have a version */
+ (void) zap_lookup(mos, dckobj, DSL_CRYPTO_KEY_VERSION, 8, 1, &version);
+
+ /*
+ * Unwrap the keys. If there is an error return EACCES to indicate
+ * an authentication failure.
+ */
+ ret = zio_crypt_key_unwrap(&wkey->wk_key, crypt, version, guid,
+ raw_keydata, raw_hmac_keydata, iv, mac, &dck->dck_key);
+ if (ret != 0) {
+ ret = SET_ERROR(EACCES);
+ goto error;
+ }
+
+ /* finish initializing the dsl_crypto_key_t */
+ zfs_refcount_create(&dck->dck_holds);
+ dsl_wrapping_key_hold(wkey, dck);
+ dck->dck_wkey = wkey;
+ dck->dck_obj = dckobj;
+ zfs_refcount_add(&dck->dck_holds, tag);
+
+ *dck_out = dck;
+ return (0);
+
+error:
+ if (dck != NULL) {
+ bzero(dck, sizeof (dsl_crypto_key_t));
+ kmem_free(dck, sizeof (dsl_crypto_key_t));
+ }
+
+ *dck_out = NULL;
+ return (ret);
+}
+
+static int
+spa_keystore_dsl_key_hold_impl(spa_t *spa, uint64_t dckobj, void *tag,
+ dsl_crypto_key_t **dck_out)
+{
+ int ret;
+ dsl_crypto_key_t search_dck;
+ dsl_crypto_key_t *found_dck;
+
+ ASSERT(RW_LOCK_HELD(&spa->spa_keystore.sk_dk_lock));
+
+ /* init the search key */
+ search_dck.dck_obj = dckobj;
+
+ /* find the matching key in the keystore */
+ found_dck = avl_find(&spa->spa_keystore.sk_dsl_keys, &search_dck, NULL);
+ if (!found_dck) {
+ ret = SET_ERROR(ENOENT);
+ goto error;
+ }
+
+ /* increment the refcount */
+ zfs_refcount_add(&found_dck->dck_holds, tag);
+
+ *dck_out = found_dck;
+ return (0);
+
+error:
+ *dck_out = NULL;
+ return (ret);
+}
+
+static int
+spa_keystore_dsl_key_hold_dd(spa_t *spa, dsl_dir_t *dd, void *tag,
+ dsl_crypto_key_t **dck_out)
+{
+ int ret;
+ avl_index_t where;
+ dsl_crypto_key_t *dck_io = NULL, *dck_ks = NULL;
+ dsl_wrapping_key_t *wkey = NULL;
+ uint64_t dckobj = dd->dd_crypto_obj;
+
+ /* Lookup the key in the tree of currently loaded keys */
+ rw_enter(&spa->spa_keystore.sk_dk_lock, RW_READER);
+ ret = spa_keystore_dsl_key_hold_impl(spa, dckobj, tag, &dck_ks);
+ rw_exit(&spa->spa_keystore.sk_dk_lock);
+ if (ret == 0) {
+ *dck_out = dck_ks;
+ return (0);
+ }
+
+ /* Lookup the wrapping key from the keystore */
+ ret = spa_keystore_wkey_hold_dd(spa, dd, FTAG, &wkey);
+ if (ret != 0) {
+ *dck_out = NULL;
+ return (SET_ERROR(EACCES));
+ }
+
+ /* Read the key from disk */
+ ret = dsl_crypto_key_open(spa->spa_meta_objset, wkey, dckobj,
+ tag, &dck_io);
+ if (ret != 0) {
+ dsl_wrapping_key_rele(wkey, FTAG);
+ *dck_out = NULL;
+ return (ret);
+ }
+
+ /*
+ * Add the key to the keystore. It may already exist if it was
+ * added while performing the read from disk. In this case discard
+ * it and return the key from the keystore.
+ */
+ rw_enter(&spa->spa_keystore.sk_dk_lock, RW_WRITER);
+ ret = spa_keystore_dsl_key_hold_impl(spa, dckobj, tag, &dck_ks);
+ if (ret != 0) {
+ avl_find(&spa->spa_keystore.sk_dsl_keys, dck_io, &where);
+ avl_insert(&spa->spa_keystore.sk_dsl_keys, dck_io, where);
+ *dck_out = dck_io;
+ } else {
+ dsl_crypto_key_free(dck_io);
+ *dck_out = dck_ks;
+ }
+
+ /* Release the wrapping key (the dsl key now has a reference to it) */
+ dsl_wrapping_key_rele(wkey, FTAG);
+ rw_exit(&spa->spa_keystore.sk_dk_lock);
+
+ return (0);
+}
+
+void
+spa_keystore_dsl_key_rele(spa_t *spa, dsl_crypto_key_t *dck, void *tag)
+{
+ rw_enter(&spa->spa_keystore.sk_dk_lock, RW_WRITER);
+
+ if (zfs_refcount_remove(&dck->dck_holds, tag) == 0) {
+ avl_remove(&spa->spa_keystore.sk_dsl_keys, dck);
+ dsl_crypto_key_free(dck);
+ }
+
+ rw_exit(&spa->spa_keystore.sk_dk_lock);
+}
+
+int
+spa_keystore_load_wkey_impl(spa_t *spa, dsl_wrapping_key_t *wkey)
+{
+ int ret;
+ avl_index_t where;
+ dsl_wrapping_key_t *found_wkey;
+
+ rw_enter(&spa->spa_keystore.sk_wkeys_lock, RW_WRITER);
+
+ /* insert the wrapping key into the keystore */
+ found_wkey = avl_find(&spa->spa_keystore.sk_wkeys, wkey, &where);
+ if (found_wkey != NULL) {
+ ret = SET_ERROR(EEXIST);
+ goto error_unlock;
+ }
+ avl_insert(&spa->spa_keystore.sk_wkeys, wkey, where);
+
+ rw_exit(&spa->spa_keystore.sk_wkeys_lock);
+
+ return (0);
+
+error_unlock:
+ rw_exit(&spa->spa_keystore.sk_wkeys_lock);
+ return (ret);
+}
+
+int
+spa_keystore_load_wkey(const char *dsname, dsl_crypto_params_t *dcp,
+ boolean_t noop)
+{
+ int ret;
+ dsl_dir_t *dd = NULL;
+ dsl_crypto_key_t *dck = NULL;
+ dsl_wrapping_key_t *wkey = dcp->cp_wkey;
+ dsl_pool_t *dp = NULL;
+ uint64_t rddobj, keyformat, salt, iters;
+
+ /*
+ * We don't validate the wrapping key's keyformat, salt, or iters
+ * since they will never be needed after the DCK has been wrapped.
+ */
+ if (dcp->cp_wkey == NULL ||
+ dcp->cp_cmd != DCP_CMD_NONE ||
+ dcp->cp_crypt != ZIO_CRYPT_INHERIT ||
+ dcp->cp_keylocation != NULL)
+ return (SET_ERROR(EINVAL));
+
+ ret = dsl_pool_hold(dsname, FTAG, &dp);
+ if (ret != 0)
+ goto error;
+
+ if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_ENCRYPTION)) {
+ ret = SET_ERROR(ENOTSUP);
+ goto error;
+ }
+
+ /* hold the dsl dir */
+ ret = dsl_dir_hold(dp, dsname, FTAG, &dd, NULL);
+ if (ret != 0) {
+ dd = NULL;
+ goto error;
+ }
+
+ /* confirm that dd is the encryption root */
+ ret = dsl_dir_get_encryption_root_ddobj(dd, &rddobj);
+ if (ret != 0 || rddobj != dd->dd_object) {
+ ret = SET_ERROR(EINVAL);
+ goto error;
+ }
+
+ /* initialize the wkey's ddobj */
+ wkey->wk_ddobj = dd->dd_object;
+
+ /* verify that the wkey is correct by opening its dsl key */
+ ret = dsl_crypto_key_open(dp->dp_meta_objset, wkey,
+ dd->dd_crypto_obj, FTAG, &dck);
+ if (ret != 0)
+ goto error;
+
+ /* initialize the wkey encryption parameters from the DSL Crypto Key */
+ ret = zap_lookup(dp->dp_meta_objset, dd->dd_crypto_obj,
+ zfs_prop_to_name(ZFS_PROP_KEYFORMAT), 8, 1, &keyformat);
+ if (ret != 0)
+ goto error;
+
+ ret = zap_lookup(dp->dp_meta_objset, dd->dd_crypto_obj,
+ zfs_prop_to_name(ZFS_PROP_PBKDF2_SALT), 8, 1, &salt);
+ if (ret != 0)
+ goto error;
+
+ ret = zap_lookup(dp->dp_meta_objset, dd->dd_crypto_obj,
+ zfs_prop_to_name(ZFS_PROP_PBKDF2_ITERS), 8, 1, &iters);
+ if (ret != 0)
+ goto error;
+
+ ASSERT3U(keyformat, <, ZFS_KEYFORMAT_FORMATS);
+ ASSERT3U(keyformat, !=, ZFS_KEYFORMAT_NONE);
+ IMPLY(keyformat == ZFS_KEYFORMAT_PASSPHRASE, iters != 0);
+ IMPLY(keyformat == ZFS_KEYFORMAT_PASSPHRASE, salt != 0);
+ IMPLY(keyformat != ZFS_KEYFORMAT_PASSPHRASE, iters == 0);
+ IMPLY(keyformat != ZFS_KEYFORMAT_PASSPHRASE, salt == 0);
+
+ wkey->wk_keyformat = keyformat;
+ wkey->wk_salt = salt;
+ wkey->wk_iters = iters;
+
+ /*
+ * At this point we have verified the wkey and confirmed that it can
+ * be used to decrypt a DSL Crypto Key. We can simply cleanup and
+ * return if this is all the user wanted to do.
+ */
+ if (noop)
+ goto error;
+
+ /* insert the wrapping key into the keystore */
+ ret = spa_keystore_load_wkey_impl(dp->dp_spa, wkey);
+ if (ret != 0)
+ goto error;
+
+ dsl_crypto_key_rele(dck, FTAG);
+ dsl_dir_rele(dd, FTAG);
+ dsl_pool_rele(dp, FTAG);
+
+ /* create any zvols under this ds */
+ zvol_create_minors_recursive(dsname);
+
+ return (0);
+
+error:
+ if (dck != NULL)
+ dsl_crypto_key_rele(dck, FTAG);
+ if (dd != NULL)
+ dsl_dir_rele(dd, FTAG);
+ if (dp != NULL)
+ dsl_pool_rele(dp, FTAG);
+
+ return (ret);
+}
+
+int
+spa_keystore_unload_wkey_impl(spa_t *spa, uint64_t ddobj)
+{
+ int ret;
+ dsl_wrapping_key_t search_wkey;
+ dsl_wrapping_key_t *found_wkey;
+
+ /* init the search wrapping key */
+ search_wkey.wk_ddobj = ddobj;
+
+ rw_enter(&spa->spa_keystore.sk_wkeys_lock, RW_WRITER);
+
+ /* remove the wrapping key from the keystore */
+ found_wkey = avl_find(&spa->spa_keystore.sk_wkeys,
+ &search_wkey, NULL);
+ if (!found_wkey) {
+ ret = SET_ERROR(EACCES);
+ goto error_unlock;
+ } else if (zfs_refcount_count(&found_wkey->wk_refcnt) != 0) {
+ ret = SET_ERROR(EBUSY);
+ goto error_unlock;
+ }
+ avl_remove(&spa->spa_keystore.sk_wkeys, found_wkey);
+
+ rw_exit(&spa->spa_keystore.sk_wkeys_lock);
+
+ /* free the wrapping key */
+ dsl_wrapping_key_free(found_wkey);
+
+ return (0);
+
+error_unlock:
+ rw_exit(&spa->spa_keystore.sk_wkeys_lock);
+ return (ret);
+}
+
+int
+spa_keystore_unload_wkey(const char *dsname)
+{
+ int ret = 0;
+ dsl_dir_t *dd = NULL;
+ dsl_pool_t *dp = NULL;
+ spa_t *spa = NULL;
+
+ ret = spa_open(dsname, &spa, FTAG);
+ if (ret != 0)
+ return (ret);
+
+ /*
+ * Wait for any outstanding txg IO to complete, releasing any
+ * remaining references on the wkey.
+ */
+ if (spa_mode(spa) != SPA_MODE_READ)
+ txg_wait_synced(spa->spa_dsl_pool, 0);
+
+ spa_close(spa, FTAG);
+
+ /* hold the dsl dir */
+ ret = dsl_pool_hold(dsname, FTAG, &dp);
+ if (ret != 0)
+ goto error;
+
+ if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_ENCRYPTION)) {
+ ret = (SET_ERROR(ENOTSUP));
+ goto error;
+ }
+
+ ret = dsl_dir_hold(dp, dsname, FTAG, &dd, NULL);
+ if (ret != 0) {
+ dd = NULL;
+ goto error;
+ }
+
+ /* unload the wkey */
+ ret = spa_keystore_unload_wkey_impl(dp->dp_spa, dd->dd_object);
+ if (ret != 0)
+ goto error;
+
+ dsl_dir_rele(dd, FTAG);
+ dsl_pool_rele(dp, FTAG);
+
+ /* remove any zvols under this ds */
+ zvol_remove_minors(dp->dp_spa, dsname, B_TRUE);
+
+ return (0);
+
+error:
+ if (dd != NULL)
+ dsl_dir_rele(dd, FTAG);
+ if (dp != NULL)
+ dsl_pool_rele(dp, FTAG);
+
+ return (ret);
+}
+
+void
+key_mapping_add_ref(dsl_key_mapping_t *km, void *tag)
+{
+ ASSERT3U(zfs_refcount_count(&km->km_refcnt), >=, 1);
+ zfs_refcount_add(&km->km_refcnt, tag);
+}
+
+/*
+ * The locking here is a little tricky to ensure we don't cause unnecessary
+ * performance problems. We want to release a key mapping whenever someone
+ * decrements the refcount to 0, but freeing the mapping requires removing
+ * it from the spa_keystore, which requires holding sk_km_lock as a writer.
+ * Most of the time we don't want to hold this lock as a writer, since the
+ * same lock is held as a reader for each IO that needs to encrypt / decrypt
+ * data for any dataset and in practice we will only actually free the
+ * mapping after unmounting a dataset.
+ */
+void
+key_mapping_rele(spa_t *spa, dsl_key_mapping_t *km, void *tag)
+{
+ ASSERT3U(zfs_refcount_count(&km->km_refcnt), >=, 1);
+
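+	/* if other holders remain, the mapping cannot be freed yet */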
+ if (zfs_refcount_remove(&km->km_refcnt, tag) != 0)
+ return;
+
+ /*
+ * We think we are going to need to free the mapping. Add a
+ * reference to prevent most other releasers from thinking
+ * this might be their responsibility. This is inherently
+ * racy, so we will confirm that we are legitimately the
+ * last holder once we have the sk_km_lock as a writer.
+ */
+ zfs_refcount_add(&km->km_refcnt, FTAG);
+
+ rw_enter(&spa->spa_keystore.sk_km_lock, RW_WRITER);
+ if (zfs_refcount_remove(&km->km_refcnt, FTAG) != 0) {
+ rw_exit(&spa->spa_keystore.sk_km_lock);
+ return;
+ }
+
+ avl_remove(&spa->spa_keystore.sk_key_mappings, km);
+ rw_exit(&spa->spa_keystore.sk_km_lock);
+
+ spa_keystore_dsl_key_rele(spa, km->km_key, km);
+ zfs_refcount_destroy(&km->km_refcnt);
+ kmem_free(km, sizeof (dsl_key_mapping_t));
+}
+
+int
+spa_keystore_create_mapping(spa_t *spa, dsl_dataset_t *ds, void *tag,
+ dsl_key_mapping_t **km_out)
+{
+ int ret;
+ avl_index_t where;
+ dsl_key_mapping_t *km, *found_km;
+ boolean_t should_free = B_FALSE;
+
+ /* Allocate and initialize the mapping */
+ km = kmem_zalloc(sizeof (dsl_key_mapping_t), KM_SLEEP);
+ zfs_refcount_create(&km->km_refcnt);
+
+ ret = spa_keystore_dsl_key_hold_dd(spa, ds->ds_dir, km, &km->km_key);
+ if (ret != 0) {
+ zfs_refcount_destroy(&km->km_refcnt);
+ kmem_free(km, sizeof (dsl_key_mapping_t));
+
+ if (km_out != NULL)
+ *km_out = NULL;
+ return (ret);
+ }
+
+ km->km_dsobj = ds->ds_object;
+
+ rw_enter(&spa->spa_keystore.sk_km_lock, RW_WRITER);
+
+ /*
+ * If a mapping already exists, simply increment its refcount and
+	 * clean up the one we made. We want to allocate / free outside of
+ * the lock because this lock is also used by the zio layer to lookup
+ * key mappings. Otherwise, use the one we created. Normally, there will
+ * only be one active reference at a time (the objset owner), but there
+ * are times when there could be multiple async users.
+ */
+ found_km = avl_find(&spa->spa_keystore.sk_key_mappings, km, &where);
+ if (found_km != NULL) {
+ should_free = B_TRUE;
+ zfs_refcount_add(&found_km->km_refcnt, tag);
+ if (km_out != NULL)
+ *km_out = found_km;
+ } else {
+ zfs_refcount_add(&km->km_refcnt, tag);
+ avl_insert(&spa->spa_keystore.sk_key_mappings, km, where);
+ if (km_out != NULL)
+ *km_out = km;
+ }
+
+ rw_exit(&spa->spa_keystore.sk_km_lock);
+
+ if (should_free) {
+ spa_keystore_dsl_key_rele(spa, km->km_key, km);
+ zfs_refcount_destroy(&km->km_refcnt);
+ kmem_free(km, sizeof (dsl_key_mapping_t));
+ }
+
+ return (0);
+}
+
+int
+spa_keystore_remove_mapping(spa_t *spa, uint64_t dsobj, void *tag)
+{
+ int ret;
+ dsl_key_mapping_t search_km;
+ dsl_key_mapping_t *found_km;
+
+ /* init the search key mapping */
+ search_km.km_dsobj = dsobj;
+
+ rw_enter(&spa->spa_keystore.sk_km_lock, RW_READER);
+
+ /* find the matching mapping */
+ found_km = avl_find(&spa->spa_keystore.sk_key_mappings,
+ &search_km, NULL);
+ if (found_km == NULL) {
+ ret = SET_ERROR(ENOENT);
+ goto error_unlock;
+ }
+
+ rw_exit(&spa->spa_keystore.sk_km_lock);
+
+ key_mapping_rele(spa, found_km, tag);
+
+ return (0);
+
+error_unlock:
+ rw_exit(&spa->spa_keystore.sk_km_lock);
+ return (ret);
+}
+
+/*
+ * This function is primarily used by the zio and arc layer to lookup
+ * DSL Crypto Keys for encryption. Callers must release the key with
+ * spa_keystore_dsl_key_rele(). The function may also be called with
+ * dck_out == NULL and tag == NULL to simply check that a key exists
+ * without getting a reference to it.
+ */
+int
+spa_keystore_lookup_key(spa_t *spa, uint64_t dsobj, void *tag,
+ dsl_crypto_key_t **dck_out)
+{
+ int ret;
+ dsl_key_mapping_t search_km;
+ dsl_key_mapping_t *found_km;
+
+ ASSERT((tag != NULL && dck_out != NULL) ||
+ (tag == NULL && dck_out == NULL));
+
+ /* init the search key mapping */
+ search_km.km_dsobj = dsobj;
+
+ rw_enter(&spa->spa_keystore.sk_km_lock, RW_READER);
+
+	/* find the mapping in the tree */
+ found_km = avl_find(&spa->spa_keystore.sk_key_mappings, &search_km,
+ NULL);
+ if (found_km == NULL) {
+ ret = SET_ERROR(ENOENT);
+ goto error_unlock;
+ }
+
+ if (found_km && tag)
+ zfs_refcount_add(&found_km->km_key->dck_holds, tag);
+
+ rw_exit(&spa->spa_keystore.sk_km_lock);
+
+ if (dck_out != NULL)
+ *dck_out = found_km->km_key;
+ return (0);
+
+error_unlock:
+ rw_exit(&spa->spa_keystore.sk_km_lock);
+
+ if (dck_out != NULL)
+ *dck_out = NULL;
+ return (ret);
+}
+
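+/*
+ * Check that the wrapping key for this dsl dir is loaded by attempting to
+ * hold it from the keystore; any failure is reported as EACCES.
+ */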
+static int
+dmu_objset_check_wkey_loaded(dsl_dir_t *dd)
+{
+ int ret;
+ dsl_wrapping_key_t *wkey = NULL;
+
+ ret = spa_keystore_wkey_hold_dd(dd->dd_pool->dp_spa, dd, FTAG,
+ &wkey);
+ if (ret != 0)
+ return (SET_ERROR(EACCES));
+
+ dsl_wrapping_key_rele(wkey, FTAG);
+
+ return (0);
+}
+
+static zfs_keystatus_t
+dsl_dataset_get_keystatus(dsl_dir_t *dd)
+{
+	/* check if this dd has a dsl key */
+ if (dd->dd_crypto_obj == 0)
+ return (ZFS_KEYSTATUS_NONE);
+
+ return (dmu_objset_check_wkey_loaded(dd) == 0 ?
+ ZFS_KEYSTATUS_AVAILABLE : ZFS_KEYSTATUS_UNAVAILABLE);
+}
+
+static int
+dsl_dir_get_crypt(dsl_dir_t *dd, uint64_t *crypt)
+{
+ if (dd->dd_crypto_obj == 0) {
+ *crypt = ZIO_CRYPT_OFF;
+ return (0);
+ }
+
+ return (zap_lookup(dd->dd_pool->dp_meta_objset, dd->dd_crypto_obj,
+ DSL_CRYPTO_KEY_CRYPTO_SUITE, 8, 1, crypt));
+}
+
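+/*
+ * Write every field of a DSL Crypto Key out to its ZAP object. The key
+ * material passed in here is already in wrapped (encrypted) form.
+ */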
+static void
+dsl_crypto_key_sync_impl(objset_t *mos, uint64_t dckobj, uint64_t crypt,
+ uint64_t root_ddobj, uint64_t guid, uint8_t *iv, uint8_t *mac,
+ uint8_t *keydata, uint8_t *hmac_keydata, uint64_t keyformat,
+ uint64_t salt, uint64_t iters, dmu_tx_t *tx)
+{
+ VERIFY0(zap_update(mos, dckobj, DSL_CRYPTO_KEY_CRYPTO_SUITE, 8, 1,
+ &crypt, tx));
+ VERIFY0(zap_update(mos, dckobj, DSL_CRYPTO_KEY_ROOT_DDOBJ, 8, 1,
+ &root_ddobj, tx));
+ VERIFY0(zap_update(mos, dckobj, DSL_CRYPTO_KEY_GUID, 8, 1,
+ &guid, tx));
+ VERIFY0(zap_update(mos, dckobj, DSL_CRYPTO_KEY_IV, 1, WRAPPING_IV_LEN,
+ iv, tx));
+ VERIFY0(zap_update(mos, dckobj, DSL_CRYPTO_KEY_MAC, 1, WRAPPING_MAC_LEN,
+ mac, tx));
+ VERIFY0(zap_update(mos, dckobj, DSL_CRYPTO_KEY_MASTER_KEY, 1,
+ MASTER_KEY_MAX_LEN, keydata, tx));
+ VERIFY0(zap_update(mos, dckobj, DSL_CRYPTO_KEY_HMAC_KEY, 1,
+ SHA512_HMAC_KEYLEN, hmac_keydata, tx));
+ VERIFY0(zap_update(mos, dckobj, zfs_prop_to_name(ZFS_PROP_KEYFORMAT),
+ 8, 1, &keyformat, tx));
+ VERIFY0(zap_update(mos, dckobj, zfs_prop_to_name(ZFS_PROP_PBKDF2_SALT),
+ 8, 1, &salt, tx));
+ VERIFY0(zap_update(mos, dckobj, zfs_prop_to_name(ZFS_PROP_PBKDF2_ITERS),
+ 8, 1, &iters, tx));
+}
+
+static void
+dsl_crypto_key_sync(dsl_crypto_key_t *dck, dmu_tx_t *tx)
+{
+ zio_crypt_key_t *key = &dck->dck_key;
+ dsl_wrapping_key_t *wkey = dck->dck_wkey;
+ uint8_t keydata[MASTER_KEY_MAX_LEN];
+ uint8_t hmac_keydata[SHA512_HMAC_KEYLEN];
+ uint8_t iv[WRAPPING_IV_LEN];
+ uint8_t mac[WRAPPING_MAC_LEN];
+
+ ASSERT(dmu_tx_is_syncing(tx));
+ ASSERT3U(key->zk_crypt, <, ZIO_CRYPT_FUNCTIONS);
+
+ /* encrypt and store the keys along with the IV and MAC */
+ VERIFY0(zio_crypt_key_wrap(&dck->dck_wkey->wk_key, key, iv, mac,
+ keydata, hmac_keydata));
+
+ /* update the ZAP with the obtained values */
+ dsl_crypto_key_sync_impl(tx->tx_pool->dp_meta_objset, dck->dck_obj,
+ key->zk_crypt, wkey->wk_ddobj, key->zk_guid, iv, mac, keydata,
+ hmac_keydata, wkey->wk_keyformat, wkey->wk_salt, wkey->wk_iters,
+ tx);
+}
+
+typedef struct spa_keystore_change_key_args {
+ const char *skcka_dsname;
+ dsl_crypto_params_t *skcka_cp;
+} spa_keystore_change_key_args_t;
+
+static int
+spa_keystore_change_key_check(void *arg, dmu_tx_t *tx)
+{
+ int ret;
+ dsl_dir_t *dd = NULL;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ spa_keystore_change_key_args_t *skcka = arg;
+ dsl_crypto_params_t *dcp = skcka->skcka_cp;
+ uint64_t rddobj;
+
+ /* check for the encryption feature */
+ if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_ENCRYPTION)) {
+ ret = SET_ERROR(ENOTSUP);
+ goto error;
+ }
+
+ /* check for valid key change command */
+ if (dcp->cp_cmd != DCP_CMD_NEW_KEY &&
+ dcp->cp_cmd != DCP_CMD_INHERIT &&
+ dcp->cp_cmd != DCP_CMD_FORCE_NEW_KEY &&
+ dcp->cp_cmd != DCP_CMD_FORCE_INHERIT) {
+ ret = SET_ERROR(EINVAL);
+ goto error;
+ }
+
+ /* hold the dd */
+ ret = dsl_dir_hold(dp, skcka->skcka_dsname, FTAG, &dd, NULL);
+ if (ret != 0) {
+ dd = NULL;
+ goto error;
+ }
+
+ /* verify that the dataset is encrypted */
+ if (dd->dd_crypto_obj == 0) {
+ ret = SET_ERROR(EINVAL);
+ goto error;
+ }
+
+ /* clones must always use their origin's key */
+ if (dsl_dir_is_clone(dd)) {
+ ret = SET_ERROR(EINVAL);
+ goto error;
+ }
+
+ /* lookup the ddobj we are inheriting the keylocation from */
+ ret = dsl_dir_get_encryption_root_ddobj(dd, &rddobj);
+ if (ret != 0)
+ goto error;
+
+ /* Handle inheritance */
+ if (dcp->cp_cmd == DCP_CMD_INHERIT ||
+ dcp->cp_cmd == DCP_CMD_FORCE_INHERIT) {
+ /* no other encryption params should be given */
+ if (dcp->cp_crypt != ZIO_CRYPT_INHERIT ||
+ dcp->cp_keylocation != NULL ||
+ dcp->cp_wkey != NULL) {
+ ret = SET_ERROR(EINVAL);
+ goto error;
+ }
+
+ /* check that this is an encryption root */
+ if (dd->dd_object != rddobj) {
+ ret = SET_ERROR(EINVAL);
+ goto error;
+ }
+
+ /* check that the parent is encrypted */
+ if (dd->dd_parent->dd_crypto_obj == 0) {
+ ret = SET_ERROR(EINVAL);
+ goto error;
+ }
+
+ /* if we are rewrapping check that both keys are loaded */
+ if (dcp->cp_cmd == DCP_CMD_INHERIT) {
+ ret = dmu_objset_check_wkey_loaded(dd);
+ if (ret != 0)
+ goto error;
+
+ ret = dmu_objset_check_wkey_loaded(dd->dd_parent);
+ if (ret != 0)
+ goto error;
+ }
+
+ dsl_dir_rele(dd, FTAG);
+ return (0);
+ }
+
+ /* handle forcing an encryption root without rewrapping */
+ if (dcp->cp_cmd == DCP_CMD_FORCE_NEW_KEY) {
+ /* no other encryption params should be given */
+ if (dcp->cp_crypt != ZIO_CRYPT_INHERIT ||
+ dcp->cp_keylocation != NULL ||
+ dcp->cp_wkey != NULL) {
+ ret = SET_ERROR(EINVAL);
+ goto error;
+ }
+
+ /* check that this is not an encryption root */
+ if (dd->dd_object == rddobj) {
+ ret = SET_ERROR(EINVAL);
+ goto error;
+ }
+
+ dsl_dir_rele(dd, FTAG);
+ return (0);
+ }
+
+ /* crypt cannot be changed after creation */
+ if (dcp->cp_crypt != ZIO_CRYPT_INHERIT) {
+ ret = SET_ERROR(EINVAL);
+ goto error;
+ }
+
+	/* we are not inheriting our parent's wkey so we need one ourselves */
+ if (dcp->cp_wkey == NULL) {
+ ret = SET_ERROR(EINVAL);
+ goto error;
+ }
+
+ /* check for a valid keyformat for the new wrapping key */
+ if (dcp->cp_wkey->wk_keyformat >= ZFS_KEYFORMAT_FORMATS ||
+ dcp->cp_wkey->wk_keyformat == ZFS_KEYFORMAT_NONE) {
+ ret = SET_ERROR(EINVAL);
+ goto error;
+ }
+
+ /*
+ * If this dataset is not currently an encryption root we need a new
+ * keylocation for this dataset's new wrapping key. Otherwise we can
+ * just keep the one we already had.
+ */
+ if (dd->dd_object != rddobj && dcp->cp_keylocation == NULL) {
+ ret = SET_ERROR(EINVAL);
+ goto error;
+ }
+
+ /* check that the keylocation is valid if it is not NULL */
+ if (dcp->cp_keylocation != NULL &&
+ !zfs_prop_valid_keylocation(dcp->cp_keylocation, B_TRUE)) {
+ ret = SET_ERROR(EINVAL);
+ goto error;
+ }
+
+ /* passphrases require pbkdf2 salt and iters */
+ if (dcp->cp_wkey->wk_keyformat == ZFS_KEYFORMAT_PASSPHRASE) {
+ if (dcp->cp_wkey->wk_salt == 0 ||
+ dcp->cp_wkey->wk_iters < MIN_PBKDF2_ITERATIONS) {
+ ret = SET_ERROR(EINVAL);
+ goto error;
+ }
+ } else {
+ if (dcp->cp_wkey->wk_salt != 0 || dcp->cp_wkey->wk_iters != 0) {
+ ret = SET_ERROR(EINVAL);
+ goto error;
+ }
+ }
+
+ /* make sure the dd's wkey is loaded */
+ ret = dmu_objset_check_wkey_loaded(dd);
+ if (ret != 0)
+ goto error;
+
+ dsl_dir_rele(dd, FTAG);
+
+ return (0);
+
+error:
+ if (dd != NULL)
+ dsl_dir_rele(dd, FTAG);
+
+ return (ret);
+}
+
+/*
+ * This function deals with the intricacies of updating wrapping
+ * key references and encryption roots recursively in the event
+ * of a call to 'zfs change-key' or 'zfs promote'. The 'skip'
+ * parameter should always be set to B_FALSE when called
+ * externally.
+ */
+static void
+spa_keystore_change_key_sync_impl(uint64_t rddobj, uint64_t ddobj,
+ uint64_t new_rddobj, dsl_wrapping_key_t *wkey, boolean_t skip,
+ dmu_tx_t *tx)
+{
+ int ret;
+ zap_cursor_t *zc;
+ zap_attribute_t *za;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ dsl_dir_t *dd = NULL;
+ dsl_crypto_key_t *dck = NULL;
+ uint64_t curr_rddobj;
+
+ ASSERT(RW_WRITE_HELD(&dp->dp_spa->spa_keystore.sk_wkeys_lock));
+
+ /* hold the dd */
+ VERIFY0(dsl_dir_hold_obj(dp, ddobj, NULL, FTAG, &dd));
+
+ /* ignore special dsl dirs */
+ if (dd->dd_myname[0] == '$' || dd->dd_myname[0] == '%') {
+ dsl_dir_rele(dd, FTAG);
+ return;
+ }
+
+ ret = dsl_dir_get_encryption_root_ddobj(dd, &curr_rddobj);
+ VERIFY(ret == 0 || ret == ENOENT);
+
+ /*
+	 * Stop recursing if this dsl dir has no encryption root (ENOENT),
+	 * didn't inherit from the root being changed, or is a clone.
+ */
+ if (ret == ENOENT ||
+ (!skip && (curr_rddobj != rddobj || dsl_dir_is_clone(dd)))) {
+ dsl_dir_rele(dd, FTAG);
+ return;
+ }
+
+ /*
+ * If we don't have a wrapping key just update the dck to reflect the
+ * new encryption root. Otherwise rewrap the entire dck and re-sync it
+ * to disk. If skip is set, we don't do any of this work.
+ */
+ if (!skip) {
+ if (wkey == NULL) {
+ VERIFY0(zap_update(dp->dp_meta_objset,
+ dd->dd_crypto_obj,
+ DSL_CRYPTO_KEY_ROOT_DDOBJ, 8, 1,
+ &new_rddobj, tx));
+ } else {
+ VERIFY0(spa_keystore_dsl_key_hold_dd(dp->dp_spa, dd,
+ FTAG, &dck));
+ dsl_wrapping_key_hold(wkey, dck);
+ dsl_wrapping_key_rele(dck->dck_wkey, dck);
+ dck->dck_wkey = wkey;
+ dsl_crypto_key_sync(dck, tx);
+ spa_keystore_dsl_key_rele(dp->dp_spa, dck, FTAG);
+ }
+ }
+
+ zc = kmem_alloc(sizeof (zap_cursor_t), KM_SLEEP);
+ za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
+
+ /* Recurse into all child dsl dirs. */
+ for (zap_cursor_init(zc, dp->dp_meta_objset,
+ dsl_dir_phys(dd)->dd_child_dir_zapobj);
+ zap_cursor_retrieve(zc, za) == 0;
+ zap_cursor_advance(zc)) {
+ spa_keystore_change_key_sync_impl(rddobj,
+ za->za_first_integer, new_rddobj, wkey, B_FALSE, tx);
+ }
+ zap_cursor_fini(zc);
+
+ /*
+ * Recurse into all dsl dirs of clones. We utilize the skip parameter
+ * here so that we don't attempt to process the clones directly. This
+ * is because the clone and its origin share the same dck, which has
+ * already been updated.
+ */
+ for (zap_cursor_init(zc, dp->dp_meta_objset,
+ dsl_dir_phys(dd)->dd_clones);
+ zap_cursor_retrieve(zc, za) == 0;
+ zap_cursor_advance(zc)) {
+ dsl_dataset_t *clone;
+
+ VERIFY0(dsl_dataset_hold_obj(dp, za->za_first_integer,
+ FTAG, &clone));
+ spa_keystore_change_key_sync_impl(rddobj,
+ clone->ds_dir->dd_object, new_rddobj, wkey, B_TRUE, tx);
+ dsl_dataset_rele(clone, FTAG);
+ }
+ zap_cursor_fini(zc);
+
+ kmem_free(za, sizeof (zap_attribute_t));
+ kmem_free(zc, sizeof (zap_cursor_t));
+
+ dsl_dir_rele(dd, FTAG);
+}
+
+static void
+spa_keystore_change_key_sync(void *arg, dmu_tx_t *tx)
+{
+ dsl_dataset_t *ds;
+ avl_index_t where;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ spa_t *spa = dp->dp_spa;
+ spa_keystore_change_key_args_t *skcka = arg;
+ dsl_crypto_params_t *dcp = skcka->skcka_cp;
+ dsl_wrapping_key_t *wkey = NULL, *found_wkey;
+ dsl_wrapping_key_t wkey_search;
+ char *keylocation = dcp->cp_keylocation;
+ uint64_t rddobj, new_rddobj;
+
+ /* create and initialize the wrapping key */
+ VERIFY0(dsl_dataset_hold(dp, skcka->skcka_dsname, FTAG, &ds));
+ ASSERT(!ds->ds_is_snapshot);
+
+ if (dcp->cp_cmd == DCP_CMD_NEW_KEY ||
+ dcp->cp_cmd == DCP_CMD_FORCE_NEW_KEY) {
+ /*
+ * We are changing to a new wkey. Set additional properties
+ * which can be sent along with this ioctl. Note that this
+ * command can set keylocation even if it can't normally be
+ * set via 'zfs set' due to a non-local keylocation.
+ */
+ if (dcp->cp_cmd == DCP_CMD_NEW_KEY) {
+ wkey = dcp->cp_wkey;
+ wkey->wk_ddobj = ds->ds_dir->dd_object;
+ } else {
+ keylocation = "prompt";
+ }
+
+ if (keylocation != NULL) {
+ dsl_prop_set_sync_impl(ds,
+ zfs_prop_to_name(ZFS_PROP_KEYLOCATION),
+ ZPROP_SRC_LOCAL, 1, strlen(keylocation) + 1,
+ keylocation, tx);
+ }
+
+ VERIFY0(dsl_dir_get_encryption_root_ddobj(ds->ds_dir, &rddobj));
+ new_rddobj = ds->ds_dir->dd_object;
+ } else {
+ /*
+		 * We are inheriting the parent's wkey. Unset any local
+ * keylocation and grab a reference to the wkey.
+ */
+ if (dcp->cp_cmd == DCP_CMD_INHERIT) {
+ VERIFY0(spa_keystore_wkey_hold_dd(spa,
+ ds->ds_dir->dd_parent, FTAG, &wkey));
+ }
+
+ dsl_prop_set_sync_impl(ds,
+ zfs_prop_to_name(ZFS_PROP_KEYLOCATION), ZPROP_SRC_NONE,
+ 0, 0, NULL, tx);
+
+ rddobj = ds->ds_dir->dd_object;
+ VERIFY0(dsl_dir_get_encryption_root_ddobj(ds->ds_dir->dd_parent,
+ &new_rddobj));
+ }
+
+ if (wkey == NULL) {
+ ASSERT(dcp->cp_cmd == DCP_CMD_FORCE_INHERIT ||
+ dcp->cp_cmd == DCP_CMD_FORCE_NEW_KEY);
+ }
+
+ rw_enter(&spa->spa_keystore.sk_wkeys_lock, RW_WRITER);
+
+ /* recurse through all children and rewrap their keys */
+ spa_keystore_change_key_sync_impl(rddobj, ds->ds_dir->dd_object,
+ new_rddobj, wkey, B_FALSE, tx);
+
+ /*
+ * All references to the old wkey should be released now (if it
+ * existed). Replace the wrapping key.
+ */
+ wkey_search.wk_ddobj = ds->ds_dir->dd_object;
+ found_wkey = avl_find(&spa->spa_keystore.sk_wkeys, &wkey_search, NULL);
+ if (found_wkey != NULL) {
+ ASSERT0(zfs_refcount_count(&found_wkey->wk_refcnt));
+ avl_remove(&spa->spa_keystore.sk_wkeys, found_wkey);
+ dsl_wrapping_key_free(found_wkey);
+ }
+
+ if (dcp->cp_cmd == DCP_CMD_NEW_KEY) {
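+		/*
+		 * The new wkey cannot already be in the tree (any old entry
+		 * with this ddobj was removed above), so avl_find() is only
+		 * used to compute the insertion point for avl_insert().
+		 */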
+ avl_find(&spa->spa_keystore.sk_wkeys, wkey, &where);
+ avl_insert(&spa->spa_keystore.sk_wkeys, wkey, where);
+ } else if (wkey != NULL) {
+ dsl_wrapping_key_rele(wkey, FTAG);
+ }
+
+ rw_exit(&spa->spa_keystore.sk_wkeys_lock);
+
+ dsl_dataset_rele(ds, FTAG);
+}
+
+int
+spa_keystore_change_key(const char *dsname, dsl_crypto_params_t *dcp)
+{
+ spa_keystore_change_key_args_t skcka;
+
+ /* initialize the args struct */
+ skcka.skcka_dsname = dsname;
+ skcka.skcka_cp = dcp;
+
+ /*
+ * Perform the actual work in syncing context. The blocks modified
+ * here could be calculated but it would require holding the pool
+ * lock and traversing all of the datasets that will have their keys
+ * changed.
+ */
+ return (dsl_sync_task(dsname, spa_keystore_change_key_check,
+ spa_keystore_change_key_sync, &skcka, 15,
+ ZFS_SPACE_CHECK_RESERVED));
+}
+
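+/*
+ * Verify that moving dd under newparent would keep it within its current
+ * encryption root.
+ */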
+int
+dsl_dir_rename_crypt_check(dsl_dir_t *dd, dsl_dir_t *newparent)
+{
+ int ret;
+ uint64_t curr_rddobj, parent_rddobj;
+
+ if (dd->dd_crypto_obj == 0)
+ return (0);
+
+ ret = dsl_dir_get_encryption_root_ddobj(dd, &curr_rddobj);
+ if (ret != 0)
+ goto error;
+
+ /*
+ * if this is not an encryption root, we must make sure we are not
+ * moving dd to a new encryption root
+ */
+ if (dd->dd_object != curr_rddobj) {
+ ret = dsl_dir_get_encryption_root_ddobj(newparent,
+ &parent_rddobj);
+ if (ret != 0)
+ goto error;
+
+ if (parent_rddobj != curr_rddobj) {
+ ret = SET_ERROR(EACCES);
+ goto error;
+ }
+ }
+
+ return (0);
+
+error:
+ return (ret);
+}
+
+/*
+ * Check to make sure that a promote from targetdd to origindd will not require
+ * any key rewraps.
+ */
+int
+dsl_dataset_promote_crypt_check(dsl_dir_t *target, dsl_dir_t *origin)
+{
+ int ret;
+ uint64_t rddobj, op_rddobj, tp_rddobj;
+
+ /* If the dataset is not encrypted we don't need to check anything */
+ if (origin->dd_crypto_obj == 0)
+ return (0);
+
+ /*
+ * If we are not changing the first origin snapshot in a chain
+ * the encryption root won't change either.
+ */
+ if (dsl_dir_is_clone(origin))
+ return (0);
+
+ /*
+ * If the origin is the encryption root we will update
+ * the DSL Crypto Key to point to the target instead.
+ */
+ ret = dsl_dir_get_encryption_root_ddobj(origin, &rddobj);
+ if (ret != 0)
+ return (ret);
+
+ if (rddobj == origin->dd_object)
+ return (0);
+
+ /*
+ * The origin is inheriting its encryption root from its parent.
+ * Check that the parent of the target has the same encryption root.
+ */
+ ret = dsl_dir_get_encryption_root_ddobj(origin->dd_parent, &op_rddobj);
+ if (ret == ENOENT)
+ return (SET_ERROR(EACCES));
+ else if (ret != 0)
+ return (ret);
+
+ ret = dsl_dir_get_encryption_root_ddobj(target->dd_parent, &tp_rddobj);
+ if (ret == ENOENT)
+ return (SET_ERROR(EACCES));
+ else if (ret != 0)
+ return (ret);
+
+ if (op_rddobj != tp_rddobj)
+ return (SET_ERROR(EACCES));
+
+ return (0);
+}
+
+void
+dsl_dataset_promote_crypt_sync(dsl_dir_t *target, dsl_dir_t *origin,
+ dmu_tx_t *tx)
+{
+ uint64_t rddobj;
+ dsl_pool_t *dp = target->dd_pool;
+ dsl_dataset_t *targetds;
+ dsl_dataset_t *originds;
+ char *keylocation;
+
+ if (origin->dd_crypto_obj == 0)
+ return;
+ if (dsl_dir_is_clone(origin))
+ return;
+
+ VERIFY0(dsl_dir_get_encryption_root_ddobj(origin, &rddobj));
+
+ if (rddobj != origin->dd_object)
+ return;
+
+ /*
+ * If the target is being promoted to the encryption root update the
+ * DSL Crypto Key and keylocation to reflect that. We also need to
+	 * update the DSL Crypto Keys of all children inheriting their
+ * encryption root to point to the new target. Otherwise, the check
+ * function ensured that the encryption root will not change.
+ */
+ keylocation = kmem_alloc(ZAP_MAXVALUELEN, KM_SLEEP);
+
+ VERIFY0(dsl_dataset_hold_obj(dp,
+ dsl_dir_phys(target)->dd_head_dataset_obj, FTAG, &targetds));
+ VERIFY0(dsl_dataset_hold_obj(dp,
+ dsl_dir_phys(origin)->dd_head_dataset_obj, FTAG, &originds));
+
+ VERIFY0(dsl_prop_get_dd(origin, zfs_prop_to_name(ZFS_PROP_KEYLOCATION),
+ 1, ZAP_MAXVALUELEN, keylocation, NULL, B_FALSE));
+ dsl_prop_set_sync_impl(targetds, zfs_prop_to_name(ZFS_PROP_KEYLOCATION),
+ ZPROP_SRC_LOCAL, 1, strlen(keylocation) + 1, keylocation, tx);
+ dsl_prop_set_sync_impl(originds, zfs_prop_to_name(ZFS_PROP_KEYLOCATION),
+ ZPROP_SRC_NONE, 0, 0, NULL, tx);
+
+ rw_enter(&dp->dp_spa->spa_keystore.sk_wkeys_lock, RW_WRITER);
+ spa_keystore_change_key_sync_impl(rddobj, origin->dd_object,
+ target->dd_object, NULL, B_FALSE, tx);
+ rw_exit(&dp->dp_spa->spa_keystore.sk_wkeys_lock);
+
+ dsl_dataset_rele(targetds, FTAG);
+ dsl_dataset_rele(originds, FTAG);
+ kmem_free(keylocation, ZAP_MAXVALUELEN);
+}
+
+int
+dmu_objset_create_crypt_check(dsl_dir_t *parentdd, dsl_crypto_params_t *dcp,
+ boolean_t *will_encrypt)
+{
+ int ret;
+ uint64_t pcrypt, crypt;
+ dsl_crypto_params_t dummy_dcp = { 0 };
+
+ if (will_encrypt != NULL)
+ *will_encrypt = B_FALSE;
+
+ if (dcp == NULL)
+ dcp = &dummy_dcp;
+
+ if (dcp->cp_cmd != DCP_CMD_NONE)
+ return (SET_ERROR(EINVAL));
+
+ if (parentdd != NULL) {
+ ret = dsl_dir_get_crypt(parentdd, &pcrypt);
+ if (ret != 0)
+ return (ret);
+ } else {
+ pcrypt = ZIO_CRYPT_OFF;
+ }
+
+ crypt = (dcp->cp_crypt == ZIO_CRYPT_INHERIT) ? pcrypt : dcp->cp_crypt;
+
+ ASSERT3U(pcrypt, !=, ZIO_CRYPT_INHERIT);
+ ASSERT3U(crypt, !=, ZIO_CRYPT_INHERIT);
+
+ /* check for valid dcp with no encryption (inherited or local) */
+ if (crypt == ZIO_CRYPT_OFF) {
+ /* Must not specify encryption params */
+ if (dcp->cp_wkey != NULL ||
+ (dcp->cp_keylocation != NULL &&
+ strcmp(dcp->cp_keylocation, "none") != 0))
+ return (SET_ERROR(EINVAL));
+
+ return (0);
+ }
+
+ if (will_encrypt != NULL)
+ *will_encrypt = B_TRUE;
+
+ /*
+ * We will now definitely be encrypting. Check the feature flag. When
+ * creating the pool the caller will check this for us since we won't
+ * technically have the feature activated yet.
+ */
+ if (parentdd != NULL &&
+ !spa_feature_is_enabled(parentdd->dd_pool->dp_spa,
+ SPA_FEATURE_ENCRYPTION)) {
+ return (SET_ERROR(EOPNOTSUPP));
+ }
+
+ /* Check for errata #4 (encryption enabled, bookmark_v2 disabled) */
+ if (parentdd != NULL &&
+ !spa_feature_is_enabled(parentdd->dd_pool->dp_spa,
+ SPA_FEATURE_BOOKMARK_V2)) {
+ return (SET_ERROR(EOPNOTSUPP));
+ }
+
+ /* handle inheritance */
+ if (dcp->cp_wkey == NULL) {
+ ASSERT3P(parentdd, !=, NULL);
+
+ /* key must be fully unspecified */
+ if (dcp->cp_keylocation != NULL)
+ return (SET_ERROR(EINVAL));
+
+ /* parent must have a key to inherit */
+ if (pcrypt == ZIO_CRYPT_OFF)
+ return (SET_ERROR(EINVAL));
+
+ /* check for parent key */
+ ret = dmu_objset_check_wkey_loaded(parentdd);
+ if (ret != 0)
+ return (ret);
+
+ return (0);
+ }
+
+ /* At this point we should have a fully specified key. Check location */
+ if (dcp->cp_keylocation == NULL ||
+ !zfs_prop_valid_keylocation(dcp->cp_keylocation, B_TRUE))
+ return (SET_ERROR(EINVAL));
+
+ /* Must have fully specified keyformat */
+ switch (dcp->cp_wkey->wk_keyformat) {
+ case ZFS_KEYFORMAT_HEX:
+ case ZFS_KEYFORMAT_RAW:
+ /* requires no pbkdf2 iters and salt */
+ if (dcp->cp_wkey->wk_salt != 0 || dcp->cp_wkey->wk_iters != 0)
+ return (SET_ERROR(EINVAL));
+ break;
+ case ZFS_KEYFORMAT_PASSPHRASE:
+ /* requires pbkdf2 iters and salt */
+ if (dcp->cp_wkey->wk_salt == 0 ||
+ dcp->cp_wkey->wk_iters < MIN_PBKDF2_ITERATIONS)
+ return (SET_ERROR(EINVAL));
+ break;
+ case ZFS_KEYFORMAT_NONE:
+ default:
+ /* keyformat must be specified and valid */
+ return (SET_ERROR(EINVAL));
+ }
+
+ return (0);
+}
+
+void
+dsl_dataset_create_crypt_sync(uint64_t dsobj, dsl_dir_t *dd,
+ dsl_dataset_t *origin, dsl_crypto_params_t *dcp, dmu_tx_t *tx)
+{
+ dsl_pool_t *dp = dd->dd_pool;
+ uint64_t crypt;
+ dsl_wrapping_key_t *wkey;
+
+ /* clones always use their origin's wrapping key */
+ if (dsl_dir_is_clone(dd)) {
+ ASSERT3P(dcp, ==, NULL);
+
+ /*
+ * If this is an encrypted clone we just need to clone the
+ * dck into dd. Zapify the dd so we can do that.
+ */
+ if (origin->ds_dir->dd_crypto_obj != 0) {
+ dmu_buf_will_dirty(dd->dd_dbuf, tx);
+ dsl_dir_zapify(dd, tx);
+
+ dd->dd_crypto_obj =
+ dsl_crypto_key_clone_sync(origin->ds_dir, tx);
+ VERIFY0(zap_add(dp->dp_meta_objset, dd->dd_object,
+ DD_FIELD_CRYPTO_KEY_OBJ, sizeof (uint64_t), 1,
+ &dd->dd_crypto_obj, tx));
+ }
+
+ return;
+ }
+
+ /*
+ * A NULL dcp at this point indicates this is the origin dataset
+ * which does not have an objset to encrypt. Raw receives will handle
+ * encryption separately later. In both cases we can simply return.
+ */
+ if (dcp == NULL || dcp->cp_cmd == DCP_CMD_RAW_RECV)
+ return;
+
+ crypt = dcp->cp_crypt;
+ wkey = dcp->cp_wkey;
+
+ /* figure out the effective crypt */
+ if (crypt == ZIO_CRYPT_INHERIT && dd->dd_parent != NULL)
+ VERIFY0(dsl_dir_get_crypt(dd->dd_parent, &crypt));
+
+ /* if we aren't doing encryption just return */
+ if (crypt == ZIO_CRYPT_OFF || crypt == ZIO_CRYPT_INHERIT)
+ return;
+
+ /* zapify the dd so that we can add the crypto key obj to it */
+ dmu_buf_will_dirty(dd->dd_dbuf, tx);
+ dsl_dir_zapify(dd, tx);
+
+ /* use the new key if given or inherit from the parent */
+ if (wkey == NULL) {
+ VERIFY0(spa_keystore_wkey_hold_dd(dp->dp_spa,
+ dd->dd_parent, FTAG, &wkey));
+ } else {
+ wkey->wk_ddobj = dd->dd_object;
+ }
+
+ ASSERT3P(wkey, !=, NULL);
+
+ /* Create or clone the DSL crypto key and activate the feature */
+ dd->dd_crypto_obj = dsl_crypto_key_create_sync(crypt, wkey, tx);
+ VERIFY0(zap_add(dp->dp_meta_objset, dd->dd_object,
+ DD_FIELD_CRYPTO_KEY_OBJ, sizeof (uint64_t), 1, &dd->dd_crypto_obj,
+ tx));
+ dsl_dataset_activate_feature(dsobj, SPA_FEATURE_ENCRYPTION,
+ (void *)B_TRUE, tx);
+
+ /*
+ * If we inherited the wrapping key we release our reference now.
+ * Otherwise, this is a new key and we need to load it into the
+ * keystore.
+ */
+ if (dcp->cp_wkey == NULL) {
+ dsl_wrapping_key_rele(wkey, FTAG);
+ } else {
+ VERIFY0(spa_keystore_load_wkey_impl(dp->dp_spa, wkey));
+ }
+}
+
+typedef struct dsl_crypto_recv_key_arg {
+ uint64_t dcrka_dsobj;
+ uint64_t dcrka_fromobj;
+ dmu_objset_type_t dcrka_ostype;
+ nvlist_t *dcrka_nvl;
+ boolean_t dcrka_do_key;
+} dsl_crypto_recv_key_arg_t;
+
+static int
+dsl_crypto_recv_raw_objset_check(dsl_dataset_t *ds, dsl_dataset_t *fromds,
+ dmu_objset_type_t ostype, nvlist_t *nvl, dmu_tx_t *tx)
+{
+ int ret;
+ objset_t *os;
+ dnode_t *mdn;
+ uint8_t *buf = NULL;
+ uint_t len;
+ uint64_t intval, nlevels, blksz, ibs;
+ uint64_t nblkptr, maxblkid;
+
+ if (ostype != DMU_OST_ZFS && ostype != DMU_OST_ZVOL)
+ return (SET_ERROR(EINVAL));
+
+ /* raw receives also need info about the structure of the metadnode */
+ ret = nvlist_lookup_uint64(nvl, "mdn_compress", &intval);
+ if (ret != 0 || intval >= ZIO_COMPRESS_LEGACY_FUNCTIONS)
+ return (SET_ERROR(EINVAL));
+
+ ret = nvlist_lookup_uint64(nvl, "mdn_checksum", &intval);
+ if (ret != 0 || intval >= ZIO_CHECKSUM_LEGACY_FUNCTIONS)
+ return (SET_ERROR(EINVAL));
+
+ ret = nvlist_lookup_uint64(nvl, "mdn_nlevels", &nlevels);
+ if (ret != 0 || nlevels > DN_MAX_LEVELS)
+ return (SET_ERROR(EINVAL));
+
+ ret = nvlist_lookup_uint64(nvl, "mdn_blksz", &blksz);
+ if (ret != 0 || blksz < SPA_MINBLOCKSIZE)
+ return (SET_ERROR(EINVAL));
+ else if (blksz > spa_maxblocksize(tx->tx_pool->dp_spa))
+ return (SET_ERROR(ENOTSUP));
+
+ ret = nvlist_lookup_uint64(nvl, "mdn_indblkshift", &ibs);
+ if (ret != 0 || ibs < DN_MIN_INDBLKSHIFT || ibs > DN_MAX_INDBLKSHIFT)
+ return (SET_ERROR(ENOTSUP));
+
+ ret = nvlist_lookup_uint64(nvl, "mdn_nblkptr", &nblkptr);
+ if (ret != 0 || nblkptr != DN_MAX_NBLKPTR)
+ return (SET_ERROR(ENOTSUP));
+
+ ret = nvlist_lookup_uint64(nvl, "mdn_maxblkid", &maxblkid);
+ if (ret != 0)
+ return (SET_ERROR(EINVAL));
+
+ ret = nvlist_lookup_uint8_array(nvl, "portable_mac", &buf, &len);
+ if (ret != 0 || len != ZIO_OBJSET_MAC_LEN)
+ return (SET_ERROR(EINVAL));
+
+ ret = dmu_objset_from_ds(ds, &os);
+ if (ret != 0)
+ return (ret);
+
+ mdn = DMU_META_DNODE(os);
+
+ /*
+ * If we already created the objset, make sure its unchangeable
+ * properties match the ones received in the nvlist.
+ */
+ rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
+ if (!BP_IS_HOLE(dsl_dataset_get_blkptr(ds)) &&
+ (mdn->dn_nlevels != nlevels || mdn->dn_datablksz != blksz ||
+ mdn->dn_indblkshift != ibs || mdn->dn_nblkptr != nblkptr)) {
+ rrw_exit(&ds->ds_bp_rwlock, FTAG);
+ return (SET_ERROR(EINVAL));
+ }
+ rrw_exit(&ds->ds_bp_rwlock, FTAG);
+
+ /*
+ * Check that the ivset guid of the fromds matches the one from the
+ * send stream. Older versions of the encryption code did not have
+ * an ivset guid on the from dataset and did not send one in the
+ * stream. For these streams we provide the
+ * zfs_disable_ivset_guid_check tunable to allow these datasets to
+ * be received with a generated ivset guid.
+ */
+ if (fromds != NULL && !zfs_disable_ivset_guid_check) {
+ uint64_t from_ivset_guid = 0;
+ intval = 0;
+
+ (void) nvlist_lookup_uint64(nvl, "from_ivset_guid", &intval);
+ (void) zap_lookup(tx->tx_pool->dp_meta_objset,
+ fromds->ds_object, DS_FIELD_IVSET_GUID,
+ sizeof (from_ivset_guid), 1, &from_ivset_guid);
+
+ if (intval == 0 || from_ivset_guid == 0)
+ return (SET_ERROR(ZFS_ERR_FROM_IVSET_GUID_MISSING));
+
+ if (intval != from_ivset_guid)
+ return (SET_ERROR(ZFS_ERR_FROM_IVSET_GUID_MISMATCH));
+ }
+
+ return (0);
+}
+
+static void
+dsl_crypto_recv_raw_objset_sync(dsl_dataset_t *ds, dmu_objset_type_t ostype,
+ nvlist_t *nvl, dmu_tx_t *tx)
+{
+ dsl_pool_t *dp = tx->tx_pool;
+ objset_t *os;
+ dnode_t *mdn;
+ zio_t *zio;
+ uint8_t *portable_mac;
+ uint_t len;
+ uint64_t compress, checksum, nlevels, blksz, ibs, maxblkid;
+ boolean_t newds = B_FALSE;
+
+ VERIFY0(dmu_objset_from_ds(ds, &os));
+ mdn = DMU_META_DNODE(os);
+
+ /*
+ * Fetch the values we need from the nvlist. "to_ivset_guid" must
+ * be set on the snapshot, which doesn't exist yet. The receive
+ * code will take care of this for us later.
+ */
+ compress = fnvlist_lookup_uint64(nvl, "mdn_compress");
+ checksum = fnvlist_lookup_uint64(nvl, "mdn_checksum");
+ nlevels = fnvlist_lookup_uint64(nvl, "mdn_nlevels");
+ blksz = fnvlist_lookup_uint64(nvl, "mdn_blksz");
+ ibs = fnvlist_lookup_uint64(nvl, "mdn_indblkshift");
+ maxblkid = fnvlist_lookup_uint64(nvl, "mdn_maxblkid");
+ VERIFY0(nvlist_lookup_uint8_array(nvl, "portable_mac", &portable_mac,
+ &len));
+
+ /* if we haven't created an objset for the ds yet, do that now */
+ rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
+ if (BP_IS_HOLE(dsl_dataset_get_blkptr(ds))) {
+ (void) dmu_objset_create_impl_dnstats(dp->dp_spa, ds,
+ dsl_dataset_get_blkptr(ds), ostype, nlevels, blksz,
+ ibs, tx);
+ newds = B_TRUE;
+ }
+ rrw_exit(&ds->ds_bp_rwlock, FTAG);
+
+ /*
+ * Set the portable MAC. The local MAC will always be zero since the
+ * incoming data will all be portable and user accounting will be
+ * deferred until the next mount. Afterwards, flag the os to be
+ * written out raw next time.
+ */
+ arc_release(os->os_phys_buf, &os->os_phys_buf);
+ bcopy(portable_mac, os->os_phys->os_portable_mac, ZIO_OBJSET_MAC_LEN);
+ os->os_phys->os_flags &= ~OBJSET_FLAG_USERACCOUNTING_COMPLETE;
+ os->os_phys->os_flags &= ~OBJSET_FLAG_USEROBJACCOUNTING_COMPLETE;
+ os->os_flags = os->os_phys->os_flags;
+ bzero(os->os_phys->os_local_mac, ZIO_OBJSET_MAC_LEN);
+ os->os_next_write_raw[tx->tx_txg & TXG_MASK] = B_TRUE;
+
+ /* set metadnode compression and checksum */
+ mdn->dn_compress = compress;
+ mdn->dn_checksum = checksum;
+
+ rw_enter(&mdn->dn_struct_rwlock, RW_WRITER);
+ dnode_new_blkid(mdn, maxblkid, tx, B_FALSE, B_TRUE);
+ rw_exit(&mdn->dn_struct_rwlock);
+
+ /*
+ * We can't normally dirty the dataset in syncing context unless
+ * we are creating a new dataset. In this case, we perform a
+ * pseudo txg sync here instead.
+ */
+ if (newds) {
+ dsl_dataset_dirty(ds, tx);
+ } else {
+ zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
+ dsl_dataset_sync(ds, zio, tx);
+ VERIFY0(zio_wait(zio));
+
+ /* dsl_dataset_sync_done will drop this reference. */
+ dmu_buf_add_ref(ds->ds_dbuf, ds);
+ dsl_dataset_sync_done(ds, tx);
+ }
+}
+
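+/*
+ * Validate the DSL Crypto Key portion of the nvlist received with a raw
+ * send stream before dsl_crypto_recv_raw_key_sync() writes it to disk.
+ */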
+int
+dsl_crypto_recv_raw_key_check(dsl_dataset_t *ds, nvlist_t *nvl, dmu_tx_t *tx)
+{
+ int ret;
+ objset_t *mos = tx->tx_pool->dp_meta_objset;
+ uint8_t *buf = NULL;
+ uint_t len;
+ uint64_t intval, key_guid, version;
+ boolean_t is_passphrase = B_FALSE;
+
+ ASSERT(dsl_dataset_phys(ds)->ds_flags & DS_FLAG_INCONSISTENT);
+
+ /*
+ * Read and check all the encryption values from the nvlist. We need
+ * all of the fields of a DSL Crypto Key, as well as a fully specified
+ * wrapping key.
+ */
+ ret = nvlist_lookup_uint64(nvl, DSL_CRYPTO_KEY_CRYPTO_SUITE, &intval);
+ if (ret != 0 || intval >= ZIO_CRYPT_FUNCTIONS ||
+ intval <= ZIO_CRYPT_OFF)
+ return (SET_ERROR(EINVAL));
+
+ ret = nvlist_lookup_uint64(nvl, DSL_CRYPTO_KEY_GUID, &intval);
+ if (ret != 0)
+ return (SET_ERROR(EINVAL));
+
+ /*
+ * If this is an incremental receive make sure the given key guid
+ * matches the one we already have.
+ */
+ if (ds->ds_dir->dd_crypto_obj != 0) {
+ ret = zap_lookup(mos, ds->ds_dir->dd_crypto_obj,
+ DSL_CRYPTO_KEY_GUID, 8, 1, &key_guid);
+ if (ret != 0)
+ return (ret);
+ if (intval != key_guid)
+ return (SET_ERROR(EACCES));
+ }
+
+ ret = nvlist_lookup_uint8_array(nvl, DSL_CRYPTO_KEY_MASTER_KEY,
+ &buf, &len);
+ if (ret != 0 || len != MASTER_KEY_MAX_LEN)
+ return (SET_ERROR(EINVAL));
+
+ ret = nvlist_lookup_uint8_array(nvl, DSL_CRYPTO_KEY_HMAC_KEY,
+ &buf, &len);
+ if (ret != 0 || len != SHA512_HMAC_KEYLEN)
+ return (SET_ERROR(EINVAL));
+
+ ret = nvlist_lookup_uint8_array(nvl, DSL_CRYPTO_KEY_IV, &buf, &len);
+ if (ret != 0 || len != WRAPPING_IV_LEN)
+ return (SET_ERROR(EINVAL));
+
+ ret = nvlist_lookup_uint8_array(nvl, DSL_CRYPTO_KEY_MAC, &buf, &len);
+ if (ret != 0 || len != WRAPPING_MAC_LEN)
+ return (SET_ERROR(EINVAL));
+
+ /*
+ * We don't support receiving old on-disk formats. The version 0
+ * implementation protected several fields in an objset that were
+ * not always portable during a raw receive. As a result, we call
+ * the old version an on-disk errata #3.
+ */
+ ret = nvlist_lookup_uint64(nvl, DSL_CRYPTO_KEY_VERSION, &version);
+ if (ret != 0 || version != ZIO_CRYPT_KEY_CURRENT_VERSION)
+ return (SET_ERROR(ENOTSUP));
+
+ ret = nvlist_lookup_uint64(nvl, zfs_prop_to_name(ZFS_PROP_KEYFORMAT),
+ &intval);
+ if (ret != 0 || intval >= ZFS_KEYFORMAT_FORMATS ||
+ intval == ZFS_KEYFORMAT_NONE)
+ return (SET_ERROR(EINVAL));
+
+ is_passphrase = (intval == ZFS_KEYFORMAT_PASSPHRASE);
+
+ /*
+	 * For raw receives we allow any number of pbkdf2iters since there
+	 * won't be a chance for the user to change it. The iters and salt
+	 * looked up below must be nonzero exactly when the keyformat is a
+	 * passphrase.
+ */
+ ret = nvlist_lookup_uint64(nvl, zfs_prop_to_name(ZFS_PROP_PBKDF2_ITERS),
+ &intval);
+ if (ret != 0 || (is_passphrase == (intval == 0)))
+ return (SET_ERROR(EINVAL));
+
+ ret = nvlist_lookup_uint64(nvl, zfs_prop_to_name(ZFS_PROP_PBKDF2_SALT),
+ &intval);
+ if (ret != 0 || (is_passphrase == (intval == 0)))
+ return (SET_ERROR(EINVAL));
+
+ return (0);
+}
+
+void
+dsl_crypto_recv_raw_key_sync(dsl_dataset_t *ds, nvlist_t *nvl, dmu_tx_t *tx)
+{
+ dsl_pool_t *dp = tx->tx_pool;
+ objset_t *mos = dp->dp_meta_objset;
+ dsl_dir_t *dd = ds->ds_dir;
+ uint_t len;
+ uint64_t rddobj, one = 1;
+ uint8_t *keydata, *hmac_keydata, *iv, *mac;
+ uint64_t crypt, key_guid, keyformat, iters, salt;
+ uint64_t version = ZIO_CRYPT_KEY_CURRENT_VERSION;
+ char *keylocation = "prompt";
+
+ /* lookup the values we need to create the DSL Crypto Key */
+ crypt = fnvlist_lookup_uint64(nvl, DSL_CRYPTO_KEY_CRYPTO_SUITE);
+ key_guid = fnvlist_lookup_uint64(nvl, DSL_CRYPTO_KEY_GUID);
+ keyformat = fnvlist_lookup_uint64(nvl,
+ zfs_prop_to_name(ZFS_PROP_KEYFORMAT));
+ iters = fnvlist_lookup_uint64(nvl,
+ zfs_prop_to_name(ZFS_PROP_PBKDF2_ITERS));
+ salt = fnvlist_lookup_uint64(nvl,
+ zfs_prop_to_name(ZFS_PROP_PBKDF2_SALT));
+ VERIFY0(nvlist_lookup_uint8_array(nvl, DSL_CRYPTO_KEY_MASTER_KEY,
+ &keydata, &len));
+ VERIFY0(nvlist_lookup_uint8_array(nvl, DSL_CRYPTO_KEY_HMAC_KEY,
+ &hmac_keydata, &len));
+ VERIFY0(nvlist_lookup_uint8_array(nvl, DSL_CRYPTO_KEY_IV, &iv, &len));
+ VERIFY0(nvlist_lookup_uint8_array(nvl, DSL_CRYPTO_KEY_MAC, &mac, &len));
+
+ /* if this is a new dataset setup the DSL Crypto Key. */
+ if (dd->dd_crypto_obj == 0) {
+ /* zapify the dsl dir so we can add the key object to it */
+ dmu_buf_will_dirty(dd->dd_dbuf, tx);
+ dsl_dir_zapify(dd, tx);
+
+ /* create the DSL Crypto Key on disk and activate the feature */
+ dd->dd_crypto_obj = zap_create(mos,
+ DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx);
+ VERIFY0(zap_update(tx->tx_pool->dp_meta_objset,
+ dd->dd_crypto_obj, DSL_CRYPTO_KEY_REFCOUNT,
+ sizeof (uint64_t), 1, &one, tx));
+ VERIFY0(zap_update(tx->tx_pool->dp_meta_objset,
+ dd->dd_crypto_obj, DSL_CRYPTO_KEY_VERSION,
+ sizeof (uint64_t), 1, &version, tx));
+
+ dsl_dataset_activate_feature(ds->ds_object,
+ SPA_FEATURE_ENCRYPTION, (void *)B_TRUE, tx);
+ ds->ds_feature[SPA_FEATURE_ENCRYPTION] = (void *)B_TRUE;
+
+ /* save the dd_crypto_obj on disk */
+ VERIFY0(zap_add(mos, dd->dd_object, DD_FIELD_CRYPTO_KEY_OBJ,
+ sizeof (uint64_t), 1, &dd->dd_crypto_obj, tx));
+
+ /*
+ * Set the keylocation to prompt by default. If keylocation
+ * has been provided via the properties, this will be overridden
+ * later.
+ */
+ dsl_prop_set_sync_impl(ds,
+ zfs_prop_to_name(ZFS_PROP_KEYLOCATION),
+ ZPROP_SRC_LOCAL, 1, strlen(keylocation) + 1,
+ keylocation, tx);
+
+ rddobj = dd->dd_object;
+ } else {
+ VERIFY0(dsl_dir_get_encryption_root_ddobj(dd, &rddobj));
+ }
+
+ /* sync the key data to the ZAP object on disk */
+ dsl_crypto_key_sync_impl(mos, dd->dd_crypto_obj, crypt,
+ rddobj, key_guid, iv, mac, keydata, hmac_keydata, keyformat, salt,
+ iters, tx);
+}
+
+static int
+dsl_crypto_recv_key_check(void *arg, dmu_tx_t *tx)
+{
+ int ret;
+ dsl_crypto_recv_key_arg_t *dcrka = arg;
+ dsl_dataset_t *ds = NULL, *fromds = NULL;
+
+ ret = dsl_dataset_hold_obj(tx->tx_pool, dcrka->dcrka_dsobj,
+ FTAG, &ds);
+ if (ret != 0)
+ goto out;
+
+ if (dcrka->dcrka_fromobj != 0) {
+ ret = dsl_dataset_hold_obj(tx->tx_pool, dcrka->dcrka_fromobj,
+ FTAG, &fromds);
+ if (ret != 0)
+ goto out;
+ }
+
+ ret = dsl_crypto_recv_raw_objset_check(ds, fromds,
+ dcrka->dcrka_ostype, dcrka->dcrka_nvl, tx);
+ if (ret != 0)
+ goto out;
+
+ /*
+ * We run this check even if we won't be doing this part of
+ * the receive now so that we don't make the user wait until
+ * the receive finishes to fail.
+ */
+ ret = dsl_crypto_recv_raw_key_check(ds, dcrka->dcrka_nvl, tx);
+ if (ret != 0)
+ goto out;
+
+out:
+ if (ds != NULL)
+ dsl_dataset_rele(ds, FTAG);
+ if (fromds != NULL)
+ dsl_dataset_rele(fromds, FTAG);
+ return (ret);
+}
+
+static void
+dsl_crypto_recv_key_sync(void *arg, dmu_tx_t *tx)
+{
+ dsl_crypto_recv_key_arg_t *dcrka = arg;
+ dsl_dataset_t *ds;
+
+ VERIFY0(dsl_dataset_hold_obj(tx->tx_pool, dcrka->dcrka_dsobj,
+ FTAG, &ds));
+ dsl_crypto_recv_raw_objset_sync(ds, dcrka->dcrka_ostype,
+ dcrka->dcrka_nvl, tx);
+ if (dcrka->dcrka_do_key)
+ dsl_crypto_recv_raw_key_sync(ds, dcrka->dcrka_nvl, tx);
+ dsl_dataset_rele(ds, FTAG);
+}
+
+/*
+ * This function is used to sync an nvlist representing a DSL Crypto Key and
+ * the associated encryption parameters. The key will be written exactly as is
+ * without wrapping it.
+ */
+int
+dsl_crypto_recv_raw(const char *poolname, uint64_t dsobj, uint64_t fromobj,
+ dmu_objset_type_t ostype, nvlist_t *nvl, boolean_t do_key)
+{
+ dsl_crypto_recv_key_arg_t dcrka;
+
+ dcrka.dcrka_dsobj = dsobj;
+ dcrka.dcrka_fromobj = fromobj;
+ dcrka.dcrka_ostype = ostype;
+ dcrka.dcrka_nvl = nvl;
+ dcrka.dcrka_do_key = do_key;
+
+ return (dsl_sync_task(poolname, dsl_crypto_recv_key_check,
+ dsl_crypto_recv_key_sync, &dcrka, 1, ZFS_SPACE_CHECK_NORMAL));
+}
+
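+/*
+ * Gather the wrapped DSL Crypto Key fields and metadnode properties for this
+ * objset into an nvlist for a raw send, mirroring the checks performed by
+ * dsl_crypto_recv_raw_objset_check() and dsl_crypto_recv_raw_key_check().
+ */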
+int
+dsl_crypto_populate_key_nvlist(objset_t *os, uint64_t from_ivset_guid,
+ nvlist_t **nvl_out)
+{
+ int ret;
+ dsl_dataset_t *ds = os->os_dsl_dataset;
+ dnode_t *mdn;
+ uint64_t rddobj;
+ nvlist_t *nvl = NULL;
+ uint64_t dckobj = ds->ds_dir->dd_crypto_obj;
+ dsl_dir_t *rdd = NULL;
+ dsl_pool_t *dp = ds->ds_dir->dd_pool;
+ objset_t *mos = dp->dp_meta_objset;
+ uint64_t crypt = 0, key_guid = 0, format = 0;
+ uint64_t iters = 0, salt = 0, version = 0;
+ uint64_t to_ivset_guid = 0;
+ uint8_t raw_keydata[MASTER_KEY_MAX_LEN];
+ uint8_t raw_hmac_keydata[SHA512_HMAC_KEYLEN];
+ uint8_t iv[WRAPPING_IV_LEN];
+ uint8_t mac[WRAPPING_MAC_LEN];
+
+ ASSERT(dckobj != 0);
+
+ mdn = DMU_META_DNODE(os);
+
+ nvl = fnvlist_alloc();
+
+ /* lookup values from the DSL Crypto Key */
+ ret = zap_lookup(mos, dckobj, DSL_CRYPTO_KEY_CRYPTO_SUITE, 8, 1,
+ &crypt);
+ if (ret != 0)
+ goto error;
+
+ ret = zap_lookup(mos, dckobj, DSL_CRYPTO_KEY_GUID, 8, 1, &key_guid);
+ if (ret != 0)
+ goto error;
+
+ ret = zap_lookup(mos, dckobj, DSL_CRYPTO_KEY_MASTER_KEY, 1,
+ MASTER_KEY_MAX_LEN, raw_keydata);
+ if (ret != 0)
+ goto error;
+
+ ret = zap_lookup(mos, dckobj, DSL_CRYPTO_KEY_HMAC_KEY, 1,
+ SHA512_HMAC_KEYLEN, raw_hmac_keydata);
+ if (ret != 0)
+ goto error;
+
+ ret = zap_lookup(mos, dckobj, DSL_CRYPTO_KEY_IV, 1, WRAPPING_IV_LEN,
+ iv);
+ if (ret != 0)
+ goto error;
+
+ ret = zap_lookup(mos, dckobj, DSL_CRYPTO_KEY_MAC, 1, WRAPPING_MAC_LEN,
+ mac);
+ if (ret != 0)
+ goto error;
+
+ /* see zfs_disable_ivset_guid_check tunable for errata info */
+ ret = zap_lookup(mos, ds->ds_object, DS_FIELD_IVSET_GUID, 8, 1,
+ &to_ivset_guid);
+ if (ret != 0)
+ ASSERT3U(dp->dp_spa->spa_errata, !=, 0);
+
+ /*
+ * We don't support raw sends of legacy on-disk formats. See the
+ * comment in dsl_crypto_recv_key_check() for details.
+ */
+ ret = zap_lookup(mos, dckobj, DSL_CRYPTO_KEY_VERSION, 8, 1, &version);
+ if (ret != 0 || version != ZIO_CRYPT_KEY_CURRENT_VERSION) {
+ dp->dp_spa->spa_errata = ZPOOL_ERRATA_ZOL_6845_ENCRYPTION;
+ ret = SET_ERROR(ENOTSUP);
+ goto error;
+ }
+
+ /*
+ * Lookup wrapping key properties. An early version of the code did
+ * not correctly add these values to the wrapping key or the DSL
+ * Crypto Key on disk for non encryption roots, so to be safe we
+ * always take the slightly circuitous route of looking it up from
+ * the encryption root's key.
+ */
+ ret = dsl_dir_get_encryption_root_ddobj(ds->ds_dir, &rddobj);
+ if (ret != 0)
+ goto error;
+
+ dsl_pool_config_enter(dp, FTAG);
+
+ ret = dsl_dir_hold_obj(dp, rddobj, NULL, FTAG, &rdd);
+ if (ret != 0)
+ goto error_unlock;
+
+ ret = zap_lookup(dp->dp_meta_objset, rdd->dd_crypto_obj,
+ zfs_prop_to_name(ZFS_PROP_KEYFORMAT), 8, 1, &format);
+ if (ret != 0)
+ goto error_unlock;
+
+ if (format == ZFS_KEYFORMAT_PASSPHRASE) {
+ ret = zap_lookup(dp->dp_meta_objset, rdd->dd_crypto_obj,
+ zfs_prop_to_name(ZFS_PROP_PBKDF2_ITERS), 8, 1, &iters);
+ if (ret != 0)
+ goto error_unlock;
+
+ ret = zap_lookup(dp->dp_meta_objset, rdd->dd_crypto_obj,
+ zfs_prop_to_name(ZFS_PROP_PBKDF2_SALT), 8, 1, &salt);
+ if (ret != 0)
+ goto error_unlock;
+ }
+
+ dsl_dir_rele(rdd, FTAG);
+ dsl_pool_config_exit(dp, FTAG);
+
+ fnvlist_add_uint64(nvl, DSL_CRYPTO_KEY_CRYPTO_SUITE, crypt);
+ fnvlist_add_uint64(nvl, DSL_CRYPTO_KEY_GUID, key_guid);
+ fnvlist_add_uint64(nvl, DSL_CRYPTO_KEY_VERSION, version);
+ VERIFY0(nvlist_add_uint8_array(nvl, DSL_CRYPTO_KEY_MASTER_KEY,
+ raw_keydata, MASTER_KEY_MAX_LEN));
+ VERIFY0(nvlist_add_uint8_array(nvl, DSL_CRYPTO_KEY_HMAC_KEY,
+ raw_hmac_keydata, SHA512_HMAC_KEYLEN));
+ VERIFY0(nvlist_add_uint8_array(nvl, DSL_CRYPTO_KEY_IV, iv,
+ WRAPPING_IV_LEN));
+ VERIFY0(nvlist_add_uint8_array(nvl, DSL_CRYPTO_KEY_MAC, mac,
+ WRAPPING_MAC_LEN));
+ VERIFY0(nvlist_add_uint8_array(nvl, "portable_mac",
+ os->os_phys->os_portable_mac, ZIO_OBJSET_MAC_LEN));
+ fnvlist_add_uint64(nvl, zfs_prop_to_name(ZFS_PROP_KEYFORMAT), format);
+ fnvlist_add_uint64(nvl, zfs_prop_to_name(ZFS_PROP_PBKDF2_ITERS), iters);
+ fnvlist_add_uint64(nvl, zfs_prop_to_name(ZFS_PROP_PBKDF2_SALT), salt);
+ fnvlist_add_uint64(nvl, "mdn_checksum", mdn->dn_checksum);
+ fnvlist_add_uint64(nvl, "mdn_compress", mdn->dn_compress);
+ fnvlist_add_uint64(nvl, "mdn_nlevels", mdn->dn_nlevels);
+ fnvlist_add_uint64(nvl, "mdn_blksz", mdn->dn_datablksz);
+ fnvlist_add_uint64(nvl, "mdn_indblkshift", mdn->dn_indblkshift);
+ fnvlist_add_uint64(nvl, "mdn_nblkptr", mdn->dn_nblkptr);
+ fnvlist_add_uint64(nvl, "mdn_maxblkid", mdn->dn_maxblkid);
+ fnvlist_add_uint64(nvl, "to_ivset_guid", to_ivset_guid);
+ fnvlist_add_uint64(nvl, "from_ivset_guid", from_ivset_guid);
+
+ *nvl_out = nvl;
+ return (0);
+
+error_unlock:
+ dsl_pool_config_exit(dp, FTAG);
+error:
+ if (rdd != NULL)
+ dsl_dir_rele(rdd, FTAG);
+ nvlist_free(nvl);
+
+ *nvl_out = NULL;
+ return (ret);
+}
+
+uint64_t
+dsl_crypto_key_create_sync(uint64_t crypt, dsl_wrapping_key_t *wkey,
+ dmu_tx_t *tx)
+{
+ dsl_crypto_key_t dck;
+ uint64_t version = ZIO_CRYPT_KEY_CURRENT_VERSION;
+ uint64_t one = 1ULL;
+
+ ASSERT(dmu_tx_is_syncing(tx));
+ ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS);
+ ASSERT3U(crypt, >, ZIO_CRYPT_OFF);
+
+ /* create the DSL Crypto Key ZAP object */
+ dck.dck_obj = zap_create(tx->tx_pool->dp_meta_objset,
+ DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx);
+
+ /* fill in the key (on the stack) and sync it to disk */
+ dck.dck_wkey = wkey;
+ VERIFY0(zio_crypt_key_init(crypt, &dck.dck_key));
+
+ dsl_crypto_key_sync(&dck, tx);
+ VERIFY0(zap_update(tx->tx_pool->dp_meta_objset, dck.dck_obj,
+ DSL_CRYPTO_KEY_REFCOUNT, sizeof (uint64_t), 1, &one, tx));
+ VERIFY0(zap_update(tx->tx_pool->dp_meta_objset, dck.dck_obj,
+ DSL_CRYPTO_KEY_VERSION, sizeof (uint64_t), 1, &version, tx));
+
+ zio_crypt_key_destroy(&dck.dck_key);
+ bzero(&dck.dck_key, sizeof (zio_crypt_key_t));
+
+ return (dck.dck_obj);
+}
+
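+/*
+ * Clones share their origin's DSL Crypto Key, so cloning just takes another
+ * reference on the existing key object.
+ */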
+uint64_t
+dsl_crypto_key_clone_sync(dsl_dir_t *origindd, dmu_tx_t *tx)
+{
+ objset_t *mos = tx->tx_pool->dp_meta_objset;
+
+ ASSERT(dmu_tx_is_syncing(tx));
+
+ VERIFY0(zap_increment(mos, origindd->dd_crypto_obj,
+ DSL_CRYPTO_KEY_REFCOUNT, 1, tx));
+
+ return (origindd->dd_crypto_obj);
+}
+
+void
+dsl_crypto_key_destroy_sync(uint64_t dckobj, dmu_tx_t *tx)
+{
+ objset_t *mos = tx->tx_pool->dp_meta_objset;
+ uint64_t refcnt;
+
+ /* Decrement the refcount, destroy if this is the last reference */
+ VERIFY0(zap_lookup(mos, dckobj, DSL_CRYPTO_KEY_REFCOUNT,
+ sizeof (uint64_t), 1, &refcnt));
+
+ if (refcnt != 1) {
+ VERIFY0(zap_increment(mos, dckobj, DSL_CRYPTO_KEY_REFCOUNT,
+ -1, tx));
+ } else {
+ VERIFY0(zap_destroy(mos, dckobj, tx));
+ }
+}
+
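+/*
+ * Add this dataset's encryption-related properties (key status, suite, key
+ * guid, keyformat, pbkdf2 parameters, IV set guid, encryption root) to nv.
+ */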
+void
+dsl_dataset_crypt_stats(dsl_dataset_t *ds, nvlist_t *nv)
+{
+ uint64_t intval;
+ dsl_dir_t *dd = ds->ds_dir;
+ dsl_dir_t *enc_root;
+ char buf[ZFS_MAX_DATASET_NAME_LEN];
+
+ if (dd->dd_crypto_obj == 0)
+ return;
+
+ intval = dsl_dataset_get_keystatus(dd);
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_KEYSTATUS, intval);
+
+ if (dsl_dir_get_crypt(dd, &intval) == 0)
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_ENCRYPTION, intval);
+ if (zap_lookup(dd->dd_pool->dp_meta_objset, dd->dd_crypto_obj,
+ DSL_CRYPTO_KEY_GUID, 8, 1, &intval) == 0) {
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_KEY_GUID, intval);
+ }
+ if (zap_lookup(dd->dd_pool->dp_meta_objset, dd->dd_crypto_obj,
+ zfs_prop_to_name(ZFS_PROP_KEYFORMAT), 8, 1, &intval) == 0) {
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_KEYFORMAT, intval);
+ }
+ if (zap_lookup(dd->dd_pool->dp_meta_objset, dd->dd_crypto_obj,
+ zfs_prop_to_name(ZFS_PROP_PBKDF2_SALT), 8, 1, &intval) == 0) {
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_PBKDF2_SALT, intval);
+ }
+ if (zap_lookup(dd->dd_pool->dp_meta_objset, dd->dd_crypto_obj,
+ zfs_prop_to_name(ZFS_PROP_PBKDF2_ITERS), 8, 1, &intval) == 0) {
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_PBKDF2_ITERS, intval);
+ }
+ if (zap_lookup(dd->dd_pool->dp_meta_objset, ds->ds_object,
+ DS_FIELD_IVSET_GUID, 8, 1, &intval) == 0) {
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_IVSET_GUID, intval);
+ }
+
+ if (dsl_dir_get_encryption_root_ddobj(dd, &intval) == 0) {
+ if (dsl_dir_hold_obj(dd->dd_pool, intval, NULL, FTAG,
+ &enc_root) == 0) {
+ dsl_dir_name(enc_root, buf);
+ dsl_dir_rele(enc_root, FTAG);
+ dsl_prop_nvlist_add_string(nv,
+ ZFS_PROP_ENCRYPTION_ROOT, buf);
+ }
+ }
+}
+
+int
+spa_crypt_get_salt(spa_t *spa, uint64_t dsobj, uint8_t *salt)
+{
+ int ret;
+ dsl_crypto_key_t *dck = NULL;
+
+ /* look up the key from the spa's keystore */
+ ret = spa_keystore_lookup_key(spa, dsobj, FTAG, &dck);
+ if (ret != 0)
+ goto error;
+
+ ret = zio_crypt_key_get_salt(&dck->dck_key, salt);
+ if (ret != 0)
+ goto error;
+
+ spa_keystore_dsl_key_rele(spa, dck, FTAG);
+ return (0);
+
+error:
+ if (dck != NULL)
+ spa_keystore_dsl_key_rele(spa, dck, FTAG);
+ return (ret);
+}
+
+/*
+ * Objset blocks are a special case for MAC generation. These blocks have 2
+ * 256-bit MACs which are embedded within the block itself, rather than a
+ * single 128 bit MAC. As a result, this function handles encoding and decoding
+ * the MACs on its own, unlike other functions in this file.
+ */
+int
+spa_do_crypt_objset_mac_abd(boolean_t generate, spa_t *spa, uint64_t dsobj,
+ abd_t *abd, uint_t datalen, boolean_t byteswap)
+{
+ int ret;
+ dsl_crypto_key_t *dck = NULL;
+ void *buf = abd_borrow_buf_copy(abd, datalen);
+ objset_phys_t *osp = buf;
+ uint8_t portable_mac[ZIO_OBJSET_MAC_LEN];
+ uint8_t local_mac[ZIO_OBJSET_MAC_LEN];
+
+ /* look up the key from the spa's keystore */
+ ret = spa_keystore_lookup_key(spa, dsobj, FTAG, &dck);
+ if (ret != 0)
+ goto error;
+
+ /* calculate both HMACs */
+ ret = zio_crypt_do_objset_hmacs(&dck->dck_key, buf, datalen,
+ byteswap, portable_mac, local_mac);
+ if (ret != 0)
+ goto error;
+
+ spa_keystore_dsl_key_rele(spa, dck, FTAG);
+
+ /* if we are generating encode the HMACs in the objset_phys_t */
+ if (generate) {
+ bcopy(portable_mac, osp->os_portable_mac, ZIO_OBJSET_MAC_LEN);
+ bcopy(local_mac, osp->os_local_mac, ZIO_OBJSET_MAC_LEN);
+ abd_return_buf_copy(abd, buf, datalen);
+ return (0);
+ }
+
+ if (bcmp(portable_mac, osp->os_portable_mac, ZIO_OBJSET_MAC_LEN) != 0 ||
+ bcmp(local_mac, osp->os_local_mac, ZIO_OBJSET_MAC_LEN) != 0) {
+ abd_return_buf(abd, buf, datalen);
+ return (SET_ERROR(ECKSUM));
+ }
+
+ abd_return_buf(abd, buf, datalen);
+
+ return (0);
+
+error:
+ if (dck != NULL)
+ spa_keystore_dsl_key_rele(spa, dck, FTAG);
+ abd_return_buf(abd, buf, datalen);
+ return (ret);
+}
+
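+/*
+ * Compute the truncated HMAC for a single encrypted block. If generate is
+ * set the result is copied into mac; otherwise it is compared against the
+ * expected mac and ECKSUM is returned on a mismatch.
+ */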
+int
+spa_do_crypt_mac_abd(boolean_t generate, spa_t *spa, uint64_t dsobj, abd_t *abd,
+ uint_t datalen, uint8_t *mac)
+{
+ int ret;
+ dsl_crypto_key_t *dck = NULL;
+ uint8_t *buf = abd_borrow_buf_copy(abd, datalen);
+ uint8_t digestbuf[ZIO_DATA_MAC_LEN];
+
+ /* look up the key from the spa's keystore */
+ ret = spa_keystore_lookup_key(spa, dsobj, FTAG, &dck);
+ if (ret != 0)
+ goto error;
+
+ /* perform the hmac */
+ ret = zio_crypt_do_hmac(&dck->dck_key, buf, datalen,
+ digestbuf, ZIO_DATA_MAC_LEN);
+ if (ret != 0)
+ goto error;
+
+ abd_return_buf(abd, buf, datalen);
+ spa_keystore_dsl_key_rele(spa, dck, FTAG);
+
+ /*
+ * Truncate and fill in mac buffer if we were asked to generate a MAC.
+ * Otherwise verify that the MAC matched what we expected.
+ */
+ if (generate) {
+ bcopy(digestbuf, mac, ZIO_DATA_MAC_LEN);
+ return (0);
+ }
+
+ if (bcmp(digestbuf, mac, ZIO_DATA_MAC_LEN) != 0)
+ return (SET_ERROR(ECKSUM));
+
+ return (0);
+
+error:
+ if (dck != NULL)
+ spa_keystore_dsl_key_rele(spa, dck, FTAG);
+ abd_return_buf(abd, buf, datalen);
+ return (ret);
+}
+
+/*
+ * This function serves as a multiplexer for encryption and decryption of
+ * all blocks (except the L2ARC). For encryption, it will populate the IV,
+ * salt, MAC, and cabd (the ciphertext). On decryption it will simply use
+ * these fields to populate pabd (the plaintext).
+ */
+int
+spa_do_crypt_abd(boolean_t encrypt, spa_t *spa, const zbookmark_phys_t *zb,
+ dmu_object_type_t ot, boolean_t dedup, boolean_t bswap, uint8_t *salt,
+ uint8_t *iv, uint8_t *mac, uint_t datalen, abd_t *pabd, abd_t *cabd,
+ boolean_t *no_crypt)
+{
+ int ret;
+ dsl_crypto_key_t *dck = NULL;
+ uint8_t *plainbuf = NULL, *cipherbuf = NULL;
+
+ ASSERT(spa_feature_is_active(spa, SPA_FEATURE_ENCRYPTION));
+
+ /* look up the key from the spa's keystore */
+ ret = spa_keystore_lookup_key(spa, zb->zb_objset, FTAG, &dck);
+ if (ret != 0) {
+ ret = SET_ERROR(EACCES);
+ return (ret);
+ }
+
+ if (encrypt) {
+ plainbuf = abd_borrow_buf_copy(pabd, datalen);
+ cipherbuf = abd_borrow_buf(cabd, datalen);
+ } else {
+ plainbuf = abd_borrow_buf(pabd, datalen);
+ cipherbuf = abd_borrow_buf_copy(cabd, datalen);
+ }
+
+ /*
+ * Both encryption and decryption functions need a salt for key
+ * generation and an IV. When encrypting a non-dedup block, we
+ * generate the salt and IV randomly to be stored by the caller. Dedup
+ * blocks perform a (more expensive) HMAC of the plaintext to obtain
+ * the salt and the IV. ZIL blocks have their salt and IV generated
+ * at allocation time in zio_alloc_zil(). On decryption, we simply use
+ * the provided values.
+ */
+ if (encrypt && ot != DMU_OT_INTENT_LOG && !dedup) {
+ ret = zio_crypt_key_get_salt(&dck->dck_key, salt);
+ if (ret != 0)
+ goto error;
+
+ ret = zio_crypt_generate_iv(iv);
+ if (ret != 0)
+ goto error;
+ } else if (encrypt && dedup) {
+ ret = zio_crypt_generate_iv_salt_dedup(&dck->dck_key,
+ plainbuf, datalen, iv, salt);
+ if (ret != 0)
+ goto error;
+ }
+
+ /* call lower level function to perform encryption / decryption */
+ ret = zio_do_crypt_data(encrypt, &dck->dck_key, ot, bswap, salt, iv,
+ mac, datalen, plainbuf, cipherbuf, no_crypt);
+
+ /*
+ * Handle injected decryption faults. Unfortunately, we cannot inject
+ * faults for dnode blocks because we might trigger the panic in
+ * dbuf_prepare_encrypted_dnode_leaf(), which exists because syncing
+ * context is not prepared to handle malicious decryption failures.
+ */
+ if (zio_injection_enabled && !encrypt && ot != DMU_OT_DNODE && ret == 0)
+ ret = zio_handle_decrypt_injection(spa, zb, ot, ECKSUM);
+ if (ret != 0)
+ goto error;
+
+ if (encrypt) {
+ abd_return_buf(pabd, plainbuf, datalen);
+ abd_return_buf_copy(cabd, cipherbuf, datalen);
+ } else {
+ abd_return_buf_copy(pabd, plainbuf, datalen);
+ abd_return_buf(cabd, cipherbuf, datalen);
+ }
+
+ spa_keystore_dsl_key_rele(spa, dck, FTAG);
+
+ return (0);
+
+error:
+ if (encrypt) {
+ /* zero out any state we might have changed while encrypting */
+ bzero(salt, ZIO_DATA_SALT_LEN);
+ bzero(iv, ZIO_DATA_IV_LEN);
+ bzero(mac, ZIO_DATA_MAC_LEN);
+ abd_return_buf(pabd, plainbuf, datalen);
+ abd_return_buf_copy(cabd, cipherbuf, datalen);
+ } else {
+ abd_return_buf_copy(pabd, plainbuf, datalen);
+ abd_return_buf(cabd, cipherbuf, datalen);
+ }
+
+ spa_keystore_dsl_key_rele(spa, dck, FTAG);
+
+ return (ret);
+}
+
+ZFS_MODULE_PARAM(zfs, zfs_, disable_ivset_guid_check, INT, ZMOD_RW,
+ "Set to allow raw receives without IVset guids");
diff --git a/sys/contrib/openzfs/module/zfs/dsl_dataset.c b/sys/contrib/openzfs/module/zfs/dsl_dataset.c
new file mode 100644
index 000000000000..6da5faf01edf
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/dsl_dataset.c
@@ -0,0 +1,5014 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2020 by Delphix. All rights reserved.
+ * Copyright (c) 2014, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2014 RackTop Systems.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright (c) 2016 Actifio, Inc. All rights reserved.
+ * Copyright 2016, OmniTI Computer Consulting, Inc. All rights reserved.
+ * Copyright 2017 Nexenta Systems, Inc.
+ * Copyright (c) 2019, Klara Inc.
+ * Copyright (c) 2019, Allan Jude
+ * Copyright (c) 2020 The FreeBSD Foundation [1]
+ *
+ * [1] Portions of this software were developed by Allan Jude
+ * under sponsorship from the FreeBSD Foundation.
+ */
+
+#include <sys/dmu_objset.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_prop.h>
+#include <sys/dsl_synctask.h>
+#include <sys/dmu_traverse.h>
+#include <sys/dmu_impl.h>
+#include <sys/dmu_tx.h>
+#include <sys/arc.h>
+#include <sys/zio.h>
+#include <sys/zap.h>
+#include <sys/zfeature.h>
+#include <sys/unique.h>
+#include <sys/zfs_context.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/vdev.h>
+#include <sys/zfs_znode.h>
+#include <sys/zfs_onexit.h>
+#include <sys/zvol.h>
+#include <sys/dsl_scan.h>
+#include <sys/dsl_deadlist.h>
+#include <sys/dsl_destroy.h>
+#include <sys/dsl_userhold.h>
+#include <sys/dsl_bookmark.h>
+#include <sys/policy.h>
+#include <sys/dmu_send.h>
+#include <sys/dmu_recv.h>
+#include <sys/zio_compress.h>
+#include <zfs_fletcher.h>
+#include <sys/zio_checksum.h>
+
+/*
+ * The SPA supports block sizes up to 16MB. However, very large blocks
+ * can have an impact on i/o latency (e.g. tying up a spinning disk for
+ * ~300ms), and also potentially on the memory allocator. Therefore,
+ * we do not allow the recordsize to be set larger than zfs_max_recordsize
+ * (default 1MB). Larger blocks can be created by changing this tunable,
+ * and pools with larger blocks can always be imported and used, regardless
+ * of this setting.
+ */
+int zfs_max_recordsize = 1 * 1024 * 1024;
+int zfs_allow_redacted_dataset_mount = 0;
+
+#define SWITCH64(x, y) \
+ { \
+ uint64_t __tmp = (x); \
+ (x) = (y); \
+ (y) = __tmp; \
+ }
+
+#define DS_REF_MAX (1ULL << 62)
+
+extern inline dsl_dataset_phys_t *dsl_dataset_phys(dsl_dataset_t *ds);
+
+static void dsl_dataset_set_remap_deadlist_object(dsl_dataset_t *ds,
+ uint64_t obj, dmu_tx_t *tx);
+static void dsl_dataset_unset_remap_deadlist_object(dsl_dataset_t *ds,
+ dmu_tx_t *tx);
+
+static void unload_zfeature(dsl_dataset_t *ds, spa_feature_t f);
+
+extern int spa_asize_inflation;
+
+static zil_header_t zero_zil;
+
+/*
+ * Figure out how much of this delta should be propagated to the dsl_dir
+ * layer. If there's a refreservation, that space has already been
+ * partially accounted for in our ancestors.
+ */
+static int64_t
+parent_delta(dsl_dataset_t *ds, int64_t delta)
+{
+ dsl_dataset_phys_t *ds_phys;
+ uint64_t old_bytes, new_bytes;
+
+ if (ds->ds_reserved == 0)
+ return (delta);
+
+ ds_phys = dsl_dataset_phys(ds);
+ old_bytes = MAX(ds_phys->ds_unique_bytes, ds->ds_reserved);
+ new_bytes = MAX(ds_phys->ds_unique_bytes + delta, ds->ds_reserved);
+
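+ /*
+ * For example, with ds_reserved = 10M: going from 4M to 6M of unique
+ * space leaves both old_bytes and new_bytes clamped to the 10M
+ * reservation, so nothing is propagated; going from 9M to 11M
+ * propagates only the 1M that exceeds the refreservation.
+ */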
+ ASSERT3U(ABS((int64_t)(new_bytes - old_bytes)), <=, ABS(delta));
+ return (new_bytes - old_bytes);
+}
+
+void
+dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx)
+{
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+ int used = bp_get_dsize_sync(spa, bp);
+ int compressed = BP_GET_PSIZE(bp);
+ int uncompressed = BP_GET_UCSIZE(bp);
+ int64_t delta;
+ spa_feature_t f;
+
+ dprintf_bp(bp, "ds=%p", ds);
+
+ ASSERT(dmu_tx_is_syncing(tx));
+ /* It could have been compressed away to nothing */
+ if (BP_IS_HOLE(bp) || BP_IS_REDACTED(bp))
+ return;
+ ASSERT(BP_GET_TYPE(bp) != DMU_OT_NONE);
+ ASSERT(DMU_OT_IS_VALID(BP_GET_TYPE(bp)));
+ if (ds == NULL) {
+ dsl_pool_mos_diduse_space(tx->tx_pool,
+ used, compressed, uncompressed);
+ return;
+ }
+
+ ASSERT3U(bp->blk_birth, >, dsl_dataset_phys(ds)->ds_prev_snap_txg);
+ dmu_buf_will_dirty(ds->ds_dbuf, tx);
+ mutex_enter(&ds->ds_lock);
+ delta = parent_delta(ds, used);
+ dsl_dataset_phys(ds)->ds_referenced_bytes += used;
+ dsl_dataset_phys(ds)->ds_compressed_bytes += compressed;
+ dsl_dataset_phys(ds)->ds_uncompressed_bytes += uncompressed;
+ dsl_dataset_phys(ds)->ds_unique_bytes += used;
+
+ if (BP_GET_LSIZE(bp) > SPA_OLD_MAXBLOCKSIZE) {
+ ds->ds_feature_activation[SPA_FEATURE_LARGE_BLOCKS] =
+ (void *)B_TRUE;
+ }
+
+ f = zio_checksum_to_feature(BP_GET_CHECKSUM(bp));
+ if (f != SPA_FEATURE_NONE) {
+ ASSERT3S(spa_feature_table[f].fi_type, ==,
+ ZFEATURE_TYPE_BOOLEAN);
+ ds->ds_feature_activation[f] = (void *)B_TRUE;
+ }
+
+ f = zio_compress_to_feature(BP_GET_COMPRESS(bp));
+ if (f != SPA_FEATURE_NONE) {
+ ASSERT3S(spa_feature_table[f].fi_type, ==,
+ ZFEATURE_TYPE_BOOLEAN);
+ ds->ds_feature_activation[f] = (void *)B_TRUE;
+ }
+
+ /*
+ * Track block for livelist, but ignore embedded blocks because
+ * they do not need to be freed.
+ */
+ if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist) &&
+ bp->blk_birth > ds->ds_dir->dd_origin_txg &&
+ !(BP_IS_EMBEDDED(bp))) {
+ ASSERT(dsl_dir_is_clone(ds->ds_dir));
+ ASSERT(spa_feature_is_enabled(spa,
+ SPA_FEATURE_LIVELIST));
+ bplist_append(&ds->ds_dir->dd_pending_allocs, bp);
+ }
+
+ mutex_exit(&ds->ds_lock);
+ dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, delta,
+ compressed, uncompressed, tx);
+ dsl_dir_transfer_space(ds->ds_dir, used - delta,
+ DD_USED_REFRSRV, DD_USED_HEAD, tx);
+}
+
+/*
+ * Called when the specified segment has been remapped, and is thus no
+ * longer referenced in the head dataset. The vdev must be indirect.
+ *
+ * If the segment is referenced by a snapshot, put it on the remap deadlist.
+ * Otherwise, add this segment to the obsolete spacemap.
+ */
+void
+dsl_dataset_block_remapped(dsl_dataset_t *ds, uint64_t vdev, uint64_t offset,
+ uint64_t size, uint64_t birth, dmu_tx_t *tx)
+{
+ spa_t *spa = ds->ds_dir->dd_pool->dp_spa;
+
+ ASSERT(dmu_tx_is_syncing(tx));
+ ASSERT(birth <= tx->tx_txg);
+ ASSERT(!ds->ds_is_snapshot);
+
+ if (birth > dsl_dataset_phys(ds)->ds_prev_snap_txg) {
+ spa_vdev_indirect_mark_obsolete(spa, vdev, offset, size, tx);
+ } else {
+ blkptr_t fakebp;
+ dva_t *dva = &fakebp.blk_dva[0];
+
+ ASSERT(ds != NULL);
+
+ mutex_enter(&ds->ds_remap_deadlist_lock);
+ if (!dsl_dataset_remap_deadlist_exists(ds)) {
+ dsl_dataset_create_remap_deadlist(ds, tx);
+ }
+ mutex_exit(&ds->ds_remap_deadlist_lock);
+
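+ /*
+ * The deadlist entry only needs the freed segment's vdev, offset,
+ * size, and birth txg, so construct a synthetic block pointer with a
+ * single DVA carrying that information and leave the other fields
+ * zeroed.
+ */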
+ BP_ZERO(&fakebp);
+ fakebp.blk_birth = birth;
+ DVA_SET_VDEV(dva, vdev);
+ DVA_SET_OFFSET(dva, offset);
+ DVA_SET_ASIZE(dva, size);
+ dsl_deadlist_insert(&ds->ds_remap_deadlist, &fakebp, B_FALSE,
+ tx);
+ }
+}
+
+int
+dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx,
+ boolean_t async)
+{
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+
+ int used = bp_get_dsize_sync(spa, bp);
+ int compressed = BP_GET_PSIZE(bp);
+ int uncompressed = BP_GET_UCSIZE(bp);
+
+ if (BP_IS_HOLE(bp) || BP_IS_REDACTED(bp))
+ return (0);
+
+ ASSERT(dmu_tx_is_syncing(tx));
+ ASSERT(bp->blk_birth <= tx->tx_txg);
+
+ if (ds == NULL) {
+ dsl_free(tx->tx_pool, tx->tx_txg, bp);
+ dsl_pool_mos_diduse_space(tx->tx_pool,
+ -used, -compressed, -uncompressed);
+ return (used);
+ }
+ ASSERT3P(tx->tx_pool, ==, ds->ds_dir->dd_pool);
+
+ ASSERT(!ds->ds_is_snapshot);
+ dmu_buf_will_dirty(ds->ds_dbuf, tx);
+
+ /*
+ * Track block for livelist, but ignore embedded blocks because
+ * they do not need to be freed.
+ */
+ if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist) &&
+ bp->blk_birth > ds->ds_dir->dd_origin_txg &&
+ !(BP_IS_EMBEDDED(bp))) {
+ ASSERT(dsl_dir_is_clone(ds->ds_dir));
+ ASSERT(spa_feature_is_enabled(spa,
+ SPA_FEATURE_LIVELIST));
+ bplist_append(&ds->ds_dir->dd_pending_frees, bp);
+ }
+
+ if (bp->blk_birth > dsl_dataset_phys(ds)->ds_prev_snap_txg) {
+ int64_t delta;
+
+ dprintf_bp(bp, "freeing ds=%llu", ds->ds_object);
+ dsl_free(tx->tx_pool, tx->tx_txg, bp);
+
+ mutex_enter(&ds->ds_lock);
+ ASSERT(dsl_dataset_phys(ds)->ds_unique_bytes >= used ||
+ !DS_UNIQUE_IS_ACCURATE(ds));
+ delta = parent_delta(ds, -used);
+ dsl_dataset_phys(ds)->ds_unique_bytes -= used;
+ mutex_exit(&ds->ds_lock);
+ dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD,
+ delta, -compressed, -uncompressed, tx);
+ dsl_dir_transfer_space(ds->ds_dir, -used - delta,
+ DD_USED_REFRSRV, DD_USED_HEAD, tx);
+ } else {
+ dprintf_bp(bp, "putting on dead list: %s", "");
+ if (async) {
+ /*
+ * We are here as part of zio's write done callback,
+ * which means we're a zio interrupt thread. We can't
+ * call dsl_deadlist_insert() now because it may block
+ * waiting for I/O. Instead, put bp on the deferred
+ * queue and let dsl_pool_sync() finish the job.
+ */
+ bplist_append(&ds->ds_pending_deadlist, bp);
+ } else {
+ dsl_deadlist_insert(&ds->ds_deadlist, bp, B_FALSE, tx);
+ }
+ ASSERT3U(ds->ds_prev->ds_object, ==,
+ dsl_dataset_phys(ds)->ds_prev_snap_obj);
+ ASSERT(dsl_dataset_phys(ds->ds_prev)->ds_num_children > 0);
+ /* if (bp->blk_birth > prev prev snap txg) prev unique += bs */
+ if (dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj ==
+ ds->ds_object && bp->blk_birth >
+ dsl_dataset_phys(ds->ds_prev)->ds_prev_snap_txg) {
+ dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
+ mutex_enter(&ds->ds_prev->ds_lock);
+ dsl_dataset_phys(ds->ds_prev)->ds_unique_bytes += used;
+ mutex_exit(&ds->ds_prev->ds_lock);
+ }
+ if (bp->blk_birth > ds->ds_dir->dd_origin_txg) {
+ dsl_dir_transfer_space(ds->ds_dir, used,
+ DD_USED_HEAD, DD_USED_SNAP, tx);
+ }
+ }
+
+ dsl_bookmark_block_killed(ds, bp, tx);
+
+ mutex_enter(&ds->ds_lock);
+ ASSERT3U(dsl_dataset_phys(ds)->ds_referenced_bytes, >=, used);
+ dsl_dataset_phys(ds)->ds_referenced_bytes -= used;
+ ASSERT3U(dsl_dataset_phys(ds)->ds_compressed_bytes, >=, compressed);
+ dsl_dataset_phys(ds)->ds_compressed_bytes -= compressed;
+ ASSERT3U(dsl_dataset_phys(ds)->ds_uncompressed_bytes, >=, uncompressed);
+ dsl_dataset_phys(ds)->ds_uncompressed_bytes -= uncompressed;
+ mutex_exit(&ds->ds_lock);
+
+ return (used);
+}
+
+struct feature_type_uint64_array_arg {
+ uint64_t length;
+ uint64_t *array;
+};
+
+static void
+unload_zfeature(dsl_dataset_t *ds, spa_feature_t f)
+{
+ switch (spa_feature_table[f].fi_type) {
+ case ZFEATURE_TYPE_BOOLEAN:
+ break;
+ case ZFEATURE_TYPE_UINT64_ARRAY:
+ {
+ struct feature_type_uint64_array_arg *ftuaa = ds->ds_feature[f];
+ kmem_free(ftuaa->array, ftuaa->length * sizeof (uint64_t));
+ kmem_free(ftuaa, sizeof (*ftuaa));
+ break;
+ }
+ default:
+ panic("Invalid zfeature type %d", spa_feature_table[f].fi_type);
+ }
+}
+
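+/*
+ * Per-dataset feature state is stored as ZAP entries on the (zapified)
+ * dataset object, keyed by the feature's guid. For ZFEATURE_TYPE_BOOLEAN
+ * features the presence of the entry alone marks the feature active; for
+ * ZFEATURE_TYPE_UINT64_ARRAY features the entry holds the array itself,
+ * which is cached in ds_feature[] as a feature_type_uint64_array_arg.
+ */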
+static int
+load_zfeature(objset_t *mos, dsl_dataset_t *ds, spa_feature_t f)
+{
+ int err = 0;
+ switch (spa_feature_table[f].fi_type) {
+ case ZFEATURE_TYPE_BOOLEAN:
+ err = zap_contains(mos, ds->ds_object,
+ spa_feature_table[f].fi_guid);
+ if (err == 0) {
+ ds->ds_feature[f] = (void *)B_TRUE;
+ } else {
+ ASSERT3U(err, ==, ENOENT);
+ err = 0;
+ }
+ break;
+ case ZFEATURE_TYPE_UINT64_ARRAY:
+ {
+ uint64_t int_size, num_int;
+ uint64_t *data;
+ err = zap_length(mos, ds->ds_object,
+ spa_feature_table[f].fi_guid, &int_size, &num_int);
+ if (err != 0) {
+ ASSERT3U(err, ==, ENOENT);
+ err = 0;
+ break;
+ }
+ ASSERT3U(int_size, ==, sizeof (uint64_t));
+ data = kmem_alloc(int_size * num_int, KM_SLEEP);
+ VERIFY0(zap_lookup(mos, ds->ds_object,
+ spa_feature_table[f].fi_guid, int_size, num_int, data));
+ struct feature_type_uint64_array_arg *ftuaa =
+ kmem_alloc(sizeof (*ftuaa), KM_SLEEP);
+ ftuaa->length = num_int;
+ ftuaa->array = data;
+ ds->ds_feature[f] = ftuaa;
+ break;
+ }
+ default:
+ panic("Invalid zfeature type %d", spa_feature_table[f].fi_type);
+ }
+ return (err);
+}
+
+/*
+ * We have to release the fsid synchronously or we risk that a subsequent
+ * mount of the same dataset will fail to unique_insert the fsid. This
+ * failure would manifest itself as the fsid of this dataset changing
+ * between mounts, which makes NFS clients quite unhappy.
+ */
+static void
+dsl_dataset_evict_sync(void *dbu)
+{
+ dsl_dataset_t *ds = dbu;
+
+ ASSERT(ds->ds_owner == NULL);
+
+ unique_remove(ds->ds_fsid_guid);
+}
+
+static void
+dsl_dataset_evict_async(void *dbu)
+{
+ dsl_dataset_t *ds = dbu;
+
+ ASSERT(ds->ds_owner == NULL);
+
+ ds->ds_dbuf = NULL;
+
+ if (ds->ds_objset != NULL)
+ dmu_objset_evict(ds->ds_objset);
+
+ if (ds->ds_prev) {
+ dsl_dataset_rele(ds->ds_prev, ds);
+ ds->ds_prev = NULL;
+ }
+
+ dsl_bookmark_fini_ds(ds);
+
+ bplist_destroy(&ds->ds_pending_deadlist);
+ if (dsl_deadlist_is_open(&ds->ds_deadlist))
+ dsl_deadlist_close(&ds->ds_deadlist);
+ if (dsl_deadlist_is_open(&ds->ds_remap_deadlist))
+ dsl_deadlist_close(&ds->ds_remap_deadlist);
+ if (ds->ds_dir)
+ dsl_dir_async_rele(ds->ds_dir, ds);
+
+ ASSERT(!list_link_active(&ds->ds_synced_link));
+
+ for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
+ if (dsl_dataset_feature_is_active(ds, f))
+ unload_zfeature(ds, f);
+ }
+
+ list_destroy(&ds->ds_prop_cbs);
+ mutex_destroy(&ds->ds_lock);
+ mutex_destroy(&ds->ds_opening_lock);
+ mutex_destroy(&ds->ds_sendstream_lock);
+ mutex_destroy(&ds->ds_remap_deadlist_lock);
+ zfs_refcount_destroy(&ds->ds_longholds);
+ rrw_destroy(&ds->ds_bp_rwlock);
+
+ kmem_free(ds, sizeof (dsl_dataset_t));
+}
+
+int
+dsl_dataset_get_snapname(dsl_dataset_t *ds)
+{
+ dsl_dataset_phys_t *headphys;
+ int err;
+ dmu_buf_t *headdbuf;
+ dsl_pool_t *dp = ds->ds_dir->dd_pool;
+ objset_t *mos = dp->dp_meta_objset;
+
+ if (ds->ds_snapname[0])
+ return (0);
+ if (dsl_dataset_phys(ds)->ds_next_snap_obj == 0)
+ return (0);
+
+ err = dmu_bonus_hold(mos, dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj,
+ FTAG, &headdbuf);
+ if (err != 0)
+ return (err);
+ headphys = headdbuf->db_data;
+ err = zap_value_search(dp->dp_meta_objset,
+ headphys->ds_snapnames_zapobj, ds->ds_object, 0, ds->ds_snapname);
+ if (err != 0 && zfs_recover == B_TRUE) {
+ (void) snprintf(ds->ds_snapname, sizeof (ds->ds_snapname),
+ "SNAPOBJ=%llu-ERR=%d",
+ (unsigned long long)ds->ds_object, err);
+ err = 0;
+ }
+ dmu_buf_rele(headdbuf, FTAG);
+ return (err);
+}
+
+int
+dsl_dataset_snap_lookup(dsl_dataset_t *ds, const char *name, uint64_t *value)
+{
+ objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
+ uint64_t snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj;
+ matchtype_t mt = 0;
+ int err;
+
+ if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET)
+ mt = MT_NORMALIZE;
+
+ err = zap_lookup_norm(mos, snapobj, name, 8, 1,
+ value, mt, NULL, 0, NULL);
+ if (err == ENOTSUP && (mt & MT_NORMALIZE))
+ err = zap_lookup(mos, snapobj, name, 8, 1, value);
+ return (err);
+}
+
+int
+dsl_dataset_snap_remove(dsl_dataset_t *ds, const char *name, dmu_tx_t *tx,
+ boolean_t adj_cnt)
+{
+ objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
+ uint64_t snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj;
+ matchtype_t mt = 0;
+ int err;
+
+ dsl_dir_snap_cmtime_update(ds->ds_dir);
+
+ if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET)
+ mt = MT_NORMALIZE;
+
+ err = zap_remove_norm(mos, snapobj, name, mt, tx);
+ if (err == ENOTSUP && (mt & MT_NORMALIZE))
+ err = zap_remove(mos, snapobj, name, tx);
+
+ if (err == 0 && adj_cnt)
+ dsl_fs_ss_count_adjust(ds->ds_dir, -1,
+ DD_FIELD_SNAPSHOT_COUNT, tx);
+
+ return (err);
+}
+
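+/*
+ * Try to take a reference on the dataset without blocking: grab a hold on
+ * its bonus dbuf and then check that the dbuf's user is still this
+ * dsl_dataset_t, so a dataset that is concurrently being evicted is not
+ * handed back to the caller.
+ */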
+boolean_t
+dsl_dataset_try_add_ref(dsl_pool_t *dp, dsl_dataset_t *ds, void *tag)
+{
+ dmu_buf_t *dbuf = ds->ds_dbuf;
+ boolean_t result = B_FALSE;
+
+ if (dbuf != NULL && dmu_buf_try_add_ref(dbuf, dp->dp_meta_objset,
+ ds->ds_object, DMU_BONUS_BLKID, tag)) {
+
+ if (ds == dmu_buf_get_user(dbuf))
+ result = B_TRUE;
+ else
+ dmu_buf_rele(dbuf, tag);
+ }
+
+ return (result);
+}
+
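+/*
+ * Hold a dataset by object number. The pool configuration lock must be
+ * held, and each successful hold must be released with dsl_dataset_rele()
+ * using the same tag. A minimal, illustrative caller (dp and dsobj are
+ * assumed to be in scope):
+ *
+ *     dsl_dataset_t *ds;
+ *     int err = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds);
+ *     if (err == 0) {
+ *             ... use ds ...
+ *             dsl_dataset_rele(ds, FTAG);
+ *     }
+ */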
+int
+dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag,
+ dsl_dataset_t **dsp)
+{
+ objset_t *mos = dp->dp_meta_objset;
+ dmu_buf_t *dbuf;
+ dsl_dataset_t *ds;
+ int err;
+ dmu_object_info_t doi;
+
+ ASSERT(dsl_pool_config_held(dp));
+
+ err = dmu_bonus_hold(mos, dsobj, tag, &dbuf);
+ if (err != 0)
+ return (err);
+
+ /* Make sure dsobj has the correct object type. */
+ dmu_object_info_from_db(dbuf, &doi);
+ if (doi.doi_bonus_type != DMU_OT_DSL_DATASET) {
+ dmu_buf_rele(dbuf, tag);
+ return (SET_ERROR(EINVAL));
+ }
+
+ ds = dmu_buf_get_user(dbuf);
+ if (ds == NULL) {
+ dsl_dataset_t *winner = NULL;
+
+ ds = kmem_zalloc(sizeof (dsl_dataset_t), KM_SLEEP);
+ ds->ds_dbuf = dbuf;
+ ds->ds_object = dsobj;
+ ds->ds_is_snapshot = dsl_dataset_phys(ds)->ds_num_children != 0;
+ list_link_init(&ds->ds_synced_link);
+
+ err = dsl_dir_hold_obj(dp, dsl_dataset_phys(ds)->ds_dir_obj,
+ NULL, ds, &ds->ds_dir);
+ if (err != 0) {
+ kmem_free(ds, sizeof (dsl_dataset_t));
+ dmu_buf_rele(dbuf, tag);
+ return (err);
+ }
+
+ mutex_init(&ds->ds_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&ds->ds_opening_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&ds->ds_sendstream_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&ds->ds_remap_deadlist_lock,
+ NULL, MUTEX_DEFAULT, NULL);
+ rrw_init(&ds->ds_bp_rwlock, B_FALSE);
+ zfs_refcount_create(&ds->ds_longholds);
+
+ bplist_create(&ds->ds_pending_deadlist);
+
+ list_create(&ds->ds_sendstreams, sizeof (dmu_sendstatus_t),
+ offsetof(dmu_sendstatus_t, dss_link));
+
+ list_create(&ds->ds_prop_cbs, sizeof (dsl_prop_cb_record_t),
+ offsetof(dsl_prop_cb_record_t, cbr_ds_node));
+
+ if (doi.doi_type == DMU_OTN_ZAP_METADATA) {
+ spa_feature_t f;
+
+ for (f = 0; f < SPA_FEATURES; f++) {
+ if (!(spa_feature_table[f].fi_flags &
+ ZFEATURE_FLAG_PER_DATASET))
+ continue;
+ err = load_zfeature(mos, ds, f);
+ }
+ }
+
+ if (!ds->ds_is_snapshot) {
+ ds->ds_snapname[0] = '\0';
+ if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
+ err = dsl_dataset_hold_obj(dp,
+ dsl_dataset_phys(ds)->ds_prev_snap_obj,
+ ds, &ds->ds_prev);
+ }
+ err = dsl_bookmark_init_ds(ds);
+ } else {
+ if (zfs_flags & ZFS_DEBUG_SNAPNAMES)
+ err = dsl_dataset_get_snapname(ds);
+ if (err == 0 &&
+ dsl_dataset_phys(ds)->ds_userrefs_obj != 0) {
+ err = zap_count(
+ ds->ds_dir->dd_pool->dp_meta_objset,
+ dsl_dataset_phys(ds)->ds_userrefs_obj,
+ &ds->ds_userrefs);
+ }
+ }
+
+ if (err == 0 && !ds->ds_is_snapshot) {
+ err = dsl_prop_get_int_ds(ds,
+ zfs_prop_to_name(ZFS_PROP_REFRESERVATION),
+ &ds->ds_reserved);
+ if (err == 0) {
+ err = dsl_prop_get_int_ds(ds,
+ zfs_prop_to_name(ZFS_PROP_REFQUOTA),
+ &ds->ds_quota);
+ }
+ } else {
+ ds->ds_reserved = ds->ds_quota = 0;
+ }
+
+ if (err == 0 && ds->ds_dir->dd_crypto_obj != 0 &&
+ ds->ds_is_snapshot &&
+ zap_contains(mos, dsobj, DS_FIELD_IVSET_GUID) != 0) {
+ dp->dp_spa->spa_errata =
+ ZPOOL_ERRATA_ZOL_8308_ENCRYPTION;
+ }
+
+ dsl_deadlist_open(&ds->ds_deadlist,
+ mos, dsl_dataset_phys(ds)->ds_deadlist_obj);
+ uint64_t remap_deadlist_obj =
+ dsl_dataset_get_remap_deadlist_object(ds);
+ if (remap_deadlist_obj != 0) {
+ dsl_deadlist_open(&ds->ds_remap_deadlist, mos,
+ remap_deadlist_obj);
+ }
+
+ dmu_buf_init_user(&ds->ds_dbu, dsl_dataset_evict_sync,
+ dsl_dataset_evict_async, &ds->ds_dbuf);
+ if (err == 0)
+ winner = dmu_buf_set_user_ie(dbuf, &ds->ds_dbu);
+
+ if (err != 0 || winner != NULL) {
+ bplist_destroy(&ds->ds_pending_deadlist);
+ dsl_deadlist_close(&ds->ds_deadlist);
+ if (dsl_deadlist_is_open(&ds->ds_remap_deadlist))
+ dsl_deadlist_close(&ds->ds_remap_deadlist);
+ dsl_bookmark_fini_ds(ds);
+ if (ds->ds_prev)
+ dsl_dataset_rele(ds->ds_prev, ds);
+ dsl_dir_rele(ds->ds_dir, ds);
+ for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
+ if (dsl_dataset_feature_is_active(ds, f))
+ unload_zfeature(ds, f);
+ }
+
+ list_destroy(&ds->ds_prop_cbs);
+ list_destroy(&ds->ds_sendstreams);
+ mutex_destroy(&ds->ds_lock);
+ mutex_destroy(&ds->ds_opening_lock);
+ mutex_destroy(&ds->ds_sendstream_lock);
+ mutex_destroy(&ds->ds_remap_deadlist_lock);
+ zfs_refcount_destroy(&ds->ds_longholds);
+ rrw_destroy(&ds->ds_bp_rwlock);
+ kmem_free(ds, sizeof (dsl_dataset_t));
+ if (err != 0) {
+ dmu_buf_rele(dbuf, tag);
+ return (err);
+ }
+ ds = winner;
+ } else {
+ ds->ds_fsid_guid =
+ unique_insert(dsl_dataset_phys(ds)->ds_fsid_guid);
+ if (ds->ds_fsid_guid !=
+ dsl_dataset_phys(ds)->ds_fsid_guid) {
+ zfs_dbgmsg("ds_fsid_guid changed from "
+ "%llx to %llx for pool %s dataset id %llu",
+ (u_longlong_t)
+ dsl_dataset_phys(ds)->ds_fsid_guid,
+ (u_longlong_t)ds->ds_fsid_guid,
+ spa_name(dp->dp_spa),
+ (u_longlong_t)dsobj);
+ }
+ }
+ }
+
+ ASSERT3P(ds->ds_dbuf, ==, dbuf);
+ ASSERT3P(dsl_dataset_phys(ds), ==, dbuf->db_data);
+ ASSERT(dsl_dataset_phys(ds)->ds_prev_snap_obj != 0 ||
+ spa_version(dp->dp_spa) < SPA_VERSION_ORIGIN ||
+ dp->dp_origin_snap == NULL || ds == dp->dp_origin_snap);
+ *dsp = ds;
+
+ return (0);
+}
+
+int
+dsl_dataset_create_key_mapping(dsl_dataset_t *ds)
+{
+ dsl_dir_t *dd = ds->ds_dir;
+
+ if (dd->dd_crypto_obj == 0)
+ return (0);
+
+ return (spa_keystore_create_mapping(dd->dd_pool->dp_spa,
+ ds, ds, &ds->ds_key_mapping));
+}
+
+int
+dsl_dataset_hold_obj_flags(dsl_pool_t *dp, uint64_t dsobj,
+ ds_hold_flags_t flags, void *tag, dsl_dataset_t **dsp)
+{
+ int err;
+
+ err = dsl_dataset_hold_obj(dp, dsobj, tag, dsp);
+ if (err != 0)
+ return (err);
+
+ ASSERT3P(*dsp, !=, NULL);
+
+ if (flags & DS_HOLD_FLAG_DECRYPT) {
+ err = dsl_dataset_create_key_mapping(*dsp);
+ if (err != 0)
+ dsl_dataset_rele(*dsp, tag);
+ }
+
+ return (err);
+}
+
+int
+dsl_dataset_hold_flags(dsl_pool_t *dp, const char *name, ds_hold_flags_t flags,
+ void *tag, dsl_dataset_t **dsp)
+{
+ dsl_dir_t *dd;
+ const char *snapname;
+ uint64_t obj;
+ int err = 0;
+ dsl_dataset_t *ds;
+
+ err = dsl_dir_hold(dp, name, FTAG, &dd, &snapname);
+ if (err != 0)
+ return (err);
+
+ ASSERT(dsl_pool_config_held(dp));
+ obj = dsl_dir_phys(dd)->dd_head_dataset_obj;
+ if (obj != 0)
+ err = dsl_dataset_hold_obj_flags(dp, obj, flags, tag, &ds);
+ else
+ err = SET_ERROR(ENOENT);
+
+ /* we may be looking for a snapshot */
+ if (err == 0 && snapname != NULL) {
+ dsl_dataset_t *snap_ds;
+
+ if (*snapname++ != '@') {
+ dsl_dataset_rele_flags(ds, flags, tag);
+ dsl_dir_rele(dd, FTAG);
+ return (SET_ERROR(ENOENT));
+ }
+
+ dprintf("looking for snapshot '%s'\n", snapname);
+ err = dsl_dataset_snap_lookup(ds, snapname, &obj);
+ if (err == 0) {
+ err = dsl_dataset_hold_obj_flags(dp, obj, flags, tag,
+ &snap_ds);
+ }
+ dsl_dataset_rele_flags(ds, flags, tag);
+
+ if (err == 0) {
+ mutex_enter(&snap_ds->ds_lock);
+ if (snap_ds->ds_snapname[0] == 0)
+ (void) strlcpy(snap_ds->ds_snapname, snapname,
+ sizeof (snap_ds->ds_snapname));
+ mutex_exit(&snap_ds->ds_lock);
+ ds = snap_ds;
+ }
+ }
+ if (err == 0)
+ *dsp = ds;
+ dsl_dir_rele(dd, FTAG);
+ return (err);
+}
+
+int
+dsl_dataset_hold(dsl_pool_t *dp, const char *name, void *tag,
+ dsl_dataset_t **dsp)
+{
+ return (dsl_dataset_hold_flags(dp, name, 0, tag, dsp));
+}
+
+static int
+dsl_dataset_own_obj_impl(dsl_pool_t *dp, uint64_t dsobj, ds_hold_flags_t flags,
+ void *tag, boolean_t override, dsl_dataset_t **dsp)
+{
+ int err = dsl_dataset_hold_obj_flags(dp, dsobj, flags, tag, dsp);
+ if (err != 0)
+ return (err);
+ if (!dsl_dataset_tryown(*dsp, tag, override)) {
+ dsl_dataset_rele_flags(*dsp, flags, tag);
+ *dsp = NULL;
+ return (SET_ERROR(EBUSY));
+ }
+ return (0);
+}
+
+
+int
+dsl_dataset_own_obj(dsl_pool_t *dp, uint64_t dsobj, ds_hold_flags_t flags,
+ void *tag, dsl_dataset_t **dsp)
+{
+ return (dsl_dataset_own_obj_impl(dp, dsobj, flags, tag, B_FALSE, dsp));
+}
+
+int
+dsl_dataset_own_obj_force(dsl_pool_t *dp, uint64_t dsobj,
+ ds_hold_flags_t flags, void *tag, dsl_dataset_t **dsp)
+{
+ return (dsl_dataset_own_obj_impl(dp, dsobj, flags, tag, B_TRUE, dsp));
+}
+
+static int
+dsl_dataset_own_impl(dsl_pool_t *dp, const char *name, ds_hold_flags_t flags,
+ void *tag, boolean_t override, dsl_dataset_t **dsp)
+{
+ int err = dsl_dataset_hold_flags(dp, name, flags, tag, dsp);
+ if (err != 0)
+ return (err);
+ if (!dsl_dataset_tryown(*dsp, tag, override)) {
+ dsl_dataset_rele_flags(*dsp, flags, tag);
+ return (SET_ERROR(EBUSY));
+ }
+ return (0);
+}
+
+int
+dsl_dataset_own_force(dsl_pool_t *dp, const char *name, ds_hold_flags_t flags,
+ void *tag, dsl_dataset_t **dsp)
+{
+ return (dsl_dataset_own_impl(dp, name, flags, tag, B_TRUE, dsp));
+}
+
+int
+dsl_dataset_own(dsl_pool_t *dp, const char *name, ds_hold_flags_t flags,
+ void *tag, dsl_dataset_t **dsp)
+{
+ return (dsl_dataset_own_impl(dp, name, flags, tag, B_FALSE, dsp));
+}
+
+/*
+ * See the comment above dsl_pool_hold() for details. In summary, a long
+ * hold is used to prevent destruction of a dataset while the pool hold
+ * is dropped, allowing other concurrent operations (e.g. spa_sync()).
+ *
+ * The dataset and pool must be held when this function is called. After it
+ * is called, the pool hold may be released while the dataset is still held
+ * and accessed.
+ */
+void
+dsl_dataset_long_hold(dsl_dataset_t *ds, void *tag)
+{
+ ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool));
+ (void) zfs_refcount_add(&ds->ds_longholds, tag);
+}
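+
+/*
+ * An illustrative long-hold sequence (declarations and error handling
+ * omitted): take the pool hold, hold the dataset, take the long hold,
+ * then drop the pool hold while the dataset remains in use:
+ *
+ *     dsl_pool_hold(name, FTAG, &dp);
+ *     dsl_dataset_hold(dp, name, FTAG, &ds);
+ *     dsl_dataset_long_hold(ds, FTAG);
+ *     dsl_pool_rele(dp, FTAG);
+ *     ... long-running work on ds ...
+ *     dsl_dataset_long_rele(ds, FTAG);
+ *     dsl_dataset_rele(ds, FTAG);
+ */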
+
+void
+dsl_dataset_long_rele(dsl_dataset_t *ds, void *tag)
+{
+ (void) zfs_refcount_remove(&ds->ds_longholds, tag);
+}
+
+/* Return B_TRUE if there are any long holds on this dataset. */
+boolean_t
+dsl_dataset_long_held(dsl_dataset_t *ds)
+{
+ return (!zfs_refcount_is_zero(&ds->ds_longholds));
+}
+
+void
+dsl_dataset_name(dsl_dataset_t *ds, char *name)
+{
+ if (ds == NULL) {
+ (void) strlcpy(name, "mos", ZFS_MAX_DATASET_NAME_LEN);
+ } else {
+ dsl_dir_name(ds->ds_dir, name);
+ VERIFY0(dsl_dataset_get_snapname(ds));
+ if (ds->ds_snapname[0]) {
+ VERIFY3U(strlcat(name, "@", ZFS_MAX_DATASET_NAME_LEN),
+ <, ZFS_MAX_DATASET_NAME_LEN);
+ /*
+ * We use a "recursive" mutex so that we
+ * can call dprintf_ds() with ds_lock held.
+ */
+ if (!MUTEX_HELD(&ds->ds_lock)) {
+ mutex_enter(&ds->ds_lock);
+ VERIFY3U(strlcat(name, ds->ds_snapname,
+ ZFS_MAX_DATASET_NAME_LEN), <,
+ ZFS_MAX_DATASET_NAME_LEN);
+ mutex_exit(&ds->ds_lock);
+ } else {
+ VERIFY3U(strlcat(name, ds->ds_snapname,
+ ZFS_MAX_DATASET_NAME_LEN), <,
+ ZFS_MAX_DATASET_NAME_LEN);
+ }
+ }
+ }
+}
+
+int
+dsl_dataset_namelen(dsl_dataset_t *ds)
+{
+ VERIFY0(dsl_dataset_get_snapname(ds));
+ mutex_enter(&ds->ds_lock);
+ int len = strlen(ds->ds_snapname);
+ mutex_exit(&ds->ds_lock);
+ /* add '@' if ds is a snap */
+ if (len > 0)
+ len++;
+ len += dsl_dir_namelen(ds->ds_dir);
+ return (len);
+}
+
+void
+dsl_dataset_rele(dsl_dataset_t *ds, void *tag)
+{
+ dmu_buf_rele(ds->ds_dbuf, tag);
+}
+
+void
+dsl_dataset_remove_key_mapping(dsl_dataset_t *ds)
+{
+ dsl_dir_t *dd = ds->ds_dir;
+
+ if (dd == NULL || dd->dd_crypto_obj == 0)
+ return;
+
+ (void) spa_keystore_remove_mapping(dd->dd_pool->dp_spa,
+ ds->ds_object, ds);
+}
+
+void
+dsl_dataset_rele_flags(dsl_dataset_t *ds, ds_hold_flags_t flags, void *tag)
+{
+ if (flags & DS_HOLD_FLAG_DECRYPT)
+ dsl_dataset_remove_key_mapping(ds);
+
+ dsl_dataset_rele(ds, tag);
+}
+
+void
+dsl_dataset_disown(dsl_dataset_t *ds, ds_hold_flags_t flags, void *tag)
+{
+ ASSERT3P(ds->ds_owner, ==, tag);
+ ASSERT(ds->ds_dbuf != NULL);
+
+ mutex_enter(&ds->ds_lock);
+ ds->ds_owner = NULL;
+ mutex_exit(&ds->ds_lock);
+ dsl_dataset_long_rele(ds, tag);
+ dsl_dataset_rele_flags(ds, flags, tag);
+}
+
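+/*
+ * Attempt to become the dataset's owner. On success a long hold is taken
+ * on behalf of the owner, so a successful dsl_dataset_tryown() must
+ * eventually be balanced by dsl_dataset_disown() with the same tag.
+ */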
+boolean_t
+dsl_dataset_tryown(dsl_dataset_t *ds, void *tag, boolean_t override)
+{
+ boolean_t gotit = FALSE;
+
+ ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool));
+ mutex_enter(&ds->ds_lock);
+ if (ds->ds_owner == NULL && (override || !(DS_IS_INCONSISTENT(ds) ||
+ (dsl_dataset_feature_is_active(ds,
+ SPA_FEATURE_REDACTED_DATASETS) &&
+ !zfs_allow_redacted_dataset_mount)))) {
+ ds->ds_owner = tag;
+ dsl_dataset_long_hold(ds, tag);
+ gotit = TRUE;
+ }
+ mutex_exit(&ds->ds_lock);
+ return (gotit);
+}
+
+boolean_t
+dsl_dataset_has_owner(dsl_dataset_t *ds)
+{
+ boolean_t rv;
+ mutex_enter(&ds->ds_lock);
+ rv = (ds->ds_owner != NULL);
+ mutex_exit(&ds->ds_lock);
+ return (rv);
+}
+
+static boolean_t
+zfeature_active(spa_feature_t f, void *arg)
+{
+ switch (spa_feature_table[f].fi_type) {
+ case ZFEATURE_TYPE_BOOLEAN: {
+ boolean_t val = (boolean_t)(uintptr_t)arg;
+ ASSERT(val == B_FALSE || val == B_TRUE);
+ return (val);
+ }
+ case ZFEATURE_TYPE_UINT64_ARRAY:
+ /*
+ * In this case, arg points to a feature_type_uint64_array_arg
+ * (a length plus a uint64_t array). The feature is active if
+ * the pointer is non-NULL.
+ */
+ return (arg != NULL);
+ default:
+ panic("Invalid zfeature type %d", spa_feature_table[f].fi_type);
+ return (B_FALSE);
+ }
+}
+
+boolean_t
+dsl_dataset_feature_is_active(dsl_dataset_t *ds, spa_feature_t f)
+{
+ return (zfeature_active(f, ds->ds_feature[f]));
+}
+
+/*
+ * The buffers passed out by this function are references to internal buffers;
+ * they should not be freed by callers of this function, and they should not be
+ * used after the dataset has been released.
+ */
+boolean_t
+dsl_dataset_get_uint64_array_feature(dsl_dataset_t *ds, spa_feature_t f,
+ uint64_t *outlength, uint64_t **outp)
+{
+ VERIFY(spa_feature_table[f].fi_type & ZFEATURE_TYPE_UINT64_ARRAY);
+ if (!dsl_dataset_feature_is_active(ds, f)) {
+ return (B_FALSE);
+ }
+ struct feature_type_uint64_array_arg *ftuaa = ds->ds_feature[f];
+ *outp = ftuaa->array;
+ *outlength = ftuaa->length;
+ return (B_TRUE);
+}
+
+void
+dsl_dataset_activate_feature(uint64_t dsobj, spa_feature_t f, void *arg,
+ dmu_tx_t *tx)
+{
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+ objset_t *mos = dmu_tx_pool(tx)->dp_meta_objset;
+ uint64_t zero = 0;
+
+ VERIFY(spa_feature_table[f].fi_flags & ZFEATURE_FLAG_PER_DATASET);
+
+ spa_feature_incr(spa, f, tx);
+ dmu_object_zapify(mos, dsobj, DMU_OT_DSL_DATASET, tx);
+
+ switch (spa_feature_table[f].fi_type) {
+ case ZFEATURE_TYPE_BOOLEAN:
+ ASSERT3S((boolean_t)(uintptr_t)arg, ==, B_TRUE);
+ VERIFY0(zap_add(mos, dsobj, spa_feature_table[f].fi_guid,
+ sizeof (zero), 1, &zero, tx));
+ break;
+ case ZFEATURE_TYPE_UINT64_ARRAY:
+ {
+ struct feature_type_uint64_array_arg *ftuaa = arg;
+ VERIFY0(zap_add(mos, dsobj, spa_feature_table[f].fi_guid,
+ sizeof (uint64_t), ftuaa->length, ftuaa->array, tx));
+ break;
+ }
+ default:
+ panic("Invalid zfeature type %d", spa_feature_table[f].fi_type);
+ }
+}
+
+static void
+dsl_dataset_deactivate_feature_impl(dsl_dataset_t *ds, spa_feature_t f,
+ dmu_tx_t *tx)
+{
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+ objset_t *mos = dmu_tx_pool(tx)->dp_meta_objset;
+ uint64_t dsobj = ds->ds_object;
+
+ VERIFY(spa_feature_table[f].fi_flags & ZFEATURE_FLAG_PER_DATASET);
+
+ VERIFY0(zap_remove(mos, dsobj, spa_feature_table[f].fi_guid, tx));
+ spa_feature_decr(spa, f, tx);
+ ds->ds_feature[f] = NULL;
+}
+
+void
+dsl_dataset_deactivate_feature(dsl_dataset_t *ds, spa_feature_t f, dmu_tx_t *tx)
+{
+ unload_zfeature(ds, f);
+ dsl_dataset_deactivate_feature_impl(ds, f, tx);
+}
+
+uint64_t
+dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin,
+ dsl_crypto_params_t *dcp, uint64_t flags, dmu_tx_t *tx)
+{
+ dsl_pool_t *dp = dd->dd_pool;
+ dmu_buf_t *dbuf;
+ dsl_dataset_phys_t *dsphys;
+ uint64_t dsobj;
+ objset_t *mos = dp->dp_meta_objset;
+
+ if (origin == NULL)
+ origin = dp->dp_origin_snap;
+
+ ASSERT(origin == NULL || origin->ds_dir->dd_pool == dp);
+ ASSERT(origin == NULL || dsl_dataset_phys(origin)->ds_num_children > 0);
+ ASSERT(dmu_tx_is_syncing(tx));
+ ASSERT(dsl_dir_phys(dd)->dd_head_dataset_obj == 0);
+
+ dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
+ DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
+ VERIFY0(dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
+ dmu_buf_will_dirty(dbuf, tx);
+ dsphys = dbuf->db_data;
+ bzero(dsphys, sizeof (dsl_dataset_phys_t));
+ dsphys->ds_dir_obj = dd->dd_object;
+ dsphys->ds_flags = flags;
+ dsphys->ds_fsid_guid = unique_create();
+ (void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
+ sizeof (dsphys->ds_guid));
+ dsphys->ds_snapnames_zapobj =
+ zap_create_norm(mos, U8_TEXTPREP_TOUPPER, DMU_OT_DSL_DS_SNAP_MAP,
+ DMU_OT_NONE, 0, tx);
+ dsphys->ds_creation_time = gethrestime_sec();
+ dsphys->ds_creation_txg = tx->tx_txg == TXG_INITIAL ? 1 : tx->tx_txg;
+
+ if (origin == NULL) {
+ dsphys->ds_deadlist_obj = dsl_deadlist_alloc(mos, tx);
+ } else {
+ dsl_dataset_t *ohds; /* head of the origin snapshot */
+
+ dsphys->ds_prev_snap_obj = origin->ds_object;
+ dsphys->ds_prev_snap_txg =
+ dsl_dataset_phys(origin)->ds_creation_txg;
+ dsphys->ds_referenced_bytes =
+ dsl_dataset_phys(origin)->ds_referenced_bytes;
+ dsphys->ds_compressed_bytes =
+ dsl_dataset_phys(origin)->ds_compressed_bytes;
+ dsphys->ds_uncompressed_bytes =
+ dsl_dataset_phys(origin)->ds_uncompressed_bytes;
+ rrw_enter(&origin->ds_bp_rwlock, RW_READER, FTAG);
+ dsphys->ds_bp = dsl_dataset_phys(origin)->ds_bp;
+ rrw_exit(&origin->ds_bp_rwlock, FTAG);
+
+ /*
+ * Inherit flags that describe the dataset's contents
+ * (INCONSISTENT) or properties (Case Insensitive).
+ */
+ dsphys->ds_flags |= dsl_dataset_phys(origin)->ds_flags &
+ (DS_FLAG_INCONSISTENT | DS_FLAG_CI_DATASET);
+
+ for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
+ if (zfeature_active(f, origin->ds_feature[f])) {
+ dsl_dataset_activate_feature(dsobj, f,
+ origin->ds_feature[f], tx);
+ }
+ }
+
+ dmu_buf_will_dirty(origin->ds_dbuf, tx);
+ dsl_dataset_phys(origin)->ds_num_children++;
+
+ VERIFY0(dsl_dataset_hold_obj(dp,
+ dsl_dir_phys(origin->ds_dir)->dd_head_dataset_obj,
+ FTAG, &ohds));
+ dsphys->ds_deadlist_obj = dsl_deadlist_clone(&ohds->ds_deadlist,
+ dsphys->ds_prev_snap_txg, dsphys->ds_prev_snap_obj, tx);
+ dsl_dataset_rele(ohds, FTAG);
+
+ if (spa_version(dp->dp_spa) >= SPA_VERSION_NEXT_CLONES) {
+ if (dsl_dataset_phys(origin)->ds_next_clones_obj == 0) {
+ dsl_dataset_phys(origin)->ds_next_clones_obj =
+ zap_create(mos,
+ DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx);
+ }
+ VERIFY0(zap_add_int(mos,
+ dsl_dataset_phys(origin)->ds_next_clones_obj,
+ dsobj, tx));
+ }
+
+ dmu_buf_will_dirty(dd->dd_dbuf, tx);
+ dsl_dir_phys(dd)->dd_origin_obj = origin->ds_object;
+ if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
+ if (dsl_dir_phys(origin->ds_dir)->dd_clones == 0) {
+ dmu_buf_will_dirty(origin->ds_dir->dd_dbuf, tx);
+ dsl_dir_phys(origin->ds_dir)->dd_clones =
+ zap_create(mos,
+ DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx);
+ }
+ VERIFY0(zap_add_int(mos,
+ dsl_dir_phys(origin->ds_dir)->dd_clones,
+ dsobj, tx));
+ }
+ }
+
+ /* handle encryption */
+ dsl_dataset_create_crypt_sync(dsobj, dd, origin, dcp, tx);
+
+ if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE)
+ dsphys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
+
+ dmu_buf_rele(dbuf, FTAG);
+
+ dmu_buf_will_dirty(dd->dd_dbuf, tx);
+ dsl_dir_phys(dd)->dd_head_dataset_obj = dsobj;
+
+ return (dsobj);
+}
+
+static void
+dsl_dataset_zero_zil(dsl_dataset_t *ds, dmu_tx_t *tx)
+{
+ objset_t *os;
+
+ VERIFY0(dmu_objset_from_ds(ds, &os));
+ if (bcmp(&os->os_zil_header, &zero_zil, sizeof (zero_zil)) != 0) {
+ dsl_pool_t *dp = ds->ds_dir->dd_pool;
+ zio_t *zio;
+
+ bzero(&os->os_zil_header, sizeof (os->os_zil_header));
+ if (os->os_encrypted)
+ os->os_next_write_raw[tx->tx_txg & TXG_MASK] = B_TRUE;
+
+ zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
+ dsl_dataset_sync(ds, zio, tx);
+ VERIFY0(zio_wait(zio));
+
+ /* dsl_dataset_sync_done will drop this reference. */
+ dmu_buf_add_ref(ds->ds_dbuf, ds);
+ dsl_dataset_sync_done(ds, tx);
+ }
+}
+
+uint64_t
+dsl_dataset_create_sync(dsl_dir_t *pdd, const char *lastname,
+ dsl_dataset_t *origin, uint64_t flags, cred_t *cr,
+ dsl_crypto_params_t *dcp, dmu_tx_t *tx)
+{
+ dsl_pool_t *dp = pdd->dd_pool;
+ uint64_t dsobj, ddobj;
+ dsl_dir_t *dd;
+
+ ASSERT(dmu_tx_is_syncing(tx));
+ ASSERT(lastname[0] != '@');
+ /*
+ * Filesystems will eventually have their origin set to dp_origin_snap,
+ * but that's taken care of in dsl_dataset_create_sync_dd. When
+ * creating a filesystem, this function is called with origin equal to
+ * NULL.
+ */
+ if (origin != NULL)
+ ASSERT3P(origin, !=, dp->dp_origin_snap);
+
+ ddobj = dsl_dir_create_sync(dp, pdd, lastname, tx);
+ VERIFY0(dsl_dir_hold_obj(dp, ddobj, lastname, FTAG, &dd));
+
+ dsobj = dsl_dataset_create_sync_dd(dd, origin, dcp,
+ flags & ~DS_CREATE_FLAG_NODIRTY, tx);
+
+ dsl_deleg_set_create_perms(dd, tx, cr);
+
+ /*
+ * If we are creating a clone and the livelist feature is enabled,
+ * add the entry DD_FIELD_LIVELIST to ZAP.
+ */
+ if (origin != NULL &&
+ spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LIVELIST)) {
+ objset_t *mos = dd->dd_pool->dp_meta_objset;
+ dsl_dir_zapify(dd, tx);
+ uint64_t obj = dsl_deadlist_alloc(mos, tx);
+ VERIFY0(zap_add(mos, dd->dd_object, DD_FIELD_LIVELIST,
+ sizeof (uint64_t), 1, &obj, tx));
+ spa_feature_incr(dp->dp_spa, SPA_FEATURE_LIVELIST, tx);
+ }
+
+ /*
+ * Since we're creating a new node we know it's a leaf, so we can
+ * initialize the counts if the limit feature is active.
+ */
+ if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT)) {
+ uint64_t cnt = 0;
+ objset_t *os = dd->dd_pool->dp_meta_objset;
+
+ dsl_dir_zapify(dd, tx);
+ VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT,
+ sizeof (cnt), 1, &cnt, tx));
+ VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_SNAPSHOT_COUNT,
+ sizeof (cnt), 1, &cnt, tx));
+ }
+
+ dsl_dir_rele(dd, FTAG);
+
+ /*
+ * If we are creating a clone, make sure we zero out any stale
+ * data from the origin snapshot's zil header.
+ */
+ if (origin != NULL && !(flags & DS_CREATE_FLAG_NODIRTY)) {
+ dsl_dataset_t *ds;
+
+ VERIFY0(dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
+ dsl_dataset_zero_zil(ds, tx);
+ dsl_dataset_rele(ds, FTAG);
+ }
+
+ return (dsobj);
+}
+
+/*
+ * The unique space in the head dataset can be calculated by subtracting
+ * the space in the most recent snapshot that is still referenced by
+ * this file system from the space currently in use. To figure out
+ * the space in the most recent snapshot still in use, we need to take
+ * the total space used in the snapshot and subtract out the space that
+ * has been freed up since the snapshot was taken.
+ */
+void
+dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds)
+{
+ uint64_t mrs_used;
+ uint64_t dlused, dlcomp, dluncomp;
+
+ ASSERT(!ds->ds_is_snapshot);
+
+ if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0)
+ mrs_used = dsl_dataset_phys(ds->ds_prev)->ds_referenced_bytes;
+ else
+ mrs_used = 0;
+
+ dsl_deadlist_space(&ds->ds_deadlist, &dlused, &dlcomp, &dluncomp);
+
+ ASSERT3U(dlused, <=, mrs_used);
+ dsl_dataset_phys(ds)->ds_unique_bytes =
+ dsl_dataset_phys(ds)->ds_referenced_bytes - (mrs_used - dlused);
+
+ if (spa_version(ds->ds_dir->dd_pool->dp_spa) >=
+ SPA_VERSION_UNIQUE_ACCURATE)
+ dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
+}
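+
+/*
+ * For example, in the calculation above: if the head currently references
+ * 10G, the most recent snapshot referenced 8G when it was taken, and 3G of
+ * that snapshot's blocks have since been freed from the head (and so live
+ * on the head's deadlist), then 5G of the snapshot is still shared with
+ * the head and the head's unique space is 10G - 5G = 5G.
+ */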
+
+void
+dsl_dataset_remove_from_next_clones(dsl_dataset_t *ds, uint64_t obj,
+ dmu_tx_t *tx)
+{
+ objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
+ uint64_t count __maybe_unused;
+ int err;
+
+ ASSERT(dsl_dataset_phys(ds)->ds_num_children >= 2);
+ err = zap_remove_int(mos, dsl_dataset_phys(ds)->ds_next_clones_obj,
+ obj, tx);
+ /*
+ * The err should not be ENOENT, but a bug in a previous version
+ * of the code could cause upgrade_clones_cb() to not set
+ * ds_next_snap_obj when it should, leading to a missing entry.
+ * If we knew that the pool was created after
+ * SPA_VERSION_NEXT_CLONES, we could assert that it isn't
+ * ENOENT. However, at least we can check that we don't have
+ * too many entries in the next_clones_obj even after failing to
+ * remove this one.
+ */
+ if (err != ENOENT)
+ VERIFY0(err);
+ ASSERT0(zap_count(mos, dsl_dataset_phys(ds)->ds_next_clones_obj,
+ &count));
+ ASSERT3U(count, <=, dsl_dataset_phys(ds)->ds_num_children - 2);
+}
+
+
+blkptr_t *
+dsl_dataset_get_blkptr(dsl_dataset_t *ds)
+{
+ return (&dsl_dataset_phys(ds)->ds_bp);
+}
+
+spa_t *
+dsl_dataset_get_spa(dsl_dataset_t *ds)
+{
+ return (ds->ds_dir->dd_pool->dp_spa);
+}
+
+void
+dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx)
+{
+ dsl_pool_t *dp;
+
+ if (ds == NULL) /* this is the meta-objset */
+ return;
+
+ ASSERT(ds->ds_objset != NULL);
+
+ if (dsl_dataset_phys(ds)->ds_next_snap_obj != 0)
+ panic("dirtying snapshot!");
+
+ /* Must not dirty a dataset in the same txg where it got snapshotted. */
+ ASSERT3U(tx->tx_txg, >, dsl_dataset_phys(ds)->ds_prev_snap_txg);
+
+ dp = ds->ds_dir->dd_pool;
+ if (txg_list_add(&dp->dp_dirty_datasets, ds, tx->tx_txg)) {
+ objset_t *os = ds->ds_objset;
+
+ /* up the hold count until we can be written out */
+ dmu_buf_add_ref(ds->ds_dbuf, ds);
+
+ /* if this dataset is encrypted, grab a reference to the DCK */
+ if (ds->ds_dir->dd_crypto_obj != 0 &&
+ !os->os_raw_receive &&
+ !os->os_next_write_raw[tx->tx_txg & TXG_MASK]) {
+ ASSERT3P(ds->ds_key_mapping, !=, NULL);
+ key_mapping_add_ref(ds->ds_key_mapping, ds);
+ }
+ }
+}
+
+static int
+dsl_dataset_snapshot_reserve_space(dsl_dataset_t *ds, dmu_tx_t *tx)
+{
+ uint64_t asize;
+
+ if (!dmu_tx_is_syncing(tx))
+ return (0);
+
+ /*
+ * If there's an fs-only reservation, any blocks that might become
+ * owned by the snapshot dataset must be accommodated by space
+ * outside of the reservation.
+ */
+ ASSERT(ds->ds_reserved == 0 || DS_UNIQUE_IS_ACCURATE(ds));
+ asize = MIN(dsl_dataset_phys(ds)->ds_unique_bytes, ds->ds_reserved);
+ if (asize > dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE))
+ return (SET_ERROR(ENOSPC));
+
+ /*
+ * Propagate any reserved space for this snapshot to other
+ * snapshot checks in this sync group.
+ */
+ if (asize > 0)
+ dsl_dir_willuse_space(ds->ds_dir, asize, tx);
+
+ return (0);
+}
+
+int
+dsl_dataset_snapshot_check_impl(dsl_dataset_t *ds, const char *snapname,
+ dmu_tx_t *tx, boolean_t recv, uint64_t cnt, cred_t *cr, proc_t *proc)
+{
+ int error;
+ uint64_t value;
+
+ ds->ds_trysnap_txg = tx->tx_txg;
+
+ if (!dmu_tx_is_syncing(tx))
+ return (0);
+
+ /*
+ * We don't allow multiple snapshots in the same txg. If there
+ * is already one, try again.
+ */
+ if (dsl_dataset_phys(ds)->ds_prev_snap_txg >= tx->tx_txg)
+ return (SET_ERROR(EAGAIN));
+
+ /*
+ * Check for conflicting snapshot name.
+ */
+ error = dsl_dataset_snap_lookup(ds, snapname, &value);
+ if (error == 0)
+ return (SET_ERROR(EEXIST));
+ if (error != ENOENT)
+ return (error);
+
+ /*
+ * We don't allow taking snapshots of inconsistent datasets, such as
+ * those into which we are currently receiving. However, if we are
+ * creating this snapshot as part of a receive, this check will be
+ * executed atomically with respect to the completion of the receive
+ * itself but prior to the clearing of DS_FLAG_INCONSISTENT; in this
+ * case we ignore this, knowing it will be fixed up for us shortly in
+ * dmu_recv_end_sync().
+ */
+ if (!recv && DS_IS_INCONSISTENT(ds))
+ return (SET_ERROR(EBUSY));
+
+ /*
+ * Skip the check for temporary snapshots or if we have already checked
+ * the counts in dsl_dataset_snapshot_check. This means we really only
+ * check the count here when we're receiving a stream.
+ */
+ if (cnt != 0 && cr != NULL) {
+ error = dsl_fs_ss_limit_check(ds->ds_dir, cnt,
+ ZFS_PROP_SNAPSHOT_LIMIT, NULL, cr, proc);
+ if (error != 0)
+ return (error);
+ }
+
+ error = dsl_dataset_snapshot_reserve_space(ds, tx);
+ if (error != 0)
+ return (error);
+
+ return (0);
+}
+
+int
+dsl_dataset_snapshot_check(void *arg, dmu_tx_t *tx)
+{
+ dsl_dataset_snapshot_arg_t *ddsa = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ nvpair_t *pair;
+ int rv = 0;
+
+ /*
+ * Pre-compute how many total new snapshots will be created for each
+ * level in the tree and below. This is needed for validating the
+ * snapshot limit when either taking a recursive snapshot or when
+ * taking multiple snapshots.
+ *
+ * The problem is that the counts are not actually adjusted when
+ * we are checking, only when we finally sync. For a single snapshot,
+ * this is easy, the count will increase by 1 at each node up the tree,
+ * but it's more complicated for the recursive/multiple snapshot case.
+ *
+ * The dsl_fs_ss_limit_check function does recursively check the count
+ * at each level up the tree but since it is validating each snapshot
+ * independently we need to be sure that we are validating the complete
+ * count for the entire set of snapshots. We do this by rolling up the
+ * counts for each component of the name into an nvlist and then
+ * checking each of those cases with the aggregated count.
+ *
+ * This approach properly handles not only the recursive snapshot
+ * case (where we get all of those on the ddsa_snaps list) but also
+ * the sibling case (e.g. snapshot a/b and a/c so that we will also
+ * validate the limit on 'a' using a count of 2).
+ *
+ * We validate the snapshot names in the third loop and only report
+ * name errors once.
+ */
+ if (dmu_tx_is_syncing(tx)) {
+ char *nm;
+ nvlist_t *cnt_track = NULL;
+ cnt_track = fnvlist_alloc();
+
+ nm = kmem_alloc(MAXPATHLEN, KM_SLEEP);
+
+ /* Rollup aggregated counts into the cnt_track list */
+ for (pair = nvlist_next_nvpair(ddsa->ddsa_snaps, NULL);
+ pair != NULL;
+ pair = nvlist_next_nvpair(ddsa->ddsa_snaps, pair)) {
+ char *pdelim;
+ uint64_t val;
+
+ (void) strlcpy(nm, nvpair_name(pair), MAXPATHLEN);
+ pdelim = strchr(nm, '@');
+ if (pdelim == NULL)
+ continue;
+ *pdelim = '\0';
+
+ do {
+ if (nvlist_lookup_uint64(cnt_track, nm,
+ &val) == 0) {
+ /* update existing entry */
+ fnvlist_add_uint64(cnt_track, nm,
+ val + 1);
+ } else {
+ /* add to list */
+ fnvlist_add_uint64(cnt_track, nm, 1);
+ }
+
+ pdelim = strrchr(nm, '/');
+ if (pdelim != NULL)
+ *pdelim = '\0';
+ } while (pdelim != NULL);
+ }
+
+ kmem_free(nm, MAXPATHLEN);
+
+ /* Check aggregated counts at each level */
+ for (pair = nvlist_next_nvpair(cnt_track, NULL);
+ pair != NULL; pair = nvlist_next_nvpair(cnt_track, pair)) {
+ int error = 0;
+ char *name;
+ uint64_t cnt = 0;
+ dsl_dataset_t *ds;
+
+ name = nvpair_name(pair);
+ cnt = fnvpair_value_uint64(pair);
+ ASSERT(cnt > 0);
+
+ error = dsl_dataset_hold(dp, name, FTAG, &ds);
+ if (error == 0) {
+ error = dsl_fs_ss_limit_check(ds->ds_dir, cnt,
+ ZFS_PROP_SNAPSHOT_LIMIT, NULL,
+ ddsa->ddsa_cr, ddsa->ddsa_proc);
+ dsl_dataset_rele(ds, FTAG);
+ }
+
+ if (error != 0) {
+ if (ddsa->ddsa_errors != NULL)
+ fnvlist_add_int32(ddsa->ddsa_errors,
+ name, error);
+ rv = error;
+ /* only report one error for this check */
+ break;
+ }
+ }
+ nvlist_free(cnt_track);
+ }
+
+ for (pair = nvlist_next_nvpair(ddsa->ddsa_snaps, NULL);
+ pair != NULL; pair = nvlist_next_nvpair(ddsa->ddsa_snaps, pair)) {
+ int error = 0;
+ dsl_dataset_t *ds;
+ char *name, *atp = NULL;
+ char dsname[ZFS_MAX_DATASET_NAME_LEN];
+
+ name = nvpair_name(pair);
+ if (strlen(name) >= ZFS_MAX_DATASET_NAME_LEN)
+ error = SET_ERROR(ENAMETOOLONG);
+ if (error == 0) {
+ atp = strchr(name, '@');
+ if (atp == NULL)
+ error = SET_ERROR(EINVAL);
+ if (error == 0)
+ (void) strlcpy(dsname, name, atp - name + 1);
+ }
+ if (error == 0)
+ error = dsl_dataset_hold(dp, dsname, FTAG, &ds);
+ if (error == 0) {
+ /* passing 0/NULL skips dsl_fs_ss_limit_check */
+ error = dsl_dataset_snapshot_check_impl(ds,
+ atp + 1, tx, B_FALSE, 0, NULL, NULL);
+ dsl_dataset_rele(ds, FTAG);
+ }
+
+ if (error != 0) {
+ if (ddsa->ddsa_errors != NULL) {
+ fnvlist_add_int32(ddsa->ddsa_errors,
+ name, error);
+ }
+ rv = error;
+ }
+ }
+
+ return (rv);
+}
+
+void
+dsl_dataset_snapshot_sync_impl(dsl_dataset_t *ds, const char *snapname,
+ dmu_tx_t *tx)
+{
+ dsl_pool_t *dp = ds->ds_dir->dd_pool;
+ dmu_buf_t *dbuf;
+ dsl_dataset_phys_t *dsphys;
+ uint64_t dsobj, crtxg;
+ objset_t *mos = dp->dp_meta_objset;
+ static zil_header_t zero_zil __maybe_unused;
+ objset_t *os __maybe_unused;
+
+ ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock));
+
+ /*
+ * If we are on an old pool, the zil must not be active, in which
+ * case it will be zeroed. Usually zil_suspend() accomplishes this.
+ */
+ ASSERT(spa_version(dmu_tx_pool(tx)->dp_spa) >= SPA_VERSION_FAST_SNAP ||
+ dmu_objset_from_ds(ds, &os) != 0 ||
+ bcmp(&os->os_phys->os_zil_header, &zero_zil,
+ sizeof (zero_zil)) == 0);
+
+ /* Should not snapshot a dirty dataset. */
+ ASSERT(!txg_list_member(&ds->ds_dir->dd_pool->dp_dirty_datasets,
+ ds, tx->tx_txg));
+
+ dsl_fs_ss_count_adjust(ds->ds_dir, 1, DD_FIELD_SNAPSHOT_COUNT, tx);
+
+ /*
+ * The origin's ds_creation_txg has to be < TXG_INITIAL
+ */
+ if (strcmp(snapname, ORIGIN_DIR_NAME) == 0)
+ crtxg = 1;
+ else
+ crtxg = tx->tx_txg;
+
+ dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
+ DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
+ VERIFY0(dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
+ dmu_buf_will_dirty(dbuf, tx);
+ dsphys = dbuf->db_data;
+ bzero(dsphys, sizeof (dsl_dataset_phys_t));
+ dsphys->ds_dir_obj = ds->ds_dir->dd_object;
+ dsphys->ds_fsid_guid = unique_create();
+ (void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
+ sizeof (dsphys->ds_guid));
+ dsphys->ds_prev_snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
+ dsphys->ds_prev_snap_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg;
+ dsphys->ds_next_snap_obj = ds->ds_object;
+ dsphys->ds_num_children = 1;
+ dsphys->ds_creation_time = gethrestime_sec();
+ dsphys->ds_creation_txg = crtxg;
+ dsphys->ds_deadlist_obj = dsl_dataset_phys(ds)->ds_deadlist_obj;
+ dsphys->ds_referenced_bytes = dsl_dataset_phys(ds)->ds_referenced_bytes;
+ dsphys->ds_compressed_bytes = dsl_dataset_phys(ds)->ds_compressed_bytes;
+ dsphys->ds_uncompressed_bytes =
+ dsl_dataset_phys(ds)->ds_uncompressed_bytes;
+ dsphys->ds_flags = dsl_dataset_phys(ds)->ds_flags;
+ rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
+ dsphys->ds_bp = dsl_dataset_phys(ds)->ds_bp;
+ rrw_exit(&ds->ds_bp_rwlock, FTAG);
+ dmu_buf_rele(dbuf, FTAG);
+
+ for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
+ if (zfeature_active(f, ds->ds_feature[f])) {
+ dsl_dataset_activate_feature(dsobj, f,
+ ds->ds_feature[f], tx);
+ }
+ }
+
+ ASSERT3U(ds->ds_prev != 0, ==,
+ dsl_dataset_phys(ds)->ds_prev_snap_obj != 0);
+ if (ds->ds_prev) {
+ uint64_t next_clones_obj =
+ dsl_dataset_phys(ds->ds_prev)->ds_next_clones_obj;
+ ASSERT(dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj ==
+ ds->ds_object ||
+ dsl_dataset_phys(ds->ds_prev)->ds_num_children > 1);
+ if (dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj ==
+ ds->ds_object) {
+ dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
+ ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_txg, ==,
+ dsl_dataset_phys(ds->ds_prev)->ds_creation_txg);
+ dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj = dsobj;
+ } else if (next_clones_obj != 0) {
+ dsl_dataset_remove_from_next_clones(ds->ds_prev,
+ dsphys->ds_next_snap_obj, tx);
+ VERIFY0(zap_add_int(mos,
+ next_clones_obj, dsobj, tx));
+ }
+ }
+
+ /*
+ * If we have a reference-reservation on this dataset, we will
+ * need to increase the amount of refreservation being charged
+ * since our unique space is going to zero.
+ */
+ if (ds->ds_reserved) {
+ int64_t delta;
+ ASSERT(DS_UNIQUE_IS_ACCURATE(ds));
+ delta = MIN(dsl_dataset_phys(ds)->ds_unique_bytes,
+ ds->ds_reserved);
+ dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV,
+ delta, 0, 0, tx);
+ }
+
+ dmu_buf_will_dirty(ds->ds_dbuf, tx);
+ dsl_dataset_phys(ds)->ds_deadlist_obj =
+ dsl_deadlist_clone(&ds->ds_deadlist, UINT64_MAX,
+ dsl_dataset_phys(ds)->ds_prev_snap_obj, tx);
+ dsl_deadlist_close(&ds->ds_deadlist);
+ dsl_deadlist_open(&ds->ds_deadlist, mos,
+ dsl_dataset_phys(ds)->ds_deadlist_obj);
+ dsl_deadlist_add_key(&ds->ds_deadlist,
+ dsl_dataset_phys(ds)->ds_prev_snap_txg, tx);
+ dsl_bookmark_snapshotted(ds, tx);
+
+ if (dsl_dataset_remap_deadlist_exists(ds)) {
+ uint64_t remap_deadlist_obj =
+ dsl_dataset_get_remap_deadlist_object(ds);
+ /*
+ * Move the remap_deadlist to the snapshot. The head
+ * will create a new remap deadlist on demand, from
+ * dsl_dataset_block_remapped().
+ */
+ dsl_dataset_unset_remap_deadlist_object(ds, tx);
+ dsl_deadlist_close(&ds->ds_remap_deadlist);
+
+ dmu_object_zapify(mos, dsobj, DMU_OT_DSL_DATASET, tx);
+ VERIFY0(zap_add(mos, dsobj, DS_FIELD_REMAP_DEADLIST,
+ sizeof (remap_deadlist_obj), 1, &remap_deadlist_obj, tx));
+ }
+
+ /*
+ * Create an ivset guid for this snapshot if the dataset is
+ * encrypted. This may be overridden by a raw receive. A
+ * previous implementation of this code did not have this
+ * field as part of the on-disk format for ZFS encryption
+ * (see errata #4). As part of the remediation for this
+ * issue, we ask the user to enable the bookmark_v2 feature
+ * which is now a dependency of the encryption feature. We
+ * use this as a heuristic to determine when the user has
+ * elected to correct any datasets created with the old code.
+ * As a result, we only do this step if the bookmark_v2
+ * feature is enabled, which limits the number of states a
+ * given pool / dataset can be in with respect to correcting
+ * the issue.
+ */
+ if (ds->ds_dir->dd_crypto_obj != 0 &&
+ spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_BOOKMARK_V2)) {
+ uint64_t ivset_guid = unique_create();
+
+ dmu_object_zapify(mos, dsobj, DMU_OT_DSL_DATASET, tx);
+ VERIFY0(zap_add(mos, dsobj, DS_FIELD_IVSET_GUID,
+ sizeof (ivset_guid), 1, &ivset_guid, tx));
+ }
+
+ ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_txg, <, tx->tx_txg);
+ dsl_dataset_phys(ds)->ds_prev_snap_obj = dsobj;
+ dsl_dataset_phys(ds)->ds_prev_snap_txg = crtxg;
+ dsl_dataset_phys(ds)->ds_unique_bytes = 0;
+
+ if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE)
+ dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
+
+ VERIFY0(zap_add(mos, dsl_dataset_phys(ds)->ds_snapnames_zapobj,
+ snapname, 8, 1, &dsobj, tx));
+
+ if (ds->ds_prev)
+ dsl_dataset_rele(ds->ds_prev, ds);
+ VERIFY0(dsl_dataset_hold_obj(dp,
+ dsl_dataset_phys(ds)->ds_prev_snap_obj, ds, &ds->ds_prev));
+
+ dsl_scan_ds_snapshotted(ds, tx);
+
+ dsl_dir_snap_cmtime_update(ds->ds_dir);
+
+ spa_history_log_internal_ds(ds->ds_prev, "snapshot", tx, " ");
+}
+
+void
+dsl_dataset_snapshot_sync(void *arg, dmu_tx_t *tx)
+{
+ dsl_dataset_snapshot_arg_t *ddsa = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ nvpair_t *pair;
+
+ for (pair = nvlist_next_nvpair(ddsa->ddsa_snaps, NULL);
+ pair != NULL; pair = nvlist_next_nvpair(ddsa->ddsa_snaps, pair)) {
+ dsl_dataset_t *ds;
+ char *name, *atp;
+ char dsname[ZFS_MAX_DATASET_NAME_LEN];
+
+ name = nvpair_name(pair);
+ atp = strchr(name, '@');
+ (void) strlcpy(dsname, name, atp - name + 1);
+ VERIFY0(dsl_dataset_hold(dp, dsname, FTAG, &ds));
+
+ dsl_dataset_snapshot_sync_impl(ds, atp + 1, tx);
+ if (ddsa->ddsa_props != NULL) {
+ dsl_props_set_sync_impl(ds->ds_prev,
+ ZPROP_SRC_LOCAL, ddsa->ddsa_props, tx);
+ }
+ dsl_dataset_rele(ds, FTAG);
+ }
+}
+
+/*
+ * The snapshots must all be in the same pool.
+ * All-or-nothing: if there are any failures, nothing will be modified.
+ */
+int
+dsl_dataset_snapshot(nvlist_t *snaps, nvlist_t *props, nvlist_t *errors)
+{
+ dsl_dataset_snapshot_arg_t ddsa;
+ nvpair_t *pair;
+ boolean_t needsuspend;
+ int error;
+ spa_t *spa;
+ char *firstname;
+ nvlist_t *suspended = NULL;
+
+ pair = nvlist_next_nvpair(snaps, NULL);
+ if (pair == NULL)
+ return (0);
+ firstname = nvpair_name(pair);
+
+ error = spa_open(firstname, &spa, FTAG);
+ if (error != 0)
+ return (error);
+ needsuspend = (spa_version(spa) < SPA_VERSION_FAST_SNAP);
+ spa_close(spa, FTAG);
+
+ if (needsuspend) {
+ suspended = fnvlist_alloc();
+ for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL;
+ pair = nvlist_next_nvpair(snaps, pair)) {
+ char fsname[ZFS_MAX_DATASET_NAME_LEN];
+ char *snapname = nvpair_name(pair);
+ char *atp;
+ void *cookie;
+
+ atp = strchr(snapname, '@');
+ if (atp == NULL) {
+ error = SET_ERROR(EINVAL);
+ break;
+ }
+ (void) strlcpy(fsname, snapname, atp - snapname + 1);
+
+ error = zil_suspend(fsname, &cookie);
+ if (error != 0)
+ break;
+ fnvlist_add_uint64(suspended, fsname,
+ (uintptr_t)cookie);
+ }
+ }
+
+ ddsa.ddsa_snaps = snaps;
+ ddsa.ddsa_props = props;
+ ddsa.ddsa_errors = errors;
+ ddsa.ddsa_cr = CRED();
+ ddsa.ddsa_proc = curproc;
+
+ if (error == 0) {
+ error = dsl_sync_task(firstname, dsl_dataset_snapshot_check,
+ dsl_dataset_snapshot_sync, &ddsa,
+ fnvlist_num_pairs(snaps) * 3, ZFS_SPACE_CHECK_NORMAL);
+ }
+
+ if (suspended != NULL) {
+ for (pair = nvlist_next_nvpair(suspended, NULL); pair != NULL;
+ pair = nvlist_next_nvpair(suspended, pair)) {
+ zil_resume((void *)(uintptr_t)
+ fnvpair_value_uint64(pair));
+ }
+ fnvlist_free(suspended);
+ }
+
+ if (error == 0) {
+ for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL;
+ pair = nvlist_next_nvpair(snaps, pair)) {
+ zvol_create_minor(nvpair_name(pair));
+ }
+ }
+
+ return (error);
+}
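+
+/*
+ * An illustrative dsl_dataset_snapshot() caller (the dataset names and
+ * error handling here are hypothetical); the keys of the snaps nvlist are
+ * the full snapshot names and the pair values are ignored:
+ *
+ *     nvlist_t *snaps = fnvlist_alloc();
+ *     fnvlist_add_boolean(snaps, "tank/fs@today");
+ *     fnvlist_add_boolean(snaps, "tank/vol@today");
+ *     error = dsl_dataset_snapshot(snaps, NULL, NULL);
+ *     fnvlist_free(snaps);
+ */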
+
+typedef struct dsl_dataset_snapshot_tmp_arg {
+ const char *ddsta_fsname;
+ const char *ddsta_snapname;
+ minor_t ddsta_cleanup_minor;
+ const char *ddsta_htag;
+} dsl_dataset_snapshot_tmp_arg_t;
+
+static int
+dsl_dataset_snapshot_tmp_check(void *arg, dmu_tx_t *tx)
+{
+ dsl_dataset_snapshot_tmp_arg_t *ddsta = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ dsl_dataset_t *ds;
+ int error;
+
+ error = dsl_dataset_hold(dp, ddsta->ddsta_fsname, FTAG, &ds);
+ if (error != 0)
+ return (error);
+
+ /* NULL cred means no limit check for tmp snapshot */
+ error = dsl_dataset_snapshot_check_impl(ds, ddsta->ddsta_snapname,
+ tx, B_FALSE, 0, NULL, NULL);
+ if (error != 0) {
+ dsl_dataset_rele(ds, FTAG);
+ return (error);
+ }
+
+ if (spa_version(dp->dp_spa) < SPA_VERSION_USERREFS) {
+ dsl_dataset_rele(ds, FTAG);
+ return (SET_ERROR(ENOTSUP));
+ }
+ error = dsl_dataset_user_hold_check_one(NULL, ddsta->ddsta_htag,
+ B_TRUE, tx);
+ if (error != 0) {
+ dsl_dataset_rele(ds, FTAG);
+ return (error);
+ }
+
+ dsl_dataset_rele(ds, FTAG);
+ return (0);
+}
+
+static void
+dsl_dataset_snapshot_tmp_sync(void *arg, dmu_tx_t *tx)
+{
+ dsl_dataset_snapshot_tmp_arg_t *ddsta = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ dsl_dataset_t *ds = NULL;
+
+ VERIFY0(dsl_dataset_hold(dp, ddsta->ddsta_fsname, FTAG, &ds));
+
+ dsl_dataset_snapshot_sync_impl(ds, ddsta->ddsta_snapname, tx);
+ dsl_dataset_user_hold_sync_one(ds->ds_prev, ddsta->ddsta_htag,
+ ddsta->ddsta_cleanup_minor, gethrestime_sec(), tx);
+ dsl_destroy_snapshot_sync_impl(ds->ds_prev, B_TRUE, tx);
+
+ dsl_dataset_rele(ds, FTAG);
+}
+
+int
+dsl_dataset_snapshot_tmp(const char *fsname, const char *snapname,
+ minor_t cleanup_minor, const char *htag)
+{
+ dsl_dataset_snapshot_tmp_arg_t ddsta;
+ int error;
+ spa_t *spa;
+ boolean_t needsuspend;
+ void *cookie;
+
+ ddsta.ddsta_fsname = fsname;
+ ddsta.ddsta_snapname = snapname;
+ ddsta.ddsta_cleanup_minor = cleanup_minor;
+ ddsta.ddsta_htag = htag;
+
+ error = spa_open(fsname, &spa, FTAG);
+ if (error != 0)
+ return (error);
+ needsuspend = (spa_version(spa) < SPA_VERSION_FAST_SNAP);
+ spa_close(spa, FTAG);
+
+ if (needsuspend) {
+ error = zil_suspend(fsname, &cookie);
+ if (error != 0)
+ return (error);
+ }
+
+ error = dsl_sync_task(fsname, dsl_dataset_snapshot_tmp_check,
+ dsl_dataset_snapshot_tmp_sync, &ddsta, 3, ZFS_SPACE_CHECK_RESERVED);
+
+ if (needsuspend)
+ zil_resume(cookie);
+ return (error);
+}
+
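+/*
+ * Write out this dataset's dirty data for the given txg: sync the fsid guid,
+ * persist any resumable-receive bookkeeping, sync the objset, and activate
+ * any per-dataset features that became pending during this txg.
+ */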
+void
+dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx)
+{
+ ASSERT(dmu_tx_is_syncing(tx));
+ ASSERT(ds->ds_objset != NULL);
+ ASSERT(dsl_dataset_phys(ds)->ds_next_snap_obj == 0);
+
+ /*
+ * in case we had to change ds_fsid_guid when we opened it,
+ * sync it out now.
+ */
+ dmu_buf_will_dirty(ds->ds_dbuf, tx);
+ dsl_dataset_phys(ds)->ds_fsid_guid = ds->ds_fsid_guid;
+
+ if (ds->ds_resume_bytes[tx->tx_txg & TXG_MASK] != 0) {
+ VERIFY0(zap_update(tx->tx_pool->dp_meta_objset,
+ ds->ds_object, DS_FIELD_RESUME_OBJECT, 8, 1,
+ &ds->ds_resume_object[tx->tx_txg & TXG_MASK], tx));
+ VERIFY0(zap_update(tx->tx_pool->dp_meta_objset,
+ ds->ds_object, DS_FIELD_RESUME_OFFSET, 8, 1,
+ &ds->ds_resume_offset[tx->tx_txg & TXG_MASK], tx));
+ VERIFY0(zap_update(tx->tx_pool->dp_meta_objset,
+ ds->ds_object, DS_FIELD_RESUME_BYTES, 8, 1,
+ &ds->ds_resume_bytes[tx->tx_txg & TXG_MASK], tx));
+ ds->ds_resume_object[tx->tx_txg & TXG_MASK] = 0;
+ ds->ds_resume_offset[tx->tx_txg & TXG_MASK] = 0;
+ ds->ds_resume_bytes[tx->tx_txg & TXG_MASK] = 0;
+ }
+
+ dmu_objset_sync(ds->ds_objset, zio, tx);
+
+ for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
+ if (zfeature_active(f, ds->ds_feature_activation[f])) {
+ if (zfeature_active(f, ds->ds_feature[f]))
+ continue;
+ dsl_dataset_activate_feature(ds->ds_object, f,
+ ds->ds_feature_activation[f], tx);
+ ds->ds_feature[f] = ds->ds_feature_activation[f];
+ }
+ }
+}
+
+/*
+ * Check if the percentage of blocks shared between the clone and the
+ * snapshot (as opposed to those that are clone only) is below a certain
+ * threshold
+ */
+static boolean_t
+dsl_livelist_should_disable(dsl_dataset_t *ds)
+{
+ uint64_t used, referenced;
+ int percent_shared;
+
+ used = dsl_dir_get_usedds(ds->ds_dir);
+ referenced = dsl_get_referenced(ds);
+ ASSERT3U(referenced, >=, 0);
+ ASSERT3U(used, >=, 0);
+ if (referenced == 0)
+ return (B_FALSE);
+ percent_shared = (100 * (referenced - used)) / referenced;
+ if (percent_shared <= zfs_livelist_min_percent_shared)
+ return (B_TRUE);
+ return (B_FALSE);
+}
+
+/*
+ * Check if it is possible to combine two livelist entries into one.
+ * This is the case if the combined number of 'live' blkptrs (ALLOCs that
+ * don't have a matching FREE) is under the maximum sublist size.
+ * We check this by subtracting twice the total number of frees from the total
+ * number of blkptrs. FREEs are counted twice because each FREE blkptr
+ * will cancel out an ALLOC blkptr when the livelist is processed.
+ */
+static boolean_t
+dsl_livelist_should_condense(dsl_deadlist_entry_t *first,
+ dsl_deadlist_entry_t *next)
+{
+ uint64_t total_free = first->dle_bpobj.bpo_phys->bpo_num_freed +
+ next->dle_bpobj.bpo_phys->bpo_num_freed;
+ uint64_t total_entries = first->dle_bpobj.bpo_phys->bpo_num_blkptrs +
+ next->dle_bpobj.bpo_phys->bpo_num_blkptrs;
+ if ((total_entries - (2 * total_free)) < zfs_livelist_max_entries)
+ return (B_TRUE);
+ return (B_FALSE);
+}
+
+typedef struct try_condense_arg {
+ spa_t *spa;
+ dsl_dataset_t *ds;
+} try_condense_arg_t;
+
+/*
+ * Iterate over the livelist entries, searching for a pair to condense.
+ * A nonzero return value means stop, 0 means keep looking.
+ */
+static int
+dsl_livelist_try_condense(void *arg, dsl_deadlist_entry_t *first)
+{
+ try_condense_arg_t *tca = arg;
+ spa_t *spa = tca->spa;
+ dsl_dataset_t *ds = tca->ds;
+ dsl_deadlist_t *ll = &ds->ds_dir->dd_livelist;
+ dsl_deadlist_entry_t *next;
+
+ /* The condense thread has not yet been created at import */
+ if (spa->spa_livelist_condense_zthr == NULL)
+ return (1);
+
+ /* A condense is already in progress */
+ if (spa->spa_to_condense.ds != NULL)
+ return (1);
+
+ next = AVL_NEXT(&ll->dl_tree, &first->dle_node);
+ /* The livelist has only one entry - don't condense it */
+ if (next == NULL)
+ return (1);
+
+ /* Next is the newest entry - don't condense it */
+ if (AVL_NEXT(&ll->dl_tree, &next->dle_node) == NULL)
+ return (1);
+
+ /* This pair is not ready to condense but keep looking */
+ if (!dsl_livelist_should_condense(first, next))
+ return (0);
+
+ /*
+ * Add a ref to prevent the dataset from being evicted while
+ * the condense zthr or synctask are running. Ref will be
+ * released at the end of the condense synctask
+ */
+ dmu_buf_add_ref(ds->ds_dbuf, spa);
+
+ spa->spa_to_condense.ds = ds;
+ spa->spa_to_condense.first = first;
+ spa->spa_to_condense.next = next;
+ spa->spa_to_condense.syncing = B_FALSE;
+ spa->spa_to_condense.cancelled = B_FALSE;
+
+ zthr_wakeup(spa->spa_livelist_condense_zthr);
+ return (1);
+}
+
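+/*
+ * Move this txg's pending ALLOC and FREE blkptrs onto the on-disk livelist,
+ * opening a new sub-livelist if the newest one is already full, and then
+ * look for adjacent sub-livelists that can be condensed.
+ */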
+static void
+dsl_flush_pending_livelist(dsl_dataset_t *ds, dmu_tx_t *tx)
+{
+ dsl_dir_t *dd = ds->ds_dir;
+ spa_t *spa = ds->ds_dir->dd_pool->dp_spa;
+ dsl_deadlist_entry_t *last = dsl_deadlist_last(&dd->dd_livelist);
+
+ /* Check if we need to add a new sub-livelist */
+ if (last == NULL) {
+ /* The livelist is empty */
+ dsl_deadlist_add_key(&dd->dd_livelist,
+ tx->tx_txg - 1, tx);
+ } else if (spa_sync_pass(spa) == 1) {
+ /*
+ * Check if the newest entry is full. If it is, make a new one.
+ * We only do this once per sync because we could overfill a
+ * sublist in one sync pass and don't want to add another entry
+ * for a txg that is already represented. This ensures that
+ * blkptrs born in the same txg are stored in the same sublist.
+ */
+ bpobj_t bpobj = last->dle_bpobj;
+ uint64_t all = bpobj.bpo_phys->bpo_num_blkptrs;
+ uint64_t free = bpobj.bpo_phys->bpo_num_freed;
+ uint64_t alloc = all - free;
+ if (alloc > zfs_livelist_max_entries) {
+ dsl_deadlist_add_key(&dd->dd_livelist,
+ tx->tx_txg - 1, tx);
+ }
+ }
+
+ /* Insert each entry into the on-disk livelist */
+ bplist_iterate(&dd->dd_pending_allocs,
+ dsl_deadlist_insert_alloc_cb, &dd->dd_livelist, tx);
+ bplist_iterate(&dd->dd_pending_frees,
+ dsl_deadlist_insert_free_cb, &dd->dd_livelist, tx);
+
+ /* Attempt to condense every pair of adjacent entries */
+ try_condense_arg_t arg = {
+ .spa = spa,
+ .ds = ds
+ };
+ dsl_deadlist_iterate(&dd->dd_livelist, dsl_livelist_try_condense,
+ &arg);
+}
+
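+/*
+ * Finish syncing this dataset: flush the pending deadlist (and the livelist,
+ * if one is open), finalize bookmarks, and release the dataset's dbuf hold.
+ */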
+void
+dsl_dataset_sync_done(dsl_dataset_t *ds, dmu_tx_t *tx)
+{
+ objset_t *os = ds->ds_objset;
+
+ bplist_iterate(&ds->ds_pending_deadlist,
+ dsl_deadlist_insert_alloc_cb, &ds->ds_deadlist, tx);
+
+ if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist)) {
+ dsl_flush_pending_livelist(ds, tx);
+ if (dsl_livelist_should_disable(ds)) {
+ dsl_dir_remove_livelist(ds->ds_dir, tx, B_TRUE);
+ }
+ }
+
+ dsl_bookmark_sync_done(ds, tx);
+
+ multilist_destroy(os->os_synced_dnodes);
+ os->os_synced_dnodes = NULL;
+
+ if (os->os_encrypted)
+ os->os_next_write_raw[tx->tx_txg & TXG_MASK] = B_FALSE;
+ else
+ ASSERT0(os->os_next_write_raw[tx->tx_txg & TXG_MASK]);
+
+ ASSERT(!dmu_objset_is_dirty(os, dmu_tx_get_txg(tx)));
+
+ dmu_buf_rele(ds->ds_dbuf, ds);
+}
+
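+/*
+ * Fill 'val' with the names of this snapshot's clones, as recorded in
+ * ds_next_clones_obj. Returns ENOENT if that list does not have the
+ * expected number of entries.
+ */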
+int
+get_clones_stat_impl(dsl_dataset_t *ds, nvlist_t *val)
+{
+ uint64_t count = 0;
+ objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
+ zap_cursor_t zc;
+ zap_attribute_t za;
+
+ ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool));
+
+ /*
+ * There may be missing entries in ds_next_clones_obj
+ * due to a bug in a previous version of the code.
+ * Only trust it if it has the right number of entries.
+ */
+ if (dsl_dataset_phys(ds)->ds_next_clones_obj != 0) {
+ VERIFY0(zap_count(mos, dsl_dataset_phys(ds)->ds_next_clones_obj,
+ &count));
+ }
+ if (count != dsl_dataset_phys(ds)->ds_num_children - 1) {
+ return (SET_ERROR(ENOENT));
+ }
+ for (zap_cursor_init(&zc, mos,
+ dsl_dataset_phys(ds)->ds_next_clones_obj);
+ zap_cursor_retrieve(&zc, &za) == 0;
+ zap_cursor_advance(&zc)) {
+ dsl_dataset_t *clone;
+ char buf[ZFS_MAX_DATASET_NAME_LEN];
+ VERIFY0(dsl_dataset_hold_obj(ds->ds_dir->dd_pool,
+ za.za_first_integer, FTAG, &clone));
+ dsl_dir_name(clone->ds_dir, buf);
+ fnvlist_add_boolean(val, buf);
+ dsl_dataset_rele(clone, FTAG);
+ }
+ zap_cursor_fini(&zc);
+ return (0);
+}
+
+void
+get_clones_stat(dsl_dataset_t *ds, nvlist_t *nv)
+{
+ nvlist_t *propval = fnvlist_alloc();
+ nvlist_t *val = fnvlist_alloc();
+
+ if (get_clones_stat_impl(ds, val) == 0) {
+ fnvlist_add_nvlist(propval, ZPROP_VALUE, val);
+ fnvlist_add_nvlist(nv, zfs_prop_to_name(ZFS_PROP_CLONES),
+ propval);
+ }
+
+ nvlist_free(val);
+ nvlist_free(propval);
+}
+
+/*
+ * Returns a string that represents the receive resume stats token. It should
+ * be freed with kmem_strfree().
+ */
+char *
+get_receive_resume_stats_impl(dsl_dataset_t *ds)
+{
+ dsl_pool_t *dp = ds->ds_dir->dd_pool;
+
+ if (dsl_dataset_has_resume_receive_state(ds)) {
+ char *str;
+ void *packed;
+ uint8_t *compressed;
+ uint64_t val;
+ nvlist_t *token_nv = fnvlist_alloc();
+ size_t packed_size, compressed_size;
+
+ if (zap_lookup(dp->dp_meta_objset, ds->ds_object,
+ DS_FIELD_RESUME_FROMGUID, sizeof (val), 1, &val) == 0) {
+ fnvlist_add_uint64(token_nv, "fromguid", val);
+ }
+ if (zap_lookup(dp->dp_meta_objset, ds->ds_object,
+ DS_FIELD_RESUME_OBJECT, sizeof (val), 1, &val) == 0) {
+ fnvlist_add_uint64(token_nv, "object", val);
+ }
+ if (zap_lookup(dp->dp_meta_objset, ds->ds_object,
+ DS_FIELD_RESUME_OFFSET, sizeof (val), 1, &val) == 0) {
+ fnvlist_add_uint64(token_nv, "offset", val);
+ }
+ if (zap_lookup(dp->dp_meta_objset, ds->ds_object,
+ DS_FIELD_RESUME_BYTES, sizeof (val), 1, &val) == 0) {
+ fnvlist_add_uint64(token_nv, "bytes", val);
+ }
+ if (zap_lookup(dp->dp_meta_objset, ds->ds_object,
+ DS_FIELD_RESUME_TOGUID, sizeof (val), 1, &val) == 0) {
+ fnvlist_add_uint64(token_nv, "toguid", val);
+ }
+ char buf[MAXNAMELEN];
+ if (zap_lookup(dp->dp_meta_objset, ds->ds_object,
+ DS_FIELD_RESUME_TONAME, 1, sizeof (buf), buf) == 0) {
+ fnvlist_add_string(token_nv, "toname", buf);
+ }
+ if (zap_contains(dp->dp_meta_objset, ds->ds_object,
+ DS_FIELD_RESUME_LARGEBLOCK) == 0) {
+ fnvlist_add_boolean(token_nv, "largeblockok");
+ }
+ if (zap_contains(dp->dp_meta_objset, ds->ds_object,
+ DS_FIELD_RESUME_EMBEDOK) == 0) {
+ fnvlist_add_boolean(token_nv, "embedok");
+ }
+ if (zap_contains(dp->dp_meta_objset, ds->ds_object,
+ DS_FIELD_RESUME_COMPRESSOK) == 0) {
+ fnvlist_add_boolean(token_nv, "compressok");
+ }
+ if (zap_contains(dp->dp_meta_objset, ds->ds_object,
+ DS_FIELD_RESUME_RAWOK) == 0) {
+ fnvlist_add_boolean(token_nv, "rawok");
+ }
+ if (dsl_dataset_feature_is_active(ds,
+ SPA_FEATURE_REDACTED_DATASETS)) {
+ uint64_t num_redact_snaps;
+ uint64_t *redact_snaps;
+ VERIFY(dsl_dataset_get_uint64_array_feature(ds,
+ SPA_FEATURE_REDACTED_DATASETS, &num_redact_snaps,
+ &redact_snaps));
+ fnvlist_add_uint64_array(token_nv, "redact_snaps",
+ redact_snaps, num_redact_snaps);
+ }
+ if (zap_contains(dp->dp_meta_objset, ds->ds_object,
+ DS_FIELD_RESUME_REDACT_BOOKMARK_SNAPS) == 0) {
+ uint64_t num_redact_snaps, int_size;
+ uint64_t *redact_snaps;
+ VERIFY0(zap_length(dp->dp_meta_objset, ds->ds_object,
+ DS_FIELD_RESUME_REDACT_BOOKMARK_SNAPS, &int_size,
+ &num_redact_snaps));
+ ASSERT3U(int_size, ==, sizeof (uint64_t));
+
+ redact_snaps = kmem_alloc(int_size * num_redact_snaps,
+ KM_SLEEP);
+ VERIFY0(zap_lookup(dp->dp_meta_objset, ds->ds_object,
+ DS_FIELD_RESUME_REDACT_BOOKMARK_SNAPS, int_size,
+ num_redact_snaps, redact_snaps));
+ fnvlist_add_uint64_array(token_nv, "book_redact_snaps",
+ redact_snaps, num_redact_snaps);
+ kmem_free(redact_snaps, int_size * num_redact_snaps);
+ }
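+ /*
+ * Build the token string: pack the nvlist, gzip-compress it, checksum
+ * the compressed payload, hex-encode it, and format the result as
+ * "<version>-<checksum word>-<packed size>-<hex payload>".
+ */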
+ packed = fnvlist_pack(token_nv, &packed_size);
+ fnvlist_free(token_nv);
+ compressed = kmem_alloc(packed_size, KM_SLEEP);
+
+ compressed_size = gzip_compress(packed, compressed,
+ packed_size, packed_size, 6);
+
+ zio_cksum_t cksum;
+ fletcher_4_native_varsize(compressed, compressed_size, &cksum);
+
+ size_t alloc_size = compressed_size * 2 + 1;
+ str = kmem_alloc(alloc_size, KM_SLEEP);
+ for (int i = 0; i < compressed_size; i++) {
+ size_t offset = i * 2;
+ (void) snprintf(str + offset, alloc_size - offset,
+ "%02x", compressed[i]);
+ }
+ str[compressed_size * 2] = '\0';
+ char *propval = kmem_asprintf("%u-%llx-%llx-%s",
+ ZFS_SEND_RESUME_TOKEN_VERSION,
+ (longlong_t)cksum.zc_word[0],
+ (longlong_t)packed_size, str);
+ kmem_free(packed, packed_size);
+ kmem_free(str, alloc_size);
+ kmem_free(compressed, packed_size);
+ return (propval);
+ }
+ return (kmem_strdup(""));
+}
+
+/*
+ * Returns a string that represents the receive resume stats token of the
+ * dataset's child. It should be freed with kmem_strfree().
+ */
+char *
+get_child_receive_stats(dsl_dataset_t *ds)
+{
+ char recvname[ZFS_MAX_DATASET_NAME_LEN + 6];
+ dsl_dataset_t *recv_ds;
+ dsl_dataset_name(ds, recvname);
+ if (strlcat(recvname, "/", sizeof (recvname)) <
+ sizeof (recvname) &&
+ strlcat(recvname, recv_clone_name, sizeof (recvname)) <
+ sizeof (recvname) &&
+ dsl_dataset_hold(ds->ds_dir->dd_pool, recvname, FTAG,
+ &recv_ds) == 0) {
+ char *propval = get_receive_resume_stats_impl(recv_ds);
+ dsl_dataset_rele(recv_ds, FTAG);
+ return (propval);
+ }
+ return (kmem_strdup(""));
+}
+
+static void
+get_receive_resume_stats(dsl_dataset_t *ds, nvlist_t *nv)
+{
+ char *propval = get_receive_resume_stats_impl(ds);
+ if (strcmp(propval, "") != 0) {
+ dsl_prop_nvlist_add_string(nv,
+ ZFS_PROP_RECEIVE_RESUME_TOKEN, propval);
+ } else {
+ char *childval = get_child_receive_stats(ds);
+ if (strcmp(childval, "") != 0) {
+ dsl_prop_nvlist_add_string(nv,
+ ZFS_PROP_RECEIVE_RESUME_TOKEN, childval);
+ }
+ kmem_strfree(childval);
+ }
+ kmem_strfree(propval);
+}
+
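+/*
+ * refratio is the ratio of uncompressed to compressed referenced bytes,
+ * expressed as a percentage (100 means 1.00x).
+ */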
+uint64_t
+dsl_get_refratio(dsl_dataset_t *ds)
+{
+ uint64_t ratio = dsl_dataset_phys(ds)->ds_compressed_bytes == 0 ? 100 :
+ (dsl_dataset_phys(ds)->ds_uncompressed_bytes * 100 /
+ dsl_dataset_phys(ds)->ds_compressed_bytes);
+ return (ratio);
+}
+
+uint64_t
+dsl_get_logicalreferenced(dsl_dataset_t *ds)
+{
+ return (dsl_dataset_phys(ds)->ds_uncompressed_bytes);
+}
+
+uint64_t
+dsl_get_compressratio(dsl_dataset_t *ds)
+{
+ if (ds->ds_is_snapshot) {
+ return (dsl_get_refratio(ds));
+ } else {
+ dsl_dir_t *dd = ds->ds_dir;
+ mutex_enter(&dd->dd_lock);
+ uint64_t val = dsl_dir_get_compressratio(dd);
+ mutex_exit(&dd->dd_lock);
+ return (val);
+ }
+}
+
+uint64_t
+dsl_get_used(dsl_dataset_t *ds)
+{
+ if (ds->ds_is_snapshot) {
+ return (dsl_dataset_phys(ds)->ds_unique_bytes);
+ } else {
+ dsl_dir_t *dd = ds->ds_dir;
+ mutex_enter(&dd->dd_lock);
+ uint64_t val = dsl_dir_get_used(dd);
+ mutex_exit(&dd->dd_lock);
+ return (val);
+ }
+}
+
+uint64_t
+dsl_get_creation(dsl_dataset_t *ds)
+{
+ return (dsl_dataset_phys(ds)->ds_creation_time);
+}
+
+uint64_t
+dsl_get_creationtxg(dsl_dataset_t *ds)
+{
+ return (dsl_dataset_phys(ds)->ds_creation_txg);
+}
+
+uint64_t
+dsl_get_refquota(dsl_dataset_t *ds)
+{
+ return (ds->ds_quota);
+}
+
+uint64_t
+dsl_get_refreservation(dsl_dataset_t *ds)
+{
+ return (ds->ds_reserved);
+}
+
+uint64_t
+dsl_get_guid(dsl_dataset_t *ds)
+{
+ return (dsl_dataset_phys(ds)->ds_guid);
+}
+
+uint64_t
+dsl_get_unique(dsl_dataset_t *ds)
+{
+ return (dsl_dataset_phys(ds)->ds_unique_bytes);
+}
+
+uint64_t
+dsl_get_objsetid(dsl_dataset_t *ds)
+{
+ return (ds->ds_object);
+}
+
+uint64_t
+dsl_get_userrefs(dsl_dataset_t *ds)
+{
+ return (ds->ds_userrefs);
+}
+
+uint64_t
+dsl_get_defer_destroy(dsl_dataset_t *ds)
+{
+ return (DS_IS_DEFER_DESTROY(ds) ? 1 : 0);
+}
+
+uint64_t
+dsl_get_referenced(dsl_dataset_t *ds)
+{
+ return (dsl_dataset_phys(ds)->ds_referenced_bytes);
+}
+
+uint64_t
+dsl_get_numclones(dsl_dataset_t *ds)
+{
+ ASSERT(ds->ds_is_snapshot);
+ return (dsl_dataset_phys(ds)->ds_num_children - 1);
+}
+
+uint64_t
+dsl_get_inconsistent(dsl_dataset_t *ds)
+{
+ return ((dsl_dataset_phys(ds)->ds_flags & DS_FLAG_INCONSISTENT) ?
+ 1 : 0);
+}
+
+uint64_t
+dsl_get_redacted(dsl_dataset_t *ds)
+{
+ return (dsl_dataset_feature_is_active(ds,
+ SPA_FEATURE_REDACTED_DATASETS));
+}
+
+uint64_t
+dsl_get_available(dsl_dataset_t *ds)
+{
+ uint64_t refdbytes = dsl_get_referenced(ds);
+ uint64_t availbytes = dsl_dir_space_available(ds->ds_dir,
+ NULL, 0, TRUE);
+ if (ds->ds_reserved > dsl_dataset_phys(ds)->ds_unique_bytes) {
+ availbytes +=
+ ds->ds_reserved - dsl_dataset_phys(ds)->ds_unique_bytes;
+ }
+ if (ds->ds_quota != 0) {
+ /*
+ * Adjust available bytes according to refquota
+ */
+ if (refdbytes < ds->ds_quota) {
+ availbytes = MIN(availbytes,
+ ds->ds_quota - refdbytes);
+ } else {
+ availbytes = 0;
+ }
+ }
+ return (availbytes);
+}
+
+int
+dsl_get_written(dsl_dataset_t *ds, uint64_t *written)
+{
+ dsl_pool_t *dp = ds->ds_dir->dd_pool;
+ dsl_dataset_t *prev;
+ int err = dsl_dataset_hold_obj(dp,
+ dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev);
+ if (err == 0) {
+ uint64_t comp, uncomp;
+ err = dsl_dataset_space_written(prev, ds, written,
+ &comp, &uncomp);
+ dsl_dataset_rele(prev, FTAG);
+ }
+ return (err);
+}
+
+/*
+ * 'snap' should be a buffer of size ZFS_MAX_DATASET_NAME_LEN.
+ */
+int
+dsl_get_prev_snap(dsl_dataset_t *ds, char *snap)
+{
+ dsl_pool_t *dp = ds->ds_dir->dd_pool;
+ if (ds->ds_prev != NULL && ds->ds_prev != dp->dp_origin_snap) {
+ dsl_dataset_name(ds->ds_prev, snap);
+ return (0);
+ } else {
+ return (SET_ERROR(ENOENT));
+ }
+}
+
+void
+dsl_get_redact_snaps(dsl_dataset_t *ds, nvlist_t *propval)
+{
+ uint64_t nsnaps;
+ uint64_t *snaps;
+ if (dsl_dataset_get_uint64_array_feature(ds,
+ SPA_FEATURE_REDACTED_DATASETS, &nsnaps, &snaps)) {
+ fnvlist_add_uint64_array(propval, ZPROP_VALUE, snaps,
+ nsnaps);
+ }
+}
+
+/*
+ * Returns the mountpoint property and source for the given dataset in the
+ * value and source buffers. The value buffer must be at least as large as
+ * MAXPATHLEN and the source buffer at least as large as
+ * ZFS_MAX_DATASET_NAME_LEN.
+ * Returns 0 on success and an error on failure.
+ */
+int
+dsl_get_mountpoint(dsl_dataset_t *ds, const char *dsname, char *value,
+ char *source)
+{
+ int error;
+ dsl_pool_t *dp = ds->ds_dir->dd_pool;
+
+ /* Retrieve the mountpoint value stored in the zap object */
+ error = dsl_prop_get_ds(ds, zfs_prop_to_name(ZFS_PROP_MOUNTPOINT), 1,
+ ZAP_MAXVALUELEN, value, source);
+ if (error != 0) {
+ return (error);
+ }
+
+ /*
+ * Process the dsname and source to find the full mountpoint string.
+ * Can be skipped for 'legacy' or 'none'.
+ */
+ if (value[0] == '/') {
+ char *buf = kmem_alloc(ZAP_MAXVALUELEN, KM_SLEEP);
+ char *root = buf;
+ const char *relpath;
+
+ /*
+ * If we inherit the mountpoint, even from a dataset
+ * with a received value, the source will be the path of
+ * the dataset we inherit from. If source is
+ * ZPROP_SOURCE_VAL_RECVD, the received value is not
+ * inherited.
+ */
+ if (strcmp(source, ZPROP_SOURCE_VAL_RECVD) == 0) {
+ relpath = "";
+ } else {
+ ASSERT0(strncmp(dsname, source, strlen(source)));
+ relpath = dsname + strlen(source);
+ if (relpath[0] == '/')
+ relpath++;
+ }
+
+ spa_altroot(dp->dp_spa, root, ZAP_MAXVALUELEN);
+
+ /*
+ * Special case an alternate root of '/'. This will
+ * avoid having multiple leading slashes in the
+ * mountpoint path.
+ */
+ if (strcmp(root, "/") == 0)
+ root++;
+
+ /*
+ * If the mountpoint is '/' then skip over this
+ * if we are obtaining either an alternate root or
+ * an inherited mountpoint.
+ */
+ char *mnt = value;
+ if (value[1] == '\0' && (root[0] != '\0' ||
+ relpath[0] != '\0'))
+ mnt = value + 1;
+
+ if (relpath[0] == '\0') {
+ (void) snprintf(value, ZAP_MAXVALUELEN, "%s%s",
+ root, mnt);
+ } else {
+ (void) snprintf(value, ZAP_MAXVALUELEN, "%s%s%s%s",
+ root, mnt, relpath[0] == '@' ? "" : "/",
+ relpath);
+ }
+ kmem_free(buf, ZAP_MAXVALUELEN);
+ }
+
+ return (0);
+}
+
+void
+dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv)
+{
+ dsl_pool_t *dp = ds->ds_dir->dd_pool;
+
+ ASSERT(dsl_pool_config_held(dp));
+
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRATIO,
+ dsl_get_refratio(ds));
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_LOGICALREFERENCED,
+ dsl_get_logicalreferenced(ds));
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO,
+ dsl_get_compressratio(ds));
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED,
+ dsl_get_used(ds));
+
+ if (ds->ds_is_snapshot) {
+ get_clones_stat(ds, nv);
+ } else {
+ char buf[ZFS_MAX_DATASET_NAME_LEN];
+ if (dsl_get_prev_snap(ds, buf) == 0)
+ dsl_prop_nvlist_add_string(nv, ZFS_PROP_PREV_SNAP,
+ buf);
+ dsl_dir_stats(ds->ds_dir, nv);
+ }
+
+ nvlist_t *propval = fnvlist_alloc();
+ dsl_get_redact_snaps(ds, propval);
+ fnvlist_add_nvlist(nv, zfs_prop_to_name(ZFS_PROP_REDACT_SNAPS),
+ propval);
+ nvlist_free(propval);
+
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_AVAILABLE,
+ dsl_get_available(ds));
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFERENCED,
+ dsl_get_referenced(ds));
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATION,
+ dsl_get_creation(ds));
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATETXG,
+ dsl_get_creationtxg(ds));
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFQUOTA,
+ dsl_get_refquota(ds));
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRESERVATION,
+ dsl_get_refreservation(ds));
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_GUID,
+ dsl_get_guid(ds));
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_UNIQUE,
+ dsl_get_unique(ds));
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_OBJSETID,
+ dsl_get_objsetid(ds));
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERREFS,
+ dsl_get_userrefs(ds));
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_DEFER_DESTROY,
+ dsl_get_defer_destroy(ds));
+ dsl_dataset_crypt_stats(ds, nv);
+
+ if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
+ uint64_t written;
+ if (dsl_get_written(ds, &written) == 0) {
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_WRITTEN,
+ written);
+ }
+ }
+
+ if (!dsl_dataset_is_snapshot(ds)) {
+ /*
+ * A failed "newfs" (e.g. full) resumable receive leaves
+ * the stats set on this dataset. Check here for the prop.
+ */
+ get_receive_resume_stats(ds, nv);
+
+ /*
+ * A failed incremental resumable receive leaves the
+ * stats set on our child named "%recv". Check the child
+ * for the prop.
+ */
+ /* 6 extra bytes for /%recv */
+ char recvname[ZFS_MAX_DATASET_NAME_LEN + 6];
+ dsl_dataset_t *recv_ds;
+ dsl_dataset_name(ds, recvname);
+ if (strlcat(recvname, "/", sizeof (recvname)) <
+ sizeof (recvname) &&
+ strlcat(recvname, recv_clone_name, sizeof (recvname)) <
+ sizeof (recvname) &&
+ dsl_dataset_hold(dp, recvname, FTAG, &recv_ds) == 0) {
+ get_receive_resume_stats(recv_ds, nv);
+ dsl_dataset_rele(recv_ds, FTAG);
+ }
+ }
+}
+
+void
+dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat)
+{
+ dsl_pool_t *dp __maybe_unused = ds->ds_dir->dd_pool;
+ ASSERT(dsl_pool_config_held(dp));
+
+ stat->dds_creation_txg = dsl_get_creationtxg(ds);
+ stat->dds_inconsistent = dsl_get_inconsistent(ds);
+ stat->dds_guid = dsl_get_guid(ds);
+ stat->dds_redacted = dsl_get_redacted(ds);
+ stat->dds_origin[0] = '\0';
+ if (ds->ds_is_snapshot) {
+ stat->dds_is_snapshot = B_TRUE;
+ stat->dds_num_clones = dsl_get_numclones(ds);
+ } else {
+ stat->dds_is_snapshot = B_FALSE;
+ stat->dds_num_clones = 0;
+
+ if (dsl_dir_is_clone(ds->ds_dir)) {
+ dsl_dir_get_origin(ds->ds_dir, stat->dds_origin);
+ }
+ }
+}
+
+uint64_t
+dsl_dataset_fsid_guid(dsl_dataset_t *ds)
+{
+ return (ds->ds_fsid_guid);
+}
+
+void
+dsl_dataset_space(dsl_dataset_t *ds,
+ uint64_t *refdbytesp, uint64_t *availbytesp,
+ uint64_t *usedobjsp, uint64_t *availobjsp)
+{
+ *refdbytesp = dsl_dataset_phys(ds)->ds_referenced_bytes;
+ *availbytesp = dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE);
+ if (ds->ds_reserved > dsl_dataset_phys(ds)->ds_unique_bytes)
+ *availbytesp +=
+ ds->ds_reserved - dsl_dataset_phys(ds)->ds_unique_bytes;
+ if (ds->ds_quota != 0) {
+ /*
+ * Adjust available bytes according to refquota
+ */
+ if (*refdbytesp < ds->ds_quota)
+ *availbytesp = MIN(*availbytesp,
+ ds->ds_quota - *refdbytesp);
+ else
+ *availbytesp = 0;
+ }
+ rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
+ *usedobjsp = BP_GET_FILL(&dsl_dataset_phys(ds)->ds_bp);
+ rrw_exit(&ds->ds_bp_rwlock, FTAG);
+ *availobjsp = DN_MAX_OBJECT - *usedobjsp;
+}
+
+boolean_t
+dsl_dataset_modified_since_snap(dsl_dataset_t *ds, dsl_dataset_t *snap)
+{
+ dsl_pool_t *dp __maybe_unused = ds->ds_dir->dd_pool;
+ uint64_t birth;
+
+ ASSERT(dsl_pool_config_held(dp));
+ if (snap == NULL)
+ return (B_FALSE);
+ rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
+ birth = dsl_dataset_get_blkptr(ds)->blk_birth;
+ rrw_exit(&ds->ds_bp_rwlock, FTAG);
+ if (birth > dsl_dataset_phys(snap)->ds_creation_txg) {
+ objset_t *os, *os_snap;
+ /*
+ * It may be that only the ZIL differs, because it was
+ * reset in the head. Don't count that as being
+ * modified.
+ */
+ if (dmu_objset_from_ds(ds, &os) != 0)
+ return (B_TRUE);
+ if (dmu_objset_from_ds(snap, &os_snap) != 0)
+ return (B_TRUE);
+ return (bcmp(&os->os_phys->os_meta_dnode,
+ &os_snap->os_phys->os_meta_dnode,
+ sizeof (os->os_phys->os_meta_dnode)) != 0);
+ }
+ return (B_FALSE);
+}
+
+typedef struct dsl_dataset_rename_snapshot_arg {
+ const char *ddrsa_fsname;
+ const char *ddrsa_oldsnapname;
+ const char *ddrsa_newsnapname;
+ boolean_t ddrsa_recursive;
+ dmu_tx_t *ddrsa_tx;
+} dsl_dataset_rename_snapshot_arg_t;
+
+/* ARGSUSED */
+static int
+dsl_dataset_rename_snapshot_check_impl(dsl_pool_t *dp,
+ dsl_dataset_t *hds, void *arg)
+{
+ dsl_dataset_rename_snapshot_arg_t *ddrsa = arg;
+ int error;
+ uint64_t val;
+
+ error = dsl_dataset_snap_lookup(hds, ddrsa->ddrsa_oldsnapname, &val);
+ if (error != 0) {
+ /* ignore nonexistent snapshots */
+ return (error == ENOENT ? 0 : error);
+ }
+
+ /* new name should not exist */
+ error = dsl_dataset_snap_lookup(hds, ddrsa->ddrsa_newsnapname, &val);
+ if (error == 0)
+ error = SET_ERROR(EEXIST);
+ else if (error == ENOENT)
+ error = 0;
+
+ /* dataset name + 1 for the "@" + the new snapshot name must fit */
+ if (dsl_dir_namelen(hds->ds_dir) + 1 +
+ strlen(ddrsa->ddrsa_newsnapname) >= ZFS_MAX_DATASET_NAME_LEN)
+ error = SET_ERROR(ENAMETOOLONG);
+
+ return (error);
+}
+
+static int
+dsl_dataset_rename_snapshot_check(void *arg, dmu_tx_t *tx)
+{
+ dsl_dataset_rename_snapshot_arg_t *ddrsa = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ dsl_dataset_t *hds;
+ int error;
+
+ error = dsl_dataset_hold(dp, ddrsa->ddrsa_fsname, FTAG, &hds);
+ if (error != 0)
+ return (error);
+
+ if (ddrsa->ddrsa_recursive) {
+ error = dmu_objset_find_dp(dp, hds->ds_dir->dd_object,
+ dsl_dataset_rename_snapshot_check_impl, ddrsa,
+ DS_FIND_CHILDREN);
+ } else {
+ error = dsl_dataset_rename_snapshot_check_impl(dp, hds, ddrsa);
+ }
+ dsl_dataset_rele(hds, FTAG);
+ return (error);
+}
+
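+/*
+ * Rename a single snapshot: remove the old name from the head's snapnames
+ * ZAP, update the in-core ds_snapname, insert the new name, and rename any
+ * corresponding zvol minors.
+ */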
+static int
+dsl_dataset_rename_snapshot_sync_impl(dsl_pool_t *dp,
+ dsl_dataset_t *hds, void *arg)
+{
+ dsl_dataset_rename_snapshot_arg_t *ddrsa = arg;
+ dsl_dataset_t *ds;
+ uint64_t val;
+ dmu_tx_t *tx = ddrsa->ddrsa_tx;
+ int error;
+
+ error = dsl_dataset_snap_lookup(hds, ddrsa->ddrsa_oldsnapname, &val);
+ ASSERT(error == 0 || error == ENOENT);
+ if (error == ENOENT) {
+ /* ignore nonexistent snapshots */
+ return (0);
+ }
+
+ VERIFY0(dsl_dataset_hold_obj(dp, val, FTAG, &ds));
+
+ /* log before we change the name */
+ spa_history_log_internal_ds(ds, "rename", tx,
+ "-> @%s", ddrsa->ddrsa_newsnapname);
+
+ VERIFY0(dsl_dataset_snap_remove(hds, ddrsa->ddrsa_oldsnapname, tx,
+ B_FALSE));
+ mutex_enter(&ds->ds_lock);
+ (void) strlcpy(ds->ds_snapname, ddrsa->ddrsa_newsnapname,
+ sizeof (ds->ds_snapname));
+ mutex_exit(&ds->ds_lock);
+ VERIFY0(zap_add(dp->dp_meta_objset,
+ dsl_dataset_phys(hds)->ds_snapnames_zapobj,
+ ds->ds_snapname, 8, 1, &ds->ds_object, tx));
+ zvol_rename_minors(dp->dp_spa, ddrsa->ddrsa_oldsnapname,
+ ddrsa->ddrsa_newsnapname, B_TRUE);
+
+ dsl_dataset_rele(ds, FTAG);
+ return (0);
+}
+
+static void
+dsl_dataset_rename_snapshot_sync(void *arg, dmu_tx_t *tx)
+{
+ dsl_dataset_rename_snapshot_arg_t *ddrsa = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ dsl_dataset_t *hds = NULL;
+
+ VERIFY0(dsl_dataset_hold(dp, ddrsa->ddrsa_fsname, FTAG, &hds));
+ ddrsa->ddrsa_tx = tx;
+ if (ddrsa->ddrsa_recursive) {
+ VERIFY0(dmu_objset_find_dp(dp, hds->ds_dir->dd_object,
+ dsl_dataset_rename_snapshot_sync_impl, ddrsa,
+ DS_FIND_CHILDREN));
+ } else {
+ VERIFY0(dsl_dataset_rename_snapshot_sync_impl(dp, hds, ddrsa));
+ }
+ dsl_dataset_rele(hds, FTAG);
+}
+
+int
+dsl_dataset_rename_snapshot(const char *fsname,
+ const char *oldsnapname, const char *newsnapname, boolean_t recursive)
+{
+ dsl_dataset_rename_snapshot_arg_t ddrsa;
+
+ ddrsa.ddrsa_fsname = fsname;
+ ddrsa.ddrsa_oldsnapname = oldsnapname;
+ ddrsa.ddrsa_newsnapname = newsnapname;
+ ddrsa.ddrsa_recursive = recursive;
+
+ return (dsl_sync_task(fsname, dsl_dataset_rename_snapshot_check,
+ dsl_dataset_rename_snapshot_sync, &ddrsa,
+ 1, ZFS_SPACE_CHECK_RESERVED));
+}
+
+/*
+ * If we're doing an ownership handoff, we need to make sure that there is
+ * only one long hold on the dataset. We're not allowed to change anything here
+ * so we don't permanently release the long hold or regular hold here. We want
+ * to do this only when syncing to avoid the dataset unexpectedly going away
+ * when we release the long hold.
+ */
+static int
+dsl_dataset_handoff_check(dsl_dataset_t *ds, void *owner, dmu_tx_t *tx)
+{
+ boolean_t held = B_FALSE;
+
+ if (!dmu_tx_is_syncing(tx))
+ return (0);
+
+ dsl_dir_t *dd = ds->ds_dir;
+ mutex_enter(&dd->dd_activity_lock);
+ uint64_t holds = zfs_refcount_count(&ds->ds_longholds) -
+ (owner != NULL ? 1 : 0);
+ /*
+ * The value of dd_activity_waiters can change as soon as we drop the
+ * lock, but we're fine with that; new waiters coming in or old
+ * waiters leaving doesn't cause problems, since we're going to cancel
+ * waiters later anyway. The goal of this check is to verify that no
+ * non-waiters have long-holds, and all new long-holds will be
+ * prevented because we're holding the pool config as writer.
+ */
+ if (holds != dd->dd_activity_waiters)
+ held = B_TRUE;
+ mutex_exit(&dd->dd_activity_lock);
+
+ if (held)
+ return (SET_ERROR(EBUSY));
+
+ return (0);
+}
+
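+/*
+ * Validate a rollback request: the target must be a filesystem or volume
+ * with at least one snapshot, the optional ddra_tosnap must name its latest
+ * snapshot, no bookmarks may be newer than that snapshot, and there must be
+ * no conflicting long holds and enough space for the temporary clone swap.
+ */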
+int
+dsl_dataset_rollback_check(void *arg, dmu_tx_t *tx)
+{
+ dsl_dataset_rollback_arg_t *ddra = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ dsl_dataset_t *ds;
+ int64_t unused_refres_delta;
+ int error;
+
+ error = dsl_dataset_hold(dp, ddra->ddra_fsname, FTAG, &ds);
+ if (error != 0)
+ return (error);
+
+ /* must not be a snapshot */
+ if (ds->ds_is_snapshot) {
+ dsl_dataset_rele(ds, FTAG);
+ return (SET_ERROR(EINVAL));
+ }
+
+ /* must have a most recent snapshot */
+ if (dsl_dataset_phys(ds)->ds_prev_snap_txg < TXG_INITIAL) {
+ dsl_dataset_rele(ds, FTAG);
+ return (SET_ERROR(ESRCH));
+ }
+
+ /*
+ * No rollback to a snapshot created in the current txg, because
+ * the rollback may dirty the dataset and create blocks that are
+ * not reachable from the rootbp while having a birth txg that
+ * falls into the snapshot's range.
+ */
+ if (dmu_tx_is_syncing(tx) &&
+ dsl_dataset_phys(ds)->ds_prev_snap_txg >= tx->tx_txg) {
+ dsl_dataset_rele(ds, FTAG);
+ return (SET_ERROR(EAGAIN));
+ }
+
+ /*
+ * If the expected target snapshot is specified, then check that
+ * the latest snapshot is it.
+ */
+ if (ddra->ddra_tosnap != NULL) {
+ dsl_dataset_t *snapds;
+
+ /* Check if the target snapshot exists at all. */
+ error = dsl_dataset_hold(dp, ddra->ddra_tosnap, FTAG, &snapds);
+ if (error != 0) {
+ /*
+ * ESRCH is used to signal that the target snapshot does
+ * not exist, while ENOENT is used to report that
+ * the rolled back dataset does not exist.
+ * ESRCH is also used to cover other cases where the
+ * target snapshot is not related to the dataset being
+ * rolled back such as being in a different pool.
+ */
+ if (error == ENOENT || error == EXDEV)
+ error = SET_ERROR(ESRCH);
+ dsl_dataset_rele(ds, FTAG);
+ return (error);
+ }
+ ASSERT(snapds->ds_is_snapshot);
+
+ /* Check if the snapshot is the latest snapshot indeed. */
+ if (snapds != ds->ds_prev) {
+ /*
+ * Distinguish between the case where the only problem
+ * is intervening snapshots (EEXIST) vs the snapshot
+ * not being a valid target for rollback (ESRCH).
+ */
+ if (snapds->ds_dir == ds->ds_dir ||
+ (dsl_dir_is_clone(ds->ds_dir) &&
+ dsl_dir_phys(ds->ds_dir)->dd_origin_obj ==
+ snapds->ds_object)) {
+ error = SET_ERROR(EEXIST);
+ } else {
+ error = SET_ERROR(ESRCH);
+ }
+ dsl_dataset_rele(snapds, FTAG);
+ dsl_dataset_rele(ds, FTAG);
+ return (error);
+ }
+ dsl_dataset_rele(snapds, FTAG);
+ }
+
+ /* must not have any bookmarks after the most recent snapshot */
+ if (dsl_bookmark_latest_txg(ds) >
+ dsl_dataset_phys(ds)->ds_prev_snap_txg) {
+ dsl_dataset_rele(ds, FTAG);
+ return (SET_ERROR(EEXIST));
+ }
+
+ error = dsl_dataset_handoff_check(ds, ddra->ddra_owner, tx);
+ if (error != 0) {
+ dsl_dataset_rele(ds, FTAG);
+ return (error);
+ }
+
+ /*
+ * Check if the snap we are rolling back to uses more than
+ * the refquota.
+ */
+ if (ds->ds_quota != 0 &&
+ dsl_dataset_phys(ds->ds_prev)->ds_referenced_bytes > ds->ds_quota) {
+ dsl_dataset_rele(ds, FTAG);
+ return (SET_ERROR(EDQUOT));
+ }
+
+ /*
+ * When we do the clone swap, we will temporarily use more space
+ * due to the refreservation (the head will no longer have any
+ * unique space, so the entire amount of the refreservation will need
+ * to be free). We will immediately destroy the clone, freeing
+ * this space, but the freeing happens over many txg's.
+ */
+ unused_refres_delta = (int64_t)MIN(ds->ds_reserved,
+ dsl_dataset_phys(ds)->ds_unique_bytes);
+
+ if (unused_refres_delta > 0 &&
+ unused_refres_delta >
+ dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE)) {
+ dsl_dataset_rele(ds, FTAG);
+ return (SET_ERROR(ENOSPC));
+ }
+
+ dsl_dataset_rele(ds, FTAG);
+ return (0);
+}
+
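+/*
+ * Perform the rollback: clone the latest snapshot as "%rollback", swap the
+ * clone's contents with the head dataset, zero the head's ZIL, and destroy
+ * the temporary clone.
+ */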
+void
+dsl_dataset_rollback_sync(void *arg, dmu_tx_t *tx)
+{
+ dsl_dataset_rollback_arg_t *ddra = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ dsl_dataset_t *ds, *clone;
+ uint64_t cloneobj;
+ char namebuf[ZFS_MAX_DATASET_NAME_LEN];
+
+ VERIFY0(dsl_dataset_hold(dp, ddra->ddra_fsname, FTAG, &ds));
+
+ dsl_dataset_name(ds->ds_prev, namebuf);
+ fnvlist_add_string(ddra->ddra_result, "target", namebuf);
+
+ cloneobj = dsl_dataset_create_sync(ds->ds_dir, "%rollback",
+ ds->ds_prev, DS_CREATE_FLAG_NODIRTY, kcred, NULL, tx);
+
+ VERIFY0(dsl_dataset_hold_obj(dp, cloneobj, FTAG, &clone));
+
+ dsl_dataset_clone_swap_sync_impl(clone, ds, tx);
+ dsl_dataset_zero_zil(ds, tx);
+
+ dsl_destroy_head_sync_impl(clone, tx);
+
+ dsl_dataset_rele(clone, FTAG);
+ dsl_dataset_rele(ds, FTAG);
+}
+
+/*
+ * Rolls back the given filesystem or volume to the most recent snapshot.
+ * The name of the most recent snapshot will be returned under key "target"
+ * in the result nvlist.
+ *
+ * If owner != NULL:
+ * - The existing dataset MUST be owned by the specified owner at entry
+ * - Upon return, dataset will still be held by the same owner, whether we
+ * succeed or not.
+ *
+ * This mode is required any time the existing filesystem is mounted. See
+ * notes above zfs_suspend_fs() for further details.
+ */
+int
+dsl_dataset_rollback(const char *fsname, const char *tosnap, void *owner,
+ nvlist_t *result)
+{
+ dsl_dataset_rollback_arg_t ddra;
+
+ ddra.ddra_fsname = fsname;
+ ddra.ddra_tosnap = tosnap;
+ ddra.ddra_owner = owner;
+ ddra.ddra_result = result;
+
+ return (dsl_sync_task(fsname, dsl_dataset_rollback_check,
+ dsl_dataset_rollback_sync, &ddra,
+ 1, ZFS_SPACE_CHECK_RESERVED));
+}
+
+struct promotenode {
+ list_node_t link;
+ dsl_dataset_t *ds;
+};
+
+static int snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep);
+static int promote_hold(dsl_dataset_promote_arg_t *ddpa, dsl_pool_t *dp,
+ void *tag);
+static void promote_rele(dsl_dataset_promote_arg_t *ddpa, void *tag);
+
+int
+dsl_dataset_promote_check(void *arg, dmu_tx_t *tx)
+{
+ dsl_dataset_promote_arg_t *ddpa = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ dsl_dataset_t *hds;
+ struct promotenode *snap;
+ dsl_dataset_t *origin_ds, *origin_head;
+ int err;
+ uint64_t unused;
+ uint64_t ss_mv_cnt;
+ size_t max_snap_len;
+ boolean_t conflicting_snaps;
+
+ err = promote_hold(ddpa, dp, FTAG);
+ if (err != 0)
+ return (err);
+
+ hds = ddpa->ddpa_clone;
+ max_snap_len = MAXNAMELEN - strlen(ddpa->ddpa_clonename) - 1;
+
+ if (dsl_dataset_phys(hds)->ds_flags & DS_FLAG_NOPROMOTE) {
+ promote_rele(ddpa, FTAG);
+ return (SET_ERROR(EXDEV));
+ }
+
+ snap = list_head(&ddpa->shared_snaps);
+ if (snap == NULL) {
+ err = SET_ERROR(ENOENT);
+ goto out;
+ }
+ origin_head = snap->ds;
+ origin_ds = snap->ds;
+
+ /*
+ * Encrypted clones share a DSL Crypto Key with their origin's dsl dir.
+ * When doing a promote we must make sure the encryption root for
+ * both the target and the target's origin does not change to avoid
+ * needing to rewrap encryption keys
+ */
+ err = dsl_dataset_promote_crypt_check(hds->ds_dir, origin_ds->ds_dir);
+ if (err != 0)
+ goto out;
+
+ /*
+ * Compute and check the amount of space to transfer. Since this is
+ * so expensive, don't do the preliminary check.
+ */
+ if (!dmu_tx_is_syncing(tx)) {
+ promote_rele(ddpa, FTAG);
+ return (0);
+ }
+
+ /* compute origin's new unique space */
+ snap = list_tail(&ddpa->clone_snaps);
+ ASSERT(snap != NULL);
+ ASSERT3U(dsl_dataset_phys(snap->ds)->ds_prev_snap_obj, ==,
+ origin_ds->ds_object);
+ dsl_deadlist_space_range(&snap->ds->ds_deadlist,
+ dsl_dataset_phys(origin_ds)->ds_prev_snap_txg, UINT64_MAX,
+ &ddpa->unique, &unused, &unused);
+
+ /*
+ * Walk the snapshots that we are moving
+ *
+ * Compute space to transfer. Consider the incremental changes
+ * to used by each snapshot:
+ * (my used) = (prev's used) + (blocks born) - (blocks killed)
+ * So each snapshot gave birth to:
+ * (blocks born) = (my used) - (prev's used) + (blocks killed)
+ * So a sequence would look like:
+ * (uN - u(N-1) + kN) + ... + (u1 - u0 + k1) + (u0 - 0 + k0)
+ * Which simplifies to:
+ * uN + kN + kN-1 + ... + k1 + k0
+ * Note however, if we stop before we reach the ORIGIN we get:
+ * uN + kN + kN-1 + ... + kM - uM-1
+ */
+ conflicting_snaps = B_FALSE;
+ ss_mv_cnt = 0;
+ ddpa->used = dsl_dataset_phys(origin_ds)->ds_referenced_bytes;
+ ddpa->comp = dsl_dataset_phys(origin_ds)->ds_compressed_bytes;
+ ddpa->uncomp = dsl_dataset_phys(origin_ds)->ds_uncompressed_bytes;
+ for (snap = list_head(&ddpa->shared_snaps); snap;
+ snap = list_next(&ddpa->shared_snaps, snap)) {
+ uint64_t val, dlused, dlcomp, dluncomp;
+ dsl_dataset_t *ds = snap->ds;
+
+ ss_mv_cnt++;
+
+ /*
+ * If there are long holds, we won't be able to evict
+ * the objset.
+ */
+ if (dsl_dataset_long_held(ds)) {
+ err = SET_ERROR(EBUSY);
+ goto out;
+ }
+
+ /* Check that the snapshot name does not conflict */
+ VERIFY0(dsl_dataset_get_snapname(ds));
+ if (strlen(ds->ds_snapname) >= max_snap_len) {
+ err = SET_ERROR(ENAMETOOLONG);
+ goto out;
+ }
+ err = dsl_dataset_snap_lookup(hds, ds->ds_snapname, &val);
+ if (err == 0) {
+ fnvlist_add_boolean(ddpa->err_ds,
+ snap->ds->ds_snapname);
+ conflicting_snaps = B_TRUE;
+ } else if (err != ENOENT) {
+ goto out;
+ }
+
+ /* The very first snapshot does not have a deadlist */
+ if (dsl_dataset_phys(ds)->ds_prev_snap_obj == 0)
+ continue;
+
+ dsl_deadlist_space(&ds->ds_deadlist,
+ &dlused, &dlcomp, &dluncomp);
+ ddpa->used += dlused;
+ ddpa->comp += dlcomp;
+ ddpa->uncomp += dluncomp;
+ }
+
+ /*
+ * Check that bookmarks that are being transferred don't have
+ * name conflicts.
+ */
+ for (dsl_bookmark_node_t *dbn = avl_first(&origin_head->ds_bookmarks);
+ dbn != NULL && dbn->dbn_phys.zbm_creation_txg <=
+ dsl_dataset_phys(origin_ds)->ds_creation_txg;
+ dbn = AVL_NEXT(&origin_head->ds_bookmarks, dbn)) {
+ if (strlen(dbn->dbn_name) >= max_snap_len) {
+ err = SET_ERROR(ENAMETOOLONG);
+ goto out;
+ }
+ zfs_bookmark_phys_t bm;
+ err = dsl_bookmark_lookup_impl(ddpa->ddpa_clone,
+ dbn->dbn_name, &bm);
+
+ if (err == 0) {
+ fnvlist_add_boolean(ddpa->err_ds, dbn->dbn_name);
+ conflicting_snaps = B_TRUE;
+ } else if (err == ESRCH) {
+ err = 0;
+ } else if (err != 0) {
+ goto out;
+ }
+ }
+
+ /*
+ * In order to return the full list of conflicting snapshots, we check
+ * whether there was a conflict after traversing all of them.
+ */
+ if (conflicting_snaps) {
+ err = SET_ERROR(EEXIST);
+ goto out;
+ }
+
+ /*
+ * If we are a clone of a clone then we never reached ORIGIN,
+ * so we need to subtract out the clone origin's used space.
+ */
+ if (ddpa->origin_origin) {
+ ddpa->used -=
+ dsl_dataset_phys(ddpa->origin_origin)->ds_referenced_bytes;
+ ddpa->comp -=
+ dsl_dataset_phys(ddpa->origin_origin)->ds_compressed_bytes;
+ ddpa->uncomp -=
+ dsl_dataset_phys(ddpa->origin_origin)->
+ ds_uncompressed_bytes;
+ }
+
+ /* Check that there is enough space and limit headroom here */
+ err = dsl_dir_transfer_possible(origin_ds->ds_dir, hds->ds_dir,
+ 0, ss_mv_cnt, ddpa->used, ddpa->cr, ddpa->proc);
+ if (err != 0)
+ goto out;
+
+ /*
+ * Compute the amounts of space that will be used by snapshots
+ * after the promotion (for both origin and clone). For each,
+ * it is the amount of space that will be on all of their
+ * deadlists (that was not born before their new origin).
+ */
+ if (dsl_dir_phys(hds->ds_dir)->dd_flags & DD_FLAG_USED_BREAKDOWN) {
+ uint64_t space;
+
+ /*
+ * Note, typically this will not be a clone of a clone,
+ * so dd_origin_txg will be < TXG_INITIAL, so
+ * these snaplist_space() -> dsl_deadlist_space_range()
+ * calls will be fast because they do not have to
+ * iterate over all bps.
+ */
+ snap = list_head(&ddpa->origin_snaps);
+ if (snap == NULL) {
+ err = SET_ERROR(ENOENT);
+ goto out;
+ }
+ err = snaplist_space(&ddpa->shared_snaps,
+ snap->ds->ds_dir->dd_origin_txg, &ddpa->cloneusedsnap);
+ if (err != 0)
+ goto out;
+
+ err = snaplist_space(&ddpa->clone_snaps,
+ snap->ds->ds_dir->dd_origin_txg, &space);
+ if (err != 0)
+ goto out;
+ ddpa->cloneusedsnap += space;
+ }
+ if (dsl_dir_phys(origin_ds->ds_dir)->dd_flags &
+ DD_FLAG_USED_BREAKDOWN) {
+ err = snaplist_space(&ddpa->origin_snaps,
+ dsl_dataset_phys(origin_ds)->ds_creation_txg,
+ &ddpa->originusedsnap);
+ if (err != 0)
+ goto out;
+ }
+
+out:
+ promote_rele(ddpa, FTAG);
+ return (err);
+}
+
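+/*
+ * Sync task for promotion: re-parent the shared snapshots and bookmarks from
+ * the origin's dsl_dir to the clone's, swap the origin linkage between the
+ * two dsl_dirs, and adjust the space accounting on both sides.
+ */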
+void
+dsl_dataset_promote_sync(void *arg, dmu_tx_t *tx)
+{
+ dsl_dataset_promote_arg_t *ddpa = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ dsl_dataset_t *hds;
+ struct promotenode *snap;
+ dsl_dataset_t *origin_ds;
+ dsl_dataset_t *origin_head;
+ dsl_dir_t *dd;
+ dsl_dir_t *odd = NULL;
+ uint64_t oldnext_obj;
+ int64_t delta;
+
+ ASSERT(nvlist_empty(ddpa->err_ds));
+
+ VERIFY0(promote_hold(ddpa, dp, FTAG));
+ hds = ddpa->ddpa_clone;
+
+ ASSERT0(dsl_dataset_phys(hds)->ds_flags & DS_FLAG_NOPROMOTE);
+
+ snap = list_head(&ddpa->shared_snaps);
+ origin_ds = snap->ds;
+ dd = hds->ds_dir;
+
+ snap = list_head(&ddpa->origin_snaps);
+ origin_head = snap->ds;
+
+ /*
+ * We need to explicitly open odd, since origin_ds's dd will be
+ * changing.
+ */
+ VERIFY0(dsl_dir_hold_obj(dp, origin_ds->ds_dir->dd_object,
+ NULL, FTAG, &odd));
+
+ dsl_dataset_promote_crypt_sync(hds->ds_dir, odd, tx);
+
+ /* change origin's next snap */
+ dmu_buf_will_dirty(origin_ds->ds_dbuf, tx);
+ oldnext_obj = dsl_dataset_phys(origin_ds)->ds_next_snap_obj;
+ snap = list_tail(&ddpa->clone_snaps);
+ ASSERT3U(dsl_dataset_phys(snap->ds)->ds_prev_snap_obj, ==,
+ origin_ds->ds_object);
+ dsl_dataset_phys(origin_ds)->ds_next_snap_obj = snap->ds->ds_object;
+
+ /* change the origin's next clone */
+ if (dsl_dataset_phys(origin_ds)->ds_next_clones_obj) {
+ dsl_dataset_remove_from_next_clones(origin_ds,
+ snap->ds->ds_object, tx);
+ VERIFY0(zap_add_int(dp->dp_meta_objset,
+ dsl_dataset_phys(origin_ds)->ds_next_clones_obj,
+ oldnext_obj, tx));
+ }
+
+ /* change origin */
+ dmu_buf_will_dirty(dd->dd_dbuf, tx);
+ ASSERT3U(dsl_dir_phys(dd)->dd_origin_obj, ==, origin_ds->ds_object);
+ dsl_dir_phys(dd)->dd_origin_obj = dsl_dir_phys(odd)->dd_origin_obj;
+ dd->dd_origin_txg = origin_head->ds_dir->dd_origin_txg;
+ dmu_buf_will_dirty(odd->dd_dbuf, tx);
+ dsl_dir_phys(odd)->dd_origin_obj = origin_ds->ds_object;
+ origin_head->ds_dir->dd_origin_txg =
+ dsl_dataset_phys(origin_ds)->ds_creation_txg;
+
+ /* change dd_clone entries */
+ if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
+ VERIFY0(zap_remove_int(dp->dp_meta_objset,
+ dsl_dir_phys(odd)->dd_clones, hds->ds_object, tx));
+ VERIFY0(zap_add_int(dp->dp_meta_objset,
+ dsl_dir_phys(ddpa->origin_origin->ds_dir)->dd_clones,
+ hds->ds_object, tx));
+
+ VERIFY0(zap_remove_int(dp->dp_meta_objset,
+ dsl_dir_phys(ddpa->origin_origin->ds_dir)->dd_clones,
+ origin_head->ds_object, tx));
+ if (dsl_dir_phys(dd)->dd_clones == 0) {
+ dsl_dir_phys(dd)->dd_clones =
+ zap_create(dp->dp_meta_objset, DMU_OT_DSL_CLONES,
+ DMU_OT_NONE, 0, tx);
+ }
+ VERIFY0(zap_add_int(dp->dp_meta_objset,
+ dsl_dir_phys(dd)->dd_clones, origin_head->ds_object, tx));
+ }
+
+ /*
+ * Move bookmarks to this dir.
+ */
+ dsl_bookmark_node_t *dbn_next;
+ for (dsl_bookmark_node_t *dbn = avl_first(&origin_head->ds_bookmarks);
+ dbn != NULL && dbn->dbn_phys.zbm_creation_txg <=
+ dsl_dataset_phys(origin_ds)->ds_creation_txg;
+ dbn = dbn_next) {
+ dbn_next = AVL_NEXT(&origin_head->ds_bookmarks, dbn);
+
+ avl_remove(&origin_head->ds_bookmarks, dbn);
+ VERIFY0(zap_remove(dp->dp_meta_objset,
+ origin_head->ds_bookmarks_obj, dbn->dbn_name, tx));
+
+ dsl_bookmark_node_add(hds, dbn, tx);
+ }
+
+ dsl_bookmark_next_changed(hds, origin_ds, tx);
+
+ /* move snapshots to this dir */
+ for (snap = list_head(&ddpa->shared_snaps); snap;
+ snap = list_next(&ddpa->shared_snaps, snap)) {
+ dsl_dataset_t *ds = snap->ds;
+
+ /*
+ * Property callbacks are registered to a particular
+ * dsl_dir. Since ours is changing, evict the objset
+ * so that they will be unregistered from the old dsl_dir.
+ */
+ if (ds->ds_objset) {
+ dmu_objset_evict(ds->ds_objset);
+ ds->ds_objset = NULL;
+ }
+
+ /* move snap name entry */
+ VERIFY0(dsl_dataset_get_snapname(ds));
+ VERIFY0(dsl_dataset_snap_remove(origin_head,
+ ds->ds_snapname, tx, B_TRUE));
+ VERIFY0(zap_add(dp->dp_meta_objset,
+ dsl_dataset_phys(hds)->ds_snapnames_zapobj, ds->ds_snapname,
+ 8, 1, &ds->ds_object, tx));
+ dsl_fs_ss_count_adjust(hds->ds_dir, 1,
+ DD_FIELD_SNAPSHOT_COUNT, tx);
+
+ /* change containing dsl_dir */
+ dmu_buf_will_dirty(ds->ds_dbuf, tx);
+ ASSERT3U(dsl_dataset_phys(ds)->ds_dir_obj, ==, odd->dd_object);
+ dsl_dataset_phys(ds)->ds_dir_obj = dd->dd_object;
+ ASSERT3P(ds->ds_dir, ==, odd);
+ dsl_dir_rele(ds->ds_dir, ds);
+ VERIFY0(dsl_dir_hold_obj(dp, dd->dd_object,
+ NULL, ds, &ds->ds_dir));
+
+ /* move any clone references */
+ if (dsl_dataset_phys(ds)->ds_next_clones_obj &&
+ spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
+ zap_cursor_t zc;
+ zap_attribute_t za;
+
+ for (zap_cursor_init(&zc, dp->dp_meta_objset,
+ dsl_dataset_phys(ds)->ds_next_clones_obj);
+ zap_cursor_retrieve(&zc, &za) == 0;
+ zap_cursor_advance(&zc)) {
+ dsl_dataset_t *cnds;
+ uint64_t o;
+
+ if (za.za_first_integer == oldnext_obj) {
+ /*
+ * We've already moved the
+ * origin's reference.
+ */
+ continue;
+ }
+
+ VERIFY0(dsl_dataset_hold_obj(dp,
+ za.za_first_integer, FTAG, &cnds));
+ o = dsl_dir_phys(cnds->ds_dir)->
+ dd_head_dataset_obj;
+
+ VERIFY0(zap_remove_int(dp->dp_meta_objset,
+ dsl_dir_phys(odd)->dd_clones, o, tx));
+ VERIFY0(zap_add_int(dp->dp_meta_objset,
+ dsl_dir_phys(dd)->dd_clones, o, tx));
+ dsl_dataset_rele(cnds, FTAG);
+ }
+ zap_cursor_fini(&zc);
+ }
+
+ ASSERT(!dsl_prop_hascb(ds));
+ }
+
+ /*
+ * Change space accounting.
+ * Note, ddpa->*usedsnap and dd_used_breakdown[SNAP] will either
+ * both be valid, or both be 0 (resulting in delta == 0). This
+ * is true for each of {clone,origin} independently.
+ */
+
+ delta = ddpa->cloneusedsnap -
+ dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_SNAP];
+ ASSERT3S(delta, >=, 0);
+ ASSERT3U(ddpa->used, >=, delta);
+ dsl_dir_diduse_space(dd, DD_USED_SNAP, delta, 0, 0, tx);
+ dsl_dir_diduse_space(dd, DD_USED_HEAD,
+ ddpa->used - delta, ddpa->comp, ddpa->uncomp, tx);
+
+ delta = ddpa->originusedsnap -
+ dsl_dir_phys(odd)->dd_used_breakdown[DD_USED_SNAP];
+ ASSERT3S(delta, <=, 0);
+ ASSERT3U(ddpa->used, >=, -delta);
+ dsl_dir_diduse_space(odd, DD_USED_SNAP, delta, 0, 0, tx);
+ dsl_dir_diduse_space(odd, DD_USED_HEAD,
+ -ddpa->used - delta, -ddpa->comp, -ddpa->uncomp, tx);
+
+ dsl_dataset_phys(origin_ds)->ds_unique_bytes = ddpa->unique;
+
+ /*
+ * Since livelists are specific to a clone's origin txg, they
+ * are no longer accurate. Destroy the livelist from the clone being
+ * promoted. If the origin dataset is a clone, destroy its livelist
+ * as well.
+ */
+ dsl_dir_remove_livelist(dd, tx, B_TRUE);
+ dsl_dir_remove_livelist(odd, tx, B_TRUE);
+
+ /* log history record */
+ spa_history_log_internal_ds(hds, "promote", tx, " ");
+
+ dsl_dir_rele(odd, FTAG);
+ promote_rele(ddpa, FTAG);
+}
+
+/*
+ * Make a list of dsl_dataset_t's for the snapshots between first_obj
+ * (exclusive) and last_obj (inclusive). The list will be in reverse
+ * order (last_obj will be the list_head()). If first_obj == 0, do all
+ * snapshots back to this dataset's origin.
+ */
+static int
+snaplist_make(dsl_pool_t *dp,
+ uint64_t first_obj, uint64_t last_obj, list_t *l, void *tag)
+{
+ uint64_t obj = last_obj;
+
+ list_create(l, sizeof (struct promotenode),
+ offsetof(struct promotenode, link));
+
+ while (obj != first_obj) {
+ dsl_dataset_t *ds;
+ struct promotenode *snap;
+ int err;
+
+ err = dsl_dataset_hold_obj(dp, obj, tag, &ds);
+ ASSERT(err != ENOENT);
+ if (err != 0)
+ return (err);
+
+ if (first_obj == 0)
+ first_obj = dsl_dir_phys(ds->ds_dir)->dd_origin_obj;
+
+ snap = kmem_alloc(sizeof (*snap), KM_SLEEP);
+ snap->ds = ds;
+ list_insert_tail(l, snap);
+ obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
+ }
+
+ return (0);
+}
+
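+/*
+ * Sum the deadlist space born at or after 'mintxg' across every snapshot in
+ * the list.
+ */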
+static int
+snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep)
+{
+ struct promotenode *snap;
+
+ *spacep = 0;
+ for (snap = list_head(l); snap; snap = list_next(l, snap)) {
+ uint64_t used, comp, uncomp;
+ dsl_deadlist_space_range(&snap->ds->ds_deadlist,
+ mintxg, UINT64_MAX, &used, &comp, &uncomp);
+ *spacep += used;
+ }
+ return (0);
+}
+
+static void
+snaplist_destroy(list_t *l, void *tag)
+{
+ struct promotenode *snap;
+
+ if (l == NULL || !list_link_active(&l->list_head))
+ return;
+
+ while ((snap = list_tail(l)) != NULL) {
+ list_remove(l, snap);
+ dsl_dataset_rele(snap->ds, tag);
+ kmem_free(snap, sizeof (*snap));
+ }
+ list_destroy(l);
+}
+
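+/*
+ * Hold the clone being promoted and build the shared, clone, and origin
+ * snapshot lists (plus the origin's origin, if any) used by the promote
+ * check and sync tasks.
+ */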
+static int
+promote_hold(dsl_dataset_promote_arg_t *ddpa, dsl_pool_t *dp, void *tag)
+{
+ int error;
+ dsl_dir_t *dd;
+ struct promotenode *snap;
+
+ error = dsl_dataset_hold(dp, ddpa->ddpa_clonename, tag,
+ &ddpa->ddpa_clone);
+ if (error != 0)
+ return (error);
+ dd = ddpa->ddpa_clone->ds_dir;
+
+ if (ddpa->ddpa_clone->ds_is_snapshot ||
+ !dsl_dir_is_clone(dd)) {
+ dsl_dataset_rele(ddpa->ddpa_clone, tag);
+ return (SET_ERROR(EINVAL));
+ }
+
+ error = snaplist_make(dp, 0, dsl_dir_phys(dd)->dd_origin_obj,
+ &ddpa->shared_snaps, tag);
+ if (error != 0)
+ goto out;
+
+ error = snaplist_make(dp, 0, ddpa->ddpa_clone->ds_object,
+ &ddpa->clone_snaps, tag);
+ if (error != 0)
+ goto out;
+
+ snap = list_head(&ddpa->shared_snaps);
+ ASSERT3U(snap->ds->ds_object, ==, dsl_dir_phys(dd)->dd_origin_obj);
+ error = snaplist_make(dp, dsl_dir_phys(dd)->dd_origin_obj,
+ dsl_dir_phys(snap->ds->ds_dir)->dd_head_dataset_obj,
+ &ddpa->origin_snaps, tag);
+ if (error != 0)
+ goto out;
+
+ if (dsl_dir_phys(snap->ds->ds_dir)->dd_origin_obj != 0) {
+ error = dsl_dataset_hold_obj(dp,
+ dsl_dir_phys(snap->ds->ds_dir)->dd_origin_obj,
+ tag, &ddpa->origin_origin);
+ if (error != 0)
+ goto out;
+ }
+out:
+ if (error != 0)
+ promote_rele(ddpa, tag);
+ return (error);
+}
+
+static void
+promote_rele(dsl_dataset_promote_arg_t *ddpa, void *tag)
+{
+ snaplist_destroy(&ddpa->shared_snaps, tag);
+ snaplist_destroy(&ddpa->clone_snaps, tag);
+ snaplist_destroy(&ddpa->origin_snaps, tag);
+ if (ddpa->origin_origin != NULL)
+ dsl_dataset_rele(ddpa->origin_origin, tag);
+ dsl_dataset_rele(ddpa->ddpa_clone, tag);
+}
+
+/*
+ * Promote a clone.
+ *
+ * If it fails due to a conflicting snapshot name, "conflsnap" will be filled
+ * in with the name. (It must be at least ZFS_MAX_DATASET_NAME_LEN bytes long.)
+ */
+int
+dsl_dataset_promote(const char *name, char *conflsnap)
+{
+ dsl_dataset_promote_arg_t ddpa = { 0 };
+ uint64_t numsnaps;
+ int error;
+ nvpair_t *snap_pair;
+ objset_t *os;
+
+ /*
+ * We will modify space proportional to the number of
+ * snapshots. Compute numsnaps.
+ */
+ error = dmu_objset_hold(name, FTAG, &os);
+ if (error != 0)
+ return (error);
+ error = zap_count(dmu_objset_pool(os)->dp_meta_objset,
+ dsl_dataset_phys(dmu_objset_ds(os))->ds_snapnames_zapobj,
+ &numsnaps);
+ dmu_objset_rele(os, FTAG);
+ if (error != 0)
+ return (error);
+
+ ddpa.ddpa_clonename = name;
+ ddpa.err_ds = fnvlist_alloc();
+ ddpa.cr = CRED();
+ ddpa.proc = curproc;
+
+ error = dsl_sync_task(name, dsl_dataset_promote_check,
+ dsl_dataset_promote_sync, &ddpa,
+ 2 + numsnaps, ZFS_SPACE_CHECK_RESERVED);
+
+ /*
+ * Return the first conflicting snapshot found.
+ */
+ snap_pair = nvlist_next_nvpair(ddpa.err_ds, NULL);
+ if (snap_pair != NULL && conflsnap != NULL)
+ (void) strlcpy(conflsnap, nvpair_name(snap_pair),
+ ZFS_MAX_DATASET_NAME_LEN);
+
+ fnvlist_free(ddpa.err_ds);
+ return (error);
+}
+
+int
+dsl_dataset_clone_swap_check_impl(dsl_dataset_t *clone,
+ dsl_dataset_t *origin_head, boolean_t force, void *owner, dmu_tx_t *tx)
+{
+ /*
+ * "slack" factor for received datasets with refquota set on them.
+ * See the bottom of this function for details on its use.
+ */
+ uint64_t refquota_slack = (uint64_t)DMU_MAX_ACCESS *
+ spa_asize_inflation;
+ int64_t unused_refres_delta;
+
+ /* they should both be heads */
+ if (clone->ds_is_snapshot ||
+ origin_head->ds_is_snapshot)
+ return (SET_ERROR(EINVAL));
+
+ /* if we are not forcing, the branch point should be just before them */
+ if (!force && clone->ds_prev != origin_head->ds_prev)
+ return (SET_ERROR(EINVAL));
+
+ /* clone should be the clone (unless they are unrelated) */
+ if (clone->ds_prev != NULL &&
+ clone->ds_prev != clone->ds_dir->dd_pool->dp_origin_snap &&
+ origin_head->ds_dir != clone->ds_prev->ds_dir)
+ return (SET_ERROR(EINVAL));
+
+ /* the clone should be a child of the origin */
+ if (clone->ds_dir->dd_parent != origin_head->ds_dir)
+ return (SET_ERROR(EINVAL));
+
+ /* origin_head shouldn't be modified unless 'force' */
+ if (!force &&
+ dsl_dataset_modified_since_snap(origin_head, origin_head->ds_prev))
+ return (SET_ERROR(ETXTBSY));
+
+ /* origin_head should have no long holds (e.g. is not mounted) */
+ if (dsl_dataset_handoff_check(origin_head, owner, tx))
+ return (SET_ERROR(EBUSY));
+
+ /* check amount of any unconsumed refreservation */
+ unused_refres_delta =
+ (int64_t)MIN(origin_head->ds_reserved,
+ dsl_dataset_phys(origin_head)->ds_unique_bytes) -
+ (int64_t)MIN(origin_head->ds_reserved,
+ dsl_dataset_phys(clone)->ds_unique_bytes);
+
+ if (unused_refres_delta > 0 &&
+ unused_refres_delta >
+ dsl_dir_space_available(origin_head->ds_dir, NULL, 0, TRUE))
+ return (SET_ERROR(ENOSPC));
+
+ /*
+ * The clone can't be too much over the head's refquota.
+ *
+ * To ensure that the entire refquota can be used, we allow one
+ * transaction to exceed the refquota. Therefore, this check
+ * needs to also allow for the space referenced to be more than the
+ * refquota. The maximum amount of space that one transaction can use
+ * on disk is DMU_MAX_ACCESS * spa_asize_inflation. Allowing this
+ * overage ensures that we are able to receive a filesystem that
+ * exceeds the refquota on the source system.
+ *
+ * So that overage is the refquota_slack we use below.
+ */
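+	/*
+	 * Illustrative sketch only (hypothetical numbers, assuming the
+	 * default spa_asize_inflation of 24 and a DMU_MAX_ACCESS of 64MB):
+	 * under those defaults refquota_slack works out to roughly
+	 * 64MB * 24 = ~1.5GB, so a received clone whose referenced space
+	 * stays within refquota + ~1.5GB still passes the check below.
+	 */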
+ if (origin_head->ds_quota != 0 &&
+ dsl_dataset_phys(clone)->ds_referenced_bytes >
+ origin_head->ds_quota + refquota_slack)
+ return (SET_ERROR(EDQUOT));
+
+ return (0);
+}
+
+static void
+dsl_dataset_swap_remap_deadlists(dsl_dataset_t *clone,
+ dsl_dataset_t *origin, dmu_tx_t *tx)
+{
+ uint64_t clone_remap_dl_obj, origin_remap_dl_obj;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+
+ ASSERT(dsl_pool_sync_context(dp));
+
+ clone_remap_dl_obj = dsl_dataset_get_remap_deadlist_object(clone);
+ origin_remap_dl_obj = dsl_dataset_get_remap_deadlist_object(origin);
+
+ if (clone_remap_dl_obj != 0) {
+ dsl_deadlist_close(&clone->ds_remap_deadlist);
+ dsl_dataset_unset_remap_deadlist_object(clone, tx);
+ }
+ if (origin_remap_dl_obj != 0) {
+ dsl_deadlist_close(&origin->ds_remap_deadlist);
+ dsl_dataset_unset_remap_deadlist_object(origin, tx);
+ }
+
+ if (clone_remap_dl_obj != 0) {
+ dsl_dataset_set_remap_deadlist_object(origin,
+ clone_remap_dl_obj, tx);
+ dsl_deadlist_open(&origin->ds_remap_deadlist,
+ dp->dp_meta_objset, clone_remap_dl_obj);
+ }
+ if (origin_remap_dl_obj != 0) {
+ dsl_dataset_set_remap_deadlist_object(clone,
+ origin_remap_dl_obj, tx);
+ dsl_deadlist_open(&clone->ds_remap_deadlist,
+ dp->dp_meta_objset, origin_remap_dl_obj);
+ }
+}
+
+void
+dsl_dataset_clone_swap_sync_impl(dsl_dataset_t *clone,
+ dsl_dataset_t *origin_head, dmu_tx_t *tx)
+{
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ int64_t unused_refres_delta;
+
+ ASSERT(clone->ds_reserved == 0);
+ /*
+ * NOTE: On DEBUG kernels there could be a race between this and
+ * the check function if spa_asize_inflation is adjusted...
+ */
+ ASSERT(origin_head->ds_quota == 0 ||
+ dsl_dataset_phys(clone)->ds_unique_bytes <= origin_head->ds_quota +
+ DMU_MAX_ACCESS * spa_asize_inflation);
+ ASSERT3P(clone->ds_prev, ==, origin_head->ds_prev);
+
+ dsl_dir_cancel_waiters(origin_head->ds_dir);
+
+ /*
+ * Swap per-dataset feature flags.
+ */
+ for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
+ if (!(spa_feature_table[f].fi_flags &
+ ZFEATURE_FLAG_PER_DATASET)) {
+ ASSERT(!dsl_dataset_feature_is_active(clone, f));
+ ASSERT(!dsl_dataset_feature_is_active(origin_head, f));
+ continue;
+ }
+
+ boolean_t clone_inuse = dsl_dataset_feature_is_active(clone, f);
+ void *clone_feature = clone->ds_feature[f];
+ boolean_t origin_head_inuse =
+ dsl_dataset_feature_is_active(origin_head, f);
+ void *origin_head_feature = origin_head->ds_feature[f];
+
+ if (clone_inuse)
+ dsl_dataset_deactivate_feature_impl(clone, f, tx);
+ if (origin_head_inuse)
+ dsl_dataset_deactivate_feature_impl(origin_head, f, tx);
+
+ if (clone_inuse) {
+ dsl_dataset_activate_feature(origin_head->ds_object, f,
+ clone_feature, tx);
+ origin_head->ds_feature[f] = clone_feature;
+ }
+ if (origin_head_inuse) {
+ dsl_dataset_activate_feature(clone->ds_object, f,
+ origin_head_feature, tx);
+ clone->ds_feature[f] = origin_head_feature;
+ }
+ }
+
+ dmu_buf_will_dirty(clone->ds_dbuf, tx);
+ dmu_buf_will_dirty(origin_head->ds_dbuf, tx);
+
+ if (clone->ds_objset != NULL) {
+ dmu_objset_evict(clone->ds_objset);
+ clone->ds_objset = NULL;
+ }
+
+ if (origin_head->ds_objset != NULL) {
+ dmu_objset_evict(origin_head->ds_objset);
+ origin_head->ds_objset = NULL;
+ }
+
+ unused_refres_delta =
+ (int64_t)MIN(origin_head->ds_reserved,
+ dsl_dataset_phys(origin_head)->ds_unique_bytes) -
+ (int64_t)MIN(origin_head->ds_reserved,
+ dsl_dataset_phys(clone)->ds_unique_bytes);
+
+ /*
+ * Reset origin's unique bytes.
+ */
+ {
+ dsl_dataset_t *origin = clone->ds_prev;
+ uint64_t comp, uncomp;
+
+ dmu_buf_will_dirty(origin->ds_dbuf, tx);
+ dsl_deadlist_space_range(&clone->ds_deadlist,
+ dsl_dataset_phys(origin)->ds_prev_snap_txg, UINT64_MAX,
+ &dsl_dataset_phys(origin)->ds_unique_bytes, &comp, &uncomp);
+ }
+
+ /* swap blkptrs */
+ {
+ rrw_enter(&clone->ds_bp_rwlock, RW_WRITER, FTAG);
+ rrw_enter(&origin_head->ds_bp_rwlock, RW_WRITER, FTAG);
+ blkptr_t tmp;
+ tmp = dsl_dataset_phys(origin_head)->ds_bp;
+ dsl_dataset_phys(origin_head)->ds_bp =
+ dsl_dataset_phys(clone)->ds_bp;
+ dsl_dataset_phys(clone)->ds_bp = tmp;
+ rrw_exit(&origin_head->ds_bp_rwlock, FTAG);
+ rrw_exit(&clone->ds_bp_rwlock, FTAG);
+ }
+
+ /* set dd_*_bytes */
+ {
+ int64_t dused, dcomp, duncomp;
+ uint64_t cdl_used, cdl_comp, cdl_uncomp;
+ uint64_t odl_used, odl_comp, odl_uncomp;
+
+ ASSERT3U(dsl_dir_phys(clone->ds_dir)->
+ dd_used_breakdown[DD_USED_SNAP], ==, 0);
+
+ dsl_deadlist_space(&clone->ds_deadlist,
+ &cdl_used, &cdl_comp, &cdl_uncomp);
+ dsl_deadlist_space(&origin_head->ds_deadlist,
+ &odl_used, &odl_comp, &odl_uncomp);
+
+ dused = dsl_dataset_phys(clone)->ds_referenced_bytes +
+ cdl_used -
+ (dsl_dataset_phys(origin_head)->ds_referenced_bytes +
+ odl_used);
+ dcomp = dsl_dataset_phys(clone)->ds_compressed_bytes +
+ cdl_comp -
+ (dsl_dataset_phys(origin_head)->ds_compressed_bytes +
+ odl_comp);
+ duncomp = dsl_dataset_phys(clone)->ds_uncompressed_bytes +
+ cdl_uncomp -
+ (dsl_dataset_phys(origin_head)->ds_uncompressed_bytes +
+ odl_uncomp);
+
+ dsl_dir_diduse_space(origin_head->ds_dir, DD_USED_HEAD,
+ dused, dcomp, duncomp, tx);
+ dsl_dir_diduse_space(clone->ds_dir, DD_USED_HEAD,
+ -dused, -dcomp, -duncomp, tx);
+
+ /*
+ * The difference in the space used by snapshots is the
+ * difference in snapshot space due to the head's
+ * deadlist (since that's the only thing that's
+ * changing that affects the snapused).
+ */
+ dsl_deadlist_space_range(&clone->ds_deadlist,
+ origin_head->ds_dir->dd_origin_txg, UINT64_MAX,
+ &cdl_used, &cdl_comp, &cdl_uncomp);
+ dsl_deadlist_space_range(&origin_head->ds_deadlist,
+ origin_head->ds_dir->dd_origin_txg, UINT64_MAX,
+ &odl_used, &odl_comp, &odl_uncomp);
+ dsl_dir_transfer_space(origin_head->ds_dir, cdl_used - odl_used,
+ DD_USED_HEAD, DD_USED_SNAP, tx);
+ }
+
+ /* swap ds_*_bytes */
+ SWITCH64(dsl_dataset_phys(origin_head)->ds_referenced_bytes,
+ dsl_dataset_phys(clone)->ds_referenced_bytes);
+ SWITCH64(dsl_dataset_phys(origin_head)->ds_compressed_bytes,
+ dsl_dataset_phys(clone)->ds_compressed_bytes);
+ SWITCH64(dsl_dataset_phys(origin_head)->ds_uncompressed_bytes,
+ dsl_dataset_phys(clone)->ds_uncompressed_bytes);
+ SWITCH64(dsl_dataset_phys(origin_head)->ds_unique_bytes,
+ dsl_dataset_phys(clone)->ds_unique_bytes);
+
+ /* apply any parent delta for change in unconsumed refreservation */
+ dsl_dir_diduse_space(origin_head->ds_dir, DD_USED_REFRSRV,
+ unused_refres_delta, 0, 0, tx);
+
+ /*
+ * Swap deadlists.
+ */
+ dsl_deadlist_close(&clone->ds_deadlist);
+ dsl_deadlist_close(&origin_head->ds_deadlist);
+ SWITCH64(dsl_dataset_phys(origin_head)->ds_deadlist_obj,
+ dsl_dataset_phys(clone)->ds_deadlist_obj);
+ dsl_deadlist_open(&clone->ds_deadlist, dp->dp_meta_objset,
+ dsl_dataset_phys(clone)->ds_deadlist_obj);
+ dsl_deadlist_open(&origin_head->ds_deadlist, dp->dp_meta_objset,
+ dsl_dataset_phys(origin_head)->ds_deadlist_obj);
+ dsl_dataset_swap_remap_deadlists(clone, origin_head, tx);
+
+ /*
+ * If there is a bookmark at the origin, its "next dataset" is
+ * changing, so we need to reset its FBN.
+ */
+ dsl_bookmark_next_changed(origin_head, origin_head->ds_prev, tx);
+
+ dsl_scan_ds_clone_swapped(origin_head, clone, tx);
+
+ /*
+ * Destroy any livelists associated with the clone or the origin,
+ * since after the swap the corresponding livelists are no longer
+ * valid.
+ */
+ dsl_dir_remove_livelist(clone->ds_dir, tx, B_TRUE);
+ dsl_dir_remove_livelist(origin_head->ds_dir, tx, B_TRUE);
+
+ spa_history_log_internal_ds(clone, "clone swap", tx,
+ "parent=%s", origin_head->ds_dir->dd_myname);
+}
+
+/*
+ * Given a pool name and a dataset object number in that pool,
+ * return the name of that dataset.
+ */
+int
+dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf)
+{
+ dsl_pool_t *dp;
+ dsl_dataset_t *ds;
+ int error;
+
+ error = dsl_pool_hold(pname, FTAG, &dp);
+ if (error != 0)
+ return (error);
+
+ error = dsl_dataset_hold_obj(dp, obj, FTAG, &ds);
+ if (error == 0) {
+ dsl_dataset_name(ds, buf);
+ dsl_dataset_rele(ds, FTAG);
+ }
+ dsl_pool_rele(dp, FTAG);
+
+ return (error);
+}
+
+int
+dsl_dataset_check_quota(dsl_dataset_t *ds, boolean_t check_quota,
+ uint64_t asize, uint64_t inflight, uint64_t *used, uint64_t *ref_rsrv)
+{
+ int error = 0;
+
+ ASSERT3S(asize, >, 0);
+
+ /*
+ * *ref_rsrv is the portion of asize that will come from any
+ * unconsumed refreservation space.
+ */
+ *ref_rsrv = 0;
+
+ mutex_enter(&ds->ds_lock);
+ /*
+ * Make a space adjustment for reserved bytes.
+ */
+ if (ds->ds_reserved > dsl_dataset_phys(ds)->ds_unique_bytes) {
+ ASSERT3U(*used, >=,
+ ds->ds_reserved - dsl_dataset_phys(ds)->ds_unique_bytes);
+ *used -=
+ (ds->ds_reserved - dsl_dataset_phys(ds)->ds_unique_bytes);
+ *ref_rsrv =
+ asize - MIN(asize, parent_delta(ds, asize + inflight));
+ }
+
+ if (!check_quota || ds->ds_quota == 0) {
+ mutex_exit(&ds->ds_lock);
+ return (0);
+ }
+ /*
+ * If they are requesting more space, and our current estimate
+ * is over quota, they get to try again unless the actual
+ * on-disk is over quota and there are no pending changes (which
+ * may free up space for us).
+ */
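+	/*
+	 * Hypothetical illustration of the rule below: with ds_quota = 10G,
+	 * referenced = 11G and inflight = 0 we return EDQUOT (definitely
+	 * over quota, with nothing pending that could free space); with
+	 * referenced = 9G and inflight = 2G we return ERESTART, since the
+	 * pending changes may free enough space for a retry to succeed.
+	 */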
+ if (dsl_dataset_phys(ds)->ds_referenced_bytes + inflight >=
+ ds->ds_quota) {
+ if (inflight > 0 ||
+ dsl_dataset_phys(ds)->ds_referenced_bytes < ds->ds_quota)
+ error = SET_ERROR(ERESTART);
+ else
+ error = SET_ERROR(EDQUOT);
+ }
+ mutex_exit(&ds->ds_lock);
+
+ return (error);
+}
+
+typedef struct dsl_dataset_set_qr_arg {
+ const char *ddsqra_name;
+ zprop_source_t ddsqra_source;
+ uint64_t ddsqra_value;
+} dsl_dataset_set_qr_arg_t;
+
+/* ARGSUSED */
+static int
+dsl_dataset_set_refquota_check(void *arg, dmu_tx_t *tx)
+{
+ dsl_dataset_set_qr_arg_t *ddsqra = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ dsl_dataset_t *ds;
+ int error;
+ uint64_t newval;
+
+ if (spa_version(dp->dp_spa) < SPA_VERSION_REFQUOTA)
+ return (SET_ERROR(ENOTSUP));
+
+ error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds);
+ if (error != 0)
+ return (error);
+
+ if (ds->ds_is_snapshot) {
+ dsl_dataset_rele(ds, FTAG);
+ return (SET_ERROR(EINVAL));
+ }
+
+ error = dsl_prop_predict(ds->ds_dir,
+ zfs_prop_to_name(ZFS_PROP_REFQUOTA),
+ ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval);
+ if (error != 0) {
+ dsl_dataset_rele(ds, FTAG);
+ return (error);
+ }
+
+ if (newval == 0) {
+ dsl_dataset_rele(ds, FTAG);
+ return (0);
+ }
+
+ if (newval < dsl_dataset_phys(ds)->ds_referenced_bytes ||
+ newval < ds->ds_reserved) {
+ dsl_dataset_rele(ds, FTAG);
+ return (SET_ERROR(ENOSPC));
+ }
+
+ dsl_dataset_rele(ds, FTAG);
+ return (0);
+}
+
+static void
+dsl_dataset_set_refquota_sync(void *arg, dmu_tx_t *tx)
+{
+ dsl_dataset_set_qr_arg_t *ddsqra = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ dsl_dataset_t *ds = NULL;
+ uint64_t newval;
+
+ VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds));
+
+ dsl_prop_set_sync_impl(ds,
+ zfs_prop_to_name(ZFS_PROP_REFQUOTA),
+ ddsqra->ddsqra_source, sizeof (ddsqra->ddsqra_value), 1,
+ &ddsqra->ddsqra_value, tx);
+
+ VERIFY0(dsl_prop_get_int_ds(ds,
+ zfs_prop_to_name(ZFS_PROP_REFQUOTA), &newval));
+
+ if (ds->ds_quota != newval) {
+ dmu_buf_will_dirty(ds->ds_dbuf, tx);
+ ds->ds_quota = newval;
+ }
+ dsl_dataset_rele(ds, FTAG);
+}
+
+int
+dsl_dataset_set_refquota(const char *dsname, zprop_source_t source,
+ uint64_t refquota)
+{
+ dsl_dataset_set_qr_arg_t ddsqra;
+
+ ddsqra.ddsqra_name = dsname;
+ ddsqra.ddsqra_source = source;
+ ddsqra.ddsqra_value = refquota;
+
+ return (dsl_sync_task(dsname, dsl_dataset_set_refquota_check,
+ dsl_dataset_set_refquota_sync, &ddsqra, 0,
+ ZFS_SPACE_CHECK_EXTRA_RESERVED));
+}
+
+static int
+dsl_dataset_set_refreservation_check(void *arg, dmu_tx_t *tx)
+{
+ dsl_dataset_set_qr_arg_t *ddsqra = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ dsl_dataset_t *ds;
+ int error;
+ uint64_t newval, unique;
+
+ if (spa_version(dp->dp_spa) < SPA_VERSION_REFRESERVATION)
+ return (SET_ERROR(ENOTSUP));
+
+ error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds);
+ if (error != 0)
+ return (error);
+
+ if (ds->ds_is_snapshot) {
+ dsl_dataset_rele(ds, FTAG);
+ return (SET_ERROR(EINVAL));
+ }
+
+ error = dsl_prop_predict(ds->ds_dir,
+ zfs_prop_to_name(ZFS_PROP_REFRESERVATION),
+ ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval);
+ if (error != 0) {
+ dsl_dataset_rele(ds, FTAG);
+ return (error);
+ }
+
+ /*
+ * If we are doing the preliminary check in open context, the
+ * space estimates may be inaccurate.
+ */
+ if (!dmu_tx_is_syncing(tx)) {
+ dsl_dataset_rele(ds, FTAG);
+ return (0);
+ }
+
+ mutex_enter(&ds->ds_lock);
+ if (!DS_UNIQUE_IS_ACCURATE(ds))
+ dsl_dataset_recalc_head_uniq(ds);
+ unique = dsl_dataset_phys(ds)->ds_unique_bytes;
+ mutex_exit(&ds->ds_lock);
+
+ if (MAX(unique, newval) > MAX(unique, ds->ds_reserved)) {
+ uint64_t delta = MAX(unique, newval) -
+ MAX(unique, ds->ds_reserved);
+
+ if (delta >
+ dsl_dir_space_available(ds->ds_dir, NULL, 0, B_TRUE) ||
+ (ds->ds_quota > 0 && newval > ds->ds_quota)) {
+ dsl_dataset_rele(ds, FTAG);
+ return (SET_ERROR(ENOSPC));
+ }
+ }
+
+ dsl_dataset_rele(ds, FTAG);
+ return (0);
+}
+
+void
+dsl_dataset_set_refreservation_sync_impl(dsl_dataset_t *ds,
+ zprop_source_t source, uint64_t value, dmu_tx_t *tx)
+{
+ uint64_t newval;
+ uint64_t unique;
+ int64_t delta;
+
+ dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_REFRESERVATION),
+ source, sizeof (value), 1, &value, tx);
+
+ VERIFY0(dsl_prop_get_int_ds(ds,
+ zfs_prop_to_name(ZFS_PROP_REFRESERVATION), &newval));
+
+ dmu_buf_will_dirty(ds->ds_dbuf, tx);
+ mutex_enter(&ds->ds_dir->dd_lock);
+ mutex_enter(&ds->ds_lock);
+ ASSERT(DS_UNIQUE_IS_ACCURATE(ds));
+ unique = dsl_dataset_phys(ds)->ds_unique_bytes;
+ delta = MAX(0, (int64_t)(newval - unique)) -
+ MAX(0, (int64_t)(ds->ds_reserved - unique));
+ ds->ds_reserved = newval;
+ mutex_exit(&ds->ds_lock);
+
+ dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV, delta, 0, 0, tx);
+ mutex_exit(&ds->ds_dir->dd_lock);
+}
+
+static void
+dsl_dataset_set_refreservation_sync(void *arg, dmu_tx_t *tx)
+{
+ dsl_dataset_set_qr_arg_t *ddsqra = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ dsl_dataset_t *ds = NULL;
+
+ VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds));
+ dsl_dataset_set_refreservation_sync_impl(ds,
+ ddsqra->ddsqra_source, ddsqra->ddsqra_value, tx);
+ dsl_dataset_rele(ds, FTAG);
+}
+
+int
+dsl_dataset_set_refreservation(const char *dsname, zprop_source_t source,
+ uint64_t refreservation)
+{
+ dsl_dataset_set_qr_arg_t ddsqra;
+
+ ddsqra.ddsqra_name = dsname;
+ ddsqra.ddsqra_source = source;
+ ddsqra.ddsqra_value = refreservation;
+
+ return (dsl_sync_task(dsname, dsl_dataset_set_refreservation_check,
+ dsl_dataset_set_refreservation_sync, &ddsqra, 0,
+ ZFS_SPACE_CHECK_EXTRA_RESERVED));
+}
+
+typedef struct dsl_dataset_set_compression_arg {
+ const char *ddsca_name;
+ zprop_source_t ddsca_source;
+ uint64_t ddsca_value;
+} dsl_dataset_set_compression_arg_t;
+
+/* ARGSUSED */
+static int
+dsl_dataset_set_compression_check(void *arg, dmu_tx_t *tx)
+{
+ dsl_dataset_set_compression_arg_t *ddsca = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+
+ uint64_t compval = ZIO_COMPRESS_ALGO(ddsca->ddsca_value);
+ spa_feature_t f = zio_compress_to_feature(compval);
+
+ if (f == SPA_FEATURE_NONE)
+ return (SET_ERROR(EINVAL));
+
+ if (!spa_feature_is_enabled(dp->dp_spa, f))
+ return (SET_ERROR(ENOTSUP));
+
+ return (0);
+}
+
+static void
+dsl_dataset_set_compression_sync(void *arg, dmu_tx_t *tx)
+{
+ dsl_dataset_set_compression_arg_t *ddsca = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ dsl_dataset_t *ds = NULL;
+
+ uint64_t compval = ZIO_COMPRESS_ALGO(ddsca->ddsca_value);
+ spa_feature_t f = zio_compress_to_feature(compval);
+ ASSERT3S(spa_feature_table[f].fi_type, ==, ZFEATURE_TYPE_BOOLEAN);
+
+ VERIFY0(dsl_dataset_hold(dp, ddsca->ddsca_name, FTAG, &ds));
+ if (zfeature_active(f, ds->ds_feature[f]) != B_TRUE) {
+ ds->ds_feature_activation[f] = (void *)B_TRUE;
+ dsl_dataset_activate_feature(ds->ds_object, f,
+ ds->ds_feature_activation[f], tx);
+ ds->ds_feature[f] = ds->ds_feature_activation[f];
+ }
+ dsl_dataset_rele(ds, FTAG);
+}
+
+int
+dsl_dataset_set_compression(const char *dsname, zprop_source_t source,
+ uint64_t compression)
+{
+ dsl_dataset_set_compression_arg_t ddsca;
+
+ /*
+ * The sync task is only required for zstd in order to activate
+ * the feature flag when the property is first set.
+ */
+ if (ZIO_COMPRESS_ALGO(compression) != ZIO_COMPRESS_ZSTD)
+ return (0);
+
+ ddsca.ddsca_name = dsname;
+ ddsca.ddsca_source = source;
+ ddsca.ddsca_value = compression;
+
+ return (dsl_sync_task(dsname, dsl_dataset_set_compression_check,
+ dsl_dataset_set_compression_sync, &ddsca, 0,
+ ZFS_SPACE_CHECK_EXTRA_RESERVED));
+}
+
+/*
+ * Return (in *usedp) the amount of space referenced by "new" that was not
+ * referenced at the time the bookmark corresponds to. "New" may be a
+ * snapshot or a head. The bookmark must be before new, in
+ * new's filesystem (or its origin) -- caller verifies this.
+ *
+ * The written space is calculated by considering two components: First, we
+ * ignore any freed space, and calculate the written as new's used space
+ * minus old's used space. Next, we add in the amount of space that was freed
+ * between the two time points, thus reducing new's used space relative to
+ * old's. Specifically, this is the space that was born before
+ * zbm_creation_txg, and freed before new (ie. on new's deadlist or a
+ * previous deadlist).
+ *
+ * space freed [---------------------]
+ * snapshots ---O-------O--------O-------O------
+ * bookmark new
+ *
+ * Note, the bookmark's zbm_*_bytes_refd must be valid, but if the HAS_FBN
+ * flag is not set, we will calculate the freed_before_next based on the
+ * next snapshot's deadlist, rather than using zbm_*_freed_before_next_snap.
+ */
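+/*
+ * Worked example with hypothetical numbers (for illustration only): if the
+ * bookmark referenced 10G, "new" references 12G, and 3G of data that existed
+ * at the bookmark has since been freed (it shows up on the deadlists walked
+ * below, or in the freed_before_next fields), then
+ * written = 12G - 10G + 3G = 5G: 2G of net growth plus 3G written in place
+ * of deleted data.
+ */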
+static int
+dsl_dataset_space_written_impl(zfs_bookmark_phys_t *bmp,
+ dsl_dataset_t *new, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
+{
+ int err = 0;
+ dsl_pool_t *dp = new->ds_dir->dd_pool;
+
+ ASSERT(dsl_pool_config_held(dp));
+ if (dsl_dataset_is_snapshot(new)) {
+ ASSERT3U(bmp->zbm_creation_txg, <,
+ dsl_dataset_phys(new)->ds_creation_txg);
+ }
+
+ *usedp = 0;
+ *usedp += dsl_dataset_phys(new)->ds_referenced_bytes;
+ *usedp -= bmp->zbm_referenced_bytes_refd;
+
+ *compp = 0;
+ *compp += dsl_dataset_phys(new)->ds_compressed_bytes;
+ *compp -= bmp->zbm_compressed_bytes_refd;
+
+ *uncompp = 0;
+ *uncompp += dsl_dataset_phys(new)->ds_uncompressed_bytes;
+ *uncompp -= bmp->zbm_uncompressed_bytes_refd;
+
+ dsl_dataset_t *snap = new;
+
+ while (dsl_dataset_phys(snap)->ds_prev_snap_txg >
+ bmp->zbm_creation_txg) {
+ uint64_t used, comp, uncomp;
+
+ dsl_deadlist_space_range(&snap->ds_deadlist,
+ 0, bmp->zbm_creation_txg,
+ &used, &comp, &uncomp);
+ *usedp += used;
+ *compp += comp;
+ *uncompp += uncomp;
+
+ uint64_t snapobj = dsl_dataset_phys(snap)->ds_prev_snap_obj;
+ if (snap != new)
+ dsl_dataset_rele(snap, FTAG);
+ err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &snap);
+ if (err != 0)
+ break;
+ }
+
+ /*
+ * We might not have the FBN if we are calculating written from
+ * a snapshot (because we didn't know the correct "next" snapshot
+ * until now).
+ */
+ if (bmp->zbm_flags & ZBM_FLAG_HAS_FBN) {
+ *usedp += bmp->zbm_referenced_freed_before_next_snap;
+ *compp += bmp->zbm_compressed_freed_before_next_snap;
+ *uncompp += bmp->zbm_uncompressed_freed_before_next_snap;
+ } else {
+ ASSERT3U(dsl_dataset_phys(snap)->ds_prev_snap_txg, ==,
+ bmp->zbm_creation_txg);
+ uint64_t used, comp, uncomp;
+ dsl_deadlist_space(&snap->ds_deadlist, &used, &comp, &uncomp);
+ *usedp += used;
+ *compp += comp;
+ *uncompp += uncomp;
+ }
+ if (snap != new)
+ dsl_dataset_rele(snap, FTAG);
+ return (err);
+}
+
+/*
+ * Return (in *usedp) the amount of space written in new that was not
+ * present at the time the bookmark corresponds to. New may be a
+ * snapshot or the head. Old must be a bookmark before new, in
+ * new's filesystem (or its origin) -- caller verifies this.
+ */
+int
+dsl_dataset_space_written_bookmark(zfs_bookmark_phys_t *bmp,
+ dsl_dataset_t *new, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
+{
+ if (!(bmp->zbm_flags & ZBM_FLAG_HAS_FBN))
+ return (SET_ERROR(ENOTSUP));
+ return (dsl_dataset_space_written_impl(bmp, new,
+ usedp, compp, uncompp));
+}
+
+/*
+ * Return (in *usedp) the amount of space written in new that is not
+ * present in oldsnap. New may be a snapshot or the head. Old must be
+ * a snapshot before new, in new's filesystem (or its origin). If not then
+ * fail and return EINVAL.
+ */
+int
+dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new,
+ uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
+{
+ if (!dsl_dataset_is_before(new, oldsnap, 0))
+ return (SET_ERROR(EINVAL));
+
+ zfs_bookmark_phys_t zbm = { 0 };
+ dsl_dataset_phys_t *dsp = dsl_dataset_phys(oldsnap);
+ zbm.zbm_guid = dsp->ds_guid;
+ zbm.zbm_creation_txg = dsp->ds_creation_txg;
+ zbm.zbm_creation_time = dsp->ds_creation_time;
+ zbm.zbm_referenced_bytes_refd = dsp->ds_referenced_bytes;
+ zbm.zbm_compressed_bytes_refd = dsp->ds_compressed_bytes;
+ zbm.zbm_uncompressed_bytes_refd = dsp->ds_uncompressed_bytes;
+
+ /*
+ * If oldsnap is the origin (or origin's origin, ...) of new,
+ * we can't easily calculate the effective FBN. Therefore,
+ * we do not set ZBM_FLAG_HAS_FBN, so that the _impl will calculate
+ * it relative to the correct "next": the next snapshot towards "new",
+ * rather than the next snapshot in oldsnap's dsl_dir.
+ */
+ return (dsl_dataset_space_written_impl(&zbm, new,
+ usedp, compp, uncompp));
+}
+
+/*
+ * Return (in *usedp) the amount of space that will be reclaimed if firstsnap,
+ * lastsnap, and all snapshots in between are deleted.
+ *
+ * blocks that would be freed [---------------------------]
+ * snapshots ---O-------O--------O-------O--------O
+ * firstsnap lastsnap
+ *
+ * This is the set of blocks that were born after the snap before firstsnap
+ * (birth > firstsnap->prev_snap_txg), and died before the snap after the
+ * last snap (ie, is on lastsnap->ds_next->ds_deadlist or an earlier deadlist).
+ * We calculate this by iterating over the relevant deadlists (from the snap
+ * after lastsnap, backward to the snap after firstsnap), summing up the
+ * space on the deadlist that was born after the snap before firstsnap.
+ */
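+/*
+ * Hypothetical illustration: given snapshots A - B - C - D and a request to
+ * destroy B..C (firstsnap = B, lastsnap = C), the loop below walks the
+ * deadlists of D (the snap after lastsnap) and C, counting only the space on
+ * them that was born after A (firstsnap's previous snapshot). Anything older
+ * is still referenced by A, and anything still live in D is not on those
+ * deadlists at all, so neither would be freed by the deletion.
+ */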
+int
+dsl_dataset_space_wouldfree(dsl_dataset_t *firstsnap,
+ dsl_dataset_t *lastsnap,
+ uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
+{
+ int err = 0;
+ uint64_t snapobj;
+ dsl_pool_t *dp = firstsnap->ds_dir->dd_pool;
+
+ ASSERT(firstsnap->ds_is_snapshot);
+ ASSERT(lastsnap->ds_is_snapshot);
+
+ /*
+ * Check that the snapshots are in the same dsl_dir, and firstsnap
+ * is before lastsnap.
+ */
+ if (firstsnap->ds_dir != lastsnap->ds_dir ||
+ dsl_dataset_phys(firstsnap)->ds_creation_txg >
+ dsl_dataset_phys(lastsnap)->ds_creation_txg)
+ return (SET_ERROR(EINVAL));
+
+ *usedp = *compp = *uncompp = 0;
+
+ snapobj = dsl_dataset_phys(lastsnap)->ds_next_snap_obj;
+ while (snapobj != firstsnap->ds_object) {
+ dsl_dataset_t *ds;
+ uint64_t used, comp, uncomp;
+
+ err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &ds);
+ if (err != 0)
+ break;
+
+ dsl_deadlist_space_range(&ds->ds_deadlist,
+ dsl_dataset_phys(firstsnap)->ds_prev_snap_txg, UINT64_MAX,
+ &used, &comp, &uncomp);
+ *usedp += used;
+ *compp += comp;
+ *uncompp += uncomp;
+
+ snapobj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
+ ASSERT3U(snapobj, !=, 0);
+ dsl_dataset_rele(ds, FTAG);
+ }
+ return (err);
+}
+
+/*
+ * Return TRUE if 'earlier' is an earlier snapshot in 'later's timeline.
+ * For example, they could both be snapshots of the same filesystem, and
+ * 'earlier' is before 'later'. Or 'earlier' could be the origin of
+ * 'later's filesystem. Or 'earlier' could be an older snapshot in the origin's
+ * filesystem. Or 'earlier' could be the origin's origin.
+ *
+ * If non-zero, earlier_txg is used instead of earlier's ds_creation_txg.
+ */
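+/*
+ * Hypothetical examples (dataset names invented for illustration):
+ * pool/fs@snap1 is "before" pool/fs@snap2, and pool/fs@snap2 is before the
+ * head pool/fs. If pool/clone was cloned from pool/fs@snap1, then
+ * pool/fs@snap1 (and any snapshot of pool/fs taken before it) is before
+ * pool/clone and before pool/clone's snapshots.
+ */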
+boolean_t
+dsl_dataset_is_before(dsl_dataset_t *later, dsl_dataset_t *earlier,
+ uint64_t earlier_txg)
+{
+ dsl_pool_t *dp = later->ds_dir->dd_pool;
+ int error;
+ boolean_t ret;
+
+ ASSERT(dsl_pool_config_held(dp));
+ ASSERT(earlier->ds_is_snapshot || earlier_txg != 0);
+
+ if (earlier_txg == 0)
+ earlier_txg = dsl_dataset_phys(earlier)->ds_creation_txg;
+
+ if (later->ds_is_snapshot &&
+ earlier_txg >= dsl_dataset_phys(later)->ds_creation_txg)
+ return (B_FALSE);
+
+ if (later->ds_dir == earlier->ds_dir)
+ return (B_TRUE);
+
+ /*
+ * We check dd_origin_obj explicitly here rather than using
+ * dsl_dir_is_clone() so that we will return TRUE if "earlier"
+ * is $ORIGIN@$ORIGIN. dsl_dataset_space_written() depends on
+ * this behavior.
+ */
+ if (dsl_dir_phys(later->ds_dir)->dd_origin_obj == 0)
+ return (B_FALSE);
+
+ dsl_dataset_t *origin;
+ error = dsl_dataset_hold_obj(dp,
+ dsl_dir_phys(later->ds_dir)->dd_origin_obj, FTAG, &origin);
+ if (error != 0)
+ return (B_FALSE);
+ if (dsl_dataset_phys(origin)->ds_creation_txg == earlier_txg &&
+ origin->ds_dir == earlier->ds_dir) {
+ dsl_dataset_rele(origin, FTAG);
+ return (B_TRUE);
+ }
+ ret = dsl_dataset_is_before(origin, earlier, earlier_txg);
+ dsl_dataset_rele(origin, FTAG);
+ return (ret);
+}
+
+void
+dsl_dataset_zapify(dsl_dataset_t *ds, dmu_tx_t *tx)
+{
+ objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
+ dmu_object_zapify(mos, ds->ds_object, DMU_OT_DSL_DATASET, tx);
+}
+
+boolean_t
+dsl_dataset_is_zapified(dsl_dataset_t *ds)
+{
+ dmu_object_info_t doi;
+
+ dmu_object_info_from_db(ds->ds_dbuf, &doi);
+ return (doi.doi_type == DMU_OTN_ZAP_METADATA);
+}
+
+boolean_t
+dsl_dataset_has_resume_receive_state(dsl_dataset_t *ds)
+{
+ return (dsl_dataset_is_zapified(ds) &&
+ zap_contains(ds->ds_dir->dd_pool->dp_meta_objset,
+ ds->ds_object, DS_FIELD_RESUME_TOGUID) == 0);
+}
+
+uint64_t
+dsl_dataset_get_remap_deadlist_object(dsl_dataset_t *ds)
+{
+ uint64_t remap_deadlist_obj;
+ int err;
+
+ if (!dsl_dataset_is_zapified(ds))
+ return (0);
+
+ err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset, ds->ds_object,
+ DS_FIELD_REMAP_DEADLIST, sizeof (remap_deadlist_obj), 1,
+ &remap_deadlist_obj);
+
+ if (err != 0) {
+ VERIFY3S(err, ==, ENOENT);
+ return (0);
+ }
+
+ ASSERT(remap_deadlist_obj != 0);
+ return (remap_deadlist_obj);
+}
+
+boolean_t
+dsl_dataset_remap_deadlist_exists(dsl_dataset_t *ds)
+{
+ EQUIV(dsl_deadlist_is_open(&ds->ds_remap_deadlist),
+ dsl_dataset_get_remap_deadlist_object(ds) != 0);
+ return (dsl_deadlist_is_open(&ds->ds_remap_deadlist));
+}
+
+static void
+dsl_dataset_set_remap_deadlist_object(dsl_dataset_t *ds, uint64_t obj,
+ dmu_tx_t *tx)
+{
+ ASSERT(obj != 0);
+ dsl_dataset_zapify(ds, tx);
+ VERIFY0(zap_add(ds->ds_dir->dd_pool->dp_meta_objset, ds->ds_object,
+ DS_FIELD_REMAP_DEADLIST, sizeof (obj), 1, &obj, tx));
+}
+
+static void
+dsl_dataset_unset_remap_deadlist_object(dsl_dataset_t *ds, dmu_tx_t *tx)
+{
+ VERIFY0(zap_remove(ds->ds_dir->dd_pool->dp_meta_objset,
+ ds->ds_object, DS_FIELD_REMAP_DEADLIST, tx));
+}
+
+void
+dsl_dataset_destroy_remap_deadlist(dsl_dataset_t *ds, dmu_tx_t *tx)
+{
+ uint64_t remap_deadlist_object;
+ spa_t *spa = ds->ds_dir->dd_pool->dp_spa;
+
+ ASSERT(dmu_tx_is_syncing(tx));
+ ASSERT(dsl_dataset_remap_deadlist_exists(ds));
+
+ remap_deadlist_object = ds->ds_remap_deadlist.dl_object;
+ dsl_deadlist_close(&ds->ds_remap_deadlist);
+ dsl_deadlist_free(spa_meta_objset(spa), remap_deadlist_object, tx);
+ dsl_dataset_unset_remap_deadlist_object(ds, tx);
+ spa_feature_decr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
+}
+
+void
+dsl_dataset_create_remap_deadlist(dsl_dataset_t *ds, dmu_tx_t *tx)
+{
+ uint64_t remap_deadlist_obj;
+ spa_t *spa = ds->ds_dir->dd_pool->dp_spa;
+
+ ASSERT(dmu_tx_is_syncing(tx));
+ ASSERT(MUTEX_HELD(&ds->ds_remap_deadlist_lock));
+ /*
+ * Currently we only create remap deadlists when there are indirect
+ * vdevs with referenced mappings.
+ */
+ ASSERT(spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REMOVAL));
+
+ remap_deadlist_obj = dsl_deadlist_clone(
+ &ds->ds_deadlist, UINT64_MAX,
+ dsl_dataset_phys(ds)->ds_prev_snap_obj, tx);
+ dsl_dataset_set_remap_deadlist_object(ds,
+ remap_deadlist_obj, tx);
+ dsl_deadlist_open(&ds->ds_remap_deadlist, spa_meta_objset(spa),
+ remap_deadlist_obj);
+ spa_feature_incr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
+}
+
+void
+dsl_dataset_activate_redaction(dsl_dataset_t *ds, uint64_t *redact_snaps,
+ uint64_t num_redact_snaps, dmu_tx_t *tx)
+{
+ uint64_t dsobj = ds->ds_object;
+ struct feature_type_uint64_array_arg *ftuaa =
+ kmem_zalloc(sizeof (*ftuaa), KM_SLEEP);
+ ftuaa->length = (int64_t)num_redact_snaps;
+ if (num_redact_snaps > 0) {
+ ftuaa->array = kmem_alloc(num_redact_snaps * sizeof (uint64_t),
+ KM_SLEEP);
+ bcopy(redact_snaps, ftuaa->array, num_redact_snaps *
+ sizeof (uint64_t));
+ }
+ dsl_dataset_activate_feature(dsobj, SPA_FEATURE_REDACTED_DATASETS,
+ ftuaa, tx);
+ ds->ds_feature[SPA_FEATURE_REDACTED_DATASETS] = ftuaa;
+}
+
+/* BEGIN CSTYLED */
+#if defined(_LP64)
+#define RECORDSIZE_PERM ZMOD_RW
+#else
+/* Limited to 1M on 32-bit platforms due to lack of virtual address space */
+#define RECORDSIZE_PERM ZMOD_RD
+#endif
+ZFS_MODULE_PARAM(zfs, zfs_, max_recordsize, INT, RECORDSIZE_PERM,
+ "Max allowed record size");
+
+ZFS_MODULE_PARAM(zfs, zfs_, allow_redacted_dataset_mount, INT, ZMOD_RW,
+ "Allow mounting of redacted datasets");
+/* END CSTYLED */
+
+EXPORT_SYMBOL(dsl_dataset_hold);
+EXPORT_SYMBOL(dsl_dataset_hold_flags);
+EXPORT_SYMBOL(dsl_dataset_hold_obj);
+EXPORT_SYMBOL(dsl_dataset_hold_obj_flags);
+EXPORT_SYMBOL(dsl_dataset_own);
+EXPORT_SYMBOL(dsl_dataset_own_obj);
+EXPORT_SYMBOL(dsl_dataset_name);
+EXPORT_SYMBOL(dsl_dataset_rele);
+EXPORT_SYMBOL(dsl_dataset_rele_flags);
+EXPORT_SYMBOL(dsl_dataset_disown);
+EXPORT_SYMBOL(dsl_dataset_tryown);
+EXPORT_SYMBOL(dsl_dataset_create_sync);
+EXPORT_SYMBOL(dsl_dataset_create_sync_dd);
+EXPORT_SYMBOL(dsl_dataset_snapshot_check);
+EXPORT_SYMBOL(dsl_dataset_snapshot_sync);
+EXPORT_SYMBOL(dsl_dataset_promote);
+EXPORT_SYMBOL(dsl_dataset_user_hold);
+EXPORT_SYMBOL(dsl_dataset_user_release);
+EXPORT_SYMBOL(dsl_dataset_get_holds);
+EXPORT_SYMBOL(dsl_dataset_get_blkptr);
+EXPORT_SYMBOL(dsl_dataset_get_spa);
+EXPORT_SYMBOL(dsl_dataset_modified_since_snap);
+EXPORT_SYMBOL(dsl_dataset_space_written);
+EXPORT_SYMBOL(dsl_dataset_space_wouldfree);
+EXPORT_SYMBOL(dsl_dataset_sync);
+EXPORT_SYMBOL(dsl_dataset_block_born);
+EXPORT_SYMBOL(dsl_dataset_block_kill);
+EXPORT_SYMBOL(dsl_dataset_dirty);
+EXPORT_SYMBOL(dsl_dataset_stats);
+EXPORT_SYMBOL(dsl_dataset_fast_stat);
+EXPORT_SYMBOL(dsl_dataset_space);
+EXPORT_SYMBOL(dsl_dataset_fsid_guid);
+EXPORT_SYMBOL(dsl_dsobj_to_dsname);
+EXPORT_SYMBOL(dsl_dataset_check_quota);
+EXPORT_SYMBOL(dsl_dataset_clone_swap_check_impl);
+EXPORT_SYMBOL(dsl_dataset_clone_swap_sync_impl);
diff --git a/sys/contrib/openzfs/module/zfs/dsl_deadlist.c b/sys/contrib/openzfs/module/zfs/dsl_deadlist.c
new file mode 100644
index 000000000000..bad2d56eefdd
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/dsl_deadlist.c
@@ -0,0 +1,1012 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2019 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ */
+
+#include <sys/dmu.h>
+#include <sys/zap.h>
+#include <sys/zfs_context.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_dataset.h>
+
+/*
+ * Deadlist concurrency:
+ *
+ * Deadlists can only be modified from the syncing thread.
+ *
+ * Except for dsl_deadlist_insert(), it can only be modified with the
+ * dp_config_rwlock held with RW_WRITER.
+ *
+ * The accessors (dsl_deadlist_space() and dsl_deadlist_space_range()) can
+ * be called concurrently, from open context, with the dp_config_rwlock held
+ * with RW_READER.
+ *
+ * Therefore, we only need to provide locking between dsl_deadlist_insert() and
+ * the accessors, protecting:
+ * dl_phys->dl_used,comp,uncomp
+ * and protecting the dl_tree from being loaded.
+ * The locking is provided by dl_lock. Note that locking on the bpobj_t
+ * provides its own locking, and dl_oldfmt is immutable.
+ */
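+/*
+ * A minimal sketch of the resulting locking contract, assuming a
+ * hypothetical open-context caller that holds the pool configuration lock
+ * as a reader, as described above:
+ *
+ *	dsl_pool_config_enter(dp, FTAG);
+ *	dsl_deadlist_space(&ds->ds_deadlist, &used, &comp, &uncomp);
+ *	dsl_pool_config_exit(dp, FTAG);
+ *
+ * dsl_deadlist_insert() runs only in syncing context and takes dl_lock
+ * internally, so it serializes with the accessors on dl_lock alone.
+ */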
+
+/*
+ * Livelist Overview
+ * ================
+ *
+ * Livelists use the same 'deadlist_t' struct as deadlists and are also used
+ * to track blkptrs over the lifetime of a dataset. Livelists, however, belong
+ * to clones and track the blkptrs that are clone-specific (were born after
+ * the clone's creation). The exception is embedded block pointers which are
+ * not included in livelists because they do not need to be freed.
+ *
+ * When it comes time to delete the clone, the livelist provides a quick
+ * reference as to what needs to be freed. For this reason, livelists also track
+ * when clone-specific blkptrs are freed before deletion to prevent double
+ * frees. Each blkptr in a livelist is marked as a FREE or an ALLOC and the
+ * deletion algorithm iterates backwards over the livelist, matching
+ * FREE/ALLOC pairs and then freeing those ALLOCs which remain. livelists
+ * are also updated in the case when blkptrs are remapped: the old version
+ * of the blkptr is cancelled out with a FREE and the new version is tracked
+ * with an ALLOC.
+ *
+ * To bound the amount of memory required for deletion, livelists over a
+ * certain size are spread over multiple entries. Entries are grouped by
+ * birth txg so we can be sure the ALLOC/FREE pair for a given blkptr will
+ * be in the same entry. This allows us to delete livelists incrementally
+ * over multiple syncs, one entry at a time.
+ *
+ * During the lifetime of the clone, livelists can get extremely large.
+ * Their size is managed by periodic condensing (preemptively cancelling out
+ * FREE/ALLOC pairs). Livelists are disabled when a clone is promoted or when
+ * the shared space between the clone and its origin is so small that it
+ * doesn't make sense to use livelists anymore.
+ */
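+/*
+ * Illustrative example (hypothetical blocks X and Y): if the clone writes
+ * block X (ALLOC X) and later overwrites it with Y (FREE X, ALLOC Y), the
+ * backward scan pairs FREE X with ALLOC X and cancels them, leaving only
+ * ALLOC Y. At deletion time only Y is freed; X is not freed a second time,
+ * since it was already freed while the clone was alive.
+ */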
+
+/*
+ * The threshold sublist size at which we create a new sub-livelist for the
+ * next txg. However, since blkptrs of the same transaction group must be in
+ * the same sub-list, the actual sublist size may exceed this. When picking the
+ * size we had to balance the fact that larger sublists mean fewer sublists
+ * (decreasing the cost of insertion) against the consideration that sublists
+ * will be loaded into memory and shouldn't take up an inordinate amount of
+ * space. We settled on ~500000 entries, corresponding to roughly 128M.
+ */
+unsigned long zfs_livelist_max_entries = 500000;
+
+/*
+ * We can approximate how much of a performance gain a livelist will give us
+ * based on the percentage of blocks shared between the clone and its origin.
+ * 0 percent shared means that the clone has completely diverged and that the
+ * old method is maximally effective: every read from the block tree will
+ * result in lots of frees. Livelists give us gains when they track blocks
+ * scattered across the tree, when one read in the old method might only
+ * result in a few frees. Once the clone has been overwritten enough,
+ * writes are no longer sparse and we'll no longer get much of a benefit from
+ * tracking them with a livelist. We chose a lower limit of 75 percent shared
+ * (25 percent overwritten). This means that 1/4 of all block pointers will be
+ * freed (e.g. each read frees 256, out of a max of 1024) so we expect livelists
+ * to make deletion 4x faster. Once the amount of shared space drops below this
+ * threshold, the clone will revert to the old deletion method.
+ */
+int zfs_livelist_min_percent_shared = 75;
+
+static int
+dsl_deadlist_compare(const void *arg1, const void *arg2)
+{
+ const dsl_deadlist_entry_t *dle1 = arg1;
+ const dsl_deadlist_entry_t *dle2 = arg2;
+
+ return (TREE_CMP(dle1->dle_mintxg, dle2->dle_mintxg));
+}
+
+static int
+dsl_deadlist_cache_compare(const void *arg1, const void *arg2)
+{
+ const dsl_deadlist_cache_entry_t *dlce1 = arg1;
+ const dsl_deadlist_cache_entry_t *dlce2 = arg2;
+
+ return (TREE_CMP(dlce1->dlce_mintxg, dlce2->dlce_mintxg));
+}
+
+static void
+dsl_deadlist_load_tree(dsl_deadlist_t *dl)
+{
+ zap_cursor_t zc;
+ zap_attribute_t za;
+ int error;
+
+ ASSERT(MUTEX_HELD(&dl->dl_lock));
+
+ ASSERT(!dl->dl_oldfmt);
+ if (dl->dl_havecache) {
+ /*
+ * After loading the tree, the caller may modify the tree,
+ * e.g. to add or remove nodes, or to make a node no longer
+ * refer to the empty_bpobj. These changes would make the
+ * dl_cache incorrect. Therefore we discard the cache here,
+ * so that it can't become incorrect.
+ */
+ dsl_deadlist_cache_entry_t *dlce;
+ void *cookie = NULL;
+ while ((dlce = avl_destroy_nodes(&dl->dl_cache, &cookie))
+ != NULL) {
+ kmem_free(dlce, sizeof (*dlce));
+ }
+ avl_destroy(&dl->dl_cache);
+ dl->dl_havecache = B_FALSE;
+ }
+ if (dl->dl_havetree)
+ return;
+
+ avl_create(&dl->dl_tree, dsl_deadlist_compare,
+ sizeof (dsl_deadlist_entry_t),
+ offsetof(dsl_deadlist_entry_t, dle_node));
+ for (zap_cursor_init(&zc, dl->dl_os, dl->dl_object);
+ (error = zap_cursor_retrieve(&zc, &za)) == 0;
+ zap_cursor_advance(&zc)) {
+ dsl_deadlist_entry_t *dle = kmem_alloc(sizeof (*dle), KM_SLEEP);
+ dle->dle_mintxg = zfs_strtonum(za.za_name, NULL);
+
+ /*
+ * Prefetch all the bpobj's so that we do that i/o
+ * in parallel. Then open them all in a second pass.
+ */
+ dle->dle_bpobj.bpo_object = za.za_first_integer;
+ dmu_prefetch(dl->dl_os, dle->dle_bpobj.bpo_object,
+ 0, 0, 0, ZIO_PRIORITY_SYNC_READ);
+
+ avl_add(&dl->dl_tree, dle);
+ }
+ VERIFY3U(error, ==, ENOENT);
+ zap_cursor_fini(&zc);
+
+ for (dsl_deadlist_entry_t *dle = avl_first(&dl->dl_tree);
+ dle != NULL; dle = AVL_NEXT(&dl->dl_tree, dle)) {
+ VERIFY0(bpobj_open(&dle->dle_bpobj, dl->dl_os,
+ dle->dle_bpobj.bpo_object));
+ }
+ dl->dl_havetree = B_TRUE;
+}
+
+/*
+ * Load only the non-empty bpobj's into the dl_cache. The cache is an analog
+ * of the dl_tree, but contains only non-empty_bpobj nodes from the ZAP. It
+ * is used only for gathering space statistics. The dl_cache has two
+ * advantages over the dl_tree:
+ *
+ * 1. Loading the dl_cache is ~5x faster than loading the dl_tree (if it's
+ * mostly empty_bpobj's), due to less CPU overhead to open the empty_bpobj
+ * many times and to inquire about its (zero) space stats many times.
+ *
+ * 2. The dl_cache uses less memory than the dl_tree. We only need to load
+ * the dl_tree of snapshots when deleting a snapshot, after which we free the
+ * dl_tree with dsl_deadlist_discard_tree
+ */
+static void
+dsl_deadlist_load_cache(dsl_deadlist_t *dl)
+{
+ zap_cursor_t zc;
+ zap_attribute_t za;
+ int error;
+
+ ASSERT(MUTEX_HELD(&dl->dl_lock));
+
+ ASSERT(!dl->dl_oldfmt);
+ if (dl->dl_havecache)
+ return;
+
+ uint64_t empty_bpobj = dmu_objset_pool(dl->dl_os)->dp_empty_bpobj;
+
+ avl_create(&dl->dl_cache, dsl_deadlist_cache_compare,
+ sizeof (dsl_deadlist_cache_entry_t),
+ offsetof(dsl_deadlist_cache_entry_t, dlce_node));
+ for (zap_cursor_init(&zc, dl->dl_os, dl->dl_object);
+ (error = zap_cursor_retrieve(&zc, &za)) == 0;
+ zap_cursor_advance(&zc)) {
+ if (za.za_first_integer == empty_bpobj)
+ continue;
+ dsl_deadlist_cache_entry_t *dlce =
+ kmem_zalloc(sizeof (*dlce), KM_SLEEP);
+ dlce->dlce_mintxg = zfs_strtonum(za.za_name, NULL);
+
+ /*
+ * Prefetch all the bpobj's so that we do that i/o
+ * in parallel. Then open them all in a second pass.
+ */
+ dlce->dlce_bpobj = za.za_first_integer;
+ dmu_prefetch(dl->dl_os, dlce->dlce_bpobj,
+ 0, 0, 0, ZIO_PRIORITY_SYNC_READ);
+ avl_add(&dl->dl_cache, dlce);
+ }
+ VERIFY3U(error, ==, ENOENT);
+ zap_cursor_fini(&zc);
+
+ for (dsl_deadlist_cache_entry_t *dlce = avl_first(&dl->dl_cache);
+ dlce != NULL; dlce = AVL_NEXT(&dl->dl_cache, dlce)) {
+ bpobj_t bpo;
+ VERIFY0(bpobj_open(&bpo, dl->dl_os, dlce->dlce_bpobj));
+
+ VERIFY0(bpobj_space(&bpo,
+ &dlce->dlce_bytes, &dlce->dlce_comp, &dlce->dlce_uncomp));
+ bpobj_close(&bpo);
+ }
+ dl->dl_havecache = B_TRUE;
+}
+
+/*
+ * Discard the tree to save memory.
+ */
+void
+dsl_deadlist_discard_tree(dsl_deadlist_t *dl)
+{
+ mutex_enter(&dl->dl_lock);
+
+ if (!dl->dl_havetree) {
+ mutex_exit(&dl->dl_lock);
+ return;
+ }
+ dsl_deadlist_entry_t *dle;
+ void *cookie = NULL;
+ while ((dle = avl_destroy_nodes(&dl->dl_tree, &cookie)) != NULL) {
+ bpobj_close(&dle->dle_bpobj);
+ kmem_free(dle, sizeof (*dle));
+ }
+ avl_destroy(&dl->dl_tree);
+
+ dl->dl_havetree = B_FALSE;
+ mutex_exit(&dl->dl_lock);
+}
+
+void
+dsl_deadlist_iterate(dsl_deadlist_t *dl, deadlist_iter_t func, void *args)
+{
+ dsl_deadlist_entry_t *dle;
+
+ ASSERT(dsl_deadlist_is_open(dl));
+
+ mutex_enter(&dl->dl_lock);
+ dsl_deadlist_load_tree(dl);
+ mutex_exit(&dl->dl_lock);
+ for (dle = avl_first(&dl->dl_tree); dle != NULL;
+ dle = AVL_NEXT(&dl->dl_tree, dle)) {
+ if (func(args, dle) != 0)
+ break;
+ }
+}
+
+void
+dsl_deadlist_open(dsl_deadlist_t *dl, objset_t *os, uint64_t object)
+{
+ dmu_object_info_t doi;
+
+ ASSERT(!dsl_deadlist_is_open(dl));
+
+ mutex_init(&dl->dl_lock, NULL, MUTEX_DEFAULT, NULL);
+ dl->dl_os = os;
+ dl->dl_object = object;
+ VERIFY0(dmu_bonus_hold(os, object, dl, &dl->dl_dbuf));
+ dmu_object_info_from_db(dl->dl_dbuf, &doi);
+ if (doi.doi_type == DMU_OT_BPOBJ) {
+ dmu_buf_rele(dl->dl_dbuf, dl);
+ dl->dl_dbuf = NULL;
+ dl->dl_oldfmt = B_TRUE;
+ VERIFY0(bpobj_open(&dl->dl_bpobj, os, object));
+ return;
+ }
+
+ dl->dl_oldfmt = B_FALSE;
+ dl->dl_phys = dl->dl_dbuf->db_data;
+ dl->dl_havetree = B_FALSE;
+ dl->dl_havecache = B_FALSE;
+}
+
+boolean_t
+dsl_deadlist_is_open(dsl_deadlist_t *dl)
+{
+ return (dl->dl_os != NULL);
+}
+
+void
+dsl_deadlist_close(dsl_deadlist_t *dl)
+{
+ ASSERT(dsl_deadlist_is_open(dl));
+ mutex_destroy(&dl->dl_lock);
+
+ if (dl->dl_oldfmt) {
+ dl->dl_oldfmt = B_FALSE;
+ bpobj_close(&dl->dl_bpobj);
+ dl->dl_os = NULL;
+ dl->dl_object = 0;
+ return;
+ }
+
+ if (dl->dl_havetree) {
+ dsl_deadlist_entry_t *dle;
+ void *cookie = NULL;
+ while ((dle = avl_destroy_nodes(&dl->dl_tree, &cookie))
+ != NULL) {
+ bpobj_close(&dle->dle_bpobj);
+ kmem_free(dle, sizeof (*dle));
+ }
+ avl_destroy(&dl->dl_tree);
+ }
+ if (dl->dl_havecache) {
+ dsl_deadlist_cache_entry_t *dlce;
+ void *cookie = NULL;
+ while ((dlce = avl_destroy_nodes(&dl->dl_cache, &cookie))
+ != NULL) {
+ kmem_free(dlce, sizeof (*dlce));
+ }
+ avl_destroy(&dl->dl_cache);
+ }
+ dmu_buf_rele(dl->dl_dbuf, dl);
+ dl->dl_dbuf = NULL;
+ dl->dl_phys = NULL;
+ dl->dl_os = NULL;
+ dl->dl_object = 0;
+}
+
+uint64_t
+dsl_deadlist_alloc(objset_t *os, dmu_tx_t *tx)
+{
+ if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_DEADLISTS)
+ return (bpobj_alloc(os, SPA_OLD_MAXBLOCKSIZE, tx));
+ return (zap_create(os, DMU_OT_DEADLIST, DMU_OT_DEADLIST_HDR,
+ sizeof (dsl_deadlist_phys_t), tx));
+}
+
+void
+dsl_deadlist_free(objset_t *os, uint64_t dlobj, dmu_tx_t *tx)
+{
+ dmu_object_info_t doi;
+ zap_cursor_t zc;
+ zap_attribute_t za;
+ int error;
+
+ VERIFY0(dmu_object_info(os, dlobj, &doi));
+ if (doi.doi_type == DMU_OT_BPOBJ) {
+ bpobj_free(os, dlobj, tx);
+ return;
+ }
+
+ for (zap_cursor_init(&zc, os, dlobj);
+ (error = zap_cursor_retrieve(&zc, &za)) == 0;
+ zap_cursor_advance(&zc)) {
+ uint64_t obj = za.za_first_integer;
+ if (obj == dmu_objset_pool(os)->dp_empty_bpobj)
+ bpobj_decr_empty(os, tx);
+ else
+ bpobj_free(os, obj, tx);
+ }
+ VERIFY3U(error, ==, ENOENT);
+ zap_cursor_fini(&zc);
+ VERIFY0(dmu_object_free(os, dlobj, tx));
+}
+
+static void
+dle_enqueue(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle,
+ const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx)
+{
+ ASSERT(MUTEX_HELD(&dl->dl_lock));
+ if (dle->dle_bpobj.bpo_object ==
+ dmu_objset_pool(dl->dl_os)->dp_empty_bpobj) {
+ uint64_t obj = bpobj_alloc(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx);
+ bpobj_close(&dle->dle_bpobj);
+ bpobj_decr_empty(dl->dl_os, tx);
+ VERIFY0(bpobj_open(&dle->dle_bpobj, dl->dl_os, obj));
+ VERIFY0(zap_update_int_key(dl->dl_os, dl->dl_object,
+ dle->dle_mintxg, obj, tx));
+ }
+ bpobj_enqueue(&dle->dle_bpobj, bp, bp_freed, tx);
+}
+
+static void
+dle_enqueue_subobj(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle,
+ uint64_t obj, dmu_tx_t *tx)
+{
+ ASSERT(MUTEX_HELD(&dl->dl_lock));
+ if (dle->dle_bpobj.bpo_object !=
+ dmu_objset_pool(dl->dl_os)->dp_empty_bpobj) {
+ bpobj_enqueue_subobj(&dle->dle_bpobj, obj, tx);
+ } else {
+ bpobj_close(&dle->dle_bpobj);
+ bpobj_decr_empty(dl->dl_os, tx);
+ VERIFY0(bpobj_open(&dle->dle_bpobj, dl->dl_os, obj));
+ VERIFY0(zap_update_int_key(dl->dl_os, dl->dl_object,
+ dle->dle_mintxg, obj, tx));
+ }
+}
+
+void
+dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, boolean_t bp_freed,
+ dmu_tx_t *tx)
+{
+ dsl_deadlist_entry_t dle_tofind;
+ dsl_deadlist_entry_t *dle;
+ avl_index_t where;
+
+ if (dl->dl_oldfmt) {
+ bpobj_enqueue(&dl->dl_bpobj, bp, bp_freed, tx);
+ return;
+ }
+
+ mutex_enter(&dl->dl_lock);
+ dsl_deadlist_load_tree(dl);
+
+ dmu_buf_will_dirty(dl->dl_dbuf, tx);
+
+ int sign = bp_freed ? -1 : +1;
+ dl->dl_phys->dl_used +=
+ sign * bp_get_dsize_sync(dmu_objset_spa(dl->dl_os), bp);
+ dl->dl_phys->dl_comp += sign * BP_GET_PSIZE(bp);
+ dl->dl_phys->dl_uncomp += sign * BP_GET_UCSIZE(bp);
+
+ dle_tofind.dle_mintxg = bp->blk_birth;
+ dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
+ if (dle == NULL)
+ dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE);
+ else
+ dle = AVL_PREV(&dl->dl_tree, dle);
+
+ if (dle == NULL) {
+ zfs_panic_recover("blkptr at %p has invalid BLK_BIRTH %llu",
+ bp, (longlong_t)bp->blk_birth);
+ dle = avl_first(&dl->dl_tree);
+ }
+
+ ASSERT3P(dle, !=, NULL);
+ dle_enqueue(dl, dle, bp, bp_freed, tx);
+ mutex_exit(&dl->dl_lock);
+}
+
+int
+dsl_deadlist_insert_alloc_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+{
+ dsl_deadlist_t *dl = arg;
+ dsl_deadlist_insert(dl, bp, B_FALSE, tx);
+ return (0);
+}
+
+int
+dsl_deadlist_insert_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+{
+ dsl_deadlist_t *dl = arg;
+ dsl_deadlist_insert(dl, bp, B_TRUE, tx);
+ return (0);
+}
+
+/*
+ * Insert new key in deadlist, which must be > all current entries.
+ * mintxg is not inclusive.
+ */
+void
+dsl_deadlist_add_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx)
+{
+ uint64_t obj;
+ dsl_deadlist_entry_t *dle;
+
+ if (dl->dl_oldfmt)
+ return;
+
+ dle = kmem_alloc(sizeof (*dle), KM_SLEEP);
+ dle->dle_mintxg = mintxg;
+
+ mutex_enter(&dl->dl_lock);
+ dsl_deadlist_load_tree(dl);
+
+ obj = bpobj_alloc_empty(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx);
+ VERIFY0(bpobj_open(&dle->dle_bpobj, dl->dl_os, obj));
+ avl_add(&dl->dl_tree, dle);
+
+ VERIFY0(zap_add_int_key(dl->dl_os, dl->dl_object,
+ mintxg, obj, tx));
+ mutex_exit(&dl->dl_lock);
+}
+
+/*
+ * Remove this key, merging its entries into the previous key.
+ */
+void
+dsl_deadlist_remove_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx)
+{
+ dsl_deadlist_entry_t dle_tofind;
+ dsl_deadlist_entry_t *dle, *dle_prev;
+
+ if (dl->dl_oldfmt)
+ return;
+ mutex_enter(&dl->dl_lock);
+ dsl_deadlist_load_tree(dl);
+
+ dle_tofind.dle_mintxg = mintxg;
+ dle = avl_find(&dl->dl_tree, &dle_tofind, NULL);
+ ASSERT3P(dle, !=, NULL);
+ dle_prev = AVL_PREV(&dl->dl_tree, dle);
+
+ dle_enqueue_subobj(dl, dle_prev, dle->dle_bpobj.bpo_object, tx);
+
+ avl_remove(&dl->dl_tree, dle);
+ bpobj_close(&dle->dle_bpobj);
+ kmem_free(dle, sizeof (*dle));
+
+ VERIFY0(zap_remove_int(dl->dl_os, dl->dl_object, mintxg, tx));
+ mutex_exit(&dl->dl_lock);
+}
+
+/*
+ * Remove a deadlist entry and all of its contents by removing the entry from
+ * the deadlist's avl tree, freeing the entry's bpobj and adjusting the
+ * deadlist's space accounting accordingly.
+ */
+void
+dsl_deadlist_remove_entry(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx)
+{
+ uint64_t used, comp, uncomp;
+ dsl_deadlist_entry_t dle_tofind;
+ dsl_deadlist_entry_t *dle;
+ objset_t *os = dl->dl_os;
+
+ if (dl->dl_oldfmt)
+ return;
+
+ mutex_enter(&dl->dl_lock);
+ dsl_deadlist_load_tree(dl);
+
+ dle_tofind.dle_mintxg = mintxg;
+ dle = avl_find(&dl->dl_tree, &dle_tofind, NULL);
+ VERIFY3P(dle, !=, NULL);
+
+ avl_remove(&dl->dl_tree, dle);
+ VERIFY0(zap_remove_int(os, dl->dl_object, mintxg, tx));
+ VERIFY0(bpobj_space(&dle->dle_bpobj, &used, &comp, &uncomp));
+ dmu_buf_will_dirty(dl->dl_dbuf, tx);
+ dl->dl_phys->dl_used -= used;
+ dl->dl_phys->dl_comp -= comp;
+ dl->dl_phys->dl_uncomp -= uncomp;
+ if (dle->dle_bpobj.bpo_object == dmu_objset_pool(os)->dp_empty_bpobj) {
+ bpobj_decr_empty(os, tx);
+ } else {
+ bpobj_free(os, dle->dle_bpobj.bpo_object, tx);
+ }
+ bpobj_close(&dle->dle_bpobj);
+ kmem_free(dle, sizeof (*dle));
+ mutex_exit(&dl->dl_lock);
+}
+
+/*
+ * Clear out the contents of a deadlist_entry by freeing its bpobj,
+ * replacing it with an empty bpobj and adjusting the deadlist's
+ * space accounting
+ */
+void
+dsl_deadlist_clear_entry(dsl_deadlist_entry_t *dle, dsl_deadlist_t *dl,
+ dmu_tx_t *tx)
+{
+ uint64_t new_obj, used, comp, uncomp;
+ objset_t *os = dl->dl_os;
+
+ mutex_enter(&dl->dl_lock);
+ VERIFY0(zap_remove_int(os, dl->dl_object, dle->dle_mintxg, tx));
+ VERIFY0(bpobj_space(&dle->dle_bpobj, &used, &comp, &uncomp));
+ dmu_buf_will_dirty(dl->dl_dbuf, tx);
+ dl->dl_phys->dl_used -= used;
+ dl->dl_phys->dl_comp -= comp;
+ dl->dl_phys->dl_uncomp -= uncomp;
+ if (dle->dle_bpobj.bpo_object == dmu_objset_pool(os)->dp_empty_bpobj)
+ bpobj_decr_empty(os, tx);
+ else
+ bpobj_free(os, dle->dle_bpobj.bpo_object, tx);
+ bpobj_close(&dle->dle_bpobj);
+ new_obj = bpobj_alloc_empty(os, SPA_OLD_MAXBLOCKSIZE, tx);
+ VERIFY0(bpobj_open(&dle->dle_bpobj, os, new_obj));
+ VERIFY0(zap_add_int_key(os, dl->dl_object, dle->dle_mintxg,
+ new_obj, tx));
+ ASSERT(bpobj_is_empty(&dle->dle_bpobj));
+ mutex_exit(&dl->dl_lock);
+}
+
+/*
+ * Return the first entry in deadlist's avl tree
+ */
+dsl_deadlist_entry_t *
+dsl_deadlist_first(dsl_deadlist_t *dl)
+{
+ dsl_deadlist_entry_t *dle;
+
+ mutex_enter(&dl->dl_lock);
+ dsl_deadlist_load_tree(dl);
+ dle = avl_first(&dl->dl_tree);
+ mutex_exit(&dl->dl_lock);
+
+ return (dle);
+}
+
+/*
+ * Return the last entry in deadlist's avl tree
+ */
+dsl_deadlist_entry_t *
+dsl_deadlist_last(dsl_deadlist_t *dl)
+{
+ dsl_deadlist_entry_t *dle;
+
+ mutex_enter(&dl->dl_lock);
+ dsl_deadlist_load_tree(dl);
+ dle = avl_last(&dl->dl_tree);
+ mutex_exit(&dl->dl_lock);
+
+ return (dle);
+}
+
+/*
+ * Walk ds's snapshots to regenerate the ZAP & AVL.
+ */
+static void
+dsl_deadlist_regenerate(objset_t *os, uint64_t dlobj,
+ uint64_t mrs_obj, dmu_tx_t *tx)
+{
+ dsl_deadlist_t dl = { 0 };
+ dsl_pool_t *dp = dmu_objset_pool(os);
+
+ dsl_deadlist_open(&dl, os, dlobj);
+ if (dl.dl_oldfmt) {
+ dsl_deadlist_close(&dl);
+ return;
+ }
+
+ while (mrs_obj != 0) {
+ dsl_dataset_t *ds;
+ VERIFY0(dsl_dataset_hold_obj(dp, mrs_obj, FTAG, &ds));
+ dsl_deadlist_add_key(&dl,
+ dsl_dataset_phys(ds)->ds_prev_snap_txg, tx);
+ mrs_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
+ dsl_dataset_rele(ds, FTAG);
+ }
+ dsl_deadlist_close(&dl);
+}
+
+uint64_t
+dsl_deadlist_clone(dsl_deadlist_t *dl, uint64_t maxtxg,
+ uint64_t mrs_obj, dmu_tx_t *tx)
+{
+ dsl_deadlist_entry_t *dle;
+ uint64_t newobj;
+
+ newobj = dsl_deadlist_alloc(dl->dl_os, tx);
+
+ if (dl->dl_oldfmt) {
+ dsl_deadlist_regenerate(dl->dl_os, newobj, mrs_obj, tx);
+ return (newobj);
+ }
+
+ mutex_enter(&dl->dl_lock);
+ dsl_deadlist_load_tree(dl);
+
+ for (dle = avl_first(&dl->dl_tree); dle;
+ dle = AVL_NEXT(&dl->dl_tree, dle)) {
+ uint64_t obj;
+
+ if (dle->dle_mintxg >= maxtxg)
+ break;
+
+ obj = bpobj_alloc_empty(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx);
+ VERIFY0(zap_add_int_key(dl->dl_os, newobj,
+ dle->dle_mintxg, obj, tx));
+ }
+ mutex_exit(&dl->dl_lock);
+ return (newobj);
+}
+
+void
+dsl_deadlist_space(dsl_deadlist_t *dl,
+ uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
+{
+ ASSERT(dsl_deadlist_is_open(dl));
+ if (dl->dl_oldfmt) {
+ VERIFY0(bpobj_space(&dl->dl_bpobj,
+ usedp, compp, uncompp));
+ return;
+ }
+
+ mutex_enter(&dl->dl_lock);
+ *usedp = dl->dl_phys->dl_used;
+ *compp = dl->dl_phys->dl_comp;
+ *uncompp = dl->dl_phys->dl_uncomp;
+ mutex_exit(&dl->dl_lock);
+}
+
+/*
+ * Return the space used in the range (mintxg, maxtxg].
+ * Includes maxtxg, does not include mintxg.
+ * mintxg and maxtxg must both be keys in the deadlist (unless maxtxg is
+ * UINT64_MAX).
+ */
+void
+dsl_deadlist_space_range(dsl_deadlist_t *dl, uint64_t mintxg, uint64_t maxtxg,
+ uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
+{
+ dsl_deadlist_cache_entry_t *dlce;
+ dsl_deadlist_cache_entry_t dlce_tofind;
+ avl_index_t where;
+
+ if (dl->dl_oldfmt) {
+ VERIFY0(bpobj_space_range(&dl->dl_bpobj,
+ mintxg, maxtxg, usedp, compp, uncompp));
+ return;
+ }
+
+ *usedp = *compp = *uncompp = 0;
+
+ mutex_enter(&dl->dl_lock);
+ dsl_deadlist_load_cache(dl);
+ dlce_tofind.dlce_mintxg = mintxg;
+ dlce = avl_find(&dl->dl_cache, &dlce_tofind, &where);
+
+ /*
+ * If this mintxg doesn't exist, it may be an empty_bpobj which
+ * is omitted from the sparse tree. Start at the next non-empty
+ * entry.
+ */
+ if (dlce == NULL)
+ dlce = avl_nearest(&dl->dl_cache, where, AVL_AFTER);
+
+ for (; dlce && dlce->dlce_mintxg < maxtxg;
+ dlce = AVL_NEXT(&dl->dl_cache, dlce)) {
+ *usedp += dlce->dlce_bytes;
+ *compp += dlce->dlce_comp;
+ *uncompp += dlce->dlce_uncomp;
+ }
+
+ mutex_exit(&dl->dl_lock);
+}
+
+static void
+dsl_deadlist_insert_bpobj(dsl_deadlist_t *dl, uint64_t obj, uint64_t birth,
+ dmu_tx_t *tx)
+{
+ dsl_deadlist_entry_t dle_tofind;
+ dsl_deadlist_entry_t *dle;
+ avl_index_t where;
+ uint64_t used, comp, uncomp;
+ bpobj_t bpo;
+
+ ASSERT(MUTEX_HELD(&dl->dl_lock));
+
+ VERIFY0(bpobj_open(&bpo, dl->dl_os, obj));
+ VERIFY0(bpobj_space(&bpo, &used, &comp, &uncomp));
+ bpobj_close(&bpo);
+
+ dsl_deadlist_load_tree(dl);
+
+ dmu_buf_will_dirty(dl->dl_dbuf, tx);
+ dl->dl_phys->dl_used += used;
+ dl->dl_phys->dl_comp += comp;
+ dl->dl_phys->dl_uncomp += uncomp;
+
+ dle_tofind.dle_mintxg = birth;
+ dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
+ if (dle == NULL)
+ dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE);
+ dle_enqueue_subobj(dl, dle, obj, tx);
+}
+
+static int
+dsl_deadlist_insert_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
+ dmu_tx_t *tx)
+{
+ dsl_deadlist_t *dl = arg;
+ dsl_deadlist_insert(dl, bp, bp_freed, tx);
+ return (0);
+}
+
+/*
+ * Merge the deadlist pointed to by 'obj' into dl. obj will be left as
+ * an empty deadlist.
+ */
+void
+dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx)
+{
+ zap_cursor_t zc;
+ zap_attribute_t za;
+ dmu_buf_t *bonus;
+ dsl_deadlist_phys_t *dlp;
+ dmu_object_info_t doi;
+ int error;
+
+ VERIFY0(dmu_object_info(dl->dl_os, obj, &doi));
+ if (doi.doi_type == DMU_OT_BPOBJ) {
+ bpobj_t bpo;
+ VERIFY0(bpobj_open(&bpo, dl->dl_os, obj));
+ VERIFY0(bpobj_iterate(&bpo, dsl_deadlist_insert_cb, dl, tx));
+ bpobj_close(&bpo);
+ return;
+ }
+
+ mutex_enter(&dl->dl_lock);
+ for (zap_cursor_init(&zc, dl->dl_os, obj);
+ (error = zap_cursor_retrieve(&zc, &za)) == 0;
+ zap_cursor_advance(&zc)) {
+ uint64_t mintxg = zfs_strtonum(za.za_name, NULL);
+ dsl_deadlist_insert_bpobj(dl, za.za_first_integer, mintxg, tx);
+ VERIFY0(zap_remove_int(dl->dl_os, obj, mintxg, tx));
+ }
+ VERIFY3U(error, ==, ENOENT);
+ zap_cursor_fini(&zc);
+
+ VERIFY0(dmu_bonus_hold(dl->dl_os, obj, FTAG, &bonus));
+ dlp = bonus->db_data;
+ dmu_buf_will_dirty(bonus, tx);
+ bzero(dlp, sizeof (*dlp));
+ dmu_buf_rele(bonus, FTAG);
+ mutex_exit(&dl->dl_lock);
+}
+
+/*
+ * Remove entries on dl that are born > mintxg, and put them on the bpobj.
+ */
+void
+dsl_deadlist_move_bpobj(dsl_deadlist_t *dl, bpobj_t *bpo, uint64_t mintxg,
+ dmu_tx_t *tx)
+{
+ dsl_deadlist_entry_t dle_tofind;
+ dsl_deadlist_entry_t *dle;
+ avl_index_t where;
+
+ ASSERT(!dl->dl_oldfmt);
+
+ mutex_enter(&dl->dl_lock);
+ dmu_buf_will_dirty(dl->dl_dbuf, tx);
+ dsl_deadlist_load_tree(dl);
+
+ dle_tofind.dle_mintxg = mintxg;
+ dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
+ if (dle == NULL)
+ dle = avl_nearest(&dl->dl_tree, where, AVL_AFTER);
+ while (dle) {
+ uint64_t used, comp, uncomp;
+ dsl_deadlist_entry_t *dle_next;
+
+ bpobj_enqueue_subobj(bpo, dle->dle_bpobj.bpo_object, tx);
+
+ VERIFY0(bpobj_space(&dle->dle_bpobj,
+ &used, &comp, &uncomp));
+ ASSERT3U(dl->dl_phys->dl_used, >=, used);
+ ASSERT3U(dl->dl_phys->dl_comp, >=, comp);
+ ASSERT3U(dl->dl_phys->dl_uncomp, >=, uncomp);
+ dl->dl_phys->dl_used -= used;
+ dl->dl_phys->dl_comp -= comp;
+ dl->dl_phys->dl_uncomp -= uncomp;
+
+ VERIFY0(zap_remove_int(dl->dl_os, dl->dl_object,
+ dle->dle_mintxg, tx));
+
+ dle_next = AVL_NEXT(&dl->dl_tree, dle);
+ avl_remove(&dl->dl_tree, dle);
+ bpobj_close(&dle->dle_bpobj);
+ kmem_free(dle, sizeof (*dle));
+ dle = dle_next;
+ }
+ mutex_exit(&dl->dl_lock);
+}
+
+typedef struct livelist_entry {
+ const blkptr_t *le_bp;
+ avl_node_t le_node;
+} livelist_entry_t;
+
+static int
+livelist_compare(const void *larg, const void *rarg)
+{
+ const blkptr_t *l = ((livelist_entry_t *)larg)->le_bp;
+ const blkptr_t *r = ((livelist_entry_t *)rarg)->le_bp;
+
+ /* Sort them according to dva[0] */
+ uint64_t l_dva0_vdev = DVA_GET_VDEV(&l->blk_dva[0]);
+ uint64_t r_dva0_vdev = DVA_GET_VDEV(&r->blk_dva[0]);
+
+ if (l_dva0_vdev != r_dva0_vdev)
+ return (TREE_CMP(l_dva0_vdev, r_dva0_vdev));
+
+ /* if vdevs are equal, sort by offsets. */
+ uint64_t l_dva0_offset = DVA_GET_OFFSET(&l->blk_dva[0]);
+ uint64_t r_dva0_offset = DVA_GET_OFFSET(&r->blk_dva[0]);
+ if (l_dva0_offset == r_dva0_offset)
+ ASSERT3U(l->blk_birth, ==, r->blk_birth);
+ return (TREE_CMP(l_dva0_offset, r_dva0_offset));
+}
+
+struct livelist_iter_arg {
+ avl_tree_t *avl;
+ bplist_t *to_free;
+ zthr_t *t;
+};
+
+/*
+ * Expects an AVL tree which is incrementally filled with FREE blkptrs
+ * and used to match up ALLOC/FREE pairs. ALLOC'd blkptrs without a
+ * corresponding FREE are stored in the supplied bplist.
+ */
+static int
+dsl_livelist_iterate(void *arg, const blkptr_t *bp, boolean_t bp_freed,
+ dmu_tx_t *tx)
+{
+ struct livelist_iter_arg *lia = arg;
+ avl_tree_t *avl = lia->avl;
+ bplist_t *to_free = lia->to_free;
+ zthr_t *t = lia->t;
+ ASSERT(tx == NULL);
+
+ if ((t != NULL) && (zthr_has_waiters(t) || zthr_iscancelled(t)))
+ return (SET_ERROR(EINTR));
+ if (bp_freed) {
+ livelist_entry_t *node = kmem_alloc(sizeof (livelist_entry_t),
+ KM_SLEEP);
+ blkptr_t *temp_bp = kmem_alloc(sizeof (blkptr_t), KM_SLEEP);
+ *temp_bp = *bp;
+ node->le_bp = temp_bp;
+ avl_add(avl, node);
+ } else {
+ livelist_entry_t node;
+ node.le_bp = bp;
+ livelist_entry_t *found = avl_find(avl, &node, NULL);
+ if (found != NULL) {
+ avl_remove(avl, found);
+ kmem_free((blkptr_t *)found->le_bp, sizeof (blkptr_t));
+ kmem_free(found, sizeof (livelist_entry_t));
+ } else {
+ bplist_append(to_free, bp);
+ }
+ }
+ return (0);
+}
+
+/*
+ * Accepts a bpobj and a bplist. Will insert into the bplist the blkptrs
+ * which have an ALLOC entry but no matching FREE.
+ */
+int
+dsl_process_sub_livelist(bpobj_t *bpobj, bplist_t *to_free, zthr_t *t,
+ uint64_t *size)
+{
+ avl_tree_t avl;
+ avl_create(&avl, livelist_compare, sizeof (livelist_entry_t),
+ offsetof(livelist_entry_t, le_node));
+
+ /* process the sublist */
+ struct livelist_iter_arg arg = {
+ .avl = &avl,
+ .to_free = to_free,
+ .t = t
+ };
+ int err = bpobj_iterate_nofree(bpobj, dsl_livelist_iterate, &arg, size);
+
+ avl_destroy(&avl);
+ return (err);
+}
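
dsl_livelist_iterate() above keeps FREE records in an AVL tree keyed on DVA[0] so that each matching ALLOC cancels its FREE, and only ALLOCs with no matching FREE reach the bplist. A minimal stand-alone sketch of the same cancel-out idea, using a sorted array instead of an AVL tree and made-up (vdev, offset) keys, assuming nothing beyond standard C:

#include <stdio.h>
#include <stdlib.h>

typedef struct entry {
	unsigned long vdev;	/* stand-in for DVA_GET_VDEV(dva[0]) */
	unsigned long offset;	/* stand-in for DVA_GET_OFFSET(dva[0]) */
	int freed;		/* 1 = FREE record, 0 = ALLOC record */
} entry_t;

static int
entry_cmp(const void *l, const void *r)
{
	const entry_t *a = l, *b = r;

	if (a->vdev != b->vdev)
		return (a->vdev < b->vdev ? -1 : 1);
	if (a->offset != b->offset)
		return (a->offset < b->offset ? -1 : 1);
	return (0);
}

int
main(void)
{
	/* Toy log: the (0, 200) ALLOC never sees a matching FREE. */
	entry_t log[] = {
		{ 0, 100, 0 }, { 0, 200, 0 }, { 1, 50, 0 },
		{ 0, 100, 1 }, { 1, 50, 1 },
	};
	size_t n = sizeof (log) / sizeof (log[0]);

	qsort(log, n, sizeof (entry_t), entry_cmp);

	/* After sorting, an ALLOC and its FREE sit on adjacent slots. */
	for (size_t i = 0; i < n; ) {
		if (i + 1 < n && entry_cmp(&log[i], &log[i + 1]) == 0) {
			i += 2;		/* matched pair cancels out */
		} else {
			if (!log[i].freed)
				printf("unmatched ALLOC: vdev %lu "
				    "offset %lu\n",
				    log[i].vdev, log[i].offset);
			i++;
		}
	}
	return (0);
}
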
+
+/* BEGIN CSTYLED */
+ZFS_MODULE_PARAM(zfs_livelist, zfs_livelist_, max_entries, ULONG, ZMOD_RW,
+ "Size to start the next sub-livelist in a livelist");
+
+ZFS_MODULE_PARAM(zfs_livelist, zfs_livelist_, min_percent_shared, INT, ZMOD_RW,
+ "Threshold at which livelist is disabled");
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/zfs/dsl_deleg.c b/sys/contrib/openzfs/module/zfs/dsl_deleg.c
new file mode 100644
index 000000000000..cf8a3c9bbdfb
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/dsl_deleg.c
@@ -0,0 +1,774 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
+ */
+
+/*
+ * DSL permissions are stored in a two level zap attribute
+ * mechanism. The first level identifies the "class" of
+ * entry. The class is identified by the first 2 letters of
+ * the attribute. The second letter "l" or "d" identifies whether
+ * it is a local or descendent permission. The first letter
+ * identifies the type of entry.
+ *
+ * ul$<id> identifies permissions granted locally for this userid.
+ * ud$<id> identifies permissions granted on descendent datasets for
+ * this userid.
+ * Ul$<id> identifies permission sets granted locally for this userid.
+ * Ud$<id> identifies permission sets granted on descendent datasets for
+ * this userid.
+ * gl$<id> identifies permissions granted locally for this groupid.
+ * gd$<id> identifies permissions granted on descendent datasets for
+ * this groupid.
+ * Gl$<id> identifies permission sets granted locally for this groupid.
+ * Gd$<id> identifies permission sets granted on descendent datasets for
+ * this groupid.
+ * el$ identifies permissions granted locally for everyone.
+ * ed$ identifies permissions granted on descendent datasets
+ * for everyone.
+ * El$ identifies permission sets granted locally for everyone.
+ * Ed$ identifies permission sets granted to descendent datasets for
+ * everyone.
+ * c-$ identifies permission to create at dataset creation time.
+ * C-$ identifies permission sets to grant locally at dataset creation
+ * time.
+ * s-$@<name> permissions defined in specified set @<name>
+ * S-$@<name> Sets defined in named set @<name>
+ *
+ * Each of the above entities points to another zap attribute that contains one
+ * attribute for each allowed permission, such as create, destroy,...
+ * All of the "upper" case class types will specify permission set names
+ * rather than permissions.
+ *
+ * Basically it looks something like this:
+ * ul$12 -> ZAP OBJ -> permissions...
+ *
+ * The ZAP OBJ is referred to as the jump object.
+ */
+
+#include <sys/dmu.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_tx.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_prop.h>
+#include <sys/dsl_synctask.h>
+#include <sys/dsl_deleg.h>
+#include <sys/spa.h>
+#include <sys/zap.h>
+#include <sys/fs/zfs.h>
+#include <sys/cred.h>
+#include <sys/sunddi.h>
+
+#include "zfs_deleg.h"
+
+/*
+ * Validate that user is allowed to delegate specified permissions.
+ *
+ * In order to delegate "create" you must have "create"
+ * and "allow".
+ */
+int
+dsl_deleg_can_allow(char *ddname, nvlist_t *nvp, cred_t *cr)
+{
+ nvpair_t *whopair = NULL;
+ int error;
+
+ if ((error = dsl_deleg_access(ddname, ZFS_DELEG_PERM_ALLOW, cr)) != 0)
+ return (error);
+
+ while ((whopair = nvlist_next_nvpair(nvp, whopair))) {
+ nvlist_t *perms;
+ nvpair_t *permpair = NULL;
+
+ VERIFY(nvpair_value_nvlist(whopair, &perms) == 0);
+
+ while ((permpair = nvlist_next_nvpair(perms, permpair))) {
+ const char *perm = nvpair_name(permpair);
+
+ if (strcmp(perm, ZFS_DELEG_PERM_ALLOW) == 0)
+ return (SET_ERROR(EPERM));
+
+ if ((error = dsl_deleg_access(ddname, perm, cr)) != 0)
+ return (error);
+ }
+ }
+ return (0);
+}
+
+/*
+ * Validate that user is allowed to unallow specified permissions. They
+ * must have the 'allow' permission, and even then can only unallow
+ * perms for their uid.
+ */
+int
+dsl_deleg_can_unallow(char *ddname, nvlist_t *nvp, cred_t *cr)
+{
+ nvpair_t *whopair = NULL;
+ int error;
+ char idstr[32];
+
+ if ((error = dsl_deleg_access(ddname, ZFS_DELEG_PERM_ALLOW, cr)) != 0)
+ return (error);
+
+ (void) snprintf(idstr, sizeof (idstr), "%lld",
+ (longlong_t)crgetuid(cr));
+
+ while ((whopair = nvlist_next_nvpair(nvp, whopair))) {
+ zfs_deleg_who_type_t type = nvpair_name(whopair)[0];
+
+ if (type != ZFS_DELEG_USER &&
+ type != ZFS_DELEG_USER_SETS)
+ return (SET_ERROR(EPERM));
+
+ if (strcmp(idstr, &nvpair_name(whopair)[3]) != 0)
+ return (SET_ERROR(EPERM));
+ }
+ return (0);
+}
+
+typedef struct dsl_deleg_arg {
+ const char *dda_name;
+ nvlist_t *dda_nvlist;
+} dsl_deleg_arg_t;
+
+static void
+dsl_deleg_set_sync(void *arg, dmu_tx_t *tx)
+{
+ dsl_deleg_arg_t *dda = arg;
+ dsl_dir_t *dd;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ objset_t *mos = dp->dp_meta_objset;
+ nvpair_t *whopair = NULL;
+ uint64_t zapobj;
+
+ VERIFY0(dsl_dir_hold(dp, dda->dda_name, FTAG, &dd, NULL));
+
+ zapobj = dsl_dir_phys(dd)->dd_deleg_zapobj;
+ if (zapobj == 0) {
+ dmu_buf_will_dirty(dd->dd_dbuf, tx);
+ zapobj = dsl_dir_phys(dd)->dd_deleg_zapobj = zap_create(mos,
+ DMU_OT_DSL_PERMS, DMU_OT_NONE, 0, tx);
+ }
+
+ while ((whopair = nvlist_next_nvpair(dda->dda_nvlist, whopair))) {
+ const char *whokey = nvpair_name(whopair);
+ nvlist_t *perms;
+ nvpair_t *permpair = NULL;
+ uint64_t jumpobj;
+
+ perms = fnvpair_value_nvlist(whopair);
+
+ if (zap_lookup(mos, zapobj, whokey, 8, 1, &jumpobj) != 0) {
+ jumpobj = zap_create_link(mos, DMU_OT_DSL_PERMS,
+ zapobj, whokey, tx);
+ }
+
+ while ((permpair = nvlist_next_nvpair(perms, permpair))) {
+ const char *perm = nvpair_name(permpair);
+ uint64_t n = 0;
+
+ VERIFY(zap_update(mos, jumpobj,
+ perm, 8, 1, &n, tx) == 0);
+ spa_history_log_internal_dd(dd, "permission update", tx,
+ "%s %s", whokey, perm);
+ }
+ }
+ dsl_dir_rele(dd, FTAG);
+}
+
+static void
+dsl_deleg_unset_sync(void *arg, dmu_tx_t *tx)
+{
+ dsl_deleg_arg_t *dda = arg;
+ dsl_dir_t *dd;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ objset_t *mos = dp->dp_meta_objset;
+ nvpair_t *whopair = NULL;
+ uint64_t zapobj;
+
+ VERIFY0(dsl_dir_hold(dp, dda->dda_name, FTAG, &dd, NULL));
+ zapobj = dsl_dir_phys(dd)->dd_deleg_zapobj;
+ if (zapobj == 0) {
+ dsl_dir_rele(dd, FTAG);
+ return;
+ }
+
+ while ((whopair = nvlist_next_nvpair(dda->dda_nvlist, whopair))) {
+ const char *whokey = nvpair_name(whopair);
+ nvlist_t *perms;
+ nvpair_t *permpair = NULL;
+ uint64_t jumpobj;
+
+ if (nvpair_value_nvlist(whopair, &perms) != 0) {
+ if (zap_lookup(mos, zapobj, whokey, 8,
+ 1, &jumpobj) == 0) {
+ (void) zap_remove(mos, zapobj, whokey, tx);
+ VERIFY(0 == zap_destroy(mos, jumpobj, tx));
+ }
+ spa_history_log_internal_dd(dd, "permission who remove",
+ tx, "%s", whokey);
+ continue;
+ }
+
+ if (zap_lookup(mos, zapobj, whokey, 8, 1, &jumpobj) != 0)
+ continue;
+
+ while ((permpair = nvlist_next_nvpair(perms, permpair))) {
+ const char *perm = nvpair_name(permpair);
+ uint64_t n = 0;
+
+ (void) zap_remove(mos, jumpobj, perm, tx);
+ if (zap_count(mos, jumpobj, &n) == 0 && n == 0) {
+ (void) zap_remove(mos, zapobj,
+ whokey, tx);
+ VERIFY(0 == zap_destroy(mos,
+ jumpobj, tx));
+ }
+ spa_history_log_internal_dd(dd, "permission remove", tx,
+ "%s %s", whokey, perm);
+ }
+ }
+ dsl_dir_rele(dd, FTAG);
+}
+
+static int
+dsl_deleg_check(void *arg, dmu_tx_t *tx)
+{
+ dsl_deleg_arg_t *dda = arg;
+ dsl_dir_t *dd;
+ int error;
+
+ if (spa_version(dmu_tx_pool(tx)->dp_spa) <
+ SPA_VERSION_DELEGATED_PERMS) {
+ return (SET_ERROR(ENOTSUP));
+ }
+
+ error = dsl_dir_hold(dmu_tx_pool(tx), dda->dda_name, FTAG, &dd, NULL);
+ if (error == 0)
+ dsl_dir_rele(dd, FTAG);
+ return (error);
+}
+
+int
+dsl_deleg_set(const char *ddname, nvlist_t *nvp, boolean_t unset)
+{
+ dsl_deleg_arg_t dda;
+
+ /* nvp must already have been verified to be valid */
+
+ dda.dda_name = ddname;
+ dda.dda_nvlist = nvp;
+
+ return (dsl_sync_task(ddname, dsl_deleg_check,
+ unset ? dsl_deleg_unset_sync : dsl_deleg_set_sync,
+ &dda, fnvlist_num_pairs(nvp), ZFS_SPACE_CHECK_RESERVED));
+}
+
+/*
+ * Find all 'allow' permissions from a given point and then continue
+ * traversing up to the root.
+ *
+ * This function constructs an nvlist of nvlists: each setpoint is an
+ * nvlist of whokeys, each of which is in turn an nvlist of the
+ * individual user/group/everyone/create permissions.
+ *
+ * The nvlist will look like this.
+ *
+ * { source fsname -> { whokeys { permissions,...}, ...}}
+ *
+ * The fsname nvpairs will be arranged in a bottom up order. For example,
+ * if we have the following structure a/b/c then the nvpairs for the fsnames
+ * will be ordered a/b/c, a/b, a.
+ */
+int
+dsl_deleg_get(const char *ddname, nvlist_t **nvp)
+{
+ dsl_dir_t *dd, *startdd;
+ dsl_pool_t *dp;
+ int error;
+ objset_t *mos;
+ zap_cursor_t *basezc, *zc;
+ zap_attribute_t *baseza, *za;
+ char *source;
+
+ error = dsl_pool_hold(ddname, FTAG, &dp);
+ if (error != 0)
+ return (error);
+
+ error = dsl_dir_hold(dp, ddname, FTAG, &startdd, NULL);
+ if (error != 0) {
+ dsl_pool_rele(dp, FTAG);
+ return (error);
+ }
+
+ dp = startdd->dd_pool;
+ mos = dp->dp_meta_objset;
+
+ zc = kmem_alloc(sizeof (zap_cursor_t), KM_SLEEP);
+ za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
+ basezc = kmem_alloc(sizeof (zap_cursor_t), KM_SLEEP);
+ baseza = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
+ source = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
+ VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+
+ for (dd = startdd; dd != NULL; dd = dd->dd_parent) {
+ nvlist_t *sp_nvp;
+ uint64_t n;
+
+ if (dsl_dir_phys(dd)->dd_deleg_zapobj == 0 ||
+ zap_count(mos,
+ dsl_dir_phys(dd)->dd_deleg_zapobj, &n) != 0 || n == 0)
+ continue;
+
+ sp_nvp = fnvlist_alloc();
+ for (zap_cursor_init(basezc, mos,
+ dsl_dir_phys(dd)->dd_deleg_zapobj);
+ zap_cursor_retrieve(basezc, baseza) == 0;
+ zap_cursor_advance(basezc)) {
+ nvlist_t *perms_nvp;
+
+ ASSERT(baseza->za_integer_length == 8);
+ ASSERT(baseza->za_num_integers == 1);
+
+ perms_nvp = fnvlist_alloc();
+ for (zap_cursor_init(zc, mos, baseza->za_first_integer);
+ zap_cursor_retrieve(zc, za) == 0;
+ zap_cursor_advance(zc)) {
+ fnvlist_add_boolean(perms_nvp, za->za_name);
+ }
+ zap_cursor_fini(zc);
+ fnvlist_add_nvlist(sp_nvp, baseza->za_name, perms_nvp);
+ fnvlist_free(perms_nvp);
+ }
+
+ zap_cursor_fini(basezc);
+
+ dsl_dir_name(dd, source);
+ fnvlist_add_nvlist(*nvp, source, sp_nvp);
+ nvlist_free(sp_nvp);
+ }
+
+ kmem_free(source, ZFS_MAX_DATASET_NAME_LEN);
+ kmem_free(baseza, sizeof (zap_attribute_t));
+ kmem_free(basezc, sizeof (zap_cursor_t));
+ kmem_free(za, sizeof (zap_attribute_t));
+ kmem_free(zc, sizeof (zap_cursor_t));
+
+ dsl_dir_rele(startdd, FTAG);
+ dsl_pool_rele(dp, FTAG);
+ return (0);
+}
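
A sketch of consuming the { fsname -> { whokey -> { perm, ... } } } structure that dsl_deleg_get() builds, using the standard libnvpair iteration calls. The toy nvlist constructed in main() stands in for a result actually obtained through the userland allow interfaces, which are out of scope here.

#include <stdio.h>
#include <libnvpair.h>

static void
print_delegations(nvlist_t *outer)
{
	nvpair_t *fspair = NULL;

	while ((fspair = nvlist_next_nvpair(outer, fspair)) != NULL) {
		nvlist_t *who_nvl;
		nvpair_t *whopair = NULL;

		if (nvpair_value_nvlist(fspair, &who_nvl) != 0)
			continue;
		printf("setpoint %s:\n", nvpair_name(fspair));

		while ((whopair = nvlist_next_nvpair(who_nvl,
		    whopair)) != NULL) {
			nvlist_t *perms;
			nvpair_t *permpair = NULL;

			if (nvpair_value_nvlist(whopair, &perms) != 0)
				continue;
			printf("  %s:", nvpair_name(whopair));
			while ((permpair = nvlist_next_nvpair(perms,
			    permpair)) != NULL)
				printf(" %s", nvpair_name(permpair));
			printf("\n");
		}
	}
}

int
main(void)
{
	/* Toy stand-in: { "pool/fs" -> { "ul$12" -> { create } } } */
	nvlist_t *perms = fnvlist_alloc();
	nvlist_t *who = fnvlist_alloc();
	nvlist_t *outer = fnvlist_alloc();

	fnvlist_add_boolean(perms, "create");
	fnvlist_add_nvlist(who, "ul$12", perms);
	fnvlist_add_nvlist(outer, "pool/fs", who);

	print_delegations(outer);

	fnvlist_free(perms);
	fnvlist_free(who);
	fnvlist_free(outer);
	return (0);
}
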
+
+/*
+ * Routines for dsl_deleg_access() -- access checking.
+ */
+typedef struct perm_set {
+ avl_node_t p_node;
+ boolean_t p_matched;
+ char p_setname[ZFS_MAX_DELEG_NAME];
+} perm_set_t;
+
+static int
+perm_set_compare(const void *arg1, const void *arg2)
+{
+ const perm_set_t *node1 = (const perm_set_t *)arg1;
+ const perm_set_t *node2 = (const perm_set_t *)arg2;
+ int val;
+
+ val = strcmp(node1->p_setname, node2->p_setname);
+
+ return (TREE_ISIGN(val));
+}
+
+/*
+ * Determine whether a specified permission exists.
+ *
+ * First the base attribute has to be retrieved, e.g. ul$12.
+ * Once the base object has been retrieved, the actual permission
+ * is looked up in the zap object the base object points to.
+ *
+ * Return 0 if permission exists, ENOENT if there is no whokey, EPERM if
+ * there is no perm in that jumpobj.
+ */
+static int
+dsl_check_access(objset_t *mos, uint64_t zapobj,
+ char type, char checkflag, void *valp, const char *perm)
+{
+ int error;
+ uint64_t jumpobj, zero;
+ char whokey[ZFS_MAX_DELEG_NAME];
+
+ zfs_deleg_whokey(whokey, type, checkflag, valp);
+ error = zap_lookup(mos, zapobj, whokey, 8, 1, &jumpobj);
+ if (error == 0) {
+ error = zap_lookup(mos, jumpobj, perm, 8, 1, &zero);
+ if (error == ENOENT)
+ error = SET_ERROR(EPERM);
+ }
+ return (error);
+}
+
+/*
+ * Check a specified user/group for a requested permission.
+ */
+static int
+dsl_check_user_access(objset_t *mos, uint64_t zapobj, const char *perm,
+ int checkflag, cred_t *cr)
+{
+ const gid_t *gids;
+ int ngids;
+ int i;
+ uint64_t id;
+
+ /* check for user */
+ id = crgetuid(cr);
+ if (dsl_check_access(mos, zapobj,
+ ZFS_DELEG_USER, checkflag, &id, perm) == 0)
+ return (0);
+
+ /* check for the user's primary group */
+ id = crgetgid(cr);
+ if (dsl_check_access(mos, zapobj,
+ ZFS_DELEG_GROUP, checkflag, &id, perm) == 0)
+ return (0);
+
+ /* check for everyone entry */
+ id = -1;
+ if (dsl_check_access(mos, zapobj,
+ ZFS_DELEG_EVERYONE, checkflag, &id, perm) == 0)
+ return (0);
+
+ /* check each supplemental group the user is a member of */
+ ngids = crgetngroups(cr);
+ gids = crgetgroups(cr);
+ for (i = 0; i != ngids; i++) {
+ id = gids[i];
+ if (dsl_check_access(mos, zapobj,
+ ZFS_DELEG_GROUP, checkflag, &id, perm) == 0)
+ return (0);
+ }
+
+ return (SET_ERROR(EPERM));
+}
+
+/*
+ * Iterate over the sets specified in the given zapobj
+ * and load them into the permsets avl tree.
+ */
+static int
+dsl_load_sets(objset_t *mos, uint64_t zapobj,
+ char type, char checkflag, void *valp, avl_tree_t *avl)
+{
+ zap_cursor_t zc;
+ zap_attribute_t za;
+ perm_set_t *permnode;
+ avl_index_t idx;
+ uint64_t jumpobj;
+ int error;
+ char whokey[ZFS_MAX_DELEG_NAME];
+
+ zfs_deleg_whokey(whokey, type, checkflag, valp);
+
+ error = zap_lookup(mos, zapobj, whokey, 8, 1, &jumpobj);
+ if (error != 0)
+ return (error);
+
+ for (zap_cursor_init(&zc, mos, jumpobj);
+ zap_cursor_retrieve(&zc, &za) == 0;
+ zap_cursor_advance(&zc)) {
+ permnode = kmem_alloc(sizeof (perm_set_t), KM_SLEEP);
+ (void) strlcpy(permnode->p_setname, za.za_name,
+ sizeof (permnode->p_setname));
+ permnode->p_matched = B_FALSE;
+
+ if (avl_find(avl, permnode, &idx) == NULL) {
+ avl_insert(avl, permnode, idx);
+ } else {
+ kmem_free(permnode, sizeof (perm_set_t));
+ }
+ }
+ zap_cursor_fini(&zc);
+ return (0);
+}
+
+/*
+ * Load all permission sets that apply to the user described by cred.
+ */
+static void
+dsl_load_user_sets(objset_t *mos, uint64_t zapobj, avl_tree_t *avl,
+ char checkflag, cred_t *cr)
+{
+ const gid_t *gids;
+ int ngids, i;
+ uint64_t id;
+
+ id = crgetuid(cr);
+ (void) dsl_load_sets(mos, zapobj,
+ ZFS_DELEG_USER_SETS, checkflag, &id, avl);
+
+ id = crgetgid(cr);
+ (void) dsl_load_sets(mos, zapobj,
+ ZFS_DELEG_GROUP_SETS, checkflag, &id, avl);
+
+ (void) dsl_load_sets(mos, zapobj,
+ ZFS_DELEG_EVERYONE_SETS, checkflag, NULL, avl);
+
+ ngids = crgetngroups(cr);
+ gids = crgetgroups(cr);
+ for (i = 0; i != ngids; i++) {
+ id = gids[i];
+ (void) dsl_load_sets(mos, zapobj,
+ ZFS_DELEG_GROUP_SETS, checkflag, &id, avl);
+ }
+}
+
+/*
+ * Check if user has requested permission.
+ */
+int
+dsl_deleg_access_impl(dsl_dataset_t *ds, const char *perm, cred_t *cr)
+{
+ dsl_dir_t *dd;
+ dsl_pool_t *dp;
+ void *cookie;
+ int error;
+ char checkflag;
+ objset_t *mos;
+ avl_tree_t permsets;
+ perm_set_t *setnode;
+
+ dp = ds->ds_dir->dd_pool;
+ mos = dp->dp_meta_objset;
+
+ if (dsl_delegation_on(mos) == B_FALSE)
+ return (SET_ERROR(ECANCELED));
+
+ if (spa_version(dmu_objset_spa(dp->dp_meta_objset)) <
+ SPA_VERSION_DELEGATED_PERMS)
+ return (SET_ERROR(EPERM));
+
+ if (ds->ds_is_snapshot) {
+ /*
+ * Snapshots are treated as descendents only;
+ * local permissions do not apply.
+ */
+ checkflag = ZFS_DELEG_DESCENDENT;
+ } else {
+ checkflag = ZFS_DELEG_LOCAL;
+ }
+
+ avl_create(&permsets, perm_set_compare, sizeof (perm_set_t),
+ offsetof(perm_set_t, p_node));
+
+ ASSERT(dsl_pool_config_held(dp));
+ for (dd = ds->ds_dir; dd != NULL; dd = dd->dd_parent,
+ checkflag = ZFS_DELEG_DESCENDENT) {
+ uint64_t zapobj;
+ boolean_t expanded;
+
+ /*
+ * If not in the global zone then make sure
+ * the zoned property is set.
+ */
+ if (!INGLOBALZONE(curproc)) {
+ uint64_t zoned;
+
+ if (dsl_prop_get_dd(dd,
+ zfs_prop_to_name(ZFS_PROP_ZONED),
+ 8, 1, &zoned, NULL, B_FALSE) != 0)
+ break;
+ if (!zoned)
+ break;
+ }
+ zapobj = dsl_dir_phys(dd)->dd_deleg_zapobj;
+
+ if (zapobj == 0)
+ continue;
+
+ dsl_load_user_sets(mos, zapobj, &permsets, checkflag, cr);
+again:
+ expanded = B_FALSE;
+ for (setnode = avl_first(&permsets); setnode;
+ setnode = AVL_NEXT(&permsets, setnode)) {
+ if (setnode->p_matched == B_TRUE)
+ continue;
+
+ /* See if this set directly grants this permission */
+ error = dsl_check_access(mos, zapobj,
+ ZFS_DELEG_NAMED_SET, 0, setnode->p_setname, perm);
+ if (error == 0)
+ goto success;
+ if (error == EPERM)
+ setnode->p_matched = B_TRUE;
+
+ /* See if this set includes other sets */
+ error = dsl_load_sets(mos, zapobj,
+ ZFS_DELEG_NAMED_SET_SETS, 0,
+ setnode->p_setname, &permsets);
+ if (error == 0)
+ setnode->p_matched = expanded = B_TRUE;
+ }
+ /*
+ * If we expanded any sets, that will define more sets,
+ * which we need to check.
+ */
+ if (expanded)
+ goto again;
+
+ error = dsl_check_user_access(mos, zapobj, perm, checkflag, cr);
+ if (error == 0)
+ goto success;
+ }
+ error = SET_ERROR(EPERM);
+success:
+
+ cookie = NULL;
+ while ((setnode = avl_destroy_nodes(&permsets, &cookie)) != NULL)
+ kmem_free(setnode, sizeof (perm_set_t));
+
+ return (error);
+}
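
The goto-again loop above expands permission sets to a fixed point: whenever a named set pulls in another set, the whole collection is rescanned. The same idea in a self-contained form, with a toy membership table standing in for the ZFS_DELEG_NAMED_SET_SETS data:

#include <stdio.h>

#define	NSETS	4

/* includes[i][j] != 0 means set i pulls in set j (toy data). */
static const int includes[NSETS][NSETS] = {
	{ 0, 1, 0, 0 },		/* set 0 includes set 1 */
	{ 0, 0, 1, 0 },		/* set 1 includes set 2 */
	{ 0, 0, 0, 0 },
	{ 0, 0, 0, 0 },
};

int
main(void)
{
	int member[NSETS] = { 1, 0, 0, 0 };	/* start from set 0 */
	int expanded[NSETS] = { 0 };
	int again;

	do {
		again = 0;
		for (int i = 0; i < NSETS; i++) {
			if (!member[i] || expanded[i])
				continue;
			expanded[i] = 1;	/* don't expand a set twice */
			for (int j = 0; j < NSETS; j++) {
				if (includes[i][j] && !member[j]) {
					member[j] = 1;
					again = 1;	/* new set: rescan */
				}
			}
		}
	} while (again);

	for (int i = 0; i < NSETS; i++)
		if (member[i])
			printf("set %d is reachable\n", i);
	return (0);
}
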
+
+int
+dsl_deleg_access(const char *dsname, const char *perm, cred_t *cr)
+{
+ dsl_pool_t *dp;
+ dsl_dataset_t *ds;
+ int error;
+
+ error = dsl_pool_hold(dsname, FTAG, &dp);
+ if (error != 0)
+ return (error);
+ error = dsl_dataset_hold(dp, dsname, FTAG, &ds);
+ if (error == 0) {
+ error = dsl_deleg_access_impl(ds, perm, cr);
+ dsl_dataset_rele(ds, FTAG);
+ }
+ dsl_pool_rele(dp, FTAG);
+
+ return (error);
+}
+
+/*
+ * Other routines.
+ */
+
+static void
+copy_create_perms(dsl_dir_t *dd, uint64_t pzapobj,
+ boolean_t dosets, uint64_t uid, dmu_tx_t *tx)
+{
+ objset_t *mos = dd->dd_pool->dp_meta_objset;
+ uint64_t jumpobj, pjumpobj;
+ uint64_t zapobj = dsl_dir_phys(dd)->dd_deleg_zapobj;
+ zap_cursor_t zc;
+ zap_attribute_t za;
+ char whokey[ZFS_MAX_DELEG_NAME];
+
+ zfs_deleg_whokey(whokey,
+ dosets ? ZFS_DELEG_CREATE_SETS : ZFS_DELEG_CREATE,
+ ZFS_DELEG_LOCAL, NULL);
+ if (zap_lookup(mos, pzapobj, whokey, 8, 1, &pjumpobj) != 0)
+ return;
+
+ if (zapobj == 0) {
+ dmu_buf_will_dirty(dd->dd_dbuf, tx);
+ zapobj = dsl_dir_phys(dd)->dd_deleg_zapobj = zap_create(mos,
+ DMU_OT_DSL_PERMS, DMU_OT_NONE, 0, tx);
+ }
+
+ zfs_deleg_whokey(whokey,
+ dosets ? ZFS_DELEG_USER_SETS : ZFS_DELEG_USER,
+ ZFS_DELEG_LOCAL, &uid);
+ if (zap_lookup(mos, zapobj, whokey, 8, 1, &jumpobj) == ENOENT) {
+ jumpobj = zap_create(mos, DMU_OT_DSL_PERMS, DMU_OT_NONE, 0, tx);
+ VERIFY(zap_add(mos, zapobj, whokey, 8, 1, &jumpobj, tx) == 0);
+ }
+
+ for (zap_cursor_init(&zc, mos, pjumpobj);
+ zap_cursor_retrieve(&zc, &za) == 0;
+ zap_cursor_advance(&zc)) {
+ uint64_t zero = 0;
+ ASSERT(za.za_integer_length == 8 && za.za_num_integers == 1);
+
+ VERIFY(zap_update(mos, jumpobj, za.za_name,
+ 8, 1, &zero, tx) == 0);
+ }
+ zap_cursor_fini(&zc);
+}
+
+/*
+ * Set all create-time permissions on the new dataset.
+ */
+void
+dsl_deleg_set_create_perms(dsl_dir_t *sdd, dmu_tx_t *tx, cred_t *cr)
+{
+ dsl_dir_t *dd;
+ uint64_t uid = crgetuid(cr);
+
+ if (spa_version(dmu_objset_spa(sdd->dd_pool->dp_meta_objset)) <
+ SPA_VERSION_DELEGATED_PERMS)
+ return;
+
+ for (dd = sdd->dd_parent; dd != NULL; dd = dd->dd_parent) {
+ uint64_t pzapobj = dsl_dir_phys(dd)->dd_deleg_zapobj;
+
+ if (pzapobj == 0)
+ continue;
+
+ copy_create_perms(sdd, pzapobj, B_FALSE, uid, tx);
+ copy_create_perms(sdd, pzapobj, B_TRUE, uid, tx);
+ }
+}
+
+int
+dsl_deleg_destroy(objset_t *mos, uint64_t zapobj, dmu_tx_t *tx)
+{
+ zap_cursor_t zc;
+ zap_attribute_t za;
+
+ if (zapobj == 0)
+ return (0);
+
+ for (zap_cursor_init(&zc, mos, zapobj);
+ zap_cursor_retrieve(&zc, &za) == 0;
+ zap_cursor_advance(&zc)) {
+ ASSERT(za.za_integer_length == 8 && za.za_num_integers == 1);
+ VERIFY(0 == zap_destroy(mos, za.za_first_integer, tx));
+ }
+ zap_cursor_fini(&zc);
+ VERIFY(0 == zap_destroy(mos, zapobj, tx));
+ return (0);
+}
+
+boolean_t
+dsl_delegation_on(objset_t *os)
+{
+ return (!!spa_delegation(os->os_spa));
+}
+
+#if defined(_KERNEL)
+EXPORT_SYMBOL(dsl_deleg_get);
+EXPORT_SYMBOL(dsl_deleg_set);
+#endif
diff --git a/sys/contrib/openzfs/module/zfs/dsl_destroy.c b/sys/contrib/openzfs/module/zfs/dsl_destroy.c
new file mode 100644
index 000000000000..837d78987e75
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/dsl_destroy.c
@@ -0,0 +1,1281 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2013 Steven Hartland. All rights reserved.
+ * Copyright (c) 2013 by Joyent, Inc. All rights reserved.
+ * Copyright (c) 2016 Actifio, Inc. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/dsl_userhold.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_synctask.h>
+#include <sys/dsl_destroy.h>
+#include <sys/dsl_bookmark.h>
+#include <sys/dmu_tx.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_dir.h>
+#include <sys/dmu_traverse.h>
+#include <sys/dsl_scan.h>
+#include <sys/dmu_objset.h>
+#include <sys/zap.h>
+#include <sys/zfeature.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/dsl_deleg.h>
+#include <sys/dmu_impl.h>
+#include <sys/zvol.h>
+#include <sys/zcp.h>
+#include <sys/dsl_deadlist.h>
+#include <sys/zthr.h>
+#include <sys/spa_impl.h>
+
+int
+dsl_destroy_snapshot_check_impl(dsl_dataset_t *ds, boolean_t defer)
+{
+ if (!ds->ds_is_snapshot)
+ return (SET_ERROR(EINVAL));
+
+ if (dsl_dataset_long_held(ds))
+ return (SET_ERROR(EBUSY));
+
+ /*
+ * Only allow deferred destroy on pools that support it.
+ * NOTE: deferred destroy is only supported on snapshots.
+ */
+ if (defer) {
+ if (spa_version(ds->ds_dir->dd_pool->dp_spa) <
+ SPA_VERSION_USERREFS)
+ return (SET_ERROR(ENOTSUP));
+ return (0);
+ }
+
+ /*
+ * If this snapshot has an elevated user reference count,
+ * we can't destroy it yet.
+ */
+ if (ds->ds_userrefs > 0)
+ return (SET_ERROR(EBUSY));
+
+ /*
+ * Can't delete a branch point.
+ */
+ if (dsl_dataset_phys(ds)->ds_num_children > 1)
+ return (SET_ERROR(EEXIST));
+
+ return (0);
+}
+
+int
+dsl_destroy_snapshot_check(void *arg, dmu_tx_t *tx)
+{
+ dsl_destroy_snapshot_arg_t *ddsa = arg;
+ const char *dsname = ddsa->ddsa_name;
+ boolean_t defer = ddsa->ddsa_defer;
+
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ int error = 0;
+ dsl_dataset_t *ds;
+
+ error = dsl_dataset_hold(dp, dsname, FTAG, &ds);
+
+ /*
+ * If the snapshot does not exist, silently ignore it, and
+ * dsl_destroy_snapshot_sync() will be a no-op
+ * (it's "already destroyed").
+ */
+ if (error == ENOENT)
+ return (0);
+
+ if (error == 0) {
+ error = dsl_destroy_snapshot_check_impl(ds, defer);
+ dsl_dataset_rele(ds, FTAG);
+ }
+
+ return (error);
+}
+
+struct process_old_arg {
+ dsl_dataset_t *ds;
+ dsl_dataset_t *ds_prev;
+ boolean_t after_branch_point;
+ zio_t *pio;
+ uint64_t used, comp, uncomp;
+};
+
+static int
+process_old_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx)
+{
+ struct process_old_arg *poa = arg;
+ dsl_pool_t *dp = poa->ds->ds_dir->dd_pool;
+
+ ASSERT(!BP_IS_HOLE(bp));
+
+ if (bp->blk_birth <= dsl_dataset_phys(poa->ds)->ds_prev_snap_txg) {
+ dsl_deadlist_insert(&poa->ds->ds_deadlist, bp, bp_freed, tx);
+ if (poa->ds_prev && !poa->after_branch_point &&
+ bp->blk_birth >
+ dsl_dataset_phys(poa->ds_prev)->ds_prev_snap_txg) {
+ dsl_dataset_phys(poa->ds_prev)->ds_unique_bytes +=
+ bp_get_dsize_sync(dp->dp_spa, bp);
+ }
+ } else {
+ poa->used += bp_get_dsize_sync(dp->dp_spa, bp);
+ poa->comp += BP_GET_PSIZE(bp);
+ poa->uncomp += BP_GET_UCSIZE(bp);
+ dsl_free_sync(poa->pio, dp, tx->tx_txg, bp);
+ }
+ return (0);
+}
+
+static void
+process_old_deadlist(dsl_dataset_t *ds, dsl_dataset_t *ds_prev,
+ dsl_dataset_t *ds_next, boolean_t after_branch_point, dmu_tx_t *tx)
+{
+ struct process_old_arg poa = { 0 };
+ dsl_pool_t *dp = ds->ds_dir->dd_pool;
+ objset_t *mos = dp->dp_meta_objset;
+ uint64_t deadlist_obj;
+
+ ASSERT(ds->ds_deadlist.dl_oldfmt);
+ ASSERT(ds_next->ds_deadlist.dl_oldfmt);
+
+ poa.ds = ds;
+ poa.ds_prev = ds_prev;
+ poa.after_branch_point = after_branch_point;
+ poa.pio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
+ VERIFY0(bpobj_iterate(&ds_next->ds_deadlist.dl_bpobj,
+ process_old_cb, &poa, tx));
+ VERIFY0(zio_wait(poa.pio));
+ ASSERT3U(poa.used, ==, dsl_dataset_phys(ds)->ds_unique_bytes);
+
+ /* change snapused */
+ dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP,
+ -poa.used, -poa.comp, -poa.uncomp, tx);
+
+ /* swap next's deadlist to our deadlist */
+ dsl_deadlist_close(&ds->ds_deadlist);
+ dsl_deadlist_close(&ds_next->ds_deadlist);
+ deadlist_obj = dsl_dataset_phys(ds)->ds_deadlist_obj;
+ dsl_dataset_phys(ds)->ds_deadlist_obj =
+ dsl_dataset_phys(ds_next)->ds_deadlist_obj;
+ dsl_dataset_phys(ds_next)->ds_deadlist_obj = deadlist_obj;
+ dsl_deadlist_open(&ds->ds_deadlist, mos,
+ dsl_dataset_phys(ds)->ds_deadlist_obj);
+ dsl_deadlist_open(&ds_next->ds_deadlist, mos,
+ dsl_dataset_phys(ds_next)->ds_deadlist_obj);
+}
+
+typedef struct remaining_clones_key {
+ dsl_dataset_t *rck_clone;
+ list_node_t rck_node;
+} remaining_clones_key_t;
+
+static remaining_clones_key_t *
+rck_alloc(dsl_dataset_t *clone)
+{
+ remaining_clones_key_t *rck = kmem_alloc(sizeof (*rck), KM_SLEEP);
+ rck->rck_clone = clone;
+ return (rck);
+}
+
+static void
+dsl_dir_remove_clones_key_impl(dsl_dir_t *dd, uint64_t mintxg, dmu_tx_t *tx,
+ list_t *stack, void *tag)
+{
+ objset_t *mos = dd->dd_pool->dp_meta_objset;
+
+ /*
+ * If it is the old version, dd_clones doesn't exist so we can't
+ * find the clones, but dsl_deadlist_remove_key() is a no-op so it
+ * doesn't matter.
+ */
+ if (dsl_dir_phys(dd)->dd_clones == 0)
+ return;
+
+ zap_cursor_t *zc = kmem_alloc(sizeof (zap_cursor_t), KM_SLEEP);
+ zap_attribute_t *za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
+
+ for (zap_cursor_init(zc, mos, dsl_dir_phys(dd)->dd_clones);
+ zap_cursor_retrieve(zc, za) == 0;
+ zap_cursor_advance(zc)) {
+ dsl_dataset_t *clone;
+
+ VERIFY0(dsl_dataset_hold_obj(dd->dd_pool,
+ za->za_first_integer, tag, &clone));
+
+ if (clone->ds_dir->dd_origin_txg > mintxg) {
+ dsl_deadlist_remove_key(&clone->ds_deadlist,
+ mintxg, tx);
+
+ if (dsl_dataset_remap_deadlist_exists(clone)) {
+ dsl_deadlist_remove_key(
+ &clone->ds_remap_deadlist, mintxg, tx);
+ }
+
+ list_insert_head(stack, rck_alloc(clone));
+ } else {
+ dsl_dataset_rele(clone, tag);
+ }
+ }
+ zap_cursor_fini(zc);
+
+ kmem_free(za, sizeof (zap_attribute_t));
+ kmem_free(zc, sizeof (zap_cursor_t));
+}
+
+void
+dsl_dir_remove_clones_key(dsl_dir_t *top_dd, uint64_t mintxg, dmu_tx_t *tx)
+{
+ list_t stack;
+
+ list_create(&stack, sizeof (remaining_clones_key_t),
+ offsetof(remaining_clones_key_t, rck_node));
+
+ dsl_dir_remove_clones_key_impl(top_dd, mintxg, tx, &stack, FTAG);
+ for (remaining_clones_key_t *rck = list_remove_head(&stack);
+ rck != NULL; rck = list_remove_head(&stack)) {
+ dsl_dataset_t *clone = rck->rck_clone;
+ dsl_dir_t *clone_dir = clone->ds_dir;
+
+ kmem_free(rck, sizeof (*rck));
+
+ dsl_dir_remove_clones_key_impl(clone_dir, mintxg, tx,
+ &stack, FTAG);
+ dsl_dataset_rele(clone, FTAG);
+ }
+
+ list_destroy(&stack);
+}
+
+static void
+dsl_destroy_snapshot_handle_remaps(dsl_dataset_t *ds, dsl_dataset_t *ds_next,
+ dmu_tx_t *tx)
+{
+ dsl_pool_t *dp = ds->ds_dir->dd_pool;
+
+ /* Move blocks to be obsoleted to pool's obsolete list. */
+ if (dsl_dataset_remap_deadlist_exists(ds_next)) {
+ if (!bpobj_is_open(&dp->dp_obsolete_bpobj))
+ dsl_pool_create_obsolete_bpobj(dp, tx);
+
+ dsl_deadlist_move_bpobj(&ds_next->ds_remap_deadlist,
+ &dp->dp_obsolete_bpobj,
+ dsl_dataset_phys(ds)->ds_prev_snap_txg, tx);
+ }
+
+ /* Merge our deadlist into next's and free it. */
+ if (dsl_dataset_remap_deadlist_exists(ds)) {
+ uint64_t remap_deadlist_object =
+ dsl_dataset_get_remap_deadlist_object(ds);
+ ASSERT(remap_deadlist_object != 0);
+
+ mutex_enter(&ds_next->ds_remap_deadlist_lock);
+ if (!dsl_dataset_remap_deadlist_exists(ds_next))
+ dsl_dataset_create_remap_deadlist(ds_next, tx);
+ mutex_exit(&ds_next->ds_remap_deadlist_lock);
+
+ dsl_deadlist_merge(&ds_next->ds_remap_deadlist,
+ remap_deadlist_object, tx);
+ dsl_dataset_destroy_remap_deadlist(ds, tx);
+ }
+}
+
+void
+dsl_destroy_snapshot_sync_impl(dsl_dataset_t *ds, boolean_t defer, dmu_tx_t *tx)
+{
+ int after_branch_point = FALSE;
+ dsl_pool_t *dp = ds->ds_dir->dd_pool;
+ objset_t *mos = dp->dp_meta_objset;
+ dsl_dataset_t *ds_prev = NULL;
+ uint64_t obj;
+
+ ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock));
+ rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
+ ASSERT3U(dsl_dataset_phys(ds)->ds_bp.blk_birth, <=, tx->tx_txg);
+ rrw_exit(&ds->ds_bp_rwlock, FTAG);
+ ASSERT(zfs_refcount_is_zero(&ds->ds_longholds));
+
+ if (defer &&
+ (ds->ds_userrefs > 0 ||
+ dsl_dataset_phys(ds)->ds_num_children > 1)) {
+ ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS);
+ dmu_buf_will_dirty(ds->ds_dbuf, tx);
+ dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_DEFER_DESTROY;
+ spa_history_log_internal_ds(ds, "defer_destroy", tx, " ");
+ return;
+ }
+
+ ASSERT3U(dsl_dataset_phys(ds)->ds_num_children, <=, 1);
+
+ /* We need to log before removing it from the namespace. */
+ spa_history_log_internal_ds(ds, "destroy", tx, " ");
+
+ dsl_scan_ds_destroyed(ds, tx);
+
+ obj = ds->ds_object;
+
+ boolean_t book_exists = dsl_bookmark_ds_destroyed(ds, tx);
+
+ for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
+ if (dsl_dataset_feature_is_active(ds, f))
+ dsl_dataset_deactivate_feature(ds, f, tx);
+ }
+ if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
+ ASSERT3P(ds->ds_prev, ==, NULL);
+ VERIFY0(dsl_dataset_hold_obj(dp,
+ dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &ds_prev));
+ after_branch_point =
+ (dsl_dataset_phys(ds_prev)->ds_next_snap_obj != obj);
+
+ dmu_buf_will_dirty(ds_prev->ds_dbuf, tx);
+ if (after_branch_point &&
+ dsl_dataset_phys(ds_prev)->ds_next_clones_obj != 0) {
+ dsl_dataset_remove_from_next_clones(ds_prev, obj, tx);
+ if (dsl_dataset_phys(ds)->ds_next_snap_obj != 0) {
+ VERIFY0(zap_add_int(mos,
+ dsl_dataset_phys(ds_prev)->
+ ds_next_clones_obj,
+ dsl_dataset_phys(ds)->ds_next_snap_obj,
+ tx));
+ }
+ }
+ if (!after_branch_point) {
+ dsl_dataset_phys(ds_prev)->ds_next_snap_obj =
+ dsl_dataset_phys(ds)->ds_next_snap_obj;
+ }
+ }
+
+ dsl_dataset_t *ds_next;
+ uint64_t old_unique;
+ uint64_t used = 0, comp = 0, uncomp = 0;
+
+ VERIFY0(dsl_dataset_hold_obj(dp,
+ dsl_dataset_phys(ds)->ds_next_snap_obj, FTAG, &ds_next));
+ ASSERT3U(dsl_dataset_phys(ds_next)->ds_prev_snap_obj, ==, obj);
+
+ old_unique = dsl_dataset_phys(ds_next)->ds_unique_bytes;
+
+ dmu_buf_will_dirty(ds_next->ds_dbuf, tx);
+ dsl_dataset_phys(ds_next)->ds_prev_snap_obj =
+ dsl_dataset_phys(ds)->ds_prev_snap_obj;
+ dsl_dataset_phys(ds_next)->ds_prev_snap_txg =
+ dsl_dataset_phys(ds)->ds_prev_snap_txg;
+ ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_txg, ==,
+ ds_prev ? dsl_dataset_phys(ds_prev)->ds_creation_txg : 0);
+
+ if (ds_next->ds_deadlist.dl_oldfmt) {
+ process_old_deadlist(ds, ds_prev, ds_next,
+ after_branch_point, tx);
+ } else {
+ /* Adjust prev's unique space. */
+ if (ds_prev && !after_branch_point) {
+ dsl_deadlist_space_range(&ds_next->ds_deadlist,
+ dsl_dataset_phys(ds_prev)->ds_prev_snap_txg,
+ dsl_dataset_phys(ds)->ds_prev_snap_txg,
+ &used, &comp, &uncomp);
+ dsl_dataset_phys(ds_prev)->ds_unique_bytes += used;
+ }
+
+ /* Adjust snapused. */
+ dsl_deadlist_space_range(&ds_next->ds_deadlist,
+ dsl_dataset_phys(ds)->ds_prev_snap_txg, UINT64_MAX,
+ &used, &comp, &uncomp);
+ dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP,
+ -used, -comp, -uncomp, tx);
+
+ /* Move blocks to be freed to pool's free list. */
+ dsl_deadlist_move_bpobj(&ds_next->ds_deadlist,
+ &dp->dp_free_bpobj, dsl_dataset_phys(ds)->ds_prev_snap_txg,
+ tx);
+ dsl_dir_diduse_space(tx->tx_pool->dp_free_dir,
+ DD_USED_HEAD, used, comp, uncomp, tx);
+
+ /* Merge our deadlist into next's and free it. */
+ dsl_deadlist_merge(&ds_next->ds_deadlist,
+ dsl_dataset_phys(ds)->ds_deadlist_obj, tx);
+
+ /*
+ * We are done with the deadlist tree (generated/used
+ * by dsl_deadlist_move_bpobj() and dsl_deadlist_merge()).
+ * Discard it to save memory.
+ */
+ dsl_deadlist_discard_tree(&ds_next->ds_deadlist);
+ }
+
+ dsl_deadlist_close(&ds->ds_deadlist);
+ dsl_deadlist_free(mos, dsl_dataset_phys(ds)->ds_deadlist_obj, tx);
+ dmu_buf_will_dirty(ds->ds_dbuf, tx);
+ dsl_dataset_phys(ds)->ds_deadlist_obj = 0;
+
+ dsl_destroy_snapshot_handle_remaps(ds, ds_next, tx);
+
+ if (!book_exists) {
+ /* Collapse range in clone heads */
+ dsl_dir_remove_clones_key(ds->ds_dir,
+ dsl_dataset_phys(ds)->ds_creation_txg, tx);
+ }
+
+ if (ds_next->ds_is_snapshot) {
+ dsl_dataset_t *ds_nextnext;
+
+ /*
+ * Update next's unique to include blocks which
+ * were previously shared by only this snapshot
+ * and it. Those blocks will be born after the
+ * prev snap and before this snap, and will have
+ * died after the next snap and before the one
+ * after that (ie. be on the snap after next's
+ * deadlist).
+ */
+ VERIFY0(dsl_dataset_hold_obj(dp,
+ dsl_dataset_phys(ds_next)->ds_next_snap_obj,
+ FTAG, &ds_nextnext));
+ dsl_deadlist_space_range(&ds_nextnext->ds_deadlist,
+ dsl_dataset_phys(ds)->ds_prev_snap_txg,
+ dsl_dataset_phys(ds)->ds_creation_txg,
+ &used, &comp, &uncomp);
+ dsl_dataset_phys(ds_next)->ds_unique_bytes += used;
+ dsl_dataset_rele(ds_nextnext, FTAG);
+ ASSERT3P(ds_next->ds_prev, ==, NULL);
+
+ /* Collapse range in this head. */
+ dsl_dataset_t *hds;
+ VERIFY0(dsl_dataset_hold_obj(dp,
+ dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj,
+ FTAG, &hds));
+ if (!book_exists) {
+ /* Collapse range in this head. */
+ dsl_deadlist_remove_key(&hds->ds_deadlist,
+ dsl_dataset_phys(ds)->ds_creation_txg, tx);
+ }
+ if (dsl_dataset_remap_deadlist_exists(hds)) {
+ dsl_deadlist_remove_key(&hds->ds_remap_deadlist,
+ dsl_dataset_phys(ds)->ds_creation_txg, tx);
+ }
+ dsl_dataset_rele(hds, FTAG);
+
+ } else {
+ ASSERT3P(ds_next->ds_prev, ==, ds);
+ dsl_dataset_rele(ds_next->ds_prev, ds_next);
+ ds_next->ds_prev = NULL;
+ if (ds_prev) {
+ VERIFY0(dsl_dataset_hold_obj(dp,
+ dsl_dataset_phys(ds)->ds_prev_snap_obj,
+ ds_next, &ds_next->ds_prev));
+ }
+
+ dsl_dataset_recalc_head_uniq(ds_next);
+
+ /*
+ * Reduce the amount of our unconsumed refreservation
+ * being charged to our parent by the amount of
+ * new unique data we have gained.
+ */
+ if (old_unique < ds_next->ds_reserved) {
+ int64_t mrsdelta;
+ uint64_t new_unique =
+ dsl_dataset_phys(ds_next)->ds_unique_bytes;
+
+ ASSERT(old_unique <= new_unique);
+ mrsdelta = MIN(new_unique - old_unique,
+ ds_next->ds_reserved - old_unique);
+ dsl_dir_diduse_space(ds->ds_dir,
+ DD_USED_REFRSRV, -mrsdelta, 0, 0, tx);
+ }
+ }
+ dsl_dataset_rele(ds_next, FTAG);
+
+ /*
+ * This must be done after the dsl_traverse(), because it will
+ * re-open the objset.
+ */
+ if (ds->ds_objset) {
+ dmu_objset_evict(ds->ds_objset);
+ ds->ds_objset = NULL;
+ }
+
+ /* remove from snapshot namespace */
+ dsl_dataset_t *ds_head;
+ ASSERT(dsl_dataset_phys(ds)->ds_snapnames_zapobj == 0);
+ VERIFY0(dsl_dataset_hold_obj(dp,
+ dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj, FTAG, &ds_head));
+ VERIFY0(dsl_dataset_get_snapname(ds));
+#ifdef ZFS_DEBUG
+ {
+ uint64_t val;
+ int err;
+
+ err = dsl_dataset_snap_lookup(ds_head,
+ ds->ds_snapname, &val);
+ ASSERT0(err);
+ ASSERT3U(val, ==, obj);
+ }
+#endif
+ VERIFY0(dsl_dataset_snap_remove(ds_head, ds->ds_snapname, tx, B_TRUE));
+ dsl_dataset_rele(ds_head, FTAG);
+
+ if (ds_prev != NULL)
+ dsl_dataset_rele(ds_prev, FTAG);
+
+ spa_prop_clear_bootfs(dp->dp_spa, ds->ds_object, tx);
+
+ if (dsl_dataset_phys(ds)->ds_next_clones_obj != 0) {
+ uint64_t count __maybe_unused;
+ ASSERT0(zap_count(mos,
+ dsl_dataset_phys(ds)->ds_next_clones_obj, &count) &&
+ count == 0);
+ VERIFY0(dmu_object_free(mos,
+ dsl_dataset_phys(ds)->ds_next_clones_obj, tx));
+ }
+ if (dsl_dataset_phys(ds)->ds_props_obj != 0)
+ VERIFY0(zap_destroy(mos, dsl_dataset_phys(ds)->ds_props_obj,
+ tx));
+ if (dsl_dataset_phys(ds)->ds_userrefs_obj != 0)
+ VERIFY0(zap_destroy(mos, dsl_dataset_phys(ds)->ds_userrefs_obj,
+ tx));
+ dsl_dir_rele(ds->ds_dir, ds);
+ ds->ds_dir = NULL;
+ dmu_object_free_zapified(mos, obj, tx);
+}
+
+void
+dsl_destroy_snapshot_sync(void *arg, dmu_tx_t *tx)
+{
+ dsl_destroy_snapshot_arg_t *ddsa = arg;
+ const char *dsname = ddsa->ddsa_name;
+ boolean_t defer = ddsa->ddsa_defer;
+
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ dsl_dataset_t *ds;
+
+ int error = dsl_dataset_hold(dp, dsname, FTAG, &ds);
+ if (error == ENOENT)
+ return;
+ ASSERT0(error);
+ dsl_destroy_snapshot_sync_impl(ds, defer, tx);
+ zvol_remove_minors(dp->dp_spa, dsname, B_TRUE);
+ dsl_dataset_rele(ds, FTAG);
+}
+
+/*
+ * The semantics of this function are described in the comment above
+ * lzc_destroy_snaps(). To summarize:
+ *
+ * The snapshots must all be in the same pool.
+ *
+ * Snapshots that don't exist will be silently ignored (considered to be
+ * "already deleted").
+ *
+ * On success, all snaps will be destroyed and this will return 0.
+ * On failure, no snaps will be destroyed, the errlist will be filled in,
+ * and this will return an errno.
+ */
+int
+dsl_destroy_snapshots_nvl(nvlist_t *snaps, boolean_t defer,
+ nvlist_t *errlist)
+{
+ if (nvlist_next_nvpair(snaps, NULL) == NULL)
+ return (0);
+
+ /*
+ * lzc_destroy_snaps() is documented to take an nvlist whose
+ * values "don't matter". We need to convert that nvlist to
+ * one that we know can be converted to LUA.
+ */
+ nvlist_t *snaps_normalized = fnvlist_alloc();
+ for (nvpair_t *pair = nvlist_next_nvpair(snaps, NULL);
+ pair != NULL; pair = nvlist_next_nvpair(snaps, pair)) {
+ fnvlist_add_boolean_value(snaps_normalized,
+ nvpair_name(pair), B_TRUE);
+ }
+
+ nvlist_t *arg = fnvlist_alloc();
+ fnvlist_add_nvlist(arg, "snaps", snaps_normalized);
+ fnvlist_free(snaps_normalized);
+ fnvlist_add_boolean_value(arg, "defer", defer);
+
+ nvlist_t *wrapper = fnvlist_alloc();
+ fnvlist_add_nvlist(wrapper, ZCP_ARG_ARGLIST, arg);
+ fnvlist_free(arg);
+
+ const char *program =
+ "arg = ...\n"
+ "snaps = arg['snaps']\n"
+ "defer = arg['defer']\n"
+ "errors = { }\n"
+ "has_errors = false\n"
+ "for snap, v in pairs(snaps) do\n"
+ " errno = zfs.check.destroy{snap, defer=defer}\n"
+ " zfs.debug('snap: ' .. snap .. ' errno: ' .. errno)\n"
+ " if errno == ENOENT then\n"
+ " snaps[snap] = nil\n"
+ " elseif errno ~= 0 then\n"
+ " errors[snap] = errno\n"
+ " has_errors = true\n"
+ " end\n"
+ "end\n"
+ "if has_errors then\n"
+ " return errors\n"
+ "end\n"
+ "for snap, v in pairs(snaps) do\n"
+ " errno = zfs.sync.destroy{snap, defer=defer}\n"
+ " assert(errno == 0)\n"
+ "end\n"
+ "return { }\n";
+
+ nvlist_t *result = fnvlist_alloc();
+ int error = zcp_eval(nvpair_name(nvlist_next_nvpair(snaps, NULL)),
+ program,
+ B_TRUE,
+ 0,
+ zfs_lua_max_memlimit,
+ fnvlist_lookup_nvpair(wrapper, ZCP_ARG_ARGLIST), result);
+ if (error != 0) {
+ char *errorstr = NULL;
+ (void) nvlist_lookup_string(result, ZCP_RET_ERROR, &errorstr);
+ if (errorstr != NULL) {
+ zfs_dbgmsg(errorstr);
+ }
+ fnvlist_free(wrapper);
+ fnvlist_free(result);
+ return (error);
+ }
+ fnvlist_free(wrapper);
+
+ /*
+ * lzc_destroy_snaps() is documented to fill the errlist with
+ * int32 values, so we need to convert the int64 values that are
+ * returned from LUA.
+ */
+ int rv = 0;
+ nvlist_t *errlist_raw = fnvlist_lookup_nvlist(result, ZCP_RET_RETURN);
+ for (nvpair_t *pair = nvlist_next_nvpair(errlist_raw, NULL);
+ pair != NULL; pair = nvlist_next_nvpair(errlist_raw, pair)) {
+ int32_t val = (int32_t)fnvpair_value_int64(pair);
+ if (rv == 0)
+ rv = val;
+ fnvlist_add_int32(errlist, nvpair_name(pair), val);
+ }
+ fnvlist_free(result);
+ return (rv);
+}
+
+int
+dsl_destroy_snapshot(const char *name, boolean_t defer)
+{
+ int error;
+ nvlist_t *nvl = fnvlist_alloc();
+ nvlist_t *errlist = fnvlist_alloc();
+
+ fnvlist_add_boolean(nvl, name);
+ error = dsl_destroy_snapshots_nvl(nvl, defer, errlist);
+ fnvlist_free(errlist);
+ fnvlist_free(nvl);
+ return (error);
+}
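
For reference, the user-space path into dsl_destroy_snapshots_nvl() goes through lzc_destroy_snaps() in libzfs_core (see the comment above dsl_destroy_snapshots_nvl()). A hedged sketch of such a caller, with example snapshot names and assuming the libzfs_core entry points lzc_destroy_snaps() and libzfs_core_init()/libzfs_core_fini():

#include <stdio.h>
#include <libzfs_core.h>

int
main(void)
{
	nvlist_t *snaps = fnvlist_alloc();
	nvlist_t *errlist = NULL;
	int err;

	/* Values in the snaps nvlist don't matter, only the names do. */
	fnvlist_add_boolean(snaps, "pool/fs@snap1");
	fnvlist_add_boolean(snaps, "pool/fs@snap2");

	if (libzfs_core_init() != 0) {
		fnvlist_free(snaps);
		return (1);
	}

	err = lzc_destroy_snaps(snaps, B_FALSE, &errlist);
	if (err != 0 && errlist != NULL) {
		nvpair_t *pair = NULL;

		/* Per-snapshot errno values, as filled in by the kernel. */
		while ((pair = nvlist_next_nvpair(errlist, pair)) != NULL)
			(void) fprintf(stderr, "%s: error %d\n",
			    nvpair_name(pair),
			    (int)fnvpair_value_int32(pair));
	}

	if (errlist != NULL)
		fnvlist_free(errlist);
	fnvlist_free(snaps);
	libzfs_core_fini();
	return (err == 0 ? 0 : 1);
}
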
+
+struct killarg {
+ dsl_dataset_t *ds;
+ dmu_tx_t *tx;
+};
+
+/* ARGSUSED */
+static int
+kill_blkptr(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
+ const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
+{
+ struct killarg *ka = arg;
+ dmu_tx_t *tx = ka->tx;
+
+ if (zb->zb_level == ZB_DNODE_LEVEL || BP_IS_HOLE(bp) ||
+ BP_IS_EMBEDDED(bp))
+ return (0);
+
+ if (zb->zb_level == ZB_ZIL_LEVEL) {
+ ASSERT(zilog != NULL);
+ /*
+ * It's a block in the intent log. It has no
+ * accounting, so just free it.
+ */
+ dsl_free(ka->tx->tx_pool, ka->tx->tx_txg, bp);
+ } else {
+ ASSERT(zilog == NULL);
+ ASSERT3U(bp->blk_birth, >,
+ dsl_dataset_phys(ka->ds)->ds_prev_snap_txg);
+ (void) dsl_dataset_block_kill(ka->ds, bp, tx, B_FALSE);
+ }
+
+ return (0);
+}
+
+static void
+old_synchronous_dataset_destroy(dsl_dataset_t *ds, dmu_tx_t *tx)
+{
+ struct killarg ka;
+
+ spa_history_log_internal_ds(ds, "destroy", tx,
+ "(synchronous, mintxg=%llu)",
+ (long long)dsl_dataset_phys(ds)->ds_prev_snap_txg);
+
+ /*
+ * Free everything that we point to (that's born after
+ * the previous snapshot, if we are a clone)
+ *
+ * NB: this should be very quick, because we already
+ * freed all the objects in open context.
+ */
+ ka.ds = ds;
+ ka.tx = tx;
+ VERIFY0(traverse_dataset(ds,
+ dsl_dataset_phys(ds)->ds_prev_snap_txg, TRAVERSE_POST |
+ TRAVERSE_NO_DECRYPT, kill_blkptr, &ka));
+ ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) ||
+ dsl_dataset_phys(ds)->ds_unique_bytes == 0);
+}
+
+int
+dsl_destroy_head_check_impl(dsl_dataset_t *ds, int expected_holds)
+{
+ int error;
+ uint64_t count;
+ objset_t *mos;
+
+ ASSERT(!ds->ds_is_snapshot);
+ if (ds->ds_is_snapshot)
+ return (SET_ERROR(EINVAL));
+
+ if (zfs_refcount_count(&ds->ds_longholds) != expected_holds)
+ return (SET_ERROR(EBUSY));
+
+ ASSERT0(ds->ds_dir->dd_activity_waiters);
+
+ mos = ds->ds_dir->dd_pool->dp_meta_objset;
+
+ /*
+ * Can't delete a head dataset if there are snapshots of it.
+ * (Except if the only snapshots are from the branch we cloned
+ * from.)
+ */
+ if (ds->ds_prev != NULL &&
+ dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj == ds->ds_object)
+ return (SET_ERROR(EBUSY));
+
+ /*
+ * Can't delete if there are children of this fs.
+ */
+ error = zap_count(mos,
+ dsl_dir_phys(ds->ds_dir)->dd_child_dir_zapobj, &count);
+ if (error != 0)
+ return (error);
+ if (count != 0)
+ return (SET_ERROR(EEXIST));
+
+ if (dsl_dir_is_clone(ds->ds_dir) && DS_IS_DEFER_DESTROY(ds->ds_prev) &&
+ dsl_dataset_phys(ds->ds_prev)->ds_num_children == 2 &&
+ ds->ds_prev->ds_userrefs == 0) {
+ /* We need to remove the origin snapshot as well. */
+ if (!zfs_refcount_is_zero(&ds->ds_prev->ds_longholds))
+ return (SET_ERROR(EBUSY));
+ }
+ return (0);
+}
+
+int
+dsl_destroy_head_check(void *arg, dmu_tx_t *tx)
+{
+ dsl_destroy_head_arg_t *ddha = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ dsl_dataset_t *ds;
+ int error;
+
+ error = dsl_dataset_hold(dp, ddha->ddha_name, FTAG, &ds);
+ if (error != 0)
+ return (error);
+
+ error = dsl_destroy_head_check_impl(ds, 0);
+ dsl_dataset_rele(ds, FTAG);
+ return (error);
+}
+
+static void
+dsl_dir_destroy_sync(uint64_t ddobj, dmu_tx_t *tx)
+{
+ dsl_dir_t *dd;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ objset_t *mos = dp->dp_meta_objset;
+ dd_used_t t;
+
+ ASSERT(RRW_WRITE_HELD(&dmu_tx_pool(tx)->dp_config_rwlock));
+
+ VERIFY0(dsl_dir_hold_obj(dp, ddobj, NULL, FTAG, &dd));
+
+ ASSERT0(dsl_dir_phys(dd)->dd_head_dataset_obj);
+
+ /* Decrement the filesystem count for all parent filesystems. */
+ if (dd->dd_parent != NULL)
+ dsl_fs_ss_count_adjust(dd->dd_parent, -1,
+ DD_FIELD_FILESYSTEM_COUNT, tx);
+
+ /*
+ * Remove our reservation. The impl() routine avoids setting the
+ * actual property, which would require the (already destroyed) ds.
+ */
+ dsl_dir_set_reservation_sync_impl(dd, 0, tx);
+
+ ASSERT0(dsl_dir_phys(dd)->dd_used_bytes);
+ ASSERT0(dsl_dir_phys(dd)->dd_reserved);
+ for (t = 0; t < DD_USED_NUM; t++)
+ ASSERT0(dsl_dir_phys(dd)->dd_used_breakdown[t]);
+
+ if (dd->dd_crypto_obj != 0) {
+ dsl_crypto_key_destroy_sync(dd->dd_crypto_obj, tx);
+ (void) spa_keystore_unload_wkey_impl(dp->dp_spa, dd->dd_object);
+ }
+
+ VERIFY0(zap_destroy(mos, dsl_dir_phys(dd)->dd_child_dir_zapobj, tx));
+ VERIFY0(zap_destroy(mos, dsl_dir_phys(dd)->dd_props_zapobj, tx));
+ if (dsl_dir_phys(dd)->dd_clones != 0)
+ VERIFY0(zap_destroy(mos, dsl_dir_phys(dd)->dd_clones, tx));
+ VERIFY0(dsl_deleg_destroy(mos, dsl_dir_phys(dd)->dd_deleg_zapobj, tx));
+ VERIFY0(zap_remove(mos,
+ dsl_dir_phys(dd->dd_parent)->dd_child_dir_zapobj,
+ dd->dd_myname, tx));
+
+ dsl_dir_rele(dd, FTAG);
+ dmu_object_free_zapified(mos, ddobj, tx);
+}
+
+static void
+dsl_clone_destroy_assert(dsl_dir_t *dd)
+{
+ uint64_t used, comp, uncomp;
+
+ ASSERT(dsl_dir_is_clone(dd));
+ dsl_deadlist_space(&dd->dd_livelist, &used, &comp, &uncomp);
+
+ ASSERT3U(dsl_dir_phys(dd)->dd_used_bytes, ==, used);
+ ASSERT3U(dsl_dir_phys(dd)->dd_compressed_bytes, ==, comp);
+ /*
+ * Greater than or equal because we do not track embedded block
+ * pointers in the livelist
+ */
+ ASSERT3U(dsl_dir_phys(dd)->dd_uncompressed_bytes, >=, uncomp);
+
+ ASSERT(list_is_empty(&dd->dd_pending_allocs.bpl_list));
+ ASSERT(list_is_empty(&dd->dd_pending_frees.bpl_list));
+}
+
+/*
+ * Start the delete process for a clone. Free its zil, verify the space usage
+ * and queue the blkptrs for deletion by adding the livelist to the pool-wide
+ * delete queue.
+ */
+static void
+dsl_async_clone_destroy(dsl_dataset_t *ds, dmu_tx_t *tx)
+{
+ uint64_t zap_obj, to_delete, used, comp, uncomp;
+ objset_t *os;
+ dsl_dir_t *dd = ds->ds_dir;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ objset_t *mos = dp->dp_meta_objset;
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+ VERIFY0(dmu_objset_from_ds(ds, &os));
+
+ uint64_t mintxg = 0;
+ dsl_deadlist_entry_t *dle = dsl_deadlist_first(&dd->dd_livelist);
+ if (dle != NULL)
+ mintxg = dle->dle_mintxg;
+
+ spa_history_log_internal_ds(ds, "destroy", tx,
+ "(livelist, mintxg=%llu)", (long long)mintxg);
+
+ /* Check that the clone is in a correct state to be deleted */
+ dsl_clone_destroy_assert(dd);
+
+ /* Destroy the zil */
+ zil_destroy_sync(dmu_objset_zil(os), tx);
+
+ VERIFY0(zap_lookup(mos, dd->dd_object,
+ DD_FIELD_LIVELIST, sizeof (uint64_t), 1, &to_delete));
+ /* Initialize deleted_clones entry to track livelists to cleanup */
+ int error = zap_lookup(mos, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_DELETED_CLONES, sizeof (uint64_t), 1, &zap_obj);
+ if (error == ENOENT) {
+ zap_obj = zap_create(mos, DMU_OTN_ZAP_METADATA,
+ DMU_OT_NONE, 0, tx);
+ VERIFY0(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_DELETED_CLONES, sizeof (uint64_t), 1,
+ &(zap_obj), tx));
+ spa->spa_livelists_to_delete = zap_obj;
+ } else if (error != 0) {
+ zfs_panic_recover("zfs: error %d was returned while looking "
+ "up DMU_POOL_DELETED_CLONES in the zap", error);
+ return;
+ }
+ VERIFY0(zap_add_int(mos, zap_obj, to_delete, tx));
+
+ /* Clone is no longer using space, now tracked by dp_free_dir */
+ dsl_deadlist_space(&dd->dd_livelist, &used, &comp, &uncomp);
+ dsl_dir_diduse_space(dd, DD_USED_HEAD,
+ -used, -comp, -dsl_dir_phys(dd)->dd_uncompressed_bytes,
+ tx);
+ dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD,
+ used, comp, uncomp, tx);
+ dsl_dir_remove_livelist(dd, tx, B_FALSE);
+ zthr_wakeup(spa->spa_livelist_delete_zthr);
+}
+
+/*
+ * Move the bptree into the pool's list of trees to clean up, update space
+ * accounting information and destroy the zil.
+ */
+static void
+dsl_async_dataset_destroy(dsl_dataset_t *ds, dmu_tx_t *tx)
+{
+ uint64_t used, comp, uncomp;
+ objset_t *os;
+
+ VERIFY0(dmu_objset_from_ds(ds, &os));
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ objset_t *mos = dp->dp_meta_objset;
+
+ spa_history_log_internal_ds(ds, "destroy", tx,
+ "(bptree, mintxg=%llu)",
+ (long long)dsl_dataset_phys(ds)->ds_prev_snap_txg);
+
+ zil_destroy_sync(dmu_objset_zil(os), tx);
+
+ if (!spa_feature_is_active(dp->dp_spa,
+ SPA_FEATURE_ASYNC_DESTROY)) {
+ dsl_scan_t *scn = dp->dp_scan;
+ spa_feature_incr(dp->dp_spa, SPA_FEATURE_ASYNC_DESTROY,
+ tx);
+ dp->dp_bptree_obj = bptree_alloc(mos, tx);
+ VERIFY0(zap_add(mos,
+ DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1,
+ &dp->dp_bptree_obj, tx));
+ ASSERT(!scn->scn_async_destroying);
+ scn->scn_async_destroying = B_TRUE;
+ }
+
+ used = dsl_dir_phys(ds->ds_dir)->dd_used_bytes;
+ comp = dsl_dir_phys(ds->ds_dir)->dd_compressed_bytes;
+ uncomp = dsl_dir_phys(ds->ds_dir)->dd_uncompressed_bytes;
+
+ ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) ||
+ dsl_dataset_phys(ds)->ds_unique_bytes == used);
+
+ rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
+ bptree_add(mos, dp->dp_bptree_obj,
+ &dsl_dataset_phys(ds)->ds_bp,
+ dsl_dataset_phys(ds)->ds_prev_snap_txg,
+ used, comp, uncomp, tx);
+ rrw_exit(&ds->ds_bp_rwlock, FTAG);
+ dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD,
+ -used, -comp, -uncomp, tx);
+ dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD,
+ used, comp, uncomp, tx);
+}
+
+void
+dsl_destroy_head_sync_impl(dsl_dataset_t *ds, dmu_tx_t *tx)
+{
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ objset_t *mos = dp->dp_meta_objset;
+ uint64_t obj, ddobj, prevobj = 0;
+ boolean_t rmorigin;
+
+ ASSERT3U(dsl_dataset_phys(ds)->ds_num_children, <=, 1);
+ ASSERT(ds->ds_prev == NULL ||
+ dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj != ds->ds_object);
+ rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
+ ASSERT3U(dsl_dataset_phys(ds)->ds_bp.blk_birth, <=, tx->tx_txg);
+ rrw_exit(&ds->ds_bp_rwlock, FTAG);
+ ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock));
+
+ dsl_dir_cancel_waiters(ds->ds_dir);
+
+ rmorigin = (dsl_dir_is_clone(ds->ds_dir) &&
+ DS_IS_DEFER_DESTROY(ds->ds_prev) &&
+ dsl_dataset_phys(ds->ds_prev)->ds_num_children == 2 &&
+ ds->ds_prev->ds_userrefs == 0);
+
+ /* Remove our reservation. */
+ if (ds->ds_reserved != 0) {
+ dsl_dataset_set_refreservation_sync_impl(ds,
+ (ZPROP_SRC_NONE | ZPROP_SRC_LOCAL | ZPROP_SRC_RECEIVED),
+ 0, tx);
+ ASSERT0(ds->ds_reserved);
+ }
+
+ obj = ds->ds_object;
+
+ for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
+ if (dsl_dataset_feature_is_active(ds, f))
+ dsl_dataset_deactivate_feature(ds, f, tx);
+ }
+
+ dsl_scan_ds_destroyed(ds, tx);
+
+ if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
+ /* This is a clone */
+ ASSERT(ds->ds_prev != NULL);
+ ASSERT3U(dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj, !=,
+ obj);
+ ASSERT0(dsl_dataset_phys(ds)->ds_next_snap_obj);
+
+ dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
+ if (dsl_dataset_phys(ds->ds_prev)->ds_next_clones_obj != 0) {
+ dsl_dataset_remove_from_next_clones(ds->ds_prev,
+ obj, tx);
+ }
+
+ ASSERT3U(dsl_dataset_phys(ds->ds_prev)->ds_num_children, >, 1);
+ dsl_dataset_phys(ds->ds_prev)->ds_num_children--;
+ }
+
+ /*
+ * Destroy the deadlist. Unless it's a clone, the
+ * deadlist should be empty since the dataset has no snapshots.
+ * (If it's a clone, it's safe to ignore the deadlist contents
+ * since they are still referenced by the origin snapshot.)
+ */
+ dsl_deadlist_close(&ds->ds_deadlist);
+ dsl_deadlist_free(mos, dsl_dataset_phys(ds)->ds_deadlist_obj, tx);
+ dmu_buf_will_dirty(ds->ds_dbuf, tx);
+ dsl_dataset_phys(ds)->ds_deadlist_obj = 0;
+
+ if (dsl_dataset_remap_deadlist_exists(ds))
+ dsl_dataset_destroy_remap_deadlist(ds, tx);
+
+ /*
+ * Each destroy is responsible for destroying (or enqueuing for
+ * destruction) both the blkptrs comprising the dataset and
+ * those belonging to the zil.
+ */
+ if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist)) {
+ dsl_async_clone_destroy(ds, tx);
+ } else if (spa_feature_is_enabled(dp->dp_spa,
+ SPA_FEATURE_ASYNC_DESTROY)) {
+ dsl_async_dataset_destroy(ds, tx);
+ } else {
+ old_synchronous_dataset_destroy(ds, tx);
+ }
+
+ if (ds->ds_prev != NULL) {
+ if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
+ VERIFY0(zap_remove_int(mos,
+ dsl_dir_phys(ds->ds_prev->ds_dir)->dd_clones,
+ ds->ds_object, tx));
+ }
+ prevobj = ds->ds_prev->ds_object;
+ dsl_dataset_rele(ds->ds_prev, ds);
+ ds->ds_prev = NULL;
+ }
+
+ /*
+ * This must be done after the dsl_traverse(), because it will
+ * re-open the objset.
+ */
+ if (ds->ds_objset) {
+ dmu_objset_evict(ds->ds_objset);
+ ds->ds_objset = NULL;
+ }
+
+ /* Erase the link in the dir */
+ dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx);
+ dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj = 0;
+ ddobj = ds->ds_dir->dd_object;
+ ASSERT(dsl_dataset_phys(ds)->ds_snapnames_zapobj != 0);
+ VERIFY0(zap_destroy(mos,
+ dsl_dataset_phys(ds)->ds_snapnames_zapobj, tx));
+
+ if (ds->ds_bookmarks_obj != 0) {
+ void *cookie = NULL;
+ dsl_bookmark_node_t *dbn;
+
+ while ((dbn = avl_destroy_nodes(&ds->ds_bookmarks, &cookie)) !=
+ NULL) {
+ if (dbn->dbn_phys.zbm_redaction_obj != 0) {
+ VERIFY0(dmu_object_free(mos,
+ dbn->dbn_phys.zbm_redaction_obj, tx));
+ spa_feature_decr(dmu_objset_spa(mos),
+ SPA_FEATURE_REDACTION_BOOKMARKS, tx);
+ }
+ if (dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN) {
+ spa_feature_decr(dmu_objset_spa(mos),
+ SPA_FEATURE_BOOKMARK_WRITTEN, tx);
+ }
+ spa_strfree(dbn->dbn_name);
+ mutex_destroy(&dbn->dbn_lock);
+ kmem_free(dbn, sizeof (*dbn));
+ }
+ avl_destroy(&ds->ds_bookmarks);
+ VERIFY0(zap_destroy(mos, ds->ds_bookmarks_obj, tx));
+ spa_feature_decr(dp->dp_spa, SPA_FEATURE_BOOKMARKS, tx);
+ }
+
+ spa_prop_clear_bootfs(dp->dp_spa, ds->ds_object, tx);
+
+ ASSERT0(dsl_dataset_phys(ds)->ds_next_clones_obj);
+ ASSERT0(dsl_dataset_phys(ds)->ds_props_obj);
+ ASSERT0(dsl_dataset_phys(ds)->ds_userrefs_obj);
+ dsl_dir_rele(ds->ds_dir, ds);
+ ds->ds_dir = NULL;
+ dmu_object_free_zapified(mos, obj, tx);
+
+ dsl_dir_destroy_sync(ddobj, tx);
+
+ if (rmorigin) {
+ dsl_dataset_t *prev;
+ VERIFY0(dsl_dataset_hold_obj(dp, prevobj, FTAG, &prev));
+ dsl_destroy_snapshot_sync_impl(prev, B_FALSE, tx);
+ dsl_dataset_rele(prev, FTAG);
+ }
+}
+
+void
+dsl_destroy_head_sync(void *arg, dmu_tx_t *tx)
+{
+ dsl_destroy_head_arg_t *ddha = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ dsl_dataset_t *ds;
+
+ VERIFY0(dsl_dataset_hold(dp, ddha->ddha_name, FTAG, &ds));
+ dsl_destroy_head_sync_impl(ds, tx);
+ zvol_remove_minors(dp->dp_spa, ddha->ddha_name, B_TRUE);
+ dsl_dataset_rele(ds, FTAG);
+}
+
+static void
+dsl_destroy_head_begin_sync(void *arg, dmu_tx_t *tx)
+{
+ dsl_destroy_head_arg_t *ddha = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ dsl_dataset_t *ds;
+
+ VERIFY0(dsl_dataset_hold(dp, ddha->ddha_name, FTAG, &ds));
+
+ /* Mark it as inconsistent on-disk, in case we crash */
+ dmu_buf_will_dirty(ds->ds_dbuf, tx);
+ dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_INCONSISTENT;
+
+ spa_history_log_internal_ds(ds, "destroy begin", tx, " ");
+ dsl_dataset_rele(ds, FTAG);
+}
+
+int
+dsl_destroy_head(const char *name)
+{
+ dsl_destroy_head_arg_t ddha;
+ int error;
+ spa_t *spa;
+ boolean_t isenabled;
+
+#ifdef _KERNEL
+ zfs_destroy_unmount_origin(name);
+#endif
+
+ error = spa_open(name, &spa, FTAG);
+ if (error != 0)
+ return (error);
+ isenabled = spa_feature_is_enabled(spa, SPA_FEATURE_ASYNC_DESTROY);
+ spa_close(spa, FTAG);
+
+ ddha.ddha_name = name;
+
+ if (!isenabled) {
+ objset_t *os;
+
+ error = dsl_sync_task(name, dsl_destroy_head_check,
+ dsl_destroy_head_begin_sync, &ddha,
+ 0, ZFS_SPACE_CHECK_DESTROY);
+ if (error != 0)
+ return (error);
+
+ /*
+ * Head deletion is processed in one txg on old pools;
+ * remove the objects from open context so that the txg sync
+ * is not too long. This optimization can only work for
+ * encrypted datasets if the wrapping key is loaded.
+ */
+ error = dmu_objset_own(name, DMU_OST_ANY, B_FALSE, B_TRUE,
+ FTAG, &os);
+ if (error == 0) {
+ uint64_t prev_snap_txg =
+ dsl_dataset_phys(dmu_objset_ds(os))->
+ ds_prev_snap_txg;
+ for (uint64_t obj = 0; error == 0;
+ error = dmu_object_next(os, &obj, FALSE,
+ prev_snap_txg))
+ (void) dmu_free_long_object(os, obj);
+ /* sync out all frees */
+ txg_wait_synced(dmu_objset_pool(os), 0);
+ dmu_objset_disown(os, B_TRUE, FTAG);
+ }
+ }
+
+ return (dsl_sync_task(name, dsl_destroy_head_check,
+ dsl_destroy_head_sync, &ddha, 0, ZFS_SPACE_CHECK_DESTROY));
+}
+
+/*
+ * Note, this function is used as the callback for dmu_objset_find(). We
+ * always return 0 so that we will continue to find and process
+ * inconsistent datasets, even if we encounter an error trying to
+ * process one of them.
+ */
+/* ARGSUSED */
+int
+dsl_destroy_inconsistent(const char *dsname, void *arg)
+{
+ objset_t *os;
+
+ if (dmu_objset_hold(dsname, FTAG, &os) == 0) {
+ boolean_t need_destroy = DS_IS_INCONSISTENT(dmu_objset_ds(os));
+
+ /*
+ * If the dataset is inconsistent because a resumable receive
+ * has failed, then do not destroy it.
+ */
+ if (dsl_dataset_has_resume_receive_state(dmu_objset_ds(os)))
+ need_destroy = B_FALSE;
+
+ dmu_objset_rele(os, FTAG);
+ if (need_destroy)
+ (void) dsl_destroy_head(dsname);
+ }
+ return (0);
+}
+
+
+#if defined(_KERNEL)
+EXPORT_SYMBOL(dsl_destroy_head);
+EXPORT_SYMBOL(dsl_destroy_head_sync_impl);
+EXPORT_SYMBOL(dsl_dataset_user_hold_check_one);
+EXPORT_SYMBOL(dsl_destroy_snapshot_sync_impl);
+EXPORT_SYMBOL(dsl_destroy_inconsistent);
+EXPORT_SYMBOL(dsl_dataset_user_release_tmp);
+EXPORT_SYMBOL(dsl_destroy_head_check_impl);
+#endif
diff --git a/sys/contrib/openzfs/module/zfs/dsl_dir.c b/sys/contrib/openzfs/module/zfs/dsl_dir.c
new file mode 100644
index 000000000000..90dd787023be
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/dsl_dir.c
@@ -0,0 +1,2403 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2013 Martin Matuska. All rights reserved.
+ * Copyright (c) 2014 Joyent, Inc. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright (c) 2016 Actifio, Inc. All rights reserved.
+ * Copyright (c) 2018, loli10K <ezomori.nozomu@gmail.com>. All rights reserved.
+ */
+
+#include <sys/dmu.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_tx.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_prop.h>
+#include <sys/dsl_synctask.h>
+#include <sys/dsl_deleg.h>
+#include <sys/dmu_impl.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/metaslab.h>
+#include <sys/zap.h>
+#include <sys/zio.h>
+#include <sys/arc.h>
+#include <sys/sunddi.h>
+#include <sys/zfeature.h>
+#include <sys/policy.h>
+#include <sys/zfs_vfsops.h>
+#include <sys/zfs_znode.h>
+#include <sys/zvol.h>
+#include <sys/zthr.h>
+#include "zfs_namecheck.h"
+#include "zfs_prop.h"
+
+/*
+ * Filesystem and Snapshot Limits
+ * ------------------------------
+ *
+ * These limits are used to restrict the number of filesystems and/or snapshots
+ * that can be created at a given level in the tree or below. A typical
+ * use-case is with a delegated dataset where the administrator wants to ensure
+ * that a user within the zone is not creating too many additional filesystems
+ * or snapshots, even though they're not exceeding their space quota.
+ *
+ * The filesystem and snapshot counts are stored as extensible properties. This
+ * capability is controlled by a feature flag and must be enabled to be used.
+ * Once enabled, the feature is not active until the first limit is set. At
+ * that point, future operations to create/destroy filesystems or snapshots
+ * will validate and update the counts.
+ *
+ * Because the count properties will not exist before the feature is active,
+ * the counts are updated when a limit is first set on an uninitialized
+ * dsl_dir node in the tree (The filesystem/snapshot count on a node includes
+ * all of the nested filesystems/snapshots. Thus, a new leaf node has a
+ * filesystem count of 0 and a snapshot count of 0. Non-existent filesystem and
+ * snapshot count properties on a node indicate uninitialized counts on that
+ * node.) When first setting a limit on an uninitialized node, the code starts
+ * at the filesystem with the new limit and descends into all sub-filesystems
+ * to add the count properties.
+ *
+ * In practice this is lightweight since a limit is typically set when the
+ * filesystem is created and thus has no children. Once valid, changing the
+ * limit value won't require a re-traversal since the counts are already valid.
+ * When recursively fixing the counts, if a node with a limit is encountered
+ * during the descent, the counts are known to be valid and there is no need to
+ * descend into that filesystem's children. The counts on filesystems above the
+ * one with the new limit will still be uninitialized, unless a limit is
+ * eventually set on one of those filesystems. Setting a limit always
+ * recursively initializes the counts, unless they have already been
+ * initialized by an earlier limit.
+ * When a new limit value is set on a filesystem with an existing limit, it is
+ * possible for the new limit to be less than the current count at that level
+ * since a user who can change the limit is also allowed to exceed the limit.
+ *
+ * Once the feature is active, then whenever a filesystem or snapshot is
+ * created, the code recurses up the tree, validating the new count against the
+ * limit at each initialized level. In practice, most levels will not have a
+ * limit set. If there is a limit at any initialized level up the tree, the
+ * check must pass or the creation will fail. Likewise, when a filesystem or
+ * snapshot is destroyed, the counts are recursively adjusted all the way up
+ * the initialized nodes in the tree. Renaming a filesystem to a different
+ * point in the tree will first validate, then update the counts on each
+ * branch up to the common ancestor. A receive will also validate the counts
+ * and then update them.
+ *
+ * An exception to the above behavior is that the limit is not enforced if the
+ * user has permission to modify the limit. This is primarily so that
+ * recursive snapshots in the global zone always work. We want to prevent a
+ * denial-of-service in which a lower level delegated dataset could max out its
+ * limit and thus block recursive snapshots from being taken in the global zone.
+ * Because of this, it is possible for the snapshot count to be over the limit
+ * and snapshots taken in the global zone could cause a lower level dataset to
+ * hit or exceed its limit. The administrator taking the global zone recursive
+ * snapshot should be aware of this side-effect and behave accordingly.
+ * For consistency, the filesystem limit is also not enforced if the user can
+ * modify the limit.
+ *
+ * The filesystem and snapshot limits are validated by dsl_fs_ss_limit_check()
+ * and updated by dsl_fs_ss_count_adjust(). A new limit value is setup in
+ * dsl_dir_activate_fs_ss_limit() and the counts are adjusted, if necessary, by
+ * dsl_dir_init_fs_ss_count().
+ */
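+
+/*
+ * For example (illustrative names and numbers only): given pool/a with
+ * children pool/a/b and pool/a/c, where pool/a/c has one snapshot,
+ * initializing the counts at pool/a yields:
+ *
+ *   pool/a     filesystem count = 2   snapshot count = 1
+ *   pool/a/b   filesystem count = 0   snapshot count = 0
+ *   pool/a/c   filesystem count = 0   snapshot count = 1
+ *
+ * A later snapshot of pool/a/b is then validated against any snapshot limit
+ * on pool/a/b and pool/a before the snapshot counts on both are incremented.
+ */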
+
+extern inline dsl_dir_phys_t *dsl_dir_phys(dsl_dir_t *dd);
+
+static uint64_t dsl_dir_space_towrite(dsl_dir_t *dd);
+
+typedef struct ddulrt_arg {
+ dsl_dir_t *ddulrta_dd;
+ uint64_t ddlrta_txg;
+} ddulrt_arg_t;
+
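+/*
+ * dmu_buf user-eviction callback: tear down the in-core dsl_dir_t once its
+ * bonus buffer has been evicted. This runs from the eviction taskq, so the
+ * parent dir and spa references are dropped with the async variants.
+ */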
+static void
+dsl_dir_evict_async(void *dbu)
+{
+ dsl_dir_t *dd = dbu;
+ int t;
+ dsl_pool_t *dp __maybe_unused = dd->dd_pool;
+
+ dd->dd_dbuf = NULL;
+
+ for (t = 0; t < TXG_SIZE; t++) {
+ ASSERT(!txg_list_member(&dp->dp_dirty_dirs, dd, t));
+ ASSERT(dd->dd_tempreserved[t] == 0);
+ ASSERT(dd->dd_space_towrite[t] == 0);
+ }
+
+ if (dd->dd_parent)
+ dsl_dir_async_rele(dd->dd_parent, dd);
+
+ spa_async_close(dd->dd_pool->dp_spa, dd);
+
+ if (dsl_deadlist_is_open(&dd->dd_livelist))
+ dsl_dir_livelist_close(dd);
+
+ dsl_prop_fini(dd);
+ cv_destroy(&dd->dd_activity_cv);
+ mutex_destroy(&dd->dd_activity_lock);
+ mutex_destroy(&dd->dd_lock);
+ kmem_free(dd, sizeof (dsl_dir_t));
+}
+
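+/*
+ * Hold the dsl_dir_t for directory object "ddobj". On the first hold the
+ * in-core dsl_dir_t is constructed from the MOS and cached as the bonus
+ * buffer's user data. "tail", if non-NULL, supplies the dir's name and
+ * avoids a reverse lookup in the parent's child-dir ZAP.
+ */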
+int
+dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj,
+ const char *tail, void *tag, dsl_dir_t **ddp)
+{
+ dmu_buf_t *dbuf;
+ dsl_dir_t *dd;
+ dmu_object_info_t doi;
+ int err;
+
+ ASSERT(dsl_pool_config_held(dp));
+
+ err = dmu_bonus_hold(dp->dp_meta_objset, ddobj, tag, &dbuf);
+ if (err != 0)
+ return (err);
+ dd = dmu_buf_get_user(dbuf);
+
+ dmu_object_info_from_db(dbuf, &doi);
+ ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_DSL_DIR);
+ ASSERT3U(doi.doi_bonus_size, >=, sizeof (dsl_dir_phys_t));
+
+ if (dd == NULL) {
+ dsl_dir_t *winner;
+
+ dd = kmem_zalloc(sizeof (dsl_dir_t), KM_SLEEP);
+ dd->dd_object = ddobj;
+ dd->dd_dbuf = dbuf;
+ dd->dd_pool = dp;
+
+ mutex_init(&dd->dd_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&dd->dd_activity_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&dd->dd_activity_cv, NULL, CV_DEFAULT, NULL);
+ dsl_prop_init(dd);
+
+ if (dsl_dir_is_zapified(dd)) {
+ err = zap_lookup(dp->dp_meta_objset,
+ ddobj, DD_FIELD_CRYPTO_KEY_OBJ,
+ sizeof (uint64_t), 1, &dd->dd_crypto_obj);
+ if (err == 0) {
+ /* check for on-disk format errata */
+ if (dsl_dir_incompatible_encryption_version(
+ dd)) {
+ dp->dp_spa->spa_errata =
+ ZPOOL_ERRATA_ZOL_6845_ENCRYPTION;
+ }
+ } else if (err != ENOENT) {
+ goto errout;
+ }
+ }
+
+ dsl_dir_snap_cmtime_update(dd);
+
+ if (dsl_dir_phys(dd)->dd_parent_obj) {
+ err = dsl_dir_hold_obj(dp,
+ dsl_dir_phys(dd)->dd_parent_obj, NULL, dd,
+ &dd->dd_parent);
+ if (err != 0)
+ goto errout;
+ if (tail) {
+#ifdef ZFS_DEBUG
+ uint64_t foundobj;
+
+ err = zap_lookup(dp->dp_meta_objset,
+ dsl_dir_phys(dd->dd_parent)->
+ dd_child_dir_zapobj, tail,
+ sizeof (foundobj), 1, &foundobj);
+ ASSERT(err || foundobj == ddobj);
+#endif
+ (void) strlcpy(dd->dd_myname, tail,
+ sizeof (dd->dd_myname));
+ } else {
+ err = zap_value_search(dp->dp_meta_objset,
+ dsl_dir_phys(dd->dd_parent)->
+ dd_child_dir_zapobj,
+ ddobj, 0, dd->dd_myname);
+ }
+ if (err != 0)
+ goto errout;
+ } else {
+ (void) strlcpy(dd->dd_myname, spa_name(dp->dp_spa),
+ sizeof (dd->dd_myname));
+ }
+
+ if (dsl_dir_is_clone(dd)) {
+ dmu_buf_t *origin_bonus;
+ dsl_dataset_phys_t *origin_phys;
+
+ /*
+ * We can't open the origin dataset, because
+ * that would require opening this dsl_dir.
+ * Just look at its phys directly instead.
+ */
+ err = dmu_bonus_hold(dp->dp_meta_objset,
+ dsl_dir_phys(dd)->dd_origin_obj, FTAG,
+ &origin_bonus);
+ if (err != 0)
+ goto errout;
+ origin_phys = origin_bonus->db_data;
+ dd->dd_origin_txg =
+ origin_phys->ds_creation_txg;
+ dmu_buf_rele(origin_bonus, FTAG);
+ if (dsl_dir_is_zapified(dd)) {
+ uint64_t obj;
+ err = zap_lookup(dp->dp_meta_objset,
+ dd->dd_object, DD_FIELD_LIVELIST,
+ sizeof (uint64_t), 1, &obj);
+ if (err == 0)
+ dsl_dir_livelist_open(dd, obj);
+ else if (err != ENOENT)
+ goto errout;
+ }
+ }
+
+ dmu_buf_init_user(&dd->dd_dbu, NULL, dsl_dir_evict_async,
+ &dd->dd_dbuf);
+ winner = dmu_buf_set_user_ie(dbuf, &dd->dd_dbu);
+ if (winner != NULL) {
+ if (dd->dd_parent)
+ dsl_dir_rele(dd->dd_parent, dd);
+ if (dsl_deadlist_is_open(&dd->dd_livelist))
+ dsl_dir_livelist_close(dd);
+ dsl_prop_fini(dd);
+ cv_destroy(&dd->dd_activity_cv);
+ mutex_destroy(&dd->dd_activity_lock);
+ mutex_destroy(&dd->dd_lock);
+ kmem_free(dd, sizeof (dsl_dir_t));
+ dd = winner;
+ } else {
+ spa_open_ref(dp->dp_spa, dd);
+ }
+ }
+
+ /*
+ * The dsl_dir_t has both open-to-close and instantiate-to-evict
+ * holds on the spa. We need the open-to-close holds because
+ * otherwise the spa_refcnt wouldn't change when we open a
+ * dir which the spa also has open, so we could incorrectly
+ * think it was OK to unload/export/destroy the pool. We need
+ * the instantiate-to-evict hold because the dsl_dir_t has a
+ * pointer to the dd_pool, which has a pointer to the spa_t.
+ */
+ spa_open_ref(dp->dp_spa, tag);
+ ASSERT3P(dd->dd_pool, ==, dp);
+ ASSERT3U(dd->dd_object, ==, ddobj);
+ ASSERT3P(dd->dd_dbuf, ==, dbuf);
+ *ddp = dd;
+ return (0);
+
+errout:
+ if (dd->dd_parent)
+ dsl_dir_rele(dd->dd_parent, dd);
+ if (dsl_deadlist_is_open(&dd->dd_livelist))
+ dsl_dir_livelist_close(dd);
+ dsl_prop_fini(dd);
+ cv_destroy(&dd->dd_activity_cv);
+ mutex_destroy(&dd->dd_activity_lock);
+ mutex_destroy(&dd->dd_lock);
+ kmem_free(dd, sizeof (dsl_dir_t));
+ dmu_buf_rele(dbuf, tag);
+ return (err);
+}
+
+void
+dsl_dir_rele(dsl_dir_t *dd, void *tag)
+{
+ dprintf_dd(dd, "%s\n", "");
+ spa_close(dd->dd_pool->dp_spa, tag);
+ dmu_buf_rele(dd->dd_dbuf, tag);
+}
+
+/*
+ * Remove a reference to the given dsl dir that is being asynchronously
+ * released. Async releases occur from a taskq performing eviction of
+ * dsl datasets and dirs. This process is identical to a normal release
+ * with the exception of using the async API for releasing the reference on
+ * the spa.
+ */
+void
+dsl_dir_async_rele(dsl_dir_t *dd, void *tag)
+{
+ dprintf_dd(dd, "%s\n", "");
+ spa_async_close(dd->dd_pool->dp_spa, tag);
+ dmu_buf_rele(dd->dd_dbuf, tag);
+}
+
+/* buf must be at least ZFS_MAX_DATASET_NAME_LEN bytes */
+void
+dsl_dir_name(dsl_dir_t *dd, char *buf)
+{
+ if (dd->dd_parent) {
+ dsl_dir_name(dd->dd_parent, buf);
+ VERIFY3U(strlcat(buf, "/", ZFS_MAX_DATASET_NAME_LEN), <,
+ ZFS_MAX_DATASET_NAME_LEN);
+ } else {
+ buf[0] = '\0';
+ }
+ if (!MUTEX_HELD(&dd->dd_lock)) {
+ /*
+ * recursive mutex so that we can use
+ * dprintf_dd() with dd_lock held
+ */
+ mutex_enter(&dd->dd_lock);
+ VERIFY3U(strlcat(buf, dd->dd_myname, ZFS_MAX_DATASET_NAME_LEN),
+ <, ZFS_MAX_DATASET_NAME_LEN);
+ mutex_exit(&dd->dd_lock);
+ } else {
+ VERIFY3U(strlcat(buf, dd->dd_myname, ZFS_MAX_DATASET_NAME_LEN),
+ <, ZFS_MAX_DATASET_NAME_LEN);
+ }
+}
+
+/* Calculate name length, avoiding all the strcat calls of dsl_dir_name */
+int
+dsl_dir_namelen(dsl_dir_t *dd)
+{
+ int result = 0;
+
+ if (dd->dd_parent) {
+ /* parent's name + 1 for the "/" */
+ result = dsl_dir_namelen(dd->dd_parent) + 1;
+ }
+
+ if (!MUTEX_HELD(&dd->dd_lock)) {
+ /* see dsl_dir_name */
+ mutex_enter(&dd->dd_lock);
+ result += strlen(dd->dd_myname);
+ mutex_exit(&dd->dd_lock);
+ } else {
+ result += strlen(dd->dd_myname);
+ }
+
+ return (result);
+}
+
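+/*
+ * Copy the first component of "path" (the text up to the first '/' or '@')
+ * into "component", and set *nextp to the remainder of the name (beginning
+ * with '@' if the next component is a snapshot), or to NULL if this was the
+ * last component. Returns ENOENT for an empty path, EINVAL for a malformed
+ * name, and ENAMETOOLONG if a component is too long.
+ */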
+static int
+getcomponent(const char *path, char *component, const char **nextp)
+{
+ char *p;
+
+ if ((path == NULL) || (path[0] == '\0'))
+ return (SET_ERROR(ENOENT));
+ /* This would be a good place to reserve some namespace... */
+ p = strpbrk(path, "/@");
+ if (p && (p[1] == '/' || p[1] == '@')) {
+ /* two separators in a row */
+ return (SET_ERROR(EINVAL));
+ }
+ if (p == NULL || p == path) {
+ /*
+ * if the first thing is an @ or /, it had better be an
+ * @ and it had better not have any more ats or slashes,
+ * and it had better have something after the @.
+ */
+ if (p != NULL &&
+ (p[0] != '@' || strpbrk(path+1, "/@") || p[1] == '\0'))
+ return (SET_ERROR(EINVAL));
+ if (strlen(path) >= ZFS_MAX_DATASET_NAME_LEN)
+ return (SET_ERROR(ENAMETOOLONG));
+ (void) strlcpy(component, path, ZFS_MAX_DATASET_NAME_LEN);
+ p = NULL;
+ } else if (p[0] == '/') {
+ if (p - path >= ZFS_MAX_DATASET_NAME_LEN)
+ return (SET_ERROR(ENAMETOOLONG));
+ (void) strncpy(component, path, p - path);
+ component[p - path] = '\0';
+ p++;
+ } else if (p[0] == '@') {
+ /*
+ * if the next separator is an @, there better not be
+ * any more slashes.
+ */
+ if (strchr(path, '/'))
+ return (SET_ERROR(EINVAL));
+ if (p - path >= ZFS_MAX_DATASET_NAME_LEN)
+ return (SET_ERROR(ENAMETOOLONG));
+ (void) strncpy(component, path, p - path);
+ component[p - path] = '\0';
+ } else {
+ panic("invalid p=%p", (void *)p);
+ }
+ *nextp = p;
+ return (0);
+}
+
+/*
+ * Hold the dsl_dir_t for "name" in *ddp, and possibly return the last
+ * component which couldn't be found in *tailp. The name must be in the
+ * specified dsl_pool_t. This thread must hold the dp_config_rwlock for the
+ * pool. Returns an error if the path is bogus, or if tailp == NULL and we
+ * couldn't parse the whole name. (*tailp)[0] == '@' means that the last
+ * component is a snapshot.
+ */
+int
+dsl_dir_hold(dsl_pool_t *dp, const char *name, void *tag,
+ dsl_dir_t **ddp, const char **tailp)
+{
+ char *buf;
+ const char *spaname, *next, *nextnext = NULL;
+ int err;
+ dsl_dir_t *dd;
+ uint64_t ddobj;
+
+ buf = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
+ err = getcomponent(name, buf, &next);
+ if (err != 0)
+ goto error;
+
+ /* Make sure the name is in the specified pool. */
+ spaname = spa_name(dp->dp_spa);
+ if (strcmp(buf, spaname) != 0) {
+ err = SET_ERROR(EXDEV);
+ goto error;
+ }
+
+ ASSERT(dsl_pool_config_held(dp));
+
+ err = dsl_dir_hold_obj(dp, dp->dp_root_dir_obj, NULL, tag, &dd);
+ if (err != 0) {
+ goto error;
+ }
+
+ while (next != NULL) {
+ dsl_dir_t *child_dd;
+ err = getcomponent(next, buf, &nextnext);
+ if (err != 0)
+ break;
+ ASSERT(next[0] != '\0');
+ if (next[0] == '@')
+ break;
+ dprintf("looking up %s in obj%lld\n",
+ buf, dsl_dir_phys(dd)->dd_child_dir_zapobj);
+
+ err = zap_lookup(dp->dp_meta_objset,
+ dsl_dir_phys(dd)->dd_child_dir_zapobj,
+ buf, sizeof (ddobj), 1, &ddobj);
+ if (err != 0) {
+ if (err == ENOENT)
+ err = 0;
+ break;
+ }
+
+ err = dsl_dir_hold_obj(dp, ddobj, buf, tag, &child_dd);
+ if (err != 0)
+ break;
+ dsl_dir_rele(dd, tag);
+ dd = child_dd;
+ next = nextnext;
+ }
+
+ if (err != 0) {
+ dsl_dir_rele(dd, tag);
+ goto error;
+ }
+
+ /*
+ * It's an error if there's more than one component left, or
+ * tailp==NULL and there's any component left.
+ */
+ if (next != NULL &&
+ (tailp == NULL || (nextnext && nextnext[0] != '\0'))) {
+ /* bad path name */
+ dsl_dir_rele(dd, tag);
+ dprintf("next=%p (%s) tail=%p\n", next, next?next:"", tailp);
+ err = SET_ERROR(ENOENT);
+ }
+ if (tailp != NULL)
+ *tailp = next;
+ if (err == 0)
+ *ddp = dd;
+error:
+ kmem_free(buf, ZFS_MAX_DATASET_NAME_LEN);
+ return (err);
+}
+
+/*
+ * If the counts are already initialized for this filesystem and its
+ * descendants then do nothing, otherwise initialize the counts.
+ *
+ * The counts on this filesystem, and those below, may be uninitialized due to
+ * either the use of a pre-existing pool which did not support the
+ * filesystem/snapshot limit feature, or one in which the feature had not yet
+ * been enabled.
+ *
+ * Recursively descend the filesystem tree and update the filesystem/snapshot
+ * counts on each filesystem below, then update the cumulative count on the
+ * current filesystem. If the filesystem already has a count set on it,
+ * then we know that its counts, and the counts on the filesystems below it,
+ * are already correct, so we don't have to update this filesystem.
+ */
+static void
+dsl_dir_init_fs_ss_count(dsl_dir_t *dd, dmu_tx_t *tx)
+{
+ uint64_t my_fs_cnt = 0;
+ uint64_t my_ss_cnt = 0;
+ dsl_pool_t *dp = dd->dd_pool;
+ objset_t *os = dp->dp_meta_objset;
+ zap_cursor_t *zc;
+ zap_attribute_t *za;
+ dsl_dataset_t *ds;
+
+ ASSERT(spa_feature_is_active(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT));
+ ASSERT(dsl_pool_config_held(dp));
+ ASSERT(dmu_tx_is_syncing(tx));
+
+ dsl_dir_zapify(dd, tx);
+
+ /*
+ * If the filesystem count has already been initialized then we
+ * don't need to recurse down any further.
+ */
+ if (zap_contains(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT) == 0)
+ return;
+
+ zc = kmem_alloc(sizeof (zap_cursor_t), KM_SLEEP);
+ za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
+
+ /* Iterate my child dirs */
+ for (zap_cursor_init(zc, os, dsl_dir_phys(dd)->dd_child_dir_zapobj);
+ zap_cursor_retrieve(zc, za) == 0; zap_cursor_advance(zc)) {
+ dsl_dir_t *chld_dd;
+ uint64_t count;
+
+ VERIFY0(dsl_dir_hold_obj(dp, za->za_first_integer, NULL, FTAG,
+ &chld_dd));
+
+ /*
+ * Ignore hidden ($FREE, $MOS & $ORIGIN) objsets.
+ */
+ if (chld_dd->dd_myname[0] == '$') {
+ dsl_dir_rele(chld_dd, FTAG);
+ continue;
+ }
+
+ my_fs_cnt++; /* count this child */
+
+ dsl_dir_init_fs_ss_count(chld_dd, tx);
+
+ VERIFY0(zap_lookup(os, chld_dd->dd_object,
+ DD_FIELD_FILESYSTEM_COUNT, sizeof (count), 1, &count));
+ my_fs_cnt += count;
+ VERIFY0(zap_lookup(os, chld_dd->dd_object,
+ DD_FIELD_SNAPSHOT_COUNT, sizeof (count), 1, &count));
+ my_ss_cnt += count;
+
+ dsl_dir_rele(chld_dd, FTAG);
+ }
+ zap_cursor_fini(zc);
+ /* Count my snapshots (we counted children's snapshots above) */
+ VERIFY0(dsl_dataset_hold_obj(dd->dd_pool,
+ dsl_dir_phys(dd)->dd_head_dataset_obj, FTAG, &ds));
+
+ for (zap_cursor_init(zc, os, dsl_dataset_phys(ds)->ds_snapnames_zapobj);
+ zap_cursor_retrieve(zc, za) == 0;
+ zap_cursor_advance(zc)) {
+ /* Don't count temporary snapshots */
+ if (za->za_name[0] != '%')
+ my_ss_cnt++;
+ }
+ zap_cursor_fini(zc);
+
+ dsl_dataset_rele(ds, FTAG);
+
+ kmem_free(zc, sizeof (zap_cursor_t));
+ kmem_free(za, sizeof (zap_attribute_t));
+
+ /* we're in a sync task, update counts */
+ dmu_buf_will_dirty(dd->dd_dbuf, tx);
+ VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT,
+ sizeof (my_fs_cnt), 1, &my_fs_cnt, tx));
+ VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_SNAPSHOT_COUNT,
+ sizeof (my_ss_cnt), 1, &my_ss_cnt, tx));
+}
+
+static int
+dsl_dir_actv_fs_ss_limit_check(void *arg, dmu_tx_t *tx)
+{
+ char *ddname = (char *)arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ dsl_dataset_t *ds;
+ dsl_dir_t *dd;
+ int error;
+
+ error = dsl_dataset_hold(dp, ddname, FTAG, &ds);
+ if (error != 0)
+ return (error);
+
+ if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT)) {
+ dsl_dataset_rele(ds, FTAG);
+ return (SET_ERROR(ENOTSUP));
+ }
+
+ dd = ds->ds_dir;
+ if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT) &&
+ dsl_dir_is_zapified(dd) &&
+ zap_contains(dp->dp_meta_objset, dd->dd_object,
+ DD_FIELD_FILESYSTEM_COUNT) == 0) {
+ dsl_dataset_rele(ds, FTAG);
+ return (SET_ERROR(EALREADY));
+ }
+
+ dsl_dataset_rele(ds, FTAG);
+ return (0);
+}
+
+static void
+dsl_dir_actv_fs_ss_limit_sync(void *arg, dmu_tx_t *tx)
+{
+ char *ddname = (char *)arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ dsl_dataset_t *ds;
+ spa_t *spa;
+
+ VERIFY0(dsl_dataset_hold(dp, ddname, FTAG, &ds));
+
+ spa = dsl_dataset_get_spa(ds);
+
+ if (!spa_feature_is_active(spa, SPA_FEATURE_FS_SS_LIMIT)) {
+ /*
+ * Since the feature was not active and we're now setting a
+ * limit, increment the feature-active counter so that the
+ * feature becomes active for the first time.
+ *
+ * We are already in a sync task so we can update the MOS.
+ */
+ spa_feature_incr(spa, SPA_FEATURE_FS_SS_LIMIT, tx);
+ }
+
+ /*
+ * Since we are now setting a non-UINT64_MAX limit on the filesystem,
+ * we need to ensure the counts are correct. Descend down the tree from
+ * this point and update all of the counts to be accurate.
+ */
+ dsl_dir_init_fs_ss_count(ds->ds_dir, tx);
+
+ dsl_dataset_rele(ds, FTAG);
+}
+
+/*
+ * Make sure the feature is enabled and activate it if necessary.
+ * Since we're setting a limit, ensure the on-disk counts are valid.
+ * This is only called by the ioctl path when setting a limit value.
+ *
+ * We do not need to validate the new limit, since users who can change the
+ * limit are also allowed to exceed the limit.
+ */
+int
+dsl_dir_activate_fs_ss_limit(const char *ddname)
+{
+ int error;
+
+ error = dsl_sync_task(ddname, dsl_dir_actv_fs_ss_limit_check,
+ dsl_dir_actv_fs_ss_limit_sync, (void *)ddname, 0,
+ ZFS_SPACE_CHECK_RESERVED);
+
+ if (error == EALREADY)
+ error = 0;
+
+ return (error);
+}
+
+/*
+ * Used to determine if the filesystem_limit or snapshot_limit should be
+ * enforced. We allow the limit to be exceeded if the user has permission to
+ * write the property value. We pass in the creds that we got in the open
+ * context since we will always be the GZ root in syncing context. We also have
+ * to handle the case where we are allowed to change the limit on the current
+ * dataset, but there may be another limit in the tree above.
+ *
+ * We can never modify these two properties within a non-global zone. In
+ * addition, the other checks are modeled on zfs_secpolicy_write_perms. We
+ * can't use that function since we are already holding the dp_config_rwlock.
+ * Finally, we already have the dd, and dealing with snapshots is simplified
+ * in this code.
+ */
+
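+/*
+ * ENFORCE_ALWAYS: enforce the limit at this level of the tree.
+ * ENFORCE_NEVER: the caller may change the limit, so no limit is enforced
+ * anywhere.
+ * ENFORCE_ABOVE: the caller may change the limit on this dataset, so only
+ * the limits of its ancestors are checked.
+ */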
+typedef enum {
+ ENFORCE_ALWAYS,
+ ENFORCE_NEVER,
+ ENFORCE_ABOVE
+} enforce_res_t;
+
+static enforce_res_t
+dsl_enforce_ds_ss_limits(dsl_dir_t *dd, zfs_prop_t prop,
+ cred_t *cr, proc_t *proc)
+{
+ enforce_res_t enforce = ENFORCE_ALWAYS;
+ uint64_t obj;
+ dsl_dataset_t *ds;
+ uint64_t zoned;
+ const char *zonedstr;
+
+ ASSERT(prop == ZFS_PROP_FILESYSTEM_LIMIT ||
+ prop == ZFS_PROP_SNAPSHOT_LIMIT);
+
+#ifdef _KERNEL
+ if (crgetzoneid(cr) != GLOBAL_ZONEID)
+ return (ENFORCE_ALWAYS);
+
+ /*
+ * We are checking the saved credentials of the user process, which is
+ * not the current process. Note that we can't use secpolicy_zfs(),
+ * because it only works if the cred is that of the current process (on
+ * Linux).
+ */
+ if (secpolicy_zfs_proc(cr, proc) == 0)
+ return (ENFORCE_NEVER);
+#endif
+
+ if ((obj = dsl_dir_phys(dd)->dd_head_dataset_obj) == 0)
+ return (ENFORCE_ALWAYS);
+
+ ASSERT(dsl_pool_config_held(dd->dd_pool));
+
+ if (dsl_dataset_hold_obj(dd->dd_pool, obj, FTAG, &ds) != 0)
+ return (ENFORCE_ALWAYS);
+
+ zonedstr = zfs_prop_to_name(ZFS_PROP_ZONED);
+ if (dsl_prop_get_ds(ds, zonedstr, 8, 1, &zoned, NULL) || zoned) {
+ /* Only root can access zoned fs's from the GZ */
+ enforce = ENFORCE_ALWAYS;
+ } else {
+ if (dsl_deleg_access_impl(ds, zfs_prop_to_name(prop), cr) == 0)
+ enforce = ENFORCE_ABOVE;
+ }
+
+ dsl_dataset_rele(ds, FTAG);
+ return (enforce);
+}
+
+/*
+ * Check if adding additional child filesystem(s) would exceed any filesystem
+ * limits or adding additional snapshot(s) would exceed any snapshot limits.
+ * The prop argument indicates which limit to check.
+ *
+ * Note that all filesystem limits up to the root (or the highest
+ * initialized) filesystem or the given ancestor must be satisfied.
+ */
+int
+dsl_fs_ss_limit_check(dsl_dir_t *dd, uint64_t delta, zfs_prop_t prop,
+ dsl_dir_t *ancestor, cred_t *cr, proc_t *proc)
+{
+ objset_t *os = dd->dd_pool->dp_meta_objset;
+ uint64_t limit, count;
+ char *count_prop;
+ enforce_res_t enforce;
+ int err = 0;
+
+ ASSERT(dsl_pool_config_held(dd->dd_pool));
+ ASSERT(prop == ZFS_PROP_FILESYSTEM_LIMIT ||
+ prop == ZFS_PROP_SNAPSHOT_LIMIT);
+
+ /*
+ * If we're allowed to change the limit, don't enforce the limit; e.g.,
+ * this can happen if a snapshot is taken by an administrative
+ * user in the global zone (i.e. a recursive snapshot by root).
+ * However, we must handle the case of delegated permissions where we
+ * are allowed to change the limit on the current dataset, but there
+ * is another limit in the tree above.
+ */
+ enforce = dsl_enforce_ds_ss_limits(dd, prop, cr, proc);
+ if (enforce == ENFORCE_NEVER)
+ return (0);
+
+ /*
+ * e.g. if renaming a dataset with no snapshots, count adjustment
+ * is 0.
+ */
+ if (delta == 0)
+ return (0);
+
+ if (prop == ZFS_PROP_SNAPSHOT_LIMIT) {
+ /*
+ * We don't enforce the limit for temporary snapshots. This is
+ * indicated by a NULL cred_t argument.
+ */
+ if (cr == NULL)
+ return (0);
+
+ count_prop = DD_FIELD_SNAPSHOT_COUNT;
+ } else {
+ count_prop = DD_FIELD_FILESYSTEM_COUNT;
+ }
+
+ /*
+ * If an ancestor has been provided, stop checking the limit once we
+ * hit that dir. We need this during rename so that we don't overcount
+ * the check once we recurse up to the common ancestor.
+ */
+ if (ancestor == dd)
+ return (0);
+
+ /*
+ * If we hit an uninitialized node while recursing up the tree, we can
+ * stop since we know there is no limit here (or above). The counts are
+ * not valid on this node and we know we won't touch this node's counts.
+ */
+ if (!dsl_dir_is_zapified(dd))
+ return (0);
+ err = zap_lookup(os, dd->dd_object,
+ count_prop, sizeof (count), 1, &count);
+ if (err == ENOENT)
+ return (0);
+ if (err != 0)
+ return (err);
+
+ err = dsl_prop_get_dd(dd, zfs_prop_to_name(prop), 8, 1, &limit, NULL,
+ B_FALSE);
+ if (err != 0)
+ return (err);
+
+ /* Is there a limit which we've hit? */
+ if (enforce == ENFORCE_ALWAYS && (count + delta) > limit)
+ return (SET_ERROR(EDQUOT));
+
+ if (dd->dd_parent != NULL)
+ err = dsl_fs_ss_limit_check(dd->dd_parent, delta, prop,
+ ancestor, cr, proc);
+
+ return (err);
+}
+
+/*
+ * Adjust the filesystem or snapshot count for the specified dsl_dir_t and all
+ * parents. When a new filesystem/snapshot is created, increment the count on
+ * all parents, and when a filesystem/snapshot is destroyed, decrement the
+ * count.
+ */
+void
+dsl_fs_ss_count_adjust(dsl_dir_t *dd, int64_t delta, const char *prop,
+ dmu_tx_t *tx)
+{
+ int err;
+ objset_t *os = dd->dd_pool->dp_meta_objset;
+ uint64_t count;
+
+ ASSERT(dsl_pool_config_held(dd->dd_pool));
+ ASSERT(dmu_tx_is_syncing(tx));
+ ASSERT(strcmp(prop, DD_FIELD_FILESYSTEM_COUNT) == 0 ||
+ strcmp(prop, DD_FIELD_SNAPSHOT_COUNT) == 0);
+
+ /*
+ * We don't do accounting for hidden ($FREE, $MOS & $ORIGIN) objsets.
+ */
+ if (dd->dd_myname[0] == '$' && strcmp(prop,
+ DD_FIELD_FILESYSTEM_COUNT) == 0) {
+ return;
+ }
+
+ /*
+ * e.g. if renaming a dataset with no snapshots, count adjustment is 0
+ */
+ if (delta == 0)
+ return;
+
+ /*
+ * If we hit an uninitialized node while recursing up the tree, we can
+ * stop since we know the counts are not valid on this node and we
+ * know we shouldn't touch this node's counts. An uninitialized count
+ * on the node indicates that either the feature has not yet been
+ * activated or there are no limits on this part of the tree.
+ */
+ if (!dsl_dir_is_zapified(dd) || (err = zap_lookup(os, dd->dd_object,
+ prop, sizeof (count), 1, &count)) == ENOENT)
+ return;
+ VERIFY0(err);
+
+ count += delta;
+ /* Use a signed verify to make sure we're not neg. */
+ VERIFY3S(count, >=, 0);
+
+ VERIFY0(zap_update(os, dd->dd_object, prop, sizeof (count), 1, &count,
+ tx));
+
+ /* Roll up this additional count into our ancestors */
+ if (dd->dd_parent != NULL)
+ dsl_fs_ss_count_adjust(dd->dd_parent, delta, prop, tx);
+}
+
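+/*
+ * Allocate a new DSL directory object in the MOS, link it into the parent's
+ * child-dir ZAP (or record it as the pool's root, DMU_POOL_ROOT_DATASET,
+ * when pds is NULL), and return its object number.
+ */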
+uint64_t
+dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds, const char *name,
+ dmu_tx_t *tx)
+{
+ objset_t *mos = dp->dp_meta_objset;
+ uint64_t ddobj;
+ dsl_dir_phys_t *ddphys;
+ dmu_buf_t *dbuf;
+
+ ddobj = dmu_object_alloc(mos, DMU_OT_DSL_DIR, 0,
+ DMU_OT_DSL_DIR, sizeof (dsl_dir_phys_t), tx);
+ if (pds) {
+ VERIFY0(zap_add(mos, dsl_dir_phys(pds)->dd_child_dir_zapobj,
+ name, sizeof (uint64_t), 1, &ddobj, tx));
+ } else {
+ /* it's the root dir */
+ VERIFY0(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1, &ddobj, tx));
+ }
+ VERIFY0(dmu_bonus_hold(mos, ddobj, FTAG, &dbuf));
+ dmu_buf_will_dirty(dbuf, tx);
+ ddphys = dbuf->db_data;
+
+ ddphys->dd_creation_time = gethrestime_sec();
+ if (pds) {
+ ddphys->dd_parent_obj = pds->dd_object;
+
+ /* update the filesystem counts */
+ dsl_fs_ss_count_adjust(pds, 1, DD_FIELD_FILESYSTEM_COUNT, tx);
+ }
+ ddphys->dd_props_zapobj = zap_create(mos,
+ DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx);
+ ddphys->dd_child_dir_zapobj = zap_create(mos,
+ DMU_OT_DSL_DIR_CHILD_MAP, DMU_OT_NONE, 0, tx);
+ if (spa_version(dp->dp_spa) >= SPA_VERSION_USED_BREAKDOWN)
+ ddphys->dd_flags |= DD_FLAG_USED_BREAKDOWN;
+
+ dmu_buf_rele(dbuf, FTAG);
+
+ return (ddobj);
+}
+
+boolean_t
+dsl_dir_is_clone(dsl_dir_t *dd)
+{
+ return (dsl_dir_phys(dd)->dd_origin_obj &&
+ (dd->dd_pool->dp_origin_snap == NULL ||
+ dsl_dir_phys(dd)->dd_origin_obj !=
+ dd->dd_pool->dp_origin_snap->ds_object));
+}
+
+uint64_t
+dsl_dir_get_used(dsl_dir_t *dd)
+{
+ return (dsl_dir_phys(dd)->dd_used_bytes);
+}
+
+uint64_t
+dsl_dir_get_compressed(dsl_dir_t *dd)
+{
+ return (dsl_dir_phys(dd)->dd_compressed_bytes);
+}
+
+uint64_t
+dsl_dir_get_quota(dsl_dir_t *dd)
+{
+ return (dsl_dir_phys(dd)->dd_quota);
+}
+
+uint64_t
+dsl_dir_get_reservation(dsl_dir_t *dd)
+{
+ return (dsl_dir_phys(dd)->dd_reserved);
+}
+
+uint64_t
+dsl_dir_get_compressratio(dsl_dir_t *dd)
+{
+ /* a fixed-point number, 100x the ratio; e.g. 250 means 2.50x */
+ return (dsl_dir_phys(dd)->dd_compressed_bytes == 0 ? 100 :
+ (dsl_dir_phys(dd)->dd_uncompressed_bytes * 100 /
+ dsl_dir_phys(dd)->dd_compressed_bytes));
+}
+
+uint64_t
+dsl_dir_get_logicalused(dsl_dir_t *dd)
+{
+ return (dsl_dir_phys(dd)->dd_uncompressed_bytes);
+}
+
+uint64_t
+dsl_dir_get_usedsnap(dsl_dir_t *dd)
+{
+ return (dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_SNAP]);
+}
+
+uint64_t
+dsl_dir_get_usedds(dsl_dir_t *dd)
+{
+ return (dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_HEAD]);
+}
+
+uint64_t
+dsl_dir_get_usedrefreserv(dsl_dir_t *dd)
+{
+ return (dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_REFRSRV]);
+}
+
+uint64_t
+dsl_dir_get_usedchild(dsl_dir_t *dd)
+{
+ return (dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_CHILD] +
+ dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_CHILD_RSRV]);
+}
+
+void
+dsl_dir_get_origin(dsl_dir_t *dd, char *buf)
+{
+ dsl_dataset_t *ds;
+ VERIFY0(dsl_dataset_hold_obj(dd->dd_pool,
+ dsl_dir_phys(dd)->dd_origin_obj, FTAG, &ds));
+
+ dsl_dataset_name(ds, buf);
+
+ dsl_dataset_rele(ds, FTAG);
+}
+
+int
+dsl_dir_get_filesystem_count(dsl_dir_t *dd, uint64_t *count)
+{
+ if (dsl_dir_is_zapified(dd)) {
+ objset_t *os = dd->dd_pool->dp_meta_objset;
+ return (zap_lookup(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT,
+ sizeof (*count), 1, count));
+ } else {
+ return (SET_ERROR(ENOENT));
+ }
+}
+
+int
+dsl_dir_get_snapshot_count(dsl_dir_t *dd, uint64_t *count)
+{
+ if (dsl_dir_is_zapified(dd)) {
+ objset_t *os = dd->dd_pool->dp_meta_objset;
+ return (zap_lookup(os, dd->dd_object, DD_FIELD_SNAPSHOT_COUNT,
+ sizeof (*count), 1, count));
+ } else {
+ return (SET_ERROR(ENOENT));
+ }
+}
+
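+/*
+ * Add this dir's space accounting (quota, reservation, logicalused and the
+ * used-space breakdown), its filesystem/snapshot counts, and its clone
+ * origin (if any) to the given property nvlist.
+ */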
+void
+dsl_dir_stats(dsl_dir_t *dd, nvlist_t *nv)
+{
+ mutex_enter(&dd->dd_lock);
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_QUOTA,
+ dsl_dir_get_quota(dd));
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_RESERVATION,
+ dsl_dir_get_reservation(dd));
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_LOGICALUSED,
+ dsl_dir_get_logicalused(dd));
+ if (dsl_dir_phys(dd)->dd_flags & DD_FLAG_USED_BREAKDOWN) {
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDSNAP,
+ dsl_dir_get_usedsnap(dd));
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDDS,
+ dsl_dir_get_usedds(dd));
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDREFRESERV,
+ dsl_dir_get_usedrefreserv(dd));
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDCHILD,
+ dsl_dir_get_usedchild(dd));
+ }
+ mutex_exit(&dd->dd_lock);
+
+ uint64_t count;
+ if (dsl_dir_get_filesystem_count(dd, &count) == 0) {
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_FILESYSTEM_COUNT,
+ count);
+ }
+ if (dsl_dir_get_snapshot_count(dd, &count) == 0) {
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_SNAPSHOT_COUNT,
+ count);
+ }
+
+ if (dsl_dir_is_clone(dd)) {
+ char buf[ZFS_MAX_DATASET_NAME_LEN];
+ dsl_dir_get_origin(dd, buf);
+ dsl_prop_nvlist_add_string(nv, ZFS_PROP_ORIGIN, buf);
+ }
+}
+
+void
+dsl_dir_dirty(dsl_dir_t *dd, dmu_tx_t *tx)
+{
+ dsl_pool_t *dp = dd->dd_pool;
+
+ ASSERT(dsl_dir_phys(dd));
+
+ if (txg_list_add(&dp->dp_dirty_dirs, dd, tx->tx_txg)) {
+ /* up the hold count until we can be written out */
+ dmu_buf_add_ref(dd->dd_dbuf, dd);
+ }
+}
+
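+/*
+ * Change in the space accounted for this dir, MAX(used, dd_reserved), if
+ * "delta" were applied to "used". For example, with used = 10M,
+ * dd_reserved = 30M and delta = +5M, both MAX() terms remain at the 30M
+ * reservation floor and the result is 0; with used = 40M the same delta
+ * yields 5M.
+ */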
+static int64_t
+parent_delta(dsl_dir_t *dd, uint64_t used, int64_t delta)
+{
+ uint64_t old_accounted = MAX(used, dsl_dir_phys(dd)->dd_reserved);
+ uint64_t new_accounted =
+ MAX(used + delta, dsl_dir_phys(dd)->dd_reserved);
+ return (new_accounted - old_accounted);
+}
+
+void
+dsl_dir_sync(dsl_dir_t *dd, dmu_tx_t *tx)
+{
+ ASSERT(dmu_tx_is_syncing(tx));
+
+ mutex_enter(&dd->dd_lock);
+ ASSERT0(dd->dd_tempreserved[tx->tx_txg & TXG_MASK]);
+ dprintf_dd(dd, "txg=%llu towrite=%lluK\n", tx->tx_txg,
+ dd->dd_space_towrite[tx->tx_txg & TXG_MASK] / 1024);
+ dd->dd_space_towrite[tx->tx_txg & TXG_MASK] = 0;
+ mutex_exit(&dd->dd_lock);
+
+ /* release the hold from dsl_dir_dirty */
+ dmu_buf_rele(dd->dd_dbuf, dd);
+}
+
+static uint64_t
+dsl_dir_space_towrite(dsl_dir_t *dd)
+{
+ uint64_t space = 0;
+
+ ASSERT(MUTEX_HELD(&dd->dd_lock));
+
+ for (int i = 0; i < TXG_SIZE; i++) {
+ space += dd->dd_space_towrite[i & TXG_MASK];
+ ASSERT3U(dd->dd_space_towrite[i & TXG_MASK], >=, 0);
+ }
+ return (space);
+}
+
+/*
+ * How much space would dd have available if ancestor had delta applied
+ * to it? If ondiskonly is set, we're only interested in what's
+ * on-disk, not estimated pending changes.
+ */
+uint64_t
+dsl_dir_space_available(dsl_dir_t *dd,
+ dsl_dir_t *ancestor, int64_t delta, int ondiskonly)
+{
+ uint64_t parentspace, myspace, quota, used;
+
+ /*
+ * If there are no restrictions otherwise, assume we have
+ * unlimited space available.
+ */
+ quota = UINT64_MAX;
+ parentspace = UINT64_MAX;
+
+ if (dd->dd_parent != NULL) {
+ parentspace = dsl_dir_space_available(dd->dd_parent,
+ ancestor, delta, ondiskonly);
+ }
+
+ mutex_enter(&dd->dd_lock);
+ if (dsl_dir_phys(dd)->dd_quota != 0)
+ quota = dsl_dir_phys(dd)->dd_quota;
+ used = dsl_dir_phys(dd)->dd_used_bytes;
+ if (!ondiskonly)
+ used += dsl_dir_space_towrite(dd);
+
+ if (dd->dd_parent == NULL) {
+ uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool,
+ ZFS_SPACE_CHECK_NORMAL);
+ quota = MIN(quota, poolsize);
+ }
+
+ if (dsl_dir_phys(dd)->dd_reserved > used && parentspace != UINT64_MAX) {
+ /*
+ * We have some space reserved, in addition to what our
+ * parent gave us.
+ */
+ parentspace += dsl_dir_phys(dd)->dd_reserved - used;
+ }
+
+ if (dd == ancestor) {
+ ASSERT(delta <= 0);
+ ASSERT(used >= -delta);
+ used += delta;
+ if (parentspace != UINT64_MAX)
+ parentspace -= delta;
+ }
+
+ if (used > quota) {
+ /* over quota */
+ myspace = 0;
+ } else {
+ /*
+ * the lesser of the space provided by our parent and
+ * the space left in our quota
+ */
+ myspace = MIN(parentspace, quota - used);
+ }
+
+ mutex_exit(&dd->dd_lock);
+
+ return (myspace);
+}
+
+struct tempreserve {
+ list_node_t tr_node;
+ dsl_dir_t *tr_ds;
+ uint64_t tr_size;
+};
+
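+/*
+ * Reserve "asize" bytes against this dir's quota for the current txg, then
+ * walk up the tree (iteratively, to bound stack usage) reserving whatever
+ * portion of that space is visible to each ancestor. Each reservation taken
+ * is appended to tr_list so that a failure partway up can be unwound by
+ * dsl_dir_tempreserve_clear().
+ */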
+static int
+dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree,
+ boolean_t ignorequota, list_t *tr_list,
+ dmu_tx_t *tx, boolean_t first)
+{
+ uint64_t txg;
+ uint64_t quota;
+ struct tempreserve *tr;
+ int retval;
+ uint64_t ref_rsrv;
+
+top_of_function:
+ txg = tx->tx_txg;
+ retval = EDQUOT;
+ ref_rsrv = 0;
+
+ ASSERT3U(txg, !=, 0);
+ ASSERT3S(asize, >, 0);
+
+ mutex_enter(&dd->dd_lock);
+
+ /*
+ * Check against the dsl_dir's quota. We don't add in the delta
+ * when checking for over-quota because they get one free hit.
+ */
+ uint64_t est_inflight = dsl_dir_space_towrite(dd);
+ for (int i = 0; i < TXG_SIZE; i++)
+ est_inflight += dd->dd_tempreserved[i];
+ uint64_t used_on_disk = dsl_dir_phys(dd)->dd_used_bytes;
+
+ /*
+ * On the first iteration, fetch the dataset's used-on-disk and
+ * refreservation values. Also, if checkrefquota is set, test if
+ * allocating this space would exceed the dataset's refquota.
+ */
+ if (first && tx->tx_objset) {
+ int error;
+ dsl_dataset_t *ds = tx->tx_objset->os_dsl_dataset;
+
+ error = dsl_dataset_check_quota(ds, !netfree,
+ asize, est_inflight, &used_on_disk, &ref_rsrv);
+ if (error != 0) {
+ mutex_exit(&dd->dd_lock);
+ DMU_TX_STAT_BUMP(dmu_tx_quota);
+ return (error);
+ }
+ }
+
+ /*
+ * If this transaction will result in a net free of space,
+ * we want to let it through.
+ */
+ if (ignorequota || netfree || dsl_dir_phys(dd)->dd_quota == 0)
+ quota = UINT64_MAX;
+ else
+ quota = dsl_dir_phys(dd)->dd_quota;
+
+ /*
+ * Adjust the quota against the actual pool size at the root
+ * minus any outstanding deferred frees.
+ * To ensure that it's possible to remove files from a full
+ * pool without inducing transient overcommits, we throttle
+ * netfree transactions against a quota that is slightly larger,
+ * but still within the pool's allocation slop. In cases where
+ * we're very close to full, this will allow a steady trickle of
+ * removes to get through.
+ */
+ uint64_t deferred = 0;
+ if (dd->dd_parent == NULL) {
+ uint64_t avail = dsl_pool_unreserved_space(dd->dd_pool,
+ (netfree) ?
+ ZFS_SPACE_CHECK_RESERVED : ZFS_SPACE_CHECK_NORMAL);
+
+ if (avail < quota) {
+ quota = avail;
+ retval = SET_ERROR(ENOSPC);
+ }
+ }
+
+ /*
+ * If they are requesting more space, and our current estimate
+ * is over quota, they get to try again unless the actual
+ * on-disk is over quota and there are no pending changes (which
+ * may free up space for us).
+ */
+ if (used_on_disk + est_inflight >= quota) {
+ if (est_inflight > 0 || used_on_disk < quota ||
+ (retval == ENOSPC && used_on_disk < quota + deferred))
+ retval = ERESTART;
+ dprintf_dd(dd, "failing: used=%lluK inflight = %lluK "
+ "quota=%lluK tr=%lluK err=%d\n",
+ used_on_disk>>10, est_inflight>>10,
+ quota>>10, asize>>10, retval);
+ mutex_exit(&dd->dd_lock);
+ DMU_TX_STAT_BUMP(dmu_tx_quota);
+ return (SET_ERROR(retval));
+ }
+
+ /* We need to up our estimated delta before dropping dd_lock */
+ dd->dd_tempreserved[txg & TXG_MASK] += asize;
+
+ uint64_t parent_rsrv = parent_delta(dd, used_on_disk + est_inflight,
+ asize - ref_rsrv);
+ mutex_exit(&dd->dd_lock);
+
+ tr = kmem_zalloc(sizeof (struct tempreserve), KM_SLEEP);
+ tr->tr_ds = dd;
+ tr->tr_size = asize;
+ list_insert_tail(tr_list, tr);
+
+ /* see if it's OK with our parent */
+ if (dd->dd_parent != NULL && parent_rsrv != 0) {
+ /*
+		 * Recurse on our parent without actual recursion (loop back to
+		 * top_of_function instead). Recursive calls here have been
+		 * observed to use a large amount of stack, even within the
+		 * test suite; the largest stack seen was 7632 bytes on Linux.
+ */
+
+ dd = dd->dd_parent;
+ asize = parent_rsrv;
+ ignorequota = (dsl_dir_phys(dd)->dd_head_dataset_obj == 0);
+ first = B_FALSE;
+ goto top_of_function;
+
+ } else {
+ return (0);
+ }
+}
+
+/*
+ * Reserve space in this dsl_dir, to be used in this tx's txg.
+ * After the space has been dirtied (and dsl_dir_willuse_space()
+ * has been called), the reservation should be canceled, using
+ * dsl_dir_tempreserve_clear().
+ */
+int
+dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize, uint64_t asize,
+ boolean_t netfree, void **tr_cookiep, dmu_tx_t *tx)
+{
+ int err;
+ list_t *tr_list;
+
+ if (asize == 0) {
+ *tr_cookiep = NULL;
+ return (0);
+ }
+
+ tr_list = kmem_alloc(sizeof (list_t), KM_SLEEP);
+ list_create(tr_list, sizeof (struct tempreserve),
+ offsetof(struct tempreserve, tr_node));
+ ASSERT3S(asize, >, 0);
+
+ err = arc_tempreserve_space(dd->dd_pool->dp_spa, lsize, tx->tx_txg);
+ if (err == 0) {
+ struct tempreserve *tr;
+
+ tr = kmem_zalloc(sizeof (struct tempreserve), KM_SLEEP);
+ tr->tr_size = lsize;
+ list_insert_tail(tr_list, tr);
+ } else {
+ if (err == EAGAIN) {
+ /*
+ * If arc_memory_throttle() detected that pageout
+ * is running and we are low on memory, we delay new
+ * non-pageout transactions to give pageout an
+ * advantage.
+ *
+ * It is unfortunate to be delaying while the caller's
+ * locks are held.
+ */
+ txg_delay(dd->dd_pool, tx->tx_txg,
+ MSEC2NSEC(10), MSEC2NSEC(10));
+ err = SET_ERROR(ERESTART);
+ }
+ }
+
+ if (err == 0) {
+ err = dsl_dir_tempreserve_impl(dd, asize, netfree,
+ B_FALSE, tr_list, tx, B_TRUE);
+ }
+
+ if (err != 0)
+ dsl_dir_tempreserve_clear(tr_list, tx);
+ else
+ *tr_cookiep = tr_list;
+
+ return (err);
+}
+
+/*
+ * Clear a temporary reservation that we previously made with
+ * dsl_dir_tempreserve_space().
+ */
+void
+dsl_dir_tempreserve_clear(void *tr_cookie, dmu_tx_t *tx)
+{
+ int txgidx = tx->tx_txg & TXG_MASK;
+ list_t *tr_list = tr_cookie;
+ struct tempreserve *tr;
+
+ ASSERT3U(tx->tx_txg, !=, 0);
+
+ if (tr_cookie == NULL)
+ return;
+
+ while ((tr = list_head(tr_list)) != NULL) {
+ if (tr->tr_ds) {
+ mutex_enter(&tr->tr_ds->dd_lock);
+ ASSERT3U(tr->tr_ds->dd_tempreserved[txgidx], >=,
+ tr->tr_size);
+ tr->tr_ds->dd_tempreserved[txgidx] -= tr->tr_size;
+ mutex_exit(&tr->tr_ds->dd_lock);
+ } else {
+ arc_tempreserve_clear(tr->tr_size);
+ }
+ list_remove(tr_list, tr);
+ kmem_free(tr, sizeof (struct tempreserve));
+ }
+
+ kmem_free(tr_list, sizeof (list_t));
+}
+
+/*
+ * This should be called from open context when we think we're going to write
+ * or free space, for example when dirtying data. Be conservative; it's okay
+ * to write less space or free more, but we don't want to write more or free
+ * less than the amount specified.
+ *
+ * NOTE: The behavior of this function is identical to the Illumos / FreeBSD
+ * version; however, it has been adjusted to use an iterative rather than
+ * recursive algorithm to minimize stack usage.
+ */
+void
+dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx)
+{
+ int64_t parent_space;
+ uint64_t est_used;
+
+ do {
+ mutex_enter(&dd->dd_lock);
+ if (space > 0)
+ dd->dd_space_towrite[tx->tx_txg & TXG_MASK] += space;
+
+ est_used = dsl_dir_space_towrite(dd) +
+ dsl_dir_phys(dd)->dd_used_bytes;
+ parent_space = parent_delta(dd, est_used, space);
+ mutex_exit(&dd->dd_lock);
+
+ /* Make sure that we clean up dd_space_to* */
+ dsl_dir_dirty(dd, tx);
+
+ dd = dd->dd_parent;
+ space = parent_space;
+ } while (space && dd);
+}
+
+/* call from syncing context when we actually write/free space for this dd */
+void
+dsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type,
+ int64_t used, int64_t compressed, int64_t uncompressed, dmu_tx_t *tx)
+{
+ int64_t accounted_delta;
+
+ /*
+ * dsl_dataset_set_refreservation_sync_impl() calls this with
+ * dd_lock held, so that it can atomically update
+ * ds->ds_reserved and the dsl_dir accounting, so that
+ * dsl_dataset_check_quota() can see dataset and dir accounting
+ * consistently.
+ */
+ boolean_t needlock = !MUTEX_HELD(&dd->dd_lock);
+
+ ASSERT(dmu_tx_is_syncing(tx));
+ ASSERT(type < DD_USED_NUM);
+
+ dmu_buf_will_dirty(dd->dd_dbuf, tx);
+
+ if (needlock)
+ mutex_enter(&dd->dd_lock);
+ accounted_delta =
+ parent_delta(dd, dsl_dir_phys(dd)->dd_used_bytes, used);
+ ASSERT(used >= 0 || dsl_dir_phys(dd)->dd_used_bytes >= -used);
+ ASSERT(compressed >= 0 ||
+ dsl_dir_phys(dd)->dd_compressed_bytes >= -compressed);
+ ASSERT(uncompressed >= 0 ||
+ dsl_dir_phys(dd)->dd_uncompressed_bytes >= -uncompressed);
+ dsl_dir_phys(dd)->dd_used_bytes += used;
+ dsl_dir_phys(dd)->dd_uncompressed_bytes += uncompressed;
+ dsl_dir_phys(dd)->dd_compressed_bytes += compressed;
+
+ if (dsl_dir_phys(dd)->dd_flags & DD_FLAG_USED_BREAKDOWN) {
+ ASSERT(used > 0 ||
+ dsl_dir_phys(dd)->dd_used_breakdown[type] >= -used);
+ dsl_dir_phys(dd)->dd_used_breakdown[type] += used;
+#ifdef ZFS_DEBUG
+ {
+ dd_used_t t;
+ uint64_t u = 0;
+ for (t = 0; t < DD_USED_NUM; t++)
+ u += dsl_dir_phys(dd)->dd_used_breakdown[t];
+ ASSERT3U(u, ==, dsl_dir_phys(dd)->dd_used_bytes);
+ }
+#endif
+ }
+ if (needlock)
+ mutex_exit(&dd->dd_lock);
+
+ if (dd->dd_parent != NULL) {
+ dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD,
+ accounted_delta, compressed, uncompressed, tx);
+ dsl_dir_transfer_space(dd->dd_parent,
+ used - accounted_delta,
+ DD_USED_CHILD_RSRV, DD_USED_CHILD, tx);
+ }
+}
+
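+/*
+ * Move "delta" bytes of the used-space breakdown from oldtype to newtype,
+ * e.g. from DD_USED_CHILD_RSRV to DD_USED_CHILD as a child consumes its
+ * reservation; the total dd_used_bytes is unchanged.
+ */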
+void
+dsl_dir_transfer_space(dsl_dir_t *dd, int64_t delta,
+ dd_used_t oldtype, dd_used_t newtype, dmu_tx_t *tx)
+{
+ ASSERT(dmu_tx_is_syncing(tx));
+ ASSERT(oldtype < DD_USED_NUM);
+ ASSERT(newtype < DD_USED_NUM);
+
+ if (delta == 0 ||
+ !(dsl_dir_phys(dd)->dd_flags & DD_FLAG_USED_BREAKDOWN))
+ return;
+
+ dmu_buf_will_dirty(dd->dd_dbuf, tx);
+ mutex_enter(&dd->dd_lock);
+ ASSERT(delta > 0 ?
+ dsl_dir_phys(dd)->dd_used_breakdown[oldtype] >= delta :
+ dsl_dir_phys(dd)->dd_used_breakdown[newtype] >= -delta);
+ ASSERT(dsl_dir_phys(dd)->dd_used_bytes >= ABS(delta));
+ dsl_dir_phys(dd)->dd_used_breakdown[oldtype] -= delta;
+ dsl_dir_phys(dd)->dd_used_breakdown[newtype] += delta;
+ mutex_exit(&dd->dd_lock);
+}
+
+typedef struct dsl_dir_set_qr_arg {
+ const char *ddsqra_name;
+ zprop_source_t ddsqra_source;
+ uint64_t ddsqra_value;
+} dsl_dir_set_qr_arg_t;
+
+static int
+dsl_dir_set_quota_check(void *arg, dmu_tx_t *tx)
+{
+ dsl_dir_set_qr_arg_t *ddsqra = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ dsl_dataset_t *ds;
+ int error;
+ uint64_t towrite, newval;
+
+ error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds);
+ if (error != 0)
+ return (error);
+
+ error = dsl_prop_predict(ds->ds_dir, "quota",
+ ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval);
+ if (error != 0) {
+ dsl_dataset_rele(ds, FTAG);
+ return (error);
+ }
+
+ if (newval == 0) {
+ dsl_dataset_rele(ds, FTAG);
+ return (0);
+ }
+
+ mutex_enter(&ds->ds_dir->dd_lock);
+ /*
+ * If we are doing the preliminary check in open context, and
+ * there are pending changes, then don't fail it, since the
+ * pending changes could under-estimate the amount of space to be
+ * freed up.
+ */
+ towrite = dsl_dir_space_towrite(ds->ds_dir);
+ if ((dmu_tx_is_syncing(tx) || towrite == 0) &&
+ (newval < dsl_dir_phys(ds->ds_dir)->dd_reserved ||
+ newval < dsl_dir_phys(ds->ds_dir)->dd_used_bytes + towrite)) {
+ error = SET_ERROR(ENOSPC);
+ }
+ mutex_exit(&ds->ds_dir->dd_lock);
+ dsl_dataset_rele(ds, FTAG);
+ return (error);
+}
+
+static void
+dsl_dir_set_quota_sync(void *arg, dmu_tx_t *tx)
+{
+ dsl_dir_set_qr_arg_t *ddsqra = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ dsl_dataset_t *ds;
+ uint64_t newval;
+
+ VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds));
+
+ if (spa_version(dp->dp_spa) >= SPA_VERSION_RECVD_PROPS) {
+ dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_QUOTA),
+ ddsqra->ddsqra_source, sizeof (ddsqra->ddsqra_value), 1,
+ &ddsqra->ddsqra_value, tx);
+
+ VERIFY0(dsl_prop_get_int_ds(ds,
+ zfs_prop_to_name(ZFS_PROP_QUOTA), &newval));
+ } else {
+ newval = ddsqra->ddsqra_value;
+ spa_history_log_internal_ds(ds, "set", tx, "%s=%lld",
+ zfs_prop_to_name(ZFS_PROP_QUOTA), (longlong_t)newval);
+ }
+
+ dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx);
+ mutex_enter(&ds->ds_dir->dd_lock);
+ dsl_dir_phys(ds->ds_dir)->dd_quota = newval;
+ mutex_exit(&ds->ds_dir->dd_lock);
+ dsl_dataset_rele(ds, FTAG);
+}
+
+int
+dsl_dir_set_quota(const char *ddname, zprop_source_t source, uint64_t quota)
+{
+ dsl_dir_set_qr_arg_t ddsqra;
+
+ ddsqra.ddsqra_name = ddname;
+ ddsqra.ddsqra_source = source;
+ ddsqra.ddsqra_value = quota;
+
+ return (dsl_sync_task(ddname, dsl_dir_set_quota_check,
+ dsl_dir_set_quota_sync, &ddsqra, 0,
+ ZFS_SPACE_CHECK_EXTRA_RESERVED));
+}
+
+static int
+dsl_dir_set_reservation_check(void *arg, dmu_tx_t *tx)
+{
+ dsl_dir_set_qr_arg_t *ddsqra = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ dsl_dataset_t *ds;
+ dsl_dir_t *dd;
+ uint64_t newval, used, avail;
+ int error;
+
+ error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds);
+ if (error != 0)
+ return (error);
+ dd = ds->ds_dir;
+
+ /*
+ * If we are doing the preliminary check in open context, the
+ * space estimates may be inaccurate.
+ */
+ if (!dmu_tx_is_syncing(tx)) {
+ dsl_dataset_rele(ds, FTAG);
+ return (0);
+ }
+
+ error = dsl_prop_predict(ds->ds_dir,
+ zfs_prop_to_name(ZFS_PROP_RESERVATION),
+ ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval);
+ if (error != 0) {
+ dsl_dataset_rele(ds, FTAG);
+ return (error);
+ }
+
+ mutex_enter(&dd->dd_lock);
+ used = dsl_dir_phys(dd)->dd_used_bytes;
+ mutex_exit(&dd->dd_lock);
+
+ if (dd->dd_parent) {
+ avail = dsl_dir_space_available(dd->dd_parent,
+ NULL, 0, FALSE);
+ } else {
+ avail = dsl_pool_adjustedsize(dd->dd_pool,
+ ZFS_SPACE_CHECK_NORMAL) - used;
+ }
+
+ if (MAX(used, newval) > MAX(used, dsl_dir_phys(dd)->dd_reserved)) {
+ uint64_t delta = MAX(used, newval) -
+ MAX(used, dsl_dir_phys(dd)->dd_reserved);
+
+ if (delta > avail ||
+ (dsl_dir_phys(dd)->dd_quota > 0 &&
+ newval > dsl_dir_phys(dd)->dd_quota))
+ error = SET_ERROR(ENOSPC);
+ }
+
+ dsl_dataset_rele(ds, FTAG);
+ return (error);
+}
+
+void
+dsl_dir_set_reservation_sync_impl(dsl_dir_t *dd, uint64_t value, dmu_tx_t *tx)
+{
+ uint64_t used;
+ int64_t delta;
+
+ dmu_buf_will_dirty(dd->dd_dbuf, tx);
+
+ mutex_enter(&dd->dd_lock);
+ used = dsl_dir_phys(dd)->dd_used_bytes;
+ delta = MAX(used, value) - MAX(used, dsl_dir_phys(dd)->dd_reserved);
+ dsl_dir_phys(dd)->dd_reserved = value;
+
+ if (dd->dd_parent != NULL) {
+ /* Roll up this additional usage into our ancestors */
+ dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD_RSRV,
+ delta, 0, 0, tx);
+ }
+ mutex_exit(&dd->dd_lock);
+}
+
+static void
+dsl_dir_set_reservation_sync(void *arg, dmu_tx_t *tx)
+{
+ dsl_dir_set_qr_arg_t *ddsqra = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ dsl_dataset_t *ds;
+ uint64_t newval;
+
+ VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds));
+
+ if (spa_version(dp->dp_spa) >= SPA_VERSION_RECVD_PROPS) {
+ dsl_prop_set_sync_impl(ds,
+ zfs_prop_to_name(ZFS_PROP_RESERVATION),
+ ddsqra->ddsqra_source, sizeof (ddsqra->ddsqra_value), 1,
+ &ddsqra->ddsqra_value, tx);
+
+ VERIFY0(dsl_prop_get_int_ds(ds,
+ zfs_prop_to_name(ZFS_PROP_RESERVATION), &newval));
+ } else {
+ newval = ddsqra->ddsqra_value;
+ spa_history_log_internal_ds(ds, "set", tx, "%s=%lld",
+ zfs_prop_to_name(ZFS_PROP_RESERVATION),
+ (longlong_t)newval);
+ }
+
+ dsl_dir_set_reservation_sync_impl(ds->ds_dir, newval, tx);
+ dsl_dataset_rele(ds, FTAG);
+}
+
+int
+dsl_dir_set_reservation(const char *ddname, zprop_source_t source,
+ uint64_t reservation)
+{
+ dsl_dir_set_qr_arg_t ddsqra;
+
+ ddsqra.ddsqra_name = ddname;
+ ddsqra.ddsqra_source = source;
+ ddsqra.ddsqra_value = reservation;
+
+ return (dsl_sync_task(ddname, dsl_dir_set_reservation_check,
+ dsl_dir_set_reservation_sync, &ddsqra, 0,
+ ZFS_SPACE_CHECK_EXTRA_RESERVED));
+}
+
+static dsl_dir_t *
+closest_common_ancestor(dsl_dir_t *ds1, dsl_dir_t *ds2)
+{
+ for (; ds1; ds1 = ds1->dd_parent) {
+ dsl_dir_t *dd;
+ for (dd = ds2; dd; dd = dd->dd_parent) {
+ if (ds1 == dd)
+ return (dd);
+ }
+ }
+ return (NULL);
+}
+
+/*
+ * If delta is applied to dd, how much of that delta would be applied to
+ * ancestor? Syncing context only.
+ */
+static int64_t
+would_change(dsl_dir_t *dd, int64_t delta, dsl_dir_t *ancestor)
+{
+ if (dd == ancestor)
+ return (delta);
+
+ mutex_enter(&dd->dd_lock);
+ delta = parent_delta(dd, dsl_dir_phys(dd)->dd_used_bytes, delta);
+ mutex_exit(&dd->dd_lock);
+ return (would_change(dd->dd_parent, delta, ancestor));
+}
+
+typedef struct dsl_dir_rename_arg {
+ const char *ddra_oldname;
+ const char *ddra_newname;
+ cred_t *ddra_cred;
+ proc_t *ddra_proc;
+} dsl_dir_rename_arg_t;
+
+typedef struct dsl_valid_rename_arg {
+ int char_delta;
+ int nest_delta;
+} dsl_valid_rename_arg_t;
+
+/* ARGSUSED */
+static int
+dsl_valid_rename(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
+{
+ dsl_valid_rename_arg_t *dvra = arg;
+ char namebuf[ZFS_MAX_DATASET_NAME_LEN];
+
+ dsl_dataset_name(ds, namebuf);
+
+ ASSERT3U(strnlen(namebuf, ZFS_MAX_DATASET_NAME_LEN),
+ <, ZFS_MAX_DATASET_NAME_LEN);
+ int namelen = strlen(namebuf) + dvra->char_delta;
+ int depth = get_dataset_depth(namebuf) + dvra->nest_delta;
+
+ if (namelen >= ZFS_MAX_DATASET_NAME_LEN)
+ return (SET_ERROR(ENAMETOOLONG));
+ if (dvra->nest_delta > 0 && depth >= zfs_max_dataset_nesting)
+ return (SET_ERROR(ENAMETOOLONG));
+ return (0);
+}
+
+static int
+dsl_dir_rename_check(void *arg, dmu_tx_t *tx)
+{
+ dsl_dir_rename_arg_t *ddra = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ dsl_dir_t *dd, *newparent;
+ dsl_valid_rename_arg_t dvra;
+ dsl_dataset_t *parentds;
+ objset_t *parentos;
+ const char *mynewname;
+ int error;
+
+ /* target dir should exist */
+ error = dsl_dir_hold(dp, ddra->ddra_oldname, FTAG, &dd, NULL);
+ if (error != 0)
+ return (error);
+
+ /* new parent should exist */
+ error = dsl_dir_hold(dp, ddra->ddra_newname, FTAG,
+ &newparent, &mynewname);
+ if (error != 0) {
+ dsl_dir_rele(dd, FTAG);
+ return (error);
+ }
+
+ /* can't rename to different pool */
+ if (dd->dd_pool != newparent->dd_pool) {
+ dsl_dir_rele(newparent, FTAG);
+ dsl_dir_rele(dd, FTAG);
+ return (SET_ERROR(EXDEV));
+ }
+
+ /* new name should not already exist */
+ if (mynewname == NULL) {
+ dsl_dir_rele(newparent, FTAG);
+ dsl_dir_rele(dd, FTAG);
+ return (SET_ERROR(EEXIST));
+ }
+
+ /* can't rename below anything but filesystems (e.g. no ZVOLs) */
+ error = dsl_dataset_hold_obj(newparent->dd_pool,
+ dsl_dir_phys(newparent)->dd_head_dataset_obj, FTAG, &parentds);
+ if (error != 0) {
+ dsl_dir_rele(newparent, FTAG);
+ dsl_dir_rele(dd, FTAG);
+ return (error);
+ }
+ error = dmu_objset_from_ds(parentds, &parentos);
+ if (error != 0) {
+ dsl_dataset_rele(parentds, FTAG);
+ dsl_dir_rele(newparent, FTAG);
+ dsl_dir_rele(dd, FTAG);
+ return (error);
+ }
+ if (dmu_objset_type(parentos) != DMU_OST_ZFS) {
+ dsl_dataset_rele(parentds, FTAG);
+ dsl_dir_rele(newparent, FTAG);
+ dsl_dir_rele(dd, FTAG);
+ return (SET_ERROR(ZFS_ERR_WRONG_PARENT));
+ }
+ dsl_dataset_rele(parentds, FTAG);
+
+ ASSERT3U(strnlen(ddra->ddra_newname, ZFS_MAX_DATASET_NAME_LEN),
+ <, ZFS_MAX_DATASET_NAME_LEN);
+ ASSERT3U(strnlen(ddra->ddra_oldname, ZFS_MAX_DATASET_NAME_LEN),
+ <, ZFS_MAX_DATASET_NAME_LEN);
+ dvra.char_delta = strlen(ddra->ddra_newname)
+ - strlen(ddra->ddra_oldname);
+ dvra.nest_delta = get_dataset_depth(ddra->ddra_newname)
+ - get_dataset_depth(ddra->ddra_oldname);
+
+ /* if the name length is growing, validate child name lengths */
+ if (dvra.char_delta > 0 || dvra.nest_delta > 0) {
+ error = dmu_objset_find_dp(dp, dd->dd_object, dsl_valid_rename,
+ &dvra, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS);
+ if (error != 0) {
+ dsl_dir_rele(newparent, FTAG);
+ dsl_dir_rele(dd, FTAG);
+ return (error);
+ }
+ }
+
+ if (dmu_tx_is_syncing(tx)) {
+ if (spa_feature_is_active(dp->dp_spa,
+ SPA_FEATURE_FS_SS_LIMIT)) {
+ /*
+ * Although this is the check function and we don't
+ * normally make on-disk changes in check functions,
+ * we need to do that here.
+ *
+ * Ensure this portion of the tree's counts have been
+ * initialized in case the new parent has limits set.
+ */
+ dsl_dir_init_fs_ss_count(dd, tx);
+ }
+ }
+
+ if (newparent != dd->dd_parent) {
+ /* is there enough space? */
+ uint64_t myspace =
+ MAX(dsl_dir_phys(dd)->dd_used_bytes,
+ dsl_dir_phys(dd)->dd_reserved);
+ objset_t *os = dd->dd_pool->dp_meta_objset;
+ uint64_t fs_cnt = 0;
+ uint64_t ss_cnt = 0;
+
+ if (dsl_dir_is_zapified(dd)) {
+ int err;
+
+ err = zap_lookup(os, dd->dd_object,
+ DD_FIELD_FILESYSTEM_COUNT, sizeof (fs_cnt), 1,
+ &fs_cnt);
+ if (err != ENOENT && err != 0) {
+ dsl_dir_rele(newparent, FTAG);
+ dsl_dir_rele(dd, FTAG);
+ return (err);
+ }
+
+ /*
+ * have to add 1 for the filesystem itself that we're
+ * moving
+ */
+ fs_cnt++;
+
+ err = zap_lookup(os, dd->dd_object,
+ DD_FIELD_SNAPSHOT_COUNT, sizeof (ss_cnt), 1,
+ &ss_cnt);
+ if (err != ENOENT && err != 0) {
+ dsl_dir_rele(newparent, FTAG);
+ dsl_dir_rele(dd, FTAG);
+ return (err);
+ }
+ }
+
+ /* check for encryption errors */
+ error = dsl_dir_rename_crypt_check(dd, newparent);
+ if (error != 0) {
+ dsl_dir_rele(newparent, FTAG);
+ dsl_dir_rele(dd, FTAG);
+ return (SET_ERROR(EACCES));
+ }
+
+ /* no rename into our descendant */
+ if (closest_common_ancestor(dd, newparent) == dd) {
+ dsl_dir_rele(newparent, FTAG);
+ dsl_dir_rele(dd, FTAG);
+ return (SET_ERROR(EINVAL));
+ }
+
+ error = dsl_dir_transfer_possible(dd->dd_parent,
+ newparent, fs_cnt, ss_cnt, myspace,
+ ddra->ddra_cred, ddra->ddra_proc);
+ if (error != 0) {
+ dsl_dir_rele(newparent, FTAG);
+ dsl_dir_rele(dd, FTAG);
+ return (error);
+ }
+ }
+
+ dsl_dir_rele(newparent, FTAG);
+ dsl_dir_rele(dd, FTAG);
+ return (0);
+}
+
+static void
+dsl_dir_rename_sync(void *arg, dmu_tx_t *tx)
+{
+ dsl_dir_rename_arg_t *ddra = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ dsl_dir_t *dd, *newparent;
+ const char *mynewname;
+ objset_t *mos = dp->dp_meta_objset;
+
+ VERIFY0(dsl_dir_hold(dp, ddra->ddra_oldname, FTAG, &dd, NULL));
+ VERIFY0(dsl_dir_hold(dp, ddra->ddra_newname, FTAG, &newparent,
+ &mynewname));
+
+ /* Log this before we change the name. */
+ spa_history_log_internal_dd(dd, "rename", tx,
+ "-> %s", ddra->ddra_newname);
+
+ if (newparent != dd->dd_parent) {
+ objset_t *os = dd->dd_pool->dp_meta_objset;
+ uint64_t fs_cnt = 0;
+ uint64_t ss_cnt = 0;
+
+ /*
+ * We already made sure the dd counts were initialized in the
+ * check function.
+ */
+ if (spa_feature_is_active(dp->dp_spa,
+ SPA_FEATURE_FS_SS_LIMIT)) {
+ VERIFY0(zap_lookup(os, dd->dd_object,
+ DD_FIELD_FILESYSTEM_COUNT, sizeof (fs_cnt), 1,
+ &fs_cnt));
+ /* add 1 for the filesystem itself that we're moving */
+ fs_cnt++;
+
+ VERIFY0(zap_lookup(os, dd->dd_object,
+ DD_FIELD_SNAPSHOT_COUNT, sizeof (ss_cnt), 1,
+ &ss_cnt));
+ }
+
+ dsl_fs_ss_count_adjust(dd->dd_parent, -fs_cnt,
+ DD_FIELD_FILESYSTEM_COUNT, tx);
+ dsl_fs_ss_count_adjust(newparent, fs_cnt,
+ DD_FIELD_FILESYSTEM_COUNT, tx);
+
+ dsl_fs_ss_count_adjust(dd->dd_parent, -ss_cnt,
+ DD_FIELD_SNAPSHOT_COUNT, tx);
+ dsl_fs_ss_count_adjust(newparent, ss_cnt,
+ DD_FIELD_SNAPSHOT_COUNT, tx);
+
+ dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD,
+ -dsl_dir_phys(dd)->dd_used_bytes,
+ -dsl_dir_phys(dd)->dd_compressed_bytes,
+ -dsl_dir_phys(dd)->dd_uncompressed_bytes, tx);
+ dsl_dir_diduse_space(newparent, DD_USED_CHILD,
+ dsl_dir_phys(dd)->dd_used_bytes,
+ dsl_dir_phys(dd)->dd_compressed_bytes,
+ dsl_dir_phys(dd)->dd_uncompressed_bytes, tx);
+
+ if (dsl_dir_phys(dd)->dd_reserved >
+ dsl_dir_phys(dd)->dd_used_bytes) {
+ uint64_t unused_rsrv = dsl_dir_phys(dd)->dd_reserved -
+ dsl_dir_phys(dd)->dd_used_bytes;
+
+ dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD_RSRV,
+ -unused_rsrv, 0, 0, tx);
+ dsl_dir_diduse_space(newparent, DD_USED_CHILD_RSRV,
+ unused_rsrv, 0, 0, tx);
+ }
+ }
+
+ dmu_buf_will_dirty(dd->dd_dbuf, tx);
+
+ /* remove from old parent zapobj */
+ VERIFY0(zap_remove(mos,
+ dsl_dir_phys(dd->dd_parent)->dd_child_dir_zapobj,
+ dd->dd_myname, tx));
+
+ (void) strlcpy(dd->dd_myname, mynewname,
+ sizeof (dd->dd_myname));
+ dsl_dir_rele(dd->dd_parent, dd);
+ dsl_dir_phys(dd)->dd_parent_obj = newparent->dd_object;
+ VERIFY0(dsl_dir_hold_obj(dp,
+ newparent->dd_object, NULL, dd, &dd->dd_parent));
+
+ /* add to new parent zapobj */
+ VERIFY0(zap_add(mos, dsl_dir_phys(newparent)->dd_child_dir_zapobj,
+ dd->dd_myname, 8, 1, &dd->dd_object, tx));
+
+ /* TODO: A rename callback to avoid these layering violations. */
+ zfsvfs_update_fromname(ddra->ddra_oldname, ddra->ddra_newname);
+ zvol_rename_minors(dp->dp_spa, ddra->ddra_oldname,
+ ddra->ddra_newname, B_TRUE);
+
+ dsl_prop_notify_all(dd);
+
+ dsl_dir_rele(newparent, FTAG);
+ dsl_dir_rele(dd, FTAG);
+}
+
+int
+dsl_dir_rename(const char *oldname, const char *newname)
+{
+ dsl_dir_rename_arg_t ddra;
+
+ ddra.ddra_oldname = oldname;
+ ddra.ddra_newname = newname;
+ ddra.ddra_cred = CRED();
+ ddra.ddra_proc = curproc;
+
+ return (dsl_sync_task(oldname,
+ dsl_dir_rename_check, dsl_dir_rename_sync, &ddra,
+ 3, ZFS_SPACE_CHECK_RESERVED));
+}
+
+int
+dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd,
+ uint64_t fs_cnt, uint64_t ss_cnt, uint64_t space,
+ cred_t *cr, proc_t *proc)
+{
+ dsl_dir_t *ancestor;
+ int64_t adelta;
+ uint64_t avail;
+ int err;
+
+ ancestor = closest_common_ancestor(sdd, tdd);
+ adelta = would_change(sdd, -space, ancestor);
+ avail = dsl_dir_space_available(tdd, ancestor, adelta, FALSE);
+ if (avail < space)
+ return (SET_ERROR(ENOSPC));
+
+ err = dsl_fs_ss_limit_check(tdd, fs_cnt, ZFS_PROP_FILESYSTEM_LIMIT,
+ ancestor, cr, proc);
+ if (err != 0)
+ return (err);
+ err = dsl_fs_ss_limit_check(tdd, ss_cnt, ZFS_PROP_SNAPSHOT_LIMIT,
+ ancestor, cr, proc);
+ if (err != 0)
+ return (err);
+
+ return (0);
+}
+
+inode_timespec_t
+dsl_dir_snap_cmtime(dsl_dir_t *dd)
+{
+ inode_timespec_t t;
+
+ mutex_enter(&dd->dd_lock);
+ t = dd->dd_snap_cmtime;
+ mutex_exit(&dd->dd_lock);
+
+ return (t);
+}
+
+void
+dsl_dir_snap_cmtime_update(dsl_dir_t *dd)
+{
+ inode_timespec_t t;
+
+ gethrestime(&t);
+ mutex_enter(&dd->dd_lock);
+ dd->dd_snap_cmtime = t;
+ mutex_exit(&dd->dd_lock);
+}
+
+void
+dsl_dir_zapify(dsl_dir_t *dd, dmu_tx_t *tx)
+{
+ objset_t *mos = dd->dd_pool->dp_meta_objset;
+ dmu_object_zapify(mos, dd->dd_object, DMU_OT_DSL_DIR, tx);
+}
+
+boolean_t
+dsl_dir_is_zapified(dsl_dir_t *dd)
+{
+ dmu_object_info_t doi;
+
+ dmu_object_info_from_db(dd->dd_dbuf, &doi);
+ return (doi.doi_type == DMU_OTN_ZAP_METADATA);
+}
+
+void
+dsl_dir_livelist_open(dsl_dir_t *dd, uint64_t obj)
+{
+ objset_t *mos = dd->dd_pool->dp_meta_objset;
+ ASSERT(spa_feature_is_active(dd->dd_pool->dp_spa,
+ SPA_FEATURE_LIVELIST));
+ dsl_deadlist_open(&dd->dd_livelist, mos, obj);
+ bplist_create(&dd->dd_pending_allocs);
+ bplist_create(&dd->dd_pending_frees);
+}
+
+void
+dsl_dir_livelist_close(dsl_dir_t *dd)
+{
+ dsl_deadlist_close(&dd->dd_livelist);
+ bplist_destroy(&dd->dd_pending_allocs);
+ bplist_destroy(&dd->dd_pending_frees);
+}
+
+void
+dsl_dir_remove_livelist(dsl_dir_t *dd, dmu_tx_t *tx, boolean_t total)
+{
+ uint64_t obj;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ spa_t *spa = dp->dp_spa;
+ livelist_condense_entry_t to_condense = spa->spa_to_condense;
+
+ if (!dsl_deadlist_is_open(&dd->dd_livelist))
+ return;
+
+ /*
+ * If the livelist being removed is set to be condensed, stop the
+ * condense zthr and indicate the cancellation in the spa_to_condense
+ * struct in case the condense no-wait synctask has already started
+ */
+ zthr_t *ll_condense_thread = spa->spa_livelist_condense_zthr;
+ if (ll_condense_thread != NULL &&
+ (to_condense.ds != NULL) && (to_condense.ds->ds_dir == dd)) {
+ /*
+ * We use zthr_wait_cycle_done instead of zthr_cancel
+ * because we don't want to destroy the zthr, just have
+ * it skip its current task.
+ */
+ spa->spa_to_condense.cancelled = B_TRUE;
+ zthr_wait_cycle_done(ll_condense_thread);
+ /*
+ * If we've returned from zthr_wait_cycle_done without
+ * clearing the to_condense data structure, it's either
+ * because the no-wait synctask has started (which is
+ * indicated by the 'syncing' field of to_condense), in
+ * which case we can expect it to clear to_condense on
+ * its own, or because we returned before the zthr ran.
+ * In the latter case the checkfunc will now fail as
+ * cancelled == B_TRUE, so we can safely NULL out ds,
+ * allowing a different dir's livelist to be condensed.
+ *
+ * We can be sure that the to_condense struct will not
+ * be repopulated at this stage because both this
+ * function and dsl_livelist_try_condense execute in
+ * syncing context.
+ */
+ if ((spa->spa_to_condense.ds != NULL) &&
+ !spa->spa_to_condense.syncing) {
+ dmu_buf_rele(spa->spa_to_condense.ds->ds_dbuf,
+ spa);
+ spa->spa_to_condense.ds = NULL;
+ }
+ }
+
+ dsl_dir_livelist_close(dd);
+ VERIFY0(zap_lookup(dp->dp_meta_objset, dd->dd_object,
+ DD_FIELD_LIVELIST, sizeof (uint64_t), 1, &obj));
+ VERIFY0(zap_remove(dp->dp_meta_objset, dd->dd_object,
+ DD_FIELD_LIVELIST, tx));
+ if (total) {
+ dsl_deadlist_free(dp->dp_meta_objset, obj, tx);
+ spa_feature_decr(spa, SPA_FEATURE_LIVELIST, tx);
+ }
+}
+
+static int
+dsl_dir_activity_in_progress(dsl_dir_t *dd, dsl_dataset_t *ds,
+ zfs_wait_activity_t activity, boolean_t *in_progress)
+{
+ int error = 0;
+
+ ASSERT(MUTEX_HELD(&dd->dd_activity_lock));
+
+ switch (activity) {
+ case ZFS_WAIT_DELETEQ: {
+#ifdef _KERNEL
+ objset_t *os;
+ error = dmu_objset_from_ds(ds, &os);
+ if (error != 0)
+ break;
+
+ mutex_enter(&os->os_user_ptr_lock);
+ void *user = dmu_objset_get_user(os);
+ mutex_exit(&os->os_user_ptr_lock);
+ if (dmu_objset_type(os) != DMU_OST_ZFS ||
+ user == NULL || zfs_get_vfs_flag_unmounted(os)) {
+ *in_progress = B_FALSE;
+ return (0);
+ }
+
+ uint64_t readonly = B_FALSE;
+ error = zfs_get_temporary_prop(ds, ZFS_PROP_READONLY, &readonly,
+ NULL);
+
+ if (error != 0)
+ break;
+
+ if (readonly || !spa_writeable(dd->dd_pool->dp_spa)) {
+ *in_progress = B_FALSE;
+ return (0);
+ }
+
+ uint64_t count, unlinked_obj;
+ error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1,
+ &unlinked_obj);
+ if (error != 0) {
+ dsl_dataset_rele(ds, FTAG);
+ break;
+ }
+ error = zap_count(os, unlinked_obj, &count);
+
+ if (error == 0)
+ *in_progress = (count != 0);
+ break;
+#else
+ /*
+ * The delete queue is ZPL specific, and libzpool doesn't have
+ * it. It doesn't make sense to wait for it.
+ */
+ *in_progress = B_FALSE;
+ break;
+#endif
+ }
+ default:
+ panic("unrecognized value for activity %d", activity);
+ }
+
+ return (error);
+}
+
+int
+dsl_dir_wait(dsl_dir_t *dd, dsl_dataset_t *ds, zfs_wait_activity_t activity,
+ boolean_t *waited)
+{
+ int error = 0;
+ boolean_t in_progress;
+ dsl_pool_t *dp = dd->dd_pool;
+ for (;;) {
+ dsl_pool_config_enter(dp, FTAG);
+ error = dsl_dir_activity_in_progress(dd, ds, activity,
+ &in_progress);
+ dsl_pool_config_exit(dp, FTAG);
+ if (error != 0 || !in_progress)
+ break;
+
+ *waited = B_TRUE;
+
+ if (cv_wait_sig(&dd->dd_activity_cv, &dd->dd_activity_lock) ==
+ 0 || dd->dd_activity_cancelled) {
+ error = SET_ERROR(EINTR);
+ break;
+ }
+ }
+ return (error);
+}
+
+void
+dsl_dir_cancel_waiters(dsl_dir_t *dd)
+{
+ mutex_enter(&dd->dd_activity_lock);
+ dd->dd_activity_cancelled = B_TRUE;
+ cv_broadcast(&dd->dd_activity_cv);
+ while (dd->dd_activity_waiters > 0)
+ cv_wait(&dd->dd_activity_cv, &dd->dd_activity_lock);
+ mutex_exit(&dd->dd_activity_lock);
+}
+
+#if defined(_KERNEL)
+EXPORT_SYMBOL(dsl_dir_set_quota);
+EXPORT_SYMBOL(dsl_dir_set_reservation);
+#endif
diff --git a/sys/contrib/openzfs/module/zfs/dsl_pool.c b/sys/contrib/openzfs/module/zfs/dsl_pool.c
new file mode 100644
index 000000000000..c770eafa75d8
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/dsl_pool.c
@@ -0,0 +1,1417 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2020 by Delphix. All rights reserved.
+ * Copyright (c) 2013 Steven Hartland. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright 2016 Nexenta Systems, Inc. All rights reserved.
+ */
+
+#include <sys/dsl_pool.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_prop.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_synctask.h>
+#include <sys/dsl_scan.h>
+#include <sys/dnode.h>
+#include <sys/dmu_tx.h>
+#include <sys/dmu_objset.h>
+#include <sys/arc.h>
+#include <sys/zap.h>
+#include <sys/zio.h>
+#include <sys/zfs_context.h>
+#include <sys/fs/zfs.h>
+#include <sys/zfs_znode.h>
+#include <sys/spa_impl.h>
+#include <sys/vdev_impl.h>
+#include <sys/metaslab_impl.h>
+#include <sys/bptree.h>
+#include <sys/zfeature.h>
+#include <sys/zil_impl.h>
+#include <sys/dsl_userhold.h>
+#include <sys/trace_zfs.h>
+#include <sys/mmp.h>
+
+/*
+ * ZFS Write Throttle
+ * ------------------
+ *
+ * ZFS must limit the rate of incoming writes to the rate at which it is able
+ * to sync data modifications to the backend storage. Throttling by too much
+ * creates an artificial limit; throttling by too little can only be sustained
+ * for short periods and would lead to highly lumpy performance. On a per-pool
+ * basis, ZFS tracks the amount of modified (dirty) data. As operations change
+ * data, the amount of dirty data increases; as ZFS syncs out data, the amount
+ * of dirty data decreases. When the amount of dirty data exceeds a
+ * predetermined threshold further modifications are blocked until the amount
+ * of dirty data decreases (as data is synced out).
+ *
+ * The limit on dirty data is tunable, and should be adjusted according to
+ * both the IO capacity and available memory of the system. The larger the
+ * window, the more ZFS is able to aggregate and amortize metadata (and data)
+ * changes. However, memory is a limited resource, and allowing for more dirty
+ * data comes at the cost of keeping other useful data in memory (for example
+ * ZFS data cached by the ARC).
+ *
+ * Implementation
+ *
+ * As buffers are modified, dsl_pool_dirty_space() increments both the per-
+ * txg (dp_dirty_pertxg[]) and poolwide (dp_dirty_total) accounting of
+ * dirty space used; dsl_pool_undirty_space() decrements those values as data
+ * is synced out from dsl_pool_sync(). While only the poolwide value is
+ * relevant, the per-txg value is useful for debugging. The tunable
+ * zfs_dirty_data_max determines the dirty space limit. Once that value is
+ * exceeded, new writes are halted until space frees up.
+ *
+ * The zfs_dirty_data_sync_percent tunable dictates the threshold at which we
+ * ensure that there is a txg syncing (see the comment in txg.c for a full
+ * description of transaction group stages).
+ *
+ * The IO scheduler uses both the dirty space limit and current amount of
+ * dirty data as inputs. Those values affect the number of concurrent IOs ZFS
+ * issues. See the comment in vdev_queue.c for details of the IO scheduler.
+ *
+ * The delay is also calculated based on the amount of dirty data. See the
+ * comment above dmu_tx_delay() for details.
+ */
+
+/*
+ * zfs_dirty_data_max will be set to zfs_dirty_data_max_percent% of all memory,
+ * capped at zfs_dirty_data_max_max. It can also be overridden with a module
+ * parameter.
+ */
+unsigned long zfs_dirty_data_max = 0;
+unsigned long zfs_dirty_data_max_max = 0;
+int zfs_dirty_data_max_percent = 10;
+int zfs_dirty_data_max_max_percent = 25;
+
+/*
+ * If there's at least this much dirty data (as a percentage of
+ * zfs_dirty_data_max), push out a txg. This should be less than
+ * zfs_vdev_async_write_active_min_dirty_percent.
+ */
+int zfs_dirty_data_sync_percent = 20;
+
+/*
+ * Once there is this amount of dirty data, the dmu_tx_delay() will kick in
+ * and delay each transaction.
+ * This value should be >= zfs_vdev_async_write_active_max_dirty_percent.
+ */
+int zfs_delay_min_dirty_percent = 60;
+
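+/*
+ * Worked example (illustrative only, assuming the defaults above on a
+ * hypothetical machine with 64 GiB of RAM): zfs_dirty_data_max starts at
+ * 10% of RAM = 6.4 GiB and any override is capped at 25% of RAM = 16 GiB
+ * (zfs_dirty_data_max_max). With zfs_dirty_data_sync_percent = 20, a txg
+ * sync is requested once roughly 1.3 GiB is dirty, and with
+ * zfs_delay_min_dirty_percent = 60, dmu_tx_delay() begins delaying writes
+ * at roughly 3.8 GiB of dirty data.
+ */
+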
+/*
+ * This controls how quickly the delay approaches infinity.
+ * Larger values cause it to delay more for a given amount of dirty data.
+ * Therefore larger values will cause there to be less dirty data for a
+ * given throughput.
+ *
+ * For the smoothest delay, this value should be about 1 billion divided
+ * by the maximum number of operations per second. This will smoothly
+ * handle between 10x and 1/10th this number.
+ *
+ * Note: zfs_delay_scale * zfs_dirty_data_max must be < 2^64, due to the
+ * multiply in dmu_tx_delay().
+ */
+unsigned long zfs_delay_scale = 1000 * 1000 * 1000 / 2000;
+
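+/*
+ * Worked example (illustrative only): for a pool whose backend storage can
+ * sustain roughly 2,000 write operations per second, the guideline above
+ * gives 1,000,000,000 / 2,000 = 500,000, which is the default value of
+ * zfs_delay_scale. A hypothetical all-flash pool sustaining 100,000 ops/sec
+ * would instead suggest 1,000,000,000 / 100,000 = 10,000.
+ */
+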
+/*
+ * This determines the number of threads used by the dp_sync_taskq.
+ */
+int zfs_sync_taskq_batch_pct = 75;
+
+/*
+ * These tunables determine the behavior of how zil_itxg_clean() is
+ * called via zil_clean() in the context of spa_sync(). When an itxg
+ * list needs to be cleaned, TQ_NOSLEEP will be used when dispatching.
+ * If the dispatch fails, the call to zil_itxg_clean() will occur
+ * synchronously in the context of spa_sync(), which can negatively
+ * impact the performance of spa_sync() (e.g. in the case of the itxg
+ * list having a large number of itxs that need to be cleaned).
+ *
+ * Thus, these tunables can be used to manipulate the behavior of the
+ * taskq used by zil_clean(); they determine the number of taskq entries
+ * that are pre-populated when the taskq is first created (via the
+ * "zfs_zil_clean_taskq_minalloc" tunable) and the maximum number of
+ * taskq entries that are cached after an on-demand allocation (via the
+ * "zfs_zil_clean_taskq_maxalloc").
+ *
+ * The idea being, we want to try reasonably hard to ensure there will
+ * already be a taskq entry pre-allocated by the time that it is needed
+ * by zil_clean(). This way, we can avoid the possibility of an
+ * on-demand allocation of a new taskq entry from failing, which would
+ * result in zil_itxg_clean() being called synchronously from zil_clean()
+ * (which can adversely affect performance of spa_sync()).
+ *
+ * Additionally, the number of threads used by the taskq can be
+ * configured via the "zfs_zil_clean_taskq_nthr_pct" tunable.
+ */
+int zfs_zil_clean_taskq_nthr_pct = 100;
+int zfs_zil_clean_taskq_minalloc = 1024;
+int zfs_zil_clean_taskq_maxalloc = 1024 * 1024;
+
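+/*
+ * Illustrative sketch (not the actual zil_clean() code) of the dispatch
+ * pattern described above: attempt a TQ_NOSLEEP dispatch, and fall back to
+ * cleaning the itxg list synchronously if no taskq entry is available:
+ *
+ *	if (taskq_dispatch(dp->dp_zil_clean_taskq,
+ *	    (void (*)(void *))zil_itxg_clean, itxs, TQ_NOSLEEP) ==
+ *	    TASKQID_INVALID)
+ *		zil_itxg_clean(itxs);
+ */
+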
+int
+dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **ddp)
+{
+ uint64_t obj;
+ int err;
+
+ err = zap_lookup(dp->dp_meta_objset,
+ dsl_dir_phys(dp->dp_root_dir)->dd_child_dir_zapobj,
+ name, sizeof (obj), 1, &obj);
+ if (err)
+ return (err);
+
+ return (dsl_dir_hold_obj(dp, obj, name, dp, ddp));
+}
+
+static dsl_pool_t *
+dsl_pool_open_impl(spa_t *spa, uint64_t txg)
+{
+ dsl_pool_t *dp;
+ blkptr_t *bp = spa_get_rootblkptr(spa);
+
+ dp = kmem_zalloc(sizeof (dsl_pool_t), KM_SLEEP);
+ dp->dp_spa = spa;
+ dp->dp_meta_rootbp = *bp;
+ rrw_init(&dp->dp_config_rwlock, B_TRUE);
+ txg_init(dp, txg);
+ mmp_init(spa);
+
+ txg_list_create(&dp->dp_dirty_datasets, spa,
+ offsetof(dsl_dataset_t, ds_dirty_link));
+ txg_list_create(&dp->dp_dirty_zilogs, spa,
+ offsetof(zilog_t, zl_dirty_link));
+ txg_list_create(&dp->dp_dirty_dirs, spa,
+ offsetof(dsl_dir_t, dd_dirty_link));
+ txg_list_create(&dp->dp_sync_tasks, spa,
+ offsetof(dsl_sync_task_t, dst_node));
+ txg_list_create(&dp->dp_early_sync_tasks, spa,
+ offsetof(dsl_sync_task_t, dst_node));
+
+ dp->dp_sync_taskq = taskq_create("dp_sync_taskq",
+ zfs_sync_taskq_batch_pct, minclsyspri, 1, INT_MAX,
+ TASKQ_THREADS_CPU_PCT);
+
+ dp->dp_zil_clean_taskq = taskq_create("dp_zil_clean_taskq",
+ zfs_zil_clean_taskq_nthr_pct, minclsyspri,
+ zfs_zil_clean_taskq_minalloc,
+ zfs_zil_clean_taskq_maxalloc,
+ TASKQ_PREPOPULATE | TASKQ_THREADS_CPU_PCT);
+
+ mutex_init(&dp->dp_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&dp->dp_spaceavail_cv, NULL, CV_DEFAULT, NULL);
+
+ dp->dp_zrele_taskq = taskq_create("z_zrele", 100, defclsyspri,
+ boot_ncpus * 8, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC |
+ TASKQ_THREADS_CPU_PCT);
+ dp->dp_unlinked_drain_taskq = taskq_create("z_unlinked_drain",
+ 100, defclsyspri, boot_ncpus, INT_MAX,
+ TASKQ_PREPOPULATE | TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT);
+
+ return (dp);
+}
+
+int
+dsl_pool_init(spa_t *spa, uint64_t txg, dsl_pool_t **dpp)
+{
+ int err;
+ dsl_pool_t *dp = dsl_pool_open_impl(spa, txg);
+
+ /*
+ * Initialize the caller's dsl_pool_t structure before we actually open
+ * the meta objset. This is done because a self-healing write zio may
+ * be issued as part of dmu_objset_open_impl() and the spa needs its
+ * dsl_pool_t initialized in order to handle the write.
+ */
+ *dpp = dp;
+
+ err = dmu_objset_open_impl(spa, NULL, &dp->dp_meta_rootbp,
+ &dp->dp_meta_objset);
+ if (err != 0) {
+ dsl_pool_close(dp);
+ *dpp = NULL;
+ }
+
+ return (err);
+}
+
+int
+dsl_pool_open(dsl_pool_t *dp)
+{
+ int err;
+ dsl_dir_t *dd;
+ dsl_dataset_t *ds;
+ uint64_t obj;
+
+ rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);
+ err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1,
+ &dp->dp_root_dir_obj);
+ if (err)
+ goto out;
+
+ err = dsl_dir_hold_obj(dp, dp->dp_root_dir_obj,
+ NULL, dp, &dp->dp_root_dir);
+ if (err)
+ goto out;
+
+ err = dsl_pool_open_special_dir(dp, MOS_DIR_NAME, &dp->dp_mos_dir);
+ if (err)
+ goto out;
+
+ if (spa_version(dp->dp_spa) >= SPA_VERSION_ORIGIN) {
+ err = dsl_pool_open_special_dir(dp, ORIGIN_DIR_NAME, &dd);
+ if (err)
+ goto out;
+ err = dsl_dataset_hold_obj(dp,
+ dsl_dir_phys(dd)->dd_head_dataset_obj, FTAG, &ds);
+ if (err == 0) {
+ err = dsl_dataset_hold_obj(dp,
+ dsl_dataset_phys(ds)->ds_prev_snap_obj, dp,
+ &dp->dp_origin_snap);
+ dsl_dataset_rele(ds, FTAG);
+ }
+ dsl_dir_rele(dd, dp);
+ if (err)
+ goto out;
+ }
+
+ if (spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
+ err = dsl_pool_open_special_dir(dp, FREE_DIR_NAME,
+ &dp->dp_free_dir);
+ if (err)
+ goto out;
+
+ err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj);
+ if (err)
+ goto out;
+ VERIFY0(bpobj_open(&dp->dp_free_bpobj,
+ dp->dp_meta_objset, obj));
+ }
+
+ if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_OBSOLETE_COUNTS)) {
+ err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_OBSOLETE_BPOBJ, sizeof (uint64_t), 1, &obj);
+ if (err == 0) {
+ VERIFY0(bpobj_open(&dp->dp_obsolete_bpobj,
+ dp->dp_meta_objset, obj));
+ } else if (err == ENOENT) {
+ /*
+ * We might not have created the remap bpobj yet.
+ */
+ err = 0;
+ } else {
+ goto out;
+ }
+ }
+
+ /*
+ * Note: errors ignored, because these special dirs, used for
+ * space accounting, are only created on demand.
+ */
+ (void) dsl_pool_open_special_dir(dp, LEAK_DIR_NAME,
+ &dp->dp_leak_dir);
+
+ if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_ASYNC_DESTROY)) {
+ err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1,
+ &dp->dp_bptree_obj);
+ if (err != 0)
+ goto out;
+ }
+
+ if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_EMPTY_BPOBJ)) {
+ err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_EMPTY_BPOBJ, sizeof (uint64_t), 1,
+ &dp->dp_empty_bpobj);
+ if (err != 0)
+ goto out;
+ }
+
+ err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_TMP_USERREFS, sizeof (uint64_t), 1,
+ &dp->dp_tmp_userrefs_obj);
+ if (err == ENOENT)
+ err = 0;
+ if (err)
+ goto out;
+
+ err = dsl_scan_init(dp, dp->dp_tx.tx_open_txg);
+
+out:
+ rrw_exit(&dp->dp_config_rwlock, FTAG);
+ return (err);
+}
+
+void
+dsl_pool_close(dsl_pool_t *dp)
+{
+ /*
+ * Drop our references from dsl_pool_open().
+ *
+ * Since we held the origin_snap from "syncing" context (which
+ * includes pool-opening context), it actually only got a "ref"
+ * and not a hold, so just drop that here.
+ */
+ if (dp->dp_origin_snap != NULL)
+ dsl_dataset_rele(dp->dp_origin_snap, dp);
+ if (dp->dp_mos_dir != NULL)
+ dsl_dir_rele(dp->dp_mos_dir, dp);
+ if (dp->dp_free_dir != NULL)
+ dsl_dir_rele(dp->dp_free_dir, dp);
+ if (dp->dp_leak_dir != NULL)
+ dsl_dir_rele(dp->dp_leak_dir, dp);
+ if (dp->dp_root_dir != NULL)
+ dsl_dir_rele(dp->dp_root_dir, dp);
+
+ bpobj_close(&dp->dp_free_bpobj);
+ bpobj_close(&dp->dp_obsolete_bpobj);
+
+ /* undo the dmu_objset_open_impl(mos) from dsl_pool_open() */
+ if (dp->dp_meta_objset != NULL)
+ dmu_objset_evict(dp->dp_meta_objset);
+
+ txg_list_destroy(&dp->dp_dirty_datasets);
+ txg_list_destroy(&dp->dp_dirty_zilogs);
+ txg_list_destroy(&dp->dp_sync_tasks);
+ txg_list_destroy(&dp->dp_early_sync_tasks);
+ txg_list_destroy(&dp->dp_dirty_dirs);
+
+ taskq_destroy(dp->dp_zil_clean_taskq);
+ taskq_destroy(dp->dp_sync_taskq);
+
+ /*
+ * We can't set retry to TRUE since we're explicitly specifying
+ * a spa to flush. This is good enough; any missed buffers for
+ * this spa won't cause trouble, and they'll eventually fall
+ * out of the ARC just like any other unused buffer.
+ */
+ arc_flush(dp->dp_spa, FALSE);
+
+ mmp_fini(dp->dp_spa);
+ txg_fini(dp);
+ dsl_scan_fini(dp);
+ dmu_buf_user_evict_wait();
+
+ rrw_destroy(&dp->dp_config_rwlock);
+ mutex_destroy(&dp->dp_lock);
+ cv_destroy(&dp->dp_spaceavail_cv);
+ taskq_destroy(dp->dp_unlinked_drain_taskq);
+ taskq_destroy(dp->dp_zrele_taskq);
+ if (dp->dp_blkstats != NULL) {
+ mutex_destroy(&dp->dp_blkstats->zab_lock);
+ vmem_free(dp->dp_blkstats, sizeof (zfs_all_blkstats_t));
+ }
+ kmem_free(dp, sizeof (dsl_pool_t));
+}
+
+void
+dsl_pool_create_obsolete_bpobj(dsl_pool_t *dp, dmu_tx_t *tx)
+{
+ uint64_t obj;
+ /*
+ * Currently, we only create the obsolete_bpobj where there are
+ * indirect vdevs with referenced mappings.
+ */
+ ASSERT(spa_feature_is_active(dp->dp_spa, SPA_FEATURE_DEVICE_REMOVAL));
+ /* create and open the obsolete_bpobj */
+ obj = bpobj_alloc(dp->dp_meta_objset, SPA_OLD_MAXBLOCKSIZE, tx);
+ VERIFY0(bpobj_open(&dp->dp_obsolete_bpobj, dp->dp_meta_objset, obj));
+ VERIFY0(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_OBSOLETE_BPOBJ, sizeof (uint64_t), 1, &obj, tx));
+ spa_feature_incr(dp->dp_spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
+}
+
+void
+dsl_pool_destroy_obsolete_bpobj(dsl_pool_t *dp, dmu_tx_t *tx)
+{
+ spa_feature_decr(dp->dp_spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
+ VERIFY0(zap_remove(dp->dp_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_OBSOLETE_BPOBJ, tx));
+ bpobj_free(dp->dp_meta_objset,
+ dp->dp_obsolete_bpobj.bpo_object, tx);
+ bpobj_close(&dp->dp_obsolete_bpobj);
+}
+
+dsl_pool_t *
+dsl_pool_create(spa_t *spa, nvlist_t *zplprops, dsl_crypto_params_t *dcp,
+ uint64_t txg)
+{
+ int err;
+ dsl_pool_t *dp = dsl_pool_open_impl(spa, txg);
+ dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg);
+#ifdef _KERNEL
+ objset_t *os;
+#else
+ objset_t *os __attribute__((unused));
+#endif
+ dsl_dataset_t *ds;
+ uint64_t obj;
+
+ rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);
+
+ /* create and open the MOS (meta-objset) */
+ dp->dp_meta_objset = dmu_objset_create_impl(spa,
+ NULL, &dp->dp_meta_rootbp, DMU_OST_META, tx);
+ spa->spa_meta_objset = dp->dp_meta_objset;
+
+ /* create the pool directory */
+ err = zap_create_claim(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_OT_OBJECT_DIRECTORY, DMU_OT_NONE, 0, tx);
+ ASSERT0(err);
+
+ /* Initialize scan structures */
+ VERIFY0(dsl_scan_init(dp, txg));
+
+ /* create and open the root dir */
+ dp->dp_root_dir_obj = dsl_dir_create_sync(dp, NULL, NULL, tx);
+ VERIFY0(dsl_dir_hold_obj(dp, dp->dp_root_dir_obj,
+ NULL, dp, &dp->dp_root_dir));
+
+ /* create and open the meta-objset dir */
+ (void) dsl_dir_create_sync(dp, dp->dp_root_dir, MOS_DIR_NAME, tx);
+ VERIFY0(dsl_pool_open_special_dir(dp,
+ MOS_DIR_NAME, &dp->dp_mos_dir));
+
+ if (spa_version(spa) >= SPA_VERSION_DEADLISTS) {
+ /* create and open the free dir */
+ (void) dsl_dir_create_sync(dp, dp->dp_root_dir,
+ FREE_DIR_NAME, tx);
+ VERIFY0(dsl_pool_open_special_dir(dp,
+ FREE_DIR_NAME, &dp->dp_free_dir));
+
+ /* create and open the free_bplist */
+ obj = bpobj_alloc(dp->dp_meta_objset, SPA_OLD_MAXBLOCKSIZE, tx);
+ VERIFY(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx) == 0);
+ VERIFY0(bpobj_open(&dp->dp_free_bpobj,
+ dp->dp_meta_objset, obj));
+ }
+
+ if (spa_version(spa) >= SPA_VERSION_DSL_SCRUB)
+ dsl_pool_create_origin(dp, tx);
+
+ /*
+ * Some features may be needed when creating the root dataset, so we
+ * create the feature objects here.
+ */
+ if (spa_version(spa) >= SPA_VERSION_FEATURES)
+ spa_feature_create_zap_objects(spa, tx);
+
+ if (dcp != NULL && dcp->cp_crypt != ZIO_CRYPT_OFF &&
+ dcp->cp_crypt != ZIO_CRYPT_INHERIT)
+ spa_feature_enable(spa, SPA_FEATURE_ENCRYPTION, tx);
+
+ /* create the root dataset */
+ obj = dsl_dataset_create_sync_dd(dp->dp_root_dir, NULL, dcp, 0, tx);
+
+ /* create the root objset */
+ VERIFY0(dsl_dataset_hold_obj_flags(dp, obj,
+ DS_HOLD_FLAG_DECRYPT, FTAG, &ds));
+ rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
+ os = dmu_objset_create_impl(dp->dp_spa, ds,
+ dsl_dataset_get_blkptr(ds), DMU_OST_ZFS, tx);
+ rrw_exit(&ds->ds_bp_rwlock, FTAG);
+#ifdef _KERNEL
+ zfs_create_fs(os, kcred, zplprops, tx);
+#endif
+ dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG);
+
+ dmu_tx_commit(tx);
+
+ rrw_exit(&dp->dp_config_rwlock, FTAG);
+
+ return (dp);
+}
+
+/*
+ * Account for the meta-objset space in its placeholder dsl_dir.
+ */
+void
+dsl_pool_mos_diduse_space(dsl_pool_t *dp,
+ int64_t used, int64_t comp, int64_t uncomp)
+{
+ ASSERT3U(comp, ==, uncomp); /* it's all metadata */
+ mutex_enter(&dp->dp_lock);
+ dp->dp_mos_used_delta += used;
+ dp->dp_mos_compressed_delta += comp;
+ dp->dp_mos_uncompressed_delta += uncomp;
+ mutex_exit(&dp->dp_lock);
+}
+
+static void
+dsl_pool_sync_mos(dsl_pool_t *dp, dmu_tx_t *tx)
+{
+ zio_t *zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
+ dmu_objset_sync(dp->dp_meta_objset, zio, tx);
+ VERIFY0(zio_wait(zio));
+ dmu_objset_sync_done(dp->dp_meta_objset, tx);
+ taskq_wait(dp->dp_sync_taskq);
+ multilist_destroy(dp->dp_meta_objset->os_synced_dnodes);
+ dp->dp_meta_objset->os_synced_dnodes = NULL;
+
+ dprintf_bp(&dp->dp_meta_rootbp, "meta objset rootbp is %s", "");
+ spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp);
+}
+
+static void
+dsl_pool_dirty_delta(dsl_pool_t *dp, int64_t delta)
+{
+ ASSERT(MUTEX_HELD(&dp->dp_lock));
+
+ if (delta < 0)
+ ASSERT3U(-delta, <=, dp->dp_dirty_total);
+
+ dp->dp_dirty_total += delta;
+
+ /*
+ * Note: we signal even when increasing dp_dirty_total.
+ * This ensures forward progress -- each thread wakes the next waiter.
+ */
+ if (dp->dp_dirty_total < zfs_dirty_data_max)
+ cv_signal(&dp->dp_spaceavail_cv);
+}
+
+#ifdef ZFS_DEBUG
+static boolean_t
+dsl_early_sync_task_verify(dsl_pool_t *dp, uint64_t txg)
+{
+ spa_t *spa = dp->dp_spa;
+ vdev_t *rvd = spa->spa_root_vdev;
+
+ for (uint64_t c = 0; c < rvd->vdev_children; c++) {
+ vdev_t *vd = rvd->vdev_child[c];
+ txg_list_t *tl = &vd->vdev_ms_list;
+ metaslab_t *ms;
+
+ for (ms = txg_list_head(tl, TXG_CLEAN(txg)); ms;
+ ms = txg_list_next(tl, ms, TXG_CLEAN(txg))) {
+ VERIFY(range_tree_is_empty(ms->ms_freeing));
+ VERIFY(range_tree_is_empty(ms->ms_checkpointing));
+ }
+ }
+
+ return (B_TRUE);
+}
+#endif
+
+void
+dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
+{
+ zio_t *zio;
+ dmu_tx_t *tx;
+ dsl_dir_t *dd;
+ dsl_dataset_t *ds;
+ objset_t *mos = dp->dp_meta_objset;
+ list_t synced_datasets;
+
+ list_create(&synced_datasets, sizeof (dsl_dataset_t),
+ offsetof(dsl_dataset_t, ds_synced_link));
+
+ tx = dmu_tx_create_assigned(dp, txg);
+
+ /*
+ * Run all early sync tasks before writing out any dirty blocks.
+ * For more info on early sync tasks see block comment in
+ * dsl_early_sync_task().
+ */
+ if (!txg_list_empty(&dp->dp_early_sync_tasks, txg)) {
+ dsl_sync_task_t *dst;
+
+ ASSERT3U(spa_sync_pass(dp->dp_spa), ==, 1);
+ while ((dst =
+ txg_list_remove(&dp->dp_early_sync_tasks, txg)) != NULL) {
+ ASSERT(dsl_early_sync_task_verify(dp, txg));
+ dsl_sync_task_sync(dst, tx);
+ }
+ ASSERT(dsl_early_sync_task_verify(dp, txg));
+ }
+
+ /*
+ * Write out all dirty blocks of dirty datasets.
+ */
+ zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
+ while ((ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) != NULL) {
+ /*
+ * We must not sync any non-MOS datasets twice, because
+ * we may have taken a snapshot of them. However, we
+ * may sync newly-created datasets on pass 2.
+ */
+ ASSERT(!list_link_active(&ds->ds_synced_link));
+ list_insert_tail(&synced_datasets, ds);
+ dsl_dataset_sync(ds, zio, tx);
+ }
+ VERIFY0(zio_wait(zio));
+
+ /*
+ * Update the long range free counter after
+ * we're done syncing user data
+ */
+ mutex_enter(&dp->dp_lock);
+ ASSERT(spa_sync_pass(dp->dp_spa) == 1 ||
+ dp->dp_long_free_dirty_pertxg[txg & TXG_MASK] == 0);
+ dp->dp_long_free_dirty_pertxg[txg & TXG_MASK] = 0;
+ mutex_exit(&dp->dp_lock);
+
+ /*
+ * After the data blocks have been written (ensured by the zio_wait()
+ * above), update the user/group/project space accounting. This happens
+ * in tasks dispatched to dp_sync_taskq, so wait for them before
+ * continuing.
+ */
+ for (ds = list_head(&synced_datasets); ds != NULL;
+ ds = list_next(&synced_datasets, ds)) {
+ dmu_objset_sync_done(ds->ds_objset, tx);
+ }
+ taskq_wait(dp->dp_sync_taskq);
+
+ /*
+ * Sync the datasets again to push out the changes due to
+ * userspace updates. This must be done before we process the
+ * sync tasks, so that any snapshots will have the correct
+ * user accounting information (and we won't get confused
+ * about which blocks are part of the snapshot).
+ */
+ zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
+ while ((ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) != NULL) {
+ objset_t *os = ds->ds_objset;
+
+ ASSERT(list_link_active(&ds->ds_synced_link));
+ dmu_buf_rele(ds->ds_dbuf, ds);
+ dsl_dataset_sync(ds, zio, tx);
+
+ /*
+ * Release any key mappings created by calls to
+ * dsl_dataset_dirty() from the userquota accounting
+ * code paths.
+ */
+ if (os->os_encrypted && !os->os_raw_receive &&
+ !os->os_next_write_raw[txg & TXG_MASK]) {
+ ASSERT3P(ds->ds_key_mapping, !=, NULL);
+ key_mapping_rele(dp->dp_spa, ds->ds_key_mapping, ds);
+ }
+ }
+ VERIFY0(zio_wait(zio));
+
+ /*
+ * Now that the datasets have been completely synced, we can
+ * clean up our in-memory structures accumulated while syncing:
+ *
+ * - move dead blocks from the pending deadlist and livelists
+ * to the on-disk versions
+ * - release hold from dsl_dataset_dirty()
+ * - release key mapping hold from dsl_dataset_dirty()
+ */
+ while ((ds = list_remove_head(&synced_datasets)) != NULL) {
+ objset_t *os = ds->ds_objset;
+
+ if (os->os_encrypted && !os->os_raw_receive &&
+ !os->os_next_write_raw[txg & TXG_MASK]) {
+ ASSERT3P(ds->ds_key_mapping, !=, NULL);
+ key_mapping_rele(dp->dp_spa, ds->ds_key_mapping, ds);
+ }
+
+ dsl_dataset_sync_done(ds, tx);
+ }
+
+ while ((dd = txg_list_remove(&dp->dp_dirty_dirs, txg)) != NULL) {
+ dsl_dir_sync(dd, tx);
+ }
+
+ /*
+ * The MOS's space is accounted for in the pool/$MOS
+ * (dp_mos_dir). We can't modify the mos while we're syncing
+ * it, so we remember the deltas and apply them here.
+ */
+ if (dp->dp_mos_used_delta != 0 || dp->dp_mos_compressed_delta != 0 ||
+ dp->dp_mos_uncompressed_delta != 0) {
+ dsl_dir_diduse_space(dp->dp_mos_dir, DD_USED_HEAD,
+ dp->dp_mos_used_delta,
+ dp->dp_mos_compressed_delta,
+ dp->dp_mos_uncompressed_delta, tx);
+ dp->dp_mos_used_delta = 0;
+ dp->dp_mos_compressed_delta = 0;
+ dp->dp_mos_uncompressed_delta = 0;
+ }
+
+ if (dmu_objset_is_dirty(mos, txg)) {
+ dsl_pool_sync_mos(dp, tx);
+ }
+
+ /*
+ * We have written all of the accounted dirty data, so our
+ * dp_space_towrite should now be zero. However, some seldom-used
+ * code paths do not adhere to this (e.g. dbuf_undirty()). Shore up
+ * the accounting of any dirtied space now.
+ *
+ * Note that, besides any dirty data from datasets, the amount of
+ * dirty data in the MOS is also accounted by the pool. Therefore,
+ * we want to do this cleanup after dsl_pool_sync_mos() so we don't
+ * attempt to update the accounting for the same dirty data twice.
+ * (i.e. at this point we only update the accounting for the space
+ * that we know that we "leaked").
+ */
+ dsl_pool_undirty_space(dp, dp->dp_dirty_pertxg[txg & TXG_MASK], txg);
+
+ /*
+ * If we modify a dataset in the same txg that we want to destroy it,
+ * its dsl_dir's dd_dbuf will be dirty, and thus have a hold on it.
+ * dsl_dir_destroy_check() will fail if there are unexpected holds.
+ * Therefore, we want to sync the MOS (thus syncing the dd_dbuf
+ * and clearing the hold on it) before we process the sync_tasks.
+ * The MOS data dirtied by the sync_tasks will be synced on the next
+ * pass.
+ */
+ if (!txg_list_empty(&dp->dp_sync_tasks, txg)) {
+ dsl_sync_task_t *dst;
+ /*
+ * No more sync tasks should have been added while we
+ * were syncing.
+ */
+ ASSERT3U(spa_sync_pass(dp->dp_spa), ==, 1);
+ while ((dst = txg_list_remove(&dp->dp_sync_tasks, txg)) != NULL)
+ dsl_sync_task_sync(dst, tx);
+ }
+
+ dmu_tx_commit(tx);
+
+ DTRACE_PROBE2(dsl_pool_sync__done, dsl_pool_t *dp, dp, uint64_t, txg);
+}
+
+void
+dsl_pool_sync_done(dsl_pool_t *dp, uint64_t txg)
+{
+ zilog_t *zilog;
+
+ while ((zilog = txg_list_head(&dp->dp_dirty_zilogs, txg))) {
+ dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os);
+ /*
+ * We don't remove the zilog from the dp_dirty_zilogs
+ * list until after we've cleaned it. This ensures that
+ * callers of zilog_is_dirty() receive an accurate
+ * answer when they are racing with the spa sync thread.
+ */
+ zil_clean(zilog, txg);
+ (void) txg_list_remove_this(&dp->dp_dirty_zilogs, zilog, txg);
+ ASSERT(!dmu_objset_is_dirty(zilog->zl_os, txg));
+ dmu_buf_rele(ds->ds_dbuf, zilog);
+ }
+ ASSERT(!dmu_objset_is_dirty(dp->dp_meta_objset, txg));
+}
+
+/*
+ * TRUE if the current thread is the tx_sync_thread or if we
+ * are being called from SPA context during pool initialization.
+ */
+int
+dsl_pool_sync_context(dsl_pool_t *dp)
+{
+ return (curthread == dp->dp_tx.tx_sync_thread ||
+ spa_is_initializing(dp->dp_spa) ||
+ taskq_member(dp->dp_sync_taskq, curthread));
+}
+
+/*
+ * This function returns the amount of allocatable space in the pool
+ * minus whatever space is currently reserved by ZFS for specific
+ * purposes. Specifically:
+ *
+ * 1] Any reserved SLOP space
+ * 2] Any space used by the checkpoint
+ * 3] Any space used for deferred frees
+ *
+ * The latter 2 are especially important because they are needed to
+ * rectify the SPA's and DMU's different understanding of how much space
+ * is used. Now the DMU is aware of that extra space tracked by the SPA
+ * without having to maintain a separate special dir (e.g. similar to
+ * $MOS, $FREEING, and $LEAKED).
+ *
+ * Note: By deferred frees here, we mean the frees that were deferred
+ * in spa_sync() after sync pass 1 (spa_deferred_bpobj), and not the
+ * segments placed in ms_defer trees during metaslab_sync_done().
+ */
+uint64_t
+dsl_pool_adjustedsize(dsl_pool_t *dp, zfs_space_check_t slop_policy)
+{
+ spa_t *spa = dp->dp_spa;
+ uint64_t space, resv, adjustedsize;
+ uint64_t spa_deferred_frees =
+ spa->spa_deferred_bpobj.bpo_phys->bpo_bytes;
+
+ space = spa_get_dspace(spa)
+ - spa_get_checkpoint_space(spa) - spa_deferred_frees;
+ resv = spa_get_slop_space(spa);
+
+ switch (slop_policy) {
+ case ZFS_SPACE_CHECK_NORMAL:
+ break;
+ case ZFS_SPACE_CHECK_RESERVED:
+ resv >>= 1;
+ break;
+ case ZFS_SPACE_CHECK_EXTRA_RESERVED:
+ resv >>= 2;
+ break;
+ case ZFS_SPACE_CHECK_NONE:
+ resv = 0;
+ break;
+ default:
+ panic("invalid slop policy value: %d", slop_policy);
+ break;
+ }
+ adjustedsize = (space >= resv) ? (space - resv) : 0;
+
+ return (adjustedsize);
+}
+
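+/*
+ * Worked example (illustrative only): if the usable space computed above is
+ * 100 GiB and spa_get_slop_space() returns 3.2 GiB, then
+ * ZFS_SPACE_CHECK_NORMAL reports 96.8 GiB, ZFS_SPACE_CHECK_RESERVED holds
+ * back only half the slop (98.4 GiB), ZFS_SPACE_CHECK_EXTRA_RESERVED holds
+ * back a quarter (99.2 GiB), and ZFS_SPACE_CHECK_NONE reports the full
+ * 100 GiB.
+ */
+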
+uint64_t
+dsl_pool_unreserved_space(dsl_pool_t *dp, zfs_space_check_t slop_policy)
+{
+ uint64_t poolsize = dsl_pool_adjustedsize(dp, slop_policy);
+ uint64_t deferred =
+ metaslab_class_get_deferred(spa_normal_class(dp->dp_spa));
+ uint64_t quota = (poolsize >= deferred) ? (poolsize - deferred) : 0;
+ return (quota);
+}
+
+boolean_t
+dsl_pool_need_dirty_delay(dsl_pool_t *dp)
+{
+ uint64_t delay_min_bytes =
+ zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100;
+ uint64_t dirty_min_bytes =
+ zfs_dirty_data_max * zfs_dirty_data_sync_percent / 100;
+ uint64_t dirty;
+
+ mutex_enter(&dp->dp_lock);
+ dirty = dp->dp_dirty_total;
+ mutex_exit(&dp->dp_lock);
+ if (dirty > dirty_min_bytes)
+ txg_kick(dp);
+ return (dirty > delay_min_bytes);
+}
+
+void
+dsl_pool_dirty_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx)
+{
+ if (space > 0) {
+ mutex_enter(&dp->dp_lock);
+ dp->dp_dirty_pertxg[tx->tx_txg & TXG_MASK] += space;
+ dsl_pool_dirty_delta(dp, space);
+ mutex_exit(&dp->dp_lock);
+ }
+}
+
+void
+dsl_pool_undirty_space(dsl_pool_t *dp, int64_t space, uint64_t txg)
+{
+ ASSERT3S(space, >=, 0);
+ if (space == 0)
+ return;
+
+ mutex_enter(&dp->dp_lock);
+ if (dp->dp_dirty_pertxg[txg & TXG_MASK] < space) {
+ /* XXX writing something we didn't dirty? */
+ space = dp->dp_dirty_pertxg[txg & TXG_MASK];
+ }
+ ASSERT3U(dp->dp_dirty_pertxg[txg & TXG_MASK], >=, space);
+ dp->dp_dirty_pertxg[txg & TXG_MASK] -= space;
+ ASSERT3U(dp->dp_dirty_total, >=, space);
+ dsl_pool_dirty_delta(dp, -space);
+ mutex_exit(&dp->dp_lock);
+}
+
+/* ARGSUSED */
+static int
+upgrade_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg)
+{
+ dmu_tx_t *tx = arg;
+ dsl_dataset_t *ds, *prev = NULL;
+ int err;
+
+ err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds);
+ if (err)
+ return (err);
+
+ while (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
+ err = dsl_dataset_hold_obj(dp,
+ dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev);
+ if (err) {
+ dsl_dataset_rele(ds, FTAG);
+ return (err);
+ }
+
+ if (dsl_dataset_phys(prev)->ds_next_snap_obj != ds->ds_object)
+ break;
+ dsl_dataset_rele(ds, FTAG);
+ ds = prev;
+ prev = NULL;
+ }
+
+ if (prev == NULL) {
+ prev = dp->dp_origin_snap;
+
+ /*
+ * The $ORIGIN can't have any data, or the accounting
+ * will be wrong.
+ */
+ rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
+ ASSERT0(dsl_dataset_phys(prev)->ds_bp.blk_birth);
+ rrw_exit(&ds->ds_bp_rwlock, FTAG);
+
+ /* The origin doesn't get attached to itself */
+ if (ds->ds_object == prev->ds_object) {
+ dsl_dataset_rele(ds, FTAG);
+ return (0);
+ }
+
+ dmu_buf_will_dirty(ds->ds_dbuf, tx);
+ dsl_dataset_phys(ds)->ds_prev_snap_obj = prev->ds_object;
+ dsl_dataset_phys(ds)->ds_prev_snap_txg =
+ dsl_dataset_phys(prev)->ds_creation_txg;
+
+ dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx);
+ dsl_dir_phys(ds->ds_dir)->dd_origin_obj = prev->ds_object;
+
+ dmu_buf_will_dirty(prev->ds_dbuf, tx);
+ dsl_dataset_phys(prev)->ds_num_children++;
+
+ if (dsl_dataset_phys(ds)->ds_next_snap_obj == 0) {
+ ASSERT(ds->ds_prev == NULL);
+ VERIFY0(dsl_dataset_hold_obj(dp,
+ dsl_dataset_phys(ds)->ds_prev_snap_obj,
+ ds, &ds->ds_prev));
+ }
+ }
+
+ ASSERT3U(dsl_dir_phys(ds->ds_dir)->dd_origin_obj, ==, prev->ds_object);
+ ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_obj, ==, prev->ds_object);
+
+ if (dsl_dataset_phys(prev)->ds_next_clones_obj == 0) {
+ dmu_buf_will_dirty(prev->ds_dbuf, tx);
+ dsl_dataset_phys(prev)->ds_next_clones_obj =
+ zap_create(dp->dp_meta_objset,
+ DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx);
+ }
+ VERIFY0(zap_add_int(dp->dp_meta_objset,
+ dsl_dataset_phys(prev)->ds_next_clones_obj, ds->ds_object, tx));
+
+ dsl_dataset_rele(ds, FTAG);
+ if (prev != dp->dp_origin_snap)
+ dsl_dataset_rele(prev, FTAG);
+ return (0);
+}
+
+void
+dsl_pool_upgrade_clones(dsl_pool_t *dp, dmu_tx_t *tx)
+{
+ ASSERT(dmu_tx_is_syncing(tx));
+ ASSERT(dp->dp_origin_snap != NULL);
+
+ VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj, upgrade_clones_cb,
+ tx, DS_FIND_CHILDREN | DS_FIND_SERIALIZE));
+}
+
+/* ARGSUSED */
+static int
+upgrade_dir_clones_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
+{
+ dmu_tx_t *tx = arg;
+ objset_t *mos = dp->dp_meta_objset;
+
+ if (dsl_dir_phys(ds->ds_dir)->dd_origin_obj != 0) {
+ dsl_dataset_t *origin;
+
+ VERIFY0(dsl_dataset_hold_obj(dp,
+ dsl_dir_phys(ds->ds_dir)->dd_origin_obj, FTAG, &origin));
+
+ if (dsl_dir_phys(origin->ds_dir)->dd_clones == 0) {
+ dmu_buf_will_dirty(origin->ds_dir->dd_dbuf, tx);
+ dsl_dir_phys(origin->ds_dir)->dd_clones =
+ zap_create(mos, DMU_OT_DSL_CLONES, DMU_OT_NONE,
+ 0, tx);
+ }
+
+ VERIFY0(zap_add_int(dp->dp_meta_objset,
+ dsl_dir_phys(origin->ds_dir)->dd_clones,
+ ds->ds_object, tx));
+
+ dsl_dataset_rele(origin, FTAG);
+ }
+ return (0);
+}
+
+void
+dsl_pool_upgrade_dir_clones(dsl_pool_t *dp, dmu_tx_t *tx)
+{
+ uint64_t obj;
+
+ ASSERT(dmu_tx_is_syncing(tx));
+
+ (void) dsl_dir_create_sync(dp, dp->dp_root_dir, FREE_DIR_NAME, tx);
+ VERIFY0(dsl_pool_open_special_dir(dp,
+ FREE_DIR_NAME, &dp->dp_free_dir));
+
+ /*
+ * We can't use bpobj_alloc(), because spa_version() still
+ * returns the old version, and we need a new-version bpobj with
+ * subobj support. So call dmu_object_alloc() directly.
+ */
+ obj = dmu_object_alloc(dp->dp_meta_objset, DMU_OT_BPOBJ,
+ SPA_OLD_MAXBLOCKSIZE, DMU_OT_BPOBJ_HDR, sizeof (bpobj_phys_t), tx);
+ VERIFY0(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx));
+ VERIFY0(bpobj_open(&dp->dp_free_bpobj, dp->dp_meta_objset, obj));
+
+ VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
+ upgrade_dir_clones_cb, tx, DS_FIND_CHILDREN | DS_FIND_SERIALIZE));
+}
+
+void
+dsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx)
+{
+ uint64_t dsobj;
+ dsl_dataset_t *ds;
+
+ ASSERT(dmu_tx_is_syncing(tx));
+ ASSERT(dp->dp_origin_snap == NULL);
+ ASSERT(rrw_held(&dp->dp_config_rwlock, RW_WRITER));
+
+ /* create the origin dir, ds, & snap-ds */
+ dsobj = dsl_dataset_create_sync(dp->dp_root_dir, ORIGIN_DIR_NAME,
+ NULL, 0, kcred, NULL, tx);
+ VERIFY0(dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
+ dsl_dataset_snapshot_sync_impl(ds, ORIGIN_DIR_NAME, tx);
+ VERIFY0(dsl_dataset_hold_obj(dp, dsl_dataset_phys(ds)->ds_prev_snap_obj,
+ dp, &dp->dp_origin_snap));
+ dsl_dataset_rele(ds, FTAG);
+}
+
+taskq_t *
+dsl_pool_zrele_taskq(dsl_pool_t *dp)
+{
+ return (dp->dp_zrele_taskq);
+}
+
+taskq_t *
+dsl_pool_unlinked_drain_taskq(dsl_pool_t *dp)
+{
+ return (dp->dp_unlinked_drain_taskq);
+}
+
+/*
+ * Walk through the pool-wide zap object of temporary snapshot user holds
+ * and release them.
+ */
+void
+dsl_pool_clean_tmp_userrefs(dsl_pool_t *dp)
+{
+ zap_attribute_t za;
+ zap_cursor_t zc;
+ objset_t *mos = dp->dp_meta_objset;
+ uint64_t zapobj = dp->dp_tmp_userrefs_obj;
+ nvlist_t *holds;
+
+ if (zapobj == 0)
+ return;
+ ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS);
+
+ holds = fnvlist_alloc();
+
+ for (zap_cursor_init(&zc, mos, zapobj);
+ zap_cursor_retrieve(&zc, &za) == 0;
+ zap_cursor_advance(&zc)) {
+ char *htag;
+ nvlist_t *tags;
+
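+ /*
+ * Hold names are built by dsl_pool_user_hold_rele_impl() as
+ * "<dsobj in hex>-<tag>"; split at the first '-' to separate the
+ * dataset object number from the tag.
+ */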
+ htag = strchr(za.za_name, '-');
+ *htag = '\0';
+ ++htag;
+ if (nvlist_lookup_nvlist(holds, za.za_name, &tags) != 0) {
+ tags = fnvlist_alloc();
+ fnvlist_add_boolean(tags, htag);
+ fnvlist_add_nvlist(holds, za.za_name, tags);
+ fnvlist_free(tags);
+ } else {
+ fnvlist_add_boolean(tags, htag);
+ }
+ }
+ dsl_dataset_user_release_tmp(dp, holds);
+ fnvlist_free(holds);
+ zap_cursor_fini(&zc);
+}
+
+/*
+ * Create the pool-wide zap object for storing temporary snapshot holds.
+ */
+static void
+dsl_pool_user_hold_create_obj(dsl_pool_t *dp, dmu_tx_t *tx)
+{
+ objset_t *mos = dp->dp_meta_objset;
+
+ ASSERT(dp->dp_tmp_userrefs_obj == 0);
+ ASSERT(dmu_tx_is_syncing(tx));
+
+ dp->dp_tmp_userrefs_obj = zap_create_link(mos, DMU_OT_USERREFS,
+ DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_TMP_USERREFS, tx);
+}
+
+static int
+dsl_pool_user_hold_rele_impl(dsl_pool_t *dp, uint64_t dsobj,
+ const char *tag, uint64_t now, dmu_tx_t *tx, boolean_t holding)
+{
+ objset_t *mos = dp->dp_meta_objset;
+ uint64_t zapobj = dp->dp_tmp_userrefs_obj;
+ char *name;
+ int error;
+
+ ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS);
+ ASSERT(dmu_tx_is_syncing(tx));
+
+ /*
+ * If the pool was created prior to SPA_VERSION_USERREFS, the
+ * zap object for temporary holds might not exist yet.
+ */
+ if (zapobj == 0) {
+ if (holding) {
+ dsl_pool_user_hold_create_obj(dp, tx);
+ zapobj = dp->dp_tmp_userrefs_obj;
+ } else {
+ return (SET_ERROR(ENOENT));
+ }
+ }
+
+ name = kmem_asprintf("%llx-%s", (u_longlong_t)dsobj, tag);
+ if (holding)
+ error = zap_add(mos, zapobj, name, 8, 1, &now, tx);
+ else
+ error = zap_remove(mos, zapobj, name, tx);
+ kmem_strfree(name);
+
+ return (error);
+}
+
+/*
+ * Add a temporary hold for the given dataset object and tag.
+ */
+int
+dsl_pool_user_hold(dsl_pool_t *dp, uint64_t dsobj, const char *tag,
+ uint64_t now, dmu_tx_t *tx)
+{
+ return (dsl_pool_user_hold_rele_impl(dp, dsobj, tag, now, tx, B_TRUE));
+}
+
+/*
+ * Release a temporary hold for the given dataset object and tag.
+ */
+int
+dsl_pool_user_release(dsl_pool_t *dp, uint64_t dsobj, const char *tag,
+ dmu_tx_t *tx)
+{
+ return (dsl_pool_user_hold_rele_impl(dp, dsobj, tag, 0,
+ tx, B_FALSE));
+}
+
+/*
+ * DSL Pool Configuration Lock
+ *
+ * The dp_config_rwlock protects against changes to DSL state (e.g. dataset
+ * creation / destruction / rename / property setting). It must be held for
+ * read to hold a dataset or dsl_dir. I.e. you must call
+ * dsl_pool_config_enter() or dsl_pool_hold() before calling
+ * dsl_{dataset,dir}_hold{_obj}. In most circumstances, the dp_config_rwlock
+ * must be held continuously until all datasets and dsl_dirs are released.
+ *
+ * The only exception to this rule is that if a "long hold" is placed on
+ * a dataset, then the dp_config_rwlock may be dropped while the dataset
+ * is still held. The long hold will prevent the dataset from being
+ * destroyed -- the destroy will fail with EBUSY. A long hold can be
+ * obtained by calling dsl_dataset_long_hold(), or by "owning" a dataset
+ * (by calling dsl_{dataset,objset}_{try}own{_obj}).
+ *
+ * Legitimate long-holders (including owners) should be long-running, cancelable
+ * tasks that should cause "zfs destroy" to fail. This includes DMU
+ * consumers (i.e. a ZPL filesystem being mounted or ZVOL being open),
+ * "zfs send", and "zfs diff". There are several other long-holders whose
+ * uses are suboptimal (e.g. "zfs promote", and zil_suspend()).
+ *
+ * The usual formula for long-holding would be:
+ * dsl_pool_hold()
+ * dsl_dataset_hold()
+ * ... perform checks ...
+ * dsl_dataset_long_hold()
+ * dsl_pool_rele()
+ * ... perform long-running task ...
+ * dsl_dataset_long_rele()
+ * dsl_dataset_rele()
+ *
+ * Note that when the long hold is released, the dataset is still held but
+ * the pool is not held. The dataset may change arbitrarily during this time
+ * (e.g. it could be destroyed). Therefore you shouldn't do anything to the
+ * dataset except release it.
+ *
+ * Operations generally fall somewhere into the following taxonomy:
+ *
+ * Read-Only Modifying
+ *
+ * Dataset Layer / MOS zfs get zfs destroy
+ *
+ * Individual Dataset read() write()
+ *
+ *
+ * Dataset Layer Operations
+ *
+ * Modifying operations should generally use dsl_sync_task(). The synctask
+ * infrastructure enforces proper locking strategy with respect to the
+ * dp_config_rwlock. See the comment above dsl_sync_task() for details.
+ *
+ * Read-only operations will manually hold the pool, then the dataset, obtain
+ * information from the dataset, then release the pool and dataset.
+ * dmu_objset_{hold,rele}() are convenience routines that also do the pool
+ * hold/rele.
+ *
+ *
+ * Operations On Individual Datasets
+ *
+ * Objects _within_ an objset should only be modified by the current 'owner'
+ * of the objset to prevent incorrect concurrent modification. Thus, use
+ * {dmu_objset,dsl_dataset}_own to mark some entity as the current owner,
+ * and fail with EBUSY if there is already an owner. The owner can then
+ * implement its own locking strategy, independent of the dataset layer's
+ * locking infrastructure.
+ * (E.g., the ZPL has its own set of locks to control concurrency. A regular
+ * vnop will not reach into the dataset layer).
+ *
+ * Ideally, objects would also only be read by the objset's owner, so that we
+ * don't observe state mid-modification.
+ * (E.g. the ZPL is creating a new object and linking it into a directory; if
+ * you don't coordinate with the ZPL to hold ZPL-level locks, you could see an
+ * intermediate state. The ioctl level violates this but in pretty benign
+ * ways, e.g. reading the zpl props object.)
+ */
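+
+/*
+ * Illustrative sketch of the long-hold formula above, written out with the
+ * interfaces below; the dataset name and do_long_running_work() are
+ * placeholders.
+ *
+ *    error = dsl_pool_hold("tank/fs", FTAG, &dp);
+ *    if (error != 0)
+ *            return (error);
+ *    error = dsl_dataset_hold(dp, "tank/fs", FTAG, &ds);
+ *    if (error != 0) {
+ *            dsl_pool_rele(dp, FTAG);
+ *            return (error);
+ *    }
+ *    ... perform checks ...
+ *    dsl_dataset_long_hold(ds, FTAG);
+ *    dsl_pool_rele(dp, FTAG);
+ *    do_long_running_work(ds);
+ *    dsl_dataset_long_rele(ds, FTAG);
+ *    dsl_dataset_rele(ds, FTAG);
+ */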
+
+int
+dsl_pool_hold(const char *name, void *tag, dsl_pool_t **dp)
+{
+ spa_t *spa;
+ int error;
+
+ error = spa_open(name, &spa, tag);
+ if (error == 0) {
+ *dp = spa_get_dsl(spa);
+ dsl_pool_config_enter(*dp, tag);
+ }
+ return (error);
+}
+
+void
+dsl_pool_rele(dsl_pool_t *dp, void *tag)
+{
+ dsl_pool_config_exit(dp, tag);
+ spa_close(dp->dp_spa, tag);
+}
+
+void
+dsl_pool_config_enter(dsl_pool_t *dp, void *tag)
+{
+ /*
+ * We use a "reentrant" reader-writer lock, but not reentrantly.
+ *
+ * The rrwlock can (with the track_all flag) track all reading threads,
+ * which is very useful for debugging which code path failed to release
+ * the lock, and for verifying that the *current* thread does hold
+ * the lock.
+ *
+ * (Unlike a rwlock, which knows that N threads hold it for
+ * read, but not *which* threads, so rw_held(RW_READER) returns TRUE
+ * if any thread holds it for read, even if this thread doesn't).
+ */
+ ASSERT(!rrw_held(&dp->dp_config_rwlock, RW_READER));
+ rrw_enter(&dp->dp_config_rwlock, RW_READER, tag);
+}
+
+void
+dsl_pool_config_enter_prio(dsl_pool_t *dp, void *tag)
+{
+ ASSERT(!rrw_held(&dp->dp_config_rwlock, RW_READER));
+ rrw_enter_read_prio(&dp->dp_config_rwlock, tag);
+}
+
+void
+dsl_pool_config_exit(dsl_pool_t *dp, void *tag)
+{
+ rrw_exit(&dp->dp_config_rwlock, tag);
+}
+
+boolean_t
+dsl_pool_config_held(dsl_pool_t *dp)
+{
+ return (RRW_LOCK_HELD(&dp->dp_config_rwlock));
+}
+
+boolean_t
+dsl_pool_config_held_writer(dsl_pool_t *dp)
+{
+ return (RRW_WRITE_HELD(&dp->dp_config_rwlock));
+}
+
+EXPORT_SYMBOL(dsl_pool_config_enter);
+EXPORT_SYMBOL(dsl_pool_config_exit);
+
+/* BEGIN CSTYLED */
+/* zfs_dirty_data_max_percent only applied at module load in arc_init(). */
+ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_max_percent, INT, ZMOD_RD,
+ "Max percent of RAM allowed to be dirty");
+
+/* zfs_dirty_data_max_max_percent only applied at module load in arc_init(). */
+ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_max_max_percent, INT, ZMOD_RD,
+ "zfs_dirty_data_max upper bound as % of RAM");
+
+ZFS_MODULE_PARAM(zfs, zfs_, delay_min_dirty_percent, INT, ZMOD_RW,
+ "Transaction delay threshold");
+
+ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_max, ULONG, ZMOD_RW,
+ "Determines the dirty space limit");
+
+/* zfs_dirty_data_max_max only applied at module load in arc_init(). */
+ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_max_max, ULONG, ZMOD_RD,
+ "zfs_dirty_data_max upper bound in bytes");
+
+ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_sync_percent, INT, ZMOD_RW,
+ "Dirty data txg sync threshold as a percentage of zfs_dirty_data_max");
+
+ZFS_MODULE_PARAM(zfs, zfs_, delay_scale, ULONG, ZMOD_RW,
+ "How quickly delay approaches infinity");
+
+ZFS_MODULE_PARAM(zfs, zfs_, sync_taskq_batch_pct, INT, ZMOD_RW,
+ "Max percent of CPUs that are used to sync dirty data");
+
+ZFS_MODULE_PARAM(zfs_zil, zfs_zil_, clean_taskq_nthr_pct, INT, ZMOD_RW,
+ "Max percent of CPUs that are used per dp_sync_taskq");
+
+ZFS_MODULE_PARAM(zfs_zil, zfs_zil_, clean_taskq_minalloc, INT, ZMOD_RW,
+ "Number of taskq entries that are pre-populated");
+
+ZFS_MODULE_PARAM(zfs_zil, zfs_zil_, clean_taskq_maxalloc, INT, ZMOD_RW,
+ "Max number of taskq entries that are cached");
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/zfs/dsl_prop.c b/sys/contrib/openzfs/module/zfs/dsl_prop.c
new file mode 100644
index 000000000000..f6ff9ae47192
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/dsl_prop.c
@@ -0,0 +1,1287 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2013 Martin Matuska. All rights reserved.
+ * Copyright 2019 Joyent, Inc.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/dmu.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_tx.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_prop.h>
+#include <sys/dsl_synctask.h>
+#include <sys/spa.h>
+#include <sys/zap.h>
+#include <sys/fs/zfs.h>
+
+#include "zfs_prop.h"
+
+#define ZPROP_INHERIT_SUFFIX "$inherit"
+#define ZPROP_RECVD_SUFFIX "$recvd"
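+
+/*
+ * Worked example: for a property such as "compression", up to three keys
+ * may exist in a dataset's or dsl_dir's props ZAP:
+ *
+ *    compression           local value set with "zfs set"
+ *    compression$inherit   marker left by an explicit "zfs inherit"
+ *    compression$recvd     value received via "zfs receive"
+ *
+ * The lookup code below checks the local value first, then (unless an
+ * explicit $inherit marker is present) the received value, then walks up
+ * the dsl_dir parents, and finally falls back to the property's default.
+ */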
+
+static int
+dodefault(zfs_prop_t prop, int intsz, int numints, void *buf)
+{
+ /*
+ * The setonce properties are read-only, BUT they still
+ * have a default value that can be used as the initial
+ * value.
+ */
+ if (prop == ZPROP_INVAL ||
+ (zfs_prop_readonly(prop) && !zfs_prop_setonce(prop)))
+ return (SET_ERROR(ENOENT));
+
+ if (zfs_prop_get_type(prop) == PROP_TYPE_STRING) {
+ if (intsz != 1)
+ return (SET_ERROR(EOVERFLOW));
+ (void) strncpy(buf, zfs_prop_default_string(prop),
+ numints);
+ } else {
+ if (intsz != 8 || numints < 1)
+ return (SET_ERROR(EOVERFLOW));
+
+ *(uint64_t *)buf = zfs_prop_default_numeric(prop);
+ }
+
+ return (0);
+}
+
+int
+dsl_prop_get_dd(dsl_dir_t *dd, const char *propname,
+ int intsz, int numints, void *buf, char *setpoint, boolean_t snapshot)
+{
+ int err;
+ dsl_dir_t *target = dd;
+ objset_t *mos = dd->dd_pool->dp_meta_objset;
+ zfs_prop_t prop;
+ boolean_t inheritable;
+ boolean_t inheriting = B_FALSE;
+ char *inheritstr;
+ char *recvdstr;
+
+ ASSERT(dsl_pool_config_held(dd->dd_pool));
+
+ if (setpoint)
+ setpoint[0] = '\0';
+
+ prop = zfs_name_to_prop(propname);
+ inheritable = (prop == ZPROP_INVAL || zfs_prop_inheritable(prop));
+ inheritstr = kmem_asprintf("%s%s", propname, ZPROP_INHERIT_SUFFIX);
+ recvdstr = kmem_asprintf("%s%s", propname, ZPROP_RECVD_SUFFIX);
+
+ /*
+ * Note: dd may become NULL, therefore we shouldn't dereference it
+ * after this loop.
+ */
+ for (; dd != NULL; dd = dd->dd_parent) {
+ if (dd != target || snapshot) {
+ if (!inheritable) {
+ err = SET_ERROR(ENOENT);
+ break;
+ }
+ inheriting = B_TRUE;
+ }
+
+ /* Check for a local value. */
+ err = zap_lookup(mos, dsl_dir_phys(dd)->dd_props_zapobj,
+ propname, intsz, numints, buf);
+ if (err != ENOENT) {
+ if (setpoint != NULL && err == 0)
+ dsl_dir_name(dd, setpoint);
+ break;
+ }
+
+ /*
+ * Skip the check for a received value if there is an explicit
+ * inheritance entry.
+ */
+ err = zap_contains(mos, dsl_dir_phys(dd)->dd_props_zapobj,
+ inheritstr);
+ if (err != 0 && err != ENOENT)
+ break;
+
+ if (err == ENOENT) {
+ /* Check for a received value. */
+ err = zap_lookup(mos, dsl_dir_phys(dd)->dd_props_zapobj,
+ recvdstr, intsz, numints, buf);
+ if (err != ENOENT) {
+ if (setpoint != NULL && err == 0) {
+ if (inheriting) {
+ dsl_dir_name(dd, setpoint);
+ } else {
+ (void) strlcpy(setpoint,
+ ZPROP_SOURCE_VAL_RECVD,
+ MAXNAMELEN);
+ }
+ }
+ break;
+ }
+ }
+
+ /*
+ * If we found an explicit inheritance entry, err is zero even
+ * though we haven't yet found the value, so reinitializing err
+ * at the end of the loop (instead of at the beginning) ensures
+ * that err has a valid post-loop value.
+ */
+ err = SET_ERROR(ENOENT);
+ }
+
+ if (err == ENOENT)
+ err = dodefault(prop, intsz, numints, buf);
+
+ kmem_strfree(inheritstr);
+ kmem_strfree(recvdstr);
+
+ return (err);
+}
+
+int
+dsl_prop_get_ds(dsl_dataset_t *ds, const char *propname,
+ int intsz, int numints, void *buf, char *setpoint)
+{
+ zfs_prop_t prop = zfs_name_to_prop(propname);
+ boolean_t inheritable;
+ uint64_t zapobj;
+
+ ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool));
+ inheritable = (prop == ZPROP_INVAL || zfs_prop_inheritable(prop));
+ zapobj = dsl_dataset_phys(ds)->ds_props_obj;
+
+ if (zapobj != 0) {
+ objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
+ int err;
+
+ ASSERT(ds->ds_is_snapshot);
+
+ /* Check for a local value. */
+ err = zap_lookup(mos, zapobj, propname, intsz, numints, buf);
+ if (err != ENOENT) {
+ if (setpoint != NULL && err == 0)
+ dsl_dataset_name(ds, setpoint);
+ return (err);
+ }
+
+ /*
+ * Skip the check for a received value if there is an explicit
+ * inheritance entry.
+ */
+ if (inheritable) {
+ char *inheritstr = kmem_asprintf("%s%s", propname,
+ ZPROP_INHERIT_SUFFIX);
+ err = zap_contains(mos, zapobj, inheritstr);
+ kmem_strfree(inheritstr);
+ if (err != 0 && err != ENOENT)
+ return (err);
+ }
+
+ if (err == ENOENT) {
+ /* Check for a received value. */
+ char *recvdstr = kmem_asprintf("%s%s", propname,
+ ZPROP_RECVD_SUFFIX);
+ err = zap_lookup(mos, zapobj, recvdstr,
+ intsz, numints, buf);
+ kmem_strfree(recvdstr);
+ if (err != ENOENT) {
+ if (setpoint != NULL && err == 0)
+ (void) strlcpy(setpoint,
+ ZPROP_SOURCE_VAL_RECVD,
+ MAXNAMELEN);
+ return (err);
+ }
+ }
+ }
+
+ return (dsl_prop_get_dd(ds->ds_dir, propname,
+ intsz, numints, buf, setpoint, ds->ds_is_snapshot));
+}
+
+static dsl_prop_record_t *
+dsl_prop_record_find(dsl_dir_t *dd, const char *propname)
+{
+ dsl_prop_record_t *pr = NULL;
+
+ ASSERT(MUTEX_HELD(&dd->dd_lock));
+
+ for (pr = list_head(&dd->dd_props);
+ pr != NULL; pr = list_next(&dd->dd_props, pr)) {
+ if (strcmp(pr->pr_propname, propname) == 0)
+ break;
+ }
+
+ return (pr);
+}
+
+static dsl_prop_record_t *
+dsl_prop_record_create(dsl_dir_t *dd, const char *propname)
+{
+ dsl_prop_record_t *pr;
+
+ ASSERT(MUTEX_HELD(&dd->dd_lock));
+
+ pr = kmem_alloc(sizeof (dsl_prop_record_t), KM_SLEEP);
+ pr->pr_propname = spa_strdup(propname);
+ list_create(&pr->pr_cbs, sizeof (dsl_prop_cb_record_t),
+ offsetof(dsl_prop_cb_record_t, cbr_pr_node));
+ list_insert_head(&dd->dd_props, pr);
+
+ return (pr);
+}
+
+void
+dsl_prop_init(dsl_dir_t *dd)
+{
+ list_create(&dd->dd_props, sizeof (dsl_prop_record_t),
+ offsetof(dsl_prop_record_t, pr_node));
+}
+
+void
+dsl_prop_fini(dsl_dir_t *dd)
+{
+ dsl_prop_record_t *pr;
+
+ while ((pr = list_remove_head(&dd->dd_props)) != NULL) {
+ list_destroy(&pr->pr_cbs);
+ spa_strfree((char *)pr->pr_propname);
+ kmem_free(pr, sizeof (dsl_prop_record_t));
+ }
+ list_destroy(&dd->dd_props);
+}
+
+/*
+ * Register interest in the named property. We'll call the callback
+ * once to notify it of the current property value, and again each time
+ * the property changes, until this callback is unregistered.
+ *
+ * Return 0 on success, errno if the prop is not an integer value.
+ */
+int
+dsl_prop_register(dsl_dataset_t *ds, const char *propname,
+ dsl_prop_changed_cb_t *callback, void *cbarg)
+{
+ dsl_dir_t *dd = ds->ds_dir;
+ uint64_t value;
+ dsl_prop_record_t *pr;
+ dsl_prop_cb_record_t *cbr;
+ int err;
+ dsl_pool_t *dp __maybe_unused = dd->dd_pool;
+
+ ASSERT(dsl_pool_config_held(dp));
+
+ err = dsl_prop_get_int_ds(ds, propname, &value);
+ if (err != 0)
+ return (err);
+
+ cbr = kmem_alloc(sizeof (dsl_prop_cb_record_t), KM_SLEEP);
+ cbr->cbr_ds = ds;
+ cbr->cbr_func = callback;
+ cbr->cbr_arg = cbarg;
+
+ mutex_enter(&dd->dd_lock);
+ pr = dsl_prop_record_find(dd, propname);
+ if (pr == NULL)
+ pr = dsl_prop_record_create(dd, propname);
+ cbr->cbr_pr = pr;
+ list_insert_head(&pr->pr_cbs, cbr);
+ list_insert_head(&ds->ds_prop_cbs, cbr);
+ mutex_exit(&dd->dd_lock);
+
+ cbr->cbr_func(cbr->cbr_arg, value);
+ return (0);
+}
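+
+/*
+ * A minimal usage sketch for dsl_prop_register(); my_state_t and
+ * my_blksz_changed_cb() are hypothetical placeholders.
+ *
+ *    static void
+ *    my_blksz_changed_cb(void *arg, uint64_t newval)
+ *    {
+ *            ((my_state_t *)arg)->cached_blksz = newval;
+ *    }
+ *
+ *    (with the pool configuration lock held for read)
+ *    error = dsl_prop_register(ds,
+ *        zfs_prop_to_name(ZFS_PROP_RECORDSIZE),
+ *        my_blksz_changed_cb, state);
+ *
+ * The callback fires once immediately with the current value and again on
+ * each change until dsl_prop_unregister() or dsl_prop_unregister_all().
+ */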
+
+int
+dsl_prop_get(const char *dsname, const char *propname,
+ int intsz, int numints, void *buf, char *setpoint)
+{
+ objset_t *os;
+ int error;
+
+ error = dmu_objset_hold(dsname, FTAG, &os);
+ if (error != 0)
+ return (error);
+
+ error = dsl_prop_get_ds(dmu_objset_ds(os), propname,
+ intsz, numints, buf, setpoint);
+
+ dmu_objset_rele(os, FTAG);
+ return (error);
+}
+
+/*
+ * Get the current property value. It may have changed by the time this
+ * function returns, so it is NOT safe to follow up with
+ * dsl_prop_register() and assume that the value has not changed in
+ * between.
+ *
+ * Return 0 on success, ENOENT if ddname is invalid.
+ */
+int
+dsl_prop_get_integer(const char *ddname, const char *propname,
+ uint64_t *valuep, char *setpoint)
+{
+ return (dsl_prop_get(ddname, propname, 8, 1, valuep, setpoint));
+}
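+
+/*
+ * A minimal usage sketch; the dataset name is a placeholder:
+ *
+ *    uint64_t quota;
+ *    char setpoint[ZFS_MAX_DATASET_NAME_LEN];
+ *    error = dsl_prop_get_integer("tank/home", "quota", &quota, setpoint);
+ *
+ * On success, setpoint names the dataset or dsl_dir the value came from,
+ * is empty for a default value, or is ZPROP_SOURCE_VAL_RECVD for a
+ * received value.
+ */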
+
+int
+dsl_prop_get_int_ds(dsl_dataset_t *ds, const char *propname,
+ uint64_t *valuep)
+{
+ return (dsl_prop_get_ds(ds, propname, 8, 1, valuep, NULL));
+}
+
+/*
+ * Predict the effective value of the given special property if it were set
+ * with the given value and source. This is not a general purpose function.
+ * It exists only to handle the special requirements of the quota and
+ * reservation properties. The fact that these properties are non-inheritable
+ * greatly simplifies the prediction logic.
+ *
+ * Returns 0 on success, a positive error code on failure, or -1 if called
+ * with a property not handled by this function.
+ */
+int
+dsl_prop_predict(dsl_dir_t *dd, const char *propname,
+ zprop_source_t source, uint64_t value, uint64_t *newvalp)
+{
+ zfs_prop_t prop = zfs_name_to_prop(propname);
+ objset_t *mos;
+ uint64_t zapobj;
+ uint64_t version;
+ char *recvdstr;
+ int err = 0;
+
+ switch (prop) {
+ case ZFS_PROP_QUOTA:
+ case ZFS_PROP_RESERVATION:
+ case ZFS_PROP_REFQUOTA:
+ case ZFS_PROP_REFRESERVATION:
+ break;
+ default:
+ return (-1);
+ }
+
+ mos = dd->dd_pool->dp_meta_objset;
+ zapobj = dsl_dir_phys(dd)->dd_props_zapobj;
+ recvdstr = kmem_asprintf("%s%s", propname, ZPROP_RECVD_SUFFIX);
+
+ version = spa_version(dd->dd_pool->dp_spa);
+ if (version < SPA_VERSION_RECVD_PROPS) {
+ if (source & ZPROP_SRC_NONE)
+ source = ZPROP_SRC_NONE;
+ else if (source & ZPROP_SRC_RECEIVED)
+ source = ZPROP_SRC_LOCAL;
+ }
+
+ switch ((int)source) {
+ case ZPROP_SRC_NONE:
+ /* Revert to the received value, if any. */
+ err = zap_lookup(mos, zapobj, recvdstr, 8, 1, newvalp);
+ if (err == ENOENT)
+ *newvalp = 0;
+ break;
+ case ZPROP_SRC_LOCAL:
+ *newvalp = value;
+ break;
+ case ZPROP_SRC_RECEIVED:
+ /*
+ * If there's no local setting, then the new received value will
+ * be the effective value.
+ */
+ err = zap_lookup(mos, zapobj, propname, 8, 1, newvalp);
+ if (err == ENOENT)
+ *newvalp = value;
+ break;
+ case (ZPROP_SRC_NONE | ZPROP_SRC_RECEIVED):
+ /*
+ * We're clearing the received value, so the local setting (if
+ * it exists) remains the effective value.
+ */
+ err = zap_lookup(mos, zapobj, propname, 8, 1, newvalp);
+ if (err == ENOENT)
+ *newvalp = 0;
+ break;
+ default:
+ panic("unexpected property source: %d", source);
+ }
+
+ kmem_strfree(recvdstr);
+
+ if (err == ENOENT)
+ return (0);
+
+ return (err);
+}
+
+/*
+ * Unregister this callback. Return 0 on success, ENOENT if ddname is
+ * invalid, or ENOMSG if no matching callback registered.
+ *
+ * NOTE: This function is no longer used internally but has been preserved
+ * to prevent breaking external consumers (Lustre, etc).
+ */
+int
+dsl_prop_unregister(dsl_dataset_t *ds, const char *propname,
+ dsl_prop_changed_cb_t *callback, void *cbarg)
+{
+ dsl_dir_t *dd = ds->ds_dir;
+ dsl_prop_cb_record_t *cbr;
+
+ mutex_enter(&dd->dd_lock);
+ for (cbr = list_head(&ds->ds_prop_cbs);
+ cbr; cbr = list_next(&ds->ds_prop_cbs, cbr)) {
+ if (cbr->cbr_ds == ds &&
+ cbr->cbr_func == callback &&
+ cbr->cbr_arg == cbarg &&
+ strcmp(cbr->cbr_pr->pr_propname, propname) == 0)
+ break;
+ }
+
+ if (cbr == NULL) {
+ mutex_exit(&dd->dd_lock);
+ return (SET_ERROR(ENOMSG));
+ }
+
+ list_remove(&ds->ds_prop_cbs, cbr);
+ list_remove(&cbr->cbr_pr->pr_cbs, cbr);
+ mutex_exit(&dd->dd_lock);
+ kmem_free(cbr, sizeof (dsl_prop_cb_record_t));
+
+ return (0);
+}
+
+/*
+ * Unregister all callbacks that are registered with the
+ * given callback argument.
+ */
+void
+dsl_prop_unregister_all(dsl_dataset_t *ds, void *cbarg)
+{
+ dsl_prop_cb_record_t *cbr, *next_cbr;
+
+ dsl_dir_t *dd = ds->ds_dir;
+
+ mutex_enter(&dd->dd_lock);
+ next_cbr = list_head(&ds->ds_prop_cbs);
+ while (next_cbr != NULL) {
+ cbr = next_cbr;
+ next_cbr = list_next(&ds->ds_prop_cbs, cbr);
+ if (cbr->cbr_arg == cbarg) {
+ list_remove(&ds->ds_prop_cbs, cbr);
+ list_remove(&cbr->cbr_pr->pr_cbs, cbr);
+ kmem_free(cbr, sizeof (dsl_prop_cb_record_t));
+ }
+ }
+ mutex_exit(&dd->dd_lock);
+}
+
+boolean_t
+dsl_prop_hascb(dsl_dataset_t *ds)
+{
+ return (!list_is_empty(&ds->ds_prop_cbs));
+}
+
+/* ARGSUSED */
+static int
+dsl_prop_notify_all_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
+{
+ dsl_dir_t *dd = ds->ds_dir;
+ dsl_prop_record_t *pr;
+ dsl_prop_cb_record_t *cbr;
+
+ mutex_enter(&dd->dd_lock);
+ for (pr = list_head(&dd->dd_props);
+ pr; pr = list_next(&dd->dd_props, pr)) {
+ for (cbr = list_head(&pr->pr_cbs); cbr;
+ cbr = list_next(&pr->pr_cbs, cbr)) {
+ uint64_t value;
+
+ /*
+ * Callback entries do not have holds on their
+ * datasets so that datasets with registered
+ * callbacks are still eligible for eviction.
+ * Unlike operations to update properties on a
+ * single dataset, we are performing a recursive
+ * descent of related head datasets. The caller
+ * of this function only has a dataset hold on
+ * the passed in head dataset, not the snapshots
+ * associated with this dataset. Without a hold,
+ * the dataset pointer within callback records
+ * for snapshots can be invalidated by eviction
+ * at any time.
+ *
+ * Use dsl_dataset_try_add_ref() to verify
+ * that the dataset for a snapshot has not
+ * begun eviction processing and to prevent
+ * eviction from occurring for the duration of
+ * the callback. If the hold attempt fails,
+ * this object is already being evicted and the
+ * callback can be safely ignored.
+ */
+ if (ds != cbr->cbr_ds &&
+ !dsl_dataset_try_add_ref(dp, cbr->cbr_ds, FTAG))
+ continue;
+
+ if (dsl_prop_get_ds(cbr->cbr_ds,
+ cbr->cbr_pr->pr_propname, sizeof (value), 1,
+ &value, NULL) == 0)
+ cbr->cbr_func(cbr->cbr_arg, value);
+
+ if (ds != cbr->cbr_ds)
+ dsl_dataset_rele(cbr->cbr_ds, FTAG);
+ }
+ }
+ mutex_exit(&dd->dd_lock);
+
+ return (0);
+}
+
+/*
+ * Update all property values for ddobj & its descendants. This is used
+ * when renaming the dir.
+ */
+void
+dsl_prop_notify_all(dsl_dir_t *dd)
+{
+ dsl_pool_t *dp = dd->dd_pool;
+ ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock));
+ (void) dmu_objset_find_dp(dp, dd->dd_object, dsl_prop_notify_all_cb,
+ NULL, DS_FIND_CHILDREN);
+}
+
+static void
+dsl_prop_changed_notify(dsl_pool_t *dp, uint64_t ddobj,
+ const char *propname, uint64_t value, int first)
+{
+ dsl_dir_t *dd;
+ dsl_prop_record_t *pr;
+ dsl_prop_cb_record_t *cbr;
+ objset_t *mos = dp->dp_meta_objset;
+ zap_cursor_t zc;
+ zap_attribute_t *za;
+ int err;
+
+ ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock));
+ err = dsl_dir_hold_obj(dp, ddobj, NULL, FTAG, &dd);
+ if (err)
+ return;
+
+ if (!first) {
+ /*
+ * If the prop is set here, then this change is not
+ * being inherited here or below; stop the recursion.
+ */
+ err = zap_contains(mos, dsl_dir_phys(dd)->dd_props_zapobj,
+ propname);
+ if (err == 0) {
+ dsl_dir_rele(dd, FTAG);
+ return;
+ }
+ ASSERT3U(err, ==, ENOENT);
+ }
+
+ mutex_enter(&dd->dd_lock);
+ pr = dsl_prop_record_find(dd, propname);
+ if (pr != NULL) {
+ for (cbr = list_head(&pr->pr_cbs); cbr;
+ cbr = list_next(&pr->pr_cbs, cbr)) {
+ uint64_t propobj;
+
+ /*
+ * cbr->cbr_ds may be invalidated due to eviction,
+ * requiring the use of dsl_dataset_try_add_ref().
+ * See comment block in dsl_prop_notify_all_cb()
+ * for details.
+ */
+ if (!dsl_dataset_try_add_ref(dp, cbr->cbr_ds, FTAG))
+ continue;
+
+ propobj = dsl_dataset_phys(cbr->cbr_ds)->ds_props_obj;
+
+ /*
+ * If the property is not set on this ds, then it is
+ * inherited here; call the callback.
+ */
+ if (propobj == 0 ||
+ zap_contains(mos, propobj, propname) != 0)
+ cbr->cbr_func(cbr->cbr_arg, value);
+
+ dsl_dataset_rele(cbr->cbr_ds, FTAG);
+ }
+ }
+ mutex_exit(&dd->dd_lock);
+
+ za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
+ for (zap_cursor_init(&zc, mos,
+ dsl_dir_phys(dd)->dd_child_dir_zapobj);
+ zap_cursor_retrieve(&zc, za) == 0;
+ zap_cursor_advance(&zc)) {
+ dsl_prop_changed_notify(dp, za->za_first_integer,
+ propname, value, FALSE);
+ }
+ kmem_free(za, sizeof (zap_attribute_t));
+ zap_cursor_fini(&zc);
+ dsl_dir_rele(dd, FTAG);
+}
+
+void
+dsl_prop_set_sync_impl(dsl_dataset_t *ds, const char *propname,
+ zprop_source_t source, int intsz, int numints, const void *value,
+ dmu_tx_t *tx)
+{
+ objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
+ uint64_t zapobj, intval, dummy, count;
+ int isint;
+ char valbuf[32];
+ const char *valstr = NULL;
+ char *inheritstr;
+ char *recvdstr;
+ char *tbuf = NULL;
+ int err;
+ uint64_t version = spa_version(ds->ds_dir->dd_pool->dp_spa);
+
+ isint = (dodefault(zfs_name_to_prop(propname), 8, 1, &intval) == 0);
+
+ if (ds->ds_is_snapshot) {
+ ASSERT(version >= SPA_VERSION_SNAP_PROPS);
+ if (dsl_dataset_phys(ds)->ds_props_obj == 0 &&
+ (source & ZPROP_SRC_NONE) == 0) {
+ dmu_buf_will_dirty(ds->ds_dbuf, tx);
+ dsl_dataset_phys(ds)->ds_props_obj =
+ zap_create(mos,
+ DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx);
+ }
+ zapobj = dsl_dataset_phys(ds)->ds_props_obj;
+ } else {
+ zapobj = dsl_dir_phys(ds->ds_dir)->dd_props_zapobj;
+ }
+
+ /* If we are removing objects from a non-existent ZAP just return */
+ if (zapobj == 0)
+ return;
+
+ if (version < SPA_VERSION_RECVD_PROPS) {
+ if (source & ZPROP_SRC_NONE)
+ source = ZPROP_SRC_NONE;
+ else if (source & ZPROP_SRC_RECEIVED)
+ source = ZPROP_SRC_LOCAL;
+ }
+
+ inheritstr = kmem_asprintf("%s%s", propname, ZPROP_INHERIT_SUFFIX);
+ recvdstr = kmem_asprintf("%s%s", propname, ZPROP_RECVD_SUFFIX);
+
+ switch ((int)source) {
+ case ZPROP_SRC_NONE:
+ /*
+ * revert to received value, if any (inherit -S)
+ * - remove propname
+ * - remove propname$inherit
+ */
+ err = zap_remove(mos, zapobj, propname, tx);
+ ASSERT(err == 0 || err == ENOENT);
+ err = zap_remove(mos, zapobj, inheritstr, tx);
+ ASSERT(err == 0 || err == ENOENT);
+ break;
+ case ZPROP_SRC_LOCAL:
+ /*
+ * remove propname$inherit
+ * set propname -> value
+ */
+ err = zap_remove(mos, zapobj, inheritstr, tx);
+ ASSERT(err == 0 || err == ENOENT);
+ VERIFY0(zap_update(mos, zapobj, propname,
+ intsz, numints, value, tx));
+ break;
+ case ZPROP_SRC_INHERITED:
+ /*
+ * explicitly inherit
+ * - remove propname
+ * - set propname$inherit
+ */
+ err = zap_remove(mos, zapobj, propname, tx);
+ ASSERT(err == 0 || err == ENOENT);
+ if (version >= SPA_VERSION_RECVD_PROPS &&
+ dsl_prop_get_int_ds(ds, ZPROP_HAS_RECVD, &dummy) == 0) {
+ dummy = 0;
+ VERIFY0(zap_update(mos, zapobj, inheritstr,
+ 8, 1, &dummy, tx));
+ }
+ break;
+ case ZPROP_SRC_RECEIVED:
+ /*
+ * set propname$recvd -> value
+ */
+ err = zap_update(mos, zapobj, recvdstr,
+ intsz, numints, value, tx);
+ ASSERT(err == 0);
+ break;
+ case (ZPROP_SRC_NONE | ZPROP_SRC_LOCAL | ZPROP_SRC_RECEIVED):
+ /*
+ * clear local and received settings
+ * - remove propname
+ * - remove propname$inherit
+ * - remove propname$recvd
+ */
+ err = zap_remove(mos, zapobj, propname, tx);
+ ASSERT(err == 0 || err == ENOENT);
+ err = zap_remove(mos, zapobj, inheritstr, tx);
+ ASSERT(err == 0 || err == ENOENT);
+ /* FALLTHRU */
+ case (ZPROP_SRC_NONE | ZPROP_SRC_RECEIVED):
+ /*
+ * remove propname$recvd
+ */
+ err = zap_remove(mos, zapobj, recvdstr, tx);
+ ASSERT(err == 0 || err == ENOENT);
+ break;
+ default:
+ cmn_err(CE_PANIC, "unexpected property source: %d", source);
+ }
+
+ kmem_strfree(inheritstr);
+ kmem_strfree(recvdstr);
+
+ /*
+ * If we are left with an empty snap zap we can destroy it.
+ * This will prevent unnecessary calls to zap_lookup() in
+ * the "zfs list" and "zfs get" code paths.
+ */
+ if (ds->ds_is_snapshot &&
+ zap_count(mos, zapobj, &count) == 0 && count == 0) {
+ dmu_buf_will_dirty(ds->ds_dbuf, tx);
+ dsl_dataset_phys(ds)->ds_props_obj = 0;
+ zap_destroy(mos, zapobj, tx);
+ }
+
+ if (isint) {
+ VERIFY0(dsl_prop_get_int_ds(ds, propname, &intval));
+
+ if (ds->ds_is_snapshot) {
+ dsl_prop_cb_record_t *cbr;
+ /*
+ * It's a snapshot; nothing can inherit this
+ * property, so just look for callbacks on this
+ * ds here.
+ */
+ mutex_enter(&ds->ds_dir->dd_lock);
+ for (cbr = list_head(&ds->ds_prop_cbs); cbr;
+ cbr = list_next(&ds->ds_prop_cbs, cbr)) {
+ if (strcmp(cbr->cbr_pr->pr_propname,
+ propname) == 0)
+ cbr->cbr_func(cbr->cbr_arg, intval);
+ }
+ mutex_exit(&ds->ds_dir->dd_lock);
+ } else {
+ dsl_prop_changed_notify(ds->ds_dir->dd_pool,
+ ds->ds_dir->dd_object, propname, intval, TRUE);
+ }
+
+ (void) snprintf(valbuf, sizeof (valbuf),
+ "%lld", (longlong_t)intval);
+ valstr = valbuf;
+ } else {
+ if (source == ZPROP_SRC_LOCAL) {
+ valstr = value;
+ } else {
+ tbuf = kmem_alloc(ZAP_MAXVALUELEN, KM_SLEEP);
+ if (dsl_prop_get_ds(ds, propname, 1,
+ ZAP_MAXVALUELEN, tbuf, NULL) == 0)
+ valstr = tbuf;
+ }
+ }
+
+ spa_history_log_internal_ds(ds, (source == ZPROP_SRC_NONE ||
+ source == ZPROP_SRC_INHERITED) ? "inherit" : "set", tx,
+ "%s=%s", propname, (valstr == NULL ? "" : valstr));
+
+ if (tbuf != NULL)
+ kmem_free(tbuf, ZAP_MAXVALUELEN);
+}
+
+int
+dsl_prop_set_int(const char *dsname, const char *propname,
+ zprop_source_t source, uint64_t value)
+{
+ nvlist_t *nvl = fnvlist_alloc();
+ int error;
+
+ fnvlist_add_uint64(nvl, propname, value);
+ error = dsl_props_set(dsname, source, nvl);
+ fnvlist_free(nvl);
+ return (error);
+}
+
+int
+dsl_prop_set_string(const char *dsname, const char *propname,
+ zprop_source_t source, const char *value)
+{
+ nvlist_t *nvl = fnvlist_alloc();
+ int error;
+
+ fnvlist_add_string(nvl, propname, value);
+ error = dsl_props_set(dsname, source, nvl);
+ fnvlist_free(nvl);
+ return (error);
+}
+
+int
+dsl_prop_inherit(const char *dsname, const char *propname,
+ zprop_source_t source)
+{
+ nvlist_t *nvl = fnvlist_alloc();
+ int error;
+
+ fnvlist_add_boolean(nvl, propname);
+ error = dsl_props_set(dsname, source, nvl);
+ fnvlist_free(nvl);
+ return (error);
+}
+
+int
+dsl_props_set_check(void *arg, dmu_tx_t *tx)
+{
+ dsl_props_set_arg_t *dpsa = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ dsl_dataset_t *ds;
+ uint64_t version;
+ nvpair_t *elem = NULL;
+ int err;
+
+ err = dsl_dataset_hold(dp, dpsa->dpsa_dsname, FTAG, &ds);
+ if (err != 0)
+ return (err);
+
+ version = spa_version(ds->ds_dir->dd_pool->dp_spa);
+ while ((elem = nvlist_next_nvpair(dpsa->dpsa_props, elem)) != NULL) {
+ if (strlen(nvpair_name(elem)) >= ZAP_MAXNAMELEN) {
+ dsl_dataset_rele(ds, FTAG);
+ return (SET_ERROR(ENAMETOOLONG));
+ }
+ if (nvpair_type(elem) == DATA_TYPE_STRING) {
+ char *valstr = fnvpair_value_string(elem);
+ if (strlen(valstr) >= (version <
+ SPA_VERSION_STMF_PROP ?
+ ZAP_OLDMAXVALUELEN : ZAP_MAXVALUELEN)) {
+ dsl_dataset_rele(ds, FTAG);
+ return (SET_ERROR(E2BIG));
+ }
+ }
+ }
+
+ if (ds->ds_is_snapshot && version < SPA_VERSION_SNAP_PROPS) {
+ dsl_dataset_rele(ds, FTAG);
+ return (SET_ERROR(ENOTSUP));
+ }
+ dsl_dataset_rele(ds, FTAG);
+ return (0);
+}
+
+void
+dsl_props_set_sync_impl(dsl_dataset_t *ds, zprop_source_t source,
+ nvlist_t *props, dmu_tx_t *tx)
+{
+ nvpair_t *elem = NULL;
+
+ while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {
+ nvpair_t *pair = elem;
+ const char *name = nvpair_name(pair);
+
+ if (nvpair_type(pair) == DATA_TYPE_NVLIST) {
+ /*
+ * This usually happens when we reuse the nvlist_t data
+ * returned by the counterpart dsl_prop_get_all_impl().
+ * For instance we do this to restore the original
+ * received properties when an error occurs in the
+ * zfs_ioc_recv() codepath.
+ */
+ nvlist_t *attrs = fnvpair_value_nvlist(pair);
+ pair = fnvlist_lookup_nvpair(attrs, ZPROP_VALUE);
+ }
+
+ if (nvpair_type(pair) == DATA_TYPE_STRING) {
+ const char *value = fnvpair_value_string(pair);
+ dsl_prop_set_sync_impl(ds, name,
+ source, 1, strlen(value) + 1, value, tx);
+ } else if (nvpair_type(pair) == DATA_TYPE_UINT64) {
+ uint64_t intval = fnvpair_value_uint64(pair);
+ dsl_prop_set_sync_impl(ds, name,
+ source, sizeof (intval), 1, &intval, tx);
+ } else if (nvpair_type(pair) == DATA_TYPE_BOOLEAN) {
+ dsl_prop_set_sync_impl(ds, name,
+ source, 0, 0, NULL, tx);
+ } else {
+ panic("invalid nvpair type");
+ }
+ }
+}
+
+void
+dsl_props_set_sync(void *arg, dmu_tx_t *tx)
+{
+ dsl_props_set_arg_t *dpsa = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ dsl_dataset_t *ds;
+
+ VERIFY0(dsl_dataset_hold(dp, dpsa->dpsa_dsname, FTAG, &ds));
+ dsl_props_set_sync_impl(ds, dpsa->dpsa_source, dpsa->dpsa_props, tx);
+ dsl_dataset_rele(ds, FTAG);
+}
+
+/*
+ * All-or-nothing; if any prop can't be set, nothing will be modified.
+ */
+int
+dsl_props_set(const char *dsname, zprop_source_t source, nvlist_t *props)
+{
+ dsl_props_set_arg_t dpsa;
+ int nblks = 0;
+
+ dpsa.dpsa_dsname = dsname;
+ dpsa.dpsa_source = source;
+ dpsa.dpsa_props = props;
+
+ /*
+ * If the source includes NONE, then we will only be removing entries
+ * from the ZAP object. In that case don't check for ENOSPC.
+ */
+ if ((source & ZPROP_SRC_NONE) == 0)
+ nblks = 2 * fnvlist_num_pairs(props);
+
+ return (dsl_sync_task(dsname, dsl_props_set_check, dsl_props_set_sync,
+ &dpsa, nblks, ZFS_SPACE_CHECK_RESERVED));
+}
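+
+/*
+ * A minimal usage sketch, setting two properties atomically; the dataset
+ * name and values are placeholders.
+ *
+ *    nvlist_t *props = fnvlist_alloc();
+ *    fnvlist_add_uint64(props, "quota", 10ULL << 30);
+ *    fnvlist_add_string(props, "compression", "lz4");
+ *    error = dsl_props_set("tank/home", ZPROP_SRC_LOCAL, props);
+ *    fnvlist_free(props);
+ */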
+
+typedef enum dsl_prop_getflags {
+ DSL_PROP_GET_INHERITING = 0x1, /* searching parent of target ds */
+ DSL_PROP_GET_SNAPSHOT = 0x2, /* snapshot dataset */
+ DSL_PROP_GET_LOCAL = 0x4, /* local properties */
+ DSL_PROP_GET_RECEIVED = 0x8, /* received properties */
+} dsl_prop_getflags_t;
+
+static int
+dsl_prop_get_all_impl(objset_t *mos, uint64_t propobj,
+ const char *setpoint, dsl_prop_getflags_t flags, nvlist_t *nv)
+{
+ zap_cursor_t zc;
+ zap_attribute_t za;
+ int err = 0;
+
+ for (zap_cursor_init(&zc, mos, propobj);
+ (err = zap_cursor_retrieve(&zc, &za)) == 0;
+ zap_cursor_advance(&zc)) {
+ nvlist_t *propval;
+ zfs_prop_t prop;
+ char buf[ZAP_MAXNAMELEN];
+ char *valstr;
+ const char *suffix;
+ const char *propname;
+ const char *source;
+
+ suffix = strchr(za.za_name, '$');
+
+ if (suffix == NULL) {
+ /*
+ * Skip local properties if we only want received
+ * properties.
+ */
+ if (flags & DSL_PROP_GET_RECEIVED)
+ continue;
+
+ propname = za.za_name;
+ source = setpoint;
+ } else if (strcmp(suffix, ZPROP_INHERIT_SUFFIX) == 0) {
+ /* Skip explicitly inherited entries. */
+ continue;
+ } else if (strcmp(suffix, ZPROP_RECVD_SUFFIX) == 0) {
+ if (flags & DSL_PROP_GET_LOCAL)
+ continue;
+
+ (void) strncpy(buf, za.za_name, (suffix - za.za_name));
+ buf[suffix - za.za_name] = '\0';
+ propname = buf;
+
+ if (!(flags & DSL_PROP_GET_RECEIVED)) {
+ /* Skip if locally overridden. */
+ err = zap_contains(mos, propobj, propname);
+ if (err == 0)
+ continue;
+ if (err != ENOENT)
+ break;
+
+ /* Skip if explicitly inherited. */
+ valstr = kmem_asprintf("%s%s", propname,
+ ZPROP_INHERIT_SUFFIX);
+ err = zap_contains(mos, propobj, valstr);
+ kmem_strfree(valstr);
+ if (err == 0)
+ continue;
+ if (err != ENOENT)
+ break;
+ }
+
+ source = ((flags & DSL_PROP_GET_INHERITING) ?
+ setpoint : ZPROP_SOURCE_VAL_RECVD);
+ } else {
+ /*
+ * For backward compatibility, skip suffixes we don't
+ * recognize.
+ */
+ continue;
+ }
+
+ prop = zfs_name_to_prop(propname);
+
+ /* Skip non-inheritable properties. */
+ if ((flags & DSL_PROP_GET_INHERITING) && prop != ZPROP_INVAL &&
+ !zfs_prop_inheritable(prop))
+ continue;
+
+ /* Skip properties not valid for this type. */
+ if ((flags & DSL_PROP_GET_SNAPSHOT) && prop != ZPROP_INVAL &&
+ !zfs_prop_valid_for_type(prop, ZFS_TYPE_SNAPSHOT, B_FALSE))
+ continue;
+
+ /* Skip properties already defined. */
+ if (nvlist_exists(nv, propname))
+ continue;
+
+ VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+ if (za.za_integer_length == 1) {
+ /*
+ * String property
+ */
+ char *tmp = kmem_alloc(za.za_num_integers,
+ KM_SLEEP);
+ err = zap_lookup(mos, propobj,
+ za.za_name, 1, za.za_num_integers, tmp);
+ if (err != 0) {
+ kmem_free(tmp, za.za_num_integers);
+ break;
+ }
+ VERIFY(nvlist_add_string(propval, ZPROP_VALUE,
+ tmp) == 0);
+ kmem_free(tmp, za.za_num_integers);
+ } else {
+ /*
+ * Integer property
+ */
+ ASSERT(za.za_integer_length == 8);
+ (void) nvlist_add_uint64(propval, ZPROP_VALUE,
+ za.za_first_integer);
+ }
+
+ VERIFY(nvlist_add_string(propval, ZPROP_SOURCE, source) == 0);
+ VERIFY(nvlist_add_nvlist(nv, propname, propval) == 0);
+ nvlist_free(propval);
+ }
+ zap_cursor_fini(&zc);
+ if (err == ENOENT)
+ err = 0;
+ return (err);
+}
+
+/*
+ * Iterate over all properties for this dataset and return them in an nvlist.
+ */
+static int
+dsl_prop_get_all_ds(dsl_dataset_t *ds, nvlist_t **nvp,
+ dsl_prop_getflags_t flags)
+{
+ dsl_dir_t *dd = ds->ds_dir;
+ dsl_pool_t *dp = dd->dd_pool;
+ objset_t *mos = dp->dp_meta_objset;
+ int err = 0;
+ char setpoint[ZFS_MAX_DATASET_NAME_LEN];
+
+ VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+
+ if (ds->ds_is_snapshot)
+ flags |= DSL_PROP_GET_SNAPSHOT;
+
+ ASSERT(dsl_pool_config_held(dp));
+
+ if (dsl_dataset_phys(ds)->ds_props_obj != 0) {
+ ASSERT(flags & DSL_PROP_GET_SNAPSHOT);
+ dsl_dataset_name(ds, setpoint);
+ err = dsl_prop_get_all_impl(mos,
+ dsl_dataset_phys(ds)->ds_props_obj, setpoint, flags, *nvp);
+ if (err)
+ goto out;
+ }
+
+ for (; dd != NULL; dd = dd->dd_parent) {
+ if (dd != ds->ds_dir || (flags & DSL_PROP_GET_SNAPSHOT)) {
+ if (flags & (DSL_PROP_GET_LOCAL |
+ DSL_PROP_GET_RECEIVED))
+ break;
+ flags |= DSL_PROP_GET_INHERITING;
+ }
+ dsl_dir_name(dd, setpoint);
+ err = dsl_prop_get_all_impl(mos,
+ dsl_dir_phys(dd)->dd_props_zapobj, setpoint, flags, *nvp);
+ if (err)
+ break;
+ }
+
+out:
+ if (err) {
+ nvlist_free(*nvp);
+ *nvp = NULL;
+ }
+ return (err);
+}
+
+boolean_t
+dsl_prop_get_hasrecvd(const char *dsname)
+{
+ uint64_t dummy;
+
+ return (0 ==
+ dsl_prop_get_integer(dsname, ZPROP_HAS_RECVD, &dummy, NULL));
+}
+
+static int
+dsl_prop_set_hasrecvd_impl(const char *dsname, zprop_source_t source)
+{
+ uint64_t version;
+ spa_t *spa;
+ int error = 0;
+
+ VERIFY0(spa_open(dsname, &spa, FTAG));
+ version = spa_version(spa);
+ spa_close(spa, FTAG);
+
+ if (version >= SPA_VERSION_RECVD_PROPS)
+ error = dsl_prop_set_int(dsname, ZPROP_HAS_RECVD, source, 0);
+ return (error);
+}
+
+/*
+ * Call after successfully receiving properties to ensure that only the first
+ * receive on or after SPA_VERSION_RECVD_PROPS blows away local properties.
+ */
+int
+dsl_prop_set_hasrecvd(const char *dsname)
+{
+ int error = 0;
+ if (!dsl_prop_get_hasrecvd(dsname))
+ error = dsl_prop_set_hasrecvd_impl(dsname, ZPROP_SRC_LOCAL);
+ return (error);
+}
+
+void
+dsl_prop_unset_hasrecvd(const char *dsname)
+{
+ VERIFY0(dsl_prop_set_hasrecvd_impl(dsname, ZPROP_SRC_NONE));
+}
+
+int
+dsl_prop_get_all(objset_t *os, nvlist_t **nvp)
+{
+ return (dsl_prop_get_all_ds(os->os_dsl_dataset, nvp, 0));
+}
+
+int
+dsl_prop_get_received(const char *dsname, nvlist_t **nvp)
+{
+ objset_t *os;
+ int error;
+
+ /*
+ * Received properties are not distinguishable from local properties
+ * until the dataset has received properties on or after
+ * SPA_VERSION_RECVD_PROPS.
+ */
+ dsl_prop_getflags_t flags = (dsl_prop_get_hasrecvd(dsname) ?
+ DSL_PROP_GET_RECEIVED : DSL_PROP_GET_LOCAL);
+
+ error = dmu_objset_hold(dsname, FTAG, &os);
+ if (error != 0)
+ return (error);
+ error = dsl_prop_get_all_ds(os->os_dsl_dataset, nvp, flags);
+ dmu_objset_rele(os, FTAG);
+ return (error);
+}
+
+void
+dsl_prop_nvlist_add_uint64(nvlist_t *nv, zfs_prop_t prop, uint64_t value)
+{
+ nvlist_t *propval;
+ const char *propname = zfs_prop_to_name(prop);
+ uint64_t default_value;
+
+ if (nvlist_lookup_nvlist(nv, propname, &propval) == 0) {
+ VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, value) == 0);
+ return;
+ }
+
+ VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+ VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, value) == 0);
+ /* Indicate the default source if we can. */
+ if (dodefault(prop, 8, 1, &default_value) == 0 &&
+ value == default_value) {
+ VERIFY(nvlist_add_string(propval, ZPROP_SOURCE, "") == 0);
+ }
+ VERIFY(nvlist_add_nvlist(nv, propname, propval) == 0);
+ nvlist_free(propval);
+}
+
+void
+dsl_prop_nvlist_add_string(nvlist_t *nv, zfs_prop_t prop, const char *value)
+{
+ nvlist_t *propval;
+ const char *propname = zfs_prop_to_name(prop);
+
+ if (nvlist_lookup_nvlist(nv, propname, &propval) == 0) {
+ VERIFY(nvlist_add_string(propval, ZPROP_VALUE, value) == 0);
+ return;
+ }
+
+ VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+ VERIFY(nvlist_add_string(propval, ZPROP_VALUE, value) == 0);
+ VERIFY(nvlist_add_nvlist(nv, propname, propval) == 0);
+ nvlist_free(propval);
+}
+
+#if defined(_KERNEL)
+EXPORT_SYMBOL(dsl_prop_register);
+EXPORT_SYMBOL(dsl_prop_unregister);
+EXPORT_SYMBOL(dsl_prop_unregister_all);
+EXPORT_SYMBOL(dsl_prop_get);
+EXPORT_SYMBOL(dsl_prop_get_integer);
+EXPORT_SYMBOL(dsl_prop_get_all);
+EXPORT_SYMBOL(dsl_prop_get_received);
+EXPORT_SYMBOL(dsl_prop_get_ds);
+EXPORT_SYMBOL(dsl_prop_get_int_ds);
+EXPORT_SYMBOL(dsl_prop_get_dd);
+EXPORT_SYMBOL(dsl_props_set);
+EXPORT_SYMBOL(dsl_prop_set_int);
+EXPORT_SYMBOL(dsl_prop_set_string);
+EXPORT_SYMBOL(dsl_prop_inherit);
+EXPORT_SYMBOL(dsl_prop_predict);
+EXPORT_SYMBOL(dsl_prop_nvlist_add_uint64);
+EXPORT_SYMBOL(dsl_prop_nvlist_add_string);
+#endif
diff --git a/sys/contrib/openzfs/module/zfs/dsl_scan.c b/sys/contrib/openzfs/module/zfs/dsl_scan.c
new file mode 100644
index 000000000000..40adfbcee4e1
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/dsl_scan.c
@@ -0,0 +1,4422 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
+ * Copyright 2016 Gary Mills
+ * Copyright (c) 2017, 2019, Datto Inc. All rights reserved.
+ * Copyright (c) 2015, Nexenta Systems, Inc. All rights reserved.
+ * Copyright 2019 Joyent, Inc.
+ */
+
+#include <sys/dsl_scan.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_prop.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_synctask.h>
+#include <sys/dnode.h>
+#include <sys/dmu_tx.h>
+#include <sys/dmu_objset.h>
+#include <sys/arc.h>
+#include <sys/zap.h>
+#include <sys/zio.h>
+#include <sys/zfs_context.h>
+#include <sys/fs/zfs.h>
+#include <sys/zfs_znode.h>
+#include <sys/spa_impl.h>
+#include <sys/vdev_impl.h>
+#include <sys/zil_impl.h>
+#include <sys/zio_checksum.h>
+#include <sys/ddt.h>
+#include <sys/sa.h>
+#include <sys/sa_impl.h>
+#include <sys/zfeature.h>
+#include <sys/abd.h>
+#include <sys/range_tree.h>
+#ifdef _KERNEL
+#include <sys/zfs_vfsops.h>
+#endif
+
+/*
+ * Grand theory statement on scan queue sorting
+ *
+ * Scanning is implemented by recursively traversing all indirection levels
+ * in an object and reading all blocks referenced from said objects. This
+ * results in us approximately traversing the object from lowest logical
+ * offset to the highest. For best performance, we would want the logical
+ * blocks to be physically contiguous. However, this is frequently not the
+ * case with pools given the allocation patterns of copy-on-write filesystems.
+ * So instead, we put the I/Os into a reordering queue and issue them in a
+ * way that will most benefit physical disks (LBA-order).
+ *
+ * Queue management:
+ *
+ * Ideally, we would want to scan all metadata and queue up all block I/O
+ * prior to starting to issue it, because that allows us to do an optimal
+ * sorting job. This can however consume large amounts of memory. Therefore
+ * we continuously monitor the size of the queues and constrain them to 5%
+ * (zfs_scan_mem_lim_fact) of physmem. If the queues grow larger than this
+ * limit, we clear out a few of the largest extents at the head of the queues
+ * to make room for more scanning. Hopefully, these extents will be fairly
+ * large and contiguous, allowing us to approach sequential I/O throughput
+ * even without a fully sorted tree.
+ *
+ * Metadata scanning takes place in dsl_scan_visit(), which is called from
+ * dsl_scan_sync() every spa_sync(). If we have either fully scanned all
+ * metadata on the pool, or we need to make room in memory because our
+ * queues are too large, dsl_scan_visit() is postponed and
+ * scan_io_queues_run() is called from dsl_scan_sync() instead. This implies
+ * that metadata scanning and queued I/O issuing are mutually exclusive. This
+ * allows us to provide maximum sequential I/O throughput for the majority of
+ * I/O's issued since sequential I/O performance is significantly negatively
+ * impacted if it is interleaved with random I/O.
+ *
+ * Implementation Notes
+ *
+ * One side effect of the queued scanning algorithm is that the scanning code
+ * needs to be notified whenever a block is freed. This is needed to allow
+ * the scanning code to remove these I/Os from the issuing queue. Additionally,
+ * we do not attempt to queue gang blocks to be issued sequentially since this
+ * is very hard to do and would have an extremely limited performance benefit.
+ * Instead, we simply issue gang I/Os as soon as we find them using the legacy
+ * algorithm.
+ *
+ * Backwards compatibility
+ *
+ * This new algorithm is backwards compatible with the legacy on-disk data
+ * structures (and therefore does not require a new feature flag).
+ * Periodically during scanning (see zfs_scan_checkpoint_intval), the scan
+ * will stop scanning metadata (in logical order) and wait for all outstanding
+ * sorted I/O to complete. Once this is done, we write out a checkpoint
+ * bookmark, indicating that we have scanned everything logically before it.
+ * If the pool is imported on a machine without the new sorting algorithm,
+ * the scan simply resumes from the last checkpoint using the legacy algorithm.
+ */
+
+typedef int (scan_cb_t)(dsl_pool_t *, const blkptr_t *,
+ const zbookmark_phys_t *);
+
+static scan_cb_t dsl_scan_scrub_cb;
+
+static int scan_ds_queue_compare(const void *a, const void *b);
+static int scan_prefetch_queue_compare(const void *a, const void *b);
+static void scan_ds_queue_clear(dsl_scan_t *scn);
+static void scan_ds_prefetch_queue_clear(dsl_scan_t *scn);
+static boolean_t scan_ds_queue_contains(dsl_scan_t *scn, uint64_t dsobj,
+ uint64_t *txg);
+static void scan_ds_queue_insert(dsl_scan_t *scn, uint64_t dsobj, uint64_t txg);
+static void scan_ds_queue_remove(dsl_scan_t *scn, uint64_t dsobj);
+static void scan_ds_queue_sync(dsl_scan_t *scn, dmu_tx_t *tx);
+static uint64_t dsl_scan_count_leaves(vdev_t *vd);
+
+extern int zfs_vdev_async_write_active_min_dirty_percent;
+
+/*
+ * By default zfs will check to ensure it is not over the hard memory
+ * limit before each txg. If finer-grained control of this is needed
+ * this value can be set to 1 to enable checking before scanning each
+ * block.
+ */
+int zfs_scan_strict_mem_lim = B_FALSE;
+
+/*
+ * Maximum number of bytes issued in parallel per leaf vdev. We attempt
+ * to strike a balance here between keeping the vdev queues full of I/Os
+ * at all times and not overflowing the queues, which would cause long
+ * latency and therefore long txg sync times. No matter what, we will not
+ * overload the drives with I/O, since that is protected by
+ * zfs_vdev_scrub_max_active.
+ */
+unsigned long zfs_scan_vdev_limit = 4 << 20;
+
+int zfs_scan_issue_strategy = 0;
+int zfs_scan_legacy = B_FALSE; /* don't queue & sort zios, go direct */
+unsigned long zfs_scan_max_ext_gap = 2 << 20; /* in bytes */
+
+/*
+ * fill_weight is non-tunable at runtime, so we copy it at module init from
+ * zfs_scan_fill_weight. Runtime adjustments to zfs_scan_fill_weight would
+ * break queue sorting.
+ */
+int zfs_scan_fill_weight = 3;
+static uint64_t fill_weight;
+
+/* See dsl_scan_should_clear() for details on the memory limit tunables */
+uint64_t zfs_scan_mem_lim_min = 16 << 20; /* bytes */
+uint64_t zfs_scan_mem_lim_soft_max = 128 << 20; /* bytes */
+int zfs_scan_mem_lim_fact = 20; /* fraction of physmem */
+int zfs_scan_mem_lim_soft_fact = 20; /* fraction of mem lim above */
+
+int zfs_scrub_min_time_ms = 1000; /* min millisecs to scrub per txg */
+int zfs_obsolete_min_time_ms = 500; /* min millisecs to obsolete per txg */
+int zfs_free_min_time_ms = 1000; /* min millisecs to free per txg */
+int zfs_resilver_min_time_ms = 3000; /* min millisecs to resilver per txg */
+int zfs_scan_checkpoint_intval = 7200; /* in seconds */
+int zfs_scan_suspend_progress = 0; /* set to prevent scans from progressing */
+int zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */
+int zfs_no_scrub_prefetch = B_FALSE; /* set to disable scrub prefetch */
+enum ddt_class zfs_scrub_ddt_class_max = DDT_CLASS_DUPLICATE;
+/* max number of blocks to free in a single TXG */
+unsigned long zfs_async_block_max_blocks = ULONG_MAX;
+/* max number of dedup blocks to free in a single TXG */
+unsigned long zfs_max_async_dedup_frees = 100000;
+
+int zfs_resilver_disable_defer = 0; /* set to disable resilver deferring */
+
+/*
+ * We wait a few txgs after importing a pool to begin scanning so that
+ * the import / mounting code isn't held up by scrub / resilver IO.
+ * Unfortunately, it is a bit difficult to determine exactly how long
+ * this will take since userspace will trigger fs mounts asynchronously
+ * and the kernel will create zvol minors asynchronously. As a result,
+ * the value provided here is a bit arbitrary, but represents a
+ * reasonable estimate of how many txgs it will take to finish fully
+ * importing a pool
+ */
+#define SCAN_IMPORT_WAIT_TXGS 5
+
+#define DSL_SCAN_IS_SCRUB_RESILVER(scn) \
+ ((scn)->scn_phys.scn_func == POOL_SCAN_SCRUB || \
+ (scn)->scn_phys.scn_func == POOL_SCAN_RESILVER)
+
+/*
+ * Enable/disable the processing of the free_bpobj object.
+ */
+int zfs_free_bpobj_enabled = 1;
+
+/* the order has to match pool_scan_type */
+static scan_cb_t *scan_funcs[POOL_SCAN_FUNCS] = {
+ NULL,
+ dsl_scan_scrub_cb, /* POOL_SCAN_SCRUB */
+ dsl_scan_scrub_cb, /* POOL_SCAN_RESILVER */
+};
+
+/* In core node for the scn->scn_queue. Represents a dataset to be scanned */
+typedef struct {
+ uint64_t sds_dsobj;
+ uint64_t sds_txg;
+ avl_node_t sds_node;
+} scan_ds_t;
+
+/*
+ * This controls what conditions are placed on dsl_scan_sync_state():
+ * SYNC_OPTIONAL) write out scn_phys iff scn_bytes_pending == 0
+ * SYNC_MANDATORY) write out scn_phys always. scn_bytes_pending must be 0.
+ * SYNC_CACHED) if scn_bytes_pending == 0, write out scn_phys. Otherwise
+ * write out the scn_phys_cached version.
+ * See dsl_scan_sync_state for details.
+ */
+typedef enum {
+ SYNC_OPTIONAL,
+ SYNC_MANDATORY,
+ SYNC_CACHED
+} state_sync_type_t;
+
+/*
+ * This struct represents the minimum information needed to reconstruct a
+ * zio for sequential scanning. This is useful because many of these will
+ * accumulate in the sequential IO queues before being issued, so saving
+ * memory matters here.
+ */
+typedef struct scan_io {
+ /* fields from blkptr_t */
+ uint64_t sio_blk_prop;
+ uint64_t sio_phys_birth;
+ uint64_t sio_birth;
+ zio_cksum_t sio_cksum;
+ uint32_t sio_nr_dvas;
+
+ /* fields from zio_t */
+ uint32_t sio_flags;
+ zbookmark_phys_t sio_zb;
+
+ /* members for queue sorting */
+ union {
+ avl_node_t sio_addr_node; /* link into issuing queue */
+ list_node_t sio_list_node; /* link for issuing to disk */
+ } sio_nodes;
+
+ /*
+ * There may be up to SPA_DVAS_PER_BP DVAs here from the bp,
+ * depending on how many were in the original bp. Only the
+ * first DVA is really used for sorting and issuing purposes.
+ * The other DVAs (if provided) simply exist so that the zio
+ * layer can find additional copies to repair from in the
+ * event of an error. This array must go at the end of the
+ * struct to allow this for the variable number of elements.
+ */
+ dva_t sio_dva[0];
+} scan_io_t;
+
+#define SIO_SET_OFFSET(sio, x) DVA_SET_OFFSET(&(sio)->sio_dva[0], x)
+#define SIO_SET_ASIZE(sio, x) DVA_SET_ASIZE(&(sio)->sio_dva[0], x)
+#define SIO_GET_OFFSET(sio) DVA_GET_OFFSET(&(sio)->sio_dva[0])
+#define SIO_GET_ASIZE(sio) DVA_GET_ASIZE(&(sio)->sio_dva[0])
+#define SIO_GET_END_OFFSET(sio) \
+ (SIO_GET_OFFSET(sio) + SIO_GET_ASIZE(sio))
+#define SIO_GET_MUSED(sio) \
+ (sizeof (scan_io_t) + ((sio)->sio_nr_dvas * sizeof (dva_t)))
+
+struct dsl_scan_io_queue {
+ dsl_scan_t *q_scn; /* associated dsl_scan_t */
+ vdev_t *q_vd; /* top-level vdev that this queue represents */
+
+ /* trees used for sorting I/Os and extents of I/Os */
+ range_tree_t *q_exts_by_addr;
+ zfs_btree_t q_exts_by_size;
+ avl_tree_t q_sios_by_addr;
+ uint64_t q_sio_memused;
+
+ /* members for zio rate limiting */
+ uint64_t q_maxinflight_bytes;
+ uint64_t q_inflight_bytes;
+ kcondvar_t q_zio_cv; /* used under vd->vdev_scan_io_queue_lock */
+
+ /* per txg statistics */
+ uint64_t q_total_seg_size_this_txg;
+ uint64_t q_segs_this_txg;
+ uint64_t q_total_zio_size_this_txg;
+ uint64_t q_zios_this_txg;
+};
+
+/* private data for dsl_scan_prefetch_cb() */
+typedef struct scan_prefetch_ctx {
+ zfs_refcount_t spc_refcnt; /* refcount for memory management */
+ dsl_scan_t *spc_scn; /* dsl_scan_t for the pool */
+ boolean_t spc_root; /* is this prefetch for an objset? */
+ uint8_t spc_indblkshift; /* dn_indblkshift of current dnode */
+ uint16_t spc_datablkszsec; /* dn_idatablkszsec of current dnode */
+} scan_prefetch_ctx_t;
+
+/* private data for dsl_scan_prefetch() */
+typedef struct scan_prefetch_issue_ctx {
+ avl_node_t spic_avl_node; /* link into scn->scn_prefetch_queue */
+ scan_prefetch_ctx_t *spic_spc; /* spc for the callback */
+ blkptr_t spic_bp; /* bp to prefetch */
+ zbookmark_phys_t spic_zb; /* bookmark to prefetch */
+} scan_prefetch_issue_ctx_t;
+
+static void scan_exec_io(dsl_pool_t *dp, const blkptr_t *bp, int zio_flags,
+ const zbookmark_phys_t *zb, dsl_scan_io_queue_t *queue);
+static void scan_io_queue_insert_impl(dsl_scan_io_queue_t *queue,
+ scan_io_t *sio);
+
+static dsl_scan_io_queue_t *scan_io_queue_create(vdev_t *vd);
+static void scan_io_queues_destroy(dsl_scan_t *scn);
+
+static kmem_cache_t *sio_cache[SPA_DVAS_PER_BP];
+
+/* sio->sio_nr_dvas must be set so we know which cache to free from */
+static void
+sio_free(scan_io_t *sio)
+{
+ ASSERT3U(sio->sio_nr_dvas, >, 0);
+ ASSERT3U(sio->sio_nr_dvas, <=, SPA_DVAS_PER_BP);
+
+ kmem_cache_free(sio_cache[sio->sio_nr_dvas - 1], sio);
+}
+
+/* It is up to the caller to set sio->sio_nr_dvas for freeing */
+static scan_io_t *
+sio_alloc(unsigned short nr_dvas)
+{
+ ASSERT3U(nr_dvas, >, 0);
+ ASSERT3U(nr_dvas, <=, SPA_DVAS_PER_BP);
+
+ return (kmem_cache_alloc(sio_cache[nr_dvas - 1], KM_SLEEP));
+}
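+
+/*
+ * Illustrative example of a typical caller pattern, shown as a sketch: a
+ * block pointer with two DVAs is carried by a scan_io_t from sio_cache[1],
+ * and the memory charged to its queue is SIO_GET_MUSED(sio), i.e.
+ * sizeof (scan_io_t) + 2 * sizeof (dva_t).
+ *
+ *    scan_io_t *sio = sio_alloc(BP_GET_NDVAS(bp));
+ *    bp2sio(bp, sio, dva_i);
+ *    queue->q_sio_memused += SIO_GET_MUSED(sio);
+ *    ...
+ *    queue->q_sio_memused -= SIO_GET_MUSED(sio);
+ *    sio_free(sio);
+ */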
+
+void
+scan_init(void)
+{
+ /*
+ * This is used in ext_size_compare() to weight segments
+ * based on how sparse they are. This cannot be changed
+ * mid-scan and the tree comparison functions don't currently
+ * have a mechanism for passing additional context to the
+ * compare functions. Thus we store this value globally and
+ * we only allow it to be set at module initialization time
+ */
+ fill_weight = zfs_scan_fill_weight;
+
+ for (int i = 0; i < SPA_DVAS_PER_BP; i++) {
+ char name[36];
+
+ (void) snprintf(name, sizeof (name), "sio_cache_%d", i);
+ sio_cache[i] = kmem_cache_create(name,
+ (sizeof (scan_io_t) + ((i + 1) * sizeof (dva_t))),
+ 0, NULL, NULL, NULL, NULL, NULL, 0);
+ }
+}
+
+void
+scan_fini(void)
+{
+ for (int i = 0; i < SPA_DVAS_PER_BP; i++) {
+ kmem_cache_destroy(sio_cache[i]);
+ }
+}
+
+static inline boolean_t
+dsl_scan_is_running(const dsl_scan_t *scn)
+{
+ return (scn->scn_phys.scn_state == DSS_SCANNING);
+}
+
+boolean_t
+dsl_scan_resilvering(dsl_pool_t *dp)
+{
+ return (dsl_scan_is_running(dp->dp_scan) &&
+ dp->dp_scan->scn_phys.scn_func == POOL_SCAN_RESILVER);
+}
+
+static inline void
+sio2bp(const scan_io_t *sio, blkptr_t *bp)
+{
+ bzero(bp, sizeof (*bp));
+ bp->blk_prop = sio->sio_blk_prop;
+ bp->blk_phys_birth = sio->sio_phys_birth;
+ bp->blk_birth = sio->sio_birth;
+ bp->blk_fill = 1; /* we always only work with data pointers */
+ bp->blk_cksum = sio->sio_cksum;
+
+ ASSERT3U(sio->sio_nr_dvas, >, 0);
+ ASSERT3U(sio->sio_nr_dvas, <=, SPA_DVAS_PER_BP);
+
+ bcopy(sio->sio_dva, bp->blk_dva, sio->sio_nr_dvas * sizeof (dva_t));
+}
+
+static inline void
+bp2sio(const blkptr_t *bp, scan_io_t *sio, int dva_i)
+{
+ sio->sio_blk_prop = bp->blk_prop;
+ sio->sio_phys_birth = bp->blk_phys_birth;
+ sio->sio_birth = bp->blk_birth;
+ sio->sio_cksum = bp->blk_cksum;
+ sio->sio_nr_dvas = BP_GET_NDVAS(bp);
+
+ /*
+ * Copy the DVAs to the sio. We need all copies of the block so
+ * that the self healing code can use the alternate copies if the
+ * first is corrupted. We want the DVA at index dva_i to be first
+ * in the sio since this is the primary one that we want to issue.
+ */
+ for (int i = 0, j = dva_i; i < sio->sio_nr_dvas; i++, j++) {
+ sio->sio_dva[i] = bp->blk_dva[j % sio->sio_nr_dvas];
+ }
+}
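+
+/*
+ * Worked example: for a bp with three DVAs and dva_i == 1, the loop above
+ * stores the DVAs in the order 1, 2, 0, so the copy being scanned sorts and
+ * issues first while the remaining copies stay available to the self
+ * healing code.
+ */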
+
+int
+dsl_scan_init(dsl_pool_t *dp, uint64_t txg)
+{
+ int err;
+ dsl_scan_t *scn;
+ spa_t *spa = dp->dp_spa;
+ uint64_t f;
+
+ scn = dp->dp_scan = kmem_zalloc(sizeof (dsl_scan_t), KM_SLEEP);
+ scn->scn_dp = dp;
+
+ /*
+ * It's possible that we're resuming a scan after a reboot so
+ * make sure that the scan_async_destroying flag is initialized
+ * appropriately.
+ */
+ ASSERT(!scn->scn_async_destroying);
+ scn->scn_async_destroying = spa_feature_is_active(dp->dp_spa,
+ SPA_FEATURE_ASYNC_DESTROY);
+
+ /*
+ * Calculate the max number of in-flight bytes for pool-wide
+ * scanning operations (minimum 1MB). Limits for the issuing
+ * phase are done per top-level vdev and are handled separately.
+ */
+ scn->scn_maxinflight_bytes = MAX(zfs_scan_vdev_limit *
+ dsl_scan_count_leaves(spa->spa_root_vdev), 1ULL << 20);
+
+ avl_create(&scn->scn_queue, scan_ds_queue_compare, sizeof (scan_ds_t),
+ offsetof(scan_ds_t, sds_node));
+ avl_create(&scn->scn_prefetch_queue, scan_prefetch_queue_compare,
+ sizeof (scan_prefetch_issue_ctx_t),
+ offsetof(scan_prefetch_issue_ctx_t, spic_avl_node));
+
+ err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ "scrub_func", sizeof (uint64_t), 1, &f);
+ if (err == 0) {
+ /*
+ * There was an old-style scrub in progress. Restart a
+ * new-style scrub from the beginning.
+ */
+ scn->scn_restart_txg = txg;
+ zfs_dbgmsg("old-style scrub was in progress; "
+ "restarting new-style scrub in txg %llu",
+ (longlong_t)scn->scn_restart_txg);
+
+ /*
+ * Load the queue obj from the old location so that it
+ * can be freed by dsl_scan_done().
+ */
+ (void) zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ "scrub_queue", sizeof (uint64_t), 1,
+ &scn->scn_phys.scn_queue_obj);
+ } else {
+ err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS,
+ &scn->scn_phys);
+ /*
+ * Detect if the pool contains the signature of #2094. If it
+ * does properly update the scn->scn_phys structure and notify
+ * the administrator by setting an errata for the pool.
+ */
+ if (err == EOVERFLOW) {
+ uint64_t zaptmp[SCAN_PHYS_NUMINTS + 1];
+ VERIFY3S(SCAN_PHYS_NUMINTS, ==, 24);
+ VERIFY3S(offsetof(dsl_scan_phys_t, scn_flags), ==,
+ (23 * sizeof (uint64_t)));
+
+ err = zap_lookup(dp->dp_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SCAN,
+ sizeof (uint64_t), SCAN_PHYS_NUMINTS + 1, &zaptmp);
+ if (err == 0) {
+ uint64_t overflow = zaptmp[SCAN_PHYS_NUMINTS];
+
+ if (overflow & ~DSL_SCAN_FLAGS_MASK ||
+ scn->scn_async_destroying) {
+ spa->spa_errata =
+ ZPOOL_ERRATA_ZOL_2094_ASYNC_DESTROY;
+ return (EOVERFLOW);
+ }
+
+ bcopy(zaptmp, &scn->scn_phys,
+ SCAN_PHYS_NUMINTS * sizeof (uint64_t));
+ scn->scn_phys.scn_flags = overflow;
+
+ /* Required scrub already in progress. */
+ if (scn->scn_phys.scn_state == DSS_FINISHED ||
+ scn->scn_phys.scn_state == DSS_CANCELED)
+ spa->spa_errata =
+ ZPOOL_ERRATA_ZOL_2094_SCRUB;
+ }
+ }
+
+ if (err == ENOENT)
+ return (0);
+ else if (err)
+ return (err);
+
+ /*
+ * We might be restarting after a reboot, so jump the issued
+ * counter to how far we've scanned. We know we're consistent
+ * up to here.
+ */
+ scn->scn_issued_before_pass = scn->scn_phys.scn_examined;
+
+ if (dsl_scan_is_running(scn) &&
+ spa_prev_software_version(dp->dp_spa) < SPA_VERSION_SCAN) {
+ /*
+ * A new-type scrub was in progress on an old
+ * pool, and the pool was accessed by old
+ * software. Restart from the beginning, since
+ * the old software may have changed the pool in
+ * the meantime.
+ */
+ scn->scn_restart_txg = txg;
+ zfs_dbgmsg("new-style scrub was modified "
+ "by old software; restarting in txg %llu",
+ (longlong_t)scn->scn_restart_txg);
+ } else if (dsl_scan_resilvering(dp)) {
+ /*
+ * If a resilver is in progress and there are already
+ * errors, restart it instead of finishing this scan and
+ * then restarting it. If there haven't been any errors
+ * then remember that the incore DTL is valid.
+ */
+ if (scn->scn_phys.scn_errors > 0) {
+ scn->scn_restart_txg = txg;
+ zfs_dbgmsg("resilver can't excise DTL_MISSING "
+ "when finished; restarting in txg %llu",
+ (u_longlong_t)scn->scn_restart_txg);
+ } else {
+ /* it's safe to excise DTL when finished */
+ spa->spa_scrub_started = B_TRUE;
+ }
+ }
+ }
+
+ bcopy(&scn->scn_phys, &scn->scn_phys_cached, sizeof (scn->scn_phys));
+
+ /* reload the queue into the in-core state */
+ if (scn->scn_phys.scn_queue_obj != 0) {
+ zap_cursor_t zc;
+ zap_attribute_t za;
+
+ for (zap_cursor_init(&zc, dp->dp_meta_objset,
+ scn->scn_phys.scn_queue_obj);
+ zap_cursor_retrieve(&zc, &za) == 0;
+ (void) zap_cursor_advance(&zc)) {
+ scan_ds_queue_insert(scn,
+ zfs_strtonum(za.za_name, NULL),
+ za.za_first_integer);
+ }
+ zap_cursor_fini(&zc);
+ }
+
+ spa_scan_stat_init(spa);
+ return (0);
+}
+
+void
+dsl_scan_fini(dsl_pool_t *dp)
+{
+ if (dp->dp_scan != NULL) {
+ dsl_scan_t *scn = dp->dp_scan;
+
+ if (scn->scn_taskq != NULL)
+ taskq_destroy(scn->scn_taskq);
+
+ scan_ds_queue_clear(scn);
+ avl_destroy(&scn->scn_queue);
+ scan_ds_prefetch_queue_clear(scn);
+ avl_destroy(&scn->scn_prefetch_queue);
+
+ kmem_free(dp->dp_scan, sizeof (dsl_scan_t));
+ dp->dp_scan = NULL;
+ }
+}
+
+static boolean_t
+dsl_scan_restarting(dsl_scan_t *scn, dmu_tx_t *tx)
+{
+ return (scn->scn_restart_txg != 0 &&
+ scn->scn_restart_txg <= tx->tx_txg);
+}
+
+boolean_t
+dsl_scan_resilver_scheduled(dsl_pool_t *dp)
+{
+ return ((dp->dp_scan && dp->dp_scan->scn_restart_txg != 0) ||
+ (spa_async_tasks(dp->dp_spa) & SPA_ASYNC_RESILVER));
+}
+
+boolean_t
+dsl_scan_scrubbing(const dsl_pool_t *dp)
+{
+ dsl_scan_phys_t *scn_phys = &dp->dp_scan->scn_phys;
+
+ return (scn_phys->scn_state == DSS_SCANNING &&
+ scn_phys->scn_func == POOL_SCAN_SCRUB);
+}
+
+boolean_t
+dsl_scan_is_paused_scrub(const dsl_scan_t *scn)
+{
+ return (dsl_scan_scrubbing(scn->scn_dp) &&
+ scn->scn_phys.scn_flags & DSF_SCRUB_PAUSED);
+}
+
+/*
+ * Writes out a persistent dsl_scan_phys_t record to the pool directory.
+ * Because we can be running in the block sorting algorithm, we do not always
+ * want to write out the record, only when it is "safe" to do so. This safety
+ * condition is achieved by making sure that the sorting queues are empty
+ * (scn_bytes_pending == 0). When this condition is not true, the sync'd state
+ * is inconsistent with how much actual scanning progress has been made. The
+ * kind of sync to be performed is specified by the sync_type argument. If the
+ * sync is optional, we only sync if the queues are empty. If the sync is
+ * mandatory, we do a hard ASSERT to make sure that the queues are empty. The
+ * third possible state is a "cached" sync. This is done in response to:
+ * 1) The dataset that was in the last sync'd dsl_scan_phys_t having been
+ * destroyed, so we wouldn't be able to restart scanning from it.
+ * 2) The snapshot that was in the last sync'd dsl_scan_phys_t having been
+ * superseded by a newer snapshot.
+ * 3) The dataset that was in the last sync'd dsl_scan_phys_t having been
+ * swapped with its clone.
+ * In all cases, a cached sync simply rewrites the last record we've written,
+ * just slightly modified. For the modifications that are performed to the
+ * last written dsl_scan_phys_t, see dsl_scan_ds_destroyed,
+ * dsl_scan_ds_snapshotted and dsl_scan_ds_clone_swapped.
+ */
+static void
+dsl_scan_sync_state(dsl_scan_t *scn, dmu_tx_t *tx, state_sync_type_t sync_type)
+{
+ int i;
+ spa_t *spa = scn->scn_dp->dp_spa;
+
+ ASSERT(sync_type != SYNC_MANDATORY || scn->scn_bytes_pending == 0);
+ if (scn->scn_bytes_pending == 0) {
+ for (i = 0; i < spa->spa_root_vdev->vdev_children; i++) {
+ vdev_t *vd = spa->spa_root_vdev->vdev_child[i];
+ dsl_scan_io_queue_t *q = vd->vdev_scan_io_queue;
+
+ if (q == NULL)
+ continue;
+
+ mutex_enter(&vd->vdev_scan_io_queue_lock);
+ ASSERT3P(avl_first(&q->q_sios_by_addr), ==, NULL);
+ ASSERT3P(zfs_btree_first(&q->q_exts_by_size, NULL), ==,
+ NULL);
+ ASSERT3P(range_tree_first(q->q_exts_by_addr), ==, NULL);
+ mutex_exit(&vd->vdev_scan_io_queue_lock);
+ }
+
+ if (scn->scn_phys.scn_queue_obj != 0)
+ scan_ds_queue_sync(scn, tx);
+ VERIFY0(zap_update(scn->scn_dp->dp_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS,
+ &scn->scn_phys, tx));
+ bcopy(&scn->scn_phys, &scn->scn_phys_cached,
+ sizeof (scn->scn_phys));
+
+ if (scn->scn_checkpointing)
+ zfs_dbgmsg("finish scan checkpoint");
+
+ scn->scn_checkpointing = B_FALSE;
+ scn->scn_last_checkpoint = ddi_get_lbolt();
+ } else if (sync_type == SYNC_CACHED) {
+ VERIFY0(zap_update(scn->scn_dp->dp_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS,
+ &scn->scn_phys_cached, tx));
+ }
+}
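+
+ /*
+ * Summary of the sync types handled above, as implemented by
+ * dsl_scan_sync_state():
+ *
+ * SYNC_OPTIONAL - write scn_phys only if scn_bytes_pending == 0
+ * SYNC_MANDATORY - assert scn_bytes_pending == 0, then write scn_phys
+ * SYNC_CACHED - as SYNC_OPTIONAL, but if the queues are not empty,
+ * rewrite the last-synced scn_phys_cached instead
+ */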
+
+/* ARGSUSED */
+static int
+dsl_scan_setup_check(void *arg, dmu_tx_t *tx)
+{
+ dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;
+ vdev_t *rvd = scn->scn_dp->dp_spa->spa_root_vdev;
+
+ if (dsl_scan_is_running(scn) || vdev_rebuild_active(rvd))
+ return (SET_ERROR(EBUSY));
+
+ return (0);
+}
+
+void
+dsl_scan_setup_sync(void *arg, dmu_tx_t *tx)
+{
+ dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;
+ pool_scan_func_t *funcp = arg;
+ dmu_object_type_t ot = 0;
+ dsl_pool_t *dp = scn->scn_dp;
+ spa_t *spa = dp->dp_spa;
+
+ ASSERT(!dsl_scan_is_running(scn));
+ ASSERT(*funcp > POOL_SCAN_NONE && *funcp < POOL_SCAN_FUNCS);
+ bzero(&scn->scn_phys, sizeof (scn->scn_phys));
+ scn->scn_phys.scn_func = *funcp;
+ scn->scn_phys.scn_state = DSS_SCANNING;
+ scn->scn_phys.scn_min_txg = 0;
+ scn->scn_phys.scn_max_txg = tx->tx_txg;
+ scn->scn_phys.scn_ddt_class_max = DDT_CLASSES - 1; /* the entire DDT */
+ scn->scn_phys.scn_start_time = gethrestime_sec();
+ scn->scn_phys.scn_errors = 0;
+ scn->scn_phys.scn_to_examine = spa->spa_root_vdev->vdev_stat.vs_alloc;
+ scn->scn_issued_before_pass = 0;
+ scn->scn_restart_txg = 0;
+ scn->scn_done_txg = 0;
+ scn->scn_last_checkpoint = 0;
+ scn->scn_checkpointing = B_FALSE;
+ spa_scan_stat_init(spa);
+
+ if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
+ scn->scn_phys.scn_ddt_class_max = zfs_scrub_ddt_class_max;
+
+ /* rewrite all disk labels */
+ vdev_config_dirty(spa->spa_root_vdev);
+
+ if (vdev_resilver_needed(spa->spa_root_vdev,
+ &scn->scn_phys.scn_min_txg, &scn->scn_phys.scn_max_txg)) {
+ nvlist_t *aux = fnvlist_alloc();
+ fnvlist_add_string(aux, ZFS_EV_RESILVER_TYPE,
+ "healing");
+ spa_event_notify(spa, NULL, aux,
+ ESC_ZFS_RESILVER_START);
+ nvlist_free(aux);
+ } else {
+ spa_event_notify(spa, NULL, NULL, ESC_ZFS_SCRUB_START);
+ }
+
+ spa->spa_scrub_started = B_TRUE;
+ /*
+ * If this is an incremental scrub, limit the DDT scrub phase
+ * to just the auto-ditto class (for correctness); the rest
+ * of the scrub should go faster using top-down pruning.
+ */
+ if (scn->scn_phys.scn_min_txg > TXG_INITIAL)
+ scn->scn_phys.scn_ddt_class_max = DDT_CLASS_DITTO;
+
+ /*
+ * When starting a resilver, clear any existing rebuild state.
+ * This is required to prevent stale rebuild status from
+ * being reported when a rebuild is run, then a resilver, and
+ * finally a scrub, in which case only the scrub status
+ * should be reported by 'zpool status'.
+ */
+ if (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) {
+ vdev_t *rvd = spa->spa_root_vdev;
+ for (uint64_t i = 0; i < rvd->vdev_children; i++) {
+ vdev_t *vd = rvd->vdev_child[i];
+ vdev_rebuild_clear_sync(
+ (void *)(uintptr_t)vd->vdev_id, tx);
+ }
+ }
+ }
+
+ /* back to the generic stuff */
+
+ if (dp->dp_blkstats == NULL) {
+ dp->dp_blkstats =
+ vmem_alloc(sizeof (zfs_all_blkstats_t), KM_SLEEP);
+ mutex_init(&dp->dp_blkstats->zab_lock, NULL,
+ MUTEX_DEFAULT, NULL);
+ }
+ bzero(&dp->dp_blkstats->zab_type, sizeof (dp->dp_blkstats->zab_type));
+
+ if (spa_version(spa) < SPA_VERSION_DSL_SCRUB)
+ ot = DMU_OT_ZAP_OTHER;
+
+ scn->scn_phys.scn_queue_obj = zap_create(dp->dp_meta_objset,
+ ot ? ot : DMU_OT_SCAN_QUEUE, DMU_OT_NONE, 0, tx);
+
+ bcopy(&scn->scn_phys, &scn->scn_phys_cached, sizeof (scn->scn_phys));
+
+ dsl_scan_sync_state(scn, tx, SYNC_MANDATORY);
+
+ spa_history_log_internal(spa, "scan setup", tx,
+ "func=%u mintxg=%llu maxtxg=%llu",
+ *funcp, (u_longlong_t)scn->scn_phys.scn_min_txg,
+ (u_longlong_t)scn->scn_phys.scn_max_txg);
+}
+
+/*
+ * Called by the ZFS_IOC_POOL_SCAN ioctl to start a scrub or resilver.
+ * Can also be called to resume a paused scrub.
+ */
+int
+dsl_scan(dsl_pool_t *dp, pool_scan_func_t func)
+{
+ spa_t *spa = dp->dp_spa;
+ dsl_scan_t *scn = dp->dp_scan;
+
+ /*
+ * Purge all vdev caches and probe all devices. We do this here
+ * rather than in sync context because this requires a writer lock
+ * on the spa_config lock, which we can't do from sync context. The
+ * spa_scrub_reopen flag indicates that vdev_open() should not
+ * attempt to start another scrub.
+ */
+ spa_vdev_state_enter(spa, SCL_NONE);
+ spa->spa_scrub_reopen = B_TRUE;
+ vdev_reopen(spa->spa_root_vdev);
+ spa->spa_scrub_reopen = B_FALSE;
+ (void) spa_vdev_state_exit(spa, NULL, 0);
+
+ if (func == POOL_SCAN_RESILVER) {
+ dsl_scan_restart_resilver(spa->spa_dsl_pool, 0);
+ return (0);
+ }
+
+ if (func == POOL_SCAN_SCRUB && dsl_scan_is_paused_scrub(scn)) {
+ /* got scrub start cmd, resume paused scrub */
+ int err = dsl_scrub_set_pause_resume(scn->scn_dp,
+ POOL_SCRUB_NORMAL);
+ if (err == 0) {
+ spa_event_notify(spa, NULL, NULL, ESC_ZFS_SCRUB_RESUME);
+ return (SET_ERROR(ECANCELED));
+ }
+
+ return (SET_ERROR(err));
+ }
+
+ return (dsl_sync_task(spa_name(spa), dsl_scan_setup_check,
+ dsl_scan_setup_sync, &func, 0, ZFS_SPACE_CHECK_EXTRA_RESERVED));
+}
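+
+ /*
+ * Typical flow, sketched from the code above: the ZFS_IOC_POOL_SCAN
+ * ioctl calls dsl_scan(dp, POOL_SCAN_SCRUB), which reopens the vdevs
+ * and then dispatches dsl_scan_setup_check()/dsl_scan_setup_sync() as
+ * a sync task; issuing the same request while a scrub is paused
+ * instead resumes it via dsl_scrub_set_pause_resume(POOL_SCRUB_NORMAL).
+ */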
+
+/* ARGSUSED */
+static void
+dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx)
+{
+ static const char *old_names[] = {
+ "scrub_bookmark",
+ "scrub_ddt_bookmark",
+ "scrub_ddt_class_max",
+ "scrub_queue",
+ "scrub_min_txg",
+ "scrub_max_txg",
+ "scrub_func",
+ "scrub_errors",
+ NULL
+ };
+
+ dsl_pool_t *dp = scn->scn_dp;
+ spa_t *spa = dp->dp_spa;
+ int i;
+
+ /* Remove any remnants of an old-style scrub. */
+ for (i = 0; old_names[i]; i++) {
+ (void) zap_remove(dp->dp_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT, old_names[i], tx);
+ }
+
+ if (scn->scn_phys.scn_queue_obj != 0) {
+ VERIFY0(dmu_object_free(dp->dp_meta_objset,
+ scn->scn_phys.scn_queue_obj, tx));
+ scn->scn_phys.scn_queue_obj = 0;
+ }
+ scan_ds_queue_clear(scn);
+ scan_ds_prefetch_queue_clear(scn);
+
+ scn->scn_phys.scn_flags &= ~DSF_SCRUB_PAUSED;
+
+ /*
+ * If we were "restarted" from a stopped state, don't bother
+ * with anything else.
+ */
+ if (!dsl_scan_is_running(scn)) {
+ ASSERT(!scn->scn_is_sorted);
+ return;
+ }
+
+ if (scn->scn_is_sorted) {
+ scan_io_queues_destroy(scn);
+ scn->scn_is_sorted = B_FALSE;
+
+ if (scn->scn_taskq != NULL) {
+ taskq_destroy(scn->scn_taskq);
+ scn->scn_taskq = NULL;
+ }
+ }
+
+ scn->scn_phys.scn_state = complete ? DSS_FINISHED : DSS_CANCELED;
+
+ spa_notify_waiters(spa);
+
+ if (dsl_scan_restarting(scn, tx))
+ spa_history_log_internal(spa, "scan aborted, restarting", tx,
+ "errors=%llu", (u_longlong_t)spa_get_errlog_size(spa));
+ else if (!complete)
+ spa_history_log_internal(spa, "scan cancelled", tx,
+ "errors=%llu", (u_longlong_t)spa_get_errlog_size(spa));
+ else
+ spa_history_log_internal(spa, "scan done", tx,
+ "errors=%llu", (u_longlong_t)spa_get_errlog_size(spa));
+
+ if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
+ spa->spa_scrub_active = B_FALSE;
+
+ /*
+ * If the scrub/resilver completed, update all DTLs to
+ * reflect this. Whether it succeeded or not, vacate
+ * all temporary scrub DTLs.
+ *
+ * As the scrub does not currently support traversing
+ * data that have been freed but are part of a checkpoint,
+ * we don't mark the scrub as done in the DTLs as faults
+ * may still exist in those vdevs.
+ */
+ if (complete &&
+ !spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
+ vdev_dtl_reassess(spa->spa_root_vdev, tx->tx_txg,
+ scn->scn_phys.scn_max_txg, B_TRUE, B_FALSE);
+
+ if (scn->scn_phys.scn_min_txg) {
+ nvlist_t *aux = fnvlist_alloc();
+ fnvlist_add_string(aux, ZFS_EV_RESILVER_TYPE,
+ "healing");
+ spa_event_notify(spa, NULL, aux,
+ ESC_ZFS_RESILVER_FINISH);
+ nvlist_free(aux);
+ } else {
+ spa_event_notify(spa, NULL, NULL,
+ ESC_ZFS_SCRUB_FINISH);
+ }
+ } else {
+ vdev_dtl_reassess(spa->spa_root_vdev, tx->tx_txg,
+ 0, B_TRUE, B_FALSE);
+ }
+ spa_errlog_rotate(spa);
+
+ /*
+ * Don't clear the flag until after vdev_dtl_reassess to ensure that
+ * DTL_MISSING will get updated when possible.
+ */
+ spa->spa_scrub_started = B_FALSE;
+
+ /*
+ * We may have finished replacing a device.
+ * Let the async thread assess this and handle the detach.
+ */
+ spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
+
+ /*
+ * Clear any resilver_deferred flags in the config.
+ * If there are drives that need resilvering, kick
+ * off an asynchronous request to start resilver.
+ * vdev_clear_resilver_deferred() may update the config
+ * before the resilver can restart. In the event of
+ * a crash during this period, the spa loading code
+ * will find the drives that need to be resilvered
+ * and start the resilver then.
+ */
+ if (spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER) &&
+ vdev_clear_resilver_deferred(spa->spa_root_vdev, tx)) {
+ spa_history_log_internal(spa,
+ "starting deferred resilver", tx, "errors=%llu",
+ (u_longlong_t)spa_get_errlog_size(spa));
+ spa_async_request(spa, SPA_ASYNC_RESILVER);
+ }
+ }
+
+ scn->scn_phys.scn_end_time = gethrestime_sec();
+
+ if (spa->spa_errata == ZPOOL_ERRATA_ZOL_2094_SCRUB)
+ spa->spa_errata = 0;
+
+ ASSERT(!dsl_scan_is_running(scn));
+}
+
+/* ARGSUSED */
+static int
+dsl_scan_cancel_check(void *arg, dmu_tx_t *tx)
+{
+ dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;
+
+ if (!dsl_scan_is_running(scn))
+ return (SET_ERROR(ENOENT));
+ return (0);
+}
+
+/* ARGSUSED */
+static void
+dsl_scan_cancel_sync(void *arg, dmu_tx_t *tx)
+{
+ dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;
+
+ dsl_scan_done(scn, B_FALSE, tx);
+ dsl_scan_sync_state(scn, tx, SYNC_MANDATORY);
+ spa_event_notify(scn->scn_dp->dp_spa, NULL, NULL, ESC_ZFS_SCRUB_ABORT);
+}
+
+int
+dsl_scan_cancel(dsl_pool_t *dp)
+{
+ return (dsl_sync_task(spa_name(dp->dp_spa), dsl_scan_cancel_check,
+ dsl_scan_cancel_sync, NULL, 3, ZFS_SPACE_CHECK_RESERVED));
+}
+
+static int
+dsl_scrub_pause_resume_check(void *arg, dmu_tx_t *tx)
+{
+ pool_scrub_cmd_t *cmd = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ dsl_scan_t *scn = dp->dp_scan;
+
+ if (*cmd == POOL_SCRUB_PAUSE) {
+ /* can't pause a scrub when there is no in-progress scrub */
+ if (!dsl_scan_scrubbing(dp))
+ return (SET_ERROR(ENOENT));
+
+ /* can't pause a paused scrub */
+ if (dsl_scan_is_paused_scrub(scn))
+ return (SET_ERROR(EBUSY));
+ } else if (*cmd != POOL_SCRUB_NORMAL) {
+ return (SET_ERROR(ENOTSUP));
+ }
+
+ return (0);
+}
+
+static void
+dsl_scrub_pause_resume_sync(void *arg, dmu_tx_t *tx)
+{
+ pool_scrub_cmd_t *cmd = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ spa_t *spa = dp->dp_spa;
+ dsl_scan_t *scn = dp->dp_scan;
+
+ if (*cmd == POOL_SCRUB_PAUSE) {
+ /* the check callback verified an in-progress scrub; record the pause */
+ spa->spa_scan_pass_scrub_pause = gethrestime_sec();
+ scn->scn_phys.scn_flags |= DSF_SCRUB_PAUSED;
+ scn->scn_phys_cached.scn_flags |= DSF_SCRUB_PAUSED;
+ dsl_scan_sync_state(scn, tx, SYNC_CACHED);
+ spa_event_notify(spa, NULL, NULL, ESC_ZFS_SCRUB_PAUSED);
+ spa_notify_waiters(spa);
+ } else {
+ ASSERT3U(*cmd, ==, POOL_SCRUB_NORMAL);
+ if (dsl_scan_is_paused_scrub(scn)) {
+ /*
+ * We need to keep track of how much time we spend
+ * paused per pass so that we can adjust the scrub rate
+ * shown in the output of 'zpool status'.
+ */
+ spa->spa_scan_pass_scrub_spent_paused +=
+ gethrestime_sec() - spa->spa_scan_pass_scrub_pause;
+ spa->spa_scan_pass_scrub_pause = 0;
+ scn->scn_phys.scn_flags &= ~DSF_SCRUB_PAUSED;
+ scn->scn_phys_cached.scn_flags &= ~DSF_SCRUB_PAUSED;
+ dsl_scan_sync_state(scn, tx, SYNC_CACHED);
+ }
+ }
+}
+
+/*
+ * Set scrub pause/resume state if it makes sense to do so
+ */
+int
+dsl_scrub_set_pause_resume(const dsl_pool_t *dp, pool_scrub_cmd_t cmd)
+{
+ return (dsl_sync_task(spa_name(dp->dp_spa),
+ dsl_scrub_pause_resume_check, dsl_scrub_pause_resume_sync, &cmd, 3,
+ ZFS_SPACE_CHECK_RESERVED));
+}
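+
+ /*
+ * Minimal usage sketch for the pause/resume entry point above
+ * (hypothetical caller, error handling elided):
+ *
+ * (void) dsl_scrub_set_pause_resume(dp, POOL_SCRUB_PAUSE);
+ * ...
+ * (void) dsl_scrub_set_pause_resume(dp, POOL_SCRUB_NORMAL);
+ *
+ * The check function rejects a pause when no scrub is running (ENOENT)
+ * or when the scrub is already paused (EBUSY).
+ */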
+
+
+/* start a new scan, or restart an existing one. */
+void
+dsl_scan_restart_resilver(dsl_pool_t *dp, uint64_t txg)
+{
+ if (txg == 0) {
+ dmu_tx_t *tx;
+ tx = dmu_tx_create_dd(dp->dp_mos_dir);
+ VERIFY(0 == dmu_tx_assign(tx, TXG_WAIT));
+
+ txg = dmu_tx_get_txg(tx);
+ dp->dp_scan->scn_restart_txg = txg;
+ dmu_tx_commit(tx);
+ } else {
+ dp->dp_scan->scn_restart_txg = txg;
+ }
+ zfs_dbgmsg("restarting resilver txg=%llu", (longlong_t)txg);
+}
+
+void
+dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bp)
+{
+ zio_free(dp->dp_spa, txg, bp);
+}
+
+void
+dsl_free_sync(zio_t *pio, dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp)
+{
+ ASSERT(dsl_pool_sync_context(dp));
+ zio_nowait(zio_free_sync(pio, dp->dp_spa, txg, bpp, pio->io_flags));
+}
+
+static int
+scan_ds_queue_compare(const void *a, const void *b)
+{
+ const scan_ds_t *sds_a = a, *sds_b = b;
+
+ if (sds_a->sds_dsobj < sds_b->sds_dsobj)
+ return (-1);
+ if (sds_a->sds_dsobj == sds_b->sds_dsobj)
+ return (0);
+ return (1);
+}
+
+static void
+scan_ds_queue_clear(dsl_scan_t *scn)
+{
+ void *cookie = NULL;
+ scan_ds_t *sds;
+ while ((sds = avl_destroy_nodes(&scn->scn_queue, &cookie)) != NULL) {
+ kmem_free(sds, sizeof (*sds));
+ }
+}
+
+static boolean_t
+scan_ds_queue_contains(dsl_scan_t *scn, uint64_t dsobj, uint64_t *txg)
+{
+ scan_ds_t srch, *sds;
+
+ srch.sds_dsobj = dsobj;
+ sds = avl_find(&scn->scn_queue, &srch, NULL);
+ if (sds != NULL && txg != NULL)
+ *txg = sds->sds_txg;
+ return (sds != NULL);
+}
+
+static void
+scan_ds_queue_insert(dsl_scan_t *scn, uint64_t dsobj, uint64_t txg)
+{
+ scan_ds_t *sds;
+ avl_index_t where;
+
+ sds = kmem_zalloc(sizeof (*sds), KM_SLEEP);
+ sds->sds_dsobj = dsobj;
+ sds->sds_txg = txg;
+
+ VERIFY3P(avl_find(&scn->scn_queue, sds, &where), ==, NULL);
+ avl_insert(&scn->scn_queue, sds, where);
+}
+
+static void
+scan_ds_queue_remove(dsl_scan_t *scn, uint64_t dsobj)
+{
+ scan_ds_t srch, *sds;
+
+ srch.sds_dsobj = dsobj;
+
+ sds = avl_find(&scn->scn_queue, &srch, NULL);
+ VERIFY(sds != NULL);
+ avl_remove(&scn->scn_queue, sds);
+ kmem_free(sds, sizeof (*sds));
+}
+
+static void
+scan_ds_queue_sync(dsl_scan_t *scn, dmu_tx_t *tx)
+{
+ dsl_pool_t *dp = scn->scn_dp;
+ spa_t *spa = dp->dp_spa;
+ dmu_object_type_t ot = (spa_version(spa) >= SPA_VERSION_DSL_SCRUB) ?
+ DMU_OT_SCAN_QUEUE : DMU_OT_ZAP_OTHER;
+
+ ASSERT0(scn->scn_bytes_pending);
+ ASSERT(scn->scn_phys.scn_queue_obj != 0);
+
+ VERIFY0(dmu_object_free(dp->dp_meta_objset,
+ scn->scn_phys.scn_queue_obj, tx));
+ scn->scn_phys.scn_queue_obj = zap_create(dp->dp_meta_objset, ot,
+ DMU_OT_NONE, 0, tx);
+ for (scan_ds_t *sds = avl_first(&scn->scn_queue);
+ sds != NULL; sds = AVL_NEXT(&scn->scn_queue, sds)) {
+ VERIFY0(zap_add_int_key(dp->dp_meta_objset,
+ scn->scn_phys.scn_queue_obj, sds->sds_dsobj,
+ sds->sds_txg, tx));
+ }
+}
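+
+ /*
+ * The helpers above keep an in-memory AVL of (dsobj, txg) pairs that
+ * mirrors the on-disk DMU_POOL_SCAN queue ZAP. For example, a dataset
+ * is enqueued with scan_ds_queue_insert(scn, dsobj, txg), tested with
+ * scan_ds_queue_contains(), and the whole AVL is rewritten to the ZAP
+ * by scan_ds_queue_sync() when the scan state is synced out.
+ */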
+
+/*
+ * Computes the memory limit state that we're currently in. A sorted scan
+ * needs quite a bit of memory to hold the sorting queue, so we need to
+ * reasonably constrain the size so it doesn't impact overall system
+ * performance. We compute two limits:
+ * 1) Hard memory limit: if the amount of memory used by the sorting
+ * queues on a pool gets above this value, we stop the metadata
+ * scanning portion and start issuing the queued up and sorted
+ * I/Os to reduce memory usage.
+ * This limit is calculated as a fraction of physmem (by default 5%).
+ * We constrain the lower bound of the hard limit to an absolute
+ * minimum of zfs_scan_mem_lim_min (default: 16 MiB). We also constrain
+ * the upper bound to 5% of the total pool size - no chance we'll
+ * ever need that much memory, but just to keep the value in check.
+ * 2) Soft memory limit: once we hit the hard memory limit, we start
+ * issuing I/O to reduce queue memory usage, but we don't want to
+ * completely empty out the queues, since we might be able to find I/Os
+ * that will fill in the gaps of our non-sequential IOs at some point
+ * in the future. So once the hard limit is hit we keep issuing I/O
+ * until the amount of memory used drops below the soft limit, at which
+ * point we stop issuing I/O and go back to scanning metadata.
+ *
+ * This limit is calculated by subtracting a fraction of the hard
+ * limit from the hard limit. By default this fraction is 5%, so
+ * the soft limit is 95% of the hard limit. We cap the size of the
+ * difference between the hard and soft limits at an absolute
+ * maximum of zfs_scan_mem_lim_soft_max (default: 128 MiB) - this is
+ * sufficient to not cause too frequent switching between the
+ * metadata scan and I/O issue (even at 2k recordsize, 128 MiB's
+ * worth of queues is about 1.2 GiB of on-pool data, so scanning
+ * that should take at least a decent fraction of a second).
+ */
+static boolean_t
+dsl_scan_should_clear(dsl_scan_t *scn)
+{
+ spa_t *spa = scn->scn_dp->dp_spa;
+ vdev_t *rvd = scn->scn_dp->dp_spa->spa_root_vdev;
+ uint64_t alloc, mlim_hard, mlim_soft, mused;
+
+ alloc = metaslab_class_get_alloc(spa_normal_class(spa));
+ alloc += metaslab_class_get_alloc(spa_special_class(spa));
+ alloc += metaslab_class_get_alloc(spa_dedup_class(spa));
+
+ mlim_hard = MAX((physmem / zfs_scan_mem_lim_fact) * PAGESIZE,
+ zfs_scan_mem_lim_min);
+ mlim_hard = MIN(mlim_hard, alloc / 20);
+ mlim_soft = mlim_hard - MIN(mlim_hard / zfs_scan_mem_lim_soft_fact,
+ zfs_scan_mem_lim_soft_max);
+ mused = 0;
+ for (uint64_t i = 0; i < rvd->vdev_children; i++) {
+ vdev_t *tvd = rvd->vdev_child[i];
+ dsl_scan_io_queue_t *queue;
+
+ mutex_enter(&tvd->vdev_scan_io_queue_lock);
+ queue = tvd->vdev_scan_io_queue;
+ if (queue != NULL) {
+ /* # extents in exts_by_size = # in exts_by_addr */
+ mused += zfs_btree_numnodes(&queue->q_exts_by_size) *
+ sizeof (range_seg_gap_t) + queue->q_sio_memused;
+ }
+ mutex_exit(&tvd->vdev_scan_io_queue_lock);
+ }
+
+ dprintf("current scan memory usage: %llu bytes\n", (longlong_t)mused);
+
+ if (mused == 0)
+ ASSERT0(scn->scn_bytes_pending);
+
+ /*
+ * If we are above our hard limit, we need to clear out memory.
+ * If we are below our soft limit, we need to accumulate sequential IOs.
+ * Otherwise, we should keep doing whatever we are currently doing.
+ */
+ if (mused >= mlim_hard)
+ return (B_TRUE);
+ else if (mused < mlim_soft)
+ return (B_FALSE);
+ else
+ return (scn->scn_clearing);
+}
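+
+ /*
+ * Worked example, assuming the defaults described above (5% of physmem,
+ * 16 MiB minimum, 5% soft fraction, 128 MiB soft cap) on a hypothetical
+ * system with 32 GiB of RAM and 10 TiB allocated in the pool:
+ *
+ * mlim_hard = MAX(32 GiB / 20, 16 MiB) = 1.6 GiB
+ * mlim_hard = MIN(1.6 GiB, 10 TiB / 20) = 1.6 GiB
+ * mlim_soft = 1.6 GiB - MIN(1.6 GiB / 20, 128 MiB)
+ * = 1.6 GiB - 81.92 MiB ~= 1.52 GiB
+ *
+ * so queue memory above 1.6 GiB forces I/O issuing, and issuing stops
+ * again once usage falls below roughly 1.52 GiB.
+ */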
+
+static boolean_t
+dsl_scan_check_suspend(dsl_scan_t *scn, const zbookmark_phys_t *zb)
+{
+ /* we never skip user/group accounting objects */
+ if (zb && (int64_t)zb->zb_object < 0)
+ return (B_FALSE);
+
+ if (scn->scn_suspending)
+ return (B_TRUE); /* we're already suspending */
+
+ if (!ZB_IS_ZERO(&scn->scn_phys.scn_bookmark))
+ return (B_FALSE); /* we're resuming */
+
+ /* We only know how to resume from level-0 and objset blocks. */
+ if (zb && (zb->zb_level != 0 && zb->zb_level != ZB_ROOT_LEVEL))
+ return (B_FALSE);
+
+ /*
+ * We suspend if:
+ * - we have scanned for at least the minimum time (default 1 sec
+ * for scrub, 3 sec for resilver), and either we have sufficient
+ * dirty data that we are starting to write more quickly
+ * (default 30%), someone is explicitly waiting for this txg
+ * to complete, or we have used up all of the time in the txg
+ * timeout (default 5 sec).
+ * or
+ * - the spa is shutting down because this pool is being exported
+ * or the machine is rebooting.
+ * or
+ * - the scan queue has reached its memory use limit
+ */
+ uint64_t curr_time_ns = gethrtime();
+ uint64_t scan_time_ns = curr_time_ns - scn->scn_sync_start_time;
+ uint64_t sync_time_ns = curr_time_ns -
+ scn->scn_dp->dp_spa->spa_sync_starttime;
+ int dirty_pct = scn->scn_dp->dp_dirty_total * 100 / zfs_dirty_data_max;
+ int mintime = (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) ?
+ zfs_resilver_min_time_ms : zfs_scrub_min_time_ms;
+
+ if ((NSEC2MSEC(scan_time_ns) > mintime &&
+ (dirty_pct >= zfs_vdev_async_write_active_min_dirty_percent ||
+ txg_sync_waiting(scn->scn_dp) ||
+ NSEC2SEC(sync_time_ns) >= zfs_txg_timeout)) ||
+ spa_shutting_down(scn->scn_dp->dp_spa) ||
+ (zfs_scan_strict_mem_lim && dsl_scan_should_clear(scn))) {
+ if (zb && zb->zb_level == ZB_ROOT_LEVEL) {
+ dprintf("suspending at first available bookmark "
+ "%llx/%llx/%llx/%llx\n",
+ (longlong_t)zb->zb_objset,
+ (longlong_t)zb->zb_object,
+ (longlong_t)zb->zb_level,
+ (longlong_t)zb->zb_blkid);
+ SET_BOOKMARK(&scn->scn_phys.scn_bookmark,
+ zb->zb_objset, 0, 0, 0);
+ } else if (zb != NULL) {
+ dprintf("suspending at bookmark %llx/%llx/%llx/%llx\n",
+ (longlong_t)zb->zb_objset,
+ (longlong_t)zb->zb_object,
+ (longlong_t)zb->zb_level,
+ (longlong_t)zb->zb_blkid);
+ scn->scn_phys.scn_bookmark = *zb;
+ } else {
+#ifdef ZFS_DEBUG
+ dsl_scan_phys_t *scnp = &scn->scn_phys;
+ dprintf("suspending at at DDT bookmark "
+ "%llx/%llx/%llx/%llx\n",
+ (longlong_t)scnp->scn_ddt_bookmark.ddb_class,
+ (longlong_t)scnp->scn_ddt_bookmark.ddb_type,
+ (longlong_t)scnp->scn_ddt_bookmark.ddb_checksum,
+ (longlong_t)scnp->scn_ddt_bookmark.ddb_cursor);
+#endif
+ }
+ scn->scn_suspending = B_TRUE;
+ return (B_TRUE);
+ }
+ return (B_FALSE);
+}
+
+typedef struct zil_scan_arg {
+ dsl_pool_t *zsa_dp;
+ zil_header_t *zsa_zh;
+} zil_scan_arg_t;
+
+/* ARGSUSED */
+static int
+dsl_scan_zil_block(zilog_t *zilog, const blkptr_t *bp, void *arg,
+ uint64_t claim_txg)
+{
+ zil_scan_arg_t *zsa = arg;
+ dsl_pool_t *dp = zsa->zsa_dp;
+ dsl_scan_t *scn = dp->dp_scan;
+ zil_header_t *zh = zsa->zsa_zh;
+ zbookmark_phys_t zb;
+
+ ASSERT(!BP_IS_REDACTED(bp));
+ if (BP_IS_HOLE(bp) || bp->blk_birth <= scn->scn_phys.scn_cur_min_txg)
+ return (0);
+
+ /*
+ * One block ("stubby") may have been allocated a long time ago; we
+ * want to visit that one because it has been allocated
+ * (on-disk) even if it hasn't been claimed (even though for
+ * scrub there's nothing to do to it).
+ */
+ if (claim_txg == 0 && bp->blk_birth >= spa_min_claim_txg(dp->dp_spa))
+ return (0);
+
+ SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET],
+ ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]);
+
+ VERIFY(0 == scan_funcs[scn->scn_phys.scn_func](dp, bp, &zb));
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+dsl_scan_zil_record(zilog_t *zilog, const lr_t *lrc, void *arg,
+ uint64_t claim_txg)
+{
+ if (lrc->lrc_txtype == TX_WRITE) {
+ zil_scan_arg_t *zsa = arg;
+ dsl_pool_t *dp = zsa->zsa_dp;
+ dsl_scan_t *scn = dp->dp_scan;
+ zil_header_t *zh = zsa->zsa_zh;
+ const lr_write_t *lr = (const lr_write_t *)lrc;
+ const blkptr_t *bp = &lr->lr_blkptr;
+ zbookmark_phys_t zb;
+
+ ASSERT(!BP_IS_REDACTED(bp));
+ if (BP_IS_HOLE(bp) ||
+ bp->blk_birth <= scn->scn_phys.scn_cur_min_txg)
+ return (0);
+
+ /*
+ * birth can be < claim_txg if this record's txg is
+ * already txg sync'ed (but this log block contains
+ * other records that are not synced)
+ */
+ if (claim_txg == 0 || bp->blk_birth < claim_txg)
+ return (0);
+
+ SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET],
+ lr->lr_foid, ZB_ZIL_LEVEL,
+ lr->lr_offset / BP_GET_LSIZE(bp));
+
+ VERIFY(0 == scan_funcs[scn->scn_phys.scn_func](dp, bp, &zb));
+ }
+ return (0);
+}
+
+static void
+dsl_scan_zil(dsl_pool_t *dp, zil_header_t *zh)
+{
+ uint64_t claim_txg = zh->zh_claim_txg;
+ zil_scan_arg_t zsa = { dp, zh };
+ zilog_t *zilog;
+
+ ASSERT(spa_writeable(dp->dp_spa));
+
+ /*
+ * We only want to visit blocks that have been claimed but not yet
+ * replayed (or, in read-only mode, blocks that *would* be claimed).
+ */
+ if (claim_txg == 0)
+ return;
+
+ zilog = zil_alloc(dp->dp_meta_objset, zh);
+
+ (void) zil_parse(zilog, dsl_scan_zil_block, dsl_scan_zil_record, &zsa,
+ claim_txg, B_FALSE);
+
+ zil_free(zilog);
+}
+
+/*
+ * We compare scan_prefetch_issue_ctx_t's based on their bookmarks. The idea
+ * here is to sort the AVL tree by the order each block will be needed.
+ */
+static int
+scan_prefetch_queue_compare(const void *a, const void *b)
+{
+ const scan_prefetch_issue_ctx_t *spic_a = a, *spic_b = b;
+ const scan_prefetch_ctx_t *spc_a = spic_a->spic_spc;
+ const scan_prefetch_ctx_t *spc_b = spic_b->spic_spc;
+
+ return (zbookmark_compare(spc_a->spc_datablkszsec,
+ spc_a->spc_indblkshift, spc_b->spc_datablkszsec,
+ spc_b->spc_indblkshift, &spic_a->spic_zb, &spic_b->spic_zb));
+}
+
+static void
+scan_prefetch_ctx_rele(scan_prefetch_ctx_t *spc, void *tag)
+{
+ if (zfs_refcount_remove(&spc->spc_refcnt, tag) == 0) {
+ zfs_refcount_destroy(&spc->spc_refcnt);
+ kmem_free(spc, sizeof (scan_prefetch_ctx_t));
+ }
+}
+
+static scan_prefetch_ctx_t *
+scan_prefetch_ctx_create(dsl_scan_t *scn, dnode_phys_t *dnp, void *tag)
+{
+ scan_prefetch_ctx_t *spc;
+
+ spc = kmem_alloc(sizeof (scan_prefetch_ctx_t), KM_SLEEP);
+ zfs_refcount_create(&spc->spc_refcnt);
+ zfs_refcount_add(&spc->spc_refcnt, tag);
+ spc->spc_scn = scn;
+ if (dnp != NULL) {
+ spc->spc_datablkszsec = dnp->dn_datablkszsec;
+ spc->spc_indblkshift = dnp->dn_indblkshift;
+ spc->spc_root = B_FALSE;
+ } else {
+ spc->spc_datablkszsec = 0;
+ spc->spc_indblkshift = 0;
+ spc->spc_root = B_TRUE;
+ }
+
+ return (spc);
+}
+
+static void
+scan_prefetch_ctx_add_ref(scan_prefetch_ctx_t *spc, void *tag)
+{
+ zfs_refcount_add(&spc->spc_refcnt, tag);
+}
+
+static void
+scan_ds_prefetch_queue_clear(dsl_scan_t *scn)
+{
+ spa_t *spa = scn->scn_dp->dp_spa;
+ void *cookie = NULL;
+ scan_prefetch_issue_ctx_t *spic = NULL;
+
+ mutex_enter(&spa->spa_scrub_lock);
+ while ((spic = avl_destroy_nodes(&scn->scn_prefetch_queue,
+ &cookie)) != NULL) {
+ scan_prefetch_ctx_rele(spic->spic_spc, scn);
+ kmem_free(spic, sizeof (scan_prefetch_issue_ctx_t));
+ }
+ mutex_exit(&spa->spa_scrub_lock);
+}
+
+static boolean_t
+dsl_scan_check_prefetch_resume(scan_prefetch_ctx_t *spc,
+ const zbookmark_phys_t *zb)
+{
+ zbookmark_phys_t *last_zb = &spc->spc_scn->scn_prefetch_bookmark;
+ dnode_phys_t tmp_dnp;
+ dnode_phys_t *dnp = (spc->spc_root) ? NULL : &tmp_dnp;
+
+ if (zb->zb_objset != last_zb->zb_objset)
+ return (B_TRUE);
+ if ((int64_t)zb->zb_object < 0)
+ return (B_FALSE);
+
+ tmp_dnp.dn_datablkszsec = spc->spc_datablkszsec;
+ tmp_dnp.dn_indblkshift = spc->spc_indblkshift;
+
+ if (zbookmark_subtree_completed(dnp, zb, last_zb))
+ return (B_TRUE);
+
+ return (B_FALSE);
+}
+
+static void
+dsl_scan_prefetch(scan_prefetch_ctx_t *spc, blkptr_t *bp, zbookmark_phys_t *zb)
+{
+ avl_index_t idx;
+ dsl_scan_t *scn = spc->spc_scn;
+ spa_t *spa = scn->scn_dp->dp_spa;
+ scan_prefetch_issue_ctx_t *spic;
+
+ if (zfs_no_scrub_prefetch || BP_IS_REDACTED(bp))
+ return;
+
+ if (BP_IS_HOLE(bp) || bp->blk_birth <= scn->scn_phys.scn_cur_min_txg ||
+ (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE &&
+ BP_GET_TYPE(bp) != DMU_OT_OBJSET))
+ return;
+
+ if (dsl_scan_check_prefetch_resume(spc, zb))
+ return;
+
+ scan_prefetch_ctx_add_ref(spc, scn);
+ spic = kmem_alloc(sizeof (scan_prefetch_issue_ctx_t), KM_SLEEP);
+ spic->spic_spc = spc;
+ spic->spic_bp = *bp;
+ spic->spic_zb = *zb;
+
+ /*
+ * Add the IO to the queue of blocks to prefetch. This allows us to
+ * prioritize blocks that we will need first for the main traversal
+ * thread.
+ */
+ mutex_enter(&spa->spa_scrub_lock);
+ if (avl_find(&scn->scn_prefetch_queue, spic, &idx) != NULL) {
+ /* this block is already queued for prefetch */
+ kmem_free(spic, sizeof (scan_prefetch_issue_ctx_t));
+ scan_prefetch_ctx_rele(spc, scn);
+ mutex_exit(&spa->spa_scrub_lock);
+ return;
+ }
+
+ avl_insert(&scn->scn_prefetch_queue, spic, idx);
+ cv_broadcast(&spa->spa_scrub_io_cv);
+ mutex_exit(&spa->spa_scrub_lock);
+}
+
+static void
+dsl_scan_prefetch_dnode(dsl_scan_t *scn, dnode_phys_t *dnp,
+ uint64_t objset, uint64_t object)
+{
+ int i;
+ zbookmark_phys_t zb;
+ scan_prefetch_ctx_t *spc;
+
+ if (dnp->dn_nblkptr == 0 && !(dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR))
+ return;
+
+ SET_BOOKMARK(&zb, objset, object, 0, 0);
+
+ spc = scan_prefetch_ctx_create(scn, dnp, FTAG);
+
+ for (i = 0; i < dnp->dn_nblkptr; i++) {
+ zb.zb_level = BP_GET_LEVEL(&dnp->dn_blkptr[i]);
+ zb.zb_blkid = i;
+ dsl_scan_prefetch(spc, &dnp->dn_blkptr[i], &zb);
+ }
+
+ if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
+ zb.zb_level = 0;
+ zb.zb_blkid = DMU_SPILL_BLKID;
+ dsl_scan_prefetch(spc, DN_SPILL_BLKPTR(dnp), &zb);
+ }
+
+ scan_prefetch_ctx_rele(spc, FTAG);
+}
+
+static void
+dsl_scan_prefetch_cb(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
+ arc_buf_t *buf, void *private)
+{
+ scan_prefetch_ctx_t *spc = private;
+ dsl_scan_t *scn = spc->spc_scn;
+ spa_t *spa = scn->scn_dp->dp_spa;
+
+ /* broadcast that the IO has completed for rate limiting purposes */
+ mutex_enter(&spa->spa_scrub_lock);
+ ASSERT3U(spa->spa_scrub_inflight, >=, BP_GET_PSIZE(bp));
+ spa->spa_scrub_inflight -= BP_GET_PSIZE(bp);
+ cv_broadcast(&spa->spa_scrub_io_cv);
+ mutex_exit(&spa->spa_scrub_lock);
+
+ /* if there was an error or we are done prefetching, just cleanup */
+ if (buf == NULL || scn->scn_prefetch_stop)
+ goto out;
+
+ if (BP_GET_LEVEL(bp) > 0) {
+ int i;
+ blkptr_t *cbp;
+ int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
+ zbookmark_phys_t czb;
+
+ for (i = 0, cbp = buf->b_data; i < epb; i++, cbp++) {
+ SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
+ zb->zb_level - 1, zb->zb_blkid * epb + i);
+ dsl_scan_prefetch(spc, cbp, &czb);
+ }
+ } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
+ dnode_phys_t *cdnp;
+ int i;
+ int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
+
+ for (i = 0, cdnp = buf->b_data; i < epb;
+ i += cdnp->dn_extra_slots + 1,
+ cdnp += cdnp->dn_extra_slots + 1) {
+ dsl_scan_prefetch_dnode(scn, cdnp,
+ zb->zb_objset, zb->zb_blkid * epb + i);
+ }
+ } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
+ objset_phys_t *osp = buf->b_data;
+
+ dsl_scan_prefetch_dnode(scn, &osp->os_meta_dnode,
+ zb->zb_objset, DMU_META_DNODE_OBJECT);
+
+ if (OBJSET_BUF_HAS_USERUSED(buf)) {
+ dsl_scan_prefetch_dnode(scn,
+ &osp->os_groupused_dnode, zb->zb_objset,
+ DMU_GROUPUSED_OBJECT);
+ dsl_scan_prefetch_dnode(scn,
+ &osp->os_userused_dnode, zb->zb_objset,
+ DMU_USERUSED_OBJECT);
+ }
+ }
+
+out:
+ if (buf != NULL)
+ arc_buf_destroy(buf, private);
+ scan_prefetch_ctx_rele(spc, scn);
+}
+
+/* ARGSUSED */
+static void
+dsl_scan_prefetch_thread(void *arg)
+{
+ dsl_scan_t *scn = arg;
+ spa_t *spa = scn->scn_dp->dp_spa;
+ scan_prefetch_issue_ctx_t *spic;
+
+ /* loop until we are told to stop */
+ while (!scn->scn_prefetch_stop) {
+ arc_flags_t flags = ARC_FLAG_NOWAIT |
+ ARC_FLAG_PRESCIENT_PREFETCH | ARC_FLAG_PREFETCH;
+ int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD;
+
+ mutex_enter(&spa->spa_scrub_lock);
+
+ /*
+ * Wait until we have an IO to issue and are not above our
+ * maximum in flight limit.
+ */
+ while (!scn->scn_prefetch_stop &&
+ (avl_numnodes(&scn->scn_prefetch_queue) == 0 ||
+ spa->spa_scrub_inflight >= scn->scn_maxinflight_bytes)) {
+ cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
+ }
+
+ /* recheck if we should stop since we waited for the cv */
+ if (scn->scn_prefetch_stop) {
+ mutex_exit(&spa->spa_scrub_lock);
+ break;
+ }
+
+ /* remove the prefetch IO from the tree */
+ spic = avl_first(&scn->scn_prefetch_queue);
+ spa->spa_scrub_inflight += BP_GET_PSIZE(&spic->spic_bp);
+ avl_remove(&scn->scn_prefetch_queue, spic);
+
+ mutex_exit(&spa->spa_scrub_lock);
+
+ if (BP_IS_PROTECTED(&spic->spic_bp)) {
+ ASSERT(BP_GET_TYPE(&spic->spic_bp) == DMU_OT_DNODE ||
+ BP_GET_TYPE(&spic->spic_bp) == DMU_OT_OBJSET);
+ ASSERT3U(BP_GET_LEVEL(&spic->spic_bp), ==, 0);
+ zio_flags |= ZIO_FLAG_RAW;
+ }
+
+ /* issue the prefetch asynchronously */
+ (void) arc_read(scn->scn_zio_root, scn->scn_dp->dp_spa,
+ &spic->spic_bp, dsl_scan_prefetch_cb, spic->spic_spc,
+ ZIO_PRIORITY_SCRUB, zio_flags, &flags, &spic->spic_zb);
+
+ kmem_free(spic, sizeof (scan_prefetch_issue_ctx_t));
+ }
+
+ ASSERT(scn->scn_prefetch_stop);
+
+ /* free any prefetches we didn't get to complete */
+ mutex_enter(&spa->spa_scrub_lock);
+ while ((spic = avl_first(&scn->scn_prefetch_queue)) != NULL) {
+ avl_remove(&scn->scn_prefetch_queue, spic);
+ scan_prefetch_ctx_rele(spic->spic_spc, scn);
+ kmem_free(spic, sizeof (scan_prefetch_issue_ctx_t));
+ }
+ ASSERT0(avl_numnodes(&scn->scn_prefetch_queue));
+ mutex_exit(&spa->spa_scrub_lock);
+}
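+
+ /*
+ * To summarize the prefetch machinery above: the main traversal
+ * enqueues upcoming blocks with dsl_scan_prefetch(), sorted by bookmark
+ * so the block needed soonest sits at the head of scn_prefetch_queue;
+ * this thread pulls entries off the head (bounded by
+ * scn_maxinflight_bytes) and issues them with arc_read(), whose
+ * callback dsl_scan_prefetch_cb() in turn enqueues the children of
+ * indirect, dnode and objset blocks.
+ */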
+
+static boolean_t
+dsl_scan_check_resume(dsl_scan_t *scn, const dnode_phys_t *dnp,
+ const zbookmark_phys_t *zb)
+{
+ /*
+ * We never skip over user/group accounting objects (obj<0)
+ */
+ if (!ZB_IS_ZERO(&scn->scn_phys.scn_bookmark) &&
+ (int64_t)zb->zb_object >= 0) {
+ /*
+ * If we already visited this bp & everything below (in
+ * a prior txg sync), don't bother doing it again.
+ */
+ if (zbookmark_subtree_completed(dnp, zb,
+ &scn->scn_phys.scn_bookmark))
+ return (B_TRUE);
+
+ /*
+ * If we found the block we're trying to resume from, or
+ * we went past it to a different object, zero it out to
+ * indicate that it's OK to start checking for suspending
+ * again.
+ */
+ if (bcmp(zb, &scn->scn_phys.scn_bookmark, sizeof (*zb)) == 0 ||
+ zb->zb_object > scn->scn_phys.scn_bookmark.zb_object) {
+ dprintf("resuming at %llx/%llx/%llx/%llx\n",
+ (longlong_t)zb->zb_objset,
+ (longlong_t)zb->zb_object,
+ (longlong_t)zb->zb_level,
+ (longlong_t)zb->zb_blkid);
+ bzero(&scn->scn_phys.scn_bookmark, sizeof (*zb));
+ }
+ }
+ return (B_FALSE);
+}
+
+static void dsl_scan_visitbp(blkptr_t *bp, const zbookmark_phys_t *zb,
+ dnode_phys_t *dnp, dsl_dataset_t *ds, dsl_scan_t *scn,
+ dmu_objset_type_t ostype, dmu_tx_t *tx);
+inline __attribute__((always_inline)) static void dsl_scan_visitdnode(
+ dsl_scan_t *, dsl_dataset_t *ds, dmu_objset_type_t ostype,
+ dnode_phys_t *dnp, uint64_t object, dmu_tx_t *tx);
+
+/*
+ * Return nonzero on i/o error.
+ */
+inline __attribute__((always_inline)) static int
+dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype,
+ dnode_phys_t *dnp, const blkptr_t *bp,
+ const zbookmark_phys_t *zb, dmu_tx_t *tx)
+{
+ dsl_pool_t *dp = scn->scn_dp;
+ int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD;
+ int err;
+
+ ASSERT(!BP_IS_REDACTED(bp));
+
+ if (BP_GET_LEVEL(bp) > 0) {
+ arc_flags_t flags = ARC_FLAG_WAIT;
+ int i;
+ blkptr_t *cbp;
+ int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
+ arc_buf_t *buf;
+
+ err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf,
+ ZIO_PRIORITY_SCRUB, zio_flags, &flags, zb);
+ if (err) {
+ scn->scn_phys.scn_errors++;
+ return (err);
+ }
+ for (i = 0, cbp = buf->b_data; i < epb; i++, cbp++) {
+ zbookmark_phys_t czb;
+
+ SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
+ zb->zb_level - 1,
+ zb->zb_blkid * epb + i);
+ dsl_scan_visitbp(cbp, &czb, dnp,
+ ds, scn, ostype, tx);
+ }
+ arc_buf_destroy(buf, &buf);
+ } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
+ arc_flags_t flags = ARC_FLAG_WAIT;
+ dnode_phys_t *cdnp;
+ int i;
+ int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
+ arc_buf_t *buf;
+
+ if (BP_IS_PROTECTED(bp)) {
+ ASSERT3U(BP_GET_COMPRESS(bp), ==, ZIO_COMPRESS_OFF);
+ zio_flags |= ZIO_FLAG_RAW;
+ }
+
+ err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf,
+ ZIO_PRIORITY_SCRUB, zio_flags, &flags, zb);
+ if (err) {
+ scn->scn_phys.scn_errors++;
+ return (err);
+ }
+ for (i = 0, cdnp = buf->b_data; i < epb;
+ i += cdnp->dn_extra_slots + 1,
+ cdnp += cdnp->dn_extra_slots + 1) {
+ dsl_scan_visitdnode(scn, ds, ostype,
+ cdnp, zb->zb_blkid * epb + i, tx);
+ }
+
+ arc_buf_destroy(buf, &buf);
+ } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
+ arc_flags_t flags = ARC_FLAG_WAIT;
+ objset_phys_t *osp;
+ arc_buf_t *buf;
+
+ err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf,
+ ZIO_PRIORITY_SCRUB, zio_flags, &flags, zb);
+ if (err) {
+ scn->scn_phys.scn_errors++;
+ return (err);
+ }
+
+ osp = buf->b_data;
+
+ dsl_scan_visitdnode(scn, ds, osp->os_type,
+ &osp->os_meta_dnode, DMU_META_DNODE_OBJECT, tx);
+
+ if (OBJSET_BUF_HAS_USERUSED(buf)) {
+ /*
+ * We also always visit user/group/project accounting
+ * objects, and never skip them, even if we are
+ * suspending. This is necessary so that the
+ * space deltas from this txg get integrated.
+ */
+ if (OBJSET_BUF_HAS_PROJECTUSED(buf))
+ dsl_scan_visitdnode(scn, ds, osp->os_type,
+ &osp->os_projectused_dnode,
+ DMU_PROJECTUSED_OBJECT, tx);
+ dsl_scan_visitdnode(scn, ds, osp->os_type,
+ &osp->os_groupused_dnode,
+ DMU_GROUPUSED_OBJECT, tx);
+ dsl_scan_visitdnode(scn, ds, osp->os_type,
+ &osp->os_userused_dnode,
+ DMU_USERUSED_OBJECT, tx);
+ }
+ arc_buf_destroy(buf, &buf);
+ }
+
+ return (0);
+}
+
+inline __attribute__((always_inline)) static void
+dsl_scan_visitdnode(dsl_scan_t *scn, dsl_dataset_t *ds,
+ dmu_objset_type_t ostype, dnode_phys_t *dnp,
+ uint64_t object, dmu_tx_t *tx)
+{
+ int j;
+
+ for (j = 0; j < dnp->dn_nblkptr; j++) {
+ zbookmark_phys_t czb;
+
+ SET_BOOKMARK(&czb, ds ? ds->ds_object : 0, object,
+ dnp->dn_nlevels - 1, j);
+ dsl_scan_visitbp(&dnp->dn_blkptr[j],
+ &czb, dnp, ds, scn, ostype, tx);
+ }
+
+ if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
+ zbookmark_phys_t czb;
+ SET_BOOKMARK(&czb, ds ? ds->ds_object : 0, object,
+ 0, DMU_SPILL_BLKID);
+ dsl_scan_visitbp(DN_SPILL_BLKPTR(dnp),
+ &czb, dnp, ds, scn, ostype, tx);
+ }
+}
+
+/*
+ * The arguments are in this order because mdb can only print the
+ * first 5; we want them to be useful.
+ */
+static void
+dsl_scan_visitbp(blkptr_t *bp, const zbookmark_phys_t *zb,
+ dnode_phys_t *dnp, dsl_dataset_t *ds, dsl_scan_t *scn,
+ dmu_objset_type_t ostype, dmu_tx_t *tx)
+{
+ dsl_pool_t *dp = scn->scn_dp;
+ blkptr_t *bp_toread = NULL;
+
+ if (dsl_scan_check_suspend(scn, zb))
+ return;
+
+ if (dsl_scan_check_resume(scn, dnp, zb))
+ return;
+
+ scn->scn_visited_this_txg++;
+
+ /*
+ * This debugging is commented out to conserve stack space. This
+ * function is called recursively and the debugging adds several
+ * bytes to the stack for each call. It can be commented back in
+ * if required to debug an issue in dsl_scan_visitbp().
+ *
+ * dprintf_bp(bp,
+ * "visiting ds=%p/%llu zb=%llx/%llx/%llx/%llx bp=%p",
+ * ds, ds ? ds->ds_object : 0,
+ * zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid,
+ * bp);
+ */
+
+ if (BP_IS_HOLE(bp)) {
+ scn->scn_holes_this_txg++;
+ return;
+ }
+
+ if (BP_IS_REDACTED(bp)) {
+ ASSERT(dsl_dataset_feature_is_active(ds,
+ SPA_FEATURE_REDACTED_DATASETS));
+ return;
+ }
+
+ if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg) {
+ scn->scn_lt_min_this_txg++;
+ return;
+ }
+
+ bp_toread = kmem_alloc(sizeof (blkptr_t), KM_SLEEP);
+ *bp_toread = *bp;
+
+ if (dsl_scan_recurse(scn, ds, ostype, dnp, bp_toread, zb, tx) != 0)
+ goto out;
+
+ /*
+ * If dsl_scan_ddt() has already visited this block, it will have
+ * already done any translations or scrubbing, so don't call the
+ * callback again.
+ */
+ if (ddt_class_contains(dp->dp_spa,
+ scn->scn_phys.scn_ddt_class_max, bp)) {
+ scn->scn_ddt_contained_this_txg++;
+ goto out;
+ }
+
+ /*
+ * If this block is from the future (after cur_max_txg), then we
+ * are doing this on behalf of a deleted snapshot, and we will
+ * revisit the future block on the next pass of this dataset.
+ * Don't scan it now unless we need to because something
+ * under it was modified.
+ */
+ if (BP_PHYSICAL_BIRTH(bp) > scn->scn_phys.scn_cur_max_txg) {
+ scn->scn_gt_max_this_txg++;
+ goto out;
+ }
+
+ scan_funcs[scn->scn_phys.scn_func](dp, bp, zb);
+
+out:
+ kmem_free(bp_toread, sizeof (blkptr_t));
+}
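+
+ /*
+ * Taken together, the traversal above works roughly as follows:
+ * dsl_scan_visitbp() filters out holes, redacted blocks and blocks born
+ * before cur_min_txg, then dsl_scan_recurse() reads the block and
+ * recurses into indirect children, per-dnode block pointers
+ * (dsl_scan_visitdnode()) and the objset's meta/accounting dnodes, and
+ * finally the block itself is handed to scan_funcs[scn_func] unless the
+ * DDT pass already covered it or it was born after cur_max_txg.
+ */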
+
+static void
+dsl_scan_visit_rootbp(dsl_scan_t *scn, dsl_dataset_t *ds, blkptr_t *bp,
+ dmu_tx_t *tx)
+{
+ zbookmark_phys_t zb;
+ scan_prefetch_ctx_t *spc;
+
+ SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
+ ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
+
+ if (ZB_IS_ZERO(&scn->scn_phys.scn_bookmark)) {
+ SET_BOOKMARK(&scn->scn_prefetch_bookmark,
+ zb.zb_objset, 0, 0, 0);
+ } else {
+ scn->scn_prefetch_bookmark = scn->scn_phys.scn_bookmark;
+ }
+
+ scn->scn_objsets_visited_this_txg++;
+
+ spc = scan_prefetch_ctx_create(scn, NULL, FTAG);
+ dsl_scan_prefetch(spc, bp, &zb);
+ scan_prefetch_ctx_rele(spc, FTAG);
+
+ dsl_scan_visitbp(bp, &zb, NULL, ds, scn, DMU_OST_NONE, tx);
+
+ dprintf_ds(ds, "finished scan%s", "");
+}
+
+static void
+ds_destroyed_scn_phys(dsl_dataset_t *ds, dsl_scan_phys_t *scn_phys)
+{
+ if (scn_phys->scn_bookmark.zb_objset == ds->ds_object) {
+ if (ds->ds_is_snapshot) {
+ /*
+ * Note:
+ * - scn_cur_{min,max}_txg stays the same.
+ * - Setting the flag is not really necessary if
+ * scn_cur_max_txg == scn_max_txg, because there
+ * is nothing after this snapshot that we care
+ * about. However, we set it anyway and then
+ * ignore it when we retraverse it in
+ * dsl_scan_visitds().
+ */
+ scn_phys->scn_bookmark.zb_objset =
+ dsl_dataset_phys(ds)->ds_next_snap_obj;
+ zfs_dbgmsg("destroying ds %llu; currently traversing; "
+ "reset zb_objset to %llu",
+ (u_longlong_t)ds->ds_object,
+ (u_longlong_t)dsl_dataset_phys(ds)->
+ ds_next_snap_obj);
+ scn_phys->scn_flags |= DSF_VISIT_DS_AGAIN;
+ } else {
+ SET_BOOKMARK(&scn_phys->scn_bookmark,
+ ZB_DESTROYED_OBJSET, 0, 0, 0);
+ zfs_dbgmsg("destroying ds %llu; currently traversing; "
+ "reset bookmark to -1,0,0,0",
+ (u_longlong_t)ds->ds_object);
+ }
+ }
+}
+
+/*
+ * Invoked when a dataset is destroyed. We need to make sure that:
+ *
+ * 1) If it is the dataset that was currently being scanned, we write
+ * a new dsl_scan_phys_t, marking the objset reference in it
+ * as destroyed.
+ * 2) Remove it from the work queue, if it was present.
+ *
+ * If the dataset was actually a snapshot, instead of marking the dataset
+ * as destroyed, we instead substitute the next snapshot in line.
+ */
+void
+dsl_scan_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx)
+{
+ dsl_pool_t *dp = ds->ds_dir->dd_pool;
+ dsl_scan_t *scn = dp->dp_scan;
+ uint64_t mintxg;
+
+ if (!dsl_scan_is_running(scn))
+ return;
+
+ ds_destroyed_scn_phys(ds, &scn->scn_phys);
+ ds_destroyed_scn_phys(ds, &scn->scn_phys_cached);
+
+ if (scan_ds_queue_contains(scn, ds->ds_object, &mintxg)) {
+ scan_ds_queue_remove(scn, ds->ds_object);
+ if (ds->ds_is_snapshot)
+ scan_ds_queue_insert(scn,
+ dsl_dataset_phys(ds)->ds_next_snap_obj, mintxg);
+ }
+
+ if (zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj,
+ ds->ds_object, &mintxg) == 0) {
+ ASSERT3U(dsl_dataset_phys(ds)->ds_num_children, <=, 1);
+ VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
+ scn->scn_phys.scn_queue_obj, ds->ds_object, tx));
+ if (ds->ds_is_snapshot) {
+ /*
+ * We keep the same mintxg; it could be >
+ * ds_creation_txg if the previous snapshot was
+ * deleted too.
+ */
+ VERIFY(zap_add_int_key(dp->dp_meta_objset,
+ scn->scn_phys.scn_queue_obj,
+ dsl_dataset_phys(ds)->ds_next_snap_obj,
+ mintxg, tx) == 0);
+ zfs_dbgmsg("destroying ds %llu; in queue; "
+ "replacing with %llu",
+ (u_longlong_t)ds->ds_object,
+ (u_longlong_t)dsl_dataset_phys(ds)->
+ ds_next_snap_obj);
+ } else {
+ zfs_dbgmsg("destroying ds %llu; in queue; removing",
+ (u_longlong_t)ds->ds_object);
+ }
+ }
+
+ /*
+ * dsl_scan_sync() should be called after this, and should sync
+ * out our changed state, but just to be safe, do it here.
+ */
+ dsl_scan_sync_state(scn, tx, SYNC_CACHED);
+}
+
+static void
+ds_snapshotted_bookmark(dsl_dataset_t *ds, zbookmark_phys_t *scn_bookmark)
+{
+ if (scn_bookmark->zb_objset == ds->ds_object) {
+ scn_bookmark->zb_objset =
+ dsl_dataset_phys(ds)->ds_prev_snap_obj;
+ zfs_dbgmsg("snapshotting ds %llu; currently traversing; "
+ "reset zb_objset to %llu",
+ (u_longlong_t)ds->ds_object,
+ (u_longlong_t)dsl_dataset_phys(ds)->ds_prev_snap_obj);
+ }
+}
+
+/*
+ * Called when a dataset is snapshotted. If we were currently traversing
+ * this snapshot, we reset our bookmark to point at the newly created
+ * snapshot. We also modify our work queue to remove the old snapshot and
+ * replace with the new one.
+ */
+void
+dsl_scan_ds_snapshotted(dsl_dataset_t *ds, dmu_tx_t *tx)
+{
+ dsl_pool_t *dp = ds->ds_dir->dd_pool;
+ dsl_scan_t *scn = dp->dp_scan;
+ uint64_t mintxg;
+
+ if (!dsl_scan_is_running(scn))
+ return;
+
+ ASSERT(dsl_dataset_phys(ds)->ds_prev_snap_obj != 0);
+
+ ds_snapshotted_bookmark(ds, &scn->scn_phys.scn_bookmark);
+ ds_snapshotted_bookmark(ds, &scn->scn_phys_cached.scn_bookmark);
+
+ if (scan_ds_queue_contains(scn, ds->ds_object, &mintxg)) {
+ scan_ds_queue_remove(scn, ds->ds_object);
+ scan_ds_queue_insert(scn,
+ dsl_dataset_phys(ds)->ds_prev_snap_obj, mintxg);
+ }
+
+ if (zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj,
+ ds->ds_object, &mintxg) == 0) {
+ VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
+ scn->scn_phys.scn_queue_obj, ds->ds_object, tx));
+ VERIFY(zap_add_int_key(dp->dp_meta_objset,
+ scn->scn_phys.scn_queue_obj,
+ dsl_dataset_phys(ds)->ds_prev_snap_obj, mintxg, tx) == 0);
+ zfs_dbgmsg("snapshotting ds %llu; in queue; "
+ "replacing with %llu",
+ (u_longlong_t)ds->ds_object,
+ (u_longlong_t)dsl_dataset_phys(ds)->ds_prev_snap_obj);
+ }
+
+ dsl_scan_sync_state(scn, tx, SYNC_CACHED);
+}
+
+static void
+ds_clone_swapped_bookmark(dsl_dataset_t *ds1, dsl_dataset_t *ds2,
+ zbookmark_phys_t *scn_bookmark)
+{
+ if (scn_bookmark->zb_objset == ds1->ds_object) {
+ scn_bookmark->zb_objset = ds2->ds_object;
+ zfs_dbgmsg("clone_swap ds %llu; currently traversing; "
+ "reset zb_objset to %llu",
+ (u_longlong_t)ds1->ds_object,
+ (u_longlong_t)ds2->ds_object);
+ } else if (scn_bookmark->zb_objset == ds2->ds_object) {
+ scn_bookmark->zb_objset = ds1->ds_object;
+ zfs_dbgmsg("clone_swap ds %llu; currently traversing; "
+ "reset zb_objset to %llu",
+ (u_longlong_t)ds2->ds_object,
+ (u_longlong_t)ds1->ds_object);
+ }
+}
+
+/*
+ * Called when an origin dataset and its clone are swapped. If we were
+ * currently traversing the dataset, we need to switch to traversing the
+ * newly promoted clone.
+ */
+void
+dsl_scan_ds_clone_swapped(dsl_dataset_t *ds1, dsl_dataset_t *ds2, dmu_tx_t *tx)
+{
+ dsl_pool_t *dp = ds1->ds_dir->dd_pool;
+ dsl_scan_t *scn = dp->dp_scan;
+ uint64_t mintxg1, mintxg2;
+ boolean_t ds1_queued, ds2_queued;
+
+ if (!dsl_scan_is_running(scn))
+ return;
+
+ ds_clone_swapped_bookmark(ds1, ds2, &scn->scn_phys.scn_bookmark);
+ ds_clone_swapped_bookmark(ds1, ds2, &scn->scn_phys_cached.scn_bookmark);
+
+ /*
+ * Handle the in-memory scan queue.
+ */
+ ds1_queued = scan_ds_queue_contains(scn, ds1->ds_object, &mintxg1);
+ ds2_queued = scan_ds_queue_contains(scn, ds2->ds_object, &mintxg2);
+
+ /* Sanity checking. */
+ if (ds1_queued) {
+ ASSERT3U(mintxg1, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg);
+ ASSERT3U(mintxg1, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg);
+ }
+ if (ds2_queued) {
+ ASSERT3U(mintxg2, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg);
+ ASSERT3U(mintxg2, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg);
+ }
+
+ if (ds1_queued && ds2_queued) {
+ /*
+ * If both are queued, we don't need to do anything.
+ * The swapping code below would not handle this case correctly,
+ * since we can't insert ds2 if it is already there. That's
+ * because scan_ds_queue_insert() prohibits a duplicate insert
+ * and panics.
+ */
+ } else if (ds1_queued) {
+ scan_ds_queue_remove(scn, ds1->ds_object);
+ scan_ds_queue_insert(scn, ds2->ds_object, mintxg1);
+ } else if (ds2_queued) {
+ scan_ds_queue_remove(scn, ds2->ds_object);
+ scan_ds_queue_insert(scn, ds1->ds_object, mintxg2);
+ }
+
+ /*
+ * Handle the on-disk scan queue.
+ * The on-disk state is an out-of-date version of the in-memory state,
+ * so the in-memory and on-disk values for ds1_queued and ds2_queued may
+ * be different. Therefore we need to apply the swap logic to the
+ * on-disk state independently of the in-memory state.
+ */
+ ds1_queued = zap_lookup_int_key(dp->dp_meta_objset,
+ scn->scn_phys.scn_queue_obj, ds1->ds_object, &mintxg1) == 0;
+ ds2_queued = zap_lookup_int_key(dp->dp_meta_objset,
+ scn->scn_phys.scn_queue_obj, ds2->ds_object, &mintxg2) == 0;
+
+ /* Sanity checking. */
+ if (ds1_queued) {
+ ASSERT3U(mintxg1, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg);
+ ASSERT3U(mintxg1, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg);
+ }
+ if (ds2_queued) {
+ ASSERT3U(mintxg2, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg);
+ ASSERT3U(mintxg2, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg);
+ }
+
+ if (ds1_queued && ds2_queued) {
+ /*
+ * If both are queued, we don't need to do anything.
+ * Alternatively, we could check for EEXIST from
+ * zap_add_int_key() and back out to the original state, but
+ * that would be more work than checking for this case upfront.
+ */
+ } else if (ds1_queued) {
+ VERIFY3S(0, ==, zap_remove_int(dp->dp_meta_objset,
+ scn->scn_phys.scn_queue_obj, ds1->ds_object, tx));
+ VERIFY3S(0, ==, zap_add_int_key(dp->dp_meta_objset,
+ scn->scn_phys.scn_queue_obj, ds2->ds_object, mintxg1, tx));
+ zfs_dbgmsg("clone_swap ds %llu; in queue; "
+ "replacing with %llu",
+ (u_longlong_t)ds1->ds_object,
+ (u_longlong_t)ds2->ds_object);
+ } else if (ds2_queued) {
+ VERIFY3S(0, ==, zap_remove_int(dp->dp_meta_objset,
+ scn->scn_phys.scn_queue_obj, ds2->ds_object, tx));
+ VERIFY3S(0, ==, zap_add_int_key(dp->dp_meta_objset,
+ scn->scn_phys.scn_queue_obj, ds1->ds_object, mintxg2, tx));
+ zfs_dbgmsg("clone_swap ds %llu; in queue; "
+ "replacing with %llu",
+ (u_longlong_t)ds2->ds_object,
+ (u_longlong_t)ds1->ds_object);
+ }
+
+ dsl_scan_sync_state(scn, tx, SYNC_CACHED);
+}
+
+/* ARGSUSED */
+static int
+enqueue_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg)
+{
+ uint64_t originobj = *(uint64_t *)arg;
+ dsl_dataset_t *ds;
+ int err;
+ dsl_scan_t *scn = dp->dp_scan;
+
+ if (dsl_dir_phys(hds->ds_dir)->dd_origin_obj != originobj)
+ return (0);
+
+ err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds);
+ if (err)
+ return (err);
+
+ while (dsl_dataset_phys(ds)->ds_prev_snap_obj != originobj) {
+ dsl_dataset_t *prev;
+ err = dsl_dataset_hold_obj(dp,
+ dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev);
+
+ dsl_dataset_rele(ds, FTAG);
+ if (err)
+ return (err);
+ ds = prev;
+ }
+ scan_ds_queue_insert(scn, ds->ds_object,
+ dsl_dataset_phys(ds)->ds_prev_snap_txg);
+ dsl_dataset_rele(ds, FTAG);
+ return (0);
+}
+
+static void
+dsl_scan_visitds(dsl_scan_t *scn, uint64_t dsobj, dmu_tx_t *tx)
+{
+ dsl_pool_t *dp = scn->scn_dp;
+ dsl_dataset_t *ds;
+
+ VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
+
+ if (scn->scn_phys.scn_cur_min_txg >=
+ scn->scn_phys.scn_max_txg) {
+ /*
+ * This can happen if this snapshot was created after the
+ * scan started, and we already completed a previous snapshot
+ * that was created after the scan started. This snapshot
+ * only references blocks with:
+ *
+ * birth < our ds_creation_txg
+ * cur_min_txg is no less than ds_creation_txg.
+ * We have already visited these blocks.
+ * or
+ * birth > scn_max_txg
+ * The scan requested not to visit these blocks.
+ *
+ * Subsequent snapshots (and clones) can reference our
+ * blocks, or blocks with even higher birth times.
+ * Therefore we do not need to visit them either,
+ * so we do not add them to the work queue.
+ *
+ * Note that checking for cur_min_txg >= cur_max_txg
+ * is not sufficient, because in that case we may need to
+ * visit subsequent snapshots. This happens when min_txg > 0,
+ * which raises cur_min_txg. In this case we will visit
+ * this dataset but skip all of its blocks, because the
+ * rootbp's birth time is < cur_min_txg. Then we will
+ * add the next snapshots/clones to the work queue.
+ */
+ char *dsname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
+ dsl_dataset_name(ds, dsname);
+ zfs_dbgmsg("scanning dataset %llu (%s) is unnecessary because "
+ "cur_min_txg (%llu) >= max_txg (%llu)",
+ (longlong_t)dsobj, dsname,
+ (longlong_t)scn->scn_phys.scn_cur_min_txg,
+ (longlong_t)scn->scn_phys.scn_max_txg);
+ kmem_free(dsname, ZFS_MAX_DATASET_NAME_LEN);
+
+ goto out;
+ }
+
+ /*
+ * Only the ZIL in the head (non-snapshot) is valid. Even though
+ * snapshots can have ZIL block pointers (which may be the same
+ * BP as in the head), they must be ignored. In addition, $ORIGIN
+ * doesn't have an objset (i.e. its ds_bp is a hole), so we don't
+ * need to look for a ZIL in it either. So we traverse the ZIL here,
+ * rather than in dsl_scan_recurse(), because the regular snapshot
+ * block-sharing rules don't apply to it.
+ */
+ if (!dsl_dataset_is_snapshot(ds) &&
+ (dp->dp_origin_snap == NULL ||
+ ds->ds_dir != dp->dp_origin_snap->ds_dir)) {
+ objset_t *os;
+ if (dmu_objset_from_ds(ds, &os) != 0) {
+ goto out;
+ }
+ dsl_scan_zil(dp, &os->os_zil_header);
+ }
+
+ /*
+ * Iterate over the bps in this ds.
+ */
+ dmu_buf_will_dirty(ds->ds_dbuf, tx);
+ rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
+ dsl_scan_visit_rootbp(scn, ds, &dsl_dataset_phys(ds)->ds_bp, tx);
+ rrw_exit(&ds->ds_bp_rwlock, FTAG);
+
+ char *dsname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
+ dsl_dataset_name(ds, dsname);
+ zfs_dbgmsg("scanned dataset %llu (%s) with min=%llu max=%llu; "
+ "suspending=%u",
+ (longlong_t)dsobj, dsname,
+ (longlong_t)scn->scn_phys.scn_cur_min_txg,
+ (longlong_t)scn->scn_phys.scn_cur_max_txg,
+ (int)scn->scn_suspending);
+ kmem_free(dsname, ZFS_MAX_DATASET_NAME_LEN);
+
+ if (scn->scn_suspending)
+ goto out;
+
+ /*
+ * We've finished this pass over this dataset.
+ */
+
+ /*
+ * If we did not completely visit this dataset, do another pass.
+ */
+ if (scn->scn_phys.scn_flags & DSF_VISIT_DS_AGAIN) {
+ zfs_dbgmsg("incomplete pass; visiting again");
+ scn->scn_phys.scn_flags &= ~DSF_VISIT_DS_AGAIN;
+ scan_ds_queue_insert(scn, ds->ds_object,
+ scn->scn_phys.scn_cur_max_txg);
+ goto out;
+ }
+
+ /*
+ * Add descendant datasets to work queue.
+ */
+ if (dsl_dataset_phys(ds)->ds_next_snap_obj != 0) {
+ scan_ds_queue_insert(scn,
+ dsl_dataset_phys(ds)->ds_next_snap_obj,
+ dsl_dataset_phys(ds)->ds_creation_txg);
+ }
+ if (dsl_dataset_phys(ds)->ds_num_children > 1) {
+ boolean_t usenext = B_FALSE;
+ if (dsl_dataset_phys(ds)->ds_next_clones_obj != 0) {
+ uint64_t count;
+ /*
+ * A bug in a previous version of the code could
+ * cause upgrade_clones_cb() to not set
+ * ds_next_snap_obj when it should, leading to a
+ * missing entry. Therefore we can only use the
+ * next_clones_obj when its count is correct.
+ */
+ int err = zap_count(dp->dp_meta_objset,
+ dsl_dataset_phys(ds)->ds_next_clones_obj, &count);
+ if (err == 0 &&
+ count == dsl_dataset_phys(ds)->ds_num_children - 1)
+ usenext = B_TRUE;
+ }
+
+ if (usenext) {
+ zap_cursor_t zc;
+ zap_attribute_t za;
+ for (zap_cursor_init(&zc, dp->dp_meta_objset,
+ dsl_dataset_phys(ds)->ds_next_clones_obj);
+ zap_cursor_retrieve(&zc, &za) == 0;
+ (void) zap_cursor_advance(&zc)) {
+ scan_ds_queue_insert(scn,
+ zfs_strtonum(za.za_name, NULL),
+ dsl_dataset_phys(ds)->ds_creation_txg);
+ }
+ zap_cursor_fini(&zc);
+ } else {
+ VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
+ enqueue_clones_cb, &ds->ds_object,
+ DS_FIND_CHILDREN));
+ }
+ }
+
+out:
+ dsl_dataset_rele(ds, FTAG);
+}
+
+/* ARGSUSED */
+static int
+enqueue_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg)
+{
+ dsl_dataset_t *ds;
+ int err;
+ dsl_scan_t *scn = dp->dp_scan;
+
+ err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds);
+ if (err)
+ return (err);
+
+ while (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
+ dsl_dataset_t *prev;
+ err = dsl_dataset_hold_obj(dp,
+ dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev);
+ if (err) {
+ dsl_dataset_rele(ds, FTAG);
+ return (err);
+ }
+
+ /*
+ * If this is a clone, we don't need to worry about it for now.
+ */
+ if (dsl_dataset_phys(prev)->ds_next_snap_obj != ds->ds_object) {
+ dsl_dataset_rele(ds, FTAG);
+ dsl_dataset_rele(prev, FTAG);
+ return (0);
+ }
+ dsl_dataset_rele(ds, FTAG);
+ ds = prev;
+ }
+
+ scan_ds_queue_insert(scn, ds->ds_object,
+ dsl_dataset_phys(ds)->ds_prev_snap_txg);
+ dsl_dataset_rele(ds, FTAG);
+ return (0);
+}
+
+/* ARGSUSED */
+void
+dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum,
+ ddt_entry_t *dde, dmu_tx_t *tx)
+{
+ const ddt_key_t *ddk = &dde->dde_key;
+ ddt_phys_t *ddp = dde->dde_phys;
+ blkptr_t bp;
+ zbookmark_phys_t zb = { 0 };
+ int p;
+
+ if (!dsl_scan_is_running(scn))
+ return;
+
+ /*
+ * This function is special because it is the only thing
+ * that can add scan_io_t's to the vdev scan queues from
+ * outside dsl_scan_sync(). For the most part this is ok
+ * as long as it is called from within syncing context.
+ * However, dsl_scan_sync() expects that no new sio's will
+ * be added between when all the work for a scan is done
+ * and the next txg when the scan is actually marked as
+ * completed. This check ensures we do not issue new sio's
+ * during this period.
+ */
+ if (scn->scn_done_txg != 0)
+ return;
+
+ for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
+ if (ddp->ddp_phys_birth == 0 ||
+ ddp->ddp_phys_birth > scn->scn_phys.scn_max_txg)
+ continue;
+ ddt_bp_create(checksum, ddk, ddp, &bp);
+
+ scn->scn_visited_this_txg++;
+ scan_funcs[scn->scn_phys.scn_func](scn->scn_dp, &bp, &zb);
+ }
+}
+
+/*
+ * Scrub/dedup interaction.
+ *
+ * If there are N references to a deduped block, we don't want to scrub it
+ * N times -- ideally, we should scrub it exactly once.
+ *
+ * We leverage the fact that the dde's replication class (enum ddt_class)
+ * is ordered from highest replication class (DDT_CLASS_DITTO) to lowest
+ * (DDT_CLASS_UNIQUE) so that we may walk the DDT in that order.
+ *
+ * To prevent excess scrubbing, the scrub begins by walking the DDT
+ * to find all blocks with refcnt > 1, and scrubs each of these once.
+ * Since there are two replication classes which contain blocks with
+ * refcnt > 1, we scrub the highest replication class (DDT_CLASS_DITTO) first.
+ * Finally the top-down scrub begins, only visiting blocks with refcnt == 1.
+ *
+ * There would be nothing more to say if a block's refcnt couldn't change
+ * during a scrub, but of course it can so we must account for changes
+ * in a block's replication class.
+ *
+ * Here's an example of what can occur:
+ *
+ * If a block has refcnt > 1 during the DDT scrub phase, but has refcnt == 1
+ * when visited during the top-down scrub phase, it will be scrubbed twice.
+ * This negates our scrub optimization, but is otherwise harmless.
+ *
+ * If a block has refcnt == 1 during the DDT scrub phase, but has refcnt > 1
+ * on each visit during the top-down scrub phase, it will never be scrubbed.
+ * To catch this, ddt_sync_entry() notifies the scrub code whenever a block's
+ * reference class transitions to a higher level (i.e. DDT_CLASS_UNIQUE to
+ * DDT_CLASS_DUPLICATE); if it transitions from refcnt == 1 to refcnt > 1
+ * while a scrub is in progress, it scrubs the block right then.
+ */
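+/*
+ * For example, a block with refcnt == 3 is scrubbed exactly once while
+ * walking the DDT (its replication class contains blocks with refcnt > 1),
+ * and the subsequent top-down phase skips it, since that phase only visits
+ * blocks with refcnt == 1.
+ */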
+static void
+dsl_scan_ddt(dsl_scan_t *scn, dmu_tx_t *tx)
+{
+ ddt_bookmark_t *ddb = &scn->scn_phys.scn_ddt_bookmark;
+ ddt_entry_t dde;
+ int error;
+ uint64_t n = 0;
+
+ bzero(&dde, sizeof (ddt_entry_t));
+
+ while ((error = ddt_walk(scn->scn_dp->dp_spa, ddb, &dde)) == 0) {
+ ddt_t *ddt;
+
+ if (ddb->ddb_class > scn->scn_phys.scn_ddt_class_max)
+ break;
+ dprintf("visiting ddb=%llu/%llu/%llu/%llx\n",
+ (longlong_t)ddb->ddb_class,
+ (longlong_t)ddb->ddb_type,
+ (longlong_t)ddb->ddb_checksum,
+ (longlong_t)ddb->ddb_cursor);
+
+ /* There should be no pending changes to the dedup table */
+ ddt = scn->scn_dp->dp_spa->spa_ddt[ddb->ddb_checksum];
+ ASSERT(avl_first(&ddt->ddt_tree) == NULL);
+
+ dsl_scan_ddt_entry(scn, ddb->ddb_checksum, &dde, tx);
+ n++;
+
+ if (dsl_scan_check_suspend(scn, NULL))
+ break;
+ }
+
+ zfs_dbgmsg("scanned %llu ddt entries with class_max = %u; "
+ "suspending=%u", (longlong_t)n,
+ (int)scn->scn_phys.scn_ddt_class_max, (int)scn->scn_suspending);
+
+ ASSERT(error == 0 || error == ENOENT);
+ ASSERT(error != ENOENT ||
+ ddb->ddb_class > scn->scn_phys.scn_ddt_class_max);
+}
+
+static uint64_t
+dsl_scan_ds_maxtxg(dsl_dataset_t *ds)
+{
+ uint64_t smt = ds->ds_dir->dd_pool->dp_scan->scn_phys.scn_max_txg;
+ if (ds->ds_is_snapshot)
+ return (MIN(smt, dsl_dataset_phys(ds)->ds_creation_txg));
+ return (smt);
+}
+
+static void
+dsl_scan_visit(dsl_scan_t *scn, dmu_tx_t *tx)
+{
+ scan_ds_t *sds;
+ dsl_pool_t *dp = scn->scn_dp;
+
+ if (scn->scn_phys.scn_ddt_bookmark.ddb_class <=
+ scn->scn_phys.scn_ddt_class_max) {
+ scn->scn_phys.scn_cur_min_txg = scn->scn_phys.scn_min_txg;
+ scn->scn_phys.scn_cur_max_txg = scn->scn_phys.scn_max_txg;
+ dsl_scan_ddt(scn, tx);
+ if (scn->scn_suspending)
+ return;
+ }
+
+ if (scn->scn_phys.scn_bookmark.zb_objset == DMU_META_OBJSET) {
+ /* First do the MOS & ORIGIN */
+
+ scn->scn_phys.scn_cur_min_txg = scn->scn_phys.scn_min_txg;
+ scn->scn_phys.scn_cur_max_txg = scn->scn_phys.scn_max_txg;
+ dsl_scan_visit_rootbp(scn, NULL,
+ &dp->dp_meta_rootbp, tx);
+ spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp);
+ if (scn->scn_suspending)
+ return;
+
+ if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB) {
+ VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
+ enqueue_cb, NULL, DS_FIND_CHILDREN));
+ } else {
+ dsl_scan_visitds(scn,
+ dp->dp_origin_snap->ds_object, tx);
+ }
+ ASSERT(!scn->scn_suspending);
+ } else if (scn->scn_phys.scn_bookmark.zb_objset !=
+ ZB_DESTROYED_OBJSET) {
+ uint64_t dsobj = scn->scn_phys.scn_bookmark.zb_objset;
+ /*
+ * If we were suspended, continue from here. Note if the
+ * ds we were suspended on was deleted, the zb_objset may
+ * be -1, so we will skip this and find a new objset
+ * below.
+ */
+ dsl_scan_visitds(scn, dsobj, tx);
+ if (scn->scn_suspending)
+ return;
+ }
+
+ /*
+ * In case we suspended right at the end of the ds, zero the
+ * bookmark so we don't think that we're still trying to resume.
+ */
+ bzero(&scn->scn_phys.scn_bookmark, sizeof (zbookmark_phys_t));
+
+ /*
+ * Keep pulling things out of the dataset avl queue. Updates to the
+ * persistent zap-object-as-queue happen only at checkpoints.
+ */
+ while ((sds = avl_first(&scn->scn_queue)) != NULL) {
+ dsl_dataset_t *ds;
+ uint64_t dsobj = sds->sds_dsobj;
+ uint64_t txg = sds->sds_txg;
+
+ /* dequeue and free the ds from the queue */
+ scan_ds_queue_remove(scn, dsobj);
+ sds = NULL;
+
+ /* set up min / max txg */
+ VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
+ if (txg != 0) {
+ scn->scn_phys.scn_cur_min_txg =
+ MAX(scn->scn_phys.scn_min_txg, txg);
+ } else {
+ scn->scn_phys.scn_cur_min_txg =
+ MAX(scn->scn_phys.scn_min_txg,
+ dsl_dataset_phys(ds)->ds_prev_snap_txg);
+ }
+ scn->scn_phys.scn_cur_max_txg = dsl_scan_ds_maxtxg(ds);
+ dsl_dataset_rele(ds, FTAG);
+
+ dsl_scan_visitds(scn, dsobj, tx);
+ if (scn->scn_suspending)
+ return;
+ }
+
+ /* No more objsets to fetch, we're done */
+ scn->scn_phys.scn_bookmark.zb_objset = ZB_DESTROYED_OBJSET;
+ ASSERT0(scn->scn_suspending);
+}
+
+static uint64_t
+dsl_scan_count_leaves(vdev_t *vd)
+{
+ uint64_t i, leaves = 0;
+
+ /* we only count leaves that belong to the main pool and are readable */
+ if (vd->vdev_islog || vd->vdev_isspare ||
+ vd->vdev_isl2cache || !vdev_readable(vd))
+ return (0);
+
+ if (vd->vdev_ops->vdev_op_leaf)
+ return (1);
+
+ for (i = 0; i < vd->vdev_children; i++) {
+ leaves += dsl_scan_count_leaves(vd->vdev_child[i]);
+ }
+
+ return (leaves);
+}
+
+static void
+scan_io_queues_update_zio_stats(dsl_scan_io_queue_t *q, const blkptr_t *bp)
+{
+ int i;
+ uint64_t cur_size = 0;
+
+ for (i = 0; i < BP_GET_NDVAS(bp); i++) {
+ cur_size += DVA_GET_ASIZE(&bp->blk_dva[i]);
+ }
+
+ q->q_total_zio_size_this_txg += cur_size;
+ q->q_zios_this_txg++;
+}
+
+static void
+scan_io_queues_update_seg_stats(dsl_scan_io_queue_t *q, uint64_t start,
+ uint64_t end)
+{
+ q->q_total_seg_size_this_txg += end - start;
+ q->q_segs_this_txg++;
+}
+
+static boolean_t
+scan_io_queue_check_suspend(dsl_scan_t *scn)
+{
+ /* See comment in dsl_scan_check_suspend() */
+ uint64_t curr_time_ns = gethrtime();
+ uint64_t scan_time_ns = curr_time_ns - scn->scn_sync_start_time;
+ uint64_t sync_time_ns = curr_time_ns -
+ scn->scn_dp->dp_spa->spa_sync_starttime;
+ int dirty_pct = scn->scn_dp->dp_dirty_total * 100 / zfs_dirty_data_max;
+ int mintime = (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) ?
+ zfs_resilver_min_time_ms : zfs_scrub_min_time_ms;
+
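+ /*
+ * For example, with the usual defaults (zfs_scrub_min_time_ms of
+ * 1000 ms, zfs_txg_timeout of 5 s), a scrub pass will not suspend
+ * voluntarily during its first second; after that it suspends as
+ * soon as dirty data crosses the async-write dirty threshold, a
+ * txg sync is waiting, or the sync has already run for
+ * zfs_txg_timeout seconds. A pool shutdown suspends it immediately.
+ */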
+ return ((NSEC2MSEC(scan_time_ns) > mintime &&
+ (dirty_pct >= zfs_vdev_async_write_active_min_dirty_percent ||
+ txg_sync_waiting(scn->scn_dp) ||
+ NSEC2SEC(sync_time_ns) >= zfs_txg_timeout)) ||
+ spa_shutting_down(scn->scn_dp->dp_spa));
+}
+
+/*
+ * Given a list of scan_io_t's in io_list, this issues the I/Os out to
+ * disk. This consumes the io_list and frees the scan_io_t's. This is
+ * called when emptying queues, either when we're up against the memory
+ * limit or when we have finished scanning. Returns B_TRUE if we stopped
+ * processing the list before we finished. Any sios that were not issued
+ * will remain in the io_list.
+ */
+static boolean_t
+scan_io_queue_issue(dsl_scan_io_queue_t *queue, list_t *io_list)
+{
+ dsl_scan_t *scn = queue->q_scn;
+ scan_io_t *sio;
+ int64_t bytes_issued = 0;
+ boolean_t suspended = B_FALSE;
+
+ while ((sio = list_head(io_list)) != NULL) {
+ blkptr_t bp;
+
+ if (scan_io_queue_check_suspend(scn)) {
+ suspended = B_TRUE;
+ break;
+ }
+
+ sio2bp(sio, &bp);
+ bytes_issued += SIO_GET_ASIZE(sio);
+ scan_exec_io(scn->scn_dp, &bp, sio->sio_flags,
+ &sio->sio_zb, queue);
+ (void) list_remove_head(io_list);
+ scan_io_queues_update_zio_stats(queue, &bp);
+ sio_free(sio);
+ }
+
+ atomic_add_64(&scn->scn_bytes_pending, -bytes_issued);
+
+ return (suspended);
+}
+
+/*
+ * This function removes sios that reside within a given range_seg_t from
+ * an IO queue and inserts them (in offset order) into a list. Note that
+ * we only ever return a maximum of 32 sios at once. If there are more
+ * sios within this segment that did not make it onto the list, we return
+ * B_TRUE; otherwise, B_FALSE.
+ */
+static boolean_t
+scan_io_queue_gather(dsl_scan_io_queue_t *queue, range_seg_t *rs, list_t *list)
+{
+ scan_io_t *srch_sio, *sio, *next_sio;
+ avl_index_t idx;
+ uint_t num_sios = 0;
+ int64_t bytes_issued = 0;
+
+ ASSERT(rs != NULL);
+ ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock));
+
+ srch_sio = sio_alloc(1);
+ srch_sio->sio_nr_dvas = 1;
+ SIO_SET_OFFSET(srch_sio, rs_get_start(rs, queue->q_exts_by_addr));
+
+ /*
+ * The exact start of the extent might not contain any matching zios,
+ * so if that's the case, examine the next one in the tree.
+ */
+ sio = avl_find(&queue->q_sios_by_addr, srch_sio, &idx);
+ sio_free(srch_sio);
+
+ if (sio == NULL)
+ sio = avl_nearest(&queue->q_sios_by_addr, idx, AVL_AFTER);
+
+ while (sio != NULL && SIO_GET_OFFSET(sio) < rs_get_end(rs,
+ queue->q_exts_by_addr) && num_sios <= 32) {
+ ASSERT3U(SIO_GET_OFFSET(sio), >=, rs_get_start(rs,
+ queue->q_exts_by_addr));
+ ASSERT3U(SIO_GET_END_OFFSET(sio), <=, rs_get_end(rs,
+ queue->q_exts_by_addr));
+
+ next_sio = AVL_NEXT(&queue->q_sios_by_addr, sio);
+ avl_remove(&queue->q_sios_by_addr, sio);
+ queue->q_sio_memused -= SIO_GET_MUSED(sio);
+
+ bytes_issued += SIO_GET_ASIZE(sio);
+ num_sios++;
+ list_insert_tail(list, sio);
+ sio = next_sio;
+ }
+
+ /*
+ * We limit the number of sios we process at once to 32 to avoid
+ * biting off more than we can chew. If we didn't take everything
+ * in the segment we update it to reflect the work we were able to
+ * complete. Otherwise, we remove it from the range tree entirely.
+ */
+ if (sio != NULL && SIO_GET_OFFSET(sio) < rs_get_end(rs,
+ queue->q_exts_by_addr)) {
+ range_tree_adjust_fill(queue->q_exts_by_addr, rs,
+ -bytes_issued);
+ range_tree_resize_segment(queue->q_exts_by_addr, rs,
+ SIO_GET_OFFSET(sio), rs_get_end(rs,
+ queue->q_exts_by_addr) - SIO_GET_OFFSET(sio));
+
+ return (B_TRUE);
+ } else {
+ uint64_t rstart = rs_get_start(rs, queue->q_exts_by_addr);
+ uint64_t rend = rs_get_end(rs, queue->q_exts_by_addr);
+ range_tree_remove(queue->q_exts_by_addr, rstart, rend - rstart);
+ return (B_FALSE);
+ }
+}
+
+/*
+ * This is called from the queue emptying thread and selects the next
+ * extent from which we are to issue I/Os. The behavior of this function
+ * depends on the state of the scan, the current memory consumption and
+ * whether or not we are performing a scan shutdown.
+ * 1) We select extents in an elevator algorithm (LBA-order) if the scan
+ * needs to perform a checkpoint
+ * 2) We select the largest available extent if we are up against the
+ * memory limit.
+ * 3) Otherwise we don't select any extents.
+ */
+static range_seg_t *
+scan_io_queue_fetch_ext(dsl_scan_io_queue_t *queue)
+{
+ dsl_scan_t *scn = queue->q_scn;
+ range_tree_t *rt = queue->q_exts_by_addr;
+
+ ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock));
+ ASSERT(scn->scn_is_sorted);
+
+ /* handle tunable overrides */
+ if (scn->scn_checkpointing || scn->scn_clearing) {
+ if (zfs_scan_issue_strategy == 1) {
+ return (range_tree_first(rt));
+ } else if (zfs_scan_issue_strategy == 2) {
+ /*
+ * We need to get the original entry in the by_addr
+ * tree so we can modify it.
+ */
+ range_seg_t *size_rs =
+ zfs_btree_first(&queue->q_exts_by_size, NULL);
+ if (size_rs == NULL)
+ return (NULL);
+ uint64_t start = rs_get_start(size_rs, rt);
+ uint64_t size = rs_get_end(size_rs, rt) - start;
+ range_seg_t *addr_rs = range_tree_find(rt, start,
+ size);
+ ASSERT3P(addr_rs, !=, NULL);
+ ASSERT3U(rs_get_start(size_rs, rt), ==,
+ rs_get_start(addr_rs, rt));
+ ASSERT3U(rs_get_end(size_rs, rt), ==,
+ rs_get_end(addr_rs, rt));
+ return (addr_rs);
+ }
+ }
+
+ /*
+ * During normal clearing, we want to issue our largest segments
+ * first, keeping IO as sequential as possible, and leaving the
+ * smaller extents for later with the hope that they might eventually
+ * grow to larger sequential segments. However, when the scan is
+ * checkpointing, no new extents will be added to the sorting queue,
+ * so the way we are sorted now is as good as it will ever get.
+ * In this case, we instead switch to issuing extents in LBA order.
+ */
+ if (scn->scn_checkpointing) {
+ return (range_tree_first(rt));
+ } else if (scn->scn_clearing) {
+ /*
+ * We need to get the original entry in the by_addr
+ * tree so we can modify it.
+ */
+ range_seg_t *size_rs = zfs_btree_first(&queue->q_exts_by_size,
+ NULL);
+ if (size_rs == NULL)
+ return (NULL);
+ uint64_t start = rs_get_start(size_rs, rt);
+ uint64_t size = rs_get_end(size_rs, rt) - start;
+ range_seg_t *addr_rs = range_tree_find(rt, start, size);
+ ASSERT3P(addr_rs, !=, NULL);
+ ASSERT3U(rs_get_start(size_rs, rt), ==, rs_get_start(addr_rs,
+ rt));
+ ASSERT3U(rs_get_end(size_rs, rt), ==, rs_get_end(addr_rs, rt));
+ return (addr_rs);
+ } else {
+ return (NULL);
+ }
+}
+
+static void
+scan_io_queues_run_one(void *arg)
+{
+ dsl_scan_io_queue_t *queue = arg;
+ kmutex_t *q_lock = &queue->q_vd->vdev_scan_io_queue_lock;
+ boolean_t suspended = B_FALSE;
+ range_seg_t *rs = NULL;
+ scan_io_t *sio = NULL;
+ list_t sio_list;
+ uint64_t bytes_per_leaf = zfs_scan_vdev_limit;
+ uint64_t nr_leaves = dsl_scan_count_leaves(queue->q_vd);
+
+ ASSERT(queue->q_scn->scn_is_sorted);
+
+ list_create(&sio_list, sizeof (scan_io_t),
+ offsetof(scan_io_t, sio_nodes.sio_list_node));
+ mutex_enter(q_lock);
+
+ /* calculate maximum in-flight bytes for this txg (min 1MB) */
+ queue->q_maxinflight_bytes =
+ MAX(nr_leaves * bytes_per_leaf, 1ULL << 20);
+
+ /* reset per-queue scan statistics for this txg */
+ queue->q_total_seg_size_this_txg = 0;
+ queue->q_segs_this_txg = 0;
+ queue->q_total_zio_size_this_txg = 0;
+ queue->q_zios_this_txg = 0;
+
+ /* loop until we run out of time or sios */
+ while ((rs = scan_io_queue_fetch_ext(queue)) != NULL) {
+ uint64_t seg_start = 0, seg_end = 0;
+ boolean_t more_left = B_TRUE;
+
+ ASSERT(list_is_empty(&sio_list));
+
+ /* loop while we still have sios left to process in this rs */
+ while (more_left) {
+ scan_io_t *first_sio, *last_sio;
+
+ /*
+ * We have selected which extent needs to be
+ * processed next. Gather up the corresponding sios.
+ */
+ more_left = scan_io_queue_gather(queue, rs, &sio_list);
+ ASSERT(!list_is_empty(&sio_list));
+ first_sio = list_head(&sio_list);
+ last_sio = list_tail(&sio_list);
+
+ seg_end = SIO_GET_END_OFFSET(last_sio);
+ if (seg_start == 0)
+ seg_start = SIO_GET_OFFSET(first_sio);
+
+ /*
+ * Issuing sios can take a long time so drop the
+ * queue lock. The sio queue won't be updated by
+ * other threads since we're in syncing context so
+ * we can be sure that our trees will remain exactly
+ * as we left them.
+ */
+ mutex_exit(q_lock);
+ suspended = scan_io_queue_issue(queue, &sio_list);
+ mutex_enter(q_lock);
+
+ if (suspended)
+ break;
+ }
+
+ /* update statistics for debugging purposes */
+ scan_io_queues_update_seg_stats(queue, seg_start, seg_end);
+
+ if (suspended)
+ break;
+ }
+
+ /*
+ * If we were suspended in the middle of processing,
+ * requeue any unfinished sios and exit.
+ */
+ while ((sio = list_head(&sio_list)) != NULL) {
+ list_remove(&sio_list, sio);
+ scan_io_queue_insert_impl(queue, sio);
+ }
+
+ mutex_exit(q_lock);
+ list_destroy(&sio_list);
+}
+
+/*
+ * Performs an emptying run on all scan queues in the pool. This just
+ * punches out one thread per top-level vdev, each of which processes
+ * only that vdev's scan queue. We can parallelize the I/O here because
+ * we know that each queue's I/Os only affect its own top-level vdev.
+ *
+ * This function waits for the queue runs to complete, and must be
+ * called from dsl_scan_sync (or in general, syncing context).
+ */
+static void
+scan_io_queues_run(dsl_scan_t *scn)
+{
+ spa_t *spa = scn->scn_dp->dp_spa;
+
+ ASSERT(scn->scn_is_sorted);
+ ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
+
+ if (scn->scn_bytes_pending == 0)
+ return;
+
+ if (scn->scn_taskq == NULL) {
+ int nthreads = spa->spa_root_vdev->vdev_children;
+
+ /*
+ * We need to make this taskq *always* execute as many
+ * threads in parallel as we have top-level vdevs and no
+ * fewer; otherwise, calls to scan_io_queues_run_one can be
+ * strangely serialized during spa_sync runs, which
+ * significantly impacts performance.
+ */
+ scn->scn_taskq = taskq_create("dsl_scan_iss", nthreads,
+ minclsyspri, nthreads, nthreads, TASKQ_PREPOPULATE);
+ }
+
+ for (uint64_t i = 0; i < spa->spa_root_vdev->vdev_children; i++) {
+ vdev_t *vd = spa->spa_root_vdev->vdev_child[i];
+
+ mutex_enter(&vd->vdev_scan_io_queue_lock);
+ if (vd->vdev_scan_io_queue != NULL) {
+ VERIFY(taskq_dispatch(scn->scn_taskq,
+ scan_io_queues_run_one, vd->vdev_scan_io_queue,
+ TQ_SLEEP) != TASKQID_INVALID);
+ }
+ mutex_exit(&vd->vdev_scan_io_queue_lock);
+ }
+
+ /*
+ * Wait for the queues to finish issuing their IOs for this run
+ * before we return. There may still be IOs in flight at this
+ * point.
+ */
+ taskq_wait(scn->scn_taskq);
+}
+
+static boolean_t
+dsl_scan_async_block_should_pause(dsl_scan_t *scn)
+{
+ uint64_t elapsed_nanosecs;
+
+ if (zfs_recover)
+ return (B_FALSE);
+
+ if (zfs_async_block_max_blocks != 0 &&
+ scn->scn_visited_this_txg >= zfs_async_block_max_blocks) {
+ return (B_TRUE);
+ }
+
+ if (zfs_max_async_dedup_frees != 0 &&
+ scn->scn_dedup_frees_this_txg >= zfs_max_async_dedup_frees) {
+ return (B_TRUE);
+ }
+
+ elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time;
+ return (elapsed_nanosecs / NANOSEC > zfs_txg_timeout ||
+ (NSEC2MSEC(elapsed_nanosecs) > scn->scn_async_block_min_time_ms &&
+ txg_sync_waiting(scn->scn_dp)) ||
+ spa_shutting_down(scn->scn_dp->dp_spa));
+}
+
+static int
+dsl_scan_free_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+{
+ dsl_scan_t *scn = arg;
+
+ if (!scn->scn_is_bptree ||
+ (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_OBJSET)) {
+ if (dsl_scan_async_block_should_pause(scn))
+ return (SET_ERROR(ERESTART));
+ }
+
+ zio_nowait(zio_free_sync(scn->scn_zio_root, scn->scn_dp->dp_spa,
+ dmu_tx_get_txg(tx), bp, 0));
+ dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, DD_USED_HEAD,
+ -bp_get_dsize_sync(scn->scn_dp->dp_spa, bp),
+ -BP_GET_PSIZE(bp), -BP_GET_UCSIZE(bp), tx);
+ scn->scn_visited_this_txg++;
+ if (BP_GET_DEDUP(bp))
+ scn->scn_dedup_frees_this_txg++;
+ return (0);
+}
+
+static void
+dsl_scan_update_stats(dsl_scan_t *scn)
+{
+ spa_t *spa = scn->scn_dp->dp_spa;
+ uint64_t i;
+ uint64_t seg_size_total = 0, zio_size_total = 0;
+ uint64_t seg_count_total = 0, zio_count_total = 0;
+
+ for (i = 0; i < spa->spa_root_vdev->vdev_children; i++) {
+ vdev_t *vd = spa->spa_root_vdev->vdev_child[i];
+ dsl_scan_io_queue_t *queue = vd->vdev_scan_io_queue;
+
+ if (queue == NULL)
+ continue;
+
+ seg_size_total += queue->q_total_seg_size_this_txg;
+ zio_size_total += queue->q_total_zio_size_this_txg;
+ seg_count_total += queue->q_segs_this_txg;
+ zio_count_total += queue->q_zios_this_txg;
+ }
+
+ if (seg_count_total == 0 || zio_count_total == 0) {
+ scn->scn_avg_seg_size_this_txg = 0;
+ scn->scn_avg_zio_size_this_txg = 0;
+ scn->scn_segs_this_txg = 0;
+ scn->scn_zios_this_txg = 0;
+ return;
+ }
+
+ scn->scn_avg_seg_size_this_txg = seg_size_total / seg_count_total;
+ scn->scn_avg_zio_size_this_txg = zio_size_total / zio_count_total;
+ scn->scn_segs_this_txg = seg_count_total;
+ scn->scn_zios_this_txg = zio_count_total;
+}
+
+static int
+bpobj_dsl_scan_free_block_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
+ dmu_tx_t *tx)
+{
+ ASSERT(!bp_freed);
+ return (dsl_scan_free_block_cb(arg, bp, tx));
+}
+
+static int
+dsl_scan_obsolete_block_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
+ dmu_tx_t *tx)
+{
+ ASSERT(!bp_freed);
+ dsl_scan_t *scn = arg;
+ const dva_t *dva = &bp->blk_dva[0];
+
+ if (dsl_scan_async_block_should_pause(scn))
+ return (SET_ERROR(ERESTART));
+
+ spa_vdev_indirect_mark_obsolete(scn->scn_dp->dp_spa,
+ DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva),
+ DVA_GET_ASIZE(dva), tx);
+ scn->scn_visited_this_txg++;
+ return (0);
+}
+
+boolean_t
+dsl_scan_active(dsl_scan_t *scn)
+{
+ spa_t *spa = scn->scn_dp->dp_spa;
+ uint64_t used = 0, comp, uncomp;
+ boolean_t clones_left;
+
+ if (spa->spa_load_state != SPA_LOAD_NONE)
+ return (B_FALSE);
+ if (spa_shutting_down(spa))
+ return (B_FALSE);
+ if ((dsl_scan_is_running(scn) && !dsl_scan_is_paused_scrub(scn)) ||
+ (scn->scn_async_destroying && !scn->scn_async_stalled))
+ return (B_TRUE);
+
+ if (spa_version(scn->scn_dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
+ (void) bpobj_space(&scn->scn_dp->dp_free_bpobj,
+ &used, &comp, &uncomp);
+ }
+ clones_left = spa_livelist_delete_check(spa);
+ return ((used != 0) || (clones_left));
+}
+
+static boolean_t
+dsl_scan_check_deferred(vdev_t *vd)
+{
+ boolean_t need_resilver = B_FALSE;
+
+ for (int c = 0; c < vd->vdev_children; c++) {
+ need_resilver |=
+ dsl_scan_check_deferred(vd->vdev_child[c]);
+ }
+
+ if (!vdev_is_concrete(vd) || vd->vdev_aux ||
+ !vd->vdev_ops->vdev_op_leaf)
+ return (need_resilver);
+
+ if (!vd->vdev_resilver_deferred)
+ need_resilver = B_TRUE;
+
+ return (need_resilver);
+}
+
+static boolean_t
+dsl_scan_need_resilver(spa_t *spa, const dva_t *dva, size_t psize,
+ uint64_t phys_birth)
+{
+ vdev_t *vd;
+
+ vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva));
+
+ if (vd->vdev_ops == &vdev_indirect_ops) {
+ /*
+ * The indirect vdev can point to multiple
+ * vdevs. For simplicity, always create
+ * the resilver zio_t. zio_vdev_io_start()
+ * will bypass the child resilver i/o's if
+ * they are on vdevs that don't have DTL's.
+ */
+ return (B_TRUE);
+ }
+
+ if (DVA_GET_GANG(dva)) {
+ /*
+ * Gang members may be spread across multiple
+ * vdevs, so the best estimate we have is the
+ * scrub range, which has already been checked.
+ * XXX -- it would be better to change our
+ * allocation policy to ensure that all
+ * gang members reside on the same vdev.
+ */
+ return (B_TRUE);
+ }
+
+ /*
+ * Check if the top-level vdev must resilver this offset.
+ * When the offset does not intersect with a dirty leaf DTL
+ * then it may be possible to skip the resilver IO. The psize
+ * is provided instead of asize to simplify the check for RAIDZ.
+ */
+ if (!vdev_dtl_need_resilver(vd, dva, psize, phys_birth))
+ return (B_FALSE);
+
+ /*
+ * Check that this top-level vdev has a device under it which
+ * is resilvering and is not deferred.
+ */
+ if (!dsl_scan_check_deferred(vd))
+ return (B_FALSE);
+
+ return (B_TRUE);
+}
+
+static int
+dsl_process_async_destroys(dsl_pool_t *dp, dmu_tx_t *tx)
+{
+ dsl_scan_t *scn = dp->dp_scan;
+ spa_t *spa = dp->dp_spa;
+ int err = 0;
+
+ if (spa_suspend_async_destroy(spa))
+ return (0);
+
+ if (zfs_free_bpobj_enabled &&
+ spa_version(spa) >= SPA_VERSION_DEADLISTS) {
+ scn->scn_is_bptree = B_FALSE;
+ scn->scn_async_block_min_time_ms = zfs_free_min_time_ms;
+ scn->scn_zio_root = zio_root(spa, NULL,
+ NULL, ZIO_FLAG_MUSTSUCCEED);
+ err = bpobj_iterate(&dp->dp_free_bpobj,
+ bpobj_dsl_scan_free_block_cb, scn, tx);
+ VERIFY0(zio_wait(scn->scn_zio_root));
+ scn->scn_zio_root = NULL;
+
+ if (err != 0 && err != ERESTART)
+ zfs_panic_recover("error %u from bpobj_iterate()", err);
+ }
+
+ if (err == 0 && spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY)) {
+ ASSERT(scn->scn_async_destroying);
+ scn->scn_is_bptree = B_TRUE;
+ scn->scn_zio_root = zio_root(spa, NULL,
+ NULL, ZIO_FLAG_MUSTSUCCEED);
+ err = bptree_iterate(dp->dp_meta_objset,
+ dp->dp_bptree_obj, B_TRUE, dsl_scan_free_block_cb, scn, tx);
+ VERIFY0(zio_wait(scn->scn_zio_root));
+ scn->scn_zio_root = NULL;
+
+ if (err == EIO || err == ECKSUM) {
+ err = 0;
+ } else if (err != 0 && err != ERESTART) {
+ zfs_panic_recover("error %u from "
+ "traverse_dataset_destroyed()", err);
+ }
+
+ if (bptree_is_empty(dp->dp_meta_objset, dp->dp_bptree_obj)) {
+ /* finished; deactivate async destroy feature */
+ spa_feature_decr(spa, SPA_FEATURE_ASYNC_DESTROY, tx);
+ ASSERT(!spa_feature_is_active(spa,
+ SPA_FEATURE_ASYNC_DESTROY));
+ VERIFY0(zap_remove(dp->dp_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_BPTREE_OBJ, tx));
+ VERIFY0(bptree_free(dp->dp_meta_objset,
+ dp->dp_bptree_obj, tx));
+ dp->dp_bptree_obj = 0;
+ scn->scn_async_destroying = B_FALSE;
+ scn->scn_async_stalled = B_FALSE;
+ } else {
+ /*
+ * If we didn't make progress, mark the async
+ * destroy as stalled, so that we will not initiate
+ * a spa_sync() on its behalf. Note that we only
+ * check this if we are not finished, because if the
+ * bptree had no blocks for us to visit, we can
+ * finish without "making progress".
+ */
+ scn->scn_async_stalled =
+ (scn->scn_visited_this_txg == 0);
+ }
+ }
+ if (scn->scn_visited_this_txg) {
+ zfs_dbgmsg("freed %llu blocks in %llums from "
+ "free_bpobj/bptree txg %llu; err=%u",
+ (longlong_t)scn->scn_visited_this_txg,
+ (longlong_t)
+ NSEC2MSEC(gethrtime() - scn->scn_sync_start_time),
+ (longlong_t)tx->tx_txg, err);
+ scn->scn_visited_this_txg = 0;
+ scn->scn_dedup_frees_this_txg = 0;
+
+ /*
+ * Write out changes to the DDT that may be required as a
+ * result of the blocks freed. This ensures that the DDT
+ * is clean when a scrub/resilver runs.
+ */
+ ddt_sync(spa, tx->tx_txg);
+ }
+ if (err != 0)
+ return (err);
+ if (dp->dp_free_dir != NULL && !scn->scn_async_destroying &&
+ zfs_free_leak_on_eio &&
+ (dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes != 0 ||
+ dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes != 0 ||
+ dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes != 0)) {
+ /*
+ * We have finished background destroying, but there is still
+ * some space left in the dp_free_dir. Transfer this leaked
+ * space to the dp_leak_dir.
+ */
+ if (dp->dp_leak_dir == NULL) {
+ rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);
+ (void) dsl_dir_create_sync(dp, dp->dp_root_dir,
+ LEAK_DIR_NAME, tx);
+ VERIFY0(dsl_pool_open_special_dir(dp,
+ LEAK_DIR_NAME, &dp->dp_leak_dir));
+ rrw_exit(&dp->dp_config_rwlock, FTAG);
+ }
+ dsl_dir_diduse_space(dp->dp_leak_dir, DD_USED_HEAD,
+ dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes,
+ dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes,
+ dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes, tx);
+ dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD,
+ -dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes,
+ -dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes,
+ -dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes, tx);
+ }
+
+ if (dp->dp_free_dir != NULL && !scn->scn_async_destroying &&
+ !spa_livelist_delete_check(spa)) {
+ /* finished; verify that space accounting went to zero */
+ ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes);
+ ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes);
+ ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes);
+ }
+
+ spa_notify_waiters(spa);
+
+ EQUIV(bpobj_is_open(&dp->dp_obsolete_bpobj),
+ 0 == zap_contains(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_OBSOLETE_BPOBJ));
+ if (err == 0 && bpobj_is_open(&dp->dp_obsolete_bpobj)) {
+ ASSERT(spa_feature_is_active(dp->dp_spa,
+ SPA_FEATURE_OBSOLETE_COUNTS));
+
+ scn->scn_is_bptree = B_FALSE;
+ scn->scn_async_block_min_time_ms = zfs_obsolete_min_time_ms;
+ err = bpobj_iterate(&dp->dp_obsolete_bpobj,
+ dsl_scan_obsolete_block_cb, scn, tx);
+ if (err != 0 && err != ERESTART)
+ zfs_panic_recover("error %u from bpobj_iterate()", err);
+
+ if (bpobj_is_empty(&dp->dp_obsolete_bpobj))
+ dsl_pool_destroy_obsolete_bpobj(dp, tx);
+ }
+ return (0);
+}
+
+/*
+ * This is the primary entry point for scans; it is called from syncing
+ * context. Scans must happen entirely during syncing context so that we
+ * can guarantee that blocks we are currently scanning will not change out
+ * from under us. While a scan is active, this function controls how quickly
+ * transaction groups proceed, instead of the normal handling provided by
+ * txg_sync_thread().
+ */
+void
+dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
+{
+ int err = 0;
+ dsl_scan_t *scn = dp->dp_scan;
+ spa_t *spa = dp->dp_spa;
+ state_sync_type_t sync_type = SYNC_OPTIONAL;
+
+ if (spa->spa_resilver_deferred &&
+ !spa_feature_is_active(dp->dp_spa, SPA_FEATURE_RESILVER_DEFER))
+ spa_feature_incr(spa, SPA_FEATURE_RESILVER_DEFER, tx);
+
+ /*
+ * Check for scn_restart_txg before checking spa_load_state, so
+ * that we can restart an old-style scan while the pool is being
+ * imported (see dsl_scan_init). We also restart scans if there
+ * is a deferred resilver and the user has manually disabled
+ * deferred resilvers via the tunable.
+ */
+ if (dsl_scan_restarting(scn, tx) ||
+ (spa->spa_resilver_deferred && zfs_resilver_disable_defer)) {
+ pool_scan_func_t func = POOL_SCAN_SCRUB;
+ dsl_scan_done(scn, B_FALSE, tx);
+ if (vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL))
+ func = POOL_SCAN_RESILVER;
+ zfs_dbgmsg("restarting scan func=%u txg=%llu",
+ func, (longlong_t)tx->tx_txg);
+ dsl_scan_setup_sync(&func, tx);
+ }
+
+ /*
+ * Only process scans in sync pass 1.
+ */
+ if (spa_sync_pass(spa) > 1)
+ return;
+
+ /*
+ * If the spa is shutting down, then stop scanning. This will
+ * ensure that the scan does not dirty any new data during the
+ * shutdown phase.
+ */
+ if (spa_shutting_down(spa))
+ return;
+
+ /*
+ * If the scan is inactive due to a stalled async destroy, try again.
+ */
+ if (!scn->scn_async_stalled && !dsl_scan_active(scn))
+ return;
+
+ /* reset scan statistics */
+ scn->scn_visited_this_txg = 0;
+ scn->scn_dedup_frees_this_txg = 0;
+ scn->scn_holes_this_txg = 0;
+ scn->scn_lt_min_this_txg = 0;
+ scn->scn_gt_max_this_txg = 0;
+ scn->scn_ddt_contained_this_txg = 0;
+ scn->scn_objsets_visited_this_txg = 0;
+ scn->scn_avg_seg_size_this_txg = 0;
+ scn->scn_segs_this_txg = 0;
+ scn->scn_avg_zio_size_this_txg = 0;
+ scn->scn_zios_this_txg = 0;
+ scn->scn_suspending = B_FALSE;
+ scn->scn_sync_start_time = gethrtime();
+ spa->spa_scrub_active = B_TRUE;
+
+ /*
+ * First process the async destroys. If we suspend, don't do
+ * any scrubbing or resilvering. This ensures that there are no
+ * async destroys while we are scanning, so the scan code doesn't
+ * have to worry about traversing it. It is also faster to free the
+ * blocks than to scrub them.
+ */
+ err = dsl_process_async_destroys(dp, tx);
+ if (err != 0)
+ return;
+
+ if (!dsl_scan_is_running(scn) || dsl_scan_is_paused_scrub(scn))
+ return;
+
+ /*
+ * Wait a few txgs after importing before beginning to scan, so
+ * that the pool import can complete quickly.
+ */
+ if (spa->spa_syncing_txg < spa->spa_first_txg + SCAN_IMPORT_WAIT_TXGS)
+ return;
+
+ /*
+ * zfs_scan_suspend_progress can be set to disable scan progress.
+ * We don't want to spin the txg_sync thread, so we add a delay
+ * here to simulate the time spent doing a scan. This is mostly
+ * useful for testing and debugging.
+ */
+ if (zfs_scan_suspend_progress) {
+ uint64_t scan_time_ns = gethrtime() - scn->scn_sync_start_time;
+ int mintime = (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) ?
+ zfs_resilver_min_time_ms : zfs_scrub_min_time_ms;
+
+ while (zfs_scan_suspend_progress &&
+ !txg_sync_waiting(scn->scn_dp) &&
+ !spa_shutting_down(scn->scn_dp->dp_spa) &&
+ NSEC2MSEC(scan_time_ns) < mintime) {
+ delay(hz);
+ scan_time_ns = gethrtime() - scn->scn_sync_start_time;
+ }
+ return;
+ }
+
+ /*
+ * It is possible to switch from unsorted to sorted at any time,
+ * but afterwards the scan will remain sorted unless reloaded from
+ * a checkpoint after a reboot.
+ */
+ if (!zfs_scan_legacy) {
+ scn->scn_is_sorted = B_TRUE;
+ if (scn->scn_last_checkpoint == 0)
+ scn->scn_last_checkpoint = ddi_get_lbolt();
+ }
+
+ /*
+ * For sorted scans, determine what kind of work we will be doing
+ * this txg based on our memory limitations and whether or not we
+ * need to perform a checkpoint.
+ */
+ if (scn->scn_is_sorted) {
+ /*
+ * If we are over our checkpoint interval, set scn_clearing
+ * so that we can begin checkpointing immediately. The
+ * checkpoint allows us to save a consistent bookmark
+ * representing how much data we have scrubbed so far.
+ * Otherwise, use the memory limit to determine if we should
+ * scan for metadata or start issuing scrub IOs. We accumulate
+ * metadata until we hit our hard memory limit at which point
+ * we issue scrub IOs until we are at our soft memory limit.
+ */
+ if (scn->scn_checkpointing ||
+ ddi_get_lbolt() - scn->scn_last_checkpoint >
+ SEC_TO_TICK(zfs_scan_checkpoint_intval)) {
+ if (!scn->scn_checkpointing)
+ zfs_dbgmsg("begin scan checkpoint");
+
+ scn->scn_checkpointing = B_TRUE;
+ scn->scn_clearing = B_TRUE;
+ } else {
+ boolean_t should_clear = dsl_scan_should_clear(scn);
+ if (should_clear && !scn->scn_clearing) {
+ zfs_dbgmsg("begin scan clearing");
+ scn->scn_clearing = B_TRUE;
+ } else if (!should_clear && scn->scn_clearing) {
+ zfs_dbgmsg("finish scan clearing");
+ scn->scn_clearing = B_FALSE;
+ }
+ }
+ } else {
+ ASSERT0(scn->scn_checkpointing);
+ ASSERT0(scn->scn_clearing);
+ }
+
+ if (!scn->scn_clearing && scn->scn_done_txg == 0) {
+ /* Need to scan metadata for more blocks to scrub */
+ dsl_scan_phys_t *scnp = &scn->scn_phys;
+ taskqid_t prefetch_tqid;
+ uint64_t bytes_per_leaf = zfs_scan_vdev_limit;
+ uint64_t nr_leaves = dsl_scan_count_leaves(spa->spa_root_vdev);
+
+ /*
+ * Recalculate the max number of in-flight bytes for pool-wide
+ * scanning operations (minimum 1MB). Limits for the issuing
+ * phase are done per top-level vdev and are handled separately.
+ */
+ scn->scn_maxinflight_bytes =
+ MAX(nr_leaves * bytes_per_leaf, 1ULL << 20);
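+ /*
+ * For example, assuming the default zfs_scan_vdev_limit of 4 MiB,
+ * a pool with 16 readable leaf vdevs may have up to 64 MiB of scan
+ * I/O in flight during this metadata phase; the issuing phase
+ * applies the same per-leaf budget, but per top-level vdev.
+ */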
+
+ if (scnp->scn_ddt_bookmark.ddb_class <=
+ scnp->scn_ddt_class_max) {
+ ASSERT(ZB_IS_ZERO(&scnp->scn_bookmark));
+ zfs_dbgmsg("doing scan sync txg %llu; "
+ "ddt bm=%llu/%llu/%llu/%llx",
+ (longlong_t)tx->tx_txg,
+ (longlong_t)scnp->scn_ddt_bookmark.ddb_class,
+ (longlong_t)scnp->scn_ddt_bookmark.ddb_type,
+ (longlong_t)scnp->scn_ddt_bookmark.ddb_checksum,
+ (longlong_t)scnp->scn_ddt_bookmark.ddb_cursor);
+ } else {
+ zfs_dbgmsg("doing scan sync txg %llu; "
+ "bm=%llu/%llu/%llu/%llu",
+ (longlong_t)tx->tx_txg,
+ (longlong_t)scnp->scn_bookmark.zb_objset,
+ (longlong_t)scnp->scn_bookmark.zb_object,
+ (longlong_t)scnp->scn_bookmark.zb_level,
+ (longlong_t)scnp->scn_bookmark.zb_blkid);
+ }
+
+ scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
+ NULL, ZIO_FLAG_CANFAIL);
+
+ scn->scn_prefetch_stop = B_FALSE;
+ prefetch_tqid = taskq_dispatch(dp->dp_sync_taskq,
+ dsl_scan_prefetch_thread, scn, TQ_SLEEP);
+ ASSERT(prefetch_tqid != TASKQID_INVALID);
+
+ dsl_pool_config_enter(dp, FTAG);
+ dsl_scan_visit(scn, tx);
+ dsl_pool_config_exit(dp, FTAG);
+
+ mutex_enter(&dp->dp_spa->spa_scrub_lock);
+ scn->scn_prefetch_stop = B_TRUE;
+ cv_broadcast(&spa->spa_scrub_io_cv);
+ mutex_exit(&dp->dp_spa->spa_scrub_lock);
+
+ taskq_wait_id(dp->dp_sync_taskq, prefetch_tqid);
+ (void) zio_wait(scn->scn_zio_root);
+ scn->scn_zio_root = NULL;
+
+ zfs_dbgmsg("scan visited %llu blocks in %llums "
+ "(%llu os's, %llu holes, %llu < mintxg, "
+ "%llu in ddt, %llu > maxtxg)",
+ (longlong_t)scn->scn_visited_this_txg,
+ (longlong_t)NSEC2MSEC(gethrtime() -
+ scn->scn_sync_start_time),
+ (longlong_t)scn->scn_objsets_visited_this_txg,
+ (longlong_t)scn->scn_holes_this_txg,
+ (longlong_t)scn->scn_lt_min_this_txg,
+ (longlong_t)scn->scn_ddt_contained_this_txg,
+ (longlong_t)scn->scn_gt_max_this_txg);
+
+ if (!scn->scn_suspending) {
+ ASSERT0(avl_numnodes(&scn->scn_queue));
+ scn->scn_done_txg = tx->tx_txg + 1;
+ if (scn->scn_is_sorted) {
+ scn->scn_checkpointing = B_TRUE;
+ scn->scn_clearing = B_TRUE;
+ }
+ zfs_dbgmsg("scan complete txg %llu",
+ (longlong_t)tx->tx_txg);
+ }
+ } else if (scn->scn_is_sorted && scn->scn_bytes_pending != 0) {
+ ASSERT(scn->scn_clearing);
+
+ /* need to issue scrubbing IOs from per-vdev queues */
+ scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
+ NULL, ZIO_FLAG_CANFAIL);
+ scan_io_queues_run(scn);
+ (void) zio_wait(scn->scn_zio_root);
+ scn->scn_zio_root = NULL;
+
+ /* calculate and dprintf the current memory usage */
+ (void) dsl_scan_should_clear(scn);
+ dsl_scan_update_stats(scn);
+
+ zfs_dbgmsg("scan issued %llu blocks (%llu segs) in %llums "
+ "(avg_block_size = %llu, avg_seg_size = %llu)",
+ (longlong_t)scn->scn_zios_this_txg,
+ (longlong_t)scn->scn_segs_this_txg,
+ (longlong_t)NSEC2MSEC(gethrtime() -
+ scn->scn_sync_start_time),
+ (longlong_t)scn->scn_avg_zio_size_this_txg,
+ (longlong_t)scn->scn_avg_seg_size_this_txg);
+ } else if (scn->scn_done_txg != 0 && scn->scn_done_txg <= tx->tx_txg) {
+ /* Finished with everything. Mark the scrub as complete */
+ zfs_dbgmsg("scan issuing complete txg %llu",
+ (longlong_t)tx->tx_txg);
+ ASSERT3U(scn->scn_done_txg, !=, 0);
+ ASSERT0(spa->spa_scrub_inflight);
+ ASSERT0(scn->scn_bytes_pending);
+ dsl_scan_done(scn, B_TRUE, tx);
+ sync_type = SYNC_MANDATORY;
+ }
+
+ dsl_scan_sync_state(scn, tx, sync_type);
+}
+
+static void
+count_block(dsl_scan_t *scn, zfs_all_blkstats_t *zab, const blkptr_t *bp)
+{
+ int i;
+
+ /*
+ * Don't count embedded bp's, since we already did the work of
+ * scanning these when we scanned the containing block.
+ */
+ if (BP_IS_EMBEDDED(bp))
+ return;
+
+ /*
+ * Update the spa's stats on how many bytes we have issued.
+ * Sequential scrubs create a zio for each DVA of the bp. Each
+ * of these will include all DVAs for repair purposes, but the
+ * zio code will only try the first one unless there is an issue.
+ * Therefore, we should only count the first DVA for these IOs.
+ */
+ if (scn->scn_is_sorted) {
+ atomic_add_64(&scn->scn_dp->dp_spa->spa_scan_pass_issued,
+ DVA_GET_ASIZE(&bp->blk_dva[0]));
+ } else {
+ spa_t *spa = scn->scn_dp->dp_spa;
+
+ for (i = 0; i < BP_GET_NDVAS(bp); i++) {
+ atomic_add_64(&spa->spa_scan_pass_issued,
+ DVA_GET_ASIZE(&bp->blk_dva[i]));
+ }
+ }
+
+ /*
+ * If we resume after a reboot, zab will be NULL; don't record
+ * incomplete stats in that case.
+ */
+ if (zab == NULL)
+ return;
+
+ mutex_enter(&zab->zab_lock);
+
+ for (i = 0; i < 4; i++) {
+ int l = (i < 2) ? BP_GET_LEVEL(bp) : DN_MAX_LEVELS;
+ int t = (i & 1) ? BP_GET_TYPE(bp) : DMU_OT_TOTAL;
+
+ if (t & DMU_OT_NEWTYPE)
+ t = DMU_OT_OTHER;
+ zfs_blkstat_t *zb = &zab->zab_type[l][t];
+ int equal;
+
+ zb->zb_count++;
+ zb->zb_asize += BP_GET_ASIZE(bp);
+ zb->zb_lsize += BP_GET_LSIZE(bp);
+ zb->zb_psize += BP_GET_PSIZE(bp);
+ zb->zb_gangs += BP_COUNT_GANG(bp);
+
+ switch (BP_GET_NDVAS(bp)) {
+ case 2:
+ if (DVA_GET_VDEV(&bp->blk_dva[0]) ==
+ DVA_GET_VDEV(&bp->blk_dva[1]))
+ zb->zb_ditto_2_of_2_samevdev++;
+ break;
+ case 3:
+ equal = (DVA_GET_VDEV(&bp->blk_dva[0]) ==
+ DVA_GET_VDEV(&bp->blk_dva[1])) +
+ (DVA_GET_VDEV(&bp->blk_dva[0]) ==
+ DVA_GET_VDEV(&bp->blk_dva[2])) +
+ (DVA_GET_VDEV(&bp->blk_dva[1]) ==
+ DVA_GET_VDEV(&bp->blk_dva[2]));
+ if (equal == 1)
+ zb->zb_ditto_2_of_3_samevdev++;
+ else if (equal == 3)
+ zb->zb_ditto_3_of_3_samevdev++;
+ break;
+ }
+ }
+
+ mutex_exit(&zab->zab_lock);
+}
+
+static void
+scan_io_queue_insert_impl(dsl_scan_io_queue_t *queue, scan_io_t *sio)
+{
+ avl_index_t idx;
+ int64_t asize = SIO_GET_ASIZE(sio);
+ dsl_scan_t *scn = queue->q_scn;
+
+ ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock));
+
+ if (avl_find(&queue->q_sios_by_addr, sio, &idx) != NULL) {
+ /* block is already scheduled for reading */
+ atomic_add_64(&scn->scn_bytes_pending, -asize);
+ sio_free(sio);
+ return;
+ }
+ avl_insert(&queue->q_sios_by_addr, sio, idx);
+ queue->q_sio_memused += SIO_GET_MUSED(sio);
+ range_tree_add(queue->q_exts_by_addr, SIO_GET_OFFSET(sio), asize);
+}
+
+/*
+ * Given all the info we got from our metadata scanning process, we
+ * construct a scan_io_t and insert it into the scan sorting queue. The
+ * I/O must already be suitable for us to process. This is controlled
+ * by dsl_scan_enqueue().
+ */
+static void
+scan_io_queue_insert(dsl_scan_io_queue_t *queue, const blkptr_t *bp, int dva_i,
+ int zio_flags, const zbookmark_phys_t *zb)
+{
+ dsl_scan_t *scn = queue->q_scn;
+ scan_io_t *sio = sio_alloc(BP_GET_NDVAS(bp));
+
+ ASSERT0(BP_IS_GANG(bp));
+ ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock));
+
+ bp2sio(bp, sio, dva_i);
+ sio->sio_flags = zio_flags;
+ sio->sio_zb = *zb;
+
+ /*
+ * Increment the bytes pending counter now so that we cannot
+ * get an integer underflow if the worker processes the zio
+ * before we would otherwise have incremented the counter.
+ */
+ atomic_add_64(&scn->scn_bytes_pending, SIO_GET_ASIZE(sio));
+
+ scan_io_queue_insert_impl(queue, sio);
+}
+
+/*
+ * Given a set of I/O parameters as discovered by the metadata traversal
+ * process, attempts to place the I/O into the sorted queues (if allowed),
+ * or immediately executes the I/O.
+ */
+static void
+dsl_scan_enqueue(dsl_pool_t *dp, const blkptr_t *bp, int zio_flags,
+ const zbookmark_phys_t *zb)
+{
+ spa_t *spa = dp->dp_spa;
+
+ ASSERT(!BP_IS_EMBEDDED(bp));
+
+ /*
+ * Gang blocks are hard to issue sequentially, so we just issue them
+ * here immediately instead of queuing them.
+ */
+ if (!dp->dp_scan->scn_is_sorted || BP_IS_GANG(bp)) {
+ scan_exec_io(dp, bp, zio_flags, zb, NULL);
+ return;
+ }
+
+ for (int i = 0; i < BP_GET_NDVAS(bp); i++) {
+ dva_t dva;
+ vdev_t *vdev;
+
+ dva = bp->blk_dva[i];
+ vdev = vdev_lookup_top(spa, DVA_GET_VDEV(&dva));
+ ASSERT(vdev != NULL);
+
+ mutex_enter(&vdev->vdev_scan_io_queue_lock);
+ if (vdev->vdev_scan_io_queue == NULL)
+ vdev->vdev_scan_io_queue = scan_io_queue_create(vdev);
+ ASSERT(dp->dp_scan != NULL);
+ scan_io_queue_insert(vdev->vdev_scan_io_queue, bp,
+ i, zio_flags, zb);
+ mutex_exit(&vdev->vdev_scan_io_queue_lock);
+ }
+}
+
+static int
+dsl_scan_scrub_cb(dsl_pool_t *dp,
+ const blkptr_t *bp, const zbookmark_phys_t *zb)
+{
+ dsl_scan_t *scn = dp->dp_scan;
+ spa_t *spa = dp->dp_spa;
+ uint64_t phys_birth = BP_PHYSICAL_BIRTH(bp);
+ size_t psize = BP_GET_PSIZE(bp);
+ boolean_t needs_io = B_FALSE;
+ int zio_flags = ZIO_FLAG_SCAN_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL;
+
+
+ if (phys_birth <= scn->scn_phys.scn_min_txg ||
+ phys_birth >= scn->scn_phys.scn_max_txg) {
+ count_block(scn, dp->dp_blkstats, bp);
+ return (0);
+ }
+
+ /* Embedded BP's have phys_birth==0, so we reject them above. */
+ ASSERT(!BP_IS_EMBEDDED(bp));
+
+ ASSERT(DSL_SCAN_IS_SCRUB_RESILVER(scn));
+ if (scn->scn_phys.scn_func == POOL_SCAN_SCRUB) {
+ zio_flags |= ZIO_FLAG_SCRUB;
+ needs_io = B_TRUE;
+ } else {
+ ASSERT3U(scn->scn_phys.scn_func, ==, POOL_SCAN_RESILVER);
+ zio_flags |= ZIO_FLAG_RESILVER;
+ needs_io = B_FALSE;
+ }
+
+ /* If it's an intent log block, failure is expected. */
+ if (zb->zb_level == ZB_ZIL_LEVEL)
+ zio_flags |= ZIO_FLAG_SPECULATIVE;
+
+ for (int d = 0; d < BP_GET_NDVAS(bp); d++) {
+ const dva_t *dva = &bp->blk_dva[d];
+
+ /*
+ * Keep track of how much data we've examined so that
+ * zpool(8) status can make useful progress reports.
+ */
+ scn->scn_phys.scn_examined += DVA_GET_ASIZE(dva);
+ spa->spa_scan_pass_exam += DVA_GET_ASIZE(dva);
+
+ /* if it's a resilver, this may not be in the target range */
+ if (!needs_io)
+ needs_io = dsl_scan_need_resilver(spa, dva, psize,
+ phys_birth);
+ }
+
+ if (needs_io && !zfs_no_scrub_io) {
+ dsl_scan_enqueue(dp, bp, zio_flags, zb);
+ } else {
+ count_block(scn, dp->dp_blkstats, bp);
+ }
+
+ /* do not relocate this block */
+ return (0);
+}
+
+static void
+dsl_scan_scrub_done(zio_t *zio)
+{
+ spa_t *spa = zio->io_spa;
+ blkptr_t *bp = zio->io_bp;
+ dsl_scan_io_queue_t *queue = zio->io_private;
+
+ abd_free(zio->io_abd);
+
+ if (queue == NULL) {
+ mutex_enter(&spa->spa_scrub_lock);
+ ASSERT3U(spa->spa_scrub_inflight, >=, BP_GET_PSIZE(bp));
+ spa->spa_scrub_inflight -= BP_GET_PSIZE(bp);
+ cv_broadcast(&spa->spa_scrub_io_cv);
+ mutex_exit(&spa->spa_scrub_lock);
+ } else {
+ mutex_enter(&queue->q_vd->vdev_scan_io_queue_lock);
+ ASSERT3U(queue->q_inflight_bytes, >=, BP_GET_PSIZE(bp));
+ queue->q_inflight_bytes -= BP_GET_PSIZE(bp);
+ cv_broadcast(&queue->q_zio_cv);
+ mutex_exit(&queue->q_vd->vdev_scan_io_queue_lock);
+ }
+
+ if (zio->io_error && (zio->io_error != ECKSUM ||
+ !(zio->io_flags & ZIO_FLAG_SPECULATIVE))) {
+ atomic_inc_64(&spa->spa_dsl_pool->dp_scan->scn_phys.scn_errors);
+ }
+}
+
+/*
+ * Given a scanning zio's information, executes the zio. The zio need
+ * not necessarily be sortable; this function simply executes the
+ * zio, no matter what it is. The optional queue argument allows the
+ * caller to specify that they want per top level vdev IO rate limiting
+ * instead of the legacy global limiting.
+ */
+static void
+scan_exec_io(dsl_pool_t *dp, const blkptr_t *bp, int zio_flags,
+ const zbookmark_phys_t *zb, dsl_scan_io_queue_t *queue)
+{
+ spa_t *spa = dp->dp_spa;
+ dsl_scan_t *scn = dp->dp_scan;
+ size_t size = BP_GET_PSIZE(bp);
+ abd_t *data = abd_alloc_for_io(size, B_FALSE);
+
+ ASSERT3U(scn->scn_maxinflight_bytes, >, 0);
+
+ if (queue == NULL) {
+ mutex_enter(&spa->spa_scrub_lock);
+ while (spa->spa_scrub_inflight >= scn->scn_maxinflight_bytes)
+ cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
+ spa->spa_scrub_inflight += BP_GET_PSIZE(bp);
+ mutex_exit(&spa->spa_scrub_lock);
+ } else {
+ kmutex_t *q_lock = &queue->q_vd->vdev_scan_io_queue_lock;
+
+ mutex_enter(q_lock);
+ while (queue->q_inflight_bytes >= queue->q_maxinflight_bytes)
+ cv_wait(&queue->q_zio_cv, q_lock);
+ queue->q_inflight_bytes += BP_GET_PSIZE(bp);
+ mutex_exit(q_lock);
+ }
+
+ count_block(scn, dp->dp_blkstats, bp);
+ zio_nowait(zio_read(scn->scn_zio_root, spa, bp, data, size,
+ dsl_scan_scrub_done, queue, ZIO_PRIORITY_SCRUB, zio_flags, zb));
+}
+
+/*
+ * This is the primary extent sorting algorithm. We balance two parameters:
+ * 1) how many bytes of I/O are in an extent
+ * 2) how well the extent is filled with I/O (as a fraction of its total size)
+ * Since we allow extents to have gaps between their constituent I/Os, it's
+ * possible to have a fairly large extent that contains the same amount of
+ * I/O bytes as a much smaller extent, which just packs the I/O more tightly.
+ * The algorithm sorts based on a score calculated from the extent's size,
+ * the relative fill volume (in %) and a "fill weight" parameter that controls
+ * the split between whether we prefer larger extents or more well populated
+ * extents:
+ *
+ * SCORE = FILL_IN_BYTES + (FILL_IN_PERCENT * FILL_IN_BYTES * FILL_WEIGHT)
+ *
+ * Example:
+ * 1) assume extsz = 64 MiB
+ * 2) assume fill = 32 MiB (extent is half full)
+ * 3) assume fill_weight = 3
+ * 4) SCORE = 32M + (((32M * 100) / 64M) * 3 * 32M) / 100
+ * SCORE = 32M + (50 * 3 * 32M) / 100
+ * SCORE = 32M + (4800M / 100)
+ * SCORE = 32M + 48M
+ * ^ ^
+ * | +--- final total relative fill-based score
+ * +--------- final total fill-based score
+ * SCORE = 80M
+ *
+ * As can be seen, at fill_weight=3, the algorithm is slightly biased towards
+ * extents that are more completely filled (in a 3:2 ratio) vs just larger.
+ * Note that as an optimization, we replace multiplication and division by
+ * 100 with bitshifting by 7 (which effectively multiplies and divides by 128).
+ */
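+/*
+ * Working the same example through the shifted form used below
+ * (fill = 32M, extsz = 64M, fill_weight = 3):
+ * SCORE = 32M + ((((32M << 7) / 64M) * 3 * 32M) >> 7)
+ * = 32M + ((64 * 3 * 32M) >> 7)
+ * = 32M + (6144M >> 7)
+ * = 32M + 48M = 80M
+ * which matches the percentage-based calculation above.
+ */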
+static int
+ext_size_compare(const void *x, const void *y)
+{
+ const range_seg_gap_t *rsa = x, *rsb = y;
+
+ uint64_t sa = rsa->rs_end - rsa->rs_start;
+ uint64_t sb = rsb->rs_end - rsb->rs_start;
+ uint64_t score_a, score_b;
+
+ score_a = rsa->rs_fill + ((((rsa->rs_fill << 7) / sa) *
+ fill_weight * rsa->rs_fill) >> 7);
+ score_b = rsb->rs_fill + ((((rsb->rs_fill << 7) / sb) *
+ fill_weight * rsb->rs_fill) >> 7);
+
+ if (score_a > score_b)
+ return (-1);
+ if (score_a == score_b) {
+ if (rsa->rs_start < rsb->rs_start)
+ return (-1);
+ if (rsa->rs_start == rsb->rs_start)
+ return (0);
+ return (1);
+ }
+ return (1);
+}
+
+/*
+ * Comparator for the q_sios_by_addr tree. Sorting is simply performed
+ * based on LBA-order (from lowest to highest).
+ */
+static int
+sio_addr_compare(const void *x, const void *y)
+{
+ const scan_io_t *a = x, *b = y;
+
+ return (TREE_CMP(SIO_GET_OFFSET(a), SIO_GET_OFFSET(b)));
+}
+
+/* IO queues are created on demand when they are needed. */
+static dsl_scan_io_queue_t *
+scan_io_queue_create(vdev_t *vd)
+{
+ dsl_scan_t *scn = vd->vdev_spa->spa_dsl_pool->dp_scan;
+ dsl_scan_io_queue_t *q = kmem_zalloc(sizeof (*q), KM_SLEEP);
+
+ q->q_scn = scn;
+ q->q_vd = vd;
+ q->q_sio_memused = 0;
+ cv_init(&q->q_zio_cv, NULL, CV_DEFAULT, NULL);
+ q->q_exts_by_addr = range_tree_create_impl(&rt_btree_ops, RANGE_SEG_GAP,
+ &q->q_exts_by_size, 0, 0, ext_size_compare, zfs_scan_max_ext_gap);
+ avl_create(&q->q_sios_by_addr, sio_addr_compare,
+ sizeof (scan_io_t), offsetof(scan_io_t, sio_nodes.sio_addr_node));
+
+ return (q);
+}
+
+/*
+ * Destroys a scan queue and all segments and scan_io_t's contained in it.
+ * No further execution of I/O occurs; anything pending in the queue is
+ * simply freed without being executed.
+ */
+void
+dsl_scan_io_queue_destroy(dsl_scan_io_queue_t *queue)
+{
+ dsl_scan_t *scn = queue->q_scn;
+ scan_io_t *sio;
+ void *cookie = NULL;
+ int64_t bytes_dequeued = 0;
+
+ ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock));
+
+ while ((sio = avl_destroy_nodes(&queue->q_sios_by_addr, &cookie)) !=
+ NULL) {
+ ASSERT(range_tree_contains(queue->q_exts_by_addr,
+ SIO_GET_OFFSET(sio), SIO_GET_ASIZE(sio)));
+ bytes_dequeued += SIO_GET_ASIZE(sio);
+ queue->q_sio_memused -= SIO_GET_MUSED(sio);
+ sio_free(sio);
+ }
+
+ ASSERT0(queue->q_sio_memused);
+ atomic_add_64(&scn->scn_bytes_pending, -bytes_dequeued);
+ range_tree_vacate(queue->q_exts_by_addr, NULL, queue);
+ range_tree_destroy(queue->q_exts_by_addr);
+ avl_destroy(&queue->q_sios_by_addr);
+ cv_destroy(&queue->q_zio_cv);
+
+ kmem_free(queue, sizeof (*queue));
+}
+
+/*
+ * Properly transfers a dsl_scan_io_queue_t from `svd' to `tvd'. This is
+ * called on behalf of vdev_top_transfer when creating or destroying
+ * a mirror vdev due to zpool attach/detach.
+ */
+void
+dsl_scan_io_queue_vdev_xfer(vdev_t *svd, vdev_t *tvd)
+{
+ mutex_enter(&svd->vdev_scan_io_queue_lock);
+ mutex_enter(&tvd->vdev_scan_io_queue_lock);
+
+ VERIFY3P(tvd->vdev_scan_io_queue, ==, NULL);
+ tvd->vdev_scan_io_queue = svd->vdev_scan_io_queue;
+ svd->vdev_scan_io_queue = NULL;
+ if (tvd->vdev_scan_io_queue != NULL)
+ tvd->vdev_scan_io_queue->q_vd = tvd;
+
+ mutex_exit(&tvd->vdev_scan_io_queue_lock);
+ mutex_exit(&svd->vdev_scan_io_queue_lock);
+}
+
+static void
+scan_io_queues_destroy(dsl_scan_t *scn)
+{
+ vdev_t *rvd = scn->scn_dp->dp_spa->spa_root_vdev;
+
+ for (uint64_t i = 0; i < rvd->vdev_children; i++) {
+ vdev_t *tvd = rvd->vdev_child[i];
+
+ mutex_enter(&tvd->vdev_scan_io_queue_lock);
+ if (tvd->vdev_scan_io_queue != NULL)
+ dsl_scan_io_queue_destroy(tvd->vdev_scan_io_queue);
+ tvd->vdev_scan_io_queue = NULL;
+ mutex_exit(&tvd->vdev_scan_io_queue_lock);
+ }
+}
+
+static void
+dsl_scan_freed_dva(spa_t *spa, const blkptr_t *bp, int dva_i)
+{
+ dsl_pool_t *dp = spa->spa_dsl_pool;
+ dsl_scan_t *scn = dp->dp_scan;
+ vdev_t *vdev;
+ kmutex_t *q_lock;
+ dsl_scan_io_queue_t *queue;
+ scan_io_t *srch_sio, *sio;
+ avl_index_t idx;
+ uint64_t start, size;
+
+ vdev = vdev_lookup_top(spa, DVA_GET_VDEV(&bp->blk_dva[dva_i]));
+ ASSERT(vdev != NULL);
+ q_lock = &vdev->vdev_scan_io_queue_lock;
+ queue = vdev->vdev_scan_io_queue;
+
+ mutex_enter(q_lock);
+ if (queue == NULL) {
+ mutex_exit(q_lock);
+ return;
+ }
+
+ srch_sio = sio_alloc(BP_GET_NDVAS(bp));
+ bp2sio(bp, srch_sio, dva_i);
+ start = SIO_GET_OFFSET(srch_sio);
+ size = SIO_GET_ASIZE(srch_sio);
+
+ /*
+ * We can find the zio in two states:
+ * 1) Cold, just sitting in the queue of zio's to be issued at
+ * some point in the future. In this case, all we do is
+ * remove the zio from the q_sios_by_addr tree, decrement
+ * its data volume from the containing range_seg_t and
+ * resort the q_exts_by_size tree to reflect that the
+ * range_seg_t has lost some of its 'fill'. We don't shorten
+ * the range_seg_t - this is usually rare enough not to be
+ * worth the extra hassle of trying to keep track of precise
+ * extent boundaries.
+ * 2) Hot, where the zio is currently in-flight in
+ * dsl_scan_issue_ios. In this case, we can't simply
+ * reach in and stop the in-flight zio's, so we instead
+ * block the caller. Eventually, dsl_scan_issue_ios will
+ * be done with issuing the zio's it gathered and will
+ * signal us.
+ */
+ sio = avl_find(&queue->q_sios_by_addr, srch_sio, &idx);
+ sio_free(srch_sio);
+
+ if (sio != NULL) {
+ int64_t asize = SIO_GET_ASIZE(sio);
+ blkptr_t tmpbp;
+
+ /* Got it while it was cold in the queue */
+ ASSERT3U(start, ==, SIO_GET_OFFSET(sio));
+ ASSERT3U(size, ==, asize);
+ avl_remove(&queue->q_sios_by_addr, sio);
+ queue->q_sio_memused -= SIO_GET_MUSED(sio);
+
+ ASSERT(range_tree_contains(queue->q_exts_by_addr, start, size));
+ range_tree_remove_fill(queue->q_exts_by_addr, start, size);
+
+ /*
+ * We only update scn_bytes_pending in the cold path,
+ * otherwise it will already have been accounted for as
+ * part of the zio's execution.
+ */
+ atomic_add_64(&scn->scn_bytes_pending, -asize);
+
+ /* count the block as though we issued it */
+ sio2bp(sio, &tmpbp);
+ count_block(scn, dp->dp_blkstats, &tmpbp);
+
+ sio_free(sio);
+ }
+ mutex_exit(q_lock);
+}
+
+/*
+ * Callback invoked when a zio_free() zio is executing. This needs to be
+ * intercepted to prevent the zio from deallocating a particular portion
+ * of disk space that could then be reallocated and written to while we
+ * still have it queued up for processing.
+ */
+void
+dsl_scan_freed(spa_t *spa, const blkptr_t *bp)
+{
+ dsl_pool_t *dp = spa->spa_dsl_pool;
+ dsl_scan_t *scn = dp->dp_scan;
+
+ ASSERT(!BP_IS_EMBEDDED(bp));
+ ASSERT(scn != NULL);
+ if (!dsl_scan_is_running(scn))
+ return;
+
+ for (int i = 0; i < BP_GET_NDVAS(bp); i++)
+ dsl_scan_freed_dva(spa, bp, i);
+}
+
+/*
+ * Check if a vdev needs resilvering (non-empty DTL); if so and a resilver
+ * has not started, start one. Otherwise, only restart if the max txg in the
+ * DTL range is greater than the max txg in the current scan. If the DTL max
+ * is not greater than the scan max, then the vdev has not missed any new
+ * data since the resilver started, so a restart is not needed.
+ */
+void
+dsl_scan_assess_vdev(dsl_pool_t *dp, vdev_t *vd)
+{
+ uint64_t min, max;
+
+ if (!vdev_resilver_needed(vd, &min, &max))
+ return;
+
+ if (!dsl_scan_resilvering(dp)) {
+ spa_async_request(dp->dp_spa, SPA_ASYNC_RESILVER);
+ return;
+ }
+
+ if (max <= dp->dp_scan->scn_phys.scn_max_txg)
+ return;
+
+ /* restart is needed, check if it can be deferred */
+ if (spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_RESILVER_DEFER))
+ vdev_defer_resilver(vd);
+ else
+ spa_async_request(dp->dp_spa, SPA_ASYNC_RESILVER);
+}
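+
+/*
+ * Illustrative note (not part of the upstream source): a quick worked
+ * example of the restart decision above. Suppose the current scan has
+ * scn_max_txg == 100. If vdev_resilver_needed() reports a DTL range
+ * whose max is 90, the vdev has missed nothing newer than the scan
+ * already covers, so we return without restarting. If the DTL max is
+ * 120, a restart is needed; with the RESILVER_DEFER feature enabled it
+ * is deferred via vdev_defer_resilver(), otherwise SPA_ASYNC_RESILVER
+ * is requested immediately.
+ */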
+
+/* BEGIN CSTYLED */
+ZFS_MODULE_PARAM(zfs, zfs_, scan_vdev_limit, ULONG, ZMOD_RW,
+ "Max bytes in flight per leaf vdev for scrubs and resilvers");
+
+ZFS_MODULE_PARAM(zfs, zfs_, scrub_min_time_ms, INT, ZMOD_RW,
+ "Min millisecs to scrub per txg");
+
+ZFS_MODULE_PARAM(zfs, zfs_, obsolete_min_time_ms, INT, ZMOD_RW,
+ "Min millisecs to obsolete per txg");
+
+ZFS_MODULE_PARAM(zfs, zfs_, free_min_time_ms, INT, ZMOD_RW,
+ "Min millisecs to free per txg");
+
+ZFS_MODULE_PARAM(zfs, zfs_, resilver_min_time_ms, INT, ZMOD_RW,
+ "Min millisecs to resilver per txg");
+
+ZFS_MODULE_PARAM(zfs, zfs_, scan_suspend_progress, INT, ZMOD_RW,
+ "Set to prevent scans from progressing");
+
+ZFS_MODULE_PARAM(zfs, zfs_, no_scrub_io, INT, ZMOD_RW,
+ "Set to disable scrub I/O");
+
+ZFS_MODULE_PARAM(zfs, zfs_, no_scrub_prefetch, INT, ZMOD_RW,
+ "Set to disable scrub prefetching");
+
+ZFS_MODULE_PARAM(zfs, zfs_, async_block_max_blocks, ULONG, ZMOD_RW,
+ "Max number of blocks freed in one txg");
+
+ZFS_MODULE_PARAM(zfs, zfs_, max_async_dedup_frees, ULONG, ZMOD_RW,
+ "Max number of dedup blocks freed in one txg");
+
+ZFS_MODULE_PARAM(zfs, zfs_, free_bpobj_enabled, INT, ZMOD_RW,
+ "Enable processing of the free_bpobj");
+
+ZFS_MODULE_PARAM(zfs, zfs_, scan_mem_lim_fact, INT, ZMOD_RW,
+ "Fraction of RAM for scan hard limit");
+
+ZFS_MODULE_PARAM(zfs, zfs_, scan_issue_strategy, INT, ZMOD_RW,
+ "IO issuing strategy during scrubbing. "
+ "0 = default, 1 = LBA, 2 = size");
+
+ZFS_MODULE_PARAM(zfs, zfs_, scan_legacy, INT, ZMOD_RW,
+ "Scrub using legacy non-sequential method");
+
+ZFS_MODULE_PARAM(zfs, zfs_, scan_checkpoint_intval, INT, ZMOD_RW,
+ "Scan progress on-disk checkpointing interval");
+
+ZFS_MODULE_PARAM(zfs, zfs_, scan_max_ext_gap, ULONG, ZMOD_RW,
+ "Max gap in bytes between sequential scrub / resilver I/Os");
+
+ZFS_MODULE_PARAM(zfs, zfs_, scan_mem_lim_soft_fact, INT, ZMOD_RW,
+ "Fraction of hard limit used as soft limit");
+
+ZFS_MODULE_PARAM(zfs, zfs_, scan_strict_mem_lim, INT, ZMOD_RW,
+ "Tunable to attempt to reduce lock contention");
+
+ZFS_MODULE_PARAM(zfs, zfs_, scan_fill_weight, INT, ZMOD_RW,
+ "Tunable to adjust bias towards more filled segments during scans");
+
+ZFS_MODULE_PARAM(zfs, zfs_, resilver_disable_defer, INT, ZMOD_RW,
+ "Process all resilvers immediately");
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/zfs/dsl_synctask.c b/sys/contrib/openzfs/module/zfs/dsl_synctask.c
new file mode 100644
index 000000000000..148e8fff2437
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/dsl_synctask.c
@@ -0,0 +1,257 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
+ */
+
+#include <sys/dmu.h>
+#include <sys/dmu_tx.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_synctask.h>
+#include <sys/metaslab.h>
+
+#define DST_AVG_BLKSHIFT 14
+
+/* ARGSUSED */
+static int
+dsl_null_checkfunc(void *arg, dmu_tx_t *tx)
+{
+ return (0);
+}
+
+static int
+dsl_sync_task_common(const char *pool, dsl_checkfunc_t *checkfunc,
+ dsl_syncfunc_t *syncfunc, dsl_sigfunc_t *sigfunc, void *arg,
+ int blocks_modified, zfs_space_check_t space_check, boolean_t early)
+{
+ spa_t *spa;
+ dmu_tx_t *tx;
+ int err;
+ dsl_sync_task_t dst = { { { NULL } } };
+ dsl_pool_t *dp;
+
+ err = spa_open(pool, &spa, FTAG);
+ if (err != 0)
+ return (err);
+ dp = spa_get_dsl(spa);
+
+top:
+ tx = dmu_tx_create_dd(dp->dp_mos_dir);
+ VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
+
+ dst.dst_pool = dp;
+ dst.dst_txg = dmu_tx_get_txg(tx);
+ dst.dst_space = blocks_modified << DST_AVG_BLKSHIFT;
+ dst.dst_space_check = space_check;
+ dst.dst_checkfunc = checkfunc != NULL ? checkfunc : dsl_null_checkfunc;
+ dst.dst_syncfunc = syncfunc;
+ dst.dst_arg = arg;
+ dst.dst_error = 0;
+ dst.dst_nowaiter = B_FALSE;
+
+ dsl_pool_config_enter(dp, FTAG);
+ err = dst.dst_checkfunc(arg, tx);
+ dsl_pool_config_exit(dp, FTAG);
+
+ if (err != 0) {
+ dmu_tx_commit(tx);
+ spa_close(spa, FTAG);
+ return (err);
+ }
+
+ txg_list_t *task_list = (early) ?
+ &dp->dp_early_sync_tasks : &dp->dp_sync_tasks;
+ VERIFY(txg_list_add_tail(task_list, &dst, dst.dst_txg));
+
+ dmu_tx_commit(tx);
+
+ if (sigfunc != NULL && txg_wait_synced_sig(dp, dst.dst_txg)) {
+ /* current contract is to call func once */
+ sigfunc(arg, tx);
+ sigfunc = NULL; /* in case we're performing an EAGAIN retry */
+ }
+ txg_wait_synced(dp, dst.dst_txg);
+
+ if (dst.dst_error == EAGAIN) {
+ txg_wait_synced(dp, dst.dst_txg + TXG_DEFER_SIZE);
+ goto top;
+ }
+
+ spa_close(spa, FTAG);
+ return (dst.dst_error);
+}
+
+/*
+ * Called from open context to perform a callback in syncing context. Waits
+ * for the operation to complete.
+ *
+ * The checkfunc will be called from open context as a preliminary check
+ * which can quickly fail. If it succeeds, it will be called again from
+ * syncing context. The checkfunc should generally be designed to work
+ * properly in either context, but if necessary it can check
+ * dmu_tx_is_syncing(tx).
+ *
+ * The synctask infrastructure enforces proper locking strategy with respect
+ * to the dp_config_rwlock -- the lock will always be held when the callbacks
+ * are called. It will be held for read during the open-context (preliminary)
+ * call to the checkfunc, and then held for write from syncing context during
+ * the calls to the check and sync funcs.
+ *
+ * A dataset or pool name can be passed as the first argument. Typically,
+ * the check func will hold, check the return value of the hold, and then
+ * release the dataset. The sync func will VERIFY0(hold()) the dataset.
+ * This is safe because no changes can be made between the check and sync funcs,
+ * and the sync func will only be called if the check func successfully opened
+ * the dataset.
+ */
+int
+dsl_sync_task(const char *pool, dsl_checkfunc_t *checkfunc,
+ dsl_syncfunc_t *syncfunc, void *arg,
+ int blocks_modified, zfs_space_check_t space_check)
+{
+ return (dsl_sync_task_common(pool, checkfunc, syncfunc, NULL, arg,
+ blocks_modified, space_check, B_FALSE));
+}
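+
+/*
+ * Illustrative sketch (not part of the upstream source): how a caller
+ * typically pairs a checkfunc and a syncfunc with dsl_sync_task(). The
+ * names my_feature_check(), my_feature_sync() and my_feature_arg_t are
+ * hypothetical.
+ *
+ *	static int
+ *	my_feature_check(void *arg, dmu_tx_t *tx)
+ *	{
+ *		my_feature_arg_t *mfa = arg;
+ *		dsl_dataset_t *ds;
+ *		int err;
+ *
+ *		err = dsl_dataset_hold(dmu_tx_pool(tx), mfa->mfa_name,
+ *		    FTAG, &ds);
+ *		if (err != 0)
+ *			return (err);
+ *		// validate the request against ds here
+ *		dsl_dataset_rele(ds, FTAG);
+ *		return (0);
+ *	}
+ *
+ *	static void
+ *	my_feature_sync(void *arg, dmu_tx_t *tx)
+ *	{
+ *		my_feature_arg_t *mfa = arg;
+ *		dsl_dataset_t *ds;
+ *
+ *		VERIFY0(dsl_dataset_hold(dmu_tx_pool(tx), mfa->mfa_name,
+ *		    FTAG, &ds));
+ *		// apply the change; it must not fail at this point
+ *		dsl_dataset_rele(ds, FTAG);
+ *	}
+ *
+ *	my_feature_arg_t mfa = { .mfa_name = "tank/fs" };
+ *	error = dsl_sync_task("tank", my_feature_check, my_feature_sync,
+ *	    &mfa, 1, ZFS_SPACE_CHECK_RESERVED);
+ */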
+
+/*
+ * An early synctask works exactly like a standard synctask, with one
+ * important difference in the way it is handled in syncing context. Standard
+ * synctasks run after we've written out all the dirty blocks of dirty
+ * datasets. Early synctasks are executed before writing out any dirty data,
+ * and thus before standard synctasks.
+ *
+ * For that reason, early synctasks can affect the process of writing dirty
+ * changes to disk for the txg that they run and should be used with caution.
+ * In addition, early synctasks should not dirty any metaslabs as this would
+ * invalidate the precondition/invariant for subsequent early synctasks.
+ * [see dsl_pool_sync() and dsl_early_sync_task_verify()]
+ */
+int
+dsl_early_sync_task(const char *pool, dsl_checkfunc_t *checkfunc,
+ dsl_syncfunc_t *syncfunc, void *arg,
+ int blocks_modified, zfs_space_check_t space_check)
+{
+ return (dsl_sync_task_common(pool, checkfunc, syncfunc, NULL, arg,
+ blocks_modified, space_check, B_TRUE));
+}
+
+/*
+ * A standard synctask that can be interrupted from a signal. The sigfunc
+ * is called once if a signal occurred while waiting for the task to sync.
+ */
+int
+dsl_sync_task_sig(const char *pool, dsl_checkfunc_t *checkfunc,
+ dsl_syncfunc_t *syncfunc, dsl_sigfunc_t *sigfunc, void *arg,
+ int blocks_modified, zfs_space_check_t space_check)
+{
+ return (dsl_sync_task_common(pool, checkfunc, syncfunc, sigfunc, arg,
+ blocks_modified, space_check, B_FALSE));
+}
+
+static void
+dsl_sync_task_nowait_common(dsl_pool_t *dp, dsl_syncfunc_t *syncfunc, void *arg,
+ dmu_tx_t *tx, boolean_t early)
+{
+ dsl_sync_task_t *dst = kmem_zalloc(sizeof (*dst), KM_SLEEP);
+
+ dst->dst_pool = dp;
+ dst->dst_txg = dmu_tx_get_txg(tx);
+ dst->dst_space_check = ZFS_SPACE_CHECK_NONE;
+ dst->dst_checkfunc = dsl_null_checkfunc;
+ dst->dst_syncfunc = syncfunc;
+ dst->dst_arg = arg;
+ dst->dst_error = 0;
+ dst->dst_nowaiter = B_TRUE;
+
+ txg_list_t *task_list = (early) ?
+ &dp->dp_early_sync_tasks : &dp->dp_sync_tasks;
+ VERIFY(txg_list_add_tail(task_list, dst, dst->dst_txg));
+}
+
+void
+dsl_sync_task_nowait(dsl_pool_t *dp, dsl_syncfunc_t *syncfunc, void *arg,
+ dmu_tx_t *tx)
+{
+ dsl_sync_task_nowait_common(dp, syncfunc, arg, tx, B_FALSE);
+}
+
+void
+dsl_early_sync_task_nowait(dsl_pool_t *dp, dsl_syncfunc_t *syncfunc, void *arg,
+ dmu_tx_t *tx)
+{
+ dsl_sync_task_nowait_common(dp, syncfunc, arg, tx, B_TRUE);
+}
+
+/*
+ * Called in syncing context to execute the synctask.
+ */
+void
+dsl_sync_task_sync(dsl_sync_task_t *dst, dmu_tx_t *tx)
+{
+ dsl_pool_t *dp = dst->dst_pool;
+
+ ASSERT0(dst->dst_error);
+
+ /*
+ * Check for sufficient space.
+ *
+ * When the sync task was created, the caller specified the
+ * type of space checking required. See the comment in
+ * zfs_space_check_t for details on the semantics of each
+ * type of space checking.
+ *
+ * We just check against what's on-disk; we don't want any
+ * in-flight accounting to get in our way, because open context
+ * may have already used up various in-core limits
+ * (arc_tempreserve, dsl_pool_tempreserve).
+ */
+ if (dst->dst_space_check != ZFS_SPACE_CHECK_NONE) {
+ uint64_t quota = dsl_pool_unreserved_space(dp,
+ dst->dst_space_check);
+ uint64_t used = dsl_dir_phys(dp->dp_root_dir)->dd_used_bytes;
+
+ /* MOS space is triple-dittoed, so we multiply by 3. */
+ if (used + dst->dst_space * 3 > quota) {
+ dst->dst_error = SET_ERROR(ENOSPC);
+ if (dst->dst_nowaiter)
+ kmem_free(dst, sizeof (*dst));
+ return;
+ }
+ }
+
+ /*
+ * Check for errors by calling checkfunc.
+ */
+ rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);
+ dst->dst_error = dst->dst_checkfunc(dst->dst_arg, tx);
+ if (dst->dst_error == 0)
+ dst->dst_syncfunc(dst->dst_arg, tx);
+ rrw_exit(&dp->dp_config_rwlock, FTAG);
+ if (dst->dst_nowaiter)
+ kmem_free(dst, sizeof (*dst));
+}
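+
+/*
+ * Illustrative note (not part of the upstream source): with
+ * DST_AVG_BLKSHIFT == 14, a caller that passes blocks_modified == 4
+ * reserves dst_space = 4 << 14 = 64 KiB. Because MOS space is
+ * triple-dittoed, the check above compares used + 3 * 64 KiB = 192 KiB
+ * of additional MOS usage against the unreserved quota for the
+ * requested space-check type.
+ */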
+
+#if defined(_KERNEL)
+EXPORT_SYMBOL(dsl_sync_task);
+EXPORT_SYMBOL(dsl_sync_task_nowait);
+#endif
diff --git a/sys/contrib/openzfs/module/zfs/dsl_userhold.c b/sys/contrib/openzfs/module/zfs/dsl_userhold.c
new file mode 100644
index 000000000000..75d153194a00
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/dsl_userhold.c
@@ -0,0 +1,691 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2013 Steven Hartland. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/dsl_userhold.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_destroy.h>
+#include <sys/dsl_synctask.h>
+#include <sys/dmu_tx.h>
+#include <sys/zfs_onexit.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_dir.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zap.h>
+
+typedef struct dsl_dataset_user_hold_arg {
+ nvlist_t *dduha_holds;
+ nvlist_t *dduha_chkholds;
+ nvlist_t *dduha_errlist;
+ minor_t dduha_minor;
+} dsl_dataset_user_hold_arg_t;
+
+/*
+ * If you add new checks here, you may need to add additional checks to the
+ * "temporary" case in snapshot_check() in dmu_objset.c.
+ */
+int
+dsl_dataset_user_hold_check_one(dsl_dataset_t *ds, const char *htag,
+ boolean_t temphold, dmu_tx_t *tx)
+{
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ objset_t *mos = dp->dp_meta_objset;
+ int error = 0;
+
+ ASSERT(dsl_pool_config_held(dp));
+
+ if (strlen(htag) > MAXNAMELEN)
+ return (SET_ERROR(E2BIG));
+ /* Tempholds have a more restricted length */
+ if (temphold && strlen(htag) + MAX_TAG_PREFIX_LEN >= MAXNAMELEN)
+ return (SET_ERROR(E2BIG));
+
+ /* tags must be unique (if ds already exists) */
+ if (ds != NULL && dsl_dataset_phys(ds)->ds_userrefs_obj != 0) {
+ uint64_t value;
+
+ error = zap_lookup(mos, dsl_dataset_phys(ds)->ds_userrefs_obj,
+ htag, 8, 1, &value);
+ if (error == 0)
+ error = SET_ERROR(EEXIST);
+ else if (error == ENOENT)
+ error = 0;
+ }
+
+ return (error);
+}
+
+static int
+dsl_dataset_user_hold_check(void *arg, dmu_tx_t *tx)
+{
+ dsl_dataset_user_hold_arg_t *dduha = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ nvlist_t *tmp_holds;
+
+ if (spa_version(dp->dp_spa) < SPA_VERSION_USERREFS)
+ return (SET_ERROR(ENOTSUP));
+
+ if (!dmu_tx_is_syncing(tx))
+ return (0);
+
+ /*
+ * Ensure the list has no duplicates by copying name/values from
+ * non-unique dduha_holds to unique tmp_holds, and comparing counts.
+ */
+ tmp_holds = fnvlist_alloc();
+ for (nvpair_t *pair = nvlist_next_nvpair(dduha->dduha_holds, NULL);
+ pair != NULL; pair = nvlist_next_nvpair(dduha->dduha_holds, pair)) {
+ size_t len = strlen(nvpair_name(pair)) +
+ strlen(fnvpair_value_string(pair));
+ char *nameval = kmem_zalloc(len + 2, KM_SLEEP);
+ (void) strlcpy(nameval, nvpair_name(pair), len + 2);
+ (void) strlcat(nameval, "@", len + 2);
+ (void) strlcat(nameval, fnvpair_value_string(pair), len + 2);
+ fnvlist_add_string(tmp_holds, nameval, "");
+ kmem_free(nameval, len + 2);
+ }
+ size_t tmp_count = fnvlist_num_pairs(tmp_holds);
+ fnvlist_free(tmp_holds);
+ if (tmp_count != fnvlist_num_pairs(dduha->dduha_holds))
+ return (SET_ERROR(EEXIST));
+ for (nvpair_t *pair = nvlist_next_nvpair(dduha->dduha_holds, NULL);
+ pair != NULL; pair = nvlist_next_nvpair(dduha->dduha_holds, pair)) {
+ dsl_dataset_t *ds;
+ int error = 0;
+ char *htag, *name;
+
+ /* must be a snapshot */
+ name = nvpair_name(pair);
+ if (strchr(name, '@') == NULL)
+ error = SET_ERROR(EINVAL);
+
+ if (error == 0)
+ error = nvpair_value_string(pair, &htag);
+
+ if (error == 0)
+ error = dsl_dataset_hold(dp, name, FTAG, &ds);
+
+ if (error == 0) {
+ error = dsl_dataset_user_hold_check_one(ds, htag,
+ dduha->dduha_minor != 0, tx);
+ dsl_dataset_rele(ds, FTAG);
+ }
+
+ if (error == 0) {
+ fnvlist_add_string(dduha->dduha_chkholds, name, htag);
+ } else {
+ /*
+ * We register ENOENT errors so they can be correctly
+ * reported if needed, such as when all holds fail.
+ */
+ fnvlist_add_int32(dduha->dduha_errlist, name, error);
+ if (error != ENOENT)
+ return (error);
+ }
+ }
+
+ return (0);
+}
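+
+/*
+ * Illustrative note (not part of the upstream source): the duplicate
+ * check above works because tmp_holds is allocated with unique names.
+ * For example, if dduha_holds contains the pair ("pool/fs@snap",
+ * "mytag") twice, both copies collapse into the single tmp_holds key
+ * "pool/fs@snap@mytag", so fnvlist_num_pairs(tmp_holds) comes up short
+ * of fnvlist_num_pairs(dduha_holds) and EEXIST is returned.
+ */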
+
+
+static void
+dsl_dataset_user_hold_sync_one_impl(nvlist_t *tmpholds, dsl_dataset_t *ds,
+ const char *htag, minor_t minor, uint64_t now, dmu_tx_t *tx)
+{
+ dsl_pool_t *dp = ds->ds_dir->dd_pool;
+ objset_t *mos = dp->dp_meta_objset;
+ uint64_t zapobj;
+
+ ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock));
+
+ if (dsl_dataset_phys(ds)->ds_userrefs_obj == 0) {
+ /*
+ * This is the first user hold for this dataset. Create
+ * the userrefs zap object.
+ */
+ dmu_buf_will_dirty(ds->ds_dbuf, tx);
+ zapobj = dsl_dataset_phys(ds)->ds_userrefs_obj =
+ zap_create(mos, DMU_OT_USERREFS, DMU_OT_NONE, 0, tx);
+ } else {
+ zapobj = dsl_dataset_phys(ds)->ds_userrefs_obj;
+ }
+ ds->ds_userrefs++;
+
+ VERIFY0(zap_add(mos, zapobj, htag, 8, 1, &now, tx));
+
+ if (minor != 0) {
+ char name[MAXNAMELEN];
+ nvlist_t *tags;
+
+ VERIFY0(dsl_pool_user_hold(dp, ds->ds_object,
+ htag, now, tx));
+ (void) snprintf(name, sizeof (name), "%llx",
+ (u_longlong_t)ds->ds_object);
+
+ if (nvlist_lookup_nvlist(tmpholds, name, &tags) != 0) {
+ tags = fnvlist_alloc();
+ fnvlist_add_boolean(tags, htag);
+ fnvlist_add_nvlist(tmpholds, name, tags);
+ fnvlist_free(tags);
+ } else {
+ fnvlist_add_boolean(tags, htag);
+ }
+ }
+
+ spa_history_log_internal_ds(ds, "hold", tx,
+ "tag=%s temp=%d refs=%llu",
+ htag, minor != 0, (u_longlong_t)ds->ds_userrefs);
+}
+
+typedef struct zfs_hold_cleanup_arg {
+ char zhca_spaname[ZFS_MAX_DATASET_NAME_LEN];
+ uint64_t zhca_spa_load_guid;
+ nvlist_t *zhca_holds;
+} zfs_hold_cleanup_arg_t;
+
+static void
+dsl_dataset_user_release_onexit(void *arg)
+{
+ zfs_hold_cleanup_arg_t *ca = arg;
+ spa_t *spa;
+ int error;
+
+ error = spa_open(ca->zhca_spaname, &spa, FTAG);
+ if (error != 0) {
+ zfs_dbgmsg("couldn't release holds on pool=%s "
+ "because pool is no longer loaded",
+ ca->zhca_spaname);
+ return;
+ }
+ if (spa_load_guid(spa) != ca->zhca_spa_load_guid) {
+ zfs_dbgmsg("couldn't release holds on pool=%s "
+ "because pool is no longer loaded (guid doesn't match)",
+ ca->zhca_spaname);
+ spa_close(spa, FTAG);
+ return;
+ }
+
+ (void) dsl_dataset_user_release_tmp(spa_get_dsl(spa), ca->zhca_holds);
+ fnvlist_free(ca->zhca_holds);
+ kmem_free(ca, sizeof (zfs_hold_cleanup_arg_t));
+ spa_close(spa, FTAG);
+}
+
+static void
+dsl_onexit_hold_cleanup(spa_t *spa, nvlist_t *holds, minor_t minor)
+{
+ zfs_hold_cleanup_arg_t *ca;
+
+ if (minor == 0 || nvlist_empty(holds)) {
+ fnvlist_free(holds);
+ return;
+ }
+
+ ASSERT(spa != NULL);
+ ca = kmem_alloc(sizeof (*ca), KM_SLEEP);
+
+ (void) strlcpy(ca->zhca_spaname, spa_name(spa),
+ sizeof (ca->zhca_spaname));
+ ca->zhca_spa_load_guid = spa_load_guid(spa);
+ ca->zhca_holds = holds;
+ VERIFY0(zfs_onexit_add_cb(minor,
+ dsl_dataset_user_release_onexit, ca, NULL));
+}
+
+void
+dsl_dataset_user_hold_sync_one(dsl_dataset_t *ds, const char *htag,
+ minor_t minor, uint64_t now, dmu_tx_t *tx)
+{
+ nvlist_t *tmpholds;
+
+ if (minor != 0)
+ tmpholds = fnvlist_alloc();
+ else
+ tmpholds = NULL;
+ dsl_dataset_user_hold_sync_one_impl(tmpholds, ds, htag, minor, now, tx);
+ dsl_onexit_hold_cleanup(dsl_dataset_get_spa(ds), tmpholds, minor);
+}
+
+static void
+dsl_dataset_user_hold_sync(void *arg, dmu_tx_t *tx)
+{
+ dsl_dataset_user_hold_arg_t *dduha = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ nvlist_t *tmpholds;
+ uint64_t now = gethrestime_sec();
+
+ if (dduha->dduha_minor != 0)
+ tmpholds = fnvlist_alloc();
+ else
+ tmpholds = NULL;
+ for (nvpair_t *pair = nvlist_next_nvpair(dduha->dduha_chkholds, NULL);
+ pair != NULL;
+ pair = nvlist_next_nvpair(dduha->dduha_chkholds, pair)) {
+ dsl_dataset_t *ds;
+
+ VERIFY0(dsl_dataset_hold(dp, nvpair_name(pair), FTAG, &ds));
+ dsl_dataset_user_hold_sync_one_impl(tmpholds, ds,
+ fnvpair_value_string(pair), dduha->dduha_minor, now, tx);
+ dsl_dataset_rele(ds, FTAG);
+ }
+ dsl_onexit_hold_cleanup(dp->dp_spa, tmpholds, dduha->dduha_minor);
+}
+
+/*
+ * The full semantics of this function are described in the comment above
+ * lzc_hold().
+ *
+ * To summarize:
+ * holds is nvl of snapname -> holdname
+ * errlist will be filled in with snapname -> error
+ *
+ * The snapshots must all be in the same pool.
+ *
+ * Holds for snapshots that don't exist will be skipped.
+ *
+ * If none of the snapshots for requested holds exist then ENOENT will be
+ * returned.
+ *
+ * If cleanup_minor is not 0, the holds will be temporary and will be cleaned
+ * up when the process exits.
+ *
+ * On success all the holds, for snapshots that existed, will be created and 0
+ * will be returned.
+ *
+ * On failure no holds will be created, the errlist will be filled in,
+ * and an errno will be returned.
+ *
+ * In all cases the errlist will contain entries for holds where the snapshot
+ * didn't exist.
+ */
+int
+dsl_dataset_user_hold(nvlist_t *holds, minor_t cleanup_minor, nvlist_t *errlist)
+{
+ dsl_dataset_user_hold_arg_t dduha;
+ nvpair_t *pair;
+ int ret;
+
+ pair = nvlist_next_nvpair(holds, NULL);
+ if (pair == NULL)
+ return (0);
+
+ dduha.dduha_holds = holds;
+ /* chkholds can have non-unique name */
+ VERIFY(0 == nvlist_alloc(&dduha.dduha_chkholds, 0, KM_SLEEP));
+ dduha.dduha_errlist = errlist;
+ dduha.dduha_minor = cleanup_minor;
+
+ ret = dsl_sync_task(nvpair_name(pair), dsl_dataset_user_hold_check,
+ dsl_dataset_user_hold_sync, &dduha,
+ fnvlist_num_pairs(holds), ZFS_SPACE_CHECK_RESERVED);
+ fnvlist_free(dduha.dduha_chkholds);
+
+ return (ret);
+}
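+
+/*
+ * Illustrative sketch (not part of the upstream source): building the
+ * holds nvlist described above and requesting two permanent holds in a
+ * single call. The snapshot and tag names are hypothetical.
+ *
+ *	nvlist_t *holds = fnvlist_alloc();
+ *	nvlist_t *errlist = fnvlist_alloc();
+ *
+ *	fnvlist_add_string(holds, "tank/fs@backup", "send-2021");
+ *	fnvlist_add_string(holds, "tank/fs@hourly", "send-2021");
+ *	error = dsl_dataset_user_hold(holds, 0, errlist);
+ *	// error == ENOENT only if none of the snapshots exist;
+ *	// errlist maps any missing snapshot name to its errno.
+ *	fnvlist_free(errlist);
+ *	fnvlist_free(holds);
+ */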
+
+typedef int (dsl_holdfunc_t)(dsl_pool_t *dp, const char *name, void *tag,
+ dsl_dataset_t **dsp);
+
+typedef struct dsl_dataset_user_release_arg {
+ dsl_holdfunc_t *ddura_holdfunc;
+ nvlist_t *ddura_holds;
+ nvlist_t *ddura_todelete;
+ nvlist_t *ddura_errlist;
+ nvlist_t *ddura_chkholds;
+} dsl_dataset_user_release_arg_t;
+
+/* Place a dataset hold on the snapshot identified by passed dsobj string */
+static int
+dsl_dataset_hold_obj_string(dsl_pool_t *dp, const char *dsobj, void *tag,
+ dsl_dataset_t **dsp)
+{
+ return (dsl_dataset_hold_obj(dp, zfs_strtonum(dsobj, NULL), tag, dsp));
+}
+
+static int
+dsl_dataset_user_release_check_one(dsl_dataset_user_release_arg_t *ddura,
+ dsl_dataset_t *ds, nvlist_t *holds, const char *snapname)
+{
+ uint64_t zapobj;
+ nvlist_t *holds_found;
+ objset_t *mos;
+ int numholds;
+
+ if (!ds->ds_is_snapshot)
+ return (SET_ERROR(EINVAL));
+
+ if (nvlist_empty(holds))
+ return (0);
+
+ numholds = 0;
+ mos = ds->ds_dir->dd_pool->dp_meta_objset;
+ zapobj = dsl_dataset_phys(ds)->ds_userrefs_obj;
+ VERIFY0(nvlist_alloc(&holds_found, NV_UNIQUE_NAME, KM_SLEEP));
+
+ for (nvpair_t *pair = nvlist_next_nvpair(holds, NULL); pair != NULL;
+ pair = nvlist_next_nvpair(holds, pair)) {
+ uint64_t tmp;
+ int error;
+ const char *holdname = nvpair_name(pair);
+
+ if (zapobj != 0)
+ error = zap_lookup(mos, zapobj, holdname, 8, 1, &tmp);
+ else
+ error = SET_ERROR(ENOENT);
+
+ /*
+ * Non-existent holds are put on the errlist, but don't
+ * cause an overall failure.
+ */
+ if (error == ENOENT) {
+ if (ddura->ddura_errlist != NULL) {
+ char *errtag = kmem_asprintf("%s#%s",
+ snapname, holdname);
+ fnvlist_add_int32(ddura->ddura_errlist, errtag,
+ ENOENT);
+ kmem_strfree(errtag);
+ }
+ continue;
+ }
+
+ if (error != 0) {
+ fnvlist_free(holds_found);
+ return (error);
+ }
+
+ fnvlist_add_boolean(holds_found, holdname);
+ numholds++;
+ }
+
+ if (DS_IS_DEFER_DESTROY(ds) &&
+ dsl_dataset_phys(ds)->ds_num_children == 1 &&
+ ds->ds_userrefs == numholds) {
+ /* we need to destroy the snapshot as well */
+ if (dsl_dataset_long_held(ds)) {
+ fnvlist_free(holds_found);
+ return (SET_ERROR(EBUSY));
+ }
+ fnvlist_add_boolean(ddura->ddura_todelete, snapname);
+ }
+
+ if (numholds != 0) {
+ fnvlist_add_nvlist(ddura->ddura_chkholds, snapname,
+ holds_found);
+ }
+ fnvlist_free(holds_found);
+
+ return (0);
+}
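+
+/*
+ * Illustrative note (not part of the upstream source): the
+ * defer-destroy check above means that if a snapshot was destroyed
+ * with "defer" semantics, has no clones (ds_num_children == 1), and
+ * every remaining user reference is being released by this request,
+ * the snapshot itself is queued on ddura_todelete and destroyed in
+ * the sync phase below.
+ */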
+
+static int
+dsl_dataset_user_release_check(void *arg, dmu_tx_t *tx)
+{
+ dsl_dataset_user_release_arg_t *ddura;
+ dsl_holdfunc_t *holdfunc;
+ dsl_pool_t *dp;
+
+ if (!dmu_tx_is_syncing(tx))
+ return (0);
+
+ dp = dmu_tx_pool(tx);
+
+ ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock));
+
+ ddura = arg;
+ holdfunc = ddura->ddura_holdfunc;
+
+ for (nvpair_t *pair = nvlist_next_nvpair(ddura->ddura_holds, NULL);
+ pair != NULL; pair = nvlist_next_nvpair(ddura->ddura_holds, pair)) {
+ int error;
+ dsl_dataset_t *ds;
+ nvlist_t *holds;
+ const char *snapname = nvpair_name(pair);
+
+ error = nvpair_value_nvlist(pair, &holds);
+ if (error != 0)
+ error = (SET_ERROR(EINVAL));
+ else
+ error = holdfunc(dp, snapname, FTAG, &ds);
+ if (error == 0) {
+ error = dsl_dataset_user_release_check_one(ddura, ds,
+ holds, snapname);
+ dsl_dataset_rele(ds, FTAG);
+ }
+ if (error != 0) {
+ if (ddura->ddura_errlist != NULL) {
+ fnvlist_add_int32(ddura->ddura_errlist,
+ snapname, error);
+ }
+ /*
+ * Non-existent snapshots are put on the errlist,
+ * but don't cause an overall failure.
+ */
+ if (error != ENOENT)
+ return (error);
+ }
+ }
+
+ return (0);
+}
+
+static void
+dsl_dataset_user_release_sync_one(dsl_dataset_t *ds, nvlist_t *holds,
+ dmu_tx_t *tx)
+{
+ dsl_pool_t *dp = ds->ds_dir->dd_pool;
+ objset_t *mos = dp->dp_meta_objset;
+
+ for (nvpair_t *pair = nvlist_next_nvpair(holds, NULL); pair != NULL;
+ pair = nvlist_next_nvpair(holds, pair)) {
+ int error;
+ const char *holdname = nvpair_name(pair);
+
+ /* Remove temporary hold if one exists. */
+ error = dsl_pool_user_release(dp, ds->ds_object, holdname, tx);
+ VERIFY(error == 0 || error == ENOENT);
+
+ VERIFY0(zap_remove(mos, dsl_dataset_phys(ds)->ds_userrefs_obj,
+ holdname, tx));
+ ds->ds_userrefs--;
+
+ spa_history_log_internal_ds(ds, "release", tx,
+ "tag=%s refs=%lld", holdname, (longlong_t)ds->ds_userrefs);
+ }
+}
+
+static void
+dsl_dataset_user_release_sync(void *arg, dmu_tx_t *tx)
+{
+ dsl_dataset_user_release_arg_t *ddura = arg;
+ dsl_holdfunc_t *holdfunc = ddura->ddura_holdfunc;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+
+ ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock));
+
+ for (nvpair_t *pair = nvlist_next_nvpair(ddura->ddura_chkholds, NULL);
+ pair != NULL; pair = nvlist_next_nvpair(ddura->ddura_chkholds,
+ pair)) {
+ dsl_dataset_t *ds;
+ const char *name = nvpair_name(pair);
+
+ VERIFY0(holdfunc(dp, name, FTAG, &ds));
+
+ dsl_dataset_user_release_sync_one(ds,
+ fnvpair_value_nvlist(pair), tx);
+ if (nvlist_exists(ddura->ddura_todelete, name)) {
+ ASSERT(ds->ds_userrefs == 0 &&
+ dsl_dataset_phys(ds)->ds_num_children == 1 &&
+ DS_IS_DEFER_DESTROY(ds));
+ dsl_destroy_snapshot_sync_impl(ds, B_FALSE, tx);
+ }
+ dsl_dataset_rele(ds, FTAG);
+ }
+}
+
+/*
+ * The full semantics of this function are described in the comment above
+ * lzc_release().
+ *
+ * To summarize:
+ * Releases holds specified in the nvl holds.
+ *
+ * holds is nvl of snapname -> { holdname, ... }
+ * errlist will be filled in with snapname -> error
+ *
+ * If tmpdp is not NULL the names for holds should be the dsobj's of snapshots,
+ * otherwise they should be the names of snapshots.
+ *
+ * As a release may cause snapshots to be destroyed, this tries to ensure they
+ * aren't mounted.
+ *
+ * The release of non-existent holds is skipped.
+ *
+ * At least one hold must have been released for this function to succeed
+ * and return 0.
+ */
+static int
+dsl_dataset_user_release_impl(nvlist_t *holds, nvlist_t *errlist,
+ dsl_pool_t *tmpdp)
+{
+ dsl_dataset_user_release_arg_t ddura;
+ nvpair_t *pair;
+ char *pool;
+ int error;
+
+ pair = nvlist_next_nvpair(holds, NULL);
+ if (pair == NULL)
+ return (0);
+
+ /*
+ * The release may cause snapshots to be destroyed; make sure they
+ * are not mounted.
+ */
+ if (tmpdp != NULL) {
+ /* Temporary holds are specified by dsobj string. */
+ ddura.ddura_holdfunc = dsl_dataset_hold_obj_string;
+ pool = spa_name(tmpdp->dp_spa);
+#ifdef _KERNEL
+ for (pair = nvlist_next_nvpair(holds, NULL); pair != NULL;
+ pair = nvlist_next_nvpair(holds, pair)) {
+ dsl_dataset_t *ds;
+
+ dsl_pool_config_enter(tmpdp, FTAG);
+ error = dsl_dataset_hold_obj_string(tmpdp,
+ nvpair_name(pair), FTAG, &ds);
+ if (error == 0) {
+ char name[ZFS_MAX_DATASET_NAME_LEN];
+ dsl_dataset_name(ds, name);
+ dsl_pool_config_exit(tmpdp, FTAG);
+ dsl_dataset_rele(ds, FTAG);
+ (void) zfs_unmount_snap(name);
+ } else {
+ dsl_pool_config_exit(tmpdp, FTAG);
+ }
+ }
+#endif
+ } else {
+ /* Non-temporary holds are specified by name. */
+ ddura.ddura_holdfunc = dsl_dataset_hold;
+ pool = nvpair_name(pair);
+#ifdef _KERNEL
+ for (pair = nvlist_next_nvpair(holds, NULL); pair != NULL;
+ pair = nvlist_next_nvpair(holds, pair)) {
+ (void) zfs_unmount_snap(nvpair_name(pair));
+ }
+#endif
+ }
+
+ ddura.ddura_holds = holds;
+ ddura.ddura_errlist = errlist;
+ VERIFY0(nvlist_alloc(&ddura.ddura_todelete, NV_UNIQUE_NAME,
+ KM_SLEEP));
+ VERIFY0(nvlist_alloc(&ddura.ddura_chkholds, NV_UNIQUE_NAME,
+ KM_SLEEP));
+
+ error = dsl_sync_task(pool, dsl_dataset_user_release_check,
+ dsl_dataset_user_release_sync, &ddura, 0,
+ ZFS_SPACE_CHECK_EXTRA_RESERVED);
+ fnvlist_free(ddura.ddura_todelete);
+ fnvlist_free(ddura.ddura_chkholds);
+
+ return (error);
+}
+
+/*
+ * holds is nvl of snapname -> { holdname, ... }
+ * errlist will be filled in with snapname -> error
+ */
+int
+dsl_dataset_user_release(nvlist_t *holds, nvlist_t *errlist)
+{
+ return (dsl_dataset_user_release_impl(holds, errlist, NULL));
+}
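+
+/*
+ * Illustrative sketch (not part of the upstream source): releasing a
+ * named hold using the nvlist shape described above
+ * (snapname -> { holdname, ... }). The names are hypothetical.
+ *
+ *	nvlist_t *holds = fnvlist_alloc();
+ *	nvlist_t *tags = fnvlist_alloc();
+ *	nvlist_t *errlist = fnvlist_alloc();
+ *
+ *	fnvlist_add_boolean(tags, "send-2021");
+ *	fnvlist_add_nvlist(holds, "tank/fs@backup", tags);
+ *	error = dsl_dataset_user_release(holds, errlist);
+ *	fnvlist_free(errlist);
+ *	fnvlist_free(tags);
+ *	fnvlist_free(holds);
+ */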
+
+/*
+ * holds is nvl of snapdsobj -> { holdname, ... }
+ */
+void
+dsl_dataset_user_release_tmp(struct dsl_pool *dp, nvlist_t *holds)
+{
+ ASSERT(dp != NULL);
+ (void) dsl_dataset_user_release_impl(holds, NULL, dp);
+}
+
+int
+dsl_dataset_get_holds(const char *dsname, nvlist_t *nvl)
+{
+ dsl_pool_t *dp;
+ dsl_dataset_t *ds;
+ int err;
+
+ err = dsl_pool_hold(dsname, FTAG, &dp);
+ if (err != 0)
+ return (err);
+ err = dsl_dataset_hold(dp, dsname, FTAG, &ds);
+ if (err != 0) {
+ dsl_pool_rele(dp, FTAG);
+ return (err);
+ }
+
+ if (dsl_dataset_phys(ds)->ds_userrefs_obj != 0) {
+ zap_attribute_t *za;
+ zap_cursor_t zc;
+
+ za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
+ for (zap_cursor_init(&zc, ds->ds_dir->dd_pool->dp_meta_objset,
+ dsl_dataset_phys(ds)->ds_userrefs_obj);
+ zap_cursor_retrieve(&zc, za) == 0;
+ zap_cursor_advance(&zc)) {
+ fnvlist_add_uint64(nvl, za->za_name,
+ za->za_first_integer);
+ }
+ zap_cursor_fini(&zc);
+ kmem_free(za, sizeof (zap_attribute_t));
+ }
+ dsl_dataset_rele(ds, FTAG);
+ dsl_pool_rele(dp, FTAG);
+ return (0);
+}
diff --git a/sys/contrib/openzfs/module/zfs/edonr_zfs.c b/sys/contrib/openzfs/module/zfs/edonr_zfs.c
new file mode 100644
index 000000000000..aa00e1c9417e
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/edonr_zfs.c
@@ -0,0 +1,115 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://opensource.org/licenses/CDDL-1.0.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2013 Saso Kiselkov. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * Copyright (c) 2016 by Delphix. All rights reserved.
+ */
+#include <sys/zfs_context.h>
+#include <sys/zio.h>
+#include <sys/zio_checksum.h>
+#include <sys/edonr.h>
+#include <sys/abd.h>
+
+#define EDONR_MODE 512
+#define EDONR_BLOCK_SIZE EdonR512_BLOCK_SIZE
+
+static int
+edonr_incremental(void *buf, size_t size, void *arg)
+{
+ EdonRState *ctx = arg;
+ EdonRUpdate(ctx, buf, size * 8);
+ return (0);
+}
+
+/*
+ * Native zio_checksum interface for the Edon-R hash function.
+ */
+/*ARGSUSED*/
+void
+abd_checksum_edonr_native(abd_t *abd, uint64_t size,
+ const void *ctx_template, zio_cksum_t *zcp)
+{
+ uint8_t digest[EDONR_MODE / 8];
+ EdonRState ctx;
+
+ ASSERT(ctx_template != NULL);
+ bcopy(ctx_template, &ctx, sizeof (ctx));
+ (void) abd_iterate_func(abd, 0, size, edonr_incremental, &ctx);
+ EdonRFinal(&ctx, digest);
+ bcopy(digest, zcp->zc_word, sizeof (zcp->zc_word));
+}
+
+/*
+ * Byteswapped zio_checksum interface for the Edon-R hash function.
+ */
+void
+abd_checksum_edonr_byteswap(abd_t *abd, uint64_t size,
+ const void *ctx_template, zio_cksum_t *zcp)
+{
+ zio_cksum_t tmp;
+
+ abd_checksum_edonr_native(abd, size, ctx_template, &tmp);
+ zcp->zc_word[0] = BSWAP_64(zcp->zc_word[0]);
+ zcp->zc_word[1] = BSWAP_64(zcp->zc_word[1]);
+ zcp->zc_word[2] = BSWAP_64(zcp->zc_word[2]);
+ zcp->zc_word[3] = BSWAP_64(zcp->zc_word[3]);
+}
+
+void *
+abd_checksum_edonr_tmpl_init(const zio_cksum_salt_t *salt)
+{
+ EdonRState *ctx;
+ uint8_t salt_block[EDONR_BLOCK_SIZE];
+
+ /*
+ * Edon-R needs all but the last hash invocation to be on full-size
+ * blocks, but the salt is too small. Rather than simply padding it
+ * with zeros, we expand the salt into a new salt block of proper
+ * size by double-hashing it (the new salt block will be composed of
+ * H(salt) || H(H(salt))).
+ */
+ CTASSERT(EDONR_BLOCK_SIZE == 2 * (EDONR_MODE / 8));
+ EdonRHash(EDONR_MODE, salt->zcs_bytes, sizeof (salt->zcs_bytes) * 8,
+ salt_block);
+ EdonRHash(EDONR_MODE, salt_block, EDONR_MODE, salt_block +
+ EDONR_MODE / 8);
+
+ /*
+ * Feed the new salt block into the hash function - this will serve
+ * as our MAC key.
+ */
+ ctx = kmem_zalloc(sizeof (*ctx), KM_SLEEP);
+ EdonRInit(ctx, EDONR_MODE);
+ EdonRUpdate(ctx, salt_block, sizeof (salt_block) * 8);
+ return (ctx);
+}
+
+void
+abd_checksum_edonr_tmpl_free(void *ctx_template)
+{
+ EdonRState *ctx = ctx_template;
+
+ bzero(ctx, sizeof (*ctx));
+ kmem_free(ctx, sizeof (*ctx));
+}
diff --git a/sys/contrib/openzfs/module/zfs/fm.c b/sys/contrib/openzfs/module/zfs/fm.c
new file mode 100644
index 000000000000..a5003f85d621
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/fm.c
@@ -0,0 +1,1686 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+/*
+ * Fault Management Architecture (FMA) Resource and Protocol Support
+ *
+ * The routines contained herein provide services to support kernel subsystems
+ * in publishing fault management telemetry (see PSARC 2002/412 and 2003/089).
+ *
+ * Name-Value Pair Lists
+ *
+ * The embodiment of an FMA protocol element (event, fmri or authority) is a
+ * name-value pair list (nvlist_t). FMA-specific nvlist constructor and
+ * destructor functions, fm_nvlist_create() and fm_nvlist_destroy(), are used
+ * to create an nvpair list using custom allocators. Callers may choose to
+ * allocate either from the kernel memory allocator, or from a preallocated
+ * buffer, useful in constrained contexts like high-level interrupt routines.
+ *
+ * Protocol Event and FMRI Construction
+ *
+ * Convenience routines are provided to construct nvlist events according to
+ * the FMA Event Protocol and Naming Schema specification for ereports and
+ * FMRIs for the dev, cpu, hc, mem, legacy hc and de schemes.
+ *
+ * ENA Manipulation
+ *
+ * Routines to generate ENA formats 0, 1 and 2 are available as well as
+ * routines to increment formats 1 and 2. Individual fields within the
+ * ENA are extractable via fm_ena_time_get(), fm_ena_id_get(),
+ * fm_ena_format_get() and fm_ena_gen_get().
+ */
+
+#include <sys/types.h>
+#include <sys/time.h>
+#include <sys/list.h>
+#include <sys/nvpair.h>
+#include <sys/cmn_err.h>
+#include <sys/sysmacros.h>
+#include <sys/sunddi.h>
+#include <sys/systeminfo.h>
+#include <sys/fm/util.h>
+#include <sys/fm/protocol.h>
+#include <sys/kstat.h>
+#include <sys/zfs_context.h>
+#ifdef _KERNEL
+#include <sys/atomic.h>
+#include <sys/condvar.h>
+#include <sys/console.h>
+#include <sys/time.h>
+#include <sys/zfs_ioctl.h>
+
+int zfs_zevent_len_max = 0;
+int zfs_zevent_cols = 80;
+int zfs_zevent_console = 0;
+
+static int zevent_len_cur = 0;
+static int zevent_waiters = 0;
+static int zevent_flags = 0;
+
+/* Num events rate limited since the last time zfs_zevent_next() was called */
+static uint64_t ratelimit_dropped = 0;
+
+/*
+ * The EID (Event IDentifier) is used to uniquely tag a zevent when it is
+ * posted. The posted EIDs are monotonically increasing but not persistent.
+ * They will be reset to the initial value (1) each time the kernel module is
+ * loaded.
+ */
+static uint64_t zevent_eid = 0;
+
+static kmutex_t zevent_lock;
+static list_t zevent_list;
+static kcondvar_t zevent_cv;
+#endif /* _KERNEL */
+
+
+/*
+ * Common fault management kstats to record event generation failures
+ */
+
+struct erpt_kstat {
+ kstat_named_t erpt_dropped; /* num erpts dropped on post */
+ kstat_named_t erpt_set_failed; /* num erpt set failures */
+ kstat_named_t fmri_set_failed; /* num fmri set failures */
+ kstat_named_t payload_set_failed; /* num payload set failures */
+ kstat_named_t erpt_duplicates; /* num duplicate erpts */
+};
+
+static struct erpt_kstat erpt_kstat_data = {
+ { "erpt-dropped", KSTAT_DATA_UINT64 },
+ { "erpt-set-failed", KSTAT_DATA_UINT64 },
+ { "fmri-set-failed", KSTAT_DATA_UINT64 },
+ { "payload-set-failed", KSTAT_DATA_UINT64 },
+ { "erpt-duplicates", KSTAT_DATA_UINT64 }
+};
+
+kstat_t *fm_ksp;
+
+#ifdef _KERNEL
+
+/*
+ * Formatting utility function for fm_nvprintr. We attempt to wrap chunks of
+ * output so they aren't split across console lines, and return the end column.
+ */
+/*PRINTFLIKE4*/
+static int
+fm_printf(int depth, int c, int cols, const char *format, ...)
+{
+ va_list ap;
+ int width;
+ char c1;
+
+ va_start(ap, format);
+ width = vsnprintf(&c1, sizeof (c1), format, ap);
+ va_end(ap);
+
+ if (c + width >= cols) {
+ console_printf("\n");
+ c = 0;
+ if (format[0] != ' ' && depth > 0) {
+ console_printf(" ");
+ c++;
+ }
+ }
+
+ va_start(ap, format);
+ console_vprintf(format, ap);
+ va_end(ap);
+
+ return ((c + width) % cols);
+}
+
+/*
+ * Recursively print an nvlist in the specified column width and return the
+ * column we end up in. This function is called recursively by fm_nvprint(),
+ * below. We generically format the entire nvpair using hexadecimal
+ * integers and strings, and elide any integer arrays. Arrays are basically
+ * used for cache dumps right now, so we suppress them so as not to overwhelm
+ * the amount of console output we produce at panic time. This can be further
+ * enhanced as FMA technology grows based upon the needs of consumers. All
+ * FMA telemetry is logged using the dump device transport, so the console
+ * output serves only as a fallback in case this procedure is unsuccessful.
+ */
+static int
+fm_nvprintr(nvlist_t *nvl, int d, int c, int cols)
+{
+ nvpair_t *nvp;
+
+ for (nvp = nvlist_next_nvpair(nvl, NULL);
+ nvp != NULL; nvp = nvlist_next_nvpair(nvl, nvp)) {
+
+ data_type_t type = nvpair_type(nvp);
+ const char *name = nvpair_name(nvp);
+
+ boolean_t b;
+ uint8_t i8;
+ uint16_t i16;
+ uint32_t i32;
+ uint64_t i64;
+ char *str;
+ nvlist_t *cnv;
+
+ if (strcmp(name, FM_CLASS) == 0)
+ continue; /* already printed by caller */
+
+ c = fm_printf(d, c, cols, " %s=", name);
+
+ switch (type) {
+ case DATA_TYPE_BOOLEAN:
+ c = fm_printf(d + 1, c, cols, " 1");
+ break;
+
+ case DATA_TYPE_BOOLEAN_VALUE:
+ (void) nvpair_value_boolean_value(nvp, &b);
+ c = fm_printf(d + 1, c, cols, b ? "1" : "0");
+ break;
+
+ case DATA_TYPE_BYTE:
+ (void) nvpair_value_byte(nvp, &i8);
+ c = fm_printf(d + 1, c, cols, "0x%x", i8);
+ break;
+
+ case DATA_TYPE_INT8:
+ (void) nvpair_value_int8(nvp, (void *)&i8);
+ c = fm_printf(d + 1, c, cols, "0x%x", i8);
+ break;
+
+ case DATA_TYPE_UINT8:
+ (void) nvpair_value_uint8(nvp, &i8);
+ c = fm_printf(d + 1, c, cols, "0x%x", i8);
+ break;
+
+ case DATA_TYPE_INT16:
+ (void) nvpair_value_int16(nvp, (void *)&i16);
+ c = fm_printf(d + 1, c, cols, "0x%x", i16);
+ break;
+
+ case DATA_TYPE_UINT16:
+ (void) nvpair_value_uint16(nvp, &i16);
+ c = fm_printf(d + 1, c, cols, "0x%x", i16);
+ break;
+
+ case DATA_TYPE_INT32:
+ (void) nvpair_value_int32(nvp, (void *)&i32);
+ c = fm_printf(d + 1, c, cols, "0x%x", i32);
+ break;
+
+ case DATA_TYPE_UINT32:
+ (void) nvpair_value_uint32(nvp, &i32);
+ c = fm_printf(d + 1, c, cols, "0x%x", i32);
+ break;
+
+ case DATA_TYPE_INT64:
+ (void) nvpair_value_int64(nvp, (void *)&i64);
+ c = fm_printf(d + 1, c, cols, "0x%llx",
+ (u_longlong_t)i64);
+ break;
+
+ case DATA_TYPE_UINT64:
+ (void) nvpair_value_uint64(nvp, &i64);
+ c = fm_printf(d + 1, c, cols, "0x%llx",
+ (u_longlong_t)i64);
+ break;
+
+ case DATA_TYPE_HRTIME:
+ (void) nvpair_value_hrtime(nvp, (void *)&i64);
+ c = fm_printf(d + 1, c, cols, "0x%llx",
+ (u_longlong_t)i64);
+ break;
+
+ case DATA_TYPE_STRING:
+ (void) nvpair_value_string(nvp, &str);
+ c = fm_printf(d + 1, c, cols, "\"%s\"",
+ str ? str : "<NULL>");
+ break;
+
+ case DATA_TYPE_NVLIST:
+ c = fm_printf(d + 1, c, cols, "[");
+ (void) nvpair_value_nvlist(nvp, &cnv);
+ c = fm_nvprintr(cnv, d + 1, c, cols);
+ c = fm_printf(d + 1, c, cols, " ]");
+ break;
+
+ case DATA_TYPE_NVLIST_ARRAY: {
+ nvlist_t **val;
+ uint_t i, nelem;
+
+ c = fm_printf(d + 1, c, cols, "[");
+ (void) nvpair_value_nvlist_array(nvp, &val, &nelem);
+ for (i = 0; i < nelem; i++) {
+ c = fm_nvprintr(val[i], d + 1, c, cols);
+ }
+ c = fm_printf(d + 1, c, cols, " ]");
+ }
+ break;
+
+ case DATA_TYPE_INT8_ARRAY: {
+ int8_t *val;
+ uint_t i, nelem;
+
+ c = fm_printf(d + 1, c, cols, "[ ");
+ (void) nvpair_value_int8_array(nvp, &val, &nelem);
+ for (i = 0; i < nelem; i++)
+ c = fm_printf(d + 1, c, cols, "0x%llx ",
+ (u_longlong_t)val[i]);
+
+ c = fm_printf(d + 1, c, cols, "]");
+ break;
+ }
+
+ case DATA_TYPE_UINT8_ARRAY: {
+ uint8_t *val;
+ uint_t i, nelem;
+
+ c = fm_printf(d + 1, c, cols, "[ ");
+ (void) nvpair_value_uint8_array(nvp, &val, &nelem);
+ for (i = 0; i < nelem; i++)
+ c = fm_printf(d + 1, c, cols, "0x%llx ",
+ (u_longlong_t)val[i]);
+
+ c = fm_printf(d + 1, c, cols, "]");
+ break;
+ }
+
+ case DATA_TYPE_INT16_ARRAY: {
+ int16_t *val;
+ uint_t i, nelem;
+
+ c = fm_printf(d + 1, c, cols, "[ ");
+ (void) nvpair_value_int16_array(nvp, &val, &nelem);
+ for (i = 0; i < nelem; i++)
+ c = fm_printf(d + 1, c, cols, "0x%llx ",
+ (u_longlong_t)val[i]);
+
+ c = fm_printf(d + 1, c, cols, "]");
+ break;
+ }
+
+ case DATA_TYPE_UINT16_ARRAY: {
+ uint16_t *val;
+ uint_t i, nelem;
+
+ c = fm_printf(d + 1, c, cols, "[ ");
+ (void) nvpair_value_uint16_array(nvp, &val, &nelem);
+ for (i = 0; i < nelem; i++)
+ c = fm_printf(d + 1, c, cols, "0x%llx ",
+ (u_longlong_t)val[i]);
+
+ c = fm_printf(d + 1, c, cols, "]");
+ break;
+ }
+
+ case DATA_TYPE_INT32_ARRAY: {
+ int32_t *val;
+ uint_t i, nelem;
+
+ c = fm_printf(d + 1, c, cols, "[ ");
+ (void) nvpair_value_int32_array(nvp, &val, &nelem);
+ for (i = 0; i < nelem; i++)
+ c = fm_printf(d + 1, c, cols, "0x%llx ",
+ (u_longlong_t)val[i]);
+
+ c = fm_printf(d + 1, c, cols, "]");
+ break;
+ }
+
+ case DATA_TYPE_UINT32_ARRAY: {
+ uint32_t *val;
+ uint_t i, nelem;
+
+ c = fm_printf(d + 1, c, cols, "[ ");
+ (void) nvpair_value_uint32_array(nvp, &val, &nelem);
+ for (i = 0; i < nelem; i++)
+ c = fm_printf(d + 1, c, cols, "0x%llx ",
+ (u_longlong_t)val[i]);
+
+ c = fm_printf(d + 1, c, cols, "]");
+ break;
+ }
+
+ case DATA_TYPE_INT64_ARRAY: {
+ int64_t *val;
+ uint_t i, nelem;
+
+ c = fm_printf(d + 1, c, cols, "[ ");
+ (void) nvpair_value_int64_array(nvp, &val, &nelem);
+ for (i = 0; i < nelem; i++)
+ c = fm_printf(d + 1, c, cols, "0x%llx ",
+ (u_longlong_t)val[i]);
+
+ c = fm_printf(d + 1, c, cols, "]");
+ break;
+ }
+
+ case DATA_TYPE_UINT64_ARRAY: {
+ uint64_t *val;
+ uint_t i, nelem;
+
+ c = fm_printf(d + 1, c, cols, "[ ");
+ (void) nvpair_value_uint64_array(nvp, &val, &nelem);
+ for (i = 0; i < nelem; i++)
+ c = fm_printf(d + 1, c, cols, "0x%llx ",
+ (u_longlong_t)val[i]);
+
+ c = fm_printf(d + 1, c, cols, "]");
+ break;
+ }
+
+ case DATA_TYPE_STRING_ARRAY:
+ case DATA_TYPE_BOOLEAN_ARRAY:
+ case DATA_TYPE_BYTE_ARRAY:
+ c = fm_printf(d + 1, c, cols, "[...]");
+ break;
+
+ case DATA_TYPE_UNKNOWN:
+ case DATA_TYPE_DONTCARE:
+ c = fm_printf(d + 1, c, cols, "<unknown>");
+ break;
+ }
+ }
+
+ return (c);
+}
+
+void
+fm_nvprint(nvlist_t *nvl)
+{
+ char *class;
+ int c = 0;
+
+ console_printf("\n");
+
+ if (nvlist_lookup_string(nvl, FM_CLASS, &class) == 0)
+ c = fm_printf(0, c, zfs_zevent_cols, "%s", class);
+
+ if (fm_nvprintr(nvl, 0, c, zfs_zevent_cols) != 0)
+ console_printf("\n");
+
+ console_printf("\n");
+}
+
+static zevent_t *
+zfs_zevent_alloc(void)
+{
+ zevent_t *ev;
+
+ ev = kmem_zalloc(sizeof (zevent_t), KM_SLEEP);
+
+ list_create(&ev->ev_ze_list, sizeof (zfs_zevent_t),
+ offsetof(zfs_zevent_t, ze_node));
+ list_link_init(&ev->ev_node);
+
+ return (ev);
+}
+
+static void
+zfs_zevent_free(zevent_t *ev)
+{
+ /* Run provided cleanup callback */
+ ev->ev_cb(ev->ev_nvl, ev->ev_detector);
+
+ list_destroy(&ev->ev_ze_list);
+ kmem_free(ev, sizeof (zevent_t));
+}
+
+static void
+zfs_zevent_drain(zevent_t *ev)
+{
+ zfs_zevent_t *ze;
+
+ ASSERT(MUTEX_HELD(&zevent_lock));
+ list_remove(&zevent_list, ev);
+
+ /* Remove references to this event in all private file data */
+ while ((ze = list_head(&ev->ev_ze_list)) != NULL) {
+ list_remove(&ev->ev_ze_list, ze);
+ ze->ze_zevent = NULL;
+ ze->ze_dropped++;
+ }
+
+ zfs_zevent_free(ev);
+}
+
+void
+zfs_zevent_drain_all(int *count)
+{
+ zevent_t *ev;
+
+ mutex_enter(&zevent_lock);
+ while ((ev = list_head(&zevent_list)) != NULL)
+ zfs_zevent_drain(ev);
+
+ *count = zevent_len_cur;
+ zevent_len_cur = 0;
+ mutex_exit(&zevent_lock);
+}
+
+/*
+ * New zevents are inserted at the head. If the maximum queue
+ * length is exceeded a zevent will be drained from the tail.
+ * As part of this any user space processes which currently have
+ * a reference to this zevent_t in their private data will have
+ * this reference set to NULL.
+ */
+static void
+zfs_zevent_insert(zevent_t *ev)
+{
+ ASSERT(MUTEX_HELD(&zevent_lock));
+ list_insert_head(&zevent_list, ev);
+
+ if (zevent_len_cur >= zfs_zevent_len_max)
+ zfs_zevent_drain(list_tail(&zevent_list));
+ else
+ zevent_len_cur++;
+}
+
+/*
+ * Post a zevent. The cb will be called when nvl and detector are no longer
+ * needed, i.e.:
+ * - An error happened and a zevent can't be posted. In this case, cb is called
+ * before zfs_zevent_post() returns.
+ * - The event is being drained and freed.
+ */
+int
+zfs_zevent_post(nvlist_t *nvl, nvlist_t *detector, zevent_cb_t *cb)
+{
+ inode_timespec_t tv;
+ int64_t tv_array[2];
+ uint64_t eid;
+ size_t nvl_size = 0;
+ zevent_t *ev;
+ int error;
+
+ ASSERT(cb != NULL);
+
+ gethrestime(&tv);
+ tv_array[0] = tv.tv_sec;
+ tv_array[1] = tv.tv_nsec;
+
+ error = nvlist_add_int64_array(nvl, FM_EREPORT_TIME, tv_array, 2);
+ if (error) {
+ atomic_inc_64(&erpt_kstat_data.erpt_set_failed.value.ui64);
+ goto out;
+ }
+
+ eid = atomic_inc_64_nv(&zevent_eid);
+ error = nvlist_add_uint64(nvl, FM_EREPORT_EID, eid);
+ if (error) {
+ atomic_inc_64(&erpt_kstat_data.erpt_set_failed.value.ui64);
+ goto out;
+ }
+
+ error = nvlist_size(nvl, &nvl_size, NV_ENCODE_NATIVE);
+ if (error) {
+ atomic_inc_64(&erpt_kstat_data.erpt_dropped.value.ui64);
+ goto out;
+ }
+
+ if (nvl_size > ERPT_DATA_SZ || nvl_size == 0) {
+ atomic_inc_64(&erpt_kstat_data.erpt_dropped.value.ui64);
+ error = EOVERFLOW;
+ goto out;
+ }
+
+ if (zfs_zevent_console)
+ fm_nvprint(nvl);
+
+ ev = zfs_zevent_alloc();
+ if (ev == NULL) {
+ atomic_inc_64(&erpt_kstat_data.erpt_dropped.value.ui64);
+ error = ENOMEM;
+ goto out;
+ }
+
+ ev->ev_nvl = nvl;
+ ev->ev_detector = detector;
+ ev->ev_cb = cb;
+ ev->ev_eid = eid;
+
+ mutex_enter(&zevent_lock);
+ zfs_zevent_insert(ev);
+ cv_broadcast(&zevent_cv);
+ mutex_exit(&zevent_lock);
+
+out:
+ if (error)
+ cb(nvl, detector);
+
+ return (error);
+}
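+
+/*
+ * Illustrative sketch (not part of the upstream source): the cleanup
+ * contract described above. The callback owns nvl and detector and is
+ * invoked exactly once, either on a failed post (before this function
+ * returns) or when the event is eventually drained. The name
+ * my_zevent_cb() is hypothetical.
+ *
+ *	static void
+ *	my_zevent_cb(nvlist_t *nvl, nvlist_t *detector)
+ *	{
+ *		fm_nvlist_destroy(nvl, FM_NVA_FREE);
+ *		if (detector != NULL)
+ *			fm_nvlist_destroy(detector, FM_NVA_FREE);
+ *	}
+ *
+ *	(void) zfs_zevent_post(nvl, detector, my_zevent_cb);
+ */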
+
+void
+zfs_zevent_track_duplicate(void)
+{
+ atomic_inc_64(&erpt_kstat_data.erpt_duplicates.value.ui64);
+}
+
+static int
+zfs_zevent_minor_to_state(minor_t minor, zfs_zevent_t **ze)
+{
+ *ze = zfsdev_get_state(minor, ZST_ZEVENT);
+ if (*ze == NULL)
+ return (SET_ERROR(EBADF));
+
+ return (0);
+}
+
+int
+zfs_zevent_fd_hold(int fd, minor_t *minorp, zfs_zevent_t **ze)
+{
+ int error;
+
+ error = zfsdev_getminor(fd, minorp);
+ if (error == 0)
+ error = zfs_zevent_minor_to_state(*minorp, ze);
+
+ if (error)
+ zfs_zevent_fd_rele(fd);
+
+ return (error);
+}
+
+void
+zfs_zevent_fd_rele(int fd)
+{
+ zfs_file_put(fd);
+}
+
+/*
+ * Get the next zevent in the stream and place a copy in 'event'. This
+ * may fail with ENOMEM if the encoded nvlist size exceeds the passed
+ * 'event_size'. In this case the stream pointer is not advanced and
+ * 'event_size' is set to the minimum required buffer size.
+ */
+int
+zfs_zevent_next(zfs_zevent_t *ze, nvlist_t **event, uint64_t *event_size,
+ uint64_t *dropped)
+{
+ zevent_t *ev;
+ size_t size;
+ int error = 0;
+
+ mutex_enter(&zevent_lock);
+ if (ze->ze_zevent == NULL) {
+ /* A new stream starts at the beginning/tail */
+ ev = list_tail(&zevent_list);
+ if (ev == NULL) {
+ error = ENOENT;
+ goto out;
+ }
+ } else {
+ /*
+ * An existing stream continues with the next element; remove
+ * ourselves from the wait queue of the previous element.
+ */
+ ev = list_prev(&zevent_list, ze->ze_zevent);
+ if (ev == NULL) {
+ error = ENOENT;
+ goto out;
+ }
+ }
+
+ VERIFY(nvlist_size(ev->ev_nvl, &size, NV_ENCODE_NATIVE) == 0);
+ if (size > *event_size) {
+ *event_size = size;
+ error = ENOMEM;
+ goto out;
+ }
+
+ if (ze->ze_zevent)
+ list_remove(&ze->ze_zevent->ev_ze_list, ze);
+
+ ze->ze_zevent = ev;
+ list_insert_head(&ev->ev_ze_list, ze);
+ (void) nvlist_dup(ev->ev_nvl, event, KM_SLEEP);
+ *dropped = ze->ze_dropped;
+
+#ifdef _KERNEL
+ /* Include events dropped due to rate limiting */
+ *dropped += ratelimit_dropped;
+ ratelimit_dropped = 0;
+#endif
+ ze->ze_dropped = 0;
+out:
+ mutex_exit(&zevent_lock);
+
+ return (error);
+}
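+
+/*
+ * Illustrative sketch (not part of the upstream source): consuming the
+ * stream with the ENOMEM contract described above. When the copy does
+ * not fit, 'size' is updated to the required minimum and the same
+ * event can simply be requested again. The initial size is arbitrary.
+ *
+ *	uint64_t size = 1024, dropped;
+ *	nvlist_t *event;
+ *	int err;
+ *
+ *	do {
+ *		err = zfs_zevent_next(ze, &event, &size, &dropped);
+ *	} while (err == ENOMEM);
+ *	if (err == 0) {
+ *		// consume the event here
+ *		nvlist_free(event);
+ *	}
+ */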
+
+/*
+ * Wait in an interruptible state for any new events.
+ */
+int
+zfs_zevent_wait(zfs_zevent_t *ze)
+{
+ int error = EAGAIN;
+
+ mutex_enter(&zevent_lock);
+ zevent_waiters++;
+
+ while (error == EAGAIN) {
+ if (zevent_flags & ZEVENT_SHUTDOWN) {
+ error = SET_ERROR(ESHUTDOWN);
+ break;
+ }
+
+ error = cv_wait_sig(&zevent_cv, &zevent_lock);
+ if (signal_pending(current)) {
+ error = SET_ERROR(EINTR);
+ break;
+ } else if (!list_is_empty(&zevent_list)) {
+ error = 0;
+ continue;
+ } else {
+ error = EAGAIN;
+ }
+ }
+
+ zevent_waiters--;
+ mutex_exit(&zevent_lock);
+
+ return (error);
+}
+
+/*
+ * The caller may seek to a specific EID by passing that EID. If the EID
+ * is still available in the posted list of events the cursor is positioned
+ * there. Otherwise ENOENT is returned and the cursor is not moved.
+ *
+ * There are two reserved EIDs which may be passed and will never fail.
+ * ZEVENT_SEEK_START positions the cursor at the start of the list, and
+ * ZEVENT_SEEK_END positions the cursor at the end of the list.
+ */
+int
+zfs_zevent_seek(zfs_zevent_t *ze, uint64_t eid)
+{
+ zevent_t *ev;
+ int error = 0;
+
+ mutex_enter(&zevent_lock);
+
+ if (eid == ZEVENT_SEEK_START) {
+ if (ze->ze_zevent)
+ list_remove(&ze->ze_zevent->ev_ze_list, ze);
+
+ ze->ze_zevent = NULL;
+ goto out;
+ }
+
+ if (eid == ZEVENT_SEEK_END) {
+ if (ze->ze_zevent)
+ list_remove(&ze->ze_zevent->ev_ze_list, ze);
+
+ ev = list_head(&zevent_list);
+ if (ev) {
+ ze->ze_zevent = ev;
+ list_insert_head(&ev->ev_ze_list, ze);
+ } else {
+ ze->ze_zevent = NULL;
+ }
+
+ goto out;
+ }
+
+ for (ev = list_tail(&zevent_list); ev != NULL;
+ ev = list_prev(&zevent_list, ev)) {
+ if (ev->ev_eid == eid) {
+ if (ze->ze_zevent)
+ list_remove(&ze->ze_zevent->ev_ze_list, ze);
+
+ ze->ze_zevent = ev;
+ list_insert_head(&ev->ev_ze_list, ze);
+ break;
+ }
+ }
+
+ if (ev == NULL)
+ error = ENOENT;
+
+out:
+ mutex_exit(&zevent_lock);
+
+ return (error);
+}
+
+void
+zfs_zevent_init(zfs_zevent_t **zep)
+{
+ zfs_zevent_t *ze;
+
+ ze = *zep = kmem_zalloc(sizeof (zfs_zevent_t), KM_SLEEP);
+ list_link_init(&ze->ze_node);
+}
+
+void
+zfs_zevent_destroy(zfs_zevent_t *ze)
+{
+ mutex_enter(&zevent_lock);
+ if (ze->ze_zevent)
+ list_remove(&ze->ze_zevent->ev_ze_list, ze);
+ mutex_exit(&zevent_lock);
+
+ kmem_free(ze, sizeof (zfs_zevent_t));
+}
+#endif /* _KERNEL */
+
+/*
+ * Wrappers for FM nvlist allocators
+ */
+/* ARGSUSED */
+static void *
+i_fm_alloc(nv_alloc_t *nva, size_t size)
+{
+ return (kmem_zalloc(size, KM_SLEEP));
+}
+
+/* ARGSUSED */
+static void
+i_fm_free(nv_alloc_t *nva, void *buf, size_t size)
+{
+ kmem_free(buf, size);
+}
+
+const nv_alloc_ops_t fm_mem_alloc_ops = {
+ .nv_ao_init = NULL,
+ .nv_ao_fini = NULL,
+ .nv_ao_alloc = i_fm_alloc,
+ .nv_ao_free = i_fm_free,
+ .nv_ao_reset = NULL
+};
+
+/*
+ * Create and initialize a new nv_alloc_t for a fixed buffer, buf. A pointer
+ * to the newly allocated nv_alloc_t structure is returned upon success or NULL
+ * is returned to indicate that the nv_alloc structure could not be created.
+ */
+nv_alloc_t *
+fm_nva_xcreate(char *buf, size_t bufsz)
+{
+ nv_alloc_t *nvhdl = kmem_zalloc(sizeof (nv_alloc_t), KM_SLEEP);
+
+ if (bufsz == 0 || nv_alloc_init(nvhdl, nv_fixed_ops, buf, bufsz) != 0) {
+ kmem_free(nvhdl, sizeof (nv_alloc_t));
+ return (NULL);
+ }
+
+ return (nvhdl);
+}
+
+/*
+ * Destroy a previously allocated nv_alloc structure. The fixed buffer
+ * associated with nva must be freed by the caller.
+ */
+void
+fm_nva_xdestroy(nv_alloc_t *nva)
+{
+ nv_alloc_fini(nva);
+ kmem_free(nva, sizeof (nv_alloc_t));
+}
+
+/*
+ * Create a new nv list. A pointer to a new nv list structure is returned
+ * upon success or NULL is returned to indicate that the structure could
+ * not be created. The newly created nv list is created and managed by the
+ * operations installed in nva. If nva is NULL, the default FMA nva
+ * operations are installed and used.
+ *
+ * When called from the kernel and nva == NULL, this function must be called
+ * from passive kernel context with no locks held that can prevent a
+ * sleeping memory allocation from occurring. Otherwise, this function may
+ * be called from other kernel contexts as long as a valid nva created via
+ * fm_nva_xcreate() is supplied.
+ */
+nvlist_t *
+fm_nvlist_create(nv_alloc_t *nva)
+{
+ int hdl_alloced = 0;
+ nvlist_t *nvl;
+ nv_alloc_t *nvhdl;
+
+ if (nva == NULL) {
+ nvhdl = kmem_zalloc(sizeof (nv_alloc_t), KM_SLEEP);
+
+ if (nv_alloc_init(nvhdl, &fm_mem_alloc_ops, NULL, 0) != 0) {
+ kmem_free(nvhdl, sizeof (nv_alloc_t));
+ return (NULL);
+ }
+ hdl_alloced = 1;
+ } else {
+ nvhdl = nva;
+ }
+
+ if (nvlist_xalloc(&nvl, NV_UNIQUE_NAME, nvhdl) != 0) {
+ if (hdl_alloced) {
+ nv_alloc_fini(nvhdl);
+ kmem_free(nvhdl, sizeof (nv_alloc_t));
+ }
+ return (NULL);
+ }
+
+ return (nvl);
+}
+
+/*
+ * Destroy a previously allocated nvlist structure. flag indicates whether
+ * or not the associated nva structure should be freed (FM_NVA_FREE) or
+ * retained (FM_NVA_RETAIN). Retaining the nv alloc structure allows
+ * it to be re-used for future nvlist creation operations.
+ */
+void
+fm_nvlist_destroy(nvlist_t *nvl, int flag)
+{
+ nv_alloc_t *nva = nvlist_lookup_nv_alloc(nvl);
+
+ nvlist_free(nvl);
+
+ if (nva != NULL) {
+ if (flag == FM_NVA_FREE)
+ fm_nva_xdestroy(nva);
+ }
+}
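A short sketch of the intended lifecycle, assuming a caller that owns a fixed buffer (the function and buffer names are hypothetical):

static void
fixed_buffer_nvlist_example(char *buf, size_t bufsz)
{
	nv_alloc_t *nva;
	nvlist_t *nvl;

	if ((nva = fm_nva_xcreate(buf, bufsz)) == NULL)
		return;

	if ((nvl = fm_nvlist_create(nva)) != NULL) {
		/* ... add members to nvl ... */

		/* FM_NVA_RETAIN keeps nva usable for further nvlists. */
		fm_nvlist_destroy(nvl, FM_NVA_RETAIN);
	}

	/* The caller still owns buf; only the nv_alloc_t itself is freed. */
	fm_nva_xdestroy(nva);
}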
+
+int
+i_fm_payload_set(nvlist_t *payload, const char *name, va_list ap)
+{
+ int nelem, ret = 0;
+ data_type_t type;
+
+ while (ret == 0 && name != NULL) {
+ type = va_arg(ap, data_type_t);
+ switch (type) {
+ case DATA_TYPE_BYTE:
+ ret = nvlist_add_byte(payload, name,
+ va_arg(ap, uint_t));
+ break;
+ case DATA_TYPE_BYTE_ARRAY:
+ nelem = va_arg(ap, int);
+ ret = nvlist_add_byte_array(payload, name,
+ va_arg(ap, uchar_t *), nelem);
+ break;
+ case DATA_TYPE_BOOLEAN_VALUE:
+ ret = nvlist_add_boolean_value(payload, name,
+ va_arg(ap, boolean_t));
+ break;
+ case DATA_TYPE_BOOLEAN_ARRAY:
+ nelem = va_arg(ap, int);
+ ret = nvlist_add_boolean_array(payload, name,
+ va_arg(ap, boolean_t *), nelem);
+ break;
+ case DATA_TYPE_INT8:
+ ret = nvlist_add_int8(payload, name,
+ va_arg(ap, int));
+ break;
+ case DATA_TYPE_INT8_ARRAY:
+ nelem = va_arg(ap, int);
+ ret = nvlist_add_int8_array(payload, name,
+ va_arg(ap, int8_t *), nelem);
+ break;
+ case DATA_TYPE_UINT8:
+ ret = nvlist_add_uint8(payload, name,
+ va_arg(ap, uint_t));
+ break;
+ case DATA_TYPE_UINT8_ARRAY:
+ nelem = va_arg(ap, int);
+ ret = nvlist_add_uint8_array(payload, name,
+ va_arg(ap, uint8_t *), nelem);
+ break;
+ case DATA_TYPE_INT16:
+ ret = nvlist_add_int16(payload, name,
+ va_arg(ap, int));
+ break;
+ case DATA_TYPE_INT16_ARRAY:
+ nelem = va_arg(ap, int);
+ ret = nvlist_add_int16_array(payload, name,
+ va_arg(ap, int16_t *), nelem);
+ break;
+ case DATA_TYPE_UINT16:
+ ret = nvlist_add_uint16(payload, name,
+ va_arg(ap, uint_t));
+ break;
+ case DATA_TYPE_UINT16_ARRAY:
+ nelem = va_arg(ap, int);
+ ret = nvlist_add_uint16_array(payload, name,
+ va_arg(ap, uint16_t *), nelem);
+ break;
+ case DATA_TYPE_INT32:
+ ret = nvlist_add_int32(payload, name,
+ va_arg(ap, int32_t));
+ break;
+ case DATA_TYPE_INT32_ARRAY:
+ nelem = va_arg(ap, int);
+ ret = nvlist_add_int32_array(payload, name,
+ va_arg(ap, int32_t *), nelem);
+ break;
+ case DATA_TYPE_UINT32:
+ ret = nvlist_add_uint32(payload, name,
+ va_arg(ap, uint32_t));
+ break;
+ case DATA_TYPE_UINT32_ARRAY:
+ nelem = va_arg(ap, int);
+ ret = nvlist_add_uint32_array(payload, name,
+ va_arg(ap, uint32_t *), nelem);
+ break;
+ case DATA_TYPE_INT64:
+ ret = nvlist_add_int64(payload, name,
+ va_arg(ap, int64_t));
+ break;
+ case DATA_TYPE_INT64_ARRAY:
+ nelem = va_arg(ap, int);
+ ret = nvlist_add_int64_array(payload, name,
+ va_arg(ap, int64_t *), nelem);
+ break;
+ case DATA_TYPE_UINT64:
+ ret = nvlist_add_uint64(payload, name,
+ va_arg(ap, uint64_t));
+ break;
+ case DATA_TYPE_UINT64_ARRAY:
+ nelem = va_arg(ap, int);
+ ret = nvlist_add_uint64_array(payload, name,
+ va_arg(ap, uint64_t *), nelem);
+ break;
+ case DATA_TYPE_STRING:
+ ret = nvlist_add_string(payload, name,
+ va_arg(ap, char *));
+ break;
+ case DATA_TYPE_STRING_ARRAY:
+ nelem = va_arg(ap, int);
+ ret = nvlist_add_string_array(payload, name,
+ va_arg(ap, char **), nelem);
+ break;
+ case DATA_TYPE_NVLIST:
+ ret = nvlist_add_nvlist(payload, name,
+ va_arg(ap, nvlist_t *));
+ break;
+ case DATA_TYPE_NVLIST_ARRAY:
+ nelem = va_arg(ap, int);
+ ret = nvlist_add_nvlist_array(payload, name,
+ va_arg(ap, nvlist_t **), nelem);
+ break;
+ default:
+ ret = EINVAL;
+ }
+
+ name = va_arg(ap, char *);
+ }
+ return (ret);
+}
+
+void
+fm_payload_set(nvlist_t *payload, ...)
+{
+ int ret;
+ const char *name;
+ va_list ap;
+
+ va_start(ap, payload);
+ name = va_arg(ap, char *);
+ ret = i_fm_payload_set(payload, name, ap);
+ va_end(ap);
+
+ if (ret)
+ atomic_inc_64(&erpt_kstat_data.payload_set_failed.value.ui64);
+}
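A sketch of the varargs convention consumed above: each payload member is a (name, type, [nelem,] value) tuple and the list is terminated by a NULL name. It assumes an ereport nvlist built elsewhere (e.g. via fm_nvlist_create()); the member names and values here are hypothetical:

	uint64_t sizes[2] = { 4096, 8192 };

	fm_payload_set(ereport,
	    "vdev_path", DATA_TYPE_STRING, "/dev/da0",
	    "zio_size", DATA_TYPE_UINT64, (uint64_t)4096,
	    "io_sizes", DATA_TYPE_UINT64_ARRAY, 2, sizes,
	    NULL);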
+
+/*
+ * Set-up and validate the members of an ereport event according to:
+ *
+ * Member name Type Value
+ * ====================================================
+ * class string ereport
+ * version uint8_t 0
+ * ena uint64_t <ena>
+ * detector nvlist_t <detector>
+ * ereport-payload nvlist_t <var args>
+ *
+ * We don't actually add a 'version' member to the payload. Really,
+ * the version quoted to us by our caller is that of the category 1
+ * "ereport" event class (and we require FM_EREPORT_VERS0) but
+ * the payload version of the actual leaf class event under construction
+ * may be something else. Callers should supply a version in the varargs,
+ * or (better) we could take two version arguments - one for the
+ * ereport category 1 classification (expect FM_EREPORT_VERS0) and one
+ * for the leaf class.
+ */
+void
+fm_ereport_set(nvlist_t *ereport, int version, const char *erpt_class,
+ uint64_t ena, const nvlist_t *detector, ...)
+{
+ char ereport_class[FM_MAX_CLASS];
+ const char *name;
+ va_list ap;
+ int ret;
+
+ if (version != FM_EREPORT_VERS0) {
+ atomic_inc_64(&erpt_kstat_data.erpt_set_failed.value.ui64);
+ return;
+ }
+
+ (void) snprintf(ereport_class, FM_MAX_CLASS, "%s.%s",
+ FM_EREPORT_CLASS, erpt_class);
+ if (nvlist_add_string(ereport, FM_CLASS, ereport_class) != 0) {
+ atomic_inc_64(&erpt_kstat_data.erpt_set_failed.value.ui64);
+ return;
+ }
+
+ if (nvlist_add_uint64(ereport, FM_EREPORT_ENA, ena)) {
+ atomic_inc_64(&erpt_kstat_data.erpt_set_failed.value.ui64);
+ }
+
+ if (nvlist_add_nvlist(ereport, FM_EREPORT_DETECTOR,
+ (nvlist_t *)detector) != 0) {
+ atomic_inc_64(&erpt_kstat_data.erpt_set_failed.value.ui64);
+ }
+
+ va_start(ap, detector);
+ name = va_arg(ap, const char *);
+ ret = i_fm_payload_set(ereport, name, ap);
+ va_end(ap);
+
+ if (ret)
+ atomic_inc_64(&erpt_kstat_data.erpt_set_failed.value.ui64);
+}
+
+/*
+ * Set-up and validate the members of an hc fmri according to:
+ *
+ * Member name Type Value
+ * ===================================================
+ * version uint8_t 0
+ * auth nvlist_t <auth>
+ * hc-name string <name>
+ * hc-id string <id>
+ *
+ * Note that auth and hc-id are optional members.
+ */
+
+#define HC_MAXPAIRS 20
+#define HC_MAXNAMELEN 50
+
+static int
+fm_fmri_hc_set_common(nvlist_t *fmri, int version, const nvlist_t *auth)
+{
+ if (version != FM_HC_SCHEME_VERSION) {
+ atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
+ return (0);
+ }
+
+ if (nvlist_add_uint8(fmri, FM_VERSION, version) != 0 ||
+ nvlist_add_string(fmri, FM_FMRI_SCHEME, FM_FMRI_SCHEME_HC) != 0) {
+ atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
+ return (0);
+ }
+
+ if (auth != NULL && nvlist_add_nvlist(fmri, FM_FMRI_AUTHORITY,
+ (nvlist_t *)auth) != 0) {
+ atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
+ return (0);
+ }
+
+ return (1);
+}
+
+void
+fm_fmri_hc_set(nvlist_t *fmri, int version, const nvlist_t *auth,
+ nvlist_t *snvl, int npairs, ...)
+{
+ nv_alloc_t *nva = nvlist_lookup_nv_alloc(fmri);
+ nvlist_t *pairs[HC_MAXPAIRS];
+ va_list ap;
+ int i;
+
+ if (!fm_fmri_hc_set_common(fmri, version, auth))
+ return;
+
+ npairs = MIN(npairs, HC_MAXPAIRS);
+
+ va_start(ap, npairs);
+ for (i = 0; i < npairs; i++) {
+ const char *name = va_arg(ap, const char *);
+ uint32_t id = va_arg(ap, uint32_t);
+ char idstr[11];
+
+ (void) snprintf(idstr, sizeof (idstr), "%u", id);
+
+ pairs[i] = fm_nvlist_create(nva);
+ if (nvlist_add_string(pairs[i], FM_FMRI_HC_NAME, name) != 0 ||
+ nvlist_add_string(pairs[i], FM_FMRI_HC_ID, idstr) != 0) {
+ atomic_inc_64(
+ &erpt_kstat_data.fmri_set_failed.value.ui64);
+ }
+ }
+ va_end(ap);
+
+ if (nvlist_add_nvlist_array(fmri, FM_FMRI_HC_LIST, pairs, npairs) != 0)
+ atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
+
+ for (i = 0; i < npairs; i++)
+ fm_nvlist_destroy(pairs[i], FM_NVA_RETAIN);
+
+ if (snvl != NULL) {
+ if (nvlist_add_nvlist(fmri, FM_FMRI_HC_SPECIFIC, snvl) != 0) {
+ atomic_inc_64(
+ &erpt_kstat_data.fmri_set_failed.value.ui64);
+ }
+ }
+}
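A sketch of the variable arguments expected above: npairs (name, id) pairs, each becoming one element of the hc list. The component names are hypothetical:

	nvlist_t *fmri = fm_nvlist_create(NULL);

	/* Produces an hc list equivalent to /motherboard=0/cpu=1. */
	fm_fmri_hc_set(fmri, FM_HC_SCHEME_VERSION, NULL, NULL, 2,
	    "motherboard", (uint32_t)0,
	    "cpu", (uint32_t)1);

	fm_nvlist_destroy(fmri, FM_NVA_FREE);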
+
+void
+fm_fmri_hc_create(nvlist_t *fmri, int version, const nvlist_t *auth,
+ nvlist_t *snvl, nvlist_t *bboard, int npairs, ...)
+{
+ nv_alloc_t *nva = nvlist_lookup_nv_alloc(fmri);
+ nvlist_t *pairs[HC_MAXPAIRS];
+ nvlist_t **hcl;
+ uint_t n;
+ int i, j;
+ va_list ap;
+ char *hcname, *hcid;
+
+ if (!fm_fmri_hc_set_common(fmri, version, auth))
+ return;
+
+ /*
+ * copy the bboard nvpairs to the pairs array
+ */
+ if (nvlist_lookup_nvlist_array(bboard, FM_FMRI_HC_LIST, &hcl, &n)
+ != 0) {
+ atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
+ return;
+ }
+
+ for (i = 0; i < n; i++) {
+ if (nvlist_lookup_string(hcl[i], FM_FMRI_HC_NAME,
+ &hcname) != 0) {
+ atomic_inc_64(
+ &erpt_kstat_data.fmri_set_failed.value.ui64);
+ return;
+ }
+ if (nvlist_lookup_string(hcl[i], FM_FMRI_HC_ID, &hcid) != 0) {
+ atomic_inc_64(
+ &erpt_kstat_data.fmri_set_failed.value.ui64);
+ return;
+ }
+
+ pairs[i] = fm_nvlist_create(nva);
+ if (nvlist_add_string(pairs[i], FM_FMRI_HC_NAME, hcname) != 0 ||
+ nvlist_add_string(pairs[i], FM_FMRI_HC_ID, hcid) != 0) {
+ for (j = 0; j <= i; j++) {
+ if (pairs[j] != NULL)
+ fm_nvlist_destroy(pairs[j],
+ FM_NVA_RETAIN);
+ }
+ atomic_inc_64(
+ &erpt_kstat_data.fmri_set_failed.value.ui64);
+ return;
+ }
+ }
+
+ /*
+ * create the pairs from the passed-in pairs
+ */
+ npairs = MIN(npairs, HC_MAXPAIRS);
+
+ va_start(ap, npairs);
+ for (i = n; i < npairs + n; i++) {
+ const char *name = va_arg(ap, const char *);
+ uint32_t id = va_arg(ap, uint32_t);
+ char idstr[11];
+ (void) snprintf(idstr, sizeof (idstr), "%u", id);
+ pairs[i] = fm_nvlist_create(nva);
+ if (nvlist_add_string(pairs[i], FM_FMRI_HC_NAME, name) != 0 ||
+ nvlist_add_string(pairs[i], FM_FMRI_HC_ID, idstr) != 0) {
+ for (j = 0; j <= i; j++) {
+ if (pairs[j] != NULL)
+ fm_nvlist_destroy(pairs[j],
+ FM_NVA_RETAIN);
+ }
+ atomic_inc_64(
+ &erpt_kstat_data.fmri_set_failed.value.ui64);
+ return;
+ }
+ }
+ va_end(ap);
+
+ /*
+ * Create the fmri hc list
+ */
+ if (nvlist_add_nvlist_array(fmri, FM_FMRI_HC_LIST, pairs,
+ npairs + n) != 0) {
+ atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
+ return;
+ }
+
+ for (i = 0; i < npairs + n; i++) {
+ fm_nvlist_destroy(pairs[i], FM_NVA_RETAIN);
+ }
+
+ if (snvl != NULL) {
+ if (nvlist_add_nvlist(fmri, FM_FMRI_HC_SPECIFIC, snvl) != 0) {
+ atomic_inc_64(
+ &erpt_kstat_data.fmri_set_failed.value.ui64);
+ return;
+ }
+ }
+}
+
+/*
+ * Set-up and validate the members of a dev fmri according to:
+ *
+ * Member name Type Value
+ * ====================================================
+ * version uint8_t 0
+ * auth nvlist_t <auth>
+ * devpath string <devpath>
+ * [devid] string <devid>
+ * [target-port-l0id] string <target-port-lun0-id>
+ *
+ * Note that auth and devid are optional members.
+ */
+void
+fm_fmri_dev_set(nvlist_t *fmri_dev, int version, const nvlist_t *auth,
+ const char *devpath, const char *devid, const char *tpl0)
+{
+ int err = 0;
+
+ if (version != DEV_SCHEME_VERSION0) {
+ atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
+ return;
+ }
+
+ err |= nvlist_add_uint8(fmri_dev, FM_VERSION, version);
+ err |= nvlist_add_string(fmri_dev, FM_FMRI_SCHEME, FM_FMRI_SCHEME_DEV);
+
+ if (auth != NULL) {
+ err |= nvlist_add_nvlist(fmri_dev, FM_FMRI_AUTHORITY,
+ (nvlist_t *)auth);
+ }
+
+ err |= nvlist_add_string(fmri_dev, FM_FMRI_DEV_PATH, devpath);
+
+ if (devid != NULL)
+ err |= nvlist_add_string(fmri_dev, FM_FMRI_DEV_ID, devid);
+
+ if (tpl0 != NULL)
+ err |= nvlist_add_string(fmri_dev, FM_FMRI_DEV_TGTPTLUN0, tpl0);
+
+ if (err)
+ atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
+
+}
+
+/*
+ * Set-up and validate the members of a cpu fmri according to:
+ *
+ * Member name Type Value
+ * ====================================================
+ * version uint8_t 0
+ * auth nvlist_t <auth>
+ * cpuid uint32_t <cpu_id>
+ * cpumask uint8_t <cpu_mask>
+ * serial uint64_t <serial_id>
+ *
+ * Note that auth, cpumask, and serial are optional members.
+ *
+ */
+void
+fm_fmri_cpu_set(nvlist_t *fmri_cpu, int version, const nvlist_t *auth,
+ uint32_t cpu_id, uint8_t *cpu_maskp, const char *serial_idp)
+{
+ uint64_t *failedp = &erpt_kstat_data.fmri_set_failed.value.ui64;
+
+ if (version < CPU_SCHEME_VERSION1) {
+ atomic_inc_64(failedp);
+ return;
+ }
+
+ if (nvlist_add_uint8(fmri_cpu, FM_VERSION, version) != 0) {
+ atomic_inc_64(failedp);
+ return;
+ }
+
+ if (nvlist_add_string(fmri_cpu, FM_FMRI_SCHEME,
+ FM_FMRI_SCHEME_CPU) != 0) {
+ atomic_inc_64(failedp);
+ return;
+ }
+
+ if (auth != NULL && nvlist_add_nvlist(fmri_cpu, FM_FMRI_AUTHORITY,
+ (nvlist_t *)auth) != 0)
+ atomic_inc_64(failedp);
+
+ if (nvlist_add_uint32(fmri_cpu, FM_FMRI_CPU_ID, cpu_id) != 0)
+ atomic_inc_64(failedp);
+
+ if (cpu_maskp != NULL && nvlist_add_uint8(fmri_cpu, FM_FMRI_CPU_MASK,
+ *cpu_maskp) != 0)
+ atomic_inc_64(failedp);
+
+ if (serial_idp == NULL || nvlist_add_string(fmri_cpu,
+ FM_FMRI_CPU_SERIAL_ID, (char *)serial_idp) != 0)
+ atomic_inc_64(failedp);
+}
+
+/*
+ * Set-up and validate the members of a mem fmri according to:
+ *
+ * Member name Type Value
+ * ====================================================
+ * version uint8_t 0
+ * auth nvlist_t <auth> [optional]
+ * unum string <unum>
+ * serial string <serial> [optional*]
+ * offset uint64_t <offset> [optional]
+ *
+ * * serial is required if offset is present
+ */
+void
+fm_fmri_mem_set(nvlist_t *fmri, int version, const nvlist_t *auth,
+ const char *unum, const char *serial, uint64_t offset)
+{
+ if (version != MEM_SCHEME_VERSION0) {
+ atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
+ return;
+ }
+
+ if (!serial && (offset != (uint64_t)-1)) {
+ atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
+ return;
+ }
+
+ if (nvlist_add_uint8(fmri, FM_VERSION, version) != 0) {
+ atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
+ return;
+ }
+
+ if (nvlist_add_string(fmri, FM_FMRI_SCHEME, FM_FMRI_SCHEME_MEM) != 0) {
+ atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
+ return;
+ }
+
+ if (auth != NULL) {
+ if (nvlist_add_nvlist(fmri, FM_FMRI_AUTHORITY,
+ (nvlist_t *)auth) != 0) {
+ atomic_inc_64(
+ &erpt_kstat_data.fmri_set_failed.value.ui64);
+ }
+ }
+
+ if (nvlist_add_string(fmri, FM_FMRI_MEM_UNUM, unum) != 0) {
+ atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
+ }
+
+ if (serial != NULL) {
+ if (nvlist_add_string_array(fmri, FM_FMRI_MEM_SERIAL_ID,
+ (char **)&serial, 1) != 0) {
+ atomic_inc_64(
+ &erpt_kstat_data.fmri_set_failed.value.ui64);
+ }
+ if (offset != (uint64_t)-1 && nvlist_add_uint64(fmri,
+ FM_FMRI_MEM_OFFSET, offset) != 0) {
+ atomic_inc_64(
+ &erpt_kstat_data.fmri_set_failed.value.ui64);
+ }
+ }
+}
+
+void
+fm_fmri_zfs_set(nvlist_t *fmri, int version, uint64_t pool_guid,
+ uint64_t vdev_guid)
+{
+ if (version != ZFS_SCHEME_VERSION0) {
+ atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
+ return;
+ }
+
+ if (nvlist_add_uint8(fmri, FM_VERSION, version) != 0) {
+ atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
+ return;
+ }
+
+ if (nvlist_add_string(fmri, FM_FMRI_SCHEME, FM_FMRI_SCHEME_ZFS) != 0) {
+ atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
+ return;
+ }
+
+ if (nvlist_add_uint64(fmri, FM_FMRI_ZFS_POOL, pool_guid) != 0) {
+ atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
+ }
+
+ if (vdev_guid != 0) {
+ if (nvlist_add_uint64(fmri, FM_FMRI_ZFS_VDEV, vdev_guid) != 0) {
+ atomic_inc_64(
+ &erpt_kstat_data.fmri_set_failed.value.ui64);
+ }
+ }
+}
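A sketch tying the zfs-scheme detector into an ereport. The pool_guid and vdev_guid variables and the "fs.zfs.example" leaf class string are hypothetical; everything else uses the functions defined in this file:

	nvlist_t *ereport = fm_nvlist_create(NULL);
	nvlist_t *detector = fm_nvlist_create(NULL);
	uint64_t ena = fm_ena_generate(0, FM_ENA_FMT1);

	fm_fmri_zfs_set(detector, ZFS_SCHEME_VERSION0, pool_guid, vdev_guid);
	fm_ereport_set(ereport, FM_EREPORT_VERS0, "fs.zfs.example",
	    ena, detector, NULL);

	/* ... hand ereport off to the event queue ... */
	fm_nvlist_destroy(detector, FM_NVA_FREE);
	fm_nvlist_destroy(ereport, FM_NVA_FREE);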
+
+uint64_t
+fm_ena_increment(uint64_t ena)
+{
+ uint64_t new_ena;
+
+ switch (ENA_FORMAT(ena)) {
+ case FM_ENA_FMT1:
+ new_ena = ena + (1 << ENA_FMT1_GEN_SHFT);
+ break;
+ case FM_ENA_FMT2:
+ new_ena = ena + (1 << ENA_FMT2_GEN_SHFT);
+ break;
+ default:
+ new_ena = 0;
+ }
+
+ return (new_ena);
+}
+
+uint64_t
+fm_ena_generate_cpu(uint64_t timestamp, processorid_t cpuid, uchar_t format)
+{
+ uint64_t ena = 0;
+
+ switch (format) {
+ case FM_ENA_FMT1:
+ if (timestamp) {
+ ena = (uint64_t)((format & ENA_FORMAT_MASK) |
+ ((cpuid << ENA_FMT1_CPUID_SHFT) &
+ ENA_FMT1_CPUID_MASK) |
+ ((timestamp << ENA_FMT1_TIME_SHFT) &
+ ENA_FMT1_TIME_MASK));
+ } else {
+ ena = (uint64_t)((format & ENA_FORMAT_MASK) |
+ ((cpuid << ENA_FMT1_CPUID_SHFT) &
+ ENA_FMT1_CPUID_MASK) |
+ ((gethrtime() << ENA_FMT1_TIME_SHFT) &
+ ENA_FMT1_TIME_MASK));
+ }
+ break;
+ case FM_ENA_FMT2:
+ ena = (uint64_t)((format & ENA_FORMAT_MASK) |
+ ((timestamp << ENA_FMT2_TIME_SHFT) & ENA_FMT2_TIME_MASK));
+ break;
+ default:
+ break;
+ }
+
+ return (ena);
+}
+
+uint64_t
+fm_ena_generate(uint64_t timestamp, uchar_t format)
+{
+ uint64_t ena;
+
+ kpreempt_disable();
+ ena = fm_ena_generate_cpu(timestamp, getcpuid(), format);
+ kpreempt_enable();
+
+ return (ena);
+}
+
+uint64_t
+fm_ena_generation_get(uint64_t ena)
+{
+ uint64_t gen;
+
+ switch (ENA_FORMAT(ena)) {
+ case FM_ENA_FMT1:
+ gen = (ena & ENA_FMT1_GEN_MASK) >> ENA_FMT1_GEN_SHFT;
+ break;
+ case FM_ENA_FMT2:
+ gen = (ena & ENA_FMT2_GEN_MASK) >> ENA_FMT2_GEN_SHFT;
+ break;
+ default:
+ gen = 0;
+ break;
+ }
+
+ return (gen);
+}
+
+uchar_t
+fm_ena_format_get(uint64_t ena)
+{
+
+ return (ENA_FORMAT(ena));
+}
+
+uint64_t
+fm_ena_id_get(uint64_t ena)
+{
+ uint64_t id;
+
+ switch (ENA_FORMAT(ena)) {
+ case FM_ENA_FMT1:
+ id = (ena & ENA_FMT1_ID_MASK) >> ENA_FMT1_ID_SHFT;
+ break;
+ case FM_ENA_FMT2:
+ id = (ena & ENA_FMT2_ID_MASK) >> ENA_FMT2_ID_SHFT;
+ break;
+ default:
+ id = 0;
+ }
+
+ return (id);
+}
+
+uint64_t
+fm_ena_time_get(uint64_t ena)
+{
+ uint64_t time;
+
+ switch (ENA_FORMAT(ena)) {
+ case FM_ENA_FMT1:
+ time = (ena & ENA_FMT1_TIME_MASK) >> ENA_FMT1_TIME_SHFT;
+ break;
+ case FM_ENA_FMT2:
+ time = (ena & ENA_FMT2_TIME_MASK) >> ENA_FMT2_TIME_SHFT;
+ break;
+ default:
+ time = 0;
+ }
+
+ return (time);
+}
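A short sketch of generating an ENA and pulling its fields back apart; for FM_ENA_FMT1 a timestamp of 0 means "use gethrtime()". The variables are illustrative only:

	uint64_t ena, t, gen;
	uchar_t fmt;

	ena = fm_ena_generate(0, FM_ENA_FMT1);	/* 0 => use gethrtime() */
	fmt = fm_ena_format_get(ena);		/* FM_ENA_FMT1 */
	t = fm_ena_time_get(ena);		/* truncated timestamp bits */
	gen = fm_ena_generation_get(ena);	/* 0 for a freshly generated ENA */
	ena = fm_ena_increment(ena);		/* bumps the generation field */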
+
+#ifdef _KERNEL
+/*
+ * Helper function to increment ereport dropped count. Used by the event
+ * rate limiting code to give feedback to the user about how many events were
+ * rate limited by including them in the 'dropped' count.
+ */
+void
+fm_erpt_dropped_increment(void)
+{
+ atomic_inc_64(&ratelimit_dropped);
+}
+
+void
+fm_init(void)
+{
+ zevent_len_cur = 0;
+ zevent_flags = 0;
+
+ if (zfs_zevent_len_max == 0)
+ zfs_zevent_len_max = ERPT_MAX_ERRS * MAX(max_ncpus, 4);
+
+ /* Initialize zevent allocation and generation kstats */
+ fm_ksp = kstat_create("zfs", 0, "fm", "misc", KSTAT_TYPE_NAMED,
+ sizeof (struct erpt_kstat) / sizeof (kstat_named_t),
+ KSTAT_FLAG_VIRTUAL);
+
+ if (fm_ksp != NULL) {
+ fm_ksp->ks_data = &erpt_kstat_data;
+ kstat_install(fm_ksp);
+ } else {
+ cmn_err(CE_NOTE, "failed to create fm/misc kstat\n");
+ }
+
+ mutex_init(&zevent_lock, NULL, MUTEX_DEFAULT, NULL);
+ list_create(&zevent_list, sizeof (zevent_t),
+ offsetof(zevent_t, ev_node));
+ cv_init(&zevent_cv, NULL, CV_DEFAULT, NULL);
+
+ zfs_ereport_init();
+}
+
+void
+fm_fini(void)
+{
+ int count;
+
+ zfs_ereport_fini();
+
+ zfs_zevent_drain_all(&count);
+
+ mutex_enter(&zevent_lock);
+ cv_broadcast(&zevent_cv);
+
+ zevent_flags |= ZEVENT_SHUTDOWN;
+ while (zevent_waiters > 0) {
+ mutex_exit(&zevent_lock);
+ schedule();
+ mutex_enter(&zevent_lock);
+ }
+ mutex_exit(&zevent_lock);
+
+ cv_destroy(&zevent_cv);
+ list_destroy(&zevent_list);
+ mutex_destroy(&zevent_lock);
+
+ if (fm_ksp != NULL) {
+ kstat_delete(fm_ksp);
+ fm_ksp = NULL;
+ }
+}
+#endif /* _KERNEL */
+
+ZFS_MODULE_PARAM(zfs_zevent, zfs_zevent_, len_max, INT, ZMOD_RW,
+ "Max event queue length");
+
+ZFS_MODULE_PARAM(zfs_zevent, zfs_zevent_, cols, INT, ZMOD_RW,
+ "Max event column width");
+
+ZFS_MODULE_PARAM(zfs_zevent, zfs_zevent_, console, INT, ZMOD_RW,
+ "Log events to the console");
diff --git a/sys/contrib/openzfs/module/zfs/gzip.c b/sys/contrib/openzfs/module/zfs/gzip.c
new file mode 100644
index 000000000000..e2c6e59969d6
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/gzip.c
@@ -0,0 +1,106 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+
+
+#include <sys/debug.h>
+#include <sys/types.h>
+#include <sys/strings.h>
+#include <sys/qat.h>
+#include <sys/zio_compress.h>
+
+#ifdef _KERNEL
+
+#include <sys/zmod.h>
+typedef size_t zlen_t;
+#define compress_func z_compress_level
+#define uncompress_func z_uncompress
+
+#else /* _KERNEL */
+
+#include <zlib.h>
+typedef uLongf zlen_t;
+#define compress_func compress2
+#define uncompress_func uncompress
+
+#endif
+
+size_t
+gzip_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
+{
+ int ret;
+ zlen_t dstlen = d_len;
+
+ ASSERT(d_len <= s_len);
+
+ /* check if hardware accelerator can be used */
+ if (qat_dc_use_accel(s_len)) {
+ ret = qat_compress(QAT_COMPRESS, s_start, s_len, d_start,
+ d_len, &dstlen);
+ if (ret == CPA_STATUS_SUCCESS) {
+ return ((size_t)dstlen);
+ } else if (ret == CPA_STATUS_INCOMPRESSIBLE) {
+ if (d_len != s_len)
+ return (s_len);
+
+ bcopy(s_start, d_start, s_len);
+ return (s_len);
+ }
+ /* if hardware compression fails, do it again with software */
+ }
+
+ if (compress_func(d_start, &dstlen, s_start, s_len, n) != Z_OK) {
+ if (d_len != s_len)
+ return (s_len);
+
+ bcopy(s_start, d_start, s_len);
+ return (s_len);
+ }
+
+ return ((size_t)dstlen);
+}
+
+/*ARGSUSED*/
+int
+gzip_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
+{
+ zlen_t dstlen = d_len;
+
+ ASSERT(d_len >= s_len);
+
+ /* check if hardware accelerator can be used */
+ if (qat_dc_use_accel(d_len)) {
+ if (qat_compress(QAT_DECOMPRESS, s_start, s_len,
+ d_start, d_len, &dstlen) == CPA_STATUS_SUCCESS)
+ return (0);
+ /* if hardware decompression fails, do it again with software */
+ }
+
+ if (uncompress_func(d_start, &dstlen, s_start, s_len) != Z_OK)
+ return (-1);
+
+ return (0);
+}
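A minimal round-trip sketch of the two entry points above, assuming pre-allocated source, compressed, and destination buffers (the function and buffer names are hypothetical):

static boolean_t
gzip_roundtrip_example(void *src, void *cbuf, void *dbuf, size_t len, int level)
{
	size_t c_len;

	/* Compress into a buffer one byte smaller than the source. */
	c_len = gzip_compress(src, cbuf, len, len - 1, level);
	if (c_len == len)
		return (B_FALSE);	/* incompressible; caller stores it raw */

	return (gzip_decompress(cbuf, dbuf, c_len, len, level) == 0);
}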
diff --git a/sys/contrib/openzfs/module/zfs/hkdf.c b/sys/contrib/openzfs/module/zfs/hkdf.c
new file mode 100644
index 000000000000..14265472df7d
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/hkdf.c
@@ -0,0 +1,171 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2017, Datto, Inc. All rights reserved.
+ */
+
+#include <sys/crypto/api.h>
+#include <sys/sha2.h>
+#include <sys/hkdf.h>
+
+static int
+hkdf_sha512_extract(uint8_t *salt, uint_t salt_len, uint8_t *key_material,
+ uint_t km_len, uint8_t *out_buf)
+{
+ int ret;
+ crypto_mechanism_t mech;
+ crypto_key_t key;
+ crypto_data_t input_cd, output_cd;
+
+ /* initialize HMAC mechanism */
+ mech.cm_type = crypto_mech2id(SUN_CKM_SHA512_HMAC);
+ mech.cm_param = NULL;
+ mech.cm_param_len = 0;
+
+ /* initialize the salt as a crypto key */
+ key.ck_format = CRYPTO_KEY_RAW;
+ key.ck_length = CRYPTO_BYTES2BITS(salt_len);
+ key.ck_data = salt;
+
+ /* initialize crypto data for the input and output data */
+ input_cd.cd_format = CRYPTO_DATA_RAW;
+ input_cd.cd_offset = 0;
+ input_cd.cd_length = km_len;
+ input_cd.cd_raw.iov_base = (char *)key_material;
+ input_cd.cd_raw.iov_len = input_cd.cd_length;
+
+ output_cd.cd_format = CRYPTO_DATA_RAW;
+ output_cd.cd_offset = 0;
+ output_cd.cd_length = SHA512_DIGEST_LENGTH;
+ output_cd.cd_raw.iov_base = (char *)out_buf;
+ output_cd.cd_raw.iov_len = output_cd.cd_length;
+
+ ret = crypto_mac(&mech, &input_cd, &key, NULL, &output_cd, NULL);
+ if (ret != CRYPTO_SUCCESS)
+ return (SET_ERROR(EIO));
+
+ return (0);
+}
+
+static int
+hkdf_sha512_expand(uint8_t *extract_key, uint8_t *info, uint_t info_len,
+ uint8_t *out_buf, uint_t out_len)
+{
+ int ret;
+ crypto_mechanism_t mech;
+ crypto_context_t ctx;
+ crypto_key_t key;
+ crypto_data_t T_cd, info_cd, c_cd;
+ uint_t i, T_len = 0, pos = 0;
+ uint8_t c;
+ uint_t N = (out_len + SHA512_DIGEST_LENGTH) / SHA512_DIGEST_LENGTH;
+ uint8_t T[SHA512_DIGEST_LENGTH];
+
+ if (N > 255)
+ return (SET_ERROR(EINVAL));
+
+ /* initialize HMAC mechanism */
+ mech.cm_type = crypto_mech2id(SUN_CKM_SHA512_HMAC);
+ mech.cm_param = NULL;
+ mech.cm_param_len = 0;
+
+ /* initialize the salt as a crypto key */
+ key.ck_format = CRYPTO_KEY_RAW;
+ key.ck_length = CRYPTO_BYTES2BITS(SHA512_DIGEST_LENGTH);
+ key.ck_data = extract_key;
+
+ /* initialize crypto data for the input and output data */
+ T_cd.cd_format = CRYPTO_DATA_RAW;
+ T_cd.cd_offset = 0;
+ T_cd.cd_raw.iov_base = (char *)T;
+
+ c_cd.cd_format = CRYPTO_DATA_RAW;
+ c_cd.cd_offset = 0;
+ c_cd.cd_length = 1;
+ c_cd.cd_raw.iov_base = (char *)&c;
+ c_cd.cd_raw.iov_len = c_cd.cd_length;
+
+ info_cd.cd_format = CRYPTO_DATA_RAW;
+ info_cd.cd_offset = 0;
+ info_cd.cd_length = info_len;
+ info_cd.cd_raw.iov_base = (char *)info;
+ info_cd.cd_raw.iov_len = info_cd.cd_length;
+
+ for (i = 1; i <= N; i++) {
+ c = i;
+
+ T_cd.cd_length = T_len;
+ T_cd.cd_raw.iov_len = T_cd.cd_length;
+
+ ret = crypto_mac_init(&mech, &key, NULL, &ctx, NULL);
+ if (ret != CRYPTO_SUCCESS)
+ return (SET_ERROR(EIO));
+
+ ret = crypto_mac_update(ctx, &T_cd, NULL);
+ if (ret != CRYPTO_SUCCESS)
+ return (SET_ERROR(EIO));
+
+ ret = crypto_mac_update(ctx, &info_cd, NULL);
+ if (ret != CRYPTO_SUCCESS)
+ return (SET_ERROR(EIO));
+
+ ret = crypto_mac_update(ctx, &c_cd, NULL);
+ if (ret != CRYPTO_SUCCESS)
+ return (SET_ERROR(EIO));
+
+ T_len = SHA512_DIGEST_LENGTH;
+ T_cd.cd_length = T_len;
+ T_cd.cd_raw.iov_len = T_cd.cd_length;
+
+ ret = crypto_mac_final(ctx, &T_cd, NULL);
+ if (ret != CRYPTO_SUCCESS)
+ return (SET_ERROR(EIO));
+
+ bcopy(T, out_buf + pos,
+ (i != N) ? SHA512_DIGEST_LENGTH : (out_len - pos));
+ pos += SHA512_DIGEST_LENGTH;
+ }
+
+ return (0);
+}
+
+/*
+ * HKDF is designed to be a relatively fast function for deriving keys from a
+ * master key + a salt. We use this function to generate new encryption keys
+ * so as to avoid hitting the cryptographic limits of the underlying
+ * encryption modes. Note that, for the sake of deriving encryption keys, the
+ * info parameter is called the "salt" everywhere else in the code.
+ */
+int
+hkdf_sha512(uint8_t *key_material, uint_t km_len, uint8_t *salt,
+ uint_t salt_len, uint8_t *info, uint_t info_len, uint8_t *output_key,
+ uint_t out_len)
+{
+ int ret;
+ uint8_t extract_key[SHA512_DIGEST_LENGTH];
+
+ ret = hkdf_sha512_extract(salt, salt_len, key_material, km_len,
+ extract_key);
+ if (ret != 0)
+ return (ret);
+
+ ret = hkdf_sha512_expand(extract_key, info, info_len, output_key,
+ out_len);
+ if (ret != 0)
+ return (ret);
+
+ return (0);
+}
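A minimal usage sketch deriving a 256-bit key, assuming the caller already holds key material and a salt; wkeydata, wkeylen, salt, salt_len, and the "ZFS_EXAMPLE" info string are hypothetical:

	uint8_t keydata[32];
	int err;

	/* "ZFS_EXAMPLE" stands in for the caller's info string. */
	err = hkdf_sha512(wkeydata, wkeylen, salt, salt_len,
	    (uint8_t *)"ZFS_EXAMPLE", 11, keydata, sizeof (keydata));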
diff --git a/sys/contrib/openzfs/module/zfs/lz4.c b/sys/contrib/openzfs/module/zfs/lz4.c
new file mode 100644
index 000000000000..9da9d9e00635
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/lz4.c
@@ -0,0 +1,1084 @@
+/*
+ * LZ4 - Fast LZ compression algorithm
+ * Header File
+ * Copyright (C) 2011-2013, Yann Collet.
+ * BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * You can contact the author at :
+ * - LZ4 homepage : http://fastcompression.blogspot.com/p/lz4.html
+ * - LZ4 source repository : http://code.google.com/p/lz4/
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/zio_compress.h>
+
+static int real_LZ4_compress(const char *source, char *dest, int isize,
+ int osize);
+static int LZ4_uncompress_unknownOutputSize(const char *source, char *dest,
+ int isize, int maxOutputSize);
+static int LZ4_compressCtx(void *ctx, const char *source, char *dest,
+ int isize, int osize);
+static int LZ4_compress64kCtx(void *ctx, const char *source, char *dest,
+ int isize, int osize);
+
+static void *lz4_alloc(int flags);
+static void lz4_free(void *ctx);
+
+/*ARGSUSED*/
+size_t
+lz4_compress_zfs(void *s_start, void *d_start, size_t s_len,
+ size_t d_len, int n)
+{
+ uint32_t bufsiz;
+ char *dest = d_start;
+
+ ASSERT(d_len >= sizeof (bufsiz));
+
+ bufsiz = real_LZ4_compress(s_start, &dest[sizeof (bufsiz)], s_len,
+ d_len - sizeof (bufsiz));
+
+ /* Signal an error if the compression routine returned zero. */
+ if (bufsiz == 0)
+ return (s_len);
+
+ /*
+ * The exact compressed size is needed by the decompression routine,
+ * so it is stored at the start of the buffer. Note that this may be
+ * less than the compressed block size, which is rounded up to a
+ * multiple of 1<<ashift.
+ */
+ *(uint32_t *)dest = BE_32(bufsiz);
+
+ return (bufsiz + sizeof (bufsiz));
+}
+
+/*ARGSUSED*/
+int
+lz4_decompress_zfs(void *s_start, void *d_start, size_t s_len,
+ size_t d_len, int n)
+{
+ const char *src = s_start;
+ uint32_t bufsiz = BE_IN32(src);
+
+ /* invalid compressed buffer size encoded at start */
+ if (bufsiz + sizeof (bufsiz) > s_len)
+ return (1);
+
+ /*
+ * Returns 0 on success (decompression function returned non-negative)
+ * and non-zero on failure (decompression function returned negative).
+ */
+ return (LZ4_uncompress_unknownOutputSize(&src[sizeof (bufsiz)],
+ d_start, bufsiz, d_len) < 0);
+}
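A minimal round-trip sketch of the two wrappers above, assuming lz4_init() has already run and the buffers are pre-allocated (the function and buffer names are hypothetical):

static boolean_t
lz4_roundtrip_example(void *src, void *cbuf, void *dbuf, size_t len)
{
	size_t c_len;

	/* A destination smaller than the source makes "no gain" return s_len. */
	c_len = lz4_compress_zfs(src, cbuf, len, len - 1, 0);
	if (c_len >= len)
		return (B_FALSE);	/* incompressible; stored uncompressed */

	return (lz4_decompress_zfs(cbuf, dbuf, c_len, len, 0) == 0);
}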
+
+/*
+ * LZ4 API Description:
+ *
+ * Simple Functions:
+ * real_LZ4_compress() :
+ * isize : is the input size. Max supported value is ~1.9GB
+ * return : the number of bytes written in buffer dest
+ * or 0 if the compression fails (if LZ4_COMPRESSMIN is set).
+ * note : destination buffer must be already allocated.
+ * destination buffer must be sized to handle worst-case
+ * situations (input data not compressible); worst-case size
+ * evaluation is provided by function LZ4_compressBound().
+ *
+ * real_LZ4_uncompress() :
+ * osize : is the output size, therefore the original size
+ * return : the number of bytes read in the source buffer.
+ * If the source stream is malformed, the function will stop
+ * decoding and return a negative result, indicating the byte
+ * position of the faulty instruction. This function never
+ * writes beyond dest + osize, and is therefore protected
+ * against malicious data packets.
+ * note : destination buffer must be already allocated
+ * note : real_LZ4_uncompress() is not used in ZFS so its code
+ * is not present here.
+ *
+ * Advanced Functions
+ *
+ * LZ4_compressBound() :
+ * Provides the maximum size that LZ4 may output in a "worst case"
+ * scenario (input data not compressible); primarily useful for memory
+ * allocation of the output buffer.
+ *
+ * isize : is the input size. Max supported value is ~1.9GB
+ * return : maximum output size in a "worst case" scenario
+ * note : this function is limited by "int" range (2^31-1)
+ *
+ * LZ4_uncompress_unknownOutputSize() :
+ * isize : is the input size, therefore the compressed size
+ * maxOutputSize : is the size of the destination buffer (which must be
+ * already allocated)
+ * return : the number of bytes decoded in the destination buffer
+ * (necessarily <= maxOutputSize). If the source stream is
+ * malformed, the function will stop decoding and return a
+ * negative result, indicating the byte position of the faulty
+ * instruction. This function never writes beyond dest +
+ * maxOutputSize, and is therefore protected against malicious
+ * data packets.
+ * note : Destination buffer must be already allocated.
+ * This version is slightly slower than real_LZ4_uncompress()
+ *
+ * LZ4_compressCtx() :
+ * This function explicitly handles the CTX memory structure.
+ *
+ * ILLUMOS CHANGES: the CTX memory structure must be explicitly allocated
+ * by the caller (either on the stack or using kmem_cache_alloc). Passing
+ * NULL isn't valid.
+ *
+ * LZ4_compress64kCtx() :
+ * Same as LZ4_compressCtx(), but specific to small inputs (<64KB).
+ * isize *Must* be <64KB, otherwise the output will be corrupted.
+ *
+ * ILLUMOS CHANGES: the CTX memory structure must be explicitly allocated
+ * by the caller (either on the stack or using kmem_cache_alloc). Passing
+ * NULL isn't valid.
+ */
+
+/*
+ * Tuning parameters
+ */
+
+/*
+ * COMPRESSIONLEVEL: Increasing this value improves the compression ratio.
+ * Lowering this value reduces memory usage. Reduced memory usage
+ * typically improves speed, due to cache effects (ex: L1 32KB for Intel,
+ * L1 64KB for AMD). Memory usage formula : N->2^(N+2) Bytes
+ * (examples : 12 -> 16KB ; 17 -> 512KB)
+ */
+#define COMPRESSIONLEVEL 12
+
+/*
+ * NOTCOMPRESSIBLE_CONFIRMATION: Decreasing this value will make the
+ * algorithm skip faster over data segments considered "incompressible".
+ * This may decrease compression ratio dramatically, but will be
+ * faster on incompressible data. Increasing this value will make
+ * the algorithm search more before declaring a segment "incompressible".
+ * This could improve compression a bit, but will be slower on
+ * incompressible data. The default value (6) is recommended.
+ */
+#define NOTCOMPRESSIBLE_CONFIRMATION 6
+
+/*
+ * BIG_ENDIAN_NATIVE_BUT_INCOMPATIBLE: This will provide a boost to
+ * performance for big-endian CPUs, but the resulting compressed stream
+ * will be incompatible with little-endian CPUs. You can set this option
+ * to 1 in situations where data will stay within a closed environment.
+ * This option is useless on little-endian CPUs (such as x86).
+ */
+/* #define BIG_ENDIAN_NATIVE_BUT_INCOMPATIBLE 1 */
+
+/*
+ * CPU Feature Detection
+ */
+
+/* 32 or 64 bits ? */
+#if defined(_LP64)
+#define LZ4_ARCH64 1
+#else
+#define LZ4_ARCH64 0
+#endif
+
+/*
+ * Little Endian or Big Endian?
+ * Note: overwrite the below #define if you know your architecture endianness.
+ */
+#if defined(_ZFS_BIG_ENDIAN)
+#define LZ4_BIG_ENDIAN 1
+#else
+/*
+ * Little Endian assumed. PDP Endian and other very rare endian formats
+ * are unsupported.
+ */
+#undef LZ4_BIG_ENDIAN
+#endif
+
+/*
+ * Unaligned memory access is automatically enabled for "common" CPUs,
+ * such as x86. For other CPUs, the compiler will be more cautious and
+ * insert extra code to ensure aligned access is respected. If you know
+ * your target CPU supports unaligned memory access, you may want to
+ * force this option manually to improve performance.
+ */
+#if defined(__ARM_FEATURE_UNALIGNED)
+#define LZ4_FORCE_UNALIGNED_ACCESS 1
+#endif
+
+/*
+ * Illumos : we can't use GCC's __builtin_ctz family of builtins in the
+ * kernel
+ * Linux : we can use GCC's __builtin_ctz family of builtins in the
+ * kernel
+ */
+#undef LZ4_FORCE_SW_BITCOUNT
+#if defined(__sparc)
+#define LZ4_FORCE_SW_BITCOUNT
+#endif
+
+/*
+ * Compiler Options
+ */
+/* Disable restrict */
+#define restrict
+
+/*
+ * Linux : GCC_VERSION is defined as of 3.9-rc1, so undefine it.
+ * torvalds/linux@3f3f8d2f48acfd8ed3b8e6b7377935da57b27b16
+ */
+#ifdef GCC_VERSION
+#undef GCC_VERSION
+#endif
+
+#define GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)
+
+#if (GCC_VERSION >= 302) || (__INTEL_COMPILER >= 800) || defined(__clang__)
+#define expect(expr, value) (__builtin_expect((expr), (value)))
+#else
+#define expect(expr, value) (expr)
+#endif
+
+#ifndef likely
+#define likely(expr) expect((expr) != 0, 1)
+#endif
+
+#ifndef unlikely
+#define unlikely(expr) expect((expr) != 0, 0)
+#endif
+
+#define lz4_bswap16(x) ((unsigned short int) ((((x) >> 8) & 0xffu) | \
+ (((x) & 0xffu) << 8)))
+
+/* Basic types */
+#define BYTE uint8_t
+#define U16 uint16_t
+#define U32 uint32_t
+#define S32 int32_t
+#define U64 uint64_t
+
+#ifndef LZ4_FORCE_UNALIGNED_ACCESS
+#pragma pack(1)
+#endif
+
+typedef struct _U16_S {
+ U16 v;
+} U16_S;
+typedef struct _U32_S {
+ U32 v;
+} U32_S;
+typedef struct _U64_S {
+ U64 v;
+} U64_S;
+
+#ifndef LZ4_FORCE_UNALIGNED_ACCESS
+#pragma pack()
+#endif
+
+#define A64(x) (((U64_S *)(x))->v)
+#define A32(x) (((U32_S *)(x))->v)
+#define A16(x) (((U16_S *)(x))->v)
+
+/*
+ * Constants
+ */
+#define MINMATCH 4
+
+#define HASH_LOG COMPRESSIONLEVEL
+#define HASHTABLESIZE (1 << HASH_LOG)
+#define HASH_MASK (HASHTABLESIZE - 1)
+
+#define SKIPSTRENGTH (NOTCOMPRESSIBLE_CONFIRMATION > 2 ? \
+ NOTCOMPRESSIBLE_CONFIRMATION : 2)
+
+#define COPYLENGTH 8
+#define LASTLITERALS 5
+#define MFLIMIT (COPYLENGTH + MINMATCH)
+#define MINLENGTH (MFLIMIT + 1)
+
+#define MAXD_LOG 16
+#define MAX_DISTANCE ((1 << MAXD_LOG) - 1)
+
+#define ML_BITS 4
+#define ML_MASK ((1U<<ML_BITS)-1)
+#define RUN_BITS (8-ML_BITS)
+#define RUN_MASK ((1U<<RUN_BITS)-1)
+
+
+/*
+ * Architecture-specific macros
+ */
+#if LZ4_ARCH64
+#define STEPSIZE 8
+#define UARCH U64
+#define AARCH A64
+#define LZ4_COPYSTEP(s, d) A64(d) = A64(s); d += 8; s += 8;
+#define LZ4_COPYPACKET(s, d) LZ4_COPYSTEP(s, d)
+#define LZ4_SECURECOPY(s, d, e) if (d < e) LZ4_WILDCOPY(s, d, e)
+#define HTYPE U32
+#define INITBASE(base) const BYTE* const base = ip
+#else /* !LZ4_ARCH64 */
+#define STEPSIZE 4
+#define UARCH U32
+#define AARCH A32
+#define LZ4_COPYSTEP(s, d) A32(d) = A32(s); d += 4; s += 4;
+#define LZ4_COPYPACKET(s, d) LZ4_COPYSTEP(s, d); LZ4_COPYSTEP(s, d);
+#define LZ4_SECURECOPY LZ4_WILDCOPY
+#define HTYPE const BYTE *
+#define INITBASE(base) const int base = 0
+#endif /* !LZ4_ARCH64 */
+
+#if (defined(LZ4_BIG_ENDIAN) && !defined(BIG_ENDIAN_NATIVE_BUT_INCOMPATIBLE))
+#define LZ4_READ_LITTLEENDIAN_16(d, s, p) \
+ { U16 v = A16(p); v = lz4_bswap16(v); d = (s) - v; }
+#define LZ4_WRITE_LITTLEENDIAN_16(p, i) \
+ { U16 v = (U16)(i); v = lz4_bswap16(v); A16(p) = v; p += 2; }
+#else
+#define LZ4_READ_LITTLEENDIAN_16(d, s, p) { d = (s) - A16(p); }
+#define LZ4_WRITE_LITTLEENDIAN_16(p, v) { A16(p) = v; p += 2; }
+#endif
+
+
+/* Local structures */
+struct refTables {
+ HTYPE hashTable[HASHTABLESIZE];
+};
+
+
+/* Macros */
+#define LZ4_HASH_FUNCTION(i) (((i) * 2654435761U) >> ((MINMATCH * 8) - \
+ HASH_LOG))
+#define LZ4_HASH_VALUE(p) LZ4_HASH_FUNCTION(A32(p))
+#define LZ4_WILDCOPY(s, d, e) do { LZ4_COPYPACKET(s, d) } while (d < e);
+#define LZ4_BLINDCOPY(s, d, l) { BYTE* e = (d) + l; LZ4_WILDCOPY(s, d, e); \
+ d = e; }
+
+
+/* Private functions */
+#if LZ4_ARCH64
+
+static inline int
+LZ4_NbCommonBytes(register U64 val)
+{
+#if defined(LZ4_BIG_ENDIAN)
+#if ((defined(__GNUC__) && (GCC_VERSION >= 304)) || defined(__clang__)) && \
+ !defined(LZ4_FORCE_SW_BITCOUNT)
+ return (__builtin_clzll(val) >> 3);
+#else
+ int r;
+ if (!(val >> 32)) {
+ r = 4;
+ } else {
+ r = 0;
+ val >>= 32;
+ }
+ if (!(val >> 16)) {
+ r += 2;
+ val >>= 8;
+ } else {
+ val >>= 24;
+ }
+ r += (!val);
+ return (r);
+#endif
+#else
+#if ((defined(__GNUC__) && (GCC_VERSION >= 304)) || defined(__clang__)) && \
+ !defined(LZ4_FORCE_SW_BITCOUNT)
+ return (__builtin_ctzll(val) >> 3);
+#else
+ static const int DeBruijnBytePos[64] =
+ { 0, 0, 0, 0, 0, 1, 1, 2, 0, 3, 1, 3, 1, 4, 2, 7, 0, 2, 3, 6, 1, 5,
+ 3, 5, 1, 3, 4, 4, 2, 5, 6, 7, 7, 0, 1, 2, 3, 3, 4, 6, 2, 6, 5,
+ 5, 3, 4, 5, 6, 7, 1, 2, 4, 6, 4,
+ 4, 5, 7, 2, 6, 5, 7, 6, 7, 7
+ };
+ return DeBruijnBytePos[((U64) ((val & -val) * 0x0218A392CDABBD3F)) >>
+ 58];
+#endif
+#endif
+}
+
+#else
+
+static inline int
+LZ4_NbCommonBytes(register U32 val)
+{
+#if defined(LZ4_BIG_ENDIAN)
+#if ((defined(__GNUC__) && (GCC_VERSION >= 304)) || defined(__clang__)) && \
+ !defined(LZ4_FORCE_SW_BITCOUNT)
+ return (__builtin_clz(val) >> 3);
+#else
+ int r;
+ if (!(val >> 16)) {
+ r = 2;
+ val >>= 8;
+ } else {
+ r = 0;
+ val >>= 24;
+ }
+ r += (!val);
+ return (r);
+#endif
+#else
+#if defined(__GNUC__) && (GCC_VERSION >= 304) && \
+ !defined(LZ4_FORCE_SW_BITCOUNT)
+ return (__builtin_ctz(val) >> 3);
+#else
+ static const int DeBruijnBytePos[32] = {
+ 0, 0, 3, 0, 3, 1, 3, 0,
+ 3, 2, 2, 1, 3, 2, 0, 1,
+ 3, 3, 1, 2, 2, 2, 2, 0,
+ 3, 1, 2, 0, 1, 0, 1, 1
+ };
+ return DeBruijnBytePos[((U32) ((val & -(S32) val) * 0x077CB531U)) >>
+ 27];
+#endif
+#endif
+}
+
+#endif
+
+/* Compression functions */
+
+/*ARGSUSED*/
+static int
+LZ4_compressCtx(void *ctx, const char *source, char *dest, int isize,
+ int osize)
+{
+ struct refTables *srt = (struct refTables *)ctx;
+ HTYPE *HashTable = (HTYPE *) (srt->hashTable);
+
+ const BYTE *ip = (BYTE *) source;
+ INITBASE(base);
+ const BYTE *anchor = ip;
+ const BYTE *const iend = ip + isize;
+ const BYTE *const oend = (BYTE *) dest + osize;
+ const BYTE *const mflimit = iend - MFLIMIT;
+#define matchlimit (iend - LASTLITERALS)
+
+ BYTE *op = (BYTE *) dest;
+
+ int len, length;
+ const int skipStrength = SKIPSTRENGTH;
+ U32 forwardH;
+
+
+ /* Init */
+ if (isize < MINLENGTH)
+ goto _last_literals;
+
+ /* First Byte */
+ HashTable[LZ4_HASH_VALUE(ip)] = ip - base;
+ ip++;
+ forwardH = LZ4_HASH_VALUE(ip);
+
+ /* Main Loop */
+ for (;;) {
+ int findMatchAttempts = (1U << skipStrength) + 3;
+ const BYTE *forwardIp = ip;
+ const BYTE *ref;
+ BYTE *token;
+
+ /* Find a match */
+ do {
+ U32 h = forwardH;
+ int step = findMatchAttempts++ >> skipStrength;
+ ip = forwardIp;
+ forwardIp = ip + step;
+
+ if (unlikely(forwardIp > mflimit)) {
+ goto _last_literals;
+ }
+
+ forwardH = LZ4_HASH_VALUE(forwardIp);
+ ref = base + HashTable[h];
+ HashTable[h] = ip - base;
+
+ } while ((ref < ip - MAX_DISTANCE) || (A32(ref) != A32(ip)));
+
+ /* Catch up */
+ while ((ip > anchor) && (ref > (BYTE *) source) &&
+ unlikely(ip[-1] == ref[-1])) {
+ ip--;
+ ref--;
+ }
+
+ /* Encode Literal length */
+ length = ip - anchor;
+ token = op++;
+
+ /* Check output limit */
+ if (unlikely(op + length + (2 + 1 + LASTLITERALS) +
+ (length >> 8) > oend))
+ return (0);
+
+ if (length >= (int)RUN_MASK) {
+ *token = (RUN_MASK << ML_BITS);
+ len = length - RUN_MASK;
+ for (; len > 254; len -= 255)
+ *op++ = 255;
+ *op++ = (BYTE)len;
+ } else
+ *token = (length << ML_BITS);
+
+ /* Copy Literals */
+ LZ4_BLINDCOPY(anchor, op, length);
+
+ _next_match:
+ /* Encode Offset */
+ LZ4_WRITE_LITTLEENDIAN_16(op, ip - ref);
+
+ /* Start Counting */
+ ip += MINMATCH;
+ ref += MINMATCH; /* MinMatch verified */
+ anchor = ip;
+ while (likely(ip < matchlimit - (STEPSIZE - 1))) {
+ UARCH diff = AARCH(ref) ^ AARCH(ip);
+ if (!diff) {
+ ip += STEPSIZE;
+ ref += STEPSIZE;
+ continue;
+ }
+ ip += LZ4_NbCommonBytes(diff);
+ goto _endCount;
+ }
+#if LZ4_ARCH64
+ if ((ip < (matchlimit - 3)) && (A32(ref) == A32(ip))) {
+ ip += 4;
+ ref += 4;
+ }
+#endif
+ if ((ip < (matchlimit - 1)) && (A16(ref) == A16(ip))) {
+ ip += 2;
+ ref += 2;
+ }
+ if ((ip < matchlimit) && (*ref == *ip))
+ ip++;
+ _endCount:
+
+ /* Encode MatchLength */
+ len = (ip - anchor);
+ /* Check output limit */
+ if (unlikely(op + (1 + LASTLITERALS) + (len >> 8) > oend))
+ return (0);
+ if (len >= (int)ML_MASK) {
+ *token += ML_MASK;
+ len -= ML_MASK;
+ for (; len > 509; len -= 510) {
+ *op++ = 255;
+ *op++ = 255;
+ }
+ if (len > 254) {
+ len -= 255;
+ *op++ = 255;
+ }
+ *op++ = (BYTE)len;
+ } else
+ *token += len;
+
+ /* Test end of chunk */
+ if (ip > mflimit) {
+ anchor = ip;
+ break;
+ }
+ /* Fill table */
+ HashTable[LZ4_HASH_VALUE(ip - 2)] = ip - 2 - base;
+
+ /* Test next position */
+ ref = base + HashTable[LZ4_HASH_VALUE(ip)];
+ HashTable[LZ4_HASH_VALUE(ip)] = ip - base;
+ if ((ref > ip - (MAX_DISTANCE + 1)) && (A32(ref) == A32(ip))) {
+ token = op++;
+ *token = 0;
+ goto _next_match;
+ }
+ /* Prepare next loop */
+ anchor = ip++;
+ forwardH = LZ4_HASH_VALUE(ip);
+ }
+
+ _last_literals:
+ /* Encode Last Literals */
+ {
+ int lastRun = iend - anchor;
+ if (op + lastRun + 1 + ((lastRun + 255 - RUN_MASK) / 255) >
+ oend)
+ return (0);
+ if (lastRun >= (int)RUN_MASK) {
+ *op++ = (RUN_MASK << ML_BITS);
+ lastRun -= RUN_MASK;
+ for (; lastRun > 254; lastRun -= 255) {
+ *op++ = 255;
+ }
+ *op++ = (BYTE)lastRun;
+ } else
+ *op++ = (lastRun << ML_BITS);
+ (void) memcpy(op, anchor, iend - anchor);
+ op += iend - anchor;
+ }
+
+ /* End */
+ return (int)(((char *)op) - dest);
+}
+
+
+
+/* Note : this function is valid only if isize < LZ4_64KLIMIT */
+#define LZ4_64KLIMIT ((1 << 16) + (MFLIMIT - 1))
+#define HASHLOG64K (HASH_LOG + 1)
+#define HASH64KTABLESIZE (1U << HASHLOG64K)
+#define LZ4_HASH64K_FUNCTION(i) (((i) * 2654435761U) >> ((MINMATCH*8) - \
+ HASHLOG64K))
+#define LZ4_HASH64K_VALUE(p) LZ4_HASH64K_FUNCTION(A32(p))
+
+/*ARGSUSED*/
+static int
+LZ4_compress64kCtx(void *ctx, const char *source, char *dest, int isize,
+ int osize)
+{
+ struct refTables *srt = (struct refTables *)ctx;
+ U16 *HashTable = (U16 *) (srt->hashTable);
+
+ const BYTE *ip = (BYTE *) source;
+ const BYTE *anchor = ip;
+ const BYTE *const base = ip;
+ const BYTE *const iend = ip + isize;
+ const BYTE *const oend = (BYTE *) dest + osize;
+ const BYTE *const mflimit = iend - MFLIMIT;
+#define matchlimit (iend - LASTLITERALS)
+
+ BYTE *op = (BYTE *) dest;
+
+ int len, length;
+ const int skipStrength = SKIPSTRENGTH;
+ U32 forwardH;
+
+ /* Init */
+ if (isize < MINLENGTH)
+ goto _last_literals;
+
+ /* First Byte */
+ ip++;
+ forwardH = LZ4_HASH64K_VALUE(ip);
+
+ /* Main Loop */
+ for (;;) {
+ int findMatchAttempts = (1U << skipStrength) + 3;
+ const BYTE *forwardIp = ip;
+ const BYTE *ref;
+ BYTE *token;
+
+ /* Find a match */
+ do {
+ U32 h = forwardH;
+ int step = findMatchAttempts++ >> skipStrength;
+ ip = forwardIp;
+ forwardIp = ip + step;
+
+ if (forwardIp > mflimit) {
+ goto _last_literals;
+ }
+
+ forwardH = LZ4_HASH64K_VALUE(forwardIp);
+ ref = base + HashTable[h];
+ HashTable[h] = ip - base;
+
+ } while (A32(ref) != A32(ip));
+
+ /* Catch up */
+ while ((ip > anchor) && (ref > (BYTE *) source) &&
+ (ip[-1] == ref[-1])) {
+ ip--;
+ ref--;
+ }
+
+ /* Encode Literal length */
+ length = ip - anchor;
+ token = op++;
+
+ /* Check output limit */
+ if (unlikely(op + length + (2 + 1 + LASTLITERALS) +
+ (length >> 8) > oend))
+ return (0);
+
+ if (length >= (int)RUN_MASK) {
+ *token = (RUN_MASK << ML_BITS);
+ len = length - RUN_MASK;
+ for (; len > 254; len -= 255)
+ *op++ = 255;
+ *op++ = (BYTE)len;
+ } else
+ *token = (length << ML_BITS);
+
+ /* Copy Literals */
+ LZ4_BLINDCOPY(anchor, op, length);
+
+ _next_match:
+ /* Encode Offset */
+ LZ4_WRITE_LITTLEENDIAN_16(op, ip - ref);
+
+ /* Start Counting */
+ ip += MINMATCH;
+ ref += MINMATCH; /* MinMatch verified */
+ anchor = ip;
+ while (ip < matchlimit - (STEPSIZE - 1)) {
+ UARCH diff = AARCH(ref) ^ AARCH(ip);
+ if (!diff) {
+ ip += STEPSIZE;
+ ref += STEPSIZE;
+ continue;
+ }
+ ip += LZ4_NbCommonBytes(diff);
+ goto _endCount;
+ }
+#if LZ4_ARCH64
+ if ((ip < (matchlimit - 3)) && (A32(ref) == A32(ip))) {
+ ip += 4;
+ ref += 4;
+ }
+#endif
+ if ((ip < (matchlimit - 1)) && (A16(ref) == A16(ip))) {
+ ip += 2;
+ ref += 2;
+ }
+ if ((ip < matchlimit) && (*ref == *ip))
+ ip++;
+ _endCount:
+
+ /* Encode MatchLength */
+ len = (ip - anchor);
+ /* Check output limit */
+ if (unlikely(op + (1 + LASTLITERALS) + (len >> 8) > oend))
+ return (0);
+ if (len >= (int)ML_MASK) {
+ *token += ML_MASK;
+ len -= ML_MASK;
+ for (; len > 509; len -= 510) {
+ *op++ = 255;
+ *op++ = 255;
+ }
+ if (len > 254) {
+ len -= 255;
+ *op++ = 255;
+ }
+ *op++ = (BYTE)len;
+ } else
+ *token += len;
+
+ /* Test end of chunk */
+ if (ip > mflimit) {
+ anchor = ip;
+ break;
+ }
+ /* Fill table */
+ HashTable[LZ4_HASH64K_VALUE(ip - 2)] = ip - 2 - base;
+
+ /* Test next position */
+ ref = base + HashTable[LZ4_HASH64K_VALUE(ip)];
+ HashTable[LZ4_HASH64K_VALUE(ip)] = ip - base;
+ if (A32(ref) == A32(ip)) {
+ token = op++;
+ *token = 0;
+ goto _next_match;
+ }
+ /* Prepare next loop */
+ anchor = ip++;
+ forwardH = LZ4_HASH64K_VALUE(ip);
+ }
+
+ _last_literals:
+ /* Encode Last Literals */
+ {
+ int lastRun = iend - anchor;
+ if (op + lastRun + 1 + ((lastRun + 255 - RUN_MASK) / 255) >
+ oend)
+ return (0);
+ if (lastRun >= (int)RUN_MASK) {
+ *op++ = (RUN_MASK << ML_BITS);
+ lastRun -= RUN_MASK;
+ for (; lastRun > 254; lastRun -= 255)
+ *op++ = 255;
+ *op++ = (BYTE)lastRun;
+ } else
+ *op++ = (lastRun << ML_BITS);
+ (void) memcpy(op, anchor, iend - anchor);
+ op += iend - anchor;
+ }
+
+ /* End */
+ return (int)(((char *)op) - dest);
+}
+
+static int
+real_LZ4_compress(const char *source, char *dest, int isize, int osize)
+{
+ void *ctx;
+ int result;
+
+ ctx = lz4_alloc(KM_SLEEP);
+
+ /*
+ * If we are out of kernel memory, gently fall through; this will
+ * disable compression in zio_compress_data()
+ */
+ if (ctx == NULL)
+ return (0);
+
+ memset(ctx, 0, sizeof (struct refTables));
+
+ if (isize < LZ4_64KLIMIT)
+ result = LZ4_compress64kCtx(ctx, source, dest, isize, osize);
+ else
+ result = LZ4_compressCtx(ctx, source, dest, isize, osize);
+
+ lz4_free(ctx);
+ return (result);
+}
+
+/* Decompression functions */
+
+/*
+ * Note: The decoding functions real_LZ4_uncompress() and
+ * LZ4_uncompress_unknownOutputSize() are safe against "buffer overflow"
+ * attacks. They will never write or read outside of the provided
+ * output buffers. LZ4_uncompress_unknownOutputSize() also ensures that
+ * it will never read outside of the input buffer. A corrupted input
+ * will produce an error result, a negative int, indicating the position
+ * of the error within input stream.
+ *
+ * Note[2]: real_LZ4_uncompress(), referred to above, is not used in ZFS so
+ * its code is not present here.
+ */
+
+static const int dec32table[] = {0, 3, 2, 3, 0, 0, 0, 0};
+#if LZ4_ARCH64
+static const int dec64table[] = {0, 0, 0, -1, 0, 1, 2, 3};
+#endif
+
+static int
+LZ4_uncompress_unknownOutputSize(const char *source, char *dest, int isize,
+ int maxOutputSize)
+{
+ /* Local Variables */
+ const BYTE *restrict ip = (const BYTE *) source;
+ const BYTE *const iend = ip + isize;
+ const BYTE *ref;
+
+ BYTE *op = (BYTE *) dest;
+ BYTE *const oend = op + maxOutputSize;
+ BYTE *cpy;
+
+ /* Main Loop */
+ while (ip < iend) {
+ unsigned token;
+ size_t length;
+
+ /* get runlength */
+ token = *ip++;
+ if ((length = (token >> ML_BITS)) == RUN_MASK) {
+ int s = 255;
+ while ((ip < iend) && (s == 255)) {
+ s = *ip++;
+ if (unlikely(length > (size_t)(length + s)))
+ goto _output_error;
+ length += s;
+ }
+ }
+ /* copy literals */
+ cpy = op + length;
+ /* CORNER-CASE: cpy might overflow. */
+ if (cpy < op)
+ goto _output_error; /* cpy was overflowed, bail! */
+ if ((cpy > oend - COPYLENGTH) ||
+ (ip + length > iend - COPYLENGTH)) {
+ if (cpy > oend)
+ /* Error: writes beyond output buffer */
+ goto _output_error;
+ if (ip + length != iend)
+ /*
+ * Error: LZ4 format requires to consume all
+ * input at this stage
+ */
+ goto _output_error;
+ (void) memcpy(op, ip, length);
+ op += length;
+ /* Necessarily EOF, due to parsing restrictions */
+ break;
+ }
+ LZ4_WILDCOPY(ip, op, cpy);
+ ip -= (op - cpy);
+ op = cpy;
+
+ /* get offset */
+ LZ4_READ_LITTLEENDIAN_16(ref, cpy, ip);
+ ip += 2;
+ if (ref < (BYTE * const) dest)
+ /*
+ * Error: offset creates reference outside of
+ * destination buffer
+ */
+ goto _output_error;
+
+ /* get matchlength */
+ if ((length = (token & ML_MASK)) == ML_MASK) {
+ while (ip < iend) {
+ int s = *ip++;
+ if (unlikely(length > (size_t)(length + s)))
+ goto _output_error;
+ length += s;
+ if (s == 255)
+ continue;
+ break;
+ }
+ }
+ /* copy repeated sequence */
+ if (unlikely(op - ref < STEPSIZE)) {
+#if LZ4_ARCH64
+ int dec64 = dec64table[op - ref];
+#else
+ const int dec64 = 0;
+#endif
+ op[0] = ref[0];
+ op[1] = ref[1];
+ op[2] = ref[2];
+ op[3] = ref[3];
+ op += 4;
+ ref += 4;
+ ref -= dec32table[op - ref];
+ A32(op) = A32(ref);
+ op += STEPSIZE - 4;
+ ref -= dec64;
+ } else {
+ LZ4_COPYSTEP(ref, op);
+ }
+ cpy = op + length - (STEPSIZE - 4);
+ if (cpy > oend - COPYLENGTH) {
+ if (cpy > oend)
+ /*
+ * Error: request to write outside of
+ * destination buffer
+ */
+ goto _output_error;
+#if LZ4_ARCH64
+ if ((ref + COPYLENGTH) > oend)
+#else
+ if ((ref + COPYLENGTH) > oend ||
+ (op + COPYLENGTH) > oend)
+#endif
+ goto _output_error;
+ LZ4_SECURECOPY(ref, op, (oend - COPYLENGTH));
+ while (op < cpy)
+ *op++ = *ref++;
+ op = cpy;
+ if (op == oend)
+ /*
+ * Check EOF (should never happen, since
+ * last 5 bytes are supposed to be literals)
+ */
+ goto _output_error;
+ continue;
+ }
+ LZ4_SECURECOPY(ref, op, cpy);
+ op = cpy; /* correction */
+ }
+
+ /* end of decoding */
+ return (int)(((char *)op) - dest);
+
+ /* write overflow error detected */
+ _output_error:
+ return (-1);
+}
+
+#ifdef __FreeBSD__
+/*
+ * FreeBSD has 4, 8 and 16 KB malloc zones which can be used here.
+ * Should struct refTables get resized, this may need to be revisited;
+ * hence the compile-time asserts.
+ */
+_Static_assert(sizeof(struct refTables) <= 16384,
+ "refTables too big for malloc");
+_Static_assert((sizeof(struct refTables) % 4096) == 0,
+ "refTables not a multiple of page size");
+#else
+#define ZFS_LZ4_USE_CACHE
+#endif
+
+#ifdef ZFS_LZ4_USE_CACHE
+static kmem_cache_t *lz4_cache;
+
+void
+lz4_init(void)
+{
+ lz4_cache = kmem_cache_create("lz4_cache",
+ sizeof (struct refTables), 0, NULL, NULL, NULL, NULL, NULL, 0);
+}
+
+void
+lz4_fini(void)
+{
+ if (lz4_cache) {
+ kmem_cache_destroy(lz4_cache);
+ lz4_cache = NULL;
+ }
+}
+
+static void *
+lz4_alloc(int flags)
+{
+ ASSERT(lz4_cache != NULL);
+ return (kmem_cache_alloc(lz4_cache, flags));
+}
+
+static void
+lz4_free(void *ctx)
+{
+ kmem_cache_free(lz4_cache, ctx);
+}
+#else
+void
+lz4_init(void)
+{
+}
+
+void
+lz4_fini(void)
+{
+}
+
+static void *
+lz4_alloc(int flags)
+{
+ return (kmem_alloc(sizeof (struct refTables), flags));
+}
+
+static void
+lz4_free(void *ctx)
+{
+ kmem_free(ctx, sizeof (struct refTables));
+}
+#endif
diff --git a/sys/contrib/openzfs/module/zfs/lzjb.c b/sys/contrib/openzfs/module/zfs/lzjb.c
new file mode 100644
index 000000000000..a478e64c5141
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/lzjb.c
@@ -0,0 +1,132 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+/*
+ * We keep our own copy of this algorithm for 3 main reasons:
+ * 1. If we didn't, anyone modifying common/os/compress.c would
+ * directly break our on-disk format.
+ * 2. Our version of lzjb does not have a number of checks that the
+ * common/os version needs and uses
+ * 3. We initialize the lempel to ensure deterministic results,
+ * so that identical blocks can always be deduplicated.
+ * In particular, we are adding the "feature" that compress() can
+ * take a destination buffer size and return the compressed length, or the
+ * source length if compression would overflow the destination buffer.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/zio_compress.h>
+
+#define MATCH_BITS 6
+#define MATCH_MIN 3
+#define MATCH_MAX ((1 << MATCH_BITS) + (MATCH_MIN - 1))
+#define OFFSET_MASK ((1 << (16 - MATCH_BITS)) - 1)
+#define LEMPEL_SIZE 1024
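+
+/*
+ * Illustrative note, restating the encoding used by the code below rather
+ * than adding a new code path: a match is emitted as a two-byte copy
+ * tuple, with the high MATCH_BITS bits of the first byte holding
+ * (mlen - MATCH_MIN) and the remaining 16 - MATCH_BITS bits holding the
+ * backward offset:
+ *
+ *	byte0 = ((mlen - MATCH_MIN) << (NBBY - MATCH_BITS)) | (offset >> NBBY)
+ *	byte1 = (uchar_t)offset
+ *
+ * With MATCH_BITS = 6 this supports matches of MATCH_MIN (3) to
+ * MATCH_MAX (66) bytes at offsets of up to OFFSET_MASK (1023) bytes back.
+ */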
+
+/*ARGSUSED*/
+size_t
+lzjb_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
+{
+ uchar_t *src = s_start;
+ uchar_t *dst = d_start;
+ uchar_t *cpy;
+ uchar_t *copymap = NULL;
+ int copymask = 1 << (NBBY - 1);
+ int mlen, offset, hash;
+ uint16_t *hp;
+ uint16_t *lempel;
+
+ lempel = kmem_zalloc(LEMPEL_SIZE * sizeof (uint16_t), KM_SLEEP);
+ while (src < (uchar_t *)s_start + s_len) {
+ if ((copymask <<= 1) == (1 << NBBY)) {
+ if (dst >= (uchar_t *)d_start + d_len - 1 - 2 * NBBY) {
+ kmem_free(lempel,
+ LEMPEL_SIZE*sizeof (uint16_t));
+ return (s_len);
+ }
+ copymask = 1;
+ copymap = dst;
+ *dst++ = 0;
+ }
+ if (src > (uchar_t *)s_start + s_len - MATCH_MAX) {
+ *dst++ = *src++;
+ continue;
+ }
+ hash = (src[0] << 16) + (src[1] << 8) + src[2];
+ hash += hash >> 9;
+ hash += hash >> 5;
+ hp = &lempel[hash & (LEMPEL_SIZE - 1)];
+ offset = (intptr_t)(src - *hp) & OFFSET_MASK;
+ *hp = (uint16_t)(uintptr_t)src;
+ cpy = src - offset;
+ if (cpy >= (uchar_t *)s_start && cpy != src &&
+ src[0] == cpy[0] && src[1] == cpy[1] && src[2] == cpy[2]) {
+ *copymap |= copymask;
+ for (mlen = MATCH_MIN; mlen < MATCH_MAX; mlen++)
+ if (src[mlen] != cpy[mlen])
+ break;
+ *dst++ = ((mlen - MATCH_MIN) << (NBBY - MATCH_BITS)) |
+ (offset >> NBBY);
+ *dst++ = (uchar_t)offset;
+ src += mlen;
+ } else {
+ *dst++ = *src++;
+ }
+ }
+
+ kmem_free(lempel, LEMPEL_SIZE * sizeof (uint16_t));
+ return (dst - (uchar_t *)d_start);
+}
+
+/*ARGSUSED*/
+int
+lzjb_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
+{
+ uchar_t *src = s_start;
+ uchar_t *dst = d_start;
+ uchar_t *d_end = (uchar_t *)d_start + d_len;
+ uchar_t *cpy;
+ uchar_t copymap = 0;
+ int copymask = 1 << (NBBY - 1);
+
+ while (dst < d_end) {
+ if ((copymask <<= 1) == (1 << NBBY)) {
+ copymask = 1;
+ copymap = *src++;
+ }
+ if (copymap & copymask) {
+ int mlen = (src[0] >> (NBBY - MATCH_BITS)) + MATCH_MIN;
+ int offset = ((src[0] << NBBY) | src[1]) & OFFSET_MASK;
+ src += 2;
+ if ((cpy = dst - offset) < (uchar_t *)d_start)
+ return (-1);
+ while (--mlen >= 0 && dst < d_end)
+ *dst++ = *cpy++;
+ } else {
+ *dst++ = *src++;
+ }
+ }
+ return (0);
+}
diff --git a/sys/contrib/openzfs/module/zfs/metaslab.c b/sys/contrib/openzfs/module/zfs/metaslab.c
new file mode 100644
index 000000000000..bc4f007b61a1
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/metaslab.c
@@ -0,0 +1,6287 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2019 by Delphix. All rights reserved.
+ * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2015, Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2017, Intel Corporation.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/dmu.h>
+#include <sys/dmu_tx.h>
+#include <sys/space_map.h>
+#include <sys/metaslab_impl.h>
+#include <sys/vdev_impl.h>
+#include <sys/vdev_draid.h>
+#include <sys/zio.h>
+#include <sys/spa_impl.h>
+#include <sys/zfeature.h>
+#include <sys/vdev_indirect_mapping.h>
+#include <sys/zap.h>
+#include <sys/btree.h>
+
+#define WITH_DF_BLOCK_ALLOCATOR
+
+#define GANG_ALLOCATION(flags) \
+ ((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER))
+
+/*
+ * Metaslab granularity, in bytes. This is roughly similar to what would be
+ * referred to as the "stripe size" in traditional RAID arrays. In normal
+ * operation, we will try to write this amount of data to a top-level vdev
+ * before moving on to the next one.
+ */
+unsigned long metaslab_aliquot = 512 << 10;
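+
+/*
+ * For example, with the default of 512K the group aliquot is scaled by
+ * the number of children of the top-level vdev (see
+ * metaslab_group_activate() below), so a 4-wide top-level vdev is handed
+ * roughly 2M before the rotor advances to the next one.
+ */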
+
+/*
+ * For testing, make some blocks above a certain size be gang blocks.
+ */
+unsigned long metaslab_force_ganging = SPA_MAXBLOCKSIZE + 1;
+
+/*
+ * In pools where the log space map feature is not enabled we touch
+ * multiple metaslabs (and their respective space maps) with each
+ * transaction group. Thus, we benefit from having a small space map
+ * block size since it allows us to issue more I/O operations scattered
+ * around the disk. So a sane default for the space map block size
+ * is 8~16K.
+ */
+int zfs_metaslab_sm_blksz_no_log = (1 << 14);
+
+/*
+ * When the log space map feature is enabled, we accumulate a lot of
+ * changes per metaslab that are flushed once in a while so we benefit
+ * from a bigger block size like 128K for the metaslab space maps.
+ */
+int zfs_metaslab_sm_blksz_with_log = (1 << 17);
+
+/*
+ * The in-core space map representation is more compact than its on-disk form.
+ * The zfs_condense_pct determines how much more compact the in-core
+ * space map representation must be before we compact it on-disk.
+ * Values should be greater than or equal to 100.
+ */
+int zfs_condense_pct = 200;
+
+/*
+ * Condensing a metaslab is not guaranteed to actually reduce the amount of
+ * space used on disk. In particular, a space map uses data in increments of
+ * MAX(1 << ashift, space_map_blksz), so a metaslab might use the
+ * same number of blocks after condensing. Since the goal of condensing is to
+ * reduce the number of IOPs required to read the space map, we only want to
+ * condense when we can be sure we will reduce the number of blocks used by the
+ * space map. Unfortunately, we cannot precisely compute whether or not this is
+ * the case in metaslab_should_condense since we are holding ms_lock. Instead,
+ * we apply the following heuristic: do not condense a spacemap unless the
+ * uncondensed size consumes more than zfs_metaslab_condense_block_threshold
+ * blocks.
+ */
+int zfs_metaslab_condense_block_threshold = 4;
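+
+/*
+ * Illustrative sketch only (placeholder names, not real variables); the
+ * authoritative check lives in metaslab_should_condense(). Combining the
+ * two tunables above, a space map is roughly worth condensing when
+ *
+ *	ondisk_size >= optimal_size * zfs_condense_pct / 100 &&
+ *	ondisk_size > zfs_metaslab_condense_block_threshold * sm_blksz
+ *
+ * i.e. with the defaults, when the on-disk representation is at least
+ * twice the size of its condensed form and spans more than four blocks.
+ */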
+
+/*
+ * The zfs_mg_noalloc_threshold defines which metaslab groups should
+ * be eligible for allocation. The value is defined as a percentage of
+ * free space. Metaslab groups that have more free space than
+ * zfs_mg_noalloc_threshold are always eligible for allocations. Once
+ * a metaslab group's free space is less than or equal to the
+ * zfs_mg_noalloc_threshold the allocator will avoid allocating to that
+ * group unless all groups in the pool have reached zfs_mg_noalloc_threshold.
+ * Once all groups in the pool reach zfs_mg_noalloc_threshold then all
+ * groups are allowed to accept allocations. Gang blocks are always
+ * eligible to allocate on any metaslab group. The default value of 0 means
+ * no metaslab group will be excluded based on this criterion.
+ */
+int zfs_mg_noalloc_threshold = 0;
+
+/*
+ * Metaslab groups are considered eligible for allocations if their
+ * fragmentation metric (measured as a percentage) is less than or
+ * equal to zfs_mg_fragmentation_threshold. If a metaslab group
+ * exceeds this threshold then it will be skipped unless all metaslab
+ * groups within the metaslab class have also crossed this threshold.
+ *
+ * This tunable was introduced to avoid edge cases where we continue
+ * allocating from very fragmented disks in our pool while other, less
+ * fragmented disks exist. On the other hand, if all disks in the
+ * pool are uniformly approaching the threshold, the threshold can
+ * become a performance speed bump, where we keep switching the disks
+ * that we allocate from (e.g. we allocate some segments from disk A,
+ * pushing it past the threshold, while freeing segments from disk B
+ * brings its fragmentation back below the threshold).
+ *
+ * Empirically, we've seen that our vdev selection for allocations is
+ * good enough that fragmentation increases uniformly across all vdevs
+ * the majority of the time. Thus we set the threshold percentage high
+ * enough to avoid hitting the speed bump on pools that are being pushed
+ * to the edge.
+ */
+int zfs_mg_fragmentation_threshold = 95;
+
+/*
+ * Allow metaslabs to keep their active state as long as their fragmentation
+ * percentage is less than or equal to zfs_metaslab_fragmentation_threshold. An
+ * active metaslab that exceeds this threshold will no longer keep its active
+ * status allowing better metaslabs to be selected.
+ */
+int zfs_metaslab_fragmentation_threshold = 70;
+
+/*
+ * When set will load all metaslabs when pool is first opened.
+ */
+int metaslab_debug_load = 0;
+
+/*
+ * When set will prevent metaslabs from being unloaded.
+ */
+int metaslab_debug_unload = 0;
+
+/*
+ * Minimum size which forces the dynamic allocator to change
+ * its allocation strategy. Once the space map cannot satisfy
+ * an allocation of this size then it switches to using a more
+ * aggressive strategy (i.e. search by size rather than by offset).
+ */
+uint64_t metaslab_df_alloc_threshold = SPA_OLD_MAXBLOCKSIZE;
+
+/*
+ * The minimum free space, in percent, which must be available
+ * in a space map to continue allocations in a first-fit fashion.
+ * Once the space map's free space drops below this level we dynamically
+ * switch to using best-fit allocations.
+ */
+int metaslab_df_free_pct = 4;
+
+/*
+ * Maximum distance to search forward from the last offset. Without this
+ * limit, fragmented pools can see >100,000 iterations and
+ * metaslab_block_picker() becomes the performance limiting factor on
+ * high-performance storage.
+ *
+ * With the default setting of 16MB, we typically see less than 500
+ * iterations, even with very fragmented, ashift=9 pools. The maximum number
+ * of iterations possible is:
+ * metaslab_df_max_search / (2 * (1<<ashift))
+ * With the default setting of 16MB this is 16*1024 (with ashift=9) or
+ * 2048 (with ashift=12).
+ */
+int metaslab_df_max_search = 16 * 1024 * 1024;
+
+/*
+ * Forces the metaslab_block_picker function to search for at least this many
+ * segments forwards until giving up on finding a segment that the allocation
+ * will fit into.
+ */
+uint32_t metaslab_min_search_count = 100;
+
+/*
+ * If we are not searching forward (due to metaslab_df_max_search,
+ * metaslab_df_free_pct, or metaslab_df_alloc_threshold), this tunable
+ * controls what segment is used. If it is set, we will use the largest free
+ * segment. If it is not set, we will use a segment of exactly the requested
+ * size (or larger).
+ */
+int metaslab_df_use_largest_segment = B_FALSE;
+
+/*
+ * Percentage of all cpus that can be used by the metaslab taskq.
+ */
+int metaslab_load_pct = 50;
+
+/*
+ * These tunables control how long a metaslab will remain loaded after the
+ * last allocation from it. A metaslab can't be unloaded until at least
+ * metaslab_unload_delay TXG's and metaslab_unload_delay_ms milliseconds
+ * have elapsed. However, zfs_metaslab_mem_limit may cause it to be
+ * unloaded sooner. These settings are intended to be generous -- to keep
+ * metaslabs loaded for a long time, reducing the rate of metaslab loading.
+ */
+int metaslab_unload_delay = 32;
+int metaslab_unload_delay_ms = 10 * 60 * 1000; /* ten minutes */
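+
+/*
+ * Sketch of the combined check (see metaslab_class_evict_old() below,
+ * T and t are placeholders): a metaslab selected in txg T at time t only
+ * becomes evictable once
+ *
+ *	current_txg > T + metaslab_unload_delay &&
+ *	gethrtime() > t + MSEC2NSEC(metaslab_unload_delay_ms)
+ *
+ * unless memory pressure (zfs_metaslab_mem_limit) forces it out earlier.
+ */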
+
+/*
+ * Max number of metaslabs per group to preload.
+ */
+int metaslab_preload_limit = 10;
+
+/*
+ * Enable/disable preloading of metaslabs.
+ */
+int metaslab_preload_enabled = B_TRUE;
+
+/*
+ * Enable/disable fragmentation weighting on metaslabs.
+ */
+int metaslab_fragmentation_factor_enabled = B_TRUE;
+
+/*
+ * Enable/disable lba weighting (i.e. outer tracks are given preference).
+ */
+int metaslab_lba_weighting_enabled = B_TRUE;
+
+/*
+ * Enable/disable metaslab group biasing.
+ */
+int metaslab_bias_enabled = B_TRUE;
+
+/*
+ * Enable/disable remapping of indirect DVAs to their concrete vdevs.
+ */
+boolean_t zfs_remap_blkptr_enable = B_TRUE;
+
+/*
+ * Enable/disable segment-based metaslab selection.
+ */
+int zfs_metaslab_segment_weight_enabled = B_TRUE;
+
+/*
+ * When using segment-based metaslab selection, we will continue
+ * allocating from the active metaslab until we have exhausted
+ * zfs_metaslab_switch_threshold of its buckets.
+ */
+int zfs_metaslab_switch_threshold = 2;
+
+/*
+ * Internal switch to enable/disable the metaslab allocation tracing
+ * facility.
+ */
+boolean_t metaslab_trace_enabled = B_FALSE;
+
+/*
+ * Maximum entries that the metaslab allocation tracing facility will keep
+ * in a given list when running in non-debug mode. We limit the number
+ * of entries in non-debug mode to prevent us from using up too much memory.
+ * The limit should be sufficiently large that we don't expect any allocation
+ * to ever exceed this value. In debug mode, the system will panic if this
+ * limit is ever reached, allowing for further investigation.
+ */
+uint64_t metaslab_trace_max_entries = 5000;
+
+/*
+ * Maximum number of metaslabs per group that can be disabled
+ * simultaneously.
+ */
+int max_disabled_ms = 3;
+
+/*
+ * Time (in seconds) to respect ms_max_size when the metaslab is not loaded.
+ * To avoid 64-bit overflow, don't set above UINT32_MAX.
+ */
+unsigned long zfs_metaslab_max_size_cache_sec = 3600; /* 1 hour */
+
+/*
+ * Maximum percentage of memory to use on storing loaded metaslabs. If loading
+ * a metaslab would take it over this percentage, the oldest selected metaslab
+ * is automatically unloaded.
+ */
+int zfs_metaslab_mem_limit = 75;
+
+/*
+ * Force the per-metaslab range trees to use 64-bit integers to store
+ * segments. Used for debugging purposes.
+ */
+boolean_t zfs_metaslab_force_large_segs = B_FALSE;
+
+/*
+ * By default we only store segments over a certain size in the size-sorted
+ * metaslab trees (ms_allocatable_by_size and
+ * ms_unflushed_frees_by_size). This dramatically reduces memory usage and
+ * improves load and unload times at the cost of causing us to use slightly
+ * larger segments than we would otherwise in some cases.
+ */
+uint32_t metaslab_by_size_min_shift = 14;
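+
+/*
+ * With the default of 14, segments smaller than 16K are left out of the
+ * size-sorted trees; metaslab_size_tree_full_load() temporarily drops the
+ * floor to 0 and reloads every segment when the full tree is needed.
+ */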
+
+/*
+ * If not set, we will first try normal allocation. If that fails then
+ * we will do a gang allocation. If that fails then we will do a "try hard"
+ * gang allocation. If that fails then we will have a multi-layer gang
+ * block.
+ *
+ * If set, we will first try normal allocation. If that fails then
+ * we will do a "try hard" allocation. If that fails we will do a gang
+ * allocation. If that fails we will do a "try hard" gang allocation. If
+ * that fails then we will have a multi-layer gang block.
+ */
+int zfs_metaslab_try_hard_before_gang = B_FALSE;
+
+/*
+ * When not trying hard, we only consider the best zfs_metaslab_find_max_tries
+ * metaslabs. This improves performance, especially when there are many
+ * metaslabs per vdev and the allocation can't actually be satisfied (so we
+ * would otherwise iterate all the metaslabs). If there is a metaslab with a
+ * worse weight but it can actually satisfy the allocation, we won't find it
+ * until trying hard. This may happen if the worse metaslab is not loaded
+ * (and the true weight is better than we have calculated), or due to weight
+ * bucketization. E.g. we are looking for a 60K segment, and the best
+ * metaslabs all have free segments in the 32-63K bucket, but the best
+ * zfs_metaslab_find_max_tries metaslabs have ms_max_size <60KB, and a
+ * subsequent metaslab has ms_max_size >60KB (but fewer segments in this
+ * bucket, and therefore a lower weight).
+ */
+int zfs_metaslab_find_max_tries = 100;
+
+static uint64_t metaslab_weight(metaslab_t *, boolean_t);
+static void metaslab_set_fragmentation(metaslab_t *, boolean_t);
+static void metaslab_free_impl(vdev_t *, uint64_t, uint64_t, boolean_t);
+static void metaslab_check_free_impl(vdev_t *, uint64_t, uint64_t);
+
+static void metaslab_passivate(metaslab_t *msp, uint64_t weight);
+static uint64_t metaslab_weight_from_range_tree(metaslab_t *msp);
+static void metaslab_flush_update(metaslab_t *, dmu_tx_t *);
+static unsigned int metaslab_idx_func(multilist_t *, void *);
+static void metaslab_evict(metaslab_t *, uint64_t);
+static void metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg);
+kmem_cache_t *metaslab_alloc_trace_cache;
+
+typedef struct metaslab_stats {
+ kstat_named_t metaslabstat_trace_over_limit;
+ kstat_named_t metaslabstat_reload_tree;
+ kstat_named_t metaslabstat_too_many_tries;
+ kstat_named_t metaslabstat_try_hard;
+} metaslab_stats_t;
+
+static metaslab_stats_t metaslab_stats = {
+ { "trace_over_limit", KSTAT_DATA_UINT64 },
+ { "reload_tree", KSTAT_DATA_UINT64 },
+ { "too_many_tries", KSTAT_DATA_UINT64 },
+ { "try_hard", KSTAT_DATA_UINT64 },
+};
+
+#define METASLABSTAT_BUMP(stat) \
+ atomic_inc_64(&metaslab_stats.stat.value.ui64);
+
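+/*
+ * e.g. METASLABSTAT_BUMP(metaslabstat_reload_tree) atomically increments
+ * the named kstat counter published as zfs/metaslab_stats.
+ */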
+
+kstat_t *metaslab_ksp;
+
+void
+metaslab_stat_init(void)
+{
+ ASSERT(metaslab_alloc_trace_cache == NULL);
+ metaslab_alloc_trace_cache = kmem_cache_create(
+ "metaslab_alloc_trace_cache", sizeof (metaslab_alloc_trace_t),
+ 0, NULL, NULL, NULL, NULL, NULL, 0);
+ metaslab_ksp = kstat_create("zfs", 0, "metaslab_stats",
+ "misc", KSTAT_TYPE_NAMED, sizeof (metaslab_stats) /
+ sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
+ if (metaslab_ksp != NULL) {
+ metaslab_ksp->ks_data = &metaslab_stats;
+ kstat_install(metaslab_ksp);
+ }
+}
+
+void
+metaslab_stat_fini(void)
+{
+ if (metaslab_ksp != NULL) {
+ kstat_delete(metaslab_ksp);
+ metaslab_ksp = NULL;
+ }
+
+ kmem_cache_destroy(metaslab_alloc_trace_cache);
+ metaslab_alloc_trace_cache = NULL;
+}
+
+/*
+ * ==========================================================================
+ * Metaslab classes
+ * ==========================================================================
+ */
+metaslab_class_t *
+metaslab_class_create(spa_t *spa, metaslab_ops_t *ops)
+{
+ metaslab_class_t *mc;
+
+ mc = kmem_zalloc(offsetof(metaslab_class_t,
+ mc_allocator[spa->spa_alloc_count]), KM_SLEEP);
+
+ mc->mc_spa = spa;
+ mc->mc_ops = ops;
+ mutex_init(&mc->mc_lock, NULL, MUTEX_DEFAULT, NULL);
+ mc->mc_metaslab_txg_list = multilist_create(sizeof (metaslab_t),
+ offsetof(metaslab_t, ms_class_txg_node), metaslab_idx_func);
+ for (int i = 0; i < spa->spa_alloc_count; i++) {
+ metaslab_class_allocator_t *mca = &mc->mc_allocator[i];
+ mca->mca_rotor = NULL;
+ zfs_refcount_create_tracked(&mca->mca_alloc_slots);
+ }
+
+ return (mc);
+}
+
+void
+metaslab_class_destroy(metaslab_class_t *mc)
+{
+ spa_t *spa = mc->mc_spa;
+
+ ASSERT(mc->mc_alloc == 0);
+ ASSERT(mc->mc_deferred == 0);
+ ASSERT(mc->mc_space == 0);
+ ASSERT(mc->mc_dspace == 0);
+
+ for (int i = 0; i < spa->spa_alloc_count; i++) {
+ metaslab_class_allocator_t *mca = &mc->mc_allocator[i];
+ ASSERT(mca->mca_rotor == NULL);
+ zfs_refcount_destroy(&mca->mca_alloc_slots);
+ }
+ mutex_destroy(&mc->mc_lock);
+ multilist_destroy(mc->mc_metaslab_txg_list);
+ kmem_free(mc, offsetof(metaslab_class_t,
+ mc_allocator[spa->spa_alloc_count]));
+}
+
+int
+metaslab_class_validate(metaslab_class_t *mc)
+{
+ metaslab_group_t *mg;
+ vdev_t *vd;
+
+ /*
+ * Must hold one of the spa_config locks.
+ */
+ ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) ||
+ spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER));
+
+ if ((mg = mc->mc_allocator[0].mca_rotor) == NULL)
+ return (0);
+
+ do {
+ vd = mg->mg_vd;
+ ASSERT(vd->vdev_mg != NULL);
+ ASSERT3P(vd->vdev_top, ==, vd);
+ ASSERT3P(mg->mg_class, ==, mc);
+ ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops);
+ } while ((mg = mg->mg_next) != mc->mc_allocator[0].mca_rotor);
+
+ return (0);
+}
+
+static void
+metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta,
+ int64_t defer_delta, int64_t space_delta, int64_t dspace_delta)
+{
+ atomic_add_64(&mc->mc_alloc, alloc_delta);
+ atomic_add_64(&mc->mc_deferred, defer_delta);
+ atomic_add_64(&mc->mc_space, space_delta);
+ atomic_add_64(&mc->mc_dspace, dspace_delta);
+}
+
+uint64_t
+metaslab_class_get_alloc(metaslab_class_t *mc)
+{
+ return (mc->mc_alloc);
+}
+
+uint64_t
+metaslab_class_get_deferred(metaslab_class_t *mc)
+{
+ return (mc->mc_deferred);
+}
+
+uint64_t
+metaslab_class_get_space(metaslab_class_t *mc)
+{
+ return (mc->mc_space);
+}
+
+uint64_t
+metaslab_class_get_dspace(metaslab_class_t *mc)
+{
+ return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space);
+}
+
+void
+metaslab_class_histogram_verify(metaslab_class_t *mc)
+{
+ spa_t *spa = mc->mc_spa;
+ vdev_t *rvd = spa->spa_root_vdev;
+ uint64_t *mc_hist;
+ int i;
+
+ if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0)
+ return;
+
+ mc_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE,
+ KM_SLEEP);
+
+ mutex_enter(&mc->mc_lock);
+ for (int c = 0; c < rvd->vdev_children; c++) {
+ vdev_t *tvd = rvd->vdev_child[c];
+ metaslab_group_t *mg = vdev_get_mg(tvd, mc);
+
+ /*
+ * Skip any holes, uninitialized top-levels, or
+		 * vdevs that are not in this metaslab class.
+ */
+ if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 ||
+ mg->mg_class != mc) {
+ continue;
+ }
+
+ IMPLY(mg == mg->mg_vd->vdev_log_mg,
+ mc == spa_embedded_log_class(mg->mg_vd->vdev_spa));
+
+ for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
+ mc_hist[i] += mg->mg_histogram[i];
+ }
+
+ for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) {
+ VERIFY3U(mc_hist[i], ==, mc->mc_histogram[i]);
+ }
+
+ mutex_exit(&mc->mc_lock);
+ kmem_free(mc_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
+}
+
+/*
+ * Calculate the metaslab class's fragmentation metric. The metric
+ * is weighted based on the space contribution of each metaslab group.
+ * The return value will be a number between 0 and 100 (inclusive), or
+ * ZFS_FRAG_INVALID if the metric has not been set. See comment above the
+ * zfs_frag_table for more information about the metric.
+ */
+uint64_t
+metaslab_class_fragmentation(metaslab_class_t *mc)
+{
+ vdev_t *rvd = mc->mc_spa->spa_root_vdev;
+ uint64_t fragmentation = 0;
+
+ spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER);
+
+ for (int c = 0; c < rvd->vdev_children; c++) {
+ vdev_t *tvd = rvd->vdev_child[c];
+ metaslab_group_t *mg = tvd->vdev_mg;
+
+ /*
+ * Skip any holes, uninitialized top-levels,
+		 * or vdevs that are not in this metaslab class.
+ */
+ if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 ||
+ mg->mg_class != mc) {
+ continue;
+ }
+
+ /*
+ * If a metaslab group does not contain a fragmentation
+ * metric then just bail out.
+ */
+ if (mg->mg_fragmentation == ZFS_FRAG_INVALID) {
+ spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
+ return (ZFS_FRAG_INVALID);
+ }
+
+ /*
+ * Determine how much this metaslab_group is contributing
+ * to the overall pool fragmentation metric.
+ */
+ fragmentation += mg->mg_fragmentation *
+ metaslab_group_get_space(mg);
+ }
+ fragmentation /= metaslab_class_get_space(mc);
+
+ ASSERT3U(fragmentation, <=, 100);
+ spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
+ return (fragmentation);
+}
+
+/*
+ * Calculate the amount of expandable space that is available in
+ * this metaslab class. If a device is expanded then its expandable
+ * space will be the amount of allocatable space that is currently not
+ * part of this metaslab class.
+ */
+uint64_t
+metaslab_class_expandable_space(metaslab_class_t *mc)
+{
+ vdev_t *rvd = mc->mc_spa->spa_root_vdev;
+ uint64_t space = 0;
+
+ spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER);
+ for (int c = 0; c < rvd->vdev_children; c++) {
+ vdev_t *tvd = rvd->vdev_child[c];
+ metaslab_group_t *mg = tvd->vdev_mg;
+
+ if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 ||
+ mg->mg_class != mc) {
+ continue;
+ }
+
+ /*
+ * Calculate if we have enough space to add additional
+ * metaslabs. We report the expandable space in terms
+ * of the metaslab size since that's the unit of expansion.
+ */
+ space += P2ALIGN(tvd->vdev_max_asize - tvd->vdev_asize,
+ 1ULL << tvd->vdev_ms_shift);
+ }
+ spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
+ return (space);
+}
+
+void
+metaslab_class_evict_old(metaslab_class_t *mc, uint64_t txg)
+{
+ multilist_t *ml = mc->mc_metaslab_txg_list;
+ for (int i = 0; i < multilist_get_num_sublists(ml); i++) {
+ multilist_sublist_t *mls = multilist_sublist_lock(ml, i);
+ metaslab_t *msp = multilist_sublist_head(mls);
+ multilist_sublist_unlock(mls);
+ while (msp != NULL) {
+ mutex_enter(&msp->ms_lock);
+
+ /*
+ * If the metaslab has been removed from the list
+ * (which could happen if we were at the memory limit
+ * and it was evicted during this loop), then we can't
+ * proceed and we should restart the sublist.
+ */
+ if (!multilist_link_active(&msp->ms_class_txg_node)) {
+ mutex_exit(&msp->ms_lock);
+ i--;
+ break;
+ }
+ mls = multilist_sublist_lock(ml, i);
+ metaslab_t *next_msp = multilist_sublist_next(mls, msp);
+ multilist_sublist_unlock(mls);
+ if (txg >
+ msp->ms_selected_txg + metaslab_unload_delay &&
+ gethrtime() > msp->ms_selected_time +
+ (uint64_t)MSEC2NSEC(metaslab_unload_delay_ms)) {
+ metaslab_evict(msp, txg);
+ } else {
+ /*
+ * Once we've hit a metaslab selected too
+ * recently to evict, we're done evicting for
+ * now.
+ */
+ mutex_exit(&msp->ms_lock);
+ break;
+ }
+ mutex_exit(&msp->ms_lock);
+ msp = next_msp;
+ }
+ }
+}
+
+static int
+metaslab_compare(const void *x1, const void *x2)
+{
+ const metaslab_t *m1 = (const metaslab_t *)x1;
+ const metaslab_t *m2 = (const metaslab_t *)x2;
+
+ int sort1 = 0;
+ int sort2 = 0;
+ if (m1->ms_allocator != -1 && m1->ms_primary)
+ sort1 = 1;
+ else if (m1->ms_allocator != -1 && !m1->ms_primary)
+ sort1 = 2;
+ if (m2->ms_allocator != -1 && m2->ms_primary)
+ sort2 = 1;
+ else if (m2->ms_allocator != -1 && !m2->ms_primary)
+ sort2 = 2;
+
+ /*
+ * Sort inactive metaslabs first, then primaries, then secondaries. When
+ * selecting a metaslab to allocate from, an allocator first tries its
+ * primary, then secondary active metaslab. If it doesn't have active
+ * metaslabs, or can't allocate from them, it searches for an inactive
+ * metaslab to activate. If it can't find a suitable one, it will steal
+ * a primary or secondary metaslab from another allocator.
+ */
+ if (sort1 < sort2)
+ return (-1);
+ if (sort1 > sort2)
+ return (1);
+
+ int cmp = TREE_CMP(m2->ms_weight, m1->ms_weight);
+ if (likely(cmp))
+ return (cmp);
+
+ IMPLY(TREE_CMP(m1->ms_start, m2->ms_start) == 0, m1 == m2);
+
+ return (TREE_CMP(m1->ms_start, m2->ms_start));
+}
+
+/*
+ * ==========================================================================
+ * Metaslab groups
+ * ==========================================================================
+ */
+/*
+ * Update the allocatable flag and the metaslab group's capacity.
+ * The allocatable flag is set to true if the capacity is below
+ * the zfs_mg_noalloc_threshold or has a fragmentation value that is
+ * greater than zfs_mg_fragmentation_threshold. If a metaslab group
+ * transitions from allocatable to non-allocatable or vice versa then the
+ * metaslab group's class is updated to reflect the transition.
+ */
+static void
+metaslab_group_alloc_update(metaslab_group_t *mg)
+{
+ vdev_t *vd = mg->mg_vd;
+ metaslab_class_t *mc = mg->mg_class;
+ vdev_stat_t *vs = &vd->vdev_stat;
+ boolean_t was_allocatable;
+ boolean_t was_initialized;
+
+ ASSERT(vd == vd->vdev_top);
+ ASSERT3U(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_READER), ==,
+ SCL_ALLOC);
+
+ mutex_enter(&mg->mg_lock);
+ was_allocatable = mg->mg_allocatable;
+ was_initialized = mg->mg_initialized;
+
+ mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) /
+ (vs->vs_space + 1);
+
+ mutex_enter(&mc->mc_lock);
+
+ /*
+ * If the metaslab group was just added then it won't
+ * have any space until we finish syncing out this txg.
+ * At that point we will consider it initialized and available
+ * for allocations. We also don't consider non-activated
+ * metaslab groups (e.g. vdevs that are in the middle of being removed)
+ * to be initialized, because they can't be used for allocation.
+ */
+ mg->mg_initialized = metaslab_group_initialized(mg);
+ if (!was_initialized && mg->mg_initialized) {
+ mc->mc_groups++;
+ } else if (was_initialized && !mg->mg_initialized) {
+ ASSERT3U(mc->mc_groups, >, 0);
+ mc->mc_groups--;
+ }
+ if (mg->mg_initialized)
+ mg->mg_no_free_space = B_FALSE;
+
+ /*
+ * A metaslab group is considered allocatable if it has plenty
+ * of free space or is not heavily fragmented. We only take
+ * fragmentation into account if the metaslab group has a valid
+ * fragmentation metric (i.e. a value between 0 and 100).
+ */
+ mg->mg_allocatable = (mg->mg_activation_count > 0 &&
+ mg->mg_free_capacity > zfs_mg_noalloc_threshold &&
+ (mg->mg_fragmentation == ZFS_FRAG_INVALID ||
+ mg->mg_fragmentation <= zfs_mg_fragmentation_threshold));
+
+ /*
+ * The mc_alloc_groups maintains a count of the number of
+ * groups in this metaslab class that are still above the
+ * zfs_mg_noalloc_threshold. This is used by the allocating
+ * threads to determine if they should avoid allocations to
+ * a given group. The allocator will avoid allocations to a group
+ * if that group has reached or is below the zfs_mg_noalloc_threshold
+ * and there are still other groups that are above the threshold.
+ * When a group transitions from allocatable to non-allocatable or
+ * vice versa we update the metaslab class to reflect that change.
+ * When the mc_alloc_groups value drops to 0 that means that all
+ * groups have reached the zfs_mg_noalloc_threshold making all groups
+ * eligible for allocations. This effectively means that all devices
+ * are balanced again.
+ */
+ if (was_allocatable && !mg->mg_allocatable)
+ mc->mc_alloc_groups--;
+ else if (!was_allocatable && mg->mg_allocatable)
+ mc->mc_alloc_groups++;
+ mutex_exit(&mc->mc_lock);
+
+ mutex_exit(&mg->mg_lock);
+}
+
+int
+metaslab_sort_by_flushed(const void *va, const void *vb)
+{
+ const metaslab_t *a = va;
+ const metaslab_t *b = vb;
+
+ int cmp = TREE_CMP(a->ms_unflushed_txg, b->ms_unflushed_txg);
+ if (likely(cmp))
+ return (cmp);
+
+ uint64_t a_vdev_id = a->ms_group->mg_vd->vdev_id;
+ uint64_t b_vdev_id = b->ms_group->mg_vd->vdev_id;
+ cmp = TREE_CMP(a_vdev_id, b_vdev_id);
+ if (cmp)
+ return (cmp);
+
+ return (TREE_CMP(a->ms_id, b->ms_id));
+}
+
+metaslab_group_t *
+metaslab_group_create(metaslab_class_t *mc, vdev_t *vd, int allocators)
+{
+ metaslab_group_t *mg;
+
+ mg = kmem_zalloc(offsetof(metaslab_group_t,
+ mg_allocator[allocators]), KM_SLEEP);
+ mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&mg->mg_ms_disabled_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&mg->mg_ms_disabled_cv, NULL, CV_DEFAULT, NULL);
+ avl_create(&mg->mg_metaslab_tree, metaslab_compare,
+ sizeof (metaslab_t), offsetof(metaslab_t, ms_group_node));
+ mg->mg_vd = vd;
+ mg->mg_class = mc;
+ mg->mg_activation_count = 0;
+ mg->mg_initialized = B_FALSE;
+ mg->mg_no_free_space = B_TRUE;
+ mg->mg_allocators = allocators;
+
+ for (int i = 0; i < allocators; i++) {
+ metaslab_group_allocator_t *mga = &mg->mg_allocator[i];
+ zfs_refcount_create_tracked(&mga->mga_alloc_queue_depth);
+ }
+
+ mg->mg_taskq = taskq_create("metaslab_group_taskq", metaslab_load_pct,
+ maxclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT | TASKQ_DYNAMIC);
+
+ return (mg);
+}
+
+void
+metaslab_group_destroy(metaslab_group_t *mg)
+{
+ ASSERT(mg->mg_prev == NULL);
+ ASSERT(mg->mg_next == NULL);
+ /*
+ * We may have gone below zero with the activation count
+ * either because we never activated in the first place or
+ * because we're done, and possibly removing the vdev.
+ */
+ ASSERT(mg->mg_activation_count <= 0);
+
+ taskq_destroy(mg->mg_taskq);
+ avl_destroy(&mg->mg_metaslab_tree);
+ mutex_destroy(&mg->mg_lock);
+ mutex_destroy(&mg->mg_ms_disabled_lock);
+ cv_destroy(&mg->mg_ms_disabled_cv);
+
+ for (int i = 0; i < mg->mg_allocators; i++) {
+ metaslab_group_allocator_t *mga = &mg->mg_allocator[i];
+ zfs_refcount_destroy(&mga->mga_alloc_queue_depth);
+ }
+ kmem_free(mg, offsetof(metaslab_group_t,
+ mg_allocator[mg->mg_allocators]));
+}
+
+void
+metaslab_group_activate(metaslab_group_t *mg)
+{
+ metaslab_class_t *mc = mg->mg_class;
+ spa_t *spa = mc->mc_spa;
+ metaslab_group_t *mgprev, *mgnext;
+
+ ASSERT3U(spa_config_held(spa, SCL_ALLOC, RW_WRITER), !=, 0);
+
+ ASSERT(mg->mg_prev == NULL);
+ ASSERT(mg->mg_next == NULL);
+ ASSERT(mg->mg_activation_count <= 0);
+
+ if (++mg->mg_activation_count <= 0)
+ return;
+
+ mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children);
+ metaslab_group_alloc_update(mg);
+
+ if ((mgprev = mc->mc_allocator[0].mca_rotor) == NULL) {
+ mg->mg_prev = mg;
+ mg->mg_next = mg;
+ } else {
+ mgnext = mgprev->mg_next;
+ mg->mg_prev = mgprev;
+ mg->mg_next = mgnext;
+ mgprev->mg_next = mg;
+ mgnext->mg_prev = mg;
+ }
+ for (int i = 0; i < spa->spa_alloc_count; i++) {
+ mc->mc_allocator[i].mca_rotor = mg;
+ mg = mg->mg_next;
+ }
+}
+
+/*
+ * Passivate a metaslab group and remove it from the allocation rotor.
+ * Callers must hold both the SCL_ALLOC and SCL_ZIO lock prior to passivating
+ * a metaslab group. This function will momentarily drop spa_config_locks
+ * that are lower than the SCL_ALLOC lock (see comment below).
+ */
+void
+metaslab_group_passivate(metaslab_group_t *mg)
+{
+ metaslab_class_t *mc = mg->mg_class;
+ spa_t *spa = mc->mc_spa;
+ metaslab_group_t *mgprev, *mgnext;
+ int locks = spa_config_held(spa, SCL_ALL, RW_WRITER);
+
+ ASSERT3U(spa_config_held(spa, SCL_ALLOC | SCL_ZIO, RW_WRITER), ==,
+ (SCL_ALLOC | SCL_ZIO));
+
+ if (--mg->mg_activation_count != 0) {
+ for (int i = 0; i < spa->spa_alloc_count; i++)
+ ASSERT(mc->mc_allocator[i].mca_rotor != mg);
+ ASSERT(mg->mg_prev == NULL);
+ ASSERT(mg->mg_next == NULL);
+ ASSERT(mg->mg_activation_count < 0);
+ return;
+ }
+
+ /*
+ * The spa_config_lock is an array of rwlocks, ordered as
+ * follows (from highest to lowest):
+ * SCL_CONFIG > SCL_STATE > SCL_L2ARC > SCL_ALLOC >
+ * SCL_ZIO > SCL_FREE > SCL_VDEV
+ * (For more information about the spa_config_lock see spa_misc.c)
+ * The higher the lock, the broader its coverage. When we passivate
+ * a metaslab group, we must hold both the SCL_ALLOC and the SCL_ZIO
+ * config locks. However, the metaslab group's taskq might be trying
+ * to preload metaslabs so we must drop the SCL_ZIO lock and any
+ * lower locks to allow the I/O to complete. At a minimum,
+ * we continue to hold the SCL_ALLOC lock, which prevents any future
+ * allocations from taking place and any changes to the vdev tree.
+ */
+ spa_config_exit(spa, locks & ~(SCL_ZIO - 1), spa);
+ taskq_wait_outstanding(mg->mg_taskq, 0);
+ spa_config_enter(spa, locks & ~(SCL_ZIO - 1), spa, RW_WRITER);
+ metaslab_group_alloc_update(mg);
+ for (int i = 0; i < mg->mg_allocators; i++) {
+ metaslab_group_allocator_t *mga = &mg->mg_allocator[i];
+ metaslab_t *msp = mga->mga_primary;
+ if (msp != NULL) {
+ mutex_enter(&msp->ms_lock);
+ metaslab_passivate(msp,
+ metaslab_weight_from_range_tree(msp));
+ mutex_exit(&msp->ms_lock);
+ }
+ msp = mga->mga_secondary;
+ if (msp != NULL) {
+ mutex_enter(&msp->ms_lock);
+ metaslab_passivate(msp,
+ metaslab_weight_from_range_tree(msp));
+ mutex_exit(&msp->ms_lock);
+ }
+ }
+
+ mgprev = mg->mg_prev;
+ mgnext = mg->mg_next;
+
+ if (mg == mgnext) {
+ mgnext = NULL;
+ } else {
+ mgprev->mg_next = mgnext;
+ mgnext->mg_prev = mgprev;
+ }
+ for (int i = 0; i < spa->spa_alloc_count; i++) {
+ if (mc->mc_allocator[i].mca_rotor == mg)
+ mc->mc_allocator[i].mca_rotor = mgnext;
+ }
+
+ mg->mg_prev = NULL;
+ mg->mg_next = NULL;
+}
+
+boolean_t
+metaslab_group_initialized(metaslab_group_t *mg)
+{
+ vdev_t *vd = mg->mg_vd;
+ vdev_stat_t *vs = &vd->vdev_stat;
+
+ return (vs->vs_space != 0 && mg->mg_activation_count > 0);
+}
+
+uint64_t
+metaslab_group_get_space(metaslab_group_t *mg)
+{
+ /*
+ * Note that the number of nodes in mg_metaslab_tree may be one less
+ * than vdev_ms_count, due to the embedded log metaslab.
+ */
+ mutex_enter(&mg->mg_lock);
+ uint64_t ms_count = avl_numnodes(&mg->mg_metaslab_tree);
+ mutex_exit(&mg->mg_lock);
+ return ((1ULL << mg->mg_vd->vdev_ms_shift) * ms_count);
+}
+
+void
+metaslab_group_histogram_verify(metaslab_group_t *mg)
+{
+ uint64_t *mg_hist;
+ avl_tree_t *t = &mg->mg_metaslab_tree;
+ uint64_t ashift = mg->mg_vd->vdev_ashift;
+
+ if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0)
+ return;
+
+ mg_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE,
+ KM_SLEEP);
+
+ ASSERT3U(RANGE_TREE_HISTOGRAM_SIZE, >=,
+ SPACE_MAP_HISTOGRAM_SIZE + ashift);
+
+ mutex_enter(&mg->mg_lock);
+ for (metaslab_t *msp = avl_first(t);
+ msp != NULL; msp = AVL_NEXT(t, msp)) {
+ VERIFY3P(msp->ms_group, ==, mg);
+ /* skip if not active */
+ if (msp->ms_sm == NULL)
+ continue;
+
+ for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
+ mg_hist[i + ashift] +=
+ msp->ms_sm->sm_phys->smp_histogram[i];
+ }
+ }
+
+	for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
+ VERIFY3U(mg_hist[i], ==, mg->mg_histogram[i]);
+
+ mutex_exit(&mg->mg_lock);
+
+ kmem_free(mg_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
+}
+
+static void
+metaslab_group_histogram_add(metaslab_group_t *mg, metaslab_t *msp)
+{
+ metaslab_class_t *mc = mg->mg_class;
+ uint64_t ashift = mg->mg_vd->vdev_ashift;
+
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+ if (msp->ms_sm == NULL)
+ return;
+
+ mutex_enter(&mg->mg_lock);
+ mutex_enter(&mc->mc_lock);
+ for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
+ IMPLY(mg == mg->mg_vd->vdev_log_mg,
+ mc == spa_embedded_log_class(mg->mg_vd->vdev_spa));
+ mg->mg_histogram[i + ashift] +=
+ msp->ms_sm->sm_phys->smp_histogram[i];
+ mc->mc_histogram[i + ashift] +=
+ msp->ms_sm->sm_phys->smp_histogram[i];
+ }
+ mutex_exit(&mc->mc_lock);
+ mutex_exit(&mg->mg_lock);
+}
+
+void
+metaslab_group_histogram_remove(metaslab_group_t *mg, metaslab_t *msp)
+{
+ metaslab_class_t *mc = mg->mg_class;
+ uint64_t ashift = mg->mg_vd->vdev_ashift;
+
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+ if (msp->ms_sm == NULL)
+ return;
+
+ mutex_enter(&mg->mg_lock);
+ mutex_enter(&mc->mc_lock);
+ for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
+ ASSERT3U(mg->mg_histogram[i + ashift], >=,
+ msp->ms_sm->sm_phys->smp_histogram[i]);
+ ASSERT3U(mc->mc_histogram[i + ashift], >=,
+ msp->ms_sm->sm_phys->smp_histogram[i]);
+ IMPLY(mg == mg->mg_vd->vdev_log_mg,
+ mc == spa_embedded_log_class(mg->mg_vd->vdev_spa));
+
+ mg->mg_histogram[i + ashift] -=
+ msp->ms_sm->sm_phys->smp_histogram[i];
+ mc->mc_histogram[i + ashift] -=
+ msp->ms_sm->sm_phys->smp_histogram[i];
+ }
+ mutex_exit(&mc->mc_lock);
+ mutex_exit(&mg->mg_lock);
+}
+
+static void
+metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp)
+{
+ ASSERT(msp->ms_group == NULL);
+ mutex_enter(&mg->mg_lock);
+ msp->ms_group = mg;
+ msp->ms_weight = 0;
+ avl_add(&mg->mg_metaslab_tree, msp);
+ mutex_exit(&mg->mg_lock);
+
+ mutex_enter(&msp->ms_lock);
+ metaslab_group_histogram_add(mg, msp);
+ mutex_exit(&msp->ms_lock);
+}
+
+static void
+metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp)
+{
+ mutex_enter(&msp->ms_lock);
+ metaslab_group_histogram_remove(mg, msp);
+ mutex_exit(&msp->ms_lock);
+
+ mutex_enter(&mg->mg_lock);
+ ASSERT(msp->ms_group == mg);
+ avl_remove(&mg->mg_metaslab_tree, msp);
+
+ metaslab_class_t *mc = msp->ms_group->mg_class;
+ multilist_sublist_t *mls =
+ multilist_sublist_lock_obj(mc->mc_metaslab_txg_list, msp);
+ if (multilist_link_active(&msp->ms_class_txg_node))
+ multilist_sublist_remove(mls, msp);
+ multilist_sublist_unlock(mls);
+
+ msp->ms_group = NULL;
+ mutex_exit(&mg->mg_lock);
+}
+
+static void
+metaslab_group_sort_impl(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
+{
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+ ASSERT(MUTEX_HELD(&mg->mg_lock));
+ ASSERT(msp->ms_group == mg);
+
+ avl_remove(&mg->mg_metaslab_tree, msp);
+ msp->ms_weight = weight;
+ avl_add(&mg->mg_metaslab_tree, msp);
+}
+
+static void
+metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
+{
+ /*
+ * Although in principle the weight can be any value, in
+ * practice we do not use values in the range [1, 511].
+ */
+ ASSERT(weight >= SPA_MINBLOCKSIZE || weight == 0);
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+
+ mutex_enter(&mg->mg_lock);
+ metaslab_group_sort_impl(mg, msp, weight);
+ mutex_exit(&mg->mg_lock);
+}
+
+/*
+ * Calculate the fragmentation for a given metaslab group. We can use
+ * a simple average here since all metaslabs within the group must have
+ * the same size. The return value will be a value between 0 and 100
+ * (inclusive), or ZFS_FRAG_INVALID if less than half of the metaslabs in this
+ * group have a fragmentation metric.
+ */
+uint64_t
+metaslab_group_fragmentation(metaslab_group_t *mg)
+{
+ vdev_t *vd = mg->mg_vd;
+ uint64_t fragmentation = 0;
+ uint64_t valid_ms = 0;
+
+ for (int m = 0; m < vd->vdev_ms_count; m++) {
+ metaslab_t *msp = vd->vdev_ms[m];
+
+ if (msp->ms_fragmentation == ZFS_FRAG_INVALID)
+ continue;
+ if (msp->ms_group != mg)
+ continue;
+
+ valid_ms++;
+ fragmentation += msp->ms_fragmentation;
+ }
+
+ if (valid_ms <= mg->mg_vd->vdev_ms_count / 2)
+ return (ZFS_FRAG_INVALID);
+
+ fragmentation /= valid_ms;
+ ASSERT3U(fragmentation, <=, 100);
+ return (fragmentation);
+}
+
+/*
+ * Determine if a given metaslab group should skip allocations. A metaslab
+ * group should avoid allocations if its free capacity is less than the
+ * zfs_mg_noalloc_threshold or its fragmentation metric is greater than
+ * zfs_mg_fragmentation_threshold and there is at least one metaslab group
+ * that can still handle allocations. If the allocation throttle is enabled
+ * then we skip allocations to devices that have reached their maximum
+ * allocation queue depth unless the selected metaslab group is the only
+ * eligible group remaining.
+ */
+static boolean_t
+metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor,
+ uint64_t psize, int allocator, int d)
+{
+ spa_t *spa = mg->mg_vd->vdev_spa;
+ metaslab_class_t *mc = mg->mg_class;
+
+ /*
+ * We can only consider skipping this metaslab group if it's
+ * in the normal metaslab class and there are other metaslab
+ * groups to select from. Otherwise, we always consider it eligible
+ * for allocations.
+ */
+ if ((mc != spa_normal_class(spa) &&
+ mc != spa_special_class(spa) &&
+ mc != spa_dedup_class(spa)) ||
+ mc->mc_groups <= 1)
+ return (B_TRUE);
+
+ /*
+ * If the metaslab group's mg_allocatable flag is set (see comments
+ * in metaslab_group_alloc_update() for more information) and
+ * the allocation throttle is disabled then allow allocations to this
+ * device. However, if the allocation throttle is enabled then
+ * check if we have reached our allocation limit (mga_alloc_queue_depth)
+ * to determine if we should allow allocations to this metaslab group.
+ * If all metaslab groups are no longer considered allocatable
+ * (mc_alloc_groups == 0) or we're trying to allocate the smallest
+ * gang block size then we allow allocations on this metaslab group
+ * regardless of the mg_allocatable or throttle settings.
+ */
+ if (mg->mg_allocatable) {
+ metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator];
+ int64_t qdepth;
+ uint64_t qmax = mga->mga_cur_max_alloc_queue_depth;
+
+ if (!mc->mc_alloc_throttle_enabled)
+ return (B_TRUE);
+
+ /*
+ * If this metaslab group does not have any free space, then
+ * there is no point in looking further.
+ */
+ if (mg->mg_no_free_space)
+ return (B_FALSE);
+
+ /*
+		 * Relax allocation throttling for ditto blocks. Due to
+		 * random imbalances in allocation, it tends to push copies
+		 * to the one vdev that looks a bit better at the moment.
+ */
+ qmax = qmax * (4 + d) / 4;
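+		/*
+		 * For example, the first DVA (d == 0) leaves qmax unchanged,
+		 * the second (d == 1) allows 25% more queued allocations,
+		 * and the third (d == 2) allows 50% more.
+		 */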
+
+ qdepth = zfs_refcount_count(&mga->mga_alloc_queue_depth);
+
+ /*
+ * If this metaslab group is below its qmax or it's
+		 * the only allocatable metaslab group, then attempt
+ * to allocate from it.
+ */
+ if (qdepth < qmax || mc->mc_alloc_groups == 1)
+ return (B_TRUE);
+ ASSERT3U(mc->mc_alloc_groups, >, 1);
+
+ /*
+ * Since this metaslab group is at or over its qmax, we
+ * need to determine if there are metaslab groups after this
+ * one that might be able to handle this allocation. This is
+ * racy since we can't hold the locks for all metaslab
+ * groups at the same time when we make this check.
+ */
+ for (metaslab_group_t *mgp = mg->mg_next;
+ mgp != rotor; mgp = mgp->mg_next) {
+ metaslab_group_allocator_t *mgap =
+ &mgp->mg_allocator[allocator];
+ qmax = mgap->mga_cur_max_alloc_queue_depth;
+ qmax = qmax * (4 + d) / 4;
+ qdepth =
+ zfs_refcount_count(&mgap->mga_alloc_queue_depth);
+
+ /*
+ * If there is another metaslab group that
+ * might be able to handle the allocation, then
+ * we return false so that we skip this group.
+ */
+ if (qdepth < qmax && !mgp->mg_no_free_space)
+ return (B_FALSE);
+ }
+
+ /*
+ * We didn't find another group to handle the allocation
+ * so we can't skip this metaslab group even though
+ * we are at or over our qmax.
+ */
+ return (B_TRUE);
+
+ } else if (mc->mc_alloc_groups == 0 || psize == SPA_MINBLOCKSIZE) {
+ return (B_TRUE);
+ }
+ return (B_FALSE);
+}
+
+/*
+ * ==========================================================================
+ * Range tree callbacks
+ * ==========================================================================
+ */
+
+/*
+ * Comparison function for the private size-ordered tree using 32-bit
+ * ranges. Tree is sorted by size, larger sizes at the end of the tree.
+ */
+static int
+metaslab_rangesize32_compare(const void *x1, const void *x2)
+{
+ const range_seg32_t *r1 = x1;
+ const range_seg32_t *r2 = x2;
+
+ uint64_t rs_size1 = r1->rs_end - r1->rs_start;
+ uint64_t rs_size2 = r2->rs_end - r2->rs_start;
+
+ int cmp = TREE_CMP(rs_size1, rs_size2);
+ if (likely(cmp))
+ return (cmp);
+
+ return (TREE_CMP(r1->rs_start, r2->rs_start));
+}
+
+/*
+ * Comparison function for the private size-ordered tree using 64-bit
+ * ranges. Tree is sorted by size, larger sizes at the end of the tree.
+ */
+static int
+metaslab_rangesize64_compare(const void *x1, const void *x2)
+{
+ const range_seg64_t *r1 = x1;
+ const range_seg64_t *r2 = x2;
+
+ uint64_t rs_size1 = r1->rs_end - r1->rs_start;
+ uint64_t rs_size2 = r2->rs_end - r2->rs_start;
+
+ int cmp = TREE_CMP(rs_size1, rs_size2);
+ if (likely(cmp))
+ return (cmp);
+
+ return (TREE_CMP(r1->rs_start, r2->rs_start));
+}
+
+typedef struct metaslab_rt_arg {
+ zfs_btree_t *mra_bt;
+ uint32_t mra_floor_shift;
+} metaslab_rt_arg_t;
+
+struct mssa_arg {
+ range_tree_t *rt;
+ metaslab_rt_arg_t *mra;
+};
+
+static void
+metaslab_size_sorted_add(void *arg, uint64_t start, uint64_t size)
+{
+ struct mssa_arg *mssap = arg;
+ range_tree_t *rt = mssap->rt;
+ metaslab_rt_arg_t *mrap = mssap->mra;
+ range_seg_max_t seg = {0};
+ rs_set_start(&seg, rt, start);
+ rs_set_end(&seg, rt, start + size);
+ metaslab_rt_add(rt, &seg, mrap);
+}
+
+static void
+metaslab_size_tree_full_load(range_tree_t *rt)
+{
+ metaslab_rt_arg_t *mrap = rt->rt_arg;
+ METASLABSTAT_BUMP(metaslabstat_reload_tree);
+ ASSERT0(zfs_btree_numnodes(mrap->mra_bt));
+ mrap->mra_floor_shift = 0;
+ struct mssa_arg arg = {0};
+ arg.rt = rt;
+ arg.mra = mrap;
+ range_tree_walk(rt, metaslab_size_sorted_add, &arg);
+}
+
+/*
+ * Create any block allocator specific components. The current allocators
+ * rely on using both a size-ordered range_tree_t and an array of uint64_t's.
+ */
+/* ARGSUSED */
+static void
+metaslab_rt_create(range_tree_t *rt, void *arg)
+{
+ metaslab_rt_arg_t *mrap = arg;
+ zfs_btree_t *size_tree = mrap->mra_bt;
+
+ size_t size;
+ int (*compare) (const void *, const void *);
+ switch (rt->rt_type) {
+ case RANGE_SEG32:
+ size = sizeof (range_seg32_t);
+ compare = metaslab_rangesize32_compare;
+ break;
+ case RANGE_SEG64:
+ size = sizeof (range_seg64_t);
+ compare = metaslab_rangesize64_compare;
+ break;
+ default:
+ panic("Invalid range seg type %d", rt->rt_type);
+ }
+ zfs_btree_create(size_tree, compare, size);
+ mrap->mra_floor_shift = metaslab_by_size_min_shift;
+}
+
+/* ARGSUSED */
+static void
+metaslab_rt_destroy(range_tree_t *rt, void *arg)
+{
+ metaslab_rt_arg_t *mrap = arg;
+ zfs_btree_t *size_tree = mrap->mra_bt;
+
+ zfs_btree_destroy(size_tree);
+ kmem_free(mrap, sizeof (*mrap));
+}
+
+/* ARGSUSED */
+static void
+metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg)
+{
+ metaslab_rt_arg_t *mrap = arg;
+ zfs_btree_t *size_tree = mrap->mra_bt;
+
+ if (rs_get_end(rs, rt) - rs_get_start(rs, rt) <
+ (1 << mrap->mra_floor_shift))
+ return;
+
+ zfs_btree_add(size_tree, rs);
+}
+
+/* ARGSUSED */
+static void
+metaslab_rt_remove(range_tree_t *rt, range_seg_t *rs, void *arg)
+{
+ metaslab_rt_arg_t *mrap = arg;
+ zfs_btree_t *size_tree = mrap->mra_bt;
+
+ if (rs_get_end(rs, rt) - rs_get_start(rs, rt) < (1 <<
+ mrap->mra_floor_shift))
+ return;
+
+ zfs_btree_remove(size_tree, rs);
+}
+
+/* ARGSUSED */
+static void
+metaslab_rt_vacate(range_tree_t *rt, void *arg)
+{
+ metaslab_rt_arg_t *mrap = arg;
+ zfs_btree_t *size_tree = mrap->mra_bt;
+ zfs_btree_clear(size_tree);
+ zfs_btree_destroy(size_tree);
+
+ metaslab_rt_create(rt, arg);
+}
+
+static range_tree_ops_t metaslab_rt_ops = {
+ .rtop_create = metaslab_rt_create,
+ .rtop_destroy = metaslab_rt_destroy,
+ .rtop_add = metaslab_rt_add,
+ .rtop_remove = metaslab_rt_remove,
+ .rtop_vacate = metaslab_rt_vacate
+};
+
+/*
+ * ==========================================================================
+ * Common allocator routines
+ * ==========================================================================
+ */
+
+/*
+ * Return the maximum contiguous segment within the metaslab.
+ */
+uint64_t
+metaslab_largest_allocatable(metaslab_t *msp)
+{
+ zfs_btree_t *t = &msp->ms_allocatable_by_size;
+ range_seg_t *rs;
+
+ if (t == NULL)
+ return (0);
+ if (zfs_btree_numnodes(t) == 0)
+ metaslab_size_tree_full_load(msp->ms_allocatable);
+
+ rs = zfs_btree_last(t, NULL);
+ if (rs == NULL)
+ return (0);
+
+ return (rs_get_end(rs, msp->ms_allocatable) - rs_get_start(rs,
+ msp->ms_allocatable));
+}
+
+/*
+ * Return the maximum contiguous segment within the unflushed frees of this
+ * metaslab.
+ */
+static uint64_t
+metaslab_largest_unflushed_free(metaslab_t *msp)
+{
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+
+ if (msp->ms_unflushed_frees == NULL)
+ return (0);
+
+ if (zfs_btree_numnodes(&msp->ms_unflushed_frees_by_size) == 0)
+ metaslab_size_tree_full_load(msp->ms_unflushed_frees);
+ range_seg_t *rs = zfs_btree_last(&msp->ms_unflushed_frees_by_size,
+ NULL);
+ if (rs == NULL)
+ return (0);
+
+ /*
+ * When a range is freed from the metaslab, that range is added to
+ * both the unflushed frees and the deferred frees. While the block
+ * will eventually be usable, if the metaslab were loaded the range
+ * would not be added to the ms_allocatable tree until TXG_DEFER_SIZE
+ * txgs had passed. As a result, when attempting to estimate an upper
+ * bound for the largest currently-usable free segment in the
+ * metaslab, we need to not consider any ranges currently in the defer
+ * trees. This algorithm approximates the largest available chunk in
+ * the largest range in the unflushed_frees tree by taking the first
+ * chunk. While this may be a poor estimate, it should only remain so
+ * briefly and should eventually self-correct as frees are no longer
+ * deferred. Similar logic applies to the ms_freed tree. See
+ * metaslab_load() for more details.
+ *
+ * There are two primary sources of inaccuracy in this estimate. Both
+ * are tolerated for performance reasons. The first source is that we
+ * only check the largest segment for overlaps. Smaller segments may
+ * have more favorable overlaps with the other trees, resulting in
+ * larger usable chunks. Second, we only look at the first chunk in
+ * the largest segment; there may be other usable chunks in the
+ * largest segment, but we ignore them.
+ */
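+	/*
+	 * Worked example (illustrative): if the largest unflushed-free
+	 * segment is [100K, 200K) and one of the defer trees holds a range
+	 * starting at 150K, the loop below trims the estimate to the first
+	 * chunk [100K, 150K), i.e. 50K, even if a larger usable chunk exists
+	 * later in the segment.
+	 */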
+ uint64_t rstart = rs_get_start(rs, msp->ms_unflushed_frees);
+ uint64_t rsize = rs_get_end(rs, msp->ms_unflushed_frees) - rstart;
+ for (int t = 0; t < TXG_DEFER_SIZE; t++) {
+ uint64_t start = 0;
+ uint64_t size = 0;
+ boolean_t found = range_tree_find_in(msp->ms_defer[t], rstart,
+ rsize, &start, &size);
+ if (found) {
+ if (rstart == start)
+ return (0);
+ rsize = start - rstart;
+ }
+ }
+
+ uint64_t start = 0;
+ uint64_t size = 0;
+ boolean_t found = range_tree_find_in(msp->ms_freed, rstart,
+ rsize, &start, &size);
+ if (found)
+ rsize = start - rstart;
+
+ return (rsize);
+}
+
+static range_seg_t *
+metaslab_block_find(zfs_btree_t *t, range_tree_t *rt, uint64_t start,
+ uint64_t size, zfs_btree_index_t *where)
+{
+ range_seg_t *rs;
+ range_seg_max_t rsearch;
+
+ rs_set_start(&rsearch, rt, start);
+ rs_set_end(&rsearch, rt, start + size);
+
+ rs = zfs_btree_find(t, &rsearch, where);
+ if (rs == NULL) {
+ rs = zfs_btree_next(t, where, where);
+ }
+
+ return (rs);
+}
+
+#if defined(WITH_DF_BLOCK_ALLOCATOR) || \
+ defined(WITH_CF_BLOCK_ALLOCATOR)
+
+/*
+ * This is a helper function that can be used by the allocator to find a
+ * suitable block to allocate. This will search the specified B-tree looking
+ * for a block that matches the specified criteria.
+ */
+static uint64_t
+metaslab_block_picker(range_tree_t *rt, uint64_t *cursor, uint64_t size,
+ uint64_t max_search)
+{
+ if (*cursor == 0)
+ *cursor = rt->rt_start;
+ zfs_btree_t *bt = &rt->rt_root;
+ zfs_btree_index_t where;
+ range_seg_t *rs = metaslab_block_find(bt, rt, *cursor, size, &where);
+ uint64_t first_found;
+ int count_searched = 0;
+
+ if (rs != NULL)
+ first_found = rs_get_start(rs, rt);
+
+ while (rs != NULL && (rs_get_start(rs, rt) - first_found <=
+ max_search || count_searched < metaslab_min_search_count)) {
+ uint64_t offset = rs_get_start(rs, rt);
+ if (offset + size <= rs_get_end(rs, rt)) {
+ *cursor = offset + size;
+ return (offset);
+ }
+ rs = zfs_btree_next(bt, &where, &where);
+ count_searched++;
+ }
+
+ *cursor = 0;
+ return (-1ULL);
+}
+#endif /* WITH_DF/CF_BLOCK_ALLOCATOR */
+
+#if defined(WITH_DF_BLOCK_ALLOCATOR)
+/*
+ * ==========================================================================
+ * Dynamic Fit (df) block allocator
+ *
+ * Search for a free chunk of at least this size, starting from the last
+ * offset (for this block alignment), looking for up to
+ * metaslab_df_max_search bytes (16MB). If a large enough free chunk is not
+ * found within 16MB, then return a free chunk of exactly the requested size (or
+ * larger).
+ *
+ * If it seems like searching from the last offset will be unproductive, skip
+ * that and just return a free chunk of exactly the requested size (or larger).
+ * This is based on metaslab_df_alloc_threshold and metaslab_df_free_pct. This
+ * mechanism is probably not very useful and may be removed in the future.
+ *
+ * The behavior when not searching can be changed to return the largest free
+ * chunk, instead of a free chunk of exactly the requested size, by setting
+ * metaslab_df_use_largest_segment.
+ * ==========================================================================
+ */
+static uint64_t
+metaslab_df_alloc(metaslab_t *msp, uint64_t size)
+{
+ /*
+ * Find the largest power of 2 block size that evenly divides the
+ * requested size. This is used to try to allocate blocks with similar
+ * alignment from the same area of the metaslab (i.e. same cursor
+	 * bucket), but it does not prevent allocations of other sizes from
+	 * landing in the same region.
+ */
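+	/*
+	 * For example, a 12K request has align = 4K (the largest power of
+	 * two dividing 12K), so it shares a cursor with other request sizes
+	 * that are 4K-aligned but not 8K-aligned.
+	 */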
+ uint64_t align = size & -size;
+ uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
+ range_tree_t *rt = msp->ms_allocatable;
+ int free_pct = range_tree_space(rt) * 100 / msp->ms_size;
+ uint64_t offset;
+
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+
+ /*
+ * If we're running low on space, find a segment based on size,
+ * rather than iterating based on offset.
+ */
+ if (metaslab_largest_allocatable(msp) < metaslab_df_alloc_threshold ||
+ free_pct < metaslab_df_free_pct) {
+ offset = -1;
+ } else {
+ offset = metaslab_block_picker(rt,
+ cursor, size, metaslab_df_max_search);
+ }
+
+ if (offset == -1) {
+ range_seg_t *rs;
+ if (zfs_btree_numnodes(&msp->ms_allocatable_by_size) == 0)
+ metaslab_size_tree_full_load(msp->ms_allocatable);
+
+ if (metaslab_df_use_largest_segment) {
+ /* use largest free segment */
+ rs = zfs_btree_last(&msp->ms_allocatable_by_size, NULL);
+ } else {
+ zfs_btree_index_t where;
+ /* use segment of this size, or next largest */
+ rs = metaslab_block_find(&msp->ms_allocatable_by_size,
+ rt, msp->ms_start, size, &where);
+ }
+ if (rs != NULL && rs_get_start(rs, rt) + size <= rs_get_end(rs,
+ rt)) {
+ offset = rs_get_start(rs, rt);
+ *cursor = offset + size;
+ }
+ }
+
+ return (offset);
+}
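+
+/*
+ * Editorial example (not part of the original source): for a 24K (0x6000)
+ * request, size & -size is 0x2000 (8K), so the DF allocator uses the cursor
+ * at ms_lbas[highbit64(0x2000) - 1] = ms_lbas[13]. All requests whose sizes
+ * share that largest power-of-two factor share this cursor, which keeps
+ * similarly aligned allocations clustered in the same region of the
+ * metaslab.
+ */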
+
+static metaslab_ops_t metaslab_df_ops = {
+ metaslab_df_alloc
+};
+
+metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops;
+#endif /* WITH_DF_BLOCK_ALLOCATOR */
+
+#if defined(WITH_CF_BLOCK_ALLOCATOR)
+/*
+ * ==========================================================================
+ * Cursor fit block allocator -
+ * Select the largest region in the metaslab, set the cursor to the beginning
+ * of the range and the cursor_end to the end of the range. As allocations
+ * are made, advance the cursor. Continue allocating from the cursor until
+ * the range is exhausted and then find a new range.
+ * ==========================================================================
+ */
+static uint64_t
+metaslab_cf_alloc(metaslab_t *msp, uint64_t size)
+{
+ range_tree_t *rt = msp->ms_allocatable;
+ zfs_btree_t *t = &msp->ms_allocatable_by_size;
+ uint64_t *cursor = &msp->ms_lbas[0];
+ uint64_t *cursor_end = &msp->ms_lbas[1];
+ uint64_t offset = 0;
+
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+
+ ASSERT3U(*cursor_end, >=, *cursor);
+
+ if ((*cursor + size) > *cursor_end) {
+ range_seg_t *rs;
+
+ if (zfs_btree_numnodes(t) == 0)
+ metaslab_size_tree_full_load(msp->ms_allocatable);
+ rs = zfs_btree_last(t, NULL);
+ if (rs == NULL || (rs_get_end(rs, rt) - rs_get_start(rs, rt)) <
+ size)
+ return (-1ULL);
+
+ *cursor = rs_get_start(rs, rt);
+ *cursor_end = rs_get_end(rs, rt);
+ }
+
+ offset = *cursor;
+ *cursor += size;
+
+ return (offset);
+}
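+
+/*
+ * Editorial example (not part of the original source): if the largest free
+ * segment is [0x100000, 0x180000), the CF allocator sets *cursor to
+ * 0x100000 and *cursor_end to 0x180000. A series of 0x20000-byte requests
+ * then returns 0x100000, 0x120000, 0x140000 and 0x160000, each advancing
+ * the cursor; the next request no longer fits, so a new largest segment is
+ * selected from ms_allocatable_by_size.
+ */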
+
+static metaslab_ops_t metaslab_cf_ops = {
+ metaslab_cf_alloc
+};
+
+metaslab_ops_t *zfs_metaslab_ops = &metaslab_cf_ops;
+#endif /* WITH_CF_BLOCK_ALLOCATOR */
+
+#if defined(WITH_NDF_BLOCK_ALLOCATOR)
+/*
+ * ==========================================================================
+ * New dynamic fit allocator -
+ * Select a region that is large enough to allocate 2^metaslab_ndf_clump_shift
+ * contiguous blocks. If no region is found then just use the largest segment
+ * that remains.
+ * ==========================================================================
+ */
+
+/*
+ * Determines desired number of contiguous blocks (2^metaslab_ndf_clump_shift)
+ * to request from the allocator.
+ */
+uint64_t metaslab_ndf_clump_shift = 4;
+
+static uint64_t
+metaslab_ndf_alloc(metaslab_t *msp, uint64_t size)
+{
+ zfs_btree_t *t = &msp->ms_allocatable->rt_root;
+ range_tree_t *rt = msp->ms_allocatable;
+ zfs_btree_index_t where;
+ range_seg_t *rs;
+ range_seg_max_t rsearch;
+ uint64_t hbit = highbit64(size);
+ uint64_t *cursor = &msp->ms_lbas[hbit - 1];
+ uint64_t max_size = metaslab_largest_allocatable(msp);
+
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+
+ if (max_size < size)
+ return (-1ULL);
+
+ rs_set_start(&rsearch, rt, *cursor);
+ rs_set_end(&rsearch, rt, *cursor + size);
+
+ rs = zfs_btree_find(t, &rsearch, &where);
+ if (rs == NULL || (rs_get_end(rs, rt) - rs_get_start(rs, rt)) < size) {
+ t = &msp->ms_allocatable_by_size;
+
+ rs_set_start(&rsearch, rt, 0);
+ rs_set_end(&rsearch, rt, MIN(max_size, 1ULL << (hbit +
+ metaslab_ndf_clump_shift)));
+
+ rs = zfs_btree_find(t, &rsearch, &where);
+ if (rs == NULL)
+ rs = zfs_btree_next(t, &where, &where);
+ ASSERT(rs != NULL);
+ }
+
+ if ((rs_get_end(rs, rt) - rs_get_start(rs, rt)) >= size) {
+ *cursor = rs_get_start(rs, rt) + size;
+ return (rs_get_start(rs, rt));
+ }
+ return (-1ULL);
+}
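+
+/*
+ * Editorial example (not part of the original source): with
+ * metaslab_ndf_clump_shift = 4, a 128K request (hbit = 18) that cannot be
+ * satisfied at the current cursor falls back to the size-sorted tree and
+ * looks for a segment of up to MIN(max_size, 1 << (18 + 4)) = 4M, so that
+ * subsequent allocations of this size class can be carved from the same
+ * contiguous region.
+ */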
+
+static metaslab_ops_t metaslab_ndf_ops = {
+ metaslab_ndf_alloc
+};
+
+metaslab_ops_t *zfs_metaslab_ops = &metaslab_ndf_ops;
+#endif /* WITH_NDF_BLOCK_ALLOCATOR */
+
+
+/*
+ * ==========================================================================
+ * Metaslabs
+ * ==========================================================================
+ */
+
+/*
+ * Wait for any in-progress metaslab loads to complete.
+ */
+static void
+metaslab_load_wait(metaslab_t *msp)
+{
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+
+ while (msp->ms_loading) {
+ ASSERT(!msp->ms_loaded);
+ cv_wait(&msp->ms_load_cv, &msp->ms_lock);
+ }
+}
+
+/*
+ * Wait for any in-progress flushing to complete.
+ */
+static void
+metaslab_flush_wait(metaslab_t *msp)
+{
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+
+ while (msp->ms_flushing)
+ cv_wait(&msp->ms_flush_cv, &msp->ms_lock);
+}
+
+static unsigned int
+metaslab_idx_func(multilist_t *ml, void *arg)
+{
+ metaslab_t *msp = arg;
+ return (msp->ms_id % multilist_get_num_sublists(ml));
+}
+
+uint64_t
+metaslab_allocated_space(metaslab_t *msp)
+{
+ return (msp->ms_allocated_space);
+}
+
+/*
+ * Verify that the space accounting on disk matches the in-core range_trees.
+ */
+static void
+metaslab_verify_space(metaslab_t *msp, uint64_t txg)
+{
+ spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
+ uint64_t allocating = 0;
+ uint64_t sm_free_space, msp_free_space;
+
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+ ASSERT(!msp->ms_condensing);
+
+ if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0)
+ return;
+
+ /*
+ * We can only verify the metaslab space when we're called
+ * from syncing context with a loaded metaslab that has an
+ * allocated space map. Calling this in non-syncing context
+ * does not provide a consistent view of the metaslab since
+ * we're performing allocations in the future.
+ */
+ if (txg != spa_syncing_txg(spa) || msp->ms_sm == NULL ||
+ !msp->ms_loaded)
+ return;
+
+ /*
+ * Even though the smp_alloc field can get negative,
+ * when it comes to a metaslab's space map, that should
+ * never be the case.
+ */
+ ASSERT3S(space_map_allocated(msp->ms_sm), >=, 0);
+
+ ASSERT3U(space_map_allocated(msp->ms_sm), >=,
+ range_tree_space(msp->ms_unflushed_frees));
+
+ ASSERT3U(metaslab_allocated_space(msp), ==,
+ space_map_allocated(msp->ms_sm) +
+ range_tree_space(msp->ms_unflushed_allocs) -
+ range_tree_space(msp->ms_unflushed_frees));
+
+ sm_free_space = msp->ms_size - metaslab_allocated_space(msp);
+
+ /*
+ * Account for future allocations since we would have
+ * already deducted that space from the ms_allocatable.
+ */
+ for (int t = 0; t < TXG_CONCURRENT_STATES; t++) {
+ allocating +=
+ range_tree_space(msp->ms_allocating[(txg + t) & TXG_MASK]);
+ }
+ ASSERT3U(allocating + msp->ms_allocated_this_txg, ==,
+ msp->ms_allocating_total);
+
+ ASSERT3U(msp->ms_deferspace, ==,
+ range_tree_space(msp->ms_defer[0]) +
+ range_tree_space(msp->ms_defer[1]));
+
+ msp_free_space = range_tree_space(msp->ms_allocatable) + allocating +
+ msp->ms_deferspace + range_tree_space(msp->ms_freed);
+
+ VERIFY3U(sm_free_space, ==, msp_free_space);
+}
+
+static void
+metaslab_aux_histograms_clear(metaslab_t *msp)
+{
+ /*
+ * Auxiliary histograms are only cleared when resetting them,
+ * which can only happen while the metaslab is loaded.
+ */
+ ASSERT(msp->ms_loaded);
+
+ bzero(msp->ms_synchist, sizeof (msp->ms_synchist));
+ for (int t = 0; t < TXG_DEFER_SIZE; t++)
+ bzero(msp->ms_deferhist[t], sizeof (msp->ms_deferhist[t]));
+}
+
+static void
+metaslab_aux_histogram_add(uint64_t *histogram, uint64_t shift,
+ range_tree_t *rt)
+{
+ /*
+ * This is modeled after space_map_histogram_add(), so refer to that
+ * function for implementation details. We want this to work like
+ * the space map histogram, and not the range tree histogram, as we
+ * are essentially constructing a delta that will be later subtracted
+ * from the space map histogram.
+ */
+ int idx = 0;
+ for (int i = shift; i < RANGE_TREE_HISTOGRAM_SIZE; i++) {
+ ASSERT3U(i, >=, idx + shift);
+ histogram[idx] += rt->rt_histogram[i] << (i - idx - shift);
+
+ if (idx < SPACE_MAP_HISTOGRAM_SIZE - 1) {
+ ASSERT3U(idx + shift, ==, i);
+ idx++;
+ ASSERT3U(idx, <, SPACE_MAP_HISTOGRAM_SIZE);
+ }
+ }
+}
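+
+/*
+ * Editorial example (not part of the original source): with a space map
+ * shift of 9 (512-byte sectors), a free segment counted in range-tree
+ * bucket i = 12 (sizes in [4K, 8K)) is added to histogram[12 - 9] =
+ * histogram[3]. Range-tree buckets beyond the last space map bucket are
+ * folded into that last bucket, scaled by 1 << (i - idx - shift) so the
+ * total space they represent is preserved, mirroring
+ * space_map_histogram_add().
+ */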
+
+/*
+ * Called at every sync pass that the metaslab gets synced.
+ *
+ * The reason is that we want our auxiliary histograms to be updated
+ * whenever the metaslab's space map histogram is updated. This way
+ * we stay consistent on which parts of the metaslab space map's
+ * histogram are currently not available for allocations (e.g. because
+ * they are in the defer, freed, and freeing trees).
+ */
+static void
+metaslab_aux_histograms_update(metaslab_t *msp)
+{
+ space_map_t *sm = msp->ms_sm;
+ ASSERT(sm != NULL);
+
+ /*
+ * This is similar to the metaslab's space map histogram updates
+ * that take place in metaslab_sync(). The only difference is that
+ * we only care about segments that haven't made it into the
+ * ms_allocatable tree yet.
+ */
+ if (msp->ms_loaded) {
+ metaslab_aux_histograms_clear(msp);
+
+ metaslab_aux_histogram_add(msp->ms_synchist,
+ sm->sm_shift, msp->ms_freed);
+
+ for (int t = 0; t < TXG_DEFER_SIZE; t++) {
+ metaslab_aux_histogram_add(msp->ms_deferhist[t],
+ sm->sm_shift, msp->ms_defer[t]);
+ }
+ }
+
+ metaslab_aux_histogram_add(msp->ms_synchist,
+ sm->sm_shift, msp->ms_freeing);
+}
+
+/*
+ * Called every time we are done syncing (writing to) the metaslab,
+ * i.e. at the end of each sync pass.
+ * [see the comment in metaslab_impl.h for ms_synchist, ms_deferhist]
+ */
+static void
+metaslab_aux_histograms_update_done(metaslab_t *msp, boolean_t defer_allowed)
+{
+ spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
+ space_map_t *sm = msp->ms_sm;
+
+ if (sm == NULL) {
+ /*
+ * We came here from metaslab_init() when creating/opening a
+ * pool, looking at a metaslab that hasn't had any allocations
+ * yet.
+ */
+ return;
+ }
+
+ /*
+ * This is similar to the actions that we take for the ms_freed
+ * and ms_defer trees in metaslab_sync_done().
+ */
+ uint64_t hist_index = spa_syncing_txg(spa) % TXG_DEFER_SIZE;
+ if (defer_allowed) {
+ bcopy(msp->ms_synchist, msp->ms_deferhist[hist_index],
+ sizeof (msp->ms_synchist));
+ } else {
+ bzero(msp->ms_deferhist[hist_index],
+ sizeof (msp->ms_deferhist[hist_index]));
+ }
+ bzero(msp->ms_synchist, sizeof (msp->ms_synchist));
+}
+
+/*
+ * Ensure that the metaslab's weight and fragmentation are consistent
+ * with the contents of the histogram (either the range tree's histogram
+ * or the space map's depending whether the metaslab is loaded).
+ */
+static void
+metaslab_verify_weight_and_frag(metaslab_t *msp)
+{
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+
+ if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0)
+ return;
+
+ /*
+ * We can end up here from vdev_remove_complete(), in which case we
+ * cannot do these assertions because we hold spa config locks and
+ * thus we are not allowed to read from the DMU.
+ *
+ * We check if the metaslab group has been removed and if that's
+ * the case we return immediately as that would mean that we are
+ * here from the aforementioned code path.
+ */
+ if (msp->ms_group == NULL)
+ return;
+
+ /*
+ * Devices being removed always return a weight of 0 and leave
+ * fragmentation and ms_max_size as is - there is nothing for
+ * us to verify here.
+ */
+ vdev_t *vd = msp->ms_group->mg_vd;
+ if (vd->vdev_removing)
+ return;
+
+ /*
+ * If the metaslab is dirty it probably means that we've done
+ * some allocations or frees that have changed our histograms
+ * and thus the weight.
+ */
+ for (int t = 0; t < TXG_SIZE; t++) {
+ if (txg_list_member(&vd->vdev_ms_list, msp, t))
+ return;
+ }
+
+ /*
+ * This verification checks that our in-memory state is consistent
+ * with what's on disk. If the pool is read-only then there aren't
+ * any changes and we just have the initially-loaded state.
+ */
+ if (!spa_writeable(msp->ms_group->mg_vd->vdev_spa))
+ return;
+
+	/* some extra verification for the in-core tree, if possible */
+ if (msp->ms_loaded) {
+ range_tree_stat_verify(msp->ms_allocatable);
+ VERIFY(space_map_histogram_verify(msp->ms_sm,
+ msp->ms_allocatable));
+ }
+
+ uint64_t weight = msp->ms_weight;
+ uint64_t was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
+ boolean_t space_based = WEIGHT_IS_SPACEBASED(msp->ms_weight);
+ uint64_t frag = msp->ms_fragmentation;
+ uint64_t max_segsize = msp->ms_max_size;
+
+ msp->ms_weight = 0;
+ msp->ms_fragmentation = 0;
+
+ /*
+ * This function is used for verification purposes and thus should
+ * not introduce any side-effects/mutations on the system's state.
+ *
+ * Regardless of whether metaslab_weight() thinks this metaslab
+ * should be active or not, we want to ensure that the actual weight
+ * (and therefore the value of ms_weight) would be the same if it
+ * was to be recalculated at this point.
+ *
+ * In addition we set the nodirty flag so metaslab_weight() does
+ * not dirty the metaslab for future TXGs (e.g. when trying to
+ * force condensing to upgrade the metaslab spacemaps).
+ */
+ msp->ms_weight = metaslab_weight(msp, B_TRUE) | was_active;
+
+ VERIFY3U(max_segsize, ==, msp->ms_max_size);
+
+ /*
+ * If the weight type changed then there is no point in doing
+ * verification. Revert fields to their original values.
+ */
+ if ((space_based && !WEIGHT_IS_SPACEBASED(msp->ms_weight)) ||
+ (!space_based && WEIGHT_IS_SPACEBASED(msp->ms_weight))) {
+ msp->ms_fragmentation = frag;
+ msp->ms_weight = weight;
+ return;
+ }
+
+ VERIFY3U(msp->ms_fragmentation, ==, frag);
+ VERIFY3U(msp->ms_weight, ==, weight);
+}
+
+/*
+ * If we're over the zfs_metaslab_mem_limit, select the loaded metaslab from
+ * this class that was used longest ago, and attempt to unload it. We don't
+ * want to spend too much time in this loop to prevent performance
+ * degradation, and we expect that most of the time this operation will
+ * succeed. Between that and the normal unloading processing during txg sync,
+ * we expect this to keep the metaslab memory usage under control.
+ */
+static void
+metaslab_potentially_evict(metaslab_class_t *mc)
+{
+#ifdef _KERNEL
+ uint64_t allmem = arc_all_memory();
+ uint64_t inuse = spl_kmem_cache_inuse(zfs_btree_leaf_cache);
+ uint64_t size = spl_kmem_cache_entry_size(zfs_btree_leaf_cache);
+ int tries = 0;
+ for (; allmem * zfs_metaslab_mem_limit / 100 < inuse * size &&
+ tries < multilist_get_num_sublists(mc->mc_metaslab_txg_list) * 2;
+ tries++) {
+ unsigned int idx = multilist_get_random_index(
+ mc->mc_metaslab_txg_list);
+ multilist_sublist_t *mls =
+ multilist_sublist_lock(mc->mc_metaslab_txg_list, idx);
+ metaslab_t *msp = multilist_sublist_head(mls);
+ multilist_sublist_unlock(mls);
+ while (msp != NULL && allmem * zfs_metaslab_mem_limit / 100 <
+ inuse * size) {
+ VERIFY3P(mls, ==, multilist_sublist_lock(
+ mc->mc_metaslab_txg_list, idx));
+ ASSERT3U(idx, ==,
+ metaslab_idx_func(mc->mc_metaslab_txg_list, msp));
+
+ if (!multilist_link_active(&msp->ms_class_txg_node)) {
+ multilist_sublist_unlock(mls);
+ break;
+ }
+ metaslab_t *next_msp = multilist_sublist_next(mls, msp);
+ multilist_sublist_unlock(mls);
+ /*
+ * If the metaslab is currently loading there are two
+ * cases. If it's the metaslab we're evicting, we
+ * can't continue on or we'll panic when we attempt to
+ * recursively lock the mutex. If it's another
+ * metaslab that's loading, it can be safely skipped,
+ * since we know it's very new and therefore not a
+ * good eviction candidate. We check later once the
+ * lock is held that the metaslab is fully loaded
+ * before actually unloading it.
+ */
+ if (msp->ms_loading) {
+ msp = next_msp;
+ inuse =
+ spl_kmem_cache_inuse(zfs_btree_leaf_cache);
+ continue;
+ }
+ /*
+ * We can't unload metaslabs with no spacemap because
+ * they're not ready to be unloaded yet. We can't
+ * unload metaslabs with outstanding allocations
+ * because doing so could cause the metaslab's weight
+ * to decrease while it's unloaded, which violates an
+ * invariant that we use to prevent unnecessary
+ * loading. We also don't unload metaslabs that are
+ * currently active because they are high-weight
+ * metaslabs that are likely to be used in the near
+ * future.
+ */
+ mutex_enter(&msp->ms_lock);
+ if (msp->ms_allocator == -1 && msp->ms_sm != NULL &&
+ msp->ms_allocating_total == 0) {
+ metaslab_unload(msp);
+ }
+ mutex_exit(&msp->ms_lock);
+ msp = next_msp;
+ inuse = spl_kmem_cache_inuse(zfs_btree_leaf_cache);
+ }
+ }
+#endif
+}
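+
+/*
+ * Editorial example (not part of the original source): if arc_all_memory()
+ * reports 16GB and zfs_metaslab_mem_limit is 25, eviction is attempted
+ * whenever the btree leaf cache (inuse * size) exceeds 4GB, and the loop
+ * keeps walking randomly chosen sublists of mc_metaslab_txg_list (up to
+ * twice the number of sublists) until usage drops back under that limit or
+ * the candidates are exhausted.
+ */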
+
+static int
+metaslab_load_impl(metaslab_t *msp)
+{
+ int error = 0;
+
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+ ASSERT(msp->ms_loading);
+ ASSERT(!msp->ms_condensing);
+
+ /*
+ * We temporarily drop the lock to unblock other operations while we
+ * are reading the space map. Therefore, metaslab_sync() and
+ * metaslab_sync_done() can run at the same time as we do.
+ *
+ * If we are using the log space maps, metaslab_sync() can't write to
+ * the metaslab's space map while we are loading as we only write to
+ * it when we are flushing the metaslab, and that can't happen while
+ * we are loading it.
+ *
+ * If we are not using log space maps though, metaslab_sync() can
+ * append to the space map while we are loading. Therefore we load
+ * only entries that existed when we started the load. Additionally,
+ * metaslab_sync_done() has to wait for the load to complete because
+ * there are potential races like metaslab_load() loading parts of the
+ * space map that are currently being appended by metaslab_sync(). If
+ * we didn't, the ms_allocatable would have entries that
+ * metaslab_sync_done() would try to re-add later.
+ *
+ * That's why before dropping the lock we remember the synced length
+ * of the metaslab and read up to that point of the space map,
+ * ignoring entries appended by metaslab_sync() that happen after we
+ * drop the lock.
+ */
+ uint64_t length = msp->ms_synced_length;
+ mutex_exit(&msp->ms_lock);
+
+ hrtime_t load_start = gethrtime();
+ metaslab_rt_arg_t *mrap;
+ if (msp->ms_allocatable->rt_arg == NULL) {
+ mrap = kmem_zalloc(sizeof (*mrap), KM_SLEEP);
+ } else {
+ mrap = msp->ms_allocatable->rt_arg;
+ msp->ms_allocatable->rt_ops = NULL;
+ msp->ms_allocatable->rt_arg = NULL;
+ }
+ mrap->mra_bt = &msp->ms_allocatable_by_size;
+ mrap->mra_floor_shift = metaslab_by_size_min_shift;
+
+ if (msp->ms_sm != NULL) {
+ error = space_map_load_length(msp->ms_sm, msp->ms_allocatable,
+ SM_FREE, length);
+
+ /* Now, populate the size-sorted tree. */
+ metaslab_rt_create(msp->ms_allocatable, mrap);
+ msp->ms_allocatable->rt_ops = &metaslab_rt_ops;
+ msp->ms_allocatable->rt_arg = mrap;
+
+ struct mssa_arg arg = {0};
+ arg.rt = msp->ms_allocatable;
+ arg.mra = mrap;
+ range_tree_walk(msp->ms_allocatable, metaslab_size_sorted_add,
+ &arg);
+ } else {
+ /*
+ * Add the size-sorted tree first, since we don't need to load
+ * the metaslab from the spacemap.
+ */
+ metaslab_rt_create(msp->ms_allocatable, mrap);
+ msp->ms_allocatable->rt_ops = &metaslab_rt_ops;
+ msp->ms_allocatable->rt_arg = mrap;
+ /*
+ * The space map has not been allocated yet, so treat
+ * all the space in the metaslab as free and add it to the
+ * ms_allocatable tree.
+ */
+ range_tree_add(msp->ms_allocatable,
+ msp->ms_start, msp->ms_size);
+
+ if (msp->ms_freed != NULL) {
+ /*
+ * If the ms_sm doesn't exist, this means that this
+ * metaslab hasn't gone through metaslab_sync() and
+ * thus has never been dirtied. So we shouldn't
+ * expect any unflushed allocs or frees from previous
+ * TXGs.
+ *
+ * Note: ms_freed and all the other trees except for
+			 * the ms_allocatable can be NULL at this point only
+ * if this is a new metaslab of a vdev that just got
+ * expanded.
+ */
+ ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs));
+ ASSERT(range_tree_is_empty(msp->ms_unflushed_frees));
+ }
+ }
+
+ /*
+ * We need to grab the ms_sync_lock to prevent metaslab_sync() from
+ * changing the ms_sm (or log_sm) and the metaslab's range trees
+ * while we are about to use them and populate the ms_allocatable.
+ * The ms_lock is insufficient for this because metaslab_sync() doesn't
+ * hold the ms_lock while writing the ms_checkpointing tree to disk.
+ */
+ mutex_enter(&msp->ms_sync_lock);
+ mutex_enter(&msp->ms_lock);
+
+ ASSERT(!msp->ms_condensing);
+ ASSERT(!msp->ms_flushing);
+
+ if (error != 0) {
+ mutex_exit(&msp->ms_sync_lock);
+ return (error);
+ }
+
+ ASSERT3P(msp->ms_group, !=, NULL);
+ msp->ms_loaded = B_TRUE;
+
+ /*
+ * Apply all the unflushed changes to ms_allocatable right
+ * away so any manipulations we do below have a clear view
+ * of what is allocated and what is free.
+ */
+ range_tree_walk(msp->ms_unflushed_allocs,
+ range_tree_remove, msp->ms_allocatable);
+ range_tree_walk(msp->ms_unflushed_frees,
+ range_tree_add, msp->ms_allocatable);
+
+ msp->ms_loaded = B_TRUE;
+
+ ASSERT3P(msp->ms_group, !=, NULL);
+ spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
+ if (spa_syncing_log_sm(spa) != NULL) {
+ ASSERT(spa_feature_is_enabled(spa,
+ SPA_FEATURE_LOG_SPACEMAP));
+
+ /*
+ * If we use a log space map we add all the segments
+ * that are in ms_unflushed_frees so they are available
+ * for allocation.
+ *
+ * ms_allocatable needs to contain all free segments
+ * that are ready for allocations (thus not segments
+ * from ms_freeing, ms_freed, and the ms_defer trees).
+ * But if we grab the lock in this code path at a sync
+		 * pass later than 1, then it also contains the
+ * segments of ms_freed (they were added to it earlier
+ * in this path through ms_unflushed_frees). So we
+ * need to remove all the segments that exist in
+ * ms_freed from ms_allocatable as they will be added
+ * later in metaslab_sync_done().
+ *
+ * When there's no log space map, the ms_allocatable
+ * correctly doesn't contain any segments that exist
+ * in ms_freed [see ms_synced_length].
+ */
+ range_tree_walk(msp->ms_freed,
+ range_tree_remove, msp->ms_allocatable);
+ }
+
+ /*
+ * If we are not using the log space map, ms_allocatable
+ * contains the segments that exist in the ms_defer trees
+ * [see ms_synced_length]. Thus we need to remove them
+ * from ms_allocatable as they will be added again in
+ * metaslab_sync_done().
+ *
+ * If we are using the log space map, ms_allocatable still
+	 * contains the segments that exist in the ms_defer trees,
+	 * not because it read them through the ms_sm, but because
+	 * these segments are part of ms_unflushed_frees, which we
+	 * added to ms_allocatable earlier in this code path.
+ */
+ for (int t = 0; t < TXG_DEFER_SIZE; t++) {
+ range_tree_walk(msp->ms_defer[t],
+ range_tree_remove, msp->ms_allocatable);
+ }
+
+ /*
+ * Call metaslab_recalculate_weight_and_sort() now that the
+ * metaslab is loaded so we get the metaslab's real weight.
+ *
+ * Unless this metaslab was created with older software and
+ * has not yet been converted to use segment-based weight, we
+ * expect the new weight to be better or equal to the weight
+ * that the metaslab had while it was not loaded. This is
+ * because the old weight does not take into account the
+ * consolidation of adjacent segments between TXGs. [see
+ * comment for ms_synchist and ms_deferhist[] for more info]
+ */
+ uint64_t weight = msp->ms_weight;
+ uint64_t max_size = msp->ms_max_size;
+ metaslab_recalculate_weight_and_sort(msp);
+ if (!WEIGHT_IS_SPACEBASED(weight))
+ ASSERT3U(weight, <=, msp->ms_weight);
+ msp->ms_max_size = metaslab_largest_allocatable(msp);
+ ASSERT3U(max_size, <=, msp->ms_max_size);
+ hrtime_t load_end = gethrtime();
+ msp->ms_load_time = load_end;
+ zfs_dbgmsg("metaslab_load: txg %llu, spa %s, vdev_id %llu, "
+ "ms_id %llu, smp_length %llu, "
+ "unflushed_allocs %llu, unflushed_frees %llu, "
+ "freed %llu, defer %llu + %llu, unloaded time %llu ms, "
+ "loading_time %lld ms, ms_max_size %llu, "
+ "max size error %lld, "
+ "old_weight %llx, new_weight %llx",
+ spa_syncing_txg(spa), spa_name(spa),
+ msp->ms_group->mg_vd->vdev_id, msp->ms_id,
+ space_map_length(msp->ms_sm),
+ range_tree_space(msp->ms_unflushed_allocs),
+ range_tree_space(msp->ms_unflushed_frees),
+ range_tree_space(msp->ms_freed),
+ range_tree_space(msp->ms_defer[0]),
+ range_tree_space(msp->ms_defer[1]),
+ (longlong_t)((load_start - msp->ms_unload_time) / 1000000),
+ (longlong_t)((load_end - load_start) / 1000000),
+ msp->ms_max_size, msp->ms_max_size - max_size,
+ weight, msp->ms_weight);
+
+ metaslab_verify_space(msp, spa_syncing_txg(spa));
+ mutex_exit(&msp->ms_sync_lock);
+ return (0);
+}
+
+int
+metaslab_load(metaslab_t *msp)
+{
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+
+ /*
+ * There may be another thread loading the same metaslab, if that's
+ * the case just wait until the other thread is done and return.
+ */
+ metaslab_load_wait(msp);
+ if (msp->ms_loaded)
+ return (0);
+ VERIFY(!msp->ms_loading);
+ ASSERT(!msp->ms_condensing);
+
+ /*
+ * We set the loading flag BEFORE potentially dropping the lock to
+ * wait for an ongoing flush (see ms_flushing below). This way other
+ * threads know that there is already a thread that is loading this
+ * metaslab.
+ */
+ msp->ms_loading = B_TRUE;
+
+ /*
+ * Wait for any in-progress flushing to finish as we drop the ms_lock
+ * both here (during space_map_load()) and in metaslab_flush() (when
+ * we flush our changes to the ms_sm).
+ */
+ if (msp->ms_flushing)
+ metaslab_flush_wait(msp);
+
+ /*
+	 * In case we were waiting for the metaslab to be
+ * flushed (where we temporarily dropped the ms_lock), ensure that
+ * no one else loaded the metaslab somehow.
+ */
+ ASSERT(!msp->ms_loaded);
+
+ /*
+ * If we're loading a metaslab in the normal class, consider evicting
+ * another one to keep our memory usage under the limit defined by the
+ * zfs_metaslab_mem_limit tunable.
+ */
+ if (spa_normal_class(msp->ms_group->mg_class->mc_spa) ==
+ msp->ms_group->mg_class) {
+ metaslab_potentially_evict(msp->ms_group->mg_class);
+ }
+
+ int error = metaslab_load_impl(msp);
+
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+ msp->ms_loading = B_FALSE;
+ cv_broadcast(&msp->ms_load_cv);
+
+ return (error);
+}
+
+void
+metaslab_unload(metaslab_t *msp)
+{
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+
+ /*
+ * This can happen if a metaslab is selected for eviction (in
+ * metaslab_potentially_evict) and then unloaded during spa_sync (via
+ * metaslab_class_evict_old).
+ */
+ if (!msp->ms_loaded)
+ return;
+
+ range_tree_vacate(msp->ms_allocatable, NULL, NULL);
+ msp->ms_loaded = B_FALSE;
+ msp->ms_unload_time = gethrtime();
+
+ msp->ms_activation_weight = 0;
+ msp->ms_weight &= ~METASLAB_ACTIVE_MASK;
+
+ if (msp->ms_group != NULL) {
+ metaslab_class_t *mc = msp->ms_group->mg_class;
+ multilist_sublist_t *mls =
+ multilist_sublist_lock_obj(mc->mc_metaslab_txg_list, msp);
+ if (multilist_link_active(&msp->ms_class_txg_node))
+ multilist_sublist_remove(mls, msp);
+ multilist_sublist_unlock(mls);
+
+ spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
+ zfs_dbgmsg("metaslab_unload: txg %llu, spa %s, vdev_id %llu, "
+ "ms_id %llu, weight %llx, "
+ "selected txg %llu (%llu ms ago), alloc_txg %llu, "
+ "loaded %llu ms ago, max_size %llu",
+ spa_syncing_txg(spa), spa_name(spa),
+ msp->ms_group->mg_vd->vdev_id, msp->ms_id,
+ msp->ms_weight,
+ msp->ms_selected_txg,
+ (msp->ms_unload_time - msp->ms_selected_time) / 1000 / 1000,
+ msp->ms_alloc_txg,
+ (msp->ms_unload_time - msp->ms_load_time) / 1000 / 1000,
+ msp->ms_max_size);
+ }
+
+ /*
+ * We explicitly recalculate the metaslab's weight based on its space
+	 * map (as it is now not loaded). We want unloaded metaslabs to always
+ * have their weights calculated from the space map histograms, while
+ * loaded ones have it calculated from their in-core range tree
+ * [see metaslab_load()]. This way, the weight reflects the information
+ * available in-core, whether it is loaded or not.
+ *
+	 * If ms_group == NULL, it means that we came here from metaslab_fini(),
+ * at which point it doesn't make sense for us to do the recalculation
+ * and the sorting.
+ */
+ if (msp->ms_group != NULL)
+ metaslab_recalculate_weight_and_sort(msp);
+}
+
+/*
+ * We want to optimize the memory use of the per-metaslab range
+ * trees. To do this, we store the segments in the range trees in
+ * units of sectors, zero-indexing from the start of the metaslab. If
+ * vdev_ms_shift - vdev_ashift is less than 32, we can store
+ * the ranges using two uint32_ts, rather than two uint64_ts.
+ */
+range_seg_type_t
+metaslab_calculate_range_tree_type(vdev_t *vdev, metaslab_t *msp,
+ uint64_t *start, uint64_t *shift)
+{
+ if (vdev->vdev_ms_shift - vdev->vdev_ashift < 32 &&
+ !zfs_metaslab_force_large_segs) {
+ *shift = vdev->vdev_ashift;
+ *start = msp->ms_start;
+ return (RANGE_SEG32);
+ } else {
+ *shift = 0;
+ *start = 0;
+ return (RANGE_SEG64);
+ }
+}
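+
+/*
+ * Editorial example (not part of the original source): for a vdev with
+ * vdev_ms_shift = 34 (16GB metaslabs) and vdev_ashift = 12 (4K sectors),
+ * 34 - 12 = 22 < 32, so segments are stored as RANGE_SEG32 offsets in 4K
+ * units relative to ms_start, roughly halving the memory used per segment
+ * compared to RANGE_SEG64.
+ */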
+
+void
+metaslab_set_selected_txg(metaslab_t *msp, uint64_t txg)
+{
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+ metaslab_class_t *mc = msp->ms_group->mg_class;
+ multilist_sublist_t *mls =
+ multilist_sublist_lock_obj(mc->mc_metaslab_txg_list, msp);
+ if (multilist_link_active(&msp->ms_class_txg_node))
+ multilist_sublist_remove(mls, msp);
+ msp->ms_selected_txg = txg;
+ msp->ms_selected_time = gethrtime();
+ multilist_sublist_insert_tail(mls, msp);
+ multilist_sublist_unlock(mls);
+}
+
+void
+metaslab_space_update(vdev_t *vd, metaslab_class_t *mc, int64_t alloc_delta,
+ int64_t defer_delta, int64_t space_delta)
+{
+ vdev_space_update(vd, alloc_delta, defer_delta, space_delta);
+
+ ASSERT3P(vd->vdev_spa->spa_root_vdev, ==, vd->vdev_parent);
+ ASSERT(vd->vdev_ms_count != 0);
+
+ metaslab_class_space_update(mc, alloc_delta, defer_delta, space_delta,
+ vdev_deflated_space(vd, space_delta));
+}
+
+int
+metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object,
+ uint64_t txg, metaslab_t **msp)
+{
+ vdev_t *vd = mg->mg_vd;
+ spa_t *spa = vd->vdev_spa;
+ objset_t *mos = spa->spa_meta_objset;
+ metaslab_t *ms;
+ int error;
+
+ ms = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP);
+ mutex_init(&ms->ms_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&ms->ms_sync_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&ms->ms_load_cv, NULL, CV_DEFAULT, NULL);
+ cv_init(&ms->ms_flush_cv, NULL, CV_DEFAULT, NULL);
+ multilist_link_init(&ms->ms_class_txg_node);
+
+ ms->ms_id = id;
+ ms->ms_start = id << vd->vdev_ms_shift;
+ ms->ms_size = 1ULL << vd->vdev_ms_shift;
+ ms->ms_allocator = -1;
+ ms->ms_new = B_TRUE;
+
+ vdev_ops_t *ops = vd->vdev_ops;
+ if (ops->vdev_op_metaslab_init != NULL)
+ ops->vdev_op_metaslab_init(vd, &ms->ms_start, &ms->ms_size);
+
+ /*
+ * We only open space map objects that already exist. All others
+ * will be opened when we finally allocate an object for it.
+ *
+ * Note:
+ * When called from vdev_expand(), we can't call into the DMU as
+ * we are holding the spa_config_lock as a writer and we would
+	 * deadlock [see relevant comment in vdev_metaslab_init()]. In
+	 * that case, though, the object parameter is zero, so we won't
+ * call into the DMU.
+ */
+ if (object != 0) {
+ error = space_map_open(&ms->ms_sm, mos, object, ms->ms_start,
+ ms->ms_size, vd->vdev_ashift);
+
+ if (error != 0) {
+ kmem_free(ms, sizeof (metaslab_t));
+ return (error);
+ }
+
+ ASSERT(ms->ms_sm != NULL);
+ ms->ms_allocated_space = space_map_allocated(ms->ms_sm);
+ }
+
+ range_seg_type_t type;
+ uint64_t shift, start;
+ type = metaslab_calculate_range_tree_type(vd, ms, &start, &shift);
+
+ /*
+ * We create the ms_allocatable here, but we don't create the
+ * other range trees until metaslab_sync_done(). This serves
+ * two purposes: it allows metaslab_sync_done() to detect the
+ * addition of new space; and for debugging, it ensures that
+	 * we'd take a data fault on any attempt to use this metaslab before
+ * it's ready.
+ */
+ ms->ms_allocatable = range_tree_create(NULL, type, NULL, start, shift);
+
+ ms->ms_trim = range_tree_create(NULL, type, NULL, start, shift);
+
+ metaslab_group_add(mg, ms);
+ metaslab_set_fragmentation(ms, B_FALSE);
+
+ /*
+ * If we're opening an existing pool (txg == 0) or creating
+ * a new one (txg == TXG_INITIAL), all space is available now.
+ * If we're adding space to an existing pool, the new space
+ * does not become available until after this txg has synced.
+ * The metaslab's weight will also be initialized when we sync
+ * out this txg. This ensures that we don't attempt to allocate
+ * from it before we have initialized it completely.
+ */
+ if (txg <= TXG_INITIAL) {
+ metaslab_sync_done(ms, 0);
+ metaslab_space_update(vd, mg->mg_class,
+ metaslab_allocated_space(ms), 0, 0);
+ }
+
+ if (txg != 0) {
+ vdev_dirty(vd, 0, NULL, txg);
+ vdev_dirty(vd, VDD_METASLAB, ms, txg);
+ }
+
+ *msp = ms;
+
+ return (0);
+}
+
+static void
+metaslab_fini_flush_data(metaslab_t *msp)
+{
+ spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
+
+ if (metaslab_unflushed_txg(msp) == 0) {
+ ASSERT3P(avl_find(&spa->spa_metaslabs_by_flushed, msp, NULL),
+ ==, NULL);
+ return;
+ }
+ ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP));
+
+ mutex_enter(&spa->spa_flushed_ms_lock);
+ avl_remove(&spa->spa_metaslabs_by_flushed, msp);
+ mutex_exit(&spa->spa_flushed_ms_lock);
+
+ spa_log_sm_decrement_mscount(spa, metaslab_unflushed_txg(msp));
+ spa_log_summary_decrement_mscount(spa, metaslab_unflushed_txg(msp));
+}
+
+uint64_t
+metaslab_unflushed_changes_memused(metaslab_t *ms)
+{
+ return ((range_tree_numsegs(ms->ms_unflushed_allocs) +
+ range_tree_numsegs(ms->ms_unflushed_frees)) *
+ ms->ms_unflushed_allocs->rt_root.bt_elem_size);
+}
+
+void
+metaslab_fini(metaslab_t *msp)
+{
+ metaslab_group_t *mg = msp->ms_group;
+ vdev_t *vd = mg->mg_vd;
+ spa_t *spa = vd->vdev_spa;
+
+ metaslab_fini_flush_data(msp);
+
+ metaslab_group_remove(mg, msp);
+
+ mutex_enter(&msp->ms_lock);
+ VERIFY(msp->ms_group == NULL);
+ /*
+ * If the range trees haven't been allocated, this metaslab hasn't
+ * been through metaslab_sync_done() for the first time yet, so its
+ * space hasn't been accounted for in its vdev and doesn't need to be
+ * subtracted.
+ */
+ if (msp->ms_freed != NULL) {
+ metaslab_space_update(vd, mg->mg_class,
+ -metaslab_allocated_space(msp), 0, -msp->ms_size);
+ }
+ space_map_close(msp->ms_sm);
+ msp->ms_sm = NULL;
+
+ metaslab_unload(msp);
+
+ range_tree_destroy(msp->ms_allocatable);
+
+ if (msp->ms_freed != NULL) {
+ range_tree_destroy(msp->ms_freeing);
+ range_tree_destroy(msp->ms_freed);
+
+ ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=,
+ metaslab_unflushed_changes_memused(msp));
+ spa->spa_unflushed_stats.sus_memused -=
+ metaslab_unflushed_changes_memused(msp);
+ range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL);
+ range_tree_destroy(msp->ms_unflushed_allocs);
+ range_tree_destroy(msp->ms_checkpointing);
+ range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL);
+ range_tree_destroy(msp->ms_unflushed_frees);
+
+ for (int t = 0; t < TXG_SIZE; t++) {
+ range_tree_destroy(msp->ms_allocating[t]);
+ }
+ for (int t = 0; t < TXG_DEFER_SIZE; t++) {
+ range_tree_destroy(msp->ms_defer[t]);
+ }
+ }
+ ASSERT0(msp->ms_deferspace);
+
+ for (int t = 0; t < TXG_SIZE; t++)
+ ASSERT(!txg_list_member(&vd->vdev_ms_list, msp, t));
+
+ range_tree_vacate(msp->ms_trim, NULL, NULL);
+ range_tree_destroy(msp->ms_trim);
+
+ mutex_exit(&msp->ms_lock);
+ cv_destroy(&msp->ms_load_cv);
+ cv_destroy(&msp->ms_flush_cv);
+ mutex_destroy(&msp->ms_lock);
+ mutex_destroy(&msp->ms_sync_lock);
+ ASSERT3U(msp->ms_allocator, ==, -1);
+
+ kmem_free(msp, sizeof (metaslab_t));
+}
+
+#define FRAGMENTATION_TABLE_SIZE 17
+
+/*
+ * This table defines a segment size based fragmentation metric that will
+ * allow each metaslab to derive its own fragmentation value. This is done
+ * by calculating the space in each bucket of the spacemap histogram and
+ * multiplying that by the fragmentation metric in this table. Doing
+ * this for all buckets and dividing it by the total amount of free
+ * space in this metaslab (i.e. the total free space in all buckets) gives
+ * us the fragmentation metric. This means that a high fragmentation metric
+ * equates to most of the free space being comprised of small segments.
+ * Conversely, if the metric is low, then most of the free space is in
+ * large segments. A 10% change in fragmentation equates to approximately
+ * double the number of segments.
+ *
+ * This table defines 0% fragmented space using 16MB segments. Testing has
+ * shown that segments that are greater than or equal to 16MB do not suffer
+ * from drastic performance problems. Using this value, we derive the rest
+ * of the table. Since the fragmentation value is never stored on disk, it
+ * is possible to change these calculations in the future.
+ */
+int zfs_frag_table[FRAGMENTATION_TABLE_SIZE] = {
+ 100, /* 512B */
+ 100, /* 1K */
+ 98, /* 2K */
+ 95, /* 4K */
+ 90, /* 8K */
+ 80, /* 16K */
+ 70, /* 32K */
+ 60, /* 64K */
+ 50, /* 128K */
+ 40, /* 256K */
+ 30, /* 512K */
+ 20, /* 1M */
+ 15, /* 2M */
+ 10, /* 4M */
+ 5, /* 8M */
+ 0 /* 16M */
+};
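+
+/*
+ * Editorial example (not part of the original source): a metaslab whose
+ * free space is half in 8K segments (table value 90) and half in 1M
+ * segments (table value 20) gets a fragmentation of
+ * (0.5 * 90) + (0.5 * 20) = 55, since each histogram bucket's space is
+ * weighted by its table entry and the sum is divided by the total free
+ * space.
+ */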
+
+/*
+ * Calculate the metaslab's fragmentation metric and set ms_fragmentation.
+ * Setting this value to ZFS_FRAG_INVALID means that the metaslab has not
+ * been upgraded and does not support this metric. Otherwise, the value
+ * will be in the range [0, 100].
+ */
+static void
+metaslab_set_fragmentation(metaslab_t *msp, boolean_t nodirty)
+{
+ spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
+ uint64_t fragmentation = 0;
+ uint64_t total = 0;
+ boolean_t feature_enabled = spa_feature_is_enabled(spa,
+ SPA_FEATURE_SPACEMAP_HISTOGRAM);
+
+ if (!feature_enabled) {
+ msp->ms_fragmentation = ZFS_FRAG_INVALID;
+ return;
+ }
+
+ /*
+ * A null space map means that the entire metaslab is free
+ * and thus is not fragmented.
+ */
+ if (msp->ms_sm == NULL) {
+ msp->ms_fragmentation = 0;
+ return;
+ }
+
+ /*
+ * If this metaslab's space map has not been upgraded, flag it
+ * so that we upgrade next time we encounter it.
+ */
+ if (msp->ms_sm->sm_dbuf->db_size != sizeof (space_map_phys_t)) {
+ uint64_t txg = spa_syncing_txg(spa);
+ vdev_t *vd = msp->ms_group->mg_vd;
+
+ /*
+ * If we've reached the final dirty txg, then we must
+ * be shutting down the pool. We don't want to dirty
+ * any data past this point so skip setting the condense
+ * flag. We can retry this action the next time the pool
+ * is imported. We also skip marking this metaslab for
+ * condensing if the caller has explicitly set nodirty.
+ */
+ if (!nodirty &&
+ spa_writeable(spa) && txg < spa_final_dirty_txg(spa)) {
+ msp->ms_condense_wanted = B_TRUE;
+ vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
+ zfs_dbgmsg("txg %llu, requesting force condense: "
+ "ms_id %llu, vdev_id %llu", txg, msp->ms_id,
+ vd->vdev_id);
+ }
+ msp->ms_fragmentation = ZFS_FRAG_INVALID;
+ return;
+ }
+
+ for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
+ uint64_t space = 0;
+ uint8_t shift = msp->ms_sm->sm_shift;
+
+ int idx = MIN(shift - SPA_MINBLOCKSHIFT + i,
+ FRAGMENTATION_TABLE_SIZE - 1);
+
+ if (msp->ms_sm->sm_phys->smp_histogram[i] == 0)
+ continue;
+
+ space = msp->ms_sm->sm_phys->smp_histogram[i] << (i + shift);
+ total += space;
+
+ ASSERT3U(idx, <, FRAGMENTATION_TABLE_SIZE);
+ fragmentation += space * zfs_frag_table[idx];
+ }
+
+ if (total > 0)
+ fragmentation /= total;
+ ASSERT3U(fragmentation, <=, 100);
+
+ msp->ms_fragmentation = fragmentation;
+}
+
+/*
+ * Compute a weight -- a selection preference value -- for the given metaslab.
+ * This is based on the amount of free space, the level of fragmentation,
+ * the LBA range, and whether the metaslab is loaded.
+ */
+static uint64_t
+metaslab_space_weight(metaslab_t *msp)
+{
+ metaslab_group_t *mg = msp->ms_group;
+ vdev_t *vd = mg->mg_vd;
+ uint64_t weight, space;
+
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+
+ /*
+ * The baseline weight is the metaslab's free space.
+ */
+ space = msp->ms_size - metaslab_allocated_space(msp);
+
+ if (metaslab_fragmentation_factor_enabled &&
+ msp->ms_fragmentation != ZFS_FRAG_INVALID) {
+ /*
+ * Use the fragmentation information to inversely scale
+ * down the baseline weight. We need to ensure that we
+ * don't exclude this metaslab completely when it's 100%
+ * fragmented. To avoid this we reduce the fragmented value
+ * by 1.
+ */
+ space = (space * (100 - (msp->ms_fragmentation - 1))) / 100;
+
+ /*
+ * If space < SPA_MINBLOCKSIZE, then we will not allocate from
+ * this metaslab again. The fragmentation metric may have
+ * decreased the space to something smaller than
+ * SPA_MINBLOCKSIZE, so reset the space to SPA_MINBLOCKSIZE
+ * so that we can consume any remaining space.
+ */
+ if (space > 0 && space < SPA_MINBLOCKSIZE)
+ space = SPA_MINBLOCKSIZE;
+ }
+ weight = space;
+
+ /*
+ * Modern disks have uniform bit density and constant angular velocity.
+ * Therefore, the outer recording zones are faster (higher bandwidth)
+ * than the inner zones by the ratio of outer to inner track diameter,
+ * which is typically around 2:1. We account for this by assigning
+ * higher weight to lower metaslabs (multiplier ranging from 2x to 1x).
+ * In effect, this means that we'll select the metaslab with the most
+ * free bandwidth rather than simply the one with the most free space.
+ */
+ if (!vd->vdev_nonrot && metaslab_lba_weighting_enabled) {
+ weight = 2 * weight - (msp->ms_id * weight) / vd->vdev_ms_count;
+ ASSERT(weight >= space && weight <= 2 * space);
+ }
+
+ /*
+ * If this metaslab is one we're actively using, adjust its
+ * weight to make it preferable to any inactive metaslab so
+ * we'll polish it off. If the fragmentation on this metaslab
+	 * has exceeded our threshold, then don't mark it active.
+ */
+ if (msp->ms_loaded && msp->ms_fragmentation != ZFS_FRAG_INVALID &&
+ msp->ms_fragmentation <= zfs_metaslab_fragmentation_threshold) {
+ weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK);
+ }
+
+ WEIGHT_SET_SPACEBASED(weight);
+ return (weight);
+}
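+
+/*
+ * Editorial example (not part of the original source): on a rotational
+ * vdev with 200 metaslabs and LBA weighting enabled, the adjustment above
+ * gives metaslab 0 a weight of 2 * space, metaslab 100 roughly
+ * 1.5 * space, and metaslab 199 just over 1 * space, so outer (faster)
+ * regions are preferred when their free space is comparable.
+ */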
+
+/*
+ * Return the weight of the specified metaslab, according to the segment-based
+ * weighting algorithm. The metaslab must be loaded. This function can
+ * be called within a sync pass since it relies only on the metaslab's
+ * range tree which is always accurate when the metaslab is loaded.
+ */
+static uint64_t
+metaslab_weight_from_range_tree(metaslab_t *msp)
+{
+ uint64_t weight = 0;
+ uint32_t segments = 0;
+
+ ASSERT(msp->ms_loaded);
+
+ for (int i = RANGE_TREE_HISTOGRAM_SIZE - 1; i >= SPA_MINBLOCKSHIFT;
+ i--) {
+ uint8_t shift = msp->ms_group->mg_vd->vdev_ashift;
+ int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1;
+
+ segments <<= 1;
+ segments += msp->ms_allocatable->rt_histogram[i];
+
+ /*
+ * The range tree provides more precision than the space map
+ * and must be downgraded so that all values fit within the
+ * space map's histogram. This allows us to compare loaded
+ * vs. unloaded metaslabs to determine which metaslab is
+ * considered "best".
+ */
+ if (i > max_idx)
+ continue;
+
+ if (segments != 0) {
+ WEIGHT_SET_COUNT(weight, segments);
+ WEIGHT_SET_INDEX(weight, i);
+ WEIGHT_SET_ACTIVE(weight, 0);
+ break;
+ }
+ }
+ return (weight);
+}
+
+/*
+ * Calculate the weight based on the on-disk histogram. Should be applied
+ * only to unloaded metaslabs (i.e. no incoming allocations) in order to
+ * give results consistent with the on-disk state.
+ */
+static uint64_t
+metaslab_weight_from_spacemap(metaslab_t *msp)
+{
+ space_map_t *sm = msp->ms_sm;
+ ASSERT(!msp->ms_loaded);
+ ASSERT(sm != NULL);
+ ASSERT3U(space_map_object(sm), !=, 0);
+ ASSERT3U(sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t));
+
+ /*
+ * Create a joint histogram from all the segments that have made
+ * it to the metaslab's space map histogram, that are not yet
+ * available for allocation because they are still in the freeing
+ * pipeline (e.g. freeing, freed, and defer trees). Then subtract
+ * these segments from the space map's histogram to get a more
+ * accurate weight.
+ */
+ uint64_t deferspace_histogram[SPACE_MAP_HISTOGRAM_SIZE] = {0};
+ for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++)
+ deferspace_histogram[i] += msp->ms_synchist[i];
+ for (int t = 0; t < TXG_DEFER_SIZE; t++) {
+ for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
+ deferspace_histogram[i] += msp->ms_deferhist[t][i];
+ }
+ }
+
+ uint64_t weight = 0;
+ for (int i = SPACE_MAP_HISTOGRAM_SIZE - 1; i >= 0; i--) {
+ ASSERT3U(sm->sm_phys->smp_histogram[i], >=,
+ deferspace_histogram[i]);
+ uint64_t count =
+ sm->sm_phys->smp_histogram[i] - deferspace_histogram[i];
+ if (count != 0) {
+ WEIGHT_SET_COUNT(weight, count);
+ WEIGHT_SET_INDEX(weight, i + sm->sm_shift);
+ WEIGHT_SET_ACTIVE(weight, 0);
+ break;
+ }
+ }
+ return (weight);
+}
+
+/*
+ * Compute a segment-based weight for the specified metaslab. The weight
+ * is determined by the highest bucket in the histogram. The information
+ * for the highest bucket is encoded into the weight value.
+ */
+static uint64_t
+metaslab_segment_weight(metaslab_t *msp)
+{
+ metaslab_group_t *mg = msp->ms_group;
+ uint64_t weight = 0;
+ uint8_t shift = mg->mg_vd->vdev_ashift;
+
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+
+ /*
+ * The metaslab is completely free.
+ */
+ if (metaslab_allocated_space(msp) == 0) {
+ int idx = highbit64(msp->ms_size) - 1;
+ int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1;
+
+ if (idx < max_idx) {
+ WEIGHT_SET_COUNT(weight, 1ULL);
+ WEIGHT_SET_INDEX(weight, idx);
+ } else {
+ WEIGHT_SET_COUNT(weight, 1ULL << (idx - max_idx));
+ WEIGHT_SET_INDEX(weight, max_idx);
+ }
+ WEIGHT_SET_ACTIVE(weight, 0);
+ ASSERT(!WEIGHT_IS_SPACEBASED(weight));
+ return (weight);
+ }
+
+ ASSERT3U(msp->ms_sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t));
+
+ /*
+ * If the metaslab is fully allocated then just make the weight 0.
+ */
+ if (metaslab_allocated_space(msp) == msp->ms_size)
+ return (0);
+ /*
+ * If the metaslab is already loaded, then use the range tree to
+ * determine the weight. Otherwise, we rely on the space map information
+ * to generate the weight.
+ */
+ if (msp->ms_loaded) {
+ weight = metaslab_weight_from_range_tree(msp);
+ } else {
+ weight = metaslab_weight_from_spacemap(msp);
+ }
+
+ /*
+ * If the metaslab was active the last time we calculated its weight
+ * then keep it active. We want to consume the entire region that
+ * is associated with this weight.
+ */
+ if (msp->ms_activation_weight != 0 && weight != 0)
+ WEIGHT_SET_ACTIVE(weight, WEIGHT_GET_ACTIVE(msp->ms_weight));
+ return (weight);
+}
+
+/*
+ * Determine if we should attempt to allocate from this metaslab. If the
+ * metaslab is loaded, then we can determine if the desired allocation
+ * can be satisfied by looking at the size of the maximum free segment
+ * on that metaslab. Otherwise, we make our decision based on the metaslab's
+ * weight. For segment-based weighting we can determine the maximum
+ * allocation based on the index encoded in its value. For space-based
+ * weights we rely on the entire weight (excluding the weight-type bit).
+ */
+static boolean_t
+metaslab_should_allocate(metaslab_t *msp, uint64_t asize, boolean_t try_hard)
+{
+ /*
+ * If the metaslab is loaded, ms_max_size is definitive and we can use
+ * the fast check. If it's not, the ms_max_size is a lower bound (once
+ * set), and we should use the fast check as long as we're not in
+ * try_hard and it's been less than zfs_metaslab_max_size_cache_sec
+ * seconds since the metaslab was unloaded.
+ */
+ if (msp->ms_loaded ||
+ (msp->ms_max_size != 0 && !try_hard && gethrtime() <
+ msp->ms_unload_time + SEC2NSEC(zfs_metaslab_max_size_cache_sec)))
+ return (msp->ms_max_size >= asize);
+
+ boolean_t should_allocate;
+ if (!WEIGHT_IS_SPACEBASED(msp->ms_weight)) {
+ /*
+ * The metaslab segment weight indicates segments in the
+ * range [2^i, 2^(i+1)), where i is the index in the weight.
+ * Since the asize might be in the middle of the range, we
+ * should attempt the allocation if asize < 2^(i+1).
+ */
+ should_allocate = (asize <
+ 1ULL << (WEIGHT_GET_INDEX(msp->ms_weight) + 1));
+ } else {
+ should_allocate = (asize <=
+ (msp->ms_weight & ~METASLAB_WEIGHT_TYPE));
+ }
+
+ return (should_allocate);
+}
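+
+/*
+ * Editorial example (not part of the original source): a segment-based
+ * weight with WEIGHT_GET_INDEX() == 17 advertises free segments in
+ * [128K, 256K), so any asize strictly below 256K is worth attempting,
+ * while a 512K request is rejected without loading the metaslab. For a
+ * space-based weight the comparison is simply against the free-space
+ * value encoded in the weight (excluding the weight-type bit).
+ */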
+
+static uint64_t
+metaslab_weight(metaslab_t *msp, boolean_t nodirty)
+{
+ vdev_t *vd = msp->ms_group->mg_vd;
+ spa_t *spa = vd->vdev_spa;
+ uint64_t weight;
+
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+
+ metaslab_set_fragmentation(msp, nodirty);
+
+ /*
+ * Update the maximum size. If the metaslab is loaded, this will
+ * ensure that we get an accurate maximum size if newly freed space
+ * has been added back into the free tree. If the metaslab is
+ * unloaded, we check if there's a larger free segment in the
+ * unflushed frees. This is a lower bound on the largest allocatable
+ * segment size. Coalescing of adjacent entries may reveal larger
+ * allocatable segments, but we aren't aware of those until loading
+ * the space map into a range tree.
+ */
+ if (msp->ms_loaded) {
+ msp->ms_max_size = metaslab_largest_allocatable(msp);
+ } else {
+ msp->ms_max_size = MAX(msp->ms_max_size,
+ metaslab_largest_unflushed_free(msp));
+ }
+
+ /*
+ * Segment-based weighting requires space map histogram support.
+ */
+ if (zfs_metaslab_segment_weight_enabled &&
+ spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM) &&
+ (msp->ms_sm == NULL || msp->ms_sm->sm_dbuf->db_size ==
+ sizeof (space_map_phys_t))) {
+ weight = metaslab_segment_weight(msp);
+ } else {
+ weight = metaslab_space_weight(msp);
+ }
+ return (weight);
+}
+
+void
+metaslab_recalculate_weight_and_sort(metaslab_t *msp)
+{
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+
+ /* note: we preserve the mask (e.g. indication of primary, etc..) */
+ uint64_t was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
+ metaslab_group_sort(msp->ms_group, msp,
+ metaslab_weight(msp, B_FALSE) | was_active);
+}
+
+static int
+metaslab_activate_allocator(metaslab_group_t *mg, metaslab_t *msp,
+ int allocator, uint64_t activation_weight)
+{
+ metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator];
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+
+ /*
+ * If we're activating for the claim code, we don't want to actually
+ * set the metaslab up for a specific allocator.
+ */
+ if (activation_weight == METASLAB_WEIGHT_CLAIM) {
+ ASSERT0(msp->ms_activation_weight);
+ msp->ms_activation_weight = msp->ms_weight;
+ metaslab_group_sort(mg, msp, msp->ms_weight |
+ activation_weight);
+ return (0);
+ }
+
+ metaslab_t **mspp = (activation_weight == METASLAB_WEIGHT_PRIMARY ?
+ &mga->mga_primary : &mga->mga_secondary);
+
+ mutex_enter(&mg->mg_lock);
+ if (*mspp != NULL) {
+ mutex_exit(&mg->mg_lock);
+ return (EEXIST);
+ }
+
+ *mspp = msp;
+ ASSERT3S(msp->ms_allocator, ==, -1);
+ msp->ms_allocator = allocator;
+ msp->ms_primary = (activation_weight == METASLAB_WEIGHT_PRIMARY);
+
+ ASSERT0(msp->ms_activation_weight);
+ msp->ms_activation_weight = msp->ms_weight;
+ metaslab_group_sort_impl(mg, msp,
+ msp->ms_weight | activation_weight);
+ mutex_exit(&mg->mg_lock);
+
+ return (0);
+}
+
+static int
+metaslab_activate(metaslab_t *msp, int allocator, uint64_t activation_weight)
+{
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+
+ /*
+ * The current metaslab is already activated for us so there
+	 * is nothing to do. Being activated, though, doesn't mean that
+	 * this metaslab is activated for our allocator or with our
+ * requested activation weight. The metaslab could have started
+ * as an active one for our allocator but changed allocators
+ * while we were waiting to grab its ms_lock or we stole it
+ * [see find_valid_metaslab()]. This means that there is a
+ * possibility of passivating a metaslab of another allocator
+ * or from a different activation mask, from this thread.
+ */
+ if ((msp->ms_weight & METASLAB_ACTIVE_MASK) != 0) {
+ ASSERT(msp->ms_loaded);
+ return (0);
+ }
+
+ int error = metaslab_load(msp);
+ if (error != 0) {
+ metaslab_group_sort(msp->ms_group, msp, 0);
+ return (error);
+ }
+
+ /*
+ * When entering metaslab_load() we may have dropped the
+ * ms_lock because we were loading this metaslab, or we
+ * were waiting for another thread to load it for us. In
+ * that scenario, we recheck the weight of the metaslab
+ * to see if it was activated by another thread.
+ *
+ * If the metaslab was activated for another allocator or
+ * it was activated with a different activation weight (e.g.
+ * we wanted to make it a primary but it was activated as
+ * secondary) we return error (EBUSY).
+ *
+ * If the metaslab was activated for the same allocator
+ * and requested activation mask, skip activating it.
+ */
+ if ((msp->ms_weight & METASLAB_ACTIVE_MASK) != 0) {
+ if (msp->ms_allocator != allocator)
+ return (EBUSY);
+
+ if ((msp->ms_weight & activation_weight) == 0)
+ return (SET_ERROR(EBUSY));
+
+ EQUIV((activation_weight == METASLAB_WEIGHT_PRIMARY),
+ msp->ms_primary);
+ return (0);
+ }
+
+ /*
+ * If the metaslab has literally 0 space, it will have weight 0. In
+ * that case, don't bother activating it. This can happen if the
+ * metaslab had space during find_valid_metaslab, but another thread
+ * loaded it and used all that space while we were waiting to grab the
+ * lock.
+ */
+ if (msp->ms_weight == 0) {
+ ASSERT0(range_tree_space(msp->ms_allocatable));
+ return (SET_ERROR(ENOSPC));
+ }
+
+ if ((error = metaslab_activate_allocator(msp->ms_group, msp,
+ allocator, activation_weight)) != 0) {
+ return (error);
+ }
+
+ ASSERT(msp->ms_loaded);
+ ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
+
+ return (0);
+}
+
+static void
+metaslab_passivate_allocator(metaslab_group_t *mg, metaslab_t *msp,
+ uint64_t weight)
+{
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+ ASSERT(msp->ms_loaded);
+
+ if (msp->ms_weight & METASLAB_WEIGHT_CLAIM) {
+ metaslab_group_sort(mg, msp, weight);
+ return;
+ }
+
+ mutex_enter(&mg->mg_lock);
+ ASSERT3P(msp->ms_group, ==, mg);
+ ASSERT3S(0, <=, msp->ms_allocator);
+ ASSERT3U(msp->ms_allocator, <, mg->mg_allocators);
+
+ metaslab_group_allocator_t *mga = &mg->mg_allocator[msp->ms_allocator];
+ if (msp->ms_primary) {
+ ASSERT3P(mga->mga_primary, ==, msp);
+ ASSERT(msp->ms_weight & METASLAB_WEIGHT_PRIMARY);
+ mga->mga_primary = NULL;
+ } else {
+ ASSERT3P(mga->mga_secondary, ==, msp);
+ ASSERT(msp->ms_weight & METASLAB_WEIGHT_SECONDARY);
+ mga->mga_secondary = NULL;
+ }
+ msp->ms_allocator = -1;
+ metaslab_group_sort_impl(mg, msp, weight);
+ mutex_exit(&mg->mg_lock);
+}
+
+static void
+metaslab_passivate(metaslab_t *msp, uint64_t weight)
+{
+ uint64_t size __maybe_unused = weight & ~METASLAB_WEIGHT_TYPE;
+
+ /*
+ * If size < SPA_MINBLOCKSIZE, then we will not allocate from
+ * this metaslab again. In that case, it had better be empty,
+ * or we would be leaving space on the table.
+ */
+ ASSERT(!WEIGHT_IS_SPACEBASED(msp->ms_weight) ||
+ size >= SPA_MINBLOCKSIZE ||
+ range_tree_space(msp->ms_allocatable) == 0);
+ ASSERT0(weight & METASLAB_ACTIVE_MASK);
+
+ ASSERT(msp->ms_activation_weight != 0);
+ msp->ms_activation_weight = 0;
+ metaslab_passivate_allocator(msp->ms_group, msp, weight);
+ ASSERT0(msp->ms_weight & METASLAB_ACTIVE_MASK);
+}
+
+/*
+ * Segment-based metaslabs are activated once and remain active until
+ * we either fail an allocation attempt (similar to space-based metaslabs)
+ * or have exhausted the free space in zfs_metaslab_switch_threshold
+ * buckets since the metaslab was activated. This function checks to see
+ * if we've exhausted the zfs_metaslab_switch_threshold buckets in the
+ * metaslab and passivates it proactively. This will allow us to select a
+ * metaslab with a larger contiguous region, if any, remaining within this
+ * metaslab group. If we're in sync pass > 1, then we continue using this
+ * metaslab so that we don't dirty more blocks and cause more sync passes.
+ */
+static void
+metaslab_segment_may_passivate(metaslab_t *msp)
+{
+ spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
+
+ if (WEIGHT_IS_SPACEBASED(msp->ms_weight) || spa_sync_pass(spa) > 1)
+ return;
+
+ /*
+ * Since we are in the middle of a sync pass, the most accurate
+ * information that is accessible to us is the in-core range tree
+ * histogram; calculate the new weight based on that information.
+ */
+ uint64_t weight = metaslab_weight_from_range_tree(msp);
+ int activation_idx = WEIGHT_GET_INDEX(msp->ms_activation_weight);
+ int current_idx = WEIGHT_GET_INDEX(weight);
+
+ if (current_idx <= activation_idx - zfs_metaslab_switch_threshold)
+ metaslab_passivate(msp, weight);
+}
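+
+/*
+ * Editorial example (not part of the original source): if a metaslab was
+ * activated with a weight index of 23 (free segments of 8M-16M) and
+ * zfs_metaslab_switch_threshold is 2, it is passivated once the current
+ * range-tree weight index drops to 21 (2M-4M segments) or lower, freeing
+ * the allocator to pick a metaslab with larger contiguous regions.
+ */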
+
+static void
+metaslab_preload(void *arg)
+{
+ metaslab_t *msp = arg;
+ metaslab_class_t *mc = msp->ms_group->mg_class;
+ spa_t *spa = mc->mc_spa;
+ fstrans_cookie_t cookie = spl_fstrans_mark();
+
+ ASSERT(!MUTEX_HELD(&msp->ms_group->mg_lock));
+
+ mutex_enter(&msp->ms_lock);
+ (void) metaslab_load(msp);
+ metaslab_set_selected_txg(msp, spa_syncing_txg(spa));
+ mutex_exit(&msp->ms_lock);
+ spl_fstrans_unmark(cookie);
+}
+
+static void
+metaslab_group_preload(metaslab_group_t *mg)
+{
+ spa_t *spa = mg->mg_vd->vdev_spa;
+ metaslab_t *msp;
+ avl_tree_t *t = &mg->mg_metaslab_tree;
+ int m = 0;
+
+ if (spa_shutting_down(spa) || !metaslab_preload_enabled) {
+ taskq_wait_outstanding(mg->mg_taskq, 0);
+ return;
+ }
+
+ mutex_enter(&mg->mg_lock);
+
+ /*
+ * Load the next potential metaslabs
+ */
+ for (msp = avl_first(t); msp != NULL; msp = AVL_NEXT(t, msp)) {
+ ASSERT3P(msp->ms_group, ==, mg);
+
+ /*
+ * We preload only the maximum number of metaslabs specified
+ * by metaslab_preload_limit. If a metaslab is being forced
+ * to condense then we preload it too. This will ensure
+ * that force condensing happens in the next txg.
+ */
+ if (++m > metaslab_preload_limit && !msp->ms_condense_wanted) {
+ continue;
+ }
+
+ VERIFY(taskq_dispatch(mg->mg_taskq, metaslab_preload,
+ msp, TQ_SLEEP) != TASKQID_INVALID);
+ }
+ mutex_exit(&mg->mg_lock);
+}
+
+/*
+ * Determine if the space map's on-disk footprint is past our tolerance for
+ * inefficiency. We would like to use the following criteria to make our
+ * decision:
+ *
+ * 1. Do not condense if the size of the space map object would dramatically
+ * increase as a result of writing out the free space range tree.
+ *
+ * 2. Condense if the on-disk space map representation is at least
+ * zfs_condense_pct/100 times the size of the optimal representation
+ * (i.e. zfs_condense_pct = 110 and in-core = 1MB, optimal = 1.1MB).
+ *
+ * 3. Do not condense if the on-disk size of the space map does not actually
+ * decrease.
+ *
+ * Unfortunately, we cannot compute the on-disk size of the space map in this
+ * context because we cannot accurately compute the effects of compression, etc.
+ * Instead, we apply the heuristic described in the block comment for
+ * zfs_metaslab_condense_block_threshold - we only condense if the space used
+ * is greater than a threshold number of blocks.
+ */
+static boolean_t
+metaslab_should_condense(metaslab_t *msp)
+{
+ space_map_t *sm = msp->ms_sm;
+ vdev_t *vd = msp->ms_group->mg_vd;
+ uint64_t vdev_blocksize = 1 << vd->vdev_ashift;
+
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+ ASSERT(msp->ms_loaded);
+ ASSERT(sm != NULL);
+ ASSERT3U(spa_sync_pass(vd->vdev_spa), ==, 1);
+
+ /*
+ * We always condense metaslabs that are empty and metaslabs for
+ * which a condense request has been made.
+ */
+ if (range_tree_numsegs(msp->ms_allocatable) == 0 ||
+ msp->ms_condense_wanted)
+ return (B_TRUE);
+
+ uint64_t record_size = MAX(sm->sm_blksz, vdev_blocksize);
+ uint64_t object_size = space_map_length(sm);
+ uint64_t optimal_size = space_map_estimate_optimal_size(sm,
+ msp->ms_allocatable, SM_NO_VDEVID);
+
+ return (object_size >= (optimal_size * zfs_condense_pct / 100) &&
+ object_size > zfs_metaslab_condense_block_threshold * record_size);
+}
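+
+/*
+ * Worked example for the thresholds above (hypothetical values, not
+ * necessarily the tunable defaults): with zfs_condense_pct = 110, an
+ * estimated optimal size of 1MB, a record size of 128K and
+ * zfs_metaslab_condense_block_threshold = 16, a loaded metaslab is
+ * condensed only if its on-disk space map is both at least 1.1MB
+ * (110% of optimal) and larger than 16 * 128K = 2MB, so small space maps
+ * are left alone even when their representation is relatively inefficient.
+ */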
+
+/*
+ * Condense the on-disk space map representation to its minimized form.
+ * The minimized form consists of a small number of allocations followed
+ * by the entries of the free range tree (ms_allocatable). The condensed
+ * spacemap contains all the entries of previous TXGs (including those in
+ * the pool-wide log spacemaps; thus this is effectively a superset of
+ * metaslab_flush()), but this TXG's entries still need to be written.
+ */
+static void
+metaslab_condense(metaslab_t *msp, dmu_tx_t *tx)
+{
+ range_tree_t *condense_tree;
+ space_map_t *sm = msp->ms_sm;
+ uint64_t txg = dmu_tx_get_txg(tx);
+ spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
+
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+ ASSERT(msp->ms_loaded);
+ ASSERT(msp->ms_sm != NULL);
+
+ /*
+ * In order to condense the space map, we need to change it so it
+ * only describes which segments are currently allocated and free.
+ *
+ * All the current free space resides in the ms_allocatable, all
+ * the ms_defer trees, and all the ms_allocating trees. We ignore
+ * ms_freed because it is empty because we're in sync pass 1. We
+ * ignore ms_freeing because these changes are not yet reflected
+ * in the spacemap (they will be written later this txg).
+ *
+ * So to truncate the space map to represent all the entries of
+ * previous TXGs we do the following:
+ *
+ * 1] We create a range tree (condense tree) that is 100% empty.
+ * 2] We add to it all segments found in the ms_defer trees
+ * as those segments are marked as free in the original space
+ * map. We do the same with the ms_allocating trees for the same
+ * reason. Adding these segments should be a relatively
+ * inexpensive operation since we expect these trees to have a
+ * small number of nodes.
+ * 3] We vacate any unflushed allocs, since they are not frees we
+ * need to add to the condense tree. Then we vacate any
+ * unflushed frees as they should already be part of ms_allocatable.
+ * 4] At this point, we would ideally like to add all segments
+ * in the ms_allocatable tree to the condense tree. This way
+ * we would write all the entries of the condense tree as the
+ * condensed space map, which would only contain freed
+ * segments with everything else assumed to be allocated.
+ *
+ * Doing so can be prohibitively expensive as ms_allocatable can
+ * be large, and therefore computationally expensive to add to
+ * the condense_tree. Instead we first sync out an entry marking
+ * everything as allocated, then the condense_tree and then the
+ * ms_allocatable, in the condensed space map. While this is not
+ * optimal, it is typically close to optimal and more importantly
+ * much cheaper to compute.
+ *
+ * 5] Finally, as both of the unflushed trees were written to our
+ * new and condensed metaslab space map, we basically flushed
+ * all the unflushed changes to disk, thus we call
+ * metaslab_flush_update().
+ */
+ ASSERT3U(spa_sync_pass(spa), ==, 1);
+ ASSERT(range_tree_is_empty(msp->ms_freed)); /* since it is pass 1 */
+
+ zfs_dbgmsg("condensing: txg %llu, msp[%llu] %px, vdev id %llu, "
+ "spa %s, smp size %llu, segments %lu, forcing condense=%s", txg,
+ msp->ms_id, msp, msp->ms_group->mg_vd->vdev_id,
+ spa->spa_name, space_map_length(msp->ms_sm),
+ range_tree_numsegs(msp->ms_allocatable),
+ msp->ms_condense_wanted ? "TRUE" : "FALSE");
+
+ msp->ms_condense_wanted = B_FALSE;
+
+ range_seg_type_t type;
+ uint64_t shift, start;
+ type = metaslab_calculate_range_tree_type(msp->ms_group->mg_vd, msp,
+ &start, &shift);
+
+ condense_tree = range_tree_create(NULL, type, NULL, start, shift);
+
+ for (int t = 0; t < TXG_DEFER_SIZE; t++) {
+ range_tree_walk(msp->ms_defer[t],
+ range_tree_add, condense_tree);
+ }
+
+ for (int t = 0; t < TXG_CONCURRENT_STATES; t++) {
+ range_tree_walk(msp->ms_allocating[(txg + t) & TXG_MASK],
+ range_tree_add, condense_tree);
+ }
+
+ ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=,
+ metaslab_unflushed_changes_memused(msp));
+ spa->spa_unflushed_stats.sus_memused -=
+ metaslab_unflushed_changes_memused(msp);
+ range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL);
+ range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL);
+
+ /*
+ * We're about to drop the metaslab's lock thus allowing other
+ * consumers to change its contents. Set the metaslab's ms_condensing
+ * flag to ensure that allocations on this metaslab do not occur
+ * while we're in the middle of committing it to disk. This is only
+ * critical for ms_allocatable as all other range trees use per TXG
+ * views of their content.
+ */
+ msp->ms_condensing = B_TRUE;
+
+ mutex_exit(&msp->ms_lock);
+ uint64_t object = space_map_object(msp->ms_sm);
+ space_map_truncate(sm,
+ spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP) ?
+ zfs_metaslab_sm_blksz_with_log : zfs_metaslab_sm_blksz_no_log, tx);
+
+ /*
+ * space_map_truncate() may have reallocated the spacemap object.
+ * If so, update the vdev_ms_array.
+ */
+ if (space_map_object(msp->ms_sm) != object) {
+ object = space_map_object(msp->ms_sm);
+ dmu_write(spa->spa_meta_objset,
+ msp->ms_group->mg_vd->vdev_ms_array, sizeof (uint64_t) *
+ msp->ms_id, sizeof (uint64_t), &object, tx);
+ }
+
+ /*
+ * Note:
+ * When the log space map feature is enabled, each space map will
+ * always have ALLOCS followed by FREES for each sync pass. This is
+ * typically true even when the log space map feature is disabled,
+ * except in the case where a metaslab goes through metaslab_sync()
+ * and gets condensed. In that case the metaslab's space map will have
+ * ALLOCS followed by FREES (due to condensing) followed by ALLOCS
+ * followed by FREES (due to space_map_write() in metaslab_sync()) for
+ * sync pass 1.
+ */
+ range_tree_t *tmp_tree = range_tree_create(NULL, type, NULL, start,
+ shift);
+ range_tree_add(tmp_tree, msp->ms_start, msp->ms_size);
+ space_map_write(sm, tmp_tree, SM_ALLOC, SM_NO_VDEVID, tx);
+ space_map_write(sm, msp->ms_allocatable, SM_FREE, SM_NO_VDEVID, tx);
+ space_map_write(sm, condense_tree, SM_FREE, SM_NO_VDEVID, tx);
+
+ range_tree_vacate(condense_tree, NULL, NULL);
+ range_tree_destroy(condense_tree);
+ range_tree_vacate(tmp_tree, NULL, NULL);
+ range_tree_destroy(tmp_tree);
+ mutex_enter(&msp->ms_lock);
+
+ msp->ms_condensing = B_FALSE;
+ metaslab_flush_update(msp, tx);
+}
+
+/*
+ * Called when the metaslab has been flushed (its own spacemap now reflects
+ * all the contents of the pool-wide spacemap log). Updates the metaslab's
+ * metadata and any pool-wide related log space map data (e.g. summary,
+ * obsolete logs, etc..) to reflect that.
+ */
+static void
+metaslab_flush_update(metaslab_t *msp, dmu_tx_t *tx)
+{
+ metaslab_group_t *mg = msp->ms_group;
+ spa_t *spa = mg->mg_vd->vdev_spa;
+
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+
+ ASSERT3U(spa_sync_pass(spa), ==, 1);
+ ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs));
+ ASSERT(range_tree_is_empty(msp->ms_unflushed_frees));
+
+ /*
+ * Just because a metaslab got flushed, that doesn't mean that
+ * it will pass through metaslab_sync_done(). Thus, make sure to
+ * update ms_synced_length here in case it doesn't.
+ */
+ msp->ms_synced_length = space_map_length(msp->ms_sm);
+
+ /*
+ * We may end up here from metaslab_condense() without the
+ * feature being active. In that case this is a no-op.
+ */
+ if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP))
+ return;
+
+ ASSERT(spa_syncing_log_sm(spa) != NULL);
+ ASSERT(msp->ms_sm != NULL);
+ ASSERT(metaslab_unflushed_txg(msp) != 0);
+ ASSERT3P(avl_find(&spa->spa_metaslabs_by_flushed, msp, NULL), ==, msp);
+
+ VERIFY3U(tx->tx_txg, <=, spa_final_dirty_txg(spa));
+
+ /* update metaslab's position in our flushing tree */
+ uint64_t ms_prev_flushed_txg = metaslab_unflushed_txg(msp);
+ mutex_enter(&spa->spa_flushed_ms_lock);
+ avl_remove(&spa->spa_metaslabs_by_flushed, msp);
+ metaslab_set_unflushed_txg(msp, spa_syncing_txg(spa), tx);
+ avl_add(&spa->spa_metaslabs_by_flushed, msp);
+ mutex_exit(&spa->spa_flushed_ms_lock);
+
+ /* update metaslab counts of spa_log_sm_t nodes */
+ spa_log_sm_decrement_mscount(spa, ms_prev_flushed_txg);
+ spa_log_sm_increment_current_mscount(spa);
+
+ /* cleanup obsolete logs if any */
+ uint64_t log_blocks_before = spa_log_sm_nblocks(spa);
+ spa_cleanup_old_sm_logs(spa, tx);
+ uint64_t log_blocks_after = spa_log_sm_nblocks(spa);
+ VERIFY3U(log_blocks_after, <=, log_blocks_before);
+
+ /* update log space map summary */
+ uint64_t blocks_gone = log_blocks_before - log_blocks_after;
+ spa_log_summary_add_flushed_metaslab(spa);
+ spa_log_summary_decrement_mscount(spa, ms_prev_flushed_txg);
+ spa_log_summary_decrement_blkcount(spa, blocks_gone);
+}
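+
+/*
+ * Rough illustration of the bookkeeping above (hypothetical numbers): if
+ * this metaslab had last been flushed at txg 100, it is re-inserted into
+ * spa_metaslabs_by_flushed under the currently syncing txg, the per-txg
+ * metaslab counts shift from txg 100 to the current txg, and if
+ * spa_cleanup_old_sm_logs() freed 8 obsolete log blocks the log space map
+ * summary is charged 8 fewer blocks.
+ */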
+
+boolean_t
+metaslab_flush(metaslab_t *msp, dmu_tx_t *tx)
+{
+ spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
+
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+ ASSERT3U(spa_sync_pass(spa), ==, 1);
+ ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP));
+
+ ASSERT(msp->ms_sm != NULL);
+ ASSERT(metaslab_unflushed_txg(msp) != 0);
+ ASSERT(avl_find(&spa->spa_metaslabs_by_flushed, msp, NULL) != NULL);
+
+ /*
+ * There is nothing wrong with flushing the same metaslab twice, as
+ * this codepath should work in that case. However, the current
+ * flushing scheme makes sure to avoid this situation as we would be
+ * making all these calls without having anything meaningful to write
+ * to disk. We assert this behavior here.
+ */
+ ASSERT3U(metaslab_unflushed_txg(msp), <, dmu_tx_get_txg(tx));
+
+ /*
+ * We can not flush while loading, because then we would
+ * not load the ms_unflushed_{allocs,frees}.
+ */
+ if (msp->ms_loading)
+ return (B_FALSE);
+
+ metaslab_verify_space(msp, dmu_tx_get_txg(tx));
+ metaslab_verify_weight_and_frag(msp);
+
+ /*
+ * Metaslab condensing is effectively flushing. Therefore if the
+ * metaslab can be condensed we can just condense it instead of
+ * flushing it.
+ *
+ * Note that metaslab_condense() does call metaslab_flush_update()
+ * so we can just return immediately after condensing. We also
+ * don't need to care about setting ms_flushing or broadcasting
+ * ms_flush_cv, even if we temporarily drop the ms_lock in
+ * metaslab_condense(), as the metaslab is already loaded.
+ */
+ if (msp->ms_loaded && metaslab_should_condense(msp)) {
+ metaslab_group_t *mg = msp->ms_group;
+
+ /*
+ * For all histogram operations below refer to the
+ * comments of metaslab_sync() where we follow a
+ * similar procedure.
+ */
+ metaslab_group_histogram_verify(mg);
+ metaslab_class_histogram_verify(mg->mg_class);
+ metaslab_group_histogram_remove(mg, msp);
+
+ metaslab_condense(msp, tx);
+
+ space_map_histogram_clear(msp->ms_sm);
+ space_map_histogram_add(msp->ms_sm, msp->ms_allocatable, tx);
+ ASSERT(range_tree_is_empty(msp->ms_freed));
+ for (int t = 0; t < TXG_DEFER_SIZE; t++) {
+ space_map_histogram_add(msp->ms_sm,
+ msp->ms_defer[t], tx);
+ }
+ metaslab_aux_histograms_update(msp);
+
+ metaslab_group_histogram_add(mg, msp);
+ metaslab_group_histogram_verify(mg);
+ metaslab_class_histogram_verify(mg->mg_class);
+
+ metaslab_verify_space(msp, dmu_tx_get_txg(tx));
+
+ /*
+ * Since we recreated the histogram (and potentially
+ * the ms_sm too while condensing) ensure that the
+ * weight is updated too because we are not guaranteed
+ * that this metaslab is dirty and will go through
+ * metaslab_sync_done().
+ */
+ metaslab_recalculate_weight_and_sort(msp);
+ return (B_TRUE);
+ }
+
+ msp->ms_flushing = B_TRUE;
+ uint64_t sm_len_before = space_map_length(msp->ms_sm);
+
+ mutex_exit(&msp->ms_lock);
+ space_map_write(msp->ms_sm, msp->ms_unflushed_allocs, SM_ALLOC,
+ SM_NO_VDEVID, tx);
+ space_map_write(msp->ms_sm, msp->ms_unflushed_frees, SM_FREE,
+ SM_NO_VDEVID, tx);
+ mutex_enter(&msp->ms_lock);
+
+ uint64_t sm_len_after = space_map_length(msp->ms_sm);
+ if (zfs_flags & ZFS_DEBUG_LOG_SPACEMAP) {
+ zfs_dbgmsg("flushing: txg %llu, spa %s, vdev_id %llu, "
+ "ms_id %llu, unflushed_allocs %llu, unflushed_frees %llu, "
+ "appended %llu bytes", dmu_tx_get_txg(tx), spa_name(spa),
+ msp->ms_group->mg_vd->vdev_id, msp->ms_id,
+ range_tree_space(msp->ms_unflushed_allocs),
+ range_tree_space(msp->ms_unflushed_frees),
+ (sm_len_after - sm_len_before));
+ }
+
+ ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=,
+ metaslab_unflushed_changes_memused(msp));
+ spa->spa_unflushed_stats.sus_memused -=
+ metaslab_unflushed_changes_memused(msp);
+ range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL);
+ range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL);
+
+ metaslab_verify_space(msp, dmu_tx_get_txg(tx));
+ metaslab_verify_weight_and_frag(msp);
+
+ metaslab_flush_update(msp, tx);
+
+ metaslab_verify_space(msp, dmu_tx_get_txg(tx));
+ metaslab_verify_weight_and_frag(msp);
+
+ msp->ms_flushing = B_FALSE;
+ cv_broadcast(&msp->ms_flush_cv);
+ return (B_TRUE);
+}
+
+/*
+ * Write a metaslab to disk in the context of the specified transaction group.
+ */
+void
+metaslab_sync(metaslab_t *msp, uint64_t txg)
+{
+ metaslab_group_t *mg = msp->ms_group;
+ vdev_t *vd = mg->mg_vd;
+ spa_t *spa = vd->vdev_spa;
+ objset_t *mos = spa_meta_objset(spa);
+ range_tree_t *alloctree = msp->ms_allocating[txg & TXG_MASK];
+ dmu_tx_t *tx;
+
+ ASSERT(!vd->vdev_ishole);
+
+ /*
+ * This metaslab has just been added so there's no work to do now.
+ */
+ if (msp->ms_freeing == NULL) {
+ ASSERT3P(alloctree, ==, NULL);
+ return;
+ }
+
+ ASSERT3P(alloctree, !=, NULL);
+ ASSERT3P(msp->ms_freeing, !=, NULL);
+ ASSERT3P(msp->ms_freed, !=, NULL);
+ ASSERT3P(msp->ms_checkpointing, !=, NULL);
+ ASSERT3P(msp->ms_trim, !=, NULL);
+
+ /*
+ * Normally, we don't want to process a metaslab if there are no
+ * allocations or frees to perform. However, if the metaslab is being
+ * forced to condense, it's loaded and we're not beyond the final
+ * dirty txg, we need to let it through. Not condensing beyond the
+ * final dirty txg prevents an issue where metaslabs that need to be
+ * condensed but were loaded for other reasons could cause a panic
+ * here. By only checking the txg in that branch of the conditional,
+ * we preserve the utility of the VERIFY statements in all other
+ * cases.
+ */
+ if (range_tree_is_empty(alloctree) &&
+ range_tree_is_empty(msp->ms_freeing) &&
+ range_tree_is_empty(msp->ms_checkpointing) &&
+ !(msp->ms_loaded && msp->ms_condense_wanted &&
+ txg <= spa_final_dirty_txg(spa)))
+ return;
+
+
+ VERIFY3U(txg, <=, spa_final_dirty_txg(spa));
+
+ /*
+ * The only state that can actually be changing concurrently
+ * with metaslab_sync() is the metaslab's ms_allocatable. No
+ * other thread can be modifying this txg's alloc, freeing,
+ * freed, or space_map_phys_t. We drop ms_lock whenever we
+ * could call into the DMU, because the DMU can call down to
+ * us (e.g. via zio_free()) at any time.
+ *
+ * The spa_vdev_remove_thread() can be reading metaslab state
+ * concurrently, and it is locked out by the ms_sync_lock.
+ * Note that the ms_lock is insufficient for this, because it
+ * is dropped by space_map_write().
+ */
+ tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
+
+ /*
+ * Generate a log space map if one doesn't exist already.
+ */
+ spa_generate_syncing_log_sm(spa, tx);
+
+ if (msp->ms_sm == NULL) {
+ uint64_t new_object = space_map_alloc(mos,
+ spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP) ?
+ zfs_metaslab_sm_blksz_with_log :
+ zfs_metaslab_sm_blksz_no_log, tx);
+ VERIFY3U(new_object, !=, 0);
+
+ dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) *
+ msp->ms_id, sizeof (uint64_t), &new_object, tx);
+
+ VERIFY0(space_map_open(&msp->ms_sm, mos, new_object,
+ msp->ms_start, msp->ms_size, vd->vdev_ashift));
+ ASSERT(msp->ms_sm != NULL);
+
+ ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs));
+ ASSERT(range_tree_is_empty(msp->ms_unflushed_frees));
+ ASSERT0(metaslab_allocated_space(msp));
+ }
+
+ if (metaslab_unflushed_txg(msp) == 0 &&
+ spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) {
+ ASSERT(spa_syncing_log_sm(spa) != NULL);
+
+ metaslab_set_unflushed_txg(msp, spa_syncing_txg(spa), tx);
+ spa_log_sm_increment_current_mscount(spa);
+ spa_log_summary_add_flushed_metaslab(spa);
+
+ ASSERT(msp->ms_sm != NULL);
+ mutex_enter(&spa->spa_flushed_ms_lock);
+ avl_add(&spa->spa_metaslabs_by_flushed, msp);
+ mutex_exit(&spa->spa_flushed_ms_lock);
+
+ ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs));
+ ASSERT(range_tree_is_empty(msp->ms_unflushed_frees));
+ }
+
+ if (!range_tree_is_empty(msp->ms_checkpointing) &&
+ vd->vdev_checkpoint_sm == NULL) {
+ ASSERT(spa_has_checkpoint(spa));
+
+ uint64_t new_object = space_map_alloc(mos,
+ zfs_vdev_standard_sm_blksz, tx);
+ VERIFY3U(new_object, !=, 0);
+
+ VERIFY0(space_map_open(&vd->vdev_checkpoint_sm,
+ mos, new_object, 0, vd->vdev_asize, vd->vdev_ashift));
+ ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL);
+
+ /*
+ * We save the space map object as an entry in vdev_top_zap
+ * so it can be retrieved when the pool is reopened after an
+ * export or through zdb.
+ */
+ VERIFY0(zap_add(vd->vdev_spa->spa_meta_objset,
+ vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM,
+ sizeof (new_object), 1, &new_object, tx));
+ }
+
+ mutex_enter(&msp->ms_sync_lock);
+ mutex_enter(&msp->ms_lock);
+
+ /*
+ * Note: metaslab_condense() clears the space map's histogram.
+ * Therefore we must verify and remove this histogram before
+ * condensing.
+ */
+ metaslab_group_histogram_verify(mg);
+ metaslab_class_histogram_verify(mg->mg_class);
+ metaslab_group_histogram_remove(mg, msp);
+
+ if (spa->spa_sync_pass == 1 && msp->ms_loaded &&
+ metaslab_should_condense(msp))
+ metaslab_condense(msp, tx);
+
+ /*
+ * We'll be going to disk to sync our space accounting, thus we
+ * drop the ms_lock during that time so allocations coming from
+ * open-context (ZIL) for future TXGs do not block.
+ */
+ mutex_exit(&msp->ms_lock);
+ space_map_t *log_sm = spa_syncing_log_sm(spa);
+ if (log_sm != NULL) {
+ ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP));
+
+ space_map_write(log_sm, alloctree, SM_ALLOC,
+ vd->vdev_id, tx);
+ space_map_write(log_sm, msp->ms_freeing, SM_FREE,
+ vd->vdev_id, tx);
+ mutex_enter(&msp->ms_lock);
+
+ ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=,
+ metaslab_unflushed_changes_memused(msp));
+ spa->spa_unflushed_stats.sus_memused -=
+ metaslab_unflushed_changes_memused(msp);
+ range_tree_remove_xor_add(alloctree,
+ msp->ms_unflushed_frees, msp->ms_unflushed_allocs);
+ range_tree_remove_xor_add(msp->ms_freeing,
+ msp->ms_unflushed_allocs, msp->ms_unflushed_frees);
+ spa->spa_unflushed_stats.sus_memused +=
+ metaslab_unflushed_changes_memused(msp);
+ } else {
+ ASSERT(!spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP));
+
+ space_map_write(msp->ms_sm, alloctree, SM_ALLOC,
+ SM_NO_VDEVID, tx);
+ space_map_write(msp->ms_sm, msp->ms_freeing, SM_FREE,
+ SM_NO_VDEVID, tx);
+ mutex_enter(&msp->ms_lock);
+ }
+
+ msp->ms_allocated_space += range_tree_space(alloctree);
+ ASSERT3U(msp->ms_allocated_space, >=,
+ range_tree_space(msp->ms_freeing));
+ msp->ms_allocated_space -= range_tree_space(msp->ms_freeing);
+
+ if (!range_tree_is_empty(msp->ms_checkpointing)) {
+ ASSERT(spa_has_checkpoint(spa));
+ ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL);
+
+ /*
+ * Since we are doing writes to disk and the ms_checkpointing
+ * tree won't be changing during that time, we drop the
+ * ms_lock while writing to the checkpoint space map, for the
+ * same reason mentioned above.
+ */
+ mutex_exit(&msp->ms_lock);
+ space_map_write(vd->vdev_checkpoint_sm,
+ msp->ms_checkpointing, SM_FREE, SM_NO_VDEVID, tx);
+ mutex_enter(&msp->ms_lock);
+
+ spa->spa_checkpoint_info.sci_dspace +=
+ range_tree_space(msp->ms_checkpointing);
+ vd->vdev_stat.vs_checkpoint_space +=
+ range_tree_space(msp->ms_checkpointing);
+ ASSERT3U(vd->vdev_stat.vs_checkpoint_space, ==,
+ -space_map_allocated(vd->vdev_checkpoint_sm));
+
+ range_tree_vacate(msp->ms_checkpointing, NULL, NULL);
+ }
+
+ if (msp->ms_loaded) {
+ /*
+ * When the space map is loaded, we have an accurate
+ * histogram in the range tree. This gives us an opportunity
+ * to bring the space map's histogram up-to-date so we clear
+ * it first before updating it.
+ */
+ space_map_histogram_clear(msp->ms_sm);
+ space_map_histogram_add(msp->ms_sm, msp->ms_allocatable, tx);
+
+ /*
+ * Since we've cleared the histogram we need to add back
+ * any free space that has already been processed, plus
+ * any deferred space. This allows the on-disk histogram
+ * to accurately reflect all free space even if some space
+ * is not yet available for allocation (i.e. deferred).
+ */
+ space_map_histogram_add(msp->ms_sm, msp->ms_freed, tx);
+
+ /*
+ * Add back any deferred free space that has not been
+ * added back into the in-core free tree yet. This will
+ * ensure that we don't end up with a space map histogram
+ * that is completely empty unless the metaslab is fully
+ * allocated.
+ */
+ for (int t = 0; t < TXG_DEFER_SIZE; t++) {
+ space_map_histogram_add(msp->ms_sm,
+ msp->ms_defer[t], tx);
+ }
+ }
+
+ /*
+ * Always add the free space from this sync pass to the space
+ * map histogram. We want to make sure that the on-disk histogram
+ * accounts for all free space. If the space map is not loaded,
+ * then we will lose some accuracy but will correct it the next
+ * time we load the space map.
+ */
+ space_map_histogram_add(msp->ms_sm, msp->ms_freeing, tx);
+ metaslab_aux_histograms_update(msp);
+
+ metaslab_group_histogram_add(mg, msp);
+ metaslab_group_histogram_verify(mg);
+ metaslab_class_histogram_verify(mg->mg_class);
+
+ /*
+ * For sync pass 1, we avoid traversing this txg's free range tree
+ * and instead will just swap the pointers for freeing and freed.
+ * We can safely do this since the freed_tree is guaranteed to be
+ * empty on the initial pass.
+ *
+ * Keep in mind that even if we are currently using a log spacemap
+ * we want current frees to end up in the ms_allocatable (but not
+ * get appended to the ms_sm) so their ranges can be reused as usual.
+ */
+ if (spa_sync_pass(spa) == 1) {
+ range_tree_swap(&msp->ms_freeing, &msp->ms_freed);
+ ASSERT0(msp->ms_allocated_this_txg);
+ } else {
+ range_tree_vacate(msp->ms_freeing,
+ range_tree_add, msp->ms_freed);
+ }
+ msp->ms_allocated_this_txg += range_tree_space(alloctree);
+ range_tree_vacate(alloctree, NULL, NULL);
+
+ ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK]));
+ ASSERT0(range_tree_space(msp->ms_allocating[TXG_CLEAN(txg)
+ & TXG_MASK]));
+ ASSERT0(range_tree_space(msp->ms_freeing));
+ ASSERT0(range_tree_space(msp->ms_checkpointing));
+
+ mutex_exit(&msp->ms_lock);
+
+ /*
+ * Verify that the space map object ID has been recorded in the
+ * vdev_ms_array.
+ */
+ uint64_t object;
+ VERIFY0(dmu_read(mos, vd->vdev_ms_array,
+ msp->ms_id * sizeof (uint64_t), sizeof (uint64_t), &object, 0));
+ VERIFY3U(object, ==, space_map_object(msp->ms_sm));
+
+ mutex_exit(&msp->ms_sync_lock);
+ dmu_tx_commit(tx);
+}
+
+static void
+metaslab_evict(metaslab_t *msp, uint64_t txg)
+{
+ if (!msp->ms_loaded || msp->ms_disabled != 0)
+ return;
+
+ for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
+ VERIFY0(range_tree_space(
+ msp->ms_allocating[(txg + t) & TXG_MASK]));
+ }
+ if (msp->ms_allocator != -1)
+ metaslab_passivate(msp, msp->ms_weight & ~METASLAB_ACTIVE_MASK);
+
+ if (!metaslab_debug_unload)
+ metaslab_unload(msp);
+}
+
+/*
+ * Called after a transaction group has completely synced to mark
+ * all of the metaslab's free space as usable.
+ */
+void
+metaslab_sync_done(metaslab_t *msp, uint64_t txg)
+{
+ metaslab_group_t *mg = msp->ms_group;
+ vdev_t *vd = mg->mg_vd;
+ spa_t *spa = vd->vdev_spa;
+ range_tree_t **defer_tree;
+ int64_t alloc_delta, defer_delta;
+ boolean_t defer_allowed = B_TRUE;
+
+ ASSERT(!vd->vdev_ishole);
+
+ mutex_enter(&msp->ms_lock);
+
+ /*
+ * If this metaslab is just becoming available, initialize its
+ * range trees and add its capacity to the vdev.
+ */
+ if (msp->ms_freed == NULL) {
+ range_seg_type_t type;
+ uint64_t shift, start;
+ type = metaslab_calculate_range_tree_type(vd, msp, &start,
+ &shift);
+
+ for (int t = 0; t < TXG_SIZE; t++) {
+ ASSERT(msp->ms_allocating[t] == NULL);
+
+ msp->ms_allocating[t] = range_tree_create(NULL, type,
+ NULL, start, shift);
+ }
+
+ ASSERT3P(msp->ms_freeing, ==, NULL);
+ msp->ms_freeing = range_tree_create(NULL, type, NULL, start,
+ shift);
+
+ ASSERT3P(msp->ms_freed, ==, NULL);
+ msp->ms_freed = range_tree_create(NULL, type, NULL, start,
+ shift);
+
+ for (int t = 0; t < TXG_DEFER_SIZE; t++) {
+ ASSERT3P(msp->ms_defer[t], ==, NULL);
+ msp->ms_defer[t] = range_tree_create(NULL, type, NULL,
+ start, shift);
+ }
+
+ ASSERT3P(msp->ms_checkpointing, ==, NULL);
+ msp->ms_checkpointing = range_tree_create(NULL, type, NULL,
+ start, shift);
+
+ ASSERT3P(msp->ms_unflushed_allocs, ==, NULL);
+ msp->ms_unflushed_allocs = range_tree_create(NULL, type, NULL,
+ start, shift);
+
+ metaslab_rt_arg_t *mrap = kmem_zalloc(sizeof (*mrap), KM_SLEEP);
+ mrap->mra_bt = &msp->ms_unflushed_frees_by_size;
+ mrap->mra_floor_shift = metaslab_by_size_min_shift;
+ ASSERT3P(msp->ms_unflushed_frees, ==, NULL);
+ msp->ms_unflushed_frees = range_tree_create(&metaslab_rt_ops,
+ type, mrap, start, shift);
+
+ metaslab_space_update(vd, mg->mg_class, 0, 0, msp->ms_size);
+ }
+ ASSERT0(range_tree_space(msp->ms_freeing));
+ ASSERT0(range_tree_space(msp->ms_checkpointing));
+
+ defer_tree = &msp->ms_defer[txg % TXG_DEFER_SIZE];
+
+ uint64_t free_space = metaslab_class_get_space(spa_normal_class(spa)) -
+ metaslab_class_get_alloc(spa_normal_class(spa));
+ if (free_space <= spa_get_slop_space(spa) || vd->vdev_removing) {
+ defer_allowed = B_FALSE;
+ }
+
+ defer_delta = 0;
+ alloc_delta = msp->ms_allocated_this_txg -
+ range_tree_space(msp->ms_freed);
+
+ if (defer_allowed) {
+ defer_delta = range_tree_space(msp->ms_freed) -
+ range_tree_space(*defer_tree);
+ } else {
+ defer_delta -= range_tree_space(*defer_tree);
+ }
+ metaslab_space_update(vd, mg->mg_class, alloc_delta + defer_delta,
+ defer_delta, 0);
+
+ if (spa_syncing_log_sm(spa) == NULL) {
+ /*
+ * If there's a metaslab_load() in progress and we don't have
+ * a log space map, it means that we probably wrote to the
+ * metaslab's space map. If this is the case, we need to
+ * make sure that we wait for the load to complete so that we
+ * have a consistent view at the in-core side of the metaslab.
+ */
+ metaslab_load_wait(msp);
+ } else {
+ ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP));
+ }
+
+ /*
+ * When auto-trimming is enabled, free ranges which are added to
+ * ms_allocatable are also added to ms_trim. The ms_trim tree is
+ * periodically consumed by the vdev_autotrim_thread() which issues
+ * trims for all ranges and then vacates the tree. The ms_trim tree
+ * can be discarded at any time with the sole consequence of recent
+ * frees not being trimmed.
+ */
+ if (spa_get_autotrim(spa) == SPA_AUTOTRIM_ON) {
+ range_tree_walk(*defer_tree, range_tree_add, msp->ms_trim);
+ if (!defer_allowed) {
+ range_tree_walk(msp->ms_freed, range_tree_add,
+ msp->ms_trim);
+ }
+ } else {
+ range_tree_vacate(msp->ms_trim, NULL, NULL);
+ }
+
+ /*
+ * Move the frees from the defer_tree back to the free
+ * range tree (if it's loaded). Swap the freed_tree and
+ * the defer_tree -- this is safe to do because we've
+ * just emptied out the defer_tree.
+ */
+ range_tree_vacate(*defer_tree,
+ msp->ms_loaded ? range_tree_add : NULL, msp->ms_allocatable);
+ if (defer_allowed) {
+ range_tree_swap(&msp->ms_freed, defer_tree);
+ } else {
+ range_tree_vacate(msp->ms_freed,
+ msp->ms_loaded ? range_tree_add : NULL,
+ msp->ms_allocatable);
+ }
+
+ msp->ms_synced_length = space_map_length(msp->ms_sm);
+
+ msp->ms_deferspace += defer_delta;
+ ASSERT3S(msp->ms_deferspace, >=, 0);
+ ASSERT3S(msp->ms_deferspace, <=, msp->ms_size);
+ if (msp->ms_deferspace != 0) {
+ /*
+ * Keep syncing this metaslab until all deferred frees
+ * are back in circulation.
+ */
+ vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
+ }
+ metaslab_aux_histograms_update_done(msp, defer_allowed);
+
+ if (msp->ms_new) {
+ msp->ms_new = B_FALSE;
+ mutex_enter(&mg->mg_lock);
+ mg->mg_ms_ready++;
+ mutex_exit(&mg->mg_lock);
+ }
+
+ /*
+ * Re-sort metaslab within its group now that we've adjusted
+ * its allocatable space.
+ */
+ metaslab_recalculate_weight_and_sort(msp);
+
+ ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK]));
+ ASSERT0(range_tree_space(msp->ms_freeing));
+ ASSERT0(range_tree_space(msp->ms_freed));
+ ASSERT0(range_tree_space(msp->ms_checkpointing));
+ msp->ms_allocating_total -= msp->ms_allocated_this_txg;
+ msp->ms_allocated_this_txg = 0;
+ mutex_exit(&msp->ms_lock);
+}
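+
+/*
+ * Rough worked example of the space accounting above (hypothetical
+ * numbers): if 10MB were allocated from this metaslab this txg, 4MB were
+ * freed into ms_freed and the defer tree being rotated out held 3MB, then
+ * alloc_delta = 10MB - 4MB = 6MB and, with deferral allowed,
+ * defer_delta = 4MB - 3MB = 1MB. The vdev's allocated space is therefore
+ * charged alloc_delta + defer_delta = 7MB (the 3MB of old deferred frees
+ * return to circulation immediately) and ms_deferspace grows by 1MB; the
+ * newly deferred 4MB become allocatable again when this defer slot is
+ * processed TXG_DEFER_SIZE txgs later.
+ */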
+
+void
+metaslab_sync_reassess(metaslab_group_t *mg)
+{
+ spa_t *spa = mg->mg_class->mc_spa;
+
+ spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
+ metaslab_group_alloc_update(mg);
+ mg->mg_fragmentation = metaslab_group_fragmentation(mg);
+
+ /*
+ * Preload the next potential metaslabs but only on active
+ * metaslab groups. We can get into a state where the metaslab
+ * is no longer active since we dirty metaslabs as we remove
+ * a device, thus potentially making the metaslab group eligible
+ * for preloading.
+ */
+ if (mg->mg_activation_count > 0) {
+ metaslab_group_preload(mg);
+ }
+ spa_config_exit(spa, SCL_ALLOC, FTAG);
+}
+
+/*
+ * When writing a ditto block (i.e. more than one DVA for a given BP) on
+ * the same vdev as an existing DVA of this BP, then try to allocate it
+ * on a different metaslab than existing DVAs (i.e. a unique metaslab).
+ */
+static boolean_t
+metaslab_is_unique(metaslab_t *msp, dva_t *dva)
+{
+ uint64_t dva_ms_id;
+
+ if (DVA_GET_ASIZE(dva) == 0)
+ return (B_TRUE);
+
+ if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva))
+ return (B_TRUE);
+
+ dva_ms_id = DVA_GET_OFFSET(dva) >> msp->ms_group->mg_vd->vdev_ms_shift;
+
+ return (msp->ms_id != dva_ms_id);
+}
+
+/*
+ * ==========================================================================
+ * Metaslab allocation tracing facility
+ * ==========================================================================
+ */
+
+/*
+ * Add an allocation trace element to the allocation tracing list.
+ */
+static void
+metaslab_trace_add(zio_alloc_list_t *zal, metaslab_group_t *mg,
+ metaslab_t *msp, uint64_t psize, uint32_t dva_id, uint64_t offset,
+ int allocator)
+{
+ metaslab_alloc_trace_t *mat;
+
+ if (!metaslab_trace_enabled)
+ return;
+
+ /*
+ * When the tracing list reaches its maximum we remove
+ * the second element in the list before adding a new one.
+ * By removing the second element we preserve the original
+ * entry as a clue to what allocation steps have already been
+ * performed.
+ */
+ if (zal->zal_size == metaslab_trace_max_entries) {
+ metaslab_alloc_trace_t *mat_next;
+#ifdef ZFS_DEBUG
+ panic("too many entries in allocation list");
+#endif
+ METASLABSTAT_BUMP(metaslabstat_trace_over_limit);
+ zal->zal_size--;
+ mat_next = list_next(&zal->zal_list, list_head(&zal->zal_list));
+ list_remove(&zal->zal_list, mat_next);
+ kmem_cache_free(metaslab_alloc_trace_cache, mat_next);
+ }
+
+ mat = kmem_cache_alloc(metaslab_alloc_trace_cache, KM_SLEEP);
+ list_link_init(&mat->mat_list_node);
+ mat->mat_mg = mg;
+ mat->mat_msp = msp;
+ mat->mat_size = psize;
+ mat->mat_dva_id = dva_id;
+ mat->mat_offset = offset;
+ mat->mat_weight = 0;
+ mat->mat_allocator = allocator;
+
+ if (msp != NULL)
+ mat->mat_weight = msp->ms_weight;
+
+ /*
+ * The list is part of the zio so locking is not required. Only
+ * a single thread will perform allocations for a given zio.
+ */
+ list_insert_tail(&zal->zal_list, mat);
+ zal->zal_size++;
+
+ ASSERT3U(zal->zal_size, <=, metaslab_trace_max_entries);
+}
+
+void
+metaslab_trace_init(zio_alloc_list_t *zal)
+{
+ list_create(&zal->zal_list, sizeof (metaslab_alloc_trace_t),
+ offsetof(metaslab_alloc_trace_t, mat_list_node));
+ zal->zal_size = 0;
+}
+
+void
+metaslab_trace_fini(zio_alloc_list_t *zal)
+{
+ metaslab_alloc_trace_t *mat;
+
+ while ((mat = list_remove_head(&zal->zal_list)) != NULL)
+ kmem_cache_free(metaslab_alloc_trace_cache, mat);
+ list_destroy(&zal->zal_list);
+ zal->zal_size = 0;
+}
+
+/*
+ * ==========================================================================
+ * Metaslab block operations
+ * ==========================================================================
+ */
+
+static void
+metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, void *tag, int flags,
+ int allocator)
+{
+ if (!(flags & METASLAB_ASYNC_ALLOC) ||
+ (flags & METASLAB_DONT_THROTTLE))
+ return;
+
+ metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
+ if (!mg->mg_class->mc_alloc_throttle_enabled)
+ return;
+
+ metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator];
+ (void) zfs_refcount_add(&mga->mga_alloc_queue_depth, tag);
+}
+
+static void
+metaslab_group_increment_qdepth(metaslab_group_t *mg, int allocator)
+{
+ metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator];
+ metaslab_class_allocator_t *mca =
+ &mg->mg_class->mc_allocator[allocator];
+ uint64_t max = mg->mg_max_alloc_queue_depth;
+ uint64_t cur = mga->mga_cur_max_alloc_queue_depth;
+ while (cur < max) {
+ if (atomic_cas_64(&mga->mga_cur_max_alloc_queue_depth,
+ cur, cur + 1) == cur) {
+ atomic_inc_64(&mca->mca_alloc_max_slots);
+ return;
+ }
+ cur = mga->mga_cur_max_alloc_queue_depth;
+ }
+}
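+
+/*
+ * Illustrative example for the ratchet above (hypothetical numbers): if
+ * mg_max_alloc_queue_depth is 128 and mga_cur_max_alloc_queue_depth is
+ * currently 4, each completed throttled I/O raises the per-allocator limit
+ * by one (4 -> 5 -> ...) and bumps the class-wide mca_alloc_max_slots in
+ * lockstep until the group's maximum of 128 is reached; the
+ * compare-and-swap simply keeps concurrent completions from racing past
+ * that maximum.
+ */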
+
+void
+metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, void *tag, int flags,
+ int allocator, boolean_t io_complete)
+{
+ if (!(flags & METASLAB_ASYNC_ALLOC) ||
+ (flags & METASLAB_DONT_THROTTLE))
+ return;
+
+ metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
+ if (!mg->mg_class->mc_alloc_throttle_enabled)
+ return;
+
+ metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator];
+ (void) zfs_refcount_remove(&mga->mga_alloc_queue_depth, tag);
+ if (io_complete)
+ metaslab_group_increment_qdepth(mg, allocator);
+}
+
+void
+metaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, void *tag,
+ int allocator)
+{
+#ifdef ZFS_DEBUG
+ const dva_t *dva = bp->blk_dva;
+ int ndvas = BP_GET_NDVAS(bp);
+
+ for (int d = 0; d < ndvas; d++) {
+ uint64_t vdev = DVA_GET_VDEV(&dva[d]);
+ metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
+ metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator];
+ VERIFY(zfs_refcount_not_held(&mga->mga_alloc_queue_depth, tag));
+ }
+#endif
+}
+
+static uint64_t
+metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg)
+{
+ uint64_t start;
+ range_tree_t *rt = msp->ms_allocatable;
+ metaslab_class_t *mc = msp->ms_group->mg_class;
+
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+ VERIFY(!msp->ms_condensing);
+ VERIFY0(msp->ms_disabled);
+
+ start = mc->mc_ops->msop_alloc(msp, size);
+ if (start != -1ULL) {
+ metaslab_group_t *mg = msp->ms_group;
+ vdev_t *vd = mg->mg_vd;
+
+ VERIFY0(P2PHASE(start, 1ULL << vd->vdev_ashift));
+ VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
+ VERIFY3U(range_tree_space(rt) - size, <=, msp->ms_size);
+ range_tree_remove(rt, start, size);
+ range_tree_clear(msp->ms_trim, start, size);
+
+ if (range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK]))
+ vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg);
+
+ range_tree_add(msp->ms_allocating[txg & TXG_MASK], start, size);
+ msp->ms_allocating_total += size;
+
+ /* Track the last successful allocation */
+ msp->ms_alloc_txg = txg;
+ metaslab_verify_space(msp, txg);
+ }
+
+ /*
+ * Now that we've attempted the allocation we need to update the
+ * metaslab's maximum block size since it may have changed.
+ */
+ msp->ms_max_size = metaslab_largest_allocatable(msp);
+ return (start);
+}
+
+/*
+ * Find the metaslab with the highest weight that is less than what we've
+ * already tried. In the common case, this means that we will examine each
+ * metaslab at most once. Note that concurrent callers could reorder metaslabs
+ * by activation/passivation once we have dropped the mg_lock. If a metaslab is
+ * activated by another thread, and we fail to allocate from the metaslab we
+ * have selected, we may not try the newly-activated metaslab, and instead
+ * activate another metaslab. This is not optimal, but generally does not cause
+ * any problems (a possible exception being if every metaslab is completely full
+ * except for the newly-activated metaslab which we fail to examine).
+ */
+static metaslab_t *
+find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight,
+ dva_t *dva, int d, boolean_t want_unique, uint64_t asize, int allocator,
+ boolean_t try_hard, zio_alloc_list_t *zal, metaslab_t *search,
+ boolean_t *was_active)
+{
+ avl_index_t idx;
+ avl_tree_t *t = &mg->mg_metaslab_tree;
+ metaslab_t *msp = avl_find(t, search, &idx);
+ if (msp == NULL)
+ msp = avl_nearest(t, idx, AVL_AFTER);
+
+ int tries = 0;
+ for (; msp != NULL; msp = AVL_NEXT(t, msp)) {
+ int i;
+
+ if (!try_hard && tries > zfs_metaslab_find_max_tries) {
+ METASLABSTAT_BUMP(metaslabstat_too_many_tries);
+ return (NULL);
+ }
+ tries++;
+
+ if (!metaslab_should_allocate(msp, asize, try_hard)) {
+ metaslab_trace_add(zal, mg, msp, asize, d,
+ TRACE_TOO_SMALL, allocator);
+ continue;
+ }
+
+ /*
+ * If the selected metaslab is condensing or disabled,
+ * skip it.
+ */
+ if (msp->ms_condensing || msp->ms_disabled > 0)
+ continue;
+
+ *was_active = msp->ms_allocator != -1;
+ /*
+ * If we're activating as primary, this is our first allocation
+ * from this disk, so we don't need to check how close we are.
+ * If the metaslab under consideration was already active,
+ * we're getting desperate enough to steal another allocator's
+ * metaslab, so we still don't care about distances.
+ */
+ if (activation_weight == METASLAB_WEIGHT_PRIMARY || *was_active)
+ break;
+
+ for (i = 0; i < d; i++) {
+ if (want_unique &&
+ !metaslab_is_unique(msp, &dva[i]))
+ break; /* try another metaslab */
+ }
+ if (i == d)
+ break;
+ }
+
+ if (msp != NULL) {
+ search->ms_weight = msp->ms_weight;
+ search->ms_start = msp->ms_start + 1;
+ search->ms_allocator = msp->ms_allocator;
+ search->ms_primary = msp->ms_primary;
+ }
+ return (msp);
+}
+
+static void
+metaslab_active_mask_verify(metaslab_t *msp)
+{
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+
+ if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0)
+ return;
+
+ if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0)
+ return;
+
+ if (msp->ms_weight & METASLAB_WEIGHT_PRIMARY) {
+ VERIFY0(msp->ms_weight & METASLAB_WEIGHT_SECONDARY);
+ VERIFY0(msp->ms_weight & METASLAB_WEIGHT_CLAIM);
+ VERIFY3S(msp->ms_allocator, !=, -1);
+ VERIFY(msp->ms_primary);
+ return;
+ }
+
+ if (msp->ms_weight & METASLAB_WEIGHT_SECONDARY) {
+ VERIFY0(msp->ms_weight & METASLAB_WEIGHT_PRIMARY);
+ VERIFY0(msp->ms_weight & METASLAB_WEIGHT_CLAIM);
+ VERIFY3S(msp->ms_allocator, !=, -1);
+ VERIFY(!msp->ms_primary);
+ return;
+ }
+
+ if (msp->ms_weight & METASLAB_WEIGHT_CLAIM) {
+ VERIFY0(msp->ms_weight & METASLAB_WEIGHT_PRIMARY);
+ VERIFY0(msp->ms_weight & METASLAB_WEIGHT_SECONDARY);
+ VERIFY3S(msp->ms_allocator, ==, -1);
+ return;
+ }
+}
+
+/* ARGSUSED */
+static uint64_t
+metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal,
+ uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva, int d,
+ int allocator, boolean_t try_hard)
+{
+ metaslab_t *msp = NULL;
+ uint64_t offset = -1ULL;
+
+ uint64_t activation_weight = METASLAB_WEIGHT_PRIMARY;
+ for (int i = 0; i < d; i++) {
+ if (activation_weight == METASLAB_WEIGHT_PRIMARY &&
+ DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
+ activation_weight = METASLAB_WEIGHT_SECONDARY;
+ } else if (activation_weight == METASLAB_WEIGHT_SECONDARY &&
+ DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
+ activation_weight = METASLAB_WEIGHT_CLAIM;
+ break;
+ }
+ }
+
+ /*
+ * If we don't have enough metaslabs active to fill the entire array, we
+ * just use the 0th slot.
+ */
+ if (mg->mg_ms_ready < mg->mg_allocators * 3)
+ allocator = 0;
+ metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator];
+
+ ASSERT3U(mg->mg_vd->vdev_ms_count, >=, 2);
+
+ metaslab_t *search = kmem_alloc(sizeof (*search), KM_SLEEP);
+ search->ms_weight = UINT64_MAX;
+ search->ms_start = 0;
+ /*
+ * At the end of the metaslab tree are the already-active metaslabs,
+ * first the primaries, then the secondaries. When we resume searching
+ * through the tree, we need to consider ms_allocator and ms_primary so
+ * we start in the location right after where we left off, and don't
+ * accidentally loop forever considering the same metaslabs.
+ */
+ search->ms_allocator = -1;
+ search->ms_primary = B_TRUE;
+ for (;;) {
+ boolean_t was_active = B_FALSE;
+
+ mutex_enter(&mg->mg_lock);
+
+ if (activation_weight == METASLAB_WEIGHT_PRIMARY &&
+ mga->mga_primary != NULL) {
+ msp = mga->mga_primary;
+
+ /*
+ * Even though we don't hold the ms_lock for the
+ * primary metaslab, those fields should not
+ * change while we hold the mg_lock. Thus it is
+ * safe to make assertions on them.
+ */
+ ASSERT(msp->ms_primary);
+ ASSERT3S(msp->ms_allocator, ==, allocator);
+ ASSERT(msp->ms_loaded);
+
+ was_active = B_TRUE;
+ ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
+ } else if (activation_weight == METASLAB_WEIGHT_SECONDARY &&
+ mga->mga_secondary != NULL) {
+ msp = mga->mga_secondary;
+
+ /*
+ * See comment above about the similar assertions
+ * for the primary metaslab.
+ */
+ ASSERT(!msp->ms_primary);
+ ASSERT3S(msp->ms_allocator, ==, allocator);
+ ASSERT(msp->ms_loaded);
+
+ was_active = B_TRUE;
+ ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
+ } else {
+ msp = find_valid_metaslab(mg, activation_weight, dva, d,
+ want_unique, asize, allocator, try_hard, zal,
+ search, &was_active);
+ }
+
+ mutex_exit(&mg->mg_lock);
+ if (msp == NULL) {
+ kmem_free(search, sizeof (*search));
+ return (-1ULL);
+ }
+ mutex_enter(&msp->ms_lock);
+
+ metaslab_active_mask_verify(msp);
+
+ /*
+ * This code is disabled because of issues with
+ * tracepoints in non-GPL kernel modules.
+ */
+#if 0
+ DTRACE_PROBE3(ms__activation__attempt,
+ metaslab_t *, msp, uint64_t, activation_weight,
+ boolean_t, was_active);
+#endif
+
+ /*
+ * Ensure that the metaslab we have selected is still
+ * capable of handling our request. It's possible that
+ * another thread may have changed the weight while we
+ * were blocked on the metaslab lock. We check the
+ * active status first to see if we need to select
+ * a new metaslab.
+ */
+ if (was_active && !(msp->ms_weight & METASLAB_ACTIVE_MASK)) {
+ ASSERT3S(msp->ms_allocator, ==, -1);
+ mutex_exit(&msp->ms_lock);
+ continue;
+ }
+
+ /*
+ * If the metaslab was activated for another allocator
+ * while we were waiting in the ms_lock above, or it's
+ * a primary and we're seeking a secondary (or vice versa),
+ * we go back and select a new metaslab.
+ */
+ if (!was_active && (msp->ms_weight & METASLAB_ACTIVE_MASK) &&
+ (msp->ms_allocator != -1) &&
+ (msp->ms_allocator != allocator || ((activation_weight ==
+ METASLAB_WEIGHT_PRIMARY) != msp->ms_primary))) {
+ ASSERT(msp->ms_loaded);
+ ASSERT((msp->ms_weight & METASLAB_WEIGHT_CLAIM) ||
+ msp->ms_allocator != -1);
+ mutex_exit(&msp->ms_lock);
+ continue;
+ }
+
+ /*
+ * This metaslab was used for claiming regions allocated
+ * by the ZIL during pool import. Once these regions are
+ * claimed we don't need to keep the CLAIM bit set
+ * anymore. Passivate this metaslab to zero its activation
+ * mask.
+ */
+ if (msp->ms_weight & METASLAB_WEIGHT_CLAIM &&
+ activation_weight != METASLAB_WEIGHT_CLAIM) {
+ ASSERT(msp->ms_loaded);
+ ASSERT3S(msp->ms_allocator, ==, -1);
+ metaslab_passivate(msp, msp->ms_weight &
+ ~METASLAB_WEIGHT_CLAIM);
+ mutex_exit(&msp->ms_lock);
+ continue;
+ }
+
+ metaslab_set_selected_txg(msp, txg);
+
+ int activation_error =
+ metaslab_activate(msp, allocator, activation_weight);
+ metaslab_active_mask_verify(msp);
+
+ /*
+ * If the metaslab was activated by another thread for
+ * another allocator or activation_weight (EBUSY), or it
+ * failed because another metaslab was assigned as primary
+ * for this allocator (EEXIST) we continue using this
+ * metaslab for our allocation, rather than going on to a
+ * worse metaslab (we waited for that metaslab to be loaded
+ * after all).
+ *
+ * If the activation failed due to an I/O error or ENOSPC we
+ * skip to the next metaslab.
+ */
+ boolean_t activated;
+ if (activation_error == 0) {
+ activated = B_TRUE;
+ } else if (activation_error == EBUSY ||
+ activation_error == EEXIST) {
+ activated = B_FALSE;
+ } else {
+ mutex_exit(&msp->ms_lock);
+ continue;
+ }
+ ASSERT(msp->ms_loaded);
+
+ /*
+ * Now that we have the lock, recheck to see if we should
+ * continue to use this metaslab for this allocation. The
+ * metaslab is now loaded so metaslab_should_allocate()
+ * can accurately determine if the allocation attempt should
+ * proceed.
+ */
+ if (!metaslab_should_allocate(msp, asize, try_hard)) {
+ /* Passivate this metaslab and select a new one. */
+ metaslab_trace_add(zal, mg, msp, asize, d,
+ TRACE_TOO_SMALL, allocator);
+ goto next;
+ }
+
+ /*
+ * If this metaslab is currently condensing then pick again
+ * as we can't manipulate this metaslab until it's committed
+ * to disk. If this metaslab is being initialized, we shouldn't
+ * allocate from it since the allocated region might be
+ * overwritten after allocation.
+ */
+ if (msp->ms_condensing) {
+ metaslab_trace_add(zal, mg, msp, asize, d,
+ TRACE_CONDENSING, allocator);
+ if (activated) {
+ metaslab_passivate(msp, msp->ms_weight &
+ ~METASLAB_ACTIVE_MASK);
+ }
+ mutex_exit(&msp->ms_lock);
+ continue;
+ } else if (msp->ms_disabled > 0) {
+ metaslab_trace_add(zal, mg, msp, asize, d,
+ TRACE_DISABLED, allocator);
+ if (activated) {
+ metaslab_passivate(msp, msp->ms_weight &
+ ~METASLAB_ACTIVE_MASK);
+ }
+ mutex_exit(&msp->ms_lock);
+ continue;
+ }
+
+ offset = metaslab_block_alloc(msp, asize, txg);
+ metaslab_trace_add(zal, mg, msp, asize, d, offset, allocator);
+
+ if (offset != -1ULL) {
+ /* Proactively passivate the metaslab, if needed */
+ if (activated)
+ metaslab_segment_may_passivate(msp);
+ break;
+ }
+next:
+ ASSERT(msp->ms_loaded);
+
+ /*
+ * This code is disabled because of issues with
+ * tracepoints in non-GPL kernel modules.
+ */
+#if 0
+ DTRACE_PROBE2(ms__alloc__failure, metaslab_t *, msp,
+ uint64_t, asize);
+#endif
+
+ /*
+ * We were unable to allocate from this metaslab so determine
+ * a new weight for this metaslab. Now that we have loaded
+ * the metaslab we can provide a better hint to the metaslab
+ * selector.
+ *
+ * For space-based metaslabs, we use the maximum block size.
+ * This information is only available when the metaslab
+ * is loaded and is more accurate than the generic free
+ * space weight that was calculated by metaslab_weight().
+ * This information allows us to quickly compare the maximum
+ * available allocation in the metaslab to the allocation
+ * size being requested.
+ *
+ * For segment-based metaslabs, determine the new weight
+ * based on the highest bucket in the range tree. We
+ * explicitly use the loaded segment weight (i.e. the range
+ * tree histogram) since it contains the space that is
+ * currently available for allocation and is accurate
+ * even within a sync pass.
+ */
+ uint64_t weight;
+ if (WEIGHT_IS_SPACEBASED(msp->ms_weight)) {
+ weight = metaslab_largest_allocatable(msp);
+ WEIGHT_SET_SPACEBASED(weight);
+ } else {
+ weight = metaslab_weight_from_range_tree(msp);
+ }
+
+ if (activated) {
+ metaslab_passivate(msp, weight);
+ } else {
+ /*
+ * For the case where we use the metaslab that is
+ * active for another allocator we want to make
+ * sure that we retain the activation mask.
+ *
+ * Note that we could attempt to use something like
+ * metaslab_recalculate_weight_and_sort() that
+ * retains the activation mask here. That function
+ * uses metaslab_weight() to set the weight though
+ * which is not as accurate as the calculations
+ * above.
+ */
+ weight |= msp->ms_weight & METASLAB_ACTIVE_MASK;
+ metaslab_group_sort(mg, msp, weight);
+ }
+ metaslab_active_mask_verify(msp);
+
+ /*
+ * We have just failed an allocation attempt, check
+ * that metaslab_should_allocate() agrees. Otherwise,
+ * we may end up in an infinite loop retrying the same
+ * metaslab.
+ */
+ ASSERT(!metaslab_should_allocate(msp, asize, try_hard));
+
+ mutex_exit(&msp->ms_lock);
+ }
+ mutex_exit(&msp->ms_lock);
+ kmem_free(search, sizeof (*search));
+ return (offset);
+}
+
+static uint64_t
+metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal,
+ uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva, int d,
+ int allocator, boolean_t try_hard)
+{
+ uint64_t offset;
+ ASSERT(mg->mg_initialized);
+
+ offset = metaslab_group_alloc_normal(mg, zal, asize, txg, want_unique,
+ dva, d, allocator, try_hard);
+
+ mutex_enter(&mg->mg_lock);
+ if (offset == -1ULL) {
+ mg->mg_failed_allocations++;
+ metaslab_trace_add(zal, mg, NULL, asize, d,
+ TRACE_GROUP_FAILURE, allocator);
+ if (asize == SPA_GANGBLOCKSIZE) {
+ /*
+ * This metaslab group was unable to allocate
+ * the minimum gang block size so it must be out of
+ * space. We must notify the allocation throttle
+ * to start skipping allocation attempts to this
+ * metaslab group until more space becomes available.
+ * Note: this failure cannot be caused by the
+ * allocation throttle since the allocation throttle
+ * is only responsible for skipping devices and
+ * not failing block allocations.
+ */
+ mg->mg_no_free_space = B_TRUE;
+ }
+ }
+ mg->mg_allocations++;
+ mutex_exit(&mg->mg_lock);
+ return (offset);
+}
+
+/*
+ * Allocate a block for the specified i/o.
+ */
+int
+metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
+ dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags,
+ zio_alloc_list_t *zal, int allocator)
+{
+ metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator];
+ metaslab_group_t *mg, *fast_mg, *rotor;
+ vdev_t *vd;
+ boolean_t try_hard = B_FALSE;
+
+ ASSERT(!DVA_IS_VALID(&dva[d]));
+
+ /*
+ * For testing, make some blocks above a certain size be gang blocks.
+ * This will result in more split blocks when using device removal,
+ * and a large number of split blocks coupled with ztest-induced
+ * damage can result in extremely long reconstruction times. This
+ * will also test spilling from special to normal.
+ */
+ if (psize >= metaslab_force_ganging && (spa_get_random(100) < 3)) {
+ metaslab_trace_add(zal, NULL, NULL, psize, d, TRACE_FORCE_GANG,
+ allocator);
+ return (SET_ERROR(ENOSPC));
+ }
+
+ /*
+ * Start at the rotor and loop through all mgs until we find something.
+ * Note that there's no locking on mca_rotor or mca_aliquot because
+ * nothing actually breaks if we miss a few updates -- we just won't
+ * allocate quite as evenly. It all balances out over time.
+ *
+ * If we are doing ditto or log blocks, try to spread them across
+ * consecutive vdevs. If we're forced to reuse a vdev before we've
+ * allocated all of our ditto blocks, then try and spread them out on
+ * that vdev as much as possible. If it turns out to not be possible,
+ * gradually lower our standards until anything becomes acceptable.
+ * Also, allocating on consecutive vdevs (as opposed to random vdevs)
+ * gives us hope of containing our fault domains to something we're
+ * able to reason about. Otherwise, any two top-level vdev failures
+ * will guarantee the loss of data. With consecutive allocation,
+ * only two adjacent top-level vdev failures will result in data loss.
+ *
+ * If we are doing gang blocks (hintdva is non-NULL), try to keep
+ * ourselves on the same vdev as our gang block header. That
+ * way, we can hope for locality in vdev_cache, plus it makes our
+ * fault domains something tractable.
+ */
+ if (hintdva) {
+ vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d]));
+
+ /*
+ * It's possible the vdev we're using as the hint no
+ * longer exists or its mg has been closed (e.g. by
+ * device removal). Consult the rotor when
+ * all else fails.
+ */
+ if (vd != NULL && vd->vdev_mg != NULL) {
+ mg = vdev_get_mg(vd, mc);
+
+ if (flags & METASLAB_HINTBP_AVOID &&
+ mg->mg_next != NULL)
+ mg = mg->mg_next;
+ } else {
+ mg = mca->mca_rotor;
+ }
+ } else if (d != 0) {
+ vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1]));
+ mg = vd->vdev_mg->mg_next;
+ } else if (flags & METASLAB_FASTWRITE) {
+ mg = fast_mg = mca->mca_rotor;
+
+ do {
+ if (fast_mg->mg_vd->vdev_pending_fastwrite <
+ mg->mg_vd->vdev_pending_fastwrite)
+ mg = fast_mg;
+ } while ((fast_mg = fast_mg->mg_next) != mca->mca_rotor);
+
+ } else {
+ ASSERT(mca->mca_rotor != NULL);
+ mg = mca->mca_rotor;
+ }
+
+ /*
+ * If the hint put us into the wrong metaslab class, or into a
+ * metaslab group that has been passivated, just follow the rotor.
+ */
+ if (mg->mg_class != mc || mg->mg_activation_count <= 0)
+ mg = mca->mca_rotor;
+
+ rotor = mg;
+top:
+ do {
+ boolean_t allocatable;
+
+ ASSERT(mg->mg_activation_count == 1);
+ vd = mg->mg_vd;
+
+ /*
+ * Don't allocate from faulted devices.
+ */
+ if (try_hard) {
+ spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER);
+ allocatable = vdev_allocatable(vd);
+ spa_config_exit(spa, SCL_ZIO, FTAG);
+ } else {
+ allocatable = vdev_allocatable(vd);
+ }
+
+ /*
+ * Determine if the selected metaslab group is eligible
+ * for allocations. If we're ganging then don't allow
+ * this metaslab group to skip allocations since that would
+ * inadvertently return ENOSPC and suspend the pool
+ * even though space is still available.
+ */
+ if (allocatable && !GANG_ALLOCATION(flags) && !try_hard) {
+ allocatable = metaslab_group_allocatable(mg, rotor,
+ psize, allocator, d);
+ }
+
+ if (!allocatable) {
+ metaslab_trace_add(zal, mg, NULL, psize, d,
+ TRACE_NOT_ALLOCATABLE, allocator);
+ goto next;
+ }
+
+ ASSERT(mg->mg_initialized);
+
+ /*
+ * Avoid writing single-copy data to a failing,
+ * non-redundant vdev, unless we've already tried all
+ * other vdevs.
+ */
+ if ((vd->vdev_stat.vs_write_errors > 0 ||
+ vd->vdev_state < VDEV_STATE_HEALTHY) &&
+ d == 0 && !try_hard && vd->vdev_children == 0) {
+ metaslab_trace_add(zal, mg, NULL, psize, d,
+ TRACE_VDEV_ERROR, allocator);
+ goto next;
+ }
+
+ ASSERT(mg->mg_class == mc);
+
+ uint64_t asize = vdev_psize_to_asize(vd, psize);
+ ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);
+
+ /*
+ * If we don't need to try hard, then require that the
+ * block be on a different metaslab from any other DVAs
+ * in this BP (unique=true). If we are trying hard, then
+ * allow any metaslab to be used (unique=false).
+ */
+ uint64_t offset = metaslab_group_alloc(mg, zal, asize, txg,
+ !try_hard, dva, d, allocator, try_hard);
+
+ if (offset != -1ULL) {
+ /*
+ * If we've just selected this metaslab group,
+ * figure out whether the corresponding vdev is
+ * over- or under-used relative to the pool,
+ * and set an allocation bias to even it out.
+ *
+ * Bias is also used to compensate for unequally
+ * sized vdevs so that space is allocated fairly.
+ */
+ if (mca->mca_aliquot == 0 && metaslab_bias_enabled) {
+ vdev_stat_t *vs = &vd->vdev_stat;
+ int64_t vs_free = vs->vs_space - vs->vs_alloc;
+ int64_t mc_free = mc->mc_space - mc->mc_alloc;
+ int64_t ratio;
+
+ /*
+ * Calculate how much more or less we should
+ * try to allocate from this device during
+ * this iteration around the rotor.
+ *
+ * This basically introduces a zero-centered
+ * bias towards the devices with the most
+ * free space, while compensating for vdev
+ * size differences.
+ *
+ * Examples:
+ * vdev V1 = 16M/128M
+ * vdev V2 = 16M/128M
+ * ratio(V1) = 100% ratio(V2) = 100%
+ *
+ * vdev V1 = 16M/128M
+ * vdev V2 = 64M/128M
+ * ratio(V1) = 127% ratio(V2) = 72%
+ *
+ * vdev V1 = 16M/128M
+ * vdev V2 = 64M/512M
+ * ratio(V1) = 40% ratio(V2) = 160%
+ */
+ ratio = (vs_free * mc->mc_alloc_groups * 100) /
+ (mc_free + 1);
+ mg->mg_bias = ((ratio - 100) *
+ (int64_t)mg->mg_aliquot) / 100;
+ } else if (!metaslab_bias_enabled) {
+ mg->mg_bias = 0;
+ }
+
+ if ((flags & METASLAB_FASTWRITE) ||
+ atomic_add_64_nv(&mca->mca_aliquot, asize) >=
+ mg->mg_aliquot + mg->mg_bias) {
+ mca->mca_rotor = mg->mg_next;
+ mca->mca_aliquot = 0;
+ }
+
+ DVA_SET_VDEV(&dva[d], vd->vdev_id);
+ DVA_SET_OFFSET(&dva[d], offset);
+ DVA_SET_GANG(&dva[d],
+ ((flags & METASLAB_GANG_HEADER) ? 1 : 0));
+ DVA_SET_ASIZE(&dva[d], asize);
+
+ if (flags & METASLAB_FASTWRITE) {
+ atomic_add_64(&vd->vdev_pending_fastwrite,
+ psize);
+ }
+
+ return (0);
+ }
+next:
+ mca->mca_rotor = mg->mg_next;
+ mca->mca_aliquot = 0;
+ } while ((mg = mg->mg_next) != rotor);
+
+ /*
+ * If we haven't tried hard, perhaps do so now.
+ */
+ if (!try_hard && (zfs_metaslab_try_hard_before_gang ||
+ GANG_ALLOCATION(flags) || (flags & METASLAB_ZIL) != 0 ||
+ psize <= 1 << spa->spa_min_ashift)) {
+ METASLABSTAT_BUMP(metaslabstat_try_hard);
+ try_hard = B_TRUE;
+ goto top;
+ }
+
+ bzero(&dva[d], sizeof (dva_t));
+
+ metaslab_trace_add(zal, rotor, NULL, psize, d, TRACE_ENOSPC, allocator);
+ return (SET_ERROR(ENOSPC));
+}
+
+void
+metaslab_free_concrete(vdev_t *vd, uint64_t offset, uint64_t asize,
+ boolean_t checkpoint)
+{
+ metaslab_t *msp;
+ spa_t *spa = vd->vdev_spa;
+
+ ASSERT(vdev_is_concrete(vd));
+ ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
+ ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count);
+
+ msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
+
+ VERIFY(!msp->ms_condensing);
+ VERIFY3U(offset, >=, msp->ms_start);
+ VERIFY3U(offset + asize, <=, msp->ms_start + msp->ms_size);
+ VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
+ VERIFY0(P2PHASE(asize, 1ULL << vd->vdev_ashift));
+
+ metaslab_check_free_impl(vd, offset, asize);
+
+ mutex_enter(&msp->ms_lock);
+ if (range_tree_is_empty(msp->ms_freeing) &&
+ range_tree_is_empty(msp->ms_checkpointing)) {
+ vdev_dirty(vd, VDD_METASLAB, msp, spa_syncing_txg(spa));
+ }
+
+ if (checkpoint) {
+ ASSERT(spa_has_checkpoint(spa));
+ range_tree_add(msp->ms_checkpointing, offset, asize);
+ } else {
+ range_tree_add(msp->ms_freeing, offset, asize);
+ }
+ mutex_exit(&msp->ms_lock);
+}
+
+/* ARGSUSED */
+void
+metaslab_free_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
+ uint64_t size, void *arg)
+{
+ boolean_t *checkpoint = arg;
+
+ ASSERT3P(checkpoint, !=, NULL);
+
+ if (vd->vdev_ops->vdev_op_remap != NULL)
+ vdev_indirect_mark_obsolete(vd, offset, size);
+ else
+ metaslab_free_impl(vd, offset, size, *checkpoint);
+}
+
+static void
+metaslab_free_impl(vdev_t *vd, uint64_t offset, uint64_t size,
+ boolean_t checkpoint)
+{
+ spa_t *spa = vd->vdev_spa;
+
+ ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
+
+ if (spa_syncing_txg(spa) > spa_freeze_txg(spa))
+ return;
+
+ if (spa->spa_vdev_removal != NULL &&
+ spa->spa_vdev_removal->svr_vdev_id == vd->vdev_id &&
+ vdev_is_concrete(vd)) {
+ /*
+ * Note: we check if the vdev is concrete because when
+ * we complete the removal, we first change the vdev to be
+ * an indirect vdev (in open context), and then (in syncing
+ * context) clear spa_vdev_removal.
+ */
+ free_from_removing_vdev(vd, offset, size);
+ } else if (vd->vdev_ops->vdev_op_remap != NULL) {
+ vdev_indirect_mark_obsolete(vd, offset, size);
+ vd->vdev_ops->vdev_op_remap(vd, offset, size,
+ metaslab_free_impl_cb, &checkpoint);
+ } else {
+ metaslab_free_concrete(vd, offset, size, checkpoint);
+ }
+}
+
+typedef struct remap_blkptr_cb_arg {
+ blkptr_t *rbca_bp;
+ spa_remap_cb_t rbca_cb;
+ vdev_t *rbca_remap_vd;
+ uint64_t rbca_remap_offset;
+ void *rbca_cb_arg;
+} remap_blkptr_cb_arg_t;
+
+static void
+remap_blkptr_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
+ uint64_t size, void *arg)
+{
+ remap_blkptr_cb_arg_t *rbca = arg;
+ blkptr_t *bp = rbca->rbca_bp;
+
+ /* We can not remap split blocks. */
+ if (size != DVA_GET_ASIZE(&bp->blk_dva[0]))
+ return;
+ ASSERT0(inner_offset);
+
+ if (rbca->rbca_cb != NULL) {
+ /*
+ * At this point we know that we are not handling split
+ * blocks and we invoke the callback on the previous
+ * vdev which must be indirect.
+ */
+ ASSERT3P(rbca->rbca_remap_vd->vdev_ops, ==, &vdev_indirect_ops);
+
+ rbca->rbca_cb(rbca->rbca_remap_vd->vdev_id,
+ rbca->rbca_remap_offset, size, rbca->rbca_cb_arg);
+
+ /* set up remap_blkptr_cb_arg for the next call */
+ rbca->rbca_remap_vd = vd;
+ rbca->rbca_remap_offset = offset;
+ }
+
+ /*
+ * The phys birth time is that of dva[0]. This ensures that we know
+ * when each dva was written, so that resilver can determine which
+ * blocks need to be scrubbed (i.e. those written during the time
+ * the vdev was offline). It also ensures that the key used in
+ * the ARC hash table is unique (i.e. dva[0] + phys_birth). If
+ * we didn't change the phys_birth, a lookup in the ARC for a
+ * remapped BP could find the data that was previously stored at
+ * this vdev + offset.
+ */
+ vdev_t *oldvd = vdev_lookup_top(vd->vdev_spa,
+ DVA_GET_VDEV(&bp->blk_dva[0]));
+ vdev_indirect_births_t *vib = oldvd->vdev_indirect_births;
+ bp->blk_phys_birth = vdev_indirect_births_physbirth(vib,
+ DVA_GET_OFFSET(&bp->blk_dva[0]), DVA_GET_ASIZE(&bp->blk_dva[0]));
+
+ DVA_SET_VDEV(&bp->blk_dva[0], vd->vdev_id);
+ DVA_SET_OFFSET(&bp->blk_dva[0], offset);
+}
+
+/*
+ * If the block pointer contains any indirect DVAs, modify them to refer to
+ * concrete DVAs. Note that this will sometimes not be possible, leaving
+ * the indirect DVA in place. This happens if the indirect DVA spans multiple
+ * segments in the mapping (i.e. it is a "split block").
+ *
+ * If the BP was remapped, calls the callback on the original dva (note the
+ * callback can be called multiple times if the original indirect DVA refers
+ * to another indirect DVA, etc).
+ *
+ * Returns TRUE if the BP was remapped.
+ */
+boolean_t
+spa_remap_blkptr(spa_t *spa, blkptr_t *bp, spa_remap_cb_t callback, void *arg)
+{
+ remap_blkptr_cb_arg_t rbca;
+
+ if (!zfs_remap_blkptr_enable)
+ return (B_FALSE);
+
+ if (!spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS))
+ return (B_FALSE);
+
+ /*
+ * Dedup BP's can not be remapped, because ddt_phys_select() depends
+ * on DVA[0] being the same in the BP as in the DDT (dedup table).
+ */
+ if (BP_GET_DEDUP(bp))
+ return (B_FALSE);
+
+ /*
+ * Gang blocks can not be remapped, because
+ * zio_checksum_gang_verifier() depends on the DVA[0] that's in
+ * the BP used to read the gang block header (GBH) being the same
+ * as the DVA[0] that we allocated for the GBH.
+ */
+ if (BP_IS_GANG(bp))
+ return (B_FALSE);
+
+ /*
+ * Embedded BP's have no DVA to remap.
+ */
+ if (BP_GET_NDVAS(bp) < 1)
+ return (B_FALSE);
+
+ /*
+ * Note: we only remap dva[0]. If we remapped other dvas, we
+ * would no longer know what their phys birth txg is.
+ */
+ dva_t *dva = &bp->blk_dva[0];
+
+ uint64_t offset = DVA_GET_OFFSET(dva);
+ uint64_t size = DVA_GET_ASIZE(dva);
+ vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva));
+
+ if (vd->vdev_ops->vdev_op_remap == NULL)
+ return (B_FALSE);
+
+ rbca.rbca_bp = bp;
+ rbca.rbca_cb = callback;
+ rbca.rbca_remap_vd = vd;
+ rbca.rbca_remap_offset = offset;
+ rbca.rbca_cb_arg = arg;
+
+ /*
+ * remap_blkptr_cb() will be called in order for each level of
+ * indirection, until a concrete vdev is reached or a split block is
+	 * encountered. rbca_remap_vd and rbca_remap_offset are updated within
+	 * the callback as we go from one indirect vdev to the next (either
+	 * concrete or indirect again).
+ */
+ vd->vdev_ops->vdev_op_remap(vd, offset, size, remap_blkptr_cb, &rbca);
+
+ /* Check if the DVA wasn't remapped because it is a split block */
+ if (DVA_GET_VDEV(&rbca.rbca_bp->blk_dva[0]) == vd->vdev_id)
+ return (B_FALSE);
+
+ return (B_TRUE);
+}
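+
+/*
+ * Illustrative sketch (not compiled here): a caller that wants to learn
+ * which indirect <vdev, offset> ranges a BP used to occupy can pass a
+ * spa_remap_cb_t callback; the callback below (its name and message are
+ * hypothetical) simply logs each prior mapping:
+ *
+ *	static void
+ *	note_prior_mapping_cb(uint64_t vdev, uint64_t offset, uint64_t size,
+ *	    void *arg)
+ *	{
+ *		zfs_dbgmsg("BP was at vdev %llu offset %llu asize %llu",
+ *		    (u_longlong_t)vdev, (u_longlong_t)offset,
+ *		    (u_longlong_t)size);
+ *	}
+ *
+ *	if (spa_remap_blkptr(spa, bp, note_prior_mapping_cb, NULL))
+ *		zfs_dbgmsg("bp->blk_dva[0] now refers to a concrete vdev");
+ */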
+
+/*
+ * Undo the allocation of a DVA which happened in the given transaction group.
+ */
+void
+metaslab_unalloc_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
+{
+ metaslab_t *msp;
+ vdev_t *vd;
+ uint64_t vdev = DVA_GET_VDEV(dva);
+ uint64_t offset = DVA_GET_OFFSET(dva);
+ uint64_t size = DVA_GET_ASIZE(dva);
+
+ ASSERT(DVA_IS_VALID(dva));
+ ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
+
+ if (txg > spa_freeze_txg(spa))
+ return;
+
+ if ((vd = vdev_lookup_top(spa, vdev)) == NULL || !DVA_IS_VALID(dva) ||
+ (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) {
+ zfs_panic_recover("metaslab_free_dva(): bad DVA %llu:%llu:%llu",
+ (u_longlong_t)vdev, (u_longlong_t)offset,
+ (u_longlong_t)size);
+ return;
+ }
+
+ ASSERT(!vd->vdev_removing);
+ ASSERT(vdev_is_concrete(vd));
+ ASSERT0(vd->vdev_indirect_config.vic_mapping_object);
+ ASSERT3P(vd->vdev_indirect_mapping, ==, NULL);
+
+ if (DVA_GET_GANG(dva))
+ size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
+
+ msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
+
+ mutex_enter(&msp->ms_lock);
+ range_tree_remove(msp->ms_allocating[txg & TXG_MASK],
+ offset, size);
+ msp->ms_allocating_total -= size;
+
+ VERIFY(!msp->ms_condensing);
+ VERIFY3U(offset, >=, msp->ms_start);
+ VERIFY3U(offset + size, <=, msp->ms_start + msp->ms_size);
+ VERIFY3U(range_tree_space(msp->ms_allocatable) + size, <=,
+ msp->ms_size);
+ VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
+ VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
+ range_tree_add(msp->ms_allocatable, offset, size);
+ mutex_exit(&msp->ms_lock);
+}
+
+/*
+ * Free the block represented by the given DVA.
+ */
+void
+metaslab_free_dva(spa_t *spa, const dva_t *dva, boolean_t checkpoint)
+{
+ uint64_t vdev = DVA_GET_VDEV(dva);
+ uint64_t offset = DVA_GET_OFFSET(dva);
+ uint64_t size = DVA_GET_ASIZE(dva);
+ vdev_t *vd = vdev_lookup_top(spa, vdev);
+
+ ASSERT(DVA_IS_VALID(dva));
+ ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
+
+ if (DVA_GET_GANG(dva)) {
+ size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
+ }
+
+ metaslab_free_impl(vd, offset, size, checkpoint);
+}
+
+/*
+ * Reserve some allocation slots. The reservation system must be called
+ * before we call into the allocator. If there aren't any available slots
+ * then the I/O will be throttled until an I/O completes and its slots are
+ * freed up. The function returns true if it was successful in placing
+ * the reservation.
+ */
+boolean_t
+metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, int allocator,
+ zio_t *zio, int flags)
+{
+ metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator];
+ uint64_t available_slots = 0;
+ boolean_t slot_reserved = B_FALSE;
+ uint64_t max = mca->mca_alloc_max_slots;
+
+ ASSERT(mc->mc_alloc_throttle_enabled);
+ mutex_enter(&mc->mc_lock);
+
+ uint64_t reserved_slots = zfs_refcount_count(&mca->mca_alloc_slots);
+ if (reserved_slots < max)
+ available_slots = max - reserved_slots;
+
+ if (slots <= available_slots || GANG_ALLOCATION(flags) ||
+ flags & METASLAB_MUST_RESERVE) {
+ /*
+ * We reserve the slots individually so that we can unreserve
+ * them individually when an I/O completes.
+ */
+ for (int d = 0; d < slots; d++)
+ zfs_refcount_add(&mca->mca_alloc_slots, zio);
+ zio->io_flags |= ZIO_FLAG_IO_ALLOCATING;
+ slot_reserved = B_TRUE;
+ }
+
+ mutex_exit(&mc->mc_lock);
+ return (slot_reserved);
+}
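+
+/*
+ * Illustrative usage sketch (hypothetical caller, not compiled here; mc,
+ * copies, and zio stand in for the caller's state): a writer reserves one
+ * slot per copy before calling into the allocator and releases the same
+ * number when its zio completes:
+ *
+ *	if (!metaslab_class_throttle_reserve(mc, copies,
+ *	    zio->io_allocator, zio, 0)) {
+ *		...	(throttled: requeue the zio and retry later)
+ *	}
+ *	... allocate and issue the I/O; then, on zio completion:
+ *	metaslab_class_throttle_unreserve(mc, copies,
+ *	    zio->io_allocator, zio);
+ */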
+
+void
+metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots,
+ int allocator, zio_t *zio)
+{
+ metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator];
+
+ ASSERT(mc->mc_alloc_throttle_enabled);
+ mutex_enter(&mc->mc_lock);
+ for (int d = 0; d < slots; d++)
+ zfs_refcount_remove(&mca->mca_alloc_slots, zio);
+ mutex_exit(&mc->mc_lock);
+}
+
+static int
+metaslab_claim_concrete(vdev_t *vd, uint64_t offset, uint64_t size,
+ uint64_t txg)
+{
+ metaslab_t *msp;
+ spa_t *spa = vd->vdev_spa;
+ int error = 0;
+
+ if (offset >> vd->vdev_ms_shift >= vd->vdev_ms_count)
+ return (SET_ERROR(ENXIO));
+
+ ASSERT3P(vd->vdev_ms, !=, NULL);
+ msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
+
+ mutex_enter(&msp->ms_lock);
+
+ if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded) {
+ error = metaslab_activate(msp, 0, METASLAB_WEIGHT_CLAIM);
+ if (error == EBUSY) {
+ ASSERT(msp->ms_loaded);
+ ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
+ error = 0;
+ }
+ }
+
+ if (error == 0 &&
+ !range_tree_contains(msp->ms_allocatable, offset, size))
+ error = SET_ERROR(ENOENT);
+
+ if (error || txg == 0) { /* txg == 0 indicates dry run */
+ mutex_exit(&msp->ms_lock);
+ return (error);
+ }
+
+ VERIFY(!msp->ms_condensing);
+ VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
+ VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
+ VERIFY3U(range_tree_space(msp->ms_allocatable) - size, <=,
+ msp->ms_size);
+ range_tree_remove(msp->ms_allocatable, offset, size);
+ range_tree_clear(msp->ms_trim, offset, size);
+
+ if (spa_writeable(spa)) { /* don't dirty if we're zdb(8) */
+ metaslab_class_t *mc = msp->ms_group->mg_class;
+ multilist_sublist_t *mls =
+ multilist_sublist_lock_obj(mc->mc_metaslab_txg_list, msp);
+ if (!multilist_link_active(&msp->ms_class_txg_node)) {
+ msp->ms_selected_txg = txg;
+ multilist_sublist_insert_head(mls, msp);
+ }
+ multilist_sublist_unlock(mls);
+
+ if (range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK]))
+ vdev_dirty(vd, VDD_METASLAB, msp, txg);
+ range_tree_add(msp->ms_allocating[txg & TXG_MASK],
+ offset, size);
+ msp->ms_allocating_total += size;
+ }
+
+ mutex_exit(&msp->ms_lock);
+
+ return (0);
+}
+
+typedef struct metaslab_claim_cb_arg_t {
+ uint64_t mcca_txg;
+ int mcca_error;
+} metaslab_claim_cb_arg_t;
+
+/* ARGSUSED */
+static void
+metaslab_claim_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
+ uint64_t size, void *arg)
+{
+ metaslab_claim_cb_arg_t *mcca_arg = arg;
+
+ if (mcca_arg->mcca_error == 0) {
+ mcca_arg->mcca_error = metaslab_claim_concrete(vd, offset,
+ size, mcca_arg->mcca_txg);
+ }
+}
+
+int
+metaslab_claim_impl(vdev_t *vd, uint64_t offset, uint64_t size, uint64_t txg)
+{
+ if (vd->vdev_ops->vdev_op_remap != NULL) {
+ metaslab_claim_cb_arg_t arg;
+
+ /*
+ * Only zdb(8) can claim on indirect vdevs. This is used
+ * to detect leaks of mapped space (that are not accounted
+ * for in the obsolete counts, spacemap, or bpobj).
+ */
+ ASSERT(!spa_writeable(vd->vdev_spa));
+ arg.mcca_error = 0;
+ arg.mcca_txg = txg;
+
+ vd->vdev_ops->vdev_op_remap(vd, offset, size,
+ metaslab_claim_impl_cb, &arg);
+
+ if (arg.mcca_error == 0) {
+ arg.mcca_error = metaslab_claim_concrete(vd,
+ offset, size, txg);
+ }
+ return (arg.mcca_error);
+ } else {
+ return (metaslab_claim_concrete(vd, offset, size, txg));
+ }
+}
+
+/*
+ * Intent log support: upon opening the pool after a crash, notify the SPA
+ * of blocks that the intent log has allocated for immediate write, but
+ * which are still considered free by the SPA because the last transaction
+ * group didn't commit yet.
+ */
+static int
+metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
+{
+ uint64_t vdev = DVA_GET_VDEV(dva);
+ uint64_t offset = DVA_GET_OFFSET(dva);
+ uint64_t size = DVA_GET_ASIZE(dva);
+ vdev_t *vd;
+
+ if ((vd = vdev_lookup_top(spa, vdev)) == NULL) {
+ return (SET_ERROR(ENXIO));
+ }
+
+ ASSERT(DVA_IS_VALID(dva));
+
+ if (DVA_GET_GANG(dva))
+ size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
+
+ return (metaslab_claim_impl(vd, offset, size, txg));
+}
+
+int
+metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
+ int ndvas, uint64_t txg, blkptr_t *hintbp, int flags,
+ zio_alloc_list_t *zal, zio_t *zio, int allocator)
+{
+ dva_t *dva = bp->blk_dva;
+ dva_t *hintdva = (hintbp != NULL) ? hintbp->blk_dva : NULL;
+ int error = 0;
+
+ ASSERT(bp->blk_birth == 0);
+ ASSERT(BP_PHYSICAL_BIRTH(bp) == 0);
+
+ spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
+
+ if (mc->mc_allocator[allocator].mca_rotor == NULL) {
+ /* no vdevs in this class */
+ spa_config_exit(spa, SCL_ALLOC, FTAG);
+ return (SET_ERROR(ENOSPC));
+ }
+
+ ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa));
+ ASSERT(BP_GET_NDVAS(bp) == 0);
+ ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp));
+ ASSERT3P(zal, !=, NULL);
+
+ for (int d = 0; d < ndvas; d++) {
+ error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva,
+ txg, flags, zal, allocator);
+ if (error != 0) {
+ for (d--; d >= 0; d--) {
+ metaslab_unalloc_dva(spa, &dva[d], txg);
+ metaslab_group_alloc_decrement(spa,
+ DVA_GET_VDEV(&dva[d]), zio, flags,
+ allocator, B_FALSE);
+ bzero(&dva[d], sizeof (dva_t));
+ }
+ spa_config_exit(spa, SCL_ALLOC, FTAG);
+ return (error);
+ } else {
+ /*
+ * Update the metaslab group's queue depth
+ * based on the newly allocated dva.
+ */
+ metaslab_group_alloc_increment(spa,
+ DVA_GET_VDEV(&dva[d]), zio, flags, allocator);
+ }
+ }
+ ASSERT(error == 0);
+ ASSERT(BP_GET_NDVAS(bp) == ndvas);
+
+ spa_config_exit(spa, SCL_ALLOC, FTAG);
+
+ BP_SET_BIRTH(bp, txg, 0);
+
+ return (0);
+}
+
+void
+metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now)
+{
+ const dva_t *dva = bp->blk_dva;
+ int ndvas = BP_GET_NDVAS(bp);
+
+ ASSERT(!BP_IS_HOLE(bp));
+ ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa));
+
+ /*
+ * If we have a checkpoint for the pool we need to make sure that
+ * the blocks that we free that are part of the checkpoint won't be
+ * reused until the checkpoint is discarded or we revert to it.
+ *
+ * The checkpoint flag is passed down the metaslab_free code path
+ * and is set whenever we want to add a block to the checkpoint's
+ * accounting. That is, we "checkpoint" blocks that existed at the
+ * time the checkpoint was created and are therefore referenced by
+ * the checkpointed uberblock.
+ *
+	 * Note that we don't checkpoint any blocks if the current
+ * syncing txg <= spa_checkpoint_txg. We want these frees to sync
+ * normally as they will be referenced by the checkpointed uberblock.
+ */
+ boolean_t checkpoint = B_FALSE;
+ if (bp->blk_birth <= spa->spa_checkpoint_txg &&
+ spa_syncing_txg(spa) > spa->spa_checkpoint_txg) {
+ /*
+ * At this point, if the block is part of the checkpoint
+ * there is no way it was created in the current txg.
+ */
+ ASSERT(!now);
+ ASSERT3U(spa_syncing_txg(spa), ==, txg);
+ checkpoint = B_TRUE;
+ }
+
+ spa_config_enter(spa, SCL_FREE, FTAG, RW_READER);
+
+ for (int d = 0; d < ndvas; d++) {
+ if (now) {
+ metaslab_unalloc_dva(spa, &dva[d], txg);
+ } else {
+ ASSERT3U(txg, ==, spa_syncing_txg(spa));
+ metaslab_free_dva(spa, &dva[d], checkpoint);
+ }
+ }
+
+ spa_config_exit(spa, SCL_FREE, FTAG);
+}
+
+int
+metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg)
+{
+ const dva_t *dva = bp->blk_dva;
+ int ndvas = BP_GET_NDVAS(bp);
+ int error = 0;
+
+ ASSERT(!BP_IS_HOLE(bp));
+
+ if (txg != 0) {
+ /*
+ * First do a dry run to make sure all DVAs are claimable,
+ * so we don't have to unwind from partial failures below.
+ */
+ if ((error = metaslab_claim(spa, bp, 0)) != 0)
+ return (error);
+ }
+
+ spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
+
+ for (int d = 0; d < ndvas; d++) {
+ error = metaslab_claim_dva(spa, &dva[d], txg);
+ if (error != 0)
+ break;
+ }
+
+ spa_config_exit(spa, SCL_ALLOC, FTAG);
+
+ ASSERT(error == 0 || txg == 0);
+
+ return (error);
+}
+
+void
+metaslab_fastwrite_mark(spa_t *spa, const blkptr_t *bp)
+{
+ const dva_t *dva = bp->blk_dva;
+ int ndvas = BP_GET_NDVAS(bp);
+ uint64_t psize = BP_GET_PSIZE(bp);
+ int d;
+ vdev_t *vd;
+
+ ASSERT(!BP_IS_HOLE(bp));
+ ASSERT(!BP_IS_EMBEDDED(bp));
+ ASSERT(psize > 0);
+
+ spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
+
+ for (d = 0; d < ndvas; d++) {
+ if ((vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d]))) == NULL)
+ continue;
+ atomic_add_64(&vd->vdev_pending_fastwrite, psize);
+ }
+
+ spa_config_exit(spa, SCL_VDEV, FTAG);
+}
+
+void
+metaslab_fastwrite_unmark(spa_t *spa, const blkptr_t *bp)
+{
+ const dva_t *dva = bp->blk_dva;
+ int ndvas = BP_GET_NDVAS(bp);
+ uint64_t psize = BP_GET_PSIZE(bp);
+ int d;
+ vdev_t *vd;
+
+ ASSERT(!BP_IS_HOLE(bp));
+ ASSERT(!BP_IS_EMBEDDED(bp));
+ ASSERT(psize > 0);
+
+ spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
+
+ for (d = 0; d < ndvas; d++) {
+ if ((vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d]))) == NULL)
+ continue;
+ ASSERT3U(vd->vdev_pending_fastwrite, >=, psize);
+ atomic_sub_64(&vd->vdev_pending_fastwrite, psize);
+ }
+
+ spa_config_exit(spa, SCL_VDEV, FTAG);
+}
+
+/* ARGSUSED */
+static void
+metaslab_check_free_impl_cb(uint64_t inner, vdev_t *vd, uint64_t offset,
+ uint64_t size, void *arg)
+{
+ if (vd->vdev_ops == &vdev_indirect_ops)
+ return;
+
+ metaslab_check_free_impl(vd, offset, size);
+}
+
+static void
+metaslab_check_free_impl(vdev_t *vd, uint64_t offset, uint64_t size)
+{
+ metaslab_t *msp;
+ spa_t *spa __maybe_unused = vd->vdev_spa;
+
+ if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0)
+ return;
+
+ if (vd->vdev_ops->vdev_op_remap != NULL) {
+ vd->vdev_ops->vdev_op_remap(vd, offset, size,
+ metaslab_check_free_impl_cb, NULL);
+ return;
+ }
+
+ ASSERT(vdev_is_concrete(vd));
+ ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count);
+ ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
+
+ msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
+
+ mutex_enter(&msp->ms_lock);
+ if (msp->ms_loaded) {
+ range_tree_verify_not_present(msp->ms_allocatable,
+ offset, size);
+ }
+
+ /*
+ * Check all segments that currently exist in the freeing pipeline.
+ *
+ * It would intuitively make sense to also check the current allocating
+ * tree since metaslab_unalloc_dva() exists for extents that are
+ * allocated and freed in the same sync pass within the same txg.
+ * Unfortunately there are places (e.g. the ZIL) where we allocate a
+ * segment but then we free part of it within the same txg
+ * [see zil_sync()]. Thus, we don't call range_tree_verify() in the
+ * current allocating tree.
+ */
+ range_tree_verify_not_present(msp->ms_freeing, offset, size);
+ range_tree_verify_not_present(msp->ms_checkpointing, offset, size);
+ range_tree_verify_not_present(msp->ms_freed, offset, size);
+ for (int j = 0; j < TXG_DEFER_SIZE; j++)
+ range_tree_verify_not_present(msp->ms_defer[j], offset, size);
+ range_tree_verify_not_present(msp->ms_trim, offset, size);
+ mutex_exit(&msp->ms_lock);
+}
+
+void
+metaslab_check_free(spa_t *spa, const blkptr_t *bp)
+{
+ if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0)
+ return;
+
+ spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
+ for (int i = 0; i < BP_GET_NDVAS(bp); i++) {
+ uint64_t vdev = DVA_GET_VDEV(&bp->blk_dva[i]);
+ vdev_t *vd = vdev_lookup_top(spa, vdev);
+ uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]);
+ uint64_t size = DVA_GET_ASIZE(&bp->blk_dva[i]);
+
+ if (DVA_GET_GANG(&bp->blk_dva[i]))
+ size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
+
+ ASSERT3P(vd, !=, NULL);
+
+ metaslab_check_free_impl(vd, offset, size);
+ }
+ spa_config_exit(spa, SCL_VDEV, FTAG);
+}
+
+static void
+metaslab_group_disable_wait(metaslab_group_t *mg)
+{
+ ASSERT(MUTEX_HELD(&mg->mg_ms_disabled_lock));
+ while (mg->mg_disabled_updating) {
+ cv_wait(&mg->mg_ms_disabled_cv, &mg->mg_ms_disabled_lock);
+ }
+}
+
+static void
+metaslab_group_disabled_increment(metaslab_group_t *mg)
+{
+ ASSERT(MUTEX_HELD(&mg->mg_ms_disabled_lock));
+ ASSERT(mg->mg_disabled_updating);
+
+ while (mg->mg_ms_disabled >= max_disabled_ms) {
+ cv_wait(&mg->mg_ms_disabled_cv, &mg->mg_ms_disabled_lock);
+ }
+ mg->mg_ms_disabled++;
+ ASSERT3U(mg->mg_ms_disabled, <=, max_disabled_ms);
+}
+
+/*
+ * Mark the metaslab as disabled to prevent any allocations on this metaslab.
+ * We must also track how many metaslabs are currently disabled within a
+ * metaslab group and limit them to prevent allocation failures from
+ * occurring because all metaslabs are disabled.
+ */
+void
+metaslab_disable(metaslab_t *msp)
+{
+ ASSERT(!MUTEX_HELD(&msp->ms_lock));
+ metaslab_group_t *mg = msp->ms_group;
+
+ mutex_enter(&mg->mg_ms_disabled_lock);
+
+ /*
+ * To keep an accurate count of how many threads have disabled
+ * a specific metaslab group, we only allow one thread to mark
+ * the metaslab group at a time. This ensures that the value of
+ * ms_disabled will be accurate when we decide to mark a metaslab
+ * group as disabled. To do this we force all other threads
+	 * to wait until the metaslab group's mg_disabled_updating flag is
+	 * no longer set.
+ */
+ metaslab_group_disable_wait(mg);
+ mg->mg_disabled_updating = B_TRUE;
+ if (msp->ms_disabled == 0) {
+ metaslab_group_disabled_increment(mg);
+ }
+ mutex_enter(&msp->ms_lock);
+ msp->ms_disabled++;
+ mutex_exit(&msp->ms_lock);
+
+ mg->mg_disabled_updating = B_FALSE;
+ cv_broadcast(&mg->mg_ms_disabled_cv);
+ mutex_exit(&mg->mg_ms_disabled_lock);
+}
+
+void
+metaslab_enable(metaslab_t *msp, boolean_t sync, boolean_t unload)
+{
+ metaslab_group_t *mg = msp->ms_group;
+ spa_t *spa = mg->mg_vd->vdev_spa;
+
+ /*
+ * Wait for the outstanding IO to be synced to prevent newly
+	 * allocated blocks from being overwritten. This is used by
+	 * initialize and TRIM, which are modifying unallocated space.
+ */
+ if (sync)
+ txg_wait_synced(spa_get_dsl(spa), 0);
+
+ mutex_enter(&mg->mg_ms_disabled_lock);
+ mutex_enter(&msp->ms_lock);
+ if (--msp->ms_disabled == 0) {
+ mg->mg_ms_disabled--;
+ cv_broadcast(&mg->mg_ms_disabled_cv);
+ if (unload)
+ metaslab_unload(msp);
+ }
+ mutex_exit(&msp->ms_lock);
+ mutex_exit(&mg->mg_ms_disabled_lock);
+}
+
+static void
+metaslab_update_ondisk_flush_data(metaslab_t *ms, dmu_tx_t *tx)
+{
+ vdev_t *vd = ms->ms_group->mg_vd;
+ spa_t *spa = vd->vdev_spa;
+ objset_t *mos = spa_meta_objset(spa);
+
+ ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP));
+
+ metaslab_unflushed_phys_t entry = {
+ .msp_unflushed_txg = metaslab_unflushed_txg(ms),
+ };
+ uint64_t entry_size = sizeof (entry);
+ uint64_t entry_offset = ms->ms_id * entry_size;
+
+ uint64_t object = 0;
+ int err = zap_lookup(mos, vd->vdev_top_zap,
+ VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, sizeof (uint64_t), 1,
+ &object);
+ if (err == ENOENT) {
+ object = dmu_object_alloc(mos, DMU_OTN_UINT64_METADATA,
+ SPA_OLD_MAXBLOCKSIZE, DMU_OT_NONE, 0, tx);
+ VERIFY0(zap_add(mos, vd->vdev_top_zap,
+ VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, sizeof (uint64_t), 1,
+ &object, tx));
+ } else {
+ VERIFY0(err);
+ }
+
+ dmu_write(spa_meta_objset(spa), object, entry_offset, entry_size,
+ &entry, tx);
+}
+
+void
+metaslab_set_unflushed_txg(metaslab_t *ms, uint64_t txg, dmu_tx_t *tx)
+{
+ spa_t *spa = ms->ms_group->mg_vd->vdev_spa;
+
+ if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP))
+ return;
+
+ ms->ms_unflushed_txg = txg;
+ metaslab_update_ondisk_flush_data(ms, tx);
+}
+
+uint64_t
+metaslab_unflushed_txg(metaslab_t *ms)
+{
+ return (ms->ms_unflushed_txg);
+}
+
+ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, aliquot, ULONG, ZMOD_RW,
+ "Allocation granularity (a.k.a. stripe size)");
+
+ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, debug_load, INT, ZMOD_RW,
+ "Load all metaslabs when pool is first opened");
+
+ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, debug_unload, INT, ZMOD_RW,
+ "Prevent metaslabs from being unloaded");
+
+ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, preload_enabled, INT, ZMOD_RW,
+ "Preload potential metaslabs during reassessment");
+
+ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, unload_delay, INT, ZMOD_RW,
+ "Delay in txgs after metaslab was last used before unloading");
+
+ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, unload_delay_ms, INT, ZMOD_RW,
+ "Delay in milliseconds after metaslab was last used before unloading");
+
+/* BEGIN CSTYLED */
+ZFS_MODULE_PARAM(zfs_mg, zfs_mg_, noalloc_threshold, INT, ZMOD_RW,
+ "Percentage of metaslab group size that should be free to make it "
+ "eligible for allocation");
+
+ZFS_MODULE_PARAM(zfs_mg, zfs_mg_, fragmentation_threshold, INT, ZMOD_RW,
+ "Percentage of metaslab group size that should be considered eligible "
+ "for allocations unless all metaslab groups within the metaslab class "
+ "have also crossed this threshold");
+
+ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, fragmentation_threshold, INT,
+ ZMOD_RW, "Fragmentation for metaslab to allow allocation");
+
+ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, fragmentation_factor_enabled, INT, ZMOD_RW,
+ "Use the fragmentation metric to prefer less fragmented metaslabs");
+/* END CSTYLED */
+
+ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, lba_weighting_enabled, INT, ZMOD_RW,
+ "Prefer metaslabs with lower LBAs");
+
+ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, bias_enabled, INT, ZMOD_RW,
+ "Enable metaslab group biasing");
+
+ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, segment_weight_enabled, INT,
+ ZMOD_RW, "Enable segment-based metaslab selection");
+
+ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, switch_threshold, INT, ZMOD_RW,
+ "Segment-based metaslab selection maximum buckets before switching");
+
+ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, force_ganging, ULONG, ZMOD_RW,
+ "Blocks larger than this size are forced to be gang blocks");
+
+ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, df_max_search, INT, ZMOD_RW,
+ "Max distance (bytes) to search forward before using size tree");
+
+ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, df_use_largest_segment, INT, ZMOD_RW,
+ "When looking in size tree, use largest segment instead of exact fit");
+
+ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, max_size_cache_sec, ULONG,
+ ZMOD_RW, "How long to trust the cached max chunk size of a metaslab");
+
+ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, mem_limit, INT, ZMOD_RW,
+ "Percentage of memory that can be used to store metaslab range trees");
+
+ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, try_hard_before_gang, INT,
+ ZMOD_RW, "Try hard to allocate before ganging");
+
+ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, find_max_tries, INT, ZMOD_RW,
+ "Normally only consider this many of the best metaslabs in each vdev");
diff --git a/sys/contrib/openzfs/module/zfs/mmp.c b/sys/contrib/openzfs/module/zfs/mmp.c
new file mode 100644
index 000000000000..d05c9db24c20
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/mmp.c
@@ -0,0 +1,741 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2017 by Lawrence Livermore National Security, LLC.
+ */
+
+#include <sys/abd.h>
+#include <sys/mmp.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/time.h>
+#include <sys/vdev.h>
+#include <sys/vdev_impl.h>
+#include <sys/zfs_context.h>
+#include <sys/callb.h>
+
+/*
+ * Multi-Modifier Protection (MMP) attempts to prevent a user from importing
+ * or opening a pool on more than one host at a time. In particular, it
+ * prevents "zpool import -f" on a host from succeeding while the pool is
+ * already imported on another host. There are many other ways in which a
+ * device could be used by two hosts for different purposes at the same time
+ * resulting in pool damage. This implementation does not attempt to detect
+ * those cases.
+ *
+ * MMP operates by ensuring there are frequent visible changes on disk (a
+ * "heartbeat") at all times, and by altering the import process to check
+ * for these changes and fail the import when they are detected. This
+ * functionality is enabled by setting the 'multihost' pool property to on.
+ *
+ * Uberblocks written by the txg_sync thread always go into the first
+ * (N-MMP_BLOCKS_PER_LABEL) slots, the remaining slots are reserved for MMP.
+ * They are used to hold uberblocks which are exactly the same as the last
+ * synced uberblock except that the ub_timestamp and mmp_config are frequently
+ * updated. Like all other uberblocks, the slot is written with an embedded
+ * checksum, and slots with invalid checksums are ignored. This provides the
+ * "heartbeat", with no risk of overwriting good uberblocks that must be
+ * preserved, e.g. previous txgs and associated block pointers.
+ *
+ * Three optional fields are added to the uberblock structure: ub_mmp_magic,
+ * ub_mmp_config, and ub_mmp_delay. The ub_mmp_magic value allows zfs to tell
+ * whether the other ub_mmp_* fields are valid. The ub_mmp_config field tells
+ * the importing host the settings of zfs_multihost_interval and
+ * zfs_multihost_fail_intervals on the host which last had (or currently has)
+ * the pool imported. These determine how long a host must wait to detect
+ * activity in the pool, before concluding the pool is not in use. The
+ * mmp_delay field is a decaying average of the amount of time between
+ * completion of successive MMP writes, in nanoseconds. It indicates whether
+ * MMP is enabled.
+ *
+ * During import an activity test may now be performed to determine if
+ * the pool is in use. The activity test is typically required if the
+ * ZPOOL_CONFIG_HOSTID does not match the system hostid, the pool state is
+ * POOL_STATE_ACTIVE, and the pool is not a root pool.
+ *
+ * The activity test finds the "best" uberblock (highest txg, timestamp, and, if
+ * ub_mmp_magic is valid, sequence number from ub_mmp_config). It then waits
+ * some time, and finds the "best" uberblock again. If any of the mentioned
+ * fields have different values in the newly read uberblock, the pool is in use
+ * by another host and the import fails. In order to assure the accuracy of the
+ * activity test, the default values result in an activity test duration of 20x
+ * the mmp write interval.
+ *
+ * The duration of the "zpool import" activity test depends on the information
+ * available in the "best" uberblock:
+ *
+ * 1) If uberblock was written by zfs-0.8 or newer and fail_intervals > 0:
+ * ub_mmp_config.fail_intervals * ub_mmp_config.multihost_interval * 2
+ *
+ * In this case, a weak guarantee is provided. Since the host which last had
+ * the pool imported will suspend the pool if no mmp writes land within
+ * fail_intervals * multihost_interval ms, the absence of writes during that
+ * time means either the pool is not imported, or it is imported but the pool
+ * is suspended and no further writes will occur.
+ *
+ * Note that resuming the suspended pool on the remote host would invalidate
+ * this guarantee, and so it is not allowed.
+ *
+ * The factor of 2 provides a conservative safety margin and derives from
+ * MMP_IMPORT_SAFETY_FACTOR.
+ *
+ * 2) If uberblock was written by zfs-0.8 or newer and fail_intervals == 0:
+ * (ub_mmp_config.multihost_interval + ub_mmp_delay) *
+ * zfs_multihost_import_intervals
+ *
+ * In this case no guarantee can be provided. However, as long as some devices
+ * are healthy and connected, it is likely that at least one write will land
+ * within (multihost_interval + mmp_delay) because multihost_interval is
+ * enough time for a write to be attempted to each leaf vdev, and mmp_delay
+ * is enough for one to land, based on past delays. Multiplying by
+ * zfs_multihost_import_intervals provides a conservative safety factor.
+ *
+ * 3) If uberblock was written by zfs-0.7:
+ * (zfs_multihost_interval + ub_mmp_delay) * zfs_multihost_import_intervals
+ *
+ * The same logic as case #2 applies, but we do not know remote tunables.
+ *
+ * We use the local value for zfs_multihost_interval because the original MMP
+ * did not record this value in the uberblock.
+ *
+ * ub_mmp_delay >= (zfs_multihost_interval / leaves), so if the other host
+ * has a much larger zfs_multihost_interval set, ub_mmp_delay will reflect
+ * that. We will have waited enough time for zfs_multihost_import_intervals
+ * writes to be issued and all but one to land.
+ *
+ * single device pool example delays
+ *
+ * import_delay = (1 + 1) * 20 = 40s #defaults, no I/O delay
+ * import_delay = (1 + 10) * 20 = 220s #defaults, 10s I/O delay
+ * import_delay = (10 + 10) * 20 = 400s #10s multihost_interval,
+ * no I/O delay
+ * 100 device pool example delays
+ *
+ * import_delay = (1 + .01) * 20 = 20s #defaults, no I/O delay
+ * import_delay = (1 + 10) * 20 = 220s #defaults, 10s I/O delay
+ * import_delay = (10 + .1) * 20 = 202s #10s multihost_interval,
+ * no I/O delay
+ *
+ * 4) Otherwise, this uberblock was written by a pre-MMP zfs:
+ * zfs_multihost_import_intervals * zfs_multihost_interval
+ *
+ * In this case local tunables are used. By default this product = 10s, long
+ * enough for a pool with any activity at all to write at least one
+ * uberblock. No guarantee can be provided.
+ *
+ * Additionally, the duration is then extended by a random 25% to attempt to
+ * detect simultaneous imports, for example when both partner hosts are
+ * rebooted at the same time and automatically attempt to import the pool.
+ */
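+
+/*
+ * A minimal sketch of case 2 above (not compiled here; the helper name is
+ * hypothetical), reproducing the single-device example delays with the
+ * default multihost_interval of 1000 ms and import_intervals of 20:
+ *
+ *	static uint64_t
+ *	example_import_delay_ms(uint64_t interval_ms, uint64_t mmp_delay_ms,
+ *	    uint_t import_intervals)
+ *	{
+ *		return ((interval_ms + mmp_delay_ms) * import_intervals);
+ *	}
+ *
+ *	example_import_delay_ms(1000, 1000, 20)  ->  40000 ms ( 40s)
+ *	example_import_delay_ms(1000, 10000, 20) -> 220000 ms (220s)
+ */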
+
+/*
+ * Used to control the frequency of mmp writes which are performed when the
+ * 'multihost' pool property is on. This is one factor used to determine the
+ * length of the activity check during import.
+ *
+ * On average an mmp write will be issued for each leaf vdev every
+ * zfs_multihost_interval milliseconds. In practice, the observed period can
+ * vary with the I/O load, and this observed value is the ub_mmp_delay that is
+ * stored in the uberblock. The minimum allowed value is 100 ms.
+ */
+ulong_t zfs_multihost_interval = MMP_DEFAULT_INTERVAL;
+
+/*
+ * Used to control the duration of the activity test on import. Smaller values
+ * of zfs_multihost_import_intervals will reduce the import time but increase
+ * the risk of failing to detect an active pool. The total activity check time
+ * is never allowed to drop below one second. A value of 0 is ignored and
+ * treated as if it was set to 1.
+ */
+uint_t zfs_multihost_import_intervals = MMP_DEFAULT_IMPORT_INTERVALS;
+
+/*
+ * Controls the behavior of the pool when mmp write failures or delays are
+ * detected.
+ *
+ * When zfs_multihost_fail_intervals = 0, mmp write failures or delays are
+ * ignored. The failures will still be reported to the ZED which, depending on
+ * its configuration, may take action such as suspending the pool or taking a
+ * device offline.
+ *
+ * When zfs_multihost_fail_intervals > 0, the pool will be suspended if
+ * zfs_multihost_fail_intervals * zfs_multihost_interval milliseconds pass
+ * without a successful mmp write. This guarantees the activity test will see
+ * mmp writes if the pool is imported. A value of 1 is ignored and treated as
+ * if it was set to 2, because a single leaf vdev pool will issue a write once
+ * per multihost_interval and thus any variation in latency would cause the
+ * pool to be suspended.
+ */
+uint_t zfs_multihost_fail_intervals = MMP_DEFAULT_FAIL_INTERVALS;
+
+char *mmp_tag = "mmp_write_uberblock";
+static void mmp_thread(void *arg);
+
+void
+mmp_init(spa_t *spa)
+{
+ mmp_thread_t *mmp = &spa->spa_mmp;
+
+ mutex_init(&mmp->mmp_thread_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&mmp->mmp_thread_cv, NULL, CV_DEFAULT, NULL);
+ mutex_init(&mmp->mmp_io_lock, NULL, MUTEX_DEFAULT, NULL);
+ mmp->mmp_kstat_id = 1;
+}
+
+void
+mmp_fini(spa_t *spa)
+{
+ mmp_thread_t *mmp = &spa->spa_mmp;
+
+ mutex_destroy(&mmp->mmp_thread_lock);
+ cv_destroy(&mmp->mmp_thread_cv);
+ mutex_destroy(&mmp->mmp_io_lock);
+}
+
+static void
+mmp_thread_enter(mmp_thread_t *mmp, callb_cpr_t *cpr)
+{
+ CALLB_CPR_INIT(cpr, &mmp->mmp_thread_lock, callb_generic_cpr, FTAG);
+ mutex_enter(&mmp->mmp_thread_lock);
+}
+
+static void
+mmp_thread_exit(mmp_thread_t *mmp, kthread_t **mpp, callb_cpr_t *cpr)
+{
+ ASSERT(*mpp != NULL);
+ *mpp = NULL;
+ cv_broadcast(&mmp->mmp_thread_cv);
+ CALLB_CPR_EXIT(cpr); /* drops &mmp->mmp_thread_lock */
+ thread_exit();
+}
+
+void
+mmp_thread_start(spa_t *spa)
+{
+ mmp_thread_t *mmp = &spa->spa_mmp;
+
+ if (spa_writeable(spa)) {
+ mutex_enter(&mmp->mmp_thread_lock);
+ if (!mmp->mmp_thread) {
+ mmp->mmp_thread = thread_create(NULL, 0, mmp_thread,
+ spa, 0, &p0, TS_RUN, defclsyspri);
+ zfs_dbgmsg("MMP thread started pool '%s' "
+ "gethrtime %llu", spa_name(spa), gethrtime());
+ }
+ mutex_exit(&mmp->mmp_thread_lock);
+ }
+}
+
+void
+mmp_thread_stop(spa_t *spa)
+{
+ mmp_thread_t *mmp = &spa->spa_mmp;
+
+ mutex_enter(&mmp->mmp_thread_lock);
+ mmp->mmp_thread_exiting = 1;
+ cv_broadcast(&mmp->mmp_thread_cv);
+
+ while (mmp->mmp_thread) {
+ cv_wait(&mmp->mmp_thread_cv, &mmp->mmp_thread_lock);
+ }
+ mutex_exit(&mmp->mmp_thread_lock);
+ zfs_dbgmsg("MMP thread stopped pool '%s' gethrtime %llu",
+ spa_name(spa), gethrtime());
+
+ ASSERT(mmp->mmp_thread == NULL);
+ mmp->mmp_thread_exiting = 0;
+}
+
+typedef enum mmp_vdev_state_flag {
+ MMP_FAIL_NOT_WRITABLE = (1 << 0),
+ MMP_FAIL_WRITE_PENDING = (1 << 1),
+} mmp_vdev_state_flag_t;
+
+/*
+ * Find a leaf vdev to write an MMP block to. It must not have an outstanding
+ * mmp write (if so a new write will also likely block). If there is no usable
+ * leaf, a nonzero error value is returned. The error value returned is a bit
+ * field.
+ *
+ * MMP_FAIL_WRITE_PENDING One or more leaf vdevs are writeable, but have an
+ * outstanding MMP write.
+ * MMP_FAIL_NOT_WRITABLE One or more leaf vdevs are not writeable.
+ */
+
+static int
+mmp_next_leaf(spa_t *spa)
+{
+ vdev_t *leaf;
+ vdev_t *starting_leaf;
+ int fail_mask = 0;
+
+ ASSERT(MUTEX_HELD(&spa->spa_mmp.mmp_io_lock));
+ ASSERT(spa_config_held(spa, SCL_STATE, RW_READER));
+ ASSERT(list_link_active(&spa->spa_leaf_list.list_head) == B_TRUE);
+ ASSERT(!list_is_empty(&spa->spa_leaf_list));
+
+ if (spa->spa_mmp.mmp_leaf_last_gen != spa->spa_leaf_list_gen) {
+ spa->spa_mmp.mmp_last_leaf = list_head(&spa->spa_leaf_list);
+ spa->spa_mmp.mmp_leaf_last_gen = spa->spa_leaf_list_gen;
+ }
+
+ leaf = spa->spa_mmp.mmp_last_leaf;
+ if (leaf == NULL)
+ leaf = list_head(&spa->spa_leaf_list);
+ starting_leaf = leaf;
+
+ do {
+ leaf = list_next(&spa->spa_leaf_list, leaf);
+ if (leaf == NULL)
+ leaf = list_head(&spa->spa_leaf_list);
+
+ /*
+ * We skip unwritable, offline, detached, and dRAID spare
+ * devices as they are either not legal targets or the write
+ * may fail or not be seen by other hosts. Skipped dRAID
+ * spares can never be written so the fail mask is not set.
+ */
+ if (!vdev_writeable(leaf) || leaf->vdev_offline ||
+ leaf->vdev_detached) {
+ fail_mask |= MMP_FAIL_NOT_WRITABLE;
+ } else if (leaf->vdev_ops == &vdev_draid_spare_ops) {
+ continue;
+ } else if (leaf->vdev_mmp_pending != 0) {
+ fail_mask |= MMP_FAIL_WRITE_PENDING;
+ } else {
+ spa->spa_mmp.mmp_last_leaf = leaf;
+ return (0);
+ }
+ } while (leaf != starting_leaf);
+
+ ASSERT(fail_mask);
+
+ return (fail_mask);
+}
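+
+/*
+ * Illustrative sketch (not compiled here): because the return value is a
+ * bit field, a caller can tell "all writeable leaves are busy" apart from
+ * "some leaves are not writeable" without extra state:
+ *
+ *	int fail = mmp_next_leaf(spa);
+ *	if (fail & MMP_FAIL_WRITE_PENDING)
+ *		...	(a writeable leaf still has an MMP write in flight)
+ *	if (fail & MMP_FAIL_NOT_WRITABLE)
+ *		...	(at least one leaf could not be written)
+ */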
+
+/*
+ * MMP writes are issued on a fixed schedule, but may complete at variable,
+ * much longer, intervals. The mmp_delay captures long periods between
+ * successful writes for any reason, including disk latency, scheduling delays,
+ * etc.
+ *
+ * The mmp_delay is usually calculated as a decaying average, but if the latest
+ * delay is higher we do not average it, so that we do not hide sudden spikes
+ * which the importing host must wait for.
+ *
+ * If writes are occurring frequently, such as due to a high rate of txg syncs,
+ * the mmp_delay could become very small. Since those short delays depend on
+ * activity we cannot count on, we never allow mmp_delay to get lower than the
+ * rate expected if only mmp_thread writes occur.
+ *
+ * If an mmp write was skipped or fails, and we have already waited longer than
+ * mmp_delay, we need to update it so the next write reflects the longer delay.
+ *
+ * Do not set mmp_delay if the multihost property is not on, so as not to
+ * trigger an activity check on import.
+ */
+static void
+mmp_delay_update(spa_t *spa, boolean_t write_completed)
+{
+ mmp_thread_t *mts = &spa->spa_mmp;
+ hrtime_t delay = gethrtime() - mts->mmp_last_write;
+
+ ASSERT(MUTEX_HELD(&mts->mmp_io_lock));
+
+ if (spa_multihost(spa) == B_FALSE) {
+ mts->mmp_delay = 0;
+ return;
+ }
+
+ if (delay > mts->mmp_delay)
+ mts->mmp_delay = delay;
+
+ if (write_completed == B_FALSE)
+ return;
+
+ mts->mmp_last_write = gethrtime();
+
+ /*
+ * strictly less than, in case delay was changed above.
+ */
+ if (delay < mts->mmp_delay) {
+ hrtime_t min_delay =
+ MSEC2NSEC(MMP_INTERVAL_OK(zfs_multihost_interval)) /
+ MAX(1, vdev_count_leaves(spa));
+ mts->mmp_delay = MAX(((delay + mts->mmp_delay * 127) / 128),
+ min_delay);
+ }
+}
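+
+/*
+ * A minimal worked example of the decaying average above, with values
+ * assumed purely for illustration: given mmp_delay = 128 ms and a new
+ * delay of 0 ns (and ignoring the min_delay clamp),
+ *
+ *	(0 + 128 ms * 127) / 128 = 127 ms
+ *
+ * so one fast write only nudges the average down by about 1/128th, while
+ * a new delay larger than the average (say 500 ms) replaces it outright
+ * via the "delay > mts->mmp_delay" branch.
+ */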
+
+static void
+mmp_write_done(zio_t *zio)
+{
+ spa_t *spa = zio->io_spa;
+ vdev_t *vd = zio->io_vd;
+ mmp_thread_t *mts = zio->io_private;
+
+ mutex_enter(&mts->mmp_io_lock);
+ uint64_t mmp_kstat_id = vd->vdev_mmp_kstat_id;
+ hrtime_t mmp_write_duration = gethrtime() - vd->vdev_mmp_pending;
+
+ mmp_delay_update(spa, (zio->io_error == 0));
+
+ vd->vdev_mmp_pending = 0;
+ vd->vdev_mmp_kstat_id = 0;
+
+ mutex_exit(&mts->mmp_io_lock);
+ spa_config_exit(spa, SCL_STATE, mmp_tag);
+
+ spa_mmp_history_set(spa, mmp_kstat_id, zio->io_error,
+ mmp_write_duration);
+
+ abd_free(zio->io_abd);
+}
+
+/*
+ * When the on-disk uberblock is updated by a spa_sync, creating a new
+ * "best" uberblock, update the copy stored in the mmp thread state,
+ * which is used for mmp writes.
+ */
+void
+mmp_update_uberblock(spa_t *spa, uberblock_t *ub)
+{
+ mmp_thread_t *mmp = &spa->spa_mmp;
+
+ mutex_enter(&mmp->mmp_io_lock);
+ mmp->mmp_ub = *ub;
+ mmp->mmp_seq = 1;
+ mmp->mmp_ub.ub_timestamp = gethrestime_sec();
+ mmp_delay_update(spa, B_TRUE);
+ mutex_exit(&mmp->mmp_io_lock);
+}
+
+/*
+ * Choose a random vdev, label, and MMP block, and write over it
+ * with a copy of the last-synced uberblock, whose timestamp
+ * has been updated to reflect that the pool is in use.
+ */
+static void
+mmp_write_uberblock(spa_t *spa)
+{
+ int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL;
+ mmp_thread_t *mmp = &spa->spa_mmp;
+ uberblock_t *ub;
+ vdev_t *vd = NULL;
+ int label, error;
+ uint64_t offset;
+
+ hrtime_t lock_acquire_time = gethrtime();
+ spa_config_enter(spa, SCL_STATE, mmp_tag, RW_READER);
+ lock_acquire_time = gethrtime() - lock_acquire_time;
+ if (lock_acquire_time > (MSEC2NSEC(MMP_MIN_INTERVAL) / 10))
+ zfs_dbgmsg("MMP SCL_STATE acquisition pool '%s' took %llu ns "
+ "gethrtime %llu", spa_name(spa), lock_acquire_time,
+ gethrtime());
+
+ mutex_enter(&mmp->mmp_io_lock);
+
+ error = mmp_next_leaf(spa);
+
+ /*
+ * spa_mmp_history has two types of entries:
+ * Issued MMP write: records time issued, error status, etc.
+ * Skipped MMP write: an MMP write could not be issued because no
+ * suitable leaf vdev was available. See comment above struct
+ * spa_mmp_history for details.
+ */
+
+ if (error) {
+ mmp_delay_update(spa, B_FALSE);
+ if (mmp->mmp_skip_error == error) {
+ spa_mmp_history_set_skip(spa, mmp->mmp_kstat_id - 1);
+ } else {
+ mmp->mmp_skip_error = error;
+ spa_mmp_history_add(spa, mmp->mmp_ub.ub_txg,
+ gethrestime_sec(), mmp->mmp_delay, NULL, 0,
+ mmp->mmp_kstat_id++, error);
+ zfs_dbgmsg("MMP error choosing leaf pool '%s' "
+ "gethrtime %llu fail_mask %#x", spa_name(spa),
+ gethrtime(), error);
+ }
+ mutex_exit(&mmp->mmp_io_lock);
+ spa_config_exit(spa, SCL_STATE, mmp_tag);
+ return;
+ }
+
+ vd = spa->spa_mmp.mmp_last_leaf;
+ if (mmp->mmp_skip_error != 0) {
+ mmp->mmp_skip_error = 0;
+ zfs_dbgmsg("MMP write after skipping due to unavailable "
+ "leaves, pool '%s' gethrtime %llu leaf %#llu",
+ spa_name(spa), gethrtime(), vd->vdev_guid);
+ }
+
+ if (mmp->mmp_zio_root == NULL)
+ mmp->mmp_zio_root = zio_root(spa, NULL, NULL,
+ flags | ZIO_FLAG_GODFATHER);
+
+ if (mmp->mmp_ub.ub_timestamp != gethrestime_sec()) {
+ /*
+		 * We want to reset mmp_seq when the timestamp advances,
+		 * because after an mmp_seq wrap new values will not be
+		 * chosen by uberblock_compare() as the "best".
+ */
+ mmp->mmp_ub.ub_timestamp = gethrestime_sec();
+ mmp->mmp_seq = 1;
+ }
+
+ ub = &mmp->mmp_ub;
+ ub->ub_mmp_magic = MMP_MAGIC;
+ ub->ub_mmp_delay = mmp->mmp_delay;
+ ub->ub_mmp_config = MMP_SEQ_SET(mmp->mmp_seq) |
+ MMP_INTERVAL_SET(MMP_INTERVAL_OK(zfs_multihost_interval)) |
+ MMP_FAIL_INT_SET(MMP_FAIL_INTVS_OK(
+ zfs_multihost_fail_intervals));
+ vd->vdev_mmp_pending = gethrtime();
+ vd->vdev_mmp_kstat_id = mmp->mmp_kstat_id;
+
+ zio_t *zio = zio_null(mmp->mmp_zio_root, spa, NULL, NULL, NULL, flags);
+ abd_t *ub_abd = abd_alloc_for_io(VDEV_UBERBLOCK_SIZE(vd), B_TRUE);
+ abd_zero(ub_abd, VDEV_UBERBLOCK_SIZE(vd));
+ abd_copy_from_buf(ub_abd, ub, sizeof (uberblock_t));
+
+ mmp->mmp_seq++;
+ mmp->mmp_kstat_id++;
+ mutex_exit(&mmp->mmp_io_lock);
+
+ offset = VDEV_UBERBLOCK_OFFSET(vd, VDEV_UBERBLOCK_COUNT(vd) -
+ MMP_BLOCKS_PER_LABEL + spa_get_random(MMP_BLOCKS_PER_LABEL));
+
+ label = spa_get_random(VDEV_LABELS);
+ vdev_label_write(zio, vd, label, ub_abd, offset,
+ VDEV_UBERBLOCK_SIZE(vd), mmp_write_done, mmp,
+ flags | ZIO_FLAG_DONT_PROPAGATE);
+
+ (void) spa_mmp_history_add(spa, ub->ub_txg, ub->ub_timestamp,
+ ub->ub_mmp_delay, vd, label, vd->vdev_mmp_kstat_id, 0);
+
+ zio_nowait(zio);
+}
+
+static void
+mmp_thread(void *arg)
+{
+ spa_t *spa = (spa_t *)arg;
+ mmp_thread_t *mmp = &spa->spa_mmp;
+ boolean_t suspended = spa_suspended(spa);
+ boolean_t multihost = spa_multihost(spa);
+ uint64_t mmp_interval = MSEC2NSEC(MMP_INTERVAL_OK(
+ zfs_multihost_interval));
+ uint32_t mmp_fail_intervals = MMP_FAIL_INTVS_OK(
+ zfs_multihost_fail_intervals);
+ hrtime_t mmp_fail_ns = mmp_fail_intervals * mmp_interval;
+ boolean_t last_spa_suspended = suspended;
+ boolean_t last_spa_multihost = multihost;
+ uint64_t last_mmp_interval = mmp_interval;
+ uint32_t last_mmp_fail_intervals = mmp_fail_intervals;
+ hrtime_t last_mmp_fail_ns = mmp_fail_ns;
+ callb_cpr_t cpr;
+ int skip_wait = 0;
+
+ mmp_thread_enter(mmp, &cpr);
+
+ /*
+ * There have been no MMP writes yet. Setting mmp_last_write here gives
+ * us one mmp_fail_ns period, which is consistent with the activity
+ * check duration, to try to land an MMP write before MMP suspends the
+ * pool (if so configured).
+ */
+
+ mutex_enter(&mmp->mmp_io_lock);
+ mmp->mmp_last_write = gethrtime();
+ mmp->mmp_delay = MSEC2NSEC(MMP_INTERVAL_OK(zfs_multihost_interval));
+ mutex_exit(&mmp->mmp_io_lock);
+
+ while (!mmp->mmp_thread_exiting) {
+ hrtime_t next_time = gethrtime() +
+ MSEC2NSEC(MMP_DEFAULT_INTERVAL);
+ int leaves = MAX(vdev_count_leaves(spa), 1);
+
+ /* Detect changes in tunables or state */
+
+ last_spa_suspended = suspended;
+ last_spa_multihost = multihost;
+ suspended = spa_suspended(spa);
+ multihost = spa_multihost(spa);
+
+ last_mmp_interval = mmp_interval;
+ last_mmp_fail_intervals = mmp_fail_intervals;
+ last_mmp_fail_ns = mmp_fail_ns;
+ mmp_interval = MSEC2NSEC(MMP_INTERVAL_OK(
+ zfs_multihost_interval));
+ mmp_fail_intervals = MMP_FAIL_INTVS_OK(
+ zfs_multihost_fail_intervals);
+
+ /* Smooth so pool is not suspended when reducing tunables */
+ if (mmp_fail_intervals * mmp_interval < mmp_fail_ns) {
+ mmp_fail_ns = (mmp_fail_ns * 31 +
+ mmp_fail_intervals * mmp_interval) / 32;
+ } else {
+ mmp_fail_ns = mmp_fail_intervals *
+ mmp_interval;
+ }
+
+ if (mmp_interval != last_mmp_interval ||
+ mmp_fail_intervals != last_mmp_fail_intervals) {
+ /*
+ * We want other hosts to see new tunables as quickly as
+ * possible. Write out at higher frequency than usual.
+ */
+ skip_wait += leaves;
+ }
+
+ if (multihost)
+ next_time = gethrtime() + mmp_interval / leaves;
+
+ if (mmp_fail_ns != last_mmp_fail_ns) {
+ zfs_dbgmsg("MMP interval change pool '%s' "
+ "gethrtime %llu last_mmp_interval %llu "
+ "mmp_interval %llu last_mmp_fail_intervals %u "
+ "mmp_fail_intervals %u mmp_fail_ns %llu "
+ "skip_wait %d leaves %d next_time %llu",
+ spa_name(spa), gethrtime(), last_mmp_interval,
+ mmp_interval, last_mmp_fail_intervals,
+ mmp_fail_intervals, mmp_fail_ns, skip_wait, leaves,
+ next_time);
+ }
+
+ /*
+ * MMP off => on, or suspended => !suspended:
+ * No writes occurred recently. Update mmp_last_write to give
+ * us some time to try.
+ */
+ if ((!last_spa_multihost && multihost) ||
+ (last_spa_suspended && !suspended)) {
+ zfs_dbgmsg("MMP state change pool '%s': gethrtime %llu "
+ "last_spa_multihost %u multihost %u "
+ "last_spa_suspended %u suspended %u",
+			    spa_name(spa), gethrtime(), last_spa_multihost,
+			    multihost, last_spa_suspended, suspended);
+ mutex_enter(&mmp->mmp_io_lock);
+ mmp->mmp_last_write = gethrtime();
+ mmp->mmp_delay = mmp_interval;
+ mutex_exit(&mmp->mmp_io_lock);
+ }
+
+ /*
+ * MMP on => off:
+ * mmp_delay == 0 tells importing node to skip activity check.
+ */
+ if (last_spa_multihost && !multihost) {
+ mutex_enter(&mmp->mmp_io_lock);
+ mmp->mmp_delay = 0;
+ mutex_exit(&mmp->mmp_io_lock);
+ }
+
+ /*
+ * Suspend the pool if no MMP write has succeeded in over
+ * mmp_interval * mmp_fail_intervals nanoseconds.
+ */
+ if (multihost && !suspended && mmp_fail_intervals &&
+ (gethrtime() - mmp->mmp_last_write) > mmp_fail_ns) {
+ zfs_dbgmsg("MMP suspending pool '%s': gethrtime %llu "
+ "mmp_last_write %llu mmp_interval %llu "
+ "mmp_fail_intervals %llu mmp_fail_ns %llu",
+ spa_name(spa), (u_longlong_t)gethrtime(),
+ (u_longlong_t)mmp->mmp_last_write,
+ (u_longlong_t)mmp_interval,
+ (u_longlong_t)mmp_fail_intervals,
+ (u_longlong_t)mmp_fail_ns);
+ cmn_err(CE_WARN, "MMP writes to pool '%s' have not "
+ "succeeded in over %llu ms; suspending pool. "
+ "Hrtime %llu",
+ spa_name(spa),
+ NSEC2MSEC(gethrtime() - mmp->mmp_last_write),
+ gethrtime());
+ zio_suspend(spa, NULL, ZIO_SUSPEND_MMP);
+ }
+
+ if (multihost && !suspended)
+ mmp_write_uberblock(spa);
+
+ if (skip_wait > 0) {
+ next_time = gethrtime() + MSEC2NSEC(MMP_MIN_INTERVAL) /
+ leaves;
+ skip_wait--;
+ }
+
+ CALLB_CPR_SAFE_BEGIN(&cpr);
+ (void) cv_timedwait_idle_hires(&mmp->mmp_thread_cv,
+ &mmp->mmp_thread_lock, next_time, USEC2NSEC(100),
+ CALLOUT_FLAG_ABSOLUTE);
+ CALLB_CPR_SAFE_END(&cpr, &mmp->mmp_thread_lock);
+ }
+
+ /* Outstanding writes are allowed to complete. */
+ zio_wait(mmp->mmp_zio_root);
+
+ mmp->mmp_zio_root = NULL;
+ mmp_thread_exit(mmp, &mmp->mmp_thread, &cpr);
+}
+
+/*
+ * Signal the MMP thread to wake it when it is sleeping on
+ * its cv. Used when some module parameter has changed and
+ * we want the thread to know about it.
+ * Only signal if the pool is active and the mmp thread is
+ * running, otherwise there is no thread to wake.
+ */
+static void
+mmp_signal_thread(spa_t *spa)
+{
+ mmp_thread_t *mmp = &spa->spa_mmp;
+
+ mutex_enter(&mmp->mmp_thread_lock);
+ if (mmp->mmp_thread)
+ cv_broadcast(&mmp->mmp_thread_cv);
+ mutex_exit(&mmp->mmp_thread_lock);
+}
+
+void
+mmp_signal_all_threads(void)
+{
+ spa_t *spa = NULL;
+
+ mutex_enter(&spa_namespace_lock);
+ while ((spa = spa_next(spa))) {
+ if (spa->spa_state == POOL_STATE_ACTIVE)
+ mmp_signal_thread(spa);
+ }
+ mutex_exit(&spa_namespace_lock);
+}
+
+/* BEGIN CSTYLED */
+ZFS_MODULE_PARAM_CALL(zfs_multihost, zfs_multihost_, interval,
+ param_set_multihost_interval, param_get_ulong, ZMOD_RW,
+ "Milliseconds between mmp writes to each leaf");
+/* END CSTYLED */
+
+ZFS_MODULE_PARAM(zfs_multihost, zfs_multihost_, fail_intervals, UINT, ZMOD_RW,
+ "Max allowed period without a successful mmp write");
+
+ZFS_MODULE_PARAM(zfs_multihost, zfs_multihost_, import_intervals, UINT, ZMOD_RW,
+ "Number of zfs_multihost_interval periods to wait for activity");
diff --git a/sys/contrib/openzfs/module/zfs/multilist.c b/sys/contrib/openzfs/module/zfs/multilist.c
new file mode 100644
index 000000000000..36c0d33bf1f6
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/multilist.c
@@ -0,0 +1,434 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2013, 2017 by Delphix. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/multilist.h>
+#include <sys/trace_zfs.h>
+
+/* needed for spa_get_random() */
+#include <sys/spa.h>
+
+/*
+ * This overrides the number of sublists in each multilist_t, which defaults
+ * to the number of CPUs in the system (see multilist_create()).
+ */
+int zfs_multilist_num_sublists = 0;
+
+/*
+ * Given an object contained on the list, return a pointer to the
+ * multilist_node_t structure embedded within it.
+ */
+#ifdef ZFS_DEBUG
+static multilist_node_t *
+multilist_d2l(multilist_t *ml, void *obj)
+{
+ return ((multilist_node_t *)((char *)obj + ml->ml_offset));
+}
+#endif
+
+/*
+ * Initialize a new multilist using the parameters specified.
+ *
+ * - 'size' denotes the size of the structure containing the
+ * multilist_node_t.
+ * - 'offset' denotes the byte offset of the multilist_node_t within
+ * the structure that contains it.
+ * - 'num' specifies the number of internal sublists to create.
+ * - 'index_func' is used to determine which sublist to insert into
+ * when the multilist_insert() function is called, as well as which
+ * sublist to remove from when multilist_remove() is called. The
+ * requirements this function must meet are the following:
+ *
+ * - It must always return the same value when called on the same
+ * object (to ensure the object is removed from the list it was
+ * inserted into).
+ *
+ * - It must return a value in the range [0, number of sublists).
+ * The multilist_get_num_sublists() function may be used to
+ * determine the number of sublists in the multilist.
+ *
+ * Also, in order to reduce internal contention between the sublists
+ * during insertion and removal, this function should choose evenly
+ * between all available sublists when inserting. This isn't a hard
+ * requirement, but a general rule of thumb in order to garner the
+ * best multi-threaded performance out of the data structure.
+ */
+static multilist_t *
+multilist_create_impl(size_t size, size_t offset,
+ unsigned int num, multilist_sublist_index_func_t *index_func)
+{
+ ASSERT3U(size, >, 0);
+ ASSERT3U(size, >=, offset + sizeof (multilist_node_t));
+ ASSERT3U(num, >, 0);
+ ASSERT3P(index_func, !=, NULL);
+
+ multilist_t *ml = kmem_alloc(sizeof (*ml), KM_SLEEP);
+ ml->ml_offset = offset;
+ ml->ml_num_sublists = num;
+ ml->ml_index_func = index_func;
+
+ ml->ml_sublists = kmem_zalloc(sizeof (multilist_sublist_t) *
+ ml->ml_num_sublists, KM_SLEEP);
+
+ ASSERT3P(ml->ml_sublists, !=, NULL);
+
+ for (int i = 0; i < ml->ml_num_sublists; i++) {
+ multilist_sublist_t *mls = &ml->ml_sublists[i];
+ mutex_init(&mls->mls_lock, NULL, MUTEX_NOLOCKDEP, NULL);
+ list_create(&mls->mls_list, size, offset);
+ }
+ return (ml);
+}
+
+/*
+ * Allocate a new multilist, using the default number of sublists (the number
+ * of CPUs, or at least 4, or the tunable zfs_multilist_num_sublists). Note
+ * that the multilists do not expand if more CPUs are hot-added. In that case,
+ * we will have less fanout than boot_ncpus, but we don't want to always
+ * reserve the RAM necessary to create the extra slots for additional CPUs up
+ * front, and dynamically adding them is a complex task.
+ */
+multilist_t *
+multilist_create(size_t size, size_t offset,
+ multilist_sublist_index_func_t *index_func)
+{
+ int num_sublists;
+
+ if (zfs_multilist_num_sublists > 0) {
+ num_sublists = zfs_multilist_num_sublists;
+ } else {
+ num_sublists = MAX(boot_ncpus, 4);
+ }
+
+ return (multilist_create_impl(size, offset, num_sublists, index_func));
+}
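+
+/*
+ * Illustrative sketch (hypothetical usage): a minimal index function that
+ * satisfies the requirements described above multilist_create_impl(), for a
+ * made-up my_obj_t that embeds a multilist_node_t.  It derives a stable
+ * sublist index from the object's address, so the same object always maps
+ * to the same sublist and the result is always within range:
+ *
+ *	typedef struct my_obj {
+ *		uint64_t		mo_id;
+ *		multilist_node_t	mo_node;
+ *	} my_obj_t;
+ *
+ *	static unsigned int
+ *	my_obj_index_func(multilist_t *ml, void *obj)
+ *	{
+ *		return (((uintptr_t)obj >> 7) %
+ *		    multilist_get_num_sublists(ml));
+ *	}
+ *
+ *	multilist_t *ml = multilist_create(sizeof (my_obj_t),
+ *	    offsetof(my_obj_t, mo_node), my_obj_index_func);
+ */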
+
+/*
+ * Destroy the given multilist object, and free up any memory it holds.
+ */
+void
+multilist_destroy(multilist_t *ml)
+{
+ ASSERT(multilist_is_empty(ml));
+
+ for (int i = 0; i < ml->ml_num_sublists; i++) {
+ multilist_sublist_t *mls = &ml->ml_sublists[i];
+
+ ASSERT(list_is_empty(&mls->mls_list));
+
+ list_destroy(&mls->mls_list);
+ mutex_destroy(&mls->mls_lock);
+ }
+
+ ASSERT3P(ml->ml_sublists, !=, NULL);
+ kmem_free(ml->ml_sublists,
+ sizeof (multilist_sublist_t) * ml->ml_num_sublists);
+
+ ml->ml_num_sublists = 0;
+ ml->ml_offset = 0;
+ kmem_free(ml, sizeof (multilist_t));
+}
+
+/*
+ * Insert the given object into the multilist.
+ *
+ * This function will insert the object specified into the sublist
+ * determined using the function given at multilist creation time.
+ *
+ * The sublist locks are automatically acquired if not already held, to
+ * ensure consistency when inserting and removing from multiple threads.
+ */
+void
+multilist_insert(multilist_t *ml, void *obj)
+{
+ unsigned int sublist_idx = ml->ml_index_func(ml, obj);
+ multilist_sublist_t *mls;
+ boolean_t need_lock;
+
+ DTRACE_PROBE3(multilist__insert, multilist_t *, ml,
+ unsigned int, sublist_idx, void *, obj);
+
+ ASSERT3U(sublist_idx, <, ml->ml_num_sublists);
+
+ mls = &ml->ml_sublists[sublist_idx];
+
+ /*
+ * Note: Callers may already hold the sublist lock by calling
+ * multilist_sublist_lock(). Here we rely on MUTEX_HELD()
+ * returning TRUE if and only if the current thread holds the
+ * lock. While it's a little ugly to make the lock recursive in
+ * this way, it works and allows the calling code to be much
+ * simpler -- otherwise it would have to pass around a flag
+ * indicating that it already has the lock.
+ */
+ need_lock = !MUTEX_HELD(&mls->mls_lock);
+
+ if (need_lock)
+ mutex_enter(&mls->mls_lock);
+
+ ASSERT(!multilist_link_active(multilist_d2l(ml, obj)));
+
+ multilist_sublist_insert_head(mls, obj);
+
+ if (need_lock)
+ mutex_exit(&mls->mls_lock);
+}
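+
+/*
+ * Illustrative sketch (hypothetical usage): because multilist_insert()
+ * detects an already-held sublist lock via MUTEX_HELD(), a caller that has
+ * explicitly locked the object's sublist may still use the generic insert
+ * path without deadlocking:
+ *
+ *	multilist_sublist_t *mls = multilist_sublist_lock_obj(ml, obj);
+ *	multilist_insert(ml, obj);	lock is detected as held, not retaken
+ *	multilist_sublist_unlock(mls);
+ */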
+
+/*
+ * Remove the given object from the multilist.
+ *
+ * This function will remove the object specified from the sublist
+ * determined using the function given at multilist creation time.
+ *
+ * The necessary sublist locks are automatically acquired, to ensure
+ * consistency when inserting and removing from multiple threads.
+ */
+void
+multilist_remove(multilist_t *ml, void *obj)
+{
+ unsigned int sublist_idx = ml->ml_index_func(ml, obj);
+ multilist_sublist_t *mls;
+ boolean_t need_lock;
+
+ DTRACE_PROBE3(multilist__remove, multilist_t *, ml,
+ unsigned int, sublist_idx, void *, obj);
+
+ ASSERT3U(sublist_idx, <, ml->ml_num_sublists);
+
+ mls = &ml->ml_sublists[sublist_idx];
+ /* See comment in multilist_insert(). */
+ need_lock = !MUTEX_HELD(&mls->mls_lock);
+
+ if (need_lock)
+ mutex_enter(&mls->mls_lock);
+
+ ASSERT(multilist_link_active(multilist_d2l(ml, obj)));
+
+ multilist_sublist_remove(mls, obj);
+
+ if (need_lock)
+ mutex_exit(&mls->mls_lock);
+}
+
+/*
+ * Check to see if this multilist object is empty.
+ *
+ * This will return TRUE if it finds all of the sublists of this
+ * multilist to be empty, and FALSE otherwise. Each sublist lock will be
+ * automatically acquired as necessary.
+ *
+ * If concurrent insertions and removals are occurring, the semantics
+ * of this function become a little fuzzy. Instead of locking all
+ * sublists for the entire call time of the function, each sublist is
+ * only locked as it is individually checked for emptiness. Thus, it's
+ * possible for this function to return TRUE with non-empty sublists at
+ * the time the function returns. This would be due to another thread
+ * inserting into a given sublist, after that specific sublist was checked
+ * and deemed empty, but before all sublists have been checked.
+ */
+int
+multilist_is_empty(multilist_t *ml)
+{
+ for (int i = 0; i < ml->ml_num_sublists; i++) {
+ multilist_sublist_t *mls = &ml->ml_sublists[i];
+ /* See comment in multilist_insert(). */
+ boolean_t need_lock = !MUTEX_HELD(&mls->mls_lock);
+
+ if (need_lock)
+ mutex_enter(&mls->mls_lock);
+
+ if (!list_is_empty(&mls->mls_list)) {
+ if (need_lock)
+ mutex_exit(&mls->mls_lock);
+
+ return (FALSE);
+ }
+
+ if (need_lock)
+ mutex_exit(&mls->mls_lock);
+ }
+
+ return (TRUE);
+}
+
+/* Return the number of sublists composing this multilist */
+unsigned int
+multilist_get_num_sublists(multilist_t *ml)
+{
+ return (ml->ml_num_sublists);
+}
+
+/* Return a randomly selected, valid sublist index for this multilist */
+unsigned int
+multilist_get_random_index(multilist_t *ml)
+{
+ return (spa_get_random(ml->ml_num_sublists));
+}
+
+/* Lock and return the sublist specified at the given index */
+multilist_sublist_t *
+multilist_sublist_lock(multilist_t *ml, unsigned int sublist_idx)
+{
+ multilist_sublist_t *mls;
+
+ ASSERT3U(sublist_idx, <, ml->ml_num_sublists);
+ mls = &ml->ml_sublists[sublist_idx];
+ mutex_enter(&mls->mls_lock);
+
+ return (mls);
+}
+
+/* Lock and return the sublist that would be used to store the specified obj */
+multilist_sublist_t *
+multilist_sublist_lock_obj(multilist_t *ml, void *obj)
+{
+ return (multilist_sublist_lock(ml, ml->ml_index_func(ml, obj)));
+}
+
+void
+multilist_sublist_unlock(multilist_sublist_t *mls)
+{
+ mutex_exit(&mls->mls_lock);
+}
+
+/*
+ * We're allowing any object to be inserted into this specific sublist,
+ * but this can lead to trouble if multilist_remove() is called to
+ * remove this object. Specifically, if calling ml_index_func on this
+ * object returns an index for a sublist different from the one passed as
+ * a parameter here, any call to multilist_remove() with this newly
+ * inserted object is undefined! (the call to multilist_remove() will
+ * remove the object from a list that it isn't contained in)
+ */
+void
+multilist_sublist_insert_head(multilist_sublist_t *mls, void *obj)
+{
+ ASSERT(MUTEX_HELD(&mls->mls_lock));
+ list_insert_head(&mls->mls_list, obj);
+}
+
+/* Please see the comment above multilist_sublist_insert_head(). */
+void
+multilist_sublist_insert_tail(multilist_sublist_t *mls, void *obj)
+{
+ ASSERT(MUTEX_HELD(&mls->mls_lock));
+ list_insert_tail(&mls->mls_list, obj);
+}
+
+/*
+ * Move the object one element forward in the list.
+ *
+ * This function will move the given object forward in the list (towards
+ * the head) by one object. So, in essence, it will swap its position in
+ * the list with its "prev" pointer. If the given object is already at the
+ * head of the list, it cannot be moved forward any more than it already
+ * is, so no action is taken.
+ *
+ * NOTE: This function **must not** remove any object from the list other
+ * than the object given as the parameter. This is relied upon in
+ * arc_evict_state_impl().
+ */
+void
+multilist_sublist_move_forward(multilist_sublist_t *mls, void *obj)
+{
+ void *prev = list_prev(&mls->mls_list, obj);
+
+ ASSERT(MUTEX_HELD(&mls->mls_lock));
+ ASSERT(!list_is_empty(&mls->mls_list));
+
+ /* 'obj' must be at the head of the list, nothing to do */
+ if (prev == NULL)
+ return;
+
+ list_remove(&mls->mls_list, obj);
+ list_insert_before(&mls->mls_list, prev, obj);
+}
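+
+/*
+ * Illustrative example: with a sublist ordered (head) A, B, C (tail),
+ * multilist_sublist_move_forward(mls, C) swaps C with its "prev" element,
+ * yielding A, C, B.  Calling it on A is a no-op, since A is already at
+ * the head.
+ */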
+
+void
+multilist_sublist_remove(multilist_sublist_t *mls, void *obj)
+{
+ ASSERT(MUTEX_HELD(&mls->mls_lock));
+ list_remove(&mls->mls_list, obj);
+}
+
+int
+multilist_sublist_is_empty(multilist_sublist_t *mls)
+{
+ ASSERT(MUTEX_HELD(&mls->mls_lock));
+ return (list_is_empty(&mls->mls_list));
+}
+
+int
+multilist_sublist_is_empty_idx(multilist_t *ml, unsigned int sublist_idx)
+{
+ multilist_sublist_t *mls;
+ int empty;
+
+ ASSERT3U(sublist_idx, <, ml->ml_num_sublists);
+ mls = &ml->ml_sublists[sublist_idx];
+ ASSERT(!MUTEX_HELD(&mls->mls_lock));
+ mutex_enter(&mls->mls_lock);
+ empty = list_is_empty(&mls->mls_list);
+ mutex_exit(&mls->mls_lock);
+ return (empty);
+}
+
+void *
+multilist_sublist_head(multilist_sublist_t *mls)
+{
+ ASSERT(MUTEX_HELD(&mls->mls_lock));
+ return (list_head(&mls->mls_list));
+}
+
+void *
+multilist_sublist_tail(multilist_sublist_t *mls)
+{
+ ASSERT(MUTEX_HELD(&mls->mls_lock));
+ return (list_tail(&mls->mls_list));
+}
+
+void *
+multilist_sublist_next(multilist_sublist_t *mls, void *obj)
+{
+ ASSERT(MUTEX_HELD(&mls->mls_lock));
+ return (list_next(&mls->mls_list, obj));
+}
+
+void *
+multilist_sublist_prev(multilist_sublist_t *mls, void *obj)
+{
+ ASSERT(MUTEX_HELD(&mls->mls_lock));
+ return (list_prev(&mls->mls_list, obj));
+}
+
+void
+multilist_link_init(multilist_node_t *link)
+{
+ list_link_init(link);
+}
+
+int
+multilist_link_active(multilist_node_t *link)
+{
+ return (list_link_active(link));
+}
+
+/* BEGIN CSTYLED */
+ZFS_MODULE_PARAM(zfs, zfs_, multilist_num_sublists, INT, ZMOD_RW,
+ "Number of sublists used in each multilist");
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/zfs/objlist.c b/sys/contrib/openzfs/module/zfs/objlist.c
new file mode 100644
index 000000000000..c80bab2a77bd
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/objlist.c
@@ -0,0 +1,84 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2018 by Delphix. All rights reserved.
+ */
+
+#include <sys/objlist.h>
+#include <sys/zfs_context.h>
+
+objlist_t *
+objlist_create(void)
+{
+ objlist_t *list = kmem_alloc(sizeof (*list), KM_SLEEP);
+ list_create(&list->ol_list, sizeof (objlist_node_t),
+ offsetof(objlist_node_t, on_node));
+ list->ol_last_lookup = 0;
+ return (list);
+}
+
+void
+objlist_destroy(objlist_t *list)
+{
+ for (objlist_node_t *n = list_remove_head(&list->ol_list);
+ n != NULL; n = list_remove_head(&list->ol_list)) {
+ kmem_free(n, sizeof (*n));
+ }
+ list_destroy(&list->ol_list);
+ kmem_free(list, sizeof (*list));
+}
+
+/*
+ * This function looks through the objlist to see if the specified object number
+ * is contained in the objlist. In the process, it will remove all object
+ * numbers in the list that are smaller than the specified object number. Thus,
+ * any lookup of an object number smaller than a previously looked up object
+ * number will always return false; therefore, all lookups should be done in
+ * ascending order.
+ */
+boolean_t
+objlist_exists(objlist_t *list, uint64_t object)
+{
+ objlist_node_t *node = list_head(&list->ol_list);
+ ASSERT3U(object, >=, list->ol_last_lookup);
+ list->ol_last_lookup = object;
+ while (node != NULL && node->on_object < object) {
+ VERIFY3P(node, ==, list_remove_head(&list->ol_list));
+ kmem_free(node, sizeof (*node));
+ node = list_head(&list->ol_list);
+ }
+ return (node != NULL && node->on_object == object);
+}
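+
+/*
+ * Illustrative sketch (hypothetical usage): because lookups prune every
+ * object number below the one being queried, both inserts and lookups must
+ * be issued in ascending order:
+ *
+ *	objlist_t *ol = objlist_create();
+ *	objlist_insert(ol, 5);
+ *	objlist_insert(ol, 9);
+ *	(void) objlist_exists(ol, 5);	B_TRUE
+ *	(void) objlist_exists(ol, 7);	B_FALSE; 5 is pruned from the list
+ *	(void) objlist_exists(ol, 9);	B_TRUE
+ *	objlist_destroy(ol);
+ */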
+
+/*
+ * The objlist is a list of object numbers stored in ascending order. However,
+ * the insertion of new object numbers does not seek out the correct location to
+ * store a new object number; instead, it appends it to the list for simplicity.
+ * Thus, callers must take care to insert new object numbers only in ascending
+ * order.
+ */
+void
+objlist_insert(objlist_t *list, uint64_t object)
+{
+ objlist_node_t *node = kmem_zalloc(sizeof (*node), KM_SLEEP);
+ node->on_object = object;
+#ifdef ZFS_DEBUG
+ objlist_node_t *last_object = list_tail(&list->ol_list);
+ uint64_t last_objnum = (last_object != NULL ? last_object->on_object :
+ 0);
+ ASSERT3U(node->on_object, >, last_objnum);
+#endif
+ list_insert_tail(&list->ol_list, node);
+}
diff --git a/sys/contrib/openzfs/module/zfs/pathname.c b/sys/contrib/openzfs/module/zfs/pathname.c
new file mode 100644
index 000000000000..84ab7b7e1111
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/pathname.c
@@ -0,0 +1,88 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
+
+/*
+ * University Copyright- Copyright (c) 1982, 1986, 1988
+ * The Regents of the University of California
+ * All Rights Reserved
+ *
+ * University Acknowledgment- Portions of this document are derived from
+ * software developed by the University of California, Berkeley, and its
+ * contributors.
+ */
+
+
+#include <sys/types.h>
+#include <sys/pathname.h>
+#include <sys/kmem.h>
+#include <sys/sysmacros.h>
+
+/*
+ * Pathname utilities.
+ *
+ * In translating file names we copy each argument file
+ * name into a pathname structure where we operate on it.
+ * Each pathname structure can hold "pn_bufsize" characters
+ * including a terminating null, and operations here support
+ * allocating and freeing pathname structures, fetching
+ * strings from user space, getting the next character from
+ * a pathname, combining two pathnames (used in symbolic
+ * link processing), and peeling off the first component
+ * of a pathname.
+ */
+
+/*
+ * Allocate contents of pathname structure. Structure is typically
+ * an automatic variable in calling routine for convenience.
+ *
+ * May sleep in the call to kmem_alloc() and so must not be called
+ * from interrupt level.
+ */
+void
+pn_alloc(struct pathname *pnp)
+{
+ pn_alloc_sz(pnp, MAXPATHLEN);
+}
+
+void
+pn_alloc_sz(struct pathname *pnp, size_t sz)
+{
+ pnp->pn_buf = kmem_alloc(sz, KM_SLEEP);
+ pnp->pn_bufsize = sz;
+}
+
+/*
+ * Free pathname resources.
+ */
+void
+pn_free(struct pathname *pnp)
+{
+ /* pn_bufsize is usually MAXPATHLEN, but may not be */
+ kmem_free(pnp->pn_buf, pnp->pn_bufsize);
+ pnp->pn_buf = NULL;
+ pnp->pn_bufsize = 0;
+}
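+
+/*
+ * Illustrative sketch (hypothetical usage): typical on-stack use of the
+ * helpers above.  The path string and the strlcpy() call are made up for
+ * the example:
+ *
+ *	struct pathname pn;
+ *
+ *	pn_alloc(&pn);				MAXPATHLEN-byte buffer
+ *	(void) strlcpy(pn.pn_buf, "/some/path", pn.pn_bufsize);
+ *	...
+ *	pn_free(&pn);				frees pn_bufsize bytes
+ */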
diff --git a/sys/contrib/openzfs/module/zfs/range_tree.c b/sys/contrib/openzfs/module/zfs/range_tree.c
new file mode 100644
index 000000000000..5219fd079b73
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/range_tree.c
@@ -0,0 +1,922 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * Copyright (c) 2013, 2019 by Delphix. All rights reserved.
+ * Copyright (c) 2015, Nexenta Systems, Inc. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/dmu.h>
+#include <sys/dnode.h>
+#include <sys/zio.h>
+#include <sys/range_tree.h>
+
+/*
+ * Range trees are tree-based data structures that can be used to
+ * track free space or generally any space allocation information.
+ * A range tree keeps track of individual segments and automatically
+ * provides facilities such as adjacent extent merging and extent
+ * splitting in response to range add/remove requests.
+ *
+ * A range tree starts out completely empty, with no segments in it.
+ * Adding an allocation via range_tree_add to the range tree can either:
+ * 1) create a new extent
+ * 2) extend an adjacent extent
+ * 3) merge two adjacent extents
+ * Conversely, removing an allocation via range_tree_remove can:
+ * 1) completely remove an extent
+ * 2) shorten an extent (if the allocation was near one of its ends)
+ * 3) split an extent into two extents, in effect punching a hole
+ *
+ * A range tree is also capable of 'bridging' gaps when adding
+ * allocations. This is useful for cases when close proximity of
+ * allocations is an important detail that needs to be represented
+ * in the range tree. See range_tree_set_gap(). The default behavior
+ * is not to bridge gaps (i.e. the maximum allowed gap size is 0).
+ *
+ * In order to traverse a range tree, use either the range_tree_walk()
+ * or range_tree_vacate() functions.
+ *
+ * To obtain more accurate information on individual segment
+ * operations that the range tree performs "under the hood", you can
+ * specify a set of callbacks by passing a range_tree_ops_t structure
+ * to the range_tree_create function. Any callbacks that are non-NULL
+ * are then called at the appropriate times.
+ *
+ * The range tree code also supports a special variant of range trees
+ * that can bridge small gaps between segments. This kind of tree is used
+ * by the dsl scanning code to group I/Os into mostly sequential chunks to
+ * optimize disk performance. The code here attempts to do this with as
+ * little memory and computational overhead as possible. One limitation of
+ * this implementation is that segments of range trees with gaps can only
+ * support removing complete segments.
+ */
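+
+/*
+ * Illustrative sketch (hypothetical usage): how adds and removes merge and
+ * split segments in an ordinary (gap == 0) tree.  The offsets are made up
+ * for the example:
+ *
+ *	range_tree_t *rt = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0);
+ *
+ *	range_tree_add(rt, 0x1000, 0x1000);	creates [0x1000, 0x2000)
+ *	range_tree_add(rt, 0x2000, 0x1000);	merges into [0x1000, 0x3000)
+ *	range_tree_remove(rt, 0x1800, 0x400);	splits into [0x1000, 0x1800)
+ *						and [0x1c00, 0x3000)
+ *
+ *	range_tree_vacate(rt, NULL, NULL);	empty it before destroying
+ *	range_tree_destroy(rt);
+ */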
+
+static inline void
+rs_copy(range_seg_t *src, range_seg_t *dest, range_tree_t *rt)
+{
+ ASSERT3U(rt->rt_type, <=, RANGE_SEG_NUM_TYPES);
+ size_t size = 0;
+ switch (rt->rt_type) {
+ case RANGE_SEG32:
+ size = sizeof (range_seg32_t);
+ break;
+ case RANGE_SEG64:
+ size = sizeof (range_seg64_t);
+ break;
+ case RANGE_SEG_GAP:
+ size = sizeof (range_seg_gap_t);
+ break;
+ default:
+ VERIFY(0);
+ }
+ bcopy(src, dest, size);
+}
+
+void
+range_tree_stat_verify(range_tree_t *rt)
+{
+ range_seg_t *rs;
+ zfs_btree_index_t where;
+ uint64_t hist[RANGE_TREE_HISTOGRAM_SIZE] = { 0 };
+ int i;
+
+ for (rs = zfs_btree_first(&rt->rt_root, &where); rs != NULL;
+ rs = zfs_btree_next(&rt->rt_root, &where, &where)) {
+ uint64_t size = rs_get_end(rs, rt) - rs_get_start(rs, rt);
+ int idx = highbit64(size) - 1;
+
+ hist[idx]++;
+ ASSERT3U(hist[idx], !=, 0);
+ }
+
+ for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) {
+ if (hist[i] != rt->rt_histogram[i]) {
+ zfs_dbgmsg("i=%d, hist=%px, hist=%llu, rt_hist=%llu",
+ i, hist, hist[i], rt->rt_histogram[i]);
+ }
+ VERIFY3U(hist[i], ==, rt->rt_histogram[i]);
+ }
+}
+
+static void
+range_tree_stat_incr(range_tree_t *rt, range_seg_t *rs)
+{
+ uint64_t size = rs_get_end(rs, rt) - rs_get_start(rs, rt);
+ int idx = highbit64(size) - 1;
+
+ ASSERT(size != 0);
+ ASSERT3U(idx, <,
+ sizeof (rt->rt_histogram) / sizeof (*rt->rt_histogram));
+
+ rt->rt_histogram[idx]++;
+ ASSERT3U(rt->rt_histogram[idx], !=, 0);
+}
+
+static void
+range_tree_stat_decr(range_tree_t *rt, range_seg_t *rs)
+{
+ uint64_t size = rs_get_end(rs, rt) - rs_get_start(rs, rt);
+ int idx = highbit64(size) - 1;
+
+ ASSERT(size != 0);
+ ASSERT3U(idx, <,
+ sizeof (rt->rt_histogram) / sizeof (*rt->rt_histogram));
+
+ ASSERT3U(rt->rt_histogram[idx], !=, 0);
+ rt->rt_histogram[idx]--;
+}
+
+static int
+range_tree_seg32_compare(const void *x1, const void *x2)
+{
+ const range_seg32_t *r1 = x1;
+ const range_seg32_t *r2 = x2;
+
+ ASSERT3U(r1->rs_start, <=, r1->rs_end);
+ ASSERT3U(r2->rs_start, <=, r2->rs_end);
+
+ return ((r1->rs_start >= r2->rs_end) - (r1->rs_end <= r2->rs_start));
+}
+
+static int
+range_tree_seg64_compare(const void *x1, const void *x2)
+{
+ const range_seg64_t *r1 = x1;
+ const range_seg64_t *r2 = x2;
+
+ ASSERT3U(r1->rs_start, <=, r1->rs_end);
+ ASSERT3U(r2->rs_start, <=, r2->rs_end);
+
+ return ((r1->rs_start >= r2->rs_end) - (r1->rs_end <= r2->rs_start));
+}
+
+static int
+range_tree_seg_gap_compare(const void *x1, const void *x2)
+{
+ const range_seg_gap_t *r1 = x1;
+ const range_seg_gap_t *r2 = x2;
+
+ ASSERT3U(r1->rs_start, <=, r1->rs_end);
+ ASSERT3U(r2->rs_start, <=, r2->rs_end);
+
+ return ((r1->rs_start >= r2->rs_end) - (r1->rs_end <= r2->rs_start));
+}
+
+range_tree_t *
+range_tree_create_impl(range_tree_ops_t *ops, range_seg_type_t type, void *arg,
+ uint64_t start, uint64_t shift,
+ int (*zfs_btree_compare) (const void *, const void *),
+ uint64_t gap)
+{
+ range_tree_t *rt = kmem_zalloc(sizeof (range_tree_t), KM_SLEEP);
+
+ ASSERT3U(shift, <, 64);
+ ASSERT3U(type, <=, RANGE_SEG_NUM_TYPES);
+ size_t size;
+ int (*compare) (const void *, const void *);
+ switch (type) {
+ case RANGE_SEG32:
+ size = sizeof (range_seg32_t);
+ compare = range_tree_seg32_compare;
+ break;
+ case RANGE_SEG64:
+ size = sizeof (range_seg64_t);
+ compare = range_tree_seg64_compare;
+ break;
+ case RANGE_SEG_GAP:
+ size = sizeof (range_seg_gap_t);
+ compare = range_tree_seg_gap_compare;
+ break;
+ default:
+ panic("Invalid range seg type %d", type);
+ }
+ zfs_btree_create(&rt->rt_root, compare, size);
+
+ rt->rt_ops = ops;
+ rt->rt_gap = gap;
+ rt->rt_arg = arg;
+ rt->rt_type = type;
+ rt->rt_start = start;
+ rt->rt_shift = shift;
+ rt->rt_btree_compare = zfs_btree_compare;
+
+ if (rt->rt_ops != NULL && rt->rt_ops->rtop_create != NULL)
+ rt->rt_ops->rtop_create(rt, rt->rt_arg);
+
+ return (rt);
+}
+
+range_tree_t *
+range_tree_create(range_tree_ops_t *ops, range_seg_type_t type,
+ void *arg, uint64_t start, uint64_t shift)
+{
+ return (range_tree_create_impl(ops, type, arg, start, shift, NULL, 0));
+}
+
+void
+range_tree_destroy(range_tree_t *rt)
+{
+ VERIFY0(rt->rt_space);
+
+ if (rt->rt_ops != NULL && rt->rt_ops->rtop_destroy != NULL)
+ rt->rt_ops->rtop_destroy(rt, rt->rt_arg);
+
+ zfs_btree_destroy(&rt->rt_root);
+ kmem_free(rt, sizeof (*rt));
+}
+
+void
+range_tree_adjust_fill(range_tree_t *rt, range_seg_t *rs, int64_t delta)
+{
+ if (delta < 0 && delta * -1 >= rs_get_fill(rs, rt)) {
+ zfs_panic_recover("zfs: attempting to decrease fill to or "
+ "below 0; probable double remove in segment [%llx:%llx]",
+ (longlong_t)rs_get_start(rs, rt),
+ (longlong_t)rs_get_end(rs, rt));
+ }
+ if (rs_get_fill(rs, rt) + delta > rs_get_end(rs, rt) -
+ rs_get_start(rs, rt)) {
+ zfs_panic_recover("zfs: attempting to increase fill beyond "
+ "max; probable double add in segment [%llx:%llx]",
+ (longlong_t)rs_get_start(rs, rt),
+ (longlong_t)rs_get_end(rs, rt));
+ }
+
+ if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
+ rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);
+ rs_set_fill(rs, rt, rs_get_fill(rs, rt) + delta);
+ if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
+ rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);
+}
+
+static void
+range_tree_add_impl(void *arg, uint64_t start, uint64_t size, uint64_t fill)
+{
+ range_tree_t *rt = arg;
+ zfs_btree_index_t where;
+ range_seg_t *rs_before, *rs_after, *rs;
+ range_seg_max_t tmp, rsearch;
+ uint64_t end = start + size, gap = rt->rt_gap;
+ uint64_t bridge_size = 0;
+ boolean_t merge_before, merge_after;
+
+ ASSERT3U(size, !=, 0);
+ ASSERT3U(fill, <=, size);
+ ASSERT3U(start + size, >, start);
+
+ rs_set_start(&rsearch, rt, start);
+ rs_set_end(&rsearch, rt, end);
+ rs = zfs_btree_find(&rt->rt_root, &rsearch, &where);
+
+ /*
+ * If this is a gap-supporting range tree, it is possible that we
+ * are inserting into an existing segment. In this case simply
+ * bump the fill count and call the remove / add callbacks. If the
+ * new range will extend an existing segment, we remove the
+ * existing one, apply the new extent to it and re-insert it using
+ * the normal code paths.
+ */
+ if (rs != NULL) {
+ if (gap == 0) {
+ zfs_panic_recover("zfs: adding existent segment to "
+ "range tree (offset=%llx size=%llx)",
+ (longlong_t)start, (longlong_t)size);
+ return;
+ }
+ uint64_t rstart = rs_get_start(rs, rt);
+ uint64_t rend = rs_get_end(rs, rt);
+ if (rstart <= start && rend >= end) {
+ range_tree_adjust_fill(rt, rs, fill);
+ return;
+ }
+
+ if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
+ rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);
+
+ range_tree_stat_decr(rt, rs);
+ rt->rt_space -= rend - rstart;
+
+ fill += rs_get_fill(rs, rt);
+ start = MIN(start, rstart);
+ end = MAX(end, rend);
+ size = end - start;
+
+ zfs_btree_remove(&rt->rt_root, rs);
+ range_tree_add_impl(rt, start, size, fill);
+ return;
+ }
+
+ ASSERT3P(rs, ==, NULL);
+
+ /*
+ * Determine whether or not we will have to merge with our neighbors.
+ * If gap != 0, we might need to merge with our neighbors even if we
+ * aren't directly touching.
+ */
+ zfs_btree_index_t where_before, where_after;
+ rs_before = zfs_btree_prev(&rt->rt_root, &where, &where_before);
+ rs_after = zfs_btree_next(&rt->rt_root, &where, &where_after);
+
+ merge_before = (rs_before != NULL && rs_get_end(rs_before, rt) >=
+ start - gap);
+ merge_after = (rs_after != NULL && rs_get_start(rs_after, rt) <= end +
+ gap);
+
+ if (merge_before && gap != 0)
+ bridge_size += start - rs_get_end(rs_before, rt);
+ if (merge_after && gap != 0)
+ bridge_size += rs_get_start(rs_after, rt) - end;
+
+ if (merge_before && merge_after) {
+ if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) {
+ rt->rt_ops->rtop_remove(rt, rs_before, rt->rt_arg);
+ rt->rt_ops->rtop_remove(rt, rs_after, rt->rt_arg);
+ }
+
+ range_tree_stat_decr(rt, rs_before);
+ range_tree_stat_decr(rt, rs_after);
+
+ rs_copy(rs_after, &tmp, rt);
+ uint64_t before_start = rs_get_start_raw(rs_before, rt);
+ uint64_t before_fill = rs_get_fill(rs_before, rt);
+ uint64_t after_fill = rs_get_fill(rs_after, rt);
+ zfs_btree_remove_idx(&rt->rt_root, &where_before);
+
+ /*
+ * We have to re-find the node because our old reference is
+ * invalid as soon as we do any mutating btree operations.
+ */
+ rs_after = zfs_btree_find(&rt->rt_root, &tmp, &where_after);
+ rs_set_start_raw(rs_after, rt, before_start);
+ rs_set_fill(rs_after, rt, after_fill + before_fill + fill);
+ rs = rs_after;
+ } else if (merge_before) {
+ if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
+ rt->rt_ops->rtop_remove(rt, rs_before, rt->rt_arg);
+
+ range_tree_stat_decr(rt, rs_before);
+
+ uint64_t before_fill = rs_get_fill(rs_before, rt);
+ rs_set_end(rs_before, rt, end);
+ rs_set_fill(rs_before, rt, before_fill + fill);
+ rs = rs_before;
+ } else if (merge_after) {
+ if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
+ rt->rt_ops->rtop_remove(rt, rs_after, rt->rt_arg);
+
+ range_tree_stat_decr(rt, rs_after);
+
+ uint64_t after_fill = rs_get_fill(rs_after, rt);
+ rs_set_start(rs_after, rt, start);
+ rs_set_fill(rs_after, rt, after_fill + fill);
+ rs = rs_after;
+ } else {
+ rs = &tmp;
+
+ rs_set_start(rs, rt, start);
+ rs_set_end(rs, rt, end);
+ rs_set_fill(rs, rt, fill);
+ zfs_btree_add_idx(&rt->rt_root, rs, &where);
+ }
+
+ if (gap != 0) {
+ ASSERT3U(rs_get_fill(rs, rt), <=, rs_get_end(rs, rt) -
+ rs_get_start(rs, rt));
+ } else {
+ ASSERT3U(rs_get_fill(rs, rt), ==, rs_get_end(rs, rt) -
+ rs_get_start(rs, rt));
+ }
+
+ if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
+ rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);
+
+ range_tree_stat_incr(rt, rs);
+ rt->rt_space += size + bridge_size;
+}
+
+void
+range_tree_add(void *arg, uint64_t start, uint64_t size)
+{
+ range_tree_add_impl(arg, start, size, size);
+}
+
+static void
+range_tree_remove_impl(range_tree_t *rt, uint64_t start, uint64_t size,
+ boolean_t do_fill)
+{
+ zfs_btree_index_t where;
+ range_seg_t *rs;
+ range_seg_max_t rsearch, rs_tmp;
+ uint64_t end = start + size;
+ boolean_t left_over, right_over;
+
+ VERIFY3U(size, !=, 0);
+ VERIFY3U(size, <=, rt->rt_space);
+ if (rt->rt_type == RANGE_SEG64)
+ ASSERT3U(start + size, >, start);
+
+ rs_set_start(&rsearch, rt, start);
+ rs_set_end(&rsearch, rt, end);
+ rs = zfs_btree_find(&rt->rt_root, &rsearch, &where);
+
+ /* Make sure we completely overlap with someone */
+ if (rs == NULL) {
+ zfs_panic_recover("zfs: removing nonexistent segment from "
+ "range tree (offset=%llx size=%llx)",
+ (longlong_t)start, (longlong_t)size);
+ return;
+ }
+
+ /*
+ * Range trees with gap support must only remove complete segments
+ * from the tree. This allows us to maintain accurate fill accounting
+ * and to ensure that bridged sections are not leaked. If we need to
+ * remove less than the full segment, we can only adjust the fill count.
+ */
+ if (rt->rt_gap != 0) {
+ if (do_fill) {
+ if (rs_get_fill(rs, rt) == size) {
+ start = rs_get_start(rs, rt);
+ end = rs_get_end(rs, rt);
+ size = end - start;
+ } else {
+ range_tree_adjust_fill(rt, rs, -size);
+ return;
+ }
+ } else if (rs_get_start(rs, rt) != start ||
+ rs_get_end(rs, rt) != end) {
+ zfs_panic_recover("zfs: freeing partial segment of "
+ "gap tree (offset=%llx size=%llx) of "
+ "(offset=%llx size=%llx)",
+ (longlong_t)start, (longlong_t)size,
+ (longlong_t)rs_get_start(rs, rt),
+ (longlong_t)rs_get_end(rs, rt) - rs_get_start(rs,
+ rt));
+ return;
+ }
+ }
+
+ VERIFY3U(rs_get_start(rs, rt), <=, start);
+ VERIFY3U(rs_get_end(rs, rt), >=, end);
+
+ left_over = (rs_get_start(rs, rt) != start);
+ right_over = (rs_get_end(rs, rt) != end);
+
+ range_tree_stat_decr(rt, rs);
+
+ if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
+ rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);
+
+ if (left_over && right_over) {
+ range_seg_max_t newseg;
+ rs_set_start(&newseg, rt, end);
+ rs_set_end_raw(&newseg, rt, rs_get_end_raw(rs, rt));
+ rs_set_fill(&newseg, rt, rs_get_end(rs, rt) - end);
+ range_tree_stat_incr(rt, &newseg);
+
+ /* This modifies the buffer already inside the range tree */
+ rs_set_end(rs, rt, start);
+
+ rs_copy(rs, &rs_tmp, rt);
+ if (zfs_btree_next(&rt->rt_root, &where, &where) != NULL)
+ zfs_btree_add_idx(&rt->rt_root, &newseg, &where);
+ else
+ zfs_btree_add(&rt->rt_root, &newseg);
+
+ if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
+ rt->rt_ops->rtop_add(rt, &newseg, rt->rt_arg);
+ } else if (left_over) {
+ /* This modifies the buffer already inside the range tree */
+ rs_set_end(rs, rt, start);
+ rs_copy(rs, &rs_tmp, rt);
+ } else if (right_over) {
+ /* This modifies the buffer already inside the range tree */
+ rs_set_start(rs, rt, end);
+ rs_copy(rs, &rs_tmp, rt);
+ } else {
+ zfs_btree_remove_idx(&rt->rt_root, &where);
+ rs = NULL;
+ }
+
+ if (rs != NULL) {
+ /*
+ * The fill of the leftover segment will always be equal to
+ * the size, since we do not support removing partial segments
+ * of range trees with gaps.
+ */
+ rs_set_fill_raw(rs, rt, rs_get_end_raw(rs, rt) -
+ rs_get_start_raw(rs, rt));
+ range_tree_stat_incr(rt, &rs_tmp);
+
+ if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
+ rt->rt_ops->rtop_add(rt, &rs_tmp, rt->rt_arg);
+ }
+
+ rt->rt_space -= size;
+}
+
+void
+range_tree_remove(void *arg, uint64_t start, uint64_t size)
+{
+ range_tree_remove_impl(arg, start, size, B_FALSE);
+}
+
+void
+range_tree_remove_fill(range_tree_t *rt, uint64_t start, uint64_t size)
+{
+ range_tree_remove_impl(rt, start, size, B_TRUE);
+}
+
+void
+range_tree_resize_segment(range_tree_t *rt, range_seg_t *rs,
+ uint64_t newstart, uint64_t newsize)
+{
+ int64_t delta = newsize - (rs_get_end(rs, rt) - rs_get_start(rs, rt));
+
+ range_tree_stat_decr(rt, rs);
+ if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
+ rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);
+
+ rs_set_start(rs, rt, newstart);
+ rs_set_end(rs, rt, newstart + newsize);
+
+ range_tree_stat_incr(rt, rs);
+ if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
+ rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);
+
+ rt->rt_space += delta;
+}
+
+static range_seg_t *
+range_tree_find_impl(range_tree_t *rt, uint64_t start, uint64_t size)
+{
+ range_seg_max_t rsearch;
+ uint64_t end = start + size;
+
+ VERIFY(size != 0);
+
+ rs_set_start(&rsearch, rt, start);
+ rs_set_end(&rsearch, rt, end);
+ return (zfs_btree_find(&rt->rt_root, &rsearch, NULL));
+}
+
+range_seg_t *
+range_tree_find(range_tree_t *rt, uint64_t start, uint64_t size)
+{
+ if (rt->rt_type == RANGE_SEG64)
+ ASSERT3U(start + size, >, start);
+
+ range_seg_t *rs = range_tree_find_impl(rt, start, size);
+ if (rs != NULL && rs_get_start(rs, rt) <= start &&
+ rs_get_end(rs, rt) >= start + size) {
+ return (rs);
+ }
+ return (NULL);
+}
+
+void
+range_tree_verify_not_present(range_tree_t *rt, uint64_t off, uint64_t size)
+{
+ range_seg_t *rs = range_tree_find(rt, off, size);
+ if (rs != NULL)
+ panic("segment already in tree; rs=%p", (void *)rs);
+}
+
+boolean_t
+range_tree_contains(range_tree_t *rt, uint64_t start, uint64_t size)
+{
+ return (range_tree_find(rt, start, size) != NULL);
+}
+
+/*
+ * Returns the first subset of the given range which overlaps with the range
+ * tree. Returns true if there is a segment in the range, and false if there
+ * isn't.
+ */
+boolean_t
+range_tree_find_in(range_tree_t *rt, uint64_t start, uint64_t size,
+ uint64_t *ostart, uint64_t *osize)
+{
+ if (rt->rt_type == RANGE_SEG64)
+ ASSERT3U(start + size, >, start);
+
+ range_seg_max_t rsearch;
+ rs_set_start(&rsearch, rt, start);
+ rs_set_end_raw(&rsearch, rt, rs_get_start_raw(&rsearch, rt) + 1);
+
+ zfs_btree_index_t where;
+ range_seg_t *rs = zfs_btree_find(&rt->rt_root, &rsearch, &where);
+ if (rs != NULL) {
+ *ostart = start;
+ *osize = MIN(size, rs_get_end(rs, rt) - start);
+ return (B_TRUE);
+ }
+
+ rs = zfs_btree_next(&rt->rt_root, &where, &where);
+ if (rs == NULL || rs_get_start(rs, rt) > start + size)
+ return (B_FALSE);
+
+ *ostart = rs_get_start(rs, rt);
+ *osize = MIN(start + size, rs_get_end(rs, rt)) -
+ rs_get_start(rs, rt);
+ return (B_TRUE);
+}
+
+/*
+ * Ensure that this range is not in the tree: remove any portions of it
+ * that are currently present. It is safe to call this whether or not the
+ * range, or any part of it, is in the tree.
+ */
+void
+range_tree_clear(range_tree_t *rt, uint64_t start, uint64_t size)
+{
+ range_seg_t *rs;
+
+ if (size == 0)
+ return;
+
+ if (rt->rt_type == RANGE_SEG64)
+ ASSERT3U(start + size, >, start);
+
+ while ((rs = range_tree_find_impl(rt, start, size)) != NULL) {
+ uint64_t free_start = MAX(rs_get_start(rs, rt), start);
+ uint64_t free_end = MIN(rs_get_end(rs, rt), start + size);
+ range_tree_remove(rt, free_start, free_end - free_start);
+ }
+}
+
+void
+range_tree_swap(range_tree_t **rtsrc, range_tree_t **rtdst)
+{
+ range_tree_t *rt;
+
+ ASSERT0(range_tree_space(*rtdst));
+ ASSERT0(zfs_btree_numnodes(&(*rtdst)->rt_root));
+
+ rt = *rtsrc;
+ *rtsrc = *rtdst;
+ *rtdst = rt;
+}
+
+void
+range_tree_vacate(range_tree_t *rt, range_tree_func_t *func, void *arg)
+{
+ if (rt->rt_ops != NULL && rt->rt_ops->rtop_vacate != NULL)
+ rt->rt_ops->rtop_vacate(rt, rt->rt_arg);
+
+ if (func != NULL) {
+ range_seg_t *rs;
+ zfs_btree_index_t *cookie = NULL;
+
+ while ((rs = zfs_btree_destroy_nodes(&rt->rt_root, &cookie)) !=
+ NULL) {
+ func(arg, rs_get_start(rs, rt), rs_get_end(rs, rt) -
+ rs_get_start(rs, rt));
+ }
+ } else {
+ zfs_btree_clear(&rt->rt_root);
+ }
+
+ bzero(rt->rt_histogram, sizeof (rt->rt_histogram));
+ rt->rt_space = 0;
+}
+
+void
+range_tree_walk(range_tree_t *rt, range_tree_func_t *func, void *arg)
+{
+ zfs_btree_index_t where;
+ for (range_seg_t *rs = zfs_btree_first(&rt->rt_root, &where);
+ rs != NULL; rs = zfs_btree_next(&rt->rt_root, &where, &where)) {
+ func(arg, rs_get_start(rs, rt), rs_get_end(rs, rt) -
+ rs_get_start(rs, rt));
+ }
+}
+
+range_seg_t *
+range_tree_first(range_tree_t *rt)
+{
+ return (zfs_btree_first(&rt->rt_root, NULL));
+}
+
+uint64_t
+range_tree_space(range_tree_t *rt)
+{
+ return (rt->rt_space);
+}
+
+uint64_t
+range_tree_numsegs(range_tree_t *rt)
+{
+ return ((rt == NULL) ? 0 : zfs_btree_numnodes(&rt->rt_root));
+}
+
+boolean_t
+range_tree_is_empty(range_tree_t *rt)
+{
+ ASSERT(rt != NULL);
+ return (range_tree_space(rt) == 0);
+}
+
+/* ARGSUSED */
+void
+rt_btree_create(range_tree_t *rt, void *arg)
+{
+ zfs_btree_t *size_tree = arg;
+
+ size_t size;
+ switch (rt->rt_type) {
+ case RANGE_SEG32:
+ size = sizeof (range_seg32_t);
+ break;
+ case RANGE_SEG64:
+ size = sizeof (range_seg64_t);
+ break;
+ case RANGE_SEG_GAP:
+ size = sizeof (range_seg_gap_t);
+ break;
+ default:
+ panic("Invalid range seg type %d", rt->rt_type);
+ }
+ zfs_btree_create(size_tree, rt->rt_btree_compare, size);
+}
+
+/* ARGSUSED */
+void
+rt_btree_destroy(range_tree_t *rt, void *arg)
+{
+ zfs_btree_t *size_tree = arg;
+ ASSERT0(zfs_btree_numnodes(size_tree));
+
+ zfs_btree_destroy(size_tree);
+}
+
+/* ARGSUSED */
+void
+rt_btree_add(range_tree_t *rt, range_seg_t *rs, void *arg)
+{
+ zfs_btree_t *size_tree = arg;
+
+ zfs_btree_add(size_tree, rs);
+}
+
+/* ARGSUSED */
+void
+rt_btree_remove(range_tree_t *rt, range_seg_t *rs, void *arg)
+{
+ zfs_btree_t *size_tree = arg;
+
+ zfs_btree_remove(size_tree, rs);
+}
+
+/* ARGSUSED */
+void
+rt_btree_vacate(range_tree_t *rt, void *arg)
+{
+ zfs_btree_t *size_tree = arg;
+ zfs_btree_clear(size_tree);
+ zfs_btree_destroy(size_tree);
+
+ rt_btree_create(rt, arg);
+}
+
+range_tree_ops_t rt_btree_ops = {
+ .rtop_create = rt_btree_create,
+ .rtop_destroy = rt_btree_destroy,
+ .rtop_add = rt_btree_add,
+ .rtop_remove = rt_btree_remove,
+ .rtop_vacate = rt_btree_vacate
+};
+
+/*
+ * Remove from removefrom any ranges that overlap the given segment
+ * [start, end). Add the non-overlapping leftovers of that segment to addto.
+ */
+void
+range_tree_remove_xor_add_segment(uint64_t start, uint64_t end,
+ range_tree_t *removefrom, range_tree_t *addto)
+{
+ zfs_btree_index_t where;
+ range_seg_max_t starting_rs;
+ rs_set_start(&starting_rs, removefrom, start);
+ rs_set_end_raw(&starting_rs, removefrom, rs_get_start_raw(&starting_rs,
+ removefrom) + 1);
+
+ range_seg_t *curr = zfs_btree_find(&removefrom->rt_root,
+ &starting_rs, &where);
+
+ if (curr == NULL)
+ curr = zfs_btree_next(&removefrom->rt_root, &where, &where);
+
+ range_seg_t *next;
+ for (; curr != NULL; curr = next) {
+ if (start == end)
+ return;
+ VERIFY3U(start, <, end);
+
+ /* there is no overlap */
+ if (end <= rs_get_start(curr, removefrom)) {
+ range_tree_add(addto, start, end - start);
+ return;
+ }
+
+ uint64_t overlap_start = MAX(rs_get_start(curr, removefrom),
+ start);
+ uint64_t overlap_end = MIN(rs_get_end(curr, removefrom),
+ end);
+ uint64_t overlap_size = overlap_end - overlap_start;
+ ASSERT3S(overlap_size, >, 0);
+ range_seg_max_t rs;
+ rs_copy(curr, &rs, removefrom);
+
+ range_tree_remove(removefrom, overlap_start, overlap_size);
+
+ if (start < overlap_start)
+ range_tree_add(addto, start, overlap_start - start);
+
+ start = overlap_end;
+ next = zfs_btree_find(&removefrom->rt_root, &rs, &where);
+ /*
+ * If we find something here, we only removed part of the
+ * curr segment. Either there's some left at the end
+ * because we've reached the end of the range we're removing,
+ * or there's some left at the start because we started
+ * partway through the range. Either way, we continue with
+ * the loop. If it's the former, we'll return at the start of
+ * the loop, and if it's the latter we'll see if there is more
+ * area to process.
+ */
+ if (next != NULL) {
+ ASSERT(start == end || start == rs_get_end(&rs,
+ removefrom));
+ }
+
+ next = zfs_btree_next(&removefrom->rt_root, &where, &where);
+ }
+ VERIFY3P(curr, ==, NULL);
+
+ if (start != end) {
+ VERIFY3U(start, <, end);
+ range_tree_add(addto, start, end - start);
+ } else {
+ VERIFY3U(start, ==, end);
+ }
+}
+
+/*
+ * For each entry in rt, if it exists in removefrom, remove it
+ * from removefrom. Otherwise, add it to addto.
+ */
+void
+range_tree_remove_xor_add(range_tree_t *rt, range_tree_t *removefrom,
+ range_tree_t *addto)
+{
+ zfs_btree_index_t where;
+ for (range_seg_t *rs = zfs_btree_first(&rt->rt_root, &where); rs;
+ rs = zfs_btree_next(&rt->rt_root, &where, &where)) {
+ range_tree_remove_xor_add_segment(rs_get_start(rs, rt),
+ rs_get_end(rs, rt), removefrom, addto);
+ }
+}
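+
+/*
+ * Illustrative example: with rt = {[0, 10)}, removefrom = {[2, 4)} and an
+ * initially empty addto, range_tree_remove_xor_add(rt, removefrom, addto)
+ * removes the overlapping [2, 4) from removefrom (leaving it empty) and
+ * adds the non-overlapping leftovers [0, 2) and [4, 10) to addto.
+ */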
+
+uint64_t
+range_tree_min(range_tree_t *rt)
+{
+ range_seg_t *rs = zfs_btree_first(&rt->rt_root, NULL);
+ return (rs != NULL ? rs_get_start(rs, rt) : 0);
+}
+
+uint64_t
+range_tree_max(range_tree_t *rt)
+{
+ range_seg_t *rs = zfs_btree_last(&rt->rt_root, NULL);
+ return (rs != NULL ? rs_get_end(rs, rt) : 0);
+}
+
+uint64_t
+range_tree_span(range_tree_t *rt)
+{
+ return (range_tree_max(rt) - range_tree_min(rt));
+}
diff --git a/sys/contrib/openzfs/module/zfs/refcount.c b/sys/contrib/openzfs/module/zfs/refcount.c
new file mode 100644
index 000000000000..39476261edfb
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/refcount.c
@@ -0,0 +1,327 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/zfs_refcount.h>
+
+/*
+ * Reference count tracking is disabled by default. Its memory requirements
+ * are reasonable; however, as implemented it consumes a significant amount of
+ * CPU time. Until its performance is improved it should be manually enabled.
+ */
+int reference_tracking_enable = FALSE;
+int reference_history = 3; /* tunable */
+
+#ifdef ZFS_DEBUG
+static kmem_cache_t *reference_cache;
+static kmem_cache_t *reference_history_cache;
+
+void
+zfs_refcount_init(void)
+{
+ reference_cache = kmem_cache_create("reference_cache",
+ sizeof (reference_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
+
+ reference_history_cache = kmem_cache_create("reference_history_cache",
+ sizeof (uint64_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
+}
+
+void
+zfs_refcount_fini(void)
+{
+ kmem_cache_destroy(reference_cache);
+ kmem_cache_destroy(reference_history_cache);
+}
+
+void
+zfs_refcount_create(zfs_refcount_t *rc)
+{
+ mutex_init(&rc->rc_mtx, NULL, MUTEX_DEFAULT, NULL);
+ list_create(&rc->rc_list, sizeof (reference_t),
+ offsetof(reference_t, ref_link));
+ list_create(&rc->rc_removed, sizeof (reference_t),
+ offsetof(reference_t, ref_link));
+ rc->rc_count = 0;
+ rc->rc_removed_count = 0;
+ rc->rc_tracked = reference_tracking_enable;
+}
+
+void
+zfs_refcount_create_tracked(zfs_refcount_t *rc)
+{
+ zfs_refcount_create(rc);
+ rc->rc_tracked = B_TRUE;
+}
+
+void
+zfs_refcount_create_untracked(zfs_refcount_t *rc)
+{
+ zfs_refcount_create(rc);
+ rc->rc_tracked = B_FALSE;
+}
+
+void
+zfs_refcount_destroy_many(zfs_refcount_t *rc, uint64_t number)
+{
+ reference_t *ref;
+
+ ASSERT3U(rc->rc_count, ==, number);
+ while ((ref = list_head(&rc->rc_list))) {
+ list_remove(&rc->rc_list, ref);
+ kmem_cache_free(reference_cache, ref);
+ }
+ list_destroy(&rc->rc_list);
+
+ while ((ref = list_head(&rc->rc_removed))) {
+ list_remove(&rc->rc_removed, ref);
+ kmem_cache_free(reference_history_cache, ref->ref_removed);
+ kmem_cache_free(reference_cache, ref);
+ }
+ list_destroy(&rc->rc_removed);
+ mutex_destroy(&rc->rc_mtx);
+}
+
+void
+zfs_refcount_destroy(zfs_refcount_t *rc)
+{
+ zfs_refcount_destroy_many(rc, 0);
+}
+
+int
+zfs_refcount_is_zero(zfs_refcount_t *rc)
+{
+ return (rc->rc_count == 0);
+}
+
+int64_t
+zfs_refcount_count(zfs_refcount_t *rc)
+{
+ return (rc->rc_count);
+}
+
+int64_t
+zfs_refcount_add_many(zfs_refcount_t *rc, uint64_t number, const void *holder)
+{
+ reference_t *ref = NULL;
+ int64_t count;
+
+ if (rc->rc_tracked) {
+ ref = kmem_cache_alloc(reference_cache, KM_SLEEP);
+ ref->ref_holder = holder;
+ ref->ref_number = number;
+ }
+ mutex_enter(&rc->rc_mtx);
+ ASSERT3U(rc->rc_count, >=, 0);
+ if (rc->rc_tracked)
+ list_insert_head(&rc->rc_list, ref);
+ rc->rc_count += number;
+ count = rc->rc_count;
+ mutex_exit(&rc->rc_mtx);
+
+ return (count);
+}
+
+int64_t
+zfs_refcount_add(zfs_refcount_t *rc, const void *holder)
+{
+ return (zfs_refcount_add_many(rc, 1, holder));
+}
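+
+/*
+ * Illustrative sketch (hypothetical usage, effective in ZFS_DEBUG builds
+ * with tracking enabled): a typical tracked-holder lifecycle, where the
+ * holder tag is usually the address of the structure taking the reference,
+ * or FTAG:
+ *
+ *	zfs_refcount_t rc;
+ *
+ *	zfs_refcount_create_tracked(&rc);
+ *	(void) zfs_refcount_add(&rc, FTAG);
+ *	ASSERT(zfs_refcount_held(&rc, FTAG));
+ *	(void) zfs_refcount_remove(&rc, FTAG);
+ *	ASSERT(zfs_refcount_is_zero(&rc));
+ *	zfs_refcount_destroy(&rc);
+ */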
+
+int64_t
+zfs_refcount_remove_many(zfs_refcount_t *rc, uint64_t number,
+ const void *holder)
+{
+ reference_t *ref;
+ int64_t count;
+
+ mutex_enter(&rc->rc_mtx);
+ ASSERT3U(rc->rc_count, >=, number);
+
+ if (!rc->rc_tracked) {
+ rc->rc_count -= number;
+ count = rc->rc_count;
+ mutex_exit(&rc->rc_mtx);
+ return (count);
+ }
+
+ for (ref = list_head(&rc->rc_list); ref;
+ ref = list_next(&rc->rc_list, ref)) {
+ if (ref->ref_holder == holder && ref->ref_number == number) {
+ list_remove(&rc->rc_list, ref);
+ if (reference_history > 0) {
+ ref->ref_removed =
+ kmem_cache_alloc(reference_history_cache,
+ KM_SLEEP);
+ list_insert_head(&rc->rc_removed, ref);
+ rc->rc_removed_count++;
+ if (rc->rc_removed_count > reference_history) {
+ ref = list_tail(&rc->rc_removed);
+ list_remove(&rc->rc_removed, ref);
+ kmem_cache_free(reference_history_cache,
+ ref->ref_removed);
+ kmem_cache_free(reference_cache, ref);
+ rc->rc_removed_count--;
+ }
+ } else {
+ kmem_cache_free(reference_cache, ref);
+ }
+ rc->rc_count -= number;
+ count = rc->rc_count;
+ mutex_exit(&rc->rc_mtx);
+ return (count);
+ }
+ }
+ panic("No such hold %p on refcount %llx", holder,
+ (u_longlong_t)(uintptr_t)rc);
+ return (-1);
+}
+
+int64_t
+zfs_refcount_remove(zfs_refcount_t *rc, const void *holder)
+{
+ return (zfs_refcount_remove_many(rc, 1, holder));
+}
+
+void
+zfs_refcount_transfer(zfs_refcount_t *dst, zfs_refcount_t *src)
+{
+ int64_t count, removed_count;
+ list_t list, removed;
+
+ list_create(&list, sizeof (reference_t),
+ offsetof(reference_t, ref_link));
+ list_create(&removed, sizeof (reference_t),
+ offsetof(reference_t, ref_link));
+
+ mutex_enter(&src->rc_mtx);
+ count = src->rc_count;
+ removed_count = src->rc_removed_count;
+ src->rc_count = 0;
+ src->rc_removed_count = 0;
+ list_move_tail(&list, &src->rc_list);
+ list_move_tail(&removed, &src->rc_removed);
+ mutex_exit(&src->rc_mtx);
+
+ mutex_enter(&dst->rc_mtx);
+ dst->rc_count += count;
+ dst->rc_removed_count += removed_count;
+ list_move_tail(&dst->rc_list, &list);
+ list_move_tail(&dst->rc_removed, &removed);
+ mutex_exit(&dst->rc_mtx);
+
+ list_destroy(&list);
+ list_destroy(&removed);
+}
+
+void
+zfs_refcount_transfer_ownership_many(zfs_refcount_t *rc, uint64_t number,
+ const void *current_holder, const void *new_holder)
+{
+ reference_t *ref;
+ boolean_t found = B_FALSE;
+
+ mutex_enter(&rc->rc_mtx);
+ if (!rc->rc_tracked) {
+ mutex_exit(&rc->rc_mtx);
+ return;
+ }
+
+ for (ref = list_head(&rc->rc_list); ref;
+ ref = list_next(&rc->rc_list, ref)) {
+ if (ref->ref_holder == current_holder &&
+ ref->ref_number == number) {
+ ref->ref_holder = new_holder;
+ found = B_TRUE;
+ break;
+ }
+ }
+ ASSERT(found);
+ mutex_exit(&rc->rc_mtx);
+}
+
+void
+zfs_refcount_transfer_ownership(zfs_refcount_t *rc, const void *current_holder,
+ const void *new_holder)
+{
+ return (zfs_refcount_transfer_ownership_many(rc, 1, current_holder,
+ new_holder));
+}
+
+/*
+ * If tracking is enabled, return true if a reference exists that matches
+ * the "holder" tag. If tracking is disabled, then return true if a reference
+ * might be held.
+ */
+boolean_t
+zfs_refcount_held(zfs_refcount_t *rc, const void *holder)
+{
+ reference_t *ref;
+
+ mutex_enter(&rc->rc_mtx);
+
+ if (!rc->rc_tracked) {
+ mutex_exit(&rc->rc_mtx);
+ return (rc->rc_count > 0);
+ }
+
+ for (ref = list_head(&rc->rc_list); ref;
+ ref = list_next(&rc->rc_list, ref)) {
+ if (ref->ref_holder == holder) {
+ mutex_exit(&rc->rc_mtx);
+ return (B_TRUE);
+ }
+ }
+ mutex_exit(&rc->rc_mtx);
+ return (B_FALSE);
+}
+
+/*
+ * If tracking is enabled, return true if a reference does not exist that
+ * matches the "holder" tag. If tracking is disabled, always return true
+ * since the reference might not be held.
+ */
+boolean_t
+zfs_refcount_not_held(zfs_refcount_t *rc, const void *holder)
+{
+ reference_t *ref;
+
+ mutex_enter(&rc->rc_mtx);
+
+ if (!rc->rc_tracked) {
+ mutex_exit(&rc->rc_mtx);
+ return (B_TRUE);
+ }
+
+ for (ref = list_head(&rc->rc_list); ref;
+ ref = list_next(&rc->rc_list, ref)) {
+ if (ref->ref_holder == holder) {
+ mutex_exit(&rc->rc_mtx);
+ return (B_FALSE);
+ }
+ }
+ mutex_exit(&rc->rc_mtx);
+ return (B_TRUE);
+}
+#endif /* ZFS_DEBUG */
diff --git a/sys/contrib/openzfs/module/zfs/rrwlock.c b/sys/contrib/openzfs/module/zfs/rrwlock.c
new file mode 100644
index 000000000000..d23fc3ad1067
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/rrwlock.c
@@ -0,0 +1,396 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * Copyright (c) 2012 by Delphix. All rights reserved.
+ */
+
+#include <sys/rrwlock.h>
+#include <sys/trace_zfs.h>
+
+/*
+ * This file contains the implementation of a re-entrant read
+ * reader/writer lock (aka "rrwlock").
+ *
+ * This is a normal reader/writer lock with the additional feature
+ * of allowing threads who have already obtained a read lock to
+ * re-enter another read lock (re-entrant read) - even if there are
+ * waiting writers.
+ *
+ * Callers who have not obtained a read lock give waiting writers priority.
+ *
+ * The rrwlock_t lock does not allow re-entrant writers, nor does it
+ * allow a re-entrant mix of reads and writes (that is, it does not
+ * allow a caller who has already obtained a read lock to be able to
+ * then grab a write lock without first dropping all read locks, and
+ * vice versa).
+ *
+ * The rrwlock_t uses tsd (thread specific data) to keep a list of
+ * nodes (rrw_node_t), where each node keeps track of which specific
+ * lock (rrw_node_t::rn_rrl) the thread has grabbed. Since re-entering
+ * should be rare, a thread that grabs multiple reads on the same rrwlock_t
+ * will store multiple rrw_node_ts of the same 'rn_rrl'. Nodes on the
+ * tsd list can represent a different rrwlock_t. This allows a thread
+ * to enter multiple and unique rrwlock_ts for read locks at the same time.
+ *
+ * Since using tsd exposes some overhead, the rrwlock_t only needs to
+ * keep tsd data when writers are waiting. If no writers are waiting, then
+ * a reader just bumps the anonymous read count (rr_anon_rcount) - no tsd
+ * is needed. Once a writer attempts to grab the lock, readers then
+ * keep tsd data and bump the linked readers count (rr_linked_rcount).
+ *
+ * If there are waiting writers and there are anonymous readers, then a
+ * reader doesn't know if it is a re-entrant lock. But since it may be one,
+ * we allow the read to proceed (otherwise it could deadlock). Since once
+ * waiting writers are active, readers no longer bump the anonymous count,
+ * the anonymous readers will eventually flush themselves out. At this point,
+ * readers will be able to tell if they are a re-entrant lock (have a
+ * rrw_node_t entry for the lock) or not. If they are a re-entrant lock, then
+ * we must let them proceed. If they are not, then the reader blocks for the
+ * waiting writers. Hence, we do not starve writers.
+ */
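+
+/*
+ * Illustrative sketch (hypothetical usage): a thread that already holds a
+ * read lock may re-enter it even while a writer is waiting, but it must
+ * drop all of its read holds before taking the write lock:
+ *
+ *	rrw_enter_read(&rrl, FTAG);
+ *	rrw_enter_read(&rrl, FTAG);	re-entrant; not blocked by waiters
+ *	rrw_exit(&rrl, FTAG);
+ *	rrw_exit(&rrl, FTAG);
+ *
+ *	rrw_enter_write(&rrl);		only once all read holds are gone
+ *	rrw_exit(&rrl, FTAG);
+ */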
+
+/* global key for TSD */
+uint_t rrw_tsd_key;
+
+typedef struct rrw_node {
+ struct rrw_node *rn_next;
+ rrwlock_t *rn_rrl;
+ void *rn_tag;
+} rrw_node_t;
+
+static rrw_node_t *
+rrn_find(rrwlock_t *rrl)
+{
+ rrw_node_t *rn;
+
+ if (zfs_refcount_count(&rrl->rr_linked_rcount) == 0)
+ return (NULL);
+
+ for (rn = tsd_get(rrw_tsd_key); rn != NULL; rn = rn->rn_next) {
+ if (rn->rn_rrl == rrl)
+ return (rn);
+ }
+ return (NULL);
+}
+
+/*
+ * Add a node to the head of the singly linked list.
+ */
+static void
+rrn_add(rrwlock_t *rrl, void *tag)
+{
+ rrw_node_t *rn;
+
+ rn = kmem_alloc(sizeof (*rn), KM_SLEEP);
+ rn->rn_rrl = rrl;
+ rn->rn_next = tsd_get(rrw_tsd_key);
+ rn->rn_tag = tag;
+ VERIFY(tsd_set(rrw_tsd_key, rn) == 0);
+}
+
+/*
+ * If a node is found for 'rrl', then remove the node from this
+ * thread's list and return TRUE; otherwise return FALSE.
+ */
+static boolean_t
+rrn_find_and_remove(rrwlock_t *rrl, void *tag)
+{
+ rrw_node_t *rn;
+ rrw_node_t *prev = NULL;
+
+ if (zfs_refcount_count(&rrl->rr_linked_rcount) == 0)
+ return (B_FALSE);
+
+ for (rn = tsd_get(rrw_tsd_key); rn != NULL; rn = rn->rn_next) {
+ if (rn->rn_rrl == rrl && rn->rn_tag == tag) {
+ if (prev)
+ prev->rn_next = rn->rn_next;
+ else
+ VERIFY(tsd_set(rrw_tsd_key, rn->rn_next) == 0);
+ kmem_free(rn, sizeof (*rn));
+ return (B_TRUE);
+ }
+ prev = rn;
+ }
+ return (B_FALSE);
+}
+
+void
+rrw_init(rrwlock_t *rrl, boolean_t track_all)
+{
+ mutex_init(&rrl->rr_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&rrl->rr_cv, NULL, CV_DEFAULT, NULL);
+ rrl->rr_writer = NULL;
+ zfs_refcount_create(&rrl->rr_anon_rcount);
+ zfs_refcount_create(&rrl->rr_linked_rcount);
+ rrl->rr_writer_wanted = B_FALSE;
+ rrl->rr_track_all = track_all;
+}
+
+void
+rrw_destroy(rrwlock_t *rrl)
+{
+ mutex_destroy(&rrl->rr_lock);
+ cv_destroy(&rrl->rr_cv);
+ ASSERT(rrl->rr_writer == NULL);
+ zfs_refcount_destroy(&rrl->rr_anon_rcount);
+ zfs_refcount_destroy(&rrl->rr_linked_rcount);
+}
+
+static void
+rrw_enter_read_impl(rrwlock_t *rrl, boolean_t prio, void *tag)
+{
+ mutex_enter(&rrl->rr_lock);
+#if !defined(ZFS_DEBUG) && defined(_KERNEL)
+ if (rrl->rr_writer == NULL && !rrl->rr_writer_wanted &&
+ !rrl->rr_track_all) {
+ rrl->rr_anon_rcount.rc_count++;
+ mutex_exit(&rrl->rr_lock);
+ return;
+ }
+ DTRACE_PROBE(zfs__rrwfastpath__rdmiss);
+#endif
+ ASSERT(rrl->rr_writer != curthread);
+ ASSERT(zfs_refcount_count(&rrl->rr_anon_rcount) >= 0);
+
+ while (rrl->rr_writer != NULL || (rrl->rr_writer_wanted &&
+ zfs_refcount_is_zero(&rrl->rr_anon_rcount) && !prio &&
+ rrn_find(rrl) == NULL))
+ cv_wait(&rrl->rr_cv, &rrl->rr_lock);
+
+ if (rrl->rr_writer_wanted || rrl->rr_track_all) {
+ /* may or may not be a re-entrant enter */
+ rrn_add(rrl, tag);
+ (void) zfs_refcount_add(&rrl->rr_linked_rcount, tag);
+ } else {
+ (void) zfs_refcount_add(&rrl->rr_anon_rcount, tag);
+ }
+ ASSERT(rrl->rr_writer == NULL);
+ mutex_exit(&rrl->rr_lock);
+}
+
+void
+rrw_enter_read(rrwlock_t *rrl, void *tag)
+{
+ rrw_enter_read_impl(rrl, B_FALSE, tag);
+}
+
+/*
+ * Take a read lock even if there are pending write lock requests. If we want
+ * to take a lock re-entrantly, but from different threads (that have a
+ * relationship to each other), the normal detection mechanism to overrule
+ * the pending writer does not work, so we have to give an explicit hint here.
+ */
+void
+rrw_enter_read_prio(rrwlock_t *rrl, void *tag)
+{
+ rrw_enter_read_impl(rrl, B_TRUE, tag);
+}
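+
+/*
+ * Illustrative sketch of the "prio" case above (hypothetical callers):
+ * thread A holds the lock for read and hands work to thread B, which must
+ * also read-lock it while a writer may already be waiting.  B has no
+ * rrw_node_t of its own, so it must ask for priority explicitly:
+ *
+ *	A: rrw_enter_read(&lock, FTAG);
+ *	A: dispatch work to B, then wait for B to finish
+ *	B: rrw_enter_read_prio(&lock, FTAG);	(plain rrw_enter_read()
+ *						 could block behind the
+ *						 waiting writer and deadlock)
+ *	B: rrw_exit(&lock, FTAG);
+ *	A: rrw_exit(&lock, FTAG);
+ */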
+
+
+void
+rrw_enter_write(rrwlock_t *rrl)
+{
+ mutex_enter(&rrl->rr_lock);
+ ASSERT(rrl->rr_writer != curthread);
+
+ while (zfs_refcount_count(&rrl->rr_anon_rcount) > 0 ||
+ zfs_refcount_count(&rrl->rr_linked_rcount) > 0 ||
+ rrl->rr_writer != NULL) {
+ rrl->rr_writer_wanted = B_TRUE;
+ cv_wait(&rrl->rr_cv, &rrl->rr_lock);
+ }
+ rrl->rr_writer_wanted = B_FALSE;
+ rrl->rr_writer = curthread;
+ mutex_exit(&rrl->rr_lock);
+}
+
+void
+rrw_enter(rrwlock_t *rrl, krw_t rw, void *tag)
+{
+ if (rw == RW_READER)
+ rrw_enter_read(rrl, tag);
+ else
+ rrw_enter_write(rrl);
+}
+
+void
+rrw_exit(rrwlock_t *rrl, void *tag)
+{
+ mutex_enter(&rrl->rr_lock);
+#if !defined(ZFS_DEBUG) && defined(_KERNEL)
+ if (!rrl->rr_writer && rrl->rr_linked_rcount.rc_count == 0) {
+ rrl->rr_anon_rcount.rc_count--;
+ if (rrl->rr_anon_rcount.rc_count == 0)
+ cv_broadcast(&rrl->rr_cv);
+ mutex_exit(&rrl->rr_lock);
+ return;
+ }
+ DTRACE_PROBE(zfs__rrwfastpath__exitmiss);
+#endif
+ ASSERT(!zfs_refcount_is_zero(&rrl->rr_anon_rcount) ||
+ !zfs_refcount_is_zero(&rrl->rr_linked_rcount) ||
+ rrl->rr_writer != NULL);
+
+ if (rrl->rr_writer == NULL) {
+ int64_t count;
+ if (rrn_find_and_remove(rrl, tag)) {
+ count = zfs_refcount_remove(
+ &rrl->rr_linked_rcount, tag);
+ } else {
+ ASSERT(!rrl->rr_track_all);
+ count = zfs_refcount_remove(&rrl->rr_anon_rcount, tag);
+ }
+ if (count == 0)
+ cv_broadcast(&rrl->rr_cv);
+ } else {
+ ASSERT(rrl->rr_writer == curthread);
+ ASSERT(zfs_refcount_is_zero(&rrl->rr_anon_rcount) &&
+ zfs_refcount_is_zero(&rrl->rr_linked_rcount));
+ rrl->rr_writer = NULL;
+ cv_broadcast(&rrl->rr_cv);
+ }
+ mutex_exit(&rrl->rr_lock);
+}
+
+/*
+ * If the lock was created with track_all, rrw_held(RW_READER) will return
+ * B_TRUE iff the current thread has the lock for reader. Otherwise it may
+ * return B_TRUE if any thread has the lock for reader.
+ */
+boolean_t
+rrw_held(rrwlock_t *rrl, krw_t rw)
+{
+ boolean_t held;
+
+ mutex_enter(&rrl->rr_lock);
+ if (rw == RW_WRITER) {
+ held = (rrl->rr_writer == curthread);
+ } else {
+ held = (!zfs_refcount_is_zero(&rrl->rr_anon_rcount) ||
+ rrn_find(rrl) != NULL);
+ }
+ mutex_exit(&rrl->rr_lock);
+
+ return (held);
+}
+
+void
+rrw_tsd_destroy(void *arg)
+{
+ rrw_node_t *rn = arg;
+ if (rn != NULL) {
+ panic("thread %p terminating with rrw lock %p held",
+ (void *)curthread, (void *)rn->rn_rrl);
+ }
+}
+
+/*
+ * A reader-mostly lock implementation, built on the reader/writer locks
+ * above and tuned for highly parallel read acquisitions, while pessimizing
+ * writes.
+ *
+ * The idea is to split a single busy lock into an array of locks, so that
+ * each reader can lock only one of them for read, depending on the result
+ * of a simple hash function. That proportionally reduces lock congestion.
+ * A writer at the same time has to sequentially acquire write on all the
+ * locks. That makes write acquisition proportionally slower, but in places
+ * where it is used (filesystem unmount) performance is not critical.
+ *
+ * All the functions below are direct wrappers around functions above.
+ */
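+
+/*
+ * Brief usage sketch (hypothetical caller): the rrmlock_t is used through
+ * the same enter/exit pattern as rrwlock_t, and the read/write asymmetry
+ * described above stays hidden behind the wrappers:
+ *
+ *	rrm_enter(&mlock, RW_READER, FTAG);	(locks one lock of the array)
+ *	...
+ *	rrm_exit(&mlock, FTAG);
+ *
+ *	rrm_enter(&mlock, RW_WRITER, FTAG);	(locks all locks of the array)
+ *	...
+ *	rrm_exit(&mlock, FTAG);
+ */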
+void
+rrm_init(rrmlock_t *rrl, boolean_t track_all)
+{
+ int i;
+
+ for (i = 0; i < RRM_NUM_LOCKS; i++)
+ rrw_init(&rrl->locks[i], track_all);
+}
+
+void
+rrm_destroy(rrmlock_t *rrl)
+{
+ int i;
+
+ for (i = 0; i < RRM_NUM_LOCKS; i++)
+ rrw_destroy(&rrl->locks[i]);
+}
+
+void
+rrm_enter(rrmlock_t *rrl, krw_t rw, void *tag)
+{
+ if (rw == RW_READER)
+ rrm_enter_read(rrl, tag);
+ else
+ rrm_enter_write(rrl);
+}
+
+/*
+ * This maps the current thread to a specific lock. Note that the lock
+ * must be released by the same thread that acquired it. We do this
+ * mapping by taking the thread pointer mod a prime number. We examine
+ * only the low 32 bits of the thread pointer, because 32-bit division
+ * is faster than 64-bit division, and the high 32 bits have little
+ * entropy anyway.
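+ *
+ * For example (illustrative numbers only): a thread pointer of
+ * 0xffffa1b2c3d4e5f0 contributes only its low 32 bits, 0xc3d4e5f0, and
+ * the thread is mapped to lock index 0xc3d4e5f0 % RRM_NUM_LOCKS.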
+ */
+#define RRM_TD_LOCK() (((uint32_t)(uintptr_t)(curthread)) % RRM_NUM_LOCKS)
+
+void
+rrm_enter_read(rrmlock_t *rrl, void *tag)
+{
+ rrw_enter_read(&rrl->locks[RRM_TD_LOCK()], tag);
+}
+
+void
+rrm_enter_write(rrmlock_t *rrl)
+{
+ int i;
+
+ for (i = 0; i < RRM_NUM_LOCKS; i++)
+ rrw_enter_write(&rrl->locks[i]);
+}
+
+void
+rrm_exit(rrmlock_t *rrl, void *tag)
+{
+ int i;
+
+ if (rrl->locks[0].rr_writer == curthread) {
+ for (i = 0; i < RRM_NUM_LOCKS; i++)
+ rrw_exit(&rrl->locks[i], tag);
+ } else {
+ rrw_exit(&rrl->locks[RRM_TD_LOCK()], tag);
+ }
+}
+
+boolean_t
+rrm_held(rrmlock_t *rrl, krw_t rw)
+{
+ if (rw == RW_WRITER) {
+ return (rrw_held(&rrl->locks[0], rw));
+ } else {
+ return (rrw_held(&rrl->locks[RRM_TD_LOCK()], rw));
+ }
+}
diff --git a/sys/contrib/openzfs/module/zfs/sa.c b/sys/contrib/openzfs/module/zfs/sa.c
new file mode 100644
index 000000000000..5af0aaa7d0aa
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/sa.c
@@ -0,0 +1,2257 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/sysmacros.h>
+#include <sys/dmu.h>
+#include <sys/dmu_impl.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_tx.h>
+#include <sys/dbuf.h>
+#include <sys/dnode.h>
+#include <sys/zap.h>
+#include <sys/sa.h>
+#include <sys/sunddi.h>
+#include <sys/sa_impl.h>
+#include <sys/errno.h>
+#include <sys/zfs_context.h>
+
+#ifdef _KERNEL
+#include <sys/zfs_znode.h>
+#endif
+
+/*
+ * ZFS System attributes:
+ *
+ * A generic mechanism to allow for arbitrary attributes
+ * to be stored in a dnode. The data will be stored in the bonus buffer of
+ * the dnode and if necessary a special "spill" block will be used to handle
+ * overflow situations. The spill block will be sized to fit the data
+ * from 512 - 128K. When a spill block is used the BP (blkptr_t) for the
+ * spill block is stored at the end of the current bonus buffer. Any
+ * attributes that would be in the way of the blkptr_t will be relocated
+ * into the spill block.
+ *
+ * Attribute registration:
+ *
+ * A mapping between attribute "string" names and their actual attribute
+ * numeric values, lengths, and byteswap functions is stored persistently
+ * on a per-dataset basis. The names are only used
+ * during registration. All attributes are known by their unique attribute
+ * id value. If an attribute can have a variable size then the value
+ * 0 will be used to indicate this.
+ *
+ * Attribute Layout:
+ *
+ * Attribute layouts are a way to compactly store multiple attributes, but
+ * without taking the overhead associated with managing each attribute
+ * individually. Since you will typically have the same set of attributes
+ * stored in the same order, a single table will be used to represent that
+ * layout. The ZPL, for example, will usually have only about 10 different
+ * layouts (regular files, device files, symlinks,
+ * regular files + scanstamp, files/dirs with extended attributes, and then
+ * any of those minus the ACL, because the ACL may be kicked out into the
+ * spill block).
+ *
+ * Layouts are simply an array of the attributes and their
+ * ordering i.e. [0, 1, 4, 5, 2]
+ *
+ * Each distinct layout is given a unique layout number and that is what's
+ * stored in the header at the beginning of the SA data buffer.
+ *
+ * A layout only covers a single dbuf (bonus or spill). If a set of
+ * attributes is split up between the bonus buffer and a spill buffer then
+ * two different layouts will be used. This allows us to byteswap the
+ * spill without looking at the bonus buffer and keeps the on disk format of
+ * the bonus and spill buffer the same.
+ *
+ * Adding a single attribute will cause the entire set of attributes to
+ * be rewritten and could result in a new layout number being constructed
+ * as part of the rewrite if no such layout exists for the new set of
+ * attributes. The new attribute will be appended to the end of the already
+ * existing attributes.
+ *
+ * Both the attribute registration and attribute layout information are
+ * stored in normal ZAP attributes. There should be a small number of
+ * known layouts, and the set of attributes is assumed to typically be quite
+ * small.
+ *
+ * The registered attributes and layout "table" information is maintained
+ * in core and a special "sa_os_t" is attached to the objset_t.
+ *
+ * A special interface is provided to allow for quickly applying
+ * a large set of attributes at once. sa_replace_all_by_template() is
+ * used to set an array of attributes. This is used by the ZPL when
+ * creating a brand new file. The template that is passed into the function
+ * specifies the attribute, size for variable length attributes, location of
+ * data and special "data locator" function if the data isn't in a contiguous
+ * location.
+ *
+ * Byteswap implications:
+ *
+ * Since the SA attributes are not entirely self-describing, we can't do
+ * the normal byteswap processing. The special ZAP layout attribute and
+ * attribute registration attributes define the byteswap function and the
+ * size of the attributes, unless it is variable sized.
+ * The normal ZFS byteswapping infrastructure assumes you don't need
+ * to read any objects in order to do the necessary byteswapping, whereas
+ * SA attributes can only be properly byteswapped if the dataset is opened
+ * and the layout/attribute ZAP attributes are available. Because of this
+ * the SA attributes will be byteswapped when they are first accessed by
+ * the SA code that will read the SA data.
+ */
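+
+/*
+ * A minimal consumer sketch (hypothetical caller; 'attr_table' stands for
+ * the attribute id table returned by sa_setup() and ZPL_SIZE for a ZPL
+ * attribute index):
+ *
+ *	sa_handle_t *hdl;
+ *	uint64_t size;
+ *
+ *	error = sa_handle_get(os, object, NULL, SA_HDL_PRIVATE, &hdl);
+ *	if (error == 0) {
+ *		error = sa_lookup(hdl, attr_table[ZPL_SIZE], &size,
+ *		    sizeof (size));
+ *		sa_handle_destroy(hdl);
+ *	}
+ */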
+
+typedef void (sa_iterfunc_t)(void *hdr, void *addr, sa_attr_type_t,
+ uint16_t length, int length_idx, boolean_t, void *userp);
+
+static int sa_build_index(sa_handle_t *hdl, sa_buf_type_t buftype);
+static void sa_idx_tab_hold(objset_t *os, sa_idx_tab_t *idx_tab);
+static sa_idx_tab_t *sa_find_idx_tab(objset_t *os, dmu_object_type_t bonustype,
+ sa_hdr_phys_t *hdr);
+static void sa_idx_tab_rele(objset_t *os, void *arg);
+static void sa_copy_data(sa_data_locator_t *func, void *start, void *target,
+ int buflen);
+static int sa_modify_attrs(sa_handle_t *hdl, sa_attr_type_t newattr,
+ sa_data_op_t action, sa_data_locator_t *locator, void *datastart,
+ uint16_t buflen, dmu_tx_t *tx);
+
+arc_byteswap_func_t sa_bswap_table[] = {
+ byteswap_uint64_array,
+ byteswap_uint32_array,
+ byteswap_uint16_array,
+ byteswap_uint8_array,
+ zfs_acl_byteswap,
+};
+
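+/*
+ * SA_COPY_DATA() copies attribute data, using the caller-supplied locator
+ * function when one is given and a plain copy otherwise.  On platforms
+ * with efficient unaligned access, the common 8- and 16-byte fixed-size
+ * cases are open-coded to avoid the bcopy() call.
+ */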
+#ifdef HAVE_EFFICIENT_UNALIGNED_ACCESS
+#define SA_COPY_DATA(f, s, t, l) \
+do { \
+ if (f == NULL) { \
+ if (l == 8) { \
+ *(uint64_t *)t = *(uint64_t *)s; \
+ } else if (l == 16) { \
+ *(uint64_t *)t = *(uint64_t *)s; \
+ *(uint64_t *)((uintptr_t)t + 8) = \
+ *(uint64_t *)((uintptr_t)s + 8); \
+ } else { \
+ bcopy(s, t, l); \
+ } \
+ } else { \
+ sa_copy_data(f, s, t, l); \
+ } \
+} while (0)
+#else
+#define SA_COPY_DATA(f, s, t, l) sa_copy_data(f, s, t, l)
+#endif
+
+/*
+ * This table is fixed and cannot be changed. Its purpose is to
+ * allow the SA code to work with both old/new ZPL file systems.
+ * It contains the list of legacy attributes. These attributes aren't
+ * stored in the "attribute" registry zap objects, since older ZPL file systems
+ * won't have the registry. Only objsets of type ZFS_TYPE_FILESYSTEM will
+ * use this static table.
+ */
+sa_attr_reg_t sa_legacy_attrs[] = {
+ {"ZPL_ATIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 0},
+ {"ZPL_MTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 1},
+ {"ZPL_CTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 2},
+ {"ZPL_CRTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 3},
+ {"ZPL_GEN", sizeof (uint64_t), SA_UINT64_ARRAY, 4},
+ {"ZPL_MODE", sizeof (uint64_t), SA_UINT64_ARRAY, 5},
+ {"ZPL_SIZE", sizeof (uint64_t), SA_UINT64_ARRAY, 6},
+ {"ZPL_PARENT", sizeof (uint64_t), SA_UINT64_ARRAY, 7},
+ {"ZPL_LINKS", sizeof (uint64_t), SA_UINT64_ARRAY, 8},
+ {"ZPL_XATTR", sizeof (uint64_t), SA_UINT64_ARRAY, 9},
+ {"ZPL_RDEV", sizeof (uint64_t), SA_UINT64_ARRAY, 10},
+ {"ZPL_FLAGS", sizeof (uint64_t), SA_UINT64_ARRAY, 11},
+ {"ZPL_UID", sizeof (uint64_t), SA_UINT64_ARRAY, 12},
+ {"ZPL_GID", sizeof (uint64_t), SA_UINT64_ARRAY, 13},
+ {"ZPL_PAD", sizeof (uint64_t) * 4, SA_UINT64_ARRAY, 14},
+ {"ZPL_ZNODE_ACL", 88, SA_UINT8_ARRAY, 15},
+};
+
+/*
+ * This is only used for objects of type DMU_OT_ZNODE
+ */
+sa_attr_type_t sa_legacy_zpl_layout[] = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+};
+
+/*
+ * Special dummy layout used for buffers with no attributes.
+ */
+sa_attr_type_t sa_dummy_zpl_layout[] = { 0 };
+
+static int sa_legacy_attr_count = ARRAY_SIZE(sa_legacy_attrs);
+static kmem_cache_t *sa_cache = NULL;
+
+/*ARGSUSED*/
+static int
+sa_cache_constructor(void *buf, void *unused, int kmflag)
+{
+ sa_handle_t *hdl = buf;
+
+ mutex_init(&hdl->sa_lock, NULL, MUTEX_DEFAULT, NULL);
+ return (0);
+}
+
+/*ARGSUSED*/
+static void
+sa_cache_destructor(void *buf, void *unused)
+{
+ sa_handle_t *hdl = buf;
+ mutex_destroy(&hdl->sa_lock);
+}
+
+void
+sa_cache_init(void)
+{
+ sa_cache = kmem_cache_create("sa_cache",
+ sizeof (sa_handle_t), 0, sa_cache_constructor,
+ sa_cache_destructor, NULL, NULL, NULL, 0);
+}
+
+void
+sa_cache_fini(void)
+{
+ if (sa_cache)
+ kmem_cache_destroy(sa_cache);
+}
+
+static int
+layout_num_compare(const void *arg1, const void *arg2)
+{
+ const sa_lot_t *node1 = (const sa_lot_t *)arg1;
+ const sa_lot_t *node2 = (const sa_lot_t *)arg2;
+
+ return (TREE_CMP(node1->lot_num, node2->lot_num));
+}
+
+static int
+layout_hash_compare(const void *arg1, const void *arg2)
+{
+ const sa_lot_t *node1 = (const sa_lot_t *)arg1;
+ const sa_lot_t *node2 = (const sa_lot_t *)arg2;
+
+ int cmp = TREE_CMP(node1->lot_hash, node2->lot_hash);
+ if (likely(cmp))
+ return (cmp);
+
+ return (TREE_CMP(node1->lot_instance, node2->lot_instance));
+}
+
+static boolean_t
+sa_layout_equal(sa_lot_t *tbf, sa_attr_type_t *attrs, int count)
+{
+ int i;
+
+ if (count != tbf->lot_attr_count)
+ return (1);
+
+ for (i = 0; i != count; i++) {
+ if (attrs[i] != tbf->lot_attrs[i])
+ return (1);
+ }
+ return (0);
+}
+
+#define SA_ATTR_HASH(attr) (zfs_crc64_table[(-1ULL ^ attr) & 0xFF])
+
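+/*
+ * The layout hash is simply the XOR of one CRC table entry per attribute,
+ * so distinct attribute sets can collide; sa_find_layout() resolves
+ * collisions with sa_layout_equal() and the per-layout lot_instance value.
+ */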
+static uint64_t
+sa_layout_info_hash(sa_attr_type_t *attrs, int attr_count)
+{
+ int i;
+ uint64_t crc = -1ULL;
+
+ for (i = 0; i != attr_count; i++)
+ crc ^= SA_ATTR_HASH(attrs[i]);
+
+ return (crc);
+}
+
+static int
+sa_get_spill(sa_handle_t *hdl)
+{
+ int rc;
+ if (hdl->sa_spill == NULL) {
+ if ((rc = dmu_spill_hold_existing(hdl->sa_bonus, NULL,
+ &hdl->sa_spill)) == 0)
+ VERIFY(0 == sa_build_index(hdl, SA_SPILL));
+ } else {
+ rc = 0;
+ }
+
+ return (rc);
+}
+
+/*
+ * Main attribute lookup/update function
+ * returns 0 for success or non-zero for failure.
+ *
+ * Operates on a bulk array; the first failure will abort further processing.
+ */
+static int
+sa_attr_op(sa_handle_t *hdl, sa_bulk_attr_t *bulk, int count,
+ sa_data_op_t data_op, dmu_tx_t *tx)
+{
+ sa_os_t *sa = hdl->sa_os->os_sa;
+ int i;
+ int error = 0;
+ sa_buf_type_t buftypes;
+
+ buftypes = 0;
+
+ ASSERT(count > 0);
+ for (i = 0; i != count; i++) {
+ ASSERT(bulk[i].sa_attr <= hdl->sa_os->os_sa->sa_num_attrs);
+
+ bulk[i].sa_addr = NULL;
+ /* First check the bonus buffer */
+
+ if (hdl->sa_bonus_tab && TOC_ATTR_PRESENT(
+ hdl->sa_bonus_tab->sa_idx_tab[bulk[i].sa_attr])) {
+ SA_ATTR_INFO(sa, hdl->sa_bonus_tab,
+ SA_GET_HDR(hdl, SA_BONUS),
+ bulk[i].sa_attr, bulk[i], SA_BONUS, hdl);
+ if (tx && !(buftypes & SA_BONUS)) {
+ dmu_buf_will_dirty(hdl->sa_bonus, tx);
+ buftypes |= SA_BONUS;
+ }
+ }
+ if (bulk[i].sa_addr == NULL &&
+ ((error = sa_get_spill(hdl)) == 0)) {
+ if (TOC_ATTR_PRESENT(
+ hdl->sa_spill_tab->sa_idx_tab[bulk[i].sa_attr])) {
+ SA_ATTR_INFO(sa, hdl->sa_spill_tab,
+ SA_GET_HDR(hdl, SA_SPILL),
+ bulk[i].sa_attr, bulk[i], SA_SPILL, hdl);
+ if (tx && !(buftypes & SA_SPILL) &&
+ bulk[i].sa_size == bulk[i].sa_length) {
+ dmu_buf_will_dirty(hdl->sa_spill, tx);
+ buftypes |= SA_SPILL;
+ }
+ }
+ }
+ if (error && error != ENOENT) {
+ return ((error == ECKSUM) ? EIO : error);
+ }
+
+ switch (data_op) {
+ case SA_LOOKUP:
+ if (bulk[i].sa_addr == NULL)
+ return (SET_ERROR(ENOENT));
+ if (bulk[i].sa_data) {
+ SA_COPY_DATA(bulk[i].sa_data_func,
+ bulk[i].sa_addr, bulk[i].sa_data,
+ bulk[i].sa_size);
+ }
+ continue;
+
+ case SA_UPDATE:
+ /* existing rewrite of attr */
+ if (bulk[i].sa_addr &&
+ bulk[i].sa_size == bulk[i].sa_length) {
+ SA_COPY_DATA(bulk[i].sa_data_func,
+ bulk[i].sa_data, bulk[i].sa_addr,
+ bulk[i].sa_length);
+ continue;
+ } else if (bulk[i].sa_addr) { /* attr size change */
+ error = sa_modify_attrs(hdl, bulk[i].sa_attr,
+ SA_REPLACE, bulk[i].sa_data_func,
+ bulk[i].sa_data, bulk[i].sa_length, tx);
+ } else { /* adding new attribute */
+ error = sa_modify_attrs(hdl, bulk[i].sa_attr,
+ SA_ADD, bulk[i].sa_data_func,
+ bulk[i].sa_data, bulk[i].sa_length, tx);
+ }
+ if (error)
+ return (error);
+ break;
+ default:
+ break;
+ }
+ }
+ return (error);
+}
+
+static sa_lot_t *
+sa_add_layout_entry(objset_t *os, sa_attr_type_t *attrs, int attr_count,
+ uint64_t lot_num, uint64_t hash, boolean_t zapadd, dmu_tx_t *tx)
+{
+ sa_os_t *sa = os->os_sa;
+ sa_lot_t *tb, *findtb;
+ int i;
+ avl_index_t loc;
+
+ ASSERT(MUTEX_HELD(&sa->sa_lock));
+ tb = kmem_zalloc(sizeof (sa_lot_t), KM_SLEEP);
+ tb->lot_attr_count = attr_count;
+ tb->lot_attrs = kmem_alloc(sizeof (sa_attr_type_t) * attr_count,
+ KM_SLEEP);
+ bcopy(attrs, tb->lot_attrs, sizeof (sa_attr_type_t) * attr_count);
+ tb->lot_num = lot_num;
+ tb->lot_hash = hash;
+ tb->lot_instance = 0;
+
+ if (zapadd) {
+ char attr_name[8];
+
+ if (sa->sa_layout_attr_obj == 0) {
+ sa->sa_layout_attr_obj = zap_create_link(os,
+ DMU_OT_SA_ATTR_LAYOUTS,
+ sa->sa_master_obj, SA_LAYOUTS, tx);
+ }
+
+ (void) snprintf(attr_name, sizeof (attr_name),
+ "%d", (int)lot_num);
+ VERIFY(0 == zap_update(os, os->os_sa->sa_layout_attr_obj,
+ attr_name, 2, attr_count, attrs, tx));
+ }
+
+ list_create(&tb->lot_idx_tab, sizeof (sa_idx_tab_t),
+ offsetof(sa_idx_tab_t, sa_next));
+
+ for (i = 0; i != attr_count; i++) {
+ if (sa->sa_attr_table[tb->lot_attrs[i]].sa_length == 0)
+ tb->lot_var_sizes++;
+ }
+
+ avl_add(&sa->sa_layout_num_tree, tb);
+
+ /* verify we don't have a hash collision */
+ if ((findtb = avl_find(&sa->sa_layout_hash_tree, tb, &loc)) != NULL) {
+ for (; findtb && findtb->lot_hash == hash;
+ findtb = AVL_NEXT(&sa->sa_layout_hash_tree, findtb)) {
+ if (findtb->lot_instance != tb->lot_instance)
+ break;
+ tb->lot_instance++;
+ }
+ }
+ avl_add(&sa->sa_layout_hash_tree, tb);
+ return (tb);
+}
+
+static void
+sa_find_layout(objset_t *os, uint64_t hash, sa_attr_type_t *attrs,
+ int count, dmu_tx_t *tx, sa_lot_t **lot)
+{
+ sa_lot_t *tb, tbsearch;
+ avl_index_t loc;
+ sa_os_t *sa = os->os_sa;
+ boolean_t found = B_FALSE;
+
+ mutex_enter(&sa->sa_lock);
+ tbsearch.lot_hash = hash;
+ tbsearch.lot_instance = 0;
+ tb = avl_find(&sa->sa_layout_hash_tree, &tbsearch, &loc);
+ if (tb) {
+ for (; tb && tb->lot_hash == hash;
+ tb = AVL_NEXT(&sa->sa_layout_hash_tree, tb)) {
+ if (sa_layout_equal(tb, attrs, count) == 0) {
+ found = B_TRUE;
+ break;
+ }
+ }
+ }
+ if (!found) {
+ tb = sa_add_layout_entry(os, attrs, count,
+ avl_numnodes(&sa->sa_layout_num_tree), hash, B_TRUE, tx);
+ }
+ mutex_exit(&sa->sa_lock);
+ *lot = tb;
+}
+
+static int
+sa_resize_spill(sa_handle_t *hdl, uint32_t size, dmu_tx_t *tx)
+{
+ int error;
+ uint32_t blocksize;
+
+ if (size == 0) {
+ blocksize = SPA_MINBLOCKSIZE;
+ } else if (size > SPA_OLD_MAXBLOCKSIZE) {
+ ASSERT(0);
+ return (SET_ERROR(EFBIG));
+ } else {
+ blocksize = P2ROUNDUP_TYPED(size, SPA_MINBLOCKSIZE, uint32_t);
+ }
+
+ error = dbuf_spill_set_blksz(hdl->sa_spill, blocksize, tx);
+ ASSERT(error == 0);
+ return (error);
+}
+
+static void
+sa_copy_data(sa_data_locator_t *func, void *datastart, void *target, int buflen)
+{
+ if (func == NULL) {
+ bcopy(datastart, target, buflen);
+ } else {
+ boolean_t start;
+ int bytes;
+ void *dataptr;
+ void *saptr = target;
+ uint32_t length;
+
+ start = B_TRUE;
+ bytes = 0;
+ while (bytes < buflen) {
+ func(&dataptr, &length, buflen, start, datastart);
+ bcopy(dataptr, saptr, length);
+ saptr = (void *)((caddr_t)saptr + length);
+ bytes += length;
+ start = B_FALSE;
+ }
+ }
+}
+
+/*
+ * Determine several different values pertaining to system attribute
+ * buffers.
+ *
+ * Return the size of the sa_hdr_phys_t header for the buffer. Each
+ * variable length attribute except the first contributes two bytes to
+ * the header size, which is then rounded up to an 8-byte boundary.
+ *
+ * The following output parameters are also computed.
+ *
+ * index - The index of the first attribute in attr_desc that will
+ * spill over. Only valid if will_spill is set.
+ *
+ * total - The total number of bytes of all system attributes described
+ * in attr_desc.
+ *
+ * will_spill - Set when spilling is necessary. It is only set when
+ * the buftype is SA_BONUS.
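+ *
+ * For example, following the rule above: a buffer with one variable-length
+ * attribute keeps the minimal 8-byte header, while one with three
+ * variable-length attributes needs 8 + 2 + 2 = 12 bytes, rounded up to 16.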
+ */
+static int
+sa_find_sizes(sa_os_t *sa, sa_bulk_attr_t *attr_desc, int attr_count,
+ dmu_buf_t *db, sa_buf_type_t buftype, int full_space, int *index,
+ int *total, boolean_t *will_spill)
+{
+ int var_size_count = 0;
+ int i;
+ int hdrsize;
+ int extra_hdrsize;
+
+ if (buftype == SA_BONUS && sa->sa_force_spill) {
+ *total = 0;
+ *index = 0;
+ *will_spill = B_TRUE;
+ return (0);
+ }
+
+ *index = -1;
+ *total = 0;
+ *will_spill = B_FALSE;
+
+ extra_hdrsize = 0;
+ hdrsize = (SA_BONUSTYPE_FROM_DB(db) == DMU_OT_ZNODE) ? 0 :
+ sizeof (sa_hdr_phys_t);
+
+ ASSERT(IS_P2ALIGNED(full_space, 8));
+
+ for (i = 0; i != attr_count; i++) {
+ boolean_t is_var_sz, might_spill_here;
+ int tmp_hdrsize;
+
+ *total = P2ROUNDUP(*total, 8);
+ *total += attr_desc[i].sa_length;
+ if (*will_spill)
+ continue;
+
+ is_var_sz = (SA_REGISTERED_LEN(sa, attr_desc[i].sa_attr) == 0);
+ if (is_var_sz)
+ var_size_count++;
+
+ /*
+ * Calculate what the SA header size would be if this
+ * attribute doesn't spill.
+ */
+ tmp_hdrsize = hdrsize + ((is_var_sz && var_size_count > 1) ?
+ sizeof (uint16_t) : 0);
+
+ /*
+ * Check whether this attribute spans into the space
+ * that would be used by the spill block pointer should
+ * a spill block be needed.
+ */
+ might_spill_here =
+ buftype == SA_BONUS && *index == -1 &&
+ (*total + P2ROUNDUP(tmp_hdrsize, 8)) >
+ (full_space - sizeof (blkptr_t));
+
+ if (is_var_sz && var_size_count > 1) {
+ if (buftype == SA_SPILL ||
+ tmp_hdrsize + *total < full_space) {
+ /*
+ * Record the extra header size in case this
+ * increase needs to be reversed due to
+ * spill-over.
+ */
+ hdrsize = tmp_hdrsize;
+ if (*index != -1 || might_spill_here)
+ extra_hdrsize += sizeof (uint16_t);
+ } else {
+ ASSERT(buftype == SA_BONUS);
+ if (*index == -1)
+ *index = i;
+ *will_spill = B_TRUE;
+ continue;
+ }
+ }
+
+ /*
+ * Store index of where spill *could* occur. Then
+ * continue to count the remaining attribute sizes. The
+ * sum is used later for sizing bonus and spill buffer.
+ */
+ if (might_spill_here)
+ *index = i;
+
+ if ((*total + P2ROUNDUP(hdrsize, 8)) > full_space &&
+ buftype == SA_BONUS)
+ *will_spill = B_TRUE;
+ }
+
+ if (*will_spill)
+ hdrsize -= extra_hdrsize;
+
+ hdrsize = P2ROUNDUP(hdrsize, 8);
+ return (hdrsize);
+}
+
+#define BUF_SPACE_NEEDED(total, header) (total + header)
+
+/*
+ * Find the layout that corresponds to the ordering of attributes.
+ * If none is found, a new layout number is created and added to the
+ * persistent layout tables.
+ */
+static int
+sa_build_layouts(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, int attr_count,
+ dmu_tx_t *tx)
+{
+ sa_os_t *sa = hdl->sa_os->os_sa;
+ uint64_t hash;
+ sa_buf_type_t buftype;
+ sa_hdr_phys_t *sahdr;
+ void *data_start;
+ sa_attr_type_t *attrs, *attrs_start;
+ int i, lot_count;
+ int dnodesize;
+ int spill_idx;
+ int hdrsize;
+ int spillhdrsize = 0;
+ int used;
+ dmu_object_type_t bonustype;
+ sa_lot_t *lot;
+ int len_idx;
+ int spill_used;
+ int bonuslen;
+ boolean_t spilling;
+
+ dmu_buf_will_dirty(hdl->sa_bonus, tx);
+ bonustype = SA_BONUSTYPE_FROM_DB(hdl->sa_bonus);
+ dmu_object_dnsize_from_db(hdl->sa_bonus, &dnodesize);
+ bonuslen = DN_BONUS_SIZE(dnodesize);
+
+ /* first determine bonus header size and sum of all attributes */
+ hdrsize = sa_find_sizes(sa, attr_desc, attr_count, hdl->sa_bonus,
+ SA_BONUS, bonuslen, &spill_idx, &used, &spilling);
+
+ if (used > SPA_OLD_MAXBLOCKSIZE)
+ return (SET_ERROR(EFBIG));
+
+ VERIFY0(dmu_set_bonus(hdl->sa_bonus, spilling ?
+ MIN(bonuslen - sizeof (blkptr_t), used + hdrsize) :
+ used + hdrsize, tx));
+
+ ASSERT((bonustype == DMU_OT_ZNODE && spilling == 0) ||
+ bonustype == DMU_OT_SA);
+
+ /* setup and size spill buffer when needed */
+ if (spilling) {
+ boolean_t dummy;
+
+ if (hdl->sa_spill == NULL) {
+ VERIFY(dmu_spill_hold_by_bonus(hdl->sa_bonus, 0, NULL,
+ &hdl->sa_spill) == 0);
+ }
+ dmu_buf_will_dirty(hdl->sa_spill, tx);
+
+ spillhdrsize = sa_find_sizes(sa, &attr_desc[spill_idx],
+ attr_count - spill_idx, hdl->sa_spill, SA_SPILL,
+ hdl->sa_spill->db_size, &i, &spill_used, &dummy);
+
+ if (spill_used > SPA_OLD_MAXBLOCKSIZE)
+ return (SET_ERROR(EFBIG));
+
+ if (BUF_SPACE_NEEDED(spill_used, spillhdrsize) >
+ hdl->sa_spill->db_size)
+ VERIFY(0 == sa_resize_spill(hdl,
+ BUF_SPACE_NEEDED(spill_used, spillhdrsize), tx));
+ }
+
+ /* setup starting pointers to lay down data */
+ data_start = (void *)((uintptr_t)hdl->sa_bonus->db_data + hdrsize);
+ sahdr = (sa_hdr_phys_t *)hdl->sa_bonus->db_data;
+ buftype = SA_BONUS;
+
+ attrs_start = attrs = kmem_alloc(sizeof (sa_attr_type_t) * attr_count,
+ KM_SLEEP);
+ lot_count = 0;
+
+ for (i = 0, len_idx = 0, hash = -1ULL; i != attr_count; i++) {
+ uint16_t length;
+
+ ASSERT(IS_P2ALIGNED(data_start, 8));
+ attrs[i] = attr_desc[i].sa_attr;
+ length = SA_REGISTERED_LEN(sa, attrs[i]);
+ if (length == 0)
+ length = attr_desc[i].sa_length;
+
+ if (spilling && i == spill_idx) { /* switch to spill buffer */
+ VERIFY(bonustype == DMU_OT_SA);
+ if (buftype == SA_BONUS && !sa->sa_force_spill) {
+ sa_find_layout(hdl->sa_os, hash, attrs_start,
+ lot_count, tx, &lot);
+ SA_SET_HDR(sahdr, lot->lot_num, hdrsize);
+ }
+
+ buftype = SA_SPILL;
+ hash = -1ULL;
+ len_idx = 0;
+
+ sahdr = (sa_hdr_phys_t *)hdl->sa_spill->db_data;
+ sahdr->sa_magic = SA_MAGIC;
+ data_start = (void *)((uintptr_t)sahdr +
+ spillhdrsize);
+ attrs_start = &attrs[i];
+ lot_count = 0;
+ }
+ hash ^= SA_ATTR_HASH(attrs[i]);
+ attr_desc[i].sa_addr = data_start;
+ attr_desc[i].sa_size = length;
+ SA_COPY_DATA(attr_desc[i].sa_data_func, attr_desc[i].sa_data,
+ data_start, length);
+ if (sa->sa_attr_table[attrs[i]].sa_length == 0) {
+ sahdr->sa_lengths[len_idx++] = length;
+ }
+ data_start = (void *)P2ROUNDUP(((uintptr_t)data_start +
+ length), 8);
+ lot_count++;
+ }
+
+ sa_find_layout(hdl->sa_os, hash, attrs_start, lot_count, tx, &lot);
+
+ /*
+ * Verify that old znodes always have layout number 0.
+	 * Must be DMU_OT_SA for arbitrary layouts.
+ */
+ VERIFY((bonustype == DMU_OT_ZNODE && lot->lot_num == 0) ||
+ (bonustype == DMU_OT_SA && lot->lot_num > 1));
+
+ if (bonustype == DMU_OT_SA) {
+ SA_SET_HDR(sahdr, lot->lot_num,
+ buftype == SA_BONUS ? hdrsize : spillhdrsize);
+ }
+
+ kmem_free(attrs, sizeof (sa_attr_type_t) * attr_count);
+ if (hdl->sa_bonus_tab) {
+ sa_idx_tab_rele(hdl->sa_os, hdl->sa_bonus_tab);
+ hdl->sa_bonus_tab = NULL;
+ }
+ if (!sa->sa_force_spill)
+ VERIFY(0 == sa_build_index(hdl, SA_BONUS));
+ if (hdl->sa_spill) {
+ sa_idx_tab_rele(hdl->sa_os, hdl->sa_spill_tab);
+ if (!spilling) {
+ /*
+ * remove spill block that is no longer needed.
+ */
+ dmu_buf_rele(hdl->sa_spill, NULL);
+ hdl->sa_spill = NULL;
+ hdl->sa_spill_tab = NULL;
+ VERIFY(0 == dmu_rm_spill(hdl->sa_os,
+ sa_handle_object(hdl), tx));
+ } else {
+ VERIFY(0 == sa_build_index(hdl, SA_SPILL));
+ }
+ }
+
+ return (0);
+}
+
+static void
+sa_free_attr_table(sa_os_t *sa)
+{
+ int i;
+
+ if (sa->sa_attr_table == NULL)
+ return;
+
+ for (i = 0; i != sa->sa_num_attrs; i++) {
+ if (sa->sa_attr_table[i].sa_name)
+ kmem_free(sa->sa_attr_table[i].sa_name,
+ strlen(sa->sa_attr_table[i].sa_name) + 1);
+ }
+
+ kmem_free(sa->sa_attr_table,
+ sizeof (sa_attr_table_t) * sa->sa_num_attrs);
+
+ sa->sa_attr_table = NULL;
+}
+
+static int
+sa_attr_table_setup(objset_t *os, sa_attr_reg_t *reg_attrs, int count)
+{
+ sa_os_t *sa = os->os_sa;
+ uint64_t sa_attr_count = 0;
+ uint64_t sa_reg_count = 0;
+ int error = 0;
+ uint64_t attr_value;
+ sa_attr_table_t *tb;
+ zap_cursor_t zc;
+ zap_attribute_t za;
+ int registered_count = 0;
+ int i;
+ dmu_objset_type_t ostype = dmu_objset_type(os);
+
+ sa->sa_user_table =
+ kmem_zalloc(count * sizeof (sa_attr_type_t), KM_SLEEP);
+ sa->sa_user_table_sz = count * sizeof (sa_attr_type_t);
+
+ if (sa->sa_reg_attr_obj != 0) {
+ error = zap_count(os, sa->sa_reg_attr_obj,
+ &sa_attr_count);
+
+ /*
+ * Make sure we retrieved a count and that it isn't zero
+ */
+ if (error || (error == 0 && sa_attr_count == 0)) {
+ if (error == 0)
+ error = SET_ERROR(EINVAL);
+ goto bail;
+ }
+ sa_reg_count = sa_attr_count;
+ }
+
+ if (ostype == DMU_OST_ZFS && sa_attr_count == 0)
+ sa_attr_count += sa_legacy_attr_count;
+
+ /* Allocate attribute numbers for attributes that aren't registered */
+ for (i = 0; i != count; i++) {
+ boolean_t found = B_FALSE;
+ int j;
+
+ if (ostype == DMU_OST_ZFS) {
+ for (j = 0; j != sa_legacy_attr_count; j++) {
+ if (strcmp(reg_attrs[i].sa_name,
+ sa_legacy_attrs[j].sa_name) == 0) {
+ sa->sa_user_table[i] =
+ sa_legacy_attrs[j].sa_attr;
+ found = B_TRUE;
+ }
+ }
+ }
+ if (found)
+ continue;
+
+ if (sa->sa_reg_attr_obj)
+ error = zap_lookup(os, sa->sa_reg_attr_obj,
+ reg_attrs[i].sa_name, 8, 1, &attr_value);
+ else
+ error = SET_ERROR(ENOENT);
+ switch (error) {
+ case ENOENT:
+ sa->sa_user_table[i] = (sa_attr_type_t)sa_attr_count;
+ sa_attr_count++;
+ break;
+ case 0:
+ sa->sa_user_table[i] = ATTR_NUM(attr_value);
+ break;
+ default:
+ goto bail;
+ }
+ }
+
+ sa->sa_num_attrs = sa_attr_count;
+ tb = sa->sa_attr_table =
+ kmem_zalloc(sizeof (sa_attr_table_t) * sa_attr_count, KM_SLEEP);
+
+ /*
+	 * The attribute table is constructed from the requested attribute
+	 * list, previously registered ("foreign") attributes, and also the
+	 * legacy ZPL set of attributes.
+ */
+
+ if (sa->sa_reg_attr_obj) {
+ for (zap_cursor_init(&zc, os, sa->sa_reg_attr_obj);
+ (error = zap_cursor_retrieve(&zc, &za)) == 0;
+ zap_cursor_advance(&zc)) {
+ uint64_t value;
+ value = za.za_first_integer;
+
+ registered_count++;
+ tb[ATTR_NUM(value)].sa_attr = ATTR_NUM(value);
+ tb[ATTR_NUM(value)].sa_length = ATTR_LENGTH(value);
+ tb[ATTR_NUM(value)].sa_byteswap = ATTR_BSWAP(value);
+ tb[ATTR_NUM(value)].sa_registered = B_TRUE;
+
+ if (tb[ATTR_NUM(value)].sa_name) {
+ continue;
+ }
+ tb[ATTR_NUM(value)].sa_name =
+ kmem_zalloc(strlen(za.za_name) +1, KM_SLEEP);
+ (void) strlcpy(tb[ATTR_NUM(value)].sa_name, za.za_name,
+ strlen(za.za_name) +1);
+ }
+ zap_cursor_fini(&zc);
+ /*
+ * Make sure we processed the correct number of registered
+ * attributes
+ */
+ if (registered_count != sa_reg_count) {
+ ASSERT(error != 0);
+ goto bail;
+ }
+
+ }
+
+ if (ostype == DMU_OST_ZFS) {
+ for (i = 0; i != sa_legacy_attr_count; i++) {
+ if (tb[i].sa_name)
+ continue;
+ tb[i].sa_attr = sa_legacy_attrs[i].sa_attr;
+ tb[i].sa_length = sa_legacy_attrs[i].sa_length;
+ tb[i].sa_byteswap = sa_legacy_attrs[i].sa_byteswap;
+ tb[i].sa_registered = B_FALSE;
+ tb[i].sa_name =
+ kmem_zalloc(strlen(sa_legacy_attrs[i].sa_name) +1,
+ KM_SLEEP);
+ (void) strlcpy(tb[i].sa_name,
+ sa_legacy_attrs[i].sa_name,
+ strlen(sa_legacy_attrs[i].sa_name) + 1);
+ }
+ }
+
+ for (i = 0; i != count; i++) {
+ sa_attr_type_t attr_id;
+
+ attr_id = sa->sa_user_table[i];
+ if (tb[attr_id].sa_name)
+ continue;
+
+ tb[attr_id].sa_length = reg_attrs[i].sa_length;
+ tb[attr_id].sa_byteswap = reg_attrs[i].sa_byteswap;
+ tb[attr_id].sa_attr = attr_id;
+ tb[attr_id].sa_name =
+ kmem_zalloc(strlen(reg_attrs[i].sa_name) + 1, KM_SLEEP);
+ (void) strlcpy(tb[attr_id].sa_name, reg_attrs[i].sa_name,
+ strlen(reg_attrs[i].sa_name) + 1);
+ }
+
+ sa->sa_need_attr_registration =
+ (sa_attr_count != registered_count);
+
+ return (0);
+bail:
+ kmem_free(sa->sa_user_table, count * sizeof (sa_attr_type_t));
+ sa->sa_user_table = NULL;
+ sa_free_attr_table(sa);
+ ASSERT(error != 0);
+ return (error);
+}
+
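+/*
+ * Set up the per-objset SA infrastructure: build the attribute table,
+ * load any persistent layouts from the SA master object 'sa_obj', and
+ * hand back the translation table for the caller's requested attributes.
+ * If the objset already has an sa_os_t attached, the existing user table
+ * is simply returned.
+ */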
+int
+sa_setup(objset_t *os, uint64_t sa_obj, sa_attr_reg_t *reg_attrs, int count,
+ sa_attr_type_t **user_table)
+{
+ zap_cursor_t zc;
+ zap_attribute_t za;
+ sa_os_t *sa;
+ dmu_objset_type_t ostype = dmu_objset_type(os);
+ sa_attr_type_t *tb;
+ int error;
+
+ mutex_enter(&os->os_user_ptr_lock);
+ if (os->os_sa) {
+ mutex_enter(&os->os_sa->sa_lock);
+ mutex_exit(&os->os_user_ptr_lock);
+ tb = os->os_sa->sa_user_table;
+ mutex_exit(&os->os_sa->sa_lock);
+ *user_table = tb;
+ return (0);
+ }
+
+ sa = kmem_zalloc(sizeof (sa_os_t), KM_SLEEP);
+ mutex_init(&sa->sa_lock, NULL, MUTEX_NOLOCKDEP, NULL);
+ sa->sa_master_obj = sa_obj;
+
+ os->os_sa = sa;
+ mutex_enter(&sa->sa_lock);
+ mutex_exit(&os->os_user_ptr_lock);
+ avl_create(&sa->sa_layout_num_tree, layout_num_compare,
+ sizeof (sa_lot_t), offsetof(sa_lot_t, lot_num_node));
+ avl_create(&sa->sa_layout_hash_tree, layout_hash_compare,
+ sizeof (sa_lot_t), offsetof(sa_lot_t, lot_hash_node));
+
+ if (sa_obj) {
+ error = zap_lookup(os, sa_obj, SA_LAYOUTS,
+ 8, 1, &sa->sa_layout_attr_obj);
+ if (error != 0 && error != ENOENT)
+ goto fail;
+ error = zap_lookup(os, sa_obj, SA_REGISTRY,
+ 8, 1, &sa->sa_reg_attr_obj);
+ if (error != 0 && error != ENOENT)
+ goto fail;
+ }
+
+ if ((error = sa_attr_table_setup(os, reg_attrs, count)) != 0)
+ goto fail;
+
+ if (sa->sa_layout_attr_obj != 0) {
+ uint64_t layout_count;
+
+ error = zap_count(os, sa->sa_layout_attr_obj,
+ &layout_count);
+
+ /*
+ * Layout number count should be > 0
+ */
+ if (error || (error == 0 && layout_count == 0)) {
+ if (error == 0)
+ error = SET_ERROR(EINVAL);
+ goto fail;
+ }
+
+ for (zap_cursor_init(&zc, os, sa->sa_layout_attr_obj);
+ (error = zap_cursor_retrieve(&zc, &za)) == 0;
+ zap_cursor_advance(&zc)) {
+ sa_attr_type_t *lot_attrs;
+ uint64_t lot_num;
+
+ lot_attrs = kmem_zalloc(sizeof (sa_attr_type_t) *
+ za.za_num_integers, KM_SLEEP);
+
+ if ((error = (zap_lookup(os, sa->sa_layout_attr_obj,
+ za.za_name, 2, za.za_num_integers,
+ lot_attrs))) != 0) {
+ kmem_free(lot_attrs, sizeof (sa_attr_type_t) *
+ za.za_num_integers);
+ break;
+ }
+ VERIFY(ddi_strtoull(za.za_name, NULL, 10,
+ (unsigned long long *)&lot_num) == 0);
+
+ (void) sa_add_layout_entry(os, lot_attrs,
+ za.za_num_integers, lot_num,
+ sa_layout_info_hash(lot_attrs,
+ za.za_num_integers), B_FALSE, NULL);
+ kmem_free(lot_attrs, sizeof (sa_attr_type_t) *
+ za.za_num_integers);
+ }
+ zap_cursor_fini(&zc);
+
+ /*
+ * Make sure layout count matches number of entries added
+ * to AVL tree
+ */
+ if (avl_numnodes(&sa->sa_layout_num_tree) != layout_count) {
+ ASSERT(error != 0);
+ goto fail;
+ }
+ }
+
+ /* Add special layout number for old ZNODES */
+ if (ostype == DMU_OST_ZFS) {
+ (void) sa_add_layout_entry(os, sa_legacy_zpl_layout,
+ sa_legacy_attr_count, 0,
+ sa_layout_info_hash(sa_legacy_zpl_layout,
+ sa_legacy_attr_count), B_FALSE, NULL);
+
+ (void) sa_add_layout_entry(os, sa_dummy_zpl_layout, 0, 1,
+ 0, B_FALSE, NULL);
+ }
+ *user_table = os->os_sa->sa_user_table;
+ mutex_exit(&sa->sa_lock);
+ return (0);
+fail:
+ os->os_sa = NULL;
+ sa_free_attr_table(sa);
+ if (sa->sa_user_table)
+ kmem_free(sa->sa_user_table, sa->sa_user_table_sz);
+ mutex_exit(&sa->sa_lock);
+ avl_destroy(&sa->sa_layout_hash_tree);
+ avl_destroy(&sa->sa_layout_num_tree);
+ mutex_destroy(&sa->sa_lock);
+ kmem_free(sa, sizeof (sa_os_t));
+ return ((error == ECKSUM) ? EIO : error);
+}
+
+void
+sa_tear_down(objset_t *os)
+{
+ sa_os_t *sa = os->os_sa;
+ sa_lot_t *layout;
+ void *cookie;
+
+ kmem_free(sa->sa_user_table, sa->sa_user_table_sz);
+
+ /* Free up attr table */
+
+ sa_free_attr_table(sa);
+
+ cookie = NULL;
+ while ((layout =
+ avl_destroy_nodes(&sa->sa_layout_hash_tree, &cookie))) {
+ sa_idx_tab_t *tab;
+ while ((tab = list_head(&layout->lot_idx_tab))) {
+ ASSERT(zfs_refcount_count(&tab->sa_refcount));
+ sa_idx_tab_rele(os, tab);
+ }
+ }
+
+ cookie = NULL;
+ while ((layout = avl_destroy_nodes(&sa->sa_layout_num_tree, &cookie))) {
+ kmem_free(layout->lot_attrs,
+ sizeof (sa_attr_type_t) * layout->lot_attr_count);
+ kmem_free(layout, sizeof (sa_lot_t));
+ }
+
+ avl_destroy(&sa->sa_layout_hash_tree);
+ avl_destroy(&sa->sa_layout_num_tree);
+ mutex_destroy(&sa->sa_lock);
+
+ kmem_free(sa, sizeof (sa_os_t));
+ os->os_sa = NULL;
+}
+
+static void
+sa_build_idx_tab(void *hdr, void *attr_addr, sa_attr_type_t attr,
+ uint16_t length, int length_idx, boolean_t var_length, void *userp)
+{
+ sa_idx_tab_t *idx_tab = userp;
+
+ if (var_length) {
+ ASSERT(idx_tab->sa_variable_lengths);
+ idx_tab->sa_variable_lengths[length_idx] = length;
+ }
+ TOC_ATTR_ENCODE(idx_tab->sa_idx_tab[attr], length_idx,
+ (uint32_t)((uintptr_t)attr_addr - (uintptr_t)hdr));
+}
+
+static void
+sa_attr_iter(objset_t *os, sa_hdr_phys_t *hdr, dmu_object_type_t type,
+ sa_iterfunc_t func, sa_lot_t *tab, void *userp)
+{
+ void *data_start;
+ sa_lot_t *tb = tab;
+ sa_lot_t search;
+ avl_index_t loc;
+ sa_os_t *sa = os->os_sa;
+ int i;
+ uint16_t *length_start = NULL;
+ uint8_t length_idx = 0;
+
+ if (tab == NULL) {
+ search.lot_num = SA_LAYOUT_NUM(hdr, type);
+ tb = avl_find(&sa->sa_layout_num_tree, &search, &loc);
+ ASSERT(tb);
+ }
+
+ if (IS_SA_BONUSTYPE(type)) {
+ data_start = (void *)P2ROUNDUP(((uintptr_t)hdr +
+ offsetof(sa_hdr_phys_t, sa_lengths) +
+ (sizeof (uint16_t) * tb->lot_var_sizes)), 8);
+ length_start = hdr->sa_lengths;
+ } else {
+ data_start = hdr;
+ }
+
+ for (i = 0; i != tb->lot_attr_count; i++) {
+ int attr_length, reg_length;
+ uint8_t idx_len;
+
+ reg_length = sa->sa_attr_table[tb->lot_attrs[i]].sa_length;
+ if (reg_length) {
+ attr_length = reg_length;
+ idx_len = 0;
+ } else {
+ attr_length = length_start[length_idx];
+ idx_len = length_idx++;
+ }
+
+ func(hdr, data_start, tb->lot_attrs[i], attr_length,
+ idx_len, reg_length == 0 ? B_TRUE : B_FALSE, userp);
+
+ data_start = (void *)P2ROUNDUP(((uintptr_t)data_start +
+ attr_length), 8);
+ }
+}
+
+/*ARGSUSED*/
+static void
+sa_byteswap_cb(void *hdr, void *attr_addr, sa_attr_type_t attr,
+ uint16_t length, int length_idx, boolean_t variable_length, void *userp)
+{
+ sa_handle_t *hdl = userp;
+ sa_os_t *sa = hdl->sa_os->os_sa;
+
+ sa_bswap_table[sa->sa_attr_table[attr].sa_byteswap](attr_addr, length);
+}
+
+static void
+sa_byteswap(sa_handle_t *hdl, sa_buf_type_t buftype)
+{
+ sa_hdr_phys_t *sa_hdr_phys = SA_GET_HDR(hdl, buftype);
+ dmu_buf_impl_t *db;
+ int num_lengths = 1;
+ int i;
+ sa_os_t *sa __maybe_unused = hdl->sa_os->os_sa;
+
+ ASSERT(MUTEX_HELD(&sa->sa_lock));
+ if (sa_hdr_phys->sa_magic == SA_MAGIC)
+ return;
+
+ db = SA_GET_DB(hdl, buftype);
+
+ if (buftype == SA_SPILL) {
+ arc_release(db->db_buf, NULL);
+ arc_buf_thaw(db->db_buf);
+ }
+
+ sa_hdr_phys->sa_magic = BSWAP_32(sa_hdr_phys->sa_magic);
+ sa_hdr_phys->sa_layout_info = BSWAP_16(sa_hdr_phys->sa_layout_info);
+
+ /*
+	 * Determine the number of variable lengths in the header.
+	 * The standard 8 byte header has one for free, and a
+	 * 16 byte header would have 4 + 1.
+ */
+ if (SA_HDR_SIZE(sa_hdr_phys) > 8)
+ num_lengths += (SA_HDR_SIZE(sa_hdr_phys) - 8) >> 1;
+ for (i = 0; i != num_lengths; i++)
+ sa_hdr_phys->sa_lengths[i] =
+ BSWAP_16(sa_hdr_phys->sa_lengths[i]);
+
+ sa_attr_iter(hdl->sa_os, sa_hdr_phys, DMU_OT_SA,
+ sa_byteswap_cb, NULL, hdl);
+
+ if (buftype == SA_SPILL)
+ arc_buf_freeze(((dmu_buf_impl_t *)hdl->sa_spill)->db_buf);
+}
+
+static int
+sa_build_index(sa_handle_t *hdl, sa_buf_type_t buftype)
+{
+ sa_hdr_phys_t *sa_hdr_phys;
+ dmu_buf_impl_t *db = SA_GET_DB(hdl, buftype);
+ dmu_object_type_t bonustype = SA_BONUSTYPE_FROM_DB(db);
+ sa_os_t *sa = hdl->sa_os->os_sa;
+ sa_idx_tab_t *idx_tab;
+
+ sa_hdr_phys = SA_GET_HDR(hdl, buftype);
+
+ mutex_enter(&sa->sa_lock);
+
+ /* Do we need to byteswap? */
+
+ /* only check if not old znode */
+ if (IS_SA_BONUSTYPE(bonustype) && sa_hdr_phys->sa_magic != SA_MAGIC &&
+ sa_hdr_phys->sa_magic != 0) {
+ if (BSWAP_32(sa_hdr_phys->sa_magic) != SA_MAGIC) {
+ mutex_exit(&sa->sa_lock);
+ zfs_dbgmsg("Buffer Header: %x != SA_MAGIC:%x "
+ "object=%#llx\n", sa_hdr_phys->sa_magic, SA_MAGIC,
+ db->db.db_object);
+ return (SET_ERROR(EIO));
+ }
+ sa_byteswap(hdl, buftype);
+ }
+
+ idx_tab = sa_find_idx_tab(hdl->sa_os, bonustype, sa_hdr_phys);
+
+ if (buftype == SA_BONUS)
+ hdl->sa_bonus_tab = idx_tab;
+ else
+ hdl->sa_spill_tab = idx_tab;
+
+ mutex_exit(&sa->sa_lock);
+ return (0);
+}
+
+/*ARGSUSED*/
+static void
+sa_evict_sync(void *dbu)
+{
+ panic("evicting sa dbuf\n");
+}
+
+static void
+sa_idx_tab_rele(objset_t *os, void *arg)
+{
+ sa_os_t *sa = os->os_sa;
+ sa_idx_tab_t *idx_tab = arg;
+
+ if (idx_tab == NULL)
+ return;
+
+ mutex_enter(&sa->sa_lock);
+ if (zfs_refcount_remove(&idx_tab->sa_refcount, NULL) == 0) {
+ list_remove(&idx_tab->sa_layout->lot_idx_tab, idx_tab);
+ if (idx_tab->sa_variable_lengths)
+ kmem_free(idx_tab->sa_variable_lengths,
+ sizeof (uint16_t) *
+ idx_tab->sa_layout->lot_var_sizes);
+ zfs_refcount_destroy(&idx_tab->sa_refcount);
+ kmem_free(idx_tab->sa_idx_tab,
+ sizeof (uint32_t) * sa->sa_num_attrs);
+ kmem_free(idx_tab, sizeof (sa_idx_tab_t));
+ }
+ mutex_exit(&sa->sa_lock);
+}
+
+static void
+sa_idx_tab_hold(objset_t *os, sa_idx_tab_t *idx_tab)
+{
+ sa_os_t *sa __maybe_unused = os->os_sa;
+
+ ASSERT(MUTEX_HELD(&sa->sa_lock));
+ (void) zfs_refcount_add(&idx_tab->sa_refcount, NULL);
+}
+
+void
+sa_spill_rele(sa_handle_t *hdl)
+{
+ mutex_enter(&hdl->sa_lock);
+ if (hdl->sa_spill) {
+ sa_idx_tab_rele(hdl->sa_os, hdl->sa_spill_tab);
+ dmu_buf_rele(hdl->sa_spill, NULL);
+ hdl->sa_spill = NULL;
+ hdl->sa_spill_tab = NULL;
+ }
+ mutex_exit(&hdl->sa_lock);
+}
+
+void
+sa_handle_destroy(sa_handle_t *hdl)
+{
+ dmu_buf_t *db = hdl->sa_bonus;
+
+ mutex_enter(&hdl->sa_lock);
+ (void) dmu_buf_remove_user(db, &hdl->sa_dbu);
+
+ if (hdl->sa_bonus_tab)
+ sa_idx_tab_rele(hdl->sa_os, hdl->sa_bonus_tab);
+
+ if (hdl->sa_spill_tab)
+ sa_idx_tab_rele(hdl->sa_os, hdl->sa_spill_tab);
+
+ dmu_buf_rele(hdl->sa_bonus, NULL);
+
+ if (hdl->sa_spill)
+ dmu_buf_rele(hdl->sa_spill, NULL);
+ mutex_exit(&hdl->sa_lock);
+
+ kmem_cache_free(sa_cache, hdl);
+}
+
+int
+sa_handle_get_from_db(objset_t *os, dmu_buf_t *db, void *userp,
+ sa_handle_type_t hdl_type, sa_handle_t **handlepp)
+{
+ int error = 0;
+ sa_handle_t *handle = NULL;
+#ifdef ZFS_DEBUG
+ dmu_object_info_t doi;
+
+ dmu_object_info_from_db(db, &doi);
+ ASSERT(doi.doi_bonus_type == DMU_OT_SA ||
+ doi.doi_bonus_type == DMU_OT_ZNODE);
+#endif
+ /* find handle, if it exists */
+ /* if one doesn't exist then create a new one, and initialize it */
+
+ if (hdl_type == SA_HDL_SHARED)
+ handle = dmu_buf_get_user(db);
+
+ if (handle == NULL) {
+ sa_handle_t *winner = NULL;
+
+ handle = kmem_cache_alloc(sa_cache, KM_SLEEP);
+ handle->sa_dbu.dbu_evict_func_sync = NULL;
+ handle->sa_dbu.dbu_evict_func_async = NULL;
+ handle->sa_userp = userp;
+ handle->sa_bonus = db;
+ handle->sa_os = os;
+ handle->sa_spill = NULL;
+ handle->sa_bonus_tab = NULL;
+ handle->sa_spill_tab = NULL;
+
+ error = sa_build_index(handle, SA_BONUS);
+
+ if (hdl_type == SA_HDL_SHARED) {
+ dmu_buf_init_user(&handle->sa_dbu, sa_evict_sync, NULL,
+ NULL);
+ winner = dmu_buf_set_user_ie(db, &handle->sa_dbu);
+ }
+
+ if (winner != NULL) {
+ kmem_cache_free(sa_cache, handle);
+ handle = winner;
+ }
+ }
+ *handlepp = handle;
+
+ return (error);
+}
+
+int
+sa_handle_get(objset_t *objset, uint64_t objid, void *userp,
+ sa_handle_type_t hdl_type, sa_handle_t **handlepp)
+{
+ dmu_buf_t *db;
+ int error;
+
+ if ((error = dmu_bonus_hold(objset, objid, NULL, &db)))
+ return (error);
+
+ return (sa_handle_get_from_db(objset, db, userp, hdl_type,
+ handlepp));
+}
+
+int
+sa_buf_hold(objset_t *objset, uint64_t obj_num, void *tag, dmu_buf_t **db)
+{
+ return (dmu_bonus_hold(objset, obj_num, tag, db));
+}
+
+void
+sa_buf_rele(dmu_buf_t *db, void *tag)
+{
+ dmu_buf_rele(db, tag);
+}
+
+static int
+sa_lookup_impl(sa_handle_t *hdl, sa_bulk_attr_t *bulk, int count)
+{
+ ASSERT(hdl);
+ ASSERT(MUTEX_HELD(&hdl->sa_lock));
+ return (sa_attr_op(hdl, bulk, count, SA_LOOKUP, NULL));
+}
+
+static int
+sa_lookup_locked(sa_handle_t *hdl, sa_attr_type_t attr, void *buf,
+ uint32_t buflen)
+{
+ int error;
+ sa_bulk_attr_t bulk;
+
+ VERIFY3U(buflen, <=, SA_ATTR_MAX_LEN);
+
+ bulk.sa_attr = attr;
+ bulk.sa_data = buf;
+ bulk.sa_length = buflen;
+ bulk.sa_data_func = NULL;
+
+ ASSERT(hdl);
+ error = sa_lookup_impl(hdl, &bulk, 1);
+ return (error);
+}
+
+int
+sa_lookup(sa_handle_t *hdl, sa_attr_type_t attr, void *buf, uint32_t buflen)
+{
+ int error;
+
+ mutex_enter(&hdl->sa_lock);
+ error = sa_lookup_locked(hdl, attr, buf, buflen);
+ mutex_exit(&hdl->sa_lock);
+
+ return (error);
+}
+
+#ifdef _KERNEL
+int
+sa_lookup_uio(sa_handle_t *hdl, sa_attr_type_t attr, zfs_uio_t *uio)
+{
+ int error;
+ sa_bulk_attr_t bulk;
+
+ bulk.sa_data = NULL;
+ bulk.sa_attr = attr;
+ bulk.sa_data_func = NULL;
+
+ ASSERT(hdl);
+
+ mutex_enter(&hdl->sa_lock);
+ if ((error = sa_attr_op(hdl, &bulk, 1, SA_LOOKUP, NULL)) == 0) {
+ error = zfs_uiomove((void *)bulk.sa_addr, MIN(bulk.sa_size,
+ zfs_uio_resid(uio)), UIO_READ, uio);
+ }
+ mutex_exit(&hdl->sa_lock);
+ return (error);
+}
+
+/*
+ * For an existing object that was upgraded from an old system, the on-disk
+ * layout has no slot for the project ID attribute. But the quota accounting
+ * logic needs to access the related slots by offset directly, so we need to
+ * adjust these old objects' layouts to place the project ID at a unified and
+ * fixed offset.
+ */
+int
+sa_add_projid(sa_handle_t *hdl, dmu_tx_t *tx, uint64_t projid)
+{
+ znode_t *zp = sa_get_userdata(hdl);
+ dmu_buf_t *db = sa_get_db(hdl);
+ zfsvfs_t *zfsvfs = ZTOZSB(zp);
+ int count = 0, err = 0;
+ sa_bulk_attr_t *bulk, *attrs;
+ zfs_acl_locator_cb_t locate = { 0 };
+ uint64_t uid, gid, mode, rdev, xattr = 0, parent, gen, links;
+ uint64_t crtime[2], mtime[2], ctime[2], atime[2];
+ zfs_acl_phys_t znode_acl = { 0 };
+ char scanstamp[AV_SCANSTAMP_SZ];
+
+ if (zp->z_acl_cached == NULL) {
+ zfs_acl_t *aclp;
+
+ mutex_enter(&zp->z_acl_lock);
+ err = zfs_acl_node_read(zp, B_FALSE, &aclp, B_FALSE);
+ mutex_exit(&zp->z_acl_lock);
+ if (err != 0 && err != ENOENT)
+ return (err);
+ }
+
+ bulk = kmem_zalloc(sizeof (sa_bulk_attr_t) * ZPL_END, KM_SLEEP);
+ attrs = kmem_zalloc(sizeof (sa_bulk_attr_t) * ZPL_END, KM_SLEEP);
+ mutex_enter(&hdl->sa_lock);
+ mutex_enter(&zp->z_lock);
+
+ err = sa_lookup_locked(hdl, SA_ZPL_PROJID(zfsvfs), &projid,
+ sizeof (uint64_t));
+ if (unlikely(err == 0))
+ /* Someone has added project ID attr by race. */
+ err = EEXIST;
+ if (err != ENOENT)
+ goto out;
+
+ /* First do a bulk query of the attributes that aren't cached */
+ if (zp->z_is_sa) {
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
+ &mode, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL,
+ &gen, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
+ &uid, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL,
+ &gid, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL,
+ &parent, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
+ &atime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
+ &mtime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
+ &ctime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL,
+ &crtime, 16);
+ if (Z_ISBLK(ZTOTYPE(zp)) || Z_ISCHR(ZTOTYPE(zp)))
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_RDEV(zfsvfs), NULL,
+ &rdev, 8);
+ } else {
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
+ &atime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
+ &mtime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
+ &ctime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL,
+ &crtime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL,
+ &gen, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
+ &mode, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL,
+ &parent, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_XATTR(zfsvfs), NULL,
+ &xattr, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_RDEV(zfsvfs), NULL,
+ &rdev, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
+ &uid, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL,
+ &gid, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ZNODE_ACL(zfsvfs), NULL,
+ &znode_acl, 88);
+ }
+ err = sa_bulk_lookup_locked(hdl, bulk, count);
+ if (err != 0)
+ goto out;
+
+ err = sa_lookup_locked(hdl, SA_ZPL_XATTR(zfsvfs), &xattr, 8);
+ if (err != 0 && err != ENOENT)
+ goto out;
+
+ zp->z_projid = projid;
+ zp->z_pflags |= ZFS_PROJID;
+ links = ZTONLNK(zp);
+ count = 0;
+ err = 0;
+
+ SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8);
+ SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_SIZE(zfsvfs), NULL,
+ &zp->z_size, 8);
+ SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_GEN(zfsvfs), NULL, &gen, 8);
+ SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_UID(zfsvfs), NULL, &uid, 8);
+ SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_GID(zfsvfs), NULL, &gid, 8);
+ SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_PARENT(zfsvfs), NULL, &parent, 8);
+ SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+ &zp->z_pflags, 8);
+ SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16);
+ SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
+ SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
+ SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_CRTIME(zfsvfs), NULL,
+ &crtime, 16);
+ SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_LINKS(zfsvfs), NULL, &links, 8);
+ SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_PROJID(zfsvfs), NULL, &projid, 8);
+
+ if (Z_ISBLK(ZTOTYPE(zp)) || Z_ISCHR(ZTOTYPE(zp)))
+ SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_RDEV(zfsvfs), NULL,
+ &rdev, 8);
+
+ if (zp->z_acl_cached != NULL) {
+ SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_DACL_COUNT(zfsvfs), NULL,
+ &zp->z_acl_cached->z_acl_count, 8);
+ if (zp->z_acl_cached->z_version < ZFS_ACL_VERSION_FUID)
+ zfs_acl_xform(zp, zp->z_acl_cached, CRED());
+ locate.cb_aclp = zp->z_acl_cached;
+ SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_DACL_ACES(zfsvfs),
+ zfs_acl_data_locator, &locate,
+ zp->z_acl_cached->z_acl_bytes);
+ }
+
+ if (xattr)
+ SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_XATTR(zfsvfs), NULL,
+ &xattr, 8);
+
+ if (zp->z_pflags & ZFS_BONUS_SCANSTAMP) {
+ bcopy((caddr_t)db->db_data + ZFS_OLD_ZNODE_PHYS_SIZE,
+ scanstamp, AV_SCANSTAMP_SZ);
+ SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_SCANSTAMP(zfsvfs), NULL,
+ scanstamp, AV_SCANSTAMP_SZ);
+ zp->z_pflags &= ~ZFS_BONUS_SCANSTAMP;
+ }
+
+ VERIFY(dmu_set_bonustype(db, DMU_OT_SA, tx) == 0);
+ VERIFY(sa_replace_all_by_template_locked(hdl, attrs, count, tx) == 0);
+ if (znode_acl.z_acl_extern_obj) {
+ VERIFY(0 == dmu_object_free(zfsvfs->z_os,
+ znode_acl.z_acl_extern_obj, tx));
+ }
+
+ zp->z_is_sa = B_TRUE;
+
+out:
+ mutex_exit(&zp->z_lock);
+ mutex_exit(&hdl->sa_lock);
+ kmem_free(attrs, sizeof (sa_bulk_attr_t) * ZPL_END);
+ kmem_free(bulk, sizeof (sa_bulk_attr_t) * ZPL_END);
+ return (err);
+}
+#endif
+
+static sa_idx_tab_t *
+sa_find_idx_tab(objset_t *os, dmu_object_type_t bonustype, sa_hdr_phys_t *hdr)
+{
+ sa_idx_tab_t *idx_tab;
+ sa_os_t *sa = os->os_sa;
+ sa_lot_t *tb, search;
+ avl_index_t loc;
+
+ /*
+	 * Determine the layout number. If SA node and header == 0 then
+ * force the index table to the dummy "1" empty layout.
+ *
+ * The layout number would only be zero for a newly created file
+ * that has not added any attributes yet, or with crypto enabled which
+ * doesn't write any attributes to the bonus buffer.
+ */
+
+ search.lot_num = SA_LAYOUT_NUM(hdr, bonustype);
+
+ tb = avl_find(&sa->sa_layout_num_tree, &search, &loc);
+
+ /* Verify header size is consistent with layout information */
+ ASSERT(tb);
+ ASSERT((IS_SA_BONUSTYPE(bonustype) &&
+ SA_HDR_SIZE_MATCH_LAYOUT(hdr, tb)) || !IS_SA_BONUSTYPE(bonustype) ||
+ (IS_SA_BONUSTYPE(bonustype) && hdr->sa_layout_info == 0));
+
+ /*
+	 * See if any of the already existing TOC entries can be reused.
+ */
+
+ for (idx_tab = list_head(&tb->lot_idx_tab); idx_tab;
+ idx_tab = list_next(&tb->lot_idx_tab, idx_tab)) {
+ boolean_t valid_idx = B_TRUE;
+ int i;
+
+ if (tb->lot_var_sizes != 0 &&
+ idx_tab->sa_variable_lengths != NULL) {
+ for (i = 0; i != tb->lot_var_sizes; i++) {
+ if (hdr->sa_lengths[i] !=
+ idx_tab->sa_variable_lengths[i]) {
+ valid_idx = B_FALSE;
+ break;
+ }
+ }
+ }
+ if (valid_idx) {
+ sa_idx_tab_hold(os, idx_tab);
+ return (idx_tab);
+ }
+ }
+
+ /* No such luck, create a new entry */
+ idx_tab = kmem_zalloc(sizeof (sa_idx_tab_t), KM_SLEEP);
+ idx_tab->sa_idx_tab =
+ kmem_zalloc(sizeof (uint32_t) * sa->sa_num_attrs, KM_SLEEP);
+ idx_tab->sa_layout = tb;
+ zfs_refcount_create(&idx_tab->sa_refcount);
+ if (tb->lot_var_sizes)
+ idx_tab->sa_variable_lengths = kmem_alloc(sizeof (uint16_t) *
+ tb->lot_var_sizes, KM_SLEEP);
+
+ sa_attr_iter(os, hdr, bonustype, sa_build_idx_tab,
+ tb, idx_tab);
+ sa_idx_tab_hold(os, idx_tab); /* one hold for consumer */
+ sa_idx_tab_hold(os, idx_tab); /* one for layout */
+ list_insert_tail(&tb->lot_idx_tab, idx_tab);
+ return (idx_tab);
+}
+
+void
+sa_default_locator(void **dataptr, uint32_t *len, uint32_t total_len,
+ boolean_t start, void *userdata)
+{
+ ASSERT(start);
+
+ *dataptr = userdata;
+ *len = total_len;
+}
+
+static void
+sa_attr_register_sync(sa_handle_t *hdl, dmu_tx_t *tx)
+{
+ uint64_t attr_value = 0;
+ sa_os_t *sa = hdl->sa_os->os_sa;
+ sa_attr_table_t *tb = sa->sa_attr_table;
+ int i;
+
+ mutex_enter(&sa->sa_lock);
+
+ if (!sa->sa_need_attr_registration || sa->sa_master_obj == 0) {
+ mutex_exit(&sa->sa_lock);
+ return;
+ }
+
+ if (sa->sa_reg_attr_obj == 0) {
+ sa->sa_reg_attr_obj = zap_create_link(hdl->sa_os,
+ DMU_OT_SA_ATTR_REGISTRATION,
+ sa->sa_master_obj, SA_REGISTRY, tx);
+ }
+ for (i = 0; i != sa->sa_num_attrs; i++) {
+ if (sa->sa_attr_table[i].sa_registered)
+ continue;
+ ATTR_ENCODE(attr_value, tb[i].sa_attr, tb[i].sa_length,
+ tb[i].sa_byteswap);
+ VERIFY(0 == zap_update(hdl->sa_os, sa->sa_reg_attr_obj,
+ tb[i].sa_name, 8, 1, &attr_value, tx));
+ tb[i].sa_registered = B_TRUE;
+ }
+ sa->sa_need_attr_registration = B_FALSE;
+ mutex_exit(&sa->sa_lock);
+}
+
+/*
+ * Replace all attributes with the attributes specified in the template.
+ * If the dnode had a spill buffer, then those attributes will also be
+ * replaced, possibly with just an empty spill block.
+ *
+ * This interface is intended to be used only for the bulk adding of
+ * attributes for a new file. It is also used by the ZPL when converting
+ * an old-format znode to native SA support.
+ */
+int
+sa_replace_all_by_template_locked(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc,
+ int attr_count, dmu_tx_t *tx)
+{
+ sa_os_t *sa = hdl->sa_os->os_sa;
+
+ if (sa->sa_need_attr_registration)
+ sa_attr_register_sync(hdl, tx);
+ return (sa_build_layouts(hdl, attr_desc, attr_count, tx));
+}
+
+int
+sa_replace_all_by_template(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc,
+ int attr_count, dmu_tx_t *tx)
+{
+ int error;
+
+ mutex_enter(&hdl->sa_lock);
+ error = sa_replace_all_by_template_locked(hdl, attr_desc,
+ attr_count, tx);
+ mutex_exit(&hdl->sa_lock);
+ return (error);
+}
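
As a minimal sketch of the bulk-template interface described in the comment above, the snippet below shows how a caller might fill a sa_bulk_attr_t array for a new object and pass it to sa_replace_all_by_template() inside an assigned transaction. The ZPL attribute macros (SA_ZPL_MODE, SA_ZPL_SIZE) and the zfsvfs handle are assumptions borrowed from the ZPL, not part of this patch.

    /*
     * Illustrative only: assumes ZPL attribute macros and a zfsvfs_t handle
     * from the caller; tx must already be assigned and hold the SA for
     * this object.
     */
    static int
    example_write_initial_attrs(sa_handle_t *hdl, zfsvfs_t *zfsvfs,
        uint64_t *mode, uint64_t *size, dmu_tx_t *tx)
    {
            sa_bulk_attr_t attrs[2];
            int count = 0;

            SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_MODE(zfsvfs), NULL, mode, 8);
            SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_SIZE(zfsvfs), NULL, size, 8);

            /* Registers any unregistered attrs, then rebuilds the layout. */
            return (sa_replace_all_by_template(hdl, attrs, count, tx));
    }
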
+
+/*
+ * Add/remove a single attribute or replace a variable-sized attribute value
+ * with a value of a different size, and then rewrite the entire set
+ * of attributes.
+ * Same-length attribute value replacement (including fixed-length attributes)
+ * is handled more efficiently by the upper layers.
+ */
+static int
+sa_modify_attrs(sa_handle_t *hdl, sa_attr_type_t newattr,
+ sa_data_op_t action, sa_data_locator_t *locator, void *datastart,
+ uint16_t buflen, dmu_tx_t *tx)
+{
+ sa_os_t *sa = hdl->sa_os->os_sa;
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)hdl->sa_bonus;
+ dnode_t *dn;
+ sa_bulk_attr_t *attr_desc;
+ void *old_data[2];
+ int bonus_attr_count = 0;
+ int bonus_data_size = 0;
+ int spill_data_size = 0;
+ int spill_attr_count = 0;
+ int error;
+ uint16_t length, reg_length;
+ int i, j, k, length_idx;
+ sa_hdr_phys_t *hdr;
+ sa_idx_tab_t *idx_tab;
+ int attr_count;
+ int count;
+
+ ASSERT(MUTEX_HELD(&hdl->sa_lock));
+
+	/* First make a copy of the old data */
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ if (dn->dn_bonuslen != 0) {
+ bonus_data_size = hdl->sa_bonus->db_size;
+ old_data[0] = kmem_alloc(bonus_data_size, KM_SLEEP);
+ bcopy(hdl->sa_bonus->db_data, old_data[0],
+ hdl->sa_bonus->db_size);
+ bonus_attr_count = hdl->sa_bonus_tab->sa_layout->lot_attr_count;
+ } else {
+ old_data[0] = NULL;
+ }
+ DB_DNODE_EXIT(db);
+
+ /* Bring spill buffer online if it isn't currently */
+
+ if ((error = sa_get_spill(hdl)) == 0) {
+ spill_data_size = hdl->sa_spill->db_size;
+ old_data[1] = vmem_alloc(spill_data_size, KM_SLEEP);
+ bcopy(hdl->sa_spill->db_data, old_data[1],
+ hdl->sa_spill->db_size);
+ spill_attr_count =
+ hdl->sa_spill_tab->sa_layout->lot_attr_count;
+ } else if (error && error != ENOENT) {
+ if (old_data[0])
+ kmem_free(old_data[0], bonus_data_size);
+ return (error);
+ } else {
+ old_data[1] = NULL;
+ }
+
+ /* build descriptor of all attributes */
+
+ attr_count = bonus_attr_count + spill_attr_count;
+ if (action == SA_ADD)
+ attr_count++;
+ else if (action == SA_REMOVE)
+ attr_count--;
+
+ attr_desc = kmem_zalloc(sizeof (sa_bulk_attr_t) * attr_count, KM_SLEEP);
+
+ /*
+	 * Loop through the bonus buffer and the spill buffer (if it exists),
+	 * and build up a new attribute descriptor to reset the attributes.
+ */
+ k = j = 0;
+ count = bonus_attr_count;
+ hdr = SA_GET_HDR(hdl, SA_BONUS);
+ idx_tab = SA_IDX_TAB_GET(hdl, SA_BONUS);
+ for (; k != 2; k++) {
+ /*
+ * Iterate over each attribute in layout. Fetch the
+ * size of variable-length attributes needing rewrite
+ * from sa_lengths[].
+ */
+ for (i = 0, length_idx = 0; i != count; i++) {
+ sa_attr_type_t attr;
+
+ attr = idx_tab->sa_layout->lot_attrs[i];
+ reg_length = SA_REGISTERED_LEN(sa, attr);
+ if (reg_length == 0) {
+ length = hdr->sa_lengths[length_idx];
+ length_idx++;
+ } else {
+ length = reg_length;
+ }
+ if (attr == newattr) {
+ /*
+ * There is nothing to do for SA_REMOVE,
+ * so it is just skipped.
+ */
+ if (action == SA_REMOVE)
+ continue;
+
+ /*
+ * Duplicate attributes are not allowed, so the
+ * action can not be SA_ADD here.
+ */
+ ASSERT3S(action, ==, SA_REPLACE);
+
+ /*
+ * Only a variable-sized attribute can be
+ * replaced here, and its size must be changing.
+ */
+ ASSERT3U(reg_length, ==, 0);
+ ASSERT3U(length, !=, buflen);
+ SA_ADD_BULK_ATTR(attr_desc, j, attr,
+ locator, datastart, buflen);
+ } else {
+ SA_ADD_BULK_ATTR(attr_desc, j, attr,
+ NULL, (void *)
+ (TOC_OFF(idx_tab->sa_idx_tab[attr]) +
+ (uintptr_t)old_data[k]), length);
+ }
+ }
+ if (k == 0 && hdl->sa_spill) {
+ hdr = SA_GET_HDR(hdl, SA_SPILL);
+ idx_tab = SA_IDX_TAB_GET(hdl, SA_SPILL);
+ count = spill_attr_count;
+ } else {
+ break;
+ }
+ }
+ if (action == SA_ADD) {
+ reg_length = SA_REGISTERED_LEN(sa, newattr);
+ IMPLY(reg_length != 0, reg_length == buflen);
+ SA_ADD_BULK_ATTR(attr_desc, j, newattr, locator,
+ datastart, buflen);
+ }
+ ASSERT3U(j, ==, attr_count);
+
+ error = sa_build_layouts(hdl, attr_desc, attr_count, tx);
+
+ if (old_data[0])
+ kmem_free(old_data[0], bonus_data_size);
+ if (old_data[1])
+ vmem_free(old_data[1], spill_data_size);
+ kmem_free(attr_desc, sizeof (sa_bulk_attr_t) * attr_count);
+
+ return (error);
+}
+
+static int
+sa_bulk_update_impl(sa_handle_t *hdl, sa_bulk_attr_t *bulk, int count,
+ dmu_tx_t *tx)
+{
+ int error;
+ sa_os_t *sa = hdl->sa_os->os_sa;
+ dmu_object_type_t bonustype;
+ dmu_buf_t *saved_spill;
+
+ ASSERT(hdl);
+ ASSERT(MUTEX_HELD(&hdl->sa_lock));
+
+ bonustype = SA_BONUSTYPE_FROM_DB(SA_GET_DB(hdl, SA_BONUS));
+ saved_spill = hdl->sa_spill;
+
+ /* sync out registration table if necessary */
+ if (sa->sa_need_attr_registration)
+ sa_attr_register_sync(hdl, tx);
+
+ error = sa_attr_op(hdl, bulk, count, SA_UPDATE, tx);
+ if (error == 0 && !IS_SA_BONUSTYPE(bonustype) && sa->sa_update_cb)
+ sa->sa_update_cb(hdl, tx);
+
+ /*
+	 * If saved_spill is NULL and the current sa_spill is not NULL, that
+ * means we increased the refcount of the spill buffer through
+ * sa_get_spill() or dmu_spill_hold_by_dnode(). Therefore we
+ * must release the hold before calling dmu_tx_commit() to avoid
+ * making a copy of this buffer in dbuf_sync_leaf() due to the
+ * reference count now being greater than 1.
+ */
+ if (!saved_spill && hdl->sa_spill) {
+ if (hdl->sa_spill_tab) {
+ sa_idx_tab_rele(hdl->sa_os, hdl->sa_spill_tab);
+ hdl->sa_spill_tab = NULL;
+ }
+
+ dmu_buf_rele(hdl->sa_spill, NULL);
+ hdl->sa_spill = NULL;
+ }
+
+ return (error);
+}
+
+/*
+ * update or add new attribute
+ */
+int
+sa_update(sa_handle_t *hdl, sa_attr_type_t type,
+ void *buf, uint32_t buflen, dmu_tx_t *tx)
+{
+ int error;
+ sa_bulk_attr_t bulk;
+
+ VERIFY3U(buflen, <=, SA_ATTR_MAX_LEN);
+
+ bulk.sa_attr = type;
+ bulk.sa_data_func = NULL;
+ bulk.sa_length = buflen;
+ bulk.sa_data = buf;
+
+ mutex_enter(&hdl->sa_lock);
+ error = sa_bulk_update_impl(hdl, &bulk, 1, tx);
+ mutex_exit(&hdl->sa_lock);
+ return (error);
+}
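
A minimal usage sketch for sa_update(): the caller is expected to have assigned a transaction that holds the SA for this object (via dmu_tx_hold_sa()) before updating a fixed-size attribute. The SA_ZPL_ATIME macro and zfsvfs handle are again assumptions for illustration.

    static int
    example_set_atime(sa_handle_t *hdl, zfsvfs_t *zfsvfs, uint64_t atime[2],
        dmu_tx_t *tx)
    {
            /* tx must already hold the SA for this object. */
            return (sa_update(hdl, SA_ZPL_ATIME(zfsvfs), atime,
                sizeof (uint64_t) * 2, tx));
    }
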
+
+/*
+ * Return size of an attribute
+ */
+
+int
+sa_size(sa_handle_t *hdl, sa_attr_type_t attr, int *size)
+{
+ sa_bulk_attr_t bulk;
+ int error;
+
+ bulk.sa_data = NULL;
+ bulk.sa_attr = attr;
+ bulk.sa_data_func = NULL;
+
+ ASSERT(hdl);
+ mutex_enter(&hdl->sa_lock);
+ if ((error = sa_attr_op(hdl, &bulk, 1, SA_LOOKUP, NULL)) != 0) {
+ mutex_exit(&hdl->sa_lock);
+ return (error);
+ }
+ *size = bulk.sa_size;
+
+ mutex_exit(&hdl->sa_lock);
+ return (0);
+}
+
+int
+sa_bulk_lookup_locked(sa_handle_t *hdl, sa_bulk_attr_t *attrs, int count)
+{
+ ASSERT(hdl);
+ ASSERT(MUTEX_HELD(&hdl->sa_lock));
+ return (sa_lookup_impl(hdl, attrs, count));
+}
+
+int
+sa_bulk_lookup(sa_handle_t *hdl, sa_bulk_attr_t *attrs, int count)
+{
+ int error;
+
+ ASSERT(hdl);
+ mutex_enter(&hdl->sa_lock);
+ error = sa_bulk_lookup_locked(hdl, attrs, count);
+ mutex_exit(&hdl->sa_lock);
+ return (error);
+}
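
The bulk lookup path fetches several attributes under a single handle lock. A hedged sketch, once more assuming ZPL-style attribute macros:

    static int
    example_bulk_lookup(sa_handle_t *hdl, zfsvfs_t *zfsvfs,
        uint64_t *mode, uint64_t *size, uint64_t *links)
    {
            sa_bulk_attr_t bulk[3];
            int count = 0;

            SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, mode, 8);
            SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, size, 8);
            SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, links, 8);

            /* Takes and drops hdl->sa_lock internally. */
            return (sa_bulk_lookup(hdl, bulk, count));
    }
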
+
+int
+sa_bulk_update(sa_handle_t *hdl, sa_bulk_attr_t *attrs, int count, dmu_tx_t *tx)
+{
+ int error;
+
+ ASSERT(hdl);
+ mutex_enter(&hdl->sa_lock);
+ error = sa_bulk_update_impl(hdl, attrs, count, tx);
+ mutex_exit(&hdl->sa_lock);
+ return (error);
+}
+
+int
+sa_remove(sa_handle_t *hdl, sa_attr_type_t attr, dmu_tx_t *tx)
+{
+ int error;
+
+ mutex_enter(&hdl->sa_lock);
+ error = sa_modify_attrs(hdl, attr, SA_REMOVE, NULL,
+ NULL, 0, tx);
+ mutex_exit(&hdl->sa_lock);
+ return (error);
+}
+
+void
+sa_object_info(sa_handle_t *hdl, dmu_object_info_t *doi)
+{
+ dmu_object_info_from_db(hdl->sa_bonus, doi);
+}
+
+void
+sa_object_size(sa_handle_t *hdl, uint32_t *blksize, u_longlong_t *nblocks)
+{
+ dmu_object_size_from_db(hdl->sa_bonus,
+ blksize, nblocks);
+}
+
+void
+sa_set_userp(sa_handle_t *hdl, void *ptr)
+{
+ hdl->sa_userp = ptr;
+}
+
+dmu_buf_t *
+sa_get_db(sa_handle_t *hdl)
+{
+ return (hdl->sa_bonus);
+}
+
+void *
+sa_get_userdata(sa_handle_t *hdl)
+{
+ return (hdl->sa_userp);
+}
+
+void
+sa_register_update_callback_locked(objset_t *os, sa_update_cb_t *func)
+{
+ ASSERT(MUTEX_HELD(&os->os_sa->sa_lock));
+ os->os_sa->sa_update_cb = func;
+}
+
+void
+sa_register_update_callback(objset_t *os, sa_update_cb_t *func)
+{
+
+ mutex_enter(&os->os_sa->sa_lock);
+ sa_register_update_callback_locked(os, func);
+ mutex_exit(&os->os_sa->sa_lock);
+}
+
+uint64_t
+sa_handle_object(sa_handle_t *hdl)
+{
+ return (hdl->sa_bonus->db_object);
+}
+
+boolean_t
+sa_enabled(objset_t *os)
+{
+ return (os->os_sa == NULL);
+}
+
+int
+sa_set_sa_object(objset_t *os, uint64_t sa_object)
+{
+ sa_os_t *sa = os->os_sa;
+
+ if (sa->sa_master_obj)
+ return (1);
+
+ sa->sa_master_obj = sa_object;
+
+ return (0);
+}
+
+int
+sa_hdrsize(void *arg)
+{
+ sa_hdr_phys_t *hdr = arg;
+
+ return (SA_HDR_SIZE(hdr));
+}
+
+void
+sa_handle_lock(sa_handle_t *hdl)
+{
+ ASSERT(hdl);
+ mutex_enter(&hdl->sa_lock);
+}
+
+void
+sa_handle_unlock(sa_handle_t *hdl)
+{
+ ASSERT(hdl);
+ mutex_exit(&hdl->sa_lock);
+}
+
+#ifdef _KERNEL
+EXPORT_SYMBOL(sa_handle_get);
+EXPORT_SYMBOL(sa_handle_get_from_db);
+EXPORT_SYMBOL(sa_handle_destroy);
+EXPORT_SYMBOL(sa_buf_hold);
+EXPORT_SYMBOL(sa_buf_rele);
+EXPORT_SYMBOL(sa_spill_rele);
+EXPORT_SYMBOL(sa_lookup);
+EXPORT_SYMBOL(sa_update);
+EXPORT_SYMBOL(sa_remove);
+EXPORT_SYMBOL(sa_bulk_lookup);
+EXPORT_SYMBOL(sa_bulk_lookup_locked);
+EXPORT_SYMBOL(sa_bulk_update);
+EXPORT_SYMBOL(sa_size);
+EXPORT_SYMBOL(sa_object_info);
+EXPORT_SYMBOL(sa_object_size);
+EXPORT_SYMBOL(sa_get_userdata);
+EXPORT_SYMBOL(sa_set_userp);
+EXPORT_SYMBOL(sa_get_db);
+EXPORT_SYMBOL(sa_handle_object);
+EXPORT_SYMBOL(sa_register_update_callback);
+EXPORT_SYMBOL(sa_setup);
+EXPORT_SYMBOL(sa_replace_all_by_template);
+EXPORT_SYMBOL(sa_replace_all_by_template_locked);
+EXPORT_SYMBOL(sa_enabled);
+EXPORT_SYMBOL(sa_cache_init);
+EXPORT_SYMBOL(sa_cache_fini);
+EXPORT_SYMBOL(sa_set_sa_object);
+EXPORT_SYMBOL(sa_hdrsize);
+EXPORT_SYMBOL(sa_handle_lock);
+EXPORT_SYMBOL(sa_handle_unlock);
+EXPORT_SYMBOL(sa_lookup_uio);
+EXPORT_SYMBOL(sa_add_projid);
+#endif /* _KERNEL */
diff --git a/sys/contrib/openzfs/module/zfs/sha256.c b/sys/contrib/openzfs/module/zfs/sha256.c
new file mode 100644
index 000000000000..d297768eada5
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/sha256.c
@@ -0,0 +1,105 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * Copyright 2013 Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2016 by Delphix. All rights reserved.
+ */
+#include <sys/zfs_context.h>
+#include <sys/zio.h>
+#include <sys/zio_checksum.h>
+#include <sys/sha2.h>
+#include <sys/abd.h>
+#include <sys/qat.h>
+
+static int
+sha_incremental(void *buf, size_t size, void *arg)
+{
+ SHA2_CTX *ctx = arg;
+ SHA2Update(ctx, buf, size);
+ return (0);
+}
+
+/*ARGSUSED*/
+void
+abd_checksum_SHA256(abd_t *abd, uint64_t size,
+ const void *ctx_template, zio_cksum_t *zcp)
+{
+ int ret;
+ SHA2_CTX ctx;
+ zio_cksum_t tmp;
+
+ if (qat_checksum_use_accel(size)) {
+ uint8_t *buf = abd_borrow_buf_copy(abd, size);
+ ret = qat_checksum(ZIO_CHECKSUM_SHA256, buf, size, &tmp);
+ abd_return_buf(abd, buf, size);
+ if (ret == CPA_STATUS_SUCCESS)
+ goto bswap;
+
+		/* If the hardware implementation fails, fall back to software */
+ }
+
+ SHA2Init(SHA256, &ctx);
+ (void) abd_iterate_func(abd, 0, size, sha_incremental, &ctx);
+ SHA2Final(&tmp, &ctx);
+
+bswap:
+ /*
+	 * A prior implementation of this function used a private SHA256
+	 * implementation that always wrote things out in big endian, and
+	 * there was no byteswap variant of it. To preserve on-disk
+	 * compatibility we need to force that behavior.
+ */
+ zcp->zc_word[0] = BE_64(tmp.zc_word[0]);
+ zcp->zc_word[1] = BE_64(tmp.zc_word[1]);
+ zcp->zc_word[2] = BE_64(tmp.zc_word[2]);
+ zcp->zc_word[3] = BE_64(tmp.zc_word[3]);
+}
+
+/*ARGSUSED*/
+void
+abd_checksum_SHA512_native(abd_t *abd, uint64_t size,
+ const void *ctx_template, zio_cksum_t *zcp)
+{
+ SHA2_CTX ctx;
+
+ SHA2Init(SHA512_256, &ctx);
+ (void) abd_iterate_func(abd, 0, size, sha_incremental, &ctx);
+ SHA2Final(zcp, &ctx);
+}
+
+/*ARGSUSED*/
+void
+abd_checksum_SHA512_byteswap(abd_t *abd, uint64_t size,
+ const void *ctx_template, zio_cksum_t *zcp)
+{
+ zio_cksum_t tmp;
+
+ abd_checksum_SHA512_native(abd, size, ctx_template, &tmp);
+ zcp->zc_word[0] = BSWAP_64(tmp.zc_word[0]);
+ zcp->zc_word[1] = BSWAP_64(tmp.zc_word[1]);
+ zcp->zc_word[2] = BSWAP_64(tmp.zc_word[2]);
+ zcp->zc_word[3] = BSWAP_64(tmp.zc_word[3]);
+}
diff --git a/sys/contrib/openzfs/module/zfs/skein_zfs.c b/sys/contrib/openzfs/module/zfs/skein_zfs.c
new file mode 100644
index 000000000000..11b9940e027e
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/skein_zfs.c
@@ -0,0 +1,102 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://opensource.org/licenses/CDDL-1.0.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2013 Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2016 by Delphix. All rights reserved.
+ */
+#include <sys/zfs_context.h>
+#include <sys/zio.h>
+#include <sys/zio_checksum.h>
+#include <sys/skein.h>
+
+#include <sys/abd.h>
+
+static int
+skein_incremental(void *buf, size_t size, void *arg)
+{
+ Skein_512_Ctxt_t *ctx = arg;
+ (void) Skein_512_Update(ctx, buf, size);
+ return (0);
+}
+/*
+ * Computes a native 256-bit skein MAC checksum. Please note that this
+ * function requires the presence of a ctx_template that should be allocated
+ * using abd_checksum_skein_tmpl_init.
+ */
+/*ARGSUSED*/
+void
+abd_checksum_skein_native(abd_t *abd, uint64_t size,
+ const void *ctx_template, zio_cksum_t *zcp)
+{
+ Skein_512_Ctxt_t ctx;
+
+ ASSERT(ctx_template != NULL);
+ bcopy(ctx_template, &ctx, sizeof (ctx));
+ (void) abd_iterate_func(abd, 0, size, skein_incremental, &ctx);
+ (void) Skein_512_Final(&ctx, (uint8_t *)zcp);
+ bzero(&ctx, sizeof (ctx));
+}
+
+/*
+ * Byteswapped version of abd_checksum_skein_native. This just invokes
+ * the native checksum function and byteswaps the resulting checksum (since
+ * skein is internally endian-insensitive).
+ */
+void
+abd_checksum_skein_byteswap(abd_t *abd, uint64_t size,
+ const void *ctx_template, zio_cksum_t *zcp)
+{
+ zio_cksum_t tmp;
+
+ abd_checksum_skein_native(abd, size, ctx_template, &tmp);
+ zcp->zc_word[0] = BSWAP_64(tmp.zc_word[0]);
+ zcp->zc_word[1] = BSWAP_64(tmp.zc_word[1]);
+ zcp->zc_word[2] = BSWAP_64(tmp.zc_word[2]);
+ zcp->zc_word[3] = BSWAP_64(tmp.zc_word[3]);
+}
+
+/*
+ * Allocates a skein MAC template suitable for use in skein MAC checksum
+ * computations and returns a pointer to it.
+ */
+void *
+abd_checksum_skein_tmpl_init(const zio_cksum_salt_t *salt)
+{
+ Skein_512_Ctxt_t *ctx;
+
+ ctx = kmem_zalloc(sizeof (*ctx), KM_SLEEP);
+ (void) Skein_512_InitExt(ctx, sizeof (zio_cksum_t) * 8, 0,
+ salt->zcs_bytes, sizeof (salt->zcs_bytes));
+ return (ctx);
+}
+
+/*
+ * Frees a skein context template previously allocated using
+ * abd_checksum_skein_tmpl_init.
+ */
+void
+abd_checksum_skein_tmpl_free(void *ctx_template)
+{
+ Skein_512_Ctxt_t *ctx = ctx_template;
+
+ bzero(ctx, sizeof (*ctx));
+ kmem_free(ctx, sizeof (*ctx));
+}
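
To make the template contract concrete, here is a minimal sketch of the expected lifecycle using only the functions defined in this file: initialize a template from the pool's checksum salt, pass it as ctx_template to the native checksum routine, and free it when done. The salt and abd arguments are assumed to be supplied by the caller.

    static void
    example_skein_checksum(const zio_cksum_salt_t *salt, abd_t *abd,
        uint64_t size, zio_cksum_t *zcp)
    {
            void *tmpl = abd_checksum_skein_tmpl_init(salt);

            abd_checksum_skein_native(abd, size, tmpl, zcp);
            abd_checksum_skein_tmpl_free(tmpl);
    }
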
diff --git a/sys/contrib/openzfs/module/zfs/spa.c b/sys/contrib/openzfs/module/zfs/spa.c
new file mode 100644
index 000000000000..5170c9ca226f
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/spa.c
@@ -0,0 +1,9885 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2020 by Delphix. All rights reserved.
+ * Copyright (c) 2018, Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright 2013 Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
+ * Copyright 2016 Toomas Soome <tsoome@me.com>
+ * Copyright (c) 2016 Actifio, Inc. All rights reserved.
+ * Copyright 2018 Joyent, Inc.
+ * Copyright (c) 2017, 2019, Datto Inc. All rights reserved.
+ * Copyright 2017 Joyent, Inc.
+ * Copyright (c) 2017, Intel Corporation.
+ * Copyright (c) 2021, Colm Buckley <colm@tuatha.org>
+ */
+
+/*
+ * SPA: Storage Pool Allocator
+ *
+ * This file contains all the routines used when modifying on-disk SPA state.
+ * This includes opening, importing, destroying, exporting a pool, and syncing a
+ * pool.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/fm/fs/zfs.h>
+#include <sys/spa_impl.h>
+#include <sys/zio.h>
+#include <sys/zio_checksum.h>
+#include <sys/dmu.h>
+#include <sys/dmu_tx.h>
+#include <sys/zap.h>
+#include <sys/zil.h>
+#include <sys/ddt.h>
+#include <sys/vdev_impl.h>
+#include <sys/vdev_removal.h>
+#include <sys/vdev_indirect_mapping.h>
+#include <sys/vdev_indirect_births.h>
+#include <sys/vdev_initialize.h>
+#include <sys/vdev_rebuild.h>
+#include <sys/vdev_trim.h>
+#include <sys/vdev_disk.h>
+#include <sys/vdev_draid.h>
+#include <sys/metaslab.h>
+#include <sys/metaslab_impl.h>
+#include <sys/mmp.h>
+#include <sys/uberblock_impl.h>
+#include <sys/txg.h>
+#include <sys/avl.h>
+#include <sys/bpobj.h>
+#include <sys/dmu_traverse.h>
+#include <sys/dmu_objset.h>
+#include <sys/unique.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_prop.h>
+#include <sys/dsl_synctask.h>
+#include <sys/fs/zfs.h>
+#include <sys/arc.h>
+#include <sys/callb.h>
+#include <sys/systeminfo.h>
+#include <sys/spa_boot.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/dsl_scan.h>
+#include <sys/zfeature.h>
+#include <sys/dsl_destroy.h>
+#include <sys/zvol.h>
+
+#ifdef _KERNEL
+#include <sys/fm/protocol.h>
+#include <sys/fm/util.h>
+#include <sys/callb.h>
+#include <sys/zone.h>
+#include <sys/vmsystm.h>
+#endif /* _KERNEL */
+
+#include "zfs_prop.h"
+#include "zfs_comutil.h"
+
+/*
+ * The interval, in seconds, at which failed configuration cache file writes
+ * should be retried.
+ */
+int zfs_ccw_retry_interval = 300;
+
+typedef enum zti_modes {
+ ZTI_MODE_FIXED, /* value is # of threads (min 1) */
+ ZTI_MODE_BATCH, /* cpu-intensive; value is ignored */
+ ZTI_MODE_NULL, /* don't create a taskq */
+ ZTI_NMODES
+} zti_modes_t;
+
+#define ZTI_P(n, q) { ZTI_MODE_FIXED, (n), (q) }
+#define ZTI_PCT(n) { ZTI_MODE_ONLINE_PERCENT, (n), 1 }
+#define ZTI_BATCH { ZTI_MODE_BATCH, 0, 1 }
+#define ZTI_NULL { ZTI_MODE_NULL, 0, 0 }
+
+#define ZTI_N(n) ZTI_P(n, 1)
+#define ZTI_ONE ZTI_N(1)
+
+typedef struct zio_taskq_info {
+ zti_modes_t zti_mode;
+ uint_t zti_value;
+ uint_t zti_count;
+} zio_taskq_info_t;
+
+static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
+ "iss", "iss_h", "int", "int_h"
+};
+
+/*
+ * This table defines the taskq settings for each ZFS I/O type. When
+ * initializing a pool, we use this table to create an appropriately sized
+ * taskq. Some operations are low volume and therefore have a small, static
+ * number of threads assigned to their taskqs using the ZTI_N(#) or ZTI_ONE
+ * macros. Other operations process a large amount of data; the ZTI_BATCH
+ * macro causes us to create a taskq oriented for throughput. Some operations
+ * are so high frequency and short-lived that the taskq itself can become a
+ * point of lock contention. The ZTI_P(#, #) macro indicates that we need an
+ * additional degree of parallelism specified by the number of threads per-
+ * taskq and the number of taskqs; when dispatching an event in this case, the
+ * particular taskq is chosen at random.
+ *
+ * The different taskq priorities are to handle the different contexts (issue
+ * and interrupt) and then to reserve threads for ZIO_PRIORITY_NOW I/Os that
+ * need to be handled with minimum delay.
+ */
+const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
+ /* ISSUE ISSUE_HIGH INTR INTR_HIGH */
+ { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* NULL */
+ { ZTI_N(8), ZTI_NULL, ZTI_P(12, 8), ZTI_NULL }, /* READ */
+ { ZTI_BATCH, ZTI_N(5), ZTI_P(12, 8), ZTI_N(5) }, /* WRITE */
+ { ZTI_P(12, 8), ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* FREE */
+ { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* CLAIM */
+ { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* IOCTL */
+ { ZTI_N(4), ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* TRIM */
+};
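
To make the table easier to read, here is one row worked through the macros above; this annotation is illustrative and not part of the patch.

    /*
     * Worked example for the READ row:
     *   ISSUE: ZTI_N(8)     -> { ZTI_MODE_FIXED, 8, 1 }: one taskq, 8 threads
     *   INTR:  ZTI_P(12, 8) -> { ZTI_MODE_FIXED, 12, 8 }: 8 taskqs of 12
     *                          threads each, one chosen at random per dispatch
     */
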
+
+static void spa_sync_version(void *arg, dmu_tx_t *tx);
+static void spa_sync_props(void *arg, dmu_tx_t *tx);
+static boolean_t spa_has_active_shared_spare(spa_t *spa);
+static int spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport);
+static void spa_vdev_resilver_done(spa_t *spa);
+
+uint_t zio_taskq_batch_pct = 75; /* 1 thread per cpu in pset */
+boolean_t zio_taskq_sysdc = B_TRUE; /* use SDC scheduling class */
+uint_t zio_taskq_basedc = 80; /* base duty cycle */
+
+boolean_t spa_create_process = B_TRUE; /* no process ==> no sysdc */
+
+/*
+ * Report any spa_load_verify errors found, but do not fail spa_load.
+ * This is used by zdb to analyze non-idle pools.
+ */
+boolean_t spa_load_verify_dryrun = B_FALSE;
+
+/*
+ * This (illegal) pool name is used when temporarily importing a spa_t in order
+ * to get the vdev stats associated with the imported devices.
+ */
+#define TRYIMPORT_NAME "$import"
+
+/*
+ * For debugging purposes: print out vdev tree during pool import.
+ */
+int spa_load_print_vdev_tree = B_FALSE;
+
+/*
+ * A non-zero value for zfs_max_missing_tvds means that we allow importing
+ * pools with missing top-level vdevs. This is strictly intended for advanced
+ * pool recovery cases since missing data is almost inevitable. Pools with
+ * missing devices can only be imported read-only for safety reasons, and their
+ * fail-mode will be automatically set to "continue".
+ *
+ * With 1 missing vdev we should be able to import the pool and mount all
+ * datasets. User data that was not modified after the missing device has been
+ * added should be recoverable. This means that snapshots created prior to the
+ * addition of that device should be completely intact.
+ *
+ * With 2 missing vdevs, some datasets may fail to mount since there are
+ * dataset statistics that are stored as regular metadata. Some data might be
+ * recoverable if those vdevs were added recently.
+ *
+ * With 3 or more missing vdevs, the pool is severely damaged and MOS entries
+ * may be missing entirely. Chances of data recovery are very low. Note that
+ * there are also risks of performing an inadvertent rewind as we might be
+ * missing all the vdevs with the latest uberblocks.
+ */
+unsigned long zfs_max_missing_tvds = 0;
+
+/*
+ * The parameters below are similar to zfs_max_missing_tvds but are only
+ * intended for a preliminary open of the pool with an untrusted config which
+ * might be incomplete or out-dated.
+ *
+ * We are more tolerant for pools opened from a cachefile since we could have
+ * an out-dated cachefile where a device removal was not registered.
+ * We could have set the limit arbitrarily high but in the case where devices
+ * are really missing we would want to return the proper error codes; we chose
+ * SPA_DVAS_PER_BP - 1 so that some copies of the MOS would still be available
+ * and we get a chance to retrieve the trusted config.
+ */
+uint64_t zfs_max_missing_tvds_cachefile = SPA_DVAS_PER_BP - 1;
+
+/*
+ * In the case where config was assembled by scanning device paths (/dev/dsks
+ * by default) we are less tolerant since all the existing devices should have
+ * been detected and we want spa_load to return the right error codes.
+ */
+uint64_t zfs_max_missing_tvds_scan = 0;
+
+/*
+ * Debugging aid that pauses spa_sync() towards the end.
+ */
+boolean_t zfs_pause_spa_sync = B_FALSE;
+
+/*
+ * Variables to indicate that the livelist condense zthr function should wait
+ * at certain points for the livelist to be removed; used to test
+ * condense/destroy races.
+ */
+int zfs_livelist_condense_zthr_pause = 0;
+int zfs_livelist_condense_sync_pause = 0;
+
+/*
+ * Variables to track whether or not condense cancellation has been
+ * triggered in testing.
+ */
+int zfs_livelist_condense_sync_cancel = 0;
+int zfs_livelist_condense_zthr_cancel = 0;
+
+/*
+ * Variable to track whether or not extra ALLOC blkptrs were added to a
+ * livelist entry while it was being condensed (caused by the way we track
+ * remapped blkptrs in dbuf_remap_impl)
+ */
+int zfs_livelist_condense_new_alloc = 0;
+
+/*
+ * ==========================================================================
+ * SPA properties routines
+ * ==========================================================================
+ */
+
+/*
+ * Add a (source=src, propname=propval) list to an nvlist.
+ */
+static void
+spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval,
+ uint64_t intval, zprop_source_t src)
+{
+ const char *propname = zpool_prop_to_name(prop);
+ nvlist_t *propval;
+
+ VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+ VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0);
+
+ if (strval != NULL)
+ VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0);
+ else
+ VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, intval) == 0);
+
+ VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0);
+ nvlist_free(propval);
+}
+
+/*
+ * Get property values from the spa configuration.
+ */
+static void
+spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
+{
+ vdev_t *rvd = spa->spa_root_vdev;
+ dsl_pool_t *pool = spa->spa_dsl_pool;
+ uint64_t size, alloc, cap, version;
+ const zprop_source_t src = ZPROP_SRC_NONE;
+ spa_config_dirent_t *dp;
+ metaslab_class_t *mc = spa_normal_class(spa);
+
+ ASSERT(MUTEX_HELD(&spa->spa_props_lock));
+
+ if (rvd != NULL) {
+ alloc = metaslab_class_get_alloc(mc);
+ alloc += metaslab_class_get_alloc(spa_special_class(spa));
+ alloc += metaslab_class_get_alloc(spa_dedup_class(spa));
+ alloc += metaslab_class_get_alloc(spa_embedded_log_class(spa));
+
+ size = metaslab_class_get_space(mc);
+ size += metaslab_class_get_space(spa_special_class(spa));
+ size += metaslab_class_get_space(spa_dedup_class(spa));
+ size += metaslab_class_get_space(spa_embedded_log_class(spa));
+
+ spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src);
+ spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src);
+ spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src);
+ spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL,
+ size - alloc, src);
+ spa_prop_add_list(*nvp, ZPOOL_PROP_CHECKPOINT, NULL,
+ spa->spa_checkpoint_info.sci_dspace, src);
+
+ spa_prop_add_list(*nvp, ZPOOL_PROP_FRAGMENTATION, NULL,
+ metaslab_class_fragmentation(mc), src);
+ spa_prop_add_list(*nvp, ZPOOL_PROP_EXPANDSZ, NULL,
+ metaslab_class_expandable_space(mc), src);
+ spa_prop_add_list(*nvp, ZPOOL_PROP_READONLY, NULL,
+ (spa_mode(spa) == SPA_MODE_READ), src);
+
+ cap = (size == 0) ? 0 : (alloc * 100 / size);
+ spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src);
+
+ spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUPRATIO, NULL,
+ ddt_get_pool_dedup_ratio(spa), src);
+
+ spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL,
+ rvd->vdev_state, src);
+
+ version = spa_version(spa);
+ if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION)) {
+ spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL,
+ version, ZPROP_SRC_DEFAULT);
+ } else {
+ spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL,
+ version, ZPROP_SRC_LOCAL);
+ }
+ spa_prop_add_list(*nvp, ZPOOL_PROP_LOAD_GUID,
+ NULL, spa_load_guid(spa), src);
+ }
+
+ if (pool != NULL) {
+ /*
+		 * The $FREE directory was introduced in SPA_VERSION_DEADLISTS;
+		 * when opening pools created before this version, freedir will
+		 * be NULL.
+ */
+ if (pool->dp_free_dir != NULL) {
+ spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, NULL,
+ dsl_dir_phys(pool->dp_free_dir)->dd_used_bytes,
+ src);
+ } else {
+ spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING,
+ NULL, 0, src);
+ }
+
+ if (pool->dp_leak_dir != NULL) {
+ spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED, NULL,
+ dsl_dir_phys(pool->dp_leak_dir)->dd_used_bytes,
+ src);
+ } else {
+ spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED,
+ NULL, 0, src);
+ }
+ }
+
+ spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src);
+
+ if (spa->spa_comment != NULL) {
+ spa_prop_add_list(*nvp, ZPOOL_PROP_COMMENT, spa->spa_comment,
+ 0, ZPROP_SRC_LOCAL);
+ }
+
+ if (spa->spa_compatibility != NULL) {
+ spa_prop_add_list(*nvp, ZPOOL_PROP_COMPATIBILITY,
+ spa->spa_compatibility, 0, ZPROP_SRC_LOCAL);
+ }
+
+ if (spa->spa_root != NULL)
+ spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root,
+ 0, ZPROP_SRC_LOCAL);
+
+ if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) {
+ spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL,
+ MIN(zfs_max_recordsize, SPA_MAXBLOCKSIZE), ZPROP_SRC_NONE);
+ } else {
+ spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL,
+ SPA_OLD_MAXBLOCKSIZE, ZPROP_SRC_NONE);
+ }
+
+ if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_DNODE)) {
+ spa_prop_add_list(*nvp, ZPOOL_PROP_MAXDNODESIZE, NULL,
+ DNODE_MAX_SIZE, ZPROP_SRC_NONE);
+ } else {
+ spa_prop_add_list(*nvp, ZPOOL_PROP_MAXDNODESIZE, NULL,
+ DNODE_MIN_SIZE, ZPROP_SRC_NONE);
+ }
+
+ if ((dp = list_head(&spa->spa_config_list)) != NULL) {
+ if (dp->scd_path == NULL) {
+ spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
+ "none", 0, ZPROP_SRC_LOCAL);
+ } else if (strcmp(dp->scd_path, spa_config_path) != 0) {
+ spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
+ dp->scd_path, 0, ZPROP_SRC_LOCAL);
+ }
+ }
+}
+
+/*
+ * Get zpool property values.
+ */
+int
+spa_prop_get(spa_t *spa, nvlist_t **nvp)
+{
+ objset_t *mos = spa->spa_meta_objset;
+ zap_cursor_t zc;
+ zap_attribute_t za;
+ dsl_pool_t *dp;
+ int err;
+
+ err = nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP);
+ if (err)
+ return (err);
+
+ dp = spa_get_dsl(spa);
+ dsl_pool_config_enter(dp, FTAG);
+ mutex_enter(&spa->spa_props_lock);
+
+ /*
+ * Get properties from the spa config.
+ */
+ spa_prop_get_config(spa, nvp);
+
+	/* If there is no pool property object, there is nothing more to get. */
+ if (mos == NULL || spa->spa_pool_props_object == 0)
+ goto out;
+
+ /*
+ * Get properties from the MOS pool property object.
+ */
+ for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object);
+ (err = zap_cursor_retrieve(&zc, &za)) == 0;
+ zap_cursor_advance(&zc)) {
+ uint64_t intval = 0;
+ char *strval = NULL;
+ zprop_source_t src = ZPROP_SRC_DEFAULT;
+ zpool_prop_t prop;
+
+ if ((prop = zpool_name_to_prop(za.za_name)) == ZPOOL_PROP_INVAL)
+ continue;
+
+ switch (za.za_integer_length) {
+ case 8:
+ /* integer property */
+ if (za.za_first_integer !=
+ zpool_prop_default_numeric(prop))
+ src = ZPROP_SRC_LOCAL;
+
+ if (prop == ZPOOL_PROP_BOOTFS) {
+ dsl_dataset_t *ds = NULL;
+
+ err = dsl_dataset_hold_obj(dp,
+ za.za_first_integer, FTAG, &ds);
+ if (err != 0)
+ break;
+
+ strval = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN,
+ KM_SLEEP);
+ dsl_dataset_name(ds, strval);
+ dsl_dataset_rele(ds, FTAG);
+ } else {
+ strval = NULL;
+ intval = za.za_first_integer;
+ }
+
+ spa_prop_add_list(*nvp, prop, strval, intval, src);
+
+ if (strval != NULL)
+ kmem_free(strval, ZFS_MAX_DATASET_NAME_LEN);
+
+ break;
+
+ case 1:
+ /* string property */
+ strval = kmem_alloc(za.za_num_integers, KM_SLEEP);
+ err = zap_lookup(mos, spa->spa_pool_props_object,
+ za.za_name, 1, za.za_num_integers, strval);
+ if (err) {
+ kmem_free(strval, za.za_num_integers);
+ break;
+ }
+ spa_prop_add_list(*nvp, prop, strval, 0, src);
+ kmem_free(strval, za.za_num_integers);
+ break;
+
+ default:
+ break;
+ }
+ }
+ zap_cursor_fini(&zc);
+out:
+ mutex_exit(&spa->spa_props_lock);
+ dsl_pool_config_exit(dp, FTAG);
+ if (err && err != ENOENT) {
+ nvlist_free(*nvp);
+ *nvp = NULL;
+ return (err);
+ }
+
+ return (0);
+}
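
As an illustration of the getter above, a caller typically lets spa_prop_get() allocate the nvlist, looks up the nested property it needs, and frees the list. Reading the GUID property is used here purely as a hedged example.

    static int
    example_read_pool_guid(spa_t *spa, uint64_t *guidp)
    {
            nvlist_t *props = NULL;
            nvlist_t *propval;
            int err;

            if ((err = spa_prop_get(spa, &props)) != 0)
                    return (err);

            /* Each property is a nested nvlist with a ZPROP_VALUE entry. */
            err = nvlist_lookup_nvlist(props,
                zpool_prop_to_name(ZPOOL_PROP_GUID), &propval);
            if (err == 0)
                    err = nvlist_lookup_uint64(propval, ZPROP_VALUE, guidp);

            nvlist_free(props);
            return (err);
    }
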
+
+/*
+ * Validate the given pool properties nvlist and modify the list
+ * for the property values to be set.
+ */
+static int
+spa_prop_validate(spa_t *spa, nvlist_t *props)
+{
+ nvpair_t *elem;
+ int error = 0, reset_bootfs = 0;
+ uint64_t objnum = 0;
+ boolean_t has_feature = B_FALSE;
+
+ elem = NULL;
+ while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {
+ uint64_t intval;
+ char *strval, *slash, *check, *fname;
+ const char *propname = nvpair_name(elem);
+ zpool_prop_t prop = zpool_name_to_prop(propname);
+
+ switch (prop) {
+ case ZPOOL_PROP_INVAL:
+ if (!zpool_prop_feature(propname)) {
+ error = SET_ERROR(EINVAL);
+ break;
+ }
+
+ /*
+ * Sanitize the input.
+ */
+ if (nvpair_type(elem) != DATA_TYPE_UINT64) {
+ error = SET_ERROR(EINVAL);
+ break;
+ }
+
+ if (nvpair_value_uint64(elem, &intval) != 0) {
+ error = SET_ERROR(EINVAL);
+ break;
+ }
+
+ if (intval != 0) {
+ error = SET_ERROR(EINVAL);
+ break;
+ }
+
+ fname = strchr(propname, '@') + 1;
+ if (zfeature_lookup_name(fname, NULL) != 0) {
+ error = SET_ERROR(EINVAL);
+ break;
+ }
+
+ has_feature = B_TRUE;
+ break;
+
+ case ZPOOL_PROP_VERSION:
+ error = nvpair_value_uint64(elem, &intval);
+ if (!error &&
+ (intval < spa_version(spa) ||
+ intval > SPA_VERSION_BEFORE_FEATURES ||
+ has_feature))
+ error = SET_ERROR(EINVAL);
+ break;
+
+ case ZPOOL_PROP_DELEGATION:
+ case ZPOOL_PROP_AUTOREPLACE:
+ case ZPOOL_PROP_LISTSNAPS:
+ case ZPOOL_PROP_AUTOEXPAND:
+ case ZPOOL_PROP_AUTOTRIM:
+ error = nvpair_value_uint64(elem, &intval);
+ if (!error && intval > 1)
+ error = SET_ERROR(EINVAL);
+ break;
+
+ case ZPOOL_PROP_MULTIHOST:
+ error = nvpair_value_uint64(elem, &intval);
+ if (!error && intval > 1)
+ error = SET_ERROR(EINVAL);
+
+ if (!error) {
+ uint32_t hostid = zone_get_hostid(NULL);
+ if (hostid)
+ spa->spa_hostid = hostid;
+ else
+ error = SET_ERROR(ENOTSUP);
+ }
+
+ break;
+
+ case ZPOOL_PROP_BOOTFS:
+ /*
+ * If the pool version is less than SPA_VERSION_BOOTFS,
+ * or the pool is still being created (version == 0),
+ * the bootfs property cannot be set.
+ */
+ if (spa_version(spa) < SPA_VERSION_BOOTFS) {
+ error = SET_ERROR(ENOTSUP);
+ break;
+ }
+
+ /*
+ * Make sure the vdev config is bootable
+ */
+ if (!vdev_is_bootable(spa->spa_root_vdev)) {
+ error = SET_ERROR(ENOTSUP);
+ break;
+ }
+
+ reset_bootfs = 1;
+
+ error = nvpair_value_string(elem, &strval);
+
+ if (!error) {
+ objset_t *os;
+
+ if (strval == NULL || strval[0] == '\0') {
+ objnum = zpool_prop_default_numeric(
+ ZPOOL_PROP_BOOTFS);
+ break;
+ }
+
+ error = dmu_objset_hold(strval, FTAG, &os);
+ if (error != 0)
+ break;
+
+ /* Must be ZPL. */
+ if (dmu_objset_type(os) != DMU_OST_ZFS) {
+ error = SET_ERROR(ENOTSUP);
+ } else {
+ objnum = dmu_objset_id(os);
+ }
+ dmu_objset_rele(os, FTAG);
+ }
+ break;
+
+ case ZPOOL_PROP_FAILUREMODE:
+ error = nvpair_value_uint64(elem, &intval);
+ if (!error && intval > ZIO_FAILURE_MODE_PANIC)
+ error = SET_ERROR(EINVAL);
+
+ /*
+ * This is a special case which only occurs when
+ * the pool has completely failed. This allows
+ * the user to change the in-core failmode property
+ * without syncing it out to disk (I/Os might
+ * currently be blocked). We do this by returning
+ * EIO to the caller (spa_prop_set) to trick it
+ * into thinking we encountered a property validation
+ * error.
+ */
+ if (!error && spa_suspended(spa)) {
+ spa->spa_failmode = intval;
+ error = SET_ERROR(EIO);
+ }
+ break;
+
+ case ZPOOL_PROP_CACHEFILE:
+ if ((error = nvpair_value_string(elem, &strval)) != 0)
+ break;
+
+ if (strval[0] == '\0')
+ break;
+
+ if (strcmp(strval, "none") == 0)
+ break;
+
+ if (strval[0] != '/') {
+ error = SET_ERROR(EINVAL);
+ break;
+ }
+
+ slash = strrchr(strval, '/');
+ ASSERT(slash != NULL);
+
+ if (slash[1] == '\0' || strcmp(slash, "/.") == 0 ||
+ strcmp(slash, "/..") == 0)
+ error = SET_ERROR(EINVAL);
+ break;
+
+ case ZPOOL_PROP_COMMENT:
+ if ((error = nvpair_value_string(elem, &strval)) != 0)
+ break;
+ for (check = strval; *check != '\0'; check++) {
+ if (!isprint(*check)) {
+ error = SET_ERROR(EINVAL);
+ break;
+ }
+ }
+ if (strlen(strval) > ZPROP_MAX_COMMENT)
+ error = SET_ERROR(E2BIG);
+ break;
+
+ default:
+ break;
+ }
+
+ if (error)
+ break;
+ }
+
+ (void) nvlist_remove_all(props,
+ zpool_prop_to_name(ZPOOL_PROP_DEDUPDITTO));
+
+ if (!error && reset_bootfs) {
+ error = nvlist_remove(props,
+ zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING);
+
+ if (!error) {
+ error = nvlist_add_uint64(props,
+ zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum);
+ }
+ }
+
+ return (error);
+}
+
+void
+spa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync)
+{
+ char *cachefile;
+ spa_config_dirent_t *dp;
+
+ if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE),
+ &cachefile) != 0)
+ return;
+
+ dp = kmem_alloc(sizeof (spa_config_dirent_t),
+ KM_SLEEP);
+
+ if (cachefile[0] == '\0')
+ dp->scd_path = spa_strdup(spa_config_path);
+ else if (strcmp(cachefile, "none") == 0)
+ dp->scd_path = NULL;
+ else
+ dp->scd_path = spa_strdup(cachefile);
+
+ list_insert_head(&spa->spa_config_list, dp);
+ if (need_sync)
+ spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
+}
+
+int
+spa_prop_set(spa_t *spa, nvlist_t *nvp)
+{
+ int error;
+ nvpair_t *elem = NULL;
+ boolean_t need_sync = B_FALSE;
+
+ if ((error = spa_prop_validate(spa, nvp)) != 0)
+ return (error);
+
+ while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) {
+ zpool_prop_t prop = zpool_name_to_prop(nvpair_name(elem));
+
+ if (prop == ZPOOL_PROP_CACHEFILE ||
+ prop == ZPOOL_PROP_ALTROOT ||
+ prop == ZPOOL_PROP_READONLY)
+ continue;
+
+ if (prop == ZPOOL_PROP_VERSION || prop == ZPOOL_PROP_INVAL) {
+ uint64_t ver;
+
+ if (prop == ZPOOL_PROP_VERSION) {
+ VERIFY(nvpair_value_uint64(elem, &ver) == 0);
+ } else {
+ ASSERT(zpool_prop_feature(nvpair_name(elem)));
+ ver = SPA_VERSION_FEATURES;
+ need_sync = B_TRUE;
+ }
+
+ /* Save time if the version is already set. */
+ if (ver == spa_version(spa))
+ continue;
+
+ /*
+ * In addition to the pool directory object, we might
+ * create the pool properties object, the features for
+ * read object, the features for write object, or the
+ * feature descriptions object.
+ */
+ error = dsl_sync_task(spa->spa_name, NULL,
+ spa_sync_version, &ver,
+ 6, ZFS_SPACE_CHECK_RESERVED);
+ if (error)
+ return (error);
+ continue;
+ }
+
+ need_sync = B_TRUE;
+ break;
+ }
+
+ if (need_sync) {
+ return (dsl_sync_task(spa->spa_name, NULL, spa_sync_props,
+ nvp, 6, ZFS_SPACE_CHECK_RESERVED));
+ }
+
+ return (0);
+}
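
A minimal sketch of driving the setter above: build an nvlist keyed by property name and hand it to spa_prop_set(). Setting the comment property is chosen only as an example.

    static int
    example_set_pool_comment(spa_t *spa, const char *comment)
    {
            nvlist_t *props;
            int err;

            VERIFY0(nvlist_alloc(&props, NV_UNIQUE_NAME, KM_SLEEP));
            VERIFY0(nvlist_add_string(props,
                zpool_prop_to_name(ZPOOL_PROP_COMMENT), comment));

            err = spa_prop_set(spa, props);
            nvlist_free(props);
            return (err);
    }
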
+
+/*
+ * If the bootfs property value is dsobj, clear it.
+ */
+void
+spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx)
+{
+ if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) {
+ VERIFY(zap_remove(spa->spa_meta_objset,
+ spa->spa_pool_props_object,
+ zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0);
+ spa->spa_bootfs = 0;
+ }
+}
+
+/*ARGSUSED*/
+static int
+spa_change_guid_check(void *arg, dmu_tx_t *tx)
+{
+ uint64_t *newguid __maybe_unused = arg;
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+ vdev_t *rvd = spa->spa_root_vdev;
+ uint64_t vdev_state;
+
+ if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
+ int error = (spa_has_checkpoint(spa)) ?
+ ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT;
+ return (SET_ERROR(error));
+ }
+
+ spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
+ vdev_state = rvd->vdev_state;
+ spa_config_exit(spa, SCL_STATE, FTAG);
+
+ if (vdev_state != VDEV_STATE_HEALTHY)
+ return (SET_ERROR(ENXIO));
+
+ ASSERT3U(spa_guid(spa), !=, *newguid);
+
+ return (0);
+}
+
+static void
+spa_change_guid_sync(void *arg, dmu_tx_t *tx)
+{
+ uint64_t *newguid = arg;
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+ uint64_t oldguid;
+ vdev_t *rvd = spa->spa_root_vdev;
+
+ oldguid = spa_guid(spa);
+
+ spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
+ rvd->vdev_guid = *newguid;
+ rvd->vdev_guid_sum += (*newguid - oldguid);
+ vdev_config_dirty(rvd);
+ spa_config_exit(spa, SCL_STATE, FTAG);
+
+ spa_history_log_internal(spa, "guid change", tx, "old=%llu new=%llu",
+ (u_longlong_t)oldguid, (u_longlong_t)*newguid);
+}
+
+/*
+ * Change the GUID for the pool. This is done so that we can later
+ * re-import a pool built from a clone of our own vdevs. We will modify
+ * the root vdev's guid, our own pool guid, and then mark all of our
+ * vdevs dirty. Note that we must make sure that all our vdevs are
+ * online when we do this, or else any vdevs that weren't present
+ * would be orphaned from our pool. We are also going to issue a
+ * sysevent to update any watchers.
+ */
+int
+spa_change_guid(spa_t *spa)
+{
+ int error;
+ uint64_t guid;
+
+ mutex_enter(&spa->spa_vdev_top_lock);
+ mutex_enter(&spa_namespace_lock);
+ guid = spa_generate_guid(NULL);
+
+ error = dsl_sync_task(spa->spa_name, spa_change_guid_check,
+ spa_change_guid_sync, &guid, 5, ZFS_SPACE_CHECK_RESERVED);
+
+ if (error == 0) {
+ spa_write_cachefile(spa, B_FALSE, B_TRUE);
+ spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_REGUID);
+ }
+
+ mutex_exit(&spa_namespace_lock);
+ mutex_exit(&spa->spa_vdev_top_lock);
+
+ return (error);
+}
+
+/*
+ * ==========================================================================
+ * SPA state manipulation (open/create/destroy/import/export)
+ * ==========================================================================
+ */
+
+static int
+spa_error_entry_compare(const void *a, const void *b)
+{
+ const spa_error_entry_t *sa = (const spa_error_entry_t *)a;
+ const spa_error_entry_t *sb = (const spa_error_entry_t *)b;
+ int ret;
+
+ ret = memcmp(&sa->se_bookmark, &sb->se_bookmark,
+ sizeof (zbookmark_phys_t));
+
+ return (TREE_ISIGN(ret));
+}
+
+/*
+ * Utility function which retrieves copies of the current logs and
+ * re-initializes them in the process.
+ */
+void
+spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
+{
+ ASSERT(MUTEX_HELD(&spa->spa_errlist_lock));
+
+ bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t));
+ bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t));
+
+ avl_create(&spa->spa_errlist_scrub,
+ spa_error_entry_compare, sizeof (spa_error_entry_t),
+ offsetof(spa_error_entry_t, se_avl));
+ avl_create(&spa->spa_errlist_last,
+ spa_error_entry_compare, sizeof (spa_error_entry_t),
+ offsetof(spa_error_entry_t, se_avl));
+}
+
+static void
+spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
+{
+ const zio_taskq_info_t *ztip = &zio_taskqs[t][q];
+ enum zti_modes mode = ztip->zti_mode;
+ uint_t value = ztip->zti_value;
+ uint_t count = ztip->zti_count;
+ spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
+ uint_t flags = 0;
+ boolean_t batch = B_FALSE;
+
+ if (mode == ZTI_MODE_NULL) {
+ tqs->stqs_count = 0;
+ tqs->stqs_taskq = NULL;
+ return;
+ }
+
+ ASSERT3U(count, >, 0);
+
+ tqs->stqs_count = count;
+ tqs->stqs_taskq = kmem_alloc(count * sizeof (taskq_t *), KM_SLEEP);
+
+ switch (mode) {
+ case ZTI_MODE_FIXED:
+ ASSERT3U(value, >=, 1);
+ value = MAX(value, 1);
+ flags |= TASKQ_DYNAMIC;
+ break;
+
+ case ZTI_MODE_BATCH:
+ batch = B_TRUE;
+ flags |= TASKQ_THREADS_CPU_PCT;
+ value = MIN(zio_taskq_batch_pct, 100);
+ break;
+
+ default:
+ panic("unrecognized mode for %s_%s taskq (%u:%u) in "
+ "spa_activate()",
+ zio_type_name[t], zio_taskq_types[q], mode, value);
+ break;
+ }
+
+ for (uint_t i = 0; i < count; i++) {
+ taskq_t *tq;
+ char name[32];
+
+ (void) snprintf(name, sizeof (name), "%s_%s",
+ zio_type_name[t], zio_taskq_types[q]);
+
+ if (zio_taskq_sysdc && spa->spa_proc != &p0) {
+ if (batch)
+ flags |= TASKQ_DC_BATCH;
+
+ tq = taskq_create_sysdc(name, value, 50, INT_MAX,
+ spa->spa_proc, zio_taskq_basedc, flags);
+ } else {
+ pri_t pri = maxclsyspri;
+ /*
+ * The write issue taskq can be extremely CPU
+			 * intensive. Run it at a slightly lower priority
+			 * than the other taskqs.
+ *
+ * Under Linux and FreeBSD this means incrementing
+ * the priority value as opposed to platforms like
+ * illumos where it should be decremented.
+ *
+ * On FreeBSD, if priorities divided by four (RQ_PPQ)
+ * are equal then a difference between them is
+ * insignificant.
+ */
+ if (t == ZIO_TYPE_WRITE && q == ZIO_TASKQ_ISSUE) {
+#if defined(__linux__)
+ pri++;
+#elif defined(__FreeBSD__)
+ pri += 4;
+#else
+#error "unknown OS"
+#endif
+ }
+ tq = taskq_create_proc(name, value, pri, 50,
+ INT_MAX, spa->spa_proc, flags);
+ }
+
+ tqs->stqs_taskq[i] = tq;
+ }
+}
+
+static void
+spa_taskqs_fini(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
+{
+ spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
+
+ if (tqs->stqs_taskq == NULL) {
+ ASSERT3U(tqs->stqs_count, ==, 0);
+ return;
+ }
+
+ for (uint_t i = 0; i < tqs->stqs_count; i++) {
+ ASSERT3P(tqs->stqs_taskq[i], !=, NULL);
+ taskq_destroy(tqs->stqs_taskq[i]);
+ }
+
+ kmem_free(tqs->stqs_taskq, tqs->stqs_count * sizeof (taskq_t *));
+ tqs->stqs_taskq = NULL;
+}
+
+/*
+ * Dispatch a task to the appropriate taskq for the ZFS I/O type and priority.
+ * Note that a type may have multiple discrete taskqs to avoid lock contention
+ * on the taskq itself. In that case we choose which taskq at random by using
+ * the low bits of gethrtime().
+ */
+void
+spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
+ task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent)
+{
+ spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
+ taskq_t *tq;
+
+ ASSERT3P(tqs->stqs_taskq, !=, NULL);
+ ASSERT3U(tqs->stqs_count, !=, 0);
+
+ if (tqs->stqs_count == 1) {
+ tq = tqs->stqs_taskq[0];
+ } else {
+ tq = tqs->stqs_taskq[((uint64_t)gethrtime()) % tqs->stqs_count];
+ }
+
+ taskq_dispatch_ent(tq, func, arg, flags, ent);
+}
+
+/*
+ * Same as spa_taskq_dispatch_ent() but block on the task until completion.
+ */
+void
+spa_taskq_dispatch_sync(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
+ task_func_t *func, void *arg, uint_t flags)
+{
+ spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
+ taskq_t *tq;
+ taskqid_t id;
+
+ ASSERT3P(tqs->stqs_taskq, !=, NULL);
+ ASSERT3U(tqs->stqs_count, !=, 0);
+
+ if (tqs->stqs_count == 1) {
+ tq = tqs->stqs_taskq[0];
+ } else {
+ tq = tqs->stqs_taskq[((uint64_t)gethrtime()) % tqs->stqs_count];
+ }
+
+ id = taskq_dispatch(tq, func, arg, flags);
+ if (id)
+ taskq_wait_id(tq, id);
+}
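
For context, a hedged sketch of dispatching work synchronously to one of these taskqs; the task function and its argument are placeholders, not symbols from this patch.

    static void
    example_task_func(void *arg)
    {
            (void) arg;             /* per-pool work would go here */
    }

    static void
    example_dispatch(spa_t *spa, void *arg)
    {
            /* Runs example_task_func on a WRITE/ISSUE taskq and waits. */
            spa_taskq_dispatch_sync(spa, ZIO_TYPE_WRITE, ZIO_TASKQ_ISSUE,
                example_task_func, arg, TQ_SLEEP);
    }
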
+
+static void
+spa_create_zio_taskqs(spa_t *spa)
+{
+ for (int t = 0; t < ZIO_TYPES; t++) {
+ for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
+ spa_taskqs_init(spa, t, q);
+ }
+ }
+}
+
+/*
+ * Disabled until spa_thread() can be adapted for Linux.
+ */
+#undef HAVE_SPA_THREAD
+
+#if defined(_KERNEL) && defined(HAVE_SPA_THREAD)
+static void
+spa_thread(void *arg)
+{
+ psetid_t zio_taskq_psrset_bind = PS_NONE;
+ callb_cpr_t cprinfo;
+
+ spa_t *spa = arg;
+ user_t *pu = PTOU(curproc);
+
+ CALLB_CPR_INIT(&cprinfo, &spa->spa_proc_lock, callb_generic_cpr,
+ spa->spa_name);
+
+ ASSERT(curproc != &p0);
+ (void) snprintf(pu->u_psargs, sizeof (pu->u_psargs),
+ "zpool-%s", spa->spa_name);
+ (void) strlcpy(pu->u_comm, pu->u_psargs, sizeof (pu->u_comm));
+
+ /* bind this thread to the requested psrset */
+ if (zio_taskq_psrset_bind != PS_NONE) {
+ pool_lock();
+ mutex_enter(&cpu_lock);
+ mutex_enter(&pidlock);
+ mutex_enter(&curproc->p_lock);
+
+ if (cpupart_bind_thread(curthread, zio_taskq_psrset_bind,
+ 0, NULL, NULL) == 0) {
+ curthread->t_bind_pset = zio_taskq_psrset_bind;
+ } else {
+ cmn_err(CE_WARN,
+ "Couldn't bind process for zfs pool \"%s\" to "
+ "pset %d\n", spa->spa_name, zio_taskq_psrset_bind);
+ }
+
+ mutex_exit(&curproc->p_lock);
+ mutex_exit(&pidlock);
+ mutex_exit(&cpu_lock);
+ pool_unlock();
+ }
+
+ if (zio_taskq_sysdc) {
+ sysdc_thread_enter(curthread, 100, 0);
+ }
+
+ spa->spa_proc = curproc;
+ spa->spa_did = curthread->t_did;
+
+ spa_create_zio_taskqs(spa);
+
+ mutex_enter(&spa->spa_proc_lock);
+ ASSERT(spa->spa_proc_state == SPA_PROC_CREATED);
+
+ spa->spa_proc_state = SPA_PROC_ACTIVE;
+ cv_broadcast(&spa->spa_proc_cv);
+
+ CALLB_CPR_SAFE_BEGIN(&cprinfo);
+ while (spa->spa_proc_state == SPA_PROC_ACTIVE)
+ cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
+ CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_proc_lock);
+
+ ASSERT(spa->spa_proc_state == SPA_PROC_DEACTIVATE);
+ spa->spa_proc_state = SPA_PROC_GONE;
+ spa->spa_proc = &p0;
+ cv_broadcast(&spa->spa_proc_cv);
+ CALLB_CPR_EXIT(&cprinfo); /* drops spa_proc_lock */
+
+ mutex_enter(&curproc->p_lock);
+ lwp_exit();
+}
+#endif
+
+/*
+ * Activate an uninitialized pool.
+ */
+static void
+spa_activate(spa_t *spa, spa_mode_t mode)
+{
+ ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);
+
+ spa->spa_state = POOL_STATE_ACTIVE;
+ spa->spa_mode = mode;
+
+ spa->spa_normal_class = metaslab_class_create(spa, zfs_metaslab_ops);
+ spa->spa_log_class = metaslab_class_create(spa, zfs_metaslab_ops);
+ spa->spa_embedded_log_class =
+ metaslab_class_create(spa, zfs_metaslab_ops);
+ spa->spa_special_class = metaslab_class_create(spa, zfs_metaslab_ops);
+ spa->spa_dedup_class = metaslab_class_create(spa, zfs_metaslab_ops);
+
+ /* Try to create a covering process */
+ mutex_enter(&spa->spa_proc_lock);
+ ASSERT(spa->spa_proc_state == SPA_PROC_NONE);
+ ASSERT(spa->spa_proc == &p0);
+ spa->spa_did = 0;
+
+#ifdef HAVE_SPA_THREAD
+ /* Only create a process if we're going to be around a while. */
+ if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) {
+ if (newproc(spa_thread, (caddr_t)spa, syscid, maxclsyspri,
+ NULL, 0) == 0) {
+ spa->spa_proc_state = SPA_PROC_CREATED;
+ while (spa->spa_proc_state == SPA_PROC_CREATED) {
+ cv_wait(&spa->spa_proc_cv,
+ &spa->spa_proc_lock);
+ }
+ ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
+ ASSERT(spa->spa_proc != &p0);
+ ASSERT(spa->spa_did != 0);
+ } else {
+#ifdef _KERNEL
+ cmn_err(CE_WARN,
+ "Couldn't create process for zfs pool \"%s\"\n",
+ spa->spa_name);
+#endif
+ }
+ }
+#endif /* HAVE_SPA_THREAD */
+ mutex_exit(&spa->spa_proc_lock);
+
+ /* If we didn't create a process, we need to create our taskqs. */
+ if (spa->spa_proc == &p0) {
+ spa_create_zio_taskqs(spa);
+ }
+
+ for (size_t i = 0; i < TXG_SIZE; i++) {
+ spa->spa_txg_zio[i] = zio_root(spa, NULL, NULL,
+ ZIO_FLAG_CANFAIL);
+ }
+
+ list_create(&spa->spa_config_dirty_list, sizeof (vdev_t),
+ offsetof(vdev_t, vdev_config_dirty_node));
+ list_create(&spa->spa_evicting_os_list, sizeof (objset_t),
+ offsetof(objset_t, os_evicting_node));
+ list_create(&spa->spa_state_dirty_list, sizeof (vdev_t),
+ offsetof(vdev_t, vdev_state_dirty_node));
+
+ txg_list_create(&spa->spa_vdev_txg_list, spa,
+ offsetof(struct vdev, vdev_txg_node));
+
+ avl_create(&spa->spa_errlist_scrub,
+ spa_error_entry_compare, sizeof (spa_error_entry_t),
+ offsetof(spa_error_entry_t, se_avl));
+ avl_create(&spa->spa_errlist_last,
+ spa_error_entry_compare, sizeof (spa_error_entry_t),
+ offsetof(spa_error_entry_t, se_avl));
+
+ spa_keystore_init(&spa->spa_keystore);
+
+ /*
+ * This taskq is used to perform zvol-minor-related tasks
+ * asynchronously. This has several advantages, including easy
+ * resolution of various deadlocks.
+ *
+ * The taskq must be single threaded to ensure tasks are always
+ * processed in the order in which they were dispatched.
+ *
+ * A taskq per pool allows one to keep the pools independent.
+ * This way if one pool is suspended, it will not impact another.
+ *
+ * The preferred location to dispatch a zvol minor task is a sync
+ * task. In this context, there is easy access to the spa_t and minimal
+ * error handling is required because the sync task must succeed.
+ */
+ spa->spa_zvol_taskq = taskq_create("z_zvol", 1, defclsyspri,
+ 1, INT_MAX, 0);
+
+ /*
+ * Taskq dedicated to prefetcher threads: this is used to prevent the
+ * pool traverse code from monopolizing the global (and limited)
+ * system_taskq by inappropriately scheduling long running tasks on it.
+ */
+ spa->spa_prefetch_taskq = taskq_create("z_prefetch", 100,
+ defclsyspri, 1, INT_MAX, TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT);
+
+ /*
+ * The taskq to upgrade datasets in this pool. Currently used by
+ * feature SPA_FEATURE_USEROBJ_ACCOUNTING/SPA_FEATURE_PROJECT_QUOTA.
+ */
+ spa->spa_upgrade_taskq = taskq_create("z_upgrade", 100,
+ defclsyspri, 1, INT_MAX, TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT);
+}
+
+/*
+ * Opposite of spa_activate().
+ */
+static void
+spa_deactivate(spa_t *spa)
+{
+ ASSERT(spa->spa_sync_on == B_FALSE);
+ ASSERT(spa->spa_dsl_pool == NULL);
+ ASSERT(spa->spa_root_vdev == NULL);
+ ASSERT(spa->spa_async_zio_root == NULL);
+ ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);
+
+ spa_evicting_os_wait(spa);
+
+ if (spa->spa_zvol_taskq) {
+ taskq_destroy(spa->spa_zvol_taskq);
+ spa->spa_zvol_taskq = NULL;
+ }
+
+ if (spa->spa_prefetch_taskq) {
+ taskq_destroy(spa->spa_prefetch_taskq);
+ spa->spa_prefetch_taskq = NULL;
+ }
+
+ if (spa->spa_upgrade_taskq) {
+ taskq_destroy(spa->spa_upgrade_taskq);
+ spa->spa_upgrade_taskq = NULL;
+ }
+
+ txg_list_destroy(&spa->spa_vdev_txg_list);
+
+ list_destroy(&spa->spa_config_dirty_list);
+ list_destroy(&spa->spa_evicting_os_list);
+ list_destroy(&spa->spa_state_dirty_list);
+
+ taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid);
+
+ for (int t = 0; t < ZIO_TYPES; t++) {
+ for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
+ spa_taskqs_fini(spa, t, q);
+ }
+ }
+
+ for (size_t i = 0; i < TXG_SIZE; i++) {
+ ASSERT3P(spa->spa_txg_zio[i], !=, NULL);
+ VERIFY0(zio_wait(spa->spa_txg_zio[i]));
+ spa->spa_txg_zio[i] = NULL;
+ }
+
+ metaslab_class_destroy(spa->spa_normal_class);
+ spa->spa_normal_class = NULL;
+
+ metaslab_class_destroy(spa->spa_log_class);
+ spa->spa_log_class = NULL;
+
+ metaslab_class_destroy(spa->spa_embedded_log_class);
+ spa->spa_embedded_log_class = NULL;
+
+ metaslab_class_destroy(spa->spa_special_class);
+ spa->spa_special_class = NULL;
+
+ metaslab_class_destroy(spa->spa_dedup_class);
+ spa->spa_dedup_class = NULL;
+
+ /*
+ * If this was part of an import or the open otherwise failed, we may
+ * still have errors left in the queues. Empty them just in case.
+ */
+ spa_errlog_drain(spa);
+ avl_destroy(&spa->spa_errlist_scrub);
+ avl_destroy(&spa->spa_errlist_last);
+
+ spa_keystore_fini(&spa->spa_keystore);
+
+ spa->spa_state = POOL_STATE_UNINITIALIZED;
+
+ mutex_enter(&spa->spa_proc_lock);
+ if (spa->spa_proc_state != SPA_PROC_NONE) {
+ ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
+ spa->spa_proc_state = SPA_PROC_DEACTIVATE;
+ cv_broadcast(&spa->spa_proc_cv);
+ while (spa->spa_proc_state == SPA_PROC_DEACTIVATE) {
+ ASSERT(spa->spa_proc != &p0);
+ cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
+ }
+ ASSERT(spa->spa_proc_state == SPA_PROC_GONE);
+ spa->spa_proc_state = SPA_PROC_NONE;
+ }
+ ASSERT(spa->spa_proc == &p0);
+ mutex_exit(&spa->spa_proc_lock);
+
+ /*
+ * We want to make sure spa_thread() has actually exited the ZFS
+ * module, so that the module can't be unloaded out from underneath
+ * it.
+ */
+ if (spa->spa_did != 0) {
+ thread_join(spa->spa_did);
+ spa->spa_did = 0;
+ }
+}
+
+/*
+ * Verify a pool configuration, and construct the vdev tree appropriately. This
+ * will create all the necessary vdevs in the appropriate layout, with each vdev
+ * in the CLOSED state. This will prep the pool before open/creation/import.
+ * All vdev validation is done by the vdev_alloc() routine.
+ */
+int
+spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent,
+ uint_t id, int atype)
+{
+ nvlist_t **child;
+ uint_t children;
+ int error;
+
+ if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0)
+ return (error);
+
+ if ((*vdp)->vdev_ops->vdev_op_leaf)
+ return (0);
+
+ error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
+ &child, &children);
+
+ if (error == ENOENT)
+ return (0);
+
+ if (error) {
+ vdev_free(*vdp);
+ *vdp = NULL;
+ return (SET_ERROR(EINVAL));
+ }
+
+ for (int c = 0; c < children; c++) {
+ vdev_t *vd;
+ if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c,
+ atype)) != 0) {
+ vdev_free(*vdp);
+ *vdp = NULL;
+ return (error);
+ }
+ }
+
+ ASSERT(*vdp != NULL);
+
+ return (0);
+}
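+
+/*
+ * For illustration, the config nvlist parsed above mirrors the shape of
+ * the vdev tree it produces. A hypothetical two-way mirror would look
+ * roughly like:
+ *
+ *	type='root'
+ *	  children[0]: type='mirror'
+ *	    children[0]: type='disk', path='/dev/...'
+ *	    children[1]: type='disk', path='/dev/...'
+ *
+ * Leaf vdevs terminate the recursion; interior vdevs descend into their
+ * ZPOOL_CONFIG_CHILDREN array.
+ */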
+
+static boolean_t
+spa_should_flush_logs_on_unload(spa_t *spa)
+{
+ if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP))
+ return (B_FALSE);
+
+ if (!spa_writeable(spa))
+ return (B_FALSE);
+
+ if (!spa->spa_sync_on)
+ return (B_FALSE);
+
+ if (spa_state(spa) != POOL_STATE_EXPORTED)
+ return (B_FALSE);
+
+ if (zfs_keep_log_spacemaps_at_export)
+ return (B_FALSE);
+
+ return (B_TRUE);
+}
+
+/*
+ * Opens a transaction that sets the flag instructing spa_sync to
+ * attempt to flush all the metaslabs for that txg, then waits for
+ * that txg to be synced.
+ */
+static void
+spa_unload_log_sm_flush_all(spa_t *spa)
+{
+ dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
+ VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
+
+ ASSERT3U(spa->spa_log_flushall_txg, ==, 0);
+ spa->spa_log_flushall_txg = dmu_tx_get_txg(tx);
+
+ dmu_tx_commit(tx);
+ txg_wait_synced(spa_get_dsl(spa), spa->spa_log_flushall_txg);
+}
+
+static void
+spa_unload_log_sm_metadata(spa_t *spa)
+{
+ void *cookie = NULL;
+ spa_log_sm_t *sls;
+ while ((sls = avl_destroy_nodes(&spa->spa_sm_logs_by_txg,
+ &cookie)) != NULL) {
+ VERIFY0(sls->sls_mscount);
+ kmem_free(sls, sizeof (spa_log_sm_t));
+ }
+
+ for (log_summary_entry_t *e = list_head(&spa->spa_log_summary);
+ e != NULL; e = list_head(&spa->spa_log_summary)) {
+ VERIFY0(e->lse_mscount);
+ list_remove(&spa->spa_log_summary, e);
+ kmem_free(e, sizeof (log_summary_entry_t));
+ }
+
+ spa->spa_unflushed_stats.sus_nblocks = 0;
+ spa->spa_unflushed_stats.sus_memused = 0;
+ spa->spa_unflushed_stats.sus_blocklimit = 0;
+}
+
+static void
+spa_destroy_aux_threads(spa_t *spa)
+{
+ if (spa->spa_condense_zthr != NULL) {
+ zthr_destroy(spa->spa_condense_zthr);
+ spa->spa_condense_zthr = NULL;
+ }
+ if (spa->spa_checkpoint_discard_zthr != NULL) {
+ zthr_destroy(spa->spa_checkpoint_discard_zthr);
+ spa->spa_checkpoint_discard_zthr = NULL;
+ }
+ if (spa->spa_livelist_delete_zthr != NULL) {
+ zthr_destroy(spa->spa_livelist_delete_zthr);
+ spa->spa_livelist_delete_zthr = NULL;
+ }
+ if (spa->spa_livelist_condense_zthr != NULL) {
+ zthr_destroy(spa->spa_livelist_condense_zthr);
+ spa->spa_livelist_condense_zthr = NULL;
+ }
+}
+
+/*
+ * Opposite of spa_load().
+ */
+static void
+spa_unload(spa_t *spa)
+{
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+ ASSERT(spa_state(spa) != POOL_STATE_UNINITIALIZED);
+
+ spa_import_progress_remove(spa_guid(spa));
+ spa_load_note(spa, "UNLOADING");
+
+ spa_wake_waiters(spa);
+
+ /*
+ * If the log space map feature is enabled and the pool is getting
+ * exported (but not destroyed), we want to spend some time flushing
+ * as many metaslabs as we can in an attempt to destroy log space
+ * maps and save import time.
+ */
+ if (spa_should_flush_logs_on_unload(spa))
+ spa_unload_log_sm_flush_all(spa);
+
+ /*
+ * Stop async tasks.
+ */
+ spa_async_suspend(spa);
+
+ if (spa->spa_root_vdev) {
+ vdev_t *root_vdev = spa->spa_root_vdev;
+ vdev_initialize_stop_all(root_vdev, VDEV_INITIALIZE_ACTIVE);
+ vdev_trim_stop_all(root_vdev, VDEV_TRIM_ACTIVE);
+ vdev_autotrim_stop_all(spa);
+ vdev_rebuild_stop_all(spa);
+ }
+
+ /*
+ * Stop syncing.
+ */
+ if (spa->spa_sync_on) {
+ txg_sync_stop(spa->spa_dsl_pool);
+ spa->spa_sync_on = B_FALSE;
+ }
+
+ /*
+ * This ensures that there is no async metaslab prefetching
+ * while we attempt to unload the spa.
+ */
+ if (spa->spa_root_vdev != NULL) {
+ for (int c = 0; c < spa->spa_root_vdev->vdev_children; c++) {
+ vdev_t *vc = spa->spa_root_vdev->vdev_child[c];
+ if (vc->vdev_mg != NULL)
+ taskq_wait(vc->vdev_mg->mg_taskq);
+ }
+ }
+
+ if (spa->spa_mmp.mmp_thread)
+ mmp_thread_stop(spa);
+
+ /*
+ * Wait for any outstanding async I/O to complete.
+ */
+ if (spa->spa_async_zio_root != NULL) {
+ for (int i = 0; i < max_ncpus; i++)
+ (void) zio_wait(spa->spa_async_zio_root[i]);
+ kmem_free(spa->spa_async_zio_root, max_ncpus * sizeof (void *));
+ spa->spa_async_zio_root = NULL;
+ }
+
+ if (spa->spa_vdev_removal != NULL) {
+ spa_vdev_removal_destroy(spa->spa_vdev_removal);
+ spa->spa_vdev_removal = NULL;
+ }
+
+ spa_destroy_aux_threads(spa);
+
+ spa_condense_fini(spa);
+
+ bpobj_close(&spa->spa_deferred_bpobj);
+
+ spa_config_enter(spa, SCL_ALL, spa, RW_WRITER);
+
+ /*
+ * Close all vdevs.
+ */
+ if (spa->spa_root_vdev)
+ vdev_free(spa->spa_root_vdev);
+ ASSERT(spa->spa_root_vdev == NULL);
+
+ /*
+ * Close the dsl pool.
+ */
+ if (spa->spa_dsl_pool) {
+ dsl_pool_close(spa->spa_dsl_pool);
+ spa->spa_dsl_pool = NULL;
+ spa->spa_meta_objset = NULL;
+ }
+
+ ddt_unload(spa);
+ spa_unload_log_sm_metadata(spa);
+
+ /*
+ * Drop and purge level 2 cache
+ */
+ spa_l2cache_drop(spa);
+
+ for (int i = 0; i < spa->spa_spares.sav_count; i++)
+ vdev_free(spa->spa_spares.sav_vdevs[i]);
+ if (spa->spa_spares.sav_vdevs) {
+ kmem_free(spa->spa_spares.sav_vdevs,
+ spa->spa_spares.sav_count * sizeof (void *));
+ spa->spa_spares.sav_vdevs = NULL;
+ }
+ if (spa->spa_spares.sav_config) {
+ nvlist_free(spa->spa_spares.sav_config);
+ spa->spa_spares.sav_config = NULL;
+ }
+ spa->spa_spares.sav_count = 0;
+
+ for (int i = 0; i < spa->spa_l2cache.sav_count; i++) {
+ vdev_clear_stats(spa->spa_l2cache.sav_vdevs[i]);
+ vdev_free(spa->spa_l2cache.sav_vdevs[i]);
+ }
+ if (spa->spa_l2cache.sav_vdevs) {
+ kmem_free(spa->spa_l2cache.sav_vdevs,
+ spa->spa_l2cache.sav_count * sizeof (void *));
+ spa->spa_l2cache.sav_vdevs = NULL;
+ }
+ if (spa->spa_l2cache.sav_config) {
+ nvlist_free(spa->spa_l2cache.sav_config);
+ spa->spa_l2cache.sav_config = NULL;
+ }
+ spa->spa_l2cache.sav_count = 0;
+
+ spa->spa_async_suspended = 0;
+
+ spa->spa_indirect_vdevs_loaded = B_FALSE;
+
+ if (spa->spa_comment != NULL) {
+ spa_strfree(spa->spa_comment);
+ spa->spa_comment = NULL;
+ }
+ if (spa->spa_compatibility != NULL) {
+ spa_strfree(spa->spa_compatibility);
+ spa->spa_compatibility = NULL;
+ }
+
+ spa_config_exit(spa, SCL_ALL, spa);
+}
+
+/*
+ * Load (or re-load) the current list of vdevs describing the active spares for
+ * this pool. When this is called, we have some form of basic information in
+ * 'spa_spares.sav_config'. We parse this into vdevs, try to open them, and
+ * then re-generate a more complete list including status information.
+ */
+void
+spa_load_spares(spa_t *spa)
+{
+ nvlist_t **spares;
+ uint_t nspares;
+ int i;
+ vdev_t *vd, *tvd;
+
+#ifndef _KERNEL
+ /*
+ * zdb opens both the current state of the pool and the
+ * checkpointed state (if present), with a different spa_t.
+ *
+ * As spare vdevs are shared among open pools, we skip loading
+ * them when we load the checkpointed state of the pool.
+ */
+ if (!spa_writeable(spa))
+ return;
+#endif
+
+ ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
+
+ /*
+ * First, close and free any existing spare vdevs.
+ */
+ for (i = 0; i < spa->spa_spares.sav_count; i++) {
+ vd = spa->spa_spares.sav_vdevs[i];
+
+ /* Undo the call to spa_activate() below */
+ if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
+ B_FALSE)) != NULL && tvd->vdev_isspare)
+ spa_spare_remove(tvd);
+ vdev_close(vd);
+ vdev_free(vd);
+ }
+
+ if (spa->spa_spares.sav_vdevs)
+ kmem_free(spa->spa_spares.sav_vdevs,
+ spa->spa_spares.sav_count * sizeof (void *));
+
+ if (spa->spa_spares.sav_config == NULL)
+ nspares = 0;
+ else
+ VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
+ ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
+
+ spa->spa_spares.sav_count = (int)nspares;
+ spa->spa_spares.sav_vdevs = NULL;
+
+ if (nspares == 0)
+ return;
+
+ /*
+ * Construct the array of vdevs, opening them to get status in the
+ * process. For each spare, there are potentially two different vdev_t
+ * structures associated with it: one in the list of spares (used only
+ * for basic validation purposes) and one in the active vdev
+ * configuration (if it's spared in). During this phase we open and
+ * validate each vdev on the spare list. If the vdev also exists in the
+ * active configuration, then we also mark this vdev as an active spare.
+ */
+ spa->spa_spares.sav_vdevs = kmem_zalloc(nspares * sizeof (void *),
+ KM_SLEEP);
+ for (i = 0; i < spa->spa_spares.sav_count; i++) {
+ VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0,
+ VDEV_ALLOC_SPARE) == 0);
+ ASSERT(vd != NULL);
+
+ spa->spa_spares.sav_vdevs[i] = vd;
+
+ if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
+ B_FALSE)) != NULL) {
+ if (!tvd->vdev_isspare)
+ spa_spare_add(tvd);
+
+ /*
+ * We only mark the spare active if we were successfully
+ * able to load the vdev. Otherwise, importing a pool
+ * with a bad active spare would result in strange
+ * behavior, because multiple pools would think the spare
+ * is actively in use.
+ *
+ * There is a vulnerability here to an equally bizarre
+ * circumstance, where a dead active spare is later
+ * brought back to life (onlined or otherwise). Given
+ * the rarity of this scenario, and the extra complexity
+ * it adds, we ignore the possibility.
+ */
+ if (!vdev_is_dead(tvd))
+ spa_spare_activate(tvd);
+ }
+
+ vd->vdev_top = vd;
+ vd->vdev_aux = &spa->spa_spares;
+
+ if (vdev_open(vd) != 0)
+ continue;
+
+ if (vdev_validate_aux(vd) == 0)
+ spa_spare_add(vd);
+ }
+
+ /*
+ * Recompute the stashed list of spares, with status information
+ * this time.
+ */
+ VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES,
+ DATA_TYPE_NVLIST_ARRAY) == 0);
+
+ spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *),
+ KM_SLEEP);
+ for (i = 0; i < spa->spa_spares.sav_count; i++)
+ spares[i] = vdev_config_generate(spa,
+ spa->spa_spares.sav_vdevs[i], B_TRUE, VDEV_CONFIG_SPARE);
+ VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
+ ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0);
+ for (i = 0; i < spa->spa_spares.sav_count; i++)
+ nvlist_free(spares[i]);
+ kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *));
+}
+
+/*
+ * Load (or re-load) the current list of vdevs describing the active l2cache for
+ * this pool. When this is called, we have some form of basic information in
+ * 'spa_l2cache.sav_config'. We parse this into vdevs, try to open them, and
+ * then re-generate a more complete list including status information.
+ * Devices which are already active have their details maintained, and are
+ * not re-opened.
+ */
+void
+spa_load_l2cache(spa_t *spa)
+{
+ nvlist_t **l2cache = NULL;
+ uint_t nl2cache;
+ int i, j, oldnvdevs;
+ uint64_t guid;
+ vdev_t *vd, **oldvdevs, **newvdevs;
+ spa_aux_vdev_t *sav = &spa->spa_l2cache;
+
+#ifndef _KERNEL
+ /*
+ * zdb opens both the current state of the pool and the
+ * checkpointed state (if present), with a different spa_t.
+ *
+ * As L2 caches are part of the ARC which is shared among open
+ * pools, we skip loading them when we load the checkpointed
+ * state of the pool.
+ */
+ if (!spa_writeable(spa))
+ return;
+#endif
+
+ ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
+
+ oldvdevs = sav->sav_vdevs;
+ oldnvdevs = sav->sav_count;
+ sav->sav_vdevs = NULL;
+ sav->sav_count = 0;
+
+ if (sav->sav_config == NULL) {
+ nl2cache = 0;
+ newvdevs = NULL;
+ goto out;
+ }
+
+ VERIFY(nvlist_lookup_nvlist_array(sav->sav_config,
+ ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
+ newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP);
+
+ /*
+ * Process new nvlist of vdevs.
+ */
+ for (i = 0; i < nl2cache; i++) {
+ VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID,
+ &guid) == 0);
+
+ newvdevs[i] = NULL;
+ for (j = 0; j < oldnvdevs; j++) {
+ vd = oldvdevs[j];
+ if (vd != NULL && guid == vd->vdev_guid) {
+ /*
+ * Retain previous vdev for add/remove ops.
+ */
+ newvdevs[i] = vd;
+ oldvdevs[j] = NULL;
+ break;
+ }
+ }
+
+ if (newvdevs[i] == NULL) {
+ /*
+ * Create new vdev
+ */
+ VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0,
+ VDEV_ALLOC_L2CACHE) == 0);
+ ASSERT(vd != NULL);
+ newvdevs[i] = vd;
+
+ /*
+ * Commit this vdev as an l2cache device,
+ * even if it fails to open.
+ */
+ spa_l2cache_add(vd);
+
+ vd->vdev_top = vd;
+ vd->vdev_aux = sav;
+
+ spa_l2cache_activate(vd);
+
+ if (vdev_open(vd) != 0)
+ continue;
+
+ (void) vdev_validate_aux(vd);
+
+ if (!vdev_is_dead(vd))
+ l2arc_add_vdev(spa, vd);
+
+ /*
+ * When a cache device is added to a pool, when a pool
+ * is created with a cache device, or when the header
+ * of the device is invalid, we issue an async TRIM
+ * command for the whole device; it will only run if
+ * l2arc_trim_ahead > 0.
+ */
+ spa_async_request(spa, SPA_ASYNC_L2CACHE_TRIM);
+ }
+ }
+
+ sav->sav_vdevs = newvdevs;
+ sav->sav_count = (int)nl2cache;
+
+ /*
+ * Recompute the stashed list of l2cache devices, with status
+ * information this time.
+ */
+ VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE,
+ DATA_TYPE_NVLIST_ARRAY) == 0);
+
+ if (sav->sav_count > 0)
+ l2cache = kmem_alloc(sav->sav_count * sizeof (void *),
+ KM_SLEEP);
+ for (i = 0; i < sav->sav_count; i++)
+ l2cache[i] = vdev_config_generate(spa,
+ sav->sav_vdevs[i], B_TRUE, VDEV_CONFIG_L2CACHE);
+ VERIFY(nvlist_add_nvlist_array(sav->sav_config,
+ ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0);
+
+out:
+ /*
+ * Purge vdevs that were dropped
+ */
+ for (i = 0; i < oldnvdevs; i++) {
+ uint64_t pool;
+
+ vd = oldvdevs[i];
+ if (vd != NULL) {
+ ASSERT(vd->vdev_isl2cache);
+
+ if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
+ pool != 0ULL && l2arc_vdev_present(vd))
+ l2arc_remove_vdev(vd);
+ vdev_clear_stats(vd);
+ vdev_free(vd);
+ }
+ }
+
+ if (oldvdevs)
+ kmem_free(oldvdevs, oldnvdevs * sizeof (void *));
+
+ for (i = 0; i < sav->sav_count; i++)
+ nvlist_free(l2cache[i]);
+ if (sav->sav_count)
+ kmem_free(l2cache, sav->sav_count * sizeof (void *));
+}
+
+static int
+load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value)
+{
+ dmu_buf_t *db;
+ char *packed = NULL;
+ size_t nvsize = 0;
+ int error;
+ *value = NULL;
+
+ error = dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db);
+ if (error)
+ return (error);
+
+ nvsize = *(uint64_t *)db->db_data;
+ dmu_buf_rele(db, FTAG);
+
+ packed = vmem_alloc(nvsize, KM_SLEEP);
+ error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed,
+ DMU_READ_PREFETCH);
+ if (error == 0)
+ error = nvlist_unpack(packed, nvsize, value, 0);
+ vmem_free(packed, nvsize);
+
+ return (error);
+}
+
+/*
+ * Concrete top-level vdevs that are not missing and are not logs. At every
+ * spa_sync we write new uberblocks to at least SPA_SYNC_MIN_VDEVS core tvds.
+ */
+static uint64_t
+spa_healthy_core_tvds(spa_t *spa)
+{
+ vdev_t *rvd = spa->spa_root_vdev;
+ uint64_t tvds = 0;
+
+ for (uint64_t i = 0; i < rvd->vdev_children; i++) {
+ vdev_t *vd = rvd->vdev_child[i];
+ if (vd->vdev_islog)
+ continue;
+ if (vdev_is_concrete(vd) && !vdev_is_dead(vd))
+ tvds++;
+ }
+
+ return (tvds);
+}
+
+/*
+ * Checks to see if the given vdev could not be opened, in which case we post a
+ * sysevent to notify the autoreplace code that the device has been removed.
+ */
+static void
+spa_check_removed(vdev_t *vd)
+{
+ for (uint64_t c = 0; c < vd->vdev_children; c++)
+ spa_check_removed(vd->vdev_child[c]);
+
+ if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd) &&
+ vdev_is_concrete(vd)) {
+ zfs_post_autoreplace(vd->vdev_spa, vd);
+ spa_event_notify(vd->vdev_spa, vd, NULL, ESC_ZFS_VDEV_CHECK);
+ }
+}
+
+static int
+spa_check_for_missing_logs(spa_t *spa)
+{
+ vdev_t *rvd = spa->spa_root_vdev;
+
+ /*
+ * If we're doing a normal import, then build up any additional
+ * diagnostic information about missing log devices.
+ * We'll pass this up to the user for further processing.
+ */
+ if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) {
+ nvlist_t **child, *nv;
+ uint64_t idx = 0;
+
+ child = kmem_alloc(rvd->vdev_children * sizeof (nvlist_t *),
+ KM_SLEEP);
+ VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+
+ for (uint64_t c = 0; c < rvd->vdev_children; c++) {
+ vdev_t *tvd = rvd->vdev_child[c];
+
+ /*
+ * We consider a device as missing only if it failed
+ * to open (i.e. offline or faulted devices are not
+ * considered missing).
+ */
+ if (tvd->vdev_islog &&
+ tvd->vdev_state == VDEV_STATE_CANT_OPEN) {
+ child[idx++] = vdev_config_generate(spa, tvd,
+ B_FALSE, VDEV_CONFIG_MISSING);
+ }
+ }
+
+ if (idx > 0) {
+ fnvlist_add_nvlist_array(nv,
+ ZPOOL_CONFIG_CHILDREN, child, idx);
+ fnvlist_add_nvlist(spa->spa_load_info,
+ ZPOOL_CONFIG_MISSING_DEVICES, nv);
+
+ for (uint64_t i = 0; i < idx; i++)
+ nvlist_free(child[i]);
+ }
+ nvlist_free(nv);
+ kmem_free(child, rvd->vdev_children * sizeof (char **));
+
+ if (idx > 0) {
+ spa_load_failed(spa, "some log devices are missing");
+ vdev_dbgmsg_print_tree(rvd, 2);
+ return (SET_ERROR(ENXIO));
+ }
+ } else {
+ for (uint64_t c = 0; c < rvd->vdev_children; c++) {
+ vdev_t *tvd = rvd->vdev_child[c];
+
+ if (tvd->vdev_islog &&
+ tvd->vdev_state == VDEV_STATE_CANT_OPEN) {
+ spa_set_log_state(spa, SPA_LOG_CLEAR);
+ spa_load_note(spa, "some log devices are "
+ "missing, ZIL is dropped.");
+ vdev_dbgmsg_print_tree(rvd, 2);
+ break;
+ }
+ }
+ }
+
+ return (0);
+}
+
+/*
+ * Check for missing log devices
+ */
+static boolean_t
+spa_check_logs(spa_t *spa)
+{
+ boolean_t rv = B_FALSE;
+ dsl_pool_t *dp = spa_get_dsl(spa);
+
+ switch (spa->spa_log_state) {
+ default:
+ break;
+ case SPA_LOG_MISSING:
+ /* need to recheck in case slog has been restored */
+ case SPA_LOG_UNKNOWN:
+ rv = (dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
+ zil_check_log_chain, NULL, DS_FIND_CHILDREN) != 0);
+ if (rv)
+ spa_set_log_state(spa, SPA_LOG_MISSING);
+ break;
+ }
+ return (rv);
+}
+
+/*
+ * Passivate any log vdevs (note, does not apply to embedded log metaslabs).
+ */
+static boolean_t
+spa_passivate_log(spa_t *spa)
+{
+ vdev_t *rvd = spa->spa_root_vdev;
+ boolean_t slog_found = B_FALSE;
+
+ ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));
+
+ for (int c = 0; c < rvd->vdev_children; c++) {
+ vdev_t *tvd = rvd->vdev_child[c];
+
+ if (tvd->vdev_islog) {
+ ASSERT3P(tvd->vdev_log_mg, ==, NULL);
+ metaslab_group_passivate(tvd->vdev_mg);
+ slog_found = B_TRUE;
+ }
+ }
+
+ return (slog_found);
+}
+
+/*
+ * Activate any log vdevs (note, does not apply to embedded log metaslabs).
+ */
+static void
+spa_activate_log(spa_t *spa)
+{
+ vdev_t *rvd = spa->spa_root_vdev;
+
+ ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));
+
+ for (int c = 0; c < rvd->vdev_children; c++) {
+ vdev_t *tvd = rvd->vdev_child[c];
+
+ if (tvd->vdev_islog) {
+ ASSERT3P(tvd->vdev_log_mg, ==, NULL);
+ metaslab_group_activate(tvd->vdev_mg);
+ }
+ }
+}
+
+int
+spa_reset_logs(spa_t *spa)
+{
+ int error;
+
+ error = dmu_objset_find(spa_name(spa), zil_reset,
+ NULL, DS_FIND_CHILDREN);
+ if (error == 0) {
+ /*
+ * We successfully offlined the log device, sync out the
+ * current txg so that the "stubby" block can be removed
+ * by zil_sync().
+ */
+ txg_wait_synced(spa->spa_dsl_pool, 0);
+ }
+ return (error);
+}
+
+static void
+spa_aux_check_removed(spa_aux_vdev_t *sav)
+{
+ for (int i = 0; i < sav->sav_count; i++)
+ spa_check_removed(sav->sav_vdevs[i]);
+}
+
+void
+spa_claim_notify(zio_t *zio)
+{
+ spa_t *spa = zio->io_spa;
+
+ if (zio->io_error)
+ return;
+
+ mutex_enter(&spa->spa_props_lock); /* any mutex will do */
+ if (spa->spa_claim_max_txg < zio->io_bp->blk_birth)
+ spa->spa_claim_max_txg = zio->io_bp->blk_birth;
+ mutex_exit(&spa->spa_props_lock);
+}
+
+typedef struct spa_load_error {
+ uint64_t sle_meta_count;
+ uint64_t sle_data_count;
+} spa_load_error_t;
+
+static void
+spa_load_verify_done(zio_t *zio)
+{
+ blkptr_t *bp = zio->io_bp;
+ spa_load_error_t *sle = zio->io_private;
+ dmu_object_type_t type = BP_GET_TYPE(bp);
+ int error = zio->io_error;
+ spa_t *spa = zio->io_spa;
+
+ abd_free(zio->io_abd);
+ if (error) {
+ if ((BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)) &&
+ type != DMU_OT_INTENT_LOG)
+ atomic_inc_64(&sle->sle_meta_count);
+ else
+ atomic_inc_64(&sle->sle_data_count);
+ }
+
+ mutex_enter(&spa->spa_scrub_lock);
+ spa->spa_load_verify_bytes -= BP_GET_PSIZE(bp);
+ cv_broadcast(&spa->spa_scrub_io_cv);
+ mutex_exit(&spa->spa_scrub_lock);
+}
+
+/*
+ * The maximum number of inflight bytes is a power-of-two fraction of the
+ * ARC size (arc_target_bytes() >> spa_load_verify_shift); by default,
+ * 1/16th of the ARC.
+ */
+int spa_load_verify_shift = 4;
+int spa_load_verify_metadata = B_TRUE;
+int spa_load_verify_data = B_TRUE;
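+
+/*
+ * For example, assuming a hypothetical ARC target of 32 GiB, the default
+ * shift of 4 caps inflight verification I/O at 32 GiB >> 4 = 2 GiB;
+ * raising the shift to 5 would halve that limit to 1 GiB.
+ */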
+
+/*ARGSUSED*/
+static int
+spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
+ const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
+{
+ if (zb->zb_level == ZB_DNODE_LEVEL || BP_IS_HOLE(bp) ||
+ BP_IS_EMBEDDED(bp) || BP_IS_REDACTED(bp))
+ return (0);
+ /*
+ * Note: normally this routine will not be called if
+ * spa_load_verify_metadata is not set. However, it may be useful
+ * to manually set the flag after the traversal has begun.
+ */
+ if (!spa_load_verify_metadata)
+ return (0);
+ if (!BP_IS_METADATA(bp) && !spa_load_verify_data)
+ return (0);
+
+ uint64_t maxinflight_bytes =
+ arc_target_bytes() >> spa_load_verify_shift;
+ zio_t *rio = arg;
+ size_t size = BP_GET_PSIZE(bp);
+
+ mutex_enter(&spa->spa_scrub_lock);
+ while (spa->spa_load_verify_bytes >= maxinflight_bytes)
+ cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
+ spa->spa_load_verify_bytes += size;
+ mutex_exit(&spa->spa_scrub_lock);
+
+ zio_nowait(zio_read(rio, spa, bp, abd_alloc_for_io(size, B_FALSE), size,
+ spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB,
+ ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL |
+ ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb));
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+verify_dataset_name_len(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
+{
+ if (dsl_dataset_namelen(ds) >= ZFS_MAX_DATASET_NAME_LEN)
+ return (SET_ERROR(ENAMETOOLONG));
+
+ return (0);
+}
+
+static int
+spa_load_verify(spa_t *spa)
+{
+ zio_t *rio;
+ spa_load_error_t sle = { 0 };
+ zpool_load_policy_t policy;
+ boolean_t verify_ok = B_FALSE;
+ int error = 0;
+
+ zpool_get_load_policy(spa->spa_config, &policy);
+
+ if (policy.zlp_rewind & ZPOOL_NEVER_REWIND)
+ return (0);
+
+ dsl_pool_config_enter(spa->spa_dsl_pool, FTAG);
+ error = dmu_objset_find_dp(spa->spa_dsl_pool,
+ spa->spa_dsl_pool->dp_root_dir_obj, verify_dataset_name_len, NULL,
+ DS_FIND_CHILDREN);
+ dsl_pool_config_exit(spa->spa_dsl_pool, FTAG);
+ if (error != 0)
+ return (error);
+
+ rio = zio_root(spa, NULL, &sle,
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
+
+ if (spa_load_verify_metadata) {
+ if (spa->spa_extreme_rewind) {
+ spa_load_note(spa, "performing a complete scan of the "
+ "pool since extreme rewind is on. This may take "
+ "a very long time.\n (spa_load_verify_data=%u, "
+ "spa_load_verify_metadata=%u)",
+ spa_load_verify_data, spa_load_verify_metadata);
+ }
+
+ error = traverse_pool(spa, spa->spa_verify_min_txg,
+ TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA |
+ TRAVERSE_NO_DECRYPT, spa_load_verify_cb, rio);
+ }
+
+ (void) zio_wait(rio);
+ ASSERT0(spa->spa_load_verify_bytes);
+
+ spa->spa_load_meta_errors = sle.sle_meta_count;
+ spa->spa_load_data_errors = sle.sle_data_count;
+
+ if (sle.sle_meta_count != 0 || sle.sle_data_count != 0) {
+ spa_load_note(spa, "spa_load_verify found %llu metadata errors "
+ "and %llu data errors", (u_longlong_t)sle.sle_meta_count,
+ (u_longlong_t)sle.sle_data_count);
+ }
+
+ if (spa_load_verify_dryrun ||
+ (!error && sle.sle_meta_count <= policy.zlp_maxmeta &&
+ sle.sle_data_count <= policy.zlp_maxdata)) {
+ int64_t loss = 0;
+
+ verify_ok = B_TRUE;
+ spa->spa_load_txg = spa->spa_uberblock.ub_txg;
+ spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp;
+
+ loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts;
+ VERIFY(nvlist_add_uint64(spa->spa_load_info,
+ ZPOOL_CONFIG_LOAD_TIME, spa->spa_load_txg_ts) == 0);
+ VERIFY(nvlist_add_int64(spa->spa_load_info,
+ ZPOOL_CONFIG_REWIND_TIME, loss) == 0);
+ VERIFY(nvlist_add_uint64(spa->spa_load_info,
+ ZPOOL_CONFIG_LOAD_DATA_ERRORS, sle.sle_data_count) == 0);
+ } else {
+ spa->spa_load_max_txg = spa->spa_uberblock.ub_txg;
+ }
+
+ if (spa_load_verify_dryrun)
+ return (0);
+
+ if (error) {
+ if (error != ENXIO && error != EIO)
+ error = SET_ERROR(EIO);
+ return (error);
+ }
+
+ return (verify_ok ? 0 : EIO);
+}
+
+/*
+ * Find a value in the pool props object.
+ */
+static void
+spa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val)
+{
+ (void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object,
+ zpool_prop_to_name(prop), sizeof (uint64_t), 1, val);
+}
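+
+/*
+ * Callers typically pass the address of a cached spa_t field, e.g.
+ * (illustratively) spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs).
+ * A missing property leaves *val untouched, since the zap_lookup() error
+ * is deliberately ignored.
+ */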
+
+/*
+ * Find a value in the pool directory object.
+ */
+static int
+spa_dir_prop(spa_t *spa, const char *name, uint64_t *val, boolean_t log_enoent)
+{
+ int error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ name, sizeof (uint64_t), 1, val);
+
+ if (error != 0 && (error != ENOENT || log_enoent)) {
+ spa_load_failed(spa, "couldn't get '%s' value in MOS directory "
+ "[error=%d]", name, error);
+ }
+
+ return (error);
+}
+
+static int
+spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err)
+{
+ vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux);
+ return (SET_ERROR(err));
+}
+
+boolean_t
+spa_livelist_delete_check(spa_t *spa)
+{
+ return (spa->spa_livelists_to_delete != 0);
+}
+
+/* ARGSUSED */
+static boolean_t
+spa_livelist_delete_cb_check(void *arg, zthr_t *z)
+{
+ spa_t *spa = arg;
+ return (spa_livelist_delete_check(spa));
+}
+
+static int
+delete_blkptr_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+{
+ spa_t *spa = arg;
+ zio_free(spa, tx->tx_txg, bp);
+ dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, DD_USED_HEAD,
+ -bp_get_dsize_sync(spa, bp),
+ -BP_GET_PSIZE(bp), -BP_GET_UCSIZE(bp), tx);
+ return (0);
+}
+
+static int
+dsl_get_next_livelist_obj(objset_t *os, uint64_t zap_obj, uint64_t *llp)
+{
+ int err;
+ zap_cursor_t zc;
+ zap_attribute_t za;
+ zap_cursor_init(&zc, os, zap_obj);
+ err = zap_cursor_retrieve(&zc, &za);
+ zap_cursor_fini(&zc);
+ if (err == 0)
+ *llp = za.za_first_integer;
+ return (err);
+}
+
+/*
+ * Components of livelist deletion that must be performed in syncing
+ * context: freeing block pointers and updating the pool-wide data
+ * structures to indicate how much work is left to do.
+ */
+typedef struct sublist_delete_arg {
+ spa_t *spa;
+ dsl_deadlist_t *ll;
+ uint64_t key;
+ bplist_t *to_free;
+} sublist_delete_arg_t;
+
+static void
+sublist_delete_sync(void *arg, dmu_tx_t *tx)
+{
+ sublist_delete_arg_t *sda = arg;
+ spa_t *spa = sda->spa;
+ dsl_deadlist_t *ll = sda->ll;
+ uint64_t key = sda->key;
+ bplist_t *to_free = sda->to_free;
+
+ bplist_iterate(to_free, delete_blkptr_cb, spa, tx);
+ dsl_deadlist_remove_entry(ll, key, tx);
+}
+
+typedef struct livelist_delete_arg {
+ spa_t *spa;
+ uint64_t ll_obj;
+ uint64_t zap_obj;
+} livelist_delete_arg_t;
+
+static void
+livelist_delete_sync(void *arg, dmu_tx_t *tx)
+{
+ livelist_delete_arg_t *lda = arg;
+ spa_t *spa = lda->spa;
+ uint64_t ll_obj = lda->ll_obj;
+ uint64_t zap_obj = lda->zap_obj;
+ objset_t *mos = spa->spa_meta_objset;
+ uint64_t count;
+
+ /* free the livelist and decrement the feature count */
+ VERIFY0(zap_remove_int(mos, zap_obj, ll_obj, tx));
+ dsl_deadlist_free(mos, ll_obj, tx);
+ spa_feature_decr(spa, SPA_FEATURE_LIVELIST, tx);
+ VERIFY0(zap_count(mos, zap_obj, &count));
+ if (count == 0) {
+ /* no more livelists to delete */
+ VERIFY0(zap_remove(mos, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_DELETED_CLONES, tx));
+ VERIFY0(zap_destroy(mos, zap_obj, tx));
+ spa->spa_livelists_to_delete = 0;
+ spa_notify_waiters(spa);
+ }
+}
+
+/*
+ * Load in the value for the livelist to be removed and open it. Then,
+ * load its first sublist and determine which block pointers should actually
+ * be freed. Then, call a synctask which performs the actual frees and updates
+ * the pool-wide livelist data.
+ */
+/* ARGSUSED */
+static void
+spa_livelist_delete_cb(void *arg, zthr_t *z)
+{
+ spa_t *spa = arg;
+ uint64_t ll_obj = 0, count;
+ objset_t *mos = spa->spa_meta_objset;
+ uint64_t zap_obj = spa->spa_livelists_to_delete;
+ /*
+ * Determine the next livelist to delete. This function should only
+ * be called if there is at least one deleted clone.
+ */
+ VERIFY0(dsl_get_next_livelist_obj(mos, zap_obj, &ll_obj));
+ VERIFY0(zap_count(mos, ll_obj, &count));
+ if (count > 0) {
+ dsl_deadlist_t *ll;
+ dsl_deadlist_entry_t *dle;
+ bplist_t to_free;
+ ll = kmem_zalloc(sizeof (dsl_deadlist_t), KM_SLEEP);
+ dsl_deadlist_open(ll, mos, ll_obj);
+ dle = dsl_deadlist_first(ll);
+ ASSERT3P(dle, !=, NULL);
+ bplist_create(&to_free);
+ int err = dsl_process_sub_livelist(&dle->dle_bpobj, &to_free,
+ z, NULL);
+ if (err == 0) {
+ sublist_delete_arg_t sync_arg = {
+ .spa = spa,
+ .ll = ll,
+ .key = dle->dle_mintxg,
+ .to_free = &to_free
+ };
+ zfs_dbgmsg("deleting sublist (id %llu) from"
+ " livelist %llu, %d remaining",
+ dle->dle_bpobj.bpo_object, ll_obj, count - 1);
+ VERIFY0(dsl_sync_task(spa_name(spa), NULL,
+ sublist_delete_sync, &sync_arg, 0,
+ ZFS_SPACE_CHECK_DESTROY));
+ } else {
+ VERIFY3U(err, ==, EINTR);
+ }
+ bplist_clear(&to_free);
+ bplist_destroy(&to_free);
+ dsl_deadlist_close(ll);
+ kmem_free(ll, sizeof (dsl_deadlist_t));
+ } else {
+ livelist_delete_arg_t sync_arg = {
+ .spa = spa,
+ .ll_obj = ll_obj,
+ .zap_obj = zap_obj
+ };
+ zfs_dbgmsg("deletion of livelist %llu completed", ll_obj);
+ VERIFY0(dsl_sync_task(spa_name(spa), NULL, livelist_delete_sync,
+ &sync_arg, 0, ZFS_SPACE_CHECK_DESTROY));
+ }
+}
+
+static void
+spa_start_livelist_destroy_thread(spa_t *spa)
+{
+ ASSERT3P(spa->spa_livelist_delete_zthr, ==, NULL);
+ spa->spa_livelist_delete_zthr =
+ zthr_create("z_livelist_destroy",
+ spa_livelist_delete_cb_check, spa_livelist_delete_cb, spa);
+}
+
+typedef struct livelist_new_arg {
+ bplist_t *allocs;
+ bplist_t *frees;
+} livelist_new_arg_t;
+
+static int
+livelist_track_new_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
+ dmu_tx_t *tx)
+{
+ ASSERT(tx == NULL);
+ livelist_new_arg_t *lna = arg;
+ if (bp_freed) {
+ bplist_append(lna->frees, bp);
+ } else {
+ bplist_append(lna->allocs, bp);
+ zfs_livelist_condense_new_alloc++;
+ }
+ return (0);
+}
+
+typedef struct livelist_condense_arg {
+ spa_t *spa;
+ bplist_t to_keep;
+ uint64_t first_size;
+ uint64_t next_size;
+} livelist_condense_arg_t;
+
+static void
+spa_livelist_condense_sync(void *arg, dmu_tx_t *tx)
+{
+ livelist_condense_arg_t *lca = arg;
+ spa_t *spa = lca->spa;
+ bplist_t new_frees;
+ dsl_dataset_t *ds = spa->spa_to_condense.ds;
+
+ /* Have we been cancelled? */
+ if (spa->spa_to_condense.cancelled) {
+ zfs_livelist_condense_sync_cancel++;
+ goto out;
+ }
+
+ dsl_deadlist_entry_t *first = spa->spa_to_condense.first;
+ dsl_deadlist_entry_t *next = spa->spa_to_condense.next;
+ dsl_deadlist_t *ll = &ds->ds_dir->dd_livelist;
+
+ /*
+ * It's possible that the livelist was changed while the zthr was
+ * running. Therefore, we need to check for new blkptrs in the two
+ * entries being condensed and continue to track them in the livelist.
+ * Because of the way we handle remapped blkptrs (see dbuf_remap_impl),
+ * it's possible that the newly added blkptrs are FREEs or ALLOCs so
+ * we need to sort them into two different bplists.
+ */
+ uint64_t first_obj = first->dle_bpobj.bpo_object;
+ uint64_t next_obj = next->dle_bpobj.bpo_object;
+ uint64_t cur_first_size = first->dle_bpobj.bpo_phys->bpo_num_blkptrs;
+ uint64_t cur_next_size = next->dle_bpobj.bpo_phys->bpo_num_blkptrs;
+
+ bplist_create(&new_frees);
+ livelist_new_arg_t new_bps = {
+ .allocs = &lca->to_keep,
+ .frees = &new_frees,
+ };
+
+ if (cur_first_size > lca->first_size) {
+ VERIFY0(livelist_bpobj_iterate_from_nofree(&first->dle_bpobj,
+ livelist_track_new_cb, &new_bps, lca->first_size));
+ }
+ if (cur_next_size > lca->next_size) {
+ VERIFY0(livelist_bpobj_iterate_from_nofree(&next->dle_bpobj,
+ livelist_track_new_cb, &new_bps, lca->next_size));
+ }
+
+ dsl_deadlist_clear_entry(first, ll, tx);
+ ASSERT(bpobj_is_empty(&first->dle_bpobj));
+ dsl_deadlist_remove_entry(ll, next->dle_mintxg, tx);
+
+ bplist_iterate(&lca->to_keep, dsl_deadlist_insert_alloc_cb, ll, tx);
+ bplist_iterate(&new_frees, dsl_deadlist_insert_free_cb, ll, tx);
+ bplist_destroy(&new_frees);
+
+ char dsname[ZFS_MAX_DATASET_NAME_LEN];
+ dsl_dataset_name(ds, dsname);
+ zfs_dbgmsg("txg %llu condensing livelist of %s (id %llu), bpobj %llu "
+ "(%llu blkptrs) and bpobj %llu (%llu blkptrs) -> bpobj %llu "
+ "(%llu blkptrs)", tx->tx_txg, dsname, ds->ds_object, first_obj,
+ cur_first_size, next_obj, cur_next_size,
+ first->dle_bpobj.bpo_object,
+ first->dle_bpobj.bpo_phys->bpo_num_blkptrs);
+out:
+ dmu_buf_rele(ds->ds_dbuf, spa);
+ spa->spa_to_condense.ds = NULL;
+ bplist_clear(&lca->to_keep);
+ bplist_destroy(&lca->to_keep);
+ kmem_free(lca, sizeof (livelist_condense_arg_t));
+ spa->spa_to_condense.syncing = B_FALSE;
+}
+
+static void
+spa_livelist_condense_cb(void *arg, zthr_t *t)
+{
+ while (zfs_livelist_condense_zthr_pause &&
+ !(zthr_has_waiters(t) || zthr_iscancelled(t)))
+ delay(1);
+
+ spa_t *spa = arg;
+ dsl_deadlist_entry_t *first = spa->spa_to_condense.first;
+ dsl_deadlist_entry_t *next = spa->spa_to_condense.next;
+ uint64_t first_size, next_size;
+
+ livelist_condense_arg_t *lca =
+ kmem_alloc(sizeof (livelist_condense_arg_t), KM_SLEEP);
+ bplist_create(&lca->to_keep);
+
+ /*
+ * Process the livelists (matching FREEs and ALLOCs) in open context
+ * so we have minimal work in syncing context to condense.
+ *
+ * We save bpobj sizes (first_size and next_size) to use later in
+ * syncing context to determine if entries were added to these sublists
+ * while in open context. This is possible because the clone is still
+ * active and open for normal writes and we want to make sure the new,
+ * unprocessed blockpointers are inserted into the livelist normally.
+ *
+ * Note that dsl_process_sub_livelist() both records the number of
+ * block pointers and iterates over them while the bpobj's lock is
+ * held, so the sizes returned to us are consistent with what was
+ * actually processed.
+ */
+ int err = dsl_process_sub_livelist(&first->dle_bpobj, &lca->to_keep, t,
+ &first_size);
+ if (err == 0)
+ err = dsl_process_sub_livelist(&next->dle_bpobj, &lca->to_keep,
+ t, &next_size);
+
+ if (err == 0) {
+ while (zfs_livelist_condense_sync_pause &&
+ !(zthr_has_waiters(t) || zthr_iscancelled(t)))
+ delay(1);
+
+ dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
+ dmu_tx_mark_netfree(tx);
+ dmu_tx_hold_space(tx, 1);
+ err = dmu_tx_assign(tx, TXG_NOWAIT | TXG_NOTHROTTLE);
+ if (err == 0) {
+ /*
+ * Prevent the condense zthr from restarting before
+ * the synctask completes.
+ */
+ spa->spa_to_condense.syncing = B_TRUE;
+ lca->spa = spa;
+ lca->first_size = first_size;
+ lca->next_size = next_size;
+ dsl_sync_task_nowait(spa_get_dsl(spa),
+ spa_livelist_condense_sync, lca, tx);
+ dmu_tx_commit(tx);
+ return;
+ }
+ }
+ /*
+ * Condensing cannot continue: either it was externally stopped or
+ * we were unable to assign to a tx because the pool has run out of
+ * space. In the second case, we'll just end up trying to condense
+ * again in a later txg.
+ */
+ ASSERT(err != 0);
+ bplist_clear(&lca->to_keep);
+ bplist_destroy(&lca->to_keep);
+ kmem_free(lca, sizeof (livelist_condense_arg_t));
+ dmu_buf_rele(spa->spa_to_condense.ds->ds_dbuf, spa);
+ spa->spa_to_condense.ds = NULL;
+ if (err == EINTR)
+ zfs_livelist_condense_zthr_cancel++;
+}
+
+/* ARGSUSED */
+/*
+ * Check that there is something to condense but that a condense is not
+ * already in progress and that condensing has not been cancelled.
+ */
+static boolean_t
+spa_livelist_condense_cb_check(void *arg, zthr_t *z)
+{
+ spa_t *spa = arg;
+ if ((spa->spa_to_condense.ds != NULL) &&
+ (spa->spa_to_condense.syncing == B_FALSE) &&
+ (spa->spa_to_condense.cancelled == B_FALSE)) {
+ return (B_TRUE);
+ }
+ return (B_FALSE);
+}
+
+static void
+spa_start_livelist_condensing_thread(spa_t *spa)
+{
+ spa->spa_to_condense.ds = NULL;
+ spa->spa_to_condense.first = NULL;
+ spa->spa_to_condense.next = NULL;
+ spa->spa_to_condense.syncing = B_FALSE;
+ spa->spa_to_condense.cancelled = B_FALSE;
+
+ ASSERT3P(spa->spa_livelist_condense_zthr, ==, NULL);
+ spa->spa_livelist_condense_zthr =
+ zthr_create("z_livelist_condense",
+ spa_livelist_condense_cb_check,
+ spa_livelist_condense_cb, spa);
+}
+
+static void
+spa_spawn_aux_threads(spa_t *spa)
+{
+ ASSERT(spa_writeable(spa));
+
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+ spa_start_indirect_condensing_thread(spa);
+ spa_start_livelist_destroy_thread(spa);
+ spa_start_livelist_condensing_thread(spa);
+
+ ASSERT3P(spa->spa_checkpoint_discard_zthr, ==, NULL);
+ spa->spa_checkpoint_discard_zthr =
+ zthr_create("z_checkpoint_discard",
+ spa_checkpoint_discard_thread_check,
+ spa_checkpoint_discard_thread, spa);
+}
+
+/*
+ * Fix up config after a partly-completed split. This is done with the
+ * ZPOOL_CONFIG_SPLIT nvlist. Both the splitting pool and the split-off
+ * pool have that entry in their config, but only the splitting one contains
+ * a list of all the guids of the vdevs that are being split off.
+ *
+ * This function determines what to do with that list: either rejoin
+ * all the disks to the pool, or complete the splitting process. To attempt
+ * the rejoin, each disk that is offlined is marked online again, and
+ * we do a reopen() call. If the vdev label for every disk that was
+ * marked online indicates it was successfully split off (VDEV_AUX_SPLIT_POOL)
+ * then we call vdev_split() on each disk, and complete the split.
+ *
+ * Otherwise we leave the config alone, with all the vdevs in place in
+ * the original pool.
+ */
+static void
+spa_try_repair(spa_t *spa, nvlist_t *config)
+{
+ uint_t extracted;
+ uint64_t *glist;
+ uint_t i, gcount;
+ nvlist_t *nvl;
+ vdev_t **vd;
+ boolean_t attempt_reopen;
+
+ if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) != 0)
+ return;
+
+ /* check that the config is complete */
+ if (nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST,
+ &glist, &gcount) != 0)
+ return;
+
+ vd = kmem_zalloc(gcount * sizeof (vdev_t *), KM_SLEEP);
+
+ /* attempt to online all the vdevs & validate */
+ attempt_reopen = B_TRUE;
+ for (i = 0; i < gcount; i++) {
+ if (glist[i] == 0) /* vdev is hole */
+ continue;
+
+ vd[i] = spa_lookup_by_guid(spa, glist[i], B_FALSE);
+ if (vd[i] == NULL) {
+ /*
+ * Don't bother attempting to reopen the disks;
+ * just do the split.
+ */
+ attempt_reopen = B_FALSE;
+ } else {
+ /* attempt to re-online it */
+ vd[i]->vdev_offline = B_FALSE;
+ }
+ }
+
+ if (attempt_reopen) {
+ vdev_reopen(spa->spa_root_vdev);
+
+ /* check each device to see what state it's in */
+ for (extracted = 0, i = 0; i < gcount; i++) {
+ if (vd[i] != NULL &&
+ vd[i]->vdev_stat.vs_aux != VDEV_AUX_SPLIT_POOL)
+ break;
+ ++extracted;
+ }
+ }
+
+ /*
+ * If every disk has been moved to the new pool, or if we never
+ * even attempted to look at them, then we split them off for
+ * good.
+ */
+ if (!attempt_reopen || gcount == extracted) {
+ for (i = 0; i < gcount; i++)
+ if (vd[i] != NULL)
+ vdev_split(vd[i]);
+ vdev_reopen(spa->spa_root_vdev);
+ }
+
+ kmem_free(vd, gcount * sizeof (vdev_t *));
+}
+
+static int
+spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type)
+{
+ char *ereport = FM_EREPORT_ZFS_POOL;
+ int error;
+
+ spa->spa_load_state = state;
+ (void) spa_import_progress_set_state(spa_guid(spa),
+ spa_load_state(spa));
+
+ gethrestime(&spa->spa_loaded_ts);
+ error = spa_load_impl(spa, type, &ereport);
+
+ /*
+ * Don't count references from objsets that are already closed
+ * and are making their way through the eviction process.
+ */
+ spa_evicting_os_wait(spa);
+ spa->spa_minref = zfs_refcount_count(&spa->spa_refcount);
+ if (error) {
+ if (error != EEXIST) {
+ spa->spa_loaded_ts.tv_sec = 0;
+ spa->spa_loaded_ts.tv_nsec = 0;
+ }
+ if (error != EBADF) {
+ (void) zfs_ereport_post(ereport, spa,
+ NULL, NULL, NULL, 0);
+ }
+ }
+ spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE;
+ spa->spa_ena = 0;
+
+ (void) spa_import_progress_set_state(spa_guid(spa),
+ spa_load_state(spa));
+
+ return (error);
+}
+
+#ifdef ZFS_DEBUG
+/*
+ * Count the number of per-vdev ZAPs associated with all of the vdevs in the
+ * vdev tree rooted in the given vd, and ensure that each ZAP is present in the
+ * spa's per-vdev ZAP list.
+ */
+static uint64_t
+vdev_count_verify_zaps(vdev_t *vd)
+{
+ spa_t *spa = vd->vdev_spa;
+ uint64_t total = 0;
+
+ if (vd->vdev_top_zap != 0) {
+ total++;
+ ASSERT0(zap_lookup_int(spa->spa_meta_objset,
+ spa->spa_all_vdev_zaps, vd->vdev_top_zap));
+ }
+ if (vd->vdev_leaf_zap != 0) {
+ total++;
+ ASSERT0(zap_lookup_int(spa->spa_meta_objset,
+ spa->spa_all_vdev_zaps, vd->vdev_leaf_zap));
+ }
+
+ for (uint64_t i = 0; i < vd->vdev_children; i++) {
+ total += vdev_count_verify_zaps(vd->vdev_child[i]);
+ }
+
+ return (total);
+}
+#endif
+
+/*
+ * Determine whether the activity check is required.
+ */
+static boolean_t
+spa_activity_check_required(spa_t *spa, uberblock_t *ub, nvlist_t *label,
+ nvlist_t *config)
+{
+ uint64_t state = 0;
+ uint64_t hostid = 0;
+ uint64_t tryconfig_txg = 0;
+ uint64_t tryconfig_timestamp = 0;
+ uint16_t tryconfig_mmp_seq = 0;
+ nvlist_t *nvinfo;
+
+ if (nvlist_exists(config, ZPOOL_CONFIG_LOAD_INFO)) {
+ nvinfo = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO);
+ (void) nvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_TXG,
+ &tryconfig_txg);
+ (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_TIMESTAMP,
+ &tryconfig_timestamp);
+ (void) nvlist_lookup_uint16(nvinfo, ZPOOL_CONFIG_MMP_SEQ,
+ &tryconfig_mmp_seq);
+ }
+
+ (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE, &state);
+
+ /*
+ * Disable the MMP activity check; this is used by zdb, which
+ * is intended to be used on potentially active pools.
+ */
+ if (spa->spa_import_flags & ZFS_IMPORT_SKIP_MMP)
+ return (B_FALSE);
+
+ /*
+ * Skip the activity check when the MMP feature is disabled.
+ */
+ if (ub->ub_mmp_magic == MMP_MAGIC && ub->ub_mmp_delay == 0)
+ return (B_FALSE);
+
+ /*
+ * If the tryconfig_ values are nonzero, they are the results of an
+ * earlier tryimport. If they all match the uberblock we just found,
+ * then the pool has not changed and we return false so we do not test
+ * a second time.
+ */
+ if (tryconfig_txg && tryconfig_txg == ub->ub_txg &&
+ tryconfig_timestamp && tryconfig_timestamp == ub->ub_timestamp &&
+ tryconfig_mmp_seq && tryconfig_mmp_seq ==
+ (MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0))
+ return (B_FALSE);
+
+ /*
+ * Allow the activity check to be skipped when importing the pool
+ * on the same host that last imported it. Since the hostid in the
+ * configuration may be stale, use the one read from the label.
+ */
+ if (nvlist_exists(label, ZPOOL_CONFIG_HOSTID))
+ hostid = fnvlist_lookup_uint64(label, ZPOOL_CONFIG_HOSTID);
+
+ if (hostid == spa_get_hostid(spa))
+ return (B_FALSE);
+
+ /*
+ * Skip the activity test when the pool was cleanly exported.
+ */
+ if (state != POOL_STATE_ACTIVE)
+ return (B_FALSE);
+
+ return (B_TRUE);
+}
+
+/*
+ * The duration, in nanoseconds, for which the activity check must watch
+ * for changes on disk.
+ */
+static uint64_t
+spa_activity_check_duration(spa_t *spa, uberblock_t *ub)
+{
+ uint64_t import_intervals = MAX(zfs_multihost_import_intervals, 1);
+ uint64_t multihost_interval = MSEC2NSEC(
+ MMP_INTERVAL_OK(zfs_multihost_interval));
+ uint64_t import_delay = MAX(NANOSEC, import_intervals *
+ multihost_interval);
+
+ /*
+ * Local tunables determine a minimum duration except for the case
+ * where we know when the remote host will suspend the pool if MMP
+ * writes do not land.
+ *
+ * See Big Theory comment at the top of mmp.c for the reasoning behind
+ * these cases and times.
+ */
+
+ ASSERT(MMP_IMPORT_SAFETY_FACTOR >= 100);
+
+ if (MMP_INTERVAL_VALID(ub) && MMP_FAIL_INT_VALID(ub) &&
+ MMP_FAIL_INT(ub) > 0) {
+
+ /* MMP on remote host will suspend pool after failed writes */
+ import_delay = MMP_FAIL_INT(ub) * MSEC2NSEC(MMP_INTERVAL(ub)) *
+ MMP_IMPORT_SAFETY_FACTOR / 100;
+
+ zfs_dbgmsg("fail_intvals>0 import_delay=%llu ub_mmp "
+ "mmp_fails=%llu ub_mmp mmp_interval=%llu "
+ "import_intervals=%u", import_delay, MMP_FAIL_INT(ub),
+ MMP_INTERVAL(ub), import_intervals);
+
+ } else if (MMP_INTERVAL_VALID(ub) && MMP_FAIL_INT_VALID(ub) &&
+ MMP_FAIL_INT(ub) == 0) {
+
+ /* MMP on remote host will never suspend pool */
+ import_delay = MAX(import_delay, (MSEC2NSEC(MMP_INTERVAL(ub)) +
+ ub->ub_mmp_delay) * import_intervals);
+
+ zfs_dbgmsg("fail_intvals=0 import_delay=%llu ub_mmp "
+ "mmp_interval=%llu ub_mmp_delay=%llu "
+ "import_intervals=%u", import_delay, MMP_INTERVAL(ub),
+ ub->ub_mmp_delay, import_intervals);
+
+ } else if (MMP_VALID(ub)) {
+ /*
+ * zfs-0.7 compatibility case
+ */
+
+ import_delay = MAX(import_delay, (multihost_interval +
+ ub->ub_mmp_delay) * import_intervals);
+
+ zfs_dbgmsg("import_delay=%llu ub_mmp_delay=%llu "
+ "import_intervals=%u leaves=%u", import_delay,
+ ub->ub_mmp_delay, import_intervals,
+ vdev_count_leaves(spa));
+ } else {
+ /* Using local tunings is the only reasonable option */
+ zfs_dbgmsg("pool last imported on non-MMP aware "
+ "host using import_delay=%llu multihost_interval=%llu "
+ "import_intervals=%u", import_delay, multihost_interval,
+ import_intervals);
+ }
+
+ return (import_delay);
+}
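+
+/*
+ * Worked example with assumed values: if the uberblock reports
+ * MMP_FAIL_INT(ub) == 10 and MMP_INTERVAL(ub) == 1000ms, the first case
+ * above gives import_delay = 10 * 1s * MMP_IMPORT_SAFETY_FACTOR / 100,
+ * i.e. 20 seconds for a safety factor of 200%.
+ */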
+
+/*
+ * Perform the import activity check. If the user canceled the import or
+ * we detected activity, then fail.
+ */
+static int
+spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config)
+{
+ uint64_t txg = ub->ub_txg;
+ uint64_t timestamp = ub->ub_timestamp;
+ uint64_t mmp_config = ub->ub_mmp_config;
+ uint16_t mmp_seq = MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0;
+ uint64_t import_delay;
+ hrtime_t import_expire;
+ nvlist_t *mmp_label = NULL;
+ vdev_t *rvd = spa->spa_root_vdev;
+ kcondvar_t cv;
+ kmutex_t mtx;
+ int error = 0;
+
+ cv_init(&cv, NULL, CV_DEFAULT, NULL);
+ mutex_init(&mtx, NULL, MUTEX_DEFAULT, NULL);
+ mutex_enter(&mtx);
+
+ /*
+ * If ZPOOL_CONFIG_MMP_TXG is present, an activity check was performed
+ * during the earlier tryimport. If the txg recorded there is 0 then
+ * the pool is known to be active on another host.
+ *
+ * Otherwise, the pool might be in use on another host. Check for
+ * changes in the uberblocks on disk if necessary.
+ */
+ if (nvlist_exists(config, ZPOOL_CONFIG_LOAD_INFO)) {
+ nvlist_t *nvinfo = fnvlist_lookup_nvlist(config,
+ ZPOOL_CONFIG_LOAD_INFO);
+
+ if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_TXG) &&
+ fnvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_TXG) == 0) {
+ vdev_uberblock_load(rvd, ub, &mmp_label);
+ error = SET_ERROR(EREMOTEIO);
+ goto out;
+ }
+ }
+
+ import_delay = spa_activity_check_duration(spa, ub);
+
+ /* Add a small random factor in case of simultaneous imports (0-25%) */
+ import_delay += import_delay * spa_get_random(250) / 1000;
+
+ import_expire = gethrtime() + import_delay;
+
+ while (gethrtime() < import_expire) {
+ (void) spa_import_progress_set_mmp_check(spa_guid(spa),
+ NSEC2SEC(import_expire - gethrtime()));
+
+ vdev_uberblock_load(rvd, ub, &mmp_label);
+
+ if (txg != ub->ub_txg || timestamp != ub->ub_timestamp ||
+ mmp_seq != (MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0)) {
+ zfs_dbgmsg("multihost activity detected "
+ "txg %llu ub_txg %llu "
+ "timestamp %llu ub_timestamp %llu "
+ "mmp_config %#llx ub_mmp_config %#llx",
+ txg, ub->ub_txg, timestamp, ub->ub_timestamp,
+ mmp_config, ub->ub_mmp_config);
+
+ error = SET_ERROR(EREMOTEIO);
+ break;
+ }
+
+ if (mmp_label) {
+ nvlist_free(mmp_label);
+ mmp_label = NULL;
+ }
+
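+ /*
+ * cv_timedwait_sig() returns -1 only when the timeout expires.
+ * Nothing ever signals this cv, so any other return value means
+ * the wait was interrupted and is treated as the user cancelling
+ * the import.
+ */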
+ error = cv_timedwait_sig(&cv, &mtx, ddi_get_lbolt() + hz);
+ if (error != -1) {
+ error = SET_ERROR(EINTR);
+ break;
+ }
+ error = 0;
+ }
+
+out:
+ mutex_exit(&mtx);
+ mutex_destroy(&mtx);
+ cv_destroy(&cv);
+
+ /*
+ * If the pool is determined to be active, store the status in the
+ * spa->spa_load_info nvlist. If the remote hostname or hostid are
+ * available from the configuration read from disk, store them as well.
+ * This allows 'zpool import' to generate a more useful message.
+ *
+ * ZPOOL_CONFIG_MMP_STATE - observed pool status (mandatory)
+ * ZPOOL_CONFIG_MMP_HOSTNAME - hostname from the active pool
+ * ZPOOL_CONFIG_MMP_HOSTID - hostid from the active pool
+ */
+ if (error == EREMOTEIO) {
+ char *hostname = "<unknown>";
+ uint64_t hostid = 0;
+
+ if (mmp_label) {
+ if (nvlist_exists(mmp_label, ZPOOL_CONFIG_HOSTNAME)) {
+ hostname = fnvlist_lookup_string(mmp_label,
+ ZPOOL_CONFIG_HOSTNAME);
+ fnvlist_add_string(spa->spa_load_info,
+ ZPOOL_CONFIG_MMP_HOSTNAME, hostname);
+ }
+
+ if (nvlist_exists(mmp_label, ZPOOL_CONFIG_HOSTID)) {
+ hostid = fnvlist_lookup_uint64(mmp_label,
+ ZPOOL_CONFIG_HOSTID);
+ fnvlist_add_uint64(spa->spa_load_info,
+ ZPOOL_CONFIG_MMP_HOSTID, hostid);
+ }
+ }
+
+ fnvlist_add_uint64(spa->spa_load_info,
+ ZPOOL_CONFIG_MMP_STATE, MMP_STATE_ACTIVE);
+ fnvlist_add_uint64(spa->spa_load_info,
+ ZPOOL_CONFIG_MMP_TXG, 0);
+
+ error = spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO);
+ }
+
+ if (mmp_label)
+ nvlist_free(mmp_label);
+
+ return (error);
+}
+
+static int
+spa_verify_host(spa_t *spa, nvlist_t *mos_config)
+{
+ uint64_t hostid;
+ char *hostname;
+ uint64_t myhostid = 0;
+
+ if (!spa_is_root(spa) && nvlist_lookup_uint64(mos_config,
+ ZPOOL_CONFIG_HOSTID, &hostid) == 0) {
+ hostname = fnvlist_lookup_string(mos_config,
+ ZPOOL_CONFIG_HOSTNAME);
+
+ myhostid = zone_get_hostid(NULL);
+
+ if (hostid != 0 && myhostid != 0 && hostid != myhostid) {
+ cmn_err(CE_WARN, "pool '%s' could not be "
+ "loaded as it was last accessed by "
+ "another system (host: %s hostid: 0x%llx). "
+ "See: https://openzfs.github.io/openzfs-docs/msg/"
+ "ZFS-8000-EY",
+ spa_name(spa), hostname, (u_longlong_t)hostid);
+ spa_load_failed(spa, "hostid verification failed: pool "
+ "last accessed by host: %s (hostid: 0x%llx)",
+ hostname, (u_longlong_t)hostid);
+ return (SET_ERROR(EBADF));
+ }
+ }
+
+ return (0);
+}
+
+static int
+spa_ld_parse_config(spa_t *spa, spa_import_type_t type)
+{
+ int error = 0;
+ nvlist_t *nvtree, *nvl, *config = spa->spa_config;
+ int parse;
+ vdev_t *rvd;
+ uint64_t pool_guid;
+ char *comment;
+ char *compatibility;
+
+ /*
+ * Versioning wasn't explicitly added to the label until later, so if
+ * it's not present treat it as the initial version.
+ */
+ if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
+ &spa->spa_ubsync.ub_version) != 0)
+ spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL;
+
+ if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) {
+ spa_load_failed(spa, "invalid config provided: '%s' missing",
+ ZPOOL_CONFIG_POOL_GUID);
+ return (SET_ERROR(EINVAL));
+ }
+
+ /*
+ * If we are doing an import, ensure that the pool is not already
+ * imported by checking if its pool guid already exists in the
+ * spa namespace.
+ *
+ * The only case in which we allow an already imported pool to be
+ * imported again is when the pool is checkpointed and we want to
+ * look at its checkpointed state from userland tools like zdb.
+ */
+#ifdef _KERNEL
+ if ((spa->spa_load_state == SPA_LOAD_IMPORT ||
+ spa->spa_load_state == SPA_LOAD_TRYIMPORT) &&
+ spa_guid_exists(pool_guid, 0)) {
+#else
+ if ((spa->spa_load_state == SPA_LOAD_IMPORT ||
+ spa->spa_load_state == SPA_LOAD_TRYIMPORT) &&
+ spa_guid_exists(pool_guid, 0) &&
+ !spa_importing_readonly_checkpoint(spa)) {
+#endif
+ spa_load_failed(spa, "a pool with guid %llu is already open",
+ (u_longlong_t)pool_guid);
+ return (SET_ERROR(EEXIST));
+ }
+
+ spa->spa_config_guid = pool_guid;
+
+ nvlist_free(spa->spa_load_info);
+ spa->spa_load_info = fnvlist_alloc();
+
+ ASSERT(spa->spa_comment == NULL);
+ if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0)
+ spa->spa_comment = spa_strdup(comment);
+
+ ASSERT(spa->spa_compatibility == NULL);
+ if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMPATIBILITY,
+ &compatibility) == 0)
+ spa->spa_compatibility = spa_strdup(compatibility);
+
+ (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
+ &spa->spa_config_txg);
+
+ if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) == 0)
+ spa->spa_config_splitting = fnvlist_dup(nvl);
+
+ if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvtree)) {
+ spa_load_failed(spa, "invalid config provided: '%s' missing",
+ ZPOOL_CONFIG_VDEV_TREE);
+ return (SET_ERROR(EINVAL));
+ }
+
+ /*
+ * Create "The Godfather" zio to hold all async IOs
+ */
+ spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *),
+ KM_SLEEP);
+ for (int i = 0; i < max_ncpus; i++) {
+ spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL,
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
+ ZIO_FLAG_GODFATHER);
+ }
+
+ /*
+ * Parse the configuration into a vdev tree. We explicitly set the
+ * value that will be returned by spa_version() since parsing the
+ * configuration requires knowing the version number.
+ */
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+ parse = (type == SPA_IMPORT_EXISTING ?
+ VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT);
+ error = spa_config_parse(spa, &rvd, nvtree, NULL, 0, parse);
+ spa_config_exit(spa, SCL_ALL, FTAG);
+
+ if (error != 0) {
+ spa_load_failed(spa, "unable to parse config [error=%d]",
+ error);
+ return (error);
+ }
+
+ ASSERT(spa->spa_root_vdev == rvd);
+ ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT);
+ ASSERT3U(spa->spa_max_ashift, <=, SPA_MAXBLOCKSHIFT);
+
+ if (type != SPA_IMPORT_ASSEMBLE) {
+ ASSERT(spa_guid(spa) == pool_guid);
+ }
+
+ return (0);
+}
+
+/*
+ * Recursively open all vdevs in the vdev tree. This function is called twice:
+ * first with the untrusted config, then with the trusted config.
+ */
+static int
+spa_ld_open_vdevs(spa_t *spa)
+{
+ int error = 0;
+
+ /*
+ * spa_missing_tvds_allowed defines how many top-level vdevs can be
+ * missing/unopenable for the root vdev to still be considered openable.
+ */
+ if (spa->spa_trust_config) {
+ spa->spa_missing_tvds_allowed = zfs_max_missing_tvds;
+ } else if (spa->spa_config_source == SPA_CONFIG_SRC_CACHEFILE) {
+ spa->spa_missing_tvds_allowed = zfs_max_missing_tvds_cachefile;
+ } else if (spa->spa_config_source == SPA_CONFIG_SRC_SCAN) {
+ spa->spa_missing_tvds_allowed = zfs_max_missing_tvds_scan;
+ } else {
+ spa->spa_missing_tvds_allowed = 0;
+ }
+
+ spa->spa_missing_tvds_allowed =
+ MAX(zfs_max_missing_tvds, spa->spa_missing_tvds_allowed);
+
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+ error = vdev_open(spa->spa_root_vdev);
+ spa_config_exit(spa, SCL_ALL, FTAG);
+
+ if (spa->spa_missing_tvds != 0) {
+ spa_load_note(spa, "vdev tree has %lld missing top-level "
+ "vdevs.", (u_longlong_t)spa->spa_missing_tvds);
+ if (spa->spa_trust_config && (spa->spa_mode & SPA_MODE_WRITE)) {
+ /*
+ * Although theoretically we could allow users to open
+ * incomplete pools in RW mode, we'd need to add a lot
+ * of extra logic (e.g. adjust pool space to account
+ * for missing vdevs).
+ * This limitation also prevents users from accidentally
+ * opening the pool in RW mode during data recovery and
+ * damaging it further.
+ */
+ spa_load_note(spa, "pools with missing top-level "
+ "vdevs can only be opened in read-only mode.");
+ error = SET_ERROR(ENXIO);
+ } else {
+ spa_load_note(spa, "current settings allow for maximum "
+ "%lld missing top-level vdevs at this stage.",
+ (u_longlong_t)spa->spa_missing_tvds_allowed);
+ }
+ }
+ if (error != 0) {
+ spa_load_failed(spa, "unable to open vdev tree [error=%d]",
+ error);
+ }
+ if (spa->spa_missing_tvds != 0 || error != 0)
+ vdev_dbgmsg_print_tree(spa->spa_root_vdev, 2);
+
+ return (error);
+}
+
+/*
+ * We need to validate the vdev labels against the configuration that
+ * we have in hand. This function is called twice: first with an untrusted
+ * config, then with a trusted config. The validation is more strict when the
+ * config is trusted.
+ */
+static int
+spa_ld_validate_vdevs(spa_t *spa)
+{
+ int error = 0;
+ vdev_t *rvd = spa->spa_root_vdev;
+
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+ error = vdev_validate(rvd);
+ spa_config_exit(spa, SCL_ALL, FTAG);
+
+ if (error != 0) {
+ spa_load_failed(spa, "vdev_validate failed [error=%d]", error);
+ return (error);
+ }
+
+ if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
+ spa_load_failed(spa, "cannot open vdev tree after invalidating "
+ "some vdevs");
+ vdev_dbgmsg_print_tree(rvd, 2);
+ return (SET_ERROR(ENXIO));
+ }
+
+ return (0);
+}
+
+static void
+spa_ld_select_uberblock_done(spa_t *spa, uberblock_t *ub)
+{
+ spa->spa_state = POOL_STATE_ACTIVE;
+ spa->spa_ubsync = spa->spa_uberblock;
+ spa->spa_verify_min_txg = spa->spa_extreme_rewind ?
+ TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1;
+ spa->spa_first_txg = spa->spa_last_ubsync_txg ?
+ spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1;
+ spa->spa_claim_max_txg = spa->spa_first_txg;
+ spa->spa_prev_software_version = ub->ub_software_version;
+}
+
+static int
+spa_ld_select_uberblock(spa_t *spa, spa_import_type_t type)
+{
+ vdev_t *rvd = spa->spa_root_vdev;
+ nvlist_t *label;
+ uberblock_t *ub = &spa->spa_uberblock;
+ boolean_t activity_check = B_FALSE;
+
+ /*
+ * If we are opening the checkpointed state of the pool by
+ * rewinding to it, at this point we will have written the
+ * checkpointed uberblock to the vdev labels, so searching
+ * the labels will find the right uberblock. However, if
+ * we are opening the checkpointed state read-only, we have
+ * not modified the labels. Therefore, we must ignore the
+ * labels and continue using the spa_uberblock that was set
+ * by spa_ld_checkpoint_rewind.
+ *
+ * Note that it would be fine to ignore the labels when
+ * rewinding (opening writeable) as well. However, if we
+ * crash just after writing the labels, we will end up
+ * searching the labels. Doing so in the common case means
+ * that this code path gets exercised normally, rather than
+ * just in the edge case.
+ */
+ if (ub->ub_checkpoint_txg != 0 &&
+ spa_importing_readonly_checkpoint(spa)) {
+ spa_ld_select_uberblock_done(spa, ub);
+ return (0);
+ }
+
+ /*
+ * Find the best uberblock.
+ */
+ vdev_uberblock_load(rvd, ub, &label);
+
+ /*
+ * If we weren't able to find a single valid uberblock, return failure.
+ */
+ if (ub->ub_txg == 0) {
+ nvlist_free(label);
+ spa_load_failed(spa, "no valid uberblock found");
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO));
+ }
+
+ if (spa->spa_load_max_txg != UINT64_MAX) {
+ (void) spa_import_progress_set_max_txg(spa_guid(spa),
+ (u_longlong_t)spa->spa_load_max_txg);
+ }
+ spa_load_note(spa, "using uberblock with txg=%llu",
+ (u_longlong_t)ub->ub_txg);
+
+ /*
+ * For pools that have the multihost property enabled, determine
+ * whether the pool is truly inactive and can be safely imported.
+ * Prevent hosts which don't have a hostid set from importing the pool.
+ */
+ activity_check = spa_activity_check_required(spa, ub, label,
+ spa->spa_config);
+ if (activity_check) {
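+ /* An MMP-protected pool cannot be imported by a host with no hostid. */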
+ if (ub->ub_mmp_magic == MMP_MAGIC && ub->ub_mmp_delay &&
+ spa_get_hostid(spa) == 0) {
+ nvlist_free(label);
+ fnvlist_add_uint64(spa->spa_load_info,
+ ZPOOL_CONFIG_MMP_STATE, MMP_STATE_NO_HOSTID);
+ return (spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO));
+ }
+
+ int error = spa_activity_check(spa, ub, spa->spa_config);
+ if (error) {
+ nvlist_free(label);
+ return (error);
+ }
+
+ fnvlist_add_uint64(spa->spa_load_info,
+ ZPOOL_CONFIG_MMP_STATE, MMP_STATE_INACTIVE);
+ fnvlist_add_uint64(spa->spa_load_info,
+ ZPOOL_CONFIG_MMP_TXG, ub->ub_txg);
+ fnvlist_add_uint16(spa->spa_load_info,
+ ZPOOL_CONFIG_MMP_SEQ,
+ (MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0));
+ }
+
+ /*
+ * If the pool has an unsupported version we can't open it.
+ */
+ if (!SPA_VERSION_IS_SUPPORTED(ub->ub_version)) {
+ nvlist_free(label);
+ spa_load_failed(spa, "version %llu is not supported",
+ (u_longlong_t)ub->ub_version);
+ return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP));
+ }
+
+ if (ub->ub_version >= SPA_VERSION_FEATURES) {
+ nvlist_t *features;
+
+ /*
+ * If we weren't able to find what's necessary for reading the
+ * MOS in the label, return failure.
+ */
+ if (label == NULL) {
+ spa_load_failed(spa, "label config unavailable");
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
+ ENXIO));
+ }
+
+ if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_FEATURES_FOR_READ,
+ &features) != 0) {
+ nvlist_free(label);
+ spa_load_failed(spa, "invalid label: '%s' missing",
+ ZPOOL_CONFIG_FEATURES_FOR_READ);
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
+ ENXIO));
+ }
+
+ /*
+ * Update our in-core representation with the definitive values
+ * from the label.
+ */
+ nvlist_free(spa->spa_label_features);
+ VERIFY(nvlist_dup(features, &spa->spa_label_features, 0) == 0);
+ }
+
+ nvlist_free(label);
+
+ /*
+ * Look through entries in the label nvlist's features_for_read. If
+ * there is a feature listed there that we don't understand, then we
+ * cannot open the pool.
+ */
+ if (ub->ub_version >= SPA_VERSION_FEATURES) {
+ nvlist_t *unsup_feat;
+
+ VERIFY(nvlist_alloc(&unsup_feat, NV_UNIQUE_NAME, KM_SLEEP) ==
+ 0);
+
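+ /* Record every label feature this build does not support. */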
+ for (nvpair_t *nvp = nvlist_next_nvpair(spa->spa_label_features,
+ NULL); nvp != NULL;
+ nvp = nvlist_next_nvpair(spa->spa_label_features, nvp)) {
+ if (!zfeature_is_supported(nvpair_name(nvp))) {
+ VERIFY(nvlist_add_string(unsup_feat,
+ nvpair_name(nvp), "") == 0);
+ }
+ }
+
+ if (!nvlist_empty(unsup_feat)) {
+ VERIFY(nvlist_add_nvlist(spa->spa_load_info,
+ ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat) == 0);
+ nvlist_free(unsup_feat);
+ spa_load_failed(spa, "some features are unsupported");
+ return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT,
+ ENOTSUP));
+ }
+
+ nvlist_free(unsup_feat);
+ }
+
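+ /* If a vdev split was left in progress, try to repair it now. */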
+ if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) {
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+ spa_try_repair(spa, spa->spa_config);
+ spa_config_exit(spa, SCL_ALL, FTAG);
+ nvlist_free(spa->spa_config_splitting);
+ spa->spa_config_splitting = NULL;
+ }
+
+ /*
+ * Initialize internal SPA structures.
+ */
+ spa_ld_select_uberblock_done(spa, ub);
+
+ return (0);
+}
+
+static int
+spa_ld_open_rootbp(spa_t *spa)
+{
+ int error = 0;
+ vdev_t *rvd = spa->spa_root_vdev;
+
+ error = dsl_pool_init(spa, spa->spa_first_txg, &spa->spa_dsl_pool);
+ if (error != 0) {
+ spa_load_failed(spa, "unable to open rootbp in dsl_pool_init "
+ "[error=%d]", error);
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+ }
+ spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;
+
+ return (0);
+}
+
+static int
+spa_ld_trusted_config(spa_t *spa, spa_import_type_t type,
+ boolean_t reloading)
+{
+ vdev_t *mrvd, *rvd = spa->spa_root_vdev;
+ nvlist_t *nv, *mos_config, *policy;
+ int error = 0, copy_error;
+ uint64_t healthy_tvds, healthy_tvds_mos;
+ uint64_t mos_config_txg;
+
+ if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object, B_TRUE)
+ != 0)
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+
+ /*
+ * If we're assembling a pool from a split, the config provided is
+ * already trusted so there is nothing to do.
+ */
+ if (type == SPA_IMPORT_ASSEMBLE)
+ return (0);
+
+ healthy_tvds = spa_healthy_core_tvds(spa);
+
+ if (load_nvlist(spa, spa->spa_config_object, &mos_config)
+ != 0) {
+ spa_load_failed(spa, "unable to retrieve MOS config");
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+ }
+
+ /*
+ * If we are doing an open, the pool owner hasn't been verified yet,
+ * so do the verification here.
+ */
+ if (spa->spa_load_state == SPA_LOAD_OPEN) {
+ error = spa_verify_host(spa, mos_config);
+ if (error != 0) {
+ nvlist_free(mos_config);
+ return (error);
+ }
+ }
+
+ nv = fnvlist_lookup_nvlist(mos_config, ZPOOL_CONFIG_VDEV_TREE);
+
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+
+ /*
+ * Build a new vdev tree from the trusted config
+ */
+ error = spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD);
+ if (error != 0) {
+ nvlist_free(mos_config);
+ spa_config_exit(spa, SCL_ALL, FTAG);
+ spa_load_failed(spa, "spa_config_parse failed [error=%d]",
+ error);
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error));
+ }
+
+ /*
+ * Vdev paths in the MOS may be obsolete. If the untrusted config was
+ * obtained by scanning /dev/dsk, then it will have the right vdev
+ * paths. We update the trusted MOS config with this information.
+ * We first try to copy the paths with vdev_copy_path_strict, which
+ * succeeds only when both configs have exactly the same vdev tree.
+ * If that fails, we fall back to a more flexible method with a
+ * best-effort policy.
+ */
+ copy_error = vdev_copy_path_strict(rvd, mrvd);
+ if (copy_error != 0 || spa_load_print_vdev_tree) {
+ spa_load_note(spa, "provided vdev tree:");
+ vdev_dbgmsg_print_tree(rvd, 2);
+ spa_load_note(spa, "MOS vdev tree:");
+ vdev_dbgmsg_print_tree(mrvd, 2);
+ }
+ if (copy_error != 0) {
+ spa_load_note(spa, "vdev_copy_path_strict failed, falling "
+ "back to vdev_copy_path_relaxed");
+ vdev_copy_path_relaxed(rvd, mrvd);
+ }
+
+ vdev_close(rvd);
+ vdev_free(rvd);
+ spa->spa_root_vdev = mrvd;
+ rvd = mrvd;
+ spa_config_exit(spa, SCL_ALL, FTAG);
+
+ /*
+ * We will use spa_config if we decide to reload the spa or if spa_load
+ * fails and we rewind. We must thus regenerate the config using the
+ * MOS information with the updated paths. ZPOOL_LOAD_POLICY is used to
+ * pass settings on how to load the pool and is not stored in the MOS.
+ * We copy it over to our new, trusted config.
+ */
+ mos_config_txg = fnvlist_lookup_uint64(mos_config,
+ ZPOOL_CONFIG_POOL_TXG);
+ nvlist_free(mos_config);
+ mos_config = spa_config_generate(spa, NULL, mos_config_txg, B_FALSE);
+ if (nvlist_lookup_nvlist(spa->spa_config, ZPOOL_LOAD_POLICY,
+ &policy) == 0)
+ fnvlist_add_nvlist(mos_config, ZPOOL_LOAD_POLICY, policy);
+ spa_config_set(spa, mos_config);
+ spa->spa_config_source = SPA_CONFIG_SRC_MOS;
+
+ /*
+ * Now that we have the config from the MOS, we should be more strict
+ * in checking blkptrs and can make assumptions about the consistency
+ * of the vdev tree. spa_trust_config must be set to true before opening
+ * vdevs in order for them to be writeable.
+ */
+ spa->spa_trust_config = B_TRUE;
+
+ /*
+ * Open and validate the new vdev tree
+ */
+ error = spa_ld_open_vdevs(spa);
+ if (error != 0)
+ return (error);
+
+ error = spa_ld_validate_vdevs(spa);
+ if (error != 0)
+ return (error);
+
+ if (copy_error != 0 || spa_load_print_vdev_tree) {
+ spa_load_note(spa, "final vdev tree:");
+ vdev_dbgmsg_print_tree(rvd, 2);
+ }
+
+ if (spa->spa_load_state != SPA_LOAD_TRYIMPORT &&
+ !spa->spa_extreme_rewind && zfs_max_missing_tvds == 0) {
+ /*
+ * Sanity check to make sure that we are indeed loading the
+ * latest uberblock. If we missed SPA_SYNC_MIN_VDEVS tvds
+ * in the config provided and they happened to be the only ones
+ * to have the latest uberblock, we could involuntarily perform
+ * an extreme rewind.
+ */
+ healthy_tvds_mos = spa_healthy_core_tvds(spa);
+ if (healthy_tvds_mos - healthy_tvds >=
+ SPA_SYNC_MIN_VDEVS) {
+ spa_load_note(spa, "config provided misses too many "
+ "top-level vdevs compared to MOS (%lld vs %lld). ",
+ (u_longlong_t)healthy_tvds,
+ (u_longlong_t)healthy_tvds_mos);
+ spa_load_note(spa, "vdev tree:");
+ vdev_dbgmsg_print_tree(rvd, 2);
+ if (reloading) {
+ spa_load_failed(spa, "config was already "
+ "provided from MOS. Aborting.");
+ return (spa_vdev_err(rvd,
+ VDEV_AUX_CORRUPT_DATA, EIO));
+ }
+ spa_load_note(spa, "spa must be reloaded using MOS "
+ "config");
+ return (SET_ERROR(EAGAIN));
+ }
+ }
+
+ error = spa_check_for_missing_logs(spa);
+ if (error != 0)
+ return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO));
+
+ if (rvd->vdev_guid_sum != spa->spa_uberblock.ub_guid_sum) {
+ spa_load_failed(spa, "uberblock guid sum doesn't match MOS "
+ "guid sum (%llu != %llu)",
+ (u_longlong_t)spa->spa_uberblock.ub_guid_sum,
+ (u_longlong_t)rvd->vdev_guid_sum);
+ return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM,
+ ENXIO));
+ }
+
+ return (0);
+}
+
+static int
+spa_ld_open_indirect_vdev_metadata(spa_t *spa)
+{
+ int error = 0;
+ vdev_t *rvd = spa->spa_root_vdev;
+
+ /*
+ * Everything that we read before spa_remove_init() must be stored
+ * on concrete vdevs. Therefore we do this as early as possible.
+ */
+ error = spa_remove_init(spa);
+ if (error != 0) {
+ spa_load_failed(spa, "spa_remove_init failed [error=%d]",
+ error);
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+ }
+
+ /*
+ * Retrieve information needed to condense indirect vdev mappings.
+ */
+ error = spa_condense_init(spa);
+ if (error != 0) {
+ spa_load_failed(spa, "spa_condense_init failed [error=%d]",
+ error);
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error));
+ }
+
+ return (0);
+}
+
+static int
+spa_ld_check_features(spa_t *spa, boolean_t *missing_feat_writep)
+{
+ int error = 0;
+ vdev_t *rvd = spa->spa_root_vdev;
+
+ if (spa_version(spa) >= SPA_VERSION_FEATURES) {
+ boolean_t missing_feat_read = B_FALSE;
+ nvlist_t *unsup_feat, *enabled_feat;
+
+ if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_READ,
+ &spa->spa_feat_for_read_obj, B_TRUE) != 0) {
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+ }
+
+ if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_WRITE,
+ &spa->spa_feat_for_write_obj, B_TRUE) != 0) {
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+ }
+
+ if (spa_dir_prop(spa, DMU_POOL_FEATURE_DESCRIPTIONS,
+ &spa->spa_feat_desc_obj, B_TRUE) != 0) {
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+ }
+
+ enabled_feat = fnvlist_alloc();
+ unsup_feat = fnvlist_alloc();
+
+ if (!spa_features_check(spa, B_FALSE,
+ unsup_feat, enabled_feat))
+ missing_feat_read = B_TRUE;
+
+ if (spa_writeable(spa) ||
+ spa->spa_load_state == SPA_LOAD_TRYIMPORT) {
+ if (!spa_features_check(spa, B_TRUE,
+ unsup_feat, enabled_feat)) {
+ *missing_feat_writep = B_TRUE;
+ }
+ }
+
+ fnvlist_add_nvlist(spa->spa_load_info,
+ ZPOOL_CONFIG_ENABLED_FEAT, enabled_feat);
+
+ if (!nvlist_empty(unsup_feat)) {
+ fnvlist_add_nvlist(spa->spa_load_info,
+ ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat);
+ }
+
+ fnvlist_free(enabled_feat);
+ fnvlist_free(unsup_feat);
+
+ if (!missing_feat_read) {
+ fnvlist_add_boolean(spa->spa_load_info,
+ ZPOOL_CONFIG_CAN_RDONLY);
+ }
+
+ /*
+ * If the state is SPA_LOAD_TRYIMPORT, our objective is
+ * twofold: to determine whether the pool is available for
+ * import in read-write mode and (if it is not) whether the
+ * pool is available for import in read-only mode. If the pool
+ * is available for import in read-write mode, it is displayed
+ * as available in userland; if it is not available for import
+ * in read-only mode, it is displayed as unavailable in
+ * userland. If the pool is available for import in read-only
+ * mode but not read-write mode, it is displayed as unavailable
+ * in userland with a special note that the pool is actually
+ * available for open in read-only mode.
+ *
+ * As a result, if the state is SPA_LOAD_TRYIMPORT and we are
+ * missing a feature for write, we must first determine whether
+ * the pool can be opened read-only before returning to
+ * userland in order to know whether to display the
+ * abovementioned note.
+ */
+ if (missing_feat_read || (*missing_feat_writep &&
+ spa_writeable(spa))) {
+ spa_load_failed(spa, "pool uses unsupported features");
+ return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT,
+ ENOTSUP));
+ }
+
+ /*
+ * Load refcounts for ZFS features from disk into an in-memory
+ * cache during SPA initialization.
+ */
+ for (spa_feature_t i = 0; i < SPA_FEATURES; i++) {
+ uint64_t refcount;
+
+ error = feature_get_refcount_from_disk(spa,
+ &spa_feature_table[i], &refcount);
+ if (error == 0) {
+ spa->spa_feat_refcount_cache[i] = refcount;
+ } else if (error == ENOTSUP) {
+ spa->spa_feat_refcount_cache[i] =
+ SPA_FEATURE_DISABLED;
+ } else {
+ spa_load_failed(spa, "error getting refcount "
+ "for feature %s [error=%d]",
+ spa_feature_table[i].fi_guid, error);
+ return (spa_vdev_err(rvd,
+ VDEV_AUX_CORRUPT_DATA, EIO));
+ }
+ }
+ }
+
+ if (spa_feature_is_active(spa, SPA_FEATURE_ENABLED_TXG)) {
+ if (spa_dir_prop(spa, DMU_POOL_FEATURE_ENABLED_TXG,
+ &spa->spa_feat_enabled_txg_obj, B_TRUE) != 0)
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+ }
+
+ /*
+ * Encryption was added before bookmark_v2, even though bookmark_v2
+ * is now a dependency. If this pool has encryption enabled without
+ * bookmark_v2, trigger an errata message.
+ */
+ if (spa_feature_is_enabled(spa, SPA_FEATURE_ENCRYPTION) &&
+ !spa_feature_is_enabled(spa, SPA_FEATURE_BOOKMARK_V2)) {
+ spa->spa_errata = ZPOOL_ERRATA_ZOL_8308_ENCRYPTION;
+ }
+
+ return (0);
+}
+
+static int
+spa_ld_load_special_directories(spa_t *spa)
+{
+ int error = 0;
+ vdev_t *rvd = spa->spa_root_vdev;
+
+ spa->spa_is_initializing = B_TRUE;
+ error = dsl_pool_open(spa->spa_dsl_pool);
+ spa->spa_is_initializing = B_FALSE;
+ if (error != 0) {
+ spa_load_failed(spa, "dsl_pool_open failed [error=%d]", error);
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+ }
+
+ return (0);
+}
+
+static int
+spa_ld_get_props(spa_t *spa)
+{
+ int error = 0;
+ uint64_t obj;
+ vdev_t *rvd = spa->spa_root_vdev;
+
+ /* Grab the checksum salt from the MOS. */
+ error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_CHECKSUM_SALT, 1,
+ sizeof (spa->spa_cksum_salt.zcs_bytes),
+ spa->spa_cksum_salt.zcs_bytes);
+ if (error == ENOENT) {
+ /* Generate a new salt for subsequent use */
+ (void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes,
+ sizeof (spa->spa_cksum_salt.zcs_bytes));
+ } else if (error != 0) {
+ spa_load_failed(spa, "unable to retrieve checksum salt from "
+ "MOS [error=%d]", error);
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+ }
+
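+ /* Open the deferred-frees bpobj referenced by the MOS directory. */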
+ if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj, B_TRUE) != 0)
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+ error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj);
+ if (error != 0) {
+ spa_load_failed(spa, "error opening deferred-frees bpobj "
+ "[error=%d]", error);
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+ }
+
+ /*
+ * Load the bit that tells us to use the new accounting function
+ * (raid-z deflation). If we have an older pool, this will not
+ * be present.
+ */
+ error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate, B_FALSE);
+ if (error != 0 && error != ENOENT)
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+
+ error = spa_dir_prop(spa, DMU_POOL_CREATION_VERSION,
+ &spa->spa_creation_version, B_FALSE);
+ if (error != 0 && error != ENOENT)
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+
+ /*
+ * Load the persistent error log. If we have an older pool, this will
+ * not be present.
+ */
+ error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last,
+ B_FALSE);
+ if (error != 0 && error != ENOENT)
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+
+ error = spa_dir_prop(spa, DMU_POOL_ERRLOG_SCRUB,
+ &spa->spa_errlog_scrub, B_FALSE);
+ if (error != 0 && error != ENOENT)
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+
+ /*
+ * Load the livelist deletion field. If a livelist is queued for
+ * deletion, indicate that in the spa.
+ */
+ error = spa_dir_prop(spa, DMU_POOL_DELETED_CLONES,
+ &spa->spa_livelists_to_delete, B_FALSE);
+ if (error != 0 && error != ENOENT)
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+
+ /*
+ * Load the history object. If we have an older pool, this
+ * will not be present.
+ */
+ error = spa_dir_prop(spa, DMU_POOL_HISTORY, &spa->spa_history, B_FALSE);
+ if (error != 0 && error != ENOENT)
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+
+ /*
+ * Load the per-vdev ZAP map. If we have an older pool, this will not
+ * be present; in this case, defer its creation to a later time to
+ * avoid dirtying the MOS this early (outside of a sync context). See
+ * spa_sync_config_object.
+ */
+
+ /* The sentinel is only available in the MOS config. */
+ nvlist_t *mos_config;
+ if (load_nvlist(spa, spa->spa_config_object, &mos_config) != 0) {
+ spa_load_failed(spa, "unable to retrieve MOS config");
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+ }
+
+ error = spa_dir_prop(spa, DMU_POOL_VDEV_ZAP_MAP,
+ &spa->spa_all_vdev_zaps, B_FALSE);
+
+ if (error == ENOENT) {
+ VERIFY(!nvlist_exists(mos_config,
+ ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS));
+ spa->spa_avz_action = AVZ_ACTION_INITIALIZE;
+ ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev));
+ } else if (error != 0) {
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+ } else if (!nvlist_exists(mos_config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)) {
+ /*
+ * An older version of ZFS overwrote the sentinel value, so
+ * we have orphaned per-vdev ZAPs in the MOS. Defer their
+ * destruction to later; see spa_sync_config_object.
+ */
+ spa->spa_avz_action = AVZ_ACTION_DESTROY;
+ /*
+ * We're assuming that no vdevs have had their ZAPs created
+ * before this. Better be sure of it.
+ */
+ ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev));
+ }
+ nvlist_free(mos_config);
+
+ spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
+
+ error = spa_dir_prop(spa, DMU_POOL_PROPS, &spa->spa_pool_props_object,
+ B_FALSE);
+ if (error && error != ENOENT)
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+
+ if (error == 0) {
+ uint64_t autoreplace;
+
+ spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs);
+ spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace);
+ spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation);
+ spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode);
+ spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand);
+ spa_prop_find(spa, ZPOOL_PROP_MULTIHOST, &spa->spa_multihost);
+ spa_prop_find(spa, ZPOOL_PROP_AUTOTRIM, &spa->spa_autotrim);
+ spa->spa_autoreplace = (autoreplace != 0);
+ }
+
+ /*
+ * If we are importing a pool with missing top-level vdevs,
+ * we enforce that the pool doesn't panic or get suspended on
+ * error since the likelihood of missing data is extremely high.
+ */
+ if (spa->spa_missing_tvds > 0 &&
+ spa->spa_failmode != ZIO_FAILURE_MODE_CONTINUE &&
+ spa->spa_load_state != SPA_LOAD_TRYIMPORT) {
+ spa_load_note(spa, "forcing failmode to 'continue' "
+ "as some top level vdevs are missing");
+ spa->spa_failmode = ZIO_FAILURE_MODE_CONTINUE;
+ }
+
+ return (0);
+}
+
+static int
+spa_ld_open_aux_vdevs(spa_t *spa, spa_import_type_t type)
+{
+ int error = 0;
+ vdev_t *rvd = spa->spa_root_vdev;
+
+ /*
+ * If we're assembling the pool from the split-off vdevs of
+ * an existing pool, we don't want to attach the spares & cache
+ * devices.
+ */
+
+ /*
+ * Load any hot spares for this pool.
+ */
+ error = spa_dir_prop(spa, DMU_POOL_SPARES, &spa->spa_spares.sav_object,
+ B_FALSE);
+ if (error != 0 && error != ENOENT)
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+ if (error == 0 && type != SPA_IMPORT_ASSEMBLE) {
+ ASSERT(spa_version(spa) >= SPA_VERSION_SPARES);
+ if (load_nvlist(spa, spa->spa_spares.sav_object,
+ &spa->spa_spares.sav_config) != 0) {
+ spa_load_failed(spa, "error loading spares nvlist");
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+ }
+
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+ spa_load_spares(spa);
+ spa_config_exit(spa, SCL_ALL, FTAG);
+ } else if (error == 0) {
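+ /* Assembling from a split: skip loading spares, just mark for sync. */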
+ spa->spa_spares.sav_sync = B_TRUE;
+ }
+
+ /*
+ * Load any level 2 ARC devices for this pool.
+ */
+ error = spa_dir_prop(spa, DMU_POOL_L2CACHE,
+ &spa->spa_l2cache.sav_object, B_FALSE);
+ if (error != 0 && error != ENOENT)
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+ if (error == 0 && type != SPA_IMPORT_ASSEMBLE) {
+ ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE);
+ if (load_nvlist(spa, spa->spa_l2cache.sav_object,
+ &spa->spa_l2cache.sav_config) != 0) {
+ spa_load_failed(spa, "error loading l2cache nvlist");
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+ }
+
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+ spa_load_l2cache(spa);
+ spa_config_exit(spa, SCL_ALL, FTAG);
+ } else if (error == 0) {
+ spa->spa_l2cache.sav_sync = B_TRUE;
+ }
+
+ return (0);
+}
+
+static int
+spa_ld_load_vdev_metadata(spa_t *spa)
+{
+ int error = 0;
+ vdev_t *rvd = spa->spa_root_vdev;
+
+ /*
+ * If the 'multihost' property is set, then never allow a pool to
+ * be imported when the system hostid is zero. The exception to
+ * this rule is zdb, which is always allowed to access pools.
+ */
+ if (spa_multihost(spa) && spa_get_hostid(spa) == 0 &&
+ (spa->spa_import_flags & ZFS_IMPORT_SKIP_MMP) == 0) {
+ fnvlist_add_uint64(spa->spa_load_info,
+ ZPOOL_CONFIG_MMP_STATE, MMP_STATE_NO_HOSTID);
+ return (spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO));
+ }
+
+ /*
+ * If the 'autoreplace' property is set, then post a resource notifying
+ * the ZFS DE that it should not issue any faults for unopenable
+ * devices. We also iterate over the vdevs, and post a sysevent for any
+ * unopenable vdevs so that the normal autoreplace handler can take
+ * over.
+ */
+ if (spa->spa_autoreplace && spa->spa_load_state != SPA_LOAD_TRYIMPORT) {
+ spa_check_removed(spa->spa_root_vdev);
+ /*
+ * For the import case, this is done in spa_import(), because
+ * at this point we're using the spare definitions from
+ * the MOS config, not necessarily from the userland config.
+ */
+ if (spa->spa_load_state != SPA_LOAD_IMPORT) {
+ spa_aux_check_removed(&spa->spa_spares);
+ spa_aux_check_removed(&spa->spa_l2cache);
+ }
+ }
+
+ /*
+ * Load the vdev metadata such as metaslabs, DTLs, spacemap object, etc.
+ */
+ error = vdev_load(rvd);
+ if (error != 0) {
+ spa_load_failed(spa, "vdev_load failed [error=%d]", error);
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error));
+ }
+
+ error = spa_ld_log_spacemaps(spa);
+ if (error != 0) {
+ spa_load_failed(spa, "spa_ld_log_sm_data failed [error=%d]",
+ error);
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error));
+ }
+
+ /*
+ * Propagate the leaf DTLs we just loaded all the way up the vdev tree.
+ */
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+ vdev_dtl_reassess(rvd, 0, 0, B_FALSE, B_FALSE);
+ spa_config_exit(spa, SCL_ALL, FTAG);
+
+ return (0);
+}
+
+static int
+spa_ld_load_dedup_tables(spa_t *spa)
+{
+ int error = 0;
+ vdev_t *rvd = spa->spa_root_vdev;
+
+ error = ddt_load(spa);
+ if (error != 0) {
+ spa_load_failed(spa, "ddt_load failed [error=%d]", error);
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+ }
+
+ return (0);
+}
+
+static int
+spa_ld_verify_logs(spa_t *spa, spa_import_type_t type, char **ereport)
+{
+ vdev_t *rvd = spa->spa_root_vdev;
+
+ if (type != SPA_IMPORT_ASSEMBLE && spa_writeable(spa)) {
+ boolean_t missing = spa_check_logs(spa);
+ if (missing) {
+ if (spa->spa_missing_tvds != 0) {
+ spa_load_note(spa, "spa_check_logs failed "
+ "so dropping the logs");
+ } else {
+ *ereport = FM_EREPORT_ZFS_LOG_REPLAY;
+ spa_load_failed(spa, "spa_check_logs failed");
+ return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG,
+ ENXIO));
+ }
+ }
+ }
+
+ return (0);
+}
+
+static int
+spa_ld_verify_pool_data(spa_t *spa)
+{
+ int error = 0;
+ vdev_t *rvd = spa->spa_root_vdev;
+
+ /*
+ * We've successfully opened the pool, verify that we're ready
+ * to start pushing transactions.
+ */
+ if (spa->spa_load_state != SPA_LOAD_TRYIMPORT) {
+ error = spa_load_verify(spa);
+ if (error != 0) {
+ spa_load_failed(spa, "spa_load_verify failed "
+ "[error=%d]", error);
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
+ error));
+ }
+ }
+
+ return (0);
+}
+
+static void
+spa_ld_claim_log_blocks(spa_t *spa)
+{
+ dmu_tx_t *tx;
+ dsl_pool_t *dp = spa_get_dsl(spa);
+
+ /*
+ * Claim log blocks that haven't been committed yet.
+ * This must all happen in a single txg.
+ * Note: spa_claim_max_txg is updated by spa_claim_notify(),
+ * invoked from zil_claim_log_block()'s i/o done callback.
+ * Price of rollback is that we abandon the log.
+ */
+ spa->spa_claiming = B_TRUE;
+
+ tx = dmu_tx_create_assigned(dp, spa_first_txg(spa));
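+ /* Walk every dataset and claim its ZIL blocks within this txg. */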
+ (void) dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
+ zil_claim, tx, DS_FIND_CHILDREN);
+ dmu_tx_commit(tx);
+
+ spa->spa_claiming = B_FALSE;
+
+ spa_set_log_state(spa, SPA_LOG_GOOD);
+}
+
+static void
+spa_ld_check_for_config_update(spa_t *spa, uint64_t config_cache_txg,
+ boolean_t update_config_cache)
+{
+ vdev_t *rvd = spa->spa_root_vdev;
+ int need_update = B_FALSE;
+
+ /*
+ * If the config cache is stale, or we have uninitialized
+ * metaslabs (see spa_vdev_add()), then update the config.
+ *
+ * If this is a verbatim import, trust the current
+ * in-core spa_config and update the disk labels.
+ */
+ if (update_config_cache || config_cache_txg != spa->spa_config_txg ||
+ spa->spa_load_state == SPA_LOAD_IMPORT ||
+ spa->spa_load_state == SPA_LOAD_RECOVER ||
+ (spa->spa_import_flags & ZFS_IMPORT_VERBATIM))
+ need_update = B_TRUE;
+
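+ /* A top-level vdev without a metaslab array is still uninitialized. */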
+ for (int c = 0; c < rvd->vdev_children; c++)
+ if (rvd->vdev_child[c]->vdev_ms_array == 0)
+ need_update = B_TRUE;
+
+ /*
+ * Update the config cache asynchronously in case we're the
+ * root pool, in which case the config cache isn't writable yet.
+ */
+ if (need_update)
+ spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
+}
+
+static void
+spa_ld_prepare_for_reload(spa_t *spa)
+{
+ spa_mode_t mode = spa->spa_mode;
+ int async_suspended = spa->spa_async_suspended;
+
+ spa_unload(spa);
+ spa_deactivate(spa);
+ spa_activate(spa, mode);
+
+ /*
+ * We save the value of spa_async_suspended as it gets reset to 0 by
+ * spa_unload(). We want to restore it to the original value before
+ * returning, as we might call spa_async_resume() later.
+ */
+ spa->spa_async_suspended = async_suspended;
+}
+
+static int
+spa_ld_read_checkpoint_txg(spa_t *spa)
+{
+ uberblock_t checkpoint;
+ int error = 0;
+
+ ASSERT0(spa->spa_checkpoint_txg);
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
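+ /* The checkpointed uberblock, if any, lives in the MOS directory. */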
+ error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t),
+ sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint);
+
+ if (error == ENOENT)
+ return (0);
+
+ if (error != 0)
+ return (error);
+
+ ASSERT3U(checkpoint.ub_txg, !=, 0);
+ ASSERT3U(checkpoint.ub_checkpoint_txg, !=, 0);
+ ASSERT3U(checkpoint.ub_timestamp, !=, 0);
+ spa->spa_checkpoint_txg = checkpoint.ub_txg;
+ spa->spa_checkpoint_info.sci_timestamp = checkpoint.ub_timestamp;
+
+ return (0);
+}
+
+static int
+spa_ld_mos_init(spa_t *spa, spa_import_type_t type)
+{
+ int error = 0;
+
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+ ASSERT(spa->spa_config_source != SPA_CONFIG_SRC_NONE);
+
+ /*
+ * Never trust the config that is provided unless we are assembling
+ * a pool following a split.
+ * This means don't trust blkptrs and the vdev tree in general. This
+ * also effectively puts the spa in read-only mode since
+ * spa_writeable() checks for spa_trust_config to be true.
+ * We will later load a trusted config from the MOS.
+ */
+ if (type != SPA_IMPORT_ASSEMBLE)
+ spa->spa_trust_config = B_FALSE;
+
+ /*
+ * Parse the config provided to create a vdev tree.
+ */
+ error = spa_ld_parse_config(spa, type);
+ if (error != 0)
+ return (error);
+
+ spa_import_progress_add(spa);
+
+ /*
+ * Now that we have the vdev tree, try to open each vdev. This involves
+ * opening the underlying physical device, retrieving its geometry and
+ * probing the vdev with a dummy I/O. The state of each vdev will be set
+ * based on the success of those operations. After this we'll be ready
+ * to read from the vdevs.
+ */
+ error = spa_ld_open_vdevs(spa);
+ if (error != 0)
+ return (error);
+
+ /*
+ * Read the label of each vdev and make sure that the GUIDs stored
+ * there match the GUIDs in the config provided.
+ * If we're assembling a new pool that's been split off from an
+ * existing pool, the labels haven't yet been updated so we skip
+ * validation for now.
+ */
+ if (type != SPA_IMPORT_ASSEMBLE) {
+ error = spa_ld_validate_vdevs(spa);
+ if (error != 0)
+ return (error);
+ }
+
+ /*
+ * Read all vdev labels to find the best uberblock (i.e. latest,
+ * unless spa_load_max_txg is set) and store it in spa_uberblock. We
+ * get the list of features required to read blkptrs in the MOS from
+ * the vdev label with the best uberblock and verify that our version
+ * of zfs supports them all.
+ */
+ error = spa_ld_select_uberblock(spa, type);
+ if (error != 0)
+ return (error);
+
+ /*
+ * Pass that uberblock to the dsl_pool layer which will open the root
+ * blkptr. This blkptr points to the latest version of the MOS and will
+ * allow us to read its contents.
+ */
+ error = spa_ld_open_rootbp(spa);
+ if (error != 0)
+ return (error);
+
+ return (0);
+}
+
+static int
+spa_ld_checkpoint_rewind(spa_t *spa)
+{
+ uberblock_t checkpoint;
+ int error = 0;
+
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+ ASSERT(spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT);
+
+ error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t),
+ sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint);
+
+ if (error != 0) {
+ spa_load_failed(spa, "unable to retrieve checkpointed "
+ "uberblock from the MOS config [error=%d]", error);
+
+ if (error == ENOENT)
+ error = ZFS_ERR_NO_CHECKPOINT;
+
+ return (error);
+ }
+
+ ASSERT3U(checkpoint.ub_txg, <, spa->spa_uberblock.ub_txg);
+ ASSERT3U(checkpoint.ub_txg, ==, checkpoint.ub_checkpoint_txg);
+
+ /*
+ * We need to update the txg and timestamp of the checkpointed
+ * uberblock to be higher than the latest one. This ensures that
+ * the checkpointed uberblock is selected if we were to close and
+ * reopen the pool right after we've written it in the vdev labels.
+ * (also see block comment in vdev_uberblock_compare)
+ */
+ checkpoint.ub_txg = spa->spa_uberblock.ub_txg + 1;
+ checkpoint.ub_timestamp = gethrestime_sec();
+
+ /*
+ * Set current uberblock to be the checkpointed uberblock.
+ */
+ spa->spa_uberblock = checkpoint;
+
+ /*
+ * If we are doing a normal rewind, then the pool is open for
+ * writing and we sync the "updated" checkpointed uberblock to
+ * disk. Once this is done, we've basically rewound the whole
+ * pool and there is no way back.
+ *
+ * There are cases where we don't want to attempt to sync the
+ * checkpointed uberblock to disk because we are opening a
+ * pool as read-only. Specifically, verifying the checkpointed
+ * state with zdb, and importing the checkpointed state to get
+ * a "preview" of its content.
+ */
+ if (spa_writeable(spa)) {
+ vdev_t *rvd = spa->spa_root_vdev;
+
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+ vdev_t *svd[SPA_SYNC_MIN_VDEVS] = { NULL };
+ int svdcount = 0;
+ int children = rvd->vdev_children;
+ int c0 = spa_get_random(children);
+
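+ /*
+ * Choose up to SPA_SYNC_MIN_VDEVS concrete, non-log top-level
+ * vdevs, starting from a random child, to sync the labels to.
+ */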
+ for (int c = 0; c < children; c++) {
+ vdev_t *vd = rvd->vdev_child[(c0 + c) % children];
+
+ /* Stop when revisiting the first vdev */
+ if (c > 0 && svd[0] == vd)
+ break;
+
+ if (vd->vdev_ms_array == 0 || vd->vdev_islog ||
+ !vdev_is_concrete(vd))
+ continue;
+
+ svd[svdcount++] = vd;
+ if (svdcount == SPA_SYNC_MIN_VDEVS)
+ break;
+ }
+ error = vdev_config_sync(svd, svdcount, spa->spa_first_txg);
+ if (error == 0)
+ spa->spa_last_synced_guid = rvd->vdev_guid;
+ spa_config_exit(spa, SCL_ALL, FTAG);
+
+ if (error != 0) {
+ spa_load_failed(spa, "failed to write checkpointed "
+ "uberblock to the vdev labels [error=%d]", error);
+ return (error);
+ }
+ }
+
+ return (0);
+}
+
+static int
+spa_ld_mos_with_trusted_config(spa_t *spa, spa_import_type_t type,
+ boolean_t *update_config_cache)
+{
+ int error;
+
+ /*
+ * Parse the config for pool, open and validate vdevs,
+ * select an uberblock, and use that uberblock to open
+ * the MOS.
+ */
+ error = spa_ld_mos_init(spa, type);
+ if (error != 0)
+ return (error);
+
+ /*
+ * Retrieve the trusted config stored in the MOS and use it to create
+ * a new, exact version of the vdev tree, then reopen all vdevs.
+ */
+ error = spa_ld_trusted_config(spa, type, B_FALSE);
+ if (error == EAGAIN) {
+ if (update_config_cache != NULL)
+ *update_config_cache = B_TRUE;
+
+ /*
+ * Redo the loading process with the trusted config if it is
+ * too different from the untrusted config.
+ */
+ spa_ld_prepare_for_reload(spa);
+ spa_load_note(spa, "RELOADING");
+ error = spa_ld_mos_init(spa, type);
+ if (error != 0)
+ return (error);
+
+ error = spa_ld_trusted_config(spa, type, B_TRUE);
+ if (error != 0)
+ return (error);
+
+ } else if (error != 0) {
+ return (error);
+ }
+
+ return (0);
+}
+
+/*
+ * Load an existing storage pool, using the config provided. This config
+ * describes which vdevs are part of the pool and is later validated against
+ * partial configs present in each vdev's label and an entire copy of the
+ * config stored in the MOS.
+ */
+static int
+spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport)
+{
+ int error = 0;
+ boolean_t missing_feat_write = B_FALSE;
+ boolean_t checkpoint_rewind =
+ (spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT);
+ boolean_t update_config_cache = B_FALSE;
+
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+ ASSERT(spa->spa_config_source != SPA_CONFIG_SRC_NONE);
+
+ spa_load_note(spa, "LOADING");
+
+ error = spa_ld_mos_with_trusted_config(spa, type, &update_config_cache);
+ if (error != 0)
+ return (error);
+
+ /*
+ * If we are rewinding to the checkpoint then we need to repeat
+ * everything we've done so far in this function but this time
+ * selecting the checkpointed uberblock and using that to open
+ * the MOS.
+ */
+ if (checkpoint_rewind) {
+ /*
+ * If we are rewinding to the checkpoint update config cache
+ * anyway.
+ */
+ update_config_cache = B_TRUE;
+
+ /*
+ * Extract the checkpointed uberblock from the current MOS
+ * and use this as the pool's uberblock from now on. If the
+ * pool is imported as writeable we also write the checkpoint
+ * uberblock to the labels, making the rewind permanent.
+ */
+ error = spa_ld_checkpoint_rewind(spa);
+ if (error != 0)
+ return (error);
+
+ /*
+ * Redo the loading process again with the
+ * checkpointed uberblock.
+ */
+ spa_ld_prepare_for_reload(spa);
+ spa_load_note(spa, "LOADING checkpointed uberblock");
+ error = spa_ld_mos_with_trusted_config(spa, type, NULL);
+ if (error != 0)
+ return (error);
+ }
+
+ /*
+ * Retrieve the checkpoint txg if the pool has a checkpoint.
+ */
+ error = spa_ld_read_checkpoint_txg(spa);
+ if (error != 0)
+ return (error);
+
+ /*
+ * Retrieve the mapping of indirect vdevs. Those vdevs were removed
+ * from the pool and their contents were re-mapped to other vdevs. Note
+ * that everything that we read before this step must have been
+ * rewritten on concrete vdevs after the last device removal was
+ * initiated. Otherwise we could be reading from indirect vdevs before
+ * we have loaded their mappings.
+ */
+ error = spa_ld_open_indirect_vdev_metadata(spa);
+ if (error != 0)
+ return (error);
+
+ /*
+ * Retrieve the full list of active features from the MOS and check if
+ * they are all supported.
+ */
+ error = spa_ld_check_features(spa, &missing_feat_write);
+ if (error != 0)
+ return (error);
+
+ /*
+ * Load several special directories from the MOS needed by the dsl_pool
+ * layer.
+ */
+ error = spa_ld_load_special_directories(spa);
+ if (error != 0)
+ return (error);
+
+ /*
+ * Retrieve pool properties from the MOS.
+ */
+ error = spa_ld_get_props(spa);
+ if (error != 0)
+ return (error);
+
+ /*
+ * Retrieve the list of auxiliary devices - cache devices and spares -
+ * and open them.
+ */
+ error = spa_ld_open_aux_vdevs(spa, type);
+ if (error != 0)
+ return (error);
+
+ /*
+ * Load the metadata for all vdevs. Also check if unopenable devices
+ * should be autoreplaced.
+ */
+ error = spa_ld_load_vdev_metadata(spa);
+ if (error != 0)
+ return (error);
+
+ error = spa_ld_load_dedup_tables(spa);
+ if (error != 0)
+ return (error);
+
+ /*
+ * Verify the logs now to make sure we don't have any unexpected errors
+ * when we claim log blocks later.
+ */
+ error = spa_ld_verify_logs(spa, type, ereport);
+ if (error != 0)
+ return (error);
+
+ if (missing_feat_write) {
+ ASSERT(spa->spa_load_state == SPA_LOAD_TRYIMPORT);
+
+ /*
+ * At this point, we know that we can open the pool in
+ * read-only mode but not read-write mode. We now have enough
+ * information and can return to userland.
+ */
+ return (spa_vdev_err(spa->spa_root_vdev, VDEV_AUX_UNSUP_FEAT,
+ ENOTSUP));
+ }
+
+ /*
+ * Traverse the last txgs to make sure the pool was left off in a safe
+ * state. When performing an extreme rewind, we verify the whole pool,
+ * which can take a very long time.
+ */
+ error = spa_ld_verify_pool_data(spa);
+ if (error != 0)
+ return (error);
+
+ /*
+ * Calculate the deflated space for the pool. This must be done before
+ * we write anything to the pool because we'd need to update the space
+ * accounting using the deflated sizes.
+ */
+ spa_update_dspace(spa);
+
+ /*
+ * We have now retrieved all the information we needed to open the
+ * pool. If we are importing the pool in read-write mode, a few
+ * additional steps must be performed to finish the import.
+ */
+ if (spa_writeable(spa) && (spa->spa_load_state == SPA_LOAD_RECOVER ||
+ spa->spa_load_max_txg == UINT64_MAX)) {
+ uint64_t config_cache_txg = spa->spa_config_txg;
+
+ ASSERT(spa->spa_load_state != SPA_LOAD_TRYIMPORT);
+
+ /*
+ * In case of a checkpoint rewind, log the original txg
+ * of the checkpointed uberblock.
+ */
+ if (checkpoint_rewind) {
+ spa_history_log_internal(spa, "checkpoint rewind",
+ NULL, "rewound state to txg=%llu",
+ (u_longlong_t)spa->spa_uberblock.ub_checkpoint_txg);
+ }
+
+ /*
+ * Traverse the ZIL and claim all blocks.
+ */
+ spa_ld_claim_log_blocks(spa);
+
+ /*
+ * Kick-off the syncing thread.
+ */
+ spa->spa_sync_on = B_TRUE;
+ txg_sync_start(spa->spa_dsl_pool);
+ mmp_thread_start(spa);
+
+ /*
+ * Wait for all claims to sync. We sync up to the highest
+ * claimed log block birth time so that claimed log blocks
+ * don't appear to be from the future. spa_claim_max_txg
+ * will have been set for us by ZIL traversal operations
+ * performed above.
+ */
+ txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg);
+
+ /*
+ * Check if we need to request an update of the config. On the
+ * next sync, we would update the config stored in vdev labels
+ * and the cachefile (by default /etc/zfs/zpool.cache).
+ */
+ spa_ld_check_for_config_update(spa, config_cache_txg,
+ update_config_cache);
+
+ /*
+ * Check if a rebuild was in progress and if so resume it.
+ * Then check all DTLs to see if anything needs resilvering.
+ * The resilver will be deferred if a rebuild was started.
+ */
+ if (vdev_rebuild_active(spa->spa_root_vdev)) {
+ vdev_rebuild_restart(spa);
+ } else if (!dsl_scan_resilvering(spa->spa_dsl_pool) &&
+ vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) {
+ spa_async_request(spa, SPA_ASYNC_RESILVER);
+ }
+
+ /*
+ * Log the fact that we booted up (so that we can detect if
+ * we rebooted in the middle of an operation).
+ */
+ spa_history_log_version(spa, "open", NULL);
+
+ spa_restart_removal(spa);
+ spa_spawn_aux_threads(spa);
+
+ /*
+ * Delete any inconsistent datasets.
+ *
+ * Note:
+ * Since we may be issuing deletes for clones here,
+ * we make sure to do so after we've spawned all the
+ * auxiliary threads above (of which the livelist
+ * deletion zthr is one).
+ */
+ (void) dmu_objset_find(spa_name(spa),
+ dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN);
+
+ /*
+ * Clean up any stale temporary dataset userrefs.
+ */
+ dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool);
+
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+ vdev_initialize_restart(spa->spa_root_vdev);
+ vdev_trim_restart(spa->spa_root_vdev);
+ vdev_autotrim_restart(spa);
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+ }
+
+ spa_import_progress_remove(spa_guid(spa));
+ spa_async_request(spa, SPA_ASYNC_L2CACHE_REBUILD);
+
+ spa_load_note(spa, "LOADED");
+
+ return (0);
+}
+
+static int
+spa_load_retry(spa_t *spa, spa_load_state_t state)
+{
+ spa_mode_t mode = spa->spa_mode;
+
+ spa_unload(spa);
+ spa_deactivate(spa);
+
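+ /* Retry with the uberblock search capped below the txg that just failed. */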
+ spa->spa_load_max_txg = spa->spa_uberblock.ub_txg - 1;
+
+ spa_activate(spa, mode);
+ spa_async_suspend(spa);
+
+ spa_load_note(spa, "spa_load_retry: rewind, max txg: %llu",
+ (u_longlong_t)spa->spa_load_max_txg);
+
+ return (spa_load(spa, state, SPA_IMPORT_EXISTING));
+}
+
+/*
+ * If spa_load() fails this function will try loading prior txg's. If
+ * 'state' is SPA_LOAD_RECOVER and one of these loads succeeds the pool
+ * will be rewound to that txg. If 'state' is not SPA_LOAD_RECOVER this
+ * function will not rewind the pool and will return the same error as
+ * spa_load().
+ */
+static int
+spa_load_best(spa_t *spa, spa_load_state_t state, uint64_t max_request,
+ int rewind_flags)
+{
+ nvlist_t *loadinfo = NULL;
+ nvlist_t *config = NULL;
+ int load_error, rewind_error;
+ uint64_t safe_rewind_txg;
+ uint64_t min_txg;
+
+ if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) {
+ spa->spa_load_max_txg = spa->spa_load_txg;
+ spa_set_log_state(spa, SPA_LOG_CLEAR);
+ } else {
+ spa->spa_load_max_txg = max_request;
+ if (max_request != UINT64_MAX)
+ spa->spa_extreme_rewind = B_TRUE;
+ }
+
+ load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING);
+ if (load_error == 0)
+ return (0);
+ if (load_error == ZFS_ERR_NO_CHECKPOINT) {
+ /*
+ * When attempting checkpoint-rewind on a pool with no
+ * checkpoint, we should not attempt to load uberblocks
+ * from previous txgs when spa_load fails.
+ */
+ ASSERT(spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT);
+ spa_import_progress_remove(spa_guid(spa));
+ return (load_error);
+ }
+
+ if (spa->spa_root_vdev != NULL)
+ config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
+
+ spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg;
+ spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp;
+
+ if (rewind_flags & ZPOOL_NEVER_REWIND) {
+ nvlist_free(config);
+ spa_import_progress_remove(spa_guid(spa));
+ return (load_error);
+ }
+
+ if (state == SPA_LOAD_RECOVER) {
+ /* Price of rolling back is discarding txgs, including log */
+ spa_set_log_state(spa, SPA_LOG_CLEAR);
+ } else {
+ /*
+ * If we aren't rolling back save the load info from our first
+ * import attempt so that we can restore it after attempting
+ * to rewind.
+ */
+ loadinfo = spa->spa_load_info;
+ spa->spa_load_info = fnvlist_alloc();
+ }
+
+ spa->spa_load_max_txg = spa->spa_last_ubsync_txg;
+ safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE;
+ min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ?
+ TXG_INITIAL : safe_rewind_txg;
+
+ /*
+ * Continue as long as we're finding errors, we're still within
+ * the acceptable rewind range, and we're still finding uberblocks.
+ */
+ while (rewind_error && spa->spa_uberblock.ub_txg >= min_txg &&
+ spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) {
+ if (spa->spa_load_max_txg < safe_rewind_txg)
+ spa->spa_extreme_rewind = B_TRUE;
+ rewind_error = spa_load_retry(spa, state);
+ }
+
+ spa->spa_extreme_rewind = B_FALSE;
+ spa->spa_load_max_txg = UINT64_MAX;
+
+ if (config && (rewind_error || state != SPA_LOAD_RECOVER))
+ spa_config_set(spa, config);
+ else
+ nvlist_free(config);
+
+ if (state == SPA_LOAD_RECOVER) {
+ ASSERT3P(loadinfo, ==, NULL);
+ spa_import_progress_remove(spa_guid(spa));
+ return (rewind_error);
+ } else {
+ /* Store the rewind info as part of the initial load info */
+ fnvlist_add_nvlist(loadinfo, ZPOOL_CONFIG_REWIND_INFO,
+ spa->spa_load_info);
+
+ /* Restore the initial load info */
+ fnvlist_free(spa->spa_load_info);
+ spa->spa_load_info = loadinfo;
+
+ spa_import_progress_remove(spa_guid(spa));
+ return (load_error);
+ }
+}
+
+/*
+ * Pool Open/Import
+ *
+ * The import case is identical to an open except that the configuration is sent
+ * down from userland, instead of grabbed from the configuration cache. For the
+ * case of an open, the pool configuration will exist in the
+ * POOL_STATE_UNINITIALIZED state.
+ *
+ * The stats information (gen/count/ustats) is used to gather vdev statistics
+ * while we open the pool, without having to keep around the spa_t in some
+ * ambiguous state.
+ */
+static int
+spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy,
+ nvlist_t **config)
+{
+ spa_t *spa;
+ spa_load_state_t state = SPA_LOAD_OPEN;
+ int error;
+ int locked = B_FALSE;
+ int firstopen = B_FALSE;
+
+ *spapp = NULL;
+
+ /*
+ * As disgusting as this is, we need to support recursive calls to this
+ * function because dsl_dir_open() is called during spa_load(), and ends
+ * up calling spa_open() again. The real fix is to figure out how to
+ * avoid dsl_dir_open() calling this in the first place.
+ */
+ if (MUTEX_NOT_HELD(&spa_namespace_lock)) {
+ mutex_enter(&spa_namespace_lock);
+ locked = B_TRUE;
+ }
+
+ if ((spa = spa_lookup(pool)) == NULL) {
+ if (locked)
+ mutex_exit(&spa_namespace_lock);
+ return (SET_ERROR(ENOENT));
+ }
+
+ if (spa->spa_state == POOL_STATE_UNINITIALIZED) {
+ zpool_load_policy_t policy;
+
+ firstopen = B_TRUE;
+
+ zpool_get_load_policy(nvpolicy ? nvpolicy : spa->spa_config,
+ &policy);
+ if (policy.zlp_rewind & ZPOOL_DO_REWIND)
+ state = SPA_LOAD_RECOVER;
+
+ spa_activate(spa, spa_mode_global);
+
+ if (state != SPA_LOAD_RECOVER)
+ spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;
+ spa->spa_config_source = SPA_CONFIG_SRC_CACHEFILE;
+
+ zfs_dbgmsg("spa_open_common: opening %s", pool);
+ error = spa_load_best(spa, state, policy.zlp_txg,
+ policy.zlp_rewind);
+
+ if (error == EBADF) {
+ /*
+ * If vdev_validate() returns failure (indicated by
+ * EBADF), one of the vdev labels indicates that the
+ * pool has been exported or destroyed. If this is
+ * the case, the config cache is out of sync and
+ * we should remove the pool from the namespace.
+ */
+ spa_unload(spa);
+ spa_deactivate(spa);
+ spa_write_cachefile(spa, B_TRUE, B_TRUE);
+ spa_remove(spa);
+ if (locked)
+ mutex_exit(&spa_namespace_lock);
+ return (SET_ERROR(ENOENT));
+ }
+
+ if (error) {
+ /*
+ * We can't open the pool, but we still have useful
+ * information: the state of each vdev after the
+ * attempted vdev_open(). Return this to the user.
+ */
+ if (config != NULL && spa->spa_config) {
+ VERIFY(nvlist_dup(spa->spa_config, config,
+ KM_SLEEP) == 0);
+ VERIFY(nvlist_add_nvlist(*config,
+ ZPOOL_CONFIG_LOAD_INFO,
+ spa->spa_load_info) == 0);
+ }
+ spa_unload(spa);
+ spa_deactivate(spa);
+ spa->spa_last_open_failed = error;
+ if (locked)
+ mutex_exit(&spa_namespace_lock);
+ *spapp = NULL;
+ return (error);
+ }
+ }
+
+ spa_open_ref(spa, tag);
+
+ if (config != NULL)
+ *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
+
+ /*
+ * If we've recovered the pool, pass back any information we
+ * gathered while doing the load.
+ */
+ if (state == SPA_LOAD_RECOVER) {
+ VERIFY(nvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO,
+ spa->spa_load_info) == 0);
+ }
+
+ if (locked) {
+ spa->spa_last_open_failed = 0;
+ spa->spa_last_ubsync_txg = 0;
+ spa->spa_load_txg = 0;
+ mutex_exit(&spa_namespace_lock);
+ }
+
+ if (firstopen)
+ zvol_create_minors_recursive(spa_name(spa));
+
+ *spapp = spa;
+
+ return (0);
+}
+
+int
+spa_open_rewind(const char *name, spa_t **spapp, void *tag, nvlist_t *policy,
+ nvlist_t **config)
+{
+ return (spa_open_common(name, spapp, tag, policy, config));
+}
+
+int
+spa_open(const char *name, spa_t **spapp, void *tag)
+{
+ return (spa_open_common(name, spapp, tag, NULL, NULL));
+}
+
+/*
+ * Lookup the given spa_t, incrementing the inject count in the process,
+ * preventing it from being exported or destroyed.
+ */
+spa_t *
+spa_inject_addref(char *name)
+{
+ spa_t *spa;
+
+ mutex_enter(&spa_namespace_lock);
+ if ((spa = spa_lookup(name)) == NULL) {
+ mutex_exit(&spa_namespace_lock);
+ return (NULL);
+ }
+ spa->spa_inject_ref++;
+ mutex_exit(&spa_namespace_lock);
+
+ return (spa);
+}
+
+void
+spa_inject_delref(spa_t *spa)
+{
+ mutex_enter(&spa_namespace_lock);
+ spa->spa_inject_ref--;
+ mutex_exit(&spa_namespace_lock);
+}
+
+/*
+ * Add spares device information to the nvlist.
+ */
+static void
+spa_add_spares(spa_t *spa, nvlist_t *config)
+{
+ nvlist_t **spares;
+ uint_t i, nspares;
+ nvlist_t *nvroot;
+ uint64_t guid;
+ vdev_stat_t *vs;
+ uint_t vsc;
+ uint64_t pool;
+
+ ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
+
+ if (spa->spa_spares.sav_count == 0)
+ return;
+
+ VERIFY(nvlist_lookup_nvlist(config,
+ ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
+ VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
+ ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
+ if (nspares != 0) {
+ VERIFY(nvlist_add_nvlist_array(nvroot,
+ ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
+ VERIFY(nvlist_lookup_nvlist_array(nvroot,
+ ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
+
+ /*
+ * Go through and find any spares which have since been
+ * repurposed as active spares. If this is the case, update
+ * their status appropriately.
+ */
+ for (i = 0; i < nspares; i++) {
+ VERIFY(nvlist_lookup_uint64(spares[i],
+ ZPOOL_CONFIG_GUID, &guid) == 0);
+ if (spa_spare_exists(guid, &pool, NULL) &&
+ pool != 0ULL) {
+ VERIFY(nvlist_lookup_uint64_array(
+ spares[i], ZPOOL_CONFIG_VDEV_STATS,
+ (uint64_t **)&vs, &vsc) == 0);
+ vs->vs_state = VDEV_STATE_CANT_OPEN;
+ vs->vs_aux = VDEV_AUX_SPARED;
+ }
+ }
+ }
+}
+
+/*
+ * Add l2cache device information to the nvlist, including vdev stats.
+ */
+static void
+spa_add_l2cache(spa_t *spa, nvlist_t *config)
+{
+ nvlist_t **l2cache;
+ uint_t i, j, nl2cache;
+ nvlist_t *nvroot;
+ uint64_t guid;
+ vdev_t *vd;
+ vdev_stat_t *vs;
+ uint_t vsc;
+
+ ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
+
+ if (spa->spa_l2cache.sav_count == 0)
+ return;
+
+ VERIFY(nvlist_lookup_nvlist(config,
+ ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
+ VERIFY(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
+ ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
+ if (nl2cache != 0) {
+ VERIFY(nvlist_add_nvlist_array(nvroot,
+ ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
+ VERIFY(nvlist_lookup_nvlist_array(nvroot,
+ ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
+
+ /*
+ * Update level 2 cache device stats.
+ */
+
+ for (i = 0; i < nl2cache; i++) {
+ VERIFY(nvlist_lookup_uint64(l2cache[i],
+ ZPOOL_CONFIG_GUID, &guid) == 0);
+
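+ /* Find the in-core l2cache vdev with this guid. */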
+ vd = NULL;
+ for (j = 0; j < spa->spa_l2cache.sav_count; j++) {
+ if (guid ==
+ spa->spa_l2cache.sav_vdevs[j]->vdev_guid) {
+ vd = spa->spa_l2cache.sav_vdevs[j];
+ break;
+ }
+ }
+ ASSERT(vd != NULL);
+
+ VERIFY(nvlist_lookup_uint64_array(l2cache[i],
+ ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc)
+ == 0);
+ vdev_get_stats(vd, vs);
+ vdev_config_generate_stats(vd, l2cache[i]);
+
+ }
+ }
+}
+
+static void
+spa_feature_stats_from_disk(spa_t *spa, nvlist_t *features)
+{
+ zap_cursor_t zc;
+ zap_attribute_t za;
+
+ if (spa->spa_feat_for_read_obj != 0) {
+ for (zap_cursor_init(&zc, spa->spa_meta_objset,
+ spa->spa_feat_for_read_obj);
+ zap_cursor_retrieve(&zc, &za) == 0;
+ zap_cursor_advance(&zc)) {
+ ASSERT(za.za_integer_length == sizeof (uint64_t) &&
+ za.za_num_integers == 1);
+ VERIFY0(nvlist_add_uint64(features, za.za_name,
+ za.za_first_integer));
+ }
+ zap_cursor_fini(&zc);
+ }
+
+ if (spa->spa_feat_for_write_obj != 0) {
+ for (zap_cursor_init(&zc, spa->spa_meta_objset,
+ spa->spa_feat_for_write_obj);
+ zap_cursor_retrieve(&zc, &za) == 0;
+ zap_cursor_advance(&zc)) {
+ ASSERT(za.za_integer_length == sizeof (uint64_t) &&
+ za.za_num_integers == 1);
+ VERIFY0(nvlist_add_uint64(features, za.za_name,
+ za.za_first_integer));
+ }
+ zap_cursor_fini(&zc);
+ }
+}
+
+static void
+spa_feature_stats_from_cache(spa_t *spa, nvlist_t *features)
+{
+ int i;
+
+ for (i = 0; i < SPA_FEATURES; i++) {
+ zfeature_info_t feature = spa_feature_table[i];
+ uint64_t refcount;
+
+ if (feature_get_refcount(spa, &feature, &refcount) != 0)
+ continue;
+
+ VERIFY0(nvlist_add_uint64(features, feature.fi_guid, refcount));
+ }
+}
+
+/*
+ * Store a list of pool features and their reference counts in the
+ * config.
+ *
+ * The first time this is called on a spa, allocate a new nvlist, fetch
+ * the pool features and reference counts from disk, then save the list
+ * in the spa. In subsequent calls on the same spa use the saved nvlist
+ * and refresh its values from the cached reference counts. This
+ * ensures we don't block here on I/O on a suspended pool so 'zpool
+ * clear' can resume the pool.
+ */
+static void
+spa_add_feature_stats(spa_t *spa, nvlist_t *config)
+{
+ nvlist_t *features;
+
+ ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
+
+ mutex_enter(&spa->spa_feat_stats_lock);
+ features = spa->spa_feat_stats;
+
+ if (features != NULL) {
+ spa_feature_stats_from_cache(spa, features);
+ } else {
+ VERIFY0(nvlist_alloc(&features, NV_UNIQUE_NAME, KM_SLEEP));
+ spa->spa_feat_stats = features;
+ spa_feature_stats_from_disk(spa, features);
+ }
+
+ VERIFY0(nvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS,
+ features));
+
+ mutex_exit(&spa->spa_feat_stats_lock);
+}
+
+int
+spa_get_stats(const char *name, nvlist_t **config,
+ char *altroot, size_t buflen)
+{
+ int error;
+ spa_t *spa;
+
+ *config = NULL;
+ error = spa_open_common(name, &spa, FTAG, NULL, config);
+
+ if (spa != NULL) {
+ /*
+ * This still leaves a window of inconsistency where the spares
+ * or l2cache devices could change and the config would be
+ * self-inconsistent.
+ */
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+
+ if (*config != NULL) {
+ uint64_t loadtimes[2];
+
+ loadtimes[0] = spa->spa_loaded_ts.tv_sec;
+ loadtimes[1] = spa->spa_loaded_ts.tv_nsec;
+ VERIFY(nvlist_add_uint64_array(*config,
+ ZPOOL_CONFIG_LOADED_TIME, loadtimes, 2) == 0);
+
+ VERIFY(nvlist_add_uint64(*config,
+ ZPOOL_CONFIG_ERRCOUNT,
+ spa_get_errlog_size(spa)) == 0);
+
+ if (spa_suspended(spa)) {
+ VERIFY(nvlist_add_uint64(*config,
+ ZPOOL_CONFIG_SUSPENDED,
+ spa->spa_failmode) == 0);
+ VERIFY(nvlist_add_uint64(*config,
+ ZPOOL_CONFIG_SUSPENDED_REASON,
+ spa->spa_suspended) == 0);
+ }
+
+ spa_add_spares(spa, *config);
+ spa_add_l2cache(spa, *config);
+ spa_add_feature_stats(spa, *config);
+ }
+ }
+
+ /*
+ * We want to get the alternate root even for faulted pools, so we cheat
+ * and call spa_lookup() directly.
+ */
+ if (altroot) {
+ if (spa == NULL) {
+ mutex_enter(&spa_namespace_lock);
+ spa = spa_lookup(name);
+ if (spa)
+ spa_altroot(spa, altroot, buflen);
+ else
+ altroot[0] = '\0';
+ spa = NULL;
+ mutex_exit(&spa_namespace_lock);
+ } else {
+ spa_altroot(spa, altroot, buflen);
+ }
+ }
+
+ if (spa != NULL) {
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+ spa_close(spa, FTAG);
+ }
+
+ return (error);
+}
+
+/*
+ * Validate that the auxiliary device array is well formed. We must have an
+ * array of nvlists, each of which describes a valid leaf vdev. If this is an
+ * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be
+ * specified, as long as they are well-formed.
+ */
+static int
+spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode,
+ spa_aux_vdev_t *sav, const char *config, uint64_t version,
+ vdev_labeltype_t label)
+{
+ nvlist_t **dev;
+ uint_t i, ndev;
+ vdev_t *vd;
+ int error;
+
+ ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
+
+ /*
+ * It's acceptable to have no devs specified.
+ */
+ if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0)
+ return (0);
+
+ if (ndev == 0)
+ return (SET_ERROR(EINVAL));
+
+ /*
+ * Make sure the pool is formatted with a version that supports this
+ * device type.
+ */
+ if (spa_version(spa) < version)
+ return (SET_ERROR(ENOTSUP));
+
+ /*
+ * Set the pending device list so we correctly handle device in-use
+ * checking.
+ */
+ sav->sav_pending = dev;
+ sav->sav_npending = ndev;
+
+ for (i = 0; i < ndev; i++) {
+ if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0,
+ mode)) != 0)
+ goto out;
+
+ if (!vd->vdev_ops->vdev_op_leaf) {
+ vdev_free(vd);
+ error = SET_ERROR(EINVAL);
+ goto out;
+ }
+
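+ /*
+ * Each aux device is treated as its own top-level vdev for the
+ * open and label-init calls below.
+ */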
+ vd->vdev_top = vd;
+
+ if ((error = vdev_open(vd)) == 0 &&
+ (error = vdev_label_init(vd, crtxg, label)) == 0) {
+ VERIFY(nvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID,
+ vd->vdev_guid) == 0);
+ }
+
+ vdev_free(vd);
+
+ if (error &&
+ (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE))
+ goto out;
+ else
+ error = 0;
+ }
+
+out:
+ sav->sav_pending = NULL;
+ sav->sav_npending = 0;
+ return (error);
+}
+
+static int
+spa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode)
+{
+ int error;
+
+ ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
+
+ if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode,
+ &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES,
+ VDEV_LABEL_SPARE)) != 0) {
+ return (error);
+ }
+
+ return (spa_validate_aux_devs(spa, nvroot, crtxg, mode,
+ &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE,
+ VDEV_LABEL_L2CACHE));
+}
+
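+/*
+ * Record the given aux devices (spares or l2cache, per 'config') in
+ * sav_config, appending to any device list already present.
+ */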
+static void
+spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs,
+ const char *config)
+{
+ int i;
+
+ if (sav->sav_config != NULL) {
+ nvlist_t **olddevs;
+ uint_t oldndevs;
+ nvlist_t **newdevs;
+
+ /*
+ * Generate new dev list by concatenating with the
+ * current dev list.
+ */
+ VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, config,
+ &olddevs, &oldndevs) == 0);
+
+ newdevs = kmem_alloc(sizeof (void *) *
+ (ndevs + oldndevs), KM_SLEEP);
+ for (i = 0; i < oldndevs; i++)
+ VERIFY(nvlist_dup(olddevs[i], &newdevs[i],
+ KM_SLEEP) == 0);
+ for (i = 0; i < ndevs; i++)
+ VERIFY(nvlist_dup(devs[i], &newdevs[i + oldndevs],
+ KM_SLEEP) == 0);
+
+ VERIFY(nvlist_remove(sav->sav_config, config,
+ DATA_TYPE_NVLIST_ARRAY) == 0);
+
+ VERIFY(nvlist_add_nvlist_array(sav->sav_config,
+ config, newdevs, ndevs + oldndevs) == 0);
+ for (i = 0; i < oldndevs + ndevs; i++)
+ nvlist_free(newdevs[i]);
+ kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *));
+ } else {
+ /*
+ * Generate a new dev list.
+ */
+ VERIFY(nvlist_alloc(&sav->sav_config, NV_UNIQUE_NAME,
+ KM_SLEEP) == 0);
+ VERIFY(nvlist_add_nvlist_array(sav->sav_config, config,
+ devs, ndevs) == 0);
+ }
+}
+
+/*
+ * Stop and drop level 2 ARC devices
+ */
+void
+spa_l2cache_drop(spa_t *spa)
+{
+ vdev_t *vd;
+ int i;
+ spa_aux_vdev_t *sav = &spa->spa_l2cache;
+
+ for (i = 0; i < sav->sav_count; i++) {
+ uint64_t pool;
+
+ vd = sav->sav_vdevs[i];
+ ASSERT(vd != NULL);
+
+ if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
+ pool != 0ULL && l2arc_vdev_present(vd))
+ l2arc_remove_vdev(vd);
+ }
+}
+
+/*
+ * Verify encryption parameters for spa creation. If we are encrypting, we must
+ * have the encryption feature flag enabled.
+ */
+static int
+spa_create_check_encryption_params(dsl_crypto_params_t *dcp,
+ boolean_t has_encryption)
+{
+ if (dcp->cp_crypt != ZIO_CRYPT_OFF &&
+ dcp->cp_crypt != ZIO_CRYPT_INHERIT &&
+ !has_encryption)
+ return (SET_ERROR(ENOTSUP));
+
+ return (dmu_objset_create_crypt_check(NULL, dcp, NULL));
+}
+
+/*
+ * Pool Creation
+ */
+int
+spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
+ nvlist_t *zplprops, dsl_crypto_params_t *dcp)
+{
+ spa_t *spa;
+ char *altroot = NULL;
+ vdev_t *rvd;
+ dsl_pool_t *dp;
+ dmu_tx_t *tx;
+ int error = 0;
+ uint64_t txg = TXG_INITIAL;
+ nvlist_t **spares, **l2cache;
+ uint_t nspares, nl2cache;
+ uint64_t version, obj, ndraid = 0;
+ boolean_t has_features;
+ boolean_t has_encryption;
+ boolean_t has_allocclass;
+ spa_feature_t feat;
+ char *feat_name;
+ char *poolname;
+ nvlist_t *nvl;
+
+ if (props == NULL ||
+ nvlist_lookup_string(props, "tname", &poolname) != 0)
+ poolname = (char *)pool;
+
+ /*
+ * If this pool already exists, return failure.
+ */
+ mutex_enter(&spa_namespace_lock);
+ if (spa_lookup(poolname) != NULL) {
+ mutex_exit(&spa_namespace_lock);
+ return (SET_ERROR(EEXIST));
+ }
+
+ /*
+ * Allocate a new spa_t structure.
+ */
+ nvl = fnvlist_alloc();
+ fnvlist_add_string(nvl, ZPOOL_CONFIG_POOL_NAME, pool);
+ (void) nvlist_lookup_string(props,
+ zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
+ spa = spa_add(poolname, nvl, altroot);
+ fnvlist_free(nvl);
+ spa_activate(spa, spa_mode_global);
+
+ if (props && (error = spa_prop_validate(spa, props))) {
+ spa_deactivate(spa);
+ spa_remove(spa);
+ mutex_exit(&spa_namespace_lock);
+ return (error);
+ }
+
+ /*
+ * Temporary pool names should never be written to disk.
+ */
+ if (poolname != pool)
+ spa->spa_import_flags |= ZFS_IMPORT_TEMP_NAME;
+
+ has_features = B_FALSE;
+ has_encryption = B_FALSE;
+ has_allocclass = B_FALSE;
+ for (nvpair_t *elem = nvlist_next_nvpair(props, NULL);
+ elem != NULL; elem = nvlist_next_nvpair(props, elem)) {
+ if (zpool_prop_feature(nvpair_name(elem))) {
+ has_features = B_TRUE;
+
+ feat_name = strchr(nvpair_name(elem), '@') + 1;
+ VERIFY0(zfeature_lookup_name(feat_name, &feat));
+ if (feat == SPA_FEATURE_ENCRYPTION)
+ has_encryption = B_TRUE;
+ if (feat == SPA_FEATURE_ALLOCATION_CLASSES)
+ has_allocclass = B_TRUE;
+ }
+ }
+
+ /* verify encryption params, if they were provided */
+ if (dcp != NULL) {
+ error = spa_create_check_encryption_params(dcp, has_encryption);
+ if (error != 0) {
+ spa_deactivate(spa);
+ spa_remove(spa);
+ mutex_exit(&spa_namespace_lock);
+ return (error);
+ }
+ }
+ if (!has_allocclass && zfs_special_devs(nvroot, NULL)) {
+ spa_deactivate(spa);
+ spa_remove(spa);
+ mutex_exit(&spa_namespace_lock);
+ return (ENOTSUP);
+ }
+
+ if (has_features || nvlist_lookup_uint64(props,
+ zpool_prop_to_name(ZPOOL_PROP_VERSION), &version) != 0) {
+ version = SPA_VERSION;
+ }
+ ASSERT(SPA_VERSION_IS_SUPPORTED(version));
+
+ spa->spa_first_txg = txg;
+ spa->spa_uberblock.ub_txg = txg - 1;
+ spa->spa_uberblock.ub_version = version;
+ spa->spa_ubsync = spa->spa_uberblock;
+ spa->spa_load_state = SPA_LOAD_CREATE;
+ spa->spa_removing_phys.sr_state = DSS_NONE;
+ spa->spa_removing_phys.sr_removing_vdev = -1;
+ spa->spa_removing_phys.sr_prev_indirect_vdev = -1;
+ spa->spa_indirect_vdevs_loaded = B_TRUE;
+
+ /*
+ * Create "The Godfather" zio to hold all async IOs
+ */
+ spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *),
+ KM_SLEEP);
+ for (int i = 0; i < max_ncpus; i++) {
+ spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL,
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
+ ZIO_FLAG_GODFATHER);
+ }
+
+ /*
+ * Create the root vdev.
+ */
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+
+ error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD);
+
+ ASSERT(error != 0 || rvd != NULL);
+ ASSERT(error != 0 || spa->spa_root_vdev == rvd);
+
+ if (error == 0 && !zfs_allocatable_devs(nvroot))
+ error = SET_ERROR(EINVAL);
+
+ if (error == 0 &&
+ (error = vdev_create(rvd, txg, B_FALSE)) == 0 &&
+ (error = vdev_draid_spare_create(nvroot, rvd, &ndraid, 0)) == 0 &&
+ (error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) == 0) {
+ /*
+ * Instantiate the metaslab groups (this will dirty the vdevs);
+ * we can no longer error exit past this point.
+ */
+ for (int c = 0; error == 0 && c < rvd->vdev_children; c++) {
+ vdev_t *vd = rvd->vdev_child[c];
+
+ vdev_metaslab_set_size(vd);
+ vdev_expand(vd, txg);
+ }
+ }
+
+ spa_config_exit(spa, SCL_ALL, FTAG);
+
+ if (error != 0) {
+ spa_unload(spa);
+ spa_deactivate(spa);
+ spa_remove(spa);
+ mutex_exit(&spa_namespace_lock);
+ return (error);
+ }
+
+ /*
+ * Get the list of spares, if specified.
+ */
+ if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
+ &spares, &nspares) == 0) {
+ VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME,
+ KM_SLEEP) == 0);
+ VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
+ ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+ spa_load_spares(spa);
+ spa_config_exit(spa, SCL_ALL, FTAG);
+ spa->spa_spares.sav_sync = B_TRUE;
+ }
+
+ /*
+ * Get the list of level 2 cache devices, if specified.
+ */
+ if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
+ &l2cache, &nl2cache) == 0) {
+ VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config,
+ NV_UNIQUE_NAME, KM_SLEEP) == 0);
+ VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
+ ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+ spa_load_l2cache(spa);
+ spa_config_exit(spa, SCL_ALL, FTAG);
+ spa->spa_l2cache.sav_sync = B_TRUE;
+ }
+
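+ /*
+ * Create the DSL pool. The spa_is_initializing flag marks the
+ * window during which the pool's datasets are being set up.
+ */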
+ spa->spa_is_initializing = B_TRUE;
+ spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, dcp, txg);
+ spa->spa_is_initializing = B_FALSE;
+
+ /*
+ * Create DDTs (dedup tables).
+ */
+ ddt_create(spa);
+
+ spa_update_dspace(spa);
+
+ tx = dmu_tx_create_assigned(dp, txg);
+
+ /*
+ * Create the pool's history object.
+ */
+ if (version >= SPA_VERSION_ZPOOL_HISTORY && !spa->spa_history)
+ spa_history_create_obj(spa, tx);
+
+ spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_CREATE);
+ spa_history_log_version(spa, "create", tx);
+
+ /*
+ * Create the pool config object.
+ */
+ spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset,
+ DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE,
+ DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx);
+
+ if (zap_add(spa->spa_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
+ sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) {
+ cmn_err(CE_PANIC, "failed to add pool config");
+ }
+
+ if (zap_add(spa->spa_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION,
+ sizeof (uint64_t), 1, &version, tx) != 0) {
+ cmn_err(CE_PANIC, "failed to add pool version");
+ }
+
+ /* Newly created pools with the right version are always deflated. */
+ if (version >= SPA_VERSION_RAIDZ_DEFLATE) {
+ spa->spa_deflate = TRUE;
+ if (zap_add(spa->spa_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
+ sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) {
+ cmn_err(CE_PANIC, "failed to add deflate");
+ }
+ }
+
+ /*
+ * Create the deferred-free bpobj. Turn off compression
+ * because sync-to-convergence takes longer if the blocksize
+ * keeps changing.
+ */
+ obj = bpobj_alloc(spa->spa_meta_objset, 1 << 14, tx);
+ dmu_object_set_compress(spa->spa_meta_objset, obj,
+ ZIO_COMPRESS_OFF, tx);
+ if (zap_add(spa->spa_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPOBJ,
+ sizeof (uint64_t), 1, &obj, tx) != 0) {
+ cmn_err(CE_PANIC, "failed to add bpobj");
+ }
+ VERIFY3U(0, ==, bpobj_open(&spa->spa_deferred_bpobj,
+ spa->spa_meta_objset, obj));
+
+ /*
+ * Generate some random noise for salted checksums to operate on.
+ */
+ (void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes,
+ sizeof (spa->spa_cksum_salt.zcs_bytes));
+
+ /*
+ * Set pool properties.
+ */
+ spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS);
+ spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
+ spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE);
+ spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND);
+ spa->spa_multihost = zpool_prop_default_numeric(ZPOOL_PROP_MULTIHOST);
+ spa->spa_autotrim = zpool_prop_default_numeric(ZPOOL_PROP_AUTOTRIM);
+
+ if (props != NULL) {
+ spa_configfile_set(spa, props, B_FALSE);
+ spa_sync_props(props, tx);
+ }
+
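+ /* Account for any dRAID vdevs created above. */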
+ for (int i = 0; i < ndraid; i++)
+ spa_feature_incr(spa, SPA_FEATURE_DRAID, tx);
+
+ dmu_tx_commit(tx);
+
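+ /*
+ * Start the sync and MMP threads, then wait for the initial txg
+ * to be synced to disk.
+ */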
+ spa->spa_sync_on = B_TRUE;
+ txg_sync_start(dp);
+ mmp_thread_start(spa);
+ txg_wait_synced(dp, txg);
+
+ spa_spawn_aux_threads(spa);
+
+ spa_write_cachefile(spa, B_FALSE, B_TRUE);
+
+ /*
+ * Don't count references from objsets that are already closed
+ * and are making their way through the eviction process.
+ */
+ spa_evicting_os_wait(spa);
+ spa->spa_minref = zfs_refcount_count(&spa->spa_refcount);
+ spa->spa_load_state = SPA_LOAD_NONE;
+
+ mutex_exit(&spa_namespace_lock);
+
+ return (0);
+}
+
+/*
+ * Import a non-root pool into the system.
+ */
+int
+spa_import(char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags)
+{
+ spa_t *spa;
+ char *altroot = NULL;
+ spa_load_state_t state = SPA_LOAD_IMPORT;
+ zpool_load_policy_t policy;
+ spa_mode_t mode = spa_mode_global;
+ uint64_t readonly = B_FALSE;
+ int error;
+ nvlist_t *nvroot;
+ nvlist_t **spares, **l2cache;
+ uint_t nspares, nl2cache;
+
+ /*
+ * If a pool with this name exists, return failure.
+ */
+ mutex_enter(&spa_namespace_lock);
+ if (spa_lookup(pool) != NULL) {
+ mutex_exit(&spa_namespace_lock);
+ return (SET_ERROR(EEXIST));
+ }
+
+ /*
+ * Create and initialize the spa structure.
+ */
+ (void) nvlist_lookup_string(props,
+ zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
+ (void) nvlist_lookup_uint64(props,
+ zpool_prop_to_name(ZPOOL_PROP_READONLY), &readonly);
+ if (readonly)
+ mode = SPA_MODE_READ;
+ spa = spa_add(pool, config, altroot);
+ spa->spa_import_flags = flags;
+
+ /*
+ * Verbatim import - Take a pool and insert it into the namespace
+ * as if it had been loaded at boot.
+ */
+ if (spa->spa_import_flags & ZFS_IMPORT_VERBATIM) {
+ if (props != NULL)
+ spa_configfile_set(spa, props, B_FALSE);
+
+ spa_write_cachefile(spa, B_FALSE, B_TRUE);
+ spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT);
+ zfs_dbgmsg("spa_import: verbatim import of %s", pool);
+ mutex_exit(&spa_namespace_lock);
+ return (0);
+ }
+
+ spa_activate(spa, mode);
+
+ /*
+ * Don't start async tasks until we know everything is healthy.
+ */
+ spa_async_suspend(spa);
+
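+ /*
+ * Honor the caller's load policy; a rewind request switches the
+ * import into recovery mode.
+ */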
+ zpool_get_load_policy(config, &policy);
+ if (policy.zlp_rewind & ZPOOL_DO_REWIND)
+ state = SPA_LOAD_RECOVER;
+
+ spa->spa_config_source = SPA_CONFIG_SRC_TRYIMPORT;
+
+ if (state != SPA_LOAD_RECOVER) {
+ spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;
+ zfs_dbgmsg("spa_import: importing %s", pool);
+ } else {
+ zfs_dbgmsg("spa_import: importing %s, max_txg=%lld "
+ "(RECOVERY MODE)", pool, (longlong_t)policy.zlp_txg);
+ }
+ error = spa_load_best(spa, state, policy.zlp_txg, policy.zlp_rewind);
+
+ /*
+ * Propagate anything learned while loading the pool and pass it
+ * back to caller (i.e. rewind info, missing devices, etc).
+ */
+ VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO,
+ spa->spa_load_info) == 0);
+
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+ /*
+ * Toss any existing sparelist, as it doesn't have any validity
+ * anymore, and conflicts with spa_has_spare().
+ */
+ if (spa->spa_spares.sav_config) {
+ nvlist_free(spa->spa_spares.sav_config);
+ spa->spa_spares.sav_config = NULL;
+ spa_load_spares(spa);
+ }
+ if (spa->spa_l2cache.sav_config) {
+ nvlist_free(spa->spa_l2cache.sav_config);
+ spa->spa_l2cache.sav_config = NULL;
+ spa_load_l2cache(spa);
+ }
+
+ VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
+ &nvroot) == 0);
+ spa_config_exit(spa, SCL_ALL, FTAG);
+
+ if (props != NULL)
+ spa_configfile_set(spa, props, B_FALSE);
+
+ if (error != 0 || (props && spa_writeable(spa) &&
+ (error = spa_prop_set(spa, props)))) {
+ spa_unload(spa);
+ spa_deactivate(spa);
+ spa_remove(spa);
+ mutex_exit(&spa_namespace_lock);
+ return (error);
+ }
+
+ spa_async_resume(spa);
+
+ /*
+ * Override any spares and level 2 cache devices as specified by
+ * the user, as these may have correct device names/devids, etc.
+ */
+ if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
+ &spares, &nspares) == 0) {
+ if (spa->spa_spares.sav_config)
+ VERIFY(nvlist_remove(spa->spa_spares.sav_config,
+ ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0);
+ else
+ VERIFY(nvlist_alloc(&spa->spa_spares.sav_config,
+ NV_UNIQUE_NAME, KM_SLEEP) == 0);
+ VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
+ ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+ spa_load_spares(spa);
+ spa_config_exit(spa, SCL_ALL, FTAG);
+ spa->spa_spares.sav_sync = B_TRUE;
+ }
+ if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
+ &l2cache, &nl2cache) == 0) {
+ if (spa->spa_l2cache.sav_config)
+ VERIFY(nvlist_remove(spa->spa_l2cache.sav_config,
+ ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0);
+ else
+ VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config,
+ NV_UNIQUE_NAME, KM_SLEEP) == 0);
+ VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
+ ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+ spa_load_l2cache(spa);
+ spa_config_exit(spa, SCL_ALL, FTAG);
+ spa->spa_l2cache.sav_sync = B_TRUE;
+ }
+
+ /*
+ * Check for any removed devices.
+ */
+ if (spa->spa_autoreplace) {
+ spa_aux_check_removed(&spa->spa_spares);
+ spa_aux_check_removed(&spa->spa_l2cache);
+ }
+
+ if (spa_writeable(spa)) {
+ /*
+ * Update the config cache to include the newly-imported pool.
+ */
+ spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
+ }
+
+ /*
+ * It's possible that the pool was expanded while it was exported.
+ * We kick off an async task to handle this for us.
+ */
+ spa_async_request(spa, SPA_ASYNC_AUTOEXPAND);
+
+ spa_history_log_version(spa, "import", NULL);
+
+ spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT);
+
+ mutex_exit(&spa_namespace_lock);
+
+ zvol_create_minors_recursive(pool);
+
+ return (0);
+}
+
+nvlist_t *
+spa_tryimport(nvlist_t *tryconfig)
+{
+ nvlist_t *config = NULL;
+ char *poolname, *cachefile;
+ spa_t *spa;
+ uint64_t state;
+ int error;
+ zpool_load_policy_t policy;
+
+ if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname))
+ return (NULL);
+
+ if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state))
+ return (NULL);
+
+ /*
+ * Create and initialize the spa structure.
+ */
+ mutex_enter(&spa_namespace_lock);
+ spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL);
+ spa_activate(spa, SPA_MODE_READ);
+
+ /*
+ * Rewind pool if a max txg was provided.
+ */
+ zpool_get_load_policy(spa->spa_config, &policy);
+ if (policy.zlp_txg != UINT64_MAX) {
+ spa->spa_load_max_txg = policy.zlp_txg;
+ spa->spa_extreme_rewind = B_TRUE;
+ zfs_dbgmsg("spa_tryimport: importing %s, max_txg=%lld",
+ poolname, (longlong_t)policy.zlp_txg);
+ } else {
+ zfs_dbgmsg("spa_tryimport: importing %s", poolname);
+ }
+
+ if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_CACHEFILE, &cachefile)
+ == 0) {
+ zfs_dbgmsg("spa_tryimport: using cachefile '%s'", cachefile);
+ spa->spa_config_source = SPA_CONFIG_SRC_CACHEFILE;
+ } else {
+ spa->spa_config_source = SPA_CONFIG_SRC_SCAN;
+ }
+
+ error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING);
+
+ /*
+ * If 'tryconfig' was at least parsable, return the current config.
+ */
+ if (spa->spa_root_vdev != NULL) {
+ config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
+ VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME,
+ poolname) == 0);
+ VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
+ state) == 0);
+ VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP,
+ spa->spa_uberblock.ub_timestamp) == 0);
+ VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO,
+ spa->spa_load_info) == 0);
+ VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_ERRATA,
+ spa->spa_errata) == 0);
+
+ /*
+ * If the bootfs property exists on this pool then we
+ * copy it out so that external consumers can tell which
+ * pools are bootable.
+ */
+ if ((!error || error == EEXIST) && spa->spa_bootfs) {
+ char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
+
+ /*
+ * We have to play games with the name since the
+ * pool was opened as TRYIMPORT_NAME.
+ */
+ if (dsl_dsobj_to_dsname(spa_name(spa),
+ spa->spa_bootfs, tmpname) == 0) {
+ char *cp;
+ char *dsname;
+
+ dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
+
+ cp = strchr(tmpname, '/');
+ if (cp == NULL) {
+ (void) strlcpy(dsname, tmpname,
+ MAXPATHLEN);
+ } else {
+ (void) snprintf(dsname, MAXPATHLEN,
+ "%s/%s", poolname, ++cp);
+ }
+ VERIFY(nvlist_add_string(config,
+ ZPOOL_CONFIG_BOOTFS, dsname) == 0);
+ kmem_free(dsname, MAXPATHLEN);
+ }
+ kmem_free(tmpname, MAXPATHLEN);
+ }
+
+ /*
+ * Add the list of hot spares and level 2 cache devices.
+ */
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+ spa_add_spares(spa, config);
+ spa_add_l2cache(spa, config);
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+ }
+
+ spa_unload(spa);
+ spa_deactivate(spa);
+ spa_remove(spa);
+ mutex_exit(&spa_namespace_lock);
+
+ return (config);
+}
+
+/*
+ * Pool export/destroy
+ *
+ * The act of destroying or exporting a pool is very simple. We make sure there
+ * is no more pending I/O and any references to the pool are gone. Then, we
+ * update the pool state and sync all the labels to disk, removing the
+ * configuration from the cache afterwards. If the 'hardforce' flag is set, then
+ * we don't sync the labels or remove the configuration cache.
+ */
+static int
+spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig,
+ boolean_t force, boolean_t hardforce)
+{
+ int error;
+ spa_t *spa;
+
+ if (oldconfig)
+ *oldconfig = NULL;
+
+ if (!(spa_mode_global & SPA_MODE_WRITE))
+ return (SET_ERROR(EROFS));
+
+ mutex_enter(&spa_namespace_lock);
+ if ((spa = spa_lookup(pool)) == NULL) {
+ mutex_exit(&spa_namespace_lock);
+ return (SET_ERROR(ENOENT));
+ }
+
+ if (spa->spa_is_exporting) {
+ /* the pool is being exported by another thread */
+ mutex_exit(&spa_namespace_lock);
+ return (SET_ERROR(ZFS_ERR_EXPORT_IN_PROGRESS));
+ }
+ spa->spa_is_exporting = B_TRUE;
+
+ /*
+ * Put a hold on the pool, drop the namespace lock, stop async tasks,
+ * reacquire the namespace lock, and see if we can export.
+ */
+ spa_open_ref(spa, FTAG);
+ mutex_exit(&spa_namespace_lock);
+ spa_async_suspend(spa);
+ if (spa->spa_zvol_taskq) {
+ zvol_remove_minors(spa, spa_name(spa), B_TRUE);
+ taskq_wait(spa->spa_zvol_taskq);
+ }
+ mutex_enter(&spa_namespace_lock);
+ spa_close(spa, FTAG);
+
+ if (spa->spa_state == POOL_STATE_UNINITIALIZED)
+ goto export_spa;
+ /*
+ * The pool will be in core if it's openable, in which case we can
+ * modify its state. Objsets may be open only because they're dirty,
+ * so we have to force the pool to sync before checking spa_refcnt.
+ */
+ if (spa->spa_sync_on) {
+ txg_wait_synced(spa->spa_dsl_pool, 0);
+ spa_evicting_os_wait(spa);
+ }
+
+ /*
+ * A pool cannot be exported or destroyed if there are active
+ * references. If we are resetting a pool, allow references by
+ * fault injection handlers.
+ */
+ if (!spa_refcount_zero(spa) || (spa->spa_inject_ref != 0)) {
+ error = SET_ERROR(EBUSY);
+ goto fail;
+ }
+
+ if (spa->spa_sync_on) {
+ /*
+ * A pool cannot be exported if it has an active shared spare.
+ * This is to prevent other pools stealing the active spare
+ * from an exported pool. At the user's discretion, such a pool
+ * can be forcibly exported.
+ */
+ if (!force && new_state == POOL_STATE_EXPORTED &&
+ spa_has_active_shared_spare(spa)) {
+ error = SET_ERROR(EXDEV);
+ goto fail;
+ }
+
+ /*
+ * We're about to export or destroy this pool. Make sure
+ * we stop all initialization and trim activity here before
+ * we set the spa_final_txg. This will ensure that all
+ * dirty data resulting from the initialization is
+ * committed to disk before we unload the pool.
+ */
+ if (spa->spa_root_vdev != NULL) {
+ vdev_t *rvd = spa->spa_root_vdev;
+ vdev_initialize_stop_all(rvd, VDEV_INITIALIZE_ACTIVE);
+ vdev_trim_stop_all(rvd, VDEV_TRIM_ACTIVE);
+ vdev_autotrim_stop_all(spa);
+ vdev_rebuild_stop_all(spa);
+ }
+
+ /*
+ * We want this to be reflected on every label,
+ * so mark them all dirty. spa_unload() will do the
+ * final sync that pushes these changes out.
+ */
+ if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) {
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+ spa->spa_state = new_state;
+ spa->spa_final_txg = spa_last_synced_txg(spa) +
+ TXG_DEFER_SIZE + 1;
+ vdev_config_dirty(spa->spa_root_vdev);
+ spa_config_exit(spa, SCL_ALL, FTAG);
+ }
+ }
+
+export_spa:
+ if (new_state == POOL_STATE_DESTROYED)
+ spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_DESTROY);
+ else if (new_state == POOL_STATE_EXPORTED)
+ spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_EXPORT);
+
+ if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
+ spa_unload(spa);
+ spa_deactivate(spa);
+ }
+
+ if (oldconfig && spa->spa_config)
+ VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0);
+
+ if (new_state != POOL_STATE_UNINITIALIZED) {
+ if (!hardforce)
+ spa_write_cachefile(spa, B_TRUE, B_TRUE);
+ spa_remove(spa);
+ } else {
+ /*
+ * If spa_remove() is not called for this spa_t and
+ * there is any possibility that it can be reused,
+ * we make sure to reset the exporting flag.
+ */
+ spa->spa_is_exporting = B_FALSE;
+ }
+
+ mutex_exit(&spa_namespace_lock);
+ return (0);
+
+fail:
+ spa->spa_is_exporting = B_FALSE;
+ spa_async_resume(spa);
+ mutex_exit(&spa_namespace_lock);
+ return (error);
+}
+
+/*
+ * Destroy a storage pool.
+ */
+int
+spa_destroy(const char *pool)
+{
+ return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL,
+ B_FALSE, B_FALSE));
+}
+
+/*
+ * Export a storage pool.
+ */
+int
+spa_export(const char *pool, nvlist_t **oldconfig, boolean_t force,
+ boolean_t hardforce)
+{
+ return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig,
+ force, hardforce));
+}
+
+/*
+ * Similar to spa_export(), this unloads the spa_t without actually removing it
+ * from the namespace in any way.
+ */
+int
+spa_reset(const char *pool)
+{
+ return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL,
+ B_FALSE, B_FALSE));
+}
+
+/*
+ * ==========================================================================
+ * Device manipulation
+ * ==========================================================================
+ */
+
+/*
+ * This is called as a synctask to increment the draid feature flag
+ */
+static void
+spa_draid_feature_incr(void *arg, dmu_tx_t *tx)
+{
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+ int draid = (int)(uintptr_t)arg;
+
+ for (int c = 0; c < draid; c++)
+ spa_feature_incr(spa, SPA_FEATURE_DRAID, tx);
+}
+
+/*
+ * Add a device to a storage pool.
+ */
+int
+spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
+{
+ uint64_t txg, ndraid = 0;
+ int error;
+ vdev_t *rvd = spa->spa_root_vdev;
+ vdev_t *vd, *tvd;
+ nvlist_t **spares, **l2cache;
+ uint_t nspares, nl2cache;
+
+ ASSERT(spa_writeable(spa));
+
+ txg = spa_vdev_enter(spa);
+
+ if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0,
+ VDEV_ALLOC_ADD)) != 0)
+ return (spa_vdev_exit(spa, NULL, txg, error));
+
+ spa->spa_pending_vdev = vd; /* spa_vdev_exit() will clear this */
+
+ if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares,
+ &nspares) != 0)
+ nspares = 0;
+
+ if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache,
+ &nl2cache) != 0)
+ nl2cache = 0;
+
+ if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0)
+ return (spa_vdev_exit(spa, vd, txg, EINVAL));
+
+ if (vd->vdev_children != 0 &&
+ (error = vdev_create(vd, txg, B_FALSE)) != 0) {
+ return (spa_vdev_exit(spa, vd, txg, error));
+ }
+
+ /*
+ * The virtual dRAID spares must be added after vdev tree is created
+ * and the vdev guids are generated. The guid of their associated
+ * dRAID is stored in the config and used when opening the spare.
+ */
+ if ((error = vdev_draid_spare_create(nvroot, vd, &ndraid,
+ rvd->vdev_children)) == 0) {
+ if (ndraid > 0 && nvlist_lookup_nvlist_array(nvroot,
+ ZPOOL_CONFIG_SPARES, &spares, &nspares) != 0)
+ nspares = 0;
+ } else {
+ return (spa_vdev_exit(spa, vd, txg, error));
+ }
+
+ /*
+ * We must validate the spares and l2cache devices after checking the
+ * children. Otherwise, vdev_inuse() will blindly overwrite the spare.
+ */
+ if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0)
+ return (spa_vdev_exit(spa, vd, txg, error));
+
+ /*
+ * If we are in the middle of a device removal, we can only add
+ * devices which match the existing devices in the pool.
+ * If we are in the middle of a removal, or have some indirect
+ * vdevs, we cannot add raidz or dRAID top levels.
+ */
+ if (spa->spa_vdev_removal != NULL ||
+ spa->spa_removing_phys.sr_prev_indirect_vdev != -1) {
+ for (int c = 0; c < vd->vdev_children; c++) {
+ tvd = vd->vdev_child[c];
+ if (spa->spa_vdev_removal != NULL &&
+ tvd->vdev_ashift != spa->spa_max_ashift) {
+ return (spa_vdev_exit(spa, vd, txg, EINVAL));
+ }
+ /* Fail if top level vdev is raidz or a dRAID */
+ if (vdev_get_nparity(tvd) != 0)
+ return (spa_vdev_exit(spa, vd, txg, EINVAL));
+
+ /*
+ * Need the top level mirror to be
+ * a mirror of leaf vdevs only
+ */
+ if (tvd->vdev_ops == &vdev_mirror_ops) {
+ for (uint64_t cid = 0;
+ cid < tvd->vdev_children; cid++) {
+ vdev_t *cvd = tvd->vdev_child[cid];
+ if (!cvd->vdev_ops->vdev_op_leaf) {
+ return (spa_vdev_exit(spa, vd,
+ txg, EINVAL));
+ }
+ }
+ }
+ }
+ }
+
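+ /*
+ * Transfer each new top-level vdev from the temporary root 'vd'
+ * to the pool's root vdev.
+ */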
+ for (int c = 0; c < vd->vdev_children; c++) {
+ tvd = vd->vdev_child[c];
+ vdev_remove_child(vd, tvd);
+ tvd->vdev_id = rvd->vdev_children;
+ vdev_add_child(rvd, tvd);
+ vdev_config_dirty(tvd);
+ }
+
+ if (nspares != 0) {
+ spa_set_aux_vdevs(&spa->spa_spares, spares, nspares,
+ ZPOOL_CONFIG_SPARES);
+ spa_load_spares(spa);
+ spa->spa_spares.sav_sync = B_TRUE;
+ }
+
+ if (nl2cache != 0) {
+ spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache,
+ ZPOOL_CONFIG_L2CACHE);
+ spa_load_l2cache(spa);
+ spa->spa_l2cache.sav_sync = B_TRUE;
+ }
+
+ /*
+ * We can't increment a feature while holding spa_vdev so we
+ * have to do it in a synctask.
+ */
+ if (ndraid != 0) {
+ dmu_tx_t *tx;
+
+ tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
+ dsl_sync_task_nowait(spa->spa_dsl_pool, spa_draid_feature_incr,
+ (void *)(uintptr_t)ndraid, tx);
+ dmu_tx_commit(tx);
+ }
+
+ /*
+ * We have to be careful when adding new vdevs to an existing pool.
+ * If other threads start allocating from these vdevs before we
+ * sync the config cache, and we lose power, then upon reboot we may
+ * fail to open the pool because there are DVAs that the config cache
+ * can't translate. Therefore, we first add the vdevs without
+ * initializing metaslabs; sync the config cache (via spa_vdev_exit());
+ * and then let spa_config_update() initialize the new metaslabs.
+ *
+ * spa_load() checks for added-but-not-initialized vdevs, so that
+ * if we lose power at any point in this sequence, the remaining
+ * steps will be completed the next time we load the pool.
+ */
+ (void) spa_vdev_exit(spa, vd, txg, 0);
+
+ mutex_enter(&spa_namespace_lock);
+ spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
+ spa_event_notify(spa, NULL, NULL, ESC_ZFS_VDEV_ADD);
+ mutex_exit(&spa_namespace_lock);
+
+ return (0);
+}
+
+/*
+ * Attach a device to a mirror. The arguments are the guid of any device
+ * in the mirror, and the nvroot for the new device. If the guid identifies
+ * a device that is not mirrored, we automatically insert the mirror vdev.
+ *
+ * If 'replacing' is specified, the new device is intended to replace the
+ * existing device; in this case the two devices are made into their own
+ * mirror using the 'replacing' vdev, which is functionally identical to
+ * the mirror vdev (it actually reuses all the same ops) but has a few
+ * extra rules: you can't attach to it after it's been created, and upon
+ * completion of resilvering, the first disk (the one being replaced)
+ * is automatically detached.
+ *
+ * If 'rebuild' is specified, then sequential reconstruction (a.k.a. rebuild)
+ * should be performed instead of traditional healing reconstruction. From
+ * an administrator's perspective these are both resilver operations.
+ */
+int
+spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing,
+ int rebuild)
+{
+ uint64_t txg, dtl_max_txg;
+ vdev_t *rvd = spa->spa_root_vdev;
+ vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd;
+ vdev_ops_t *pvops;
+ char *oldvdpath, *newvdpath;
+ int newvd_isspare;
+ int error;
+
+ ASSERT(spa_writeable(spa));
+
+ txg = spa_vdev_enter(spa);
+
+ oldvd = spa_lookup_by_guid(spa, guid, B_FALSE);
+
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+ if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
+ error = (spa_has_checkpoint(spa)) ?
+ ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT;
+ return (spa_vdev_exit(spa, NULL, txg, error));
+ }
+
+ if (rebuild) {
+ if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REBUILD))
+ return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
+
+ if (dsl_scan_resilvering(spa_get_dsl(spa)))
+ return (spa_vdev_exit(spa, NULL, txg,
+ ZFS_ERR_RESILVER_IN_PROGRESS));
+ } else {
+ if (vdev_rebuild_active(rvd))
+ return (spa_vdev_exit(spa, NULL, txg,
+ ZFS_ERR_REBUILD_IN_PROGRESS));
+ }
+
+ if (spa->spa_vdev_removal != NULL)
+ return (spa_vdev_exit(spa, NULL, txg, EBUSY));
+
+ if (oldvd == NULL)
+ return (spa_vdev_exit(spa, NULL, txg, ENODEV));
+
+ if (!oldvd->vdev_ops->vdev_op_leaf)
+ return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
+
+ pvd = oldvd->vdev_parent;
+
+ if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0,
+ VDEV_ALLOC_ATTACH)) != 0)
+ return (spa_vdev_exit(spa, NULL, txg, EINVAL));
+
+ if (newrootvd->vdev_children != 1)
+ return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));
+
+ newvd = newrootvd->vdev_child[0];
+
+ if (!newvd->vdev_ops->vdev_op_leaf)
+ return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));
+
+ if ((error = vdev_create(newrootvd, txg, replacing)) != 0)
+ return (spa_vdev_exit(spa, newrootvd, txg, error));
+
+ /*
+ * Spares can't replace logs
+ */
+ if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare)
+ return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
+
+ /*
+ * A dRAID spare can only replace a child of its parent dRAID vdev.
+ */
+ if (newvd->vdev_ops == &vdev_draid_spare_ops &&
+ oldvd->vdev_top != vdev_draid_spare_get_parent(newvd)) {
+ return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
+ }
+
+ if (rebuild) {
+ /*
+ * For rebuilds, the top vdev must support reconstruction
+ * using only space maps. This means the only allowable
+ * vdev types are the root vdev, a mirror, or dRAID.
+ */
+ tvd = pvd;
+ if (pvd->vdev_top != NULL)
+ tvd = pvd->vdev_top;
+
+ if (tvd->vdev_ops != &vdev_mirror_ops &&
+ tvd->vdev_ops != &vdev_root_ops &&
+ tvd->vdev_ops != &vdev_draid_ops) {
+ return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
+ }
+ }
+
+ if (!replacing) {
+ /*
+ * For attach, the only allowable parent is a mirror or the root
+ * vdev.
+ */
+ if (pvd->vdev_ops != &vdev_mirror_ops &&
+ pvd->vdev_ops != &vdev_root_ops)
+ return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
+
+ pvops = &vdev_mirror_ops;
+ } else {
+ /*
+ * Active hot spares can only be replaced by inactive hot
+ * spares.
+ */
+ if (pvd->vdev_ops == &vdev_spare_ops &&
+ oldvd->vdev_isspare &&
+ !spa_has_spare(spa, newvd->vdev_guid))
+ return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
+
+ /*
+ * If the source is a hot spare, and the parent isn't already a
+ * spare, then we want to create a new hot spare. Otherwise, we
+ * want to create a replacing vdev. The user is not allowed to
+ * attach to a spared vdev child unless the 'isspare' state is
+ * the same (spare replaces spare, non-spare replaces
+ * non-spare).
+ */
+ if (pvd->vdev_ops == &vdev_replacing_ops &&
+ spa_version(spa) < SPA_VERSION_MULTI_REPLACE) {
+ return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
+ } else if (pvd->vdev_ops == &vdev_spare_ops &&
+ newvd->vdev_isspare != oldvd->vdev_isspare) {
+ return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
+ }
+
+ if (newvd->vdev_isspare)
+ pvops = &vdev_spare_ops;
+ else
+ pvops = &vdev_replacing_ops;
+ }
+
+ /*
+ * Make sure the new device is big enough.
+ */
+ if (newvd->vdev_asize < vdev_get_min_asize(oldvd))
+ return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW));
+
+ /*
+ * The new device cannot have a higher alignment requirement
+ * than the top-level vdev.
+ */
+ if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift)
+ return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
+
+ /*
+ * If this is an in-place replacement, update oldvd's path and devid
+ * to make it distinguishable from newvd, and unopenable from now on.
+ */
+ if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) {
+ spa_strfree(oldvd->vdev_path);
+ oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5,
+ KM_SLEEP);
+ (void) snprintf(oldvd->vdev_path, strlen(newvd->vdev_path) + 5,
+ "%s/%s", newvd->vdev_path, "old");
+ if (oldvd->vdev_devid != NULL) {
+ spa_strfree(oldvd->vdev_devid);
+ oldvd->vdev_devid = NULL;
+ }
+ }
+
+ /*
+ * If the parent is not a mirror, or if we're replacing, insert the new
+ * mirror/replacing/spare vdev above oldvd.
+ */
+ if (pvd->vdev_ops != pvops)
+ pvd = vdev_add_parent(oldvd, pvops);
+
+ ASSERT(pvd->vdev_top->vdev_parent == rvd);
+ ASSERT(pvd->vdev_ops == pvops);
+ ASSERT(oldvd->vdev_parent == pvd);
+
+ /*
+ * Extract the new device from its root and add it to pvd.
+ */
+ vdev_remove_child(newrootvd, newvd);
+ newvd->vdev_id = pvd->vdev_children;
+ newvd->vdev_crtxg = oldvd->vdev_crtxg;
+ vdev_add_child(pvd, newvd);
+
+ /*
+ * Reevaluate the parent vdev state.
+ */
+ vdev_propagate_state(pvd);
+
+ tvd = newvd->vdev_top;
+ ASSERT(pvd->vdev_top == tvd);
+ ASSERT(tvd->vdev_parent == rvd);
+
+ vdev_config_dirty(tvd);
+
+ /*
+ * Set newvd's DTL to [TXG_INITIAL, dtl_max_txg) so that we account
+ * for any dmu_sync-ed blocks. It will propagate upward when
+ * spa_vdev_exit() calls vdev_dtl_reassess().
+ */
+ dtl_max_txg = txg + TXG_CONCURRENT_STATES;
+
+ vdev_dtl_dirty(newvd, DTL_MISSING,
+ TXG_INITIAL, dtl_max_txg - TXG_INITIAL);
+
+ if (newvd->vdev_isspare) {
+ spa_spare_activate(newvd);
+ spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_SPARE);
+ }
+
+ oldvdpath = spa_strdup(oldvd->vdev_path);
+ newvdpath = spa_strdup(newvd->vdev_path);
+ newvd_isspare = newvd->vdev_isspare;
+
+ /*
+ * Mark newvd's DTL dirty in this txg.
+ */
+ vdev_dirty(tvd, VDD_DTL, newvd, txg);
+
+ /*
+ * Schedule the resilver or rebuild to restart in the future. We do
+ * this to ensure that dmu_sync-ed blocks have been stitched into the
+ * respective datasets.
+ */
+ if (rebuild) {
+ newvd->vdev_rebuild_txg = txg;
+
+ vdev_rebuild(tvd);
+ } else {
+ newvd->vdev_resilver_txg = txg;
+
+ if (dsl_scan_resilvering(spa_get_dsl(spa)) &&
+ spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER)) {
+ vdev_defer_resilver(newvd);
+ } else {
+ dsl_scan_restart_resilver(spa->spa_dsl_pool,
+ dtl_max_txg);
+ }
+ }
+
+ if (spa->spa_bootfs)
+ spa_event_notify(spa, newvd, NULL, ESC_ZFS_BOOTFS_VDEV_ATTACH);
+
+ spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_ATTACH);
+
+ /*
+ * Commit the config
+ */
+ (void) spa_vdev_exit(spa, newrootvd, dtl_max_txg, 0);
+
+ spa_history_log_internal(spa, "vdev attach", NULL,
+ "%s vdev=%s %s vdev=%s",
+ replacing && newvd_isspare ? "spare in" :
+ replacing ? "replace" : "attach", newvdpath,
+ replacing ? "for" : "to", oldvdpath);
+
+ spa_strfree(oldvdpath);
+ spa_strfree(newvdpath);
+
+ return (0);
+}
+
+/*
+ * Detach a device from a mirror or replacing vdev.
+ *
+ * If 'replace_done' is specified, only detach if the parent
+ * is a replacing vdev.
+ */
+int
+spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
+{
+ uint64_t txg;
+ int error;
+ vdev_t *rvd __maybe_unused = spa->spa_root_vdev;
+ vdev_t *vd, *pvd, *cvd, *tvd;
+ boolean_t unspare = B_FALSE;
+ uint64_t unspare_guid = 0;
+ char *vdpath;
+
+ ASSERT(spa_writeable(spa));
+
+ txg = spa_vdev_detach_enter(spa, guid);
+
+ vd = spa_lookup_by_guid(spa, guid, B_FALSE);
+
+ /*
+ * Besides being called directly from the userland through the
+ * ioctl interface, spa_vdev_detach() can be potentially called
+ * at the end of spa_vdev_resilver_done().
+ *
+ * In the regular case, when we have a checkpoint this shouldn't
+ * happen as we never empty the DTLs of a vdev during the scrub
+ * [see comment in dsl_scan_done()]. Thus spa_vdev_resilver_done()
+ * should never get here when we have a checkpoint.
+ *
+ * That said, even in a case when we checkpoint the pool exactly
+ * as spa_vdev_resilver_done() calls this function everything
+ * should be fine as the resilver will return right away.
+ */
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+ if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
+ error = (spa_has_checkpoint(spa)) ?
+ ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT;
+ return (spa_vdev_exit(spa, NULL, txg, error));
+ }
+
+ if (vd == NULL)
+ return (spa_vdev_exit(spa, NULL, txg, ENODEV));
+
+ if (!vd->vdev_ops->vdev_op_leaf)
+ return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
+
+ pvd = vd->vdev_parent;
+
+ /*
+ * If the parent/child relationship is not as expected, don't do it.
+ * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing
+ * vdev that's replacing B with C. The user's intent in replacing
+ * is to go from M(A,B) to M(A,C). If the user decides to cancel
+ * the replace by detaching C, the expected behavior is to end up
+ * M(A,B). But suppose that right after deciding to detach C,
+ * the replacement of B completes. We would have M(A,C), and then
+ * ask to detach C, which would leave us with just A -- not what
+ * the user wanted. To prevent this, we make sure that the
+ * parent/child relationship hasn't changed -- in this example,
+ * that C's parent is still the replacing vdev R.
+ */
+ if (pvd->vdev_guid != pguid && pguid != 0)
+ return (spa_vdev_exit(spa, NULL, txg, EBUSY));
+
+ /*
+ * Only 'replacing' or 'spare' vdevs can be replaced.
+ */
+ if (replace_done && pvd->vdev_ops != &vdev_replacing_ops &&
+ pvd->vdev_ops != &vdev_spare_ops)
+ return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
+
+ ASSERT(pvd->vdev_ops != &vdev_spare_ops ||
+ spa_version(spa) >= SPA_VERSION_SPARES);
+
+ /*
+ * Only mirror, replacing, and spare vdevs support detach.
+ */
+ if (pvd->vdev_ops != &vdev_replacing_ops &&
+ pvd->vdev_ops != &vdev_mirror_ops &&
+ pvd->vdev_ops != &vdev_spare_ops)
+ return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
+
+ /*
+ * If this device has the only valid copy of some data,
+ * we cannot safely detach it.
+ */
+ if (vdev_dtl_required(vd))
+ return (spa_vdev_exit(spa, NULL, txg, EBUSY));
+
+ ASSERT(pvd->vdev_children >= 2);
+
+ /*
+ * If we are detaching the second disk from a replacing vdev, then
+ * check to see if we changed the original vdev's path to have "/old"
+ * at the end in spa_vdev_attach(). If so, undo that change now.
+ */
+ if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id > 0 &&
+ vd->vdev_path != NULL) {
+ size_t len = strlen(vd->vdev_path);
+
+ for (int c = 0; c < pvd->vdev_children; c++) {
+ cvd = pvd->vdev_child[c];
+
+ if (cvd == vd || cvd->vdev_path == NULL)
+ continue;
+
+ if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 &&
+ strcmp(cvd->vdev_path + len, "/old") == 0) {
+ spa_strfree(cvd->vdev_path);
+ cvd->vdev_path = spa_strdup(vd->vdev_path);
+ break;
+ }
+ }
+ }
+
+ /*
+ * If we are detaching the original disk from a normal spare, then it
+ * implies that the spare should become a real disk, and be removed
+ * from the active spare list for the pool. dRAID spares on the
+ * other hand are coupled to the pool and thus should never be removed
+ * from the spares list.
+ */
+ if (pvd->vdev_ops == &vdev_spare_ops && vd->vdev_id == 0) {
+ vdev_t *last_cvd = pvd->vdev_child[pvd->vdev_children - 1];
+
+ if (last_cvd->vdev_isspare &&
+ last_cvd->vdev_ops != &vdev_draid_spare_ops) {
+ unspare = B_TRUE;
+ }
+ }
+
+ /*
+ * Erase the disk labels so the disk can be used for other things.
+ * This must be done after all other error cases are handled,
+ * but before we disembowel vd (so we can still do I/O to it).
+ * But if we can't do it, don't treat the error as fatal --
+ * it may be that the unwritability of the disk is the reason
+ * it's being detached!
+ */
+ error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);
+
+ /*
+ * Remove vd from its parent and compact the parent's children.
+ */
+ vdev_remove_child(pvd, vd);
+ vdev_compact_children(pvd);
+
+ /*
+ * Remember one of the remaining children so we can get tvd below.
+ */
+ cvd = pvd->vdev_child[pvd->vdev_children - 1];
+
+ /*
+ * If we need to remove the remaining child from the list of hot spares,
+ * do it now, marking the vdev as no longer a spare in the process.
+ * We must do this before vdev_remove_parent(), because that can
+ * change the GUID if it creates a new toplevel GUID. For a similar
+ * reason, we must remove the spare now, in the same txg as the detach;
+ * otherwise someone could attach a new sibling, change the GUID, and
+ * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail.
+ */
+ if (unspare) {
+ ASSERT(cvd->vdev_isspare);
+ spa_spare_remove(cvd);
+ unspare_guid = cvd->vdev_guid;
+ (void) spa_vdev_remove(spa, unspare_guid, B_TRUE);
+ cvd->vdev_unspare = B_TRUE;
+ }
+
+ /*
+ * If the parent mirror/replacing vdev only has one child,
+ * the parent is no longer needed. Remove it from the tree.
+ */
+ if (pvd->vdev_children == 1) {
+ if (pvd->vdev_ops == &vdev_spare_ops)
+ cvd->vdev_unspare = B_FALSE;
+ vdev_remove_parent(cvd);
+ }
+
+ /*
+ * We don't set tvd until now because the parent we just removed
+ * may have been the previous top-level vdev.
+ */
+ tvd = cvd->vdev_top;
+ ASSERT(tvd->vdev_parent == rvd);
+
+ /*
+ * Reevaluate the parent vdev state.
+ */
+ vdev_propagate_state(cvd);
+
+ /*
+ * If the 'autoexpand' property is set on the pool then automatically
+ * try to expand the size of the pool. For example if the device we
+ * just detached was smaller than the others, it may be possible to
+ * add metaslabs (i.e. grow the pool). We need to reopen the vdev
+ * first so that we can obtain the updated sizes of the leaf vdevs.
+ */
+ if (spa->spa_autoexpand) {
+ vdev_reopen(tvd);
+ vdev_expand(tvd, txg);
+ }
+
+ vdev_config_dirty(tvd);
+
+ /*
+ * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that
+ * vd->vdev_detached is set and free vd's DTL object in syncing context.
+ * But first make sure we're not on any *other* txg's DTL list, to
+ * prevent vd from being accessed after it's freed.
+ */
+ vdpath = spa_strdup(vd->vdev_path ? vd->vdev_path : "none");
+ for (int t = 0; t < TXG_SIZE; t++)
+ (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t);
+ vd->vdev_detached = B_TRUE;
+ vdev_dirty(tvd, VDD_DTL, vd, txg);
+
+ spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE);
+ spa_notify_waiters(spa);
+
+ /* hang on to the spa before we release the lock */
+ spa_open_ref(spa, FTAG);
+
+ error = spa_vdev_exit(spa, vd, txg, 0);
+
+ spa_history_log_internal(spa, "detach", NULL,
+ "vdev=%s", vdpath);
+ spa_strfree(vdpath);
+
+ /*
+ * If this was the removal of the original device in a hot spare vdev,
+ * then we want to go through and remove the device from the hot spare
+ * list of every other pool.
+ */
+ if (unspare) {
+ spa_t *altspa = NULL;
+
+ mutex_enter(&spa_namespace_lock);
+ while ((altspa = spa_next(altspa)) != NULL) {
+ if (altspa->spa_state != POOL_STATE_ACTIVE ||
+ altspa == spa)
+ continue;
+
+ spa_open_ref(altspa, FTAG);
+ mutex_exit(&spa_namespace_lock);
+ (void) spa_vdev_remove(altspa, unspare_guid, B_TRUE);
+ mutex_enter(&spa_namespace_lock);
+ spa_close(altspa, FTAG);
+ }
+ mutex_exit(&spa_namespace_lock);
+
+ /* search the rest of the vdevs for spares to remove */
+ spa_vdev_resilver_done(spa);
+ }
+
+ /* all done with the spa; OK to release */
+ mutex_enter(&spa_namespace_lock);
+ spa_close(spa, FTAG);
+ mutex_exit(&spa_namespace_lock);
+
+ return (error);
+}
+
+static int
+spa_vdev_initialize_impl(spa_t *spa, uint64_t guid, uint64_t cmd_type,
+ list_t *vd_list)
+{
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+ spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
+
+ /* Look up vdev and ensure it's a leaf. */
+ vdev_t *vd = spa_lookup_by_guid(spa, guid, B_FALSE);
+ if (vd == NULL || vd->vdev_detached) {
+ spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
+ return (SET_ERROR(ENODEV));
+ } else if (!vd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(vd)) {
+ spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
+ return (SET_ERROR(EINVAL));
+ } else if (!vdev_writeable(vd)) {
+ spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
+ return (SET_ERROR(EROFS));
+ }
+ mutex_enter(&vd->vdev_initialize_lock);
+ spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
+
+ /*
+ * When we activate an initialize action we check to see
+ * if the vdev_initialize_thread is NULL. We do this instead
+ * of using the vdev_initialize_state since there might be
+ * a previous initialization process which has completed but
+ * the thread has not yet exited.
+ */
+ if (cmd_type == POOL_INITIALIZE_START &&
+ (vd->vdev_initialize_thread != NULL ||
+ vd->vdev_top->vdev_removing)) {
+ mutex_exit(&vd->vdev_initialize_lock);
+ return (SET_ERROR(EBUSY));
+ } else if (cmd_type == POOL_INITIALIZE_CANCEL &&
+ (vd->vdev_initialize_state != VDEV_INITIALIZE_ACTIVE &&
+ vd->vdev_initialize_state != VDEV_INITIALIZE_SUSPENDED)) {
+ mutex_exit(&vd->vdev_initialize_lock);
+ return (SET_ERROR(ESRCH));
+ } else if (cmd_type == POOL_INITIALIZE_SUSPEND &&
+ vd->vdev_initialize_state != VDEV_INITIALIZE_ACTIVE) {
+ mutex_exit(&vd->vdev_initialize_lock);
+ return (SET_ERROR(ESRCH));
+ }
+
+ switch (cmd_type) {
+ case POOL_INITIALIZE_START:
+ vdev_initialize(vd);
+ break;
+ case POOL_INITIALIZE_CANCEL:
+ vdev_initialize_stop(vd, VDEV_INITIALIZE_CANCELED, vd_list);
+ break;
+ case POOL_INITIALIZE_SUSPEND:
+ vdev_initialize_stop(vd, VDEV_INITIALIZE_SUSPENDED, vd_list);
+ break;
+ default:
+ panic("invalid cmd_type %llu", (unsigned long long)cmd_type);
+ }
+ mutex_exit(&vd->vdev_initialize_lock);
+
+ return (0);
+}
+
+int
+spa_vdev_initialize(spa_t *spa, nvlist_t *nv, uint64_t cmd_type,
+ nvlist_t *vdev_errlist)
+{
+ int total_errors = 0;
+ list_t vd_list;
+
+ list_create(&vd_list, sizeof (vdev_t),
+ offsetof(vdev_t, vdev_initialize_node));
+
+ /*
+ * We hold the namespace lock through the whole function
+ * to prevent any changes to the pool while we're starting or
+ * stopping initialization. The config and state locks are held so that
+ * we can properly assess the vdev state before we commit to
+ * the initializing operation.
+ */
+ mutex_enter(&spa_namespace_lock);
+
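+ /*
+ * Start, cancel, or suspend initialization on each requested vdev,
+ * collecting per-vdev errors in 'vdev_errlist'.
+ */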
+ for (nvpair_t *pair = nvlist_next_nvpair(nv, NULL);
+ pair != NULL; pair = nvlist_next_nvpair(nv, pair)) {
+ uint64_t vdev_guid = fnvpair_value_uint64(pair);
+
+ int error = spa_vdev_initialize_impl(spa, vdev_guid, cmd_type,
+ &vd_list);
+ if (error != 0) {
+ char guid_as_str[MAXNAMELEN];
+
+ (void) snprintf(guid_as_str, sizeof (guid_as_str),
+ "%llu", (unsigned long long)vdev_guid);
+ fnvlist_add_int64(vdev_errlist, guid_as_str, error);
+ total_errors++;
+ }
+ }
+
+ /* Wait for all initialize threads to stop. */
+ vdev_initialize_stop_wait(spa, &vd_list);
+
+ /* Sync out the initializing state */
+ txg_wait_synced(spa->spa_dsl_pool, 0);
+ mutex_exit(&spa_namespace_lock);
+
+ list_destroy(&vd_list);
+
+ return (total_errors);
+}
+
+static int
+spa_vdev_trim_impl(spa_t *spa, uint64_t guid, uint64_t cmd_type,
+ uint64_t rate, boolean_t partial, boolean_t secure, list_t *vd_list)
+{
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+ spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
+
+ /* Look up vdev and ensure it's a leaf. */
+ vdev_t *vd = spa_lookup_by_guid(spa, guid, B_FALSE);
+ if (vd == NULL || vd->vdev_detached) {
+ spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
+ return (SET_ERROR(ENODEV));
+ } else if (!vd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(vd)) {
+ spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
+ return (SET_ERROR(EINVAL));
+ } else if (!vdev_writeable(vd)) {
+ spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
+ return (SET_ERROR(EROFS));
+ } else if (!vd->vdev_has_trim) {
+ spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
+ return (SET_ERROR(EOPNOTSUPP));
+ } else if (secure && !vd->vdev_has_securetrim) {
+ spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
+ return (SET_ERROR(EOPNOTSUPP));
+ }
+ mutex_enter(&vd->vdev_trim_lock);
+ spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
+
+ /*
+ * When we activate a TRIM action we check to see if the
+ * vdev_trim_thread is NULL. We do this instead of using the
+ * vdev_trim_state since there might be a previous TRIM process
+ * which has completed but the thread has not yet exited.
+ */
+ if (cmd_type == POOL_TRIM_START &&
+ (vd->vdev_trim_thread != NULL || vd->vdev_top->vdev_removing)) {
+ mutex_exit(&vd->vdev_trim_lock);
+ return (SET_ERROR(EBUSY));
+ } else if (cmd_type == POOL_TRIM_CANCEL &&
+ (vd->vdev_trim_state != VDEV_TRIM_ACTIVE &&
+ vd->vdev_trim_state != VDEV_TRIM_SUSPENDED)) {
+ mutex_exit(&vd->vdev_trim_lock);
+ return (SET_ERROR(ESRCH));
+ } else if (cmd_type == POOL_TRIM_SUSPEND &&
+ vd->vdev_trim_state != VDEV_TRIM_ACTIVE) {
+ mutex_exit(&vd->vdev_trim_lock);
+ return (SET_ERROR(ESRCH));
+ }
+
+ switch (cmd_type) {
+ case POOL_TRIM_START:
+ vdev_trim(vd, rate, partial, secure);
+ break;
+ case POOL_TRIM_CANCEL:
+ vdev_trim_stop(vd, VDEV_TRIM_CANCELED, vd_list);
+ break;
+ case POOL_TRIM_SUSPEND:
+ vdev_trim_stop(vd, VDEV_TRIM_SUSPENDED, vd_list);
+ break;
+ default:
+ panic("invalid cmd_type %llu", (unsigned long long)cmd_type);
+ }
+ mutex_exit(&vd->vdev_trim_lock);
+
+ return (0);
+}
+
+/*
+ * Initiates a manual TRIM for the requested vdevs. This kicks off individual
+ * TRIM threads for each child vdev. These threads pass over all of the free
+ * space in the vdev's metaslabs and issue TRIM commands for that space.
+ */
+int
+spa_vdev_trim(spa_t *spa, nvlist_t *nv, uint64_t cmd_type, uint64_t rate,
+ boolean_t partial, boolean_t secure, nvlist_t *vdev_errlist)
+{
+ int total_errors = 0;
+ list_t vd_list;
+
+ list_create(&vd_list, sizeof (vdev_t),
+ offsetof(vdev_t, vdev_trim_node));
+
+ /*
+ * We hold the namespace lock through the whole function
+ * to prevent any changes to the pool while we're starting or
+ * stopping TRIM. The config and state locks are held so that
+ * we can properly assess the vdev state before we commit to
+ * the TRIM operation.
+ */
+ mutex_enter(&spa_namespace_lock);
+
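+ /*
+ * Start, cancel, or suspend TRIM on each requested vdev,
+ * collecting per-vdev errors in 'vdev_errlist'.
+ */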
+ for (nvpair_t *pair = nvlist_next_nvpair(nv, NULL);
+ pair != NULL; pair = nvlist_next_nvpair(nv, pair)) {
+ uint64_t vdev_guid = fnvpair_value_uint64(pair);
+
+ int error = spa_vdev_trim_impl(spa, vdev_guid, cmd_type,
+ rate, partial, secure, &vd_list);
+ if (error != 0) {
+ char guid_as_str[MAXNAMELEN];
+
+ (void) snprintf(guid_as_str, sizeof (guid_as_str),
+ "%llu", (unsigned long long)vdev_guid);
+ fnvlist_add_int64(vdev_errlist, guid_as_str, error);
+ total_errors++;
+ }
+ }
+
+ /* Wait for all TRIM threads to stop. */
+ vdev_trim_stop_wait(spa, &vd_list);
+
+ /* Sync out the TRIM state */
+ txg_wait_synced(spa->spa_dsl_pool, 0);
+ mutex_exit(&spa_namespace_lock);
+
+ list_destroy(&vd_list);
+
+ return (total_errors);
+}
+
+/*
+ * Split a set of devices from their mirrors, and create a new pool from them.
+ */
+int
+spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config,
+ nvlist_t *props, boolean_t exp)
+{
+ int error = 0;
+ uint64_t txg, *glist;
+ spa_t *newspa;
+ uint_t c, children, lastlog;
+ nvlist_t **child, *nvl, *tmp;
+ dmu_tx_t *tx;
+ char *altroot = NULL;
+ vdev_t *rvd, **vml = NULL; /* vdev modify list */
+ boolean_t activate_slog;
+
+ ASSERT(spa_writeable(spa));
+
+ txg = spa_vdev_enter(spa);
+
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+ if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
+ error = (spa_has_checkpoint(spa)) ?
+ ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT;
+ return (spa_vdev_exit(spa, NULL, txg, error));
+ }
+
+ /* clear the log and flush everything up to now */
+ activate_slog = spa_passivate_log(spa);
+ (void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
+ error = spa_reset_logs(spa);
+ txg = spa_vdev_config_enter(spa);
+
+ if (activate_slog)
+ spa_activate_log(spa);
+
+ if (error != 0)
+ return (spa_vdev_exit(spa, NULL, txg, error));
+
+ /* check new spa name before going any further */
+ if (spa_lookup(newname) != NULL)
+ return (spa_vdev_exit(spa, NULL, txg, EEXIST));
+
+ /*
+ * scan through all the children to ensure they're all mirrors
+ */
+ if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 ||
+ nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child,
+ &children) != 0)
+ return (spa_vdev_exit(spa, NULL, txg, EINVAL));
+
+ /* first, check to ensure we've got the right child count */
+ rvd = spa->spa_root_vdev;
+ lastlog = 0;
+ for (c = 0; c < rvd->vdev_children; c++) {
+ vdev_t *vd = rvd->vdev_child[c];
+
+ /* don't count the holes & logs as children */
+ if (vd->vdev_islog || (vd->vdev_ops != &vdev_indirect_ops &&
+ !vdev_is_concrete(vd))) {
+ if (lastlog == 0)
+ lastlog = c;
+ continue;
+ }
+
+ lastlog = 0;
+ }
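+ /*
+ * The split request must cover every top-level vdev, excluding any
+ * trailing log or hole vdevs.
+ */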
+ if (children != (lastlog != 0 ? lastlog : rvd->vdev_children))
+ return (spa_vdev_exit(spa, NULL, txg, EINVAL));
+
+ /* next, ensure no spare or cache devices are part of the split */
+ if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_SPARES, &tmp) == 0 ||
+ nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_L2CACHE, &tmp) == 0)
+ return (spa_vdev_exit(spa, NULL, txg, EINVAL));
+
+ vml = kmem_zalloc(children * sizeof (vdev_t *), KM_SLEEP);
+ glist = kmem_zalloc(children * sizeof (uint64_t), KM_SLEEP);
+
+ /* then, loop over each vdev and validate it */
+ for (c = 0; c < children; c++) {
+ uint64_t is_hole = 0;
+
+ (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE,
+ &is_hole);
+
+ if (is_hole != 0) {
+ if (spa->spa_root_vdev->vdev_child[c]->vdev_ishole ||
+ spa->spa_root_vdev->vdev_child[c]->vdev_islog) {
+ continue;
+ } else {
+ error = SET_ERROR(EINVAL);
+ break;
+ }
+ }
+
+ /* deal with indirect vdevs */
+ if (spa->spa_root_vdev->vdev_child[c]->vdev_ops ==
+ &vdev_indirect_ops)
+ continue;
+
+ /* which disk is going to be split? */
+ if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID,
+ &glist[c]) != 0) {
+ error = SET_ERROR(EINVAL);
+ break;
+ }
+
+ /* look it up in the spa */
+ vml[c] = spa_lookup_by_guid(spa, glist[c], B_FALSE);
+ if (vml[c] == NULL) {
+ error = SET_ERROR(ENODEV);
+ break;
+ }
+
+ /* make sure there's nothing stopping the split */
+ if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops ||
+ vml[c]->vdev_islog ||
+ !vdev_is_concrete(vml[c]) ||
+ vml[c]->vdev_isspare ||
+ vml[c]->vdev_isl2cache ||
+ !vdev_writeable(vml[c]) ||
+ vml[c]->vdev_children != 0 ||
+ vml[c]->vdev_state != VDEV_STATE_HEALTHY ||
+ c != spa->spa_root_vdev->vdev_child[c]->vdev_id) {
+ error = SET_ERROR(EINVAL);
+ break;
+ }
+
+ if (vdev_dtl_required(vml[c]) ||
+ vdev_resilver_needed(vml[c], NULL, NULL)) {
+ error = SET_ERROR(EBUSY);
+ break;
+ }
+
+ /* we need certain info from the top level */
+ VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_ARRAY,
+ vml[c]->vdev_top->vdev_ms_array) == 0);
+ VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_SHIFT,
+ vml[c]->vdev_top->vdev_ms_shift) == 0);
+ VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASIZE,
+ vml[c]->vdev_top->vdev_asize) == 0);
+ VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT,
+ vml[c]->vdev_top->vdev_ashift) == 0);
+
+ /* transfer per-vdev ZAPs */
+ ASSERT3U(vml[c]->vdev_leaf_zap, !=, 0);
+ VERIFY0(nvlist_add_uint64(child[c],
+ ZPOOL_CONFIG_VDEV_LEAF_ZAP, vml[c]->vdev_leaf_zap));
+
+ ASSERT3U(vml[c]->vdev_top->vdev_top_zap, !=, 0);
+ VERIFY0(nvlist_add_uint64(child[c],
+ ZPOOL_CONFIG_VDEV_TOP_ZAP,
+ vml[c]->vdev_parent->vdev_top_zap));
+ }
+
+ if (error != 0) {
+ kmem_free(vml, children * sizeof (vdev_t *));
+ kmem_free(glist, children * sizeof (uint64_t));
+ return (spa_vdev_exit(spa, NULL, txg, error));
+ }
+
+ /* stop writers from using the disks */
+ for (c = 0; c < children; c++) {
+ if (vml[c] != NULL)
+ vml[c]->vdev_offline = B_TRUE;
+ }
+ vdev_reopen(spa->spa_root_vdev);
+
+ /*
+ * Temporarily record the splitting vdevs in the spa config. This
+ * will disappear once the config is regenerated.
+ */
+ VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+ VERIFY(nvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST,
+ glist, children) == 0);
+ kmem_free(glist, children * sizeof (uint64_t));
+
+ mutex_enter(&spa->spa_props_lock);
+ VERIFY(nvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT,
+ nvl) == 0);
+ mutex_exit(&spa->spa_props_lock);
+ spa->spa_config_splitting = nvl;
+ vdev_config_dirty(spa->spa_root_vdev);
+
+ /* configure and create the new pool */
+ VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, newname) == 0);
+ VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
+ exp ? POOL_STATE_EXPORTED : POOL_STATE_ACTIVE) == 0);
+ VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VERSION,
+ spa_version(spa)) == 0);
+ VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG,
+ spa->spa_config_txg) == 0);
+ VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID,
+ spa_generate_guid(NULL)) == 0);
+ VERIFY0(nvlist_add_boolean(config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS));
+ (void) nvlist_lookup_string(props,
+ zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
+
+ /* add the new pool to the namespace */
+ newspa = spa_add(newname, config, altroot);
+ newspa->spa_avz_action = AVZ_ACTION_REBUILD;
+ newspa->spa_config_txg = spa->spa_config_txg;
+ spa_set_log_state(newspa, SPA_LOG_CLEAR);
+
+ /* release the spa config lock, retaining the namespace lock */
+ spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
+
+ if (zio_injection_enabled)
+ zio_handle_panic_injection(spa, FTAG, 1);
+
+ spa_activate(newspa, spa_mode_global);
+ spa_async_suspend(newspa);
+
+ /*
+ * Temporarily stop the initializing and TRIM activity. We set the
+ * state to ACTIVE so that we know to resume initializing or TRIM
+ * once the split has completed.
+ */
+ list_t vd_initialize_list;
+ list_create(&vd_initialize_list, sizeof (vdev_t),
+ offsetof(vdev_t, vdev_initialize_node));
+
+ list_t vd_trim_list;
+ list_create(&vd_trim_list, sizeof (vdev_t),
+ offsetof(vdev_t, vdev_trim_node));
+
+ for (c = 0; c < children; c++) {
+ if (vml[c] != NULL && vml[c]->vdev_ops != &vdev_indirect_ops) {
+ mutex_enter(&vml[c]->vdev_initialize_lock);
+ vdev_initialize_stop(vml[c],
+ VDEV_INITIALIZE_ACTIVE, &vd_initialize_list);
+ mutex_exit(&vml[c]->vdev_initialize_lock);
+
+ mutex_enter(&vml[c]->vdev_trim_lock);
+ vdev_trim_stop(vml[c], VDEV_TRIM_ACTIVE, &vd_trim_list);
+ mutex_exit(&vml[c]->vdev_trim_lock);
+ }
+ }
+
+ vdev_initialize_stop_wait(spa, &vd_initialize_list);
+ vdev_trim_stop_wait(spa, &vd_trim_list);
+
+ list_destroy(&vd_initialize_list);
+ list_destroy(&vd_trim_list);
+
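+ /*
+ * Note that this config came from a split and that a split is in
+ * progress before loading the new pool.
+ */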
+ newspa->spa_config_source = SPA_CONFIG_SRC_SPLIT;
+ newspa->spa_is_splitting = B_TRUE;
+
+ /* create the new pool from the disks of the original pool */
+ error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE);
+ if (error)
+ goto out;
+
+ /* if that worked, generate a real config for the new pool */
+ if (newspa->spa_root_vdev != NULL) {
+ VERIFY(nvlist_alloc(&newspa->spa_config_splitting,
+ NV_UNIQUE_NAME, KM_SLEEP) == 0);
+ VERIFY(nvlist_add_uint64(newspa->spa_config_splitting,
+ ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa)) == 0);
+ spa_config_set(newspa, spa_config_generate(newspa, NULL, -1ULL,
+ B_TRUE));
+ }
+
+ /* set the props */
+ if (props != NULL) {
+ spa_configfile_set(newspa, props, B_FALSE);
+ error = spa_prop_set(newspa, props);
+ if (error)
+ goto out;
+ }
+
+ /* flush everything */
+ txg = spa_vdev_config_enter(newspa);
+ vdev_config_dirty(newspa->spa_root_vdev);
+ (void) spa_vdev_config_exit(newspa, NULL, txg, 0, FTAG);
+
+ if (zio_injection_enabled)
+ zio_handle_panic_injection(spa, FTAG, 2);
+
+ spa_async_resume(newspa);
+
+ /* finally, update the original pool's config */
+ txg = spa_vdev_config_enter(spa);
+ tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error != 0)
+ dmu_tx_abort(tx);
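+ /*
+ * Detach each split leaf from the original pool's vdev tree and
+ * free its in-core state.
+ */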
+ for (c = 0; c < children; c++) {
+ if (vml[c] != NULL && vml[c]->vdev_ops != &vdev_indirect_ops) {
+ vdev_t *tvd = vml[c]->vdev_top;
+
+ /*
+ * Need to be sure the detachable VDEV is not
+ * on any *other* txg's DTL list to prevent it
+ * from being accessed after it's freed.
+ */
+ for (int t = 0; t < TXG_SIZE; t++) {
+ (void) txg_list_remove_this(
+ &tvd->vdev_dtl_list, vml[c], t);
+ }
+
+ vdev_split(vml[c]);
+ if (error == 0)
+ spa_history_log_internal(spa, "detach", tx,
+ "vdev=%s", vml[c]->vdev_path);
+
+ vdev_free(vml[c]);
+ }
+ }
+ spa->spa_avz_action = AVZ_ACTION_REBUILD;
+ vdev_config_dirty(spa->spa_root_vdev);
+ spa->spa_config_splitting = NULL;
+ nvlist_free(nvl);
+ if (error == 0)
+ dmu_tx_commit(tx);
+ (void) spa_vdev_exit(spa, NULL, txg, 0);
+
+ if (zio_injection_enabled)
+ zio_handle_panic_injection(spa, FTAG, 3);
+
+ /* split is complete; log a history record */
+ spa_history_log_internal(newspa, "split", NULL,
+ "from pool %s", spa_name(spa));
+
+ newspa->spa_is_splitting = B_FALSE;
+ kmem_free(vml, children * sizeof (vdev_t *));
+
+ /* if we're not going to mount the filesystems in userland, export */
+ if (exp)
+ error = spa_export_common(newname, POOL_STATE_EXPORTED, NULL,
+ B_FALSE, B_FALSE);
+
+ return (error);
+
+out:
+ spa_unload(newspa);
+ spa_deactivate(newspa);
+ spa_remove(newspa);
+
+ txg = spa_vdev_config_enter(spa);
+
+ /* re-online all offlined disks */
+ for (c = 0; c < children; c++) {
+ if (vml[c] != NULL)
+ vml[c]->vdev_offline = B_FALSE;
+ }
+
+ /* restart initializing or trimming disks as necessary */
+ spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART);
+ spa_async_request(spa, SPA_ASYNC_TRIM_RESTART);
+ spa_async_request(spa, SPA_ASYNC_AUTOTRIM_RESTART);
+
+ vdev_reopen(spa->spa_root_vdev);
+
+ nvlist_free(spa->spa_config_splitting);
+ spa->spa_config_splitting = NULL;
+ (void) spa_vdev_exit(spa, NULL, txg, error);
+
+ kmem_free(vml, children * sizeof (vdev_t *));
+ return (error);
+}
+
+/*
+ * Find any device that's done replacing, or a vdev marked 'unspare' that's
+ * currently spared, so we can detach it.
+ */
+static vdev_t *
+spa_vdev_resilver_done_hunt(vdev_t *vd)
+{
+ vdev_t *newvd, *oldvd;
+
+ for (int c = 0; c < vd->vdev_children; c++) {
+ oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]);
+ if (oldvd != NULL)
+ return (oldvd);
+ }
+
+ /*
+ * Check for a completed replacement. We always consider the first
+ * vdev in the list to be the oldest vdev, and the last one to be
+ * the newest (see spa_vdev_attach() for how that works). In
+ * the case where the newest vdev is faulted, we will not automatically
+ * remove it after a resilver completes. This is OK as it will require
+ * user intervention to determine which disk the admin wishes to keep.
+ */
+ if (vd->vdev_ops == &vdev_replacing_ops) {
+ ASSERT(vd->vdev_children > 1);
+
+ newvd = vd->vdev_child[vd->vdev_children - 1];
+ oldvd = vd->vdev_child[0];
+
+ if (vdev_dtl_empty(newvd, DTL_MISSING) &&
+ vdev_dtl_empty(newvd, DTL_OUTAGE) &&
+ !vdev_dtl_required(oldvd))
+ return (oldvd);
+ }
+
+ /*
+ * Check for a completed resilver with the 'unspare' flag set.
+ * Also potentially update faulted state.
+ */
+ if (vd->vdev_ops == &vdev_spare_ops) {
+ vdev_t *first = vd->vdev_child[0];
+ vdev_t *last = vd->vdev_child[vd->vdev_children - 1];
+
+ if (last->vdev_unspare) {
+ oldvd = first;
+ newvd = last;
+ } else if (first->vdev_unspare) {
+ oldvd = last;
+ newvd = first;
+ } else {
+ oldvd = NULL;
+ }
+
+ if (oldvd != NULL &&
+ vdev_dtl_empty(newvd, DTL_MISSING) &&
+ vdev_dtl_empty(newvd, DTL_OUTAGE) &&
+ !vdev_dtl_required(oldvd))
+ return (oldvd);
+
+ vdev_propagate_state(vd);
+
+ /*
+ * If there are more than two spares attached to a disk,
+ * and those spares are not required, then we want to
+ * attempt to free them up now so that they can be used
+ * by other pools. Once we're back down to a single
+ * disk+spare, we stop removing them.
+ */
+ if (vd->vdev_children > 2) {
+ newvd = vd->vdev_child[1];
+
+ if (newvd->vdev_isspare && last->vdev_isspare &&
+ vdev_dtl_empty(last, DTL_MISSING) &&
+ vdev_dtl_empty(last, DTL_OUTAGE) &&
+ !vdev_dtl_required(newvd))
+ return (newvd);
+ }
+ }
+
+ return (NULL);
+}
+
+static void
+spa_vdev_resilver_done(spa_t *spa)
+{
+ vdev_t *vd, *pvd, *ppvd;
+ uint64_t guid, sguid, pguid, ppguid;
+
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+
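+ /*
+ * Repeatedly find and detach devices that have finished replacing,
+ * dropping the config lock around each detach.
+ */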
+ while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) {
+ pvd = vd->vdev_parent;
+ ppvd = pvd->vdev_parent;
+ guid = vd->vdev_guid;
+ pguid = pvd->vdev_guid;
+ ppguid = ppvd->vdev_guid;
+ sguid = 0;
+ /*
+ * If we have just finished replacing a hot spared device, then
+ * we need to detach the parent's first child (the original hot
+ * spare) as well.
+ */
+ if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0 &&
+ ppvd->vdev_children == 2) {
+ ASSERT(pvd->vdev_ops == &vdev_replacing_ops);
+ sguid = ppvd->vdev_child[1]->vdev_guid;
+ }
+ ASSERT(vd->vdev_resilver_txg == 0 || !vdev_dtl_required(vd));
+
+ spa_config_exit(spa, SCL_ALL, FTAG);
+ if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0)
+ return;
+ if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0)
+ return;
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+ }
+
+ spa_config_exit(spa, SCL_ALL, FTAG);
+
+ /*
+ * If a detach was not performed above, replace waiters will not have
+ * been notified; in that case we must do so now.
+ */
+ spa_notify_waiters(spa);
+}
+
+/*
+ * Update the stored path or FRU for this vdev.
+ */
+static int
+spa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value,
+ boolean_t ispath)
+{
+ vdev_t *vd;
+ boolean_t sync = B_FALSE;
+
+ ASSERT(spa_writeable(spa));
+
+ spa_vdev_state_enter(spa, SCL_ALL);
+
+ if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
+ return (spa_vdev_state_exit(spa, NULL, ENOENT));
+
+ if (!vd->vdev_ops->vdev_op_leaf)
+ return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
+
+ if (ispath) {
+ if (strcmp(value, vd->vdev_path) != 0) {
+ spa_strfree(vd->vdev_path);
+ vd->vdev_path = spa_strdup(value);
+ sync = B_TRUE;
+ }
+ } else {
+ if (vd->vdev_fru == NULL) {
+ vd->vdev_fru = spa_strdup(value);
+ sync = B_TRUE;
+ } else if (strcmp(value, vd->vdev_fru) != 0) {
+ spa_strfree(vd->vdev_fru);
+ vd->vdev_fru = spa_strdup(value);
+ sync = B_TRUE;
+ }
+ }
+
+ return (spa_vdev_state_exit(spa, sync ? vd : NULL, 0));
+}
+
+int
+spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath)
+{
+ return (spa_vdev_set_common(spa, guid, newpath, B_TRUE));
+}
+
+int
+spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru)
+{
+ return (spa_vdev_set_common(spa, guid, newfru, B_FALSE));
+}
+
+/*
+ * ==========================================================================
+ * SPA Scanning
+ * ==========================================================================
+ */
+int
+spa_scrub_pause_resume(spa_t *spa, pool_scrub_cmd_t cmd)
+{
+ ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
+
+ if (dsl_scan_resilvering(spa->spa_dsl_pool))
+ return (SET_ERROR(EBUSY));
+
+ return (dsl_scrub_set_pause_resume(spa->spa_dsl_pool, cmd));
+}
+
+int
+spa_scan_stop(spa_t *spa)
+{
+ ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
+ if (dsl_scan_resilvering(spa->spa_dsl_pool))
+ return (SET_ERROR(EBUSY));
+ return (dsl_scan_cancel(spa->spa_dsl_pool));
+}
+
+int
+spa_scan(spa_t *spa, pool_scan_func_t func)
+{
+ ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
+
+ if (func >= POOL_SCAN_FUNCS || func == POOL_SCAN_NONE)
+ return (SET_ERROR(ENOTSUP));
+
+ if (func == POOL_SCAN_RESILVER &&
+ !spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER))
+ return (SET_ERROR(ENOTSUP));
+
+ /*
+ * If a resilver was requested, but there is no DTL on a
+ * writeable leaf device, we have nothing to do.
+ */
+ if (func == POOL_SCAN_RESILVER &&
+ !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) {
+ spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
+ return (0);
+ }
+
+ return (dsl_scan(spa->spa_dsl_pool, func));
+}
+
+/*
+ * ==========================================================================
+ * SPA async task processing
+ * ==========================================================================
+ */
+
+static void
+spa_async_remove(spa_t *spa, vdev_t *vd)
+{
+ if (vd->vdev_remove_wanted) {
+ vd->vdev_remove_wanted = B_FALSE;
+ vd->vdev_delayed_close = B_FALSE;
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE);
+
+ /*
+ * We want to clear the stats, but we don't want to do a full
+ * vdev_clear() as that will cause us to throw away
+ * degraded/faulted state as well as attempt to reopen the
+ * device, all of which is a waste.
+ */
+ vd->vdev_stat.vs_read_errors = 0;
+ vd->vdev_stat.vs_write_errors = 0;
+ vd->vdev_stat.vs_checksum_errors = 0;
+
+ vdev_state_dirty(vd->vdev_top);
+
+ /* Tell userspace that the vdev is gone. */
+ zfs_post_remove(spa, vd);
+ }
+
+ for (int c = 0; c < vd->vdev_children; c++)
+ spa_async_remove(spa, vd->vdev_child[c]);
+}
+
+static void
+spa_async_probe(spa_t *spa, vdev_t *vd)
+{
+ if (vd->vdev_probe_wanted) {
+ vd->vdev_probe_wanted = B_FALSE;
+ vdev_reopen(vd); /* vdev_open() does the actual probe */
+ }
+
+ for (int c = 0; c < vd->vdev_children; c++)
+ spa_async_probe(spa, vd->vdev_child[c]);
+}
+
+static void
+spa_async_autoexpand(spa_t *spa, vdev_t *vd)
+{
+ if (!spa->spa_autoexpand)
+ return;
+
+ for (int c = 0; c < vd->vdev_children; c++) {
+ vdev_t *cvd = vd->vdev_child[c];
+ spa_async_autoexpand(spa, cvd);
+ }
+
+ if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_physpath == NULL)
+ return;
+
+ spa_event_notify(vd->vdev_spa, vd, NULL, ESC_ZFS_VDEV_AUTOEXPAND);
+}
+
+static void
+spa_async_thread(void *arg)
+{
+ spa_t *spa = (spa_t *)arg;
+ dsl_pool_t *dp = spa->spa_dsl_pool;
+ int tasks;
+
+ ASSERT(spa->spa_sync_on);
+
+ mutex_enter(&spa->spa_async_lock);
+ tasks = spa->spa_async_tasks;
+ spa->spa_async_tasks = 0;
+ mutex_exit(&spa->spa_async_lock);
+
+ /*
+ * See if the config needs to be updated.
+ */
+ if (tasks & SPA_ASYNC_CONFIG_UPDATE) {
+ uint64_t old_space, new_space;
+
+ mutex_enter(&spa_namespace_lock);
+ old_space = metaslab_class_get_space(spa_normal_class(spa));
+ old_space += metaslab_class_get_space(spa_special_class(spa));
+ old_space += metaslab_class_get_space(spa_dedup_class(spa));
+ old_space += metaslab_class_get_space(
+ spa_embedded_log_class(spa));
+
+ spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
+
+ new_space = metaslab_class_get_space(spa_normal_class(spa));
+ new_space += metaslab_class_get_space(spa_special_class(spa));
+ new_space += metaslab_class_get_space(spa_dedup_class(spa));
+ new_space += metaslab_class_get_space(
+ spa_embedded_log_class(spa));
+ mutex_exit(&spa_namespace_lock);
+
+ /*
+ * If the pool grew as a result of the config update,
+ * then log an internal history event.
+ */
+ if (new_space != old_space) {
+ spa_history_log_internal(spa, "vdev online", NULL,
+ "pool '%s' size: %llu(+%llu)",
+ spa_name(spa), (u_longlong_t)new_space,
+ (u_longlong_t)(new_space - old_space));
+ }
+ }
+
+ /*
+ * See if any devices need to be marked REMOVED.
+ */
+ if (tasks & SPA_ASYNC_REMOVE) {
+ spa_vdev_state_enter(spa, SCL_NONE);
+ spa_async_remove(spa, spa->spa_root_vdev);
+ for (int i = 0; i < spa->spa_l2cache.sav_count; i++)
+ spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]);
+ for (int i = 0; i < spa->spa_spares.sav_count; i++)
+ spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]);
+ (void) spa_vdev_state_exit(spa, NULL, 0);
+ }
+
+ if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) {
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+ spa_async_autoexpand(spa, spa->spa_root_vdev);
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+ }
+
+ /*
+ * See if any devices need to be probed.
+ */
+ if (tasks & SPA_ASYNC_PROBE) {
+ spa_vdev_state_enter(spa, SCL_NONE);
+ spa_async_probe(spa, spa->spa_root_vdev);
+ (void) spa_vdev_state_exit(spa, NULL, 0);
+ }
+
+ /*
+ * If any devices are done replacing, detach them.
+ */
+ if (tasks & SPA_ASYNC_RESILVER_DONE ||
+ tasks & SPA_ASYNC_REBUILD_DONE) {
+ spa_vdev_resilver_done(spa);
+ }
+
+ /*
+ * Kick off a resilver, unless a sequential rebuild is in progress or
+ * a resilver is already running with the resilver_defer feature
+ * enabled (in which case the new request is deferred).
+ */
+ if (tasks & SPA_ASYNC_RESILVER &&
+ !vdev_rebuild_active(spa->spa_root_vdev) &&
+ (!dsl_scan_resilvering(dp) ||
+ !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_RESILVER_DEFER)))
+ dsl_scan_restart_resilver(dp, 0);
+
+ if (tasks & SPA_ASYNC_INITIALIZE_RESTART) {
+ mutex_enter(&spa_namespace_lock);
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+ vdev_initialize_restart(spa->spa_root_vdev);
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+ mutex_exit(&spa_namespace_lock);
+ }
+
+ if (tasks & SPA_ASYNC_TRIM_RESTART) {
+ mutex_enter(&spa_namespace_lock);
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+ vdev_trim_restart(spa->spa_root_vdev);
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+ mutex_exit(&spa_namespace_lock);
+ }
+
+ if (tasks & SPA_ASYNC_AUTOTRIM_RESTART) {
+ mutex_enter(&spa_namespace_lock);
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+ vdev_autotrim_restart(spa);
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+ mutex_exit(&spa_namespace_lock);
+ }
+
+ /*
+ * Kick off L2 cache whole device TRIM.
+ */
+ if (tasks & SPA_ASYNC_L2CACHE_TRIM) {
+ mutex_enter(&spa_namespace_lock);
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+ vdev_trim_l2arc(spa);
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+ mutex_exit(&spa_namespace_lock);
+ }
+
+ /*
+ * Kick off L2 cache rebuilding.
+ */
+ if (tasks & SPA_ASYNC_L2CACHE_REBUILD) {
+ mutex_enter(&spa_namespace_lock);
+ spa_config_enter(spa, SCL_L2ARC, FTAG, RW_READER);
+ l2arc_spa_rebuild_start(spa);
+ spa_config_exit(spa, SCL_L2ARC, FTAG);
+ mutex_exit(&spa_namespace_lock);
+ }
+
+ /*
+ * Let the world know that we're done.
+ */
+ mutex_enter(&spa->spa_async_lock);
+ spa->spa_async_thread = NULL;
+ cv_broadcast(&spa->spa_async_cv);
+ mutex_exit(&spa->spa_async_lock);
+ thread_exit();
+}
+
+void
+spa_async_suspend(spa_t *spa)
+{
+ mutex_enter(&spa->spa_async_lock);
+ spa->spa_async_suspended++;
+ while (spa->spa_async_thread != NULL)
+ cv_wait(&spa->spa_async_cv, &spa->spa_async_lock);
+ mutex_exit(&spa->spa_async_lock);
+
+ spa_vdev_remove_suspend(spa);
+
+ zthr_t *condense_thread = spa->spa_condense_zthr;
+ if (condense_thread != NULL)
+ zthr_cancel(condense_thread);
+
+ zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr;
+ if (discard_thread != NULL)
+ zthr_cancel(discard_thread);
+
+ zthr_t *ll_delete_thread = spa->spa_livelist_delete_zthr;
+ if (ll_delete_thread != NULL)
+ zthr_cancel(ll_delete_thread);
+
+ zthr_t *ll_condense_thread = spa->spa_livelist_condense_zthr;
+ if (ll_condense_thread != NULL)
+ zthr_cancel(ll_condense_thread);
+}
+
+void
+spa_async_resume(spa_t *spa)
+{
+ mutex_enter(&spa->spa_async_lock);
+ ASSERT(spa->spa_async_suspended != 0);
+ spa->spa_async_suspended--;
+ mutex_exit(&spa->spa_async_lock);
+ spa_restart_removal(spa);
+
+ zthr_t *condense_thread = spa->spa_condense_zthr;
+ if (condense_thread != NULL)
+ zthr_resume(condense_thread);
+
+ zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr;
+ if (discard_thread != NULL)
+ zthr_resume(discard_thread);
+
+ zthr_t *ll_delete_thread = spa->spa_livelist_delete_zthr;
+ if (ll_delete_thread != NULL)
+ zthr_resume(ll_delete_thread);
+
+ zthr_t *ll_condense_thread = spa->spa_livelist_condense_zthr;
+ if (ll_condense_thread != NULL)
+ zthr_resume(ll_condense_thread);
+}
+
+static boolean_t
+spa_async_tasks_pending(spa_t *spa)
+{
+ uint_t non_config_tasks;
+ uint_t config_task;
+ boolean_t config_task_suspended;
+
+ non_config_tasks = spa->spa_async_tasks & ~SPA_ASYNC_CONFIG_UPDATE;
+ config_task = spa->spa_async_tasks & SPA_ASYNC_CONFIG_UPDATE;
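+ /*
+ * After a config cache write failure, suppress retries until
+ * zfs_ccw_retry_interval has elapsed.
+ */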
+ if (spa->spa_ccw_fail_time == 0) {
+ config_task_suspended = B_FALSE;
+ } else {
+ config_task_suspended =
+ (gethrtime() - spa->spa_ccw_fail_time) <
+ ((hrtime_t)zfs_ccw_retry_interval * NANOSEC);
+ }
+
+ return (non_config_tasks || (config_task && !config_task_suspended));
+}
+
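+/*
+ * Dispatch the async thread if work is pending, async processing is not
+ * suspended, and no async thread is already running.
+ */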
+static void
+spa_async_dispatch(spa_t *spa)
+{
+ mutex_enter(&spa->spa_async_lock);
+ if (spa_async_tasks_pending(spa) &&
+ !spa->spa_async_suspended &&
+ spa->spa_async_thread == NULL)
+ spa->spa_async_thread = thread_create(NULL, 0,
+ spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri);
+ mutex_exit(&spa->spa_async_lock);
+}
+
+void
+spa_async_request(spa_t *spa, int task)
+{
+ zfs_dbgmsg("spa=%s async request task=%u", spa->spa_name, task);
+ mutex_enter(&spa->spa_async_lock);
+ spa->spa_async_tasks |= task;
+ mutex_exit(&spa->spa_async_lock);
+}
+
+int
+spa_async_tasks(spa_t *spa)
+{
+ return (spa->spa_async_tasks);
+}
+
+/*
+ * ==========================================================================
+ * SPA syncing routines
+ * ==========================================================================
+ */
+
+
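+/*
+ * Callback helpers for iterating block pointer lists during sync: either
+ * re-enqueue the bp onto a bpobj or issue a free for it.
+ */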
+static int
+bpobj_enqueue_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
+ dmu_tx_t *tx)
+{
+ bpobj_t *bpo = arg;
+ bpobj_enqueue(bpo, bp, bp_freed, tx);
+ return (0);
+}
+
+int
+bpobj_enqueue_alloc_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+{
+ return (bpobj_enqueue_cb(arg, bp, B_FALSE, tx));
+}
+
+int
+bpobj_enqueue_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+{
+ return (bpobj_enqueue_cb(arg, bp, B_TRUE, tx));
+}
+
+static int
+spa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+{
+ zio_t *pio = arg;
+
+ zio_nowait(zio_free_sync(pio, pio->io_spa, dmu_tx_get_txg(tx), bp,
+ pio->io_flags));
+ return (0);
+}
+
+static int
+bpobj_spa_free_sync_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
+ dmu_tx_t *tx)
+{
+ ASSERT(!bp_freed);
+ return (spa_free_sync_cb(arg, bp, tx));
+}
+
+/*
+ * Note: this simple function is not inlined to make it easier to dtrace the
+ * amount of time spent syncing frees.
+ */
+static void
+spa_sync_frees(spa_t *spa, bplist_t *bpl, dmu_tx_t *tx)
+{
+ zio_t *zio = zio_root(spa, NULL, NULL, 0);
+ bplist_iterate(bpl, spa_free_sync_cb, zio, tx);
+ VERIFY(zio_wait(zio) == 0);
+}
+
+/*
+ * Note: this simple function is not inlined to make it easier to dtrace the
+ * amount of time spent syncing deferred frees.
+ */
+static void
+spa_sync_deferred_frees(spa_t *spa, dmu_tx_t *tx)
+{
+ if (spa_sync_pass(spa) != 1)
+ return;
+
+ /*
+ * Note:
+ * If the log space map feature is active, we stop deferring
+ * frees to the next TXG and therefore running this function
+ * would be considered a no-op as spa_deferred_bpobj should
+ * not have any entries.
+ *
+ * That said, we run this function anyway (instead of returning
+ * immediately) for the edge-case scenario where we just
+ * activated the log space map feature in this TXG but still have
+ * deferred frees from the previous TXG.
+ */
+ zio_t *zio = zio_root(spa, NULL, NULL, 0);
+ VERIFY3U(bpobj_iterate(&spa->spa_deferred_bpobj,
+ bpobj_spa_free_sync_cb, zio, tx), ==, 0);
+ VERIFY0(zio_wait(zio));
+}
+
+static void
+spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx)
+{
+ char *packed = NULL;
+ size_t bufsize;
+ size_t nvsize = 0;
+ dmu_buf_t *db;
+
+ VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0);
+
+ /*
+ * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration
+ * information. This avoids the dmu_buf_will_dirty() path and
+ * saves us a pre-read to get data we don't actually care about.
+ */
+ bufsize = P2ROUNDUP((uint64_t)nvsize, SPA_CONFIG_BLOCKSIZE);
+ packed = vmem_alloc(bufsize, KM_SLEEP);
+
+ VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR,
+ KM_SLEEP) == 0);
+ bzero(packed + nvsize, bufsize - nvsize);
+
+ dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx);
+
+ vmem_free(packed, bufsize);
+
+ VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
+ dmu_buf_will_dirty(db, tx);
+ *(uint64_t *)db->db_data = nvsize;
+ dmu_buf_rele(db, FTAG);
+}
+
+static void
+spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx,
+ const char *config, const char *entry)
+{
+ nvlist_t *nvroot;
+ nvlist_t **list;
+ int i;
+
+ if (!sav->sav_sync)
+ return;
+
+ /*
+ * Update the MOS nvlist describing the list of available devices.
+ * spa_validate_aux() will have already made sure this nvlist is
+ * valid and the vdevs are labeled appropriately.
+ */
+ if (sav->sav_object == 0) {
+ sav->sav_object = dmu_object_alloc(spa->spa_meta_objset,
+ DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE,
+ sizeof (uint64_t), tx);
+ VERIFY(zap_update(spa->spa_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1,
+ &sav->sav_object, tx) == 0);
+ }
+
+ VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+ if (sav->sav_count == 0) {
+ VERIFY(nvlist_add_nvlist_array(nvroot, config, NULL, 0) == 0);
+ } else {
+ list = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
+ for (i = 0; i < sav->sav_count; i++)
+ list[i] = vdev_config_generate(spa, sav->sav_vdevs[i],
+ B_FALSE, VDEV_CONFIG_L2CACHE);
+ VERIFY(nvlist_add_nvlist_array(nvroot, config, list,
+ sav->sav_count) == 0);
+ for (i = 0; i < sav->sav_count; i++)
+ nvlist_free(list[i]);
+ kmem_free(list, sav->sav_count * sizeof (void *));
+ }
+
+ spa_sync_nvlist(spa, sav->sav_object, nvroot, tx);
+ nvlist_free(nvroot);
+
+ sav->sav_sync = B_FALSE;
+}
+
+/*
+ * Rebuild spa's all-vdev ZAP from the vdev ZAPs indicated in each vdev_t.
+ * The all-vdev ZAP must be empty.
+ */
+static void
+spa_avz_build(vdev_t *vd, uint64_t avz, dmu_tx_t *tx)
+{
+ spa_t *spa = vd->vdev_spa;
+
+ if (vd->vdev_top_zap != 0) {
+ VERIFY0(zap_add_int(spa->spa_meta_objset, avz,
+ vd->vdev_top_zap, tx));
+ }
+ if (vd->vdev_leaf_zap != 0) {
+ VERIFY0(zap_add_int(spa->spa_meta_objset, avz,
+ vd->vdev_leaf_zap, tx));
+ }
+ for (uint64_t i = 0; i < vd->vdev_children; i++) {
+ spa_avz_build(vd->vdev_child[i], avz, tx);
+ }
+}
+
+static void
+spa_sync_config_object(spa_t *spa, dmu_tx_t *tx)
+{
+ nvlist_t *config;
+
+ /*
+ * If the pool is being imported from a pre-per-vdev-ZAP version of ZFS,
+ * its config may not be dirty but we still need to build per-vdev ZAPs.
+ * Similarly, if the pool is being assembled (e.g. after a split), we
+ * need to rebuild the AVZ although the config may not be dirty.
+ */
+ if (list_is_empty(&spa->spa_config_dirty_list) &&
+ spa->spa_avz_action == AVZ_ACTION_NONE)
+ return;
+
+ spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
+
+ ASSERT(spa->spa_avz_action == AVZ_ACTION_NONE ||
+ spa->spa_avz_action == AVZ_ACTION_INITIALIZE ||
+ spa->spa_all_vdev_zaps != 0);
+
+ if (spa->spa_avz_action == AVZ_ACTION_REBUILD) {
+ /* Make and build the new AVZ */
+ uint64_t new_avz = zap_create(spa->spa_meta_objset,
+ DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx);
+ spa_avz_build(spa->spa_root_vdev, new_avz, tx);
+
+ /* Diff old AVZ with new one */
+ zap_cursor_t zc;
+ zap_attribute_t za;
+
+ for (zap_cursor_init(&zc, spa->spa_meta_objset,
+ spa->spa_all_vdev_zaps);
+ zap_cursor_retrieve(&zc, &za) == 0;
+ zap_cursor_advance(&zc)) {
+ uint64_t vdzap = za.za_first_integer;
+ if (zap_lookup_int(spa->spa_meta_objset, new_avz,
+ vdzap) == ENOENT) {
+ /*
+ * ZAP is listed in old AVZ but not in new one;
+ * destroy it
+ */
+ VERIFY0(zap_destroy(spa->spa_meta_objset, vdzap,
+ tx));
+ }
+ }
+
+ zap_cursor_fini(&zc);
+
+ /* Destroy the old AVZ */
+ VERIFY0(zap_destroy(spa->spa_meta_objset,
+ spa->spa_all_vdev_zaps, tx));
+
+ /* Replace the old AVZ in the dir obj with the new one */
+ VERIFY0(zap_update(spa->spa_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP,
+ sizeof (new_avz), 1, &new_avz, tx));
+
+ spa->spa_all_vdev_zaps = new_avz;
+ } else if (spa->spa_avz_action == AVZ_ACTION_DESTROY) {
+ zap_cursor_t zc;
+ zap_attribute_t za;
+
+ /* Walk through the AVZ and destroy all listed ZAPs */
+ for (zap_cursor_init(&zc, spa->spa_meta_objset,
+ spa->spa_all_vdev_zaps);
+ zap_cursor_retrieve(&zc, &za) == 0;
+ zap_cursor_advance(&zc)) {
+ uint64_t zap = za.za_first_integer;
+ VERIFY0(zap_destroy(spa->spa_meta_objset, zap, tx));
+ }
+
+ zap_cursor_fini(&zc);
+
+ /* Destroy and unlink the AVZ itself */
+ VERIFY0(zap_destroy(spa->spa_meta_objset,
+ spa->spa_all_vdev_zaps, tx));
+ VERIFY0(zap_remove(spa->spa_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP, tx));
+ spa->spa_all_vdev_zaps = 0;
+ }
+
+ if (spa->spa_all_vdev_zaps == 0) {
+ spa->spa_all_vdev_zaps = zap_create_link(spa->spa_meta_objset,
+ DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_VDEV_ZAP_MAP, tx);
+ }
+ spa->spa_avz_action = AVZ_ACTION_NONE;
+
+ /* Create ZAPs for vdevs that don't have them. */
+ vdev_construct_zaps(spa->spa_root_vdev, tx);
+
+ config = spa_config_generate(spa, spa->spa_root_vdev,
+ dmu_tx_get_txg(tx), B_FALSE);
+
+ /*
+ * If we're upgrading the spa version then make sure that
+ * the config object gets updated with the correct version.
+ */
+ if (spa->spa_ubsync.ub_version < spa->spa_uberblock.ub_version)
+ fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION,
+ spa->spa_uberblock.ub_version);
+
+ spa_config_exit(spa, SCL_STATE, FTAG);
+
+ nvlist_free(spa->spa_config_syncing);
+ spa->spa_config_syncing = config;
+
+ spa_sync_nvlist(spa, spa->spa_config_object, config, tx);
+}
+
+static void
+spa_sync_version(void *arg, dmu_tx_t *tx)
+{
+ uint64_t *versionp = arg;
+ uint64_t version = *versionp;
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+
+ /*
+ * Setting the version is special cased when first creating the pool.
+ */
+ ASSERT(tx->tx_txg != TXG_INITIAL);
+
+ ASSERT(SPA_VERSION_IS_SUPPORTED(version));
+ ASSERT(version >= spa_version(spa));
+
+ spa->spa_uberblock.ub_version = version;
+ vdev_config_dirty(spa->spa_root_vdev);
+ spa_history_log_internal(spa, "set", tx, "version=%lld",
+ (longlong_t)version);
+}
+
+/*
+ * Set zpool properties.
+ */
+static void
+spa_sync_props(void *arg, dmu_tx_t *tx)
+{
+ nvlist_t *nvp = arg;
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+ objset_t *mos = spa->spa_meta_objset;
+ nvpair_t *elem = NULL;
+
+ mutex_enter(&spa->spa_props_lock);
+
+ while ((elem = nvlist_next_nvpair(nvp, elem))) {
+ uint64_t intval;
+ char *strval, *fname;
+ zpool_prop_t prop;
+ const char *propname;
+ zprop_type_t proptype;
+ spa_feature_t fid;
+
+ switch (prop = zpool_name_to_prop(nvpair_name(elem))) {
+ case ZPOOL_PROP_INVAL:
+ /*
+ * We checked this earlier in spa_prop_validate().
+ */
+ ASSERT(zpool_prop_feature(nvpair_name(elem)));
+
+ fname = strchr(nvpair_name(elem), '@') + 1;
+ VERIFY0(zfeature_lookup_name(fname, &fid));
+
+ spa_feature_enable(spa, fid, tx);
+ spa_history_log_internal(spa, "set", tx,
+ "%s=enabled", nvpair_name(elem));
+ break;
+
+ case ZPOOL_PROP_VERSION:
+ intval = fnvpair_value_uint64(elem);
+ /*
+ * The version is synced separately before other
+ * properties and should be correct by now.
+ */
+ ASSERT3U(spa_version(spa), >=, intval);
+ break;
+
+ case ZPOOL_PROP_ALTROOT:
+ /*
+ * 'altroot' is a non-persistent property. It should
+ * have been set temporarily at creation or import time.
+ */
+ ASSERT(spa->spa_root != NULL);
+ break;
+
+ case ZPOOL_PROP_READONLY:
+ case ZPOOL_PROP_CACHEFILE:
+ /*
+ * 'readonly' and 'cachefile' are also non-persistent
+ * properties.
+ */
+ break;
+ case ZPOOL_PROP_COMMENT:
+ strval = fnvpair_value_string(elem);
+ if (spa->spa_comment != NULL)
+ spa_strfree(spa->spa_comment);
+ spa->spa_comment = spa_strdup(strval);
+ /*
+ * We need to dirty the configuration on all the vdevs
+ * so that their labels get updated. It's unnecessary
+ * to do this for pool creation since the vdev's
+ * configuration has already been dirtied.
+ */
+ if (tx->tx_txg != TXG_INITIAL)
+ vdev_config_dirty(spa->spa_root_vdev);
+ spa_history_log_internal(spa, "set", tx,
+ "%s=%s", nvpair_name(elem), strval);
+ break;
+ case ZPOOL_PROP_COMPATIBILITY:
+ strval = fnvpair_value_string(elem);
+ if (spa->spa_compatibility != NULL)
+ spa_strfree(spa->spa_compatibility);
+ spa->spa_compatibility = spa_strdup(strval);
+ /*
+ * Dirty the configuration on vdevs as above.
+ */
+ if (tx->tx_txg != TXG_INITIAL)
+ vdev_config_dirty(spa->spa_root_vdev);
+ spa_history_log_internal(spa, "set", tx,
+ "%s=%s", nvpair_name(elem), strval);
+ break;
+
+ default:
+ /*
+ * Set pool property values in the poolprops mos object.
+ */
+ if (spa->spa_pool_props_object == 0) {
+ spa->spa_pool_props_object =
+ zap_create_link(mos, DMU_OT_POOL_PROPS,
+ DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS,
+ tx);
+ }
+
+ /* normalize the property name */
+ propname = zpool_prop_to_name(prop);
+ proptype = zpool_prop_get_type(prop);
+
+ if (nvpair_type(elem) == DATA_TYPE_STRING) {
+ ASSERT(proptype == PROP_TYPE_STRING);
+ strval = fnvpair_value_string(elem);
+ VERIFY0(zap_update(mos,
+ spa->spa_pool_props_object, propname,
+ 1, strlen(strval) + 1, strval, tx));
+ spa_history_log_internal(spa, "set", tx,
+ "%s=%s", nvpair_name(elem), strval);
+ } else if (nvpair_type(elem) == DATA_TYPE_UINT64) {
+ intval = fnvpair_value_uint64(elem);
+
+ if (proptype == PROP_TYPE_INDEX) {
+ const char *unused;
+ VERIFY0(zpool_prop_index_to_string(
+ prop, intval, &unused));
+ }
+ VERIFY0(zap_update(mos,
+ spa->spa_pool_props_object, propname,
+ 8, 1, &intval, tx));
+ spa_history_log_internal(spa, "set", tx,
+ "%s=%lld", nvpair_name(elem),
+ (longlong_t)intval);
+ } else {
+ ASSERT(0); /* not allowed */
+ }
+
+ switch (prop) {
+ case ZPOOL_PROP_DELEGATION:
+ spa->spa_delegation = intval;
+ break;
+ case ZPOOL_PROP_BOOTFS:
+ spa->spa_bootfs = intval;
+ break;
+ case ZPOOL_PROP_FAILUREMODE:
+ spa->spa_failmode = intval;
+ break;
+ case ZPOOL_PROP_AUTOTRIM:
+ spa->spa_autotrim = intval;
+ spa_async_request(spa,
+ SPA_ASYNC_AUTOTRIM_RESTART);
+ break;
+ case ZPOOL_PROP_AUTOEXPAND:
+ spa->spa_autoexpand = intval;
+ if (tx->tx_txg != TXG_INITIAL)
+ spa_async_request(spa,
+ SPA_ASYNC_AUTOEXPAND);
+ break;
+ case ZPOOL_PROP_MULTIHOST:
+ spa->spa_multihost = intval;
+ break;
+ default:
+ break;
+ }
+ }
+
+ }
+
+ mutex_exit(&spa->spa_props_lock);
+}
+
+/*
+ * Perform one-time upgrade on-disk changes. spa_version() does not
+ * reflect the new version this txg, so there must be no changes this
+ * txg to anything that the upgrade code depends on after it executes.
+ * Therefore this must be called after dsl_pool_sync() does the sync
+ * tasks.
+ */
+static void
+spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx)
+{
+ if (spa_sync_pass(spa) != 1)
+ return;
+
+ dsl_pool_t *dp = spa->spa_dsl_pool;
+ rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);
+
+ if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN &&
+ spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) {
+ dsl_pool_create_origin(dp, tx);
+
+ /* Keeping the origin open increases spa_minref */
+ spa->spa_minref += 3;
+ }
+
+ if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES &&
+ spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) {
+ dsl_pool_upgrade_clones(dp, tx);
+ }
+
+ if (spa->spa_ubsync.ub_version < SPA_VERSION_DIR_CLONES &&
+ spa->spa_uberblock.ub_version >= SPA_VERSION_DIR_CLONES) {
+ dsl_pool_upgrade_dir_clones(dp, tx);
+
+ /* Keeping the freedir open increases spa_minref */
+ spa->spa_minref += 3;
+ }
+
+ if (spa->spa_ubsync.ub_version < SPA_VERSION_FEATURES &&
+ spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) {
+ spa_feature_create_zap_objects(spa, tx);
+ }
+
+ /*
+ * The LZ4_COMPRESS feature's behaviour was changed to activate_on_enable
+ * when the ability to use lz4 compression for metadata was added. Old
+ * pools that have this feature enabled must be upgraded to have this
+ * feature active.
+ */
+ if (spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) {
+ boolean_t lz4_en = spa_feature_is_enabled(spa,
+ SPA_FEATURE_LZ4_COMPRESS);
+ boolean_t lz4_ac = spa_feature_is_active(spa,
+ SPA_FEATURE_LZ4_COMPRESS);
+
+ if (lz4_en && !lz4_ac)
+ spa_feature_incr(spa, SPA_FEATURE_LZ4_COMPRESS, tx);
+ }
+
+ /*
+ * If we haven't written the salt, do so now. Note that the
+ * feature may not be activated yet, but that's fine since
+ * the presence of this ZAP entry is backwards compatible.
+ */
+ if (zap_contains(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_CHECKSUM_SALT) == ENOENT) {
+ VERIFY0(zap_add(spa->spa_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CHECKSUM_SALT, 1,
+ sizeof (spa->spa_cksum_salt.zcs_bytes),
+ spa->spa_cksum_salt.zcs_bytes, tx));
+ }
+
+ rrw_exit(&dp->dp_config_rwlock, FTAG);
+}
+
+static void
+vdev_indirect_state_sync_verify(vdev_t *vd)
+{
+ vdev_indirect_mapping_t *vim __maybe_unused = vd->vdev_indirect_mapping;
+ vdev_indirect_births_t *vib __maybe_unused = vd->vdev_indirect_births;
+
+ if (vd->vdev_ops == &vdev_indirect_ops) {
+ ASSERT(vim != NULL);
+ ASSERT(vib != NULL);
+ }
+
+ uint64_t obsolete_sm_object = 0;
+ ASSERT0(vdev_obsolete_sm_object(vd, &obsolete_sm_object));
+ if (obsolete_sm_object != 0) {
+ ASSERT(vd->vdev_obsolete_sm != NULL);
+ ASSERT(vd->vdev_removing ||
+ vd->vdev_ops == &vdev_indirect_ops);
+ ASSERT(vdev_indirect_mapping_num_entries(vim) > 0);
+ ASSERT(vdev_indirect_mapping_bytes_mapped(vim) > 0);
+ ASSERT3U(obsolete_sm_object, ==,
+ space_map_object(vd->vdev_obsolete_sm));
+ ASSERT3U(vdev_indirect_mapping_bytes_mapped(vim), >=,
+ space_map_allocated(vd->vdev_obsolete_sm));
+ }
+ ASSERT(vd->vdev_obsolete_segments != NULL);
+
+ /*
+ * Since frees / remaps to an indirect vdev can only
+ * happen in syncing context, the obsolete segments
+ * tree must be empty when we start syncing.
+ */
+ ASSERT0(range_tree_space(vd->vdev_obsolete_segments));
+}
+
+/*
+ * Set the top-level vdev's max queue depth. Evaluate each top-level's
+ * async write queue depth in case it changed. The max queue depth will
+ * not change in the middle of syncing out this txg.
+ */
+static void
+spa_sync_adjust_vdev_max_queue_depth(spa_t *spa)
+{
+ ASSERT(spa_writeable(spa));
+
+ vdev_t *rvd = spa->spa_root_vdev;
+ uint32_t max_queue_depth = zfs_vdev_async_write_max_active *
+ zfs_vdev_queue_depth_pct / 100;
+ metaslab_class_t *normal = spa_normal_class(spa);
+ metaslab_class_t *special = spa_special_class(spa);
+ metaslab_class_t *dedup = spa_dedup_class(spa);
+
+ uint64_t slots_per_allocator = 0;
+ for (int c = 0; c < rvd->vdev_children; c++) {
+ vdev_t *tvd = rvd->vdev_child[c];
+
+ metaslab_group_t *mg = tvd->vdev_mg;
+ if (mg == NULL || !metaslab_group_initialized(mg))
+ continue;
+
+ metaslab_class_t *mc = mg->mg_class;
+ if (mc != normal && mc != special && mc != dedup)
+ continue;
+
+ /*
+ * It is safe to do a lock-free check here because only async
+ * allocations look at mg_max_alloc_queue_depth, and async
+ * allocations all happen from spa_sync().
+ */
+ for (int i = 0; i < mg->mg_allocators; i++) {
+ ASSERT0(zfs_refcount_count(
+ &(mg->mg_allocator[i].mga_alloc_queue_depth)));
+ }
+ mg->mg_max_alloc_queue_depth = max_queue_depth;
+
+ for (int i = 0; i < mg->mg_allocators; i++) {
+ mg->mg_allocator[i].mga_cur_max_alloc_queue_depth =
+ zfs_vdev_def_queue_depth;
+ }
+ slots_per_allocator += zfs_vdev_def_queue_depth;
+ }
+
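+ /*
+ * Apply the accumulated per-allocator slot budget to the normal,
+ * special, and dedup classes used by the allocation throttle.
+ */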
+ for (int i = 0; i < spa->spa_alloc_count; i++) {
+ ASSERT0(zfs_refcount_count(&normal->mc_allocator[i].
+ mca_alloc_slots));
+ ASSERT0(zfs_refcount_count(&special->mc_allocator[i].
+ mca_alloc_slots));
+ ASSERT0(zfs_refcount_count(&dedup->mc_allocator[i].
+ mca_alloc_slots));
+ normal->mc_allocator[i].mca_alloc_max_slots =
+ slots_per_allocator;
+ special->mc_allocator[i].mca_alloc_max_slots =
+ slots_per_allocator;
+ dedup->mc_allocator[i].mca_alloc_max_slots =
+ slots_per_allocator;
+ }
+ normal->mc_alloc_throttle_enabled = zio_dva_throttle_enabled;
+ special->mc_alloc_throttle_enabled = zio_dva_throttle_enabled;
+ dedup->mc_alloc_throttle_enabled = zio_dva_throttle_enabled;
+}
+
+static void
+spa_sync_condense_indirect(spa_t *spa, dmu_tx_t *tx)
+{
+ ASSERT(spa_writeable(spa));
+
+ vdev_t *rvd = spa->spa_root_vdev;
+ for (int c = 0; c < rvd->vdev_children; c++) {
+ vdev_t *vd = rvd->vdev_child[c];
+ vdev_indirect_state_sync_verify(vd);
+
+ if (vdev_indirect_should_condense(vd)) {
+ spa_condense_indirect_start_sync(vd, tx);
+ break;
+ }
+ }
+}
+
+static void
+spa_sync_iterate_to_convergence(spa_t *spa, dmu_tx_t *tx)
+{
+ objset_t *mos = spa->spa_meta_objset;
+ dsl_pool_t *dp = spa->spa_dsl_pool;
+ uint64_t txg = tx->tx_txg;
+ bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK];
+
+ do {
+ int pass = ++spa->spa_sync_pass;
+
+ spa_sync_config_object(spa, tx);
+ spa_sync_aux_dev(spa, &spa->spa_spares, tx,
+ ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES);
+ spa_sync_aux_dev(spa, &spa->spa_l2cache, tx,
+ ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE);
+ spa_errlog_sync(spa, txg);
+ dsl_pool_sync(dp, txg);
+
+ if (pass < zfs_sync_pass_deferred_free ||
+ spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) {
+ /*
+ * If the log space map feature is active we don't
+ * care about deferred frees and the deferred bpobj
+ * as the log space map should effectively have the
+ * same results (i.e. appending only to one object).
+ */
+ spa_sync_frees(spa, free_bpl, tx);
+ } else {
+ /*
+ * We cannot defer frees in pass 1, because
+ * we sync the deferred frees later in pass 1.
+ */
+ ASSERT3U(pass, >, 1);
+ bplist_iterate(free_bpl, bpobj_enqueue_alloc_cb,
+ &spa->spa_deferred_bpobj, tx);
+ }
+
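+ /*
+ * Sync dedup tables, scan state, device removal progress, and any
+ * one-time upgrades, then flush dirty metaslabs.
+ */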
+ ddt_sync(spa, txg);
+ dsl_scan_sync(dp, tx);
+ svr_sync(spa, tx);
+ spa_sync_upgrades(spa, tx);
+
+ spa_flush_metaslabs(spa, tx);
+
+ vdev_t *vd = NULL;
+ while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, txg))
+ != NULL)
+ vdev_sync(vd, txg);
+
+ /*
+ * Note: We need to check if the MOS is dirty because we could
+ * have marked the MOS dirty without updating the uberblock
+ * (e.g. if we have sync tasks but no dirty user data). We need
+ * to check the uberblock's rootbp because it is updated if we
+ * have synced out dirty data (though in this case the MOS will
+ * most likely also be dirty due to second order effects, we
+ * don't want to rely on that here).
+ */
+ if (pass == 1 &&
+ spa->spa_uberblock.ub_rootbp.blk_birth < txg &&
+ !dmu_objset_is_dirty(mos, txg)) {
+ /*
+ * Nothing changed on the first pass, therefore this
+ * TXG is a no-op. Avoid syncing deferred frees, so
+ * that we can keep this TXG as a no-op.
+ */
+ ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
+ ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
+ ASSERT(txg_list_empty(&dp->dp_sync_tasks, txg));
+ ASSERT(txg_list_empty(&dp->dp_early_sync_tasks, txg));
+ break;
+ }
+
+ spa_sync_deferred_frees(spa, tx);
+ } while (dmu_objset_is_dirty(mos, txg));
+}
+
+/*
+ * Rewrite the vdev configuration (which includes the uberblock) to
+ * commit the transaction group.
+ *
+ * If there are no dirty vdevs, we sync the uberblock to a few random
+ * top-level vdevs that are known to be visible in the config cache
+ * (see spa_vdev_add() for a complete description). If there *are* dirty
+ * vdevs, sync the uberblock to all vdevs.
+ */
+static void
+spa_sync_rewrite_vdev_config(spa_t *spa, dmu_tx_t *tx)
+{
+ vdev_t *rvd = spa->spa_root_vdev;
+ uint64_t txg = tx->tx_txg;
+
+ for (;;) {
+ int error = 0;
+
+ /*
+ * We hold SCL_STATE to prevent vdev open/close/etc.
+ * while we're attempting to write the vdev labels.
+ */
+ spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
+
+ if (list_is_empty(&spa->spa_config_dirty_list)) {
+ vdev_t *svd[SPA_SYNC_MIN_VDEVS] = { NULL };
+ int svdcount = 0;
+ int children = rvd->vdev_children;
+ int c0 = spa_get_random(children);
+
+ for (int c = 0; c < children; c++) {
+ vdev_t *vd =
+ rvd->vdev_child[(c0 + c) % children];
+
+ /* Stop when revisiting the first vdev */
+ if (c > 0 && svd[0] == vd)
+ break;
+
+ if (vd->vdev_ms_array == 0 ||
+ vd->vdev_islog ||
+ !vdev_is_concrete(vd))
+ continue;
+
+ svd[svdcount++] = vd;
+ if (svdcount == SPA_SYNC_MIN_VDEVS)
+ break;
+ }
+ error = vdev_config_sync(svd, svdcount, txg);
+ } else {
+ error = vdev_config_sync(rvd->vdev_child,
+ rvd->vdev_children, txg);
+ }
+
+ if (error == 0)
+ spa->spa_last_synced_guid = rvd->vdev_guid;
+
+ spa_config_exit(spa, SCL_STATE, FTAG);
+
+ if (error == 0)
+ break;
+ zio_suspend(spa, NULL, ZIO_SUSPEND_IOERR);
+ zio_resume_wait(spa);
+ }
+}
+
+/*
+ * Sync the specified transaction group. New blocks may be dirtied as
+ * part of the process, so we iterate until it converges.
+ */
+void
+spa_sync(spa_t *spa, uint64_t txg)
+{
+ vdev_t *vd = NULL;
+
+ VERIFY(spa_writeable(spa));
+
+ /*
+ * Wait for i/os issued in open context that need to complete
+ * before this txg syncs.
+ */
+ (void) zio_wait(spa->spa_txg_zio[txg & TXG_MASK]);
+ spa->spa_txg_zio[txg & TXG_MASK] = zio_root(spa, NULL, NULL,
+ ZIO_FLAG_CANFAIL);
+
+ /*
+ * Lock out configuration changes.
+ */
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+
+ spa->spa_syncing_txg = txg;
+ spa->spa_sync_pass = 0;
+
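+ /*
+ * Verify that the per-allocator allocation queues are empty before
+ * syncing begins.
+ */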
+ for (int i = 0; i < spa->spa_alloc_count; i++) {
+ mutex_enter(&spa->spa_alloc_locks[i]);
+ VERIFY0(avl_numnodes(&spa->spa_alloc_trees[i]));
+ mutex_exit(&spa->spa_alloc_locks[i]);
+ }
+
+ /*
+ * If there are any pending vdev state changes, convert them
+ * into config changes that go out with this transaction group.
+ */
+ spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
+ while (list_head(&spa->spa_state_dirty_list) != NULL) {
+ /*
+ * We need the write lock here because, for aux vdevs,
+ * calling vdev_config_dirty() modifies sav_config.
+ * This is ugly and will become unnecessary when we
+ * eliminate the aux vdev wart by integrating all vdevs
+ * into the root vdev tree.
+ */
+ spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
+ spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER);
+ while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) {
+ vdev_state_clean(vd);
+ vdev_config_dirty(vd);
+ }
+ spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
+ spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
+ }
+ spa_config_exit(spa, SCL_STATE, FTAG);
+
+ dsl_pool_t *dp = spa->spa_dsl_pool;
+ dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg);
+
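+ /*
+ * Record the sync start time and (re)arm the deadman timer so a hung
+ * sync can be detected.
+ */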
+ spa->spa_sync_starttime = gethrtime();
+ taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid);
+ spa->spa_deadman_tqid = taskq_dispatch_delay(system_delay_taskq,
+ spa_deadman, spa, TQ_SLEEP, ddi_get_lbolt() +
+ NSEC_TO_TICK(spa->spa_deadman_synctime));
+
+ /*
+ * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg,
+ * set spa_deflate if we have no raid-z vdevs.
+ */
+ if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE &&
+ spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) {
+ vdev_t *rvd = spa->spa_root_vdev;
+
+ int i;
+ for (i = 0; i < rvd->vdev_children; i++) {
+ vd = rvd->vdev_child[i];
+ if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE)
+ break;
+ }
+ if (i == rvd->vdev_children) {
+ spa->spa_deflate = TRUE;
+ VERIFY0(zap_add(spa->spa_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
+ sizeof (uint64_t), 1, &spa->spa_deflate, tx));
+ }
+ }
+
+ spa_sync_adjust_vdev_max_queue_depth(spa);
+
+ spa_sync_condense_indirect(spa, tx);
+
+ spa_sync_iterate_to_convergence(spa, tx);
+
+#ifdef ZFS_DEBUG
+ if (!list_is_empty(&spa->spa_config_dirty_list)) {
+ /*
+ * Make sure that the number of ZAPs for all the vdevs matches
+ * the number of ZAPs in the per-vdev ZAP list. This only gets
+ * called if the config is dirty; otherwise there may be
+ * outstanding AVZ operations that weren't completed in
+ * spa_sync_config_object.
+ */
+ uint64_t all_vdev_zap_entry_count;
+ ASSERT0(zap_count(spa->spa_meta_objset,
+ spa->spa_all_vdev_zaps, &all_vdev_zap_entry_count));
+ ASSERT3U(vdev_count_verify_zaps(spa->spa_root_vdev), ==,
+ all_vdev_zap_entry_count);
+ }
+#endif
+
+ if (spa->spa_vdev_removal != NULL) {
+ ASSERT0(spa->spa_vdev_removal->svr_bytes_done[txg & TXG_MASK]);
+ }
+
+ spa_sync_rewrite_vdev_config(spa, tx);
+ dmu_tx_commit(tx);
+
+ taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid);
+ spa->spa_deadman_tqid = 0;
+
+ /*
+ * Clear the dirty config list.
+ */
+ while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL)
+ vdev_config_clean(vd);
+
+ /*
+ * Now that the new config has synced transactionally,
+ * let it become visible to the config cache.
+ */
+ if (spa->spa_config_syncing != NULL) {
+ spa_config_set(spa, spa->spa_config_syncing);
+ spa->spa_config_txg = txg;
+ spa->spa_config_syncing = NULL;
+ }
+
+ dsl_pool_sync_done(dp, txg);
+
+ for (int i = 0; i < spa->spa_alloc_count; i++) {
+ mutex_enter(&spa->spa_alloc_locks[i]);
+ VERIFY0(avl_numnodes(&spa->spa_alloc_trees[i]));
+ mutex_exit(&spa->spa_alloc_locks[i]);
+ }
+
+ /*
+ * Update usable space statistics.
+ */
+ while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)))
+ != NULL)
+ vdev_sync_done(vd, txg);
+
+ metaslab_class_evict_old(spa->spa_normal_class, txg);
+ metaslab_class_evict_old(spa->spa_log_class, txg);
+
+ spa_sync_close_syncing_log_sm(spa);
+
+ spa_update_dspace(spa);
+
+ /*
+ * It had better be the case that we didn't dirty anything
+ * since vdev_config_sync().
+ */
+ ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
+ ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
+ ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg));
+
+ while (zfs_pause_spa_sync)
+ delay(1);
+
+ spa->spa_sync_pass = 0;
+
+ /*
+ * Update the last synced uberblock here. We want to do this at
+ * the end of spa_sync() so that consumers of spa_last_synced_txg()
+ * will be guaranteed that all the processing associated with
+ * that txg has been completed.
+ */
+ spa->spa_ubsync = spa->spa_uberblock;
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+
+ spa_handle_ignored_writes(spa);
+
+ /*
+ * If any async tasks have been requested, kick them off.
+ */
+ spa_async_dispatch(spa);
+}
+
+/*
+ * Sync all pools. We don't want to hold the namespace lock across these
+ * operations, so we take a reference on the spa_t and drop the lock during the
+ * sync.
+ */
+void
+spa_sync_allpools(void)
+{
+ spa_t *spa = NULL;
+ mutex_enter(&spa_namespace_lock);
+ while ((spa = spa_next(spa)) != NULL) {
+ if (spa_state(spa) != POOL_STATE_ACTIVE ||
+ !spa_writeable(spa) || spa_suspended(spa))
+ continue;
+ spa_open_ref(spa, FTAG);
+ mutex_exit(&spa_namespace_lock);
+ txg_wait_synced(spa_get_dsl(spa), 0);
+ mutex_enter(&spa_namespace_lock);
+ spa_close(spa, FTAG);
+ }
+ mutex_exit(&spa_namespace_lock);
+}
+
+/*
+ * ==========================================================================
+ * Miscellaneous routines
+ * ==========================================================================
+ */
+
+/*
+ * Remove all pools in the system.
+ */
+void
+spa_evict_all(void)
+{
+ spa_t *spa;
+
+ /*
+ * Remove all cached state. All pools should be closed now,
+ * so every spa in the AVL tree should be unreferenced.
+ */
+ mutex_enter(&spa_namespace_lock);
+ while ((spa = spa_next(NULL)) != NULL) {
+ /*
+ * Stop async tasks. The async thread may need to detach
+ * a device that's been replaced, which requires grabbing
+ * spa_namespace_lock, so we must drop it here.
+ */
+ spa_open_ref(spa, FTAG);
+ mutex_exit(&spa_namespace_lock);
+ spa_async_suspend(spa);
+ mutex_enter(&spa_namespace_lock);
+ spa_close(spa, FTAG);
+
+ if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
+ spa_unload(spa);
+ spa_deactivate(spa);
+ }
+ spa_remove(spa);
+ }
+ mutex_exit(&spa_namespace_lock);
+}
+
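+/*
+ * Look up a vdev by guid in the pool's vdev tree; if 'aux' is set, also
+ * search the L2ARC and spare aux vdev lists.
+ */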
+vdev_t *
+spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux)
+{
+ vdev_t *vd;
+ int i;
+
+ if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL)
+ return (vd);
+
+ if (aux) {
+ for (i = 0; i < spa->spa_l2cache.sav_count; i++) {
+ vd = spa->spa_l2cache.sav_vdevs[i];
+ if (vd->vdev_guid == guid)
+ return (vd);
+ }
+
+ for (i = 0; i < spa->spa_spares.sav_count; i++) {
+ vd = spa->spa_spares.sav_vdevs[i];
+ if (vd->vdev_guid == guid)
+ return (vd);
+ }
+ }
+
+ return (NULL);
+}
+
+void
+spa_upgrade(spa_t *spa, uint64_t version)
+{
+ ASSERT(spa_writeable(spa));
+
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+
+ /*
+ * This should only be called for a non-faulted pool, and since a
+ * future version would result in an unopenable pool, this shouldn't be
+ * possible.
+ */
+ ASSERT(SPA_VERSION_IS_SUPPORTED(spa->spa_uberblock.ub_version));
+ ASSERT3U(version, >=, spa->spa_uberblock.ub_version);
+
+ spa->spa_uberblock.ub_version = version;
+ vdev_config_dirty(spa->spa_root_vdev);
+
+ spa_config_exit(spa, SCL_ALL, FTAG);
+
+ txg_wait_synced(spa_get_dsl(spa), 0);
+}
+
+boolean_t
+spa_has_spare(spa_t *spa, uint64_t guid)
+{
+ int i;
+ uint64_t spareguid;
+ spa_aux_vdev_t *sav = &spa->spa_spares;
+
+ for (i = 0; i < sav->sav_count; i++)
+ if (sav->sav_vdevs[i]->vdev_guid == guid)
+ return (B_TRUE);
+
+ for (i = 0; i < sav->sav_npending; i++) {
+ if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID,
+ &spareguid) == 0 && spareguid == guid)
+ return (B_TRUE);
+ }
+
+ return (B_FALSE);
+}
+
+/*
+ * Check if a pool has an active shared spare device.
+ * Note: the reference count of an active spare is 2: once as a spare and
+ * once as a replacement.
+ */
+static boolean_t
+spa_has_active_shared_spare(spa_t *spa)
+{
+ int i, refcnt;
+ uint64_t pool;
+ spa_aux_vdev_t *sav = &spa->spa_spares;
+
+ for (i = 0; i < sav->sav_count; i++) {
+ if (spa_spare_exists(sav->sav_vdevs[i]->vdev_guid, &pool,
+ &refcnt) && pool != 0ULL && pool == spa_guid(spa) &&
+ refcnt > 2)
+ return (B_TRUE);
+ }
+
+ return (B_FALSE);
+}
+
+uint64_t
+spa_total_metaslabs(spa_t *spa)
+{
+ vdev_t *rvd = spa->spa_root_vdev;
+
+ uint64_t m = 0;
+ for (uint64_t c = 0; c < rvd->vdev_children; c++) {
+ vdev_t *vd = rvd->vdev_child[c];
+ if (!vdev_is_concrete(vd))
+ continue;
+ m += vd->vdev_ms_count;
+ }
+ return (m);
+}
+
+/*
+ * Notify any waiting threads that some activity has switched from being in-
+ * progress to not-in-progress so that they can wake up and determine
+ * whether they are finished waiting.
+ */
+void
+spa_notify_waiters(spa_t *spa)
+{
+ /*
+ * Acquiring spa_activities_lock here prevents the cv_broadcast from
+ * happening between the waiting thread's check and cv_wait.
+ */
+ mutex_enter(&spa->spa_activities_lock);
+ cv_broadcast(&spa->spa_activities_cv);
+ mutex_exit(&spa->spa_activities_lock);
+}
+
+/*
+ * Notify any waiting threads that the pool is exporting, and then block until
+ * they are finished using the spa_t.
+ */
+void
+spa_wake_waiters(spa_t *spa)
+{
+ mutex_enter(&spa->spa_activities_lock);
+ spa->spa_waiters_cancel = B_TRUE;
+ cv_broadcast(&spa->spa_activities_cv);
+ while (spa->spa_waiters != 0)
+ cv_wait(&spa->spa_waiters_cv, &spa->spa_activities_lock);
+ spa->spa_waiters_cancel = B_FALSE;
+ mutex_exit(&spa->spa_activities_lock);
+}
+
+/* Whether the vdev or any of its descendants are being initialized/trimmed. */
+static boolean_t
+spa_vdev_activity_in_progress_impl(vdev_t *vd, zpool_wait_activity_t activity)
+{
+ spa_t *spa = vd->vdev_spa;
+
+ ASSERT(spa_config_held(spa, SCL_CONFIG | SCL_STATE, RW_READER));
+ ASSERT(MUTEX_HELD(&spa->spa_activities_lock));
+ ASSERT(activity == ZPOOL_WAIT_INITIALIZE ||
+ activity == ZPOOL_WAIT_TRIM);
+
+ kmutex_t *lock = activity == ZPOOL_WAIT_INITIALIZE ?
+ &vd->vdev_initialize_lock : &vd->vdev_trim_lock;
+
+ mutex_exit(&spa->spa_activities_lock);
+ mutex_enter(lock);
+ mutex_enter(&spa->spa_activities_lock);
+
+ boolean_t in_progress = (activity == ZPOOL_WAIT_INITIALIZE) ?
+ (vd->vdev_initialize_state == VDEV_INITIALIZE_ACTIVE) :
+ (vd->vdev_trim_state == VDEV_TRIM_ACTIVE);
+ mutex_exit(lock);
+
+ if (in_progress)
+ return (B_TRUE);
+
+ for (int i = 0; i < vd->vdev_children; i++) {
+ if (spa_vdev_activity_in_progress_impl(vd->vdev_child[i],
+ activity))
+ return (B_TRUE);
+ }
+
+ return (B_FALSE);
+}
+
+/*
+ * If use_guid is true, this checks whether the vdev specified by guid is
+ * being initialized/trimmed. Otherwise, it checks whether any vdev in the pool
+ * is being initialized/trimmed. The caller must hold the config lock and
+ * spa_activities_lock.
+ */
+static int
+spa_vdev_activity_in_progress(spa_t *spa, boolean_t use_guid, uint64_t guid,
+ zpool_wait_activity_t activity, boolean_t *in_progress)
+{
+ mutex_exit(&spa->spa_activities_lock);
+ spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
+ mutex_enter(&spa->spa_activities_lock);
+
+ vdev_t *vd;
+ if (use_guid) {
+ vd = spa_lookup_by_guid(spa, guid, B_FALSE);
+ if (vd == NULL || !vd->vdev_ops->vdev_op_leaf) {
+ spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
+ return (EINVAL);
+ }
+ } else {
+ vd = spa->spa_root_vdev;
+ }
+
+ *in_progress = spa_vdev_activity_in_progress_impl(vd, activity);
+
+ spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
+ return (0);
+}
+
+/*
+ * Locking for waiting threads
+ * ---------------------------
+ *
+ * Waiting threads need a way to check whether a given activity is in progress,
+ * and then, if it is, wait for it to complete. Each activity will have some
+ * in-memory representation of the relevant on-disk state which can be used to
+ * determine whether or not the activity is in progress. The in-memory state and
+ * the locking used to protect it will be different for each activity, and may
+ * not be suitable for use with a cvar (e.g., some state is protected by the
+ * config lock). To allow waiting threads to wait without any races, another
+ * lock, spa_activities_lock, is used.
+ *
+ * When the state is checked, both the activity-specific lock (if there is one)
+ * and spa_activities_lock are held. In some cases, the activity-specific lock
+ * is acquired explicitly (e.g. the config lock). In others, the locking is
+ * internal to some check (e.g. bpobj_is_empty). After checking, the waiting
+ * thread releases the activity-specific lock and, if the activity is in
+ * progress, then cv_waits using spa_activities_lock.
+ *
+ * The waiting thread is woken when another thread, one completing some
+ * activity, updates the state of the activity and then calls
+ * spa_notify_waiters, which will cv_broadcast. This 'completing' thread only
+ * needs to hold its activity-specific lock when updating the state, and this
+ * lock can (but doesn't have to) be dropped before calling spa_notify_waiters.
+ *
+ * Because spa_notify_waiters acquires spa_activities_lock before broadcasting,
+ * and because it is held when the waiting thread checks the state of the
+ * activity, it can never be the case that the completing thread both updates
+ * the activity state and cv_broadcasts in between the waiting thread's check
+ * and cv_wait. Thus, a waiting thread can never miss a wakeup.
+ *
+ * In order to prevent deadlock, when the waiting thread does its check, in some
+ * cases it will temporarily drop spa_activities_lock in order to acquire the
+ * activity-specific lock. The order in which spa_activities_lock and the
+ * activity-specific lock are acquired in the waiting thread is determined by
+ * the order in which they are acquired in the completing thread; if the
+ * completing thread calls spa_notify_waiters with the activity-specific lock
+ * held, then the waiting thread must also acquire the activity-specific lock
+ * first.
+ */
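+
+/*
+ * A minimal sketch of the pattern described above; the helpers
+ * activity_in_progress() and update_activity_state() are hypothetical
+ * stand-ins for the activity-specific checks and updates:
+ *
+ *	// waiting thread
+ *	mutex_enter(&spa->spa_activities_lock);
+ *	while (activity_in_progress(spa))	// may drop/retake the lock
+ *		cv_wait(&spa->spa_activities_cv, &spa->spa_activities_lock);
+ *	mutex_exit(&spa->spa_activities_lock);
+ *
+ *	// completing thread
+ *	update_activity_state(spa);	// under its activity-specific lock
+ *	spa_notify_waiters(spa);	// takes spa_activities_lock and
+ *					// cv_broadcasts before dropping it
+ */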
+
+static int
+spa_activity_in_progress(spa_t *spa, zpool_wait_activity_t activity,
+ boolean_t use_tag, uint64_t tag, boolean_t *in_progress)
+{
+ int error = 0;
+
+ ASSERT(MUTEX_HELD(&spa->spa_activities_lock));
+
+ switch (activity) {
+ case ZPOOL_WAIT_CKPT_DISCARD:
+ *in_progress =
+ (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT) &&
+ zap_contains(spa_meta_objset(spa),
+ DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT) ==
+ ENOENT);
+ break;
+ case ZPOOL_WAIT_FREE:
+ *in_progress = ((spa_version(spa) >= SPA_VERSION_DEADLISTS &&
+ !bpobj_is_empty(&spa->spa_dsl_pool->dp_free_bpobj)) ||
+ spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY) ||
+ spa_livelist_delete_check(spa));
+ break;
+ case ZPOOL_WAIT_INITIALIZE:
+ case ZPOOL_WAIT_TRIM:
+ error = spa_vdev_activity_in_progress(spa, use_tag, tag,
+ activity, in_progress);
+ break;
+ case ZPOOL_WAIT_REPLACE:
+ mutex_exit(&spa->spa_activities_lock);
+ spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
+ mutex_enter(&spa->spa_activities_lock);
+
+ *in_progress = vdev_replace_in_progress(spa->spa_root_vdev);
+ spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
+ break;
+ case ZPOOL_WAIT_REMOVE:
+ *in_progress = (spa->spa_removing_phys.sr_state ==
+ DSS_SCANNING);
+ break;
+ case ZPOOL_WAIT_RESILVER:
+ if ((*in_progress = vdev_rebuild_active(spa->spa_root_vdev)))
+ break;
+ /* fall through */
+ case ZPOOL_WAIT_SCRUB:
+ {
+ boolean_t scanning, paused, is_scrub;
+ dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;
+
+ is_scrub = (scn->scn_phys.scn_func == POOL_SCAN_SCRUB);
+ scanning = (scn->scn_phys.scn_state == DSS_SCANNING);
+ paused = dsl_scan_is_paused_scrub(scn);
+ *in_progress = (scanning && !paused &&
+ is_scrub == (activity == ZPOOL_WAIT_SCRUB));
+ break;
+ }
+ default:
+ panic("unrecognized value for activity %d", activity);
+ }
+
+ return (error);
+}
+
+static int
+spa_wait_common(const char *pool, zpool_wait_activity_t activity,
+ boolean_t use_tag, uint64_t tag, boolean_t *waited)
+{
+ /*
+ * The tag is used to distinguish between instances of an activity.
+ * 'initialize' and 'trim' are the only activities that we use this for.
+ * The other activities can only have a single instance in progress in a
+ * pool at one time, making the tag unnecessary.
+ *
+ * There can be multiple devices being replaced at once, but since they
+ * all finish once resilvering finishes, we don't bother keeping track
+ * of them individually, we just wait for them all to finish.
+ */
+ if (use_tag && activity != ZPOOL_WAIT_INITIALIZE &&
+ activity != ZPOOL_WAIT_TRIM)
+ return (EINVAL);
+
+ if (activity < 0 || activity >= ZPOOL_WAIT_NUM_ACTIVITIES)
+ return (EINVAL);
+
+ spa_t *spa;
+ int error = spa_open(pool, &spa, FTAG);
+ if (error != 0)
+ return (error);
+
+ /*
+ * Increment the spa's waiter count so that we can call spa_close and
+ * still ensure that the spa_t doesn't get freed before this thread is
+ * finished with it when the pool is exported. We want to call spa_close
+ * before we start waiting because otherwise the additional ref would
+ * prevent the pool from being exported or destroyed throughout the
+ * potentially long wait.
+ */
+ mutex_enter(&spa->spa_activities_lock);
+ spa->spa_waiters++;
+ spa_close(spa, FTAG);
+
+ *waited = B_FALSE;
+ for (;;) {
+ boolean_t in_progress;
+ error = spa_activity_in_progress(spa, activity, use_tag, tag,
+ &in_progress);
+
+ if (error || !in_progress || spa->spa_waiters_cancel)
+ break;
+
+ *waited = B_TRUE;
+
+ if (cv_wait_sig(&spa->spa_activities_cv,
+ &spa->spa_activities_lock) == 0) {
+ error = EINTR;
+ break;
+ }
+ }
+
+ spa->spa_waiters--;
+ cv_signal(&spa->spa_waiters_cv);
+ mutex_exit(&spa->spa_activities_lock);
+
+ return (error);
+}
+
+/*
+ * Wait for a particular instance of the specified activity to complete, where
+ * the instance is identified by 'tag'.
+ */
+int
+spa_wait_tag(const char *pool, zpool_wait_activity_t activity, uint64_t tag,
+ boolean_t *waited)
+{
+ return (spa_wait_common(pool, activity, B_TRUE, tag, waited));
+}
+
+/*
+ * Wait for all instances of the specified activity to complete.
+ */
+int
+spa_wait(const char *pool, zpool_wait_activity_t activity, boolean_t *waited)
+{
+ return (spa_wait_common(pool, activity, B_FALSE, 0, waited));
+}
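+
+/*
+ * Example usage, as a sketch (the ioctl plumbing that normally drives these
+ * calls is omitted; "tank" and vdev_guid are placeholders):
+ *
+ *	boolean_t waited;
+ *	int error = spa_wait("tank", ZPOOL_WAIT_SCRUB, &waited);
+ *
+ *	// wait for one device's initialize instance, identified by its guid:
+ *	error = spa_wait_tag("tank", ZPOOL_WAIT_INITIALIZE, vdev_guid, &waited);
+ */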
+
+sysevent_t *
+spa_event_create(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name)
+{
+ sysevent_t *ev = NULL;
+#ifdef _KERNEL
+ nvlist_t *resource;
+
+ resource = zfs_event_create(spa, vd, FM_SYSEVENT_CLASS, name, hist_nvl);
+ if (resource) {
+ ev = kmem_alloc(sizeof (sysevent_t), KM_SLEEP);
+ ev->resource = resource;
+ }
+#endif
+ return (ev);
+}
+
+void
+spa_event_post(sysevent_t *ev)
+{
+#ifdef _KERNEL
+ if (ev) {
+ zfs_zevent_post(ev->resource, NULL, zfs_zevent_post_cb);
+ kmem_free(ev, sizeof (*ev));
+ }
+#endif
+}
+
+/*
+ * Post a zevent corresponding to the given sysevent. The 'name' must be one
+ * of the event definitions in sys/sysevent/eventdefs.h. The payload will be
+ * filled in from the spa and (optionally) the vdev. This doesn't do anything
+ * in the userland libzpool, as we don't want consumers to misinterpret ztest
+ * or zdb as real changes.
+ */
+void
+spa_event_notify(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name)
+{
+ spa_event_post(spa_event_create(spa, vd, hist_nvl, name));
+}
+
+/* state manipulation functions */
+EXPORT_SYMBOL(spa_open);
+EXPORT_SYMBOL(spa_open_rewind);
+EXPORT_SYMBOL(spa_get_stats);
+EXPORT_SYMBOL(spa_create);
+EXPORT_SYMBOL(spa_import);
+EXPORT_SYMBOL(spa_tryimport);
+EXPORT_SYMBOL(spa_destroy);
+EXPORT_SYMBOL(spa_export);
+EXPORT_SYMBOL(spa_reset);
+EXPORT_SYMBOL(spa_async_request);
+EXPORT_SYMBOL(spa_async_suspend);
+EXPORT_SYMBOL(spa_async_resume);
+EXPORT_SYMBOL(spa_inject_addref);
+EXPORT_SYMBOL(spa_inject_delref);
+EXPORT_SYMBOL(spa_scan_stat_init);
+EXPORT_SYMBOL(spa_scan_get_stats);
+
+/* device manipulation */
+EXPORT_SYMBOL(spa_vdev_add);
+EXPORT_SYMBOL(spa_vdev_attach);
+EXPORT_SYMBOL(spa_vdev_detach);
+EXPORT_SYMBOL(spa_vdev_setpath);
+EXPORT_SYMBOL(spa_vdev_setfru);
+EXPORT_SYMBOL(spa_vdev_split_mirror);
+
+/* spare state (which is global across all pools) */
+EXPORT_SYMBOL(spa_spare_add);
+EXPORT_SYMBOL(spa_spare_remove);
+EXPORT_SYMBOL(spa_spare_exists);
+EXPORT_SYMBOL(spa_spare_activate);
+
+/* L2ARC state (which is global across all pools) */
+EXPORT_SYMBOL(spa_l2cache_add);
+EXPORT_SYMBOL(spa_l2cache_remove);
+EXPORT_SYMBOL(spa_l2cache_exists);
+EXPORT_SYMBOL(spa_l2cache_activate);
+EXPORT_SYMBOL(spa_l2cache_drop);
+
+/* scanning */
+EXPORT_SYMBOL(spa_scan);
+EXPORT_SYMBOL(spa_scan_stop);
+
+/* spa syncing */
+EXPORT_SYMBOL(spa_sync); /* only for DMU use */
+EXPORT_SYMBOL(spa_sync_allpools);
+
+/* properties */
+EXPORT_SYMBOL(spa_prop_set);
+EXPORT_SYMBOL(spa_prop_get);
+EXPORT_SYMBOL(spa_prop_clear_bootfs);
+
+/* asynchronous event notification */
+EXPORT_SYMBOL(spa_event_notify);
+
+/* BEGIN CSTYLED */
+ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_shift, INT, ZMOD_RW,
+ "log2(fraction of arc that can be used by inflight I/Os when "
+ "verifying pool during import");
+
+ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_metadata, INT, ZMOD_RW,
+ "Set to traverse metadata on pool import");
+
+ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_data, INT, ZMOD_RW,
+ "Set to traverse data on pool import");
+
+ZFS_MODULE_PARAM(zfs_spa, spa_, load_print_vdev_tree, INT, ZMOD_RW,
+ "Print vdev tree to zfs_dbgmsg during pool import");
+
+ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_pct, UINT, ZMOD_RD,
+ "Percentage of CPUs to run an IO worker thread");
+
+ZFS_MODULE_PARAM(zfs, zfs_, max_missing_tvds, ULONG, ZMOD_RW,
+ "Allow importing pool with up to this number of missing top-level "
+ "vdevs (in read-only mode)");
+
+ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, zthr_pause, INT, ZMOD_RW,
+ "Set the livelist condense zthr to pause");
+
+ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, sync_pause, INT, ZMOD_RW,
+ "Set the livelist condense synctask to pause");
+
+ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, sync_cancel, INT, ZMOD_RW,
+ "Whether livelist condensing was canceled in the synctask");
+
+ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, zthr_cancel, INT, ZMOD_RW,
+ "Whether livelist condensing was canceled in the zthr function");
+
+ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, new_alloc, INT, ZMOD_RW,
+ "Whether extra ALLOC blkptrs were added to a livelist entry while it "
+ "was being condensed");
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/zfs/spa_boot.c b/sys/contrib/openzfs/module/zfs/spa_boot.c
new file mode 100644
index 000000000000..674394650f82
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/spa_boot.c
@@ -0,0 +1,50 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifdef _KERNEL
+
+#include <sys/zio.h>
+#include <sys/spa_boot.h>
+#include <sys/sunddi.h>
+
+char *
+spa_get_bootprop(char *propname)
+{
+ char *value;
+
+ if (ddi_prop_lookup_string(DDI_DEV_T_ANY, ddi_root_node(),
+ DDI_PROP_DONTPASS, propname, &value) != DDI_SUCCESS)
+ return (NULL);
+ return (value);
+}
+
+void
+spa_free_bootprop(char *value)
+{
+ ddi_prop_free(value);
+}
+
+#endif /* _KERNEL */
diff --git a/sys/contrib/openzfs/module/zfs/spa_checkpoint.c b/sys/contrib/openzfs/module/zfs/spa_checkpoint.c
new file mode 100644
index 000000000000..5fb614467273
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/spa_checkpoint.c
@@ -0,0 +1,636 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2017 by Delphix. All rights reserved.
+ */
+
+/*
+ * Storage Pool Checkpoint
+ *
+ * A storage pool checkpoint can be thought of as a pool-wide snapshot or
+ * a stable version of extreme rewind that guarantees no blocks from the
+ * checkpointed state will have been overwritten. It remembers the entire
+ * state of the storage pool (e.g. snapshots, dataset names, etc.) from the
+ * point that it was taken and the user can rewind back to that point even if
+ * they applied destructive operations on their datasets or even enabled new
+ * zpool on-disk features. If a pool has a checkpoint that is no longer
+ * needed, the user can discard it.
+ *
+ * == On disk data structures used ==
+ *
+ * - The pool has a new feature flag and a new entry in the MOS. The feature
+ * flag is set to active when we create the checkpoint and remains active
+ * until the checkpoint is fully discarded. The entry in the MOS config
+ * (DMU_POOL_ZPOOL_CHECKPOINT) is populated with the uberblock that
+ * references the state of the pool when we take the checkpoint. The entry
+ * remains populated until we start discarding the checkpoint or we rewind
+ * back to it.
+ *
+ * - Each vdev contains a vdev-wide space map while the pool has a checkpoint,
+ * which persists until the checkpoint is fully discarded. The space map
+ * contains entries that have been freed in the current state of the pool
+ * but we want to keep around in case we decide to rewind to the checkpoint.
+ * [see vdev_checkpoint_sm]
+ *
+ * - Each metaslab's ms_sm space map behaves the same as without the
+ * checkpoint, with the only exception being the scenario when we free
+ * blocks that belong to the checkpoint. In this case, these blocks remain
+ * ALLOCATED in the metaslab's space map and they are added as FREE in the
+ * vdev's checkpoint space map.
+ *
+ * - Each uberblock has a field (ub_checkpoint_txg) which holds the txg that
+ * the uberblock was checkpointed. For normal uberblocks this field is 0.
+ *
+ * == Overview of operations ==
+ *
+ * - To create a checkpoint, we first wait for the current TXG to be synced,
+ * so we can use the most recently synced uberblock (spa_ubsync) as the
+ * checkpointed uberblock. Then we use an early synctask to place that
+ * uberblock in MOS config, increment the feature flag for the checkpoint
+ * (marking it active), and set spa_checkpoint_txg (see its use below)
+ * to the TXG of the checkpointed uberblock. We use an early synctask for
+ * the aforementioned operations to ensure that no blocks were dirtied
+ * between the current TXG and the TXG of the checkpointed uberblock
+ * (i.e. the previous txg).
+ *
+ * - When a checkpoint exists, we need to ensure that the blocks that
+ * belong to the checkpoint are freed but never reused. This means that
+ * these blocks should never end up in the ms_allocatable or the ms_freeing
+ * trees of a metaslab. Therefore, whenever there is a checkpoint the new
+ * ms_checkpointing tree is used in addition to the aforementioned ones.
+ *
+ * Whenever a block is freed and we find out that it is referenced by the
+ * checkpoint (we find out by comparing its birth to spa_checkpoint_txg),
+ * we place it in the ms_checkpointing tree instead of the ms_freeing tree.
+ * This way, we divide the blocks that are being freed into checkpointed
+ * and not-checkpointed blocks.
+ *
+ * In order to persist these frees, we write the extents from the
+ * ms_freeing tree to the ms_sm as usual, and the extents from the
+ * ms_checkpointing tree to the vdev_checkpoint_sm. This way, these
+ * checkpointed extents will remain allocated in the metaslab's ms_sm space
+ * map, and therefore won't be reused [see metaslab_sync()]. In addition,
+ * when we discard the checkpoint, we can find the entries that have
+ * actually been freed in vdev_checkpoint_sm.
+ * [see spa_checkpoint_discard_thread_sync()]
+ *
+ * - To discard the checkpoint we use an early synctask to delete the
+ * checkpointed uberblock from the MOS config, set spa_checkpoint_txg to 0,
+ * and wakeup the discarding zthr thread (an open-context async thread).
+ * We use an early synctask to ensure that the operation happens before any
+ * new data end up in the checkpoint's data structures.
+ *
+ * Once the synctask is done and the discarding zthr is awake, we discard
+ * the checkpointed data over multiple TXGs by having the zthr prefetch
+ * entries from vdev_checkpoint_sm and then start a synctask that places
+ * them as free blocks into their respective ms_allocatable and ms_sm
+ * structures.
+ * [see spa_checkpoint_discard_thread()]
+ *
+ * When there are no entries left in the vdev_checkpoint_sm of all
+ * top-level vdevs, a final synctask runs that decrements the feature flag.
+ *
+ * - To rewind to the checkpoint, we first use the current uberblock and
+ * open the MOS so we can access the checkpointed uberblock from the MOS
+ * config. After we retrieve the checkpointed uberblock, we use it as the
+ * current uberblock for the pool by writing it to disk with an updated
+ * TXG, opening its version of the MOS, and moving on as usual from there.
+ * [see spa_ld_checkpoint_rewind()]
+ *
+ * An important note on rewinding to the checkpoint has to do with how we
+ * handle ZIL blocks. In the scenario of a rewind, we clear out any ZIL
+ * blocks that have not been claimed by the time we took the checkpoint
+ * as they should no longer be valid.
+ * [see comment in zil_claim()]
+ *
+ * == Miscellaneous information ==
+ *
+ * - In the hypothetical event that we take a checkpoint, remove a vdev,
+ * and attempt to rewind, the rewind would fail as the checkpointed
+ * uberblock would reference data in the removed device. For this reason
+ * and others of similar nature, we disallow the following operations that
+ * can change the config:
+ * vdev removal and attach/detach, mirror splitting, and pool reguid.
+ *
+ * - As most of the checkpoint logic is implemented in the SPA and doesn't
+ * distinguish datasets when it comes to space accounting, having a
+ * checkpoint can potentially break the boundaries set by dataset
+ * reservations.
+ */
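+
+/*
+ * The externally visible entry points below boil down to the following
+ * lifecycle, shown here as a minimal sketch (error handling omitted):
+ *
+ *	error = spa_checkpoint("tank");		// take a checkpoint
+ *	...
+ *	error = spa_checkpoint_discard("tank");	// begin discarding it; the
+ *						// discard zthr completes the
+ *						// work over multiple txgs
+ */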
+
+#include <sys/dmu_tx.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_synctask.h>
+#include <sys/metaslab_impl.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/spa_checkpoint.h>
+#include <sys/vdev_impl.h>
+#include <sys/zap.h>
+#include <sys/zfeature.h>
+
+/*
+ * The following parameter limits the amount of memory to be used for the
+ * prefetching of the checkpoint space map done on each vdev while
+ * discarding the checkpoint.
+ *
+ * The reason it exists is because top-level vdevs with long checkpoint
+ * space maps can potentially take up a lot of memory depending on the
+ * amount of checkpointed data that has been freed within them while
+ * the pool had a checkpoint.
+ */
+unsigned long zfs_spa_discard_memory_limit = 16 * 1024 * 1024;
+
+int
+spa_checkpoint_get_stats(spa_t *spa, pool_checkpoint_stat_t *pcs)
+{
+ if (!spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT))
+ return (SET_ERROR(ZFS_ERR_NO_CHECKPOINT));
+
+ bzero(pcs, sizeof (pool_checkpoint_stat_t));
+
+ int error = zap_contains(spa_meta_objset(spa),
+ DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT);
+ ASSERT(error == 0 || error == ENOENT);
+
+ if (error == ENOENT)
+ pcs->pcs_state = CS_CHECKPOINT_DISCARDING;
+ else
+ pcs->pcs_state = CS_CHECKPOINT_EXISTS;
+
+ pcs->pcs_space = spa->spa_checkpoint_info.sci_dspace;
+ pcs->pcs_start_time = spa->spa_checkpoint_info.sci_timestamp;
+
+ return (0);
+}
+
+static void
+spa_checkpoint_discard_complete_sync(void *arg, dmu_tx_t *tx)
+{
+ spa_t *spa = arg;
+
+ spa->spa_checkpoint_info.sci_timestamp = 0;
+
+ spa_feature_decr(spa, SPA_FEATURE_POOL_CHECKPOINT, tx);
+ spa_notify_waiters(spa);
+
+ spa_history_log_internal(spa, "spa discard checkpoint", tx,
+ "finished discarding checkpointed state from the pool");
+}
+
+typedef struct spa_checkpoint_discard_sync_callback_arg {
+ vdev_t *sdc_vd;
+ uint64_t sdc_txg;
+ uint64_t sdc_entry_limit;
+} spa_checkpoint_discard_sync_callback_arg_t;
+
+static int
+spa_checkpoint_discard_sync_callback(space_map_entry_t *sme, void *arg)
+{
+ spa_checkpoint_discard_sync_callback_arg_t *sdc = arg;
+ vdev_t *vd = sdc->sdc_vd;
+ metaslab_t *ms = vd->vdev_ms[sme->sme_offset >> vd->vdev_ms_shift];
+ uint64_t end = sme->sme_offset + sme->sme_run;
+
+ if (sdc->sdc_entry_limit == 0)
+ return (SET_ERROR(EINTR));
+
+ /*
+ * Since the space map is not condensed, we know that
+ * none of its entries crosses the boundaries of
+ * its respective metaslab.
+ *
+ * That said, there is no fundamental requirement that
+ * the checkpoint's space map entries should not cross
+ * metaslab boundaries. So if needed we could add code
+ * that handles metaslab-crossing segments in the future.
+ */
+ VERIFY3U(sme->sme_type, ==, SM_FREE);
+ VERIFY3U(sme->sme_offset, >=, ms->ms_start);
+ VERIFY3U(end, <=, ms->ms_start + ms->ms_size);
+
+ /*
+ * At this point we should not be processing any
+ * other frees concurrently, so the lock is technically
+ * unnecessary. We use the lock anyway though to
+ * potentially save ourselves from future headaches.
+ */
+ mutex_enter(&ms->ms_lock);
+ if (range_tree_is_empty(ms->ms_freeing))
+ vdev_dirty(vd, VDD_METASLAB, ms, sdc->sdc_txg);
+ range_tree_add(ms->ms_freeing, sme->sme_offset, sme->sme_run);
+ mutex_exit(&ms->ms_lock);
+
+ ASSERT3U(vd->vdev_spa->spa_checkpoint_info.sci_dspace, >=,
+ sme->sme_run);
+ ASSERT3U(vd->vdev_stat.vs_checkpoint_space, >=, sme->sme_run);
+
+ vd->vdev_spa->spa_checkpoint_info.sci_dspace -= sme->sme_run;
+ vd->vdev_stat.vs_checkpoint_space -= sme->sme_run;
+ sdc->sdc_entry_limit--;
+
+ return (0);
+}
+
+#ifdef ZFS_DEBUG
+static void
+spa_checkpoint_accounting_verify(spa_t *spa)
+{
+ vdev_t *rvd = spa->spa_root_vdev;
+ uint64_t ckpoint_sm_space_sum = 0;
+ uint64_t vs_ckpoint_space_sum = 0;
+
+ for (uint64_t c = 0; c < rvd->vdev_children; c++) {
+ vdev_t *vd = rvd->vdev_child[c];
+
+ if (vd->vdev_checkpoint_sm != NULL) {
+ ckpoint_sm_space_sum +=
+ -space_map_allocated(vd->vdev_checkpoint_sm);
+ vs_ckpoint_space_sum +=
+ vd->vdev_stat.vs_checkpoint_space;
+ ASSERT3U(ckpoint_sm_space_sum, ==,
+ vs_ckpoint_space_sum);
+ } else {
+ ASSERT0(vd->vdev_stat.vs_checkpoint_space);
+ }
+ }
+ ASSERT3U(spa->spa_checkpoint_info.sci_dspace, ==, ckpoint_sm_space_sum);
+}
+#endif
+
+static void
+spa_checkpoint_discard_thread_sync(void *arg, dmu_tx_t *tx)
+{
+ vdev_t *vd = arg;
+ int error;
+
+ /*
+ * The space map callback is applied only to non-debug entries.
+ * Because the number of debug entries is less than or equal to the
+ * number of non-debug entries, we want to ensure that we only
+ * read what we prefetched from open-context.
+ *
+ * Thus, we set the maximum entries that the space map callback
+ * will be applied to be half the entries that could fit in the
+ * imposed memory limit.
+ *
+ * Note that since this is a conservative estimate we also
+ * assume the worst case scenario in our computation where each
+ * entry is two-word.
+ */
+ uint64_t max_entry_limit =
+ (zfs_spa_discard_memory_limit / (2 * sizeof (uint64_t))) >> 1;
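+ /*
+ * With the default zfs_spa_discard_memory_limit of 16 MiB and a
+ * worst-case two-word (16-byte) entry, this works out to
+ * 16777216 / 16 / 2 = 524288 entries per synctask invocation.
+ */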
+
+ /*
+ * Iterate from the end of the space map towards the beginning,
+ * placing its entries on ms_freeing and removing them from the
+ * space map. The iteration stops if one of the following
+ * conditions is true:
+ *
+ * 1] We reached the beginning of the space map. At this point
+ * the space map should be completely empty and
+ * space_map_incremental_destroy should have returned 0.
+ * The next step would be to free and close the space map
+ * and remove its entry from its vdev's top zap. This allows
+ * spa_checkpoint_discard_thread() to move on to the next vdev.
+ *
+ * 2] We reached the memory limit (amount of memory used to hold
+ * space map entries in memory) and space_map_incremental_destroy
+ * returned EINTR. This means that there are entries remaining
+ * in the space map that will be cleared in a future invocation
+ * of this function by spa_checkpoint_discard_thread().
+ */
+ spa_checkpoint_discard_sync_callback_arg_t sdc;
+ sdc.sdc_vd = vd;
+ sdc.sdc_txg = tx->tx_txg;
+ sdc.sdc_entry_limit = max_entry_limit;
+
+ uint64_t words_before =
+ space_map_length(vd->vdev_checkpoint_sm) / sizeof (uint64_t);
+
+ error = space_map_incremental_destroy(vd->vdev_checkpoint_sm,
+ spa_checkpoint_discard_sync_callback, &sdc, tx);
+
+ uint64_t words_after =
+ space_map_length(vd->vdev_checkpoint_sm) / sizeof (uint64_t);
+
+#ifdef ZFS_DEBUG
+ spa_checkpoint_accounting_verify(vd->vdev_spa);
+#endif
+
+ zfs_dbgmsg("discarding checkpoint: txg %llu, vdev id %d, "
+ "deleted %llu words - %llu words are left",
+ tx->tx_txg, vd->vdev_id, (words_before - words_after),
+ words_after);
+
+ if (error != EINTR) {
+ if (error != 0) {
+ zfs_panic_recover("zfs: error %d was returned "
+ "while incrementally destroying the checkpoint "
+ "space map of vdev %llu\n",
+ error, vd->vdev_id);
+ }
+ ASSERT0(words_after);
+ ASSERT0(space_map_allocated(vd->vdev_checkpoint_sm));
+ ASSERT0(space_map_length(vd->vdev_checkpoint_sm));
+
+ space_map_free(vd->vdev_checkpoint_sm, tx);
+ space_map_close(vd->vdev_checkpoint_sm);
+ vd->vdev_checkpoint_sm = NULL;
+
+ VERIFY0(zap_remove(spa_meta_objset(vd->vdev_spa),
+ vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, tx));
+ }
+}
+
+static boolean_t
+spa_checkpoint_discard_is_done(spa_t *spa)
+{
+ vdev_t *rvd = spa->spa_root_vdev;
+
+ ASSERT(!spa_has_checkpoint(spa));
+ ASSERT(spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT));
+
+ for (uint64_t c = 0; c < rvd->vdev_children; c++) {
+ if (rvd->vdev_child[c]->vdev_checkpoint_sm != NULL)
+ return (B_FALSE);
+ ASSERT0(rvd->vdev_child[c]->vdev_stat.vs_checkpoint_space);
+ }
+
+ return (B_TRUE);
+}
+
+/* ARGSUSED */
+boolean_t
+spa_checkpoint_discard_thread_check(void *arg, zthr_t *zthr)
+{
+ spa_t *spa = arg;
+
+ if (!spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT))
+ return (B_FALSE);
+
+ if (spa_has_checkpoint(spa))
+ return (B_FALSE);
+
+ return (B_TRUE);
+}
+
+void
+spa_checkpoint_discard_thread(void *arg, zthr_t *zthr)
+{
+ spa_t *spa = arg;
+ vdev_t *rvd = spa->spa_root_vdev;
+
+ for (uint64_t c = 0; c < rvd->vdev_children; c++) {
+ vdev_t *vd = rvd->vdev_child[c];
+
+ while (vd->vdev_checkpoint_sm != NULL) {
+ space_map_t *checkpoint_sm = vd->vdev_checkpoint_sm;
+ int numbufs;
+ dmu_buf_t **dbp;
+
+ if (zthr_iscancelled(zthr))
+ return;
+
+ ASSERT3P(vd->vdev_ops, !=, &vdev_indirect_ops);
+
+ uint64_t size = MIN(space_map_length(checkpoint_sm),
+ zfs_spa_discard_memory_limit);
+ uint64_t offset =
+ space_map_length(checkpoint_sm) - size;
+
+ /*
+ * Ensure that the part of the space map that will
+ * be destroyed by the synctask is prefetched into
+ * memory before the synctask runs.
+ */
+ int error = dmu_buf_hold_array_by_bonus(
+ checkpoint_sm->sm_dbuf, offset, size,
+ B_TRUE, FTAG, &numbufs, &dbp);
+ if (error != 0) {
+ zfs_panic_recover("zfs: error %d was returned "
+ "while prefetching checkpoint space map "
+ "entries of vdev %llu\n",
+ error, vd->vdev_id);
+ }
+
+ VERIFY0(dsl_sync_task(spa->spa_name, NULL,
+ spa_checkpoint_discard_thread_sync, vd,
+ 0, ZFS_SPACE_CHECK_NONE));
+
+ dmu_buf_rele_array(dbp, numbufs, FTAG);
+ }
+ }
+
+ VERIFY(spa_checkpoint_discard_is_done(spa));
+ VERIFY0(spa->spa_checkpoint_info.sci_dspace);
+ VERIFY0(dsl_sync_task(spa->spa_name, NULL,
+ spa_checkpoint_discard_complete_sync, spa,
+ 0, ZFS_SPACE_CHECK_NONE));
+}
+
+
+/* ARGSUSED */
+static int
+spa_checkpoint_check(void *arg, dmu_tx_t *tx)
+{
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+
+ if (!spa_feature_is_enabled(spa, SPA_FEATURE_POOL_CHECKPOINT))
+ return (SET_ERROR(ENOTSUP));
+
+ if (!spa_top_vdevs_spacemap_addressable(spa))
+ return (SET_ERROR(ZFS_ERR_VDEV_TOO_BIG));
+
+ if (spa->spa_removing_phys.sr_state == DSS_SCANNING)
+ return (SET_ERROR(ZFS_ERR_DEVRM_IN_PROGRESS));
+
+ if (spa->spa_checkpoint_txg != 0)
+ return (SET_ERROR(ZFS_ERR_CHECKPOINT_EXISTS));
+
+ if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT))
+ return (SET_ERROR(ZFS_ERR_DISCARDING_CHECKPOINT));
+
+ return (0);
+}
+
+/* ARGSUSED */
+static void
+spa_checkpoint_sync(void *arg, dmu_tx_t *tx)
+{
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ spa_t *spa = dp->dp_spa;
+ uberblock_t checkpoint = spa->spa_ubsync;
+
+ /*
+ * At this point, there should not be a checkpoint in the MOS.
+ */
+ ASSERT3U(zap_contains(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_ZPOOL_CHECKPOINT), ==, ENOENT);
+
+ ASSERT0(spa->spa_checkpoint_info.sci_timestamp);
+ ASSERT0(spa->spa_checkpoint_info.sci_dspace);
+
+ /*
+ * Since the checkpointed uberblock is the one that just got synced
+ * (we use spa_ubsync), its txg must be equal to the txg number of
+ * the txg we are syncing, minus 1.
+ */
+ ASSERT3U(checkpoint.ub_txg, ==, spa->spa_syncing_txg - 1);
+
+ /*
+ * Once the checkpoint is in place, we need to ensure that none of
+ * its blocks will be marked for reuse after it has been freed.
+ * When there is a checkpoint and a block is freed, we compare its
+ * birth txg to the txg of the checkpointed uberblock to see if the
+ * block is part of the checkpoint or not. Therefore, we have to set
+ * spa_checkpoint_txg before any frees happen in this txg (which is
+ * why this is done as an early_synctask as explained in the comment
+ * in spa_checkpoint()).
+ */
+ spa->spa_checkpoint_txg = checkpoint.ub_txg;
+ spa->spa_checkpoint_info.sci_timestamp = checkpoint.ub_timestamp;
+
+ checkpoint.ub_checkpoint_txg = checkpoint.ub_txg;
+ VERIFY0(zap_add(spa->spa_dsl_pool->dp_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT,
+ sizeof (uint64_t), sizeof (uberblock_t) / sizeof (uint64_t),
+ &checkpoint, tx));
+
+ /*
+ * Increment the feature refcount and thus activate the feature.
+ * Note that the feature will be deactivated when we've
+ * completely discarded all checkpointed state (both vdev
+ * space maps and uberblock).
+ */
+ spa_feature_incr(spa, SPA_FEATURE_POOL_CHECKPOINT, tx);
+
+ spa_history_log_internal(spa, "spa checkpoint", tx,
+ "checkpointed uberblock txg=%llu", (u_longlong_t)checkpoint.ub_txg);
+}
+
+/*
+ * Create a checkpoint for the pool.
+ */
+int
+spa_checkpoint(const char *pool)
+{
+ int error;
+ spa_t *spa;
+
+ error = spa_open(pool, &spa, FTAG);
+ if (error != 0)
+ return (error);
+
+ mutex_enter(&spa->spa_vdev_top_lock);
+
+ /*
+ * Wait for current syncing txg to finish so the latest synced
+ * uberblock (spa_ubsync) has all the changes that we expect
+ * to see if we were to revert later to the checkpoint. In other
+ * words we want the checkpointed uberblock to include/reference
+ * all the changes that were pending at the time that we issued
+ * the checkpoint command.
+ */
+ txg_wait_synced(spa_get_dsl(spa), 0);
+
+ /*
+ * As the checkpointed uberblock references blocks from the previous
+ * txg (spa_ubsync), we want to ensure that we are not freeing any of
+ * these blocks in the same txg that the following synctask will
+ * run. Thus, we run it as an early synctask, so the dirty changes
+ * that are synced to disk afterwards during zios and other synctasks
+ * do not reuse checkpointed blocks.
+ */
+ error = dsl_early_sync_task(pool, spa_checkpoint_check,
+ spa_checkpoint_sync, NULL, 0, ZFS_SPACE_CHECK_NORMAL);
+
+ mutex_exit(&spa->spa_vdev_top_lock);
+
+ spa_close(spa, FTAG);
+ return (error);
+}
+
+/* ARGSUSED */
+static int
+spa_checkpoint_discard_check(void *arg, dmu_tx_t *tx)
+{
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+
+ if (!spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT))
+ return (SET_ERROR(ZFS_ERR_NO_CHECKPOINT));
+
+ if (spa->spa_checkpoint_txg == 0)
+ return (SET_ERROR(ZFS_ERR_DISCARDING_CHECKPOINT));
+
+ VERIFY0(zap_contains(spa_meta_objset(spa),
+ DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT));
+
+ return (0);
+}
+
+/* ARGSUSED */
+static void
+spa_checkpoint_discard_sync(void *arg, dmu_tx_t *tx)
+{
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+
+ VERIFY0(zap_remove(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_ZPOOL_CHECKPOINT, tx));
+
+ spa->spa_checkpoint_txg = 0;
+
+ zthr_wakeup(spa->spa_checkpoint_discard_zthr);
+
+ spa_history_log_internal(spa, "spa discard checkpoint", tx,
+ "started discarding checkpointed state from the pool");
+}
+
+/*
+ * Discard the checkpoint from a pool.
+ */
+int
+spa_checkpoint_discard(const char *pool)
+{
+ /*
+ * Similarly to spa_checkpoint(), we want our synctask to run
+ * before any pending dirty data are written to disk so they
+ * won't end up in the checkpoint's data structures (e.g.
+ * ms_checkpointing and vdev_checkpoint_sm) and re-create any
+ * space maps that the discarding open-context thread has
+ * deleted.
+ * [see spa_checkpoint_discard_sync and spa_checkpoint_discard_thread]
+ */
+ return (dsl_early_sync_task(pool, spa_checkpoint_discard_check,
+ spa_checkpoint_discard_sync, NULL, 0,
+ ZFS_SPACE_CHECK_DISCARD_CHECKPOINT));
+}
+
+EXPORT_SYMBOL(spa_checkpoint_get_stats);
+EXPORT_SYMBOL(spa_checkpoint_discard_thread);
+EXPORT_SYMBOL(spa_checkpoint_discard_thread_check);
+
+/* BEGIN CSTYLED */
+ZFS_MODULE_PARAM(zfs_spa, zfs_spa_, discard_memory_limit, ULONG, ZMOD_RW,
+ "Limit for memory used in prefetching the checkpoint space map done "
+ "on each vdev while discarding the checkpoint");
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/zfs/spa_config.c b/sys/contrib/openzfs/module/zfs/spa_config.c
new file mode 100644
index 000000000000..4a3144313267
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/spa_config.c
@@ -0,0 +1,623 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2011, 2020 by Delphix. All rights reserved.
+ * Copyright 2017 Joyent, Inc.
+ * Copyright (c) 2021, Colm Buckley <colm@tuatha.org>
+ */
+
+#include <sys/spa.h>
+#include <sys/file.h>
+#include <sys/fm/fs/zfs.h>
+#include <sys/spa_impl.h>
+#include <sys/nvpair.h>
+#include <sys/uio.h>
+#include <sys/fs/zfs.h>
+#include <sys/vdev_impl.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/systeminfo.h>
+#include <sys/sunddi.h>
+#include <sys/zfeature.h>
+#include <sys/zfs_file.h>
+#ifdef _KERNEL
+#include <sys/zone.h>
+#endif
+
+/*
+ * Pool configuration repository.
+ *
+ * Pool configuration is stored as a packed nvlist on the filesystem. By
+ * default, all pools are stored in /etc/zfs/zpool.cache and loaded on boot
+ * (when the ZFS module is loaded). Pools can also have the 'cachefile'
+ * property set that allows them to be stored in an alternate location under
+ * the control of external software.
+ *
+ * For each cache file, we have a single nvlist which holds all the
+ * configuration information. When the module loads, we read this information
+ * from /etc/zfs/zpool.cache and populate the SPA namespace. This namespace is
+ * maintained independently in spa.c. Whenever the namespace is modified, or
+ * the configuration of a pool is changed, we call spa_write_cachefile(), which
+ * walks through all the active pools and writes the configuration to disk.
+ */
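+
+/*
+ * Unpacked, a cache file is simply an nvlist keyed by pool name, e.g.
+ * (the pool names here are placeholders):
+ *
+ *	"tank"   -> { pool config nvlist }
+ *	"backup" -> { pool config nvlist }
+ */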
+
+static uint64_t spa_config_generation = 1;
+
+/*
+ * This can be overridden in userland to preserve an alternate namespace for
+ * userland pools when doing testing.
+ */
+char *spa_config_path = ZPOOL_CACHE;
+int zfs_autoimport_disable = 1;
+
+/*
+ * Called when the module is first loaded, this routine loads the configuration
+ * file into the SPA namespace. It does not actually open or load the pools; it
+ * only populates the namespace.
+ */
+void
+spa_config_load(void)
+{
+ void *buf = NULL;
+ nvlist_t *nvlist, *child;
+ nvpair_t *nvpair;
+ char *pathname;
+ zfs_file_t *fp;
+ zfs_file_attr_t zfa;
+ uint64_t fsize;
+ int err;
+
+#ifdef _KERNEL
+ if (zfs_autoimport_disable)
+ return;
+#endif
+
+ /*
+ * Open the configuration file.
+ */
+ pathname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
+
+ (void) snprintf(pathname, MAXPATHLEN, "%s", spa_config_path);
+
+ err = zfs_file_open(pathname, O_RDONLY, 0, &fp);
+
+#ifdef __FreeBSD__
+ if (err)
+ err = zfs_file_open(ZPOOL_CACHE_BOOT, O_RDONLY, 0, &fp);
+#endif
+ kmem_free(pathname, MAXPATHLEN);
+
+ if (err)
+ return;
+
+ if (zfs_file_getattr(fp, &zfa))
+ goto out;
+
+ fsize = zfa.zfa_size;
+ buf = kmem_alloc(fsize, KM_SLEEP);
+
+ /*
+ * Read the nvlist from the file.
+ */
+ if (zfs_file_read(fp, buf, fsize, NULL) < 0)
+ goto out;
+
+ /*
+ * Unpack the nvlist.
+ */
+ if (nvlist_unpack(buf, fsize, &nvlist, KM_SLEEP) != 0)
+ goto out;
+
+ /*
+ * Iterate over all elements in the nvlist, creating a new spa_t for
+ * each one with the specified configuration.
+ */
+ mutex_enter(&spa_namespace_lock);
+ nvpair = NULL;
+ while ((nvpair = nvlist_next_nvpair(nvlist, nvpair)) != NULL) {
+ if (nvpair_type(nvpair) != DATA_TYPE_NVLIST)
+ continue;
+
+ child = fnvpair_value_nvlist(nvpair);
+
+ if (spa_lookup(nvpair_name(nvpair)) != NULL)
+ continue;
+ (void) spa_add(nvpair_name(nvpair), child, NULL);
+ }
+ mutex_exit(&spa_namespace_lock);
+
+ nvlist_free(nvlist);
+
+out:
+ if (buf != NULL)
+ kmem_free(buf, fsize);
+
+ zfs_file_close(fp);
+}
+
+static int
+spa_config_remove(spa_config_dirent_t *dp)
+{
+ int error = 0;
+
+ /*
+ * Remove the cache file. If zfs_file_unlink() is not supported by the
+ * platform, fall back to truncating the file, which is functionally
+ * equivalent.
+ */
+ error = zfs_file_unlink(dp->scd_path);
+ if (error == EOPNOTSUPP) {
+ int flags = O_RDWR | O_TRUNC;
+ zfs_file_t *fp;
+
+ error = zfs_file_open(dp->scd_path, flags, 0644, &fp);
+ if (error == 0) {
+ (void) zfs_file_fsync(fp, O_SYNC);
+ (void) zfs_file_close(fp);
+ }
+ }
+
+ return (error);
+}
+
+static int
+spa_config_write(spa_config_dirent_t *dp, nvlist_t *nvl)
+{
+ size_t buflen;
+ char *buf;
+ int oflags = O_RDWR | O_TRUNC | O_CREAT | O_LARGEFILE;
+ char *temp;
+ int err;
+ zfs_file_t *fp;
+
+ /*
+ * If the nvlist is empty (NULL), then remove the old cachefile.
+ */
+ if (nvl == NULL) {
+ err = spa_config_remove(dp);
+ if (err == ENOENT)
+ err = 0;
+
+ return (err);
+ }
+
+ /*
+ * Pack the configuration into a buffer.
+ */
+ buf = fnvlist_pack(nvl, &buflen);
+ temp = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
+
+ /*
+ * Write the configuration to disk. Due to the complexity involved
+ * in performing a rename and remove from within the kernel the file
+ * is instead truncated and overwritten in place. This way we always
+ * have a consistent view of the data or a zero length file.
+ */
+ err = zfs_file_open(dp->scd_path, oflags, 0644, &fp);
+ if (err == 0) {
+ err = zfs_file_write(fp, buf, buflen, NULL);
+ if (err == 0)
+ err = zfs_file_fsync(fp, O_SYNC);
+
+ zfs_file_close(fp);
+ if (err)
+ (void) spa_config_remove(dp);
+ }
+ fnvlist_pack_free(buf, buflen);
+ kmem_free(temp, MAXPATHLEN);
+ return (err);
+}
+
+/*
+ * Synchronize pool configuration to disk. This must be called with the
+ * namespace lock held. Synchronizing the pool cache is typically done after
+ * the configuration has been synced to the MOS. This exposes a window where
+ * the MOS config will have been updated but the cache file has not. If
+ * the system were to crash at that instant then the cached config may not
+ * contain the correct information to open the pool and an explicit import
+ * would be required.
+ */
+void
+spa_write_cachefile(spa_t *target, boolean_t removing, boolean_t postsysevent)
+{
+ spa_config_dirent_t *dp, *tdp;
+ nvlist_t *nvl;
+ char *pool_name;
+ boolean_t ccw_failure;
+ int error = 0;
+
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+ if (!(spa_mode_global & SPA_MODE_WRITE))
+ return;
+
+ /*
+ * Iterate over all cachefiles for the pool, past or present. When the
+ * cachefile is changed, the new one is pushed onto this list, allowing
+ * us to update previous cachefiles that no longer contain this pool.
+ */
+ ccw_failure = B_FALSE;
+ for (dp = list_head(&target->spa_config_list); dp != NULL;
+ dp = list_next(&target->spa_config_list, dp)) {
+ spa_t *spa = NULL;
+ if (dp->scd_path == NULL)
+ continue;
+
+ /*
+ * Iterate over all pools, adding any matching pools to 'nvl'.
+ */
+ nvl = NULL;
+ while ((spa = spa_next(spa)) != NULL) {
+ /*
+ * Skip over our own pool if we're about to remove
+ * ourselves from the spa namespace or any pool that
+ * is readonly. Since we cannot guarantee that a
+ * readonly pool would successfully import upon reboot,
+ * we don't allow them to be written to the cache file.
+ */
+ if ((spa == target && removing) ||
+ !spa_writeable(spa))
+ continue;
+
+ mutex_enter(&spa->spa_props_lock);
+ tdp = list_head(&spa->spa_config_list);
+ if (spa->spa_config == NULL ||
+ tdp == NULL ||
+ tdp->scd_path == NULL ||
+ strcmp(tdp->scd_path, dp->scd_path) != 0) {
+ mutex_exit(&spa->spa_props_lock);
+ continue;
+ }
+
+ if (nvl == NULL)
+ nvl = fnvlist_alloc();
+
+ if (spa->spa_import_flags & ZFS_IMPORT_TEMP_NAME)
+ pool_name = fnvlist_lookup_string(
+ spa->spa_config, ZPOOL_CONFIG_POOL_NAME);
+ else
+ pool_name = spa_name(spa);
+
+ fnvlist_add_nvlist(nvl, pool_name, spa->spa_config);
+ mutex_exit(&spa->spa_props_lock);
+ }
+
+ error = spa_config_write(dp, nvl);
+ if (error != 0)
+ ccw_failure = B_TRUE;
+ nvlist_free(nvl);
+ }
+
+ if (ccw_failure) {
+ /*
+ * Keep trying so that configuration data is
+ * written if/when any temporary filesystem
+ * resource issues are resolved.
+ */
+ if (target->spa_ccw_fail_time == 0) {
+ (void) zfs_ereport_post(
+ FM_EREPORT_ZFS_CONFIG_CACHE_WRITE,
+ target, NULL, NULL, NULL, 0);
+ }
+ target->spa_ccw_fail_time = gethrtime();
+ spa_async_request(target, SPA_ASYNC_CONFIG_UPDATE);
+ } else {
+ /*
+ * Do not rate limit future attempts to update
+ * the config cache.
+ */
+ target->spa_ccw_fail_time = 0;
+ }
+
+ /*
+ * Remove any config entries older than the current one.
+ */
+ dp = list_head(&target->spa_config_list);
+ while ((tdp = list_next(&target->spa_config_list, dp)) != NULL) {
+ list_remove(&target->spa_config_list, tdp);
+ if (tdp->scd_path != NULL)
+ spa_strfree(tdp->scd_path);
+ kmem_free(tdp, sizeof (spa_config_dirent_t));
+ }
+
+ spa_config_generation++;
+
+ if (postsysevent)
+ spa_event_notify(target, NULL, NULL, ESC_ZFS_CONFIG_SYNC);
+}
+
+/*
+ * Sigh. Inside a local zone, we don't have access to /etc/zfs/zpool.cache,
+ * and we don't want to allow the local zone to see all the pools anyway.
+ * So we have to invent the ZFS_IOC_CONFIG ioctl to grab the configuration
+ * information for all pools visible within the zone.
+ */
+nvlist_t *
+spa_all_configs(uint64_t *generation)
+{
+ nvlist_t *pools;
+ spa_t *spa = NULL;
+
+ if (*generation == spa_config_generation)
+ return (NULL);
+
+ pools = fnvlist_alloc();
+
+ mutex_enter(&spa_namespace_lock);
+ while ((spa = spa_next(spa)) != NULL) {
+ if (INGLOBALZONE(curproc) ||
+ zone_dataset_visible(spa_name(spa), NULL)) {
+ mutex_enter(&spa->spa_props_lock);
+ fnvlist_add_nvlist(pools, spa_name(spa),
+ spa->spa_config);
+ mutex_exit(&spa->spa_props_lock);
+ }
+ }
+ *generation = spa_config_generation;
+ mutex_exit(&spa_namespace_lock);
+
+ return (pools);
+}
+
+void
+spa_config_set(spa_t *spa, nvlist_t *config)
+{
+ mutex_enter(&spa->spa_props_lock);
+ if (spa->spa_config != NULL && spa->spa_config != config)
+ nvlist_free(spa->spa_config);
+ spa->spa_config = config;
+ mutex_exit(&spa->spa_props_lock);
+}
+
+/*
+ * Generate the pool's configuration based on the current in-core state.
+ *
+ * We infer whether to generate a complete config or just one top-level config
+ * based on whether vd is the root vdev.
+ */
+nvlist_t *
+spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats)
+{
+ nvlist_t *config, *nvroot;
+ vdev_t *rvd = spa->spa_root_vdev;
+ unsigned long hostid = 0;
+ boolean_t locked = B_FALSE;
+ uint64_t split_guid;
+ char *pool_name;
+
+ if (vd == NULL) {
+ vd = rvd;
+ locked = B_TRUE;
+ spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
+ }
+
+ ASSERT(spa_config_held(spa, SCL_CONFIG | SCL_STATE, RW_READER) ==
+ (SCL_CONFIG | SCL_STATE));
+
+ /*
+ * If txg is -1, report the current value of spa->spa_config_txg.
+ */
+ if (txg == -1ULL)
+ txg = spa->spa_config_txg;
+
+ /*
+ * Originally, users had to handle spa namespace collisions by either
+ * exporting the already imported pool or by specifying a new name for
+ * the pool with a conflicting name. In the case of root pools from
+ * virtual guests, neither approach to collision resolution is
+ * reasonable. This is addressed by extending the new name syntax with
+ * an option to specify that the new name is temporary. When specified,
+ * ZFS_IMPORT_TEMP_NAME will be set in spa->spa_import_flags to tell us
+ * to use the previous name, which we do below.
+ */
+ if (spa->spa_import_flags & ZFS_IMPORT_TEMP_NAME) {
+ VERIFY0(nvlist_lookup_string(spa->spa_config,
+ ZPOOL_CONFIG_POOL_NAME, &pool_name));
+ } else
+ pool_name = spa_name(spa);
+
+ config = fnvlist_alloc();
+
+ fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, spa_version(spa));
+ fnvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, pool_name);
+ fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, spa_state(spa));
+ fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG, txg);
+ fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, spa_guid(spa));
+ fnvlist_add_uint64(config, ZPOOL_CONFIG_ERRATA, spa->spa_errata);
+ if (spa->spa_comment != NULL)
+ fnvlist_add_string(config, ZPOOL_CONFIG_COMMENT,
+ spa->spa_comment);
+ if (spa->spa_compatibility != NULL)
+ fnvlist_add_string(config, ZPOOL_CONFIG_COMPATIBILITY,
+ spa->spa_compatibility);
+
+ hostid = spa_get_hostid(spa);
+ if (hostid != 0)
+ fnvlist_add_uint64(config, ZPOOL_CONFIG_HOSTID, hostid);
+ fnvlist_add_string(config, ZPOOL_CONFIG_HOSTNAME, utsname()->nodename);
+
+ int config_gen_flags = 0;
+ if (vd != rvd) {
+ fnvlist_add_uint64(config, ZPOOL_CONFIG_TOP_GUID,
+ vd->vdev_top->vdev_guid);
+ fnvlist_add_uint64(config, ZPOOL_CONFIG_GUID,
+ vd->vdev_guid);
+ if (vd->vdev_isspare)
+ fnvlist_add_uint64(config,
+ ZPOOL_CONFIG_IS_SPARE, 1ULL);
+ if (vd->vdev_islog)
+ fnvlist_add_uint64(config,
+ ZPOOL_CONFIG_IS_LOG, 1ULL);
+ vd = vd->vdev_top; /* label contains top config */
+ } else {
+ /*
+ * Only add the (potentially large) split information
+ * in the mos config, and not in the vdev labels
+ */
+ if (spa->spa_config_splitting != NULL)
+ fnvlist_add_nvlist(config, ZPOOL_CONFIG_SPLIT,
+ spa->spa_config_splitting);
+
+ fnvlist_add_boolean(config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS);
+
+ config_gen_flags |= VDEV_CONFIG_MOS;
+ }
+
+ /*
+ * Add the top-level config. We even add this on pools which
+ * don't support holes in the namespace.
+ */
+ vdev_top_config_generate(spa, config);
+
+ /*
+ * If we're splitting, record the original pool's guid.
+ */
+ if (spa->spa_config_splitting != NULL &&
+ nvlist_lookup_uint64(spa->spa_config_splitting,
+ ZPOOL_CONFIG_SPLIT_GUID, &split_guid) == 0) {
+ fnvlist_add_uint64(config, ZPOOL_CONFIG_SPLIT_GUID, split_guid);
+ }
+
+ nvroot = vdev_config_generate(spa, vd, getstats, config_gen_flags);
+ fnvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot);
+ nvlist_free(nvroot);
+
+ /*
+ * Store what's necessary for reading the MOS in the label.
+ */
+ fnvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURES_FOR_READ,
+ spa->spa_label_features);
+
+ if (getstats && spa_load_state(spa) == SPA_LOAD_NONE) {
+ ddt_histogram_t *ddh;
+ ddt_stat_t *dds;
+ ddt_object_t *ddo;
+
+ ddh = kmem_zalloc(sizeof (ddt_histogram_t), KM_SLEEP);
+ ddt_get_dedup_histogram(spa, ddh);
+ fnvlist_add_uint64_array(config,
+ ZPOOL_CONFIG_DDT_HISTOGRAM,
+ (uint64_t *)ddh, sizeof (*ddh) / sizeof (uint64_t));
+ kmem_free(ddh, sizeof (ddt_histogram_t));
+
+ ddo = kmem_zalloc(sizeof (ddt_object_t), KM_SLEEP);
+ ddt_get_dedup_object_stats(spa, ddo);
+ fnvlist_add_uint64_array(config,
+ ZPOOL_CONFIG_DDT_OBJ_STATS,
+ (uint64_t *)ddo, sizeof (*ddo) / sizeof (uint64_t));
+ kmem_free(ddo, sizeof (ddt_object_t));
+
+ dds = kmem_zalloc(sizeof (ddt_stat_t), KM_SLEEP);
+ ddt_get_dedup_stats(spa, dds);
+ fnvlist_add_uint64_array(config,
+ ZPOOL_CONFIG_DDT_STATS,
+ (uint64_t *)dds, sizeof (*dds) / sizeof (uint64_t));
+ kmem_free(dds, sizeof (ddt_stat_t));
+ }
+
+ if (locked)
+ spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
+
+ return (config);
+}
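+
+/*
+ * A sketch of the two modes described above (getstats left at 0):
+ *
+ *	// complete pool config; the root vdev is implied and the config
+ *	// lock is taken internally:
+ *	nvlist_t *config = spa_config_generate(spa, NULL, txg, 0);
+ *
+ *	// config as stored in a given top-level vdev's label:
+ *	nvlist_t *label_cfg = spa_config_generate(spa, tvd, txg, 0);
+ */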
+
+/*
+ * Update all disk labels, generate a fresh config based on the current
+ * in-core state, and sync the global config cache (do not sync the config
+ * cache if this is a booting rootpool).
+ */
+void
+spa_config_update(spa_t *spa, int what)
+{
+ vdev_t *rvd = spa->spa_root_vdev;
+ uint64_t txg;
+ int c;
+
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+ txg = spa_last_synced_txg(spa) + 1;
+ if (what == SPA_CONFIG_UPDATE_POOL) {
+ vdev_config_dirty(rvd);
+ } else {
+ /*
+ * If we have top-level vdevs that were added but have
+ * not yet been prepared for allocation, do that now.
+ * (It's safe now because the config cache is up to date,
+ * so it will be able to translate the new DVAs.)
+ * See comments in spa_vdev_add() for full details.
+ */
+ for (c = 0; c < rvd->vdev_children; c++) {
+ vdev_t *tvd = rvd->vdev_child[c];
+
+ /*
+ * Explicitly skip vdevs that are indirect or
+ * log vdevs that are being removed. The reason
+ * is that both of those can have vdev_ms_array
+ * set to 0 and we wouldn't want to change their
+ * metaslab size nor call vdev_expand() on them.
+ */
+ if (!vdev_is_concrete(tvd) ||
+ (tvd->vdev_islog && tvd->vdev_removing))
+ continue;
+
+ if (tvd->vdev_ms_array == 0)
+ vdev_metaslab_set_size(tvd);
+ vdev_expand(tvd, txg);
+ }
+ }
+ spa_config_exit(spa, SCL_ALL, FTAG);
+
+ /*
+ * Wait for the mosconfig to be regenerated and synced.
+ */
+ txg_wait_synced(spa->spa_dsl_pool, txg);
+
+ /*
+ * Update the global config cache to reflect the new mosconfig.
+ */
+ if (!spa->spa_is_root) {
+ spa_write_cachefile(spa, B_FALSE,
+ what != SPA_CONFIG_UPDATE_POOL);
+ }
+
+ if (what == SPA_CONFIG_UPDATE_POOL)
+ spa_config_update(spa, SPA_CONFIG_UPDATE_VDEVS);
+}
+
+EXPORT_SYMBOL(spa_config_load);
+EXPORT_SYMBOL(spa_all_configs);
+EXPORT_SYMBOL(spa_config_set);
+EXPORT_SYMBOL(spa_config_generate);
+EXPORT_SYMBOL(spa_config_update);
+
+/* BEGIN CSTYLED */
+#ifdef __linux__
+/* string sysctls require a char array on FreeBSD */
+ZFS_MODULE_PARAM(zfs_spa, spa_, config_path, STRING, ZMOD_RD,
+ "SPA config file (/etc/zfs/zpool.cache)");
+#endif
+
+ZFS_MODULE_PARAM(zfs, zfs_, autoimport_disable, INT, ZMOD_RW,
+ "Disable pool import at module load");
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/zfs/spa_errlog.c b/sys/contrib/openzfs/module/zfs/spa_errlog.c
new file mode 100644
index 000000000000..fa5120eb61b3
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/spa_errlog.c
@@ -0,0 +1,416 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013, 2014 by Delphix. All rights reserved.
+ */
+
+/*
+ * Routines to manage the on-disk persistent error log.
+ *
+ * Each pool stores a log of all logical data errors seen during normal
+ * operation. This is actually the union of two distinct logs: the last log,
+ * and the current log. All errors seen are logged to the current log. When a
+ * scrub completes, the old last log is thrown out, the current log becomes
+ * the new last log, and a fresh current log is started. This way, if an error
+ * is somehow
+ * corrected, a new scrub will show that it no longer exists, and will be
+ * deleted from the log when the scrub completes.
+ *
+ * The log is stored using a ZAP object whose key is a string form of the
+ * zbookmark_phys tuple (objset, object, level, blkid), and whose contents is an
+ * optional 'objset:object' human-readable string describing the data. When an
+ * error is first logged, this string will be empty, indicating that no name is
+ * known. This prevents us from having to issue a potentially large amount of
+ * I/O to discover the object name during an error path. Instead, we do the
+ * calculation when the data is requested, storing the result so future queries
+ * will be faster.
+ *
+ * This log is then shipped into an nvlist where the key is the dataset name and
+ * the value is the object name. Userland is then responsible for uniquifying
+ * this list and displaying it to the user.
+ */
+
+#include <sys/dmu_tx.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/zap.h>
+#include <sys/zio.h>
+
+
+/*
+ * Convert a bookmark to a string.
+ */
+static void
+bookmark_to_name(zbookmark_phys_t *zb, char *buf, size_t len)
+{
+ (void) snprintf(buf, len, "%llx:%llx:%llx:%llx",
+ (u_longlong_t)zb->zb_objset, (u_longlong_t)zb->zb_object,
+ (u_longlong_t)zb->zb_level, (u_longlong_t)zb->zb_blkid);
+}
+
+/*
+ * Convert a string to a bookmark
+ */
+#ifdef _KERNEL
+static void
+name_to_bookmark(char *buf, zbookmark_phys_t *zb)
+{
+ zb->zb_objset = zfs_strtonum(buf, &buf);
+ ASSERT(*buf == ':');
+ zb->zb_object = zfs_strtonum(buf + 1, &buf);
+ ASSERT(*buf == ':');
+ zb->zb_level = (int)zfs_strtonum(buf + 1, &buf);
+ ASSERT(*buf == ':');
+ zb->zb_blkid = zfs_strtonum(buf + 1, &buf);
+ ASSERT(*buf == '\0');
+}
+#endif
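A minimal userland sketch (not part of the patch) of the ZAP key format that bookmark_to_name() emits and name_to_bookmark() parses, using strtoull() in place of the kernel-only zfs_strtonum():

#include <stdio.h>
#include <stdlib.h>
#include <inttypes.h>

int
main(void)
{
	uint64_t objset = 0x36, object = 0x1c441, level = 0, blkid = 0x7;
	char key[64], *p;

	/* objset:object:level:blkid, all in hex, as in bookmark_to_name() */
	(void) snprintf(key, sizeof (key), "%llx:%llx:%llx:%llx",
	    (unsigned long long)objset, (unsigned long long)object,
	    (unsigned long long)level, (unsigned long long)blkid);
	(void) printf("ZAP key: %s\n", key);		/* "36:1c441:0:7" */

	/* Parse it back the way name_to_bookmark() does. */
	objset = strtoull(key, &p, 16);
	object = strtoull(p + 1, &p, 16);
	level = strtoull(p + 1, &p, 16);
	blkid = strtoull(p + 1, &p, 16);
	(void) printf("objset=%" PRIx64 " object=%" PRIx64 " level=%" PRIx64
	    " blkid=%" PRIx64 "\n", objset, object, level, blkid);
	return (0);
}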
+
+/*
+ * Log an uncorrectable error to the persistent error log. We add it to the
+ * spa's list of pending errors. The changes are actually synced out to disk
+ * during spa_errlog_sync().
+ */
+void
+spa_log_error(spa_t *spa, const zbookmark_phys_t *zb)
+{
+ spa_error_entry_t search;
+ spa_error_entry_t *new;
+ avl_tree_t *tree;
+ avl_index_t where;
+
+ /*
+ * If we are trying to import a pool, ignore any errors, as we won't be
+ * writing to the pool any time soon.
+ */
+ if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT)
+ return;
+
+ mutex_enter(&spa->spa_errlist_lock);
+
+ /*
+ * If we have had a request to rotate the log, log it to the next list
+ * instead of the current one.
+ */
+ if (spa->spa_scrub_active || spa->spa_scrub_finished)
+ tree = &spa->spa_errlist_scrub;
+ else
+ tree = &spa->spa_errlist_last;
+
+ search.se_bookmark = *zb;
+ if (avl_find(tree, &search, &where) != NULL) {
+ mutex_exit(&spa->spa_errlist_lock);
+ return;
+ }
+
+ new = kmem_zalloc(sizeof (spa_error_entry_t), KM_SLEEP);
+ new->se_bookmark = *zb;
+ avl_insert(tree, new, where);
+
+ mutex_exit(&spa->spa_errlist_lock);
+}
+
+/*
+ * Return the number of errors currently in the error log. This is actually the
+ * sum of both the last log and the current log, since we don't know the union
+ * of these logs until we reach userland.
+ */
+uint64_t
+spa_get_errlog_size(spa_t *spa)
+{
+ uint64_t total = 0, count;
+
+ mutex_enter(&spa->spa_errlog_lock);
+ if (spa->spa_errlog_scrub != 0 &&
+ zap_count(spa->spa_meta_objset, spa->spa_errlog_scrub,
+ &count) == 0)
+ total += count;
+
+ if (spa->spa_errlog_last != 0 && !spa->spa_scrub_finished &&
+ zap_count(spa->spa_meta_objset, spa->spa_errlog_last,
+ &count) == 0)
+ total += count;
+ mutex_exit(&spa->spa_errlog_lock);
+
+ mutex_enter(&spa->spa_errlist_lock);
+ total += avl_numnodes(&spa->spa_errlist_last);
+ total += avl_numnodes(&spa->spa_errlist_scrub);
+ mutex_exit(&spa->spa_errlist_lock);
+
+ return (total);
+}
+
+#ifdef _KERNEL
+static int
+process_error_log(spa_t *spa, uint64_t obj, void *addr, size_t *count)
+{
+ zap_cursor_t zc;
+ zap_attribute_t za;
+ zbookmark_phys_t zb;
+
+ if (obj == 0)
+ return (0);
+
+ for (zap_cursor_init(&zc, spa->spa_meta_objset, obj);
+ zap_cursor_retrieve(&zc, &za) == 0;
+ zap_cursor_advance(&zc)) {
+
+ if (*count == 0) {
+ zap_cursor_fini(&zc);
+ return (SET_ERROR(ENOMEM));
+ }
+
+ name_to_bookmark(za.za_name, &zb);
+
+ if (copyout(&zb, (char *)addr +
+ (*count - 1) * sizeof (zbookmark_phys_t),
+ sizeof (zbookmark_phys_t)) != 0) {
+ zap_cursor_fini(&zc);
+ return (SET_ERROR(EFAULT));
+ }
+
+ *count -= 1;
+ }
+
+ zap_cursor_fini(&zc);
+
+ return (0);
+}
+
+static int
+process_error_list(avl_tree_t *list, void *addr, size_t *count)
+{
+ spa_error_entry_t *se;
+
+ for (se = avl_first(list); se != NULL; se = AVL_NEXT(list, se)) {
+
+ if (*count == 0)
+ return (SET_ERROR(ENOMEM));
+
+ if (copyout(&se->se_bookmark, (char *)addr +
+ (*count - 1) * sizeof (zbookmark_phys_t),
+ sizeof (zbookmark_phys_t)) != 0)
+ return (SET_ERROR(EFAULT));
+
+ *count -= 1;
+ }
+
+ return (0);
+}
+#endif
+
+/*
+ * Copy all known errors to userland as an array of bookmarks. This is
+ * actually a union of the on-disk last log and current log, as well as any
+ * pending error requests.
+ *
+ * Because the act of reading the on-disk log could cause errors to be
+ * generated, we have two separate locks: one for the error log and one for the
+ * in-core error lists. We only need the error list lock to log an error, so
+ * we grab the error log lock while we read the on-disk logs, and only pick up
+ * the error list lock when we are finished.
+ */
+int
+spa_get_errlog(spa_t *spa, void *uaddr, size_t *count)
+{
+ int ret = 0;
+
+#ifdef _KERNEL
+ mutex_enter(&spa->spa_errlog_lock);
+
+ ret = process_error_log(spa, spa->spa_errlog_scrub, uaddr, count);
+
+ if (!ret && !spa->spa_scrub_finished)
+ ret = process_error_log(spa, spa->spa_errlog_last, uaddr,
+ count);
+
+ mutex_enter(&spa->spa_errlist_lock);
+ if (!ret)
+ ret = process_error_list(&spa->spa_errlist_scrub, uaddr,
+ count);
+ if (!ret)
+ ret = process_error_list(&spa->spa_errlist_last, uaddr,
+ count);
+ mutex_exit(&spa->spa_errlist_lock);
+
+ mutex_exit(&spa->spa_errlog_lock);
+#endif
+
+ return (ret);
+}
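Note that process_error_log() and process_error_list() fill the caller's array from the back, decrementing *count as they go, so after spa_get_errlog() returns the valid bookmarks live at indices [*count, original count). A small userland sketch of that convention (not part of the patch; bookmark_t and fill_from_back() are illustrative stand-ins for zbookmark_phys_t and the kernel code):

#include <stdio.h>

typedef struct {
	unsigned long long objset, object, level, blkid;
} bookmark_t;

static int
fill_from_back(const bookmark_t *src, int nsrc, bookmark_t *dst, size_t *count)
{
	for (int i = 0; i < nsrc; i++) {
		if (*count == 0)
			return (-1);	/* caller's buffer was too small */
		dst[*count - 1] = src[i];
		*count -= 1;
	}
	return (0);
}

int
main(void)
{
	bookmark_t pending[2] = { { 0x36, 0x10, 0, 1 }, { 0x36, 0x11, 0, 2 } };
	bookmark_t out[8];
	size_t count = 8;

	(void) fill_from_back(pending, 2, out, &count);
	(void) printf("valid entries start at index %zu\n", count);	/* 6 */
	return (0);
}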
+
+/*
+ * Called when a scrub completes. This simply sets a bit telling us which AVL
+ * tree to add new errors to. spa_errlog_sync() is responsible for actually
+ * syncing the changes to the underlying objects.
+ */
+void
+spa_errlog_rotate(spa_t *spa)
+{
+ mutex_enter(&spa->spa_errlist_lock);
+ spa->spa_scrub_finished = B_TRUE;
+ mutex_exit(&spa->spa_errlist_lock);
+}
+
+/*
+ * Discard any pending errors from the spa_t. Called when unloading a faulted
+ * pool, as the errors encountered during the open cannot be synced to disk.
+ */
+void
+spa_errlog_drain(spa_t *spa)
+{
+ spa_error_entry_t *se;
+ void *cookie;
+
+ mutex_enter(&spa->spa_errlist_lock);
+
+ cookie = NULL;
+ while ((se = avl_destroy_nodes(&spa->spa_errlist_last,
+ &cookie)) != NULL)
+ kmem_free(se, sizeof (spa_error_entry_t));
+ cookie = NULL;
+ while ((se = avl_destroy_nodes(&spa->spa_errlist_scrub,
+ &cookie)) != NULL)
+ kmem_free(se, sizeof (spa_error_entry_t));
+
+ mutex_exit(&spa->spa_errlist_lock);
+}
+
+/*
+ * Process a list of errors into the current on-disk log.
+ */
+static void
+sync_error_list(spa_t *spa, avl_tree_t *t, uint64_t *obj, dmu_tx_t *tx)
+{
+ spa_error_entry_t *se;
+ char buf[64];
+ void *cookie;
+
+ if (avl_numnodes(t) != 0) {
+ /* create log if necessary */
+ if (*obj == 0)
+ *obj = zap_create(spa->spa_meta_objset,
+ DMU_OT_ERROR_LOG, DMU_OT_NONE,
+ 0, tx);
+
+ /* add errors to the current log */
+ for (se = avl_first(t); se != NULL; se = AVL_NEXT(t, se)) {
+ char *name = se->se_name ? se->se_name : "";
+
+ bookmark_to_name(&se->se_bookmark, buf, sizeof (buf));
+
+ (void) zap_update(spa->spa_meta_objset,
+ *obj, buf, 1, strlen(name) + 1, name, tx);
+ }
+
+ /* purge the error list */
+ cookie = NULL;
+ while ((se = avl_destroy_nodes(t, &cookie)) != NULL)
+ kmem_free(se, sizeof (spa_error_entry_t));
+ }
+}
+
+/*
+ * Sync the error log out to disk. This is a little tricky because the act of
+ * writing the error log requires the spa_errlist_lock. So, we need to lock the
+ * error lists, take a copy of the lists, and then reinitialize them. Then, we
+ * drop the error list lock and take the error log lock, at which point we
+ * do the errlog processing. Then, if we encounter an I/O error during this
+ * process, we can successfully add the error to the list. Note that this will
+ * result in the perpetual recycling of errors, but it is an unlikely situation
+ * and not a performance critical operation.
+ */
+void
+spa_errlog_sync(spa_t *spa, uint64_t txg)
+{
+ dmu_tx_t *tx;
+ avl_tree_t scrub, last;
+ int scrub_finished;
+
+ mutex_enter(&spa->spa_errlist_lock);
+
+ /*
+ * Bail out early under normal circumstances.
+ */
+ if (avl_numnodes(&spa->spa_errlist_scrub) == 0 &&
+ avl_numnodes(&spa->spa_errlist_last) == 0 &&
+ !spa->spa_scrub_finished) {
+ mutex_exit(&spa->spa_errlist_lock);
+ return;
+ }
+
+ spa_get_errlists(spa, &last, &scrub);
+ scrub_finished = spa->spa_scrub_finished;
+ spa->spa_scrub_finished = B_FALSE;
+
+ mutex_exit(&spa->spa_errlist_lock);
+ mutex_enter(&spa->spa_errlog_lock);
+
+ tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
+
+ /*
+ * Sync out the current list of errors.
+ */
+ sync_error_list(spa, &last, &spa->spa_errlog_last, tx);
+
+ /*
+ * Rotate the log if necessary.
+ */
+ if (scrub_finished) {
+ if (spa->spa_errlog_last != 0)
+ VERIFY(dmu_object_free(spa->spa_meta_objset,
+ spa->spa_errlog_last, tx) == 0);
+ spa->spa_errlog_last = spa->spa_errlog_scrub;
+ spa->spa_errlog_scrub = 0;
+
+ sync_error_list(spa, &scrub, &spa->spa_errlog_last, tx);
+ }
+
+ /*
+ * Sync out any pending scrub errors.
+ */
+ sync_error_list(spa, &scrub, &spa->spa_errlog_scrub, tx);
+
+ /*
+ * Update the MOS to reflect the new values.
+ */
+ (void) zap_update(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_ERRLOG_LAST, sizeof (uint64_t), 1,
+ &spa->spa_errlog_last, tx);
+ (void) zap_update(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_ERRLOG_SCRUB, sizeof (uint64_t), 1,
+ &spa->spa_errlog_scrub, tx);
+
+ dmu_tx_commit(tx);
+
+ mutex_exit(&spa->spa_errlog_lock);
+}
+
+#if defined(_KERNEL)
+/* error handling */
+EXPORT_SYMBOL(spa_log_error);
+EXPORT_SYMBOL(spa_get_errlog_size);
+EXPORT_SYMBOL(spa_get_errlog);
+EXPORT_SYMBOL(spa_errlog_rotate);
+EXPORT_SYMBOL(spa_errlog_drain);
+EXPORT_SYMBOL(spa_errlog_sync);
+EXPORT_SYMBOL(spa_get_errlists);
+#endif
diff --git a/sys/contrib/openzfs/module/zfs/spa_history.c b/sys/contrib/openzfs/module/zfs/spa_history.c
new file mode 100644
index 000000000000..0482e0f6c39d
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/spa_history.c
@@ -0,0 +1,634 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
+ * Copyright 2017 Joyent, Inc.
+ */
+
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/zap.h>
+#include <sys/dsl_synctask.h>
+#include <sys/dmu_tx.h>
+#include <sys/dmu_objset.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/cmn_err.h>
+#include <sys/sunddi.h>
+#include <sys/cred.h>
+#include "zfs_comutil.h"
+#include "zfs_gitrev.h"
+#ifdef _KERNEL
+#include <sys/zone.h>
+#endif
+
+/*
+ * Routines to manage the on-disk history log.
+ *
+ * The history log is stored as a dmu object containing
+ * <packed record length, record nvlist> tuples.
+ *
+ * Where "record nvlist" is an nvlist containing uint64_ts and strings, and
+ * "packed record length" is the packed length of the "record nvlist" stored
+ * as a little endian uint64_t.
+ *
+ * The log is implemented as a ring buffer, though the original creation
+ * of the pool ('zpool create') is never overwritten.
+ *
+ * The history log is tracked as object 'spa_t::spa_history'. The bonus buffer
+ * of 'spa_history' stores the offsets for logging/retrieving history as
+ * 'spa_history_phys_t'. 'sh_pool_create_len' is the ending offset in bytes of
+ * where the 'zpool create' record is stored. This allows us to never
+ * overwrite the original creation of the pool. 'sh_phys_max_off' is the
+ * physical ending offset in bytes of the log. This tells you the length of
+ * the buffer. 'sh_eof' is the logical EOF (in bytes). Whenever a record
+ * is added, 'sh_eof' is incremented by the size of the record.
+ * 'sh_eof' is never decremented. 'sh_bof' is the logical BOF (in bytes).
+ * This is where the consumer should start reading from after reading in
+ * the 'zpool create' portion of the log.
+ *
+ * 'sh_records_lost' keeps track of how many records have been overwritten
+ * and permanently lost.
+ */
+
+/* convert a logical offset to physical */
+static uint64_t
+spa_history_log_to_phys(uint64_t log_off, spa_history_phys_t *shpp)
+{
+ uint64_t phys_len;
+
+ phys_len = shpp->sh_phys_max_off - shpp->sh_pool_create_len;
+ return ((log_off - shpp->sh_pool_create_len) % phys_len
+ + shpp->sh_pool_create_len);
+}
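A standalone sketch (not part of the patch) applying the same mapping with concrete numbers: with sh_pool_create_len = 512 and sh_phys_max_off = 2048 the wrapping region is 1536 bytes long, so logical offsets 512, 2048, and 3584 all land on physical offset 512:

#include <stdio.h>
#include <stdint.h>

static uint64_t
log_to_phys(uint64_t log_off, uint64_t pool_create_len, uint64_t phys_max_off)
{
	uint64_t phys_len = phys_max_off - pool_create_len;

	/* Same arithmetic as spa_history_log_to_phys(). */
	return ((log_off - pool_create_len) % phys_len + pool_create_len);
}

int
main(void)
{
	for (uint64_t off = 512; off <= 3584; off += 1536)
		(void) printf("logical %4llu -> physical %llu\n",
		    (unsigned long long)off,
		    (unsigned long long)log_to_phys(off, 512, 2048));
	return (0);
}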
+
+void
+spa_history_create_obj(spa_t *spa, dmu_tx_t *tx)
+{
+ dmu_buf_t *dbp;
+ spa_history_phys_t *shpp;
+ objset_t *mos = spa->spa_meta_objset;
+
+ ASSERT0(spa->spa_history);
+ spa->spa_history = dmu_object_alloc(mos, DMU_OT_SPA_HISTORY,
+ SPA_OLD_MAXBLOCKSIZE, DMU_OT_SPA_HISTORY_OFFSETS,
+ sizeof (spa_history_phys_t), tx);
+
+ VERIFY0(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_HISTORY, sizeof (uint64_t), 1,
+ &spa->spa_history, tx));
+
+ VERIFY0(dmu_bonus_hold(mos, spa->spa_history, FTAG, &dbp));
+ ASSERT3U(dbp->db_size, >=, sizeof (spa_history_phys_t));
+
+ shpp = dbp->db_data;
+ dmu_buf_will_dirty(dbp, tx);
+
+ /*
+ * Figure out maximum size of history log. We set it at
+ * 0.1% of pool size, with a max of 1G and min of 128KB.
+ */
+ shpp->sh_phys_max_off =
+ metaslab_class_get_dspace(spa_normal_class(spa)) / 1000;
+ shpp->sh_phys_max_off = MIN(shpp->sh_phys_max_off, 1<<30);
+ shpp->sh_phys_max_off = MAX(shpp->sh_phys_max_off, 128<<10);
+
+ dmu_buf_rele(dbp, FTAG);
+}
+
+/*
+ * Change 'sh_bof' to the beginning of the next record.
+ */
+static int
+spa_history_advance_bof(spa_t *spa, spa_history_phys_t *shpp)
+{
+ objset_t *mos = spa->spa_meta_objset;
+ uint64_t firstread, reclen, phys_bof;
+ char buf[sizeof (reclen)];
+ int err;
+
+ phys_bof = spa_history_log_to_phys(shpp->sh_bof, shpp);
+ firstread = MIN(sizeof (reclen), shpp->sh_phys_max_off - phys_bof);
+
+ if ((err = dmu_read(mos, spa->spa_history, phys_bof, firstread,
+ buf, DMU_READ_PREFETCH)) != 0)
+ return (err);
+ if (firstread != sizeof (reclen)) {
+ if ((err = dmu_read(mos, spa->spa_history,
+ shpp->sh_pool_create_len, sizeof (reclen) - firstread,
+ buf + firstread, DMU_READ_PREFETCH)) != 0)
+ return (err);
+ }
+
+ reclen = LE_64(*((uint64_t *)buf));
+ shpp->sh_bof += reclen + sizeof (reclen);
+ shpp->sh_records_lost++;
+ return (0);
+}
+
+static int
+spa_history_write(spa_t *spa, void *buf, uint64_t len, spa_history_phys_t *shpp,
+ dmu_tx_t *tx)
+{
+ uint64_t firstwrite, phys_eof;
+ objset_t *mos = spa->spa_meta_objset;
+ int err;
+
+ ASSERT(MUTEX_HELD(&spa->spa_history_lock));
+
+ /* see if we need to reset logical BOF */
+ while (shpp->sh_phys_max_off - shpp->sh_pool_create_len -
+ (shpp->sh_eof - shpp->sh_bof) <= len) {
+ if ((err = spa_history_advance_bof(spa, shpp)) != 0) {
+ return (err);
+ }
+ }
+
+ phys_eof = spa_history_log_to_phys(shpp->sh_eof, shpp);
+ firstwrite = MIN(len, shpp->sh_phys_max_off - phys_eof);
+ shpp->sh_eof += len;
+ dmu_write(mos, spa->spa_history, phys_eof, firstwrite, buf, tx);
+
+ len -= firstwrite;
+ if (len > 0) {
+ /* write out the rest at the beginning of physical file */
+ dmu_write(mos, spa->spa_history, shpp->sh_pool_create_len,
+ len, (char *)buf + firstwrite, tx);
+ }
+
+ return (0);
+}
+
+/*
+ * Post a history sysevent.
+ *
+ * The nvlist_t* passed into this function will be transformed into a new
+ * nvlist where:
+ *
+ * 1. Nested nvlists will be flattened to a single level
+ * 2. Keys will have their names normalized (to remove any problematic
+ * characters, such as whitespace)
+ *
+ * The nvlist_t passed into this function will be duplicated and should be
+ * freed by the caller.
+ *
+ */
+static void
+spa_history_log_notify(spa_t *spa, nvlist_t *nvl)
+{
+ nvlist_t *hist_nvl = fnvlist_alloc();
+ uint64_t uint64;
+ char *string;
+
+ if (nvlist_lookup_string(nvl, ZPOOL_HIST_CMD, &string) == 0)
+ fnvlist_add_string(hist_nvl, ZFS_EV_HIST_CMD, string);
+
+ if (nvlist_lookup_string(nvl, ZPOOL_HIST_INT_NAME, &string) == 0)
+ fnvlist_add_string(hist_nvl, ZFS_EV_HIST_INT_NAME, string);
+
+ if (nvlist_lookup_string(nvl, ZPOOL_HIST_ZONE, &string) == 0)
+ fnvlist_add_string(hist_nvl, ZFS_EV_HIST_ZONE, string);
+
+ if (nvlist_lookup_string(nvl, ZPOOL_HIST_HOST, &string) == 0)
+ fnvlist_add_string(hist_nvl, ZFS_EV_HIST_HOST, string);
+
+ if (nvlist_lookup_string(nvl, ZPOOL_HIST_DSNAME, &string) == 0)
+ fnvlist_add_string(hist_nvl, ZFS_EV_HIST_DSNAME, string);
+
+ if (nvlist_lookup_string(nvl, ZPOOL_HIST_INT_STR, &string) == 0)
+ fnvlist_add_string(hist_nvl, ZFS_EV_HIST_INT_STR, string);
+
+ if (nvlist_lookup_string(nvl, ZPOOL_HIST_IOCTL, &string) == 0)
+ fnvlist_add_string(hist_nvl, ZFS_EV_HIST_IOCTL, string);
+
+ if (nvlist_lookup_string(nvl, ZPOOL_HIST_INT_NAME, &string) == 0)
+ fnvlist_add_string(hist_nvl, ZFS_EV_HIST_INT_NAME, string);
+
+ if (nvlist_lookup_uint64(nvl, ZPOOL_HIST_DSID, &uint64) == 0)
+ fnvlist_add_uint64(hist_nvl, ZFS_EV_HIST_DSID, uint64);
+
+ if (nvlist_lookup_uint64(nvl, ZPOOL_HIST_TXG, &uint64) == 0)
+ fnvlist_add_uint64(hist_nvl, ZFS_EV_HIST_TXG, uint64);
+
+ if (nvlist_lookup_uint64(nvl, ZPOOL_HIST_TIME, &uint64) == 0)
+ fnvlist_add_uint64(hist_nvl, ZFS_EV_HIST_TIME, uint64);
+
+ if (nvlist_lookup_uint64(nvl, ZPOOL_HIST_WHO, &uint64) == 0)
+ fnvlist_add_uint64(hist_nvl, ZFS_EV_HIST_WHO, uint64);
+
+ if (nvlist_lookup_uint64(nvl, ZPOOL_HIST_INT_EVENT, &uint64) == 0)
+ fnvlist_add_uint64(hist_nvl, ZFS_EV_HIST_INT_EVENT, uint64);
+
+ spa_event_notify(spa, NULL, hist_nvl, ESC_ZFS_HISTORY_EVENT);
+
+ nvlist_free(hist_nvl);
+}
+
+/*
+ * Write out a history event.
+ */
+/*ARGSUSED*/
+static void
+spa_history_log_sync(void *arg, dmu_tx_t *tx)
+{
+ nvlist_t *nvl = arg;
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+ objset_t *mos = spa->spa_meta_objset;
+ dmu_buf_t *dbp;
+ spa_history_phys_t *shpp;
+ size_t reclen;
+ uint64_t le_len;
+ char *record_packed = NULL;
+ int ret;
+
+ /*
+ * If we have an older pool that doesn't have a command
+ * history object, create it now.
+ */
+ mutex_enter(&spa->spa_history_lock);
+ if (!spa->spa_history)
+ spa_history_create_obj(spa, tx);
+ mutex_exit(&spa->spa_history_lock);
+
+ /*
+ * Get the offset of where we need to write via the bonus buffer.
+ * Update the offset when the write completes.
+ */
+ VERIFY0(dmu_bonus_hold(mos, spa->spa_history, FTAG, &dbp));
+ shpp = dbp->db_data;
+
+ dmu_buf_will_dirty(dbp, tx);
+
+#ifdef ZFS_DEBUG
+ {
+ dmu_object_info_t doi;
+ dmu_object_info_from_db(dbp, &doi);
+ ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_SPA_HISTORY_OFFSETS);
+ }
+#endif
+
+ fnvlist_add_string(nvl, ZPOOL_HIST_HOST, utsname()->nodename);
+
+ if (nvlist_exists(nvl, ZPOOL_HIST_CMD)) {
+ zfs_dbgmsg("command: %s",
+ fnvlist_lookup_string(nvl, ZPOOL_HIST_CMD));
+ } else if (nvlist_exists(nvl, ZPOOL_HIST_INT_NAME)) {
+ if (nvlist_exists(nvl, ZPOOL_HIST_DSNAME)) {
+ zfs_dbgmsg("txg %lld %s %s (id %llu) %s",
+ fnvlist_lookup_uint64(nvl, ZPOOL_HIST_TXG),
+ fnvlist_lookup_string(nvl, ZPOOL_HIST_INT_NAME),
+ fnvlist_lookup_string(nvl, ZPOOL_HIST_DSNAME),
+ fnvlist_lookup_uint64(nvl, ZPOOL_HIST_DSID),
+ fnvlist_lookup_string(nvl, ZPOOL_HIST_INT_STR));
+ } else {
+ zfs_dbgmsg("txg %lld %s %s",
+ fnvlist_lookup_uint64(nvl, ZPOOL_HIST_TXG),
+ fnvlist_lookup_string(nvl, ZPOOL_HIST_INT_NAME),
+ fnvlist_lookup_string(nvl, ZPOOL_HIST_INT_STR));
+ }
+ /*
+ * The history sysevent is posted only for internal history
+ * messages to show what has happened, not how it happened. For
+ * example, the following command:
+ *
+ * # zfs destroy -r tank/foo
+ *
+ * will result in one sysevent posted per dataset that is
+ * destroyed as a result of the command - which could be more
+ * than one event in total. By contrast, if the sysevent was
+ * posted as a result of the ZPOOL_HIST_CMD key being present
+ * it would result in only one sysevent being posted with the
+ * full command line arguments, requiring the consumer to know
+ * how to parse and understand zfs(8) command invocations.
+ */
+ spa_history_log_notify(spa, nvl);
+ } else if (nvlist_exists(nvl, ZPOOL_HIST_IOCTL)) {
+ zfs_dbgmsg("ioctl %s",
+ fnvlist_lookup_string(nvl, ZPOOL_HIST_IOCTL));
+ }
+
+ VERIFY3U(nvlist_pack(nvl, &record_packed, &reclen, NV_ENCODE_NATIVE,
+ KM_SLEEP), ==, 0);
+
+ mutex_enter(&spa->spa_history_lock);
+
+ /* write out the packed length as little endian */
+ le_len = LE_64((uint64_t)reclen);
+ ret = spa_history_write(spa, &le_len, sizeof (le_len), shpp, tx);
+ if (!ret)
+ ret = spa_history_write(spa, record_packed, reclen, shpp, tx);
+
+ /* The first command is the create, which we keep forever */
+ if (ret == 0 && shpp->sh_pool_create_len == 0 &&
+ nvlist_exists(nvl, ZPOOL_HIST_CMD)) {
+ shpp->sh_pool_create_len = shpp->sh_bof = shpp->sh_eof;
+ }
+
+ mutex_exit(&spa->spa_history_lock);
+ fnvlist_pack_free(record_packed, reclen);
+ dmu_buf_rele(dbp, FTAG);
+ fnvlist_free(nvl);
+}
+
+/*
+ * Write out a history event.
+ */
+int
+spa_history_log(spa_t *spa, const char *msg)
+{
+ int err;
+ nvlist_t *nvl = fnvlist_alloc();
+
+ fnvlist_add_string(nvl, ZPOOL_HIST_CMD, msg);
+ err = spa_history_log_nvl(spa, nvl);
+ fnvlist_free(nvl);
+ return (err);
+}
+
+int
+spa_history_log_nvl(spa_t *spa, nvlist_t *nvl)
+{
+ int err = 0;
+ dmu_tx_t *tx;
+ nvlist_t *nvarg, *in_nvl = NULL;
+
+ if (spa_version(spa) < SPA_VERSION_ZPOOL_HISTORY || !spa_writeable(spa))
+ return (SET_ERROR(EINVAL));
+
+ err = nvlist_lookup_nvlist(nvl, ZPOOL_HIST_INPUT_NVL, &in_nvl);
+ if (err == 0) {
+ (void) nvlist_remove_all(in_nvl, ZPOOL_HIDDEN_ARGS);
+ }
+
+ tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
+ err = dmu_tx_assign(tx, TXG_WAIT);
+ if (err) {
+ dmu_tx_abort(tx);
+ return (err);
+ }
+
+ VERIFY0(nvlist_dup(nvl, &nvarg, KM_SLEEP));
+ if (spa_history_zone() != NULL) {
+ fnvlist_add_string(nvarg, ZPOOL_HIST_ZONE,
+ spa_history_zone());
+ }
+ fnvlist_add_uint64(nvarg, ZPOOL_HIST_WHO, crgetruid(CRED()));
+
+ /*
+ * Since the history is recorded asynchronously, the effective time is
+ * now, which may be considerably before the change is made on disk.
+ */
+ fnvlist_add_uint64(nvarg, ZPOOL_HIST_TIME, gethrestime_sec());
+
+ /* Kick this off asynchronously; errors are ignored. */
+ dsl_sync_task_nowait(spa_get_dsl(spa), spa_history_log_sync, nvarg, tx);
+ dmu_tx_commit(tx);
+
+ /* spa_history_log_sync will free nvl */
+ return (err);
+}
+
+/*
+ * Read out the command history.
+ */
+int
+spa_history_get(spa_t *spa, uint64_t *offp, uint64_t *len, char *buf)
+{
+ objset_t *mos = spa->spa_meta_objset;
+ dmu_buf_t *dbp;
+ uint64_t read_len, phys_read_off, phys_eof;
+ uint64_t leftover = 0;
+ spa_history_phys_t *shpp;
+ int err;
+
+ /*
+ * If the command history doesn't exist (older pool),
+ * that's ok, just return ENOENT.
+ */
+ if (!spa->spa_history)
+ return (SET_ERROR(ENOENT));
+
+ /*
+ * The history is logged asynchronously, so when the consumer requests
+ * the first chunk of history, make sure everything has been
+ * synced to disk so that we get it.
+ */
+ if (*offp == 0 && spa_writeable(spa))
+ txg_wait_synced(spa_get_dsl(spa), 0);
+
+ if ((err = dmu_bonus_hold(mos, spa->spa_history, FTAG, &dbp)) != 0)
+ return (err);
+ shpp = dbp->db_data;
+
+#ifdef ZFS_DEBUG
+ {
+ dmu_object_info_t doi;
+ dmu_object_info_from_db(dbp, &doi);
+ ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_SPA_HISTORY_OFFSETS);
+ }
+#endif
+
+ mutex_enter(&spa->spa_history_lock);
+ phys_eof = spa_history_log_to_phys(shpp->sh_eof, shpp);
+
+ if (*offp < shpp->sh_pool_create_len) {
+ /* read in just the zpool create history */
+ phys_read_off = *offp;
+ read_len = MIN(*len, shpp->sh_pool_create_len -
+ phys_read_off);
+ } else {
+ /*
+ * Need to reset passed in offset to BOF if the passed in
+ * offset has since been overwritten.
+ */
+ *offp = MAX(*offp, shpp->sh_bof);
+ phys_read_off = spa_history_log_to_phys(*offp, shpp);
+
+ /*
+ * Read up to the minimum of what the user passed down or
+ * the EOF (physical or logical). If we hit physical EOF,
+ * use 'leftover' to read from the physical BOF.
+ */
+ if (phys_read_off <= phys_eof) {
+ read_len = MIN(*len, phys_eof - phys_read_off);
+ } else {
+ read_len = MIN(*len,
+ shpp->sh_phys_max_off - phys_read_off);
+ if (phys_read_off + *len > shpp->sh_phys_max_off) {
+ leftover = MIN(*len - read_len,
+ phys_eof - shpp->sh_pool_create_len);
+ }
+ }
+ }
+
+ /* offset for consumer to use next */
+ *offp += read_len + leftover;
+
+ /* tell the consumer how much you actually read */
+ *len = read_len + leftover;
+
+ if (read_len == 0) {
+ mutex_exit(&spa->spa_history_lock);
+ dmu_buf_rele(dbp, FTAG);
+ return (0);
+ }
+
+ err = dmu_read(mos, spa->spa_history, phys_read_off, read_len, buf,
+ DMU_READ_PREFETCH);
+ if (leftover && err == 0) {
+ err = dmu_read(mos, spa->spa_history, shpp->sh_pool_create_len,
+ leftover, buf + read_len, DMU_READ_PREFETCH);
+ }
+ mutex_exit(&spa->spa_history_lock);
+
+ dmu_buf_rele(dbp, FTAG);
+ return (err);
+}
+
+/*
+ * The nvlist will be consumed by this call.
+ */
+static void
+log_internal(nvlist_t *nvl, const char *operation, spa_t *spa,
+ dmu_tx_t *tx, const char *fmt, va_list adx)
+{
+ char *msg;
+
+ /*
+ * If this is part of creating a pool, not everything is
+ * initialized yet, so don't bother logging the internal events.
+ * Likewise if the pool is not writeable.
+ */
+ if (spa_is_initializing(spa) || !spa_writeable(spa)) {
+ fnvlist_free(nvl);
+ return;
+ }
+
+ msg = kmem_vasprintf(fmt, adx);
+ fnvlist_add_string(nvl, ZPOOL_HIST_INT_STR, msg);
+ kmem_strfree(msg);
+
+ fnvlist_add_string(nvl, ZPOOL_HIST_INT_NAME, operation);
+ fnvlist_add_uint64(nvl, ZPOOL_HIST_TXG, tx->tx_txg);
+ fnvlist_add_uint64(nvl, ZPOOL_HIST_TIME, gethrestime_sec());
+
+ if (dmu_tx_is_syncing(tx)) {
+ spa_history_log_sync(nvl, tx);
+ } else {
+ dsl_sync_task_nowait(spa_get_dsl(spa),
+ spa_history_log_sync, nvl, tx);
+ }
+ /* spa_history_log_sync() will free nvl */
+}
+
+void
+spa_history_log_internal(spa_t *spa, const char *operation,
+ dmu_tx_t *tx, const char *fmt, ...)
+{
+ dmu_tx_t *htx = tx;
+ va_list adx;
+
+ /* create a tx if we didn't get one */
+ if (tx == NULL) {
+ htx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
+ if (dmu_tx_assign(htx, TXG_WAIT) != 0) {
+ dmu_tx_abort(htx);
+ return;
+ }
+ }
+
+ va_start(adx, fmt);
+ log_internal(fnvlist_alloc(), operation, spa, htx, fmt, adx);
+ va_end(adx);
+
+ /* if we didn't get a tx from the caller, commit the one we made */
+ if (tx == NULL)
+ dmu_tx_commit(htx);
+}
+
+void
+spa_history_log_internal_ds(dsl_dataset_t *ds, const char *operation,
+ dmu_tx_t *tx, const char *fmt, ...)
+{
+ va_list adx;
+ char namebuf[ZFS_MAX_DATASET_NAME_LEN];
+ nvlist_t *nvl = fnvlist_alloc();
+
+ ASSERT(tx != NULL);
+
+ dsl_dataset_name(ds, namebuf);
+ fnvlist_add_string(nvl, ZPOOL_HIST_DSNAME, namebuf);
+ fnvlist_add_uint64(nvl, ZPOOL_HIST_DSID, ds->ds_object);
+
+ va_start(adx, fmt);
+ log_internal(nvl, operation, dsl_dataset_get_spa(ds), tx, fmt, adx);
+ va_end(adx);
+}
+
+void
+spa_history_log_internal_dd(dsl_dir_t *dd, const char *operation,
+ dmu_tx_t *tx, const char *fmt, ...)
+{
+ va_list adx;
+ char namebuf[ZFS_MAX_DATASET_NAME_LEN];
+ nvlist_t *nvl = fnvlist_alloc();
+
+ ASSERT(tx != NULL);
+
+ dsl_dir_name(dd, namebuf);
+ fnvlist_add_string(nvl, ZPOOL_HIST_DSNAME, namebuf);
+ fnvlist_add_uint64(nvl, ZPOOL_HIST_DSID,
+ dsl_dir_phys(dd)->dd_head_dataset_obj);
+
+ va_start(adx, fmt);
+ log_internal(nvl, operation, dd->dd_pool->dp_spa, tx, fmt, adx);
+ va_end(adx);
+}
+
+void
+spa_history_log_version(spa_t *spa, const char *operation, dmu_tx_t *tx)
+{
+ utsname_t *u = utsname();
+
+ spa_history_log_internal(spa, operation, tx,
+ "pool version %llu; software version %s; uts %s %s %s %s",
+ (u_longlong_t)spa_version(spa), ZFS_META_GITREV,
+ u->nodename, u->release, u->version, u->machine);
+}
+
+#ifndef _KERNEL
+const char *
+spa_history_zone(void)
+{
+ return (NULL);
+}
+#endif
+
+#if defined(_KERNEL)
+EXPORT_SYMBOL(spa_history_create_obj);
+EXPORT_SYMBOL(spa_history_get);
+EXPORT_SYMBOL(spa_history_log);
+EXPORT_SYMBOL(spa_history_log_internal);
+EXPORT_SYMBOL(spa_history_log_version);
+#endif
diff --git a/sys/contrib/openzfs/module/zfs/spa_log_spacemap.c b/sys/contrib/openzfs/module/zfs/spa_log_spacemap.c
new file mode 100644
index 000000000000..5c55d32ec066
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/spa_log_spacemap.c
@@ -0,0 +1,1322 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2018, 2019 by Delphix. All rights reserved.
+ */
+
+#include <sys/dmu_objset.h>
+#include <sys/metaslab.h>
+#include <sys/metaslab_impl.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/spa_log_spacemap.h>
+#include <sys/vdev_impl.h>
+#include <sys/zap.h>
+
+/*
+ * Log Space Maps
+ *
+ * Log space maps are an optimization in ZFS metadata allocations for pools
+ * whose workloads are primarily random-writes. Random-write workloads are also
+ * typically random-free, meaning that they are freeing from locations scattered
+ * throughout the pool. This means that each TXG we will have to append some
+ * FREE records to almost every metaslab. With log space maps, we hold their
+ * changes in memory and log them altogether in one pool-wide space map on-disk
+ * for persistence. As more blocks are accumulated in the log space maps and
+ * more unflushed changes are accounted in memory, we flush a selected group
+ * of metaslabs every TXG to relieve memory pressure and potential overheads
+ * when loading the pool. Flushing a metaslab to disk relieves memory as we
+ * flush any unflushed changes from memory to disk (i.e. the metaslab's space
+ * map) and saves import time by making old log space maps obsolete and
+ * eventually destroying them. [A log space map is said to be obsolete when all
+ * its entries have made it to their corresponding metaslab space maps].
+ *
+ * == On disk data structures used ==
+ *
+ * - The pool has a new feature flag and a new entry in the MOS. The feature
+ * is activated when we create the first log space map and remains active
+ * for the lifetime of the pool. The new entry in the MOS Directory [refer
+ * to DMU_POOL_LOG_SPACEMAP_ZAP] is populated with a ZAP whose key-value
+ * pairs are of the form <key: txg, value: log space map object for that txg>.
+ * This entry is our on-disk reference of the log space maps that exist in
+ * the pool for each TXG and it is used during import to load all the
+ * metaslab unflushed changes in memory. To see how this structure is first
+ * created and later populated refer to spa_generate_syncing_log_sm(). To see
+ * how it is used during import time refer to spa_ld_log_sm_metadata().
+ *
+ * - Each vdev has a new entry in its vdev_top_zap (see field
+ * VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS) which holds the ms_unflushed_txg of
+ * each metaslab in this vdev. This field is the on-disk counterpart of the
+ * in-memory field ms_unflushed_txg which tells us from which TXG onwards
+ * the metaslab hasn't had its changes flushed. During import, we use this
+ * to ignore any entries in the space map log that are for this metaslab but
+ * from a TXG before ms_unflushed_txg. At that point, we also populate its
+ * in-memory counterpart and from there both fields are updated every time
+ * we flush that metaslab.
+ *
+ * - A space map is created every TXG and, during that TXG, it is used to log
+ * all incoming changes (the log space map). When created, the log space map
+ * is referenced in memory by spa_syncing_log_sm and its object ID is inserted
+ * to the space map ZAP mentioned above. The log space map is closed at the
+ * end of the TXG and will be destroyed when it becomes fully obsolete. We
+ * know when a log space map has become obsolete by looking at the oldest
+ * (and smallest) ms_unflushed_txg in the pool. If the value of that is bigger
+ * than the log space map's TXG, then it means that no metaslab is missing the
+ * changes from that log and we can therefore destroy it.
+ * [see spa_cleanup_old_sm_logs()].
+ *
+ * == Important in-memory structures ==
+ *
+ * - The per-spa field spa_metaslabs_by_flushed sorts all the metaslabs in
+ * the pool by their ms_unflushed_txg field. It is primarily used for three
+ * reasons. First of all, it is used during flushing where we try to flush
+ * metaslabs in-order from the oldest-flushed to the most recently flushed
+ * every TXG. Secondly, it helps us to lookup the ms_unflushed_txg of the
+ * oldest flushed metaslab to distinguish which log space maps have become
+ * obsolete and which ones are still relevant. Finally it tells us which
+ * metaslabs have unflushed changes in a pool where this feature was just
+ * enabled, as we don't immediately add all of the pool's metaslabs but we
+ * add them over time as they go through metaslab_sync(). The reason that
+ * we do that is to ease these pools into the behavior of the flushing
+ * algorithm (described later on).
+ *
+ * - The per-spa field spa_sm_logs_by_txg can be thought as the in-memory
+ * counterpart of the space map ZAP mentioned above. It's an AVL tree whose
+ * nodes represent the log space maps in the pool. This in-memory
+ * representation of log space maps in the pool sorts the log space maps by
+ * the TXG that they were created (which is also the TXG of their unflushed
+ * changes). It also contains the following extra information for each
+ * space map:
+ * [1] The number of metaslabs that were last flushed on that TXG. This is
+ * important because if that counter is zero and this is the oldest
+ * log then it means that it is also obsolete.
+ * [2] The number of blocks of that space map. This field is used by the
+ * block heuristic of our flushing algorithm (described later on).
+ * It represents how many blocks of metadata changes ZFS had to write
+ * to disk for that TXG.
+ *
+ * - The per-spa field spa_log_summary is a list of entries that summarizes
+ * the metaslab and block counts of all the nodes of the spa_sm_logs_by_txg
+ * AVL tree mentioned above. The reason this exists is that our flushing
+ * algorithm (described later) tries to estimate how many metaslabs to flush
+ * in each TXG by iterating over all the log space maps and looking at their
+ * block counts. Summarizing that information means that we don't have to
+ * iterate through each space map, minimizing the runtime overhead of the
+ * flushing algorithm which would be induced in syncing context. In terms of
+ * implementation the log summary is used as a queue:
+ * * we modify or pop entries from its head when we flush metaslabs
+ * * we modify or append entries to its tail when we sync changes.
+ *
+ * - Each metaslab has two new range trees that hold its unflushed changes,
+ * ms_unflushed_allocs and ms_unflushed_frees. These are always disjoint.
+ *
+ * == Flushing algorithm ==
+ *
+ * The decision of how many metaslabs to flush on a given TXG is guided by
+ * two heuristics:
+ *
+ * [1] The memory heuristic -
+ * We keep track of the memory used by the unflushed trees from all the
+ * metaslabs [see sus_memused of spa_unflushed_stats] and we ensure that it
+ * stays below a certain threshold which is determined by an arbitrary hard
+ * limit and an arbitrary percentage of the system's memory [see
+ * spa_log_exceeds_memlimit()]. When we see that the memory usage of the
+ * unflushed changes is passing that threshold, we flush metaslabs, which
+ * empties their unflushed range trees, reducing the memory used.
+ *
+ * [2] The block heuristic -
+ * We try to keep the total number of blocks in the log space maps in check
+ * so the log doesn't grow indefinitely and we don't induce a lot of overhead
+ * when loading the pool. At the same time we don't want to flush a lot of
+ * metaslabs too often as this would defeat the purpose of the log space map.
+ * As a result we set a limit on the number of blocks that we think is
+ * acceptable for the log space maps to have, and we try not to cross it.
+ * [see sus_blocklimit from spa_unflushed_stats].
+ *
+ * In order to stay below the block limit every TXG we have to estimate how
+ * many metaslabs we need to flush based on the current rate of incoming blocks
+ * and our history of log space map blocks. The main idea here is to answer
+ * the question of how many metaslabs we need to flush in order to get rid of
+ * at least X log space map blocks. We can answer this question by iterating
+ * backwards from the oldest log space map to the newest one and looking at
+ * their metaslab and block counts. At this point the log summary mentioned
+ * above comes in handy as it reduces the number of entries we have to iterate
+ * over (even though it may reduce the precision of our estimates due to its
+ * aggregation of data). So with that in mind, we project the incoming rate of
+ * the current TXG into the future and attempt to approximate how many
+ * metaslabs we would need to flush from now on in order to avoid exceeding
+ * our block limit at different points in the future (granted that we would
+ * keep flushing the same number of metaslabs every TXG). Then we take the
+ * maximum number from all these estimates to be on the safe side. For the
+ * exact implementation details of the algorithm refer to
+ * spa_estimate_metaslabs_to_flush.
+ */
+
+/*
+ * This is used as the block size for the space maps used for the
+ * log space map feature. These space maps benefit from a bigger
+ * block size as we expect to be writing a lot of data to them at
+ * once.
+ */
+unsigned long zfs_log_sm_blksz = 1ULL << 17;
+
+/*
+ * Percentage of the overall system's memory that ZFS allows to be
+ * used for unflushed changes (e.g. the sum of size of all the nodes
+ * in the unflushed trees).
+ *
+ * Note that this value is calculated over 1000000 for finer granularity
+ * (thus the _ppm suffix; reads as "parts per million"). As an example,
+ * the default of 1000 allows 0.1% of memory to be used.
+ */
+unsigned long zfs_unflushed_max_mem_ppm = 1000;
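As a rough illustration (a sketch with assumed numbers, not the kernel code; the actual check lives in spa_log_exceeds_memlimit() below): on a machine with 64 GiB of memory the default of 1000 ppm works out to roughly 65 MiB of unflushed changes, with zfs_unflushed_max_mem_amt acting as an additional hard cap:

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	uint64_t physmem = 64ULL << 30;			/* 64 GiB */
	uint64_t ppm = 1000;				/* 0.1% */
	uint64_t hard_limit = 1ULL << 30;		/* 1 GiB */
	uint64_t limit = physmem / 1000000 * ppm;

	if (limit > hard_limit)
		limit = hard_limit;
	(void) printf("unflushed-changes memory limit: %llu MiB\n",
	    (unsigned long long)(limit >> 20));		/* 65 MiB */
	return (0);
}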
+
+/*
+ * Specific hard-limit in memory that ZFS allows to be used for
+ * unflushed changes.
+ */
+unsigned long zfs_unflushed_max_mem_amt = 1ULL << 30;
+
+/*
+ * The following tunable determines the number of blocks that can be used for
+ * the log space maps. It is expressed as a percentage of the total number of
+ * metaslabs in the pool (i.e. the default of 400 means that the number of log
+ * blocks is capped at 4 times the number of metaslabs).
+ *
+ * This value exists to tune our flushing algorithm, with higher values
+ * flushing metaslabs less often (doing less I/Os) per TXG versus lower values
+ * flushing metaslabs more aggressively with the upside of saving overheads
+ * when loading the pool. Another factor in this tradeoff is that flushing
+ * less often can potentially lead to better utilization of the metaslab space
+ * map's block size as we accumulate more changes per flush.
+ *
+ * Given that this tunable indirectly controls the flush rate (metaslabs
+ * flushed per txg), expressing it as a percentage of the number of metaslabs
+ * in the pool makes sense here.
+ *
+ * As a rule of thumb we default this tunable to 400% based on the following:
+ *
+ * 1] Assuming a constant flush rate and a constant incoming rate of log blocks
+ * it is reasonable to expect that the amount of obsolete entries changes
+ * linearly from txg to txg (e.g. the oldest log should have the most
+ * obsolete entries, and the most recent one the least). With this we could
+ * say that, at any given time, about half of the entries in the whole space
+ * map log are obsolete. Thus for every two entries for a metaslab in the
+ * log space map, only one of them is valid and actually makes it to the
+ * metaslab's space map.
+ * [factor of 2]
+ * 2] Each entry in the log space map is guaranteed to be two words while
+ * entries in metaslab space maps are generally single-word.
+ * [an extra factor of 2 - 400% overall]
+ * 3] Even if [1] and [2] are slightly less than 2 each, we haven't taken into
+ * account any consolidation of segments from the log space map to the
+ * unflushed range trees nor their history (e.g. a segment being allocated,
+ * then freed, then allocated again means 3 log space map entries but 0
+ * metaslab space map entries). Depending on the workload, we've seen ~1.8
+ * non-obsolete log space map entries per metaslab entry, for a total of
+ * ~600%. Since most of these estimates though are workload dependent, we
+ * default on 400% to be conservative.
+ *
+ * Thus we could say that even in the worst
+ * case of [1] and [2], the factor should end up being 4.
+ *
+ * That said, regardless of the number of metaslabs in the pool we need to
+ * provide upper and lower bounds for the log block limit.
+ * [see zfs_unflushed_log_block_{min,max}]
+ */
+unsigned long zfs_unflushed_log_block_pct = 400;
+
+/*
+ * If the number of metaslabs is small and our incoming rate is high, we could
+ * get into a situation where we are flushing all our metaslabs every TXG. Thus
+ * we always allow at least this many log blocks.
+ */
+unsigned long zfs_unflushed_log_block_min = 1000;
+
+/*
+ * If the log becomes too big, the import time of the pool can take a hit in
+ * terms of performance. Thus we have a hard limit in the size of the log in
+ * terms of blocks.
+ */
+unsigned long zfs_unflushed_log_block_max = (1ULL << 18);
+
+/*
+ * Max # of rows allowed for the log_summary. The tradeoff here is accuracy and
+ * stability of the flushing algorithm (longer summary) vs its runtime overhead
+ * (smaller summary is faster to traverse).
+ */
+unsigned long zfs_max_logsm_summary_length = 10;
+
+/*
+ * Tunable that sets the lower bound on the metaslabs to flush every TXG.
+ *
+ * Setting this to 0 has no effect since if the pool is idle we won't even be
+ * creating log space maps and therefore we won't be flushing. On the other
+ * hand if the pool has any incoming workload our block heuristic will start
+ * flushing metaslabs anyway.
+ *
+ * The point of this tunable is to be used in extreme cases where we really
+ * want to flush more metaslabs than our adaptable heuristic plans to flush.
+ */
+unsigned long zfs_min_metaslabs_to_flush = 1;
+
+/*
+ * Tunable that specifies how far in the past we want to look when trying to
+ * estimate the incoming log blocks for the current TXG.
+ *
+ * Setting this too high may not only increase runtime but also minimize the
+ * effect of the incoming rates from the most recent TXGs as we take the
+ * average over all the blocks that we walk
+ * [see spa_estimate_incoming_log_blocks].
+ */
+unsigned long zfs_max_log_walking = 5;
+
+/*
+ * This tunable exists solely for testing purposes. It ensures that the log
+ * spacemaps are not flushed and destroyed during export in order for the
+ * relevant log spacemap import code paths to be tested (effectively simulating
+ * a crash).
+ */
+int zfs_keep_log_spacemaps_at_export = 0;
+
+static uint64_t
+spa_estimate_incoming_log_blocks(spa_t *spa)
+{
+ ASSERT3U(spa_sync_pass(spa), ==, 1);
+ uint64_t steps = 0, sum = 0;
+ for (spa_log_sm_t *sls = avl_last(&spa->spa_sm_logs_by_txg);
+ sls != NULL && steps < zfs_max_log_walking;
+ sls = AVL_PREV(&spa->spa_sm_logs_by_txg, sls)) {
+ if (sls->sls_txg == spa_syncing_txg(spa)) {
+ /*
+ * skip the log created in this TXG as this would
+ * make our estimations inaccurate.
+ */
+ continue;
+ }
+ sum += sls->sls_nblocks;
+ steps++;
+ }
+ return ((steps > 0) ? DIV_ROUND_UP(sum, steps) : 0);
+}
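A tiny sketch of the same averaging with example numbers (not part of the patch): closed logs of 120, 80, and 100 blocks yield an estimated incoming rate of 100 blocks per TXG:

#include <stdio.h>

#define	DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

int
main(void)
{
	unsigned long nblocks[3] = { 120, 80, 100 };
	unsigned long sum = 0, steps = 0, max_walk = 5;

	/* Walk at most max_walk closed logs (newest first in the real code). */
	for (int i = 0; i < 3 && steps < max_walk; i++, steps++)
		sum += nblocks[i];
	(void) printf("estimated incoming blocks per TXG: %lu\n",
	    steps ? DIV_ROUND_UP(sum, steps) : 0);	/* 100 */
	return (0);
}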
+
+uint64_t
+spa_log_sm_blocklimit(spa_t *spa)
+{
+ return (spa->spa_unflushed_stats.sus_blocklimit);
+}
+
+void
+spa_log_sm_set_blocklimit(spa_t *spa)
+{
+ if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) {
+ ASSERT0(spa_log_sm_blocklimit(spa));
+ return;
+ }
+
+ uint64_t calculated_limit =
+ (spa_total_metaslabs(spa) * zfs_unflushed_log_block_pct) / 100;
+ spa->spa_unflushed_stats.sus_blocklimit = MIN(MAX(calculated_limit,
+ zfs_unflushed_log_block_min), zfs_unflushed_log_block_max);
+}
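A worked example of the limit computation above (assumed numbers, not part of the patch): a pool with 1000 metaslabs at the default 400% gets a limit of 4000 log blocks, which at the 128K block size used for log space maps is roughly 500 MiB of log:

#include <stdio.h>
#include <stdint.h>

#define	MIN(a, b)	((a) < (b) ? (a) : (b))
#define	MAX(a, b)	((a) > (b) ? (a) : (b))

int
main(void)
{
	uint64_t total_metaslabs = 1000, pct = 400;
	uint64_t blk_min = 1000, blk_max = 1ULL << 18;
	uint64_t limit = MIN(MAX(total_metaslabs * pct / 100, blk_min),
	    blk_max);

	(void) printf("log block limit: %llu blocks (~%llu MiB at 128K each)\n",
	    (unsigned long long)limit,
	    (unsigned long long)(limit * (1ULL << 17) >> 20));
	return (0);
}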
+
+uint64_t
+spa_log_sm_nblocks(spa_t *spa)
+{
+ return (spa->spa_unflushed_stats.sus_nblocks);
+}
+
+/*
+ * Ensure that the in-memory log space map structures and the summary
+ * have the same block and metaslab counts.
+ */
+static void
+spa_log_summary_verify_counts(spa_t *spa)
+{
+ ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP));
+
+ if ((zfs_flags & ZFS_DEBUG_LOG_SPACEMAP) == 0)
+ return;
+
+ uint64_t ms_in_avl = avl_numnodes(&spa->spa_metaslabs_by_flushed);
+
+ uint64_t ms_in_summary = 0, blk_in_summary = 0;
+ for (log_summary_entry_t *e = list_head(&spa->spa_log_summary);
+ e; e = list_next(&spa->spa_log_summary, e)) {
+ ms_in_summary += e->lse_mscount;
+ blk_in_summary += e->lse_blkcount;
+ }
+
+ uint64_t ms_in_logs = 0, blk_in_logs = 0;
+ for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg);
+ sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) {
+ ms_in_logs += sls->sls_mscount;
+ blk_in_logs += sls->sls_nblocks;
+ }
+
+ VERIFY3U(ms_in_logs, ==, ms_in_summary);
+ VERIFY3U(ms_in_logs, ==, ms_in_avl);
+ VERIFY3U(blk_in_logs, ==, blk_in_summary);
+ VERIFY3U(blk_in_logs, ==, spa_log_sm_nblocks(spa));
+}
+
+static boolean_t
+summary_entry_is_full(spa_t *spa, log_summary_entry_t *e)
+{
+ uint64_t blocks_per_row = MAX(1,
+ DIV_ROUND_UP(spa_log_sm_blocklimit(spa),
+ zfs_max_logsm_summary_length));
+ return (blocks_per_row <= e->lse_blkcount);
+}
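For intuition (assumed numbers, not part of the patch): with a block limit of 4000 and the default summary length of 10, a summary row is considered full once it accounts for about 400 log blocks:

#include <stdio.h>

#define	DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

int
main(void)
{
	unsigned long blocklimit = 4000, summary_len = 10;

	(void) printf("blocks per summary row: %lu\n",
	    DIV_ROUND_UP(blocklimit, summary_len));	/* 400 */
	return (0);
}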
+
+/*
+ * Update the log summary information to reflect the fact that a metaslab
+ * was flushed or destroyed (e.g. due to device removal or pool export/destroy).
+ *
+ * We typically flush the oldest flushed metaslab so the first (and oldest)
+ * entry of the summary is updated. However if that metaslab is getting loaded
+ * we may flush the second oldest one which may be part of an entry later in
+ * the summary. Moreover, if we call into this function from metaslab_fini()
+ * the metaslabs probably won't be ordered by ms_unflushed_txg. Thus we ask
+ * for a txg as an argument so we can locate the appropriate summary entry for
+ * the metaslab.
+ */
+void
+spa_log_summary_decrement_mscount(spa_t *spa, uint64_t txg)
+{
+ /*
+ * We don't track summary data for read-only pools and this function
+ * can be called from metaslab_fini(). In that case return immediately.
+ */
+ if (!spa_writeable(spa))
+ return;
+
+ log_summary_entry_t *target = NULL;
+ for (log_summary_entry_t *e = list_head(&spa->spa_log_summary);
+ e != NULL; e = list_next(&spa->spa_log_summary, e)) {
+ if (e->lse_start > txg)
+ break;
+ target = e;
+ }
+
+ if (target == NULL || target->lse_mscount == 0) {
+ /*
+ * We didn't find a summary entry for this metaslab. We must be
+ * at the teardown of a spa_load() attempt that got an error
+ * while reading the log space maps.
+ */
+ VERIFY3S(spa_load_state(spa), ==, SPA_LOAD_ERROR);
+ return;
+ }
+
+ target->lse_mscount--;
+}
+
+/*
+ * Update the log summary information to reflect the fact that we destroyed
+ * old log space maps. Since we can only destroy the oldest log space maps,
+ * we decrement the block count of the oldest summary entry and potentially
+ * destroy it when that count hits 0.
+ *
+ * This function is called after a metaslab is flushed and typically that
+ * metaslab is the oldest flushed, which means that this function will
+ * typically decrement the block count of the first entry of the summary and
+ * potentially free it if the block count gets to zero (its metaslab count
+ * should be zero too at that point).
+ *
+ * There are certain scenarios though that don't work exactly like that so we
+ * need to account for them:
+ *
+ * Scenario [1]: It is possible that after we flushed the oldest flushed
+ * metaslab and we destroyed the oldest log space map, more recent logs had 0
+ * metaslabs pointing to them so we got rid of them too. This can happen due
+ * to metaslabs being destroyed through device removal, or because the oldest
+ * flushed metaslab was loading but we kept flushing more recently flushed
+ * metaslabs due to the memory pressure of unflushed changes. Because of that,
+ * we always iterate from the beginning of the summary; if blocks_gone is
+ * bigger than the block count of the current entry, we free that entry (we
+ * expect its metaslab count to be zero), decrement blocks_gone, and move on to
+ * the next entry, repeating until blocks_gone gets decremented to 0. Doing
+ * this also works for the typical case mentioned above.
+ *
+ * Scenario [2]: The oldest flushed metaslab isn't necessarily accounted by
+ * the first (and oldest) entry in the summary. If the first few entries of
+ * the summary were only accounting metaslabs from a device that was just
+ * removed, then the current oldest flushed metaslab could be accounted by an
+ * entry somewhere in the middle of the summary. Moreover flushing that
+ * metaslab will destroy all the log space maps older than its ms_unflushed_txg
+ * because they became obsolete after the removal. Thus, iterating as we did
+ * for scenario [1] works out for this case too.
+ *
+ * Scenario [3]: At times we decide to flush all the metaslabs in the pool
+ * in one TXG (either because we are exporting the pool or because our flushing
+ * heuristics decided to do so). When that happens all the log space maps get
+ * destroyed except the one created for the current TXG which doesn't have
+ * any log blocks yet. As log space maps get destroyed with every metaslab that
+ * we flush, entries in the summary are also destroyed. This brings a weird
+ * corner-case when we flush the last metaslab and the log space map of the
+ * current TXG is in the same summary entry with other log space maps that
+ * are older. When that happens we are eventually left with this one last
+ * summary entry whose blocks are gone (blocks_gone equals the entry's block
+ * count) but its metaslab count is non-zero (because it accounts all the
+ * metaslabs in the pool as they all got flushed). Under this scenario we can't
+ * free this last summary entry as it's referencing all the metaslabs in the
+ * pool and its block count will get incremented at the end of this sync (when
+ * we close the syncing log space map). Thus we just decrement its current
+ * block count and leave it alone. In the case that the pool gets exported,
+ * its metaslab count will be decremented over time as we call metaslab_fini()
+ * for all the metaslabs in the pool and the entry will be freed at
+ * spa_unload_log_sm_metadata().
+ */
+void
+spa_log_summary_decrement_blkcount(spa_t *spa, uint64_t blocks_gone)
+{
+ for (log_summary_entry_t *e = list_head(&spa->spa_log_summary);
+ e != NULL; e = list_head(&spa->spa_log_summary)) {
+ if (e->lse_blkcount > blocks_gone) {
+ /*
+ * Assert that we stopped at an entry that is not
+ * obsolete.
+ */
+ ASSERT(e->lse_mscount != 0);
+
+ e->lse_blkcount -= blocks_gone;
+ blocks_gone = 0;
+ break;
+ } else if (e->lse_mscount == 0) {
+ /* remove obsolete entry */
+ blocks_gone -= e->lse_blkcount;
+ list_remove(&spa->spa_log_summary, e);
+ kmem_free(e, sizeof (log_summary_entry_t));
+ } else {
+ /* Verify that this is scenario [3] mentioned above. */
+ VERIFY3U(blocks_gone, ==, e->lse_blkcount);
+
+ /*
+ * Assert that this is scenario [3] further by ensuring
+ * that this is the only entry in the summary.
+ */
+ VERIFY3P(e, ==, list_tail(&spa->spa_log_summary));
+ ASSERT3P(e, ==, list_head(&spa->spa_log_summary));
+
+ blocks_gone = e->lse_blkcount = 0;
+ break;
+ }
+ }
+
+ /*
+ * Ensure that there is no way we are trying to remove more blocks
+ * than the # of blocks in the summary.
+ */
+ ASSERT0(blocks_gone);
+}
+
+void
+spa_log_sm_decrement_mscount(spa_t *spa, uint64_t txg)
+{
+ spa_log_sm_t target = { .sls_txg = txg };
+ spa_log_sm_t *sls = avl_find(&spa->spa_sm_logs_by_txg,
+ &target, NULL);
+
+ if (sls == NULL) {
+ /*
+ * We must be at the teardown of a spa_load() attempt that
+ * got an error while reading the log space maps.
+ */
+ VERIFY3S(spa_load_state(spa), ==, SPA_LOAD_ERROR);
+ return;
+ }
+
+ ASSERT(sls->sls_mscount > 0);
+ sls->sls_mscount--;
+}
+
+void
+spa_log_sm_increment_current_mscount(spa_t *spa)
+{
+ spa_log_sm_t *last_sls = avl_last(&spa->spa_sm_logs_by_txg);
+ ASSERT3U(last_sls->sls_txg, ==, spa_syncing_txg(spa));
+ last_sls->sls_mscount++;
+}
+
+static void
+summary_add_data(spa_t *spa, uint64_t txg, uint64_t metaslabs_flushed,
+ uint64_t nblocks)
+{
+ log_summary_entry_t *e = list_tail(&spa->spa_log_summary);
+
+ if (e == NULL || summary_entry_is_full(spa, e)) {
+ e = kmem_zalloc(sizeof (log_summary_entry_t), KM_SLEEP);
+ e->lse_start = txg;
+ list_insert_tail(&spa->spa_log_summary, e);
+ }
+
+ ASSERT3U(e->lse_start, <=, txg);
+ e->lse_mscount += metaslabs_flushed;
+ e->lse_blkcount += nblocks;
+}
+
+static void
+spa_log_summary_add_incoming_blocks(spa_t *spa, uint64_t nblocks)
+{
+ summary_add_data(spa, spa_syncing_txg(spa), 0, nblocks);
+}
+
+void
+spa_log_summary_add_flushed_metaslab(spa_t *spa)
+{
+ summary_add_data(spa, spa_syncing_txg(spa), 1, 0);
+}
+
+/*
+ * This function attempts to estimate how many metaslabs we should
+ * flush to satisfy our block heuristic for the log spacemap
+ * for the upcoming TXGs.
+ *
+ * Specifically, it first tries to estimate the number of incoming
+ * blocks in this TXG. Then by projecting that incoming rate to
+ * future TXGs and using the log summary, it figures out how many
+ * flushes we would need to do for future TXGs individually to
+ * stay below our block limit and returns the maximum number of
+ * flushes from those estimates.
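+ *
+ * As an illustration only (hypothetical numbers, not derived from this
+ * code): if the block limit were 100, the log already held 95 blocks,
+ * and we estimated 10 incoming blocks per TXG, then one TXG from now we
+ * would be 5 blocks over the limit; if the oldest summary entry covered
+ * 12 blocks across 3 metaslabs, flushing those 3 metaslabs in that TXG
+ * would bring us back under the limit, so the estimate would be 3.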
+ */
+static uint64_t
+spa_estimate_metaslabs_to_flush(spa_t *spa)
+{
+ ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP));
+ ASSERT3U(spa_sync_pass(spa), ==, 1);
+ ASSERT(spa_log_sm_blocklimit(spa) != 0);
+
+ /*
+ * This variable contains the incoming rate that will be projected
+ * and used for our flushing estimates in the future.
+ */
+ uint64_t incoming = spa_estimate_incoming_log_blocks(spa);
+
+ /*
+ * At any point in time this variable tells us how many
+ * TXGs in the future we are, so we can make our estimations.
+ */
+ uint64_t txgs_in_future = 1;
+
+ /*
+ * This variable tells us how much room we have until we hit
+ * our limit. When it goes negative, it means that we've exceeded
+ * our limit and we need to flush.
+ *
+ * Note that since we start at the first TXG in the future (i.e.
+ * txgs_in_future starts from 1) we already decrement this
+ * variable by the incoming rate.
+ */
+ int64_t available_blocks =
+ spa_log_sm_blocklimit(spa) - spa_log_sm_nblocks(spa) - incoming;
+
+ /*
+ * This variable tells us the total number of flushes needed to
+ * keep the log size within the limit when we reach txgs_in_future.
+ */
+ uint64_t total_flushes = 0;
+
+ /* Holds the current maximum of our estimates so far. */
+ uint64_t max_flushes_pertxg =
+ MIN(avl_numnodes(&spa->spa_metaslabs_by_flushed),
+ zfs_min_metaslabs_to_flush);
+
+ /*
+ * For our estimations we only look as far into the future
+ * as the summary allows us to.
+ */
+ for (log_summary_entry_t *e = list_head(&spa->spa_log_summary);
+ e; e = list_next(&spa->spa_log_summary, e)) {
+
+ /*
+ * If there is still room before we exceed our limit,
+ * then keep skipping TXGs, accumulating more blocks
+ * based on the incoming rate until we exceed it.
+ */
+ if (available_blocks >= 0) {
+ uint64_t skip_txgs = (available_blocks / incoming) + 1;
+ available_blocks -= (skip_txgs * incoming);
+ txgs_in_future += skip_txgs;
+ ASSERT3S(available_blocks, >=, -incoming);
+ }
+
+ /*
+ * At this point we're far enough into the future that the
+ * limit has just been exceeded, so we flush metaslabs
+ * based on the current entry in the summary, updating
+ * our available_blocks.
+ */
+ ASSERT3S(available_blocks, <, 0);
+ available_blocks += e->lse_blkcount;
+ total_flushes += e->lse_mscount;
+
+ /*
+ * Keep the running maximum of total_flushes divided by the
+ * number of TXGs in the future that we are. The idea here
+ * is to estimate
+ * the average number of flushes that we should do
+ * every TXG so that when we are that many TXGs in the
+ * future we stay under the limit.
+ */
+ max_flushes_pertxg = MAX(max_flushes_pertxg,
+ DIV_ROUND_UP(total_flushes, txgs_in_future));
+ ASSERT3U(avl_numnodes(&spa->spa_metaslabs_by_flushed), >=,
+ max_flushes_pertxg);
+ }
+ return (max_flushes_pertxg);
+}
+
+uint64_t
+spa_log_sm_memused(spa_t *spa)
+{
+ return (spa->spa_unflushed_stats.sus_memused);
+}
+
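+/*
+ * Check whether the memory used by unflushed changes exceeds either the
+ * absolute limit (zfs_unflushed_max_mem_amt) or the limit expressed as a
+ * fraction of physical memory. As a purely illustrative example, on a
+ * system with 16 GiB of RAM and zfs_unflushed_max_mem_ppm set to 1000,
+ * the relative allowance would be 16 GiB * 1000 / 1000000, i.e. roughly
+ * 16 MiB.
+ */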
+static boolean_t
+spa_log_exceeds_memlimit(spa_t *spa)
+{
+ if (spa_log_sm_memused(spa) > zfs_unflushed_max_mem_amt)
+ return (B_TRUE);
+
+ uint64_t system_mem_allowed = ((physmem * PAGESIZE) *
+ zfs_unflushed_max_mem_ppm) / 1000000;
+ if (spa_log_sm_memused(spa) > system_mem_allowed)
+ return (B_TRUE);
+
+ return (B_FALSE);
+}
+
+boolean_t
+spa_flush_all_logs_requested(spa_t *spa)
+{
+ return (spa->spa_log_flushall_txg != 0);
+}
+
+void
+spa_flush_metaslabs(spa_t *spa, dmu_tx_t *tx)
+{
+ uint64_t txg = dmu_tx_get_txg(tx);
+
+ if (spa_sync_pass(spa) != 1)
+ return;
+
+ if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP))
+ return;
+
+ /*
+ * If we don't have any metaslabs with unflushed changes
+ * return immediately.
+ */
+ if (avl_numnodes(&spa->spa_metaslabs_by_flushed) == 0)
+ return;
+
+ /*
+ * During SPA export we leave a few empty TXGs to go by [see
+ * spa_final_dirty_txg() to understand why]. For this specific
+ * case, it is important to not flush any metaslabs as that
+ * would dirty this TXG.
+ *
+ * That said, during one of these dirty TXGs that is less than
+ * or equal to spa_final_dirty_txg(), spa_unload() will request
+ * that we try to flush all the metaslabs for that TXG before
+ * exporting the pool. Thus we ensure that we didn't get a
+ * request to flush everything before we attempt to return
+ * immediately.
+ */
+ if (spa->spa_uberblock.ub_rootbp.blk_birth < txg &&
+ !dmu_objset_is_dirty(spa_meta_objset(spa), txg) &&
+ !spa_flush_all_logs_requested(spa))
+ return;
+
+ /*
+ * We need to generate a log space map before flushing because this
+ * will set up the in-memory data (i.e. node in spa_sm_logs_by_txg)
+ * for this TXG's flushed metaslab count (aka sls_mscount which is
+ * manipulated in many ways down the metaslab_flush() codepath).
+ *
+ * That is not to say that we may end up generating a log space map
+ * we don't need. If we are flushing metaslabs, that means that we
+ * were going to write changes to disk anyway, so even if we were
+ * not flushing, a log space map would have been created anyway in
+ * metaslab_sync().
+ */
+ spa_generate_syncing_log_sm(spa, tx);
+
+ /*
+ * This variable tells us how many metaslabs we want to flush based
+ * on the block-heuristic of our flushing algorithm (see block comment
+ * of log space map feature). We also decrement this as we flush
+ * metaslabs and attempt to destroy old log space maps.
+ */
+ uint64_t want_to_flush;
+ if (spa_flush_all_logs_requested(spa)) {
+ ASSERT3S(spa_state(spa), ==, POOL_STATE_EXPORTED);
+ want_to_flush = avl_numnodes(&spa->spa_metaslabs_by_flushed);
+ } else {
+ want_to_flush = spa_estimate_metaslabs_to_flush(spa);
+ }
+
+ ASSERT3U(avl_numnodes(&spa->spa_metaslabs_by_flushed), >=,
+ want_to_flush);
+
+ /* Used purely for verification purposes */
+ uint64_t visited = 0;
+
+ /*
+ * Ideally we would iterate through spa_metaslabs_by_flushed
+ * using only one variable (curr). We can't do that because
+ * metaslab_flush() mutates the position of curr in the AVL when
+ * it flushes that metaslab by moving it to the end of the tree.
+ * Thus we always keep track of the original next node of the
+ * current node (curr) in another variable (next).
+ */
+ metaslab_t *next = NULL;
+ for (metaslab_t *curr = avl_first(&spa->spa_metaslabs_by_flushed);
+ curr != NULL; curr = next) {
+ next = AVL_NEXT(&spa->spa_metaslabs_by_flushed, curr);
+
+ /*
+ * If this metaslab has been flushed this txg then we've done
+ * a full circle over the metaslabs.
+ */
+ if (metaslab_unflushed_txg(curr) == txg)
+ break;
+
+ /*
+ * If we are done flushing for the block heuristic and the
+ * unflushed changes don't exceed the memory limit, just stop.
+ */
+ if (want_to_flush == 0 && !spa_log_exceeds_memlimit(spa))
+ break;
+
+ mutex_enter(&curr->ms_sync_lock);
+ mutex_enter(&curr->ms_lock);
+ boolean_t flushed = metaslab_flush(curr, tx);
+ mutex_exit(&curr->ms_lock);
+ mutex_exit(&curr->ms_sync_lock);
+
+ /*
+ * If we failed to flush a metaslab (because it was loading),
+ * then we are done with the block heuristic as it's not
+ * possible to destroy any log space maps once you've skipped
+ * a metaslab. In that case we just set our counter to 0 but
+ * we continue looping in case there is still memory pressure
+ * due to unflushed changes. Note that flushing a metaslab
+ * that is not the oldest flushed in the pool will never
+ * destroy any log space maps [see spa_cleanup_old_sm_logs()].
+ */
+ if (!flushed) {
+ want_to_flush = 0;
+ } else if (want_to_flush > 0) {
+ want_to_flush--;
+ }
+
+ visited++;
+ }
+ ASSERT3U(avl_numnodes(&spa->spa_metaslabs_by_flushed), >=, visited);
+}
+
+/*
+ * Close the log space map for this TXG and update the block counts
+ * for the log's in-memory structure and the summary.
+ */
+void
+spa_sync_close_syncing_log_sm(spa_t *spa)
+{
+ if (spa_syncing_log_sm(spa) == NULL)
+ return;
+ ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP));
+
+ spa_log_sm_t *sls = avl_last(&spa->spa_sm_logs_by_txg);
+ ASSERT3U(sls->sls_txg, ==, spa_syncing_txg(spa));
+
+ sls->sls_nblocks = space_map_nblocks(spa_syncing_log_sm(spa));
+ spa->spa_unflushed_stats.sus_nblocks += sls->sls_nblocks;
+
+ /*
+ * Note that we can't assert that sls_mscount is not 0,
+ * because there is the case where the first metaslab
+ * in spa_metaslabs_by_flushed is loading and we were
+ * not able to flush any metaslabs in the current TXG.
+ */
+ ASSERT(sls->sls_nblocks != 0);
+
+ spa_log_summary_add_incoming_blocks(spa, sls->sls_nblocks);
+ spa_log_summary_verify_counts(spa);
+
+ space_map_close(spa->spa_syncing_log_sm);
+ spa->spa_syncing_log_sm = NULL;
+
+ /*
+ * At this point we have tried to flush as many metaslabs as
+ * we could, as the pool is getting exported. Reset the "flush
+ * all" request so the last few TXGs before closing the pool
+ * can be empty (i.e. not dirty).
+ */
+ if (spa_flush_all_logs_requested(spa)) {
+ ASSERT3S(spa_state(spa), ==, POOL_STATE_EXPORTED);
+ spa->spa_log_flushall_txg = 0;
+ }
+}
+
+void
+spa_cleanup_old_sm_logs(spa_t *spa, dmu_tx_t *tx)
+{
+ objset_t *mos = spa_meta_objset(spa);
+
+ uint64_t spacemap_zap;
+ int error = zap_lookup(mos, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_LOG_SPACEMAP_ZAP, sizeof (spacemap_zap), 1, &spacemap_zap);
+ if (error == ENOENT) {
+ ASSERT(avl_is_empty(&spa->spa_sm_logs_by_txg));
+ return;
+ }
+ VERIFY0(error);
+
+ metaslab_t *oldest = avl_first(&spa->spa_metaslabs_by_flushed);
+ uint64_t oldest_flushed_txg = metaslab_unflushed_txg(oldest);
+
+ /* Free all log space maps older than the oldest_flushed_txg. */
+ for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg);
+ sls && sls->sls_txg < oldest_flushed_txg;
+ sls = avl_first(&spa->spa_sm_logs_by_txg)) {
+ ASSERT0(sls->sls_mscount);
+ avl_remove(&spa->spa_sm_logs_by_txg, sls);
+ space_map_free_obj(mos, sls->sls_sm_obj, tx);
+ VERIFY0(zap_remove_int(mos, spacemap_zap, sls->sls_txg, tx));
+ spa->spa_unflushed_stats.sus_nblocks -= sls->sls_nblocks;
+ kmem_free(sls, sizeof (spa_log_sm_t));
+ }
+}
+
+static spa_log_sm_t *
+spa_log_sm_alloc(uint64_t sm_obj, uint64_t txg)
+{
+ spa_log_sm_t *sls = kmem_zalloc(sizeof (*sls), KM_SLEEP);
+ sls->sls_sm_obj = sm_obj;
+ sls->sls_txg = txg;
+ return (sls);
+}
+
+void
+spa_generate_syncing_log_sm(spa_t *spa, dmu_tx_t *tx)
+{
+ uint64_t txg = dmu_tx_get_txg(tx);
+ objset_t *mos = spa_meta_objset(spa);
+
+ if (spa_syncing_log_sm(spa) != NULL)
+ return;
+
+ if (!spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP))
+ return;
+
+ uint64_t spacemap_zap;
+ int error = zap_lookup(mos, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_LOG_SPACEMAP_ZAP, sizeof (spacemap_zap), 1, &spacemap_zap);
+ if (error == ENOENT) {
+ ASSERT(avl_is_empty(&spa->spa_sm_logs_by_txg));
+
+ error = 0;
+ spacemap_zap = zap_create(mos,
+ DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx);
+ VERIFY0(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_LOG_SPACEMAP_ZAP, sizeof (spacemap_zap), 1,
+ &spacemap_zap, tx));
+ spa_feature_incr(spa, SPA_FEATURE_LOG_SPACEMAP, tx);
+ }
+ VERIFY0(error);
+
+ uint64_t sm_obj;
+ ASSERT3U(zap_lookup_int_key(mos, spacemap_zap, txg, &sm_obj),
+ ==, ENOENT);
+ sm_obj = space_map_alloc(mos, zfs_log_sm_blksz, tx);
+ VERIFY0(zap_add_int_key(mos, spacemap_zap, txg, sm_obj, tx));
+ avl_add(&spa->spa_sm_logs_by_txg, spa_log_sm_alloc(sm_obj, txg));
+
+ /*
+ * We pass UINT64_MAX as the space map's representation size
+ * and SPA_MINBLOCKSHIFT as the shift, to make the space map
+ * accept any sort of segment since there's no real advantage
+ * to being more restrictive (given that we're already going
+ * to be using 2-word entries).
+ */
+ VERIFY0(space_map_open(&spa->spa_syncing_log_sm, mos, sm_obj,
+ 0, UINT64_MAX, SPA_MINBLOCKSHIFT));
+
+ /*
+ * If the log space map feature was just enabled, the blocklimit
+ * has not yet been set.
+ */
+ if (spa_log_sm_blocklimit(spa) == 0)
+ spa_log_sm_set_blocklimit(spa);
+}
+
+/*
+ * Find all the log space maps stored in the space map ZAP and sort
+ * them by their TXG in spa_sm_logs_by_txg.
+ */
+static int
+spa_ld_log_sm_metadata(spa_t *spa)
+{
+ int error;
+ uint64_t spacemap_zap;
+
+ ASSERT(avl_is_empty(&spa->spa_sm_logs_by_txg));
+
+ error = zap_lookup(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_LOG_SPACEMAP_ZAP, sizeof (spacemap_zap), 1, &spacemap_zap);
+ if (error == ENOENT) {
+ /* the space map ZAP doesn't exist yet */
+ return (0);
+ } else if (error != 0) {
+ spa_load_failed(spa, "spa_ld_log_sm_metadata(): failed at "
+ "zap_lookup(DMU_POOL_DIRECTORY_OBJECT) [error %d]",
+ error);
+ return (error);
+ }
+
+ zap_cursor_t zc;
+ zap_attribute_t za;
+ for (zap_cursor_init(&zc, spa_meta_objset(spa), spacemap_zap);
+ (error = zap_cursor_retrieve(&zc, &za)) == 0;
+ zap_cursor_advance(&zc)) {
+ uint64_t log_txg = zfs_strtonum(za.za_name, NULL);
+ spa_log_sm_t *sls =
+ spa_log_sm_alloc(za.za_first_integer, log_txg);
+ avl_add(&spa->spa_sm_logs_by_txg, sls);
+ }
+ zap_cursor_fini(&zc);
+ if (error != ENOENT) {
+ spa_load_failed(spa, "spa_ld_log_sm_metadata(): failed at "
+ "zap_cursor_retrieve(spacemap_zap) [error %d]",
+ error);
+ return (error);
+ }
+
+ for (metaslab_t *m = avl_first(&spa->spa_metaslabs_by_flushed);
+ m; m = AVL_NEXT(&spa->spa_metaslabs_by_flushed, m)) {
+ spa_log_sm_t target = { .sls_txg = metaslab_unflushed_txg(m) };
+ spa_log_sm_t *sls = avl_find(&spa->spa_sm_logs_by_txg,
+ &target, NULL);
+
+ /*
+ * At this point, if sls is NULL it means that a bug occurred
+ * in ZFS the last time the pool was open or earlier in the
+ * import code path. In general, we would have placed a
+ * VERIFY() here or, in this case, just let the kernel panic
+ * with a NULL pointer dereference when incrementing sls_mscount,
+ * but since this is the import code path we can be a bit more
+ * lenient. Thus, for DEBUG bits we always cause a panic, while
+ * in production we log the error and just fail the import.
+ */
+ ASSERT(sls != NULL);
+ if (sls == NULL) {
+ spa_load_failed(spa, "spa_ld_log_sm_metadata(): bug "
+ "encountered: could not find log spacemap for "
+ "TXG %ld [error %d]",
+ metaslab_unflushed_txg(m), ENOENT);
+ return (ENOENT);
+ }
+ sls->sls_mscount++;
+ }
+
+ return (0);
+}
+
+typedef struct spa_ld_log_sm_arg {
+ spa_t *slls_spa;
+ uint64_t slls_txg;
+} spa_ld_log_sm_arg_t;
+
+static int
+spa_ld_log_sm_cb(space_map_entry_t *sme, void *arg)
+{
+ uint64_t offset = sme->sme_offset;
+ uint64_t size = sme->sme_run;
+ uint32_t vdev_id = sme->sme_vdev;
+
+ spa_ld_log_sm_arg_t *slls = arg;
+ spa_t *spa = slls->slls_spa;
+
+ vdev_t *vd = vdev_lookup_top(spa, vdev_id);
+
+ /*
+ * If the vdev has been removed (i.e. it is indirect or a hole)
+ * skip this entry. The contents of this vdev have already moved
+ * elsewhere.
+ */
+ if (!vdev_is_concrete(vd))
+ return (0);
+
+ metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift];
+ ASSERT(!ms->ms_loaded);
+
+ /*
+ * If we have already flushed entries for this TXG to this
+ * metaslab's space map, then ignore it. Note that we flush
+ * before processing any allocations/frees for that TXG, so
+ * the metaslab's space map only has entries from *before*
+ * the unflushed TXG.
+ */
+ if (slls->slls_txg < metaslab_unflushed_txg(ms))
+ return (0);
+
+ switch (sme->sme_type) {
+ case SM_ALLOC:
+ range_tree_remove_xor_add_segment(offset, offset + size,
+ ms->ms_unflushed_frees, ms->ms_unflushed_allocs);
+ break;
+ case SM_FREE:
+ range_tree_remove_xor_add_segment(offset, offset + size,
+ ms->ms_unflushed_allocs, ms->ms_unflushed_frees);
+ break;
+ default:
+ panic("invalid maptype_t");
+ break;
+ }
+ return (0);
+}
+
+static int
+spa_ld_log_sm_data(spa_t *spa)
+{
+ int error = 0;
+
+ /*
+ * If we are not going to do any writes there is no need
+ * to read the log space maps.
+ */
+ if (!spa_writeable(spa))
+ return (0);
+
+ ASSERT0(spa->spa_unflushed_stats.sus_nblocks);
+ ASSERT0(spa->spa_unflushed_stats.sus_memused);
+
+ hrtime_t read_logs_starttime = gethrtime();
+ /* this is a no-op when we don't have space map logs */
+ for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg);
+ sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) {
+ space_map_t *sm = NULL;
+ error = space_map_open(&sm, spa_meta_objset(spa),
+ sls->sls_sm_obj, 0, UINT64_MAX, SPA_MINBLOCKSHIFT);
+ if (error != 0) {
+ spa_load_failed(spa, "spa_ld_log_sm_data(): failed at "
+ "space_map_open(obj=%llu) [error %d]",
+ (u_longlong_t)sls->sls_sm_obj, error);
+ goto out;
+ }
+
+ struct spa_ld_log_sm_arg vla = {
+ .slls_spa = spa,
+ .slls_txg = sls->sls_txg
+ };
+ error = space_map_iterate(sm, space_map_length(sm),
+ spa_ld_log_sm_cb, &vla);
+ if (error != 0) {
+ space_map_close(sm);
+ spa_load_failed(spa, "spa_ld_log_sm_data(): failed "
+ "at space_map_iterate(obj=%llu) [error %d]",
+ (u_longlong_t)sls->sls_sm_obj, error);
+ goto out;
+ }
+
+ ASSERT0(sls->sls_nblocks);
+ sls->sls_nblocks = space_map_nblocks(sm);
+ spa->spa_unflushed_stats.sus_nblocks += sls->sls_nblocks;
+ summary_add_data(spa, sls->sls_txg,
+ sls->sls_mscount, sls->sls_nblocks);
+
+ space_map_close(sm);
+ }
+ hrtime_t read_logs_endtime = gethrtime();
+ spa_load_note(spa,
+ "read %llu log space maps (%llu total blocks - blksz = %llu bytes) "
+ "in %lld ms", (u_longlong_t)avl_numnodes(&spa->spa_sm_logs_by_txg),
+ (u_longlong_t)spa_log_sm_nblocks(spa),
+ (u_longlong_t)zfs_log_sm_blksz,
+ (longlong_t)((read_logs_endtime - read_logs_starttime) / 1000000));
+
+out:
+ /*
+ * Now that the metaslabs contain their unflushed changes:
+ * [1] recalculate their actual allocated space
+ * [2] recalculate their weights
+ * [3] sum up the memory usage of their unflushed range trees
+ * [4] optionally load them, if debug_load is set
+ *
+ * Note that even in the case where we get here because of an
+ * error (i.e. error != 0), we still want to update the fields
+ * below in order to have a proper teardown in spa_unload().
+ */
+ for (metaslab_t *m = avl_first(&spa->spa_metaslabs_by_flushed);
+ m != NULL; m = AVL_NEXT(&spa->spa_metaslabs_by_flushed, m)) {
+ mutex_enter(&m->ms_lock);
+ m->ms_allocated_space = space_map_allocated(m->ms_sm) +
+ range_tree_space(m->ms_unflushed_allocs) -
+ range_tree_space(m->ms_unflushed_frees);
+
+ vdev_t *vd = m->ms_group->mg_vd;
+ metaslab_space_update(vd, m->ms_group->mg_class,
+ range_tree_space(m->ms_unflushed_allocs), 0, 0);
+ metaslab_space_update(vd, m->ms_group->mg_class,
+ -range_tree_space(m->ms_unflushed_frees), 0, 0);
+
+ ASSERT0(m->ms_weight & METASLAB_ACTIVE_MASK);
+ metaslab_recalculate_weight_and_sort(m);
+
+ spa->spa_unflushed_stats.sus_memused +=
+ metaslab_unflushed_changes_memused(m);
+
+ if (metaslab_debug_load && m->ms_sm != NULL) {
+ VERIFY0(metaslab_load(m));
+ metaslab_set_selected_txg(m, 0);
+ }
+ mutex_exit(&m->ms_lock);
+ }
+
+ return (error);
+}
+
+static int
+spa_ld_unflushed_txgs(vdev_t *vd)
+{
+ spa_t *spa = vd->vdev_spa;
+ objset_t *mos = spa_meta_objset(spa);
+
+ if (vd->vdev_top_zap == 0)
+ return (0);
+
+ uint64_t object = 0;
+ int error = zap_lookup(mos, vd->vdev_top_zap,
+ VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS,
+ sizeof (uint64_t), 1, &object);
+ if (error == ENOENT)
+ return (0);
+ else if (error != 0) {
+ spa_load_failed(spa, "spa_ld_unflushed_txgs(): failed at "
+ "zap_lookup(vdev_top_zap=%llu) [error %d]",
+ (u_longlong_t)vd->vdev_top_zap, error);
+ return (error);
+ }
+
+ for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
+ metaslab_t *ms = vd->vdev_ms[m];
+ ASSERT(ms != NULL);
+
+ metaslab_unflushed_phys_t entry;
+ uint64_t entry_size = sizeof (entry);
+ uint64_t entry_offset = ms->ms_id * entry_size;
+
+ error = dmu_read(mos, object,
+ entry_offset, entry_size, &entry, 0);
+ if (error != 0) {
+ spa_load_failed(spa, "spa_ld_unflushed_txgs(): "
+ "failed at dmu_read(obj=%llu) [error %d]",
+ (u_longlong_t)object, error);
+ return (error);
+ }
+
+ ms->ms_unflushed_txg = entry.msp_unflushed_txg;
+ if (ms->ms_unflushed_txg != 0) {
+ mutex_enter(&spa->spa_flushed_ms_lock);
+ avl_add(&spa->spa_metaslabs_by_flushed, ms);
+ mutex_exit(&spa->spa_flushed_ms_lock);
+ }
+ }
+ return (0);
+}
+
+/*
+ * Read all the log space map entries into their respective
+ * metaslab unflushed trees and keep them sorted by TXG in the
+ * SPA's metadata. In addition, set up all the metadata for the
+ * memory and the block heuristics.
+ */
+int
+spa_ld_log_spacemaps(spa_t *spa)
+{
+ int error;
+
+ spa_log_sm_set_blocklimit(spa);
+
+ for (uint64_t c = 0; c < spa->spa_root_vdev->vdev_children; c++) {
+ vdev_t *vd = spa->spa_root_vdev->vdev_child[c];
+ error = spa_ld_unflushed_txgs(vd);
+ if (error != 0)
+ return (error);
+ }
+
+ error = spa_ld_log_sm_metadata(spa);
+ if (error != 0)
+ return (error);
+
+ /*
+ * Note: we don't actually expect anything to change at this point
+ * but we grab the config lock so we don't fail any assertions
+ * when using vdev_lookup_top().
+ */
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+ error = spa_ld_log_sm_data(spa);
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+
+ return (error);
+}
+
+/* BEGIN CSTYLED */
+ZFS_MODULE_PARAM(zfs, zfs_, unflushed_max_mem_amt, ULONG, ZMOD_RW,
+ "Hard limit on the amount of memory that ZFS allows to be used for "
+ "unflushed changes");
+
+ZFS_MODULE_PARAM(zfs, zfs_, unflushed_max_mem_ppm, ULONG, ZMOD_RW,
+ "Percentage of the overall system memory that ZFS allows to be "
+ "used for unflushed changes (value is calculated over 1000000 for "
+ "finer granularity)");
+
+ZFS_MODULE_PARAM(zfs, zfs_, unflushed_log_block_max, ULONG, ZMOD_RW,
+ "Hard limit (upper-bound) on the size of the space map log "
+ "in terms of blocks.");
+
+ZFS_MODULE_PARAM(zfs, zfs_, unflushed_log_block_min, ULONG, ZMOD_RW,
+ "Lower-bound limit for the maximum number of blocks allowed in "
+ "the log spacemap (see zfs_unflushed_log_block_max)");
+
+ZFS_MODULE_PARAM(zfs, zfs_, unflushed_log_block_pct, ULONG, ZMOD_RW,
+ "Tunable used to determine the number of blocks that can be used for "
+ "the spacemap log, expressed as a percentage of the total number of "
+ "metaslabs in the pool (e.g. 400 means the number of log blocks is "
+ "capped at 4 times the number of metaslabs)");
+
+ZFS_MODULE_PARAM(zfs, zfs_, max_log_walking, ULONG, ZMOD_RW,
+ "The number of past TXGs that the flushing algorithm of the log "
+ "spacemap feature uses to estimate incoming log blocks");
+
+ZFS_MODULE_PARAM(zfs, zfs_, max_logsm_summary_length, ULONG, ZMOD_RW,
+ "Maximum number of rows allowed in the summary of the spacemap log");
+
+ZFS_MODULE_PARAM(zfs, zfs_, min_metaslabs_to_flush, ULONG, ZMOD_RW,
+ "Minimum number of metaslabs to flush per dirty TXG");
+
+ZFS_MODULE_PARAM(zfs, zfs_, keep_log_spacemaps_at_export, INT, ZMOD_RW,
+ "Prevent the log spacemaps from being flushed and destroyed "
+ "during pool export/destroy");
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/zfs/spa_misc.c b/sys/contrib/openzfs/module/zfs/spa_misc.c
new file mode 100644
index 000000000000..b4c73f58d3bc
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/spa_misc.c
@@ -0,0 +1,2953 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2019 by Delphix. All rights reserved.
+ * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright 2013 Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2017 Datto Inc.
+ * Copyright (c) 2017, Intel Corporation.
+ * Copyright (c) 2019, loli10K <ezomori.nozomu@gmail.com>. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa_impl.h>
+#include <sys/zio.h>
+#include <sys/zio_checksum.h>
+#include <sys/zio_compress.h>
+#include <sys/dmu.h>
+#include <sys/dmu_tx.h>
+#include <sys/zap.h>
+#include <sys/zil.h>
+#include <sys/vdev_impl.h>
+#include <sys/vdev_initialize.h>
+#include <sys/vdev_trim.h>
+#include <sys/vdev_file.h>
+#include <sys/vdev_raidz.h>
+#include <sys/metaslab.h>
+#include <sys/uberblock_impl.h>
+#include <sys/txg.h>
+#include <sys/avl.h>
+#include <sys/unique.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_prop.h>
+#include <sys/fm/util.h>
+#include <sys/dsl_scan.h>
+#include <sys/fs/zfs.h>
+#include <sys/metaslab_impl.h>
+#include <sys/arc.h>
+#include <sys/ddt.h>
+#include <sys/kstat.h>
+#include "zfs_prop.h"
+#include <sys/btree.h>
+#include <sys/zfeature.h>
+#include <sys/qat.h>
+#include <sys/zstd/zstd.h>
+
+/*
+ * SPA locking
+ *
+ * There are three basic locks for managing spa_t structures:
+ *
+ * spa_namespace_lock (global mutex)
+ *
+ * This lock must be acquired to do any of the following:
+ *
+ * - Lookup a spa_t by name
+ * - Add or remove a spa_t from the namespace
+ * - Increase spa_refcount from non-zero
+ * - Check if spa_refcount is zero
+ * - Rename a spa_t
+ * - add/remove/attach/detach devices
+ * - Held for the duration of create/destroy/import/export
+ *
+ * It does not need to handle recursion. A create or destroy may
+ * reference objects (files or zvols) in other pools, but by
+ * definition they must have an existing reference, and will never need
+ * to lookup a spa_t by name.
+ *
+ * spa_refcount (per-spa zfs_refcount_t protected by mutex)
+ *
+ * This reference count keeps track of any active users of the spa_t. The
+ * spa_t cannot be destroyed or freed while this is non-zero. Internally,
+ * the refcount is never really 'zero' - opening a pool implicitly keeps
+ * some references in the DMU. Internally we check against spa_minref, but
+ * present the image of a zero/non-zero value to consumers.
+ *
+ * spa_config_lock[] (per-spa array of rwlocks)
+ *
+ * This protects the spa_t from config changes, and must be held in
+ * the following circumstances:
+ *
+ * - RW_READER to perform I/O to the spa
+ * - RW_WRITER to change the vdev config
+ *
+ * The locking order is fairly straightforward:
+ *
+ * spa_namespace_lock -> spa_refcount
+ *
+ * The namespace lock must be acquired to increase the refcount from 0
+ * or to check if it is zero.
+ *
+ * spa_refcount -> spa_config_lock[]
+ *
+ * There must be at least one valid reference on the spa_t to acquire
+ * the config lock.
+ *
+ * spa_namespace_lock -> spa_config_lock[]
+ *
+ * The namespace lock must always be taken before the config lock.
+ *
+ *
+ * The spa_namespace_lock can be acquired directly and is globally visible.
+ *
+ * The namespace is manipulated using the following functions, all of which
+ * require the spa_namespace_lock to be held.
+ *
+ * spa_lookup() Lookup a spa_t by name.
+ *
+ * spa_add() Create a new spa_t in the namespace.
+ *
+ * spa_remove() Remove a spa_t from the namespace. This also
+ * frees up any memory associated with the spa_t.
+ *
+ * spa_next() Returns the next spa_t in the system, or the
+ * first if NULL is passed.
+ *
+ * spa_evict_all() Shutdown and remove all spa_t structures in
+ * the system.
+ *
+ * spa_guid_exists() Determine whether a pool/device guid exists.
+ *
+ * The spa_refcount is manipulated using the following functions:
+ *
+ * spa_open_ref() Adds a reference to the given spa_t. Must be
+ * called with spa_namespace_lock held if the
+ * refcount is currently zero.
+ *
+ * spa_close() Remove a reference from the spa_t. This will
+ * not free the spa_t or remove it from the
+ * namespace. No locking is required.
+ *
+ * spa_refcount_zero() Returns true if the refcount is currently
+ * zero. Must be called with spa_namespace_lock
+ * held.
+ *
+ * The spa_config_lock[] is an array of rwlocks, ordered as follows:
+ * SCL_CONFIG > SCL_STATE > SCL_ALLOC > SCL_ZIO > SCL_FREE > SCL_VDEV.
+ * spa_config_lock[] is manipulated with spa_config_{enter,exit,held}().
+ *
+ * To read the configuration, it suffices to hold one of these locks as reader.
+ * To modify the configuration, you must hold all locks as writer. To modify
+ * vdev state without altering the vdev tree's topology (e.g. online/offline),
+ * you must hold SCL_STATE and SCL_ZIO as writer.
+ *
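+ * As an illustrative sketch only (not a prescription), a read-side
+ * caller typically looks like:
+ *
+ *     spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
+ *     vd = vdev_lookup_top(spa, vdev_id);
+ *     ...
+ *     spa_config_exit(spa, SCL_VDEV, FTAG);
+ *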
+ * We use these distinct config locks to avoid recursive lock entry.
+ * For example, spa_sync() (which holds SCL_CONFIG as reader) induces
+ * block allocations (SCL_ALLOC), which may require reading space maps
+ * from disk (dmu_read() -> zio_read() -> SCL_ZIO).
+ *
+ * The spa config locks cannot be normal rwlocks because we need the
+ * ability to hand off ownership. For example, SCL_ZIO is acquired
+ * by the issuing thread and later released by an interrupt thread.
+ * They do, however, obey the usual write-wanted semantics to prevent
+ * writer (i.e. system administrator) starvation.
+ *
+ * The lock acquisition rules are as follows:
+ *
+ * SCL_CONFIG
+ * Protects changes to the vdev tree topology, such as vdev
+ * add/remove/attach/detach. Protects the dirty config list
+ * (spa_config_dirty_list) and the set of spares and l2arc devices.
+ *
+ * SCL_STATE
+ * Protects changes to pool state and vdev state, such as vdev
+ * online/offline/fault/degrade/clear. Protects the dirty state list
+ * (spa_state_dirty_list) and global pool state (spa_state).
+ *
+ * SCL_ALLOC
+ * Protects changes to metaslab groups and classes.
+ * Held as reader by metaslab_alloc() and metaslab_claim().
+ *
+ * SCL_ZIO
+ * Held by bp-level zios (those which have no io_vd upon entry)
+ * to prevent changes to the vdev tree. The bp-level zio implicitly
+ * protects all of its vdev child zios, which do not hold SCL_ZIO.
+ *
+ * SCL_FREE
+ * Protects changes to metaslab groups and classes.
+ * Held as reader by metaslab_free(). SCL_FREE is distinct from
+ * SCL_ALLOC, and lower than SCL_ZIO, so that we can safely free
+ * blocks in zio_done() while another i/o that holds either
+ * SCL_ALLOC or SCL_ZIO is waiting for this i/o to complete.
+ *
+ * SCL_VDEV
+ * Held as reader to prevent changes to the vdev tree during trivial
+ * inquiries such as bp_get_dsize(). SCL_VDEV is distinct from the
+ * other locks, and lower than all of them, to ensure that it's safe
+ * to acquire regardless of caller context.
+ *
+ * In addition, the following rules apply:
+ *
+ * (a) spa_props_lock protects pool properties, spa_config and spa_config_list.
+ * The lock ordering is SCL_CONFIG > spa_props_lock.
+ *
+ * (b) I/O operations on leaf vdevs. For any zio operation that takes
+ * an explicit vdev_t argument -- such as zio_ioctl(), zio_read_phys(),
+ * or zio_write_phys() -- the caller must ensure that the config cannot
+ * change in the interim, and that the vdev cannot be reopened.
+ * SCL_STATE as reader suffices for both.
+ *
+ * The vdev configuration is protected by spa_vdev_enter() / spa_vdev_exit().
+ *
+ * spa_vdev_enter() Acquire the namespace lock and the config lock
+ * for writing.
+ *
+ * spa_vdev_exit() Release the config lock, wait for all I/O
+ * to complete, sync the updated configs to the
+ * cache, and release the namespace lock.
+ *
+ * vdev state is protected by spa_vdev_state_enter() / spa_vdev_state_exit().
+ * Like spa_vdev_enter/exit, these are convenience wrappers -- the actual
+ * locking is, always, based on spa_namespace_lock and spa_config_lock[].
+ */
+
+static avl_tree_t spa_namespace_avl;
+kmutex_t spa_namespace_lock;
+static kcondvar_t spa_namespace_cv;
+int spa_max_replication_override = SPA_DVAS_PER_BP;
+
+static kmutex_t spa_spare_lock;
+static avl_tree_t spa_spare_avl;
+static kmutex_t spa_l2cache_lock;
+static avl_tree_t spa_l2cache_avl;
+
+kmem_cache_t *spa_buffer_pool;
+spa_mode_t spa_mode_global = SPA_MODE_UNINIT;
+
+#ifdef ZFS_DEBUG
+/*
+ * Everything except dprintf, set_error, and indirect_remap is on
+ * by default in debug builds.
+ */
+int zfs_flags = ~(ZFS_DEBUG_DPRINTF | ZFS_DEBUG_SET_ERROR |
+ ZFS_DEBUG_INDIRECT_REMAP);
+#else
+int zfs_flags = 0;
+#endif
+
+/*
+ * zfs_recover can be set to nonzero to attempt to recover from
+ * otherwise-fatal errors, typically caused by on-disk corruption. When
+ * set, calls to zfs_panic_recover() will turn into warning messages.
+ * This should only be used as a last resort, as it typically results
+ * in leaked space, or worse.
+ */
+int zfs_recover = B_FALSE;
+
+/*
+ * If destroy encounters an EIO while reading metadata (e.g. indirect
+ * blocks), space referenced by the missing metadata can not be freed.
+ * Normally this causes the background destroy to become "stalled", as
+ * it is unable to make forward progress. While in this stalled state,
+ * all remaining space to free from the error-encountering filesystem is
+ * "temporarily leaked". Set this flag to cause it to ignore the EIO,
+ * permanently leak the space from indirect blocks that can not be read,
+ * and continue to free everything else that it can.
+ *
+ * The default, "stalling" behavior is useful if the storage partially
+ * fails (i.e. some but not all i/os fail), and then later recovers. In
+ * this case, we will be able to continue pool operations while it is
+ * partially failed, and when it recovers, we can continue to free the
+ * space, with no leaks. However, note that this case is actually
+ * fairly rare.
+ *
+ * Typically pools either (a) fail completely (but perhaps temporarily,
+ * e.g. a top-level vdev going offline), or (b) have localized,
+ * permanent errors (e.g. disk returns the wrong data due to bit flip or
+ * firmware bug). In case (a), this setting does not matter because the
+ * pool will be suspended and the sync thread will not be able to make
+ * forward progress regardless. In case (b), because the error is
+ * permanent, the best we can do is leak the minimum amount of space,
+ * which is what setting this flag will do. Therefore, it is reasonable
+ * for this flag to normally be set, but we chose the more conservative
+ * approach of not setting it, so that there is no possibility of
+ * leaking space in the "partial temporary" failure case.
+ */
+int zfs_free_leak_on_eio = B_FALSE;
+
+/*
+ * Expiration time in milliseconds. This value has two meanings. First it is
+ * used to determine when the spa_deadman() logic should fire. By default the
+ * spa_deadman() will fire if spa_sync() has not completed in 600 seconds.
+ * Secondly, the value determines if an I/O is considered "hung". Any I/O that
+ * has not completed in zfs_deadman_synctime_ms is considered "hung" resulting
+ * in one of three behaviors controlled by zfs_deadman_failmode.
+ */
+unsigned long zfs_deadman_synctime_ms = 600000UL;
+
+/*
+ * This value controls the maximum amount of time zio_wait() will block for an
+ * outstanding IO. By default this is 300 seconds, at which point the "hung"
+ * behavior will be applied as described for zfs_deadman_synctime_ms.
+ */
+unsigned long zfs_deadman_ziotime_ms = 300000UL;
+
+/*
+ * Check time in milliseconds. This defines the frequency at which we check
+ * for hung I/O.
+ */
+unsigned long zfs_deadman_checktime_ms = 60000UL;
+
+/*
+ * By default the deadman is enabled.
+ */
+int zfs_deadman_enabled = 1;
+
+/*
+ * Controls the behavior of the deadman when it detects a "hung" I/O.
+ * Valid values are zfs_deadman_failmode=<wait|continue|panic>.
+ *
+ * wait - Wait for the "hung" I/O (default)
+ * continue - Attempt to recover from a "hung" I/O
+ * panic - Panic the system
+ */
+char *zfs_deadman_failmode = "wait";
+
+/*
+ * The worst case is single-sector max-parity RAID-Z blocks, in which
+ * case the space requirement is exactly (VDEV_RAIDZ_MAXPARITY + 1)
+ * times the size; so just assume that. Add to this the fact that
+ * we can have up to 3 DVAs per bp, and one more factor of 2 because
+ * the block may be dittoed with up to 3 DVAs by ddt_sync(). All together,
+ * the worst case is:
+ * (VDEV_RAIDZ_MAXPARITY + 1) * SPA_DVAS_PER_BP * 2 == 24
+ */
+int spa_asize_inflation = 24;
+
+/*
+ * Normally, we don't allow the last 3.2% (1/(2^spa_slop_shift)) of space in
+ * the pool to be consumed. This ensures that we don't run the pool
+ * completely out of space, due to unaccounted changes (e.g. to the MOS).
+ * It also limits the worst-case time to allocate space. If we have less than
+ * this amount of free space, most ZPL operations (e.g. write, create) will
+ * return ENOSPC. The ZIL metaslabs (spa_embedded_log_class) are also part of
+ * this 3.2% of space which can't be consumed by normal writes; the slop space
+ * "proper" (spa_get_slop_space()) is decreased by the embedded log space.
+ *
+ * Certain operations (e.g. file removal, most administrative actions) can
+ * use half the slop space. They will only return ENOSPC if less than half
+ * the slop space is free. Typically, once the pool has less than the slop
+ * space free, the user will use these operations to free up space in the pool.
+ * These are the operations that call dsl_pool_adjustedsize() with the netfree
+ * argument set to TRUE.
+ *
+ * Operations that are almost guaranteed to free up space in the absence of
+ * a pool checkpoint can use up to three quarters of the slop space
+ * (e.g. zfs destroy).
+ *
+ * A very restricted set of operations are always permitted, regardless of
+ * the amount of free space. These are the operations that call
+ * dsl_sync_task(ZFS_SPACE_CHECK_NONE). If these operations result in a net
+ * increase in the amount of space used, it is possible to run the pool
+ * completely out of space, causing it to be permanently read-only.
+ *
+ * Note that on very small pools, the slop space will be larger than
+ * 3.2%, in an effort to have it be at least spa_min_slop (128MB),
+ * but we never allow it to be more than half the pool size.
+ *
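+ * As a worked example with illustrative pool sizes: at the default
+ * spa_slop_shift of 5, a 10 TiB pool reserves 1/32 of its space, i.e.
+ * 320 GiB; a 1 GiB pool would nominally reserve 32 MiB, but the
+ * spa_min_slop floor raises that to 128 MiB, which is still below the
+ * half-the-pool cap of 512 MiB.
+ *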
+ * See also the comments in zfs_space_check_t.
+ */
+int spa_slop_shift = 5;
+uint64_t spa_min_slop = 128 * 1024 * 1024;
+int spa_allocators = 4;
+
+
+/*PRINTFLIKE2*/
+void
+spa_load_failed(spa_t *spa, const char *fmt, ...)
+{
+ va_list adx;
+ char buf[256];
+
+ va_start(adx, fmt);
+ (void) vsnprintf(buf, sizeof (buf), fmt, adx);
+ va_end(adx);
+
+ zfs_dbgmsg("spa_load(%s, config %s): FAILED: %s", spa->spa_name,
+ spa->spa_trust_config ? "trusted" : "untrusted", buf);
+}
+
+/*PRINTFLIKE2*/
+void
+spa_load_note(spa_t *spa, const char *fmt, ...)
+{
+ va_list adx;
+ char buf[256];
+
+ va_start(adx, fmt);
+ (void) vsnprintf(buf, sizeof (buf), fmt, adx);
+ va_end(adx);
+
+ zfs_dbgmsg("spa_load(%s, config %s): %s", spa->spa_name,
+ spa->spa_trust_config ? "trusted" : "untrusted", buf);
+}
+
+/*
+ * By default dedup and user data indirects land in the special class
+ */
+int zfs_ddt_data_is_special = B_TRUE;
+int zfs_user_indirect_is_special = B_TRUE;
+
+/*
+ * The percentage of the special class's space that is reserved for metadata
+ * only. Once we allocate 100 - zfs_special_class_metadata_reserve_pct percent
+ * of the class we only let metadata into the class.
+ */
+int zfs_special_class_metadata_reserve_pct = 25;
+
+/*
+ * ==========================================================================
+ * SPA config locking
+ * ==========================================================================
+ */
+static void
+spa_config_lock_init(spa_t *spa)
+{
+ for (int i = 0; i < SCL_LOCKS; i++) {
+ spa_config_lock_t *scl = &spa->spa_config_lock[i];
+ mutex_init(&scl->scl_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&scl->scl_cv, NULL, CV_DEFAULT, NULL);
+ zfs_refcount_create_untracked(&scl->scl_count);
+ scl->scl_writer = NULL;
+ scl->scl_write_wanted = 0;
+ }
+}
+
+static void
+spa_config_lock_destroy(spa_t *spa)
+{
+ for (int i = 0; i < SCL_LOCKS; i++) {
+ spa_config_lock_t *scl = &spa->spa_config_lock[i];
+ mutex_destroy(&scl->scl_lock);
+ cv_destroy(&scl->scl_cv);
+ zfs_refcount_destroy(&scl->scl_count);
+ ASSERT(scl->scl_writer == NULL);
+ ASSERT(scl->scl_write_wanted == 0);
+ }
+}
+
+int
+spa_config_tryenter(spa_t *spa, int locks, void *tag, krw_t rw)
+{
+ for (int i = 0; i < SCL_LOCKS; i++) {
+ spa_config_lock_t *scl = &spa->spa_config_lock[i];
+ if (!(locks & (1 << i)))
+ continue;
+ mutex_enter(&scl->scl_lock);
+ if (rw == RW_READER) {
+ if (scl->scl_writer || scl->scl_write_wanted) {
+ mutex_exit(&scl->scl_lock);
+ spa_config_exit(spa, locks & ((1 << i) - 1),
+ tag);
+ return (0);
+ }
+ } else {
+ ASSERT(scl->scl_writer != curthread);
+ if (!zfs_refcount_is_zero(&scl->scl_count)) {
+ mutex_exit(&scl->scl_lock);
+ spa_config_exit(spa, locks & ((1 << i) - 1),
+ tag);
+ return (0);
+ }
+ scl->scl_writer = curthread;
+ }
+ (void) zfs_refcount_add(&scl->scl_count, tag);
+ mutex_exit(&scl->scl_lock);
+ }
+ return (1);
+}
+
+void
+spa_config_enter(spa_t *spa, int locks, const void *tag, krw_t rw)
+{
+ int wlocks_held = 0;
+
+ ASSERT3U(SCL_LOCKS, <, sizeof (wlocks_held) * NBBY);
+
+ for (int i = 0; i < SCL_LOCKS; i++) {
+ spa_config_lock_t *scl = &spa->spa_config_lock[i];
+ if (scl->scl_writer == curthread)
+ wlocks_held |= (1 << i);
+ if (!(locks & (1 << i)))
+ continue;
+ mutex_enter(&scl->scl_lock);
+ if (rw == RW_READER) {
+ while (scl->scl_writer || scl->scl_write_wanted) {
+ cv_wait(&scl->scl_cv, &scl->scl_lock);
+ }
+ } else {
+ ASSERT(scl->scl_writer != curthread);
+ while (!zfs_refcount_is_zero(&scl->scl_count)) {
+ scl->scl_write_wanted++;
+ cv_wait(&scl->scl_cv, &scl->scl_lock);
+ scl->scl_write_wanted--;
+ }
+ scl->scl_writer = curthread;
+ }
+ (void) zfs_refcount_add(&scl->scl_count, tag);
+ mutex_exit(&scl->scl_lock);
+ }
+ ASSERT3U(wlocks_held, <=, locks);
+}
+
+void
+spa_config_exit(spa_t *spa, int locks, const void *tag)
+{
+ for (int i = SCL_LOCKS - 1; i >= 0; i--) {
+ spa_config_lock_t *scl = &spa->spa_config_lock[i];
+ if (!(locks & (1 << i)))
+ continue;
+ mutex_enter(&scl->scl_lock);
+ ASSERT(!zfs_refcount_is_zero(&scl->scl_count));
+ if (zfs_refcount_remove(&scl->scl_count, tag) == 0) {
+ ASSERT(scl->scl_writer == NULL ||
+ scl->scl_writer == curthread);
+ scl->scl_writer = NULL; /* OK in either case */
+ cv_broadcast(&scl->scl_cv);
+ }
+ mutex_exit(&scl->scl_lock);
+ }
+}
+
+int
+spa_config_held(spa_t *spa, int locks, krw_t rw)
+{
+ int locks_held = 0;
+
+ for (int i = 0; i < SCL_LOCKS; i++) {
+ spa_config_lock_t *scl = &spa->spa_config_lock[i];
+ if (!(locks & (1 << i)))
+ continue;
+ if ((rw == RW_READER &&
+ !zfs_refcount_is_zero(&scl->scl_count)) ||
+ (rw == RW_WRITER && scl->scl_writer == curthread))
+ locks_held |= 1 << i;
+ }
+
+ return (locks_held);
+}
+
+/*
+ * ==========================================================================
+ * SPA namespace functions
+ * ==========================================================================
+ */
+
+/*
+ * Lookup the named spa_t in the AVL tree. The spa_namespace_lock must be held.
+ * Returns NULL if no matching spa_t is found.
+ */
+spa_t *
+spa_lookup(const char *name)
+{
+ static spa_t search; /* spa_t is large; don't allocate on stack */
+ spa_t *spa;
+ avl_index_t where;
+ char *cp;
+
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+ (void) strlcpy(search.spa_name, name, sizeof (search.spa_name));
+
+ /*
+ * If it's a full dataset name, figure out the pool name and
+ * just use that.
+ */
+ cp = strpbrk(search.spa_name, "/@#");
+ if (cp != NULL)
+ *cp = '\0';
+
+ spa = avl_find(&spa_namespace_avl, &search, &where);
+
+ return (spa);
+}
+
+/*
+ * Fires when spa_sync has not completed within zfs_deadman_synctime_ms.
+ * If the zfs_deadman_enabled flag is set then it inspects all vdev queues
+ * looking for potentially hung I/Os.
+ */
+void
+spa_deadman(void *arg)
+{
+ spa_t *spa = arg;
+
+ /* Disable the deadman if the pool is suspended. */
+ if (spa_suspended(spa))
+ return;
+
+ zfs_dbgmsg("slow spa_sync: started %llu seconds ago, calls %llu",
+ (gethrtime() - spa->spa_sync_starttime) / NANOSEC,
+ ++spa->spa_deadman_calls);
+ if (zfs_deadman_enabled)
+ vdev_deadman(spa->spa_root_vdev, FTAG);
+
+ spa->spa_deadman_tqid = taskq_dispatch_delay(system_delay_taskq,
+ spa_deadman, spa, TQ_SLEEP, ddi_get_lbolt() +
+ MSEC_TO_TICK(zfs_deadman_checktime_ms));
+}
+
+static int
+spa_log_sm_sort_by_txg(const void *va, const void *vb)
+{
+ const spa_log_sm_t *a = va;
+ const spa_log_sm_t *b = vb;
+
+ return (TREE_CMP(a->sls_txg, b->sls_txg));
+}
+
+/*
+ * Create an uninitialized spa_t with the given name. Requires
+ * spa_namespace_lock. The caller must ensure that the spa_t doesn't already
+ * exist by calling spa_lookup() first.
+ */
+spa_t *
+spa_add(const char *name, nvlist_t *config, const char *altroot)
+{
+ spa_t *spa;
+ spa_config_dirent_t *dp;
+
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+ spa = kmem_zalloc(sizeof (spa_t), KM_SLEEP);
+
+ mutex_init(&spa->spa_async_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&spa->spa_errlist_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&spa->spa_errlog_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&spa->spa_evicting_os_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&spa->spa_history_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&spa->spa_proc_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&spa->spa_props_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&spa->spa_cksum_tmpls_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&spa->spa_scrub_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&spa->spa_suspend_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&spa->spa_vdev_top_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&spa->spa_feat_stats_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&spa->spa_flushed_ms_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&spa->spa_activities_lock, NULL, MUTEX_DEFAULT, NULL);
+
+ cv_init(&spa->spa_async_cv, NULL, CV_DEFAULT, NULL);
+ cv_init(&spa->spa_evicting_os_cv, NULL, CV_DEFAULT, NULL);
+ cv_init(&spa->spa_proc_cv, NULL, CV_DEFAULT, NULL);
+ cv_init(&spa->spa_scrub_io_cv, NULL, CV_DEFAULT, NULL);
+ cv_init(&spa->spa_suspend_cv, NULL, CV_DEFAULT, NULL);
+ cv_init(&spa->spa_activities_cv, NULL, CV_DEFAULT, NULL);
+ cv_init(&spa->spa_waiters_cv, NULL, CV_DEFAULT, NULL);
+
+ for (int t = 0; t < TXG_SIZE; t++)
+ bplist_create(&spa->spa_free_bplist[t]);
+
+ (void) strlcpy(spa->spa_name, name, sizeof (spa->spa_name));
+ spa->spa_state = POOL_STATE_UNINITIALIZED;
+ spa->spa_freeze_txg = UINT64_MAX;
+ spa->spa_final_txg = UINT64_MAX;
+ spa->spa_load_max_txg = UINT64_MAX;
+ spa->spa_proc = &p0;
+ spa->spa_proc_state = SPA_PROC_NONE;
+ spa->spa_trust_config = B_TRUE;
+ spa->spa_hostid = zone_get_hostid(NULL);
+
+ spa->spa_deadman_synctime = MSEC2NSEC(zfs_deadman_synctime_ms);
+ spa->spa_deadman_ziotime = MSEC2NSEC(zfs_deadman_ziotime_ms);
+ spa_set_deadman_failmode(spa, zfs_deadman_failmode);
+
+ zfs_refcount_create(&spa->spa_refcount);
+ spa_config_lock_init(spa);
+ spa_stats_init(spa);
+
+ avl_add(&spa_namespace_avl, spa);
+
+ /*
+ * Set the alternate root, if there is one.
+ */
+ if (altroot)
+ spa->spa_root = spa_strdup(altroot);
+
+ spa->spa_alloc_count = spa_allocators;
+ spa->spa_alloc_locks = kmem_zalloc(spa->spa_alloc_count *
+ sizeof (kmutex_t), KM_SLEEP);
+ spa->spa_alloc_trees = kmem_zalloc(spa->spa_alloc_count *
+ sizeof (avl_tree_t), KM_SLEEP);
+ for (int i = 0; i < spa->spa_alloc_count; i++) {
+ mutex_init(&spa->spa_alloc_locks[i], NULL, MUTEX_DEFAULT, NULL);
+ avl_create(&spa->spa_alloc_trees[i], zio_bookmark_compare,
+ sizeof (zio_t), offsetof(zio_t, io_alloc_node));
+ }
+ avl_create(&spa->spa_metaslabs_by_flushed, metaslab_sort_by_flushed,
+ sizeof (metaslab_t), offsetof(metaslab_t, ms_spa_txg_node));
+ avl_create(&spa->spa_sm_logs_by_txg, spa_log_sm_sort_by_txg,
+ sizeof (spa_log_sm_t), offsetof(spa_log_sm_t, sls_node));
+ list_create(&spa->spa_log_summary, sizeof (log_summary_entry_t),
+ offsetof(log_summary_entry_t, lse_node));
+
+ /*
+ * Every pool starts with the default cachefile.
+ */
+ list_create(&spa->spa_config_list, sizeof (spa_config_dirent_t),
+ offsetof(spa_config_dirent_t, scd_link));
+
+ dp = kmem_zalloc(sizeof (spa_config_dirent_t), KM_SLEEP);
+ dp->scd_path = altroot ? NULL : spa_strdup(spa_config_path);
+ list_insert_head(&spa->spa_config_list, dp);
+
+ VERIFY(nvlist_alloc(&spa->spa_load_info, NV_UNIQUE_NAME,
+ KM_SLEEP) == 0);
+
+ if (config != NULL) {
+ nvlist_t *features;
+
+ if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_FEATURES_FOR_READ,
+ &features) == 0) {
+ VERIFY(nvlist_dup(features, &spa->spa_label_features,
+ 0) == 0);
+ }
+
+ VERIFY(nvlist_dup(config, &spa->spa_config, 0) == 0);
+ }
+
+ if (spa->spa_label_features == NULL) {
+ VERIFY(nvlist_alloc(&spa->spa_label_features, NV_UNIQUE_NAME,
+ KM_SLEEP) == 0);
+ }
+
+ spa->spa_min_ashift = INT_MAX;
+ spa->spa_max_ashift = 0;
+ spa->spa_min_alloc = INT_MAX;
+
+ /* Reset cached value */
+ spa->spa_dedup_dspace = ~0ULL;
+
+ /*
+ * As a pool is being created, treat all features as disabled by
+ * setting SPA_FEATURE_DISABLED for all entries in the feature
+ * refcount cache.
+ */
+ for (int i = 0; i < SPA_FEATURES; i++) {
+ spa->spa_feat_refcount_cache[i] = SPA_FEATURE_DISABLED;
+ }
+
+ list_create(&spa->spa_leaf_list, sizeof (vdev_t),
+ offsetof(vdev_t, vdev_leaf_node));
+
+ return (spa);
+}
+
+/*
+ * Removes a spa_t from the namespace, freeing up any memory used. Requires
+ * spa_namespace_lock. This is called only after the spa_t has been closed and
+ * deactivated.
+ */
+void
+spa_remove(spa_t *spa)
+{
+ spa_config_dirent_t *dp;
+
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+ ASSERT(spa_state(spa) == POOL_STATE_UNINITIALIZED);
+ ASSERT3U(zfs_refcount_count(&spa->spa_refcount), ==, 0);
+ ASSERT0(spa->spa_waiters);
+
+ nvlist_free(spa->spa_config_splitting);
+
+ avl_remove(&spa_namespace_avl, spa);
+ cv_broadcast(&spa_namespace_cv);
+
+ if (spa->spa_root)
+ spa_strfree(spa->spa_root);
+
+ while ((dp = list_head(&spa->spa_config_list)) != NULL) {
+ list_remove(&spa->spa_config_list, dp);
+ if (dp->scd_path != NULL)
+ spa_strfree(dp->scd_path);
+ kmem_free(dp, sizeof (spa_config_dirent_t));
+ }
+
+ for (int i = 0; i < spa->spa_alloc_count; i++) {
+ avl_destroy(&spa->spa_alloc_trees[i]);
+ mutex_destroy(&spa->spa_alloc_locks[i]);
+ }
+ kmem_free(spa->spa_alloc_locks, spa->spa_alloc_count *
+ sizeof (kmutex_t));
+ kmem_free(spa->spa_alloc_trees, spa->spa_alloc_count *
+ sizeof (avl_tree_t));
+
+ avl_destroy(&spa->spa_metaslabs_by_flushed);
+ avl_destroy(&spa->spa_sm_logs_by_txg);
+ list_destroy(&spa->spa_log_summary);
+ list_destroy(&spa->spa_config_list);
+ list_destroy(&spa->spa_leaf_list);
+
+ nvlist_free(spa->spa_label_features);
+ nvlist_free(spa->spa_load_info);
+ nvlist_free(spa->spa_feat_stats);
+ spa_config_set(spa, NULL);
+
+ zfs_refcount_destroy(&spa->spa_refcount);
+
+ spa_stats_destroy(spa);
+ spa_config_lock_destroy(spa);
+
+ for (int t = 0; t < TXG_SIZE; t++)
+ bplist_destroy(&spa->spa_free_bplist[t]);
+
+ zio_checksum_templates_free(spa);
+
+ cv_destroy(&spa->spa_async_cv);
+ cv_destroy(&spa->spa_evicting_os_cv);
+ cv_destroy(&spa->spa_proc_cv);
+ cv_destroy(&spa->spa_scrub_io_cv);
+ cv_destroy(&spa->spa_suspend_cv);
+ cv_destroy(&spa->spa_activities_cv);
+ cv_destroy(&spa->spa_waiters_cv);
+
+ mutex_destroy(&spa->spa_flushed_ms_lock);
+ mutex_destroy(&spa->spa_async_lock);
+ mutex_destroy(&spa->spa_errlist_lock);
+ mutex_destroy(&spa->spa_errlog_lock);
+ mutex_destroy(&spa->spa_evicting_os_lock);
+ mutex_destroy(&spa->spa_history_lock);
+ mutex_destroy(&spa->spa_proc_lock);
+ mutex_destroy(&spa->spa_props_lock);
+ mutex_destroy(&spa->spa_cksum_tmpls_lock);
+ mutex_destroy(&spa->spa_scrub_lock);
+ mutex_destroy(&spa->spa_suspend_lock);
+ mutex_destroy(&spa->spa_vdev_top_lock);
+ mutex_destroy(&spa->spa_feat_stats_lock);
+ mutex_destroy(&spa->spa_activities_lock);
+
+ kmem_free(spa, sizeof (spa_t));
+}
+
+/*
+ * Given a pool, return the next pool in the namespace, or NULL if there is
+ * none. If 'prev' is NULL, return the first pool.
+ */
+spa_t *
+spa_next(spa_t *prev)
+{
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+ if (prev)
+ return (AVL_NEXT(&spa_namespace_avl, prev));
+ else
+ return (avl_first(&spa_namespace_avl));
+}
+
+/*
+ * ==========================================================================
+ * SPA refcount functions
+ * ==========================================================================
+ */
+
+/*
+ * Add a reference to the given spa_t. Must have at least one reference, or
+ * have the namespace lock held.
+ */
+void
+spa_open_ref(spa_t *spa, void *tag)
+{
+ ASSERT(zfs_refcount_count(&spa->spa_refcount) >= spa->spa_minref ||
+ MUTEX_HELD(&spa_namespace_lock));
+ (void) zfs_refcount_add(&spa->spa_refcount, tag);
+}
+
+/*
+ * Remove a reference to the given spa_t. Must have at least one reference, or
+ * have the namespace lock held.
+ */
+void
+spa_close(spa_t *spa, void *tag)
+{
+ ASSERT(zfs_refcount_count(&spa->spa_refcount) > spa->spa_minref ||
+ MUTEX_HELD(&spa_namespace_lock));
+ (void) zfs_refcount_remove(&spa->spa_refcount, tag);
+}
+
+/*
+ * Remove a reference to the given spa_t held by a dsl dir that is
+ * being asynchronously released. Async releases occur from a taskq
+ * performing eviction of dsl datasets and dirs. The namespace lock
+ * isn't held and the hold by the object being evicted may contribute to
+ * spa_minref (e.g. dataset or directory released during pool export),
+ * so the asserts in spa_close() do not apply.
+ */
+void
+spa_async_close(spa_t *spa, void *tag)
+{
+ (void) zfs_refcount_remove(&spa->spa_refcount, tag);
+}
+
+/*
+ * Check to see if the spa refcount is zero. Must be called with
+ * spa_namespace_lock held. We really compare against spa_minref, which is the
+ * number of references acquired when opening a pool.
+ */
+boolean_t
+spa_refcount_zero(spa_t *spa)
+{
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+ return (zfs_refcount_count(&spa->spa_refcount) == spa->spa_minref);
+}
+
+/*
+ * ==========================================================================
+ * SPA spare and l2cache tracking
+ * ==========================================================================
+ */
+
+/*
+ * Hot spares and cache devices are tracked using the same code below,
+ * for 'auxiliary' devices.
+ */
+
+typedef struct spa_aux {
+ uint64_t aux_guid;
+ uint64_t aux_pool;
+ avl_node_t aux_avl;
+ int aux_count;
+} spa_aux_t;
+
+static inline int
+spa_aux_compare(const void *a, const void *b)
+{
+ const spa_aux_t *sa = (const spa_aux_t *)a;
+ const spa_aux_t *sb = (const spa_aux_t *)b;
+
+ return (TREE_CMP(sa->aux_guid, sb->aux_guid));
+}
+
+static void
+spa_aux_add(vdev_t *vd, avl_tree_t *avl)
+{
+ avl_index_t where;
+ spa_aux_t search;
+ spa_aux_t *aux;
+
+ search.aux_guid = vd->vdev_guid;
+ if ((aux = avl_find(avl, &search, &where)) != NULL) {
+ aux->aux_count++;
+ } else {
+ aux = kmem_zalloc(sizeof (spa_aux_t), KM_SLEEP);
+ aux->aux_guid = vd->vdev_guid;
+ aux->aux_count = 1;
+ avl_insert(avl, aux, where);
+ }
+}
+
+static void
+spa_aux_remove(vdev_t *vd, avl_tree_t *avl)
+{
+ spa_aux_t search;
+ spa_aux_t *aux;
+ avl_index_t where;
+
+ search.aux_guid = vd->vdev_guid;
+ aux = avl_find(avl, &search, &where);
+
+ ASSERT(aux != NULL);
+
+ if (--aux->aux_count == 0) {
+ avl_remove(avl, aux);
+ kmem_free(aux, sizeof (spa_aux_t));
+ } else if (aux->aux_pool == spa_guid(vd->vdev_spa)) {
+ aux->aux_pool = 0ULL;
+ }
+}
+
+static boolean_t
+spa_aux_exists(uint64_t guid, uint64_t *pool, int *refcnt, avl_tree_t *avl)
+{
+ spa_aux_t search, *found;
+
+ search.aux_guid = guid;
+ found = avl_find(avl, &search, NULL);
+
+ if (pool) {
+ if (found)
+ *pool = found->aux_pool;
+ else
+ *pool = 0ULL;
+ }
+
+ if (refcnt) {
+ if (found)
+ *refcnt = found->aux_count;
+ else
+ *refcnt = 0;
+ }
+
+ return (found != NULL);
+}
+
+static void
+spa_aux_activate(vdev_t *vd, avl_tree_t *avl)
+{
+ spa_aux_t search, *found;
+ avl_index_t where;
+
+ search.aux_guid = vd->vdev_guid;
+ found = avl_find(avl, &search, &where);
+ ASSERT(found != NULL);
+ ASSERT(found->aux_pool == 0ULL);
+
+ found->aux_pool = spa_guid(vd->vdev_spa);
+}
+
+/*
+ * Spares are tracked globally due to the following constraints:
+ *
+ * - A spare may be part of multiple pools.
+ * - A spare may be added to a pool even if it's actively in use within
+ * another pool.
+ * - A spare in use in any pool can only be the source of a replacement if
+ * the target is a spare in the same pool.
+ *
+ * We keep track of all spares on the system through the use of a reference
+ * counted AVL tree. When a vdev is added as a spare, or used as a replacement
+ * spare, we bump the reference count in the AVL tree. In addition, we set
+ * the 'vdev_isspare' member to indicate that the device is a spare (active or
+ * inactive). When a spare is made active (used to replace a device in the
+ * pool), we also keep track of which pool it's been made a part of.
+ *
+ * The 'spa_spare_lock' protects the AVL tree. These functions are normally
+ * called under the spa_namespace lock as part of vdev reconfiguration. The
+ * separate spare lock exists for the status query path, which does not need to
+ * be completely consistent with respect to other vdev configuration changes.
+ */
+
+static int
+spa_spare_compare(const void *a, const void *b)
+{
+ return (spa_aux_compare(a, b));
+}
+
+void
+spa_spare_add(vdev_t *vd)
+{
+ mutex_enter(&spa_spare_lock);
+ ASSERT(!vd->vdev_isspare);
+ spa_aux_add(vd, &spa_spare_avl);
+ vd->vdev_isspare = B_TRUE;
+ mutex_exit(&spa_spare_lock);
+}
+
+void
+spa_spare_remove(vdev_t *vd)
+{
+ mutex_enter(&spa_spare_lock);
+ ASSERT(vd->vdev_isspare);
+ spa_aux_remove(vd, &spa_spare_avl);
+ vd->vdev_isspare = B_FALSE;
+ mutex_exit(&spa_spare_lock);
+}
+
+boolean_t
+spa_spare_exists(uint64_t guid, uint64_t *pool, int *refcnt)
+{
+ boolean_t found;
+
+ mutex_enter(&spa_spare_lock);
+ found = spa_aux_exists(guid, pool, refcnt, &spa_spare_avl);
+ mutex_exit(&spa_spare_lock);
+
+ return (found);
+}
+
+void
+spa_spare_activate(vdev_t *vd)
+{
+ mutex_enter(&spa_spare_lock);
+ ASSERT(vd->vdev_isspare);
+ spa_aux_activate(vd, &spa_spare_avl);
+ mutex_exit(&spa_spare_lock);
+}
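+
+/*
+ * A rough sketch of the expected spare lifecycle (the actual call sites
+ * live in the vdev/spa configuration code, not in this file):
+ *
+ *	spa_spare_add(vd);		// config lists vd as a spare
+ *	spa_spare_activate(vd);		// vd is now replacing a failed device
+ *	...
+ *	spa_spare_remove(vd);		// spare dropped from the config
+ *
+ * spa_spare_exists() can be queried at any point to learn whether a guid
+ * is a known spare and, if active, which pool is using it.
+ */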
+
+/*
+ * Level 2 ARC devices are tracked globally for the same reasons as spares.
+ * Cache devices currently only support one pool per cache device, so for
+ * these devices the aux reference count never exceeds 1.
+ */
+
+static int
+spa_l2cache_compare(const void *a, const void *b)
+{
+ return (spa_aux_compare(a, b));
+}
+
+void
+spa_l2cache_add(vdev_t *vd)
+{
+ mutex_enter(&spa_l2cache_lock);
+ ASSERT(!vd->vdev_isl2cache);
+ spa_aux_add(vd, &spa_l2cache_avl);
+ vd->vdev_isl2cache = B_TRUE;
+ mutex_exit(&spa_l2cache_lock);
+}
+
+void
+spa_l2cache_remove(vdev_t *vd)
+{
+ mutex_enter(&spa_l2cache_lock);
+ ASSERT(vd->vdev_isl2cache);
+ spa_aux_remove(vd, &spa_l2cache_avl);
+ vd->vdev_isl2cache = B_FALSE;
+ mutex_exit(&spa_l2cache_lock);
+}
+
+boolean_t
+spa_l2cache_exists(uint64_t guid, uint64_t *pool)
+{
+ boolean_t found;
+
+ mutex_enter(&spa_l2cache_lock);
+ found = spa_aux_exists(guid, pool, NULL, &spa_l2cache_avl);
+ mutex_exit(&spa_l2cache_lock);
+
+ return (found);
+}
+
+void
+spa_l2cache_activate(vdev_t *vd)
+{
+ mutex_enter(&spa_l2cache_lock);
+ ASSERT(vd->vdev_isl2cache);
+ spa_aux_activate(vd, &spa_l2cache_avl);
+ mutex_exit(&spa_l2cache_lock);
+}
+
+/*
+ * ==========================================================================
+ * SPA vdev locking
+ * ==========================================================================
+ */
+
+/*
+ * Lock the given spa_t for the purpose of adding or removing a vdev.
+ * Grabs the global spa_namespace_lock plus the spa config lock for writing.
+ * It returns the next transaction group for the spa_t.
+ */
+uint64_t
+spa_vdev_enter(spa_t *spa)
+{
+ mutex_enter(&spa->spa_vdev_top_lock);
+ mutex_enter(&spa_namespace_lock);
+
+ vdev_autotrim_stop_all(spa);
+
+ return (spa_vdev_config_enter(spa));
+}
+
+/*
+ * The same as spa_vdev_enter() above but additionally takes the guid of
+ * the vdev being detached. When there is a rebuild in progress it will be
+ * suspended while the vdev tree is modified, then resumed by spa_vdev_exit().
+ * The rebuild is canceled if only a single child remains after the detach.
+ */
+uint64_t
+spa_vdev_detach_enter(spa_t *spa, uint64_t guid)
+{
+ mutex_enter(&spa->spa_vdev_top_lock);
+ mutex_enter(&spa_namespace_lock);
+
+ vdev_autotrim_stop_all(spa);
+
+ if (guid != 0) {
+ vdev_t *vd = spa_lookup_by_guid(spa, guid, B_FALSE);
+ if (vd) {
+ vdev_rebuild_stop_wait(vd->vdev_top);
+ }
+ }
+
+ return (spa_vdev_config_enter(spa));
+}
+
+/*
+ * Internal implementation for spa_vdev_enter(). Used when a vdev
+ * operation requires multiple syncs (e.g. removing a device) while
+ * keeping the spa_namespace_lock held.
+ */
+uint64_t
+spa_vdev_config_enter(spa_t *spa)
+{
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+ spa_config_enter(spa, SCL_ALL, spa, RW_WRITER);
+
+ return (spa_last_synced_txg(spa) + 1);
+}
+
+/*
+ * Used in combination with spa_vdev_config_enter() to allow the syncing
+ * of multiple transactions without releasing the spa_namespace_lock.
+ */
+void
+spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error, char *tag)
+{
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+ int config_changed = B_FALSE;
+
+ ASSERT(txg > spa_last_synced_txg(spa));
+
+ spa->spa_pending_vdev = NULL;
+
+ /*
+ * Reassess the DTLs.
+ */
+ vdev_dtl_reassess(spa->spa_root_vdev, 0, 0, B_FALSE, B_FALSE);
+
+ if (error == 0 && !list_is_empty(&spa->spa_config_dirty_list)) {
+ config_changed = B_TRUE;
+ spa->spa_config_generation++;
+ }
+
+ /*
+ * Verify the metaslab classes.
+ */
+ ASSERT(metaslab_class_validate(spa_normal_class(spa)) == 0);
+ ASSERT(metaslab_class_validate(spa_log_class(spa)) == 0);
+ ASSERT(metaslab_class_validate(spa_embedded_log_class(spa)) == 0);
+ ASSERT(metaslab_class_validate(spa_special_class(spa)) == 0);
+ ASSERT(metaslab_class_validate(spa_dedup_class(spa)) == 0);
+
+ spa_config_exit(spa, SCL_ALL, spa);
+
+ /*
+ * Panic the system if the specified tag requires it. This
+ * is useful for ensuring that configurations are updated
+ * transactionally.
+ */
+ if (zio_injection_enabled)
+ zio_handle_panic_injection(spa, tag, 0);
+
+ /*
+ * Note: this txg_wait_synced() is important because it ensures
+ * that there won't be more than one config change per txg.
+ * This allows us to use the txg as the generation number.
+ */
+ if (error == 0)
+ txg_wait_synced(spa->spa_dsl_pool, txg);
+
+ if (vd != NULL) {
+ ASSERT(!vd->vdev_detached || vd->vdev_dtl_sm == NULL);
+ if (vd->vdev_ops->vdev_op_leaf) {
+ mutex_enter(&vd->vdev_initialize_lock);
+ vdev_initialize_stop(vd, VDEV_INITIALIZE_CANCELED,
+ NULL);
+ mutex_exit(&vd->vdev_initialize_lock);
+
+ mutex_enter(&vd->vdev_trim_lock);
+ vdev_trim_stop(vd, VDEV_TRIM_CANCELED, NULL);
+ mutex_exit(&vd->vdev_trim_lock);
+ }
+
+ /*
+ * The vdev may be both a leaf and top-level device.
+ */
+ vdev_autotrim_stop_wait(vd);
+
+ spa_config_enter(spa, SCL_ALL, spa, RW_WRITER);
+ vdev_free(vd);
+ spa_config_exit(spa, SCL_ALL, spa);
+ }
+
+ /*
+ * If the config changed, update the config cache.
+ */
+ if (config_changed)
+ spa_write_cachefile(spa, B_FALSE, B_TRUE);
+}
+
+/*
+ * Unlock the spa_t after adding or removing a vdev. Besides undoing the
+ * locking of spa_vdev_enter(), we also want to make sure the transactions have
+ * synced to disk, and then update the global configuration cache with the new
+ * information.
+ */
+int
+spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error)
+{
+ vdev_autotrim_restart(spa);
+ vdev_rebuild_restart(spa);
+
+ spa_vdev_config_exit(spa, vd, txg, error, FTAG);
+ mutex_exit(&spa_namespace_lock);
+ mutex_exit(&spa->spa_vdev_top_lock);
+
+ return (error);
+}
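+
+/*
+ * A minimal usage sketch of the enter/exit pairing above (the top-level
+ * vdev operations follow this general shape):
+ *
+ *	uint64_t txg = spa_vdev_enter(spa);
+ *	... modify the vdev tree ...
+ *	return (spa_vdev_exit(spa, NULL, txg, error));
+ *
+ * On success spa_vdev_exit() waits for the txg to sync, so the
+ * configuration change is on disk before control returns to the caller.
+ * A non-NULL vd passed to spa_vdev_exit() is freed once it has synced.
+ */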
+
+/*
+ * Lock the given spa_t for the purpose of changing vdev state.
+ */
+void
+spa_vdev_state_enter(spa_t *spa, int oplocks)
+{
+ int locks = SCL_STATE_ALL | oplocks;
+
+ /*
+ * Root pools may need to read from the underlying devfs filesystem
+ * when opening up a vdev. Unfortunately, if we're holding the
+ * SCL_ZIO lock it will result in a deadlock when we try to issue
+ * the read from the root filesystem. Instead we "prefetch"
+ * the associated vnodes that we need prior to opening the
+ * underlying devices and cache them so that we can prevent
+ * any I/O when we are doing the actual open.
+ */
+ if (spa_is_root(spa)) {
+ int low = locks & ~(SCL_ZIO - 1);
+ int high = locks & ~low;
+
+ spa_config_enter(spa, high, spa, RW_WRITER);
+ vdev_hold(spa->spa_root_vdev);
+ spa_config_enter(spa, low, spa, RW_WRITER);
+ } else {
+ spa_config_enter(spa, locks, spa, RW_WRITER);
+ }
+ spa->spa_vdev_locks = locks;
+}
+
+int
+spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error)
+{
+ boolean_t config_changed = B_FALSE;
+ vdev_t *vdev_top;
+
+ if (vd == NULL || vd == spa->spa_root_vdev) {
+ vdev_top = spa->spa_root_vdev;
+ } else {
+ vdev_top = vd->vdev_top;
+ }
+
+ if (vd != NULL || error == 0)
+ vdev_dtl_reassess(vdev_top, 0, 0, B_FALSE, B_FALSE);
+
+ if (vd != NULL) {
+ if (vd != spa->spa_root_vdev)
+ vdev_state_dirty(vdev_top);
+
+ config_changed = B_TRUE;
+ spa->spa_config_generation++;
+ }
+
+ if (spa_is_root(spa))
+ vdev_rele(spa->spa_root_vdev);
+
+ ASSERT3U(spa->spa_vdev_locks, >=, SCL_STATE_ALL);
+ spa_config_exit(spa, spa->spa_vdev_locks, spa);
+
+ /*
+ * If anything changed, wait for it to sync. This ensures that,
+ * from the system administrator's perspective, zpool(8) commands
+ * are synchronous. This is important for things like zpool offline:
+ * when the command completes, you expect no further I/O from ZFS.
+ */
+ if (vd != NULL)
+ txg_wait_synced(spa->spa_dsl_pool, 0);
+
+ /*
+ * If the config changed, update the config cache.
+ */
+ if (config_changed) {
+ mutex_enter(&spa_namespace_lock);
+ spa_write_cachefile(spa, B_FALSE, B_TRUE);
+ mutex_exit(&spa_namespace_lock);
+ }
+
+ return (error);
+}
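+
+/*
+ * Sketch of the state-change pairing (vdev online/offline and similar
+ * paths follow roughly this pattern):
+ *
+ *	spa_vdev_state_enter(spa, SCL_NONE);
+ *	vd = spa_lookup_by_guid(spa, guid, B_TRUE);
+ *	... change vd's state ...
+ *	return (spa_vdev_state_exit(spa, vd, error));
+ *
+ * Passing the affected vdev back to spa_vdev_state_exit() dirties its
+ * top-level vdev and waits for the state change to sync.
+ */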
+
+/*
+ * ==========================================================================
+ * Miscellaneous functions
+ * ==========================================================================
+ */
+
+void
+spa_activate_mos_feature(spa_t *spa, const char *feature, dmu_tx_t *tx)
+{
+ if (!nvlist_exists(spa->spa_label_features, feature)) {
+ fnvlist_add_boolean(spa->spa_label_features, feature);
+ /*
+ * When we are creating the pool (tx_txg==TXG_INITIAL), we can't
+ * dirty the vdev config because lock SCL_CONFIG is not held.
+ * Thankfully, in this case we don't need to dirty the config
+ * because it will be written out anyway when we finish
+ * creating the pool.
+ */
+ if (tx->tx_txg != TXG_INITIAL)
+ vdev_config_dirty(spa->spa_root_vdev);
+ }
+}
+
+void
+spa_deactivate_mos_feature(spa_t *spa, const char *feature)
+{
+ if (nvlist_remove_all(spa->spa_label_features, feature) == 0)
+ vdev_config_dirty(spa->spa_root_vdev);
+}
+
+/*
+ * Return the spa_t associated with the given pool_guid, if it exists. If
+ * device_guid is non-zero, determine whether the pool exists *and* contains
+ * a device with the specified device_guid.
+ */
+spa_t *
+spa_by_guid(uint64_t pool_guid, uint64_t device_guid)
+{
+ spa_t *spa;
+ avl_tree_t *t = &spa_namespace_avl;
+
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+ for (spa = avl_first(t); spa != NULL; spa = AVL_NEXT(t, spa)) {
+ if (spa->spa_state == POOL_STATE_UNINITIALIZED)
+ continue;
+ if (spa->spa_root_vdev == NULL)
+ continue;
+ if (spa_guid(spa) == pool_guid) {
+ if (device_guid == 0)
+ break;
+
+ if (vdev_lookup_by_guid(spa->spa_root_vdev,
+ device_guid) != NULL)
+ break;
+
+ /*
+ * Check any devices we may be in the process of adding.
+ */
+ if (spa->spa_pending_vdev) {
+ if (vdev_lookup_by_guid(spa->spa_pending_vdev,
+ device_guid) != NULL)
+ break;
+ }
+ }
+ }
+
+ return (spa);
+}
+
+/*
+ * Determine whether a pool with the given pool_guid exists.
+ */
+boolean_t
+spa_guid_exists(uint64_t pool_guid, uint64_t device_guid)
+{
+ return (spa_by_guid(pool_guid, device_guid) != NULL);
+}
+
+char *
+spa_strdup(const char *s)
+{
+ size_t len;
+ char *new;
+
+ len = strlen(s);
+ new = kmem_alloc(len + 1, KM_SLEEP);
+ bcopy(s, new, len);
+ new[len] = '\0';
+
+ return (new);
+}
+
+void
+spa_strfree(char *s)
+{
+ kmem_free(s, strlen(s) + 1);
+}
+
+uint64_t
+spa_get_random(uint64_t range)
+{
+ uint64_t r;
+
+ ASSERT(range != 0);
+
+ if (range == 1)
+ return (0);
+
+ (void) random_get_pseudo_bytes((void *)&r, sizeof (uint64_t));
+
+ return (r % range);
+}
+
+uint64_t
+spa_generate_guid(spa_t *spa)
+{
+ uint64_t guid = spa_get_random(-1ULL);
+
+ if (spa != NULL) {
+ while (guid == 0 || spa_guid_exists(spa_guid(spa), guid))
+ guid = spa_get_random(-1ULL);
+ } else {
+ while (guid == 0 || spa_guid_exists(guid, 0))
+ guid = spa_get_random(-1ULL);
+ }
+
+ return (guid);
+}
+
+void
+snprintf_blkptr(char *buf, size_t buflen, const blkptr_t *bp)
+{
+ char type[256];
+ char *checksum = NULL;
+ char *compress = NULL;
+
+ if (bp != NULL) {
+ if (BP_GET_TYPE(bp) & DMU_OT_NEWTYPE) {
+ dmu_object_byteswap_t bswap =
+ DMU_OT_BYTESWAP(BP_GET_TYPE(bp));
+ (void) snprintf(type, sizeof (type), "bswap %s %s",
+ DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) ?
+ "metadata" : "data",
+ dmu_ot_byteswap[bswap].ob_name);
+ } else {
+ (void) strlcpy(type, dmu_ot[BP_GET_TYPE(bp)].ot_name,
+ sizeof (type));
+ }
+ if (!BP_IS_EMBEDDED(bp)) {
+ checksum =
+ zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_name;
+ }
+ compress = zio_compress_table[BP_GET_COMPRESS(bp)].ci_name;
+ }
+
+ SNPRINTF_BLKPTR(snprintf, ' ', buf, buflen, bp, type, checksum,
+ compress);
+}
+
+void
+spa_freeze(spa_t *spa)
+{
+ uint64_t freeze_txg = 0;
+
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+ if (spa->spa_freeze_txg == UINT64_MAX) {
+ freeze_txg = spa_last_synced_txg(spa) + TXG_SIZE;
+ spa->spa_freeze_txg = freeze_txg;
+ }
+ spa_config_exit(spa, SCL_ALL, FTAG);
+ if (freeze_txg != 0)
+ txg_wait_synced(spa_get_dsl(spa), freeze_txg);
+}
+
+void
+zfs_panic_recover(const char *fmt, ...)
+{
+ va_list adx;
+
+ va_start(adx, fmt);
+ vcmn_err(zfs_recover ? CE_WARN : CE_PANIC, fmt, adx);
+ va_end(adx);
+}
+
+/*
+ * This is a stripped-down version of strtoull, suitable only for converting
+ * lowercase hexadecimal numbers that don't overflow.
+ */
+uint64_t
+zfs_strtonum(const char *str, char **nptr)
+{
+ uint64_t val = 0;
+ char c;
+ int digit;
+
+ while ((c = *str) != '\0') {
+ if (c >= '0' && c <= '9')
+ digit = c - '0';
+ else if (c >= 'a' && c <= 'f')
+ digit = 10 + c - 'a';
+ else
+ break;
+
+ val *= 16;
+ val += digit;
+
+ str++;
+ }
+
+ if (nptr)
+ *nptr = (char *)str;
+
+ return (val);
+}
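+
+/*
+ * For example, zfs_strtonum("1a2b", &end) returns 0x1a2b and leaves 'end'
+ * pointing at the terminating NUL; parsing stops at the first character
+ * that is not a lowercase hexadecimal digit.
+ */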
+
+void
+spa_activate_allocation_classes(spa_t *spa, dmu_tx_t *tx)
+{
+ /*
+ * We bump the feature refcount for each special vdev added to the pool.
+ */
+ ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_ALLOCATION_CLASSES));
+ spa_feature_incr(spa, SPA_FEATURE_ALLOCATION_CLASSES, tx);
+}
+
+/*
+ * ==========================================================================
+ * Accessor functions
+ * ==========================================================================
+ */
+
+boolean_t
+spa_shutting_down(spa_t *spa)
+{
+ return (spa->spa_async_suspended);
+}
+
+dsl_pool_t *
+spa_get_dsl(spa_t *spa)
+{
+ return (spa->spa_dsl_pool);
+}
+
+boolean_t
+spa_is_initializing(spa_t *spa)
+{
+ return (spa->spa_is_initializing);
+}
+
+boolean_t
+spa_indirect_vdevs_loaded(spa_t *spa)
+{
+ return (spa->spa_indirect_vdevs_loaded);
+}
+
+blkptr_t *
+spa_get_rootblkptr(spa_t *spa)
+{
+ return (&spa->spa_ubsync.ub_rootbp);
+}
+
+void
+spa_set_rootblkptr(spa_t *spa, const blkptr_t *bp)
+{
+ spa->spa_uberblock.ub_rootbp = *bp;
+}
+
+void
+spa_altroot(spa_t *spa, char *buf, size_t buflen)
+{
+ if (spa->spa_root == NULL)
+ buf[0] = '\0';
+ else
+ (void) strncpy(buf, spa->spa_root, buflen);
+}
+
+int
+spa_sync_pass(spa_t *spa)
+{
+ return (spa->spa_sync_pass);
+}
+
+char *
+spa_name(spa_t *spa)
+{
+ return (spa->spa_name);
+}
+
+uint64_t
+spa_guid(spa_t *spa)
+{
+ dsl_pool_t *dp = spa_get_dsl(spa);
+ uint64_t guid;
+
+ /*
+ * If we fail to parse the config during spa_load(), we can go through
+ * the error path (which posts an ereport) and end up here with no root
+ * vdev. We stash the original pool guid in 'spa_config_guid' to handle
+ * this case.
+ */
+ if (spa->spa_root_vdev == NULL)
+ return (spa->spa_config_guid);
+
+ guid = spa->spa_last_synced_guid != 0 ?
+ spa->spa_last_synced_guid : spa->spa_root_vdev->vdev_guid;
+
+ /*
+ * Return the most recently synced out guid unless we're
+ * in syncing context.
+ */
+ if (dp && dsl_pool_sync_context(dp))
+ return (spa->spa_root_vdev->vdev_guid);
+ else
+ return (guid);
+}
+
+uint64_t
+spa_load_guid(spa_t *spa)
+{
+ /*
+ * This is a GUID that exists solely as a reference for the
+ * purposes of the arc. It is generated at load time, and
+ * is never written to persistent storage.
+ */
+ return (spa->spa_load_guid);
+}
+
+uint64_t
+spa_last_synced_txg(spa_t *spa)
+{
+ return (spa->spa_ubsync.ub_txg);
+}
+
+uint64_t
+spa_first_txg(spa_t *spa)
+{
+ return (spa->spa_first_txg);
+}
+
+uint64_t
+spa_syncing_txg(spa_t *spa)
+{
+ return (spa->spa_syncing_txg);
+}
+
+/*
+ * Return the last txg where data can be dirtied. The final txgs
+ * will be used just to clear out any deferred frees that remain.
+ */
+uint64_t
+spa_final_dirty_txg(spa_t *spa)
+{
+ return (spa->spa_final_txg - TXG_DEFER_SIZE);
+}
+
+pool_state_t
+spa_state(spa_t *spa)
+{
+ return (spa->spa_state);
+}
+
+spa_load_state_t
+spa_load_state(spa_t *spa)
+{
+ return (spa->spa_load_state);
+}
+
+uint64_t
+spa_freeze_txg(spa_t *spa)
+{
+ return (spa->spa_freeze_txg);
+}
+
+/*
+ * Return the inflated asize for a logical write in bytes. This is used by the
+ * DMU to calculate the space a logical write will require on disk.
+ * If lsize is smaller than the largest physical block size allocatable on this
+ * pool, we use its value instead, since the write will end up using the whole
+ * block anyway.
+ */
+uint64_t
+spa_get_worst_case_asize(spa_t *spa, uint64_t lsize)
+{
+ if (lsize == 0)
+ return (0); /* No inflation needed */
+ return (MAX(lsize, 1 << spa->spa_max_ashift) * spa_asize_inflation);
+}
+
+/*
+ * Return the amount of slop space in bytes. It is typically 1/32 of the pool
+ * (3.2%), minus the embedded log space. On very small pools, it may be
+ * slightly larger than this. The embedded log space is not included in
+ * spa_dspace. By subtracting it, the usable space (per "zfs list") is a
+ * constant 97% of the total space, regardless of metaslab size (assuming the
+ * default spa_slop_shift=5 and a non-tiny pool).
+ *
+ * See the comment above spa_slop_shift for more details.
+ */
+uint64_t
+spa_get_slop_space(spa_t *spa)
+{
+ uint64_t space = spa_get_dspace(spa);
+ uint64_t slop = space >> spa_slop_shift;
+
+ /*
+ * Subtract the embedded log space, but no more than half the (3.2%)
+ * unusable space. Note, the "no more than half" is only relevant if
+ * zfs_embedded_slog_min_ms >> spa_slop_shift < 2, which is not true by
+ * default.
+ */
+ uint64_t embedded_log =
+ metaslab_class_get_dspace(spa_embedded_log_class(spa));
+ slop -= MIN(embedded_log, slop >> 1);
+
+ /*
+ * Slop space should be at least spa_min_slop, but no more than half
+ * the entire pool.
+ */
+ slop = MAX(slop, MIN(space >> 1, spa_min_slop));
+ return (slop);
+}
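+
+/*
+ * Worked example, assuming the default spa_slop_shift of 5: for a pool
+ * with 1 TiB of dspace the initial slop is 1 TiB / 32 = 32 GiB.  If the
+ * embedded log class holds 1 GiB, the slop becomes 31 GiB (the
+ * subtraction is capped at half the slop).  On a tiny pool the final
+ * MAX() keeps at least spa_min_slop, or half the pool if that is
+ * smaller, in reserve.
+ */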
+
+uint64_t
+spa_get_dspace(spa_t *spa)
+{
+ return (spa->spa_dspace);
+}
+
+uint64_t
+spa_get_checkpoint_space(spa_t *spa)
+{
+ return (spa->spa_checkpoint_info.sci_dspace);
+}
+
+void
+spa_update_dspace(spa_t *spa)
+{
+ spa->spa_dspace = metaslab_class_get_dspace(spa_normal_class(spa)) +
+ ddt_get_dedup_dspace(spa);
+ if (spa->spa_vdev_removal != NULL) {
+ /*
+ * We can't allocate from the removing device, so subtract
+ * its size if it was included in dspace (i.e. if this is a
+ * normal-class vdev, not special/dedup). This prevents the
+ * DMU/DSL from filling up the (now smaller) pool while we
+ * are in the middle of removing the device.
+ *
+ * Note that the DMU/DSL doesn't actually know or care
+ * how much space is allocated (it does its own tracking
+ * of how much space has been logically used). So it
+ * doesn't matter that the data we are moving may be
+ * allocated twice (on the old device and the new
+ * device).
+ */
+ spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
+ vdev_t *vd =
+ vdev_lookup_top(spa, spa->spa_vdev_removal->svr_vdev_id);
+ if (vd->vdev_mg->mg_class == spa_normal_class(spa)) {
+ spa->spa_dspace -= spa_deflate(spa) ?
+ vd->vdev_stat.vs_dspace : vd->vdev_stat.vs_space;
+ }
+ spa_config_exit(spa, SCL_VDEV, FTAG);
+ }
+}
+
+/*
+ * Return the failure mode that has been set for this pool. The default
+ * behavior will be to block all I/Os when a complete failure occurs.
+ */
+uint64_t
+spa_get_failmode(spa_t *spa)
+{
+ return (spa->spa_failmode);
+}
+
+boolean_t
+spa_suspended(spa_t *spa)
+{
+ return (spa->spa_suspended != ZIO_SUSPEND_NONE);
+}
+
+uint64_t
+spa_version(spa_t *spa)
+{
+ return (spa->spa_ubsync.ub_version);
+}
+
+boolean_t
+spa_deflate(spa_t *spa)
+{
+ return (spa->spa_deflate);
+}
+
+metaslab_class_t *
+spa_normal_class(spa_t *spa)
+{
+ return (spa->spa_normal_class);
+}
+
+metaslab_class_t *
+spa_log_class(spa_t *spa)
+{
+ return (spa->spa_log_class);
+}
+
+metaslab_class_t *
+spa_embedded_log_class(spa_t *spa)
+{
+ return (spa->spa_embedded_log_class);
+}
+
+metaslab_class_t *
+spa_special_class(spa_t *spa)
+{
+ return (spa->spa_special_class);
+}
+
+metaslab_class_t *
+spa_dedup_class(spa_t *spa)
+{
+ return (spa->spa_dedup_class);
+}
+
+/*
+ * Locate an appropriate allocation class
+ */
+metaslab_class_t *
+spa_preferred_class(spa_t *spa, uint64_t size, dmu_object_type_t objtype,
+ uint_t level, uint_t special_smallblk)
+{
+ /*
+ * ZIL allocations determine their class in zio_alloc_zil().
+ */
+ ASSERT(objtype != DMU_OT_INTENT_LOG);
+
+ boolean_t has_special_class = spa->spa_special_class->mc_groups != 0;
+
+ if (DMU_OT_IS_DDT(objtype)) {
+ if (spa->spa_dedup_class->mc_groups != 0)
+ return (spa_dedup_class(spa));
+ else if (has_special_class && zfs_ddt_data_is_special)
+ return (spa_special_class(spa));
+ else
+ return (spa_normal_class(spa));
+ }
+
+ /* Indirect blocks for user data can land in special if allowed */
+ if (level > 0 && (DMU_OT_IS_FILE(objtype) || objtype == DMU_OT_ZVOL)) {
+ if (has_special_class && zfs_user_indirect_is_special)
+ return (spa_special_class(spa));
+ else
+ return (spa_normal_class(spa));
+ }
+
+ if (DMU_OT_IS_METADATA(objtype) || level > 0) {
+ if (has_special_class)
+ return (spa_special_class(spa));
+ else
+ return (spa_normal_class(spa));
+ }
+
+ /*
+ * Allow small file blocks in special class in some cases (like
+ * for the dRAID vdev feature). But always leave a reserve of
+ * zfs_special_class_metadata_reserve_pct exclusively for metadata.
+ */
+ if (DMU_OT_IS_FILE(objtype) &&
+ has_special_class && size <= special_smallblk) {
+ metaslab_class_t *special = spa_special_class(spa);
+ uint64_t alloc = metaslab_class_get_alloc(special);
+ uint64_t space = metaslab_class_get_space(special);
+ uint64_t limit =
+ (space * (100 - zfs_special_class_metadata_reserve_pct))
+ / 100;
+
+ if (alloc < limit)
+ return (special);
+ }
+
+ return (spa_normal_class(spa));
+}
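+
+/*
+ * For instance, on a pool with special vdevs configured: metadata blocks
+ * are steered to the special class; DDT blocks prefer a dedup class and
+ * fall back to special when zfs_ddt_data_is_special is set; user-data
+ * indirect blocks go there only when zfs_user_indirect_is_special is
+ * set; and a level-0 file block lands there only when its size is at or
+ * below special_smallblk and the class still has room above the
+ * zfs_special_class_metadata_reserve_pct reserve.  Everything else falls
+ * back to the normal class.
+ */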
+
+void
+spa_evicting_os_register(spa_t *spa, objset_t *os)
+{
+ mutex_enter(&spa->spa_evicting_os_lock);
+ list_insert_head(&spa->spa_evicting_os_list, os);
+ mutex_exit(&spa->spa_evicting_os_lock);
+}
+
+void
+spa_evicting_os_deregister(spa_t *spa, objset_t *os)
+{
+ mutex_enter(&spa->spa_evicting_os_lock);
+ list_remove(&spa->spa_evicting_os_list, os);
+ cv_broadcast(&spa->spa_evicting_os_cv);
+ mutex_exit(&spa->spa_evicting_os_lock);
+}
+
+void
+spa_evicting_os_wait(spa_t *spa)
+{
+ mutex_enter(&spa->spa_evicting_os_lock);
+ while (!list_is_empty(&spa->spa_evicting_os_list))
+ cv_wait(&spa->spa_evicting_os_cv, &spa->spa_evicting_os_lock);
+ mutex_exit(&spa->spa_evicting_os_lock);
+
+ dmu_buf_user_evict_wait();
+}
+
+int
+spa_max_replication(spa_t *spa)
+{
+ /*
+ * As of SPA_VERSION == SPA_VERSION_DITTO_BLOCKS, we are able to
+ * handle BPs with more than one DVA allocated. Set our max
+ * replication level accordingly.
+ */
+ if (spa_version(spa) < SPA_VERSION_DITTO_BLOCKS)
+ return (1);
+ return (MIN(SPA_DVAS_PER_BP, spa_max_replication_override));
+}
+
+int
+spa_prev_software_version(spa_t *spa)
+{
+ return (spa->spa_prev_software_version);
+}
+
+uint64_t
+spa_deadman_synctime(spa_t *spa)
+{
+ return (spa->spa_deadman_synctime);
+}
+
+spa_autotrim_t
+spa_get_autotrim(spa_t *spa)
+{
+ return (spa->spa_autotrim);
+}
+
+uint64_t
+spa_deadman_ziotime(spa_t *spa)
+{
+ return (spa->spa_deadman_ziotime);
+}
+
+uint64_t
+spa_get_deadman_failmode(spa_t *spa)
+{
+ return (spa->spa_deadman_failmode);
+}
+
+void
+spa_set_deadman_failmode(spa_t *spa, const char *failmode)
+{
+ if (strcmp(failmode, "wait") == 0)
+ spa->spa_deadman_failmode = ZIO_FAILURE_MODE_WAIT;
+ else if (strcmp(failmode, "continue") == 0)
+ spa->spa_deadman_failmode = ZIO_FAILURE_MODE_CONTINUE;
+ else if (strcmp(failmode, "panic") == 0)
+ spa->spa_deadman_failmode = ZIO_FAILURE_MODE_PANIC;
+ else
+ spa->spa_deadman_failmode = ZIO_FAILURE_MODE_WAIT;
+}
+
+void
+spa_set_deadman_ziotime(hrtime_t ns)
+{
+ spa_t *spa = NULL;
+
+ if (spa_mode_global != SPA_MODE_UNINIT) {
+ mutex_enter(&spa_namespace_lock);
+ while ((spa = spa_next(spa)) != NULL)
+ spa->spa_deadman_ziotime = ns;
+ mutex_exit(&spa_namespace_lock);
+ }
+}
+
+void
+spa_set_deadman_synctime(hrtime_t ns)
+{
+ spa_t *spa = NULL;
+
+ if (spa_mode_global != SPA_MODE_UNINIT) {
+ mutex_enter(&spa_namespace_lock);
+ while ((spa = spa_next(spa)) != NULL)
+ spa->spa_deadman_synctime = ns;
+ mutex_exit(&spa_namespace_lock);
+ }
+}
+
+uint64_t
+dva_get_dsize_sync(spa_t *spa, const dva_t *dva)
+{
+ uint64_t asize = DVA_GET_ASIZE(dva);
+ uint64_t dsize = asize;
+
+ ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
+
+ if (asize != 0 && spa->spa_deflate) {
+ vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva));
+ if (vd != NULL)
+ dsize = (asize >> SPA_MINBLOCKSHIFT) *
+ vd->vdev_deflate_ratio;
+ }
+
+ return (dsize);
+}
+
+uint64_t
+bp_get_dsize_sync(spa_t *spa, const blkptr_t *bp)
+{
+ uint64_t dsize = 0;
+
+ for (int d = 0; d < BP_GET_NDVAS(bp); d++)
+ dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]);
+
+ return (dsize);
+}
+
+uint64_t
+bp_get_dsize(spa_t *spa, const blkptr_t *bp)
+{
+ uint64_t dsize = 0;
+
+ spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
+
+ for (int d = 0; d < BP_GET_NDVAS(bp); d++)
+ dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]);
+
+ spa_config_exit(spa, SCL_VDEV, FTAG);
+
+ return (dsize);
+}
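+
+/*
+ * Note that dsize sums over every DVA in the block pointer, so a block
+ * written with copies=2 reports the deflated size of both allocations.
+ * bp_get_dsize() is the variant that takes SCL_VDEV itself for callers
+ * that do not already hold a config lock.
+ */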
+
+uint64_t
+spa_dirty_data(spa_t *spa)
+{
+ return (spa->spa_dsl_pool->dp_dirty_total);
+}
+
+/*
+ * ==========================================================================
+ * SPA Import Progress Routines
+ * ==========================================================================
+ */
+
+typedef struct spa_import_progress {
+ uint64_t pool_guid; /* unique id for updates */
+ char *pool_name;
+ spa_load_state_t spa_load_state;
+ uint64_t mmp_sec_remaining; /* MMP activity check */
+ uint64_t spa_load_max_txg; /* rewind txg */
+ procfs_list_node_t smh_node;
+} spa_import_progress_t;
+
+spa_history_list_t *spa_import_progress_list = NULL;
+
+static int
+spa_import_progress_show_header(struct seq_file *f)
+{
+ seq_printf(f, "%-20s %-14s %-14s %-12s %s\n", "pool_guid",
+ "load_state", "multihost_secs", "max_txg",
+ "pool_name");
+ return (0);
+}
+
+static int
+spa_import_progress_show(struct seq_file *f, void *data)
+{
+ spa_import_progress_t *sip = (spa_import_progress_t *)data;
+
+ seq_printf(f, "%-20llu %-14llu %-14llu %-12llu %s\n",
+ (u_longlong_t)sip->pool_guid, (u_longlong_t)sip->spa_load_state,
+ (u_longlong_t)sip->mmp_sec_remaining,
+ (u_longlong_t)sip->spa_load_max_txg,
+ (sip->pool_name ? sip->pool_name : "-"));
+
+ return (0);
+}
+
+/* Remove oldest elements from list until there are no more than 'size' left */
+static void
+spa_import_progress_truncate(spa_history_list_t *shl, unsigned int size)
+{
+ spa_import_progress_t *sip;
+ while (shl->size > size) {
+ sip = list_remove_head(&shl->procfs_list.pl_list);
+ if (sip->pool_name)
+ spa_strfree(sip->pool_name);
+ kmem_free(sip, sizeof (spa_import_progress_t));
+ shl->size--;
+ }
+
+ IMPLY(size == 0, list_is_empty(&shl->procfs_list.pl_list));
+}
+
+static void
+spa_import_progress_init(void)
+{
+ spa_import_progress_list = kmem_zalloc(sizeof (spa_history_list_t),
+ KM_SLEEP);
+
+ spa_import_progress_list->size = 0;
+
+ spa_import_progress_list->procfs_list.pl_private =
+ spa_import_progress_list;
+
+ procfs_list_install("zfs",
+ NULL,
+ "import_progress",
+ 0644,
+ &spa_import_progress_list->procfs_list,
+ spa_import_progress_show,
+ spa_import_progress_show_header,
+ NULL,
+ offsetof(spa_import_progress_t, smh_node));
+}
+
+static void
+spa_import_progress_destroy(void)
+{
+ spa_history_list_t *shl = spa_import_progress_list;
+ procfs_list_uninstall(&shl->procfs_list);
+ spa_import_progress_truncate(shl, 0);
+ procfs_list_destroy(&shl->procfs_list);
+ kmem_free(shl, sizeof (spa_history_list_t));
+}
+
+int
+spa_import_progress_set_state(uint64_t pool_guid,
+ spa_load_state_t load_state)
+{
+ spa_history_list_t *shl = spa_import_progress_list;
+ spa_import_progress_t *sip;
+ int error = ENOENT;
+
+ if (shl->size == 0)
+ return (0);
+
+ mutex_enter(&shl->procfs_list.pl_lock);
+ for (sip = list_tail(&shl->procfs_list.pl_list); sip != NULL;
+ sip = list_prev(&shl->procfs_list.pl_list, sip)) {
+ if (sip->pool_guid == pool_guid) {
+ sip->spa_load_state = load_state;
+ error = 0;
+ break;
+ }
+ }
+ mutex_exit(&shl->procfs_list.pl_lock);
+
+ return (error);
+}
+
+int
+spa_import_progress_set_max_txg(uint64_t pool_guid, uint64_t load_max_txg)
+{
+ spa_history_list_t *shl = spa_import_progress_list;
+ spa_import_progress_t *sip;
+ int error = ENOENT;
+
+ if (shl->size == 0)
+ return (0);
+
+ mutex_enter(&shl->procfs_list.pl_lock);
+ for (sip = list_tail(&shl->procfs_list.pl_list); sip != NULL;
+ sip = list_prev(&shl->procfs_list.pl_list, sip)) {
+ if (sip->pool_guid == pool_guid) {
+ sip->spa_load_max_txg = load_max_txg;
+ error = 0;
+ break;
+ }
+ }
+ mutex_exit(&shl->procfs_list.pl_lock);
+
+ return (error);
+}
+
+int
+spa_import_progress_set_mmp_check(uint64_t pool_guid,
+ uint64_t mmp_sec_remaining)
+{
+ spa_history_list_t *shl = spa_import_progress_list;
+ spa_import_progress_t *sip;
+ int error = ENOENT;
+
+ if (shl->size == 0)
+ return (0);
+
+ mutex_enter(&shl->procfs_list.pl_lock);
+ for (sip = list_tail(&shl->procfs_list.pl_list); sip != NULL;
+ sip = list_prev(&shl->procfs_list.pl_list, sip)) {
+ if (sip->pool_guid == pool_guid) {
+ sip->mmp_sec_remaining = mmp_sec_remaining;
+ error = 0;
+ break;
+ }
+ }
+ mutex_exit(&shl->procfs_list.pl_lock);
+
+ return (error);
+}
+
+/*
+ * A new import is in progress; add an entry.
+ */
+void
+spa_import_progress_add(spa_t *spa)
+{
+ spa_history_list_t *shl = spa_import_progress_list;
+ spa_import_progress_t *sip;
+ char *poolname = NULL;
+
+ sip = kmem_zalloc(sizeof (spa_import_progress_t), KM_SLEEP);
+ sip->pool_guid = spa_guid(spa);
+
+ (void) nvlist_lookup_string(spa->spa_config, ZPOOL_CONFIG_POOL_NAME,
+ &poolname);
+ if (poolname == NULL)
+ poolname = spa_name(spa);
+ sip->pool_name = spa_strdup(poolname);
+ sip->spa_load_state = spa_load_state(spa);
+
+ mutex_enter(&shl->procfs_list.pl_lock);
+ procfs_list_add(&shl->procfs_list, sip);
+ shl->size++;
+ mutex_exit(&shl->procfs_list.pl_lock);
+}
+
+void
+spa_import_progress_remove(uint64_t pool_guid)
+{
+ spa_history_list_t *shl = spa_import_progress_list;
+ spa_import_progress_t *sip;
+
+ mutex_enter(&shl->procfs_list.pl_lock);
+ for (sip = list_tail(&shl->procfs_list.pl_list); sip != NULL;
+ sip = list_prev(&shl->procfs_list.pl_list, sip)) {
+ if (sip->pool_guid == pool_guid) {
+ if (sip->pool_name)
+ spa_strfree(sip->pool_name);
+ list_remove(&shl->procfs_list.pl_list, sip);
+ shl->size--;
+ kmem_free(sip, sizeof (spa_import_progress_t));
+ break;
+ }
+ }
+ mutex_exit(&shl->procfs_list.pl_lock);
+}
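+
+/*
+ * Rough lifetime of an entry (the callers live in the import path):
+ * spa_import_progress_add() creates it when an import starts, the
+ * spa_import_progress_set_*() helpers update it as the load state,
+ * rewind txg, or MMP countdown change, and spa_import_progress_remove()
+ * drops it once the import completes or fails.
+ */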
+
+/*
+ * ==========================================================================
+ * Initialization and Termination
+ * ==========================================================================
+ */
+
+static int
+spa_name_compare(const void *a1, const void *a2)
+{
+ const spa_t *s1 = a1;
+ const spa_t *s2 = a2;
+ int s;
+
+ s = strcmp(s1->spa_name, s2->spa_name);
+
+ return (TREE_ISIGN(s));
+}
+
+void
+spa_boot_init(void)
+{
+ spa_config_load();
+}
+
+void
+spa_init(spa_mode_t mode)
+{
+ mutex_init(&spa_namespace_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&spa_spare_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&spa_l2cache_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&spa_namespace_cv, NULL, CV_DEFAULT, NULL);
+
+ avl_create(&spa_namespace_avl, spa_name_compare, sizeof (spa_t),
+ offsetof(spa_t, spa_avl));
+
+ avl_create(&spa_spare_avl, spa_spare_compare, sizeof (spa_aux_t),
+ offsetof(spa_aux_t, aux_avl));
+
+ avl_create(&spa_l2cache_avl, spa_l2cache_compare, sizeof (spa_aux_t),
+ offsetof(spa_aux_t, aux_avl));
+
+ spa_mode_global = mode;
+
+#ifndef _KERNEL
+ if (spa_mode_global != SPA_MODE_READ && dprintf_find_string("watch")) {
+ struct sigaction sa;
+
+ sa.sa_flags = SA_SIGINFO;
+ sigemptyset(&sa.sa_mask);
+ sa.sa_sigaction = arc_buf_sigsegv;
+
+ if (sigaction(SIGSEGV, &sa, NULL) == -1) {
+ perror("could not enable watchpoints: "
+ "sigaction(SIGSEGV, ...) = ");
+ } else {
+ arc_watch = B_TRUE;
+ }
+ }
+#endif
+
+ fm_init();
+ zfs_refcount_init();
+ unique_init();
+ zfs_btree_init();
+ metaslab_stat_init();
+ ddt_init();
+ zio_init();
+ dmu_init();
+ zil_init();
+ vdev_cache_stat_init();
+ vdev_mirror_stat_init();
+ vdev_raidz_math_init();
+ vdev_file_init();
+ zfs_prop_init();
+ zpool_prop_init();
+ zpool_feature_init();
+ spa_config_load();
+ l2arc_start();
+ scan_init();
+ qat_init();
+ spa_import_progress_init();
+}
+
+void
+spa_fini(void)
+{
+ l2arc_stop();
+
+ spa_evict_all();
+
+ vdev_file_fini();
+ vdev_cache_stat_fini();
+ vdev_mirror_stat_fini();
+ vdev_raidz_math_fini();
+ zil_fini();
+ dmu_fini();
+ zio_fini();
+ ddt_fini();
+ metaslab_stat_fini();
+ zfs_btree_fini();
+ unique_fini();
+ zfs_refcount_fini();
+ fm_fini();
+ scan_fini();
+ qat_fini();
+ spa_import_progress_destroy();
+
+ avl_destroy(&spa_namespace_avl);
+ avl_destroy(&spa_spare_avl);
+ avl_destroy(&spa_l2cache_avl);
+
+ cv_destroy(&spa_namespace_cv);
+ mutex_destroy(&spa_namespace_lock);
+ mutex_destroy(&spa_spare_lock);
+ mutex_destroy(&spa_l2cache_lock);
+}
+
+/*
+ * Return whether this pool has a dedicated slog device. No locking needed.
+ * It's not a problem if the wrong answer is returned as it's only for
+ * performance and not correctness.
+ */
+boolean_t
+spa_has_slogs(spa_t *spa)
+{
+ return (spa->spa_log_class->mc_groups != 0);
+}
+
+spa_log_state_t
+spa_get_log_state(spa_t *spa)
+{
+ return (spa->spa_log_state);
+}
+
+void
+spa_set_log_state(spa_t *spa, spa_log_state_t state)
+{
+ spa->spa_log_state = state;
+}
+
+boolean_t
+spa_is_root(spa_t *spa)
+{
+ return (spa->spa_is_root);
+}
+
+boolean_t
+spa_writeable(spa_t *spa)
+{
+ return (!!(spa->spa_mode & SPA_MODE_WRITE) && spa->spa_trust_config);
+}
+
+/*
+ * Returns true if there is a pending sync task in any of the current
+ * syncing txg, the current quiescing txg, or the current open txg.
+ */
+boolean_t
+spa_has_pending_synctask(spa_t *spa)
+{
+ return (!txg_all_lists_empty(&spa->spa_dsl_pool->dp_sync_tasks) ||
+ !txg_all_lists_empty(&spa->spa_dsl_pool->dp_early_sync_tasks));
+}
+
+spa_mode_t
+spa_mode(spa_t *spa)
+{
+ return (spa->spa_mode);
+}
+
+uint64_t
+spa_bootfs(spa_t *spa)
+{
+ return (spa->spa_bootfs);
+}
+
+uint64_t
+spa_delegation(spa_t *spa)
+{
+ return (spa->spa_delegation);
+}
+
+objset_t *
+spa_meta_objset(spa_t *spa)
+{
+ return (spa->spa_meta_objset);
+}
+
+enum zio_checksum
+spa_dedup_checksum(spa_t *spa)
+{
+ return (spa->spa_dedup_checksum);
+}
+
+/*
+ * Reset pool scan stats per scan pass (or reboot).
+ */
+void
+spa_scan_stat_init(spa_t *spa)
+{
+ /* data not stored on disk */
+ spa->spa_scan_pass_start = gethrestime_sec();
+ if (dsl_scan_is_paused_scrub(spa->spa_dsl_pool->dp_scan))
+ spa->spa_scan_pass_scrub_pause = spa->spa_scan_pass_start;
+ else
+ spa->spa_scan_pass_scrub_pause = 0;
+ spa->spa_scan_pass_scrub_spent_paused = 0;
+ spa->spa_scan_pass_exam = 0;
+ spa->spa_scan_pass_issued = 0;
+ vdev_scan_stat_init(spa->spa_root_vdev);
+}
+
+/*
+ * Get scan stats for zpool status reports
+ */
+int
+spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps)
+{
+ dsl_scan_t *scn = spa->spa_dsl_pool ? spa->spa_dsl_pool->dp_scan : NULL;
+
+ if (scn == NULL || scn->scn_phys.scn_func == POOL_SCAN_NONE)
+ return (SET_ERROR(ENOENT));
+ bzero(ps, sizeof (pool_scan_stat_t));
+
+ /* data stored on disk */
+ ps->pss_func = scn->scn_phys.scn_func;
+ ps->pss_state = scn->scn_phys.scn_state;
+ ps->pss_start_time = scn->scn_phys.scn_start_time;
+ ps->pss_end_time = scn->scn_phys.scn_end_time;
+ ps->pss_to_examine = scn->scn_phys.scn_to_examine;
+ ps->pss_examined = scn->scn_phys.scn_examined;
+ ps->pss_to_process = scn->scn_phys.scn_to_process;
+ ps->pss_processed = scn->scn_phys.scn_processed;
+ ps->pss_errors = scn->scn_phys.scn_errors;
+
+ /* data not stored on disk */
+ ps->pss_pass_exam = spa->spa_scan_pass_exam;
+ ps->pss_pass_start = spa->spa_scan_pass_start;
+ ps->pss_pass_scrub_pause = spa->spa_scan_pass_scrub_pause;
+ ps->pss_pass_scrub_spent_paused = spa->spa_scan_pass_scrub_spent_paused;
+ ps->pss_pass_issued = spa->spa_scan_pass_issued;
+ ps->pss_issued =
+ scn->scn_issued_before_pass + spa->spa_scan_pass_issued;
+
+ return (0);
+}
+
+int
+spa_maxblocksize(spa_t *spa)
+{
+ if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS))
+ return (SPA_MAXBLOCKSIZE);
+ else
+ return (SPA_OLD_MAXBLOCKSIZE);
+}
+
+
+/*
+ * Returns the txg in which the last device removal completed. No indirect
+ * mappings
+ * have been added since this txg.
+ */
+uint64_t
+spa_get_last_removal_txg(spa_t *spa)
+{
+ uint64_t vdevid;
+ uint64_t ret = -1ULL;
+
+ spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
+ /*
+ * sr_prev_indirect_vdev is only modified while holding all the
+ * config locks, so it is sufficient to hold SCL_VDEV as reader when
+ * examining it.
+ */
+ vdevid = spa->spa_removing_phys.sr_prev_indirect_vdev;
+
+ while (vdevid != -1ULL) {
+ vdev_t *vd = vdev_lookup_top(spa, vdevid);
+ vdev_indirect_births_t *vib = vd->vdev_indirect_births;
+
+ ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
+
+ /*
+ * If the removal did not remap any data, we don't care.
+ */
+ if (vdev_indirect_births_count(vib) != 0) {
+ ret = vdev_indirect_births_last_entry_txg(vib);
+ break;
+ }
+
+ vdevid = vd->vdev_indirect_config.vic_prev_indirect_vdev;
+ }
+ spa_config_exit(spa, SCL_VDEV, FTAG);
+
+ IMPLY(ret != -1ULL,
+ spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REMOVAL));
+
+ return (ret);
+}
+
+int
+spa_maxdnodesize(spa_t *spa)
+{
+ if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_DNODE))
+ return (DNODE_MAX_SIZE);
+ else
+ return (DNODE_MIN_SIZE);
+}
+
+boolean_t
+spa_multihost(spa_t *spa)
+{
+ return (spa->spa_multihost ? B_TRUE : B_FALSE);
+}
+
+uint32_t
+spa_get_hostid(spa_t *spa)
+{
+ return (spa->spa_hostid);
+}
+
+boolean_t
+spa_trust_config(spa_t *spa)
+{
+ return (spa->spa_trust_config);
+}
+
+uint64_t
+spa_missing_tvds_allowed(spa_t *spa)
+{
+ return (spa->spa_missing_tvds_allowed);
+}
+
+space_map_t *
+spa_syncing_log_sm(spa_t *spa)
+{
+ return (spa->spa_syncing_log_sm);
+}
+
+void
+spa_set_missing_tvds(spa_t *spa, uint64_t missing)
+{
+ spa->spa_missing_tvds = missing;
+}
+
+/*
+ * Return the pool state string ("ONLINE", "DEGRADED", "SUSPENDED", etc.).
+ */
+const char *
+spa_state_to_name(spa_t *spa)
+{
+ ASSERT3P(spa, !=, NULL);
+
+ /*
+	 * It is possible for the spa to exist without a root vdev
+	 * while it transitions during import/export.
+ */
+ vdev_t *rvd = spa->spa_root_vdev;
+ if (rvd == NULL) {
+ return ("TRANSITIONING");
+ }
+ vdev_state_t state = rvd->vdev_state;
+ vdev_aux_t aux = rvd->vdev_stat.vs_aux;
+
+ if (spa_suspended(spa) &&
+ (spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE))
+ return ("SUSPENDED");
+
+ switch (state) {
+ case VDEV_STATE_CLOSED:
+ case VDEV_STATE_OFFLINE:
+ return ("OFFLINE");
+ case VDEV_STATE_REMOVED:
+ return ("REMOVED");
+ case VDEV_STATE_CANT_OPEN:
+ if (aux == VDEV_AUX_CORRUPT_DATA || aux == VDEV_AUX_BAD_LOG)
+ return ("FAULTED");
+ else if (aux == VDEV_AUX_SPLIT_POOL)
+ return ("SPLIT");
+ else
+ return ("UNAVAIL");
+ case VDEV_STATE_FAULTED:
+ return ("FAULTED");
+ case VDEV_STATE_DEGRADED:
+ return ("DEGRADED");
+ case VDEV_STATE_HEALTHY:
+ return ("ONLINE");
+ default:
+ break;
+ }
+
+ return ("UNKNOWN");
+}
+
+boolean_t
+spa_top_vdevs_spacemap_addressable(spa_t *spa)
+{
+ vdev_t *rvd = spa->spa_root_vdev;
+ for (uint64_t c = 0; c < rvd->vdev_children; c++) {
+ if (!vdev_is_spacemap_addressable(rvd->vdev_child[c]))
+ return (B_FALSE);
+ }
+ return (B_TRUE);
+}
+
+boolean_t
+spa_has_checkpoint(spa_t *spa)
+{
+ return (spa->spa_checkpoint_txg != 0);
+}
+
+boolean_t
+spa_importing_readonly_checkpoint(spa_t *spa)
+{
+ return ((spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT) &&
+ spa->spa_mode == SPA_MODE_READ);
+}
+
+uint64_t
+spa_min_claim_txg(spa_t *spa)
+{
+ uint64_t checkpoint_txg = spa->spa_uberblock.ub_checkpoint_txg;
+
+ if (checkpoint_txg != 0)
+ return (checkpoint_txg + 1);
+
+ return (spa->spa_first_txg);
+}
+
+/*
+ * If there is a checkpoint, async destroys may consume more space from
+ * the pool instead of freeing it. In an attempt to save the pool from
+ * getting suspended when it is about to run out of space, we stop
+ * processing async destroys.
+ */
+boolean_t
+spa_suspend_async_destroy(spa_t *spa)
+{
+ dsl_pool_t *dp = spa_get_dsl(spa);
+
+ uint64_t unreserved = dsl_pool_unreserved_space(dp,
+ ZFS_SPACE_CHECK_EXTRA_RESERVED);
+ uint64_t used = dsl_dir_phys(dp->dp_root_dir)->dd_used_bytes;
+ uint64_t avail = (unreserved > used) ? (unreserved - used) : 0;
+
+ if (spa_has_checkpoint(spa) && avail == 0)
+ return (B_TRUE);
+
+ return (B_FALSE);
+}
+
+#if defined(_KERNEL)
+
+int
+param_set_deadman_failmode_common(const char *val)
+{
+ spa_t *spa = NULL;
+ char *p;
+
+ if (val == NULL)
+ return (SET_ERROR(EINVAL));
+
+ if ((p = strchr(val, '\n')) != NULL)
+ *p = '\0';
+
+ if (strcmp(val, "wait") != 0 && strcmp(val, "continue") != 0 &&
+	    strcmp(val, "panic") != 0)
+ return (SET_ERROR(EINVAL));
+
+ if (spa_mode_global != SPA_MODE_UNINIT) {
+ mutex_enter(&spa_namespace_lock);
+ while ((spa = spa_next(spa)) != NULL)
+ spa_set_deadman_failmode(spa, val);
+ mutex_exit(&spa_namespace_lock);
+ }
+
+ return (0);
+}
+#endif
+
+/* Namespace manipulation */
+EXPORT_SYMBOL(spa_lookup);
+EXPORT_SYMBOL(spa_add);
+EXPORT_SYMBOL(spa_remove);
+EXPORT_SYMBOL(spa_next);
+
+/* Refcount functions */
+EXPORT_SYMBOL(spa_open_ref);
+EXPORT_SYMBOL(spa_close);
+EXPORT_SYMBOL(spa_refcount_zero);
+
+/* Pool configuration lock */
+EXPORT_SYMBOL(spa_config_tryenter);
+EXPORT_SYMBOL(spa_config_enter);
+EXPORT_SYMBOL(spa_config_exit);
+EXPORT_SYMBOL(spa_config_held);
+
+/* Pool vdev add/remove lock */
+EXPORT_SYMBOL(spa_vdev_enter);
+EXPORT_SYMBOL(spa_vdev_exit);
+
+/* Pool vdev state change lock */
+EXPORT_SYMBOL(spa_vdev_state_enter);
+EXPORT_SYMBOL(spa_vdev_state_exit);
+
+/* Accessor functions */
+EXPORT_SYMBOL(spa_shutting_down);
+EXPORT_SYMBOL(spa_get_dsl);
+EXPORT_SYMBOL(spa_get_rootblkptr);
+EXPORT_SYMBOL(spa_set_rootblkptr);
+EXPORT_SYMBOL(spa_altroot);
+EXPORT_SYMBOL(spa_sync_pass);
+EXPORT_SYMBOL(spa_name);
+EXPORT_SYMBOL(spa_guid);
+EXPORT_SYMBOL(spa_last_synced_txg);
+EXPORT_SYMBOL(spa_first_txg);
+EXPORT_SYMBOL(spa_syncing_txg);
+EXPORT_SYMBOL(spa_version);
+EXPORT_SYMBOL(spa_state);
+EXPORT_SYMBOL(spa_load_state);
+EXPORT_SYMBOL(spa_freeze_txg);
+EXPORT_SYMBOL(spa_get_dspace);
+EXPORT_SYMBOL(spa_update_dspace);
+EXPORT_SYMBOL(spa_deflate);
+EXPORT_SYMBOL(spa_normal_class);
+EXPORT_SYMBOL(spa_log_class);
+EXPORT_SYMBOL(spa_special_class);
+EXPORT_SYMBOL(spa_preferred_class);
+EXPORT_SYMBOL(spa_max_replication);
+EXPORT_SYMBOL(spa_prev_software_version);
+EXPORT_SYMBOL(spa_get_failmode);
+EXPORT_SYMBOL(spa_suspended);
+EXPORT_SYMBOL(spa_bootfs);
+EXPORT_SYMBOL(spa_delegation);
+EXPORT_SYMBOL(spa_meta_objset);
+EXPORT_SYMBOL(spa_maxblocksize);
+EXPORT_SYMBOL(spa_maxdnodesize);
+
+/* Miscellaneous support routines */
+EXPORT_SYMBOL(spa_guid_exists);
+EXPORT_SYMBOL(spa_strdup);
+EXPORT_SYMBOL(spa_strfree);
+EXPORT_SYMBOL(spa_get_random);
+EXPORT_SYMBOL(spa_generate_guid);
+EXPORT_SYMBOL(snprintf_blkptr);
+EXPORT_SYMBOL(spa_freeze);
+EXPORT_SYMBOL(spa_upgrade);
+EXPORT_SYMBOL(spa_evict_all);
+EXPORT_SYMBOL(spa_lookup_by_guid);
+EXPORT_SYMBOL(spa_has_spare);
+EXPORT_SYMBOL(dva_get_dsize_sync);
+EXPORT_SYMBOL(bp_get_dsize_sync);
+EXPORT_SYMBOL(bp_get_dsize);
+EXPORT_SYMBOL(spa_has_slogs);
+EXPORT_SYMBOL(spa_is_root);
+EXPORT_SYMBOL(spa_writeable);
+EXPORT_SYMBOL(spa_mode);
+EXPORT_SYMBOL(spa_namespace_lock);
+EXPORT_SYMBOL(spa_trust_config);
+EXPORT_SYMBOL(spa_missing_tvds_allowed);
+EXPORT_SYMBOL(spa_set_missing_tvds);
+EXPORT_SYMBOL(spa_state_to_name);
+EXPORT_SYMBOL(spa_importing_readonly_checkpoint);
+EXPORT_SYMBOL(spa_min_claim_txg);
+EXPORT_SYMBOL(spa_suspend_async_destroy);
+EXPORT_SYMBOL(spa_has_checkpoint);
+EXPORT_SYMBOL(spa_top_vdevs_spacemap_addressable);
+
+ZFS_MODULE_PARAM(zfs, zfs_, flags, UINT, ZMOD_RW,
+ "Set additional debugging flags");
+
+ZFS_MODULE_PARAM(zfs, zfs_, recover, INT, ZMOD_RW,
+ "Set to attempt to recover from fatal errors");
+
+ZFS_MODULE_PARAM(zfs, zfs_, free_leak_on_eio, INT, ZMOD_RW,
+ "Set to ignore IO errors during free and permanently leak the space");
+
+ZFS_MODULE_PARAM(zfs, zfs_, deadman_checktime_ms, ULONG, ZMOD_RW,
+ "Dead I/O check interval in milliseconds");
+
+ZFS_MODULE_PARAM(zfs, zfs_, deadman_enabled, INT, ZMOD_RW,
+ "Enable deadman timer");
+
+ZFS_MODULE_PARAM(zfs_spa, spa_, asize_inflation, INT, ZMOD_RW,
+ "SPA size estimate multiplication factor");
+
+ZFS_MODULE_PARAM(zfs, zfs_, ddt_data_is_special, INT, ZMOD_RW,
+ "Place DDT data into the special class");
+
+ZFS_MODULE_PARAM(zfs, zfs_, user_indirect_is_special, INT, ZMOD_RW,
+ "Place user data indirect blocks into the special class");
+
+/* BEGIN CSTYLED */
+ZFS_MODULE_PARAM_CALL(zfs_deadman, zfs_deadman_, failmode,
+ param_set_deadman_failmode, param_get_charp, ZMOD_RW,
+ "Failmode for deadman timer");
+
+ZFS_MODULE_PARAM_CALL(zfs_deadman, zfs_deadman_, synctime_ms,
+ param_set_deadman_synctime, param_get_ulong, ZMOD_RW,
+ "Pool sync expiration time in milliseconds");
+
+ZFS_MODULE_PARAM_CALL(zfs_deadman, zfs_deadman_, ziotime_ms,
+ param_set_deadman_ziotime, param_get_ulong, ZMOD_RW,
+ "IO expiration time in milliseconds");
+
+ZFS_MODULE_PARAM(zfs, zfs_, special_class_metadata_reserve_pct, INT, ZMOD_RW,
+ "Small file blocks in special vdevs depends on this much "
+ "free space available");
+/* END CSTYLED */
+
+ZFS_MODULE_PARAM_CALL(zfs_spa, spa_, slop_shift, param_set_slop_shift,
+ param_get_int, ZMOD_RW, "Reserved free space in pool");
diff --git a/sys/contrib/openzfs/module/zfs/spa_stats.c b/sys/contrib/openzfs/module/zfs/spa_stats.c
new file mode 100644
index 000000000000..c3eacc14239e
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/spa_stats.c
@@ -0,0 +1,1029 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa_impl.h>
+#include <sys/vdev_impl.h>
+#include <sys/spa.h>
+#include <zfs_comutil.h>
+
+/*
+ * Keeps stats on last N reads per spa_t, disabled by default.
+ */
+int zfs_read_history = 0;
+
+/*
+ * Include cache hits in history, disabled by default.
+ */
+int zfs_read_history_hits = 0;
+
+/*
+ * Keeps stats on the last 100 txgs by default.
+ */
+int zfs_txg_history = 100;
+
+/*
+ * Keeps stats on the last N MMP updates, disabled by default.
+ */
+int zfs_multihost_history = 0;
+
+/*
+ * ==========================================================================
+ * SPA Read History Routines
+ * ==========================================================================
+ */
+
+/*
+ * Read statistics - Information exported regarding each arc_read call
+ */
+typedef struct spa_read_history {
+ hrtime_t start; /* time read completed */
+ uint64_t objset; /* read from this objset */
+ uint64_t object; /* read of this object number */
+ uint64_t level; /* block's indirection level */
+ uint64_t blkid; /* read of this block id */
+ char origin[24]; /* read originated from here */
+ uint32_t aflags; /* ARC flags (cached, prefetch, etc.) */
+ pid_t pid; /* PID of task doing read */
+ char comm[16]; /* process name of task doing read */
+ procfs_list_node_t srh_node;
+} spa_read_history_t;
+
+static int
+spa_read_history_show_header(struct seq_file *f)
+{
+ seq_printf(f, "%-8s %-16s %-8s %-8s %-8s %-8s %-8s "
+ "%-24s %-8s %-16s\n", "UID", "start", "objset", "object",
+ "level", "blkid", "aflags", "origin", "pid", "process");
+
+ return (0);
+}
+
+static int
+spa_read_history_show(struct seq_file *f, void *data)
+{
+ spa_read_history_t *srh = (spa_read_history_t *)data;
+
+ seq_printf(f, "%-8llu %-16llu 0x%-6llx "
+ "%-8lli %-8lli %-8lli 0x%-6x %-24s %-8i %-16s\n",
+ (u_longlong_t)srh->srh_node.pln_id, srh->start,
+ (longlong_t)srh->objset, (longlong_t)srh->object,
+ (longlong_t)srh->level, (longlong_t)srh->blkid,
+ srh->aflags, srh->origin, srh->pid, srh->comm);
+
+ return (0);
+}
+
+/* Remove oldest elements from list until there are no more than 'size' left */
+static void
+spa_read_history_truncate(spa_history_list_t *shl, unsigned int size)
+{
+ spa_read_history_t *srh;
+ while (shl->size > size) {
+ srh = list_remove_head(&shl->procfs_list.pl_list);
+ ASSERT3P(srh, !=, NULL);
+ kmem_free(srh, sizeof (spa_read_history_t));
+ shl->size--;
+ }
+
+ if (size == 0)
+ ASSERT(list_is_empty(&shl->procfs_list.pl_list));
+}
+
+static int
+spa_read_history_clear(procfs_list_t *procfs_list)
+{
+ spa_history_list_t *shl = procfs_list->pl_private;
+ mutex_enter(&procfs_list->pl_lock);
+ spa_read_history_truncate(shl, 0);
+ mutex_exit(&procfs_list->pl_lock);
+ return (0);
+}
+
+static void
+spa_read_history_init(spa_t *spa)
+{
+ spa_history_list_t *shl = &spa->spa_stats.read_history;
+
+ shl->size = 0;
+ shl->procfs_list.pl_private = shl;
+ procfs_list_install("zfs",
+ spa_name(spa),
+ "reads",
+ 0600,
+ &shl->procfs_list,
+ spa_read_history_show,
+ spa_read_history_show_header,
+ spa_read_history_clear,
+ offsetof(spa_read_history_t, srh_node));
+}
+
+static void
+spa_read_history_destroy(spa_t *spa)
+{
+ spa_history_list_t *shl = &spa->spa_stats.read_history;
+ procfs_list_uninstall(&shl->procfs_list);
+ spa_read_history_truncate(shl, 0);
+ procfs_list_destroy(&shl->procfs_list);
+}
+
+void
+spa_read_history_add(spa_t *spa, const zbookmark_phys_t *zb, uint32_t aflags)
+{
+ spa_history_list_t *shl = &spa->spa_stats.read_history;
+ spa_read_history_t *srh;
+
+ ASSERT3P(spa, !=, NULL);
+ ASSERT3P(zb, !=, NULL);
+
+ if (zfs_read_history == 0 && shl->size == 0)
+ return;
+
+ if (zfs_read_history_hits == 0 && (aflags & ARC_FLAG_CACHED))
+ return;
+
+ srh = kmem_zalloc(sizeof (spa_read_history_t), KM_SLEEP);
+ strlcpy(srh->comm, getcomm(), sizeof (srh->comm));
+ srh->start = gethrtime();
+ srh->objset = zb->zb_objset;
+ srh->object = zb->zb_object;
+ srh->level = zb->zb_level;
+ srh->blkid = zb->zb_blkid;
+ srh->aflags = aflags;
+ srh->pid = getpid();
+
+ mutex_enter(&shl->procfs_list.pl_lock);
+
+ procfs_list_add(&shl->procfs_list, srh);
+ shl->size++;
+
+ spa_read_history_truncate(shl, zfs_read_history);
+
+ mutex_exit(&shl->procfs_list.pl_lock);
+}
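+
+/*
+ * Usage note (the exact proc path is platform dependent): setting the
+ * zfs_read_history module parameter to a non-zero value enables
+ * collection, zfs_read_history_hits additionally records ARC hits, and
+ * the history is exposed through the procfs_list installed above
+ * (e.g. /proc/spl/kstat/zfs/<pool>/reads on Linux).  Writing to that
+ * file clears the list via spa_read_history_clear().
+ */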
+
+/*
+ * ==========================================================================
+ * SPA TXG History Routines
+ * ==========================================================================
+ */
+
+/*
+ * Txg statistics - Information exported regarding each txg sync
+ */
+
+typedef struct spa_txg_history {
+ uint64_t txg; /* txg id */
+ txg_state_t state; /* active txg state */
+ uint64_t nread; /* number of bytes read */
+ uint64_t nwritten; /* number of bytes written */
+ uint64_t reads; /* number of read operations */
+ uint64_t writes; /* number of write operations */
+ uint64_t ndirty; /* number of dirty bytes */
+ hrtime_t times[TXG_STATE_COMMITTED]; /* completion times */
+ procfs_list_node_t sth_node;
+} spa_txg_history_t;
+
+static int
+spa_txg_history_show_header(struct seq_file *f)
+{
+ seq_printf(f, "%-8s %-16s %-5s %-12s %-12s %-12s "
+ "%-8s %-8s %-12s %-12s %-12s %-12s\n", "txg", "birth", "state",
+ "ndirty", "nread", "nwritten", "reads", "writes",
+ "otime", "qtime", "wtime", "stime");
+ return (0);
+}
+
+static int
+spa_txg_history_show(struct seq_file *f, void *data)
+{
+ spa_txg_history_t *sth = (spa_txg_history_t *)data;
+ uint64_t open = 0, quiesce = 0, wait = 0, sync = 0;
+ char state;
+
+ switch (sth->state) {
+ case TXG_STATE_BIRTH: state = 'B'; break;
+ case TXG_STATE_OPEN: state = 'O'; break;
+ case TXG_STATE_QUIESCED: state = 'Q'; break;
+ case TXG_STATE_WAIT_FOR_SYNC: state = 'W'; break;
+ case TXG_STATE_SYNCED: state = 'S'; break;
+ case TXG_STATE_COMMITTED: state = 'C'; break;
+ default: state = '?'; break;
+ }
+
+ if (sth->times[TXG_STATE_OPEN])
+ open = sth->times[TXG_STATE_OPEN] -
+ sth->times[TXG_STATE_BIRTH];
+
+ if (sth->times[TXG_STATE_QUIESCED])
+ quiesce = sth->times[TXG_STATE_QUIESCED] -
+ sth->times[TXG_STATE_OPEN];
+
+ if (sth->times[TXG_STATE_WAIT_FOR_SYNC])
+ wait = sth->times[TXG_STATE_WAIT_FOR_SYNC] -
+ sth->times[TXG_STATE_QUIESCED];
+
+ if (sth->times[TXG_STATE_SYNCED])
+ sync = sth->times[TXG_STATE_SYNCED] -
+ sth->times[TXG_STATE_WAIT_FOR_SYNC];
+
+ seq_printf(f, "%-8llu %-16llu %-5c %-12llu "
+ "%-12llu %-12llu %-8llu %-8llu %-12llu %-12llu %-12llu %-12llu\n",
+ (longlong_t)sth->txg, sth->times[TXG_STATE_BIRTH], state,
+ (u_longlong_t)sth->ndirty,
+ (u_longlong_t)sth->nread, (u_longlong_t)sth->nwritten,
+ (u_longlong_t)sth->reads, (u_longlong_t)sth->writes,
+ (u_longlong_t)open, (u_longlong_t)quiesce, (u_longlong_t)wait,
+ (u_longlong_t)sync);
+
+ return (0);
+}
+
+/* Remove oldest elements from list until there are no more than 'size' left */
+static void
+spa_txg_history_truncate(spa_history_list_t *shl, unsigned int size)
+{
+ spa_txg_history_t *sth;
+ while (shl->size > size) {
+ sth = list_remove_head(&shl->procfs_list.pl_list);
+ ASSERT3P(sth, !=, NULL);
+ kmem_free(sth, sizeof (spa_txg_history_t));
+ shl->size--;
+ }
+
+ if (size == 0)
+ ASSERT(list_is_empty(&shl->procfs_list.pl_list));
+
+}
+
+static int
+spa_txg_history_clear(procfs_list_t *procfs_list)
+{
+ spa_history_list_t *shl = procfs_list->pl_private;
+ mutex_enter(&procfs_list->pl_lock);
+ spa_txg_history_truncate(shl, 0);
+ mutex_exit(&procfs_list->pl_lock);
+ return (0);
+}
+
+static void
+spa_txg_history_init(spa_t *spa)
+{
+ spa_history_list_t *shl = &spa->spa_stats.txg_history;
+
+ shl->size = 0;
+ shl->procfs_list.pl_private = shl;
+ procfs_list_install("zfs",
+ spa_name(spa),
+ "txgs",
+ 0644,
+ &shl->procfs_list,
+ spa_txg_history_show,
+ spa_txg_history_show_header,
+ spa_txg_history_clear,
+ offsetof(spa_txg_history_t, sth_node));
+}
+
+static void
+spa_txg_history_destroy(spa_t *spa)
+{
+ spa_history_list_t *shl = &spa->spa_stats.txg_history;
+ procfs_list_uninstall(&shl->procfs_list);
+ spa_txg_history_truncate(shl, 0);
+ procfs_list_destroy(&shl->procfs_list);
+}
+
+/*
+ * Add a new txg to the historical record.
+ */
+void
+spa_txg_history_add(spa_t *spa, uint64_t txg, hrtime_t birth_time)
+{
+ spa_history_list_t *shl = &spa->spa_stats.txg_history;
+ spa_txg_history_t *sth;
+
+ if (zfs_txg_history == 0 && shl->size == 0)
+ return;
+
+ sth = kmem_zalloc(sizeof (spa_txg_history_t), KM_SLEEP);
+ sth->txg = txg;
+ sth->state = TXG_STATE_OPEN;
+ sth->times[TXG_STATE_BIRTH] = birth_time;
+
+ mutex_enter(&shl->procfs_list.pl_lock);
+ procfs_list_add(&shl->procfs_list, sth);
+ shl->size++;
+ spa_txg_history_truncate(shl, zfs_txg_history);
+ mutex_exit(&shl->procfs_list.pl_lock);
+}
+
+/*
+ * Set txg state completion time and increment current state.
+ */
+int
+spa_txg_history_set(spa_t *spa, uint64_t txg, txg_state_t completed_state,
+ hrtime_t completed_time)
+{
+ spa_history_list_t *shl = &spa->spa_stats.txg_history;
+ spa_txg_history_t *sth;
+ int error = ENOENT;
+
+ if (zfs_txg_history == 0)
+ return (0);
+
+ mutex_enter(&shl->procfs_list.pl_lock);
+ for (sth = list_tail(&shl->procfs_list.pl_list); sth != NULL;
+ sth = list_prev(&shl->procfs_list.pl_list, sth)) {
+ if (sth->txg == txg) {
+ sth->times[completed_state] = completed_time;
+ sth->state++;
+ error = 0;
+ break;
+ }
+ }
+ mutex_exit(&shl->procfs_list.pl_lock);
+
+ return (error);
+}
+
+/*
+ * Set txg IO stats.
+ */
+static int
+spa_txg_history_set_io(spa_t *spa, uint64_t txg, uint64_t nread,
+ uint64_t nwritten, uint64_t reads, uint64_t writes, uint64_t ndirty)
+{
+ spa_history_list_t *shl = &spa->spa_stats.txg_history;
+ spa_txg_history_t *sth;
+ int error = ENOENT;
+
+ if (zfs_txg_history == 0)
+ return (0);
+
+ mutex_enter(&shl->procfs_list.pl_lock);
+ for (sth = list_tail(&shl->procfs_list.pl_list); sth != NULL;
+ sth = list_prev(&shl->procfs_list.pl_list, sth)) {
+ if (sth->txg == txg) {
+ sth->nread = nread;
+ sth->nwritten = nwritten;
+ sth->reads = reads;
+ sth->writes = writes;
+ sth->ndirty = ndirty;
+ error = 0;
+ break;
+ }
+ }
+ mutex_exit(&shl->procfs_list.pl_lock);
+
+ return (error);
+}
+
+txg_stat_t *
+spa_txg_history_init_io(spa_t *spa, uint64_t txg, dsl_pool_t *dp)
+{
+ txg_stat_t *ts;
+
+ if (zfs_txg_history == 0)
+ return (NULL);
+
+ ts = kmem_alloc(sizeof (txg_stat_t), KM_SLEEP);
+
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+ vdev_get_stats(spa->spa_root_vdev, &ts->vs1);
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+
+ ts->txg = txg;
+ ts->ndirty = dp->dp_dirty_pertxg[txg & TXG_MASK];
+
+ spa_txg_history_set(spa, txg, TXG_STATE_WAIT_FOR_SYNC, gethrtime());
+
+ return (ts);
+}
+
+void
+spa_txg_history_fini_io(spa_t *spa, txg_stat_t *ts)
+{
+ if (ts == NULL)
+ return;
+
+ if (zfs_txg_history == 0) {
+ kmem_free(ts, sizeof (txg_stat_t));
+ return;
+ }
+
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+ vdev_get_stats(spa->spa_root_vdev, &ts->vs2);
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+
+ spa_txg_history_set(spa, ts->txg, TXG_STATE_SYNCED, gethrtime());
+ spa_txg_history_set_io(spa, ts->txg,
+ ts->vs2.vs_bytes[ZIO_TYPE_READ] - ts->vs1.vs_bytes[ZIO_TYPE_READ],
+ ts->vs2.vs_bytes[ZIO_TYPE_WRITE] - ts->vs1.vs_bytes[ZIO_TYPE_WRITE],
+ ts->vs2.vs_ops[ZIO_TYPE_READ] - ts->vs1.vs_ops[ZIO_TYPE_READ],
+ ts->vs2.vs_ops[ZIO_TYPE_WRITE] - ts->vs1.vs_ops[ZIO_TYPE_WRITE],
+ ts->ndirty);
+
+ kmem_free(ts, sizeof (txg_stat_t));
+}
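+/*
+ * Illustrative sketch (not part of the upstream import): the two helpers
+ * above are meant to bracket a txg sync so that the recorded read/write
+ * deltas cover exactly that sync. A caller on the sync path would do
+ * roughly the following; spa_sync() is shown here only as the assumed
+ * sync step.
+ */
+#if 0 /* example only */
+static void
+spa_sync_with_txg_stats(spa_t *spa, dsl_pool_t *dp, uint64_t txg)
+{
+ /* Snapshot vdev stats and record TXG_STATE_WAIT_FOR_SYNC. */
+ txg_stat_t *ts = spa_txg_history_init_io(spa, txg, dp);
+
+ spa_sync(spa, txg);
+
+ /* Record TXG_STATE_SYNCED and the read/write deltas for this txg. */
+ spa_txg_history_fini_io(spa, ts);
+}
+#endif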
+
+/*
+ * ==========================================================================
+ * SPA TX Assign Histogram Routines
+ * ==========================================================================
+ */
+
+/*
+ * Tx statistics - Information exported regarding dmu_tx_assign time.
+ */
+
+/*
+ * When the kstat is written, zero all buckets. When the kstat is read,
+ * count the number of trailing buckets set to zero and update ks_ndata
+ * so that they are not output.
+ */
+static int
+spa_tx_assign_update(kstat_t *ksp, int rw)
+{
+ spa_t *spa = ksp->ks_private;
+ spa_history_kstat_t *shk = &spa->spa_stats.tx_assign_histogram;
+ int i;
+
+ if (rw == KSTAT_WRITE) {
+ for (i = 0; i < shk->count; i++)
+ ((kstat_named_t *)shk->priv)[i].value.ui64 = 0;
+ }
+
+ for (i = shk->count; i > 0; i--)
+ if (((kstat_named_t *)shk->priv)[i-1].value.ui64 != 0)
+ break;
+
+ ksp->ks_ndata = i;
+ ksp->ks_data_size = i * sizeof (kstat_named_t);
+
+ return (0);
+}
+
+static void
+spa_tx_assign_init(spa_t *spa)
+{
+ spa_history_kstat_t *shk = &spa->spa_stats.tx_assign_histogram;
+ char *name;
+ kstat_named_t *ks;
+ kstat_t *ksp;
+ int i;
+
+ mutex_init(&shk->lock, NULL, MUTEX_DEFAULT, NULL);
+
+ shk->count = 42; /* power of two buckets for 1ns to 2,199s */
+ shk->size = shk->count * sizeof (kstat_named_t);
+ shk->priv = kmem_alloc(shk->size, KM_SLEEP);
+
+ name = kmem_asprintf("zfs/%s", spa_name(spa));
+
+ for (i = 0; i < shk->count; i++) {
+ ks = &((kstat_named_t *)shk->priv)[i];
+ ks->data_type = KSTAT_DATA_UINT64;
+ ks->value.ui64 = 0;
+ (void) snprintf(ks->name, KSTAT_STRLEN, "%llu ns",
+ (u_longlong_t)1 << i);
+ }
+
+ ksp = kstat_create(name, 0, "dmu_tx_assign", "misc",
+ KSTAT_TYPE_NAMED, 0, KSTAT_FLAG_VIRTUAL);
+ shk->kstat = ksp;
+
+ if (ksp) {
+ ksp->ks_lock = &shk->lock;
+ ksp->ks_data = shk->priv;
+ ksp->ks_ndata = shk->count;
+ ksp->ks_data_size = shk->size;
+ ksp->ks_private = spa;
+ ksp->ks_update = spa_tx_assign_update;
+ kstat_install(ksp);
+ }
+ kmem_strfree(name);
+}
+
+static void
+spa_tx_assign_destroy(spa_t *spa)
+{
+ spa_history_kstat_t *shk = &spa->spa_stats.tx_assign_histogram;
+ kstat_t *ksp;
+
+ ksp = shk->kstat;
+ if (ksp)
+ kstat_delete(ksp);
+
+ kmem_free(shk->priv, shk->size);
+ mutex_destroy(&shk->lock);
+}
+
+void
+spa_tx_assign_add_nsecs(spa_t *spa, uint64_t nsecs)
+{
+ spa_history_kstat_t *shk = &spa->spa_stats.tx_assign_histogram;
+ uint64_t idx = 0;
+
+ while (((1ULL << idx) < nsecs) && (idx < shk->size - 1))
+ idx++;
+
+ atomic_inc_64(&((kstat_named_t *)shk->priv)[idx].value.ui64);
+}
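+/*
+ * Example (illustrative, not part of the upstream import): the loop above
+ * advances idx until 2^idx >= nsecs, so a call such as
+ * spa_tx_assign_add_nsecs(spa, 1500) increments bucket index 11, i.e. the
+ * "2048 ns" bucket. Each bucket therefore counts assigns whose latency is
+ * at most 2^idx ns (and greater than 2^(idx-1) ns).
+ */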
+
+/*
+ * ==========================================================================
+ * SPA IO History Routines
+ * ==========================================================================
+ */
+static int
+spa_io_history_update(kstat_t *ksp, int rw)
+{
+ if (rw == KSTAT_WRITE)
+ memset(ksp->ks_data, 0, ksp->ks_data_size);
+
+ return (0);
+}
+
+static void
+spa_io_history_init(spa_t *spa)
+{
+ spa_history_kstat_t *shk = &spa->spa_stats.io_history;
+ char *name;
+ kstat_t *ksp;
+
+ mutex_init(&shk->lock, NULL, MUTEX_DEFAULT, NULL);
+
+ name = kmem_asprintf("zfs/%s", spa_name(spa));
+
+ ksp = kstat_create(name, 0, "io", "disk", KSTAT_TYPE_IO, 1, 0);
+ shk->kstat = ksp;
+
+ if (ksp) {
+ ksp->ks_lock = &shk->lock;
+ ksp->ks_private = spa;
+ ksp->ks_update = spa_io_history_update;
+ kstat_install(ksp);
+ }
+ kmem_strfree(name);
+}
+
+static void
+spa_io_history_destroy(spa_t *spa)
+{
+ spa_history_kstat_t *shk = &spa->spa_stats.io_history;
+
+ if (shk->kstat)
+ kstat_delete(shk->kstat);
+
+ mutex_destroy(&shk->lock);
+}
+
+/*
+ * ==========================================================================
+ * SPA MMP History Routines
+ * ==========================================================================
+ */
+
+/*
+ * MMP statistics - Information exported regarding attempted MMP writes.
+ * For MMP writes issued, the fields are used as described in the
+ * comments below. For MMP writes skipped, an entry represents a span of
+ * time during which writes were skipped for the same reason (error from
+ * mmp_random_leaf). The differences are:
+ * timestamp time first write skipped, if >1 skipped in a row
+ * mmp_delay delay value at timestamp
+ * vdev_guid number of writes skipped
+ * io_error one of enum mmp_error
+ * duration time span (ns) of skipped writes
+ */
+
+typedef struct spa_mmp_history {
+ uint64_t mmp_node_id; /* unique # for updates */
+ uint64_t txg; /* txg of last sync */
+ uint64_t timestamp; /* UTC time MMP write issued */
+ uint64_t mmp_delay; /* mmp_thread.mmp_delay at timestamp */
+ uint64_t vdev_guid; /* unique ID of leaf vdev */
+ char *vdev_path;
+ int vdev_label; /* vdev label */
+ int io_error; /* error status of MMP write */
+ hrtime_t error_start; /* hrtime of start of error period */
+ hrtime_t duration; /* time from submission to completion */
+ procfs_list_node_t smh_node;
+} spa_mmp_history_t;
+
+static int
+spa_mmp_history_show_header(struct seq_file *f)
+{
+ seq_printf(f, "%-10s %-10s %-10s %-6s %-10s %-12s %-24s "
+ "%-10s %s\n", "id", "txg", "timestamp", "error", "duration",
+ "mmp_delay", "vdev_guid", "vdev_label", "vdev_path");
+ return (0);
+}
+
+static int
+spa_mmp_history_show(struct seq_file *f, void *data)
+{
+ spa_mmp_history_t *smh = (spa_mmp_history_t *)data;
+ char skip_fmt[] = "%-10llu %-10llu %10llu %#6llx %10lld %12llu %-24llu "
+ "%-10lld %s\n";
+ char write_fmt[] = "%-10llu %-10llu %10llu %6lld %10lld %12llu %-24llu "
+ "%-10lld %s\n";
+
+ seq_printf(f, (smh->error_start ? skip_fmt : write_fmt),
+ (u_longlong_t)smh->mmp_node_id, (u_longlong_t)smh->txg,
+ (u_longlong_t)smh->timestamp, (longlong_t)smh->io_error,
+ (longlong_t)smh->duration, (u_longlong_t)smh->mmp_delay,
+ (u_longlong_t)smh->vdev_guid, (u_longlong_t)smh->vdev_label,
+ (smh->vdev_path ? smh->vdev_path : "-"));
+
+ return (0);
+}
+
+/* Remove oldest elements from list until there are no more than 'size' left */
+static void
+spa_mmp_history_truncate(spa_history_list_t *shl, unsigned int size)
+{
+ spa_mmp_history_t *smh;
+ while (shl->size > size) {
+ smh = list_remove_head(&shl->procfs_list.pl_list);
+ if (smh->vdev_path)
+ kmem_strfree(smh->vdev_path);
+ kmem_free(smh, sizeof (spa_mmp_history_t));
+ shl->size--;
+ }
+
+ if (size == 0)
+ ASSERT(list_is_empty(&shl->procfs_list.pl_list));
+
+}
+
+static int
+spa_mmp_history_clear(procfs_list_t *procfs_list)
+{
+ spa_history_list_t *shl = procfs_list->pl_private;
+ mutex_enter(&procfs_list->pl_lock);
+ spa_mmp_history_truncate(shl, 0);
+ mutex_exit(&procfs_list->pl_lock);
+ return (0);
+}
+
+static void
+spa_mmp_history_init(spa_t *spa)
+{
+ spa_history_list_t *shl = &spa->spa_stats.mmp_history;
+
+ shl->size = 0;
+
+ shl->procfs_list.pl_private = shl;
+ procfs_list_install("zfs",
+ spa_name(spa),
+ "multihost",
+ 0644,
+ &shl->procfs_list,
+ spa_mmp_history_show,
+ spa_mmp_history_show_header,
+ spa_mmp_history_clear,
+ offsetof(spa_mmp_history_t, smh_node));
+}
+
+static void
+spa_mmp_history_destroy(spa_t *spa)
+{
+ spa_history_list_t *shl = &spa->spa_stats.mmp_history;
+ procfs_list_uninstall(&shl->procfs_list);
+ spa_mmp_history_truncate(shl, 0);
+ procfs_list_destroy(&shl->procfs_list);
+}
+
+/*
+ * Set duration in existing "skip" record to how long we have waited for a leaf
+ * vdev to become available.
+ *
+ * It is important that we start the search at the tail of the list, where
+ * new records are inserted, so this is normally an O(1) operation.
+ */
+int
+spa_mmp_history_set_skip(spa_t *spa, uint64_t mmp_node_id)
+{
+ spa_history_list_t *shl = &spa->spa_stats.mmp_history;
+ spa_mmp_history_t *smh;
+ int error = ENOENT;
+
+ if (zfs_multihost_history == 0 && shl->size == 0)
+ return (0);
+
+ mutex_enter(&shl->procfs_list.pl_lock);
+ for (smh = list_tail(&shl->procfs_list.pl_list); smh != NULL;
+ smh = list_prev(&shl->procfs_list.pl_list, smh)) {
+ if (smh->mmp_node_id == mmp_node_id) {
+ ASSERT3U(smh->io_error, !=, 0);
+ smh->duration = gethrtime() - smh->error_start;
+ smh->vdev_guid++;
+ error = 0;
+ break;
+ }
+ }
+ mutex_exit(&shl->procfs_list.pl_lock);
+
+ return (error);
+}
+
+/*
+ * Set MMP write duration and error status in existing record.
+ * See comment re: search order above spa_mmp_history_set_skip().
+ */
+int
+spa_mmp_history_set(spa_t *spa, uint64_t mmp_node_id, int io_error,
+ hrtime_t duration)
+{
+ spa_history_list_t *shl = &spa->spa_stats.mmp_history;
+ spa_mmp_history_t *smh;
+ int error = ENOENT;
+
+ if (zfs_multihost_history == 0 && shl->size == 0)
+ return (0);
+
+ mutex_enter(&shl->procfs_list.pl_lock);
+ for (smh = list_tail(&shl->procfs_list.pl_list); smh != NULL;
+ smh = list_prev(&shl->procfs_list.pl_list, smh)) {
+ if (smh->mmp_node_id == mmp_node_id) {
+ ASSERT(smh->io_error == 0);
+ smh->io_error = io_error;
+ smh->duration = duration;
+ error = 0;
+ break;
+ }
+ }
+ mutex_exit(&shl->procfs_list.pl_lock);
+
+ return (error);
+}
+
+/*
+ * Add a new MMP historical record.
+ * error == 0 : a write was issued.
+ * error != 0 : a write was not issued because no leaves were found.
+ */
+void
+spa_mmp_history_add(spa_t *spa, uint64_t txg, uint64_t timestamp,
+ uint64_t mmp_delay, vdev_t *vd, int label, uint64_t mmp_node_id,
+ int error)
+{
+ spa_history_list_t *shl = &spa->spa_stats.mmp_history;
+ spa_mmp_history_t *smh;
+
+ if (zfs_multihost_history == 0 && shl->size == 0)
+ return;
+
+ smh = kmem_zalloc(sizeof (spa_mmp_history_t), KM_SLEEP);
+ smh->txg = txg;
+ smh->timestamp = timestamp;
+ smh->mmp_delay = mmp_delay;
+ if (vd) {
+ smh->vdev_guid = vd->vdev_guid;
+ if (vd->vdev_path)
+ smh->vdev_path = kmem_strdup(vd->vdev_path);
+ }
+ smh->vdev_label = label;
+ smh->mmp_node_id = mmp_node_id;
+
+ if (error) {
+ smh->io_error = error;
+ smh->error_start = gethrtime();
+ smh->vdev_guid = 1;
+ }
+
+ mutex_enter(&shl->procfs_list.pl_lock);
+ procfs_list_add(&shl->procfs_list, smh);
+ shl->size++;
+ spa_mmp_history_truncate(shl, zfs_multihost_history);
+ mutex_exit(&shl->procfs_list.pl_lock);
+}
+
+static void *
+spa_state_addr(kstat_t *ksp, loff_t n)
+{
+ if (n == 0)
+ return (ksp->ks_private); /* return the spa_t */
+ return (NULL);
+}
+
+static int
+spa_state_data(char *buf, size_t size, void *data)
+{
+ spa_t *spa = (spa_t *)data;
+ (void) snprintf(buf, size, "%s\n", spa_state_to_name(spa));
+ return (0);
+}
+
+/*
+ * Return the state of the pool in /proc/spl/kstat/zfs/<pool>/state.
+ *
+ * This is a lock-less read of the pool's state (unlike using 'zpool', which
+ * can potentially block for seconds). Because it doesn't block, it can be
+ * useful as a pool heartbeat value.
+ */
+static void
+spa_state_init(spa_t *spa)
+{
+ spa_history_kstat_t *shk = &spa->spa_stats.state;
+ char *name;
+ kstat_t *ksp;
+
+ mutex_init(&shk->lock, NULL, MUTEX_DEFAULT, NULL);
+
+ name = kmem_asprintf("zfs/%s", spa_name(spa));
+ ksp = kstat_create(name, 0, "state", "misc",
+ KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL);
+
+ shk->kstat = ksp;
+ if (ksp) {
+ ksp->ks_lock = &shk->lock;
+ ksp->ks_data = NULL;
+ ksp->ks_private = spa;
+ ksp->ks_flags |= KSTAT_FLAG_NO_HEADERS;
+ kstat_set_raw_ops(ksp, NULL, spa_state_data, spa_state_addr);
+ kstat_install(ksp);
+ }
+
+ kmem_strfree(name);
+}
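+/*
+ * Illustrative sketch (not part of the upstream import): a userland
+ * heartbeat check could simply read the file exported above, e.g.
+ * /proc/spl/kstat/zfs/<pool>/state on Linux. "tank" below is a
+ * hypothetical pool name.
+ */
+#if 0 /* example only, userland code */
+#include <stdio.h>
+#include <string.h>
+
+static int
+pool_is_online(void)
+{
+ char state[32] = { 0 };
+ FILE *fp = fopen("/proc/spl/kstat/zfs/tank/state", "r");
+
+ if (fp == NULL)
+ return (0);
+ if (fgets(state, sizeof (state), fp) == NULL)
+ state[0] = '\0';
+ fclose(fp);
+
+ return (strncmp(state, "ONLINE", 6) == 0);
+}
+#endif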
+
+static void
+spa_health_destroy(spa_t *spa)
+{
+ spa_history_kstat_t *shk = &spa->spa_stats.state;
+ kstat_t *ksp = shk->kstat;
+ if (ksp)
+ kstat_delete(ksp);
+
+ mutex_destroy(&shk->lock);
+}
+
+static spa_iostats_t spa_iostats_template = {
+ { "trim_extents_written", KSTAT_DATA_UINT64 },
+ { "trim_bytes_written", KSTAT_DATA_UINT64 },
+ { "trim_extents_skipped", KSTAT_DATA_UINT64 },
+ { "trim_bytes_skipped", KSTAT_DATA_UINT64 },
+ { "trim_extents_failed", KSTAT_DATA_UINT64 },
+ { "trim_bytes_failed", KSTAT_DATA_UINT64 },
+ { "autotrim_extents_written", KSTAT_DATA_UINT64 },
+ { "autotrim_bytes_written", KSTAT_DATA_UINT64 },
+ { "autotrim_extents_skipped", KSTAT_DATA_UINT64 },
+ { "autotrim_bytes_skipped", KSTAT_DATA_UINT64 },
+ { "autotrim_extents_failed", KSTAT_DATA_UINT64 },
+ { "autotrim_bytes_failed", KSTAT_DATA_UINT64 },
+ { "simple_trim_extents_written", KSTAT_DATA_UINT64 },
+ { "simple_trim_bytes_written", KSTAT_DATA_UINT64 },
+ { "simple_trim_extents_skipped", KSTAT_DATA_UINT64 },
+ { "simple_trim_bytes_skipped", KSTAT_DATA_UINT64 },
+ { "simple_trim_extents_failed", KSTAT_DATA_UINT64 },
+ { "simple_trim_bytes_failed", KSTAT_DATA_UINT64 },
+};
+
+#define SPA_IOSTATS_ADD(stat, val) \
+ atomic_add_64(&iostats->stat.value.ui64, (val));
+
+void
+spa_iostats_trim_add(spa_t *spa, trim_type_t type,
+ uint64_t extents_written, uint64_t bytes_written,
+ uint64_t extents_skipped, uint64_t bytes_skipped,
+ uint64_t extents_failed, uint64_t bytes_failed)
+{
+ spa_history_kstat_t *shk = &spa->spa_stats.iostats;
+ kstat_t *ksp = shk->kstat;
+ spa_iostats_t *iostats;
+
+ if (ksp == NULL)
+ return;
+
+ iostats = ksp->ks_data;
+ if (type == TRIM_TYPE_MANUAL) {
+ SPA_IOSTATS_ADD(trim_extents_written, extents_written);
+ SPA_IOSTATS_ADD(trim_bytes_written, bytes_written);
+ SPA_IOSTATS_ADD(trim_extents_skipped, extents_skipped);
+ SPA_IOSTATS_ADD(trim_bytes_skipped, bytes_skipped);
+ SPA_IOSTATS_ADD(trim_extents_failed, extents_failed);
+ SPA_IOSTATS_ADD(trim_bytes_failed, bytes_failed);
+ } else if (type == TRIM_TYPE_AUTO) {
+ SPA_IOSTATS_ADD(autotrim_extents_written, extents_written);
+ SPA_IOSTATS_ADD(autotrim_bytes_written, bytes_written);
+ SPA_IOSTATS_ADD(autotrim_extents_skipped, extents_skipped);
+ SPA_IOSTATS_ADD(autotrim_bytes_skipped, bytes_skipped);
+ SPA_IOSTATS_ADD(autotrim_extents_failed, extents_failed);
+ SPA_IOSTATS_ADD(autotrim_bytes_failed, bytes_failed);
+ } else {
+ SPA_IOSTATS_ADD(simple_trim_extents_written, extents_written);
+ SPA_IOSTATS_ADD(simple_trim_bytes_written, bytes_written);
+ SPA_IOSTATS_ADD(simple_trim_extents_skipped, extents_skipped);
+ SPA_IOSTATS_ADD(simple_trim_bytes_skipped, bytes_skipped);
+ SPA_IOSTATS_ADD(simple_trim_extents_failed, extents_failed);
+ SPA_IOSTATS_ADD(simple_trim_bytes_failed, bytes_failed);
+ }
+}
+
+static int
+spa_iostats_update(kstat_t *ksp, int rw)
+{
+ if (rw == KSTAT_WRITE) {
+ memcpy(ksp->ks_data, &spa_iostats_template,
+ sizeof (spa_iostats_t));
+ }
+
+ return (0);
+}
+
+static void
+spa_iostats_init(spa_t *spa)
+{
+ spa_history_kstat_t *shk = &spa->spa_stats.iostats;
+
+ mutex_init(&shk->lock, NULL, MUTEX_DEFAULT, NULL);
+
+ char *name = kmem_asprintf("zfs/%s", spa_name(spa));
+ kstat_t *ksp = kstat_create(name, 0, "iostats", "misc",
+ KSTAT_TYPE_NAMED, sizeof (spa_iostats_t) / sizeof (kstat_named_t),
+ KSTAT_FLAG_VIRTUAL);
+
+ shk->kstat = ksp;
+ if (ksp) {
+ int size = sizeof (spa_iostats_t);
+ ksp->ks_lock = &shk->lock;
+ ksp->ks_private = spa;
+ ksp->ks_update = spa_iostats_update;
+ ksp->ks_data = kmem_alloc(size, KM_SLEEP);
+ memcpy(ksp->ks_data, &spa_iostats_template, size);
+ kstat_install(ksp);
+ }
+
+ kmem_strfree(name);
+}
+
+static void
+spa_iostats_destroy(spa_t *spa)
+{
+ spa_history_kstat_t *shk = &spa->spa_stats.iostats;
+ kstat_t *ksp = shk->kstat;
+ if (ksp) {
+ kmem_free(ksp->ks_data, sizeof (spa_iostats_t));
+ kstat_delete(ksp);
+ }
+
+ mutex_destroy(&shk->lock);
+}
+
+void
+spa_stats_init(spa_t *spa)
+{
+ spa_read_history_init(spa);
+ spa_txg_history_init(spa);
+ spa_tx_assign_init(spa);
+ spa_io_history_init(spa);
+ spa_mmp_history_init(spa);
+ spa_state_init(spa);
+ spa_iostats_init(spa);
+}
+
+void
+spa_stats_destroy(spa_t *spa)
+{
+ spa_iostats_destroy(spa);
+ spa_health_destroy(spa);
+ spa_tx_assign_destroy(spa);
+ spa_txg_history_destroy(spa);
+ spa_read_history_destroy(spa);
+ spa_io_history_destroy(spa);
+ spa_mmp_history_destroy(spa);
+}
+
+/* BEGIN CSTYLED */
+ZFS_MODULE_PARAM(zfs, zfs_, read_history, INT, ZMOD_RW,
+ "Historical statistics for the last N reads");
+
+ZFS_MODULE_PARAM(zfs, zfs_, read_history_hits, INT, ZMOD_RW,
+ "Include cache hits in read history");
+
+ZFS_MODULE_PARAM(zfs_txg, zfs_txg_, history, INT, ZMOD_RW,
+ "Historical statistics for the last N txgs");
+
+ZFS_MODULE_PARAM(zfs_multihost, zfs_multihost_, history, INT, ZMOD_RW,
+ "Historical statistics for last N multihost writes");
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/zfs/space_map.c b/sys/contrib/openzfs/module/zfs/space_map.c
new file mode 100644
index 000000000000..3db7d199199c
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/space_map.c
@@ -0,0 +1,1105 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * Copyright (c) 2012, 2019 by Delphix. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/dmu.h>
+#include <sys/dmu_tx.h>
+#include <sys/dnode.h>
+#include <sys/dsl_pool.h>
+#include <sys/zio.h>
+#include <sys/space_map.h>
+#include <sys/zfeature.h>
+
+/*
+ * Note on space map block size:
+ *
+ * The data for a given space map can be kept on blocks of any size.
+ * Larger blocks entail fewer I/O operations, but they also cause the
+ * DMU to keep more data in-core, and also to waste more I/O bandwidth
+ * when only a few blocks have changed since the last transaction group.
+ */
+
+/*
+ * Enabled whenever we want to stress test the use of double-word
+ * space map entries.
+ */
+boolean_t zfs_force_some_double_word_sm_entries = B_FALSE;
+
+/*
+ * Override the default indirect block size of 128K, instead use 16K for
+ * spacemaps (2^14 bytes). This dramatically reduces write inflation since
+ * appending to a spacemap typically has to write one data block (4KB) and one
+ * or two indirect blocks (16K-32K, rather than 128K).
+ */
+int space_map_ibs = 14;
+
+boolean_t
+sm_entry_is_debug(uint64_t e)
+{
+ return (SM_PREFIX_DECODE(e) == SM_DEBUG_PREFIX);
+}
+
+boolean_t
+sm_entry_is_single_word(uint64_t e)
+{
+ uint8_t prefix = SM_PREFIX_DECODE(e);
+ return (prefix != SM_DEBUG_PREFIX && prefix != SM2_PREFIX);
+}
+
+boolean_t
+sm_entry_is_double_word(uint64_t e)
+{
+ return (SM_PREFIX_DECODE(e) == SM2_PREFIX);
+}
+
+/*
+ * Iterate through the space map, invoking the callback on each (non-debug)
+ * space map entry. Stop after reading 'end' bytes of the space map.
+ */
+int
+space_map_iterate(space_map_t *sm, uint64_t end, sm_cb_t callback, void *arg)
+{
+ uint64_t blksz = sm->sm_blksz;
+
+ ASSERT3U(blksz, !=, 0);
+ ASSERT3U(end, <=, space_map_length(sm));
+ ASSERT0(P2PHASE(end, sizeof (uint64_t)));
+
+ dmu_prefetch(sm->sm_os, space_map_object(sm), 0, 0, end,
+ ZIO_PRIORITY_SYNC_READ);
+
+ int error = 0;
+ uint64_t txg = 0, sync_pass = 0;
+ for (uint64_t block_base = 0; block_base < end && error == 0;
+ block_base += blksz) {
+ dmu_buf_t *db;
+ error = dmu_buf_hold(sm->sm_os, space_map_object(sm),
+ block_base, FTAG, &db, DMU_READ_PREFETCH);
+ if (error != 0)
+ return (error);
+
+ uint64_t *block_start = db->db_data;
+ uint64_t block_length = MIN(end - block_base, blksz);
+ uint64_t *block_end = block_start +
+ (block_length / sizeof (uint64_t));
+
+ VERIFY0(P2PHASE(block_length, sizeof (uint64_t)));
+ VERIFY3U(block_length, !=, 0);
+ ASSERT3U(blksz, ==, db->db_size);
+
+ for (uint64_t *block_cursor = block_start;
+ block_cursor < block_end && error == 0; block_cursor++) {
+ uint64_t e = *block_cursor;
+
+ if (sm_entry_is_debug(e)) {
+ /*
+ * Debug entries are only needed to record the
+ * current TXG and sync pass if available.
+ *
+ * Note though that sometimes there can be
+ * debug entries that are used as padding
+ * at the end of space map blocks in order
+ * to avoid splitting a double-word entry
+ * across two blocks. These entries
+ * have their TXG field set to 0 and we
+ * skip them without recording the TXG.
+ * [see comment in space_map_write_seg()]
+ */
+ uint64_t e_txg = SM_DEBUG_TXG_DECODE(e);
+ if (e_txg != 0) {
+ txg = e_txg;
+ sync_pass = SM_DEBUG_SYNCPASS_DECODE(e);
+ } else {
+ ASSERT0(SM_DEBUG_SYNCPASS_DECODE(e));
+ }
+ continue;
+ }
+
+ uint64_t raw_offset, raw_run, vdev_id;
+ maptype_t type;
+ if (sm_entry_is_single_word(e)) {
+ type = SM_TYPE_DECODE(e);
+ vdev_id = SM_NO_VDEVID;
+ raw_offset = SM_OFFSET_DECODE(e);
+ raw_run = SM_RUN_DECODE(e);
+ } else {
+ /* it is a two-word entry */
+ ASSERT(sm_entry_is_double_word(e));
+ raw_run = SM2_RUN_DECODE(e);
+ vdev_id = SM2_VDEV_DECODE(e);
+
+ /* move on to the second word */
+ block_cursor++;
+ e = *block_cursor;
+ VERIFY3P(block_cursor, <=, block_end);
+
+ type = SM2_TYPE_DECODE(e);
+ raw_offset = SM2_OFFSET_DECODE(e);
+ }
+
+ uint64_t entry_offset = (raw_offset << sm->sm_shift) +
+ sm->sm_start;
+ uint64_t entry_run = raw_run << sm->sm_shift;
+
+ VERIFY0(P2PHASE(entry_offset, 1ULL << sm->sm_shift));
+ VERIFY0(P2PHASE(entry_run, 1ULL << sm->sm_shift));
+ ASSERT3U(entry_offset, >=, sm->sm_start);
+ ASSERT3U(entry_offset, <, sm->sm_start + sm->sm_size);
+ ASSERT3U(entry_run, <=, sm->sm_size);
+ ASSERT3U(entry_offset + entry_run, <=,
+ sm->sm_start + sm->sm_size);
+
+ space_map_entry_t sme = {
+ .sme_type = type,
+ .sme_vdev = vdev_id,
+ .sme_offset = entry_offset,
+ .sme_run = entry_run,
+ .sme_txg = txg,
+ .sme_sync_pass = sync_pass
+ };
+ error = callback(&sme, arg);
+ }
+ dmu_buf_rele(db, FTAG);
+ }
+ return (error);
+}
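+/*
+ * Illustrative sketch (not part of the upstream import): a minimal sm_cb_t
+ * callback that tallies the net allocated space seen while iterating a
+ * space map. A caller would use it roughly as:
+ *
+ * int64_t alloc = 0;
+ * error = space_map_iterate(sm, space_map_length(sm),
+ * space_map_tally_cb, &alloc);
+ */
+#if 0 /* example only */
+static int
+space_map_tally_cb(space_map_entry_t *sme, void *arg)
+{
+ int64_t *alloc = arg;
+
+ if (sme->sme_type == SM_ALLOC)
+ *alloc += sme->sme_run;
+ else
+ *alloc -= sme->sme_run;
+
+ return (0);
+}
+#endif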
+
+/*
+ * Reads the entries from the last block of the space map into
+ * buf in reverse order. Populates nwords with number of words
+ * in the last block.
+ *
+ * Refer to block comment within space_map_incremental_destroy()
+ * to understand why this function is needed.
+ */
+static int
+space_map_reversed_last_block_entries(space_map_t *sm, uint64_t *buf,
+ uint64_t bufsz, uint64_t *nwords)
+{
+ int error = 0;
+ dmu_buf_t *db;
+
+ /*
+ * Find the offset of the last word in the space map and use
+ * that to read the last block of the space map with
+ * dmu_buf_hold().
+ */
+ uint64_t last_word_offset =
+ sm->sm_phys->smp_length - sizeof (uint64_t);
+ error = dmu_buf_hold(sm->sm_os, space_map_object(sm), last_word_offset,
+ FTAG, &db, DMU_READ_NO_PREFETCH);
+ if (error != 0)
+ return (error);
+
+ ASSERT3U(sm->sm_object, ==, db->db_object);
+ ASSERT3U(sm->sm_blksz, ==, db->db_size);
+ ASSERT3U(bufsz, >=, db->db_size);
+ ASSERT(nwords != NULL);
+
+ uint64_t *words = db->db_data;
+ *nwords =
+ (sm->sm_phys->smp_length - db->db_offset) / sizeof (uint64_t);
+
+ ASSERT3U(*nwords, <=, bufsz / sizeof (uint64_t));
+
+ uint64_t n = *nwords;
+ uint64_t j = n - 1;
+ for (uint64_t i = 0; i < n; i++) {
+ uint64_t entry = words[i];
+ if (sm_entry_is_double_word(entry)) {
+ /*
+ * Since we are populating the buffer backwards
+ * we have to be extra careful and add the two
+ * words of the double-word entry in the right
+ * order.
+ */
+ ASSERT3U(j, >, 0);
+ buf[j - 1] = entry;
+
+ i++;
+ ASSERT3U(i, <, n);
+ entry = words[i];
+ buf[j] = entry;
+ j -= 2;
+ } else {
+ ASSERT(sm_entry_is_debug(entry) ||
+ sm_entry_is_single_word(entry));
+ buf[j] = entry;
+ j--;
+ }
+ }
+
+ /*
+ * Assert that we wrote backwards all the
+ * way to the beginning of the buffer.
+ */
+ ASSERT3S(j, ==, -1);
+
+ dmu_buf_rele(db, FTAG);
+ return (error);
+}
+
+/*
+ * Note: This function performs destructive actions - specifically
+ * it deletes entries from the end of the space map. Thus, callers
+ * should ensure that they are holding the appropriate locks for
+ * the space map that they provide.
+ */
+int
+space_map_incremental_destroy(space_map_t *sm, sm_cb_t callback, void *arg,
+ dmu_tx_t *tx)
+{
+ uint64_t bufsz = MAX(sm->sm_blksz, SPA_MINBLOCKSIZE);
+ uint64_t *buf = zio_buf_alloc(bufsz);
+
+ dmu_buf_will_dirty(sm->sm_dbuf, tx);
+
+ /*
+ * Ideally we would want to iterate from the beginning of the
+ * space map to the end in incremental steps. The issue with this
+ * approach is that we don't have any field on-disk that points
+ * us where to start between each step. We could try zeroing out
+ * entries that we've destroyed, but this doesn't work either as
+ * an entry that is 0 is a valid one (ALLOC for range [0x0:0x200]).
+ *
+ * As a result, we destroy its entries incrementally starting from
+ * the end after applying the callback to each of them.
+ *
+ * The problem with this approach is that we cannot literally
+ * iterate through the words in the space map backwards as we
+ * can't distinguish two-word space map entries from their second
+ * word. Thus we do the following:
+ *
+ * 1] We get all the entries from the last block of the space map
+ * and put them into a buffer in reverse order. This way the
+ * last entry comes first in the buffer, the second to last is
+ * second, etc.
+ * 2] We iterate through the entries in the buffer and we apply
+ * the callback to each one. As we move from entry to entry we
+ * decrease the size of the space map, effectively deleting
+ * each entry.
+ * 3] If there are no more entries in the space map or the callback
+ * returns a value other than 0, we stop iterating over the
+ * space map. If there are entries remaining and the callback
+ * returned 0, we go back to step [1].
+ */
+ int error = 0;
+ while (space_map_length(sm) > 0 && error == 0) {
+ uint64_t nwords = 0;
+ error = space_map_reversed_last_block_entries(sm, buf, bufsz,
+ &nwords);
+ if (error != 0)
+ break;
+
+ ASSERT3U(nwords, <=, bufsz / sizeof (uint64_t));
+
+ for (uint64_t i = 0; i < nwords; i++) {
+ uint64_t e = buf[i];
+
+ if (sm_entry_is_debug(e)) {
+ sm->sm_phys->smp_length -= sizeof (uint64_t);
+ continue;
+ }
+
+ int words = 1;
+ uint64_t raw_offset, raw_run, vdev_id;
+ maptype_t type;
+ if (sm_entry_is_single_word(e)) {
+ type = SM_TYPE_DECODE(e);
+ vdev_id = SM_NO_VDEVID;
+ raw_offset = SM_OFFSET_DECODE(e);
+ raw_run = SM_RUN_DECODE(e);
+ } else {
+ ASSERT(sm_entry_is_double_word(e));
+ words = 2;
+
+ raw_run = SM2_RUN_DECODE(e);
+ vdev_id = SM2_VDEV_DECODE(e);
+
+ /* move to the second word */
+ i++;
+ e = buf[i];
+
+ ASSERT3P(i, <=, nwords);
+
+ type = SM2_TYPE_DECODE(e);
+ raw_offset = SM2_OFFSET_DECODE(e);
+ }
+
+ uint64_t entry_offset =
+ (raw_offset << sm->sm_shift) + sm->sm_start;
+ uint64_t entry_run = raw_run << sm->sm_shift;
+
+ VERIFY0(P2PHASE(entry_offset, 1ULL << sm->sm_shift));
+ VERIFY0(P2PHASE(entry_run, 1ULL << sm->sm_shift));
+ VERIFY3U(entry_offset, >=, sm->sm_start);
+ VERIFY3U(entry_offset, <, sm->sm_start + sm->sm_size);
+ VERIFY3U(entry_run, <=, sm->sm_size);
+ VERIFY3U(entry_offset + entry_run, <=,
+ sm->sm_start + sm->sm_size);
+
+ space_map_entry_t sme = {
+ .sme_type = type,
+ .sme_vdev = vdev_id,
+ .sme_offset = entry_offset,
+ .sme_run = entry_run
+ };
+ error = callback(&sme, arg);
+ if (error != 0)
+ break;
+
+ if (type == SM_ALLOC)
+ sm->sm_phys->smp_alloc -= entry_run;
+ else
+ sm->sm_phys->smp_alloc += entry_run;
+ sm->sm_phys->smp_length -= words * sizeof (uint64_t);
+ }
+ }
+
+ if (space_map_length(sm) == 0) {
+ ASSERT0(error);
+ ASSERT0(space_map_allocated(sm));
+ }
+
+ zio_buf_free(buf, bufsz);
+ return (error);
+}
+
+typedef struct space_map_load_arg {
+ space_map_t *smla_sm;
+ range_tree_t *smla_rt;
+ maptype_t smla_type;
+} space_map_load_arg_t;
+
+static int
+space_map_load_callback(space_map_entry_t *sme, void *arg)
+{
+ space_map_load_arg_t *smla = arg;
+ if (sme->sme_type == smla->smla_type) {
+ VERIFY3U(range_tree_space(smla->smla_rt) + sme->sme_run, <=,
+ smla->smla_sm->sm_size);
+ range_tree_add(smla->smla_rt, sme->sme_offset, sme->sme_run);
+ } else {
+ range_tree_remove(smla->smla_rt, sme->sme_offset, sme->sme_run);
+ }
+
+ return (0);
+}
+
+/*
+ * Load the space map into the range tree, as space_map_load() does, but
+ * only read the first 'length' bytes of the space map.
+ */
+int
+space_map_load_length(space_map_t *sm, range_tree_t *rt, maptype_t maptype,
+ uint64_t length)
+{
+ space_map_load_arg_t smla;
+
+ VERIFY0(range_tree_space(rt));
+
+ if (maptype == SM_FREE)
+ range_tree_add(rt, sm->sm_start, sm->sm_size);
+
+ smla.smla_rt = rt;
+ smla.smla_sm = sm;
+ smla.smla_type = maptype;
+ int err = space_map_iterate(sm, length,
+ space_map_load_callback, &smla);
+
+ if (err != 0)
+ range_tree_vacate(rt, NULL, NULL);
+
+ return (err);
+}
+
+/*
+ * Load the space map from disk into the specified range tree. Segments of
+ * maptype are added to the range tree; other segment types are removed.
+ */
+int
+space_map_load(space_map_t *sm, range_tree_t *rt, maptype_t maptype)
+{
+ return (space_map_load_length(sm, rt, maptype, space_map_length(sm)));
+}
+
+void
+space_map_histogram_clear(space_map_t *sm)
+{
+ if (sm->sm_dbuf->db_size != sizeof (space_map_phys_t))
+ return;
+
+ bzero(sm->sm_phys->smp_histogram, sizeof (sm->sm_phys->smp_histogram));
+}
+
+boolean_t
+space_map_histogram_verify(space_map_t *sm, range_tree_t *rt)
+{
+ /*
+ * Verify that the in-core range tree does not have any
+ * ranges smaller than our sm_shift size.
+ */
+ for (int i = 0; i < sm->sm_shift; i++) {
+ if (rt->rt_histogram[i] != 0)
+ return (B_FALSE);
+ }
+ return (B_TRUE);
+}
+
+void
+space_map_histogram_add(space_map_t *sm, range_tree_t *rt, dmu_tx_t *tx)
+{
+ int idx = 0;
+
+ ASSERT(dmu_tx_is_syncing(tx));
+ VERIFY3U(space_map_object(sm), !=, 0);
+
+ if (sm->sm_dbuf->db_size != sizeof (space_map_phys_t))
+ return;
+
+ dmu_buf_will_dirty(sm->sm_dbuf, tx);
+
+ ASSERT(space_map_histogram_verify(sm, rt));
+ /*
+ * Transfer the content of the range tree histogram to the space
+ * map histogram. The space map histogram contains 32 buckets ranging
+ * between 2^sm_shift and 2^(32+sm_shift-1). The range tree,
+ * however, can represent ranges from 2^0 to 2^63. Since the space
+ * map only cares about allocatable blocks (minimum of sm_shift) we
+ * can safely ignore all ranges in the range tree smaller than sm_shift.
+ */
+ for (int i = sm->sm_shift; i < RANGE_TREE_HISTOGRAM_SIZE; i++) {
+
+ /*
+ * Since the largest histogram bucket in the space map is
+ * 2^(32+sm_shift-1), we need to normalize the values in
+ * the range tree for any bucket larger than that size. For
+ * example given an sm_shift of 9, ranges larger than 2^40
+ * would get normalized as if they were 1TB ranges. Assume
+ * the range tree had a count of 5 in the 2^44 (16TB) bucket,
+ * the calculation below would normalize this to 5 * 2^4 (16).
+ */
+ ASSERT3U(i, >=, idx + sm->sm_shift);
+ sm->sm_phys->smp_histogram[idx] +=
+ rt->rt_histogram[i] << (i - idx - sm->sm_shift);
+
+ /*
+ * Increment the space map's index as long as we haven't
+ * reached the maximum bucket size. Accumulate all ranges
+ * larger than the max bucket size into the last bucket.
+ */
+ if (idx < SPACE_MAP_HISTOGRAM_SIZE - 1) {
+ ASSERT3U(idx + sm->sm_shift, ==, i);
+ idx++;
+ ASSERT3U(idx, <, SPACE_MAP_HISTOGRAM_SIZE);
+ }
+ }
+}
+
+static void
+space_map_write_intro_debug(space_map_t *sm, maptype_t maptype, dmu_tx_t *tx)
+{
+ dmu_buf_will_dirty(sm->sm_dbuf, tx);
+
+ uint64_t dentry = SM_PREFIX_ENCODE(SM_DEBUG_PREFIX) |
+ SM_DEBUG_ACTION_ENCODE(maptype) |
+ SM_DEBUG_SYNCPASS_ENCODE(spa_sync_pass(tx->tx_pool->dp_spa)) |
+ SM_DEBUG_TXG_ENCODE(dmu_tx_get_txg(tx));
+
+ dmu_write(sm->sm_os, space_map_object(sm), sm->sm_phys->smp_length,
+ sizeof (dentry), &dentry, tx);
+
+ sm->sm_phys->smp_length += sizeof (dentry);
+}
+
+/*
+ * Writes one or more entries given a segment.
+ *
+ * Note: The function may release the dbuf from the pointer initially
+ * passed to it, and return a different dbuf. Also, the space map's
+ * dbuf must be dirty for the changes in sm_phys to take effect.
+ */
+static void
+space_map_write_seg(space_map_t *sm, uint64_t rstart, uint64_t rend,
+ maptype_t maptype, uint64_t vdev_id, uint8_t words, dmu_buf_t **dbp,
+ void *tag, dmu_tx_t *tx)
+{
+ ASSERT3U(words, !=, 0);
+ ASSERT3U(words, <=, 2);
+
+ /* ensure the vdev_id can be represented by the space map */
+ ASSERT3U(vdev_id, <=, SM_NO_VDEVID);
+
+ /*
+ * if this is a single word entry, ensure that no vdev was
+ * specified.
+ */
+ IMPLY(words == 1, vdev_id == SM_NO_VDEVID);
+
+ dmu_buf_t *db = *dbp;
+ ASSERT3U(db->db_size, ==, sm->sm_blksz);
+
+ uint64_t *block_base = db->db_data;
+ uint64_t *block_end = block_base + (sm->sm_blksz / sizeof (uint64_t));
+ uint64_t *block_cursor = block_base +
+ (sm->sm_phys->smp_length - db->db_offset) / sizeof (uint64_t);
+
+ ASSERT3P(block_cursor, <=, block_end);
+
+ uint64_t size = (rend - rstart) >> sm->sm_shift;
+ uint64_t start = (rstart - sm->sm_start) >> sm->sm_shift;
+ uint64_t run_max = (words == 2) ? SM2_RUN_MAX : SM_RUN_MAX;
+
+ ASSERT3U(rstart, >=, sm->sm_start);
+ ASSERT3U(rstart, <, sm->sm_start + sm->sm_size);
+ ASSERT3U(rend - rstart, <=, sm->sm_size);
+ ASSERT3U(rend, <=, sm->sm_start + sm->sm_size);
+
+ while (size != 0) {
+ ASSERT3P(block_cursor, <=, block_end);
+
+ /*
+ * If we are at the end of this block, flush it and start
+ * writing again from the beginning.
+ */
+ if (block_cursor == block_end) {
+ dmu_buf_rele(db, tag);
+
+ uint64_t next_word_offset = sm->sm_phys->smp_length;
+ VERIFY0(dmu_buf_hold(sm->sm_os,
+ space_map_object(sm), next_word_offset,
+ tag, &db, DMU_READ_PREFETCH));
+ dmu_buf_will_dirty(db, tx);
+
+ /* update caller's dbuf */
+ *dbp = db;
+
+ ASSERT3U(db->db_size, ==, sm->sm_blksz);
+
+ block_base = db->db_data;
+ block_cursor = block_base;
+ block_end = block_base +
+ (db->db_size / sizeof (uint64_t));
+ }
+
+ /*
+ * If we are writing a two-word entry and we only have one
+ * word left on this block, just pad it with an empty debug
+ * entry and write the two-word entry in the next block.
+ */
+ uint64_t *next_entry = block_cursor + 1;
+ if (next_entry == block_end && words > 1) {
+ ASSERT3U(words, ==, 2);
+ *block_cursor = SM_PREFIX_ENCODE(SM_DEBUG_PREFIX) |
+ SM_DEBUG_ACTION_ENCODE(0) |
+ SM_DEBUG_SYNCPASS_ENCODE(0) |
+ SM_DEBUG_TXG_ENCODE(0);
+ block_cursor++;
+ sm->sm_phys->smp_length += sizeof (uint64_t);
+ ASSERT3P(block_cursor, ==, block_end);
+ continue;
+ }
+
+ uint64_t run_len = MIN(size, run_max);
+ switch (words) {
+ case 1:
+ *block_cursor = SM_OFFSET_ENCODE(start) |
+ SM_TYPE_ENCODE(maptype) |
+ SM_RUN_ENCODE(run_len);
+ block_cursor++;
+ break;
+ case 2:
+ /* write the first word of the entry */
+ *block_cursor = SM_PREFIX_ENCODE(SM2_PREFIX) |
+ SM2_RUN_ENCODE(run_len) |
+ SM2_VDEV_ENCODE(vdev_id);
+ block_cursor++;
+
+ /* move on to the second word of the entry */
+ ASSERT3P(block_cursor, <, block_end);
+ *block_cursor = SM2_TYPE_ENCODE(maptype) |
+ SM2_OFFSET_ENCODE(start);
+ block_cursor++;
+ break;
+ default:
+ panic("%d-word space map entries are not supported",
+ words);
+ break;
+ }
+ sm->sm_phys->smp_length += words * sizeof (uint64_t);
+
+ start += run_len;
+ size -= run_len;
+ }
+ ASSERT0(size);
+
+}
+
+/*
+ * Note: The space map's dbuf must be dirty for the changes in sm_phys to
+ * take effect.
+ */
+static void
+space_map_write_impl(space_map_t *sm, range_tree_t *rt, maptype_t maptype,
+ uint64_t vdev_id, dmu_tx_t *tx)
+{
+ spa_t *spa = tx->tx_pool->dp_spa;
+ dmu_buf_t *db;
+
+ space_map_write_intro_debug(sm, maptype, tx);
+
+#ifdef ZFS_DEBUG
+ /*
+ * We do this right after we write the intro debug entry
+ * because the estimate does not take it into account.
+ */
+ uint64_t initial_objsize = sm->sm_phys->smp_length;
+ uint64_t estimated_growth =
+ space_map_estimate_optimal_size(sm, rt, SM_NO_VDEVID);
+ uint64_t estimated_final_objsize = initial_objsize + estimated_growth;
+#endif
+
+ /*
+ * Find the offset right after the last word in the space map
+ * and use that to get a hold of the last block, so we can
+ * start appending to it.
+ */
+ uint64_t next_word_offset = sm->sm_phys->smp_length;
+ VERIFY0(dmu_buf_hold(sm->sm_os, space_map_object(sm),
+ next_word_offset, FTAG, &db, DMU_READ_PREFETCH));
+ ASSERT3U(db->db_size, ==, sm->sm_blksz);
+
+ dmu_buf_will_dirty(db, tx);
+
+ zfs_btree_t *t = &rt->rt_root;
+ zfs_btree_index_t where;
+ for (range_seg_t *rs = zfs_btree_first(t, &where); rs != NULL;
+ rs = zfs_btree_next(t, &where, &where)) {
+ uint64_t offset = (rs_get_start(rs, rt) - sm->sm_start) >>
+ sm->sm_shift;
+ uint64_t length = (rs_get_end(rs, rt) - rs_get_start(rs, rt)) >>
+ sm->sm_shift;
+ uint8_t words = 1;
+
+ /*
+ * We only write two-word entries when both of the following
+ * are true:
+ *
+ * [1] The feature is enabled.
+ * [2] The offset or run is too big for a single-word entry,
+ * or the vdev_id is set (meaning not equal to
+ * SM_NO_VDEVID).
+ *
+ * Note that for purposes of testing we've added the case that
+ * we write two-word entries occasionally when the feature is
+ * enabled and zfs_force_some_double_word_sm_entries has been
+ * set.
+ */
+ if (spa_feature_is_active(spa, SPA_FEATURE_SPACEMAP_V2) &&
+ (offset >= (1ULL << SM_OFFSET_BITS) ||
+ length > SM_RUN_MAX ||
+ vdev_id != SM_NO_VDEVID ||
+ (zfs_force_some_double_word_sm_entries &&
+ spa_get_random(100) == 0)))
+ words = 2;
+
+ space_map_write_seg(sm, rs_get_start(rs, rt), rs_get_end(rs,
+ rt), maptype, vdev_id, words, &db, FTAG, tx);
+ }
+
+ dmu_buf_rele(db, FTAG);
+
+#ifdef ZFS_DEBUG
+ /*
+ * We expect our estimation to be based on the worst case
+ * scenario [see comment in space_map_estimate_optimal_size()].
+ * Therefore we expect the actual objsize to be equal or less
+ * than whatever we estimated it to be.
+ */
+ ASSERT3U(estimated_final_objsize, >=, sm->sm_phys->smp_length);
+#endif
+}
+
+/*
+ * Note: This function manipulates the state of the given space map but
+ * does not hold any locks implicitly. Thus the caller is responsible
+ * for synchronizing writes to the space map.
+ */
+void
+space_map_write(space_map_t *sm, range_tree_t *rt, maptype_t maptype,
+ uint64_t vdev_id, dmu_tx_t *tx)
+{
+ ASSERT(dsl_pool_sync_context(dmu_objset_pool(sm->sm_os)));
+ VERIFY3U(space_map_object(sm), !=, 0);
+
+ dmu_buf_will_dirty(sm->sm_dbuf, tx);
+
+ /*
+ * This field is no longer necessary since the in-core space map
+ * now contains the object number but is maintained for backwards
+ * compatibility.
+ */
+ sm->sm_phys->smp_object = sm->sm_object;
+
+ if (range_tree_is_empty(rt)) {
+ VERIFY3U(sm->sm_object, ==, sm->sm_phys->smp_object);
+ return;
+ }
+
+ if (maptype == SM_ALLOC)
+ sm->sm_phys->smp_alloc += range_tree_space(rt);
+ else
+ sm->sm_phys->smp_alloc -= range_tree_space(rt);
+
+ uint64_t nodes = zfs_btree_numnodes(&rt->rt_root);
+ uint64_t rt_space = range_tree_space(rt);
+
+ space_map_write_impl(sm, rt, maptype, vdev_id, tx);
+
+ /*
+ * Ensure that the space_map's accounting wasn't changed
+ * while we were in the middle of writing it out.
+ */
+ VERIFY3U(nodes, ==, zfs_btree_numnodes(&rt->rt_root));
+ VERIFY3U(range_tree_space(rt), ==, rt_space);
+}
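+/*
+ * Illustrative sketch (not part of the upstream import): a sync-context
+ * caller that persists the segments accumulated in an in-core range tree
+ * and then empties the tree. 'allocs' is a hypothetical tree of newly
+ * allocated segments maintained by the caller.
+ */
+#if 0 /* example only */
+static void
+example_sync_allocs(space_map_t *sm, range_tree_t *allocs, dmu_tx_t *tx)
+{
+ space_map_write(sm, allocs, SM_ALLOC, SM_NO_VDEVID, tx);
+ range_tree_vacate(allocs, NULL, NULL);
+}
+#endif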
+
+static int
+space_map_open_impl(space_map_t *sm)
+{
+ int error;
+ u_longlong_t blocks;
+
+ error = dmu_bonus_hold(sm->sm_os, sm->sm_object, sm, &sm->sm_dbuf);
+ if (error)
+ return (error);
+
+ dmu_object_size_from_db(sm->sm_dbuf, &sm->sm_blksz, &blocks);
+ sm->sm_phys = sm->sm_dbuf->db_data;
+ return (0);
+}
+
+int
+space_map_open(space_map_t **smp, objset_t *os, uint64_t object,
+ uint64_t start, uint64_t size, uint8_t shift)
+{
+ space_map_t *sm;
+ int error;
+
+ ASSERT(*smp == NULL);
+ ASSERT(os != NULL);
+ ASSERT(object != 0);
+
+ sm = kmem_alloc(sizeof (space_map_t), KM_SLEEP);
+
+ sm->sm_start = start;
+ sm->sm_size = size;
+ sm->sm_shift = shift;
+ sm->sm_os = os;
+ sm->sm_object = object;
+ sm->sm_blksz = 0;
+ sm->sm_dbuf = NULL;
+ sm->sm_phys = NULL;
+
+ error = space_map_open_impl(sm);
+ if (error != 0) {
+ space_map_close(sm);
+ return (error);
+ }
+ *smp = sm;
+
+ return (0);
+}
+
+void
+space_map_close(space_map_t *sm)
+{
+ if (sm == NULL)
+ return;
+
+ if (sm->sm_dbuf != NULL)
+ dmu_buf_rele(sm->sm_dbuf, sm);
+ sm->sm_dbuf = NULL;
+ sm->sm_phys = NULL;
+
+ kmem_free(sm, sizeof (*sm));
+}
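+/*
+ * Illustrative sketch (not part of the upstream import): the typical
+ * consumer pattern for the functions above - open a space map object,
+ * load its contents into an (initially empty) range tree, and close it
+ * again. The parameters are placeholders supplied by the caller.
+ */
+#if 0 /* example only */
+static int
+space_map_load_example(objset_t *os, uint64_t smobj, uint64_t start,
+ uint64_t size, uint8_t shift, range_tree_t *rt)
+{
+ space_map_t *sm = NULL;
+ int error = space_map_open(&sm, os, smobj, start, size, shift);
+
+ if (error != 0)
+ return (error);
+
+ /* Adds all SM_ALLOC segments of the space map to 'rt'. */
+ error = space_map_load(sm, rt, SM_ALLOC);
+
+ space_map_close(sm);
+ return (error);
+}
+#endif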
+
+void
+space_map_truncate(space_map_t *sm, int blocksize, dmu_tx_t *tx)
+{
+ objset_t *os = sm->sm_os;
+ spa_t *spa = dmu_objset_spa(os);
+ dmu_object_info_t doi;
+
+ ASSERT(dsl_pool_sync_context(dmu_objset_pool(os)));
+ ASSERT(dmu_tx_is_syncing(tx));
+ VERIFY3U(dmu_tx_get_txg(tx), <=, spa_final_dirty_txg(spa));
+
+ dmu_object_info_from_db(sm->sm_dbuf, &doi);
+
+ /*
+ * If the space map has the wrong bonus size (because
+ * SPA_FEATURE_SPACEMAP_HISTOGRAM has recently been enabled), or
+ * the wrong block size (because space_map_blksz has changed),
+ * free and re-allocate its object with the updated sizes.
+ *
+ * Otherwise, just truncate the current object.
+ */
+ if ((spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM) &&
+ doi.doi_bonus_size != sizeof (space_map_phys_t)) ||
+ doi.doi_data_block_size != blocksize ||
+ doi.doi_metadata_block_size != 1 << space_map_ibs) {
+ zfs_dbgmsg("txg %llu, spa %s, sm %px, reallocating "
+ "object[%llu]: old bonus %u, old blocksz %u",
+ dmu_tx_get_txg(tx), spa_name(spa), sm, sm->sm_object,
+ doi.doi_bonus_size, doi.doi_data_block_size);
+
+ space_map_free(sm, tx);
+ dmu_buf_rele(sm->sm_dbuf, sm);
+
+ sm->sm_object = space_map_alloc(sm->sm_os, blocksize, tx);
+ VERIFY0(space_map_open_impl(sm));
+ } else {
+ VERIFY0(dmu_free_range(os, space_map_object(sm), 0, -1ULL, tx));
+
+ /*
+ * If the spacemap is reallocated, its histogram
+ * will be reset. Do the same in the common case so that
+ * bugs related to the uncommon case do not go unnoticed.
+ */
+ bzero(sm->sm_phys->smp_histogram,
+ sizeof (sm->sm_phys->smp_histogram));
+ }
+
+ dmu_buf_will_dirty(sm->sm_dbuf, tx);
+ sm->sm_phys->smp_length = 0;
+ sm->sm_phys->smp_alloc = 0;
+}
+
+uint64_t
+space_map_alloc(objset_t *os, int blocksize, dmu_tx_t *tx)
+{
+ spa_t *spa = dmu_objset_spa(os);
+ uint64_t object;
+ int bonuslen;
+
+ if (spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM)) {
+ spa_feature_incr(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM, tx);
+ bonuslen = sizeof (space_map_phys_t);
+ ASSERT3U(bonuslen, <=, dmu_bonus_max());
+ } else {
+ bonuslen = SPACE_MAP_SIZE_V0;
+ }
+
+ object = dmu_object_alloc_ibs(os, DMU_OT_SPACE_MAP, blocksize,
+ space_map_ibs, DMU_OT_SPACE_MAP_HEADER, bonuslen, tx);
+
+ return (object);
+}
+
+void
+space_map_free_obj(objset_t *os, uint64_t smobj, dmu_tx_t *tx)
+{
+ spa_t *spa = dmu_objset_spa(os);
+ if (spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM)) {
+ dmu_object_info_t doi;
+
+ VERIFY0(dmu_object_info(os, smobj, &doi));
+ if (doi.doi_bonus_size != SPACE_MAP_SIZE_V0) {
+ spa_feature_decr(spa,
+ SPA_FEATURE_SPACEMAP_HISTOGRAM, tx);
+ }
+ }
+
+ VERIFY0(dmu_object_free(os, smobj, tx));
+}
+
+void
+space_map_free(space_map_t *sm, dmu_tx_t *tx)
+{
+ if (sm == NULL)
+ return;
+
+ space_map_free_obj(sm->sm_os, space_map_object(sm), tx);
+ sm->sm_object = 0;
+}
+
+/*
+ * Given a range tree, make a worst-case estimate of how much space
+ * the tree's segments would take if they were written to the given
+ * space map.
+ */
+uint64_t
+space_map_estimate_optimal_size(space_map_t *sm, range_tree_t *rt,
+ uint64_t vdev_id)
+{
+ spa_t *spa = dmu_objset_spa(sm->sm_os);
+ uint64_t shift = sm->sm_shift;
+ uint64_t *histogram = rt->rt_histogram;
+ uint64_t entries_for_seg = 0;
+
+ /*
+ * In order to get a quick estimate of the optimal size that this
+ * range tree would have on-disk as a space map, we iterate through
+ * its histogram buckets instead of iterating through its nodes.
+ *
+ * Note that this is a highest-bound/worst-case estimate for the
+ * following reasons:
+ *
+ * 1] We assume that we always add a debug padding for each block
+ * we write and we also assume that we start at the last word
+ * of a block attempting to write a two-word entry.
+ * 2] Rounding up errors due to the way segments are distributed
+ * in the buckets of the range tree's histogram.
+ * 3] The activation of zfs_force_some_double_word_sm_entries
+ * (tunable) when testing.
+ *
+ * = Math and Rounding Errors =
+ *
+ * rt_histogram[i] bucket of a range tree represents the number
+ * of entries in [2^i, (2^(i+1))-1] of that range_tree. Given
+ * that, we want to divide the buckets into groups: Buckets that
+ * can be represented using a single-word entry, ones that can
+ * be represented with a double-word entry, and ones that can
+ * only be represented with multiple two-word entries.
+ *
+ * [Note that if the new encoding feature is not enabled there
+ * are only two groups: single-word entry buckets and multiple
+ * single-word entry buckets. The information below assumes
+ * two-word entries are enabled, but it can easily be applied when
+ * the feature is not enabled.]
+ *
+ * To find the highest bucket that can be represented with a
+ * single-word entry we look at the maximum run that such entry
+ * can have, which is 2^(SM_RUN_BITS + sm_shift) [remember that
+ * the run of a space map entry is shifted by sm_shift, thus we
+ * add it to the exponent]. This way, excluding the value of the
+ * maximum run that can be represented by a single-word entry,
+ * all runs that are smaller exist in buckets 0 to
+ * SM_RUN_BITS + shift - 1.
+ *
+ * To find the highest bucket that can be represented with a
+ * double-word entry, we follow the same approach. Finally, any
+ * buckets higher than that are represented with multiple two-word
+ * entries. To be more specific, if the highest bucket whose
+ * segments can be represented with a single two-word entry is X,
+ * then bucket X+1 will need 2 two-word entries for each of its
+ * segments, X+2 will need 4, X+3 will need 8, ...etc.
+ *
+ * With all of the above we make our estimation based on bucket
+ * groups. There is a rounding error though. As we mentioned in
+ * the example with the one-word entry, the maximum run that can
+ * be represented in a one-word entry 2^(SM_RUN_BITS + shift) is
+ * not part of bucket SM_RUN_BITS + shift - 1. Thus, segments of
+ * that length fall into the next bucket (and bucket group) where
+ * we start counting two-word entries and this is one more reason
+ * why the estimated size may end up being bigger than the actual
+ * size written.
+ */
+ uint64_t size = 0;
+ uint64_t idx = 0;
+
+ if (!spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_V2) ||
+ (vdev_id == SM_NO_VDEVID && sm->sm_size < SM_OFFSET_MAX)) {
+
+ /*
+ * If we are trying to force some double-word entries, just
+ * assume the worst case of every single-word entry being
+ * written as a double-word entry.
+ */
+ uint64_t entry_size =
+ (spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_V2) &&
+ zfs_force_some_double_word_sm_entries) ?
+ (2 * sizeof (uint64_t)) : sizeof (uint64_t);
+
+ uint64_t single_entry_max_bucket = SM_RUN_BITS + shift - 1;
+ for (; idx <= single_entry_max_bucket; idx++)
+ size += histogram[idx] * entry_size;
+
+ if (!spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_V2)) {
+ for (; idx < RANGE_TREE_HISTOGRAM_SIZE; idx++) {
+ ASSERT3U(idx, >=, single_entry_max_bucket);
+ entries_for_seg =
+ 1ULL << (idx - single_entry_max_bucket);
+ size += histogram[idx] *
+ entries_for_seg * entry_size;
+ }
+ return (size);
+ }
+ }
+
+ ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_V2));
+
+ uint64_t double_entry_max_bucket = SM2_RUN_BITS + shift - 1;
+ for (; idx <= double_entry_max_bucket; idx++)
+ size += histogram[idx] * 2 * sizeof (uint64_t);
+
+ for (; idx < RANGE_TREE_HISTOGRAM_SIZE; idx++) {
+ ASSERT3U(idx, >=, double_entry_max_bucket);
+ entries_for_seg = 1ULL << (idx - double_entry_max_bucket);
+ size += histogram[idx] *
+ entries_for_seg * 2 * sizeof (uint64_t);
+ }
+
+ /*
+ * Assume the worst case where we start with the padding at the end
+ * of the current block and we add an extra padding entry at the end
+ * of all subsequent blocks.
+ */
+ size += ((size / sm->sm_blksz) + 1) * sizeof (uint64_t);
+
+ return (size);
+}
+
+uint64_t
+space_map_object(space_map_t *sm)
+{
+ return (sm != NULL ? sm->sm_object : 0);
+}
+
+int64_t
+space_map_allocated(space_map_t *sm)
+{
+ return (sm != NULL ? sm->sm_phys->smp_alloc : 0);
+}
+
+uint64_t
+space_map_length(space_map_t *sm)
+{
+ return (sm != NULL ? sm->sm_phys->smp_length : 0);
+}
+
+uint64_t
+space_map_nblocks(space_map_t *sm)
+{
+ if (sm == NULL)
+ return (0);
+ return (DIV_ROUND_UP(space_map_length(sm), sm->sm_blksz));
+}
diff --git a/sys/contrib/openzfs/module/zfs/space_reftree.c b/sys/contrib/openzfs/module/zfs/space_reftree.c
new file mode 100644
index 000000000000..080fc6646512
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/space_reftree.c
@@ -0,0 +1,152 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * Copyright (c) 2013, 2019 by Delphix. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/range_tree.h>
+#include <sys/space_reftree.h>
+
+/*
+ * Space reference trees.
+ *
+ * A range tree is a collection of integers. Every integer is either
+ * in the tree, or it's not. A space reference tree generalizes
+ * the idea: it allows its members to have arbitrary reference counts,
+ * as opposed to the implicit reference count of 0 or 1 in a range tree.
+ * This representation comes in handy when computing the union or
+ * intersection of multiple space maps. For example, the union of
+ * N range trees is the subset of the reference tree with refcnt >= 1.
+ * The intersection of N range trees is the subset with refcnt >= N.
+ *
+ * [It's very much like a Fourier transform. Unions and intersections
+ * are hard to perform in the 'range tree domain', so we convert the trees
+ * into the 'reference count domain', where it's trivial, then invert.]
+ *
+ * vdev_dtl_reassess() uses computations of this form to determine
+ * DTL_MISSING and DTL_OUTAGE for interior vdevs -- e.g. a RAID-Z vdev
+ * has an outage wherever refcnt >= vdev_nparity + 1, and a mirror vdev
+ * has an outage wherever refcnt >= vdev_children.
+ */
+static int
+space_reftree_compare(const void *x1, const void *x2)
+{
+ const space_ref_t *sr1 = (const space_ref_t *)x1;
+ const space_ref_t *sr2 = (const space_ref_t *)x2;
+
+ int cmp = TREE_CMP(sr1->sr_offset, sr2->sr_offset);
+ if (likely(cmp))
+ return (cmp);
+
+ return (TREE_PCMP(sr1, sr2));
+}
+
+void
+space_reftree_create(avl_tree_t *t)
+{
+ avl_create(t, space_reftree_compare,
+ sizeof (space_ref_t), offsetof(space_ref_t, sr_node));
+}
+
+void
+space_reftree_destroy(avl_tree_t *t)
+{
+ space_ref_t *sr;
+ void *cookie = NULL;
+
+ while ((sr = avl_destroy_nodes(t, &cookie)) != NULL)
+ kmem_free(sr, sizeof (*sr));
+
+ avl_destroy(t);
+}
+
+static void
+space_reftree_add_node(avl_tree_t *t, uint64_t offset, int64_t refcnt)
+{
+ space_ref_t *sr;
+
+ sr = kmem_alloc(sizeof (*sr), KM_SLEEP);
+ sr->sr_offset = offset;
+ sr->sr_refcnt = refcnt;
+
+ avl_add(t, sr);
+}
+
+void
+space_reftree_add_seg(avl_tree_t *t, uint64_t start, uint64_t end,
+ int64_t refcnt)
+{
+ space_reftree_add_node(t, start, refcnt);
+ space_reftree_add_node(t, end, -refcnt);
+}
+
+/*
+ * Convert (or add) a range tree into a reference tree.
+ */
+void
+space_reftree_add_map(avl_tree_t *t, range_tree_t *rt, int64_t refcnt)
+{
+ zfs_btree_index_t where;
+
+ for (range_seg_t *rs = zfs_btree_first(&rt->rt_root, &where); rs; rs =
+ zfs_btree_next(&rt->rt_root, &where, &where)) {
+ space_reftree_add_seg(t, rs_get_start(rs, rt), rs_get_end(rs,
+ rt), refcnt);
+ }
+}
+
+/*
+ * Convert a reference tree into a range tree. The range tree will contain
+ * all members of the reference tree for which refcnt >= minref.
+ */
+void
+space_reftree_generate_map(avl_tree_t *t, range_tree_t *rt, int64_t minref)
+{
+ uint64_t start = -1ULL;
+ int64_t refcnt = 0;
+ space_ref_t *sr;
+
+ range_tree_vacate(rt, NULL, NULL);
+
+ for (sr = avl_first(t); sr != NULL; sr = AVL_NEXT(t, sr)) {
+ refcnt += sr->sr_refcnt;
+ if (refcnt >= minref) {
+ if (start == -1ULL) {
+ start = sr->sr_offset;
+ }
+ } else {
+ if (start != -1ULL) {
+ uint64_t end = sr->sr_offset;
+ ASSERT(start <= end);
+ if (end > start)
+ range_tree_add(rt, start, end - start);
+ start = -1ULL;
+ }
+ }
+ }
+ ASSERT(refcnt == 0);
+ ASSERT(start == -1ULL);
+}
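+/*
+ * Illustrative sketch (not part of the upstream import): computing the
+ * union of two range trees with the reference tree primitives above.
+ * Every segment present in either 'rt1' or 'rt2' ends up in 'result'
+ * (refcnt >= 1); passing minref = 2 instead would yield the intersection.
+ */
+#if 0 /* example only */
+static void
+range_tree_union_example(range_tree_t *rt1, range_tree_t *rt2,
+ range_tree_t *result)
+{
+ avl_tree_t reftree;
+
+ space_reftree_create(&reftree);
+ space_reftree_add_map(&reftree, rt1, 1);
+ space_reftree_add_map(&reftree, rt2, 1);
+ space_reftree_generate_map(&reftree, result, 1);
+ space_reftree_destroy(&reftree);
+}
+#endif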
diff --git a/sys/contrib/openzfs/module/zfs/txg.c b/sys/contrib/openzfs/module/zfs/txg.c
new file mode 100644
index 000000000000..497e19dd58eb
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/txg.c
@@ -0,0 +1,1076 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Portions Copyright 2011 Martin Matuska
+ * Copyright (c) 2012, 2019 by Delphix. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/txg_impl.h>
+#include <sys/dmu_impl.h>
+#include <sys/spa_impl.h>
+#include <sys/dmu_tx.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_scan.h>
+#include <sys/zil.h>
+#include <sys/callb.h>
+#include <sys/trace_zfs.h>
+
+/*
+ * ZFS Transaction Groups
+ * ----------------------
+ *
+ * ZFS transaction groups are, as the name implies, groups of transactions
+ * that act on persistent state. ZFS asserts consistency at the granularity of
+ * these transaction groups. Each successive transaction group (txg) is
+ * assigned a 64-bit consecutive identifier. There are three active
+ * transaction group states: open, quiescing, or syncing. At any given time,
+ * there may be an active txg associated with each state; each active txg may
+ * either be processing, or blocked waiting to enter the next state. There may
+ * be up to three active txgs, and there is always a txg in the open state
+ * (though it may be blocked waiting to enter the quiescing state). In broad
+ * strokes, transactions -- operations that change in-memory structures -- are
+ * accepted into the txg in the open state, and are completed while the txg is
+ * in the open or quiescing states. The accumulated changes are written to
+ * disk in the syncing state.
+ *
+ * Open
+ *
+ * When a new txg becomes active, it first enters the open state. New
+ * transactions -- updates to in-memory structures -- are assigned to the
+ * currently open txg. There is always a txg in the open state so that ZFS can
+ * accept new changes (though the txg may refuse new changes if it has hit
+ * some limit). ZFS advances the open txg to the next state for a variety of
+ * reasons such as it hitting a time or size threshold, or the execution of an
+ * administrative action that must be completed in the syncing state.
+ *
+ * Quiescing
+ *
+ * After a txg exits the open state, it enters the quiescing state. The
+ * quiescing state is intended to provide a buffer between accepting new
+ * transactions in the open state and writing them out to stable storage in
+ * the syncing state. While quiescing, transactions can continue their
+ * operation without delaying either of the other states. Typically, a txg is
+ * in the quiescing state very briefly since the operations are bounded by
+ * software latencies rather than, say, slower I/O latencies. After all
+ * transactions complete, the txg is ready to enter the next state.
+ *
+ * Syncing
+ *
+ * In the syncing state, the in-memory state built up during the open and (to
+ * a lesser degree) the quiescing states is written to stable storage. The
+ * process of writing out modified data can, in turn, modify more data. For
+ * example, when we write new blocks, we need to allocate space for them; those
+ * allocations modify metadata (space maps)... which themselves must be
+ * written to stable storage. During the sync state, ZFS iterates, writing out
+ * data until it converges and all in-memory changes have been written out.
+ * The first such pass is the largest as it encompasses all the modified user
+ * data (as opposed to filesystem metadata). Subsequent passes typically have
+ * far less data to write as they consist exclusively of filesystem metadata.
+ *
+ * To ensure convergence, after a certain number of passes ZFS begins
+ * overwriting locations on stable storage that had been allocated earlier in
+ * the syncing state (and subsequently freed). ZFS usually allocates new
+ * blocks to optimize for large, continuous writes. For the syncing state to
+ * converge, however, it must complete a pass where no new blocks are allocated
+ * since each allocation requires a modification of persistent metadata.
+ * Further, to hasten convergence, after a prescribed number of passes, ZFS
+ * also defers frees, and stops compressing.
+ *
+ * In addition to writing out user data, we must also execute synctasks during
+ * the syncing context. A synctask is the mechanism by which some
+ * administrative activities work, such as creating and destroying snapshots or
+ * datasets. Note that when a synctask is initiated it enters the open txg,
+ * and ZFS then pushes that txg as quickly as possible to completion of the
+ * syncing state in order to reduce the latency of the administrative
+ * activity. To complete the syncing state, ZFS writes out a new uberblock,
+ * the root of the tree of blocks that comprise all state stored on the ZFS
+ * pool. Finally, if there is a quiesced txg waiting, we signal that it can
+ * now transition to the syncing state.
+ */
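+
+/*
+ * A minimal sketch of the pipeline described above (informal; the txg
+ * numbers are hypothetical and the labels do not correspond to symbols
+ * in this file):
+ *
+ *   txg 10: open       - new transactions are assigned here
+ *   txg  9: quiescing  - waiting for its open handles to be released
+ *   txg  8: syncing    - spa_sync() is writing it to stable storage
+ *
+ * When txg 8 finishes syncing, the quiesced txg 9 is handed to the sync
+ * thread, txg 10 can move to the quiescing state, and txg 11 is opened.
+ */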
+
+static void txg_sync_thread(void *arg);
+static void txg_quiesce_thread(void *arg);
+
+int zfs_txg_timeout = 5; /* max seconds worth of delta per txg */
+
+/*
+ * Prepare the txg subsystem.
+ */
+void
+txg_init(dsl_pool_t *dp, uint64_t txg)
+{
+ tx_state_t *tx = &dp->dp_tx;
+ int c;
+ bzero(tx, sizeof (tx_state_t));
+
+ tx->tx_cpu = vmem_zalloc(max_ncpus * sizeof (tx_cpu_t), KM_SLEEP);
+
+ for (c = 0; c < max_ncpus; c++) {
+ int i;
+
+ mutex_init(&tx->tx_cpu[c].tc_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&tx->tx_cpu[c].tc_open_lock, NULL, MUTEX_NOLOCKDEP,
+ NULL);
+ for (i = 0; i < TXG_SIZE; i++) {
+ cv_init(&tx->tx_cpu[c].tc_cv[i], NULL, CV_DEFAULT,
+ NULL);
+ list_create(&tx->tx_cpu[c].tc_callbacks[i],
+ sizeof (dmu_tx_callback_t),
+ offsetof(dmu_tx_callback_t, dcb_node));
+ }
+ }
+
+ mutex_init(&tx->tx_sync_lock, NULL, MUTEX_DEFAULT, NULL);
+
+ cv_init(&tx->tx_sync_more_cv, NULL, CV_DEFAULT, NULL);
+ cv_init(&tx->tx_sync_done_cv, NULL, CV_DEFAULT, NULL);
+ cv_init(&tx->tx_quiesce_more_cv, NULL, CV_DEFAULT, NULL);
+ cv_init(&tx->tx_quiesce_done_cv, NULL, CV_DEFAULT, NULL);
+ cv_init(&tx->tx_exit_cv, NULL, CV_DEFAULT, NULL);
+
+ tx->tx_open_txg = txg;
+}
+
+/*
+ * Close down the txg subsystem.
+ */
+void
+txg_fini(dsl_pool_t *dp)
+{
+ tx_state_t *tx = &dp->dp_tx;
+ int c;
+
+ ASSERT0(tx->tx_threads);
+
+ mutex_destroy(&tx->tx_sync_lock);
+
+ cv_destroy(&tx->tx_sync_more_cv);
+ cv_destroy(&tx->tx_sync_done_cv);
+ cv_destroy(&tx->tx_quiesce_more_cv);
+ cv_destroy(&tx->tx_quiesce_done_cv);
+ cv_destroy(&tx->tx_exit_cv);
+
+ for (c = 0; c < max_ncpus; c++) {
+ int i;
+
+ mutex_destroy(&tx->tx_cpu[c].tc_open_lock);
+ mutex_destroy(&tx->tx_cpu[c].tc_lock);
+ for (i = 0; i < TXG_SIZE; i++) {
+ cv_destroy(&tx->tx_cpu[c].tc_cv[i]);
+ list_destroy(&tx->tx_cpu[c].tc_callbacks[i]);
+ }
+ }
+
+ if (tx->tx_commit_cb_taskq != NULL)
+ taskq_destroy(tx->tx_commit_cb_taskq);
+
+ vmem_free(tx->tx_cpu, max_ncpus * sizeof (tx_cpu_t));
+
+ bzero(tx, sizeof (tx_state_t));
+}
+
+/*
+ * Start syncing transaction groups.
+ */
+void
+txg_sync_start(dsl_pool_t *dp)
+{
+ tx_state_t *tx = &dp->dp_tx;
+
+ mutex_enter(&tx->tx_sync_lock);
+
+ dprintf("pool %p\n", dp);
+
+ ASSERT0(tx->tx_threads);
+
+ tx->tx_threads = 2;
+
+ tx->tx_quiesce_thread = thread_create(NULL, 0, txg_quiesce_thread,
+ dp, 0, &p0, TS_RUN, defclsyspri);
+
+ /*
+ * The sync thread can need a larger-than-default stack size on
+ * 32-bit x86. This is due in part to nested pools and
+ * scrub_visitbp() recursion.
+ */
+ tx->tx_sync_thread = thread_create(NULL, 0, txg_sync_thread,
+ dp, 0, &p0, TS_RUN, defclsyspri);
+
+ mutex_exit(&tx->tx_sync_lock);
+}
+
+static void
+txg_thread_enter(tx_state_t *tx, callb_cpr_t *cpr)
+{
+ CALLB_CPR_INIT(cpr, &tx->tx_sync_lock, callb_generic_cpr, FTAG);
+ mutex_enter(&tx->tx_sync_lock);
+}
+
+static void
+txg_thread_exit(tx_state_t *tx, callb_cpr_t *cpr, kthread_t **tpp)
+{
+ ASSERT(*tpp != NULL);
+ *tpp = NULL;
+ tx->tx_threads--;
+ cv_broadcast(&tx->tx_exit_cv);
+ CALLB_CPR_EXIT(cpr); /* drops &tx->tx_sync_lock */
+ thread_exit();
+}
+
+static void
+txg_thread_wait(tx_state_t *tx, callb_cpr_t *cpr, kcondvar_t *cv, clock_t time)
+{
+ CALLB_CPR_SAFE_BEGIN(cpr);
+
+ if (time) {
+ (void) cv_timedwait_idle(cv, &tx->tx_sync_lock,
+ ddi_get_lbolt() + time);
+ } else {
+ cv_wait_idle(cv, &tx->tx_sync_lock);
+ }
+
+ CALLB_CPR_SAFE_END(cpr, &tx->tx_sync_lock);
+}
+
+/*
+ * Stop syncing transaction groups.
+ */
+void
+txg_sync_stop(dsl_pool_t *dp)
+{
+ tx_state_t *tx = &dp->dp_tx;
+
+ dprintf("pool %p\n", dp);
+ /*
+ * Finish off any work in progress.
+ */
+ ASSERT3U(tx->tx_threads, ==, 2);
+
+ /*
+ * We need to ensure that we've vacated the deferred metaslab trees.
+ */
+ txg_wait_synced(dp, tx->tx_open_txg + TXG_DEFER_SIZE);
+
+ /*
+ * Wake all sync threads and wait for them to die.
+ */
+ mutex_enter(&tx->tx_sync_lock);
+
+ ASSERT3U(tx->tx_threads, ==, 2);
+
+ tx->tx_exiting = 1;
+
+ cv_broadcast(&tx->tx_quiesce_more_cv);
+ cv_broadcast(&tx->tx_quiesce_done_cv);
+ cv_broadcast(&tx->tx_sync_more_cv);
+
+ while (tx->tx_threads != 0)
+ cv_wait(&tx->tx_exit_cv, &tx->tx_sync_lock);
+
+ tx->tx_exiting = 0;
+
+ mutex_exit(&tx->tx_sync_lock);
+}
+
+/*
+ * Get a handle on the currently open txg and keep it open.
+ *
+ * The txg is guaranteed to stay open until txg_rele_to_quiesce() is called for
+ * the handle. Once txg_rele_to_quiesce() has been called, the txg stays
+ * in quiescing state until txg_rele_to_sync() is called for the handle.
+ *
+ * It is guaranteed that subsequent calls return monotonically increasing
+ * txgs for the same dsl_pool_t. Of course this is not strong monotonicity,
+ * because the same txg can be returned multiple times in a row. This
+ * guarantee holds both for subsequent calls from one thread and for multiple
+ * threads. For example, it is impossible to observe the following sequence
+ * of events:
+ *
+ * Thread 1 Thread 2
+ *
+ * 1 <- txg_hold_open(P, ...)
+ * 2 <- txg_hold_open(P, ...)
+ * 1 <- txg_hold_open(P, ...)
+ *
+ */
+uint64_t
+txg_hold_open(dsl_pool_t *dp, txg_handle_t *th)
+{
+ tx_state_t *tx = &dp->dp_tx;
+ tx_cpu_t *tc;
+ uint64_t txg;
+
+ /*
+ * The processor id is used only as a "random" number to index
+ * into the array; there is no other significance to the chosen
+ * tx_cpu. Using the current CPU spreads concurrent holds across
+ * the per-CPU structures and so reduces lock contention.
+ */
+ tc = &tx->tx_cpu[CPU_SEQID_UNSTABLE];
+
+ mutex_enter(&tc->tc_open_lock);
+ txg = tx->tx_open_txg;
+
+ mutex_enter(&tc->tc_lock);
+ tc->tc_count[txg & TXG_MASK]++;
+ mutex_exit(&tc->tc_lock);
+
+ th->th_cpu = tc;
+ th->th_txg = txg;
+
+ return (txg);
+}
+
+void
+txg_rele_to_quiesce(txg_handle_t *th)
+{
+ tx_cpu_t *tc = th->th_cpu;
+
+ ASSERT(!MUTEX_HELD(&tc->tc_lock));
+ mutex_exit(&tc->tc_open_lock);
+}
+
+void
+txg_register_callbacks(txg_handle_t *th, list_t *tx_callbacks)
+{
+ tx_cpu_t *tc = th->th_cpu;
+ int g = th->th_txg & TXG_MASK;
+
+ mutex_enter(&tc->tc_lock);
+ list_move_tail(&tc->tc_callbacks[g], tx_callbacks);
+ mutex_exit(&tc->tc_lock);
+}
+
+void
+txg_rele_to_sync(txg_handle_t *th)
+{
+ tx_cpu_t *tc = th->th_cpu;
+ int g = th->th_txg & TXG_MASK;
+
+ mutex_enter(&tc->tc_lock);
+ ASSERT(tc->tc_count[g] != 0);
+ if (--tc->tc_count[g] == 0)
+ cv_broadcast(&tc->tc_cv[g]);
+ mutex_exit(&tc->tc_lock);
+
+ th->th_cpu = NULL; /* defensive */
+}
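+
+/*
+ * Typical use of the handle protocol above (hypothetical sketch; the
+ * real callers live in dmu_tx.c and are more involved):
+ *
+ *   txg_handle_t th;
+ *   uint64_t txg = txg_hold_open(dp, &th);
+ *   txg_rele_to_quiesce(&th);     (txg may now begin quiescing)
+ *   ... apply the in-memory changes assigned to txg ...
+ *   txg_rele_to_sync(&th);        (txg may now finish quiescing)
+ */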
+
+/*
+ * Blocks until all transactions in the group are committed.
+ *
+ * On return, the transaction group has reached a stable state in which it can
+ * then be passed off to the syncing context.
+ */
+static void
+txg_quiesce(dsl_pool_t *dp, uint64_t txg)
+{
+ tx_state_t *tx = &dp->dp_tx;
+ uint64_t tx_open_time;
+ int g = txg & TXG_MASK;
+ int c;
+
+ /*
+ * Grab all tc_open_locks so nobody else can get into this txg.
+ */
+ for (c = 0; c < max_ncpus; c++)
+ mutex_enter(&tx->tx_cpu[c].tc_open_lock);
+
+ ASSERT(txg == tx->tx_open_txg);
+ tx->tx_open_txg++;
+ tx->tx_open_time = tx_open_time = gethrtime();
+
+ DTRACE_PROBE2(txg__quiescing, dsl_pool_t *, dp, uint64_t, txg);
+ DTRACE_PROBE2(txg__opened, dsl_pool_t *, dp, uint64_t, tx->tx_open_txg);
+
+ /*
+ * Now that we've incremented tx_open_txg, we can let threads
+ * enter the next transaction group.
+ */
+ for (c = 0; c < max_ncpus; c++)
+ mutex_exit(&tx->tx_cpu[c].tc_open_lock);
+
+ spa_txg_history_set(dp->dp_spa, txg, TXG_STATE_OPEN, tx_open_time);
+ spa_txg_history_add(dp->dp_spa, txg + 1, tx_open_time);
+
+ /*
+ * Quiesce the transaction group by waiting for everyone to
+ * call txg_rele_to_sync() for their open transaction handles.
+ */
+ for (c = 0; c < max_ncpus; c++) {
+ tx_cpu_t *tc = &tx->tx_cpu[c];
+ mutex_enter(&tc->tc_lock);
+ while (tc->tc_count[g] != 0)
+ cv_wait(&tc->tc_cv[g], &tc->tc_lock);
+ mutex_exit(&tc->tc_lock);
+ }
+
+ spa_txg_history_set(dp->dp_spa, txg, TXG_STATE_QUIESCED, gethrtime());
+}
+
+static void
+txg_do_callbacks(list_t *cb_list)
+{
+ dmu_tx_do_callbacks(cb_list, 0);
+
+ list_destroy(cb_list);
+
+ kmem_free(cb_list, sizeof (list_t));
+}
+
+/*
+ * Dispatch the commit callbacks registered on this txg to worker threads.
+ *
+ * If no callbacks are registered for a given TXG, nothing happens.
+ * This function creates a taskq for the associated pool, if needed.
+ */
+static void
+txg_dispatch_callbacks(dsl_pool_t *dp, uint64_t txg)
+{
+ int c;
+ tx_state_t *tx = &dp->dp_tx;
+ list_t *cb_list;
+
+ for (c = 0; c < max_ncpus; c++) {
+ tx_cpu_t *tc = &tx->tx_cpu[c];
+ /*
+ * No need to lock tx_cpu_t at this point, since this can
+ * only be called once a txg has been synced.
+ */
+
+ int g = txg & TXG_MASK;
+
+ if (list_is_empty(&tc->tc_callbacks[g]))
+ continue;
+
+ if (tx->tx_commit_cb_taskq == NULL) {
+ /*
+ * Commit callback taskq hasn't been created yet.
+ */
+ tx->tx_commit_cb_taskq = taskq_create("tx_commit_cb",
+ 100, defclsyspri, boot_ncpus, boot_ncpus * 2,
+ TASKQ_PREPOPULATE | TASKQ_DYNAMIC |
+ TASKQ_THREADS_CPU_PCT);
+ }
+
+ cb_list = kmem_alloc(sizeof (list_t), KM_SLEEP);
+ list_create(cb_list, sizeof (dmu_tx_callback_t),
+ offsetof(dmu_tx_callback_t, dcb_node));
+
+ list_move_tail(cb_list, &tc->tc_callbacks[g]);
+
+ (void) taskq_dispatch(tx->tx_commit_cb_taskq, (task_func_t *)
+ txg_do_callbacks, cb_list, TQ_SLEEP);
+ }
+}
+
+/*
+ * Wait for pending commit callbacks of already-synced transactions to finish
+ * processing.
+ * Calling this function from within a commit callback will deadlock.
+ */
+void
+txg_wait_callbacks(dsl_pool_t *dp)
+{
+ tx_state_t *tx = &dp->dp_tx;
+
+ if (tx->tx_commit_cb_taskq != NULL)
+ taskq_wait_outstanding(tx->tx_commit_cb_taskq, 0);
+}
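+
+/*
+ * Commit callbacks are normally attached to a transaction with
+ * dmu_tx_callback_register() (see dmu_tx.h).  A hypothetical sketch of
+ * the life cycle, assuming an objset "os" and callback argument "arg":
+ *
+ *   static void
+ *   my_commit_cb(void *arg, int error)
+ *   {
+ *           ... runs from the tx_commit_cb taskq once the callback's
+ *               txg has been synced to stable storage ...
+ *   }
+ *
+ *   dmu_tx_t *tx = dmu_tx_create(os);
+ *   VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
+ *   dmu_tx_callback_register(tx, my_commit_cb, arg);
+ *   dmu_tx_commit(tx);
+ */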
+
+static boolean_t
+txg_is_syncing(dsl_pool_t *dp)
+{
+ tx_state_t *tx = &dp->dp_tx;
+ ASSERT(MUTEX_HELD(&tx->tx_sync_lock));
+ return (tx->tx_syncing_txg != 0);
+}
+
+static boolean_t
+txg_is_quiescing(dsl_pool_t *dp)
+{
+ tx_state_t *tx = &dp->dp_tx;
+ ASSERT(MUTEX_HELD(&tx->tx_sync_lock));
+ return (tx->tx_quiescing_txg != 0);
+}
+
+static boolean_t
+txg_has_quiesced_to_sync(dsl_pool_t *dp)
+{
+ tx_state_t *tx = &dp->dp_tx;
+ ASSERT(MUTEX_HELD(&tx->tx_sync_lock));
+ return (tx->tx_quiesced_txg != 0);
+}
+
+static void
+txg_sync_thread(void *arg)
+{
+ dsl_pool_t *dp = arg;
+ spa_t *spa = dp->dp_spa;
+ tx_state_t *tx = &dp->dp_tx;
+ callb_cpr_t cpr;
+ clock_t start, delta;
+
+ (void) spl_fstrans_mark();
+ txg_thread_enter(tx, &cpr);
+
+ start = delta = 0;
+ for (;;) {
+ clock_t timeout = zfs_txg_timeout * hz;
+ clock_t timer;
+ uint64_t txg;
+ uint64_t dirty_min_bytes =
+ zfs_dirty_data_max * zfs_dirty_data_sync_percent / 100;
+
+ /*
+ * We sync when we're scanning, when there's someone waiting
+ * on us, when the quiesce thread has handed off a txg to us,
+ * when we have reached our timeout, or when enough dirty data
+ * has accumulated.
+ */
+ timer = (delta >= timeout ? 0 : timeout - delta);
+ while (!dsl_scan_active(dp->dp_scan) &&
+ !tx->tx_exiting && timer > 0 &&
+ tx->tx_synced_txg >= tx->tx_sync_txg_waiting &&
+ !txg_has_quiesced_to_sync(dp) &&
+ dp->dp_dirty_total < dirty_min_bytes) {
+ dprintf("waiting; tx_synced=%llu waiting=%llu dp=%p\n",
+ tx->tx_synced_txg, tx->tx_sync_txg_waiting, dp);
+ txg_thread_wait(tx, &cpr, &tx->tx_sync_more_cv, timer);
+ delta = ddi_get_lbolt() - start;
+ timer = (delta > timeout ? 0 : timeout - delta);
+ }
+
+ /*
+ * Wait until the quiesce thread hands off a txg to us,
+ * prompting it to do so if necessary.
+ */
+ while (!tx->tx_exiting && !txg_has_quiesced_to_sync(dp)) {
+ if (tx->tx_quiesce_txg_waiting < tx->tx_open_txg+1)
+ tx->tx_quiesce_txg_waiting = tx->tx_open_txg+1;
+ cv_broadcast(&tx->tx_quiesce_more_cv);
+ txg_thread_wait(tx, &cpr, &tx->tx_quiesce_done_cv, 0);
+ }
+
+ if (tx->tx_exiting)
+ txg_thread_exit(tx, &cpr, &tx->tx_sync_thread);
+
+ /*
+ * Consume the quiesced txg which has been handed off to
+ * us. This may cause the quiescing thread to now be
+ * able to quiesce another txg, so we must signal it.
+ */
+ ASSERT(tx->tx_quiesced_txg != 0);
+ txg = tx->tx_quiesced_txg;
+ tx->tx_quiesced_txg = 0;
+ tx->tx_syncing_txg = txg;
+ DTRACE_PROBE2(txg__syncing, dsl_pool_t *, dp, uint64_t, txg);
+ cv_broadcast(&tx->tx_quiesce_more_cv);
+
+ dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
+ txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting);
+ mutex_exit(&tx->tx_sync_lock);
+
+ txg_stat_t *ts = spa_txg_history_init_io(spa, txg, dp);
+ start = ddi_get_lbolt();
+ spa_sync(spa, txg);
+ delta = ddi_get_lbolt() - start;
+ spa_txg_history_fini_io(spa, ts);
+
+ mutex_enter(&tx->tx_sync_lock);
+ tx->tx_synced_txg = txg;
+ tx->tx_syncing_txg = 0;
+ DTRACE_PROBE2(txg__synced, dsl_pool_t *, dp, uint64_t, txg);
+ cv_broadcast(&tx->tx_sync_done_cv);
+
+ /*
+ * Dispatch commit callbacks to worker threads.
+ */
+ txg_dispatch_callbacks(dp, txg);
+ }
+}
+
+static void
+txg_quiesce_thread(void *arg)
+{
+ dsl_pool_t *dp = arg;
+ tx_state_t *tx = &dp->dp_tx;
+ callb_cpr_t cpr;
+
+ txg_thread_enter(tx, &cpr);
+
+ for (;;) {
+ uint64_t txg;
+
+ /*
+ * We quiesce when there's someone waiting on us.
+ * However, we can only have one txg in "quiescing" or
+ * "quiesced, waiting to sync" state. So we wait until
+ * the "quiesced, waiting to sync" txg has been consumed
+ * by the sync thread.
+ */
+ while (!tx->tx_exiting &&
+ (tx->tx_open_txg >= tx->tx_quiesce_txg_waiting ||
+ txg_has_quiesced_to_sync(dp)))
+ txg_thread_wait(tx, &cpr, &tx->tx_quiesce_more_cv, 0);
+
+ if (tx->tx_exiting)
+ txg_thread_exit(tx, &cpr, &tx->tx_quiesce_thread);
+
+ txg = tx->tx_open_txg;
+ dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
+ txg, tx->tx_quiesce_txg_waiting,
+ tx->tx_sync_txg_waiting);
+ tx->tx_quiescing_txg = txg;
+
+ mutex_exit(&tx->tx_sync_lock);
+ txg_quiesce(dp, txg);
+ mutex_enter(&tx->tx_sync_lock);
+
+ /*
+ * Hand this txg off to the sync thread.
+ */
+ dprintf("quiesce done, handing off txg %llu\n", txg);
+ tx->tx_quiescing_txg = 0;
+ tx->tx_quiesced_txg = txg;
+ DTRACE_PROBE2(txg__quiesced, dsl_pool_t *, dp, uint64_t, txg);
+ cv_broadcast(&tx->tx_sync_more_cv);
+ cv_broadcast(&tx->tx_quiesce_done_cv);
+ }
+}
+
+/*
+ * Delay this thread by 'delay' nanoseconds if we are still in the open
+ * transaction group and there is already a waiting txg quiescing or quiesced.
+ * Abort the delay if this txg stalls or enters the quiescing state.
+ */
+void
+txg_delay(dsl_pool_t *dp, uint64_t txg, hrtime_t delay, hrtime_t resolution)
+{
+ tx_state_t *tx = &dp->dp_tx;
+ hrtime_t start = gethrtime();
+
+ /* don't delay if this txg could transition to quiescing immediately */
+ if (tx->tx_open_txg > txg ||
+ tx->tx_syncing_txg == txg-1 || tx->tx_synced_txg == txg-1)
+ return;
+
+ mutex_enter(&tx->tx_sync_lock);
+ if (tx->tx_open_txg > txg || tx->tx_synced_txg == txg-1) {
+ mutex_exit(&tx->tx_sync_lock);
+ return;
+ }
+
+ while (gethrtime() - start < delay &&
+ tx->tx_syncing_txg < txg-1 && !txg_stalled(dp)) {
+ (void) cv_timedwait_hires(&tx->tx_quiesce_more_cv,
+ &tx->tx_sync_lock, delay, resolution, 0);
+ }
+
+ DMU_TX_STAT_BUMP(dmu_tx_delay);
+
+ mutex_exit(&tx->tx_sync_lock);
+}
+
+static boolean_t
+txg_wait_synced_impl(dsl_pool_t *dp, uint64_t txg, boolean_t wait_sig)
+{
+ tx_state_t *tx = &dp->dp_tx;
+
+ ASSERT(!dsl_pool_config_held(dp));
+
+ mutex_enter(&tx->tx_sync_lock);
+ ASSERT3U(tx->tx_threads, ==, 2);
+ if (txg == 0)
+ txg = tx->tx_open_txg + TXG_DEFER_SIZE;
+ if (tx->tx_sync_txg_waiting < txg)
+ tx->tx_sync_txg_waiting = txg;
+ dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
+ txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting);
+ while (tx->tx_synced_txg < txg) {
+ dprintf("broadcasting sync more "
+ "tx_synced=%llu waiting=%llu dp=%px\n",
+ tx->tx_synced_txg, tx->tx_sync_txg_waiting, dp);
+ cv_broadcast(&tx->tx_sync_more_cv);
+ if (wait_sig) {
+ /*
+ * Condition wait here but stop if the thread receives a
+ * signal. The caller may call txg_wait_synced*() again
+ * to resume waiting for this txg.
+ */
+ if (cv_wait_io_sig(&tx->tx_sync_done_cv,
+ &tx->tx_sync_lock) == 0) {
+ mutex_exit(&tx->tx_sync_lock);
+ return (B_TRUE);
+ }
+ } else {
+ cv_wait_io(&tx->tx_sync_done_cv, &tx->tx_sync_lock);
+ }
+ }
+ mutex_exit(&tx->tx_sync_lock);
+ return (B_FALSE);
+}
+
+void
+txg_wait_synced(dsl_pool_t *dp, uint64_t txg)
+{
+ VERIFY0(txg_wait_synced_impl(dp, txg, B_FALSE));
+}
+
+/*
+ * Similar to txg_wait_synced(), but it can be interrupted by a signal.
+ * Returns B_TRUE if the thread was signaled while waiting.
+ */
+boolean_t
+txg_wait_synced_sig(dsl_pool_t *dp, uint64_t txg)
+{
+ return (txg_wait_synced_impl(dp, txg, B_TRUE));
+}
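+
+/*
+ * A common durability pattern built on the waiters above (hypothetical
+ * sketch; "tx" is an assigned dmu_tx_t):
+ *
+ *   uint64_t txg = dmu_tx_get_txg(tx);
+ *   dmu_tx_commit(tx);
+ *   txg_wait_synced(dp, txg);     (block until the change is on disk)
+ *
+ * Passing txg == 0 instead waits for the currently open txg plus
+ * TXG_DEFER_SIZE txgs to sync, which covers everything that was dirty
+ * at the time of the call.
+ */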
+
+/*
+ * Wait for the specified open transaction group. Set should_quiesce
+ * when the current open txg should be quiesced immediately.
+ */
+void
+txg_wait_open(dsl_pool_t *dp, uint64_t txg, boolean_t should_quiesce)
+{
+ tx_state_t *tx = &dp->dp_tx;
+
+ ASSERT(!dsl_pool_config_held(dp));
+
+ mutex_enter(&tx->tx_sync_lock);
+ ASSERT3U(tx->tx_threads, ==, 2);
+ if (txg == 0)
+ txg = tx->tx_open_txg + 1;
+ if (tx->tx_quiesce_txg_waiting < txg && should_quiesce)
+ tx->tx_quiesce_txg_waiting = txg;
+ dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
+ txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting);
+ while (tx->tx_open_txg < txg) {
+ cv_broadcast(&tx->tx_quiesce_more_cv);
+ /*
+ * Callers setting should_quiesce will use cv_wait_io() and
+ * be accounted for as iowait time. Otherwise, the caller is
+ * understood to be idle and cv_wait_idle() is used to prevent
+ * incorrectly inflating the system load average.
+ */
+ if (should_quiesce == B_TRUE) {
+ cv_wait_io(&tx->tx_quiesce_done_cv, &tx->tx_sync_lock);
+ } else {
+ cv_wait_idle(&tx->tx_quiesce_done_cv,
+ &tx->tx_sync_lock);
+ }
+ }
+ mutex_exit(&tx->tx_sync_lock);
+}
+
+/*
+ * If there isn't a txg syncing or in the pipeline, push another txg through
+ * the pipeline by quiescing the open txg.
+ */
+void
+txg_kick(dsl_pool_t *dp)
+{
+ tx_state_t *tx = &dp->dp_tx;
+
+ ASSERT(!dsl_pool_config_held(dp));
+
+ mutex_enter(&tx->tx_sync_lock);
+ if (!txg_is_syncing(dp) &&
+ !txg_is_quiescing(dp) &&
+ tx->tx_quiesce_txg_waiting <= tx->tx_open_txg &&
+ tx->tx_sync_txg_waiting <= tx->tx_synced_txg &&
+ tx->tx_quiesced_txg <= tx->tx_synced_txg) {
+ tx->tx_quiesce_txg_waiting = tx->tx_open_txg + 1;
+ cv_broadcast(&tx->tx_quiesce_more_cv);
+ }
+ mutex_exit(&tx->tx_sync_lock);
+}
+
+boolean_t
+txg_stalled(dsl_pool_t *dp)
+{
+ tx_state_t *tx = &dp->dp_tx;
+ return (tx->tx_quiesce_txg_waiting > tx->tx_open_txg);
+}
+
+boolean_t
+txg_sync_waiting(dsl_pool_t *dp)
+{
+ tx_state_t *tx = &dp->dp_tx;
+
+ return (tx->tx_syncing_txg <= tx->tx_sync_txg_waiting ||
+ tx->tx_quiesced_txg != 0);
+}
+
+/*
+ * Verify that this txg is active (open, quiescing, syncing). Non-active
+ * txgs should not be manipulated.
+ */
+#ifdef ZFS_DEBUG
+void
+txg_verify(spa_t *spa, uint64_t txg)
+{
+ dsl_pool_t *dp __maybe_unused = spa_get_dsl(spa);
+ if (txg <= TXG_INITIAL || txg == ZILTEST_TXG)
+ return;
+ ASSERT3U(txg, <=, dp->dp_tx.tx_open_txg);
+ ASSERT3U(txg, >=, dp->dp_tx.tx_synced_txg);
+ ASSERT3U(txg, >=, dp->dp_tx.tx_open_txg - TXG_CONCURRENT_STATES);
+}
+#endif
+
+/*
+ * Per-txg object lists.
+ */
+void
+txg_list_create(txg_list_t *tl, spa_t *spa, size_t offset)
+{
+ int t;
+
+ mutex_init(&tl->tl_lock, NULL, MUTEX_DEFAULT, NULL);
+
+ tl->tl_offset = offset;
+ tl->tl_spa = spa;
+
+ for (t = 0; t < TXG_SIZE; t++)
+ tl->tl_head[t] = NULL;
+}
+
+static boolean_t
+txg_list_empty_impl(txg_list_t *tl, uint64_t txg)
+{
+ ASSERT(MUTEX_HELD(&tl->tl_lock));
+ TXG_VERIFY(tl->tl_spa, txg);
+ return (tl->tl_head[txg & TXG_MASK] == NULL);
+}
+
+boolean_t
+txg_list_empty(txg_list_t *tl, uint64_t txg)
+{
+ mutex_enter(&tl->tl_lock);
+ boolean_t ret = txg_list_empty_impl(tl, txg);
+ mutex_exit(&tl->tl_lock);
+
+ return (ret);
+}
+
+void
+txg_list_destroy(txg_list_t *tl)
+{
+ int t;
+
+ mutex_enter(&tl->tl_lock);
+ for (t = 0; t < TXG_SIZE; t++)
+ ASSERT(txg_list_empty_impl(tl, t));
+ mutex_exit(&tl->tl_lock);
+
+ mutex_destroy(&tl->tl_lock);
+}
+
+/*
+ * Returns true if all txg lists are empty.
+ *
+ * Warning: this is inherently racy (an item could be added immediately
+ * after this function returns).
+ */
+boolean_t
+txg_all_lists_empty(txg_list_t *tl)
+{
+ mutex_enter(&tl->tl_lock);
+ for (int i = 0; i < TXG_SIZE; i++) {
+ if (!txg_list_empty_impl(tl, i)) {
+ mutex_exit(&tl->tl_lock);
+ return (B_FALSE);
+ }
+ }
+ mutex_exit(&tl->tl_lock);
+ return (B_TRUE);
+}
+
+/*
+ * Add an entry to the list (unless it's already on the list).
+ * Returns B_TRUE if it was actually added.
+ */
+boolean_t
+txg_list_add(txg_list_t *tl, void *p, uint64_t txg)
+{
+ int t = txg & TXG_MASK;
+ txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset);
+ boolean_t add;
+
+ TXG_VERIFY(tl->tl_spa, txg);
+ mutex_enter(&tl->tl_lock);
+ add = (tn->tn_member[t] == 0);
+ if (add) {
+ tn->tn_member[t] = 1;
+ tn->tn_next[t] = tl->tl_head[t];
+ tl->tl_head[t] = tn;
+ }
+ mutex_exit(&tl->tl_lock);
+
+ return (add);
+}
+
+/*
+ * Add an entry to the end of the list, unless it's already on the list.
+ * (walks list to find end)
+ * Returns B_TRUE if it was actually added.
+ */
+boolean_t
+txg_list_add_tail(txg_list_t *tl, void *p, uint64_t txg)
+{
+ int t = txg & TXG_MASK;
+ txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset);
+ boolean_t add;
+
+ TXG_VERIFY(tl->tl_spa, txg);
+ mutex_enter(&tl->tl_lock);
+ add = (tn->tn_member[t] == 0);
+ if (add) {
+ txg_node_t **tp;
+
+ for (tp = &tl->tl_head[t]; *tp != NULL; tp = &(*tp)->tn_next[t])
+ continue;
+
+ tn->tn_member[t] = 1;
+ tn->tn_next[t] = NULL;
+ *tp = tn;
+ }
+ mutex_exit(&tl->tl_lock);
+
+ return (add);
+}
+
+/*
+ * Remove the head of the list and return it.
+ */
+void *
+txg_list_remove(txg_list_t *tl, uint64_t txg)
+{
+ int t = txg & TXG_MASK;
+ txg_node_t *tn;
+ void *p = NULL;
+
+ TXG_VERIFY(tl->tl_spa, txg);
+ mutex_enter(&tl->tl_lock);
+ if ((tn = tl->tl_head[t]) != NULL) {
+ ASSERT(tn->tn_member[t]);
+ ASSERT(tn->tn_next[t] == NULL || tn->tn_next[t]->tn_member[t]);
+ p = (char *)tn - tl->tl_offset;
+ tl->tl_head[t] = tn->tn_next[t];
+ tn->tn_next[t] = NULL;
+ tn->tn_member[t] = 0;
+ }
+ mutex_exit(&tl->tl_lock);
+
+ return (p);
+}
+
+/*
+ * Remove a specific item from the list and return it.
+ */
+void *
+txg_list_remove_this(txg_list_t *tl, void *p, uint64_t txg)
+{
+ int t = txg & TXG_MASK;
+ txg_node_t *tn, **tp;
+
+ TXG_VERIFY(tl->tl_spa, txg);
+ mutex_enter(&tl->tl_lock);
+
+ for (tp = &tl->tl_head[t]; (tn = *tp) != NULL; tp = &tn->tn_next[t]) {
+ if ((char *)tn - tl->tl_offset == p) {
+ *tp = tn->tn_next[t];
+ tn->tn_next[t] = NULL;
+ tn->tn_member[t] = 0;
+ mutex_exit(&tl->tl_lock);
+ return (p);
+ }
+ }
+
+ mutex_exit(&tl->tl_lock);
+
+ return (NULL);
+}
+
+boolean_t
+txg_list_member(txg_list_t *tl, void *p, uint64_t txg)
+{
+ int t = txg & TXG_MASK;
+ txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset);
+
+ TXG_VERIFY(tl->tl_spa, txg);
+ return (tn->tn_member[t] != 0);
+}
+
+/*
+ * Walk a txg list
+ */
+void *
+txg_list_head(txg_list_t *tl, uint64_t txg)
+{
+ int t = txg & TXG_MASK;
+ txg_node_t *tn;
+
+ mutex_enter(&tl->tl_lock);
+ tn = tl->tl_head[t];
+ mutex_exit(&tl->tl_lock);
+
+ TXG_VERIFY(tl->tl_spa, txg);
+ return (tn == NULL ? NULL : (char *)tn - tl->tl_offset);
+}
+
+void *
+txg_list_next(txg_list_t *tl, void *p, uint64_t txg)
+{
+ int t = txg & TXG_MASK;
+ txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset);
+
+ TXG_VERIFY(tl->tl_spa, txg);
+
+ mutex_enter(&tl->tl_lock);
+ tn = tn->tn_next[t];
+ mutex_exit(&tl->tl_lock);
+
+ return (tn == NULL ? NULL : (char *)tn - tl->tl_offset);
+}
+
+EXPORT_SYMBOL(txg_init);
+EXPORT_SYMBOL(txg_fini);
+EXPORT_SYMBOL(txg_sync_start);
+EXPORT_SYMBOL(txg_sync_stop);
+EXPORT_SYMBOL(txg_hold_open);
+EXPORT_SYMBOL(txg_rele_to_quiesce);
+EXPORT_SYMBOL(txg_rele_to_sync);
+EXPORT_SYMBOL(txg_register_callbacks);
+EXPORT_SYMBOL(txg_delay);
+EXPORT_SYMBOL(txg_wait_synced);
+EXPORT_SYMBOL(txg_wait_open);
+EXPORT_SYMBOL(txg_wait_callbacks);
+EXPORT_SYMBOL(txg_stalled);
+EXPORT_SYMBOL(txg_sync_waiting);
+
+/* BEGIN CSTYLED */
+ZFS_MODULE_PARAM(zfs_txg, zfs_txg_, timeout, INT, ZMOD_RW,
+ "Max seconds worth of delta per txg");
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/zfs/uberblock.c b/sys/contrib/openzfs/module/zfs/uberblock.c
new file mode 100644
index 000000000000..b8857d74d810
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/uberblock.c
@@ -0,0 +1,74 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013, 2017 by Delphix. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/uberblock_impl.h>
+#include <sys/vdev_impl.h>
+#include <sys/mmp.h>
+
+int
+uberblock_verify(uberblock_t *ub)
+{
+ if (ub->ub_magic == BSWAP_64((uint64_t)UBERBLOCK_MAGIC))
+ byteswap_uint64_array(ub, sizeof (uberblock_t));
+
+ if (ub->ub_magic != UBERBLOCK_MAGIC)
+ return (SET_ERROR(EINVAL));
+
+ return (0);
+}
+
+/*
+ * Update the uberblock and return TRUE if anything changed in this
+ * transaction group.
+ */
+boolean_t
+uberblock_update(uberblock_t *ub, vdev_t *rvd, uint64_t txg, uint64_t mmp_delay)
+{
+ ASSERT(ub->ub_txg < txg);
+
+ /*
+ * We explicitly do not set ub_version here, so that older versions
+ * continue to be written with the previous uberblock version.
+ */
+ ub->ub_magic = UBERBLOCK_MAGIC;
+ ub->ub_txg = txg;
+ ub->ub_guid_sum = rvd->vdev_guid_sum;
+ ub->ub_timestamp = gethrestime_sec();
+ ub->ub_software_version = SPA_VERSION;
+ ub->ub_mmp_magic = MMP_MAGIC;
+ if (spa_multihost(rvd->vdev_spa)) {
+ ub->ub_mmp_delay = mmp_delay;
+ ub->ub_mmp_config = MMP_SEQ_SET(0) |
+ MMP_INTERVAL_SET(zfs_multihost_interval) |
+ MMP_FAIL_INT_SET(zfs_multihost_fail_intervals);
+ } else {
+ ub->ub_mmp_delay = 0;
+ ub->ub_mmp_config = 0;
+ }
+ ub->ub_checkpoint_txg = 0;
+
+ return (ub->ub_rootbp.blk_birth == txg);
+}
diff --git a/sys/contrib/openzfs/module/zfs/unique.c b/sys/contrib/openzfs/module/zfs/unique.c
new file mode 100644
index 000000000000..0e076797a002
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/unique.c
@@ -0,0 +1,112 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+
+
+#include <sys/zfs_context.h>
+#include <sys/avl.h>
+#include <sys/unique.h>
+
+static avl_tree_t unique_avl;
+static kmutex_t unique_mtx;
+
+typedef struct unique {
+ avl_node_t un_link;
+ uint64_t un_value;
+} unique_t;
+
+#define UNIQUE_MASK ((1ULL << UNIQUE_BITS) - 1)
+
+static int
+unique_compare(const void *a, const void *b)
+{
+ const unique_t *una = (const unique_t *)a;
+ const unique_t *unb = (const unique_t *)b;
+
+ return (TREE_CMP(una->un_value, unb->un_value));
+}
+
+void
+unique_init(void)
+{
+ avl_create(&unique_avl, unique_compare,
+ sizeof (unique_t), offsetof(unique_t, un_link));
+ mutex_init(&unique_mtx, NULL, MUTEX_DEFAULT, NULL);
+}
+
+void
+unique_fini(void)
+{
+ avl_destroy(&unique_avl);
+ mutex_destroy(&unique_mtx);
+}
+
+uint64_t
+unique_create(void)
+{
+ uint64_t value = unique_insert(0);
+ unique_remove(value);
+ return (value);
+}
+
+uint64_t
+unique_insert(uint64_t value)
+{
+ avl_index_t idx;
+ unique_t *un = kmem_alloc(sizeof (unique_t), KM_SLEEP);
+
+ un->un_value = value;
+
+ mutex_enter(&unique_mtx);
+ while (un->un_value == 0 || un->un_value & ~UNIQUE_MASK ||
+ avl_find(&unique_avl, un, &idx)) {
+ mutex_exit(&unique_mtx);
+ (void) random_get_pseudo_bytes((void*)&un->un_value,
+ sizeof (un->un_value));
+ un->un_value &= UNIQUE_MASK;
+ mutex_enter(&unique_mtx);
+ }
+
+ avl_insert(&unique_avl, un, idx);
+ mutex_exit(&unique_mtx);
+
+ return (un->un_value);
+}
+
+void
+unique_remove(uint64_t value)
+{
+ unique_t un_tofind;
+ unique_t *un;
+
+ un_tofind.un_value = value;
+ mutex_enter(&unique_mtx);
+ un = avl_find(&unique_avl, &un_tofind, NULL);
+ if (un != NULL) {
+ avl_remove(&unique_avl, un);
+ kmem_free(un, sizeof (unique_t));
+ }
+ mutex_exit(&unique_mtx);
+}
diff --git a/sys/contrib/openzfs/module/zfs/vdev.c b/sys/contrib/openzfs/module/zfs/vdev.c
new file mode 100644
index 000000000000..36001e0a6626
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/vdev.c
@@ -0,0 +1,5420 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2020 by Delphix. All rights reserved.
+ * Copyright 2017 Nexenta Systems, Inc.
+ * Copyright (c) 2014 Integros [integros.com]
+ * Copyright 2016 Toomas Soome <tsoome@me.com>
+ * Copyright 2017 Joyent, Inc.
+ * Copyright (c) 2017, Intel Corporation.
+ * Copyright (c) 2019, Datto Inc. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/fm/fs/zfs.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/bpobj.h>
+#include <sys/dmu.h>
+#include <sys/dmu_tx.h>
+#include <sys/dsl_dir.h>
+#include <sys/vdev_impl.h>
+#include <sys/vdev_rebuild.h>
+#include <sys/vdev_draid.h>
+#include <sys/uberblock_impl.h>
+#include <sys/metaslab.h>
+#include <sys/metaslab_impl.h>
+#include <sys/space_map.h>
+#include <sys/space_reftree.h>
+#include <sys/zio.h>
+#include <sys/zap.h>
+#include <sys/fs/zfs.h>
+#include <sys/arc.h>
+#include <sys/zil.h>
+#include <sys/dsl_scan.h>
+#include <sys/vdev_raidz.h>
+#include <sys/abd.h>
+#include <sys/vdev_initialize.h>
+#include <sys/vdev_trim.h>
+#include <sys/zvol.h>
+#include <sys/zfs_ratelimit.h>
+
+/*
+ * One metaslab from each (normal-class) vdev is used by the ZIL. These are
+ * called "embedded slog metaslabs", are referenced by vdev_log_mg, and are
+ * part of the spa_embedded_log_class. The metaslab with the most free space
+ * in each vdev is selected for this purpose when the pool is opened (or a
+ * vdev is added). See vdev_metaslab_init().
+ *
+ * Log blocks can be allocated from the following locations. Each one is tried
+ * in order until the allocation succeeds:
+ * 1. dedicated log vdevs, aka "slog" (spa_log_class)
+ * 2. embedded slog metaslabs (spa_embedded_log_class)
+ * 3. other metaslabs in normal vdevs (spa_normal_class)
+ *
+ * zfs_embedded_slog_min_ms disables the embedded slog if there are fewer
+ * than this number of metaslabs in the vdev. This ensures that we don't set
+ * aside an unreasonable amount of space for the ZIL. If set to less than
+ * 1 << (spa_slop_shift + 1), on small pools the usable space may be reduced
+ * (by more than 1<<spa_slop_shift) due to the embedded slog metaslab.
+ */
+int zfs_embedded_slog_min_ms = 64;
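+
+/*
+ * Illustrative sketch of the fallback order above (hypothetical; the
+ * real selection is made by the ZIL block allocator, zio_alloc_zil()):
+ *
+ *   metaslab_class_t *mc = spa_log_class(spa);          (1. slog)
+ *   if the allocation fails:
+ *           mc = spa_embedded_log_class(spa);           (2. embedded slog)
+ *   if the allocation fails:
+ *           mc = spa_normal_class(spa);                 (3. normal)
+ */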
+
+/* default target for number of metaslabs per top-level vdev */
+int zfs_vdev_default_ms_count = 200;
+
+/* minimum number of metaslabs per top-level vdev */
+int zfs_vdev_min_ms_count = 16;
+
+/* practical upper limit of total metaslabs per top-level vdev */
+int zfs_vdev_ms_count_limit = 1ULL << 17;
+
+/* lower limit for metaslab size (512M) */
+int zfs_vdev_default_ms_shift = 29;
+
+/* upper limit for metaslab size (16G) */
+int zfs_vdev_max_ms_shift = 34;
+
+int vdev_validate_skip = B_FALSE;
+
+/*
+ * Since the DTL space map of a vdev is not expected to have a lot of
+ * entries, we default its block size to 4K.
+ */
+int zfs_vdev_dtl_sm_blksz = (1 << 12);
+
+/*
+ * Rate limit slow IO (delay) events to this many per second.
+ */
+unsigned int zfs_slow_io_events_per_second = 20;
+
+/*
+ * Rate limit checksum events after this many checksum errors per second.
+ */
+unsigned int zfs_checksum_events_per_second = 20;
+
+/*
+ * Ignore errors during scrub/resilver. Allows a resilver that is triggered
+ * upon import to make progress when there are pool errors.
+ */
+int zfs_scan_ignore_errors = 0;
+
+/*
+ * vdev-wide space maps that have lots of entries written to them at
+ * the end of each transaction can benefit from a higher I/O bandwidth
+ * (e.g. vdev_obsolete_sm), thus we default their block size to 128K.
+ */
+int zfs_vdev_standard_sm_blksz = (1 << 17);
+
+/*
+ * Tunable parameter for debugging or performance analysis. Setting this
+ * will cause pool corruption on power loss if a volatile out-of-order
+ * write cache is enabled.
+ */
+int zfs_nocacheflush = 0;
+
+uint64_t zfs_vdev_max_auto_ashift = ASHIFT_MAX;
+uint64_t zfs_vdev_min_auto_ashift = ASHIFT_MIN;
+
+/*PRINTFLIKE2*/
+void
+vdev_dbgmsg(vdev_t *vd, const char *fmt, ...)
+{
+ va_list adx;
+ char buf[256];
+
+ va_start(adx, fmt);
+ (void) vsnprintf(buf, sizeof (buf), fmt, adx);
+ va_end(adx);
+
+ if (vd->vdev_path != NULL) {
+ zfs_dbgmsg("%s vdev '%s': %s", vd->vdev_ops->vdev_op_type,
+ vd->vdev_path, buf);
+ } else {
+ zfs_dbgmsg("%s-%llu vdev (guid %llu): %s",
+ vd->vdev_ops->vdev_op_type,
+ (u_longlong_t)vd->vdev_id,
+ (u_longlong_t)vd->vdev_guid, buf);
+ }
+}
+
+void
+vdev_dbgmsg_print_tree(vdev_t *vd, int indent)
+{
+ char state[20];
+
+ if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops) {
+ zfs_dbgmsg("%*svdev %u: %s", indent, "", vd->vdev_id,
+ vd->vdev_ops->vdev_op_type);
+ return;
+ }
+
+ switch (vd->vdev_state) {
+ case VDEV_STATE_UNKNOWN:
+ (void) snprintf(state, sizeof (state), "unknown");
+ break;
+ case VDEV_STATE_CLOSED:
+ (void) snprintf(state, sizeof (state), "closed");
+ break;
+ case VDEV_STATE_OFFLINE:
+ (void) snprintf(state, sizeof (state), "offline");
+ break;
+ case VDEV_STATE_REMOVED:
+ (void) snprintf(state, sizeof (state), "removed");
+ break;
+ case VDEV_STATE_CANT_OPEN:
+ (void) snprintf(state, sizeof (state), "can't open");
+ break;
+ case VDEV_STATE_FAULTED:
+ (void) snprintf(state, sizeof (state), "faulted");
+ break;
+ case VDEV_STATE_DEGRADED:
+ (void) snprintf(state, sizeof (state), "degraded");
+ break;
+ case VDEV_STATE_HEALTHY:
+ (void) snprintf(state, sizeof (state), "healthy");
+ break;
+ default:
+ (void) snprintf(state, sizeof (state), "<state %u>",
+ (uint_t)vd->vdev_state);
+ }
+
+ zfs_dbgmsg("%*svdev %u: %s%s, guid: %llu, path: %s, %s", indent,
+ "", (int)vd->vdev_id, vd->vdev_ops->vdev_op_type,
+ vd->vdev_islog ? " (log)" : "",
+ (u_longlong_t)vd->vdev_guid,
+ vd->vdev_path ? vd->vdev_path : "N/A", state);
+
+ for (uint64_t i = 0; i < vd->vdev_children; i++)
+ vdev_dbgmsg_print_tree(vd->vdev_child[i], indent + 2);
+}
+
+/*
+ * Virtual device management.
+ */
+
+static vdev_ops_t *vdev_ops_table[] = {
+ &vdev_root_ops,
+ &vdev_raidz_ops,
+ &vdev_draid_ops,
+ &vdev_draid_spare_ops,
+ &vdev_mirror_ops,
+ &vdev_replacing_ops,
+ &vdev_spare_ops,
+ &vdev_disk_ops,
+ &vdev_file_ops,
+ &vdev_missing_ops,
+ &vdev_hole_ops,
+ &vdev_indirect_ops,
+ NULL
+};
+
+/*
+ * Given a vdev type, return the appropriate ops vector.
+ */
+static vdev_ops_t *
+vdev_getops(const char *type)
+{
+ vdev_ops_t *ops, **opspp;
+
+ for (opspp = vdev_ops_table; (ops = *opspp) != NULL; opspp++)
+ if (strcmp(ops->vdev_op_type, type) == 0)
+ break;
+
+ return (ops);
+}
+
+/*
+ * Given a vdev and a metaslab class, find which metaslab group we're
+ * interested in. A vdev may belong to two different metaslab classes.
+ * Dedicated slog devices use only the primary metaslab group, rather than a
+ * separate log group. For embedded slogs, the vdev_log_mg will be non-NULL.
+ */
+metaslab_group_t *
+vdev_get_mg(vdev_t *vd, metaslab_class_t *mc)
+{
+ if (mc == spa_embedded_log_class(vd->vdev_spa) &&
+ vd->vdev_log_mg != NULL)
+ return (vd->vdev_log_mg);
+ else
+ return (vd->vdev_mg);
+}
+
+/* ARGSUSED */
+void
+vdev_default_xlate(vdev_t *vd, const range_seg64_t *logical_rs,
+ range_seg64_t *physical_rs, range_seg64_t *remain_rs)
+{
+ physical_rs->rs_start = logical_rs->rs_start;
+ physical_rs->rs_end = logical_rs->rs_end;
+}
+
+/*
+ * Derive the enumerated allocation bias from string input.
+ * String origin is either the per-vdev zap or zpool(8).
+ */
+static vdev_alloc_bias_t
+vdev_derive_alloc_bias(const char *bias)
+{
+ vdev_alloc_bias_t alloc_bias = VDEV_BIAS_NONE;
+
+ if (strcmp(bias, VDEV_ALLOC_BIAS_LOG) == 0)
+ alloc_bias = VDEV_BIAS_LOG;
+ else if (strcmp(bias, VDEV_ALLOC_BIAS_SPECIAL) == 0)
+ alloc_bias = VDEV_BIAS_SPECIAL;
+ else if (strcmp(bias, VDEV_ALLOC_BIAS_DEDUP) == 0)
+ alloc_bias = VDEV_BIAS_DEDUP;
+
+ return (alloc_bias);
+}
+
+/*
+ * Default asize function: return the MAX of psize with the asize of
+ * all children. This is what's used by anything other than RAID-Z.
+ */
+uint64_t
+vdev_default_asize(vdev_t *vd, uint64_t psize)
+{
+ uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_top->vdev_ashift);
+ uint64_t csize;
+
+ for (int c = 0; c < vd->vdev_children; c++) {
+ csize = vdev_psize_to_asize(vd->vdev_child[c], psize);
+ asize = MAX(asize, csize);
+ }
+
+ return (asize);
+}
+
+uint64_t
+vdev_default_min_asize(vdev_t *vd)
+{
+ return (vd->vdev_min_asize);
+}
+
+/*
+ * Get the minimum allocatable size. We define the allocatable size as
+ * the vdev's asize rounded to the nearest metaslab. This allows us to
+ * replace or attach devices which don't have the same physical size but
+ * can still satisfy the same number of allocations.
+ */
+uint64_t
+vdev_get_min_asize(vdev_t *vd)
+{
+ vdev_t *pvd = vd->vdev_parent;
+
+ /*
+ * If our parent is NULL (inactive spare or cache) or is the root,
+ * just return our own asize.
+ */
+ if (pvd == NULL)
+ return (vd->vdev_asize);
+
+ /*
+ * The top-level vdev just returns the allocatable size rounded
+ * to the nearest metaslab.
+ */
+ if (vd == vd->vdev_top)
+ return (P2ALIGN(vd->vdev_asize, 1ULL << vd->vdev_ms_shift));
+
+ return (pvd->vdev_ops->vdev_op_min_asize(pvd));
+}
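+
+/*
+ * Example with hypothetical numbers: a top-level vdev with a
+ * vdev_ms_shift of 30 (1 GiB metaslabs) and an asize of 931.5 GiB has
+ * a minimum allocatable size of P2ALIGN(931.5 GiB, 1 GiB) = 931 GiB,
+ * so a replacement device only needs to cover 931 full metaslabs
+ * rather than match the original size exactly.
+ */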
+
+void
+vdev_set_min_asize(vdev_t *vd)
+{
+ vd->vdev_min_asize = vdev_get_min_asize(vd);
+
+ for (int c = 0; c < vd->vdev_children; c++)
+ vdev_set_min_asize(vd->vdev_child[c]);
+}
+
+/*
+ * Get the minimal allocation size for the top-level vdev.
+ */
+uint64_t
+vdev_get_min_alloc(vdev_t *vd)
+{
+ uint64_t min_alloc = 1ULL << vd->vdev_ashift;
+
+ if (vd->vdev_ops->vdev_op_min_alloc != NULL)
+ min_alloc = vd->vdev_ops->vdev_op_min_alloc(vd);
+
+ return (min_alloc);
+}
+
+/*
+ * Get the parity level for a top-level vdev.
+ */
+uint64_t
+vdev_get_nparity(vdev_t *vd)
+{
+ uint64_t nparity = 0;
+
+ if (vd->vdev_ops->vdev_op_nparity != NULL)
+ nparity = vd->vdev_ops->vdev_op_nparity(vd);
+
+ return (nparity);
+}
+
+/*
+ * Get the number of data disks for a top-level vdev.
+ */
+uint64_t
+vdev_get_ndisks(vdev_t *vd)
+{
+ uint64_t ndisks = 1;
+
+ if (vd->vdev_ops->vdev_op_ndisks != NULL)
+ ndisks = vd->vdev_ops->vdev_op_ndisks(vd);
+
+ return (ndisks);
+}
+
+vdev_t *
+vdev_lookup_top(spa_t *spa, uint64_t vdev)
+{
+ vdev_t *rvd = spa->spa_root_vdev;
+
+ ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
+
+ if (vdev < rvd->vdev_children) {
+ ASSERT(rvd->vdev_child[vdev] != NULL);
+ return (rvd->vdev_child[vdev]);
+ }
+
+ return (NULL);
+}
+
+vdev_t *
+vdev_lookup_by_guid(vdev_t *vd, uint64_t guid)
+{
+ vdev_t *mvd;
+
+ if (vd->vdev_guid == guid)
+ return (vd);
+
+ for (int c = 0; c < vd->vdev_children; c++)
+ if ((mvd = vdev_lookup_by_guid(vd->vdev_child[c], guid)) !=
+ NULL)
+ return (mvd);
+
+ return (NULL);
+}
+
+static int
+vdev_count_leaves_impl(vdev_t *vd)
+{
+ int n = 0;
+
+ if (vd->vdev_ops->vdev_op_leaf)
+ return (1);
+
+ for (int c = 0; c < vd->vdev_children; c++)
+ n += vdev_count_leaves_impl(vd->vdev_child[c]);
+
+ return (n);
+}
+
+int
+vdev_count_leaves(spa_t *spa)
+{
+ int rc;
+
+ spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
+ rc = vdev_count_leaves_impl(spa->spa_root_vdev);
+ spa_config_exit(spa, SCL_VDEV, FTAG);
+
+ return (rc);
+}
+
+void
+vdev_add_child(vdev_t *pvd, vdev_t *cvd)
+{
+ size_t oldsize, newsize;
+ uint64_t id = cvd->vdev_id;
+ vdev_t **newchild;
+
+ ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);
+ ASSERT(cvd->vdev_parent == NULL);
+
+ cvd->vdev_parent = pvd;
+
+ if (pvd == NULL)
+ return;
+
+ ASSERT(id >= pvd->vdev_children || pvd->vdev_child[id] == NULL);
+
+ oldsize = pvd->vdev_children * sizeof (vdev_t *);
+ pvd->vdev_children = MAX(pvd->vdev_children, id + 1);
+ newsize = pvd->vdev_children * sizeof (vdev_t *);
+
+ newchild = kmem_alloc(newsize, KM_SLEEP);
+ if (pvd->vdev_child != NULL) {
+ bcopy(pvd->vdev_child, newchild, oldsize);
+ kmem_free(pvd->vdev_child, oldsize);
+ }
+
+ pvd->vdev_child = newchild;
+ pvd->vdev_child[id] = cvd;
+
+ cvd->vdev_top = (pvd->vdev_top ? pvd->vdev_top: cvd);
+ ASSERT(cvd->vdev_top->vdev_parent->vdev_parent == NULL);
+
+ /*
+ * Walk up all ancestors to update guid sum.
+ */
+ for (; pvd != NULL; pvd = pvd->vdev_parent)
+ pvd->vdev_guid_sum += cvd->vdev_guid_sum;
+
+ if (cvd->vdev_ops->vdev_op_leaf) {
+ list_insert_head(&cvd->vdev_spa->spa_leaf_list, cvd);
+ cvd->vdev_spa->spa_leaf_list_gen++;
+ }
+}
+
+void
+vdev_remove_child(vdev_t *pvd, vdev_t *cvd)
+{
+ int c;
+ uint_t id = cvd->vdev_id;
+
+ ASSERT(cvd->vdev_parent == pvd);
+
+ if (pvd == NULL)
+ return;
+
+ ASSERT(id < pvd->vdev_children);
+ ASSERT(pvd->vdev_child[id] == cvd);
+
+ pvd->vdev_child[id] = NULL;
+ cvd->vdev_parent = NULL;
+
+ for (c = 0; c < pvd->vdev_children; c++)
+ if (pvd->vdev_child[c])
+ break;
+
+ if (c == pvd->vdev_children) {
+ kmem_free(pvd->vdev_child, c * sizeof (vdev_t *));
+ pvd->vdev_child = NULL;
+ pvd->vdev_children = 0;
+ }
+
+ if (cvd->vdev_ops->vdev_op_leaf) {
+ spa_t *spa = cvd->vdev_spa;
+ list_remove(&spa->spa_leaf_list, cvd);
+ spa->spa_leaf_list_gen++;
+ }
+
+ /*
+ * Walk up all ancestors to update guid sum.
+ */
+ for (; pvd != NULL; pvd = pvd->vdev_parent)
+ pvd->vdev_guid_sum -= cvd->vdev_guid_sum;
+}
+
+/*
+ * Remove any holes in the child array.
+ */
+void
+vdev_compact_children(vdev_t *pvd)
+{
+ vdev_t **newchild, *cvd;
+ int oldc = pvd->vdev_children;
+ int newc;
+
+ ASSERT(spa_config_held(pvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);
+
+ if (oldc == 0)
+ return;
+
+ for (int c = newc = 0; c < oldc; c++)
+ if (pvd->vdev_child[c])
+ newc++;
+
+ if (newc > 0) {
+ newchild = kmem_zalloc(newc * sizeof (vdev_t *), KM_SLEEP);
+
+ for (int c = newc = 0; c < oldc; c++) {
+ if ((cvd = pvd->vdev_child[c]) != NULL) {
+ newchild[newc] = cvd;
+ cvd->vdev_id = newc++;
+ }
+ }
+ } else {
+ newchild = NULL;
+ }
+
+ kmem_free(pvd->vdev_child, oldc * sizeof (vdev_t *));
+ pvd->vdev_child = newchild;
+ pvd->vdev_children = newc;
+}
+
+/*
+ * Allocate and minimally initialize a vdev_t.
+ */
+vdev_t *
+vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
+{
+ vdev_t *vd;
+ vdev_indirect_config_t *vic;
+
+ vd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP);
+ vic = &vd->vdev_indirect_config;
+
+ if (spa->spa_root_vdev == NULL) {
+ ASSERT(ops == &vdev_root_ops);
+ spa->spa_root_vdev = vd;
+ spa->spa_load_guid = spa_generate_guid(NULL);
+ }
+
+ if (guid == 0 && ops != &vdev_hole_ops) {
+ if (spa->spa_root_vdev == vd) {
+ /*
+ * The root vdev's guid will also be the pool guid,
+ * which must be unique among all pools.
+ */
+ guid = spa_generate_guid(NULL);
+ } else {
+ /*
+ * Any other vdev's guid must be unique within the pool.
+ */
+ guid = spa_generate_guid(spa);
+ }
+ ASSERT(!spa_guid_exists(spa_guid(spa), guid));
+ }
+
+ vd->vdev_spa = spa;
+ vd->vdev_id = id;
+ vd->vdev_guid = guid;
+ vd->vdev_guid_sum = guid;
+ vd->vdev_ops = ops;
+ vd->vdev_state = VDEV_STATE_CLOSED;
+ vd->vdev_ishole = (ops == &vdev_hole_ops);
+ vic->vic_prev_indirect_vdev = UINT64_MAX;
+
+ rw_init(&vd->vdev_indirect_rwlock, NULL, RW_DEFAULT, NULL);
+ mutex_init(&vd->vdev_obsolete_lock, NULL, MUTEX_DEFAULT, NULL);
+ vd->vdev_obsolete_segments = range_tree_create(NULL, RANGE_SEG64, NULL,
+ 0, 0);
+
+ /*
+ * Initialize rate limit structs for events. We rate limit ZIO delay
+ * and checksum events so that we don't overwhelm ZED with thousands
+ * of events when a disk is acting up.
+ */
+ zfs_ratelimit_init(&vd->vdev_delay_rl, &zfs_slow_io_events_per_second,
+ 1);
+ zfs_ratelimit_init(&vd->vdev_checksum_rl,
+ &zfs_checksum_events_per_second, 1);
+
+ list_link_init(&vd->vdev_config_dirty_node);
+ list_link_init(&vd->vdev_state_dirty_node);
+ list_link_init(&vd->vdev_initialize_node);
+ list_link_init(&vd->vdev_leaf_node);
+ list_link_init(&vd->vdev_trim_node);
+
+ mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_NOLOCKDEP, NULL);
+ mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&vd->vdev_scan_io_queue_lock, NULL, MUTEX_DEFAULT, NULL);
+
+ mutex_init(&vd->vdev_initialize_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&vd->vdev_initialize_io_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&vd->vdev_initialize_cv, NULL, CV_DEFAULT, NULL);
+ cv_init(&vd->vdev_initialize_io_cv, NULL, CV_DEFAULT, NULL);
+
+ mutex_init(&vd->vdev_trim_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&vd->vdev_autotrim_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&vd->vdev_trim_io_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&vd->vdev_trim_cv, NULL, CV_DEFAULT, NULL);
+ cv_init(&vd->vdev_autotrim_cv, NULL, CV_DEFAULT, NULL);
+ cv_init(&vd->vdev_trim_io_cv, NULL, CV_DEFAULT, NULL);
+
+ mutex_init(&vd->vdev_rebuild_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&vd->vdev_rebuild_cv, NULL, CV_DEFAULT, NULL);
+
+ for (int t = 0; t < DTL_TYPES; t++) {
+ vd->vdev_dtl[t] = range_tree_create(NULL, RANGE_SEG64, NULL, 0,
+ 0);
+ }
+
+ txg_list_create(&vd->vdev_ms_list, spa,
+ offsetof(struct metaslab, ms_txg_node));
+ txg_list_create(&vd->vdev_dtl_list, spa,
+ offsetof(struct vdev, vdev_dtl_node));
+ vd->vdev_stat.vs_timestamp = gethrtime();
+ vdev_queue_init(vd);
+ vdev_cache_init(vd);
+
+ return (vd);
+}
+
+/*
+ * Allocate a new vdev. The 'alloctype' is used to control whether we are
+ * creating a new vdev or loading an existing one - the behavior is slightly
+ * different for each case.
+ */
+int
+vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
+ int alloctype)
+{
+ vdev_ops_t *ops;
+ char *type;
+ uint64_t guid = 0, islog;
+ vdev_t *vd;
+ vdev_indirect_config_t *vic;
+ char *tmp = NULL;
+ int rc;
+ vdev_alloc_bias_t alloc_bias = VDEV_BIAS_NONE;
+ boolean_t top_level = (parent && !parent->vdev_parent);
+
+ ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
+
+ if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0)
+ return (SET_ERROR(EINVAL));
+
+ if ((ops = vdev_getops(type)) == NULL)
+ return (SET_ERROR(EINVAL));
+
+ /*
+ * If this is a load, get the vdev guid from the nvlist.
+ * Otherwise, vdev_alloc_common() will generate one for us.
+ */
+ if (alloctype == VDEV_ALLOC_LOAD) {
+ uint64_t label_id;
+
+ if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, &label_id) ||
+ label_id != id)
+ return (SET_ERROR(EINVAL));
+
+ if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
+ return (SET_ERROR(EINVAL));
+ } else if (alloctype == VDEV_ALLOC_SPARE) {
+ if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
+ return (SET_ERROR(EINVAL));
+ } else if (alloctype == VDEV_ALLOC_L2CACHE) {
+ if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
+ return (SET_ERROR(EINVAL));
+ } else if (alloctype == VDEV_ALLOC_ROOTPOOL) {
+ if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
+ return (SET_ERROR(EINVAL));
+ }
+
+ /*
+ * The first allocated vdev must be of type 'root'.
+ */
+ if (ops != &vdev_root_ops && spa->spa_root_vdev == NULL)
+ return (SET_ERROR(EINVAL));
+
+ /*
+ * Determine whether we're a log vdev.
+ */
+ islog = 0;
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &islog);
+ if (islog && spa_version(spa) < SPA_VERSION_SLOGS)
+ return (SET_ERROR(ENOTSUP));
+
+ if (ops == &vdev_hole_ops && spa_version(spa) < SPA_VERSION_HOLES)
+ return (SET_ERROR(ENOTSUP));
+
+ if (top_level && alloctype == VDEV_ALLOC_ADD) {
+ char *bias;
+
+ /*
+ * If creating a top-level vdev, check for allocation
+ * classes input.
+ */
+ if (nvlist_lookup_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS,
+ &bias) == 0) {
+ alloc_bias = vdev_derive_alloc_bias(bias);
+
+ /* spa_vdev_add() expects feature to be enabled */
+ if (spa->spa_load_state != SPA_LOAD_CREATE &&
+ !spa_feature_is_enabled(spa,
+ SPA_FEATURE_ALLOCATION_CLASSES)) {
+ return (SET_ERROR(ENOTSUP));
+ }
+ }
+
+ /* spa_vdev_add() expects feature to be enabled */
+ if (ops == &vdev_draid_ops &&
+ spa->spa_load_state != SPA_LOAD_CREATE &&
+ !spa_feature_is_enabled(spa, SPA_FEATURE_DRAID)) {
+ return (SET_ERROR(ENOTSUP));
+ }
+ }
+
+ /*
+ * Initialize the vdev specific data. This is done before calling
+ * vdev_alloc_common() since it may fail and this simplifies the
+ * error reporting and cleanup code paths.
+ */
+ void *tsd = NULL;
+ if (ops->vdev_op_init != NULL) {
+ rc = ops->vdev_op_init(spa, nv, &tsd);
+ if (rc != 0) {
+ return (rc);
+ }
+ }
+
+ vd = vdev_alloc_common(spa, id, guid, ops);
+ vd->vdev_tsd = tsd;
+ vd->vdev_islog = islog;
+
+ if (top_level && alloc_bias != VDEV_BIAS_NONE)
+ vd->vdev_alloc_bias = alloc_bias;
+
+ if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &vd->vdev_path) == 0)
+ vd->vdev_path = spa_strdup(vd->vdev_path);
+
+ /*
+ * ZPOOL_CONFIG_AUX_STATE = "external" means we previously forced a
+ * fault on a vdev and want it to persist across imports (like with
+ * zpool offline -f).
+ */
+ rc = nvlist_lookup_string(nv, ZPOOL_CONFIG_AUX_STATE, &tmp);
+ if (rc == 0 && tmp != NULL && strcmp(tmp, "external") == 0) {
+ vd->vdev_stat.vs_aux = VDEV_AUX_EXTERNAL;
+ vd->vdev_faulted = 1;
+ vd->vdev_label_aux = VDEV_AUX_EXTERNAL;
+ }
+
+ if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &vd->vdev_devid) == 0)
+ vd->vdev_devid = spa_strdup(vd->vdev_devid);
+ if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PHYS_PATH,
+ &vd->vdev_physpath) == 0)
+ vd->vdev_physpath = spa_strdup(vd->vdev_physpath);
+
+ if (nvlist_lookup_string(nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH,
+ &vd->vdev_enc_sysfs_path) == 0)
+ vd->vdev_enc_sysfs_path = spa_strdup(vd->vdev_enc_sysfs_path);
+
+ if (nvlist_lookup_string(nv, ZPOOL_CONFIG_FRU, &vd->vdev_fru) == 0)
+ vd->vdev_fru = spa_strdup(vd->vdev_fru);
+
+ /*
+ * Set the whole_disk property. If it's not specified, leave the value
+ * as -1.
+ */
+ if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
+ &vd->vdev_wholedisk) != 0)
+ vd->vdev_wholedisk = -1ULL;
+
+ vic = &vd->vdev_indirect_config;
+
+ ASSERT0(vic->vic_mapping_object);
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_INDIRECT_OBJECT,
+ &vic->vic_mapping_object);
+ ASSERT0(vic->vic_births_object);
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_INDIRECT_BIRTHS,
+ &vic->vic_births_object);
+ ASSERT3U(vic->vic_prev_indirect_vdev, ==, UINT64_MAX);
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_PREV_INDIRECT_VDEV,
+ &vic->vic_prev_indirect_vdev);
+
+ /*
+ * Look for the 'not present' flag. This will only be set if the device
+ * was not present at the time of import.
+ */
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT,
+ &vd->vdev_not_present);
+
+ /*
+ * Get the alignment requirement.
+ */
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, &vd->vdev_ashift);
+
+ /*
+ * Retrieve the vdev creation time.
+ */
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_CREATE_TXG,
+ &vd->vdev_crtxg);
+
+ /*
+ * If we're a top-level vdev, try to load the allocation parameters.
+ */
+ if (top_level &&
+ (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) {
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY,
+ &vd->vdev_ms_array);
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT,
+ &vd->vdev_ms_shift);
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE,
+ &vd->vdev_asize);
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVING,
+ &vd->vdev_removing);
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_VDEV_TOP_ZAP,
+ &vd->vdev_top_zap);
+ } else {
+ ASSERT0(vd->vdev_top_zap);
+ }
+
+ if (top_level && alloctype != VDEV_ALLOC_ATTACH) {
+ ASSERT(alloctype == VDEV_ALLOC_LOAD ||
+ alloctype == VDEV_ALLOC_ADD ||
+ alloctype == VDEV_ALLOC_SPLIT ||
+ alloctype == VDEV_ALLOC_ROOTPOOL);
+ /* Note: metaslab_group_create() is now deferred */
+ }
+
+ if (vd->vdev_ops->vdev_op_leaf &&
+ (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) {
+ (void) nvlist_lookup_uint64(nv,
+ ZPOOL_CONFIG_VDEV_LEAF_ZAP, &vd->vdev_leaf_zap);
+ } else {
+ ASSERT0(vd->vdev_leaf_zap);
+ }
+
+ /*
+ * If we're a leaf vdev, try to load the DTL object and other state.
+ */
+
+ if (vd->vdev_ops->vdev_op_leaf &&
+ (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_L2CACHE ||
+ alloctype == VDEV_ALLOC_ROOTPOOL)) {
+ if (alloctype == VDEV_ALLOC_LOAD) {
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL,
+ &vd->vdev_dtl_object);
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_UNSPARE,
+ &vd->vdev_unspare);
+ }
+
+ if (alloctype == VDEV_ALLOC_ROOTPOOL) {
+ uint64_t spare = 0;
+
+ if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_SPARE,
+ &spare) == 0 && spare)
+ spa_spare_add(vd);
+ }
+
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE,
+ &vd->vdev_offline);
+
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_RESILVER_TXG,
+ &vd->vdev_resilver_txg);
+
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REBUILD_TXG,
+ &vd->vdev_rebuild_txg);
+
+ if (nvlist_exists(nv, ZPOOL_CONFIG_RESILVER_DEFER))
+ vdev_defer_resilver(vd);
+
+ /*
+ * In general, when importing a pool we want to ignore the
+ * persistent fault state, as the diagnosis made on another
+ * system may not be valid in the current context. The only
+ * exception is if we forced a vdev to a persistently faulted
+ * state with 'zpool offline -f'. The persistent fault will
+ * remain across imports until cleared.
+ *
+ * Local vdevs will remain in the faulted state.
+ */
+ if (spa_load_state(spa) == SPA_LOAD_OPEN ||
+ spa_load_state(spa) == SPA_LOAD_IMPORT) {
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED,
+ &vd->vdev_faulted);
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DEGRADED,
+ &vd->vdev_degraded);
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED,
+ &vd->vdev_removed);
+
+ if (vd->vdev_faulted || vd->vdev_degraded) {
+ char *aux;
+
+ vd->vdev_label_aux =
+ VDEV_AUX_ERR_EXCEEDED;
+ if (nvlist_lookup_string(nv,
+ ZPOOL_CONFIG_AUX_STATE, &aux) == 0 &&
+ strcmp(aux, "external") == 0)
+ vd->vdev_label_aux = VDEV_AUX_EXTERNAL;
+ else
+ vd->vdev_faulted = 0ULL;
+ }
+ }
+ }
+
+ /*
+ * Add ourselves to the parent's list of children.
+ */
+ vdev_add_child(parent, vd);
+
+ *vdp = vd;
+
+ return (0);
+}
+
+void
+vdev_free(vdev_t *vd)
+{
+ spa_t *spa = vd->vdev_spa;
+
+ ASSERT3P(vd->vdev_initialize_thread, ==, NULL);
+ ASSERT3P(vd->vdev_trim_thread, ==, NULL);
+ ASSERT3P(vd->vdev_autotrim_thread, ==, NULL);
+ ASSERT3P(vd->vdev_rebuild_thread, ==, NULL);
+
+ /*
+ * Scan queues are normally destroyed at the end of a scan. If the
+ * queue exists here, that implies the vdev is being removed while
+ * the scan is still running.
+ */
+ if (vd->vdev_scan_io_queue != NULL) {
+ mutex_enter(&vd->vdev_scan_io_queue_lock);
+ dsl_scan_io_queue_destroy(vd->vdev_scan_io_queue);
+ vd->vdev_scan_io_queue = NULL;
+ mutex_exit(&vd->vdev_scan_io_queue_lock);
+ }
+
+ /*
+ * vdev_free() implies closing the vdev first. This is simpler than
+ * trying to ensure complicated semantics for all callers.
+ */
+ vdev_close(vd);
+
+ ASSERT(!list_link_active(&vd->vdev_config_dirty_node));
+ ASSERT(!list_link_active(&vd->vdev_state_dirty_node));
+
+ /*
+ * Free all children.
+ */
+ for (int c = 0; c < vd->vdev_children; c++)
+ vdev_free(vd->vdev_child[c]);
+
+ ASSERT(vd->vdev_child == NULL);
+ ASSERT(vd->vdev_guid_sum == vd->vdev_guid);
+
+ if (vd->vdev_ops->vdev_op_fini != NULL)
+ vd->vdev_ops->vdev_op_fini(vd);
+
+ /*
+ * Discard allocation state.
+ */
+ if (vd->vdev_mg != NULL) {
+ vdev_metaslab_fini(vd);
+ metaslab_group_destroy(vd->vdev_mg);
+ vd->vdev_mg = NULL;
+ }
+ if (vd->vdev_log_mg != NULL) {
+ ASSERT0(vd->vdev_ms_count);
+ metaslab_group_destroy(vd->vdev_log_mg);
+ vd->vdev_log_mg = NULL;
+ }
+
+ ASSERT0(vd->vdev_stat.vs_space);
+ ASSERT0(vd->vdev_stat.vs_dspace);
+ ASSERT0(vd->vdev_stat.vs_alloc);
+
+ /*
+ * Remove this vdev from its parent's child list.
+ */
+ vdev_remove_child(vd->vdev_parent, vd);
+
+ ASSERT(vd->vdev_parent == NULL);
+ ASSERT(!list_link_active(&vd->vdev_leaf_node));
+
+ /*
+ * Clean up vdev structure.
+ */
+ vdev_queue_fini(vd);
+ vdev_cache_fini(vd);
+
+ if (vd->vdev_path)
+ spa_strfree(vd->vdev_path);
+ if (vd->vdev_devid)
+ spa_strfree(vd->vdev_devid);
+ if (vd->vdev_physpath)
+ spa_strfree(vd->vdev_physpath);
+
+ if (vd->vdev_enc_sysfs_path)
+ spa_strfree(vd->vdev_enc_sysfs_path);
+
+ if (vd->vdev_fru)
+ spa_strfree(vd->vdev_fru);
+
+ if (vd->vdev_isspare)
+ spa_spare_remove(vd);
+ if (vd->vdev_isl2cache)
+ spa_l2cache_remove(vd);
+
+ txg_list_destroy(&vd->vdev_ms_list);
+ txg_list_destroy(&vd->vdev_dtl_list);
+
+ mutex_enter(&vd->vdev_dtl_lock);
+ space_map_close(vd->vdev_dtl_sm);
+ for (int t = 0; t < DTL_TYPES; t++) {
+ range_tree_vacate(vd->vdev_dtl[t], NULL, NULL);
+ range_tree_destroy(vd->vdev_dtl[t]);
+ }
+ mutex_exit(&vd->vdev_dtl_lock);
+
+ EQUIV(vd->vdev_indirect_births != NULL,
+ vd->vdev_indirect_mapping != NULL);
+ if (vd->vdev_indirect_births != NULL) {
+ vdev_indirect_mapping_close(vd->vdev_indirect_mapping);
+ vdev_indirect_births_close(vd->vdev_indirect_births);
+ }
+
+ if (vd->vdev_obsolete_sm != NULL) {
+ ASSERT(vd->vdev_removing ||
+ vd->vdev_ops == &vdev_indirect_ops);
+ space_map_close(vd->vdev_obsolete_sm);
+ vd->vdev_obsolete_sm = NULL;
+ }
+ range_tree_destroy(vd->vdev_obsolete_segments);
+ rw_destroy(&vd->vdev_indirect_rwlock);
+ mutex_destroy(&vd->vdev_obsolete_lock);
+
+ mutex_destroy(&vd->vdev_dtl_lock);
+ mutex_destroy(&vd->vdev_stat_lock);
+ mutex_destroy(&vd->vdev_probe_lock);
+ mutex_destroy(&vd->vdev_scan_io_queue_lock);
+
+ mutex_destroy(&vd->vdev_initialize_lock);
+ mutex_destroy(&vd->vdev_initialize_io_lock);
+ cv_destroy(&vd->vdev_initialize_io_cv);
+ cv_destroy(&vd->vdev_initialize_cv);
+
+ mutex_destroy(&vd->vdev_trim_lock);
+ mutex_destroy(&vd->vdev_autotrim_lock);
+ mutex_destroy(&vd->vdev_trim_io_lock);
+ cv_destroy(&vd->vdev_trim_cv);
+ cv_destroy(&vd->vdev_autotrim_cv);
+ cv_destroy(&vd->vdev_trim_io_cv);
+
+ mutex_destroy(&vd->vdev_rebuild_lock);
+ cv_destroy(&vd->vdev_rebuild_cv);
+
+ zfs_ratelimit_fini(&vd->vdev_delay_rl);
+ zfs_ratelimit_fini(&vd->vdev_checksum_rl);
+
+ if (vd == spa->spa_root_vdev)
+ spa->spa_root_vdev = NULL;
+
+ kmem_free(vd, sizeof (vdev_t));
+}
+
+/*
+ * Transfer top-level vdev state from svd to tvd.
+ */
+static void
+vdev_top_transfer(vdev_t *svd, vdev_t *tvd)
+{
+ spa_t *spa = svd->vdev_spa;
+ metaslab_t *msp;
+ vdev_t *vd;
+ int t;
+
+ ASSERT(tvd == tvd->vdev_top);
+
+ tvd->vdev_pending_fastwrite = svd->vdev_pending_fastwrite;
+ tvd->vdev_ms_array = svd->vdev_ms_array;
+ tvd->vdev_ms_shift = svd->vdev_ms_shift;
+ tvd->vdev_ms_count = svd->vdev_ms_count;
+ tvd->vdev_top_zap = svd->vdev_top_zap;
+
+ svd->vdev_ms_array = 0;
+ svd->vdev_ms_shift = 0;
+ svd->vdev_ms_count = 0;
+ svd->vdev_top_zap = 0;
+
+ if (tvd->vdev_mg)
+ ASSERT3P(tvd->vdev_mg, ==, svd->vdev_mg);
+ if (tvd->vdev_log_mg)
+ ASSERT3P(tvd->vdev_log_mg, ==, svd->vdev_log_mg);
+ tvd->vdev_mg = svd->vdev_mg;
+ tvd->vdev_log_mg = svd->vdev_log_mg;
+ tvd->vdev_ms = svd->vdev_ms;
+
+ svd->vdev_mg = NULL;
+ svd->vdev_log_mg = NULL;
+ svd->vdev_ms = NULL;
+
+ if (tvd->vdev_mg != NULL)
+ tvd->vdev_mg->mg_vd = tvd;
+ if (tvd->vdev_log_mg != NULL)
+ tvd->vdev_log_mg->mg_vd = tvd;
+
+ tvd->vdev_checkpoint_sm = svd->vdev_checkpoint_sm;
+ svd->vdev_checkpoint_sm = NULL;
+
+ tvd->vdev_alloc_bias = svd->vdev_alloc_bias;
+ svd->vdev_alloc_bias = VDEV_BIAS_NONE;
+
+ tvd->vdev_stat.vs_alloc = svd->vdev_stat.vs_alloc;
+ tvd->vdev_stat.vs_space = svd->vdev_stat.vs_space;
+ tvd->vdev_stat.vs_dspace = svd->vdev_stat.vs_dspace;
+
+ svd->vdev_stat.vs_alloc = 0;
+ svd->vdev_stat.vs_space = 0;
+ svd->vdev_stat.vs_dspace = 0;
+
+ /*
+ * State which may be set on a top-level vdev that's in the
+ * process of being removed.
+ */
+ ASSERT0(tvd->vdev_indirect_config.vic_births_object);
+ ASSERT0(tvd->vdev_indirect_config.vic_mapping_object);
+ ASSERT3U(tvd->vdev_indirect_config.vic_prev_indirect_vdev, ==, -1ULL);
+ ASSERT3P(tvd->vdev_indirect_mapping, ==, NULL);
+ ASSERT3P(tvd->vdev_indirect_births, ==, NULL);
+ ASSERT3P(tvd->vdev_obsolete_sm, ==, NULL);
+ ASSERT0(tvd->vdev_removing);
+ ASSERT0(tvd->vdev_rebuilding);
+ tvd->vdev_removing = svd->vdev_removing;
+ tvd->vdev_rebuilding = svd->vdev_rebuilding;
+ tvd->vdev_rebuild_config = svd->vdev_rebuild_config;
+ tvd->vdev_indirect_config = svd->vdev_indirect_config;
+ tvd->vdev_indirect_mapping = svd->vdev_indirect_mapping;
+ tvd->vdev_indirect_births = svd->vdev_indirect_births;
+ range_tree_swap(&svd->vdev_obsolete_segments,
+ &tvd->vdev_obsolete_segments);
+ tvd->vdev_obsolete_sm = svd->vdev_obsolete_sm;
+ svd->vdev_indirect_config.vic_mapping_object = 0;
+ svd->vdev_indirect_config.vic_births_object = 0;
+ svd->vdev_indirect_config.vic_prev_indirect_vdev = -1ULL;
+ svd->vdev_indirect_mapping = NULL;
+ svd->vdev_indirect_births = NULL;
+ svd->vdev_obsolete_sm = NULL;
+ svd->vdev_removing = 0;
+ svd->vdev_rebuilding = 0;
+
+ for (t = 0; t < TXG_SIZE; t++) {
+ while ((msp = txg_list_remove(&svd->vdev_ms_list, t)) != NULL)
+ (void) txg_list_add(&tvd->vdev_ms_list, msp, t);
+ while ((vd = txg_list_remove(&svd->vdev_dtl_list, t)) != NULL)
+ (void) txg_list_add(&tvd->vdev_dtl_list, vd, t);
+ if (txg_list_remove_this(&spa->spa_vdev_txg_list, svd, t))
+ (void) txg_list_add(&spa->spa_vdev_txg_list, tvd, t);
+ }
+
+ if (list_link_active(&svd->vdev_config_dirty_node)) {
+ vdev_config_clean(svd);
+ vdev_config_dirty(tvd);
+ }
+
+ if (list_link_active(&svd->vdev_state_dirty_node)) {
+ vdev_state_clean(svd);
+ vdev_state_dirty(tvd);
+ }
+
+ tvd->vdev_deflate_ratio = svd->vdev_deflate_ratio;
+ svd->vdev_deflate_ratio = 0;
+
+ tvd->vdev_islog = svd->vdev_islog;
+ svd->vdev_islog = 0;
+
+ dsl_scan_io_queue_vdev_xfer(svd, tvd);
+}
+
+static void
+vdev_top_update(vdev_t *tvd, vdev_t *vd)
+{
+ if (vd == NULL)
+ return;
+
+ vd->vdev_top = tvd;
+
+ for (int c = 0; c < vd->vdev_children; c++)
+ vdev_top_update(tvd, vd->vdev_child[c]);
+}
+
+/*
+ * Add a mirror/replacing vdev above an existing vdev. There is no need to
+ * call .vdev_op_init() since mirror/replacing vdevs do not have private state.
+ */
+vdev_t *
+vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops)
+{
+ spa_t *spa = cvd->vdev_spa;
+ vdev_t *pvd = cvd->vdev_parent;
+ vdev_t *mvd;
+
+ ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
+
+ mvd = vdev_alloc_common(spa, cvd->vdev_id, 0, ops);
+
+ mvd->vdev_asize = cvd->vdev_asize;
+ mvd->vdev_min_asize = cvd->vdev_min_asize;
+ mvd->vdev_max_asize = cvd->vdev_max_asize;
+ mvd->vdev_psize = cvd->vdev_psize;
+ mvd->vdev_ashift = cvd->vdev_ashift;
+ mvd->vdev_logical_ashift = cvd->vdev_logical_ashift;
+ mvd->vdev_physical_ashift = cvd->vdev_physical_ashift;
+ mvd->vdev_state = cvd->vdev_state;
+ mvd->vdev_crtxg = cvd->vdev_crtxg;
+
+ vdev_remove_child(pvd, cvd);
+ vdev_add_child(pvd, mvd);
+ cvd->vdev_id = mvd->vdev_children;
+ vdev_add_child(mvd, cvd);
+ vdev_top_update(cvd->vdev_top, cvd->vdev_top);
+
+ if (mvd == mvd->vdev_top)
+ vdev_top_transfer(cvd, mvd);
+
+ return (mvd);
+}
+
+/*
+ * Remove a 1-way mirror/replacing vdev from the tree.
+ */
+void
+vdev_remove_parent(vdev_t *cvd)
+{
+ vdev_t *mvd = cvd->vdev_parent;
+ vdev_t *pvd = mvd->vdev_parent;
+
+ ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);
+
+ ASSERT(mvd->vdev_children == 1);
+ ASSERT(mvd->vdev_ops == &vdev_mirror_ops ||
+ mvd->vdev_ops == &vdev_replacing_ops ||
+ mvd->vdev_ops == &vdev_spare_ops);
+ cvd->vdev_ashift = mvd->vdev_ashift;
+ cvd->vdev_logical_ashift = mvd->vdev_logical_ashift;
+ cvd->vdev_physical_ashift = mvd->vdev_physical_ashift;
+ vdev_remove_child(mvd, cvd);
+ vdev_remove_child(pvd, mvd);
+
+ /*
+ * If cvd will replace mvd as a top-level vdev, preserve mvd's guid.
+ * Otherwise, we could have detached an offline device, and when we
+ * go to import the pool we'll think we have two top-level vdevs,
+ * instead of a different version of the same top-level vdev.
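+ * Concretely, adding guid_delta below makes cvd take over mvd's guid
+ * (and keeps vdev_guid_sum consistent), while vdev_orig_guid records
+ * cvd's previous guid.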
+ */
+ if (mvd->vdev_top == mvd) {
+ uint64_t guid_delta = mvd->vdev_guid - cvd->vdev_guid;
+ cvd->vdev_orig_guid = cvd->vdev_guid;
+ cvd->vdev_guid += guid_delta;
+ cvd->vdev_guid_sum += guid_delta;
+
+ /*
+ * If the pool is not set to autoexpand, we also need to preserve
+ * mvd's asize to prevent automatic expansion of cvd.
+ * Otherwise, if we are adjusting the mirror by attaching and
+ * detaching children of non-uniform sizes, the mirror could
+ * autoexpand, unexpectedly requiring larger devices to
+ * re-establish the mirror.
+ */
+ if (!cvd->vdev_spa->spa_autoexpand)
+ cvd->vdev_asize = mvd->vdev_asize;
+ }
+ cvd->vdev_id = mvd->vdev_id;
+ vdev_add_child(pvd, cvd);
+ vdev_top_update(cvd->vdev_top, cvd->vdev_top);
+
+ if (cvd == cvd->vdev_top)
+ vdev_top_transfer(mvd, cvd);
+
+ ASSERT(mvd->vdev_children == 0);
+ vdev_free(mvd);
+}
+
+void
+vdev_metaslab_group_create(vdev_t *vd)
+{
+ spa_t *spa = vd->vdev_spa;
+
+ /*
+ * metaslab_group_create was delayed until the allocation bias was available.
+ */
+ if (vd->vdev_mg == NULL) {
+ metaslab_class_t *mc;
+
+ if (vd->vdev_islog && vd->vdev_alloc_bias == VDEV_BIAS_NONE)
+ vd->vdev_alloc_bias = VDEV_BIAS_LOG;
+
+ ASSERT3U(vd->vdev_islog, ==,
+ (vd->vdev_alloc_bias == VDEV_BIAS_LOG));
+
+ switch (vd->vdev_alloc_bias) {
+ case VDEV_BIAS_LOG:
+ mc = spa_log_class(spa);
+ break;
+ case VDEV_BIAS_SPECIAL:
+ mc = spa_special_class(spa);
+ break;
+ case VDEV_BIAS_DEDUP:
+ mc = spa_dedup_class(spa);
+ break;
+ default:
+ mc = spa_normal_class(spa);
+ }
+
+ vd->vdev_mg = metaslab_group_create(mc, vd,
+ spa->spa_alloc_count);
+
+ if (!vd->vdev_islog) {
+ vd->vdev_log_mg = metaslab_group_create(
+ spa_embedded_log_class(spa), vd, 1);
+ }
+
+ /*
+ * The spa ashift min/max only apply to the normal metaslab
+ * class. The metaslab class is bound late, so the ashift
+ * boundaries could not be set until now.
+ */
+ if (vd->vdev_top == vd && vd->vdev_ashift != 0 &&
+ mc == spa_normal_class(spa) && vd->vdev_aux == NULL) {
+ if (vd->vdev_ashift > spa->spa_max_ashift)
+ spa->spa_max_ashift = vd->vdev_ashift;
+ if (vd->vdev_ashift < spa->spa_min_ashift)
+ spa->spa_min_ashift = vd->vdev_ashift;
+
+ uint64_t min_alloc = vdev_get_min_alloc(vd);
+ if (min_alloc < spa->spa_min_alloc)
+ spa->spa_min_alloc = min_alloc;
+ }
+ }
+}
+
+int
+vdev_metaslab_init(vdev_t *vd, uint64_t txg)
+{
+ spa_t *spa = vd->vdev_spa;
+ uint64_t oldc = vd->vdev_ms_count;
+ uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift;
+ metaslab_t **mspp;
+ int error;
+ boolean_t expanding = (oldc != 0);
+
+ ASSERT(txg == 0 || spa_config_held(spa, SCL_ALLOC, RW_WRITER));
+
+ /*
+ * This vdev is not being allocated from yet or is a hole.
+ */
+ if (vd->vdev_ms_shift == 0)
+ return (0);
+
+ ASSERT(!vd->vdev_ishole);
+
+ ASSERT(oldc <= newc);
+
+ mspp = vmem_zalloc(newc * sizeof (*mspp), KM_SLEEP);
+
+ if (expanding) {
+ bcopy(vd->vdev_ms, mspp, oldc * sizeof (*mspp));
+ vmem_free(vd->vdev_ms, oldc * sizeof (*mspp));
+ }
+
+ vd->vdev_ms = mspp;
+ vd->vdev_ms_count = newc;
+
+ for (uint64_t m = oldc; m < newc; m++) {
+ uint64_t object = 0;
+ /*
+ * vdev_ms_array may be 0 if we are creating the "fake"
+ * metaslabs for an indirect vdev for zdb's leak detection.
+ * See zdb_leak_init().
+ */
+ if (txg == 0 && vd->vdev_ms_array != 0) {
+ error = dmu_read(spa->spa_meta_objset,
+ vd->vdev_ms_array,
+ m * sizeof (uint64_t), sizeof (uint64_t), &object,
+ DMU_READ_PREFETCH);
+ if (error != 0) {
+ vdev_dbgmsg(vd, "unable to read the metaslab "
+ "array [error=%d]", error);
+ return (error);
+ }
+ }
+
+ error = metaslab_init(vd->vdev_mg, m, object, txg,
+ &(vd->vdev_ms[m]));
+ if (error != 0) {
+ vdev_dbgmsg(vd, "metaslab_init failed [error=%d]",
+ error);
+ return (error);
+ }
+ }
+
+ /*
+ * Find the emptiest metaslab on the vdev and mark it for use as the
+ * embedded slog by moving it from the regular to the log metaslab
+ * group.
+ */
+ if (vd->vdev_mg->mg_class == spa_normal_class(spa) &&
+ vd->vdev_ms_count > zfs_embedded_slog_min_ms &&
+ avl_is_empty(&vd->vdev_log_mg->mg_metaslab_tree)) {
+ uint64_t slog_msid = 0;
+ uint64_t smallest = UINT64_MAX;
+
+ /*
+ * Note that we only search the new metaslabs, because the old
+ * (pre-existing) ones may be active (e.g. have non-empty
+ * range_trees), and we don't move them to the new
+ * metaslab_t.
+ */
+ for (uint64_t m = oldc; m < newc; m++) {
+ uint64_t alloc =
+ space_map_allocated(vd->vdev_ms[m]->ms_sm);
+ if (alloc < smallest) {
+ slog_msid = m;
+ smallest = alloc;
+ }
+ }
+ metaslab_t *slog_ms = vd->vdev_ms[slog_msid];
+ /*
+ * The metaslab was marked as dirty at the end of
+ * metaslab_init(). Remove it from the dirty list so that we
+ * can uninitialize and reinitialize it to the new class.
+ */
+ if (txg != 0) {
+ (void) txg_list_remove_this(&vd->vdev_ms_list,
+ slog_ms, txg);
+ }
+ uint64_t sm_obj = space_map_object(slog_ms->ms_sm);
+ metaslab_fini(slog_ms);
+ VERIFY0(metaslab_init(vd->vdev_log_mg, slog_msid, sm_obj, txg,
+ &vd->vdev_ms[slog_msid]));
+ }
+
+ if (txg == 0)
+ spa_config_enter(spa, SCL_ALLOC, FTAG, RW_WRITER);
+
+ /*
+ * If the vdev is being removed we don't activate
+ * the metaslabs since we want to ensure that no new
+ * allocations are performed on this device.
+ */
+ if (!expanding && !vd->vdev_removing) {
+ metaslab_group_activate(vd->vdev_mg);
+ if (vd->vdev_log_mg != NULL)
+ metaslab_group_activate(vd->vdev_log_mg);
+ }
+
+ if (txg == 0)
+ spa_config_exit(spa, SCL_ALLOC, FTAG);
+
+ /*
+ * Regardless of whether this vdev was just added or it is being
+ * expanded, the metaslab count has changed. Recalculate the
+ * block limit.
+ */
+ spa_log_sm_set_blocklimit(spa);
+
+ return (0);
+}
+
+void
+vdev_metaslab_fini(vdev_t *vd)
+{
+ if (vd->vdev_checkpoint_sm != NULL) {
+ ASSERT(spa_feature_is_active(vd->vdev_spa,
+ SPA_FEATURE_POOL_CHECKPOINT));
+ space_map_close(vd->vdev_checkpoint_sm);
+ /*
+ * Even though we close the space map, we need to set its
+ * pointer to NULL. The reason is that vdev_metaslab_fini()
+ * may be called multiple times for certain operations
+ * (e.g. when destroying a pool), so we need to ensure that
+ * this clause never executes twice. This logic is similar
+ * to the one used for the vdev_ms clause below.
+ */
+ vd->vdev_checkpoint_sm = NULL;
+ }
+
+ if (vd->vdev_ms != NULL) {
+ metaslab_group_t *mg = vd->vdev_mg;
+
+ metaslab_group_passivate(mg);
+ if (vd->vdev_log_mg != NULL) {
+ ASSERT(!vd->vdev_islog);
+ metaslab_group_passivate(vd->vdev_log_mg);
+ }
+
+ uint64_t count = vd->vdev_ms_count;
+ for (uint64_t m = 0; m < count; m++) {
+ metaslab_t *msp = vd->vdev_ms[m];
+ if (msp != NULL)
+ metaslab_fini(msp);
+ }
+ vmem_free(vd->vdev_ms, count * sizeof (metaslab_t *));
+ vd->vdev_ms = NULL;
+ vd->vdev_ms_count = 0;
+
+ for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) {
+ ASSERT0(mg->mg_histogram[i]);
+ if (vd->vdev_log_mg != NULL)
+ ASSERT0(vd->vdev_log_mg->mg_histogram[i]);
+ }
+ }
+ ASSERT0(vd->vdev_ms_count);
+ ASSERT3U(vd->vdev_pending_fastwrite, ==, 0);
+}
+
+typedef struct vdev_probe_stats {
+ boolean_t vps_readable;
+ boolean_t vps_writeable;
+ int vps_flags;
+} vdev_probe_stats_t;
+
+static void
+vdev_probe_done(zio_t *zio)
+{
+ spa_t *spa = zio->io_spa;
+ vdev_t *vd = zio->io_vd;
+ vdev_probe_stats_t *vps = zio->io_private;
+
+ ASSERT(vd->vdev_probe_zio != NULL);
+
+ if (zio->io_type == ZIO_TYPE_READ) {
+ if (zio->io_error == 0)
+ vps->vps_readable = 1;
+ if (zio->io_error == 0 && spa_writeable(spa)) {
+ zio_nowait(zio_write_phys(vd->vdev_probe_zio, vd,
+ zio->io_offset, zio->io_size, zio->io_abd,
+ ZIO_CHECKSUM_OFF, vdev_probe_done, vps,
+ ZIO_PRIORITY_SYNC_WRITE, vps->vps_flags, B_TRUE));
+ } else {
+ abd_free(zio->io_abd);
+ }
+ } else if (zio->io_type == ZIO_TYPE_WRITE) {
+ if (zio->io_error == 0)
+ vps->vps_writeable = 1;
+ abd_free(zio->io_abd);
+ } else if (zio->io_type == ZIO_TYPE_NULL) {
+ zio_t *pio;
+ zio_link_t *zl;
+
+ vd->vdev_cant_read |= !vps->vps_readable;
+ vd->vdev_cant_write |= !vps->vps_writeable;
+
+ if (vdev_readable(vd) &&
+ (vdev_writeable(vd) || !spa_writeable(spa))) {
+ zio->io_error = 0;
+ } else {
+ ASSERT(zio->io_error != 0);
+ vdev_dbgmsg(vd, "failed probe");
+ (void) zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE,
+ spa, vd, NULL, NULL, 0);
+ zio->io_error = SET_ERROR(ENXIO);
+ }
+
+ mutex_enter(&vd->vdev_probe_lock);
+ ASSERT(vd->vdev_probe_zio == zio);
+ vd->vdev_probe_zio = NULL;
+ mutex_exit(&vd->vdev_probe_lock);
+
+ zl = NULL;
+ while ((pio = zio_walk_parents(zio, &zl)) != NULL)
+ if (!vdev_accessible(vd, pio))
+ pio->io_error = SET_ERROR(ENXIO);
+
+ kmem_free(vps, sizeof (*vps));
+ }
+}
+
+/*
+ * Determine whether this device is accessible.
+ *
+ * Read and write to several known locations: the pad regions of each
+ * vdev label but the first, which we leave alone in case it contains
+ * a VTOC.
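+ *
+ * Concretely, the loop at the bottom of this function issues a
+ * VDEV_PAD_SIZE read from the vl_be pad of labels 1 through
+ * VDEV_LABELS - 1; when a read succeeds and the pool is writeable,
+ * vdev_probe_done() writes the same data back to the same offset.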
+ */
+zio_t *
+vdev_probe(vdev_t *vd, zio_t *zio)
+{
+ spa_t *spa = vd->vdev_spa;
+ vdev_probe_stats_t *vps = NULL;
+ zio_t *pio;
+
+ ASSERT(vd->vdev_ops->vdev_op_leaf);
+
+ /*
+ * Don't probe the probe.
+ */
+ if (zio && (zio->io_flags & ZIO_FLAG_PROBE))
+ return (NULL);
+
+ /*
+ * To prevent 'probe storms' when a device fails, we create
+ * just one probe i/o at a time. All zios that want to probe
+ * this vdev will become parents of the probe io.
+ */
+ mutex_enter(&vd->vdev_probe_lock);
+
+ if ((pio = vd->vdev_probe_zio) == NULL) {
+ vps = kmem_zalloc(sizeof (*vps), KM_SLEEP);
+
+ vps->vps_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_PROBE |
+ ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE |
+ ZIO_FLAG_TRYHARD;
+
+ if (spa_config_held(spa, SCL_ZIO, RW_WRITER)) {
+ /*
+ * vdev_cant_read and vdev_cant_write can only
+ * transition from TRUE to FALSE when we have the
+ * SCL_ZIO lock as writer; otherwise they can only
+ * transition from FALSE to TRUE. This ensures that
+ * any zio looking at these values can assume that
+ * failures persist for the life of the I/O. That's
+ * important because when a device has intermittent
+ * connectivity problems, we want to ensure that
+ * they're ascribed to the device (ENXIO) and not
+ * the zio (EIO).
+ *
+ * Since we hold SCL_ZIO as writer here, clear both
+ * values so the probe can reevaluate from first
+ * principles.
+ */
+ vps->vps_flags |= ZIO_FLAG_CONFIG_WRITER;
+ vd->vdev_cant_read = B_FALSE;
+ vd->vdev_cant_write = B_FALSE;
+ }
+
+ vd->vdev_probe_zio = pio = zio_null(NULL, spa, vd,
+ vdev_probe_done, vps,
+ vps->vps_flags | ZIO_FLAG_DONT_PROPAGATE);
+
+ /*
+ * We can't change the vdev state in this context, so we
+ * kick off an async task to do it on our behalf.
+ */
+ if (zio != NULL) {
+ vd->vdev_probe_wanted = B_TRUE;
+ spa_async_request(spa, SPA_ASYNC_PROBE);
+ }
+ }
+
+ if (zio != NULL)
+ zio_add_child(zio, pio);
+
+ mutex_exit(&vd->vdev_probe_lock);
+
+ if (vps == NULL) {
+ ASSERT(zio != NULL);
+ return (NULL);
+ }
+
+ for (int l = 1; l < VDEV_LABELS; l++) {
+ zio_nowait(zio_read_phys(pio, vd,
+ vdev_label_offset(vd->vdev_psize, l,
+ offsetof(vdev_label_t, vl_be)), VDEV_PAD_SIZE,
+ abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE),
+ ZIO_CHECKSUM_OFF, vdev_probe_done, vps,
+ ZIO_PRIORITY_SYNC_READ, vps->vps_flags, B_TRUE));
+ }
+
+ if (zio == NULL)
+ return (pio);
+
+ zio_nowait(pio);
+ return (NULL);
+}
+
+static void
+vdev_load_child(void *arg)
+{
+ vdev_t *vd = arg;
+
+ vd->vdev_load_error = vdev_load(vd);
+}
+
+static void
+vdev_open_child(void *arg)
+{
+ vdev_t *vd = arg;
+
+ vd->vdev_open_thread = curthread;
+ vd->vdev_open_error = vdev_open(vd);
+ vd->vdev_open_thread = NULL;
+}
+
+static boolean_t
+vdev_uses_zvols(vdev_t *vd)
+{
+#ifdef _KERNEL
+ if (zvol_is_zvol(vd->vdev_path))
+ return (B_TRUE);
+#endif
+
+ for (int c = 0; c < vd->vdev_children; c++)
+ if (vdev_uses_zvols(vd->vdev_child[c]))
+ return (B_TRUE);
+
+ return (B_FALSE);
+}
+
+/*
+ * Returns B_TRUE if the passed child should be opened.
+ */
+static boolean_t
+vdev_default_open_children_func(vdev_t *vd)
+{
+ return (B_TRUE);
+}
+
+/*
+ * Open the requested child vdevs. If any of the leaf vdevs are using
+ * a ZFS volume then do the opens in a single thread. This avoids a
+ * deadlock when the current thread is holding the spa_namespace_lock.
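+ * Otherwise each child is opened via vdev_open_child() on the
+ * "vdev_open" taskq, allowing the opens to proceed in parallel.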
+ */
+static void
+vdev_open_children_impl(vdev_t *vd, vdev_open_children_func_t *open_func)
+{
+ int children = vd->vdev_children;
+
+ taskq_t *tq = taskq_create("vdev_open", children, minclsyspri,
+ children, children, TASKQ_PREPOPULATE);
+ vd->vdev_nonrot = B_TRUE;
+
+ for (int c = 0; c < children; c++) {
+ vdev_t *cvd = vd->vdev_child[c];
+
+ if (open_func(cvd) == B_FALSE)
+ continue;
+
+ if (tq == NULL || vdev_uses_zvols(vd)) {
+ cvd->vdev_open_error = vdev_open(cvd);
+ } else {
+ VERIFY(taskq_dispatch(tq, vdev_open_child,
+ cvd, TQ_SLEEP) != TASKQID_INVALID);
+ }
+
+ vd->vdev_nonrot &= cvd->vdev_nonrot;
+ }
+
+ if (tq != NULL) {
+ taskq_wait(tq);
+ taskq_destroy(tq);
+ }
+}
+
+/*
+ * Open all child vdevs.
+ */
+void
+vdev_open_children(vdev_t *vd)
+{
+ vdev_open_children_impl(vd, vdev_default_open_children_func);
+}
+
+/*
+ * Conditionally open a subset of child vdevs.
+ */
+void
+vdev_open_children_subset(vdev_t *vd, vdev_open_children_func_t *open_func)
+{
+ vdev_open_children_impl(vd, open_func);
+}
+
+/*
+ * Compute the raidz-deflation ratio. Note that we hard-code
+ * 128k (1 << 17) because it is the "typical" blocksize.
+ * Even though SPA_MAXBLOCKSIZE has changed, this algorithm cannot change;
+ * otherwise it would inconsistently account for existing bp's.
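+ *
+ * For example (illustrative, assuming vdev_psize_to_asize() is the
+ * identity for the vdev in question): a 128k block maps to
+ * 128k >> SPA_MINBLOCKSHIFT = 256 512-byte sectors, giving a
+ * deflate ratio of 131072 / 256 = 512; raidz vdevs yield a smaller
+ * value, reflecting their parity and padding overhead.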
+ */
+static void
+vdev_set_deflate_ratio(vdev_t *vd)
+{
+ if (vd == vd->vdev_top && !vd->vdev_ishole && vd->vdev_ashift != 0) {
+ vd->vdev_deflate_ratio = (1 << 17) /
+ (vdev_psize_to_asize(vd, 1 << 17) >> SPA_MINBLOCKSHIFT);
+ }
+}
+
+/*
+ * Maximize performance by inflating the configured ashift for top level
+ * vdevs to be as close to the physical ashift as possible while maintaining
+ * administrator defined limits and ensuring it doesn't go below the
+ * logical ashift.
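+ *
+ * For example (tunable values assumed for illustration): a device
+ * reporting logical_ashift = 9 and physical_ashift = 12, with
+ * zfs_vdev_min_auto_ashift = 9 and zfs_vdev_max_auto_ashift = 14,
+ * ends up with vdev_ashift = MIN(MAX(14, 9), MAX(9, 12)) = 12.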
+ */
+static void
+vdev_ashift_optimize(vdev_t *vd)
+{
+ ASSERT(vd == vd->vdev_top);
+
+ if (vd->vdev_ashift < vd->vdev_physical_ashift) {
+ vd->vdev_ashift = MIN(
+ MAX(zfs_vdev_max_auto_ashift, vd->vdev_ashift),
+ MAX(zfs_vdev_min_auto_ashift,
+ vd->vdev_physical_ashift));
+ } else {
+ /*
+ * If the logical and physical ashifts are the same, then
+ * we ensure that the top-level vdev's ashift is not smaller
+ * than our minimum ashift value. For the unusual case
+ * where logical ashift > physical ashift, we can't cap
+ * the calculated ashift based on max ashift as that
+ * would cause failures.
+ * We still check if we need to increase it to match
+ * the min ashift.
+ */
+ vd->vdev_ashift = MAX(zfs_vdev_min_auto_ashift,
+ vd->vdev_ashift);
+ }
+}
+
+/*
+ * Prepare a virtual device for access.
+ */
+int
+vdev_open(vdev_t *vd)
+{
+ spa_t *spa = vd->vdev_spa;
+ int error;
+ uint64_t osize = 0;
+ uint64_t max_osize = 0;
+ uint64_t asize, max_asize, psize;
+ uint64_t logical_ashift = 0;
+ uint64_t physical_ashift = 0;
+
+ ASSERT(vd->vdev_open_thread == curthread ||
+ spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
+ ASSERT(vd->vdev_state == VDEV_STATE_CLOSED ||
+ vd->vdev_state == VDEV_STATE_CANT_OPEN ||
+ vd->vdev_state == VDEV_STATE_OFFLINE);
+
+ vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
+ vd->vdev_cant_read = B_FALSE;
+ vd->vdev_cant_write = B_FALSE;
+ vd->vdev_min_asize = vdev_get_min_asize(vd);
+
+ /*
+ * If this vdev is not removed, check its fault status. If it's
+ * faulted, bail out of the open.
+ */
+ if (!vd->vdev_removed && vd->vdev_faulted) {
+ ASSERT(vd->vdev_children == 0);
+ ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED ||
+ vd->vdev_label_aux == VDEV_AUX_EXTERNAL);
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
+ vd->vdev_label_aux);
+ return (SET_ERROR(ENXIO));
+ } else if (vd->vdev_offline) {
+ ASSERT(vd->vdev_children == 0);
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, VDEV_AUX_NONE);
+ return (SET_ERROR(ENXIO));
+ }
+
+ error = vd->vdev_ops->vdev_op_open(vd, &osize, &max_osize,
+ &logical_ashift, &physical_ashift);
+ /*
+ * Physical volume size should never be larger than its max size, unless
+ * the disk has shrunk while we were reading it or the device is buggy
+ * or damaged: either way it's not safe to use, so bail out of the open.
+ */
+ if (osize > max_osize) {
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_OPEN_FAILED);
+ return (SET_ERROR(ENXIO));
+ }
+
+ /*
+ * Reset the vdev_reopening flag so that we actually close
+ * the vdev on error.
+ */
+ vd->vdev_reopening = B_FALSE;
+ if (zio_injection_enabled && error == 0)
+ error = zio_handle_device_injection(vd, NULL, SET_ERROR(ENXIO));
+
+ if (error) {
+ if (vd->vdev_removed &&
+ vd->vdev_stat.vs_aux != VDEV_AUX_OPEN_FAILED)
+ vd->vdev_removed = B_FALSE;
+
+ if (vd->vdev_stat.vs_aux == VDEV_AUX_CHILDREN_OFFLINE) {
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE,
+ vd->vdev_stat.vs_aux);
+ } else {
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ vd->vdev_stat.vs_aux);
+ }
+ return (error);
+ }
+
+ vd->vdev_removed = B_FALSE;
+
+ /*
+ * Recheck the faulted flag now that we have confirmed that
+ * the vdev is accessible. If we're faulted, bail.
+ */
+ if (vd->vdev_faulted) {
+ ASSERT(vd->vdev_children == 0);
+ ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED ||
+ vd->vdev_label_aux == VDEV_AUX_EXTERNAL);
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
+ vd->vdev_label_aux);
+ return (SET_ERROR(ENXIO));
+ }
+
+ if (vd->vdev_degraded) {
+ ASSERT(vd->vdev_children == 0);
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED,
+ VDEV_AUX_ERR_EXCEEDED);
+ } else {
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_HEALTHY, 0);
+ }
+
+ /*
+ * For hole or missing vdevs we just return success.
+ */
+ if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops)
+ return (0);
+
+ for (int c = 0; c < vd->vdev_children; c++) {
+ if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) {
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED,
+ VDEV_AUX_NONE);
+ break;
+ }
+ }
+
+ osize = P2ALIGN(osize, (uint64_t)sizeof (vdev_label_t));
+ max_osize = P2ALIGN(max_osize, (uint64_t)sizeof (vdev_label_t));
+
+ if (vd->vdev_children == 0) {
+ if (osize < SPA_MINDEVSIZE) {
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_TOO_SMALL);
+ return (SET_ERROR(EOVERFLOW));
+ }
+ psize = osize;
+ asize = osize - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE);
+ max_asize = max_osize - (VDEV_LABEL_START_SIZE +
+ VDEV_LABEL_END_SIZE);
+ } else {
+ if (vd->vdev_parent != NULL && osize < SPA_MINDEVSIZE -
+ (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE)) {
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_TOO_SMALL);
+ return (SET_ERROR(EOVERFLOW));
+ }
+ psize = 0;
+ asize = osize;
+ max_asize = max_osize;
+ }
+
+ /*
+ * If the vdev was expanded, record this so that we can re-create the
+ * uberblock rings in labels {2,3} during the next sync.
+ */
+ if ((psize > vd->vdev_psize) && (vd->vdev_psize != 0))
+ vd->vdev_copy_uberblocks = B_TRUE;
+
+ vd->vdev_psize = psize;
+
+ /*
+ * Make sure the allocatable size hasn't shrunk too much.
+ */
+ if (asize < vd->vdev_min_asize) {
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_BAD_LABEL);
+ return (SET_ERROR(EINVAL));
+ }
+
+ /*
+ * We can always set the logical/physical ashift members since
+ * their values are only used to calculate the vdev_ashift when
+ * the device is first added to the config. These values should
+ * not be used for anything else since they may change whenever
+ * the device is reopened and we don't store them in the label.
+ */
+ vd->vdev_physical_ashift =
+ MAX(physical_ashift, vd->vdev_physical_ashift);
+ vd->vdev_logical_ashift = MAX(logical_ashift,
+ vd->vdev_logical_ashift);
+
+ if (vd->vdev_asize == 0) {
+ /*
+ * This is the first-ever open, so use the computed values.
+ * For compatibility, a different ashift can be requested.
+ */
+ vd->vdev_asize = asize;
+ vd->vdev_max_asize = max_asize;
+
+ /*
+ * If the vdev_ashift was not overridden at creation time,
+ * then set it to the logical ashift and optimize the ashift.
+ */
+ if (vd->vdev_ashift == 0) {
+ vd->vdev_ashift = vd->vdev_logical_ashift;
+
+ if (vd->vdev_logical_ashift > ASHIFT_MAX) {
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_ASHIFT_TOO_BIG);
+ return (SET_ERROR(EDOM));
+ }
+
+ if (vd->vdev_top == vd) {
+ vdev_ashift_optimize(vd);
+ }
+ }
+ if (vd->vdev_ashift != 0 && (vd->vdev_ashift < ASHIFT_MIN ||
+ vd->vdev_ashift > ASHIFT_MAX)) {
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_BAD_ASHIFT);
+ return (SET_ERROR(EDOM));
+ }
+ } else {
+ /*
+ * Make sure the alignment required hasn't increased.
+ */
+ if (vd->vdev_ashift > vd->vdev_top->vdev_ashift &&
+ vd->vdev_ops->vdev_op_leaf) {
+ (void) zfs_ereport_post(
+ FM_EREPORT_ZFS_DEVICE_BAD_ASHIFT,
+ spa, vd, NULL, NULL, 0);
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_BAD_LABEL);
+ return (SET_ERROR(EDOM));
+ }
+ vd->vdev_max_asize = max_asize;
+ }
+
+ /*
+ * If all children are healthy we update asize if either:
+ * The asize has increased, due to a device expansion caused by dynamic
+ * LUN growth or vdev replacement, and automatic expansion is enabled,
+ * making the additional space available.
+ *
+ * The asize has decreased, due to a device shrink usually caused by a
+ * vdev replace with a smaller device. This ensures that calculations
+ * based on max_asize and asize (e.g. esize) are always valid. It's safe
+ * to do this as we've already validated that asize is greater than
+ * vdev_min_asize.
+ */
+ if (vd->vdev_state == VDEV_STATE_HEALTHY &&
+ ((asize > vd->vdev_asize &&
+ (vd->vdev_expanding || spa->spa_autoexpand)) ||
+ (asize < vd->vdev_asize)))
+ vd->vdev_asize = asize;
+
+ vdev_set_min_asize(vd);
+
+ /*
+ * Ensure we can issue some IO before declaring the
+ * vdev open for business.
+ */
+ if (vd->vdev_ops->vdev_op_leaf &&
+ (error = zio_wait(vdev_probe(vd, NULL))) != 0) {
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
+ VDEV_AUX_ERR_EXCEEDED);
+ return (error);
+ }
+
+ /*
+ * Track the minimum allocation size.
+ */
+ if (vd->vdev_top == vd && vd->vdev_ashift != 0 &&
+ vd->vdev_islog == 0 && vd->vdev_aux == NULL) {
+ uint64_t min_alloc = vdev_get_min_alloc(vd);
+ if (min_alloc < spa->spa_min_alloc)
+ spa->spa_min_alloc = min_alloc;
+ }
+
+ /*
+ * If this is a leaf vdev, assess whether a resilver is needed.
+ * But don't do this if we are doing a reopen for a scrub, since
+ * this would just restart the scrub we are already doing.
+ */
+ if (vd->vdev_ops->vdev_op_leaf && !spa->spa_scrub_reopen)
+ dsl_scan_assess_vdev(spa->spa_dsl_pool, vd);
+
+ return (0);
+}
+
+static void
+vdev_validate_child(void *arg)
+{
+ vdev_t *vd = arg;
+
+ vd->vdev_validate_thread = curthread;
+ vd->vdev_validate_error = vdev_validate(vd);
+ vd->vdev_validate_thread = NULL;
+}
+
+/*
+ * Called once the vdevs are all opened, this routine validates the label
+ * contents. This needs to be done before vdev_load() so that we don't
+ * inadvertently do repair I/Os to the wrong device.
+ *
+ * This function will only return failure if one of the vdevs indicates that it
+ * has since been destroyed or exported. This is only possible if
+ * /etc/zfs/zpool.cache was readonly at the time. Otherwise, the vdev state
+ * will be updated but the function will return 0.
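+ *
+ * Children are validated recursively; where possible they are dispatched
+ * in parallel on a "vdev_validate" taskq, falling back to the caller's
+ * thread for zvol-backed vdevs (see vdev_uses_zvols()).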
+ */
+int
+vdev_validate(vdev_t *vd)
+{
+ spa_t *spa = vd->vdev_spa;
+ taskq_t *tq = NULL;
+ nvlist_t *label;
+ uint64_t guid = 0, aux_guid = 0, top_guid;
+ uint64_t state;
+ nvlist_t *nvl;
+ uint64_t txg;
+ int children = vd->vdev_children;
+
+ if (vdev_validate_skip)
+ return (0);
+
+ if (children > 0) {
+ tq = taskq_create("vdev_validate", children, minclsyspri,
+ children, children, TASKQ_PREPOPULATE);
+ }
+
+ for (uint64_t c = 0; c < children; c++) {
+ vdev_t *cvd = vd->vdev_child[c];
+
+ if (tq == NULL || vdev_uses_zvols(cvd)) {
+ vdev_validate_child(cvd);
+ } else {
+ VERIFY(taskq_dispatch(tq, vdev_validate_child, cvd,
+ TQ_SLEEP) != TASKQID_INVALID);
+ }
+ }
+ if (tq != NULL) {
+ taskq_wait(tq);
+ taskq_destroy(tq);
+ }
+ for (int c = 0; c < children; c++) {
+ int error = vd->vdev_child[c]->vdev_validate_error;
+
+ if (error != 0)
+ return (SET_ERROR(EBADF));
+ }
+
+
+ /*
+ * If the device has already failed, or was marked offline, don't do
+ * any further validation. Otherwise, label I/O will fail and we will
+ * overwrite the previous state.
+ */
+ if (!vd->vdev_ops->vdev_op_leaf || !vdev_readable(vd))
+ return (0);
+
+ /*
+ * If we are performing an extreme rewind, we allow for a label that
+ * was modified at a point after the current txg.
+ * If the config lock is not held, do not check the txg: spa_sync could
+ * be updating the vdev's label before updating spa_last_synced_txg.
+ */
+ if (spa->spa_extreme_rewind || spa_last_synced_txg(spa) == 0 ||
+ spa_config_held(spa, SCL_CONFIG, RW_WRITER) != SCL_CONFIG)
+ txg = UINT64_MAX;
+ else
+ txg = spa_last_synced_txg(spa);
+
+ if ((label = vdev_label_read_config(vd, txg)) == NULL) {
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_BAD_LABEL);
+ vdev_dbgmsg(vd, "vdev_validate: failed reading config for "
+ "txg %llu", (u_longlong_t)txg);
+ return (0);
+ }
+
+ /*
+ * Determine if this vdev has been split off into another
+ * pool. If so, then refuse to open it.
+ */
+ if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_SPLIT_GUID,
+ &aux_guid) == 0 && aux_guid == spa_guid(spa)) {
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_SPLIT_POOL);
+ nvlist_free(label);
+ vdev_dbgmsg(vd, "vdev_validate: vdev split into other pool");
+ return (0);
+ }
+
+ if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID, &guid) != 0) {
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ nvlist_free(label);
+ vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label",
+ ZPOOL_CONFIG_POOL_GUID);
+ return (0);
+ }
+
+ /*
+ * If config is not trusted then ignore the spa guid check. This is
+ * necessary because if the machine crashed during a re-guid the new
+ * guid might have been written to all of the vdev labels, but not the
+ * cached config. The check will be performed again once we have the
+ * trusted config from the MOS.
+ */
+ if (spa->spa_trust_config && guid != spa_guid(spa)) {
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ nvlist_free(label);
+ vdev_dbgmsg(vd, "vdev_validate: vdev label pool_guid doesn't "
+ "match config (%llu != %llu)", (u_longlong_t)guid,
+ (u_longlong_t)spa_guid(spa));
+ return (0);
+ }
+
+ if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_VDEV_TREE, &nvl)
+ != 0 || nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_ORIG_GUID,
+ &aux_guid) != 0)
+ aux_guid = 0;
+
+ if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0) {
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ nvlist_free(label);
+ vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label",
+ ZPOOL_CONFIG_GUID);
+ return (0);
+ }
+
+ if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_TOP_GUID, &top_guid)
+ != 0) {
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ nvlist_free(label);
+ vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label",
+ ZPOOL_CONFIG_TOP_GUID);
+ return (0);
+ }
+
+ /*
+ * If this vdev just became a top-level vdev because its sibling was
+ * detached, it will have adopted the parent's vdev guid -- but the
+ * label may or may not be on disk yet. Fortunately, either version
+ * of the label will have the same top guid, so if we're a top-level
+ * vdev, we can safely compare to that instead.
+ * However, if the config comes from a cachefile that failed to update
+ * after the detach, a top-level vdev will appear as a non top-level
+ * vdev in the config. Also relax the constraints if we perform an
+ * extreme rewind.
+ *
+ * If we split this vdev off instead, then we also check the
+ * original pool's guid. We don't want to consider the vdev
+ * corrupt if it is partway through a split operation.
+ */
+ if (vd->vdev_guid != guid && vd->vdev_guid != aux_guid) {
+ boolean_t mismatch = B_FALSE;
+ if (spa->spa_trust_config && !spa->spa_extreme_rewind) {
+ if (vd != vd->vdev_top || vd->vdev_guid != top_guid)
+ mismatch = B_TRUE;
+ } else {
+ if (vd->vdev_guid != top_guid &&
+ vd->vdev_top->vdev_guid != guid)
+ mismatch = B_TRUE;
+ }
+
+ if (mismatch) {
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ nvlist_free(label);
+ vdev_dbgmsg(vd, "vdev_validate: config guid "
+ "doesn't match label guid");
+ vdev_dbgmsg(vd, "CONFIG: guid %llu, top_guid %llu",
+ (u_longlong_t)vd->vdev_guid,
+ (u_longlong_t)vd->vdev_top->vdev_guid);
+ vdev_dbgmsg(vd, "LABEL: guid %llu, top_guid %llu, "
+ "aux_guid %llu", (u_longlong_t)guid,
+ (u_longlong_t)top_guid, (u_longlong_t)aux_guid);
+ return (0);
+ }
+ }
+
+ if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE,
+ &state) != 0) {
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ nvlist_free(label);
+ vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label",
+ ZPOOL_CONFIG_POOL_STATE);
+ return (0);
+ }
+
+ nvlist_free(label);
+
+ /*
+ * If this is a verbatim import, no need to check the
+ * state of the pool.
+ */
+ if (!(spa->spa_import_flags & ZFS_IMPORT_VERBATIM) &&
+ spa_load_state(spa) == SPA_LOAD_OPEN &&
+ state != POOL_STATE_ACTIVE) {
+ vdev_dbgmsg(vd, "vdev_validate: invalid pool state (%llu) "
+ "for spa %s", (u_longlong_t)state, spa->spa_name);
+ return (SET_ERROR(EBADF));
+ }
+
+ /*
+ * If we were able to open and validate a vdev that was
+ * previously marked permanently unavailable, clear that state
+ * now.
+ */
+ if (vd->vdev_not_present)
+ vd->vdev_not_present = 0;
+
+ return (0);
+}
+
+static void
+vdev_copy_path_impl(vdev_t *svd, vdev_t *dvd)
+{
+ if (svd->vdev_path != NULL && dvd->vdev_path != NULL) {
+ if (strcmp(svd->vdev_path, dvd->vdev_path) != 0) {
+ zfs_dbgmsg("vdev_copy_path: vdev %llu: path changed "
+ "from '%s' to '%s'", (u_longlong_t)dvd->vdev_guid,
+ dvd->vdev_path, svd->vdev_path);
+ spa_strfree(dvd->vdev_path);
+ dvd->vdev_path = spa_strdup(svd->vdev_path);
+ }
+ } else if (svd->vdev_path != NULL) {
+ dvd->vdev_path = spa_strdup(svd->vdev_path);
+ zfs_dbgmsg("vdev_copy_path: vdev %llu: path set to '%s'",
+ (u_longlong_t)dvd->vdev_guid, dvd->vdev_path);
+ }
+}
+
+/*
+ * Recursively copy vdev paths from one vdev to another. Source and destination
+ * vdev trees must have the same geometry, otherwise an error is returned.
+ * Intended to copy paths from userland config into MOS config.
+ */
+int
+vdev_copy_path_strict(vdev_t *svd, vdev_t *dvd)
+{
+ if ((svd->vdev_ops == &vdev_missing_ops) ||
+ (svd->vdev_ishole && dvd->vdev_ishole) ||
+ (dvd->vdev_ops == &vdev_indirect_ops))
+ return (0);
+
+ if (svd->vdev_ops != dvd->vdev_ops) {
+ vdev_dbgmsg(svd, "vdev_copy_path: vdev type mismatch: %s != %s",
+ svd->vdev_ops->vdev_op_type, dvd->vdev_ops->vdev_op_type);
+ return (SET_ERROR(EINVAL));
+ }
+
+ if (svd->vdev_guid != dvd->vdev_guid) {
+ vdev_dbgmsg(svd, "vdev_copy_path: guids mismatch (%llu != "
+ "%llu)", (u_longlong_t)svd->vdev_guid,
+ (u_longlong_t)dvd->vdev_guid);
+ return (SET_ERROR(EINVAL));
+ }
+
+ if (svd->vdev_children != dvd->vdev_children) {
+ vdev_dbgmsg(svd, "vdev_copy_path: children count mismatch: "
+ "%llu != %llu", (u_longlong_t)svd->vdev_children,
+ (u_longlong_t)dvd->vdev_children);
+ return (SET_ERROR(EINVAL));
+ }
+
+ for (uint64_t i = 0; i < svd->vdev_children; i++) {
+ int error = vdev_copy_path_strict(svd->vdev_child[i],
+ dvd->vdev_child[i]);
+ if (error != 0)
+ return (error);
+ }
+
+ if (svd->vdev_ops->vdev_op_leaf)
+ vdev_copy_path_impl(svd, dvd);
+
+ return (0);
+}
+
+static void
+vdev_copy_path_search(vdev_t *stvd, vdev_t *dvd)
+{
+ ASSERT(stvd->vdev_top == stvd);
+ ASSERT3U(stvd->vdev_id, ==, dvd->vdev_top->vdev_id);
+
+ for (uint64_t i = 0; i < dvd->vdev_children; i++) {
+ vdev_copy_path_search(stvd, dvd->vdev_child[i]);
+ }
+
+ if (!dvd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(dvd))
+ return;
+
+ /*
+ * The idea here is that while a vdev can shift positions within
+ * a top vdev (when replacing, attaching a mirror, etc.), it cannot
+ * step outside of it.
+ */
+ vdev_t *vd = vdev_lookup_by_guid(stvd, dvd->vdev_guid);
+
+ if (vd == NULL || vd->vdev_ops != dvd->vdev_ops)
+ return;
+
+ ASSERT(vd->vdev_ops->vdev_op_leaf);
+
+ vdev_copy_path_impl(vd, dvd);
+}
+
+/*
+ * Recursively copy vdev paths from one root vdev to another. Source and
+ * destination vdev trees may differ in geometry. For each destination leaf
+ * vdev, search for a vdev with the same guid and top vdev id in the source.
+ * Intended to copy paths from userland config into MOS config.
+ */
+void
+vdev_copy_path_relaxed(vdev_t *srvd, vdev_t *drvd)
+{
+ uint64_t children = MIN(srvd->vdev_children, drvd->vdev_children);
+ ASSERT(srvd->vdev_ops == &vdev_root_ops);
+ ASSERT(drvd->vdev_ops == &vdev_root_ops);
+
+ for (uint64_t i = 0; i < children; i++) {
+ vdev_copy_path_search(srvd->vdev_child[i],
+ drvd->vdev_child[i]);
+ }
+}
+
+/*
+ * Close a virtual device.
+ */
+void
+vdev_close(vdev_t *vd)
+{
+ vdev_t *pvd = vd->vdev_parent;
+ spa_t *spa __maybe_unused = vd->vdev_spa;
+
+ ASSERT(vd != NULL);
+ ASSERT(vd->vdev_open_thread == curthread ||
+ spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
+
+ /*
+ * If our parent is reopening, then we are as well, unless we are
+ * going offline.
+ */
+ if (pvd != NULL && pvd->vdev_reopening)
+ vd->vdev_reopening = (pvd->vdev_reopening && !vd->vdev_offline);
+
+ vd->vdev_ops->vdev_op_close(vd);
+
+ vdev_cache_purge(vd);
+
+ /*
+ * We record the previous state before we close it, so that if we are
+ * doing a reopen(), we don't generate FMA ereports if we notice that
+ * it's still faulted.
+ */
+ vd->vdev_prevstate = vd->vdev_state;
+
+ if (vd->vdev_offline)
+ vd->vdev_state = VDEV_STATE_OFFLINE;
+ else
+ vd->vdev_state = VDEV_STATE_CLOSED;
+ vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
+}
+
+void
+vdev_hold(vdev_t *vd)
+{
+ spa_t *spa = vd->vdev_spa;
+
+ ASSERT(spa_is_root(spa));
+ if (spa->spa_state == POOL_STATE_UNINITIALIZED)
+ return;
+
+ for (int c = 0; c < vd->vdev_children; c++)
+ vdev_hold(vd->vdev_child[c]);
+
+ if (vd->vdev_ops->vdev_op_leaf)
+ vd->vdev_ops->vdev_op_hold(vd);
+}
+
+void
+vdev_rele(vdev_t *vd)
+{
+ ASSERT(spa_is_root(vd->vdev_spa));
+ for (int c = 0; c < vd->vdev_children; c++)
+ vdev_rele(vd->vdev_child[c]);
+
+ if (vd->vdev_ops->vdev_op_leaf)
+ vd->vdev_ops->vdev_op_rele(vd);
+}
+
+/*
+ * Reopen all interior vdevs and any unopened leaves. We don't actually
+ * reopen leaf vdevs which had previously been opened as they might deadlock
+ * on the spa_config_lock. Instead we only obtain the leaf's physical size.
+ * If the leaf has never been opened then open it, as usual.
+ */
+void
+vdev_reopen(vdev_t *vd)
+{
+ spa_t *spa = vd->vdev_spa;
+
+ ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
+
+ /* set the reopening flag unless we're taking the vdev offline */
+ vd->vdev_reopening = !vd->vdev_offline;
+ vdev_close(vd);
+ (void) vdev_open(vd);
+
+ /*
+ * Call vdev_validate() here to make sure we have the same device.
+ * Otherwise, a device with an invalid label could be successfully
+ * opened in response to vdev_reopen().
+ */
+ if (vd->vdev_aux) {
+ (void) vdev_validate_aux(vd);
+ if (vdev_readable(vd) && vdev_writeable(vd) &&
+ vd->vdev_aux == &spa->spa_l2cache) {
+ /*
+ * In case the vdev is present we should evict all ARC
+ * buffers and pointers to log blocks and reclaim their
+ * space before restoring its contents to L2ARC.
+ */
+ if (l2arc_vdev_present(vd)) {
+ l2arc_rebuild_vdev(vd, B_TRUE);
+ } else {
+ l2arc_add_vdev(spa, vd);
+ }
+ spa_async_request(spa, SPA_ASYNC_L2CACHE_REBUILD);
+ spa_async_request(spa, SPA_ASYNC_L2CACHE_TRIM);
+ }
+ } else {
+ (void) vdev_validate(vd);
+ }
+
+ /*
+ * Reassess parent vdev's health.
+ */
+ vdev_propagate_state(vd);
+}
+
+int
+vdev_create(vdev_t *vd, uint64_t txg, boolean_t isreplacing)
+{
+ int error;
+
+ /*
+ * Normally, partial opens (e.g. of a mirror) are allowed.
+ * For a create, however, we want to fail the request if
+ * there are any components we can't open.
+ */
+ error = vdev_open(vd);
+
+ if (error || vd->vdev_state != VDEV_STATE_HEALTHY) {
+ vdev_close(vd);
+ return (error ? error : SET_ERROR(ENXIO));
+ }
+
+ /*
+ * Recursively load DTLs and initialize all labels.
+ */
+ if ((error = vdev_dtl_load(vd)) != 0 ||
+ (error = vdev_label_init(vd, txg, isreplacing ?
+ VDEV_LABEL_REPLACE : VDEV_LABEL_CREATE)) != 0) {
+ vdev_close(vd);
+ return (error);
+ }
+
+ return (0);
+}
+
+void
+vdev_metaslab_set_size(vdev_t *vd)
+{
+ uint64_t asize = vd->vdev_asize;
+ uint64_t ms_count = asize >> zfs_vdev_default_ms_shift;
+ uint64_t ms_shift;
+
+ /*
+ * There are two dimensions to the metaslab sizing calculation:
+ * the size of the metaslab and the count of metaslabs per vdev.
+ *
+ * The default values used below are a good balance between memory
+ * usage (larger metaslab size means more memory needed for loaded
+ * metaslabs; more metaslabs means more memory needed for the
+ * metaslab_t structs), metaslab load time (larger metaslabs take
+ * longer to load), and metaslab sync time (more metaslabs means
+ * more time spent syncing all of them).
+ *
+ * In general, we aim for zfs_vdev_default_ms_count (200) metaslabs.
+ * The range of the dimensions are as follows:
+ *
+ * 2^29 <= ms_size <= 2^34
+ * 16 <= ms_count <= 131,072
+ *
+ * On the lower end of vdev sizes, we aim for metaslab sizes of
+ * at least 512MB (2^29) to minimize fragmentation effects when
+ * testing with smaller devices. However, the count constraint
+ * of at least 16 metaslabs will override this minimum size goal.
+ *
+ * On the upper end of vdev sizes, we aim for a maximum metaslab
+ * size of 16GB. However, we will cap the total count to 2^17
+ * metaslabs to keep our memory footprint in check and let the
+ * metaslab size grow from there if that limit is hit.
+ *
+ * The net effect of applying the above constraints is summarized below.
+ *
+ * vdev size metaslab count
+ * --------------|-----------------
+ * < 8GB ~16
+ * 8GB - 100GB one per 512MB
+ * 100GB - 3TB ~200
+ * 3TB - 2PB one per 16GB
+ * > 2PB ~131,072
+ * --------------------------------
+ *
+ * Finally, note that all of the above calculations determine the initial
+ * number of metaslabs. Expanding a top-level vdev will result
+ * in additional metaslabs being allocated, making it possible
+ * to exceed the zfs_vdev_ms_count_limit.
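+ *
+ * As a worked example, a 100GB vdev falls in the middle band above:
+ * with the default 512MB (2^29) metaslab size it gets
+ * 100GB >> 29 = 200 metaslabs, which is exactly the
+ * zfs_vdev_default_ms_count target.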
+ */
+
+ if (ms_count < zfs_vdev_min_ms_count)
+ ms_shift = highbit64(asize / zfs_vdev_min_ms_count);
+ else if (ms_count > zfs_vdev_default_ms_count)
+ ms_shift = highbit64(asize / zfs_vdev_default_ms_count);
+ else
+ ms_shift = zfs_vdev_default_ms_shift;
+
+ if (ms_shift < SPA_MAXBLOCKSHIFT) {
+ ms_shift = SPA_MAXBLOCKSHIFT;
+ } else if (ms_shift > zfs_vdev_max_ms_shift) {
+ ms_shift = zfs_vdev_max_ms_shift;
+ /* cap the total count to constrain memory footprint */
+ if ((asize >> ms_shift) > zfs_vdev_ms_count_limit)
+ ms_shift = highbit64(asize / zfs_vdev_ms_count_limit);
+ }
+
+ vd->vdev_ms_shift = ms_shift;
+ ASSERT3U(vd->vdev_ms_shift, >=, SPA_MAXBLOCKSHIFT);
+}
+
+void
+vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg)
+{
+ ASSERT(vd == vd->vdev_top);
+ /* indirect vdevs don't have metaslabs or dtls */
+ ASSERT(vdev_is_concrete(vd) || flags == 0);
+ ASSERT(ISP2(flags));
+ ASSERT(spa_writeable(vd->vdev_spa));
+
+ if (flags & VDD_METASLAB)
+ (void) txg_list_add(&vd->vdev_ms_list, arg, txg);
+
+ if (flags & VDD_DTL)
+ (void) txg_list_add(&vd->vdev_dtl_list, arg, txg);
+
+ (void) txg_list_add(&vd->vdev_spa->spa_vdev_txg_list, vd, txg);
+}
+
+void
+vdev_dirty_leaves(vdev_t *vd, int flags, uint64_t txg)
+{
+ for (int c = 0; c < vd->vdev_children; c++)
+ vdev_dirty_leaves(vd->vdev_child[c], flags, txg);
+
+ if (vd->vdev_ops->vdev_op_leaf)
+ vdev_dirty(vd->vdev_top, flags, vd, txg);
+}
+
+/*
+ * DTLs.
+ *
+ * A vdev's DTL (dirty time log) is the set of transaction groups for which
+ * the vdev has less than perfect replication. There are four kinds of DTL:
+ *
+ * DTL_MISSING: txgs for which the vdev has no valid copies of the data
+ *
+ * DTL_PARTIAL: txgs for which data is available, but not fully replicated
+ *
+ * DTL_SCRUB: the txgs that could not be repaired by the last scrub; upon
+ * scrub completion, DTL_SCRUB replaces DTL_MISSING in the range of
+ * txgs that was scrubbed.
+ *
+ * DTL_OUTAGE: txgs which cannot currently be read, whether due to
+ * persistent errors or just some device being offline.
+ * Unlike the other three, the DTL_OUTAGE map is not generally
+ * maintained; it's only computed when needed, typically to
+ * determine whether a device can be detached.
+ *
+ * For leaf vdevs, DTL_MISSING and DTL_PARTIAL are identical: the device
+ * either has the data or it doesn't.
+ *
+ * For interior vdevs such as mirror and RAID-Z the picture is more complex.
+ * A vdev's DTL_PARTIAL is the union of its children's DTL_PARTIALs, because
+ * if any child is less than fully replicated, then so is its parent.
+ * A vdev's DTL_MISSING is a modified union of its children's DTL_MISSINGs,
+ * comprising only those txgs which appear in more than 'maxfaults' children;
+ * those are the txgs we don't have enough replication to read. For example,
+ * double-parity RAID-Z can tolerate up to two missing devices (maxfaults == 2);
+ * thus, its DTL_MISSING consists of the set of txgs that appear in more than
+ * two child DTL_MISSING maps.
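+ *
+ * As a concrete illustration, consider a two-way mirror: a txg missing
+ * from just one child appears in the mirror's DTL_PARTIAL (it is not
+ * fully replicated) but not in its DTL_MISSING, since the other child
+ * can still supply the data; only a txg missing from both children
+ * lands in the mirror's DTL_MISSING.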
+ *
+ * It should be clear from the above that to compute the DTLs and outage maps
+ * for all vdevs, it suffices to know just the leaf vdevs' DTL_MISSING maps.
+ * Therefore, that is all we keep on disk. When loading the pool, or after
+ * a configuration change, we generate all other DTLs from first principles.
+ */
+void
+vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size)
+{
+ range_tree_t *rt = vd->vdev_dtl[t];
+
+ ASSERT(t < DTL_TYPES);
+ ASSERT(vd != vd->vdev_spa->spa_root_vdev);
+ ASSERT(spa_writeable(vd->vdev_spa));
+
+ mutex_enter(&vd->vdev_dtl_lock);
+ if (!range_tree_contains(rt, txg, size))
+ range_tree_add(rt, txg, size);
+ mutex_exit(&vd->vdev_dtl_lock);
+}
+
+boolean_t
+vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size)
+{
+ range_tree_t *rt = vd->vdev_dtl[t];
+ boolean_t dirty = B_FALSE;
+
+ ASSERT(t < DTL_TYPES);
+ ASSERT(vd != vd->vdev_spa->spa_root_vdev);
+
+ /*
+ * While we are loading the pool, the DTLs have not been loaded yet.
+ * This isn't a problem, but it can result in devices being tried
+ * that are known not to have the data. In that case, the import
+ * relies on the checksum to ensure that we get the right data.
+ * Note that while importing we are only reading the MOS, which is
+ * always checksummed.
+ */
+ mutex_enter(&vd->vdev_dtl_lock);
+ if (!range_tree_is_empty(rt))
+ dirty = range_tree_contains(rt, txg, size);
+ mutex_exit(&vd->vdev_dtl_lock);
+
+ return (dirty);
+}
+
+boolean_t
+vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t)
+{
+ range_tree_t *rt = vd->vdev_dtl[t];
+ boolean_t empty;
+
+ mutex_enter(&vd->vdev_dtl_lock);
+ empty = range_tree_is_empty(rt);
+ mutex_exit(&vd->vdev_dtl_lock);
+
+ return (empty);
+}
+
+/*
+ * Check if the txg falls within the range which must be
+ * resilvered. DVAs outside this range can always be skipped.
+ */
+boolean_t
+vdev_default_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize,
+ uint64_t phys_birth)
+{
+ /* Set by sequential resilver. */
+ if (phys_birth == TXG_UNKNOWN)
+ return (B_TRUE);
+
+ return (vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1));
+}
+
+/*
+ * Returns B_TRUE if the vdev determines the DVA needs to be resilvered.
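+ * Leaf vdevs, and vdev types that do not provide a need_resilver
+ * callback, conservatively report B_TRUE; other vdevs defer to their
+ * vdev_op_need_resilver method.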
+ */
+boolean_t
+vdev_dtl_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize,
+ uint64_t phys_birth)
+{
+ ASSERT(vd != vd->vdev_spa->spa_root_vdev);
+
+ if (vd->vdev_ops->vdev_op_need_resilver == NULL ||
+ vd->vdev_ops->vdev_op_leaf)
+ return (B_TRUE);
+
+ return (vd->vdev_ops->vdev_op_need_resilver(vd, dva, psize,
+ phys_birth));
+}
+
+/*
+ * Returns the lowest txg in the DTL range.
+ */
+static uint64_t
+vdev_dtl_min(vdev_t *vd)
+{
+ ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock));
+ ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0);
+ ASSERT0(vd->vdev_children);
+
+ return (range_tree_min(vd->vdev_dtl[DTL_MISSING]) - 1);
+}
+
+/*
+ * Returns the highest txg in the DTL.
+ */
+static uint64_t
+vdev_dtl_max(vdev_t *vd)
+{
+ ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock));
+ ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0);
+ ASSERT0(vd->vdev_children);
+
+ return (range_tree_max(vd->vdev_dtl[DTL_MISSING]));
+}
+
+/*
+ * Determine if a resilvering vdev should remove any DTL entries from
+ * its range. If the vdev was resilvering for the entire duration of the
+ * scan then it should excise that range from its DTLs. Otherwise, this
+ * vdev is considered partially resilvered and should leave its DTL
+ * entries intact. The comment in vdev_dtl_reassess() describes how we
+ * excise the DTLs.
+ */
+static boolean_t
+vdev_dtl_should_excise(vdev_t *vd, boolean_t rebuild_done)
+{
+ ASSERT0(vd->vdev_children);
+
+ if (vd->vdev_state < VDEV_STATE_DEGRADED)
+ return (B_FALSE);
+
+ if (vd->vdev_resilver_deferred)
+ return (B_FALSE);
+
+ if (range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]))
+ return (B_TRUE);
+
+ if (rebuild_done) {
+ vdev_rebuild_t *vr = &vd->vdev_top->vdev_rebuild_config;
+ vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
+
+ /* Rebuild not initiated by attach */
+ if (vd->vdev_rebuild_txg == 0)
+ return (B_TRUE);
+
+ /*
+	 * When a rebuild completes without error, all missing data
+ * up to the rebuild max txg has been reconstructed and the DTL
+ * is eligible for excision.
+ */
+ if (vrp->vrp_rebuild_state == VDEV_REBUILD_COMPLETE &&
+ vdev_dtl_max(vd) <= vrp->vrp_max_txg) {
+ ASSERT3U(vrp->vrp_min_txg, <=, vdev_dtl_min(vd));
+ ASSERT3U(vrp->vrp_min_txg, <, vd->vdev_rebuild_txg);
+ ASSERT3U(vd->vdev_rebuild_txg, <=, vrp->vrp_max_txg);
+ return (B_TRUE);
+ }
+ } else {
+ dsl_scan_t *scn = vd->vdev_spa->spa_dsl_pool->dp_scan;
+ dsl_scan_phys_t *scnp __maybe_unused = &scn->scn_phys;
+
+ /* Resilver not initiated by attach */
+ if (vd->vdev_resilver_txg == 0)
+ return (B_TRUE);
+
+ /*
+	 * When a resilver is initiated, the scan assigns scn_max_txg the
+	 * highest txg value that exists in all DTLs. If this device's
+	 * max DTL is not part of this scan (i.e. it is not in the range
+	 * (scn_min_txg, scn_max_txg]), then it is not eligible for
+	 * excision.
+ */
+ if (vdev_dtl_max(vd) <= scn->scn_phys.scn_max_txg) {
+ ASSERT3U(scnp->scn_min_txg, <=, vdev_dtl_min(vd));
+ ASSERT3U(scnp->scn_min_txg, <, vd->vdev_resilver_txg);
+ ASSERT3U(vd->vdev_resilver_txg, <=, scnp->scn_max_txg);
+ return (B_TRUE);
+ }
+ }
+
+ return (B_FALSE);
+}
+
+/*
+ * Reassess DTLs after a config change or scrub completion. If txg == 0 no
+ * write operations will be issued to the pool.
+ */
+void
+vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg,
+ boolean_t scrub_done, boolean_t rebuild_done)
+{
+ spa_t *spa = vd->vdev_spa;
+ avl_tree_t reftree;
+ int minref;
+
+ ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
+
+ for (int c = 0; c < vd->vdev_children; c++)
+ vdev_dtl_reassess(vd->vdev_child[c], txg,
+ scrub_txg, scrub_done, rebuild_done);
+
+ if (vd == spa->spa_root_vdev || !vdev_is_concrete(vd) || vd->vdev_aux)
+ return;
+
+ if (vd->vdev_ops->vdev_op_leaf) {
+ dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;
+ vdev_rebuild_t *vr = &vd->vdev_top->vdev_rebuild_config;
+ boolean_t check_excise = B_FALSE;
+ boolean_t wasempty = B_TRUE;
+
+ mutex_enter(&vd->vdev_dtl_lock);
+
+ /*
+ * If requested, pretend the scan or rebuild completed cleanly.
+ */
+ if (zfs_scan_ignore_errors) {
+ if (scn != NULL)
+ scn->scn_phys.scn_errors = 0;
+ if (vr != NULL)
+ vr->vr_rebuild_phys.vrp_errors = 0;
+ }
+
+ if (scrub_txg != 0 &&
+ !range_tree_is_empty(vd->vdev_dtl[DTL_MISSING])) {
+ wasempty = B_FALSE;
+ zfs_dbgmsg("guid:%llu txg:%llu scrub:%llu started:%d "
+ "dtl:%llu/%llu errors:%llu",
+ (u_longlong_t)vd->vdev_guid, (u_longlong_t)txg,
+ (u_longlong_t)scrub_txg, spa->spa_scrub_started,
+ (u_longlong_t)vdev_dtl_min(vd),
+ (u_longlong_t)vdev_dtl_max(vd),
+ (u_longlong_t)(scn ? scn->scn_phys.scn_errors : 0));
+ }
+
+ /*
+ * If we've completed a scrub/resilver or a rebuild cleanly
+ * then determine if this vdev should remove any DTLs. We
+ * only want to excise regions on vdevs that were available
+ * during the entire duration of this scan.
+ */
+ if (rebuild_done &&
+ vr != NULL && vr->vr_rebuild_phys.vrp_errors == 0) {
+ check_excise = B_TRUE;
+ } else {
+ if (spa->spa_scrub_started ||
+ (scn != NULL && scn->scn_phys.scn_errors == 0)) {
+ check_excise = B_TRUE;
+ }
+ }
+
+ if (scrub_txg && check_excise &&
+ vdev_dtl_should_excise(vd, rebuild_done)) {
+ /*
+ * We completed a scrub, resilver or rebuild up to
+ * scrub_txg. If we did it without rebooting, then
+ * the scrub dtl will be valid, so excise the old
+ * region and fold in the scrub dtl. Otherwise,
+ * leave the dtl as-is if there was an error.
+ *
+	 * There's a little trick here: to excise the beginning
+ * of the DTL_MISSING map, we put it into a reference
+ * tree and then add a segment with refcnt -1 that
+ * covers the range [0, scrub_txg). This means
+ * that each txg in that range has refcnt -1 or 0.
+ * We then add DTL_SCRUB with a refcnt of 2, so that
+ * entries in the range [0, scrub_txg) will have a
+ * positive refcnt -- either 1 or 2. We then convert
+ * the reference tree into the new DTL_MISSING map.
+ */
+ space_reftree_create(&reftree);
+ space_reftree_add_map(&reftree,
+ vd->vdev_dtl[DTL_MISSING], 1);
+ space_reftree_add_seg(&reftree, 0, scrub_txg, -1);
+ space_reftree_add_map(&reftree,
+ vd->vdev_dtl[DTL_SCRUB], 2);
+ space_reftree_generate_map(&reftree,
+ vd->vdev_dtl[DTL_MISSING], 1);
+ space_reftree_destroy(&reftree);
+
+ if (!range_tree_is_empty(vd->vdev_dtl[DTL_MISSING])) {
+ zfs_dbgmsg("update DTL_MISSING:%llu/%llu",
+ (u_longlong_t)vdev_dtl_min(vd),
+ (u_longlong_t)vdev_dtl_max(vd));
+ } else if (!wasempty) {
+ zfs_dbgmsg("DTL_MISSING is now empty");
+ }
+ }
+ range_tree_vacate(vd->vdev_dtl[DTL_PARTIAL], NULL, NULL);
+ range_tree_walk(vd->vdev_dtl[DTL_MISSING],
+ range_tree_add, vd->vdev_dtl[DTL_PARTIAL]);
+ if (scrub_done)
+ range_tree_vacate(vd->vdev_dtl[DTL_SCRUB], NULL, NULL);
+ range_tree_vacate(vd->vdev_dtl[DTL_OUTAGE], NULL, NULL);
+ if (!vdev_readable(vd))
+ range_tree_add(vd->vdev_dtl[DTL_OUTAGE], 0, -1ULL);
+ else
+ range_tree_walk(vd->vdev_dtl[DTL_MISSING],
+ range_tree_add, vd->vdev_dtl[DTL_OUTAGE]);
+
+ /*
+ * If the vdev was resilvering or rebuilding and no longer
+ * has any DTLs then reset the appropriate flag and dirty
+ * the top level so that we persist the change.
+ */
+ if (txg != 0 &&
+ range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]) &&
+ range_tree_is_empty(vd->vdev_dtl[DTL_OUTAGE])) {
+ if (vd->vdev_rebuild_txg != 0) {
+ vd->vdev_rebuild_txg = 0;
+ vdev_config_dirty(vd->vdev_top);
+ } else if (vd->vdev_resilver_txg != 0) {
+ vd->vdev_resilver_txg = 0;
+ vdev_config_dirty(vd->vdev_top);
+ }
+ }
+
+ mutex_exit(&vd->vdev_dtl_lock);
+
+ if (txg != 0)
+ vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg);
+ return;
+ }
+
+ mutex_enter(&vd->vdev_dtl_lock);
+ for (int t = 0; t < DTL_TYPES; t++) {
+ /* account for child's outage in parent's missing map */
+		int s = (t == DTL_MISSING) ? DTL_OUTAGE : t;
+ if (t == DTL_SCRUB)
+ continue; /* leaf vdevs only */
+ if (t == DTL_PARTIAL)
+ minref = 1; /* i.e. non-zero */
+ else if (vdev_get_nparity(vd) != 0)
+ minref = vdev_get_nparity(vd) + 1; /* RAID-Z, dRAID */
+ else
+ minref = vd->vdev_children; /* any kind of mirror */
+ space_reftree_create(&reftree);
+ for (int c = 0; c < vd->vdev_children; c++) {
+ vdev_t *cvd = vd->vdev_child[c];
+ mutex_enter(&cvd->vdev_dtl_lock);
+ space_reftree_add_map(&reftree, cvd->vdev_dtl[s], 1);
+ mutex_exit(&cvd->vdev_dtl_lock);
+ }
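+		/*
+		 * Keep only those txgs referenced by at least 'minref'
+		 * children; they become this vdev's DTL of type 't'.
+		 */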
+ space_reftree_generate_map(&reftree, vd->vdev_dtl[t], minref);
+ space_reftree_destroy(&reftree);
+ }
+ mutex_exit(&vd->vdev_dtl_lock);
+}
+
+int
+vdev_dtl_load(vdev_t *vd)
+{
+ spa_t *spa = vd->vdev_spa;
+ objset_t *mos = spa->spa_meta_objset;
+ range_tree_t *rt;
+ int error = 0;
+
+ if (vd->vdev_ops->vdev_op_leaf && vd->vdev_dtl_object != 0) {
+ ASSERT(vdev_is_concrete(vd));
+
+ error = space_map_open(&vd->vdev_dtl_sm, mos,
+ vd->vdev_dtl_object, 0, -1ULL, 0);
+ if (error)
+ return (error);
+ ASSERT(vd->vdev_dtl_sm != NULL);
+
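+		/*
+		 * Load the on-disk space map into a temporary range tree,
+		 * then merge its contents into DTL_MISSING under the DTL lock.
+		 */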
+ rt = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0);
+ error = space_map_load(vd->vdev_dtl_sm, rt, SM_ALLOC);
+ if (error == 0) {
+ mutex_enter(&vd->vdev_dtl_lock);
+ range_tree_walk(rt, range_tree_add,
+ vd->vdev_dtl[DTL_MISSING]);
+ mutex_exit(&vd->vdev_dtl_lock);
+ }
+
+ range_tree_vacate(rt, NULL, NULL);
+ range_tree_destroy(rt);
+
+ return (error);
+ }
+
+ for (int c = 0; c < vd->vdev_children; c++) {
+ error = vdev_dtl_load(vd->vdev_child[c]);
+ if (error != 0)
+ break;
+ }
+
+ return (error);
+}
+
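+/*
+ * Record this vdev's allocation bias (log, special, or dedup) in its
+ * top-level ZAP.  For special and dedup vdevs this also activates the
+ * allocation classes feature.
+ */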
+static void
+vdev_zap_allocation_data(vdev_t *vd, dmu_tx_t *tx)
+{
+ spa_t *spa = vd->vdev_spa;
+ objset_t *mos = spa->spa_meta_objset;
+ vdev_alloc_bias_t alloc_bias = vd->vdev_alloc_bias;
+ const char *string;
+
+ ASSERT(alloc_bias != VDEV_BIAS_NONE);
+
+ string =
+ (alloc_bias == VDEV_BIAS_LOG) ? VDEV_ALLOC_BIAS_LOG :
+ (alloc_bias == VDEV_BIAS_SPECIAL) ? VDEV_ALLOC_BIAS_SPECIAL :
+ (alloc_bias == VDEV_BIAS_DEDUP) ? VDEV_ALLOC_BIAS_DEDUP : NULL;
+
+ ASSERT(string != NULL);
+ VERIFY0(zap_add(mos, vd->vdev_top_zap, VDEV_TOP_ZAP_ALLOCATION_BIAS,
+ 1, strlen(string) + 1, string, tx));
+
+ if (alloc_bias == VDEV_BIAS_SPECIAL || alloc_bias == VDEV_BIAS_DEDUP) {
+ spa_activate_allocation_classes(spa, tx);
+ }
+}
+
+void
+vdev_destroy_unlink_zap(vdev_t *vd, uint64_t zapobj, dmu_tx_t *tx)
+{
+ spa_t *spa = vd->vdev_spa;
+
+ VERIFY0(zap_destroy(spa->spa_meta_objset, zapobj, tx));
+ VERIFY0(zap_remove_int(spa->spa_meta_objset, spa->spa_all_vdev_zaps,
+ zapobj, tx));
+}
+
+uint64_t
+vdev_create_link_zap(vdev_t *vd, dmu_tx_t *tx)
+{
+ spa_t *spa = vd->vdev_spa;
+ uint64_t zap = zap_create(spa->spa_meta_objset, DMU_OTN_ZAP_METADATA,
+ DMU_OT_NONE, 0, tx);
+
+ ASSERT(zap != 0);
+ VERIFY0(zap_add_int(spa->spa_meta_objset, spa->spa_all_vdev_zaps,
+ zap, tx));
+
+ return (zap);
+}
+
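+/*
+ * Recursively create any missing leaf and top-level ZAPs in this vdev
+ * subtree.  ZAPs are not created for hole, missing, or root vdevs, nor
+ * for vdevs whose top-level vdev is being removed.
+ */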
+void
+vdev_construct_zaps(vdev_t *vd, dmu_tx_t *tx)
+{
+ if (vd->vdev_ops != &vdev_hole_ops &&
+ vd->vdev_ops != &vdev_missing_ops &&
+ vd->vdev_ops != &vdev_root_ops &&
+ !vd->vdev_top->vdev_removing) {
+ if (vd->vdev_ops->vdev_op_leaf && vd->vdev_leaf_zap == 0) {
+ vd->vdev_leaf_zap = vdev_create_link_zap(vd, tx);
+ }
+ if (vd == vd->vdev_top && vd->vdev_top_zap == 0) {
+ vd->vdev_top_zap = vdev_create_link_zap(vd, tx);
+ if (vd->vdev_alloc_bias != VDEV_BIAS_NONE)
+ vdev_zap_allocation_data(vd, tx);
+ }
+ }
+
+ for (uint64_t i = 0; i < vd->vdev_children; i++) {
+ vdev_construct_zaps(vd->vdev_child[i], tx);
+ }
+}
+
+static void
+vdev_dtl_sync(vdev_t *vd, uint64_t txg)
+{
+ spa_t *spa = vd->vdev_spa;
+ range_tree_t *rt = vd->vdev_dtl[DTL_MISSING];
+ objset_t *mos = spa->spa_meta_objset;
+ range_tree_t *rtsync;
+ dmu_tx_t *tx;
+ uint64_t object = space_map_object(vd->vdev_dtl_sm);
+
+ ASSERT(vdev_is_concrete(vd));
+ ASSERT(vd->vdev_ops->vdev_op_leaf);
+
+ tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
+
+ if (vd->vdev_detached || vd->vdev_top->vdev_removing) {
+ mutex_enter(&vd->vdev_dtl_lock);
+ space_map_free(vd->vdev_dtl_sm, tx);
+ space_map_close(vd->vdev_dtl_sm);
+ vd->vdev_dtl_sm = NULL;
+ mutex_exit(&vd->vdev_dtl_lock);
+
+ /*
+ * We only destroy the leaf ZAP for detached leaves or for
+ * removed log devices. Removed data devices handle leaf ZAP
+ * cleanup later, once cancellation is no longer possible.
+ */
+ if (vd->vdev_leaf_zap != 0 && (vd->vdev_detached ||
+ vd->vdev_top->vdev_islog)) {
+ vdev_destroy_unlink_zap(vd, vd->vdev_leaf_zap, tx);
+ vd->vdev_leaf_zap = 0;
+ }
+
+ dmu_tx_commit(tx);
+ return;
+ }
+
+ if (vd->vdev_dtl_sm == NULL) {
+ uint64_t new_object;
+
+ new_object = space_map_alloc(mos, zfs_vdev_dtl_sm_blksz, tx);
+ VERIFY3U(new_object, !=, 0);
+
+ VERIFY0(space_map_open(&vd->vdev_dtl_sm, mos, new_object,
+ 0, -1ULL, 0));
+ ASSERT(vd->vdev_dtl_sm != NULL);
+ }
+
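+	/*
+	 * Snapshot the current DTL_MISSING contents into a private range
+	 * tree under the DTL lock, then write the copy to the space map
+	 * without holding the lock.
+	 */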
+ rtsync = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0);
+
+ mutex_enter(&vd->vdev_dtl_lock);
+ range_tree_walk(rt, range_tree_add, rtsync);
+ mutex_exit(&vd->vdev_dtl_lock);
+
+ space_map_truncate(vd->vdev_dtl_sm, zfs_vdev_dtl_sm_blksz, tx);
+ space_map_write(vd->vdev_dtl_sm, rtsync, SM_ALLOC, SM_NO_VDEVID, tx);
+ range_tree_vacate(rtsync, NULL, NULL);
+
+ range_tree_destroy(rtsync);
+
+ /*
+ * If the object for the space map has changed then dirty
+ * the top level so that we update the config.
+ */
+ if (object != space_map_object(vd->vdev_dtl_sm)) {
+ vdev_dbgmsg(vd, "txg %llu, spa %s, DTL old object %llu, "
+ "new object %llu", (u_longlong_t)txg, spa_name(spa),
+ (u_longlong_t)object,
+ (u_longlong_t)space_map_object(vd->vdev_dtl_sm));
+ vdev_config_dirty(vd->vdev_top);
+ }
+
+ dmu_tx_commit(tx);
+}
+
+/*
+ * Determine whether the specified vdev can be offlined/detached/removed
+ * without losing data.
+ */
+boolean_t
+vdev_dtl_required(vdev_t *vd)
+{
+ spa_t *spa = vd->vdev_spa;
+ vdev_t *tvd = vd->vdev_top;
+ uint8_t cant_read = vd->vdev_cant_read;
+ boolean_t required;
+
+ ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
+
+ if (vd == spa->spa_root_vdev || vd == tvd)
+ return (B_TRUE);
+
+ /*
+ * Temporarily mark the device as unreadable, and then determine
+ * whether this results in any DTL outages in the top-level vdev.
+ * If not, we can safely offline/detach/remove the device.
+ */
+ vd->vdev_cant_read = B_TRUE;
+ vdev_dtl_reassess(tvd, 0, 0, B_FALSE, B_FALSE);
+ required = !vdev_dtl_empty(tvd, DTL_OUTAGE);
+ vd->vdev_cant_read = cant_read;
+ vdev_dtl_reassess(tvd, 0, 0, B_FALSE, B_FALSE);
+
+ if (!required && zio_injection_enabled) {
+ required = !!zio_handle_device_injection(vd, NULL,
+ SET_ERROR(ECHILD));
+ }
+
+ return (required);
+}
+
+/*
+ * Determine if resilver is needed, and if so the txg range.
+ */
+boolean_t
+vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp)
+{
+ boolean_t needed = B_FALSE;
+ uint64_t thismin = UINT64_MAX;
+ uint64_t thismax = 0;
+
+ if (vd->vdev_children == 0) {
+ mutex_enter(&vd->vdev_dtl_lock);
+ if (!range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]) &&
+ vdev_writeable(vd)) {
+
+ thismin = vdev_dtl_min(vd);
+ thismax = vdev_dtl_max(vd);
+ needed = B_TRUE;
+ }
+ mutex_exit(&vd->vdev_dtl_lock);
+ } else {
+ for (int c = 0; c < vd->vdev_children; c++) {
+ vdev_t *cvd = vd->vdev_child[c];
+ uint64_t cmin, cmax;
+
+ if (vdev_resilver_needed(cvd, &cmin, &cmax)) {
+ thismin = MIN(thismin, cmin);
+ thismax = MAX(thismax, cmax);
+ needed = B_TRUE;
+ }
+ }
+ }
+
+ if (needed && minp) {
+ *minp = thismin;
+ *maxp = thismax;
+ }
+ return (needed);
+}
+
+/*
+ * Gets the checkpoint space map object from the vdev's ZAP. On success sm_obj
+ * will contain either the checkpoint spacemap object or zero if none exists.
+ * All other errors are returned to the caller.
+ */
+int
+vdev_checkpoint_sm_object(vdev_t *vd, uint64_t *sm_obj)
+{
+ ASSERT0(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER));
+
+ if (vd->vdev_top_zap == 0) {
+ *sm_obj = 0;
+ return (0);
+ }
+
+ int error = zap_lookup(spa_meta_objset(vd->vdev_spa), vd->vdev_top_zap,
+ VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, sizeof (uint64_t), 1, sm_obj);
+ if (error == ENOENT) {
+ *sm_obj = 0;
+ error = 0;
+ }
+
+ return (error);
+}
+
+int
+vdev_load(vdev_t *vd)
+{
+ int children = vd->vdev_children;
+ int error = 0;
+ taskq_t *tq = NULL;
+
+ /*
+ * It's only worthwhile to use the taskq for the root vdev, because the
+ * slow part is metaslab_init, and that only happens for top-level
+ * vdevs.
+ */
+ if (vd->vdev_ops == &vdev_root_ops && vd->vdev_children > 0) {
+ tq = taskq_create("vdev_load", children, minclsyspri,
+ children, children, TASKQ_PREPOPULATE);
+ }
+
+ /*
+ * Recursively load all children.
+ */
+ for (int c = 0; c < vd->vdev_children; c++) {
+ vdev_t *cvd = vd->vdev_child[c];
+
+ if (tq == NULL || vdev_uses_zvols(cvd)) {
+ cvd->vdev_load_error = vdev_load(cvd);
+ } else {
+ VERIFY(taskq_dispatch(tq, vdev_load_child,
+ cvd, TQ_SLEEP) != TASKQID_INVALID);
+ }
+ }
+
+ if (tq != NULL) {
+ taskq_wait(tq);
+ taskq_destroy(tq);
+ }
+
+ for (int c = 0; c < vd->vdev_children; c++) {
+ int error = vd->vdev_child[c]->vdev_load_error;
+
+ if (error != 0)
+ return (error);
+ }
+
+ vdev_set_deflate_ratio(vd);
+
+ /*
+	 * On the spa_load path, grab the allocation bias from our ZAP.
+ */
+ if (vd == vd->vdev_top && vd->vdev_top_zap != 0) {
+ spa_t *spa = vd->vdev_spa;
+ char bias_str[64];
+
+ error = zap_lookup(spa->spa_meta_objset, vd->vdev_top_zap,
+ VDEV_TOP_ZAP_ALLOCATION_BIAS, 1, sizeof (bias_str),
+ bias_str);
+ if (error == 0) {
+ ASSERT(vd->vdev_alloc_bias == VDEV_BIAS_NONE);
+ vd->vdev_alloc_bias = vdev_derive_alloc_bias(bias_str);
+ } else if (error != ENOENT) {
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ vdev_dbgmsg(vd, "vdev_load: zap_lookup(top_zap=%llu) "
+ "failed [error=%d]", vd->vdev_top_zap, error);
+ return (error);
+ }
+ }
+
+ /*
+ * Load any rebuild state from the top-level vdev zap.
+ */
+ if (vd == vd->vdev_top && vd->vdev_top_zap != 0) {
+ error = vdev_rebuild_load(vd);
+ if (error && error != ENOTSUP) {
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ vdev_dbgmsg(vd, "vdev_load: vdev_rebuild_load "
+ "failed [error=%d]", error);
+ return (error);
+ }
+ }
+
+ /*
+ * If this is a top-level vdev, initialize its metaslabs.
+ */
+ if (vd == vd->vdev_top && vdev_is_concrete(vd)) {
+ vdev_metaslab_group_create(vd);
+
+ if (vd->vdev_ashift == 0 || vd->vdev_asize == 0) {
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ vdev_dbgmsg(vd, "vdev_load: invalid size. ashift=%llu, "
+ "asize=%llu", (u_longlong_t)vd->vdev_ashift,
+ (u_longlong_t)vd->vdev_asize);
+ return (SET_ERROR(ENXIO));
+ }
+
+ error = vdev_metaslab_init(vd, 0);
+ if (error != 0) {
+ vdev_dbgmsg(vd, "vdev_load: metaslab_init failed "
+ "[error=%d]", error);
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ return (error);
+ }
+
+ uint64_t checkpoint_sm_obj;
+ error = vdev_checkpoint_sm_object(vd, &checkpoint_sm_obj);
+ if (error == 0 && checkpoint_sm_obj != 0) {
+ objset_t *mos = spa_meta_objset(vd->vdev_spa);
+ ASSERT(vd->vdev_asize != 0);
+ ASSERT3P(vd->vdev_checkpoint_sm, ==, NULL);
+
+ error = space_map_open(&vd->vdev_checkpoint_sm,
+ mos, checkpoint_sm_obj, 0, vd->vdev_asize,
+ vd->vdev_ashift);
+ if (error != 0) {
+ vdev_dbgmsg(vd, "vdev_load: space_map_open "
+ "failed for checkpoint spacemap (obj %llu) "
+ "[error=%d]",
+ (u_longlong_t)checkpoint_sm_obj, error);
+ return (error);
+ }
+ ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL);
+
+ /*
+ * Since the checkpoint_sm contains free entries
+			 * exclusively, we can use space_map_allocated() to
+ * indicate the cumulative checkpointed space that
+ * has been freed.
+ */
+ vd->vdev_stat.vs_checkpoint_space =
+ -space_map_allocated(vd->vdev_checkpoint_sm);
+ vd->vdev_spa->spa_checkpoint_info.sci_dspace +=
+ vd->vdev_stat.vs_checkpoint_space;
+ } else if (error != 0) {
+ vdev_dbgmsg(vd, "vdev_load: failed to retrieve "
+ "checkpoint space map object from vdev ZAP "
+ "[error=%d]", error);
+ return (error);
+ }
+ }
+
+ /*
+ * If this is a leaf vdev, load its DTL.
+ */
+ if (vd->vdev_ops->vdev_op_leaf && (error = vdev_dtl_load(vd)) != 0) {
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ vdev_dbgmsg(vd, "vdev_load: vdev_dtl_load failed "
+ "[error=%d]", error);
+ return (error);
+ }
+
+ uint64_t obsolete_sm_object;
+ error = vdev_obsolete_sm_object(vd, &obsolete_sm_object);
+ if (error == 0 && obsolete_sm_object != 0) {
+ objset_t *mos = vd->vdev_spa->spa_meta_objset;
+ ASSERT(vd->vdev_asize != 0);
+ ASSERT3P(vd->vdev_obsolete_sm, ==, NULL);
+
+ if ((error = space_map_open(&vd->vdev_obsolete_sm, mos,
+ obsolete_sm_object, 0, vd->vdev_asize, 0))) {
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ vdev_dbgmsg(vd, "vdev_load: space_map_open failed for "
+ "obsolete spacemap (obj %llu) [error=%d]",
+ (u_longlong_t)obsolete_sm_object, error);
+ return (error);
+ }
+ } else if (error != 0) {
+ vdev_dbgmsg(vd, "vdev_load: failed to retrieve obsolete "
+ "space map object from vdev ZAP [error=%d]", error);
+ return (error);
+ }
+
+ return (0);
+}
+
+/*
+ * The special vdev case is used for hot spares and l2cache devices. Its
+ * sole purpose is to set the vdev state for the associated vdev. To do this,
+ * we make sure that we can open the underlying device, then try to read the
+ * label, and make sure that the label is sane and that it hasn't been
+ * repurposed to another pool.
+ */
+int
+vdev_validate_aux(vdev_t *vd)
+{
+ nvlist_t *label;
+ uint64_t guid, version;
+ uint64_t state;
+
+ if (!vdev_readable(vd))
+ return (0);
+
+ if ((label = vdev_label_read_config(vd, -1ULL)) == NULL) {
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ return (-1);
+ }
+
+ if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_VERSION, &version) != 0 ||
+ !SPA_VERSION_IS_SUPPORTED(version) ||
+ nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0 ||
+ guid != vd->vdev_guid ||
+ nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state) != 0) {
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ nvlist_free(label);
+ return (-1);
+ }
+
+ /*
+ * We don't actually check the pool state here. If it's in fact in
+ * use by another pool, we update this fact on the fly when requested.
+ */
+ nvlist_free(label);
+ return (0);
+}
+
+static void
+vdev_destroy_ms_flush_data(vdev_t *vd, dmu_tx_t *tx)
+{
+ objset_t *mos = spa_meta_objset(vd->vdev_spa);
+
+ if (vd->vdev_top_zap == 0)
+ return;
+
+ uint64_t object = 0;
+ int err = zap_lookup(mos, vd->vdev_top_zap,
+ VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, sizeof (uint64_t), 1, &object);
+ if (err == ENOENT)
+ return;
+ VERIFY0(err);
+
+ VERIFY0(dmu_object_free(mos, object, tx));
+ VERIFY0(zap_remove(mos, vd->vdev_top_zap,
+ VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, tx));
+}
+
+/*
+ * Free the objects used to store this vdev's spacemaps, and the array
+ * that points to them.
+ */
+void
+vdev_destroy_spacemaps(vdev_t *vd, dmu_tx_t *tx)
+{
+ if (vd->vdev_ms_array == 0)
+ return;
+
+ objset_t *mos = vd->vdev_spa->spa_meta_objset;
+ uint64_t array_count = vd->vdev_asize >> vd->vdev_ms_shift;
+ size_t array_bytes = array_count * sizeof (uint64_t);
+ uint64_t *smobj_array = kmem_alloc(array_bytes, KM_SLEEP);
+ VERIFY0(dmu_read(mos, vd->vdev_ms_array, 0,
+ array_bytes, smobj_array, 0));
+
+ for (uint64_t i = 0; i < array_count; i++) {
+ uint64_t smobj = smobj_array[i];
+ if (smobj == 0)
+ continue;
+
+ space_map_free_obj(mos, smobj, tx);
+ }
+
+ kmem_free(smobj_array, array_bytes);
+ VERIFY0(dmu_object_free(mos, vd->vdev_ms_array, tx));
+ vdev_destroy_ms_flush_data(vd, tx);
+ vd->vdev_ms_array = 0;
+}
+
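+/*
+ * Free the space maps and the top-level ZAP of an empty log device
+ * that is being removed.
+ */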
+static void
+vdev_remove_empty_log(vdev_t *vd, uint64_t txg)
+{
+ spa_t *spa = vd->vdev_spa;
+
+ ASSERT(vd->vdev_islog);
+ ASSERT(vd == vd->vdev_top);
+ ASSERT3U(txg, ==, spa_syncing_txg(spa));
+
+ dmu_tx_t *tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
+
+ vdev_destroy_spacemaps(vd, tx);
+ if (vd->vdev_top_zap != 0) {
+ vdev_destroy_unlink_zap(vd, vd->vdev_top_zap, tx);
+ vd->vdev_top_zap = 0;
+ }
+
+ dmu_tx_commit(tx);
+}
+
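+/*
+ * Complete metaslab syncing for this vdev at the end of the txg and,
+ * if any metaslabs were synced this txg, reassess the vdev's metaslab
+ * groups.
+ */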
+void
+vdev_sync_done(vdev_t *vd, uint64_t txg)
+{
+ metaslab_t *msp;
+ boolean_t reassess = !txg_list_empty(&vd->vdev_ms_list, TXG_CLEAN(txg));
+
+ ASSERT(vdev_is_concrete(vd));
+
+ while ((msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg)))
+ != NULL)
+ metaslab_sync_done(msp, txg);
+
+ if (reassess) {
+ metaslab_sync_reassess(vd->vdev_mg);
+ if (vd->vdev_log_mg != NULL)
+ metaslab_sync_reassess(vd->vdev_log_mg);
+ }
+}
+
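+/*
+ * Sync this vdev's dirty metaslabs and DTLs for the given txg, creating
+ * the metaslab array object on first use and destroying the metadata of
+ * an empty log device that is being removed.
+ */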
+void
+vdev_sync(vdev_t *vd, uint64_t txg)
+{
+ spa_t *spa = vd->vdev_spa;
+ vdev_t *lvd;
+ metaslab_t *msp;
+
+ ASSERT3U(txg, ==, spa->spa_syncing_txg);
+ dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
+ if (range_tree_space(vd->vdev_obsolete_segments) > 0) {
+ ASSERT(vd->vdev_removing ||
+ vd->vdev_ops == &vdev_indirect_ops);
+
+ vdev_indirect_sync_obsolete(vd, tx);
+
+ /*
+ * If the vdev is indirect, it can't have dirty
+ * metaslabs or DTLs.
+ */
+ if (vd->vdev_ops == &vdev_indirect_ops) {
+ ASSERT(txg_list_empty(&vd->vdev_ms_list, txg));
+ ASSERT(txg_list_empty(&vd->vdev_dtl_list, txg));
+ dmu_tx_commit(tx);
+ return;
+ }
+ }
+
+ ASSERT(vdev_is_concrete(vd));
+
+ if (vd->vdev_ms_array == 0 && vd->vdev_ms_shift != 0 &&
+ !vd->vdev_removing) {
+ ASSERT(vd == vd->vdev_top);
+ ASSERT0(vd->vdev_indirect_config.vic_mapping_object);
+ vd->vdev_ms_array = dmu_object_alloc(spa->spa_meta_objset,
+ DMU_OT_OBJECT_ARRAY, 0, DMU_OT_NONE, 0, tx);
+ ASSERT(vd->vdev_ms_array != 0);
+ vdev_config_dirty(vd);
+ }
+
+ while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL) {
+ metaslab_sync(msp, txg);
+ (void) txg_list_add(&vd->vdev_ms_list, msp, TXG_CLEAN(txg));
+ }
+
+ while ((lvd = txg_list_remove(&vd->vdev_dtl_list, txg)) != NULL)
+ vdev_dtl_sync(lvd, txg);
+
+ /*
+ * If this is an empty log device being removed, destroy the
+ * metadata associated with it.
+ */
+ if (vd->vdev_islog && vd->vdev_stat.vs_alloc == 0 && vd->vdev_removing)
+ vdev_remove_empty_log(vd, txg);
+
+ (void) txg_list_add(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg));
+ dmu_tx_commit(tx);
+}
+
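+/*
+ * Convert a physical block size to the amount of space allocated for it
+ * on this vdev, accounting for any layout overhead (e.g. RAID-Z parity
+ * and padding).
+ */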
+uint64_t
+vdev_psize_to_asize(vdev_t *vd, uint64_t psize)
+{
+ return (vd->vdev_ops->vdev_op_asize(vd, psize));
+}
+
+/*
+ * Mark the given vdev faulted. A faulted vdev behaves as if the device could
+ * not be opened, and no I/O is attempted.
+ */
+int
+vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux)
+{
+ vdev_t *vd, *tvd;
+
+ spa_vdev_state_enter(spa, SCL_NONE);
+
+ if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
+ return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV)));
+
+ if (!vd->vdev_ops->vdev_op_leaf)
+ return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENOTSUP)));
+
+ tvd = vd->vdev_top;
+
+ /*
+	 * If the user did a 'zpool offline -f', make the fault persist across
+ * reboots.
+ */
+ if (aux == VDEV_AUX_EXTERNAL_PERSIST) {
+ /*
+ * There are two kinds of forced faults: temporary and
+ * persistent. Temporary faults go away at pool import, while
+ * persistent faults stay set. Both types of faults can be
+ * cleared with a zpool clear.
+ *
+ * We tell if a vdev is persistently faulted by looking at the
+ * ZPOOL_CONFIG_AUX_STATE nvpair. If it's set to "external" at
+ * import then it's a persistent fault. Otherwise, it's
+ * temporary. We get ZPOOL_CONFIG_AUX_STATE set to "external"
+ * by setting vd.vdev_stat.vs_aux to VDEV_AUX_EXTERNAL. This
+ * tells vdev_config_generate() (which gets run later) to set
+ * ZPOOL_CONFIG_AUX_STATE to "external" in the nvlist.
+ */
+ vd->vdev_stat.vs_aux = VDEV_AUX_EXTERNAL;
+ vd->vdev_tmpoffline = B_FALSE;
+ aux = VDEV_AUX_EXTERNAL;
+ } else {
+ vd->vdev_tmpoffline = B_TRUE;
+ }
+
+ /*
+ * We don't directly use the aux state here, but if we do a
+ * vdev_reopen(), we need this value to be present to remember why we
+ * were faulted.
+ */
+ vd->vdev_label_aux = aux;
+
+ /*
+ * Faulted state takes precedence over degraded.
+ */
+ vd->vdev_delayed_close = B_FALSE;
+ vd->vdev_faulted = 1ULL;
+ vd->vdev_degraded = 0ULL;
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_FAULTED, aux);
+
+ /*
+ * If this device has the only valid copy of the data, then
+ * back off and simply mark the vdev as degraded instead.
+ */
+ if (!tvd->vdev_islog && vd->vdev_aux == NULL && vdev_dtl_required(vd)) {
+ vd->vdev_degraded = 1ULL;
+ vd->vdev_faulted = 0ULL;
+
+ /*
+ * If we reopen the device and it's not dead, only then do we
+ * mark it degraded.
+ */
+ vdev_reopen(tvd);
+
+ if (vdev_readable(vd))
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, aux);
+ }
+
+ return (spa_vdev_state_exit(spa, vd, 0));
+}
+
+/*
+ * Mark the given vdev degraded. A degraded vdev is purely an indication to the
+ * user that something is wrong. The vdev continues to operate as normal as far
+ * as I/O is concerned.
+ */
+int
+vdev_degrade(spa_t *spa, uint64_t guid, vdev_aux_t aux)
+{
+ vdev_t *vd;
+
+ spa_vdev_state_enter(spa, SCL_NONE);
+
+ if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
+ return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV)));
+
+ if (!vd->vdev_ops->vdev_op_leaf)
+ return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENOTSUP)));
+
+ /*
+ * If the vdev is already faulted, then don't do anything.
+ */
+ if (vd->vdev_faulted || vd->vdev_degraded)
+ return (spa_vdev_state_exit(spa, NULL, 0));
+
+ vd->vdev_degraded = 1ULL;
+ if (!vdev_is_dead(vd))
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED,
+ aux);
+
+ return (spa_vdev_state_exit(spa, vd, 0));
+}
+
+/*
+ * Online the given vdev.
+ *
+ * If 'ZFS_ONLINE_UNSPARE' is set, it implies two things. First, any attached
+ * spare device should be detached when the device finishes resilvering.
+ * Second, the online should be treated like a 'test' online case, so no FMA
+ * events are generated if the device fails to open.
+ */
+int
+vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate)
+{
+ vdev_t *vd, *tvd, *pvd, *rvd = spa->spa_root_vdev;
+ boolean_t wasoffline;
+ vdev_state_t oldstate;
+
+ spa_vdev_state_enter(spa, SCL_NONE);
+
+ if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
+ return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV)));
+
+ if (!vd->vdev_ops->vdev_op_leaf)
+ return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENOTSUP)));
+
+ wasoffline = (vd->vdev_offline || vd->vdev_tmpoffline);
+ oldstate = vd->vdev_state;
+
+ tvd = vd->vdev_top;
+ vd->vdev_offline = B_FALSE;
+ vd->vdev_tmpoffline = B_FALSE;
+ vd->vdev_checkremove = !!(flags & ZFS_ONLINE_CHECKREMOVE);
+ vd->vdev_forcefault = !!(flags & ZFS_ONLINE_FORCEFAULT);
+
+ /* XXX - L2ARC 1.0 does not support expansion */
+ if (!vd->vdev_aux) {
+ for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
+ pvd->vdev_expanding = !!((flags & ZFS_ONLINE_EXPAND) ||
+ spa->spa_autoexpand);
+ vd->vdev_expansion_time = gethrestime_sec();
+ }
+
+ vdev_reopen(tvd);
+ vd->vdev_checkremove = vd->vdev_forcefault = B_FALSE;
+
+ if (!vd->vdev_aux) {
+ for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
+ pvd->vdev_expanding = B_FALSE;
+ }
+
+ if (newstate)
+ *newstate = vd->vdev_state;
+ if ((flags & ZFS_ONLINE_UNSPARE) &&
+ !vdev_is_dead(vd) && vd->vdev_parent &&
+ vd->vdev_parent->vdev_ops == &vdev_spare_ops &&
+ vd->vdev_parent->vdev_child[0] == vd)
+ vd->vdev_unspare = B_TRUE;
+
+ if ((flags & ZFS_ONLINE_EXPAND) || spa->spa_autoexpand) {
+
+ /* XXX - L2ARC 1.0 does not support expansion */
+ if (vd->vdev_aux)
+ return (spa_vdev_state_exit(spa, vd, ENOTSUP));
+ spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
+ }
+
+ /* Restart initializing if necessary */
+ mutex_enter(&vd->vdev_initialize_lock);
+ if (vdev_writeable(vd) &&
+ vd->vdev_initialize_thread == NULL &&
+ vd->vdev_initialize_state == VDEV_INITIALIZE_ACTIVE) {
+ (void) vdev_initialize(vd);
+ }
+ mutex_exit(&vd->vdev_initialize_lock);
+
+ /*
+	 * Restart trimming if necessary.  We do not restart trimming for cache
+	 * devices here; for them, trimming is triggered asynchronously by
+	 * l2arc_rebuild_vdev() for the whole device, or by l2arc_evict() as it
+	 * evicts space for upcoming writes.
+ */
+ mutex_enter(&vd->vdev_trim_lock);
+ if (vdev_writeable(vd) && !vd->vdev_isl2cache &&
+ vd->vdev_trim_thread == NULL &&
+ vd->vdev_trim_state == VDEV_TRIM_ACTIVE) {
+ (void) vdev_trim(vd, vd->vdev_trim_rate, vd->vdev_trim_partial,
+ vd->vdev_trim_secure);
+ }
+ mutex_exit(&vd->vdev_trim_lock);
+
+ if (wasoffline ||
+ (oldstate < VDEV_STATE_DEGRADED &&
+ vd->vdev_state >= VDEV_STATE_DEGRADED))
+ spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_ONLINE);
+
+ return (spa_vdev_state_exit(spa, vd, 0));
+}
+
+static int
+vdev_offline_locked(spa_t *spa, uint64_t guid, uint64_t flags)
+{
+ vdev_t *vd, *tvd;
+ int error = 0;
+ uint64_t generation;
+ metaslab_group_t *mg;
+
+top:
+ spa_vdev_state_enter(spa, SCL_ALLOC);
+
+ if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
+ return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV)));
+
+ if (!vd->vdev_ops->vdev_op_leaf)
+ return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENOTSUP)));
+
+ if (vd->vdev_ops == &vdev_draid_spare_ops)
+ return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
+
+ tvd = vd->vdev_top;
+ mg = tvd->vdev_mg;
+ generation = spa->spa_config_generation + 1;
+
+ /*
+ * If the device isn't already offline, try to offline it.
+ */
+ if (!vd->vdev_offline) {
+ /*
+ * If this device has the only valid copy of some data,
+ * don't allow it to be offlined. Log devices are always
+ * expendable.
+ */
+ if (!tvd->vdev_islog && vd->vdev_aux == NULL &&
+ vdev_dtl_required(vd))
+ return (spa_vdev_state_exit(spa, NULL,
+ SET_ERROR(EBUSY)));
+
+ /*
+ * If the top-level is a slog and it has had allocations
+ * then proceed. We check that the vdev's metaslab group
+ * is not NULL since it's possible that we may have just
+ * added this vdev but not yet initialized its metaslabs.
+ */
+ if (tvd->vdev_islog && mg != NULL) {
+ /*
+ * Prevent any future allocations.
+ */
+ ASSERT3P(tvd->vdev_log_mg, ==, NULL);
+ metaslab_group_passivate(mg);
+ (void) spa_vdev_state_exit(spa, vd, 0);
+
+ error = spa_reset_logs(spa);
+
+ /*
+ * If the log device was successfully reset but has
+ * checkpointed data, do not offline it.
+ */
+ if (error == 0 &&
+ tvd->vdev_checkpoint_sm != NULL) {
+ ASSERT3U(space_map_allocated(
+ tvd->vdev_checkpoint_sm), !=, 0);
+ error = ZFS_ERR_CHECKPOINT_EXISTS;
+ }
+
+ spa_vdev_state_enter(spa, SCL_ALLOC);
+
+ /*
+ * Check to see if the config has changed.
+ */
+ if (error || generation != spa->spa_config_generation) {
+ metaslab_group_activate(mg);
+ if (error)
+ return (spa_vdev_state_exit(spa,
+ vd, error));
+ (void) spa_vdev_state_exit(spa, vd, 0);
+ goto top;
+ }
+ ASSERT0(tvd->vdev_stat.vs_alloc);
+ }
+
+ /*
+ * Offline this device and reopen its top-level vdev.
+ * If the top-level vdev is a log device then just offline
+ * it. Otherwise, if this action results in the top-level
+ * vdev becoming unusable, undo it and fail the request.
+ */
+ vd->vdev_offline = B_TRUE;
+ vdev_reopen(tvd);
+
+ if (!tvd->vdev_islog && vd->vdev_aux == NULL &&
+ vdev_is_dead(tvd)) {
+ vd->vdev_offline = B_FALSE;
+ vdev_reopen(tvd);
+ return (spa_vdev_state_exit(spa, NULL,
+ SET_ERROR(EBUSY)));
+ }
+
+ /*
+ * Add the device back into the metaslab rotor so that
+ * once we online the device it's open for business.
+ */
+ if (tvd->vdev_islog && mg != NULL)
+ metaslab_group_activate(mg);
+ }
+
+ vd->vdev_tmpoffline = !!(flags & ZFS_OFFLINE_TEMPORARY);
+
+ return (spa_vdev_state_exit(spa, vd, 0));
+}
+
+int
+vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags)
+{
+ int error;
+
+ mutex_enter(&spa->spa_vdev_top_lock);
+ error = vdev_offline_locked(spa, guid, flags);
+ mutex_exit(&spa->spa_vdev_top_lock);
+
+ return (error);
+}
+
+/*
+ * Clear the error counts associated with this vdev. Unlike vdev_online() and
+ * vdev_offline(), we assume the spa config is locked. We also clear all
+ * children. If 'vd' is NULL, then the user wants to clear all vdevs.
+ */
+void
+vdev_clear(spa_t *spa, vdev_t *vd)
+{
+ vdev_t *rvd = spa->spa_root_vdev;
+
+ ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
+
+ if (vd == NULL)
+ vd = rvd;
+
+ vd->vdev_stat.vs_read_errors = 0;
+ vd->vdev_stat.vs_write_errors = 0;
+ vd->vdev_stat.vs_checksum_errors = 0;
+ vd->vdev_stat.vs_slow_ios = 0;
+
+ for (int c = 0; c < vd->vdev_children; c++)
+ vdev_clear(spa, vd->vdev_child[c]);
+
+ /*
+ * It makes no sense to "clear" an indirect vdev.
+ */
+ if (!vdev_is_concrete(vd))
+ return;
+
+ /*
+ * If we're in the FAULTED state or have experienced failed I/O, then
+ * clear the persistent state and attempt to reopen the device. We
+ * also mark the vdev config dirty, so that the new faulted state is
+ * written out to disk.
+ */
+ if (vd->vdev_faulted || vd->vdev_degraded ||
+ !vdev_readable(vd) || !vdev_writeable(vd)) {
+ /*
+ * When reopening in response to a clear event, it may be due to
+ * a fmadm repair request. In this case, if the device is
+		 * still broken, we still want to post the ereport again.
+ */
+ vd->vdev_forcefault = B_TRUE;
+
+ vd->vdev_faulted = vd->vdev_degraded = 0ULL;
+ vd->vdev_cant_read = B_FALSE;
+ vd->vdev_cant_write = B_FALSE;
+ vd->vdev_stat.vs_aux = 0;
+
+ vdev_reopen(vd == rvd ? rvd : vd->vdev_top);
+
+ vd->vdev_forcefault = B_FALSE;
+
+ if (vd != rvd && vdev_writeable(vd->vdev_top))
+ vdev_state_dirty(vd->vdev_top);
+
+ /* If a resilver isn't required, check if vdevs can be culled */
+ if (vd->vdev_aux == NULL && !vdev_is_dead(vd) &&
+ !dsl_scan_resilvering(spa->spa_dsl_pool) &&
+ !dsl_scan_resilver_scheduled(spa->spa_dsl_pool))
+ spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
+
+ spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_CLEAR);
+ }
+
+ /*
+ * When clearing a FMA-diagnosed fault, we always want to
+ * unspare the device, as we assume that the original spare was
+ * done in response to the FMA fault.
+ */
+ if (!vdev_is_dead(vd) && vd->vdev_parent != NULL &&
+ vd->vdev_parent->vdev_ops == &vdev_spare_ops &&
+ vd->vdev_parent->vdev_child[0] == vd)
+ vd->vdev_unspare = B_TRUE;
+}
+
+boolean_t
+vdev_is_dead(vdev_t *vd)
+{
+ /*
+ * Holes and missing devices are always considered "dead".
+ * This simplifies the code since we don't have to check for
+ * these types of devices in the various code paths.
+ * Instead we rely on the fact that we skip over dead devices
+ * before issuing I/O to them.
+ */
+ return (vd->vdev_state < VDEV_STATE_DEGRADED ||
+ vd->vdev_ops == &vdev_hole_ops ||
+ vd->vdev_ops == &vdev_missing_ops);
+}
+
+boolean_t
+vdev_readable(vdev_t *vd)
+{
+ return (!vdev_is_dead(vd) && !vd->vdev_cant_read);
+}
+
+boolean_t
+vdev_writeable(vdev_t *vd)
+{
+ return (!vdev_is_dead(vd) && !vd->vdev_cant_write &&
+ vdev_is_concrete(vd));
+}
+
+boolean_t
+vdev_allocatable(vdev_t *vd)
+{
+ uint64_t state = vd->vdev_state;
+
+ /*
+ * We currently allow allocations from vdevs which may be in the
+ * process of reopening (i.e. VDEV_STATE_CLOSED). If the device
+ * fails to reopen then we'll catch it later when we're holding
+ * the proper locks. Note that we have to get the vdev state
+ * in a local variable because although it changes atomically,
+ * we're asking two separate questions about it.
+ */
+ return (!(state < VDEV_STATE_DEGRADED && state != VDEV_STATE_CLOSED) &&
+ !vd->vdev_cant_write && vdev_is_concrete(vd) &&
+ vd->vdev_mg->mg_initialized);
+}
+
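+/*
+ * Determine whether this vdev can service the given I/O, based on its
+ * current state and its cant_read/cant_write flags.
+ */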
+boolean_t
+vdev_accessible(vdev_t *vd, zio_t *zio)
+{
+ ASSERT(zio->io_vd == vd);
+
+ if (vdev_is_dead(vd) || vd->vdev_remove_wanted)
+ return (B_FALSE);
+
+ if (zio->io_type == ZIO_TYPE_READ)
+ return (!vd->vdev_cant_read);
+
+ if (zio->io_type == ZIO_TYPE_WRITE)
+ return (!vd->vdev_cant_write);
+
+ return (B_TRUE);
+}
+
+static void
+vdev_get_child_stat(vdev_t *cvd, vdev_stat_t *vs, vdev_stat_t *cvs)
+{
+ /*
+ * Exclude the dRAID spare when aggregating to avoid double counting
+ * the ops and bytes. These IOs are counted by the physical leaves.
+ */
+ if (cvd->vdev_ops == &vdev_draid_spare_ops)
+ return;
+
+ for (int t = 0; t < VS_ZIO_TYPES; t++) {
+ vs->vs_ops[t] += cvs->vs_ops[t];
+ vs->vs_bytes[t] += cvs->vs_bytes[t];
+ }
+
+ cvs->vs_scan_removing = cvd->vdev_removing;
+}
+
+/*
+ * Get extended stats
+ */
+static void
+vdev_get_child_stat_ex(vdev_t *cvd, vdev_stat_ex_t *vsx, vdev_stat_ex_t *cvsx)
+{
+ int t, b;
+ for (t = 0; t < ZIO_TYPES; t++) {
+ for (b = 0; b < ARRAY_SIZE(vsx->vsx_disk_histo[0]); b++)
+ vsx->vsx_disk_histo[t][b] += cvsx->vsx_disk_histo[t][b];
+
+ for (b = 0; b < ARRAY_SIZE(vsx->vsx_total_histo[0]); b++) {
+ vsx->vsx_total_histo[t][b] +=
+ cvsx->vsx_total_histo[t][b];
+ }
+ }
+
+ for (t = 0; t < ZIO_PRIORITY_NUM_QUEUEABLE; t++) {
+ for (b = 0; b < ARRAY_SIZE(vsx->vsx_queue_histo[0]); b++) {
+ vsx->vsx_queue_histo[t][b] +=
+ cvsx->vsx_queue_histo[t][b];
+ }
+ vsx->vsx_active_queue[t] += cvsx->vsx_active_queue[t];
+ vsx->vsx_pend_queue[t] += cvsx->vsx_pend_queue[t];
+
+ for (b = 0; b < ARRAY_SIZE(vsx->vsx_ind_histo[0]); b++)
+ vsx->vsx_ind_histo[t][b] += cvsx->vsx_ind_histo[t][b];
+
+ for (b = 0; b < ARRAY_SIZE(vsx->vsx_agg_histo[0]); b++)
+ vsx->vsx_agg_histo[t][b] += cvsx->vsx_agg_histo[t][b];
+ }
+
+}
+
+boolean_t
+vdev_is_spacemap_addressable(vdev_t *vd)
+{
+ if (spa_feature_is_active(vd->vdev_spa, SPA_FEATURE_SPACEMAP_V2))
+ return (B_TRUE);
+
+ /*
+ * If double-word space map entries are not enabled we assume
+ * 47 bits of the space map entry are dedicated to the entry's
+ * offset (see SM_OFFSET_BITS in space_map.h). We then use that
+ * to calculate the maximum address that can be described by a
+ * space map entry for the given device.
+ */
+ uint64_t shift = vd->vdev_ashift + SM_OFFSET_BITS;
+
+ if (shift >= 63) /* detect potential overflow */
+ return (B_TRUE);
+
+ return (vd->vdev_asize < (1ULL << shift));
+}
+
+/*
+ * Get statistics for the given vdev.
+ */
+static void
+vdev_get_stats_ex_impl(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx)
+{
+ int t;
+ /*
+ * If we're getting stats on the root vdev, aggregate the I/O counts
+ * over all top-level vdevs (i.e. the direct children of the root).
+ */
+ if (!vd->vdev_ops->vdev_op_leaf) {
+ if (vs) {
+ memset(vs->vs_ops, 0, sizeof (vs->vs_ops));
+ memset(vs->vs_bytes, 0, sizeof (vs->vs_bytes));
+ }
+ if (vsx)
+ memset(vsx, 0, sizeof (*vsx));
+
+ for (int c = 0; c < vd->vdev_children; c++) {
+ vdev_t *cvd = vd->vdev_child[c];
+ vdev_stat_t *cvs = &cvd->vdev_stat;
+ vdev_stat_ex_t *cvsx = &cvd->vdev_stat_ex;
+
+ vdev_get_stats_ex_impl(cvd, cvs, cvsx);
+ if (vs)
+ vdev_get_child_stat(cvd, vs, cvs);
+ if (vsx)
+ vdev_get_child_stat_ex(cvd, vsx, cvsx);
+ }
+ } else {
+ /*
+ * We're a leaf. Just copy our ZIO active queue stats in. The
+ * other leaf stats are updated in vdev_stat_update().
+ */
+ if (!vsx)
+ return;
+
+ memcpy(vsx, &vd->vdev_stat_ex, sizeof (vd->vdev_stat_ex));
+
+ for (t = 0; t < ARRAY_SIZE(vd->vdev_queue.vq_class); t++) {
+ vsx->vsx_active_queue[t] =
+ vd->vdev_queue.vq_class[t].vqc_active;
+ vsx->vsx_pend_queue[t] = avl_numnodes(
+ &vd->vdev_queue.vq_class[t].vqc_queued_tree);
+ }
+ }
+}
+
+void
+vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx)
+{
+ vdev_t *tvd = vd->vdev_top;
+ mutex_enter(&vd->vdev_stat_lock);
+ if (vs) {
+ bcopy(&vd->vdev_stat, vs, sizeof (*vs));
+ vs->vs_timestamp = gethrtime() - vs->vs_timestamp;
+ vs->vs_state = vd->vdev_state;
+ vs->vs_rsize = vdev_get_min_asize(vd);
+
+ if (vd->vdev_ops->vdev_op_leaf) {
+ vs->vs_rsize += VDEV_LABEL_START_SIZE +
+ VDEV_LABEL_END_SIZE;
+ /*
+ * Report initializing progress. Since we don't
+ * have the initializing locks held, this is only
+ * an estimate (although a fairly accurate one).
+ */
+ vs->vs_initialize_bytes_done =
+ vd->vdev_initialize_bytes_done;
+ vs->vs_initialize_bytes_est =
+ vd->vdev_initialize_bytes_est;
+ vs->vs_initialize_state = vd->vdev_initialize_state;
+ vs->vs_initialize_action_time =
+ vd->vdev_initialize_action_time;
+
+ /*
+ * Report manual TRIM progress. Since we don't have
+ * the manual TRIM locks held, this is only an
+			 * estimate (although a fairly accurate one).
+ */
+ vs->vs_trim_notsup = !vd->vdev_has_trim;
+ vs->vs_trim_bytes_done = vd->vdev_trim_bytes_done;
+ vs->vs_trim_bytes_est = vd->vdev_trim_bytes_est;
+ vs->vs_trim_state = vd->vdev_trim_state;
+ vs->vs_trim_action_time = vd->vdev_trim_action_time;
+
+ /* Set when there is a deferred resilver. */
+ vs->vs_resilver_deferred = vd->vdev_resilver_deferred;
+ }
+
+ /*
+ * Report expandable space on top-level, non-auxiliary devices
+		 * only.  The expandable space is reported in terms of
+		 * metaslab-sized units, since that determines how much space the pool
+ * can expand.
+ */
+ if (vd->vdev_aux == NULL && tvd != NULL) {
+ vs->vs_esize = P2ALIGN(
+ vd->vdev_max_asize - vd->vdev_asize,
+ 1ULL << tvd->vdev_ms_shift);
+ }
+
+ vs->vs_configured_ashift = vd->vdev_top != NULL
+ ? vd->vdev_top->vdev_ashift : vd->vdev_ashift;
+ vs->vs_logical_ashift = vd->vdev_logical_ashift;
+ vs->vs_physical_ashift = vd->vdev_physical_ashift;
+
+ /*
+ * Report fragmentation and rebuild progress for top-level,
+ * non-auxiliary, concrete devices.
+ */
+ if (vd->vdev_aux == NULL && vd == vd->vdev_top &&
+ vdev_is_concrete(vd)) {
+ /*
+ * The vdev fragmentation rating doesn't take into
+ * account the embedded slog metaslab (vdev_log_mg).
+ * Since it's only one metaslab, it would have a tiny
+ * impact on the overall fragmentation.
+ */
+ vs->vs_fragmentation = (vd->vdev_mg != NULL) ?
+ vd->vdev_mg->mg_fragmentation : 0;
+ }
+ }
+
+ vdev_get_stats_ex_impl(vd, vs, vsx);
+ mutex_exit(&vd->vdev_stat_lock);
+}
+
+void
+vdev_get_stats(vdev_t *vd, vdev_stat_t *vs)
+{
+ return (vdev_get_stats_ex(vd, vs, NULL));
+}
+
+void
+vdev_clear_stats(vdev_t *vd)
+{
+ mutex_enter(&vd->vdev_stat_lock);
+ vd->vdev_stat.vs_space = 0;
+ vd->vdev_stat.vs_dspace = 0;
+ vd->vdev_stat.vs_alloc = 0;
+ mutex_exit(&vd->vdev_stat_lock);
+}
+
+void
+vdev_scan_stat_init(vdev_t *vd)
+{
+ vdev_stat_t *vs = &vd->vdev_stat;
+
+ for (int c = 0; c < vd->vdev_children; c++)
+ vdev_scan_stat_init(vd->vdev_child[c]);
+
+ mutex_enter(&vd->vdev_stat_lock);
+ vs->vs_scan_processed = 0;
+ mutex_exit(&vd->vdev_stat_lock);
+}
+
+void
+vdev_stat_update(zio_t *zio, uint64_t psize)
+{
+ spa_t *spa = zio->io_spa;
+ vdev_t *rvd = spa->spa_root_vdev;
+ vdev_t *vd = zio->io_vd ? zio->io_vd : rvd;
+ vdev_t *pvd;
+ uint64_t txg = zio->io_txg;
+ vdev_stat_t *vs = &vd->vdev_stat;
+ vdev_stat_ex_t *vsx = &vd->vdev_stat_ex;
+ zio_type_t type = zio->io_type;
+ int flags = zio->io_flags;
+
+ /*
+ * If this i/o is a gang leader, it didn't do any actual work.
+ */
+ if (zio->io_gang_tree)
+ return;
+
+ if (zio->io_error == 0) {
+ /*
+ * If this is a root i/o, don't count it -- we've already
+ * counted the top-level vdevs, and vdev_get_stats() will
+ * aggregate them when asked. This reduces contention on
+ * the root vdev_stat_lock and implicitly handles blocks
+ * that compress away to holes, for which there is no i/o.
+ * (Holes never create vdev children, so all the counters
+ * remain zero, which is what we want.)
+ *
+ * Note: this only applies to successful i/o (io_error == 0)
+ * because unlike i/o counts, errors are not additive.
+ * When reading a ditto block, for example, failure of
+ * one top-level vdev does not imply a root-level error.
+ */
+ if (vd == rvd)
+ return;
+
+ ASSERT(vd == zio->io_vd);
+
+ if (flags & ZIO_FLAG_IO_BYPASS)
+ return;
+
+ mutex_enter(&vd->vdev_stat_lock);
+
+ if (flags & ZIO_FLAG_IO_REPAIR) {
+ /*
+ * Repair is the result of a resilver issued by the
+ * scan thread (spa_sync).
+ */
+ if (flags & ZIO_FLAG_SCAN_THREAD) {
+ dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;
+ dsl_scan_phys_t *scn_phys = &scn->scn_phys;
+ uint64_t *processed = &scn_phys->scn_processed;
+
+ if (vd->vdev_ops->vdev_op_leaf)
+ atomic_add_64(processed, psize);
+ vs->vs_scan_processed += psize;
+ }
+
+ /*
+ * Repair is the result of a rebuild issued by the
+ * rebuild thread (vdev_rebuild_thread). To avoid
+ * double counting repaired bytes the virtual dRAID
+ * spare vdev is excluded from the processed bytes.
+ */
+ if (zio->io_priority == ZIO_PRIORITY_REBUILD) {
+ vdev_t *tvd = vd->vdev_top;
+ vdev_rebuild_t *vr = &tvd->vdev_rebuild_config;
+ vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
+ uint64_t *rebuilt = &vrp->vrp_bytes_rebuilt;
+
+ if (vd->vdev_ops->vdev_op_leaf &&
+ vd->vdev_ops != &vdev_draid_spare_ops) {
+ atomic_add_64(rebuilt, psize);
+ }
+ vs->vs_rebuild_processed += psize;
+ }
+
+ if (flags & ZIO_FLAG_SELF_HEAL)
+ vs->vs_self_healed += psize;
+ }
+
+ /*
+ * The bytes/ops/histograms are recorded at the leaf level and
+ * aggregated into the higher level vdevs in vdev_get_stats().
+ */
+ if (vd->vdev_ops->vdev_op_leaf &&
+ (zio->io_priority < ZIO_PRIORITY_NUM_QUEUEABLE)) {
+ zio_type_t vs_type = type;
+ zio_priority_t priority = zio->io_priority;
+
+ /*
+ * TRIM ops and bytes are reported to user space as
+ * ZIO_TYPE_IOCTL. This is done to preserve the
+ * vdev_stat_t structure layout for user space.
+ */
+ if (type == ZIO_TYPE_TRIM)
+ vs_type = ZIO_TYPE_IOCTL;
+
+ /*
+ * Solely for the purposes of 'zpool iostat -lqrw'
+			 * reporting, use the priority to categorize the IO.
+ * Only the following are reported to user space:
+ *
+ * ZIO_PRIORITY_SYNC_READ,
+ * ZIO_PRIORITY_SYNC_WRITE,
+ * ZIO_PRIORITY_ASYNC_READ,
+ * ZIO_PRIORITY_ASYNC_WRITE,
+ * ZIO_PRIORITY_SCRUB,
+ * ZIO_PRIORITY_TRIM.
+ */
+ if (priority == ZIO_PRIORITY_REBUILD) {
+ priority = ((type == ZIO_TYPE_WRITE) ?
+ ZIO_PRIORITY_ASYNC_WRITE :
+ ZIO_PRIORITY_SCRUB);
+ } else if (priority == ZIO_PRIORITY_INITIALIZING) {
+ ASSERT3U(type, ==, ZIO_TYPE_WRITE);
+ priority = ZIO_PRIORITY_ASYNC_WRITE;
+ } else if (priority == ZIO_PRIORITY_REMOVAL) {
+ priority = ((type == ZIO_TYPE_WRITE) ?
+ ZIO_PRIORITY_ASYNC_WRITE :
+ ZIO_PRIORITY_ASYNC_READ);
+ }
+
+ vs->vs_ops[vs_type]++;
+ vs->vs_bytes[vs_type] += psize;
+
+ if (flags & ZIO_FLAG_DELEGATED) {
+ vsx->vsx_agg_histo[priority]
+ [RQ_HISTO(zio->io_size)]++;
+ } else {
+ vsx->vsx_ind_histo[priority]
+ [RQ_HISTO(zio->io_size)]++;
+ }
+
+ if (zio->io_delta && zio->io_delay) {
+ vsx->vsx_queue_histo[priority]
+ [L_HISTO(zio->io_delta - zio->io_delay)]++;
+ vsx->vsx_disk_histo[type]
+ [L_HISTO(zio->io_delay)]++;
+ vsx->vsx_total_histo[type]
+ [L_HISTO(zio->io_delta)]++;
+ }
+ }
+
+ mutex_exit(&vd->vdev_stat_lock);
+ return;
+ }
+
+ if (flags & ZIO_FLAG_SPECULATIVE)
+ return;
+
+ /*
+ * If this is an I/O error that is going to be retried, then ignore the
+ * error. Otherwise, the user may interpret B_FAILFAST I/O errors as
+ * hard errors, when in reality they can happen for any number of
+ * innocuous reasons (bus resets, MPxIO link failure, etc).
+ */
+ if (zio->io_error == EIO &&
+ !(zio->io_flags & ZIO_FLAG_IO_RETRY))
+ return;
+
+ /*
+	 * Intent log writes won't propagate their error to the root
+ * I/O so don't mark these types of failures as pool-level
+ * errors.
+ */
+ if (zio->io_vd == NULL && (zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
+ return;
+
+ if (type == ZIO_TYPE_WRITE && txg != 0 &&
+ (!(flags & ZIO_FLAG_IO_REPAIR) ||
+ (flags & ZIO_FLAG_SCAN_THREAD) ||
+ spa->spa_claiming)) {
+ /*
+ * This is either a normal write (not a repair), or it's
+ * a repair induced by the scrub thread, or it's a repair
+ * made by zil_claim() during spa_load() in the first txg.
+ * In the normal case, we commit the DTL change in the same
+ * txg as the block was born. In the scrub-induced repair
+ * case, we know that scrubs run in first-pass syncing context,
+ * so we commit the DTL change in spa_syncing_txg(spa).
+ * In the zil_claim() case, we commit in spa_first_txg(spa).
+ *
+ * We currently do not make DTL entries for failed spontaneous
+ * self-healing writes triggered by normal (non-scrubbing)
+ * reads, because we have no transactional context in which to
+ * do so -- and it's not clear that it'd be desirable anyway.
+ */
+ if (vd->vdev_ops->vdev_op_leaf) {
+ uint64_t commit_txg = txg;
+ if (flags & ZIO_FLAG_SCAN_THREAD) {
+ ASSERT(flags & ZIO_FLAG_IO_REPAIR);
+ ASSERT(spa_sync_pass(spa) == 1);
+ vdev_dtl_dirty(vd, DTL_SCRUB, txg, 1);
+ commit_txg = spa_syncing_txg(spa);
+ } else if (spa->spa_claiming) {
+ ASSERT(flags & ZIO_FLAG_IO_REPAIR);
+ commit_txg = spa_first_txg(spa);
+ }
+ ASSERT(commit_txg >= spa_syncing_txg(spa));
+ if (vdev_dtl_contains(vd, DTL_MISSING, txg, 1))
+ return;
+ for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
+ vdev_dtl_dirty(pvd, DTL_PARTIAL, txg, 1);
+ vdev_dirty(vd->vdev_top, VDD_DTL, vd, commit_txg);
+ }
+ if (vd != rvd)
+ vdev_dtl_dirty(vd, DTL_MISSING, txg, 1);
+ }
+}
+
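+/*
+ * Convert a raw space delta to deflated space using this vdev's deflate
+ * ratio, which compensates for psize-to-asize (e.g. RAID-Z) expansion.
+ */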
+int64_t
+vdev_deflated_space(vdev_t *vd, int64_t space)
+{
+ ASSERT((space & (SPA_MINBLOCKSIZE-1)) == 0);
+ ASSERT(vd->vdev_deflate_ratio != 0 || vd->vdev_isl2cache);
+
+ return ((space >> SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio);
+}
+
+/*
+ * Update the in-core space usage stats for this vdev, its metaslab class,
+ * and the root vdev.
+ */
+void
+vdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta,
+ int64_t space_delta)
+{
+ int64_t dspace_delta;
+ spa_t *spa = vd->vdev_spa;
+ vdev_t *rvd = spa->spa_root_vdev;
+
+ ASSERT(vd == vd->vdev_top);
+
+ /*
+	 * Apply the inverse of the psize-to-asize (i.e. RAID-Z) space-expansion
+ * factor. We must calculate this here and not at the root vdev
+ * because the root vdev's psize-to-asize is simply the max of its
+ * children's, thus not accurate enough for us.
+ */
+ dspace_delta = vdev_deflated_space(vd, space_delta);
+
+ mutex_enter(&vd->vdev_stat_lock);
+ /* ensure we won't underflow */
+ if (alloc_delta < 0) {
+ ASSERT3U(vd->vdev_stat.vs_alloc, >=, -alloc_delta);
+ }
+
+ vd->vdev_stat.vs_alloc += alloc_delta;
+ vd->vdev_stat.vs_space += space_delta;
+ vd->vdev_stat.vs_dspace += dspace_delta;
+ mutex_exit(&vd->vdev_stat_lock);
+
+ /* every class but log contributes to root space stats */
+ if (vd->vdev_mg != NULL && !vd->vdev_islog) {
+ ASSERT(!vd->vdev_isl2cache);
+ mutex_enter(&rvd->vdev_stat_lock);
+ rvd->vdev_stat.vs_alloc += alloc_delta;
+ rvd->vdev_stat.vs_space += space_delta;
+ rvd->vdev_stat.vs_dspace += dspace_delta;
+ mutex_exit(&rvd->vdev_stat_lock);
+ }
+ /* Note: metaslab_class_space_update moved to metaslab_space_update */
+}
+
+/*
+ * Mark a top-level vdev's config as dirty, placing it on the dirty list
+ * so that it will be written out next time the vdev configuration is synced.
+ * If the root vdev is specified (vdev_top == NULL), dirty all top-level vdevs.
+ */
+void
+vdev_config_dirty(vdev_t *vd)
+{
+ spa_t *spa = vd->vdev_spa;
+ vdev_t *rvd = spa->spa_root_vdev;
+ int c;
+
+ ASSERT(spa_writeable(spa));
+
+ /*
+ * If this is an aux vdev (as with l2cache and spare devices), then we
+ * update the vdev config manually and set the sync flag.
+ */
+ if (vd->vdev_aux != NULL) {
+ spa_aux_vdev_t *sav = vd->vdev_aux;
+ nvlist_t **aux;
+ uint_t naux;
+
+ for (c = 0; c < sav->sav_count; c++) {
+ if (sav->sav_vdevs[c] == vd)
+ break;
+ }
+
+ if (c == sav->sav_count) {
+ /*
+ * We're being removed. There's nothing more to do.
+ */
+ ASSERT(sav->sav_sync == B_TRUE);
+ return;
+ }
+
+ sav->sav_sync = B_TRUE;
+
+ if (nvlist_lookup_nvlist_array(sav->sav_config,
+ ZPOOL_CONFIG_L2CACHE, &aux, &naux) != 0) {
+ VERIFY(nvlist_lookup_nvlist_array(sav->sav_config,
+ ZPOOL_CONFIG_SPARES, &aux, &naux) == 0);
+ }
+
+ ASSERT(c < naux);
+
+ /*
+		 * Setting the nvlist in the middle of the array is a little
+ * sketchy, but it will work.
+ */
+ nvlist_free(aux[c]);
+ aux[c] = vdev_config_generate(spa, vd, B_TRUE, 0);
+
+ return;
+ }
+
+ /*
+ * The dirty list is protected by the SCL_CONFIG lock. The caller
+ * must either hold SCL_CONFIG as writer, or must be the sync thread
+ * (which holds SCL_CONFIG as reader). There's only one sync thread,
+ * so this is sufficient to ensure mutual exclusion.
+ */
+ ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) ||
+ (dsl_pool_sync_context(spa_get_dsl(spa)) &&
+ spa_config_held(spa, SCL_CONFIG, RW_READER)));
+
+ if (vd == rvd) {
+ for (c = 0; c < rvd->vdev_children; c++)
+ vdev_config_dirty(rvd->vdev_child[c]);
+ } else {
+ ASSERT(vd == vd->vdev_top);
+
+ if (!list_link_active(&vd->vdev_config_dirty_node) &&
+ vdev_is_concrete(vd)) {
+ list_insert_head(&spa->spa_config_dirty_list, vd);
+ }
+ }
+}
+
+void
+vdev_config_clean(vdev_t *vd)
+{
+ spa_t *spa = vd->vdev_spa;
+
+ ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) ||
+ (dsl_pool_sync_context(spa_get_dsl(spa)) &&
+ spa_config_held(spa, SCL_CONFIG, RW_READER)));
+
+ ASSERT(list_link_active(&vd->vdev_config_dirty_node));
+ list_remove(&spa->spa_config_dirty_list, vd);
+}
+
+/*
+ * Mark a top-level vdev's state as dirty, so that the next pass of
+ * spa_sync() can convert this into vdev_config_dirty(). We distinguish
+ * the state changes from larger config changes because they require
+ * much less locking, and are often needed for administrative actions.
+ */
+void
+vdev_state_dirty(vdev_t *vd)
+{
+ spa_t *spa = vd->vdev_spa;
+
+ ASSERT(spa_writeable(spa));
+ ASSERT(vd == vd->vdev_top);
+
+ /*
+ * The state list is protected by the SCL_STATE lock. The caller
+ * must either hold SCL_STATE as writer, or must be the sync thread
+ * (which holds SCL_STATE as reader). There's only one sync thread,
+ * so this is sufficient to ensure mutual exclusion.
+ */
+ ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) ||
+ (dsl_pool_sync_context(spa_get_dsl(spa)) &&
+ spa_config_held(spa, SCL_STATE, RW_READER)));
+
+ if (!list_link_active(&vd->vdev_state_dirty_node) &&
+ vdev_is_concrete(vd))
+ list_insert_head(&spa->spa_state_dirty_list, vd);
+}
+
+void
+vdev_state_clean(vdev_t *vd)
+{
+ spa_t *spa = vd->vdev_spa;
+
+ ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) ||
+ (dsl_pool_sync_context(spa_get_dsl(spa)) &&
+ spa_config_held(spa, SCL_STATE, RW_READER)));
+
+ ASSERT(list_link_active(&vd->vdev_state_dirty_node));
+ list_remove(&spa->spa_state_dirty_list, vd);
+}
+
+/*
+ * Propagate vdev state up from children to parent.
+ */
+void
+vdev_propagate_state(vdev_t *vd)
+{
+ spa_t *spa = vd->vdev_spa;
+ vdev_t *rvd = spa->spa_root_vdev;
+ int degraded = 0, faulted = 0;
+ int corrupted = 0;
+ vdev_t *child;
+
+ if (vd->vdev_children > 0) {
+ for (int c = 0; c < vd->vdev_children; c++) {
+ child = vd->vdev_child[c];
+
+ /*
+ * Don't factor holes or indirect vdevs into the
+ * decision.
+ */
+ if (!vdev_is_concrete(child))
+ continue;
+
+ if (!vdev_readable(child) ||
+ (!vdev_writeable(child) && spa_writeable(spa))) {
+ /*
+ * Root special: if there is a top-level log
+ * device, treat the root vdev as if it were
+ * degraded.
+ */
+ if (child->vdev_islog && vd == rvd)
+ degraded++;
+ else
+ faulted++;
+ } else if (child->vdev_state <= VDEV_STATE_DEGRADED) {
+ degraded++;
+ }
+
+ if (child->vdev_stat.vs_aux == VDEV_AUX_CORRUPT_DATA)
+ corrupted++;
+ }
+
+ vd->vdev_ops->vdev_op_state_change(vd, faulted, degraded);
+
+ /*
+ * Root special: if there is a top-level vdev that cannot be
+ * opened due to corrupted metadata, then propagate the root
+ * vdev's aux state as 'corrupt' rather than 'insufficient
+ * replicas'.
+ */
+ if (corrupted && vd == rvd &&
+ rvd->vdev_state == VDEV_STATE_CANT_OPEN)
+ vdev_set_state(rvd, B_FALSE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ }
+
+ if (vd->vdev_parent)
+ vdev_propagate_state(vd->vdev_parent);
+}
+
+/*
+ * Set a vdev's state. If this is during an open, we don't update the parent
+ * state, because we're in the process of opening children depth-first.
+ * Otherwise, we propagate the change to the parent.
+ *
+ * If this routine places a device in a faulted state, an appropriate ereport is
+ * generated.
+ */
+void
+vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux)
+{
+ uint64_t save_state;
+ spa_t *spa = vd->vdev_spa;
+
+ if (state == vd->vdev_state) {
+ /*
+ * Since the vdev_offline() code path may already have the device
+ * in an offline state, we can miss a statechange event to
+ * OFFLINE. Check the previous state to catch this condition.
+ */
+ if (vd->vdev_ops->vdev_op_leaf &&
+ (state == VDEV_STATE_OFFLINE) &&
+ (vd->vdev_prevstate >= VDEV_STATE_FAULTED)) {
+ /* post an offline state change */
+ zfs_post_state_change(spa, vd, vd->vdev_prevstate);
+ }
+ vd->vdev_stat.vs_aux = aux;
+ return;
+ }
+
+ save_state = vd->vdev_state;
+
+ vd->vdev_state = state;
+ vd->vdev_stat.vs_aux = aux;
+
+ /*
+ * If we are setting the vdev state to anything but an open state, then
+ * always close the underlying device unless the device has requested
+ * a delayed close (i.e. we're about to remove or fault the device).
+ * Otherwise, we keep accessible but invalid devices open forever.
+ * We don't call vdev_close() itself, because that implies some extra
+ * checks (offline, etc) that we don't want here. This is limited to
+ * leaf devices, because otherwise closing the device will affect other
+ * children.
+ */
+ if (!vd->vdev_delayed_close && vdev_is_dead(vd) &&
+ vd->vdev_ops->vdev_op_leaf)
+ vd->vdev_ops->vdev_op_close(vd);
+
+ if (vd->vdev_removed &&
+ state == VDEV_STATE_CANT_OPEN &&
+ (aux == VDEV_AUX_OPEN_FAILED || vd->vdev_checkremove)) {
+ /*
+ * If the previous state is set to VDEV_STATE_REMOVED, then this
+ * device was previously marked removed and someone attempted to
+ * reopen it. If this failed due to a nonexistent device, then
+ * keep the device in the REMOVED state. We also let this be if
+ * it is one of our special test online cases, which is only
+ * attempting to online the device and shouldn't generate an FMA
+ * fault.
+ */
+ vd->vdev_state = VDEV_STATE_REMOVED;
+ vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
+ } else if (state == VDEV_STATE_REMOVED) {
+ vd->vdev_removed = B_TRUE;
+ } else if (state == VDEV_STATE_CANT_OPEN) {
+ /*
+ * If we fail to open a vdev during an import or recovery, we
+ * mark it as "not available", which signifies that it was
+ * never there to begin with. Failure to open such a device
+ * is not considered an error.
+ */
+ if ((spa_load_state(spa) == SPA_LOAD_IMPORT ||
+ spa_load_state(spa) == SPA_LOAD_RECOVER) &&
+ vd->vdev_ops->vdev_op_leaf)
+ vd->vdev_not_present = 1;
+
+ /*
+ * Post the appropriate ereport. If the 'prevstate' field is
+ * set to something other than VDEV_STATE_UNKNOWN, it indicates
+ * that this is part of a vdev_reopen(). In this case, we don't
+ * want to post the ereport if the device was already in the
+ * CANT_OPEN state beforehand.
+ *
+ * If the 'checkremove' flag is set, then this is an attempt to
+ * online the device in response to an insertion event. If we
+ * hit this case, then we have detected an insertion event for a
+ * faulted or offline device that wasn't in the removed state.
+ * In this scenario, we don't post an ereport because we are
+ * about to replace the device, or attempt an online with
+ * vdev_forcefault, which will generate the fault for us.
+ */
+ if ((vd->vdev_prevstate != state || vd->vdev_forcefault) &&
+ !vd->vdev_not_present && !vd->vdev_checkremove &&
+ vd != spa->spa_root_vdev) {
+ const char *class;
+
+ switch (aux) {
+ case VDEV_AUX_OPEN_FAILED:
+ class = FM_EREPORT_ZFS_DEVICE_OPEN_FAILED;
+ break;
+ case VDEV_AUX_CORRUPT_DATA:
+ class = FM_EREPORT_ZFS_DEVICE_CORRUPT_DATA;
+ break;
+ case VDEV_AUX_NO_REPLICAS:
+ class = FM_EREPORT_ZFS_DEVICE_NO_REPLICAS;
+ break;
+ case VDEV_AUX_BAD_GUID_SUM:
+ class = FM_EREPORT_ZFS_DEVICE_BAD_GUID_SUM;
+ break;
+ case VDEV_AUX_TOO_SMALL:
+ class = FM_EREPORT_ZFS_DEVICE_TOO_SMALL;
+ break;
+ case VDEV_AUX_BAD_LABEL:
+ class = FM_EREPORT_ZFS_DEVICE_BAD_LABEL;
+ break;
+ case VDEV_AUX_BAD_ASHIFT:
+ class = FM_EREPORT_ZFS_DEVICE_BAD_ASHIFT;
+ break;
+ default:
+ class = FM_EREPORT_ZFS_DEVICE_UNKNOWN;
+ }
+
+ (void) zfs_ereport_post(class, spa, vd, NULL, NULL,
+ save_state);
+ }
+
+ /* Erase any notion of persistent removed state */
+ vd->vdev_removed = B_FALSE;
+ } else {
+ vd->vdev_removed = B_FALSE;
+ }
+
+ /*
+ * Notify ZED of any significant state-change on a leaf vdev.
+ */
+ if (vd->vdev_ops->vdev_op_leaf) {
+ /* preserve original state from a vdev_reopen() */
+ if ((vd->vdev_prevstate != VDEV_STATE_UNKNOWN) &&
+ (vd->vdev_prevstate != vd->vdev_state) &&
+ (save_state <= VDEV_STATE_CLOSED))
+ save_state = vd->vdev_prevstate;
+
+ /* filter out state change due to initial vdev_open */
+ if (save_state > VDEV_STATE_CLOSED)
+ zfs_post_state_change(spa, vd, save_state);
+ }
+
+ if (!isopen && vd->vdev_parent)
+ vdev_propagate_state(vd->vdev_parent);
+}
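+
+/*
+ * For example (hypothetical caller), marking a leaf faulted due to corrupt
+ * metadata and letting the change propagate to its parents would look like:
+ *
+ *     vdev_set_state(vd, B_FALSE, VDEV_STATE_FAULTED, VDEV_AUX_CORRUPT_DATA);
+ *
+ * whereas vdev_open() passes isopen = B_TRUE so that parent state is only
+ * recomputed once the entire subtree has been opened.
+ */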
+
+boolean_t
+vdev_children_are_offline(vdev_t *vd)
+{
+ ASSERT(!vd->vdev_ops->vdev_op_leaf);
+
+ for (uint64_t i = 0; i < vd->vdev_children; i++) {
+ if (vd->vdev_child[i]->vdev_state != VDEV_STATE_OFFLINE)
+ return (B_FALSE);
+ }
+
+ return (B_TRUE);
+}
+
+/*
+ * Check the vdev configuration to ensure that it's capable of supporting
+ * a root pool. We do not support partial configuration.
+ */
+boolean_t
+vdev_is_bootable(vdev_t *vd)
+{
+ if (!vd->vdev_ops->vdev_op_leaf) {
+ const char *vdev_type = vd->vdev_ops->vdev_op_type;
+
+ if (strcmp(vdev_type, VDEV_TYPE_MISSING) == 0 ||
+ strcmp(vdev_type, VDEV_TYPE_INDIRECT) == 0) {
+ return (B_FALSE);
+ }
+ }
+
+ for (int c = 0; c < vd->vdev_children; c++) {
+ if (!vdev_is_bootable(vd->vdev_child[c]))
+ return (B_FALSE);
+ }
+ return (B_TRUE);
+}
+
+boolean_t
+vdev_is_concrete(vdev_t *vd)
+{
+ vdev_ops_t *ops = vd->vdev_ops;
+ if (ops == &vdev_indirect_ops || ops == &vdev_hole_ops ||
+ ops == &vdev_missing_ops || ops == &vdev_root_ops) {
+ return (B_FALSE);
+ } else {
+ return (B_TRUE);
+ }
+}
+
+/*
+ * Determine if a log device has valid content. If the vdev was
+ * removed or faulted in the MOS config then we know that
+ * the content on the log device has already been written to the pool.
+ */
+boolean_t
+vdev_log_state_valid(vdev_t *vd)
+{
+ if (vd->vdev_ops->vdev_op_leaf && !vd->vdev_faulted &&
+ !vd->vdev_removed)
+ return (B_TRUE);
+
+ for (int c = 0; c < vd->vdev_children; c++)
+ if (vdev_log_state_valid(vd->vdev_child[c]))
+ return (B_TRUE);
+
+ return (B_FALSE);
+}
+
+/*
+ * Expand a vdev if possible.
+ */
+void
+vdev_expand(vdev_t *vd, uint64_t txg)
+{
+ ASSERT(vd->vdev_top == vd);
+ ASSERT(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);
+ ASSERT(vdev_is_concrete(vd));
+
+ vdev_set_deflate_ratio(vd);
+
+ if ((vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count &&
+ vdev_is_concrete(vd)) {
+ vdev_metaslab_group_create(vd);
+ VERIFY(vdev_metaslab_init(vd, txg) == 0);
+ vdev_config_dirty(vd);
+ }
+}
+
+/*
+ * Split a vdev.
+ */
+void
+vdev_split(vdev_t *vd)
+{
+ vdev_t *cvd, *pvd = vd->vdev_parent;
+
+ vdev_remove_child(pvd, vd);
+ vdev_compact_children(pvd);
+
+ cvd = pvd->vdev_child[0];
+ if (pvd->vdev_children == 1) {
+ vdev_remove_parent(cvd);
+ cvd->vdev_splitting = B_TRUE;
+ }
+ vdev_propagate_state(cvd);
+}
+
+void
+vdev_deadman(vdev_t *vd, char *tag)
+{
+ for (int c = 0; c < vd->vdev_children; c++) {
+ vdev_t *cvd = vd->vdev_child[c];
+
+ vdev_deadman(cvd, tag);
+ }
+
+ if (vd->vdev_ops->vdev_op_leaf) {
+ vdev_queue_t *vq = &vd->vdev_queue;
+
+ mutex_enter(&vq->vq_lock);
+ if (avl_numnodes(&vq->vq_active_tree) > 0) {
+ spa_t *spa = vd->vdev_spa;
+ zio_t *fio;
+ uint64_t delta;
+
+ zfs_dbgmsg("slow vdev: %s has %d active IOs",
+ vd->vdev_path, avl_numnodes(&vq->vq_active_tree));
+
+ /*
+ * Look at the head of all the pending queues;
+ * if any I/O has been outstanding for longer than
+ * the spa_deadman_synctime, invoke the deadman logic.
+ */
+ fio = avl_first(&vq->vq_active_tree);
+ delta = gethrtime() - fio->io_timestamp;
+ if (delta > spa_deadman_synctime(spa))
+ zio_deadman(fio, tag);
+ }
+ mutex_exit(&vq->vq_lock);
+ }
+}
+
+void
+vdev_defer_resilver(vdev_t *vd)
+{
+ ASSERT(vd->vdev_ops->vdev_op_leaf);
+
+ vd->vdev_resilver_deferred = B_TRUE;
+ vd->vdev_spa->spa_resilver_deferred = B_TRUE;
+}
+
+/*
+ * Clears the resilver deferred flag on all leaf devs under vd. Returns
+ * B_TRUE if we have devices that need to be resilvered and are available to
+ * accept resilver I/Os.
+ */
+boolean_t
+vdev_clear_resilver_deferred(vdev_t *vd, dmu_tx_t *tx)
+{
+ boolean_t resilver_needed = B_FALSE;
+ spa_t *spa = vd->vdev_spa;
+
+ for (int c = 0; c < vd->vdev_children; c++) {
+ vdev_t *cvd = vd->vdev_child[c];
+ resilver_needed |= vdev_clear_resilver_deferred(cvd, tx);
+ }
+
+ if (vd == spa->spa_root_vdev &&
+ spa_feature_is_active(spa, SPA_FEATURE_RESILVER_DEFER)) {
+ spa_feature_decr(spa, SPA_FEATURE_RESILVER_DEFER, tx);
+ vdev_config_dirty(vd);
+ spa->spa_resilver_deferred = B_FALSE;
+ return (resilver_needed);
+ }
+
+ if (!vdev_is_concrete(vd) || vd->vdev_aux ||
+ !vd->vdev_ops->vdev_op_leaf)
+ return (resilver_needed);
+
+ vd->vdev_resilver_deferred = B_FALSE;
+
+ return (!vdev_is_dead(vd) && !vd->vdev_offline &&
+ vdev_resilver_needed(vd, NULL, NULL));
+}
+
+boolean_t
+vdev_xlate_is_empty(range_seg64_t *rs)
+{
+ return (rs->rs_start == rs->rs_end);
+}
+
+/*
+ * Translate a logical range to the first contiguous physical range for the
+ * specified vdev_t. This function is initially called with a leaf vdev and
+ * will walk each parent vdev until it reaches a top-level vdev. Once the
+ * top-level is reached the physical range is initialized and the recursive
+ * function begins to unwind. As it unwinds it calls the parent's vdev
+ * specific translation function to do the real conversion.
+ */
+void
+vdev_xlate(vdev_t *vd, const range_seg64_t *logical_rs,
+ range_seg64_t *physical_rs, range_seg64_t *remain_rs)
+{
+ /*
+ * Walk up the vdev tree
+ */
+ if (vd != vd->vdev_top) {
+ vdev_xlate(vd->vdev_parent, logical_rs, physical_rs,
+ remain_rs);
+ } else {
+ /*
+ * We've reached the top-level vdev, initialize the physical
+ * range to the logical range and set an empty remaining
+ * range then start to unwind.
+ */
+ physical_rs->rs_start = logical_rs->rs_start;
+ physical_rs->rs_end = logical_rs->rs_end;
+
+ remain_rs->rs_start = logical_rs->rs_start;
+ remain_rs->rs_end = logical_rs->rs_start;
+
+ return;
+ }
+
+ vdev_t *pvd = vd->vdev_parent;
+ ASSERT3P(pvd, !=, NULL);
+ ASSERT3P(pvd->vdev_ops->vdev_op_xlate, !=, NULL);
+
+ /*
+ * As this recursive function unwinds, translate the logical
+ * range into its physical and any remaining components by calling
+ * the vdev specific translate function.
+ */
+ range_seg64_t intermediate = { 0 };
+ pvd->vdev_ops->vdev_op_xlate(vd, physical_rs, &intermediate, remain_rs);
+
+ physical_rs->rs_start = intermediate.rs_start;
+ physical_rs->rs_end = intermediate.rs_end;
+}
+
+void
+vdev_xlate_walk(vdev_t *vd, const range_seg64_t *logical_rs,
+ vdev_xlate_func_t *func, void *arg)
+{
+ range_seg64_t iter_rs = *logical_rs;
+ range_seg64_t physical_rs;
+ range_seg64_t remain_rs;
+
+ while (!vdev_xlate_is_empty(&iter_rs)) {
+
+ vdev_xlate(vd, &iter_rs, &physical_rs, &remain_rs);
+
+ /*
+ * With raidz and dRAID, it's possible that the logical range
+ * does not live on this leaf vdev. Only when there is a non-
+ * zero physical size do we call the provided function.
+ */
+ if (!vdev_xlate_is_empty(&physical_rs))
+ func(arg, &physical_rs);
+
+ iter_rs = remain_rs;
+ }
+}
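+
+/*
+ * Illustrative usage sketch: accumulate the total physical size backing a
+ * logical range on a leaf vdev. Only vdev_xlate_walk() and range_seg64_t
+ * are real interfaces here; the callback name and the leaf_vd/start/end
+ * variables are hypothetical.
+ *
+ *     static void
+ *     xlate_sum_cb(void *arg, range_seg64_t *physical_rs)
+ *     {
+ *             *(uint64_t *)arg += physical_rs->rs_end - physical_rs->rs_start;
+ *     }
+ *
+ *     uint64_t bytes = 0;
+ *     range_seg64_t logical = { .rs_start = start, .rs_end = end };
+ *     vdev_xlate_walk(leaf_vd, &logical, xlate_sum_cb, &bytes);
+ */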
+
+/*
+ * Look at the vdev tree and determine whether any devices are currently being
+ * replaced.
+ */
+boolean_t
+vdev_replace_in_progress(vdev_t *vdev)
+{
+ ASSERT(spa_config_held(vdev->vdev_spa, SCL_ALL, RW_READER) != 0);
+
+ if (vdev->vdev_ops == &vdev_replacing_ops)
+ return (B_TRUE);
+
+ /*
+ * A 'spare' vdev indicates that we have a replace in progress, unless
+ * it has exactly two children, and the second, the hot spare, has
+ * finished being resilvered.
+ */
+ if (vdev->vdev_ops == &vdev_spare_ops && (vdev->vdev_children > 2 ||
+ !vdev_dtl_empty(vdev->vdev_child[1], DTL_MISSING)))
+ return (B_TRUE);
+
+ for (int i = 0; i < vdev->vdev_children; i++) {
+ if (vdev_replace_in_progress(vdev->vdev_child[i]))
+ return (B_TRUE);
+ }
+
+ return (B_FALSE);
+}
+
+EXPORT_SYMBOL(vdev_fault);
+EXPORT_SYMBOL(vdev_degrade);
+EXPORT_SYMBOL(vdev_online);
+EXPORT_SYMBOL(vdev_offline);
+EXPORT_SYMBOL(vdev_clear);
+
+/* BEGIN CSTYLED */
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, default_ms_count, INT, ZMOD_RW,
+ "Target number of metaslabs per top-level vdev");
+
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, default_ms_shift, INT, ZMOD_RW,
+ "Default limit for metaslab size");
+
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, min_ms_count, INT, ZMOD_RW,
+ "Minimum number of metaslabs per top-level vdev");
+
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, ms_count_limit, INT, ZMOD_RW,
+ "Practical upper limit of total metaslabs per top-level vdev");
+
+ZFS_MODULE_PARAM(zfs, zfs_, slow_io_events_per_second, UINT, ZMOD_RW,
+ "Rate limit slow IO (delay) events to this many per second");
+
+ZFS_MODULE_PARAM(zfs, zfs_, checksum_events_per_second, UINT, ZMOD_RW,
+ "Rate limit checksum events to this many checksum errors per second "
+ "(do not set below zed threshold).");
+
+ZFS_MODULE_PARAM(zfs, zfs_, scan_ignore_errors, INT, ZMOD_RW,
+ "Ignore errors during resilver/scrub");
+
+ZFS_MODULE_PARAM(zfs_vdev, vdev_, validate_skip, INT, ZMOD_RW,
+ "Bypass vdev_validate()");
+
+ZFS_MODULE_PARAM(zfs, zfs_, nocacheflush, INT, ZMOD_RW,
+ "Disable cache flushes");
+
+ZFS_MODULE_PARAM(zfs, zfs_, embedded_slog_min_ms, INT, ZMOD_RW,
+ "Minimum number of metaslabs required to dedicate one for log blocks");
+
+ZFS_MODULE_PARAM_CALL(zfs_vdev, zfs_vdev_, min_auto_ashift,
+ param_set_min_auto_ashift, param_get_ulong, ZMOD_RW,
+ "Minimum ashift used when creating new top-level vdevs");
+
+ZFS_MODULE_PARAM_CALL(zfs_vdev, zfs_vdev_, max_auto_ashift,
+ param_set_max_auto_ashift, param_get_ulong, ZMOD_RW,
+ "Maximum ashift used when optimizing for logical -> physical sector "
+ "size on new top-level vdevs");
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/zfs/vdev_cache.c b/sys/contrib/openzfs/module/zfs/vdev_cache.c
new file mode 100644
index 000000000000..6e82184b800d
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/vdev_cache.c
@@ -0,0 +1,437 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * Copyright (c) 2013, 2016 by Delphix. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/vdev_impl.h>
+#include <sys/zio.h>
+#include <sys/kstat.h>
+#include <sys/abd.h>
+
+/*
+ * Virtual device read-ahead caching.
+ *
+ * This file implements a simple LRU read-ahead cache. When the DMU reads
+ * a given block, it will often want other, nearby blocks soon thereafter.
+ * We take advantage of this by reading a larger disk region and caching
+ * the result. In the best case, this can turn 128 back-to-back 512-byte
+ * reads into a single 64k read followed by 127 cache hits; this reduces
+ * latency dramatically. In the worst case, it can turn an isolated 512-byte
+ * read into a 64k read, which doesn't affect latency all that much but is
+ * terribly wasteful of bandwidth. A more intelligent version of the cache
+ * could keep track of access patterns and not do read-ahead unless it sees
+ * at least two temporally close I/Os to the same region. Currently, only
+ * metadata I/O is inflated. A further enhancement could take advantage of
+ * more semantic information about the I/O. And it could use something
+ * faster than an AVL tree; that was chosen solely for convenience.
+ *
+ * There are five cache operations: allocate, fill, read, write, evict.
+ *
+ * (1) Allocate. This reserves a cache entry for the specified region.
+ * We separate the allocate and fill operations so that multiple threads
+ * don't generate I/O for the same cache miss.
+ *
+ * (2) Fill. When the I/O for a cache miss completes, the fill routine
+ * places the data in the previously allocated cache entry.
+ *
+ * (3) Read. Read data from the cache.
+ *
+ * (4) Write. Update cache contents after write completion.
+ *
+ * (5) Evict. When allocating a new entry, we evict the oldest (LRU) entry
+ * if the total cache size exceeds zfs_vdev_cache_size.
+ */
+
+/*
+ * These tunables are for performance analysis.
+ */
+/*
+ * All i/os smaller than zfs_vdev_cache_max will be turned into
+ * 1<<zfs_vdev_cache_bshift byte reads by the vdev_cache (aka software
+ * track buffer). At most zfs_vdev_cache_size bytes will be kept in each
+ * vdev's vdev_cache.
+ *
+ * TODO: Note that with the current ZFS code, it turns out that the
+ * vdev cache is not helpful, and in some cases actually harmful. It
+ * is better if we disable this. Once some time has passed, we should
+ * actually remove this to simplify the code. For now we just disable
+ * it by setting the zfs_vdev_cache_size to zero. Note that Solaris 11
+ * has made these same changes.
+ */
+int zfs_vdev_cache_max = 1<<14; /* 16KB */
+int zfs_vdev_cache_size = 0;
+int zfs_vdev_cache_bshift = 16;
+
+#define VCBS (1 << zfs_vdev_cache_bshift) /* 64KB */
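+
+/*
+ * Worked example (illustrative numbers, assuming the cache is enabled):
+ * with the default zfs_vdev_cache_bshift of 16, VCBS is 64KB. An 8KB
+ * metadata read at offset 0x31000 is smaller than zfs_vdev_cache_max and
+ * does not straddle a 64KB boundary, so it is expanded to a 64KB fill of
+ * the cache line at P2ALIGN(0x31000, VCBS) = 0x30000, and the caller's
+ * data is copied out of that line at phase P2PHASE(0x31000, VCBS) = 0x1000.
+ * Later reads within the same 64KB line are then satisfied from the cache.
+ */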
+
+kstat_t *vdc_ksp = NULL;
+
+typedef struct vdc_stats {
+ kstat_named_t vdc_stat_delegations;
+ kstat_named_t vdc_stat_hits;
+ kstat_named_t vdc_stat_misses;
+} vdc_stats_t;
+
+static vdc_stats_t vdc_stats = {
+ { "delegations", KSTAT_DATA_UINT64 },
+ { "hits", KSTAT_DATA_UINT64 },
+ { "misses", KSTAT_DATA_UINT64 }
+};
+
+#define VDCSTAT_BUMP(stat) atomic_inc_64(&vdc_stats.stat.value.ui64);
+
+static inline int
+vdev_cache_offset_compare(const void *a1, const void *a2)
+{
+ const vdev_cache_entry_t *ve1 = (const vdev_cache_entry_t *)a1;
+ const vdev_cache_entry_t *ve2 = (const vdev_cache_entry_t *)a2;
+
+ return (TREE_CMP(ve1->ve_offset, ve2->ve_offset));
+}
+
+static int
+vdev_cache_lastused_compare(const void *a1, const void *a2)
+{
+ const vdev_cache_entry_t *ve1 = (const vdev_cache_entry_t *)a1;
+ const vdev_cache_entry_t *ve2 = (const vdev_cache_entry_t *)a2;
+
+ int cmp = TREE_CMP(ve1->ve_lastused, ve2->ve_lastused);
+ if (likely(cmp))
+ return (cmp);
+
+ /*
+ * Among equally old entries, sort by offset to ensure uniqueness.
+ */
+ return (vdev_cache_offset_compare(a1, a2));
+}
+
+/*
+ * Evict the specified entry from the cache.
+ */
+static void
+vdev_cache_evict(vdev_cache_t *vc, vdev_cache_entry_t *ve)
+{
+ ASSERT(MUTEX_HELD(&vc->vc_lock));
+ ASSERT3P(ve->ve_fill_io, ==, NULL);
+ ASSERT3P(ve->ve_abd, !=, NULL);
+
+ avl_remove(&vc->vc_lastused_tree, ve);
+ avl_remove(&vc->vc_offset_tree, ve);
+ abd_free(ve->ve_abd);
+ kmem_free(ve, sizeof (vdev_cache_entry_t));
+}
+
+/*
+ * Allocate an entry in the cache. At this point we don't have the data,
+ * we're just creating a placeholder so that multiple threads don't all
+ * go off and read the same blocks.
+ */
+static vdev_cache_entry_t *
+vdev_cache_allocate(zio_t *zio)
+{
+ vdev_cache_t *vc = &zio->io_vd->vdev_cache;
+ uint64_t offset = P2ALIGN(zio->io_offset, VCBS);
+ vdev_cache_entry_t *ve;
+
+ ASSERT(MUTEX_HELD(&vc->vc_lock));
+
+ if (zfs_vdev_cache_size == 0)
+ return (NULL);
+
+ /*
+ * If adding a new entry would exceed the cache size,
+ * evict the oldest entry (LRU).
+ */
+ if ((avl_numnodes(&vc->vc_lastused_tree) << zfs_vdev_cache_bshift) >
+ zfs_vdev_cache_size) {
+ ve = avl_first(&vc->vc_lastused_tree);
+ if (ve->ve_fill_io != NULL)
+ return (NULL);
+ ASSERT3U(ve->ve_hits, !=, 0);
+ vdev_cache_evict(vc, ve);
+ }
+
+ ve = kmem_zalloc(sizeof (vdev_cache_entry_t), KM_SLEEP);
+ ve->ve_offset = offset;
+ ve->ve_lastused = ddi_get_lbolt();
+ ve->ve_abd = abd_alloc_for_io(VCBS, B_TRUE);
+
+ avl_add(&vc->vc_offset_tree, ve);
+ avl_add(&vc->vc_lastused_tree, ve);
+
+ return (ve);
+}
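+
+/*
+ * Sizing note (illustrative, since zfs_vdev_cache_size defaults to 0 and
+ * the cache is disabled): each entry pins a full VCBS buffer, so the check
+ * above compares (entries << zfs_vdev_cache_bshift) against
+ * zfs_vdev_cache_size. A hypothetical 10MB cache with the default 64KB
+ * line size would therefore hold roughly 160 entries before the oldest
+ * (LRU) entry is evicted to make room.
+ */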
+
+static void
+vdev_cache_hit(vdev_cache_t *vc, vdev_cache_entry_t *ve, zio_t *zio)
+{
+ uint64_t cache_phase = P2PHASE(zio->io_offset, VCBS);
+
+ ASSERT(MUTEX_HELD(&vc->vc_lock));
+ ASSERT3P(ve->ve_fill_io, ==, NULL);
+
+ if (ve->ve_lastused != ddi_get_lbolt()) {
+ avl_remove(&vc->vc_lastused_tree, ve);
+ ve->ve_lastused = ddi_get_lbolt();
+ avl_add(&vc->vc_lastused_tree, ve);
+ }
+
+ ve->ve_hits++;
+ abd_copy_off(zio->io_abd, ve->ve_abd, 0, cache_phase, zio->io_size);
+}
+
+/*
+ * Fill a previously allocated cache entry with data.
+ */
+static void
+vdev_cache_fill(zio_t *fio)
+{
+ vdev_t *vd = fio->io_vd;
+ vdev_cache_t *vc = &vd->vdev_cache;
+ vdev_cache_entry_t *ve = fio->io_private;
+ zio_t *pio;
+
+ ASSERT3U(fio->io_size, ==, VCBS);
+
+ /*
+ * Add data to the cache.
+ */
+ mutex_enter(&vc->vc_lock);
+
+ ASSERT3P(ve->ve_fill_io, ==, fio);
+ ASSERT3U(ve->ve_offset, ==, fio->io_offset);
+ ASSERT3P(ve->ve_abd, ==, fio->io_abd);
+
+ ve->ve_fill_io = NULL;
+
+ /*
+ * Even if this cache line was invalidated by a missed write update,
+ * any reads that were queued up before the missed update are still
+ * valid, so we can satisfy them from this line before we evict it.
+ */
+ zio_link_t *zl = NULL;
+ while ((pio = zio_walk_parents(fio, &zl)) != NULL)
+ vdev_cache_hit(vc, ve, pio);
+
+ if (fio->io_error || ve->ve_missed_update)
+ vdev_cache_evict(vc, ve);
+
+ mutex_exit(&vc->vc_lock);
+}
+
+/*
+ * Read data from the cache. Returns B_TRUE on a cache hit, B_FALSE on a miss.
+ */
+boolean_t
+vdev_cache_read(zio_t *zio)
+{
+ vdev_cache_t *vc = &zio->io_vd->vdev_cache;
+ vdev_cache_entry_t *ve, *ve_search;
+ uint64_t cache_offset = P2ALIGN(zio->io_offset, VCBS);
+ zio_t *fio;
+ uint64_t cache_phase __maybe_unused = P2PHASE(zio->io_offset, VCBS);
+
+ ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
+
+ if (zio->io_flags & ZIO_FLAG_DONT_CACHE)
+ return (B_FALSE);
+
+ if (zio->io_size > zfs_vdev_cache_max)
+ return (B_FALSE);
+
+ /*
+ * If the I/O straddles two or more cache blocks, don't cache it.
+ */
+ if (P2BOUNDARY(zio->io_offset, zio->io_size, VCBS))
+ return (B_FALSE);
+
+ ASSERT3U(cache_phase + zio->io_size, <=, VCBS);
+
+ mutex_enter(&vc->vc_lock);
+
+ ve_search = kmem_alloc(sizeof (vdev_cache_entry_t), KM_SLEEP);
+ ve_search->ve_offset = cache_offset;
+ ve = avl_find(&vc->vc_offset_tree, ve_search, NULL);
+ kmem_free(ve_search, sizeof (vdev_cache_entry_t));
+
+ if (ve != NULL) {
+ if (ve->ve_missed_update) {
+ mutex_exit(&vc->vc_lock);
+ return (B_FALSE);
+ }
+
+ if ((fio = ve->ve_fill_io) != NULL) {
+ zio_vdev_io_bypass(zio);
+ zio_add_child(zio, fio);
+ mutex_exit(&vc->vc_lock);
+ VDCSTAT_BUMP(vdc_stat_delegations);
+ return (B_TRUE);
+ }
+
+ vdev_cache_hit(vc, ve, zio);
+ zio_vdev_io_bypass(zio);
+
+ mutex_exit(&vc->vc_lock);
+ VDCSTAT_BUMP(vdc_stat_hits);
+ return (B_TRUE);
+ }
+
+ ve = vdev_cache_allocate(zio);
+
+ if (ve == NULL) {
+ mutex_exit(&vc->vc_lock);
+ return (B_FALSE);
+ }
+
+ fio = zio_vdev_delegated_io(zio->io_vd, cache_offset,
+ ve->ve_abd, VCBS, ZIO_TYPE_READ, ZIO_PRIORITY_NOW,
+ ZIO_FLAG_DONT_CACHE, vdev_cache_fill, ve);
+
+ ve->ve_fill_io = fio;
+ zio_vdev_io_bypass(zio);
+ zio_add_child(zio, fio);
+
+ mutex_exit(&vc->vc_lock);
+ zio_nowait(fio);
+ VDCSTAT_BUMP(vdc_stat_misses);
+
+ return (B_TRUE);
+}
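+
+/*
+ * To summarize the three paths above: a read that finds an entry still
+ * being filled is attached as a child of the in-flight fill I/O
+ * (vdc_stat_delegations), a read that finds completed data is copied out
+ * directly (vdc_stat_hits), and a read that finds nothing allocates a new
+ * entry and issues the VCBS-sized fill itself (vdc_stat_misses).
+ */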
+
+/*
+ * Update cache contents upon write completion.
+ */
+void
+vdev_cache_write(zio_t *zio)
+{
+ vdev_cache_t *vc = &zio->io_vd->vdev_cache;
+ vdev_cache_entry_t *ve, ve_search;
+ uint64_t io_start = zio->io_offset;
+ uint64_t io_end = io_start + zio->io_size;
+ uint64_t min_offset = P2ALIGN(io_start, VCBS);
+ uint64_t max_offset = P2ROUNDUP(io_end, VCBS);
+ avl_index_t where;
+
+ ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
+
+ mutex_enter(&vc->vc_lock);
+
+ ve_search.ve_offset = min_offset;
+ ve = avl_find(&vc->vc_offset_tree, &ve_search, &where);
+
+ if (ve == NULL)
+ ve = avl_nearest(&vc->vc_offset_tree, where, AVL_AFTER);
+
+ while (ve != NULL && ve->ve_offset < max_offset) {
+ uint64_t start = MAX(ve->ve_offset, io_start);
+ uint64_t end = MIN(ve->ve_offset + VCBS, io_end);
+
+ if (ve->ve_fill_io != NULL) {
+ ve->ve_missed_update = 1;
+ } else {
+ abd_copy_off(ve->ve_abd, zio->io_abd,
+ start - ve->ve_offset, start - io_start,
+ end - start);
+ }
+ ve = AVL_NEXT(&vc->vc_offset_tree, ve);
+ }
+ mutex_exit(&vc->vc_lock);
+}
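+
+/*
+ * Worked example for the loop above (illustrative offsets): a 128KB write
+ * at offset 96KB spans min_offset = 64KB to max_offset = 256KB. Cached
+ * lines at offsets 64KB, 128KB and 192KB all overlap it; each copy is
+ * clipped to the intersection, i.e. the last 32KB of the line at 64KB,
+ * all 64KB of the line at 128KB, and the first 32KB of the line at 192KB.
+ * Lines still being filled are instead flagged with ve_missed_update.
+ */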
+
+void
+vdev_cache_purge(vdev_t *vd)
+{
+ vdev_cache_t *vc = &vd->vdev_cache;
+ vdev_cache_entry_t *ve;
+
+ mutex_enter(&vc->vc_lock);
+ while ((ve = avl_first(&vc->vc_offset_tree)) != NULL)
+ vdev_cache_evict(vc, ve);
+ mutex_exit(&vc->vc_lock);
+}
+
+void
+vdev_cache_init(vdev_t *vd)
+{
+ vdev_cache_t *vc = &vd->vdev_cache;
+
+ mutex_init(&vc->vc_lock, NULL, MUTEX_DEFAULT, NULL);
+
+ avl_create(&vc->vc_offset_tree, vdev_cache_offset_compare,
+ sizeof (vdev_cache_entry_t),
+ offsetof(struct vdev_cache_entry, ve_offset_node));
+
+ avl_create(&vc->vc_lastused_tree, vdev_cache_lastused_compare,
+ sizeof (vdev_cache_entry_t),
+ offsetof(struct vdev_cache_entry, ve_lastused_node));
+}
+
+void
+vdev_cache_fini(vdev_t *vd)
+{
+ vdev_cache_t *vc = &vd->vdev_cache;
+
+ vdev_cache_purge(vd);
+
+ avl_destroy(&vc->vc_offset_tree);
+ avl_destroy(&vc->vc_lastused_tree);
+
+ mutex_destroy(&vc->vc_lock);
+}
+
+void
+vdev_cache_stat_init(void)
+{
+ vdc_ksp = kstat_create("zfs", 0, "vdev_cache_stats", "misc",
+ KSTAT_TYPE_NAMED, sizeof (vdc_stats) / sizeof (kstat_named_t),
+ KSTAT_FLAG_VIRTUAL);
+ if (vdc_ksp != NULL) {
+ vdc_ksp->ks_data = &vdc_stats;
+ kstat_install(vdc_ksp);
+ }
+}
+
+void
+vdev_cache_stat_fini(void)
+{
+ if (vdc_ksp != NULL) {
+ kstat_delete(vdc_ksp);
+ vdc_ksp = NULL;
+ }
+}
+
+/* BEGIN CSTYLED */
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, cache_max, INT, ZMOD_RW,
+ "Inflate reads small than max");
+
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, cache_size, INT, ZMOD_RD,
+ "Total size of the per-disk cache");
+
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, cache_bshift, INT, ZMOD_RW,
+ "Shift size to inflate reads too");
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/zfs/vdev_draid.c b/sys/contrib/openzfs/module/zfs/vdev_draid.c
new file mode 100644
index 000000000000..a4f48cf744b0
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/vdev_draid.c
@@ -0,0 +1,2976 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2018 Intel Corporation.
+ * Copyright (c) 2020 by Lawrence Livermore National Security, LLC.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/vdev_impl.h>
+#include <sys/vdev_draid.h>
+#include <sys/vdev_raidz.h>
+#include <sys/vdev_rebuild.h>
+#include <sys/abd.h>
+#include <sys/zio.h>
+#include <sys/nvpair.h>
+#include <sys/zio_checksum.h>
+#include <sys/fs/zfs.h>
+#include <sys/fm/fs/zfs.h>
+#include <zfs_fletcher.h>
+
+#ifdef ZFS_DEBUG
+#include <sys/vdev.h> /* For vdev_xlate() in vdev_draid_io_verify() */
+#endif
+
+/*
+ * dRAID is a distributed spare implementation for ZFS. A dRAID vdev is
+ * comprised of multiple raidz redundancy groups which are spread over the
+ * dRAID children. To ensure an even distribution, and avoid hot spots, a
+ * permutation mapping is applied to the order of the dRAID children.
+ * This mixing effectively distributes the parity columns evenly over all
+ * of the disks in the dRAID.
+ *
+ * This is beneficial because, when resilvering, all of the disks can
+ * participate, thereby increasing the available IOPS and bandwidth.
+ * Furthermore, by reserving a small fraction of each child's total capacity,
+ * virtual distributed spare disks can be created. These spares similarly
+ * benefit from the performance gains of spanning all of the children. The
+ * consequence is that resilvering to a distributed spare can substantially
+ * reduce the time required to restore full parity to a pool with a failed
+ * disk.
+ *
+ * === dRAID group layout ===
+ *
+ * First, let's define a "row" in the configuration to be a 16M chunk from
+ * each physical drive at the same offset. This is the minimum allowable
+ * size since it must be possible to store a full 16M block when there is
+ * only a single data column. Next, we define a "group" to be a set of
+ * sequential disks containing both the parity and data columns. We allow
+ * groups to span multiple rows in order to align any group size to any
+ * number of physical drives. Finally, a "slice" is comprised of the rows
+ * which contain the target number of groups. The permutation mappings
+ * are applied in a round robin fashion to each slice.
+ *
+ * Given D+P drives in a group (including parity drives) and C-S physical
+ * drives (not including the spare drives), we can distribute the groups
+ * across R rows without remainder by selecting the least common multiple
+ * of D+P and C-S as the number of groups; i.e. ngroups = LCM(D+P, C-S).
+ *
+ * In the example below, there are C=14 physical drives in the configuration
+ * with S=2 drives worth of spare capacity. Each group has a width of 9
+ * which includes D=8 data and P=1 parity drive. There are 4 groups and
+ * 3 rows per slice. Each group has a size of 144M (16M * 9) and a slice
+ * size is 576M (144M * 4). When allocating from a dRAID each group is
+ * filled before moving on to the next, as shown in slice0 below.
+ *
+ * data disks (8 data + 1 parity) spares (2)
+ * +===+===+===+===+===+===+===+===+===+===+===+===+===+===+
+ * ^ | 2 | 6 | 1 | 11| 4 | 0 | 7 | 10| 8 | 9 | 13| 5 | 12| 3 | device map 0
+ * | +===+===+===+===+===+===+===+===+===+===+===+===+===+===+
+ * | | group 0 | group 1..| |
+ * | +-----------------------------------+-----------+-------|
+ * | | 0 1 2 3 4 5 6 7 8 | 36 37 38| | r
+ * | | 9 10 11 12 13 14 15 16 17| 45 46 47| | o
+ * | | 18 19 20 21 22 23 24 25 26| 54 55 56| | w
+ * | 27 28 29 30 31 32 33 34 35| 63 64 65| | 0
+ * s +-----------------------+-----------------------+-------+
+ * l | ..group 1 | group 2.. | |
+ * i +-----------------------+-----------------------+-------+
+ * c | 39 40 41 42 43 44| 72 73 74 75 76 77| | r
+ * e | 48 49 50 51 52 53| 81 82 83 84 85 86| | o
+ * 0 | 57 58 59 60 61 62| 90 91 92 93 94 95| | w
+ * | 66 67 68 69 70 71| 99 100 101 102 103 104| | 1
+ * | +-----------+-----------+-----------------------+-------+
+ * | |..group 2 | group 3 | |
+ * | +-----------+-----------+-----------------------+-------+
+ * | | 78 79 80|108 109 110 111 112 113 114 115 116| | r
+ * | | 87 88 89|117 118 119 120 121 122 123 124 125| | o
+ * | | 96 97 98|126 127 128 129 130 131 132 133 134| | w
+ * v |105 106 107|135 136 137 138 139 140 141 142 143| | 2
+ * +===+===+===+===+===+===+===+===+===+===+===+===+===+===+
+ * | 9 | 11| 12| 2 | 4 | 1 | 3 | 0 | 10| 13| 8 | 5 | 6 | 7 | device map 1
+ * s +===+===+===+===+===+===+===+===+===+===+===+===+===+===+
+ * l | group 4 | group 5..| | row 3
+ * i +-----------------------+-----------+-----------+-------|
+ * c | ..group 5 | group 6.. | | row 4
+ * e +-----------+-----------+-----------------------+-------+
+ * 1 |..group 6 | group 7 | | row 5
+ * +===+===+===+===+===+===+===+===+===+===+===+===+===+===+
+ * | 3 | 5 | 10| 8 | 6 | 11| 12| 0 | 2 | 4 | 7 | 1 | 9 | 13| device map 2
+ * s +===+===+===+===+===+===+===+===+===+===+===+===+===+===+
+ * l | group 8 | group 9..| | row 6
+ * i +-----------------------------------------------+-------|
+ * c | ..group 9 | group 10.. | | row 7
+ * e +-----------------------+-----------------------+-------+
+ * 2 |..group 10 | group 11 | | row 8
+ * +-----------+-----------------------------------+-------+
+ *
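+ * Working through the numbers in the example above: with D+P = 9 and
+ * C-S = 12, LCM(9, 12) = 36, which divides evenly into 36 / 9 = 4 groups
+ * and 36 / 12 = 3 rows, giving the 4-group by 3-row slice shown as slice0.
+ * Each group is 9 * 16M = 144M and each slice is 4 * 144M = 576M, matching
+ * the sizes quoted above.
+ *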
+ * This layout has several advantages over requiring that each row contain
+ * a whole number of groups.
+ *
+ * 1. The group count is not a relevant parameter when defining a dRAID
+ * layout. Only the group width is needed, and *all* groups will have
+ * the desired size.
+ *
+ * 2. All possible group widths (<= physical disk count) can be supported.
+ *
+ * 3. The logic within vdev_draid.c is simplified when the group width is
+ * the same for all groups (although some of the logic around computing
+ * permutation numbers and drive offsets is more complicated).
+ *
+ * N.B. The following array describes all valid dRAID permutation maps.
+ * Each row is used to generate a permutation map for a different number
+ * of children from a unique seed. The seeds were generated and carefully
+ * evaluated by the 'draid' utility in order to provide balanced mappings.
+ * In addition to the seed a checksum of the in-memory mapping is stored
+ * for verification.
+ *
+ * The imbalance ratio of a given failure (e.g. 5 disks wide, child 3 failed,
+ * with a given permutation map) is the ratio of the amounts of I/O that will
+ * be sent to the least and most busy disks when resilvering. The average
+ * imbalance ratio (of a given number of disks and permutation map) is the
+ * average of the ratios of all possible single and double disk failures.
+ *
+ * In order to achieve a low imbalance ratio the number of permutations in
+ * the mapping must be significantly larger than the number of children.
+ * For dRAID the number of permutations has been limited to 512 to minimize
+ * the map size. This does result in a gradually increasing imbalance ratio
+ * as seen in the table below. Increasing the number of permutations for
+ * larger child counts would reduce the imbalance ratio. However, in practice
+ * when there are a large number of children each child is responsible for
+ * fewer total IOs so it's less of a concern.
+ *
+ * Note these values are hard coded and must never be changed. Existing
+ * pools depend on the same mapping always being generated in order to
+ * read and write from the correct locations. Any change would make
+ * existing pools completely inaccessible.
+ */
+static const draid_map_t draid_maps[VDEV_DRAID_MAX_MAPS] = {
+ { 2, 256, 0x89ef3dabbcc7de37, 0x00000000433d433d }, /* 1.000 */
+ { 3, 256, 0x89a57f3de98121b4, 0x00000000bcd8b7b5 }, /* 1.000 */
+ { 4, 256, 0xc9ea9ec82340c885, 0x00000001819d7c69 }, /* 1.000 */
+ { 5, 256, 0xf46733b7f4d47dfd, 0x00000002a1648d74 }, /* 1.010 */
+ { 6, 256, 0x88c3c62d8585b362, 0x00000003d3b0c2c4 }, /* 1.031 */
+ { 7, 256, 0x3a65d809b4d1b9d5, 0x000000055c4183ee }, /* 1.043 */
+ { 8, 256, 0xe98930e3c5d2e90a, 0x00000006edfb0329 }, /* 1.059 */
+ { 9, 256, 0x5a5430036b982ccb, 0x00000008ceaf6934 }, /* 1.056 */
+ { 10, 256, 0x92bf389e9eadac74, 0x0000000b26668c09 }, /* 1.072 */
+ { 11, 256, 0x74ccebf1dcf3ae80, 0x0000000dd691358c }, /* 1.083 */
+ { 12, 256, 0x8847e41a1a9f5671, 0x00000010a0c63c8e }, /* 1.097 */
+ { 13, 256, 0x7481b56debf0e637, 0x0000001424121fe4 }, /* 1.100 */
+ { 14, 256, 0x559b8c44065f8967, 0x00000016ab2ff079 }, /* 1.121 */
+ { 15, 256, 0x34c49545a2ee7f01, 0x0000001a6028efd6 }, /* 1.103 */
+ { 16, 256, 0xb85f4fa81a7698f7, 0x0000001e95ff5e66 }, /* 1.111 */
+ { 17, 256, 0x6353e47b7e47aba0, 0x00000021a81fa0fe }, /* 1.133 */
+ { 18, 256, 0xaa549746b1cbb81c, 0x00000026f02494c9 }, /* 1.131 */
+ { 19, 256, 0x892e343f2f31d690, 0x00000029eb392835 }, /* 1.130 */
+ { 20, 256, 0x76914824db98cc3f, 0x0000003004f31a7c }, /* 1.141 */
+ { 21, 256, 0x4b3cbabf9cfb1d0f, 0x00000036363a2408 }, /* 1.139 */
+ { 22, 256, 0xf45c77abb4f035d4, 0x00000038dd0f3e84 }, /* 1.150 */
+ { 23, 256, 0x5e18bd7f3fd4baf4, 0x0000003f0660391f }, /* 1.174 */
+ { 24, 256, 0xa7b3a4d285d6503b, 0x000000443dfc9ff6 }, /* 1.168 */
+ { 25, 256, 0x56ac7dd967521f5a, 0x0000004b03a87eb7 }, /* 1.180 */
+ { 26, 256, 0x3a42dfda4eb880f7, 0x000000522c719bba }, /* 1.226 */
+ { 27, 256, 0xd200d2fc6b54bf60, 0x0000005760b4fdf5 }, /* 1.228 */
+ { 28, 256, 0xc52605bbd486c546, 0x0000005e00d8f74c }, /* 1.217 */
+ { 29, 256, 0xc761779e63cd762f, 0x00000067be3cd85c }, /* 1.239 */
+ { 30, 256, 0xca577b1e07f85ca5, 0x0000006f5517f3e4 }, /* 1.238 */
+ { 31, 256, 0xfd50a593c518b3d4, 0x0000007370e7778f }, /* 1.273 */
+ { 32, 512, 0xc6c87ba5b042650b, 0x000000f7eb08a156 }, /* 1.191 */
+ { 33, 512, 0xc3880d0c9d458304, 0x0000010734b5d160 }, /* 1.199 */
+ { 34, 512, 0xe920927e4d8b2c97, 0x00000118c1edbce0 }, /* 1.195 */
+ { 35, 512, 0x8da7fcda87bde316, 0x0000012a3e9f9110 }, /* 1.201 */
+ { 36, 512, 0xcf09937491514a29, 0x0000013bd6a24bef }, /* 1.194 */
+ { 37, 512, 0x9b5abbf345cbd7cc, 0x0000014b9d90fac3 }, /* 1.237 */
+ { 38, 512, 0x506312a44668d6a9, 0x0000015e1b5f6148 }, /* 1.242 */
+ { 39, 512, 0x71659ede62b4755f, 0x00000173ef029bcd }, /* 1.231 */
+ { 40, 512, 0xa7fde73fb74cf2d7, 0x000001866fb72748 }, /* 1.233 */
+ { 41, 512, 0x19e8b461a1dea1d3, 0x000001a046f76b23 }, /* 1.271 */
+ { 42, 512, 0x031c9b868cc3e976, 0x000001afa64c49d3 }, /* 1.263 */
+ { 43, 512, 0xbaa5125faa781854, 0x000001c76789e278 }, /* 1.270 */
+ { 44, 512, 0x4ed55052550d721b, 0x000001d800ccd8eb }, /* 1.281 */
+ { 45, 512, 0x0fd63ddbdff90677, 0x000001f08ad59ed2 }, /* 1.282 */
+ { 46, 512, 0x36d66546de7fdd6f, 0x000002016f09574b }, /* 1.286 */
+ { 47, 512, 0x99f997e7eafb69d7, 0x0000021e42e47cb6 }, /* 1.329 */
+ { 48, 512, 0xbecd9c2571312c5d, 0x000002320fe2872b }, /* 1.286 */
+ { 49, 512, 0xd97371329e488a32, 0x0000024cd73f2ca7 }, /* 1.322 */
+ { 50, 512, 0x30e9b136670749ee, 0x000002681c83b0e0 }, /* 1.335 */
+ { 51, 512, 0x11ad6bc8f47aaeb4, 0x0000027e9261b5d5 }, /* 1.305 */
+ { 52, 512, 0x68e445300af432c1, 0x0000029aa0eb7dbf }, /* 1.330 */
+ { 53, 512, 0x910fb561657ea98c, 0x000002b3dca04853 }, /* 1.365 */
+ { 54, 512, 0xd619693d8ce5e7a5, 0x000002cc280e9c97 }, /* 1.334 */
+ { 55, 512, 0x24e281f564dbb60a, 0x000002e9fa842713 }, /* 1.364 */
+ { 56, 512, 0x947a7d3bdaab44c5, 0x000003046680f72e }, /* 1.374 */
+ { 57, 512, 0x2d44fec9c093e0de, 0x00000324198ba810 }, /* 1.363 */
+ { 58, 512, 0x87743c272d29bb4c, 0x0000033ec48c9ac9 }, /* 1.401 */
+ { 59, 512, 0x96aa3b6f67f5d923, 0x0000034faead902c }, /* 1.392 */
+ { 60, 512, 0x94a4f1faf520b0d3, 0x0000037d713ab005 }, /* 1.360 */
+ { 61, 512, 0xb13ed3a272f711a2, 0x00000397368f3cbd }, /* 1.396 */
+ { 62, 512, 0x3b1b11805fa4a64a, 0x000003b8a5e2840c }, /* 1.453 */
+ { 63, 512, 0x4c74caad9172ba71, 0x000003d4be280290 }, /* 1.437 */
+ { 64, 512, 0x035ff643923dd29e, 0x000003fad6c355e1 }, /* 1.402 */
+ { 65, 512, 0x768e9171b11abd3c, 0x0000040eb07fed20 }, /* 1.459 */
+ { 66, 512, 0x75880e6f78a13ddd, 0x000004433d6acf14 }, /* 1.423 */
+ { 67, 512, 0x910b9714f698a877, 0x00000451ea65d5db }, /* 1.447 */
+ { 68, 512, 0x87f5db6f9fdcf5c7, 0x000004732169e3f7 }, /* 1.450 */
+ { 69, 512, 0x836d4968fbaa3706, 0x000004954068a380 }, /* 1.455 */
+ { 70, 512, 0xc567d73a036421ab, 0x000004bd7cb7bd3d }, /* 1.463 */
+ { 71, 512, 0x619df40f240b8fed, 0x000004e376c2e972 }, /* 1.463 */
+ { 72, 512, 0x42763a680d5bed8e, 0x000005084275c680 }, /* 1.452 */
+ { 73, 512, 0x5866f064b3230431, 0x0000052906f2c9ab }, /* 1.498 */
+ { 74, 512, 0x9fa08548b1621a44, 0x0000054708019247 }, /* 1.526 */
+ { 75, 512, 0xb6053078ce0fc303, 0x00000572cc5c72b0 }, /* 1.491 */
+ { 76, 512, 0x4a7aad7bf3890923, 0x0000058e987bc8e9 }, /* 1.470 */
+ { 77, 512, 0xe165613fd75b5a53, 0x000005c20473a211 }, /* 1.527 */
+ { 78, 512, 0x3ff154ac878163a6, 0x000005d659194bf3 }, /* 1.509 */
+ { 79, 512, 0x24b93ade0aa8a532, 0x0000060a201c4f8e }, /* 1.569 */
+ { 80, 512, 0xc18e2d14cd9bb554, 0x0000062c55cfe48c }, /* 1.555 */
+ { 81, 512, 0x98cc78302feb58b6, 0x0000066656a07194 }, /* 1.509 */
+ { 82, 512, 0xc6c5fd5a2abc0543, 0x0000067cff94fbf8 }, /* 1.596 */
+ { 83, 512, 0xa7962f514acbba21, 0x000006ab7b5afa2e }, /* 1.568 */
+ { 84, 512, 0xba02545069ddc6dc, 0x000006d19861364f }, /* 1.541 */
+ { 85, 512, 0x447c73192c35073e, 0x000006fce315ce35 }, /* 1.623 */
+ { 86, 512, 0x48beef9e2d42b0c2, 0x00000720a8e38b6b }, /* 1.620 */
+ { 87, 512, 0x4874cf98541a35e0, 0x00000758382a2273 }, /* 1.597 */
+ { 88, 512, 0xad4cf8333a31127a, 0x00000781e1651b1b }, /* 1.575 */
+ { 89, 512, 0x47ae4859d57888c1, 0x000007b27edbe5bc }, /* 1.627 */
+ { 90, 512, 0x06f7723cfe5d1891, 0x000007dc2a96d8eb }, /* 1.596 */
+ { 91, 512, 0xd4e44218d660576d, 0x0000080ac46f02d5 }, /* 1.622 */
+ { 92, 512, 0x7066702b0d5be1f2, 0x00000832c96d154e }, /* 1.695 */
+ { 93, 512, 0x011209b4f9e11fb9, 0x0000085eefda104c }, /* 1.605 */
+ { 94, 512, 0x47ffba30a0b35708, 0x00000899badc32dc }, /* 1.625 */
+ { 95, 512, 0x1a95a6ac4538aaa8, 0x000008b6b69a42b2 }, /* 1.687 */
+ { 96, 512, 0xbda2b239bb2008eb, 0x000008f22d2de38a }, /* 1.621 */
+ { 97, 512, 0x7ffa0bea90355c6c, 0x0000092e5b23b816 }, /* 1.699 */
+ { 98, 512, 0x1d56ba34be426795, 0x0000094f482e5d1b }, /* 1.688 */
+ { 99, 512, 0x0aa89d45c502e93d, 0x00000977d94a98ce }, /* 1.642 */
+ { 100, 512, 0x54369449f6857774, 0x000009c06c9b34cc }, /* 1.683 */
+ { 101, 512, 0xf7d4dd8445b46765, 0x000009e5dc542259 }, /* 1.755 */
+ { 102, 512, 0xfa8866312f169469, 0x00000a16b54eae93 }, /* 1.692 */
+ { 103, 512, 0xd8a5aea08aef3ff9, 0x00000a381d2cbfe7 }, /* 1.747 */
+ { 104, 512, 0x66bcd2c3d5f9ef0e, 0x00000a8191817be7 }, /* 1.751 */
+ { 105, 512, 0x3fb13a47a012ec81, 0x00000ab562b9a254 }, /* 1.751 */
+ { 106, 512, 0x43100f01c9e5e3ca, 0x00000aeee84c185f }, /* 1.726 */
+ { 107, 512, 0xca09c50ccee2d054, 0x00000b1c359c047d }, /* 1.788 */
+ { 108, 512, 0xd7176732ac503f9b, 0x00000b578bc52a73 }, /* 1.740 */
+ { 109, 512, 0xed206e51f8d9422d, 0x00000b8083e0d960 }, /* 1.780 */
+ { 110, 512, 0x17ead5dc6ba0dcd6, 0x00000bcfb1a32ca8 }, /* 1.836 */
+ { 111, 512, 0x5f1dc21e38a969eb, 0x00000c0171becdd6 }, /* 1.778 */
+ { 112, 512, 0xddaa973de33ec528, 0x00000c3edaba4b95 }, /* 1.831 */
+ { 113, 512, 0x2a5eccd7735a3630, 0x00000c630664e7df }, /* 1.825 */
+ { 114, 512, 0xafcccee5c0b71446, 0x00000cb65392f6e4 }, /* 1.826 */
+ { 115, 512, 0x8fa30c5e7b147e27, 0x00000cd4db391e55 }, /* 1.843 */
+ { 116, 512, 0x5afe0711fdfafd82, 0x00000d08cb4ec35d }, /* 1.826 */
+ { 117, 512, 0x533a6090238afd4c, 0x00000d336f115d1b }, /* 1.803 */
+ { 118, 512, 0x90cf11b595e39a84, 0x00000d8e041c2048 }, /* 1.857 */
+ { 119, 512, 0x0d61a3b809444009, 0x00000dcb798afe35 }, /* 1.877 */
+ { 120, 512, 0x7f34da0f54b0d114, 0x00000df3922664e1 }, /* 1.849 */
+ { 121, 512, 0xa52258d5b72f6551, 0x00000e4d37a9872d }, /* 1.867 */
+ { 122, 512, 0xc1de54d7672878db, 0x00000e6583a94cf6 }, /* 1.978 */
+ { 123, 512, 0x1d03354316a414ab, 0x00000ebffc50308d }, /* 1.947 */
+ { 124, 512, 0xcebdcc377665412c, 0x00000edee1997cea }, /* 1.865 */
+ { 125, 512, 0x4ddd4c04b1a12344, 0x00000f21d64b373f }, /* 1.881 */
+ { 126, 512, 0x64fc8f94e3973658, 0x00000f8f87a8896b }, /* 1.882 */
+ { 127, 512, 0x68765f78034a334e, 0x00000fb8fe62197e }, /* 1.867 */
+ { 128, 512, 0xaf36b871a303e816, 0x00000fec6f3afb1e }, /* 1.972 */
+ { 129, 512, 0x2a4cbf73866c3a28, 0x00001027febfe4e5 }, /* 1.896 */
+ { 130, 512, 0x9cb128aacdcd3b2f, 0x0000106aa8ac569d }, /* 1.965 */
+ { 131, 512, 0x5511d41c55869124, 0x000010bbd755ddf1 }, /* 1.963 */
+ { 132, 512, 0x42f92461937f284a, 0x000010fb8bceb3b5 }, /* 1.925 */
+ { 133, 512, 0xe2d89a1cf6f1f287, 0x0000114cf5331e34 }, /* 1.862 */
+ { 134, 512, 0xdc631a038956200e, 0x0000116428d2adc5 }, /* 2.042 */
+ { 135, 512, 0xb2e5ac222cd236be, 0x000011ca88e4d4d2 }, /* 1.935 */
+ { 136, 512, 0xbc7d8236655d88e7, 0x000011e39cb94e66 }, /* 2.005 */
+ { 137, 512, 0x073e02d88d2d8e75, 0x0000123136c7933c }, /* 2.041 */
+ { 138, 512, 0x3ddb9c3873166be0, 0x00001280e4ec6d52 }, /* 1.997 */
+ { 139, 512, 0x7d3b1a845420e1b5, 0x000012c2e7cd6a44 }, /* 1.996 */
+ { 140, 512, 0x60102308aa7b2a6c, 0x000012fc490e6c7d }, /* 2.053 */
+ { 141, 512, 0xdb22bb2f9eb894aa, 0x00001343f5a85a1a }, /* 1.971 */
+ { 142, 512, 0xd853f879a13b1606, 0x000013bb7d5f9048 }, /* 2.018 */
+ { 143, 512, 0x001620a03f804b1d, 0x000013e74cc794fd }, /* 1.961 */
+ { 144, 512, 0xfdb52dda76fbf667, 0x00001442d2f22480 }, /* 2.046 */
+ { 145, 512, 0xa9160110f66e24ff, 0x0000144b899f9dbb }, /* 1.968 */
+ { 146, 512, 0x77306a30379ae03b, 0x000014cb98eb1f81 }, /* 2.143 */
+ { 147, 512, 0x14f5985d2752319d, 0x000014feab821fc9 }, /* 2.064 */
+ { 148, 512, 0xa4b8ff11de7863f8, 0x0000154a0e60b9c9 }, /* 2.023 */
+ { 149, 512, 0x44b345426455c1b3, 0x000015999c3c569c }, /* 2.136 */
+ { 150, 512, 0x272677826049b46c, 0x000015c9697f4b92 }, /* 2.063 */
+ { 151, 512, 0x2f9216e2cd74fe40, 0x0000162b1f7bbd39 }, /* 1.974 */
+ { 152, 512, 0x706ae3e763ad8771, 0x00001661371c55e1 }, /* 2.210 */
+ { 153, 512, 0xf7fd345307c2480e, 0x000016e251f28b6a }, /* 2.006 */
+ { 154, 512, 0x6e94e3d26b3139eb, 0x000016f2429bb8c6 }, /* 2.193 */
+ { 155, 512, 0x5458bbfbb781fcba, 0x0000173efdeca1b9 }, /* 2.163 */
+ { 156, 512, 0xa80e2afeccd93b33, 0x000017bfdcb78adc }, /* 2.046 */
+ { 157, 512, 0x1e4ccbb22796cf9d, 0x00001826fdcc39c9 }, /* 2.084 */
+ { 158, 512, 0x8fba4b676aaa3663, 0x00001841a1379480 }, /* 2.264 */
+ { 159, 512, 0xf82b843814b315fa, 0x000018886e19b8a3 }, /* 2.074 */
+ { 160, 512, 0x7f21e920ecf753a3, 0x0000191812ca0ea7 }, /* 2.282 */
+ { 161, 512, 0x48bb8ea2c4caa620, 0x0000192f310faccf }, /* 2.148 */
+ { 162, 512, 0x5cdb652b4952c91b, 0x0000199e1d7437c7 }, /* 2.355 */
+ { 163, 512, 0x6ac1ba6f78c06cd4, 0x000019cd11f82c70 }, /* 2.164 */
+ { 164, 512, 0x9faf5f9ca2669a56, 0x00001a18d5431f6a }, /* 2.393 */
+ { 165, 512, 0xaa57e9383eb01194, 0x00001a9e7d253d85 }, /* 2.178 */
+ { 166, 512, 0x896967bf495c34d2, 0x00001afb8319b9fc }, /* 2.334 */
+ { 167, 512, 0xdfad5f05de225f1b, 0x00001b3a59c3093b }, /* 2.266 */
+ { 168, 512, 0xfd299a99f9f2abdd, 0x00001bb6f1a10799 }, /* 2.304 */
+ { 169, 512, 0xdda239e798fe9fd4, 0x00001bfae0c9692d }, /* 2.218 */
+ { 170, 512, 0x5fca670414a32c3e, 0x00001c22129dbcff }, /* 2.377 */
+ { 171, 512, 0x1bb8934314b087de, 0x00001c955db36cd0 }, /* 2.155 */
+ { 172, 512, 0xd96394b4b082200d, 0x00001cfc8619b7e6 }, /* 2.404 */
+ { 173, 512, 0xb612a7735b1c8cbc, 0x00001d303acdd585 }, /* 2.205 */
+ { 174, 512, 0x28e7430fe5875fe1, 0x00001d7ed5b3697d }, /* 2.359 */
+ { 175, 512, 0x5038e89efdd981b9, 0x00001dc40ec35c59 }, /* 2.158 */
+ { 176, 512, 0x075fd78f1d14db7c, 0x00001e31c83b4a2b }, /* 2.614 */
+ { 177, 512, 0xc50fafdb5021be15, 0x00001e7cdac82fbc }, /* 2.239 */
+ { 178, 512, 0xe6dc7572ce7b91c7, 0x00001edd8bb454fc }, /* 2.493 */
+ { 179, 512, 0x21f7843e7beda537, 0x00001f3a8e019d6c }, /* 2.327 */
+ { 180, 512, 0xc83385e20b43ec82, 0x00001f70735ec137 }, /* 2.231 */
+ { 181, 512, 0xca818217dddb21fd, 0x0000201ca44c5a3c }, /* 2.237 */
+ { 182, 512, 0xe6035defea48f933, 0x00002038e3346658 }, /* 2.691 */
+ { 183, 512, 0x47262a4f953dac5a, 0x000020c2e554314e }, /* 2.170 */
+ { 184, 512, 0xe24c7246260873ea, 0x000021197e618d64 }, /* 2.600 */
+ { 185, 512, 0xeef6b57c9b58e9e1, 0x0000217ea48ecddc }, /* 2.391 */
+ { 186, 512, 0x2becd3346e386142, 0x000021c496d4a5f9 }, /* 2.677 */
+ { 187, 512, 0x63c6207bdf3b40a3, 0x0000220e0f2eec0c }, /* 2.410 */
+ { 188, 512, 0x3056ce8989767d4b, 0x0000228eb76cd137 }, /* 2.776 */
+ { 189, 512, 0x91af61c307cee780, 0x000022e17e2ea501 }, /* 2.266 */
+ { 190, 512, 0xda359da225f6d54f, 0x00002358a2debc19 }, /* 2.717 */
+ { 191, 512, 0x0a5f7a2a55607ba0, 0x0000238a79dac18c }, /* 2.474 */
+ { 192, 512, 0x27bb75bf5224638a, 0x00002403a58e2351 }, /* 2.673 */
+ { 193, 512, 0x1ebfdb94630f5d0f, 0x00002492a10cb339 }, /* 2.420 */
+ { 194, 512, 0x6eae5e51d9c5f6fb, 0x000024ce4bf98715 }, /* 2.898 */
+ { 195, 512, 0x08d903b4daedc2e0, 0x0000250d1e15886c }, /* 2.363 */
+ { 196, 512, 0xc722a2f7fa7cd686, 0x0000258a99ed0c9e }, /* 2.747 */
+ { 197, 512, 0x8f71faf0e54e361d, 0x000025dee11976f5 }, /* 2.531 */
+ { 198, 512, 0x87f64695c91a54e7, 0x0000264e00a43da0 }, /* 2.707 */
+ { 199, 512, 0xc719cbac2c336b92, 0x000026d327277ac1 }, /* 2.315 */
+ { 200, 512, 0xe7e647afaf771ade, 0x000027523a5c44bf }, /* 3.012 */
+ { 201, 512, 0x12d4b5c38ce8c946, 0x0000273898432545 }, /* 2.378 */
+ { 202, 512, 0xf2e0cd4067bdc94a, 0x000027e47bb2c935 }, /* 2.969 */
+ { 203, 512, 0x21b79f14d6d947d3, 0x0000281e64977f0d }, /* 2.594 */
+ { 204, 512, 0x515093f952f18cd6, 0x0000289691a473fd }, /* 2.763 */
+ { 205, 512, 0xd47b160a1b1022c8, 0x00002903e8b52411 }, /* 2.457 */
+ { 206, 512, 0xc02fc96684715a16, 0x0000297515608601 }, /* 3.057 */
+ { 207, 512, 0xef51e68efba72ed0, 0x000029ef73604804 }, /* 2.590 */
+ { 208, 512, 0x9e3be6e5448b4f33, 0x00002a2846ed074b }, /* 3.047 */
+ { 209, 512, 0x81d446c6d5fec063, 0x00002a92ca693455 }, /* 2.676 */
+ { 210, 512, 0xff215de8224e57d5, 0x00002b2271fe3729 }, /* 2.993 */
+ { 211, 512, 0xe2524d9ba8f69796, 0x00002b64b99c3ba2 }, /* 2.457 */
+ { 212, 512, 0xf6b28e26097b7e4b, 0x00002bd768b6e068 }, /* 3.182 */
+ { 213, 512, 0x893a487f30ce1644, 0x00002c67f722b4b2 }, /* 2.563 */
+ { 214, 512, 0x386566c3fc9871df, 0x00002cc1cf8b4037 }, /* 3.025 */
+ { 215, 512, 0x1e0ed78edf1f558a, 0x00002d3948d36c7f }, /* 2.730 */
+ { 216, 512, 0xe3bc20c31e61f113, 0x00002d6d6b12e025 }, /* 3.036 */
+ { 217, 512, 0xd6c3ad2e23021882, 0x00002deff7572241 }, /* 2.722 */
+ { 218, 512, 0xb4a9f95cf0f69c5a, 0x00002e67d537aa36 }, /* 3.356 */
+ { 219, 512, 0x6e98ed6f6c38e82f, 0x00002e9720626789 }, /* 2.697 */
+ { 220, 512, 0x2e01edba33fddac7, 0x00002f407c6b0198 }, /* 2.979 */
+ { 221, 512, 0x559d02e1f5f57ccc, 0x00002fb6a5ab4f24 }, /* 2.858 */
+ { 222, 512, 0xac18f5a916adcd8e, 0x0000304ae1c5c57e }, /* 3.258 */
+ { 223, 512, 0x15789fbaddb86f4b, 0x0000306f6e019c78 }, /* 2.693 */
+ { 224, 512, 0xf4a9c36d5bc4c408, 0x000030da40434213 }, /* 3.259 */
+ { 225, 512, 0xf640f90fd2727f44, 0x00003189ed37b90c }, /* 2.733 */
+ { 226, 512, 0xb5313d390d61884a, 0x000031e152616b37 }, /* 3.235 */
+ { 227, 512, 0x4bae6b3ce9160939, 0x0000321f40aeac42 }, /* 2.983 */
+ { 228, 512, 0x838c34480f1a66a1, 0x000032f389c0f78e }, /* 3.308 */
+ { 229, 512, 0xb1c4a52c8e3d6060, 0x0000330062a40284 }, /* 2.715 */
+ { 230, 512, 0xe0f1110c6d0ed822, 0x0000338be435644f }, /* 3.540 */
+ { 231, 512, 0x9f1a8ccdcea68d4b, 0x000034045a4e97e1 }, /* 2.779 */
+ { 232, 512, 0x3261ed62223f3099, 0x000034702cfc401c }, /* 3.084 */
+ { 233, 512, 0xf2191e2311022d65, 0x00003509dd19c9fc }, /* 2.987 */
+ { 234, 512, 0xf102a395c2033abc, 0x000035654dc96fae }, /* 3.341 */
+ { 235, 512, 0x11fe378f027906b6, 0x000035b5193b0264 }, /* 2.793 */
+ { 236, 512, 0xf777f2c026b337aa, 0x000036704f5d9297 }, /* 3.518 */
+ { 237, 512, 0x1b04e9c2ee143f32, 0x000036dfbb7af218 }, /* 2.962 */
+ { 238, 512, 0x2fcec95266f9352c, 0x00003785c8df24a9 }, /* 3.196 */
+ { 239, 512, 0xfe2b0e47e427dd85, 0x000037cbdf5da729 }, /* 2.914 */
+ { 240, 512, 0x72b49bf2225f6c6d, 0x0000382227c15855 }, /* 3.408 */
+ { 241, 512, 0x50486b43df7df9c7, 0x0000389b88be6453 }, /* 2.903 */
+ { 242, 512, 0x5192a3e53181c8ab, 0x000038ddf3d67263 }, /* 3.778 */
+ { 243, 512, 0xe9f5d8365296fd5e, 0x0000399f1c6c9e9c }, /* 3.026 */
+ { 244, 512, 0xc740263f0301efa8, 0x00003a147146512d }, /* 3.347 */
+ { 245, 512, 0x23cd0f2b5671e67d, 0x00003ab10bcc0d9d }, /* 3.212 */
+ { 246, 512, 0x002ccc7e5cd41390, 0x00003ad6cd14a6c0 }, /* 3.482 */
+ { 247, 512, 0x9aafb3c02544b31b, 0x00003b8cb8779fb0 }, /* 3.146 */
+ { 248, 512, 0x72ba07a78b121999, 0x00003c24142a5a3f }, /* 3.626 */
+ { 249, 512, 0x3d784aa58edfc7b4, 0x00003cd084817d99 }, /* 2.952 */
+ { 250, 512, 0xaab750424d8004af, 0x00003d506a8e098e }, /* 3.463 */
+ { 251, 512, 0x84403fcf8e6b5ca2, 0x00003d4c54c2aec4 }, /* 3.131 */
+ { 252, 512, 0x71eb7455ec98e207, 0x00003e655715cf2c }, /* 3.538 */
+ { 253, 512, 0xd752b4f19301595b, 0x00003ecd7b2ca5ac }, /* 2.974 */
+ { 254, 512, 0xc4674129750499de, 0x00003e99e86d3e95 }, /* 3.843 */
+ { 255, 512, 0x9772baff5cd12ef5, 0x00003f895c019841 }, /* 3.088 */
+};
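+
+/*
+ * Reading the table above: for example, { 14, 256, 0x559b8c44065f8967,
+ * 0x00000016ab2ff079 } is the map used for a 14-child dRAID. 256
+ * permutation rows are generated from the 64-bit seed, the resulting
+ * in-memory array must fletcher-4 checksum to the fourth value, and the
+ * trailing comment (1.121) is the average imbalance ratio described above.
+ */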
+
+/*
+ * Verify the map is valid. Each device index must appear exactly
+ * once in every row, and the permutation array checksum must match.
+ */
+static int
+verify_perms(uint8_t *perms, uint64_t children, uint64_t nperms,
+ uint64_t checksum)
+{
+ int countssz = sizeof (uint16_t) * children;
+ uint16_t *counts = kmem_zalloc(countssz, KM_SLEEP);
+
+ for (int i = 0; i < nperms; i++) {
+ for (int j = 0; j < children; j++) {
+ uint8_t val = perms[(i * children) + j];
+
+ if (val >= children || counts[val] != i) {
+ kmem_free(counts, countssz);
+ return (EINVAL);
+ }
+
+ counts[val]++;
+ }
+ }
+
+ if (checksum != 0) {
+ int permssz = sizeof (uint8_t) * children * nperms;
+ zio_cksum_t cksum;
+
+ fletcher_4_native_varsize(perms, permssz, &cksum);
+
+ if (checksum != cksum.zc_word[0]) {
+ kmem_free(counts, countssz);
+ return (ECKSUM);
+ }
+ }
+
+ kmem_free(counts, countssz);
+
+ return (0);
+}
+
+/*
+ * Generate the permutation array for the draid_map_t. These maps control
+ * the placement of all data in a dRAID. Therefore it's critical that the
+ * seed always generates the same mapping. We provide our own pseudo-random
+ * number generator for this purpose.
+ */
+int
+vdev_draid_generate_perms(const draid_map_t *map, uint8_t **permsp)
+{
+ VERIFY3U(map->dm_children, >=, VDEV_DRAID_MIN_CHILDREN);
+ VERIFY3U(map->dm_children, <=, VDEV_DRAID_MAX_CHILDREN);
+ VERIFY3U(map->dm_seed, !=, 0);
+ VERIFY3U(map->dm_nperms, !=, 0);
+ VERIFY3P(map->dm_perms, ==, NULL);
+
+#ifdef _KERNEL
+ /*
+ * The kernel code always provides both a map_seed and checksum.
+ * Only the tests/zfs-tests/cmd/draid/draid.c utility will provide
+ * a zero checksum when generating new candidate maps.
+ */
+ VERIFY3U(map->dm_checksum, !=, 0);
+#endif
+ uint64_t children = map->dm_children;
+ uint64_t nperms = map->dm_nperms;
+ int rowsz = sizeof (uint8_t) * children;
+ int permssz = rowsz * nperms;
+ uint8_t *perms;
+
+ /* Allocate the permutation array */
+ perms = vmem_alloc(permssz, KM_SLEEP);
+
+ /* Set up an initial row with a known pattern */
+ uint8_t *initial_row = kmem_alloc(rowsz, KM_SLEEP);
+ for (int i = 0; i < children; i++)
+ initial_row[i] = i;
+
+ uint64_t draid_seed[2] = { VDEV_DRAID_SEED, map->dm_seed };
+ uint8_t *current_row, *previous_row = initial_row;
+
+ /*
+ * Perform a Fisher-Yates shuffle of each row using the previous
+ * row as the starting point. An initial_row with known pattern
+ * is used as the input for the first row.
+ */
+ for (int i = 0; i < nperms; i++) {
+ current_row = &perms[i * children];
+ memcpy(current_row, previous_row, rowsz);
+
+ for (int j = children - 1; j > 0; j--) {
+ uint64_t k = vdev_draid_rand(draid_seed) % (j + 1);
+ uint8_t val = current_row[j];
+ current_row[j] = current_row[k];
+ current_row[k] = val;
+ }
+
+ previous_row = current_row;
+ }
+
+ kmem_free(initial_row, rowsz);
+
+ int error = verify_perms(perms, children, nperms, map->dm_checksum);
+ if (error) {
+ vmem_free(perms, permssz);
+ return (error);
+ }
+
+ *permsp = perms;
+
+ return (0);
+}
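+/*
+ * Illustrative usage sketch (not part of the original code): how a caller
+ * might pair vdev_draid_lookup_map() below with vdev_draid_generate_perms()
+ * above. The 'children' value and the error handling are hypothetical.
+ *
+ *	const draid_map_t *map;
+ *	uint8_t *perms;
+ *
+ *	if (vdev_draid_lookup_map(children, &map) == 0 &&
+ *	    vdev_draid_generate_perms(map, &perms) == 0) {
+ *		... use the dm_nperms x dm_children permutation rows ...
+ *		vmem_free(perms, sizeof (uint8_t) *
+ *		    map->dm_children * map->dm_nperms);
+ *	}
+ *
+ * Because the generator is seeded from dm_seed, the same rows are produced
+ * on every import, which is what keeps the on-disk layout reproducible.
+ */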
+
+/*
+ * Lookup the fixed draid_map_t for the requested number of children.
+ */
+int
+vdev_draid_lookup_map(uint64_t children, const draid_map_t **mapp)
+{
+ for (int i = 0; i < VDEV_DRAID_MAX_MAPS; i++) {
+ if (draid_maps[i].dm_children == children) {
+ *mapp = &draid_maps[i];
+ return (0);
+ }
+ }
+
+ return (ENOENT);
+}
+
+/*
+ * Lookup the permutation array and iteration id for the provided offset.
+ */
+static void
+vdev_draid_get_perm(vdev_draid_config_t *vdc, uint64_t pindex,
+ uint8_t **base, uint64_t *iter)
+{
+ uint64_t ncols = vdc->vdc_children;
+ uint64_t poff = pindex % (vdc->vdc_nperms * ncols);
+
+ *base = vdc->vdc_perms + (poff / ncols) * ncols;
+ *iter = poff % ncols;
+}
+
+static inline uint64_t
+vdev_draid_permute_id(vdev_draid_config_t *vdc,
+ uint8_t *base, uint64_t iter, uint64_t index)
+{
+ return ((base[index] + iter) % vdc->vdc_children);
+}
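+/*
+ * Worked example (illustrative only; the values are hypothetical and smaller
+ * than any real dRAID): with ncols = 5 and nperms = 64, a pindex of 137
+ * gives poff = 137 % 320 = 137, so base points at permutation row
+ * 137 / 5 = 27 and iter = 137 % 5 = 2. If that row is { 3, 0, 4, 1, 2 },
+ * then vdev_draid_permute_id() maps logical position 0 to child
+ * (3 + 2) % 5 = 0, position 1 to child 2, position 2 to child 1, and so on.
+ */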
+
+/*
+ * Return the asize, which is the psize rounded up to a full group width.
+ * i.e. vdev_draid_psize_to_asize().
+ */
+static uint64_t
+vdev_draid_asize(vdev_t *vd, uint64_t psize)
+{
+ vdev_draid_config_t *vdc = vd->vdev_tsd;
+ uint64_t ashift = vd->vdev_ashift;
+
+ ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops);
+
+ uint64_t rows = ((psize - 1) / (vdc->vdc_ndata << ashift)) + 1;
+ uint64_t asize = (rows * vdc->vdc_groupwidth) << ashift;
+
+ ASSERT3U(asize, !=, 0);
+ ASSERT3U(asize % (vdc->vdc_groupwidth), ==, 0);
+
+ return (asize);
+}
+
+/*
+ * Deflate the asize to the psize; this includes stripping parity.
+ */
+uint64_t
+vdev_draid_asize_to_psize(vdev_t *vd, uint64_t asize)
+{
+ vdev_draid_config_t *vdc = vd->vdev_tsd;
+
+ ASSERT0(asize % vdc->vdc_groupwidth);
+
+ return ((asize / vdc->vdc_groupwidth) * vdc->vdc_ndata);
+}
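+/*
+ * Worked example (hypothetical draid2:8d layout, ashift = 12): a 20 KiB
+ * psize (5 x 4 KiB sectors) needs rows = ((20480 - 1) / (8 << 12)) + 1 = 1,
+ * so vdev_draid_asize() returns (1 * 10) << 12 = 40 KiB. Converting back,
+ * vdev_draid_asize_to_psize(40 KiB) = 40960 / 10 * 8 = 32 KiB, i.e. the
+ * psize rounded up to a full data stripe; the unused data sectors become
+ * the zero-filled skip sectors handled by the map allocation code below.
+ */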
+
+/*
+ * Convert a logical offset to the corresponding group number.
+ */
+static uint64_t
+vdev_draid_offset_to_group(vdev_t *vd, uint64_t offset)
+{
+ vdev_draid_config_t *vdc = vd->vdev_tsd;
+
+ ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops);
+
+ return (offset / vdc->vdc_groupsz);
+}
+
+/*
+ * Convert a group number to the logical starting offset for that group.
+ */
+static uint64_t
+vdev_draid_group_to_offset(vdev_t *vd, uint64_t group)
+{
+ vdev_draid_config_t *vdc = vd->vdev_tsd;
+
+ ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops);
+
+ return (group * vdc->vdc_groupsz);
+}
+
+
+static void
+vdev_draid_map_free_vsd(zio_t *zio)
+{
+ raidz_map_t *rm = zio->io_vsd;
+
+ ASSERT0(rm->rm_freed);
+ rm->rm_freed = B_TRUE;
+
+ if (rm->rm_reports == 0) {
+ vdev_raidz_map_free(rm);
+ }
+}
+
+/*ARGSUSED*/
+static void
+vdev_draid_cksum_free(void *arg, size_t ignored)
+{
+ raidz_map_t *rm = arg;
+
+ ASSERT3U(rm->rm_reports, >, 0);
+
+ if (--rm->rm_reports == 0 && rm->rm_freed)
+ vdev_raidz_map_free(rm);
+}
+
+static void
+vdev_draid_cksum_finish(zio_cksum_report_t *zcr, const abd_t *good_data)
+{
+ raidz_map_t *rm = zcr->zcr_cbdata;
+ const size_t c = zcr->zcr_cbinfo;
+ uint64_t skip_size = zcr->zcr_sector;
+ uint64_t parity_size;
+ size_t x, offset, size;
+
+ if (good_data == NULL) {
+ zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
+ return;
+ }
+
+ /*
+ * Detailed cksum reporting is currently only supported for single
+ * row draid mappings, which covers the vast majority of zios. Only
+ * a dRAID zio which spans groups will have multiple rows.
+ */
+ if (rm->rm_nrows != 1) {
+ zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
+ return;
+ }
+
+ raidz_row_t *rr = rm->rm_row[0];
+ const abd_t *good = NULL;
+ const abd_t *bad = rr->rr_col[c].rc_abd;
+
+ if (c < rr->rr_firstdatacol) {
+ /*
+ * The first time through, calculate the parity blocks for
+ * the good data (this relies on the fact that the good
+ * data never changes for a given logical zio)
+ */
+ if (rr->rr_col[0].rc_gdata == NULL) {
+ abd_t *bad_parity[VDEV_DRAID_MAXPARITY];
+
+ /*
+ * Set up the rr_col[]s to generate the parity for
+ * good_data, first saving the parity bufs and
+ * replacing them with buffers to hold the result.
+ */
+ for (x = 0; x < rr->rr_firstdatacol; x++) {
+ bad_parity[x] = rr->rr_col[x].rc_abd;
+ rr->rr_col[x].rc_abd = rr->rr_col[x].rc_gdata =
+ abd_alloc_sametype(rr->rr_col[x].rc_abd,
+ rr->rr_col[x].rc_size);
+ }
+
+ /*
+ * Fill in the data columns from good_data being
+ * careful to pad short columns and empty columns
+ * with a skip sector.
+ */
+ uint64_t good_size = abd_get_size((abd_t *)good_data);
+
+ offset = 0;
+ for (; x < rr->rr_cols; x++) {
+ abd_free(rr->rr_col[x].rc_abd);
+
+ if (offset == good_size) {
+ /* empty data column (small write) */
+ rr->rr_col[x].rc_abd =
+ abd_get_zeros(skip_size);
+ } else if (x < rr->rr_bigcols) {
+ /* this is a "big column" */
+ size = rr->rr_col[x].rc_size;
+ rr->rr_col[x].rc_abd =
+ abd_get_offset_size(
+ (abd_t *)good_data, offset, size);
+ offset += size;
+ } else {
+ /* short data column, add skip sector */
+ size = rr->rr_col[x].rc_size - skip_size;
+ rr->rr_col[x].rc_abd = abd_alloc(
+ rr->rr_col[x].rc_size, B_TRUE);
+ abd_copy_off(rr->rr_col[x].rc_abd,
+ (abd_t *)good_data, 0, offset,
+ size);
+ abd_zero_off(rr->rr_col[x].rc_abd,
+ size, skip_size);
+ offset += size;
+ }
+ }
+
+ /*
+ * Construct the parity from the good data.
+ */
+ vdev_raidz_generate_parity_row(rm, rr);
+
+ /* restore everything back to its original state */
+ for (x = 0; x < rr->rr_firstdatacol; x++)
+ rr->rr_col[x].rc_abd = bad_parity[x];
+
+ offset = 0;
+ for (x = rr->rr_firstdatacol; x < rr->rr_cols; x++) {
+ abd_free(rr->rr_col[x].rc_abd);
+ rr->rr_col[x].rc_abd = abd_get_offset_size(
+ rr->rr_abd_copy, offset,
+ rr->rr_col[x].rc_size);
+ offset += rr->rr_col[x].rc_size;
+ }
+ }
+
+ ASSERT3P(rr->rr_col[c].rc_gdata, !=, NULL);
+ good = abd_get_offset_size(rr->rr_col[c].rc_gdata, 0,
+ rr->rr_col[c].rc_size);
+ } else {
+ /* adjust good_data to point at the start of our column */
+ parity_size = size = rr->rr_col[0].rc_size;
+ if (c >= rr->rr_bigcols) {
+ size -= skip_size;
+ zcr->zcr_length = size;
+ }
+
+ /* empty column */
+ if (size == 0) {
+ zfs_ereport_finish_checksum(zcr, NULL, NULL, B_TRUE);
+ return;
+ }
+
+ offset = 0;
+ for (x = rr->rr_firstdatacol; x < c; x++) {
+ if (x < rr->rr_bigcols) {
+ offset += parity_size;
+ } else {
+ offset += parity_size - skip_size;
+ }
+ }
+
+ good = abd_get_offset_size((abd_t *)good_data, offset, size);
+ }
+
+ /* we drop the ereport if it ends up that the data was good */
+ zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE);
+ abd_free((abd_t *)good);
+}
+
+/*
+ * Invoked indirectly by zfs_ereport_start_checksum(), called
+ * below when our read operation fails completely. The main point
+ * is to keep a copy of everything we read from disk, so that at
+ * vdev_draid_cksum_finish() time we can compare it with the good data.
+ */
+static void
+vdev_draid_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
+{
+ size_t c = (size_t)(uintptr_t)arg;
+ raidz_map_t *rm = zio->io_vsd;
+
+ /* set up the report and bump the refcount */
+ zcr->zcr_cbdata = rm;
+ zcr->zcr_cbinfo = c;
+ zcr->zcr_finish = vdev_draid_cksum_finish;
+ zcr->zcr_free = vdev_draid_cksum_free;
+
+ rm->rm_reports++;
+ ASSERT3U(rm->rm_reports, >, 0);
+
+ if (rm->rm_row[0]->rr_abd_copy != NULL)
+ return;
+
+ /*
+ * It's the first time we're called for this raidz_map_t, so we need
+ * to copy the data aside; there's no guarantee that our zio's buffer
+ * won't be re-used for something else.
+ *
+ * Our parity data is already in separate buffers, so there's no need
+ * to copy them. Furthermore, all columns should have been expanded
+ * by vdev_draid_map_alloc_empty() when attempting reconstruction.
+ */
+ for (int i = 0; i < rm->rm_nrows; i++) {
+ raidz_row_t *rr = rm->rm_row[i];
+ size_t offset = 0;
+ size_t size = 0;
+
+ for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
+ ASSERT3U(rr->rr_col[c].rc_size, ==,
+ rr->rr_col[0].rc_size);
+ size += rr->rr_col[c].rc_size;
+ }
+
+ rr->rr_abd_copy = abd_alloc_for_io(size, B_FALSE);
+
+ for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
+ raidz_col_t *col = &rr->rr_col[c];
+ abd_t *tmp = abd_get_offset_size(rr->rr_abd_copy,
+ offset, col->rc_size);
+
+ abd_copy(tmp, col->rc_abd, col->rc_size);
+ abd_free(col->rc_abd);
+
+ col->rc_abd = tmp;
+ offset += col->rc_size;
+ }
+ ASSERT3U(offset, ==, size);
+ }
+}
+
+const zio_vsd_ops_t vdev_draid_vsd_ops = {
+ .vsd_free = vdev_draid_map_free_vsd,
+ .vsd_cksum_report = vdev_draid_cksum_report
+};
+
+/*
+ * Full stripe writes. When writing, all columns (D+P) are required. Parity
+ * is calculated over all the columns, including empty zero filled sectors,
+ * and each is written to disk. While only the data columns are needed for
+ * a normal read, all of the columns are required for reconstruction when
+ * performing a sequential resilver.
+ *
+ * For "big columns" it's sufficient to map the correct range of the zio ABD.
+ * Partial columns require allocating a gang ABD in order to zero fill the
+ * empty sectors. When the column is empty a zero filled sector must be
+ * mapped. In all cases the data ABDs must be the same size as the parity
+ * ABDs (e.g. rc->rc_size == parity_size).
+ */
+static void
+vdev_draid_map_alloc_write(zio_t *zio, uint64_t abd_offset, raidz_row_t *rr)
+{
+ uint64_t skip_size = 1ULL << zio->io_vd->vdev_top->vdev_ashift;
+ uint64_t parity_size = rr->rr_col[0].rc_size;
+ uint64_t abd_off = abd_offset;
+
+ ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
+ ASSERT3U(parity_size, ==, abd_get_size(rr->rr_col[0].rc_abd));
+
+ for (uint64_t c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
+ raidz_col_t *rc = &rr->rr_col[c];
+
+ if (rc->rc_size == 0) {
+ /* empty data column (small write), add a skip sector */
+ ASSERT3U(skip_size, ==, parity_size);
+ rc->rc_abd = abd_get_zeros(skip_size);
+ } else if (rc->rc_size == parity_size) {
+ /* this is a "big column" */
+ rc->rc_abd = abd_get_offset_struct(&rc->rc_abdstruct,
+ zio->io_abd, abd_off, rc->rc_size);
+ } else {
+ /* short data column, add a skip sector */
+ ASSERT3U(rc->rc_size + skip_size, ==, parity_size);
+ rc->rc_abd = abd_alloc_gang();
+ abd_gang_add(rc->rc_abd, abd_get_offset_size(
+ zio->io_abd, abd_off, rc->rc_size), B_TRUE);
+ abd_gang_add(rc->rc_abd, abd_get_zeros(skip_size),
+ B_TRUE);
+ }
+
+ ASSERT3U(abd_get_size(rc->rc_abd), ==, parity_size);
+
+ abd_off += rc->rc_size;
+ rc->rc_size = parity_size;
+ }
+
+ IMPLY(abd_offset != 0, abd_off == zio->io_size);
+}
+
+/*
+ * Scrub/resilver reads. In order to store the contents of the skip sectors
+ * an additional ABD is allocated. The columns are handled in the same way
+ * as a full stripe write except instead of using the zero ABD the newly
+ * allocated skip ABD is used to back the skip sectors. In all cases the
+ * data ABD must be the same size as the parity ABDs.
+ */
+static void
+vdev_draid_map_alloc_scrub(zio_t *zio, uint64_t abd_offset, raidz_row_t *rr)
+{
+ uint64_t skip_size = 1ULL << zio->io_vd->vdev_top->vdev_ashift;
+ uint64_t parity_size = rr->rr_col[0].rc_size;
+ uint64_t abd_off = abd_offset;
+ uint64_t skip_off = 0;
+
+ ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
+ ASSERT3P(rr->rr_abd_empty, ==, NULL);
+
+ if (rr->rr_nempty > 0) {
+ rr->rr_abd_empty = abd_alloc_linear(rr->rr_nempty * skip_size,
+ B_FALSE);
+ }
+
+ for (uint64_t c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
+ raidz_col_t *rc = &rr->rr_col[c];
+
+ if (rc->rc_size == 0) {
+ /* empty data column (small read), add a skip sector */
+ ASSERT3U(skip_size, ==, parity_size);
+ ASSERT3U(rr->rr_nempty, !=, 0);
+ rc->rc_abd = abd_get_offset_size(rr->rr_abd_empty,
+ skip_off, skip_size);
+ skip_off += skip_size;
+ } else if (rc->rc_size == parity_size) {
+ /* this is a "big column" */
+ rc->rc_abd = abd_get_offset_struct(&rc->rc_abdstruct,
+ zio->io_abd, abd_off, rc->rc_size);
+ } else {
+ /* short data column, add a skip sector */
+ ASSERT3U(rc->rc_size + skip_size, ==, parity_size);
+ ASSERT3U(rr->rr_nempty, !=, 0);
+ rc->rc_abd = abd_alloc_gang();
+ abd_gang_add(rc->rc_abd, abd_get_offset_size(
+ zio->io_abd, abd_off, rc->rc_size), B_TRUE);
+ abd_gang_add(rc->rc_abd, abd_get_offset_size(
+ rr->rr_abd_empty, skip_off, skip_size), B_TRUE);
+ skip_off += skip_size;
+ }
+
+ uint64_t abd_size = abd_get_size(rc->rc_abd);
+ ASSERT3U(abd_size, ==, abd_get_size(rr->rr_col[0].rc_abd));
+
+ /*
+ * Increase rc_size so the skip ABD is included in subsequent
+ * parity calculations.
+ */
+ abd_off += rc->rc_size;
+ rc->rc_size = abd_size;
+ }
+
+ IMPLY(abd_offset != 0, abd_off == zio->io_size);
+ ASSERT3U(skip_off, ==, rr->rr_nempty * skip_size);
+}
+
+/*
+ * Normal reads. In this common case only the columns containing data
+ * are read into the zio ABDs. Neither the parity columns nor the empty skip
+ * sectors are read unless the checksum fails verification, in which case
+ * vdev_raidz_read_all() will call vdev_draid_map_alloc_empty() to expand
+ * the raid map in order to allow reconstruction using the parity data and
+ * skip sectors.
+ */
+static void
+vdev_draid_map_alloc_read(zio_t *zio, uint64_t abd_offset, raidz_row_t *rr)
+{
+ uint64_t abd_off = abd_offset;
+
+ ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
+
+ for (uint64_t c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
+ raidz_col_t *rc = &rr->rr_col[c];
+
+ if (rc->rc_size > 0) {
+ rc->rc_abd = abd_get_offset_struct(&rc->rc_abdstruct,
+ zio->io_abd, abd_off, rc->rc_size);
+ abd_off += rc->rc_size;
+ }
+ }
+
+ IMPLY(abd_offset != 0, abd_off == zio->io_size);
+}
+
+/*
+ * Converts a normal "read" raidz_row_t to a "scrub" raidz_row_t. The key
+ * difference is that an ABD is allocated to back skip sectors so they may
+ * be read into memory, verified, and repaired if needed.
+ */
+void
+vdev_draid_map_alloc_empty(zio_t *zio, raidz_row_t *rr)
+{
+ uint64_t skip_size = 1ULL << zio->io_vd->vdev_top->vdev_ashift;
+ uint64_t parity_size = rr->rr_col[0].rc_size;
+ uint64_t skip_off = 0;
+
+ ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
+ ASSERT3P(rr->rr_abd_empty, ==, NULL);
+
+ if (rr->rr_nempty > 0) {
+ rr->rr_abd_empty = abd_alloc_linear(rr->rr_nempty * skip_size,
+ B_FALSE);
+ }
+
+ for (uint64_t c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
+ raidz_col_t *rc = &rr->rr_col[c];
+
+ if (rc->rc_size == 0) {
+ /* empty data column (small read), add a skip sector */
+ ASSERT3U(skip_size, ==, parity_size);
+ ASSERT3U(rr->rr_nempty, !=, 0);
+ ASSERT3P(rc->rc_abd, ==, NULL);
+ rc->rc_abd = abd_get_offset_size(rr->rr_abd_empty,
+ skip_off, skip_size);
+ skip_off += skip_size;
+ } else if (rc->rc_size == parity_size) {
+ /* this is a "big column", nothing to add */
+ ASSERT3P(rc->rc_abd, !=, NULL);
+ } else {
+ /* short data column, add a skip sector */
+ ASSERT3U(rc->rc_size + skip_size, ==, parity_size);
+ ASSERT3U(rr->rr_nempty, !=, 0);
+ ASSERT3P(rc->rc_abd, !=, NULL);
+ ASSERT(!abd_is_gang(rc->rc_abd));
+ abd_t *read_abd = rc->rc_abd;
+ rc->rc_abd = abd_alloc_gang();
+ abd_gang_add(rc->rc_abd, read_abd, B_TRUE);
+ abd_gang_add(rc->rc_abd, abd_get_offset_size(
+ rr->rr_abd_empty, skip_off, skip_size), B_TRUE);
+ skip_off += skip_size;
+ }
+
+ /*
+ * Increase rc_size so the empty ABD is included in subsequent
+ * parity calculations.
+ */
+ rc->rc_size = parity_size;
+ }
+
+ ASSERT3U(skip_off, ==, rr->rr_nempty * skip_size);
+}
+
+/*
+ * Given a logical address within a dRAID configuration, return the physical
+ * address on the first drive in the group that this address maps to
+ * (at position 'start' in permutation number 'perm').
+ */
+static uint64_t
+vdev_draid_logical_to_physical(vdev_t *vd, uint64_t logical_offset,
+ uint64_t *perm, uint64_t *start)
+{
+ vdev_draid_config_t *vdc = vd->vdev_tsd;
+
+ /* b is the dRAID (parent) sector offset. */
+ uint64_t ashift = vd->vdev_top->vdev_ashift;
+ uint64_t b_offset = logical_offset >> ashift;
+
+ /*
+ * The height of a row in units of the vdev's minimum sector size.
+ * This is the amount of data written to each disk of each group
+ * in a given permutation.
+ */
+ uint64_t rowheight_sectors = VDEV_DRAID_ROWHEIGHT >> ashift;
+
+ /*
+ * We cycle through a disk permutation every groupsz * ngroups chunk
+ * of address space. Note that ngroups * groupsz must be a multiple
+ * of the number of data drives (ndisks) in order to guarantee
+ * alignment. So, for example, if our row height is 16MB, our group
+ * size is 10, and there are 13 data drives in the draid, then ngroups
+ * will be 13, we will change permutation every 2.08GB and each
+ * disk will have 160MB of data per chunk.
+ */
+ uint64_t groupwidth = vdc->vdc_groupwidth;
+ uint64_t ngroups = vdc->vdc_ngroups;
+ uint64_t ndisks = vdc->vdc_ndisks;
+
+ /*
+ * groupstart is where the group this IO will land in "starts" in
+ * the permutation array.
+ */
+ uint64_t group = logical_offset / vdc->vdc_groupsz;
+ uint64_t groupstart = (group * groupwidth) % ndisks;
+ ASSERT3U(groupstart + groupwidth, <=, ndisks + groupstart);
+ *start = groupstart;
+
+ /* b_offset is the sector offset within a group chunk */
+ b_offset = b_offset % (rowheight_sectors * groupwidth);
+ ASSERT0(b_offset % groupwidth);
+
+ /*
+ * Find the starting byte offset on each child vdev:
+ * - within a permutation there are ngroups groups spread over the
+ * rows, where each row covers a slice portion of the disk
+ * - each permutation has (groupwidth * ngroups) / ndisks rows
+ * - so each permutation covers rows * slice portion of the disk
+ * - so we need to find the row where this IO group target begins
+ */
+ *perm = group / ngroups;
+ uint64_t row = (*perm * ((groupwidth * ngroups) / ndisks)) +
+ (((group % ngroups) * groupwidth) / ndisks);
+
+ return (((rowheight_sectors * row) +
+ (b_offset / groupwidth)) << ashift);
+}
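+/*
+ * Worked example, following the hypothetical layout in the comment above
+ * (16 MiB row height, groupwidth = 10, ndisks = 13, ngroups = 13, so
+ * groupsz = 160 MiB): for logical_offset = 480 MiB (the start of group 3),
+ * groupstart = (3 * 10) % 13 = 4, perm = 3 / 13 = 0, and
+ * row = 0 + (3 * 10) / 13 = 2, so the returned physical offset is
+ * 2 * 16 MiB = 32 MiB. The group occupies permuted positions 4-12 at that
+ * offset and wraps one column to position 0, which the map allocation code
+ * below places one row height further in (48 MiB).
+ */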
+
+static uint64_t
+vdev_draid_map_alloc_row(zio_t *zio, raidz_row_t **rrp, uint64_t io_offset,
+ uint64_t abd_offset, uint64_t abd_size)
+{
+ vdev_t *vd = zio->io_vd;
+ vdev_draid_config_t *vdc = vd->vdev_tsd;
+ uint64_t ashift = vd->vdev_top->vdev_ashift;
+ uint64_t io_size = abd_size;
+ uint64_t io_asize = vdev_draid_asize(vd, io_size);
+ uint64_t group = vdev_draid_offset_to_group(vd, io_offset);
+ uint64_t start_offset = vdev_draid_group_to_offset(vd, group + 1);
+
+ /*
+ * Limit the io_size to the space remaining in the group. A second
+ * row in the raidz_map_t is created for the remainder.
+ */
+ if (io_offset + io_asize > start_offset) {
+ io_size = vdev_draid_asize_to_psize(vd,
+ start_offset - io_offset);
+ }
+
+ /*
+ * At most a block may span the logical end of one group and the start
+ * of the next group. Therefore, at the end of a group the io_size must
+ * span the group width evenly and the remainder must be aligned to the
+ * start of the next group.
+ */
+ IMPLY(abd_offset == 0 && io_size < zio->io_size,
+ (io_asize >> ashift) % vdc->vdc_groupwidth == 0);
+ IMPLY(abd_offset != 0,
+ vdev_draid_group_to_offset(vd, group) == io_offset);
+
+ /* Lookup starting byte offset on each child vdev */
+ uint64_t groupstart, perm;
+ uint64_t physical_offset = vdev_draid_logical_to_physical(vd,
+ io_offset, &perm, &groupstart);
+
+ /*
+ * If there are fewer than groupwidth drives available after the group
+ * start, the group is going to wrap onto the next row. 'wrap' is the
+ * group disk number that starts on the next row.
+ */
+ uint64_t ndisks = vdc->vdc_ndisks;
+ uint64_t groupwidth = vdc->vdc_groupwidth;
+ uint64_t wrap = groupwidth;
+
+ if (groupstart + groupwidth > ndisks)
+ wrap = ndisks - groupstart;
+
+ /* The io size in units of the vdev's minimum sector size. */
+ const uint64_t psize = io_size >> ashift;
+
+ /*
+ * "Quotient": The number of data sectors for this stripe on all but
+ * the "big column" child vdevs that also contain "remainder" data.
+ */
+ uint64_t q = psize / vdc->vdc_ndata;
+
+ /*
+ * "Remainder": The number of partial stripe data sectors in this I/O.
+ * This will add a sector to some, but not all, child vdevs.
+ */
+ uint64_t r = psize - q * vdc->vdc_ndata;
+
+ /* The number of "big columns" - those which contain remainder data. */
+ uint64_t bc = (r == 0 ? 0 : r + vdc->vdc_nparity);
+ ASSERT3U(bc, <, groupwidth);
+
+ /* The total number of data and parity sectors for this I/O. */
+ uint64_t tot = psize + (vdc->vdc_nparity * (q + (r == 0 ? 0 : 1)));
+
+ raidz_row_t *rr;
+ rr = kmem_alloc(offsetof(raidz_row_t, rr_col[groupwidth]), KM_SLEEP);
+ rr->rr_cols = groupwidth;
+ rr->rr_scols = groupwidth;
+ rr->rr_bigcols = bc;
+ rr->rr_missingdata = 0;
+ rr->rr_missingparity = 0;
+ rr->rr_firstdatacol = vdc->vdc_nparity;
+ rr->rr_abd_copy = NULL;
+ rr->rr_abd_empty = NULL;
+#ifdef ZFS_DEBUG
+ rr->rr_offset = io_offset;
+ rr->rr_size = io_size;
+#endif
+ *rrp = rr;
+
+ uint8_t *base;
+ uint64_t iter, asize = 0;
+ vdev_draid_get_perm(vdc, perm, &base, &iter);
+ for (uint64_t i = 0; i < groupwidth; i++) {
+ raidz_col_t *rc = &rr->rr_col[i];
+ uint64_t c = (groupstart + i) % ndisks;
+
+ /* increment the offset if we wrap to the next row */
+ if (i == wrap)
+ physical_offset += VDEV_DRAID_ROWHEIGHT;
+
+ rc->rc_devidx = vdev_draid_permute_id(vdc, base, iter, c);
+ rc->rc_offset = physical_offset;
+ rc->rc_abd = NULL;
+ rc->rc_gdata = NULL;
+ rc->rc_orig_data = NULL;
+ rc->rc_error = 0;
+ rc->rc_tried = 0;
+ rc->rc_skipped = 0;
+ rc->rc_repair = 0;
+ rc->rc_need_orig_restore = B_FALSE;
+
+ if (q == 0 && i >= bc)
+ rc->rc_size = 0;
+ else if (i < bc)
+ rc->rc_size = (q + 1) << ashift;
+ else
+ rc->rc_size = q << ashift;
+
+ asize += rc->rc_size;
+ }
+
+ ASSERT3U(asize, ==, tot << ashift);
+ rr->rr_nempty = roundup(tot, groupwidth) - tot;
+ IMPLY(bc > 0, rr->rr_nempty == groupwidth - bc);
+
+ /* Allocate buffers for the parity columns */
+ for (uint64_t c = 0; c < rr->rr_firstdatacol; c++) {
+ raidz_col_t *rc = &rr->rr_col[c];
+ rc->rc_abd = abd_alloc_linear(rc->rc_size, B_FALSE);
+ }
+
+ /*
+ * Map buffers for data columns and allocate/map buffers for skip
+ * sectors. There are three distinct cases for dRAID which are
+ * required to support sequential rebuild.
+ */
+ if (zio->io_type == ZIO_TYPE_WRITE) {
+ vdev_draid_map_alloc_write(zio, abd_offset, rr);
+ } else if ((rr->rr_nempty > 0) &&
+ (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
+ vdev_draid_map_alloc_scrub(zio, abd_offset, rr);
+ } else {
+ ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
+ vdev_draid_map_alloc_read(zio, abd_offset, rr);
+ }
+
+ return (io_size);
+}
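+/*
+ * Worked example of the row geometry above (hypothetical draid2:4d layout,
+ * groupwidth = 6, ashift = 12): a 36 KiB write is psize = 9 sectors, so
+ * q = 9 / 4 = 2, r = 1, bc = 1 + 2 = 3 and tot = 9 + 2 * 3 = 15. Columns
+ * 0-2 (two parity plus one "big" data column) get q + 1 = 3 sectors,
+ * columns 3-5 get q = 2 sectors, and rr_nempty = roundup(15, 6) - 15 = 3
+ * skip sectors pad the short columns out to the full 72 KiB asize.
+ */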
+
+/*
+ * Allocate the raidz mapping to be applied to the dRAID I/O. The parity
+ * calculations for dRAID are identical to raidz; however, there are a few
+ * differences in the layout.
+ *
+ * - dRAID always allocates a full stripe width. Any extra sectors due
+ * to this padding are zero filled and written to disk. They will be read
+ * back during a scrub or repair operation since they are included in
+ * the parity calculation. This property enables sequential resilvering.
+ *
+ * - When the block at the logical offset spans redundancy groups then two
+ * rows are allocated in the raidz_map_t. One row resides at the end of
+ * the first group and the other at the start of the following group.
+ */
+static raidz_map_t *
+vdev_draid_map_alloc(zio_t *zio)
+{
+ raidz_row_t *rr[2];
+ uint64_t abd_offset = 0;
+ uint64_t abd_size = zio->io_size;
+ uint64_t io_offset = zio->io_offset;
+ uint64_t size;
+ int nrows = 1;
+
+ size = vdev_draid_map_alloc_row(zio, &rr[0], io_offset,
+ abd_offset, abd_size);
+ if (size < abd_size) {
+ vdev_t *vd = zio->io_vd;
+
+ io_offset += vdev_draid_asize(vd, size);
+ abd_offset += size;
+ abd_size -= size;
+ nrows++;
+
+ ASSERT3U(io_offset, ==, vdev_draid_group_to_offset(
+ vd, vdev_draid_offset_to_group(vd, io_offset)));
+ ASSERT3U(abd_offset, <, zio->io_size);
+ ASSERT3U(abd_size, !=, 0);
+
+ size = vdev_draid_map_alloc_row(zio, &rr[1],
+ io_offset, abd_offset, abd_size);
+ VERIFY3U(size, ==, abd_size);
+ }
+
+ raidz_map_t *rm;
+ rm = kmem_zalloc(offsetof(raidz_map_t, rm_row[nrows]), KM_SLEEP);
+ rm->rm_ops = vdev_raidz_math_get_ops();
+ rm->rm_nrows = nrows;
+ rm->rm_row[0] = rr[0];
+ if (nrows == 2)
+ rm->rm_row[1] = rr[1];
+
+ zio->io_vsd = rm;
+ zio->io_vsd_ops = &vdev_draid_vsd_ops;
+
+ return (rm);
+}
+
+/*
+ * Given an offset into a dRAID, return the next group-width-aligned offset
+ * which can be used to start an allocation.
+ */
+static uint64_t
+vdev_draid_get_astart(vdev_t *vd, const uint64_t start)
+{
+ vdev_draid_config_t *vdc = vd->vdev_tsd;
+
+ ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops);
+
+ return (roundup(start, vdc->vdc_groupwidth << vd->vdev_ashift));
+}
+
+/*
+ * Allocatable space for dRAID is (children - nspares) * sizeof(smallest child)
+ * rounded down to the last full slice. So each child must provide at least
+ * 1 / (children - nspares) of its asize.
+ */
+static uint64_t
+vdev_draid_min_asize(vdev_t *vd)
+{
+ vdev_draid_config_t *vdc = vd->vdev_tsd;
+
+ ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops);
+
+ return ((vd->vdev_min_asize + vdc->vdc_ndisks - 1) / (vdc->vdc_ndisks));
+}
+
+/*
+ * When using dRAID the minimum allocation size is determined by the number
+ * of data disks in the redundancy group. Full stripes are always used.
+ */
+static uint64_t
+vdev_draid_min_alloc(vdev_t *vd)
+{
+ vdev_draid_config_t *vdc = vd->vdev_tsd;
+
+ ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops);
+
+ return (vdc->vdc_ndata << vd->vdev_ashift);
+}
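+/*
+ * Worked example (hypothetical draid2:8d with 13 children and 1 distributed
+ * spare, ashift = 12): vdc_ndisks = 12, so vdev_draid_min_asize() requires
+ * each child to provide at least 1/12 of the top-level minimum asize, and
+ * vdev_draid_min_alloc() reports a minimum allocation of 8 << 12 = 32 KiB,
+ * one full data stripe.
+ */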
+
+/*
+ * Returns true if the txg range does not exist on any leaf vdev.
+ *
+ * A dRAID spare does not fit into the DTL model. While it has child vdevs
+ * there is no redundancy among them, and the effective child vdev is
+ * determined by offset. Essentially we do a vdev_dtl_reassess() on the
+ * fly by replacing a dRAID spare with the child vdev under the offset.
+ * Note that it is a recursive process because the child vdev can be
+ * another dRAID spare and so on.
+ */
+boolean_t
+vdev_draid_missing(vdev_t *vd, uint64_t physical_offset, uint64_t txg,
+ uint64_t size)
+{
+ if (vd->vdev_ops == &vdev_spare_ops ||
+ vd->vdev_ops == &vdev_replacing_ops) {
+ /*
+ * Check all of the readable children; if any child
+ * contains the txg range, the data is not missing.
+ */
+ for (int c = 0; c < vd->vdev_children; c++) {
+ vdev_t *cvd = vd->vdev_child[c];
+
+ if (!vdev_readable(cvd))
+ continue;
+
+ if (!vdev_draid_missing(cvd, physical_offset,
+ txg, size))
+ return (B_FALSE);
+ }
+
+ return (B_TRUE);
+ }
+
+ if (vd->vdev_ops == &vdev_draid_spare_ops) {
+ /*
+ * When sequentially resilvering we don't have a proper
+ * txg range so instead we must presume all txgs are
+ * missing on this vdev until the resilver completes.
+ */
+ if (vd->vdev_rebuild_txg != 0)
+ return (B_TRUE);
+
+ /*
+ * DTL_MISSING is set for all prior txgs when a resilver
+ * is started in spa_vdev_attach().
+ */
+ if (vdev_dtl_contains(vd, DTL_MISSING, txg, size))
+ return (B_TRUE);
+
+ /*
+ * Consult the DTL on the relevant vdev. Either a leaf
+ * vdev or a spare/replacing mirror child may be returned,
+ * so we must call vdev_draid_missing() recursively.
+ */
+ vd = vdev_draid_spare_get_child(vd, physical_offset);
+ if (vd == NULL)
+ return (B_TRUE);
+
+ return (vdev_draid_missing(vd, physical_offset,
+ txg, size));
+ }
+
+ return (vdev_dtl_contains(vd, DTL_MISSING, txg, size));
+}
+
+/*
+ * Returns true if the txg is only partially replicated on the leaf vdevs.
+ */
+static boolean_t
+vdev_draid_partial(vdev_t *vd, uint64_t physical_offset, uint64_t txg,
+ uint64_t size)
+{
+ if (vd->vdev_ops == &vdev_spare_ops ||
+ vd->vdev_ops == &vdev_replacing_ops) {
+ /*
+ * Check all of the readable children; if any child is
+ * missing the txg range then it is partially replicated.
+ */
+ for (int c = 0; c < vd->vdev_children; c++) {
+ vdev_t *cvd = vd->vdev_child[c];
+
+ if (!vdev_readable(cvd))
+ continue;
+
+ if (vdev_draid_partial(cvd, physical_offset, txg, size))
+ return (B_TRUE);
+ }
+
+ return (B_FALSE);
+ }
+
+ if (vd->vdev_ops == &vdev_draid_spare_ops) {
+ /*
+ * When sequentially resilvering we don't have a proper
+ * txg range so instead we must presume all txgs are
+ * missing on this vdev until the resilver completes.
+ */
+ if (vd->vdev_rebuild_txg != 0)
+ return (B_TRUE);
+
+ /*
+ * DTL_MISSING is set for all prior txgs when a resilver
+ * is started in spa_vdev_attach().
+ */
+ if (vdev_dtl_contains(vd, DTL_MISSING, txg, size))
+ return (B_TRUE);
+
+ /*
+ * Consult the DTL on the relevant vdev. Either a leaf
+ * vdev or a spare/replacing mirror child may be returned,
+ * so we must call vdev_draid_partial() recursively.
+ */
+ vd = vdev_draid_spare_get_child(vd, physical_offset);
+ if (vd == NULL)
+ return (B_TRUE);
+
+ return (vdev_draid_partial(vd, physical_offset, txg, size));
+ }
+
+ return (vdev_dtl_contains(vd, DTL_MISSING, txg, size));
+}
+
+/*
+ * Determine if the vdev is readable at the given offset.
+ */
+boolean_t
+vdev_draid_readable(vdev_t *vd, uint64_t physical_offset)
+{
+ if (vd->vdev_ops == &vdev_draid_spare_ops) {
+ vd = vdev_draid_spare_get_child(vd, physical_offset);
+ if (vd == NULL)
+ return (B_FALSE);
+ }
+
+ if (vd->vdev_ops == &vdev_spare_ops ||
+ vd->vdev_ops == &vdev_replacing_ops) {
+
+ for (int c = 0; c < vd->vdev_children; c++) {
+ vdev_t *cvd = vd->vdev_child[c];
+
+ if (!vdev_readable(cvd))
+ continue;
+
+ if (vdev_draid_readable(cvd, physical_offset))
+ return (B_TRUE);
+ }
+
+ return (B_FALSE);
+ }
+
+ return (vdev_readable(vd));
+}
+
+/*
+ * Returns the first distributed spare found under the provided vdev tree.
+ */
+static vdev_t *
+vdev_draid_find_spare(vdev_t *vd)
+{
+ if (vd->vdev_ops == &vdev_draid_spare_ops)
+ return (vd);
+
+ for (int c = 0; c < vd->vdev_children; c++) {
+ vdev_t *svd = vdev_draid_find_spare(vd->vdev_child[c]);
+ if (svd != NULL)
+ return (svd);
+ }
+
+ return (NULL);
+}
+
+/*
+ * Returns B_TRUE if the passed in vdev is currently "faulted".
+ * Faulted, in this context, means that the vdev represents a
+ * replacing or sparing vdev tree.
+ */
+static boolean_t
+vdev_draid_faulted(vdev_t *vd, uint64_t physical_offset)
+{
+ if (vd->vdev_ops == &vdev_draid_spare_ops) {
+ vd = vdev_draid_spare_get_child(vd, physical_offset);
+ if (vd == NULL)
+ return (B_FALSE);
+
+ /*
+ * After resolving the distributed spare to a leaf vdev
+ * check the parent to determine if it's "faulted".
+ */
+ vd = vd->vdev_parent;
+ }
+
+ return (vd->vdev_ops == &vdev_replacing_ops ||
+ vd->vdev_ops == &vdev_spare_ops);
+}
+
+/*
+ * Determine if the dRAID block at the logical offset is degraded.
+ * Used by sequential resilver.
+ */
+static boolean_t
+vdev_draid_group_degraded(vdev_t *vd, uint64_t offset)
+{
+ vdev_draid_config_t *vdc = vd->vdev_tsd;
+
+ ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops);
+ ASSERT3U(vdev_draid_get_astart(vd, offset), ==, offset);
+
+ uint64_t groupstart, perm;
+ uint64_t physical_offset = vdev_draid_logical_to_physical(vd,
+ offset, &perm, &groupstart);
+
+ uint8_t *base;
+ uint64_t iter;
+ vdev_draid_get_perm(vdc, perm, &base, &iter);
+
+ for (uint64_t i = 0; i < vdc->vdc_groupwidth; i++) {
+ uint64_t c = (groupstart + i) % vdc->vdc_ndisks;
+ uint64_t cid = vdev_draid_permute_id(vdc, base, iter, c);
+ vdev_t *cvd = vd->vdev_child[cid];
+
+ /* Group contains a faulted vdev. */
+ if (vdev_draid_faulted(cvd, physical_offset))
+ return (B_TRUE);
+
+ /*
+ * Always check groups with active distributed spares
+ * because any vdev failure in the pool will affect them.
+ */
+ if (vdev_draid_find_spare(cvd) != NULL)
+ return (B_TRUE);
+ }
+
+ return (B_FALSE);
+}
+
+/*
+ * Determine if the txg is missing. Used by healing resilver.
+ */
+static boolean_t
+vdev_draid_group_missing(vdev_t *vd, uint64_t offset, uint64_t txg,
+ uint64_t size)
+{
+ vdev_draid_config_t *vdc = vd->vdev_tsd;
+
+ ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops);
+ ASSERT3U(vdev_draid_get_astart(vd, offset), ==, offset);
+
+ uint64_t groupstart, perm;
+ uint64_t physical_offset = vdev_draid_logical_to_physical(vd,
+ offset, &perm, &groupstart);
+
+ uint8_t *base;
+ uint64_t iter;
+ vdev_draid_get_perm(vdc, perm, &base, &iter);
+
+ for (uint64_t i = 0; i < vdc->vdc_groupwidth; i++) {
+ uint64_t c = (groupstart + i) % vdc->vdc_ndisks;
+ uint64_t cid = vdev_draid_permute_id(vdc, base, iter, c);
+ vdev_t *cvd = vd->vdev_child[cid];
+
+ /* Transaction group is known to be partially replicated. */
+ if (vdev_draid_partial(cvd, physical_offset, txg, size))
+ return (B_TRUE);
+
+ /*
+ * Always check groups with active distributed spares
+ * because any vdev failure in the pool will affect them.
+ */
+ if (vdev_draid_find_spare(cvd) != NULL)
+ return (B_TRUE);
+ }
+
+ return (B_FALSE);
+}
+
+/*
+ * Find the smallest child asize and largest sector size to calculate the
+ * available capacity. Distributed spares are ignored since their capacity
+ * is also based on the minimum child size in the top-level dRAID.
+ */
+static void
+vdev_draid_calculate_asize(vdev_t *vd, uint64_t *asizep, uint64_t *max_asizep,
+ uint64_t *logical_ashiftp, uint64_t *physical_ashiftp)
+{
+ uint64_t logical_ashift = 0, physical_ashift = 0;
+ uint64_t asize = 0, max_asize = 0;
+
+ ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops);
+
+ for (int c = 0; c < vd->vdev_children; c++) {
+ vdev_t *cvd = vd->vdev_child[c];
+
+ if (cvd->vdev_ops == &vdev_draid_spare_ops)
+ continue;
+
+ asize = MIN(asize - 1, cvd->vdev_asize - 1) + 1;
+ max_asize = MIN(max_asize - 1, cvd->vdev_max_asize - 1) + 1;
+ logical_ashift = MAX(logical_ashift, cvd->vdev_ashift);
+ physical_ashift = MAX(physical_ashift,
+ cvd->vdev_physical_ashift);
+ }
+
+ *asizep = asize;
+ *max_asizep = max_asize;
+ *logical_ashiftp = logical_ashift;
+ *physical_ashiftp = physical_ashift;
+}
+
+/*
+ * Open spare vdevs.
+ */
+static boolean_t
+vdev_draid_open_spares(vdev_t *vd)
+{
+ return (vd->vdev_ops == &vdev_draid_spare_ops ||
+ vd->vdev_ops == &vdev_replacing_ops ||
+ vd->vdev_ops == &vdev_spare_ops);
+}
+
+/*
+ * Open all children, excluding spares.
+ */
+static boolean_t
+vdev_draid_open_children(vdev_t *vd)
+{
+ return (!vdev_draid_open_spares(vd));
+}
+
+/*
+ * Open a top-level dRAID vdev.
+ */
+static int
+vdev_draid_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
+ uint64_t *logical_ashift, uint64_t *physical_ashift)
+{
+ vdev_draid_config_t *vdc = vd->vdev_tsd;
+ uint64_t nparity = vdc->vdc_nparity;
+ int open_errors = 0;
+
+ if (nparity > VDEV_DRAID_MAXPARITY ||
+ vd->vdev_children < nparity + 1) {
+ vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
+ return (SET_ERROR(EINVAL));
+ }
+
+ /*
+ * First open the normal children then the distributed spares. This
+ * ordering is important to ensure the distributed spares calculate
+ * the correct psize in the event that the dRAID vdevs were expanded.
+ */
+ vdev_open_children_subset(vd, vdev_draid_open_children);
+ vdev_open_children_subset(vd, vdev_draid_open_spares);
+
+ /* Verify enough of the children are available to continue. */
+ for (int c = 0; c < vd->vdev_children; c++) {
+ if (vd->vdev_child[c]->vdev_open_error != 0) {
+ if ((++open_errors) > nparity) {
+ vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
+ return (SET_ERROR(ENXIO));
+ }
+ }
+ }
+
+ /*
+ * Allocatable capacity is the sum of the space on all children less
+ * the number of distributed spares, rounded down to the last full row
+ * and then to the last full group. An additional 32MB of scratch
+ * space is reserved at the end of each child for use by the dRAID
+ * expansion feature.
+ */
+ uint64_t child_asize, child_max_asize;
+ vdev_draid_calculate_asize(vd, &child_asize, &child_max_asize,
+ logical_ashift, physical_ashift);
+
+ /*
+ * Should be unreachable since the minimum child size is 64MB, but
+ * we want to make sure an underflow absolutely cannot occur here.
+ */
+ if (child_asize < VDEV_DRAID_REFLOW_RESERVE ||
+ child_max_asize < VDEV_DRAID_REFLOW_RESERVE) {
+ return (SET_ERROR(ENXIO));
+ }
+
+ child_asize = ((child_asize - VDEV_DRAID_REFLOW_RESERVE) /
+ VDEV_DRAID_ROWHEIGHT) * VDEV_DRAID_ROWHEIGHT;
+ child_max_asize = ((child_max_asize - VDEV_DRAID_REFLOW_RESERVE) /
+ VDEV_DRAID_ROWHEIGHT) * VDEV_DRAID_ROWHEIGHT;
+
+ *asize = (((child_asize * vdc->vdc_ndisks) / vdc->vdc_groupsz) *
+ vdc->vdc_groupsz);
+ *max_asize = (((child_max_asize * vdc->vdc_ndisks) / vdc->vdc_groupsz) *
+ vdc->vdc_groupsz);
+
+ return (0);
+}
+
+/*
+ * Close a top-level dRAID vdev.
+ */
+static void
+vdev_draid_close(vdev_t *vd)
+{
+ for (int c = 0; c < vd->vdev_children; c++) {
+ if (vd->vdev_child[c] != NULL)
+ vdev_close(vd->vdev_child[c]);
+ }
+}
+
+/*
+ * Return the maximum asize for a rebuild zio in the provided range
+ * given the following constraints. A dRAID chunk may not:
+ *
+ * - Exceed the maximum allowed block size (SPA_MAXBLOCKSIZE), or
+ * - Span dRAID redundancy groups.
+ */
+static uint64_t
+vdev_draid_rebuild_asize(vdev_t *vd, uint64_t start, uint64_t asize,
+ uint64_t max_segment)
+{
+ vdev_draid_config_t *vdc = vd->vdev_tsd;
+
+ ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops);
+
+ uint64_t ashift = vd->vdev_ashift;
+ uint64_t ndata = vdc->vdc_ndata;
+ uint64_t psize = MIN(P2ROUNDUP(max_segment * ndata, 1 << ashift),
+ SPA_MAXBLOCKSIZE);
+
+ ASSERT3U(vdev_draid_get_astart(vd, start), ==, start);
+ ASSERT3U(asize % (vdc->vdc_groupwidth << ashift), ==, 0);
+
+ /* Chunks must evenly span all data columns in the group. */
+ psize = (((psize >> ashift) / ndata) * ndata) << ashift;
+ uint64_t chunk_size = MIN(asize, vdev_psize_to_asize(vd, psize));
+
+ /* Reduce the chunk size to the group space remaining. */
+ uint64_t group = vdev_draid_offset_to_group(vd, start);
+ uint64_t left = vdev_draid_group_to_offset(vd, group + 1) - start;
+ chunk_size = MIN(chunk_size, left);
+
+ ASSERT3U(chunk_size % (vdc->vdc_groupwidth << ashift), ==, 0);
+ ASSERT3U(vdev_draid_offset_to_group(vd, start), ==,
+ vdev_draid_offset_to_group(vd, start + chunk_size - 1));
+
+ return (chunk_size);
+}
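+/*
+ * Worked example (hypothetical draid2:4d layout, ashift = 12): with
+ * max_segment = 32 KiB the candidate psize is P2ROUNDUP(32 KiB * 4, 4 KiB)
+ * = 128 KiB, already a multiple of ndata sectors. That psize maps to an
+ * asize of 8 rows * 6 columns * 4 KiB = 192 KiB, so the rebuild chunk is
+ * 192 KiB unless less space remains in the current redundancy group.
+ */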
+
+/*
+ * Align the start of the metaslab to the group width and slightly reduce
+ * its size to a multiple of the group width. Since full stripe writes are
+ * required by dRAID this space is unallocable. Furthermore, aligning the
+ * metaslab start is important for vdev initialize and TRIM which both operate
+ * on metaslab boundaries which vdev_xlate() expects to be aligned.
+ */
+static void
+vdev_draid_metaslab_init(vdev_t *vd, uint64_t *ms_start, uint64_t *ms_size)
+{
+ vdev_draid_config_t *vdc = vd->vdev_tsd;
+
+ ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops);
+
+ uint64_t sz = vdc->vdc_groupwidth << vd->vdev_ashift;
+ uint64_t astart = vdev_draid_get_astart(vd, *ms_start);
+ uint64_t asize = ((*ms_size - (astart - *ms_start)) / sz) * sz;
+
+ *ms_start = astart;
+ *ms_size = asize;
+
+ ASSERT0(*ms_start % sz);
+ ASSERT0(*ms_size % sz);
+}
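+/*
+ * Worked example (hypothetical groupwidth = 6, ashift = 12, so sz = 24 KiB):
+ * a metaslab described as start = 132 KiB, size = 1024 KiB is adjusted to
+ * start = roundup(132 KiB, 24 KiB) = 144 KiB and
+ * size = ((1024 KiB - 12 KiB) / 24 KiB) * 24 KiB = 1008 KiB, both multiples
+ * of the group width as the asserts above require.
+ */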
+
+/*
+ * Add virtual dRAID spares to the list of valid spares. In order to accomplish
+ * this the existing array must be freed and reallocated with the additional
+ * entries.
+ */
+int
+vdev_draid_spare_create(nvlist_t *nvroot, vdev_t *vd, uint64_t *ndraidp,
+ uint64_t next_vdev_id)
+{
+ uint64_t draid_nspares = 0;
+ uint64_t ndraid = 0;
+ int error;
+
+ for (uint64_t i = 0; i < vd->vdev_children; i++) {
+ vdev_t *cvd = vd->vdev_child[i];
+
+ if (cvd->vdev_ops == &vdev_draid_ops) {
+ vdev_draid_config_t *vdc = cvd->vdev_tsd;
+ draid_nspares += vdc->vdc_nspares;
+ ndraid++;
+ }
+ }
+
+ if (draid_nspares == 0) {
+ *ndraidp = ndraid;
+ return (0);
+ }
+
+ nvlist_t **old_spares, **new_spares;
+ uint_t old_nspares;
+ error = nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
+ &old_spares, &old_nspares);
+ if (error)
+ old_nspares = 0;
+
+ /* Allocate memory and copy the existing spares. */
+ new_spares = kmem_alloc(sizeof (nvlist_t *) *
+ (draid_nspares + old_nspares), KM_SLEEP);
+ for (uint_t i = 0; i < old_nspares; i++)
+ new_spares[i] = fnvlist_dup(old_spares[i]);
+
+ /* Add new distributed spares to ZPOOL_CONFIG_SPARES. */
+ uint64_t n = old_nspares;
+ for (uint64_t vdev_id = 0; vdev_id < vd->vdev_children; vdev_id++) {
+ vdev_t *cvd = vd->vdev_child[vdev_id];
+ char path[64];
+
+ if (cvd->vdev_ops != &vdev_draid_ops)
+ continue;
+
+ vdev_draid_config_t *vdc = cvd->vdev_tsd;
+ uint64_t nspares = vdc->vdc_nspares;
+ uint64_t nparity = vdc->vdc_nparity;
+
+ for (uint64_t spare_id = 0; spare_id < nspares; spare_id++) {
+ bzero(path, sizeof (path));
+ (void) snprintf(path, sizeof (path) - 1,
+ "%s%llu-%llu-%llu", VDEV_TYPE_DRAID,
+ (u_longlong_t)nparity,
+ (u_longlong_t)next_vdev_id + vdev_id,
+ (u_longlong_t)spare_id);
+
+ nvlist_t *spare = fnvlist_alloc();
+ fnvlist_add_string(spare, ZPOOL_CONFIG_PATH, path);
+ fnvlist_add_string(spare, ZPOOL_CONFIG_TYPE,
+ VDEV_TYPE_DRAID_SPARE);
+ fnvlist_add_uint64(spare, ZPOOL_CONFIG_TOP_GUID,
+ cvd->vdev_guid);
+ fnvlist_add_uint64(spare, ZPOOL_CONFIG_SPARE_ID,
+ spare_id);
+ fnvlist_add_uint64(spare, ZPOOL_CONFIG_IS_LOG, 0);
+ fnvlist_add_uint64(spare, ZPOOL_CONFIG_IS_SPARE, 1);
+ fnvlist_add_uint64(spare, ZPOOL_CONFIG_WHOLE_DISK, 1);
+ fnvlist_add_uint64(spare, ZPOOL_CONFIG_ASHIFT,
+ cvd->vdev_ashift);
+
+ new_spares[n] = spare;
+ n++;
+ }
+ }
+
+ if (n > 0) {
+ (void) nvlist_remove_all(nvroot, ZPOOL_CONFIG_SPARES);
+ fnvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
+ new_spares, n);
+ }
+
+ for (int i = 0; i < n; i++)
+ nvlist_free(new_spares[i]);
+
+ kmem_free(new_spares, sizeof (*new_spares) * n);
+ *ndraidp = ndraid;
+
+ return (0);
+}
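+/*
+ * Illustrative example (hypothetical pool): a draid2 top-level vdev that is
+ * assigned id 0 and configured with two distributed spares results in two
+ * new ZPOOL_CONFIG_SPARES entries named "draid2-0-0" and "draid2-0-1",
+ * following the draid<parity>-<top-level id>-<spare id> convention built by
+ * the snprintf() above.
+ */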
+
+/*
+ * Determine if any portion of the provided block resides on a child vdev
+ * with a dirty DTL and therefore needs to be resilvered.
+ */
+static boolean_t
+vdev_draid_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize,
+ uint64_t phys_birth)
+{
+ uint64_t offset = DVA_GET_OFFSET(dva);
+ uint64_t asize = vdev_draid_asize(vd, psize);
+
+ if (phys_birth == TXG_UNKNOWN) {
+ /*
+ * Sequential resilver. There is no meaningful phys_birth
+ * for this block; we can only determine whether the block
+ * resides in a degraded group, in which case it must be
+ * resilvered.
+ */
+ ASSERT3U(vdev_draid_offset_to_group(vd, offset), ==,
+ vdev_draid_offset_to_group(vd, offset + asize - 1));
+
+ return (vdev_draid_group_degraded(vd, offset));
+ } else {
+ /*
+ * Healing resilver. TXGs not in DTL_PARTIAL are intact,
+ * as are blocks in non-degraded groups.
+ */
+ if (!vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1))
+ return (B_FALSE);
+
+ if (vdev_draid_group_missing(vd, offset, phys_birth, 1))
+ return (B_TRUE);
+
+ /* The block may span groups in which case check both. */
+ if (vdev_draid_offset_to_group(vd, offset) !=
+ vdev_draid_offset_to_group(vd, offset + asize - 1)) {
+ if (vdev_draid_group_missing(vd,
+ offset + asize, phys_birth, 1))
+ return (B_TRUE);
+ }
+
+ return (B_FALSE);
+ }
+}
+
+static boolean_t
+vdev_draid_rebuilding(vdev_t *vd)
+{
+ if (vd->vdev_ops->vdev_op_leaf && vd->vdev_rebuild_txg)
+ return (B_TRUE);
+
+ for (int i = 0; i < vd->vdev_children; i++) {
+ if (vdev_draid_rebuilding(vd->vdev_child[i])) {
+ return (B_TRUE);
+ }
+ }
+
+ return (B_FALSE);
+}
+
+static void
+vdev_draid_io_verify(vdev_t *vd, raidz_row_t *rr, int col)
+{
+#ifdef ZFS_DEBUG
+ range_seg64_t logical_rs, physical_rs, remain_rs;
+ logical_rs.rs_start = rr->rr_offset;
+ logical_rs.rs_end = logical_rs.rs_start +
+ vdev_draid_asize(vd, rr->rr_size);
+
+ raidz_col_t *rc = &rr->rr_col[col];
+ vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
+
+ vdev_xlate(cvd, &logical_rs, &physical_rs, &remain_rs);
+ ASSERT(vdev_xlate_is_empty(&remain_rs));
+ ASSERT3U(rc->rc_offset, ==, physical_rs.rs_start);
+ ASSERT3U(rc->rc_offset, <, physical_rs.rs_end);
+ ASSERT3U(rc->rc_offset + rc->rc_size, ==, physical_rs.rs_end);
+#endif
+}
+
+/*
+ * For write operations:
+ * 1. Generate the parity data
+ * 2. Create child zio write operations to each column's vdev, for both
+ * data and parity. A gang ABD is allocated by vdev_draid_map_alloc()
+ * if a skip sector needs to be added to a column.
+ */
+static void
+vdev_draid_io_start_write(zio_t *zio, raidz_row_t *rr)
+{
+ vdev_t *vd = zio->io_vd;
+ raidz_map_t *rm = zio->io_vsd;
+
+ vdev_raidz_generate_parity_row(rm, rr);
+
+ for (int c = 0; c < rr->rr_cols; c++) {
+ raidz_col_t *rc = &rr->rr_col[c];
+
+ /*
+ * Empty columns are zero filled and included in the parity
+ * calculation and therefore must be written.
+ */
+ ASSERT3U(rc->rc_size, !=, 0);
+
+ /* Verify physical to logical translation */
+ vdev_draid_io_verify(vd, rr, c);
+
+ zio_nowait(zio_vdev_child_io(zio, NULL,
+ vd->vdev_child[rc->rc_devidx], rc->rc_offset,
+ rc->rc_abd, rc->rc_size, zio->io_type, zio->io_priority,
+ 0, vdev_raidz_child_done, rc));
+ }
+}
+
+/*
+ * For read operations:
+ * 1. The vdev_draid_map_alloc() function will create a minimal raidz
+ * mapping for the read based on the zio->io_flags. There are two
+ * possible mappings either 1) a normal read, or 2) a scrub/resilver.
+ * 2. Create the zio read operations. This will include all parity
+ * columns and skip sectors for a scrub/resilver.
+ */
+static void
+vdev_draid_io_start_read(zio_t *zio, raidz_row_t *rr)
+{
+ vdev_t *vd = zio->io_vd;
+
+ /* Sequential rebuild must do IO at a redundancy group boundary. */
+ IMPLY(zio->io_priority == ZIO_PRIORITY_REBUILD, rr->rr_nempty == 0);
+
+ /*
+ * Iterate over the columns in reverse order so that we hit the parity
+ * last. Any errors along the way will force us to read the parity.
+ * For scrub/resilver IOs which verify skip sectors, a gang ABD will
+ * have been allocated to store them and rc->rc_size is increased.
+ */
+ for (int c = rr->rr_cols - 1; c >= 0; c--) {
+ raidz_col_t *rc = &rr->rr_col[c];
+ vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
+
+ if (!vdev_draid_readable(cvd, rc->rc_offset)) {
+ if (c >= rr->rr_firstdatacol)
+ rr->rr_missingdata++;
+ else
+ rr->rr_missingparity++;
+ rc->rc_error = SET_ERROR(ENXIO);
+ rc->rc_tried = 1;
+ rc->rc_skipped = 1;
+ continue;
+ }
+
+ if (vdev_draid_missing(cvd, rc->rc_offset, zio->io_txg, 1)) {
+ if (c >= rr->rr_firstdatacol)
+ rr->rr_missingdata++;
+ else
+ rr->rr_missingparity++;
+ rc->rc_error = SET_ERROR(ESTALE);
+ rc->rc_skipped = 1;
+ continue;
+ }
+
+ /*
+ * Empty columns may be read during vdev_draid_io_done().
+ * Only skip them after the readable and missing checks
+ * verify they are available.
+ */
+ if (rc->rc_size == 0) {
+ rc->rc_skipped = 1;
+ continue;
+ }
+
+ if (zio->io_flags & ZIO_FLAG_RESILVER) {
+ vdev_t *svd;
+
+ /*
+ * If this child is a distributed spare then the
+ * offset might reside on the vdev being replaced,
+ * in which case this data must be written to the
+ * new device. Failure to do so would result in
+ * checksum errors when the old device is detached
+ * and the pool is scrubbed.
+ */
+ if ((svd = vdev_draid_find_spare(cvd)) != NULL) {
+ svd = vdev_draid_spare_get_child(svd,
+ rc->rc_offset);
+ if (svd && (svd->vdev_ops == &vdev_spare_ops ||
+ svd->vdev_ops == &vdev_replacing_ops)) {
+ rc->rc_repair = 1;
+ }
+ }
+
+ /*
+ * Always issue a repair IO to this child when it's
+ * a spare or replacing vdev with an active rebuild.
+ */
+ if ((cvd->vdev_ops == &vdev_spare_ops ||
+ cvd->vdev_ops == &vdev_replacing_ops) &&
+ vdev_draid_rebuilding(cvd)) {
+ rc->rc_repair = 1;
+ }
+ }
+ }
+
+ /*
+ * Either a parity or data column is missing, which means a repair
+ * may be attempted by vdev_draid_io_done(). Expand the raid map
+ * to read in empty columns which are needed along with the parity
+ * during reconstruction.
+ */
+ if ((rr->rr_missingdata > 0 || rr->rr_missingparity > 0) &&
+ rr->rr_nempty > 0 && rr->rr_abd_empty == NULL) {
+ vdev_draid_map_alloc_empty(zio, rr);
+ }
+
+ for (int c = rr->rr_cols - 1; c >= 0; c--) {
+ raidz_col_t *rc = &rr->rr_col[c];
+ vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
+
+ if (rc->rc_error || rc->rc_size == 0)
+ continue;
+
+ if (c >= rr->rr_firstdatacol || rr->rr_missingdata > 0 ||
+ (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
+ zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
+ rc->rc_offset, rc->rc_abd, rc->rc_size,
+ zio->io_type, zio->io_priority, 0,
+ vdev_raidz_child_done, rc));
+ }
+ }
+}
+
+/*
+ * Start an IO operation to a dRAID vdev.
+ */
+static void
+vdev_draid_io_start(zio_t *zio)
+{
+ vdev_t *vd __maybe_unused = zio->io_vd;
+ raidz_map_t *rm;
+
+ ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops);
+ ASSERT3U(zio->io_offset, ==, vdev_draid_get_astart(vd, zio->io_offset));
+
+ rm = vdev_draid_map_alloc(zio);
+
+ if (zio->io_type == ZIO_TYPE_WRITE) {
+ for (int i = 0; i < rm->rm_nrows; i++) {
+ vdev_draid_io_start_write(zio, rm->rm_row[i]);
+ }
+ } else {
+ ASSERT(zio->io_type == ZIO_TYPE_READ);
+
+ for (int i = 0; i < rm->rm_nrows; i++) {
+ vdev_draid_io_start_read(zio, rm->rm_row[i]);
+ }
+ }
+
+ zio_execute(zio);
+}
+
+/*
+ * Complete an IO operation on a dRAID vdev. The raidz logic can be applied
+ * to dRAID since the layout is fully described by the raidz_map_t.
+ */
+static void
+vdev_draid_io_done(zio_t *zio)
+{
+ vdev_raidz_io_done(zio);
+}
+
+static void
+vdev_draid_state_change(vdev_t *vd, int faulted, int degraded)
+{
+ vdev_draid_config_t *vdc = vd->vdev_tsd;
+ ASSERT(vd->vdev_ops == &vdev_draid_ops);
+
+ if (faulted > vdc->vdc_nparity)
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_NO_REPLICAS);
+ else if (degraded + faulted != 0)
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
+ else
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
+}
+
+static void
+vdev_draid_xlate(vdev_t *cvd, const range_seg64_t *logical_rs,
+ range_seg64_t *physical_rs, range_seg64_t *remain_rs)
+{
+ vdev_t *raidvd = cvd->vdev_parent;
+ ASSERT(raidvd->vdev_ops == &vdev_draid_ops);
+
+ vdev_draid_config_t *vdc = raidvd->vdev_tsd;
+ uint64_t ashift = raidvd->vdev_top->vdev_ashift;
+
+ /* Make sure the offsets are block-aligned */
+ ASSERT0(logical_rs->rs_start % (1 << ashift));
+ ASSERT0(logical_rs->rs_end % (1 << ashift));
+
+ uint64_t logical_start = logical_rs->rs_start;
+ uint64_t logical_end = logical_rs->rs_end;
+
+ /*
+ * Unaligned ranges must be skipped. All metaslabs are correctly
+ * aligned so this should not happen, but this case is handled in
+ * case it's needed by future callers.
+ */
+ uint64_t astart = vdev_draid_get_astart(raidvd, logical_start);
+ if (astart != logical_start) {
+ physical_rs->rs_start = logical_start;
+ physical_rs->rs_end = logical_start;
+ remain_rs->rs_start = MIN(astart, logical_end);
+ remain_rs->rs_end = logical_end;
+ return;
+ }
+
+ /*
+ * Unlike with mirrors and raidz a dRAID logical range can map
+ * to multiple non-contiguous physical ranges. This is handled by
+ * limiting the size of the logical range to a single group and
+ * setting the remain argument such that it describes the remaining
+ * unmapped logical range. This is stricter than absolutely
+ * necessary but helps simplify the logic below.
+ */
+ uint64_t group = vdev_draid_offset_to_group(raidvd, logical_start);
+ uint64_t nextstart = vdev_draid_group_to_offset(raidvd, group + 1);
+ if (logical_end > nextstart)
+ logical_end = nextstart;
+
+ /* Find the starting offset for each vdev in the group */
+ uint64_t perm, groupstart;
+ uint64_t start = vdev_draid_logical_to_physical(raidvd,
+ logical_start, &perm, &groupstart);
+ uint64_t end = start;
+
+ uint8_t *base;
+ uint64_t iter, id;
+ vdev_draid_get_perm(vdc, perm, &base, &iter);
+
+ /*
+ * Check if the passed child falls within the group. If it does,
+ * update the start and end to reflect the physical range.
+ * Otherwise, leave them unmodified which will result in an empty
+ * (zero-length) physical range being returned.
+ */
+ for (uint64_t i = 0; i < vdc->vdc_groupwidth; i++) {
+ uint64_t c = (groupstart + i) % vdc->vdc_ndisks;
+
+ if (c == 0 && i != 0) {
+ /* the group wrapped, increment the start */
+ start += VDEV_DRAID_ROWHEIGHT;
+ end = start;
+ }
+
+ id = vdev_draid_permute_id(vdc, base, iter, c);
+ if (id == cvd->vdev_id) {
+ uint64_t b_size = (logical_end >> ashift) -
+ (logical_start >> ashift);
+ ASSERT3U(b_size, >, 0);
+ end = start + ((((b_size - 1) /
+ vdc->vdc_groupwidth) + 1) << ashift);
+ break;
+ }
+ }
+ physical_rs->rs_start = start;
+ physical_rs->rs_end = end;
+
+ /*
+ * Only top-level vdevs are allowed to set remain_rs because
+ * when .vdev_op_xlate() is called for their children the full
+ * logical range is not provided by vdev_xlate().
+ */
+ remain_rs->rs_start = logical_end;
+ remain_rs->rs_end = logical_rs->rs_end;
+
+ ASSERT3U(physical_rs->rs_start, <=, logical_start);
+ ASSERT3U(physical_rs->rs_end - physical_rs->rs_start, <=,
+ logical_end - logical_start);
+}
+
+/*
+ * Add dRAID specific fields to the config nvlist.
+ */
+static void
+vdev_draid_config_generate(vdev_t *vd, nvlist_t *nv)
+{
+ ASSERT3P(vd->vdev_ops, ==, &vdev_draid_ops);
+ vdev_draid_config_t *vdc = vd->vdev_tsd;
+
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vdc->vdc_nparity);
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NDATA, vdc->vdc_ndata);
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NSPARES, vdc->vdc_nspares);
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_DRAID_NGROUPS, vdc->vdc_ngroups);
+}
+
+/*
+ * Initialize private dRAID specific fields from the nvlist.
+ */
+static int
+vdev_draid_init(spa_t *spa, nvlist_t *nv, void **tsd)
+{
+ uint64_t ndata, nparity, nspares, ngroups;
+ int error;
+
+ if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DRAID_NDATA, &ndata))
+ return (SET_ERROR(EINVAL));
+
+ if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &nparity) ||
+ nparity == 0 || nparity > VDEV_DRAID_MAXPARITY) {
+ return (SET_ERROR(EINVAL));
+ }
+
+ uint_t children;
+ nvlist_t **child;
+ if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
+ &child, &children) != 0 || children == 0 ||
+ children > VDEV_DRAID_MAX_CHILDREN) {
+ return (SET_ERROR(EINVAL));
+ }
+
+ if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DRAID_NSPARES, &nspares) ||
+ nspares > 100 || nspares > (children - (ndata + nparity))) {
+ return (SET_ERROR(EINVAL));
+ }
+
+ if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DRAID_NGROUPS, &ngroups) ||
+ ngroups == 0 || ngroups > VDEV_DRAID_MAX_CHILDREN) {
+ return (SET_ERROR(EINVAL));
+ }
+
+ /*
+ * Validate the minimum number of children exist per group for the
+ * specified parity level (draid1 >= 2, draid2 >= 3, draid3 >= 4).
+ */
+ if (children < (ndata + nparity + nspares))
+ return (SET_ERROR(EINVAL));
+
+ /*
+ * Create the dRAID configuration using the pool nvlist configuration
+ * and the fixed mapping for the correct number of children.
+ */
+ vdev_draid_config_t *vdc;
+ const draid_map_t *map;
+
+ error = vdev_draid_lookup_map(children, &map);
+ if (error)
+ return (SET_ERROR(EINVAL));
+
+ vdc = kmem_zalloc(sizeof (*vdc), KM_SLEEP);
+ vdc->vdc_ndata = ndata;
+ vdc->vdc_nparity = nparity;
+ vdc->vdc_nspares = nspares;
+ vdc->vdc_children = children;
+ vdc->vdc_ngroups = ngroups;
+ vdc->vdc_nperms = map->dm_nperms;
+
+ error = vdev_draid_generate_perms(map, &vdc->vdc_perms);
+ if (error) {
+ kmem_free(vdc, sizeof (*vdc));
+ return (SET_ERROR(EINVAL));
+ }
+
+ /*
+ * Derived constants.
+ */
+ vdc->vdc_groupwidth = vdc->vdc_ndata + vdc->vdc_nparity;
+ vdc->vdc_ndisks = vdc->vdc_children - vdc->vdc_nspares;
+ vdc->vdc_groupsz = vdc->vdc_groupwidth * VDEV_DRAID_ROWHEIGHT;
+ vdc->vdc_devslicesz = (vdc->vdc_groupsz * vdc->vdc_ngroups) /
+ vdc->vdc_ndisks;
+
+ ASSERT3U(vdc->vdc_groupwidth, >=, 2);
+ ASSERT3U(vdc->vdc_groupwidth, <=, vdc->vdc_ndisks);
+ ASSERT3U(vdc->vdc_groupsz, >=, 2 * VDEV_DRAID_ROWHEIGHT);
+ ASSERT3U(vdc->vdc_devslicesz, >=, VDEV_DRAID_ROWHEIGHT);
+ ASSERT3U(vdc->vdc_devslicesz % VDEV_DRAID_ROWHEIGHT, ==, 0);
+ ASSERT3U((vdc->vdc_groupwidth * vdc->vdc_ngroups) %
+ vdc->vdc_ndisks, ==, 0);
+
+ *tsd = vdc;
+
+ return (0);
+}
+
+static void
+vdev_draid_fini(vdev_t *vd)
+{
+ vdev_draid_config_t *vdc = vd->vdev_tsd;
+
+ vmem_free(vdc->vdc_perms, sizeof (uint8_t) *
+ vdc->vdc_children * vdc->vdc_nperms);
+ kmem_free(vdc, sizeof (*vdc));
+}
+
+static uint64_t
+vdev_draid_nparity(vdev_t *vd)
+{
+ vdev_draid_config_t *vdc = vd->vdev_tsd;
+
+ return (vdc->vdc_nparity);
+}
+
+static uint64_t
+vdev_draid_ndisks(vdev_t *vd)
+{
+ vdev_draid_config_t *vdc = vd->vdev_tsd;
+
+ return (vdc->vdc_ndisks);
+}
+
+vdev_ops_t vdev_draid_ops = {
+ .vdev_op_init = vdev_draid_init,
+ .vdev_op_fini = vdev_draid_fini,
+ .vdev_op_open = vdev_draid_open,
+ .vdev_op_close = vdev_draid_close,
+ .vdev_op_asize = vdev_draid_asize,
+ .vdev_op_min_asize = vdev_draid_min_asize,
+ .vdev_op_min_alloc = vdev_draid_min_alloc,
+ .vdev_op_io_start = vdev_draid_io_start,
+ .vdev_op_io_done = vdev_draid_io_done,
+ .vdev_op_state_change = vdev_draid_state_change,
+ .vdev_op_need_resilver = vdev_draid_need_resilver,
+ .vdev_op_hold = NULL,
+ .vdev_op_rele = NULL,
+ .vdev_op_remap = NULL,
+ .vdev_op_xlate = vdev_draid_xlate,
+ .vdev_op_rebuild_asize = vdev_draid_rebuild_asize,
+ .vdev_op_metaslab_init = vdev_draid_metaslab_init,
+ .vdev_op_config_generate = vdev_draid_config_generate,
+ .vdev_op_nparity = vdev_draid_nparity,
+ .vdev_op_ndisks = vdev_draid_ndisks,
+ .vdev_op_type = VDEV_TYPE_DRAID,
+ .vdev_op_leaf = B_FALSE,
+};
+
+
+/*
+ * A dRAID distributed spare is a virtual leaf vdev which is included in the
+ * parent dRAID configuration. The last N columns of the dRAID permutation
+ * table are used to determine on which dRAID children a specific offset
+ * should be written. These spare leaf vdevs can only be used to replace
+ * faulted children in the same dRAID configuration.
+ */
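+
+/*
+ * For example (illustrative numbers only): with 11 children and 2
+ * distributed spares, spare id 0 occupies permuted column 10 and spare
+ * id 1 occupies permuted column 9 of each permutation row, i.e. column
+ * (children - 1) - spare_id.
+ */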
+
+/*
+ * Distributed spare state. All fields are set when the distributed spare is
+ * first opened and are immutable.
+ */
+typedef struct {
+ vdev_t *vds_draid_vdev; /* top-level parent dRAID vdev */
+ uint64_t vds_top_guid; /* top-level parent dRAID guid */
+ uint64_t vds_spare_id; /* spare id (0 - vdc->vdc_nspares-1) */
+} vdev_draid_spare_t;
+
+/*
+ * Returns the parent dRAID vdev to which the distributed spare belongs.
+ * This may be safely called even when the vdev is not open.
+ */
+vdev_t *
+vdev_draid_spare_get_parent(vdev_t *vd)
+{
+ vdev_draid_spare_t *vds = vd->vdev_tsd;
+
+ ASSERT3P(vd->vdev_ops, ==, &vdev_draid_spare_ops);
+
+ if (vds->vds_draid_vdev != NULL)
+ return (vds->vds_draid_vdev);
+
+ return (vdev_lookup_by_guid(vd->vdev_spa->spa_root_vdev,
+ vds->vds_top_guid));
+}
+
+/*
+ * A dRAID spare is active when it's the child of a vdev using the
+ * vdev_spare_ops, vdev_replacing_ops or vdev_draid_ops.
+ */
+static boolean_t
+vdev_draid_spare_is_active(vdev_t *vd)
+{
+ vdev_t *pvd = vd->vdev_parent;
+
+ if (pvd != NULL && (pvd->vdev_ops == &vdev_spare_ops ||
+ pvd->vdev_ops == &vdev_replacing_ops ||
+ pvd->vdev_ops == &vdev_draid_ops)) {
+ return (B_TRUE);
+ } else {
+ return (B_FALSE);
+ }
+}
+
+/*
+ * Given a dRAID distributed spare vdev, returns the physical child vdev
+ * on which the provided offset resides. This may involve recursing through
+ * multiple layers of distributed spares. Note that offset is relative to
+ * this vdev.
+ */
+vdev_t *
+vdev_draid_spare_get_child(vdev_t *vd, uint64_t physical_offset)
+{
+ vdev_draid_spare_t *vds = vd->vdev_tsd;
+
+ ASSERT3P(vd->vdev_ops, ==, &vdev_draid_spare_ops);
+
+ /* The vdev is closed */
+ if (vds->vds_draid_vdev == NULL)
+ return (NULL);
+
+ vdev_t *tvd = vds->vds_draid_vdev;
+ vdev_draid_config_t *vdc = tvd->vdev_tsd;
+
+ ASSERT3P(tvd->vdev_ops, ==, &vdev_draid_ops);
+ ASSERT3U(vds->vds_spare_id, <, vdc->vdc_nspares);
+
+ uint8_t *base;
+ uint64_t iter;
+ uint64_t perm = physical_offset / vdc->vdc_devslicesz;
+
+ vdev_draid_get_perm(vdc, perm, &base, &iter);
+
+ uint64_t cid = vdev_draid_permute_id(vdc, base, iter,
+ (tvd->vdev_children - 1) - vds->vds_spare_id);
+ vdev_t *cvd = tvd->vdev_child[cid];
+
+ if (cvd->vdev_ops == &vdev_draid_spare_ops)
+ return (vdev_draid_spare_get_child(cvd, physical_offset));
+
+ return (cvd);
+}
+
+/* ARGSUSED */
+static void
+vdev_draid_spare_close(vdev_t *vd)
+{
+ vdev_draid_spare_t *vds = vd->vdev_tsd;
+ vds->vds_draid_vdev = NULL;
+}
+
+/*
+ * Opening a dRAID spare device is done by looking up the associated dRAID
+ * top-level vdev guid from the spare configuration.
+ */
+static int
+vdev_draid_spare_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
+ uint64_t *logical_ashift, uint64_t *physical_ashift)
+{
+ vdev_draid_spare_t *vds = vd->vdev_tsd;
+ vdev_t *rvd = vd->vdev_spa->spa_root_vdev;
+ uint64_t asize, max_asize;
+
+ vdev_t *tvd = vdev_lookup_by_guid(rvd, vds->vds_top_guid);
+ if (tvd == NULL) {
+ /*
+ * When spa_vdev_add() is labeling new spares, the
+ * associated dRAID is not attached to the root vdev,
+ * nor does this spare have a parent. Simulate a valid
+ * device in order to allow the label to be initialized
+ * and the distributed spare added to the configuration.
+ */
+ if (vd->vdev_parent == NULL) {
+ *psize = *max_psize = SPA_MINDEVSIZE;
+ *logical_ashift = *physical_ashift = ASHIFT_MIN;
+ return (0);
+ }
+
+ return (SET_ERROR(EINVAL));
+ }
+
+ vdev_draid_config_t *vdc = tvd->vdev_tsd;
+ if (tvd->vdev_ops != &vdev_draid_ops || vdc == NULL)
+ return (SET_ERROR(EINVAL));
+
+ if (vds->vds_spare_id >= vdc->vdc_nspares)
+ return (SET_ERROR(EINVAL));
+
+ /*
+ * Neither tvd->vdev_asize nor tvd->vdev_max_asize can be used here
+ * because the caller may be vdev_draid_open() in which case the
+ * values are stale as they haven't yet been updated by vdev_open().
+ * To avoid this always recalculate the dRAID asize and max_asize.
+ */
+ vdev_draid_calculate_asize(tvd, &asize, &max_asize,
+ logical_ashift, physical_ashift);
+
+ *psize = asize + VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE;
+ *max_psize = max_asize + VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE;
+
+ vds->vds_draid_vdev = tvd;
+
+ return (0);
+}
+
+/*
+ * Completed distributed spare IO. Store the result in the parent zio
+ * as if it had performed the operation itself. Only the first error is
+ * preserved if there are multiple errors.
+ */
+static void
+vdev_draid_spare_child_done(zio_t *zio)
+{
+ zio_t *pio = zio->io_private;
+
+ /*
+ * IOs are issued to non-writable vdevs in order to keep their
+ * DTLs accurate. However, we don't want to propagate the
+ * error into the distributed spare's DTL. When resilvering
+ * vdev_draid_need_resilver() will consult the relevant DTL
+ * to determine if the data is missing and must be repaired.
+ */
+ if (!vdev_writeable(zio->io_vd))
+ return;
+
+ if (pio->io_error == 0)
+ pio->io_error = zio->io_error;
+}
+
+/*
+ * Returns a valid label nvlist for the distributed spare vdev. This is
+ * used to bypass the IO pipeline to avoid the complexity of constructing
+ * a complete label with valid checksum to return when read.
+ */
+nvlist_t *
+vdev_draid_read_config_spare(vdev_t *vd)
+{
+ spa_t *spa = vd->vdev_spa;
+ spa_aux_vdev_t *sav = &spa->spa_spares;
+ uint64_t guid = vd->vdev_guid;
+
+ nvlist_t *nv = fnvlist_alloc();
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_SPARE, 1);
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_CREATE_TXG, vd->vdev_crtxg);
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_VERSION, spa_version(spa));
+ fnvlist_add_string(nv, ZPOOL_CONFIG_POOL_NAME, spa_name(spa));
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_POOL_GUID, spa_guid(spa));
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_POOL_TXG, spa->spa_config_txg);
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_TOP_GUID, vd->vdev_top->vdev_guid);
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_POOL_STATE,
+ vdev_draid_spare_is_active(vd) ?
+ POOL_STATE_ACTIVE : POOL_STATE_SPARE);
+
+ /* Set the vdev guid based on the matching entry in the sav_vdevs list. */
+ for (int i = 0; i < sav->sav_count; i++) {
+ if (sav->sav_vdevs[i]->vdev_ops == &vdev_draid_spare_ops &&
+ strcmp(sav->sav_vdevs[i]->vdev_path, vd->vdev_path) == 0) {
+ guid = sav->sav_vdevs[i]->vdev_guid;
+ break;
+ }
+ }
+
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_GUID, guid);
+
+ return (nv);
+}
+
+/*
+ * Handle any ioctl requested of the distributed spare. Only flushes
+ * are supported, in which case all children must be flushed.
+ */
+static int
+vdev_draid_spare_ioctl(zio_t *zio)
+{
+ vdev_t *vd = zio->io_vd;
+ int error = 0;
+
+ if (zio->io_cmd == DKIOCFLUSHWRITECACHE) {
+ for (int c = 0; c < vd->vdev_children; c++) {
+ zio_nowait(zio_vdev_child_io(zio, NULL,
+ vd->vdev_child[c], zio->io_offset, zio->io_abd,
+ zio->io_size, zio->io_type, zio->io_priority, 0,
+ vdev_draid_spare_child_done, zio));
+ }
+ } else {
+ error = SET_ERROR(ENOTSUP);
+ }
+
+ return (error);
+}
+
+/*
+ * Initiate an IO to the distributed spare. For normal IOs this entails using
+ * the zio->io_offset and permutation table to calculate which child dRAID vdev
+ * is responsible for the data, and then passing the zio along to that child
+ * to perform the actual IO. The label ranges are not stored on disk and require
+ * some special handling which is described below.
+ */
+static void
+vdev_draid_spare_io_start(zio_t *zio)
+{
+ vdev_t *cvd = NULL, *vd = zio->io_vd;
+ vdev_draid_spare_t *vds = vd->vdev_tsd;
+ uint64_t offset = zio->io_offset - VDEV_LABEL_START_SIZE;
+
+ /*
+ * If the vdev is closed, it's likely in the REMOVED or FAULTED state.
+ * Nothing to be done here but return failure.
+ */
+ if (vds == NULL) {
+ zio->io_error = ENXIO;
+ zio_interrupt(zio);
+ return;
+ }
+
+ switch (zio->io_type) {
+ case ZIO_TYPE_IOCTL:
+ zio->io_error = vdev_draid_spare_ioctl(zio);
+ break;
+
+ case ZIO_TYPE_WRITE:
+ if (VDEV_OFFSET_IS_LABEL(vd, zio->io_offset)) {
+ /*
+ * Accept probe IOs and config writers to simulate the
+ * existence of an on disk label. vdev_label_sync(),
+ * vdev_uberblock_sync() and vdev_copy_uberblocks()
+ * skip the distributed spares. This only leaves
+ * vdev_label_init() which is allowed to succeed to
+ * avoid adding special cases to the function.
+ */
+ if (zio->io_flags & ZIO_FLAG_PROBE ||
+ zio->io_flags & ZIO_FLAG_CONFIG_WRITER) {
+ zio->io_error = 0;
+ } else {
+ zio->io_error = SET_ERROR(EIO);
+ }
+ } else {
+ cvd = vdev_draid_spare_get_child(vd, offset);
+
+ if (cvd == NULL) {
+ zio->io_error = SET_ERROR(ENXIO);
+ } else {
+ zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
+ offset, zio->io_abd, zio->io_size,
+ zio->io_type, zio->io_priority, 0,
+ vdev_draid_spare_child_done, zio));
+ }
+ }
+ break;
+
+ case ZIO_TYPE_READ:
+ if (VDEV_OFFSET_IS_LABEL(vd, zio->io_offset)) {
+ /*
+ * Accept probe IOs to simulate the existence of a
+ * label. vdev_label_read_config() bypasses the
+ * pipeline to read the label configuration and
+ * vdev_uberblock_load() skips distributed spares
+ * when attempting to locate the best uberblock.
+ */
+ if (zio->io_flags & ZIO_FLAG_PROBE) {
+ zio->io_error = 0;
+ } else {
+ zio->io_error = SET_ERROR(EIO);
+ }
+ } else {
+ cvd = vdev_draid_spare_get_child(vd, offset);
+
+ if (cvd == NULL || !vdev_readable(cvd)) {
+ zio->io_error = SET_ERROR(ENXIO);
+ } else {
+ zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
+ offset, zio->io_abd, zio->io_size,
+ zio->io_type, zio->io_priority, 0,
+ vdev_draid_spare_child_done, zio));
+ }
+ }
+ break;
+
+ case ZIO_TYPE_TRIM:
+ /* The vdev label ranges are never trimmed */
+ ASSERT0(VDEV_OFFSET_IS_LABEL(vd, zio->io_offset));
+
+ cvd = vdev_draid_spare_get_child(vd, offset);
+
+ if (cvd == NULL || !cvd->vdev_has_trim) {
+ zio->io_error = SET_ERROR(ENXIO);
+ } else {
+ zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
+ offset, zio->io_abd, zio->io_size,
+ zio->io_type, zio->io_priority, 0,
+ vdev_draid_spare_child_done, zio));
+ }
+ break;
+
+ default:
+ zio->io_error = SET_ERROR(ENOTSUP);
+ break;
+ }
+
+ zio_execute(zio);
+}
+
+/* ARGSUSED */
+static void
+vdev_draid_spare_io_done(zio_t *zio)
+{
+}
+
+/*
+ * Lookup the full spare config in spa->spa_spares.sav_config and
+ * return the top_guid and spare_id for the named spare.
+ */
+static int
+vdev_draid_spare_lookup(spa_t *spa, nvlist_t *nv, uint64_t *top_guidp,
+ uint64_t *spare_idp)
+{
+ nvlist_t **spares;
+ uint_t nspares;
+ int error;
+
+ if ((spa->spa_spares.sav_config == NULL) ||
+ (nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
+ ZPOOL_CONFIG_SPARES, &spares, &nspares) != 0)) {
+ return (SET_ERROR(ENOENT));
+ }
+
+ char *spare_name;
+ error = nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &spare_name);
+ if (error != 0)
+ return (SET_ERROR(EINVAL));
+
+ for (int i = 0; i < nspares; i++) {
+ nvlist_t *spare = spares[i];
+ uint64_t top_guid, spare_id;
+ char *type, *path;
+
+ /* Skip non-distributed spares */
+ error = nvlist_lookup_string(spare, ZPOOL_CONFIG_TYPE, &type);
+ if (error != 0 || strcmp(type, VDEV_TYPE_DRAID_SPARE) != 0)
+ continue;
+
+ /* Skip spares with the wrong name */
+ error = nvlist_lookup_string(spare, ZPOOL_CONFIG_PATH, &path);
+ if (error != 0 || strcmp(path, spare_name) != 0)
+ continue;
+
+ /* Found the matching spare */
+ error = nvlist_lookup_uint64(spare,
+ ZPOOL_CONFIG_TOP_GUID, &top_guid);
+ if (error == 0) {
+ error = nvlist_lookup_uint64(spare,
+ ZPOOL_CONFIG_SPARE_ID, &spare_id);
+ }
+
+ if (error != 0) {
+ return (SET_ERROR(EINVAL));
+ } else {
+ *top_guidp = top_guid;
+ *spare_idp = spare_id;
+ return (0);
+ }
+ }
+
+ return (SET_ERROR(ENOENT));
+}
+
+/*
+ * Initialize private dRAID spare specific fields from the nvlist.
+ */
+static int
+vdev_draid_spare_init(spa_t *spa, nvlist_t *nv, void **tsd)
+{
+ vdev_draid_spare_t *vds;
+ uint64_t top_guid = 0;
+ uint64_t spare_id;
+
+ /*
+ * In the normal case, check the list of spares stored in the spa
+ * to look up the top_guid and spare_id for the provided spare config.
+ * When creating a new pool or adding vdevs the spare list is not
+ * yet populated and the values are provided in the passed config.
+ */
+ if (vdev_draid_spare_lookup(spa, nv, &top_guid, &spare_id) != 0) {
+ if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_TOP_GUID,
+ &top_guid) != 0)
+ return (SET_ERROR(EINVAL));
+
+ if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_SPARE_ID,
+ &spare_id) != 0)
+ return (SET_ERROR(EINVAL));
+ }
+
+ vds = kmem_alloc(sizeof (vdev_draid_spare_t), KM_SLEEP);
+ vds->vds_draid_vdev = NULL;
+ vds->vds_top_guid = top_guid;
+ vds->vds_spare_id = spare_id;
+
+ *tsd = vds;
+
+ return (0);
+}
+
+static void
+vdev_draid_spare_fini(vdev_t *vd)
+{
+ kmem_free(vd->vdev_tsd, sizeof (vdev_draid_spare_t));
+}
+
+static void
+vdev_draid_spare_config_generate(vdev_t *vd, nvlist_t *nv)
+{
+ vdev_draid_spare_t *vds = vd->vdev_tsd;
+
+ ASSERT3P(vd->vdev_ops, ==, &vdev_draid_spare_ops);
+
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_TOP_GUID, vds->vds_top_guid);
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_SPARE_ID, vds->vds_spare_id);
+}
+
+vdev_ops_t vdev_draid_spare_ops = {
+ .vdev_op_init = vdev_draid_spare_init,
+ .vdev_op_fini = vdev_draid_spare_fini,
+ .vdev_op_open = vdev_draid_spare_open,
+ .vdev_op_close = vdev_draid_spare_close,
+ .vdev_op_asize = vdev_default_asize,
+ .vdev_op_min_asize = vdev_default_min_asize,
+ .vdev_op_min_alloc = NULL,
+ .vdev_op_io_start = vdev_draid_spare_io_start,
+ .vdev_op_io_done = vdev_draid_spare_io_done,
+ .vdev_op_state_change = NULL,
+ .vdev_op_need_resilver = NULL,
+ .vdev_op_hold = NULL,
+ .vdev_op_rele = NULL,
+ .vdev_op_remap = NULL,
+ .vdev_op_xlate = vdev_default_xlate,
+ .vdev_op_rebuild_asize = NULL,
+ .vdev_op_metaslab_init = NULL,
+ .vdev_op_config_generate = vdev_draid_spare_config_generate,
+ .vdev_op_nparity = NULL,
+ .vdev_op_ndisks = NULL,
+ .vdev_op_type = VDEV_TYPE_DRAID_SPARE,
+ .vdev_op_leaf = B_TRUE,
+};
diff --git a/sys/contrib/openzfs/module/zfs/vdev_draid_rand.c b/sys/contrib/openzfs/module/zfs/vdev_draid_rand.c
new file mode 100644
index 000000000000..fe1a75c11312
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/vdev_draid_rand.c
@@ -0,0 +1,40 @@
+/*
+ * Xorshift Pseudo Random Number Generator based on work by David Blackman
+ * and Sebastiano Vigna (vigna@acm.org).
+ *
+ * "Further scramblings of Marsaglia's xorshift generators"
+ * http://vigna.di.unimi.it/ftp/papers/xorshiftplus.pdf
+ * http://prng.di.unimi.it/xoroshiro128plusplus.c
+ *
+ * To the extent possible under law, the author has dedicated all copyright
+ * and related and neighboring rights to this software to the public domain
+ * worldwide. This software is distributed without any warranty.
+ *
+ * See <http://creativecommons.org/publicdomain/zero/1.0/>.
+ *
+ * This is xoroshiro128++ 1.0, one of our all-purpose, rock-solid,
+ * small-state generators. It is extremely (sub-ns) fast and it passes all
+ * tests we are aware of, but its state space is large enough only for
+ * mild parallelism.
+ */
+
+#include <sys/vdev_draid.h>
+
+static inline uint64_t rotl(const uint64_t x, int k)
+{
+ return (x << k) | (x >> (64 - k));
+}
+
+uint64_t
+vdev_draid_rand(uint64_t *s)
+{
+ const uint64_t s0 = s[0];
+ uint64_t s1 = s[1];
+ const uint64_t result = rotl(s0 + s1, 17) + s0;
+
+ s1 ^= s0;
+ s[0] = rotl(s0, 49) ^ s1 ^ (s1 << 21); // a, b
+ s[1] = rotl(s1, 28); // c
+
+ return (result);
+}
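+
+/*
+ * Illustrative usage sketch, not part of the original patch: the seed
+ * values and the shuffle loop below are hypothetical. Callers provide
+ * the 128-bit state as two nonzero 64-bit words and draw successive
+ * values, e.g. for a Fisher-Yates style shuffle of an array 'a' of
+ * length n (modulo bias ignored for brevity):
+ *
+ *	uint64_t state[2] = { 0x9E3779B97F4A7C15ULL, 0xD1B54A32D192ED03ULL };
+ *	for (uint64_t i = n - 1; i > 0; i--) {
+ *		uint64_t j = vdev_draid_rand(state) % (i + 1);
+ *		uint64_t tmp = a[i]; a[i] = a[j]; a[j] = tmp;
+ *	}
+ */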
diff --git a/sys/contrib/openzfs/module/zfs/vdev_indirect.c b/sys/contrib/openzfs/module/zfs/vdev_indirect.c
new file mode 100644
index 000000000000..b26d0993711a
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/vdev_indirect.c
@@ -0,0 +1,1911 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2014, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2019, loli10K <ezomori.nozomu@gmail.com>. All rights reserved.
+ * Copyright (c) 2014, 2020 by Delphix. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/vdev_impl.h>
+#include <sys/fs/zfs.h>
+#include <sys/zio.h>
+#include <sys/zio_checksum.h>
+#include <sys/metaslab.h>
+#include <sys/dmu.h>
+#include <sys/vdev_indirect_mapping.h>
+#include <sys/dmu_tx.h>
+#include <sys/dsl_synctask.h>
+#include <sys/zap.h>
+#include <sys/abd.h>
+#include <sys/zthr.h>
+
+/*
+ * An indirect vdev corresponds to a vdev that has been removed. Since
+ * we cannot rewrite block pointers of snapshots, etc., we keep a
+ * mapping from old location on the removed device to the new location
+ * on another device in the pool and use this mapping whenever we need
+ * to access the DVA. Unfortunately, this mapping did not respect
+ * logical block boundaries when it was first created, and so a DVA on
+ * this indirect vdev may be "split" into multiple sections that each
+ * map to a different location. As a consequence, not all DVAs can be
+ * translated to an equivalent new DVA. Instead we must provide a
+ * "vdev_remap" operation that executes a callback on each contiguous
+ * segment of the new location. This function is used in multiple ways:
+ *
+ * - i/os to this vdev use the callback to determine where the
+ * data is now located, and issue child i/os for each segment's new
+ * location.
+ *
+ * - frees and claims to this vdev use the callback to free or claim
+ * each mapped segment. (Note that we don't actually need to claim
+ * log blocks on indirect vdevs, because we don't allocate to
+ * removing vdevs. However, zdb uses zio_claim() for its leak
+ * detection.)
+ */
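+
+/*
+ * Minimal sketch of a remap callback (illustrative only; the name
+ * example_remap_cb is hypothetical, but the signature matches the func
+ * argument taken by vdev_indirect_remap() below):
+ *
+ *	static void
+ *	example_remap_cb(uint64_t split_offset, vdev_t *vd, uint64_t offset,
+ *	    uint64_t size, void *arg)
+ *	{
+ *		(invoked once per contiguous segment of the new location)
+ *	}
+ */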
+
+/*
+ * "Big theory statement" for how we mark blocks obsolete.
+ *
+ * When a block on an indirect vdev is freed or remapped, a section of
+ * that vdev's mapping may no longer be referenced (aka "obsolete"). We
+ * keep track of how much of each mapping entry is obsolete. When
+ * an entry becomes completely obsolete, we can remove it, thus reducing
+ * the memory used by the mapping. The complete picture of obsolescence
+ * is given by the following data structures, described below:
+ * - the entry-specific obsolete count
+ * - the vdev-specific obsolete spacemap
+ * - the pool-specific obsolete bpobj
+ *
+ * == On disk data structures used ==
+ *
+ * We track the obsolete space for the pool using several objects. Each
+ * of these objects is created on demand and freed when no longer
+ * needed, and is assumed to be empty if it does not exist.
+ * SPA_FEATURE_OBSOLETE_COUNTS includes the count of these objects.
+ *
+ * - Each vic_mapping_object (associated with an indirect vdev) can
+ * have a vimp_counts_object. This is an array of uint32_t's
+ * with the same number of entries as the vic_mapping_object. When
+ * the mapping is condensed, entries from the vic_obsolete_sm_object
+ * (see below) are folded into the counts. Therefore, each
+ * obsolete_counts entry tells us the number of bytes in the
+ * corresponding mapping entry that were not referenced when the
+ * mapping was last condensed.
+ *
+ * - Each indirect or removing vdev can have a vic_obsolete_sm_object.
+ * This is a space map containing an alloc entry for every DVA that
+ * has been obsoleted since the last time this indirect vdev was
+ * condensed. We use this object in order to improve performance
+ * when marking a DVA as obsolete. Instead of modifying an arbitrary
+ * offset of the vimp_counts_object, we only need to append an entry
+ * to the end of this object. When a DVA becomes obsolete, it is
+ * added to the obsolete space map. This happens when the DVA is
+ * freed, remapped and not referenced by a snapshot, or the last
+ * snapshot referencing it is destroyed.
+ *
+ * - Each dataset can have a ds_remap_deadlist object. This is a
+ * deadlist object containing all blocks that were remapped in this
+ * dataset but referenced in a previous snapshot. Blocks can *only*
+ * appear on this list if they were remapped (dsl_dataset_block_remapped);
+ * blocks that were killed in a head dataset are put on the normal
+ * ds_deadlist and marked obsolete when they are freed.
+ *
+ * - The pool can have a dp_obsolete_bpobj. This is a list of blocks
+ * in the pool that need to be marked obsolete. When a snapshot is
+ * destroyed, we move some of the ds_remap_deadlist to the obsolete
+ * bpobj (see dsl_destroy_snapshot_handle_remaps()). We then
+ * asynchronously process the obsolete bpobj, moving its entries to
+ * the specific vdevs' obsolete space maps.
+ *
+ * == Summary of how we mark blocks as obsolete ==
+ *
+ * - When freeing a block: if any DVA is on an indirect vdev, append to
+ * vic_obsolete_sm_object.
+ * - When remapping a block, add dva to ds_remap_deadlist (if prev snap
+ * references; otherwise append to vic_obsolete_sm_object).
+ * - When freeing a snapshot: move parts of ds_remap_deadlist to
+ * dp_obsolete_bpobj (same algorithm as ds_deadlist).
+ * - When syncing the spa: process dp_obsolete_bpobj, moving ranges to
+ * individual vdev's vic_obsolete_sm_object.
+ */
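+
+/*
+ * Concrete example (illustrative only; the DVA values are hypothetical):
+ * freeing a block whose DVA is <vdev=3, offset=0x8000, asize=0x20000>,
+ * where vdev 3 is indirect, appends an alloc record for the range
+ * [0x8000, 0x28000) to vdev 3's vic_obsolete_sm_object; that record is
+ * folded into the corresponding obsolete_counts entries the next time
+ * vdev 3's mapping is condensed.
+ */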
+
+/*
+ * "Big theory statement" for how we condense indirect vdevs.
+ *
+ * Condensing an indirect vdev's mapping is the process of determining
+ * the precise counts of obsolete space for each mapping entry (by
+ * integrating the obsolete spacemap into the obsolete counts) and
+ * writing out a new mapping that contains only referenced entries.
+ *
+ * We condense a vdev when we expect the mapping to shrink (see
+ * vdev_indirect_should_condense()), but only perform one condense at a
+ * time to limit the memory usage. In addition, we use a separate
+ * open-context thread (spa_condense_indirect_thread) to incrementally
+ * create the new mapping object in a way that minimizes the impact on
+ * the rest of the system.
+ *
+ * == Generating a new mapping ==
+ *
+ * To generate a new mapping, we follow these steps:
+ *
+ * 1. Save the old obsolete space map and create a new mapping object
+ * (see spa_condense_indirect_start_sync()). This initializes the
+ * spa_condensing_indirect_phys with the "previous obsolete space map",
+ * which is now read only. Newly obsolete DVAs will be added to a
+ * new (initially empty) obsolete space map, and will not be
+ * considered as part of this condense operation.
+ *
+ * 2. Construct in memory the precise counts of obsolete space for each
+ * mapping entry, by incorporating the obsolete space map into the
+ * counts. (See vdev_indirect_mapping_load_obsolete_{counts,spacemap}().)
+ *
+ * 3. Iterate through each mapping entry, writing to the new mapping any
+ * entries that are not completely obsolete (i.e. which don't have
+ * obsolete count == mapping length). (See
+ * spa_condense_indirect_generate_new_mapping().)
+ *
+ * 4. Destroy the old mapping object and switch over to the new one
+ * (spa_condense_indirect_complete_sync).
+ *
+ * == Restarting from failure ==
+ *
+ * To restart the condense when we import/open the pool, we must start
+ * at the 2nd step above: reconstruct the precise counts in memory,
+ * based on the space map + counts. Then in the 3rd step, we start
+ * iterating where we left off: at vimp_max_offset of the new mapping
+ * object.
+ */
+
+int zfs_condense_indirect_vdevs_enable = B_TRUE;
+
+/*
+ * Condense if at least this percent of the bytes in the mapping is
+ * obsolete. With the default of 25%, the amount of space mapped
+ * will be reduced to 1% of its original size after at most 16
+ * condenses, since 0.75^16 ~= 0.01. Higher values will condense less
+ * often (causing less
+ * i/o); lower values will reduce the mapping size more quickly.
+ */
+int zfs_indirect_condense_obsolete_pct = 25;
+
+/*
+ * Condense if the obsolete space map takes up more than this amount of
+ * space on disk (logically). This limits the amount of disk space
+ * consumed by the obsolete space map; the default of 1GB is small enough
+ * that we typically don't mind "wasting" it.
+ */
+unsigned long zfs_condense_max_obsolete_bytes = 1024 * 1024 * 1024;
+
+/*
+ * Don't bother condensing if the mapping uses less than this amount of
+ * memory. The default of 128KB is considered a "trivial" amount of
+ * memory and not worth reducing.
+ */
+unsigned long zfs_condense_min_mapping_bytes = 128 * 1024;
+
+/*
+ * This is used by the test suite so that it can ensure that certain
+ * actions happen while in the middle of a condense (which might otherwise
+ * complete too quickly). If used to reduce the performance impact of
+ * condensing in production, a maximum value of 1 should be sufficient.
+ */
+int zfs_condense_indirect_commit_entry_delay_ms = 0;
+
+/*
+ * If an indirect split block contains more than this many possible unique
+ * combinations when being reconstructed, consider it too computationally
+ * expensive to check them all. Instead, try at most 100 randomly-selected
+ * combinations each time the block is accessed. This allows all segment
+ * copies to participate fairly in the reconstruction when all combinations
+ * cannot be checked and prevents repeated use of one bad copy.
+ */
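+/*
+ * For example (illustrative only): a block split into 12 segments, each
+ * with 2 unique mirror copies, has 2^12 = 4096 unique combinations and is
+ * still checked exhaustively with the default limit; a 13th split (8192
+ * combinations) would switch reconstruction to random sampling.
+ */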
+int zfs_reconstruct_indirect_combinations_max = 4096;
+
+/*
+ * Enable to simulate damaged segments and validate reconstruction. This
+ * is intentionally not exposed as a module parameter.
+ */
+unsigned long zfs_reconstruct_indirect_damage_fraction = 0;
+
+/*
+ * The indirect_child_t represents the vdev that we will read from, when we
+ * need to read all copies of the data (e.g. for scrub or reconstruction).
+ * For plain (non-mirror) top-level vdevs (i.e. is_vdev is not a mirror),
+ * ic_vdev is the same as is_vdev. However, for mirror top-level vdevs,
+ * ic_vdev is a child of the mirror.
+ */
+typedef struct indirect_child {
+ abd_t *ic_data;
+ vdev_t *ic_vdev;
+
+ /*
+ * ic_duplicate is NULL when the ic_data contents are unique; when it
+ * is determined to be a duplicate, it references the primary child.
+ */
+ struct indirect_child *ic_duplicate;
+ list_node_t ic_node; /* node on is_unique_child */
+ int ic_error; /* set when a child does not contain the data */
+} indirect_child_t;
+
+/*
+ * The indirect_split_t represents one mapped segment of an i/o to the
+ * indirect vdev. For non-split (contiguously-mapped) blocks, there will be
+ * only one indirect_split_t, with is_split_offset==0 and is_size==io_size.
+ * For split blocks, there will be several of these.
+ */
+typedef struct indirect_split {
+ list_node_t is_node; /* link on iv_splits */
+
+ /*
+ * is_split_offset is the offset into the i/o.
+ * This is the sum of the previous splits' is_size's.
+ */
+ uint64_t is_split_offset;
+
+ vdev_t *is_vdev; /* top-level vdev */
+ uint64_t is_target_offset; /* offset on is_vdev */
+ uint64_t is_size;
+ int is_children; /* number of entries in is_child[] */
+ int is_unique_children; /* number of entries in is_unique_child */
+ list_t is_unique_child;
+
+ /*
+ * is_good_child is the child that we are currently using to
+ * attempt reconstruction.
+ */
+ indirect_child_t *is_good_child;
+
+ indirect_child_t is_child[1]; /* variable-length */
+} indirect_split_t;
+
+/*
+ * The indirect_vsd_t is associated with each i/o to the indirect vdev.
+ * It is the "Vdev-Specific Data" in the zio_t's io_vsd.
+ */
+typedef struct indirect_vsd {
+ boolean_t iv_split_block;
+ boolean_t iv_reconstruct;
+ uint64_t iv_unique_combinations;
+ uint64_t iv_attempts;
+ uint64_t iv_attempts_max;
+
+ list_t iv_splits; /* list of indirect_split_t's */
+} indirect_vsd_t;
+
+static void
+vdev_indirect_map_free(zio_t *zio)
+{
+ indirect_vsd_t *iv = zio->io_vsd;
+
+ indirect_split_t *is;
+ while ((is = list_head(&iv->iv_splits)) != NULL) {
+ for (int c = 0; c < is->is_children; c++) {
+ indirect_child_t *ic = &is->is_child[c];
+ if (ic->ic_data != NULL)
+ abd_free(ic->ic_data);
+ }
+ list_remove(&iv->iv_splits, is);
+
+ indirect_child_t *ic;
+ while ((ic = list_head(&is->is_unique_child)) != NULL)
+ list_remove(&is->is_unique_child, ic);
+
+ list_destroy(&is->is_unique_child);
+
+ kmem_free(is,
+ offsetof(indirect_split_t, is_child[is->is_children]));
+ }
+ kmem_free(iv, sizeof (*iv));
+}
+
+static const zio_vsd_ops_t vdev_indirect_vsd_ops = {
+ .vsd_free = vdev_indirect_map_free,
+ .vsd_cksum_report = zio_vsd_default_cksum_report
+};
+
+/*
+ * Mark the given offset and size as being obsolete.
+ */
+void
+vdev_indirect_mark_obsolete(vdev_t *vd, uint64_t offset, uint64_t size)
+{
+ spa_t *spa = vd->vdev_spa;
+
+ ASSERT3U(vd->vdev_indirect_config.vic_mapping_object, !=, 0);
+ ASSERT(vd->vdev_removing || vd->vdev_ops == &vdev_indirect_ops);
+ ASSERT(size > 0);
+ VERIFY(vdev_indirect_mapping_entry_for_offset(
+ vd->vdev_indirect_mapping, offset) != NULL);
+
+ if (spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)) {
+ mutex_enter(&vd->vdev_obsolete_lock);
+ range_tree_add(vd->vdev_obsolete_segments, offset, size);
+ mutex_exit(&vd->vdev_obsolete_lock);
+ vdev_dirty(vd, 0, NULL, spa_syncing_txg(spa));
+ }
+}
+
+/*
+ * Mark the DVA vdev_id:offset:size as being obsolete in the given tx. This
+ * wrapper is provided because the DMU does not know about vdev_t's and
+ * cannot directly call vdev_indirect_mark_obsolete.
+ */
+void
+spa_vdev_indirect_mark_obsolete(spa_t *spa, uint64_t vdev_id, uint64_t offset,
+ uint64_t size, dmu_tx_t *tx)
+{
+ vdev_t *vd = vdev_lookup_top(spa, vdev_id);
+ ASSERT(dmu_tx_is_syncing(tx));
+
+ /* The DMU can only remap indirect vdevs. */
+ ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
+ vdev_indirect_mark_obsolete(vd, offset, size);
+}
+
+static spa_condensing_indirect_t *
+spa_condensing_indirect_create(spa_t *spa)
+{
+ spa_condensing_indirect_phys_t *scip =
+ &spa->spa_condensing_indirect_phys;
+ spa_condensing_indirect_t *sci = kmem_zalloc(sizeof (*sci), KM_SLEEP);
+ objset_t *mos = spa->spa_meta_objset;
+
+ for (int i = 0; i < TXG_SIZE; i++) {
+ list_create(&sci->sci_new_mapping_entries[i],
+ sizeof (vdev_indirect_mapping_entry_t),
+ offsetof(vdev_indirect_mapping_entry_t, vime_node));
+ }
+
+ sci->sci_new_mapping =
+ vdev_indirect_mapping_open(mos, scip->scip_next_mapping_object);
+
+ return (sci);
+}
+
+static void
+spa_condensing_indirect_destroy(spa_condensing_indirect_t *sci)
+{
+ for (int i = 0; i < TXG_SIZE; i++)
+ list_destroy(&sci->sci_new_mapping_entries[i]);
+
+ if (sci->sci_new_mapping != NULL)
+ vdev_indirect_mapping_close(sci->sci_new_mapping);
+
+ kmem_free(sci, sizeof (*sci));
+}
+
+boolean_t
+vdev_indirect_should_condense(vdev_t *vd)
+{
+ vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
+ spa_t *spa = vd->vdev_spa;
+
+ ASSERT(dsl_pool_sync_context(spa->spa_dsl_pool));
+
+ if (!zfs_condense_indirect_vdevs_enable)
+ return (B_FALSE);
+
+ /*
+ * We can only condense one indirect vdev at a time.
+ */
+ if (spa->spa_condensing_indirect != NULL)
+ return (B_FALSE);
+
+ if (spa_shutting_down(spa))
+ return (B_FALSE);
+
+ /*
+ * The mapping object size must not change while we are
+ * condensing, so we can only condense indirect vdevs
+ * (not vdevs that are still in the middle of being removed).
+ */
+ if (vd->vdev_ops != &vdev_indirect_ops)
+ return (B_FALSE);
+
+ /*
+ * If nothing new has been marked obsolete, there is no
+ * point in condensing.
+ */
+ uint64_t obsolete_sm_obj __maybe_unused;
+ ASSERT0(vdev_obsolete_sm_object(vd, &obsolete_sm_obj));
+ if (vd->vdev_obsolete_sm == NULL) {
+ ASSERT0(obsolete_sm_obj);
+ return (B_FALSE);
+ }
+
+ ASSERT(vd->vdev_obsolete_sm != NULL);
+
+ ASSERT3U(obsolete_sm_obj, ==, space_map_object(vd->vdev_obsolete_sm));
+
+ uint64_t bytes_mapped = vdev_indirect_mapping_bytes_mapped(vim);
+ uint64_t bytes_obsolete = space_map_allocated(vd->vdev_obsolete_sm);
+ uint64_t mapping_size = vdev_indirect_mapping_size(vim);
+ uint64_t obsolete_sm_size = space_map_length(vd->vdev_obsolete_sm);
+
+ ASSERT3U(bytes_obsolete, <=, bytes_mapped);
+
+ /*
+ * If a high percentage of the bytes that are mapped have become
+ * obsolete, condense (unless the mapping is already small enough).
+ * This has a good chance of reducing the amount of memory used
+ * by the mapping.
+ */
+ if (bytes_obsolete * 100 / bytes_mapped >=
+ zfs_indirect_condense_obsolete_pct &&
+ mapping_size > zfs_condense_min_mapping_bytes) {
+ zfs_dbgmsg("should condense vdev %llu because obsolete "
+ "spacemap covers %d%% of %lluMB mapping",
+ (u_longlong_t)vd->vdev_id,
+ (int)(bytes_obsolete * 100 / bytes_mapped),
+ (u_longlong_t)bytes_mapped / 1024 / 1024);
+ return (B_TRUE);
+ }
+
+ /*
+ * If the obsolete space map takes up too much space on disk,
+ * condense in order to free up this disk space.
+ */
+ if (obsolete_sm_size >= zfs_condense_max_obsolete_bytes) {
+ zfs_dbgmsg("should condense vdev %llu because obsolete sm "
+ "length %lluMB >= max size %lluMB",
+ (u_longlong_t)vd->vdev_id,
+ (u_longlong_t)obsolete_sm_size / 1024 / 1024,
+ (u_longlong_t)zfs_condense_max_obsolete_bytes /
+ 1024 / 1024);
+ return (B_TRUE);
+ }
+
+ return (B_FALSE);
+}
+
+/*
+ * This sync task completes (finishes) a condense, deleting the old
+ * mapping and replacing it with the new one.
+ */
+static void
+spa_condense_indirect_complete_sync(void *arg, dmu_tx_t *tx)
+{
+ spa_condensing_indirect_t *sci = arg;
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+ spa_condensing_indirect_phys_t *scip =
+ &spa->spa_condensing_indirect_phys;
+ vdev_t *vd = vdev_lookup_top(spa, scip->scip_vdev);
+ vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
+ objset_t *mos = spa->spa_meta_objset;
+ vdev_indirect_mapping_t *old_mapping = vd->vdev_indirect_mapping;
+ uint64_t old_count = vdev_indirect_mapping_num_entries(old_mapping);
+ uint64_t new_count =
+ vdev_indirect_mapping_num_entries(sci->sci_new_mapping);
+
+ ASSERT(dmu_tx_is_syncing(tx));
+ ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
+ ASSERT3P(sci, ==, spa->spa_condensing_indirect);
+ for (int i = 0; i < TXG_SIZE; i++) {
+ ASSERT(list_is_empty(&sci->sci_new_mapping_entries[i]));
+ }
+ ASSERT(vic->vic_mapping_object != 0);
+ ASSERT3U(vd->vdev_id, ==, scip->scip_vdev);
+ ASSERT(scip->scip_next_mapping_object != 0);
+ ASSERT(scip->scip_prev_obsolete_sm_object != 0);
+
+ /*
+ * Reset vdev_indirect_mapping to refer to the new object.
+ */
+ rw_enter(&vd->vdev_indirect_rwlock, RW_WRITER);
+ vdev_indirect_mapping_close(vd->vdev_indirect_mapping);
+ vd->vdev_indirect_mapping = sci->sci_new_mapping;
+ rw_exit(&vd->vdev_indirect_rwlock);
+
+ sci->sci_new_mapping = NULL;
+ vdev_indirect_mapping_free(mos, vic->vic_mapping_object, tx);
+ vic->vic_mapping_object = scip->scip_next_mapping_object;
+ scip->scip_next_mapping_object = 0;
+
+ space_map_free_obj(mos, scip->scip_prev_obsolete_sm_object, tx);
+ spa_feature_decr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
+ scip->scip_prev_obsolete_sm_object = 0;
+
+ scip->scip_vdev = 0;
+
+ VERIFY0(zap_remove(mos, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_CONDENSING_INDIRECT, tx));
+ spa_condensing_indirect_destroy(spa->spa_condensing_indirect);
+ spa->spa_condensing_indirect = NULL;
+
+ zfs_dbgmsg("finished condense of vdev %llu in txg %llu: "
+ "new mapping object %llu has %llu entries "
+ "(was %llu entries)",
+ vd->vdev_id, dmu_tx_get_txg(tx), vic->vic_mapping_object,
+ new_count, old_count);
+
+ vdev_config_dirty(spa->spa_root_vdev);
+}
+
+/*
+ * This sync task appends entries to the new mapping object.
+ */
+static void
+spa_condense_indirect_commit_sync(void *arg, dmu_tx_t *tx)
+{
+ spa_condensing_indirect_t *sci = arg;
+ uint64_t txg = dmu_tx_get_txg(tx);
+ spa_t *spa __maybe_unused = dmu_tx_pool(tx)->dp_spa;
+
+ ASSERT(dmu_tx_is_syncing(tx));
+ ASSERT3P(sci, ==, spa->spa_condensing_indirect);
+
+ vdev_indirect_mapping_add_entries(sci->sci_new_mapping,
+ &sci->sci_new_mapping_entries[txg & TXG_MASK], tx);
+ ASSERT(list_is_empty(&sci->sci_new_mapping_entries[txg & TXG_MASK]));
+}
+
+/*
+ * Open-context function to add one entry to the new mapping. The new
+ * entry will be remembered and written from syncing context.
+ */
+static void
+spa_condense_indirect_commit_entry(spa_t *spa,
+ vdev_indirect_mapping_entry_phys_t *vimep, uint32_t count)
+{
+ spa_condensing_indirect_t *sci = spa->spa_condensing_indirect;
+
+ ASSERT3U(count, <, DVA_GET_ASIZE(&vimep->vimep_dst));
+
+ dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
+ dmu_tx_hold_space(tx, sizeof (*vimep) + sizeof (count));
+ VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
+ int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
+
+ /*
+ * If we are the first entry committed this txg, kick off the sync
+ * task to write to the MOS on our behalf.
+ */
+ if (list_is_empty(&sci->sci_new_mapping_entries[txgoff])) {
+ dsl_sync_task_nowait(dmu_tx_pool(tx),
+ spa_condense_indirect_commit_sync, sci, tx);
+ }
+
+ vdev_indirect_mapping_entry_t *vime =
+ kmem_alloc(sizeof (*vime), KM_SLEEP);
+ vime->vime_mapping = *vimep;
+ vime->vime_obsolete_count = count;
+ list_insert_tail(&sci->sci_new_mapping_entries[txgoff], vime);
+
+ dmu_tx_commit(tx);
+}
+
+static void
+spa_condense_indirect_generate_new_mapping(vdev_t *vd,
+ uint32_t *obsolete_counts, uint64_t start_index, zthr_t *zthr)
+{
+ spa_t *spa = vd->vdev_spa;
+ uint64_t mapi = start_index;
+ vdev_indirect_mapping_t *old_mapping = vd->vdev_indirect_mapping;
+ uint64_t old_num_entries =
+ vdev_indirect_mapping_num_entries(old_mapping);
+
+ ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
+ ASSERT3U(vd->vdev_id, ==, spa->spa_condensing_indirect_phys.scip_vdev);
+
+ zfs_dbgmsg("starting condense of vdev %llu from index %llu",
+ (u_longlong_t)vd->vdev_id,
+ (u_longlong_t)mapi);
+
+ while (mapi < old_num_entries) {
+
+ if (zthr_iscancelled(zthr)) {
+ zfs_dbgmsg("pausing condense of vdev %llu "
+ "at index %llu", (u_longlong_t)vd->vdev_id,
+ (u_longlong_t)mapi);
+ break;
+ }
+
+ vdev_indirect_mapping_entry_phys_t *entry =
+ &old_mapping->vim_entries[mapi];
+ uint64_t entry_size = DVA_GET_ASIZE(&entry->vimep_dst);
+ ASSERT3U(obsolete_counts[mapi], <=, entry_size);
+ if (obsolete_counts[mapi] < entry_size) {
+ spa_condense_indirect_commit_entry(spa, entry,
+ obsolete_counts[mapi]);
+
+ /*
+ * This delay may be requested for testing, debugging,
+ * or performance reasons.
+ */
+ hrtime_t now = gethrtime();
+ hrtime_t sleep_until = now + MSEC2NSEC(
+ zfs_condense_indirect_commit_entry_delay_ms);
+ zfs_sleep_until(sleep_until);
+ }
+
+ mapi++;
+ }
+}
+
+/* ARGSUSED */
+static boolean_t
+spa_condense_indirect_thread_check(void *arg, zthr_t *zthr)
+{
+ spa_t *spa = arg;
+
+ return (spa->spa_condensing_indirect != NULL);
+}
+
+/* ARGSUSED */
+static void
+spa_condense_indirect_thread(void *arg, zthr_t *zthr)
+{
+ spa_t *spa = arg;
+ vdev_t *vd;
+
+ ASSERT3P(spa->spa_condensing_indirect, !=, NULL);
+ spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
+ vd = vdev_lookup_top(spa, spa->spa_condensing_indirect_phys.scip_vdev);
+ ASSERT3P(vd, !=, NULL);
+ spa_config_exit(spa, SCL_VDEV, FTAG);
+
+ spa_condensing_indirect_t *sci = spa->spa_condensing_indirect;
+ spa_condensing_indirect_phys_t *scip =
+ &spa->spa_condensing_indirect_phys;
+ uint32_t *counts;
+ uint64_t start_index;
+ vdev_indirect_mapping_t *old_mapping = vd->vdev_indirect_mapping;
+ space_map_t *prev_obsolete_sm = NULL;
+
+ ASSERT3U(vd->vdev_id, ==, scip->scip_vdev);
+ ASSERT(scip->scip_next_mapping_object != 0);
+ ASSERT(scip->scip_prev_obsolete_sm_object != 0);
+ ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
+
+ for (int i = 0; i < TXG_SIZE; i++) {
+ /*
+ * The list must start out empty in order for the
+ * _commit_sync() sync task to be properly registered
+ * on the first call to _commit_entry(); so it's wise
+ * to double check and ensure we actually are starting
+ * with empty lists.
+ */
+ ASSERT(list_is_empty(&sci->sci_new_mapping_entries[i]));
+ }
+
+ VERIFY0(space_map_open(&prev_obsolete_sm, spa->spa_meta_objset,
+ scip->scip_prev_obsolete_sm_object, 0, vd->vdev_asize, 0));
+ counts = vdev_indirect_mapping_load_obsolete_counts(old_mapping);
+ if (prev_obsolete_sm != NULL) {
+ vdev_indirect_mapping_load_obsolete_spacemap(old_mapping,
+ counts, prev_obsolete_sm);
+ }
+ space_map_close(prev_obsolete_sm);
+
+ /*
+ * Generate new mapping. Determine what index to continue from
+ * based on the max offset that we've already written in the
+ * new mapping.
+ */
+ uint64_t max_offset =
+ vdev_indirect_mapping_max_offset(sci->sci_new_mapping);
+ if (max_offset == 0) {
+ /* We haven't written anything to the new mapping yet. */
+ start_index = 0;
+ } else {
+ /*
+ * Pick up from where we left off. _entry_for_offset_or_next()
+ * returns a pointer into the vim_entries array. If
+ * max_offset is greater than any of the mappings
+ * contained in the table, NULL will be returned,
+ * indicating we've exhausted our iteration of the
+ * old_mapping.
+ */
+
+ vdev_indirect_mapping_entry_phys_t *entry =
+ vdev_indirect_mapping_entry_for_offset_or_next(old_mapping,
+ max_offset);
+
+ if (entry == NULL) {
+ /*
+ * We've already written the whole new mapping.
+ * This special value will cause us to skip the
+ * generate_new_mapping step and just do the sync
+ * task to complete the condense.
+ */
+ start_index = UINT64_MAX;
+ } else {
+ start_index = entry - old_mapping->vim_entries;
+ ASSERT3U(start_index, <,
+ vdev_indirect_mapping_num_entries(old_mapping));
+ }
+ }
+
+ spa_condense_indirect_generate_new_mapping(vd, counts,
+ start_index, zthr);
+
+ vdev_indirect_mapping_free_obsolete_counts(old_mapping, counts);
+
+ /*
+ * If the zthr has received a cancellation signal while running
+ * in generate_new_mapping() or at any point after that, then bail
+ * early. We don't want to complete the condense if the spa is
+ * shutting down.
+ */
+ if (zthr_iscancelled(zthr))
+ return;
+
+ VERIFY0(dsl_sync_task(spa_name(spa), NULL,
+ spa_condense_indirect_complete_sync, sci, 0,
+ ZFS_SPACE_CHECK_EXTRA_RESERVED));
+}
+
+/*
+ * Sync task to begin the condensing process.
+ */
+void
+spa_condense_indirect_start_sync(vdev_t *vd, dmu_tx_t *tx)
+{
+ spa_t *spa = vd->vdev_spa;
+ spa_condensing_indirect_phys_t *scip =
+ &spa->spa_condensing_indirect_phys;
+
+ ASSERT0(scip->scip_next_mapping_object);
+ ASSERT0(scip->scip_prev_obsolete_sm_object);
+ ASSERT0(scip->scip_vdev);
+ ASSERT(dmu_tx_is_syncing(tx));
+ ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
+ ASSERT(spa_feature_is_active(spa, SPA_FEATURE_OBSOLETE_COUNTS));
+ ASSERT(vdev_indirect_mapping_num_entries(vd->vdev_indirect_mapping));
+
+ uint64_t obsolete_sm_obj;
+ VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_obj));
+ ASSERT3U(obsolete_sm_obj, !=, 0);
+
+ scip->scip_vdev = vd->vdev_id;
+ scip->scip_next_mapping_object =
+ vdev_indirect_mapping_alloc(spa->spa_meta_objset, tx);
+
+ scip->scip_prev_obsolete_sm_object = obsolete_sm_obj;
+
+ /*
+ * We don't need to allocate a new space map object, since
+ * vdev_indirect_sync_obsolete will allocate one when needed.
+ */
+ space_map_close(vd->vdev_obsolete_sm);
+ vd->vdev_obsolete_sm = NULL;
+ VERIFY0(zap_remove(spa->spa_meta_objset, vd->vdev_top_zap,
+ VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM, tx));
+
+ VERIFY0(zap_add(spa->spa_dsl_pool->dp_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_CONDENSING_INDIRECT, sizeof (uint64_t),
+ sizeof (*scip) / sizeof (uint64_t), scip, tx));
+
+ ASSERT3P(spa->spa_condensing_indirect, ==, NULL);
+ spa->spa_condensing_indirect = spa_condensing_indirect_create(spa);
+
+ zfs_dbgmsg("starting condense of vdev %llu in txg %llu: "
+ "posm=%llu nm=%llu",
+ vd->vdev_id, dmu_tx_get_txg(tx),
+ (u_longlong_t)scip->scip_prev_obsolete_sm_object,
+ (u_longlong_t)scip->scip_next_mapping_object);
+
+ zthr_wakeup(spa->spa_condense_zthr);
+}
+
+/*
+ * Sync to the given vdev's obsolete space map any segments that are no longer
+ * referenced as of the given txg.
+ *
+ * If the obsolete space map doesn't exist yet, create and open it.
+ */
+void
+vdev_indirect_sync_obsolete(vdev_t *vd, dmu_tx_t *tx)
+{
+ spa_t *spa = vd->vdev_spa;
+ vdev_indirect_config_t *vic __maybe_unused = &vd->vdev_indirect_config;
+
+ ASSERT3U(vic->vic_mapping_object, !=, 0);
+ ASSERT(range_tree_space(vd->vdev_obsolete_segments) > 0);
+ ASSERT(vd->vdev_removing || vd->vdev_ops == &vdev_indirect_ops);
+ ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS));
+
+ uint64_t obsolete_sm_object;
+ VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_object));
+ if (obsolete_sm_object == 0) {
+ obsolete_sm_object = space_map_alloc(spa->spa_meta_objset,
+ zfs_vdev_standard_sm_blksz, tx);
+
+ ASSERT(vd->vdev_top_zap != 0);
+ VERIFY0(zap_add(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap,
+ VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM,
+ sizeof (obsolete_sm_object), 1, &obsolete_sm_object, tx));
+ ASSERT0(vdev_obsolete_sm_object(vd, &obsolete_sm_object));
+ ASSERT3U(obsolete_sm_object, !=, 0);
+
+ spa_feature_incr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
+ VERIFY0(space_map_open(&vd->vdev_obsolete_sm,
+ spa->spa_meta_objset, obsolete_sm_object,
+ 0, vd->vdev_asize, 0));
+ }
+
+ ASSERT(vd->vdev_obsolete_sm != NULL);
+ ASSERT3U(obsolete_sm_object, ==,
+ space_map_object(vd->vdev_obsolete_sm));
+
+ space_map_write(vd->vdev_obsolete_sm,
+ vd->vdev_obsolete_segments, SM_ALLOC, SM_NO_VDEVID, tx);
+ range_tree_vacate(vd->vdev_obsolete_segments, NULL, NULL);
+}
+
+int
+spa_condense_init(spa_t *spa)
+{
+ int error = zap_lookup(spa->spa_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_CONDENSING_INDIRECT, sizeof (uint64_t),
+ sizeof (spa->spa_condensing_indirect_phys) / sizeof (uint64_t),
+ &spa->spa_condensing_indirect_phys);
+ if (error == 0) {
+ if (spa_writeable(spa)) {
+ spa->spa_condensing_indirect =
+ spa_condensing_indirect_create(spa);
+ }
+ return (0);
+ } else if (error == ENOENT) {
+ return (0);
+ } else {
+ return (error);
+ }
+}
+
+void
+spa_condense_fini(spa_t *spa)
+{
+ if (spa->spa_condensing_indirect != NULL) {
+ spa_condensing_indirect_destroy(spa->spa_condensing_indirect);
+ spa->spa_condensing_indirect = NULL;
+ }
+}
+
+void
+spa_start_indirect_condensing_thread(spa_t *spa)
+{
+ ASSERT3P(spa->spa_condense_zthr, ==, NULL);
+ spa->spa_condense_zthr = zthr_create("z_indirect_condense",
+ spa_condense_indirect_thread_check,
+ spa_condense_indirect_thread, spa);
+}
+
+/*
+ * Gets the obsolete spacemap object from the vdev's ZAP. On success sm_obj
+ * will contain either the obsolete spacemap object or zero if none exists.
+ * All other errors are returned to the caller.
+ */
+int
+vdev_obsolete_sm_object(vdev_t *vd, uint64_t *sm_obj)
+{
+ ASSERT0(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER));
+
+ if (vd->vdev_top_zap == 0) {
+ *sm_obj = 0;
+ return (0);
+ }
+
+ int error = zap_lookup(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap,
+ VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM, sizeof (uint64_t), 1, sm_obj);
+ if (error == ENOENT) {
+ *sm_obj = 0;
+ error = 0;
+ }
+
+ return (error);
+}
+
+/*
+ * Reads the 'obsolete counts are precise' entry from the vdev's ZAP.
+ * On success are_precise will be set to reflect if the counts are precise.
+ * All other errors are returned to the caller.
+ */
+int
+vdev_obsolete_counts_are_precise(vdev_t *vd, boolean_t *are_precise)
+{
+ ASSERT0(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER));
+
+ if (vd->vdev_top_zap == 0) {
+ *are_precise = B_FALSE;
+ return (0);
+ }
+
+ uint64_t val = 0;
+ int error = zap_lookup(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap,
+ VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE, sizeof (val), 1, &val);
+ if (error == 0) {
+ *are_precise = (val != 0);
+ } else if (error == ENOENT) {
+ *are_precise = B_FALSE;
+ error = 0;
+ }
+
+ return (error);
+}
+
+/* ARGSUSED */
+static void
+vdev_indirect_close(vdev_t *vd)
+{
+}
+
+/* ARGSUSED */
+static int
+vdev_indirect_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
+ uint64_t *logical_ashift, uint64_t *physical_ashift)
+{
+ *psize = *max_psize = vd->vdev_asize +
+ VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE;
+ *logical_ashift = vd->vdev_ashift;
+ *physical_ashift = vd->vdev_physical_ashift;
+ return (0);
+}
+
+typedef struct remap_segment {
+ vdev_t *rs_vd;
+ uint64_t rs_offset;
+ uint64_t rs_asize;
+ uint64_t rs_split_offset;
+ list_node_t rs_node;
+} remap_segment_t;
+
+static remap_segment_t *
+rs_alloc(vdev_t *vd, uint64_t offset, uint64_t asize, uint64_t split_offset)
+{
+ remap_segment_t *rs = kmem_alloc(sizeof (remap_segment_t), KM_SLEEP);
+ rs->rs_vd = vd;
+ rs->rs_offset = offset;
+ rs->rs_asize = asize;
+ rs->rs_split_offset = split_offset;
+ return (rs);
+}
+
+/*
+ * Given an indirect vdev and an extent on that vdev, it duplicates the
+ * physical entries of the indirect mapping that correspond to the extent
+ * to a new array and returns a pointer to it. In addition, copied_entries
+ * is populated with the number of mapping entries that were duplicated.
+ *
+ * Note that the function assumes that the caller holds vdev_indirect_rwlock.
+ * This ensures that the mapping won't change due to condensing as we
+ * copy over its contents.
+ *
+ * Finally, since we are doing an allocation, it is up to the caller to
+ * free the array allocated in this function.
+ */
+static vdev_indirect_mapping_entry_phys_t *
+vdev_indirect_mapping_duplicate_adjacent_entries(vdev_t *vd, uint64_t offset,
+ uint64_t asize, uint64_t *copied_entries)
+{
+ vdev_indirect_mapping_entry_phys_t *duplicate_mappings = NULL;
+ vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
+ uint64_t entries = 0;
+
+ ASSERT(RW_READ_HELD(&vd->vdev_indirect_rwlock));
+
+ vdev_indirect_mapping_entry_phys_t *first_mapping =
+ vdev_indirect_mapping_entry_for_offset(vim, offset);
+ ASSERT3P(first_mapping, !=, NULL);
+
+ vdev_indirect_mapping_entry_phys_t *m = first_mapping;
+ while (asize > 0) {
+ uint64_t size = DVA_GET_ASIZE(&m->vimep_dst);
+
+ ASSERT3U(offset, >=, DVA_MAPPING_GET_SRC_OFFSET(m));
+ ASSERT3U(offset, <, DVA_MAPPING_GET_SRC_OFFSET(m) + size);
+
+ uint64_t inner_offset = offset - DVA_MAPPING_GET_SRC_OFFSET(m);
+ uint64_t inner_size = MIN(asize, size - inner_offset);
+
+ offset += inner_size;
+ asize -= inner_size;
+ entries++;
+ m++;
+ }
+
+ size_t copy_length = entries * sizeof (*first_mapping);
+ duplicate_mappings = kmem_alloc(copy_length, KM_SLEEP);
+ bcopy(first_mapping, duplicate_mappings, copy_length);
+ *copied_entries = entries;
+
+ return (duplicate_mappings);
+}
+
+/*
+ * Goes through the relevant indirect mappings until it hits a concrete vdev
+ * and issues the callback. On the way to the concrete vdev, if any other
+ * indirect vdevs are encountered, then the callback will also be called on
+ * each of those indirect vdevs. For example, if the segment is mapped to
+ * segment A on indirect vdev 1, and then segment A on indirect vdev 1 is
+ * mapped to segment B on concrete vdev 2, then the callback will be called on
+ * both vdev 1 and vdev 2.
+ *
+ * While the callback passed to vdev_indirect_remap() is called on every vdev
+ * the function encounters, certain callbacks only care about concrete vdevs.
+ * These types of callbacks should return immediately and explicitly when they
+ * are called on an indirect vdev.
+ *
+ * Because there is a possibility that a DVA section in the indirect device
+ * has been split into multiple sections in our mapping, we keep track
+ * of the relevant contiguous segments of the new location (remap_segment_t)
+ * in a stack. This way we can call the callback for each of the new sections
+ * created by a single section of the indirect device. Note though, that in
+ * this scenario the callbacks in each split block won't occur in-order in
+ * terms of offset, so callers should not make any assumptions about that.
+ *
+ * For callbacks that don't handle split blocks and immediately return when
+ * they encounter them (as is the case for remap_blkptr_cb), the caller can
+ * assume that its callback will be applied from the first indirect vdev
+ * encountered to the last one and then the concrete vdev, in that order.
+ */
+static void
+vdev_indirect_remap(vdev_t *vd, uint64_t offset, uint64_t asize,
+ void (*func)(uint64_t, vdev_t *, uint64_t, uint64_t, void *), void *arg)
+{
+ list_t stack;
+ spa_t *spa = vd->vdev_spa;
+
+ list_create(&stack, sizeof (remap_segment_t),
+ offsetof(remap_segment_t, rs_node));
+
+ for (remap_segment_t *rs = rs_alloc(vd, offset, asize, 0);
+ rs != NULL; rs = list_remove_head(&stack)) {
+ vdev_t *v = rs->rs_vd;
+ uint64_t num_entries = 0;
+
+ ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
+ ASSERT(rs->rs_asize > 0);
+
+ /*
+ * Note: As this function can be called from open context
+ * (e.g. zio_read()), we need the following rwlock to
+ * prevent the mapping from being changed by condensing.
+ *
+ * So we grab the lock and we make a copy of the entries
+ * that are relevant to the extent that we are working on.
+ * Once that is done, we drop the lock and iterate over
+ * our copy of the mapping. Once we are done with
+ * the remap segment and we free it, we also free our copy
+ * of the indirect mapping entries that are relevant to it.
+ *
+ * This way we don't need to wait until the function is
+ * finished with a segment, to condense it. In addition, we
+ * don't need a recursive rwlock for the case that a call to
+ * vdev_indirect_remap() needs to call itself (through the
+ * codepath of its callback) for the same vdev in the middle
+ * of its execution.
+ */
+ rw_enter(&v->vdev_indirect_rwlock, RW_READER);
+ ASSERT3P(v->vdev_indirect_mapping, !=, NULL);
+
+ vdev_indirect_mapping_entry_phys_t *mapping =
+ vdev_indirect_mapping_duplicate_adjacent_entries(v,
+ rs->rs_offset, rs->rs_asize, &num_entries);
+ ASSERT3P(mapping, !=, NULL);
+ ASSERT3U(num_entries, >, 0);
+ rw_exit(&v->vdev_indirect_rwlock);
+
+ for (uint64_t i = 0; i < num_entries; i++) {
+ /*
+ * Note: the vdev_indirect_mapping can not change
+ * while we are running. It only changes while the
+ * removal is in progress, and then only from syncing
+ * context. While a removal is in progress, this
+ * function is only called for frees, which also only
+ * happen from syncing context.
+ */
+ vdev_indirect_mapping_entry_phys_t *m = &mapping[i];
+
+ ASSERT3P(m, !=, NULL);
+ ASSERT3U(rs->rs_asize, >, 0);
+
+ uint64_t size = DVA_GET_ASIZE(&m->vimep_dst);
+ uint64_t dst_offset = DVA_GET_OFFSET(&m->vimep_dst);
+ uint64_t dst_vdev = DVA_GET_VDEV(&m->vimep_dst);
+
+ ASSERT3U(rs->rs_offset, >=,
+ DVA_MAPPING_GET_SRC_OFFSET(m));
+ ASSERT3U(rs->rs_offset, <,
+ DVA_MAPPING_GET_SRC_OFFSET(m) + size);
+ ASSERT3U(dst_vdev, !=, v->vdev_id);
+
+ uint64_t inner_offset = rs->rs_offset -
+ DVA_MAPPING_GET_SRC_OFFSET(m);
+ uint64_t inner_size =
+ MIN(rs->rs_asize, size - inner_offset);
+
+ vdev_t *dst_v = vdev_lookup_top(spa, dst_vdev);
+ ASSERT3P(dst_v, !=, NULL);
+
+ if (dst_v->vdev_ops == &vdev_indirect_ops) {
+ list_insert_head(&stack,
+ rs_alloc(dst_v, dst_offset + inner_offset,
+ inner_size, rs->rs_split_offset));
+
+ }
+
+ if ((zfs_flags & ZFS_DEBUG_INDIRECT_REMAP) &&
+ IS_P2ALIGNED(inner_size, 2 * SPA_MINBLOCKSIZE)) {
+ /*
+ * Note: This clause exists solely for
+ * testing purposes. We use it to ensure that
+ * split blocks work and that the callbacks
+ * using them yield the same result if issued
+ * in reverse order.
+ */
+ uint64_t inner_half = inner_size / 2;
+
+ func(rs->rs_split_offset + inner_half, dst_v,
+ dst_offset + inner_offset + inner_half,
+ inner_half, arg);
+
+ func(rs->rs_split_offset, dst_v,
+ dst_offset + inner_offset,
+ inner_half, arg);
+ } else {
+ func(rs->rs_split_offset, dst_v,
+ dst_offset + inner_offset,
+ inner_size, arg);
+ }
+
+ rs->rs_offset += inner_size;
+ rs->rs_asize -= inner_size;
+ rs->rs_split_offset += inner_size;
+ }
+ VERIFY0(rs->rs_asize);
+
+ kmem_free(mapping, num_entries * sizeof (*mapping));
+ kmem_free(rs, sizeof (remap_segment_t));
+ }
+ list_destroy(&stack);
+}
+
+static void
+vdev_indirect_child_io_done(zio_t *zio)
+{
+ zio_t *pio = zio->io_private;
+
+ mutex_enter(&pio->io_lock);
+ pio->io_error = zio_worst_error(pio->io_error, zio->io_error);
+ mutex_exit(&pio->io_lock);
+
+ abd_free(zio->io_abd);
+}
+
+/*
+ * This is a callback for vdev_indirect_remap() which allocates an
+ * indirect_split_t for each split segment and adds it to iv_splits.
+ */
+static void
+vdev_indirect_gather_splits(uint64_t split_offset, vdev_t *vd, uint64_t offset,
+ uint64_t size, void *arg)
+{
+ zio_t *zio = arg;
+ indirect_vsd_t *iv = zio->io_vsd;
+
+ ASSERT3P(vd, !=, NULL);
+
+ if (vd->vdev_ops == &vdev_indirect_ops)
+ return;
+
+ int n = 1;
+ if (vd->vdev_ops == &vdev_mirror_ops)
+ n = vd->vdev_children;
+
+ indirect_split_t *is =
+ kmem_zalloc(offsetof(indirect_split_t, is_child[n]), KM_SLEEP);
+
+ is->is_children = n;
+ is->is_size = size;
+ is->is_split_offset = split_offset;
+ is->is_target_offset = offset;
+ is->is_vdev = vd;
+ list_create(&is->is_unique_child, sizeof (indirect_child_t),
+ offsetof(indirect_child_t, ic_node));
+
+ /*
+ * Note that we only consider multiple copies of the data for
+ * *mirror* vdevs. We don't for "replacing" or "spare" vdevs, even
+ * though they use the same ops as mirror, because there's only one
+ * "good" copy under the replacing/spare.
+ */
+ if (vd->vdev_ops == &vdev_mirror_ops) {
+ for (int i = 0; i < n; i++) {
+ is->is_child[i].ic_vdev = vd->vdev_child[i];
+ list_link_init(&is->is_child[i].ic_node);
+ }
+ } else {
+ is->is_child[0].ic_vdev = vd;
+ }
+
+ list_insert_tail(&iv->iv_splits, is);
+}
+
+static void
+vdev_indirect_read_split_done(zio_t *zio)
+{
+ indirect_child_t *ic = zio->io_private;
+
+ if (zio->io_error != 0) {
+ /*
+ * Clear ic_data to indicate that we do not have data for this
+ * child.
+ */
+ abd_free(ic->ic_data);
+ ic->ic_data = NULL;
+ }
+}
+
+/*
+ * Issue reads for all copies (mirror children) of all splits.
+ */
+static void
+vdev_indirect_read_all(zio_t *zio)
+{
+ indirect_vsd_t *iv = zio->io_vsd;
+
+ ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
+
+ for (indirect_split_t *is = list_head(&iv->iv_splits);
+ is != NULL; is = list_next(&iv->iv_splits, is)) {
+ for (int i = 0; i < is->is_children; i++) {
+ indirect_child_t *ic = &is->is_child[i];
+
+ if (!vdev_readable(ic->ic_vdev))
+ continue;
+
+ /*
+			 * If a child is missing the data, set ic_error; it is
+			 * checked in vdev_indirect_repair(). We still perform
+			 * the read, which provides the opportunity to
+			 * reconstruct the split block if at all possible.
+ */
+ if (vdev_dtl_contains(ic->ic_vdev, DTL_MISSING,
+ zio->io_txg, 1))
+ ic->ic_error = SET_ERROR(ESTALE);
+
+ ic->ic_data = abd_alloc_sametype(zio->io_abd,
+ is->is_size);
+ ic->ic_duplicate = NULL;
+
+ zio_nowait(zio_vdev_child_io(zio, NULL,
+ ic->ic_vdev, is->is_target_offset, ic->ic_data,
+ is->is_size, zio->io_type, zio->io_priority, 0,
+ vdev_indirect_read_split_done, ic));
+ }
+ }
+ iv->iv_reconstruct = B_TRUE;
+}
+
+static void
+vdev_indirect_io_start(zio_t *zio)
+{
+ spa_t *spa __maybe_unused = zio->io_spa;
+ indirect_vsd_t *iv = kmem_zalloc(sizeof (*iv), KM_SLEEP);
+ list_create(&iv->iv_splits,
+ sizeof (indirect_split_t), offsetof(indirect_split_t, is_node));
+
+ zio->io_vsd = iv;
+ zio->io_vsd_ops = &vdev_indirect_vsd_ops;
+
+ ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
+ if (zio->io_type != ZIO_TYPE_READ) {
+ ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
+ /*
+ * Note: this code can handle other kinds of writes,
+ * but we don't expect them.
+ */
+ ASSERT((zio->io_flags & (ZIO_FLAG_SELF_HEAL |
+ ZIO_FLAG_RESILVER | ZIO_FLAG_INDUCE_DAMAGE)) != 0);
+ }
+
+ vdev_indirect_remap(zio->io_vd, zio->io_offset, zio->io_size,
+ vdev_indirect_gather_splits, zio);
+
+ indirect_split_t *first = list_head(&iv->iv_splits);
+ if (first->is_size == zio->io_size) {
+ /*
+ * This is not a split block; we are pointing to the entire
+ * data, which will checksum the same as the original data.
+ * Pass the BP down so that the child i/o can verify the
+ * checksum, and try a different location if available
+ * (e.g. on a mirror).
+ *
+ * While this special case could be handled the same as the
+ * general (split block) case, doing it this way ensures
+ * that the vast majority of blocks on indirect vdevs
+ * (which are not split) are handled identically to blocks
+ * on non-indirect vdevs. This allows us to be less strict
+ * about performance in the general (but rare) case.
+ */
+ ASSERT0(first->is_split_offset);
+ ASSERT3P(list_next(&iv->iv_splits, first), ==, NULL);
+ zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
+ first->is_vdev, first->is_target_offset,
+ abd_get_offset(zio->io_abd, 0),
+ zio->io_size, zio->io_type, zio->io_priority, 0,
+ vdev_indirect_child_io_done, zio));
+ } else {
+ iv->iv_split_block = B_TRUE;
+ if (zio->io_type == ZIO_TYPE_READ &&
+ zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER)) {
+ /*
+ * Read all copies. Note that for simplicity,
+ * we don't bother consulting the DTL in the
+ * resilver case.
+ */
+ vdev_indirect_read_all(zio);
+ } else {
+ /*
+ * If this is a read zio, we read one copy of each
+ * split segment, from the top-level vdev. Since
+ * we don't know the checksum of each split
+ * individually, the child zio can't ensure that
+ * we get the right data. E.g. if it's a mirror,
+ * it will just read from a random (healthy) leaf
+ * vdev. We have to verify the checksum in
+ * vdev_indirect_io_done().
+ *
+ * For write zios, the vdev code will ensure we write
+ * to all children.
+ */
+ for (indirect_split_t *is = list_head(&iv->iv_splits);
+ is != NULL; is = list_next(&iv->iv_splits, is)) {
+ zio_nowait(zio_vdev_child_io(zio, NULL,
+ is->is_vdev, is->is_target_offset,
+ abd_get_offset(zio->io_abd,
+ is->is_split_offset), is->is_size,
+ zio->io_type, zio->io_priority, 0,
+ vdev_indirect_child_io_done, zio));
+ }
+ }
+ }
+
+ zio_execute(zio);
+}
+
+/*
+ * Report a checksum error for a child.
+ */
+static void
+vdev_indirect_checksum_error(zio_t *zio,
+ indirect_split_t *is, indirect_child_t *ic)
+{
+ vdev_t *vd = ic->ic_vdev;
+
+ if (zio->io_flags & ZIO_FLAG_SPECULATIVE)
+ return;
+
+ mutex_enter(&vd->vdev_stat_lock);
+ vd->vdev_stat.vs_checksum_errors++;
+ mutex_exit(&vd->vdev_stat_lock);
+
+ zio_bad_cksum_t zbc = {{{ 0 }}};
+ abd_t *bad_abd = ic->ic_data;
+ abd_t *good_abd = is->is_good_child->ic_data;
+ (void) zfs_ereport_post_checksum(zio->io_spa, vd, NULL, zio,
+ is->is_target_offset, is->is_size, good_abd, bad_abd, &zbc);
+}
+
+/*
+ * Issue repair i/os for any incorrect copies. We do this by comparing
+ * each split segment's correct data (is_good_child's ic_data) with each
+ * other copy of the data. If they differ, then we overwrite the bad data
+ * with the good copy. The DTL is checked in vdev_indirect_read_all() and
+ * if a vdev is missing a copy of the data, we set ic_error and the read is
+ * still performed. This provides the opportunity to reconstruct the split
+ * block if at all possible. ic_error is checked here and, if set, it
+ * suppresses incrementing the checksum counter. Aside from this, DTLs are
+ * not checked,
+ * which simplifies this code and also issues the optimal number of writes
+ * (based on which copies actually read bad data, as opposed to which we
+ * think might be wrong). For the same reason, we always use
+ * ZIO_FLAG_SELF_HEAL, to bypass the DTL check in zio_vdev_io_start().
+ */
+static void
+vdev_indirect_repair(zio_t *zio)
+{
+ indirect_vsd_t *iv = zio->io_vsd;
+
+ if (!spa_writeable(zio->io_spa))
+ return;
+
+ for (indirect_split_t *is = list_head(&iv->iv_splits);
+ is != NULL; is = list_next(&iv->iv_splits, is)) {
+ for (int c = 0; c < is->is_children; c++) {
+ indirect_child_t *ic = &is->is_child[c];
+ if (ic == is->is_good_child)
+ continue;
+ if (ic->ic_data == NULL)
+ continue;
+ if (ic->ic_duplicate == is->is_good_child)
+ continue;
+
+ zio_nowait(zio_vdev_child_io(zio, NULL,
+ ic->ic_vdev, is->is_target_offset,
+ is->is_good_child->ic_data, is->is_size,
+ ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
+ ZIO_FLAG_IO_REPAIR | ZIO_FLAG_SELF_HEAL,
+ NULL, NULL));
+
+ /*
+ * If ic_error is set the current child does not have
+ * a copy of the data, so suppress incrementing the
+ * checksum counter.
+ */
+ if (ic->ic_error == ESTALE)
+ continue;
+
+ vdev_indirect_checksum_error(zio, is, ic);
+ }
+ }
+}
+
+/*
+ * Report checksum errors on all children that we read from.
+ */
+static void
+vdev_indirect_all_checksum_errors(zio_t *zio)
+{
+ indirect_vsd_t *iv = zio->io_vsd;
+
+ if (zio->io_flags & ZIO_FLAG_SPECULATIVE)
+ return;
+
+ for (indirect_split_t *is = list_head(&iv->iv_splits);
+ is != NULL; is = list_next(&iv->iv_splits, is)) {
+ for (int c = 0; c < is->is_children; c++) {
+ indirect_child_t *ic = &is->is_child[c];
+
+ if (ic->ic_data == NULL)
+ continue;
+
+ vdev_t *vd = ic->ic_vdev;
+
+ int ret = zfs_ereport_post_checksum(zio->io_spa, vd,
+ NULL, zio, is->is_target_offset, is->is_size,
+ NULL, NULL, NULL);
+ if (ret != EALREADY) {
+ mutex_enter(&vd->vdev_stat_lock);
+ vd->vdev_stat.vs_checksum_errors++;
+ mutex_exit(&vd->vdev_stat_lock);
+ }
+ }
+ }
+}
+
+/*
+ * Copy data from all the splits to the main zio, then validate the checksum.
+ * If the checksum is successfully validated, return success.
+ */
+static int
+vdev_indirect_splits_checksum_validate(indirect_vsd_t *iv, zio_t *zio)
+{
+ zio_bad_cksum_t zbc;
+
+ for (indirect_split_t *is = list_head(&iv->iv_splits);
+ is != NULL; is = list_next(&iv->iv_splits, is)) {
+
+ ASSERT3P(is->is_good_child->ic_data, !=, NULL);
+ ASSERT3P(is->is_good_child->ic_duplicate, ==, NULL);
+
+ abd_copy_off(zio->io_abd, is->is_good_child->ic_data,
+ is->is_split_offset, 0, is->is_size);
+ }
+
+ return (zio_checksum_error(zio, &zbc));
+}
+
+/*
+ * There are relatively few possible combinations, making it feasible to
+ * deterministically check them all. We do this by setting the good_child
+ * to the next unique split version. If we reach the end of the list then
+ * "carry over" to the next unique split version (like counting in base
+ * is_unique_children, but each digit can have a different base).
+ */
+static int
+vdev_indirect_splits_enumerate_all(indirect_vsd_t *iv, zio_t *zio)
+{
+ boolean_t more = B_TRUE;
+
+ iv->iv_attempts = 0;
+
+ for (indirect_split_t *is = list_head(&iv->iv_splits);
+ is != NULL; is = list_next(&iv->iv_splits, is))
+ is->is_good_child = list_head(&is->is_unique_child);
+
+ while (more == B_TRUE) {
+ iv->iv_attempts++;
+ more = B_FALSE;
+
+ if (vdev_indirect_splits_checksum_validate(iv, zio) == 0)
+ return (0);
+
+ for (indirect_split_t *is = list_head(&iv->iv_splits);
+ is != NULL; is = list_next(&iv->iv_splits, is)) {
+ is->is_good_child = list_next(&is->is_unique_child,
+ is->is_good_child);
+ if (is->is_good_child != NULL) {
+ more = B_TRUE;
+ break;
+ }
+
+ is->is_good_child = list_head(&is->is_unique_child);
+ }
+ }
+
+ ASSERT3S(iv->iv_attempts, <=, iv->iv_unique_combinations);
+
+ return (SET_ERROR(ECKSUM));
+}
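
The "carry over" scheme described in the comment above vdev_indirect_splits_enumerate_all() is mixed-radix counting: each split contributes one digit, and the digit's base is that split's number of unique copies. The following standalone sketch uses made-up bases (2, 1, and 3 unique copies for three splits) and plain arrays instead of the ZFS lists:

    #include <stdio.h>

    int
    main(void)
    {
        int base[3] = { 2, 1, 3 };      /* unique copies per split (made up) */
        int digit[3] = { 0, 0, 0 };     /* currently selected copy per split */
        int more = 1;

        while (more) {
            /* here the real code would validate the checksum */
            printf("try combination %d %d %d\n", digit[0], digit[1], digit[2]);

            more = 0;
            for (int i = 0; i < 3; i++) {
                if (++digit[i] < base[i]) {
                    more = 1;           /* no carry needed, try this one */
                    break;
                }
                digit[i] = 0;           /* wrapped: carry into the next digit */
            }
        }
        return (0);                     /* prints 2 * 1 * 3 = 6 combinations */
    }
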
+
+/*
+ * There are too many combinations to try all of them in a reasonable amount
+ * of time. So try a fixed number of random combinations from the unique
+ * split versions, after which we'll consider the block unrecoverable.
+ */
+static int
+vdev_indirect_splits_enumerate_randomly(indirect_vsd_t *iv, zio_t *zio)
+{
+ iv->iv_attempts = 0;
+
+ while (iv->iv_attempts < iv->iv_attempts_max) {
+ iv->iv_attempts++;
+
+ for (indirect_split_t *is = list_head(&iv->iv_splits);
+ is != NULL; is = list_next(&iv->iv_splits, is)) {
+ indirect_child_t *ic = list_head(&is->is_unique_child);
+ int children = is->is_unique_children;
+
+ for (int i = spa_get_random(children); i > 0; i--)
+ ic = list_next(&is->is_unique_child, ic);
+
+ ASSERT3P(ic, !=, NULL);
+ is->is_good_child = ic;
+ }
+
+ if (vdev_indirect_splits_checksum_validate(iv, zio) == 0)
+ return (0);
+ }
+
+ return (SET_ERROR(ECKSUM));
+}
+
+/*
+ * This is a validation function for reconstruction. It randomly selects
+ * a good combination, if one can be found, and then it intentionally
+ * damages all other segment copies by zeroing them. This forces the
+ * reconstruction algorithm to locate the one remaining known good copy.
+ */
+static int
+vdev_indirect_splits_damage(indirect_vsd_t *iv, zio_t *zio)
+{
+ int error;
+
+ /* Presume all the copies are unique for initial selection. */
+ for (indirect_split_t *is = list_head(&iv->iv_splits);
+ is != NULL; is = list_next(&iv->iv_splits, is)) {
+ is->is_unique_children = 0;
+
+ for (int i = 0; i < is->is_children; i++) {
+ indirect_child_t *ic = &is->is_child[i];
+ if (ic->ic_data != NULL) {
+ is->is_unique_children++;
+ list_insert_tail(&is->is_unique_child, ic);
+ }
+ }
+
+ if (list_is_empty(&is->is_unique_child)) {
+ error = SET_ERROR(EIO);
+ goto out;
+ }
+ }
+
+ /*
+ * Set each is_good_child to a randomly-selected child which
+ * is known to contain validated data.
+ */
+ error = vdev_indirect_splits_enumerate_randomly(iv, zio);
+ if (error)
+ goto out;
+
+ /*
+ * Damage all but the known good copy by zeroing it. This will
+	 * result in at most two unique copies per indirect_split_t.
+	 * Both may need to be checked in order to reconstruct the block.
+	 * Set iv->iv_attempts_max such that all unique combinations will
+	 * be enumerated, but limit the damage to at most 12 indirect splits.
+ */
+ iv->iv_attempts_max = 1;
+
+ for (indirect_split_t *is = list_head(&iv->iv_splits);
+ is != NULL; is = list_next(&iv->iv_splits, is)) {
+ for (int c = 0; c < is->is_children; c++) {
+ indirect_child_t *ic = &is->is_child[c];
+
+ if (ic == is->is_good_child)
+ continue;
+ if (ic->ic_data == NULL)
+ continue;
+
+ abd_zero(ic->ic_data, abd_get_size(ic->ic_data));
+ }
+
+ iv->iv_attempts_max *= 2;
+ if (iv->iv_attempts_max >= (1ULL << 12)) {
+ iv->iv_attempts_max = UINT64_MAX;
+ break;
+ }
+ }
+
+out:
+ /* Empty the unique children lists so they can be reconstructed. */
+ for (indirect_split_t *is = list_head(&iv->iv_splits);
+ is != NULL; is = list_next(&iv->iv_splits, is)) {
+ indirect_child_t *ic;
+ while ((ic = list_head(&is->is_unique_child)) != NULL)
+ list_remove(&is->is_unique_child, ic);
+
+ is->is_unique_children = 0;
+ }
+
+ return (error);
+}
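
To put numbers on the limit above: iv_attempts_max starts at 1 and doubles once per damaged split, so after zeroing copies in 5 splits it is 2^5 = 32, which is enough to enumerate every combination the zeroing can leave behind (at most two unique copies per split). Once the doubling reaches 2^12 = 4096, the value is clamped to UINT64_MAX and the loop stops damaging further splits, which is the "at most 12 indirect splits" limit mentioned in the comment.
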
+
+/*
+ * This function is called when we have read all copies of the data and need
+ * to try to find a combination of copies that gives us the right checksum.
+ *
+ * If we pointed to any mirror vdevs, this effectively does the job of the
+ * mirror. The mirror vdev code can't do its own job because we don't know
+ * the checksum of each split segment individually.
+ *
+ * We have to try every unique combination of copies of split segments, until
+ * we find one that checksums correctly. Duplicate segment copies are first
+ * identified and later skipped during reconstruction. This optimization
+ * reduces the search space and ensures that of the remaining combinations
+ * at most one is correct.
+ *
+ * When the total number of combinations is small they can all be checked.
+ * For example, if we have 3 segments in the split, and each points to a
+ * 2-way mirror with unique copies, we will have the following pieces of data:
+ *
+ * | mirror child
+ * split | [0] [1]
+ * ======|=====================
+ * A | data_A_0 data_A_1
+ * B | data_B_0 data_B_1
+ * C | data_C_0 data_C_1
+ *
+ * We will try the following (mirror children)^(number of splits) (2^3=8)
+ * combinations, which is similar to bitwise-little-endian counting in
+ * binary. In general each "digit" corresponds to a split segment, and the
+ * base of each digit is is_children, which can be different for each
+ * digit.
+ *
+ * "low bit" "high bit"
+ * v v
+ * data_A_0 data_B_0 data_C_0
+ * data_A_1 data_B_0 data_C_0
+ * data_A_0 data_B_1 data_C_0
+ * data_A_1 data_B_1 data_C_0
+ * data_A_0 data_B_0 data_C_1
+ * data_A_1 data_B_0 data_C_1
+ * data_A_0 data_B_1 data_C_1
+ * data_A_1 data_B_1 data_C_1
+ *
+ * Note that the split segments may be on the same or different top-level
+ * vdevs. In either case, we may need to try lots of combinations (see
+ * zfs_reconstruct_indirect_combinations_max). This ensures that if a mirror
+ * has small silent errors on all of its children, we can still reconstruct
+ * the correct data, as long as those errors are at sufficiently-separated
+ * offsets (specifically, separated by the largest block size - default of
+ * 128KB, but up to 16MB).
+ */
+static void
+vdev_indirect_reconstruct_io_done(zio_t *zio)
+{
+ indirect_vsd_t *iv = zio->io_vsd;
+ boolean_t known_good = B_FALSE;
+ int error;
+
+ iv->iv_unique_combinations = 1;
+ iv->iv_attempts_max = UINT64_MAX;
+
+ if (zfs_reconstruct_indirect_combinations_max > 0)
+ iv->iv_attempts_max = zfs_reconstruct_indirect_combinations_max;
+
+ /*
+ * If nonzero, every 1/x blocks will be damaged, in order to validate
+ * reconstruction when there are split segments with damaged copies.
+	 * known_good will be B_TRUE when reconstruction is known to be
+	 * possible.
+ */
+ if (zfs_reconstruct_indirect_damage_fraction != 0 &&
+ spa_get_random(zfs_reconstruct_indirect_damage_fraction) == 0)
+ known_good = (vdev_indirect_splits_damage(iv, zio) == 0);
+
+ /*
+ * Determine the unique children for a split segment and add them
+ * to the is_unique_child list. By restricting reconstruction
+ * to these children, only unique combinations will be considered.
+ * This can vastly reduce the search space when there are a large
+ * number of indirect splits.
+ */
+ for (indirect_split_t *is = list_head(&iv->iv_splits);
+ is != NULL; is = list_next(&iv->iv_splits, is)) {
+ is->is_unique_children = 0;
+
+ for (int i = 0; i < is->is_children; i++) {
+ indirect_child_t *ic_i = &is->is_child[i];
+
+ if (ic_i->ic_data == NULL ||
+ ic_i->ic_duplicate != NULL)
+ continue;
+
+ for (int j = i + 1; j < is->is_children; j++) {
+ indirect_child_t *ic_j = &is->is_child[j];
+
+ if (ic_j->ic_data == NULL ||
+ ic_j->ic_duplicate != NULL)
+ continue;
+
+ if (abd_cmp(ic_i->ic_data, ic_j->ic_data) == 0)
+ ic_j->ic_duplicate = ic_i;
+ }
+
+ is->is_unique_children++;
+ list_insert_tail(&is->is_unique_child, ic_i);
+ }
+
+ /* Reconstruction is impossible, no valid children */
+ EQUIV(list_is_empty(&is->is_unique_child),
+ is->is_unique_children == 0);
+ if (list_is_empty(&is->is_unique_child)) {
+ zio->io_error = EIO;
+ vdev_indirect_all_checksum_errors(zio);
+ zio_checksum_verified(zio);
+ return;
+ }
+
+ iv->iv_unique_combinations *= is->is_unique_children;
+ }
+
+ if (iv->iv_unique_combinations <= iv->iv_attempts_max)
+ error = vdev_indirect_splits_enumerate_all(iv, zio);
+ else
+ error = vdev_indirect_splits_enumerate_randomly(iv, zio);
+
+ if (error != 0) {
+ /* All attempted combinations failed. */
+ ASSERT3B(known_good, ==, B_FALSE);
+ zio->io_error = error;
+ vdev_indirect_all_checksum_errors(zio);
+ } else {
+ /*
+ * The checksum has been successfully validated. Issue
+ * repair I/Os to any copies of splits which don't match
+ * the validated version.
+ */
+ ASSERT0(vdev_indirect_splits_checksum_validate(iv, zio));
+ vdev_indirect_repair(zio);
+ zio_checksum_verified(zio);
+ }
+}
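
To make the sizing concrete (illustrative numbers only): with the three 2-way-mirrored splits from the table in the comment above vdev_indirect_reconstruct_io_done(), iv_unique_combinations starts at 2 * 2 * 2 = 8, so vdev_indirect_splits_enumerate_all() is chosen whenever zfs_reconstruct_indirect_combinations_max is at least 8 (or 0, meaning unlimited). If the two copies of split B happen to be byte-identical, the abd_cmp() pass above marks one of them as a duplicate and only 2 * 1 * 2 = 4 combinations remain to be tried.
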
+
+static void
+vdev_indirect_io_done(zio_t *zio)
+{
+ indirect_vsd_t *iv = zio->io_vsd;
+
+ if (iv->iv_reconstruct) {
+ /*
+ * We have read all copies of the data (e.g. from mirrors),
+ * either because this was a scrub/resilver, or because the
+ * one-copy read didn't checksum correctly.
+ */
+ vdev_indirect_reconstruct_io_done(zio);
+ return;
+ }
+
+ if (!iv->iv_split_block) {
+ /*
+ * This was not a split block, so we passed the BP down,
+ * and the checksum was handled by the (one) child zio.
+ */
+ return;
+ }
+
+ zio_bad_cksum_t zbc;
+ int ret = zio_checksum_error(zio, &zbc);
+ if (ret == 0) {
+ zio_checksum_verified(zio);
+ return;
+ }
+
+ /*
+ * The checksum didn't match. Read all copies of all splits, and
+ * then we will try to reconstruct. The next time
+ * vdev_indirect_io_done() is called, iv_reconstruct will be set.
+ */
+ vdev_indirect_read_all(zio);
+
+ zio_vdev_io_redone(zio);
+}
+
+vdev_ops_t vdev_indirect_ops = {
+ .vdev_op_init = NULL,
+ .vdev_op_fini = NULL,
+ .vdev_op_open = vdev_indirect_open,
+ .vdev_op_close = vdev_indirect_close,
+ .vdev_op_asize = vdev_default_asize,
+ .vdev_op_min_asize = vdev_default_min_asize,
+ .vdev_op_min_alloc = NULL,
+ .vdev_op_io_start = vdev_indirect_io_start,
+ .vdev_op_io_done = vdev_indirect_io_done,
+ .vdev_op_state_change = NULL,
+ .vdev_op_need_resilver = NULL,
+ .vdev_op_hold = NULL,
+ .vdev_op_rele = NULL,
+ .vdev_op_remap = vdev_indirect_remap,
+ .vdev_op_xlate = NULL,
+ .vdev_op_rebuild_asize = NULL,
+ .vdev_op_metaslab_init = NULL,
+ .vdev_op_config_generate = NULL,
+ .vdev_op_nparity = NULL,
+ .vdev_op_ndisks = NULL,
+ .vdev_op_type = VDEV_TYPE_INDIRECT, /* name of this vdev type */
+ .vdev_op_leaf = B_FALSE /* leaf vdev */
+};
+
+EXPORT_SYMBOL(spa_condense_fini);
+EXPORT_SYMBOL(spa_start_indirect_condensing_thread);
+EXPORT_SYMBOL(spa_condense_indirect_start_sync);
+EXPORT_SYMBOL(spa_condense_init);
+EXPORT_SYMBOL(spa_vdev_indirect_mark_obsolete);
+EXPORT_SYMBOL(vdev_indirect_mark_obsolete);
+EXPORT_SYMBOL(vdev_indirect_should_condense);
+EXPORT_SYMBOL(vdev_indirect_sync_obsolete);
+EXPORT_SYMBOL(vdev_obsolete_counts_are_precise);
+EXPORT_SYMBOL(vdev_obsolete_sm_object);
+
+/* BEGIN CSTYLED */
+ZFS_MODULE_PARAM(zfs_condense, zfs_condense_, indirect_vdevs_enable, INT, ZMOD_RW,
+ "Whether to attempt condensing indirect vdev mappings");
+
+ZFS_MODULE_PARAM(zfs_condense, zfs_condense_, min_mapping_bytes, ULONG, ZMOD_RW,
+ "Don't bother condensing if the mapping uses less than this amount of "
+ "memory");
+
+ZFS_MODULE_PARAM(zfs_condense, zfs_condense_, max_obsolete_bytes, ULONG, ZMOD_RW,
+ "Minimum size obsolete spacemap to attempt condensing");
+
+ZFS_MODULE_PARAM(zfs_condense, zfs_condense_, indirect_commit_entry_delay_ms, INT, ZMOD_RW,
+ "Used by tests to ensure certain actions happen in the middle of a "
+ "condense. A maximum value of 1 should be sufficient.");
+
+ZFS_MODULE_PARAM(zfs_reconstruct, zfs_reconstruct_, indirect_combinations_max, INT, ZMOD_RW,
+ "Maximum number of combinations when reconstructing split segments");
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/zfs/vdev_indirect_births.c b/sys/contrib/openzfs/module/zfs/vdev_indirect_births.c
new file mode 100644
index 000000000000..99b83c392257
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/vdev_indirect_births.c
@@ -0,0 +1,226 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2015 by Delphix. All rights reserved.
+ */
+
+#include <sys/dmu_tx.h>
+#include <sys/spa.h>
+#include <sys/dmu.h>
+#include <sys/dsl_pool.h>
+#include <sys/vdev_indirect_births.h>
+
+#ifdef ZFS_DEBUG
+static boolean_t
+vdev_indirect_births_verify(vdev_indirect_births_t *vib)
+{
+ ASSERT(vib != NULL);
+
+ ASSERT(vib->vib_object != 0);
+ ASSERT(vib->vib_objset != NULL);
+ ASSERT(vib->vib_phys != NULL);
+ ASSERT(vib->vib_dbuf != NULL);
+
+ EQUIV(vib->vib_phys->vib_count > 0, vib->vib_entries != NULL);
+
+ return (B_TRUE);
+}
+#endif
+
+uint64_t
+vdev_indirect_births_count(vdev_indirect_births_t *vib)
+{
+ ASSERT(vdev_indirect_births_verify(vib));
+
+ return (vib->vib_phys->vib_count);
+}
+
+uint64_t
+vdev_indirect_births_object(vdev_indirect_births_t *vib)
+{
+ ASSERT(vdev_indirect_births_verify(vib));
+
+ return (vib->vib_object);
+}
+
+static uint64_t
+vdev_indirect_births_size_impl(vdev_indirect_births_t *vib)
+{
+ return (vib->vib_phys->vib_count * sizeof (*vib->vib_entries));
+}
+
+void
+vdev_indirect_births_close(vdev_indirect_births_t *vib)
+{
+ ASSERT(vdev_indirect_births_verify(vib));
+
+ if (vib->vib_phys->vib_count > 0) {
+ uint64_t births_size = vdev_indirect_births_size_impl(vib);
+
+ vmem_free(vib->vib_entries, births_size);
+ vib->vib_entries = NULL;
+ }
+
+ dmu_buf_rele(vib->vib_dbuf, vib);
+
+ vib->vib_objset = NULL;
+ vib->vib_object = 0;
+ vib->vib_dbuf = NULL;
+ vib->vib_phys = NULL;
+
+ kmem_free(vib, sizeof (*vib));
+}
+
+uint64_t
+vdev_indirect_births_alloc(objset_t *os, dmu_tx_t *tx)
+{
+ ASSERT(dmu_tx_is_syncing(tx));
+
+ return (dmu_object_alloc(os,
+ DMU_OTN_UINT64_METADATA, SPA_OLD_MAXBLOCKSIZE,
+ DMU_OTN_UINT64_METADATA, sizeof (vdev_indirect_birth_phys_t),
+ tx));
+}
+
+vdev_indirect_births_t *
+vdev_indirect_births_open(objset_t *os, uint64_t births_object)
+{
+ vdev_indirect_births_t *vib = kmem_zalloc(sizeof (*vib), KM_SLEEP);
+
+ vib->vib_objset = os;
+ vib->vib_object = births_object;
+
+ VERIFY0(dmu_bonus_hold(os, vib->vib_object, vib, &vib->vib_dbuf));
+ vib->vib_phys = vib->vib_dbuf->db_data;
+
+ if (vib->vib_phys->vib_count > 0) {
+ uint64_t births_size = vdev_indirect_births_size_impl(vib);
+ vib->vib_entries = vmem_alloc(births_size, KM_SLEEP);
+ VERIFY0(dmu_read(vib->vib_objset, vib->vib_object, 0,
+ births_size, vib->vib_entries, DMU_READ_PREFETCH));
+ }
+
+ ASSERT(vdev_indirect_births_verify(vib));
+
+ return (vib);
+}
+
+void
+vdev_indirect_births_free(objset_t *os, uint64_t object, dmu_tx_t *tx)
+{
+ VERIFY0(dmu_object_free(os, object, tx));
+}
+
+void
+vdev_indirect_births_add_entry(vdev_indirect_births_t *vib,
+ uint64_t max_offset, uint64_t txg, dmu_tx_t *tx)
+{
+ vdev_indirect_birth_entry_phys_t vibe;
+ uint64_t old_size;
+ uint64_t new_size;
+ vdev_indirect_birth_entry_phys_t *new_entries;
+
+ ASSERT(dmu_tx_is_syncing(tx));
+ ASSERT(dsl_pool_sync_context(dmu_tx_pool(tx)));
+ ASSERT(vdev_indirect_births_verify(vib));
+
+ dmu_buf_will_dirty(vib->vib_dbuf, tx);
+
+ vibe.vibe_offset = max_offset;
+ vibe.vibe_phys_birth_txg = txg;
+
+ old_size = vdev_indirect_births_size_impl(vib);
+ dmu_write(vib->vib_objset, vib->vib_object, old_size, sizeof (vibe),
+ &vibe, tx);
+ vib->vib_phys->vib_count++;
+ new_size = vdev_indirect_births_size_impl(vib);
+
+ new_entries = vmem_alloc(new_size, KM_SLEEP);
+ if (old_size > 0) {
+ bcopy(vib->vib_entries, new_entries, old_size);
+ vmem_free(vib->vib_entries, old_size);
+ }
+ new_entries[vib->vib_phys->vib_count - 1] = vibe;
+ vib->vib_entries = new_entries;
+}
+
+uint64_t
+vdev_indirect_births_last_entry_txg(vdev_indirect_births_t *vib)
+{
+ ASSERT(vdev_indirect_births_verify(vib));
+ ASSERT(vib->vib_phys->vib_count > 0);
+
+ vdev_indirect_birth_entry_phys_t *last =
+ &vib->vib_entries[vib->vib_phys->vib_count - 1];
+ return (last->vibe_phys_birth_txg);
+}
+
+/*
+ * Return the txg in which the given range was copied (i.e. its physical
+ * birth txg). The specified offset+asize must be contiguously mapped
+ * (i.e. not a split block).
+ *
+ * The entries are sorted by increasing phys_birth, and also by increasing
+ * offset. We find the specified offset by binary search. Note that we
+ * can not use bsearch() because looking at each entry independently is
+ * insufficient to find the correct entry. Each entry implicitly relies
+ * on the previous entry: an entry indicates that the offsets from the
+ * end of the previous entry to the end of this entry were written in the
+ * specified txg.
+ */
+uint64_t
+vdev_indirect_births_physbirth(vdev_indirect_births_t *vib, uint64_t offset,
+ uint64_t asize)
+{
+ vdev_indirect_birth_entry_phys_t *base;
+ vdev_indirect_birth_entry_phys_t *last;
+
+ ASSERT(vdev_indirect_births_verify(vib));
+ ASSERT(vib->vib_phys->vib_count > 0);
+
+ base = vib->vib_entries;
+ last = base + vib->vib_phys->vib_count - 1;
+
+ ASSERT3U(offset, <, last->vibe_offset);
+
+ while (last >= base) {
+ vdev_indirect_birth_entry_phys_t *p =
+ base + ((last - base) / 2);
+ if (offset >= p->vibe_offset) {
+ base = p + 1;
+ } else if (p == vib->vib_entries ||
+ offset >= (p - 1)->vibe_offset) {
+ ASSERT3U(offset + asize, <=, p->vibe_offset);
+ return (p->vibe_phys_birth_txg);
+ } else {
+ last = p - 1;
+ }
+ }
+ ASSERT(!"offset not found");
+ return (-1);
+}
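
The following is a self-contained sketch of the lookup semantics above, using a made-up in-memory array instead of an on-disk births object (the struct and function names are hypothetical). Entry i covers the offsets in (entries[i-1].end, entries[i].end], so each entry implicitly relies on where the previous one stopped:

    #include <stdio.h>
    #include <stdint.h>

    typedef struct birth {
        uint64_t b_end;     /* like vibe_offset: end of the covered range */
        uint64_t b_txg;     /* like vibe_phys_birth_txg */
    } birth_t;

    static uint64_t
    physbirth(const birth_t *e, int n, uint64_t off)
    {
        int lo = 0, hi = n - 1;

        while (lo <= hi) {
            int mid = lo + (hi - lo) / 2;

            if (off >= e[mid].b_end)
                lo = mid + 1;
            else if (mid == 0 || off >= e[mid - 1].b_end)
                return (e[mid].b_txg);      /* off falls in entry mid */
            else
                hi = mid - 1;
        }
        return (UINT64_MAX);                /* off is beyond the last entry */
    }

    int
    main(void)
    {
        birth_t e[] = { { 100, 10 }, { 250, 11 }, { 400, 15 } };

        printf("%llu\n", (unsigned long long)physbirth(e, 3, 0));    /* 10 */
        printf("%llu\n", (unsigned long long)physbirth(e, 3, 100));  /* 11 */
        printf("%llu\n", (unsigned long long)physbirth(e, 3, 399));  /* 15 */
        return (0);
    }
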
+
+#if defined(_KERNEL)
+EXPORT_SYMBOL(vdev_indirect_births_add_entry);
+EXPORT_SYMBOL(vdev_indirect_births_alloc);
+EXPORT_SYMBOL(vdev_indirect_births_close);
+EXPORT_SYMBOL(vdev_indirect_births_count);
+EXPORT_SYMBOL(vdev_indirect_births_free);
+EXPORT_SYMBOL(vdev_indirect_births_last_entry_txg);
+EXPORT_SYMBOL(vdev_indirect_births_object);
+EXPORT_SYMBOL(vdev_indirect_births_open);
+EXPORT_SYMBOL(vdev_indirect_births_physbirth);
+#endif
diff --git a/sys/contrib/openzfs/module/zfs/vdev_indirect_mapping.c b/sys/contrib/openzfs/module/zfs/vdev_indirect_mapping.c
new file mode 100644
index 000000000000..bb484a401b1b
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/vdev_indirect_mapping.c
@@ -0,0 +1,616 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2015, 2017 by Delphix. All rights reserved.
+ */
+
+#include <sys/dmu_tx.h>
+#include <sys/dsl_pool.h>
+#include <sys/spa.h>
+#include <sys/vdev_impl.h>
+#include <sys/vdev_indirect_mapping.h>
+#include <sys/zfeature.h>
+#include <sys/dmu_objset.h>
+
+#ifdef ZFS_DEBUG
+static boolean_t
+vdev_indirect_mapping_verify(vdev_indirect_mapping_t *vim)
+{
+ ASSERT(vim != NULL);
+
+ ASSERT(vim->vim_object != 0);
+ ASSERT(vim->vim_objset != NULL);
+ ASSERT(vim->vim_phys != NULL);
+ ASSERT(vim->vim_dbuf != NULL);
+
+ EQUIV(vim->vim_phys->vimp_num_entries > 0,
+ vim->vim_entries != NULL);
+ if (vim->vim_phys->vimp_num_entries > 0) {
+ vdev_indirect_mapping_entry_phys_t *last_entry __maybe_unused =
+ &vim->vim_entries[vim->vim_phys->vimp_num_entries - 1];
+ uint64_t offset __maybe_unused =
+ DVA_MAPPING_GET_SRC_OFFSET(last_entry);
+ uint64_t size __maybe_unused =
+ DVA_GET_ASIZE(&last_entry->vimep_dst);
+
+ ASSERT3U(vim->vim_phys->vimp_max_offset, >=, offset + size);
+ }
+ if (vim->vim_havecounts) {
+ ASSERT(vim->vim_phys->vimp_counts_object != 0);
+ }
+
+ return (B_TRUE);
+}
+#endif
+
+uint64_t
+vdev_indirect_mapping_num_entries(vdev_indirect_mapping_t *vim)
+{
+ ASSERT(vdev_indirect_mapping_verify(vim));
+
+ return (vim->vim_phys->vimp_num_entries);
+}
+
+uint64_t
+vdev_indirect_mapping_max_offset(vdev_indirect_mapping_t *vim)
+{
+ ASSERT(vdev_indirect_mapping_verify(vim));
+
+ return (vim->vim_phys->vimp_max_offset);
+}
+
+uint64_t
+vdev_indirect_mapping_object(vdev_indirect_mapping_t *vim)
+{
+ ASSERT(vdev_indirect_mapping_verify(vim));
+
+ return (vim->vim_object);
+}
+
+uint64_t
+vdev_indirect_mapping_bytes_mapped(vdev_indirect_mapping_t *vim)
+{
+ ASSERT(vdev_indirect_mapping_verify(vim));
+
+ return (vim->vim_phys->vimp_bytes_mapped);
+}
+
+/*
+ * The length (in bytes) of the mapping object array in memory and
+ * (logically) on disk.
+ *
+ * Note that unlike most of our accessor functions,
+ * we don't assert that the struct is consistent; therefore it can be
+ * called while there may be concurrent changes, if we don't care about
+ * the value being immediately stale (e.g. from spa_removal_get_stats()).
+ */
+uint64_t
+vdev_indirect_mapping_size(vdev_indirect_mapping_t *vim)
+{
+ return (vim->vim_phys->vimp_num_entries * sizeof (*vim->vim_entries));
+}
+
+/*
+ * Compare an offset with an indirect mapping entry; there are three
+ * possible scenarios:
+ *
+ * 1. The offset is "less than" the mapping entry; meaning the
+ * offset is less than the source offset of the mapping entry. In
+ * this case, there is no overlap between the offset and the
+ * mapping entry and -1 will be returned.
+ *
+ * 2. The offset is "greater than" the mapping entry; meaning the
+ * offset is greater than the mapping entry's source offset plus
+ * the entry's size. In this case, there is no overlap between
+ * the offset and the mapping entry and 1 will be returned.
+ *
+ * NOTE: If the offset is actually equal to the entry's offset
+ * plus size, this is considered to be "greater" than the entry,
+ * and this case applies (i.e. 1 will be returned). Thus, the
+ * entry's "range" can be considered to be inclusive at its
+ * start, but exclusive at its end: e.g. [src, src + size).
+ *
+ * 3. The last case to consider is if the offset actually falls
+ * within the mapping entry's range. If this is the case, the
+ * offset is considered to be "equal to" the mapping entry and
+ * 0 will be returned.
+ *
+ * NOTE: If the offset is equal to the entry's source offset,
+ * this case applies and 0 will be returned. If the offset is
+ * equal to the entry's source plus its size, this case does
+ * *not* apply (see "NOTE" above for scenario 2), and 1 will be
+ * returned.
+ */
+static int
+dva_mapping_overlap_compare(const void *v_key, const void *v_array_elem)
+{
+ const uint64_t * const key = v_key;
+ const vdev_indirect_mapping_entry_phys_t * const array_elem =
+ v_array_elem;
+ uint64_t src_offset = DVA_MAPPING_GET_SRC_OFFSET(array_elem);
+
+ if (*key < src_offset) {
+ return (-1);
+ } else if (*key < src_offset + DVA_GET_ASIZE(&array_elem->vimep_dst)) {
+ return (0);
+ } else {
+ return (1);
+ }
+}
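
As a concrete illustration of the three scenarios (with a made-up entry, not on-disk data): for a mapping entry whose source range is [100, 150), i.e. src_offset = 100 and asize = 50, the compare returns:

    key =  99  ->  -1   (entirely before the entry)
    key = 100  ->   0   (the start of the range is inclusive)
    key = 149  ->   0   (the last offset inside the range)
    key = 150  ->   1   (src + size is exclusive, already past the entry)
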
+
+/*
+ * Returns the mapping entry for the given offset.
+ *
+ * It's possible that the given offset will not be in the mapping table
+ * (i.e. no mapping entries contain this offset), in which case, the
+ * return value depends on the "next_if_missing" parameter.
+ *
+ * If the offset is not found in the table and "next_if_missing" is
+ * B_FALSE, then NULL will always be returned. The behavior is intended
+ * to allow consumers to get the entry corresponding to the offset
+ * parameter, iff the offset overlaps with an entry in the table.
+ *
+ * If the offset is not found in the table and "next_if_missing" is
+ * B_TRUE, then the entry nearest to the given offset will be returned,
+ * such that the entry's source offset is greater than the offset
+ * passed in (i.e. the "next" mapping entry in the table is returned, if
+ * the offset is missing from the table). If there are no entries whose
+ * source offset is greater than the passed in offset, NULL is returned.
+ */
+static vdev_indirect_mapping_entry_phys_t *
+vdev_indirect_mapping_entry_for_offset_impl(vdev_indirect_mapping_t *vim,
+ uint64_t offset, boolean_t next_if_missing)
+{
+ ASSERT(vdev_indirect_mapping_verify(vim));
+ ASSERT(vim->vim_phys->vimp_num_entries > 0);
+
+ vdev_indirect_mapping_entry_phys_t *entry = NULL;
+
+ uint64_t last = vim->vim_phys->vimp_num_entries - 1;
+ uint64_t base = 0;
+
+ /*
+ * We don't define these inside of the while loop because we use
+ * their value in the case that offset isn't in the mapping.
+ */
+ uint64_t mid;
+ int result;
+
+ while (last >= base) {
+ mid = base + ((last - base) >> 1);
+
+ result = dva_mapping_overlap_compare(&offset,
+ &vim->vim_entries[mid]);
+
+ if (result == 0) {
+ entry = &vim->vim_entries[mid];
+ break;
+ } else if (result < 0) {
+ last = mid - 1;
+ } else {
+ base = mid + 1;
+ }
+ }
+
+ if (entry == NULL && next_if_missing) {
+ ASSERT3U(base, ==, last + 1);
+ ASSERT(mid == base || mid == last);
+ ASSERT3S(result, !=, 0);
+
+ /*
+ * The offset we're looking for isn't actually contained
+ * in the mapping table, thus we need to return the
+ * closest mapping entry that is greater than the
+ * offset. We reuse the result of the last comparison,
+ * comparing the mapping entry at index "mid" and the
+ * offset. The offset is guaranteed to lie between
+ * indices one less than "mid", and one greater than
+ * "mid"; we just need to determine if offset is greater
+ * than, or less than the mapping entry contained at
+ * index "mid".
+ */
+
+ uint64_t index;
+ if (result < 0)
+ index = mid;
+ else
+ index = mid + 1;
+
+ ASSERT3U(index, <=, vim->vim_phys->vimp_num_entries);
+
+ if (index == vim->vim_phys->vimp_num_entries) {
+ /*
+ * If "index" is past the end of the entries
+ * array, then not only is the offset not in the
+ * mapping table, but it's actually greater than
+ * all entries in the table. In this case, we
+ * can't return a mapping entry greater than the
+ * offset (since none exist), so we return NULL.
+ */
+
+ ASSERT3S(dva_mapping_overlap_compare(&offset,
+ &vim->vim_entries[index - 1]), >, 0);
+
+ return (NULL);
+ } else {
+ /*
+ * Just to be safe, we verify the offset falls
+ * in between the mapping entries at index and
+ * one less than index. Since we know the offset
+ * doesn't overlap an entry, and we're supposed
+ * to return the entry just greater than the
+ * offset, both of the following tests must be
+ * true.
+ */
+ ASSERT3S(dva_mapping_overlap_compare(&offset,
+ &vim->vim_entries[index]), <, 0);
+ IMPLY(index >= 1, dva_mapping_overlap_compare(&offset,
+ &vim->vim_entries[index - 1]) > 0);
+
+ return (&vim->vim_entries[index]);
+ }
+ } else {
+ return (entry);
+ }
+}
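
For example (with made-up entries whose source ranges are [0, 100) and [200, 300)): an offset of 50 returns the first entry regardless of next_if_missing; an offset of 150 returns NULL when next_if_missing is B_FALSE and the [200, 300) entry when it is B_TRUE; and an offset of 350 returns NULL in both cases, since no entry starts beyond it.
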
+
+vdev_indirect_mapping_entry_phys_t *
+vdev_indirect_mapping_entry_for_offset(vdev_indirect_mapping_t *vim,
+ uint64_t offset)
+{
+ return (vdev_indirect_mapping_entry_for_offset_impl(vim, offset,
+ B_FALSE));
+}
+
+vdev_indirect_mapping_entry_phys_t *
+vdev_indirect_mapping_entry_for_offset_or_next(vdev_indirect_mapping_t *vim,
+ uint64_t offset)
+{
+ return (vdev_indirect_mapping_entry_for_offset_impl(vim, offset,
+ B_TRUE));
+}
+
+void
+vdev_indirect_mapping_close(vdev_indirect_mapping_t *vim)
+{
+ ASSERT(vdev_indirect_mapping_verify(vim));
+
+ if (vim->vim_phys->vimp_num_entries > 0) {
+ uint64_t map_size = vdev_indirect_mapping_size(vim);
+ vmem_free(vim->vim_entries, map_size);
+ vim->vim_entries = NULL;
+ }
+
+ dmu_buf_rele(vim->vim_dbuf, vim);
+
+ vim->vim_objset = NULL;
+ vim->vim_object = 0;
+ vim->vim_dbuf = NULL;
+ vim->vim_phys = NULL;
+
+ kmem_free(vim, sizeof (*vim));
+}
+
+uint64_t
+vdev_indirect_mapping_alloc(objset_t *os, dmu_tx_t *tx)
+{
+ uint64_t object;
+ ASSERT(dmu_tx_is_syncing(tx));
+ uint64_t bonus_size = VDEV_INDIRECT_MAPPING_SIZE_V0;
+
+ if (spa_feature_is_enabled(os->os_spa, SPA_FEATURE_OBSOLETE_COUNTS)) {
+ bonus_size = sizeof (vdev_indirect_mapping_phys_t);
+ }
+
+ object = dmu_object_alloc(os,
+ DMU_OTN_UINT64_METADATA, SPA_OLD_MAXBLOCKSIZE,
+ DMU_OTN_UINT64_METADATA, bonus_size,
+ tx);
+
+ if (spa_feature_is_enabled(os->os_spa, SPA_FEATURE_OBSOLETE_COUNTS)) {
+ dmu_buf_t *dbuf;
+ vdev_indirect_mapping_phys_t *vimp;
+
+ VERIFY0(dmu_bonus_hold(os, object, FTAG, &dbuf));
+ dmu_buf_will_dirty(dbuf, tx);
+ vimp = dbuf->db_data;
+ vimp->vimp_counts_object = dmu_object_alloc(os,
+ DMU_OTN_UINT32_METADATA, SPA_OLD_MAXBLOCKSIZE,
+ DMU_OT_NONE, 0, tx);
+ spa_feature_incr(os->os_spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
+ dmu_buf_rele(dbuf, FTAG);
+ }
+
+ return (object);
+}
+
+
+vdev_indirect_mapping_t *
+vdev_indirect_mapping_open(objset_t *os, uint64_t mapping_object)
+{
+ vdev_indirect_mapping_t *vim = kmem_zalloc(sizeof (*vim), KM_SLEEP);
+ dmu_object_info_t doi;
+ VERIFY0(dmu_object_info(os, mapping_object, &doi));
+
+ vim->vim_objset = os;
+ vim->vim_object = mapping_object;
+
+ VERIFY0(dmu_bonus_hold(os, vim->vim_object, vim,
+ &vim->vim_dbuf));
+ vim->vim_phys = vim->vim_dbuf->db_data;
+
+ vim->vim_havecounts =
+ (doi.doi_bonus_size > VDEV_INDIRECT_MAPPING_SIZE_V0);
+
+ if (vim->vim_phys->vimp_num_entries > 0) {
+ uint64_t map_size = vdev_indirect_mapping_size(vim);
+ vim->vim_entries = vmem_alloc(map_size, KM_SLEEP);
+ VERIFY0(dmu_read(os, vim->vim_object, 0, map_size,
+ vim->vim_entries, DMU_READ_PREFETCH));
+ }
+
+ ASSERT(vdev_indirect_mapping_verify(vim));
+
+ return (vim);
+}
+
+void
+vdev_indirect_mapping_free(objset_t *os, uint64_t object, dmu_tx_t *tx)
+{
+ vdev_indirect_mapping_t *vim = vdev_indirect_mapping_open(os, object);
+ if (vim->vim_havecounts) {
+ VERIFY0(dmu_object_free(os, vim->vim_phys->vimp_counts_object,
+ tx));
+ spa_feature_decr(os->os_spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
+ }
+ vdev_indirect_mapping_close(vim);
+
+ VERIFY0(dmu_object_free(os, object, tx));
+}
+
+/*
+ * Append the list of vdev_indirect_mapping_entry_t's to the on-disk
+ * mapping object. Also remove the entries from the list and free them.
+ * This also implicitly extends the max_offset of the mapping (to the end
+ * of the last entry).
+ */
+void
+vdev_indirect_mapping_add_entries(vdev_indirect_mapping_t *vim,
+ list_t *list, dmu_tx_t *tx)
+{
+ vdev_indirect_mapping_entry_phys_t *mapbuf;
+ uint64_t old_size;
+ uint32_t *countbuf = NULL;
+ vdev_indirect_mapping_entry_phys_t *old_entries;
+ uint64_t old_count;
+ uint64_t entries_written = 0;
+
+ ASSERT(vdev_indirect_mapping_verify(vim));
+ ASSERT(dmu_tx_is_syncing(tx));
+ ASSERT(dsl_pool_sync_context(dmu_tx_pool(tx)));
+ ASSERT(!list_is_empty(list));
+
+ old_size = vdev_indirect_mapping_size(vim);
+ old_entries = vim->vim_entries;
+ old_count = vim->vim_phys->vimp_num_entries;
+
+ dmu_buf_will_dirty(vim->vim_dbuf, tx);
+
+ mapbuf = vmem_alloc(SPA_OLD_MAXBLOCKSIZE, KM_SLEEP);
+ if (vim->vim_havecounts) {
+ countbuf = vmem_alloc(SPA_OLD_MAXBLOCKSIZE, KM_SLEEP);
+ ASSERT(spa_feature_is_active(vim->vim_objset->os_spa,
+ SPA_FEATURE_OBSOLETE_COUNTS));
+ }
+ while (!list_is_empty(list)) {
+ uint64_t i;
+ /*
+ * Write entries from the list to the
+ * vdev_im_object in batches of size SPA_OLD_MAXBLOCKSIZE.
+ */
+ for (i = 0; i < SPA_OLD_MAXBLOCKSIZE / sizeof (*mapbuf); i++) {
+ vdev_indirect_mapping_entry_t *entry =
+ list_remove_head(list);
+ if (entry == NULL)
+ break;
+
+ uint64_t size =
+ DVA_GET_ASIZE(&entry->vime_mapping.vimep_dst);
+ uint64_t src_offset =
+ DVA_MAPPING_GET_SRC_OFFSET(&entry->vime_mapping);
+
+ /*
+ * We shouldn't be adding an entry which is fully
+ * obsolete.
+ */
+ ASSERT3U(entry->vime_obsolete_count, <, size);
+ IMPLY(entry->vime_obsolete_count != 0,
+ vim->vim_havecounts);
+
+ mapbuf[i] = entry->vime_mapping;
+ if (vim->vim_havecounts)
+ countbuf[i] = entry->vime_obsolete_count;
+
+ vim->vim_phys->vimp_bytes_mapped += size;
+ ASSERT3U(src_offset, >=,
+ vim->vim_phys->vimp_max_offset);
+ vim->vim_phys->vimp_max_offset = src_offset + size;
+
+ entries_written++;
+
+ vmem_free(entry, sizeof (*entry));
+ }
+ dmu_write(vim->vim_objset, vim->vim_object,
+ vim->vim_phys->vimp_num_entries * sizeof (*mapbuf),
+ i * sizeof (*mapbuf),
+ mapbuf, tx);
+ if (vim->vim_havecounts) {
+ dmu_write(vim->vim_objset,
+ vim->vim_phys->vimp_counts_object,
+ vim->vim_phys->vimp_num_entries *
+ sizeof (*countbuf),
+ i * sizeof (*countbuf), countbuf, tx);
+ }
+ vim->vim_phys->vimp_num_entries += i;
+ }
+ vmem_free(mapbuf, SPA_OLD_MAXBLOCKSIZE);
+ if (vim->vim_havecounts)
+ vmem_free(countbuf, SPA_OLD_MAXBLOCKSIZE);
+
+ /*
+ * Update the entry array to reflect the new entries. First, copy
+ * over any old entries then read back the new entries we just wrote.
+ */
+ uint64_t new_size = vdev_indirect_mapping_size(vim);
+ ASSERT3U(new_size, >, old_size);
+ ASSERT3U(new_size - old_size, ==,
+ entries_written * sizeof (vdev_indirect_mapping_entry_phys_t));
+ vim->vim_entries = vmem_alloc(new_size, KM_SLEEP);
+ if (old_size > 0) {
+ bcopy(old_entries, vim->vim_entries, old_size);
+ vmem_free(old_entries, old_size);
+ }
+ VERIFY0(dmu_read(vim->vim_objset, vim->vim_object, old_size,
+ new_size - old_size, &vim->vim_entries[old_count],
+ DMU_READ_PREFETCH));
+
+ zfs_dbgmsg("txg %llu: wrote %llu entries to "
+ "indirect mapping obj %llu; max offset=0x%llx",
+ (u_longlong_t)dmu_tx_get_txg(tx),
+ (u_longlong_t)entries_written,
+ (u_longlong_t)vim->vim_object,
+ (u_longlong_t)vim->vim_phys->vimp_max_offset);
+}
+
+/*
+ * Increment the relevant counts for the specified offset and length.
+ * The counts array must be obtained from
+ * vdev_indirect_mapping_load_obsolete_counts().
+ */
+void
+vdev_indirect_mapping_increment_obsolete_count(vdev_indirect_mapping_t *vim,
+ uint64_t offset, uint64_t length, uint32_t *counts)
+{
+ vdev_indirect_mapping_entry_phys_t *mapping;
+ uint64_t index;
+
+ mapping = vdev_indirect_mapping_entry_for_offset(vim, offset);
+
+ ASSERT(length > 0);
+ ASSERT3P(mapping, !=, NULL);
+
+ index = mapping - vim->vim_entries;
+
+ while (length > 0) {
+ ASSERT3U(index, <, vdev_indirect_mapping_num_entries(vim));
+
+ uint64_t size = DVA_GET_ASIZE(&mapping->vimep_dst);
+ uint64_t inner_offset = offset -
+ DVA_MAPPING_GET_SRC_OFFSET(mapping);
+ VERIFY3U(inner_offset, <, size);
+ uint64_t inner_size = MIN(length, size - inner_offset);
+
+ VERIFY3U(counts[index] + inner_size, <=, size);
+ counts[index] += inner_size;
+
+ offset += inner_size;
+ length -= inner_size;
+ mapping++;
+ index++;
+ }
+}
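
Worked example (made-up mapping): suppose entry i covers source offsets [0, 64K) and entry i+1 covers [64K, 192K). A call with offset = 48K and length = 32K first charges 16K to entry i (inner_offset = 48K, inner_size = MIN(32K, 64K - 48K) = 16K), then advances to entry i+1 and charges the remaining 16K there (inner_offset = 0, inner_size = 16K).
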
+
+typedef struct load_obsolete_space_map_arg {
+ vdev_indirect_mapping_t *losma_vim;
+ uint32_t *losma_counts;
+} load_obsolete_space_map_arg_t;
+
+static int
+load_obsolete_sm_callback(space_map_entry_t *sme, void *arg)
+{
+ load_obsolete_space_map_arg_t *losma = arg;
+ ASSERT3S(sme->sme_type, ==, SM_ALLOC);
+
+ vdev_indirect_mapping_increment_obsolete_count(losma->losma_vim,
+ sme->sme_offset, sme->sme_run, losma->losma_counts);
+
+ return (0);
+}
+
+/*
+ * Modify the counts (increment them) based on the spacemap.
+ */
+void
+vdev_indirect_mapping_load_obsolete_spacemap(vdev_indirect_mapping_t *vim,
+ uint32_t *counts, space_map_t *obsolete_space_sm)
+{
+ load_obsolete_space_map_arg_t losma;
+ losma.losma_counts = counts;
+ losma.losma_vim = vim;
+ VERIFY0(space_map_iterate(obsolete_space_sm,
+ space_map_length(obsolete_space_sm),
+ load_obsolete_sm_callback, &losma));
+}
+
+/*
+ * Read the obsolete counts from disk, returning them in an array.
+ */
+uint32_t *
+vdev_indirect_mapping_load_obsolete_counts(vdev_indirect_mapping_t *vim)
+{
+ ASSERT(vdev_indirect_mapping_verify(vim));
+
+ uint64_t counts_size =
+ vim->vim_phys->vimp_num_entries * sizeof (uint32_t);
+ uint32_t *counts = vmem_alloc(counts_size, KM_SLEEP);
+ if (vim->vim_havecounts) {
+ VERIFY0(dmu_read(vim->vim_objset,
+ vim->vim_phys->vimp_counts_object,
+ 0, counts_size,
+ counts, DMU_READ_PREFETCH));
+ } else {
+ bzero(counts, counts_size);
+ }
+ return (counts);
+}
+
+extern void
+vdev_indirect_mapping_free_obsolete_counts(vdev_indirect_mapping_t *vim,
+ uint32_t *counts)
+{
+ ASSERT(vdev_indirect_mapping_verify(vim));
+
+ vmem_free(counts, vim->vim_phys->vimp_num_entries * sizeof (uint32_t));
+}
+
+#if defined(_KERNEL)
+EXPORT_SYMBOL(vdev_indirect_mapping_add_entries);
+EXPORT_SYMBOL(vdev_indirect_mapping_alloc);
+EXPORT_SYMBOL(vdev_indirect_mapping_bytes_mapped);
+EXPORT_SYMBOL(vdev_indirect_mapping_close);
+EXPORT_SYMBOL(vdev_indirect_mapping_entry_for_offset);
+EXPORT_SYMBOL(vdev_indirect_mapping_entry_for_offset_or_next);
+EXPORT_SYMBOL(vdev_indirect_mapping_free);
+EXPORT_SYMBOL(vdev_indirect_mapping_free_obsolete_counts);
+EXPORT_SYMBOL(vdev_indirect_mapping_increment_obsolete_count);
+EXPORT_SYMBOL(vdev_indirect_mapping_load_obsolete_counts);
+EXPORT_SYMBOL(vdev_indirect_mapping_load_obsolete_spacemap);
+EXPORT_SYMBOL(vdev_indirect_mapping_max_offset);
+EXPORT_SYMBOL(vdev_indirect_mapping_num_entries);
+EXPORT_SYMBOL(vdev_indirect_mapping_object);
+EXPORT_SYMBOL(vdev_indirect_mapping_open);
+EXPORT_SYMBOL(vdev_indirect_mapping_size);
+#endif
diff --git a/sys/contrib/openzfs/module/zfs/vdev_initialize.c b/sys/contrib/openzfs/module/zfs/vdev_initialize.c
new file mode 100644
index 000000000000..083ad2861b5b
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/vdev_initialize.c
@@ -0,0 +1,766 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2016, 2019 by Delphix. All rights reserved.
+ */
+
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/txg.h>
+#include <sys/vdev_impl.h>
+#include <sys/metaslab_impl.h>
+#include <sys/dsl_synctask.h>
+#include <sys/zap.h>
+#include <sys/dmu_tx.h>
+#include <sys/vdev_initialize.h>
+
+/*
+ * Value that is written to disk during initialization.
+ */
+#ifdef _ILP32
+unsigned long zfs_initialize_value = 0xdeadbeefUL;
+#else
+unsigned long zfs_initialize_value = 0xdeadbeefdeadbeeeULL;
+#endif
+
+/* maximum number of I/Os outstanding per leaf vdev */
+int zfs_initialize_limit = 1;
+
+/* size of initializing writes; default 1MiB, see zfs_remove_max_segment */
+unsigned long zfs_initialize_chunk_size = 1024 * 1024;
+
+static boolean_t
+vdev_initialize_should_stop(vdev_t *vd)
+{
+ return (vd->vdev_initialize_exit_wanted || !vdev_writeable(vd) ||
+ vd->vdev_detached || vd->vdev_top->vdev_removing);
+}
+
+static void
+vdev_initialize_zap_update_sync(void *arg, dmu_tx_t *tx)
+{
+ /*
+ * We pass in the guid instead of the vdev_t since the vdev may
+ * have been freed prior to the sync task being processed. This
+ * happens when a vdev is detached as we call spa_config_vdev_exit(),
+ * stop the initializing thread, schedule the sync task, and free
+ * the vdev. Later when the scheduled sync task is invoked, it would
+ * find that the vdev has been freed.
+ */
+ uint64_t guid = *(uint64_t *)arg;
+ uint64_t txg = dmu_tx_get_txg(tx);
+ kmem_free(arg, sizeof (uint64_t));
+
+ vdev_t *vd = spa_lookup_by_guid(tx->tx_pool->dp_spa, guid, B_FALSE);
+ if (vd == NULL || vd->vdev_top->vdev_removing || !vdev_is_concrete(vd))
+ return;
+
+ uint64_t last_offset = vd->vdev_initialize_offset[txg & TXG_MASK];
+ vd->vdev_initialize_offset[txg & TXG_MASK] = 0;
+
+ VERIFY(vd->vdev_leaf_zap != 0);
+
+ objset_t *mos = vd->vdev_spa->spa_meta_objset;
+
+ if (last_offset > 0) {
+ vd->vdev_initialize_last_offset = last_offset;
+ VERIFY0(zap_update(mos, vd->vdev_leaf_zap,
+ VDEV_LEAF_ZAP_INITIALIZE_LAST_OFFSET,
+ sizeof (last_offset), 1, &last_offset, tx));
+ }
+ if (vd->vdev_initialize_action_time > 0) {
+ uint64_t val = (uint64_t)vd->vdev_initialize_action_time;
+ VERIFY0(zap_update(mos, vd->vdev_leaf_zap,
+ VDEV_LEAF_ZAP_INITIALIZE_ACTION_TIME, sizeof (val),
+ 1, &val, tx));
+ }
+
+ uint64_t initialize_state = vd->vdev_initialize_state;
+ VERIFY0(zap_update(mos, vd->vdev_leaf_zap,
+ VDEV_LEAF_ZAP_INITIALIZE_STATE, sizeof (initialize_state), 1,
+ &initialize_state, tx));
+}
+
+static void
+vdev_initialize_change_state(vdev_t *vd, vdev_initializing_state_t new_state)
+{
+ ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock));
+ spa_t *spa = vd->vdev_spa;
+
+ if (new_state == vd->vdev_initialize_state)
+ return;
+
+ /*
+	 * Copy the vd's guid; this will be freed by the sync task.
+ */
+ uint64_t *guid = kmem_zalloc(sizeof (uint64_t), KM_SLEEP);
+ *guid = vd->vdev_guid;
+
+ /*
+	 * If we're suspending, then preserve the original start time.
+ */
+ if (vd->vdev_initialize_state != VDEV_INITIALIZE_SUSPENDED) {
+ vd->vdev_initialize_action_time = gethrestime_sec();
+ }
+
+ vdev_initializing_state_t old_state = vd->vdev_initialize_state;
+ vd->vdev_initialize_state = new_state;
+
+ dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
+ VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
+ dsl_sync_task_nowait(spa_get_dsl(spa), vdev_initialize_zap_update_sync,
+ guid, tx);
+
+ switch (new_state) {
+ case VDEV_INITIALIZE_ACTIVE:
+ spa_history_log_internal(spa, "initialize", tx,
+ "vdev=%s activated", vd->vdev_path);
+ break;
+ case VDEV_INITIALIZE_SUSPENDED:
+ spa_history_log_internal(spa, "initialize", tx,
+ "vdev=%s suspended", vd->vdev_path);
+ break;
+ case VDEV_INITIALIZE_CANCELED:
+ if (old_state == VDEV_INITIALIZE_ACTIVE ||
+ old_state == VDEV_INITIALIZE_SUSPENDED)
+ spa_history_log_internal(spa, "initialize", tx,
+ "vdev=%s canceled", vd->vdev_path);
+ break;
+ case VDEV_INITIALIZE_COMPLETE:
+ spa_history_log_internal(spa, "initialize", tx,
+ "vdev=%s complete", vd->vdev_path);
+ break;
+ default:
+ panic("invalid state %llu", (unsigned long long)new_state);
+ }
+
+ dmu_tx_commit(tx);
+
+ if (new_state != VDEV_INITIALIZE_ACTIVE)
+ spa_notify_waiters(spa);
+}
+
+static void
+vdev_initialize_cb(zio_t *zio)
+{
+ vdev_t *vd = zio->io_vd;
+ mutex_enter(&vd->vdev_initialize_io_lock);
+ if (zio->io_error == ENXIO && !vdev_writeable(vd)) {
+ /*
+ * The I/O failed because the vdev was unavailable; roll the
+ * last offset back. (This works because spa_sync waits on
+ * spa_txg_zio before it runs sync tasks.)
+ */
+ uint64_t *off =
+ &vd->vdev_initialize_offset[zio->io_txg & TXG_MASK];
+ *off = MIN(*off, zio->io_offset);
+ } else {
+ /*
+ * Since initializing is best-effort, we ignore I/O errors and
+ * rely on vdev_probe to determine if the errors are more
+ * critical.
+ */
+ if (zio->io_error != 0)
+ vd->vdev_stat.vs_initialize_errors++;
+
+ vd->vdev_initialize_bytes_done += zio->io_orig_size;
+ }
+ ASSERT3U(vd->vdev_initialize_inflight, >, 0);
+ vd->vdev_initialize_inflight--;
+ cv_broadcast(&vd->vdev_initialize_io_cv);
+ mutex_exit(&vd->vdev_initialize_io_lock);
+
+ spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd);
+}
+
+/* Takes care of physical writing and limiting # of concurrent ZIOs. */
+static int
+vdev_initialize_write(vdev_t *vd, uint64_t start, uint64_t size, abd_t *data)
+{
+ spa_t *spa = vd->vdev_spa;
+
+ /* Limit inflight initializing I/Os */
+ mutex_enter(&vd->vdev_initialize_io_lock);
+ while (vd->vdev_initialize_inflight >= zfs_initialize_limit) {
+ cv_wait(&vd->vdev_initialize_io_cv,
+ &vd->vdev_initialize_io_lock);
+ }
+ vd->vdev_initialize_inflight++;
+ mutex_exit(&vd->vdev_initialize_io_lock);
+
+ dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
+ VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
+ uint64_t txg = dmu_tx_get_txg(tx);
+
+ spa_config_enter(spa, SCL_STATE_ALL, vd, RW_READER);
+ mutex_enter(&vd->vdev_initialize_lock);
+
+ if (vd->vdev_initialize_offset[txg & TXG_MASK] == 0) {
+ uint64_t *guid = kmem_zalloc(sizeof (uint64_t), KM_SLEEP);
+ *guid = vd->vdev_guid;
+
+ /* This is the first write of this txg. */
+ dsl_sync_task_nowait(spa_get_dsl(spa),
+ vdev_initialize_zap_update_sync, guid, tx);
+ }
+
+ /*
+ * We know the vdev struct will still be around since all
+ * consumers of vdev_free must stop the initialization first.
+ */
+ if (vdev_initialize_should_stop(vd)) {
+ mutex_enter(&vd->vdev_initialize_io_lock);
+ ASSERT3U(vd->vdev_initialize_inflight, >, 0);
+ vd->vdev_initialize_inflight--;
+ mutex_exit(&vd->vdev_initialize_io_lock);
+ spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd);
+ mutex_exit(&vd->vdev_initialize_lock);
+ dmu_tx_commit(tx);
+ return (SET_ERROR(EINTR));
+ }
+ mutex_exit(&vd->vdev_initialize_lock);
+
+ vd->vdev_initialize_offset[txg & TXG_MASK] = start + size;
+ zio_nowait(zio_write_phys(spa->spa_txg_zio[txg & TXG_MASK], vd, start,
+ size, data, ZIO_CHECKSUM_OFF, vdev_initialize_cb, NULL,
+ ZIO_PRIORITY_INITIALIZING, ZIO_FLAG_CANFAIL, B_FALSE));
+ /* vdev_initialize_cb releases SCL_STATE_ALL */
+
+ dmu_tx_commit(tx);
+
+ return (0);
+}
+
+/*
+ * Callback to fill each ABD chunk with zfs_initialize_value. len must be
+ * divisible by sizeof (uint64_t), and buf must be 8-byte aligned. The ABD
+ * allocation will guarantee these for us.
+ */
+/* ARGSUSED */
+static int
+vdev_initialize_block_fill(void *buf, size_t len, void *unused)
+{
+ ASSERT0(len % sizeof (uint64_t));
+#ifdef _ILP32
+ for (uint64_t i = 0; i < len; i += sizeof (uint32_t)) {
+ *(uint32_t *)((char *)(buf) + i) = zfs_initialize_value;
+ }
+#else
+ for (uint64_t i = 0; i < len; i += sizeof (uint64_t)) {
+ *(uint64_t *)((char *)(buf) + i) = zfs_initialize_value;
+ }
+#endif
+ return (0);
+}
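
The following is a standalone sketch of the same fill pattern outside the ABD machinery, using a plain buffer and a hypothetical helper name; it also shows why len must be a multiple of sizeof (uint64_t) and the buffer 8-byte aligned:

    #include <stdint.h>
    #include <stddef.h>
    #include <assert.h>

    /* Fill a buffer with a repeating 64-bit pattern, one word at a time. */
    static void
    fill_pattern(void *buf, size_t len, uint64_t pattern)
    {
        uint64_t *words = buf;

        assert(len % sizeof (uint64_t) == 0);
        for (size_t i = 0; i < len / sizeof (uint64_t); i++)
            words[i] = pattern;
    }

    int
    main(void)
    {
        uint64_t chunk[16];     /* naturally 8-byte aligned */

        fill_pattern(chunk, sizeof (chunk), 0xdeadbeefdeadbeeeULL);
        assert(chunk[0] == 0xdeadbeefdeadbeeeULL);
        assert(chunk[15] == 0xdeadbeefdeadbeeeULL);
        return (0);
    }
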
+
+static abd_t *
+vdev_initialize_block_alloc(void)
+{
+ /* Allocate ABD for filler data */
+ abd_t *data = abd_alloc_for_io(zfs_initialize_chunk_size, B_FALSE);
+
+ ASSERT0(zfs_initialize_chunk_size % sizeof (uint64_t));
+ (void) abd_iterate_func(data, 0, zfs_initialize_chunk_size,
+ vdev_initialize_block_fill, NULL);
+
+ return (data);
+}
+
+static void
+vdev_initialize_block_free(abd_t *data)
+{
+ abd_free(data);
+}
+
+static int
+vdev_initialize_ranges(vdev_t *vd, abd_t *data)
+{
+ range_tree_t *rt = vd->vdev_initialize_tree;
+ zfs_btree_t *bt = &rt->rt_root;
+ zfs_btree_index_t where;
+
+ for (range_seg_t *rs = zfs_btree_first(bt, &where); rs != NULL;
+ rs = zfs_btree_next(bt, &where, &where)) {
+ uint64_t size = rs_get_end(rs, rt) - rs_get_start(rs, rt);
+
+ /* Split range into legally-sized physical chunks */
+ uint64_t writes_required =
+ ((size - 1) / zfs_initialize_chunk_size) + 1;
+
+ for (uint64_t w = 0; w < writes_required; w++) {
+ int error;
+
+ error = vdev_initialize_write(vd,
+ VDEV_LABEL_START_SIZE + rs_get_start(rs, rt) +
+ (w * zfs_initialize_chunk_size),
+ MIN(size - (w * zfs_initialize_chunk_size),
+ zfs_initialize_chunk_size), data);
+ if (error != 0)
+ return (error);
+ }
+ }
+ return (0);
+}
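
Worked example of the chunking above (illustrative numbers): for a 2.5 MiB free segment (2,621,440 bytes) and the default zfs_initialize_chunk_size of 1 MiB, writes_required = ((2621440 - 1) / 1048576) + 1 = 3, and the MIN() clamps the final write to the remaining 0.5 MiB, so the segment is written as 1 MiB + 1 MiB + 0.5 MiB.
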
+
+static void
+vdev_initialize_xlate_last_rs_end(void *arg, range_seg64_t *physical_rs)
+{
+ uint64_t *last_rs_end = (uint64_t *)arg;
+
+ if (physical_rs->rs_end > *last_rs_end)
+ *last_rs_end = physical_rs->rs_end;
+}
+
+static void
+vdev_initialize_xlate_progress(void *arg, range_seg64_t *physical_rs)
+{
+ vdev_t *vd = (vdev_t *)arg;
+
+ uint64_t size = physical_rs->rs_end - physical_rs->rs_start;
+ vd->vdev_initialize_bytes_est += size;
+
+ if (vd->vdev_initialize_last_offset > physical_rs->rs_end) {
+ vd->vdev_initialize_bytes_done += size;
+ } else if (vd->vdev_initialize_last_offset > physical_rs->rs_start &&
+ vd->vdev_initialize_last_offset < physical_rs->rs_end) {
+ vd->vdev_initialize_bytes_done +=
+ vd->vdev_initialize_last_offset - physical_rs->rs_start;
+ }
+}
+
+static void
+vdev_initialize_calculate_progress(vdev_t *vd)
+{
+ ASSERT(spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_READER) ||
+ spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_WRITER));
+ ASSERT(vd->vdev_leaf_zap != 0);
+
+ vd->vdev_initialize_bytes_est = 0;
+ vd->vdev_initialize_bytes_done = 0;
+
+ for (uint64_t i = 0; i < vd->vdev_top->vdev_ms_count; i++) {
+ metaslab_t *msp = vd->vdev_top->vdev_ms[i];
+ mutex_enter(&msp->ms_lock);
+
+ uint64_t ms_free = (msp->ms_size -
+ metaslab_allocated_space(msp)) /
+ vdev_get_ndisks(vd->vdev_top);
+
+ /*
+ * Convert the metaslab range to a physical range
+ * on our vdev. We use this to determine if we are
+ * in the middle of this metaslab range.
+ */
+ range_seg64_t logical_rs, physical_rs, remain_rs;
+ logical_rs.rs_start = msp->ms_start;
+ logical_rs.rs_end = msp->ms_start + msp->ms_size;
+
+ /* Metaslab space after this offset has not been initialized */
+ vdev_xlate(vd, &logical_rs, &physical_rs, &remain_rs);
+ if (vd->vdev_initialize_last_offset <= physical_rs.rs_start) {
+ vd->vdev_initialize_bytes_est += ms_free;
+ mutex_exit(&msp->ms_lock);
+ continue;
+ }
+
+ /* Metaslab space before this offset has been initialized */
+ uint64_t last_rs_end = physical_rs.rs_end;
+ if (!vdev_xlate_is_empty(&remain_rs)) {
+ vdev_xlate_walk(vd, &remain_rs,
+ vdev_initialize_xlate_last_rs_end, &last_rs_end);
+ }
+
+ if (vd->vdev_initialize_last_offset > last_rs_end) {
+ vd->vdev_initialize_bytes_done += ms_free;
+ vd->vdev_initialize_bytes_est += ms_free;
+ mutex_exit(&msp->ms_lock);
+ continue;
+ }
+
+ /*
+ * If we get here, we're in the middle of initializing this
+ * metaslab. Load it and walk the free tree for more accurate
+ * progress estimation.
+ */
+ VERIFY0(metaslab_load(msp));
+
+ zfs_btree_index_t where;
+ range_tree_t *rt = msp->ms_allocatable;
+ for (range_seg_t *rs =
+ zfs_btree_first(&rt->rt_root, &where); rs;
+ rs = zfs_btree_next(&rt->rt_root, &where,
+ &where)) {
+ logical_rs.rs_start = rs_get_start(rs, rt);
+ logical_rs.rs_end = rs_get_end(rs, rt);
+
+ vdev_xlate_walk(vd, &logical_rs,
+ vdev_initialize_xlate_progress, vd);
+ }
+ mutex_exit(&msp->ms_lock);
+ }
+}
+
+static int
+vdev_initialize_load(vdev_t *vd)
+{
+ int err = 0;
+ ASSERT(spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_READER) ||
+ spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_WRITER));
+ ASSERT(vd->vdev_leaf_zap != 0);
+
+ if (vd->vdev_initialize_state == VDEV_INITIALIZE_ACTIVE ||
+ vd->vdev_initialize_state == VDEV_INITIALIZE_SUSPENDED) {
+ err = zap_lookup(vd->vdev_spa->spa_meta_objset,
+ vd->vdev_leaf_zap, VDEV_LEAF_ZAP_INITIALIZE_LAST_OFFSET,
+ sizeof (vd->vdev_initialize_last_offset), 1,
+ &vd->vdev_initialize_last_offset);
+ if (err == ENOENT) {
+ vd->vdev_initialize_last_offset = 0;
+ err = 0;
+ }
+ }
+
+ vdev_initialize_calculate_progress(vd);
+ return (err);
+}
+
+static void
+vdev_initialize_xlate_range_add(void *arg, range_seg64_t *physical_rs)
+{
+ vdev_t *vd = arg;
+
+ /* Only add segments that we have not visited yet */
+ if (physical_rs->rs_end <= vd->vdev_initialize_last_offset)
+ return;
+
+ /* Pick up where we left off mid-range. */
+ if (vd->vdev_initialize_last_offset > physical_rs->rs_start) {
+ zfs_dbgmsg("range write: vd %s changed (%llu, %llu) to "
+ "(%llu, %llu)", vd->vdev_path,
+ (u_longlong_t)physical_rs->rs_start,
+ (u_longlong_t)physical_rs->rs_end,
+ (u_longlong_t)vd->vdev_initialize_last_offset,
+ (u_longlong_t)physical_rs->rs_end);
+ ASSERT3U(physical_rs->rs_end, >,
+ vd->vdev_initialize_last_offset);
+ physical_rs->rs_start = vd->vdev_initialize_last_offset;
+ }
+
+ ASSERT3U(physical_rs->rs_end, >, physical_rs->rs_start);
+
+ range_tree_add(vd->vdev_initialize_tree, physical_rs->rs_start,
+ physical_rs->rs_end - physical_rs->rs_start);
+}
+
+/*
+ * Convert the logical range into a physical range and add it to the
+ * vdev's initialize range tree.
+ */
+static void
+vdev_initialize_range_add(void *arg, uint64_t start, uint64_t size)
+{
+ vdev_t *vd = arg;
+ range_seg64_t logical_rs;
+ logical_rs.rs_start = start;
+ logical_rs.rs_end = start + size;
+
+ ASSERT(vd->vdev_ops->vdev_op_leaf);
+ vdev_xlate_walk(vd, &logical_rs, vdev_initialize_xlate_range_add, arg);
+}
+
+static void
+vdev_initialize_thread(void *arg)
+{
+ vdev_t *vd = arg;
+ spa_t *spa = vd->vdev_spa;
+ int error = 0;
+ uint64_t ms_count = 0;
+
+ ASSERT(vdev_is_concrete(vd));
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+
+ vd->vdev_initialize_last_offset = 0;
+ VERIFY0(vdev_initialize_load(vd));
+
+ abd_t *deadbeef = vdev_initialize_block_alloc();
+
+ vd->vdev_initialize_tree = range_tree_create(NULL, RANGE_SEG64, NULL,
+ 0, 0);
+
+ for (uint64_t i = 0; !vd->vdev_detached &&
+ i < vd->vdev_top->vdev_ms_count; i++) {
+ metaslab_t *msp = vd->vdev_top->vdev_ms[i];
+ boolean_t unload_when_done = B_FALSE;
+
+ /*
+ * If we've expanded the top-level vdev or it's our
+ * first pass, calculate our progress.
+ */
+ if (vd->vdev_top->vdev_ms_count != ms_count) {
+ vdev_initialize_calculate_progress(vd);
+ ms_count = vd->vdev_top->vdev_ms_count;
+ }
+
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+ metaslab_disable(msp);
+ mutex_enter(&msp->ms_lock);
+ if (!msp->ms_loaded && !msp->ms_loading)
+ unload_when_done = B_TRUE;
+ VERIFY0(metaslab_load(msp));
+
+ range_tree_walk(msp->ms_allocatable, vdev_initialize_range_add,
+ vd);
+ mutex_exit(&msp->ms_lock);
+
+ error = vdev_initialize_ranges(vd, deadbeef);
+ metaslab_enable(msp, B_TRUE, unload_when_done);
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+
+ range_tree_vacate(vd->vdev_initialize_tree, NULL, NULL);
+ if (error != 0)
+ break;
+ }
+
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+ mutex_enter(&vd->vdev_initialize_io_lock);
+ while (vd->vdev_initialize_inflight > 0) {
+ cv_wait(&vd->vdev_initialize_io_cv,
+ &vd->vdev_initialize_io_lock);
+ }
+ mutex_exit(&vd->vdev_initialize_io_lock);
+
+ range_tree_destroy(vd->vdev_initialize_tree);
+ vdev_initialize_block_free(deadbeef);
+ vd->vdev_initialize_tree = NULL;
+
+ mutex_enter(&vd->vdev_initialize_lock);
+ if (!vd->vdev_initialize_exit_wanted && vdev_writeable(vd)) {
+ vdev_initialize_change_state(vd, VDEV_INITIALIZE_COMPLETE);
+ }
+ ASSERT(vd->vdev_initialize_thread != NULL ||
+ vd->vdev_initialize_inflight == 0);
+
+ /*
+ * Drop the vdev_initialize_lock while we sync out the
+ * txg since it's possible that a device might be trying to
+ * come online and must check to see if it needs to restart an
+ * initialization. That thread will be holding the spa_config_lock
+ * which would prevent the txg_wait_synced from completing.
+ */
+ mutex_exit(&vd->vdev_initialize_lock);
+ txg_wait_synced(spa_get_dsl(spa), 0);
+ mutex_enter(&vd->vdev_initialize_lock);
+
+ vd->vdev_initialize_thread = NULL;
+ cv_broadcast(&vd->vdev_initialize_cv);
+ mutex_exit(&vd->vdev_initialize_lock);
+
+ thread_exit();
+}
+
+/*
+ * Initiates initialization of a device. The caller must hold
+ * vdev_initialize_lock. The device must be a leaf and must not already be
+ * initializing.
+ */
+void
+vdev_initialize(vdev_t *vd)
+{
+ ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock));
+ ASSERT(vd->vdev_ops->vdev_op_leaf);
+ ASSERT(vdev_is_concrete(vd));
+ ASSERT3P(vd->vdev_initialize_thread, ==, NULL);
+ ASSERT(!vd->vdev_detached);
+ ASSERT(!vd->vdev_initialize_exit_wanted);
+ ASSERT(!vd->vdev_top->vdev_removing);
+
+ vdev_initialize_change_state(vd, VDEV_INITIALIZE_ACTIVE);
+ vd->vdev_initialize_thread = thread_create(NULL, 0,
+ vdev_initialize_thread, vd, 0, &p0, TS_RUN, maxclsyspri);
+}
+
+/*
+ * Wait for the initialize thread to be terminated (cancelled or stopped).
+ */
+static void
+vdev_initialize_stop_wait_impl(vdev_t *vd)
+{
+ ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock));
+
+ while (vd->vdev_initialize_thread != NULL)
+ cv_wait(&vd->vdev_initialize_cv, &vd->vdev_initialize_lock);
+
+ ASSERT3P(vd->vdev_initialize_thread, ==, NULL);
+ vd->vdev_initialize_exit_wanted = B_FALSE;
+}
+
+/*
+ * Wait for the initialize threads of the vdevs on the provided list to
+ * cleanly exit.
+ */
+void
+vdev_initialize_stop_wait(spa_t *spa, list_t *vd_list)
+{
+ vdev_t *vd;
+
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+ while ((vd = list_remove_head(vd_list)) != NULL) {
+ mutex_enter(&vd->vdev_initialize_lock);
+ vdev_initialize_stop_wait_impl(vd);
+ mutex_exit(&vd->vdev_initialize_lock);
+ }
+}
+
+/*
+ * Stop initializing a device, with the resultant initializing state being
+ * tgt_state. For blocking behavior pass NULL for vd_list. Otherwise, when
+ * a list_t is provided the stopping vdev is inserted into the list. Callers
+ * are then required to call vdev_initialize_stop_wait() to block for all the
+ * initialization threads to exit. The caller must hold vdev_initialize_lock
+ * and must not be writing to the spa config, as the initializing thread may
+ * try to enter the config as a reader before exiting.
+ */
+void
+vdev_initialize_stop(vdev_t *vd, vdev_initializing_state_t tgt_state,
+ list_t *vd_list)
+{
+ ASSERT(!spa_config_held(vd->vdev_spa, SCL_CONFIG|SCL_STATE, RW_WRITER));
+ ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock));
+ ASSERT(vd->vdev_ops->vdev_op_leaf);
+ ASSERT(vdev_is_concrete(vd));
+
+ /*
+ * Allow cancel requests to proceed even if the initialize thread
+ * has stopped.
+ */
+ if (vd->vdev_initialize_thread == NULL &&
+ tgt_state != VDEV_INITIALIZE_CANCELED) {
+ return;
+ }
+
+ vdev_initialize_change_state(vd, tgt_state);
+ vd->vdev_initialize_exit_wanted = B_TRUE;
+
+ if (vd_list == NULL) {
+ vdev_initialize_stop_wait_impl(vd);
+ } else {
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+ list_insert_tail(vd_list, vd);
+ }
+}
+
+static void
+vdev_initialize_stop_all_impl(vdev_t *vd, vdev_initializing_state_t tgt_state,
+ list_t *vd_list)
+{
+ if (vd->vdev_ops->vdev_op_leaf && vdev_is_concrete(vd)) {
+ mutex_enter(&vd->vdev_initialize_lock);
+ vdev_initialize_stop(vd, tgt_state, vd_list);
+ mutex_exit(&vd->vdev_initialize_lock);
+ return;
+ }
+
+ for (uint64_t i = 0; i < vd->vdev_children; i++) {
+ vdev_initialize_stop_all_impl(vd->vdev_child[i], tgt_state,
+ vd_list);
+ }
+}
+
+/*
+ * Convenience function to stop initialization of a vdev tree and set all
+ * initialize thread pointers to NULL.
+ */
+void
+vdev_initialize_stop_all(vdev_t *vd, vdev_initializing_state_t tgt_state)
+{
+ spa_t *spa = vd->vdev_spa;
+ list_t vd_list;
+
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+ list_create(&vd_list, sizeof (vdev_t),
+ offsetof(vdev_t, vdev_initialize_node));
+
+ vdev_initialize_stop_all_impl(vd, tgt_state, &vd_list);
+ vdev_initialize_stop_wait(spa, &vd_list);
+
+ if (vd->vdev_spa->spa_sync_on) {
+ /* Make sure that our state has been synced to disk */
+ txg_wait_synced(spa_get_dsl(vd->vdev_spa), 0);
+ }
+
+ list_destroy(&vd_list);
+}
+
+void
+vdev_initialize_restart(vdev_t *vd)
+{
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+ ASSERT(!spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER));
+
+ if (vd->vdev_leaf_zap != 0) {
+ mutex_enter(&vd->vdev_initialize_lock);
+ uint64_t initialize_state = VDEV_INITIALIZE_NONE;
+ int err = zap_lookup(vd->vdev_spa->spa_meta_objset,
+ vd->vdev_leaf_zap, VDEV_LEAF_ZAP_INITIALIZE_STATE,
+ sizeof (initialize_state), 1, &initialize_state);
+ ASSERT(err == 0 || err == ENOENT);
+ vd->vdev_initialize_state = initialize_state;
+
+ uint64_t timestamp = 0;
+ err = zap_lookup(vd->vdev_spa->spa_meta_objset,
+ vd->vdev_leaf_zap, VDEV_LEAF_ZAP_INITIALIZE_ACTION_TIME,
+ sizeof (timestamp), 1, &timestamp);
+ ASSERT(err == 0 || err == ENOENT);
+ vd->vdev_initialize_action_time = timestamp;
+
+ if (vd->vdev_initialize_state == VDEV_INITIALIZE_SUSPENDED ||
+ vd->vdev_offline) {
+ /* load progress for reporting, but don't resume */
+ VERIFY0(vdev_initialize_load(vd));
+ } else if (vd->vdev_initialize_state ==
+ VDEV_INITIALIZE_ACTIVE && vdev_writeable(vd) &&
+ !vd->vdev_top->vdev_removing &&
+ vd->vdev_initialize_thread == NULL) {
+ vdev_initialize(vd);
+ }
+
+ mutex_exit(&vd->vdev_initialize_lock);
+ }
+
+ for (uint64_t i = 0; i < vd->vdev_children; i++) {
+ vdev_initialize_restart(vd->vdev_child[i]);
+ }
+}
+
+EXPORT_SYMBOL(vdev_initialize);
+EXPORT_SYMBOL(vdev_initialize_stop);
+EXPORT_SYMBOL(vdev_initialize_stop_all);
+EXPORT_SYMBOL(vdev_initialize_stop_wait);
+EXPORT_SYMBOL(vdev_initialize_restart);
+
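+/*
+ * The tunables below are exposed as ZFS module parameters (on Linux,
+ * typically under /sys/module/zfs/parameters/); being ZMOD_RW, they may
+ * be changed at runtime.
+ */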
+/* BEGIN CSTYLED */
+ZFS_MODULE_PARAM(zfs, zfs_, initialize_value, ULONG, ZMOD_RW,
+ "Value written during zpool initialize");
+
+ZFS_MODULE_PARAM(zfs, zfs_, initialize_chunk_size, ULONG, ZMOD_RW,
+ "Size in bytes of writes by zpool initialize");
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/zfs/vdev_label.c b/sys/contrib/openzfs/module/zfs/vdev_label.c
new file mode 100644
index 000000000000..04202a9f8960
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/vdev_label.c
@@ -0,0 +1,1992 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
+ * Copyright (c) 2017, Intel Corporation.
+ */
+
+/*
+ * Virtual Device Labels
+ * ---------------------
+ *
+ * The vdev label serves several distinct purposes:
+ *
+ * 1. Uniquely identify this device as part of a ZFS pool and confirm its
+ * identity within the pool.
+ *
+ * 2. Verify that all the devices given in a configuration are present
+ * within the pool.
+ *
+ * 3. Determine the uberblock for the pool.
+ *
+ * 4. In case of an import operation, determine the configuration of the
+ * toplevel vdev of which it is a part.
+ *
+ * 5. If an import operation cannot find all the devices in the pool,
+ * provide enough information to the administrator to determine which
+ * devices are missing.
+ *
+ * It is important to note that while the kernel is responsible for writing the
+ * label, it only consumes the information in the first three cases. The
+ * latter information is only consumed in userland when determining the
+ * configuration to import a pool.
+ *
+ *
+ * Label Organization
+ * ------------------
+ *
+ * Before describing the contents of the label, it's important to understand how
+ * the labels are written and updated with respect to the uberblock.
+ *
+ * When the pool configuration is altered, either because it was newly created
+ * or a device was added, we want to update all the labels such that we can deal
+ * with fatal failure at any point. To this end, each disk has two labels which
+ * are updated before and after the uberblock is synced. Assuming we have
+ * labels and an uberblock with the following transaction groups:
+ *
+ * L1 UB L2
+ * +------+ +------+ +------+
+ * | | | | | |
+ * | t10 | | t10 | | t10 |
+ * | | | | | |
+ * +------+ +------+ +------+
+ *
+ * In this stable state, the labels and the uberblock were all updated within
+ * the same transaction group (10). Each label is mirrored and checksummed, so
+ * that we can detect when we fail partway through writing the label.
+ *
+ * In order to identify which labels are valid, the labels are written in the
+ * following manner:
+ *
+ * 1. For each vdev, update 'L1' to the new label
+ * 2. Update the uberblock
+ * 3. For each vdev, update 'L2' to the new label
+ *
+ * Given arbitrary failure, we can determine the correct label to use based on
+ * the transaction group. If we fail after updating L1 but before updating the
+ * UB, we will notice that L1's transaction group is greater than the uberblock,
+ * so L2 must be valid. If we fail after writing the uberblock but before
+ * writing L2, we will notice that L2's transaction group is less than L1, and
+ * therefore L1 is valid.
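+ *
+ * For example, while moving from txg 10 to txg 11, a crash can leave:
+ *
+ *   L1 = t11, UB = t10, L2 = t10   L1 is newer than the UB, so L2 is used
+ *   L1 = t11, UB = t11, L2 = t10   L2 is older than L1, so L1 is used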
+ *
+ * Another added complexity is that not every label is updated when the config
+ * is synced. If we add a single device, we do not want to have to re-write
+ * every label for every device in the pool. This means that both L1 and L2 may
+ * be older than the pool uberblock, because the necessary information is stored
+ * on another vdev.
+ *
+ *
+ * On-disk Format
+ * --------------
+ *
+ * The vdev label consists of two distinct parts, and is wrapped within the
+ * vdev_label_t structure. The label includes 8k of padding to permit legacy
+ * VTOC disk labels; this padding is otherwise ignored.
+ *
+ * The first half of the label is a packed nvlist which contains pool wide
+ * properties, per-vdev properties, and configuration information. It is
+ * described in more detail below.
+ *
+ * The latter half of the label consists of a redundant array of uberblocks.
+ * These uberblocks are updated whenever a transaction group is committed,
+ * or when the configuration is updated. When a pool is loaded, we scan each
+ * vdev for the 'best' uberblock.
+ *
+ *
+ * Configuration Information
+ * -------------------------
+ *
+ * The nvlist describing the pool and vdev contains the following elements:
+ *
+ * version ZFS on-disk version
+ * name Pool name
+ * state Pool state
+ * txg Transaction group in which this label was written
+ * pool_guid Unique identifier for this pool
+ * vdev_tree An nvlist describing vdev tree.
+ * features_for_read
+ * An nvlist of the features necessary for reading the MOS.
+ *
+ * Each leaf device label also contains the following:
+ *
+ * top_guid Unique ID for top-level vdev in which this is contained
+ * guid Unique ID for the leaf vdev
+ *
+ * The 'vs' configuration follows the format described in 'spa_config.c'.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/dmu.h>
+#include <sys/zap.h>
+#include <sys/vdev.h>
+#include <sys/vdev_impl.h>
+#include <sys/vdev_draid.h>
+#include <sys/uberblock_impl.h>
+#include <sys/metaslab.h>
+#include <sys/metaslab_impl.h>
+#include <sys/zio.h>
+#include <sys/dsl_scan.h>
+#include <sys/abd.h>
+#include <sys/fs/zfs.h>
+#include <sys/byteorder.h>
+#include <sys/zfs_bootenv.h>
+
+/*
+ * Basic routines to read and write from a vdev label.
+ * Used throughout the rest of this file.
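+ *
+ * Each device carries VDEV_LABELS (four) copies of the label: two at the
+ * front of the device and two at the end. With the usual 256K label size,
+ * labels 0 and 1 live at offsets 0 and 256K, while labels 2 and 3 live at
+ * psize - 512K and psize - 256K. vdev_label_offset() maps a label index
+ * and an offset within the label to a physical offset on the device.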
+ */
+uint64_t
+vdev_label_offset(uint64_t psize, int l, uint64_t offset)
+{
+ ASSERT(offset < sizeof (vdev_label_t));
+ ASSERT(P2PHASE_TYPED(psize, sizeof (vdev_label_t), uint64_t) == 0);
+
+ return (offset + l * sizeof (vdev_label_t) + (l < VDEV_LABELS / 2 ?
+ 0 : psize - VDEV_LABELS * sizeof (vdev_label_t)));
+}
+
+/*
+ * Returns the vdev label number associated with the given offset, or -1 if
+ * the offset does not fall within a label.
+ */
+int
+vdev_label_number(uint64_t psize, uint64_t offset)
+{
+ int l;
+
+ if (offset >= psize - VDEV_LABEL_END_SIZE) {
+ offset -= psize - VDEV_LABEL_END_SIZE;
+ offset += (VDEV_LABELS / 2) * sizeof (vdev_label_t);
+ }
+ l = offset / sizeof (vdev_label_t);
+ return (l < VDEV_LABELS ? l : -1);
+}
+
+static void
+vdev_label_read(zio_t *zio, vdev_t *vd, int l, abd_t *buf, uint64_t offset,
+ uint64_t size, zio_done_func_t *done, void *private, int flags)
+{
+ ASSERT(
+ spa_config_held(zio->io_spa, SCL_STATE, RW_READER) == SCL_STATE ||
+ spa_config_held(zio->io_spa, SCL_STATE, RW_WRITER) == SCL_STATE);
+ ASSERT(flags & ZIO_FLAG_CONFIG_WRITER);
+
+ zio_nowait(zio_read_phys(zio, vd,
+ vdev_label_offset(vd->vdev_psize, l, offset),
+ size, buf, ZIO_CHECKSUM_LABEL, done, private,
+ ZIO_PRIORITY_SYNC_READ, flags, B_TRUE));
+}
+
+void
+vdev_label_write(zio_t *zio, vdev_t *vd, int l, abd_t *buf, uint64_t offset,
+ uint64_t size, zio_done_func_t *done, void *private, int flags)
+{
+ ASSERT(
+ spa_config_held(zio->io_spa, SCL_STATE, RW_READER) == SCL_STATE ||
+ spa_config_held(zio->io_spa, SCL_STATE, RW_WRITER) == SCL_STATE);
+ ASSERT(flags & ZIO_FLAG_CONFIG_WRITER);
+
+ zio_nowait(zio_write_phys(zio, vd,
+ vdev_label_offset(vd->vdev_psize, l, offset),
+ size, buf, ZIO_CHECKSUM_LABEL, done, private,
+ ZIO_PRIORITY_SYNC_WRITE, flags, B_TRUE));
+}
+
+/*
+ * Generate the nvlist representing this vdev's stats
+ */
+void
+vdev_config_generate_stats(vdev_t *vd, nvlist_t *nv)
+{
+ nvlist_t *nvx;
+ vdev_stat_t *vs;
+ vdev_stat_ex_t *vsx;
+
+ vs = kmem_alloc(sizeof (*vs), KM_SLEEP);
+ vsx = kmem_alloc(sizeof (*vsx), KM_SLEEP);
+
+ vdev_get_stats_ex(vd, vs, vsx);
+ fnvlist_add_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS,
+ (uint64_t *)vs, sizeof (*vs) / sizeof (uint64_t));
+
+ /*
+ * Add extended stats into a special extended stats nvlist. This keeps
+ * all the extended stats nicely grouped together. The extended stats
+ * nvlist is then added to the main nvlist.
+ */
+ nvx = fnvlist_alloc();
+
+ /* ZIOs in flight to disk */
+ fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SYNC_R_ACTIVE_QUEUE,
+ vsx->vsx_active_queue[ZIO_PRIORITY_SYNC_READ]);
+
+ fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SYNC_W_ACTIVE_QUEUE,
+ vsx->vsx_active_queue[ZIO_PRIORITY_SYNC_WRITE]);
+
+ fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_ASYNC_R_ACTIVE_QUEUE,
+ vsx->vsx_active_queue[ZIO_PRIORITY_ASYNC_READ]);
+
+ fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_ASYNC_W_ACTIVE_QUEUE,
+ vsx->vsx_active_queue[ZIO_PRIORITY_ASYNC_WRITE]);
+
+ fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SCRUB_ACTIVE_QUEUE,
+ vsx->vsx_active_queue[ZIO_PRIORITY_SCRUB]);
+
+ fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_TRIM_ACTIVE_QUEUE,
+ vsx->vsx_active_queue[ZIO_PRIORITY_TRIM]);
+
+ /* ZIOs pending */
+ fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SYNC_R_PEND_QUEUE,
+ vsx->vsx_pend_queue[ZIO_PRIORITY_SYNC_READ]);
+
+ fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SYNC_W_PEND_QUEUE,
+ vsx->vsx_pend_queue[ZIO_PRIORITY_SYNC_WRITE]);
+
+ fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_ASYNC_R_PEND_QUEUE,
+ vsx->vsx_pend_queue[ZIO_PRIORITY_ASYNC_READ]);
+
+ fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_ASYNC_W_PEND_QUEUE,
+ vsx->vsx_pend_queue[ZIO_PRIORITY_ASYNC_WRITE]);
+
+ fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SCRUB_PEND_QUEUE,
+ vsx->vsx_pend_queue[ZIO_PRIORITY_SCRUB]);
+
+ fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_TRIM_PEND_QUEUE,
+ vsx->vsx_pend_queue[ZIO_PRIORITY_TRIM]);
+
+ /* Histograms */
+ fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_TOT_R_LAT_HISTO,
+ vsx->vsx_total_histo[ZIO_TYPE_READ],
+ ARRAY_SIZE(vsx->vsx_total_histo[ZIO_TYPE_READ]));
+
+ fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_TOT_W_LAT_HISTO,
+ vsx->vsx_total_histo[ZIO_TYPE_WRITE],
+ ARRAY_SIZE(vsx->vsx_total_histo[ZIO_TYPE_WRITE]));
+
+ fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_DISK_R_LAT_HISTO,
+ vsx->vsx_disk_histo[ZIO_TYPE_READ],
+ ARRAY_SIZE(vsx->vsx_disk_histo[ZIO_TYPE_READ]));
+
+ fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_DISK_W_LAT_HISTO,
+ vsx->vsx_disk_histo[ZIO_TYPE_WRITE],
+ ARRAY_SIZE(vsx->vsx_disk_histo[ZIO_TYPE_WRITE]));
+
+ fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_SYNC_R_LAT_HISTO,
+ vsx->vsx_queue_histo[ZIO_PRIORITY_SYNC_READ],
+ ARRAY_SIZE(vsx->vsx_queue_histo[ZIO_PRIORITY_SYNC_READ]));
+
+ fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_SYNC_W_LAT_HISTO,
+ vsx->vsx_queue_histo[ZIO_PRIORITY_SYNC_WRITE],
+ ARRAY_SIZE(vsx->vsx_queue_histo[ZIO_PRIORITY_SYNC_WRITE]));
+
+ fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_ASYNC_R_LAT_HISTO,
+ vsx->vsx_queue_histo[ZIO_PRIORITY_ASYNC_READ],
+ ARRAY_SIZE(vsx->vsx_queue_histo[ZIO_PRIORITY_ASYNC_READ]));
+
+ fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_ASYNC_W_LAT_HISTO,
+ vsx->vsx_queue_histo[ZIO_PRIORITY_ASYNC_WRITE],
+ ARRAY_SIZE(vsx->vsx_queue_histo[ZIO_PRIORITY_ASYNC_WRITE]));
+
+ fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_SCRUB_LAT_HISTO,
+ vsx->vsx_queue_histo[ZIO_PRIORITY_SCRUB],
+ ARRAY_SIZE(vsx->vsx_queue_histo[ZIO_PRIORITY_SCRUB]));
+
+ fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_TRIM_LAT_HISTO,
+ vsx->vsx_queue_histo[ZIO_PRIORITY_TRIM],
+ ARRAY_SIZE(vsx->vsx_queue_histo[ZIO_PRIORITY_TRIM]));
+
+ /* Request sizes */
+ fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_SYNC_IND_R_HISTO,
+ vsx->vsx_ind_histo[ZIO_PRIORITY_SYNC_READ],
+ ARRAY_SIZE(vsx->vsx_ind_histo[ZIO_PRIORITY_SYNC_READ]));
+
+ fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_SYNC_IND_W_HISTO,
+ vsx->vsx_ind_histo[ZIO_PRIORITY_SYNC_WRITE],
+ ARRAY_SIZE(vsx->vsx_ind_histo[ZIO_PRIORITY_SYNC_WRITE]));
+
+ fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_ASYNC_IND_R_HISTO,
+ vsx->vsx_ind_histo[ZIO_PRIORITY_ASYNC_READ],
+ ARRAY_SIZE(vsx->vsx_ind_histo[ZIO_PRIORITY_ASYNC_READ]));
+
+ fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_ASYNC_IND_W_HISTO,
+ vsx->vsx_ind_histo[ZIO_PRIORITY_ASYNC_WRITE],
+ ARRAY_SIZE(vsx->vsx_ind_histo[ZIO_PRIORITY_ASYNC_WRITE]));
+
+ fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_IND_SCRUB_HISTO,
+ vsx->vsx_ind_histo[ZIO_PRIORITY_SCRUB],
+ ARRAY_SIZE(vsx->vsx_ind_histo[ZIO_PRIORITY_SCRUB]));
+
+ fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_IND_TRIM_HISTO,
+ vsx->vsx_ind_histo[ZIO_PRIORITY_TRIM],
+ ARRAY_SIZE(vsx->vsx_ind_histo[ZIO_PRIORITY_TRIM]));
+
+ fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_SYNC_AGG_R_HISTO,
+ vsx->vsx_agg_histo[ZIO_PRIORITY_SYNC_READ],
+ ARRAY_SIZE(vsx->vsx_agg_histo[ZIO_PRIORITY_SYNC_READ]));
+
+ fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_SYNC_AGG_W_HISTO,
+ vsx->vsx_agg_histo[ZIO_PRIORITY_SYNC_WRITE],
+ ARRAY_SIZE(vsx->vsx_agg_histo[ZIO_PRIORITY_SYNC_WRITE]));
+
+ fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_ASYNC_AGG_R_HISTO,
+ vsx->vsx_agg_histo[ZIO_PRIORITY_ASYNC_READ],
+ ARRAY_SIZE(vsx->vsx_agg_histo[ZIO_PRIORITY_ASYNC_READ]));
+
+ fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_ASYNC_AGG_W_HISTO,
+ vsx->vsx_agg_histo[ZIO_PRIORITY_ASYNC_WRITE],
+ ARRAY_SIZE(vsx->vsx_agg_histo[ZIO_PRIORITY_ASYNC_WRITE]));
+
+ fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_AGG_SCRUB_HISTO,
+ vsx->vsx_agg_histo[ZIO_PRIORITY_SCRUB],
+ ARRAY_SIZE(vsx->vsx_agg_histo[ZIO_PRIORITY_SCRUB]));
+
+ fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_AGG_TRIM_HISTO,
+ vsx->vsx_agg_histo[ZIO_PRIORITY_TRIM],
+ ARRAY_SIZE(vsx->vsx_agg_histo[ZIO_PRIORITY_TRIM]));
+
+ /* IO delays */
+ fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SLOW_IOS, vs->vs_slow_ios);
+
+ /* Add extended stats nvlist to main nvlist */
+ fnvlist_add_nvlist(nv, ZPOOL_CONFIG_VDEV_STATS_EX, nvx);
+
+ fnvlist_free(nvx);
+ kmem_free(vs, sizeof (*vs));
+ kmem_free(vsx, sizeof (*vsx));
+}
+
+static void
+root_vdev_actions_getprogress(vdev_t *vd, nvlist_t *nvl)
+{
+ spa_t *spa = vd->vdev_spa;
+
+ if (vd != spa->spa_root_vdev)
+ return;
+
+ /* provide either current or previous scan information */
+ pool_scan_stat_t ps;
+ if (spa_scan_get_stats(spa, &ps) == 0) {
+ fnvlist_add_uint64_array(nvl,
+ ZPOOL_CONFIG_SCAN_STATS, (uint64_t *)&ps,
+ sizeof (pool_scan_stat_t) / sizeof (uint64_t));
+ }
+
+ pool_removal_stat_t prs;
+ if (spa_removal_get_stats(spa, &prs) == 0) {
+ fnvlist_add_uint64_array(nvl,
+ ZPOOL_CONFIG_REMOVAL_STATS, (uint64_t *)&prs,
+ sizeof (prs) / sizeof (uint64_t));
+ }
+
+ pool_checkpoint_stat_t pcs;
+ if (spa_checkpoint_get_stats(spa, &pcs) == 0) {
+ fnvlist_add_uint64_array(nvl,
+ ZPOOL_CONFIG_CHECKPOINT_STATS, (uint64_t *)&pcs,
+ sizeof (pcs) / sizeof (uint64_t));
+ }
+}
+
+static void
+top_vdev_actions_getprogress(vdev_t *vd, nvlist_t *nvl)
+{
+ if (vd == vd->vdev_top) {
+ vdev_rebuild_stat_t vrs;
+ if (vdev_rebuild_get_stats(vd, &vrs) == 0) {
+ fnvlist_add_uint64_array(nvl,
+ ZPOOL_CONFIG_REBUILD_STATS, (uint64_t *)&vrs,
+ sizeof (vrs) / sizeof (uint64_t));
+ }
+ }
+}
+
+/*
+ * Generate the nvlist representing this vdev's config.
+ */
+nvlist_t *
+vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
+ vdev_config_flag_t flags)
+{
+ nvlist_t *nv = NULL;
+ vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
+
+ nv = fnvlist_alloc();
+
+ fnvlist_add_string(nv, ZPOOL_CONFIG_TYPE, vd->vdev_ops->vdev_op_type);
+ if (!(flags & (VDEV_CONFIG_SPARE | VDEV_CONFIG_L2CACHE)))
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_ID, vd->vdev_id);
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_GUID, vd->vdev_guid);
+
+ if (vd->vdev_path != NULL)
+ fnvlist_add_string(nv, ZPOOL_CONFIG_PATH, vd->vdev_path);
+
+ if (vd->vdev_devid != NULL)
+ fnvlist_add_string(nv, ZPOOL_CONFIG_DEVID, vd->vdev_devid);
+
+ if (vd->vdev_physpath != NULL)
+ fnvlist_add_string(nv, ZPOOL_CONFIG_PHYS_PATH,
+ vd->vdev_physpath);
+
+ if (vd->vdev_enc_sysfs_path != NULL)
+ fnvlist_add_string(nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH,
+ vd->vdev_enc_sysfs_path);
+
+ if (vd->vdev_fru != NULL)
+ fnvlist_add_string(nv, ZPOOL_CONFIG_FRU, vd->vdev_fru);
+
+ if (vd->vdev_ops->vdev_op_config_generate != NULL)
+ vd->vdev_ops->vdev_op_config_generate(vd, nv);
+
+ if (vd->vdev_wholedisk != -1ULL) {
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
+ vd->vdev_wholedisk);
+ }
+
+ if (vd->vdev_not_present && !(flags & VDEV_CONFIG_MISSING))
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, 1);
+
+ if (vd->vdev_isspare)
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_SPARE, 1);
+
+ if (!(flags & (VDEV_CONFIG_SPARE | VDEV_CONFIG_L2CACHE)) &&
+ vd == vd->vdev_top) {
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY,
+ vd->vdev_ms_array);
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT,
+ vd->vdev_ms_shift);
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_ASHIFT, vd->vdev_ashift);
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_ASIZE,
+ vd->vdev_asize);
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_LOG, vd->vdev_islog);
+ if (vd->vdev_removing) {
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_REMOVING,
+ vd->vdev_removing);
+ }
+
+ /* zpool command expects alloc class data */
+ if (getstats && vd->vdev_alloc_bias != VDEV_BIAS_NONE) {
+ const char *bias = NULL;
+
+ switch (vd->vdev_alloc_bias) {
+ case VDEV_BIAS_LOG:
+ bias = VDEV_ALLOC_BIAS_LOG;
+ break;
+ case VDEV_BIAS_SPECIAL:
+ bias = VDEV_ALLOC_BIAS_SPECIAL;
+ break;
+ case VDEV_BIAS_DEDUP:
+ bias = VDEV_ALLOC_BIAS_DEDUP;
+ break;
+ default:
+ ASSERT3U(vd->vdev_alloc_bias, ==,
+ VDEV_BIAS_NONE);
+ }
+ fnvlist_add_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS,
+ bias);
+ }
+ }
+
+ if (vd->vdev_dtl_sm != NULL) {
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_DTL,
+ space_map_object(vd->vdev_dtl_sm));
+ }
+
+ if (vic->vic_mapping_object != 0) {
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_INDIRECT_OBJECT,
+ vic->vic_mapping_object);
+ }
+
+ if (vic->vic_births_object != 0) {
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_INDIRECT_BIRTHS,
+ vic->vic_births_object);
+ }
+
+ if (vic->vic_prev_indirect_vdev != UINT64_MAX) {
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_PREV_INDIRECT_VDEV,
+ vic->vic_prev_indirect_vdev);
+ }
+
+ if (vd->vdev_crtxg)
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_CREATE_TXG, vd->vdev_crtxg);
+
+ if (vd->vdev_expansion_time)
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_EXPANSION_TIME,
+ vd->vdev_expansion_time);
+
+ if (flags & VDEV_CONFIG_MOS) {
+ if (vd->vdev_leaf_zap != 0) {
+ ASSERT(vd->vdev_ops->vdev_op_leaf);
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_VDEV_LEAF_ZAP,
+ vd->vdev_leaf_zap);
+ }
+
+ if (vd->vdev_top_zap != 0) {
+ ASSERT(vd == vd->vdev_top);
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_VDEV_TOP_ZAP,
+ vd->vdev_top_zap);
+ }
+
+ if (vd->vdev_resilver_deferred) {
+ ASSERT(vd->vdev_ops->vdev_op_leaf);
+ ASSERT(spa->spa_resilver_deferred);
+ fnvlist_add_boolean(nv, ZPOOL_CONFIG_RESILVER_DEFER);
+ }
+ }
+
+ if (getstats) {
+ vdev_config_generate_stats(vd, nv);
+
+ root_vdev_actions_getprogress(vd, nv);
+ top_vdev_actions_getprogress(vd, nv);
+
+ /*
+ * Note: this can be called from open context
+ * (spa_get_stats()), so we need the rwlock to prevent
+ * the mapping from being changed by condensing.
+ */
+ rw_enter(&vd->vdev_indirect_rwlock, RW_READER);
+ if (vd->vdev_indirect_mapping != NULL) {
+ ASSERT(vd->vdev_indirect_births != NULL);
+ vdev_indirect_mapping_t *vim =
+ vd->vdev_indirect_mapping;
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_INDIRECT_SIZE,
+ vdev_indirect_mapping_size(vim));
+ }
+ rw_exit(&vd->vdev_indirect_rwlock);
+ if (vd->vdev_mg != NULL &&
+ vd->vdev_mg->mg_fragmentation != ZFS_FRAG_INVALID) {
+ /*
+ * Compute approximately how much memory would be used
+ * for the indirect mapping if this device were to
+ * be removed.
+ *
+ * Note: If the frag metric is invalid, then not
+ * enough metaslabs have been converted to have
+ * histograms.
+ */
+ uint64_t seg_count = 0;
+ uint64_t to_alloc = vd->vdev_stat.vs_alloc;
+
+ /*
+ * There are the same number of allocated segments
+ * as free segments, so we will have at least one
+ * entry per free segment. However, small free
+ * segments (smaller than vdev_removal_max_span)
+ * will be combined with adjacent allocated segments
+ * as a single mapping.
+ */
+ for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) {
+ if (i + 1 < highbit64(vdev_removal_max_span)
+ - 1) {
+ to_alloc +=
+ vd->vdev_mg->mg_histogram[i] <<
+ (i + 1);
+ } else {
+ seg_count +=
+ vd->vdev_mg->mg_histogram[i];
+ }
+ }
+
+ /*
+ * The maximum length of a mapping is
+ * zfs_remove_max_segment, so we need at least one entry
+ * per zfs_remove_max_segment of allocated data.
+ */
+ seg_count += to_alloc / spa_remove_max_segment(spa);
+
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_INDIRECT_SIZE,
+ seg_count *
+ sizeof (vdev_indirect_mapping_entry_phys_t));
+ }
+ }
+
+ if (!vd->vdev_ops->vdev_op_leaf) {
+ nvlist_t **child;
+ int c, idx;
+
+ ASSERT(!vd->vdev_ishole);
+
+ child = kmem_alloc(vd->vdev_children * sizeof (nvlist_t *),
+ KM_SLEEP);
+
+ for (c = 0, idx = 0; c < vd->vdev_children; c++) {
+ vdev_t *cvd = vd->vdev_child[c];
+
+ /*
+ * If we're generating an nvlist of removing
+ * vdevs then skip over any device which is
+ * not being removed.
+ */
+ if ((flags & VDEV_CONFIG_REMOVING) &&
+ !cvd->vdev_removing)
+ continue;
+
+ child[idx++] = vdev_config_generate(spa, cvd,
+ getstats, flags);
+ }
+
+ if (idx) {
+ fnvlist_add_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
+ child, idx);
+ }
+
+ for (c = 0; c < idx; c++)
+ nvlist_free(child[c]);
+
+ kmem_free(child, vd->vdev_children * sizeof (nvlist_t *));
+
+ } else {
+ const char *aux = NULL;
+
+ if (vd->vdev_offline && !vd->vdev_tmpoffline)
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_OFFLINE, B_TRUE);
+ if (vd->vdev_resilver_txg != 0)
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_RESILVER_TXG,
+ vd->vdev_resilver_txg);
+ if (vd->vdev_rebuild_txg != 0)
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_REBUILD_TXG,
+ vd->vdev_rebuild_txg);
+ if (vd->vdev_faulted)
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_FAULTED, B_TRUE);
+ if (vd->vdev_degraded)
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_DEGRADED, B_TRUE);
+ if (vd->vdev_removed)
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_REMOVED, B_TRUE);
+ if (vd->vdev_unspare)
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_UNSPARE, B_TRUE);
+ if (vd->vdev_ishole)
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_HOLE, B_TRUE);
+
+ /* Set the reason why we're FAULTED/DEGRADED. */
+ switch (vd->vdev_stat.vs_aux) {
+ case VDEV_AUX_ERR_EXCEEDED:
+ aux = "err_exceeded";
+ break;
+
+ case VDEV_AUX_EXTERNAL:
+ aux = "external";
+ break;
+ }
+
+ if (aux != NULL && !vd->vdev_tmpoffline) {
+ fnvlist_add_string(nv, ZPOOL_CONFIG_AUX_STATE, aux);
+ } else {
+ /*
+ * We're healthy - clear any previous AUX_STATE values.
+ */
+ if (nvlist_exists(nv, ZPOOL_CONFIG_AUX_STATE))
+ nvlist_remove_all(nv, ZPOOL_CONFIG_AUX_STATE);
+ }
+
+ if (vd->vdev_splitting && vd->vdev_orig_guid != 0LL) {
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_ORIG_GUID,
+ vd->vdev_orig_guid);
+ }
+ }
+
+ return (nv);
+}
+
+/*
+ * Generate a view of the top-level vdevs. If we currently have holes
+ * in the namespace, then generate an array which contains a list of holey
+ * vdevs. Additionally, add the number of top-level children that currently
+ * exist.
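+ *
+ * For example, if a pool has three top-level slots and slot 1 is a hole,
+ * ZPOOL_CONFIG_HOLE_ARRAY will contain [1] and ZPOOL_CONFIG_VDEV_CHILDREN
+ * will be 3.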
+ */
+void
+vdev_top_config_generate(spa_t *spa, nvlist_t *config)
+{
+ vdev_t *rvd = spa->spa_root_vdev;
+ uint64_t *array;
+ uint_t c, idx;
+
+ array = kmem_alloc(rvd->vdev_children * sizeof (uint64_t), KM_SLEEP);
+
+ for (c = 0, idx = 0; c < rvd->vdev_children; c++) {
+ vdev_t *tvd = rvd->vdev_child[c];
+
+ if (tvd->vdev_ishole) {
+ array[idx++] = c;
+ }
+ }
+
+ if (idx) {
+ VERIFY(nvlist_add_uint64_array(config, ZPOOL_CONFIG_HOLE_ARRAY,
+ array, idx) == 0);
+ }
+
+ VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN,
+ rvd->vdev_children) == 0);
+
+ kmem_free(array, rvd->vdev_children * sizeof (uint64_t));
+}
+
+/*
+ * Returns the configuration from the label of the given vdev. For vdevs
+ * which don't have a txg value stored on their label (i.e. spares/cache)
+ * or have not been completely initialized (txg = 0) just return
+ * the configuration from the first valid label we find. Otherwise,
+ * find the most up-to-date label that does not exceed the specified
+ * 'txg' value.
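+ *
+ * For example, if the labels carry txgs 8, 10, 10 and 12 and the caller
+ * passes txg 10, the configuration from one of the txg 10 labels is
+ * returned.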
+ */
+nvlist_t *
+vdev_label_read_config(vdev_t *vd, uint64_t txg)
+{
+ spa_t *spa = vd->vdev_spa;
+ nvlist_t *config = NULL;
+ vdev_phys_t *vp[VDEV_LABELS];
+ abd_t *vp_abd[VDEV_LABELS];
+ zio_t *zio[VDEV_LABELS];
+ uint64_t best_txg = 0;
+ uint64_t label_txg = 0;
+ int error = 0;
+ int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL |
+ ZIO_FLAG_SPECULATIVE;
+
+ ASSERT(vd->vdev_validate_thread == curthread ||
+ spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
+
+ if (!vdev_readable(vd))
+ return (NULL);
+
+ /*
+ * The label for a dRAID distributed spare is not stored on disk.
+	 * Instead it is generated when needed, which allows us to bypass
+ * the pipeline when reading the config from the label.
+ */
+ if (vd->vdev_ops == &vdev_draid_spare_ops)
+ return (vdev_draid_read_config_spare(vd));
+
+ for (int l = 0; l < VDEV_LABELS; l++) {
+ vp_abd[l] = abd_alloc_linear(sizeof (vdev_phys_t), B_TRUE);
+ vp[l] = abd_to_buf(vp_abd[l]);
+ }
+
+retry:
+ for (int l = 0; l < VDEV_LABELS; l++) {
+ zio[l] = zio_root(spa, NULL, NULL, flags);
+
+ vdev_label_read(zio[l], vd, l, vp_abd[l],
+ offsetof(vdev_label_t, vl_vdev_phys), sizeof (vdev_phys_t),
+ NULL, NULL, flags);
+ }
+ for (int l = 0; l < VDEV_LABELS; l++) {
+ nvlist_t *label = NULL;
+
+ if (zio_wait(zio[l]) == 0 &&
+ nvlist_unpack(vp[l]->vp_nvlist, sizeof (vp[l]->vp_nvlist),
+ &label, 0) == 0) {
+ /*
+ * Auxiliary vdevs won't have txg values in their
+ * labels and newly added vdevs may not have been
+ * completely initialized so just return the
+ * configuration from the first valid label we
+ * encounter.
+ */
+ error = nvlist_lookup_uint64(label,
+ ZPOOL_CONFIG_POOL_TXG, &label_txg);
+ if ((error || label_txg == 0) && !config) {
+ config = label;
+ for (l++; l < VDEV_LABELS; l++)
+ zio_wait(zio[l]);
+ break;
+ } else if (label_txg <= txg && label_txg > best_txg) {
+ best_txg = label_txg;
+ nvlist_free(config);
+ config = fnvlist_dup(label);
+ }
+ }
+
+ if (label != NULL) {
+ nvlist_free(label);
+ label = NULL;
+ }
+ }
+
+ if (config == NULL && !(flags & ZIO_FLAG_TRYHARD)) {
+ flags |= ZIO_FLAG_TRYHARD;
+ goto retry;
+ }
+
+ /*
+ * We found a valid label but it didn't pass txg restrictions.
+ */
+ if (config == NULL && label_txg != 0) {
+ vdev_dbgmsg(vd, "label discarded as txg is too large "
+ "(%llu > %llu)", (u_longlong_t)label_txg,
+ (u_longlong_t)txg);
+ }
+
+ for (int l = 0; l < VDEV_LABELS; l++) {
+ abd_free(vp_abd[l]);
+ }
+
+ return (config);
+}
+
+/*
+ * Determine if a device is in use. The 'spare_guid' parameter will be filled
+ * in with the device guid if this spare is active elsewhere on the system.
+ */
+static boolean_t
+vdev_inuse(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason,
+ uint64_t *spare_guid, uint64_t *l2cache_guid)
+{
+ spa_t *spa = vd->vdev_spa;
+ uint64_t state, pool_guid, device_guid, txg, spare_pool;
+ uint64_t vdtxg = 0;
+ nvlist_t *label;
+
+ if (spare_guid)
+ *spare_guid = 0ULL;
+ if (l2cache_guid)
+ *l2cache_guid = 0ULL;
+
+ /*
+ * Read the label, if any, and perform some basic sanity checks.
+ */
+ if ((label = vdev_label_read_config(vd, -1ULL)) == NULL)
+ return (B_FALSE);
+
+ (void) nvlist_lookup_uint64(label, ZPOOL_CONFIG_CREATE_TXG,
+ &vdtxg);
+
+ if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE,
+ &state) != 0 ||
+ nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID,
+ &device_guid) != 0) {
+ nvlist_free(label);
+ return (B_FALSE);
+ }
+
+ if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE &&
+ (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID,
+ &pool_guid) != 0 ||
+ nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG,
+ &txg) != 0)) {
+ nvlist_free(label);
+ return (B_FALSE);
+ }
+
+ nvlist_free(label);
+
+ /*
+ * Check to see if this device indeed belongs to the pool it claims to
+ * be a part of. The only way this is allowed is if the device is a hot
+ * spare (which we check for later on).
+ */
+ if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE &&
+ !spa_guid_exists(pool_guid, device_guid) &&
+ !spa_spare_exists(device_guid, NULL, NULL) &&
+ !spa_l2cache_exists(device_guid, NULL))
+ return (B_FALSE);
+
+ /*
+	 * If the transaction group is zero, then this is an initialized (but
+ * unused) label. This is only an error if the create transaction
+ * on-disk is the same as the one we're using now, in which case the
+ * user has attempted to add the same vdev multiple times in the same
+ * transaction.
+ */
+ if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE &&
+ txg == 0 && vdtxg == crtxg)
+ return (B_TRUE);
+
+ /*
+ * Check to see if this is a spare device. We do an explicit check for
+ * spa_has_spare() here because it may be on our pending list of spares
+ * to add. We also check if it is an l2cache device.
+ */
+ if (spa_spare_exists(device_guid, &spare_pool, NULL) ||
+ spa_has_spare(spa, device_guid)) {
+ if (spare_guid)
+ *spare_guid = device_guid;
+
+ switch (reason) {
+ case VDEV_LABEL_CREATE:
+ case VDEV_LABEL_L2CACHE:
+ return (B_TRUE);
+
+ case VDEV_LABEL_REPLACE:
+ return (!spa_has_spare(spa, device_guid) ||
+ spare_pool != 0ULL);
+
+ case VDEV_LABEL_SPARE:
+ return (spa_has_spare(spa, device_guid));
+ default:
+ break;
+ }
+ }
+
+ /*
+ * Check to see if this is an l2cache device.
+ */
+ if (spa_l2cache_exists(device_guid, NULL))
+ return (B_TRUE);
+
+ /*
+ * We can't rely on a pool's state if it's been imported
+	 * read-only. Instead we look to see if the pool is marked
+ * read-only in the namespace and set the state to active.
+ */
+ if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE &&
+ (spa = spa_by_guid(pool_guid, device_guid)) != NULL &&
+ spa_mode(spa) == SPA_MODE_READ)
+ state = POOL_STATE_ACTIVE;
+
+ /*
+ * If the device is marked ACTIVE, then this device is in use by another
+ * pool on the system.
+ */
+ return (state == POOL_STATE_ACTIVE);
+}
+
+/*
+ * Initialize a vdev label. We check to make sure each leaf device is not in
+ * use and is writable. We put down an initial label which we will later
+ * overwrite with a complete label. Note that it's important to do this
+ * sequentially, not in parallel, so that we catch cases of multiple use of the
+ * same leaf vdev in the vdev we're creating -- e.g. mirroring a disk with
+ * itself.
+ */
+int
+vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason)
+{
+ spa_t *spa = vd->vdev_spa;
+ nvlist_t *label;
+ vdev_phys_t *vp;
+ abd_t *vp_abd;
+ abd_t *bootenv;
+ uberblock_t *ub;
+ abd_t *ub_abd;
+ zio_t *zio;
+ char *buf;
+ size_t buflen;
+ int error;
+ uint64_t spare_guid = 0, l2cache_guid = 0;
+ int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL;
+
+ ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
+
+ for (int c = 0; c < vd->vdev_children; c++)
+ if ((error = vdev_label_init(vd->vdev_child[c],
+ crtxg, reason)) != 0)
+ return (error);
+
+ /* Track the creation time for this vdev */
+ vd->vdev_crtxg = crtxg;
+
+ if (!vd->vdev_ops->vdev_op_leaf || !spa_writeable(spa))
+ return (0);
+
+ /*
+ * Dead vdevs cannot be initialized.
+ */
+ if (vdev_is_dead(vd))
+ return (SET_ERROR(EIO));
+
+ /*
+ * Determine if the vdev is in use.
+ */
+ if (reason != VDEV_LABEL_REMOVE && reason != VDEV_LABEL_SPLIT &&
+ vdev_inuse(vd, crtxg, reason, &spare_guid, &l2cache_guid))
+ return (SET_ERROR(EBUSY));
+
+ /*
+ * If this is a request to add or replace a spare or l2cache device
+ * that is in use elsewhere on the system, then we must update the
+ * guid (which was initialized to a random value) to reflect the
+ * actual GUID (which is shared between multiple pools).
+ */
+ if (reason != VDEV_LABEL_REMOVE && reason != VDEV_LABEL_L2CACHE &&
+ spare_guid != 0ULL) {
+ uint64_t guid_delta = spare_guid - vd->vdev_guid;
+
+ vd->vdev_guid += guid_delta;
+
+ for (vdev_t *pvd = vd; pvd != NULL; pvd = pvd->vdev_parent)
+ pvd->vdev_guid_sum += guid_delta;
+
+ /*
+		 * If this is a replacement, then we want to fall through to
+		 * the rest of the code. If we're adding a spare, then it's
+		 * already labeled appropriately and we can just return.
+ */
+ if (reason == VDEV_LABEL_SPARE)
+ return (0);
+ ASSERT(reason == VDEV_LABEL_REPLACE ||
+ reason == VDEV_LABEL_SPLIT);
+ }
+
+ if (reason != VDEV_LABEL_REMOVE && reason != VDEV_LABEL_SPARE &&
+ l2cache_guid != 0ULL) {
+ uint64_t guid_delta = l2cache_guid - vd->vdev_guid;
+
+ vd->vdev_guid += guid_delta;
+
+ for (vdev_t *pvd = vd; pvd != NULL; pvd = pvd->vdev_parent)
+ pvd->vdev_guid_sum += guid_delta;
+
+ /*
+		 * If this is a replacement, then we want to fall through to
+		 * the rest of the code. If we're adding an l2cache, then it's
+		 * already labeled appropriately and we can just return.
+ */
+ if (reason == VDEV_LABEL_L2CACHE)
+ return (0);
+ ASSERT(reason == VDEV_LABEL_REPLACE);
+ }
+
+ /*
+ * Initialize its label.
+ */
+ vp_abd = abd_alloc_linear(sizeof (vdev_phys_t), B_TRUE);
+ abd_zero(vp_abd, sizeof (vdev_phys_t));
+ vp = abd_to_buf(vp_abd);
+
+ /*
+ * Generate a label describing the pool and our top-level vdev.
+ * We mark it as being from txg 0 to indicate that it's not
+ * really part of an active pool just yet. The labels will
+ * be written again with a meaningful txg by spa_sync().
+ */
+ if (reason == VDEV_LABEL_SPARE ||
+ (reason == VDEV_LABEL_REMOVE && vd->vdev_isspare)) {
+ /*
+ * For inactive hot spares, we generate a special label that
+		 * identifies it as a mutually shared hot spare. We write the
+ * label if we are adding a hot spare, or if we are removing an
+ * active hot spare (in which case we want to revert the
+ * labels).
+ */
+ VERIFY(nvlist_alloc(&label, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+
+ VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_VERSION,
+ spa_version(spa)) == 0);
+ VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_POOL_STATE,
+ POOL_STATE_SPARE) == 0);
+ VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_GUID,
+ vd->vdev_guid) == 0);
+ } else if (reason == VDEV_LABEL_L2CACHE ||
+ (reason == VDEV_LABEL_REMOVE && vd->vdev_isl2cache)) {
+ /*
+ * For level 2 ARC devices, add a special label.
+ */
+ VERIFY(nvlist_alloc(&label, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+
+ VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_VERSION,
+ spa_version(spa)) == 0);
+ VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_POOL_STATE,
+ POOL_STATE_L2CACHE) == 0);
+ VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_GUID,
+ vd->vdev_guid) == 0);
+ } else {
+ uint64_t txg = 0ULL;
+
+ if (reason == VDEV_LABEL_SPLIT)
+ txg = spa->spa_uberblock.ub_txg;
+ label = spa_config_generate(spa, vd, txg, B_FALSE);
+
+ /*
+		 * Add our creation time. This allows us to detect multiple
+		 * vdev uses as described above, and the label automatically
+		 * expires if the create fails.
+ */
+ VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_CREATE_TXG,
+ crtxg) == 0);
+ }
+
+ buf = vp->vp_nvlist;
+ buflen = sizeof (vp->vp_nvlist);
+
+ error = nvlist_pack(label, &buf, &buflen, NV_ENCODE_XDR, KM_SLEEP);
+ if (error != 0) {
+ nvlist_free(label);
+ abd_free(vp_abd);
+ /* EFAULT means nvlist_pack ran out of room */
+ return (SET_ERROR(error == EFAULT ? ENAMETOOLONG : EINVAL));
+ }
+
+ /*
+ * Initialize uberblock template.
+ */
+ ub_abd = abd_alloc_linear(VDEV_UBERBLOCK_RING, B_TRUE);
+ abd_zero(ub_abd, VDEV_UBERBLOCK_RING);
+ abd_copy_from_buf(ub_abd, &spa->spa_uberblock, sizeof (uberblock_t));
+ ub = abd_to_buf(ub_abd);
+ ub->ub_txg = 0;
+
+ /* Initialize the 2nd padding area. */
+ bootenv = abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE);
+ abd_zero(bootenv, VDEV_PAD_SIZE);
+
+ /*
+ * Write everything in parallel.
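+	 * If this fails, the whole set is retried once with ZIO_FLAG_TRYHARD.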
+ */
+retry:
+ zio = zio_root(spa, NULL, NULL, flags);
+
+ for (int l = 0; l < VDEV_LABELS; l++) {
+
+ vdev_label_write(zio, vd, l, vp_abd,
+ offsetof(vdev_label_t, vl_vdev_phys),
+ sizeof (vdev_phys_t), NULL, NULL, flags);
+
+ /*
+ * Skip the 1st padding area.
+		 * Zero out the 2nd padding area, which might contain
+		 * leftover data from a previous filesystem format.
+ */
+ vdev_label_write(zio, vd, l, bootenv,
+ offsetof(vdev_label_t, vl_be),
+ VDEV_PAD_SIZE, NULL, NULL, flags);
+
+ vdev_label_write(zio, vd, l, ub_abd,
+ offsetof(vdev_label_t, vl_uberblock),
+ VDEV_UBERBLOCK_RING, NULL, NULL, flags);
+ }
+
+ error = zio_wait(zio);
+
+ if (error != 0 && !(flags & ZIO_FLAG_TRYHARD)) {
+ flags |= ZIO_FLAG_TRYHARD;
+ goto retry;
+ }
+
+ nvlist_free(label);
+ abd_free(bootenv);
+ abd_free(ub_abd);
+ abd_free(vp_abd);
+
+ /*
+ * If this vdev hasn't been previously identified as a spare, then we
+ * mark it as such only if a) we are labeling it as a spare, or b) it
+ * exists as a spare elsewhere in the system. Do the same for
+ * level 2 ARC devices.
+ */
+ if (error == 0 && !vd->vdev_isspare &&
+ (reason == VDEV_LABEL_SPARE ||
+ spa_spare_exists(vd->vdev_guid, NULL, NULL)))
+ spa_spare_add(vd);
+
+ if (error == 0 && !vd->vdev_isl2cache &&
+ (reason == VDEV_LABEL_L2CACHE ||
+ spa_l2cache_exists(vd->vdev_guid, NULL)))
+ spa_l2cache_add(vd);
+
+ return (error);
+}
+
+/*
+ * Done callback for vdev_label_read_bootenv_impl. If this is the first
+ * callback to finish, store our abd in the callback pointer. Otherwise, we
+ * just free our abd and return.
+ */
+static void
+vdev_label_read_bootenv_done(zio_t *zio)
+{
+ zio_t *rio = zio->io_private;
+ abd_t **cbp = rio->io_private;
+
+ ASSERT3U(zio->io_size, ==, VDEV_PAD_SIZE);
+
+ if (zio->io_error == 0) {
+ mutex_enter(&rio->io_lock);
+ if (*cbp == NULL) {
+ /* Will free this buffer in vdev_label_read_bootenv. */
+ *cbp = zio->io_abd;
+ } else {
+ abd_free(zio->io_abd);
+ }
+ mutex_exit(&rio->io_lock);
+ } else {
+ abd_free(zio->io_abd);
+ }
+}
+
+static void
+vdev_label_read_bootenv_impl(zio_t *zio, vdev_t *vd, int flags)
+{
+ for (int c = 0; c < vd->vdev_children; c++)
+ vdev_label_read_bootenv_impl(zio, vd->vdev_child[c], flags);
+
+ /*
+ * We just use the first label that has a correct checksum; the
+ * bootloader should have rewritten them all to be the same on boot,
+ * and any changes we made since boot have been the same across all
+ * labels.
+ */
+ if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) {
+ for (int l = 0; l < VDEV_LABELS; l++) {
+ vdev_label_read(zio, vd, l,
+ abd_alloc_linear(VDEV_PAD_SIZE, B_FALSE),
+ offsetof(vdev_label_t, vl_be), VDEV_PAD_SIZE,
+ vdev_label_read_bootenv_done, zio, flags);
+ }
+ }
+}
+
+int
+vdev_label_read_bootenv(vdev_t *rvd, nvlist_t *bootenv)
+{
+ nvlist_t *config;
+ spa_t *spa = rvd->vdev_spa;
+ abd_t *abd = NULL;
+ int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL |
+ ZIO_FLAG_SPECULATIVE | ZIO_FLAG_TRYHARD;
+
+ ASSERT(bootenv);
+ ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
+
+ zio_t *zio = zio_root(spa, NULL, &abd, flags);
+ vdev_label_read_bootenv_impl(zio, rvd, flags);
+ int err = zio_wait(zio);
+
+ if (abd != NULL) {
+ char *buf;
+ vdev_boot_envblock_t *vbe = abd_to_buf(abd);
+
+ vbe->vbe_version = ntohll(vbe->vbe_version);
+ switch (vbe->vbe_version) {
+ case VB_RAW:
+ /*
+			 * If we have textual data in vbe_bootenv, create an
+			 * nvlist with the key "envmap".
+ */
+ fnvlist_add_uint64(bootenv, BOOTENV_VERSION, VB_RAW);
+ vbe->vbe_bootenv[sizeof (vbe->vbe_bootenv) - 1] = '\0';
+ fnvlist_add_string(bootenv, GRUB_ENVMAP,
+ vbe->vbe_bootenv);
+ break;
+
+ case VB_NVLIST:
+ err = nvlist_unpack(vbe->vbe_bootenv,
+ sizeof (vbe->vbe_bootenv), &config, 0);
+ if (err == 0) {
+ fnvlist_merge(bootenv, config);
+ nvlist_free(config);
+ break;
+ }
+ /* FALLTHROUGH */
+ default:
+ /* Check for FreeBSD zfs bootonce command string */
+ buf = abd_to_buf(abd);
+ if (*buf == '\0') {
+ fnvlist_add_uint64(bootenv, BOOTENV_VERSION,
+ VB_NVLIST);
+ break;
+ }
+ fnvlist_add_string(bootenv, FREEBSD_BOOTONCE, buf);
+ }
+
+ /*
+ * abd was allocated in vdev_label_read_bootenv_impl()
+ */
+ abd_free(abd);
+ /*
+ * If we managed to read any successfully,
+ * return success.
+ */
+ return (0);
+ }
+ return (err);
+}
+
+int
+vdev_label_write_bootenv(vdev_t *vd, nvlist_t *env)
+{
+ zio_t *zio;
+ spa_t *spa = vd->vdev_spa;
+ vdev_boot_envblock_t *bootenv;
+ int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL;
+ int error;
+ size_t nvsize;
+ char *nvbuf;
+
+ error = nvlist_size(env, &nvsize, NV_ENCODE_XDR);
+ if (error != 0)
+ return (SET_ERROR(error));
+
+ if (nvsize >= sizeof (bootenv->vbe_bootenv)) {
+ return (SET_ERROR(E2BIG));
+ }
+
+ ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
+
+ error = ENXIO;
+ for (int c = 0; c < vd->vdev_children; c++) {
+ int child_err;
+
+ child_err = vdev_label_write_bootenv(vd->vdev_child[c], env);
+ /*
+ * As long as any of the disks managed to write all of their
+ * labels successfully, return success.
+ */
+ if (child_err == 0)
+ error = child_err;
+ }
+
+ if (!vd->vdev_ops->vdev_op_leaf || vdev_is_dead(vd) ||
+ !vdev_writeable(vd)) {
+ return (error);
+ }
+ ASSERT3U(sizeof (*bootenv), ==, VDEV_PAD_SIZE);
+ abd_t *abd = abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE);
+ abd_zero(abd, VDEV_PAD_SIZE);
+
+ bootenv = abd_borrow_buf_copy(abd, VDEV_PAD_SIZE);
+ nvbuf = bootenv->vbe_bootenv;
+ nvsize = sizeof (bootenv->vbe_bootenv);
+
+ bootenv->vbe_version = fnvlist_lookup_uint64(env, BOOTENV_VERSION);
+ switch (bootenv->vbe_version) {
+ case VB_RAW:
+ if (nvlist_lookup_string(env, GRUB_ENVMAP, &nvbuf) == 0) {
+ (void) strlcpy(bootenv->vbe_bootenv, nvbuf, nvsize);
+ }
+ error = 0;
+ break;
+
+ case VB_NVLIST:
+ error = nvlist_pack(env, &nvbuf, &nvsize, NV_ENCODE_XDR,
+ KM_SLEEP);
+ break;
+
+ default:
+ error = EINVAL;
+ break;
+ }
+
+ if (error == 0) {
+ bootenv->vbe_version = htonll(bootenv->vbe_version);
+ abd_return_buf_copy(abd, bootenv, VDEV_PAD_SIZE);
+ } else {
+ abd_free(abd);
+ return (SET_ERROR(error));
+ }
+
+retry:
+ zio = zio_root(spa, NULL, NULL, flags);
+ for (int l = 0; l < VDEV_LABELS; l++) {
+ vdev_label_write(zio, vd, l, abd,
+ offsetof(vdev_label_t, vl_be),
+ VDEV_PAD_SIZE, NULL, NULL, flags);
+ }
+
+ error = zio_wait(zio);
+ if (error != 0 && !(flags & ZIO_FLAG_TRYHARD)) {
+ flags |= ZIO_FLAG_TRYHARD;
+ goto retry;
+ }
+
+ abd_free(abd);
+ return (error);
+}
+
+/*
+ * ==========================================================================
+ * uberblock load/sync
+ * ==========================================================================
+ */
+
+/*
+ * Consider the following situation: txg is safely synced to disk. We've
+ * written the first uberblock for txg + 1, and then we lose power. When we
+ * come back up, we fail to see the uberblock for txg + 1 because, say,
+ * it was on a mirrored device and the replica to which we wrote txg + 1
+ * is now offline. If we then make some changes and sync txg + 1, and then
+ * the missing replica comes back, then for a few seconds we'll have two
+ * conflicting uberblocks on disk with the same txg. The solution is simple:
+ * among uberblocks with equal txg, choose the one with the latest timestamp.
+ */
+static int
+vdev_uberblock_compare(const uberblock_t *ub1, const uberblock_t *ub2)
+{
+ int cmp = TREE_CMP(ub1->ub_txg, ub2->ub_txg);
+
+ if (likely(cmp))
+ return (cmp);
+
+ cmp = TREE_CMP(ub1->ub_timestamp, ub2->ub_timestamp);
+ if (likely(cmp))
+ return (cmp);
+
+ /*
+ * If MMP_VALID(ub) && MMP_SEQ_VALID(ub) then the host has an MMP-aware
+ * ZFS, e.g. OpenZFS >= 0.7.
+ *
+ * If one ub has MMP and the other does not, they were written by
+ * different hosts, which matters for MMP. So we treat no MMP/no SEQ as
+ * a 0 value.
+ *
+ * Since timestamp and txg are the same if we get this far, either is
+ * acceptable for importing the pool.
+ */
+ unsigned int seq1 = 0;
+ unsigned int seq2 = 0;
+
+ if (MMP_VALID(ub1) && MMP_SEQ_VALID(ub1))
+ seq1 = MMP_SEQ(ub1);
+
+ if (MMP_VALID(ub2) && MMP_SEQ_VALID(ub2))
+ seq2 = MMP_SEQ(ub2);
+
+ return (TREE_CMP(seq1, seq2));
+}
+
+struct ubl_cbdata {
+ uberblock_t *ubl_ubbest; /* Best uberblock */
+ vdev_t *ubl_vd; /* vdev associated with the above */
+};
+
+static void
+vdev_uberblock_load_done(zio_t *zio)
+{
+ vdev_t *vd = zio->io_vd;
+ spa_t *spa = zio->io_spa;
+ zio_t *rio = zio->io_private;
+ uberblock_t *ub = abd_to_buf(zio->io_abd);
+ struct ubl_cbdata *cbp = rio->io_private;
+
+ ASSERT3U(zio->io_size, ==, VDEV_UBERBLOCK_SIZE(vd));
+
+ if (zio->io_error == 0 && uberblock_verify(ub) == 0) {
+ mutex_enter(&rio->io_lock);
+ if (ub->ub_txg <= spa->spa_load_max_txg &&
+ vdev_uberblock_compare(ub, cbp->ubl_ubbest) > 0) {
+ /*
+ * Keep track of the vdev in which this uberblock
+ * was found. We will use this information later
+ * to obtain the config nvlist associated with
+ * this uberblock.
+ */
+ *cbp->ubl_ubbest = *ub;
+ cbp->ubl_vd = vd;
+ }
+ mutex_exit(&rio->io_lock);
+ }
+
+ abd_free(zio->io_abd);
+}
+
+static void
+vdev_uberblock_load_impl(zio_t *zio, vdev_t *vd, int flags,
+ struct ubl_cbdata *cbp)
+{
+ for (int c = 0; c < vd->vdev_children; c++)
+ vdev_uberblock_load_impl(zio, vd->vdev_child[c], flags, cbp);
+
+ if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd) &&
+ vd->vdev_ops != &vdev_draid_spare_ops) {
+ for (int l = 0; l < VDEV_LABELS; l++) {
+ for (int n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) {
+ vdev_label_read(zio, vd, l,
+ abd_alloc_linear(VDEV_UBERBLOCK_SIZE(vd),
+ B_TRUE), VDEV_UBERBLOCK_OFFSET(vd, n),
+ VDEV_UBERBLOCK_SIZE(vd),
+ vdev_uberblock_load_done, zio, flags);
+ }
+ }
+ }
+}
+
+/*
+ * Reads the 'best' uberblock from disk along with its associated
+ * configuration. First, we read the uberblock array of each label of each
+ * vdev, keeping track of the uberblock with the highest txg in each array.
+ * Then, we read the configuration from the same vdev as the best uberblock.
+ */
+void
+vdev_uberblock_load(vdev_t *rvd, uberblock_t *ub, nvlist_t **config)
+{
+ zio_t *zio;
+ spa_t *spa = rvd->vdev_spa;
+ struct ubl_cbdata cb;
+ int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL |
+ ZIO_FLAG_SPECULATIVE | ZIO_FLAG_TRYHARD;
+
+ ASSERT(ub);
+ ASSERT(config);
+
+ bzero(ub, sizeof (uberblock_t));
+ *config = NULL;
+
+ cb.ubl_ubbest = ub;
+ cb.ubl_vd = NULL;
+
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+ zio = zio_root(spa, NULL, &cb, flags);
+ vdev_uberblock_load_impl(zio, rvd, flags, &cb);
+ (void) zio_wait(zio);
+
+ /*
+ * It's possible that the best uberblock was discovered on a label
+ * that has a configuration which was written in a future txg.
+ * Search all labels on this vdev to find the configuration that
+ * matches the txg for our uberblock.
+ */
+ if (cb.ubl_vd != NULL) {
+ vdev_dbgmsg(cb.ubl_vd, "best uberblock found for spa %s. "
+ "txg %llu", spa->spa_name, (u_longlong_t)ub->ub_txg);
+
+ *config = vdev_label_read_config(cb.ubl_vd, ub->ub_txg);
+ if (*config == NULL && spa->spa_extreme_rewind) {
+ vdev_dbgmsg(cb.ubl_vd, "failed to read label config. "
+ "Trying again without txg restrictions.");
+ *config = vdev_label_read_config(cb.ubl_vd, UINT64_MAX);
+ }
+ if (*config == NULL) {
+ vdev_dbgmsg(cb.ubl_vd, "failed to read label config");
+ }
+ }
+ spa_config_exit(spa, SCL_ALL, FTAG);
+}
+
+/*
+ * For use when a leaf vdev is expanded.
+ * The location of labels 2 and 3 changed, and at the new location the
+ * uberblock rings are either empty or contain garbage. The sync will write
+ * new configs there because the vdev is dirty, but expansion also needs the
+ * uberblock rings copied. Read them from label 0 which did not move.
+ *
+ * Since the point is to populate labels {2,3} with valid uberblocks,
+ * we zero uberblocks we fail to read or which are not valid.
+ */
+
+static void
+vdev_copy_uberblocks(vdev_t *vd)
+{
+ abd_t *ub_abd;
+ zio_t *write_zio;
+ int locks = (SCL_L2ARC | SCL_ZIO);
+ int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL |
+ ZIO_FLAG_SPECULATIVE;
+
+ ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_READER) ==
+ SCL_STATE);
+ ASSERT(vd->vdev_ops->vdev_op_leaf);
+
+ /*
+ * No uberblocks are stored on distributed spares, they may be
+ * safely skipped when expanding a leaf vdev.
+ */
+ if (vd->vdev_ops == &vdev_draid_spare_ops)
+ return;
+
+ spa_config_enter(vd->vdev_spa, locks, FTAG, RW_READER);
+
+ ub_abd = abd_alloc_linear(VDEV_UBERBLOCK_SIZE(vd), B_TRUE);
+
+ write_zio = zio_root(vd->vdev_spa, NULL, NULL, flags);
+ for (int n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) {
+ const int src_label = 0;
+ zio_t *zio;
+
+ zio = zio_root(vd->vdev_spa, NULL, NULL, flags);
+ vdev_label_read(zio, vd, src_label, ub_abd,
+ VDEV_UBERBLOCK_OFFSET(vd, n), VDEV_UBERBLOCK_SIZE(vd),
+ NULL, NULL, flags);
+
+ if (zio_wait(zio) || uberblock_verify(abd_to_buf(ub_abd)))
+ abd_zero(ub_abd, VDEV_UBERBLOCK_SIZE(vd));
+
+ for (int l = 2; l < VDEV_LABELS; l++)
+ vdev_label_write(write_zio, vd, l, ub_abd,
+ VDEV_UBERBLOCK_OFFSET(vd, n),
+ VDEV_UBERBLOCK_SIZE(vd), NULL, NULL,
+ flags | ZIO_FLAG_DONT_PROPAGATE);
+ }
+ (void) zio_wait(write_zio);
+
+ spa_config_exit(vd->vdev_spa, locks, FTAG);
+
+ abd_free(ub_abd);
+}
+
+/*
+ * On success, increment root zio's count of good writes.
+ * We only get credit for writes to known-visible vdevs; see spa_vdev_add().
+ */
+static void
+vdev_uberblock_sync_done(zio_t *zio)
+{
+ uint64_t *good_writes = zio->io_private;
+
+ if (zio->io_error == 0 && zio->io_vd->vdev_top->vdev_ms_array != 0)
+ atomic_inc_64(good_writes);
+}
+
+/*
+ * Write the uberblock to all labels of all leaves of the specified vdev.
+ */
+static void
+vdev_uberblock_sync(zio_t *zio, uint64_t *good_writes,
+ uberblock_t *ub, vdev_t *vd, int flags)
+{
+ for (uint64_t c = 0; c < vd->vdev_children; c++) {
+ vdev_uberblock_sync(zio, good_writes,
+ ub, vd->vdev_child[c], flags);
+ }
+
+ if (!vd->vdev_ops->vdev_op_leaf)
+ return;
+
+ if (!vdev_writeable(vd))
+ return;
+
+ /*
+ * There's no need to write uberblocks to a distributed spare, they
+ * are already stored on all the leaves of the parent dRAID. For
+ * this same reason vdev_uberblock_load_impl() skips distributed
+ * spares when reading uberblocks.
+ */
+ if (vd->vdev_ops == &vdev_draid_spare_ops)
+ return;
+
+ /* If the vdev was expanded, need to copy uberblock rings. */
+ if (vd->vdev_state == VDEV_STATE_HEALTHY &&
+ vd->vdev_copy_uberblocks == B_TRUE) {
+ vdev_copy_uberblocks(vd);
+ vd->vdev_copy_uberblocks = B_FALSE;
+ }
+
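+	/*
+	 * Pick the uberblock ring slot for this txg. When multihost is
+	 * enabled the last MMP_BLOCKS_PER_LABEL slots are reserved for
+	 * MMP writes, so regular uberblocks rotate through the rest.
+	 */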
+ int m = spa_multihost(vd->vdev_spa) ? MMP_BLOCKS_PER_LABEL : 0;
+ int n = ub->ub_txg % (VDEV_UBERBLOCK_COUNT(vd) - m);
+
+ /* Copy the uberblock_t into the ABD */
+ abd_t *ub_abd = abd_alloc_for_io(VDEV_UBERBLOCK_SIZE(vd), B_TRUE);
+ abd_zero(ub_abd, VDEV_UBERBLOCK_SIZE(vd));
+ abd_copy_from_buf(ub_abd, ub, sizeof (uberblock_t));
+
+ for (int l = 0; l < VDEV_LABELS; l++)
+ vdev_label_write(zio, vd, l, ub_abd,
+ VDEV_UBERBLOCK_OFFSET(vd, n), VDEV_UBERBLOCK_SIZE(vd),
+ vdev_uberblock_sync_done, good_writes,
+ flags | ZIO_FLAG_DONT_PROPAGATE);
+
+ abd_free(ub_abd);
+}
+
+/* Sync the uberblocks to all vdevs in svd[] */
+static int
+vdev_uberblock_sync_list(vdev_t **svd, int svdcount, uberblock_t *ub, int flags)
+{
+ spa_t *spa = svd[0]->vdev_spa;
+ zio_t *zio;
+ uint64_t good_writes = 0;
+
+ zio = zio_root(spa, NULL, NULL, flags);
+
+ for (int v = 0; v < svdcount; v++)
+ vdev_uberblock_sync(zio, &good_writes, ub, svd[v], flags);
+
+ (void) zio_wait(zio);
+
+ /*
+ * Flush the uberblocks to disk. This ensures that the odd labels
+ * are no longer needed (because the new uberblocks and the even
+ * labels are safely on disk), so it is safe to overwrite them.
+ */
+ zio = zio_root(spa, NULL, NULL, flags);
+
+ for (int v = 0; v < svdcount; v++) {
+ if (vdev_writeable(svd[v])) {
+ zio_flush(zio, svd[v]);
+ }
+ }
+
+ (void) zio_wait(zio);
+
+ return (good_writes >= 1 ? 0 : EIO);
+}
+
+/*
+ * On success, increment the count of good writes for our top-level vdev.
+ */
+static void
+vdev_label_sync_done(zio_t *zio)
+{
+ uint64_t *good_writes = zio->io_private;
+
+ if (zio->io_error == 0)
+ atomic_inc_64(good_writes);
+}
+
+/*
+ * If there weren't enough good writes, indicate failure to the parent.
+ */
+static void
+vdev_label_sync_top_done(zio_t *zio)
+{
+ uint64_t *good_writes = zio->io_private;
+
+ if (*good_writes == 0)
+ zio->io_error = SET_ERROR(EIO);
+
+ kmem_free(good_writes, sizeof (uint64_t));
+}
+
+/*
+ * We ignore errors for log and cache devices, simply free the private data.
+ */
+static void
+vdev_label_sync_ignore_done(zio_t *zio)
+{
+ kmem_free(zio->io_private, sizeof (uint64_t));
+}
+
+/*
+ * Write all even or odd labels to all leaves of the specified vdev.
+ */
+static void
+vdev_label_sync(zio_t *zio, uint64_t *good_writes,
+ vdev_t *vd, int l, uint64_t txg, int flags)
+{
+ nvlist_t *label;
+ vdev_phys_t *vp;
+ abd_t *vp_abd;
+ char *buf;
+ size_t buflen;
+
+ for (int c = 0; c < vd->vdev_children; c++) {
+ vdev_label_sync(zio, good_writes,
+ vd->vdev_child[c], l, txg, flags);
+ }
+
+ if (!vd->vdev_ops->vdev_op_leaf)
+ return;
+
+ if (!vdev_writeable(vd))
+ return;
+
+ /*
+ * The top-level config never needs to be written to a distributed
+	 * spare. When read, vdev_dspare_label_read_config() will generate
+	 * the config on behalf of vdev_label_read_config().
+ */
+ if (vd->vdev_ops == &vdev_draid_spare_ops)
+ return;
+
+ /*
+ * Generate a label describing the top-level config to which we belong.
+ */
+ label = spa_config_generate(vd->vdev_spa, vd, txg, B_FALSE);
+
+ vp_abd = abd_alloc_linear(sizeof (vdev_phys_t), B_TRUE);
+ abd_zero(vp_abd, sizeof (vdev_phys_t));
+ vp = abd_to_buf(vp_abd);
+
+ buf = vp->vp_nvlist;
+ buflen = sizeof (vp->vp_nvlist);
+
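+	/*
+	 * l is 0 or 1 on entry; stepping by two below writes either the
+	 * even labels (L0, L2) or the odd labels (L1, L3).
+	 */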
+ if (!nvlist_pack(label, &buf, &buflen, NV_ENCODE_XDR, KM_SLEEP)) {
+ for (; l < VDEV_LABELS; l += 2) {
+ vdev_label_write(zio, vd, l, vp_abd,
+ offsetof(vdev_label_t, vl_vdev_phys),
+ sizeof (vdev_phys_t),
+ vdev_label_sync_done, good_writes,
+ flags | ZIO_FLAG_DONT_PROPAGATE);
+ }
+ }
+
+ abd_free(vp_abd);
+ nvlist_free(label);
+}
+
+static int
+vdev_label_sync_list(spa_t *spa, int l, uint64_t txg, int flags)
+{
+ list_t *dl = &spa->spa_config_dirty_list;
+ vdev_t *vd;
+ zio_t *zio;
+ int error;
+
+ /*
+ * Write the new labels to disk.
+ */
+ zio = zio_root(spa, NULL, NULL, flags);
+
+ for (vd = list_head(dl); vd != NULL; vd = list_next(dl, vd)) {
+ uint64_t *good_writes;
+
+ ASSERT(!vd->vdev_ishole);
+
+ good_writes = kmem_zalloc(sizeof (uint64_t), KM_SLEEP);
+ zio_t *vio = zio_null(zio, spa, NULL,
+ (vd->vdev_islog || vd->vdev_aux != NULL) ?
+ vdev_label_sync_ignore_done : vdev_label_sync_top_done,
+ good_writes, flags);
+ vdev_label_sync(vio, good_writes, vd, l, txg, flags);
+ zio_nowait(vio);
+ }
+
+ error = zio_wait(zio);
+
+ /*
+ * Flush the new labels to disk.
+ */
+ zio = zio_root(spa, NULL, NULL, flags);
+
+ for (vd = list_head(dl); vd != NULL; vd = list_next(dl, vd))
+ zio_flush(zio, vd);
+
+ (void) zio_wait(zio);
+
+ return (error);
+}
+
+/*
+ * Sync the uberblock and any changes to the vdev configuration.
+ *
+ * The order of operations is carefully crafted to ensure that
+ * if the system panics or loses power at any time, the state on disk
+ * is still transactionally consistent. The in-line comments below
+ * describe the failure semantics at each stage.
+ *
+ * Moreover, vdev_config_sync() is designed to be idempotent: if it fails
+ * at any time, you can just call it again, and it will resume its work.
+ */
+int
+vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg)
+{
+ spa_t *spa = svd[0]->vdev_spa;
+ uberblock_t *ub = &spa->spa_uberblock;
+ int error = 0;
+ int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL;
+
+ ASSERT(svdcount != 0);
+retry:
+ /*
+ * Normally, we don't want to try too hard to write every label and
+ * uberblock. If there is a flaky disk, we don't want the rest of the
+ * sync process to block while we retry. But if we can't write a
+ * single label out, we should retry with ZIO_FLAG_TRYHARD before
+ * bailing out and declaring the pool faulted.
+ */
+ if (error != 0) {
+ if ((flags & ZIO_FLAG_TRYHARD) != 0)
+ return (error);
+ flags |= ZIO_FLAG_TRYHARD;
+ }
+
+ ASSERT(ub->ub_txg <= txg);
+
+ /*
+ * If this isn't a resync due to I/O errors,
+ * and nothing changed in this transaction group,
+ * and the vdev configuration hasn't changed,
+ * then there's nothing to do.
+ */
+ if (ub->ub_txg < txg) {
+ boolean_t changed = uberblock_update(ub, spa->spa_root_vdev,
+ txg, spa->spa_mmp.mmp_delay);
+
+ if (!changed && list_is_empty(&spa->spa_config_dirty_list))
+ return (0);
+ }
+
+ if (txg > spa_freeze_txg(spa))
+ return (0);
+
+ ASSERT(txg <= spa->spa_final_txg);
+
+ /*
+ * Flush the write cache of every disk that's been written to
+ * in this transaction group. This ensures that all blocks
+ * written in this txg will be committed to stable storage
+ * before any uberblock that references them.
+ */
+ zio_t *zio = zio_root(spa, NULL, NULL, flags);
+
+ for (vdev_t *vd =
+ txg_list_head(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)); vd != NULL;
+ vd = txg_list_next(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg)))
+ zio_flush(zio, vd);
+
+ (void) zio_wait(zio);
+
+ /*
+ * Sync out the even labels (L0, L2) for every dirty vdev. If the
+ * system dies in the middle of this process, that's OK: all of the
+ * even labels that made it to disk will be newer than any uberblock,
+ * and will therefore be considered invalid. The odd labels (L1, L3),
+ * which have not yet been touched, will still be valid. We flush
+ * the new labels to disk to ensure that all even-label updates
+ * are committed to stable storage before the uberblock update.
+ */
+ if ((error = vdev_label_sync_list(spa, 0, txg, flags)) != 0) {
+ if ((flags & ZIO_FLAG_TRYHARD) != 0) {
+ zfs_dbgmsg("vdev_label_sync_list() returned error %d "
+ "for pool '%s' when syncing out the even labels "
+ "of dirty vdevs", error, spa_name(spa));
+ }
+ goto retry;
+ }
+
+ /*
+ * Sync the uberblocks to all vdevs in svd[].
+ * If the system dies in the middle of this step, there are two cases
+ * to consider, and the on-disk state is consistent either way:
+ *
+ * (1) If none of the new uberblocks made it to disk, then the
+ * previous uberblock will be the newest, and the odd labels
+ * (which had not yet been touched) will be valid with respect
+ * to that uberblock.
+ *
+ * (2) If one or more new uberblocks made it to disk, then they
+ * will be the newest, and the even labels (which had all
+ * been successfully committed) will be valid with respect
+ * to the new uberblocks.
+ */
+ if ((error = vdev_uberblock_sync_list(svd, svdcount, ub, flags)) != 0) {
+ if ((flags & ZIO_FLAG_TRYHARD) != 0) {
+ zfs_dbgmsg("vdev_uberblock_sync_list() returned error "
+ "%d for pool '%s'", error, spa_name(spa));
+ }
+ goto retry;
+ }
+
+ if (spa_multihost(spa))
+ mmp_update_uberblock(spa, ub);
+
+ /*
+ * Sync out odd labels for every dirty vdev. If the system dies
+ * in the middle of this process, the even labels and the new
+ * uberblocks will suffice to open the pool. The next time
+ * the pool is opened, the first thing we'll do -- before any
+ * user data is modified -- is mark every vdev dirty so that
+ * all labels will be brought up to date. We flush the new labels
+ * to disk to ensure that all odd-label updates are committed to
+ * stable storage before the next transaction group begins.
+ */
+ if ((error = vdev_label_sync_list(spa, 1, txg, flags)) != 0) {
+ if ((flags & ZIO_FLAG_TRYHARD) != 0) {
+ zfs_dbgmsg("vdev_label_sync_list() returned error %d "
+ "for pool '%s' when syncing out the odd labels of "
+ "dirty vdevs", error, spa_name(spa));
+ }
+ goto retry;
+ }
+
+ return (0);
+}
diff --git a/sys/contrib/openzfs/module/zfs/vdev_mirror.c b/sys/contrib/openzfs/module/zfs/vdev_mirror.c
new file mode 100644
index 000000000000..71ca43caec1a
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/vdev_mirror.c
@@ -0,0 +1,972 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_scan.h>
+#include <sys/vdev_impl.h>
+#include <sys/vdev_draid.h>
+#include <sys/zio.h>
+#include <sys/abd.h>
+#include <sys/fs/zfs.h>
+
+/*
+ * Vdev mirror kstats
+ */
+static kstat_t *mirror_ksp = NULL;
+
+typedef struct mirror_stats {
+ kstat_named_t vdev_mirror_stat_rotating_linear;
+ kstat_named_t vdev_mirror_stat_rotating_offset;
+ kstat_named_t vdev_mirror_stat_rotating_seek;
+ kstat_named_t vdev_mirror_stat_non_rotating_linear;
+ kstat_named_t vdev_mirror_stat_non_rotating_seek;
+
+ kstat_named_t vdev_mirror_stat_preferred_found;
+ kstat_named_t vdev_mirror_stat_preferred_not_found;
+} mirror_stats_t;
+
+static mirror_stats_t mirror_stats = {
+ /* New I/O follows directly the last I/O */
+ { "rotating_linear", KSTAT_DATA_UINT64 },
+ /* New I/O is within zfs_vdev_mirror_rotating_seek_offset of the last */
+ { "rotating_offset", KSTAT_DATA_UINT64 },
+ /* New I/O requires random seek */
+ { "rotating_seek", KSTAT_DATA_UINT64 },
+ /* New I/O follows directly the last I/O (nonrot) */
+ { "non_rotating_linear", KSTAT_DATA_UINT64 },
+ /* New I/O requires random seek (nonrot) */
+ { "non_rotating_seek", KSTAT_DATA_UINT64 },
+ /* Preferred child vdev found */
+ { "preferred_found", KSTAT_DATA_UINT64 },
+ /* Preferred child vdev not found or equal load */
+ { "preferred_not_found", KSTAT_DATA_UINT64 },
+};
+
+#define MIRROR_STAT(stat) (mirror_stats.stat.value.ui64)
+#define MIRROR_INCR(stat, val) atomic_add_64(&MIRROR_STAT(stat), val)
+#define MIRROR_BUMP(stat) MIRROR_INCR(stat, 1)
+
+void
+vdev_mirror_stat_init(void)
+{
+ mirror_ksp = kstat_create("zfs", 0, "vdev_mirror_stats",
+ "misc", KSTAT_TYPE_NAMED,
+ sizeof (mirror_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
+ if (mirror_ksp != NULL) {
+ mirror_ksp->ks_data = &mirror_stats;
+ kstat_install(mirror_ksp);
+ }
+}
+
+void
+vdev_mirror_stat_fini(void)
+{
+ if (mirror_ksp != NULL) {
+ kstat_delete(mirror_ksp);
+ mirror_ksp = NULL;
+ }
+}
+
+/*
+ * Virtual device vector for mirroring.
+ */
+typedef struct mirror_child {
+ vdev_t *mc_vd;
+ uint64_t mc_offset;
+ int mc_error;
+ int mc_load;
+ uint8_t mc_tried;
+ uint8_t mc_skipped;
+ uint8_t mc_speculative;
+ uint8_t mc_rebuilding;
+} mirror_child_t;
+
+typedef struct mirror_map {
+ int *mm_preferred;
+ int mm_preferred_cnt;
+ int mm_children;
+ boolean_t mm_resilvering;
+ boolean_t mm_rebuilding;
+ boolean_t mm_root;
+ mirror_child_t mm_child[];
+} mirror_map_t;
+
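+/*
+ * vdev_mirror_preferred_child_randomize() uses 1 << vdev_mirror_shift (2MB)
+ * slices of the I/O offset to spread reads across equally loaded children.
+ */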
+static int vdev_mirror_shift = 21;
+
+/*
+ * The load configuration settings below are tuned by default for
+ * the case where all devices are of the same rotational type.
+ *
+ * If there is a mixture of rotating and non-rotating media, setting
+ * zfs_vdev_mirror_non_rotating_seek_inc to 0 may well provide better results
+ * as it will direct more reads to the non-rotating vdevs, which typically
+ * offer higher performance.
+ */
+
+/* Rotating media load calculation configuration. */
+static int zfs_vdev_mirror_rotating_inc = 0;
+static int zfs_vdev_mirror_rotating_seek_inc = 5;
+static int zfs_vdev_mirror_rotating_seek_offset = 1 * 1024 * 1024;
+
+/* Non-rotating media load calculation configuration. */
+static int zfs_vdev_mirror_non_rotating_inc = 0;
+static int zfs_vdev_mirror_non_rotating_seek_inc = 1;
+
+static inline size_t
+vdev_mirror_map_size(int children)
+{
+ return (offsetof(mirror_map_t, mm_child[children]) +
+ sizeof (int) * children);
+}
+
+static inline mirror_map_t *
+vdev_mirror_map_alloc(int children, boolean_t resilvering, boolean_t root)
+{
+ mirror_map_t *mm;
+
+ mm = kmem_zalloc(vdev_mirror_map_size(children), KM_SLEEP);
+ mm->mm_children = children;
+ mm->mm_resilvering = resilvering;
+ mm->mm_root = root;
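+	/*
+	 * The mm_preferred index array lives in the same allocation,
+	 * immediately after the flexible mm_child[] array (see
+	 * vdev_mirror_map_size()).
+	 */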
+ mm->mm_preferred = (int *)((uintptr_t)mm +
+ offsetof(mirror_map_t, mm_child[children]));
+
+ return (mm);
+}
+
+static void
+vdev_mirror_map_free(zio_t *zio)
+{
+ mirror_map_t *mm = zio->io_vsd;
+
+ kmem_free(mm, vdev_mirror_map_size(mm->mm_children));
+}
+
+static const zio_vsd_ops_t vdev_mirror_vsd_ops = {
+ .vsd_free = vdev_mirror_map_free,
+ .vsd_cksum_report = zio_vsd_default_cksum_report
+};
+
+static int
+vdev_mirror_load(mirror_map_t *mm, vdev_t *vd, uint64_t zio_offset)
+{
+ uint64_t last_offset;
+ int64_t offset_diff;
+ int load;
+
+ /* All DVAs have equal weight at the root. */
+ if (mm->mm_root)
+ return (INT_MAX);
+
+ /*
+	 * We don't return INT_MAX if the device is resilvering (i.e.
+	 * vdev_resilver_txg != 0); in testing, overall performance was
+	 * slightly worse when we did so than when we did not.
+ */
+
+ /* Fix zio_offset for leaf vdevs */
+ if (vd->vdev_ops->vdev_op_leaf)
+ zio_offset += VDEV_LABEL_START_SIZE;
+
+ /* Standard load based on pending queue length. */
+ load = vdev_queue_length(vd);
+ last_offset = vdev_queue_last_offset(vd);
+
+ if (vd->vdev_nonrot) {
+ /* Non-rotating media. */
+ if (last_offset == zio_offset) {
+ MIRROR_BUMP(vdev_mirror_stat_non_rotating_linear);
+ return (load + zfs_vdev_mirror_non_rotating_inc);
+ }
+
+ /*
+ * Apply a seek penalty even for non-rotating devices as
+ * sequential I/O's can be aggregated into fewer operations on
+ * the device, thus avoiding unnecessary per-command overhead
+ * and boosting performance.
+ */
+ MIRROR_BUMP(vdev_mirror_stat_non_rotating_seek);
+ return (load + zfs_vdev_mirror_non_rotating_seek_inc);
+ }
+
+ /* Rotating media I/O's which directly follow the last I/O. */
+ if (last_offset == zio_offset) {
+ MIRROR_BUMP(vdev_mirror_stat_rotating_linear);
+ return (load + zfs_vdev_mirror_rotating_inc);
+ }
+
+ /*
+ * Apply half the seek increment to I/O's within seek offset
+ * of the last I/O issued to this vdev as they should incur less
+ * of a seek increment.
+ */
+ offset_diff = (int64_t)(last_offset - zio_offset);
+ if (ABS(offset_diff) < zfs_vdev_mirror_rotating_seek_offset) {
+ MIRROR_BUMP(vdev_mirror_stat_rotating_offset);
+ return (load + (zfs_vdev_mirror_rotating_seek_inc / 2));
+ }
+
+ /* Apply the full seek increment to all other I/O's. */
+ MIRROR_BUMP(vdev_mirror_stat_rotating_seek);
+ return (load + zfs_vdev_mirror_rotating_seek_inc);
+}
+
+static boolean_t
+vdev_mirror_rebuilding(vdev_t *vd)
+{
+ if (vd->vdev_ops->vdev_op_leaf && vd->vdev_rebuild_txg)
+ return (B_TRUE);
+
+ for (int i = 0; i < vd->vdev_children; i++) {
+ if (vdev_mirror_rebuilding(vd->vdev_child[i])) {
+ return (B_TRUE);
+ }
+ }
+
+ return (B_FALSE);
+}
+
+/*
+ * Avoid inlining the function to keep vdev_mirror_io_start(), which
+ * is this function's only caller, as small as possible on the stack.
+ */
+noinline static mirror_map_t *
+vdev_mirror_map_init(zio_t *zio)
+{
+ mirror_map_t *mm = NULL;
+ mirror_child_t *mc;
+ vdev_t *vd = zio->io_vd;
+ int c;
+
+ if (vd == NULL) {
+ dva_t *dva = zio->io_bp->blk_dva;
+ spa_t *spa = zio->io_spa;
+ dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;
+ dva_t dva_copy[SPA_DVAS_PER_BP];
+
+ /*
+ * The sequential scrub code sorts and issues all DVAs
+ * of a bp separately. Each of these IOs includes all
+ * original DVA copies so that repairs can be performed
+ * in the event of an error, but we only actually want
+ * to check the first DVA since the others will be
+ * checked by their respective sorted IOs. Only if we
+ * hit an error will we try all DVAs upon retrying.
+ *
+ * Note: This check is safe even if the user switches
+ * from a legacy scrub to a sequential one in the middle
+ * of processing, since scn_is_sorted isn't updated until
+ * all outstanding IOs from the previous scrub pass
+ * complete.
+ */
+ if ((zio->io_flags & ZIO_FLAG_SCRUB) &&
+ !(zio->io_flags & ZIO_FLAG_IO_RETRY) &&
+ dsl_scan_scrubbing(spa->spa_dsl_pool) &&
+ scn->scn_is_sorted) {
+ c = 1;
+ } else {
+ c = BP_GET_NDVAS(zio->io_bp);
+ }
+
+ /*
+ * If the pool cannot be written to, then infer that some
+ * DVAs might be invalid or point to vdevs that do not exist.
+ * We skip them.
+ */
+ if (!spa_writeable(spa)) {
+ ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
+ int j = 0;
+ for (int i = 0; i < c; i++) {
+ if (zfs_dva_valid(spa, &dva[i], zio->io_bp))
+ dva_copy[j++] = dva[i];
+ }
+ if (j == 0) {
+ zio->io_vsd = NULL;
+ zio->io_error = ENXIO;
+ return (NULL);
+ }
+ if (j < c) {
+ dva = dva_copy;
+ c = j;
+ }
+ }
+
+ mm = vdev_mirror_map_alloc(c, B_FALSE, B_TRUE);
+ for (c = 0; c < mm->mm_children; c++) {
+ mc = &mm->mm_child[c];
+
+ mc->mc_vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[c]));
+ mc->mc_offset = DVA_GET_OFFSET(&dva[c]);
+ if (mc->mc_vd == NULL) {
+ kmem_free(mm, vdev_mirror_map_size(
+ mm->mm_children));
+ zio->io_vsd = NULL;
+ zio->io_error = ENXIO;
+ return (NULL);
+ }
+ }
+ } else {
+ /*
+ * If we are resilvering, then we should handle scrub reads
+ * differently; we shouldn't issue them to the resilvering
+ * device because it might not have those blocks.
+ *
+ * We are resilvering iff:
+		 * 1) We are a replacing vdev (i.e. our name is "replacing-1" or
+ * "spare-1" or something like that), and
+ * 2) The pool is currently being resilvered.
+ *
+ * We cannot simply check vd->vdev_resilver_txg, because it's
+ * not set in this path.
+ *
+ * Nor can we just check our vdev_ops; there are cases (such as
+ * when a user types "zpool replace pool odev spare_dev" and
+ * spare_dev is in the spare list, or when a spare device is
+ * automatically used to replace a DEGRADED device) when
+ * resilvering is complete but both the original vdev and the
+ * spare vdev remain in the pool. That behavior is intentional.
+ * It helps implement the policy that a spare should be
+ * automatically removed from the pool after the user replaces
+ * the device that originally failed.
+ *
+ * If a spa load is in progress, then spa_dsl_pool may be
+ * uninitialized. But we shouldn't be resilvering during a spa
+ * load anyway.
+ */
+ boolean_t replacing = (vd->vdev_ops == &vdev_replacing_ops ||
+ vd->vdev_ops == &vdev_spare_ops) &&
+ spa_load_state(vd->vdev_spa) == SPA_LOAD_NONE &&
+ dsl_scan_resilvering(vd->vdev_spa->spa_dsl_pool);
+ mm = vdev_mirror_map_alloc(vd->vdev_children, replacing,
+ B_FALSE);
+ for (c = 0; c < mm->mm_children; c++) {
+ mc = &mm->mm_child[c];
+ mc->mc_vd = vd->vdev_child[c];
+ mc->mc_offset = zio->io_offset;
+
+ if (vdev_mirror_rebuilding(mc->mc_vd))
+ mm->mm_rebuilding = mc->mc_rebuilding = B_TRUE;
+ }
+ }
+
+ zio->io_vsd = mm;
+ zio->io_vsd_ops = &vdev_mirror_vsd_ops;
+ return (mm);
+}
+
+static int
+vdev_mirror_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
+ uint64_t *logical_ashift, uint64_t *physical_ashift)
+{
+ int numerrors = 0;
+ int lasterror = 0;
+
+ if (vd->vdev_children == 0) {
+ vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
+ return (SET_ERROR(EINVAL));
+ }
+
+ vdev_open_children(vd);
+
+ for (int c = 0; c < vd->vdev_children; c++) {
+ vdev_t *cvd = vd->vdev_child[c];
+
+ if (cvd->vdev_open_error) {
+ lasterror = cvd->vdev_open_error;
+ numerrors++;
+ continue;
+ }
+
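+		/*
+		 * The -1/+1 arithmetic lets an initial value of 0 act as
+		 * "unset": 0 - 1 wraps to UINT64_MAX, so the first opened
+		 * child simply establishes the size.
+		 */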
+ *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
+ *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
+ *logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift);
+ *physical_ashift = MAX(*physical_ashift,
+ cvd->vdev_physical_ashift);
+ }
+
+ if (numerrors == vd->vdev_children) {
+ if (vdev_children_are_offline(vd))
+ vd->vdev_stat.vs_aux = VDEV_AUX_CHILDREN_OFFLINE;
+ else
+ vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
+ return (lasterror);
+ }
+
+ return (0);
+}
+
+static void
+vdev_mirror_close(vdev_t *vd)
+{
+ for (int c = 0; c < vd->vdev_children; c++)
+ vdev_close(vd->vdev_child[c]);
+}
+
+static void
+vdev_mirror_child_done(zio_t *zio)
+{
+ mirror_child_t *mc = zio->io_private;
+
+ mc->mc_error = zio->io_error;
+ mc->mc_tried = 1;
+ mc->mc_skipped = 0;
+}
+
+static void
+vdev_mirror_scrub_done(zio_t *zio)
+{
+ mirror_child_t *mc = zio->io_private;
+
+ if (zio->io_error == 0) {
+ zio_t *pio;
+ zio_link_t *zl = NULL;
+
+ mutex_enter(&zio->io_lock);
+ while ((pio = zio_walk_parents(zio, &zl)) != NULL) {
+ mutex_enter(&pio->io_lock);
+ ASSERT3U(zio->io_size, >=, pio->io_size);
+ abd_copy(pio->io_abd, zio->io_abd, pio->io_size);
+ mutex_exit(&pio->io_lock);
+ }
+ mutex_exit(&zio->io_lock);
+ }
+
+ abd_free(zio->io_abd);
+
+ mc->mc_error = zio->io_error;
+ mc->mc_tried = 1;
+ mc->mc_skipped = 0;
+}
+
+/*
+ * Check the other, lower-index DVAs to see if they're on the same
+ * vdev as the child we picked. If they are, use them since they
+ * are likely to have been allocated from the primary metaslab in
+ * use at the time, and hence are more likely to have locality with
+ * single-copy data.
+ */
+static int
+vdev_mirror_dva_select(zio_t *zio, int p)
+{
+ dva_t *dva = zio->io_bp->blk_dva;
+ mirror_map_t *mm = zio->io_vsd;
+ int preferred;
+ int c;
+
+ preferred = mm->mm_preferred[p];
+ for (p--; p >= 0; p--) {
+ c = mm->mm_preferred[p];
+ if (DVA_GET_VDEV(&dva[c]) == DVA_GET_VDEV(&dva[preferred]))
+ preferred = c;
+ }
+ return (preferred);
+}
+
+static int
+vdev_mirror_preferred_child_randomize(zio_t *zio)
+{
+ mirror_map_t *mm = zio->io_vsd;
+ int p;
+
+ if (mm->mm_root) {
+ p = spa_get_random(mm->mm_preferred_cnt);
+ return (vdev_mirror_dva_select(zio, p));
+ }
+
+ /*
+ * To ensure we don't always favour the first matching vdev,
+	 * which could lead to wear leveling issues on SSDs, we
+	 * use the I/O offset as a pseudo-random seed into the vdevs
+ * which have the lowest load.
+ */
+ p = (zio->io_offset >> vdev_mirror_shift) % mm->mm_preferred_cnt;
+ return (mm->mm_preferred[p]);
+}
+
+static boolean_t
+vdev_mirror_child_readable(mirror_child_t *mc)
+{
+ vdev_t *vd = mc->mc_vd;
+
+ if (vd->vdev_top != NULL && vd->vdev_top->vdev_ops == &vdev_draid_ops)
+ return (vdev_draid_readable(vd, mc->mc_offset));
+ else
+ return (vdev_readable(vd));
+}
+
+static boolean_t
+vdev_mirror_child_missing(mirror_child_t *mc, uint64_t txg, uint64_t size)
+{
+ vdev_t *vd = mc->mc_vd;
+
+ if (vd->vdev_top != NULL && vd->vdev_top->vdev_ops == &vdev_draid_ops)
+ return (vdev_draid_missing(vd, mc->mc_offset, txg, size));
+ else
+ return (vdev_dtl_contains(vd, DTL_MISSING, txg, size));
+}
+
+/*
+ * Try to find a vdev whose DTL doesn't contain the block we want to read,
+ * preferring vdevs based on the determined load. If we can't, try the read on
+ * any vdev we haven't already tried.
+ *
+ * Distributed spares are an exception to the above load rule. They are
+ * always preferred in order to detect gaps in the distributed spare which
+ * are created when another disk in the dRAID fails. In order to restore
+ * redundancy those gaps must be read to trigger the required repair IO.
+ */
+static int
+vdev_mirror_child_select(zio_t *zio)
+{
+ mirror_map_t *mm = zio->io_vsd;
+ uint64_t txg = zio->io_txg;
+ int c, lowest_load;
+
+ ASSERT(zio->io_bp == NULL || BP_PHYSICAL_BIRTH(zio->io_bp) == txg);
+
+ lowest_load = INT_MAX;
+ mm->mm_preferred_cnt = 0;
+ for (c = 0; c < mm->mm_children; c++) {
+ mirror_child_t *mc;
+
+ mc = &mm->mm_child[c];
+ if (mc->mc_tried || mc->mc_skipped)
+ continue;
+
+ if (mc->mc_vd == NULL ||
+ !vdev_mirror_child_readable(mc)) {
+ mc->mc_error = SET_ERROR(ENXIO);
+ mc->mc_tried = 1; /* don't even try */
+ mc->mc_skipped = 1;
+ continue;
+ }
+
+ if (vdev_mirror_child_missing(mc, txg, 1)) {
+ mc->mc_error = SET_ERROR(ESTALE);
+ mc->mc_skipped = 1;
+ mc->mc_speculative = 1;
+ continue;
+ }
+
+ if (mc->mc_vd->vdev_ops == &vdev_draid_spare_ops) {
+ mm->mm_preferred[0] = c;
+ mm->mm_preferred_cnt = 1;
+ break;
+ }
+
+ mc->mc_load = vdev_mirror_load(mm, mc->mc_vd, mc->mc_offset);
+ if (mc->mc_load > lowest_load)
+ continue;
+
+ if (mc->mc_load < lowest_load) {
+ lowest_load = mc->mc_load;
+ mm->mm_preferred_cnt = 0;
+ }
+ mm->mm_preferred[mm->mm_preferred_cnt] = c;
+ mm->mm_preferred_cnt++;
+ }
+
+ if (mm->mm_preferred_cnt == 1) {
+ MIRROR_BUMP(vdev_mirror_stat_preferred_found);
+ return (mm->mm_preferred[0]);
+ }
+
+ if (mm->mm_preferred_cnt > 1) {
+ MIRROR_BUMP(vdev_mirror_stat_preferred_not_found);
+ return (vdev_mirror_preferred_child_randomize(zio));
+ }
+
+ /*
+ * Every device is either missing or has this txg in its DTL.
+ * Look for any child we haven't already tried before giving up.
+ */
+ for (c = 0; c < mm->mm_children; c++) {
+ if (!mm->mm_child[c].mc_tried)
+ return (c);
+ }
+
+ /*
+ * Every child failed. There's no place left to look.
+ */
+ return (-1);
+}
+
+static void
+vdev_mirror_io_start(zio_t *zio)
+{
+ mirror_map_t *mm;
+ mirror_child_t *mc;
+ int c, children;
+
+ mm = vdev_mirror_map_init(zio);
+
+ if (mm == NULL) {
+ ASSERT(!spa_trust_config(zio->io_spa));
+ ASSERT(zio->io_type == ZIO_TYPE_READ);
+ zio_execute(zio);
+ return;
+ }
+
+ if (zio->io_type == ZIO_TYPE_READ) {
+ if (zio->io_bp != NULL &&
+ (zio->io_flags & ZIO_FLAG_SCRUB) && !mm->mm_resilvering) {
+ /*
+ * For scrubbing reads (if we can verify the
+ * checksum here, as indicated by io_bp being
+ * non-NULL) we need to allocate a read buffer for
+ * each child and issue reads to all children. If
+ * any child succeeds, it will copy its data into
+ * zio->io_data in vdev_mirror_scrub_done.
+ */
+ for (c = 0; c < mm->mm_children; c++) {
+ mc = &mm->mm_child[c];
+ zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
+ mc->mc_vd, mc->mc_offset,
+ abd_alloc_sametype(zio->io_abd,
+ zio->io_size), zio->io_size,
+ zio->io_type, zio->io_priority, 0,
+ vdev_mirror_scrub_done, mc));
+ }
+ zio_execute(zio);
+ return;
+ }
+ /*
+ * For normal reads just pick one child.
+ */
+ c = vdev_mirror_child_select(zio);
+ children = (c >= 0);
+ } else {
+ ASSERT(zio->io_type == ZIO_TYPE_WRITE);
+
+ /*
+ * Writes go to all children.
+ */
+ c = 0;
+ children = mm->mm_children;
+ }
+
+ while (children--) {
+ mc = &mm->mm_child[c];
+ c++;
+
+ /*
+ * When sequentially resilvering only issue write repair
+ * IOs to the vdev which is being rebuilt since performance
+ * is limited by the slowest child. This is an issue for
+ * faster replacement devices such as distributed spares.
+ */
+ if ((zio->io_priority == ZIO_PRIORITY_REBUILD) &&
+ (zio->io_flags & ZIO_FLAG_IO_REPAIR) &&
+ !(zio->io_flags & ZIO_FLAG_SCRUB) &&
+ mm->mm_rebuilding && !mc->mc_rebuilding) {
+ continue;
+ }
+
+ zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
+ mc->mc_vd, mc->mc_offset, zio->io_abd, zio->io_size,
+ zio->io_type, zio->io_priority, 0,
+ vdev_mirror_child_done, mc));
+ }
+
+ zio_execute(zio);
+}
+
+static int
+vdev_mirror_worst_error(mirror_map_t *mm)
+{
+ int error[2] = { 0, 0 };
+
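+	/*
+	 * error[0] collects errors from non-speculative children and
+	 * error[1] from speculative ones; a real error takes precedence.
+	 */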
+ for (int c = 0; c < mm->mm_children; c++) {
+ mirror_child_t *mc = &mm->mm_child[c];
+ int s = mc->mc_speculative;
+ error[s] = zio_worst_error(error[s], mc->mc_error);
+ }
+
+ return (error[0] ? error[0] : error[1]);
+}
+
+static void
+vdev_mirror_io_done(zio_t *zio)
+{
+ mirror_map_t *mm = zio->io_vsd;
+ mirror_child_t *mc;
+ int c;
+ int good_copies = 0;
+ int unexpected_errors = 0;
+
+ if (mm == NULL)
+ return;
+
+ for (c = 0; c < mm->mm_children; c++) {
+ mc = &mm->mm_child[c];
+
+ if (mc->mc_error) {
+ if (!mc->mc_skipped)
+ unexpected_errors++;
+ } else if (mc->mc_tried) {
+ good_copies++;
+ }
+ }
+
+ if (zio->io_type == ZIO_TYPE_WRITE) {
+ /*
+ * XXX -- for now, treat partial writes as success.
+ *
+ * Now that we support write reallocation, it would be better
+ * to treat partial failure as real failure unless there are
+ * no non-degraded top-level vdevs left, and not update DTLs
+ * if we intend to reallocate.
+ */
+ /* XXPOLICY */
+ if (good_copies != mm->mm_children) {
+ /*
+ * Always require at least one good copy.
+ *
+ * For ditto blocks (io_vd == NULL), require
+ * all copies to be good.
+ *
+ * XXX -- for replacing vdevs, there's no great answer.
+ * If the old device is really dead, we may not even
+ * be able to access it -- so we only want to
+ * require good writes to the new device. But if
+ * the new device turns out to be flaky, we want
+ * to be able to detach it -- which requires all
+ * writes to the old device to have succeeded.
+ */
+ if (good_copies == 0 || zio->io_vd == NULL)
+ zio->io_error = vdev_mirror_worst_error(mm);
+ }
+ return;
+ }
+
+ ASSERT(zio->io_type == ZIO_TYPE_READ);
+
+ /*
+ * If we don't have a good copy yet, keep trying other children.
+ */
+ /* XXPOLICY */
+ if (good_copies == 0 && (c = vdev_mirror_child_select(zio)) != -1) {
+ ASSERT(c >= 0 && c < mm->mm_children);
+ mc = &mm->mm_child[c];
+ zio_vdev_io_redone(zio);
+ zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
+ mc->mc_vd, mc->mc_offset, zio->io_abd, zio->io_size,
+ ZIO_TYPE_READ, zio->io_priority, 0,
+ vdev_mirror_child_done, mc));
+ return;
+ }
+
+ /* XXPOLICY */
+ if (good_copies == 0) {
+ zio->io_error = vdev_mirror_worst_error(mm);
+ ASSERT(zio->io_error != 0);
+ }
+
+ if (good_copies && spa_writeable(zio->io_spa) &&
+ (unexpected_errors ||
+ (zio->io_flags & ZIO_FLAG_RESILVER) ||
+ ((zio->io_flags & ZIO_FLAG_SCRUB) && mm->mm_resilvering))) {
+ /*
+ * Use the good data we have in hand to repair damaged children.
+ */
+ for (c = 0; c < mm->mm_children; c++) {
+ /*
+ * Don't rewrite known good children.
+ * Not only is it unnecessary, it could
+ * actually be harmful: if the system lost
+ * power while rewriting the only good copy,
+ * there would be no good copies left!
+ */
+ mc = &mm->mm_child[c];
+
+ if (mc->mc_error == 0) {
+ vdev_ops_t *ops = mc->mc_vd->vdev_ops;
+
+ if (mc->mc_tried)
+ continue;
+ /*
+ * We didn't try this child. We need to
+ * repair it if:
+ * 1. it's a scrub (in which case we have
+ * tried everything that was healthy)
+ * - or -
+ * 2. it's an indirect or distributed spare
+ * vdev (in which case it could point to any
+ * other vdev, which might have a bad DTL)
+ * - or -
+ * 3. the DTL indicates that this data is
+ * missing from this vdev
+ */
+ if (!(zio->io_flags & ZIO_FLAG_SCRUB) &&
+ ops != &vdev_indirect_ops &&
+ ops != &vdev_draid_spare_ops &&
+ !vdev_dtl_contains(mc->mc_vd, DTL_PARTIAL,
+ zio->io_txg, 1))
+ continue;
+ mc->mc_error = SET_ERROR(ESTALE);
+ }
+
+ zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
+ mc->mc_vd, mc->mc_offset,
+ zio->io_abd, zio->io_size, ZIO_TYPE_WRITE,
+ zio->io_priority == ZIO_PRIORITY_REBUILD ?
+ ZIO_PRIORITY_REBUILD : ZIO_PRIORITY_ASYNC_WRITE,
+ ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
+ ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
+ }
+ }
+}
+
+static void
+vdev_mirror_state_change(vdev_t *vd, int faulted, int degraded)
+{
+ if (faulted == vd->vdev_children) {
+ if (vdev_children_are_offline(vd)) {
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_OFFLINE,
+ VDEV_AUX_CHILDREN_OFFLINE);
+ } else {
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_NO_REPLICAS);
+ }
+ } else if (degraded + faulted != 0) {
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
+ } else {
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
+ }
+}
+
+/*
+ * Return the maximum asize for a rebuild zio in the provided range.
+ */
+static uint64_t
+vdev_mirror_rebuild_asize(vdev_t *vd, uint64_t start, uint64_t asize,
+ uint64_t max_segment)
+{
+ uint64_t psize = MIN(P2ROUNDUP(max_segment, 1 << vd->vdev_ashift),
+ SPA_MAXBLOCKSIZE);
+
+ return (MIN(asize, vdev_psize_to_asize(vd, psize)));
+}
+
+vdev_ops_t vdev_mirror_ops = {
+ .vdev_op_init = NULL,
+ .vdev_op_fini = NULL,
+ .vdev_op_open = vdev_mirror_open,
+ .vdev_op_close = vdev_mirror_close,
+ .vdev_op_asize = vdev_default_asize,
+ .vdev_op_min_asize = vdev_default_min_asize,
+ .vdev_op_min_alloc = NULL,
+ .vdev_op_io_start = vdev_mirror_io_start,
+ .vdev_op_io_done = vdev_mirror_io_done,
+ .vdev_op_state_change = vdev_mirror_state_change,
+ .vdev_op_need_resilver = vdev_default_need_resilver,
+ .vdev_op_hold = NULL,
+ .vdev_op_rele = NULL,
+ .vdev_op_remap = NULL,
+ .vdev_op_xlate = vdev_default_xlate,
+ .vdev_op_rebuild_asize = vdev_mirror_rebuild_asize,
+ .vdev_op_metaslab_init = NULL,
+ .vdev_op_config_generate = NULL,
+ .vdev_op_nparity = NULL,
+ .vdev_op_ndisks = NULL,
+ .vdev_op_type = VDEV_TYPE_MIRROR, /* name of this vdev type */
+ .vdev_op_leaf = B_FALSE /* not a leaf vdev */
+};
+
+vdev_ops_t vdev_replacing_ops = {
+ .vdev_op_init = NULL,
+ .vdev_op_fini = NULL,
+ .vdev_op_open = vdev_mirror_open,
+ .vdev_op_close = vdev_mirror_close,
+ .vdev_op_asize = vdev_default_asize,
+ .vdev_op_min_asize = vdev_default_min_asize,
+ .vdev_op_min_alloc = NULL,
+ .vdev_op_io_start = vdev_mirror_io_start,
+ .vdev_op_io_done = vdev_mirror_io_done,
+ .vdev_op_state_change = vdev_mirror_state_change,
+ .vdev_op_need_resilver = vdev_default_need_resilver,
+ .vdev_op_hold = NULL,
+ .vdev_op_rele = NULL,
+ .vdev_op_remap = NULL,
+ .vdev_op_xlate = vdev_default_xlate,
+ .vdev_op_rebuild_asize = vdev_mirror_rebuild_asize,
+ .vdev_op_metaslab_init = NULL,
+ .vdev_op_config_generate = NULL,
+ .vdev_op_nparity = NULL,
+ .vdev_op_ndisks = NULL,
+ .vdev_op_type = VDEV_TYPE_REPLACING, /* name of this vdev type */
+ .vdev_op_leaf = B_FALSE /* not a leaf vdev */
+};
+
+vdev_ops_t vdev_spare_ops = {
+ .vdev_op_init = NULL,
+ .vdev_op_fini = NULL,
+ .vdev_op_open = vdev_mirror_open,
+ .vdev_op_close = vdev_mirror_close,
+ .vdev_op_asize = vdev_default_asize,
+ .vdev_op_min_asize = vdev_default_min_asize,
+ .vdev_op_min_alloc = NULL,
+ .vdev_op_io_start = vdev_mirror_io_start,
+ .vdev_op_io_done = vdev_mirror_io_done,
+ .vdev_op_state_change = vdev_mirror_state_change,
+ .vdev_op_need_resilver = vdev_default_need_resilver,
+ .vdev_op_hold = NULL,
+ .vdev_op_rele = NULL,
+ .vdev_op_remap = NULL,
+ .vdev_op_xlate = vdev_default_xlate,
+ .vdev_op_rebuild_asize = vdev_mirror_rebuild_asize,
+ .vdev_op_metaslab_init = NULL,
+ .vdev_op_config_generate = NULL,
+ .vdev_op_nparity = NULL,
+ .vdev_op_ndisks = NULL,
+ .vdev_op_type = VDEV_TYPE_SPARE, /* name of this vdev type */
+ .vdev_op_leaf = B_FALSE /* not a leaf vdev */
+};
+
+/* BEGIN CSTYLED */
+ZFS_MODULE_PARAM(zfs_vdev_mirror, zfs_vdev_mirror_, rotating_inc, INT, ZMOD_RW,
+ "Rotating media load increment for non-seeking I/O's");
+
+ZFS_MODULE_PARAM(zfs_vdev_mirror, zfs_vdev_mirror_, rotating_seek_inc, INT, ZMOD_RW,
+ "Rotating media load increment for seeking I/O's");
+
+ZFS_MODULE_PARAM(zfs_vdev_mirror, zfs_vdev_mirror_, rotating_seek_offset, INT, ZMOD_RW,
+ "Offset in bytes from the last I/O which triggers "
+ "a reduced rotating media seek increment");
+
+ZFS_MODULE_PARAM(zfs_vdev_mirror, zfs_vdev_mirror_, non_rotating_inc, INT, ZMOD_RW,
+ "Non-rotating media load increment for non-seeking I/O's");
+
+ZFS_MODULE_PARAM(zfs_vdev_mirror, zfs_vdev_mirror_, non_rotating_seek_inc, INT, ZMOD_RW,
+ "Non-rotating media load increment for seeking I/O's");
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/zfs/vdev_missing.c b/sys/contrib/openzfs/module/zfs/vdev_missing.c
new file mode 100644
index 000000000000..e9145fd012d7
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/vdev_missing.c
@@ -0,0 +1,131 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
+ */
+
+/*
+ * The 'missing' vdev is a special vdev type used only during import. It
+ * signifies a placeholder in the root vdev for some vdev that we know is
+ * missing. We pass it down to the kernel to allow the rest of the
+ * configuration to be parsed and an attempt made to open all available
+ * devices.
+ * Because its GUID is always 0, we know that the guid sum will mismatch and we
+ * won't be able to open the pool anyway.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/vdev_impl.h>
+#include <sys/fs/zfs.h>
+#include <sys/zio.h>
+
+/* ARGSUSED */
+static int
+vdev_missing_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
+ uint64_t *ashift, uint64_t *pshift)
+{
+ /*
+ * Really this should just fail. But then the root vdev will be in the
+ * faulted state with VDEV_AUX_NO_REPLICAS, when what we really want is
+ * VDEV_AUX_BAD_GUID_SUM. So we pretend to succeed, knowing that we
+ * will fail the GUID sum check before ever trying to open the pool.
+ */
+ *psize = 0;
+ *max_psize = 0;
+ *ashift = 0;
+ *pshift = 0;
+ return (0);
+}
+
+/* ARGSUSED */
+static void
+vdev_missing_close(vdev_t *vd)
+{
+}
+
+/* ARGSUSED */
+static void
+vdev_missing_io_start(zio_t *zio)
+{
+ zio->io_error = SET_ERROR(ENOTSUP);
+ zio_execute(zio);
+}
+
+/* ARGSUSED */
+static void
+vdev_missing_io_done(zio_t *zio)
+{
+}
+
+vdev_ops_t vdev_missing_ops = {
+ .vdev_op_init = NULL,
+ .vdev_op_fini = NULL,
+ .vdev_op_open = vdev_missing_open,
+ .vdev_op_close = vdev_missing_close,
+ .vdev_op_asize = vdev_default_asize,
+ .vdev_op_min_asize = vdev_default_min_asize,
+ .vdev_op_min_alloc = NULL,
+ .vdev_op_io_start = vdev_missing_io_start,
+ .vdev_op_io_done = vdev_missing_io_done,
+ .vdev_op_state_change = NULL,
+ .vdev_op_need_resilver = NULL,
+ .vdev_op_hold = NULL,
+ .vdev_op_rele = NULL,
+ .vdev_op_remap = NULL,
+ .vdev_op_xlate = NULL,
+ .vdev_op_rebuild_asize = NULL,
+ .vdev_op_metaslab_init = NULL,
+ .vdev_op_config_generate = NULL,
+ .vdev_op_nparity = NULL,
+ .vdev_op_ndisks = NULL,
+ .vdev_op_type = VDEV_TYPE_MISSING, /* name of this vdev type */
+ .vdev_op_leaf = B_TRUE /* leaf vdev */
+};
+
+vdev_ops_t vdev_hole_ops = {
+ .vdev_op_init = NULL,
+ .vdev_op_fini = NULL,
+ .vdev_op_open = vdev_missing_open,
+ .vdev_op_close = vdev_missing_close,
+ .vdev_op_asize = vdev_default_asize,
+ .vdev_op_min_asize = vdev_default_min_asize,
+ .vdev_op_min_alloc = NULL,
+ .vdev_op_io_start = vdev_missing_io_start,
+ .vdev_op_io_done = vdev_missing_io_done,
+ .vdev_op_state_change = NULL,
+ .vdev_op_need_resilver = NULL,
+ .vdev_op_hold = NULL,
+ .vdev_op_rele = NULL,
+ .vdev_op_remap = NULL,
+ .vdev_op_xlate = NULL,
+ .vdev_op_rebuild_asize = NULL,
+ .vdev_op_metaslab_init = NULL,
+ .vdev_op_config_generate = NULL,
+ .vdev_op_nparity = NULL,
+ .vdev_op_ndisks = NULL,
+ .vdev_op_type = VDEV_TYPE_HOLE, /* name of this vdev type */
+ .vdev_op_leaf = B_TRUE /* leaf vdev */
+};
diff --git a/sys/contrib/openzfs/module/zfs/vdev_queue.c b/sys/contrib/openzfs/module/zfs/vdev_queue.c
new file mode 100644
index 000000000000..25a4bc69cc23
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/vdev_queue.c
@@ -0,0 +1,1164 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/vdev_impl.h>
+#include <sys/spa_impl.h>
+#include <sys/zio.h>
+#include <sys/avl.h>
+#include <sys/dsl_pool.h>
+#include <sys/metaslab_impl.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/kstat.h>
+#include <sys/abd.h>
+
+/*
+ * ZFS I/O Scheduler
+ * ---------------
+ *
+ * ZFS issues I/O operations to leaf vdevs to satisfy and complete zios. The
+ * I/O scheduler determines when and in what order those operations are
+ * issued. The I/O scheduler divides operations into five I/O classes
+ * prioritized in the following order: sync read, sync write, async read,
+ * async write, and scrub/resilver. Each queue defines the minimum and
+ * maximum number of concurrent operations that may be issued to the device.
+ * In addition, the device has an aggregate maximum. Note that the sum of the
+ * per-queue minimums must not exceed the aggregate maximum. If the
+ * sum of the per-queue maximums exceeds the aggregate maximum, then the
+ * number of active i/os may reach zfs_vdev_max_active, in which case no
+ * further i/os will be issued regardless of whether all per-queue
+ * minimums have been met.
+ *
+ * For many physical devices, throughput increases with the number of
+ * concurrent operations, but latency typically suffers. Further, physical
+ * devices typically have a limit at which more concurrent operations have no
+ * effect on throughput or can actually cause it to decrease.
+ *
+ * The scheduler selects the next operation to issue by first looking for an
+ * I/O class whose minimum has not been satisfied. Once all are satisfied and
+ * the aggregate maximum has not been hit, the scheduler looks for classes
+ * whose maximum has not been satisfied. Iteration through the I/O classes is
+ * done in the order specified above. No further operations are issued if the
+ * aggregate maximum number of concurrent operations has been hit or if there
+ * are no operations queued for an I/O class that has not hit its maximum.
+ * Every time an i/o is queued or an operation completes, the I/O scheduler
+ * looks for new operations to issue.
+ *
+ * All I/O classes have a fixed maximum number of outstanding operations
+ * except for the async write class. Asynchronous writes represent the data
+ * that is committed to stable storage during the syncing stage for
+ * transaction groups (see txg.c). Transaction groups enter the syncing state
+ * periodically so the number of queued async writes will quickly burst up and
+ * then bleed down to zero. Rather than servicing them as quickly as possible,
+ * the I/O scheduler changes the maximum number of active async write i/os
+ * according to the amount of dirty data in the pool (see dsl_pool.c). Since
+ * both throughput and latency typically increase with the number of
+ * concurrent operations issued to physical devices, reducing the burstiness
+ * in the number of concurrent operations also stabilizes the response time of
+ * operations from other -- and in particular synchronous -- queues. In broad
+ * strokes, the I/O scheduler will issue more concurrent operations from the
+ * async write queue as there's more dirty data in the pool.
+ *
+ * Async Writes
+ *
+ * The number of concurrent operations issued for the async write I/O class
+ * follows a piece-wise linear function defined by a few adjustable points.
+ *
+ * | o---------| <-- zfs_vdev_async_write_max_active
+ * ^ | /^ |
+ * | | / | |
+ * active | / | |
+ * I/O | / | |
+ * count | / | |
+ * | / | |
+ * |------------o | | <-- zfs_vdev_async_write_min_active
+ * 0|____________^______|_________|
+ * 0% | | 100% of zfs_dirty_data_max
+ * | |
+ * | `-- zfs_vdev_async_write_active_max_dirty_percent
+ * `--------- zfs_vdev_async_write_active_min_dirty_percent
+ *
+ * Until the amount of dirty data exceeds a minimum percentage of the dirty
+ * data allowed in the pool, the I/O scheduler will limit the number of
+ * concurrent operations to the minimum. As that threshold is crossed, the
+ * number of concurrent operations issued increases linearly to the maximum at
+ * the specified maximum percentage of the dirty data allowed in the pool.
+ *
+ * Ideally, the amount of dirty data on a busy pool will stay in the sloped
+ * part of the function between zfs_vdev_async_write_active_min_dirty_percent
+ * and zfs_vdev_async_write_active_max_dirty_percent. If it exceeds the
+ * maximum percentage, this indicates that the rate of incoming data is
+ * greater than the rate that the backend storage can handle. In this case, we
+ * must further throttle incoming writes (see dmu_tx_delay() for details).
+ */
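+
+/*
+ * Illustrative example (not part of the code): with min_active = 2,
+ * max_active = 10, min_dirty = 30% and max_dirty = 60%, a pool holding
+ * 45% of zfs_dirty_data_max would be allowed roughly
+ * 2 + (10 - 2) * (45 - 30) / (60 - 30) = 6 concurrent async writes.
+ */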
+
+/*
+ * The maximum number of i/os active to each device. Ideally, this will be >=
+ * the sum of each queue's max_active.
+ */
+uint32_t zfs_vdev_max_active = 1000;
+
+/*
+ * Per-queue limits on the number of i/os active to each device. If the
+ * number of active i/os is < zfs_vdev_max_active, then the min_active comes
+ * into play. We will send min_active from each queue round-robin, and then
+ * send from queues in the order defined by zio_priority_t up to max_active.
+ * Some queues have additional mechanisms to limit number of active I/Os in
+ * addition to min_active and max_active, see below.
+ *
+ * In general, smaller max_active's will lead to lower latency of synchronous
+ * operations. Larger max_active's may lead to higher overall throughput,
+ * depending on underlying storage.
+ *
+ * The ratio of the queues' max_actives determines the balance of performance
+ * between reads, writes, and scrubs. E.g., increasing
+ * zfs_vdev_scrub_max_active will cause the scrub or resilver to complete
+ * more quickly, but reads and writes to have higher latency and lower
+ * throughput.
+ */
+uint32_t zfs_vdev_sync_read_min_active = 10;
+uint32_t zfs_vdev_sync_read_max_active = 10;
+uint32_t zfs_vdev_sync_write_min_active = 10;
+uint32_t zfs_vdev_sync_write_max_active = 10;
+uint32_t zfs_vdev_async_read_min_active = 1;
+uint32_t zfs_vdev_async_read_max_active = 3;
+uint32_t zfs_vdev_async_write_min_active = 2;
+uint32_t zfs_vdev_async_write_max_active = 10;
+uint32_t zfs_vdev_scrub_min_active = 1;
+uint32_t zfs_vdev_scrub_max_active = 3;
+uint32_t zfs_vdev_removal_min_active = 1;
+uint32_t zfs_vdev_removal_max_active = 2;
+uint32_t zfs_vdev_initializing_min_active = 1;
+uint32_t zfs_vdev_initializing_max_active = 1;
+uint32_t zfs_vdev_trim_min_active = 1;
+uint32_t zfs_vdev_trim_max_active = 2;
+uint32_t zfs_vdev_rebuild_min_active = 1;
+uint32_t zfs_vdev_rebuild_max_active = 3;
+
+/*
+ * When the pool has less than zfs_vdev_async_write_active_min_dirty_percent
+ * dirty data, use zfs_vdev_async_write_min_active. When it has more than
+ * zfs_vdev_async_write_active_max_dirty_percent, use
+ * zfs_vdev_async_write_max_active. The value is linearly interpolated
+ * between min and max.
+ */
+int zfs_vdev_async_write_active_min_dirty_percent = 30;
+int zfs_vdev_async_write_active_max_dirty_percent = 60;
+
+/*
+ * For non-interactive I/O (scrub, resilver, removal, initialize and rebuild),
+ * the number of concurrently-active I/O's is limited to *_min_active, unless
+ * the vdev is "idle". When there are no interactive I/Os active (sync or
+ * async), and zfs_vdev_nia_delay I/Os have completed since the last
+ * interactive I/O, then the vdev is considered to be "idle", and the number
+ * of concurrently-active non-interactive I/O's is increased to *_max_active.
+ */
+uint_t zfs_vdev_nia_delay = 5;
+
+/*
+ * Some HDDs tend to prioritize sequential I/O so high that concurrent
+ * random I/O latency reaches several seconds. On some HDDs it happens
+ * even if sequential I/Os are submitted one at a time, and so setting
+ * *_max_active to 1 does not help. To prevent non-interactive I/Os, like
+ * scrub, from monopolizing the device, no more than zfs_vdev_nia_credit
+ * I/Os can be sent while there are outstanding incomplete interactive
+ * I/Os. This enforced wait ensures the HDD services the interactive I/O
+ * within a reasonable amount of time.
+ */
+uint_t zfs_vdev_nia_credit = 5;
+
+/*
+ * To reduce IOPs, we aggregate small adjacent I/Os into one large I/O.
+ * For read I/Os, we also aggregate across small adjacency gaps; for writes
+ * we include spans of optional I/Os to aid aggregation at the disk even when
+ * they aren't able to help us aggregate at this level.
+ */
+int zfs_vdev_aggregation_limit = 1 << 20;
+int zfs_vdev_aggregation_limit_non_rotating = SPA_OLD_MAXBLOCKSIZE;
+int zfs_vdev_read_gap_limit = 32 << 10;
+int zfs_vdev_write_gap_limit = 4 << 10;
+
+/*
+ * Define the queue depth percentage for each top-level vdev. This percentage
+ * is used in conjunction with zfs_vdev_async_write_max_active to determine
+ * how many allocations a specific top-level vdev should handle. Once the
+ * queue depth reaches zfs_vdev_queue_depth_pct *
+ * zfs_vdev_async_write_max_active / 100, the allocator will stop allocating
+ * blocks on that top-level vdev. The default kernel setting is 1000%, which
+ * will yield 100 allocations per device. For userland testing, the default
+ * setting is 300%, which equates to 30 allocations per device.
+ */
+#ifdef _KERNEL
+int zfs_vdev_queue_depth_pct = 1000;
+#else
+int zfs_vdev_queue_depth_pct = 300;
+#endif
+
+/*
+ * When performing allocations for a given metaslab, we want to make sure that
+ * there are enough IOs to aggregate together to improve throughput. We want to
+ * ensure that there are at least 128k worth of IOs that can be aggregated, and
+ * we assume that the average allocation size is 4k, so we need the queue depth
+ * to be 32 per allocator to get good aggregation of sequential writes.
+ */
+int zfs_vdev_def_queue_depth = 32;
+
+/*
+ * Allow TRIM I/Os to be aggregated. This should normally not be needed since
+ * TRIM I/O for extents up to zfs_trim_extent_bytes_max (128M) can be submitted
+ * by the TRIM code in vdev_trim.c.
+ */
+int zfs_vdev_aggregate_trim = 0;
+
+static int
+vdev_queue_offset_compare(const void *x1, const void *x2)
+{
+ const zio_t *z1 = (const zio_t *)x1;
+ const zio_t *z2 = (const zio_t *)x2;
+
+ int cmp = TREE_CMP(z1->io_offset, z2->io_offset);
+
+ if (likely(cmp))
+ return (cmp);
+
+ return (TREE_PCMP(z1, z2));
+}
+
+static inline avl_tree_t *
+vdev_queue_class_tree(vdev_queue_t *vq, zio_priority_t p)
+{
+ return (&vq->vq_class[p].vqc_queued_tree);
+}
+
+static inline avl_tree_t *
+vdev_queue_type_tree(vdev_queue_t *vq, zio_type_t t)
+{
+ ASSERT(t == ZIO_TYPE_READ || t == ZIO_TYPE_WRITE || t == ZIO_TYPE_TRIM);
+ if (t == ZIO_TYPE_READ)
+ return (&vq->vq_read_offset_tree);
+ else if (t == ZIO_TYPE_WRITE)
+ return (&vq->vq_write_offset_tree);
+ else
+ return (&vq->vq_trim_offset_tree);
+}
+
+static int
+vdev_queue_timestamp_compare(const void *x1, const void *x2)
+{
+ const zio_t *z1 = (const zio_t *)x1;
+ const zio_t *z2 = (const zio_t *)x2;
+
+ int cmp = TREE_CMP(z1->io_timestamp, z2->io_timestamp);
+
+ if (likely(cmp))
+ return (cmp);
+
+ return (TREE_PCMP(z1, z2));
+}
+
+static int
+vdev_queue_class_min_active(vdev_queue_t *vq, zio_priority_t p)
+{
+ switch (p) {
+ case ZIO_PRIORITY_SYNC_READ:
+ return (zfs_vdev_sync_read_min_active);
+ case ZIO_PRIORITY_SYNC_WRITE:
+ return (zfs_vdev_sync_write_min_active);
+ case ZIO_PRIORITY_ASYNC_READ:
+ return (zfs_vdev_async_read_min_active);
+ case ZIO_PRIORITY_ASYNC_WRITE:
+ return (zfs_vdev_async_write_min_active);
+ case ZIO_PRIORITY_SCRUB:
+ return (vq->vq_ia_active == 0 ? zfs_vdev_scrub_min_active :
+ MIN(vq->vq_nia_credit, zfs_vdev_scrub_min_active));
+ case ZIO_PRIORITY_REMOVAL:
+ return (vq->vq_ia_active == 0 ? zfs_vdev_removal_min_active :
+ MIN(vq->vq_nia_credit, zfs_vdev_removal_min_active));
+ case ZIO_PRIORITY_INITIALIZING:
+ return (vq->vq_ia_active == 0 ? zfs_vdev_initializing_min_active :
+ MIN(vq->vq_nia_credit, zfs_vdev_initializing_min_active));
+ case ZIO_PRIORITY_TRIM:
+ return (zfs_vdev_trim_min_active);
+ case ZIO_PRIORITY_REBUILD:
+ return (vq->vq_ia_active == 0 ? zfs_vdev_rebuild_min_active :
+ MIN(vq->vq_nia_credit, zfs_vdev_rebuild_min_active));
+ default:
+ panic("invalid priority %u", p);
+ return (0);
+ }
+}
+
+static int
+vdev_queue_max_async_writes(spa_t *spa)
+{
+ int writes;
+ uint64_t dirty = 0;
+ dsl_pool_t *dp = spa_get_dsl(spa);
+ uint64_t min_bytes = zfs_dirty_data_max *
+ zfs_vdev_async_write_active_min_dirty_percent / 100;
+ uint64_t max_bytes = zfs_dirty_data_max *
+ zfs_vdev_async_write_active_max_dirty_percent / 100;
+
+ /*
+ * Async writes may occur before the assignment of the spa's
+ * dsl_pool_t if a self-healing zio is issued prior to the
+ * completion of dmu_objset_open_impl().
+ */
+ if (dp == NULL)
+ return (zfs_vdev_async_write_max_active);
+
+ /*
+ * Sync tasks correspond to interactive user actions. To reduce the
+ * execution time of those actions we push data out as fast as possible.
+ */
+ dirty = dp->dp_dirty_total;
+ if (dirty > max_bytes || spa_has_pending_synctask(spa))
+ return (zfs_vdev_async_write_max_active);
+
+ if (dirty < min_bytes)
+ return (zfs_vdev_async_write_min_active);
+
+ /*
+ * linear interpolation:
+ * slope = (max_writes - min_writes) / (max_bytes - min_bytes)
+ * move right by min_bytes
+ * move up by min_writes
+ */
+ writes = (dirty - min_bytes) *
+ (zfs_vdev_async_write_max_active -
+ zfs_vdev_async_write_min_active) /
+ (max_bytes - min_bytes) +
+ zfs_vdev_async_write_min_active;
+ ASSERT3U(writes, >=, zfs_vdev_async_write_min_active);
+ ASSERT3U(writes, <=, zfs_vdev_async_write_max_active);
+ return (writes);
+}
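+
+/*
+ * Worked example of the interpolation above, with the default tunables
+ * (min_active = 2, max_active = 10, 30%/60% dirty thresholds): a pool
+ * that is 45% dirty sits halfway between the thresholds, so
+ * writes = (dirty - min_bytes) * (10 - 2) / (max_bytes - min_bytes) + 2
+ *        = 8 / 2 + 2 = 6 concurrent async writes per vdev.
+ */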
+
+static int
+vdev_queue_class_max_active(spa_t *spa, vdev_queue_t *vq, zio_priority_t p)
+{
+ switch (p) {
+ case ZIO_PRIORITY_SYNC_READ:
+ return (zfs_vdev_sync_read_max_active);
+ case ZIO_PRIORITY_SYNC_WRITE:
+ return (zfs_vdev_sync_write_max_active);
+ case ZIO_PRIORITY_ASYNC_READ:
+ return (zfs_vdev_async_read_max_active);
+ case ZIO_PRIORITY_ASYNC_WRITE:
+ return (vdev_queue_max_async_writes(spa));
+ case ZIO_PRIORITY_SCRUB:
+ if (vq->vq_ia_active > 0) {
+ return (MIN(vq->vq_nia_credit,
+ zfs_vdev_scrub_min_active));
+ } else if (vq->vq_nia_credit < zfs_vdev_nia_delay)
+ return (MAX(1, zfs_vdev_scrub_min_active));
+ return (zfs_vdev_scrub_max_active);
+ case ZIO_PRIORITY_REMOVAL:
+ if (vq->vq_ia_active > 0) {
+ return (MIN(vq->vq_nia_credit,
+ zfs_vdev_removal_min_active));
+ } else if (vq->vq_nia_credit < zfs_vdev_nia_delay)
+ return (MAX(1, zfs_vdev_removal_min_active));
+ return (zfs_vdev_removal_max_active);
+ case ZIO_PRIORITY_INITIALIZING:
+ if (vq->vq_ia_active > 0) {
+ return (MIN(vq->vq_nia_credit,
+ zfs_vdev_initializing_min_active));
+ } else if (vq->vq_nia_credit < zfs_vdev_nia_delay)
+ return (MAX(1, zfs_vdev_initializing_min_active));
+ return (zfs_vdev_initializing_max_active);
+ case ZIO_PRIORITY_TRIM:
+ return (zfs_vdev_trim_max_active);
+ case ZIO_PRIORITY_REBUILD:
+ if (vq->vq_ia_active > 0) {
+ return (MIN(vq->vq_nia_credit,
+ zfs_vdev_rebuild_min_active));
+ } else if (vq->vq_nia_credit < zfs_vdev_nia_delay)
+ return (MAX(1, zfs_vdev_rebuild_min_active));
+ return (zfs_vdev_rebuild_max_active);
+ default:
+ panic("invalid priority %u", p);
+ return (0);
+ }
+}
+
+/*
+ * Return the i/o class to issue from, or ZIO_PRIORITY_NUM_QUEUEABLE if
+ * there is no eligible class.
+ */
+static zio_priority_t
+vdev_queue_class_to_issue(vdev_queue_t *vq)
+{
+ spa_t *spa = vq->vq_vdev->vdev_spa;
+ zio_priority_t p, n;
+
+ if (avl_numnodes(&vq->vq_active_tree) >= zfs_vdev_max_active)
+ return (ZIO_PRIORITY_NUM_QUEUEABLE);
+
+ /*
+ * Find a queue that has not reached its minimum # outstanding i/os.
+ * Do round-robin to reduce starvation due to zfs_vdev_max_active
+ * and vq_nia_credit limits.
+ */
+ for (n = 0; n < ZIO_PRIORITY_NUM_QUEUEABLE; n++) {
+ p = (vq->vq_last_prio + n + 1) % ZIO_PRIORITY_NUM_QUEUEABLE;
+ if (avl_numnodes(vdev_queue_class_tree(vq, p)) > 0 &&
+ vq->vq_class[p].vqc_active <
+ vdev_queue_class_min_active(vq, p)) {
+ vq->vq_last_prio = p;
+ return (p);
+ }
+ }
+
+ /*
+ * If we haven't found a queue, look for one that hasn't reached its
+ * maximum # outstanding i/os.
+ */
+ for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) {
+ if (avl_numnodes(vdev_queue_class_tree(vq, p)) > 0 &&
+ vq->vq_class[p].vqc_active <
+ vdev_queue_class_max_active(spa, vq, p)) {
+ vq->vq_last_prio = p;
+ return (p);
+ }
+ }
+
+ /* No eligible queued i/os */
+ return (ZIO_PRIORITY_NUM_QUEUEABLE);
+}
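+
+/*
+ * For example, with the defaults a vdev that already has 10 sync reads
+ * active has met that class's minimum, so the first loop above moves on
+ * (round-robin) to classes still below their min_active; only when no
+ * class with queued i/os is below its minimum does the second loop hand
+ * out the remaining zfs_vdev_max_active slots in zio_priority_t order.
+ */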
+
+void
+vdev_queue_init(vdev_t *vd)
+{
+ vdev_queue_t *vq = &vd->vdev_queue;
+ zio_priority_t p;
+
+ mutex_init(&vq->vq_lock, NULL, MUTEX_DEFAULT, NULL);
+ vq->vq_vdev = vd;
+ taskq_init_ent(&vd->vdev_queue.vq_io_search.io_tqent);
+
+ avl_create(&vq->vq_active_tree, vdev_queue_offset_compare,
+ sizeof (zio_t), offsetof(struct zio, io_queue_node));
+ avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_READ),
+ vdev_queue_offset_compare, sizeof (zio_t),
+ offsetof(struct zio, io_offset_node));
+ avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_WRITE),
+ vdev_queue_offset_compare, sizeof (zio_t),
+ offsetof(struct zio, io_offset_node));
+ avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_TRIM),
+ vdev_queue_offset_compare, sizeof (zio_t),
+ offsetof(struct zio, io_offset_node));
+
+ for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) {
+ int (*compfn) (const void *, const void *);
+
+ /*
+ * The synchronous/trim i/o queues are dispatched in FIFO rather
+ * than LBA order. This provides more consistent latency for
+ * these i/os.
+ */
+ if (p == ZIO_PRIORITY_SYNC_READ ||
+ p == ZIO_PRIORITY_SYNC_WRITE ||
+ p == ZIO_PRIORITY_TRIM) {
+ compfn = vdev_queue_timestamp_compare;
+ } else {
+ compfn = vdev_queue_offset_compare;
+ }
+ avl_create(vdev_queue_class_tree(vq, p), compfn,
+ sizeof (zio_t), offsetof(struct zio, io_queue_node));
+ }
+
+ vq->vq_last_offset = 0;
+}
+
+void
+vdev_queue_fini(vdev_t *vd)
+{
+ vdev_queue_t *vq = &vd->vdev_queue;
+
+ for (zio_priority_t p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++)
+ avl_destroy(vdev_queue_class_tree(vq, p));
+ avl_destroy(&vq->vq_active_tree);
+ avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_READ));
+ avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_WRITE));
+ avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_TRIM));
+
+ mutex_destroy(&vq->vq_lock);
+}
+
+static void
+vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio)
+{
+ spa_t *spa = zio->io_spa;
+ spa_history_kstat_t *shk = &spa->spa_stats.io_history;
+
+ ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
+ avl_add(vdev_queue_class_tree(vq, zio->io_priority), zio);
+ avl_add(vdev_queue_type_tree(vq, zio->io_type), zio);
+
+ if (shk->kstat != NULL) {
+ mutex_enter(&shk->lock);
+ kstat_waitq_enter(shk->kstat->ks_data);
+ mutex_exit(&shk->lock);
+ }
+}
+
+static void
+vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio)
+{
+ spa_t *spa = zio->io_spa;
+ spa_history_kstat_t *shk = &spa->spa_stats.io_history;
+
+ ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
+ avl_remove(vdev_queue_class_tree(vq, zio->io_priority), zio);
+ avl_remove(vdev_queue_type_tree(vq, zio->io_type), zio);
+
+ if (shk->kstat != NULL) {
+ mutex_enter(&shk->lock);
+ kstat_waitq_exit(shk->kstat->ks_data);
+ mutex_exit(&shk->lock);
+ }
+}
+
+static boolean_t
+vdev_queue_is_interactive(zio_priority_t p)
+{
+ switch (p) {
+ case ZIO_PRIORITY_SCRUB:
+ case ZIO_PRIORITY_REMOVAL:
+ case ZIO_PRIORITY_INITIALIZING:
+ case ZIO_PRIORITY_REBUILD:
+ return (B_FALSE);
+ default:
+ return (B_TRUE);
+ }
+}
+
+static void
+vdev_queue_pending_add(vdev_queue_t *vq, zio_t *zio)
+{
+ spa_t *spa = zio->io_spa;
+ spa_history_kstat_t *shk = &spa->spa_stats.io_history;
+
+ ASSERT(MUTEX_HELD(&vq->vq_lock));
+ ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
+ vq->vq_class[zio->io_priority].vqc_active++;
+ if (vdev_queue_is_interactive(zio->io_priority)) {
+ if (++vq->vq_ia_active == 1)
+ vq->vq_nia_credit = 1;
+ } else if (vq->vq_ia_active > 0) {
+ vq->vq_nia_credit--;
+ }
+ avl_add(&vq->vq_active_tree, zio);
+
+ if (shk->kstat != NULL) {
+ mutex_enter(&shk->lock);
+ kstat_runq_enter(shk->kstat->ks_data);
+ mutex_exit(&shk->lock);
+ }
+}
+
+static void
+vdev_queue_pending_remove(vdev_queue_t *vq, zio_t *zio)
+{
+ spa_t *spa = zio->io_spa;
+ spa_history_kstat_t *shk = &spa->spa_stats.io_history;
+
+ ASSERT(MUTEX_HELD(&vq->vq_lock));
+ ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
+ vq->vq_class[zio->io_priority].vqc_active--;
+ if (vdev_queue_is_interactive(zio->io_priority)) {
+ if (--vq->vq_ia_active == 0)
+ vq->vq_nia_credit = 0;
+ else
+ vq->vq_nia_credit = zfs_vdev_nia_credit;
+ } else if (vq->vq_ia_active == 0)
+ vq->vq_nia_credit++;
+ avl_remove(&vq->vq_active_tree, zio);
+
+ if (shk->kstat != NULL) {
+ kstat_io_t *ksio = shk->kstat->ks_data;
+
+ mutex_enter(&shk->lock);
+ kstat_runq_exit(ksio);
+ if (zio->io_type == ZIO_TYPE_READ) {
+ ksio->reads++;
+ ksio->nread += zio->io_size;
+ } else if (zio->io_type == ZIO_TYPE_WRITE) {
+ ksio->writes++;
+ ksio->nwritten += zio->io_size;
+ }
+ mutex_exit(&shk->lock);
+ }
+}
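+
+/*
+ * Illustrative trace of the vq_nia_credit accounting above, with the
+ * default tunables (zfs_vdev_nia_credit = 5, zfs_vdev_nia_delay = 5):
+ * when a sync read becomes active, vq_ia_active goes 0 -> 1 and
+ * vq_nia_credit is set to 1, so at most one scrub i/o may be issued
+ * alongside it; issuing that scrub i/o drops the credit to 0, blocking
+ * further scrub i/os until an interactive i/o completes (credit refilled
+ * to 5) or the last one completes (credit reset to 0). With no
+ * interactive i/os active, each non-interactive completion increments
+ * the credit; once it reaches zfs_vdev_nia_delay the vdev is considered
+ * idle and scrub may ramp up to zfs_vdev_scrub_max_active.
+ */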
+
+static void
+vdev_queue_agg_io_done(zio_t *aio)
+{
+ abd_free(aio->io_abd);
+}
+
+/*
+ * Compute the range spanned by two i/os, which is the endpoint of the last
+ * (lio->io_offset + lio->io_size) minus start of the first (fio->io_offset).
+ * Conveniently, the gap between fio and lio is given by -IO_SPAN(lio, fio);
+ * thus fio and lio are adjacent if and only if IO_SPAN(lio, fio) == 0.
+ */
+#define IO_SPAN(fio, lio) ((lio)->io_offset + (lio)->io_size - (fio)->io_offset)
+#define IO_GAP(fio, lio) (-IO_SPAN(lio, fio))
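+
+/*
+ * For example, for a 4K i/o at offset 0 (fio) and a 4K i/o at offset 8K
+ * (lio): IO_SPAN(fio, lio) = 8K + 4K - 0 = 12K is the range an aggregate
+ * i/o covering both would span, and IO_GAP(fio, lio) = 8K - (0 + 4K) = 4K
+ * is the gap that would have to be filled or skipped.
+ */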
+
+/*
+ * ZIOs with sufficiently adjacent io_offsets will be aggregated. We do this
+ * by creating a gang ABD from the adjacent ZIOs' io_abd's. By using a
+ * gang ABD we avoid doing memory copies to and from the parent and
+ * child ZIOs. The gang ABD also accounts for gaps between adjacent
+ * io_offsets by simply getting the zero ABD for writes or allocating
+ * a new ABD for reads and placing them in the gang ABD as well.
+ */
+static zio_t *
+vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio)
+{
+ zio_t *first, *last, *aio, *dio, *mandatory, *nio;
+ zio_link_t *zl = NULL;
+ uint64_t maxgap = 0;
+ uint64_t size;
+ uint64_t limit;
+ int maxblocksize;
+ boolean_t stretch = B_FALSE;
+ avl_tree_t *t = vdev_queue_type_tree(vq, zio->io_type);
+ enum zio_flag flags = zio->io_flags & ZIO_FLAG_AGG_INHERIT;
+ uint64_t next_offset;
+ abd_t *abd;
+
+ maxblocksize = spa_maxblocksize(vq->vq_vdev->vdev_spa);
+ if (vq->vq_vdev->vdev_nonrot)
+ limit = zfs_vdev_aggregation_limit_non_rotating;
+ else
+ limit = zfs_vdev_aggregation_limit;
+ limit = MAX(MIN(limit, maxblocksize), 0);
+
+ if (zio->io_flags & ZIO_FLAG_DONT_AGGREGATE || limit == 0)
+ return (NULL);
+
+ /*
+ * While TRIM commands could be aggregated based on offset this
+ * behavior is disabled until it's determined to be beneficial.
+ */
+ if (zio->io_type == ZIO_TYPE_TRIM && !zfs_vdev_aggregate_trim)
+ return (NULL);
+
+ /*
+ * I/Os to distributed spares are directly dispatched to the dRAID
+ * leaf vdevs for aggregation. See the comment at the end of the
+ * zio_vdev_io_start() function.
+ */
+ ASSERT(vq->vq_vdev->vdev_ops != &vdev_draid_spare_ops);
+
+ first = last = zio;
+
+ if (zio->io_type == ZIO_TYPE_READ)
+ maxgap = zfs_vdev_read_gap_limit;
+
+ /*
+ * We can aggregate I/Os that are sufficiently adjacent and of
+ * the same flavor, as expressed by the AGG_INHERIT flags.
+ * The latter requirement is necessary so that certain
+ * attributes of the I/O, such as whether it's a normal I/O
+ * or a scrub/resilver, can be preserved in the aggregate.
+ * We can include optional I/Os, but don't allow them
+ * to begin a range as they add no benefit in that situation.
+ */
+
+ /*
+ * We keep track of the last non-optional I/O.
+ */
+ mandatory = (first->io_flags & ZIO_FLAG_OPTIONAL) ? NULL : first;
+
+ /*
+ * Walk backwards through sufficiently contiguous I/Os
+ * recording the last non-optional I/O.
+ */
+ while ((dio = AVL_PREV(t, first)) != NULL &&
+ (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
+ IO_SPAN(dio, last) <= limit &&
+ IO_GAP(dio, first) <= maxgap &&
+ dio->io_type == zio->io_type) {
+ first = dio;
+ if (mandatory == NULL && !(first->io_flags & ZIO_FLAG_OPTIONAL))
+ mandatory = first;
+ }
+
+ /*
+ * Skip any initial optional I/Os.
+ */
+ while ((first->io_flags & ZIO_FLAG_OPTIONAL) && first != last) {
+ first = AVL_NEXT(t, first);
+ ASSERT(first != NULL);
+ }
+
+
+ /*
+ * Walk forward through sufficiently contiguous I/Os.
+ * The aggregation limit does not apply to optional i/os, so that
+ * we can issue contiguous writes even if they are larger than the
+ * aggregation limit.
+ */
+ while ((dio = AVL_NEXT(t, last)) != NULL &&
+ (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
+ (IO_SPAN(first, dio) <= limit ||
+ (dio->io_flags & ZIO_FLAG_OPTIONAL)) &&
+ IO_SPAN(first, dio) <= maxblocksize &&
+ IO_GAP(last, dio) <= maxgap &&
+ dio->io_type == zio->io_type) {
+ last = dio;
+ if (!(last->io_flags & ZIO_FLAG_OPTIONAL))
+ mandatory = last;
+ }
+
+ /*
+ * Now that we've established the range of the I/O aggregation
+ * we must decide what to do with trailing optional I/Os.
+ * For reads, there's nothing to do. For writes, while we are unable to
+ * aggregate further, it's possible that a trailing optional
+ * I/O would allow the underlying device to aggregate with
+ * subsequent I/Os. We must therefore determine if the next
+ * non-optional I/O is close enough to make aggregation
+ * worthwhile.
+ */
+ if (zio->io_type == ZIO_TYPE_WRITE && mandatory != NULL) {
+ zio_t *nio = last;
+ while ((dio = AVL_NEXT(t, nio)) != NULL &&
+ IO_GAP(nio, dio) == 0 &&
+ IO_GAP(mandatory, dio) <= zfs_vdev_write_gap_limit) {
+ nio = dio;
+ if (!(nio->io_flags & ZIO_FLAG_OPTIONAL)) {
+ stretch = B_TRUE;
+ break;
+ }
+ }
+ }
+
+ if (stretch) {
+ /*
+ * We are going to include an optional io in our aggregated
+ * span, thus closing the write gap. Only mandatory i/os can
+ * start aggregated spans, so make sure that the next i/o
+ * after our span is mandatory.
+ */
+ dio = AVL_NEXT(t, last);
+ dio->io_flags &= ~ZIO_FLAG_OPTIONAL;
+ } else {
+ /* do not include the optional i/o */
+ while (last != mandatory && last != first) {
+ ASSERT(last->io_flags & ZIO_FLAG_OPTIONAL);
+ last = AVL_PREV(t, last);
+ ASSERT(last != NULL);
+ }
+ }
+
+ if (first == last)
+ return (NULL);
+
+ size = IO_SPAN(first, last);
+ ASSERT3U(size, <=, maxblocksize);
+
+ abd = abd_alloc_gang();
+ if (abd == NULL)
+ return (NULL);
+
+ aio = zio_vdev_delegated_io(first->io_vd, first->io_offset,
+ abd, size, first->io_type, zio->io_priority,
+ flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE,
+ vdev_queue_agg_io_done, NULL);
+ aio->io_timestamp = first->io_timestamp;
+
+ nio = first;
+ next_offset = first->io_offset;
+ do {
+ dio = nio;
+ nio = AVL_NEXT(t, dio);
+ zio_add_child(dio, aio);
+ vdev_queue_io_remove(vq, dio);
+
+ if (dio->io_offset != next_offset) {
+ /* allocate a buffer for a read gap */
+ ASSERT3U(dio->io_type, ==, ZIO_TYPE_READ);
+ ASSERT3U(dio->io_offset, >, next_offset);
+ abd = abd_alloc_for_io(
+ dio->io_offset - next_offset, B_TRUE);
+ abd_gang_add(aio->io_abd, abd, B_TRUE);
+ }
+ if (dio->io_abd &&
+ (dio->io_size != abd_get_size(dio->io_abd))) {
+ /* abd size not the same as IO size */
+ ASSERT3U(abd_get_size(dio->io_abd), >, dio->io_size);
+ abd = abd_get_offset_size(dio->io_abd, 0, dio->io_size);
+ abd_gang_add(aio->io_abd, abd, B_TRUE);
+ } else {
+ if (dio->io_flags & ZIO_FLAG_NODATA) {
+ /* allocate a buffer for a write gap */
+ ASSERT3U(dio->io_type, ==, ZIO_TYPE_WRITE);
+ ASSERT3P(dio->io_abd, ==, NULL);
+ abd_gang_add(aio->io_abd,
+ abd_get_zeros(dio->io_size), B_TRUE);
+ } else {
+ /*
+ * We pass B_FALSE to abd_gang_add()
+ * because we did not allocate a new
+ * ABD, so it is assumed the caller
+ * will free this ABD.
+ */
+ abd_gang_add(aio->io_abd, dio->io_abd,
+ B_FALSE);
+ }
+ }
+ next_offset = dio->io_offset + dio->io_size;
+ } while (dio != last);
+ ASSERT3U(abd_get_size(aio->io_abd), ==, aio->io_size);
+
+ /*
+ * We need to drop the vdev queue's lock during zio_execute() to
+ * avoid a deadlock that we could encounter due to lock order
+ * reversal between vq_lock and io_lock in zio_change_priority().
+ */
+ mutex_exit(&vq->vq_lock);
+ while ((dio = zio_walk_parents(aio, &zl)) != NULL) {
+ ASSERT3U(dio->io_type, ==, aio->io_type);
+
+ zio_vdev_io_bypass(dio);
+ zio_execute(dio);
+ }
+ mutex_enter(&vq->vq_lock);
+
+ return (aio);
+}
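+
+/*
+ * Aggregation sketch (simplified): two queued 4K reads at offsets 0 and
+ * 8K fall within zfs_vdev_read_gap_limit (32K by default), so the code
+ * above builds a single 12K delegated read whose gang ABD holds the first
+ * read's buffer, a freshly allocated 4K buffer covering the gap, and the
+ * second read's buffer; both original zios become children of the
+ * aggregate and are bypassed.
+ */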
+
+static zio_t *
+vdev_queue_io_to_issue(vdev_queue_t *vq)
+{
+ zio_t *zio, *aio;
+ zio_priority_t p;
+ avl_index_t idx;
+ avl_tree_t *tree;
+
+again:
+ ASSERT(MUTEX_HELD(&vq->vq_lock));
+
+ p = vdev_queue_class_to_issue(vq);
+
+ if (p == ZIO_PRIORITY_NUM_QUEUEABLE) {
+ /* No eligible queued i/os */
+ return (NULL);
+ }
+
+ /*
+ * For LBA-ordered queues (async / scrub / initializing), issue the
+ * i/o which follows the most recently issued i/o in LBA (offset) order.
+ *
+ * For FIFO queues (sync/trim), issue the i/o with the lowest timestamp.
+ */
+ tree = vdev_queue_class_tree(vq, p);
+ vq->vq_io_search.io_timestamp = 0;
+ vq->vq_io_search.io_offset = vq->vq_last_offset - 1;
+ VERIFY3P(avl_find(tree, &vq->vq_io_search, &idx), ==, NULL);
+ zio = avl_nearest(tree, idx, AVL_AFTER);
+ if (zio == NULL)
+ zio = avl_first(tree);
+ ASSERT3U(zio->io_priority, ==, p);
+
+ aio = vdev_queue_aggregate(vq, zio);
+ if (aio != NULL)
+ zio = aio;
+ else
+ vdev_queue_io_remove(vq, zio);
+
+ /*
+ * If the I/O is or was optional and therefore has no data, we need to
+ * simply discard it. We need to drop the vdev queue's lock to avoid a
+ * deadlock that we could encounter since this I/O will complete
+ * immediately.
+ */
+ if (zio->io_flags & ZIO_FLAG_NODATA) {
+ mutex_exit(&vq->vq_lock);
+ zio_vdev_io_bypass(zio);
+ zio_execute(zio);
+ mutex_enter(&vq->vq_lock);
+ goto again;
+ }
+
+ vdev_queue_pending_add(vq, zio);
+ vq->vq_last_offset = zio->io_offset + zio->io_size;
+
+ return (zio);
+}
+
+zio_t *
+vdev_queue_io(zio_t *zio)
+{
+ vdev_queue_t *vq = &zio->io_vd->vdev_queue;
+ zio_t *nio;
+
+ if (zio->io_flags & ZIO_FLAG_DONT_QUEUE)
+ return (zio);
+
+ /*
+ * Children i/os inherit their parent's priority, which might
+ * not match the child's i/o type. Fix it up here.
+ */
+ if (zio->io_type == ZIO_TYPE_READ) {
+ ASSERT(zio->io_priority != ZIO_PRIORITY_TRIM);
+
+ if (zio->io_priority != ZIO_PRIORITY_SYNC_READ &&
+ zio->io_priority != ZIO_PRIORITY_ASYNC_READ &&
+ zio->io_priority != ZIO_PRIORITY_SCRUB &&
+ zio->io_priority != ZIO_PRIORITY_REMOVAL &&
+ zio->io_priority != ZIO_PRIORITY_INITIALIZING &&
+ zio->io_priority != ZIO_PRIORITY_REBUILD) {
+ zio->io_priority = ZIO_PRIORITY_ASYNC_READ;
+ }
+ } else if (zio->io_type == ZIO_TYPE_WRITE) {
+ ASSERT(zio->io_priority != ZIO_PRIORITY_TRIM);
+
+ if (zio->io_priority != ZIO_PRIORITY_SYNC_WRITE &&
+ zio->io_priority != ZIO_PRIORITY_ASYNC_WRITE &&
+ zio->io_priority != ZIO_PRIORITY_REMOVAL &&
+ zio->io_priority != ZIO_PRIORITY_INITIALIZING &&
+ zio->io_priority != ZIO_PRIORITY_REBUILD) {
+ zio->io_priority = ZIO_PRIORITY_ASYNC_WRITE;
+ }
+ } else {
+ ASSERT(zio->io_type == ZIO_TYPE_TRIM);
+ ASSERT(zio->io_priority == ZIO_PRIORITY_TRIM);
+ }
+
+ zio->io_flags |= ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE;
+
+ mutex_enter(&vq->vq_lock);
+ zio->io_timestamp = gethrtime();
+ vdev_queue_io_add(vq, zio);
+ nio = vdev_queue_io_to_issue(vq);
+ mutex_exit(&vq->vq_lock);
+
+ if (nio == NULL)
+ return (NULL);
+
+ if (nio->io_done == vdev_queue_agg_io_done) {
+ zio_nowait(nio);
+ return (NULL);
+ }
+
+ return (nio);
+}
+
+void
+vdev_queue_io_done(zio_t *zio)
+{
+ vdev_queue_t *vq = &zio->io_vd->vdev_queue;
+ zio_t *nio;
+
+ mutex_enter(&vq->vq_lock);
+
+ vdev_queue_pending_remove(vq, zio);
+
+ zio->io_delta = gethrtime() - zio->io_timestamp;
+ vq->vq_io_complete_ts = gethrtime();
+ vq->vq_io_delta_ts = vq->vq_io_complete_ts - zio->io_timestamp;
+
+ while ((nio = vdev_queue_io_to_issue(vq)) != NULL) {
+ mutex_exit(&vq->vq_lock);
+ if (nio->io_done == vdev_queue_agg_io_done) {
+ zio_nowait(nio);
+ } else {
+ zio_vdev_io_reissue(nio);
+ zio_execute(nio);
+ }
+ mutex_enter(&vq->vq_lock);
+ }
+
+ mutex_exit(&vq->vq_lock);
+}
+
+void
+vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority)
+{
+ vdev_queue_t *vq = &zio->io_vd->vdev_queue;
+ avl_tree_t *tree;
+
+ /*
+ * ZIO_PRIORITY_NOW is used by the vdev cache code and the aggregate zio
+ * code to issue IOs without adding them to the vdev queue. In this
+ * case, the zio is already going to be issued as quickly as possible
+ * and so it doesn't need any reprioritization to help.
+ */
+ if (zio->io_priority == ZIO_PRIORITY_NOW)
+ return;
+
+ ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
+ ASSERT3U(priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
+
+ if (zio->io_type == ZIO_TYPE_READ) {
+ if (priority != ZIO_PRIORITY_SYNC_READ &&
+ priority != ZIO_PRIORITY_ASYNC_READ &&
+ priority != ZIO_PRIORITY_SCRUB)
+ priority = ZIO_PRIORITY_ASYNC_READ;
+ } else {
+ ASSERT(zio->io_type == ZIO_TYPE_WRITE);
+ if (priority != ZIO_PRIORITY_SYNC_WRITE &&
+ priority != ZIO_PRIORITY_ASYNC_WRITE)
+ priority = ZIO_PRIORITY_ASYNC_WRITE;
+ }
+
+ mutex_enter(&vq->vq_lock);
+
+ /*
+ * If the zio is in none of the queues we can simply change
+ * the priority. If the zio is waiting to be submitted we must
+ * remove it from the queue and re-insert it with the new priority.
+ * Otherwise, the zio is currently active and we cannot change its
+ * priority.
+ */
+ tree = vdev_queue_class_tree(vq, zio->io_priority);
+ if (avl_find(tree, zio, NULL) == zio) {
+ avl_remove(vdev_queue_class_tree(vq, zio->io_priority), zio);
+ zio->io_priority = priority;
+ avl_add(vdev_queue_class_tree(vq, zio->io_priority), zio);
+ } else if (avl_find(&vq->vq_active_tree, zio, NULL) != zio) {
+ zio->io_priority = priority;
+ }
+
+ mutex_exit(&vq->vq_lock);
+}
+
+/*
+ * As these two methods are only used for load calculations, we're not
+ * concerned if we get an incorrect value on 32-bit platforms due to the lack
+ * of vq_lock mutex use here; instead, we prefer to keep it lock free for
+ * performance.
+ */
+int
+vdev_queue_length(vdev_t *vd)
+{
+ return (avl_numnodes(&vd->vdev_queue.vq_active_tree));
+}
+
+uint64_t
+vdev_queue_last_offset(vdev_t *vd)
+{
+ return (vd->vdev_queue.vq_last_offset);
+}
+
+/* BEGIN CSTYLED */
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, aggregation_limit, INT, ZMOD_RW,
+ "Max vdev I/O aggregation size");
+
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, aggregation_limit_non_rotating, INT, ZMOD_RW,
+ "Max vdev I/O aggregation size for non-rotating media");
+
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, aggregate_trim, INT, ZMOD_RW,
+ "Allow TRIM I/O to be aggregated");
+
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, read_gap_limit, INT, ZMOD_RW,
+ "Aggregate read I/O over gap");
+
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, write_gap_limit, INT, ZMOD_RW,
+ "Aggregate write I/O over gap");
+
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, max_active, INT, ZMOD_RW,
+ "Maximum number of active I/Os per vdev");
+
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, async_write_active_max_dirty_percent, INT, ZMOD_RW,
+ "Async write concurrency max threshold");
+
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, async_write_active_min_dirty_percent, INT, ZMOD_RW,
+ "Async write concurrency min threshold");
+
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, async_read_max_active, INT, ZMOD_RW,
+ "Max active async read I/Os per vdev");
+
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, async_read_min_active, INT, ZMOD_RW,
+ "Min active async read I/Os per vdev");
+
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, async_write_max_active, INT, ZMOD_RW,
+ "Max active async write I/Os per vdev");
+
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, async_write_min_active, INT, ZMOD_RW,
+ "Min active async write I/Os per vdev");
+
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, initializing_max_active, INT, ZMOD_RW,
+ "Max active initializing I/Os per vdev");
+
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, initializing_min_active, INT, ZMOD_RW,
+ "Min active initializing I/Os per vdev");
+
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, removal_max_active, INT, ZMOD_RW,
+ "Max active removal I/Os per vdev");
+
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, removal_min_active, INT, ZMOD_RW,
+ "Min active removal I/Os per vdev");
+
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, scrub_max_active, INT, ZMOD_RW,
+ "Max active scrub I/Os per vdev");
+
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, scrub_min_active, INT, ZMOD_RW,
+ "Min active scrub I/Os per vdev");
+
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, sync_read_max_active, INT, ZMOD_RW,
+ "Max active sync read I/Os per vdev");
+
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, sync_read_min_active, INT, ZMOD_RW,
+ "Min active sync read I/Os per vdev");
+
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, sync_write_max_active, INT, ZMOD_RW,
+ "Max active sync write I/Os per vdev");
+
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, sync_write_min_active, INT, ZMOD_RW,
+ "Min active sync write I/Os per vdev");
+
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, trim_max_active, INT, ZMOD_RW,
+ "Max active trim/discard I/Os per vdev");
+
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, trim_min_active, INT, ZMOD_RW,
+ "Min active trim/discard I/Os per vdev");
+
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, rebuild_max_active, INT, ZMOD_RW,
+ "Max active rebuild I/Os per vdev");
+
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, rebuild_min_active, INT, ZMOD_RW,
+ "Min active rebuild I/Os per vdev");
+
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, nia_credit, INT, ZMOD_RW,
+ "Number of non-interactive I/Os to allow in sequence");
+
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, nia_delay, INT, ZMOD_RW,
+ "Number of non-interactive I/Os before _max_active");
+
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, queue_depth_pct, INT, ZMOD_RW,
+ "Queue depth percentage for each top-level vdev");
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/zfs/vdev_raidz.c b/sys/contrib/openzfs/module/zfs/vdev_raidz.c
new file mode 100644
index 000000000000..f4812e61252c
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/vdev_raidz.c
@@ -0,0 +1,2747 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
+ * Copyright (c) 2016 Gvozden Nešković. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/vdev_impl.h>
+#include <sys/zio.h>
+#include <sys/zio_checksum.h>
+#include <sys/abd.h>
+#include <sys/fs/zfs.h>
+#include <sys/fm/fs/zfs.h>
+#include <sys/vdev_raidz.h>
+#include <sys/vdev_raidz_impl.h>
+#include <sys/vdev_draid.h>
+
+#ifdef ZFS_DEBUG
+#include <sys/vdev.h> /* For vdev_xlate() in vdev_raidz_io_verify() */
+#endif
+
+/*
+ * Virtual device vector for RAID-Z.
+ *
+ * This vdev supports single, double, and triple parity. For single parity,
+ * we use a simple XOR of all the data columns. For double or triple parity,
+ * we use a special case of Reed-Solomon coding. This extends the
+ * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
+ * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
+ * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
+ * former is also based. The latter is designed to provide higher performance
+ * for writes.
+ *
+ * Note that the Plank paper claimed to support arbitrary N+M, but was then
+ * amended six years later identifying a critical flaw that invalidates its
+ * claims. Nevertheless, the technique can be adapted to work for up to
+ * triple parity. For additional parity, the amendment "Note: Correction to
+ * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
+ * is viable, but the additional complexity means that write performance will
+ * suffer.
+ *
+ * All of the methods above operate on a Galois field with 2^N elements.
+ * In our case we choose N=8, i.e. GF(2^8), so that all elements
+ * can be expressed with a single byte. Briefly, the operations on the
+ * field are defined as follows:
+ *
+ * o addition (+) is represented by a bitwise XOR
+ * o subtraction (-) is therefore identical to addition: A + B = A - B
+ * o multiplication of A by 2 is defined by the following bitwise expression:
+ *
+ * (A * 2)_7 = A_6
+ * (A * 2)_6 = A_5
+ * (A * 2)_5 = A_4
+ * (A * 2)_4 = A_3 + A_7
+ * (A * 2)_3 = A_2 + A_7
+ * (A * 2)_2 = A_1 + A_7
+ * (A * 2)_1 = A_0
+ * (A * 2)_0 = A_7
+ *
+ * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
+ * As an aside, this multiplication is derived from the error correcting
+ * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
+ *
+ * Observe that any number in the field (except for 0) can be expressed as a
+ * power of 2 -- a generator for the field. We store a table of the powers of
+ * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
+ * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
+ * than field addition). The inverse of a field element A (A^-1) is therefore
+ * A ^ (255 - 1) = A^254.
+ *
+ * The up-to-three parity columns, P, Q, R over several data columns,
+ * D_0, ... D_n-1, can be expressed by field operations:
+ *
+ * P = D_0 + D_1 + ... + D_n-2 + D_n-1
+ * Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
+ * = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
+ * R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
+ * = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
+ *
+ * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial
+ * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
+ * independent coefficients. (There are no additional coefficients that have
+ * this property which is why the uncorrected Plank method breaks down.)
+ *
+ * See the reconstruction code below for how P, Q and R can be used individually
+ * or in concert to recover missing data columns.
+ */
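+
+/*
+ * Tiny worked example of the parity math above, for three data bytes
+ * D_0 = 0x01, D_1 = 0x02, D_2 = 0x03 (+ is XOR, * 2 is the field
+ * doubling defined below):
+ *
+ *   P = 0x01 + 0x02 + 0x03 = 0x00
+ *   Q = (D_0 * 2 + D_1) * 2 + D_2 = (0x02 + 0x02) * 2 + 0x03 = 0x03
+ */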
+
+#define VDEV_RAIDZ_P 0
+#define VDEV_RAIDZ_Q 1
+#define VDEV_RAIDZ_R 2
+
+#define VDEV_RAIDZ_MUL_2(x) (((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
+#define VDEV_RAIDZ_MUL_4(x) (VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
+
+/*
+ * We provide a mechanism to perform the field multiplication operation on a
+ * 64-bit value all at once rather than a byte at a time. This works by
+ * creating a mask from the top bit in each byte and using that to
+ * conditionally apply the XOR of 0x1d.
+ */
+#define VDEV_RAIDZ_64MUL_2(x, mask) \
+{ \
+ (mask) = (x) & 0x8080808080808080ULL; \
+ (mask) = ((mask) << 1) - ((mask) >> 7); \
+ (x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
+ ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
+}
+
+#define VDEV_RAIDZ_64MUL_4(x, mask) \
+{ \
+ VDEV_RAIDZ_64MUL_2((x), mask); \
+ VDEV_RAIDZ_64MUL_2((x), mask); \
+}
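+
+/*
+ * For a single byte the doubling reduces to the C expression given above,
+ * e.g. 0x53 * 2 = 0xa6 (top bit clear, plain shift) while
+ * 0x80 * 2 = 0x00 ^ 0x1d = 0x1d (top bit set, so the reduction constant
+ * 0x1d is folded in). The 64-bit macros do the same for eight bytes at
+ * once: the mask expands each byte's top bit into 0xff or 0x00, selecting
+ * which byte lanes get XORed with 0x1d after the shift.
+ */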
+
+static void
+vdev_raidz_row_free(raidz_row_t *rr)
+{
+ for (int c = 0; c < rr->rr_cols; c++) {
+ raidz_col_t *rc = &rr->rr_col[c];
+
+ if (rc->rc_size != 0)
+ abd_free(rc->rc_abd);
+ if (rc->rc_gdata != NULL)
+ abd_free(rc->rc_gdata);
+ if (rc->rc_orig_data != NULL)
+ zio_buf_free(rc->rc_orig_data, rc->rc_size);
+ }
+
+ if (rr->rr_abd_copy != NULL)
+ abd_free(rr->rr_abd_copy);
+
+ if (rr->rr_abd_empty != NULL)
+ abd_free(rr->rr_abd_empty);
+
+ kmem_free(rr, offsetof(raidz_row_t, rr_col[rr->rr_scols]));
+}
+
+void
+vdev_raidz_map_free(raidz_map_t *rm)
+{
+ for (int i = 0; i < rm->rm_nrows; i++)
+ vdev_raidz_row_free(rm->rm_row[i]);
+
+ kmem_free(rm, offsetof(raidz_map_t, rm_row[rm->rm_nrows]));
+}
+
+static void
+vdev_raidz_map_free_vsd(zio_t *zio)
+{
+ raidz_map_t *rm = zio->io_vsd;
+
+ ASSERT0(rm->rm_freed);
+ rm->rm_freed = B_TRUE;
+
+ if (rm->rm_reports == 0) {
+ vdev_raidz_map_free(rm);
+ }
+}
+
+/*ARGSUSED*/
+static void
+vdev_raidz_cksum_free(void *arg, size_t ignored)
+{
+ raidz_map_t *rm = arg;
+
+ ASSERT3U(rm->rm_reports, >, 0);
+
+ if (--rm->rm_reports == 0 && rm->rm_freed)
+ vdev_raidz_map_free(rm);
+}
+
+static void
+vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const abd_t *good_data)
+{
+ raidz_map_t *rm = zcr->zcr_cbdata;
+ const size_t c = zcr->zcr_cbinfo;
+ size_t x, offset;
+
+ if (good_data == NULL) {
+ zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
+ return;
+ }
+
+ ASSERT3U(rm->rm_nrows, ==, 1);
+ raidz_row_t *rr = rm->rm_row[0];
+
+ const abd_t *good = NULL;
+ const abd_t *bad = rr->rr_col[c].rc_abd;
+
+ if (c < rr->rr_firstdatacol) {
+ /*
+ * The first time through, calculate the parity blocks for
+ * the good data (this relies on the fact that the good
+ * data never changes for a given logical ZIO)
+ */
+ if (rr->rr_col[0].rc_gdata == NULL) {
+ abd_t *bad_parity[VDEV_RAIDZ_MAXPARITY];
+
+ /*
+ * Set up the rr_col[]s to generate the parity for
+ * good_data, first saving the parity bufs and
+ * replacing them with buffers to hold the result.
+ */
+ for (x = 0; x < rr->rr_firstdatacol; x++) {
+ bad_parity[x] = rr->rr_col[x].rc_abd;
+ rr->rr_col[x].rc_abd = rr->rr_col[x].rc_gdata =
+ abd_alloc_sametype(rr->rr_col[x].rc_abd,
+ rr->rr_col[x].rc_size);
+ }
+
+ /* fill in the data columns from good_data */
+ offset = 0;
+ for (; x < rr->rr_cols; x++) {
+ abd_free(rr->rr_col[x].rc_abd);
+
+ rr->rr_col[x].rc_abd =
+ abd_get_offset_size((abd_t *)good_data,
+ offset, rr->rr_col[x].rc_size);
+ offset += rr->rr_col[x].rc_size;
+ }
+
+ /*
+ * Construct the parity from the good data.
+ */
+ vdev_raidz_generate_parity_row(rm, rr);
+
+ /* restore everything back to its original state */
+ for (x = 0; x < rr->rr_firstdatacol; x++)
+ rr->rr_col[x].rc_abd = bad_parity[x];
+
+ offset = 0;
+ for (x = rr->rr_firstdatacol; x < rr->rr_cols; x++) {
+ abd_free(rr->rr_col[x].rc_abd);
+ rr->rr_col[x].rc_abd = abd_get_offset_size(
+ rr->rr_abd_copy, offset,
+ rr->rr_col[x].rc_size);
+ offset += rr->rr_col[x].rc_size;
+ }
+ }
+
+ ASSERT3P(rr->rr_col[c].rc_gdata, !=, NULL);
+ good = abd_get_offset_size(rr->rr_col[c].rc_gdata, 0,
+ rr->rr_col[c].rc_size);
+ } else {
+ /* adjust good_data to point at the start of our column */
+ offset = 0;
+ for (x = rr->rr_firstdatacol; x < c; x++)
+ offset += rr->rr_col[x].rc_size;
+
+ good = abd_get_offset_size((abd_t *)good_data, offset,
+ rr->rr_col[c].rc_size);
+ }
+
+ /* we drop the ereport if it ends up that the data was good */
+ zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE);
+ abd_free((abd_t *)good);
+}
+
+/*
+ * Invoked indirectly by zfs_ereport_start_checksum(), called
+ * below when our read operation fails completely. The main point
+ * is to keep a copy of everything we read from disk, so that at
+ * vdev_raidz_cksum_finish() time we can compare it with the good data.
+ */
+static void
+vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
+{
+ size_t c = (size_t)(uintptr_t)arg;
+ raidz_map_t *rm = zio->io_vsd;
+
+ /* set up the report and bump the refcount */
+ zcr->zcr_cbdata = rm;
+ zcr->zcr_cbinfo = c;
+ zcr->zcr_finish = vdev_raidz_cksum_finish;
+ zcr->zcr_free = vdev_raidz_cksum_free;
+
+ rm->rm_reports++;
+ ASSERT3U(rm->rm_reports, >, 0);
+ ASSERT3U(rm->rm_nrows, ==, 1);
+
+ if (rm->rm_row[0]->rr_abd_copy != NULL)
+ return;
+
+ /*
+ * It's the first time we're called for this raidz_map_t, so we need
+ * to copy the data aside; there's no guarantee that our zio's buffer
+ * won't be re-used for something else.
+ *
+ * Our parity data is already in separate buffers, so there's no need
+ * to copy them.
+ */
+ for (int i = 0; i < rm->rm_nrows; i++) {
+ raidz_row_t *rr = rm->rm_row[i];
+ size_t offset = 0;
+ size_t size = 0;
+
+ for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++)
+ size += rr->rr_col[c].rc_size;
+
+ rr->rr_abd_copy = abd_alloc_for_io(size, B_FALSE);
+
+ for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
+ raidz_col_t *col = &rr->rr_col[c];
+ abd_t *tmp = abd_get_offset_size(rr->rr_abd_copy,
+ offset, col->rc_size);
+
+ abd_copy(tmp, col->rc_abd, col->rc_size);
+
+ abd_free(col->rc_abd);
+ col->rc_abd = tmp;
+
+ offset += col->rc_size;
+ }
+ ASSERT3U(offset, ==, size);
+ }
+}
+
+static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
+ .vsd_free = vdev_raidz_map_free_vsd,
+ .vsd_cksum_report = vdev_raidz_cksum_report
+};
+
+/*
+ * Divides the IO evenly across all child vdevs; usually, dcols is
+ * the number of children in the target vdev.
+ *
+ * Avoid inlining the function to keep vdev_raidz_io_start(), which
+ * is this function's only caller, as small as possible on the stack.
+ */
+noinline raidz_map_t *
+vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols,
+ uint64_t nparity)
+{
+ raidz_row_t *rr;
+ /* The starting RAIDZ (parent) vdev sector of the block. */
+ uint64_t b = zio->io_offset >> ashift;
+ /* The zio's size in units of the vdev's minimum sector size. */
+ uint64_t s = zio->io_size >> ashift;
+ /* The first column for this stripe. */
+ uint64_t f = b % dcols;
+ /* The starting byte offset on each child vdev. */
+ uint64_t o = (b / dcols) << ashift;
+ uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
+
+ raidz_map_t *rm =
+ kmem_zalloc(offsetof(raidz_map_t, rm_row[1]), KM_SLEEP);
+ rm->rm_nrows = 1;
+
+ /*
+ * "Quotient": The number of data sectors for this stripe on all but
+ * the "big column" child vdevs that also contain "remainder" data.
+ */
+ q = s / (dcols - nparity);
+
+ /*
+ * "Remainder": The number of partial stripe data sectors in this I/O.
+ * This will add a sector to some, but not all, child vdevs.
+ */
+ r = s - q * (dcols - nparity);
+
+ /* The number of "big columns" - those which contain remainder data. */
+ bc = (r == 0 ? 0 : r + nparity);
+
+ /*
+ * The total number of data and parity sectors associated with
+ * this I/O.
+ */
+ tot = s + nparity * (q + (r == 0 ? 0 : 1));
+
+ /*
+ * acols: The columns that will be accessed.
+ * scols: The columns that will be accessed or skipped.
+ */
+ if (q == 0) {
+ /* Our I/O request doesn't span all child vdevs. */
+ acols = bc;
+ scols = MIN(dcols, roundup(bc, nparity + 1));
+ } else {
+ acols = dcols;
+ scols = dcols;
+ }
+
+ ASSERT3U(acols, <=, scols);
+
+ rr = kmem_alloc(offsetof(raidz_row_t, rr_col[scols]), KM_SLEEP);
+ rm->rm_row[0] = rr;
+
+ rr->rr_cols = acols;
+ rr->rr_scols = scols;
+ rr->rr_bigcols = bc;
+ rr->rr_missingdata = 0;
+ rr->rr_missingparity = 0;
+ rr->rr_firstdatacol = nparity;
+ rr->rr_abd_copy = NULL;
+ rr->rr_abd_empty = NULL;
+ rr->rr_nempty = 0;
+#ifdef ZFS_DEBUG
+ rr->rr_offset = zio->io_offset;
+ rr->rr_size = zio->io_size;
+#endif
+
+ asize = 0;
+
+ for (c = 0; c < scols; c++) {
+ raidz_col_t *rc = &rr->rr_col[c];
+ col = f + c;
+ coff = o;
+ if (col >= dcols) {
+ col -= dcols;
+ coff += 1ULL << ashift;
+ }
+ rc->rc_devidx = col;
+ rc->rc_offset = coff;
+ rc->rc_abd = NULL;
+ rc->rc_gdata = NULL;
+ rc->rc_orig_data = NULL;
+ rc->rc_error = 0;
+ rc->rc_tried = 0;
+ rc->rc_skipped = 0;
+ rc->rc_repair = 0;
+ rc->rc_need_orig_restore = B_FALSE;
+
+ if (c >= acols)
+ rc->rc_size = 0;
+ else if (c < bc)
+ rc->rc_size = (q + 1) << ashift;
+ else
+ rc->rc_size = q << ashift;
+
+ asize += rc->rc_size;
+ }
+
+ ASSERT3U(asize, ==, tot << ashift);
+ rm->rm_nskip = roundup(tot, nparity + 1) - tot;
+ rm->rm_skipstart = bc;
+
+ for (c = 0; c < rr->rr_firstdatacol; c++)
+ rr->rr_col[c].rc_abd =
+ abd_alloc_linear(rr->rr_col[c].rc_size, B_FALSE);
+
+ for (uint64_t off = 0; c < acols; c++) {
+ raidz_col_t *rc = &rr->rr_col[c];
+ rc->rc_abd = abd_get_offset_struct(&rc->rc_abdstruct,
+ zio->io_abd, off, rc->rc_size);
+ off += rc->rc_size;
+ }
+
+ /*
+ * If all data stored spans all columns, there's a danger that parity
+ * will always be on the same device and, since parity isn't read
+ * during normal operation, that device's I/O bandwidth won't be
+ * used effectively. We therefore switch the parity every 1MB.
+ *
+ * ... at least that was, ostensibly, the theory. As a practical
+ * matter unless we juggle the parity between all devices evenly, we
+ * won't see any benefit. Further, occasional writes that aren't a
+ * multiple of the LCM of the number of children and the minimum
+ * stripe width are sufficient to avoid pessimal behavior.
+ * Unfortunately, this decision created an implicit on-disk format
+ * requirement that we need to support for all eternity, but only
+ * for single-parity RAID-Z.
+ *
+ * If we intend to skip a sector in the zeroth column for padding
+ * we must make sure to note this swap. We will never intend to
+ * skip the first column since at least one data and one parity
+ * column must appear in each row.
+ */
+ ASSERT(rr->rr_cols >= 2);
+ ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size);
+
+ if (rr->rr_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) {
+ devidx = rr->rr_col[0].rc_devidx;
+ o = rr->rr_col[0].rc_offset;
+ rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx;
+ rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset;
+ rr->rr_col[1].rc_devidx = devidx;
+ rr->rr_col[1].rc_offset = o;
+
+ if (rm->rm_skipstart == 0)
+ rm->rm_skipstart = 1;
+ }
+
+ /* init RAIDZ parity ops */
+ rm->rm_ops = vdev_raidz_math_get_ops();
+
+ return (rm);
+}
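+
+/*
+ * Worked example of the geometry above (illustrative values): a raidz1
+ * vdev with dcols = 5 children, ashift = 12 (4K sectors) and a 20K zio
+ * gives s = 5, q = 5 / 4 = 1, r = 1, bc = r + nparity = 2 and
+ * tot = 5 + 1 * (1 + 1) = 7 sectors. The two "big columns" get
+ * (q + 1) << ashift = 8K each and the remaining three columns 4K each,
+ * so asize = 28K = tot << ashift, with rm_nskip = roundup(7, 2) - 7 = 1
+ * skipped sector starting at column rm_skipstart = bc = 2.
+ */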
+
+struct pqr_struct {
+ uint64_t *p;
+ uint64_t *q;
+ uint64_t *r;
+};
+
+static int
+vdev_raidz_p_func(void *buf, size_t size, void *private)
+{
+ struct pqr_struct *pqr = private;
+ const uint64_t *src = buf;
+ int i, cnt = size / sizeof (src[0]);
+
+ ASSERT(pqr->p && !pqr->q && !pqr->r);
+
+ for (i = 0; i < cnt; i++, src++, pqr->p++)
+ *pqr->p ^= *src;
+
+ return (0);
+}
+
+static int
+vdev_raidz_pq_func(void *buf, size_t size, void *private)
+{
+ struct pqr_struct *pqr = private;
+ const uint64_t *src = buf;
+ uint64_t mask;
+ int i, cnt = size / sizeof (src[0]);
+
+ ASSERT(pqr->p && pqr->q && !pqr->r);
+
+ for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++) {
+ *pqr->p ^= *src;
+ VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
+ *pqr->q ^= *src;
+ }
+
+ return (0);
+}
+
+static int
+vdev_raidz_pqr_func(void *buf, size_t size, void *private)
+{
+ struct pqr_struct *pqr = private;
+ const uint64_t *src = buf;
+ uint64_t mask;
+ int i, cnt = size / sizeof (src[0]);
+
+ ASSERT(pqr->p && pqr->q && pqr->r);
+
+ for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++, pqr->r++) {
+ *pqr->p ^= *src;
+ VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
+ *pqr->q ^= *src;
+ VDEV_RAIDZ_64MUL_4(*pqr->r, mask);
+ *pqr->r ^= *src;
+ }
+
+ return (0);
+}
+
+static void
+vdev_raidz_generate_parity_p(raidz_row_t *rr)
+{
+ uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
+
+ for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
+ abd_t *src = rr->rr_col[c].rc_abd;
+
+ if (c == rr->rr_firstdatacol) {
+ abd_copy_to_buf(p, src, rr->rr_col[c].rc_size);
+ } else {
+ struct pqr_struct pqr = { p, NULL, NULL };
+ (void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size,
+ vdev_raidz_p_func, &pqr);
+ }
+ }
+}
+
+static void
+vdev_raidz_generate_parity_pq(raidz_row_t *rr)
+{
+ uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
+ uint64_t *q = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
+ uint64_t pcnt = rr->rr_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
+ ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size ==
+ rr->rr_col[VDEV_RAIDZ_Q].rc_size);
+
+ for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
+ abd_t *src = rr->rr_col[c].rc_abd;
+
+ uint64_t ccnt = rr->rr_col[c].rc_size / sizeof (p[0]);
+
+ if (c == rr->rr_firstdatacol) {
+ ASSERT(ccnt == pcnt || ccnt == 0);
+ abd_copy_to_buf(p, src, rr->rr_col[c].rc_size);
+ (void) memcpy(q, p, rr->rr_col[c].rc_size);
+
+ for (uint64_t i = ccnt; i < pcnt; i++) {
+ p[i] = 0;
+ q[i] = 0;
+ }
+ } else {
+ struct pqr_struct pqr = { p, q, NULL };
+
+ ASSERT(ccnt <= pcnt);
+ (void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size,
+ vdev_raidz_pq_func, &pqr);
+
+ /*
+ * Treat short columns as though they are full of 0s.
+ * Note that there's therefore nothing needed for P.
+ */
+ uint64_t mask;
+ for (uint64_t i = ccnt; i < pcnt; i++) {
+ VDEV_RAIDZ_64MUL_2(q[i], mask);
+ }
+ }
+ }
+}
+
+static void
+vdev_raidz_generate_parity_pqr(raidz_row_t *rr)
+{
+ uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
+ uint64_t *q = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
+ uint64_t *r = abd_to_buf(rr->rr_col[VDEV_RAIDZ_R].rc_abd);
+ uint64_t pcnt = rr->rr_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
+ ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size ==
+ rr->rr_col[VDEV_RAIDZ_Q].rc_size);
+ ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size ==
+ rr->rr_col[VDEV_RAIDZ_R].rc_size);
+
+ for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
+ abd_t *src = rr->rr_col[c].rc_abd;
+
+ uint64_t ccnt = rr->rr_col[c].rc_size / sizeof (p[0]);
+
+ if (c == rr->rr_firstdatacol) {
+ ASSERT(ccnt == pcnt || ccnt == 0);
+ abd_copy_to_buf(p, src, rr->rr_col[c].rc_size);
+ (void) memcpy(q, p, rr->rr_col[c].rc_size);
+ (void) memcpy(r, p, rr->rr_col[c].rc_size);
+
+ for (uint64_t i = ccnt; i < pcnt; i++) {
+ p[i] = 0;
+ q[i] = 0;
+ r[i] = 0;
+ }
+ } else {
+ struct pqr_struct pqr = { p, q, r };
+
+ ASSERT(ccnt <= pcnt);
+ (void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size,
+ vdev_raidz_pqr_func, &pqr);
+
+ /*
+ * Treat short columns as though they are full of 0s.
+ * Note that there's therefore nothing needed for P.
+ */
+ uint64_t mask;
+ for (uint64_t i = ccnt; i < pcnt; i++) {
+ VDEV_RAIDZ_64MUL_2(q[i], mask);
+ VDEV_RAIDZ_64MUL_4(r[i], mask);
+ }
+ }
+ }
+}
+
+/*
+ * Generate RAID parity in the first virtual columns according to the number of
+ * parity columns available.
+ */
+void
+vdev_raidz_generate_parity_row(raidz_map_t *rm, raidz_row_t *rr)
+{
+ ASSERT3U(rr->rr_cols, !=, 0);
+
+ /* Generate using the new math implementation */
+ if (vdev_raidz_math_generate(rm, rr) != RAIDZ_ORIGINAL_IMPL)
+ return;
+
+ switch (rr->rr_firstdatacol) {
+ case 1:
+ vdev_raidz_generate_parity_p(rr);
+ break;
+ case 2:
+ vdev_raidz_generate_parity_pq(rr);
+ break;
+ case 3:
+ vdev_raidz_generate_parity_pqr(rr);
+ break;
+ default:
+ cmn_err(CE_PANIC, "invalid RAID-Z configuration");
+ }
+}
+
+void
+vdev_raidz_generate_parity(raidz_map_t *rm)
+{
+ for (int i = 0; i < rm->rm_nrows; i++) {
+ raidz_row_t *rr = rm->rm_row[i];
+ vdev_raidz_generate_parity_row(rm, rr);
+ }
+}
+
+/* ARGSUSED */
+static int
+vdev_raidz_reconst_p_func(void *dbuf, void *sbuf, size_t size, void *private)
+{
+ uint64_t *dst = dbuf;
+ uint64_t *src = sbuf;
+ int cnt = size / sizeof (src[0]);
+
+ for (int i = 0; i < cnt; i++) {
+ dst[i] ^= src[i];
+ }
+
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+vdev_raidz_reconst_q_pre_func(void *dbuf, void *sbuf, size_t size,
+ void *private)
+{
+ uint64_t *dst = dbuf;
+ uint64_t *src = sbuf;
+ uint64_t mask;
+ int cnt = size / sizeof (dst[0]);
+
+ for (int i = 0; i < cnt; i++, dst++, src++) {
+ VDEV_RAIDZ_64MUL_2(*dst, mask);
+ *dst ^= *src;
+ }
+
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+vdev_raidz_reconst_q_pre_tail_func(void *buf, size_t size, void *private)
+{
+ uint64_t *dst = buf;
+ uint64_t mask;
+ int cnt = size / sizeof (dst[0]);
+
+ for (int i = 0; i < cnt; i++, dst++) {
+ /* same operation as vdev_raidz_reconst_q_pre_func() on dst */
+ VDEV_RAIDZ_64MUL_2(*dst, mask);
+ }
+
+ return (0);
+}
+
+struct reconst_q_struct {
+ uint64_t *q;
+ int exp;
+};
+
+static int
+vdev_raidz_reconst_q_post_func(void *buf, size_t size, void *private)
+{
+ struct reconst_q_struct *rq = private;
+ uint64_t *dst = buf;
+ int cnt = size / sizeof (dst[0]);
+
+ for (int i = 0; i < cnt; i++, dst++, rq->q++) {
+ int j;
+ uint8_t *b;
+
+ *dst ^= *rq->q;
+ for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
+ *b = vdev_raidz_exp2(*b, rq->exp);
+ }
+ }
+
+ return (0);
+}
+
+struct reconst_pq_struct {
+ uint8_t *p;
+ uint8_t *q;
+ uint8_t *pxy;
+ uint8_t *qxy;
+ int aexp;
+ int bexp;
+};
+
+static int
+vdev_raidz_reconst_pq_func(void *xbuf, void *ybuf, size_t size, void *private)
+{
+ struct reconst_pq_struct *rpq = private;
+ uint8_t *xd = xbuf;
+ uint8_t *yd = ybuf;
+
+ for (int i = 0; i < size;
+ i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++, yd++) {
+ *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
+ vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
+ *yd = *rpq->p ^ *rpq->pxy ^ *xd;
+ }
+
+ return (0);
+}
+
+static int
+vdev_raidz_reconst_pq_tail_func(void *xbuf, size_t size, void *private)
+{
+ struct reconst_pq_struct *rpq = private;
+ uint8_t *xd = xbuf;
+
+ for (int i = 0; i < size;
+ i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++) {
+ /* same operation as vdev_raidz_reconst_pq_func() on xd */
+ *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
+ vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
+ }
+
+ return (0);
+}
+
+static int
+vdev_raidz_reconstruct_p(raidz_row_t *rr, int *tgts, int ntgts)
+{
+ int x = tgts[0];
+ abd_t *dst, *src;
+
+ ASSERT3U(ntgts, ==, 1);
+ ASSERT3U(x, >=, rr->rr_firstdatacol);
+ ASSERT3U(x, <, rr->rr_cols);
+
+ ASSERT3U(rr->rr_col[x].rc_size, <=, rr->rr_col[VDEV_RAIDZ_P].rc_size);
+
+ src = rr->rr_col[VDEV_RAIDZ_P].rc_abd;
+ dst = rr->rr_col[x].rc_abd;
+
+ abd_copy_from_buf(dst, abd_to_buf(src), rr->rr_col[x].rc_size);
+
+ for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
+ uint64_t size = MIN(rr->rr_col[x].rc_size,
+ rr->rr_col[c].rc_size);
+
+ src = rr->rr_col[c].rc_abd;
+
+ if (c == x)
+ continue;
+
+ (void) abd_iterate_func2(dst, src, 0, 0, size,
+ vdev_raidz_reconst_p_func, NULL);
+ }
+
+ return (1 << VDEV_RAIDZ_P);
+}
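+
+/*
+ * In other words, with a single missing data column x, P makes
+ * reconstruction a plain XOR: D_x = P + D_0 + ... (all surviving data
+ * columns), which is what the loop above computes. Continuing the worked
+ * parity example above (D_0 = 0x01, D_1 = 0x02, D_2 = 0x03, P = 0x00):
+ * if D_1 is lost, D_1 = P + D_0 + D_2 = 0x00 ^ 0x01 ^ 0x03 = 0x02.
+ */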
+
+static int
+vdev_raidz_reconstruct_q(raidz_row_t *rr, int *tgts, int ntgts)
+{
+ int x = tgts[0];
+ int c, exp;
+ abd_t *dst, *src;
+
+ ASSERT(ntgts == 1);
+
+ ASSERT(rr->rr_col[x].rc_size <= rr->rr_col[VDEV_RAIDZ_Q].rc_size);
+
+ for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
+ uint64_t size = (c == x) ? 0 : MIN(rr->rr_col[x].rc_size,
+ rr->rr_col[c].rc_size);
+
+ src = rr->rr_col[c].rc_abd;
+ dst = rr->rr_col[x].rc_abd;
+
+ if (c == rr->rr_firstdatacol) {
+ abd_copy(dst, src, size);
+ if (rr->rr_col[x].rc_size > size) {
+ abd_zero_off(dst, size,
+ rr->rr_col[x].rc_size - size);
+ }
+ } else {
+ ASSERT3U(size, <=, rr->rr_col[x].rc_size);
+ (void) abd_iterate_func2(dst, src, 0, 0, size,
+ vdev_raidz_reconst_q_pre_func, NULL);
+ (void) abd_iterate_func(dst,
+ size, rr->rr_col[x].rc_size - size,
+ vdev_raidz_reconst_q_pre_tail_func, NULL);
+ }
+ }
+
+ src = rr->rr_col[VDEV_RAIDZ_Q].rc_abd;
+ dst = rr->rr_col[x].rc_abd;
+ exp = 255 - (rr->rr_cols - 1 - x);
+
+ struct reconst_q_struct rq = { abd_to_buf(src), exp };
+ (void) abd_iterate_func(dst, 0, rr->rr_col[x].rc_size,
+ vdev_raidz_reconst_q_post_func, &rq);
+
+ return (1 << VDEV_RAIDZ_Q);
+}
+
+static int
+vdev_raidz_reconstruct_pq(raidz_row_t *rr, int *tgts, int ntgts)
+{
+ uint8_t *p, *q, *pxy, *qxy, tmp, a, b, aexp, bexp;
+ abd_t *pdata, *qdata;
+ uint64_t xsize, ysize;
+ int x = tgts[0];
+ int y = tgts[1];
+ abd_t *xd, *yd;
+
+ ASSERT(ntgts == 2);
+ ASSERT(x < y);
+ ASSERT(x >= rr->rr_firstdatacol);
+ ASSERT(y < rr->rr_cols);
+
+ ASSERT(rr->rr_col[x].rc_size >= rr->rr_col[y].rc_size);
+
+ /*
+ * Move the parity data aside -- we're going to compute parity as
+ * though columns x and y were full of zeros -- Pxy and Qxy. We want to
+ * reuse the parity generation mechanism without trashing the actual
+ * parity so we make those columns appear to be full of zeros by
+ * setting their lengths to zero.
+ */
+ pdata = rr->rr_col[VDEV_RAIDZ_P].rc_abd;
+ qdata = rr->rr_col[VDEV_RAIDZ_Q].rc_abd;
+ xsize = rr->rr_col[x].rc_size;
+ ysize = rr->rr_col[y].rc_size;
+
+ rr->rr_col[VDEV_RAIDZ_P].rc_abd =
+ abd_alloc_linear(rr->rr_col[VDEV_RAIDZ_P].rc_size, B_TRUE);
+ rr->rr_col[VDEV_RAIDZ_Q].rc_abd =
+ abd_alloc_linear(rr->rr_col[VDEV_RAIDZ_Q].rc_size, B_TRUE);
+ rr->rr_col[x].rc_size = 0;
+ rr->rr_col[y].rc_size = 0;
+
+ vdev_raidz_generate_parity_pq(rr);
+
+ rr->rr_col[x].rc_size = xsize;
+ rr->rr_col[y].rc_size = ysize;
+
+ p = abd_to_buf(pdata);
+ q = abd_to_buf(qdata);
+ pxy = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
+ qxy = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
+ xd = rr->rr_col[x].rc_abd;
+ yd = rr->rr_col[y].rc_abd;
+
+ /*
+ * We now have:
+ * Pxy = P + D_x + D_y
+ * Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
+ *
+ * We can then solve for D_x:
+ * D_x = A * (P + Pxy) + B * (Q + Qxy)
+ * where
+ * A = 2^(x - y) * (2^(x - y) + 1)^-1
+ * B = 2^-(ndevs - 1 - x) * (2^(x - y) + 1)^-1
+ *
+ * With D_x in hand, we can easily solve for D_y:
+ * D_y = P + Pxy + D_x
+ */
+
+ a = vdev_raidz_pow2[255 + x - y];
+ b = vdev_raidz_pow2[255 - (rr->rr_cols - 1 - x)];
+ tmp = 255 - vdev_raidz_log2[a ^ 1];
+
+ aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
+ bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];
+
+ ASSERT3U(xsize, >=, ysize);
+ struct reconst_pq_struct rpq = { p, q, pxy, qxy, aexp, bexp };
+
+ (void) abd_iterate_func2(xd, yd, 0, 0, ysize,
+ vdev_raidz_reconst_pq_func, &rpq);
+ (void) abd_iterate_func(xd, ysize, xsize - ysize,
+ vdev_raidz_reconst_pq_tail_func, &rpq);
+
+ abd_free(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
+ abd_free(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
+
+ /*
+ * Restore the saved parity data.
+ */
+ rr->rr_col[VDEV_RAIDZ_P].rc_abd = pdata;
+ rr->rr_col[VDEV_RAIDZ_Q].rc_abd = qdata;
+
+ return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
+}
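+
+/*
+ * Illustrative sketch only -- the helper below is not called by the
+ * reconstruction paths and its name is invented for this example. It
+ * shows the GF(2^8) multiplication underlying vdev_raidz_exp2() and the
+ * aexp/bexp coefficients derived in vdev_raidz_reconstruct_pq(): multiply
+ * by adding base-2 logarithms modulo 255, with zero as a special case.
+ */
+static inline uint8_t
+vdev_raidz_gf_mul_sketch(uint8_t a, uint8_t b)
+{
+	if (a == 0 || b == 0)
+		return (0);
+
+	/* log2(a) + log2(b), reduced mod 255, mapped back through pow2 */
+	int l = vdev_raidz_log2[a] + vdev_raidz_log2[b];
+	if (l >= 255)
+		l -= 255;
+
+	return (vdev_raidz_pow2[l]);
+}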
+
+/* BEGIN CSTYLED */
+/*
+ * In the general case of reconstruction, we must solve the system of linear
+ * equations defined by the coefficients used to generate parity as well as
+ * the contents of the data and parity disks. This can be expressed with
+ * vectors for the original data (D) and the actual data (d) and parity (p)
+ * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
+ *
+ * __ __ __ __
+ * | | __ __ | p_0 |
+ * | V | | D_0 | | p_m-1 |
+ * | | x | : | = | d_0 |
+ * | I | | D_n-1 | | : |
+ * | | ~~ ~~ | d_n-1 |
+ * ~~ ~~ ~~ ~~
+ *
+ * I is simply a square identity matrix of size n, and V is a Vandermonde
+ * matrix defined by the coefficients we chose for the various parity columns
+ * (1, 2, 4). Note that these values were chosen for simplicity, speed of
+ * computation, and linear separability.
+ *
+ * __ __ __ __
+ * | 1 .. 1 1 1 | | p_0 |
+ * | 2^n-1 .. 4 2 1 | __ __ | : |
+ * | 4^n-1 .. 16 4 1 | | D_0 | | p_m-1 |
+ * | 1 .. 0 0 0 | | D_1 | | d_0 |
+ * | 0 .. 0 0 0 | x | D_2 | = | d_1 |
+ * | : : : : | | : | | d_2 |
+ * | 0 .. 1 0 0 | | D_n-1 | | : |
+ * | 0 .. 0 1 0 | ~~ ~~ | : |
+ * | 0 .. 0 0 1 | | d_n-1 |
+ * ~~ ~~ ~~ ~~
+ *
+ * Note that I, V, d, and p are known. To compute D, we must invert the
+ * matrix and use the known data and parity values to reconstruct the unknown
+ * data values. We begin by removing the rows in V|I and d|p that correspond
+ * to failed or missing columns; we then make V|I square (n x n) and d|p
+ * sized n by removing rows corresponding to unused parity from the bottom up
+ * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
+ * using Gauss-Jordan elimination. In the example below we use m=3 parity
+ * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
+ * __ __
+ * | 1 1 1 1 1 1 1 1 |
+ * | 128 64 32 16 8 4 2 1 | <-----+-+-- missing disks
+ * | 19 205 116 29 64 16 4 1 | / /
+ * | 1 0 0 0 0 0 0 0 | / /
+ * | 0 1 0 0 0 0 0 0 | <--' /
+ * (V|I) = | 0 0 1 0 0 0 0 0 | <---'
+ * | 0 0 0 1 0 0 0 0 |
+ * | 0 0 0 0 1 0 0 0 |
+ * | 0 0 0 0 0 1 0 0 |
+ * | 0 0 0 0 0 0 1 0 |
+ * | 0 0 0 0 0 0 0 1 |
+ * ~~ ~~
+ * __ __
+ * | 1 1 1 1 1 1 1 1 |
+ * | 19 205 116 29 64 16 4 1 |
+ * | 1 0 0 0 0 0 0 0 |
+ * (V|I)' = | 0 0 0 1 0 0 0 0 |
+ * | 0 0 0 0 1 0 0 0 |
+ * | 0 0 0 0 0 1 0 0 |
+ * | 0 0 0 0 0 0 1 0 |
+ * | 0 0 0 0 0 0 0 1 |
+ * ~~ ~~
+ *
+ * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
+ * have carefully chosen the seed values 1, 2, and 4 to ensure that this
+ * matrix is not singular.
+ * __ __
+ * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 |
+ * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 |
+ * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
+ * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
+ * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
+ * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
+ * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
+ * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
+ * ~~ ~~
+ * __ __
+ * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
+ * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 |
+ * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 |
+ * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
+ * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
+ * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
+ * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
+ * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
+ * ~~ ~~
+ * __ __
+ * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
+ * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
+ * | 0 205 116 0 0 0 0 0 0 1 19 29 64 16 4 1 |
+ * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
+ * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
+ * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
+ * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
+ * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
+ * ~~ ~~
+ * __ __
+ * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
+ * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
+ * | 0 0 185 0 0 0 0 0 205 1 222 208 141 221 201 204 |
+ * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
+ * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
+ * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
+ * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
+ * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
+ * ~~ ~~
+ * __ __
+ * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
+ * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
+ * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 |
+ * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
+ * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
+ * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
+ * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
+ * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
+ * ~~ ~~
+ * __ __
+ * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
+ * | 0 1 0 0 0 0 0 0 167 100 5 41 159 169 217 208 |
+ * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 |
+ * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
+ * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
+ * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
+ * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
+ * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
+ * ~~ ~~
+ * __ __
+ * | 0 0 1 0 0 0 0 0 |
+ * | 167 100 5 41 159 169 217 208 |
+ * | 166 100 4 40 158 168 216 209 |
+ * (V|I)'^-1 = | 0 0 0 1 0 0 0 0 |
+ * | 0 0 0 0 1 0 0 0 |
+ * | 0 0 0 0 0 1 0 0 |
+ * | 0 0 0 0 0 0 1 0 |
+ * | 0 0 0 0 0 0 0 1 |
+ * ~~ ~~
+ *
+ * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
+ * of the missing data.
+ *
+ * As is apparent from the example above, the only non-trivial rows in the
+ * inverse matrix correspond to the data disks that we're trying to
+ * reconstruct. Indeed, those are the only rows we need as the others would
+ * only be useful for reconstructing data known or assumed to be valid. For
+ * that reason, we only build the coefficients in the rows that correspond to
+ * targeted columns.
+ */
+/* END CSTYLED */
+
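+/*
+ * Illustrative sketch only (the function name is invented for this
+ * example): build one row of the dispersal matrix the same way
+ * vdev_raidz_matrix_init() below does, i.e. row m holds the coefficients
+ * 2^(m*(n-1)), ..., 2^(m*1), 2^(m*0), read out of the exp table by
+ * stepping the exponent down by m (mod 255). With m = 2 and n = 8 this
+ * reproduces the "19 205 116 29 64 16 4 1" row from the example above.
+ */
+static inline void
+vdev_raidz_vandermonde_row_sketch(int m, int n, uint8_t *row)
+{
+	int pow = (m * n) % 255;
+
+	for (int j = 0; j < n; j++) {
+		pow -= m;
+		if (pow < 0)
+			pow += 255;
+		row[j] = vdev_raidz_pow2[pow];
+	}
+}
+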
+static void
+vdev_raidz_matrix_init(raidz_row_t *rr, int n, int nmap, int *map,
+ uint8_t **rows)
+{
+ int i, j;
+ int pow;
+
+ ASSERT(n == rr->rr_cols - rr->rr_firstdatacol);
+
+ /*
+ * Fill in the missing rows of interest.
+ */
+ for (i = 0; i < nmap; i++) {
+ ASSERT3S(0, <=, map[i]);
+ ASSERT3S(map[i], <=, 2);
+
+ pow = map[i] * n;
+ if (pow > 255)
+ pow -= 255;
+ ASSERT(pow <= 255);
+
+ for (j = 0; j < n; j++) {
+ pow -= map[i];
+ if (pow < 0)
+ pow += 255;
+ rows[i][j] = vdev_raidz_pow2[pow];
+ }
+ }
+}
+
+static void
+vdev_raidz_matrix_invert(raidz_row_t *rr, int n, int nmissing, int *missing,
+ uint8_t **rows, uint8_t **invrows, const uint8_t *used)
+{
+ int i, j, ii, jj;
+ uint8_t log;
+
+ /*
+ * Assert that the first nmissing entries from the array of used
+ * columns correspond to parity columns and that subsequent entries
+ * correspond to data columns.
+ */
+ for (i = 0; i < nmissing; i++) {
+ ASSERT3S(used[i], <, rr->rr_firstdatacol);
+ }
+ for (; i < n; i++) {
+ ASSERT3S(used[i], >=, rr->rr_firstdatacol);
+ }
+
+ /*
+ * First initialize the storage where we'll compute the inverse rows.
+ */
+ for (i = 0; i < nmissing; i++) {
+ for (j = 0; j < n; j++) {
+ invrows[i][j] = (i == j) ? 1 : 0;
+ }
+ }
+
+ /*
+ * Subtract all trivial rows from the rows of consequence.
+ */
+ for (i = 0; i < nmissing; i++) {
+ for (j = nmissing; j < n; j++) {
+ ASSERT3U(used[j], >=, rr->rr_firstdatacol);
+ jj = used[j] - rr->rr_firstdatacol;
+ ASSERT3S(jj, <, n);
+ invrows[i][j] = rows[i][jj];
+ rows[i][jj] = 0;
+ }
+ }
+
+ /*
+ * For each of the rows of interest, we must normalize it and subtract
+ * a multiple of it from the other rows.
+ */
+ for (i = 0; i < nmissing; i++) {
+ for (j = 0; j < missing[i]; j++) {
+ ASSERT0(rows[i][j]);
+ }
+ ASSERT3U(rows[i][missing[i]], !=, 0);
+
+ /*
+ * Compute the inverse of the first element and multiply each
+ * element in the row by that value.
+ */
+ log = 255 - vdev_raidz_log2[rows[i][missing[i]]];
+
+ for (j = 0; j < n; j++) {
+ rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
+ invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
+ }
+
+ for (ii = 0; ii < nmissing; ii++) {
+ if (i == ii)
+ continue;
+
+ ASSERT3U(rows[ii][missing[i]], !=, 0);
+
+ log = vdev_raidz_log2[rows[ii][missing[i]]];
+
+ for (j = 0; j < n; j++) {
+ rows[ii][j] ^=
+ vdev_raidz_exp2(rows[i][j], log);
+ invrows[ii][j] ^=
+ vdev_raidz_exp2(invrows[i][j], log);
+ }
+ }
+ }
+
+ /*
+ * Verify that the data left in the rows forms part of an identity
+ * matrix.
+ */
+ for (i = 0; i < nmissing; i++) {
+ for (j = 0; j < n; j++) {
+ if (j == missing[i]) {
+ ASSERT3U(rows[i][j], ==, 1);
+ } else {
+ ASSERT0(rows[i][j]);
+ }
+ }
+ }
+}
+
+static void
+vdev_raidz_matrix_reconstruct(raidz_row_t *rr, int n, int nmissing,
+ int *missing, uint8_t **invrows, const uint8_t *used)
+{
+ int i, j, x, cc, c;
+ uint8_t *src;
+ uint64_t ccount;
+ uint8_t *dst[VDEV_RAIDZ_MAXPARITY] = { NULL };
+ uint64_t dcount[VDEV_RAIDZ_MAXPARITY] = { 0 };
+ uint8_t log = 0;
+ uint8_t val;
+ int ll;
+ uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
+ uint8_t *p, *pp;
+ size_t psize;
+
+ psize = sizeof (invlog[0][0]) * n * nmissing;
+ p = kmem_alloc(psize, KM_SLEEP);
+
+ for (pp = p, i = 0; i < nmissing; i++) {
+ invlog[i] = pp;
+ pp += n;
+ }
+
+ for (i = 0; i < nmissing; i++) {
+ for (j = 0; j < n; j++) {
+ ASSERT3U(invrows[i][j], !=, 0);
+ invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
+ }
+ }
+
+ for (i = 0; i < n; i++) {
+ c = used[i];
+ ASSERT3U(c, <, rr->rr_cols);
+
+ ccount = rr->rr_col[c].rc_size;
+ ASSERT(ccount >= rr->rr_col[missing[0]].rc_size || i > 0);
+ if (ccount == 0)
+ continue;
+ src = abd_to_buf(rr->rr_col[c].rc_abd);
+ for (j = 0; j < nmissing; j++) {
+ cc = missing[j] + rr->rr_firstdatacol;
+ ASSERT3U(cc, >=, rr->rr_firstdatacol);
+ ASSERT3U(cc, <, rr->rr_cols);
+ ASSERT3U(cc, !=, c);
+
+ dcount[j] = rr->rr_col[cc].rc_size;
+ if (dcount[j] != 0)
+ dst[j] = abd_to_buf(rr->rr_col[cc].rc_abd);
+ }
+
+ for (x = 0; x < ccount; x++, src++) {
+ if (*src != 0)
+ log = vdev_raidz_log2[*src];
+
+ for (cc = 0; cc < nmissing; cc++) {
+ if (x >= dcount[cc])
+ continue;
+
+ if (*src == 0) {
+ val = 0;
+ } else {
+ if ((ll = log + invlog[cc][i]) >= 255)
+ ll -= 255;
+ val = vdev_raidz_pow2[ll];
+ }
+
+ if (i == 0)
+ dst[cc][x] = val;
+ else
+ dst[cc][x] ^= val;
+ }
+ }
+ }
+
+ kmem_free(p, psize);
+}
+
+static int
+vdev_raidz_reconstruct_general(raidz_row_t *rr, int *tgts, int ntgts)
+{
+ int n, i, c, t, tt;
+ int nmissing_rows;
+ int missing_rows[VDEV_RAIDZ_MAXPARITY];
+ int parity_map[VDEV_RAIDZ_MAXPARITY];
+ uint8_t *p, *pp;
+ size_t psize;
+ uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
+ uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
+ uint8_t *used;
+
+ abd_t **bufs = NULL;
+
+ int code = 0;
+
+ /*
+ * Matrix reconstruction can't use scatter ABDs yet, so we allocate
+ * temporary linear ABDs if any non-linear ABDs are found.
+ */
+ for (i = rr->rr_firstdatacol; i < rr->rr_cols; i++) {
+ if (!abd_is_linear(rr->rr_col[i].rc_abd)) {
+ bufs = kmem_alloc(rr->rr_cols * sizeof (abd_t *),
+ KM_PUSHPAGE);
+
+ for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
+ raidz_col_t *col = &rr->rr_col[c];
+
+ bufs[c] = col->rc_abd;
+ if (bufs[c] != NULL) {
+ col->rc_abd = abd_alloc_linear(
+ col->rc_size, B_TRUE);
+ abd_copy(col->rc_abd, bufs[c],
+ col->rc_size);
+ }
+ }
+
+ break;
+ }
+ }
+
+ n = rr->rr_cols - rr->rr_firstdatacol;
+
+ /*
+ * Figure out which data columns are missing.
+ */
+ nmissing_rows = 0;
+ for (t = 0; t < ntgts; t++) {
+ if (tgts[t] >= rr->rr_firstdatacol) {
+ missing_rows[nmissing_rows++] =
+ tgts[t] - rr->rr_firstdatacol;
+ }
+ }
+
+ /*
+ * Figure out which parity columns to use to help generate the missing
+ * data columns.
+ */
+ for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
+ ASSERT(tt < ntgts);
+ ASSERT(c < rr->rr_firstdatacol);
+
+ /*
+ * Skip any targeted parity columns.
+ */
+ if (c == tgts[tt]) {
+ tt++;
+ continue;
+ }
+
+ code |= 1 << c;
+
+ parity_map[i] = c;
+ i++;
+ }
+
+ ASSERT(code != 0);
+ ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);
+
+ psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
+ nmissing_rows * n + sizeof (used[0]) * n;
+ p = kmem_alloc(psize, KM_SLEEP);
+
+ for (pp = p, i = 0; i < nmissing_rows; i++) {
+ rows[i] = pp;
+ pp += n;
+ invrows[i] = pp;
+ pp += n;
+ }
+ used = pp;
+
+ for (i = 0; i < nmissing_rows; i++) {
+ used[i] = parity_map[i];
+ }
+
+ for (tt = 0, c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
+ if (tt < nmissing_rows &&
+ c == missing_rows[tt] + rr->rr_firstdatacol) {
+ tt++;
+ continue;
+ }
+
+ ASSERT3S(i, <, n);
+ used[i] = c;
+ i++;
+ }
+
+ /*
+ * Initialize the interesting rows of the matrix.
+ */
+ vdev_raidz_matrix_init(rr, n, nmissing_rows, parity_map, rows);
+
+ /*
+ * Invert the matrix.
+ */
+ vdev_raidz_matrix_invert(rr, n, nmissing_rows, missing_rows, rows,
+ invrows, used);
+
+ /*
+ * Reconstruct the missing data using the generated matrix.
+ */
+ vdev_raidz_matrix_reconstruct(rr, n, nmissing_rows, missing_rows,
+ invrows, used);
+
+ kmem_free(p, psize);
+
+ /*
+ * copy back from temporary linear abds and free them
+ */
+ if (bufs) {
+ for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
+ raidz_col_t *col = &rr->rr_col[c];
+
+ if (bufs[c] != NULL) {
+ abd_copy(bufs[c], col->rc_abd, col->rc_size);
+ abd_free(col->rc_abd);
+ }
+ col->rc_abd = bufs[c];
+ }
+ kmem_free(bufs, rr->rr_cols * sizeof (abd_t *));
+ }
+
+ return (code);
+}
+
+static int
+vdev_raidz_reconstruct_row(raidz_map_t *rm, raidz_row_t *rr,
+ const int *t, int nt)
+{
+ int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
+ int ntgts;
+ int i, c, ret;
+ int code;
+ int nbadparity, nbaddata;
+ int parity_valid[VDEV_RAIDZ_MAXPARITY];
+
+ nbadparity = rr->rr_firstdatacol;
+ nbaddata = rr->rr_cols - nbadparity;
+ ntgts = 0;
+ for (i = 0, c = 0; c < rr->rr_cols; c++) {
+ if (c < rr->rr_firstdatacol)
+ parity_valid[c] = B_FALSE;
+
+ if (i < nt && c == t[i]) {
+ tgts[ntgts++] = c;
+ i++;
+ } else if (rr->rr_col[c].rc_error != 0) {
+ tgts[ntgts++] = c;
+ } else if (c >= rr->rr_firstdatacol) {
+ nbaddata--;
+ } else {
+ parity_valid[c] = B_TRUE;
+ nbadparity--;
+ }
+ }
+
+ ASSERT(ntgts >= nt);
+ ASSERT(nbaddata >= 0);
+ ASSERT(nbaddata + nbadparity == ntgts);
+
+ dt = &tgts[nbadparity];
+
+ /* Reconstruct using the new math implementation */
+ ret = vdev_raidz_math_reconstruct(rm, rr, parity_valid, dt, nbaddata);
+ if (ret != RAIDZ_ORIGINAL_IMPL)
+ return (ret);
+
+ /*
+ * See if we can use any of our optimized reconstruction routines.
+ */
+ switch (nbaddata) {
+ case 1:
+ if (parity_valid[VDEV_RAIDZ_P])
+ return (vdev_raidz_reconstruct_p(rr, dt, 1));
+
+ ASSERT(rr->rr_firstdatacol > 1);
+
+ if (parity_valid[VDEV_RAIDZ_Q])
+ return (vdev_raidz_reconstruct_q(rr, dt, 1));
+
+ ASSERT(rr->rr_firstdatacol > 2);
+ break;
+
+ case 2:
+ ASSERT(rr->rr_firstdatacol > 1);
+
+ if (parity_valid[VDEV_RAIDZ_P] &&
+ parity_valid[VDEV_RAIDZ_Q])
+ return (vdev_raidz_reconstruct_pq(rr, dt, 2));
+
+ ASSERT(rr->rr_firstdatacol > 2);
+
+ break;
+ }
+
+ code = vdev_raidz_reconstruct_general(rr, tgts, ntgts);
+ ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
+ ASSERT(code > 0);
+ return (code);
+}
+
+static int
+vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
+ uint64_t *logical_ashift, uint64_t *physical_ashift)
+{
+ vdev_raidz_t *vdrz = vd->vdev_tsd;
+ uint64_t nparity = vdrz->vd_nparity;
+ int c;
+ int lasterror = 0;
+ int numerrors = 0;
+
+ ASSERT(nparity > 0);
+
+ if (nparity > VDEV_RAIDZ_MAXPARITY ||
+ vd->vdev_children < nparity + 1) {
+ vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
+ return (SET_ERROR(EINVAL));
+ }
+
+ vdev_open_children(vd);
+
+ for (c = 0; c < vd->vdev_children; c++) {
+ vdev_t *cvd = vd->vdev_child[c];
+
+ if (cvd->vdev_open_error != 0) {
+ lasterror = cvd->vdev_open_error;
+ numerrors++;
+ continue;
+ }
+
+ *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
+ *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
+ *logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift);
+ *physical_ashift = MAX(*physical_ashift,
+ cvd->vdev_physical_ashift);
+ }
+
+ *asize *= vd->vdev_children;
+ *max_asize *= vd->vdev_children;
+
+ if (numerrors > nparity) {
+ vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
+ return (lasterror);
+ }
+
+ return (0);
+}
+
+static void
+vdev_raidz_close(vdev_t *vd)
+{
+ for (int c = 0; c < vd->vdev_children; c++) {
+ if (vd->vdev_child[c] != NULL)
+ vdev_close(vd->vdev_child[c]);
+ }
+}
+
+static uint64_t
+vdev_raidz_asize(vdev_t *vd, uint64_t psize)
+{
+ vdev_raidz_t *vdrz = vd->vdev_tsd;
+ uint64_t asize;
+ uint64_t ashift = vd->vdev_top->vdev_ashift;
+ uint64_t cols = vdrz->vd_logical_width;
+ uint64_t nparity = vdrz->vd_nparity;
+
+ asize = ((psize - 1) >> ashift) + 1;
+ asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
+ asize = roundup(asize, nparity + 1) << ashift;
+
+ return (asize);
+}
+
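+/*
+ * Worked example (standalone sketch, not used by the vdev ops): the
+ * arithmetic of vdev_raidz_asize() above for a raidz2 vdev with 6
+ * children and ashift = 12 (4 KiB sectors). A 32 KiB psize occupies 8
+ * data sectors, parity adds 2 * ceil(8 / 4) = 4 sectors, and the total
+ * of 12 is already a multiple of nparity + 1, so the result is 48 KiB.
+ */
+static inline uint64_t
+vdev_raidz_asize_sketch(void)
+{
+	const uint64_t ashift = 12, cols = 6, nparity = 2;
+	const uint64_t psize = 32 << 10;
+
+	uint64_t asize = ((psize - 1) >> ashift) + 1;		/* 8 */
+	asize += nparity *
+	    ((asize + cols - nparity - 1) / (cols - nparity));	/* 12 */
+	asize = roundup(asize, nparity + 1) << ashift;		/* 48 KiB */
+
+	return (asize);
+}
+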
+/*
+ * The allocatable space for a raidz vdev is N * sizeof(smallest child)
+ * so each child must provide at least 1/Nth of its asize.
+ */
+static uint64_t
+vdev_raidz_min_asize(vdev_t *vd)
+{
+ return ((vd->vdev_min_asize + vd->vdev_children - 1) /
+ vd->vdev_children);
+}
+
+void
+vdev_raidz_child_done(zio_t *zio)
+{
+ raidz_col_t *rc = zio->io_private;
+
+ rc->rc_error = zio->io_error;
+ rc->rc_tried = 1;
+ rc->rc_skipped = 0;
+}
+
+static void
+vdev_raidz_io_verify(vdev_t *vd, raidz_row_t *rr, int col)
+{
+#ifdef ZFS_DEBUG
+ vdev_t *tvd = vd->vdev_top;
+
+ range_seg64_t logical_rs, physical_rs, remain_rs;
+ logical_rs.rs_start = rr->rr_offset;
+ logical_rs.rs_end = logical_rs.rs_start +
+ vdev_raidz_asize(vd, rr->rr_size);
+
+ raidz_col_t *rc = &rr->rr_col[col];
+ vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
+
+ vdev_xlate(cvd, &logical_rs, &physical_rs, &remain_rs);
+ ASSERT(vdev_xlate_is_empty(&remain_rs));
+ ASSERT3U(rc->rc_offset, ==, physical_rs.rs_start);
+ ASSERT3U(rc->rc_offset, <, physical_rs.rs_end);
+ /*
+ * It would be nice to assert that rs_end is equal
+ * to rc_offset + rc_size but there might be an
+ * optional I/O at the end that is not accounted for
+ * in rc_size.
+ */
+ if (physical_rs.rs_end > rc->rc_offset + rc->rc_size) {
+ ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset +
+ rc->rc_size + (1 << tvd->vdev_ashift));
+ } else {
+ ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + rc->rc_size);
+ }
+#endif
+}
+
+static void
+vdev_raidz_io_start_write(zio_t *zio, raidz_row_t *rr, uint64_t ashift)
+{
+ vdev_t *vd = zio->io_vd;
+ raidz_map_t *rm = zio->io_vsd;
+ int c, i;
+
+ vdev_raidz_generate_parity_row(rm, rr);
+
+ for (int c = 0; c < rr->rr_cols; c++) {
+ raidz_col_t *rc = &rr->rr_col[c];
+ if (rc->rc_size == 0)
+ continue;
+
+ /* Verify physical to logical translation */
+ vdev_raidz_io_verify(vd, rr, c);
+
+ zio_nowait(zio_vdev_child_io(zio, NULL,
+ vd->vdev_child[rc->rc_devidx], rc->rc_offset,
+ rc->rc_abd, rc->rc_size, zio->io_type, zio->io_priority,
+ 0, vdev_raidz_child_done, rc));
+ }
+
+ /*
+ * Generate optional I/Os for skip sectors to improve aggregation
+ * contiguity.
+ */
+ for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) {
+ ASSERT(c <= rr->rr_scols);
+ if (c == rr->rr_scols)
+ c = 0;
+
+ raidz_col_t *rc = &rr->rr_col[c];
+ vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
+
+ zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
+ rc->rc_offset + rc->rc_size, NULL, 1ULL << ashift,
+ zio->io_type, zio->io_priority,
+ ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
+ }
+}
+
+static void
+vdev_raidz_io_start_read(zio_t *zio, raidz_row_t *rr)
+{
+ vdev_t *vd = zio->io_vd;
+
+ /*
+ * Iterate over the columns in reverse order so that we hit the parity
+ * last -- any errors along the way will force us to read the parity.
+ */
+ for (int c = rr->rr_cols - 1; c >= 0; c--) {
+ raidz_col_t *rc = &rr->rr_col[c];
+ if (rc->rc_size == 0)
+ continue;
+ vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
+ if (!vdev_readable(cvd)) {
+ if (c >= rr->rr_firstdatacol)
+ rr->rr_missingdata++;
+ else
+ rr->rr_missingparity++;
+ rc->rc_error = SET_ERROR(ENXIO);
+ rc->rc_tried = 1; /* don't even try */
+ rc->rc_skipped = 1;
+ continue;
+ }
+ if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
+ if (c >= rr->rr_firstdatacol)
+ rr->rr_missingdata++;
+ else
+ rr->rr_missingparity++;
+ rc->rc_error = SET_ERROR(ESTALE);
+ rc->rc_skipped = 1;
+ continue;
+ }
+ if (c >= rr->rr_firstdatacol || rr->rr_missingdata > 0 ||
+ (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
+ zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
+ rc->rc_offset, rc->rc_abd, rc->rc_size,
+ zio->io_type, zio->io_priority, 0,
+ vdev_raidz_child_done, rc));
+ }
+ }
+}
+
+/*
+ * Start an IO operation on a RAIDZ VDev
+ *
+ * Outline:
+ * - For write operations:
+ * 1. Generate the parity data
+ * 2. Create child zio write operations to each column's vdev, for both
+ * data and parity.
+ * 3. If the column skips any sectors for padding, create optional dummy
+ * write zio children for those areas to improve aggregation continuity.
+ * - For read operations:
+ * 1. Create child zio read operations to each data column's vdev to read
+ * the range of data required for zio.
+ * 2. If this is a scrub or resilver operation, or if any of the data
+ * vdevs have had errors, then create zio read operations to the parity
+ * columns' VDevs as well.
+ */
+static void
+vdev_raidz_io_start(zio_t *zio)
+{
+ vdev_t *vd = zio->io_vd;
+ vdev_t *tvd = vd->vdev_top;
+ vdev_raidz_t *vdrz = vd->vdev_tsd;
+ raidz_map_t *rm;
+
+ rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift,
+ vdrz->vd_logical_width, vdrz->vd_nparity);
+
+ /*
+ * Until raidz expansion is implemented all maps for a raidz vdev
+ * contain a single row.
+ */
+ ASSERT3U(rm->rm_nrows, ==, 1);
+ raidz_row_t *rr = rm->rm_row[0];
+
+ zio->io_vsd = rm;
+ zio->io_vsd_ops = &vdev_raidz_vsd_ops;
+
+ if (zio->io_type == ZIO_TYPE_WRITE) {
+ vdev_raidz_io_start_write(zio, rr, tvd->vdev_ashift);
+ } else {
+ ASSERT(zio->io_type == ZIO_TYPE_READ);
+ vdev_raidz_io_start_read(zio, rr);
+ }
+
+ zio_execute(zio);
+}
+
+/*
+ * Report a checksum error for a child of a RAID-Z device.
+ */
+static void
+raidz_checksum_error(zio_t *zio, raidz_col_t *rc, abd_t *bad_data)
+{
+ vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
+
+ if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE) &&
+ zio->io_priority != ZIO_PRIORITY_REBUILD) {
+ zio_bad_cksum_t zbc;
+ raidz_map_t *rm = zio->io_vsd;
+
+ zbc.zbc_has_cksum = 0;
+ zbc.zbc_injected = rm->rm_ecksuminjected;
+
+ int ret = zfs_ereport_post_checksum(zio->io_spa, vd,
+ &zio->io_bookmark, zio, rc->rc_offset, rc->rc_size,
+ rc->rc_abd, bad_data, &zbc);
+ if (ret != EALREADY) {
+ mutex_enter(&vd->vdev_stat_lock);
+ vd->vdev_stat.vs_checksum_errors++;
+ mutex_exit(&vd->vdev_stat_lock);
+ }
+ }
+}
+
+/*
+ * We keep track of whether or not there were any injected errors, so that
+ * any ereports we generate can note it.
+ */
+static int
+raidz_checksum_verify(zio_t *zio)
+{
+ zio_bad_cksum_t zbc;
+ raidz_map_t *rm = zio->io_vsd;
+
+ bzero(&zbc, sizeof (zio_bad_cksum_t));
+
+ int ret = zio_checksum_error(zio, &zbc);
+ if (ret != 0 && zbc.zbc_injected != 0)
+ rm->rm_ecksuminjected = 1;
+
+ return (ret);
+}
+
+/*
+ * Generate the parity from the data columns. If we tried and were able to
+ * read the parity without error, verify that the generated parity matches the
+ * data we read. If it doesn't, we fire off a checksum error. Return the
+ * number of such failures.
+ */
+static int
+raidz_parity_verify(zio_t *zio, raidz_row_t *rr)
+{
+ abd_t *orig[VDEV_RAIDZ_MAXPARITY];
+ int c, ret = 0;
+ raidz_map_t *rm = zio->io_vsd;
+ raidz_col_t *rc;
+
+ blkptr_t *bp = zio->io_bp;
+ enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum :
+ (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));
+
+ if (checksum == ZIO_CHECKSUM_NOPARITY)
+ return (ret);
+
+ for (c = 0; c < rr->rr_firstdatacol; c++) {
+ rc = &rr->rr_col[c];
+ if (!rc->rc_tried || rc->rc_error != 0)
+ continue;
+
+ orig[c] = abd_alloc_sametype(rc->rc_abd, rc->rc_size);
+ abd_copy(orig[c], rc->rc_abd, rc->rc_size);
+ }
+
+ /*
+ * Regenerates parity even for !tried||rc_error!=0 columns. This
+ * isn't harmful but it does have the side effect of fixing stuff
+ * we didn't realize was necessary (i.e. even if we return 0).
+ */
+ vdev_raidz_generate_parity_row(rm, rr);
+
+ for (c = 0; c < rr->rr_firstdatacol; c++) {
+ rc = &rr->rr_col[c];
+
+ if (!rc->rc_tried || rc->rc_error != 0)
+ continue;
+
+ if (abd_cmp(orig[c], rc->rc_abd) != 0) {
+ raidz_checksum_error(zio, rc, orig[c]);
+ rc->rc_error = SET_ERROR(ECKSUM);
+ ret++;
+ }
+ abd_free(orig[c]);
+ }
+
+ return (ret);
+}
+
+static int
+vdev_raidz_worst_error(raidz_row_t *rr)
+{
+ int error = 0;
+
+ for (int c = 0; c < rr->rr_cols; c++)
+ error = zio_worst_error(error, rr->rr_col[c].rc_error);
+
+ return (error);
+}
+
+static void
+vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr)
+{
+ int unexpected_errors = 0;
+ int parity_errors = 0;
+ int parity_untried = 0;
+ int data_errors = 0;
+
+ ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
+
+ for (int c = 0; c < rr->rr_cols; c++) {
+ raidz_col_t *rc = &rr->rr_col[c];
+
+ if (rc->rc_error) {
+ if (c < rr->rr_firstdatacol)
+ parity_errors++;
+ else
+ data_errors++;
+
+ if (!rc->rc_skipped)
+ unexpected_errors++;
+ } else if (c < rr->rr_firstdatacol && !rc->rc_tried) {
+ parity_untried++;
+ }
+ }
+
+ /*
+ * If we read more parity disks than were used for
+ * reconstruction, confirm that the other parity disks produced
+ * correct data.
+ *
+ * Note that we also regenerate parity when resilvering so we
+ * can write it out to failed devices later.
+ */
+ if (parity_errors + parity_untried <
+ rr->rr_firstdatacol - data_errors ||
+ (zio->io_flags & ZIO_FLAG_RESILVER)) {
+ int n = raidz_parity_verify(zio, rr);
+ unexpected_errors += n;
+ ASSERT3U(parity_errors + n, <=, rr->rr_firstdatacol);
+ }
+
+ if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
+ (unexpected_errors > 0 || (zio->io_flags & ZIO_FLAG_RESILVER))) {
+ /*
+ * Use the good data we have in hand to repair damaged children.
+ */
+ for (int c = 0; c < rr->rr_cols; c++) {
+ raidz_col_t *rc = &rr->rr_col[c];
+ vdev_t *vd = zio->io_vd;
+ vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
+
+ if ((rc->rc_error == 0 || rc->rc_size == 0) &&
+ (rc->rc_repair == 0)) {
+ continue;
+ }
+
+ zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
+ rc->rc_offset, rc->rc_abd, rc->rc_size,
+ ZIO_TYPE_WRITE,
+ zio->io_priority == ZIO_PRIORITY_REBUILD ?
+ ZIO_PRIORITY_REBUILD : ZIO_PRIORITY_ASYNC_WRITE,
+ ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
+ ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
+ }
+ }
+}
+
+static void
+raidz_restore_orig_data(raidz_map_t *rm)
+{
+ for (int i = 0; i < rm->rm_nrows; i++) {
+ raidz_row_t *rr = rm->rm_row[i];
+ for (int c = 0; c < rr->rr_cols; c++) {
+ raidz_col_t *rc = &rr->rr_col[c];
+ if (rc->rc_need_orig_restore) {
+ abd_copy_from_buf(rc->rc_abd,
+ rc->rc_orig_data, rc->rc_size);
+ rc->rc_need_orig_restore = B_FALSE;
+ }
+ }
+ }
+}
+
+/*
+ * returns EINVAL if reconstruction of the block will not be possible
+ * returns ECKSUM if this specific reconstruction failed
+ * returns 0 on successful reconstruction
+ */
+static int
+raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity)
+{
+ raidz_map_t *rm = zio->io_vsd;
+
+ /* Reconstruct each row */
+ for (int r = 0; r < rm->rm_nrows; r++) {
+ raidz_row_t *rr = rm->rm_row[r];
+ int my_tgts[VDEV_RAIDZ_MAXPARITY]; /* value is child id */
+ int t = 0;
+ int dead = 0;
+ int dead_data = 0;
+
+ for (int c = 0; c < rr->rr_cols; c++) {
+ raidz_col_t *rc = &rr->rr_col[c];
+ ASSERT0(rc->rc_need_orig_restore);
+ if (rc->rc_error != 0) {
+ dead++;
+ if (c >= nparity)
+ dead_data++;
+ continue;
+ }
+ if (rc->rc_size == 0)
+ continue;
+ for (int lt = 0; lt < ntgts; lt++) {
+ if (rc->rc_devidx == ltgts[lt]) {
+ if (rc->rc_orig_data == NULL) {
+ rc->rc_orig_data =
+ zio_buf_alloc(rc->rc_size);
+ abd_copy_to_buf(
+ rc->rc_orig_data,
+ rc->rc_abd, rc->rc_size);
+ }
+ rc->rc_need_orig_restore = B_TRUE;
+
+ dead++;
+ if (c >= nparity)
+ dead_data++;
+ my_tgts[t++] = c;
+ break;
+ }
+ }
+ }
+ if (dead > nparity) {
+ /* reconstruction not possible */
+ raidz_restore_orig_data(rm);
+ return (EINVAL);
+ }
+ rr->rr_code = 0;
+ if (dead_data > 0)
+ rr->rr_code = vdev_raidz_reconstruct_row(rm, rr,
+ my_tgts, t);
+ }
+
+ /* Check for success */
+ if (raidz_checksum_verify(zio) == 0) {
+
+ /* Reconstruction succeeded - report errors */
+ for (int i = 0; i < rm->rm_nrows; i++) {
+ raidz_row_t *rr = rm->rm_row[i];
+
+ for (int c = 0; c < rr->rr_cols; c++) {
+ raidz_col_t *rc = &rr->rr_col[c];
+ if (rc->rc_need_orig_restore) {
+ /*
+ * Note: if this is a parity column,
+ * we don't really know if it's wrong.
+ * We need to let
+ * vdev_raidz_io_done_verified() check
+ * it, and if we set rc_error, it will
+ * think that it is a "known" error
+ * that doesn't need to be checked
+ * or corrected.
+ */
+ if (rc->rc_error == 0 &&
+ c >= rr->rr_firstdatacol) {
+ raidz_checksum_error(zio,
+ rc, rc->rc_gdata);
+ rc->rc_error =
+ SET_ERROR(ECKSUM);
+ }
+ rc->rc_need_orig_restore = B_FALSE;
+ }
+ }
+
+ vdev_raidz_io_done_verified(zio, rr);
+ }
+
+ zio_checksum_verified(zio);
+
+ return (0);
+ }
+
+ /* Reconstruction failed - restore original data */
+ raidz_restore_orig_data(rm);
+ return (ECKSUM);
+}
+
+/*
+ * Iterate over all combinations of N bad vdevs and attempt a reconstruction.
+ * Note that the algorithm below is non-optimal because it doesn't take into
+ * account how reconstruction is actually performed. For example, with
+ * triple-parity RAID-Z the reconstruction procedure is the same if column 4
+ * is targeted as invalid as if columns 1 and 4 are targeted since in both
+ * cases we'd only use parity information in column 0.
+ *
+ * The order that we find the various possible combinations of failed
+ * disks is dictated by these rules:
+ * - Examine each "slot" (the "i" in tgts[i])
+ * - Try to increment this slot (tgts[i] = tgts[i] + 1)
+ * - if we can't increment because it runs into the next slot,
+ * reset our slot to the minimum, and examine the next slot
+ *
+ * For example, with a 6-wide RAIDZ3, and no known errors (so we have to choose
+ * 3 columns to reconstruct), we will generate the following sequence:
+ *
+ * STATE ACTION
+ * 0 1 2 special case: skip since these are all parity
+ * 0 1 3 first slot: reset to 0; middle slot: increment to 2
+ * 0 2 3 first slot: increment to 1
+ * 1 2 3 first: reset to 0; middle: reset to 1; last: increment to 4
+ * 0 1 4 first: reset to 0; middle: increment to 2
+ * 0 2 4 first: increment to 1
+ * 1 2 4 first: reset to 0; middle: increment to 3
+ * 0 3 4 first: increment to 1
+ * 1 3 4 first: increment to 2
+ * 2 3 4 first: reset to 0; middle: reset to 1; last: increment to 5
+ * 0 1 5 first: reset to 0; middle: increment to 2
+ * 0 2 5 first: increment to 1
+ * 1 2 5 first: reset to 0; middle: increment to 3
+ * 0 3 5 first: increment to 1
+ * 1 3 5 first: increment to 2
+ * 2 3 5 first: reset to 0; middle: increment to 4
+ * 0 4 5 first: increment to 1
+ * 1 4 5 first: increment to 2
+ * 2 4 5 first: increment to 3
+ * 3 4 5 done
+ *
+ * This strategy works for dRAID but is less efficient when there are a large
+ * number of child vdevs and therefore many permutations to check. Furthermore,
+ * since the raidz_map_t rows likely do not overlap, reconstruction would be
+ * possible as long as there are no more than nparity data errors per row.
+ * These additional permutations are not currently checked but could be as
+ * a future improvement.
+ */
+static int
+vdev_raidz_combrec(zio_t *zio)
+{
+ int nparity = vdev_get_nparity(zio->io_vd);
+ raidz_map_t *rm = zio->io_vsd;
+
+ /* Check if there's enough data to attempt reconstruction. */
+ for (int i = 0; i < rm->rm_nrows; i++) {
+ raidz_row_t *rr = rm->rm_row[i];
+ int total_errors = 0;
+
+ for (int c = 0; c < rr->rr_cols; c++) {
+ if (rr->rr_col[c].rc_error)
+ total_errors++;
+ }
+
+ if (total_errors > nparity)
+ return (vdev_raidz_worst_error(rr));
+ }
+
+ for (int num_failures = 1; num_failures <= nparity; num_failures++) {
+ int tstore[VDEV_RAIDZ_MAXPARITY + 2];
+ int *ltgts = &tstore[1]; /* value is logical child ID */
+
+ /* Determine number of logical children, n */
+ int n = zio->io_vd->vdev_children;
+
+ ASSERT3U(num_failures, <=, nparity);
+ ASSERT3U(num_failures, <=, VDEV_RAIDZ_MAXPARITY);
+
+ /* Handle corner cases in combrec logic */
+ ltgts[-1] = -1;
+ for (int i = 0; i < num_failures; i++) {
+ ltgts[i] = i;
+ }
+ ltgts[num_failures] = n;
+
+ for (;;) {
+ int err = raidz_reconstruct(zio, ltgts, num_failures,
+ nparity);
+ if (err == EINVAL) {
+ /*
+ * Reconstruction not possible with this #
+ * failures; try more failures.
+ */
+ break;
+ } else if (err == 0)
+ return (0);
+
+ /* Compute next targets to try */
+ for (int t = 0; ; t++) {
+ ASSERT3U(t, <, num_failures);
+ ltgts[t]++;
+ if (ltgts[t] == n) {
+ /* try more failures */
+ ASSERT3U(t, ==, num_failures - 1);
+ break;
+ }
+
+ ASSERT3U(ltgts[t], <, n);
+ ASSERT3U(ltgts[t], <=, ltgts[t + 1]);
+
+ /*
+ * If that spot is available, we're done here.
+ * Try the next combination.
+ */
+ if (ltgts[t] != ltgts[t + 1])
+ break;
+
+ /*
+ * Otherwise, reset this tgt to the minimum,
+ * and move on to the next tgt.
+ */
+ ltgts[t] = ltgts[t - 1] + 1;
+ ASSERT3U(ltgts[t], ==, t);
+ }
+
+ /* Increase the number of failures and keep trying. */
+ if (ltgts[num_failures - 1] == n)
+ break;
+ }
+ }
+
+ return (ECKSUM);
+}
+
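+/*
+ * Standalone sketch of the slot increment/reset rule described in the
+ * comment above vdev_raidz_combrec(); the function and its visit()
+ * callback are invented for this example and are not part of the I/O
+ * path. For nfail = 3 and n = 6 children it walks the target slots
+ * through the same sequence of states shown in that comment.
+ */
+static inline void
+raidz_combrec_enumerate_sketch(int n, int nfail,
+    void (*visit)(const int *, int))
+{
+	int tgts[VDEV_RAIDZ_MAXPARITY + 1];
+
+	for (int i = 0; i < nfail; i++)
+		tgts[i] = i;
+	tgts[nfail] = n;			/* sentinel, like ltgts[nfail] */
+
+	for (;;) {
+		visit(tgts, nfail);
+
+		/* Advance to the next combination, odometer style. */
+		for (int t = 0; ; t++) {
+			tgts[t]++;
+			if (tgts[t] == n)	/* last slot ran off the end */
+				break;
+			if (tgts[t] != tgts[t + 1])
+				break;		/* found a free spot */
+			/* Collision: reset this slot and carry to the next. */
+			tgts[t] = (t == 0) ? 0 : tgts[t - 1] + 1;
+		}
+		if (tgts[nfail - 1] == n)
+			break;			/* all combinations visited */
+	}
+}
+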
+void
+vdev_raidz_reconstruct(raidz_map_t *rm, const int *t, int nt)
+{
+ for (uint64_t row = 0; row < rm->rm_nrows; row++) {
+ raidz_row_t *rr = rm->rm_row[row];
+ vdev_raidz_reconstruct_row(rm, rr, t, nt);
+ }
+}
+
+/*
+ * Complete a write IO operation on a RAIDZ VDev
+ *
+ * Outline:
+ * 1. Check for errors on the child IOs.
+ * 2. Return, setting an error code if too few child VDevs were written
+ * to reconstruct the data later. Note that partial writes are
+ * considered successful if they can be reconstructed at all.
+ */
+static void
+vdev_raidz_io_done_write_impl(zio_t *zio, raidz_row_t *rr)
+{
+ int total_errors = 0;
+
+ ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol);
+ ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol);
+ ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
+
+ for (int c = 0; c < rr->rr_cols; c++) {
+ raidz_col_t *rc = &rr->rr_col[c];
+
+ if (rc->rc_error) {
+ ASSERT(rc->rc_error != ECKSUM); /* child has no bp */
+
+ total_errors++;
+ }
+ }
+
+ /*
+ * Treat partial writes as a success. If we couldn't write enough
+ * columns to reconstruct the data, the I/O failed. Otherwise,
+ * good enough.
+ *
+ * Now that we support write reallocation, it would be better
+ * to treat partial failure as real failure unless there are
+ * no non-degraded top-level vdevs left, and not update DTLs
+ * if we intend to reallocate.
+ */
+ if (total_errors > rr->rr_firstdatacol) {
+ zio->io_error = zio_worst_error(zio->io_error,
+ vdev_raidz_worst_error(rr));
+ }
+}
+
+/*
+ * return 0 if no reconstruction occurred, otherwise the "code" from
+ * vdev_raidz_reconstruct().
+ */
+static int
+vdev_raidz_io_done_reconstruct_known_missing(zio_t *zio, raidz_map_t *rm,
+ raidz_row_t *rr)
+{
+ int parity_errors = 0;
+ int parity_untried = 0;
+ int data_errors = 0;
+ int total_errors = 0;
+ int code = 0;
+
+ ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol);
+ ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol);
+ ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
+
+ for (int c = 0; c < rr->rr_cols; c++) {
+ raidz_col_t *rc = &rr->rr_col[c];
+
+ if (rc->rc_error) {
+ ASSERT(rc->rc_error != ECKSUM); /* child has no bp */
+
+ if (c < rr->rr_firstdatacol)
+ parity_errors++;
+ else
+ data_errors++;
+
+ total_errors++;
+ } else if (c < rr->rr_firstdatacol && !rc->rc_tried) {
+ parity_untried++;
+ }
+ }
+
+ /*
+ * If there were data errors and the number of errors we saw was
+ * correctable -- less than or equal to the number of parity disks read
+ * -- reconstruct based on the missing data.
+ */
+ if (data_errors != 0 &&
+ total_errors <= rr->rr_firstdatacol - parity_untried) {
+ /*
+ * We either attempt to read all the parity columns or
+ * none of them. If we didn't try to read parity, we
+ * wouldn't be here in the correctable case. There must
+ * also have been fewer parity errors than parity
+ * columns or, again, we wouldn't be in this code path.
+ */
+ ASSERT(parity_untried == 0);
+ ASSERT(parity_errors < rr->rr_firstdatacol);
+
+ /*
+ * Identify the data columns that reported an error.
+ */
+ int n = 0;
+ int tgts[VDEV_RAIDZ_MAXPARITY];
+ for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
+ raidz_col_t *rc = &rr->rr_col[c];
+ if (rc->rc_error != 0) {
+ ASSERT(n < VDEV_RAIDZ_MAXPARITY);
+ tgts[n++] = c;
+ }
+ }
+
+ ASSERT(rr->rr_firstdatacol >= n);
+
+ code = vdev_raidz_reconstruct_row(rm, rr, tgts, n);
+ }
+
+ return (code);
+}
+
+/*
+ * Return the number of reads issued.
+ */
+static int
+vdev_raidz_read_all(zio_t *zio, raidz_row_t *rr)
+{
+ vdev_t *vd = zio->io_vd;
+ int nread = 0;
+
+ rr->rr_missingdata = 0;
+ rr->rr_missingparity = 0;
+
+ /*
+ * If this rows contains empty sectors which are not required
+ * for a normal read then allocate an ABD for them now so they
+ * may be read, verified, and any needed repairs performed.
+ */
+ if (rr->rr_nempty && rr->rr_abd_empty == NULL)
+ vdev_draid_map_alloc_empty(zio, rr);
+
+ for (int c = 0; c < rr->rr_cols; c++) {
+ raidz_col_t *rc = &rr->rr_col[c];
+ if (rc->rc_tried || rc->rc_size == 0)
+ continue;
+
+ zio_nowait(zio_vdev_child_io(zio, NULL,
+ vd->vdev_child[rc->rc_devidx],
+ rc->rc_offset, rc->rc_abd, rc->rc_size,
+ zio->io_type, zio->io_priority, 0,
+ vdev_raidz_child_done, rc));
+ nread++;
+ }
+ return (nread);
+}
+
+/*
+ * We're here because either there were too many errors to even attempt
+ * reconstruction (total_errors == rm_first_datacol), or vdev_*_combrec()
+ * failed. In either case, there is enough bad data to prevent reconstruction.
+ * Start checksum ereports for all children which haven't failed.
+ */
+static void
+vdev_raidz_io_done_unrecoverable(zio_t *zio)
+{
+ raidz_map_t *rm = zio->io_vsd;
+
+ for (int i = 0; i < rm->rm_nrows; i++) {
+ raidz_row_t *rr = rm->rm_row[i];
+
+ for (int c = 0; c < rr->rr_cols; c++) {
+ raidz_col_t *rc = &rr->rr_col[c];
+ vdev_t *cvd = zio->io_vd->vdev_child[rc->rc_devidx];
+
+ if (rc->rc_error != 0)
+ continue;
+
+ zio_bad_cksum_t zbc;
+ zbc.zbc_has_cksum = 0;
+ zbc.zbc_injected = rm->rm_ecksuminjected;
+
+ int ret = zfs_ereport_start_checksum(zio->io_spa,
+ cvd, &zio->io_bookmark, zio, rc->rc_offset,
+ rc->rc_size, (void *)(uintptr_t)c, &zbc);
+ if (ret != EALREADY) {
+ mutex_enter(&cvd->vdev_stat_lock);
+ cvd->vdev_stat.vs_checksum_errors++;
+ mutex_exit(&cvd->vdev_stat_lock);
+ }
+ }
+ }
+}
+
+void
+vdev_raidz_io_done(zio_t *zio)
+{
+ raidz_map_t *rm = zio->io_vsd;
+
+ if (zio->io_type == ZIO_TYPE_WRITE) {
+ for (int i = 0; i < rm->rm_nrows; i++) {
+ vdev_raidz_io_done_write_impl(zio, rm->rm_row[i]);
+ }
+ } else {
+ for (int i = 0; i < rm->rm_nrows; i++) {
+ raidz_row_t *rr = rm->rm_row[i];
+ rr->rr_code =
+ vdev_raidz_io_done_reconstruct_known_missing(zio,
+ rm, rr);
+ }
+
+ if (raidz_checksum_verify(zio) == 0) {
+ for (int i = 0; i < rm->rm_nrows; i++) {
+ raidz_row_t *rr = rm->rm_row[i];
+ vdev_raidz_io_done_verified(zio, rr);
+ }
+ zio_checksum_verified(zio);
+ } else {
+ /*
+ * A sequential resilver has no checksum which makes
+ * combinatoral reconstruction impossible. This code
+ * path is unreachable since raidz_checksum_verify()
+ * has no checksum to verify and must succeed.
+ */
+ ASSERT3U(zio->io_priority, !=, ZIO_PRIORITY_REBUILD);
+
+ /*
+ * This isn't a typical situation -- either we got a
+ * read error or a child silently returned bad data.
+ * Read every block so we can try again with as much
+ * data and parity as we can track down. If we've
+ * already been through once before, all children will
+ * be marked as tried so we'll proceed to combinatorial
+ * reconstruction.
+ */
+ int nread = 0;
+ for (int i = 0; i < rm->rm_nrows; i++) {
+ nread += vdev_raidz_read_all(zio,
+ rm->rm_row[i]);
+ }
+ if (nread != 0) {
+ /*
+ * Normally our stage is VDEV_IO_DONE, but if
+ * we've already called redone(), it will have
+ * changed to VDEV_IO_START, in which case we
+ * don't want to call redone() again.
+ */
+ if (zio->io_stage != ZIO_STAGE_VDEV_IO_START)
+ zio_vdev_io_redone(zio);
+ return;
+ }
+
+ zio->io_error = vdev_raidz_combrec(zio);
+ if (zio->io_error == ECKSUM &&
+ !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
+ vdev_raidz_io_done_unrecoverable(zio);
+ }
+ }
+ }
+}
+
+static void
+vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
+{
+ vdev_raidz_t *vdrz = vd->vdev_tsd;
+ if (faulted > vdrz->vd_nparity)
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_NO_REPLICAS);
+ else if (degraded + faulted != 0)
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
+ else
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
+}
+
+/*
+ * Determine if any portion of the provided block resides on a child vdev
+ * with a dirty DTL and therefore needs to be resilvered. The function
+ * assumes that at least one DTL is dirty which implies that full stripe
+ * width blocks must be resilvered.
+ */
+static boolean_t
+vdev_raidz_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize,
+ uint64_t phys_birth)
+{
+ vdev_raidz_t *vdrz = vd->vdev_tsd;
+ uint64_t dcols = vd->vdev_children;
+ uint64_t nparity = vdrz->vd_nparity;
+ uint64_t ashift = vd->vdev_top->vdev_ashift;
+ /* The starting RAIDZ (parent) vdev sector of the block. */
+ uint64_t b = DVA_GET_OFFSET(dva) >> ashift;
+ /* The zio's size in units of the vdev's minimum sector size. */
+ uint64_t s = ((psize - 1) >> ashift) + 1;
+ /* The first column for this stripe. */
+ uint64_t f = b % dcols;
+
+ /* Unreachable by sequential resilver. */
+ ASSERT3U(phys_birth, !=, TXG_UNKNOWN);
+
+ if (!vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1))
+ return (B_FALSE);
+
+ if (s + nparity >= dcols)
+ return (B_TRUE);
+
+ for (uint64_t c = 0; c < s + nparity; c++) {
+ uint64_t devidx = (f + c) % dcols;
+ vdev_t *cvd = vd->vdev_child[devidx];
+
+ /*
+ * dsl_scan_need_resilver() already checked vd with
+ * vdev_dtl_contains(). So here just check cvd with
+ * vdev_dtl_empty(), cheaper and a good approximation.
+ */
+ if (!vdev_dtl_empty(cvd, DTL_PARTIAL))
+ return (B_TRUE);
+ }
+
+ return (B_FALSE);
+}
+
+static void
+vdev_raidz_xlate(vdev_t *cvd, const range_seg64_t *logical_rs,
+ range_seg64_t *physical_rs, range_seg64_t *remain_rs)
+{
+ vdev_t *raidvd = cvd->vdev_parent;
+ ASSERT(raidvd->vdev_ops == &vdev_raidz_ops);
+
+ uint64_t width = raidvd->vdev_children;
+ uint64_t tgt_col = cvd->vdev_id;
+ uint64_t ashift = raidvd->vdev_top->vdev_ashift;
+
+ /* make sure the offsets are block-aligned */
+ ASSERT0(logical_rs->rs_start % (1 << ashift));
+ ASSERT0(logical_rs->rs_end % (1 << ashift));
+ uint64_t b_start = logical_rs->rs_start >> ashift;
+ uint64_t b_end = logical_rs->rs_end >> ashift;
+
+ uint64_t start_row = 0;
+ if (b_start > tgt_col) /* avoid underflow */
+ start_row = ((b_start - tgt_col - 1) / width) + 1;
+
+ uint64_t end_row = 0;
+ if (b_end > tgt_col)
+ end_row = ((b_end - tgt_col - 1) / width) + 1;
+
+ physical_rs->rs_start = start_row << ashift;
+ physical_rs->rs_end = end_row << ashift;
+
+ ASSERT3U(physical_rs->rs_start, <=, logical_rs->rs_start);
+ ASSERT3U(physical_rs->rs_end - physical_rs->rs_start, <=,
+ logical_rs->rs_end - logical_rs->rs_start);
+}
+
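+/*
+ * Worked example (sketch only, not wired into the vdev ops): the row
+ * arithmetic of vdev_raidz_xlate() above for a 4-wide raidz parent with
+ * ashift = 9 and child column 1. The logical range [0, 8192) covers
+ * parent sectors 0..15, which maps to child rows [0, 4), i.e. a physical
+ * range of [0, 2048) on that child.
+ */
+static inline uint64_t
+vdev_raidz_xlate_sketch(void)
+{
+	const uint64_t width = 4, tgt_col = 1, ashift = 9;
+	const uint64_t b_start = 0, b_end = 8192 >> ashift;	/* 0, 16 */
+
+	uint64_t start_row = (b_start > tgt_col) ?
+	    ((b_start - tgt_col - 1) / width) + 1 : 0;		/* 0 */
+	uint64_t end_row = (b_end > tgt_col) ?
+	    ((b_end - tgt_col - 1) / width) + 1 : 0;		/* 4 */
+
+	return ((end_row - start_row) << ashift);		/* 2048 */
+}
+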
+/*
+ * Initialize private RAIDZ specific fields from the nvlist.
+ */
+static int
+vdev_raidz_init(spa_t *spa, nvlist_t *nv, void **tsd)
+{
+ vdev_raidz_t *vdrz;
+ uint64_t nparity;
+
+ uint_t children;
+ nvlist_t **child;
+ int error = nvlist_lookup_nvlist_array(nv,
+ ZPOOL_CONFIG_CHILDREN, &child, &children);
+ if (error != 0)
+ return (SET_ERROR(EINVAL));
+
+ if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &nparity) == 0) {
+ if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY)
+ return (SET_ERROR(EINVAL));
+
+ /*
+ * Previous versions could only support 1 or 2 parity
+ * devices.
+ */
+ if (nparity > 1 && spa_version(spa) < SPA_VERSION_RAIDZ2)
+ return (SET_ERROR(EINVAL));
+ else if (nparity > 2 && spa_version(spa) < SPA_VERSION_RAIDZ3)
+ return (SET_ERROR(EINVAL));
+ } else {
+ /*
+ * We require the parity to be specified for SPAs that
+ * support multiple parity levels.
+ */
+ if (spa_version(spa) >= SPA_VERSION_RAIDZ2)
+ return (SET_ERROR(EINVAL));
+
+ /*
+ * Otherwise, we default to 1 parity device for RAID-Z.
+ */
+ nparity = 1;
+ }
+
+ vdrz = kmem_zalloc(sizeof (*vdrz), KM_SLEEP);
+ vdrz->vd_logical_width = children;
+ vdrz->vd_nparity = nparity;
+
+ *tsd = vdrz;
+
+ return (0);
+}
+
+static void
+vdev_raidz_fini(vdev_t *vd)
+{
+ kmem_free(vd->vdev_tsd, sizeof (vdev_raidz_t));
+}
+
+/*
+ * Add RAIDZ specific fields to the config nvlist.
+ */
+static void
+vdev_raidz_config_generate(vdev_t *vd, nvlist_t *nv)
+{
+ ASSERT3P(vd->vdev_ops, ==, &vdev_raidz_ops);
+ vdev_raidz_t *vdrz = vd->vdev_tsd;
+
+ /*
+ * Make sure someone hasn't managed to sneak a fancy new vdev
+ * into a crufty old storage pool.
+ */
+ ASSERT(vdrz->vd_nparity == 1 ||
+ (vdrz->vd_nparity <= 2 &&
+ spa_version(vd->vdev_spa) >= SPA_VERSION_RAIDZ2) ||
+ (vdrz->vd_nparity <= 3 &&
+ spa_version(vd->vdev_spa) >= SPA_VERSION_RAIDZ3));
+
+ /*
+ * Note that we'll add these even on storage pools where they
+ * aren't strictly required -- older software will just ignore
+ * them.
+ */
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vdrz->vd_nparity);
+}
+
+static uint64_t
+vdev_raidz_nparity(vdev_t *vd)
+{
+ vdev_raidz_t *vdrz = vd->vdev_tsd;
+ return (vdrz->vd_nparity);
+}
+
+static uint64_t
+vdev_raidz_ndisks(vdev_t *vd)
+{
+ return (vd->vdev_children);
+}
+
+vdev_ops_t vdev_raidz_ops = {
+ .vdev_op_init = vdev_raidz_init,
+ .vdev_op_fini = vdev_raidz_fini,
+ .vdev_op_open = vdev_raidz_open,
+ .vdev_op_close = vdev_raidz_close,
+ .vdev_op_asize = vdev_raidz_asize,
+ .vdev_op_min_asize = vdev_raidz_min_asize,
+ .vdev_op_min_alloc = NULL,
+ .vdev_op_io_start = vdev_raidz_io_start,
+ .vdev_op_io_done = vdev_raidz_io_done,
+ .vdev_op_state_change = vdev_raidz_state_change,
+ .vdev_op_need_resilver = vdev_raidz_need_resilver,
+ .vdev_op_hold = NULL,
+ .vdev_op_rele = NULL,
+ .vdev_op_remap = NULL,
+ .vdev_op_xlate = vdev_raidz_xlate,
+ .vdev_op_rebuild_asize = NULL,
+ .vdev_op_metaslab_init = NULL,
+ .vdev_op_config_generate = vdev_raidz_config_generate,
+ .vdev_op_nparity = vdev_raidz_nparity,
+ .vdev_op_ndisks = vdev_raidz_ndisks,
+ .vdev_op_type = VDEV_TYPE_RAIDZ, /* name of this vdev type */
+ .vdev_op_leaf = B_FALSE /* not a leaf vdev */
+};
diff --git a/sys/contrib/openzfs/module/zfs/vdev_raidz_math.c b/sys/contrib/openzfs/module/zfs/vdev_raidz_math.c
new file mode 100644
index 000000000000..25d76970e99a
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/vdev_raidz_math.c
@@ -0,0 +1,666 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (C) 2016 Gvozden Nešković. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/types.h>
+#include <sys/zio.h>
+#include <sys/debug.h>
+#include <sys/zfs_debug.h>
+#include <sys/vdev_raidz.h>
+#include <sys/vdev_raidz_impl.h>
+#include <sys/simd.h>
+
+/* Opaque implementation with NULL methods to represent original methods */
+static const raidz_impl_ops_t vdev_raidz_original_impl = {
+ .name = "original",
+ .is_supported = raidz_will_scalar_work,
+};
+
+/* RAIDZ parity op that contains the fastest methods */
+static raidz_impl_ops_t vdev_raidz_fastest_impl = {
+ .name = "fastest"
+};
+
+/* All compiled in implementations */
+const raidz_impl_ops_t *raidz_all_maths[] = {
+ &vdev_raidz_original_impl,
+ &vdev_raidz_scalar_impl,
+#if defined(__x86_64) && defined(HAVE_SSE2) /* only x86_64 for now */
+ &vdev_raidz_sse2_impl,
+#endif
+#if defined(__x86_64) && defined(HAVE_SSSE3) /* only x86_64 for now */
+ &vdev_raidz_ssse3_impl,
+#endif
+#if defined(__x86_64) && defined(HAVE_AVX2) /* only x86_64 for now */
+ &vdev_raidz_avx2_impl,
+#endif
+#if defined(__x86_64) && defined(HAVE_AVX512F) /* only x86_64 for now */
+ &vdev_raidz_avx512f_impl,
+#endif
+#if defined(__x86_64) && defined(HAVE_AVX512BW) /* only x86_64 for now */
+ &vdev_raidz_avx512bw_impl,
+#endif
+#if defined(__aarch64__) && !defined(__FreeBSD__)
+ &vdev_raidz_aarch64_neon_impl,
+ &vdev_raidz_aarch64_neonx2_impl,
+#endif
+#if defined(__powerpc__) && defined(__altivec__)
+ &vdev_raidz_powerpc_altivec_impl,
+#endif
+};
+
+/* Indicate that benchmark has been completed */
+static boolean_t raidz_math_initialized = B_FALSE;
+
+/* Select raidz implementation */
+#define IMPL_FASTEST (UINT32_MAX)
+#define IMPL_CYCLE (UINT32_MAX - 1)
+#define IMPL_ORIGINAL (0)
+#define IMPL_SCALAR (1)
+
+#define RAIDZ_IMPL_READ(i) (*(volatile uint32_t *) &(i))
+
+static uint32_t zfs_vdev_raidz_impl = IMPL_SCALAR;
+static uint32_t user_sel_impl = IMPL_FASTEST;
+
+/* Hold all supported implementations */
+static size_t raidz_supp_impl_cnt = 0;
+static raidz_impl_ops_t *raidz_supp_impl[ARRAY_SIZE(raidz_all_maths)];
+
+#if defined(_KERNEL)
+/*
+ * kstats values for supported implementations
+ * Values represent per disk throughput of 8 disk+parity raidz vdev [B/s]
+ */
+static raidz_impl_kstat_t raidz_impl_kstats[ARRAY_SIZE(raidz_all_maths) + 1];
+
+/* kstat for benchmarked implementations */
+static kstat_t *raidz_math_kstat = NULL;
+#endif
+
+/*
+ * Returns the RAIDZ operations for raidz_map() parity calculations. When
+ * a SIMD implementation is not allowed in the current context, fall back
+ * to the fastest generic implementation.
+ */
+const raidz_impl_ops_t *
+vdev_raidz_math_get_ops(void)
+{
+ if (!kfpu_allowed())
+ return (&vdev_raidz_scalar_impl);
+
+ raidz_impl_ops_t *ops = NULL;
+ const uint32_t impl = RAIDZ_IMPL_READ(zfs_vdev_raidz_impl);
+
+ switch (impl) {
+ case IMPL_FASTEST:
+ ASSERT(raidz_math_initialized);
+ ops = &vdev_raidz_fastest_impl;
+ break;
+ case IMPL_CYCLE:
+ /* Cycle through all supported implementations */
+ ASSERT(raidz_math_initialized);
+ ASSERT3U(raidz_supp_impl_cnt, >, 0);
+ static size_t cycle_impl_idx = 0;
+ size_t idx = (++cycle_impl_idx) % raidz_supp_impl_cnt;
+ ops = raidz_supp_impl[idx];
+ break;
+ case IMPL_ORIGINAL:
+ ops = (raidz_impl_ops_t *)&vdev_raidz_original_impl;
+ break;
+ case IMPL_SCALAR:
+ ops = (raidz_impl_ops_t *)&vdev_raidz_scalar_impl;
+ break;
+ default:
+ ASSERT3U(impl, <, raidz_supp_impl_cnt);
+ ASSERT3U(raidz_supp_impl_cnt, >, 0);
+ if (impl < ARRAY_SIZE(raidz_all_maths))
+ ops = raidz_supp_impl[impl];
+ break;
+ }
+
+ ASSERT3P(ops, !=, NULL);
+
+ return (ops);
+}
+
+/*
+ * Select parity generation method for raidz_map
+ */
+int
+vdev_raidz_math_generate(raidz_map_t *rm, raidz_row_t *rr)
+{
+ raidz_gen_f gen_parity = NULL;
+
+ switch (raidz_parity(rm)) {
+ case 1:
+ gen_parity = rm->rm_ops->gen[RAIDZ_GEN_P];
+ break;
+ case 2:
+ gen_parity = rm->rm_ops->gen[RAIDZ_GEN_PQ];
+ break;
+ case 3:
+ gen_parity = rm->rm_ops->gen[RAIDZ_GEN_PQR];
+ break;
+ default:
+ gen_parity = NULL;
+ cmn_err(CE_PANIC, "invalid RAID-Z configuration %d",
+ raidz_parity(rm));
+ break;
+ }
+
+ /* if method is NULL execute the original implementation */
+ if (gen_parity == NULL)
+ return (RAIDZ_ORIGINAL_IMPL);
+
+ gen_parity(rr);
+
+ return (0);
+}
+
+static raidz_rec_f
+reconstruct_fun_p_sel(raidz_map_t *rm, const int *parity_valid,
+ const int nbaddata)
+{
+ if (nbaddata == 1 && parity_valid[CODE_P]) {
+ return (rm->rm_ops->rec[RAIDZ_REC_P]);
+ }
+ return ((raidz_rec_f) NULL);
+}
+
+static raidz_rec_f
+reconstruct_fun_pq_sel(raidz_map_t *rm, const int *parity_valid,
+ const int nbaddata)
+{
+ if (nbaddata == 1) {
+ if (parity_valid[CODE_P]) {
+ return (rm->rm_ops->rec[RAIDZ_REC_P]);
+ } else if (parity_valid[CODE_Q]) {
+ return (rm->rm_ops->rec[RAIDZ_REC_Q]);
+ }
+ } else if (nbaddata == 2 &&
+ parity_valid[CODE_P] && parity_valid[CODE_Q]) {
+ return (rm->rm_ops->rec[RAIDZ_REC_PQ]);
+ }
+ return ((raidz_rec_f) NULL);
+}
+
+static raidz_rec_f
+reconstruct_fun_pqr_sel(raidz_map_t *rm, const int *parity_valid,
+ const int nbaddata)
+{
+ if (nbaddata == 1) {
+ if (parity_valid[CODE_P]) {
+ return (rm->rm_ops->rec[RAIDZ_REC_P]);
+ } else if (parity_valid[CODE_Q]) {
+ return (rm->rm_ops->rec[RAIDZ_REC_Q]);
+ } else if (parity_valid[CODE_R]) {
+ return (rm->rm_ops->rec[RAIDZ_REC_R]);
+ }
+ } else if (nbaddata == 2) {
+ if (parity_valid[CODE_P] && parity_valid[CODE_Q]) {
+ return (rm->rm_ops->rec[RAIDZ_REC_PQ]);
+ } else if (parity_valid[CODE_P] && parity_valid[CODE_R]) {
+ return (rm->rm_ops->rec[RAIDZ_REC_PR]);
+ } else if (parity_valid[CODE_Q] && parity_valid[CODE_R]) {
+ return (rm->rm_ops->rec[RAIDZ_REC_QR]);
+ }
+ } else if (nbaddata == 3 &&
+ parity_valid[CODE_P] && parity_valid[CODE_Q] &&
+ parity_valid[CODE_R]) {
+ return (rm->rm_ops->rec[RAIDZ_REC_PQR]);
+ }
+ return ((raidz_rec_f) NULL);
+}
+
+/*
+ * Select data reconstruction method for raidz_map
+ * @parity_valid - Parity validity flag
+ * @dt - Failed data index array
+ * @nbaddata - Number of failed data columns
+ */
+int
+vdev_raidz_math_reconstruct(raidz_map_t *rm, raidz_row_t *rr,
+ const int *parity_valid, const int *dt, const int nbaddata)
+{
+ raidz_rec_f rec_fn = NULL;
+
+ switch (raidz_parity(rm)) {
+ case PARITY_P:
+ rec_fn = reconstruct_fun_p_sel(rm, parity_valid, nbaddata);
+ break;
+ case PARITY_PQ:
+ rec_fn = reconstruct_fun_pq_sel(rm, parity_valid, nbaddata);
+ break;
+ case PARITY_PQR:
+ rec_fn = reconstruct_fun_pqr_sel(rm, parity_valid, nbaddata);
+ break;
+ default:
+ cmn_err(CE_PANIC, "invalid RAID-Z configuration %d",
+ raidz_parity(rm));
+ break;
+ }
+
+ if (rec_fn == NULL)
+ return (RAIDZ_ORIGINAL_IMPL);
+ else
+ return (rec_fn(rr, dt));
+}
+
+const char *raidz_gen_name[] = {
+ "gen_p", "gen_pq", "gen_pqr"
+};
+const char *raidz_rec_name[] = {
+ "rec_p", "rec_q", "rec_r",
+ "rec_pq", "rec_pr", "rec_qr", "rec_pqr"
+};
+
+#if defined(_KERNEL)
+
+#define RAIDZ_KSTAT_LINE_LEN (17 + 10*12 + 1)
+
+static int
+raidz_math_kstat_headers(char *buf, size_t size)
+{
+ int i;
+ ssize_t off;
+
+ ASSERT3U(size, >=, RAIDZ_KSTAT_LINE_LEN);
+
+ off = snprintf(buf, size, "%-17s", "implementation");
+
+ for (i = 0; i < ARRAY_SIZE(raidz_gen_name); i++)
+ off += snprintf(buf + off, size - off, "%-16s",
+ raidz_gen_name[i]);
+
+ for (i = 0; i < ARRAY_SIZE(raidz_rec_name); i++)
+ off += snprintf(buf + off, size - off, "%-16s",
+ raidz_rec_name[i]);
+
+ (void) snprintf(buf + off, size - off, "\n");
+
+ return (0);
+}
+
+static int
+raidz_math_kstat_data(char *buf, size_t size, void *data)
+{
+ raidz_impl_kstat_t *fstat = &raidz_impl_kstats[raidz_supp_impl_cnt];
+ raidz_impl_kstat_t *cstat = (raidz_impl_kstat_t *)data;
+ ssize_t off = 0;
+ int i;
+
+ ASSERT3U(size, >=, RAIDZ_KSTAT_LINE_LEN);
+
+ if (cstat == fstat) {
+ off += snprintf(buf + off, size - off, "%-17s", "fastest");
+
+ for (i = 0; i < ARRAY_SIZE(raidz_gen_name); i++) {
+ int id = fstat->gen[i];
+ off += snprintf(buf + off, size - off, "%-16s",
+ raidz_supp_impl[id]->name);
+ }
+ for (i = 0; i < ARRAY_SIZE(raidz_rec_name); i++) {
+ int id = fstat->rec[i];
+ off += snprintf(buf + off, size - off, "%-16s",
+ raidz_supp_impl[id]->name);
+ }
+ } else {
+ ptrdiff_t id = cstat - raidz_impl_kstats;
+
+ off += snprintf(buf + off, size - off, "%-17s",
+ raidz_supp_impl[id]->name);
+
+ for (i = 0; i < ARRAY_SIZE(raidz_gen_name); i++)
+ off += snprintf(buf + off, size - off, "%-16llu",
+ (u_longlong_t)cstat->gen[i]);
+
+ for (i = 0; i < ARRAY_SIZE(raidz_rec_name); i++)
+ off += snprintf(buf + off, size - off, "%-16llu",
+ (u_longlong_t)cstat->rec[i]);
+ }
+
+ (void) snprintf(buf + off, size - off, "\n");
+
+ return (0);
+}
+
+static void *
+raidz_math_kstat_addr(kstat_t *ksp, loff_t n)
+{
+ if (n <= raidz_supp_impl_cnt)
+ ksp->ks_private = (void *) (raidz_impl_kstats + n);
+ else
+ ksp->ks_private = NULL;
+
+ return (ksp->ks_private);
+}
+
+#define BENCH_D_COLS (8ULL)
+#define BENCH_COLS (BENCH_D_COLS + PARITY_PQR)
+#define BENCH_ZIO_SIZE (1ULL << SPA_OLD_MAXBLOCKSHIFT) /* 128 kiB */
+#define BENCH_NS MSEC2NSEC(1) /* 1ms */
+
+typedef void (*benchmark_fn)(raidz_map_t *rm, const int fn);
+
+static void
+benchmark_gen_impl(raidz_map_t *rm, const int fn)
+{
+ (void) fn;
+ vdev_raidz_generate_parity(rm);
+}
+
+static void
+benchmark_rec_impl(raidz_map_t *rm, const int fn)
+{
+ static const int rec_tgt[7][3] = {
+ {1, 2, 3}, /* rec_p: bad QR & D[0] */
+ {0, 2, 3}, /* rec_q: bad PR & D[0] */
+ {0, 1, 3}, /* rec_r: bad PQ & D[0] */
+ {2, 3, 4}, /* rec_pq: bad R & D[0][1] */
+ {1, 3, 4}, /* rec_pr: bad Q & D[0][1] */
+ {0, 3, 4}, /* rec_qr: bad P & D[0][1] */
+ {3, 4, 5} /* rec_pqr: bad D[0][1][2] */
+ };
+
+ vdev_raidz_reconstruct(rm, rec_tgt[fn], 3);
+}
+
+/*
+ * Benchmarking of all supported implementations (raidz_supp_impl_cnt)
+ * is performed by setting the rm_ops pointer and calling the top level
+ * generate/reconstruct methods of bench_rm.
+ */
+static void
+benchmark_raidz_impl(raidz_map_t *bench_rm, const int fn, benchmark_fn bench_fn)
+{
+ uint64_t run_cnt, speed, best_speed = 0;
+ hrtime_t t_start, t_diff;
+ raidz_impl_ops_t *curr_impl;
+ raidz_impl_kstat_t *fstat = &raidz_impl_kstats[raidz_supp_impl_cnt];
+ int impl, i;
+
+ for (impl = 0; impl < raidz_supp_impl_cnt; impl++) {
+ /* set an implementation to benchmark */
+ curr_impl = raidz_supp_impl[impl];
+ bench_rm->rm_ops = curr_impl;
+
+ run_cnt = 0;
+ t_start = gethrtime();
+
+ do {
+ for (i = 0; i < 5; i++, run_cnt++)
+ bench_fn(bench_rm, fn);
+
+ t_diff = gethrtime() - t_start;
+ } while (t_diff < BENCH_NS);
+
+ speed = run_cnt * BENCH_ZIO_SIZE * NANOSEC;
+ speed /= (t_diff * BENCH_COLS);
+
+ if (bench_fn == benchmark_gen_impl)
+ raidz_impl_kstats[impl].gen[fn] = speed;
+ else
+ raidz_impl_kstats[impl].rec[fn] = speed;
+
+ /* Update fastest implementation method */
+ if (speed > best_speed) {
+ best_speed = speed;
+
+ if (bench_fn == benchmark_gen_impl) {
+ fstat->gen[fn] = impl;
+ vdev_raidz_fastest_impl.gen[fn] =
+ curr_impl->gen[fn];
+ } else {
+ fstat->rec[fn] = impl;
+ vdev_raidz_fastest_impl.rec[fn] =
+ curr_impl->rec[fn];
+ }
+ }
+ }
+}
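+
+/*
+ * Worked example of the speed calculation above (illustrative numbers):
+ * with BENCH_ZIO_SIZE = 128 KiB (131072 bytes), BENCH_COLS = 8 + 3 = 11,
+ * run_cnt = 5000 and t_diff = 1,000,000 ns, the recorded value is
+ *
+ *	speed = 5000 * 131072 * NANOSEC / (1,000,000 * 11)
+ *	      = 5000 * 131072 * 1,000,000,000 / 11,000,000
+ *	      ~= 59.6e9
+ *
+ * i.e. a normalized throughput in bytes per second, which is what the
+ * vdev_raidz_bench kstat records for each implementation and method.
+ */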
+#endif
+
+/*
+ * Initialize and benchmark all supported implementations.
+ */
+static void
+benchmark_raidz(void)
+{
+ raidz_impl_ops_t *curr_impl;
+ int i, c;
+
+ /* Move supported impl into raidz_supp_impl */
+ for (i = 0, c = 0; i < ARRAY_SIZE(raidz_all_maths); i++) {
+ curr_impl = (raidz_impl_ops_t *)raidz_all_maths[i];
+
+ if (curr_impl->init)
+ curr_impl->init();
+
+ if (curr_impl->is_supported())
+ raidz_supp_impl[c++] = (raidz_impl_ops_t *)curr_impl;
+ }
+ membar_producer(); /* complete raidz_supp_impl[] init */
+ raidz_supp_impl_cnt = c; /* number of supported impl */
+
+#if defined(_KERNEL)
+ zio_t *bench_zio = NULL;
+ raidz_map_t *bench_rm = NULL;
+ uint64_t bench_parity;
+
+ /* Fake a zio and run the benchmark on a warmed up buffer */
+ bench_zio = kmem_zalloc(sizeof (zio_t), KM_SLEEP);
+ bench_zio->io_offset = 0;
+ bench_zio->io_size = BENCH_ZIO_SIZE; /* only data columns */
+ bench_zio->io_abd = abd_alloc_linear(BENCH_ZIO_SIZE, B_TRUE);
+ memset(abd_to_buf(bench_zio->io_abd), 0xAA, BENCH_ZIO_SIZE);
+
+ /* Benchmark parity generation methods */
+ for (int fn = 0; fn < RAIDZ_GEN_NUM; fn++) {
+ bench_parity = fn + 1;
+ /* New raidz_map is needed for each generate_p/q/r */
+ bench_rm = vdev_raidz_map_alloc(bench_zio, SPA_MINBLOCKSHIFT,
+ BENCH_D_COLS + bench_parity, bench_parity);
+
+ benchmark_raidz_impl(bench_rm, fn, benchmark_gen_impl);
+
+ vdev_raidz_map_free(bench_rm);
+ }
+
+ /* Benchmark data reconstruction methods */
+ bench_rm = vdev_raidz_map_alloc(bench_zio, SPA_MINBLOCKSHIFT,
+ BENCH_COLS, PARITY_PQR);
+
+ for (int fn = 0; fn < RAIDZ_REC_NUM; fn++)
+ benchmark_raidz_impl(bench_rm, fn, benchmark_rec_impl);
+
+ vdev_raidz_map_free(bench_rm);
+
+ /* cleanup the bench zio */
+ abd_free(bench_zio->io_abd);
+ kmem_free(bench_zio, sizeof (zio_t));
+#else
+ /*
+ * Skip the benchmark in user space to avoid impacting libzpool
+ * consumers (zdb, zhack, zinject, ztest). The last implementation
+ * is assumed to be the fastest and is used by default.
+ */
+ memcpy(&vdev_raidz_fastest_impl,
+ raidz_supp_impl[raidz_supp_impl_cnt - 1],
+ sizeof (vdev_raidz_fastest_impl));
+ strcpy(vdev_raidz_fastest_impl.name, "fastest");
+#endif /* _KERNEL */
+}
+
+void
+vdev_raidz_math_init(void)
+{
+ /* Determine the fastest available implementation. */
+ benchmark_raidz();
+
+#if defined(_KERNEL)
+ /* Install kstats for all implementations */
+ raidz_math_kstat = kstat_create("zfs", 0, "vdev_raidz_bench", "misc",
+ KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL);
+ if (raidz_math_kstat != NULL) {
+ raidz_math_kstat->ks_data = NULL;
+ raidz_math_kstat->ks_ndata = UINT32_MAX;
+ kstat_set_raw_ops(raidz_math_kstat,
+ raidz_math_kstat_headers,
+ raidz_math_kstat_data,
+ raidz_math_kstat_addr);
+ kstat_install(raidz_math_kstat);
+ }
+#endif
+
+ /* Finish initialization */
+ atomic_swap_32(&zfs_vdev_raidz_impl, user_sel_impl);
+ raidz_math_initialized = B_TRUE;
+}
+
+void
+vdev_raidz_math_fini(void)
+{
+ raidz_impl_ops_t const *curr_impl;
+
+#if defined(_KERNEL)
+ if (raidz_math_kstat != NULL) {
+ kstat_delete(raidz_math_kstat);
+ raidz_math_kstat = NULL;
+ }
+#endif
+
+ for (int i = 0; i < ARRAY_SIZE(raidz_all_maths); i++) {
+ curr_impl = raidz_all_maths[i];
+ if (curr_impl->fini)
+ curr_impl->fini();
+ }
+}
+
+static const struct {
+ char *name;
+ uint32_t sel;
+} math_impl_opts[] = {
+ { "cycle", IMPL_CYCLE },
+ { "fastest", IMPL_FASTEST },
+ { "original", IMPL_ORIGINAL },
+ { "scalar", IMPL_SCALAR }
+};
+
+/*
+ * Set the desired raidz implementation.
+ *
+ * If we are called before vdev_raidz_math_init(), the user preference is
+ * saved in user_sel_impl and applied in the later init() call. This happens
+ * when the module parameter is specified on module load. Otherwise,
+ * zfs_vdev_raidz_impl is updated directly.
+ *
+ * @val Name of the raidz implementation to use
+ */
+int
+vdev_raidz_impl_set(const char *val)
+{
+ int err = -EINVAL;
+ char req_name[RAIDZ_IMPL_NAME_MAX];
+ uint32_t impl = RAIDZ_IMPL_READ(user_sel_impl);
+ size_t i;
+
+ /* sanitize input */
+ i = strnlen(val, RAIDZ_IMPL_NAME_MAX);
+ if (i == 0 || i == RAIDZ_IMPL_NAME_MAX)
+ return (err);
+
+ strlcpy(req_name, val, RAIDZ_IMPL_NAME_MAX);
+ while (i > 0 && !!isspace(req_name[i-1]))
+ i--;
+ req_name[i] = '\0';
+
+ /* Check mandatory options */
+ for (i = 0; i < ARRAY_SIZE(math_impl_opts); i++) {
+ if (strcmp(req_name, math_impl_opts[i].name) == 0) {
+ impl = math_impl_opts[i].sel;
+ err = 0;
+ break;
+ }
+ }
+
+ /* check all supported impl if init() was already called */
+ if (err != 0 && raidz_math_initialized) {
+ /* check all supported implementations */
+ for (i = 0; i < raidz_supp_impl_cnt; i++) {
+ if (strcmp(req_name, raidz_supp_impl[i]->name) == 0) {
+ impl = i;
+ err = 0;
+ break;
+ }
+ }
+ }
+
+ if (err == 0) {
+ if (raidz_math_initialized)
+ atomic_swap_32(&zfs_vdev_raidz_impl, impl);
+ else
+ atomic_swap_32(&user_sel_impl, impl);
+ }
+
+ return (err);
+}
+
+#if defined(_KERNEL) && defined(__linux__)
+
+static int
+zfs_vdev_raidz_impl_set(const char *val, zfs_kernel_param_t *kp)
+{
+ return (vdev_raidz_impl_set(val));
+}
+
+static int
+zfs_vdev_raidz_impl_get(char *buffer, zfs_kernel_param_t *kp)
+{
+ int i, cnt = 0;
+ char *fmt;
+ const uint32_t impl = RAIDZ_IMPL_READ(zfs_vdev_raidz_impl);
+
+ ASSERT(raidz_math_initialized);
+
+ /* list mandatory options (skip the last two; they also appear in raidz_supp_impl) */
+ for (i = 0; i < ARRAY_SIZE(math_impl_opts) - 2; i++) {
+ fmt = (impl == math_impl_opts[i].sel) ? "[%s] " : "%s ";
+ cnt += sprintf(buffer + cnt, fmt, math_impl_opts[i].name);
+ }
+
+ /* list all supported implementations */
+ for (i = 0; i < raidz_supp_impl_cnt; i++) {
+ fmt = (i == impl) ? "[%s] " : "%s ";
+ cnt += sprintf(buffer + cnt, fmt, raidz_supp_impl[i]->name);
+ }
+
+ return (cnt);
+}
+
+module_param_call(zfs_vdev_raidz_impl, zfs_vdev_raidz_impl_set,
+ zfs_vdev_raidz_impl_get, NULL, 0644);
+MODULE_PARM_DESC(zfs_vdev_raidz_impl, "Select raidz implementation.");
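+
+/*
+ * Example usage (illustrative; assumes a Linux system with the zfs module
+ * loaded and an implementation such as "avx2" supported by the CPU):
+ *
+ *	# select an implementation at module load time
+ *	modprobe zfs zfs_vdev_raidz_impl=fastest
+ *
+ *	# or change it at runtime through sysfs
+ *	echo avx2 > /sys/module/zfs/parameters/zfs_vdev_raidz_impl
+ */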
+#endif
diff --git a/sys/contrib/openzfs/module/zfs/vdev_raidz_math_aarch64_neon.c b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_aarch64_neon.c
new file mode 100644
index 000000000000..0a67ceb84920
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_aarch64_neon.c
@@ -0,0 +1,2279 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (C) 2016 Romain Dolbeau. All rights reserved.
+ */
+
+#include <sys/isa_defs.h>
+#include <sys/types.h>
+
+#if defined(__aarch64__)
+
+#include "vdev_raidz_math_aarch64_neon_common.h"
+
+#define SYN_STRIDE 4
+
+#define ZERO_STRIDE 4
+#define ZERO_DEFINE() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_33_36()
+#define ZERO_D 0, 1, 2, 3
+
+#define COPY_STRIDE 4
+#define COPY_DEFINE() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_33_36()
+#define COPY_D 0, 1, 2, 3
+
+#define ADD_STRIDE 4
+#define ADD_DEFINE() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_33_36()
+#define ADD_D 0, 1, 2, 3
+
+#define MUL_STRIDE 4
+#define MUL_DEFINE() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_33_36()
+#define MUL_D 0, 1, 2, 3
+
+#define GEN_P_DEFINE() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_33_36()
+#define GEN_P_STRIDE 4
+#define GEN_P_P 0, 1, 2, 3
+
+#define GEN_PQ_DEFINE() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_4_5() \
+ GEN_X_DEFINE_6_7() \
+ GEN_X_DEFINE_16() \
+ GEN_X_DEFINE_17() \
+ GEN_X_DEFINE_33_36()
+#define GEN_PQ_STRIDE 4
+#define GEN_PQ_D 0, 1, 2, 3
+#define GEN_PQ_C 4, 5, 6, 7
+
+#define GEN_PQR_DEFINE() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_4_5() \
+ GEN_X_DEFINE_6_7() \
+ GEN_X_DEFINE_16() \
+ GEN_X_DEFINE_17() \
+ GEN_X_DEFINE_33_36()
+#define GEN_PQR_STRIDE 4
+#define GEN_PQR_D 0, 1, 2, 3
+#define GEN_PQR_C 4, 5, 6, 7
+
+#define SYN_Q_DEFINE() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_4_5() \
+ GEN_X_DEFINE_6_7() \
+ GEN_X_DEFINE_16() \
+ GEN_X_DEFINE_17() \
+ GEN_X_DEFINE_33_36()
+#define SYN_Q_STRIDE 4
+#define SYN_Q_D 0, 1, 2, 3
+#define SYN_Q_X 4, 5, 6, 7
+
+#define SYN_R_DEFINE() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_4_5() \
+ GEN_X_DEFINE_6_7() \
+ GEN_X_DEFINE_16() \
+ GEN_X_DEFINE_17() \
+ GEN_X_DEFINE_33_36()
+#define SYN_R_STRIDE 4
+#define SYN_R_D 0, 1, 2, 3
+#define SYN_R_X 4, 5, 6, 7
+
+#define SYN_PQ_DEFINE() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_4_5() \
+ GEN_X_DEFINE_6_7() \
+ GEN_X_DEFINE_16() \
+ GEN_X_DEFINE_17() \
+ GEN_X_DEFINE_33_36()
+#define SYN_PQ_STRIDE 4
+#define SYN_PQ_D 0, 1, 2, 3
+#define SYN_PQ_X 4, 5, 6, 7
+
+#define REC_PQ_DEFINE() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_4_5() \
+ GEN_X_DEFINE_31() \
+ GEN_X_DEFINE_32() \
+ GEN_X_DEFINE_33_36()
+#define REC_PQ_STRIDE 2
+#define REC_PQ_X 0, 1
+#define REC_PQ_Y 2, 3
+#define REC_PQ_T 4, 5
+
+#define SYN_PR_DEFINE() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_4_5() \
+ GEN_X_DEFINE_6_7() \
+ GEN_X_DEFINE_16() \
+ GEN_X_DEFINE_17() \
+ GEN_X_DEFINE_33_36()
+#define SYN_PR_STRIDE 4
+#define SYN_PR_D 0, 1, 2, 3
+#define SYN_PR_X 4, 5, 6, 7
+
+#define REC_PR_DEFINE() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_4_5() \
+ GEN_X_DEFINE_31() \
+ GEN_X_DEFINE_32() \
+ GEN_X_DEFINE_33_36()
+#define REC_PR_STRIDE 2
+#define REC_PR_X 0, 1
+#define REC_PR_Y 2, 3
+#define REC_PR_T 4, 5
+
+#define SYN_QR_DEFINE() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_4_5() \
+ GEN_X_DEFINE_6_7() \
+ GEN_X_DEFINE_16() \
+ GEN_X_DEFINE_17() \
+ GEN_X_DEFINE_33_36()
+#define SYN_QR_STRIDE 4
+#define SYN_QR_D 0, 1, 2, 3
+#define SYN_QR_X 4, 5, 6, 7
+
+#define REC_QR_DEFINE() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_4_5() \
+ GEN_X_DEFINE_31() \
+ GEN_X_DEFINE_32() \
+ GEN_X_DEFINE_33_36()
+#define REC_QR_STRIDE 2
+#define REC_QR_X 0, 1
+#define REC_QR_Y 2, 3
+#define REC_QR_T 4, 5
+
+#define SYN_PQR_DEFINE() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_4_5() \
+ GEN_X_DEFINE_6_7() \
+ GEN_X_DEFINE_16() \
+ GEN_X_DEFINE_17() \
+ GEN_X_DEFINE_33_36()
+#define SYN_PQR_STRIDE 4
+#define SYN_PQR_D 0, 1, 2, 3
+#define SYN_PQR_X 4, 5, 6, 7
+
+#define REC_PQR_DEFINE() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_4_5() \
+ GEN_X_DEFINE_6_7() \
+ GEN_X_DEFINE_8_9() \
+ GEN_X_DEFINE_31() \
+ GEN_X_DEFINE_32() \
+ GEN_X_DEFINE_33_36()
+#define REC_PQR_STRIDE 2
+#define REC_PQR_X 0, 1
+#define REC_PQR_Y 2, 3
+#define REC_PQR_Z 4, 5
+#define REC_PQR_XS 6, 7
+#define REC_PQR_YS 8, 9
+
+
+#include <sys/vdev_raidz_impl.h>
+#include "vdev_raidz_math_impl.h"
+
+DEFINE_GEN_METHODS(aarch64_neon);
+DEFINE_REC_METHODS(aarch64_neon);
+
+static boolean_t
+raidz_will_aarch64_neon_work(void)
+{
+ return (kfpu_allowed());
+}
+
+const raidz_impl_ops_t vdev_raidz_aarch64_neon_impl = {
+ .init = NULL,
+ .fini = NULL,
+ .gen = RAIDZ_GEN_METHODS(aarch64_neon),
+ .rec = RAIDZ_REC_METHODS(aarch64_neon),
+ .is_supported = &raidz_will_aarch64_neon_work,
+ .name = "aarch64_neon"
+};
+
+#endif /* defined(__aarch64__) */
+
+
+#if defined(__aarch64__)
+/* BEGIN CSTYLED */
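+/*
+ * Lookup tables for GF(2^8) multiplication by a constant, stored as four
+ * 16-byte rows per multiplier value; presumably consumed by the NEON MUL
+ * routines via vector table lookups (see
+ * vdev_raidz_math_aarch64_neon_common.h).
+ */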
+const uint8_t
+__attribute__((aligned(256))) gf_clmul_mod_lt[4*256][16] = {
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x02, 0x04, 0x06, 0x08, 0x0a, 0x0c, 0x0e,
+ 0x10, 0x12, 0x14, 0x16, 0x18, 0x1a, 0x1c, 0x1e },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+ 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x03, 0x06, 0x05, 0x0c, 0x0f, 0x0a, 0x09,
+ 0x18, 0x1b, 0x1e, 0x1d, 0x14, 0x17, 0x12, 0x11 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x04, 0x08, 0x0c, 0x10, 0x14, 0x18, 0x1c,
+ 0x20, 0x24, 0x28, 0x2c, 0x30, 0x34, 0x38, 0x3c },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+ 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x05, 0x0a, 0x0f, 0x14, 0x11, 0x1e, 0x1b,
+ 0x28, 0x2d, 0x22, 0x27, 0x3c, 0x39, 0x36, 0x33 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x06, 0x0c, 0x0a, 0x18, 0x1e, 0x14, 0x12,
+ 0x30, 0x36, 0x3c, 0x3a, 0x28, 0x2e, 0x24, 0x22 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+ 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x07, 0x0e, 0x09, 0x1c, 0x1b, 0x12, 0x15,
+ 0x38, 0x3f, 0x36, 0x31, 0x24, 0x23, 0x2a, 0x2d },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x08, 0x10, 0x18, 0x20, 0x28, 0x30, 0x38,
+ 0x40, 0x48, 0x50, 0x58, 0x60, 0x68, 0x70, 0x78 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+ 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x09, 0x12, 0x1b, 0x24, 0x2d, 0x36, 0x3f,
+ 0x48, 0x41, 0x5a, 0x53, 0x6c, 0x65, 0x7e, 0x77 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x0a, 0x14, 0x1e, 0x28, 0x22, 0x3c, 0x36,
+ 0x50, 0x5a, 0x44, 0x4e, 0x78, 0x72, 0x6c, 0x66 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+ 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x0b, 0x16, 0x1d, 0x2c, 0x27, 0x3a, 0x31,
+ 0x58, 0x53, 0x4e, 0x45, 0x74, 0x7f, 0x62, 0x69 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x0c, 0x18, 0x14, 0x30, 0x3c, 0x28, 0x24,
+ 0x60, 0x6c, 0x78, 0x74, 0x50, 0x5c, 0x48, 0x44 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+ 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x0d, 0x1a, 0x17, 0x34, 0x39, 0x2e, 0x23,
+ 0x68, 0x65, 0x72, 0x7f, 0x5c, 0x51, 0x46, 0x4b },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x0e, 0x1c, 0x12, 0x38, 0x36, 0x24, 0x2a,
+ 0x70, 0x7e, 0x6c, 0x62, 0x48, 0x46, 0x54, 0x5a },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+ 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x0f, 0x1e, 0x11, 0x3c, 0x33, 0x22, 0x2d,
+ 0x78, 0x77, 0x66, 0x69, 0x44, 0x4b, 0x5a, 0x55 },
+ { 0x00, 0x1d, 0x3a, 0x27, 0x74, 0x69, 0x4e, 0x53,
+ 0xe8, 0xf5, 0xd2, 0xcf, 0x9c, 0x81, 0xa6, 0xbb },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ { 0x00, 0x1d, 0x3a, 0x27, 0x74, 0x69, 0x4e, 0x53,
+ 0xe8, 0xf5, 0xd2, 0xcf, 0x9c, 0x81, 0xa6, 0xbb },
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77,
+ 0x88, 0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff },
+ { 0x00, 0x1d, 0x3a, 0x27, 0x74, 0x69, 0x4e, 0x53,
+ 0xf5, 0xe8, 0xcf, 0xd2, 0x81, 0x9c, 0xbb, 0xa6 },
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x12, 0x24, 0x36, 0x48, 0x5a, 0x6c, 0x7e,
+ 0x90, 0x82, 0xb4, 0xa6, 0xd8, 0xca, 0xfc, 0xee },
+ { 0x00, 0x1d, 0x3a, 0x27, 0x74, 0x69, 0x4e, 0x53,
+ 0xf5, 0xe8, 0xcf, 0xd2, 0x81, 0x9c, 0xbb, 0xa6 },
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+ 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x13, 0x26, 0x35, 0x4c, 0x5f, 0x6a, 0x79,
+ 0x98, 0x8b, 0xbe, 0xad, 0xd4, 0xc7, 0xf2, 0xe1 },
+ { 0x00, 0x1d, 0x3a, 0x27, 0x69, 0x74, 0x53, 0x4e,
+ 0xd2, 0xcf, 0xe8, 0xf5, 0xbb, 0xa6, 0x81, 0x9c },
+ { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x14, 0x28, 0x3c, 0x50, 0x44, 0x78, 0x6c,
+ 0xa0, 0xb4, 0x88, 0x9c, 0xf0, 0xe4, 0xd8, 0xcc },
+ { 0x00, 0x1d, 0x3a, 0x27, 0x69, 0x74, 0x53, 0x4e,
+ 0xd2, 0xcf, 0xe8, 0xf5, 0xbb, 0xa6, 0x81, 0x9c },
+ { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+ 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x15, 0x2a, 0x3f, 0x54, 0x41, 0x7e, 0x6b,
+ 0xa8, 0xbd, 0x82, 0x97, 0xfc, 0xe9, 0xd6, 0xc3 },
+ { 0x00, 0x1d, 0x3a, 0x27, 0x69, 0x74, 0x53, 0x4e,
+ 0xcf, 0xd2, 0xf5, 0xe8, 0xa6, 0xbb, 0x9c, 0x81 },
+ { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x16, 0x2c, 0x3a, 0x58, 0x4e, 0x74, 0x62,
+ 0xb0, 0xa6, 0x9c, 0x8a, 0xe8, 0xfe, 0xc4, 0xd2 },
+ { 0x00, 0x1d, 0x3a, 0x27, 0x69, 0x74, 0x53, 0x4e,
+ 0xcf, 0xd2, 0xf5, 0xe8, 0xa6, 0xbb, 0x9c, 0x81 },
+ { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+ 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x17, 0x2e, 0x39, 0x5c, 0x4b, 0x72, 0x65,
+ 0xb8, 0xaf, 0x96, 0x81, 0xe4, 0xf3, 0xca, 0xdd },
+ { 0x00, 0x1d, 0x27, 0x3a, 0x4e, 0x53, 0x69, 0x74,
+ 0x9c, 0x81, 0xbb, 0xa6, 0xd2, 0xcf, 0xf5, 0xe8 },
+ { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x18, 0x30, 0x28, 0x60, 0x78, 0x50, 0x48,
+ 0xc0, 0xd8, 0xf0, 0xe8, 0xa0, 0xb8, 0x90, 0x88 },
+ { 0x00, 0x1d, 0x27, 0x3a, 0x4e, 0x53, 0x69, 0x74,
+ 0x9c, 0x81, 0xbb, 0xa6, 0xd2, 0xcf, 0xf5, 0xe8 },
+ { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+ 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x19, 0x32, 0x2b, 0x64, 0x7d, 0x56, 0x4f,
+ 0xc8, 0xd1, 0xfa, 0xe3, 0xac, 0xb5, 0x9e, 0x87 },
+ { 0x00, 0x1d, 0x27, 0x3a, 0x4e, 0x53, 0x69, 0x74,
+ 0x81, 0x9c, 0xa6, 0xbb, 0xcf, 0xd2, 0xe8, 0xf5 },
+ { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x1a, 0x34, 0x2e, 0x68, 0x72, 0x5c, 0x46,
+ 0xd0, 0xca, 0xe4, 0xfe, 0xb8, 0xa2, 0x8c, 0x96 },
+ { 0x00, 0x1d, 0x27, 0x3a, 0x4e, 0x53, 0x69, 0x74,
+ 0x81, 0x9c, 0xa6, 0xbb, 0xcf, 0xd2, 0xe8, 0xf5 },
+ { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+ 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x1b, 0x36, 0x2d, 0x6c, 0x77, 0x5a, 0x41,
+ 0xd8, 0xc3, 0xee, 0xf5, 0xb4, 0xaf, 0x82, 0x99 },
+ { 0x00, 0x1d, 0x27, 0x3a, 0x53, 0x4e, 0x74, 0x69,
+ 0xa6, 0xbb, 0x81, 0x9c, 0xf5, 0xe8, 0xd2, 0xcf },
+ { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x1c, 0x38, 0x24, 0x70, 0x6c, 0x48, 0x54,
+ 0xe0, 0xfc, 0xd8, 0xc4, 0x90, 0x8c, 0xa8, 0xb4 },
+ { 0x00, 0x1d, 0x27, 0x3a, 0x53, 0x4e, 0x74, 0x69,
+ 0xa6, 0xbb, 0x81, 0x9c, 0xf5, 0xe8, 0xd2, 0xcf },
+ { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+ 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x1d, 0x3a, 0x27, 0x74, 0x69, 0x4e, 0x53,
+ 0xe8, 0xf5, 0xd2, 0xcf, 0x9c, 0x81, 0xa6, 0xbb },
+ { 0x00, 0x1d, 0x27, 0x3a, 0x53, 0x4e, 0x74, 0x69,
+ 0xbb, 0xa6, 0x9c, 0x81, 0xe8, 0xf5, 0xcf, 0xd2 },
+ { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x1e, 0x3c, 0x22, 0x78, 0x66, 0x44, 0x5a,
+ 0xf0, 0xee, 0xcc, 0xd2, 0x88, 0x96, 0xb4, 0xaa },
+ { 0x00, 0x1d, 0x27, 0x3a, 0x53, 0x4e, 0x74, 0x69,
+ 0xbb, 0xa6, 0x9c, 0x81, 0xe8, 0xf5, 0xcf, 0xd2 },
+ { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+ 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x1f, 0x3e, 0x21, 0x7c, 0x63, 0x42, 0x5d,
+ 0xf8, 0xe7, 0xc6, 0xd9, 0x84, 0x9b, 0xba, 0xa5 },
+ { 0x00, 0x3a, 0x74, 0x4e, 0xe8, 0xd2, 0x9c, 0xa6,
+ 0xcd, 0xf7, 0xb9, 0x83, 0x25, 0x1f, 0x51, 0x6b },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+ { 0x00, 0x3a, 0x74, 0x4e, 0xe8, 0xd2, 0x9c, 0xa6,
+ 0xcd, 0xf7, 0xb9, 0x83, 0x25, 0x1f, 0x51, 0x6b },
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x21, 0x42, 0x63, 0x84, 0xa5, 0xc6, 0xe7,
+ 0x08, 0x29, 0x4a, 0x6b, 0x8c, 0xad, 0xce, 0xef },
+ { 0x00, 0x3a, 0x74, 0x4e, 0xe8, 0xd2, 0x9c, 0xa6,
+ 0xd0, 0xea, 0xa4, 0x9e, 0x38, 0x02, 0x4c, 0x76 },
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x22, 0x44, 0x66, 0x88, 0xaa, 0xcc, 0xee,
+ 0x10, 0x32, 0x54, 0x76, 0x98, 0xba, 0xdc, 0xfe },
+ { 0x00, 0x3a, 0x74, 0x4e, 0xe8, 0xd2, 0x9c, 0xa6,
+ 0xd0, 0xea, 0xa4, 0x9e, 0x38, 0x02, 0x4c, 0x76 },
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+ 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x23, 0x46, 0x65, 0x8c, 0xaf, 0xca, 0xe9,
+ 0x18, 0x3b, 0x5e, 0x7d, 0x94, 0xb7, 0xd2, 0xf1 },
+ { 0x00, 0x3a, 0x74, 0x4e, 0xf5, 0xcf, 0x81, 0xbb,
+ 0xf7, 0xcd, 0x83, 0xb9, 0x02, 0x38, 0x76, 0x4c },
+ { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x24, 0x48, 0x6c, 0x90, 0xb4, 0xd8, 0xfc,
+ 0x20, 0x04, 0x68, 0x4c, 0xb0, 0x94, 0xf8, 0xdc },
+ { 0x00, 0x3a, 0x74, 0x4e, 0xf5, 0xcf, 0x81, 0xbb,
+ 0xf7, 0xcd, 0x83, 0xb9, 0x02, 0x38, 0x76, 0x4c },
+ { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+ 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x25, 0x4a, 0x6f, 0x94, 0xb1, 0xde, 0xfb,
+ 0x28, 0x0d, 0x62, 0x47, 0xbc, 0x99, 0xf6, 0xd3 },
+ { 0x00, 0x3a, 0x74, 0x4e, 0xf5, 0xcf, 0x81, 0xbb,
+ 0xea, 0xd0, 0x9e, 0xa4, 0x1f, 0x25, 0x6b, 0x51 },
+ { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x26, 0x4c, 0x6a, 0x98, 0xbe, 0xd4, 0xf2,
+ 0x30, 0x16, 0x7c, 0x5a, 0xa8, 0x8e, 0xe4, 0xc2 },
+ { 0x00, 0x3a, 0x74, 0x4e, 0xf5, 0xcf, 0x81, 0xbb,
+ 0xea, 0xd0, 0x9e, 0xa4, 0x1f, 0x25, 0x6b, 0x51 },
+ { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+ 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x27, 0x4e, 0x69, 0x9c, 0xbb, 0xd2, 0xf5,
+ 0x38, 0x1f, 0x76, 0x51, 0xa4, 0x83, 0xea, 0xcd },
+ { 0x00, 0x3a, 0x69, 0x53, 0xd2, 0xe8, 0xbb, 0x81,
+ 0xb9, 0x83, 0xd0, 0xea, 0x6b, 0x51, 0x02, 0x38 },
+ { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x28, 0x50, 0x78, 0xa0, 0x88, 0xf0, 0xd8,
+ 0x40, 0x68, 0x10, 0x38, 0xe0, 0xc8, 0xb0, 0x98 },
+ { 0x00, 0x3a, 0x69, 0x53, 0xd2, 0xe8, 0xbb, 0x81,
+ 0xb9, 0x83, 0xd0, 0xea, 0x6b, 0x51, 0x02, 0x38 },
+ { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+ 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x29, 0x52, 0x7b, 0xa4, 0x8d, 0xf6, 0xdf,
+ 0x48, 0x61, 0x1a, 0x33, 0xec, 0xc5, 0xbe, 0x97 },
+ { 0x00, 0x3a, 0x69, 0x53, 0xd2, 0xe8, 0xbb, 0x81,
+ 0xa4, 0x9e, 0xcd, 0xf7, 0x76, 0x4c, 0x1f, 0x25 },
+ { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x2a, 0x54, 0x7e, 0xa8, 0x82, 0xfc, 0xd6,
+ 0x50, 0x7a, 0x04, 0x2e, 0xf8, 0xd2, 0xac, 0x86 },
+ { 0x00, 0x3a, 0x69, 0x53, 0xd2, 0xe8, 0xbb, 0x81,
+ 0xa4, 0x9e, 0xcd, 0xf7, 0x76, 0x4c, 0x1f, 0x25 },
+ { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+ 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x2b, 0x56, 0x7d, 0xac, 0x87, 0xfa, 0xd1,
+ 0x58, 0x73, 0x0e, 0x25, 0xf4, 0xdf, 0xa2, 0x89 },
+ { 0x00, 0x3a, 0x69, 0x53, 0xcf, 0xf5, 0xa6, 0x9c,
+ 0x83, 0xb9, 0xea, 0xd0, 0x4c, 0x76, 0x25, 0x1f },
+ { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x2c, 0x58, 0x74, 0xb0, 0x9c, 0xe8, 0xc4,
+ 0x60, 0x4c, 0x38, 0x14, 0xd0, 0xfc, 0x88, 0xa4 },
+ { 0x00, 0x3a, 0x69, 0x53, 0xcf, 0xf5, 0xa6, 0x9c,
+ 0x83, 0xb9, 0xea, 0xd0, 0x4c, 0x76, 0x25, 0x1f },
+ { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+ 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x2d, 0x5a, 0x77, 0xb4, 0x99, 0xee, 0xc3,
+ 0x68, 0x45, 0x32, 0x1f, 0xdc, 0xf1, 0x86, 0xab },
+ { 0x00, 0x3a, 0x69, 0x53, 0xcf, 0xf5, 0xa6, 0x9c,
+ 0x9e, 0xa4, 0xf7, 0xcd, 0x51, 0x6b, 0x38, 0x02 },
+ { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x2e, 0x5c, 0x72, 0xb8, 0x96, 0xe4, 0xca,
+ 0x70, 0x5e, 0x2c, 0x02, 0xc8, 0xe6, 0x94, 0xba },
+ { 0x00, 0x3a, 0x69, 0x53, 0xcf, 0xf5, 0xa6, 0x9c,
+ 0x9e, 0xa4, 0xf7, 0xcd, 0x51, 0x6b, 0x38, 0x02 },
+ { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+ 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x2f, 0x5e, 0x71, 0xbc, 0x93, 0xe2, 0xcd,
+ 0x78, 0x57, 0x26, 0x09, 0xc4, 0xeb, 0x9a, 0xb5 },
+ { 0x00, 0x27, 0x4e, 0x69, 0x9c, 0xbb, 0xd2, 0xf5,
+ 0x25, 0x02, 0x6b, 0x4c, 0xb9, 0x9e, 0xf7, 0xd0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+ 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+ { 0x00, 0x27, 0x4e, 0x69, 0x9c, 0xbb, 0xd2, 0xf5,
+ 0x25, 0x02, 0x6b, 0x4c, 0xb9, 0x9e, 0xf7, 0xd0 },
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x31, 0x62, 0x53, 0xc4, 0xf5, 0xa6, 0x97,
+ 0x88, 0xb9, 0xea, 0xdb, 0x4c, 0x7d, 0x2e, 0x1f },
+ { 0x00, 0x27, 0x4e, 0x69, 0x9c, 0xbb, 0xd2, 0xf5,
+ 0x38, 0x1f, 0x76, 0x51, 0xa4, 0x83, 0xea, 0xcd },
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x32, 0x64, 0x56, 0xc8, 0xfa, 0xac, 0x9e,
+ 0x90, 0xa2, 0xf4, 0xc6, 0x58, 0x6a, 0x3c, 0x0e },
+ { 0x00, 0x27, 0x4e, 0x69, 0x9c, 0xbb, 0xd2, 0xf5,
+ 0x38, 0x1f, 0x76, 0x51, 0xa4, 0x83, 0xea, 0xcd },
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+ 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x33, 0x66, 0x55, 0xcc, 0xff, 0xaa, 0x99,
+ 0x98, 0xab, 0xfe, 0xcd, 0x54, 0x67, 0x32, 0x01 },
+ { 0x00, 0x27, 0x4e, 0x69, 0x81, 0xa6, 0xcf, 0xe8,
+ 0x1f, 0x38, 0x51, 0x76, 0x9e, 0xb9, 0xd0, 0xf7 },
+ { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x34, 0x68, 0x5c, 0xd0, 0xe4, 0xb8, 0x8c,
+ 0xa0, 0x94, 0xc8, 0xfc, 0x70, 0x44, 0x18, 0x2c },
+ { 0x00, 0x27, 0x4e, 0x69, 0x81, 0xa6, 0xcf, 0xe8,
+ 0x1f, 0x38, 0x51, 0x76, 0x9e, 0xb9, 0xd0, 0xf7 },
+ { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+ 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x35, 0x6a, 0x5f, 0xd4, 0xe1, 0xbe, 0x8b,
+ 0xa8, 0x9d, 0xc2, 0xf7, 0x7c, 0x49, 0x16, 0x23 },
+ { 0x00, 0x27, 0x4e, 0x69, 0x81, 0xa6, 0xcf, 0xe8,
+ 0x02, 0x25, 0x4c, 0x6b, 0x83, 0xa4, 0xcd, 0xea },
+ { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x36, 0x6c, 0x5a, 0xd8, 0xee, 0xb4, 0x82,
+ 0xb0, 0x86, 0xdc, 0xea, 0x68, 0x5e, 0x04, 0x32 },
+ { 0x00, 0x27, 0x4e, 0x69, 0x81, 0xa6, 0xcf, 0xe8,
+ 0x02, 0x25, 0x4c, 0x6b, 0x83, 0xa4, 0xcd, 0xea },
+ { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+ 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x37, 0x6e, 0x59, 0xdc, 0xeb, 0xb2, 0x85,
+ 0xb8, 0x8f, 0xd6, 0xe1, 0x64, 0x53, 0x0a, 0x3d },
+ { 0x00, 0x27, 0x53, 0x74, 0xa6, 0x81, 0xf5, 0xd2,
+ 0x51, 0x76, 0x02, 0x25, 0xf7, 0xd0, 0xa4, 0x83 },
+ { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x38, 0x70, 0x48, 0xe0, 0xd8, 0x90, 0xa8,
+ 0xc0, 0xf8, 0xb0, 0x88, 0x20, 0x18, 0x50, 0x68 },
+ { 0x00, 0x27, 0x53, 0x74, 0xa6, 0x81, 0xf5, 0xd2,
+ 0x51, 0x76, 0x02, 0x25, 0xf7, 0xd0, 0xa4, 0x83 },
+ { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+ 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x39, 0x72, 0x4b, 0xe4, 0xdd, 0x96, 0xaf,
+ 0xc8, 0xf1, 0xba, 0x83, 0x2c, 0x15, 0x5e, 0x67 },
+ { 0x00, 0x27, 0x53, 0x74, 0xa6, 0x81, 0xf5, 0xd2,
+ 0x4c, 0x6b, 0x1f, 0x38, 0xea, 0xcd, 0xb9, 0x9e },
+ { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x3a, 0x74, 0x4e, 0xe8, 0xd2, 0x9c, 0xa6,
+ 0xd0, 0xea, 0xa4, 0x9e, 0x38, 0x02, 0x4c, 0x76 },
+ { 0x00, 0x27, 0x53, 0x74, 0xa6, 0x81, 0xf5, 0xd2,
+ 0x4c, 0x6b, 0x1f, 0x38, 0xea, 0xcd, 0xb9, 0x9e },
+ { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+ 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x3b, 0x76, 0x4d, 0xec, 0xd7, 0x9a, 0xa1,
+ 0xd8, 0xe3, 0xae, 0x95, 0x34, 0x0f, 0x42, 0x79 },
+ { 0x00, 0x27, 0x53, 0x74, 0xbb, 0x9c, 0xe8, 0xcf,
+ 0x6b, 0x4c, 0x38, 0x1f, 0xd0, 0xf7, 0x83, 0xa4 },
+ { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x3c, 0x78, 0x44, 0xf0, 0xcc, 0x88, 0xb4,
+ 0xe0, 0xdc, 0x98, 0xa4, 0x10, 0x2c, 0x68, 0x54 },
+ { 0x00, 0x27, 0x53, 0x74, 0xbb, 0x9c, 0xe8, 0xcf,
+ 0x6b, 0x4c, 0x38, 0x1f, 0xd0, 0xf7, 0x83, 0xa4 },
+ { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+ 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x3d, 0x7a, 0x47, 0xf4, 0xc9, 0x8e, 0xb3,
+ 0xe8, 0xd5, 0x92, 0xaf, 0x1c, 0x21, 0x66, 0x5b },
+ { 0x00, 0x27, 0x53, 0x74, 0xbb, 0x9c, 0xe8, 0xcf,
+ 0x76, 0x51, 0x25, 0x02, 0xcd, 0xea, 0x9e, 0xb9 },
+ { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x3e, 0x7c, 0x42, 0xf8, 0xc6, 0x84, 0xba,
+ 0xf0, 0xce, 0x8c, 0xb2, 0x08, 0x36, 0x74, 0x4a },
+ { 0x00, 0x27, 0x53, 0x74, 0xbb, 0x9c, 0xe8, 0xcf,
+ 0x76, 0x51, 0x25, 0x02, 0xcd, 0xea, 0x9e, 0xb9 },
+ { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+ 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x3f, 0x7e, 0x41, 0xfc, 0xc3, 0x82, 0xbd,
+ 0xf8, 0xc7, 0x86, 0xb9, 0x04, 0x3b, 0x7a, 0x45 },
+ { 0x00, 0x74, 0xe8, 0x9c, 0xcd, 0xb9, 0x25, 0x51,
+ 0x87, 0xf3, 0x6f, 0x1b, 0x4a, 0x3e, 0xa2, 0xd6 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+ { 0x00, 0x74, 0xe8, 0x9c, 0xcd, 0xb9, 0x25, 0x51,
+ 0x87, 0xf3, 0x6f, 0x1b, 0x4a, 0x3e, 0xa2, 0xd6 },
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x41, 0x82, 0xc3, 0x04, 0x45, 0x86, 0xc7,
+ 0x08, 0x49, 0x8a, 0xcb, 0x0c, 0x4d, 0x8e, 0xcf },
+ { 0x00, 0x74, 0xe8, 0x9c, 0xcd, 0xb9, 0x25, 0x51,
+ 0x9a, 0xee, 0x72, 0x06, 0x57, 0x23, 0xbf, 0xcb },
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x42, 0x84, 0xc6, 0x08, 0x4a, 0x8c, 0xce,
+ 0x10, 0x52, 0x94, 0xd6, 0x18, 0x5a, 0x9c, 0xde },
+ { 0x00, 0x74, 0xe8, 0x9c, 0xcd, 0xb9, 0x25, 0x51,
+ 0x9a, 0xee, 0x72, 0x06, 0x57, 0x23, 0xbf, 0xcb },
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+ 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x43, 0x86, 0xc5, 0x0c, 0x4f, 0x8a, 0xc9,
+ 0x18, 0x5b, 0x9e, 0xdd, 0x14, 0x57, 0x92, 0xd1 },
+ { 0x00, 0x74, 0xe8, 0x9c, 0xd0, 0xa4, 0x38, 0x4c,
+ 0xbd, 0xc9, 0x55, 0x21, 0x6d, 0x19, 0x85, 0xf1 },
+ { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x44, 0x88, 0xcc, 0x10, 0x54, 0x98, 0xdc,
+ 0x20, 0x64, 0xa8, 0xec, 0x30, 0x74, 0xb8, 0xfc },
+ { 0x00, 0x74, 0xe8, 0x9c, 0xd0, 0xa4, 0x38, 0x4c,
+ 0xbd, 0xc9, 0x55, 0x21, 0x6d, 0x19, 0x85, 0xf1 },
+ { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+ 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x45, 0x8a, 0xcf, 0x14, 0x51, 0x9e, 0xdb,
+ 0x28, 0x6d, 0xa2, 0xe7, 0x3c, 0x79, 0xb6, 0xf3 },
+ { 0x00, 0x74, 0xe8, 0x9c, 0xd0, 0xa4, 0x38, 0x4c,
+ 0xa0, 0xd4, 0x48, 0x3c, 0x70, 0x04, 0x98, 0xec },
+ { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x46, 0x8c, 0xca, 0x18, 0x5e, 0x94, 0xd2,
+ 0x30, 0x76, 0xbc, 0xfa, 0x28, 0x6e, 0xa4, 0xe2 },
+ { 0x00, 0x74, 0xe8, 0x9c, 0xd0, 0xa4, 0x38, 0x4c,
+ 0xa0, 0xd4, 0x48, 0x3c, 0x70, 0x04, 0x98, 0xec },
+ { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+ 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x47, 0x8e, 0xc9, 0x1c, 0x5b, 0x92, 0xd5,
+ 0x38, 0x7f, 0xb6, 0xf1, 0x24, 0x63, 0xaa, 0xed },
+ { 0x00, 0x74, 0xf5, 0x81, 0xf7, 0x83, 0x02, 0x76,
+ 0xf3, 0x87, 0x06, 0x72, 0x04, 0x70, 0xf1, 0x85 },
+ { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x48, 0x90, 0xd8, 0x20, 0x68, 0xb0, 0xf8,
+ 0x40, 0x08, 0xd0, 0x98, 0x60, 0x28, 0xf0, 0xb8 },
+ { 0x00, 0x74, 0xf5, 0x81, 0xf7, 0x83, 0x02, 0x76,
+ 0xf3, 0x87, 0x06, 0x72, 0x04, 0x70, 0xf1, 0x85 },
+ { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+ 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x49, 0x92, 0xdb, 0x24, 0x6d, 0xb6, 0xff,
+ 0x48, 0x01, 0xda, 0x93, 0x6c, 0x25, 0xfe, 0xb7 },
+ { 0x00, 0x74, 0xf5, 0x81, 0xf7, 0x83, 0x02, 0x76,
+ 0xee, 0x9a, 0x1b, 0x6f, 0x19, 0x6d, 0xec, 0x98 },
+ { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x4a, 0x94, 0xde, 0x28, 0x62, 0xbc, 0xf6,
+ 0x50, 0x1a, 0xc4, 0x8e, 0x78, 0x32, 0xec, 0xa6 },
+ { 0x00, 0x74, 0xf5, 0x81, 0xf7, 0x83, 0x02, 0x76,
+ 0xee, 0x9a, 0x1b, 0x6f, 0x19, 0x6d, 0xec, 0x98 },
+ { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+ 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x4b, 0x96, 0xdd, 0x2c, 0x67, 0xba, 0xf1,
+ 0x58, 0x13, 0xce, 0x85, 0x74, 0x3f, 0xe2, 0xa9 },
+ { 0x00, 0x74, 0xf5, 0x81, 0xea, 0x9e, 0x1f, 0x6b,
+ 0xc9, 0xbd, 0x3c, 0x48, 0x23, 0x57, 0xd6, 0xa2 },
+ { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x4c, 0x98, 0xd4, 0x30, 0x7c, 0xa8, 0xe4,
+ 0x60, 0x2c, 0xf8, 0xb4, 0x50, 0x1c, 0xc8, 0x84 },
+ { 0x00, 0x74, 0xf5, 0x81, 0xea, 0x9e, 0x1f, 0x6b,
+ 0xc9, 0xbd, 0x3c, 0x48, 0x23, 0x57, 0xd6, 0xa2 },
+ { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+ 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x4d, 0x9a, 0xd7, 0x34, 0x79, 0xae, 0xe3,
+ 0x68, 0x25, 0xf2, 0xbf, 0x5c, 0x11, 0xc6, 0x8b },
+ { 0x00, 0x74, 0xf5, 0x81, 0xea, 0x9e, 0x1f, 0x6b,
+ 0xd4, 0xa0, 0x21, 0x55, 0x3e, 0x4a, 0xcb, 0xbf },
+ { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x4e, 0x9c, 0xd2, 0x38, 0x76, 0xa4, 0xea,
+ 0x70, 0x3e, 0xec, 0xa2, 0x48, 0x06, 0xd4, 0x9a },
+ { 0x00, 0x74, 0xf5, 0x81, 0xea, 0x9e, 0x1f, 0x6b,
+ 0xd4, 0xa0, 0x21, 0x55, 0x3e, 0x4a, 0xcb, 0xbf },
+ { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+ 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x4f, 0x9e, 0xd1, 0x3c, 0x73, 0xa2, 0xed,
+ 0x78, 0x37, 0xe6, 0xa9, 0x44, 0x0b, 0xda, 0x95 },
+ { 0x00, 0x69, 0xd2, 0xbb, 0xb9, 0xd0, 0x6b, 0x02,
+ 0x6f, 0x06, 0xbd, 0xd4, 0xd6, 0xbf, 0x04, 0x6d },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+ 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+ { 0x00, 0x69, 0xd2, 0xbb, 0xb9, 0xd0, 0x6b, 0x02,
+ 0x6f, 0x06, 0xbd, 0xd4, 0xd6, 0xbf, 0x04, 0x6d },
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x51, 0xa2, 0xf3, 0x44, 0x15, 0xe6, 0xb7,
+ 0x88, 0xd9, 0x2a, 0x7b, 0xcc, 0x9d, 0x6e, 0x3f },
+ { 0x00, 0x69, 0xd2, 0xbb, 0xb9, 0xd0, 0x6b, 0x02,
+ 0x72, 0x1b, 0xa0, 0xc9, 0xcb, 0xa2, 0x19, 0x70 },
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x52, 0xa4, 0xf6, 0x48, 0x1a, 0xec, 0xbe,
+ 0x90, 0xc2, 0x34, 0x66, 0xd8, 0x8a, 0x7c, 0x2e },
+ { 0x00, 0x69, 0xd2, 0xbb, 0xb9, 0xd0, 0x6b, 0x02,
+ 0x72, 0x1b, 0xa0, 0xc9, 0xcb, 0xa2, 0x19, 0x70 },
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+ 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x53, 0xa6, 0xf5, 0x4c, 0x1f, 0xea, 0xb9,
+ 0x98, 0xcb, 0x3e, 0x6d, 0xd4, 0x87, 0x72, 0x21 },
+ { 0x00, 0x69, 0xd2, 0xbb, 0xa4, 0xcd, 0x76, 0x1f,
+ 0x55, 0x3c, 0x87, 0xee, 0xf1, 0x98, 0x23, 0x4a },
+ { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x54, 0xa8, 0xfc, 0x50, 0x04, 0xf8, 0xac,
+ 0xa0, 0xf4, 0x08, 0x5c, 0xf0, 0xa4, 0x58, 0x0c },
+ { 0x00, 0x69, 0xd2, 0xbb, 0xa4, 0xcd, 0x76, 0x1f,
+ 0x55, 0x3c, 0x87, 0xee, 0xf1, 0x98, 0x23, 0x4a },
+ { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+ 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x55, 0xaa, 0xff, 0x54, 0x01, 0xfe, 0xab,
+ 0xa8, 0xfd, 0x02, 0x57, 0xfc, 0xa9, 0x56, 0x03 },
+ { 0x00, 0x69, 0xd2, 0xbb, 0xa4, 0xcd, 0x76, 0x1f,
+ 0x48, 0x21, 0x9a, 0xf3, 0xec, 0x85, 0x3e, 0x57 },
+ { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x56, 0xac, 0xfa, 0x58, 0x0e, 0xf4, 0xa2,
+ 0xb0, 0xe6, 0x1c, 0x4a, 0xe8, 0xbe, 0x44, 0x12 },
+ { 0x00, 0x69, 0xd2, 0xbb, 0xa4, 0xcd, 0x76, 0x1f,
+ 0x48, 0x21, 0x9a, 0xf3, 0xec, 0x85, 0x3e, 0x57 },
+ { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+ 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x57, 0xae, 0xf9, 0x5c, 0x0b, 0xf2, 0xa5,
+ 0xb8, 0xef, 0x16, 0x41, 0xe4, 0xb3, 0x4a, 0x1d },
+ { 0x00, 0x69, 0xcf, 0xa6, 0x83, 0xea, 0x4c, 0x25,
+ 0x1b, 0x72, 0xd4, 0xbd, 0x98, 0xf1, 0x57, 0x3e },
+ { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x58, 0xb0, 0xe8, 0x60, 0x38, 0xd0, 0x88,
+ 0xc0, 0x98, 0x70, 0x28, 0xa0, 0xf8, 0x10, 0x48 },
+ { 0x00, 0x69, 0xcf, 0xa6, 0x83, 0xea, 0x4c, 0x25,
+ 0x1b, 0x72, 0xd4, 0xbd, 0x98, 0xf1, 0x57, 0x3e },
+ { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+ 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x59, 0xb2, 0xeb, 0x64, 0x3d, 0xd6, 0x8f,
+ 0xc8, 0x91, 0x7a, 0x23, 0xac, 0xf5, 0x1e, 0x47 },
+ { 0x00, 0x69, 0xcf, 0xa6, 0x83, 0xea, 0x4c, 0x25,
+ 0x06, 0x6f, 0xc9, 0xa0, 0x85, 0xec, 0x4a, 0x23 },
+ { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x5a, 0xb4, 0xee, 0x68, 0x32, 0xdc, 0x86,
+ 0xd0, 0x8a, 0x64, 0x3e, 0xb8, 0xe2, 0x0c, 0x56 },
+ { 0x00, 0x69, 0xcf, 0xa6, 0x83, 0xea, 0x4c, 0x25,
+ 0x06, 0x6f, 0xc9, 0xa0, 0x85, 0xec, 0x4a, 0x23 },
+ { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+ 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x5b, 0xb6, 0xed, 0x6c, 0x37, 0xda, 0x81,
+ 0xd8, 0x83, 0x6e, 0x35, 0xb4, 0xef, 0x02, 0x59 },
+ { 0x00, 0x69, 0xcf, 0xa6, 0x9e, 0xf7, 0x51, 0x38,
+ 0x21, 0x48, 0xee, 0x87, 0xbf, 0xd6, 0x70, 0x19 },
+ { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x5c, 0xb8, 0xe4, 0x70, 0x2c, 0xc8, 0x94,
+ 0xe0, 0xbc, 0x58, 0x04, 0x90, 0xcc, 0x28, 0x74 },
+ { 0x00, 0x69, 0xcf, 0xa6, 0x9e, 0xf7, 0x51, 0x38,
+ 0x21, 0x48, 0xee, 0x87, 0xbf, 0xd6, 0x70, 0x19 },
+ { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+ 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x5d, 0xba, 0xe7, 0x74, 0x29, 0xce, 0x93,
+ 0xe8, 0xb5, 0x52, 0x0f, 0x9c, 0xc1, 0x26, 0x7b },
+ { 0x00, 0x69, 0xcf, 0xa6, 0x9e, 0xf7, 0x51, 0x38,
+ 0x3c, 0x55, 0xf3, 0x9a, 0xa2, 0xcb, 0x6d, 0x04 },
+ { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x5e, 0xbc, 0xe2, 0x78, 0x26, 0xc4, 0x9a,
+ 0xf0, 0xae, 0x4c, 0x12, 0x88, 0xd6, 0x34, 0x6a },
+ { 0x00, 0x69, 0xcf, 0xa6, 0x9e, 0xf7, 0x51, 0x38,
+ 0x3c, 0x55, 0xf3, 0x9a, 0xa2, 0xcb, 0x6d, 0x04 },
+ { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+ 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x5f, 0xbe, 0xe1, 0x7c, 0x23, 0xc2, 0x9d,
+ 0xf8, 0xa7, 0x46, 0x19, 0x84, 0xdb, 0x3a, 0x65 },
+ { 0x00, 0x4e, 0x9c, 0xd2, 0x25, 0x6b, 0xb9, 0xf7,
+ 0x4a, 0x04, 0xd6, 0x98, 0x6f, 0x21, 0xf3, 0xbd },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+ { 0x00, 0x4e, 0x9c, 0xd2, 0x25, 0x6b, 0xb9, 0xf7,
+ 0x4a, 0x04, 0xd6, 0x98, 0x6f, 0x21, 0xf3, 0xbd },
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x61, 0xc2, 0xa3, 0x84, 0xe5, 0x46, 0x27,
+ 0x08, 0x69, 0xca, 0xab, 0x8c, 0xed, 0x4e, 0x2f },
+ { 0x00, 0x4e, 0x9c, 0xd2, 0x25, 0x6b, 0xb9, 0xf7,
+ 0x57, 0x19, 0xcb, 0x85, 0x72, 0x3c, 0xee, 0xa0 },
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x62, 0xc4, 0xa6, 0x88, 0xea, 0x4c, 0x2e,
+ 0x10, 0x72, 0xd4, 0xb6, 0x98, 0xfa, 0x5c, 0x3e },
+ { 0x00, 0x4e, 0x9c, 0xd2, 0x25, 0x6b, 0xb9, 0xf7,
+ 0x57, 0x19, 0xcb, 0x85, 0x72, 0x3c, 0xee, 0xa0 },
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+ 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x63, 0xc6, 0xa5, 0x8c, 0xef, 0x4a, 0x29,
+ 0x18, 0x7b, 0xde, 0xbd, 0x94, 0xf7, 0x52, 0x31 },
+ { 0x00, 0x4e, 0x9c, 0xd2, 0x38, 0x76, 0xa4, 0xea,
+ 0x70, 0x3e, 0xec, 0xa2, 0x48, 0x06, 0xd4, 0x9a },
+ { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x64, 0xc8, 0xac, 0x90, 0xf4, 0x58, 0x3c,
+ 0x20, 0x44, 0xe8, 0x8c, 0xb0, 0xd4, 0x78, 0x1c },
+ { 0x00, 0x4e, 0x9c, 0xd2, 0x38, 0x76, 0xa4, 0xea,
+ 0x70, 0x3e, 0xec, 0xa2, 0x48, 0x06, 0xd4, 0x9a },
+ { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+ 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x65, 0xca, 0xaf, 0x94, 0xf1, 0x5e, 0x3b,
+ 0x28, 0x4d, 0xe2, 0x87, 0xbc, 0xd9, 0x76, 0x13 },
+ { 0x00, 0x4e, 0x9c, 0xd2, 0x38, 0x76, 0xa4, 0xea,
+ 0x6d, 0x23, 0xf1, 0xbf, 0x55, 0x1b, 0xc9, 0x87 },
+ { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x66, 0xcc, 0xaa, 0x98, 0xfe, 0x54, 0x32,
+ 0x30, 0x56, 0xfc, 0x9a, 0xa8, 0xce, 0x64, 0x02 },
+ { 0x00, 0x4e, 0x9c, 0xd2, 0x38, 0x76, 0xa4, 0xea,
+ 0x6d, 0x23, 0xf1, 0xbf, 0x55, 0x1b, 0xc9, 0x87 },
+ { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+ 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x67, 0xce, 0xa9, 0x9c, 0xfb, 0x52, 0x35,
+ 0x38, 0x5f, 0xf6, 0x91, 0xa4, 0xc3, 0x6a, 0x0d },
+ { 0x00, 0x4e, 0x81, 0xcf, 0x1f, 0x51, 0x9e, 0xd0,
+ 0x3e, 0x70, 0xbf, 0xf1, 0x21, 0x6f, 0xa0, 0xee },
+ { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x68, 0xd0, 0xb8, 0xa0, 0xc8, 0x70, 0x18,
+ 0x40, 0x28, 0x90, 0xf8, 0xe0, 0x88, 0x30, 0x58 },
+ { 0x00, 0x4e, 0x81, 0xcf, 0x1f, 0x51, 0x9e, 0xd0,
+ 0x3e, 0x70, 0xbf, 0xf1, 0x21, 0x6f, 0xa0, 0xee },
+ { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+ 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x69, 0xd2, 0xbb, 0xa4, 0xcd, 0x76, 0x1f,
+ 0x48, 0x21, 0x9a, 0xf3, 0xec, 0x85, 0x3e, 0x57 },
+ { 0x00, 0x4e, 0x81, 0xcf, 0x1f, 0x51, 0x9e, 0xd0,
+ 0x23, 0x6d, 0xa2, 0xec, 0x3c, 0x72, 0xbd, 0xf3 },
+ { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x6a, 0xd4, 0xbe, 0xa8, 0xc2, 0x7c, 0x16,
+ 0x50, 0x3a, 0x84, 0xee, 0xf8, 0x92, 0x2c, 0x46 },
+ { 0x00, 0x4e, 0x81, 0xcf, 0x1f, 0x51, 0x9e, 0xd0,
+ 0x23, 0x6d, 0xa2, 0xec, 0x3c, 0x72, 0xbd, 0xf3 },
+ { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+ 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x6b, 0xd6, 0xbd, 0xac, 0xc7, 0x7a, 0x11,
+ 0x58, 0x33, 0x8e, 0xe5, 0xf4, 0x9f, 0x22, 0x49 },
+ { 0x00, 0x4e, 0x81, 0xcf, 0x02, 0x4c, 0x83, 0xcd,
+ 0x04, 0x4a, 0x85, 0xcb, 0x06, 0x48, 0x87, 0xc9 },
+ { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x6c, 0xd8, 0xb4, 0xb0, 0xdc, 0x68, 0x04,
+ 0x60, 0x0c, 0xb8, 0xd4, 0xd0, 0xbc, 0x08, 0x64 },
+ { 0x00, 0x4e, 0x81, 0xcf, 0x02, 0x4c, 0x83, 0xcd,
+ 0x04, 0x4a, 0x85, 0xcb, 0x06, 0x48, 0x87, 0xc9 },
+ { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+ 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x6d, 0xda, 0xb7, 0xb4, 0xd9, 0x6e, 0x03,
+ 0x68, 0x05, 0xb2, 0xdf, 0xdc, 0xb1, 0x06, 0x6b },
+ { 0x00, 0x4e, 0x81, 0xcf, 0x02, 0x4c, 0x83, 0xcd,
+ 0x19, 0x57, 0x98, 0xd6, 0x1b, 0x55, 0x9a, 0xd4 },
+ { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x6e, 0xdc, 0xb2, 0xb8, 0xd6, 0x64, 0x0a,
+ 0x70, 0x1e, 0xac, 0xc2, 0xc8, 0xa6, 0x14, 0x7a },
+ { 0x00, 0x4e, 0x81, 0xcf, 0x02, 0x4c, 0x83, 0xcd,
+ 0x19, 0x57, 0x98, 0xd6, 0x1b, 0x55, 0x9a, 0xd4 },
+ { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+ 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x6f, 0xde, 0xb1, 0xbc, 0xd3, 0x62, 0x0d,
+ 0x78, 0x17, 0xa6, 0xc9, 0xc4, 0xab, 0x1a, 0x75 },
+ { 0x00, 0x53, 0xa6, 0xf5, 0x51, 0x02, 0xf7, 0xa4,
+ 0xa2, 0xf1, 0x04, 0x57, 0xf3, 0xa0, 0x55, 0x06 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+ 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+ { 0x00, 0x53, 0xa6, 0xf5, 0x51, 0x02, 0xf7, 0xa4,
+ 0xa2, 0xf1, 0x04, 0x57, 0xf3, 0xa0, 0x55, 0x06 },
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x71, 0xe2, 0x93, 0xc4, 0xb5, 0x26, 0x57,
+ 0x88, 0xf9, 0x6a, 0x1b, 0x4c, 0x3d, 0xae, 0xdf },
+ { 0x00, 0x53, 0xa6, 0xf5, 0x51, 0x02, 0xf7, 0xa4,
+ 0xbf, 0xec, 0x19, 0x4a, 0xee, 0xbd, 0x48, 0x1b },
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x72, 0xe4, 0x96, 0xc8, 0xba, 0x2c, 0x5e,
+ 0x90, 0xe2, 0x74, 0x06, 0x58, 0x2a, 0xbc, 0xce },
+ { 0x00, 0x53, 0xa6, 0xf5, 0x51, 0x02, 0xf7, 0xa4,
+ 0xbf, 0xec, 0x19, 0x4a, 0xee, 0xbd, 0x48, 0x1b },
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+ 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x73, 0xe6, 0x95, 0xcc, 0xbf, 0x2a, 0x59,
+ 0x98, 0xeb, 0x7e, 0x0d, 0x54, 0x27, 0xb2, 0xc1 },
+ { 0x00, 0x53, 0xa6, 0xf5, 0x4c, 0x1f, 0xea, 0xb9,
+ 0x98, 0xcb, 0x3e, 0x6d, 0xd4, 0x87, 0x72, 0x21 },
+ { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x74, 0xe8, 0x9c, 0xd0, 0xa4, 0x38, 0x4c,
+ 0xa0, 0xd4, 0x48, 0x3c, 0x70, 0x04, 0x98, 0xec },
+ { 0x00, 0x53, 0xa6, 0xf5, 0x4c, 0x1f, 0xea, 0xb9,
+ 0x98, 0xcb, 0x3e, 0x6d, 0xd4, 0x87, 0x72, 0x21 },
+ { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+ 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x75, 0xea, 0x9f, 0xd4, 0xa1, 0x3e, 0x4b,
+ 0xa8, 0xdd, 0x42, 0x37, 0x7c, 0x09, 0x96, 0xe3 },
+ { 0x00, 0x53, 0xa6, 0xf5, 0x4c, 0x1f, 0xea, 0xb9,
+ 0x85, 0xd6, 0x23, 0x70, 0xc9, 0x9a, 0x6f, 0x3c },
+ { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x76, 0xec, 0x9a, 0xd8, 0xae, 0x34, 0x42,
+ 0xb0, 0xc6, 0x5c, 0x2a, 0x68, 0x1e, 0x84, 0xf2 },
+ { 0x00, 0x53, 0xa6, 0xf5, 0x4c, 0x1f, 0xea, 0xb9,
+ 0x85, 0xd6, 0x23, 0x70, 0xc9, 0x9a, 0x6f, 0x3c },
+ { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+ 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x77, 0xee, 0x99, 0xdc, 0xab, 0x32, 0x45,
+ 0xb8, 0xcf, 0x56, 0x21, 0x64, 0x13, 0x8a, 0xfd },
+ { 0x00, 0x53, 0xbb, 0xe8, 0x6b, 0x38, 0xd0, 0x83,
+ 0xd6, 0x85, 0x6d, 0x3e, 0xbd, 0xee, 0x06, 0x55 },
+ { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x78, 0xf0, 0x88, 0xe0, 0x98, 0x10, 0x68,
+ 0xc0, 0xb8, 0x30, 0x48, 0x20, 0x58, 0xd0, 0xa8 },
+ { 0x00, 0x53, 0xbb, 0xe8, 0x6b, 0x38, 0xd0, 0x83,
+ 0xd6, 0x85, 0x6d, 0x3e, 0xbd, 0xee, 0x06, 0x55 },
+ { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+ 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x79, 0xf2, 0x8b, 0xe4, 0x9d, 0x16, 0x6f,
+ 0xc8, 0xb1, 0x3a, 0x43, 0x2c, 0x55, 0xde, 0xa7 },
+ { 0x00, 0x53, 0xbb, 0xe8, 0x6b, 0x38, 0xd0, 0x83,
+ 0xcb, 0x98, 0x70, 0x23, 0xa0, 0xf3, 0x1b, 0x48 },
+ { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x7a, 0xf4, 0x8e, 0xe8, 0x92, 0x1c, 0x66,
+ 0xd0, 0xaa, 0x24, 0x5e, 0x38, 0x42, 0xcc, 0xb6 },
+ { 0x00, 0x53, 0xbb, 0xe8, 0x6b, 0x38, 0xd0, 0x83,
+ 0xcb, 0x98, 0x70, 0x23, 0xa0, 0xf3, 0x1b, 0x48 },
+ { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+ 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x7b, 0xf6, 0x8d, 0xec, 0x97, 0x1a, 0x61,
+ 0xd8, 0xa3, 0x2e, 0x55, 0x34, 0x4f, 0xc2, 0xb9 },
+ { 0x00, 0x53, 0xbb, 0xe8, 0x76, 0x25, 0xcd, 0x9e,
+ 0xec, 0xbf, 0x57, 0x04, 0x9a, 0xc9, 0x21, 0x72 },
+ { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x7c, 0xf8, 0x84, 0xf0, 0x8c, 0x08, 0x74,
+ 0xe0, 0x9c, 0x18, 0x64, 0x10, 0x6c, 0xe8, 0x94 },
+ { 0x00, 0x53, 0xbb, 0xe8, 0x76, 0x25, 0xcd, 0x9e,
+ 0xec, 0xbf, 0x57, 0x04, 0x9a, 0xc9, 0x21, 0x72 },
+ { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+ 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x7d, 0xfa, 0x87, 0xf4, 0x89, 0x0e, 0x73,
+ 0xe8, 0x95, 0x12, 0x6f, 0x1c, 0x61, 0xe6, 0x9b },
+ { 0x00, 0x53, 0xbb, 0xe8, 0x76, 0x25, 0xcd, 0x9e,
+ 0xf1, 0xa2, 0x4a, 0x19, 0x87, 0xd4, 0x3c, 0x6f },
+ { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x7e, 0xfc, 0x82, 0xf8, 0x86, 0x04, 0x7a,
+ 0xf0, 0x8e, 0x0c, 0x72, 0x08, 0x76, 0xf4, 0x8a },
+ { 0x00, 0x53, 0xbb, 0xe8, 0x76, 0x25, 0xcd, 0x9e,
+ 0xf1, 0xa2, 0x4a, 0x19, 0x87, 0xd4, 0x3c, 0x6f },
+ { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+ 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x7f, 0xfe, 0x81, 0xfc, 0x83, 0x02, 0x7d,
+ 0xf8, 0x87, 0x06, 0x79, 0x04, 0x7b, 0xfa, 0x85 },
+ { 0x00, 0xe8, 0xcd, 0x25, 0x87, 0x6f, 0x4a, 0xa2,
+ 0x13, 0xfb, 0xde, 0x36, 0x94, 0x7c, 0x59, 0xb1 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+ { 0x00, 0xe8, 0xcd, 0x25, 0x87, 0x6f, 0x4a, 0xa2,
+ 0x13, 0xfb, 0xde, 0x36, 0x94, 0x7c, 0x59, 0xb1 },
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x81, 0x02, 0x83, 0x04, 0x85, 0x06, 0x87,
+ 0x08, 0x89, 0x0a, 0x8b, 0x0c, 0x8d, 0x0e, 0x8f },
+ { 0x00, 0xe8, 0xcd, 0x25, 0x87, 0x6f, 0x4a, 0xa2,
+ 0x0e, 0xe6, 0xc3, 0x2b, 0x89, 0x61, 0x44, 0xac },
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x82, 0x04, 0x86, 0x08, 0x8a, 0x0c, 0x8e,
+ 0x10, 0x92, 0x14, 0x96, 0x18, 0x9a, 0x1c, 0x9e },
+ { 0x00, 0xe8, 0xcd, 0x25, 0x87, 0x6f, 0x4a, 0xa2,
+ 0x0e, 0xe6, 0xc3, 0x2b, 0x89, 0x61, 0x44, 0xac },
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+ 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x83, 0x06, 0x85, 0x0c, 0x8f, 0x0a, 0x89,
+ 0x18, 0x9b, 0x1e, 0x9d, 0x14, 0x97, 0x12, 0x91 },
+ { 0x00, 0xe8, 0xcd, 0x25, 0x9a, 0x72, 0x57, 0xbf,
+ 0x29, 0xc1, 0xe4, 0x0c, 0xb3, 0x5b, 0x7e, 0x96 },
+ { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x84, 0x08, 0x8c, 0x10, 0x94, 0x18, 0x9c,
+ 0x20, 0xa4, 0x28, 0xac, 0x30, 0xb4, 0x38, 0xbc },
+ { 0x00, 0xe8, 0xcd, 0x25, 0x9a, 0x72, 0x57, 0xbf,
+ 0x29, 0xc1, 0xe4, 0x0c, 0xb3, 0x5b, 0x7e, 0x96 },
+ { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+ 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x85, 0x0a, 0x8f, 0x14, 0x91, 0x1e, 0x9b,
+ 0x28, 0xad, 0x22, 0xa7, 0x3c, 0xb9, 0x36, 0xb3 },
+ { 0x00, 0xe8, 0xcd, 0x25, 0x9a, 0x72, 0x57, 0xbf,
+ 0x34, 0xdc, 0xf9, 0x11, 0xae, 0x46, 0x63, 0x8b },
+ { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x86, 0x0c, 0x8a, 0x18, 0x9e, 0x14, 0x92,
+ 0x30, 0xb6, 0x3c, 0xba, 0x28, 0xae, 0x24, 0xa2 },
+ { 0x00, 0xe8, 0xcd, 0x25, 0x9a, 0x72, 0x57, 0xbf,
+ 0x34, 0xdc, 0xf9, 0x11, 0xae, 0x46, 0x63, 0x8b },
+ { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+ 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x87, 0x0e, 0x89, 0x1c, 0x9b, 0x12, 0x95,
+ 0x38, 0xbf, 0x36, 0xb1, 0x24, 0xa3, 0x2a, 0xad },
+ { 0x00, 0xe8, 0xd0, 0x38, 0xbd, 0x55, 0x6d, 0x85,
+ 0x67, 0x8f, 0xb7, 0x5f, 0xda, 0x32, 0x0a, 0xe2 },
+ { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x88, 0x10, 0x98, 0x20, 0xa8, 0x30, 0xb8,
+ 0x40, 0xc8, 0x50, 0xd8, 0x60, 0xe8, 0x70, 0xf8 },
+ { 0x00, 0xe8, 0xd0, 0x38, 0xbd, 0x55, 0x6d, 0x85,
+ 0x67, 0x8f, 0xb7, 0x5f, 0xda, 0x32, 0x0a, 0xe2 },
+ { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+ 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x89, 0x12, 0x9b, 0x24, 0xad, 0x36, 0xbf,
+ 0x48, 0xc1, 0x5a, 0xd3, 0x6c, 0xe5, 0x7e, 0xf7 },
+ { 0x00, 0xe8, 0xd0, 0x38, 0xbd, 0x55, 0x6d, 0x85,
+ 0x7a, 0x92, 0xaa, 0x42, 0xc7, 0x2f, 0x17, 0xff },
+ { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x8a, 0x14, 0x9e, 0x28, 0xa2, 0x3c, 0xb6,
+ 0x50, 0xda, 0x44, 0xce, 0x78, 0xf2, 0x6c, 0xe6 },
+ { 0x00, 0xe8, 0xd0, 0x38, 0xbd, 0x55, 0x6d, 0x85,
+ 0x7a, 0x92, 0xaa, 0x42, 0xc7, 0x2f, 0x17, 0xff },
+ { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+ 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x8b, 0x16, 0x9d, 0x2c, 0xa7, 0x3a, 0xb1,
+ 0x58, 0xd3, 0x4e, 0xc5, 0x74, 0xff, 0x62, 0xe9 },
+ { 0x00, 0xe8, 0xd0, 0x38, 0xa0, 0x48, 0x70, 0x98,
+ 0x5d, 0xb5, 0x8d, 0x65, 0xfd, 0x15, 0x2d, 0xc5 },
+ { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x8c, 0x18, 0x94, 0x30, 0xbc, 0x28, 0xa4,
+ 0x60, 0xec, 0x78, 0xf4, 0x50, 0xdc, 0x48, 0xc4 },
+ { 0x00, 0xe8, 0xd0, 0x38, 0xa0, 0x48, 0x70, 0x98,
+ 0x5d, 0xb5, 0x8d, 0x65, 0xfd, 0x15, 0x2d, 0xc5 },
+ { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+ 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x8d, 0x1a, 0x97, 0x34, 0xb9, 0x2e, 0xa3,
+ 0x68, 0xe5, 0x72, 0xff, 0x5c, 0xd1, 0x46, 0xcb },
+ { 0x00, 0xe8, 0xd0, 0x38, 0xa0, 0x48, 0x70, 0x98,
+ 0x40, 0xa8, 0x90, 0x78, 0xe0, 0x08, 0x30, 0xd8 },
+ { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x8e, 0x1c, 0x92, 0x38, 0xb6, 0x24, 0xaa,
+ 0x70, 0xfe, 0x6c, 0xe2, 0x48, 0xc6, 0x54, 0xda },
+ { 0x00, 0xe8, 0xd0, 0x38, 0xa0, 0x48, 0x70, 0x98,
+ 0x40, 0xa8, 0x90, 0x78, 0xe0, 0x08, 0x30, 0xd8 },
+ { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+ 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x8f, 0x1e, 0x91, 0x3c, 0xb3, 0x22, 0xad,
+ 0x78, 0xf7, 0x66, 0xe9, 0x44, 0xcb, 0x5a, 0xd5 },
+ { 0x00, 0xf5, 0xf7, 0x02, 0xf3, 0x06, 0x04, 0xf1,
+ 0xfb, 0x0e, 0x0c, 0xf9, 0x08, 0xfd, 0xff, 0x0a },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+ 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+ { 0x00, 0xf5, 0xf7, 0x02, 0xf3, 0x06, 0x04, 0xf1,
+ 0xfb, 0x0e, 0x0c, 0xf9, 0x08, 0xfd, 0xff, 0x0a },
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x91, 0x22, 0xb3, 0x44, 0xd5, 0x66, 0xf7,
+ 0x88, 0x19, 0xaa, 0x3b, 0xcc, 0x5d, 0xee, 0x7f },
+ { 0x00, 0xf5, 0xf7, 0x02, 0xf3, 0x06, 0x04, 0xf1,
+ 0xe6, 0x13, 0x11, 0xe4, 0x15, 0xe0, 0xe2, 0x17 },
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x92, 0x24, 0xb6, 0x48, 0xda, 0x6c, 0xfe,
+ 0x90, 0x02, 0xb4, 0x26, 0xd8, 0x4a, 0xfc, 0x6e },
+ { 0x00, 0xf5, 0xf7, 0x02, 0xf3, 0x06, 0x04, 0xf1,
+ 0xe6, 0x13, 0x11, 0xe4, 0x15, 0xe0, 0xe2, 0x17 },
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+ 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x93, 0x26, 0xb5, 0x4c, 0xdf, 0x6a, 0xf9,
+ 0x98, 0x0b, 0xbe, 0x2d, 0xd4, 0x47, 0xf2, 0x61 },
+ { 0x00, 0xf5, 0xf7, 0x02, 0xee, 0x1b, 0x19, 0xec,
+ 0xc1, 0x34, 0x36, 0xc3, 0x2f, 0xda, 0xd8, 0x2d },
+ { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x94, 0x28, 0xbc, 0x50, 0xc4, 0x78, 0xec,
+ 0xa0, 0x34, 0x88, 0x1c, 0xf0, 0x64, 0xd8, 0x4c },
+ { 0x00, 0xf5, 0xf7, 0x02, 0xee, 0x1b, 0x19, 0xec,
+ 0xc1, 0x34, 0x36, 0xc3, 0x2f, 0xda, 0xd8, 0x2d },
+ { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+ 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x95, 0x2a, 0xbf, 0x54, 0xc1, 0x7e, 0xeb,
+ 0xa8, 0x3d, 0x82, 0x17, 0xfc, 0x69, 0xd6, 0x43 },
+ { 0x00, 0xf5, 0xf7, 0x02, 0xee, 0x1b, 0x19, 0xec,
+ 0xdc, 0x29, 0x2b, 0xde, 0x32, 0xc7, 0xc5, 0x30 },
+ { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x96, 0x2c, 0xba, 0x58, 0xce, 0x74, 0xe2,
+ 0xb0, 0x26, 0x9c, 0x0a, 0xe8, 0x7e, 0xc4, 0x52 },
+ { 0x00, 0xf5, 0xf7, 0x02, 0xee, 0x1b, 0x19, 0xec,
+ 0xdc, 0x29, 0x2b, 0xde, 0x32, 0xc7, 0xc5, 0x30 },
+ { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+ 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x97, 0x2e, 0xb9, 0x5c, 0xcb, 0x72, 0xe5,
+ 0xb8, 0x2f, 0x96, 0x01, 0xe4, 0x73, 0xca, 0x5d },
+ { 0x00, 0xf5, 0xea, 0x1f, 0xc9, 0x3c, 0x23, 0xd6,
+ 0x8f, 0x7a, 0x65, 0x90, 0x46, 0xb3, 0xac, 0x59 },
+ { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x98, 0x30, 0xa8, 0x60, 0xf8, 0x50, 0xc8,
+ 0xc0, 0x58, 0xf0, 0x68, 0xa0, 0x38, 0x90, 0x08 },
+ { 0x00, 0xf5, 0xea, 0x1f, 0xc9, 0x3c, 0x23, 0xd6,
+ 0x8f, 0x7a, 0x65, 0x90, 0x46, 0xb3, 0xac, 0x59 },
+ { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+ 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x99, 0x32, 0xab, 0x64, 0xfd, 0x56, 0xcf,
+ 0xc8, 0x51, 0xfa, 0x63, 0xac, 0x35, 0x9e, 0x07 },
+ { 0x00, 0xf5, 0xea, 0x1f, 0xc9, 0x3c, 0x23, 0xd6,
+ 0x92, 0x67, 0x78, 0x8d, 0x5b, 0xae, 0xb1, 0x44 },
+ { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x9a, 0x34, 0xae, 0x68, 0xf2, 0x5c, 0xc6,
+ 0xd0, 0x4a, 0xe4, 0x7e, 0xb8, 0x22, 0x8c, 0x16 },
+ { 0x00, 0xf5, 0xea, 0x1f, 0xc9, 0x3c, 0x23, 0xd6,
+ 0x92, 0x67, 0x78, 0x8d, 0x5b, 0xae, 0xb1, 0x44 },
+ { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+ 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x9b, 0x36, 0xad, 0x6c, 0xf7, 0x5a, 0xc1,
+ 0xd8, 0x43, 0xee, 0x75, 0xb4, 0x2f, 0x82, 0x19 },
+ { 0x00, 0xf5, 0xea, 0x1f, 0xd4, 0x21, 0x3e, 0xcb,
+ 0xb5, 0x40, 0x5f, 0xaa, 0x61, 0x94, 0x8b, 0x7e },
+ { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x9c, 0x38, 0xa4, 0x70, 0xec, 0x48, 0xd4,
+ 0xe0, 0x7c, 0xd8, 0x44, 0x90, 0x0c, 0xa8, 0x34 },
+ { 0x00, 0xf5, 0xea, 0x1f, 0xd4, 0x21, 0x3e, 0xcb,
+ 0xb5, 0x40, 0x5f, 0xaa, 0x61, 0x94, 0x8b, 0x7e },
+ { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+ 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x9d, 0x3a, 0xa7, 0x74, 0xe9, 0x4e, 0xd3,
+ 0xe8, 0x75, 0xd2, 0x4f, 0x9c, 0x01, 0xa6, 0x3b },
+ { 0x00, 0xf5, 0xea, 0x1f, 0xd4, 0x21, 0x3e, 0xcb,
+ 0xa8, 0x5d, 0x42, 0xb7, 0x7c, 0x89, 0x96, 0x63 },
+ { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x9e, 0x3c, 0xa2, 0x78, 0xe6, 0x44, 0xda,
+ 0xf0, 0x6e, 0xcc, 0x52, 0x88, 0x16, 0xb4, 0x2a },
+ { 0x00, 0xf5, 0xea, 0x1f, 0xd4, 0x21, 0x3e, 0xcb,
+ 0xa8, 0x5d, 0x42, 0xb7, 0x7c, 0x89, 0x96, 0x63 },
+ { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+ 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x9f, 0x3e, 0xa1, 0x7c, 0xe3, 0x42, 0xdd,
+ 0xf8, 0x67, 0xc6, 0x59, 0x84, 0x1b, 0xba, 0x25 },
+ { 0x00, 0xd2, 0xb9, 0x6b, 0x6f, 0xbd, 0xd6, 0x04,
+ 0xde, 0x0c, 0x67, 0xb5, 0xb1, 0x63, 0x08, 0xda },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+ { 0x00, 0xd2, 0xb9, 0x6b, 0x6f, 0xbd, 0xd6, 0x04,
+ 0xde, 0x0c, 0x67, 0xb5, 0xb1, 0x63, 0x08, 0xda },
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xa1, 0x42, 0xe3, 0x84, 0x25, 0xc6, 0x67,
+ 0x08, 0xa9, 0x4a, 0xeb, 0x8c, 0x2d, 0xce, 0x6f },
+ { 0x00, 0xd2, 0xb9, 0x6b, 0x6f, 0xbd, 0xd6, 0x04,
+ 0xc3, 0x11, 0x7a, 0xa8, 0xac, 0x7e, 0x15, 0xc7 },
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xa2, 0x44, 0xe6, 0x88, 0x2a, 0xcc, 0x6e,
+ 0x10, 0xb2, 0x54, 0xf6, 0x98, 0x3a, 0xdc, 0x7e },
+ { 0x00, 0xd2, 0xb9, 0x6b, 0x6f, 0xbd, 0xd6, 0x04,
+ 0xc3, 0x11, 0x7a, 0xa8, 0xac, 0x7e, 0x15, 0xc7 },
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+ 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xa3, 0x46, 0xe5, 0x8c, 0x2f, 0xca, 0x69,
+ 0x18, 0xbb, 0x5e, 0xfd, 0x94, 0x37, 0xd2, 0x71 },
+ { 0x00, 0xd2, 0xb9, 0x6b, 0x72, 0xa0, 0xcb, 0x19,
+ 0xe4, 0x36, 0x5d, 0x8f, 0x96, 0x44, 0x2f, 0xfd },
+ { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xa4, 0x48, 0xec, 0x90, 0x34, 0xd8, 0x7c,
+ 0x20, 0x84, 0x68, 0xcc, 0xb0, 0x14, 0xf8, 0x5c },
+ { 0x00, 0xd2, 0xb9, 0x6b, 0x72, 0xa0, 0xcb, 0x19,
+ 0xe4, 0x36, 0x5d, 0x8f, 0x96, 0x44, 0x2f, 0xfd },
+ { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+ 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xa5, 0x4a, 0xef, 0x94, 0x31, 0xde, 0x7b,
+ 0x28, 0x8d, 0x62, 0xc7, 0xbc, 0x19, 0xf6, 0x53 },
+ { 0x00, 0xd2, 0xb9, 0x6b, 0x72, 0xa0, 0xcb, 0x19,
+ 0xf9, 0x2b, 0x40, 0x92, 0x8b, 0x59, 0x32, 0xe0 },
+ { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xa6, 0x4c, 0xea, 0x98, 0x3e, 0xd4, 0x72,
+ 0x30, 0x96, 0x7c, 0xda, 0xa8, 0x0e, 0xe4, 0x42 },
+ { 0x00, 0xd2, 0xb9, 0x6b, 0x72, 0xa0, 0xcb, 0x19,
+ 0xf9, 0x2b, 0x40, 0x92, 0x8b, 0x59, 0x32, 0xe0 },
+ { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+ 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xa7, 0x4e, 0xe9, 0x9c, 0x3b, 0xd2, 0x75,
+ 0x38, 0x9f, 0x76, 0xd1, 0xa4, 0x03, 0xea, 0x4d },
+ { 0x00, 0xd2, 0xa4, 0x76, 0x55, 0x87, 0xf1, 0x23,
+ 0xaa, 0x78, 0x0e, 0xdc, 0xff, 0x2d, 0x5b, 0x89 },
+ { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xa8, 0x50, 0xf8, 0xa0, 0x08, 0xf0, 0x58,
+ 0x40, 0xe8, 0x10, 0xb8, 0xe0, 0x48, 0xb0, 0x18 },
+ { 0x00, 0xd2, 0xa4, 0x76, 0x55, 0x87, 0xf1, 0x23,
+ 0xaa, 0x78, 0x0e, 0xdc, 0xff, 0x2d, 0x5b, 0x89 },
+ { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+ 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xa9, 0x52, 0xfb, 0xa4, 0x0d, 0xf6, 0x5f,
+ 0x48, 0xe1, 0x1a, 0xb3, 0xec, 0x45, 0xbe, 0x17 },
+ { 0x00, 0xd2, 0xa4, 0x76, 0x55, 0x87, 0xf1, 0x23,
+ 0xb7, 0x65, 0x13, 0xc1, 0xe2, 0x30, 0x46, 0x94 },
+ { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xaa, 0x54, 0xfe, 0xa8, 0x02, 0xfc, 0x56,
+ 0x50, 0xfa, 0x04, 0xae, 0xf8, 0x52, 0xac, 0x06 },
+ { 0x00, 0xd2, 0xa4, 0x76, 0x55, 0x87, 0xf1, 0x23,
+ 0xb7, 0x65, 0x13, 0xc1, 0xe2, 0x30, 0x46, 0x94 },
+ { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+ 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xab, 0x56, 0xfd, 0xac, 0x07, 0xfa, 0x51,
+ 0x58, 0xf3, 0x0e, 0xa5, 0xf4, 0x5f, 0xa2, 0x09 },
+ { 0x00, 0xd2, 0xa4, 0x76, 0x48, 0x9a, 0xec, 0x3e,
+ 0x90, 0x42, 0x34, 0xe6, 0xd8, 0x0a, 0x7c, 0xae },
+ { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xac, 0x58, 0xf4, 0xb0, 0x1c, 0xe8, 0x44,
+ 0x60, 0xcc, 0x38, 0x94, 0xd0, 0x7c, 0x88, 0x24 },
+ { 0x00, 0xd2, 0xa4, 0x76, 0x48, 0x9a, 0xec, 0x3e,
+ 0x90, 0x42, 0x34, 0xe6, 0xd8, 0x0a, 0x7c, 0xae },
+ { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+ 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xad, 0x5a, 0xf7, 0xb4, 0x19, 0xee, 0x43,
+ 0x68, 0xc5, 0x32, 0x9f, 0xdc, 0x71, 0x86, 0x2b },
+ { 0x00, 0xd2, 0xa4, 0x76, 0x48, 0x9a, 0xec, 0x3e,
+ 0x8d, 0x5f, 0x29, 0xfb, 0xc5, 0x17, 0x61, 0xb3 },
+ { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xae, 0x5c, 0xf2, 0xb8, 0x16, 0xe4, 0x4a,
+ 0x70, 0xde, 0x2c, 0x82, 0xc8, 0x66, 0x94, 0x3a },
+ { 0x00, 0xd2, 0xa4, 0x76, 0x48, 0x9a, 0xec, 0x3e,
+ 0x8d, 0x5f, 0x29, 0xfb, 0xc5, 0x17, 0x61, 0xb3 },
+ { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+ 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xaf, 0x5e, 0xf1, 0xbc, 0x13, 0xe2, 0x4d,
+ 0x78, 0xd7, 0x26, 0x89, 0xc4, 0x6b, 0x9a, 0x35 },
+ { 0x00, 0xcf, 0x83, 0x4c, 0x1b, 0xd4, 0x98, 0x57,
+ 0x36, 0xf9, 0xb5, 0x7a, 0x2d, 0xe2, 0xae, 0x61 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+ 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+ { 0x00, 0xcf, 0x83, 0x4c, 0x1b, 0xd4, 0x98, 0x57,
+ 0x36, 0xf9, 0xb5, 0x7a, 0x2d, 0xe2, 0xae, 0x61 },
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xb1, 0x62, 0xd3, 0xc4, 0x75, 0xa6, 0x17,
+ 0x88, 0x39, 0xea, 0x5b, 0x4c, 0xfd, 0x2e, 0x9f },
+ { 0x00, 0xcf, 0x83, 0x4c, 0x1b, 0xd4, 0x98, 0x57,
+ 0x2b, 0xe4, 0xa8, 0x67, 0x30, 0xff, 0xb3, 0x7c },
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xb2, 0x64, 0xd6, 0xc8, 0x7a, 0xac, 0x1e,
+ 0x90, 0x22, 0xf4, 0x46, 0x58, 0xea, 0x3c, 0x8e },
+ { 0x00, 0xcf, 0x83, 0x4c, 0x1b, 0xd4, 0x98, 0x57,
+ 0x2b, 0xe4, 0xa8, 0x67, 0x30, 0xff, 0xb3, 0x7c },
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+ 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xb3, 0x66, 0xd5, 0xcc, 0x7f, 0xaa, 0x19,
+ 0x98, 0x2b, 0xfe, 0x4d, 0x54, 0xe7, 0x32, 0x81 },
+ { 0x00, 0xcf, 0x83, 0x4c, 0x06, 0xc9, 0x85, 0x4a,
+ 0x0c, 0xc3, 0x8f, 0x40, 0x0a, 0xc5, 0x89, 0x46 },
+ { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xb4, 0x68, 0xdc, 0xd0, 0x64, 0xb8, 0x0c,
+ 0xa0, 0x14, 0xc8, 0x7c, 0x70, 0xc4, 0x18, 0xac },
+ { 0x00, 0xcf, 0x83, 0x4c, 0x06, 0xc9, 0x85, 0x4a,
+ 0x0c, 0xc3, 0x8f, 0x40, 0x0a, 0xc5, 0x89, 0x46 },
+ { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+ 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xb5, 0x6a, 0xdf, 0xd4, 0x61, 0xbe, 0x0b,
+ 0xa8, 0x1d, 0xc2, 0x77, 0x7c, 0xc9, 0x16, 0xa3 },
+ { 0x00, 0xcf, 0x83, 0x4c, 0x06, 0xc9, 0x85, 0x4a,
+ 0x11, 0xde, 0x92, 0x5d, 0x17, 0xd8, 0x94, 0x5b },
+ { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xb6, 0x6c, 0xda, 0xd8, 0x6e, 0xb4, 0x02,
+ 0xb0, 0x06, 0xdc, 0x6a, 0x68, 0xde, 0x04, 0xb2 },
+ { 0x00, 0xcf, 0x83, 0x4c, 0x06, 0xc9, 0x85, 0x4a,
+ 0x11, 0xde, 0x92, 0x5d, 0x17, 0xd8, 0x94, 0x5b },
+ { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+ 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xb7, 0x6e, 0xd9, 0xdc, 0x6b, 0xb2, 0x05,
+ 0xb8, 0x0f, 0xd6, 0x61, 0x64, 0xd3, 0x0a, 0xbd },
+ { 0x00, 0xcf, 0x9e, 0x51, 0x21, 0xee, 0xbf, 0x70,
+ 0x42, 0x8d, 0xdc, 0x13, 0x63, 0xac, 0xfd, 0x32 },
+ { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xb8, 0x70, 0xc8, 0xe0, 0x58, 0x90, 0x28,
+ 0xc0, 0x78, 0xb0, 0x08, 0x20, 0x98, 0x50, 0xe8 },
+ { 0x00, 0xcf, 0x9e, 0x51, 0x21, 0xee, 0xbf, 0x70,
+ 0x42, 0x8d, 0xdc, 0x13, 0x63, 0xac, 0xfd, 0x32 },
+ { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+ 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xb9, 0x72, 0xcb, 0xe4, 0x5d, 0x96, 0x2f,
+ 0xc8, 0x71, 0xba, 0x03, 0x2c, 0x95, 0x5e, 0xe7 },
+ { 0x00, 0xcf, 0x9e, 0x51, 0x21, 0xee, 0xbf, 0x70,
+ 0x5f, 0x90, 0xc1, 0x0e, 0x7e, 0xb1, 0xe0, 0x2f },
+ { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xba, 0x74, 0xce, 0xe8, 0x52, 0x9c, 0x26,
+ 0xd0, 0x6a, 0xa4, 0x1e, 0x38, 0x82, 0x4c, 0xf6 },
+ { 0x00, 0xcf, 0x9e, 0x51, 0x21, 0xee, 0xbf, 0x70,
+ 0x5f, 0x90, 0xc1, 0x0e, 0x7e, 0xb1, 0xe0, 0x2f },
+ { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+ 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xbb, 0x76, 0xcd, 0xec, 0x57, 0x9a, 0x21,
+ 0xd8, 0x63, 0xae, 0x15, 0x34, 0x8f, 0x42, 0xf9 },
+ { 0x00, 0xcf, 0x9e, 0x51, 0x3c, 0xf3, 0xa2, 0x6d,
+ 0x78, 0xb7, 0xe6, 0x29, 0x44, 0x8b, 0xda, 0x15 },
+ { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xbc, 0x78, 0xc4, 0xf0, 0x4c, 0x88, 0x34,
+ 0xe0, 0x5c, 0x98, 0x24, 0x10, 0xac, 0x68, 0xd4 },
+ { 0x00, 0xcf, 0x9e, 0x51, 0x3c, 0xf3, 0xa2, 0x6d,
+ 0x78, 0xb7, 0xe6, 0x29, 0x44, 0x8b, 0xda, 0x15 },
+ { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+ 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xbd, 0x7a, 0xc7, 0xf4, 0x49, 0x8e, 0x33,
+ 0xe8, 0x55, 0x92, 0x2f, 0x1c, 0xa1, 0x66, 0xdb },
+ { 0x00, 0xcf, 0x9e, 0x51, 0x3c, 0xf3, 0xa2, 0x6d,
+ 0x65, 0xaa, 0xfb, 0x34, 0x59, 0x96, 0xc7, 0x08 },
+ { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xbe, 0x7c, 0xc2, 0xf8, 0x46, 0x84, 0x3a,
+ 0xf0, 0x4e, 0x8c, 0x32, 0x08, 0xb6, 0x74, 0xca },
+ { 0x00, 0xcf, 0x9e, 0x51, 0x3c, 0xf3, 0xa2, 0x6d,
+ 0x65, 0xaa, 0xfb, 0x34, 0x59, 0x96, 0xc7, 0x08 },
+ { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+ 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xbf, 0x7e, 0xc1, 0xfc, 0x43, 0x82, 0x3d,
+ 0xf8, 0x47, 0x86, 0x39, 0x04, 0xbb, 0x7a, 0xc5 },
+ { 0x00, 0x9c, 0x25, 0xb9, 0x4a, 0xd6, 0x6f, 0xf3,
+ 0x94, 0x08, 0xb1, 0x2d, 0xde, 0x42, 0xfb, 0x67 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+ { 0x00, 0x9c, 0x25, 0xb9, 0x4a, 0xd6, 0x6f, 0xf3,
+ 0x94, 0x08, 0xb1, 0x2d, 0xde, 0x42, 0xfb, 0x67 },
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xc1, 0x82, 0x43, 0x04, 0xc5, 0x86, 0x47,
+ 0x08, 0xc9, 0x8a, 0x4b, 0x0c, 0xcd, 0x8e, 0x4f },
+ { 0x00, 0x9c, 0x25, 0xb9, 0x4a, 0xd6, 0x6f, 0xf3,
+ 0x89, 0x15, 0xac, 0x30, 0xc3, 0x5f, 0xe6, 0x7a },
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xc2, 0x84, 0x46, 0x08, 0xca, 0x8c, 0x4e,
+ 0x10, 0xd2, 0x94, 0x56, 0x18, 0xda, 0x9c, 0x5e },
+ { 0x00, 0x9c, 0x25, 0xb9, 0x4a, 0xd6, 0x6f, 0xf3,
+ 0x89, 0x15, 0xac, 0x30, 0xc3, 0x5f, 0xe6, 0x7a },
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+ 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xc3, 0x86, 0x45, 0x0c, 0xcf, 0x8a, 0x49,
+ 0x18, 0xdb, 0x9e, 0x5d, 0x14, 0xd7, 0x92, 0x51 },
+ { 0x00, 0x9c, 0x25, 0xb9, 0x57, 0xcb, 0x72, 0xee,
+ 0xae, 0x32, 0x8b, 0x17, 0xf9, 0x65, 0xdc, 0x40 },
+ { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xc4, 0x88, 0x4c, 0x10, 0xd4, 0x98, 0x5c,
+ 0x20, 0xe4, 0xa8, 0x6c, 0x30, 0xf4, 0xb8, 0x7c },
+ { 0x00, 0x9c, 0x25, 0xb9, 0x57, 0xcb, 0x72, 0xee,
+ 0xae, 0x32, 0x8b, 0x17, 0xf9, 0x65, 0xdc, 0x40 },
+ { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+ 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xc5, 0x8a, 0x4f, 0x14, 0xd1, 0x9e, 0x5b,
+ 0x28, 0xed, 0xa2, 0x67, 0x3c, 0xf9, 0xb6, 0x73 },
+ { 0x00, 0x9c, 0x25, 0xb9, 0x57, 0xcb, 0x72, 0xee,
+ 0xb3, 0x2f, 0x96, 0x0a, 0xe4, 0x78, 0xc1, 0x5d },
+ { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xc6, 0x8c, 0x4a, 0x18, 0xde, 0x94, 0x52,
+ 0x30, 0xf6, 0xbc, 0x7a, 0x28, 0xee, 0xa4, 0x62 },
+ { 0x00, 0x9c, 0x25, 0xb9, 0x57, 0xcb, 0x72, 0xee,
+ 0xb3, 0x2f, 0x96, 0x0a, 0xe4, 0x78, 0xc1, 0x5d },
+ { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+ 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xc7, 0x8e, 0x49, 0x1c, 0xdb, 0x92, 0x55,
+ 0x38, 0xff, 0xb6, 0x71, 0x24, 0xe3, 0xaa, 0x6d },
+ { 0x00, 0x9c, 0x38, 0xa4, 0x70, 0xec, 0x48, 0xd4,
+ 0xe0, 0x7c, 0xd8, 0x44, 0x90, 0x0c, 0xa8, 0x34 },
+ { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xc8, 0x90, 0x58, 0x20, 0xe8, 0xb0, 0x78,
+ 0x40, 0x88, 0xd0, 0x18, 0x60, 0xa8, 0xf0, 0x38 },
+ { 0x00, 0x9c, 0x38, 0xa4, 0x70, 0xec, 0x48, 0xd4,
+ 0xe0, 0x7c, 0xd8, 0x44, 0x90, 0x0c, 0xa8, 0x34 },
+ { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+ 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xc9, 0x92, 0x5b, 0x24, 0xed, 0xb6, 0x7f,
+ 0x48, 0x81, 0xda, 0x13, 0x6c, 0xa5, 0xfe, 0x37 },
+ { 0x00, 0x9c, 0x38, 0xa4, 0x70, 0xec, 0x48, 0xd4,
+ 0xfd, 0x61, 0xc5, 0x59, 0x8d, 0x11, 0xb5, 0x29 },
+ { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xca, 0x94, 0x5e, 0x28, 0xe2, 0xbc, 0x76,
+ 0x50, 0x9a, 0xc4, 0x0e, 0x78, 0xb2, 0xec, 0x26 },
+ { 0x00, 0x9c, 0x38, 0xa4, 0x70, 0xec, 0x48, 0xd4,
+ 0xfd, 0x61, 0xc5, 0x59, 0x8d, 0x11, 0xb5, 0x29 },
+ { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+ 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xcb, 0x96, 0x5d, 0x2c, 0xe7, 0xba, 0x71,
+ 0x58, 0x93, 0xce, 0x05, 0x74, 0xbf, 0xe2, 0x29 },
+ { 0x00, 0x9c, 0x38, 0xa4, 0x6d, 0xf1, 0x55, 0xc9,
+ 0xda, 0x46, 0xe2, 0x7e, 0xb7, 0x2b, 0x8f, 0x13 },
+ { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xcc, 0x98, 0x54, 0x30, 0xfc, 0xa8, 0x64,
+ 0x60, 0xac, 0xf8, 0x34, 0x50, 0x9c, 0xc8, 0x04 },
+ { 0x00, 0x9c, 0x38, 0xa4, 0x6d, 0xf1, 0x55, 0xc9,
+ 0xda, 0x46, 0xe2, 0x7e, 0xb7, 0x2b, 0x8f, 0x13 },
+ { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+ 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xcd, 0x9a, 0x57, 0x34, 0xf9, 0xae, 0x63,
+ 0x68, 0xa5, 0xf2, 0x3f, 0x5c, 0x91, 0xc6, 0x0b },
+ { 0x00, 0x9c, 0x38, 0xa4, 0x6d, 0xf1, 0x55, 0xc9,
+ 0xc7, 0x5b, 0xff, 0x63, 0xaa, 0x36, 0x92, 0x0e },
+ { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xce, 0x9c, 0x52, 0x38, 0xf6, 0xa4, 0x6a,
+ 0x70, 0xbe, 0xec, 0x22, 0x48, 0x86, 0xd4, 0x1a },
+ { 0x00, 0x9c, 0x38, 0xa4, 0x6d, 0xf1, 0x55, 0xc9,
+ 0xc7, 0x5b, 0xff, 0x63, 0xaa, 0x36, 0x92, 0x0e },
+ { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+ 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xcf, 0x9e, 0x51, 0x3c, 0xf3, 0xa2, 0x6d,
+ 0x78, 0xb7, 0xe6, 0x29, 0x44, 0x8b, 0xda, 0x15 },
+ { 0x00, 0x81, 0x1f, 0x9e, 0x3e, 0xbf, 0x21, 0xa0,
+ 0x7c, 0xfd, 0x63, 0xe2, 0x42, 0xc3, 0x5d, 0xdc },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+ 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+ { 0x00, 0x81, 0x1f, 0x9e, 0x3e, 0xbf, 0x21, 0xa0,
+ 0x7c, 0xfd, 0x63, 0xe2, 0x42, 0xc3, 0x5d, 0xdc },
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xd1, 0xa2, 0x73, 0x44, 0x95, 0xe6, 0x37,
+ 0x88, 0x59, 0x2a, 0xfb, 0xcc, 0x1d, 0x6e, 0xbf },
+ { 0x00, 0x81, 0x1f, 0x9e, 0x3e, 0xbf, 0x21, 0xa0,
+ 0x61, 0xe0, 0x7e, 0xff, 0x5f, 0xde, 0x40, 0xc1 },
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xd2, 0xa4, 0x76, 0x48, 0x9a, 0xec, 0x3e,
+ 0x90, 0x42, 0x34, 0xe6, 0xd8, 0x0a, 0x7c, 0xae },
+ { 0x00, 0x81, 0x1f, 0x9e, 0x3e, 0xbf, 0x21, 0xa0,
+ 0x61, 0xe0, 0x7e, 0xff, 0x5f, 0xde, 0x40, 0xc1 },
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+ 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xd3, 0xa6, 0x75, 0x4c, 0x9f, 0xea, 0x39,
+ 0x98, 0x4b, 0x3e, 0xed, 0xd4, 0x07, 0x72, 0xa1 },
+ { 0x00, 0x81, 0x1f, 0x9e, 0x23, 0xa2, 0x3c, 0xbd,
+ 0x46, 0xc7, 0x59, 0xd8, 0x65, 0xe4, 0x7a, 0xfb },
+ { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xd4, 0xa8, 0x7c, 0x50, 0x84, 0xf8, 0x2c,
+ 0xa0, 0x74, 0x08, 0xdc, 0xf0, 0x24, 0x58, 0x8c },
+ { 0x00, 0x81, 0x1f, 0x9e, 0x23, 0xa2, 0x3c, 0xbd,
+ 0x46, 0xc7, 0x59, 0xd8, 0x65, 0xe4, 0x7a, 0xfb },
+ { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+ 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xd5, 0xaa, 0x7f, 0x54, 0x81, 0xfe, 0x2b,
+ 0xa8, 0x7d, 0x02, 0xd7, 0xfc, 0x29, 0x56, 0x83 },
+ { 0x00, 0x81, 0x1f, 0x9e, 0x23, 0xa2, 0x3c, 0xbd,
+ 0x5b, 0xda, 0x44, 0xc5, 0x78, 0xf9, 0x67, 0xe6 },
+ { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xd6, 0xac, 0x7a, 0x58, 0x8e, 0xf4, 0x22,
+ 0xb0, 0x66, 0x1c, 0xca, 0xe8, 0x3e, 0x44, 0x92 },
+ { 0x00, 0x81, 0x1f, 0x9e, 0x23, 0xa2, 0x3c, 0xbd,
+ 0x5b, 0xda, 0x44, 0xc5, 0x78, 0xf9, 0x67, 0xe6 },
+ { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+ 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xd7, 0xae, 0x79, 0x5c, 0x8b, 0xf2, 0x25,
+ 0xb8, 0x6f, 0x16, 0xc1, 0xe4, 0x33, 0x4a, 0x9d },
+ { 0x00, 0x81, 0x02, 0x83, 0x04, 0x85, 0x06, 0x87,
+ 0x08, 0x89, 0x0a, 0x8b, 0x0c, 0x8d, 0x0e, 0x8f },
+ { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xd8, 0xb0, 0x68, 0x60, 0xb8, 0xd0, 0x08,
+ 0xc0, 0x18, 0x70, 0xa8, 0xa0, 0x78, 0x10, 0xc8 },
+ { 0x00, 0x81, 0x02, 0x83, 0x04, 0x85, 0x06, 0x87,
+ 0x08, 0x89, 0x0a, 0x8b, 0x0c, 0x8d, 0x0e, 0x8f },
+ { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+ 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xd9, 0xb2, 0x6b, 0x64, 0xbd, 0xd6, 0x0f,
+ 0xc8, 0x11, 0x7a, 0xa3, 0xac, 0x75, 0x1e, 0xc7 },
+ { 0x00, 0x81, 0x02, 0x83, 0x04, 0x85, 0x06, 0x87,
+ 0x15, 0x94, 0x17, 0x96, 0x11, 0x90, 0x13, 0x92 },
+ { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xda, 0xb4, 0x6e, 0x68, 0xb2, 0xdc, 0x06,
+ 0xd0, 0x0a, 0x64, 0xbe, 0xb8, 0x62, 0x0c, 0xd6 },
+ { 0x00, 0x81, 0x02, 0x83, 0x04, 0x85, 0x06, 0x87,
+ 0x15, 0x94, 0x17, 0x96, 0x11, 0x90, 0x13, 0x92 },
+ { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+ 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xdb, 0xb6, 0x6d, 0x6c, 0xb7, 0xda, 0x01,
+ 0xd8, 0x03, 0x6e, 0xb5, 0xb4, 0x6f, 0x02, 0xd9 },
+ { 0x00, 0x81, 0x02, 0x83, 0x19, 0x98, 0x1b, 0x9a,
+ 0x32, 0xb3, 0x30, 0xb1, 0x2b, 0xaa, 0x29, 0xa8 },
+ { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xdc, 0xb8, 0x64, 0x70, 0xac, 0xc8, 0x14,
+ 0xe0, 0x3c, 0x58, 0x84, 0x90, 0x4c, 0x28, 0xf4 },
+ { 0x00, 0x81, 0x02, 0x83, 0x19, 0x98, 0x1b, 0x9a,
+ 0x32, 0xb3, 0x30, 0xb1, 0x2b, 0xaa, 0x29, 0xa8 },
+ { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+ 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xdd, 0xba, 0x67, 0x74, 0xa9, 0xce, 0x13,
+ 0xe8, 0x35, 0x52, 0x8f, 0x9c, 0x41, 0x26, 0xfb },
+ { 0x00, 0x81, 0x02, 0x83, 0x19, 0x98, 0x1b, 0x9a,
+ 0x2f, 0xae, 0x2d, 0xac, 0x36, 0xb7, 0x34, 0xb5 },
+ { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xde, 0xbc, 0x62, 0x78, 0xa6, 0xc4, 0x1a,
+ 0xf0, 0x2e, 0x4c, 0x92, 0x88, 0x56, 0x34, 0xea },
+ { 0x00, 0x81, 0x02, 0x83, 0x19, 0x98, 0x1b, 0x9a,
+ 0x2f, 0xae, 0x2d, 0xac, 0x36, 0xb7, 0x34, 0xb5 },
+ { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+ 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xdf, 0xbe, 0x61, 0x7c, 0xa3, 0xc2, 0x1d,
+ 0xf8, 0x27, 0x46, 0x99, 0x84, 0x5b, 0x3a, 0xe5 },
+ { 0x00, 0xa6, 0x51, 0xf7, 0xa2, 0x04, 0xf3, 0x55,
+ 0x59, 0xff, 0x08, 0xae, 0xfb, 0x5d, 0xaa, 0x0c },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+ { 0x00, 0xa6, 0x51, 0xf7, 0xa2, 0x04, 0xf3, 0x55,
+ 0x59, 0xff, 0x08, 0xae, 0xfb, 0x5d, 0xaa, 0x0c },
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xe1, 0xc2, 0x23, 0x84, 0x65, 0x46, 0xa7,
+ 0x08, 0xe9, 0xca, 0x2b, 0x8c, 0x6d, 0x4e, 0xaf },
+ { 0x00, 0xa6, 0x51, 0xf7, 0xa2, 0x04, 0xf3, 0x55,
+ 0x44, 0xe2, 0x15, 0xb3, 0xe6, 0x40, 0xb7, 0x11 },
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xe2, 0xc4, 0x26, 0x88, 0x6a, 0x4c, 0xae,
+ 0x10, 0xf2, 0xd4, 0x36, 0x98, 0x7a, 0x5c, 0xbe },
+ { 0x00, 0xa6, 0x51, 0xf7, 0xa2, 0x04, 0xf3, 0x55,
+ 0x44, 0xe2, 0x15, 0xb3, 0xe6, 0x40, 0xb7, 0x11 },
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+ 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xe3, 0xc6, 0x25, 0x8c, 0x6f, 0x4a, 0xa9,
+ 0x18, 0xfb, 0xde, 0x3d, 0x94, 0x77, 0x52, 0xb1 },
+ { 0x00, 0xa6, 0x51, 0xf7, 0xbf, 0x19, 0xee, 0x48,
+ 0x63, 0xc5, 0x32, 0x94, 0xdc, 0x7a, 0x8d, 0x2b },
+ { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xe4, 0xc8, 0x2c, 0x90, 0x74, 0x58, 0xbc,
+ 0x20, 0xc4, 0xe8, 0x0c, 0xb0, 0x54, 0x78, 0x9c },
+ { 0x00, 0xa6, 0x51, 0xf7, 0xbf, 0x19, 0xee, 0x48,
+ 0x63, 0xc5, 0x32, 0x94, 0xdc, 0x7a, 0x8d, 0x2b },
+ { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+ 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xe5, 0xca, 0x2f, 0x94, 0x71, 0x5e, 0xbb,
+ 0x28, 0xcd, 0xe2, 0x07, 0xbc, 0x59, 0x76, 0x93 },
+ { 0x00, 0xa6, 0x51, 0xf7, 0xbf, 0x19, 0xee, 0x48,
+ 0x7e, 0xd8, 0x2f, 0x89, 0xc1, 0x67, 0x90, 0x36 },
+ { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xe6, 0xcc, 0x2a, 0x98, 0x7e, 0x54, 0xb2,
+ 0x30, 0xd6, 0xfc, 0x1a, 0xa8, 0x4e, 0x64, 0x82 },
+ { 0x00, 0xa6, 0x51, 0xf7, 0xbf, 0x19, 0xee, 0x48,
+ 0x7e, 0xd8, 0x2f, 0x89, 0xc1, 0x67, 0x90, 0x36 },
+ { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+ 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xe7, 0xce, 0x29, 0x9c, 0x7b, 0x52, 0xb5,
+ 0x38, 0xdf, 0xf6, 0x11, 0xa4, 0x43, 0x6a, 0x8d },
+ { 0x00, 0xa6, 0x4c, 0xea, 0x98, 0x3e, 0xd4, 0x72,
+ 0x2d, 0x8b, 0x61, 0xc7, 0xb5, 0x13, 0xf9, 0x5f },
+ { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xe8, 0xd0, 0x38, 0xa0, 0x48, 0x70, 0x98,
+ 0x40, 0xa8, 0x90, 0x78, 0xe0, 0x08, 0x30, 0xd8 },
+ { 0x00, 0xa6, 0x4c, 0xea, 0x98, 0x3e, 0xd4, 0x72,
+ 0x2d, 0x8b, 0x61, 0xc7, 0xb5, 0x13, 0xf9, 0x5f },
+ { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+ 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xe9, 0xd2, 0x3b, 0xa4, 0x4d, 0x76, 0x9f,
+ 0x48, 0xa1, 0x9a, 0x73, 0xec, 0x05, 0x3e, 0xd7 },
+ { 0x00, 0xa6, 0x4c, 0xea, 0x98, 0x3e, 0xd4, 0x72,
+ 0x30, 0x96, 0x7c, 0xda, 0xa8, 0x0e, 0xe4, 0x42 },
+ { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xea, 0xd4, 0x3e, 0xa8, 0x42, 0x7c, 0x96,
+ 0x50, 0xba, 0x84, 0x6e, 0xf8, 0x12, 0x2c, 0xc6 },
+ { 0x00, 0xa6, 0x4c, 0xea, 0x98, 0x3e, 0xd4, 0x72,
+ 0x30, 0x96, 0x7c, 0xda, 0xa8, 0x0e, 0xe4, 0x42 },
+ { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+ 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xeb, 0xd6, 0x3d, 0xac, 0x47, 0x7a, 0x91,
+ 0x58, 0xb3, 0x8e, 0x65, 0xf4, 0x1f, 0x22, 0xc9 },
+ { 0x00, 0xa6, 0x4c, 0xea, 0x85, 0x23, 0xc9, 0x6f,
+ 0x17, 0xb1, 0x5b, 0xfd, 0x92, 0x34, 0xde, 0x78 },
+ { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xec, 0xd8, 0x34, 0xb0, 0x5c, 0x68, 0x84,
+ 0x60, 0x8c, 0xb8, 0x54, 0xd0, 0x3c, 0x08, 0xe4 },
+ { 0x00, 0xa6, 0x4c, 0xea, 0x85, 0x23, 0xc9, 0x6f,
+ 0x17, 0xb1, 0x5b, 0xfd, 0x92, 0x34, 0xde, 0x78 },
+ { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+ 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xed, 0xda, 0x37, 0xb4, 0x59, 0x6e, 0x83,
+ 0x68, 0x85, 0xb2, 0x5f, 0xdc, 0x31, 0x06, 0xeb },
+ { 0x00, 0xa6, 0x4c, 0xea, 0x85, 0x23, 0xc9, 0x6f,
+ 0x0a, 0xac, 0x46, 0xe0, 0x8f, 0x29, 0xc3, 0x65 },
+ { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xee, 0xdc, 0x32, 0xb8, 0x56, 0x64, 0x8a,
+ 0x70, 0x9e, 0xac, 0x42, 0xc8, 0x26, 0x14, 0xfa },
+ { 0x00, 0xa6, 0x4c, 0xea, 0x85, 0x23, 0xc9, 0x6f,
+ 0x0a, 0xac, 0x46, 0xe0, 0x8f, 0x29, 0xc3, 0x65 },
+ { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+ 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xef, 0xde, 0x31, 0xbc, 0x53, 0x62, 0x8d,
+ 0x78, 0x97, 0xa6, 0x49, 0xc4, 0x2b, 0x1a, 0xf5 },
+ { 0x00, 0xbb, 0x6b, 0xd0, 0xd6, 0x6d, 0xbd, 0x06,
+ 0xb1, 0x0a, 0xda, 0x61, 0x67, 0xdc, 0x0c, 0xb7 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+ 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+ { 0x00, 0xbb, 0x6b, 0xd0, 0xd6, 0x6d, 0xbd, 0x06,
+ 0xb1, 0x0a, 0xda, 0x61, 0x67, 0xdc, 0x0c, 0xb7 },
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xf1, 0xe2, 0x13, 0xc4, 0x35, 0x26, 0xd7,
+ 0x88, 0x79, 0x6a, 0x9b, 0x4c, 0xbd, 0xae, 0x5f },
+ { 0x00, 0xbb, 0x6b, 0xd0, 0xd6, 0x6d, 0xbd, 0x06,
+ 0xac, 0x17, 0xc7, 0x7c, 0x7a, 0xc1, 0x11, 0xaa },
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xf2, 0xe4, 0x16, 0xc8, 0x3a, 0x2c, 0xde,
+ 0x90, 0x62, 0x74, 0x86, 0x58, 0xaa, 0xbc, 0x4e },
+ { 0x00, 0xbb, 0x6b, 0xd0, 0xd6, 0x6d, 0xbd, 0x06,
+ 0xac, 0x17, 0xc7, 0x7c, 0x7a, 0xc1, 0x11, 0xaa },
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+ 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xf3, 0xe6, 0x15, 0xcc, 0x3f, 0x2a, 0xd9,
+ 0x98, 0x6b, 0x7e, 0x8d, 0x54, 0xa7, 0xb2, 0x41 },
+ { 0x00, 0xbb, 0x6b, 0xd0, 0xcb, 0x70, 0xa0, 0x1b,
+ 0x8b, 0x30, 0xe0, 0x5b, 0x40, 0xfb, 0x2b, 0x90 },
+ { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xf4, 0xe8, 0x1c, 0xd0, 0x24, 0x38, 0xcc,
+ 0xa0, 0x54, 0x48, 0xbc, 0x70, 0x84, 0x98, 0x6c },
+ { 0x00, 0xbb, 0x6b, 0xd0, 0xcb, 0x70, 0xa0, 0x1b,
+ 0x8b, 0x30, 0xe0, 0x5b, 0x40, 0xfb, 0x2b, 0x90 },
+ { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+ 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xf5, 0xea, 0x1f, 0xd4, 0x21, 0x3e, 0xcb,
+ 0xa8, 0x5d, 0x42, 0xb7, 0x7c, 0x89, 0x96, 0x63 },
+ { 0x00, 0xbb, 0x6b, 0xd0, 0xcb, 0x70, 0xa0, 0x1b,
+ 0x96, 0x2d, 0xfd, 0x46, 0x5d, 0xe6, 0x36, 0x8d },
+ { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xf6, 0xec, 0x1a, 0xd8, 0x2e, 0x34, 0xc2,
+ 0xb0, 0x46, 0x5c, 0xaa, 0x68, 0x9e, 0x84, 0x72 },
+ { 0x00, 0xbb, 0x6b, 0xd0, 0xcb, 0x70, 0xa0, 0x1b,
+ 0x96, 0x2d, 0xfd, 0x46, 0x5d, 0xe6, 0x36, 0x8d },
+ { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+ 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xf7, 0xee, 0x19, 0xdc, 0x2b, 0x32, 0xc5,
+ 0xb8, 0x4f, 0x56, 0xa1, 0x64, 0x93, 0x8a, 0x7d },
+ { 0x00, 0xbb, 0x76, 0xcd, 0xec, 0x57, 0x9a, 0x21,
+ 0xc5, 0x7e, 0xb3, 0x08, 0x29, 0x92, 0x5f, 0xe4 },
+ { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xf8, 0xf0, 0x08, 0xe0, 0x18, 0x10, 0xe8,
+ 0xc0, 0x38, 0x30, 0xc8, 0x20, 0xd8, 0xd0, 0x28 },
+ { 0x00, 0xbb, 0x76, 0xcd, 0xec, 0x57, 0x9a, 0x21,
+ 0xc5, 0x7e, 0xb3, 0x08, 0x29, 0x92, 0x5f, 0xe4 },
+ { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+ 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xf9, 0xf2, 0x0b, 0xe4, 0x1d, 0x16, 0xef,
+ 0xc8, 0x31, 0x3a, 0xc3, 0x2c, 0xd5, 0xde, 0x27 },
+ { 0x00, 0xbb, 0x76, 0xcd, 0xec, 0x57, 0x9a, 0x21,
+ 0xd8, 0x63, 0xae, 0x15, 0x34, 0x8f, 0x42, 0xf9 },
+ { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xfa, 0xf4, 0x0e, 0xe8, 0x12, 0x1c, 0xe6,
+ 0xd0, 0x2a, 0x24, 0xde, 0x38, 0xc2, 0xcc, 0x36 },
+ { 0x00, 0xbb, 0x76, 0xcd, 0xec, 0x57, 0x9a, 0x21,
+ 0xd8, 0x63, 0xae, 0x15, 0x34, 0x8f, 0x42, 0xf9 },
+ { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+ 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xfb, 0xf6, 0x0d, 0xec, 0x17, 0x1a, 0xe1,
+ 0xd8, 0x23, 0x2e, 0xd5, 0x34, 0xcf, 0xc2, 0x39 },
+ { 0x00, 0xbb, 0x76, 0xcd, 0xf1, 0x4a, 0x87, 0x3c,
+ 0xff, 0x44, 0x89, 0x32, 0x0e, 0xb5, 0x78, 0xc3 },
+ { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xfc, 0xf8, 0x04, 0xf0, 0x0c, 0x08, 0xf4,
+ 0xe0, 0x1c, 0x18, 0xe4, 0x10, 0xec, 0xe8, 0x14 },
+ { 0x00, 0xbb, 0x76, 0xcd, 0xf1, 0x4a, 0x87, 0x3c,
+ 0xff, 0x44, 0x89, 0x32, 0x0e, 0xb5, 0x78, 0xc3 },
+ { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+ 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xfd, 0xfa, 0x07, 0xf4, 0x09, 0x0e, 0xf3,
+ 0xe8, 0x15, 0x12, 0xef, 0x1c, 0xe1, 0xe6, 0x1b },
+ { 0x00, 0xbb, 0x76, 0xcd, 0xf1, 0x4a, 0x87, 0x3c,
+ 0xe2, 0x59, 0x94, 0x2f, 0x13, 0xa8, 0x65, 0xde },
+ { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xfe, 0xfc, 0x02, 0xf8, 0x06, 0x04, 0xfa,
+ 0xf0, 0x0e, 0x0c, 0xf2, 0x08, 0xf6, 0xf4, 0x0a },
+ { 0x00, 0xbb, 0x76, 0xcd, 0xf1, 0x4a, 0x87, 0x3c,
+ 0xe2, 0x59, 0x94, 0x2f, 0x13, 0xa8, 0x65, 0xde },
+ { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+ 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xff, 0xfe, 0x01, 0xfc, 0x03, 0x02, 0xfd,
+ 0xf8, 0x07, 0x06, 0xf9, 0x04, 0xfb, 0xfa, 0x05 }
+};
+/* END CSTYLED */
+#endif /* defined(__aarch64__) */
diff --git a/sys/contrib/openzfs/module/zfs/vdev_raidz_math_aarch64_neon_common.h b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_aarch64_neon_common.h
new file mode 100644
index 000000000000..e46b2536546c
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_aarch64_neon_common.h
@@ -0,0 +1,684 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (C) 2016 Romain Dolbeau. All rights reserved.
+ */
+
+#include <sys/types.h>
+#include <sys/simd.h>
+
+#ifdef __linux__
+#define __asm __asm__ __volatile__
+#endif
+
+#define _REG_CNT(_0, _1, _2, _3, _4, _5, _6, _7, N, ...) N
+#define REG_CNT(r...) _REG_CNT(r, 8, 7, 6, 5, 4, 3, 2, 1)
+
+#define VR0_(REG, ...) "%[w"#REG"]"
+#define VR1_(_1, REG, ...) "%[w"#REG"]"
+#define VR2_(_1, _2, REG, ...) "%[w"#REG"]"
+#define VR3_(_1, _2, _3, REG, ...) "%[w"#REG"]"
+#define VR4_(_1, _2, _3, _4, REG, ...) "%[w"#REG"]"
+#define VR5_(_1, _2, _3, _4, _5, REG, ...) "%[w"#REG"]"
+#define VR6_(_1, _2, _3, _4, _5, _6, REG, ...) "%[w"#REG"]"
+#define VR7_(_1, _2, _3, _4, _5, _6, _7, REG, ...) "%[w"#REG"]"
+
+/*
+ * These macros need operand numbers that are not used anywhere else.
+ * They show up in asm branches that are never executed when fewer
+ * registers than the maximum are passed in, but GCC still checks every
+ * operand constraint, and duplicate operand names are not allowed, so
+ * each unused slot gets its own spare "register" number, which also
+ * serves as the operand name.
+ */
+
+#define VR0(r...) VR0_(r)
+#define VR1(r...) VR1_(r)
+#define VR2(r...) VR2_(r, 36)
+#define VR3(r...) VR3_(r, 36, 35)
+#define VR4(r...) VR4_(r, 36, 35, 34, 33)
+#define VR5(r...) VR5_(r, 36, 35, 34, 33, 32)
+#define VR6(r...) VR6_(r, 36, 35, 34, 33, 32, 31)
+#define VR7(r...) VR7_(r, 36, 35, 34, 33, 32, 31, 30)
+
+#define VR(X) "%[w"#X"]"
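+
+/*
+ * For example (register numbers chosen arbitrarily): with three live
+ * registers, VR2(22, 23, 24) picks its third argument and expands to
+ * "%[w24]", while VR3(22, 23, 24) runs out of real arguments, falls
+ * back on the spare number 36, and expands to "%[w36]", whose only job
+ * is to keep the asm operand list well formed.
+ */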
+
+#define RVR0_(REG, ...) [w##REG] "w" (w##REG)
+#define RVR1_(_1, REG, ...) [w##REG] "w" (w##REG)
+#define RVR2_(_1, _2, REG, ...) [w##REG] "w" (w##REG)
+#define RVR3_(_1, _2, _3, REG, ...) [w##REG] "w" (w##REG)
+#define RVR4_(_1, _2, _3, _4, REG, ...) [w##REG] "w" (w##REG)
+#define RVR5_(_1, _2, _3, _4, _5, REG, ...) [w##REG] "w" (w##REG)
+#define RVR6_(_1, _2, _3, _4, _5, _6, REG, ...) [w##REG] "w" (w##REG)
+#define RVR7_(_1, _2, _3, _4, _5, _6, _7, REG, ...) [w##REG] "w" (w##REG)
+
+#define RVR0(r...) RVR0_(r)
+#define RVR1(r...) RVR1_(r)
+#define RVR2(r...) RVR2_(r, 36)
+#define RVR3(r...) RVR3_(r, 36, 35)
+#define RVR4(r...) RVR4_(r, 36, 35, 34, 33)
+#define RVR5(r...) RVR5_(r, 36, 35, 34, 33, 32)
+#define RVR6(r...) RVR6_(r, 36, 35, 34, 33, 32, 31)
+#define RVR7(r...) RVR7_(r, 36, 35, 34, 33, 32, 31, 30)
+
+#define RVR(X) [w##X] "w" (w##X)
+
+#define WVR0_(REG, ...) [w##REG] "=w" (w##REG)
+#define WVR1_(_1, REG, ...) [w##REG] "=w" (w##REG)
+#define WVR2_(_1, _2, REG, ...) [w##REG] "=w" (w##REG)
+#define WVR3_(_1, _2, _3, REG, ...) [w##REG] "=w" (w##REG)
+#define WVR4_(_1, _2, _3, _4, REG, ...) [w##REG] "=w" (w##REG)
+#define WVR5_(_1, _2, _3, _4, _5, REG, ...) [w##REG] "=w" (w##REG)
+#define WVR6_(_1, _2, _3, _4, _5, _6, REG, ...) [w##REG] "=w" (w##REG)
+#define WVR7_(_1, _2, _3, _4, _5, _6, _7, REG, ...) [w##REG] "=w" (w##REG)
+
+#define WVR0(r...) WVR0_(r)
+#define WVR1(r...) WVR1_(r)
+#define WVR2(r...) WVR2_(r, 36)
+#define WVR3(r...) WVR3_(r, 36, 35)
+#define WVR4(r...) WVR4_(r, 36, 35, 34, 33)
+#define WVR5(r...) WVR5_(r, 36, 35, 34, 33, 32)
+#define WVR6(r...) WVR6_(r, 36, 35, 34, 33, 32, 31)
+#define WVR7(r...) WVR7_(r, 36, 35, 34, 33, 32, 31, 30)
+
+#define WVR(X) [w##X] "=w" (w##X)
+
+#define UVR0_(REG, ...) [w##REG] "+&w" (w##REG)
+#define UVR1_(_1, REG, ...) [w##REG] "+&w" (w##REG)
+#define UVR2_(_1, _2, REG, ...) [w##REG] "+&w" (w##REG)
+#define UVR3_(_1, _2, _3, REG, ...) [w##REG] "+&w" (w##REG)
+#define UVR4_(_1, _2, _3, _4, REG, ...) [w##REG] "+&w" (w##REG)
+#define UVR5_(_1, _2, _3, _4, _5, REG, ...) [w##REG] "+&w" (w##REG)
+#define UVR6_(_1, _2, _3, _4, _5, _6, REG, ...) [w##REG] "+&w" (w##REG)
+#define UVR7_(_1, _2, _3, _4, _5, _6, _7, REG, ...) [w##REG] "+&w" (w##REG)
+
+#define UVR0(r...) UVR0_(r)
+#define UVR1(r...) UVR1_(r)
+#define UVR2(r...) UVR2_(r, 36)
+#define UVR3(r...) UVR3_(r, 36, 35)
+#define UVR4(r...) UVR4_(r, 36, 35, 34, 33)
+#define UVR5(r...) UVR5_(r, 36, 35, 34, 33, 32)
+#define UVR6(r...) UVR6_(r, 36, 35, 34, 33, 32, 31)
+#define UVR7(r...) UVR7_(r, 36, 35, 34, 33, 32, 31, 30)
+
+#define UVR(X) [w##X] "+&w" (w##X)
+
+#define R_01(REG1, REG2, ...) REG1, REG2
+#define _R_23(_0, _1, REG2, REG3, ...) REG2, REG3
+#define R_23(REG...) _R_23(REG, 1, 2, 3)
+
+#define ZFS_ASM_BUG() ASSERT(0)
+
+#define	OFFSET(ptr, val)	(((unsigned char *)(ptr)) + (val))
+
+extern const uint8_t gf_clmul_mod_lt[4*256][16];
+
+#define ELEM_SIZE 16
+
+typedef struct v {
+ uint8_t b[ELEM_SIZE] __attribute__((aligned(ELEM_SIZE)));
+} v_t;
+
+#define XOR_ACC(src, r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 8: \
+ __asm( \
+ "ld1 { v21.4s },%[SRC0]\n" \
+ "ld1 { v20.4s },%[SRC1]\n" \
+ "ld1 { v19.4s },%[SRC2]\n" \
+ "ld1 { v18.4s },%[SRC3]\n" \
+ "eor " VR0(r) ".16b," VR0(r) ".16b,v21.16b\n" \
+ "eor " VR1(r) ".16b," VR1(r) ".16b,v20.16b\n" \
+ "eor " VR2(r) ".16b," VR2(r) ".16b,v19.16b\n" \
+ "eor " VR3(r) ".16b," VR3(r) ".16b,v18.16b\n" \
+ "ld1 { v21.4s },%[SRC4]\n" \
+ "ld1 { v20.4s },%[SRC5]\n" \
+ "ld1 { v19.4s },%[SRC6]\n" \
+ "ld1 { v18.4s },%[SRC7]\n" \
+ "eor " VR4(r) ".16b," VR4(r) ".16b,v21.16b\n" \
+ "eor " VR5(r) ".16b," VR5(r) ".16b,v20.16b\n" \
+ "eor " VR6(r) ".16b," VR6(r) ".16b,v19.16b\n" \
+ "eor " VR7(r) ".16b," VR7(r) ".16b,v18.16b\n" \
+ : UVR0(r), UVR1(r), UVR2(r), UVR3(r), \
+ UVR4(r), UVR5(r), UVR6(r), UVR7(r) \
+ : [SRC0] "Q" (*(OFFSET(src, 0))), \
+ [SRC1] "Q" (*(OFFSET(src, 16))), \
+ [SRC2] "Q" (*(OFFSET(src, 32))), \
+ [SRC3] "Q" (*(OFFSET(src, 48))), \
+ [SRC4] "Q" (*(OFFSET(src, 64))), \
+ [SRC5] "Q" (*(OFFSET(src, 80))), \
+ [SRC6] "Q" (*(OFFSET(src, 96))), \
+ [SRC7] "Q" (*(OFFSET(src, 112))) \
+ : "v18", "v19", "v20", "v21"); \
+ break; \
+ case 4: \
+ __asm( \
+ "ld1 { v21.4s },%[SRC0]\n" \
+ "ld1 { v20.4s },%[SRC1]\n" \
+ "ld1 { v19.4s },%[SRC2]\n" \
+ "ld1 { v18.4s },%[SRC3]\n" \
+ "eor " VR0(r) ".16b," VR0(r) ".16b,v21.16b\n" \
+ "eor " VR1(r) ".16b," VR1(r) ".16b,v20.16b\n" \
+ "eor " VR2(r) ".16b," VR2(r) ".16b,v19.16b\n" \
+ "eor " VR3(r) ".16b," VR3(r) ".16b,v18.16b\n" \
+ : UVR0(r), UVR1(r), UVR2(r), UVR3(r) \
+ : [SRC0] "Q" (*(OFFSET(src, 0))), \
+ [SRC1] "Q" (*(OFFSET(src, 16))), \
+ [SRC2] "Q" (*(OFFSET(src, 32))), \
+ [SRC3] "Q" (*(OFFSET(src, 48))) \
+ : "v18", "v19", "v20", "v21"); \
+ break; \
+ case 2: \
+ __asm( \
+ "ld1 { v21.4s },%[SRC0]\n" \
+ "ld1 { v20.4s },%[SRC1]\n" \
+ "eor " VR0(r) ".16b," VR0(r) ".16b,v21.16b\n" \
+ "eor " VR1(r) ".16b," VR1(r) ".16b,v20.16b\n" \
+ : UVR0(r), UVR1(r) \
+ : [SRC0] "Q" (*(OFFSET(src, 0))), \
+ [SRC1] "Q" (*(OFFSET(src, 16))) \
+ : "v20", "v21"); \
+ break; \
+ default: \
+ ZFS_ASM_BUG(); \
+ } \
+}
+
+#define XOR(r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 8: \
+ __asm( \
+ "eor " VR4(r) ".16b," VR4(r) ".16b," VR0(r) ".16b\n" \
+ "eor " VR5(r) ".16b," VR5(r) ".16b," VR1(r) ".16b\n" \
+ "eor " VR6(r) ".16b," VR6(r) ".16b," VR2(r) ".16b\n" \
+ "eor " VR7(r) ".16b," VR7(r) ".16b," VR3(r) ".16b\n" \
+ : UVR4(r), UVR5(r), UVR6(r), UVR7(r) \
+ : RVR0(r), RVR1(r), RVR2(r), RVR3(r)); \
+ break; \
+ case 4: \
+ __asm( \
+ "eor " VR2(r) ".16b," VR2(r) ".16b," VR0(r) ".16b\n" \
+ "eor " VR3(r) ".16b," VR3(r) ".16b," VR1(r) ".16b\n" \
+ : UVR2(r), UVR3(r) \
+ : RVR0(r), RVR1(r)); \
+ break; \
+ default: \
+ ZFS_ASM_BUG(); \
+ } \
+}
+
+#define ZERO(r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 8: \
+ __asm( \
+ "eor " VR0(r) ".16b," VR0(r) ".16b," VR0(r) ".16b\n" \
+ "eor " VR1(r) ".16b," VR1(r) ".16b," VR1(r) ".16b\n" \
+ "eor " VR2(r) ".16b," VR2(r) ".16b," VR2(r) ".16b\n" \
+ "eor " VR3(r) ".16b," VR3(r) ".16b," VR3(r) ".16b\n" \
+ "eor " VR4(r) ".16b," VR4(r) ".16b," VR4(r) ".16b\n" \
+ "eor " VR5(r) ".16b," VR5(r) ".16b," VR5(r) ".16b\n" \
+ "eor " VR6(r) ".16b," VR6(r) ".16b," VR6(r) ".16b\n" \
+ "eor " VR7(r) ".16b," VR7(r) ".16b," VR7(r) ".16b\n" \
+ : WVR0(r), WVR1(r), WVR2(r), WVR3(r), \
+ WVR4(r), WVR5(r), WVR6(r), WVR7(r)); \
+ break; \
+ case 4: \
+ __asm( \
+ "eor " VR0(r) ".16b," VR0(r) ".16b," VR0(r) ".16b\n" \
+ "eor " VR1(r) ".16b," VR1(r) ".16b," VR1(r) ".16b\n" \
+ "eor " VR2(r) ".16b," VR2(r) ".16b," VR2(r) ".16b\n" \
+ "eor " VR3(r) ".16b," VR3(r) ".16b," VR3(r) ".16b\n" \
+ : WVR0(r), WVR1(r), WVR2(r), WVR3(r)); \
+ break; \
+ case 2: \
+ __asm( \
+ "eor " VR0(r) ".16b," VR0(r) ".16b," VR0(r) ".16b\n" \
+ "eor " VR1(r) ".16b," VR1(r) ".16b," VR1(r) ".16b\n" \
+ : WVR0(r), WVR1(r)); \
+ break; \
+ default: \
+ ZFS_ASM_BUG(); \
+ } \
+}
+
+#define COPY(r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 8: \
+ __asm( \
+ "mov " VR4(r) ".16b," VR0(r) ".16b\n" \
+ "mov " VR5(r) ".16b," VR1(r) ".16b\n" \
+ "mov " VR6(r) ".16b," VR2(r) ".16b\n" \
+ "mov " VR7(r) ".16b," VR3(r) ".16b\n" \
+ : WVR4(r), WVR5(r), WVR6(r), WVR7(r) \
+ : RVR0(r), RVR1(r), RVR2(r), RVR3(r)); \
+ break; \
+ case 4: \
+ __asm( \
+ "mov " VR2(r) ".16b," VR0(r) ".16b\n" \
+ "mov " VR3(r) ".16b," VR1(r) ".16b\n" \
+ : WVR2(r), WVR3(r) \
+ : RVR0(r), RVR1(r)); \
+ break; \
+ default: \
+ ZFS_ASM_BUG(); \
+ } \
+}
+
+#define LOAD(src, r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 8: \
+ __asm( \
+ "ld1 { " VR0(r) ".4s },%[SRC0]\n" \
+ "ld1 { " VR1(r) ".4s },%[SRC1]\n" \
+ "ld1 { " VR2(r) ".4s },%[SRC2]\n" \
+ "ld1 { " VR3(r) ".4s },%[SRC3]\n" \
+ "ld1 { " VR4(r) ".4s },%[SRC4]\n" \
+ "ld1 { " VR5(r) ".4s },%[SRC5]\n" \
+ "ld1 { " VR6(r) ".4s },%[SRC6]\n" \
+ "ld1 { " VR7(r) ".4s },%[SRC7]\n" \
+ : WVR0(r), WVR1(r), WVR2(r), WVR3(r), \
+ WVR4(r), WVR5(r), WVR6(r), WVR7(r) \
+ : [SRC0] "Q" (*(OFFSET(src, 0))), \
+ [SRC1] "Q" (*(OFFSET(src, 16))), \
+ [SRC2] "Q" (*(OFFSET(src, 32))), \
+ [SRC3] "Q" (*(OFFSET(src, 48))), \
+ [SRC4] "Q" (*(OFFSET(src, 64))), \
+ [SRC5] "Q" (*(OFFSET(src, 80))), \
+ [SRC6] "Q" (*(OFFSET(src, 96))), \
+ [SRC7] "Q" (*(OFFSET(src, 112)))); \
+ break; \
+ case 4: \
+ __asm( \
+ "ld1 { " VR0(r) ".4s },%[SRC0]\n" \
+ "ld1 { " VR1(r) ".4s },%[SRC1]\n" \
+ "ld1 { " VR2(r) ".4s },%[SRC2]\n" \
+ "ld1 { " VR3(r) ".4s },%[SRC3]\n" \
+ : WVR0(r), WVR1(r), WVR2(r), WVR3(r) \
+ : [SRC0] "Q" (*(OFFSET(src, 0))), \
+ [SRC1] "Q" (*(OFFSET(src, 16))), \
+ [SRC2] "Q" (*(OFFSET(src, 32))), \
+ [SRC3] "Q" (*(OFFSET(src, 48)))); \
+ break; \
+ case 2: \
+ __asm( \
+ "ld1 { " VR0(r) ".4s },%[SRC0]\n" \
+ "ld1 { " VR1(r) ".4s },%[SRC1]\n" \
+ : WVR0(r), WVR1(r) \
+ : [SRC0] "Q" (*(OFFSET(src, 0))), \
+ [SRC1] "Q" (*(OFFSET(src, 16)))); \
+ break; \
+ default: \
+ ZFS_ASM_BUG(); \
+ } \
+}
+
+#define STORE(dst, r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 8: \
+ __asm( \
+ "st1 { " VR0(r) ".4s },%[DST0]\n" \
+ "st1 { " VR1(r) ".4s },%[DST1]\n" \
+ "st1 { " VR2(r) ".4s },%[DST2]\n" \
+ "st1 { " VR3(r) ".4s },%[DST3]\n" \
+ "st1 { " VR4(r) ".4s },%[DST4]\n" \
+ "st1 { " VR5(r) ".4s },%[DST5]\n" \
+ "st1 { " VR6(r) ".4s },%[DST6]\n" \
+ "st1 { " VR7(r) ".4s },%[DST7]\n" \
+ : [DST0] "=Q" (*(OFFSET(dst, 0))), \
+ [DST1] "=Q" (*(OFFSET(dst, 16))), \
+ [DST2] "=Q" (*(OFFSET(dst, 32))), \
+ [DST3] "=Q" (*(OFFSET(dst, 48))), \
+ [DST4] "=Q" (*(OFFSET(dst, 64))), \
+ [DST5] "=Q" (*(OFFSET(dst, 80))), \
+ [DST6] "=Q" (*(OFFSET(dst, 96))), \
+ [DST7] "=Q" (*(OFFSET(dst, 112))) \
+ : RVR0(r), RVR1(r), RVR2(r), RVR3(r), \
+ RVR4(r), RVR5(r), RVR6(r), RVR7(r)); \
+ break; \
+ case 4: \
+ __asm( \
+ "st1 { " VR0(r) ".4s },%[DST0]\n" \
+ "st1 { " VR1(r) ".4s },%[DST1]\n" \
+ "st1 { " VR2(r) ".4s },%[DST2]\n" \
+ "st1 { " VR3(r) ".4s },%[DST3]\n" \
+ : [DST0] "=Q" (*(OFFSET(dst, 0))), \
+ [DST1] "=Q" (*(OFFSET(dst, 16))), \
+ [DST2] "=Q" (*(OFFSET(dst, 32))), \
+ [DST3] "=Q" (*(OFFSET(dst, 48))) \
+ : RVR0(r), RVR1(r), RVR2(r), RVR3(r)); \
+ break; \
+ case 2: \
+ __asm( \
+ "st1 { " VR0(r) ".4s },%[DST0]\n" \
+ "st1 { " VR1(r) ".4s },%[DST1]\n" \
+ : [DST0] "=Q" (*(OFFSET(dst, 0))), \
+ [DST1] "=Q" (*(OFFSET(dst, 16))) \
+ : RVR0(r), RVR1(r)); \
+ break; \
+ default: \
+ ZFS_ASM_BUG(); \
+ } \
+}
+
+/*
+ * These defines cannot be used in the asm templates below, because GCC
+ * would substitute the macro name instead of its value when resolving
+ * the operands later on.  They are kept only as a reference for what
+ * each numbered variable holds (v17 is the zero constant, v16 the 0x1d
+ * reduction constant, v18/v19 are temporaries).
+ */
+#define _00 "v17"
+#define _1d "v16"
+#define _temp0 "v19"
+#define _temp1 "v18"
+
+#define MUL2_SETUP() \
+{ \
+ __asm( \
+ "eor " VR(17) ".16b," VR(17) ".16b," VR(17) ".16b\n" \
+ "movi " VR(16) ".16b,#0x1d\n" \
+ : WVR(16), WVR(17)); \
+}
+
+#define MUL2(r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 4: \
+ __asm( \
+ "cmgt v19.16b," VR(17) ".16b," VR0(r) ".16b\n" \
+ "cmgt v18.16b," VR(17) ".16b," VR1(r) ".16b\n" \
+ "cmgt v21.16b," VR(17) ".16b," VR2(r) ".16b\n" \
+ "cmgt v20.16b," VR(17) ".16b," VR3(r) ".16b\n" \
+ "and v19.16b,v19.16b," VR(16) ".16b\n" \
+ "and v18.16b,v18.16b," VR(16) ".16b\n" \
+ "and v21.16b,v21.16b," VR(16) ".16b\n" \
+ "and v20.16b,v20.16b," VR(16) ".16b\n" \
+ "shl " VR0(r) ".16b," VR0(r) ".16b,#1\n" \
+ "shl " VR1(r) ".16b," VR1(r) ".16b,#1\n" \
+ "shl " VR2(r) ".16b," VR2(r) ".16b,#1\n" \
+ "shl " VR3(r) ".16b," VR3(r) ".16b,#1\n" \
+ "eor " VR0(r) ".16b,v19.16b," VR0(r) ".16b\n" \
+ "eor " VR1(r) ".16b,v18.16b," VR1(r) ".16b\n" \
+ "eor " VR2(r) ".16b,v21.16b," VR2(r) ".16b\n" \
+ "eor " VR3(r) ".16b,v20.16b," VR3(r) ".16b\n" \
+ : UVR0(r), UVR1(r), UVR2(r), UVR3(r) \
+ : RVR(17), RVR(16) \
+ : "v18", "v19", "v20", "v21"); \
+ break; \
+ case 2: \
+ __asm( \
+ "cmgt v19.16b," VR(17) ".16b," VR0(r) ".16b\n" \
+ "cmgt v18.16b," VR(17) ".16b," VR1(r) ".16b\n" \
+ "and v19.16b,v19.16b," VR(16) ".16b\n" \
+ "and v18.16b,v18.16b," VR(16) ".16b\n" \
+ "shl " VR0(r) ".16b," VR0(r) ".16b,#1\n" \
+ "shl " VR1(r) ".16b," VR1(r) ".16b,#1\n" \
+ "eor " VR0(r) ".16b,v19.16b," VR0(r) ".16b\n" \
+ "eor " VR1(r) ".16b,v18.16b," VR1(r) ".16b\n" \
+ : UVR0(r), UVR1(r) \
+ : RVR(17), RVR(16) \
+ : "v18", "v19"); \
+ break; \
+ default: \
+ ZFS_ASM_BUG(); \
+ } \
+}
+
+#define MUL4(r...) \
+{ \
+ MUL2(r); \
+ MUL2(r); \
+}
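+
+/*
+ * A minimal scalar sketch of what one byte lane of MUL2() computes,
+ * assuming the RAID-Z GF(2^8) representation whose reduction constant
+ * 0x1d is loaded by MUL2_SETUP() above: shift left by one and, if the
+ * top bit was set, fold the overflow back in by xoring 0x1d.  MUL4()
+ * is simply this doubling applied twice.  The helper name below is
+ * illustrative only and not part of the original interface.
+ */
+static inline uint8_t
+gf_mul2_scalar(uint8_t a)
+{
+	return ((uint8_t)(a << 1) ^ ((a & 0x80) ? 0x1d : 0x00));
+}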
+
+/*
+ * As above, these defines cannot be used in the asm templates, because
+ * GCC would substitute the macro name instead of its value later on.
+ * They are kept only as a reference for what each register holds in
+ * _MULx2() (here the temporaries are actual clobbered hardware
+ * registers rather than numbered variables).
+ */
+#define _0f "v15"
+#define _a_save "v14"
+#define _b_save "v13"
+#define _lt_mod_a "v12"
+#define _lt_clmul_a "v11"
+#define _lt_mod_b "v10"
+#define _lt_clmul_b "v15"
+
+#define _MULx2(c, r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 2: \
+ __asm( \
+ /* lts for upper part */ \
+ "movi v15.16b,#0x0f\n" \
+ "ld1 { v10.4s },%[lt0]\n" \
+ "ld1 { v11.4s },%[lt1]\n" \
+ /* upper part */ \
+ "and v14.16b," VR0(r) ".16b,v15.16b\n" \
+ "and v13.16b," VR1(r) ".16b,v15.16b\n" \
+ "ushr " VR0(r) ".16b," VR0(r) ".16b,#4\n" \
+ "ushr " VR1(r) ".16b," VR1(r) ".16b,#4\n" \
+ \
+ "tbl v12.16b,{v10.16b}," VR0(r) ".16b\n" \
+ "tbl v10.16b,{v10.16b}," VR1(r) ".16b\n" \
+ "tbl v15.16b,{v11.16b}," VR0(r) ".16b\n" \
+ "tbl v11.16b,{v11.16b}," VR1(r) ".16b\n" \
+ \
+ "eor " VR0(r) ".16b,v15.16b,v12.16b\n" \
+ "eor " VR1(r) ".16b,v11.16b,v10.16b\n" \
+ /* lts for lower part */ \
+ "ld1 { v10.4s },%[lt2]\n" \
+ "ld1 { v15.4s },%[lt3]\n" \
+ /* lower part */ \
+ "tbl v12.16b,{v10.16b},v14.16b\n" \
+ "tbl v10.16b,{v10.16b},v13.16b\n" \
+ "tbl v11.16b,{v15.16b},v14.16b\n" \
+ "tbl v15.16b,{v15.16b},v13.16b\n" \
+ \
+ "eor " VR0(r) ".16b," VR0(r) ".16b,v12.16b\n" \
+ "eor " VR1(r) ".16b," VR1(r) ".16b,v10.16b\n" \
+ "eor " VR0(r) ".16b," VR0(r) ".16b,v11.16b\n" \
+ "eor " VR1(r) ".16b," VR1(r) ".16b,v15.16b\n" \
+ : UVR0(r), UVR1(r) \
+ : [lt0] "Q" ((gf_clmul_mod_lt[4*(c)+0][0])), \
+ [lt1] "Q" ((gf_clmul_mod_lt[4*(c)+1][0])), \
+ [lt2] "Q" ((gf_clmul_mod_lt[4*(c)+2][0])), \
+ [lt3] "Q" ((gf_clmul_mod_lt[4*(c)+3][0])) \
+ : "v10", "v11", "v12", "v13", "v14", "v15"); \
+ break; \
+ default: \
+ ZFS_ASM_BUG(); \
+ } \
+}
+
+#define MUL(c, r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 4: \
+ _MULx2(c, R_23(r)); \
+ _MULx2(c, R_01(r)); \
+ break; \
+ case 2: \
+ _MULx2(c, R_01(r)); \
+ break; \
+ default: \
+ ZFS_ASM_BUG(); \
+ } \
+}
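+
+/*
+ * A scalar sketch of the per-byte lookup scheme that _MULx2() above
+ * implements with tbl: multiplication by the constant c is computed as
+ * the xor of four 16-entry table lookups, two indexed by the high
+ * nibble (gf_clmul_mod_lt[4 * c + 0] and [4 * c + 1]) and two by the
+ * low nibble ([4 * c + 2] and [4 * c + 3]).  The table contents are
+ * generated elsewhere; the helper below only mirrors the
+ * lookup-and-xor structure, and its name is illustrative only,
+ * e.g. gf_mul_lookup_scalar(&gf_clmul_mod_lt[4 * c], a).
+ */
+static inline uint8_t
+gf_mul_lookup_scalar(const uint8_t lt[4][16], uint8_t a)
+{
+	return (lt[0][a >> 4] ^ lt[1][a >> 4] ^
+	    lt[2][a & 0x0f] ^ lt[3][a & 0x0f]);
+}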
+
+#define raidz_math_begin() kfpu_begin()
+#define raidz_math_end() kfpu_end()
+
+/* Overkill... */
+#if defined(_KERNEL)
+#define GEN_X_DEFINE_0_3() \
+register unsigned char w0 asm("v0") __attribute__((vector_size(16))); \
+register unsigned char w1 asm("v1") __attribute__((vector_size(16))); \
+register unsigned char w2 asm("v2") __attribute__((vector_size(16))); \
+register unsigned char w3 asm("v3") __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_4_5() \
+register unsigned char w4 asm("v4") __attribute__((vector_size(16))); \
+register unsigned char w5 asm("v5") __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_6_7() \
+register unsigned char w6 asm("v6") __attribute__((vector_size(16))); \
+register unsigned char w7 asm("v7") __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_8_9() \
+register unsigned char w8 asm("v8") __attribute__((vector_size(16))); \
+register unsigned char w9 asm("v9") __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_10_11() \
+register unsigned char w10 asm("v10") __attribute__((vector_size(16))); \
+register unsigned char w11 asm("v11") __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_12_15() \
+register unsigned char w12 asm("v12") __attribute__((vector_size(16))); \
+register unsigned char w13 asm("v13") __attribute__((vector_size(16))); \
+register unsigned char w14 asm("v14") __attribute__((vector_size(16))); \
+register unsigned char w15 asm("v15") __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_16() \
+register unsigned char w16 asm("v16") __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_17() \
+register unsigned char w17 asm("v17") __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_18_21() \
+register unsigned char w18 asm("v18") __attribute__((vector_size(16))); \
+register unsigned char w19 asm("v19") __attribute__((vector_size(16))); \
+register unsigned char w20 asm("v20") __attribute__((vector_size(16))); \
+register unsigned char w21 asm("v21") __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_22_23() \
+register unsigned char w22 asm("v22") __attribute__((vector_size(16))); \
+register unsigned char w23 asm("v23") __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_24_27() \
+register unsigned char w24 asm("v24") __attribute__((vector_size(16))); \
+register unsigned char w25 asm("v25") __attribute__((vector_size(16))); \
+register unsigned char w26 asm("v26") __attribute__((vector_size(16))); \
+register unsigned char w27 asm("v27") __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_28_30() \
+register unsigned char w28 asm("v28") __attribute__((vector_size(16))); \
+register unsigned char w29 asm("v29") __attribute__((vector_size(16))); \
+register unsigned char w30 asm("v30") __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_31() \
+register unsigned char w31 asm("v31") __attribute__((vector_size(16)));
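+/*
+ * w32 through w38 below are all pinned to v31; as the comment above the
+ * VR*() macros explains, they only fill asm operand slots that are
+ * never actually referenced, so sharing one hardware register is
+ * harmless.
+ */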
+#define GEN_X_DEFINE_32() \
+register unsigned char w32 asm("v31") __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_33_36() \
+register unsigned char w33 asm("v31") __attribute__((vector_size(16))); \
+register unsigned char w34 asm("v31") __attribute__((vector_size(16))); \
+register unsigned char w35 asm("v31") __attribute__((vector_size(16))); \
+register unsigned char w36 asm("v31") __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_37_38() \
+register unsigned char w37 asm("v31") __attribute__((vector_size(16))); \
+register unsigned char w38 asm("v31") __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_ALL() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_4_5() \
+ GEN_X_DEFINE_6_7() \
+ GEN_X_DEFINE_8_9() \
+ GEN_X_DEFINE_10_11() \
+ GEN_X_DEFINE_12_15() \
+ GEN_X_DEFINE_16() \
+ GEN_X_DEFINE_17() \
+ GEN_X_DEFINE_18_21() \
+ GEN_X_DEFINE_22_23() \
+ GEN_X_DEFINE_24_27() \
+ GEN_X_DEFINE_28_30() \
+ GEN_X_DEFINE_31() \
+ GEN_X_DEFINE_32() \
+ GEN_X_DEFINE_33_36() \
+ GEN_X_DEFINE_37_38()
+#else
+#define GEN_X_DEFINE_0_3() \
+ unsigned char w0 __attribute__((vector_size(16))); \
+ unsigned char w1 __attribute__((vector_size(16))); \
+ unsigned char w2 __attribute__((vector_size(16))); \
+ unsigned char w3 __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_4_5() \
+ unsigned char w4 __attribute__((vector_size(16))); \
+ unsigned char w5 __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_6_7() \
+ unsigned char w6 __attribute__((vector_size(16))); \
+ unsigned char w7 __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_8_9() \
+ unsigned char w8 __attribute__((vector_size(16))); \
+ unsigned char w9 __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_10_11() \
+ unsigned char w10 __attribute__((vector_size(16))); \
+ unsigned char w11 __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_12_15() \
+ unsigned char w12 __attribute__((vector_size(16))); \
+ unsigned char w13 __attribute__((vector_size(16))); \
+ unsigned char w14 __attribute__((vector_size(16))); \
+ unsigned char w15 __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_16() \
+ unsigned char w16 __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_17() \
+ unsigned char w17 __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_18_21() \
+ unsigned char w18 __attribute__((vector_size(16))); \
+ unsigned char w19 __attribute__((vector_size(16))); \
+ unsigned char w20 __attribute__((vector_size(16))); \
+ unsigned char w21 __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_22_23() \
+ unsigned char w22 __attribute__((vector_size(16))); \
+ unsigned char w23 __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_24_27() \
+ unsigned char w24 __attribute__((vector_size(16))); \
+ unsigned char w25 __attribute__((vector_size(16))); \
+ unsigned char w26 __attribute__((vector_size(16))); \
+ unsigned char w27 __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_28_30() \
+ unsigned char w28 __attribute__((vector_size(16))); \
+ unsigned char w29 __attribute__((vector_size(16))); \
+ unsigned char w30 __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_31() \
+ unsigned char w31 __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_32() \
+ unsigned char w32 __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_33_36() \
+ unsigned char w33 __attribute__((vector_size(16))); \
+ unsigned char w34 __attribute__((vector_size(16))); \
+ unsigned char w35 __attribute__((vector_size(16))); \
+ unsigned char w36 __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_37_38() \
+ unsigned char w37 __attribute__((vector_size(16))); \
+ unsigned char w38 __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_ALL() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_4_5() \
+ GEN_X_DEFINE_6_7() \
+ GEN_X_DEFINE_8_9() \
+ GEN_X_DEFINE_10_11() \
+ GEN_X_DEFINE_12_15() \
+ GEN_X_DEFINE_16() \
+ GEN_X_DEFINE_17() \
+ GEN_X_DEFINE_18_21() \
+ GEN_X_DEFINE_22_23() \
+ GEN_X_DEFINE_24_27() \
+ GEN_X_DEFINE_28_30() \
+ GEN_X_DEFINE_31() \
+ GEN_X_DEFINE_32() \
+ GEN_X_DEFINE_33_36() \
+ GEN_X_DEFINE_37_38()
+#endif
diff --git a/sys/contrib/openzfs/module/zfs/vdev_raidz_math_aarch64_neonx2.c b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_aarch64_neonx2.c
new file mode 100644
index 000000000000..e072f51cd635
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_aarch64_neonx2.c
@@ -0,0 +1,232 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (C) 2016 Romain Dolbeau. All rights reserved.
+ */
+
+#include <sys/isa_defs.h>
+
+#if defined(__aarch64__)
+
+#include "vdev_raidz_math_aarch64_neon_common.h"
+
+#define SYN_STRIDE 4
+
+#define ZERO_STRIDE 8
+#define ZERO_DEFINE() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_4_5() \
+ GEN_X_DEFINE_6_7()
+#define ZERO_D 0, 1, 2, 3, 4, 5, 6, 7
+
+#define COPY_STRIDE 8
+#define COPY_DEFINE() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_4_5() \
+ GEN_X_DEFINE_6_7()
+#define COPY_D 0, 1, 2, 3, 4, 5, 6, 7
+
+#define ADD_STRIDE 8
+#define ADD_DEFINE() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_4_5() \
+ GEN_X_DEFINE_6_7()
+#define ADD_D 0, 1, 2, 3, 4, 5, 6, 7
+
+#define MUL_STRIDE 4
+#define MUL_DEFINE() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_33_36()
+#define MUL_D 0, 1, 2, 3
+
+#define GEN_P_DEFINE() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_33_36()
+#define GEN_P_STRIDE 4
+#define GEN_P_P 0, 1, 2, 3
+
+#define GEN_PQ_DEFINE() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_4_5() \
+ GEN_X_DEFINE_6_7() \
+ GEN_X_DEFINE_16() \
+ GEN_X_DEFINE_17() \
+ GEN_X_DEFINE_33_36()
+#define GEN_PQ_STRIDE 4
+#define GEN_PQ_D 0, 1, 2, 3
+#define GEN_PQ_C 4, 5, 6, 7
+
+#define GEN_PQR_DEFINE() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_4_5() \
+ GEN_X_DEFINE_6_7() \
+ GEN_X_DEFINE_16() \
+ GEN_X_DEFINE_17() \
+ GEN_X_DEFINE_33_36()
+#define GEN_PQR_STRIDE 4
+#define GEN_PQR_D 0, 1, 2, 3
+#define GEN_PQR_C 4, 5, 6, 7
+
+#define SYN_Q_DEFINE() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_4_5() \
+ GEN_X_DEFINE_6_7() \
+ GEN_X_DEFINE_16() \
+ GEN_X_DEFINE_17() \
+ GEN_X_DEFINE_33_36()
+#define SYN_Q_STRIDE 4
+#define SYN_Q_D 0, 1, 2, 3
+#define SYN_Q_X 4, 5, 6, 7
+
+#define SYN_R_DEFINE() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_4_5() \
+ GEN_X_DEFINE_6_7() \
+ GEN_X_DEFINE_16() \
+ GEN_X_DEFINE_17() \
+ GEN_X_DEFINE_33_36()
+#define SYN_R_STRIDE 4
+#define SYN_R_D 0, 1, 2, 3
+#define SYN_R_X 4, 5, 6, 7
+
+#define SYN_PQ_DEFINE() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_4_5() \
+ GEN_X_DEFINE_6_7() \
+ GEN_X_DEFINE_16() \
+ GEN_X_DEFINE_17() \
+ GEN_X_DEFINE_33_36()
+#define SYN_PQ_STRIDE 4
+#define SYN_PQ_D 0, 1, 2, 3
+#define SYN_PQ_X 4, 5, 6, 7
+
+#define REC_PQ_DEFINE() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_4_5() \
+ GEN_X_DEFINE_6_7() \
+ GEN_X_DEFINE_8_9() \
+ GEN_X_DEFINE_22_23() \
+ GEN_X_DEFINE_33_36()
+#define REC_PQ_STRIDE 4
+#define REC_PQ_X 0, 1, 2, 3
+#define REC_PQ_Y 4, 5, 6, 7
+#define REC_PQ_T 8, 9, 22, 23
+
+#define SYN_PR_DEFINE() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_4_5() \
+ GEN_X_DEFINE_6_7() \
+ GEN_X_DEFINE_16() \
+ GEN_X_DEFINE_17() \
+ GEN_X_DEFINE_33_36()
+#define SYN_PR_STRIDE 4
+#define SYN_PR_D 0, 1, 2, 3
+#define SYN_PR_X 4, 5, 6, 7
+
+#define REC_PR_DEFINE() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_4_5() \
+ GEN_X_DEFINE_6_7() \
+ GEN_X_DEFINE_8_9() \
+ GEN_X_DEFINE_22_23() \
+ GEN_X_DEFINE_33_36()
+#define REC_PR_STRIDE 4
+#define REC_PR_X 0, 1, 2, 3
+#define REC_PR_Y 4, 5, 6, 7
+#define REC_PR_T 8, 9, 22, 23
+
+#define SYN_QR_DEFINE() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_4_5() \
+ GEN_X_DEFINE_6_7() \
+ GEN_X_DEFINE_16() \
+ GEN_X_DEFINE_17() \
+ GEN_X_DEFINE_33_36()
+#define SYN_QR_STRIDE 4
+#define SYN_QR_D 0, 1, 2, 3
+#define SYN_QR_X 4, 5, 6, 7
+
+#define REC_QR_DEFINE() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_4_5() \
+ GEN_X_DEFINE_6_7() \
+ GEN_X_DEFINE_8_9() \
+ GEN_X_DEFINE_22_23() \
+ GEN_X_DEFINE_33_36()
+#define REC_QR_STRIDE 4
+#define REC_QR_X 0, 1, 2, 3
+#define REC_QR_Y 4, 5, 6, 7
+#define REC_QR_T 8, 9, 22, 23
+
+#define SYN_PQR_DEFINE() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_4_5() \
+ GEN_X_DEFINE_6_7() \
+ GEN_X_DEFINE_16() \
+ GEN_X_DEFINE_17() \
+ GEN_X_DEFINE_33_36()
+#define SYN_PQR_STRIDE 4
+#define SYN_PQR_D 0, 1, 2, 3
+#define SYN_PQR_X 4, 5, 6, 7
+
+#define REC_PQR_DEFINE() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_4_5() \
+ GEN_X_DEFINE_6_7() \
+ GEN_X_DEFINE_8_9() \
+ GEN_X_DEFINE_31() \
+ GEN_X_DEFINE_32() \
+ GEN_X_DEFINE_33_36()
+#define REC_PQR_STRIDE 2
+#define REC_PQR_X 0, 1
+#define REC_PQR_Y 2, 3
+#define REC_PQR_Z 4, 5
+#define REC_PQR_XS 6, 7
+#define REC_PQR_YS 8, 9
+
+#include <sys/vdev_raidz_impl.h>
+#include "vdev_raidz_math_impl.h"
+
+DEFINE_GEN_METHODS(aarch64_neonx2);
+/*
+ * If compiled with -O0, gcc doesn't do any stack frame coalescing
+ * and -Wframe-larger-than=1024 is triggered in debug mode.
+ */
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wframe-larger-than="
+DEFINE_REC_METHODS(aarch64_neonx2);
+#pragma GCC diagnostic pop
+
+static boolean_t
+raidz_will_aarch64_neonx2_work(void)
+{
+ return (kfpu_allowed());
+}
+
+const raidz_impl_ops_t vdev_raidz_aarch64_neonx2_impl = {
+ .init = NULL,
+ .fini = NULL,
+ .gen = RAIDZ_GEN_METHODS(aarch64_neonx2),
+ .rec = RAIDZ_REC_METHODS(aarch64_neonx2),
+ .is_supported = &raidz_will_aarch64_neonx2_work,
+ .name = "aarch64_neonx2"
+};
+
+#endif /* defined(__aarch64__) */
diff --git a/sys/contrib/openzfs/module/zfs/vdev_raidz_math_avx2.c b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_avx2.c
new file mode 100644
index 000000000000..65e4bebce8fa
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_avx2.c
@@ -0,0 +1,413 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (C) 2016 Gvozden Nešković. All rights reserved.
+ */
+#include <sys/isa_defs.h>
+
+#if defined(__x86_64) && defined(HAVE_AVX2)
+
+#include <sys/types.h>
+#include <sys/simd.h>
+
+#ifdef __linux__
+#define __asm __asm__ __volatile__
+#endif
+
+#define _REG_CNT(_0, _1, _2, _3, _4, _5, _6, _7, N, ...) N
+#define REG_CNT(r...) _REG_CNT(r, 8, 7, 6, 5, 4, 3, 2, 1)
+
+#define VR0_(REG, ...) "ymm"#REG
+#define VR1_(_1, REG, ...) "ymm"#REG
+#define VR2_(_1, _2, REG, ...) "ymm"#REG
+#define VR3_(_1, _2, _3, REG, ...) "ymm"#REG
+#define VR4_(_1, _2, _3, _4, REG, ...) "ymm"#REG
+#define VR5_(_1, _2, _3, _4, _5, REG, ...) "ymm"#REG
+#define VR6_(_1, _2, _3, _4, _5, _6, REG, ...) "ymm"#REG
+#define VR7_(_1, _2, _3, _4, _5, _6, _7, REG, ...) "ymm"#REG
+
+#define VR0(r...) VR0_(r)
+#define VR1(r...) VR1_(r)
+#define VR2(r...) VR2_(r, 1)
+#define VR3(r...) VR3_(r, 1, 2)
+#define VR4(r...) VR4_(r, 1, 2)
+#define VR5(r...) VR5_(r, 1, 2, 3)
+#define VR6(r...) VR6_(r, 1, 2, 3, 4)
+#define VR7(r...) VR7_(r, 1, 2, 3, 4, 5)
+
+#define R_01(REG1, REG2, ...) REG1, REG2
+#define _R_23(_0, _1, REG2, REG3, ...) REG2, REG3
+#define R_23(REG...) _R_23(REG, 1, 2, 3)
+
+#define ZFS_ASM_BUG() ASSERT(0)
+
+extern const uint8_t gf_clmul_mod_lt[4*256][16];
+
+#define ELEM_SIZE 32
+
+typedef struct v {
+ uint8_t b[ELEM_SIZE] __attribute__((aligned(ELEM_SIZE)));
+} v_t;
+
+
+#define XOR_ACC(src, r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 4: \
+ __asm( \
+ "vpxor 0x00(%[SRC]), %%" VR0(r)", %%" VR0(r) "\n" \
+ "vpxor 0x20(%[SRC]), %%" VR1(r)", %%" VR1(r) "\n" \
+ "vpxor 0x40(%[SRC]), %%" VR2(r)", %%" VR2(r) "\n" \
+ "vpxor 0x60(%[SRC]), %%" VR3(r)", %%" VR3(r) "\n" \
+ : : [SRC] "r" (src)); \
+ break; \
+ case 2: \
+ __asm( \
+ "vpxor 0x00(%[SRC]), %%" VR0(r)", %%" VR0(r) "\n" \
+ "vpxor 0x20(%[SRC]), %%" VR1(r)", %%" VR1(r) "\n" \
+ : : [SRC] "r" (src)); \
+ break; \
+ default: \
+ ZFS_ASM_BUG(); \
+ } \
+}
+
+#define XOR(r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 8: \
+ __asm( \
+ "vpxor %" VR0(r) ", %" VR4(r)", %" VR4(r) "\n" \
+ "vpxor %" VR1(r) ", %" VR5(r)", %" VR5(r) "\n" \
+ "vpxor %" VR2(r) ", %" VR6(r)", %" VR6(r) "\n" \
+ "vpxor %" VR3(r) ", %" VR7(r)", %" VR7(r)); \
+ break; \
+ case 4: \
+ __asm( \
+ "vpxor %" VR0(r) ", %" VR2(r)", %" VR2(r) "\n" \
+ "vpxor %" VR1(r) ", %" VR3(r)", %" VR3(r)); \
+ break; \
+ default: \
+ ZFS_ASM_BUG(); \
+ } \
+}
+
+#define ZERO(r...) XOR(r, r)
+
+#define COPY(r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 8: \
+ __asm( \
+ "vmovdqa %" VR0(r) ", %" VR4(r) "\n" \
+ "vmovdqa %" VR1(r) ", %" VR5(r) "\n" \
+ "vmovdqa %" VR2(r) ", %" VR6(r) "\n" \
+ "vmovdqa %" VR3(r) ", %" VR7(r)); \
+ break; \
+ case 4: \
+ __asm( \
+ "vmovdqa %" VR0(r) ", %" VR2(r) "\n" \
+ "vmovdqa %" VR1(r) ", %" VR3(r)); \
+ break; \
+ default: \
+ ZFS_ASM_BUG(); \
+ } \
+}
+
+#define LOAD(src, r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 4: \
+ __asm( \
+ "vmovdqa 0x00(%[SRC]), %%" VR0(r) "\n" \
+ "vmovdqa 0x20(%[SRC]), %%" VR1(r) "\n" \
+ "vmovdqa 0x40(%[SRC]), %%" VR2(r) "\n" \
+ "vmovdqa 0x60(%[SRC]), %%" VR3(r) "\n" \
+ : : [SRC] "r" (src)); \
+ break; \
+ case 2: \
+ __asm( \
+ "vmovdqa 0x00(%[SRC]), %%" VR0(r) "\n" \
+ "vmovdqa 0x20(%[SRC]), %%" VR1(r) "\n" \
+ : : [SRC] "r" (src)); \
+ break; \
+ default: \
+ ZFS_ASM_BUG(); \
+ } \
+}
+
+#define STORE(dst, r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 4: \
+ __asm( \
+ "vmovdqa %%" VR0(r) ", 0x00(%[DST])\n" \
+ "vmovdqa %%" VR1(r) ", 0x20(%[DST])\n" \
+ "vmovdqa %%" VR2(r) ", 0x40(%[DST])\n" \
+ "vmovdqa %%" VR3(r) ", 0x60(%[DST])\n" \
+ : : [DST] "r" (dst)); \
+ break; \
+ case 2: \
+ __asm( \
+ "vmovdqa %%" VR0(r) ", 0x00(%[DST])\n" \
+ "vmovdqa %%" VR1(r) ", 0x20(%[DST])\n" \
+ : : [DST] "r" (dst)); \
+ break; \
+ default: \
+ ZFS_ASM_BUG(); \
+ } \
+}
+
+#define FLUSH() \
+{ \
+ __asm("vzeroupper"); \
+}
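+
+/*
+ * FLUSH() is presumably here so that raidz_math_end() can issue
+ * vzeroupper before kfpu_end(): clearing the upper halves of the ymm
+ * registers avoids AVX/SSE transition penalties in any legacy-SSE code
+ * that runs afterwards.
+ */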
+
+#define MUL2_SETUP() \
+{ \
+ __asm("vmovq %0, %%xmm14" :: "r"(0x1d1d1d1d1d1d1d1d)); \
+ __asm("vpbroadcastq %xmm14, %ymm14"); \
+	__asm("vpxor %ymm15, %ymm15, %ymm15");			\
+}
+
+#define _MUL2(r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 2: \
+ __asm( \
+ "vpcmpgtb %" VR0(r)", %ymm15, %ymm12\n" \
+ "vpcmpgtb %" VR1(r)", %ymm15, %ymm13\n" \
+ "vpaddb %" VR0(r)", %" VR0(r)", %" VR0(r) "\n" \
+ "vpaddb %" VR1(r)", %" VR1(r)", %" VR1(r) "\n" \
+ "vpand %ymm14, %ymm12, %ymm12\n" \
+ "vpand %ymm14, %ymm13, %ymm13\n" \
+ "vpxor %ymm12, %" VR0(r)", %" VR0(r) "\n" \
+ "vpxor %ymm13, %" VR1(r)", %" VR1(r)); \
+ break; \
+ default: \
+ ZFS_ASM_BUG(); \
+ } \
+}
+
+#define MUL2(r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 4: \
+ _MUL2(R_01(r)); \
+ _MUL2(R_23(r)); \
+ break; \
+ case 2: \
+ _MUL2(r); \
+ break; \
+ default: \
+ ZFS_ASM_BUG(); \
+ } \
+}
+
+#define MUL4(r...) \
+{ \
+ MUL2(r); \
+ MUL2(r); \
+}
+
+#define _0f "ymm15"
+#define _as "ymm14"
+#define _bs "ymm13"
+#define _ltmod "ymm12"
+#define _ltmul "ymm11"
+#define _ta "ymm10"
+#define _tb "ymm15"
+
+static const uint8_t __attribute__((aligned(32))) _mul_mask = 0x0F;
+
+#define _MULx2(c, r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 2: \
+ __asm( \
+ "vpbroadcastb (%[mask]), %%" _0f "\n" \
+ /* upper bits */ \
+ "vbroadcasti128 0x00(%[lt]), %%" _ltmod "\n" \
+ "vbroadcasti128 0x10(%[lt]), %%" _ltmul "\n" \
+ \
+ "vpsraw $0x4, %%" VR0(r) ", %%"_as "\n" \
+ "vpsraw $0x4, %%" VR1(r) ", %%"_bs "\n" \
+ "vpand %%" _0f ", %%" VR0(r) ", %%" VR0(r) "\n" \
+ "vpand %%" _0f ", %%" VR1(r) ", %%" VR1(r) "\n" \
+ "vpand %%" _0f ", %%" _as ", %%" _as "\n" \
+ "vpand %%" _0f ", %%" _bs ", %%" _bs "\n" \
+ \
+ "vpshufb %%" _as ", %%" _ltmod ", %%" _ta "\n" \
+ "vpshufb %%" _bs ", %%" _ltmod ", %%" _tb "\n" \
+ "vpshufb %%" _as ", %%" _ltmul ", %%" _as "\n" \
+ "vpshufb %%" _bs ", %%" _ltmul ", %%" _bs "\n" \
+ /* lower bits */ \
+ "vbroadcasti128 0x20(%[lt]), %%" _ltmod "\n" \
+ "vbroadcasti128 0x30(%[lt]), %%" _ltmul "\n" \
+ \
+ "vpxor %%" _ta ", %%" _as ", %%" _as "\n" \
+ "vpxor %%" _tb ", %%" _bs ", %%" _bs "\n" \
+ \
+ "vpshufb %%" VR0(r) ", %%" _ltmod ", %%" _ta "\n" \
+ "vpshufb %%" VR1(r) ", %%" _ltmod ", %%" _tb "\n" \
+ "vpshufb %%" VR0(r) ", %%" _ltmul ", %%" VR0(r) "\n"\
+ "vpshufb %%" VR1(r) ", %%" _ltmul ", %%" VR1(r) "\n"\
+ \
+ "vpxor %%" _ta ", %%" VR0(r) ", %%" VR0(r) "\n" \
+ "vpxor %%" _as ", %%" VR0(r) ", %%" VR0(r) "\n" \
+ "vpxor %%" _tb ", %%" VR1(r) ", %%" VR1(r) "\n" \
+ "vpxor %%" _bs ", %%" VR1(r) ", %%" VR1(r) "\n" \
+ : : [mask] "r" (&_mul_mask), \
+ [lt] "r" (gf_clmul_mod_lt[4*(c)])); \
+ break; \
+ default: \
+ ZFS_ASM_BUG(); \
+ } \
+}
+
+#define MUL(c, r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 4: \
+ _MULx2(c, R_01(r)); \
+ _MULx2(c, R_23(r)); \
+ break; \
+ case 2: \
+ _MULx2(c, R_01(r)); \
+ break; \
+ default: \
+ ZFS_ASM_BUG(); \
+ } \
+}
+
+#define raidz_math_begin() kfpu_begin()
+#define raidz_math_end() \
+{ \
+ FLUSH(); \
+ kfpu_end(); \
+}
+
+
+#define SYN_STRIDE 4
+
+#define ZERO_STRIDE 4
+#define ZERO_DEFINE() {}
+#define ZERO_D 0, 1, 2, 3
+
+#define COPY_STRIDE 4
+#define COPY_DEFINE() {}
+#define COPY_D 0, 1, 2, 3
+
+#define ADD_STRIDE 4
+#define ADD_DEFINE() {}
+#define ADD_D 0, 1, 2, 3
+
+#define MUL_STRIDE 4
+#define MUL_DEFINE() {}
+#define MUL_D 0, 1, 2, 3
+
+#define GEN_P_STRIDE 4
+#define GEN_P_DEFINE() {}
+#define GEN_P_P 0, 1, 2, 3
+
+#define GEN_PQ_STRIDE 4
+#define GEN_PQ_DEFINE() {}
+#define GEN_PQ_D 0, 1, 2, 3
+#define GEN_PQ_C 4, 5, 6, 7
+
+#define GEN_PQR_STRIDE 4
+#define GEN_PQR_DEFINE() {}
+#define GEN_PQR_D 0, 1, 2, 3
+#define GEN_PQR_C 4, 5, 6, 7
+
+#define SYN_Q_DEFINE() {}
+#define SYN_Q_D 0, 1, 2, 3
+#define SYN_Q_X 4, 5, 6, 7
+
+#define SYN_R_DEFINE() {}
+#define SYN_R_D 0, 1, 2, 3
+#define SYN_R_X 4, 5, 6, 7
+
+#define SYN_PQ_DEFINE() {}
+#define SYN_PQ_D 0, 1, 2, 3
+#define SYN_PQ_X 4, 5, 6, 7
+
+#define REC_PQ_STRIDE 2
+#define REC_PQ_DEFINE() {}
+#define REC_PQ_X 0, 1
+#define REC_PQ_Y 2, 3
+#define REC_PQ_T 4, 5
+
+#define SYN_PR_DEFINE() {}
+#define SYN_PR_D 0, 1, 2, 3
+#define SYN_PR_X 4, 5, 6, 7
+
+#define REC_PR_STRIDE 2
+#define REC_PR_DEFINE() {}
+#define REC_PR_X 0, 1
+#define REC_PR_Y 2, 3
+#define REC_PR_T 4, 5
+
+#define SYN_QR_DEFINE() {}
+#define SYN_QR_D 0, 1, 2, 3
+#define SYN_QR_X 4, 5, 6, 7
+
+#define REC_QR_STRIDE 2
+#define REC_QR_DEFINE() {}
+#define REC_QR_X 0, 1
+#define REC_QR_Y 2, 3
+#define REC_QR_T 4, 5
+
+#define SYN_PQR_DEFINE() {}
+#define SYN_PQR_D 0, 1, 2, 3
+#define SYN_PQR_X 4, 5, 6, 7
+
+#define REC_PQR_STRIDE 2
+#define REC_PQR_DEFINE() {}
+#define REC_PQR_X 0, 1
+#define REC_PQR_Y 2, 3
+#define REC_PQR_Z 4, 5
+#define REC_PQR_XS 6, 7
+#define REC_PQR_YS 8, 9
+
+
+#include <sys/vdev_raidz_impl.h>
+#include "vdev_raidz_math_impl.h"
+
+DEFINE_GEN_METHODS(avx2);
+DEFINE_REC_METHODS(avx2);
+
+static boolean_t
+raidz_will_avx2_work(void)
+{
+ return (kfpu_allowed() && zfs_avx_available() && zfs_avx2_available());
+}
+
+const raidz_impl_ops_t vdev_raidz_avx2_impl = {
+ .init = NULL,
+ .fini = NULL,
+ .gen = RAIDZ_GEN_METHODS(avx2),
+ .rec = RAIDZ_REC_METHODS(avx2),
+ .is_supported = &raidz_will_avx2_work,
+ .name = "avx2"
+};
+
+#endif /* defined(__x86_64) && defined(HAVE_AVX2) */
diff --git a/sys/contrib/openzfs/module/zfs/vdev_raidz_math_avx512bw.c b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_avx512bw.c
new file mode 100644
index 000000000000..f06b469023eb
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_avx512bw.c
@@ -0,0 +1,413 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (C) 2016 Romain Dolbeau. All rights reserved.
+ * Copyright (C) 2016 Gvozden Nešković. All rights reserved.
+ */
+
+#include <sys/isa_defs.h>
+
+#if defined(__x86_64) && defined(HAVE_AVX512BW)
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/simd.h>
+
+
+#ifdef __linux__
+#define __asm __asm__ __volatile__
+#endif
+
+#define _REG_CNT(_0, _1, _2, _3, _4, _5, _6, _7, N, ...) N
+#define REG_CNT(r...) _REG_CNT(r, 8, 7, 6, 5, 4, 3, 2, 1)
+
+#define VR0_(REG, ...) "zmm"#REG
+#define VR1_(_1, REG, ...) "zmm"#REG
+#define VR2_(_1, _2, REG, ...) "zmm"#REG
+#define VR3_(_1, _2, _3, REG, ...) "zmm"#REG
+#define VR4_(_1, _2, _3, _4, REG, ...) "zmm"#REG
+#define VR5_(_1, _2, _3, _4, _5, REG, ...) "zmm"#REG
+#define VR6_(_1, _2, _3, _4, _5, _6, REG, ...) "zmm"#REG
+#define VR7_(_1, _2, _3, _4, _5, _6, _7, REG, ...) "zmm"#REG
+
+#define VR0(r...) VR0_(r)
+#define VR1(r...) VR1_(r)
+#define VR2(r...) VR2_(r, 1)
+#define VR3(r...) VR3_(r, 1, 2)
+#define VR4(r...) VR4_(r, 1, 2)
+#define VR5(r...) VR5_(r, 1, 2, 3)
+#define VR6(r...) VR6_(r, 1, 2, 3, 4)
+#define VR7(r...) VR7_(r, 1, 2, 3, 4, 5)
+
+#define R_01(REG1, REG2, ...) REG1, REG2
+#define _R_23(_0, _1, REG2, REG3, ...) REG2, REG3
+#define R_23(REG...) _R_23(REG, 1, 2, 3)
+
+#define ZFS_ASM_BUG() ASSERT(0)
+
+extern const uint8_t gf_clmul_mod_lt[4*256][16];
+
+#define ELEM_SIZE 64
+
+typedef struct v {
+ uint8_t b[ELEM_SIZE] __attribute__((aligned(ELEM_SIZE)));
+} v_t;
+
+#define XOR_ACC(src, r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 4: \
+ __asm( \
+ "vpxorq 0x00(%[SRC]), %%" VR0(r)", %%" VR0(r) "\n" \
+ "vpxorq 0x40(%[SRC]), %%" VR1(r)", %%" VR1(r) "\n" \
+ "vpxorq 0x80(%[SRC]), %%" VR2(r)", %%" VR2(r) "\n" \
+ "vpxorq 0xc0(%[SRC]), %%" VR3(r)", %%" VR3(r) "\n" \
+ : : [SRC] "r" (src)); \
+ break; \
+ case 2: \
+ __asm( \
+ "vpxorq 0x00(%[SRC]), %%" VR0(r)", %%" VR0(r) "\n" \
+ "vpxorq 0x40(%[SRC]), %%" VR1(r)", %%" VR1(r) "\n" \
+ : : [SRC] "r" (src)); \
+ break; \
+ default: \
+ ZFS_ASM_BUG(); \
+ } \
+}
+
+#define XOR(r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 8: \
+ __asm( \
+ "vpxorq %" VR0(r) ", %" VR4(r)", %" VR4(r) "\n" \
+ "vpxorq %" VR1(r) ", %" VR5(r)", %" VR5(r) "\n" \
+ "vpxorq %" VR2(r) ", %" VR6(r)", %" VR6(r) "\n" \
+ "vpxorq %" VR3(r) ", %" VR7(r)", %" VR7(r)); \
+ break; \
+ case 4: \
+ __asm( \
+ "vpxorq %" VR0(r) ", %" VR2(r)", %" VR2(r) "\n" \
+ "vpxorq %" VR1(r) ", %" VR3(r)", %" VR3(r)); \
+ break; \
+ default: \
+ ZFS_ASM_BUG(); \
+ } \
+}
+
+#define ZERO(r...) XOR(r, r)
+
+#define COPY(r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 8: \
+ __asm( \
+ "vmovdqa64 %" VR0(r) ", %" VR4(r) "\n" \
+ "vmovdqa64 %" VR1(r) ", %" VR5(r) "\n" \
+ "vmovdqa64 %" VR2(r) ", %" VR6(r) "\n" \
+ "vmovdqa64 %" VR3(r) ", %" VR7(r)); \
+ break; \
+ case 4: \
+ __asm( \
+ "vmovdqa64 %" VR0(r) ", %" VR2(r) "\n" \
+ "vmovdqa64 %" VR1(r) ", %" VR3(r)); \
+ break; \
+ default: \
+ ZFS_ASM_BUG(); \
+ } \
+}
+
+#define LOAD(src, r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 4: \
+ __asm( \
+ "vmovdqa64 0x00(%[SRC]), %%" VR0(r) "\n" \
+ "vmovdqa64 0x40(%[SRC]), %%" VR1(r) "\n" \
+ "vmovdqa64 0x80(%[SRC]), %%" VR2(r) "\n" \
+ "vmovdqa64 0xc0(%[SRC]), %%" VR3(r) "\n" \
+ : : [SRC] "r" (src)); \
+ break; \
+ case 2: \
+ __asm( \
+ "vmovdqa64 0x00(%[SRC]), %%" VR0(r) "\n" \
+ "vmovdqa64 0x40(%[SRC]), %%" VR1(r) "\n" \
+ : : [SRC] "r" (src)); \
+ break; \
+ default: \
+ ZFS_ASM_BUG(); \
+ } \
+}
+
+#define STORE(dst, r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 4: \
+ __asm( \
+ "vmovdqa64 %%" VR0(r) ", 0x00(%[DST])\n" \
+ "vmovdqa64 %%" VR1(r) ", 0x40(%[DST])\n" \
+ "vmovdqa64 %%" VR2(r) ", 0x80(%[DST])\n" \
+ "vmovdqa64 %%" VR3(r) ", 0xc0(%[DST])\n" \
+ : : [DST] "r" (dst)); \
+ break; \
+ case 2: \
+ __asm( \
+ "vmovdqa64 %%" VR0(r) ", 0x00(%[DST])\n" \
+ "vmovdqa64 %%" VR1(r) ", 0x40(%[DST])\n" \
+ : : [DST] "r" (dst)); \
+ break; \
+ default: \
+ ZFS_ASM_BUG(); \
+ } \
+}
+
+#define MUL2_SETUP() \
+{ \
+ __asm("vmovq %0, %%xmm22" :: "r"(0x1d1d1d1d1d1d1d1d)); \
+ __asm("vpbroadcastq %xmm22, %zmm22"); \
+	__asm("vpxord %zmm23, %zmm23, %zmm23");			\
+}
+
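+/*
+ * Unlike the AVX2 _MUL2(), which builds the 0x1d correction with vpand
+ * and xors it in unconditionally, this version uses AVX512BW mask
+ * registers: vpcmpb records in k1/k2 which bytes have the top bit set,
+ * the vectors are doubled with vpaddb, the doubled values are xored
+ * with 0x1d into temporaries, and vmovdqu8 merges those reduced bytes
+ * back only where the mask is set.
+ */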
+#define _MUL2(r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 2: \
+ __asm( \
+ "vpcmpb $1, %zmm23, %" VR0(r)", %k1\n" \
+ "vpcmpb $1, %zmm23, %" VR1(r)", %k2\n" \
+ "vpaddb %" VR0(r)", %" VR0(r)", %" VR0(r) "\n" \
+ "vpaddb %" VR1(r)", %" VR1(r)", %" VR1(r) "\n" \
+ "vpxord %zmm22, %" VR0(r)", %zmm12\n" \
+ "vpxord %zmm22, %" VR1(r)", %zmm13\n" \
+ "vmovdqu8 %zmm12, %" VR0(r) "{%k1}\n" \
+ "vmovdqu8 %zmm13, %" VR1(r) "{%k2}"); \
+ break; \
+ default: \
+ ZFS_ASM_BUG(); \
+ } \
+}
+
+#define MUL2(r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 4: \
+ _MUL2(R_01(r)); \
+ _MUL2(R_23(r)); \
+ break; \
+ case 2: \
+ _MUL2(r); \
+ break; \
+ default: \
+ ZFS_ASM_BUG(); \
+ } \
+}
+
+#define MUL4(r...) \
+{ \
+ MUL2(r); \
+ MUL2(r); \
+}
+
+#define _0f "zmm15"
+#define _as "zmm14"
+#define _bs "zmm13"
+#define _ltmod "zmm12"
+#define _ltmul "zmm11"
+#define _ta "zmm10"
+#define _tb "zmm15"
+
+static const uint8_t __attribute__((aligned(64))) _mul_mask = 0x0F;
+
+#define _MULx2(c, r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 2: \
+ __asm( \
+ "vpbroadcastb (%[mask]), %%" _0f "\n" \
+ /* upper bits */ \
+ "vbroadcasti32x4 0x00(%[lt]), %%" _ltmod "\n" \
+ "vbroadcasti32x4 0x10(%[lt]), %%" _ltmul "\n" \
+ \
+ "vpsraw $0x4, %%" VR0(r) ", %%"_as "\n" \
+ "vpsraw $0x4, %%" VR1(r) ", %%"_bs "\n" \
+ "vpandq %%" _0f ", %%" VR0(r) ", %%" VR0(r) "\n" \
+ "vpandq %%" _0f ", %%" VR1(r) ", %%" VR1(r) "\n" \
+ "vpandq %%" _0f ", %%" _as ", %%" _as "\n" \
+ "vpandq %%" _0f ", %%" _bs ", %%" _bs "\n" \
+ \
+ "vpshufb %%" _as ", %%" _ltmod ", %%" _ta "\n" \
+ "vpshufb %%" _bs ", %%" _ltmod ", %%" _tb "\n" \
+ "vpshufb %%" _as ", %%" _ltmul ", %%" _as "\n" \
+ "vpshufb %%" _bs ", %%" _ltmul ", %%" _bs "\n" \
+ /* lower bits */ \
+ "vbroadcasti32x4 0x20(%[lt]), %%" _ltmod "\n" \
+ "vbroadcasti32x4 0x30(%[lt]), %%" _ltmul "\n" \
+ \
+ "vpxorq %%" _ta ", %%" _as ", %%" _as "\n" \
+ "vpxorq %%" _tb ", %%" _bs ", %%" _bs "\n" \
+ \
+ "vpshufb %%" VR0(r) ", %%" _ltmod ", %%" _ta "\n" \
+ "vpshufb %%" VR1(r) ", %%" _ltmod ", %%" _tb "\n" \
+ "vpshufb %%" VR0(r) ", %%" _ltmul ", %%" VR0(r) "\n"\
+ "vpshufb %%" VR1(r) ", %%" _ltmul ", %%" VR1(r) "\n"\
+ \
+ "vpxorq %%" _ta ", %%" VR0(r) ", %%" VR0(r) "\n" \
+ "vpxorq %%" _as ", %%" VR0(r) ", %%" VR0(r) "\n" \
+ "vpxorq %%" _tb ", %%" VR1(r) ", %%" VR1(r) "\n" \
+ "vpxorq %%" _bs ", %%" VR1(r) ", %%" VR1(r) "\n" \
+ : : [mask] "r" (&_mul_mask), \
+ [lt] "r" (gf_clmul_mod_lt[4*(c)])); \
+ break; \
+ default: \
+ ZFS_ASM_BUG(); \
+ } \
+}
+
+#define MUL(c, r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 4: \
+ _MULx2(c, R_01(r)); \
+ _MULx2(c, R_23(r)); \
+ break; \
+ case 2: \
+ _MULx2(c, R_01(r)); \
+ break; \
+ default: \
+ ZFS_ASM_BUG(); \
+ } \
+}
+
+#define raidz_math_begin() kfpu_begin()
+#define raidz_math_end() kfpu_end()
+
+/*
+ * The ZERO, COPY, and MUL operations are already 2x unrolled, so for
+ * avx512 their stride must not exceed 4: with 64-byte zmm registers a
+ * single step already covers 2 * 4 * 64 B = 512 B, and a larger stride
+ * would exceed the 512 B block size.
+ */
+
+#define SYN_STRIDE 4
+
+#define ZERO_STRIDE 4
+#define ZERO_DEFINE() {}
+#define ZERO_D 0, 1, 2, 3
+
+#define COPY_STRIDE 4
+#define COPY_DEFINE() {}
+#define COPY_D 0, 1, 2, 3
+
+#define ADD_STRIDE 4
+#define ADD_DEFINE() {}
+#define ADD_D 0, 1, 2, 3
+
+#define MUL_STRIDE 4
+#define MUL_DEFINE() {}
+#define MUL_D 0, 1, 2, 3
+
+#define GEN_P_STRIDE 4
+#define GEN_P_DEFINE() {}
+#define GEN_P_P 0, 1, 2, 3
+
+#define GEN_PQ_STRIDE 4
+#define GEN_PQ_DEFINE() {}
+#define GEN_PQ_D 0, 1, 2, 3
+#define GEN_PQ_C 4, 5, 6, 7
+
+#define GEN_PQR_STRIDE 4
+#define GEN_PQR_DEFINE() {}
+#define GEN_PQR_D 0, 1, 2, 3
+#define GEN_PQR_C 4, 5, 6, 7
+
+#define SYN_Q_DEFINE() {}
+#define SYN_Q_D 0, 1, 2, 3
+#define SYN_Q_X 4, 5, 6, 7
+
+#define SYN_R_DEFINE() {}
+#define SYN_R_D 0, 1, 2, 3
+#define SYN_R_X 4, 5, 6, 7
+
+#define SYN_PQ_DEFINE() {}
+#define SYN_PQ_D 0, 1, 2, 3
+#define SYN_PQ_X 4, 5, 6, 7
+
+#define REC_PQ_STRIDE 2
+#define REC_PQ_DEFINE() {}
+#define REC_PQ_X 0, 1
+#define REC_PQ_Y 2, 3
+#define REC_PQ_T 4, 5
+
+#define SYN_PR_DEFINE() {}
+#define SYN_PR_D 0, 1, 2, 3
+#define SYN_PR_X 4, 5, 6, 7
+
+#define REC_PR_STRIDE 2
+#define REC_PR_DEFINE() {}
+#define REC_PR_X 0, 1
+#define REC_PR_Y 2, 3
+#define REC_PR_T 4, 5
+
+#define SYN_QR_DEFINE() {}
+#define SYN_QR_D 0, 1, 2, 3
+#define SYN_QR_X 4, 5, 6, 7
+
+#define REC_QR_STRIDE 2
+#define REC_QR_DEFINE() {}
+#define REC_QR_X 0, 1
+#define REC_QR_Y 2, 3
+#define REC_QR_T 4, 5
+
+#define SYN_PQR_DEFINE() {}
+#define SYN_PQR_D 0, 1, 2, 3
+#define SYN_PQR_X 4, 5, 6, 7
+
+#define REC_PQR_STRIDE 2
+#define REC_PQR_DEFINE() {}
+#define REC_PQR_X 0, 1
+#define REC_PQR_Y 2, 3
+#define REC_PQR_Z 4, 5
+#define REC_PQR_XS 6, 7
+#define REC_PQR_YS 8, 9
+
+
+#include <sys/vdev_raidz_impl.h>
+#include "vdev_raidz_math_impl.h"
+
+DEFINE_GEN_METHODS(avx512bw);
+DEFINE_REC_METHODS(avx512bw);
+
+static boolean_t
+raidz_will_avx512bw_work(void)
+{
+ return (kfpu_allowed() && zfs_avx_available() &&
+ zfs_avx512f_available() && zfs_avx512bw_available());
+}
+
+const raidz_impl_ops_t vdev_raidz_avx512bw_impl = {
+ .init = NULL,
+ .fini = NULL,
+ .gen = RAIDZ_GEN_METHODS(avx512bw),
+ .rec = RAIDZ_REC_METHODS(avx512bw),
+ .is_supported = &raidz_will_avx512bw_work,
+ .name = "avx512bw"
+};
+
+#endif /* defined(__x86_64) && defined(HAVE_AVX512BW) */
diff --git a/sys/contrib/openzfs/module/zfs/vdev_raidz_math_avx512f.c b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_avx512f.c
new file mode 100644
index 000000000000..aab653b77491
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_avx512f.c
@@ -0,0 +1,494 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (C) 2016 Romain Dolbeau. All rights reserved.
+ * Copyright (C) 2016 Gvozden Nešković. All rights reserved.
+ */
+
+#include <sys/isa_defs.h>
+
+#if defined(__x86_64) && defined(HAVE_AVX512F)
+
+#include <sys/types.h>
+#include <sys/simd.h>
+#include <sys/debug.h>
+
+#ifdef __linux__
+#define __asm __asm__ __volatile__
+#endif
+
+#define _REG_CNT(_0, _1, _2, _3, _4, _5, _6, _7, N, ...) N
+#define REG_CNT(r...) _REG_CNT(r, 8, 7, 6, 5, 4, 3, 2, 1)
+
+#define VR0_(REG, ...) "zmm"#REG
+#define VR1_(_1, REG, ...) "zmm"#REG
+#define VR2_(_1, _2, REG, ...) "zmm"#REG
+#define VR3_(_1, _2, _3, REG, ...) "zmm"#REG
+#define VR4_(_1, _2, _3, _4, REG, ...) "zmm"#REG
+#define VR5_(_1, _2, _3, _4, _5, REG, ...) "zmm"#REG
+#define VR6_(_1, _2, _3, _4, _5, _6, REG, ...) "zmm"#REG
+#define VR7_(_1, _2, _3, _4, _5, _6, _7, REG, ...) "zmm"#REG
+
+#define VR0(r...) VR0_(r)
+#define VR1(r...) VR1_(r)
+#define VR2(r...) VR2_(r, 1)
+#define VR3(r...) VR3_(r, 1, 2)
+#define VR4(r...) VR4_(r, 1, 2)
+#define VR5(r...) VR5_(r, 1, 2, 3)
+#define VR6(r...) VR6_(r, 1, 2, 3, 4)
+#define VR7(r...) VR7_(r, 1, 2, 3, 4, 5)
+
+#define VRy0_(REG, ...) "ymm"#REG
+#define VRy1_(_1, REG, ...) "ymm"#REG
+#define VRy2_(_1, _2, REG, ...) "ymm"#REG
+#define VRy3_(_1, _2, _3, REG, ...) "ymm"#REG
+#define VRy4_(_1, _2, _3, _4, REG, ...) "ymm"#REG
+#define VRy5_(_1, _2, _3, _4, _5, REG, ...) "ymm"#REG
+#define VRy6_(_1, _2, _3, _4, _5, _6, REG, ...) "ymm"#REG
+#define VRy7_(_1, _2, _3, _4, _5, _6, _7, REG, ...) "ymm"#REG
+
+#define VRy0(r...) VRy0_(r)
+#define VRy1(r...) VRy1_(r)
+#define VRy2(r...) VRy2_(r, 1)
+#define VRy3(r...) VRy3_(r, 1, 2)
+#define VRy4(r...) VRy4_(r, 1, 2)
+#define VRy5(r...) VRy5_(r, 1, 2, 3)
+#define VRy6(r...) VRy6_(r, 1, 2, 3, 4)
+#define VRy7(r...) VRy7_(r, 1, 2, 3, 4, 5)
+
+#define R_01(REG1, REG2, ...) REG1, REG2
+#define _R_23(_0, _1, REG2, REG3, ...) REG2, REG3
+#define R_23(REG...) _R_23(REG, 1, 2, 3)
+
+#define ELEM_SIZE 64
+
+typedef struct v {
+ uint8_t b[ELEM_SIZE] __attribute__((aligned(ELEM_SIZE)));
+} v_t;
+
+
+#define XOR_ACC(src, r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 4: \
+ __asm( \
+ "vpxorq 0x00(%[SRC]), %%" VR0(r)", %%" VR0(r) "\n" \
+ "vpxorq 0x40(%[SRC]), %%" VR1(r)", %%" VR1(r) "\n" \
+ "vpxorq 0x80(%[SRC]), %%" VR2(r)", %%" VR2(r) "\n" \
+ "vpxorq 0xc0(%[SRC]), %%" VR3(r)", %%" VR3(r) "\n" \
+ : : [SRC] "r" (src)); \
+ break; \
+ } \
+}
+
+#define XOR(r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 8: \
+ __asm( \
+ "vpxorq %" VR0(r) ", %" VR4(r)", %" VR4(r) "\n" \
+ "vpxorq %" VR1(r) ", %" VR5(r)", %" VR5(r) "\n" \
+ "vpxorq %" VR2(r) ", %" VR6(r)", %" VR6(r) "\n" \
+ "vpxorq %" VR3(r) ", %" VR7(r)", %" VR7(r)); \
+ break; \
+ case 4: \
+ __asm( \
+ "vpxorq %" VR0(r) ", %" VR2(r)", %" VR2(r) "\n" \
+ "vpxorq %" VR1(r) ", %" VR3(r)", %" VR3(r)); \
+ break; \
+ } \
+}
+
+
+#define ZERO(r...) XOR(r, r)
+
+
+#define COPY(r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 8: \
+ __asm( \
+ "vmovdqa64 %" VR0(r) ", %" VR4(r) "\n" \
+ "vmovdqa64 %" VR1(r) ", %" VR5(r) "\n" \
+ "vmovdqa64 %" VR2(r) ", %" VR6(r) "\n" \
+ "vmovdqa64 %" VR3(r) ", %" VR7(r)); \
+ break; \
+ case 4: \
+ __asm( \
+ "vmovdqa64 %" VR0(r) ", %" VR2(r) "\n" \
+ "vmovdqa64 %" VR1(r) ", %" VR3(r)); \
+ break; \
+ } \
+}
+
+#define LOAD(src, r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 4: \
+ __asm( \
+ "vmovdqa64 0x00(%[SRC]), %%" VR0(r) "\n" \
+ "vmovdqa64 0x40(%[SRC]), %%" VR1(r) "\n" \
+ "vmovdqa64 0x80(%[SRC]), %%" VR2(r) "\n" \
+ "vmovdqa64 0xc0(%[SRC]), %%" VR3(r) "\n" \
+ : : [SRC] "r" (src)); \
+ break; \
+ } \
+}
+
+#define STORE(dst, r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 4: \
+ __asm( \
+ "vmovdqa64 %%" VR0(r) ", 0x00(%[DST])\n" \
+ "vmovdqa64 %%" VR1(r) ", 0x40(%[DST])\n" \
+ "vmovdqa64 %%" VR2(r) ", 0x80(%[DST])\n" \
+ "vmovdqa64 %%" VR3(r) ", 0xc0(%[DST])\n" \
+ : : [DST] "r" (dst)); \
+ break; \
+ } \
+}
+
+#define MUL2_SETUP() \
+{ \
+ __asm("vmovq %0, %%xmm31" :: "r"(0x1d1d1d1d1d1d1d1d)); \
+ __asm("vpbroadcastq %xmm31, %zmm31"); \
+ __asm("vmovq %0, %%xmm30" :: "r"(0x8080808080808080)); \
+ __asm("vpbroadcastq %xmm30, %zmm30"); \
+ __asm("vmovq %0, %%xmm29" :: "r"(0xfefefefefefefefe)); \
+ __asm("vpbroadcastq %xmm29, %zmm29"); \
+}
+
+#define _MUL2(r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 2: \
+ __asm( \
+ "vpandq %" VR0(r)", %zmm30, %zmm26\n" \
+ "vpandq %" VR1(r)", %zmm30, %zmm25\n" \
+ "vpsrlq $7, %zmm26, %zmm28\n" \
+ "vpsrlq $7, %zmm25, %zmm27\n" \
+ "vpsllq $1, %zmm26, %zmm26\n" \
+ "vpsllq $1, %zmm25, %zmm25\n" \
+ "vpsubq %zmm28, %zmm26, %zmm26\n" \
+ "vpsubq %zmm27, %zmm25, %zmm25\n" \
+ "vpsllq $1, %" VR0(r)", %" VR0(r) "\n" \
+ "vpsllq $1, %" VR1(r)", %" VR1(r) "\n" \
+ "vpandq %zmm26, %zmm31, %zmm26\n" \
+ "vpandq %zmm25, %zmm31, %zmm25\n" \
+ "vpternlogd $0x6c,%zmm29, %zmm26, %" VR0(r) "\n" \
+ "vpternlogd $0x6c,%zmm29, %zmm25, %" VR1(r)); \
+ break; \
+ default: \
+ VERIFY(0); \
+ } \
+}
+
+#define MUL2(r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 4: \
+ _MUL2(R_01(r)); \
+ _MUL2(R_23(r)); \
+ break; \
+ case 2: \
+ _MUL2(r); \
+ break; \
+ } \
+}
+
+#define MUL4(r...) \
+{ \
+ MUL2(r); \
+ MUL2(r); \
+}
+
+
+/* General multiplication by adding powers of two */
+
+#define _mul_x2_in 21, 22
+#define _mul_x2_acc 23, 24
+
+#define _MUL_PARAM(x, in, acc) \
+{ \
+ if (x & 0x01) { COPY(in, acc); } else { ZERO(acc); } \
+ if (x & 0xfe) { MUL2(in); } \
+ if (x & 0x02) { XOR(in, acc); } \
+ if (x & 0xfc) { MUL2(in); } \
+ if (x & 0x04) { XOR(in, acc); } \
+ if (x & 0xf8) { MUL2(in); } \
+ if (x & 0x08) { XOR(in, acc); } \
+ if (x & 0xf0) { MUL2(in); } \
+ if (x & 0x10) { XOR(in, acc); } \
+ if (x & 0xe0) { MUL2(in); } \
+ if (x & 0x20) { XOR(in, acc); } \
+ if (x & 0xc0) { MUL2(in); } \
+ if (x & 0x40) { XOR(in, acc); } \
+ if (x & 0x80) { MUL2(in); XOR(in, acc); } \
+}
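+
+/*
+ * For reference, _MUL_PARAM() is an unrolled GF(2^8) shift-and-add multiply:
+ * the accumulator picks up "in" doubled k times for every set bit k of the
+ * constant.  The scalar sketch below shows the same idea; it is illustrative
+ * only (gf_mul2_scalar()/gf_mul_scalar() are names introduced here and are
+ * not used by the SIMD path; 0x1d is the low byte of the RAID-Z generator
+ * polynomial 0x11d, matching the constant loaded in MUL2_SETUP()).
+ */
+static inline uint8_t
+gf_mul2_scalar(uint8_t a)
+{
+	/* double in GF(2^8): shift left and conditionally fold in 0x1d */
+	return ((uint8_t)((a << 1) ^ ((a & 0x80) ? 0x1d : 0x00)));
+}
+
+static inline uint8_t
+gf_mul_scalar(uint8_t a, uint8_t c)
+{
+	uint8_t acc = 0;
+
+	while (c != 0) {
+		if (c & 1)
+			acc ^= a;
+		a = gf_mul2_scalar(a);
+		c >>= 1;
+	}
+	return (acc);
+}
+
+/*
+ * MUL_x2_DEFINE() below instantiates one fully unrolled variant of this loop
+ * per constant, so MUL() can dispatch through the gf_x2_mul_fns[] table
+ * instead of branching on the constant at run time.
+ */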
+
+#define MUL_x2_DEFINE(x) \
+static void \
+mul_x2_ ## x(void) { _MUL_PARAM(x, _mul_x2_in, _mul_x2_acc); }
+
+
+MUL_x2_DEFINE(0); MUL_x2_DEFINE(1); MUL_x2_DEFINE(2); MUL_x2_DEFINE(3);
+MUL_x2_DEFINE(4); MUL_x2_DEFINE(5); MUL_x2_DEFINE(6); MUL_x2_DEFINE(7);
+MUL_x2_DEFINE(8); MUL_x2_DEFINE(9); MUL_x2_DEFINE(10); MUL_x2_DEFINE(11);
+MUL_x2_DEFINE(12); MUL_x2_DEFINE(13); MUL_x2_DEFINE(14); MUL_x2_DEFINE(15);
+MUL_x2_DEFINE(16); MUL_x2_DEFINE(17); MUL_x2_DEFINE(18); MUL_x2_DEFINE(19);
+MUL_x2_DEFINE(20); MUL_x2_DEFINE(21); MUL_x2_DEFINE(22); MUL_x2_DEFINE(23);
+MUL_x2_DEFINE(24); MUL_x2_DEFINE(25); MUL_x2_DEFINE(26); MUL_x2_DEFINE(27);
+MUL_x2_DEFINE(28); MUL_x2_DEFINE(29); MUL_x2_DEFINE(30); MUL_x2_DEFINE(31);
+MUL_x2_DEFINE(32); MUL_x2_DEFINE(33); MUL_x2_DEFINE(34); MUL_x2_DEFINE(35);
+MUL_x2_DEFINE(36); MUL_x2_DEFINE(37); MUL_x2_DEFINE(38); MUL_x2_DEFINE(39);
+MUL_x2_DEFINE(40); MUL_x2_DEFINE(41); MUL_x2_DEFINE(42); MUL_x2_DEFINE(43);
+MUL_x2_DEFINE(44); MUL_x2_DEFINE(45); MUL_x2_DEFINE(46); MUL_x2_DEFINE(47);
+MUL_x2_DEFINE(48); MUL_x2_DEFINE(49); MUL_x2_DEFINE(50); MUL_x2_DEFINE(51);
+MUL_x2_DEFINE(52); MUL_x2_DEFINE(53); MUL_x2_DEFINE(54); MUL_x2_DEFINE(55);
+MUL_x2_DEFINE(56); MUL_x2_DEFINE(57); MUL_x2_DEFINE(58); MUL_x2_DEFINE(59);
+MUL_x2_DEFINE(60); MUL_x2_DEFINE(61); MUL_x2_DEFINE(62); MUL_x2_DEFINE(63);
+MUL_x2_DEFINE(64); MUL_x2_DEFINE(65); MUL_x2_DEFINE(66); MUL_x2_DEFINE(67);
+MUL_x2_DEFINE(68); MUL_x2_DEFINE(69); MUL_x2_DEFINE(70); MUL_x2_DEFINE(71);
+MUL_x2_DEFINE(72); MUL_x2_DEFINE(73); MUL_x2_DEFINE(74); MUL_x2_DEFINE(75);
+MUL_x2_DEFINE(76); MUL_x2_DEFINE(77); MUL_x2_DEFINE(78); MUL_x2_DEFINE(79);
+MUL_x2_DEFINE(80); MUL_x2_DEFINE(81); MUL_x2_DEFINE(82); MUL_x2_DEFINE(83);
+MUL_x2_DEFINE(84); MUL_x2_DEFINE(85); MUL_x2_DEFINE(86); MUL_x2_DEFINE(87);
+MUL_x2_DEFINE(88); MUL_x2_DEFINE(89); MUL_x2_DEFINE(90); MUL_x2_DEFINE(91);
+MUL_x2_DEFINE(92); MUL_x2_DEFINE(93); MUL_x2_DEFINE(94); MUL_x2_DEFINE(95);
+MUL_x2_DEFINE(96); MUL_x2_DEFINE(97); MUL_x2_DEFINE(98); MUL_x2_DEFINE(99);
+MUL_x2_DEFINE(100); MUL_x2_DEFINE(101); MUL_x2_DEFINE(102); MUL_x2_DEFINE(103);
+MUL_x2_DEFINE(104); MUL_x2_DEFINE(105); MUL_x2_DEFINE(106); MUL_x2_DEFINE(107);
+MUL_x2_DEFINE(108); MUL_x2_DEFINE(109); MUL_x2_DEFINE(110); MUL_x2_DEFINE(111);
+MUL_x2_DEFINE(112); MUL_x2_DEFINE(113); MUL_x2_DEFINE(114); MUL_x2_DEFINE(115);
+MUL_x2_DEFINE(116); MUL_x2_DEFINE(117); MUL_x2_DEFINE(118); MUL_x2_DEFINE(119);
+MUL_x2_DEFINE(120); MUL_x2_DEFINE(121); MUL_x2_DEFINE(122); MUL_x2_DEFINE(123);
+MUL_x2_DEFINE(124); MUL_x2_DEFINE(125); MUL_x2_DEFINE(126); MUL_x2_DEFINE(127);
+MUL_x2_DEFINE(128); MUL_x2_DEFINE(129); MUL_x2_DEFINE(130); MUL_x2_DEFINE(131);
+MUL_x2_DEFINE(132); MUL_x2_DEFINE(133); MUL_x2_DEFINE(134); MUL_x2_DEFINE(135);
+MUL_x2_DEFINE(136); MUL_x2_DEFINE(137); MUL_x2_DEFINE(138); MUL_x2_DEFINE(139);
+MUL_x2_DEFINE(140); MUL_x2_DEFINE(141); MUL_x2_DEFINE(142); MUL_x2_DEFINE(143);
+MUL_x2_DEFINE(144); MUL_x2_DEFINE(145); MUL_x2_DEFINE(146); MUL_x2_DEFINE(147);
+MUL_x2_DEFINE(148); MUL_x2_DEFINE(149); MUL_x2_DEFINE(150); MUL_x2_DEFINE(151);
+MUL_x2_DEFINE(152); MUL_x2_DEFINE(153); MUL_x2_DEFINE(154); MUL_x2_DEFINE(155);
+MUL_x2_DEFINE(156); MUL_x2_DEFINE(157); MUL_x2_DEFINE(158); MUL_x2_DEFINE(159);
+MUL_x2_DEFINE(160); MUL_x2_DEFINE(161); MUL_x2_DEFINE(162); MUL_x2_DEFINE(163);
+MUL_x2_DEFINE(164); MUL_x2_DEFINE(165); MUL_x2_DEFINE(166); MUL_x2_DEFINE(167);
+MUL_x2_DEFINE(168); MUL_x2_DEFINE(169); MUL_x2_DEFINE(170); MUL_x2_DEFINE(171);
+MUL_x2_DEFINE(172); MUL_x2_DEFINE(173); MUL_x2_DEFINE(174); MUL_x2_DEFINE(175);
+MUL_x2_DEFINE(176); MUL_x2_DEFINE(177); MUL_x2_DEFINE(178); MUL_x2_DEFINE(179);
+MUL_x2_DEFINE(180); MUL_x2_DEFINE(181); MUL_x2_DEFINE(182); MUL_x2_DEFINE(183);
+MUL_x2_DEFINE(184); MUL_x2_DEFINE(185); MUL_x2_DEFINE(186); MUL_x2_DEFINE(187);
+MUL_x2_DEFINE(188); MUL_x2_DEFINE(189); MUL_x2_DEFINE(190); MUL_x2_DEFINE(191);
+MUL_x2_DEFINE(192); MUL_x2_DEFINE(193); MUL_x2_DEFINE(194); MUL_x2_DEFINE(195);
+MUL_x2_DEFINE(196); MUL_x2_DEFINE(197); MUL_x2_DEFINE(198); MUL_x2_DEFINE(199);
+MUL_x2_DEFINE(200); MUL_x2_DEFINE(201); MUL_x2_DEFINE(202); MUL_x2_DEFINE(203);
+MUL_x2_DEFINE(204); MUL_x2_DEFINE(205); MUL_x2_DEFINE(206); MUL_x2_DEFINE(207);
+MUL_x2_DEFINE(208); MUL_x2_DEFINE(209); MUL_x2_DEFINE(210); MUL_x2_DEFINE(211);
+MUL_x2_DEFINE(212); MUL_x2_DEFINE(213); MUL_x2_DEFINE(214); MUL_x2_DEFINE(215);
+MUL_x2_DEFINE(216); MUL_x2_DEFINE(217); MUL_x2_DEFINE(218); MUL_x2_DEFINE(219);
+MUL_x2_DEFINE(220); MUL_x2_DEFINE(221); MUL_x2_DEFINE(222); MUL_x2_DEFINE(223);
+MUL_x2_DEFINE(224); MUL_x2_DEFINE(225); MUL_x2_DEFINE(226); MUL_x2_DEFINE(227);
+MUL_x2_DEFINE(228); MUL_x2_DEFINE(229); MUL_x2_DEFINE(230); MUL_x2_DEFINE(231);
+MUL_x2_DEFINE(232); MUL_x2_DEFINE(233); MUL_x2_DEFINE(234); MUL_x2_DEFINE(235);
+MUL_x2_DEFINE(236); MUL_x2_DEFINE(237); MUL_x2_DEFINE(238); MUL_x2_DEFINE(239);
+MUL_x2_DEFINE(240); MUL_x2_DEFINE(241); MUL_x2_DEFINE(242); MUL_x2_DEFINE(243);
+MUL_x2_DEFINE(244); MUL_x2_DEFINE(245); MUL_x2_DEFINE(246); MUL_x2_DEFINE(247);
+MUL_x2_DEFINE(248); MUL_x2_DEFINE(249); MUL_x2_DEFINE(250); MUL_x2_DEFINE(251);
+MUL_x2_DEFINE(252); MUL_x2_DEFINE(253); MUL_x2_DEFINE(254); MUL_x2_DEFINE(255);
+
+
+typedef void (*mul_fn_ptr_t)(void);
+
+static const mul_fn_ptr_t __attribute__((aligned(256)))
+gf_x2_mul_fns[256] = {
+ mul_x2_0, mul_x2_1, mul_x2_2, mul_x2_3, mul_x2_4, mul_x2_5,
+ mul_x2_6, mul_x2_7, mul_x2_8, mul_x2_9, mul_x2_10, mul_x2_11,
+ mul_x2_12, mul_x2_13, mul_x2_14, mul_x2_15, mul_x2_16, mul_x2_17,
+ mul_x2_18, mul_x2_19, mul_x2_20, mul_x2_21, mul_x2_22, mul_x2_23,
+ mul_x2_24, mul_x2_25, mul_x2_26, mul_x2_27, mul_x2_28, mul_x2_29,
+ mul_x2_30, mul_x2_31, mul_x2_32, mul_x2_33, mul_x2_34, mul_x2_35,
+ mul_x2_36, mul_x2_37, mul_x2_38, mul_x2_39, mul_x2_40, mul_x2_41,
+ mul_x2_42, mul_x2_43, mul_x2_44, mul_x2_45, mul_x2_46, mul_x2_47,
+ mul_x2_48, mul_x2_49, mul_x2_50, mul_x2_51, mul_x2_52, mul_x2_53,
+ mul_x2_54, mul_x2_55, mul_x2_56, mul_x2_57, mul_x2_58, mul_x2_59,
+ mul_x2_60, mul_x2_61, mul_x2_62, mul_x2_63, mul_x2_64, mul_x2_65,
+ mul_x2_66, mul_x2_67, mul_x2_68, mul_x2_69, mul_x2_70, mul_x2_71,
+ mul_x2_72, mul_x2_73, mul_x2_74, mul_x2_75, mul_x2_76, mul_x2_77,
+ mul_x2_78, mul_x2_79, mul_x2_80, mul_x2_81, mul_x2_82, mul_x2_83,
+ mul_x2_84, mul_x2_85, mul_x2_86, mul_x2_87, mul_x2_88, mul_x2_89,
+ mul_x2_90, mul_x2_91, mul_x2_92, mul_x2_93, mul_x2_94, mul_x2_95,
+ mul_x2_96, mul_x2_97, mul_x2_98, mul_x2_99, mul_x2_100, mul_x2_101,
+ mul_x2_102, mul_x2_103, mul_x2_104, mul_x2_105, mul_x2_106, mul_x2_107,
+ mul_x2_108, mul_x2_109, mul_x2_110, mul_x2_111, mul_x2_112, mul_x2_113,
+ mul_x2_114, mul_x2_115, mul_x2_116, mul_x2_117, mul_x2_118, mul_x2_119,
+ mul_x2_120, mul_x2_121, mul_x2_122, mul_x2_123, mul_x2_124, mul_x2_125,
+ mul_x2_126, mul_x2_127, mul_x2_128, mul_x2_129, mul_x2_130, mul_x2_131,
+ mul_x2_132, mul_x2_133, mul_x2_134, mul_x2_135, mul_x2_136, mul_x2_137,
+ mul_x2_138, mul_x2_139, mul_x2_140, mul_x2_141, mul_x2_142, mul_x2_143,
+ mul_x2_144, mul_x2_145, mul_x2_146, mul_x2_147, mul_x2_148, mul_x2_149,
+ mul_x2_150, mul_x2_151, mul_x2_152, mul_x2_153, mul_x2_154, mul_x2_155,
+ mul_x2_156, mul_x2_157, mul_x2_158, mul_x2_159, mul_x2_160, mul_x2_161,
+ mul_x2_162, mul_x2_163, mul_x2_164, mul_x2_165, mul_x2_166, mul_x2_167,
+ mul_x2_168, mul_x2_169, mul_x2_170, mul_x2_171, mul_x2_172, mul_x2_173,
+ mul_x2_174, mul_x2_175, mul_x2_176, mul_x2_177, mul_x2_178, mul_x2_179,
+ mul_x2_180, mul_x2_181, mul_x2_182, mul_x2_183, mul_x2_184, mul_x2_185,
+ mul_x2_186, mul_x2_187, mul_x2_188, mul_x2_189, mul_x2_190, mul_x2_191,
+ mul_x2_192, mul_x2_193, mul_x2_194, mul_x2_195, mul_x2_196, mul_x2_197,
+ mul_x2_198, mul_x2_199, mul_x2_200, mul_x2_201, mul_x2_202, mul_x2_203,
+ mul_x2_204, mul_x2_205, mul_x2_206, mul_x2_207, mul_x2_208, mul_x2_209,
+ mul_x2_210, mul_x2_211, mul_x2_212, mul_x2_213, mul_x2_214, mul_x2_215,
+ mul_x2_216, mul_x2_217, mul_x2_218, mul_x2_219, mul_x2_220, mul_x2_221,
+ mul_x2_222, mul_x2_223, mul_x2_224, mul_x2_225, mul_x2_226, mul_x2_227,
+ mul_x2_228, mul_x2_229, mul_x2_230, mul_x2_231, mul_x2_232, mul_x2_233,
+ mul_x2_234, mul_x2_235, mul_x2_236, mul_x2_237, mul_x2_238, mul_x2_239,
+ mul_x2_240, mul_x2_241, mul_x2_242, mul_x2_243, mul_x2_244, mul_x2_245,
+ mul_x2_246, mul_x2_247, mul_x2_248, mul_x2_249, mul_x2_250, mul_x2_251,
+ mul_x2_252, mul_x2_253, mul_x2_254, mul_x2_255
+};
+
+#define MUL(c, r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 4: \
+ COPY(R_01(r), _mul_x2_in); \
+ gf_x2_mul_fns[c](); \
+ COPY(_mul_x2_acc, R_01(r)); \
+ COPY(R_23(r), _mul_x2_in); \
+ gf_x2_mul_fns[c](); \
+ COPY(_mul_x2_acc, R_23(r)); \
+ break; \
+ default: \
+ VERIFY(0); \
+ } \
+}
+
+
+#define raidz_math_begin() kfpu_begin()
+#define raidz_math_end() kfpu_end()
+
+
+#define SYN_STRIDE 4
+
+#define ZERO_STRIDE 4
+#define ZERO_DEFINE() {}
+#define ZERO_D 0, 1, 2, 3
+
+#define COPY_STRIDE 4
+#define COPY_DEFINE() {}
+#define COPY_D 0, 1, 2, 3
+
+#define ADD_STRIDE 4
+#define ADD_DEFINE() {}
+#define ADD_D 0, 1, 2, 3
+
+#define MUL_STRIDE 4
+#define MUL_DEFINE() MUL2_SETUP()
+#define MUL_D 0, 1, 2, 3
+
+#define GEN_P_STRIDE 4
+#define GEN_P_DEFINE() {}
+#define GEN_P_P 0, 1, 2, 3
+
+#define GEN_PQ_STRIDE 4
+#define GEN_PQ_DEFINE() {}
+#define GEN_PQ_D 0, 1, 2, 3
+#define GEN_PQ_C 4, 5, 6, 7
+
+#define GEN_PQR_STRIDE 4
+#define GEN_PQR_DEFINE() {}
+#define GEN_PQR_D 0, 1, 2, 3
+#define GEN_PQR_C 4, 5, 6, 7
+
+#define SYN_Q_DEFINE() {}
+#define SYN_Q_D 0, 1, 2, 3
+#define SYN_Q_X 4, 5, 6, 7
+
+#define SYN_R_DEFINE() {}
+#define SYN_R_D 0, 1, 2, 3
+#define SYN_R_X 4, 5, 6, 7
+
+#define SYN_PQ_DEFINE() {}
+#define SYN_PQ_D 0, 1, 2, 3
+#define SYN_PQ_X 4, 5, 6, 7
+
+#define REC_PQ_STRIDE 4
+#define REC_PQ_DEFINE() MUL2_SETUP()
+#define REC_PQ_X 0, 1, 2, 3
+#define REC_PQ_Y 4, 5, 6, 7
+#define REC_PQ_T 8, 9, 10, 11
+
+#define SYN_PR_DEFINE() {}
+#define SYN_PR_D 0, 1, 2, 3
+#define SYN_PR_X 4, 5, 6, 7
+
+#define REC_PR_STRIDE 4
+#define REC_PR_DEFINE() MUL2_SETUP()
+#define REC_PR_X 0, 1, 2, 3
+#define REC_PR_Y 4, 5, 6, 7
+#define REC_PR_T 8, 9, 10, 11
+
+#define SYN_QR_DEFINE() {}
+#define SYN_QR_D 0, 1, 2, 3
+#define SYN_QR_X 4, 5, 6, 7
+
+#define REC_QR_STRIDE 4
+#define REC_QR_DEFINE() MUL2_SETUP()
+#define REC_QR_X 0, 1, 2, 3
+#define REC_QR_Y 4, 5, 6, 7
+#define REC_QR_T 8, 9, 10, 11
+
+#define SYN_PQR_DEFINE() {}
+#define SYN_PQR_D 0, 1, 2, 3
+#define SYN_PQR_X 4, 5, 6, 7
+
+#define REC_PQR_STRIDE 4
+#define REC_PQR_DEFINE() MUL2_SETUP()
+#define REC_PQR_X 0, 1, 2, 3
+#define REC_PQR_Y 4, 5, 6, 7
+#define REC_PQR_Z 8, 9, 10, 11
+#define REC_PQR_XS 12, 13, 14, 15
+#define REC_PQR_YS 16, 17, 18, 19
+
+
+#include <sys/vdev_raidz_impl.h>
+#include "vdev_raidz_math_impl.h"
+
+DEFINE_GEN_METHODS(avx512f);
+DEFINE_REC_METHODS(avx512f);
+
+static boolean_t
+raidz_will_avx512f_work(void)
+{
+ return (kfpu_allowed() && zfs_avx_available() &&
+ zfs_avx2_available() && zfs_avx512f_available());
+}
+
+const raidz_impl_ops_t vdev_raidz_avx512f_impl = {
+ .init = NULL,
+ .fini = NULL,
+ .gen = RAIDZ_GEN_METHODS(avx512f),
+ .rec = RAIDZ_REC_METHODS(avx512f),
+ .is_supported = &raidz_will_avx512f_work,
+ .name = "avx512f"
+};
+
+#endif /* defined(__x86_64) && defined(HAVE_AVX512F) */
diff --git a/sys/contrib/openzfs/module/zfs/vdev_raidz_math_impl.h b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_impl.h
new file mode 100644
index 000000000000..35e016fc65a5
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_impl.h
@@ -0,0 +1,1502 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (C) 2016 Gvozden Nešković. All rights reserved.
+ */
+
+#ifndef _VDEV_RAIDZ_MATH_IMPL_H
+#define _VDEV_RAIDZ_MATH_IMPL_H
+
+#include <sys/types.h>
+#include <sys/vdev_raidz_impl.h>
+
+#define raidz_inline inline __attribute__((always_inline))
+#ifndef noinline
+#define noinline __attribute__((noinline))
+#endif
+
+/*
+ * Functions calculate multiplication constants for data reconstruction.
+ * Coefficients depend on RAIDZ geometry, indexes of failed child vdevs, and
+ * used parity columns for reconstruction.
+ * @rr RAIDZ row
+ * @tgtidx array of missing data indexes
+ * @coeff output array of coefficients. Array must be provided by
+ * user and must hold minimum MUL_CNT values.
+ */
+static noinline void
+raidz_rec_q_coeff(const raidz_row_t *rr, const int *tgtidx, unsigned *coeff)
+{
+ const unsigned ncols = rr->rr_cols;
+ const unsigned x = tgtidx[TARGET_X];
+
+ coeff[MUL_Q_X] = gf_exp2(255 - (ncols - x - 1));
+}
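+
+/*
+ * Worked example for the Q coefficient above (a sketch of the reasoning,
+ * assuming gf_exp2(n) evaluates 2^n in GF(2^8)): the data column at index x
+ * is weighted by 2^(ncols - x - 1) in the Q parity, so once the Q syndrome
+ * has cancelled every other column we are left with
+ *
+ *	Qsyn + Q = 2^(ncols - x - 1) * Dx
+ *
+ * and Dx is recovered by multiplying with the inverse weight.  Because the
+ * non-zero elements of GF(2^8) form a multiplicative group of order 255,
+ * that inverse is 2^(255 - (ncols - x - 1)), which is exactly the constant
+ * stored in coeff[MUL_Q_X].
+ */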
+
+static noinline void
+raidz_rec_r_coeff(const raidz_row_t *rr, const int *tgtidx, unsigned *coeff)
+{
+ const unsigned ncols = rr->rr_cols;
+ const unsigned x = tgtidx[TARGET_X];
+
+ coeff[MUL_R_X] = gf_exp4(255 - (ncols - x - 1));
+}
+
+static noinline void
+raidz_rec_pq_coeff(const raidz_row_t *rr, const int *tgtidx, unsigned *coeff)
+{
+ const unsigned ncols = rr->rr_cols;
+ const unsigned x = tgtidx[TARGET_X];
+ const unsigned y = tgtidx[TARGET_Y];
+ gf_t a, b, e;
+
+ a = gf_exp2(x + 255 - y);
+ b = gf_exp2(255 - (ncols - x - 1));
+ e = a ^ 0x01;
+
+ coeff[MUL_PQ_X] = gf_div(a, e);
+ coeff[MUL_PQ_Y] = gf_div(b, e);
+}
+
+static noinline void
+raidz_rec_pr_coeff(const raidz_row_t *rr, const int *tgtidx, unsigned *coeff)
+{
+ const unsigned ncols = rr->rr_cols;
+ const unsigned x = tgtidx[TARGET_X];
+ const unsigned y = tgtidx[TARGET_Y];
+
+ gf_t a, b, e;
+
+ a = gf_exp4(x + 255 - y);
+ b = gf_exp4(255 - (ncols - x - 1));
+ e = a ^ 0x01;
+
+ coeff[MUL_PR_X] = gf_div(a, e);
+ coeff[MUL_PR_Y] = gf_div(b, e);
+}
+
+static noinline void
+raidz_rec_qr_coeff(const raidz_row_t *rr, const int *tgtidx, unsigned *coeff)
+{
+ const unsigned ncols = rr->rr_cols;
+ const unsigned x = tgtidx[TARGET_X];
+ const unsigned y = tgtidx[TARGET_Y];
+
+ gf_t nx, ny, nxxy, nxyy, d;
+
+ nx = gf_exp2(ncols - x - 1);
+ ny = gf_exp2(ncols - y - 1);
+ nxxy = gf_mul(gf_mul(nx, nx), ny);
+ nxyy = gf_mul(gf_mul(nx, ny), ny);
+ d = nxxy ^ nxyy;
+
+ coeff[MUL_QR_XQ] = ny;
+ coeff[MUL_QR_X] = gf_div(ny, d);
+ coeff[MUL_QR_YQ] = nx;
+ coeff[MUL_QR_Y] = gf_div(nx, d);
+}
+
+static noinline void
+raidz_rec_pqr_coeff(const raidz_row_t *rr, const int *tgtidx, unsigned *coeff)
+{
+ const unsigned ncols = rr->rr_cols;
+ const unsigned x = tgtidx[TARGET_X];
+ const unsigned y = tgtidx[TARGET_Y];
+ const unsigned z = tgtidx[TARGET_Z];
+
+ gf_t nx, ny, nz, nxx, nyy, nzz, nyyz, nyzz, xd, yd;
+
+ nx = gf_exp2(ncols - x - 1);
+ ny = gf_exp2(ncols - y - 1);
+ nz = gf_exp2(ncols - z - 1);
+
+ nxx = gf_exp4(ncols - x - 1);
+ nyy = gf_exp4(ncols - y - 1);
+ nzz = gf_exp4(ncols - z - 1);
+
+ nyyz = gf_mul(gf_mul(ny, nz), ny);
+ nyzz = gf_mul(nzz, ny);
+
+ xd = gf_mul(nxx, ny) ^ gf_mul(nx, nyy) ^ nyyz ^
+ gf_mul(nxx, nz) ^ gf_mul(nzz, nx) ^ nyzz;
+
+ yd = gf_inv(ny ^ nz);
+
+ coeff[MUL_PQR_XP] = gf_div(nyyz ^ nyzz, xd);
+ coeff[MUL_PQR_XQ] = gf_div(nyy ^ nzz, xd);
+ coeff[MUL_PQR_XR] = gf_div(ny ^ nz, xd);
+ coeff[MUL_PQR_YU] = nx;
+ coeff[MUL_PQR_YP] = gf_mul(nz, yd);
+ coeff[MUL_PQR_YQ] = yd;
+}
+
+/*
+ * Method for zeroing a buffer (can be implemented using SIMD).
+ * This method is used by multiple gen/rec functions.
+ *
+ * @dc Destination buffer
+ * @dsize Destination buffer size
+ * @private Unused
+ */
+static int
+raidz_zero_abd_cb(void *dc, size_t dsize, void *private)
+{
+ v_t *dst = (v_t *)dc;
+ size_t i;
+
+ ZERO_DEFINE();
+
+ (void) private; /* unused */
+
+ ZERO(ZERO_D);
+
+ for (i = 0; i < dsize / sizeof (v_t); i += (2 * ZERO_STRIDE)) {
+ STORE(dst + i, ZERO_D);
+ STORE(dst + i + ZERO_STRIDE, ZERO_D);
+ }
+
+ return (0);
+}
+
+#define raidz_zero(dabd, size) \
+{ \
+ abd_iterate_func(dabd, 0, size, raidz_zero_abd_cb, NULL); \
+}
+
+/*
+ * Method for copying one buffer into another (can be implemented using SIMD).
+ * This method is used by multiple gen/rec functions.
+ *
+ * @dc Destination buffer
+ * @sc Source buffer
+ * @size Size of both buffers
+ * @private Unused
+ */
+static int
+raidz_copy_abd_cb(void *dc, void *sc, size_t size, void *private)
+{
+ v_t *dst = (v_t *)dc;
+ const v_t *src = (v_t *)sc;
+ size_t i;
+
+ COPY_DEFINE();
+
+ (void) private; /* unused */
+
+ for (i = 0; i < size / sizeof (v_t); i += (2 * COPY_STRIDE)) {
+ LOAD(src + i, COPY_D);
+ STORE(dst + i, COPY_D);
+
+ LOAD(src + i + COPY_STRIDE, COPY_D);
+ STORE(dst + i + COPY_STRIDE, COPY_D);
+ }
+
+ return (0);
+}
+
+
+#define raidz_copy(dabd, sabd, size) \
+{ \
+ abd_iterate_func2(dabd, sabd, 0, 0, size, raidz_copy_abd_cb, NULL);\
+}
+
+/*
+ * Method for adding (XORing) two buffers.
+ * Source and destination are XORed together and the result is stored in the
+ * destination buffer. This method is used by multiple gen/rec functions.
+ *
+ * @dc Destination buffer
+ * @sc Source buffer
+ * @size Size of both buffers
+ * @private Unused
+ */
+static int
+raidz_add_abd_cb(void *dc, void *sc, size_t size, void *private)
+{
+ v_t *dst = (v_t *)dc;
+ const v_t *src = (v_t *)sc;
+ size_t i;
+
+ ADD_DEFINE();
+
+ (void) private; /* unused */
+
+ for (i = 0; i < size / sizeof (v_t); i += (2 * ADD_STRIDE)) {
+ LOAD(dst + i, ADD_D);
+ XOR_ACC(src + i, ADD_D);
+ STORE(dst + i, ADD_D);
+
+ LOAD(dst + i + ADD_STRIDE, ADD_D);
+ XOR_ACC(src + i + ADD_STRIDE, ADD_D);
+ STORE(dst + i + ADD_STRIDE, ADD_D);
+ }
+
+ return (0);
+}
+
+#define raidz_add(dabd, sabd, size) \
+{ \
+ abd_iterate_func2(dabd, sabd, 0, 0, size, raidz_add_abd_cb, NULL);\
+}
+
+/*
+ * Method for multiplying a buffer with a constant in GF(2^8).
+ * Symbols from the buffer are multiplied by the constant and the result is
+ * stored back in the same buffer.
+ *
+ * @dc In/Out data buffer.
+ * @size Size of the buffer
+ * @private pointer to the multiplication constant (unsigned)
+ */
+static int
+raidz_mul_abd_cb(void *dc, size_t size, void *private)
+{
+ const unsigned mul = *((unsigned *)private);
+ v_t *d = (v_t *)dc;
+ size_t i;
+
+ MUL_DEFINE();
+
+ for (i = 0; i < size / sizeof (v_t); i += (2 * MUL_STRIDE)) {
+ LOAD(d + i, MUL_D);
+ MUL(mul, MUL_D);
+ STORE(d + i, MUL_D);
+
+ LOAD(d + i + MUL_STRIDE, MUL_D);
+ MUL(mul, MUL_D);
+ STORE(d + i + MUL_STRIDE, MUL_D);
+ }
+
+ return (0);
+}
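+
+/*
+ * Usage sketch (an illustration mirroring how the reconstruct functions
+ * below drive this callback): the multiplication constant travels through
+ * the opaque private pointer of abd_iterate_func(), e.g.
+ *
+ *	unsigned coeff[MUL_CNT];
+ *	raidz_rec_q_coeff(rr, tgtidx, coeff);
+ *	abd_iterate_func(xabd, 0, xsize, raidz_mul_abd_cb, (void *)coeff);
+ */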
+
+
+/*
+ * Syndrome generation/update macros
+ *
+ * Require LOAD(), XOR(), STORE(), MUL2(), and MUL4() macros
+ */
+#define P_D_SYNDROME(D, T, t) \
+{ \
+ LOAD((t), T); \
+ XOR(D, T); \
+ STORE((t), T); \
+}
+
+#define Q_D_SYNDROME(D, T, t) \
+{ \
+ LOAD((t), T); \
+ MUL2(T); \
+ XOR(D, T); \
+ STORE((t), T); \
+}
+
+#define Q_SYNDROME(T, t) \
+{ \
+ LOAD((t), T); \
+ MUL2(T); \
+ STORE((t), T); \
+}
+
+#define R_D_SYNDROME(D, T, t) \
+{ \
+ LOAD((t), T); \
+ MUL4(T); \
+ XOR(D, T); \
+ STORE((t), T); \
+}
+
+#define R_SYNDROME(T, t) \
+{ \
+ LOAD((t), T); \
+ MUL4(T); \
+ STORE((t), T); \
+}
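+
+/*
+ * Scalar view of the macros above (an illustrative sketch; p, q, r and d
+ * stand for one byte of the P, Q, R and data columns, and gf_mul() is the
+ * generic GF(2^8) multiply used by the coefficient functions above):
+ *
+ *	p ^= d;				P_D_SYNDROME
+ *	q = gf_mul(q, 2) ^ d;		Q_D_SYNDROME
+ *	q = gf_mul(q, 2);		Q_SYNDROME (past a short data column)
+ *	r = gf_mul(r, 4) ^ d;		R_D_SYNDROME
+ *	r = gf_mul(r, 4);		R_SYNDROME (past a short data column)
+ *
+ * Applying the *_D_SYNDROME update once per data column, left to right,
+ * builds the parities Horner-style, so the column at index c ends up
+ * weighted by 2^(ncols - c - 1) in Q and by 4^(ncols - c - 1) in R.
+ */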
+
+
+/*
+ * PARITY CALCULATION
+ *
+ * Macros *_SYNDROME are used for parity/syndrome calculation.
+ * *_D_SYNDROME() macros calculate the syndrome from offset 0 to the length
+ * of the data column, while *_SYNDROME() macros only keep updating the
+ * parity/syndrome past the end of a shorter data column.
+ *
+ * P parity is calculated using raidz_add_abd().
+ */
+
+/*
+ * Generate P parity (RAIDZ1)
+ *
+ * @rr RAIDZ row
+ */
+static raidz_inline void
+raidz_generate_p_impl(raidz_row_t * const rr)
+{
+ size_t c;
+ const size_t ncols = rr->rr_cols;
+ const size_t psize = rr->rr_col[CODE_P].rc_size;
+ abd_t *pabd = rr->rr_col[CODE_P].rc_abd;
+ size_t size;
+ abd_t *dabd;
+
+ raidz_math_begin();
+
+ /* start with first data column */
+ raidz_copy(pabd, rr->rr_col[1].rc_abd, psize);
+
+ for (c = 2; c < ncols; c++) {
+ dabd = rr->rr_col[c].rc_abd;
+ size = rr->rr_col[c].rc_size;
+
+ /* add data column */
+ raidz_add(pabd, dabd, size);
+ }
+
+ raidz_math_end();
+}
+
+
+/*
+ * Generate PQ parity (RAIDZ2)
+ * The function is called per data column.
+ *
+ * @c array of pointers to parity (code) columns
+ * @dc pointer to data column
+ * @csize size of parity columns
+ * @dsize size of data column
+ */
+static void
+raidz_gen_pq_add(void **c, const void *dc, const size_t csize,
+ const size_t dsize)
+{
+ v_t *p = (v_t *)c[0];
+ v_t *q = (v_t *)c[1];
+ const v_t *d = (const v_t *)dc;
+ const v_t * const dend = d + (dsize / sizeof (v_t));
+ const v_t * const qend = q + (csize / sizeof (v_t));
+
+ GEN_PQ_DEFINE();
+
+ MUL2_SETUP();
+
+ for (; d < dend; d += GEN_PQ_STRIDE, p += GEN_PQ_STRIDE,
+ q += GEN_PQ_STRIDE) {
+ LOAD(d, GEN_PQ_D);
+ P_D_SYNDROME(GEN_PQ_D, GEN_PQ_C, p);
+ Q_D_SYNDROME(GEN_PQ_D, GEN_PQ_C, q);
+ }
+ for (; q < qend; q += GEN_PQ_STRIDE) {
+ Q_SYNDROME(GEN_PQ_C, q);
+ }
+}
+
+
+/*
+ * Generate PQ parity (RAIDZ2)
+ *
+ * @rr RAIDZ row
+ */
+static raidz_inline void
+raidz_generate_pq_impl(raidz_row_t * const rr)
+{
+ size_t c;
+ const size_t ncols = rr->rr_cols;
+ const size_t csize = rr->rr_col[CODE_P].rc_size;
+ size_t dsize;
+ abd_t *dabd;
+ abd_t *cabds[] = {
+ rr->rr_col[CODE_P].rc_abd,
+ rr->rr_col[CODE_Q].rc_abd
+ };
+
+ raidz_math_begin();
+
+ raidz_copy(cabds[CODE_P], rr->rr_col[2].rc_abd, csize);
+ raidz_copy(cabds[CODE_Q], rr->rr_col[2].rc_abd, csize);
+
+ for (c = 3; c < ncols; c++) {
+ dabd = rr->rr_col[c].rc_abd;
+ dsize = rr->rr_col[c].rc_size;
+
+ abd_raidz_gen_iterate(cabds, dabd, csize, dsize, 2,
+ raidz_gen_pq_add);
+ }
+
+ raidz_math_end();
+}
+
+
+/*
+ * Generate PQR parity (RAIDZ3)
+ * The function is called per data column.
+ *
+ * @c array of pointers to parity (code) columns
+ * @dc pointer to data column
+ * @csize size of parity columns
+ * @dsize size of data column
+ */
+static void
+raidz_gen_pqr_add(void **c, const void *dc, const size_t csize,
+ const size_t dsize)
+{
+ v_t *p = (v_t *)c[0];
+ v_t *q = (v_t *)c[1];
+ v_t *r = (v_t *)c[CODE_R];
+ const v_t *d = (const v_t *)dc;
+ const v_t * const dend = d + (dsize / sizeof (v_t));
+ const v_t * const qend = q + (csize / sizeof (v_t));
+
+ GEN_PQR_DEFINE();
+
+ MUL2_SETUP();
+
+ for (; d < dend; d += GEN_PQR_STRIDE, p += GEN_PQR_STRIDE,
+ q += GEN_PQR_STRIDE, r += GEN_PQR_STRIDE) {
+ LOAD(d, GEN_PQR_D);
+ P_D_SYNDROME(GEN_PQR_D, GEN_PQR_C, p);
+ Q_D_SYNDROME(GEN_PQR_D, GEN_PQR_C, q);
+ R_D_SYNDROME(GEN_PQR_D, GEN_PQR_C, r);
+ }
+ for (; q < qend; q += GEN_PQR_STRIDE, r += GEN_PQR_STRIDE) {
+ Q_SYNDROME(GEN_PQR_C, q);
+ R_SYNDROME(GEN_PQR_C, r);
+ }
+}
+
+
+/*
+ * Generate PQR parity (RAIDZ3)
+ *
+ * @rr RAIDZ row
+ */
+static raidz_inline void
+raidz_generate_pqr_impl(raidz_row_t * const rr)
+{
+ size_t c;
+ const size_t ncols = rr->rr_cols;
+ const size_t csize = rr->rr_col[CODE_P].rc_size;
+ size_t dsize;
+ abd_t *dabd;
+ abd_t *cabds[] = {
+ rr->rr_col[CODE_P].rc_abd,
+ rr->rr_col[CODE_Q].rc_abd,
+ rr->rr_col[CODE_R].rc_abd
+ };
+
+ raidz_math_begin();
+
+ raidz_copy(cabds[CODE_P], rr->rr_col[3].rc_abd, csize);
+ raidz_copy(cabds[CODE_Q], rr->rr_col[3].rc_abd, csize);
+ raidz_copy(cabds[CODE_R], rr->rr_col[3].rc_abd, csize);
+
+ for (c = 4; c < ncols; c++) {
+ dabd = rr->rr_col[c].rc_abd;
+ dsize = rr->rr_col[c].rc_size;
+
+ abd_raidz_gen_iterate(cabds, dabd, csize, dsize, 3,
+ raidz_gen_pqr_add);
+ }
+
+ raidz_math_end();
+}
+
+
+/*
+ * DATA RECONSTRUCTION
+ *
+ * Data reconstruction process consists of two phases:
+ * - Syndrome calculation
+ * - Data reconstruction
+ *
+ * The syndrome is calculated by generating parity using the available data
+ * columns and zeros in place of the erasures. The existing parity is then
+ * added to the corresponding [P|Q|R]syn value, per the equations:
+ * P = Psyn + Dx + Dy + Dz
+ * Q = Qsyn + 2^x * Dx + 2^y * Dy + 2^z * Dz
+ * R = Rsyn + 4^x * Dx + 4^y * Dy + 4^z * Dz
+ *
+ * In the data reconstruction phase, the corresponding equations are solved
+ * for the missing data (Dx, Dy, Dz). This generally involves multiplying
+ * known symbols by a coefficient and adding them together. The
+ * multiplication constants are calculated ahead of the operation in the
+ * raidz_rec_[q|r|pq|pr|qr|pqr]_coeff() functions.
+ *
+ * IMPLEMENTATION NOTE: RAID-Z block can have complex geometry, with "big"
+ * and "short" columns.
+ * For this reason, reconstruction is performed in a minimum of
+ * two steps: first from offset 0 to short_size, then from short_size to
+ * big_size. Calculation functions REC_[*]_BLOCK() are implemented to work
+ * over both ranges. The split also enables removal of conditional expressions
+ * from loop bodies, improving throughput of SIMD implementations.
+ * For the best performance, all functions marked with raidz_inline attribute
+ * must be inlined by compiler.
+ *
+ * parity data
+ * columns columns
+ * <----------> <------------------>
+ * x y <----+ missing columns (x, y)
+ * | |
+ * +---+---+---+---+-v-+---+-v-+---+ ^ 0
+ * | | | | | | | | | |
+ * | | | | | | | | | |
+ * | P | Q | R | D | D | D | D | D | |
+ * | | | | 0 | 1 | 2 | 3 | 4 | |
+ * | | | | | | | | | v
+ * | | | | | +---+---+---+ ^ short_size
+ * | | | | | | |
+ * +---+---+---+---+---+ v big_size
+ * <------------------> <---------->
+ * big columns short columns
+ *
+ */
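+
+/*
+ * Scalar sketch of the single-erasure flow described above (illustrative
+ * only: it works on one byte lane instead of abd buffers, ignores the
+ * big/short column split, and the function name and parameters are made up
+ * for this sketch; gf_mul() and gf_exp2() are the helpers used by the
+ * coefficient functions above):
+ */
+static inline void
+raidz_rec_q_scalar_sketch(uint8_t **col, size_t ncols, size_t firstdc,
+    size_t x, const uint8_t *qpar, size_t len)
+{
+	const uint8_t qcoeff = gf_exp2(255 - (ncols - x - 1));
+	size_t c, i;
+
+	for (i = 0; i < len; i++) {
+		uint8_t qsyn = 0;
+
+		/* Q parity over available data, zero at the erasure */
+		for (c = firstdc; c < ncols; c++)
+			qsyn = gf_mul(qsyn, 2) ^ (c == x ? 0 : col[c][i]);
+
+		/* add Q and scale by the precomputed coefficient */
+		col[x][i] = gf_mul(qsyn ^ qpar[i], qcoeff);
+	}
+}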
+
+
+
+
+/*
+ * Reconstruct single data column using P parity
+ *
+ * @syn_method raidz_add_abd()
+ * @rec_method not applicable
+ *
+ * @rr RAIDZ row
+ * @tgtidx array of missing data indexes
+ */
+static raidz_inline int
+raidz_reconstruct_p_impl(raidz_row_t *rr, const int *tgtidx)
+{
+ size_t c;
+ const size_t firstdc = rr->rr_firstdatacol;
+ const size_t ncols = rr->rr_cols;
+ const size_t x = tgtidx[TARGET_X];
+ const size_t xsize = rr->rr_col[x].rc_size;
+ abd_t *xabd = rr->rr_col[x].rc_abd;
+ size_t size;
+ abd_t *dabd;
+
+ if (xabd == NULL)
+ return (1 << CODE_P);
+
+ raidz_math_begin();
+
+ /* copy P into target */
+ raidz_copy(xabd, rr->rr_col[CODE_P].rc_abd, xsize);
+
+ /* generate p_syndrome */
+ for (c = firstdc; c < ncols; c++) {
+ if (c == x)
+ continue;
+
+ dabd = rr->rr_col[c].rc_abd;
+ size = MIN(rr->rr_col[c].rc_size, xsize);
+
+ raidz_add(xabd, dabd, size);
+ }
+
+ raidz_math_end();
+
+ return (1 << CODE_P);
+}
+
+
+/*
+ * Generate Q syndrome (Qsyn)
+ *
+ * @xc array of pointers to syndrome columns
+ * @dc data column (NULL if missing)
+ * @xsize size of syndrome columns
+ * @dsize size of data column (0 if missing)
+ */
+static void
+raidz_syn_q_abd(void **xc, const void *dc, const size_t xsize,
+ const size_t dsize)
+{
+ v_t *x = (v_t *)xc[TARGET_X];
+ const v_t *d = (const v_t *)dc;
+ const v_t * const dend = d + (dsize / sizeof (v_t));
+ const v_t * const xend = x + (xsize / sizeof (v_t));
+
+ SYN_Q_DEFINE();
+
+ MUL2_SETUP();
+
+ for (; d < dend; d += SYN_STRIDE, x += SYN_STRIDE) {
+ LOAD(d, SYN_Q_D);
+ Q_D_SYNDROME(SYN_Q_D, SYN_Q_X, x);
+ }
+ for (; x < xend; x += SYN_STRIDE) {
+ Q_SYNDROME(SYN_Q_X, x);
+ }
+}
+
+
+/*
+ * Reconstruct single data column using Q parity
+ *
+ * @syn_method raidz_add_abd()
+ * @rec_method raidz_mul_abd_cb()
+ *
+ * @rr RAIDZ row
+ * @tgtidx array of missing data indexes
+ */
+static raidz_inline int
+raidz_reconstruct_q_impl(raidz_row_t *rr, const int *tgtidx)
+{
+ size_t c;
+ size_t dsize;
+ abd_t *dabd;
+ const size_t firstdc = rr->rr_firstdatacol;
+ const size_t ncols = rr->rr_cols;
+ const size_t x = tgtidx[TARGET_X];
+ abd_t *xabd = rr->rr_col[x].rc_abd;
+ const size_t xsize = rr->rr_col[x].rc_size;
+ abd_t *tabds[] = { xabd };
+
+ if (xabd == NULL)
+ return (1 << CODE_Q);
+
+ unsigned coeff[MUL_CNT];
+ raidz_rec_q_coeff(rr, tgtidx, coeff);
+
+ raidz_math_begin();
+
+ /* Start with first data column if present */
+ if (firstdc != x) {
+ raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, xsize);
+ } else {
+ raidz_zero(xabd, xsize);
+ }
+
+ /* generate q_syndrome */
+ for (c = firstdc+1; c < ncols; c++) {
+ if (c == x) {
+ dabd = NULL;
+ dsize = 0;
+ } else {
+ dabd = rr->rr_col[c].rc_abd;
+ dsize = rr->rr_col[c].rc_size;
+ }
+
+ abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 1,
+ raidz_syn_q_abd);
+ }
+
+ /* add Q to the syndrome */
+ raidz_add(xabd, rr->rr_col[CODE_Q].rc_abd, xsize);
+
+ /* transform the syndrome */
+ abd_iterate_func(xabd, 0, xsize, raidz_mul_abd_cb, (void*) coeff);
+
+ raidz_math_end();
+
+ return (1 << CODE_Q);
+}
+
+
+/*
+ * Generate R syndrome (Rsyn)
+ *
+ * @xc array of pointers to syndrome columns
+ * @dc data column (NULL if missing)
+ * @tsize size of syndrome columns
+ * @dsize size of data column (0 if missing)
+ */
+static void
+raidz_syn_r_abd(void **xc, const void *dc, const size_t tsize,
+ const size_t dsize)
+{
+ v_t *x = (v_t *)xc[TARGET_X];
+ const v_t *d = (const v_t *)dc;
+ const v_t * const dend = d + (dsize / sizeof (v_t));
+ const v_t * const xend = x + (tsize / sizeof (v_t));
+
+ SYN_R_DEFINE();
+
+ MUL2_SETUP();
+
+ for (; d < dend; d += SYN_STRIDE, x += SYN_STRIDE) {
+ LOAD(d, SYN_R_D);
+ R_D_SYNDROME(SYN_R_D, SYN_R_X, x);
+ }
+ for (; x < xend; x += SYN_STRIDE) {
+ R_SYNDROME(SYN_R_X, x);
+ }
+}
+
+
+/*
+ * Reconstruct single data column using R parity
+ *
+ * @syn_method raidz_add_abd()
+ * @rec_method raidz_mul_abd_cb()
+ *
+ * @rr RAIDZ row
+ * @tgtidx array of missing data indexes
+ */
+static raidz_inline int
+raidz_reconstruct_r_impl(raidz_row_t *rr, const int *tgtidx)
+{
+ size_t c;
+ size_t dsize;
+ abd_t *dabd;
+ const size_t firstdc = rr->rr_firstdatacol;
+ const size_t ncols = rr->rr_cols;
+ const size_t x = tgtidx[TARGET_X];
+ const size_t xsize = rr->rr_col[x].rc_size;
+ abd_t *xabd = rr->rr_col[x].rc_abd;
+ abd_t *tabds[] = { xabd };
+
+ if (xabd == NULL)
+ return (1 << CODE_R);
+
+ unsigned coeff[MUL_CNT];
+ raidz_rec_r_coeff(rr, tgtidx, coeff);
+
+ raidz_math_begin();
+
+ /* Start with first data column if present */
+ if (firstdc != x) {
+ raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, xsize);
+ } else {
+ raidz_zero(xabd, xsize);
+ }
+
+
+	/* generate r_syndrome */
+ for (c = firstdc+1; c < ncols; c++) {
+ if (c == x) {
+ dabd = NULL;
+ dsize = 0;
+ } else {
+ dabd = rr->rr_col[c].rc_abd;
+ dsize = rr->rr_col[c].rc_size;
+ }
+
+ abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 1,
+ raidz_syn_r_abd);
+ }
+
+ /* add R to the syndrome */
+ raidz_add(xabd, rr->rr_col[CODE_R].rc_abd, xsize);
+
+ /* transform the syndrome */
+ abd_iterate_func(xabd, 0, xsize, raidz_mul_abd_cb, (void *)coeff);
+
+ raidz_math_end();
+
+ return (1 << CODE_R);
+}
+
+
+/*
+ * Generate P and Q syndromes
+ *
+ * @tc	array of pointers to syndrome columns
+ * @dc data column (NULL if missing)
+ * @tsize size of syndrome columns
+ * @dsize size of data column (0 if missing)
+ */
+static void
+raidz_syn_pq_abd(void **tc, const void *dc, const size_t tsize,
+ const size_t dsize)
+{
+ v_t *x = (v_t *)tc[TARGET_X];
+ v_t *y = (v_t *)tc[TARGET_Y];
+ const v_t *d = (const v_t *)dc;
+ const v_t * const dend = d + (dsize / sizeof (v_t));
+ const v_t * const yend = y + (tsize / sizeof (v_t));
+
+ SYN_PQ_DEFINE();
+
+ MUL2_SETUP();
+
+ for (; d < dend; d += SYN_STRIDE, x += SYN_STRIDE, y += SYN_STRIDE) {
+ LOAD(d, SYN_PQ_D);
+ P_D_SYNDROME(SYN_PQ_D, SYN_PQ_X, x);
+ Q_D_SYNDROME(SYN_PQ_D, SYN_PQ_X, y);
+ }
+ for (; y < yend; y += SYN_STRIDE) {
+ Q_SYNDROME(SYN_PQ_X, y);
+ }
+}
+
+/*
+ * Reconstruct data using PQ parity and PQ syndromes
+ *
+ * @tc syndrome/result columns
+ * @tsize size of syndrome/result columns
+ * @c parity columns
+ * @mul array of multiplication constants
+ */
+static void
+raidz_rec_pq_abd(void **tc, const size_t tsize, void **c,
+ const unsigned *mul)
+{
+ v_t *x = (v_t *)tc[TARGET_X];
+ v_t *y = (v_t *)tc[TARGET_Y];
+ const v_t * const xend = x + (tsize / sizeof (v_t));
+ const v_t *p = (v_t *)c[CODE_P];
+ const v_t *q = (v_t *)c[CODE_Q];
+
+ REC_PQ_DEFINE();
+
+ for (; x < xend; x += REC_PQ_STRIDE, y += REC_PQ_STRIDE,
+ p += REC_PQ_STRIDE, q += REC_PQ_STRIDE) {
+ LOAD(x, REC_PQ_X);
+ LOAD(y, REC_PQ_Y);
+
+ XOR_ACC(p, REC_PQ_X);
+ XOR_ACC(q, REC_PQ_Y);
+
+ /* Save Pxy */
+ COPY(REC_PQ_X, REC_PQ_T);
+
+ /* Calc X */
+ MUL(mul[MUL_PQ_X], REC_PQ_X);
+ MUL(mul[MUL_PQ_Y], REC_PQ_Y);
+ XOR(REC_PQ_Y, REC_PQ_X);
+ STORE(x, REC_PQ_X);
+
+ /* Calc Y */
+ XOR(REC_PQ_T, REC_PQ_X);
+ STORE(y, REC_PQ_X);
+ }
+}
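+
+/*
+ * Why the two constants recover X and Y (a sketch of the algebra, writing
+ * qx = 2^(ncols - x - 1) and qy = 2^(ncols - y - 1) for the Q weights of
+ * the two missing columns):
+ *
+ *	Pxy = X + Y
+ *	Qxy = qx * X + qy * Y
+ *
+ * Eliminating Y gives X = (qy * Pxy + Qxy) / (qx + qy), i.e.
+ *
+ *	X = Pxy * mul[MUL_PQ_X] + Qxy * mul[MUL_PQ_Y]
+ *
+ * because raidz_rec_pq_coeff() computes a = qy / qx, b = 1 / qx and
+ * e = a + 1, so that a / e = qy / (qx + qy) and b / e = 1 / (qx + qy).
+ * Y then follows as Y = Pxy + X, the final XOR in the loop above.  The PR
+ * case below is identical with the R weights 4^(ncols - c - 1); QR and PQR
+ * eliminate unknowns the same way from their own sets of equations.
+ */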
+
+
+/*
+ * Reconstruct two data columns using PQ parity
+ *
+ * @syn_method raidz_syn_pq_abd()
+ * @rec_method raidz_rec_pq_abd()
+ *
+ * @rr RAIDZ row
+ * @tgtidx array of missing data indexes
+ */
+static raidz_inline int
+raidz_reconstruct_pq_impl(raidz_row_t *rr, const int *tgtidx)
+{
+ size_t c;
+ size_t dsize;
+ abd_t *dabd;
+ const size_t firstdc = rr->rr_firstdatacol;
+ const size_t ncols = rr->rr_cols;
+ const size_t x = tgtidx[TARGET_X];
+ const size_t y = tgtidx[TARGET_Y];
+ const size_t xsize = rr->rr_col[x].rc_size;
+ const size_t ysize = rr->rr_col[y].rc_size;
+ abd_t *xabd = rr->rr_col[x].rc_abd;
+ abd_t *yabd = rr->rr_col[y].rc_abd;
+ abd_t *tabds[2] = { xabd, yabd };
+ abd_t *cabds[] = {
+ rr->rr_col[CODE_P].rc_abd,
+ rr->rr_col[CODE_Q].rc_abd
+ };
+
+ if (xabd == NULL)
+ return ((1 << CODE_P) | (1 << CODE_Q));
+
+ unsigned coeff[MUL_CNT];
+ raidz_rec_pq_coeff(rr, tgtidx, coeff);
+
+ /*
+	 * Check if one of the targets is shorter than the other.
+	 * In this case, the shorter target needs to be replaced with a
+	 * new buffer so that the syndrome can be calculated over the
+	 * full length.
+ */
+ if (ysize < xsize) {
+ yabd = abd_alloc(xsize, B_FALSE);
+ tabds[1] = yabd;
+ }
+
+ raidz_math_begin();
+
+ /* Start with first data column if present */
+ if (firstdc != x) {
+ raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, xsize);
+ raidz_copy(yabd, rr->rr_col[firstdc].rc_abd, xsize);
+ } else {
+ raidz_zero(xabd, xsize);
+ raidz_zero(yabd, xsize);
+ }
+
+	/* generate pq_syndrome */
+ for (c = firstdc+1; c < ncols; c++) {
+ if (c == x || c == y) {
+ dabd = NULL;
+ dsize = 0;
+ } else {
+ dabd = rr->rr_col[c].rc_abd;
+ dsize = rr->rr_col[c].rc_size;
+ }
+
+ abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 2,
+ raidz_syn_pq_abd);
+ }
+
+ abd_raidz_rec_iterate(cabds, tabds, xsize, 2, raidz_rec_pq_abd, coeff);
+
+ /* Copy shorter targets back to the original abd buffer */
+ if (ysize < xsize)
+ raidz_copy(rr->rr_col[y].rc_abd, yabd, ysize);
+
+ raidz_math_end();
+
+ if (ysize < xsize)
+ abd_free(yabd);
+
+ return ((1 << CODE_P) | (1 << CODE_Q));
+}
+
+
+/*
+ * Generate P and R syndromes
+ *
+ * @c	array of pointers to syndrome columns
+ * @dc data column (NULL if missing)
+ * @tsize size of syndrome columns
+ * @dsize size of data column (0 if missing)
+ */
+static void
+raidz_syn_pr_abd(void **c, const void *dc, const size_t tsize,
+ const size_t dsize)
+{
+ v_t *x = (v_t *)c[TARGET_X];
+ v_t *y = (v_t *)c[TARGET_Y];
+ const v_t *d = (const v_t *)dc;
+ const v_t * const dend = d + (dsize / sizeof (v_t));
+ const v_t * const yend = y + (tsize / sizeof (v_t));
+
+ SYN_PR_DEFINE();
+
+ MUL2_SETUP();
+
+ for (; d < dend; d += SYN_STRIDE, x += SYN_STRIDE, y += SYN_STRIDE) {
+ LOAD(d, SYN_PR_D);
+ P_D_SYNDROME(SYN_PR_D, SYN_PR_X, x);
+ R_D_SYNDROME(SYN_PR_D, SYN_PR_X, y);
+ }
+ for (; y < yend; y += SYN_STRIDE) {
+ R_SYNDROME(SYN_PR_X, y);
+ }
+}
+
+/*
+ * Reconstruct data using PR parity and PR syndromes
+ *
+ * @tc syndrome/result columns
+ * @tsize size of syndrome/result columns
+ * @c parity columns
+ * @mul array of multiplication constants
+ */
+static void
+raidz_rec_pr_abd(void **t, const size_t tsize, void **c,
+ const unsigned *mul)
+{
+ v_t *x = (v_t *)t[TARGET_X];
+ v_t *y = (v_t *)t[TARGET_Y];
+ const v_t * const xend = x + (tsize / sizeof (v_t));
+ const v_t *p = (v_t *)c[CODE_P];
+ const v_t *q = (v_t *)c[CODE_Q];
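+	/* Note: the caller passes cabds[] = { P, R }, so q points at R here. */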
+
+ REC_PR_DEFINE();
+
+ for (; x < xend; x += REC_PR_STRIDE, y += REC_PR_STRIDE,
+ p += REC_PR_STRIDE, q += REC_PR_STRIDE) {
+ LOAD(x, REC_PR_X);
+ LOAD(y, REC_PR_Y);
+ XOR_ACC(p, REC_PR_X);
+ XOR_ACC(q, REC_PR_Y);
+
+ /* Save Pxy */
+ COPY(REC_PR_X, REC_PR_T);
+
+ /* Calc X */
+ MUL(mul[MUL_PR_X], REC_PR_X);
+ MUL(mul[MUL_PR_Y], REC_PR_Y);
+ XOR(REC_PR_Y, REC_PR_X);
+ STORE(x, REC_PR_X);
+
+ /* Calc Y */
+ XOR(REC_PR_T, REC_PR_X);
+ STORE(y, REC_PR_X);
+ }
+}
+
+
+/*
+ * Reconstruct two data columns using PR parity
+ *
+ * @syn_method raidz_syn_pr_abd()
+ * @rec_method raidz_rec_pr_abd()
+ *
+ * @rr RAIDZ row
+ * @tgtidx array of missing data indexes
+ */
+static raidz_inline int
+raidz_reconstruct_pr_impl(raidz_row_t *rr, const int *tgtidx)
+{
+ size_t c;
+ size_t dsize;
+ abd_t *dabd;
+ const size_t firstdc = rr->rr_firstdatacol;
+ const size_t ncols = rr->rr_cols;
+ const size_t x = tgtidx[0];
+ const size_t y = tgtidx[1];
+ const size_t xsize = rr->rr_col[x].rc_size;
+ const size_t ysize = rr->rr_col[y].rc_size;
+ abd_t *xabd = rr->rr_col[x].rc_abd;
+ abd_t *yabd = rr->rr_col[y].rc_abd;
+ abd_t *tabds[2] = { xabd, yabd };
+ abd_t *cabds[] = {
+ rr->rr_col[CODE_P].rc_abd,
+ rr->rr_col[CODE_R].rc_abd
+ };
+
+ if (xabd == NULL)
+ return ((1 << CODE_P) | (1 << CODE_R));
+
+ unsigned coeff[MUL_CNT];
+ raidz_rec_pr_coeff(rr, tgtidx, coeff);
+
+ /*
+	 * Check if one of the targets is shorter than the other.
+	 * It needs to be replaced with a new buffer so that the syndrome
+	 * can be calculated over the full length.
+ */
+ if (ysize < xsize) {
+ yabd = abd_alloc(xsize, B_FALSE);
+ tabds[1] = yabd;
+ }
+
+ raidz_math_begin();
+
+ /* Start with first data column if present */
+ if (firstdc != x) {
+ raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, xsize);
+ raidz_copy(yabd, rr->rr_col[firstdc].rc_abd, xsize);
+ } else {
+ raidz_zero(xabd, xsize);
+ raidz_zero(yabd, xsize);
+ }
+
+	/* generate pr_syndrome */
+ for (c = firstdc+1; c < ncols; c++) {
+ if (c == x || c == y) {
+ dabd = NULL;
+ dsize = 0;
+ } else {
+ dabd = rr->rr_col[c].rc_abd;
+ dsize = rr->rr_col[c].rc_size;
+ }
+
+ abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 2,
+ raidz_syn_pr_abd);
+ }
+
+ abd_raidz_rec_iterate(cabds, tabds, xsize, 2, raidz_rec_pr_abd, coeff);
+
+ /*
+ * Copy shorter targets back to the original abd buffer
+ */
+ if (ysize < xsize)
+ raidz_copy(rr->rr_col[y].rc_abd, yabd, ysize);
+
+ raidz_math_end();
+
+ if (ysize < xsize)
+ abd_free(yabd);
+
+ return ((1 << CODE_P) | (1 << CODE_R));
+}
+
+
+/*
+ * Generate Q and R syndromes
+ *
+ * @c	array of pointers to syndrome columns
+ * @dc data column (NULL if missing)
+ * @tsize size of syndrome columns
+ * @dsize size of data column (0 if missing)
+ */
+static void
+raidz_syn_qr_abd(void **c, const void *dc, const size_t tsize,
+ const size_t dsize)
+{
+ v_t *x = (v_t *)c[TARGET_X];
+ v_t *y = (v_t *)c[TARGET_Y];
+ const v_t * const xend = x + (tsize / sizeof (v_t));
+ const v_t *d = (const v_t *)dc;
+ const v_t * const dend = d + (dsize / sizeof (v_t));
+
+ SYN_QR_DEFINE();
+
+ MUL2_SETUP();
+
+ for (; d < dend; d += SYN_STRIDE, x += SYN_STRIDE, y += SYN_STRIDE) {
+		LOAD(d, SYN_QR_D);
+ Q_D_SYNDROME(SYN_QR_D, SYN_QR_X, x);
+ R_D_SYNDROME(SYN_QR_D, SYN_QR_X, y);
+ }
+ for (; x < xend; x += SYN_STRIDE, y += SYN_STRIDE) {
+ Q_SYNDROME(SYN_QR_X, x);
+ R_SYNDROME(SYN_QR_X, y);
+ }
+}
+
+
+/*
+ * Reconstruct data using QR parity and QR syndromes
+ *
+ * @tc syndrome/result columns
+ * @tsize size of syndrome/result columns
+ * @c parity columns
+ * @mul array of multiplication constants
+ */
+static void
+raidz_rec_qr_abd(void **t, const size_t tsize, void **c,
+ const unsigned *mul)
+{
+ v_t *x = (v_t *)t[TARGET_X];
+ v_t *y = (v_t *)t[TARGET_Y];
+ const v_t * const xend = x + (tsize / sizeof (v_t));
+ const v_t *p = (v_t *)c[CODE_P];
+ const v_t *q = (v_t *)c[CODE_Q];
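+	/*
+	 * Note: the caller passes cabds[] = { Q, R }, so despite the CODE_P
+	 * and CODE_Q indices, p and q point at the Q and R parity columns.
+	 */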
+
+ REC_QR_DEFINE();
+
+ for (; x < xend; x += REC_QR_STRIDE, y += REC_QR_STRIDE,
+ p += REC_QR_STRIDE, q += REC_QR_STRIDE) {
+ LOAD(x, REC_QR_X);
+ LOAD(y, REC_QR_Y);
+
+ XOR_ACC(p, REC_QR_X);
+ XOR_ACC(q, REC_QR_Y);
+
+		/* Save Qxy */
+ COPY(REC_QR_X, REC_QR_T);
+
+ /* Calc X */
+ MUL(mul[MUL_QR_XQ], REC_QR_X); /* X = Q * xqm */
+ XOR(REC_QR_Y, REC_QR_X); /* X = R ^ X */
+ MUL(mul[MUL_QR_X], REC_QR_X); /* X = X * xm */
+ STORE(x, REC_QR_X);
+
+ /* Calc Y */
+ MUL(mul[MUL_QR_YQ], REC_QR_T); /* X = Q * xqm */
+ XOR(REC_QR_Y, REC_QR_T); /* X = R ^ X */
+ MUL(mul[MUL_QR_Y], REC_QR_T); /* X = X * xm */
+ STORE(y, REC_QR_T);
+ }
+}
+
+
+/*
+ * Reconstruct two data columns using QR parity
+ *
+ * @syn_method raidz_syn_qr_abd()
+ * @rec_method raidz_rec_qr_abd()
+ *
+ * @rr RAIDZ row
+ * @tgtidx array of missing data indexes
+ */
+static raidz_inline int
+raidz_reconstruct_qr_impl(raidz_row_t *rr, const int *tgtidx)
+{
+ size_t c;
+ size_t dsize;
+ abd_t *dabd;
+ const size_t firstdc = rr->rr_firstdatacol;
+ const size_t ncols = rr->rr_cols;
+ const size_t x = tgtidx[TARGET_X];
+ const size_t y = tgtidx[TARGET_Y];
+ const size_t xsize = rr->rr_col[x].rc_size;
+ const size_t ysize = rr->rr_col[y].rc_size;
+ abd_t *xabd = rr->rr_col[x].rc_abd;
+ abd_t *yabd = rr->rr_col[y].rc_abd;
+ abd_t *tabds[2] = { xabd, yabd };
+ abd_t *cabds[] = {
+ rr->rr_col[CODE_Q].rc_abd,
+ rr->rr_col[CODE_R].rc_abd
+ };
+
+ if (xabd == NULL)
+ return ((1 << CODE_Q) | (1 << CODE_R));
+
+ unsigned coeff[MUL_CNT];
+ raidz_rec_qr_coeff(rr, tgtidx, coeff);
+
+ /*
+	 * Check if one of the targets is shorter than the other.
+	 * In this case, the shorter target needs to be replaced with a
+	 * new buffer so that the syndrome can be calculated over the
+	 * full length.
+ */
+ if (ysize < xsize) {
+ yabd = abd_alloc(xsize, B_FALSE);
+ tabds[1] = yabd;
+ }
+
+ raidz_math_begin();
+
+ /* Start with first data column if present */
+ if (firstdc != x) {
+ raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, xsize);
+ raidz_copy(yabd, rr->rr_col[firstdc].rc_abd, xsize);
+ } else {
+ raidz_zero(xabd, xsize);
+ raidz_zero(yabd, xsize);
+ }
+
+	/* generate qr_syndrome */
+ for (c = firstdc+1; c < ncols; c++) {
+ if (c == x || c == y) {
+ dabd = NULL;
+ dsize = 0;
+ } else {
+ dabd = rr->rr_col[c].rc_abd;
+ dsize = rr->rr_col[c].rc_size;
+ }
+
+ abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 2,
+ raidz_syn_qr_abd);
+ }
+
+ abd_raidz_rec_iterate(cabds, tabds, xsize, 2, raidz_rec_qr_abd, coeff);
+
+ /*
+ * Copy shorter targets back to the original abd buffer
+ */
+ if (ysize < xsize)
+ raidz_copy(rr->rr_col[y].rc_abd, yabd, ysize);
+
+ raidz_math_end();
+
+ if (ysize < xsize)
+ abd_free(yabd);
+
+
+ return ((1 << CODE_Q) | (1 << CODE_R));
+}
+
+
+/*
+ * Generate P, Q, and R syndromes
+ *
+ * @c	array of pointers to syndrome columns
+ * @dc data column (NULL if missing)
+ * @tsize size of syndrome columns
+ * @dsize size of data column (0 if missing)
+ */
+static void
+raidz_syn_pqr_abd(void **c, const void *dc, const size_t tsize,
+ const size_t dsize)
+{
+ v_t *x = (v_t *)c[TARGET_X];
+ v_t *y = (v_t *)c[TARGET_Y];
+ v_t *z = (v_t *)c[TARGET_Z];
+ const v_t * const yend = y + (tsize / sizeof (v_t));
+ const v_t *d = (const v_t *)dc;
+ const v_t * const dend = d + (dsize / sizeof (v_t));
+
+ SYN_PQR_DEFINE();
+
+ MUL2_SETUP();
+
+ for (; d < dend; d += SYN_STRIDE, x += SYN_STRIDE, y += SYN_STRIDE,
+ z += SYN_STRIDE) {
+ LOAD(d, SYN_PQR_D);
+		P_D_SYNDROME(SYN_PQR_D, SYN_PQR_X, x);
+ Q_D_SYNDROME(SYN_PQR_D, SYN_PQR_X, y);
+ R_D_SYNDROME(SYN_PQR_D, SYN_PQR_X, z);
+ }
+ for (; y < yend; y += SYN_STRIDE, z += SYN_STRIDE) {
+ Q_SYNDROME(SYN_PQR_X, y);
+ R_SYNDROME(SYN_PQR_X, z);
+ }
+}
+
+
+/*
+ * Reconstruct data using PQR parity and PQR syndromes
+ *
+ * @tc syndrome/result columns
+ * @tsize size of syndrome/result columns
+ * @c parity columns
+ * @mul array of multiplication constants
+ */
+static void
+raidz_rec_pqr_abd(void **t, const size_t tsize, void **c,
+ const unsigned * const mul)
+{
+ v_t *x = (v_t *)t[TARGET_X];
+ v_t *y = (v_t *)t[TARGET_Y];
+ v_t *z = (v_t *)t[TARGET_Z];
+ const v_t * const xend = x + (tsize / sizeof (v_t));
+ const v_t *p = (v_t *)c[CODE_P];
+ const v_t *q = (v_t *)c[CODE_Q];
+ const v_t *r = (v_t *)c[CODE_R];
+
+ REC_PQR_DEFINE();
+
+ for (; x < xend; x += REC_PQR_STRIDE, y += REC_PQR_STRIDE,
+ z += REC_PQR_STRIDE, p += REC_PQR_STRIDE, q += REC_PQR_STRIDE,
+ r += REC_PQR_STRIDE) {
+ LOAD(x, REC_PQR_X);
+ LOAD(y, REC_PQR_Y);
+ LOAD(z, REC_PQR_Z);
+
+ XOR_ACC(p, REC_PQR_X);
+ XOR_ACC(q, REC_PQR_Y);
+ XOR_ACC(r, REC_PQR_Z);
+
+ /* Save Pxyz and Qxyz */
+ COPY(REC_PQR_X, REC_PQR_XS);
+ COPY(REC_PQR_Y, REC_PQR_YS);
+
+ /* Calc X */
+ MUL(mul[MUL_PQR_XP], REC_PQR_X); /* Xp = Pxyz * xp */
+ MUL(mul[MUL_PQR_XQ], REC_PQR_Y); /* Xq = Qxyz * xq */
+ XOR(REC_PQR_Y, REC_PQR_X);
+ MUL(mul[MUL_PQR_XR], REC_PQR_Z); /* Xr = Rxyz * xr */
+ XOR(REC_PQR_Z, REC_PQR_X); /* X = Xp + Xq + Xr */
+ STORE(x, REC_PQR_X);
+
+ /* Calc Y */
+ XOR(REC_PQR_X, REC_PQR_XS); /* Pyz = Pxyz + X */
+ MUL(mul[MUL_PQR_YU], REC_PQR_X); /* Xq = X * upd_q */
+ XOR(REC_PQR_X, REC_PQR_YS); /* Qyz = Qxyz + Xq */
+ COPY(REC_PQR_XS, REC_PQR_X); /* restore Pyz */
+ MUL(mul[MUL_PQR_YP], REC_PQR_X); /* Yp = Pyz * yp */
+ MUL(mul[MUL_PQR_YQ], REC_PQR_YS); /* Yq = Qyz * yq */
+ XOR(REC_PQR_X, REC_PQR_YS); /* Y = Yp + Yq */
+ STORE(y, REC_PQR_YS);
+
+ /* Calc Z */
+ XOR(REC_PQR_XS, REC_PQR_YS); /* Z = Pz = Pyz + Y */
+ STORE(z, REC_PQR_YS);
+ }
+}
+
+
+/*
+ * Reconstruct three data columns using PQR parity
+ *
+ * @syn_method raidz_syn_pqr_abd()
+ * @rec_method raidz_rec_pqr_abd()
+ *
+ * @rr RAIDZ row
+ * @tgtidx array of missing data indexes
+ */
+static raidz_inline int
+raidz_reconstruct_pqr_impl(raidz_row_t *rr, const int *tgtidx)
+{
+ size_t c;
+ size_t dsize;
+ abd_t *dabd;
+ const size_t firstdc = rr->rr_firstdatacol;
+ const size_t ncols = rr->rr_cols;
+ const size_t x = tgtidx[TARGET_X];
+ const size_t y = tgtidx[TARGET_Y];
+ const size_t z = tgtidx[TARGET_Z];
+ const size_t xsize = rr->rr_col[x].rc_size;
+ const size_t ysize = rr->rr_col[y].rc_size;
+ const size_t zsize = rr->rr_col[z].rc_size;
+ abd_t *xabd = rr->rr_col[x].rc_abd;
+ abd_t *yabd = rr->rr_col[y].rc_abd;
+ abd_t *zabd = rr->rr_col[z].rc_abd;
+ abd_t *tabds[] = { xabd, yabd, zabd };
+ abd_t *cabds[] = {
+ rr->rr_col[CODE_P].rc_abd,
+ rr->rr_col[CODE_Q].rc_abd,
+ rr->rr_col[CODE_R].rc_abd
+ };
+
+ if (xabd == NULL)
+ return ((1 << CODE_P) | (1 << CODE_Q) | (1 << CODE_R));
+
+ unsigned coeff[MUL_CNT];
+ raidz_rec_pqr_coeff(rr, tgtidx, coeff);
+
+ /*
+	 * Check if some of the targets are shorter than the others.
+	 * In this case, the shorter targets need to be replaced with new
+	 * buffers so that the syndrome can be calculated over the full
+	 * length.
+ */
+ if (ysize < xsize) {
+ yabd = abd_alloc(xsize, B_FALSE);
+ tabds[1] = yabd;
+ }
+ if (zsize < xsize) {
+ zabd = abd_alloc(xsize, B_FALSE);
+ tabds[2] = zabd;
+ }
+
+ raidz_math_begin();
+
+ /* Start with first data column if present */
+ if (firstdc != x) {
+ raidz_copy(xabd, rr->rr_col[firstdc].rc_abd, xsize);
+ raidz_copy(yabd, rr->rr_col[firstdc].rc_abd, xsize);
+ raidz_copy(zabd, rr->rr_col[firstdc].rc_abd, xsize);
+ } else {
+ raidz_zero(xabd, xsize);
+ raidz_zero(yabd, xsize);
+ raidz_zero(zabd, xsize);
+ }
+
+	/* generate pqr_syndrome */
+ for (c = firstdc+1; c < ncols; c++) {
+ if (c == x || c == y || c == z) {
+ dabd = NULL;
+ dsize = 0;
+ } else {
+ dabd = rr->rr_col[c].rc_abd;
+ dsize = rr->rr_col[c].rc_size;
+ }
+
+ abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 3,
+ raidz_syn_pqr_abd);
+ }
+
+ abd_raidz_rec_iterate(cabds, tabds, xsize, 3, raidz_rec_pqr_abd, coeff);
+
+ /*
+ * Copy shorter targets back to the original abd buffer
+ */
+ if (ysize < xsize)
+ raidz_copy(rr->rr_col[y].rc_abd, yabd, ysize);
+ if (zsize < xsize)
+ raidz_copy(rr->rr_col[z].rc_abd, zabd, zsize);
+
+ raidz_math_end();
+
+ if (ysize < xsize)
+ abd_free(yabd);
+ if (zsize < xsize)
+ abd_free(zabd);
+
+ return ((1 << CODE_P) | (1 << CODE_Q) | (1 << CODE_R));
+}
+
+#endif /* _VDEV_RAIDZ_MATH_IMPL_H */
diff --git a/sys/contrib/openzfs/module/zfs/vdev_raidz_math_powerpc_altivec.c b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_powerpc_altivec.c
new file mode 100644
index 000000000000..1db2c4cd3a47
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_powerpc_altivec.c
@@ -0,0 +1,4337 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (C) 2019 Romain Dolbeau. All rights reserved.
+ * <romain.dolbeau@european-processor-initiative.eu>
+ */
+
+#include <sys/isa_defs.h>
+#include <sys/types.h>
+
+#if defined(__powerpc__)
+#pragma GCC target("altivec")
+
+#include "vdev_raidz_math_powerpc_altivec_common.h"
+
+#define SYN_STRIDE 4
+
+#define ZERO_STRIDE 4
+#define ZERO_DEFINE() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_33_36()
+#define ZERO_D 0, 1, 2, 3
+
+#define COPY_STRIDE 4
+#define COPY_DEFINE() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_33_36()
+#define COPY_D 0, 1, 2, 3
+
+#define ADD_STRIDE 4
+#define ADD_DEFINE() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_33_36()
+#define ADD_D 0, 1, 2, 3
+
+#define MUL_STRIDE 4
+#define MUL_DEFINE() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_33_36()
+#define MUL_D 0, 1, 2, 3
+
+#define GEN_P_DEFINE() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_33_36()
+#define GEN_P_STRIDE 4
+#define GEN_P_P 0, 1, 2, 3
+
+#define GEN_PQ_DEFINE() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_4_5() \
+ GEN_X_DEFINE_6_7() \
+ GEN_X_DEFINE_16() \
+ GEN_X_DEFINE_17() \
+ GEN_X_DEFINE_33_36()
+#define GEN_PQ_STRIDE 4
+#define GEN_PQ_D 0, 1, 2, 3
+#define GEN_PQ_C 4, 5, 6, 7
+
+#define GEN_PQR_DEFINE() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_4_5() \
+ GEN_X_DEFINE_6_7() \
+ GEN_X_DEFINE_16() \
+ GEN_X_DEFINE_17() \
+ GEN_X_DEFINE_33_36()
+#define GEN_PQR_STRIDE 4
+#define GEN_PQR_D 0, 1, 2, 3
+#define GEN_PQR_C 4, 5, 6, 7
+
+#define SYN_Q_DEFINE() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_4_5() \
+ GEN_X_DEFINE_6_7() \
+ GEN_X_DEFINE_16() \
+ GEN_X_DEFINE_17() \
+ GEN_X_DEFINE_33_36()
+#define SYN_Q_STRIDE 4
+#define SYN_Q_D 0, 1, 2, 3
+#define SYN_Q_X 4, 5, 6, 7
+
+#define SYN_R_DEFINE() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_4_5() \
+ GEN_X_DEFINE_6_7() \
+ GEN_X_DEFINE_16() \
+ GEN_X_DEFINE_17() \
+ GEN_X_DEFINE_33_36()
+#define SYN_R_STRIDE 4
+#define SYN_R_D 0, 1, 2, 3
+#define SYN_R_X 4, 5, 6, 7
+
+#define SYN_PQ_DEFINE() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_4_5() \
+ GEN_X_DEFINE_6_7() \
+ GEN_X_DEFINE_16() \
+ GEN_X_DEFINE_17() \
+ GEN_X_DEFINE_33_36()
+#define SYN_PQ_STRIDE 4
+#define SYN_PQ_D 0, 1, 2, 3
+#define SYN_PQ_X 4, 5, 6, 7
+
+#define REC_PQ_DEFINE() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_4_5() \
+ GEN_X_DEFINE_31() \
+ GEN_X_DEFINE_32() \
+ GEN_X_DEFINE_33_36()
+#define REC_PQ_STRIDE 2
+#define REC_PQ_X 0, 1
+#define REC_PQ_Y 2, 3
+#define REC_PQ_T 4, 5
+
+#define SYN_PR_DEFINE() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_4_5() \
+ GEN_X_DEFINE_6_7() \
+ GEN_X_DEFINE_16() \
+ GEN_X_DEFINE_17() \
+ GEN_X_DEFINE_33_36()
+#define SYN_PR_STRIDE 4
+#define SYN_PR_D 0, 1, 2, 3
+#define SYN_PR_X 4, 5, 6, 7
+
+#define REC_PR_DEFINE() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_4_5() \
+ GEN_X_DEFINE_31() \
+ GEN_X_DEFINE_32() \
+ GEN_X_DEFINE_33_36()
+#define REC_PR_STRIDE 2
+#define REC_PR_X 0, 1
+#define REC_PR_Y 2, 3
+#define REC_PR_T 4, 5
+
+#define SYN_QR_DEFINE() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_4_5() \
+ GEN_X_DEFINE_6_7() \
+ GEN_X_DEFINE_16() \
+ GEN_X_DEFINE_17() \
+ GEN_X_DEFINE_33_36()
+#define SYN_QR_STRIDE 4
+#define SYN_QR_D 0, 1, 2, 3
+#define SYN_QR_X 4, 5, 6, 7
+
+#define REC_QR_DEFINE() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_4_5() \
+ GEN_X_DEFINE_31() \
+ GEN_X_DEFINE_32() \
+ GEN_X_DEFINE_33_36()
+#define REC_QR_STRIDE 2
+#define REC_QR_X 0, 1
+#define REC_QR_Y 2, 3
+#define REC_QR_T 4, 5
+
+#define SYN_PQR_DEFINE() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_4_5() \
+ GEN_X_DEFINE_6_7() \
+ GEN_X_DEFINE_16() \
+ GEN_X_DEFINE_17() \
+ GEN_X_DEFINE_33_36()
+#define SYN_PQR_STRIDE 4
+#define SYN_PQR_D 0, 1, 2, 3
+#define SYN_PQR_X 4, 5, 6, 7
+
+#define REC_PQR_DEFINE() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_4_5() \
+ GEN_X_DEFINE_6_7() \
+ GEN_X_DEFINE_8_9() \
+ GEN_X_DEFINE_31() \
+ GEN_X_DEFINE_32() \
+ GEN_X_DEFINE_33_36()
+#define REC_PQR_STRIDE 2
+#define REC_PQR_X 0, 1
+#define REC_PQR_Y 2, 3
+#define REC_PQR_Z 4, 5
+#define REC_PQR_XS 6, 7
+#define REC_PQR_YS 8, 9
+
+
+#include <sys/vdev_raidz_impl.h>
+#include "vdev_raidz_math_impl.h"
+
+DEFINE_GEN_METHODS(powerpc_altivec);
+DEFINE_REC_METHODS(powerpc_altivec);
+
+static boolean_t
+raidz_will_powerpc_altivec_work(void)
+{
+	return (kfpu_allowed() && zfs_altivec_available());
+}
+
+const raidz_impl_ops_t vdev_raidz_powerpc_altivec_impl = {
+ .init = NULL,
+ .fini = NULL,
+ .gen = RAIDZ_GEN_METHODS(powerpc_altivec),
+ .rec = RAIDZ_REC_METHODS(powerpc_altivec),
+ .is_supported = &raidz_will_powerpc_altivec_work,
+ .name = "powerpc_altivec"
+};
+
+#endif /* defined(__powerpc__) */
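[Editor's note, not part of the patch: the block above registers the AltiVec routines as a raidz_impl_ops_t with an is_supported() probe. As a hedged sketch of how such a candidate table could be filtered at runtime, assuming only the types and symbol introduced by this patch; the candidates[] array and pick_raidz_impl() function are hypothetical names, not the selection logic OpenZFS actually ships:]

#include <sys/vdev_raidz_impl.h>	/* raidz_impl_ops_t, from this patch set */

/* Hypothetical candidate list; a real build would list every compiled backend. */
static const raidz_impl_ops_t *const candidates[] = {
	&vdev_raidz_powerpc_altivec_impl,
	/* ... other SIMD and scalar implementations ... */
};

/* Return the first implementation whose runtime probe succeeds. */
static const raidz_impl_ops_t *
pick_raidz_impl(void)
{
	size_t i;

	for (i = 0; i < sizeof (candidates) / sizeof (candidates[0]); i++) {
		if (candidates[i]->is_supported == NULL ||
		    candidates[i]->is_supported())
			return (candidates[i]);
	}
	return (NULL);	/* caller would fall back to a scalar implementation */
}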
+
+
+#if defined(__powerpc__)
+#if defined(_ZFS_LITTLE_ENDIAN) && _LITTLE_ENDIAN
+/* BEGIN CSTYLED */
+const uint8_t
+__attribute__((aligned(256))) gf_clmul_mod_lt[4*256][16] = {
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0xf0, 0xe0, 0xd0, 0xc0, 0xb0, 0xa0, 0x90, 0x80,
+ 0x70, 0x60, 0x50, 0x40, 0x30, 0x20, 0x10, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08,
+ 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00 },
+ { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00,
+ 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x1e, 0x1c, 0x1a, 0x18, 0x16, 0x14, 0x12, 0x10,
+ 0x0e, 0x0c, 0x0a, 0x08, 0x06, 0x04, 0x02, 0x00 },
+ { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x10, 0x20, 0x70, 0x40, 0xd0, 0xe0, 0xb0, 0x80,
+ 0x90, 0xa0, 0xf0, 0xc0, 0x50, 0x60, 0x30, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x11, 0x12, 0x17, 0x14, 0x1d, 0x1e, 0x1b, 0x18,
+ 0x09, 0x0a, 0x0f, 0x0c, 0x05, 0x06, 0x03, 0x00 },
+ { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00,
+ 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x3c, 0x38, 0x34, 0x30, 0x2c, 0x28, 0x24, 0x20,
+ 0x1c, 0x18, 0x14, 0x10, 0x0c, 0x08, 0x04, 0x00 },
+ { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0x30, 0x60, 0x90, 0xc0, 0x70, 0x20, 0xd0, 0x80,
+ 0xb0, 0xe0, 0x10, 0x40, 0xf0, 0xa0, 0x50, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x33, 0x36, 0x39, 0x3c, 0x27, 0x22, 0x2d, 0x28,
+ 0x1b, 0x1e, 0x11, 0x14, 0x0f, 0x0a, 0x05, 0x00 },
+ { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00,
+ 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x22, 0x24, 0x2e, 0x28, 0x3a, 0x3c, 0x36, 0x30,
+ 0x12, 0x14, 0x1e, 0x18, 0x0a, 0x0c, 0x06, 0x00 },
+ { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0xd0, 0xa0, 0x30, 0x40, 0x10, 0x60, 0xf0, 0x80,
+ 0x50, 0x20, 0xb0, 0xc0, 0x90, 0xe0, 0x70, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x2d, 0x2a, 0x23, 0x24, 0x31, 0x36, 0x3f, 0x38,
+ 0x15, 0x12, 0x1b, 0x1c, 0x09, 0x0e, 0x07, 0x00 },
+ { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00,
+ 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x78, 0x70, 0x68, 0x60, 0x58, 0x50, 0x48, 0x40,
+ 0x38, 0x30, 0x28, 0x20, 0x18, 0x10, 0x08, 0x00 },
+ { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x70, 0xe0, 0x50, 0xc0, 0x30, 0xa0, 0x10, 0x80,
+ 0xf0, 0x60, 0xd0, 0x40, 0xb0, 0x20, 0x90, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x77, 0x7e, 0x65, 0x6c, 0x53, 0x5a, 0x41, 0x48,
+ 0x3f, 0x36, 0x2d, 0x24, 0x1b, 0x12, 0x09, 0x00 },
+ { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00,
+ 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x66, 0x6c, 0x72, 0x78, 0x4e, 0x44, 0x5a, 0x50,
+ 0x36, 0x3c, 0x22, 0x28, 0x1e, 0x14, 0x0a, 0x00 },
+ { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x90, 0x20, 0xf0, 0x40, 0x50, 0xe0, 0x30, 0x80,
+ 0x10, 0xa0, 0x70, 0xc0, 0xd0, 0x60, 0xb0, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x69, 0x62, 0x7f, 0x74, 0x45, 0x4e, 0x53, 0x58,
+ 0x31, 0x3a, 0x27, 0x2c, 0x1d, 0x16, 0x0b, 0x00 },
+ { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00,
+ 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x44, 0x48, 0x5c, 0x50, 0x74, 0x78, 0x6c, 0x60,
+ 0x24, 0x28, 0x3c, 0x30, 0x14, 0x18, 0x0c, 0x00 },
+ { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0xb0, 0x60, 0x10, 0xc0, 0xf0, 0x20, 0x50, 0x80,
+ 0x30, 0xe0, 0x90, 0x40, 0x70, 0xa0, 0xd0, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x4b, 0x46, 0x51, 0x5c, 0x7f, 0x72, 0x65, 0x68,
+ 0x23, 0x2e, 0x39, 0x34, 0x17, 0x1a, 0x0d, 0x00 },
+ { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00,
+ 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x5a, 0x54, 0x46, 0x48, 0x62, 0x6c, 0x7e, 0x70,
+ 0x2a, 0x24, 0x36, 0x38, 0x12, 0x1c, 0x0e, 0x00 },
+ { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x50, 0xa0, 0xb0, 0x40, 0x90, 0x60, 0x70, 0x80,
+ 0xd0, 0x20, 0x30, 0xc0, 0x10, 0xe0, 0xf0, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x55, 0x5a, 0x4b, 0x44, 0x69, 0x66, 0x77, 0x78,
+ 0x2d, 0x22, 0x33, 0x3c, 0x11, 0x1e, 0x0f, 0x00 },
+ { 0xbb, 0xa6, 0x81, 0x9c, 0xcf, 0xd2, 0xf5, 0xe8,
+ 0x53, 0x4e, 0x69, 0x74, 0x27, 0x3a, 0x1d, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0xf0, 0xe0, 0xd0, 0xc0, 0xb0, 0xa0, 0x90, 0x80,
+ 0x70, 0x60, 0x50, 0x40, 0x30, 0x20, 0x10, 0x00 },
+ { 0xbb, 0xa6, 0x81, 0x9c, 0xcf, 0xd2, 0xf5, 0xe8,
+ 0x53, 0x4e, 0x69, 0x74, 0x27, 0x3a, 0x1d, 0x00 },
+ { 0xf0, 0xe0, 0xd0, 0xc0, 0xb0, 0xa0, 0x90, 0x80,
+ 0x70, 0x60, 0x50, 0x40, 0x30, 0x20, 0x10, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0xff, 0xee, 0xdd, 0xcc, 0xbb, 0xaa, 0x99, 0x88,
+ 0x77, 0x66, 0x55, 0x44, 0x33, 0x22, 0x11, 0x00 },
+ { 0xa6, 0xbb, 0x9c, 0x81, 0xd2, 0xcf, 0xe8, 0xf5,
+ 0x53, 0x4e, 0x69, 0x74, 0x27, 0x3a, 0x1d, 0x00 },
+ { 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00,
+ 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0xee, 0xfc, 0xca, 0xd8, 0xa6, 0xb4, 0x82, 0x90,
+ 0x7e, 0x6c, 0x5a, 0x48, 0x36, 0x24, 0x12, 0x00 },
+ { 0xa6, 0xbb, 0x9c, 0x81, 0xd2, 0xcf, 0xe8, 0xf5,
+ 0x53, 0x4e, 0x69, 0x74, 0x27, 0x3a, 0x1d, 0x00 },
+ { 0x10, 0x20, 0x70, 0x40, 0xd0, 0xe0, 0xb0, 0x80,
+ 0x90, 0xa0, 0xf0, 0xc0, 0x50, 0x60, 0x30, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0xe1, 0xf2, 0xc7, 0xd4, 0xad, 0xbe, 0x8b, 0x98,
+ 0x79, 0x6a, 0x5f, 0x4c, 0x35, 0x26, 0x13, 0x00 },
+ { 0x9c, 0x81, 0xa6, 0xbb, 0xf5, 0xe8, 0xcf, 0xd2,
+ 0x4e, 0x53, 0x74, 0x69, 0x27, 0x3a, 0x1d, 0x00 },
+ { 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00,
+ 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0xcc, 0xd8, 0xe4, 0xf0, 0x9c, 0x88, 0xb4, 0xa0,
+ 0x6c, 0x78, 0x44, 0x50, 0x3c, 0x28, 0x14, 0x00 },
+ { 0x9c, 0x81, 0xa6, 0xbb, 0xf5, 0xe8, 0xcf, 0xd2,
+ 0x4e, 0x53, 0x74, 0x69, 0x27, 0x3a, 0x1d, 0x00 },
+ { 0x30, 0x60, 0x90, 0xc0, 0x70, 0x20, 0xd0, 0x80,
+ 0xb0, 0xe0, 0x10, 0x40, 0xf0, 0xa0, 0x50, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0xc3, 0xd6, 0xe9, 0xfc, 0x97, 0x82, 0xbd, 0xa8,
+ 0x6b, 0x7e, 0x41, 0x54, 0x3f, 0x2a, 0x15, 0x00 },
+ { 0x81, 0x9c, 0xbb, 0xa6, 0xe8, 0xf5, 0xd2, 0xcf,
+ 0x4e, 0x53, 0x74, 0x69, 0x27, 0x3a, 0x1d, 0x00 },
+ { 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00,
+ 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0xd2, 0xc4, 0xfe, 0xe8, 0x8a, 0x9c, 0xa6, 0xb0,
+ 0x62, 0x74, 0x4e, 0x58, 0x3a, 0x2c, 0x16, 0x00 },
+ { 0x81, 0x9c, 0xbb, 0xa6, 0xe8, 0xf5, 0xd2, 0xcf,
+ 0x4e, 0x53, 0x74, 0x69, 0x27, 0x3a, 0x1d, 0x00 },
+ { 0xd0, 0xa0, 0x30, 0x40, 0x10, 0x60, 0xf0, 0x80,
+ 0x50, 0x20, 0xb0, 0xc0, 0x90, 0xe0, 0x70, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0xdd, 0xca, 0xf3, 0xe4, 0x81, 0x96, 0xaf, 0xb8,
+ 0x65, 0x72, 0x4b, 0x5c, 0x39, 0x2e, 0x17, 0x00 },
+ { 0xe8, 0xf5, 0xcf, 0xd2, 0xa6, 0xbb, 0x81, 0x9c,
+ 0x74, 0x69, 0x53, 0x4e, 0x3a, 0x27, 0x1d, 0x00 },
+ { 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00,
+ 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x88, 0x90, 0xb8, 0xa0, 0xe8, 0xf0, 0xd8, 0xc0,
+ 0x48, 0x50, 0x78, 0x60, 0x28, 0x30, 0x18, 0x00 },
+ { 0xe8, 0xf5, 0xcf, 0xd2, 0xa6, 0xbb, 0x81, 0x9c,
+ 0x74, 0x69, 0x53, 0x4e, 0x3a, 0x27, 0x1d, 0x00 },
+ { 0x70, 0xe0, 0x50, 0xc0, 0x30, 0xa0, 0x10, 0x80,
+ 0xf0, 0x60, 0xd0, 0x40, 0xb0, 0x20, 0x90, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x87, 0x9e, 0xb5, 0xac, 0xe3, 0xfa, 0xd1, 0xc8,
+ 0x4f, 0x56, 0x7d, 0x64, 0x2b, 0x32, 0x19, 0x00 },
+ { 0xf5, 0xe8, 0xd2, 0xcf, 0xbb, 0xa6, 0x9c, 0x81,
+ 0x74, 0x69, 0x53, 0x4e, 0x3a, 0x27, 0x1d, 0x00 },
+ { 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00,
+ 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x96, 0x8c, 0xa2, 0xb8, 0xfe, 0xe4, 0xca, 0xd0,
+ 0x46, 0x5c, 0x72, 0x68, 0x2e, 0x34, 0x1a, 0x00 },
+ { 0xf5, 0xe8, 0xd2, 0xcf, 0xbb, 0xa6, 0x9c, 0x81,
+ 0x74, 0x69, 0x53, 0x4e, 0x3a, 0x27, 0x1d, 0x00 },
+ { 0x90, 0x20, 0xf0, 0x40, 0x50, 0xe0, 0x30, 0x80,
+ 0x10, 0xa0, 0x70, 0xc0, 0xd0, 0x60, 0xb0, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x99, 0x82, 0xaf, 0xb4, 0xf5, 0xee, 0xc3, 0xd8,
+ 0x41, 0x5a, 0x77, 0x6c, 0x2d, 0x36, 0x1b, 0x00 },
+ { 0xcf, 0xd2, 0xe8, 0xf5, 0x9c, 0x81, 0xbb, 0xa6,
+ 0x69, 0x74, 0x4e, 0x53, 0x3a, 0x27, 0x1d, 0x00 },
+ { 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00,
+ 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0xb4, 0xa8, 0x8c, 0x90, 0xc4, 0xd8, 0xfc, 0xe0,
+ 0x54, 0x48, 0x6c, 0x70, 0x24, 0x38, 0x1c, 0x00 },
+ { 0xcf, 0xd2, 0xe8, 0xf5, 0x9c, 0x81, 0xbb, 0xa6,
+ 0x69, 0x74, 0x4e, 0x53, 0x3a, 0x27, 0x1d, 0x00 },
+ { 0xb0, 0x60, 0x10, 0xc0, 0xf0, 0x20, 0x50, 0x80,
+ 0x30, 0xe0, 0x90, 0x40, 0x70, 0xa0, 0xd0, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0xbb, 0xa6, 0x81, 0x9c, 0xcf, 0xd2, 0xf5, 0xe8,
+ 0x53, 0x4e, 0x69, 0x74, 0x27, 0x3a, 0x1d, 0x00 },
+ { 0xd2, 0xcf, 0xf5, 0xe8, 0x81, 0x9c, 0xa6, 0xbb,
+ 0x69, 0x74, 0x4e, 0x53, 0x3a, 0x27, 0x1d, 0x00 },
+ { 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00,
+ 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0xaa, 0xb4, 0x96, 0x88, 0xd2, 0xcc, 0xee, 0xf0,
+ 0x5a, 0x44, 0x66, 0x78, 0x22, 0x3c, 0x1e, 0x00 },
+ { 0xd2, 0xcf, 0xf5, 0xe8, 0x81, 0x9c, 0xa6, 0xbb,
+ 0x69, 0x74, 0x4e, 0x53, 0x3a, 0x27, 0x1d, 0x00 },
+ { 0x50, 0xa0, 0xb0, 0x40, 0x90, 0x60, 0x70, 0x80,
+ 0xd0, 0x20, 0x30, 0xc0, 0x10, 0xe0, 0xf0, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0xa5, 0xba, 0x9b, 0x84, 0xd9, 0xc6, 0xe7, 0xf8,
+ 0x5d, 0x42, 0x63, 0x7c, 0x21, 0x3e, 0x1f, 0x00 },
+ { 0x6b, 0x51, 0x1f, 0x25, 0x83, 0xb9, 0xf7, 0xcd,
+ 0xa6, 0x9c, 0xd2, 0xe8, 0x4e, 0x74, 0x3a, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00,
+ 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00 },
+ { 0x6b, 0x51, 0x1f, 0x25, 0x83, 0xb9, 0xf7, 0xcd,
+ 0xa6, 0x9c, 0xd2, 0xe8, 0x4e, 0x74, 0x3a, 0x00 },
+ { 0xf0, 0xe0, 0xd0, 0xc0, 0xb0, 0xa0, 0x90, 0x80,
+ 0x70, 0x60, 0x50, 0x40, 0x30, 0x20, 0x10, 0x00 },
+ { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0xef, 0xce, 0xad, 0x8c, 0x6b, 0x4a, 0x29, 0x08,
+ 0xe7, 0xc6, 0xa5, 0x84, 0x63, 0x42, 0x21, 0x00 },
+ { 0x76, 0x4c, 0x02, 0x38, 0x9e, 0xa4, 0xea, 0xd0,
+ 0xa6, 0x9c, 0xd2, 0xe8, 0x4e, 0x74, 0x3a, 0x00 },
+ { 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00,
+ 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00 },
+ { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0xfe, 0xdc, 0xba, 0x98, 0x76, 0x54, 0x32, 0x10,
+ 0xee, 0xcc, 0xaa, 0x88, 0x66, 0x44, 0x22, 0x00 },
+ { 0x76, 0x4c, 0x02, 0x38, 0x9e, 0xa4, 0xea, 0xd0,
+ 0xa6, 0x9c, 0xd2, 0xe8, 0x4e, 0x74, 0x3a, 0x00 },
+ { 0x10, 0x20, 0x70, 0x40, 0xd0, 0xe0, 0xb0, 0x80,
+ 0x90, 0xa0, 0xf0, 0xc0, 0x50, 0x60, 0x30, 0x00 },
+ { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0xf1, 0xd2, 0xb7, 0x94, 0x7d, 0x5e, 0x3b, 0x18,
+ 0xe9, 0xca, 0xaf, 0x8c, 0x65, 0x46, 0x23, 0x00 },
+ { 0x4c, 0x76, 0x38, 0x02, 0xb9, 0x83, 0xcd, 0xf7,
+ 0xbb, 0x81, 0xcf, 0xf5, 0x4e, 0x74, 0x3a, 0x00 },
+ { 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00,
+ 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00 },
+ { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0xdc, 0xf8, 0x94, 0xb0, 0x4c, 0x68, 0x04, 0x20,
+ 0xfc, 0xd8, 0xb4, 0x90, 0x6c, 0x48, 0x24, 0x00 },
+ { 0x4c, 0x76, 0x38, 0x02, 0xb9, 0x83, 0xcd, 0xf7,
+ 0xbb, 0x81, 0xcf, 0xf5, 0x4e, 0x74, 0x3a, 0x00 },
+ { 0x30, 0x60, 0x90, 0xc0, 0x70, 0x20, 0xd0, 0x80,
+ 0xb0, 0xe0, 0x10, 0x40, 0xf0, 0xa0, 0x50, 0x00 },
+ { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0xd3, 0xf6, 0x99, 0xbc, 0x47, 0x62, 0x0d, 0x28,
+ 0xfb, 0xde, 0xb1, 0x94, 0x6f, 0x4a, 0x25, 0x00 },
+ { 0x51, 0x6b, 0x25, 0x1f, 0xa4, 0x9e, 0xd0, 0xea,
+ 0xbb, 0x81, 0xcf, 0xf5, 0x4e, 0x74, 0x3a, 0x00 },
+ { 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00,
+ 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00 },
+ { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0xc2, 0xe4, 0x8e, 0xa8, 0x5a, 0x7c, 0x16, 0x30,
+ 0xf2, 0xd4, 0xbe, 0x98, 0x6a, 0x4c, 0x26, 0x00 },
+ { 0x51, 0x6b, 0x25, 0x1f, 0xa4, 0x9e, 0xd0, 0xea,
+ 0xbb, 0x81, 0xcf, 0xf5, 0x4e, 0x74, 0x3a, 0x00 },
+ { 0xd0, 0xa0, 0x30, 0x40, 0x10, 0x60, 0xf0, 0x80,
+ 0x50, 0x20, 0xb0, 0xc0, 0x90, 0xe0, 0x70, 0x00 },
+ { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0xcd, 0xea, 0x83, 0xa4, 0x51, 0x76, 0x1f, 0x38,
+ 0xf5, 0xd2, 0xbb, 0x9c, 0x69, 0x4e, 0x27, 0x00 },
+ { 0x38, 0x02, 0x51, 0x6b, 0xea, 0xd0, 0x83, 0xb9,
+ 0x81, 0xbb, 0xe8, 0xd2, 0x53, 0x69, 0x3a, 0x00 },
+ { 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00,
+ 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00 },
+ { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x98, 0xb0, 0xc8, 0xe0, 0x38, 0x10, 0x68, 0x40,
+ 0xd8, 0xf0, 0x88, 0xa0, 0x78, 0x50, 0x28, 0x00 },
+ { 0x38, 0x02, 0x51, 0x6b, 0xea, 0xd0, 0x83, 0xb9,
+ 0x81, 0xbb, 0xe8, 0xd2, 0x53, 0x69, 0x3a, 0x00 },
+ { 0x70, 0xe0, 0x50, 0xc0, 0x30, 0xa0, 0x10, 0x80,
+ 0xf0, 0x60, 0xd0, 0x40, 0xb0, 0x20, 0x90, 0x00 },
+ { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x97, 0xbe, 0xc5, 0xec, 0x33, 0x1a, 0x61, 0x48,
+ 0xdf, 0xf6, 0x8d, 0xa4, 0x7b, 0x52, 0x29, 0x00 },
+ { 0x25, 0x1f, 0x4c, 0x76, 0xf7, 0xcd, 0x9e, 0xa4,
+ 0x81, 0xbb, 0xe8, 0xd2, 0x53, 0x69, 0x3a, 0x00 },
+ { 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00,
+ 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00 },
+ { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x86, 0xac, 0xd2, 0xf8, 0x2e, 0x04, 0x7a, 0x50,
+ 0xd6, 0xfc, 0x82, 0xa8, 0x7e, 0x54, 0x2a, 0x00 },
+ { 0x25, 0x1f, 0x4c, 0x76, 0xf7, 0xcd, 0x9e, 0xa4,
+ 0x81, 0xbb, 0xe8, 0xd2, 0x53, 0x69, 0x3a, 0x00 },
+ { 0x90, 0x20, 0xf0, 0x40, 0x50, 0xe0, 0x30, 0x80,
+ 0x10, 0xa0, 0x70, 0xc0, 0xd0, 0x60, 0xb0, 0x00 },
+ { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x89, 0xa2, 0xdf, 0xf4, 0x25, 0x0e, 0x73, 0x58,
+ 0xd1, 0xfa, 0x87, 0xac, 0x7d, 0x56, 0x2b, 0x00 },
+ { 0x1f, 0x25, 0x76, 0x4c, 0xd0, 0xea, 0xb9, 0x83,
+ 0x9c, 0xa6, 0xf5, 0xcf, 0x53, 0x69, 0x3a, 0x00 },
+ { 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00,
+ 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00 },
+ { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0xa4, 0x88, 0xfc, 0xd0, 0x14, 0x38, 0x4c, 0x60,
+ 0xc4, 0xe8, 0x9c, 0xb0, 0x74, 0x58, 0x2c, 0x00 },
+ { 0x1f, 0x25, 0x76, 0x4c, 0xd0, 0xea, 0xb9, 0x83,
+ 0x9c, 0xa6, 0xf5, 0xcf, 0x53, 0x69, 0x3a, 0x00 },
+ { 0xb0, 0x60, 0x10, 0xc0, 0xf0, 0x20, 0x50, 0x80,
+ 0x30, 0xe0, 0x90, 0x40, 0x70, 0xa0, 0xd0, 0x00 },
+ { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0xab, 0x86, 0xf1, 0xdc, 0x1f, 0x32, 0x45, 0x68,
+ 0xc3, 0xee, 0x99, 0xb4, 0x77, 0x5a, 0x2d, 0x00 },
+ { 0x02, 0x38, 0x6b, 0x51, 0xcd, 0xf7, 0xa4, 0x9e,
+ 0x9c, 0xa6, 0xf5, 0xcf, 0x53, 0x69, 0x3a, 0x00 },
+ { 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00,
+ 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00 },
+ { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0xba, 0x94, 0xe6, 0xc8, 0x02, 0x2c, 0x5e, 0x70,
+ 0xca, 0xe4, 0x96, 0xb8, 0x72, 0x5c, 0x2e, 0x00 },
+ { 0x02, 0x38, 0x6b, 0x51, 0xcd, 0xf7, 0xa4, 0x9e,
+ 0x9c, 0xa6, 0xf5, 0xcf, 0x53, 0x69, 0x3a, 0x00 },
+ { 0x50, 0xa0, 0xb0, 0x40, 0x90, 0x60, 0x70, 0x80,
+ 0xd0, 0x20, 0x30, 0xc0, 0x10, 0xe0, 0xf0, 0x00 },
+ { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0xb5, 0x9a, 0xeb, 0xc4, 0x09, 0x26, 0x57, 0x78,
+ 0xcd, 0xe2, 0x93, 0xbc, 0x71, 0x5e, 0x2f, 0x00 },
+ { 0xd0, 0xf7, 0x9e, 0xb9, 0x4c, 0x6b, 0x02, 0x25,
+ 0xf5, 0xd2, 0xbb, 0x9c, 0x69, 0x4e, 0x27, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x10, 0x20, 0x70, 0x40, 0xd0, 0xe0, 0xb0, 0x80,
+ 0x90, 0xa0, 0xf0, 0xc0, 0x50, 0x60, 0x30, 0x00 },
+ { 0xd0, 0xf7, 0x9e, 0xb9, 0x4c, 0x6b, 0x02, 0x25,
+ 0xf5, 0xd2, 0xbb, 0x9c, 0x69, 0x4e, 0x27, 0x00 },
+ { 0xf0, 0xe0, 0xd0, 0xc0, 0xb0, 0xa0, 0x90, 0x80,
+ 0x70, 0x60, 0x50, 0x40, 0x30, 0x20, 0x10, 0x00 },
+ { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x1f, 0x2e, 0x7d, 0x4c, 0xdb, 0xea, 0xb9, 0x88,
+ 0x97, 0xa6, 0xf5, 0xc4, 0x53, 0x62, 0x31, 0x00 },
+ { 0xcd, 0xea, 0x83, 0xa4, 0x51, 0x76, 0x1f, 0x38,
+ 0xf5, 0xd2, 0xbb, 0x9c, 0x69, 0x4e, 0x27, 0x00 },
+ { 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00,
+ 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00 },
+ { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x0e, 0x3c, 0x6a, 0x58, 0xc6, 0xf4, 0xa2, 0x90,
+ 0x9e, 0xac, 0xfa, 0xc8, 0x56, 0x64, 0x32, 0x00 },
+ { 0xcd, 0xea, 0x83, 0xa4, 0x51, 0x76, 0x1f, 0x38,
+ 0xf5, 0xd2, 0xbb, 0x9c, 0x69, 0x4e, 0x27, 0x00 },
+ { 0x10, 0x20, 0x70, 0x40, 0xd0, 0xe0, 0xb0, 0x80,
+ 0x90, 0xa0, 0xf0, 0xc0, 0x50, 0x60, 0x30, 0x00 },
+ { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x01, 0x32, 0x67, 0x54, 0xcd, 0xfe, 0xab, 0x98,
+ 0x99, 0xaa, 0xff, 0xcc, 0x55, 0x66, 0x33, 0x00 },
+ { 0xf7, 0xd0, 0xb9, 0x9e, 0x76, 0x51, 0x38, 0x1f,
+ 0xe8, 0xcf, 0xa6, 0x81, 0x69, 0x4e, 0x27, 0x00 },
+ { 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00,
+ 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00 },
+ { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x2c, 0x18, 0x44, 0x70, 0xfc, 0xc8, 0x94, 0xa0,
+ 0x8c, 0xb8, 0xe4, 0xd0, 0x5c, 0x68, 0x34, 0x00 },
+ { 0xf7, 0xd0, 0xb9, 0x9e, 0x76, 0x51, 0x38, 0x1f,
+ 0xe8, 0xcf, 0xa6, 0x81, 0x69, 0x4e, 0x27, 0x00 },
+ { 0x30, 0x60, 0x90, 0xc0, 0x70, 0x20, 0xd0, 0x80,
+ 0xb0, 0xe0, 0x10, 0x40, 0xf0, 0xa0, 0x50, 0x00 },
+ { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x23, 0x16, 0x49, 0x7c, 0xf7, 0xc2, 0x9d, 0xa8,
+ 0x8b, 0xbe, 0xe1, 0xd4, 0x5f, 0x6a, 0x35, 0x00 },
+ { 0xea, 0xcd, 0xa4, 0x83, 0x6b, 0x4c, 0x25, 0x02,
+ 0xe8, 0xcf, 0xa6, 0x81, 0x69, 0x4e, 0x27, 0x00 },
+ { 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00,
+ 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00 },
+ { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x32, 0x04, 0x5e, 0x68, 0xea, 0xdc, 0x86, 0xb0,
+ 0x82, 0xb4, 0xee, 0xd8, 0x5a, 0x6c, 0x36, 0x00 },
+ { 0xea, 0xcd, 0xa4, 0x83, 0x6b, 0x4c, 0x25, 0x02,
+ 0xe8, 0xcf, 0xa6, 0x81, 0x69, 0x4e, 0x27, 0x00 },
+ { 0xd0, 0xa0, 0x30, 0x40, 0x10, 0x60, 0xf0, 0x80,
+ 0x50, 0x20, 0xb0, 0xc0, 0x90, 0xe0, 0x70, 0x00 },
+ { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x3d, 0x0a, 0x53, 0x64, 0xe1, 0xd6, 0x8f, 0xb8,
+ 0x85, 0xb2, 0xeb, 0xdc, 0x59, 0x6e, 0x37, 0x00 },
+ { 0x83, 0xa4, 0xd0, 0xf7, 0x25, 0x02, 0x76, 0x51,
+ 0xd2, 0xf5, 0x81, 0xa6, 0x74, 0x53, 0x27, 0x00 },
+ { 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00,
+ 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00 },
+ { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x68, 0x50, 0x18, 0x20, 0x88, 0xb0, 0xf8, 0xc0,
+ 0xa8, 0x90, 0xd8, 0xe0, 0x48, 0x70, 0x38, 0x00 },
+ { 0x83, 0xa4, 0xd0, 0xf7, 0x25, 0x02, 0x76, 0x51,
+ 0xd2, 0xf5, 0x81, 0xa6, 0x74, 0x53, 0x27, 0x00 },
+ { 0x70, 0xe0, 0x50, 0xc0, 0x30, 0xa0, 0x10, 0x80,
+ 0xf0, 0x60, 0xd0, 0x40, 0xb0, 0x20, 0x90, 0x00 },
+ { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x67, 0x5e, 0x15, 0x2c, 0x83, 0xba, 0xf1, 0xc8,
+ 0xaf, 0x96, 0xdd, 0xe4, 0x4b, 0x72, 0x39, 0x00 },
+ { 0x9e, 0xb9, 0xcd, 0xea, 0x38, 0x1f, 0x6b, 0x4c,
+ 0xd2, 0xf5, 0x81, 0xa6, 0x74, 0x53, 0x27, 0x00 },
+ { 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00,
+ 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00 },
+ { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x76, 0x4c, 0x02, 0x38, 0x9e, 0xa4, 0xea, 0xd0,
+ 0xa6, 0x9c, 0xd2, 0xe8, 0x4e, 0x74, 0x3a, 0x00 },
+ { 0x9e, 0xb9, 0xcd, 0xea, 0x38, 0x1f, 0x6b, 0x4c,
+ 0xd2, 0xf5, 0x81, 0xa6, 0x74, 0x53, 0x27, 0x00 },
+ { 0x90, 0x20, 0xf0, 0x40, 0x50, 0xe0, 0x30, 0x80,
+ 0x10, 0xa0, 0x70, 0xc0, 0xd0, 0x60, 0xb0, 0x00 },
+ { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x79, 0x42, 0x0f, 0x34, 0x95, 0xae, 0xe3, 0xd8,
+ 0xa1, 0x9a, 0xd7, 0xec, 0x4d, 0x76, 0x3b, 0x00 },
+ { 0xa4, 0x83, 0xf7, 0xd0, 0x1f, 0x38, 0x4c, 0x6b,
+ 0xcf, 0xe8, 0x9c, 0xbb, 0x74, 0x53, 0x27, 0x00 },
+ { 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00,
+ 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00 },
+ { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x54, 0x68, 0x2c, 0x10, 0xa4, 0x98, 0xdc, 0xe0,
+ 0xb4, 0x88, 0xcc, 0xf0, 0x44, 0x78, 0x3c, 0x00 },
+ { 0xa4, 0x83, 0xf7, 0xd0, 0x1f, 0x38, 0x4c, 0x6b,
+ 0xcf, 0xe8, 0x9c, 0xbb, 0x74, 0x53, 0x27, 0x00 },
+ { 0xb0, 0x60, 0x10, 0xc0, 0xf0, 0x20, 0x50, 0x80,
+ 0x30, 0xe0, 0x90, 0x40, 0x70, 0xa0, 0xd0, 0x00 },
+ { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x5b, 0x66, 0x21, 0x1c, 0xaf, 0x92, 0xd5, 0xe8,
+ 0xb3, 0x8e, 0xc9, 0xf4, 0x47, 0x7a, 0x3d, 0x00 },
+ { 0xb9, 0x9e, 0xea, 0xcd, 0x02, 0x25, 0x51, 0x76,
+ 0xcf, 0xe8, 0x9c, 0xbb, 0x74, 0x53, 0x27, 0x00 },
+ { 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00,
+ 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00 },
+ { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x4a, 0x74, 0x36, 0x08, 0xb2, 0x8c, 0xce, 0xf0,
+ 0xba, 0x84, 0xc6, 0xf8, 0x42, 0x7c, 0x3e, 0x00 },
+ { 0xb9, 0x9e, 0xea, 0xcd, 0x02, 0x25, 0x51, 0x76,
+ 0xcf, 0xe8, 0x9c, 0xbb, 0x74, 0x53, 0x27, 0x00 },
+ { 0x50, 0xa0, 0xb0, 0x40, 0x90, 0x60, 0x70, 0x80,
+ 0xd0, 0x20, 0x30, 0xc0, 0x10, 0xe0, 0xf0, 0x00 },
+ { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x45, 0x7a, 0x3b, 0x04, 0xb9, 0x86, 0xc7, 0xf8,
+ 0xbd, 0x82, 0xc3, 0xfc, 0x41, 0x7e, 0x3f, 0x00 },
+ { 0xd6, 0xa2, 0x3e, 0x4a, 0x1b, 0x6f, 0xf3, 0x87,
+ 0x51, 0x25, 0xb9, 0xcd, 0x9c, 0xe8, 0x74, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00,
+ 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00 },
+ { 0xd6, 0xa2, 0x3e, 0x4a, 0x1b, 0x6f, 0xf3, 0x87,
+ 0x51, 0x25, 0xb9, 0xcd, 0x9c, 0xe8, 0x74, 0x00 },
+ { 0xf0, 0xe0, 0xd0, 0xc0, 0xb0, 0xa0, 0x90, 0x80,
+ 0x70, 0x60, 0x50, 0x40, 0x30, 0x20, 0x10, 0x00 },
+ { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0xcf, 0x8e, 0x4d, 0x0c, 0xcb, 0x8a, 0x49, 0x08,
+ 0xc7, 0x86, 0x45, 0x04, 0xc3, 0x82, 0x41, 0x00 },
+ { 0xcb, 0xbf, 0x23, 0x57, 0x06, 0x72, 0xee, 0x9a,
+ 0x51, 0x25, 0xb9, 0xcd, 0x9c, 0xe8, 0x74, 0x00 },
+ { 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00,
+ 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00 },
+ { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0xde, 0x9c, 0x5a, 0x18, 0xd6, 0x94, 0x52, 0x10,
+ 0xce, 0x8c, 0x4a, 0x08, 0xc6, 0x84, 0x42, 0x00 },
+ { 0xcb, 0xbf, 0x23, 0x57, 0x06, 0x72, 0xee, 0x9a,
+ 0x51, 0x25, 0xb9, 0xcd, 0x9c, 0xe8, 0x74, 0x00 },
+ { 0x10, 0x20, 0x70, 0x40, 0xd0, 0xe0, 0xb0, 0x80,
+ 0x90, 0xa0, 0xf0, 0xc0, 0x50, 0x60, 0x30, 0x00 },
+ { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0xd1, 0x92, 0x57, 0x14, 0xdd, 0x9e, 0x5b, 0x18,
+ 0xc9, 0x8a, 0x4f, 0x0c, 0xc5, 0x86, 0x43, 0x00 },
+ { 0xf1, 0x85, 0x19, 0x6d, 0x21, 0x55, 0xc9, 0xbd,
+ 0x4c, 0x38, 0xa4, 0xd0, 0x9c, 0xe8, 0x74, 0x00 },
+ { 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00,
+ 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00 },
+ { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0xfc, 0xb8, 0x74, 0x30, 0xec, 0xa8, 0x64, 0x20,
+ 0xdc, 0x98, 0x54, 0x10, 0xcc, 0x88, 0x44, 0x00 },
+ { 0xf1, 0x85, 0x19, 0x6d, 0x21, 0x55, 0xc9, 0xbd,
+ 0x4c, 0x38, 0xa4, 0xd0, 0x9c, 0xe8, 0x74, 0x00 },
+ { 0x30, 0x60, 0x90, 0xc0, 0x70, 0x20, 0xd0, 0x80,
+ 0xb0, 0xe0, 0x10, 0x40, 0xf0, 0xa0, 0x50, 0x00 },
+ { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0xf3, 0xb6, 0x79, 0x3c, 0xe7, 0xa2, 0x6d, 0x28,
+ 0xdb, 0x9e, 0x51, 0x14, 0xcf, 0x8a, 0x45, 0x00 },
+ { 0xec, 0x98, 0x04, 0x70, 0x3c, 0x48, 0xd4, 0xa0,
+ 0x4c, 0x38, 0xa4, 0xd0, 0x9c, 0xe8, 0x74, 0x00 },
+ { 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00,
+ 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00 },
+ { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0xe2, 0xa4, 0x6e, 0x28, 0xfa, 0xbc, 0x76, 0x30,
+ 0xd2, 0x94, 0x5e, 0x18, 0xca, 0x8c, 0x46, 0x00 },
+ { 0xec, 0x98, 0x04, 0x70, 0x3c, 0x48, 0xd4, 0xa0,
+ 0x4c, 0x38, 0xa4, 0xd0, 0x9c, 0xe8, 0x74, 0x00 },
+ { 0xd0, 0xa0, 0x30, 0x40, 0x10, 0x60, 0xf0, 0x80,
+ 0x50, 0x20, 0xb0, 0xc0, 0x90, 0xe0, 0x70, 0x00 },
+ { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0xed, 0xaa, 0x63, 0x24, 0xf1, 0xb6, 0x7f, 0x38,
+ 0xd5, 0x92, 0x5b, 0x1c, 0xc9, 0x8e, 0x47, 0x00 },
+ { 0x85, 0xf1, 0x70, 0x04, 0x72, 0x06, 0x87, 0xf3,
+ 0x76, 0x02, 0x83, 0xf7, 0x81, 0xf5, 0x74, 0x00 },
+ { 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00,
+ 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00 },
+ { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0xb8, 0xf0, 0x28, 0x60, 0x98, 0xd0, 0x08, 0x40,
+ 0xf8, 0xb0, 0x68, 0x20, 0xd8, 0x90, 0x48, 0x00 },
+ { 0x85, 0xf1, 0x70, 0x04, 0x72, 0x06, 0x87, 0xf3,
+ 0x76, 0x02, 0x83, 0xf7, 0x81, 0xf5, 0x74, 0x00 },
+ { 0x70, 0xe0, 0x50, 0xc0, 0x30, 0xa0, 0x10, 0x80,
+ 0xf0, 0x60, 0xd0, 0x40, 0xb0, 0x20, 0x90, 0x00 },
+ { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0xb7, 0xfe, 0x25, 0x6c, 0x93, 0xda, 0x01, 0x48,
+ 0xff, 0xb6, 0x6d, 0x24, 0xdb, 0x92, 0x49, 0x00 },
+ { 0x98, 0xec, 0x6d, 0x19, 0x6f, 0x1b, 0x9a, 0xee,
+ 0x76, 0x02, 0x83, 0xf7, 0x81, 0xf5, 0x74, 0x00 },
+ { 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00,
+ 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00 },
+ { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0xa6, 0xec, 0x32, 0x78, 0x8e, 0xc4, 0x1a, 0x50,
+ 0xf6, 0xbc, 0x62, 0x28, 0xde, 0x94, 0x4a, 0x00 },
+ { 0x98, 0xec, 0x6d, 0x19, 0x6f, 0x1b, 0x9a, 0xee,
+ 0x76, 0x02, 0x83, 0xf7, 0x81, 0xf5, 0x74, 0x00 },
+ { 0x90, 0x20, 0xf0, 0x40, 0x50, 0xe0, 0x30, 0x80,
+ 0x10, 0xa0, 0x70, 0xc0, 0xd0, 0x60, 0xb0, 0x00 },
+ { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0xa9, 0xe2, 0x3f, 0x74, 0x85, 0xce, 0x13, 0x58,
+ 0xf1, 0xba, 0x67, 0x2c, 0xdd, 0x96, 0x4b, 0x00 },
+ { 0xa2, 0xd6, 0x57, 0x23, 0x48, 0x3c, 0xbd, 0xc9,
+ 0x6b, 0x1f, 0x9e, 0xea, 0x81, 0xf5, 0x74, 0x00 },
+ { 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00,
+ 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00 },
+ { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0x84, 0xc8, 0x1c, 0x50, 0xb4, 0xf8, 0x2c, 0x60,
+ 0xe4, 0xa8, 0x7c, 0x30, 0xd4, 0x98, 0x4c, 0x00 },
+ { 0xa2, 0xd6, 0x57, 0x23, 0x48, 0x3c, 0xbd, 0xc9,
+ 0x6b, 0x1f, 0x9e, 0xea, 0x81, 0xf5, 0x74, 0x00 },
+ { 0xb0, 0x60, 0x10, 0xc0, 0xf0, 0x20, 0x50, 0x80,
+ 0x30, 0xe0, 0x90, 0x40, 0x70, 0xa0, 0xd0, 0x00 },
+ { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0x8b, 0xc6, 0x11, 0x5c, 0xbf, 0xf2, 0x25, 0x68,
+ 0xe3, 0xae, 0x79, 0x34, 0xd7, 0x9a, 0x4d, 0x00 },
+ { 0xbf, 0xcb, 0x4a, 0x3e, 0x55, 0x21, 0xa0, 0xd4,
+ 0x6b, 0x1f, 0x9e, 0xea, 0x81, 0xf5, 0x74, 0x00 },
+ { 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00,
+ 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00 },
+ { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0x9a, 0xd4, 0x06, 0x48, 0xa2, 0xec, 0x3e, 0x70,
+ 0xea, 0xa4, 0x76, 0x38, 0xd2, 0x9c, 0x4e, 0x00 },
+ { 0xbf, 0xcb, 0x4a, 0x3e, 0x55, 0x21, 0xa0, 0xd4,
+ 0x6b, 0x1f, 0x9e, 0xea, 0x81, 0xf5, 0x74, 0x00 },
+ { 0x50, 0xa0, 0xb0, 0x40, 0x90, 0x60, 0x70, 0x80,
+ 0xd0, 0x20, 0x30, 0xc0, 0x10, 0xe0, 0xf0, 0x00 },
+ { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0x95, 0xda, 0x0b, 0x44, 0xa9, 0xe6, 0x37, 0x78,
+ 0xed, 0xa2, 0x73, 0x3c, 0xd1, 0x9e, 0x4f, 0x00 },
+ { 0x6d, 0x04, 0xbf, 0xd6, 0xd4, 0xbd, 0x06, 0x6f,
+ 0x02, 0x6b, 0xd0, 0xb9, 0xbb, 0xd2, 0x69, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0x30, 0x60, 0x90, 0xc0, 0x70, 0x20, 0xd0, 0x80,
+ 0xb0, 0xe0, 0x10, 0x40, 0xf0, 0xa0, 0x50, 0x00 },
+ { 0x6d, 0x04, 0xbf, 0xd6, 0xd4, 0xbd, 0x06, 0x6f,
+ 0x02, 0x6b, 0xd0, 0xb9, 0xbb, 0xd2, 0x69, 0x00 },
+ { 0xf0, 0xe0, 0xd0, 0xc0, 0xb0, 0xa0, 0x90, 0x80,
+ 0x70, 0x60, 0x50, 0x40, 0x30, 0x20, 0x10, 0x00 },
+ { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0x3f, 0x6e, 0x9d, 0xcc, 0x7b, 0x2a, 0xd9, 0x88,
+ 0xb7, 0xe6, 0x15, 0x44, 0xf3, 0xa2, 0x51, 0x00 },
+ { 0x70, 0x19, 0xa2, 0xcb, 0xc9, 0xa0, 0x1b, 0x72,
+ 0x02, 0x6b, 0xd0, 0xb9, 0xbb, 0xd2, 0x69, 0x00 },
+ { 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00,
+ 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00 },
+ { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0x2e, 0x7c, 0x8a, 0xd8, 0x66, 0x34, 0xc2, 0x90,
+ 0xbe, 0xec, 0x1a, 0x48, 0xf6, 0xa4, 0x52, 0x00 },
+ { 0x70, 0x19, 0xa2, 0xcb, 0xc9, 0xa0, 0x1b, 0x72,
+ 0x02, 0x6b, 0xd0, 0xb9, 0xbb, 0xd2, 0x69, 0x00 },
+ { 0x10, 0x20, 0x70, 0x40, 0xd0, 0xe0, 0xb0, 0x80,
+ 0x90, 0xa0, 0xf0, 0xc0, 0x50, 0x60, 0x30, 0x00 },
+ { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0x21, 0x72, 0x87, 0xd4, 0x6d, 0x3e, 0xcb, 0x98,
+ 0xb9, 0xea, 0x1f, 0x4c, 0xf5, 0xa6, 0x53, 0x00 },
+ { 0x4a, 0x23, 0x98, 0xf1, 0xee, 0x87, 0x3c, 0x55,
+ 0x1f, 0x76, 0xcd, 0xa4, 0xbb, 0xd2, 0x69, 0x00 },
+ { 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00,
+ 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00 },
+ { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0x0c, 0x58, 0xa4, 0xf0, 0x5c, 0x08, 0xf4, 0xa0,
+ 0xac, 0xf8, 0x04, 0x50, 0xfc, 0xa8, 0x54, 0x00 },
+ { 0x4a, 0x23, 0x98, 0xf1, 0xee, 0x87, 0x3c, 0x55,
+ 0x1f, 0x76, 0xcd, 0xa4, 0xbb, 0xd2, 0x69, 0x00 },
+ { 0x30, 0x60, 0x90, 0xc0, 0x70, 0x20, 0xd0, 0x80,
+ 0xb0, 0xe0, 0x10, 0x40, 0xf0, 0xa0, 0x50, 0x00 },
+ { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0x03, 0x56, 0xa9, 0xfc, 0x57, 0x02, 0xfd, 0xa8,
+ 0xab, 0xfe, 0x01, 0x54, 0xff, 0xaa, 0x55, 0x00 },
+ { 0x57, 0x3e, 0x85, 0xec, 0xf3, 0x9a, 0x21, 0x48,
+ 0x1f, 0x76, 0xcd, 0xa4, 0xbb, 0xd2, 0x69, 0x00 },
+ { 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00,
+ 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00 },
+ { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0x12, 0x44, 0xbe, 0xe8, 0x4a, 0x1c, 0xe6, 0xb0,
+ 0xa2, 0xf4, 0x0e, 0x58, 0xfa, 0xac, 0x56, 0x00 },
+ { 0x57, 0x3e, 0x85, 0xec, 0xf3, 0x9a, 0x21, 0x48,
+ 0x1f, 0x76, 0xcd, 0xa4, 0xbb, 0xd2, 0x69, 0x00 },
+ { 0xd0, 0xa0, 0x30, 0x40, 0x10, 0x60, 0xf0, 0x80,
+ 0x50, 0x20, 0xb0, 0xc0, 0x90, 0xe0, 0x70, 0x00 },
+ { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0x1d, 0x4a, 0xb3, 0xe4, 0x41, 0x16, 0xef, 0xb8,
+ 0xa5, 0xf2, 0x0b, 0x5c, 0xf9, 0xae, 0x57, 0x00 },
+ { 0x3e, 0x57, 0xf1, 0x98, 0xbd, 0xd4, 0x72, 0x1b,
+ 0x25, 0x4c, 0xea, 0x83, 0xa6, 0xcf, 0x69, 0x00 },
+ { 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00,
+ 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00 },
+ { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0x48, 0x10, 0xf8, 0xa0, 0x28, 0x70, 0x98, 0xc0,
+ 0x88, 0xd0, 0x38, 0x60, 0xe8, 0xb0, 0x58, 0x00 },
+ { 0x3e, 0x57, 0xf1, 0x98, 0xbd, 0xd4, 0x72, 0x1b,
+ 0x25, 0x4c, 0xea, 0x83, 0xa6, 0xcf, 0x69, 0x00 },
+ { 0x70, 0xe0, 0x50, 0xc0, 0x30, 0xa0, 0x10, 0x80,
+ 0xf0, 0x60, 0xd0, 0x40, 0xb0, 0x20, 0x90, 0x00 },
+ { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0x47, 0x1e, 0xf5, 0xac, 0x23, 0x7a, 0x91, 0xc8,
+ 0x8f, 0xd6, 0x3d, 0x64, 0xeb, 0xb2, 0x59, 0x00 },
+ { 0x23, 0x4a, 0xec, 0x85, 0xa0, 0xc9, 0x6f, 0x06,
+ 0x25, 0x4c, 0xea, 0x83, 0xa6, 0xcf, 0x69, 0x00 },
+ { 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00,
+ 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00 },
+ { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0x56, 0x0c, 0xe2, 0xb8, 0x3e, 0x64, 0x8a, 0xd0,
+ 0x86, 0xdc, 0x32, 0x68, 0xee, 0xb4, 0x5a, 0x00 },
+ { 0x23, 0x4a, 0xec, 0x85, 0xa0, 0xc9, 0x6f, 0x06,
+ 0x25, 0x4c, 0xea, 0x83, 0xa6, 0xcf, 0x69, 0x00 },
+ { 0x90, 0x20, 0xf0, 0x40, 0x50, 0xe0, 0x30, 0x80,
+ 0x10, 0xa0, 0x70, 0xc0, 0xd0, 0x60, 0xb0, 0x00 },
+ { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0x59, 0x02, 0xef, 0xb4, 0x35, 0x6e, 0x83, 0xd8,
+ 0x81, 0xda, 0x37, 0x6c, 0xed, 0xb6, 0x5b, 0x00 },
+ { 0x19, 0x70, 0xd6, 0xbf, 0x87, 0xee, 0x48, 0x21,
+ 0x38, 0x51, 0xf7, 0x9e, 0xa6, 0xcf, 0x69, 0x00 },
+ { 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00,
+ 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00 },
+ { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0x74, 0x28, 0xcc, 0x90, 0x04, 0x58, 0xbc, 0xe0,
+ 0x94, 0xc8, 0x2c, 0x70, 0xe4, 0xb8, 0x5c, 0x00 },
+ { 0x19, 0x70, 0xd6, 0xbf, 0x87, 0xee, 0x48, 0x21,
+ 0x38, 0x51, 0xf7, 0x9e, 0xa6, 0xcf, 0x69, 0x00 },
+ { 0xb0, 0x60, 0x10, 0xc0, 0xf0, 0x20, 0x50, 0x80,
+ 0x30, 0xe0, 0x90, 0x40, 0x70, 0xa0, 0xd0, 0x00 },
+ { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0x7b, 0x26, 0xc1, 0x9c, 0x0f, 0x52, 0xb5, 0xe8,
+ 0x93, 0xce, 0x29, 0x74, 0xe7, 0xba, 0x5d, 0x00 },
+ { 0x04, 0x6d, 0xcb, 0xa2, 0x9a, 0xf3, 0x55, 0x3c,
+ 0x38, 0x51, 0xf7, 0x9e, 0xa6, 0xcf, 0x69, 0x00 },
+ { 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00,
+ 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00 },
+ { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0x6a, 0x34, 0xd6, 0x88, 0x12, 0x4c, 0xae, 0xf0,
+ 0x9a, 0xc4, 0x26, 0x78, 0xe2, 0xbc, 0x5e, 0x00 },
+ { 0x04, 0x6d, 0xcb, 0xa2, 0x9a, 0xf3, 0x55, 0x3c,
+ 0x38, 0x51, 0xf7, 0x9e, 0xa6, 0xcf, 0x69, 0x00 },
+ { 0x50, 0xa0, 0xb0, 0x40, 0x90, 0x60, 0x70, 0x80,
+ 0xd0, 0x20, 0x30, 0xc0, 0x10, 0xe0, 0xf0, 0x00 },
+ { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0x65, 0x3a, 0xdb, 0x84, 0x19, 0x46, 0xa7, 0xf8,
+ 0x9d, 0xc2, 0x23, 0x7c, 0xe1, 0xbe, 0x5f, 0x00 },
+ { 0xbd, 0xf3, 0x21, 0x6f, 0x98, 0xd6, 0x04, 0x4a,
+ 0xf7, 0xb9, 0x6b, 0x25, 0xd2, 0x9c, 0x4e, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00,
+ 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00 },
+ { 0xbd, 0xf3, 0x21, 0x6f, 0x98, 0xd6, 0x04, 0x4a,
+ 0xf7, 0xb9, 0x6b, 0x25, 0xd2, 0x9c, 0x4e, 0x00 },
+ { 0xf0, 0xe0, 0xd0, 0xc0, 0xb0, 0xa0, 0x90, 0x80,
+ 0x70, 0x60, 0x50, 0x40, 0x30, 0x20, 0x10, 0x00 },
+ { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0x2f, 0x4e, 0xed, 0x8c, 0xab, 0xca, 0x69, 0x08,
+ 0x27, 0x46, 0xe5, 0x84, 0xa3, 0xc2, 0x61, 0x00 },
+ { 0xa0, 0xee, 0x3c, 0x72, 0x85, 0xcb, 0x19, 0x57,
+ 0xf7, 0xb9, 0x6b, 0x25, 0xd2, 0x9c, 0x4e, 0x00 },
+ { 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00,
+ 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00 },
+ { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0x3e, 0x5c, 0xfa, 0x98, 0xb6, 0xd4, 0x72, 0x10,
+ 0x2e, 0x4c, 0xea, 0x88, 0xa6, 0xc4, 0x62, 0x00 },
+ { 0xa0, 0xee, 0x3c, 0x72, 0x85, 0xcb, 0x19, 0x57,
+ 0xf7, 0xb9, 0x6b, 0x25, 0xd2, 0x9c, 0x4e, 0x00 },
+ { 0x10, 0x20, 0x70, 0x40, 0xd0, 0xe0, 0xb0, 0x80,
+ 0x90, 0xa0, 0xf0, 0xc0, 0x50, 0x60, 0x30, 0x00 },
+ { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0x31, 0x52, 0xf7, 0x94, 0xbd, 0xde, 0x7b, 0x18,
+ 0x29, 0x4a, 0xef, 0x8c, 0xa5, 0xc6, 0x63, 0x00 },
+ { 0x9a, 0xd4, 0x06, 0x48, 0xa2, 0xec, 0x3e, 0x70,
+ 0xea, 0xa4, 0x76, 0x38, 0xd2, 0x9c, 0x4e, 0x00 },
+ { 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00,
+ 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00 },
+ { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0x1c, 0x78, 0xd4, 0xb0, 0x8c, 0xe8, 0x44, 0x20,
+ 0x3c, 0x58, 0xf4, 0x90, 0xac, 0xc8, 0x64, 0x00 },
+ { 0x9a, 0xd4, 0x06, 0x48, 0xa2, 0xec, 0x3e, 0x70,
+ 0xea, 0xa4, 0x76, 0x38, 0xd2, 0x9c, 0x4e, 0x00 },
+ { 0x30, 0x60, 0x90, 0xc0, 0x70, 0x20, 0xd0, 0x80,
+ 0xb0, 0xe0, 0x10, 0x40, 0xf0, 0xa0, 0x50, 0x00 },
+ { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0x13, 0x76, 0xd9, 0xbc, 0x87, 0xe2, 0x4d, 0x28,
+ 0x3b, 0x5e, 0xf1, 0x94, 0xaf, 0xca, 0x65, 0x00 },
+ { 0x87, 0xc9, 0x1b, 0x55, 0xbf, 0xf1, 0x23, 0x6d,
+ 0xea, 0xa4, 0x76, 0x38, 0xd2, 0x9c, 0x4e, 0x00 },
+ { 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00,
+ 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00 },
+ { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0x02, 0x64, 0xce, 0xa8, 0x9a, 0xfc, 0x56, 0x30,
+ 0x32, 0x54, 0xfe, 0x98, 0xaa, 0xcc, 0x66, 0x00 },
+ { 0x87, 0xc9, 0x1b, 0x55, 0xbf, 0xf1, 0x23, 0x6d,
+ 0xea, 0xa4, 0x76, 0x38, 0xd2, 0x9c, 0x4e, 0x00 },
+ { 0xd0, 0xa0, 0x30, 0x40, 0x10, 0x60, 0xf0, 0x80,
+ 0x50, 0x20, 0xb0, 0xc0, 0x90, 0xe0, 0x70, 0x00 },
+ { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0x0d, 0x6a, 0xc3, 0xa4, 0x91, 0xf6, 0x5f, 0x38,
+ 0x35, 0x52, 0xfb, 0x9c, 0xa9, 0xce, 0x67, 0x00 },
+ { 0xee, 0xa0, 0x6f, 0x21, 0xf1, 0xbf, 0x70, 0x3e,
+ 0xd0, 0x9e, 0x51, 0x1f, 0xcf, 0x81, 0x4e, 0x00 },
+ { 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00,
+ 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00 },
+ { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0x58, 0x30, 0x88, 0xe0, 0xf8, 0x90, 0x28, 0x40,
+ 0x18, 0x70, 0xc8, 0xa0, 0xb8, 0xd0, 0x68, 0x00 },
+ { 0xee, 0xa0, 0x6f, 0x21, 0xf1, 0xbf, 0x70, 0x3e,
+ 0xd0, 0x9e, 0x51, 0x1f, 0xcf, 0x81, 0x4e, 0x00 },
+ { 0x70, 0xe0, 0x50, 0xc0, 0x30, 0xa0, 0x10, 0x80,
+ 0xf0, 0x60, 0xd0, 0x40, 0xb0, 0x20, 0x90, 0x00 },
+ { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0x57, 0x3e, 0x85, 0xec, 0xf3, 0x9a, 0x21, 0x48,
+ 0x1f, 0x76, 0xcd, 0xa4, 0xbb, 0xd2, 0x69, 0x00 },
+ { 0xf3, 0xbd, 0x72, 0x3c, 0xec, 0xa2, 0x6d, 0x23,
+ 0xd0, 0x9e, 0x51, 0x1f, 0xcf, 0x81, 0x4e, 0x00 },
+ { 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00,
+ 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00 },
+ { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0x46, 0x2c, 0x92, 0xf8, 0xee, 0x84, 0x3a, 0x50,
+ 0x16, 0x7c, 0xc2, 0xa8, 0xbe, 0xd4, 0x6a, 0x00 },
+ { 0xf3, 0xbd, 0x72, 0x3c, 0xec, 0xa2, 0x6d, 0x23,
+ 0xd0, 0x9e, 0x51, 0x1f, 0xcf, 0x81, 0x4e, 0x00 },
+ { 0x90, 0x20, 0xf0, 0x40, 0x50, 0xe0, 0x30, 0x80,
+ 0x10, 0xa0, 0x70, 0xc0, 0xd0, 0x60, 0xb0, 0x00 },
+ { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0x49, 0x22, 0x9f, 0xf4, 0xe5, 0x8e, 0x33, 0x58,
+ 0x11, 0x7a, 0xc7, 0xac, 0xbd, 0xd6, 0x6b, 0x00 },
+ { 0xc9, 0x87, 0x48, 0x06, 0xcb, 0x85, 0x4a, 0x04,
+ 0xcd, 0x83, 0x4c, 0x02, 0xcf, 0x81, 0x4e, 0x00 },
+ { 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00,
+ 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00 },
+ { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0x64, 0x08, 0xbc, 0xd0, 0xd4, 0xb8, 0x0c, 0x60,
+ 0x04, 0x68, 0xdc, 0xb0, 0xb4, 0xd8, 0x6c, 0x00 },
+ { 0xc9, 0x87, 0x48, 0x06, 0xcb, 0x85, 0x4a, 0x04,
+ 0xcd, 0x83, 0x4c, 0x02, 0xcf, 0x81, 0x4e, 0x00 },
+ { 0xb0, 0x60, 0x10, 0xc0, 0xf0, 0x20, 0x50, 0x80,
+ 0x30, 0xe0, 0x90, 0x40, 0x70, 0xa0, 0xd0, 0x00 },
+ { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0x6b, 0x06, 0xb1, 0xdc, 0xdf, 0xb2, 0x05, 0x68,
+ 0x03, 0x6e, 0xd9, 0xb4, 0xb7, 0xda, 0x6d, 0x00 },
+ { 0xd4, 0x9a, 0x55, 0x1b, 0xd6, 0x98, 0x57, 0x19,
+ 0xcd, 0x83, 0x4c, 0x02, 0xcf, 0x81, 0x4e, 0x00 },
+ { 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00,
+ 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00 },
+ { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0x7a, 0x14, 0xa6, 0xc8, 0xc2, 0xac, 0x1e, 0x70,
+ 0x0a, 0x64, 0xd6, 0xb8, 0xb2, 0xdc, 0x6e, 0x00 },
+ { 0xd4, 0x9a, 0x55, 0x1b, 0xd6, 0x98, 0x57, 0x19,
+ 0xcd, 0x83, 0x4c, 0x02, 0xcf, 0x81, 0x4e, 0x00 },
+ { 0x50, 0xa0, 0xb0, 0x40, 0x90, 0x60, 0x70, 0x80,
+ 0xd0, 0x20, 0x30, 0xc0, 0x10, 0xe0, 0xf0, 0x00 },
+ { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0x75, 0x1a, 0xab, 0xc4, 0xc9, 0xa6, 0x17, 0x78,
+ 0x0d, 0x62, 0xd3, 0xbc, 0xb1, 0xde, 0x6f, 0x00 },
+ { 0x06, 0x55, 0xa0, 0xf3, 0x57, 0x04, 0xf1, 0xa2,
+ 0xa4, 0xf7, 0x02, 0x51, 0xf5, 0xa6, 0x53, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0xd0, 0xa0, 0x30, 0x40, 0x10, 0x60, 0xf0, 0x80,
+ 0x50, 0x20, 0xb0, 0xc0, 0x90, 0xe0, 0x70, 0x00 },
+ { 0x06, 0x55, 0xa0, 0xf3, 0x57, 0x04, 0xf1, 0xa2,
+ 0xa4, 0xf7, 0x02, 0x51, 0xf5, 0xa6, 0x53, 0x00 },
+ { 0xf0, 0xe0, 0xd0, 0xc0, 0xb0, 0xa0, 0x90, 0x80,
+ 0x70, 0x60, 0x50, 0x40, 0x30, 0x20, 0x10, 0x00 },
+ { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0xdf, 0xae, 0x3d, 0x4c, 0x1b, 0x6a, 0xf9, 0x88,
+ 0x57, 0x26, 0xb5, 0xc4, 0x93, 0xe2, 0x71, 0x00 },
+ { 0x1b, 0x48, 0xbd, 0xee, 0x4a, 0x19, 0xec, 0xbf,
+ 0xa4, 0xf7, 0x02, 0x51, 0xf5, 0xa6, 0x53, 0x00 },
+ { 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00,
+ 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00 },
+ { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0xce, 0xbc, 0x2a, 0x58, 0x06, 0x74, 0xe2, 0x90,
+ 0x5e, 0x2c, 0xba, 0xc8, 0x96, 0xe4, 0x72, 0x00 },
+ { 0x1b, 0x48, 0xbd, 0xee, 0x4a, 0x19, 0xec, 0xbf,
+ 0xa4, 0xf7, 0x02, 0x51, 0xf5, 0xa6, 0x53, 0x00 },
+ { 0x10, 0x20, 0x70, 0x40, 0xd0, 0xe0, 0xb0, 0x80,
+ 0x90, 0xa0, 0xf0, 0xc0, 0x50, 0x60, 0x30, 0x00 },
+ { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0xc1, 0xb2, 0x27, 0x54, 0x0d, 0x7e, 0xeb, 0x98,
+ 0x59, 0x2a, 0xbf, 0xcc, 0x95, 0xe6, 0x73, 0x00 },
+ { 0x21, 0x72, 0x87, 0xd4, 0x6d, 0x3e, 0xcb, 0x98,
+ 0xb9, 0xea, 0x1f, 0x4c, 0xf5, 0xa6, 0x53, 0x00 },
+ { 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00,
+ 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00 },
+ { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0xec, 0x98, 0x04, 0x70, 0x3c, 0x48, 0xd4, 0xa0,
+ 0x4c, 0x38, 0xa4, 0xd0, 0x9c, 0xe8, 0x74, 0x00 },
+ { 0x21, 0x72, 0x87, 0xd4, 0x6d, 0x3e, 0xcb, 0x98,
+ 0xb9, 0xea, 0x1f, 0x4c, 0xf5, 0xa6, 0x53, 0x00 },
+ { 0x30, 0x60, 0x90, 0xc0, 0x70, 0x20, 0xd0, 0x80,
+ 0xb0, 0xe0, 0x10, 0x40, 0xf0, 0xa0, 0x50, 0x00 },
+ { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0xe3, 0x96, 0x09, 0x7c, 0x37, 0x42, 0xdd, 0xa8,
+ 0x4b, 0x3e, 0xa1, 0xd4, 0x9f, 0xea, 0x75, 0x00 },
+ { 0x3c, 0x6f, 0x9a, 0xc9, 0x70, 0x23, 0xd6, 0x85,
+ 0xb9, 0xea, 0x1f, 0x4c, 0xf5, 0xa6, 0x53, 0x00 },
+ { 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00,
+ 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00 },
+ { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0xf2, 0x84, 0x1e, 0x68, 0x2a, 0x5c, 0xc6, 0xb0,
+ 0x42, 0x34, 0xae, 0xd8, 0x9a, 0xec, 0x76, 0x00 },
+ { 0x3c, 0x6f, 0x9a, 0xc9, 0x70, 0x23, 0xd6, 0x85,
+ 0xb9, 0xea, 0x1f, 0x4c, 0xf5, 0xa6, 0x53, 0x00 },
+ { 0xd0, 0xa0, 0x30, 0x40, 0x10, 0x60, 0xf0, 0x80,
+ 0x50, 0x20, 0xb0, 0xc0, 0x90, 0xe0, 0x70, 0x00 },
+ { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0xfd, 0x8a, 0x13, 0x64, 0x21, 0x56, 0xcf, 0xb8,
+ 0x45, 0x32, 0xab, 0xdc, 0x99, 0xee, 0x77, 0x00 },
+ { 0x55, 0x06, 0xee, 0xbd, 0x3e, 0x6d, 0x85, 0xd6,
+ 0x83, 0xd0, 0x38, 0x6b, 0xe8, 0xbb, 0x53, 0x00 },
+ { 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00,
+ 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00 },
+ { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0xa8, 0xd0, 0x58, 0x20, 0x48, 0x30, 0xb8, 0xc0,
+ 0x68, 0x10, 0x98, 0xe0, 0x88, 0xf0, 0x78, 0x00 },
+ { 0x55, 0x06, 0xee, 0xbd, 0x3e, 0x6d, 0x85, 0xd6,
+ 0x83, 0xd0, 0x38, 0x6b, 0xe8, 0xbb, 0x53, 0x00 },
+ { 0x70, 0xe0, 0x50, 0xc0, 0x30, 0xa0, 0x10, 0x80,
+ 0xf0, 0x60, 0xd0, 0x40, 0xb0, 0x20, 0x90, 0x00 },
+ { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0xa7, 0xde, 0x55, 0x2c, 0x43, 0x3a, 0xb1, 0xc8,
+ 0x6f, 0x16, 0x9d, 0xe4, 0x8b, 0xf2, 0x79, 0x00 },
+ { 0x48, 0x1b, 0xf3, 0xa0, 0x23, 0x70, 0x98, 0xcb,
+ 0x83, 0xd0, 0x38, 0x6b, 0xe8, 0xbb, 0x53, 0x00 },
+ { 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00,
+ 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00 },
+ { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0xb6, 0xcc, 0x42, 0x38, 0x5e, 0x24, 0xaa, 0xd0,
+ 0x66, 0x1c, 0x92, 0xe8, 0x8e, 0xf4, 0x7a, 0x00 },
+ { 0x48, 0x1b, 0xf3, 0xa0, 0x23, 0x70, 0x98, 0xcb,
+ 0x83, 0xd0, 0x38, 0x6b, 0xe8, 0xbb, 0x53, 0x00 },
+ { 0x90, 0x20, 0xf0, 0x40, 0x50, 0xe0, 0x30, 0x80,
+ 0x10, 0xa0, 0x70, 0xc0, 0xd0, 0x60, 0xb0, 0x00 },
+ { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0xb9, 0xc2, 0x4f, 0x34, 0x55, 0x2e, 0xa3, 0xd8,
+ 0x61, 0x1a, 0x97, 0xec, 0x8d, 0xf6, 0x7b, 0x00 },
+ { 0x72, 0x21, 0xc9, 0x9a, 0x04, 0x57, 0xbf, 0xec,
+ 0x9e, 0xcd, 0x25, 0x76, 0xe8, 0xbb, 0x53, 0x00 },
+ { 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00,
+ 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00 },
+ { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0x94, 0xe8, 0x6c, 0x10, 0x64, 0x18, 0x9c, 0xe0,
+ 0x74, 0x08, 0x8c, 0xf0, 0x84, 0xf8, 0x7c, 0x00 },
+ { 0x72, 0x21, 0xc9, 0x9a, 0x04, 0x57, 0xbf, 0xec,
+ 0x9e, 0xcd, 0x25, 0x76, 0xe8, 0xbb, 0x53, 0x00 },
+ { 0xb0, 0x60, 0x10, 0xc0, 0xf0, 0x20, 0x50, 0x80,
+ 0x30, 0xe0, 0x90, 0x40, 0x70, 0xa0, 0xd0, 0x00 },
+ { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0x9b, 0xe6, 0x61, 0x1c, 0x6f, 0x12, 0x95, 0xe8,
+ 0x73, 0x0e, 0x89, 0xf4, 0x87, 0xfa, 0x7d, 0x00 },
+ { 0x6f, 0x3c, 0xd4, 0x87, 0x19, 0x4a, 0xa2, 0xf1,
+ 0x9e, 0xcd, 0x25, 0x76, 0xe8, 0xbb, 0x53, 0x00 },
+ { 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00,
+ 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00 },
+ { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0x8a, 0xf4, 0x76, 0x08, 0x72, 0x0c, 0x8e, 0xf0,
+ 0x7a, 0x04, 0x86, 0xf8, 0x82, 0xfc, 0x7e, 0x00 },
+ { 0x6f, 0x3c, 0xd4, 0x87, 0x19, 0x4a, 0xa2, 0xf1,
+ 0x9e, 0xcd, 0x25, 0x76, 0xe8, 0xbb, 0x53, 0x00 },
+ { 0x50, 0xa0, 0xb0, 0x40, 0x90, 0x60, 0x70, 0x80,
+ 0xd0, 0x20, 0x30, 0xc0, 0x10, 0xe0, 0xf0, 0x00 },
+ { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 },
+ { 0x85, 0xfa, 0x7b, 0x04, 0x79, 0x06, 0x87, 0xf8,
+ 0x7d, 0x02, 0x83, 0xfc, 0x81, 0xfe, 0x7f, 0x00 },
+ { 0xb1, 0x59, 0x7c, 0x94, 0x36, 0xde, 0xfb, 0x13,
+ 0xa2, 0x4a, 0x6f, 0x87, 0x25, 0xcd, 0xe8, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00,
+ 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00 },
+ { 0xb1, 0x59, 0x7c, 0x94, 0x36, 0xde, 0xfb, 0x13,
+ 0xa2, 0x4a, 0x6f, 0x87, 0x25, 0xcd, 0xe8, 0x00 },
+ { 0xf0, 0xe0, 0xd0, 0xc0, 0xb0, 0xa0, 0x90, 0x80,
+ 0x70, 0x60, 0x50, 0x40, 0x30, 0x20, 0x10, 0x00 },
+ { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x8f, 0x0e, 0x8d, 0x0c, 0x8b, 0x0a, 0x89, 0x08,
+ 0x87, 0x06, 0x85, 0x04, 0x83, 0x02, 0x81, 0x00 },
+ { 0xac, 0x44, 0x61, 0x89, 0x2b, 0xc3, 0xe6, 0x0e,
+ 0xa2, 0x4a, 0x6f, 0x87, 0x25, 0xcd, 0xe8, 0x00 },
+ { 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00,
+ 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00 },
+ { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x9e, 0x1c, 0x9a, 0x18, 0x96, 0x14, 0x92, 0x10,
+ 0x8e, 0x0c, 0x8a, 0x08, 0x86, 0x04, 0x82, 0x00 },
+ { 0xac, 0x44, 0x61, 0x89, 0x2b, 0xc3, 0xe6, 0x0e,
+ 0xa2, 0x4a, 0x6f, 0x87, 0x25, 0xcd, 0xe8, 0x00 },
+ { 0x10, 0x20, 0x70, 0x40, 0xd0, 0xe0, 0xb0, 0x80,
+ 0x90, 0xa0, 0xf0, 0xc0, 0x50, 0x60, 0x30, 0x00 },
+ { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x91, 0x12, 0x97, 0x14, 0x9d, 0x1e, 0x9b, 0x18,
+ 0x89, 0x0a, 0x8f, 0x0c, 0x85, 0x06, 0x83, 0x00 },
+ { 0x96, 0x7e, 0x5b, 0xb3, 0x0c, 0xe4, 0xc1, 0x29,
+ 0xbf, 0x57, 0x72, 0x9a, 0x25, 0xcd, 0xe8, 0x00 },
+ { 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00,
+ 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00 },
+ { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0xbc, 0x38, 0xb4, 0x30, 0xac, 0x28, 0xa4, 0x20,
+ 0x9c, 0x18, 0x94, 0x10, 0x8c, 0x08, 0x84, 0x00 },
+ { 0x96, 0x7e, 0x5b, 0xb3, 0x0c, 0xe4, 0xc1, 0x29,
+ 0xbf, 0x57, 0x72, 0x9a, 0x25, 0xcd, 0xe8, 0x00 },
+ { 0x30, 0x60, 0x90, 0xc0, 0x70, 0x20, 0xd0, 0x80,
+ 0xb0, 0xe0, 0x10, 0x40, 0xf0, 0xa0, 0x50, 0x00 },
+ { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0xb3, 0x36, 0xb9, 0x3c, 0xa7, 0x22, 0xad, 0x28,
+ 0x9b, 0x1e, 0x91, 0x14, 0x8f, 0x0a, 0x85, 0x00 },
+ { 0x8b, 0x63, 0x46, 0xae, 0x11, 0xf9, 0xdc, 0x34,
+ 0xbf, 0x57, 0x72, 0x9a, 0x25, 0xcd, 0xe8, 0x00 },
+ { 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00,
+ 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00 },
+ { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0xa2, 0x24, 0xae, 0x28, 0xba, 0x3c, 0xb6, 0x30,
+ 0x92, 0x14, 0x9e, 0x18, 0x8a, 0x0c, 0x86, 0x00 },
+ { 0x8b, 0x63, 0x46, 0xae, 0x11, 0xf9, 0xdc, 0x34,
+ 0xbf, 0x57, 0x72, 0x9a, 0x25, 0xcd, 0xe8, 0x00 },
+ { 0xd0, 0xa0, 0x30, 0x40, 0x10, 0x60, 0xf0, 0x80,
+ 0x50, 0x20, 0xb0, 0xc0, 0x90, 0xe0, 0x70, 0x00 },
+ { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0xad, 0x2a, 0xa3, 0x24, 0xb1, 0x36, 0xbf, 0x38,
+ 0x95, 0x12, 0x9b, 0x1c, 0x89, 0x0e, 0x87, 0x00 },
+ { 0xe2, 0x0a, 0x32, 0xda, 0x5f, 0xb7, 0x8f, 0x67,
+ 0x85, 0x6d, 0x55, 0xbd, 0x38, 0xd0, 0xe8, 0x00 },
+ { 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00,
+ 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00 },
+ { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0xf8, 0x70, 0xe8, 0x60, 0xd8, 0x50, 0xc8, 0x40,
+ 0xb8, 0x30, 0xa8, 0x20, 0x98, 0x10, 0x88, 0x00 },
+ { 0xe2, 0x0a, 0x32, 0xda, 0x5f, 0xb7, 0x8f, 0x67,
+ 0x85, 0x6d, 0x55, 0xbd, 0x38, 0xd0, 0xe8, 0x00 },
+ { 0x70, 0xe0, 0x50, 0xc0, 0x30, 0xa0, 0x10, 0x80,
+ 0xf0, 0x60, 0xd0, 0x40, 0xb0, 0x20, 0x90, 0x00 },
+ { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0xf7, 0x7e, 0xe5, 0x6c, 0xd3, 0x5a, 0xc1, 0x48,
+ 0xbf, 0x36, 0xad, 0x24, 0x9b, 0x12, 0x89, 0x00 },
+ { 0xff, 0x17, 0x2f, 0xc7, 0x42, 0xaa, 0x92, 0x7a,
+ 0x85, 0x6d, 0x55, 0xbd, 0x38, 0xd0, 0xe8, 0x00 },
+ { 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00,
+ 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00 },
+ { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0xe6, 0x6c, 0xf2, 0x78, 0xce, 0x44, 0xda, 0x50,
+ 0xb6, 0x3c, 0xa2, 0x28, 0x9e, 0x14, 0x8a, 0x00 },
+ { 0xff, 0x17, 0x2f, 0xc7, 0x42, 0xaa, 0x92, 0x7a,
+ 0x85, 0x6d, 0x55, 0xbd, 0x38, 0xd0, 0xe8, 0x00 },
+ { 0x90, 0x20, 0xf0, 0x40, 0x50, 0xe0, 0x30, 0x80,
+ 0x10, 0xa0, 0x70, 0xc0, 0xd0, 0x60, 0xb0, 0x00 },
+ { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0xe9, 0x62, 0xff, 0x74, 0xc5, 0x4e, 0xd3, 0x58,
+ 0xb1, 0x3a, 0xa7, 0x2c, 0x9d, 0x16, 0x8b, 0x00 },
+ { 0xc5, 0x2d, 0x15, 0xfd, 0x65, 0x8d, 0xb5, 0x5d,
+ 0x98, 0x70, 0x48, 0xa0, 0x38, 0xd0, 0xe8, 0x00 },
+ { 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00,
+ 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00 },
+ { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0xc4, 0x48, 0xdc, 0x50, 0xf4, 0x78, 0xec, 0x60,
+ 0xa4, 0x28, 0xbc, 0x30, 0x94, 0x18, 0x8c, 0x00 },
+ { 0xc5, 0x2d, 0x15, 0xfd, 0x65, 0x8d, 0xb5, 0x5d,
+ 0x98, 0x70, 0x48, 0xa0, 0x38, 0xd0, 0xe8, 0x00 },
+ { 0xb0, 0x60, 0x10, 0xc0, 0xf0, 0x20, 0x50, 0x80,
+ 0x30, 0xe0, 0x90, 0x40, 0x70, 0xa0, 0xd0, 0x00 },
+ { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0xcb, 0x46, 0xd1, 0x5c, 0xff, 0x72, 0xe5, 0x68,
+ 0xa3, 0x2e, 0xb9, 0x34, 0x97, 0x1a, 0x8d, 0x00 },
+ { 0xd8, 0x30, 0x08, 0xe0, 0x78, 0x90, 0xa8, 0x40,
+ 0x98, 0x70, 0x48, 0xa0, 0x38, 0xd0, 0xe8, 0x00 },
+ { 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00,
+ 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00 },
+ { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0xda, 0x54, 0xc6, 0x48, 0xe2, 0x6c, 0xfe, 0x70,
+ 0xaa, 0x24, 0xb6, 0x38, 0x92, 0x1c, 0x8e, 0x00 },
+ { 0xd8, 0x30, 0x08, 0xe0, 0x78, 0x90, 0xa8, 0x40,
+ 0x98, 0x70, 0x48, 0xa0, 0x38, 0xd0, 0xe8, 0x00 },
+ { 0x50, 0xa0, 0xb0, 0x40, 0x90, 0x60, 0x70, 0x80,
+ 0xd0, 0x20, 0x30, 0xc0, 0x10, 0xe0, 0xf0, 0x00 },
+ { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0xd5, 0x5a, 0xcb, 0x44, 0xe9, 0x66, 0xf7, 0x78,
+ 0xad, 0x22, 0xb3, 0x3c, 0x91, 0x1e, 0x8f, 0x00 },
+ { 0x0a, 0xff, 0xfd, 0x08, 0xf9, 0x0c, 0x0e, 0xfb,
+ 0xf1, 0x04, 0x06, 0xf3, 0x02, 0xf7, 0xf5, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x70, 0xe0, 0x50, 0xc0, 0x30, 0xa0, 0x10, 0x80,
+ 0xf0, 0x60, 0xd0, 0x40, 0xb0, 0x20, 0x90, 0x00 },
+ { 0x0a, 0xff, 0xfd, 0x08, 0xf9, 0x0c, 0x0e, 0xfb,
+ 0xf1, 0x04, 0x06, 0xf3, 0x02, 0xf7, 0xf5, 0x00 },
+ { 0xf0, 0xe0, 0xd0, 0xc0, 0xb0, 0xa0, 0x90, 0x80,
+ 0x70, 0x60, 0x50, 0x40, 0x30, 0x20, 0x10, 0x00 },
+ { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x7f, 0xee, 0x5d, 0xcc, 0x3b, 0xaa, 0x19, 0x88,
+ 0xf7, 0x66, 0xd5, 0x44, 0xb3, 0x22, 0x91, 0x00 },
+ { 0x17, 0xe2, 0xe0, 0x15, 0xe4, 0x11, 0x13, 0xe6,
+ 0xf1, 0x04, 0x06, 0xf3, 0x02, 0xf7, 0xf5, 0x00 },
+ { 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00,
+ 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00 },
+ { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x6e, 0xfc, 0x4a, 0xd8, 0x26, 0xb4, 0x02, 0x90,
+ 0xfe, 0x6c, 0xda, 0x48, 0xb6, 0x24, 0x92, 0x00 },
+ { 0x17, 0xe2, 0xe0, 0x15, 0xe4, 0x11, 0x13, 0xe6,
+ 0xf1, 0x04, 0x06, 0xf3, 0x02, 0xf7, 0xf5, 0x00 },
+ { 0x10, 0x20, 0x70, 0x40, 0xd0, 0xe0, 0xb0, 0x80,
+ 0x90, 0xa0, 0xf0, 0xc0, 0x50, 0x60, 0x30, 0x00 },
+ { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x61, 0xf2, 0x47, 0xd4, 0x2d, 0xbe, 0x0b, 0x98,
+ 0xf9, 0x6a, 0xdf, 0x4c, 0xb5, 0x26, 0x93, 0x00 },
+ { 0x2d, 0xd8, 0xda, 0x2f, 0xc3, 0x36, 0x34, 0xc1,
+ 0xec, 0x19, 0x1b, 0xee, 0x02, 0xf7, 0xf5, 0x00 },
+ { 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00,
+ 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00 },
+ { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x4c, 0xd8, 0x64, 0xf0, 0x1c, 0x88, 0x34, 0xa0,
+ 0xec, 0x78, 0xc4, 0x50, 0xbc, 0x28, 0x94, 0x00 },
+ { 0x2d, 0xd8, 0xda, 0x2f, 0xc3, 0x36, 0x34, 0xc1,
+ 0xec, 0x19, 0x1b, 0xee, 0x02, 0xf7, 0xf5, 0x00 },
+ { 0x30, 0x60, 0x90, 0xc0, 0x70, 0x20, 0xd0, 0x80,
+ 0xb0, 0xe0, 0x10, 0x40, 0xf0, 0xa0, 0x50, 0x00 },
+ { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x43, 0xd6, 0x69, 0xfc, 0x17, 0x82, 0x3d, 0xa8,
+ 0xeb, 0x7e, 0xc1, 0x54, 0xbf, 0x2a, 0x95, 0x00 },
+ { 0x30, 0xc5, 0xc7, 0x32, 0xde, 0x2b, 0x29, 0xdc,
+ 0xec, 0x19, 0x1b, 0xee, 0x02, 0xf7, 0xf5, 0x00 },
+ { 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00,
+ 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00 },
+ { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x52, 0xc4, 0x7e, 0xe8, 0x0a, 0x9c, 0x26, 0xb0,
+ 0xe2, 0x74, 0xce, 0x58, 0xba, 0x2c, 0x96, 0x00 },
+ { 0x30, 0xc5, 0xc7, 0x32, 0xde, 0x2b, 0x29, 0xdc,
+ 0xec, 0x19, 0x1b, 0xee, 0x02, 0xf7, 0xf5, 0x00 },
+ { 0xd0, 0xa0, 0x30, 0x40, 0x10, 0x60, 0xf0, 0x80,
+ 0x50, 0x20, 0xb0, 0xc0, 0x90, 0xe0, 0x70, 0x00 },
+ { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x5d, 0xca, 0x73, 0xe4, 0x01, 0x96, 0x2f, 0xb8,
+ 0xe5, 0x72, 0xcb, 0x5c, 0xb9, 0x2e, 0x97, 0x00 },
+ { 0x59, 0xac, 0xb3, 0x46, 0x90, 0x65, 0x7a, 0x8f,
+ 0xd6, 0x23, 0x3c, 0xc9, 0x1f, 0xea, 0xf5, 0x00 },
+ { 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00,
+ 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00 },
+ { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x08, 0x90, 0x38, 0xa0, 0x68, 0xf0, 0x58, 0xc0,
+ 0xc8, 0x50, 0xf8, 0x60, 0xa8, 0x30, 0x98, 0x00 },
+ { 0x59, 0xac, 0xb3, 0x46, 0x90, 0x65, 0x7a, 0x8f,
+ 0xd6, 0x23, 0x3c, 0xc9, 0x1f, 0xea, 0xf5, 0x00 },
+ { 0x70, 0xe0, 0x50, 0xc0, 0x30, 0xa0, 0x10, 0x80,
+ 0xf0, 0x60, 0xd0, 0x40, 0xb0, 0x20, 0x90, 0x00 },
+ { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x07, 0x9e, 0x35, 0xac, 0x63, 0xfa, 0x51, 0xc8,
+ 0xcf, 0x56, 0xfd, 0x64, 0xab, 0x32, 0x99, 0x00 },
+ { 0x44, 0xb1, 0xae, 0x5b, 0x8d, 0x78, 0x67, 0x92,
+ 0xd6, 0x23, 0x3c, 0xc9, 0x1f, 0xea, 0xf5, 0x00 },
+ { 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00,
+ 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00 },
+ { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x16, 0x8c, 0x22, 0xb8, 0x7e, 0xe4, 0x4a, 0xd0,
+ 0xc6, 0x5c, 0xf2, 0x68, 0xae, 0x34, 0x9a, 0x00 },
+ { 0x44, 0xb1, 0xae, 0x5b, 0x8d, 0x78, 0x67, 0x92,
+ 0xd6, 0x23, 0x3c, 0xc9, 0x1f, 0xea, 0xf5, 0x00 },
+ { 0x90, 0x20, 0xf0, 0x40, 0x50, 0xe0, 0x30, 0x80,
+ 0x10, 0xa0, 0x70, 0xc0, 0xd0, 0x60, 0xb0, 0x00 },
+ { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x19, 0x82, 0x2f, 0xb4, 0x75, 0xee, 0x43, 0xd8,
+ 0xc1, 0x5a, 0xf7, 0x6c, 0xad, 0x36, 0x9b, 0x00 },
+ { 0x7e, 0x8b, 0x94, 0x61, 0xaa, 0x5f, 0x40, 0xb5,
+ 0xcb, 0x3e, 0x21, 0xd4, 0x1f, 0xea, 0xf5, 0x00 },
+ { 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00,
+ 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00 },
+ { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x34, 0xa8, 0x0c, 0x90, 0x44, 0xd8, 0x7c, 0xe0,
+ 0xd4, 0x48, 0xec, 0x70, 0xa4, 0x38, 0x9c, 0x00 },
+ { 0x7e, 0x8b, 0x94, 0x61, 0xaa, 0x5f, 0x40, 0xb5,
+ 0xcb, 0x3e, 0x21, 0xd4, 0x1f, 0xea, 0xf5, 0x00 },
+ { 0xb0, 0x60, 0x10, 0xc0, 0xf0, 0x20, 0x50, 0x80,
+ 0x30, 0xe0, 0x90, 0x40, 0x70, 0xa0, 0xd0, 0x00 },
+ { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x3b, 0xa6, 0x01, 0x9c, 0x4f, 0xd2, 0x75, 0xe8,
+ 0xd3, 0x4e, 0xe9, 0x74, 0xa7, 0x3a, 0x9d, 0x00 },
+ { 0x63, 0x96, 0x89, 0x7c, 0xb7, 0x42, 0x5d, 0xa8,
+ 0xcb, 0x3e, 0x21, 0xd4, 0x1f, 0xea, 0xf5, 0x00 },
+ { 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00,
+ 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00 },
+ { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x2a, 0xb4, 0x16, 0x88, 0x52, 0xcc, 0x6e, 0xf0,
+ 0xda, 0x44, 0xe6, 0x78, 0xa2, 0x3c, 0x9e, 0x00 },
+ { 0x63, 0x96, 0x89, 0x7c, 0xb7, 0x42, 0x5d, 0xa8,
+ 0xcb, 0x3e, 0x21, 0xd4, 0x1f, 0xea, 0xf5, 0x00 },
+ { 0x50, 0xa0, 0xb0, 0x40, 0x90, 0x60, 0x70, 0x80,
+ 0xd0, 0x20, 0x30, 0xc0, 0x10, 0xe0, 0xf0, 0x00 },
+ { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x25, 0xba, 0x1b, 0x84, 0x59, 0xc6, 0x67, 0xf8,
+ 0xdd, 0x42, 0xe3, 0x7c, 0xa1, 0x3e, 0x9f, 0x00 },
+ { 0xda, 0x08, 0x63, 0xb1, 0xb5, 0x67, 0x0c, 0xde,
+ 0x04, 0xd6, 0xbd, 0x6f, 0x6b, 0xb9, 0xd2, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00,
+ 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00 },
+ { 0xda, 0x08, 0x63, 0xb1, 0xb5, 0x67, 0x0c, 0xde,
+ 0x04, 0xd6, 0xbd, 0x6f, 0x6b, 0xb9, 0xd2, 0x00 },
+ { 0xf0, 0xe0, 0xd0, 0xc0, 0xb0, 0xa0, 0x90, 0x80,
+ 0x70, 0x60, 0x50, 0x40, 0x30, 0x20, 0x10, 0x00 },
+ { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x6f, 0xce, 0x2d, 0x8c, 0xeb, 0x4a, 0xa9, 0x08,
+ 0x67, 0xc6, 0x25, 0x84, 0xe3, 0x42, 0xa1, 0x00 },
+ { 0xc7, 0x15, 0x7e, 0xac, 0xa8, 0x7a, 0x11, 0xc3,
+ 0x04, 0xd6, 0xbd, 0x6f, 0x6b, 0xb9, 0xd2, 0x00 },
+ { 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00,
+ 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00 },
+ { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x7e, 0xdc, 0x3a, 0x98, 0xf6, 0x54, 0xb2, 0x10,
+ 0x6e, 0xcc, 0x2a, 0x88, 0xe6, 0x44, 0xa2, 0x00 },
+ { 0xc7, 0x15, 0x7e, 0xac, 0xa8, 0x7a, 0x11, 0xc3,
+ 0x04, 0xd6, 0xbd, 0x6f, 0x6b, 0xb9, 0xd2, 0x00 },
+ { 0x10, 0x20, 0x70, 0x40, 0xd0, 0xe0, 0xb0, 0x80,
+ 0x90, 0xa0, 0xf0, 0xc0, 0x50, 0x60, 0x30, 0x00 },
+ { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x71, 0xd2, 0x37, 0x94, 0xfd, 0x5e, 0xbb, 0x18,
+ 0x69, 0xca, 0x2f, 0x8c, 0xe5, 0x46, 0xa3, 0x00 },
+ { 0xfd, 0x2f, 0x44, 0x96, 0x8f, 0x5d, 0x36, 0xe4,
+ 0x19, 0xcb, 0xa0, 0x72, 0x6b, 0xb9, 0xd2, 0x00 },
+ { 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00,
+ 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00 },
+ { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x5c, 0xf8, 0x14, 0xb0, 0xcc, 0x68, 0x84, 0x20,
+ 0x7c, 0xd8, 0x34, 0x90, 0xec, 0x48, 0xa4, 0x00 },
+ { 0xfd, 0x2f, 0x44, 0x96, 0x8f, 0x5d, 0x36, 0xe4,
+ 0x19, 0xcb, 0xa0, 0x72, 0x6b, 0xb9, 0xd2, 0x00 },
+ { 0x30, 0x60, 0x90, 0xc0, 0x70, 0x20, 0xd0, 0x80,
+ 0xb0, 0xe0, 0x10, 0x40, 0xf0, 0xa0, 0x50, 0x00 },
+ { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x53, 0xf6, 0x19, 0xbc, 0xc7, 0x62, 0x8d, 0x28,
+ 0x7b, 0xde, 0x31, 0x94, 0xef, 0x4a, 0xa5, 0x00 },
+ { 0xe0, 0x32, 0x59, 0x8b, 0x92, 0x40, 0x2b, 0xf9,
+ 0x19, 0xcb, 0xa0, 0x72, 0x6b, 0xb9, 0xd2, 0x00 },
+ { 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00,
+ 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00 },
+ { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x42, 0xe4, 0x0e, 0xa8, 0xda, 0x7c, 0x96, 0x30,
+ 0x72, 0xd4, 0x3e, 0x98, 0xea, 0x4c, 0xa6, 0x00 },
+ { 0xe0, 0x32, 0x59, 0x8b, 0x92, 0x40, 0x2b, 0xf9,
+ 0x19, 0xcb, 0xa0, 0x72, 0x6b, 0xb9, 0xd2, 0x00 },
+ { 0xd0, 0xa0, 0x30, 0x40, 0x10, 0x60, 0xf0, 0x80,
+ 0x50, 0x20, 0xb0, 0xc0, 0x90, 0xe0, 0x70, 0x00 },
+ { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x4d, 0xea, 0x03, 0xa4, 0xd1, 0x76, 0x9f, 0x38,
+ 0x75, 0xd2, 0x3b, 0x9c, 0xe9, 0x4e, 0xa7, 0x00 },
+ { 0x89, 0x5b, 0x2d, 0xff, 0xdc, 0x0e, 0x78, 0xaa,
+ 0x23, 0xf1, 0x87, 0x55, 0x76, 0xa4, 0xd2, 0x00 },
+ { 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00,
+ 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00 },
+ { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x18, 0xb0, 0x48, 0xe0, 0xb8, 0x10, 0xe8, 0x40,
+ 0x58, 0xf0, 0x08, 0xa0, 0xf8, 0x50, 0xa8, 0x00 },
+ { 0x89, 0x5b, 0x2d, 0xff, 0xdc, 0x0e, 0x78, 0xaa,
+ 0x23, 0xf1, 0x87, 0x55, 0x76, 0xa4, 0xd2, 0x00 },
+ { 0x70, 0xe0, 0x50, 0xc0, 0x30, 0xa0, 0x10, 0x80,
+ 0xf0, 0x60, 0xd0, 0x40, 0xb0, 0x20, 0x90, 0x00 },
+ { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x17, 0xbe, 0x45, 0xec, 0xb3, 0x1a, 0xe1, 0x48,
+ 0x5f, 0xf6, 0x0d, 0xa4, 0xfb, 0x52, 0xa9, 0x00 },
+ { 0x94, 0x46, 0x30, 0xe2, 0xc1, 0x13, 0x65, 0xb7,
+ 0x23, 0xf1, 0x87, 0x55, 0x76, 0xa4, 0xd2, 0x00 },
+ { 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00,
+ 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00 },
+ { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x06, 0xac, 0x52, 0xf8, 0xae, 0x04, 0xfa, 0x50,
+ 0x56, 0xfc, 0x02, 0xa8, 0xfe, 0x54, 0xaa, 0x00 },
+ { 0x94, 0x46, 0x30, 0xe2, 0xc1, 0x13, 0x65, 0xb7,
+ 0x23, 0xf1, 0x87, 0x55, 0x76, 0xa4, 0xd2, 0x00 },
+ { 0x90, 0x20, 0xf0, 0x40, 0x50, 0xe0, 0x30, 0x80,
+ 0x10, 0xa0, 0x70, 0xc0, 0xd0, 0x60, 0xb0, 0x00 },
+ { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x09, 0xa2, 0x5f, 0xf4, 0xa5, 0x0e, 0xf3, 0x58,
+ 0x51, 0xfa, 0x07, 0xac, 0xfd, 0x56, 0xab, 0x00 },
+ { 0xae, 0x7c, 0x0a, 0xd8, 0xe6, 0x34, 0x42, 0x90,
+ 0x3e, 0xec, 0x9a, 0x48, 0x76, 0xa4, 0xd2, 0x00 },
+ { 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00,
+ 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00 },
+ { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x24, 0x88, 0x7c, 0xd0, 0x94, 0x38, 0xcc, 0x60,
+ 0x44, 0xe8, 0x1c, 0xb0, 0xf4, 0x58, 0xac, 0x00 },
+ { 0xae, 0x7c, 0x0a, 0xd8, 0xe6, 0x34, 0x42, 0x90,
+ 0x3e, 0xec, 0x9a, 0x48, 0x76, 0xa4, 0xd2, 0x00 },
+ { 0xb0, 0x60, 0x10, 0xc0, 0xf0, 0x20, 0x50, 0x80,
+ 0x30, 0xe0, 0x90, 0x40, 0x70, 0xa0, 0xd0, 0x00 },
+ { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x2b, 0x86, 0x71, 0xdc, 0x9f, 0x32, 0xc5, 0x68,
+ 0x43, 0xee, 0x19, 0xb4, 0xf7, 0x5a, 0xad, 0x00 },
+ { 0xb3, 0x61, 0x17, 0xc5, 0xfb, 0x29, 0x5f, 0x8d,
+ 0x3e, 0xec, 0x9a, 0x48, 0x76, 0xa4, 0xd2, 0x00 },
+ { 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00,
+ 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00 },
+ { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x3a, 0x94, 0x66, 0xc8, 0x82, 0x2c, 0xde, 0x70,
+ 0x4a, 0xe4, 0x16, 0xb8, 0xf2, 0x5c, 0xae, 0x00 },
+ { 0xb3, 0x61, 0x17, 0xc5, 0xfb, 0x29, 0x5f, 0x8d,
+ 0x3e, 0xec, 0x9a, 0x48, 0x76, 0xa4, 0xd2, 0x00 },
+ { 0x50, 0xa0, 0xb0, 0x40, 0x90, 0x60, 0x70, 0x80,
+ 0xd0, 0x20, 0x30, 0xc0, 0x10, 0xe0, 0xf0, 0x00 },
+ { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x35, 0x9a, 0x6b, 0xc4, 0x89, 0x26, 0xd7, 0x78,
+ 0x4d, 0xe2, 0x13, 0xbc, 0xf1, 0x5e, 0xaf, 0x00 },
+ { 0x61, 0xae, 0xe2, 0x2d, 0x7a, 0xb5, 0xf9, 0x36,
+ 0x57, 0x98, 0xd4, 0x1b, 0x4c, 0x83, 0xcf, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x90, 0x20, 0xf0, 0x40, 0x50, 0xe0, 0x30, 0x80,
+ 0x10, 0xa0, 0x70, 0xc0, 0xd0, 0x60, 0xb0, 0x00 },
+ { 0x61, 0xae, 0xe2, 0x2d, 0x7a, 0xb5, 0xf9, 0x36,
+ 0x57, 0x98, 0xd4, 0x1b, 0x4c, 0x83, 0xcf, 0x00 },
+ { 0xf0, 0xe0, 0xd0, 0xc0, 0xb0, 0xa0, 0x90, 0x80,
+ 0x70, 0x60, 0x50, 0x40, 0x30, 0x20, 0x10, 0x00 },
+ { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x9f, 0x2e, 0xfd, 0x4c, 0x5b, 0xea, 0x39, 0x88,
+ 0x17, 0xa6, 0x75, 0xc4, 0xd3, 0x62, 0xb1, 0x00 },
+ { 0x7c, 0xb3, 0xff, 0x30, 0x67, 0xa8, 0xe4, 0x2b,
+ 0x57, 0x98, 0xd4, 0x1b, 0x4c, 0x83, 0xcf, 0x00 },
+ { 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00,
+ 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00 },
+ { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x8e, 0x3c, 0xea, 0x58, 0x46, 0xf4, 0x22, 0x90,
+ 0x1e, 0xac, 0x7a, 0xc8, 0xd6, 0x64, 0xb2, 0x00 },
+ { 0x7c, 0xb3, 0xff, 0x30, 0x67, 0xa8, 0xe4, 0x2b,
+ 0x57, 0x98, 0xd4, 0x1b, 0x4c, 0x83, 0xcf, 0x00 },
+ { 0x10, 0x20, 0x70, 0x40, 0xd0, 0xe0, 0xb0, 0x80,
+ 0x90, 0xa0, 0xf0, 0xc0, 0x50, 0x60, 0x30, 0x00 },
+ { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x81, 0x32, 0xe7, 0x54, 0x4d, 0xfe, 0x2b, 0x98,
+ 0x19, 0xaa, 0x7f, 0xcc, 0xd5, 0x66, 0xb3, 0x00 },
+ { 0x46, 0x89, 0xc5, 0x0a, 0x40, 0x8f, 0xc3, 0x0c,
+ 0x4a, 0x85, 0xc9, 0x06, 0x4c, 0x83, 0xcf, 0x00 },
+ { 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00,
+ 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00 },
+ { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0xac, 0x18, 0xc4, 0x70, 0x7c, 0xc8, 0x14, 0xa0,
+ 0x0c, 0xb8, 0x64, 0xd0, 0xdc, 0x68, 0xb4, 0x00 },
+ { 0x46, 0x89, 0xc5, 0x0a, 0x40, 0x8f, 0xc3, 0x0c,
+ 0x4a, 0x85, 0xc9, 0x06, 0x4c, 0x83, 0xcf, 0x00 },
+ { 0x30, 0x60, 0x90, 0xc0, 0x70, 0x20, 0xd0, 0x80,
+ 0xb0, 0xe0, 0x10, 0x40, 0xf0, 0xa0, 0x50, 0x00 },
+ { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0xa3, 0x16, 0xc9, 0x7c, 0x77, 0xc2, 0x1d, 0xa8,
+ 0x0b, 0xbe, 0x61, 0xd4, 0xdf, 0x6a, 0xb5, 0x00 },
+ { 0x5b, 0x94, 0xd8, 0x17, 0x5d, 0x92, 0xde, 0x11,
+ 0x4a, 0x85, 0xc9, 0x06, 0x4c, 0x83, 0xcf, 0x00 },
+ { 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00,
+ 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00 },
+ { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0xb2, 0x04, 0xde, 0x68, 0x6a, 0xdc, 0x06, 0xb0,
+ 0x02, 0xb4, 0x6e, 0xd8, 0xda, 0x6c, 0xb6, 0x00 },
+ { 0x5b, 0x94, 0xd8, 0x17, 0x5d, 0x92, 0xde, 0x11,
+ 0x4a, 0x85, 0xc9, 0x06, 0x4c, 0x83, 0xcf, 0x00 },
+ { 0xd0, 0xa0, 0x30, 0x40, 0x10, 0x60, 0xf0, 0x80,
+ 0x50, 0x20, 0xb0, 0xc0, 0x90, 0xe0, 0x70, 0x00 },
+ { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0xbd, 0x0a, 0xd3, 0x64, 0x61, 0xd6, 0x0f, 0xb8,
+ 0x05, 0xb2, 0x6b, 0xdc, 0xd9, 0x6e, 0xb7, 0x00 },
+ { 0x32, 0xfd, 0xac, 0x63, 0x13, 0xdc, 0x8d, 0x42,
+ 0x70, 0xbf, 0xee, 0x21, 0x51, 0x9e, 0xcf, 0x00 },
+ { 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00,
+ 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00 },
+ { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0xe8, 0x50, 0x98, 0x20, 0x08, 0xb0, 0x78, 0xc0,
+ 0x28, 0x90, 0x58, 0xe0, 0xc8, 0x70, 0xb8, 0x00 },
+ { 0x32, 0xfd, 0xac, 0x63, 0x13, 0xdc, 0x8d, 0x42,
+ 0x70, 0xbf, 0xee, 0x21, 0x51, 0x9e, 0xcf, 0x00 },
+ { 0x70, 0xe0, 0x50, 0xc0, 0x30, 0xa0, 0x10, 0x80,
+ 0xf0, 0x60, 0xd0, 0x40, 0xb0, 0x20, 0x90, 0x00 },
+ { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0xe7, 0x5e, 0x95, 0x2c, 0x03, 0xba, 0x71, 0xc8,
+ 0x2f, 0x96, 0x5d, 0xe4, 0xcb, 0x72, 0xb9, 0x00 },
+ { 0x2f, 0xe0, 0xb1, 0x7e, 0x0e, 0xc1, 0x90, 0x5f,
+ 0x70, 0xbf, 0xee, 0x21, 0x51, 0x9e, 0xcf, 0x00 },
+ { 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00,
+ 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00 },
+ { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0xf6, 0x4c, 0x82, 0x38, 0x1e, 0xa4, 0x6a, 0xd0,
+ 0x26, 0x9c, 0x52, 0xe8, 0xce, 0x74, 0xba, 0x00 },
+ { 0x2f, 0xe0, 0xb1, 0x7e, 0x0e, 0xc1, 0x90, 0x5f,
+ 0x70, 0xbf, 0xee, 0x21, 0x51, 0x9e, 0xcf, 0x00 },
+ { 0x90, 0x20, 0xf0, 0x40, 0x50, 0xe0, 0x30, 0x80,
+ 0x10, 0xa0, 0x70, 0xc0, 0xd0, 0x60, 0xb0, 0x00 },
+ { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0xf9, 0x42, 0x8f, 0x34, 0x15, 0xae, 0x63, 0xd8,
+ 0x21, 0x9a, 0x57, 0xec, 0xcd, 0x76, 0xbb, 0x00 },
+ { 0x15, 0xda, 0x8b, 0x44, 0x29, 0xe6, 0xb7, 0x78,
+ 0x6d, 0xa2, 0xf3, 0x3c, 0x51, 0x9e, 0xcf, 0x00 },
+ { 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00,
+ 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00 },
+ { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0xd4, 0x68, 0xac, 0x10, 0x24, 0x98, 0x5c, 0xe0,
+ 0x34, 0x88, 0x4c, 0xf0, 0xc4, 0x78, 0xbc, 0x00 },
+ { 0x15, 0xda, 0x8b, 0x44, 0x29, 0xe6, 0xb7, 0x78,
+ 0x6d, 0xa2, 0xf3, 0x3c, 0x51, 0x9e, 0xcf, 0x00 },
+ { 0xb0, 0x60, 0x10, 0xc0, 0xf0, 0x20, 0x50, 0x80,
+ 0x30, 0xe0, 0x90, 0x40, 0x70, 0xa0, 0xd0, 0x00 },
+ { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0xdb, 0x66, 0xa1, 0x1c, 0x2f, 0x92, 0x55, 0xe8,
+ 0x33, 0x8e, 0x49, 0xf4, 0xc7, 0x7a, 0xbd, 0x00 },
+ { 0x08, 0xc7, 0x96, 0x59, 0x34, 0xfb, 0xaa, 0x65,
+ 0x6d, 0xa2, 0xf3, 0x3c, 0x51, 0x9e, 0xcf, 0x00 },
+ { 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00,
+ 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00 },
+ { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0xca, 0x74, 0xb6, 0x08, 0x32, 0x8c, 0x4e, 0xf0,
+ 0x3a, 0x84, 0x46, 0xf8, 0xc2, 0x7c, 0xbe, 0x00 },
+ { 0x08, 0xc7, 0x96, 0x59, 0x34, 0xfb, 0xaa, 0x65,
+ 0x6d, 0xa2, 0xf3, 0x3c, 0x51, 0x9e, 0xcf, 0x00 },
+ { 0x50, 0xa0, 0xb0, 0x40, 0x90, 0x60, 0x70, 0x80,
+ 0xd0, 0x20, 0x30, 0xc0, 0x10, 0xe0, 0xf0, 0x00 },
+ { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69,
+ 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0xc5, 0x7a, 0xbb, 0x04, 0x39, 0x86, 0x47, 0xf8,
+ 0x3d, 0x82, 0x43, 0xfc, 0xc1, 0x7e, 0xbf, 0x00 },
+ { 0x67, 0xfb, 0x42, 0xde, 0x2d, 0xb1, 0x08, 0x94,
+ 0xf3, 0x6f, 0xd6, 0x4a, 0xb9, 0x25, 0x9c, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00,
+ 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00 },
+ { 0x67, 0xfb, 0x42, 0xde, 0x2d, 0xb1, 0x08, 0x94,
+ 0xf3, 0x6f, 0xd6, 0x4a, 0xb9, 0x25, 0x9c, 0x00 },
+ { 0xf0, 0xe0, 0xd0, 0xc0, 0xb0, 0xa0, 0x90, 0x80,
+ 0x70, 0x60, 0x50, 0x40, 0x30, 0x20, 0x10, 0x00 },
+ { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x4f, 0x8e, 0xcd, 0x0c, 0x4b, 0x8a, 0xc9, 0x08,
+ 0x47, 0x86, 0xc5, 0x04, 0x43, 0x82, 0xc1, 0x00 },
+ { 0x7a, 0xe6, 0x5f, 0xc3, 0x30, 0xac, 0x15, 0x89,
+ 0xf3, 0x6f, 0xd6, 0x4a, 0xb9, 0x25, 0x9c, 0x00 },
+ { 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00,
+ 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00 },
+ { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x5e, 0x9c, 0xda, 0x18, 0x56, 0x94, 0xd2, 0x10,
+ 0x4e, 0x8c, 0xca, 0x08, 0x46, 0x84, 0xc2, 0x00 },
+ { 0x7a, 0xe6, 0x5f, 0xc3, 0x30, 0xac, 0x15, 0x89,
+ 0xf3, 0x6f, 0xd6, 0x4a, 0xb9, 0x25, 0x9c, 0x00 },
+ { 0x10, 0x20, 0x70, 0x40, 0xd0, 0xe0, 0xb0, 0x80,
+ 0x90, 0xa0, 0xf0, 0xc0, 0x50, 0x60, 0x30, 0x00 },
+ { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x51, 0x92, 0xd7, 0x14, 0x5d, 0x9e, 0xdb, 0x18,
+ 0x49, 0x8a, 0xcf, 0x0c, 0x45, 0x86, 0xc3, 0x00 },
+ { 0x40, 0xdc, 0x65, 0xf9, 0x17, 0x8b, 0x32, 0xae,
+ 0xee, 0x72, 0xcb, 0x57, 0xb9, 0x25, 0x9c, 0x00 },
+ { 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00,
+ 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00 },
+ { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x7c, 0xb8, 0xf4, 0x30, 0x6c, 0xa8, 0xe4, 0x20,
+ 0x5c, 0x98, 0xd4, 0x10, 0x4c, 0x88, 0xc4, 0x00 },
+ { 0x40, 0xdc, 0x65, 0xf9, 0x17, 0x8b, 0x32, 0xae,
+ 0xee, 0x72, 0xcb, 0x57, 0xb9, 0x25, 0x9c, 0x00 },
+ { 0x30, 0x60, 0x90, 0xc0, 0x70, 0x20, 0xd0, 0x80,
+ 0xb0, 0xe0, 0x10, 0x40, 0xf0, 0xa0, 0x50, 0x00 },
+ { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x73, 0xb6, 0xf9, 0x3c, 0x67, 0xa2, 0xed, 0x28,
+ 0x5b, 0x9e, 0xd1, 0x14, 0x4f, 0x8a, 0xc5, 0x00 },
+ { 0x5d, 0xc1, 0x78, 0xe4, 0x0a, 0x96, 0x2f, 0xb3,
+ 0xee, 0x72, 0xcb, 0x57, 0xb9, 0x25, 0x9c, 0x00 },
+ { 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00,
+ 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00 },
+ { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x62, 0xa4, 0xee, 0x28, 0x7a, 0xbc, 0xf6, 0x30,
+ 0x52, 0x94, 0xde, 0x18, 0x4a, 0x8c, 0xc6, 0x00 },
+ { 0x5d, 0xc1, 0x78, 0xe4, 0x0a, 0x96, 0x2f, 0xb3,
+ 0xee, 0x72, 0xcb, 0x57, 0xb9, 0x25, 0x9c, 0x00 },
+ { 0xd0, 0xa0, 0x30, 0x40, 0x10, 0x60, 0xf0, 0x80,
+ 0x50, 0x20, 0xb0, 0xc0, 0x90, 0xe0, 0x70, 0x00 },
+ { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x6d, 0xaa, 0xe3, 0x24, 0x71, 0xb6, 0xff, 0x38,
+ 0x55, 0x92, 0xdb, 0x1c, 0x49, 0x8e, 0xc7, 0x00 },
+ { 0x34, 0xa8, 0x0c, 0x90, 0x44, 0xd8, 0x7c, 0xe0,
+ 0xd4, 0x48, 0xec, 0x70, 0xa4, 0x38, 0x9c, 0x00 },
+ { 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00,
+ 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00 },
+ { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x38, 0xf0, 0xa8, 0x60, 0x18, 0xd0, 0x88, 0x40,
+ 0x78, 0xb0, 0xe8, 0x20, 0x58, 0x90, 0xc8, 0x00 },
+ { 0x34, 0xa8, 0x0c, 0x90, 0x44, 0xd8, 0x7c, 0xe0,
+ 0xd4, 0x48, 0xec, 0x70, 0xa4, 0x38, 0x9c, 0x00 },
+ { 0x70, 0xe0, 0x50, 0xc0, 0x30, 0xa0, 0x10, 0x80,
+ 0xf0, 0x60, 0xd0, 0x40, 0xb0, 0x20, 0x90, 0x00 },
+ { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x37, 0xfe, 0xa5, 0x6c, 0x13, 0xda, 0x81, 0x48,
+ 0x7f, 0xb6, 0xed, 0x24, 0x5b, 0x92, 0xc9, 0x00 },
+ { 0x29, 0xb5, 0x11, 0x8d, 0x59, 0xc5, 0x61, 0xfd,
+ 0xd4, 0x48, 0xec, 0x70, 0xa4, 0x38, 0x9c, 0x00 },
+ { 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00,
+ 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00 },
+ { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x26, 0xec, 0xb2, 0x78, 0x0e, 0xc4, 0x9a, 0x50,
+ 0x76, 0xbc, 0xe2, 0x28, 0x5e, 0x94, 0xca, 0x00 },
+ { 0x29, 0xb5, 0x11, 0x8d, 0x59, 0xc5, 0x61, 0xfd,
+ 0xd4, 0x48, 0xec, 0x70, 0xa4, 0x38, 0x9c, 0x00 },
+ { 0x90, 0x20, 0xf0, 0x40, 0x50, 0xe0, 0x30, 0x80,
+ 0x10, 0xa0, 0x70, 0xc0, 0xd0, 0x60, 0xb0, 0x00 },
+ { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x29, 0xe2, 0xbf, 0x74, 0x05, 0xce, 0x93, 0x58,
+ 0x71, 0xba, 0xe7, 0x2c, 0x5d, 0x96, 0xcb, 0x00 },
+ { 0x13, 0x8f, 0x2b, 0xb7, 0x7e, 0xe2, 0x46, 0xda,
+ 0xc9, 0x55, 0xf1, 0x6d, 0xa4, 0x38, 0x9c, 0x00 },
+ { 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00,
+ 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00 },
+ { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x04, 0xc8, 0x9c, 0x50, 0x34, 0xf8, 0xac, 0x60,
+ 0x64, 0xa8, 0xfc, 0x30, 0x54, 0x98, 0xcc, 0x00 },
+ { 0x13, 0x8f, 0x2b, 0xb7, 0x7e, 0xe2, 0x46, 0xda,
+ 0xc9, 0x55, 0xf1, 0x6d, 0xa4, 0x38, 0x9c, 0x00 },
+ { 0xb0, 0x60, 0x10, 0xc0, 0xf0, 0x20, 0x50, 0x80,
+ 0x30, 0xe0, 0x90, 0x40, 0x70, 0xa0, 0xd0, 0x00 },
+ { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x0b, 0xc6, 0x91, 0x5c, 0x3f, 0xf2, 0xa5, 0x68,
+ 0x63, 0xae, 0xf9, 0x34, 0x57, 0x9a, 0xcd, 0x00 },
+ { 0x0e, 0x92, 0x36, 0xaa, 0x63, 0xff, 0x5b, 0xc7,
+ 0xc9, 0x55, 0xf1, 0x6d, 0xa4, 0x38, 0x9c, 0x00 },
+ { 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00,
+ 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00 },
+ { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x1a, 0xd4, 0x86, 0x48, 0x22, 0xec, 0xbe, 0x70,
+ 0x6a, 0xa4, 0xf6, 0x38, 0x52, 0x9c, 0xce, 0x00 },
+ { 0x0e, 0x92, 0x36, 0xaa, 0x63, 0xff, 0x5b, 0xc7,
+ 0xc9, 0x55, 0xf1, 0x6d, 0xa4, 0x38, 0x9c, 0x00 },
+ { 0x50, 0xa0, 0xb0, 0x40, 0x90, 0x60, 0x70, 0x80,
+ 0xd0, 0x20, 0x30, 0xc0, 0x10, 0xe0, 0xf0, 0x00 },
+ { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x15, 0xda, 0x8b, 0x44, 0x29, 0xe6, 0xb7, 0x78,
+ 0x6d, 0xa2, 0xf3, 0x3c, 0x51, 0x9e, 0xcf, 0x00 },
+ { 0xdc, 0x5d, 0xc3, 0x42, 0xe2, 0x63, 0xfd, 0x7c,
+ 0xa0, 0x21, 0xbf, 0x3e, 0x9e, 0x1f, 0x81, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0xb0, 0x60, 0x10, 0xc0, 0xf0, 0x20, 0x50, 0x80,
+ 0x30, 0xe0, 0x90, 0x40, 0x70, 0xa0, 0xd0, 0x00 },
+ { 0xdc, 0x5d, 0xc3, 0x42, 0xe2, 0x63, 0xfd, 0x7c,
+ 0xa0, 0x21, 0xbf, 0x3e, 0x9e, 0x1f, 0x81, 0x00 },
+ { 0xf0, 0xe0, 0xd0, 0xc0, 0xb0, 0xa0, 0x90, 0x80,
+ 0x70, 0x60, 0x50, 0x40, 0x30, 0x20, 0x10, 0x00 },
+ { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0xbf, 0x6e, 0x1d, 0xcc, 0xfb, 0x2a, 0x59, 0x88,
+ 0x37, 0xe6, 0x95, 0x44, 0x73, 0xa2, 0xd1, 0x00 },
+ { 0xc1, 0x40, 0xde, 0x5f, 0xff, 0x7e, 0xe0, 0x61,
+ 0xa0, 0x21, 0xbf, 0x3e, 0x9e, 0x1f, 0x81, 0x00 },
+ { 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00,
+ 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00 },
+ { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0xae, 0x7c, 0x0a, 0xd8, 0xe6, 0x34, 0x42, 0x90,
+ 0x3e, 0xec, 0x9a, 0x48, 0x76, 0xa4, 0xd2, 0x00 },
+ { 0xc1, 0x40, 0xde, 0x5f, 0xff, 0x7e, 0xe0, 0x61,
+ 0xa0, 0x21, 0xbf, 0x3e, 0x9e, 0x1f, 0x81, 0x00 },
+ { 0x10, 0x20, 0x70, 0x40, 0xd0, 0xe0, 0xb0, 0x80,
+ 0x90, 0xa0, 0xf0, 0xc0, 0x50, 0x60, 0x30, 0x00 },
+ { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0xa1, 0x72, 0x07, 0xd4, 0xed, 0x3e, 0x4b, 0x98,
+ 0x39, 0xea, 0x9f, 0x4c, 0x75, 0xa6, 0xd3, 0x00 },
+ { 0xfb, 0x7a, 0xe4, 0x65, 0xd8, 0x59, 0xc7, 0x46,
+ 0xbd, 0x3c, 0xa2, 0x23, 0x9e, 0x1f, 0x81, 0x00 },
+ { 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00,
+ 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00 },
+ { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x8c, 0x58, 0x24, 0xf0, 0xdc, 0x08, 0x74, 0xa0,
+ 0x2c, 0xf8, 0x84, 0x50, 0x7c, 0xa8, 0xd4, 0x00 },
+ { 0xfb, 0x7a, 0xe4, 0x65, 0xd8, 0x59, 0xc7, 0x46,
+ 0xbd, 0x3c, 0xa2, 0x23, 0x9e, 0x1f, 0x81, 0x00 },
+ { 0x30, 0x60, 0x90, 0xc0, 0x70, 0x20, 0xd0, 0x80,
+ 0xb0, 0xe0, 0x10, 0x40, 0xf0, 0xa0, 0x50, 0x00 },
+ { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x83, 0x56, 0x29, 0xfc, 0xd7, 0x02, 0x7d, 0xa8,
+ 0x2b, 0xfe, 0x81, 0x54, 0x7f, 0xaa, 0xd5, 0x00 },
+ { 0xe6, 0x67, 0xf9, 0x78, 0xc5, 0x44, 0xda, 0x5b,
+ 0xbd, 0x3c, 0xa2, 0x23, 0x9e, 0x1f, 0x81, 0x00 },
+ { 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00,
+ 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00 },
+ { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x92, 0x44, 0x3e, 0xe8, 0xca, 0x1c, 0x66, 0xb0,
+ 0x22, 0xf4, 0x8e, 0x58, 0x7a, 0xac, 0xd6, 0x00 },
+ { 0xe6, 0x67, 0xf9, 0x78, 0xc5, 0x44, 0xda, 0x5b,
+ 0xbd, 0x3c, 0xa2, 0x23, 0x9e, 0x1f, 0x81, 0x00 },
+ { 0xd0, 0xa0, 0x30, 0x40, 0x10, 0x60, 0xf0, 0x80,
+ 0x50, 0x20, 0xb0, 0xc0, 0x90, 0xe0, 0x70, 0x00 },
+ { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x9d, 0x4a, 0x33, 0xe4, 0xc1, 0x16, 0x6f, 0xb8,
+ 0x25, 0xf2, 0x8b, 0x5c, 0x79, 0xae, 0xd7, 0x00 },
+ { 0x8f, 0x0e, 0x8d, 0x0c, 0x8b, 0x0a, 0x89, 0x08,
+ 0x87, 0x06, 0x85, 0x04, 0x83, 0x02, 0x81, 0x00 },
+ { 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00,
+ 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00 },
+ { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0xc8, 0x10, 0x78, 0xa0, 0xa8, 0x70, 0x18, 0xc0,
+ 0x08, 0xd0, 0xb8, 0x60, 0x68, 0xb0, 0xd8, 0x00 },
+ { 0x8f, 0x0e, 0x8d, 0x0c, 0x8b, 0x0a, 0x89, 0x08,
+ 0x87, 0x06, 0x85, 0x04, 0x83, 0x02, 0x81, 0x00 },
+ { 0x70, 0xe0, 0x50, 0xc0, 0x30, 0xa0, 0x10, 0x80,
+ 0xf0, 0x60, 0xd0, 0x40, 0xb0, 0x20, 0x90, 0x00 },
+ { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0xc7, 0x1e, 0x75, 0xac, 0xa3, 0x7a, 0x11, 0xc8,
+ 0x0f, 0xd6, 0xbd, 0x64, 0x6b, 0xb2, 0xd9, 0x00 },
+ { 0x92, 0x13, 0x90, 0x11, 0x96, 0x17, 0x94, 0x15,
+ 0x87, 0x06, 0x85, 0x04, 0x83, 0x02, 0x81, 0x00 },
+ { 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00,
+ 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00 },
+ { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0xd6, 0x0c, 0x62, 0xb8, 0xbe, 0x64, 0x0a, 0xd0,
+ 0x06, 0xdc, 0xb2, 0x68, 0x6e, 0xb4, 0xda, 0x00 },
+ { 0x92, 0x13, 0x90, 0x11, 0x96, 0x17, 0x94, 0x15,
+ 0x87, 0x06, 0x85, 0x04, 0x83, 0x02, 0x81, 0x00 },
+ { 0x90, 0x20, 0xf0, 0x40, 0x50, 0xe0, 0x30, 0x80,
+ 0x10, 0xa0, 0x70, 0xc0, 0xd0, 0x60, 0xb0, 0x00 },
+ { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0xd9, 0x02, 0x6f, 0xb4, 0xb5, 0x6e, 0x03, 0xd8,
+ 0x01, 0xda, 0xb7, 0x6c, 0x6d, 0xb6, 0xdb, 0x00 },
+ { 0xa8, 0x29, 0xaa, 0x2b, 0xb1, 0x30, 0xb3, 0x32,
+ 0x9a, 0x1b, 0x98, 0x19, 0x83, 0x02, 0x81, 0x00 },
+ { 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00,
+ 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00 },
+ { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0xf4, 0x28, 0x4c, 0x90, 0x84, 0x58, 0x3c, 0xe0,
+ 0x14, 0xc8, 0xac, 0x70, 0x64, 0xb8, 0xdc, 0x00 },
+ { 0xa8, 0x29, 0xaa, 0x2b, 0xb1, 0x30, 0xb3, 0x32,
+ 0x9a, 0x1b, 0x98, 0x19, 0x83, 0x02, 0x81, 0x00 },
+ { 0xb0, 0x60, 0x10, 0xc0, 0xf0, 0x20, 0x50, 0x80,
+ 0x30, 0xe0, 0x90, 0x40, 0x70, 0xa0, 0xd0, 0x00 },
+ { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0xfb, 0x26, 0x41, 0x9c, 0x8f, 0x52, 0x35, 0xe8,
+ 0x13, 0xce, 0xa9, 0x74, 0x67, 0xba, 0xdd, 0x00 },
+ { 0xb5, 0x34, 0xb7, 0x36, 0xac, 0x2d, 0xae, 0x2f,
+ 0x9a, 0x1b, 0x98, 0x19, 0x83, 0x02, 0x81, 0x00 },
+ { 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00,
+ 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00 },
+ { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0xea, 0x34, 0x56, 0x88, 0x92, 0x4c, 0x2e, 0xf0,
+ 0x1a, 0xc4, 0xa6, 0x78, 0x62, 0xbc, 0xde, 0x00 },
+ { 0xb5, 0x34, 0xb7, 0x36, 0xac, 0x2d, 0xae, 0x2f,
+ 0x9a, 0x1b, 0x98, 0x19, 0x83, 0x02, 0x81, 0x00 },
+ { 0x50, 0xa0, 0xb0, 0x40, 0x90, 0x60, 0x70, 0x80,
+ 0xd0, 0x20, 0x30, 0xc0, 0x10, 0xe0, 0xf0, 0x00 },
+ { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0xe5, 0x3a, 0x5b, 0x84, 0x99, 0x46, 0x27, 0xf8,
+ 0x1d, 0xc2, 0xa3, 0x7c, 0x61, 0xbe, 0xdf, 0x00 },
+ { 0x0c, 0xaa, 0x5d, 0xfb, 0xae, 0x08, 0xff, 0x59,
+ 0x55, 0xf3, 0x04, 0xa2, 0xf7, 0x51, 0xa6, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00,
+ 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00 },
+ { 0x0c, 0xaa, 0x5d, 0xfb, 0xae, 0x08, 0xff, 0x59,
+ 0x55, 0xf3, 0x04, 0xa2, 0xf7, 0x51, 0xa6, 0x00 },
+ { 0xf0, 0xe0, 0xd0, 0xc0, 0xb0, 0xa0, 0x90, 0x80,
+ 0x70, 0x60, 0x50, 0x40, 0x30, 0x20, 0x10, 0x00 },
+ { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0xaf, 0x4e, 0x6d, 0x8c, 0x2b, 0xca, 0xe9, 0x08,
+ 0xa7, 0x46, 0x65, 0x84, 0x23, 0xc2, 0xe1, 0x00 },
+ { 0x11, 0xb7, 0x40, 0xe6, 0xb3, 0x15, 0xe2, 0x44,
+ 0x55, 0xf3, 0x04, 0xa2, 0xf7, 0x51, 0xa6, 0x00 },
+ { 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00,
+ 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00 },
+ { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0xbe, 0x5c, 0x7a, 0x98, 0x36, 0xd4, 0xf2, 0x10,
+ 0xae, 0x4c, 0x6a, 0x88, 0x26, 0xc4, 0xe2, 0x00 },
+ { 0x11, 0xb7, 0x40, 0xe6, 0xb3, 0x15, 0xe2, 0x44,
+ 0x55, 0xf3, 0x04, 0xa2, 0xf7, 0x51, 0xa6, 0x00 },
+ { 0x10, 0x20, 0x70, 0x40, 0xd0, 0xe0, 0xb0, 0x80,
+ 0x90, 0xa0, 0xf0, 0xc0, 0x50, 0x60, 0x30, 0x00 },
+ { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0xb1, 0x52, 0x77, 0x94, 0x3d, 0xde, 0xfb, 0x18,
+ 0xa9, 0x4a, 0x6f, 0x8c, 0x25, 0xc6, 0xe3, 0x00 },
+ { 0x2b, 0x8d, 0x7a, 0xdc, 0x94, 0x32, 0xc5, 0x63,
+ 0x48, 0xee, 0x19, 0xbf, 0xf7, 0x51, 0xa6, 0x00 },
+ { 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00,
+ 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00 },
+ { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x9c, 0x78, 0x54, 0xb0, 0x0c, 0xe8, 0xc4, 0x20,
+ 0xbc, 0x58, 0x74, 0x90, 0x2c, 0xc8, 0xe4, 0x00 },
+ { 0x2b, 0x8d, 0x7a, 0xdc, 0x94, 0x32, 0xc5, 0x63,
+ 0x48, 0xee, 0x19, 0xbf, 0xf7, 0x51, 0xa6, 0x00 },
+ { 0x30, 0x60, 0x90, 0xc0, 0x70, 0x20, 0xd0, 0x80,
+ 0xb0, 0xe0, 0x10, 0x40, 0xf0, 0xa0, 0x50, 0x00 },
+ { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x93, 0x76, 0x59, 0xbc, 0x07, 0xe2, 0xcd, 0x28,
+ 0xbb, 0x5e, 0x71, 0x94, 0x2f, 0xca, 0xe5, 0x00 },
+ { 0x36, 0x90, 0x67, 0xc1, 0x89, 0x2f, 0xd8, 0x7e,
+ 0x48, 0xee, 0x19, 0xbf, 0xf7, 0x51, 0xa6, 0x00 },
+ { 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00,
+ 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00 },
+ { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x82, 0x64, 0x4e, 0xa8, 0x1a, 0xfc, 0xd6, 0x30,
+ 0xb2, 0x54, 0x7e, 0x98, 0x2a, 0xcc, 0xe6, 0x00 },
+ { 0x36, 0x90, 0x67, 0xc1, 0x89, 0x2f, 0xd8, 0x7e,
+ 0x48, 0xee, 0x19, 0xbf, 0xf7, 0x51, 0xa6, 0x00 },
+ { 0xd0, 0xa0, 0x30, 0x40, 0x10, 0x60, 0xf0, 0x80,
+ 0x50, 0x20, 0xb0, 0xc0, 0x90, 0xe0, 0x70, 0x00 },
+ { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x8d, 0x6a, 0x43, 0xa4, 0x11, 0xf6, 0xdf, 0x38,
+ 0xb5, 0x52, 0x7b, 0x9c, 0x29, 0xce, 0xe7, 0x00 },
+ { 0x5f, 0xf9, 0x13, 0xb5, 0xc7, 0x61, 0x8b, 0x2d,
+ 0x72, 0xd4, 0x3e, 0x98, 0xea, 0x4c, 0xa6, 0x00 },
+ { 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00,
+ 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00 },
+ { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0xd8, 0x30, 0x08, 0xe0, 0x78, 0x90, 0xa8, 0x40,
+ 0x98, 0x70, 0x48, 0xa0, 0x38, 0xd0, 0xe8, 0x00 },
+ { 0x5f, 0xf9, 0x13, 0xb5, 0xc7, 0x61, 0x8b, 0x2d,
+ 0x72, 0xd4, 0x3e, 0x98, 0xea, 0x4c, 0xa6, 0x00 },
+ { 0x70, 0xe0, 0x50, 0xc0, 0x30, 0xa0, 0x10, 0x80,
+ 0xf0, 0x60, 0xd0, 0x40, 0xb0, 0x20, 0x90, 0x00 },
+ { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0xd7, 0x3e, 0x05, 0xec, 0x73, 0x9a, 0xa1, 0x48,
+ 0x9f, 0x76, 0x4d, 0xa4, 0x3b, 0xd2, 0xe9, 0x00 },
+ { 0x42, 0xe4, 0x0e, 0xa8, 0xda, 0x7c, 0x96, 0x30,
+ 0x72, 0xd4, 0x3e, 0x98, 0xea, 0x4c, 0xa6, 0x00 },
+ { 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00,
+ 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00 },
+ { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0xc6, 0x2c, 0x12, 0xf8, 0x6e, 0x84, 0xba, 0x50,
+ 0x96, 0x7c, 0x42, 0xa8, 0x3e, 0xd4, 0xea, 0x00 },
+ { 0x42, 0xe4, 0x0e, 0xa8, 0xda, 0x7c, 0x96, 0x30,
+ 0x72, 0xd4, 0x3e, 0x98, 0xea, 0x4c, 0xa6, 0x00 },
+ { 0x90, 0x20, 0xf0, 0x40, 0x50, 0xe0, 0x30, 0x80,
+ 0x10, 0xa0, 0x70, 0xc0, 0xd0, 0x60, 0xb0, 0x00 },
+ { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0xc9, 0x22, 0x1f, 0xf4, 0x65, 0x8e, 0xb3, 0x58,
+ 0x91, 0x7a, 0x47, 0xac, 0x3d, 0xd6, 0xeb, 0x00 },
+ { 0x78, 0xde, 0x34, 0x92, 0xfd, 0x5b, 0xb1, 0x17,
+ 0x6f, 0xc9, 0x23, 0x85, 0xea, 0x4c, 0xa6, 0x00 },
+ { 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00,
+ 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00 },
+ { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0xe4, 0x08, 0x3c, 0xd0, 0x54, 0xb8, 0x8c, 0x60,
+ 0x84, 0x68, 0x5c, 0xb0, 0x34, 0xd8, 0xec, 0x00 },
+ { 0x78, 0xde, 0x34, 0x92, 0xfd, 0x5b, 0xb1, 0x17,
+ 0x6f, 0xc9, 0x23, 0x85, 0xea, 0x4c, 0xa6, 0x00 },
+ { 0xb0, 0x60, 0x10, 0xc0, 0xf0, 0x20, 0x50, 0x80,
+ 0x30, 0xe0, 0x90, 0x40, 0x70, 0xa0, 0xd0, 0x00 },
+ { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0xeb, 0x06, 0x31, 0xdc, 0x5f, 0xb2, 0x85, 0x68,
+ 0x83, 0x6e, 0x59, 0xb4, 0x37, 0xda, 0xed, 0x00 },
+ { 0x65, 0xc3, 0x29, 0x8f, 0xe0, 0x46, 0xac, 0x0a,
+ 0x6f, 0xc9, 0x23, 0x85, 0xea, 0x4c, 0xa6, 0x00 },
+ { 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00,
+ 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00 },
+ { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0xfa, 0x14, 0x26, 0xc8, 0x42, 0xac, 0x9e, 0x70,
+ 0x8a, 0x64, 0x56, 0xb8, 0x32, 0xdc, 0xee, 0x00 },
+ { 0x65, 0xc3, 0x29, 0x8f, 0xe0, 0x46, 0xac, 0x0a,
+ 0x6f, 0xc9, 0x23, 0x85, 0xea, 0x4c, 0xa6, 0x00 },
+ { 0x50, 0xa0, 0xb0, 0x40, 0x90, 0x60, 0x70, 0x80,
+ 0xd0, 0x20, 0x30, 0xc0, 0x10, 0xe0, 0xf0, 0x00 },
+ { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0xf5, 0x1a, 0x2b, 0xc4, 0x49, 0xa6, 0x97, 0x78,
+ 0x8d, 0x62, 0x53, 0xbc, 0x31, 0xde, 0xef, 0x00 },
+ { 0xb7, 0x0c, 0xdc, 0x67, 0x61, 0xda, 0x0a, 0xb1,
+ 0x06, 0xbd, 0x6d, 0xd6, 0xd0, 0x6b, 0xbb, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x50, 0xa0, 0xb0, 0x40, 0x90, 0x60, 0x70, 0x80,
+ 0xd0, 0x20, 0x30, 0xc0, 0x10, 0xe0, 0xf0, 0x00 },
+ { 0xb7, 0x0c, 0xdc, 0x67, 0x61, 0xda, 0x0a, 0xb1,
+ 0x06, 0xbd, 0x6d, 0xd6, 0xd0, 0x6b, 0xbb, 0x00 },
+ { 0xf0, 0xe0, 0xd0, 0xc0, 0xb0, 0xa0, 0x90, 0x80,
+ 0x70, 0x60, 0x50, 0x40, 0x30, 0x20, 0x10, 0x00 },
+ { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x5f, 0xae, 0xbd, 0x4c, 0x9b, 0x6a, 0x79, 0x88,
+ 0xd7, 0x26, 0x35, 0xc4, 0x13, 0xe2, 0xf1, 0x00 },
+ { 0xaa, 0x11, 0xc1, 0x7a, 0x7c, 0xc7, 0x17, 0xac,
+ 0x06, 0xbd, 0x6d, 0xd6, 0xd0, 0x6b, 0xbb, 0x00 },
+ { 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00,
+ 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00 },
+ { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x4e, 0xbc, 0xaa, 0x58, 0x86, 0x74, 0x62, 0x90,
+ 0xde, 0x2c, 0x3a, 0xc8, 0x16, 0xe4, 0xf2, 0x00 },
+ { 0xaa, 0x11, 0xc1, 0x7a, 0x7c, 0xc7, 0x17, 0xac,
+ 0x06, 0xbd, 0x6d, 0xd6, 0xd0, 0x6b, 0xbb, 0x00 },
+ { 0x10, 0x20, 0x70, 0x40, 0xd0, 0xe0, 0xb0, 0x80,
+ 0x90, 0xa0, 0xf0, 0xc0, 0x50, 0x60, 0x30, 0x00 },
+ { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x41, 0xb2, 0xa7, 0x54, 0x8d, 0x7e, 0x6b, 0x98,
+ 0xd9, 0x2a, 0x3f, 0xcc, 0x15, 0xe6, 0xf3, 0x00 },
+ { 0x90, 0x2b, 0xfb, 0x40, 0x5b, 0xe0, 0x30, 0x8b,
+ 0x1b, 0xa0, 0x70, 0xcb, 0xd0, 0x6b, 0xbb, 0x00 },
+ { 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00,
+ 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00 },
+ { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x6c, 0x98, 0x84, 0x70, 0xbc, 0x48, 0x54, 0xa0,
+ 0xcc, 0x38, 0x24, 0xd0, 0x1c, 0xe8, 0xf4, 0x00 },
+ { 0x90, 0x2b, 0xfb, 0x40, 0x5b, 0xe0, 0x30, 0x8b,
+ 0x1b, 0xa0, 0x70, 0xcb, 0xd0, 0x6b, 0xbb, 0x00 },
+ { 0x30, 0x60, 0x90, 0xc0, 0x70, 0x20, 0xd0, 0x80,
+ 0xb0, 0xe0, 0x10, 0x40, 0xf0, 0xa0, 0x50, 0x00 },
+ { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x63, 0x96, 0x89, 0x7c, 0xb7, 0x42, 0x5d, 0xa8,
+ 0xcb, 0x3e, 0x21, 0xd4, 0x1f, 0xea, 0xf5, 0x00 },
+ { 0x8d, 0x36, 0xe6, 0x5d, 0x46, 0xfd, 0x2d, 0x96,
+ 0x1b, 0xa0, 0x70, 0xcb, 0xd0, 0x6b, 0xbb, 0x00 },
+ { 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00,
+ 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00 },
+ { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x72, 0x84, 0x9e, 0x68, 0xaa, 0x5c, 0x46, 0xb0,
+ 0xc2, 0x34, 0x2e, 0xd8, 0x1a, 0xec, 0xf6, 0x00 },
+ { 0x8d, 0x36, 0xe6, 0x5d, 0x46, 0xfd, 0x2d, 0x96,
+ 0x1b, 0xa0, 0x70, 0xcb, 0xd0, 0x6b, 0xbb, 0x00 },
+ { 0xd0, 0xa0, 0x30, 0x40, 0x10, 0x60, 0xf0, 0x80,
+ 0x50, 0x20, 0xb0, 0xc0, 0x90, 0xe0, 0x70, 0x00 },
+ { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x7d, 0x8a, 0x93, 0x64, 0xa1, 0x56, 0x4f, 0xb8,
+ 0xc5, 0x32, 0x2b, 0xdc, 0x19, 0xee, 0xf7, 0x00 },
+ { 0xe4, 0x5f, 0x92, 0x29, 0x08, 0xb3, 0x7e, 0xc5,
+ 0x21, 0x9a, 0x57, 0xec, 0xcd, 0x76, 0xbb, 0x00 },
+ { 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00,
+ 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00 },
+ { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x28, 0xd0, 0xd8, 0x20, 0xc8, 0x30, 0x38, 0xc0,
+ 0xe8, 0x10, 0x18, 0xe0, 0x08, 0xf0, 0xf8, 0x00 },
+ { 0xe4, 0x5f, 0x92, 0x29, 0x08, 0xb3, 0x7e, 0xc5,
+ 0x21, 0x9a, 0x57, 0xec, 0xcd, 0x76, 0xbb, 0x00 },
+ { 0x70, 0xe0, 0x50, 0xc0, 0x30, 0xa0, 0x10, 0x80,
+ 0xf0, 0x60, 0xd0, 0x40, 0xb0, 0x20, 0x90, 0x00 },
+ { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x27, 0xde, 0xd5, 0x2c, 0xc3, 0x3a, 0x31, 0xc8,
+ 0xef, 0x16, 0x1d, 0xe4, 0x0b, 0xf2, 0xf9, 0x00 },
+ { 0xf9, 0x42, 0x8f, 0x34, 0x15, 0xae, 0x63, 0xd8,
+ 0x21, 0x9a, 0x57, 0xec, 0xcd, 0x76, 0xbb, 0x00 },
+ { 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00,
+ 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00 },
+ { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x36, 0xcc, 0xc2, 0x38, 0xde, 0x24, 0x2a, 0xd0,
+ 0xe6, 0x1c, 0x12, 0xe8, 0x0e, 0xf4, 0xfa, 0x00 },
+ { 0xf9, 0x42, 0x8f, 0x34, 0x15, 0xae, 0x63, 0xd8,
+ 0x21, 0x9a, 0x57, 0xec, 0xcd, 0x76, 0xbb, 0x00 },
+ { 0x90, 0x20, 0xf0, 0x40, 0x50, 0xe0, 0x30, 0x80,
+ 0x10, 0xa0, 0x70, 0xc0, 0xd0, 0x60, 0xb0, 0x00 },
+ { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x39, 0xc2, 0xcf, 0x34, 0xd5, 0x2e, 0x23, 0xd8,
+ 0xe1, 0x1a, 0x17, 0xec, 0x0d, 0xf6, 0xfb, 0x00 },
+ { 0xc3, 0x78, 0xb5, 0x0e, 0x32, 0x89, 0x44, 0xff,
+ 0x3c, 0x87, 0x4a, 0xf1, 0xcd, 0x76, 0xbb, 0x00 },
+ { 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00,
+ 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00 },
+ { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x14, 0xe8, 0xec, 0x10, 0xe4, 0x18, 0x1c, 0xe0,
+ 0xf4, 0x08, 0x0c, 0xf0, 0x04, 0xf8, 0xfc, 0x00 },
+ { 0xc3, 0x78, 0xb5, 0x0e, 0x32, 0x89, 0x44, 0xff,
+ 0x3c, 0x87, 0x4a, 0xf1, 0xcd, 0x76, 0xbb, 0x00 },
+ { 0xb0, 0x60, 0x10, 0xc0, 0xf0, 0x20, 0x50, 0x80,
+ 0x30, 0xe0, 0x90, 0x40, 0x70, 0xa0, 0xd0, 0x00 },
+ { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x1b, 0xe6, 0xe1, 0x1c, 0xef, 0x12, 0x15, 0xe8,
+ 0xf3, 0x0e, 0x09, 0xf4, 0x07, 0xfa, 0xfd, 0x00 },
+ { 0xde, 0x65, 0xa8, 0x13, 0x2f, 0x94, 0x59, 0xe2,
+ 0x3c, 0x87, 0x4a, 0xf1, 0xcd, 0x76, 0xbb, 0x00 },
+ { 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00,
+ 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00 },
+ { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x0a, 0xf4, 0xf6, 0x08, 0xf2, 0x0c, 0x0e, 0xf0,
+ 0xfa, 0x04, 0x06, 0xf8, 0x02, 0xfc, 0xfe, 0x00 },
+ { 0xde, 0x65, 0xa8, 0x13, 0x2f, 0x94, 0x59, 0xe2,
+ 0x3c, 0x87, 0x4a, 0xf1, 0xcd, 0x76, 0xbb, 0x00 },
+ { 0x50, 0xa0, 0xb0, 0x40, 0x90, 0x60, 0x70, 0x80,
+ 0xd0, 0x20, 0x30, 0xc0, 0x10, 0xe0, 0xf0, 0x00 },
+ { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53,
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 },
+ { 0x05, 0xfa, 0xfb, 0x04, 0xf9, 0x06, 0x07, 0xf8,
+ 0xfd, 0x02, 0x03, 0xfc, 0x01, 0xfe, 0xff, 0x00 }
+};
+/* END CSTYLED */
+#else
+/* BEGIN CSTYLED */
+const uint8_t
+__attribute__((aligned(256))) gf_clmul_mod_lt[4*256][16] = {
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x02, 0x04, 0x06, 0x08, 0x0a, 0x0c, 0x0e,
+ 0x10, 0x12, 0x14, 0x16, 0x18, 0x1a, 0x1c, 0x1e },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+ 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x03, 0x06, 0x05, 0x0c, 0x0f, 0x0a, 0x09,
+ 0x18, 0x1b, 0x1e, 0x1d, 0x14, 0x17, 0x12, 0x11 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x04, 0x08, 0x0c, 0x10, 0x14, 0x18, 0x1c,
+ 0x20, 0x24, 0x28, 0x2c, 0x30, 0x34, 0x38, 0x3c },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+ 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x05, 0x0a, 0x0f, 0x14, 0x11, 0x1e, 0x1b,
+ 0x28, 0x2d, 0x22, 0x27, 0x3c, 0x39, 0x36, 0x33 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x06, 0x0c, 0x0a, 0x18, 0x1e, 0x14, 0x12,
+ 0x30, 0x36, 0x3c, 0x3a, 0x28, 0x2e, 0x24, 0x22 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+ 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x07, 0x0e, 0x09, 0x1c, 0x1b, 0x12, 0x15,
+ 0x38, 0x3f, 0x36, 0x31, 0x24, 0x23, 0x2a, 0x2d },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x08, 0x10, 0x18, 0x20, 0x28, 0x30, 0x38,
+ 0x40, 0x48, 0x50, 0x58, 0x60, 0x68, 0x70, 0x78 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+ 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x09, 0x12, 0x1b, 0x24, 0x2d, 0x36, 0x3f,
+ 0x48, 0x41, 0x5a, 0x53, 0x6c, 0x65, 0x7e, 0x77 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x0a, 0x14, 0x1e, 0x28, 0x22, 0x3c, 0x36,
+ 0x50, 0x5a, 0x44, 0x4e, 0x78, 0x72, 0x6c, 0x66 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+ 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x0b, 0x16, 0x1d, 0x2c, 0x27, 0x3a, 0x31,
+ 0x58, 0x53, 0x4e, 0x45, 0x74, 0x7f, 0x62, 0x69 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x0c, 0x18, 0x14, 0x30, 0x3c, 0x28, 0x24,
+ 0x60, 0x6c, 0x78, 0x74, 0x50, 0x5c, 0x48, 0x44 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+ 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x0d, 0x1a, 0x17, 0x34, 0x39, 0x2e, 0x23,
+ 0x68, 0x65, 0x72, 0x7f, 0x5c, 0x51, 0x46, 0x4b },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x0e, 0x1c, 0x12, 0x38, 0x36, 0x24, 0x2a,
+ 0x70, 0x7e, 0x6c, 0x62, 0x48, 0x46, 0x54, 0x5a },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+ 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x0f, 0x1e, 0x11, 0x3c, 0x33, 0x22, 0x2d,
+ 0x78, 0x77, 0x66, 0x69, 0x44, 0x4b, 0x5a, 0x55 },
+ { 0x00, 0x1d, 0x3a, 0x27, 0x74, 0x69, 0x4e, 0x53,
+ 0xe8, 0xf5, 0xd2, 0xcf, 0x9c, 0x81, 0xa6, 0xbb },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ { 0x00, 0x1d, 0x3a, 0x27, 0x74, 0x69, 0x4e, 0x53,
+ 0xe8, 0xf5, 0xd2, 0xcf, 0x9c, 0x81, 0xa6, 0xbb },
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77,
+ 0x88, 0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff },
+ { 0x00, 0x1d, 0x3a, 0x27, 0x74, 0x69, 0x4e, 0x53,
+ 0xf5, 0xe8, 0xcf, 0xd2, 0x81, 0x9c, 0xbb, 0xa6 },
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x12, 0x24, 0x36, 0x48, 0x5a, 0x6c, 0x7e,
+ 0x90, 0x82, 0xb4, 0xa6, 0xd8, 0xca, 0xfc, 0xee },
+ { 0x00, 0x1d, 0x3a, 0x27, 0x74, 0x69, 0x4e, 0x53,
+ 0xf5, 0xe8, 0xcf, 0xd2, 0x81, 0x9c, 0xbb, 0xa6 },
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+ 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x13, 0x26, 0x35, 0x4c, 0x5f, 0x6a, 0x79,
+ 0x98, 0x8b, 0xbe, 0xad, 0xd4, 0xc7, 0xf2, 0xe1 },
+ { 0x00, 0x1d, 0x3a, 0x27, 0x69, 0x74, 0x53, 0x4e,
+ 0xd2, 0xcf, 0xe8, 0xf5, 0xbb, 0xa6, 0x81, 0x9c },
+ { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x14, 0x28, 0x3c, 0x50, 0x44, 0x78, 0x6c,
+ 0xa0, 0xb4, 0x88, 0x9c, 0xf0, 0xe4, 0xd8, 0xcc },
+ { 0x00, 0x1d, 0x3a, 0x27, 0x69, 0x74, 0x53, 0x4e,
+ 0xd2, 0xcf, 0xe8, 0xf5, 0xbb, 0xa6, 0x81, 0x9c },
+ { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+ 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x15, 0x2a, 0x3f, 0x54, 0x41, 0x7e, 0x6b,
+ 0xa8, 0xbd, 0x82, 0x97, 0xfc, 0xe9, 0xd6, 0xc3 },
+ { 0x00, 0x1d, 0x3a, 0x27, 0x69, 0x74, 0x53, 0x4e,
+ 0xcf, 0xd2, 0xf5, 0xe8, 0xa6, 0xbb, 0x9c, 0x81 },
+ { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x16, 0x2c, 0x3a, 0x58, 0x4e, 0x74, 0x62,
+ 0xb0, 0xa6, 0x9c, 0x8a, 0xe8, 0xfe, 0xc4, 0xd2 },
+ { 0x00, 0x1d, 0x3a, 0x27, 0x69, 0x74, 0x53, 0x4e,
+ 0xcf, 0xd2, 0xf5, 0xe8, 0xa6, 0xbb, 0x9c, 0x81 },
+ { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+ 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x17, 0x2e, 0x39, 0x5c, 0x4b, 0x72, 0x65,
+ 0xb8, 0xaf, 0x96, 0x81, 0xe4, 0xf3, 0xca, 0xdd },
+ { 0x00, 0x1d, 0x27, 0x3a, 0x4e, 0x53, 0x69, 0x74,
+ 0x9c, 0x81, 0xbb, 0xa6, 0xd2, 0xcf, 0xf5, 0xe8 },
+ { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x18, 0x30, 0x28, 0x60, 0x78, 0x50, 0x48,
+ 0xc0, 0xd8, 0xf0, 0xe8, 0xa0, 0xb8, 0x90, 0x88 },
+ { 0x00, 0x1d, 0x27, 0x3a, 0x4e, 0x53, 0x69, 0x74,
+ 0x9c, 0x81, 0xbb, 0xa6, 0xd2, 0xcf, 0xf5, 0xe8 },
+ { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+ 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x19, 0x32, 0x2b, 0x64, 0x7d, 0x56, 0x4f,
+ 0xc8, 0xd1, 0xfa, 0xe3, 0xac, 0xb5, 0x9e, 0x87 },
+ { 0x00, 0x1d, 0x27, 0x3a, 0x4e, 0x53, 0x69, 0x74,
+ 0x81, 0x9c, 0xa6, 0xbb, 0xcf, 0xd2, 0xe8, 0xf5 },
+ { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x1a, 0x34, 0x2e, 0x68, 0x72, 0x5c, 0x46,
+ 0xd0, 0xca, 0xe4, 0xfe, 0xb8, 0xa2, 0x8c, 0x96 },
+ { 0x00, 0x1d, 0x27, 0x3a, 0x4e, 0x53, 0x69, 0x74,
+ 0x81, 0x9c, 0xa6, 0xbb, 0xcf, 0xd2, 0xe8, 0xf5 },
+ { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+ 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x1b, 0x36, 0x2d, 0x6c, 0x77, 0x5a, 0x41,
+ 0xd8, 0xc3, 0xee, 0xf5, 0xb4, 0xaf, 0x82, 0x99 },
+ { 0x00, 0x1d, 0x27, 0x3a, 0x53, 0x4e, 0x74, 0x69,
+ 0xa6, 0xbb, 0x81, 0x9c, 0xf5, 0xe8, 0xd2, 0xcf },
+ { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x1c, 0x38, 0x24, 0x70, 0x6c, 0x48, 0x54,
+ 0xe0, 0xfc, 0xd8, 0xc4, 0x90, 0x8c, 0xa8, 0xb4 },
+ { 0x00, 0x1d, 0x27, 0x3a, 0x53, 0x4e, 0x74, 0x69,
+ 0xa6, 0xbb, 0x81, 0x9c, 0xf5, 0xe8, 0xd2, 0xcf },
+ { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+ 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x1d, 0x3a, 0x27, 0x74, 0x69, 0x4e, 0x53,
+ 0xe8, 0xf5, 0xd2, 0xcf, 0x9c, 0x81, 0xa6, 0xbb },
+ { 0x00, 0x1d, 0x27, 0x3a, 0x53, 0x4e, 0x74, 0x69,
+ 0xbb, 0xa6, 0x9c, 0x81, 0xe8, 0xf5, 0xcf, 0xd2 },
+ { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x1e, 0x3c, 0x22, 0x78, 0x66, 0x44, 0x5a,
+ 0xf0, 0xee, 0xcc, 0xd2, 0x88, 0x96, 0xb4, 0xaa },
+ { 0x00, 0x1d, 0x27, 0x3a, 0x53, 0x4e, 0x74, 0x69,
+ 0xbb, 0xa6, 0x9c, 0x81, 0xe8, 0xf5, 0xcf, 0xd2 },
+ { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+ 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x1f, 0x3e, 0x21, 0x7c, 0x63, 0x42, 0x5d,
+ 0xf8, 0xe7, 0xc6, 0xd9, 0x84, 0x9b, 0xba, 0xa5 },
+ { 0x00, 0x3a, 0x74, 0x4e, 0xe8, 0xd2, 0x9c, 0xa6,
+ 0xcd, 0xf7, 0xb9, 0x83, 0x25, 0x1f, 0x51, 0x6b },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+ { 0x00, 0x3a, 0x74, 0x4e, 0xe8, 0xd2, 0x9c, 0xa6,
+ 0xcd, 0xf7, 0xb9, 0x83, 0x25, 0x1f, 0x51, 0x6b },
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x21, 0x42, 0x63, 0x84, 0xa5, 0xc6, 0xe7,
+ 0x08, 0x29, 0x4a, 0x6b, 0x8c, 0xad, 0xce, 0xef },
+ { 0x00, 0x3a, 0x74, 0x4e, 0xe8, 0xd2, 0x9c, 0xa6,
+ 0xd0, 0xea, 0xa4, 0x9e, 0x38, 0x02, 0x4c, 0x76 },
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x22, 0x44, 0x66, 0x88, 0xaa, 0xcc, 0xee,
+ 0x10, 0x32, 0x54, 0x76, 0x98, 0xba, 0xdc, 0xfe },
+ { 0x00, 0x3a, 0x74, 0x4e, 0xe8, 0xd2, 0x9c, 0xa6,
+ 0xd0, 0xea, 0xa4, 0x9e, 0x38, 0x02, 0x4c, 0x76 },
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+ 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x23, 0x46, 0x65, 0x8c, 0xaf, 0xca, 0xe9,
+ 0x18, 0x3b, 0x5e, 0x7d, 0x94, 0xb7, 0xd2, 0xf1 },
+ { 0x00, 0x3a, 0x74, 0x4e, 0xf5, 0xcf, 0x81, 0xbb,
+ 0xf7, 0xcd, 0x83, 0xb9, 0x02, 0x38, 0x76, 0x4c },
+ { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x24, 0x48, 0x6c, 0x90, 0xb4, 0xd8, 0xfc,
+ 0x20, 0x04, 0x68, 0x4c, 0xb0, 0x94, 0xf8, 0xdc },
+ { 0x00, 0x3a, 0x74, 0x4e, 0xf5, 0xcf, 0x81, 0xbb,
+ 0xf7, 0xcd, 0x83, 0xb9, 0x02, 0x38, 0x76, 0x4c },
+ { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+ 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x25, 0x4a, 0x6f, 0x94, 0xb1, 0xde, 0xfb,
+ 0x28, 0x0d, 0x62, 0x47, 0xbc, 0x99, 0xf6, 0xd3 },
+ { 0x00, 0x3a, 0x74, 0x4e, 0xf5, 0xcf, 0x81, 0xbb,
+ 0xea, 0xd0, 0x9e, 0xa4, 0x1f, 0x25, 0x6b, 0x51 },
+ { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x26, 0x4c, 0x6a, 0x98, 0xbe, 0xd4, 0xf2,
+ 0x30, 0x16, 0x7c, 0x5a, 0xa8, 0x8e, 0xe4, 0xc2 },
+ { 0x00, 0x3a, 0x74, 0x4e, 0xf5, 0xcf, 0x81, 0xbb,
+ 0xea, 0xd0, 0x9e, 0xa4, 0x1f, 0x25, 0x6b, 0x51 },
+ { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+ 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x27, 0x4e, 0x69, 0x9c, 0xbb, 0xd2, 0xf5,
+ 0x38, 0x1f, 0x76, 0x51, 0xa4, 0x83, 0xea, 0xcd },
+ { 0x00, 0x3a, 0x69, 0x53, 0xd2, 0xe8, 0xbb, 0x81,
+ 0xb9, 0x83, 0xd0, 0xea, 0x6b, 0x51, 0x02, 0x38 },
+ { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x28, 0x50, 0x78, 0xa0, 0x88, 0xf0, 0xd8,
+ 0x40, 0x68, 0x10, 0x38, 0xe0, 0xc8, 0xb0, 0x98 },
+ { 0x00, 0x3a, 0x69, 0x53, 0xd2, 0xe8, 0xbb, 0x81,
+ 0xb9, 0x83, 0xd0, 0xea, 0x6b, 0x51, 0x02, 0x38 },
+ { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+ 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x29, 0x52, 0x7b, 0xa4, 0x8d, 0xf6, 0xdf,
+ 0x48, 0x61, 0x1a, 0x33, 0xec, 0xc5, 0xbe, 0x97 },
+ { 0x00, 0x3a, 0x69, 0x53, 0xd2, 0xe8, 0xbb, 0x81,
+ 0xa4, 0x9e, 0xcd, 0xf7, 0x76, 0x4c, 0x1f, 0x25 },
+ { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x2a, 0x54, 0x7e, 0xa8, 0x82, 0xfc, 0xd6,
+ 0x50, 0x7a, 0x04, 0x2e, 0xf8, 0xd2, 0xac, 0x86 },
+ { 0x00, 0x3a, 0x69, 0x53, 0xd2, 0xe8, 0xbb, 0x81,
+ 0xa4, 0x9e, 0xcd, 0xf7, 0x76, 0x4c, 0x1f, 0x25 },
+ { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+ 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x2b, 0x56, 0x7d, 0xac, 0x87, 0xfa, 0xd1,
+ 0x58, 0x73, 0x0e, 0x25, 0xf4, 0xdf, 0xa2, 0x89 },
+ { 0x00, 0x3a, 0x69, 0x53, 0xcf, 0xf5, 0xa6, 0x9c,
+ 0x83, 0xb9, 0xea, 0xd0, 0x4c, 0x76, 0x25, 0x1f },
+ { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x2c, 0x58, 0x74, 0xb0, 0x9c, 0xe8, 0xc4,
+ 0x60, 0x4c, 0x38, 0x14, 0xd0, 0xfc, 0x88, 0xa4 },
+ { 0x00, 0x3a, 0x69, 0x53, 0xcf, 0xf5, 0xa6, 0x9c,
+ 0x83, 0xb9, 0xea, 0xd0, 0x4c, 0x76, 0x25, 0x1f },
+ { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+ 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x2d, 0x5a, 0x77, 0xb4, 0x99, 0xee, 0xc3,
+ 0x68, 0x45, 0x32, 0x1f, 0xdc, 0xf1, 0x86, 0xab },
+ { 0x00, 0x3a, 0x69, 0x53, 0xcf, 0xf5, 0xa6, 0x9c,
+ 0x9e, 0xa4, 0xf7, 0xcd, 0x51, 0x6b, 0x38, 0x02 },
+ { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x2e, 0x5c, 0x72, 0xb8, 0x96, 0xe4, 0xca,
+ 0x70, 0x5e, 0x2c, 0x02, 0xc8, 0xe6, 0x94, 0xba },
+ { 0x00, 0x3a, 0x69, 0x53, 0xcf, 0xf5, 0xa6, 0x9c,
+ 0x9e, 0xa4, 0xf7, 0xcd, 0x51, 0x6b, 0x38, 0x02 },
+ { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+ 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x2f, 0x5e, 0x71, 0xbc, 0x93, 0xe2, 0xcd,
+ 0x78, 0x57, 0x26, 0x09, 0xc4, 0xeb, 0x9a, 0xb5 },
+ { 0x00, 0x27, 0x4e, 0x69, 0x9c, 0xbb, 0xd2, 0xf5,
+ 0x25, 0x02, 0x6b, 0x4c, 0xb9, 0x9e, 0xf7, 0xd0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+ 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+ { 0x00, 0x27, 0x4e, 0x69, 0x9c, 0xbb, 0xd2, 0xf5,
+ 0x25, 0x02, 0x6b, 0x4c, 0xb9, 0x9e, 0xf7, 0xd0 },
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x31, 0x62, 0x53, 0xc4, 0xf5, 0xa6, 0x97,
+ 0x88, 0xb9, 0xea, 0xdb, 0x4c, 0x7d, 0x2e, 0x1f },
+ { 0x00, 0x27, 0x4e, 0x69, 0x9c, 0xbb, 0xd2, 0xf5,
+ 0x38, 0x1f, 0x76, 0x51, 0xa4, 0x83, 0xea, 0xcd },
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x32, 0x64, 0x56, 0xc8, 0xfa, 0xac, 0x9e,
+ 0x90, 0xa2, 0xf4, 0xc6, 0x58, 0x6a, 0x3c, 0x0e },
+ { 0x00, 0x27, 0x4e, 0x69, 0x9c, 0xbb, 0xd2, 0xf5,
+ 0x38, 0x1f, 0x76, 0x51, 0xa4, 0x83, 0xea, 0xcd },
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+ 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x33, 0x66, 0x55, 0xcc, 0xff, 0xaa, 0x99,
+ 0x98, 0xab, 0xfe, 0xcd, 0x54, 0x67, 0x32, 0x01 },
+ { 0x00, 0x27, 0x4e, 0x69, 0x81, 0xa6, 0xcf, 0xe8,
+ 0x1f, 0x38, 0x51, 0x76, 0x9e, 0xb9, 0xd0, 0xf7 },
+ { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x34, 0x68, 0x5c, 0xd0, 0xe4, 0xb8, 0x8c,
+ 0xa0, 0x94, 0xc8, 0xfc, 0x70, 0x44, 0x18, 0x2c },
+ { 0x00, 0x27, 0x4e, 0x69, 0x81, 0xa6, 0xcf, 0xe8,
+ 0x1f, 0x38, 0x51, 0x76, 0x9e, 0xb9, 0xd0, 0xf7 },
+ { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+ 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x35, 0x6a, 0x5f, 0xd4, 0xe1, 0xbe, 0x8b,
+ 0xa8, 0x9d, 0xc2, 0xf7, 0x7c, 0x49, 0x16, 0x23 },
+ { 0x00, 0x27, 0x4e, 0x69, 0x81, 0xa6, 0xcf, 0xe8,
+ 0x02, 0x25, 0x4c, 0x6b, 0x83, 0xa4, 0xcd, 0xea },
+ { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x36, 0x6c, 0x5a, 0xd8, 0xee, 0xb4, 0x82,
+ 0xb0, 0x86, 0xdc, 0xea, 0x68, 0x5e, 0x04, 0x32 },
+ { 0x00, 0x27, 0x4e, 0x69, 0x81, 0xa6, 0xcf, 0xe8,
+ 0x02, 0x25, 0x4c, 0x6b, 0x83, 0xa4, 0xcd, 0xea },
+ { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+ 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x37, 0x6e, 0x59, 0xdc, 0xeb, 0xb2, 0x85,
+ 0xb8, 0x8f, 0xd6, 0xe1, 0x64, 0x53, 0x0a, 0x3d },
+ { 0x00, 0x27, 0x53, 0x74, 0xa6, 0x81, 0xf5, 0xd2,
+ 0x51, 0x76, 0x02, 0x25, 0xf7, 0xd0, 0xa4, 0x83 },
+ { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x38, 0x70, 0x48, 0xe0, 0xd8, 0x90, 0xa8,
+ 0xc0, 0xf8, 0xb0, 0x88, 0x20, 0x18, 0x50, 0x68 },
+ { 0x00, 0x27, 0x53, 0x74, 0xa6, 0x81, 0xf5, 0xd2,
+ 0x51, 0x76, 0x02, 0x25, 0xf7, 0xd0, 0xa4, 0x83 },
+ { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+ 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x39, 0x72, 0x4b, 0xe4, 0xdd, 0x96, 0xaf,
+ 0xc8, 0xf1, 0xba, 0x83, 0x2c, 0x15, 0x5e, 0x67 },
+ { 0x00, 0x27, 0x53, 0x74, 0xa6, 0x81, 0xf5, 0xd2,
+ 0x4c, 0x6b, 0x1f, 0x38, 0xea, 0xcd, 0xb9, 0x9e },
+ { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x3a, 0x74, 0x4e, 0xe8, 0xd2, 0x9c, 0xa6,
+ 0xd0, 0xea, 0xa4, 0x9e, 0x38, 0x02, 0x4c, 0x76 },
+ { 0x00, 0x27, 0x53, 0x74, 0xa6, 0x81, 0xf5, 0xd2,
+ 0x4c, 0x6b, 0x1f, 0x38, 0xea, 0xcd, 0xb9, 0x9e },
+ { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+ 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x3b, 0x76, 0x4d, 0xec, 0xd7, 0x9a, 0xa1,
+ 0xd8, 0xe3, 0xae, 0x95, 0x34, 0x0f, 0x42, 0x79 },
+ { 0x00, 0x27, 0x53, 0x74, 0xbb, 0x9c, 0xe8, 0xcf,
+ 0x6b, 0x4c, 0x38, 0x1f, 0xd0, 0xf7, 0x83, 0xa4 },
+ { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x3c, 0x78, 0x44, 0xf0, 0xcc, 0x88, 0xb4,
+ 0xe0, 0xdc, 0x98, 0xa4, 0x10, 0x2c, 0x68, 0x54 },
+ { 0x00, 0x27, 0x53, 0x74, 0xbb, 0x9c, 0xe8, 0xcf,
+ 0x6b, 0x4c, 0x38, 0x1f, 0xd0, 0xf7, 0x83, 0xa4 },
+ { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+ 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x3d, 0x7a, 0x47, 0xf4, 0xc9, 0x8e, 0xb3,
+ 0xe8, 0xd5, 0x92, 0xaf, 0x1c, 0x21, 0x66, 0x5b },
+ { 0x00, 0x27, 0x53, 0x74, 0xbb, 0x9c, 0xe8, 0xcf,
+ 0x76, 0x51, 0x25, 0x02, 0xcd, 0xea, 0x9e, 0xb9 },
+ { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x3e, 0x7c, 0x42, 0xf8, 0xc6, 0x84, 0xba,
+ 0xf0, 0xce, 0x8c, 0xb2, 0x08, 0x36, 0x74, 0x4a },
+ { 0x00, 0x27, 0x53, 0x74, 0xbb, 0x9c, 0xe8, 0xcf,
+ 0x76, 0x51, 0x25, 0x02, 0xcd, 0xea, 0x9e, 0xb9 },
+ { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+ 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x3f, 0x7e, 0x41, 0xfc, 0xc3, 0x82, 0xbd,
+ 0xf8, 0xc7, 0x86, 0xb9, 0x04, 0x3b, 0x7a, 0x45 },
+ { 0x00, 0x74, 0xe8, 0x9c, 0xcd, 0xb9, 0x25, 0x51,
+ 0x87, 0xf3, 0x6f, 0x1b, 0x4a, 0x3e, 0xa2, 0xd6 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+ { 0x00, 0x74, 0xe8, 0x9c, 0xcd, 0xb9, 0x25, 0x51,
+ 0x87, 0xf3, 0x6f, 0x1b, 0x4a, 0x3e, 0xa2, 0xd6 },
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x41, 0x82, 0xc3, 0x04, 0x45, 0x86, 0xc7,
+ 0x08, 0x49, 0x8a, 0xcb, 0x0c, 0x4d, 0x8e, 0xcf },
+ { 0x00, 0x74, 0xe8, 0x9c, 0xcd, 0xb9, 0x25, 0x51,
+ 0x9a, 0xee, 0x72, 0x06, 0x57, 0x23, 0xbf, 0xcb },
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x42, 0x84, 0xc6, 0x08, 0x4a, 0x8c, 0xce,
+ 0x10, 0x52, 0x94, 0xd6, 0x18, 0x5a, 0x9c, 0xde },
+ { 0x00, 0x74, 0xe8, 0x9c, 0xcd, 0xb9, 0x25, 0x51,
+ 0x9a, 0xee, 0x72, 0x06, 0x57, 0x23, 0xbf, 0xcb },
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+ 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x43, 0x86, 0xc5, 0x0c, 0x4f, 0x8a, 0xc9,
+ 0x18, 0x5b, 0x9e, 0xdd, 0x14, 0x57, 0x92, 0xd1 },
+ { 0x00, 0x74, 0xe8, 0x9c, 0xd0, 0xa4, 0x38, 0x4c,
+ 0xbd, 0xc9, 0x55, 0x21, 0x6d, 0x19, 0x85, 0xf1 },
+ { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x44, 0x88, 0xcc, 0x10, 0x54, 0x98, 0xdc,
+ 0x20, 0x64, 0xa8, 0xec, 0x30, 0x74, 0xb8, 0xfc },
+ { 0x00, 0x74, 0xe8, 0x9c, 0xd0, 0xa4, 0x38, 0x4c,
+ 0xbd, 0xc9, 0x55, 0x21, 0x6d, 0x19, 0x85, 0xf1 },
+ { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+ 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x45, 0x8a, 0xcf, 0x14, 0x51, 0x9e, 0xdb,
+ 0x28, 0x6d, 0xa2, 0xe7, 0x3c, 0x79, 0xb6, 0xf3 },
+ { 0x00, 0x74, 0xe8, 0x9c, 0xd0, 0xa4, 0x38, 0x4c,
+ 0xa0, 0xd4, 0x48, 0x3c, 0x70, 0x04, 0x98, 0xec },
+ { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x46, 0x8c, 0xca, 0x18, 0x5e, 0x94, 0xd2,
+ 0x30, 0x76, 0xbc, 0xfa, 0x28, 0x6e, 0xa4, 0xe2 },
+ { 0x00, 0x74, 0xe8, 0x9c, 0xd0, 0xa4, 0x38, 0x4c,
+ 0xa0, 0xd4, 0x48, 0x3c, 0x70, 0x04, 0x98, 0xec },
+ { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+ 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x47, 0x8e, 0xc9, 0x1c, 0x5b, 0x92, 0xd5,
+ 0x38, 0x7f, 0xb6, 0xf1, 0x24, 0x63, 0xaa, 0xed },
+ { 0x00, 0x74, 0xf5, 0x81, 0xf7, 0x83, 0x02, 0x76,
+ 0xf3, 0x87, 0x06, 0x72, 0x04, 0x70, 0xf1, 0x85 },
+ { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x48, 0x90, 0xd8, 0x20, 0x68, 0xb0, 0xf8,
+ 0x40, 0x08, 0xd0, 0x98, 0x60, 0x28, 0xf0, 0xb8 },
+ { 0x00, 0x74, 0xf5, 0x81, 0xf7, 0x83, 0x02, 0x76,
+ 0xf3, 0x87, 0x06, 0x72, 0x04, 0x70, 0xf1, 0x85 },
+ { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+ 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x49, 0x92, 0xdb, 0x24, 0x6d, 0xb6, 0xff,
+ 0x48, 0x01, 0xda, 0x93, 0x6c, 0x25, 0xfe, 0xb7 },
+ { 0x00, 0x74, 0xf5, 0x81, 0xf7, 0x83, 0x02, 0x76,
+ 0xee, 0x9a, 0x1b, 0x6f, 0x19, 0x6d, 0xec, 0x98 },
+ { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x4a, 0x94, 0xde, 0x28, 0x62, 0xbc, 0xf6,
+ 0x50, 0x1a, 0xc4, 0x8e, 0x78, 0x32, 0xec, 0xa6 },
+ { 0x00, 0x74, 0xf5, 0x81, 0xf7, 0x83, 0x02, 0x76,
+ 0xee, 0x9a, 0x1b, 0x6f, 0x19, 0x6d, 0xec, 0x98 },
+ { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+ 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x4b, 0x96, 0xdd, 0x2c, 0x67, 0xba, 0xf1,
+ 0x58, 0x13, 0xce, 0x85, 0x74, 0x3f, 0xe2, 0xa9 },
+ { 0x00, 0x74, 0xf5, 0x81, 0xea, 0x9e, 0x1f, 0x6b,
+ 0xc9, 0xbd, 0x3c, 0x48, 0x23, 0x57, 0xd6, 0xa2 },
+ { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x4c, 0x98, 0xd4, 0x30, 0x7c, 0xa8, 0xe4,
+ 0x60, 0x2c, 0xf8, 0xb4, 0x50, 0x1c, 0xc8, 0x84 },
+ { 0x00, 0x74, 0xf5, 0x81, 0xea, 0x9e, 0x1f, 0x6b,
+ 0xc9, 0xbd, 0x3c, 0x48, 0x23, 0x57, 0xd6, 0xa2 },
+ { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+ 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x4d, 0x9a, 0xd7, 0x34, 0x79, 0xae, 0xe3,
+ 0x68, 0x25, 0xf2, 0xbf, 0x5c, 0x11, 0xc6, 0x8b },
+ { 0x00, 0x74, 0xf5, 0x81, 0xea, 0x9e, 0x1f, 0x6b,
+ 0xd4, 0xa0, 0x21, 0x55, 0x3e, 0x4a, 0xcb, 0xbf },
+ { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x4e, 0x9c, 0xd2, 0x38, 0x76, 0xa4, 0xea,
+ 0x70, 0x3e, 0xec, 0xa2, 0x48, 0x06, 0xd4, 0x9a },
+ { 0x00, 0x74, 0xf5, 0x81, 0xea, 0x9e, 0x1f, 0x6b,
+ 0xd4, 0xa0, 0x21, 0x55, 0x3e, 0x4a, 0xcb, 0xbf },
+ { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+ 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x4f, 0x9e, 0xd1, 0x3c, 0x73, 0xa2, 0xed,
+ 0x78, 0x37, 0xe6, 0xa9, 0x44, 0x0b, 0xda, 0x95 },
+ { 0x00, 0x69, 0xd2, 0xbb, 0xb9, 0xd0, 0x6b, 0x02,
+ 0x6f, 0x06, 0xbd, 0xd4, 0xd6, 0xbf, 0x04, 0x6d },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+ 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+ { 0x00, 0x69, 0xd2, 0xbb, 0xb9, 0xd0, 0x6b, 0x02,
+ 0x6f, 0x06, 0xbd, 0xd4, 0xd6, 0xbf, 0x04, 0x6d },
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x51, 0xa2, 0xf3, 0x44, 0x15, 0xe6, 0xb7,
+ 0x88, 0xd9, 0x2a, 0x7b, 0xcc, 0x9d, 0x6e, 0x3f },
+ { 0x00, 0x69, 0xd2, 0xbb, 0xb9, 0xd0, 0x6b, 0x02,
+ 0x72, 0x1b, 0xa0, 0xc9, 0xcb, 0xa2, 0x19, 0x70 },
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x52, 0xa4, 0xf6, 0x48, 0x1a, 0xec, 0xbe,
+ 0x90, 0xc2, 0x34, 0x66, 0xd8, 0x8a, 0x7c, 0x2e },
+ { 0x00, 0x69, 0xd2, 0xbb, 0xb9, 0xd0, 0x6b, 0x02,
+ 0x72, 0x1b, 0xa0, 0xc9, 0xcb, 0xa2, 0x19, 0x70 },
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+ 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x53, 0xa6, 0xf5, 0x4c, 0x1f, 0xea, 0xb9,
+ 0x98, 0xcb, 0x3e, 0x6d, 0xd4, 0x87, 0x72, 0x21 },
+ { 0x00, 0x69, 0xd2, 0xbb, 0xa4, 0xcd, 0x76, 0x1f,
+ 0x55, 0x3c, 0x87, 0xee, 0xf1, 0x98, 0x23, 0x4a },
+ { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x54, 0xa8, 0xfc, 0x50, 0x04, 0xf8, 0xac,
+ 0xa0, 0xf4, 0x08, 0x5c, 0xf0, 0xa4, 0x58, 0x0c },
+ { 0x00, 0x69, 0xd2, 0xbb, 0xa4, 0xcd, 0x76, 0x1f,
+ 0x55, 0x3c, 0x87, 0xee, 0xf1, 0x98, 0x23, 0x4a },
+ { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+ 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x55, 0xaa, 0xff, 0x54, 0x01, 0xfe, 0xab,
+ 0xa8, 0xfd, 0x02, 0x57, 0xfc, 0xa9, 0x56, 0x03 },
+ { 0x00, 0x69, 0xd2, 0xbb, 0xa4, 0xcd, 0x76, 0x1f,
+ 0x48, 0x21, 0x9a, 0xf3, 0xec, 0x85, 0x3e, 0x57 },
+ { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x56, 0xac, 0xfa, 0x58, 0x0e, 0xf4, 0xa2,
+ 0xb0, 0xe6, 0x1c, 0x4a, 0xe8, 0xbe, 0x44, 0x12 },
+ { 0x00, 0x69, 0xd2, 0xbb, 0xa4, 0xcd, 0x76, 0x1f,
+ 0x48, 0x21, 0x9a, 0xf3, 0xec, 0x85, 0x3e, 0x57 },
+ { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+ 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x57, 0xae, 0xf9, 0x5c, 0x0b, 0xf2, 0xa5,
+ 0xb8, 0xef, 0x16, 0x41, 0xe4, 0xb3, 0x4a, 0x1d },
+ { 0x00, 0x69, 0xcf, 0xa6, 0x83, 0xea, 0x4c, 0x25,
+ 0x1b, 0x72, 0xd4, 0xbd, 0x98, 0xf1, 0x57, 0x3e },
+ { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x58, 0xb0, 0xe8, 0x60, 0x38, 0xd0, 0x88,
+ 0xc0, 0x98, 0x70, 0x28, 0xa0, 0xf8, 0x10, 0x48 },
+ { 0x00, 0x69, 0xcf, 0xa6, 0x83, 0xea, 0x4c, 0x25,
+ 0x1b, 0x72, 0xd4, 0xbd, 0x98, 0xf1, 0x57, 0x3e },
+ { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+ 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x59, 0xb2, 0xeb, 0x64, 0x3d, 0xd6, 0x8f,
+ 0xc8, 0x91, 0x7a, 0x23, 0xac, 0xf5, 0x1e, 0x47 },
+ { 0x00, 0x69, 0xcf, 0xa6, 0x83, 0xea, 0x4c, 0x25,
+ 0x06, 0x6f, 0xc9, 0xa0, 0x85, 0xec, 0x4a, 0x23 },
+ { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x5a, 0xb4, 0xee, 0x68, 0x32, 0xdc, 0x86,
+ 0xd0, 0x8a, 0x64, 0x3e, 0xb8, 0xe2, 0x0c, 0x56 },
+ { 0x00, 0x69, 0xcf, 0xa6, 0x83, 0xea, 0x4c, 0x25,
+ 0x06, 0x6f, 0xc9, 0xa0, 0x85, 0xec, 0x4a, 0x23 },
+ { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+ 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x5b, 0xb6, 0xed, 0x6c, 0x37, 0xda, 0x81,
+ 0xd8, 0x83, 0x6e, 0x35, 0xb4, 0xef, 0x02, 0x59 },
+ { 0x00, 0x69, 0xcf, 0xa6, 0x9e, 0xf7, 0x51, 0x38,
+ 0x21, 0x48, 0xee, 0x87, 0xbf, 0xd6, 0x70, 0x19 },
+ { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x5c, 0xb8, 0xe4, 0x70, 0x2c, 0xc8, 0x94,
+ 0xe0, 0xbc, 0x58, 0x04, 0x90, 0xcc, 0x28, 0x74 },
+ { 0x00, 0x69, 0xcf, 0xa6, 0x9e, 0xf7, 0x51, 0x38,
+ 0x21, 0x48, 0xee, 0x87, 0xbf, 0xd6, 0x70, 0x19 },
+ { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+ 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x5d, 0xba, 0xe7, 0x74, 0x29, 0xce, 0x93,
+ 0xe8, 0xb5, 0x52, 0x0f, 0x9c, 0xc1, 0x26, 0x7b },
+ { 0x00, 0x69, 0xcf, 0xa6, 0x9e, 0xf7, 0x51, 0x38,
+ 0x3c, 0x55, 0xf3, 0x9a, 0xa2, 0xcb, 0x6d, 0x04 },
+ { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x5e, 0xbc, 0xe2, 0x78, 0x26, 0xc4, 0x9a,
+ 0xf0, 0xae, 0x4c, 0x12, 0x88, 0xd6, 0x34, 0x6a },
+ { 0x00, 0x69, 0xcf, 0xa6, 0x9e, 0xf7, 0x51, 0x38,
+ 0x3c, 0x55, 0xf3, 0x9a, 0xa2, 0xcb, 0x6d, 0x04 },
+ { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+ 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x5f, 0xbe, 0xe1, 0x7c, 0x23, 0xc2, 0x9d,
+ 0xf8, 0xa7, 0x46, 0x19, 0x84, 0xdb, 0x3a, 0x65 },
+ { 0x00, 0x4e, 0x9c, 0xd2, 0x25, 0x6b, 0xb9, 0xf7,
+ 0x4a, 0x04, 0xd6, 0x98, 0x6f, 0x21, 0xf3, 0xbd },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+ { 0x00, 0x4e, 0x9c, 0xd2, 0x25, 0x6b, 0xb9, 0xf7,
+ 0x4a, 0x04, 0xd6, 0x98, 0x6f, 0x21, 0xf3, 0xbd },
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x61, 0xc2, 0xa3, 0x84, 0xe5, 0x46, 0x27,
+ 0x08, 0x69, 0xca, 0xab, 0x8c, 0xed, 0x4e, 0x2f },
+ { 0x00, 0x4e, 0x9c, 0xd2, 0x25, 0x6b, 0xb9, 0xf7,
+ 0x57, 0x19, 0xcb, 0x85, 0x72, 0x3c, 0xee, 0xa0 },
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x62, 0xc4, 0xa6, 0x88, 0xea, 0x4c, 0x2e,
+ 0x10, 0x72, 0xd4, 0xb6, 0x98, 0xfa, 0x5c, 0x3e },
+ { 0x00, 0x4e, 0x9c, 0xd2, 0x25, 0x6b, 0xb9, 0xf7,
+ 0x57, 0x19, 0xcb, 0x85, 0x72, 0x3c, 0xee, 0xa0 },
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+ 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x63, 0xc6, 0xa5, 0x8c, 0xef, 0x4a, 0x29,
+ 0x18, 0x7b, 0xde, 0xbd, 0x94, 0xf7, 0x52, 0x31 },
+ { 0x00, 0x4e, 0x9c, 0xd2, 0x38, 0x76, 0xa4, 0xea,
+ 0x70, 0x3e, 0xec, 0xa2, 0x48, 0x06, 0xd4, 0x9a },
+ { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x64, 0xc8, 0xac, 0x90, 0xf4, 0x58, 0x3c,
+ 0x20, 0x44, 0xe8, 0x8c, 0xb0, 0xd4, 0x78, 0x1c },
+ { 0x00, 0x4e, 0x9c, 0xd2, 0x38, 0x76, 0xa4, 0xea,
+ 0x70, 0x3e, 0xec, 0xa2, 0x48, 0x06, 0xd4, 0x9a },
+ { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+ 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x65, 0xca, 0xaf, 0x94, 0xf1, 0x5e, 0x3b,
+ 0x28, 0x4d, 0xe2, 0x87, 0xbc, 0xd9, 0x76, 0x13 },
+ { 0x00, 0x4e, 0x9c, 0xd2, 0x38, 0x76, 0xa4, 0xea,
+ 0x6d, 0x23, 0xf1, 0xbf, 0x55, 0x1b, 0xc9, 0x87 },
+ { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x66, 0xcc, 0xaa, 0x98, 0xfe, 0x54, 0x32,
+ 0x30, 0x56, 0xfc, 0x9a, 0xa8, 0xce, 0x64, 0x02 },
+ { 0x00, 0x4e, 0x9c, 0xd2, 0x38, 0x76, 0xa4, 0xea,
+ 0x6d, 0x23, 0xf1, 0xbf, 0x55, 0x1b, 0xc9, 0x87 },
+ { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+ 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x67, 0xce, 0xa9, 0x9c, 0xfb, 0x52, 0x35,
+ 0x38, 0x5f, 0xf6, 0x91, 0xa4, 0xc3, 0x6a, 0x0d },
+ { 0x00, 0x4e, 0x81, 0xcf, 0x1f, 0x51, 0x9e, 0xd0,
+ 0x3e, 0x70, 0xbf, 0xf1, 0x21, 0x6f, 0xa0, 0xee },
+ { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x68, 0xd0, 0xb8, 0xa0, 0xc8, 0x70, 0x18,
+ 0x40, 0x28, 0x90, 0xf8, 0xe0, 0x88, 0x30, 0x58 },
+ { 0x00, 0x4e, 0x81, 0xcf, 0x1f, 0x51, 0x9e, 0xd0,
+ 0x3e, 0x70, 0xbf, 0xf1, 0x21, 0x6f, 0xa0, 0xee },
+ { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+ 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x69, 0xd2, 0xbb, 0xa4, 0xcd, 0x76, 0x1f,
+ 0x48, 0x21, 0x9a, 0xf3, 0xec, 0x85, 0x3e, 0x57 },
+ { 0x00, 0x4e, 0x81, 0xcf, 0x1f, 0x51, 0x9e, 0xd0,
+ 0x23, 0x6d, 0xa2, 0xec, 0x3c, 0x72, 0xbd, 0xf3 },
+ { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x6a, 0xd4, 0xbe, 0xa8, 0xc2, 0x7c, 0x16,
+ 0x50, 0x3a, 0x84, 0xee, 0xf8, 0x92, 0x2c, 0x46 },
+ { 0x00, 0x4e, 0x81, 0xcf, 0x1f, 0x51, 0x9e, 0xd0,
+ 0x23, 0x6d, 0xa2, 0xec, 0x3c, 0x72, 0xbd, 0xf3 },
+ { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+ 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x6b, 0xd6, 0xbd, 0xac, 0xc7, 0x7a, 0x11,
+ 0x58, 0x33, 0x8e, 0xe5, 0xf4, 0x9f, 0x22, 0x49 },
+ { 0x00, 0x4e, 0x81, 0xcf, 0x02, 0x4c, 0x83, 0xcd,
+ 0x04, 0x4a, 0x85, 0xcb, 0x06, 0x48, 0x87, 0xc9 },
+ { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x6c, 0xd8, 0xb4, 0xb0, 0xdc, 0x68, 0x04,
+ 0x60, 0x0c, 0xb8, 0xd4, 0xd0, 0xbc, 0x08, 0x64 },
+ { 0x00, 0x4e, 0x81, 0xcf, 0x02, 0x4c, 0x83, 0xcd,
+ 0x04, 0x4a, 0x85, 0xcb, 0x06, 0x48, 0x87, 0xc9 },
+ { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+ 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x6d, 0xda, 0xb7, 0xb4, 0xd9, 0x6e, 0x03,
+ 0x68, 0x05, 0xb2, 0xdf, 0xdc, 0xb1, 0x06, 0x6b },
+ { 0x00, 0x4e, 0x81, 0xcf, 0x02, 0x4c, 0x83, 0xcd,
+ 0x19, 0x57, 0x98, 0xd6, 0x1b, 0x55, 0x9a, 0xd4 },
+ { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x6e, 0xdc, 0xb2, 0xb8, 0xd6, 0x64, 0x0a,
+ 0x70, 0x1e, 0xac, 0xc2, 0xc8, 0xa6, 0x14, 0x7a },
+ { 0x00, 0x4e, 0x81, 0xcf, 0x02, 0x4c, 0x83, 0xcd,
+ 0x19, 0x57, 0x98, 0xd6, 0x1b, 0x55, 0x9a, 0xd4 },
+ { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+ 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x6f, 0xde, 0xb1, 0xbc, 0xd3, 0x62, 0x0d,
+ 0x78, 0x17, 0xa6, 0xc9, 0xc4, 0xab, 0x1a, 0x75 },
+ { 0x00, 0x53, 0xa6, 0xf5, 0x51, 0x02, 0xf7, 0xa4,
+ 0xa2, 0xf1, 0x04, 0x57, 0xf3, 0xa0, 0x55, 0x06 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+ 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+ { 0x00, 0x53, 0xa6, 0xf5, 0x51, 0x02, 0xf7, 0xa4,
+ 0xa2, 0xf1, 0x04, 0x57, 0xf3, 0xa0, 0x55, 0x06 },
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x71, 0xe2, 0x93, 0xc4, 0xb5, 0x26, 0x57,
+ 0x88, 0xf9, 0x6a, 0x1b, 0x4c, 0x3d, 0xae, 0xdf },
+ { 0x00, 0x53, 0xa6, 0xf5, 0x51, 0x02, 0xf7, 0xa4,
+ 0xbf, 0xec, 0x19, 0x4a, 0xee, 0xbd, 0x48, 0x1b },
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x72, 0xe4, 0x96, 0xc8, 0xba, 0x2c, 0x5e,
+ 0x90, 0xe2, 0x74, 0x06, 0x58, 0x2a, 0xbc, 0xce },
+ { 0x00, 0x53, 0xa6, 0xf5, 0x51, 0x02, 0xf7, 0xa4,
+ 0xbf, 0xec, 0x19, 0x4a, 0xee, 0xbd, 0x48, 0x1b },
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+ 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x73, 0xe6, 0x95, 0xcc, 0xbf, 0x2a, 0x59,
+ 0x98, 0xeb, 0x7e, 0x0d, 0x54, 0x27, 0xb2, 0xc1 },
+ { 0x00, 0x53, 0xa6, 0xf5, 0x4c, 0x1f, 0xea, 0xb9,
+ 0x98, 0xcb, 0x3e, 0x6d, 0xd4, 0x87, 0x72, 0x21 },
+ { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x74, 0xe8, 0x9c, 0xd0, 0xa4, 0x38, 0x4c,
+ 0xa0, 0xd4, 0x48, 0x3c, 0x70, 0x04, 0x98, 0xec },
+ { 0x00, 0x53, 0xa6, 0xf5, 0x4c, 0x1f, 0xea, 0xb9,
+ 0x98, 0xcb, 0x3e, 0x6d, 0xd4, 0x87, 0x72, 0x21 },
+ { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+ 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x75, 0xea, 0x9f, 0xd4, 0xa1, 0x3e, 0x4b,
+ 0xa8, 0xdd, 0x42, 0x37, 0x7c, 0x09, 0x96, 0xe3 },
+ { 0x00, 0x53, 0xa6, 0xf5, 0x4c, 0x1f, 0xea, 0xb9,
+ 0x85, 0xd6, 0x23, 0x70, 0xc9, 0x9a, 0x6f, 0x3c },
+ { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x76, 0xec, 0x9a, 0xd8, 0xae, 0x34, 0x42,
+ 0xb0, 0xc6, 0x5c, 0x2a, 0x68, 0x1e, 0x84, 0xf2 },
+ { 0x00, 0x53, 0xa6, 0xf5, 0x4c, 0x1f, 0xea, 0xb9,
+ 0x85, 0xd6, 0x23, 0x70, 0xc9, 0x9a, 0x6f, 0x3c },
+ { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+ 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x77, 0xee, 0x99, 0xdc, 0xab, 0x32, 0x45,
+ 0xb8, 0xcf, 0x56, 0x21, 0x64, 0x13, 0x8a, 0xfd },
+ { 0x00, 0x53, 0xbb, 0xe8, 0x6b, 0x38, 0xd0, 0x83,
+ 0xd6, 0x85, 0x6d, 0x3e, 0xbd, 0xee, 0x06, 0x55 },
+ { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x78, 0xf0, 0x88, 0xe0, 0x98, 0x10, 0x68,
+ 0xc0, 0xb8, 0x30, 0x48, 0x20, 0x58, 0xd0, 0xa8 },
+ { 0x00, 0x53, 0xbb, 0xe8, 0x6b, 0x38, 0xd0, 0x83,
+ 0xd6, 0x85, 0x6d, 0x3e, 0xbd, 0xee, 0x06, 0x55 },
+ { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+ 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x79, 0xf2, 0x8b, 0xe4, 0x9d, 0x16, 0x6f,
+ 0xc8, 0xb1, 0x3a, 0x43, 0x2c, 0x55, 0xde, 0xa7 },
+ { 0x00, 0x53, 0xbb, 0xe8, 0x6b, 0x38, 0xd0, 0x83,
+ 0xcb, 0x98, 0x70, 0x23, 0xa0, 0xf3, 0x1b, 0x48 },
+ { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x7a, 0xf4, 0x8e, 0xe8, 0x92, 0x1c, 0x66,
+ 0xd0, 0xaa, 0x24, 0x5e, 0x38, 0x42, 0xcc, 0xb6 },
+ { 0x00, 0x53, 0xbb, 0xe8, 0x6b, 0x38, 0xd0, 0x83,
+ 0xcb, 0x98, 0x70, 0x23, 0xa0, 0xf3, 0x1b, 0x48 },
+ { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+ 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x7b, 0xf6, 0x8d, 0xec, 0x97, 0x1a, 0x61,
+ 0xd8, 0xa3, 0x2e, 0x55, 0x34, 0x4f, 0xc2, 0xb9 },
+ { 0x00, 0x53, 0xbb, 0xe8, 0x76, 0x25, 0xcd, 0x9e,
+ 0xec, 0xbf, 0x57, 0x04, 0x9a, 0xc9, 0x21, 0x72 },
+ { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x7c, 0xf8, 0x84, 0xf0, 0x8c, 0x08, 0x74,
+ 0xe0, 0x9c, 0x18, 0x64, 0x10, 0x6c, 0xe8, 0x94 },
+ { 0x00, 0x53, 0xbb, 0xe8, 0x76, 0x25, 0xcd, 0x9e,
+ 0xec, 0xbf, 0x57, 0x04, 0x9a, 0xc9, 0x21, 0x72 },
+ { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+ 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x7d, 0xfa, 0x87, 0xf4, 0x89, 0x0e, 0x73,
+ 0xe8, 0x95, 0x12, 0x6f, 0x1c, 0x61, 0xe6, 0x9b },
+ { 0x00, 0x53, 0xbb, 0xe8, 0x76, 0x25, 0xcd, 0x9e,
+ 0xf1, 0xa2, 0x4a, 0x19, 0x87, 0xd4, 0x3c, 0x6f },
+ { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x7e, 0xfc, 0x82, 0xf8, 0x86, 0x04, 0x7a,
+ 0xf0, 0x8e, 0x0c, 0x72, 0x08, 0x76, 0xf4, 0x8a },
+ { 0x00, 0x53, 0xbb, 0xe8, 0x76, 0x25, 0xcd, 0x9e,
+ 0xf1, 0xa2, 0x4a, 0x19, 0x87, 0xd4, 0x3c, 0x6f },
+ { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+ 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x7f, 0xfe, 0x81, 0xfc, 0x83, 0x02, 0x7d,
+ 0xf8, 0x87, 0x06, 0x79, 0x04, 0x7b, 0xfa, 0x85 },
+ { 0x00, 0xe8, 0xcd, 0x25, 0x87, 0x6f, 0x4a, 0xa2,
+ 0x13, 0xfb, 0xde, 0x36, 0x94, 0x7c, 0x59, 0xb1 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+ { 0x00, 0xe8, 0xcd, 0x25, 0x87, 0x6f, 0x4a, 0xa2,
+ 0x13, 0xfb, 0xde, 0x36, 0x94, 0x7c, 0x59, 0xb1 },
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x81, 0x02, 0x83, 0x04, 0x85, 0x06, 0x87,
+ 0x08, 0x89, 0x0a, 0x8b, 0x0c, 0x8d, 0x0e, 0x8f },
+ { 0x00, 0xe8, 0xcd, 0x25, 0x87, 0x6f, 0x4a, 0xa2,
+ 0x0e, 0xe6, 0xc3, 0x2b, 0x89, 0x61, 0x44, 0xac },
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x82, 0x04, 0x86, 0x08, 0x8a, 0x0c, 0x8e,
+ 0x10, 0x92, 0x14, 0x96, 0x18, 0x9a, 0x1c, 0x9e },
+ { 0x00, 0xe8, 0xcd, 0x25, 0x87, 0x6f, 0x4a, 0xa2,
+ 0x0e, 0xe6, 0xc3, 0x2b, 0x89, 0x61, 0x44, 0xac },
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+ 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x83, 0x06, 0x85, 0x0c, 0x8f, 0x0a, 0x89,
+ 0x18, 0x9b, 0x1e, 0x9d, 0x14, 0x97, 0x12, 0x91 },
+ { 0x00, 0xe8, 0xcd, 0x25, 0x9a, 0x72, 0x57, 0xbf,
+ 0x29, 0xc1, 0xe4, 0x0c, 0xb3, 0x5b, 0x7e, 0x96 },
+ { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x84, 0x08, 0x8c, 0x10, 0x94, 0x18, 0x9c,
+ 0x20, 0xa4, 0x28, 0xac, 0x30, 0xb4, 0x38, 0xbc },
+ { 0x00, 0xe8, 0xcd, 0x25, 0x9a, 0x72, 0x57, 0xbf,
+ 0x29, 0xc1, 0xe4, 0x0c, 0xb3, 0x5b, 0x7e, 0x96 },
+ { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+ 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x85, 0x0a, 0x8f, 0x14, 0x91, 0x1e, 0x9b,
+ 0x28, 0xad, 0x22, 0xa7, 0x3c, 0xb9, 0x36, 0xb3 },
+ { 0x00, 0xe8, 0xcd, 0x25, 0x9a, 0x72, 0x57, 0xbf,
+ 0x34, 0xdc, 0xf9, 0x11, 0xae, 0x46, 0x63, 0x8b },
+ { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x86, 0x0c, 0x8a, 0x18, 0x9e, 0x14, 0x92,
+ 0x30, 0xb6, 0x3c, 0xba, 0x28, 0xae, 0x24, 0xa2 },
+ { 0x00, 0xe8, 0xcd, 0x25, 0x9a, 0x72, 0x57, 0xbf,
+ 0x34, 0xdc, 0xf9, 0x11, 0xae, 0x46, 0x63, 0x8b },
+ { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+ 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x87, 0x0e, 0x89, 0x1c, 0x9b, 0x12, 0x95,
+ 0x38, 0xbf, 0x36, 0xb1, 0x24, 0xa3, 0x2a, 0xad },
+ { 0x00, 0xe8, 0xd0, 0x38, 0xbd, 0x55, 0x6d, 0x85,
+ 0x67, 0x8f, 0xb7, 0x5f, 0xda, 0x32, 0x0a, 0xe2 },
+ { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x88, 0x10, 0x98, 0x20, 0xa8, 0x30, 0xb8,
+ 0x40, 0xc8, 0x50, 0xd8, 0x60, 0xe8, 0x70, 0xf8 },
+ { 0x00, 0xe8, 0xd0, 0x38, 0xbd, 0x55, 0x6d, 0x85,
+ 0x67, 0x8f, 0xb7, 0x5f, 0xda, 0x32, 0x0a, 0xe2 },
+ { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+ 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x89, 0x12, 0x9b, 0x24, 0xad, 0x36, 0xbf,
+ 0x48, 0xc1, 0x5a, 0xd3, 0x6c, 0xe5, 0x7e, 0xf7 },
+ { 0x00, 0xe8, 0xd0, 0x38, 0xbd, 0x55, 0x6d, 0x85,
+ 0x7a, 0x92, 0xaa, 0x42, 0xc7, 0x2f, 0x17, 0xff },
+ { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x8a, 0x14, 0x9e, 0x28, 0xa2, 0x3c, 0xb6,
+ 0x50, 0xda, 0x44, 0xce, 0x78, 0xf2, 0x6c, 0xe6 },
+ { 0x00, 0xe8, 0xd0, 0x38, 0xbd, 0x55, 0x6d, 0x85,
+ 0x7a, 0x92, 0xaa, 0x42, 0xc7, 0x2f, 0x17, 0xff },
+ { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+ 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x8b, 0x16, 0x9d, 0x2c, 0xa7, 0x3a, 0xb1,
+ 0x58, 0xd3, 0x4e, 0xc5, 0x74, 0xff, 0x62, 0xe9 },
+ { 0x00, 0xe8, 0xd0, 0x38, 0xa0, 0x48, 0x70, 0x98,
+ 0x5d, 0xb5, 0x8d, 0x65, 0xfd, 0x15, 0x2d, 0xc5 },
+ { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x8c, 0x18, 0x94, 0x30, 0xbc, 0x28, 0xa4,
+ 0x60, 0xec, 0x78, 0xf4, 0x50, 0xdc, 0x48, 0xc4 },
+ { 0x00, 0xe8, 0xd0, 0x38, 0xa0, 0x48, 0x70, 0x98,
+ 0x5d, 0xb5, 0x8d, 0x65, 0xfd, 0x15, 0x2d, 0xc5 },
+ { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+ 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x8d, 0x1a, 0x97, 0x34, 0xb9, 0x2e, 0xa3,
+ 0x68, 0xe5, 0x72, 0xff, 0x5c, 0xd1, 0x46, 0xcb },
+ { 0x00, 0xe8, 0xd0, 0x38, 0xa0, 0x48, 0x70, 0x98,
+ 0x40, 0xa8, 0x90, 0x78, 0xe0, 0x08, 0x30, 0xd8 },
+ { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x8e, 0x1c, 0x92, 0x38, 0xb6, 0x24, 0xaa,
+ 0x70, 0xfe, 0x6c, 0xe2, 0x48, 0xc6, 0x54, 0xda },
+ { 0x00, 0xe8, 0xd0, 0x38, 0xa0, 0x48, 0x70, 0x98,
+ 0x40, 0xa8, 0x90, 0x78, 0xe0, 0x08, 0x30, 0xd8 },
+ { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+ 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x8f, 0x1e, 0x91, 0x3c, 0xb3, 0x22, 0xad,
+ 0x78, 0xf7, 0x66, 0xe9, 0x44, 0xcb, 0x5a, 0xd5 },
+ { 0x00, 0xf5, 0xf7, 0x02, 0xf3, 0x06, 0x04, 0xf1,
+ 0xfb, 0x0e, 0x0c, 0xf9, 0x08, 0xfd, 0xff, 0x0a },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+ 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+ { 0x00, 0xf5, 0xf7, 0x02, 0xf3, 0x06, 0x04, 0xf1,
+ 0xfb, 0x0e, 0x0c, 0xf9, 0x08, 0xfd, 0xff, 0x0a },
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x91, 0x22, 0xb3, 0x44, 0xd5, 0x66, 0xf7,
+ 0x88, 0x19, 0xaa, 0x3b, 0xcc, 0x5d, 0xee, 0x7f },
+ { 0x00, 0xf5, 0xf7, 0x02, 0xf3, 0x06, 0x04, 0xf1,
+ 0xe6, 0x13, 0x11, 0xe4, 0x15, 0xe0, 0xe2, 0x17 },
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x92, 0x24, 0xb6, 0x48, 0xda, 0x6c, 0xfe,
+ 0x90, 0x02, 0xb4, 0x26, 0xd8, 0x4a, 0xfc, 0x6e },
+ { 0x00, 0xf5, 0xf7, 0x02, 0xf3, 0x06, 0x04, 0xf1,
+ 0xe6, 0x13, 0x11, 0xe4, 0x15, 0xe0, 0xe2, 0x17 },
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+ 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x93, 0x26, 0xb5, 0x4c, 0xdf, 0x6a, 0xf9,
+ 0x98, 0x0b, 0xbe, 0x2d, 0xd4, 0x47, 0xf2, 0x61 },
+ { 0x00, 0xf5, 0xf7, 0x02, 0xee, 0x1b, 0x19, 0xec,
+ 0xc1, 0x34, 0x36, 0xc3, 0x2f, 0xda, 0xd8, 0x2d },
+ { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x94, 0x28, 0xbc, 0x50, 0xc4, 0x78, 0xec,
+ 0xa0, 0x34, 0x88, 0x1c, 0xf0, 0x64, 0xd8, 0x4c },
+ { 0x00, 0xf5, 0xf7, 0x02, 0xee, 0x1b, 0x19, 0xec,
+ 0xc1, 0x34, 0x36, 0xc3, 0x2f, 0xda, 0xd8, 0x2d },
+ { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+ 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x95, 0x2a, 0xbf, 0x54, 0xc1, 0x7e, 0xeb,
+ 0xa8, 0x3d, 0x82, 0x17, 0xfc, 0x69, 0xd6, 0x43 },
+ { 0x00, 0xf5, 0xf7, 0x02, 0xee, 0x1b, 0x19, 0xec,
+ 0xdc, 0x29, 0x2b, 0xde, 0x32, 0xc7, 0xc5, 0x30 },
+ { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x96, 0x2c, 0xba, 0x58, 0xce, 0x74, 0xe2,
+ 0xb0, 0x26, 0x9c, 0x0a, 0xe8, 0x7e, 0xc4, 0x52 },
+ { 0x00, 0xf5, 0xf7, 0x02, 0xee, 0x1b, 0x19, 0xec,
+ 0xdc, 0x29, 0x2b, 0xde, 0x32, 0xc7, 0xc5, 0x30 },
+ { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+ 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x97, 0x2e, 0xb9, 0x5c, 0xcb, 0x72, 0xe5,
+ 0xb8, 0x2f, 0x96, 0x01, 0xe4, 0x73, 0xca, 0x5d },
+ { 0x00, 0xf5, 0xea, 0x1f, 0xc9, 0x3c, 0x23, 0xd6,
+ 0x8f, 0x7a, 0x65, 0x90, 0x46, 0xb3, 0xac, 0x59 },
+ { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x98, 0x30, 0xa8, 0x60, 0xf8, 0x50, 0xc8,
+ 0xc0, 0x58, 0xf0, 0x68, 0xa0, 0x38, 0x90, 0x08 },
+ { 0x00, 0xf5, 0xea, 0x1f, 0xc9, 0x3c, 0x23, 0xd6,
+ 0x8f, 0x7a, 0x65, 0x90, 0x46, 0xb3, 0xac, 0x59 },
+ { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+ 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x99, 0x32, 0xab, 0x64, 0xfd, 0x56, 0xcf,
+ 0xc8, 0x51, 0xfa, 0x63, 0xac, 0x35, 0x9e, 0x07 },
+ { 0x00, 0xf5, 0xea, 0x1f, 0xc9, 0x3c, 0x23, 0xd6,
+ 0x92, 0x67, 0x78, 0x8d, 0x5b, 0xae, 0xb1, 0x44 },
+ { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x9a, 0x34, 0xae, 0x68, 0xf2, 0x5c, 0xc6,
+ 0xd0, 0x4a, 0xe4, 0x7e, 0xb8, 0x22, 0x8c, 0x16 },
+ { 0x00, 0xf5, 0xea, 0x1f, 0xc9, 0x3c, 0x23, 0xd6,
+ 0x92, 0x67, 0x78, 0x8d, 0x5b, 0xae, 0xb1, 0x44 },
+ { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+ 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x9b, 0x36, 0xad, 0x6c, 0xf7, 0x5a, 0xc1,
+ 0xd8, 0x43, 0xee, 0x75, 0xb4, 0x2f, 0x82, 0x19 },
+ { 0x00, 0xf5, 0xea, 0x1f, 0xd4, 0x21, 0x3e, 0xcb,
+ 0xb5, 0x40, 0x5f, 0xaa, 0x61, 0x94, 0x8b, 0x7e },
+ { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x9c, 0x38, 0xa4, 0x70, 0xec, 0x48, 0xd4,
+ 0xe0, 0x7c, 0xd8, 0x44, 0x90, 0x0c, 0xa8, 0x34 },
+ { 0x00, 0xf5, 0xea, 0x1f, 0xd4, 0x21, 0x3e, 0xcb,
+ 0xb5, 0x40, 0x5f, 0xaa, 0x61, 0x94, 0x8b, 0x7e },
+ { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+ 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x9d, 0x3a, 0xa7, 0x74, 0xe9, 0x4e, 0xd3,
+ 0xe8, 0x75, 0xd2, 0x4f, 0x9c, 0x01, 0xa6, 0x3b },
+ { 0x00, 0xf5, 0xea, 0x1f, 0xd4, 0x21, 0x3e, 0xcb,
+ 0xa8, 0x5d, 0x42, 0xb7, 0x7c, 0x89, 0x96, 0x63 },
+ { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x9e, 0x3c, 0xa2, 0x78, 0xe6, 0x44, 0xda,
+ 0xf0, 0x6e, 0xcc, 0x52, 0x88, 0x16, 0xb4, 0x2a },
+ { 0x00, 0xf5, 0xea, 0x1f, 0xd4, 0x21, 0x3e, 0xcb,
+ 0xa8, 0x5d, 0x42, 0xb7, 0x7c, 0x89, 0x96, 0x63 },
+ { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+ 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x9f, 0x3e, 0xa1, 0x7c, 0xe3, 0x42, 0xdd,
+ 0xf8, 0x67, 0xc6, 0x59, 0x84, 0x1b, 0xba, 0x25 },
+ { 0x00, 0xd2, 0xb9, 0x6b, 0x6f, 0xbd, 0xd6, 0x04,
+ 0xde, 0x0c, 0x67, 0xb5, 0xb1, 0x63, 0x08, 0xda },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+ { 0x00, 0xd2, 0xb9, 0x6b, 0x6f, 0xbd, 0xd6, 0x04,
+ 0xde, 0x0c, 0x67, 0xb5, 0xb1, 0x63, 0x08, 0xda },
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xa1, 0x42, 0xe3, 0x84, 0x25, 0xc6, 0x67,
+ 0x08, 0xa9, 0x4a, 0xeb, 0x8c, 0x2d, 0xce, 0x6f },
+ { 0x00, 0xd2, 0xb9, 0x6b, 0x6f, 0xbd, 0xd6, 0x04,
+ 0xc3, 0x11, 0x7a, 0xa8, 0xac, 0x7e, 0x15, 0xc7 },
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xa2, 0x44, 0xe6, 0x88, 0x2a, 0xcc, 0x6e,
+ 0x10, 0xb2, 0x54, 0xf6, 0x98, 0x3a, 0xdc, 0x7e },
+ { 0x00, 0xd2, 0xb9, 0x6b, 0x6f, 0xbd, 0xd6, 0x04,
+ 0xc3, 0x11, 0x7a, 0xa8, 0xac, 0x7e, 0x15, 0xc7 },
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+ 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xa3, 0x46, 0xe5, 0x8c, 0x2f, 0xca, 0x69,
+ 0x18, 0xbb, 0x5e, 0xfd, 0x94, 0x37, 0xd2, 0x71 },
+ { 0x00, 0xd2, 0xb9, 0x6b, 0x72, 0xa0, 0xcb, 0x19,
+ 0xe4, 0x36, 0x5d, 0x8f, 0x96, 0x44, 0x2f, 0xfd },
+ { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xa4, 0x48, 0xec, 0x90, 0x34, 0xd8, 0x7c,
+ 0x20, 0x84, 0x68, 0xcc, 0xb0, 0x14, 0xf8, 0x5c },
+ { 0x00, 0xd2, 0xb9, 0x6b, 0x72, 0xa0, 0xcb, 0x19,
+ 0xe4, 0x36, 0x5d, 0x8f, 0x96, 0x44, 0x2f, 0xfd },
+ { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+ 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xa5, 0x4a, 0xef, 0x94, 0x31, 0xde, 0x7b,
+ 0x28, 0x8d, 0x62, 0xc7, 0xbc, 0x19, 0xf6, 0x53 },
+ { 0x00, 0xd2, 0xb9, 0x6b, 0x72, 0xa0, 0xcb, 0x19,
+ 0xf9, 0x2b, 0x40, 0x92, 0x8b, 0x59, 0x32, 0xe0 },
+ { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xa6, 0x4c, 0xea, 0x98, 0x3e, 0xd4, 0x72,
+ 0x30, 0x96, 0x7c, 0xda, 0xa8, 0x0e, 0xe4, 0x42 },
+ { 0x00, 0xd2, 0xb9, 0x6b, 0x72, 0xa0, 0xcb, 0x19,
+ 0xf9, 0x2b, 0x40, 0x92, 0x8b, 0x59, 0x32, 0xe0 },
+ { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+ 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xa7, 0x4e, 0xe9, 0x9c, 0x3b, 0xd2, 0x75,
+ 0x38, 0x9f, 0x76, 0xd1, 0xa4, 0x03, 0xea, 0x4d },
+ { 0x00, 0xd2, 0xa4, 0x76, 0x55, 0x87, 0xf1, 0x23,
+ 0xaa, 0x78, 0x0e, 0xdc, 0xff, 0x2d, 0x5b, 0x89 },
+ { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xa8, 0x50, 0xf8, 0xa0, 0x08, 0xf0, 0x58,
+ 0x40, 0xe8, 0x10, 0xb8, 0xe0, 0x48, 0xb0, 0x18 },
+ { 0x00, 0xd2, 0xa4, 0x76, 0x55, 0x87, 0xf1, 0x23,
+ 0xaa, 0x78, 0x0e, 0xdc, 0xff, 0x2d, 0x5b, 0x89 },
+ { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+ 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xa9, 0x52, 0xfb, 0xa4, 0x0d, 0xf6, 0x5f,
+ 0x48, 0xe1, 0x1a, 0xb3, 0xec, 0x45, 0xbe, 0x17 },
+ { 0x00, 0xd2, 0xa4, 0x76, 0x55, 0x87, 0xf1, 0x23,
+ 0xb7, 0x65, 0x13, 0xc1, 0xe2, 0x30, 0x46, 0x94 },
+ { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xaa, 0x54, 0xfe, 0xa8, 0x02, 0xfc, 0x56,
+ 0x50, 0xfa, 0x04, 0xae, 0xf8, 0x52, 0xac, 0x06 },
+ { 0x00, 0xd2, 0xa4, 0x76, 0x55, 0x87, 0xf1, 0x23,
+ 0xb7, 0x65, 0x13, 0xc1, 0xe2, 0x30, 0x46, 0x94 },
+ { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+ 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xab, 0x56, 0xfd, 0xac, 0x07, 0xfa, 0x51,
+ 0x58, 0xf3, 0x0e, 0xa5, 0xf4, 0x5f, 0xa2, 0x09 },
+ { 0x00, 0xd2, 0xa4, 0x76, 0x48, 0x9a, 0xec, 0x3e,
+ 0x90, 0x42, 0x34, 0xe6, 0xd8, 0x0a, 0x7c, 0xae },
+ { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xac, 0x58, 0xf4, 0xb0, 0x1c, 0xe8, 0x44,
+ 0x60, 0xcc, 0x38, 0x94, 0xd0, 0x7c, 0x88, 0x24 },
+ { 0x00, 0xd2, 0xa4, 0x76, 0x48, 0x9a, 0xec, 0x3e,
+ 0x90, 0x42, 0x34, 0xe6, 0xd8, 0x0a, 0x7c, 0xae },
+ { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+ 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xad, 0x5a, 0xf7, 0xb4, 0x19, 0xee, 0x43,
+ 0x68, 0xc5, 0x32, 0x9f, 0xdc, 0x71, 0x86, 0x2b },
+ { 0x00, 0xd2, 0xa4, 0x76, 0x48, 0x9a, 0xec, 0x3e,
+ 0x8d, 0x5f, 0x29, 0xfb, 0xc5, 0x17, 0x61, 0xb3 },
+ { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xae, 0x5c, 0xf2, 0xb8, 0x16, 0xe4, 0x4a,
+ 0x70, 0xde, 0x2c, 0x82, 0xc8, 0x66, 0x94, 0x3a },
+ { 0x00, 0xd2, 0xa4, 0x76, 0x48, 0x9a, 0xec, 0x3e,
+ 0x8d, 0x5f, 0x29, 0xfb, 0xc5, 0x17, 0x61, 0xb3 },
+ { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+ 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xaf, 0x5e, 0xf1, 0xbc, 0x13, 0xe2, 0x4d,
+ 0x78, 0xd7, 0x26, 0x89, 0xc4, 0x6b, 0x9a, 0x35 },
+ { 0x00, 0xcf, 0x83, 0x4c, 0x1b, 0xd4, 0x98, 0x57,
+ 0x36, 0xf9, 0xb5, 0x7a, 0x2d, 0xe2, 0xae, 0x61 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+ 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+ { 0x00, 0xcf, 0x83, 0x4c, 0x1b, 0xd4, 0x98, 0x57,
+ 0x36, 0xf9, 0xb5, 0x7a, 0x2d, 0xe2, 0xae, 0x61 },
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xb1, 0x62, 0xd3, 0xc4, 0x75, 0xa6, 0x17,
+ 0x88, 0x39, 0xea, 0x5b, 0x4c, 0xfd, 0x2e, 0x9f },
+ { 0x00, 0xcf, 0x83, 0x4c, 0x1b, 0xd4, 0x98, 0x57,
+ 0x2b, 0xe4, 0xa8, 0x67, 0x30, 0xff, 0xb3, 0x7c },
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xb2, 0x64, 0xd6, 0xc8, 0x7a, 0xac, 0x1e,
+ 0x90, 0x22, 0xf4, 0x46, 0x58, 0xea, 0x3c, 0x8e },
+ { 0x00, 0xcf, 0x83, 0x4c, 0x1b, 0xd4, 0x98, 0x57,
+ 0x2b, 0xe4, 0xa8, 0x67, 0x30, 0xff, 0xb3, 0x7c },
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+ 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xb3, 0x66, 0xd5, 0xcc, 0x7f, 0xaa, 0x19,
+ 0x98, 0x2b, 0xfe, 0x4d, 0x54, 0xe7, 0x32, 0x81 },
+ { 0x00, 0xcf, 0x83, 0x4c, 0x06, 0xc9, 0x85, 0x4a,
+ 0x0c, 0xc3, 0x8f, 0x40, 0x0a, 0xc5, 0x89, 0x46 },
+ { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xb4, 0x68, 0xdc, 0xd0, 0x64, 0xb8, 0x0c,
+ 0xa0, 0x14, 0xc8, 0x7c, 0x70, 0xc4, 0x18, 0xac },
+ { 0x00, 0xcf, 0x83, 0x4c, 0x06, 0xc9, 0x85, 0x4a,
+ 0x0c, 0xc3, 0x8f, 0x40, 0x0a, 0xc5, 0x89, 0x46 },
+ { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+ 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xb5, 0x6a, 0xdf, 0xd4, 0x61, 0xbe, 0x0b,
+ 0xa8, 0x1d, 0xc2, 0x77, 0x7c, 0xc9, 0x16, 0xa3 },
+ { 0x00, 0xcf, 0x83, 0x4c, 0x06, 0xc9, 0x85, 0x4a,
+ 0x11, 0xde, 0x92, 0x5d, 0x17, 0xd8, 0x94, 0x5b },
+ { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xb6, 0x6c, 0xda, 0xd8, 0x6e, 0xb4, 0x02,
+ 0xb0, 0x06, 0xdc, 0x6a, 0x68, 0xde, 0x04, 0xb2 },
+ { 0x00, 0xcf, 0x83, 0x4c, 0x06, 0xc9, 0x85, 0x4a,
+ 0x11, 0xde, 0x92, 0x5d, 0x17, 0xd8, 0x94, 0x5b },
+ { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+ 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xb7, 0x6e, 0xd9, 0xdc, 0x6b, 0xb2, 0x05,
+ 0xb8, 0x0f, 0xd6, 0x61, 0x64, 0xd3, 0x0a, 0xbd },
+ { 0x00, 0xcf, 0x9e, 0x51, 0x21, 0xee, 0xbf, 0x70,
+ 0x42, 0x8d, 0xdc, 0x13, 0x63, 0xac, 0xfd, 0x32 },
+ { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xb8, 0x70, 0xc8, 0xe0, 0x58, 0x90, 0x28,
+ 0xc0, 0x78, 0xb0, 0x08, 0x20, 0x98, 0x50, 0xe8 },
+ { 0x00, 0xcf, 0x9e, 0x51, 0x21, 0xee, 0xbf, 0x70,
+ 0x42, 0x8d, 0xdc, 0x13, 0x63, 0xac, 0xfd, 0x32 },
+ { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+ 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xb9, 0x72, 0xcb, 0xe4, 0x5d, 0x96, 0x2f,
+ 0xc8, 0x71, 0xba, 0x03, 0x2c, 0x95, 0x5e, 0xe7 },
+ { 0x00, 0xcf, 0x9e, 0x51, 0x21, 0xee, 0xbf, 0x70,
+ 0x5f, 0x90, 0xc1, 0x0e, 0x7e, 0xb1, 0xe0, 0x2f },
+ { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xba, 0x74, 0xce, 0xe8, 0x52, 0x9c, 0x26,
+ 0xd0, 0x6a, 0xa4, 0x1e, 0x38, 0x82, 0x4c, 0xf6 },
+ { 0x00, 0xcf, 0x9e, 0x51, 0x21, 0xee, 0xbf, 0x70,
+ 0x5f, 0x90, 0xc1, 0x0e, 0x7e, 0xb1, 0xe0, 0x2f },
+ { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+ 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xbb, 0x76, 0xcd, 0xec, 0x57, 0x9a, 0x21,
+ 0xd8, 0x63, 0xae, 0x15, 0x34, 0x8f, 0x42, 0xf9 },
+ { 0x00, 0xcf, 0x9e, 0x51, 0x3c, 0xf3, 0xa2, 0x6d,
+ 0x78, 0xb7, 0xe6, 0x29, 0x44, 0x8b, 0xda, 0x15 },
+ { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xbc, 0x78, 0xc4, 0xf0, 0x4c, 0x88, 0x34,
+ 0xe0, 0x5c, 0x98, 0x24, 0x10, 0xac, 0x68, 0xd4 },
+ { 0x00, 0xcf, 0x9e, 0x51, 0x3c, 0xf3, 0xa2, 0x6d,
+ 0x78, 0xb7, 0xe6, 0x29, 0x44, 0x8b, 0xda, 0x15 },
+ { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+ 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xbd, 0x7a, 0xc7, 0xf4, 0x49, 0x8e, 0x33,
+ 0xe8, 0x55, 0x92, 0x2f, 0x1c, 0xa1, 0x66, 0xdb },
+ { 0x00, 0xcf, 0x9e, 0x51, 0x3c, 0xf3, 0xa2, 0x6d,
+ 0x65, 0xaa, 0xfb, 0x34, 0x59, 0x96, 0xc7, 0x08 },
+ { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xbe, 0x7c, 0xc2, 0xf8, 0x46, 0x84, 0x3a,
+ 0xf0, 0x4e, 0x8c, 0x32, 0x08, 0xb6, 0x74, 0xca },
+ { 0x00, 0xcf, 0x9e, 0x51, 0x3c, 0xf3, 0xa2, 0x6d,
+ 0x65, 0xaa, 0xfb, 0x34, 0x59, 0x96, 0xc7, 0x08 },
+ { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+ 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xbf, 0x7e, 0xc1, 0xfc, 0x43, 0x82, 0x3d,
+ 0xf8, 0x47, 0x86, 0x39, 0x04, 0xbb, 0x7a, 0xc5 },
+ { 0x00, 0x9c, 0x25, 0xb9, 0x4a, 0xd6, 0x6f, 0xf3,
+ 0x94, 0x08, 0xb1, 0x2d, 0xde, 0x42, 0xfb, 0x67 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+ { 0x00, 0x9c, 0x25, 0xb9, 0x4a, 0xd6, 0x6f, 0xf3,
+ 0x94, 0x08, 0xb1, 0x2d, 0xde, 0x42, 0xfb, 0x67 },
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xc1, 0x82, 0x43, 0x04, 0xc5, 0x86, 0x47,
+ 0x08, 0xc9, 0x8a, 0x4b, 0x0c, 0xcd, 0x8e, 0x4f },
+ { 0x00, 0x9c, 0x25, 0xb9, 0x4a, 0xd6, 0x6f, 0xf3,
+ 0x89, 0x15, 0xac, 0x30, 0xc3, 0x5f, 0xe6, 0x7a },
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xc2, 0x84, 0x46, 0x08, 0xca, 0x8c, 0x4e,
+ 0x10, 0xd2, 0x94, 0x56, 0x18, 0xda, 0x9c, 0x5e },
+ { 0x00, 0x9c, 0x25, 0xb9, 0x4a, 0xd6, 0x6f, 0xf3,
+ 0x89, 0x15, 0xac, 0x30, 0xc3, 0x5f, 0xe6, 0x7a },
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+ 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xc3, 0x86, 0x45, 0x0c, 0xcf, 0x8a, 0x49,
+ 0x18, 0xdb, 0x9e, 0x5d, 0x14, 0xd7, 0x92, 0x51 },
+ { 0x00, 0x9c, 0x25, 0xb9, 0x57, 0xcb, 0x72, 0xee,
+ 0xae, 0x32, 0x8b, 0x17, 0xf9, 0x65, 0xdc, 0x40 },
+ { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xc4, 0x88, 0x4c, 0x10, 0xd4, 0x98, 0x5c,
+ 0x20, 0xe4, 0xa8, 0x6c, 0x30, 0xf4, 0xb8, 0x7c },
+ { 0x00, 0x9c, 0x25, 0xb9, 0x57, 0xcb, 0x72, 0xee,
+ 0xae, 0x32, 0x8b, 0x17, 0xf9, 0x65, 0xdc, 0x40 },
+ { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+ 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xc5, 0x8a, 0x4f, 0x14, 0xd1, 0x9e, 0x5b,
+ 0x28, 0xed, 0xa2, 0x67, 0x3c, 0xf9, 0xb6, 0x73 },
+ { 0x00, 0x9c, 0x25, 0xb9, 0x57, 0xcb, 0x72, 0xee,
+ 0xb3, 0x2f, 0x96, 0x0a, 0xe4, 0x78, 0xc1, 0x5d },
+ { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xc6, 0x8c, 0x4a, 0x18, 0xde, 0x94, 0x52,
+ 0x30, 0xf6, 0xbc, 0x7a, 0x28, 0xee, 0xa4, 0x62 },
+ { 0x00, 0x9c, 0x25, 0xb9, 0x57, 0xcb, 0x72, 0xee,
+ 0xb3, 0x2f, 0x96, 0x0a, 0xe4, 0x78, 0xc1, 0x5d },
+ { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+ 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xc7, 0x8e, 0x49, 0x1c, 0xdb, 0x92, 0x55,
+ 0x38, 0xff, 0xb6, 0x71, 0x24, 0xe3, 0xaa, 0x6d },
+ { 0x00, 0x9c, 0x38, 0xa4, 0x70, 0xec, 0x48, 0xd4,
+ 0xe0, 0x7c, 0xd8, 0x44, 0x90, 0x0c, 0xa8, 0x34 },
+ { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xc8, 0x90, 0x58, 0x20, 0xe8, 0xb0, 0x78,
+ 0x40, 0x88, 0xd0, 0x18, 0x60, 0xa8, 0xf0, 0x38 },
+ { 0x00, 0x9c, 0x38, 0xa4, 0x70, 0xec, 0x48, 0xd4,
+ 0xe0, 0x7c, 0xd8, 0x44, 0x90, 0x0c, 0xa8, 0x34 },
+ { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+ 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xc9, 0x92, 0x5b, 0x24, 0xed, 0xb6, 0x7f,
+ 0x48, 0x81, 0xda, 0x13, 0x6c, 0xa5, 0xfe, 0x37 },
+ { 0x00, 0x9c, 0x38, 0xa4, 0x70, 0xec, 0x48, 0xd4,
+ 0xfd, 0x61, 0xc5, 0x59, 0x8d, 0x11, 0xb5, 0x29 },
+ { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xca, 0x94, 0x5e, 0x28, 0xe2, 0xbc, 0x76,
+ 0x50, 0x9a, 0xc4, 0x0e, 0x78, 0xb2, 0xec, 0x26 },
+ { 0x00, 0x9c, 0x38, 0xa4, 0x70, 0xec, 0x48, 0xd4,
+ 0xfd, 0x61, 0xc5, 0x59, 0x8d, 0x11, 0xb5, 0x29 },
+ { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+ 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xcb, 0x96, 0x5d, 0x2c, 0xe7, 0xba, 0x71,
+ 0x58, 0x93, 0xce, 0x05, 0x74, 0xbf, 0xe2, 0x29 },
+ { 0x00, 0x9c, 0x38, 0xa4, 0x6d, 0xf1, 0x55, 0xc9,
+ 0xda, 0x46, 0xe2, 0x7e, 0xb7, 0x2b, 0x8f, 0x13 },
+ { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xcc, 0x98, 0x54, 0x30, 0xfc, 0xa8, 0x64,
+ 0x60, 0xac, 0xf8, 0x34, 0x50, 0x9c, 0xc8, 0x04 },
+ { 0x00, 0x9c, 0x38, 0xa4, 0x6d, 0xf1, 0x55, 0xc9,
+ 0xda, 0x46, 0xe2, 0x7e, 0xb7, 0x2b, 0x8f, 0x13 },
+ { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+ 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xcd, 0x9a, 0x57, 0x34, 0xf9, 0xae, 0x63,
+ 0x68, 0xa5, 0xf2, 0x3f, 0x5c, 0x91, 0xc6, 0x0b },
+ { 0x00, 0x9c, 0x38, 0xa4, 0x6d, 0xf1, 0x55, 0xc9,
+ 0xc7, 0x5b, 0xff, 0x63, 0xaa, 0x36, 0x92, 0x0e },
+ { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xce, 0x9c, 0x52, 0x38, 0xf6, 0xa4, 0x6a,
+ 0x70, 0xbe, 0xec, 0x22, 0x48, 0x86, 0xd4, 0x1a },
+ { 0x00, 0x9c, 0x38, 0xa4, 0x6d, 0xf1, 0x55, 0xc9,
+ 0xc7, 0x5b, 0xff, 0x63, 0xaa, 0x36, 0x92, 0x0e },
+ { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+ 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xcf, 0x9e, 0x51, 0x3c, 0xf3, 0xa2, 0x6d,
+ 0x78, 0xb7, 0xe6, 0x29, 0x44, 0x8b, 0xda, 0x15 },
+ { 0x00, 0x81, 0x1f, 0x9e, 0x3e, 0xbf, 0x21, 0xa0,
+ 0x7c, 0xfd, 0x63, 0xe2, 0x42, 0xc3, 0x5d, 0xdc },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+ 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+ { 0x00, 0x81, 0x1f, 0x9e, 0x3e, 0xbf, 0x21, 0xa0,
+ 0x7c, 0xfd, 0x63, 0xe2, 0x42, 0xc3, 0x5d, 0xdc },
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xd1, 0xa2, 0x73, 0x44, 0x95, 0xe6, 0x37,
+ 0x88, 0x59, 0x2a, 0xfb, 0xcc, 0x1d, 0x6e, 0xbf },
+ { 0x00, 0x81, 0x1f, 0x9e, 0x3e, 0xbf, 0x21, 0xa0,
+ 0x61, 0xe0, 0x7e, 0xff, 0x5f, 0xde, 0x40, 0xc1 },
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xd2, 0xa4, 0x76, 0x48, 0x9a, 0xec, 0x3e,
+ 0x90, 0x42, 0x34, 0xe6, 0xd8, 0x0a, 0x7c, 0xae },
+ { 0x00, 0x81, 0x1f, 0x9e, 0x3e, 0xbf, 0x21, 0xa0,
+ 0x61, 0xe0, 0x7e, 0xff, 0x5f, 0xde, 0x40, 0xc1 },
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+ 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xd3, 0xa6, 0x75, 0x4c, 0x9f, 0xea, 0x39,
+ 0x98, 0x4b, 0x3e, 0xed, 0xd4, 0x07, 0x72, 0xa1 },
+ { 0x00, 0x81, 0x1f, 0x9e, 0x23, 0xa2, 0x3c, 0xbd,
+ 0x46, 0xc7, 0x59, 0xd8, 0x65, 0xe4, 0x7a, 0xfb },
+ { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xd4, 0xa8, 0x7c, 0x50, 0x84, 0xf8, 0x2c,
+ 0xa0, 0x74, 0x08, 0xdc, 0xf0, 0x24, 0x58, 0x8c },
+ { 0x00, 0x81, 0x1f, 0x9e, 0x23, 0xa2, 0x3c, 0xbd,
+ 0x46, 0xc7, 0x59, 0xd8, 0x65, 0xe4, 0x7a, 0xfb },
+ { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+ 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xd5, 0xaa, 0x7f, 0x54, 0x81, 0xfe, 0x2b,
+ 0xa8, 0x7d, 0x02, 0xd7, 0xfc, 0x29, 0x56, 0x83 },
+ { 0x00, 0x81, 0x1f, 0x9e, 0x23, 0xa2, 0x3c, 0xbd,
+ 0x5b, 0xda, 0x44, 0xc5, 0x78, 0xf9, 0x67, 0xe6 },
+ { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xd6, 0xac, 0x7a, 0x58, 0x8e, 0xf4, 0x22,
+ 0xb0, 0x66, 0x1c, 0xca, 0xe8, 0x3e, 0x44, 0x92 },
+ { 0x00, 0x81, 0x1f, 0x9e, 0x23, 0xa2, 0x3c, 0xbd,
+ 0x5b, 0xda, 0x44, 0xc5, 0x78, 0xf9, 0x67, 0xe6 },
+ { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+ 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xd7, 0xae, 0x79, 0x5c, 0x8b, 0xf2, 0x25,
+ 0xb8, 0x6f, 0x16, 0xc1, 0xe4, 0x33, 0x4a, 0x9d },
+ { 0x00, 0x81, 0x02, 0x83, 0x04, 0x85, 0x06, 0x87,
+ 0x08, 0x89, 0x0a, 0x8b, 0x0c, 0x8d, 0x0e, 0x8f },
+ { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xd8, 0xb0, 0x68, 0x60, 0xb8, 0xd0, 0x08,
+ 0xc0, 0x18, 0x70, 0xa8, 0xa0, 0x78, 0x10, 0xc8 },
+ { 0x00, 0x81, 0x02, 0x83, 0x04, 0x85, 0x06, 0x87,
+ 0x08, 0x89, 0x0a, 0x8b, 0x0c, 0x8d, 0x0e, 0x8f },
+ { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+ 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xd9, 0xb2, 0x6b, 0x64, 0xbd, 0xd6, 0x0f,
+ 0xc8, 0x11, 0x7a, 0xa3, 0xac, 0x75, 0x1e, 0xc7 },
+ { 0x00, 0x81, 0x02, 0x83, 0x04, 0x85, 0x06, 0x87,
+ 0x15, 0x94, 0x17, 0x96, 0x11, 0x90, 0x13, 0x92 },
+ { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xda, 0xb4, 0x6e, 0x68, 0xb2, 0xdc, 0x06,
+ 0xd0, 0x0a, 0x64, 0xbe, 0xb8, 0x62, 0x0c, 0xd6 },
+ { 0x00, 0x81, 0x02, 0x83, 0x04, 0x85, 0x06, 0x87,
+ 0x15, 0x94, 0x17, 0x96, 0x11, 0x90, 0x13, 0x92 },
+ { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+ 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xdb, 0xb6, 0x6d, 0x6c, 0xb7, 0xda, 0x01,
+ 0xd8, 0x03, 0x6e, 0xb5, 0xb4, 0x6f, 0x02, 0xd9 },
+ { 0x00, 0x81, 0x02, 0x83, 0x19, 0x98, 0x1b, 0x9a,
+ 0x32, 0xb3, 0x30, 0xb1, 0x2b, 0xaa, 0x29, 0xa8 },
+ { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xdc, 0xb8, 0x64, 0x70, 0xac, 0xc8, 0x14,
+ 0xe0, 0x3c, 0x58, 0x84, 0x90, 0x4c, 0x28, 0xf4 },
+ { 0x00, 0x81, 0x02, 0x83, 0x19, 0x98, 0x1b, 0x9a,
+ 0x32, 0xb3, 0x30, 0xb1, 0x2b, 0xaa, 0x29, 0xa8 },
+ { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+ 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xdd, 0xba, 0x67, 0x74, 0xa9, 0xce, 0x13,
+ 0xe8, 0x35, 0x52, 0x8f, 0x9c, 0x41, 0x26, 0xfb },
+ { 0x00, 0x81, 0x02, 0x83, 0x19, 0x98, 0x1b, 0x9a,
+ 0x2f, 0xae, 0x2d, 0xac, 0x36, 0xb7, 0x34, 0xb5 },
+ { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xde, 0xbc, 0x62, 0x78, 0xa6, 0xc4, 0x1a,
+ 0xf0, 0x2e, 0x4c, 0x92, 0x88, 0x56, 0x34, 0xea },
+ { 0x00, 0x81, 0x02, 0x83, 0x19, 0x98, 0x1b, 0x9a,
+ 0x2f, 0xae, 0x2d, 0xac, 0x36, 0xb7, 0x34, 0xb5 },
+ { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+ 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xdf, 0xbe, 0x61, 0x7c, 0xa3, 0xc2, 0x1d,
+ 0xf8, 0x27, 0x46, 0x99, 0x84, 0x5b, 0x3a, 0xe5 },
+ { 0x00, 0xa6, 0x51, 0xf7, 0xa2, 0x04, 0xf3, 0x55,
+ 0x59, 0xff, 0x08, 0xae, 0xfb, 0x5d, 0xaa, 0x0c },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+ { 0x00, 0xa6, 0x51, 0xf7, 0xa2, 0x04, 0xf3, 0x55,
+ 0x59, 0xff, 0x08, 0xae, 0xfb, 0x5d, 0xaa, 0x0c },
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xe1, 0xc2, 0x23, 0x84, 0x65, 0x46, 0xa7,
+ 0x08, 0xe9, 0xca, 0x2b, 0x8c, 0x6d, 0x4e, 0xaf },
+ { 0x00, 0xa6, 0x51, 0xf7, 0xa2, 0x04, 0xf3, 0x55,
+ 0x44, 0xe2, 0x15, 0xb3, 0xe6, 0x40, 0xb7, 0x11 },
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xe2, 0xc4, 0x26, 0x88, 0x6a, 0x4c, 0xae,
+ 0x10, 0xf2, 0xd4, 0x36, 0x98, 0x7a, 0x5c, 0xbe },
+ { 0x00, 0xa6, 0x51, 0xf7, 0xa2, 0x04, 0xf3, 0x55,
+ 0x44, 0xe2, 0x15, 0xb3, 0xe6, 0x40, 0xb7, 0x11 },
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+ 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xe3, 0xc6, 0x25, 0x8c, 0x6f, 0x4a, 0xa9,
+ 0x18, 0xfb, 0xde, 0x3d, 0x94, 0x77, 0x52, 0xb1 },
+ { 0x00, 0xa6, 0x51, 0xf7, 0xbf, 0x19, 0xee, 0x48,
+ 0x63, 0xc5, 0x32, 0x94, 0xdc, 0x7a, 0x8d, 0x2b },
+ { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xe4, 0xc8, 0x2c, 0x90, 0x74, 0x58, 0xbc,
+ 0x20, 0xc4, 0xe8, 0x0c, 0xb0, 0x54, 0x78, 0x9c },
+ { 0x00, 0xa6, 0x51, 0xf7, 0xbf, 0x19, 0xee, 0x48,
+ 0x63, 0xc5, 0x32, 0x94, 0xdc, 0x7a, 0x8d, 0x2b },
+ { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+ 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xe5, 0xca, 0x2f, 0x94, 0x71, 0x5e, 0xbb,
+ 0x28, 0xcd, 0xe2, 0x07, 0xbc, 0x59, 0x76, 0x93 },
+ { 0x00, 0xa6, 0x51, 0xf7, 0xbf, 0x19, 0xee, 0x48,
+ 0x7e, 0xd8, 0x2f, 0x89, 0xc1, 0x67, 0x90, 0x36 },
+ { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xe6, 0xcc, 0x2a, 0x98, 0x7e, 0x54, 0xb2,
+ 0x30, 0xd6, 0xfc, 0x1a, 0xa8, 0x4e, 0x64, 0x82 },
+ { 0x00, 0xa6, 0x51, 0xf7, 0xbf, 0x19, 0xee, 0x48,
+ 0x7e, 0xd8, 0x2f, 0x89, 0xc1, 0x67, 0x90, 0x36 },
+ { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+ 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xe7, 0xce, 0x29, 0x9c, 0x7b, 0x52, 0xb5,
+ 0x38, 0xdf, 0xf6, 0x11, 0xa4, 0x43, 0x6a, 0x8d },
+ { 0x00, 0xa6, 0x4c, 0xea, 0x98, 0x3e, 0xd4, 0x72,
+ 0x2d, 0x8b, 0x61, 0xc7, 0xb5, 0x13, 0xf9, 0x5f },
+ { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xe8, 0xd0, 0x38, 0xa0, 0x48, 0x70, 0x98,
+ 0x40, 0xa8, 0x90, 0x78, 0xe0, 0x08, 0x30, 0xd8 },
+ { 0x00, 0xa6, 0x4c, 0xea, 0x98, 0x3e, 0xd4, 0x72,
+ 0x2d, 0x8b, 0x61, 0xc7, 0xb5, 0x13, 0xf9, 0x5f },
+ { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+ 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xe9, 0xd2, 0x3b, 0xa4, 0x4d, 0x76, 0x9f,
+ 0x48, 0xa1, 0x9a, 0x73, 0xec, 0x05, 0x3e, 0xd7 },
+ { 0x00, 0xa6, 0x4c, 0xea, 0x98, 0x3e, 0xd4, 0x72,
+ 0x30, 0x96, 0x7c, 0xda, 0xa8, 0x0e, 0xe4, 0x42 },
+ { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xea, 0xd4, 0x3e, 0xa8, 0x42, 0x7c, 0x96,
+ 0x50, 0xba, 0x84, 0x6e, 0xf8, 0x12, 0x2c, 0xc6 },
+ { 0x00, 0xa6, 0x4c, 0xea, 0x98, 0x3e, 0xd4, 0x72,
+ 0x30, 0x96, 0x7c, 0xda, 0xa8, 0x0e, 0xe4, 0x42 },
+ { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+ 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xeb, 0xd6, 0x3d, 0xac, 0x47, 0x7a, 0x91,
+ 0x58, 0xb3, 0x8e, 0x65, 0xf4, 0x1f, 0x22, 0xc9 },
+ { 0x00, 0xa6, 0x4c, 0xea, 0x85, 0x23, 0xc9, 0x6f,
+ 0x17, 0xb1, 0x5b, 0xfd, 0x92, 0x34, 0xde, 0x78 },
+ { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xec, 0xd8, 0x34, 0xb0, 0x5c, 0x68, 0x84,
+ 0x60, 0x8c, 0xb8, 0x54, 0xd0, 0x3c, 0x08, 0xe4 },
+ { 0x00, 0xa6, 0x4c, 0xea, 0x85, 0x23, 0xc9, 0x6f,
+ 0x17, 0xb1, 0x5b, 0xfd, 0x92, 0x34, 0xde, 0x78 },
+ { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+ 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xed, 0xda, 0x37, 0xb4, 0x59, 0x6e, 0x83,
+ 0x68, 0x85, 0xb2, 0x5f, 0xdc, 0x31, 0x06, 0xeb },
+ { 0x00, 0xa6, 0x4c, 0xea, 0x85, 0x23, 0xc9, 0x6f,
+ 0x0a, 0xac, 0x46, 0xe0, 0x8f, 0x29, 0xc3, 0x65 },
+ { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xee, 0xdc, 0x32, 0xb8, 0x56, 0x64, 0x8a,
+ 0x70, 0x9e, 0xac, 0x42, 0xc8, 0x26, 0x14, 0xfa },
+ { 0x00, 0xa6, 0x4c, 0xea, 0x85, 0x23, 0xc9, 0x6f,
+ 0x0a, 0xac, 0x46, 0xe0, 0x8f, 0x29, 0xc3, 0x65 },
+ { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+ 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xef, 0xde, 0x31, 0xbc, 0x53, 0x62, 0x8d,
+ 0x78, 0x97, 0xa6, 0x49, 0xc4, 0x2b, 0x1a, 0xf5 },
+ { 0x00, 0xbb, 0x6b, 0xd0, 0xd6, 0x6d, 0xbd, 0x06,
+ 0xb1, 0x0a, 0xda, 0x61, 0x67, 0xdc, 0x0c, 0xb7 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+ 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+ { 0x00, 0xbb, 0x6b, 0xd0, 0xd6, 0x6d, 0xbd, 0x06,
+ 0xb1, 0x0a, 0xda, 0x61, 0x67, 0xdc, 0x0c, 0xb7 },
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xf1, 0xe2, 0x13, 0xc4, 0x35, 0x26, 0xd7,
+ 0x88, 0x79, 0x6a, 0x9b, 0x4c, 0xbd, 0xae, 0x5f },
+ { 0x00, 0xbb, 0x6b, 0xd0, 0xd6, 0x6d, 0xbd, 0x06,
+ 0xac, 0x17, 0xc7, 0x7c, 0x7a, 0xc1, 0x11, 0xaa },
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xf2, 0xe4, 0x16, 0xc8, 0x3a, 0x2c, 0xde,
+ 0x90, 0x62, 0x74, 0x86, 0x58, 0xaa, 0xbc, 0x4e },
+ { 0x00, 0xbb, 0x6b, 0xd0, 0xd6, 0x6d, 0xbd, 0x06,
+ 0xac, 0x17, 0xc7, 0x7c, 0x7a, 0xc1, 0x11, 0xaa },
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+ 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xf3, 0xe6, 0x15, 0xcc, 0x3f, 0x2a, 0xd9,
+ 0x98, 0x6b, 0x7e, 0x8d, 0x54, 0xa7, 0xb2, 0x41 },
+ { 0x00, 0xbb, 0x6b, 0xd0, 0xcb, 0x70, 0xa0, 0x1b,
+ 0x8b, 0x30, 0xe0, 0x5b, 0x40, 0xfb, 0x2b, 0x90 },
+ { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xf4, 0xe8, 0x1c, 0xd0, 0x24, 0x38, 0xcc,
+ 0xa0, 0x54, 0x48, 0xbc, 0x70, 0x84, 0x98, 0x6c },
+ { 0x00, 0xbb, 0x6b, 0xd0, 0xcb, 0x70, 0xa0, 0x1b,
+ 0x8b, 0x30, 0xe0, 0x5b, 0x40, 0xfb, 0x2b, 0x90 },
+ { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+ 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xf5, 0xea, 0x1f, 0xd4, 0x21, 0x3e, 0xcb,
+ 0xa8, 0x5d, 0x42, 0xb7, 0x7c, 0x89, 0x96, 0x63 },
+ { 0x00, 0xbb, 0x6b, 0xd0, 0xcb, 0x70, 0xa0, 0x1b,
+ 0x96, 0x2d, 0xfd, 0x46, 0x5d, 0xe6, 0x36, 0x8d },
+ { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xf6, 0xec, 0x1a, 0xd8, 0x2e, 0x34, 0xc2,
+ 0xb0, 0x46, 0x5c, 0xaa, 0x68, 0x9e, 0x84, 0x72 },
+ { 0x00, 0xbb, 0x6b, 0xd0, 0xcb, 0x70, 0xa0, 0x1b,
+ 0x96, 0x2d, 0xfd, 0x46, 0x5d, 0xe6, 0x36, 0x8d },
+ { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+ 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xf7, 0xee, 0x19, 0xdc, 0x2b, 0x32, 0xc5,
+ 0xb8, 0x4f, 0x56, 0xa1, 0x64, 0x93, 0x8a, 0x7d },
+ { 0x00, 0xbb, 0x76, 0xcd, 0xec, 0x57, 0x9a, 0x21,
+ 0xc5, 0x7e, 0xb3, 0x08, 0x29, 0x92, 0x5f, 0xe4 },
+ { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xf8, 0xf0, 0x08, 0xe0, 0x18, 0x10, 0xe8,
+ 0xc0, 0x38, 0x30, 0xc8, 0x20, 0xd8, 0xd0, 0x28 },
+ { 0x00, 0xbb, 0x76, 0xcd, 0xec, 0x57, 0x9a, 0x21,
+ 0xc5, 0x7e, 0xb3, 0x08, 0x29, 0x92, 0x5f, 0xe4 },
+ { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+ 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xf9, 0xf2, 0x0b, 0xe4, 0x1d, 0x16, 0xef,
+ 0xc8, 0x31, 0x3a, 0xc3, 0x2c, 0xd5, 0xde, 0x27 },
+ { 0x00, 0xbb, 0x76, 0xcd, 0xec, 0x57, 0x9a, 0x21,
+ 0xd8, 0x63, 0xae, 0x15, 0x34, 0x8f, 0x42, 0xf9 },
+ { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xfa, 0xf4, 0x0e, 0xe8, 0x12, 0x1c, 0xe6,
+ 0xd0, 0x2a, 0x24, 0xde, 0x38, 0xc2, 0xcc, 0x36 },
+ { 0x00, 0xbb, 0x76, 0xcd, 0xec, 0x57, 0x9a, 0x21,
+ 0xd8, 0x63, 0xae, 0x15, 0x34, 0x8f, 0x42, 0xf9 },
+ { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+ 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xfb, 0xf6, 0x0d, 0xec, 0x17, 0x1a, 0xe1,
+ 0xd8, 0x23, 0x2e, 0xd5, 0x34, 0xcf, 0xc2, 0x39 },
+ { 0x00, 0xbb, 0x76, 0xcd, 0xf1, 0x4a, 0x87, 0x3c,
+ 0xff, 0x44, 0x89, 0x32, 0x0e, 0xb5, 0x78, 0xc3 },
+ { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xfc, 0xf8, 0x04, 0xf0, 0x0c, 0x08, 0xf4,
+ 0xe0, 0x1c, 0x18, 0xe4, 0x10, 0xec, 0xe8, 0x14 },
+ { 0x00, 0xbb, 0x76, 0xcd, 0xf1, 0x4a, 0x87, 0x3c,
+ 0xff, 0x44, 0x89, 0x32, 0x0e, 0xb5, 0x78, 0xc3 },
+ { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+ 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xfd, 0xfa, 0x07, 0xf4, 0x09, 0x0e, 0xf3,
+ 0xe8, 0x15, 0x12, 0xef, 0x1c, 0xe1, 0xe6, 0x1b },
+ { 0x00, 0xbb, 0x76, 0xcd, 0xf1, 0x4a, 0x87, 0x3c,
+ 0xe2, 0x59, 0x94, 0x2f, 0x13, 0xa8, 0x65, 0xde },
+ { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xfe, 0xfc, 0x02, 0xf8, 0x06, 0x04, 0xfa,
+ 0xf0, 0x0e, 0x0c, 0xf2, 0x08, 0xf6, 0xf4, 0x0a },
+ { 0x00, 0xbb, 0x76, 0xcd, 0xf1, 0x4a, 0x87, 0x3c,
+ 0xe2, 0x59, 0x94, 0x2f, 0x13, 0xa8, 0x65, 0xde },
+ { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+ 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xff, 0xfe, 0x01, 0xfc, 0x03, 0x02, 0xfd,
+ 0xf8, 0x07, 0x06, 0xf9, 0x04, 0xfb, 0xfa, 0x05 }
+};
+/* END CSTYLED */
+#endif // ENDIANNESS
+#endif /* defined(__powerpc__) */
diff --git a/sys/contrib/openzfs/module/zfs/vdev_raidz_math_powerpc_altivec_common.h b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_powerpc_altivec_common.h
new file mode 100644
index 000000000000..3842f5fd637c
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_powerpc_altivec_common.h
@@ -0,0 +1,690 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (C) 2019 Romain Dolbeau. All rights reserved.
+ * <romain.dolbeau@european-processor-initiative.eu>
+ */
+
+#include <sys/types.h>
+#include <sys/simd.h>
+
+#ifdef __linux__
+#define __asm __asm__ __volatile__
+#endif
+
+#define _REG_CNT(_0, _1, _2, _3, _4, _5, _6, _7, N, ...) N
+#define REG_CNT(r...) _REG_CNT(r, 8, 7, 6, 5, 4, 3, 2, 1)
+
+#define VR0_(REG, ...) "%[w"#REG"]"
+#define VR1_(_1, REG, ...) "%[w"#REG"]"
+#define VR2_(_1, _2, REG, ...) "%[w"#REG"]"
+#define VR3_(_1, _2, _3, REG, ...) "%[w"#REG"]"
+#define VR4_(_1, _2, _3, _4, REG, ...) "%[w"#REG"]"
+#define VR5_(_1, _2, _3, _4, _5, REG, ...) "%[w"#REG"]"
+#define VR6_(_1, _2, _3, _4, _5, _6, REG, ...) "%[w"#REG"]"
+#define VR7_(_1, _2, _3, _4, _5, _6, _7, REG, ...) "%[w"#REG"]"
+
+/*
+ * Here we need registers that are not used otherwise. They will appear in
+ * unused asm for the cases with more registers than required, but GCC
+ * still has to make sure the constraints are correct, and duplicate
+ * constraints are illegal; the "register" number also doubles as the
+ * operand name (see the expansion sketch after the VR macros below).
+ */
+
+#define VR0(r...) VR0_(r)
+#define VR1(r...) VR1_(r)
+#define VR2(r...) VR2_(r, 36)
+#define VR3(r...) VR3_(r, 36, 35)
+#define VR4(r...) VR4_(r, 36, 35, 34, 33)
+#define VR5(r...) VR5_(r, 36, 35, 34, 33, 32)
+#define VR6(r...) VR6_(r, 36, 35, 34, 33, 32, 31)
+#define VR7(r...) VR7_(r, 36, 35, 34, 33, 32, 31, 30)
+
+#define VR(X) "%[w"#X"]"
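+
+/*
+ * As an illustration: with r == (0, 1, 2, 3), REG_CNT(r) evaluates to 4,
+ * VR0(r) expands to "%[w0]" and VR2(r) to "%[w2]", so the case-4 asm
+ * bodies below name operands [w0]..[w3], while VR4(r) falls back to the
+ * spare number 36 and only shows up in switch cases that are not taken
+ * for this register count.
+ */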
+
+#define RVR0_(REG, ...) [w##REG] "v" (w##REG)
+#define RVR1_(_1, REG, ...) [w##REG] "v" (w##REG)
+#define RVR2_(_1, _2, REG, ...) [w##REG] "v" (w##REG)
+#define RVR3_(_1, _2, _3, REG, ...) [w##REG] "v" (w##REG)
+#define RVR4_(_1, _2, _3, _4, REG, ...) [w##REG] "v" (w##REG)
+#define RVR5_(_1, _2, _3, _4, _5, REG, ...) [w##REG] "v" (w##REG)
+#define RVR6_(_1, _2, _3, _4, _5, _6, REG, ...) [w##REG] "v" (w##REG)
+#define RVR7_(_1, _2, _3, _4, _5, _6, _7, REG, ...) [w##REG] "v" (w##REG)
+
+#define RVR0(r...) RVR0_(r)
+#define RVR1(r...) RVR1_(r)
+#define RVR2(r...) RVR2_(r, 36)
+#define RVR3(r...) RVR3_(r, 36, 35)
+#define RVR4(r...) RVR4_(r, 36, 35, 34, 33)
+#define RVR5(r...) RVR5_(r, 36, 35, 34, 33, 32)
+#define RVR6(r...) RVR6_(r, 36, 35, 34, 33, 32, 31)
+#define RVR7(r...) RVR7_(r, 36, 35, 34, 33, 32, 31, 30)
+
+#define RVR(X) [w##X] "v" (w##X)
+
+#define WVR0_(REG, ...) [w##REG] "=v" (w##REG)
+#define WVR1_(_1, REG, ...) [w##REG] "=v" (w##REG)
+#define WVR2_(_1, _2, REG, ...) [w##REG] "=v" (w##REG)
+#define WVR3_(_1, _2, _3, REG, ...) [w##REG] "=v" (w##REG)
+#define WVR4_(_1, _2, _3, _4, REG, ...) [w##REG] "=v" (w##REG)
+#define WVR5_(_1, _2, _3, _4, _5, REG, ...) [w##REG] "=v" (w##REG)
+#define WVR6_(_1, _2, _3, _4, _5, _6, REG, ...) [w##REG] "=v" (w##REG)
+#define WVR7_(_1, _2, _3, _4, _5, _6, _7, REG, ...) [w##REG] "=v" (w##REG)
+
+#define WVR0(r...) WVR0_(r)
+#define WVR1(r...) WVR1_(r)
+#define WVR2(r...) WVR2_(r, 36)
+#define WVR3(r...) WVR3_(r, 36, 35)
+#define WVR4(r...) WVR4_(r, 36, 35, 34, 33)
+#define WVR5(r...) WVR5_(r, 36, 35, 34, 33, 32)
+#define WVR6(r...) WVR6_(r, 36, 35, 34, 33, 32, 31)
+#define WVR7(r...) WVR7_(r, 36, 35, 34, 33, 32, 31, 30)
+
+#define WVR(X) [w##X] "=v" (w##X)
+
+#define UVR0_(REG, ...) [w##REG] "+&v" (w##REG)
+#define UVR1_(_1, REG, ...) [w##REG] "+&v" (w##REG)
+#define UVR2_(_1, _2, REG, ...) [w##REG] "+&v" (w##REG)
+#define UVR3_(_1, _2, _3, REG, ...) [w##REG] "+&v" (w##REG)
+#define UVR4_(_1, _2, _3, _4, REG, ...) [w##REG] "+&v" (w##REG)
+#define UVR5_(_1, _2, _3, _4, _5, REG, ...) [w##REG] "+&v" (w##REG)
+#define UVR6_(_1, _2, _3, _4, _5, _6, REG, ...) [w##REG] "+&v" (w##REG)
+#define UVR7_(_1, _2, _3, _4, _5, _6, _7, REG, ...) [w##REG] "+&v" (w##REG)
+
+#define UVR0(r...) UVR0_(r)
+#define UVR1(r...) UVR1_(r)
+#define UVR2(r...) UVR2_(r, 36)
+#define UVR3(r...) UVR3_(r, 36, 35)
+#define UVR4(r...) UVR4_(r, 36, 35, 34, 33)
+#define UVR5(r...) UVR5_(r, 36, 35, 34, 33, 32)
+#define UVR6(r...) UVR6_(r, 36, 35, 34, 33, 32, 31)
+#define UVR7(r...) UVR7_(r, 36, 35, 34, 33, 32, 31, 30)
+
+#define UVR(X) [w##X] "+&v" (w##X)
+
+#define R_01(REG1, REG2, ...) REG1, REG2
+#define _R_23(_0, _1, REG2, REG3, ...) REG2, REG3
+#define R_23(REG...) _R_23(REG, 1, 2, 3)
+
+#define ZFS_ASM_BUG() ASSERT(0)
+
+#define OFFSET(ptr, val) (((unsigned char *)(ptr))+val)
+
+extern const uint8_t gf_clmul_mod_lt[4*256][16];
+
+#define ELEM_SIZE 16
+
+typedef struct v {
+ uint8_t b[ELEM_SIZE] __attribute__((aligned(ELEM_SIZE)));
+} v_t;
+
+#define XOR_ACC(src, r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 8: \
+ __asm( \
+ "lvx 21,0,%[SRC0]\n" \
+ "lvx 20,0,%[SRC1]\n" \
+ "lvx 19,0,%[SRC2]\n" \
+ "lvx 18,0,%[SRC3]\n" \
+ "vxor " VR0(r) "," VR0(r) ",21\n" \
+ "vxor " VR1(r) "," VR1(r) ",20\n" \
+ "vxor " VR2(r) "," VR2(r) ",19\n" \
+ "vxor " VR3(r) "," VR3(r) ",18\n" \
+ "lvx 21,0,%[SRC4]\n" \
+ "lvx 20,0,%[SRC5]\n" \
+ "lvx 19,0,%[SRC6]\n" \
+ "lvx 18,0,%[SRC7]\n" \
+ "vxor " VR4(r) "," VR4(r) ",21\n" \
+ "vxor " VR5(r) "," VR5(r) ",20\n" \
+ "vxor " VR6(r) "," VR6(r) ",19\n" \
+ "vxor " VR7(r) "," VR7(r) ",18\n" \
+ : UVR0(r), UVR1(r), UVR2(r), UVR3(r), \
+ UVR4(r), UVR5(r), UVR6(r), UVR7(r) \
+ : [SRC0] "r" ((OFFSET(src, 0))), \
+ [SRC1] "r" ((OFFSET(src, 16))), \
+ [SRC2] "r" ((OFFSET(src, 32))), \
+ [SRC3] "r" ((OFFSET(src, 48))), \
+ [SRC4] "r" ((OFFSET(src, 64))), \
+ [SRC5] "r" ((OFFSET(src, 80))), \
+ [SRC6] "r" ((OFFSET(src, 96))), \
+ [SRC7] "r" ((OFFSET(src, 112))) \
+ : "v18", "v19", "v20", "v21"); \
+ break; \
+ case 4: \
+ __asm( \
+ "lvx 21,0,%[SRC0]\n" \
+ "lvx 20,0,%[SRC1]\n" \
+ "lvx 19,0,%[SRC2]\n" \
+ "lvx 18,0,%[SRC3]\n" \
+ "vxor " VR0(r) "," VR0(r) ",21\n" \
+ "vxor " VR1(r) "," VR1(r) ",20\n" \
+ "vxor " VR2(r) "," VR2(r) ",19\n" \
+ "vxor " VR3(r) "," VR3(r) ",18\n" \
+ : UVR0(r), UVR1(r), UVR2(r), UVR3(r) \
+ : [SRC0] "r" ((OFFSET(src, 0))), \
+ [SRC1] "r" ((OFFSET(src, 16))), \
+ [SRC2] "r" ((OFFSET(src, 32))), \
+ [SRC3] "r" ((OFFSET(src, 48))) \
+ : "v18", "v19", "v20", "v21"); \
+ break; \
+ case 2: \
+ __asm( \
+ "lvx 21,0,%[SRC0]\n" \
+ "lvx 20,0,%[SRC1]\n" \
+ "vxor " VR0(r) "," VR0(r) ",21\n" \
+ "vxor " VR1(r) "," VR1(r) ",20\n" \
+ : UVR0(r), UVR1(r) \
+ : [SRC0] "r" ((OFFSET(src, 0))), \
+ [SRC1] "r" ((OFFSET(src, 16))) \
+ : "v20", "v21"); \
+ break; \
+ default: \
+ ZFS_ASM_BUG(); \
+ } \
+}
+
+#define XOR(r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 8: \
+ __asm( \
+ "vxor " VR4(r) "," VR4(r) "," VR0(r) "\n" \
+ "vxor " VR5(r) "," VR5(r) "," VR1(r) "\n" \
+ "vxor " VR6(r) "," VR6(r) "," VR2(r) "\n" \
+ "vxor " VR7(r) "," VR7(r) "," VR3(r) "\n" \
+ : UVR4(r), UVR5(r), UVR6(r), UVR7(r) \
+ : RVR0(r), RVR1(r), RVR2(r), RVR3(r)); \
+ break; \
+ case 4: \
+ __asm( \
+ "vxor " VR2(r) "," VR2(r) "," VR0(r) "\n" \
+ "vxor " VR3(r) "," VR3(r) "," VR1(r) "\n" \
+ : UVR2(r), UVR3(r) \
+ : RVR0(r), RVR1(r)); \
+ break; \
+ default: \
+ ZFS_ASM_BUG(); \
+ } \
+}
+
+#define ZERO(r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 8: \
+ __asm( \
+ "vxor " VR0(r) "," VR0(r) "," VR0(r) "\n" \
+ "vxor " VR1(r) "," VR1(r) "," VR1(r) "\n" \
+ "vxor " VR2(r) "," VR2(r) "," VR2(r) "\n" \
+ "vxor " VR3(r) "," VR3(r) "," VR3(r) "\n" \
+ "vxor " VR4(r) "," VR4(r) "," VR4(r) "\n" \
+ "vxor " VR5(r) "," VR5(r) "," VR5(r) "\n" \
+ "vxor " VR6(r) "," VR6(r) "," VR6(r) "\n" \
+ "vxor " VR7(r) "," VR7(r) "," VR7(r) "\n" \
+ : WVR0(r), WVR1(r), WVR2(r), WVR3(r), \
+ WVR4(r), WVR5(r), WVR6(r), WVR7(r)); \
+ break; \
+ case 4: \
+ __asm( \
+ "vxor " VR0(r) "," VR0(r) "," VR0(r) "\n" \
+ "vxor " VR1(r) "," VR1(r) "," VR1(r) "\n" \
+ "vxor " VR2(r) "," VR2(r) "," VR2(r) "\n" \
+ "vxor " VR3(r) "," VR3(r) "," VR3(r) "\n" \
+ : WVR0(r), WVR1(r), WVR2(r), WVR3(r)); \
+ break; \
+ case 2: \
+ __asm( \
+ "vxor " VR0(r) "," VR0(r) "," VR0(r) "\n" \
+ "vxor " VR1(r) "," VR1(r) "," VR1(r) "\n" \
+ : WVR0(r), WVR1(r)); \
+ break; \
+ default: \
+ ZFS_ASM_BUG(); \
+ } \
+}
+
+#define COPY(r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 8: \
+ __asm( \
+ "vor " VR4(r) "," VR0(r) "," VR0(r) "\n" \
+ "vor " VR5(r) "," VR1(r) "," VR1(r) "\n" \
+ "vor " VR6(r) "," VR2(r) "," VR2(r) "\n" \
+ "vor " VR7(r) "," VR3(r) "," VR3(r) "\n" \
+ : WVR4(r), WVR5(r), WVR6(r), WVR7(r) \
+ : RVR0(r), RVR1(r), RVR2(r), RVR3(r)); \
+ break; \
+ case 4: \
+ __asm( \
+ "vor " VR2(r) "," VR0(r) "," VR0(r) "\n" \
+ "vor " VR3(r) "," VR1(r) "," VR1(r) "\n" \
+ : WVR2(r), WVR3(r) \
+ : RVR0(r), RVR1(r)); \
+ break; \
+ default: \
+ ZFS_ASM_BUG(); \
+ } \
+}
+
+#define LOAD(src, r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 8: \
+ __asm( \
+ "lvx " VR0(r) " ,0,%[SRC0]\n" \
+ "lvx " VR1(r) " ,0,%[SRC1]\n" \
+ "lvx " VR2(r) " ,0,%[SRC2]\n" \
+ "lvx " VR3(r) " ,0,%[SRC3]\n" \
+ "lvx " VR4(r) " ,0,%[SRC4]\n" \
+ "lvx " VR5(r) " ,0,%[SRC5]\n" \
+ "lvx " VR6(r) " ,0,%[SRC6]\n" \
+ "lvx " VR7(r) " ,0,%[SRC7]\n" \
+ : WVR0(r), WVR1(r), WVR2(r), WVR3(r), \
+ WVR4(r), WVR5(r), WVR6(r), WVR7(r) \
+ : [SRC0] "r" ((OFFSET(src, 0))), \
+ [SRC1] "r" ((OFFSET(src, 16))), \
+ [SRC2] "r" ((OFFSET(src, 32))), \
+ [SRC3] "r" ((OFFSET(src, 48))), \
+ [SRC4] "r" ((OFFSET(src, 64))), \
+ [SRC5] "r" ((OFFSET(src, 80))), \
+ [SRC6] "r" ((OFFSET(src, 96))), \
+ [SRC7] "r" ((OFFSET(src, 112)))); \
+ break; \
+ case 4: \
+ __asm( \
+ "lvx " VR0(r) " ,0,%[SRC0]\n" \
+ "lvx " VR1(r) " ,0,%[SRC1]\n" \
+ "lvx " VR2(r) " ,0,%[SRC2]\n" \
+ "lvx " VR3(r) " ,0,%[SRC3]\n" \
+ : WVR0(r), WVR1(r), WVR2(r), WVR3(r) \
+ : [SRC0] "r" ((OFFSET(src, 0))), \
+ [SRC1] "r" ((OFFSET(src, 16))), \
+ [SRC2] "r" ((OFFSET(src, 32))), \
+ [SRC3] "r" ((OFFSET(src, 48)))); \
+ break; \
+ case 2: \
+ __asm( \
+ "lvx " VR0(r) " ,0,%[SRC0]\n" \
+ "lvx " VR1(r) " ,0,%[SRC1]\n" \
+ : WVR0(r), WVR1(r) \
+ : [SRC0] "r" ((OFFSET(src, 0))), \
+ [SRC1] "r" ((OFFSET(src, 16)))); \
+ break; \
+ default: \
+ ZFS_ASM_BUG(); \
+ } \
+}
+
+#define STORE(dst, r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 8: \
+ __asm( \
+ "stvx " VR0(r) " ,0,%[DST0]\n" \
+ "stvx " VR1(r) " ,0,%[DST1]\n" \
+ "stvx " VR2(r) " ,0,%[DST2]\n" \
+ "stvx " VR3(r) " ,0,%[DST3]\n" \
+ "stvx " VR4(r) " ,0,%[DST4]\n" \
+ "stvx " VR5(r) " ,0,%[DST5]\n" \
+ "stvx " VR6(r) " ,0,%[DST6]\n" \
+ "stvx " VR7(r) " ,0,%[DST7]\n" \
+ : : [DST0] "r" ((OFFSET(dst, 0))), \
+ [DST1] "r" ((OFFSET(dst, 16))), \
+ [DST2] "r" ((OFFSET(dst, 32))), \
+ [DST3] "r" ((OFFSET(dst, 48))), \
+ [DST4] "r" ((OFFSET(dst, 64))), \
+ [DST5] "r" ((OFFSET(dst, 80))), \
+ [DST6] "r" ((OFFSET(dst, 96))), \
+ [DST7] "r" ((OFFSET(dst, 112))), \
+ RVR0(r), RVR1(r), RVR2(r), RVR3(r), \
+ RVR4(r), RVR5(r), RVR6(r), RVR7(r) \
+ : "memory"); \
+ break; \
+ case 4: \
+ __asm( \
+ "stvx " VR0(r) " ,0,%[DST0]\n" \
+ "stvx " VR1(r) " ,0,%[DST1]\n" \
+ "stvx " VR2(r) " ,0,%[DST2]\n" \
+ "stvx " VR3(r) " ,0,%[DST3]\n" \
+ : : [DST0] "r" ((OFFSET(dst, 0))), \
+ [DST1] "r" ((OFFSET(dst, 16))), \
+ [DST2] "r" ((OFFSET(dst, 32))), \
+ [DST3] "r" ((OFFSET(dst, 48))), \
+ RVR0(r), RVR1(r), RVR2(r), RVR3(r) \
+ : "memory"); \
+ break; \
+ case 2: \
+ __asm( \
+ "stvx " VR0(r) " ,0,%[DST0]\n" \
+ "stvx " VR1(r) " ,0,%[DST1]\n" \
+ : : [DST0] "r" ((OFFSET(dst, 0))), \
+ [DST1] "r" ((OFFSET(dst, 16))), \
+ RVR0(r), RVR1(r) : "memory"); \
+ break; \
+ default: \
+ ZFS_ASM_BUG(); \
+ } \
+}
+
+/*
+ * Unfortunately we cannot use a macro for the register number here,
+ * because GCC would substitute the macro name rather than its value
+ * later on.
+ * The defines below are kept only as a reference for what each
+ * numbered register holds.
+ */
+#define _00 "17"
+#define _1d "16"
+#define _temp0 "19"
+#define _temp1 "18"
+
+#define MUL2_SETUP() \
+{ \
+ __asm( \
+ "vspltisb " VR(16) ",14\n" \
+ "vspltisb " VR(17) ",15\n" \
+ "vaddubm " VR(16) "," VR(17) "," VR(16) "\n" \
+ "vxor " VR(17) "," VR(17) "," VR(17) "\n" \
+ : WVR(16), WVR(17)); \
+}
+
+#define MUL2(r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 4: \
+ __asm( \
+ "vcmpgtsb 19," VR(17) "," VR0(r) "\n" \
+ "vcmpgtsb 18," VR(17) "," VR1(r) "\n" \
+ "vcmpgtsb 21," VR(17) "," VR2(r) "\n" \
+ "vcmpgtsb 20," VR(17) "," VR3(r) "\n" \
+ "vand 19,19," VR(16) "\n" \
+ "vand 18,18," VR(16) "\n" \
+ "vand 21,21," VR(16) "\n" \
+ "vand 20,20," VR(16) "\n" \
+ "vaddubm " VR0(r) "," VR0(r) "," VR0(r) "\n" \
+ "vaddubm " VR1(r) "," VR1(r) "," VR1(r) "\n" \
+ "vaddubm " VR2(r) "," VR2(r) "," VR2(r) "\n" \
+ "vaddubm " VR3(r) "," VR3(r) "," VR3(r) "\n" \
+ "vxor " VR0(r) ",19," VR0(r) "\n" \
+ "vxor " VR1(r) ",18," VR1(r) "\n" \
+ "vxor " VR2(r) ",21," VR2(r) "\n" \
+ "vxor " VR3(r) ",20," VR3(r) "\n" \
+ : UVR0(r), UVR1(r), UVR2(r), UVR3(r) \
+ : RVR(17), RVR(16) \
+ : "v18", "v19", "v20", "v21"); \
+ break; \
+ case 2: \
+ __asm( \
+ "vcmpgtsb 19," VR(17) "," VR0(r) "\n" \
+ "vcmpgtsb 18," VR(17) "," VR1(r) "\n" \
+ "vand 19,19," VR(16) "\n" \
+ "vand 18,18," VR(16) "\n" \
+ "vaddubm " VR0(r) "," VR0(r) "," VR0(r) "\n" \
+ "vaddubm " VR1(r) "," VR1(r) "," VR1(r) "\n" \
+ "vxor " VR0(r) ",19," VR0(r) "\n" \
+ "vxor " VR1(r) ",18," VR1(r) "\n" \
+ : UVR0(r), UVR1(r) \
+ : RVR(17), RVR(16) \
+ : "v18", "v19"); \
+ break; \
+ default: \
+ ZFS_ASM_BUG(); \
+ } \
+}
+
+#define MUL4(r...) \
+{ \
+ MUL2(r); \
+ MUL2(r); \
+}
+
+/*
+ * Unfortunately we cannot use a macro for the register number here,
+ * because GCC would substitute the macro name rather than its value
+ * later on.
+ * The defines below are kept only as a reference for what each
+ * register holds (actual registers are used for the clobbered ones).
+ */
+#define _0f "15"
+#define _a_save "14"
+#define _b_save "13"
+#define _lt_mod_a "12"
+#define _lt_clmul_a "11"
+#define _lt_mod_b "10"
+#define _lt_clmul_b "15"
+
+#define _MULx2(c, r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 2: \
+ __asm( \
+ /* lts for upper part */ \
+ "vspltisb 15,15\n" \
+ "lvx 10,0,%[lt0]\n" \
+ "lvx 11,0,%[lt1]\n" \
+ /* upper part */ \
+ "vand 14," VR0(r) ",15\n" \
+ "vand 13," VR1(r) ",15\n" \
+ "vspltisb 15,4\n" \
+ "vsrab " VR0(r) "," VR0(r) ",15\n" \
+ "vsrab " VR1(r) "," VR1(r) ",15\n" \
+ \
+ "vperm 12,10,10," VR0(r) "\n" \
+ "vperm 10,10,10," VR1(r) "\n" \
+ "vperm 15,11,11," VR0(r) "\n" \
+ "vperm 11,11,11," VR1(r) "\n" \
+ \
+ "vxor " VR0(r) ",15,12\n" \
+ "vxor " VR1(r) ",11,10\n" \
+ /* lts for lower part */ \
+ "lvx 10,0,%[lt2]\n" \
+ "lvx 15,0,%[lt3]\n" \
+ /* lower part */ \
+ "vperm 12,10,10,14\n" \
+ "vperm 10,10,10,13\n" \
+ "vperm 11,15,15,14\n" \
+ "vperm 15,15,15,13\n" \
+ \
+ "vxor " VR0(r) "," VR0(r) ",12\n" \
+ "vxor " VR1(r) "," VR1(r) ",10\n" \
+ "vxor " VR0(r) "," VR0(r) ",11\n" \
+ "vxor " VR1(r) "," VR1(r) ",15\n" \
+ : UVR0(r), UVR1(r) \
+ : [lt0] "r" (&(gf_clmul_mod_lt[4*(c)+0][0])), \
+ [lt1] "r" (&(gf_clmul_mod_lt[4*(c)+1][0])), \
+ [lt2] "r" (&(gf_clmul_mod_lt[4*(c)+2][0])), \
+ [lt3] "r" (&(gf_clmul_mod_lt[4*(c)+3][0])) \
+ : "v10", "v11", "v12", "v13", "v14", "v15"); \
+ break; \
+ default: \
+ ZFS_ASM_BUG(); \
+ } \
+}
+
+#define MUL(c, r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 4: \
+ _MULx2(c, R_23(r)); \
+ _MULx2(c, R_01(r)); \
+ break; \
+ case 2: \
+ _MULx2(c, R_01(r)); \
+ break; \
+ default: \
+ ZFS_ASM_BUG(); \
+ } \
+}
+
+#define raidz_math_begin() kfpu_begin()
+#define raidz_math_end() kfpu_end()
+
+/* Overkill... */
+#if 0 // defined(_KERNEL)
+#define GEN_X_DEFINE_0_3() \
+register unsigned char w0 asm("0") __attribute__((vector_size(16))); \
+register unsigned char w1 asm("1") __attribute__((vector_size(16))); \
+register unsigned char w2 asm("2") __attribute__((vector_size(16))); \
+register unsigned char w3 asm("3") __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_4_5() \
+register unsigned char w4 asm("4") __attribute__((vector_size(16))); \
+register unsigned char w5 asm("5") __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_6_7() \
+register unsigned char w6 asm("6") __attribute__((vector_size(16))); \
+register unsigned char w7 asm("7") __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_8_9() \
+register unsigned char w8 asm("8") __attribute__((vector_size(16))); \
+register unsigned char w9 asm("9") __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_10_11() \
+register unsigned char w10 asm("10") __attribute__((vector_size(16))); \
+register unsigned char w11 asm("11") __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_12_15() \
+register unsigned char w12 asm("12") __attribute__((vector_size(16))); \
+register unsigned char w13 asm("13") __attribute__((vector_size(16))); \
+register unsigned char w14 asm("14") __attribute__((vector_size(16))); \
+register unsigned char w15 asm("15") __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_16() \
+register unsigned char w16 asm("16") __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_17() \
+register unsigned char w17 asm("17") __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_18_21() \
+register unsigned char w18 asm("18") __attribute__((vector_size(16))); \
+register unsigned char w19 asm("19") __attribute__((vector_size(16))); \
+register unsigned char w20 asm("20") __attribute__((vector_size(16))); \
+register unsigned char w21 asm("21") __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_22_23() \
+register unsigned char w22 asm("22") __attribute__((vector_size(16))); \
+register unsigned char w23 asm("23") __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_24_27() \
+register unsigned char w24 asm("24") __attribute__((vector_size(16))); \
+register unsigned char w25 asm("25") __attribute__((vector_size(16))); \
+register unsigned char w26 asm("26") __attribute__((vector_size(16))); \
+register unsigned char w27 asm("27") __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_28_30() \
+register unsigned char w28 asm("28") __attribute__((vector_size(16))); \
+register unsigned char w29 asm("29") __attribute__((vector_size(16))); \
+register unsigned char w30 asm("30") __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_31() \
+register unsigned char w31 asm("31") __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_32() \
+register unsigned char w32 asm("31") __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_33_36() \
+register unsigned char w33 asm("31") __attribute__((vector_size(16))); \
+register unsigned char w34 asm("31") __attribute__((vector_size(16))); \
+register unsigned char w35 asm("31") __attribute__((vector_size(16))); \
+register unsigned char w36 asm("31") __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_37_38() \
+register unsigned char w37 asm("31") __attribute__((vector_size(16))); \
+register unsigned char w38 asm("31") __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_ALL() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_4_5() \
+ GEN_X_DEFINE_6_7() \
+ GEN_X_DEFINE_8_9() \
+ GEN_X_DEFINE_10_11() \
+ GEN_X_DEFINE_12_15() \
+ GEN_X_DEFINE_16() \
+ GEN_X_DEFINE_17() \
+ GEN_X_DEFINE_18_21() \
+ GEN_X_DEFINE_22_23() \
+ GEN_X_DEFINE_24_27() \
+ GEN_X_DEFINE_28_30() \
+ GEN_X_DEFINE_31() \
+ GEN_X_DEFINE_32() \
+ GEN_X_DEFINE_33_36() \
+ GEN_X_DEFINE_37_38()
+#else
+#define GEN_X_DEFINE_0_3() \
+ unsigned char w0 __attribute__((vector_size(16))); \
+ unsigned char w1 __attribute__((vector_size(16))); \
+ unsigned char w2 __attribute__((vector_size(16))); \
+ unsigned char w3 __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_4_5() \
+ unsigned char w4 __attribute__((vector_size(16))); \
+ unsigned char w5 __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_6_7() \
+ unsigned char w6 __attribute__((vector_size(16))); \
+ unsigned char w7 __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_8_9() \
+ unsigned char w8 __attribute__((vector_size(16))); \
+ unsigned char w9 __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_10_11() \
+ unsigned char w10 __attribute__((vector_size(16))); \
+ unsigned char w11 __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_12_15() \
+ unsigned char w12 __attribute__((vector_size(16))); \
+ unsigned char w13 __attribute__((vector_size(16))); \
+ unsigned char w14 __attribute__((vector_size(16))); \
+ unsigned char w15 __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_16() \
+ unsigned char w16 __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_17() \
+ unsigned char w17 __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_18_21() \
+ unsigned char w18 __attribute__((vector_size(16))); \
+ unsigned char w19 __attribute__((vector_size(16))); \
+ unsigned char w20 __attribute__((vector_size(16))); \
+ unsigned char w21 __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_22_23() \
+ unsigned char w22 __attribute__((vector_size(16))); \
+ unsigned char w23 __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_24_27() \
+ unsigned char w24 __attribute__((vector_size(16))); \
+ unsigned char w25 __attribute__((vector_size(16))); \
+ unsigned char w26 __attribute__((vector_size(16))); \
+ unsigned char w27 __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_28_30() \
+ unsigned char w28 __attribute__((vector_size(16))); \
+ unsigned char w29 __attribute__((vector_size(16))); \
+ unsigned char w30 __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_31() \
+ unsigned char w31 __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_32() \
+ unsigned char w32 __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_33_36() \
+ unsigned char w33 __attribute__((vector_size(16))); \
+ unsigned char w34 __attribute__((vector_size(16))); \
+ unsigned char w35 __attribute__((vector_size(16))); \
+ unsigned char w36 __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_37_38() \
+ unsigned char w37 __attribute__((vector_size(16))); \
+ unsigned char w38 __attribute__((vector_size(16)));
+#define GEN_X_DEFINE_ALL() \
+ GEN_X_DEFINE_0_3() \
+ GEN_X_DEFINE_4_5() \
+ GEN_X_DEFINE_6_7() \
+ GEN_X_DEFINE_8_9() \
+ GEN_X_DEFINE_10_11() \
+ GEN_X_DEFINE_12_15() \
+ GEN_X_DEFINE_16() \
+ GEN_X_DEFINE_17() \
+ GEN_X_DEFINE_18_21() \
+ GEN_X_DEFINE_22_23() \
+ GEN_X_DEFINE_24_27() \
+ GEN_X_DEFINE_28_30() \
+ GEN_X_DEFINE_31() \
+ GEN_X_DEFINE_32() \
+ GEN_X_DEFINE_33_36() \
+ GEN_X_DEFINE_37_38()
+#endif
diff --git a/sys/contrib/openzfs/module/zfs/vdev_raidz_math_scalar.c b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_scalar.c
new file mode 100644
index 000000000000..cd742e146ca6
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_scalar.c
@@ -0,0 +1,337 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (C) 2016 Gvozden Nešković. All rights reserved.
+ */
+
+#include <sys/vdev_raidz_impl.h>
+
+/*
+ * Provide native CPU scalar routines.
+ * Support 32bit and 64bit CPUs.
+ */
+#if ((~(0x0ULL)) >> 24) == 0xffULL
+#define ELEM_SIZE 4
+typedef uint32_t iv_t;
+#elif ((~(0x0ULL)) >> 56) == 0xffULL
+#define ELEM_SIZE 8
+typedef uint64_t iv_t;
+#endif
+
+/*
+ * Vector type used in scalar implementation
+ *
+ * The union is expected to be of native CPU register size. Since addition
+ * uses the XOR operation, it can be performed on all byte elements at once.
+ * Multiplication requires per-byte access.
+ */
+typedef union {
+ iv_t e;
+ uint8_t b[ELEM_SIZE];
+} v_t;
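+
+/*
+ * Editor's sketch (not part of the upstream change): because RAID-Z
+ * "addition" is XOR, one word-wide operation on the 'e' member updates
+ * all ELEM_SIZE byte elements at once, exactly as the XOR/XOR_ACC macros
+ * below do; multiplication, by contrast, must go through 'b'.
+ */
+static inline void
+v_add_sketch(v_t *acc, const v_t *src)
+{
+	acc->e ^= src->e;	/* all bytes of the element in one op */
+}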
+
+/*
+ * Precomputed lookup tables for multiplication by a constant
+ *
+ * The reconstruction path requires multiplication by constant factors. Instead
+ * of performing a two-step lookup (log & exp tables), a direct lookup can be
+ * used. Multiplication of element 'a' by a constant 'c' is obtained as:
+ *
+ *	r = vdev_raidz_mul_lt[c_log][a];
+ *
+ * where c_log = vdev_raidz_log2[c]. The log of the coefficient factors is used
+ * because it is faster to obtain while solving the syndrome equations.
+ *
+ * PERFORMANCE NOTE:
+ * Even though the complete lookup table uses 64 KiB, only a relatively small
+ * portion of it is used at any one time. The number of bytes accessed for
+ * different cases is:
+ *	- 1 failed disk: 256 B (1 mul. coefficient)
+ *	- 2 failed disks: 512 B (2 mul. coefficients)
+ *	- 3 failed disks: 1536 B (6 mul. coefficients)
+ *
+ * Compared to the traditional log/exp method, the accessed lookup table region
+ * is larger only when reconstructing 3 failed disks. But since the result is
+ * obtained in a single lookup step, performance is doubled.
+ */
+static uint8_t vdev_raidz_mul_lt[256][256] __attribute__((aligned(256)));
+
+static void
+raidz_init_scalar(void)
+{
+ int c, i;
+ for (c = 0; c < 256; c++)
+ for (i = 0; i < 256; i++)
+ vdev_raidz_mul_lt[c][i] = gf_mul(c, i);
+}
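+
+/*
+ * Editor's sketch (not part of the upstream change): with the table filled
+ * as above (vdev_raidz_mul_lt[c][i] == gf_mul(c, i)), multiplying a vector
+ * element by a constant 'c' is one table lookup per byte, mirroring what
+ * the MUL() macro further below does.
+ */
+static inline void
+v_mul_sketch(v_t *a, unsigned c)
+{
+	const uint8_t *mul_lt = vdev_raidz_mul_lt[c];
+	int i;
+
+	for (i = 0; i < ELEM_SIZE; i++)
+		a->b[i] = mul_lt[a->b[i]];	/* == gf_mul(c, a->b[i]) */
+}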
+
+#define PREFETCHNTA(ptr, offset) {}
+#define PREFETCH(ptr, offset) {}
+
+#define XOR_ACC(src, acc) acc.e ^= ((v_t *)src)[0].e
+#define XOR(src, acc) acc.e ^= src.e
+#define ZERO(acc) acc.e = 0
+#define COPY(src, dst) dst = src
+#define LOAD(src, val) val = ((v_t *)src)[0]
+#define STORE(dst, val) ((v_t *)dst)[0] = val
+
+/*
+ * Constants used for optimized multiplication by 2.
+ */
+static const struct {
+ iv_t mod;
+ iv_t mask;
+ iv_t msb;
+} scalar_mul2_consts = {
+#if ELEM_SIZE == 8
+ .mod = 0x1d1d1d1d1d1d1d1dULL,
+ .mask = 0xfefefefefefefefeULL,
+ .msb = 0x8080808080808080ULL,
+#else
+ .mod = 0x1d1d1d1dULL,
+ .mask = 0xfefefefeULL,
+ .msb = 0x80808080ULL,
+#endif
+};
+
+#define MUL2_SETUP() {}
+
+#define MUL2(a) \
+{ \
+ iv_t _mask; \
+ \
+ _mask = (a).e & scalar_mul2_consts.msb; \
+ _mask = (_mask << 1) - (_mask >> 7); \
+ (a).e = ((a).e << 1) & scalar_mul2_consts.mask; \
+ (a).e = (a).e ^ (_mask & scalar_mul2_consts.mod); \
+}
+
+#define MUL4(a) \
+{ \
+ MUL2(a); \
+ MUL2(a); \
+}
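+
+/*
+ * Editor's sketch (not part of the upstream change): single-byte view of
+ * the MUL2() step above. Each byte's sign bit selects whether the reduction
+ * constant 0x1d (the low byte of the RAID-Z polynomial 0x11d, matching
+ * scalar_mul2_consts.mod) is XORed in after the left shift.
+ */
+static inline uint8_t
+gf_mul2_byte_sketch(uint8_t a)
+{
+	uint8_t mask = (a & 0x80) ? 0xff : 0x00;
+
+	return (((a << 1) & 0xfe) ^ (mask & 0x1d));
+}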
+
+#define MUL(c, a) \
+{ \
+ const uint8_t *mul_lt = vdev_raidz_mul_lt[c]; \
+ switch (ELEM_SIZE) { \
+ case 8: \
+ a.b[7] = mul_lt[a.b[7]]; \
+ a.b[6] = mul_lt[a.b[6]]; \
+ a.b[5] = mul_lt[a.b[5]]; \
+ a.b[4] = mul_lt[a.b[4]]; \
+ /* falls through */ \
+ case 4: \
+ a.b[3] = mul_lt[a.b[3]]; \
+ a.b[2] = mul_lt[a.b[2]]; \
+ a.b[1] = mul_lt[a.b[1]]; \
+ a.b[0] = mul_lt[a.b[0]]; \
+ break; \
+ } \
+}
+
+#define raidz_math_begin() {}
+#define raidz_math_end() {}
+
+#define SYN_STRIDE 1
+
+#define ZERO_DEFINE() v_t d0
+#define ZERO_STRIDE 1
+#define ZERO_D d0
+
+#define COPY_DEFINE() v_t d0
+#define COPY_STRIDE 1
+#define COPY_D d0
+
+#define ADD_DEFINE() v_t d0
+#define ADD_STRIDE 1
+#define ADD_D d0
+
+#define MUL_DEFINE() v_t d0
+#define MUL_STRIDE 1
+#define MUL_D d0
+
+#define GEN_P_STRIDE 1
+#define GEN_P_DEFINE() v_t p0
+#define GEN_P_P p0
+
+#define GEN_PQ_STRIDE 1
+#define GEN_PQ_DEFINE() v_t d0, c0
+#define GEN_PQ_D d0
+#define GEN_PQ_C c0
+
+#define GEN_PQR_STRIDE 1
+#define GEN_PQR_DEFINE() v_t d0, c0
+#define GEN_PQR_D d0
+#define GEN_PQR_C c0
+
+#define SYN_Q_DEFINE() v_t d0, x0
+#define SYN_Q_D d0
+#define SYN_Q_X x0
+
+
+#define SYN_R_DEFINE() v_t d0, x0
+#define SYN_R_D d0
+#define SYN_R_X x0
+
+
+#define SYN_PQ_DEFINE() v_t d0, x0
+#define SYN_PQ_D d0
+#define SYN_PQ_X x0
+
+
+#define REC_PQ_STRIDE 1
+#define REC_PQ_DEFINE() v_t x0, y0, t0
+#define REC_PQ_X x0
+#define REC_PQ_Y y0
+#define REC_PQ_T t0
+
+
+#define SYN_PR_DEFINE() v_t d0, x0
+#define SYN_PR_D d0
+#define SYN_PR_X x0
+
+#define REC_PR_STRIDE 1
+#define REC_PR_DEFINE() v_t x0, y0, t0
+#define REC_PR_X x0
+#define REC_PR_Y y0
+#define REC_PR_T t0
+
+
+#define SYN_QR_DEFINE() v_t d0, x0
+#define SYN_QR_D d0
+#define SYN_QR_X x0
+
+
+#define REC_QR_STRIDE 1
+#define REC_QR_DEFINE() v_t x0, y0, t0
+#define REC_QR_X x0
+#define REC_QR_Y y0
+#define REC_QR_T t0
+
+
+#define SYN_PQR_DEFINE() v_t d0, x0
+#define SYN_PQR_D d0
+#define SYN_PQR_X x0
+
+#define REC_PQR_STRIDE 1
+#define REC_PQR_DEFINE() v_t x0, y0, z0, xs0, ys0
+#define REC_PQR_X x0
+#define REC_PQR_Y y0
+#define REC_PQR_Z z0
+#define REC_PQR_XS xs0
+#define REC_PQR_YS ys0
+
+#include "vdev_raidz_math_impl.h"
+
+DEFINE_GEN_METHODS(scalar);
+DEFINE_REC_METHODS(scalar);
+
+boolean_t
+raidz_will_scalar_work(void)
+{
+ return (B_TRUE); /* always */
+}
+
+const raidz_impl_ops_t vdev_raidz_scalar_impl = {
+ .init = raidz_init_scalar,
+ .fini = NULL,
+ .gen = RAIDZ_GEN_METHODS(scalar),
+ .rec = RAIDZ_REC_METHODS(scalar),
+ .is_supported = &raidz_will_scalar_work,
+ .name = "scalar"
+};
+
+/* Powers of 2 in the RAID-Z Galois field. */
+const uint8_t vdev_raidz_pow2[256] __attribute__((aligned(256))) = {
+ 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
+ 0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
+ 0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
+ 0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
+ 0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
+ 0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
+ 0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
+ 0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
+ 0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
+ 0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
+ 0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
+ 0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
+ 0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
+ 0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
+ 0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
+ 0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
+ 0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
+ 0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
+ 0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
+ 0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
+ 0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
+ 0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
+ 0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
+ 0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
+ 0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
+ 0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
+ 0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
+ 0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
+ 0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
+ 0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
+ 0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
+ 0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
+};
+
+/* Logs of 2 in the RAID-Z Galois field. */
+const uint8_t vdev_raidz_log2[256] __attribute__((aligned(256))) = {
+ 0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
+ 0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
+ 0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
+ 0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
+ 0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
+ 0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
+ 0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
+ 0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
+ 0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
+ 0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
+ 0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
+ 0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
+ 0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
+ 0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
+ 0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
+ 0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
+ 0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
+ 0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
+ 0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
+ 0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
+ 0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
+ 0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
+ 0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
+ 0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
+ 0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
+ 0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
+ 0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
+ 0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
+ 0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
+ 0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
+ 0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
+ 0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
+};
diff --git a/sys/contrib/openzfs/module/zfs/vdev_raidz_math_sse2.c b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_sse2.c
new file mode 100644
index 000000000000..56a0b123d952
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_sse2.c
@@ -0,0 +1,631 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (C) 2016 Gvozden Nešković. All rights reserved.
+ */
+
+#include <sys/isa_defs.h>
+
+#if defined(__x86_64) && defined(HAVE_SSE2)
+
+#include <sys/types.h>
+#include <sys/simd.h>
+#include <sys/debug.h>
+
+#ifdef __linux__
+#define __asm __asm__ __volatile__
+#endif
+
+#define _REG_CNT(_0, _1, _2, _3, _4, _5, _6, _7, N, ...) N
+#define REG_CNT(r...) _REG_CNT(r, 8, 7, 6, 5, 4, 3, 2, 1)
+
+#define VR0_(REG, ...) "xmm"#REG
+#define VR1_(_1, REG, ...) "xmm"#REG
+#define VR2_(_1, _2, REG, ...) "xmm"#REG
+#define VR3_(_1, _2, _3, REG, ...) "xmm"#REG
+#define VR4_(_1, _2, _3, _4, REG, ...) "xmm"#REG
+#define VR5_(_1, _2, _3, _4, _5, REG, ...) "xmm"#REG
+#define VR6_(_1, _2, _3, _4, _5, _6, REG, ...) "xmm"#REG
+#define VR7_(_1, _2, _3, _4, _5, _6, _7, REG, ...) "xmm"#REG
+
+#define VR0(r...) VR0_(r, 1, 2, 3, 4, 5, 6)
+#define VR1(r...) VR1_(r, 1, 2, 3, 4, 5, 6)
+#define VR2(r...) VR2_(r, 1, 2, 3, 4, 5, 6)
+#define VR3(r...) VR3_(r, 1, 2, 3, 4, 5, 6)
+#define VR4(r...) VR4_(r, 1, 2, 3, 4, 5, 6)
+#define VR5(r...) VR5_(r, 1, 2, 3, 4, 5, 6)
+#define VR6(r...) VR6_(r, 1, 2, 3, 4, 5, 6)
+#define VR7(r...) VR7_(r, 1, 2, 3, 4, 5, 6)
+
+#define ELEM_SIZE 16
+
+typedef struct v {
+ uint8_t b[ELEM_SIZE] __attribute__((aligned(ELEM_SIZE)));
+} v_t;
+
+#define XOR_ACC(src, r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 4: \
+ __asm( \
+ "pxor 0x00(%[SRC]), %%" VR0(r) "\n" \
+ "pxor 0x10(%[SRC]), %%" VR1(r) "\n" \
+ "pxor 0x20(%[SRC]), %%" VR2(r) "\n" \
+ "pxor 0x30(%[SRC]), %%" VR3(r) "\n" \
+ : : [SRC] "r" (src)); \
+ break; \
+ case 2: \
+ __asm( \
+ "pxor 0x00(%[SRC]), %%" VR0(r) "\n" \
+ "pxor 0x10(%[SRC]), %%" VR1(r) "\n" \
+ : : [SRC] "r" (src)); \
+ break; \
+ case 1: \
+ __asm("pxor 0x00(%[SRC]), %%" VR0(r) "\n" \
+ : : [SRC] "r" (src)); \
+ break; \
+ } \
+}
+
+#define XOR(r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 8: \
+ __asm( \
+ "pxor %" VR0(r) ", %" VR4(r) "\n" \
+ "pxor %" VR1(r) ", %" VR5(r) "\n" \
+ "pxor %" VR2(r) ", %" VR6(r) "\n" \
+ "pxor %" VR3(r) ", %" VR7(r)); \
+ break; \
+ case 4: \
+ __asm( \
+ "pxor %" VR0(r) ", %" VR2(r) "\n" \
+ "pxor %" VR1(r) ", %" VR3(r)); \
+ break; \
+ case 2: \
+ __asm( \
+ "pxor %" VR0(r) ", %" VR1(r)); \
+ break; \
+ } \
+}
+
+#define ZERO(r...) XOR(r, r)
+
+#define COPY(r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 8: \
+ __asm( \
+ "movdqa %" VR0(r) ", %" VR4(r) "\n" \
+ "movdqa %" VR1(r) ", %" VR5(r) "\n" \
+ "movdqa %" VR2(r) ", %" VR6(r) "\n" \
+ "movdqa %" VR3(r) ", %" VR7(r)); \
+ break; \
+ case 4: \
+ __asm( \
+ "movdqa %" VR0(r) ", %" VR2(r) "\n" \
+ "movdqa %" VR1(r) ", %" VR3(r)); \
+ break; \
+ case 2: \
+ __asm( \
+ "movdqa %" VR0(r) ", %" VR1(r)); \
+ break; \
+ default: \
+ VERIFY(0); \
+ } \
+}
+
+#define LOAD(src, r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 4: \
+ __asm( \
+ "movdqa 0x00(%[SRC]), %%" VR0(r) "\n" \
+ "movdqa 0x10(%[SRC]), %%" VR1(r) "\n" \
+ "movdqa 0x20(%[SRC]), %%" VR2(r) "\n" \
+ "movdqa 0x30(%[SRC]), %%" VR3(r) "\n" \
+ : : [SRC] "r" (src)); \
+ break; \
+ case 2: \
+ __asm( \
+ "movdqa 0x00(%[SRC]), %%" VR0(r) "\n" \
+ "movdqa 0x10(%[SRC]), %%" VR1(r) "\n" \
+ : : [SRC] "r" (src)); \
+ break; \
+ case 1: \
+ __asm( \
+ "movdqa 0x00(%[SRC]), %%" VR0(r) "\n" \
+ : : [SRC] "r" (src)); \
+ break; \
+ } \
+}
+
+#define STORE(dst, r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 4: \
+ __asm( \
+ "movdqa %%" VR0(r)", 0x00(%[DST])\n" \
+ "movdqa %%" VR1(r)", 0x10(%[DST])\n" \
+ "movdqa %%" VR2(r)", 0x20(%[DST])\n" \
+ "movdqa %%" VR3(r)", 0x30(%[DST])\n" \
+ : : [DST] "r" (dst)); \
+ break; \
+ case 2: \
+ __asm( \
+ "movdqa %%" VR0(r)", 0x00(%[DST])\n" \
+ "movdqa %%" VR1(r)", 0x10(%[DST])\n" \
+ : : [DST] "r" (dst)); \
+ break; \
+ case 1: \
+ __asm( \
+ "movdqa %%" VR0(r)", 0x00(%[DST])\n" \
+ : : [DST] "r" (dst)); \
+ break; \
+ default: \
+ VERIFY(0); \
+ } \
+}
+
+#define MUL2_SETUP() \
+{ \
+ __asm( \
+ "movd %[mask], %%xmm15\n" \
+ "pshufd $0x0, %%xmm15, %%xmm15\n" \
+ : : [mask] "r" (0x1d1d1d1d)); \
+}
+
+#define _MUL2_x1(a0) \
+{ \
+ __asm( \
+ "pxor %xmm14, %xmm14\n" \
+ "pcmpgtb %" a0", %xmm14\n" \
+ "pand %xmm15, %xmm14\n" \
+ "paddb %" a0", %" a0 "\n" \
+ "pxor %xmm14, %" a0); \
+}
+
+#define _MUL2_x2(a0, a1) \
+{ \
+ __asm( \
+ "pxor %xmm14, %xmm14\n" \
+ "pxor %xmm13, %xmm13\n" \
+ "pcmpgtb %" a0", %xmm14\n" \
+ "pcmpgtb %" a1", %xmm13\n" \
+ "pand %xmm15, %xmm14\n" \
+ "pand %xmm15, %xmm13\n" \
+ "paddb %" a0", %" a0 "\n" \
+ "paddb %" a1", %" a1 "\n" \
+ "pxor %xmm14, %" a0 "\n" \
+ "pxor %xmm13, %" a1); \
+}
+
+#define MUL2(r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 4: \
+ _MUL2_x2(VR0(r), VR1(r)); \
+ _MUL2_x2(VR2(r), VR3(r)); \
+ break; \
+ case 2: \
+ _MUL2_x2(VR0(r), VR1(r)); \
+ break; \
+ case 1: \
+ _MUL2_x1(VR0(r)); \
+ break; \
+ } \
+}
+
+#define MUL4(r...) \
+{ \
+ MUL2(r); \
+ MUL2(r); \
+}
+
+/* General multiplication by adding powers of two */
+
+#define _MUL_PARAM(x, in, acc) \
+{ \
+ if (x & 0x01) { COPY(in, acc); } else { ZERO(acc); } \
+ if (x & 0xfe) { MUL2(in); } \
+ if (x & 0x02) { XOR(in, acc); } \
+ if (x & 0xfc) { MUL2(in); } \
+ if (x & 0x04) { XOR(in, acc); } \
+ if (x & 0xf8) { MUL2(in); } \
+ if (x & 0x08) { XOR(in, acc); } \
+ if (x & 0xf0) { MUL2(in); } \
+ if (x & 0x10) { XOR(in, acc); } \
+ if (x & 0xe0) { MUL2(in); } \
+ if (x & 0x20) { XOR(in, acc); } \
+ if (x & 0xc0) { MUL2(in); } \
+ if (x & 0x40) { XOR(in, acc); } \
+ if (x & 0x80) { MUL2(in); XOR(in, acc); } \
+}
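+
+/*
+ * Editor's sketch (not part of the upstream change): _MUL_PARAM() above
+ * unrolls a shift-and-add multiplication over GF(2^8). Walk the bits of the
+ * constant 'x', doubling 'in' once per bit position and XOR-ing it into
+ * 'acc' whenever the corresponding bit of 'x' is set; the (x & 0xfe),
+ * (x & 0xfc), ... guards merely skip doublings once no higher bits remain.
+ * A single-byte equivalent, assuming the 0x1d reduction used throughout
+ * this file:
+ */
+static inline uint8_t
+gf_mul_byte_sketch(unsigned x, uint8_t in)
+{
+	uint8_t acc = 0;
+
+	while (x != 0) {
+		if (x & 1)
+			acc ^= in;
+		/* in *= 2 in GF(2^8) */
+		in = ((in << 1) & 0xfe) ^ ((in & 0x80) ? 0x1d : 0x00);
+		x >>= 1;
+	}
+	return (acc);
+}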
+
+#define _mul_x1_in 11
+#define _mul_x1_acc 12
+
+#define MUL_x1_DEFINE(x) \
+static void \
+mul_x1_ ## x(void) { _MUL_PARAM(x, _mul_x1_in, _mul_x1_acc); }
+
+#define _mul_x2_in 9, 10
+#define _mul_x2_acc 11, 12
+
+#define MUL_x2_DEFINE(x) \
+static void \
+mul_x2_ ## x(void) { _MUL_PARAM(x, _mul_x2_in, _mul_x2_acc); }
+
+MUL_x1_DEFINE(0); MUL_x1_DEFINE(1); MUL_x1_DEFINE(2); MUL_x1_DEFINE(3);
+MUL_x1_DEFINE(4); MUL_x1_DEFINE(5); MUL_x1_DEFINE(6); MUL_x1_DEFINE(7);
+MUL_x1_DEFINE(8); MUL_x1_DEFINE(9); MUL_x1_DEFINE(10); MUL_x1_DEFINE(11);
+MUL_x1_DEFINE(12); MUL_x1_DEFINE(13); MUL_x1_DEFINE(14); MUL_x1_DEFINE(15);
+MUL_x1_DEFINE(16); MUL_x1_DEFINE(17); MUL_x1_DEFINE(18); MUL_x1_DEFINE(19);
+MUL_x1_DEFINE(20); MUL_x1_DEFINE(21); MUL_x1_DEFINE(22); MUL_x1_DEFINE(23);
+MUL_x1_DEFINE(24); MUL_x1_DEFINE(25); MUL_x1_DEFINE(26); MUL_x1_DEFINE(27);
+MUL_x1_DEFINE(28); MUL_x1_DEFINE(29); MUL_x1_DEFINE(30); MUL_x1_DEFINE(31);
+MUL_x1_DEFINE(32); MUL_x1_DEFINE(33); MUL_x1_DEFINE(34); MUL_x1_DEFINE(35);
+MUL_x1_DEFINE(36); MUL_x1_DEFINE(37); MUL_x1_DEFINE(38); MUL_x1_DEFINE(39);
+MUL_x1_DEFINE(40); MUL_x1_DEFINE(41); MUL_x1_DEFINE(42); MUL_x1_DEFINE(43);
+MUL_x1_DEFINE(44); MUL_x1_DEFINE(45); MUL_x1_DEFINE(46); MUL_x1_DEFINE(47);
+MUL_x1_DEFINE(48); MUL_x1_DEFINE(49); MUL_x1_DEFINE(50); MUL_x1_DEFINE(51);
+MUL_x1_DEFINE(52); MUL_x1_DEFINE(53); MUL_x1_DEFINE(54); MUL_x1_DEFINE(55);
+MUL_x1_DEFINE(56); MUL_x1_DEFINE(57); MUL_x1_DEFINE(58); MUL_x1_DEFINE(59);
+MUL_x1_DEFINE(60); MUL_x1_DEFINE(61); MUL_x1_DEFINE(62); MUL_x1_DEFINE(63);
+MUL_x1_DEFINE(64); MUL_x1_DEFINE(65); MUL_x1_DEFINE(66); MUL_x1_DEFINE(67);
+MUL_x1_DEFINE(68); MUL_x1_DEFINE(69); MUL_x1_DEFINE(70); MUL_x1_DEFINE(71);
+MUL_x1_DEFINE(72); MUL_x1_DEFINE(73); MUL_x1_DEFINE(74); MUL_x1_DEFINE(75);
+MUL_x1_DEFINE(76); MUL_x1_DEFINE(77); MUL_x1_DEFINE(78); MUL_x1_DEFINE(79);
+MUL_x1_DEFINE(80); MUL_x1_DEFINE(81); MUL_x1_DEFINE(82); MUL_x1_DEFINE(83);
+MUL_x1_DEFINE(84); MUL_x1_DEFINE(85); MUL_x1_DEFINE(86); MUL_x1_DEFINE(87);
+MUL_x1_DEFINE(88); MUL_x1_DEFINE(89); MUL_x1_DEFINE(90); MUL_x1_DEFINE(91);
+MUL_x1_DEFINE(92); MUL_x1_DEFINE(93); MUL_x1_DEFINE(94); MUL_x1_DEFINE(95);
+MUL_x1_DEFINE(96); MUL_x1_DEFINE(97); MUL_x1_DEFINE(98); MUL_x1_DEFINE(99);
+MUL_x1_DEFINE(100); MUL_x1_DEFINE(101); MUL_x1_DEFINE(102); MUL_x1_DEFINE(103);
+MUL_x1_DEFINE(104); MUL_x1_DEFINE(105); MUL_x1_DEFINE(106); MUL_x1_DEFINE(107);
+MUL_x1_DEFINE(108); MUL_x1_DEFINE(109); MUL_x1_DEFINE(110); MUL_x1_DEFINE(111);
+MUL_x1_DEFINE(112); MUL_x1_DEFINE(113); MUL_x1_DEFINE(114); MUL_x1_DEFINE(115);
+MUL_x1_DEFINE(116); MUL_x1_DEFINE(117); MUL_x1_DEFINE(118); MUL_x1_DEFINE(119);
+MUL_x1_DEFINE(120); MUL_x1_DEFINE(121); MUL_x1_DEFINE(122); MUL_x1_DEFINE(123);
+MUL_x1_DEFINE(124); MUL_x1_DEFINE(125); MUL_x1_DEFINE(126); MUL_x1_DEFINE(127);
+MUL_x1_DEFINE(128); MUL_x1_DEFINE(129); MUL_x1_DEFINE(130); MUL_x1_DEFINE(131);
+MUL_x1_DEFINE(132); MUL_x1_DEFINE(133); MUL_x1_DEFINE(134); MUL_x1_DEFINE(135);
+MUL_x1_DEFINE(136); MUL_x1_DEFINE(137); MUL_x1_DEFINE(138); MUL_x1_DEFINE(139);
+MUL_x1_DEFINE(140); MUL_x1_DEFINE(141); MUL_x1_DEFINE(142); MUL_x1_DEFINE(143);
+MUL_x1_DEFINE(144); MUL_x1_DEFINE(145); MUL_x1_DEFINE(146); MUL_x1_DEFINE(147);
+MUL_x1_DEFINE(148); MUL_x1_DEFINE(149); MUL_x1_DEFINE(150); MUL_x1_DEFINE(151);
+MUL_x1_DEFINE(152); MUL_x1_DEFINE(153); MUL_x1_DEFINE(154); MUL_x1_DEFINE(155);
+MUL_x1_DEFINE(156); MUL_x1_DEFINE(157); MUL_x1_DEFINE(158); MUL_x1_DEFINE(159);
+MUL_x1_DEFINE(160); MUL_x1_DEFINE(161); MUL_x1_DEFINE(162); MUL_x1_DEFINE(163);
+MUL_x1_DEFINE(164); MUL_x1_DEFINE(165); MUL_x1_DEFINE(166); MUL_x1_DEFINE(167);
+MUL_x1_DEFINE(168); MUL_x1_DEFINE(169); MUL_x1_DEFINE(170); MUL_x1_DEFINE(171);
+MUL_x1_DEFINE(172); MUL_x1_DEFINE(173); MUL_x1_DEFINE(174); MUL_x1_DEFINE(175);
+MUL_x1_DEFINE(176); MUL_x1_DEFINE(177); MUL_x1_DEFINE(178); MUL_x1_DEFINE(179);
+MUL_x1_DEFINE(180); MUL_x1_DEFINE(181); MUL_x1_DEFINE(182); MUL_x1_DEFINE(183);
+MUL_x1_DEFINE(184); MUL_x1_DEFINE(185); MUL_x1_DEFINE(186); MUL_x1_DEFINE(187);
+MUL_x1_DEFINE(188); MUL_x1_DEFINE(189); MUL_x1_DEFINE(190); MUL_x1_DEFINE(191);
+MUL_x1_DEFINE(192); MUL_x1_DEFINE(193); MUL_x1_DEFINE(194); MUL_x1_DEFINE(195);
+MUL_x1_DEFINE(196); MUL_x1_DEFINE(197); MUL_x1_DEFINE(198); MUL_x1_DEFINE(199);
+MUL_x1_DEFINE(200); MUL_x1_DEFINE(201); MUL_x1_DEFINE(202); MUL_x1_DEFINE(203);
+MUL_x1_DEFINE(204); MUL_x1_DEFINE(205); MUL_x1_DEFINE(206); MUL_x1_DEFINE(207);
+MUL_x1_DEFINE(208); MUL_x1_DEFINE(209); MUL_x1_DEFINE(210); MUL_x1_DEFINE(211);
+MUL_x1_DEFINE(212); MUL_x1_DEFINE(213); MUL_x1_DEFINE(214); MUL_x1_DEFINE(215);
+MUL_x1_DEFINE(216); MUL_x1_DEFINE(217); MUL_x1_DEFINE(218); MUL_x1_DEFINE(219);
+MUL_x1_DEFINE(220); MUL_x1_DEFINE(221); MUL_x1_DEFINE(222); MUL_x1_DEFINE(223);
+MUL_x1_DEFINE(224); MUL_x1_DEFINE(225); MUL_x1_DEFINE(226); MUL_x1_DEFINE(227);
+MUL_x1_DEFINE(228); MUL_x1_DEFINE(229); MUL_x1_DEFINE(230); MUL_x1_DEFINE(231);
+MUL_x1_DEFINE(232); MUL_x1_DEFINE(233); MUL_x1_DEFINE(234); MUL_x1_DEFINE(235);
+MUL_x1_DEFINE(236); MUL_x1_DEFINE(237); MUL_x1_DEFINE(238); MUL_x1_DEFINE(239);
+MUL_x1_DEFINE(240); MUL_x1_DEFINE(241); MUL_x1_DEFINE(242); MUL_x1_DEFINE(243);
+MUL_x1_DEFINE(244); MUL_x1_DEFINE(245); MUL_x1_DEFINE(246); MUL_x1_DEFINE(247);
+MUL_x1_DEFINE(248); MUL_x1_DEFINE(249); MUL_x1_DEFINE(250); MUL_x1_DEFINE(251);
+MUL_x1_DEFINE(252); MUL_x1_DEFINE(253); MUL_x1_DEFINE(254); MUL_x1_DEFINE(255);
+
+MUL_x2_DEFINE(0); MUL_x2_DEFINE(1); MUL_x2_DEFINE(2); MUL_x2_DEFINE(3);
+MUL_x2_DEFINE(4); MUL_x2_DEFINE(5); MUL_x2_DEFINE(6); MUL_x2_DEFINE(7);
+MUL_x2_DEFINE(8); MUL_x2_DEFINE(9); MUL_x2_DEFINE(10); MUL_x2_DEFINE(11);
+MUL_x2_DEFINE(12); MUL_x2_DEFINE(13); MUL_x2_DEFINE(14); MUL_x2_DEFINE(15);
+MUL_x2_DEFINE(16); MUL_x2_DEFINE(17); MUL_x2_DEFINE(18); MUL_x2_DEFINE(19);
+MUL_x2_DEFINE(20); MUL_x2_DEFINE(21); MUL_x2_DEFINE(22); MUL_x2_DEFINE(23);
+MUL_x2_DEFINE(24); MUL_x2_DEFINE(25); MUL_x2_DEFINE(26); MUL_x2_DEFINE(27);
+MUL_x2_DEFINE(28); MUL_x2_DEFINE(29); MUL_x2_DEFINE(30); MUL_x2_DEFINE(31);
+MUL_x2_DEFINE(32); MUL_x2_DEFINE(33); MUL_x2_DEFINE(34); MUL_x2_DEFINE(35);
+MUL_x2_DEFINE(36); MUL_x2_DEFINE(37); MUL_x2_DEFINE(38); MUL_x2_DEFINE(39);
+MUL_x2_DEFINE(40); MUL_x2_DEFINE(41); MUL_x2_DEFINE(42); MUL_x2_DEFINE(43);
+MUL_x2_DEFINE(44); MUL_x2_DEFINE(45); MUL_x2_DEFINE(46); MUL_x2_DEFINE(47);
+MUL_x2_DEFINE(48); MUL_x2_DEFINE(49); MUL_x2_DEFINE(50); MUL_x2_DEFINE(51);
+MUL_x2_DEFINE(52); MUL_x2_DEFINE(53); MUL_x2_DEFINE(54); MUL_x2_DEFINE(55);
+MUL_x2_DEFINE(56); MUL_x2_DEFINE(57); MUL_x2_DEFINE(58); MUL_x2_DEFINE(59);
+MUL_x2_DEFINE(60); MUL_x2_DEFINE(61); MUL_x2_DEFINE(62); MUL_x2_DEFINE(63);
+MUL_x2_DEFINE(64); MUL_x2_DEFINE(65); MUL_x2_DEFINE(66); MUL_x2_DEFINE(67);
+MUL_x2_DEFINE(68); MUL_x2_DEFINE(69); MUL_x2_DEFINE(70); MUL_x2_DEFINE(71);
+MUL_x2_DEFINE(72); MUL_x2_DEFINE(73); MUL_x2_DEFINE(74); MUL_x2_DEFINE(75);
+MUL_x2_DEFINE(76); MUL_x2_DEFINE(77); MUL_x2_DEFINE(78); MUL_x2_DEFINE(79);
+MUL_x2_DEFINE(80); MUL_x2_DEFINE(81); MUL_x2_DEFINE(82); MUL_x2_DEFINE(83);
+MUL_x2_DEFINE(84); MUL_x2_DEFINE(85); MUL_x2_DEFINE(86); MUL_x2_DEFINE(87);
+MUL_x2_DEFINE(88); MUL_x2_DEFINE(89); MUL_x2_DEFINE(90); MUL_x2_DEFINE(91);
+MUL_x2_DEFINE(92); MUL_x2_DEFINE(93); MUL_x2_DEFINE(94); MUL_x2_DEFINE(95);
+MUL_x2_DEFINE(96); MUL_x2_DEFINE(97); MUL_x2_DEFINE(98); MUL_x2_DEFINE(99);
+MUL_x2_DEFINE(100); MUL_x2_DEFINE(101); MUL_x2_DEFINE(102); MUL_x2_DEFINE(103);
+MUL_x2_DEFINE(104); MUL_x2_DEFINE(105); MUL_x2_DEFINE(106); MUL_x2_DEFINE(107);
+MUL_x2_DEFINE(108); MUL_x2_DEFINE(109); MUL_x2_DEFINE(110); MUL_x2_DEFINE(111);
+MUL_x2_DEFINE(112); MUL_x2_DEFINE(113); MUL_x2_DEFINE(114); MUL_x2_DEFINE(115);
+MUL_x2_DEFINE(116); MUL_x2_DEFINE(117); MUL_x2_DEFINE(118); MUL_x2_DEFINE(119);
+MUL_x2_DEFINE(120); MUL_x2_DEFINE(121); MUL_x2_DEFINE(122); MUL_x2_DEFINE(123);
+MUL_x2_DEFINE(124); MUL_x2_DEFINE(125); MUL_x2_DEFINE(126); MUL_x2_DEFINE(127);
+MUL_x2_DEFINE(128); MUL_x2_DEFINE(129); MUL_x2_DEFINE(130); MUL_x2_DEFINE(131);
+MUL_x2_DEFINE(132); MUL_x2_DEFINE(133); MUL_x2_DEFINE(134); MUL_x2_DEFINE(135);
+MUL_x2_DEFINE(136); MUL_x2_DEFINE(137); MUL_x2_DEFINE(138); MUL_x2_DEFINE(139);
+MUL_x2_DEFINE(140); MUL_x2_DEFINE(141); MUL_x2_DEFINE(142); MUL_x2_DEFINE(143);
+MUL_x2_DEFINE(144); MUL_x2_DEFINE(145); MUL_x2_DEFINE(146); MUL_x2_DEFINE(147);
+MUL_x2_DEFINE(148); MUL_x2_DEFINE(149); MUL_x2_DEFINE(150); MUL_x2_DEFINE(151);
+MUL_x2_DEFINE(152); MUL_x2_DEFINE(153); MUL_x2_DEFINE(154); MUL_x2_DEFINE(155);
+MUL_x2_DEFINE(156); MUL_x2_DEFINE(157); MUL_x2_DEFINE(158); MUL_x2_DEFINE(159);
+MUL_x2_DEFINE(160); MUL_x2_DEFINE(161); MUL_x2_DEFINE(162); MUL_x2_DEFINE(163);
+MUL_x2_DEFINE(164); MUL_x2_DEFINE(165); MUL_x2_DEFINE(166); MUL_x2_DEFINE(167);
+MUL_x2_DEFINE(168); MUL_x2_DEFINE(169); MUL_x2_DEFINE(170); MUL_x2_DEFINE(171);
+MUL_x2_DEFINE(172); MUL_x2_DEFINE(173); MUL_x2_DEFINE(174); MUL_x2_DEFINE(175);
+MUL_x2_DEFINE(176); MUL_x2_DEFINE(177); MUL_x2_DEFINE(178); MUL_x2_DEFINE(179);
+MUL_x2_DEFINE(180); MUL_x2_DEFINE(181); MUL_x2_DEFINE(182); MUL_x2_DEFINE(183);
+MUL_x2_DEFINE(184); MUL_x2_DEFINE(185); MUL_x2_DEFINE(186); MUL_x2_DEFINE(187);
+MUL_x2_DEFINE(188); MUL_x2_DEFINE(189); MUL_x2_DEFINE(190); MUL_x2_DEFINE(191);
+MUL_x2_DEFINE(192); MUL_x2_DEFINE(193); MUL_x2_DEFINE(194); MUL_x2_DEFINE(195);
+MUL_x2_DEFINE(196); MUL_x2_DEFINE(197); MUL_x2_DEFINE(198); MUL_x2_DEFINE(199);
+MUL_x2_DEFINE(200); MUL_x2_DEFINE(201); MUL_x2_DEFINE(202); MUL_x2_DEFINE(203);
+MUL_x2_DEFINE(204); MUL_x2_DEFINE(205); MUL_x2_DEFINE(206); MUL_x2_DEFINE(207);
+MUL_x2_DEFINE(208); MUL_x2_DEFINE(209); MUL_x2_DEFINE(210); MUL_x2_DEFINE(211);
+MUL_x2_DEFINE(212); MUL_x2_DEFINE(213); MUL_x2_DEFINE(214); MUL_x2_DEFINE(215);
+MUL_x2_DEFINE(216); MUL_x2_DEFINE(217); MUL_x2_DEFINE(218); MUL_x2_DEFINE(219);
+MUL_x2_DEFINE(220); MUL_x2_DEFINE(221); MUL_x2_DEFINE(222); MUL_x2_DEFINE(223);
+MUL_x2_DEFINE(224); MUL_x2_DEFINE(225); MUL_x2_DEFINE(226); MUL_x2_DEFINE(227);
+MUL_x2_DEFINE(228); MUL_x2_DEFINE(229); MUL_x2_DEFINE(230); MUL_x2_DEFINE(231);
+MUL_x2_DEFINE(232); MUL_x2_DEFINE(233); MUL_x2_DEFINE(234); MUL_x2_DEFINE(235);
+MUL_x2_DEFINE(236); MUL_x2_DEFINE(237); MUL_x2_DEFINE(238); MUL_x2_DEFINE(239);
+MUL_x2_DEFINE(240); MUL_x2_DEFINE(241); MUL_x2_DEFINE(242); MUL_x2_DEFINE(243);
+MUL_x2_DEFINE(244); MUL_x2_DEFINE(245); MUL_x2_DEFINE(246); MUL_x2_DEFINE(247);
+MUL_x2_DEFINE(248); MUL_x2_DEFINE(249); MUL_x2_DEFINE(250); MUL_x2_DEFINE(251);
+MUL_x2_DEFINE(252); MUL_x2_DEFINE(253); MUL_x2_DEFINE(254); MUL_x2_DEFINE(255);
+
+
+
+typedef void (*mul_fn_ptr_t)(void);
+
+static const mul_fn_ptr_t __attribute__((aligned(256)))
+gf_x1_mul_fns[256] = {
+ mul_x1_0, mul_x1_1, mul_x1_2, mul_x1_3, mul_x1_4, mul_x1_5,
+ mul_x1_6, mul_x1_7, mul_x1_8, mul_x1_9, mul_x1_10, mul_x1_11,
+ mul_x1_12, mul_x1_13, mul_x1_14, mul_x1_15, mul_x1_16, mul_x1_17,
+ mul_x1_18, mul_x1_19, mul_x1_20, mul_x1_21, mul_x1_22, mul_x1_23,
+ mul_x1_24, mul_x1_25, mul_x1_26, mul_x1_27, mul_x1_28, mul_x1_29,
+ mul_x1_30, mul_x1_31, mul_x1_32, mul_x1_33, mul_x1_34, mul_x1_35,
+ mul_x1_36, mul_x1_37, mul_x1_38, mul_x1_39, mul_x1_40, mul_x1_41,
+ mul_x1_42, mul_x1_43, mul_x1_44, mul_x1_45, mul_x1_46, mul_x1_47,
+ mul_x1_48, mul_x1_49, mul_x1_50, mul_x1_51, mul_x1_52, mul_x1_53,
+ mul_x1_54, mul_x1_55, mul_x1_56, mul_x1_57, mul_x1_58, mul_x1_59,
+ mul_x1_60, mul_x1_61, mul_x1_62, mul_x1_63, mul_x1_64, mul_x1_65,
+ mul_x1_66, mul_x1_67, mul_x1_68, mul_x1_69, mul_x1_70, mul_x1_71,
+ mul_x1_72, mul_x1_73, mul_x1_74, mul_x1_75, mul_x1_76, mul_x1_77,
+ mul_x1_78, mul_x1_79, mul_x1_80, mul_x1_81, mul_x1_82, mul_x1_83,
+ mul_x1_84, mul_x1_85, mul_x1_86, mul_x1_87, mul_x1_88, mul_x1_89,
+ mul_x1_90, mul_x1_91, mul_x1_92, mul_x1_93, mul_x1_94, mul_x1_95,
+ mul_x1_96, mul_x1_97, mul_x1_98, mul_x1_99, mul_x1_100, mul_x1_101,
+ mul_x1_102, mul_x1_103, mul_x1_104, mul_x1_105, mul_x1_106, mul_x1_107,
+ mul_x1_108, mul_x1_109, mul_x1_110, mul_x1_111, mul_x1_112, mul_x1_113,
+ mul_x1_114, mul_x1_115, mul_x1_116, mul_x1_117, mul_x1_118, mul_x1_119,
+ mul_x1_120, mul_x1_121, mul_x1_122, mul_x1_123, mul_x1_124, mul_x1_125,
+ mul_x1_126, mul_x1_127, mul_x1_128, mul_x1_129, mul_x1_130, mul_x1_131,
+ mul_x1_132, mul_x1_133, mul_x1_134, mul_x1_135, mul_x1_136, mul_x1_137,
+ mul_x1_138, mul_x1_139, mul_x1_140, mul_x1_141, mul_x1_142, mul_x1_143,
+ mul_x1_144, mul_x1_145, mul_x1_146, mul_x1_147, mul_x1_148, mul_x1_149,
+ mul_x1_150, mul_x1_151, mul_x1_152, mul_x1_153, mul_x1_154, mul_x1_155,
+ mul_x1_156, mul_x1_157, mul_x1_158, mul_x1_159, mul_x1_160, mul_x1_161,
+ mul_x1_162, mul_x1_163, mul_x1_164, mul_x1_165, mul_x1_166, mul_x1_167,
+ mul_x1_168, mul_x1_169, mul_x1_170, mul_x1_171, mul_x1_172, mul_x1_173,
+ mul_x1_174, mul_x1_175, mul_x1_176, mul_x1_177, mul_x1_178, mul_x1_179,
+ mul_x1_180, mul_x1_181, mul_x1_182, mul_x1_183, mul_x1_184, mul_x1_185,
+ mul_x1_186, mul_x1_187, mul_x1_188, mul_x1_189, mul_x1_190, mul_x1_191,
+ mul_x1_192, mul_x1_193, mul_x1_194, mul_x1_195, mul_x1_196, mul_x1_197,
+ mul_x1_198, mul_x1_199, mul_x1_200, mul_x1_201, mul_x1_202, mul_x1_203,
+ mul_x1_204, mul_x1_205, mul_x1_206, mul_x1_207, mul_x1_208, mul_x1_209,
+ mul_x1_210, mul_x1_211, mul_x1_212, mul_x1_213, mul_x1_214, mul_x1_215,
+ mul_x1_216, mul_x1_217, mul_x1_218, mul_x1_219, mul_x1_220, mul_x1_221,
+ mul_x1_222, mul_x1_223, mul_x1_224, mul_x1_225, mul_x1_226, mul_x1_227,
+ mul_x1_228, mul_x1_229, mul_x1_230, mul_x1_231, mul_x1_232, mul_x1_233,
+ mul_x1_234, mul_x1_235, mul_x1_236, mul_x1_237, mul_x1_238, mul_x1_239,
+ mul_x1_240, mul_x1_241, mul_x1_242, mul_x1_243, mul_x1_244, mul_x1_245,
+ mul_x1_246, mul_x1_247, mul_x1_248, mul_x1_249, mul_x1_250, mul_x1_251,
+ mul_x1_252, mul_x1_253, mul_x1_254, mul_x1_255
+};
+
+static const mul_fn_ptr_t __attribute__((aligned(256)))
+gf_x2_mul_fns[256] = {
+ mul_x2_0, mul_x2_1, mul_x2_2, mul_x2_3, mul_x2_4, mul_x2_5,
+ mul_x2_6, mul_x2_7, mul_x2_8, mul_x2_9, mul_x2_10, mul_x2_11,
+ mul_x2_12, mul_x2_13, mul_x2_14, mul_x2_15, mul_x2_16, mul_x2_17,
+ mul_x2_18, mul_x2_19, mul_x2_20, mul_x2_21, mul_x2_22, mul_x2_23,
+ mul_x2_24, mul_x2_25, mul_x2_26, mul_x2_27, mul_x2_28, mul_x2_29,
+ mul_x2_30, mul_x2_31, mul_x2_32, mul_x2_33, mul_x2_34, mul_x2_35,
+ mul_x2_36, mul_x2_37, mul_x2_38, mul_x2_39, mul_x2_40, mul_x2_41,
+ mul_x2_42, mul_x2_43, mul_x2_44, mul_x2_45, mul_x2_46, mul_x2_47,
+ mul_x2_48, mul_x2_49, mul_x2_50, mul_x2_51, mul_x2_52, mul_x2_53,
+ mul_x2_54, mul_x2_55, mul_x2_56, mul_x2_57, mul_x2_58, mul_x2_59,
+ mul_x2_60, mul_x2_61, mul_x2_62, mul_x2_63, mul_x2_64, mul_x2_65,
+ mul_x2_66, mul_x2_67, mul_x2_68, mul_x2_69, mul_x2_70, mul_x2_71,
+ mul_x2_72, mul_x2_73, mul_x2_74, mul_x2_75, mul_x2_76, mul_x2_77,
+ mul_x2_78, mul_x2_79, mul_x2_80, mul_x2_81, mul_x2_82, mul_x2_83,
+ mul_x2_84, mul_x2_85, mul_x2_86, mul_x2_87, mul_x2_88, mul_x2_89,
+ mul_x2_90, mul_x2_91, mul_x2_92, mul_x2_93, mul_x2_94, mul_x2_95,
+ mul_x2_96, mul_x2_97, mul_x2_98, mul_x2_99, mul_x2_100, mul_x2_101,
+ mul_x2_102, mul_x2_103, mul_x2_104, mul_x2_105, mul_x2_106, mul_x2_107,
+ mul_x2_108, mul_x2_109, mul_x2_110, mul_x2_111, mul_x2_112, mul_x2_113,
+ mul_x2_114, mul_x2_115, mul_x2_116, mul_x2_117, mul_x2_118, mul_x2_119,
+ mul_x2_120, mul_x2_121, mul_x2_122, mul_x2_123, mul_x2_124, mul_x2_125,
+ mul_x2_126, mul_x2_127, mul_x2_128, mul_x2_129, mul_x2_130, mul_x2_131,
+ mul_x2_132, mul_x2_133, mul_x2_134, mul_x2_135, mul_x2_136, mul_x2_137,
+ mul_x2_138, mul_x2_139, mul_x2_140, mul_x2_141, mul_x2_142, mul_x2_143,
+ mul_x2_144, mul_x2_145, mul_x2_146, mul_x2_147, mul_x2_148, mul_x2_149,
+ mul_x2_150, mul_x2_151, mul_x2_152, mul_x2_153, mul_x2_154, mul_x2_155,
+ mul_x2_156, mul_x2_157, mul_x2_158, mul_x2_159, mul_x2_160, mul_x2_161,
+ mul_x2_162, mul_x2_163, mul_x2_164, mul_x2_165, mul_x2_166, mul_x2_167,
+ mul_x2_168, mul_x2_169, mul_x2_170, mul_x2_171, mul_x2_172, mul_x2_173,
+ mul_x2_174, mul_x2_175, mul_x2_176, mul_x2_177, mul_x2_178, mul_x2_179,
+ mul_x2_180, mul_x2_181, mul_x2_182, mul_x2_183, mul_x2_184, mul_x2_185,
+ mul_x2_186, mul_x2_187, mul_x2_188, mul_x2_189, mul_x2_190, mul_x2_191,
+ mul_x2_192, mul_x2_193, mul_x2_194, mul_x2_195, mul_x2_196, mul_x2_197,
+ mul_x2_198, mul_x2_199, mul_x2_200, mul_x2_201, mul_x2_202, mul_x2_203,
+ mul_x2_204, mul_x2_205, mul_x2_206, mul_x2_207, mul_x2_208, mul_x2_209,
+ mul_x2_210, mul_x2_211, mul_x2_212, mul_x2_213, mul_x2_214, mul_x2_215,
+ mul_x2_216, mul_x2_217, mul_x2_218, mul_x2_219, mul_x2_220, mul_x2_221,
+ mul_x2_222, mul_x2_223, mul_x2_224, mul_x2_225, mul_x2_226, mul_x2_227,
+ mul_x2_228, mul_x2_229, mul_x2_230, mul_x2_231, mul_x2_232, mul_x2_233,
+ mul_x2_234, mul_x2_235, mul_x2_236, mul_x2_237, mul_x2_238, mul_x2_239,
+ mul_x2_240, mul_x2_241, mul_x2_242, mul_x2_243, mul_x2_244, mul_x2_245,
+ mul_x2_246, mul_x2_247, mul_x2_248, mul_x2_249, mul_x2_250, mul_x2_251,
+ mul_x2_252, mul_x2_253, mul_x2_254, mul_x2_255
+};
+
+#define MUL(c, r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 2: \
+ COPY(r, _mul_x2_in); \
+ gf_x2_mul_fns[c](); \
+ COPY(_mul_x2_acc, r); \
+ break; \
+ case 1: \
+ COPY(r, _mul_x1_in); \
+ gf_x1_mul_fns[c](); \
+ COPY(_mul_x1_acc, r); \
+ break; \
+ default: \
+ VERIFY(0); \
+ } \
+}
+
+
+#define raidz_math_begin() kfpu_begin()
+#define raidz_math_end() kfpu_end()
+
+#define SYN_STRIDE 4
+
+#define ZERO_STRIDE 4
+#define ZERO_DEFINE() {}
+#define ZERO_D 0, 1, 2, 3
+
+#define COPY_STRIDE 4
+#define COPY_DEFINE() {}
+#define COPY_D 0, 1, 2, 3
+
+#define ADD_STRIDE 4
+#define ADD_DEFINE() {}
+#define ADD_D 0, 1, 2, 3
+
+#define MUL_STRIDE 2
+#define MUL_DEFINE() MUL2_SETUP()
+#define MUL_D 0, 1
+
+#define GEN_P_STRIDE 4
+#define GEN_P_DEFINE() {}
+#define GEN_P_P 0, 1, 2, 3
+
+#define GEN_PQ_STRIDE 4
+#define GEN_PQ_DEFINE() {}
+#define GEN_PQ_D 0, 1, 2, 3
+#define GEN_PQ_C 4, 5, 6, 7
+
+#define GEN_PQR_STRIDE 4
+#define GEN_PQR_DEFINE() {}
+#define GEN_PQR_D 0, 1, 2, 3
+#define GEN_PQR_C 4, 5, 6, 7
+
+#define SYN_Q_DEFINE() {}
+#define SYN_Q_D 0, 1, 2, 3
+#define SYN_Q_X 4, 5, 6, 7
+
+#define SYN_R_DEFINE() {}
+#define SYN_R_D 0, 1, 2, 3
+#define SYN_R_X 4, 5, 6, 7
+
+#define SYN_PQ_DEFINE() {}
+#define SYN_PQ_D 0, 1, 2, 3
+#define SYN_PQ_X 4, 5, 6, 7
+
+#define REC_PQ_STRIDE 2
+#define REC_PQ_DEFINE() MUL2_SETUP()
+#define REC_PQ_X 0, 1
+#define REC_PQ_Y 2, 3
+#define REC_PQ_T 4, 5
+
+#define SYN_PR_DEFINE() {}
+#define SYN_PR_D 0, 1, 2, 3
+#define SYN_PR_X 4, 5, 6, 7
+
+#define REC_PR_STRIDE 2
+#define REC_PR_DEFINE() MUL2_SETUP()
+#define REC_PR_X 0, 1
+#define REC_PR_Y 2, 3
+#define REC_PR_T 4, 5
+
+#define SYN_QR_DEFINE() {}
+#define SYN_QR_D 0, 1, 2, 3
+#define SYN_QR_X 4, 5, 6, 7
+
+#define REC_QR_STRIDE 2
+#define REC_QR_DEFINE() MUL2_SETUP()
+#define REC_QR_X 0, 1
+#define REC_QR_Y 2, 3
+#define REC_QR_T 4, 5
+
+#define SYN_PQR_DEFINE() {}
+#define SYN_PQR_D 0, 1, 2, 3
+#define SYN_PQR_X 4, 5, 6, 7
+
+#define REC_PQR_STRIDE 1
+#define REC_PQR_DEFINE() MUL2_SETUP()
+#define REC_PQR_X 0
+#define REC_PQR_Y 1
+#define REC_PQR_Z 2
+#define REC_PQR_XS 3
+#define REC_PQR_YS 4
+
+
+#include <sys/vdev_raidz_impl.h>
+#include "vdev_raidz_math_impl.h"
+
+DEFINE_GEN_METHODS(sse2);
+DEFINE_REC_METHODS(sse2);
+
+static boolean_t
+raidz_will_sse2_work(void)
+{
+ return (kfpu_allowed() && zfs_sse_available() && zfs_sse2_available());
+}
+
+const raidz_impl_ops_t vdev_raidz_sse2_impl = {
+ .init = NULL,
+ .fini = NULL,
+ .gen = RAIDZ_GEN_METHODS(sse2),
+ .rec = RAIDZ_REC_METHODS(sse2),
+ .is_supported = &raidz_will_sse2_work,
+ .name = "sse2"
+};
+
+#endif /* defined(__x86_64) && defined(HAVE_SSE2) */
diff --git a/sys/contrib/openzfs/module/zfs/vdev_raidz_math_ssse3.c b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_ssse3.c
new file mode 100644
index 000000000000..5ddc079a4f5d
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/vdev_raidz_math_ssse3.c
@@ -0,0 +1,2477 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (C) 2016 Gvozden Nešković. All rights reserved.
+ */
+
+#include <sys/isa_defs.h>
+
+#if defined(__x86_64) && defined(HAVE_SSSE3)
+
+#include <sys/types.h>
+#include <sys/simd.h>
+
+#ifdef __linux__
+#define __asm __asm__ __volatile__
+#endif
+
+#define _REG_CNT(_0, _1, _2, _3, _4, _5, _6, _7, N, ...) N
+#define REG_CNT(r...) _REG_CNT(r, 8, 7, 6, 5, 4, 3, 2, 1)
+
+#define VR0_(REG, ...) "xmm"#REG
+#define VR1_(_1, REG, ...) "xmm"#REG
+#define VR2_(_1, _2, REG, ...) "xmm"#REG
+#define VR3_(_1, _2, _3, REG, ...) "xmm"#REG
+#define VR4_(_1, _2, _3, _4, REG, ...) "xmm"#REG
+#define VR5_(_1, _2, _3, _4, _5, REG, ...) "xmm"#REG
+#define VR6_(_1, _2, _3, _4, _5, _6, REG, ...) "xmm"#REG
+#define VR7_(_1, _2, _3, _4, _5, _6, _7, REG, ...) "xmm"#REG
+
+#define VR0(r...) VR0_(r)
+#define VR1(r...) VR1_(r)
+#define VR2(r...) VR2_(r, 1)
+#define VR3(r...) VR3_(r, 1, 2)
+#define VR4(r...) VR4_(r, 1, 2)
+#define VR5(r...) VR5_(r, 1, 2, 3)
+#define VR6(r...) VR6_(r, 1, 2, 3, 4)
+#define VR7(r...) VR7_(r, 1, 2, 3, 4, 5)
+
+#define R_01(REG1, REG2, ...) REG1, REG2
+#define _R_23(_0, _1, REG2, REG3, ...) REG2, REG3
+#define R_23(REG...) _R_23(REG, 1, 2, 3)
+
+#define ZFS_ASM_BUG() ASSERT(0)
+
+const uint8_t gf_clmul_mod_lt[4*256][16];
+
+#define ELEM_SIZE 16
+
+typedef struct v {
+ uint8_t b[ELEM_SIZE] __attribute__((aligned(ELEM_SIZE)));
+} v_t;
+
+
+#define XOR_ACC(src, r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 4: \
+ __asm( \
+ "pxor 0x00(%[SRC]), %%" VR0(r) "\n" \
+ "pxor 0x10(%[SRC]), %%" VR1(r) "\n" \
+ "pxor 0x20(%[SRC]), %%" VR2(r) "\n" \
+ "pxor 0x30(%[SRC]), %%" VR3(r) "\n" \
+ : : [SRC] "r" (src)); \
+ break; \
+ case 2: \
+ __asm( \
+ "pxor 0x00(%[SRC]), %%" VR0(r) "\n" \
+ "pxor 0x10(%[SRC]), %%" VR1(r) "\n" \
+ : : [SRC] "r" (src)); \
+ break; \
+ default: \
+ ZFS_ASM_BUG(); \
+ } \
+}
+
+#define XOR(r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 8: \
+ __asm( \
+ "pxor %" VR0(r) ", %" VR4(r) "\n" \
+ "pxor %" VR1(r) ", %" VR5(r) "\n" \
+ "pxor %" VR2(r) ", %" VR6(r) "\n" \
+ "pxor %" VR3(r) ", %" VR7(r)); \
+ break; \
+ case 4: \
+ __asm( \
+ "pxor %" VR0(r) ", %" VR2(r) "\n" \
+ "pxor %" VR1(r) ", %" VR3(r)); \
+ break; \
+ default: \
+ ZFS_ASM_BUG(); \
+ } \
+}
+
+#define ZERO(r...) XOR(r, r)
+
+#define COPY(r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 8: \
+ __asm( \
+ "movdqa %" VR0(r) ", %" VR4(r) "\n" \
+ "movdqa %" VR1(r) ", %" VR5(r) "\n" \
+ "movdqa %" VR2(r) ", %" VR6(r) "\n" \
+ "movdqa %" VR3(r) ", %" VR7(r)); \
+ break; \
+ case 4: \
+ __asm( \
+ "movdqa %" VR0(r) ", %" VR2(r) "\n" \
+ "movdqa %" VR1(r) ", %" VR3(r)); \
+ break; \
+ default: \
+ ZFS_ASM_BUG(); \
+ } \
+}
+
+#define LOAD(src, r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 4: \
+ __asm( \
+ "movdqa 0x00(%[SRC]), %%" VR0(r) "\n" \
+ "movdqa 0x10(%[SRC]), %%" VR1(r) "\n" \
+ "movdqa 0x20(%[SRC]), %%" VR2(r) "\n" \
+ "movdqa 0x30(%[SRC]), %%" VR3(r) "\n" \
+ : : [SRC] "r" (src)); \
+ break; \
+ case 2: \
+ __asm( \
+ "movdqa 0x00(%[SRC]), %%" VR0(r) "\n" \
+ "movdqa 0x10(%[SRC]), %%" VR1(r) "\n" \
+ : : [SRC] "r" (src)); \
+ break; \
+ default: \
+ ZFS_ASM_BUG(); \
+ } \
+}
+
+#define STORE(dst, r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 4: \
+ __asm( \
+ "movdqa %%" VR0(r)", 0x00(%[DST])\n" \
+ "movdqa %%" VR1(r)", 0x10(%[DST])\n" \
+ "movdqa %%" VR2(r)", 0x20(%[DST])\n" \
+ "movdqa %%" VR3(r)", 0x30(%[DST])\n" \
+ : : [DST] "r" (dst)); \
+ break; \
+ case 2: \
+ __asm( \
+ "movdqa %%" VR0(r)", 0x00(%[DST])\n" \
+ "movdqa %%" VR1(r)", 0x10(%[DST])\n" \
+ : : [DST] "r" (dst)); \
+ break; \
+ default: \
+ ZFS_ASM_BUG(); \
+ } \
+}
+
+#define MUL2_SETUP() \
+{ \
+ __asm( \
+ "movd %[mask], %%xmm15\n" \
+ "pshufd $0x0, %%xmm15, %%xmm15\n" \
+ : : [mask] "r" (0x1d1d1d1d)); \
+}
+
+#define _MUL2_x2(r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 2: \
+ __asm( \
+ "pxor %xmm14, %xmm14\n" \
+ "pxor %xmm13, %xmm13\n" \
+ "pcmpgtb %" VR0(r)", %xmm14\n" \
+ "pcmpgtb %" VR1(r)", %xmm13\n" \
+ "pand %xmm15, %xmm14\n" \
+ "pand %xmm15, %xmm13\n" \
+ "paddb %" VR0(r)", %" VR0(r) "\n" \
+ "paddb %" VR1(r)", %" VR1(r) "\n" \
+ "pxor %xmm14, %" VR0(r) "\n" \
+ "pxor %xmm13, %" VR1(r)); \
+ break; \
+ default: \
+ ZFS_ASM_BUG(); \
+ } \
+}
+
+#define MUL2(r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 4: \
+ _MUL2_x2(R_01(r)); \
+ _MUL2_x2(R_23(r)); \
+ break; \
+ case 2: \
+ _MUL2_x2(r); \
+ break; \
+ default: \
+ ZFS_ASM_BUG(); \
+ } \
+}
+
+#define MUL4(r...) \
+{ \
+ MUL2(r); \
+ MUL2(r); \
+}
+
+#define _0f "xmm15"
+#define _a_save "xmm14"
+#define _b_save "xmm13"
+#define _lt_mod_a "xmm12"
+#define _lt_clmul_a "xmm11"
+#define _lt_mod_b "xmm10"
+#define _lt_clmul_b "xmm15"
+
+#define _MULx2(c, r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 2: \
+ __asm( \
+ /* lts for upper part */ \
+ "movd %[mask], %%" _0f "\n" \
+ "pshufd $0x0, %%" _0f ", %%" _0f "\n" \
+ "movdqa 0x00(%[lt]), %%" _lt_mod_a "\n" \
+ "movdqa 0x10(%[lt]), %%" _lt_clmul_a "\n" \
+ /* upper part */ \
+ "movdqa %%" VR0(r) ", %%" _a_save "\n" \
+ "movdqa %%" VR1(r) ", %%" _b_save "\n" \
+ "psraw $0x4, %%" VR0(r) "\n" \
+ "psraw $0x4, %%" VR1(r) "\n" \
+ "pand %%" _0f ", %%" _a_save "\n" \
+ "pand %%" _0f ", %%" _b_save "\n" \
+ "pand %%" _0f ", %%" VR0(r) "\n" \
+ "pand %%" _0f ", %%" VR1(r) "\n" \
+ \
+ "movdqa %%" _lt_mod_a ", %%" _lt_mod_b "\n" \
+ "movdqa %%" _lt_clmul_a ", %%" _lt_clmul_b "\n" \
+ \
+ "pshufb %%" VR0(r) ",%%" _lt_mod_a "\n" \
+ "pshufb %%" VR1(r) ",%%" _lt_mod_b "\n" \
+ "pshufb %%" VR0(r) ",%%" _lt_clmul_a "\n" \
+ "pshufb %%" VR1(r) ",%%" _lt_clmul_b "\n" \
+ \
+ "pxor %%" _lt_mod_a ",%%" _lt_clmul_a "\n" \
+ "pxor %%" _lt_mod_b ",%%" _lt_clmul_b "\n" \
+ "movdqa %%" _lt_clmul_a ",%%" VR0(r) "\n" \
+ "movdqa %%" _lt_clmul_b ",%%" VR1(r) "\n" \
+ /* lts for lower part */ \
+ "movdqa 0x20(%[lt]), %%" _lt_mod_a "\n" \
+ "movdqa 0x30(%[lt]), %%" _lt_clmul_a "\n" \
+ "movdqa %%" _lt_mod_a ", %%" _lt_mod_b "\n" \
+ "movdqa %%" _lt_clmul_a ", %%" _lt_clmul_b "\n" \
+ /* lower part */ \
+ "pshufb %%" _a_save ",%%" _lt_mod_a "\n" \
+ "pshufb %%" _b_save ",%%" _lt_mod_b "\n" \
+ "pshufb %%" _a_save ",%%" _lt_clmul_a "\n" \
+ "pshufb %%" _b_save ",%%" _lt_clmul_b "\n" \
+ \
+ "pxor %%" _lt_mod_a ",%%" VR0(r) "\n" \
+ "pxor %%" _lt_mod_b ",%%" VR1(r) "\n" \
+ "pxor %%" _lt_clmul_a ",%%" VR0(r) "\n" \
+ "pxor %%" _lt_clmul_b ",%%" VR1(r) "\n" \
+ : : [mask] "r" (0x0f0f0f0f), \
+ [lt] "r" (gf_clmul_mod_lt[4*(c)])); \
+ break; \
+ default: \
+ ZFS_ASM_BUG(); \
+ } \
+}
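+
+/*
+ * Editor's sketch (not part of the upstream change): a per-byte reading of
+ * the pshufb sequence above, based on the table offsets used in the asm.
+ * Each byte is split into its high and low nibble, and the product with the
+ * constant 'c' is the XOR of four 16-entry lookups from gf_clmul_mod_lt:
+ * rows 4c+0/4c+1 indexed by the high nibble, rows 4c+2/4c+3 by the low one.
+ */
+static inline uint8_t
+gf_mul_byte_lt_sketch(uint8_t a, unsigned c)
+{
+	const uint8_t *t0 = gf_clmul_mod_lt[4 * c + 0];
+	const uint8_t *t1 = gf_clmul_mod_lt[4 * c + 1];
+	const uint8_t *t2 = gf_clmul_mod_lt[4 * c + 2];
+	const uint8_t *t3 = gf_clmul_mod_lt[4 * c + 3];
+	uint8_t hi = a >> 4;
+	uint8_t lo = a & 0x0f;
+
+	return (t0[hi] ^ t1[hi] ^ t2[lo] ^ t3[lo]);
+}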
+
+#define MUL(c, r...) \
+{ \
+ switch (REG_CNT(r)) { \
+ case 4: \
+ _MULx2(c, R_23(r)); \
+ _MULx2(c, R_01(r)); \
+ break; \
+ case 2: \
+ _MULx2(c, R_01(r)); \
+ break; \
+ default: \
+ ZFS_ASM_BUG(); \
+ } \
+}
+
+#define raidz_math_begin() kfpu_begin()
+#define raidz_math_end() kfpu_end()
+
+
+#define SYN_STRIDE 4
+
+#define ZERO_STRIDE 4
+#define ZERO_DEFINE() {}
+#define ZERO_D 0, 1, 2, 3
+
+#define COPY_STRIDE 4
+#define COPY_DEFINE() {}
+#define COPY_D 0, 1, 2, 3
+
+#define ADD_STRIDE 4
+#define ADD_DEFINE() {}
+#define ADD_D 0, 1, 2, 3
+
+#define MUL_STRIDE 4
+#define MUL_DEFINE() {}
+#define MUL_D 0, 1, 2, 3
+
+#define GEN_P_STRIDE 4
+#define GEN_P_DEFINE() {}
+#define GEN_P_P 0, 1, 2, 3
+
+#define GEN_PQ_STRIDE 4
+#define GEN_PQ_DEFINE() {}
+#define GEN_PQ_D 0, 1, 2, 3
+#define GEN_PQ_C 4, 5, 6, 7
+
+#define GEN_PQR_STRIDE 4
+#define GEN_PQR_DEFINE() {}
+#define GEN_PQR_D 0, 1, 2, 3
+#define GEN_PQR_C 4, 5, 6, 7
+
+#define SYN_Q_DEFINE() {}
+#define SYN_Q_D 0, 1, 2, 3
+#define SYN_Q_X 4, 5, 6, 7
+
+#define SYN_R_DEFINE() {}
+#define SYN_R_D 0, 1, 2, 3
+#define SYN_R_X 4, 5, 6, 7
+
+#define SYN_PQ_DEFINE() {}
+#define SYN_PQ_D 0, 1, 2, 3
+#define SYN_PQ_X 4, 5, 6, 7
+
+#define REC_PQ_STRIDE 2
+#define REC_PQ_DEFINE() {}
+#define REC_PQ_X 0, 1
+#define REC_PQ_Y 2, 3
+#define REC_PQ_T 4, 5
+
+#define SYN_PR_DEFINE() {}
+#define SYN_PR_D 0, 1, 2, 3
+#define SYN_PR_X 4, 5, 6, 7
+
+#define REC_PR_STRIDE 2
+#define REC_PR_DEFINE() {}
+#define REC_PR_X 0, 1
+#define REC_PR_Y 2, 3
+#define REC_PR_T 4, 5
+
+#define SYN_QR_DEFINE() {}
+#define SYN_QR_D 0, 1, 2, 3
+#define SYN_QR_X 4, 5, 6, 7
+
+#define REC_QR_STRIDE 2
+#define REC_QR_DEFINE() {}
+#define REC_QR_X 0, 1
+#define REC_QR_Y 2, 3
+#define REC_QR_T 4, 5
+
+#define SYN_PQR_DEFINE() {}
+#define SYN_PQR_D 0, 1, 2, 3
+#define SYN_PQR_X 4, 5, 6, 7
+
+#define REC_PQR_STRIDE 2
+#define REC_PQR_DEFINE() {}
+#define REC_PQR_X 0, 1
+#define REC_PQR_Y 2, 3
+#define REC_PQR_Z 4, 5
+#define REC_PQR_XS 6, 7
+#define REC_PQR_YS 8, 9
+
+
+#include <sys/vdev_raidz_impl.h>
+#include "vdev_raidz_math_impl.h"
+
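+/*
+ * DEFINE_GEN_METHODS()/DEFINE_REC_METHODS() from vdev_raidz_math_impl.h
+ * instantiate the parity generation and reconstruction routines for this
+ * implementation on top of the primitives configured above.
+ */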
+DEFINE_GEN_METHODS(ssse3);
+DEFINE_REC_METHODS(ssse3);
+
+static boolean_t
+raidz_will_ssse3_work(void)
+{
+ return (kfpu_allowed() && zfs_sse_available() &&
+ zfs_sse2_available() && zfs_ssse3_available());
+}
+
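+/*
+ * Ops table exported to the common RAID-Z math code (sys/vdev_raidz_impl.h).
+ * is_supported() gates whether this implementation may be selected at
+ * runtime; gen/rec reference the methods instantiated above.
+ */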
+const raidz_impl_ops_t vdev_raidz_ssse3_impl = {
+ .init = NULL,
+ .fini = NULL,
+ .gen = RAIDZ_GEN_METHODS(ssse3),
+ .rec = RAIDZ_REC_METHODS(ssse3),
+ .is_supported = &raidz_will_ssse3_work,
+ .name = "ssse3"
+};
+
+#endif /* defined(__x86_64) && defined(HAVE_SSSE3) */
+
+
+#if defined(__x86_64)
+#if defined(HAVE_SSSE3) || defined(HAVE_AVX2) || defined(HAVE_AVX512BW)
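+/*
+ * gf_clmul_mod_lt[] holds, for every multiplier constant c, the four
+ * 16-byte pshufb tables used by _MULx2(): rows 4*c+0 and 4*c+1 are looked
+ * up with the high nibble of each source byte, rows 4*c+2 and 4*c+3 with
+ * the low nibble, and the four results are XORed together.
+ *
+ * Illustrative scalar sketch only (not part of the build, and the helper
+ * name gf_mul_byte is purely hypothetical): multiplying a single byte x
+ * by c the same way the vector code does would look like
+ *
+ *	static uint8_t
+ *	gf_mul_byte(uint8_t x, unsigned c)
+ *	{
+ *		const uint8_t hi = x >> 4, lo = x & 0x0f;
+ *
+ *		return (gf_clmul_mod_lt[4 * c + 0][hi] ^
+ *		    gf_clmul_mod_lt[4 * c + 1][hi] ^
+ *		    gf_clmul_mod_lt[4 * c + 2][lo] ^
+ *		    gf_clmul_mod_lt[4 * c + 3][lo]);
+ *	}
+ */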
+/* BEGIN CSTYLED */
+const uint8_t
+__attribute__((aligned(256))) gf_clmul_mod_lt[4*256][16] =
+{
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x02, 0x04, 0x06, 0x08, 0x0a, 0x0c, 0x0e,
+ 0x10, 0x12, 0x14, 0x16, 0x18, 0x1a, 0x1c, 0x1e },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+ 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x03, 0x06, 0x05, 0x0c, 0x0f, 0x0a, 0x09,
+ 0x18, 0x1b, 0x1e, 0x1d, 0x14, 0x17, 0x12, 0x11 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x04, 0x08, 0x0c, 0x10, 0x14, 0x18, 0x1c,
+ 0x20, 0x24, 0x28, 0x2c, 0x30, 0x34, 0x38, 0x3c },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+ 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x05, 0x0a, 0x0f, 0x14, 0x11, 0x1e, 0x1b,
+ 0x28, 0x2d, 0x22, 0x27, 0x3c, 0x39, 0x36, 0x33 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x06, 0x0c, 0x0a, 0x18, 0x1e, 0x14, 0x12,
+ 0x30, 0x36, 0x3c, 0x3a, 0x28, 0x2e, 0x24, 0x22 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+ 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x07, 0x0e, 0x09, 0x1c, 0x1b, 0x12, 0x15,
+ 0x38, 0x3f, 0x36, 0x31, 0x24, 0x23, 0x2a, 0x2d },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x08, 0x10, 0x18, 0x20, 0x28, 0x30, 0x38,
+ 0x40, 0x48, 0x50, 0x58, 0x60, 0x68, 0x70, 0x78 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+ 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x09, 0x12, 0x1b, 0x24, 0x2d, 0x36, 0x3f,
+ 0x48, 0x41, 0x5a, 0x53, 0x6c, 0x65, 0x7e, 0x77 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x0a, 0x14, 0x1e, 0x28, 0x22, 0x3c, 0x36,
+ 0x50, 0x5a, 0x44, 0x4e, 0x78, 0x72, 0x6c, 0x66 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+ 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x0b, 0x16, 0x1d, 0x2c, 0x27, 0x3a, 0x31,
+ 0x58, 0x53, 0x4e, 0x45, 0x74, 0x7f, 0x62, 0x69 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x0c, 0x18, 0x14, 0x30, 0x3c, 0x28, 0x24,
+ 0x60, 0x6c, 0x78, 0x74, 0x50, 0x5c, 0x48, 0x44 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+ 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x0d, 0x1a, 0x17, 0x34, 0x39, 0x2e, 0x23,
+ 0x68, 0x65, 0x72, 0x7f, 0x5c, 0x51, 0x46, 0x4b },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x0e, 0x1c, 0x12, 0x38, 0x36, 0x24, 0x2a,
+ 0x70, 0x7e, 0x6c, 0x62, 0x48, 0x46, 0x54, 0x5a },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+ 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x0f, 0x1e, 0x11, 0x3c, 0x33, 0x22, 0x2d,
+ 0x78, 0x77, 0x66, 0x69, 0x44, 0x4b, 0x5a, 0x55 },
+ { 0x00, 0x1d, 0x3a, 0x27, 0x74, 0x69, 0x4e, 0x53,
+ 0xe8, 0xf5, 0xd2, 0xcf, 0x9c, 0x81, 0xa6, 0xbb },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ { 0x00, 0x1d, 0x3a, 0x27, 0x74, 0x69, 0x4e, 0x53,
+ 0xe8, 0xf5, 0xd2, 0xcf, 0x9c, 0x81, 0xa6, 0xbb },
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77,
+ 0x88, 0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff },
+ { 0x00, 0x1d, 0x3a, 0x27, 0x74, 0x69, 0x4e, 0x53,
+ 0xf5, 0xe8, 0xcf, 0xd2, 0x81, 0x9c, 0xbb, 0xa6 },
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x12, 0x24, 0x36, 0x48, 0x5a, 0x6c, 0x7e,
+ 0x90, 0x82, 0xb4, 0xa6, 0xd8, 0xca, 0xfc, 0xee },
+ { 0x00, 0x1d, 0x3a, 0x27, 0x74, 0x69, 0x4e, 0x53,
+ 0xf5, 0xe8, 0xcf, 0xd2, 0x81, 0x9c, 0xbb, 0xa6 },
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+ 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x13, 0x26, 0x35, 0x4c, 0x5f, 0x6a, 0x79,
+ 0x98, 0x8b, 0xbe, 0xad, 0xd4, 0xc7, 0xf2, 0xe1 },
+ { 0x00, 0x1d, 0x3a, 0x27, 0x69, 0x74, 0x53, 0x4e,
+ 0xd2, 0xcf, 0xe8, 0xf5, 0xbb, 0xa6, 0x81, 0x9c },
+ { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x14, 0x28, 0x3c, 0x50, 0x44, 0x78, 0x6c,
+ 0xa0, 0xb4, 0x88, 0x9c, 0xf0, 0xe4, 0xd8, 0xcc },
+ { 0x00, 0x1d, 0x3a, 0x27, 0x69, 0x74, 0x53, 0x4e,
+ 0xd2, 0xcf, 0xe8, 0xf5, 0xbb, 0xa6, 0x81, 0x9c },
+ { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+ 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x15, 0x2a, 0x3f, 0x54, 0x41, 0x7e, 0x6b,
+ 0xa8, 0xbd, 0x82, 0x97, 0xfc, 0xe9, 0xd6, 0xc3 },
+ { 0x00, 0x1d, 0x3a, 0x27, 0x69, 0x74, 0x53, 0x4e,
+ 0xcf, 0xd2, 0xf5, 0xe8, 0xa6, 0xbb, 0x9c, 0x81 },
+ { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x16, 0x2c, 0x3a, 0x58, 0x4e, 0x74, 0x62,
+ 0xb0, 0xa6, 0x9c, 0x8a, 0xe8, 0xfe, 0xc4, 0xd2 },
+ { 0x00, 0x1d, 0x3a, 0x27, 0x69, 0x74, 0x53, 0x4e,
+ 0xcf, 0xd2, 0xf5, 0xe8, 0xa6, 0xbb, 0x9c, 0x81 },
+ { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+ 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x17, 0x2e, 0x39, 0x5c, 0x4b, 0x72, 0x65,
+ 0xb8, 0xaf, 0x96, 0x81, 0xe4, 0xf3, 0xca, 0xdd },
+ { 0x00, 0x1d, 0x27, 0x3a, 0x4e, 0x53, 0x69, 0x74,
+ 0x9c, 0x81, 0xbb, 0xa6, 0xd2, 0xcf, 0xf5, 0xe8 },
+ { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x18, 0x30, 0x28, 0x60, 0x78, 0x50, 0x48,
+ 0xc0, 0xd8, 0xf0, 0xe8, 0xa0, 0xb8, 0x90, 0x88 },
+ { 0x00, 0x1d, 0x27, 0x3a, 0x4e, 0x53, 0x69, 0x74,
+ 0x9c, 0x81, 0xbb, 0xa6, 0xd2, 0xcf, 0xf5, 0xe8 },
+ { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+ 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x19, 0x32, 0x2b, 0x64, 0x7d, 0x56, 0x4f,
+ 0xc8, 0xd1, 0xfa, 0xe3, 0xac, 0xb5, 0x9e, 0x87 },
+ { 0x00, 0x1d, 0x27, 0x3a, 0x4e, 0x53, 0x69, 0x74,
+ 0x81, 0x9c, 0xa6, 0xbb, 0xcf, 0xd2, 0xe8, 0xf5 },
+ { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x1a, 0x34, 0x2e, 0x68, 0x72, 0x5c, 0x46,
+ 0xd0, 0xca, 0xe4, 0xfe, 0xb8, 0xa2, 0x8c, 0x96 },
+ { 0x00, 0x1d, 0x27, 0x3a, 0x4e, 0x53, 0x69, 0x74,
+ 0x81, 0x9c, 0xa6, 0xbb, 0xcf, 0xd2, 0xe8, 0xf5 },
+ { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+ 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x1b, 0x36, 0x2d, 0x6c, 0x77, 0x5a, 0x41,
+ 0xd8, 0xc3, 0xee, 0xf5, 0xb4, 0xaf, 0x82, 0x99 },
+ { 0x00, 0x1d, 0x27, 0x3a, 0x53, 0x4e, 0x74, 0x69,
+ 0xa6, 0xbb, 0x81, 0x9c, 0xf5, 0xe8, 0xd2, 0xcf },
+ { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x1c, 0x38, 0x24, 0x70, 0x6c, 0x48, 0x54,
+ 0xe0, 0xfc, 0xd8, 0xc4, 0x90, 0x8c, 0xa8, 0xb4 },
+ { 0x00, 0x1d, 0x27, 0x3a, 0x53, 0x4e, 0x74, 0x69,
+ 0xa6, 0xbb, 0x81, 0x9c, 0xf5, 0xe8, 0xd2, 0xcf },
+ { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+ 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x1d, 0x3a, 0x27, 0x74, 0x69, 0x4e, 0x53,
+ 0xe8, 0xf5, 0xd2, 0xcf, 0x9c, 0x81, 0xa6, 0xbb },
+ { 0x00, 0x1d, 0x27, 0x3a, 0x53, 0x4e, 0x74, 0x69,
+ 0xbb, 0xa6, 0x9c, 0x81, 0xe8, 0xf5, 0xcf, 0xd2 },
+ { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x1e, 0x3c, 0x22, 0x78, 0x66, 0x44, 0x5a,
+ 0xf0, 0xee, 0xcc, 0xd2, 0x88, 0x96, 0xb4, 0xaa },
+ { 0x00, 0x1d, 0x27, 0x3a, 0x53, 0x4e, 0x74, 0x69,
+ 0xbb, 0xa6, 0x9c, 0x81, 0xe8, 0xf5, 0xcf, 0xd2 },
+ { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+ 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x1f, 0x3e, 0x21, 0x7c, 0x63, 0x42, 0x5d,
+ 0xf8, 0xe7, 0xc6, 0xd9, 0x84, 0x9b, 0xba, 0xa5 },
+ { 0x00, 0x3a, 0x74, 0x4e, 0xe8, 0xd2, 0x9c, 0xa6,
+ 0xcd, 0xf7, 0xb9, 0x83, 0x25, 0x1f, 0x51, 0x6b },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+ { 0x00, 0x3a, 0x74, 0x4e, 0xe8, 0xd2, 0x9c, 0xa6,
+ 0xcd, 0xf7, 0xb9, 0x83, 0x25, 0x1f, 0x51, 0x6b },
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x21, 0x42, 0x63, 0x84, 0xa5, 0xc6, 0xe7,
+ 0x08, 0x29, 0x4a, 0x6b, 0x8c, 0xad, 0xce, 0xef },
+ { 0x00, 0x3a, 0x74, 0x4e, 0xe8, 0xd2, 0x9c, 0xa6,
+ 0xd0, 0xea, 0xa4, 0x9e, 0x38, 0x02, 0x4c, 0x76 },
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x22, 0x44, 0x66, 0x88, 0xaa, 0xcc, 0xee,
+ 0x10, 0x32, 0x54, 0x76, 0x98, 0xba, 0xdc, 0xfe },
+ { 0x00, 0x3a, 0x74, 0x4e, 0xe8, 0xd2, 0x9c, 0xa6,
+ 0xd0, 0xea, 0xa4, 0x9e, 0x38, 0x02, 0x4c, 0x76 },
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+ 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x23, 0x46, 0x65, 0x8c, 0xaf, 0xca, 0xe9,
+ 0x18, 0x3b, 0x5e, 0x7d, 0x94, 0xb7, 0xd2, 0xf1 },
+ { 0x00, 0x3a, 0x74, 0x4e, 0xf5, 0xcf, 0x81, 0xbb,
+ 0xf7, 0xcd, 0x83, 0xb9, 0x02, 0x38, 0x76, 0x4c },
+ { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x24, 0x48, 0x6c, 0x90, 0xb4, 0xd8, 0xfc,
+ 0x20, 0x04, 0x68, 0x4c, 0xb0, 0x94, 0xf8, 0xdc },
+ { 0x00, 0x3a, 0x74, 0x4e, 0xf5, 0xcf, 0x81, 0xbb,
+ 0xf7, 0xcd, 0x83, 0xb9, 0x02, 0x38, 0x76, 0x4c },
+ { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+ 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x25, 0x4a, 0x6f, 0x94, 0xb1, 0xde, 0xfb,
+ 0x28, 0x0d, 0x62, 0x47, 0xbc, 0x99, 0xf6, 0xd3 },
+ { 0x00, 0x3a, 0x74, 0x4e, 0xf5, 0xcf, 0x81, 0xbb,
+ 0xea, 0xd0, 0x9e, 0xa4, 0x1f, 0x25, 0x6b, 0x51 },
+ { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x26, 0x4c, 0x6a, 0x98, 0xbe, 0xd4, 0xf2,
+ 0x30, 0x16, 0x7c, 0x5a, 0xa8, 0x8e, 0xe4, 0xc2 },
+ { 0x00, 0x3a, 0x74, 0x4e, 0xf5, 0xcf, 0x81, 0xbb,
+ 0xea, 0xd0, 0x9e, 0xa4, 0x1f, 0x25, 0x6b, 0x51 },
+ { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+ 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x27, 0x4e, 0x69, 0x9c, 0xbb, 0xd2, 0xf5,
+ 0x38, 0x1f, 0x76, 0x51, 0xa4, 0x83, 0xea, 0xcd },
+ { 0x00, 0x3a, 0x69, 0x53, 0xd2, 0xe8, 0xbb, 0x81,
+ 0xb9, 0x83, 0xd0, 0xea, 0x6b, 0x51, 0x02, 0x38 },
+ { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x28, 0x50, 0x78, 0xa0, 0x88, 0xf0, 0xd8,
+ 0x40, 0x68, 0x10, 0x38, 0xe0, 0xc8, 0xb0, 0x98 },
+ { 0x00, 0x3a, 0x69, 0x53, 0xd2, 0xe8, 0xbb, 0x81,
+ 0xb9, 0x83, 0xd0, 0xea, 0x6b, 0x51, 0x02, 0x38 },
+ { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+ 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x29, 0x52, 0x7b, 0xa4, 0x8d, 0xf6, 0xdf,
+ 0x48, 0x61, 0x1a, 0x33, 0xec, 0xc5, 0xbe, 0x97 },
+ { 0x00, 0x3a, 0x69, 0x53, 0xd2, 0xe8, 0xbb, 0x81,
+ 0xa4, 0x9e, 0xcd, 0xf7, 0x76, 0x4c, 0x1f, 0x25 },
+ { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x2a, 0x54, 0x7e, 0xa8, 0x82, 0xfc, 0xd6,
+ 0x50, 0x7a, 0x04, 0x2e, 0xf8, 0xd2, 0xac, 0x86 },
+ { 0x00, 0x3a, 0x69, 0x53, 0xd2, 0xe8, 0xbb, 0x81,
+ 0xa4, 0x9e, 0xcd, 0xf7, 0x76, 0x4c, 0x1f, 0x25 },
+ { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+ 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x2b, 0x56, 0x7d, 0xac, 0x87, 0xfa, 0xd1,
+ 0x58, 0x73, 0x0e, 0x25, 0xf4, 0xdf, 0xa2, 0x89 },
+ { 0x00, 0x3a, 0x69, 0x53, 0xcf, 0xf5, 0xa6, 0x9c,
+ 0x83, 0xb9, 0xea, 0xd0, 0x4c, 0x76, 0x25, 0x1f },
+ { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x2c, 0x58, 0x74, 0xb0, 0x9c, 0xe8, 0xc4,
+ 0x60, 0x4c, 0x38, 0x14, 0xd0, 0xfc, 0x88, 0xa4 },
+ { 0x00, 0x3a, 0x69, 0x53, 0xcf, 0xf5, 0xa6, 0x9c,
+ 0x83, 0xb9, 0xea, 0xd0, 0x4c, 0x76, 0x25, 0x1f },
+ { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+ 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x2d, 0x5a, 0x77, 0xb4, 0x99, 0xee, 0xc3,
+ 0x68, 0x45, 0x32, 0x1f, 0xdc, 0xf1, 0x86, 0xab },
+ { 0x00, 0x3a, 0x69, 0x53, 0xcf, 0xf5, 0xa6, 0x9c,
+ 0x9e, 0xa4, 0xf7, 0xcd, 0x51, 0x6b, 0x38, 0x02 },
+ { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x2e, 0x5c, 0x72, 0xb8, 0x96, 0xe4, 0xca,
+ 0x70, 0x5e, 0x2c, 0x02, 0xc8, 0xe6, 0x94, 0xba },
+ { 0x00, 0x3a, 0x69, 0x53, 0xcf, 0xf5, 0xa6, 0x9c,
+ 0x9e, 0xa4, 0xf7, 0xcd, 0x51, 0x6b, 0x38, 0x02 },
+ { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+ 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x2f, 0x5e, 0x71, 0xbc, 0x93, 0xe2, 0xcd,
+ 0x78, 0x57, 0x26, 0x09, 0xc4, 0xeb, 0x9a, 0xb5 },
+ { 0x00, 0x27, 0x4e, 0x69, 0x9c, 0xbb, 0xd2, 0xf5,
+ 0x25, 0x02, 0x6b, 0x4c, 0xb9, 0x9e, 0xf7, 0xd0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+ 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+ { 0x00, 0x27, 0x4e, 0x69, 0x9c, 0xbb, 0xd2, 0xf5,
+ 0x25, 0x02, 0x6b, 0x4c, 0xb9, 0x9e, 0xf7, 0xd0 },
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x31, 0x62, 0x53, 0xc4, 0xf5, 0xa6, 0x97,
+ 0x88, 0xb9, 0xea, 0xdb, 0x4c, 0x7d, 0x2e, 0x1f },
+ { 0x00, 0x27, 0x4e, 0x69, 0x9c, 0xbb, 0xd2, 0xf5,
+ 0x38, 0x1f, 0x76, 0x51, 0xa4, 0x83, 0xea, 0xcd },
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x32, 0x64, 0x56, 0xc8, 0xfa, 0xac, 0x9e,
+ 0x90, 0xa2, 0xf4, 0xc6, 0x58, 0x6a, 0x3c, 0x0e },
+ { 0x00, 0x27, 0x4e, 0x69, 0x9c, 0xbb, 0xd2, 0xf5,
+ 0x38, 0x1f, 0x76, 0x51, 0xa4, 0x83, 0xea, 0xcd },
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+ 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x33, 0x66, 0x55, 0xcc, 0xff, 0xaa, 0x99,
+ 0x98, 0xab, 0xfe, 0xcd, 0x54, 0x67, 0x32, 0x01 },
+ { 0x00, 0x27, 0x4e, 0x69, 0x81, 0xa6, 0xcf, 0xe8,
+ 0x1f, 0x38, 0x51, 0x76, 0x9e, 0xb9, 0xd0, 0xf7 },
+ { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x34, 0x68, 0x5c, 0xd0, 0xe4, 0xb8, 0x8c,
+ 0xa0, 0x94, 0xc8, 0xfc, 0x70, 0x44, 0x18, 0x2c },
+ { 0x00, 0x27, 0x4e, 0x69, 0x81, 0xa6, 0xcf, 0xe8,
+ 0x1f, 0x38, 0x51, 0x76, 0x9e, 0xb9, 0xd0, 0xf7 },
+ { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+ 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x35, 0x6a, 0x5f, 0xd4, 0xe1, 0xbe, 0x8b,
+ 0xa8, 0x9d, 0xc2, 0xf7, 0x7c, 0x49, 0x16, 0x23 },
+ { 0x00, 0x27, 0x4e, 0x69, 0x81, 0xa6, 0xcf, 0xe8,
+ 0x02, 0x25, 0x4c, 0x6b, 0x83, 0xa4, 0xcd, 0xea },
+ { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x36, 0x6c, 0x5a, 0xd8, 0xee, 0xb4, 0x82,
+ 0xb0, 0x86, 0xdc, 0xea, 0x68, 0x5e, 0x04, 0x32 },
+ { 0x00, 0x27, 0x4e, 0x69, 0x81, 0xa6, 0xcf, 0xe8,
+ 0x02, 0x25, 0x4c, 0x6b, 0x83, 0xa4, 0xcd, 0xea },
+ { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+ 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x37, 0x6e, 0x59, 0xdc, 0xeb, 0xb2, 0x85,
+ 0xb8, 0x8f, 0xd6, 0xe1, 0x64, 0x53, 0x0a, 0x3d },
+ { 0x00, 0x27, 0x53, 0x74, 0xa6, 0x81, 0xf5, 0xd2,
+ 0x51, 0x76, 0x02, 0x25, 0xf7, 0xd0, 0xa4, 0x83 },
+ { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x38, 0x70, 0x48, 0xe0, 0xd8, 0x90, 0xa8,
+ 0xc0, 0xf8, 0xb0, 0x88, 0x20, 0x18, 0x50, 0x68 },
+ { 0x00, 0x27, 0x53, 0x74, 0xa6, 0x81, 0xf5, 0xd2,
+ 0x51, 0x76, 0x02, 0x25, 0xf7, 0xd0, 0xa4, 0x83 },
+ { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+ 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x39, 0x72, 0x4b, 0xe4, 0xdd, 0x96, 0xaf,
+ 0xc8, 0xf1, 0xba, 0x83, 0x2c, 0x15, 0x5e, 0x67 },
+ { 0x00, 0x27, 0x53, 0x74, 0xa6, 0x81, 0xf5, 0xd2,
+ 0x4c, 0x6b, 0x1f, 0x38, 0xea, 0xcd, 0xb9, 0x9e },
+ { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x3a, 0x74, 0x4e, 0xe8, 0xd2, 0x9c, 0xa6,
+ 0xd0, 0xea, 0xa4, 0x9e, 0x38, 0x02, 0x4c, 0x76 },
+ { 0x00, 0x27, 0x53, 0x74, 0xa6, 0x81, 0xf5, 0xd2,
+ 0x4c, 0x6b, 0x1f, 0x38, 0xea, 0xcd, 0xb9, 0x9e },
+ { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+ 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x3b, 0x76, 0x4d, 0xec, 0xd7, 0x9a, 0xa1,
+ 0xd8, 0xe3, 0xae, 0x95, 0x34, 0x0f, 0x42, 0x79 },
+ { 0x00, 0x27, 0x53, 0x74, 0xbb, 0x9c, 0xe8, 0xcf,
+ 0x6b, 0x4c, 0x38, 0x1f, 0xd0, 0xf7, 0x83, 0xa4 },
+ { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x3c, 0x78, 0x44, 0xf0, 0xcc, 0x88, 0xb4,
+ 0xe0, 0xdc, 0x98, 0xa4, 0x10, 0x2c, 0x68, 0x54 },
+ { 0x00, 0x27, 0x53, 0x74, 0xbb, 0x9c, 0xe8, 0xcf,
+ 0x6b, 0x4c, 0x38, 0x1f, 0xd0, 0xf7, 0x83, 0xa4 },
+ { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+ 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x3d, 0x7a, 0x47, 0xf4, 0xc9, 0x8e, 0xb3,
+ 0xe8, 0xd5, 0x92, 0xaf, 0x1c, 0x21, 0x66, 0x5b },
+ { 0x00, 0x27, 0x53, 0x74, 0xbb, 0x9c, 0xe8, 0xcf,
+ 0x76, 0x51, 0x25, 0x02, 0xcd, 0xea, 0x9e, 0xb9 },
+ { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x3e, 0x7c, 0x42, 0xf8, 0xc6, 0x84, 0xba,
+ 0xf0, 0xce, 0x8c, 0xb2, 0x08, 0x36, 0x74, 0x4a },
+ { 0x00, 0x27, 0x53, 0x74, 0xbb, 0x9c, 0xe8, 0xcf,
+ 0x76, 0x51, 0x25, 0x02, 0xcd, 0xea, 0x9e, 0xb9 },
+ { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+ 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+ { 0x00, 0x3f, 0x7e, 0x41, 0xfc, 0xc3, 0x82, 0xbd,
+ 0xf8, 0xc7, 0x86, 0xb9, 0x04, 0x3b, 0x7a, 0x45 },
+ { 0x00, 0x74, 0xe8, 0x9c, 0xcd, 0xb9, 0x25, 0x51,
+ 0x87, 0xf3, 0x6f, 0x1b, 0x4a, 0x3e, 0xa2, 0xd6 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+ { 0x00, 0x74, 0xe8, 0x9c, 0xcd, 0xb9, 0x25, 0x51,
+ 0x87, 0xf3, 0x6f, 0x1b, 0x4a, 0x3e, 0xa2, 0xd6 },
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x41, 0x82, 0xc3, 0x04, 0x45, 0x86, 0xc7,
+ 0x08, 0x49, 0x8a, 0xcb, 0x0c, 0x4d, 0x8e, 0xcf },
+ { 0x00, 0x74, 0xe8, 0x9c, 0xcd, 0xb9, 0x25, 0x51,
+ 0x9a, 0xee, 0x72, 0x06, 0x57, 0x23, 0xbf, 0xcb },
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x42, 0x84, 0xc6, 0x08, 0x4a, 0x8c, 0xce,
+ 0x10, 0x52, 0x94, 0xd6, 0x18, 0x5a, 0x9c, 0xde },
+ { 0x00, 0x74, 0xe8, 0x9c, 0xcd, 0xb9, 0x25, 0x51,
+ 0x9a, 0xee, 0x72, 0x06, 0x57, 0x23, 0xbf, 0xcb },
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+ 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x43, 0x86, 0xc5, 0x0c, 0x4f, 0x8a, 0xc9,
+ 0x18, 0x5b, 0x9e, 0xdd, 0x14, 0x57, 0x92, 0xd1 },
+ { 0x00, 0x74, 0xe8, 0x9c, 0xd0, 0xa4, 0x38, 0x4c,
+ 0xbd, 0xc9, 0x55, 0x21, 0x6d, 0x19, 0x85, 0xf1 },
+ { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x44, 0x88, 0xcc, 0x10, 0x54, 0x98, 0xdc,
+ 0x20, 0x64, 0xa8, 0xec, 0x30, 0x74, 0xb8, 0xfc },
+ { 0x00, 0x74, 0xe8, 0x9c, 0xd0, 0xa4, 0x38, 0x4c,
+ 0xbd, 0xc9, 0x55, 0x21, 0x6d, 0x19, 0x85, 0xf1 },
+ { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+ 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x45, 0x8a, 0xcf, 0x14, 0x51, 0x9e, 0xdb,
+ 0x28, 0x6d, 0xa2, 0xe7, 0x3c, 0x79, 0xb6, 0xf3 },
+ { 0x00, 0x74, 0xe8, 0x9c, 0xd0, 0xa4, 0x38, 0x4c,
+ 0xa0, 0xd4, 0x48, 0x3c, 0x70, 0x04, 0x98, 0xec },
+ { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x46, 0x8c, 0xca, 0x18, 0x5e, 0x94, 0xd2,
+ 0x30, 0x76, 0xbc, 0xfa, 0x28, 0x6e, 0xa4, 0xe2 },
+ { 0x00, 0x74, 0xe8, 0x9c, 0xd0, 0xa4, 0x38, 0x4c,
+ 0xa0, 0xd4, 0x48, 0x3c, 0x70, 0x04, 0x98, 0xec },
+ { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+ 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x47, 0x8e, 0xc9, 0x1c, 0x5b, 0x92, 0xd5,
+ 0x38, 0x7f, 0xb6, 0xf1, 0x24, 0x63, 0xaa, 0xed },
+ { 0x00, 0x74, 0xf5, 0x81, 0xf7, 0x83, 0x02, 0x76,
+ 0xf3, 0x87, 0x06, 0x72, 0x04, 0x70, 0xf1, 0x85 },
+ { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x48, 0x90, 0xd8, 0x20, 0x68, 0xb0, 0xf8,
+ 0x40, 0x08, 0xd0, 0x98, 0x60, 0x28, 0xf0, 0xb8 },
+ { 0x00, 0x74, 0xf5, 0x81, 0xf7, 0x83, 0x02, 0x76,
+ 0xf3, 0x87, 0x06, 0x72, 0x04, 0x70, 0xf1, 0x85 },
+ { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+ 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x49, 0x92, 0xdb, 0x24, 0x6d, 0xb6, 0xff,
+ 0x48, 0x01, 0xda, 0x93, 0x6c, 0x25, 0xfe, 0xb7 },
+ { 0x00, 0x74, 0xf5, 0x81, 0xf7, 0x83, 0x02, 0x76,
+ 0xee, 0x9a, 0x1b, 0x6f, 0x19, 0x6d, 0xec, 0x98 },
+ { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x4a, 0x94, 0xde, 0x28, 0x62, 0xbc, 0xf6,
+ 0x50, 0x1a, 0xc4, 0x8e, 0x78, 0x32, 0xec, 0xa6 },
+ { 0x00, 0x74, 0xf5, 0x81, 0xf7, 0x83, 0x02, 0x76,
+ 0xee, 0x9a, 0x1b, 0x6f, 0x19, 0x6d, 0xec, 0x98 },
+ { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+ 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x4b, 0x96, 0xdd, 0x2c, 0x67, 0xba, 0xf1,
+ 0x58, 0x13, 0xce, 0x85, 0x74, 0x3f, 0xe2, 0xa9 },
+ { 0x00, 0x74, 0xf5, 0x81, 0xea, 0x9e, 0x1f, 0x6b,
+ 0xc9, 0xbd, 0x3c, 0x48, 0x23, 0x57, 0xd6, 0xa2 },
+ { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x4c, 0x98, 0xd4, 0x30, 0x7c, 0xa8, 0xe4,
+ 0x60, 0x2c, 0xf8, 0xb4, 0x50, 0x1c, 0xc8, 0x84 },
+ { 0x00, 0x74, 0xf5, 0x81, 0xea, 0x9e, 0x1f, 0x6b,
+ 0xc9, 0xbd, 0x3c, 0x48, 0x23, 0x57, 0xd6, 0xa2 },
+ { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+ 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x4d, 0x9a, 0xd7, 0x34, 0x79, 0xae, 0xe3,
+ 0x68, 0x25, 0xf2, 0xbf, 0x5c, 0x11, 0xc6, 0x8b },
+ { 0x00, 0x74, 0xf5, 0x81, 0xea, 0x9e, 0x1f, 0x6b,
+ 0xd4, 0xa0, 0x21, 0x55, 0x3e, 0x4a, 0xcb, 0xbf },
+ { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x4e, 0x9c, 0xd2, 0x38, 0x76, 0xa4, 0xea,
+ 0x70, 0x3e, 0xec, 0xa2, 0x48, 0x06, 0xd4, 0x9a },
+ { 0x00, 0x74, 0xf5, 0x81, 0xea, 0x9e, 0x1f, 0x6b,
+ 0xd4, 0xa0, 0x21, 0x55, 0x3e, 0x4a, 0xcb, 0xbf },
+ { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+ 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x4f, 0x9e, 0xd1, 0x3c, 0x73, 0xa2, 0xed,
+ 0x78, 0x37, 0xe6, 0xa9, 0x44, 0x0b, 0xda, 0x95 },
+ { 0x00, 0x69, 0xd2, 0xbb, 0xb9, 0xd0, 0x6b, 0x02,
+ 0x6f, 0x06, 0xbd, 0xd4, 0xd6, 0xbf, 0x04, 0x6d },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+ 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+ { 0x00, 0x69, 0xd2, 0xbb, 0xb9, 0xd0, 0x6b, 0x02,
+ 0x6f, 0x06, 0xbd, 0xd4, 0xd6, 0xbf, 0x04, 0x6d },
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x51, 0xa2, 0xf3, 0x44, 0x15, 0xe6, 0xb7,
+ 0x88, 0xd9, 0x2a, 0x7b, 0xcc, 0x9d, 0x6e, 0x3f },
+ { 0x00, 0x69, 0xd2, 0xbb, 0xb9, 0xd0, 0x6b, 0x02,
+ 0x72, 0x1b, 0xa0, 0xc9, 0xcb, 0xa2, 0x19, 0x70 },
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x52, 0xa4, 0xf6, 0x48, 0x1a, 0xec, 0xbe,
+ 0x90, 0xc2, 0x34, 0x66, 0xd8, 0x8a, 0x7c, 0x2e },
+ { 0x00, 0x69, 0xd2, 0xbb, 0xb9, 0xd0, 0x6b, 0x02,
+ 0x72, 0x1b, 0xa0, 0xc9, 0xcb, 0xa2, 0x19, 0x70 },
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+ 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x53, 0xa6, 0xf5, 0x4c, 0x1f, 0xea, 0xb9,
+ 0x98, 0xcb, 0x3e, 0x6d, 0xd4, 0x87, 0x72, 0x21 },
+ { 0x00, 0x69, 0xd2, 0xbb, 0xa4, 0xcd, 0x76, 0x1f,
+ 0x55, 0x3c, 0x87, 0xee, 0xf1, 0x98, 0x23, 0x4a },
+ { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x54, 0xa8, 0xfc, 0x50, 0x04, 0xf8, 0xac,
+ 0xa0, 0xf4, 0x08, 0x5c, 0xf0, 0xa4, 0x58, 0x0c },
+ { 0x00, 0x69, 0xd2, 0xbb, 0xa4, 0xcd, 0x76, 0x1f,
+ 0x55, 0x3c, 0x87, 0xee, 0xf1, 0x98, 0x23, 0x4a },
+ { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+ 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x55, 0xaa, 0xff, 0x54, 0x01, 0xfe, 0xab,
+ 0xa8, 0xfd, 0x02, 0x57, 0xfc, 0xa9, 0x56, 0x03 },
+ { 0x00, 0x69, 0xd2, 0xbb, 0xa4, 0xcd, 0x76, 0x1f,
+ 0x48, 0x21, 0x9a, 0xf3, 0xec, 0x85, 0x3e, 0x57 },
+ { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x56, 0xac, 0xfa, 0x58, 0x0e, 0xf4, 0xa2,
+ 0xb0, 0xe6, 0x1c, 0x4a, 0xe8, 0xbe, 0x44, 0x12 },
+ { 0x00, 0x69, 0xd2, 0xbb, 0xa4, 0xcd, 0x76, 0x1f,
+ 0x48, 0x21, 0x9a, 0xf3, 0xec, 0x85, 0x3e, 0x57 },
+ { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+ 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x57, 0xae, 0xf9, 0x5c, 0x0b, 0xf2, 0xa5,
+ 0xb8, 0xef, 0x16, 0x41, 0xe4, 0xb3, 0x4a, 0x1d },
+ { 0x00, 0x69, 0xcf, 0xa6, 0x83, 0xea, 0x4c, 0x25,
+ 0x1b, 0x72, 0xd4, 0xbd, 0x98, 0xf1, 0x57, 0x3e },
+ { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x58, 0xb0, 0xe8, 0x60, 0x38, 0xd0, 0x88,
+ 0xc0, 0x98, 0x70, 0x28, 0xa0, 0xf8, 0x10, 0x48 },
+ { 0x00, 0x69, 0xcf, 0xa6, 0x83, 0xea, 0x4c, 0x25,
+ 0x1b, 0x72, 0xd4, 0xbd, 0x98, 0xf1, 0x57, 0x3e },
+ { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+ 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x59, 0xb2, 0xeb, 0x64, 0x3d, 0xd6, 0x8f,
+ 0xc8, 0x91, 0x7a, 0x23, 0xac, 0xf5, 0x1e, 0x47 },
+ { 0x00, 0x69, 0xcf, 0xa6, 0x83, 0xea, 0x4c, 0x25,
+ 0x06, 0x6f, 0xc9, 0xa0, 0x85, 0xec, 0x4a, 0x23 },
+ { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x5a, 0xb4, 0xee, 0x68, 0x32, 0xdc, 0x86,
+ 0xd0, 0x8a, 0x64, 0x3e, 0xb8, 0xe2, 0x0c, 0x56 },
+ { 0x00, 0x69, 0xcf, 0xa6, 0x83, 0xea, 0x4c, 0x25,
+ 0x06, 0x6f, 0xc9, 0xa0, 0x85, 0xec, 0x4a, 0x23 },
+ { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+ 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x5b, 0xb6, 0xed, 0x6c, 0x37, 0xda, 0x81,
+ 0xd8, 0x83, 0x6e, 0x35, 0xb4, 0xef, 0x02, 0x59 },
+ { 0x00, 0x69, 0xcf, 0xa6, 0x9e, 0xf7, 0x51, 0x38,
+ 0x21, 0x48, 0xee, 0x87, 0xbf, 0xd6, 0x70, 0x19 },
+ { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x5c, 0xb8, 0xe4, 0x70, 0x2c, 0xc8, 0x94,
+ 0xe0, 0xbc, 0x58, 0x04, 0x90, 0xcc, 0x28, 0x74 },
+ { 0x00, 0x69, 0xcf, 0xa6, 0x9e, 0xf7, 0x51, 0x38,
+ 0x21, 0x48, 0xee, 0x87, 0xbf, 0xd6, 0x70, 0x19 },
+ { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+ 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x5d, 0xba, 0xe7, 0x74, 0x29, 0xce, 0x93,
+ 0xe8, 0xb5, 0x52, 0x0f, 0x9c, 0xc1, 0x26, 0x7b },
+ { 0x00, 0x69, 0xcf, 0xa6, 0x9e, 0xf7, 0x51, 0x38,
+ 0x3c, 0x55, 0xf3, 0x9a, 0xa2, 0xcb, 0x6d, 0x04 },
+ { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x5e, 0xbc, 0xe2, 0x78, 0x26, 0xc4, 0x9a,
+ 0xf0, 0xae, 0x4c, 0x12, 0x88, 0xd6, 0x34, 0x6a },
+ { 0x00, 0x69, 0xcf, 0xa6, 0x9e, 0xf7, 0x51, 0x38,
+ 0x3c, 0x55, 0xf3, 0x9a, 0xa2, 0xcb, 0x6d, 0x04 },
+ { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+ 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+ { 0x00, 0x5f, 0xbe, 0xe1, 0x7c, 0x23, 0xc2, 0x9d,
+ 0xf8, 0xa7, 0x46, 0x19, 0x84, 0xdb, 0x3a, 0x65 },
+ { 0x00, 0x4e, 0x9c, 0xd2, 0x25, 0x6b, 0xb9, 0xf7,
+ 0x4a, 0x04, 0xd6, 0x98, 0x6f, 0x21, 0xf3, 0xbd },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+ { 0x00, 0x4e, 0x9c, 0xd2, 0x25, 0x6b, 0xb9, 0xf7,
+ 0x4a, 0x04, 0xd6, 0x98, 0x6f, 0x21, 0xf3, 0xbd },
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x61, 0xc2, 0xa3, 0x84, 0xe5, 0x46, 0x27,
+ 0x08, 0x69, 0xca, 0xab, 0x8c, 0xed, 0x4e, 0x2f },
+ { 0x00, 0x4e, 0x9c, 0xd2, 0x25, 0x6b, 0xb9, 0xf7,
+ 0x57, 0x19, 0xcb, 0x85, 0x72, 0x3c, 0xee, 0xa0 },
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x62, 0xc4, 0xa6, 0x88, 0xea, 0x4c, 0x2e,
+ 0x10, 0x72, 0xd4, 0xb6, 0x98, 0xfa, 0x5c, 0x3e },
+ { 0x00, 0x4e, 0x9c, 0xd2, 0x25, 0x6b, 0xb9, 0xf7,
+ 0x57, 0x19, 0xcb, 0x85, 0x72, 0x3c, 0xee, 0xa0 },
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+ 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x63, 0xc6, 0xa5, 0x8c, 0xef, 0x4a, 0x29,
+ 0x18, 0x7b, 0xde, 0xbd, 0x94, 0xf7, 0x52, 0x31 },
+ { 0x00, 0x4e, 0x9c, 0xd2, 0x38, 0x76, 0xa4, 0xea,
+ 0x70, 0x3e, 0xec, 0xa2, 0x48, 0x06, 0xd4, 0x9a },
+ { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x64, 0xc8, 0xac, 0x90, 0xf4, 0x58, 0x3c,
+ 0x20, 0x44, 0xe8, 0x8c, 0xb0, 0xd4, 0x78, 0x1c },
+ { 0x00, 0x4e, 0x9c, 0xd2, 0x38, 0x76, 0xa4, 0xea,
+ 0x70, 0x3e, 0xec, 0xa2, 0x48, 0x06, 0xd4, 0x9a },
+ { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+ 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x65, 0xca, 0xaf, 0x94, 0xf1, 0x5e, 0x3b,
+ 0x28, 0x4d, 0xe2, 0x87, 0xbc, 0xd9, 0x76, 0x13 },
+ { 0x00, 0x4e, 0x9c, 0xd2, 0x38, 0x76, 0xa4, 0xea,
+ 0x6d, 0x23, 0xf1, 0xbf, 0x55, 0x1b, 0xc9, 0x87 },
+ { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x66, 0xcc, 0xaa, 0x98, 0xfe, 0x54, 0x32,
+ 0x30, 0x56, 0xfc, 0x9a, 0xa8, 0xce, 0x64, 0x02 },
+ { 0x00, 0x4e, 0x9c, 0xd2, 0x38, 0x76, 0xa4, 0xea,
+ 0x6d, 0x23, 0xf1, 0xbf, 0x55, 0x1b, 0xc9, 0x87 },
+ { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+ 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x67, 0xce, 0xa9, 0x9c, 0xfb, 0x52, 0x35,
+ 0x38, 0x5f, 0xf6, 0x91, 0xa4, 0xc3, 0x6a, 0x0d },
+ { 0x00, 0x4e, 0x81, 0xcf, 0x1f, 0x51, 0x9e, 0xd0,
+ 0x3e, 0x70, 0xbf, 0xf1, 0x21, 0x6f, 0xa0, 0xee },
+ { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x68, 0xd0, 0xb8, 0xa0, 0xc8, 0x70, 0x18,
+ 0x40, 0x28, 0x90, 0xf8, 0xe0, 0x88, 0x30, 0x58 },
+ { 0x00, 0x4e, 0x81, 0xcf, 0x1f, 0x51, 0x9e, 0xd0,
+ 0x3e, 0x70, 0xbf, 0xf1, 0x21, 0x6f, 0xa0, 0xee },
+ { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+ 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x69, 0xd2, 0xbb, 0xa4, 0xcd, 0x76, 0x1f,
+ 0x48, 0x21, 0x9a, 0xf3, 0xec, 0x85, 0x3e, 0x57 },
+ { 0x00, 0x4e, 0x81, 0xcf, 0x1f, 0x51, 0x9e, 0xd0,
+ 0x23, 0x6d, 0xa2, 0xec, 0x3c, 0x72, 0xbd, 0xf3 },
+ { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x6a, 0xd4, 0xbe, 0xa8, 0xc2, 0x7c, 0x16,
+ 0x50, 0x3a, 0x84, 0xee, 0xf8, 0x92, 0x2c, 0x46 },
+ { 0x00, 0x4e, 0x81, 0xcf, 0x1f, 0x51, 0x9e, 0xd0,
+ 0x23, 0x6d, 0xa2, 0xec, 0x3c, 0x72, 0xbd, 0xf3 },
+ { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+ 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x6b, 0xd6, 0xbd, 0xac, 0xc7, 0x7a, 0x11,
+ 0x58, 0x33, 0x8e, 0xe5, 0xf4, 0x9f, 0x22, 0x49 },
+ { 0x00, 0x4e, 0x81, 0xcf, 0x02, 0x4c, 0x83, 0xcd,
+ 0x04, 0x4a, 0x85, 0xcb, 0x06, 0x48, 0x87, 0xc9 },
+ { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x6c, 0xd8, 0xb4, 0xb0, 0xdc, 0x68, 0x04,
+ 0x60, 0x0c, 0xb8, 0xd4, 0xd0, 0xbc, 0x08, 0x64 },
+ { 0x00, 0x4e, 0x81, 0xcf, 0x02, 0x4c, 0x83, 0xcd,
+ 0x04, 0x4a, 0x85, 0xcb, 0x06, 0x48, 0x87, 0xc9 },
+ { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+ 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x6d, 0xda, 0xb7, 0xb4, 0xd9, 0x6e, 0x03,
+ 0x68, 0x05, 0xb2, 0xdf, 0xdc, 0xb1, 0x06, 0x6b },
+ { 0x00, 0x4e, 0x81, 0xcf, 0x02, 0x4c, 0x83, 0xcd,
+ 0x19, 0x57, 0x98, 0xd6, 0x1b, 0x55, 0x9a, 0xd4 },
+ { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x6e, 0xdc, 0xb2, 0xb8, 0xd6, 0x64, 0x0a,
+ 0x70, 0x1e, 0xac, 0xc2, 0xc8, 0xa6, 0x14, 0x7a },
+ { 0x00, 0x4e, 0x81, 0xcf, 0x02, 0x4c, 0x83, 0xcd,
+ 0x19, 0x57, 0x98, 0xd6, 0x1b, 0x55, 0x9a, 0xd4 },
+ { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+ 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x6f, 0xde, 0xb1, 0xbc, 0xd3, 0x62, 0x0d,
+ 0x78, 0x17, 0xa6, 0xc9, 0xc4, 0xab, 0x1a, 0x75 },
+ { 0x00, 0x53, 0xa6, 0xf5, 0x51, 0x02, 0xf7, 0xa4,
+ 0xa2, 0xf1, 0x04, 0x57, 0xf3, 0xa0, 0x55, 0x06 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+ 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+ { 0x00, 0x53, 0xa6, 0xf5, 0x51, 0x02, 0xf7, 0xa4,
+ 0xa2, 0xf1, 0x04, 0x57, 0xf3, 0xa0, 0x55, 0x06 },
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x71, 0xe2, 0x93, 0xc4, 0xb5, 0x26, 0x57,
+ 0x88, 0xf9, 0x6a, 0x1b, 0x4c, 0x3d, 0xae, 0xdf },
+ { 0x00, 0x53, 0xa6, 0xf5, 0x51, 0x02, 0xf7, 0xa4,
+ 0xbf, 0xec, 0x19, 0x4a, 0xee, 0xbd, 0x48, 0x1b },
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x72, 0xe4, 0x96, 0xc8, 0xba, 0x2c, 0x5e,
+ 0x90, 0xe2, 0x74, 0x06, 0x58, 0x2a, 0xbc, 0xce },
+ { 0x00, 0x53, 0xa6, 0xf5, 0x51, 0x02, 0xf7, 0xa4,
+ 0xbf, 0xec, 0x19, 0x4a, 0xee, 0xbd, 0x48, 0x1b },
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+ 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x73, 0xe6, 0x95, 0xcc, 0xbf, 0x2a, 0x59,
+ 0x98, 0xeb, 0x7e, 0x0d, 0x54, 0x27, 0xb2, 0xc1 },
+ { 0x00, 0x53, 0xa6, 0xf5, 0x4c, 0x1f, 0xea, 0xb9,
+ 0x98, 0xcb, 0x3e, 0x6d, 0xd4, 0x87, 0x72, 0x21 },
+ { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x74, 0xe8, 0x9c, 0xd0, 0xa4, 0x38, 0x4c,
+ 0xa0, 0xd4, 0x48, 0x3c, 0x70, 0x04, 0x98, 0xec },
+ { 0x00, 0x53, 0xa6, 0xf5, 0x4c, 0x1f, 0xea, 0xb9,
+ 0x98, 0xcb, 0x3e, 0x6d, 0xd4, 0x87, 0x72, 0x21 },
+ { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+ 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x75, 0xea, 0x9f, 0xd4, 0xa1, 0x3e, 0x4b,
+ 0xa8, 0xdd, 0x42, 0x37, 0x7c, 0x09, 0x96, 0xe3 },
+ { 0x00, 0x53, 0xa6, 0xf5, 0x4c, 0x1f, 0xea, 0xb9,
+ 0x85, 0xd6, 0x23, 0x70, 0xc9, 0x9a, 0x6f, 0x3c },
+ { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x76, 0xec, 0x9a, 0xd8, 0xae, 0x34, 0x42,
+ 0xb0, 0xc6, 0x5c, 0x2a, 0x68, 0x1e, 0x84, 0xf2 },
+ { 0x00, 0x53, 0xa6, 0xf5, 0x4c, 0x1f, 0xea, 0xb9,
+ 0x85, 0xd6, 0x23, 0x70, 0xc9, 0x9a, 0x6f, 0x3c },
+ { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+ 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x77, 0xee, 0x99, 0xdc, 0xab, 0x32, 0x45,
+ 0xb8, 0xcf, 0x56, 0x21, 0x64, 0x13, 0x8a, 0xfd },
+ { 0x00, 0x53, 0xbb, 0xe8, 0x6b, 0x38, 0xd0, 0x83,
+ 0xd6, 0x85, 0x6d, 0x3e, 0xbd, 0xee, 0x06, 0x55 },
+ { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x78, 0xf0, 0x88, 0xe0, 0x98, 0x10, 0x68,
+ 0xc0, 0xb8, 0x30, 0x48, 0x20, 0x58, 0xd0, 0xa8 },
+ { 0x00, 0x53, 0xbb, 0xe8, 0x6b, 0x38, 0xd0, 0x83,
+ 0xd6, 0x85, 0x6d, 0x3e, 0xbd, 0xee, 0x06, 0x55 },
+ { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+ 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x79, 0xf2, 0x8b, 0xe4, 0x9d, 0x16, 0x6f,
+ 0xc8, 0xb1, 0x3a, 0x43, 0x2c, 0x55, 0xde, 0xa7 },
+ { 0x00, 0x53, 0xbb, 0xe8, 0x6b, 0x38, 0xd0, 0x83,
+ 0xcb, 0x98, 0x70, 0x23, 0xa0, 0xf3, 0x1b, 0x48 },
+ { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x7a, 0xf4, 0x8e, 0xe8, 0x92, 0x1c, 0x66,
+ 0xd0, 0xaa, 0x24, 0x5e, 0x38, 0x42, 0xcc, 0xb6 },
+ { 0x00, 0x53, 0xbb, 0xe8, 0x6b, 0x38, 0xd0, 0x83,
+ 0xcb, 0x98, 0x70, 0x23, 0xa0, 0xf3, 0x1b, 0x48 },
+ { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+ 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x7b, 0xf6, 0x8d, 0xec, 0x97, 0x1a, 0x61,
+ 0xd8, 0xa3, 0x2e, 0x55, 0x34, 0x4f, 0xc2, 0xb9 },
+ { 0x00, 0x53, 0xbb, 0xe8, 0x76, 0x25, 0xcd, 0x9e,
+ 0xec, 0xbf, 0x57, 0x04, 0x9a, 0xc9, 0x21, 0x72 },
+ { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x7c, 0xf8, 0x84, 0xf0, 0x8c, 0x08, 0x74,
+ 0xe0, 0x9c, 0x18, 0x64, 0x10, 0x6c, 0xe8, 0x94 },
+ { 0x00, 0x53, 0xbb, 0xe8, 0x76, 0x25, 0xcd, 0x9e,
+ 0xec, 0xbf, 0x57, 0x04, 0x9a, 0xc9, 0x21, 0x72 },
+ { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+ 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x7d, 0xfa, 0x87, 0xf4, 0x89, 0x0e, 0x73,
+ 0xe8, 0x95, 0x12, 0x6f, 0x1c, 0x61, 0xe6, 0x9b },
+ { 0x00, 0x53, 0xbb, 0xe8, 0x76, 0x25, 0xcd, 0x9e,
+ 0xf1, 0xa2, 0x4a, 0x19, 0x87, 0xd4, 0x3c, 0x6f },
+ { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x7e, 0xfc, 0x82, 0xf8, 0x86, 0x04, 0x7a,
+ 0xf0, 0x8e, 0x0c, 0x72, 0x08, 0x76, 0xf4, 0x8a },
+ { 0x00, 0x53, 0xbb, 0xe8, 0x76, 0x25, 0xcd, 0x9e,
+ 0xf1, 0xa2, 0x4a, 0x19, 0x87, 0xd4, 0x3c, 0x6f },
+ { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+ 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+ { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+ 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+ { 0x00, 0x7f, 0xfe, 0x81, 0xfc, 0x83, 0x02, 0x7d,
+ 0xf8, 0x87, 0x06, 0x79, 0x04, 0x7b, 0xfa, 0x85 },
+ { 0x00, 0xe8, 0xcd, 0x25, 0x87, 0x6f, 0x4a, 0xa2,
+ 0x13, 0xfb, 0xde, 0x36, 0x94, 0x7c, 0x59, 0xb1 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+ { 0x00, 0xe8, 0xcd, 0x25, 0x87, 0x6f, 0x4a, 0xa2,
+ 0x13, 0xfb, 0xde, 0x36, 0x94, 0x7c, 0x59, 0xb1 },
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x81, 0x02, 0x83, 0x04, 0x85, 0x06, 0x87,
+ 0x08, 0x89, 0x0a, 0x8b, 0x0c, 0x8d, 0x0e, 0x8f },
+ { 0x00, 0xe8, 0xcd, 0x25, 0x87, 0x6f, 0x4a, 0xa2,
+ 0x0e, 0xe6, 0xc3, 0x2b, 0x89, 0x61, 0x44, 0xac },
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x82, 0x04, 0x86, 0x08, 0x8a, 0x0c, 0x8e,
+ 0x10, 0x92, 0x14, 0x96, 0x18, 0x9a, 0x1c, 0x9e },
+ { 0x00, 0xe8, 0xcd, 0x25, 0x87, 0x6f, 0x4a, 0xa2,
+ 0x0e, 0xe6, 0xc3, 0x2b, 0x89, 0x61, 0x44, 0xac },
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+ 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x83, 0x06, 0x85, 0x0c, 0x8f, 0x0a, 0x89,
+ 0x18, 0x9b, 0x1e, 0x9d, 0x14, 0x97, 0x12, 0x91 },
+ { 0x00, 0xe8, 0xcd, 0x25, 0x9a, 0x72, 0x57, 0xbf,
+ 0x29, 0xc1, 0xe4, 0x0c, 0xb3, 0x5b, 0x7e, 0x96 },
+ { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x84, 0x08, 0x8c, 0x10, 0x94, 0x18, 0x9c,
+ 0x20, 0xa4, 0x28, 0xac, 0x30, 0xb4, 0x38, 0xbc },
+ { 0x00, 0xe8, 0xcd, 0x25, 0x9a, 0x72, 0x57, 0xbf,
+ 0x29, 0xc1, 0xe4, 0x0c, 0xb3, 0x5b, 0x7e, 0x96 },
+ { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+ 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x85, 0x0a, 0x8f, 0x14, 0x91, 0x1e, 0x9b,
+ 0x28, 0xad, 0x22, 0xa7, 0x3c, 0xb9, 0x36, 0xb3 },
+ { 0x00, 0xe8, 0xcd, 0x25, 0x9a, 0x72, 0x57, 0xbf,
+ 0x34, 0xdc, 0xf9, 0x11, 0xae, 0x46, 0x63, 0x8b },
+ { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x86, 0x0c, 0x8a, 0x18, 0x9e, 0x14, 0x92,
+ 0x30, 0xb6, 0x3c, 0xba, 0x28, 0xae, 0x24, 0xa2 },
+ { 0x00, 0xe8, 0xcd, 0x25, 0x9a, 0x72, 0x57, 0xbf,
+ 0x34, 0xdc, 0xf9, 0x11, 0xae, 0x46, 0x63, 0x8b },
+ { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+ 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x87, 0x0e, 0x89, 0x1c, 0x9b, 0x12, 0x95,
+ 0x38, 0xbf, 0x36, 0xb1, 0x24, 0xa3, 0x2a, 0xad },
+ { 0x00, 0xe8, 0xd0, 0x38, 0xbd, 0x55, 0x6d, 0x85,
+ 0x67, 0x8f, 0xb7, 0x5f, 0xda, 0x32, 0x0a, 0xe2 },
+ { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x88, 0x10, 0x98, 0x20, 0xa8, 0x30, 0xb8,
+ 0x40, 0xc8, 0x50, 0xd8, 0x60, 0xe8, 0x70, 0xf8 },
+ { 0x00, 0xe8, 0xd0, 0x38, 0xbd, 0x55, 0x6d, 0x85,
+ 0x67, 0x8f, 0xb7, 0x5f, 0xda, 0x32, 0x0a, 0xe2 },
+ { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+ 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x89, 0x12, 0x9b, 0x24, 0xad, 0x36, 0xbf,
+ 0x48, 0xc1, 0x5a, 0xd3, 0x6c, 0xe5, 0x7e, 0xf7 },
+ { 0x00, 0xe8, 0xd0, 0x38, 0xbd, 0x55, 0x6d, 0x85,
+ 0x7a, 0x92, 0xaa, 0x42, 0xc7, 0x2f, 0x17, 0xff },
+ { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x8a, 0x14, 0x9e, 0x28, 0xa2, 0x3c, 0xb6,
+ 0x50, 0xda, 0x44, 0xce, 0x78, 0xf2, 0x6c, 0xe6 },
+ { 0x00, 0xe8, 0xd0, 0x38, 0xbd, 0x55, 0x6d, 0x85,
+ 0x7a, 0x92, 0xaa, 0x42, 0xc7, 0x2f, 0x17, 0xff },
+ { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+ 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x8b, 0x16, 0x9d, 0x2c, 0xa7, 0x3a, 0xb1,
+ 0x58, 0xd3, 0x4e, 0xc5, 0x74, 0xff, 0x62, 0xe9 },
+ { 0x00, 0xe8, 0xd0, 0x38, 0xa0, 0x48, 0x70, 0x98,
+ 0x5d, 0xb5, 0x8d, 0x65, 0xfd, 0x15, 0x2d, 0xc5 },
+ { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x8c, 0x18, 0x94, 0x30, 0xbc, 0x28, 0xa4,
+ 0x60, 0xec, 0x78, 0xf4, 0x50, 0xdc, 0x48, 0xc4 },
+ { 0x00, 0xe8, 0xd0, 0x38, 0xa0, 0x48, 0x70, 0x98,
+ 0x5d, 0xb5, 0x8d, 0x65, 0xfd, 0x15, 0x2d, 0xc5 },
+ { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+ 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x8d, 0x1a, 0x97, 0x34, 0xb9, 0x2e, 0xa3,
+ 0x68, 0xe5, 0x72, 0xff, 0x5c, 0xd1, 0x46, 0xcb },
+ { 0x00, 0xe8, 0xd0, 0x38, 0xa0, 0x48, 0x70, 0x98,
+ 0x40, 0xa8, 0x90, 0x78, 0xe0, 0x08, 0x30, 0xd8 },
+ { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x8e, 0x1c, 0x92, 0x38, 0xb6, 0x24, 0xaa,
+ 0x70, 0xfe, 0x6c, 0xe2, 0x48, 0xc6, 0x54, 0xda },
+ { 0x00, 0xe8, 0xd0, 0x38, 0xa0, 0x48, 0x70, 0x98,
+ 0x40, 0xa8, 0x90, 0x78, 0xe0, 0x08, 0x30, 0xd8 },
+ { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+ 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x8f, 0x1e, 0x91, 0x3c, 0xb3, 0x22, 0xad,
+ 0x78, 0xf7, 0x66, 0xe9, 0x44, 0xcb, 0x5a, 0xd5 },
+ { 0x00, 0xf5, 0xf7, 0x02, 0xf3, 0x06, 0x04, 0xf1,
+ 0xfb, 0x0e, 0x0c, 0xf9, 0x08, 0xfd, 0xff, 0x0a },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+ 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+ { 0x00, 0xf5, 0xf7, 0x02, 0xf3, 0x06, 0x04, 0xf1,
+ 0xfb, 0x0e, 0x0c, 0xf9, 0x08, 0xfd, 0xff, 0x0a },
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x91, 0x22, 0xb3, 0x44, 0xd5, 0x66, 0xf7,
+ 0x88, 0x19, 0xaa, 0x3b, 0xcc, 0x5d, 0xee, 0x7f },
+ { 0x00, 0xf5, 0xf7, 0x02, 0xf3, 0x06, 0x04, 0xf1,
+ 0xe6, 0x13, 0x11, 0xe4, 0x15, 0xe0, 0xe2, 0x17 },
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x92, 0x24, 0xb6, 0x48, 0xda, 0x6c, 0xfe,
+ 0x90, 0x02, 0xb4, 0x26, 0xd8, 0x4a, 0xfc, 0x6e },
+ { 0x00, 0xf5, 0xf7, 0x02, 0xf3, 0x06, 0x04, 0xf1,
+ 0xe6, 0x13, 0x11, 0xe4, 0x15, 0xe0, 0xe2, 0x17 },
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+ 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x93, 0x26, 0xb5, 0x4c, 0xdf, 0x6a, 0xf9,
+ 0x98, 0x0b, 0xbe, 0x2d, 0xd4, 0x47, 0xf2, 0x61 },
+ { 0x00, 0xf5, 0xf7, 0x02, 0xee, 0x1b, 0x19, 0xec,
+ 0xc1, 0x34, 0x36, 0xc3, 0x2f, 0xda, 0xd8, 0x2d },
+ { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x94, 0x28, 0xbc, 0x50, 0xc4, 0x78, 0xec,
+ 0xa0, 0x34, 0x88, 0x1c, 0xf0, 0x64, 0xd8, 0x4c },
+ { 0x00, 0xf5, 0xf7, 0x02, 0xee, 0x1b, 0x19, 0xec,
+ 0xc1, 0x34, 0x36, 0xc3, 0x2f, 0xda, 0xd8, 0x2d },
+ { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+ 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x95, 0x2a, 0xbf, 0x54, 0xc1, 0x7e, 0xeb,
+ 0xa8, 0x3d, 0x82, 0x17, 0xfc, 0x69, 0xd6, 0x43 },
+ { 0x00, 0xf5, 0xf7, 0x02, 0xee, 0x1b, 0x19, 0xec,
+ 0xdc, 0x29, 0x2b, 0xde, 0x32, 0xc7, 0xc5, 0x30 },
+ { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x96, 0x2c, 0xba, 0x58, 0xce, 0x74, 0xe2,
+ 0xb0, 0x26, 0x9c, 0x0a, 0xe8, 0x7e, 0xc4, 0x52 },
+ { 0x00, 0xf5, 0xf7, 0x02, 0xee, 0x1b, 0x19, 0xec,
+ 0xdc, 0x29, 0x2b, 0xde, 0x32, 0xc7, 0xc5, 0x30 },
+ { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+ 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x97, 0x2e, 0xb9, 0x5c, 0xcb, 0x72, 0xe5,
+ 0xb8, 0x2f, 0x96, 0x01, 0xe4, 0x73, 0xca, 0x5d },
+ { 0x00, 0xf5, 0xea, 0x1f, 0xc9, 0x3c, 0x23, 0xd6,
+ 0x8f, 0x7a, 0x65, 0x90, 0x46, 0xb3, 0xac, 0x59 },
+ { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x98, 0x30, 0xa8, 0x60, 0xf8, 0x50, 0xc8,
+ 0xc0, 0x58, 0xf0, 0x68, 0xa0, 0x38, 0x90, 0x08 },
+ { 0x00, 0xf5, 0xea, 0x1f, 0xc9, 0x3c, 0x23, 0xd6,
+ 0x8f, 0x7a, 0x65, 0x90, 0x46, 0xb3, 0xac, 0x59 },
+ { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+ 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x99, 0x32, 0xab, 0x64, 0xfd, 0x56, 0xcf,
+ 0xc8, 0x51, 0xfa, 0x63, 0xac, 0x35, 0x9e, 0x07 },
+ { 0x00, 0xf5, 0xea, 0x1f, 0xc9, 0x3c, 0x23, 0xd6,
+ 0x92, 0x67, 0x78, 0x8d, 0x5b, 0xae, 0xb1, 0x44 },
+ { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x9a, 0x34, 0xae, 0x68, 0xf2, 0x5c, 0xc6,
+ 0xd0, 0x4a, 0xe4, 0x7e, 0xb8, 0x22, 0x8c, 0x16 },
+ { 0x00, 0xf5, 0xea, 0x1f, 0xc9, 0x3c, 0x23, 0xd6,
+ 0x92, 0x67, 0x78, 0x8d, 0x5b, 0xae, 0xb1, 0x44 },
+ { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+ 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x9b, 0x36, 0xad, 0x6c, 0xf7, 0x5a, 0xc1,
+ 0xd8, 0x43, 0xee, 0x75, 0xb4, 0x2f, 0x82, 0x19 },
+ { 0x00, 0xf5, 0xea, 0x1f, 0xd4, 0x21, 0x3e, 0xcb,
+ 0xb5, 0x40, 0x5f, 0xaa, 0x61, 0x94, 0x8b, 0x7e },
+ { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x9c, 0x38, 0xa4, 0x70, 0xec, 0x48, 0xd4,
+ 0xe0, 0x7c, 0xd8, 0x44, 0x90, 0x0c, 0xa8, 0x34 },
+ { 0x00, 0xf5, 0xea, 0x1f, 0xd4, 0x21, 0x3e, 0xcb,
+ 0xb5, 0x40, 0x5f, 0xaa, 0x61, 0x94, 0x8b, 0x7e },
+ { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+ 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x9d, 0x3a, 0xa7, 0x74, 0xe9, 0x4e, 0xd3,
+ 0xe8, 0x75, 0xd2, 0x4f, 0x9c, 0x01, 0xa6, 0x3b },
+ { 0x00, 0xf5, 0xea, 0x1f, 0xd4, 0x21, 0x3e, 0xcb,
+ 0xa8, 0x5d, 0x42, 0xb7, 0x7c, 0x89, 0x96, 0x63 },
+ { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x9e, 0x3c, 0xa2, 0x78, 0xe6, 0x44, 0xda,
+ 0xf0, 0x6e, 0xcc, 0x52, 0x88, 0x16, 0xb4, 0x2a },
+ { 0x00, 0xf5, 0xea, 0x1f, 0xd4, 0x21, 0x3e, 0xcb,
+ 0xa8, 0x5d, 0x42, 0xb7, 0x7c, 0x89, 0x96, 0x63 },
+ { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+ 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+ { 0x00, 0x9f, 0x3e, 0xa1, 0x7c, 0xe3, 0x42, 0xdd,
+ 0xf8, 0x67, 0xc6, 0x59, 0x84, 0x1b, 0xba, 0x25 },
+ { 0x00, 0xd2, 0xb9, 0x6b, 0x6f, 0xbd, 0xd6, 0x04,
+ 0xde, 0x0c, 0x67, 0xb5, 0xb1, 0x63, 0x08, 0xda },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+ { 0x00, 0xd2, 0xb9, 0x6b, 0x6f, 0xbd, 0xd6, 0x04,
+ 0xde, 0x0c, 0x67, 0xb5, 0xb1, 0x63, 0x08, 0xda },
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xa1, 0x42, 0xe3, 0x84, 0x25, 0xc6, 0x67,
+ 0x08, 0xa9, 0x4a, 0xeb, 0x8c, 0x2d, 0xce, 0x6f },
+ { 0x00, 0xd2, 0xb9, 0x6b, 0x6f, 0xbd, 0xd6, 0x04,
+ 0xc3, 0x11, 0x7a, 0xa8, 0xac, 0x7e, 0x15, 0xc7 },
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xa2, 0x44, 0xe6, 0x88, 0x2a, 0xcc, 0x6e,
+ 0x10, 0xb2, 0x54, 0xf6, 0x98, 0x3a, 0xdc, 0x7e },
+ { 0x00, 0xd2, 0xb9, 0x6b, 0x6f, 0xbd, 0xd6, 0x04,
+ 0xc3, 0x11, 0x7a, 0xa8, 0xac, 0x7e, 0x15, 0xc7 },
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+ 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xa3, 0x46, 0xe5, 0x8c, 0x2f, 0xca, 0x69,
+ 0x18, 0xbb, 0x5e, 0xfd, 0x94, 0x37, 0xd2, 0x71 },
+ { 0x00, 0xd2, 0xb9, 0x6b, 0x72, 0xa0, 0xcb, 0x19,
+ 0xe4, 0x36, 0x5d, 0x8f, 0x96, 0x44, 0x2f, 0xfd },
+ { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xa4, 0x48, 0xec, 0x90, 0x34, 0xd8, 0x7c,
+ 0x20, 0x84, 0x68, 0xcc, 0xb0, 0x14, 0xf8, 0x5c },
+ { 0x00, 0xd2, 0xb9, 0x6b, 0x72, 0xa0, 0xcb, 0x19,
+ 0xe4, 0x36, 0x5d, 0x8f, 0x96, 0x44, 0x2f, 0xfd },
+ { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+ 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xa5, 0x4a, 0xef, 0x94, 0x31, 0xde, 0x7b,
+ 0x28, 0x8d, 0x62, 0xc7, 0xbc, 0x19, 0xf6, 0x53 },
+ { 0x00, 0xd2, 0xb9, 0x6b, 0x72, 0xa0, 0xcb, 0x19,
+ 0xf9, 0x2b, 0x40, 0x92, 0x8b, 0x59, 0x32, 0xe0 },
+ { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xa6, 0x4c, 0xea, 0x98, 0x3e, 0xd4, 0x72,
+ 0x30, 0x96, 0x7c, 0xda, 0xa8, 0x0e, 0xe4, 0x42 },
+ { 0x00, 0xd2, 0xb9, 0x6b, 0x72, 0xa0, 0xcb, 0x19,
+ 0xf9, 0x2b, 0x40, 0x92, 0x8b, 0x59, 0x32, 0xe0 },
+ { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+ 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xa7, 0x4e, 0xe9, 0x9c, 0x3b, 0xd2, 0x75,
+ 0x38, 0x9f, 0x76, 0xd1, 0xa4, 0x03, 0xea, 0x4d },
+ { 0x00, 0xd2, 0xa4, 0x76, 0x55, 0x87, 0xf1, 0x23,
+ 0xaa, 0x78, 0x0e, 0xdc, 0xff, 0x2d, 0x5b, 0x89 },
+ { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xa8, 0x50, 0xf8, 0xa0, 0x08, 0xf0, 0x58,
+ 0x40, 0xe8, 0x10, 0xb8, 0xe0, 0x48, 0xb0, 0x18 },
+ { 0x00, 0xd2, 0xa4, 0x76, 0x55, 0x87, 0xf1, 0x23,
+ 0xaa, 0x78, 0x0e, 0xdc, 0xff, 0x2d, 0x5b, 0x89 },
+ { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+ 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xa9, 0x52, 0xfb, 0xa4, 0x0d, 0xf6, 0x5f,
+ 0x48, 0xe1, 0x1a, 0xb3, 0xec, 0x45, 0xbe, 0x17 },
+ { 0x00, 0xd2, 0xa4, 0x76, 0x55, 0x87, 0xf1, 0x23,
+ 0xb7, 0x65, 0x13, 0xc1, 0xe2, 0x30, 0x46, 0x94 },
+ { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xaa, 0x54, 0xfe, 0xa8, 0x02, 0xfc, 0x56,
+ 0x50, 0xfa, 0x04, 0xae, 0xf8, 0x52, 0xac, 0x06 },
+ { 0x00, 0xd2, 0xa4, 0x76, 0x55, 0x87, 0xf1, 0x23,
+ 0xb7, 0x65, 0x13, 0xc1, 0xe2, 0x30, 0x46, 0x94 },
+ { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+ 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xab, 0x56, 0xfd, 0xac, 0x07, 0xfa, 0x51,
+ 0x58, 0xf3, 0x0e, 0xa5, 0xf4, 0x5f, 0xa2, 0x09 },
+ { 0x00, 0xd2, 0xa4, 0x76, 0x48, 0x9a, 0xec, 0x3e,
+ 0x90, 0x42, 0x34, 0xe6, 0xd8, 0x0a, 0x7c, 0xae },
+ { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xac, 0x58, 0xf4, 0xb0, 0x1c, 0xe8, 0x44,
+ 0x60, 0xcc, 0x38, 0x94, 0xd0, 0x7c, 0x88, 0x24 },
+ { 0x00, 0xd2, 0xa4, 0x76, 0x48, 0x9a, 0xec, 0x3e,
+ 0x90, 0x42, 0x34, 0xe6, 0xd8, 0x0a, 0x7c, 0xae },
+ { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+ 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xad, 0x5a, 0xf7, 0xb4, 0x19, 0xee, 0x43,
+ 0x68, 0xc5, 0x32, 0x9f, 0xdc, 0x71, 0x86, 0x2b },
+ { 0x00, 0xd2, 0xa4, 0x76, 0x48, 0x9a, 0xec, 0x3e,
+ 0x8d, 0x5f, 0x29, 0xfb, 0xc5, 0x17, 0x61, 0xb3 },
+ { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xae, 0x5c, 0xf2, 0xb8, 0x16, 0xe4, 0x4a,
+ 0x70, 0xde, 0x2c, 0x82, 0xc8, 0x66, 0x94, 0x3a },
+ { 0x00, 0xd2, 0xa4, 0x76, 0x48, 0x9a, 0xec, 0x3e,
+ 0x8d, 0x5f, 0x29, 0xfb, 0xc5, 0x17, 0x61, 0xb3 },
+ { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+ 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xaf, 0x5e, 0xf1, 0xbc, 0x13, 0xe2, 0x4d,
+ 0x78, 0xd7, 0x26, 0x89, 0xc4, 0x6b, 0x9a, 0x35 },
+ { 0x00, 0xcf, 0x83, 0x4c, 0x1b, 0xd4, 0x98, 0x57,
+ 0x36, 0xf9, 0xb5, 0x7a, 0x2d, 0xe2, 0xae, 0x61 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+ 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+ { 0x00, 0xcf, 0x83, 0x4c, 0x1b, 0xd4, 0x98, 0x57,
+ 0x36, 0xf9, 0xb5, 0x7a, 0x2d, 0xe2, 0xae, 0x61 },
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xb1, 0x62, 0xd3, 0xc4, 0x75, 0xa6, 0x17,
+ 0x88, 0x39, 0xea, 0x5b, 0x4c, 0xfd, 0x2e, 0x9f },
+ { 0x00, 0xcf, 0x83, 0x4c, 0x1b, 0xd4, 0x98, 0x57,
+ 0x2b, 0xe4, 0xa8, 0x67, 0x30, 0xff, 0xb3, 0x7c },
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xb2, 0x64, 0xd6, 0xc8, 0x7a, 0xac, 0x1e,
+ 0x90, 0x22, 0xf4, 0x46, 0x58, 0xea, 0x3c, 0x8e },
+ { 0x00, 0xcf, 0x83, 0x4c, 0x1b, 0xd4, 0x98, 0x57,
+ 0x2b, 0xe4, 0xa8, 0x67, 0x30, 0xff, 0xb3, 0x7c },
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+ 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xb3, 0x66, 0xd5, 0xcc, 0x7f, 0xaa, 0x19,
+ 0x98, 0x2b, 0xfe, 0x4d, 0x54, 0xe7, 0x32, 0x81 },
+ { 0x00, 0xcf, 0x83, 0x4c, 0x06, 0xc9, 0x85, 0x4a,
+ 0x0c, 0xc3, 0x8f, 0x40, 0x0a, 0xc5, 0x89, 0x46 },
+ { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xb4, 0x68, 0xdc, 0xd0, 0x64, 0xb8, 0x0c,
+ 0xa0, 0x14, 0xc8, 0x7c, 0x70, 0xc4, 0x18, 0xac },
+ { 0x00, 0xcf, 0x83, 0x4c, 0x06, 0xc9, 0x85, 0x4a,
+ 0x0c, 0xc3, 0x8f, 0x40, 0x0a, 0xc5, 0x89, 0x46 },
+ { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+ 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xb5, 0x6a, 0xdf, 0xd4, 0x61, 0xbe, 0x0b,
+ 0xa8, 0x1d, 0xc2, 0x77, 0x7c, 0xc9, 0x16, 0xa3 },
+ { 0x00, 0xcf, 0x83, 0x4c, 0x06, 0xc9, 0x85, 0x4a,
+ 0x11, 0xde, 0x92, 0x5d, 0x17, 0xd8, 0x94, 0x5b },
+ { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xb6, 0x6c, 0xda, 0xd8, 0x6e, 0xb4, 0x02,
+ 0xb0, 0x06, 0xdc, 0x6a, 0x68, 0xde, 0x04, 0xb2 },
+ { 0x00, 0xcf, 0x83, 0x4c, 0x06, 0xc9, 0x85, 0x4a,
+ 0x11, 0xde, 0x92, 0x5d, 0x17, 0xd8, 0x94, 0x5b },
+ { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+ 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xb7, 0x6e, 0xd9, 0xdc, 0x6b, 0xb2, 0x05,
+ 0xb8, 0x0f, 0xd6, 0x61, 0x64, 0xd3, 0x0a, 0xbd },
+ { 0x00, 0xcf, 0x9e, 0x51, 0x21, 0xee, 0xbf, 0x70,
+ 0x42, 0x8d, 0xdc, 0x13, 0x63, 0xac, 0xfd, 0x32 },
+ { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xb8, 0x70, 0xc8, 0xe0, 0x58, 0x90, 0x28,
+ 0xc0, 0x78, 0xb0, 0x08, 0x20, 0x98, 0x50, 0xe8 },
+ { 0x00, 0xcf, 0x9e, 0x51, 0x21, 0xee, 0xbf, 0x70,
+ 0x42, 0x8d, 0xdc, 0x13, 0x63, 0xac, 0xfd, 0x32 },
+ { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+ 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xb9, 0x72, 0xcb, 0xe4, 0x5d, 0x96, 0x2f,
+ 0xc8, 0x71, 0xba, 0x03, 0x2c, 0x95, 0x5e, 0xe7 },
+ { 0x00, 0xcf, 0x9e, 0x51, 0x21, 0xee, 0xbf, 0x70,
+ 0x5f, 0x90, 0xc1, 0x0e, 0x7e, 0xb1, 0xe0, 0x2f },
+ { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xba, 0x74, 0xce, 0xe8, 0x52, 0x9c, 0x26,
+ 0xd0, 0x6a, 0xa4, 0x1e, 0x38, 0x82, 0x4c, 0xf6 },
+ { 0x00, 0xcf, 0x9e, 0x51, 0x21, 0xee, 0xbf, 0x70,
+ 0x5f, 0x90, 0xc1, 0x0e, 0x7e, 0xb1, 0xe0, 0x2f },
+ { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+ 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xbb, 0x76, 0xcd, 0xec, 0x57, 0x9a, 0x21,
+ 0xd8, 0x63, 0xae, 0x15, 0x34, 0x8f, 0x42, 0xf9 },
+ { 0x00, 0xcf, 0x9e, 0x51, 0x3c, 0xf3, 0xa2, 0x6d,
+ 0x78, 0xb7, 0xe6, 0x29, 0x44, 0x8b, 0xda, 0x15 },
+ { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xbc, 0x78, 0xc4, 0xf0, 0x4c, 0x88, 0x34,
+ 0xe0, 0x5c, 0x98, 0x24, 0x10, 0xac, 0x68, 0xd4 },
+ { 0x00, 0xcf, 0x9e, 0x51, 0x3c, 0xf3, 0xa2, 0x6d,
+ 0x78, 0xb7, 0xe6, 0x29, 0x44, 0x8b, 0xda, 0x15 },
+ { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+ 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xbd, 0x7a, 0xc7, 0xf4, 0x49, 0x8e, 0x33,
+ 0xe8, 0x55, 0x92, 0x2f, 0x1c, 0xa1, 0x66, 0xdb },
+ { 0x00, 0xcf, 0x9e, 0x51, 0x3c, 0xf3, 0xa2, 0x6d,
+ 0x65, 0xaa, 0xfb, 0x34, 0x59, 0x96, 0xc7, 0x08 },
+ { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xbe, 0x7c, 0xc2, 0xf8, 0x46, 0x84, 0x3a,
+ 0xf0, 0x4e, 0x8c, 0x32, 0x08, 0xb6, 0x74, 0xca },
+ { 0x00, 0xcf, 0x9e, 0x51, 0x3c, 0xf3, 0xa2, 0x6d,
+ 0x65, 0xaa, 0xfb, 0x34, 0x59, 0x96, 0xc7, 0x08 },
+ { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+ 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+ 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+ { 0x00, 0xbf, 0x7e, 0xc1, 0xfc, 0x43, 0x82, 0x3d,
+ 0xf8, 0x47, 0x86, 0x39, 0x04, 0xbb, 0x7a, 0xc5 },
+ { 0x00, 0x9c, 0x25, 0xb9, 0x4a, 0xd6, 0x6f, 0xf3,
+ 0x94, 0x08, 0xb1, 0x2d, 0xde, 0x42, 0xfb, 0x67 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+ { 0x00, 0x9c, 0x25, 0xb9, 0x4a, 0xd6, 0x6f, 0xf3,
+ 0x94, 0x08, 0xb1, 0x2d, 0xde, 0x42, 0xfb, 0x67 },
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xc1, 0x82, 0x43, 0x04, 0xc5, 0x86, 0x47,
+ 0x08, 0xc9, 0x8a, 0x4b, 0x0c, 0xcd, 0x8e, 0x4f },
+ { 0x00, 0x9c, 0x25, 0xb9, 0x4a, 0xd6, 0x6f, 0xf3,
+ 0x89, 0x15, 0xac, 0x30, 0xc3, 0x5f, 0xe6, 0x7a },
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xc2, 0x84, 0x46, 0x08, 0xca, 0x8c, 0x4e,
+ 0x10, 0xd2, 0x94, 0x56, 0x18, 0xda, 0x9c, 0x5e },
+ { 0x00, 0x9c, 0x25, 0xb9, 0x4a, 0xd6, 0x6f, 0xf3,
+ 0x89, 0x15, 0xac, 0x30, 0xc3, 0x5f, 0xe6, 0x7a },
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+ 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xc3, 0x86, 0x45, 0x0c, 0xcf, 0x8a, 0x49,
+ 0x18, 0xdb, 0x9e, 0x5d, 0x14, 0xd7, 0x92, 0x51 },
+ { 0x00, 0x9c, 0x25, 0xb9, 0x57, 0xcb, 0x72, 0xee,
+ 0xae, 0x32, 0x8b, 0x17, 0xf9, 0x65, 0xdc, 0x40 },
+ { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xc4, 0x88, 0x4c, 0x10, 0xd4, 0x98, 0x5c,
+ 0x20, 0xe4, 0xa8, 0x6c, 0x30, 0xf4, 0xb8, 0x7c },
+ { 0x00, 0x9c, 0x25, 0xb9, 0x57, 0xcb, 0x72, 0xee,
+ 0xae, 0x32, 0x8b, 0x17, 0xf9, 0x65, 0xdc, 0x40 },
+ { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+ 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xc5, 0x8a, 0x4f, 0x14, 0xd1, 0x9e, 0x5b,
+ 0x28, 0xed, 0xa2, 0x67, 0x3c, 0xf9, 0xb6, 0x73 },
+ { 0x00, 0x9c, 0x25, 0xb9, 0x57, 0xcb, 0x72, 0xee,
+ 0xb3, 0x2f, 0x96, 0x0a, 0xe4, 0x78, 0xc1, 0x5d },
+ { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xc6, 0x8c, 0x4a, 0x18, 0xde, 0x94, 0x52,
+ 0x30, 0xf6, 0xbc, 0x7a, 0x28, 0xee, 0xa4, 0x62 },
+ { 0x00, 0x9c, 0x25, 0xb9, 0x57, 0xcb, 0x72, 0xee,
+ 0xb3, 0x2f, 0x96, 0x0a, 0xe4, 0x78, 0xc1, 0x5d },
+ { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+ 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xc7, 0x8e, 0x49, 0x1c, 0xdb, 0x92, 0x55,
+ 0x38, 0xff, 0xb6, 0x71, 0x24, 0xe3, 0xaa, 0x6d },
+ { 0x00, 0x9c, 0x38, 0xa4, 0x70, 0xec, 0x48, 0xd4,
+ 0xe0, 0x7c, 0xd8, 0x44, 0x90, 0x0c, 0xa8, 0x34 },
+ { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xc8, 0x90, 0x58, 0x20, 0xe8, 0xb0, 0x78,
+ 0x40, 0x88, 0xd0, 0x18, 0x60, 0xa8, 0xf0, 0x38 },
+ { 0x00, 0x9c, 0x38, 0xa4, 0x70, 0xec, 0x48, 0xd4,
+ 0xe0, 0x7c, 0xd8, 0x44, 0x90, 0x0c, 0xa8, 0x34 },
+ { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+ 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xc9, 0x92, 0x5b, 0x24, 0xed, 0xb6, 0x7f,
+ 0x48, 0x81, 0xda, 0x13, 0x6c, 0xa5, 0xfe, 0x37 },
+ { 0x00, 0x9c, 0x38, 0xa4, 0x70, 0xec, 0x48, 0xd4,
+ 0xfd, 0x61, 0xc5, 0x59, 0x8d, 0x11, 0xb5, 0x29 },
+ { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xca, 0x94, 0x5e, 0x28, 0xe2, 0xbc, 0x76,
+ 0x50, 0x9a, 0xc4, 0x0e, 0x78, 0xb2, 0xec, 0x26 },
+ { 0x00, 0x9c, 0x38, 0xa4, 0x70, 0xec, 0x48, 0xd4,
+ 0xfd, 0x61, 0xc5, 0x59, 0x8d, 0x11, 0xb5, 0x29 },
+ { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+ 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xcb, 0x96, 0x5d, 0x2c, 0xe7, 0xba, 0x71,
+ 0x58, 0x93, 0xce, 0x05, 0x74, 0xbf, 0xe2, 0x29 },
+ { 0x00, 0x9c, 0x38, 0xa4, 0x6d, 0xf1, 0x55, 0xc9,
+ 0xda, 0x46, 0xe2, 0x7e, 0xb7, 0x2b, 0x8f, 0x13 },
+ { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xcc, 0x98, 0x54, 0x30, 0xfc, 0xa8, 0x64,
+ 0x60, 0xac, 0xf8, 0x34, 0x50, 0x9c, 0xc8, 0x04 },
+ { 0x00, 0x9c, 0x38, 0xa4, 0x6d, 0xf1, 0x55, 0xc9,
+ 0xda, 0x46, 0xe2, 0x7e, 0xb7, 0x2b, 0x8f, 0x13 },
+ { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+ 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xcd, 0x9a, 0x57, 0x34, 0xf9, 0xae, 0x63,
+ 0x68, 0xa5, 0xf2, 0x3f, 0x5c, 0x91, 0xc6, 0x0b },
+ { 0x00, 0x9c, 0x38, 0xa4, 0x6d, 0xf1, 0x55, 0xc9,
+ 0xc7, 0x5b, 0xff, 0x63, 0xaa, 0x36, 0x92, 0x0e },
+ { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xce, 0x9c, 0x52, 0x38, 0xf6, 0xa4, 0x6a,
+ 0x70, 0xbe, 0xec, 0x22, 0x48, 0x86, 0xd4, 0x1a },
+ { 0x00, 0x9c, 0x38, 0xa4, 0x6d, 0xf1, 0x55, 0xc9,
+ 0xc7, 0x5b, 0xff, 0x63, 0xaa, 0x36, 0x92, 0x0e },
+ { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+ 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xcf, 0x9e, 0x51, 0x3c, 0xf3, 0xa2, 0x6d,
+ 0x78, 0xb7, 0xe6, 0x29, 0x44, 0x8b, 0xda, 0x15 },
+ { 0x00, 0x81, 0x1f, 0x9e, 0x3e, 0xbf, 0x21, 0xa0,
+ 0x7c, 0xfd, 0x63, 0xe2, 0x42, 0xc3, 0x5d, 0xdc },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+ 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+ { 0x00, 0x81, 0x1f, 0x9e, 0x3e, 0xbf, 0x21, 0xa0,
+ 0x7c, 0xfd, 0x63, 0xe2, 0x42, 0xc3, 0x5d, 0xdc },
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xd1, 0xa2, 0x73, 0x44, 0x95, 0xe6, 0x37,
+ 0x88, 0x59, 0x2a, 0xfb, 0xcc, 0x1d, 0x6e, 0xbf },
+ { 0x00, 0x81, 0x1f, 0x9e, 0x3e, 0xbf, 0x21, 0xa0,
+ 0x61, 0xe0, 0x7e, 0xff, 0x5f, 0xde, 0x40, 0xc1 },
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xd2, 0xa4, 0x76, 0x48, 0x9a, 0xec, 0x3e,
+ 0x90, 0x42, 0x34, 0xe6, 0xd8, 0x0a, 0x7c, 0xae },
+ { 0x00, 0x81, 0x1f, 0x9e, 0x3e, 0xbf, 0x21, 0xa0,
+ 0x61, 0xe0, 0x7e, 0xff, 0x5f, 0xde, 0x40, 0xc1 },
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+ 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xd3, 0xa6, 0x75, 0x4c, 0x9f, 0xea, 0x39,
+ 0x98, 0x4b, 0x3e, 0xed, 0xd4, 0x07, 0x72, 0xa1 },
+ { 0x00, 0x81, 0x1f, 0x9e, 0x23, 0xa2, 0x3c, 0xbd,
+ 0x46, 0xc7, 0x59, 0xd8, 0x65, 0xe4, 0x7a, 0xfb },
+ { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xd4, 0xa8, 0x7c, 0x50, 0x84, 0xf8, 0x2c,
+ 0xa0, 0x74, 0x08, 0xdc, 0xf0, 0x24, 0x58, 0x8c },
+ { 0x00, 0x81, 0x1f, 0x9e, 0x23, 0xa2, 0x3c, 0xbd,
+ 0x46, 0xc7, 0x59, 0xd8, 0x65, 0xe4, 0x7a, 0xfb },
+ { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+ 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xd5, 0xaa, 0x7f, 0x54, 0x81, 0xfe, 0x2b,
+ 0xa8, 0x7d, 0x02, 0xd7, 0xfc, 0x29, 0x56, 0x83 },
+ { 0x00, 0x81, 0x1f, 0x9e, 0x23, 0xa2, 0x3c, 0xbd,
+ 0x5b, 0xda, 0x44, 0xc5, 0x78, 0xf9, 0x67, 0xe6 },
+ { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xd6, 0xac, 0x7a, 0x58, 0x8e, 0xf4, 0x22,
+ 0xb0, 0x66, 0x1c, 0xca, 0xe8, 0x3e, 0x44, 0x92 },
+ { 0x00, 0x81, 0x1f, 0x9e, 0x23, 0xa2, 0x3c, 0xbd,
+ 0x5b, 0xda, 0x44, 0xc5, 0x78, 0xf9, 0x67, 0xe6 },
+ { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+ 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xd7, 0xae, 0x79, 0x5c, 0x8b, 0xf2, 0x25,
+ 0xb8, 0x6f, 0x16, 0xc1, 0xe4, 0x33, 0x4a, 0x9d },
+ { 0x00, 0x81, 0x02, 0x83, 0x04, 0x85, 0x06, 0x87,
+ 0x08, 0x89, 0x0a, 0x8b, 0x0c, 0x8d, 0x0e, 0x8f },
+ { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xd8, 0xb0, 0x68, 0x60, 0xb8, 0xd0, 0x08,
+ 0xc0, 0x18, 0x70, 0xa8, 0xa0, 0x78, 0x10, 0xc8 },
+ { 0x00, 0x81, 0x02, 0x83, 0x04, 0x85, 0x06, 0x87,
+ 0x08, 0x89, 0x0a, 0x8b, 0x0c, 0x8d, 0x0e, 0x8f },
+ { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+ 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xd9, 0xb2, 0x6b, 0x64, 0xbd, 0xd6, 0x0f,
+ 0xc8, 0x11, 0x7a, 0xa3, 0xac, 0x75, 0x1e, 0xc7 },
+ { 0x00, 0x81, 0x02, 0x83, 0x04, 0x85, 0x06, 0x87,
+ 0x15, 0x94, 0x17, 0x96, 0x11, 0x90, 0x13, 0x92 },
+ { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xda, 0xb4, 0x6e, 0x68, 0xb2, 0xdc, 0x06,
+ 0xd0, 0x0a, 0x64, 0xbe, 0xb8, 0x62, 0x0c, 0xd6 },
+ { 0x00, 0x81, 0x02, 0x83, 0x04, 0x85, 0x06, 0x87,
+ 0x15, 0x94, 0x17, 0x96, 0x11, 0x90, 0x13, 0x92 },
+ { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+ 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xdb, 0xb6, 0x6d, 0x6c, 0xb7, 0xda, 0x01,
+ 0xd8, 0x03, 0x6e, 0xb5, 0xb4, 0x6f, 0x02, 0xd9 },
+ { 0x00, 0x81, 0x02, 0x83, 0x19, 0x98, 0x1b, 0x9a,
+ 0x32, 0xb3, 0x30, 0xb1, 0x2b, 0xaa, 0x29, 0xa8 },
+ { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xdc, 0xb8, 0x64, 0x70, 0xac, 0xc8, 0x14,
+ 0xe0, 0x3c, 0x58, 0x84, 0x90, 0x4c, 0x28, 0xf4 },
+ { 0x00, 0x81, 0x02, 0x83, 0x19, 0x98, 0x1b, 0x9a,
+ 0x32, 0xb3, 0x30, 0xb1, 0x2b, 0xaa, 0x29, 0xa8 },
+ { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+ 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xdd, 0xba, 0x67, 0x74, 0xa9, 0xce, 0x13,
+ 0xe8, 0x35, 0x52, 0x8f, 0x9c, 0x41, 0x26, 0xfb },
+ { 0x00, 0x81, 0x02, 0x83, 0x19, 0x98, 0x1b, 0x9a,
+ 0x2f, 0xae, 0x2d, 0xac, 0x36, 0xb7, 0x34, 0xb5 },
+ { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xde, 0xbc, 0x62, 0x78, 0xa6, 0xc4, 0x1a,
+ 0xf0, 0x2e, 0x4c, 0x92, 0x88, 0x56, 0x34, 0xea },
+ { 0x00, 0x81, 0x02, 0x83, 0x19, 0x98, 0x1b, 0x9a,
+ 0x2f, 0xae, 0x2d, 0xac, 0x36, 0xb7, 0x34, 0xb5 },
+ { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+ 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+ { 0x00, 0xdf, 0xbe, 0x61, 0x7c, 0xa3, 0xc2, 0x1d,
+ 0xf8, 0x27, 0x46, 0x99, 0x84, 0x5b, 0x3a, 0xe5 },
+ { 0x00, 0xa6, 0x51, 0xf7, 0xa2, 0x04, 0xf3, 0x55,
+ 0x59, 0xff, 0x08, 0xae, 0xfb, 0x5d, 0xaa, 0x0c },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+ { 0x00, 0xa6, 0x51, 0xf7, 0xa2, 0x04, 0xf3, 0x55,
+ 0x59, 0xff, 0x08, 0xae, 0xfb, 0x5d, 0xaa, 0x0c },
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xe1, 0xc2, 0x23, 0x84, 0x65, 0x46, 0xa7,
+ 0x08, 0xe9, 0xca, 0x2b, 0x8c, 0x6d, 0x4e, 0xaf },
+ { 0x00, 0xa6, 0x51, 0xf7, 0xa2, 0x04, 0xf3, 0x55,
+ 0x44, 0xe2, 0x15, 0xb3, 0xe6, 0x40, 0xb7, 0x11 },
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xe2, 0xc4, 0x26, 0x88, 0x6a, 0x4c, 0xae,
+ 0x10, 0xf2, 0xd4, 0x36, 0x98, 0x7a, 0x5c, 0xbe },
+ { 0x00, 0xa6, 0x51, 0xf7, 0xa2, 0x04, 0xf3, 0x55,
+ 0x44, 0xe2, 0x15, 0xb3, 0xe6, 0x40, 0xb7, 0x11 },
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+ 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xe3, 0xc6, 0x25, 0x8c, 0x6f, 0x4a, 0xa9,
+ 0x18, 0xfb, 0xde, 0x3d, 0x94, 0x77, 0x52, 0xb1 },
+ { 0x00, 0xa6, 0x51, 0xf7, 0xbf, 0x19, 0xee, 0x48,
+ 0x63, 0xc5, 0x32, 0x94, 0xdc, 0x7a, 0x8d, 0x2b },
+ { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xe4, 0xc8, 0x2c, 0x90, 0x74, 0x58, 0xbc,
+ 0x20, 0xc4, 0xe8, 0x0c, 0xb0, 0x54, 0x78, 0x9c },
+ { 0x00, 0xa6, 0x51, 0xf7, 0xbf, 0x19, 0xee, 0x48,
+ 0x63, 0xc5, 0x32, 0x94, 0xdc, 0x7a, 0x8d, 0x2b },
+ { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+ 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xe5, 0xca, 0x2f, 0x94, 0x71, 0x5e, 0xbb,
+ 0x28, 0xcd, 0xe2, 0x07, 0xbc, 0x59, 0x76, 0x93 },
+ { 0x00, 0xa6, 0x51, 0xf7, 0xbf, 0x19, 0xee, 0x48,
+ 0x7e, 0xd8, 0x2f, 0x89, 0xc1, 0x67, 0x90, 0x36 },
+ { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xe6, 0xcc, 0x2a, 0x98, 0x7e, 0x54, 0xb2,
+ 0x30, 0xd6, 0xfc, 0x1a, 0xa8, 0x4e, 0x64, 0x82 },
+ { 0x00, 0xa6, 0x51, 0xf7, 0xbf, 0x19, 0xee, 0x48,
+ 0x7e, 0xd8, 0x2f, 0x89, 0xc1, 0x67, 0x90, 0x36 },
+ { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+ 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xe7, 0xce, 0x29, 0x9c, 0x7b, 0x52, 0xb5,
+ 0x38, 0xdf, 0xf6, 0x11, 0xa4, 0x43, 0x6a, 0x8d },
+ { 0x00, 0xa6, 0x4c, 0xea, 0x98, 0x3e, 0xd4, 0x72,
+ 0x2d, 0x8b, 0x61, 0xc7, 0xb5, 0x13, 0xf9, 0x5f },
+ { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xe8, 0xd0, 0x38, 0xa0, 0x48, 0x70, 0x98,
+ 0x40, 0xa8, 0x90, 0x78, 0xe0, 0x08, 0x30, 0xd8 },
+ { 0x00, 0xa6, 0x4c, 0xea, 0x98, 0x3e, 0xd4, 0x72,
+ 0x2d, 0x8b, 0x61, 0xc7, 0xb5, 0x13, 0xf9, 0x5f },
+ { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+ 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xe9, 0xd2, 0x3b, 0xa4, 0x4d, 0x76, 0x9f,
+ 0x48, 0xa1, 0x9a, 0x73, 0xec, 0x05, 0x3e, 0xd7 },
+ { 0x00, 0xa6, 0x4c, 0xea, 0x98, 0x3e, 0xd4, 0x72,
+ 0x30, 0x96, 0x7c, 0xda, 0xa8, 0x0e, 0xe4, 0x42 },
+ { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xea, 0xd4, 0x3e, 0xa8, 0x42, 0x7c, 0x96,
+ 0x50, 0xba, 0x84, 0x6e, 0xf8, 0x12, 0x2c, 0xc6 },
+ { 0x00, 0xa6, 0x4c, 0xea, 0x98, 0x3e, 0xd4, 0x72,
+ 0x30, 0x96, 0x7c, 0xda, 0xa8, 0x0e, 0xe4, 0x42 },
+ { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+ 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xeb, 0xd6, 0x3d, 0xac, 0x47, 0x7a, 0x91,
+ 0x58, 0xb3, 0x8e, 0x65, 0xf4, 0x1f, 0x22, 0xc9 },
+ { 0x00, 0xa6, 0x4c, 0xea, 0x85, 0x23, 0xc9, 0x6f,
+ 0x17, 0xb1, 0x5b, 0xfd, 0x92, 0x34, 0xde, 0x78 },
+ { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xec, 0xd8, 0x34, 0xb0, 0x5c, 0x68, 0x84,
+ 0x60, 0x8c, 0xb8, 0x54, 0xd0, 0x3c, 0x08, 0xe4 },
+ { 0x00, 0xa6, 0x4c, 0xea, 0x85, 0x23, 0xc9, 0x6f,
+ 0x17, 0xb1, 0x5b, 0xfd, 0x92, 0x34, 0xde, 0x78 },
+ { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+ 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xed, 0xda, 0x37, 0xb4, 0x59, 0x6e, 0x83,
+ 0x68, 0x85, 0xb2, 0x5f, 0xdc, 0x31, 0x06, 0xeb },
+ { 0x00, 0xa6, 0x4c, 0xea, 0x85, 0x23, 0xc9, 0x6f,
+ 0x0a, 0xac, 0x46, 0xe0, 0x8f, 0x29, 0xc3, 0x65 },
+ { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xee, 0xdc, 0x32, 0xb8, 0x56, 0x64, 0x8a,
+ 0x70, 0x9e, 0xac, 0x42, 0xc8, 0x26, 0x14, 0xfa },
+ { 0x00, 0xa6, 0x4c, 0xea, 0x85, 0x23, 0xc9, 0x6f,
+ 0x0a, 0xac, 0x46, 0xe0, 0x8f, 0x29, 0xc3, 0x65 },
+ { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+ 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xef, 0xde, 0x31, 0xbc, 0x53, 0x62, 0x8d,
+ 0x78, 0x97, 0xa6, 0x49, 0xc4, 0x2b, 0x1a, 0xf5 },
+ { 0x00, 0xbb, 0x6b, 0xd0, 0xd6, 0x6d, 0xbd, 0x06,
+ 0xb1, 0x0a, 0xda, 0x61, 0x67, 0xdc, 0x0c, 0xb7 },
+ { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+ 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+ { 0x00, 0xbb, 0x6b, 0xd0, 0xd6, 0x6d, 0xbd, 0x06,
+ 0xb1, 0x0a, 0xda, 0x61, 0x67, 0xdc, 0x0c, 0xb7 },
+ { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+ 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xf1, 0xe2, 0x13, 0xc4, 0x35, 0x26, 0xd7,
+ 0x88, 0x79, 0x6a, 0x9b, 0x4c, 0xbd, 0xae, 0x5f },
+ { 0x00, 0xbb, 0x6b, 0xd0, 0xd6, 0x6d, 0xbd, 0x06,
+ 0xac, 0x17, 0xc7, 0x7c, 0x7a, 0xc1, 0x11, 0xaa },
+ { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xf2, 0xe4, 0x16, 0xc8, 0x3a, 0x2c, 0xde,
+ 0x90, 0x62, 0x74, 0x86, 0x58, 0xaa, 0xbc, 0x4e },
+ { 0x00, 0xbb, 0x6b, 0xd0, 0xd6, 0x6d, 0xbd, 0x06,
+ 0xac, 0x17, 0xc7, 0x7c, 0x7a, 0xc1, 0x11, 0xaa },
+ { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+ 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xf3, 0xe6, 0x15, 0xcc, 0x3f, 0x2a, 0xd9,
+ 0x98, 0x6b, 0x7e, 0x8d, 0x54, 0xa7, 0xb2, 0x41 },
+ { 0x00, 0xbb, 0x6b, 0xd0, 0xcb, 0x70, 0xa0, 0x1b,
+ 0x8b, 0x30, 0xe0, 0x5b, 0x40, 0xfb, 0x2b, 0x90 },
+ { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xf4, 0xe8, 0x1c, 0xd0, 0x24, 0x38, 0xcc,
+ 0xa0, 0x54, 0x48, 0xbc, 0x70, 0x84, 0x98, 0x6c },
+ { 0x00, 0xbb, 0x6b, 0xd0, 0xcb, 0x70, 0xa0, 0x1b,
+ 0x8b, 0x30, 0xe0, 0x5b, 0x40, 0xfb, 0x2b, 0x90 },
+ { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+ 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xf5, 0xea, 0x1f, 0xd4, 0x21, 0x3e, 0xcb,
+ 0xa8, 0x5d, 0x42, 0xb7, 0x7c, 0x89, 0x96, 0x63 },
+ { 0x00, 0xbb, 0x6b, 0xd0, 0xcb, 0x70, 0xa0, 0x1b,
+ 0x96, 0x2d, 0xfd, 0x46, 0x5d, 0xe6, 0x36, 0x8d },
+ { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xf6, 0xec, 0x1a, 0xd8, 0x2e, 0x34, 0xc2,
+ 0xb0, 0x46, 0x5c, 0xaa, 0x68, 0x9e, 0x84, 0x72 },
+ { 0x00, 0xbb, 0x6b, 0xd0, 0xcb, 0x70, 0xa0, 0x1b,
+ 0x96, 0x2d, 0xfd, 0x46, 0x5d, 0xe6, 0x36, 0x8d },
+ { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+ 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xf7, 0xee, 0x19, 0xdc, 0x2b, 0x32, 0xc5,
+ 0xb8, 0x4f, 0x56, 0xa1, 0x64, 0x93, 0x8a, 0x7d },
+ { 0x00, 0xbb, 0x76, 0xcd, 0xec, 0x57, 0x9a, 0x21,
+ 0xc5, 0x7e, 0xb3, 0x08, 0x29, 0x92, 0x5f, 0xe4 },
+ { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xf8, 0xf0, 0x08, 0xe0, 0x18, 0x10, 0xe8,
+ 0xc0, 0x38, 0x30, 0xc8, 0x20, 0xd8, 0xd0, 0x28 },
+ { 0x00, 0xbb, 0x76, 0xcd, 0xec, 0x57, 0x9a, 0x21,
+ 0xc5, 0x7e, 0xb3, 0x08, 0x29, 0x92, 0x5f, 0xe4 },
+ { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+ 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xf9, 0xf2, 0x0b, 0xe4, 0x1d, 0x16, 0xef,
+ 0xc8, 0x31, 0x3a, 0xc3, 0x2c, 0xd5, 0xde, 0x27 },
+ { 0x00, 0xbb, 0x76, 0xcd, 0xec, 0x57, 0x9a, 0x21,
+ 0xd8, 0x63, 0xae, 0x15, 0x34, 0x8f, 0x42, 0xf9 },
+ { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xfa, 0xf4, 0x0e, 0xe8, 0x12, 0x1c, 0xe6,
+ 0xd0, 0x2a, 0x24, 0xde, 0x38, 0xc2, 0xcc, 0x36 },
+ { 0x00, 0xbb, 0x76, 0xcd, 0xec, 0x57, 0x9a, 0x21,
+ 0xd8, 0x63, 0xae, 0x15, 0x34, 0x8f, 0x42, 0xf9 },
+ { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+ 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xfb, 0xf6, 0x0d, 0xec, 0x17, 0x1a, 0xe1,
+ 0xd8, 0x23, 0x2e, 0xd5, 0x34, 0xcf, 0xc2, 0x39 },
+ { 0x00, 0xbb, 0x76, 0xcd, 0xf1, 0x4a, 0x87, 0x3c,
+ 0xff, 0x44, 0x89, 0x32, 0x0e, 0xb5, 0x78, 0xc3 },
+ { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xfc, 0xf8, 0x04, 0xf0, 0x0c, 0x08, 0xf4,
+ 0xe0, 0x1c, 0x18, 0xe4, 0x10, 0xec, 0xe8, 0x14 },
+ { 0x00, 0xbb, 0x76, 0xcd, 0xf1, 0x4a, 0x87, 0x3c,
+ 0xff, 0x44, 0x89, 0x32, 0x0e, 0xb5, 0x78, 0xc3 },
+ { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+ 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xfd, 0xfa, 0x07, 0xf4, 0x09, 0x0e, 0xf3,
+ 0xe8, 0x15, 0x12, 0xef, 0x1c, 0xe1, 0xe6, 0x1b },
+ { 0x00, 0xbb, 0x76, 0xcd, 0xf1, 0x4a, 0x87, 0x3c,
+ 0xe2, 0x59, 0x94, 0x2f, 0x13, 0xa8, 0x65, 0xde },
+ { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xfe, 0xfc, 0x02, 0xf8, 0x06, 0x04, 0xfa,
+ 0xf0, 0x0e, 0x0c, 0xf2, 0x08, 0xf6, 0xf4, 0x0a },
+ { 0x00, 0xbb, 0x76, 0xcd, 0xf1, 0x4a, 0x87, 0x3c,
+ 0xe2, 0x59, 0x94, 0x2f, 0x13, 0xa8, 0x65, 0xde },
+ { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+ 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+ { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+ 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+ { 0x00, 0xff, 0xfe, 0x01, 0xfc, 0x03, 0x02, 0xfd,
+ 0xf8, 0x07, 0x06, 0xf9, 0x04, 0xfb, 0xfa, 0x05 }
+};
+/* END CSTYLED */
+#endif /* defined(HAVE_SSSE3) || defined(HAVE_AVX2) || defined(HAVE_AVX512BW) */
+#endif /* defined(__x86_64) */
diff --git a/sys/contrib/openzfs/module/zfs/vdev_rebuild.c b/sys/contrib/openzfs/module/zfs/vdev_rebuild.c
new file mode 100644
index 000000000000..784d1af15a81
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/vdev_rebuild.c
@@ -0,0 +1,1147 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ *
+ * Copyright (c) 2018, Intel Corporation.
+ * Copyright (c) 2020 by Lawrence Livermore National Security, LLC.
+ */
+
+#include <sys/vdev_impl.h>
+#include <sys/vdev_draid.h>
+#include <sys/dsl_scan.h>
+#include <sys/spa_impl.h>
+#include <sys/metaslab_impl.h>
+#include <sys/vdev_rebuild.h>
+#include <sys/zio.h>
+#include <sys/dmu_tx.h>
+#include <sys/arc.h>
+#include <sys/zap.h>
+
+/*
+ * This file contains the sequential reconstruction implementation for
+ * resilvering. This form of resilvering is internally referred to as device
+ * rebuild to avoid conflating it with the traditional healing reconstruction
+ * performed by the dsl scan code.
+ *
+ * When replacing a device, or scrubbing the pool, ZFS has historically used
+ * a process called resilvering which is a form of healing reconstruction.
+ * This approach has the advantage that as blocks are read from disk their
+ * checksums can be immediately verified and the data repaired. Unfortunately,
+ * it also results in a random IO pattern to the disk even when extra care
+ * is taken to sequentialize the IO as much as possible. This substantially
+ * increases the time required to resilver the pool and restore redundancy.
+ *
+ * For mirrored devices it's possible to implement an alternate sequential
+ * reconstruction strategy when resilvering. Sequential reconstruction
+ * behaves like a traditional RAID rebuild and reconstructs a device in LBA
+ * order without verifying the checksum. After this phase completes a second
+ * scrub phase is started to verify all of the checksums.  This two-phase
+ * process will take longer than the healing reconstruction described above.
+ * However, it has the advantage that redundancy is restored as soon as the
+ * first reconstruction phase completes.  At this point the pool can incur
+ * another device failure without risking data loss.
+ *
+ * There are a few noteworthy limitations, as well as advantages, to
+ * resilvering using sequential reconstruction vs. healing reconstruction.
+ *
+ * Limitations:
+ *
+ * - Sequential reconstruction is not possible on RAIDZ due to its
+ * variable stripe width. Note dRAID uses a fixed stripe width which
+ * avoids this issue, but comes at the expense of some usable capacity.
+ *
+ * - Block checksums are not verified during sequential reconstruction.
+ * Similar to traditional RAID the parity/mirror data is reconstructed
+ * but cannot be immediately double checked. For this reason when the
+ * last active resilver completes the pool is automatically scrubbed
+ * by default.
+ *
+ * - Deferred resilvers using sequential reconstruction are not currently
+ * supported. When adding another vdev to an active top-level resilver
+ * it must be restarted.
+ *
+ * Advantages:
+ *
+ * - Sequential reconstruction is performed in LBA order which may be faster
+ *   than healing reconstruction, particularly when using HDDs (or
+ * especially with SMR devices). Only allocated capacity is resilvered.
+ *
+ * - Sequential reconstruction is not constrained by ZFS block boundaries.
+ *   This allows it to issue larger IOs to disk which span multiple blocks,
+ *   so that all of those logical blocks can be repaired with a single IO.
+ *
+ * - Unlike a healing resilver or scrub, which are pool-wide operations,
+ *   sequential reconstruction is handled by the top-level vdevs.  This
+ *   allows it to be started or canceled on a top-level vdev without
+ * impacting any other top-level vdevs in the pool.
+ *
+ * - Data only referenced by a pool checkpoint will be repaired because
+ *   that space is reflected in the space maps.  This differs from a
+ *   healing resilver or scrub, which will not repair that data.
+ */
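+
+/*
+ * As a user-facing sketch of the above (command syntax per zpool(8), not
+ * defined in this file): a sequential rebuild is typically requested with
+ * the -s flag, e.g. "zpool attach -s <pool> <disk> <new-disk>" or
+ * "zpool replace -s <pool> <disk> <new-disk>", and its progress is shown
+ * by "zpool status".
+ */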
+
+/*
+ * Size of rebuild reads; defaults to 1MiB per data disk and is capped at
+ * SPA_MAXBLOCKSIZE.
+ */
+unsigned long zfs_rebuild_max_segment = 1024 * 1024;
+
+/*
+ * Maximum number of rebuild I/O bytes which may be in flight per leaf vdev
+ * during a sequential resilver.  We attempt to strike a balance here between
+ * keeping the vdev queues full of I/Os at all times and not overflowing the
+ * queues, which would cause long latency and in turn long txg sync times.
+ *
+ * A large default value can be safely used here because the default target
+ * segment size is also large (zfs_rebuild_max_segment=1M). This helps keep
+ * the queue depth short.
+ *
+ * 32MB was selected as the default value to achieve good performance with
+ * a large 90-drive dRAID HDD configuration (draid2:8d:90c:2s). A sequential
+ * rebuild was unable to saturate all of the drives using smaller values.
+ * With a value of 32MB the sequential resilver write rate was measured at
+ * 800MB/s sustained while rebuilding to a distributed spare.
+ */
+unsigned long zfs_rebuild_vdev_limit = 32 << 20;
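+
+/*
+ * Roughly speaking, with the defaults above (a 32MB per-vdev limit and 1MB
+ * target segments) on the order of 32 rebuild I/Os may be in flight per
+ * leaf vdev at any time; the figure is only illustrative, since the exact
+ * depth depends on how vdev_rebuild_range() sizes each segment.
+ */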
+
+/*
+ * Automatically start a pool scrub when the last active sequential resilver
+ * completes in order to verify the checksums of all blocks which have been
+ * resilvered. This option is enabled by default and is strongly recommended.
+ */
+int zfs_rebuild_scrub_enabled = 1;
+
+/*
+ * For vdev_rebuild_initiate_sync() and vdev_rebuild_reset_sync().
+ */
+static void vdev_rebuild_thread(void *arg);
+
+/*
+ * Clear the per-vdev rebuild bytes value for a vdev tree.
+ */
+static void
+clear_rebuild_bytes(vdev_t *vd)
+{
+ vdev_stat_t *vs = &vd->vdev_stat;
+
+ for (uint64_t i = 0; i < vd->vdev_children; i++)
+ clear_rebuild_bytes(vd->vdev_child[i]);
+
+ mutex_enter(&vd->vdev_stat_lock);
+ vs->vs_rebuild_processed = 0;
+ mutex_exit(&vd->vdev_stat_lock);
+}
+
+/*
+ * Determines whether a vdev_rebuild_thread() should be stopped.
+ */
+static boolean_t
+vdev_rebuild_should_stop(vdev_t *vd)
+{
+ return (!vdev_writeable(vd) || vd->vdev_removing ||
+ vd->vdev_rebuild_exit_wanted ||
+ vd->vdev_rebuild_cancel_wanted ||
+ vd->vdev_rebuild_reset_wanted);
+}
+
+/*
+ * Determine if the rebuild should be canceled. This may happen when all
+ * vdevs with MISSING DTLs are detached.
+ */
+static boolean_t
+vdev_rebuild_should_cancel(vdev_t *vd)
+{
+ vdev_rebuild_t *vr = &vd->vdev_rebuild_config;
+ vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
+
+ if (!vdev_resilver_needed(vd, &vrp->vrp_min_txg, &vrp->vrp_max_txg))
+ return (B_TRUE);
+
+ return (B_FALSE);
+}
+
+/*
+ * The sync task for updating the on-disk state of a rebuild. This is
+ * scheduled by vdev_rebuild_range().
+ */
+static void
+vdev_rebuild_update_sync(void *arg, dmu_tx_t *tx)
+{
+ int vdev_id = (uintptr_t)arg;
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+ vdev_t *vd = vdev_lookup_top(spa, vdev_id);
+ vdev_rebuild_t *vr = &vd->vdev_rebuild_config;
+ vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
+ uint64_t txg = dmu_tx_get_txg(tx);
+
+ mutex_enter(&vd->vdev_rebuild_lock);
+
+ if (vr->vr_scan_offset[txg & TXG_MASK] > 0) {
+ vrp->vrp_last_offset = vr->vr_scan_offset[txg & TXG_MASK];
+ vr->vr_scan_offset[txg & TXG_MASK] = 0;
+ }
+
+ vrp->vrp_scan_time_ms = vr->vr_prev_scan_time_ms +
+ NSEC2MSEC(gethrtime() - vr->vr_pass_start_time);
+
+ VERIFY0(zap_update(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap,
+ VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t),
+ REBUILD_PHYS_ENTRIES, vrp, tx));
+
+ mutex_exit(&vd->vdev_rebuild_lock);
+}
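+
+/*
+ * Illustrative sketch (not part of the change itself): vr_scan_offset[] is
+ * indexed by (txg & TXG_MASK), giving each of the TXG_SIZE (4) in-flight
+ * txgs its own slot and reusing slots as txgs retire. A minimal standalone
+ * demonstration:
+ *
+ *   #define TXG_SIZE 4
+ *   #define TXG_MASK (TXG_SIZE - 1)
+ *
+ *   for (uint64_t txg = 100; txg < 108; txg++)
+ *           printf("txg %llu -> slot %llu\n", (u_longlong_t)txg,
+ *               (u_longlong_t)(txg & TXG_MASK));
+ *
+ *   // Prints slots 0,1,2,3,0,1,2,3. By the time txg 104 reuses slot 0,
+ *   // txg 100 has synced and vdev_rebuild_update_sync() has cleared it.
+ */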
+
+/*
+ * Initialize the on-disk state for a new rebuild, start the rebuild thread.
+ */
+static void
+vdev_rebuild_initiate_sync(void *arg, dmu_tx_t *tx)
+{
+ int vdev_id = (uintptr_t)arg;
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+ vdev_t *vd = vdev_lookup_top(spa, vdev_id);
+ vdev_rebuild_t *vr = &vd->vdev_rebuild_config;
+ vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
+
+ ASSERT(vd->vdev_rebuilding);
+
+ spa_feature_incr(vd->vdev_spa, SPA_FEATURE_DEVICE_REBUILD, tx);
+
+ mutex_enter(&vd->vdev_rebuild_lock);
+ bzero(vrp, sizeof (uint64_t) * REBUILD_PHYS_ENTRIES);
+ vrp->vrp_rebuild_state = VDEV_REBUILD_ACTIVE;
+ vrp->vrp_min_txg = 0;
+ vrp->vrp_max_txg = dmu_tx_get_txg(tx);
+ vrp->vrp_start_time = gethrestime_sec();
+ vrp->vrp_scan_time_ms = 0;
+ vr->vr_prev_scan_time_ms = 0;
+
+ /*
+ * Rebuilds are currently only used when replacing a device, in which
+ * case there must be DTL_MISSING entries. In the future, we could
+ * allow rebuilds to be used in a way similar to a scrub. This would
+ * be useful because it would allow us to rebuild the space used by
+ * pool checkpoints.
+ */
+ VERIFY(vdev_resilver_needed(vd, &vrp->vrp_min_txg, &vrp->vrp_max_txg));
+
+ VERIFY0(zap_update(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap,
+ VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t),
+ REBUILD_PHYS_ENTRIES, vrp, tx));
+
+ spa_history_log_internal(spa, "rebuild", tx,
+ "vdev_id=%llu vdev_guid=%llu started",
+ (u_longlong_t)vd->vdev_id, (u_longlong_t)vd->vdev_guid);
+
+ ASSERT3P(vd->vdev_rebuild_thread, ==, NULL);
+ vd->vdev_rebuild_thread = thread_create(NULL, 0,
+ vdev_rebuild_thread, vd, 0, &p0, TS_RUN, maxclsyspri);
+
+ mutex_exit(&vd->vdev_rebuild_lock);
+}
+
+static void
+vdev_rebuild_log_notify(spa_t *spa, vdev_t *vd, char *name)
+{
+ nvlist_t *aux = fnvlist_alloc();
+
+ fnvlist_add_string(aux, ZFS_EV_RESILVER_TYPE, "sequential");
+ spa_event_notify(spa, vd, aux, name);
+ nvlist_free(aux);
+}
+
+/*
+ * Called to request that a new rebuild be started. The feature will remain
+ * active for the duration of the rebuild, then revert to the enabled state.
+ */
+static void
+vdev_rebuild_initiate(vdev_t *vd)
+{
+ spa_t *spa = vd->vdev_spa;
+
+ ASSERT(vd->vdev_top == vd);
+ ASSERT(MUTEX_HELD(&vd->vdev_rebuild_lock));
+ ASSERT(!vd->vdev_rebuilding);
+
+ dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
+ VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
+
+ vd->vdev_rebuilding = B_TRUE;
+
+ dsl_sync_task_nowait(spa_get_dsl(spa), vdev_rebuild_initiate_sync,
+ (void *)(uintptr_t)vd->vdev_id, tx);
+ dmu_tx_commit(tx);
+
+ vdev_rebuild_log_notify(spa, vd, ESC_ZFS_RESILVER_START);
+}
+
+/*
+ * Update the on-disk state to completed when a rebuild finishes.
+ */
+static void
+vdev_rebuild_complete_sync(void *arg, dmu_tx_t *tx)
+{
+ int vdev_id = (uintptr_t)arg;
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+ vdev_t *vd = vdev_lookup_top(spa, vdev_id);
+ vdev_rebuild_t *vr = &vd->vdev_rebuild_config;
+ vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
+
+ mutex_enter(&vd->vdev_rebuild_lock);
+ vrp->vrp_rebuild_state = VDEV_REBUILD_COMPLETE;
+ vrp->vrp_end_time = gethrestime_sec();
+
+ VERIFY0(zap_update(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap,
+ VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t),
+ REBUILD_PHYS_ENTRIES, vrp, tx));
+
+ vdev_dtl_reassess(vd, tx->tx_txg, vrp->vrp_max_txg, B_TRUE, B_TRUE);
+ spa_feature_decr(vd->vdev_spa, SPA_FEATURE_DEVICE_REBUILD, tx);
+
+ spa_history_log_internal(spa, "rebuild", tx,
+ "vdev_id=%llu vdev_guid=%llu complete",
+ (u_longlong_t)vd->vdev_id, (u_longlong_t)vd->vdev_guid);
+ vdev_rebuild_log_notify(spa, vd, ESC_ZFS_RESILVER_FINISH);
+
+ /* Handles detaching of spares */
+ spa_async_request(spa, SPA_ASYNC_REBUILD_DONE);
+ vd->vdev_rebuilding = B_FALSE;
+ mutex_exit(&vd->vdev_rebuild_lock);
+
+ /*
+ * While we're in syncing context take the opportunity to
+ * setup the scrub when there are no more active rebuilds.
+ */
+ if (!vdev_rebuild_active(spa->spa_root_vdev) &&
+ zfs_rebuild_scrub_enabled) {
+ pool_scan_func_t func = POOL_SCAN_SCRUB;
+ dsl_scan_setup_sync(&func, tx);
+ }
+
+ cv_broadcast(&vd->vdev_rebuild_cv);
+}
+
+/*
+ * Update the on-disk state to canceled when a rebuild finishes.
+ */
+static void
+vdev_rebuild_cancel_sync(void *arg, dmu_tx_t *tx)
+{
+ int vdev_id = (uintptr_t)arg;
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+ vdev_t *vd = vdev_lookup_top(spa, vdev_id);
+ vdev_rebuild_t *vr = &vd->vdev_rebuild_config;
+ vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
+
+ mutex_enter(&vd->vdev_rebuild_lock);
+ vrp->vrp_rebuild_state = VDEV_REBUILD_CANCELED;
+ vrp->vrp_end_time = gethrestime_sec();
+
+ VERIFY0(zap_update(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap,
+ VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t),
+ REBUILD_PHYS_ENTRIES, vrp, tx));
+
+ spa_feature_decr(vd->vdev_spa, SPA_FEATURE_DEVICE_REBUILD, tx);
+
+ spa_history_log_internal(spa, "rebuild", tx,
+ "vdev_id=%llu vdev_guid=%llu canceled",
+ (u_longlong_t)vd->vdev_id, (u_longlong_t)vd->vdev_guid);
+ vdev_rebuild_log_notify(spa, vd, ESC_ZFS_RESILVER_FINISH);
+
+ vd->vdev_rebuild_cancel_wanted = B_FALSE;
+ vd->vdev_rebuilding = B_FALSE;
+ mutex_exit(&vd->vdev_rebuild_lock);
+
+ spa_notify_waiters(spa);
+ cv_broadcast(&vd->vdev_rebuild_cv);
+}
+
+/*
+ * Resets the progress of a running rebuild. This will occur when a new
+ * device is attached and must participate in the rebuild.
+ */
+static void
+vdev_rebuild_reset_sync(void *arg, dmu_tx_t *tx)
+{
+ int vdev_id = (uintptr_t)arg;
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+ vdev_t *vd = vdev_lookup_top(spa, vdev_id);
+ vdev_rebuild_t *vr = &vd->vdev_rebuild_config;
+ vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
+
+ mutex_enter(&vd->vdev_rebuild_lock);
+
+ ASSERT(vrp->vrp_rebuild_state == VDEV_REBUILD_ACTIVE);
+ ASSERT3P(vd->vdev_rebuild_thread, ==, NULL);
+
+ vrp->vrp_last_offset = 0;
+ vrp->vrp_min_txg = 0;
+ vrp->vrp_max_txg = dmu_tx_get_txg(tx);
+ vrp->vrp_bytes_scanned = 0;
+ vrp->vrp_bytes_issued = 0;
+ vrp->vrp_bytes_rebuilt = 0;
+ vrp->vrp_bytes_est = 0;
+ vrp->vrp_scan_time_ms = 0;
+ vr->vr_prev_scan_time_ms = 0;
+
+ /* See vdev_rebuild_initiate_sync comment */
+ VERIFY(vdev_resilver_needed(vd, &vrp->vrp_min_txg, &vrp->vrp_max_txg));
+
+ VERIFY0(zap_update(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap,
+ VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t),
+ REBUILD_PHYS_ENTRIES, vrp, tx));
+
+ spa_history_log_internal(spa, "rebuild", tx,
+ "vdev_id=%llu vdev_guid=%llu reset",
+ (u_longlong_t)vd->vdev_id, (u_longlong_t)vd->vdev_guid);
+
+ vd->vdev_rebuild_reset_wanted = B_FALSE;
+ ASSERT(vd->vdev_rebuilding);
+
+ vd->vdev_rebuild_thread = thread_create(NULL, 0,
+ vdev_rebuild_thread, vd, 0, &p0, TS_RUN, maxclsyspri);
+
+ mutex_exit(&vd->vdev_rebuild_lock);
+}
+
+/*
+ * Clear the last rebuild status.
+ */
+void
+vdev_rebuild_clear_sync(void *arg, dmu_tx_t *tx)
+{
+ int vdev_id = (uintptr_t)arg;
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+ vdev_t *vd = vdev_lookup_top(spa, vdev_id);
+ vdev_rebuild_t *vr = &vd->vdev_rebuild_config;
+ vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
+ objset_t *mos = spa_meta_objset(spa);
+
+ mutex_enter(&vd->vdev_rebuild_lock);
+
+ if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REBUILD) ||
+ vrp->vrp_rebuild_state == VDEV_REBUILD_ACTIVE) {
+ mutex_exit(&vd->vdev_rebuild_lock);
+ return;
+ }
+
+ clear_rebuild_bytes(vd);
+ bzero(vrp, sizeof (uint64_t) * REBUILD_PHYS_ENTRIES);
+
+ if (vd->vdev_top_zap != 0 && zap_contains(mos, vd->vdev_top_zap,
+ VDEV_TOP_ZAP_VDEV_REBUILD_PHYS) == 0) {
+ VERIFY0(zap_update(mos, vd->vdev_top_zap,
+ VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t),
+ REBUILD_PHYS_ENTRIES, vrp, tx));
+ }
+
+ mutex_exit(&vd->vdev_rebuild_lock);
+}
+
+/*
+ * The zio_done_func_t callback for each rebuild I/O issued. It's responsible
+ * for updating the rebuild stats and limiting the number of in-flight I/Os.
+ */
+static void
+vdev_rebuild_cb(zio_t *zio)
+{
+ vdev_rebuild_t *vr = zio->io_private;
+ vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
+ vdev_t *vd = vr->vr_top_vdev;
+
+ mutex_enter(&vr->vr_io_lock);
+ if (zio->io_error == ENXIO && !vdev_writeable(vd)) {
+ /*
+ * The I/O failed because the top-level vdev was unavailable.
+ * Attempt to roll back to the last completed offset, in order to
+ * resume from the correct location if the pool is resumed.
+ * (This works because spa_sync waits on spa_txg_zio before
+ * it runs sync tasks.)
+ */
+ uint64_t *off = &vr->vr_scan_offset[zio->io_txg & TXG_MASK];
+ *off = MIN(*off, zio->io_offset);
+ } else if (zio->io_error) {
+ vrp->vrp_errors++;
+ }
+
+ abd_free(zio->io_abd);
+
+ ASSERT3U(vr->vr_bytes_inflight, >, 0);
+ vr->vr_bytes_inflight -= zio->io_size;
+ cv_broadcast(&vr->vr_io_cv);
+ mutex_exit(&vr->vr_io_lock);
+
+ spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd);
+}
+
+/*
+ * Initialize a block pointer that can be used to read the given segment
+ * for sequential rebuild.
+ */
+static void
+vdev_rebuild_blkptr_init(blkptr_t *bp, vdev_t *vd, uint64_t start,
+ uint64_t asize)
+{
+ ASSERT(vd->vdev_ops == &vdev_draid_ops ||
+ vd->vdev_ops == &vdev_mirror_ops ||
+ vd->vdev_ops == &vdev_replacing_ops ||
+ vd->vdev_ops == &vdev_spare_ops);
+
+ uint64_t psize = vd->vdev_ops == &vdev_draid_ops ?
+ vdev_draid_asize_to_psize(vd, asize) : asize;
+
+ BP_ZERO(bp);
+
+ DVA_SET_VDEV(&bp->blk_dva[0], vd->vdev_id);
+ DVA_SET_OFFSET(&bp->blk_dva[0], start);
+ DVA_SET_GANG(&bp->blk_dva[0], 0);
+ DVA_SET_ASIZE(&bp->blk_dva[0], asize);
+
+ BP_SET_BIRTH(bp, TXG_INITIAL, TXG_INITIAL);
+ BP_SET_LSIZE(bp, psize);
+ BP_SET_PSIZE(bp, psize);
+ BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF);
+ BP_SET_CHECKSUM(bp, ZIO_CHECKSUM_OFF);
+ BP_SET_TYPE(bp, DMU_OT_NONE);
+ BP_SET_LEVEL(bp, 0);
+ BP_SET_DEDUP(bp, 0);
+ BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
+}
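+
+/*
+ * Simplified model (not part of the change itself): for the mirror-like
+ * vdev types above psize simply equals asize, while for dRAID part of each
+ * allocation is parity. Ignoring row rounding, the asize -> psize
+ * conversion behaves roughly like the hypothetical helper below; the real
+ * implementation is vdev_draid_asize_to_psize().
+ *
+ *   static uint64_t
+ *   draid_psize_estimate(uint64_t asize, uint64_t ndata, uint64_t nparity)
+ *   {
+ *           return (asize / (ndata + nparity) * ndata);
+ *   }
+ *
+ *   // e.g. a draid2 group with 8 data disks: 10 MiB allocated maps to
+ *   // roughly 8 MiB of data to read and repair.
+ */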
+
+/*
+ * Issues a rebuild I/O and takes care of rate limiting the number of queued
+ * rebuild I/Os. The provided start and size must be properly aligned for the
+ * top-level vdev type being rebuilt.
+ */
+static int
+vdev_rebuild_range(vdev_rebuild_t *vr, uint64_t start, uint64_t size)
+{
+ uint64_t ms_id __maybe_unused = vr->vr_scan_msp->ms_id;
+ vdev_t *vd = vr->vr_top_vdev;
+ spa_t *spa = vd->vdev_spa;
+ blkptr_t blk;
+
+ ASSERT3U(ms_id, ==, start >> vd->vdev_ms_shift);
+ ASSERT3U(ms_id, ==, (start + size - 1) >> vd->vdev_ms_shift);
+
+ vr->vr_pass_bytes_scanned += size;
+ vr->vr_rebuild_phys.vrp_bytes_scanned += size;
+
+ /*
+ * Rebuild the data in this range by constructing a special block
+ * pointer. It has no relation to any existing blocks in the pool.
+ * However, by disabling checksum verification and issuing a scrub IO
+ * we can reconstruct and repair any children with missing data.
+ */
+ vdev_rebuild_blkptr_init(&blk, vd, start, size);
+ uint64_t psize = BP_GET_PSIZE(&blk);
+
+ if (!vdev_dtl_need_resilver(vd, &blk.blk_dva[0], psize, TXG_UNKNOWN))
+ return (0);
+
+ mutex_enter(&vr->vr_io_lock);
+
+ /* Limit in flight rebuild I/Os */
+ while (vr->vr_bytes_inflight >= vr->vr_bytes_inflight_max)
+ cv_wait(&vr->vr_io_cv, &vr->vr_io_lock);
+
+ vr->vr_bytes_inflight += psize;
+ mutex_exit(&vr->vr_io_lock);
+
+ dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
+ VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
+ uint64_t txg = dmu_tx_get_txg(tx);
+
+ spa_config_enter(spa, SCL_STATE_ALL, vd, RW_READER);
+ mutex_enter(&vd->vdev_rebuild_lock);
+
+ /* This is the first I/O for this txg. */
+ if (vr->vr_scan_offset[txg & TXG_MASK] == 0) {
+ vr->vr_scan_offset[txg & TXG_MASK] = start;
+ dsl_sync_task_nowait(spa_get_dsl(spa),
+ vdev_rebuild_update_sync,
+ (void *)(uintptr_t)vd->vdev_id, tx);
+ }
+
+ /* When exiting, write out our progress. */
+ if (vdev_rebuild_should_stop(vd)) {
+ mutex_enter(&vr->vr_io_lock);
+ vr->vr_bytes_inflight -= psize;
+ mutex_exit(&vr->vr_io_lock);
+ spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd);
+ mutex_exit(&vd->vdev_rebuild_lock);
+ dmu_tx_commit(tx);
+ return (SET_ERROR(EINTR));
+ }
+ mutex_exit(&vd->vdev_rebuild_lock);
+ dmu_tx_commit(tx);
+
+ vr->vr_scan_offset[txg & TXG_MASK] = start + size;
+ vr->vr_pass_bytes_issued += size;
+ vr->vr_rebuild_phys.vrp_bytes_issued += size;
+
+ zio_nowait(zio_read(spa->spa_txg_zio[txg & TXG_MASK], spa, &blk,
+ abd_alloc(psize, B_FALSE), psize, vdev_rebuild_cb, vr,
+ ZIO_PRIORITY_REBUILD, ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL |
+ ZIO_FLAG_RESILVER, NULL));
+
+ return (0);
+}
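+
+/*
+ * Illustrative sketch (not part of the change itself): the vr_io_lock /
+ * vr_io_cv pair used above is a simple byte-counting throttle. A
+ * standalone userland analogue using pthreads (all names hypothetical):
+ *
+ *   static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
+ *   static pthread_cond_t cv = PTHREAD_COND_INITIALIZER;
+ *   static uint64_t inflight, inflight_max = 32ULL << 20;
+ *
+ *   static void
+ *   throttle_issue(uint64_t bytes)      // before issuing an I/O
+ *   {
+ *           pthread_mutex_lock(&lock);
+ *           while (inflight >= inflight_max)
+ *                   pthread_cond_wait(&cv, &lock);
+ *           inflight += bytes;
+ *           pthread_mutex_unlock(&lock);
+ *   }
+ *
+ *   static void
+ *   throttle_done(uint64_t bytes)       // from the completion callback
+ *   {
+ *           pthread_mutex_lock(&lock);
+ *           inflight -= bytes;
+ *           pthread_cond_broadcast(&cv);
+ *           pthread_mutex_unlock(&lock);
+ *   }
+ */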
+
+/*
+ * Issues rebuild I/Os for all ranges in the provided vr->vr_tree range tree.
+ */
+static int
+vdev_rebuild_ranges(vdev_rebuild_t *vr)
+{
+ vdev_t *vd = vr->vr_top_vdev;
+ zfs_btree_t *t = &vr->vr_scan_tree->rt_root;
+ zfs_btree_index_t idx;
+ int error;
+
+ for (range_seg_t *rs = zfs_btree_first(t, &idx); rs != NULL;
+ rs = zfs_btree_next(t, &idx, &idx)) {
+ uint64_t start = rs_get_start(rs, vr->vr_scan_tree);
+ uint64_t size = rs_get_end(rs, vr->vr_scan_tree) - start;
+
+ /*
+ * zfs_scan_suspend_progress can be set to disable rebuild
+ * progress for testing. See comment in dsl_scan_sync().
+ */
+ while (zfs_scan_suspend_progress &&
+ !vdev_rebuild_should_stop(vd)) {
+ delay(hz);
+ }
+
+ while (size > 0) {
+ uint64_t chunk_size;
+
+ /*
+ * Split range into legally-sized logical chunks
+ * given the constraints of the top-level vdev
+ * being rebuilt (dRAID or mirror).
+ */
+ ASSERT3P(vd->vdev_ops, !=, NULL);
+ chunk_size = vd->vdev_ops->vdev_op_rebuild_asize(vd,
+ start, size, zfs_rebuild_max_segment);
+
+ error = vdev_rebuild_range(vr, start, chunk_size);
+ if (error != 0)
+ return (error);
+
+ size -= chunk_size;
+ start += chunk_size;
+ }
+ }
+
+ return (0);
+}
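+
+/*
+ * Illustrative sketch (not part of the change itself): each range is
+ * consumed in chunks no larger than zfs_rebuild_max_segment, with the
+ * vdev-specific vdev_op_rebuild_asize() callback rounding each chunk to a
+ * legal boundary for the mirror or dRAID layout. Conceptually:
+ *
+ *   while (size > 0) {
+ *           uint64_t chunk = MIN(size, zfs_rebuild_max_segment);
+ *           // the real code aligns 'chunk' via vdev_op_rebuild_asize()
+ *           issue_rebuild_io(start, chunk);
+ *           start += chunk;
+ *           size -= chunk;
+ *   }
+ */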
+
+/*
+ * Calculates the estimated capacity which remains to be scanned. Since
+ * we traverse the pool in metaslab order, only allocated capacity beyond
+ * the vrp_last_offset need be considered. All lower offsets must have
+ * already been rebuilt and are thus already included in vrp_bytes_scanned.
+ */
+static void
+vdev_rebuild_update_bytes_est(vdev_t *vd, uint64_t ms_id)
+{
+ vdev_rebuild_t *vr = &vd->vdev_rebuild_config;
+ vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
+ uint64_t bytes_est = vrp->vrp_bytes_scanned;
+
+ if (vrp->vrp_last_offset < vd->vdev_ms[ms_id]->ms_start)
+ return;
+
+ for (uint64_t i = ms_id; i < vd->vdev_ms_count; i++) {
+ metaslab_t *msp = vd->vdev_ms[i];
+
+ mutex_enter(&msp->ms_lock);
+ bytes_est += metaslab_allocated_space(msp);
+ mutex_exit(&msp->ms_lock);
+ }
+
+ vrp->vrp_bytes_est = bytes_est;
+}
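+
+/*
+ * Worked example (not part of the change itself): with three metaslabs
+ * holding 10, 20 and 30 GiB of allocated space, and the scan part way
+ * through the second metaslab having scanned 15 GiB so far:
+ *
+ *   uint64_t bytes_est = 15 + (20 + 30);    // GiB, == 65 GiB
+ *
+ * i.e. everything already scanned plus all allocated space from the
+ * current metaslab onward. The few GiB of the current metaslab counted
+ * twice keep the estimate slightly conservative until the scan moves on.
+ */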
+
+/*
+ * Load from disk the top-level vdev's rebuild information.
+ */
+int
+vdev_rebuild_load(vdev_t *vd)
+{
+ vdev_rebuild_t *vr = &vd->vdev_rebuild_config;
+ vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
+ spa_t *spa = vd->vdev_spa;
+ int err = 0;
+
+ mutex_enter(&vd->vdev_rebuild_lock);
+ vd->vdev_rebuilding = B_FALSE;
+
+ if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REBUILD)) {
+ bzero(vrp, sizeof (uint64_t) * REBUILD_PHYS_ENTRIES);
+ mutex_exit(&vd->vdev_rebuild_lock);
+ return (SET_ERROR(ENOTSUP));
+ }
+
+ ASSERT(vd->vdev_top == vd);
+
+ err = zap_lookup(spa->spa_meta_objset, vd->vdev_top_zap,
+ VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t),
+ REBUILD_PHYS_ENTRIES, vrp);
+
+ /*
+ * A missing or damaged VDEV_TOP_ZAP_VDEV_REBUILD_PHYS should
+ * not prevent a pool from being imported. Clear the rebuild
+ * status allowing a new resilver/rebuild to be started.
+ */
+ if (err == ENOENT || err == EOVERFLOW || err == ECKSUM) {
+ bzero(vrp, sizeof (uint64_t) * REBUILD_PHYS_ENTRIES);
+ } else if (err) {
+ mutex_exit(&vd->vdev_rebuild_lock);
+ return (err);
+ }
+
+ vr->vr_prev_scan_time_ms = vrp->vrp_scan_time_ms;
+ vr->vr_top_vdev = vd;
+
+ mutex_exit(&vd->vdev_rebuild_lock);
+
+ return (0);
+}
+
+/*
+ * Each scan thread is responsible for rebuilding a top-level vdev. The
+ * rebuild progress is tracked on disk in VDEV_TOP_ZAP_VDEV_REBUILD_PHYS.
+ */
+static void
+vdev_rebuild_thread(void *arg)
+{
+ vdev_t *vd = arg;
+ spa_t *spa = vd->vdev_spa;
+ int error = 0;
+
+ /*
+ * If there's a scrub in process, request that it be stopped. This
+ * is not required for a correct rebuild, but we do want rebuilds to
+ * emulate the resilver behavior as much as possible.
+ */
+ dsl_pool_t *dsl = spa_get_dsl(spa);
+ if (dsl_scan_scrubbing(dsl))
+ dsl_scan_cancel(dsl);
+
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+ mutex_enter(&vd->vdev_rebuild_lock);
+
+ ASSERT3P(vd->vdev_top, ==, vd);
+ ASSERT3P(vd->vdev_rebuild_thread, !=, NULL);
+ ASSERT(vd->vdev_rebuilding);
+ ASSERT(spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REBUILD));
+ ASSERT3B(vd->vdev_rebuild_cancel_wanted, ==, B_FALSE);
+ ASSERT3B(vd->vdev_rebuild_reset_wanted, ==, B_FALSE);
+
+ vdev_rebuild_t *vr = &vd->vdev_rebuild_config;
+ vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
+ vr->vr_top_vdev = vd;
+ vr->vr_scan_msp = NULL;
+ vr->vr_scan_tree = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0);
+ mutex_init(&vr->vr_io_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&vr->vr_io_cv, NULL, CV_DEFAULT, NULL);
+
+ vr->vr_pass_start_time = gethrtime();
+ vr->vr_pass_bytes_scanned = 0;
+ vr->vr_pass_bytes_issued = 0;
+
+ vr->vr_bytes_inflight_max = MAX(1ULL << 20,
+ zfs_rebuild_vdev_limit * vd->vdev_children);
+
+ uint64_t update_est_time = gethrtime();
+ vdev_rebuild_update_bytes_est(vd, 0);
+
+ clear_rebuild_bytes(vr->vr_top_vdev);
+
+ mutex_exit(&vd->vdev_rebuild_lock);
+
+ /*
+ * Systematically walk the metaslabs and issue rebuild I/Os for
+ * all ranges in the allocated space map.
+ */
+ for (uint64_t i = 0; i < vd->vdev_ms_count; i++) {
+ metaslab_t *msp = vd->vdev_ms[i];
+ vr->vr_scan_msp = msp;
+
+ /*
+ * Removal of vdevs from the vdev tree may eliminate the need
+ * for the rebuild, in which case it should be canceled. The
+ * vdev_rebuild_cancel_wanted flag is set until the sync task
+ * completes. This may be after the rebuild thread exits.
+ */
+ if (vdev_rebuild_should_cancel(vd)) {
+ vd->vdev_rebuild_cancel_wanted = B_TRUE;
+ error = EINTR;
+ break;
+ }
+
+ ASSERT0(range_tree_space(vr->vr_scan_tree));
+
+ /* Disable any new allocations to this metaslab */
+ metaslab_disable(msp);
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+
+ mutex_enter(&msp->ms_sync_lock);
+ mutex_enter(&msp->ms_lock);
+
+ /*
+ * If there are outstanding allocations, wait for them to be
+ * synced. This is needed to ensure all allocated ranges are
+ * on disk and therefore will be rebuilt.
+ */
+ for (int j = 0; j < TXG_SIZE; j++) {
+ if (range_tree_space(msp->ms_allocating[j])) {
+ mutex_exit(&msp->ms_lock);
+ mutex_exit(&msp->ms_sync_lock);
+ txg_wait_synced(dsl, 0);
+ mutex_enter(&msp->ms_sync_lock);
+ mutex_enter(&msp->ms_lock);
+ break;
+ }
+ }
+
+ /*
+ * When a metaslab has been allocated from, read its allocated
+ * ranges from the space map object into the vr_scan_tree.
+ * Then add inflight / unflushed ranges and remove inflight /
+ * unflushed frees. This is the minimum range to be rebuilt.
+ */
+ if (msp->ms_sm != NULL) {
+ VERIFY0(space_map_load(msp->ms_sm,
+ vr->vr_scan_tree, SM_ALLOC));
+
+ for (int i = 0; i < TXG_SIZE; i++) {
+ ASSERT0(range_tree_space(
+ msp->ms_allocating[i]));
+ }
+
+ range_tree_walk(msp->ms_unflushed_allocs,
+ range_tree_add, vr->vr_scan_tree);
+ range_tree_walk(msp->ms_unflushed_frees,
+ range_tree_remove, vr->vr_scan_tree);
+
+ /*
+ * Remove ranges which have already been rebuilt based
+ * on the last offset. This can happen when restarting
+ * a scan after exporting and re-importing the pool.
+ */
+ range_tree_clear(vr->vr_scan_tree, 0,
+ vrp->vrp_last_offset);
+ }
+
+ mutex_exit(&msp->ms_lock);
+ mutex_exit(&msp->ms_sync_lock);
+
+ /*
+ * To provide an accurate estimate, re-calculate the estimated
+ * size every 5 minutes to account for recent allocations and
+ * frees made to space maps which have not yet been rebuilt.
+ */
+ if (gethrtime() > update_est_time + SEC2NSEC(300)) {
+ update_est_time = gethrtime();
+ vdev_rebuild_update_bytes_est(vd, i);
+ }
+
+ /*
+ * Walk the allocated space map and issue the rebuild I/O.
+ */
+ error = vdev_rebuild_ranges(vr);
+ range_tree_vacate(vr->vr_scan_tree, NULL, NULL);
+
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+ metaslab_enable(msp, B_FALSE, B_FALSE);
+
+ if (error != 0)
+ break;
+ }
+
+ range_tree_destroy(vr->vr_scan_tree);
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+
+ /* Wait for any remaining rebuild I/O to complete */
+ mutex_enter(&vr->vr_io_lock);
+ while (vr->vr_bytes_inflight > 0)
+ cv_wait(&vr->vr_io_cv, &vr->vr_io_lock);
+
+ mutex_exit(&vr->vr_io_lock);
+
+ mutex_destroy(&vr->vr_io_lock);
+ cv_destroy(&vr->vr_io_cv);
+
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+
+ dsl_pool_t *dp = spa_get_dsl(spa);
+ dmu_tx_t *tx = dmu_tx_create_dd(dp->dp_mos_dir);
+ VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
+
+ mutex_enter(&vd->vdev_rebuild_lock);
+ if (error == 0) {
+ /*
+ * After a successful rebuild clear the DTLs of all ranges
+ * which were missing when the rebuild was started. These
+ * ranges must have been rebuilt as a consequence of rebuilding
+ * all allocated space. Note that unlike a scrub or resilver
+ * the rebuild operation will reconstruct data only referenced
+ * by a pool checkpoint. See the dsl_scan_done() comments.
+ */
+ dsl_sync_task_nowait(dp, vdev_rebuild_complete_sync,
+ (void *)(uintptr_t)vd->vdev_id, tx);
+ } else if (vd->vdev_rebuild_cancel_wanted) {
+ /*
+ * The rebuild operation was canceled. This will occur when
+ * a device participating in the rebuild is detached.
+ */
+ dsl_sync_task_nowait(dp, vdev_rebuild_cancel_sync,
+ (void *)(uintptr_t)vd->vdev_id, tx);
+ } else if (vd->vdev_rebuild_reset_wanted) {
+ /*
+ * Reset the running rebuild without canceling and restarting
+ * it. This will occur when a new device is attached and must
+ * participate in the rebuild.
+ */
+ dsl_sync_task_nowait(dp, vdev_rebuild_reset_sync,
+ (void *)(uintptr_t)vd->vdev_id, tx);
+ } else {
+ /*
+ * The rebuild operation should be suspended. This may occur
+ * when detaching a child vdev or when exporting the pool. The
+ * rebuild is left in the active state so it will be resumed.
+ */
+ ASSERT(vrp->vrp_rebuild_state == VDEV_REBUILD_ACTIVE);
+ vd->vdev_rebuilding = B_FALSE;
+ }
+
+ dmu_tx_commit(tx);
+
+ vd->vdev_rebuild_thread = NULL;
+ mutex_exit(&vd->vdev_rebuild_lock);
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+
+ cv_broadcast(&vd->vdev_rebuild_cv);
+
+ thread_exit();
+}
+
+/*
+ * Returns B_TRUE if any top-level vdevs are rebuilding.
+ */
+boolean_t
+vdev_rebuild_active(vdev_t *vd)
+{
+ spa_t *spa = vd->vdev_spa;
+ boolean_t ret = B_FALSE;
+
+ if (vd == spa->spa_root_vdev) {
+ for (uint64_t i = 0; i < vd->vdev_children; i++) {
+ ret = vdev_rebuild_active(vd->vdev_child[i]);
+ if (ret)
+ return (ret);
+ }
+ } else if (vd->vdev_top_zap != 0) {
+ vdev_rebuild_t *vr = &vd->vdev_rebuild_config;
+ vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
+
+ mutex_enter(&vd->vdev_rebuild_lock);
+ ret = (vrp->vrp_rebuild_state == VDEV_REBUILD_ACTIVE);
+ mutex_exit(&vd->vdev_rebuild_lock);
+ }
+
+ return (ret);
+}
+
+/*
+ * Start a rebuild operation. The rebuild may be restarted when the
+ * top-level vdev is currently actively rebuilding.
+ */
+void
+vdev_rebuild(vdev_t *vd)
+{
+ vdev_rebuild_t *vr = &vd->vdev_rebuild_config;
+ vdev_rebuild_phys_t *vrp __maybe_unused = &vr->vr_rebuild_phys;
+
+ ASSERT(vd->vdev_top == vd);
+ ASSERT(vdev_is_concrete(vd));
+ ASSERT(!vd->vdev_removing);
+ ASSERT(spa_feature_is_enabled(vd->vdev_spa,
+ SPA_FEATURE_DEVICE_REBUILD));
+
+ mutex_enter(&vd->vdev_rebuild_lock);
+ if (vd->vdev_rebuilding) {
+ ASSERT3U(vrp->vrp_rebuild_state, ==, VDEV_REBUILD_ACTIVE);
+
+ /*
+ * Signal a running rebuild operation that it should restart
+ * from the beginning because a new device was attached. The
+ * vdev_rebuild_reset_wanted flag is set until the sync task
+ * completes. This may be after the rebuild thread exits.
+ */
+ if (!vd->vdev_rebuild_reset_wanted)
+ vd->vdev_rebuild_reset_wanted = B_TRUE;
+ } else {
+ vdev_rebuild_initiate(vd);
+ }
+ mutex_exit(&vd->vdev_rebuild_lock);
+}
+
+static void
+vdev_rebuild_restart_impl(vdev_t *vd)
+{
+ spa_t *spa = vd->vdev_spa;
+
+ if (vd == spa->spa_root_vdev) {
+ for (uint64_t i = 0; i < vd->vdev_children; i++)
+ vdev_rebuild_restart_impl(vd->vdev_child[i]);
+
+ } else if (vd->vdev_top_zap != 0) {
+ vdev_rebuild_t *vr = &vd->vdev_rebuild_config;
+ vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
+
+ mutex_enter(&vd->vdev_rebuild_lock);
+ if (vrp->vrp_rebuild_state == VDEV_REBUILD_ACTIVE &&
+ vdev_writeable(vd) && !vd->vdev_rebuilding) {
+ ASSERT(spa_feature_is_active(spa,
+ SPA_FEATURE_DEVICE_REBUILD));
+ vd->vdev_rebuilding = B_TRUE;
+ vd->vdev_rebuild_thread = thread_create(NULL, 0,
+ vdev_rebuild_thread, vd, 0, &p0, TS_RUN,
+ maxclsyspri);
+ }
+ mutex_exit(&vd->vdev_rebuild_lock);
+ }
+}
+
+/*
+ * Conditionally restart all of the vdev_rebuild_thread's for a pool. The
+ * feature flag must be active and the rebuild in the active state. This
+ * cannot be used to start a new rebuild.
+ */
+void
+vdev_rebuild_restart(spa_t *spa)
+{
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+ vdev_rebuild_restart_impl(spa->spa_root_vdev);
+}
+
+/*
+ * Stop and wait for all of the vdev_rebuild_thread's associated with the
+ * vdev tree provided to be terminated (canceled or stopped).
+ */
+void
+vdev_rebuild_stop_wait(vdev_t *vd)
+{
+ spa_t *spa = vd->vdev_spa;
+
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+ if (vd == spa->spa_root_vdev) {
+ for (uint64_t i = 0; i < vd->vdev_children; i++)
+ vdev_rebuild_stop_wait(vd->vdev_child[i]);
+
+ } else if (vd->vdev_top_zap != 0) {
+ ASSERT(vd == vd->vdev_top);
+
+ mutex_enter(&vd->vdev_rebuild_lock);
+ if (vd->vdev_rebuild_thread != NULL) {
+ vd->vdev_rebuild_exit_wanted = B_TRUE;
+ while (vd->vdev_rebuilding) {
+ cv_wait(&vd->vdev_rebuild_cv,
+ &vd->vdev_rebuild_lock);
+ }
+ vd->vdev_rebuild_exit_wanted = B_FALSE;
+ }
+ mutex_exit(&vd->vdev_rebuild_lock);
+ }
+}
+
+/*
+ * Stop all rebuild operations but leave them in the active state so they
+ * will be resumed when importing the pool.
+ */
+void
+vdev_rebuild_stop_all(spa_t *spa)
+{
+ vdev_rebuild_stop_wait(spa->spa_root_vdev);
+}
+
+/*
+ * Rebuild statistics reported per top-level vdev.
+ */
+int
+vdev_rebuild_get_stats(vdev_t *tvd, vdev_rebuild_stat_t *vrs)
+{
+ spa_t *spa = tvd->vdev_spa;
+
+ if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REBUILD))
+ return (SET_ERROR(ENOTSUP));
+
+ if (tvd != tvd->vdev_top || tvd->vdev_top_zap == 0)
+ return (SET_ERROR(EINVAL));
+
+ int error = zap_contains(spa_meta_objset(spa),
+ tvd->vdev_top_zap, VDEV_TOP_ZAP_VDEV_REBUILD_PHYS);
+
+ if (error == ENOENT) {
+ bzero(vrs, sizeof (vdev_rebuild_stat_t));
+ vrs->vrs_state = VDEV_REBUILD_NONE;
+ error = 0;
+ } else if (error == 0) {
+ vdev_rebuild_t *vr = &tvd->vdev_rebuild_config;
+ vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
+
+ mutex_enter(&tvd->vdev_rebuild_lock);
+ vrs->vrs_state = vrp->vrp_rebuild_state;
+ vrs->vrs_start_time = vrp->vrp_start_time;
+ vrs->vrs_end_time = vrp->vrp_end_time;
+ vrs->vrs_scan_time_ms = vrp->vrp_scan_time_ms;
+ vrs->vrs_bytes_scanned = vrp->vrp_bytes_scanned;
+ vrs->vrs_bytes_issued = vrp->vrp_bytes_issued;
+ vrs->vrs_bytes_rebuilt = vrp->vrp_bytes_rebuilt;
+ vrs->vrs_bytes_est = vrp->vrp_bytes_est;
+ vrs->vrs_errors = vrp->vrp_errors;
+ vrs->vrs_pass_time_ms = NSEC2MSEC(gethrtime() -
+ vr->vr_pass_start_time);
+ vrs->vrs_pass_bytes_scanned = vr->vr_pass_bytes_scanned;
+ vrs->vrs_pass_bytes_issued = vr->vr_pass_bytes_issued;
+ mutex_exit(&tvd->vdev_rebuild_lock);
+ }
+
+ return (error);
+}
+
+/* BEGIN CSTYLED */
+ZFS_MODULE_PARAM(zfs, zfs_, rebuild_max_segment, ULONG, ZMOD_RW,
+ "Max segment size in bytes of rebuild reads");
+
+ZFS_MODULE_PARAM(zfs, zfs_, rebuild_vdev_limit, ULONG, ZMOD_RW,
+ "Max bytes in flight per leaf vdev for sequential resilvers");
+
+ZFS_MODULE_PARAM(zfs, zfs_, rebuild_scrub_enabled, INT, ZMOD_RW,
+ "Automatically scrub after sequential resilver completes");
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/zfs/vdev_removal.c b/sys/contrib/openzfs/module/zfs/vdev_removal.c
new file mode 100644
index 000000000000..a758fe4fb343
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/vdev_removal.c
@@ -0,0 +1,2390 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2020 by Delphix. All rights reserved.
+ * Copyright (c) 2019, loli10K <ezomori.nozomu@gmail.com>. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa_impl.h>
+#include <sys/dmu.h>
+#include <sys/dmu_tx.h>
+#include <sys/zap.h>
+#include <sys/vdev_impl.h>
+#include <sys/metaslab.h>
+#include <sys/metaslab_impl.h>
+#include <sys/uberblock_impl.h>
+#include <sys/txg.h>
+#include <sys/avl.h>
+#include <sys/bpobj.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_synctask.h>
+#include <sys/dsl_dir.h>
+#include <sys/arc.h>
+#include <sys/zfeature.h>
+#include <sys/vdev_indirect_births.h>
+#include <sys/vdev_indirect_mapping.h>
+#include <sys/abd.h>
+#include <sys/vdev_initialize.h>
+#include <sys/vdev_trim.h>
+#include <sys/trace_zfs.h>
+
+/*
+ * This file contains the necessary logic to remove vdevs from a
+ * storage pool. Currently, the only devices that can be removed
+ * are log, cache, and spare devices; and top level vdevs from a pool
+ * w/o raidz or mirrors. (Note that members of a mirror can be removed
+ * by the detach operation.)
+ *
+ * Log vdevs are removed by evacuating them and then turning the vdev
+ * into a hole vdev while holding spa config locks.
+ *
+ * Top level vdevs are removed and converted into an indirect vdev via
+ * a multi-step process:
+ *
+ * - Disable allocations from this device (spa_vdev_remove_top).
+ *
+ * - From a new thread (spa_vdev_remove_thread), copy data from
+ * the removing vdev to a different vdev. The copy happens in open
+ * context (spa_vdev_copy_impl) and issues a sync task
+ * (vdev_mapping_sync) so the sync thread can update the partial
+ * indirect mappings in core and on disk.
+ *
+ * - If a free happens during a removal, it is freed from the
+ * removing vdev, and if it has already been copied, from the new
+ * location as well (free_from_removing_vdev).
+ *
+ * - After the removal is completed, the copy thread converts the vdev
+ * into an indirect vdev (vdev_remove_complete) before instructing
+ * the sync thread to destroy the space maps and finish the removal
+ * (spa_finish_removal).
+ */
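+
+/*
+ * Simplified model (not part of the change itself): conceptually the
+ * indirect mapping is an ordered list of entries translating offsets on
+ * the removed vdev to a location on another vdev. The real structures
+ * live in vdev_indirect_mapping.h; a toy version of the lookup:
+ *
+ *   typedef struct {
+ *           uint64_t old_offset;    // offset on the removed vdev
+ *           uint64_t size;
+ *           uint64_t new_vdev;      // destination top-level vdev id
+ *           uint64_t new_offset;    // offset on the destination vdev
+ *   } mapping_entry_t;
+ *
+ *   // Entries are sorted by old_offset; find the one covering 'off'.
+ *   static const mapping_entry_t *
+ *   mapping_lookup(const mapping_entry_t *m, int n, uint64_t off)
+ *   {
+ *           for (int i = 0; i < n; i++) {
+ *                   if (off >= m[i].old_offset &&
+ *                       off < m[i].old_offset + m[i].size)
+ *                           return (&m[i]);
+ *           }
+ *           return (NULL);
+ *   }
+ */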
+
+typedef struct vdev_copy_arg {
+ metaslab_t *vca_msp;
+ uint64_t vca_outstanding_bytes;
+ uint64_t vca_read_error_bytes;
+ uint64_t vca_write_error_bytes;
+ kcondvar_t vca_cv;
+ kmutex_t vca_lock;
+} vdev_copy_arg_t;
+
+/*
+ * The maximum amount of memory we can use for outstanding i/o while
+ * doing a device removal. This determines how much i/o we can have
+ * in flight concurrently.
+ */
+int zfs_remove_max_copy_bytes = 64 * 1024 * 1024;
+
+/*
+ * The largest contiguous segment that we will attempt to allocate when
+ * removing a device. This can be no larger than SPA_MAXBLOCKSIZE. If
+ * there is a performance problem with attempting to allocate large blocks,
+ * consider decreasing this.
+ *
+ * See also the accessor function spa_remove_max_segment().
+ */
+int zfs_remove_max_segment = SPA_MAXBLOCKSIZE;
+
+/*
+ * Ignore hard IO errors during device removal. When set, if a device
+ * encounters a hard IO error during the removal process, the removal will
+ * not be cancelled. This can result in a normally recoverable block
+ * becoming permanently damaged and is not recommended.
+ */
+int zfs_removal_ignore_errors = 0;
+
+/*
+ * Allow a remap segment to span free chunks of at most this size. The main
+ * impact of a larger span is that we will read and write larger, more
+ * contiguous chunks, with more "unnecessary" data -- trading off bandwidth
+ * for iops. The value here was chosen to align with
+ * zfs_vdev_read_gap_limit, which is a similar concept when doing regular
+ * reads (but there's no reason it has to be the same).
+ *
+ * Additionally, a higher span will have the following relatively minor
+ * effects:
+ * - the mapping will be smaller, since one entry can cover more allocated
+ * segments
+ * - more of the fragmentation in the removing device will be preserved
+ * - we'll do larger allocations, which may fail and fall back on smaller
+ * allocations
+ */
+int vdev_removal_max_span = 32 * 1024;
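+
+/*
+ * Illustrative example (not part of the change itself): two allocated
+ * segments are copied as one contiguous chunk only when the free gap
+ * between them fits within this span, e.g.
+ *
+ *   boolean_t merge = (next_start - prev_end) <= vdev_removal_max_span;
+ *
+ *   // segments [0, 128K) and [144K, 256K): 16K gap -> one 256K copy,
+ *   //   carrying 16K of "unnecessary" data.
+ *   // segments [0, 128K) and [192K, 256K): 64K gap -> two separate copies.
+ */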
+
+/*
+ * This is used by the test suite so that it can ensure that certain
+ * actions happen while in the middle of a removal.
+ */
+int zfs_removal_suspend_progress = 0;
+
+#define VDEV_REMOVAL_ZAP_OBJS "lzap"
+
+static void spa_vdev_remove_thread(void *arg);
+static int spa_vdev_remove_cancel_impl(spa_t *spa);
+
+static void
+spa_sync_removing_state(spa_t *spa, dmu_tx_t *tx)
+{
+ VERIFY0(zap_update(spa->spa_dsl_pool->dp_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_REMOVING, sizeof (uint64_t),
+ sizeof (spa->spa_removing_phys) / sizeof (uint64_t),
+ &spa->spa_removing_phys, tx));
+}
+
+static nvlist_t *
+spa_nvlist_lookup_by_guid(nvlist_t **nvpp, int count, uint64_t target_guid)
+{
+ for (int i = 0; i < count; i++) {
+ uint64_t guid =
+ fnvlist_lookup_uint64(nvpp[i], ZPOOL_CONFIG_GUID);
+
+ if (guid == target_guid)
+ return (nvpp[i]);
+ }
+
+ return (NULL);
+}
+
+static void
+spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count,
+ nvlist_t *dev_to_remove)
+{
+ nvlist_t **newdev = NULL;
+
+ if (count > 1)
+ newdev = kmem_alloc((count - 1) * sizeof (void *), KM_SLEEP);
+
+ for (int i = 0, j = 0; i < count; i++) {
+ if (dev[i] == dev_to_remove)
+ continue;
+ VERIFY(nvlist_dup(dev[i], &newdev[j++], KM_SLEEP) == 0);
+ }
+
+ VERIFY(nvlist_remove(config, name, DATA_TYPE_NVLIST_ARRAY) == 0);
+ VERIFY(nvlist_add_nvlist_array(config, name, newdev, count - 1) == 0);
+
+ for (int i = 0; i < count - 1; i++)
+ nvlist_free(newdev[i]);
+
+ if (count > 1)
+ kmem_free(newdev, (count - 1) * sizeof (void *));
+}
+
+static spa_vdev_removal_t *
+spa_vdev_removal_create(vdev_t *vd)
+{
+ spa_vdev_removal_t *svr = kmem_zalloc(sizeof (*svr), KM_SLEEP);
+ mutex_init(&svr->svr_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&svr->svr_cv, NULL, CV_DEFAULT, NULL);
+ svr->svr_allocd_segs = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0);
+ svr->svr_vdev_id = vd->vdev_id;
+
+ for (int i = 0; i < TXG_SIZE; i++) {
+ svr->svr_frees[i] = range_tree_create(NULL, RANGE_SEG64, NULL,
+ 0, 0);
+ list_create(&svr->svr_new_segments[i],
+ sizeof (vdev_indirect_mapping_entry_t),
+ offsetof(vdev_indirect_mapping_entry_t, vime_node));
+ }
+
+ return (svr);
+}
+
+void
+spa_vdev_removal_destroy(spa_vdev_removal_t *svr)
+{
+ for (int i = 0; i < TXG_SIZE; i++) {
+ ASSERT0(svr->svr_bytes_done[i]);
+ ASSERT0(svr->svr_max_offset_to_sync[i]);
+ range_tree_destroy(svr->svr_frees[i]);
+ list_destroy(&svr->svr_new_segments[i]);
+ }
+
+ range_tree_destroy(svr->svr_allocd_segs);
+ mutex_destroy(&svr->svr_lock);
+ cv_destroy(&svr->svr_cv);
+ kmem_free(svr, sizeof (*svr));
+}
+
+/*
+ * This is called as a synctask in the txg in which we will mark this vdev
+ * as removing (in the config stored in the MOS).
+ *
+ * It begins the evacuation of a toplevel vdev by:
+ * - initializing the spa_removing_phys which tracks this removal
+ * - computing the amount of space to remove for accounting purposes
+ * - dirtying all dbufs in the spa_config_object
+ * - creating the spa_vdev_removal
+ * - starting the spa_vdev_remove_thread
+ */
+static void
+vdev_remove_initiate_sync(void *arg, dmu_tx_t *tx)
+{
+ int vdev_id = (uintptr_t)arg;
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+ vdev_t *vd = vdev_lookup_top(spa, vdev_id);
+ vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
+ objset_t *mos = spa->spa_dsl_pool->dp_meta_objset;
+ spa_vdev_removal_t *svr = NULL;
+ uint64_t txg __maybe_unused = dmu_tx_get_txg(tx);
+
+ ASSERT0(vdev_get_nparity(vd));
+ svr = spa_vdev_removal_create(vd);
+
+ ASSERT(vd->vdev_removing);
+ ASSERT3P(vd->vdev_indirect_mapping, ==, NULL);
+
+ spa_feature_incr(spa, SPA_FEATURE_DEVICE_REMOVAL, tx);
+ if (spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)) {
+ /*
+ * By activating the OBSOLETE_COUNTS feature, we prevent
+ * the pool from being downgraded and ensure that the
+ * refcounts are precise.
+ */
+ spa_feature_incr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
+ uint64_t one = 1;
+ VERIFY0(zap_add(spa->spa_meta_objset, vd->vdev_top_zap,
+ VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE, sizeof (one), 1,
+ &one, tx));
+ boolean_t are_precise __maybe_unused;
+ ASSERT0(vdev_obsolete_counts_are_precise(vd, &are_precise));
+ ASSERT3B(are_precise, ==, B_TRUE);
+ }
+
+ vic->vic_mapping_object = vdev_indirect_mapping_alloc(mos, tx);
+ vd->vdev_indirect_mapping =
+ vdev_indirect_mapping_open(mos, vic->vic_mapping_object);
+ vic->vic_births_object = vdev_indirect_births_alloc(mos, tx);
+ vd->vdev_indirect_births =
+ vdev_indirect_births_open(mos, vic->vic_births_object);
+ spa->spa_removing_phys.sr_removing_vdev = vd->vdev_id;
+ spa->spa_removing_phys.sr_start_time = gethrestime_sec();
+ spa->spa_removing_phys.sr_end_time = 0;
+ spa->spa_removing_phys.sr_state = DSS_SCANNING;
+ spa->spa_removing_phys.sr_to_copy = 0;
+ spa->spa_removing_phys.sr_copied = 0;
+
+ /*
+ * Note: We can't use vdev_stat's vs_alloc for sr_to_copy, because
+ * there may be space in the defer tree, which is free, but still
+ * counted in vs_alloc.
+ */
+ for (uint64_t i = 0; i < vd->vdev_ms_count; i++) {
+ metaslab_t *ms = vd->vdev_ms[i];
+ if (ms->ms_sm == NULL)
+ continue;
+
+ spa->spa_removing_phys.sr_to_copy +=
+ metaslab_allocated_space(ms);
+
+ /*
+ * Space which we are freeing this txg does not need to
+ * be copied.
+ */
+ spa->spa_removing_phys.sr_to_copy -=
+ range_tree_space(ms->ms_freeing);
+
+ ASSERT0(range_tree_space(ms->ms_freed));
+ for (int t = 0; t < TXG_SIZE; t++)
+ ASSERT0(range_tree_space(ms->ms_allocating[t]));
+ }
+
+ /*
+ * Sync tasks are called before metaslab_sync(), so there should
+ * be no already-synced metaslabs in the TXG_CLEAN list.
+ */
+ ASSERT3P(txg_list_head(&vd->vdev_ms_list, TXG_CLEAN(txg)), ==, NULL);
+
+ spa_sync_removing_state(spa, tx);
+
+ /*
+ * All blocks that we need to read the most recent mapping must be
+ * stored on concrete vdevs. Therefore, we must dirty anything that
+ * is read before spa_remove_init(). Specifically, the
+ * spa_config_object. (Note that although we already modified the
+ * spa_config_object in spa_sync_removing_state, that may not have
+ * modified all blocks of the object.)
+ */
+ dmu_object_info_t doi;
+ VERIFY0(dmu_object_info(mos, DMU_POOL_DIRECTORY_OBJECT, &doi));
+ for (uint64_t offset = 0; offset < doi.doi_max_offset; ) {
+ dmu_buf_t *dbuf;
+ VERIFY0(dmu_buf_hold(mos, DMU_POOL_DIRECTORY_OBJECT,
+ offset, FTAG, &dbuf, 0));
+ dmu_buf_will_dirty(dbuf, tx);
+ offset += dbuf->db_size;
+ dmu_buf_rele(dbuf, FTAG);
+ }
+
+ /*
+ * Now that we've allocated the im_object, dirty the vdev to ensure
+ * that the object gets written to the config on disk.
+ */
+ vdev_config_dirty(vd);
+
+ zfs_dbgmsg("starting removal thread for vdev %llu (%px) in txg %llu "
+ "im_obj=%llu", vd->vdev_id, vd, dmu_tx_get_txg(tx),
+ vic->vic_mapping_object);
+
+ spa_history_log_internal(spa, "vdev remove started", tx,
+ "%s vdev %llu %s", spa_name(spa), (u_longlong_t)vd->vdev_id,
+ (vd->vdev_path != NULL) ? vd->vdev_path : "-");
+ /*
+ * Setting spa_vdev_removal causes subsequent frees to call
+ * free_from_removing_vdev(). Note that we don't need any locking
+ * because we are the sync thread, and metaslab_free_impl() is only
+ * called from syncing context (potentially from a zio taskq thread,
+ * but in any case only when there are outstanding free i/os, which
+ * there are not).
+ */
+ ASSERT3P(spa->spa_vdev_removal, ==, NULL);
+ spa->spa_vdev_removal = svr;
+ svr->svr_thread = thread_create(NULL, 0,
+ spa_vdev_remove_thread, spa, 0, &p0, TS_RUN, minclsyspri);
+}
+
+/*
+ * When we are opening a pool, we must read the mapping for each
+ * indirect vdev in order from most recently removed to least
+ * recently removed. We do this because the blocks for the mapping
+ * of older indirect vdevs may be stored on more recently removed vdevs.
+ * In order to read each indirect mapping object, we must have
+ * initialized all more recently removed vdevs.
+ */
+int
+spa_remove_init(spa_t *spa)
+{
+ int error;
+
+ error = zap_lookup(spa->spa_dsl_pool->dp_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_REMOVING, sizeof (uint64_t),
+ sizeof (spa->spa_removing_phys) / sizeof (uint64_t),
+ &spa->spa_removing_phys);
+
+ if (error == ENOENT) {
+ spa->spa_removing_phys.sr_state = DSS_NONE;
+ spa->spa_removing_phys.sr_removing_vdev = -1;
+ spa->spa_removing_phys.sr_prev_indirect_vdev = -1;
+ spa->spa_indirect_vdevs_loaded = B_TRUE;
+ return (0);
+ } else if (error != 0) {
+ return (error);
+ }
+
+ if (spa->spa_removing_phys.sr_state == DSS_SCANNING) {
+ /*
+ * We are currently removing a vdev. Create and
+ * initialize a spa_vdev_removal_t from the bonus
+ * buffer of the removing vdevs vdev_im_object, and
+ * initialize its partial mapping.
+ */
+ spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
+ vdev_t *vd = vdev_lookup_top(spa,
+ spa->spa_removing_phys.sr_removing_vdev);
+
+ if (vd == NULL) {
+ spa_config_exit(spa, SCL_STATE, FTAG);
+ return (EINVAL);
+ }
+
+ vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
+
+ ASSERT(vdev_is_concrete(vd));
+ spa_vdev_removal_t *svr = spa_vdev_removal_create(vd);
+ ASSERT3U(svr->svr_vdev_id, ==, vd->vdev_id);
+ ASSERT(vd->vdev_removing);
+
+ vd->vdev_indirect_mapping = vdev_indirect_mapping_open(
+ spa->spa_meta_objset, vic->vic_mapping_object);
+ vd->vdev_indirect_births = vdev_indirect_births_open(
+ spa->spa_meta_objset, vic->vic_births_object);
+ spa_config_exit(spa, SCL_STATE, FTAG);
+
+ spa->spa_vdev_removal = svr;
+ }
+
+ spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
+ uint64_t indirect_vdev_id =
+ spa->spa_removing_phys.sr_prev_indirect_vdev;
+ while (indirect_vdev_id != UINT64_MAX) {
+ vdev_t *vd = vdev_lookup_top(spa, indirect_vdev_id);
+ vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
+
+ ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
+ vd->vdev_indirect_mapping = vdev_indirect_mapping_open(
+ spa->spa_meta_objset, vic->vic_mapping_object);
+ vd->vdev_indirect_births = vdev_indirect_births_open(
+ spa->spa_meta_objset, vic->vic_births_object);
+
+ indirect_vdev_id = vic->vic_prev_indirect_vdev;
+ }
+ spa_config_exit(spa, SCL_STATE, FTAG);
+
+ /*
+ * Now that we've loaded all the indirect mappings, we can allow
+ * reads from other blocks (e.g. via predictive prefetch).
+ */
+ spa->spa_indirect_vdevs_loaded = B_TRUE;
+ return (0);
+}
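+
+/*
+ * Worked example (not part of the change itself): if top-level vdevs 3, 5
+ * and then 7 were removed, in that order, the on-disk state forms a chain
+ * that the loop above follows from most recently removed to least:
+ *
+ *   spa_removing_phys.sr_prev_indirect_vdev == 7
+ *   vdev 7: vic_prev_indirect_vdev == 5
+ *   vdev 5: vic_prev_indirect_vdev == 3
+ *   vdev 3: vic_prev_indirect_vdev == UINT64_MAX   // end of chain
+ *
+ * so vdev 7's mapping is opened first; the mapping blocks of vdevs 3 and
+ * 5 may have been written to vdev 7 and later remapped when it was
+ * removed, making them unreadable until vdev 7's mapping is loaded.
+ */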
+
+void
+spa_restart_removal(spa_t *spa)
+{
+ spa_vdev_removal_t *svr = spa->spa_vdev_removal;
+
+ if (svr == NULL)
+ return;
+
+ /*
+ * In general when this function is called there is no
+ * removal thread running. The only scenario where this
+ * is not true is during spa_import() where this function
+ * is called twice [once from spa_import_impl() and
+ * spa_async_resume()]. Thus, in the scenario where we
+ * import a pool that has an ongoing removal we don't
+ * want to spawn a second thread.
+ */
+ if (svr->svr_thread != NULL)
+ return;
+
+ if (!spa_writeable(spa))
+ return;
+
+ zfs_dbgmsg("restarting removal of %llu", svr->svr_vdev_id);
+ svr->svr_thread = thread_create(NULL, 0, spa_vdev_remove_thread, spa,
+ 0, &p0, TS_RUN, minclsyspri);
+}
+
+/*
+ * Process freeing from a device which is in the middle of being removed.
+ * We must handle this carefully so that we attempt to copy freed data,
+ * and we correctly free already-copied data.
+ */
+void
+free_from_removing_vdev(vdev_t *vd, uint64_t offset, uint64_t size)
+{
+ spa_t *spa = vd->vdev_spa;
+ spa_vdev_removal_t *svr = spa->spa_vdev_removal;
+ vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
+ uint64_t txg = spa_syncing_txg(spa);
+ uint64_t max_offset_yet = 0;
+
+ ASSERT(vd->vdev_indirect_config.vic_mapping_object != 0);
+ ASSERT3U(vd->vdev_indirect_config.vic_mapping_object, ==,
+ vdev_indirect_mapping_object(vim));
+ ASSERT3U(vd->vdev_id, ==, svr->svr_vdev_id);
+
+ mutex_enter(&svr->svr_lock);
+
+ /*
+ * Remove the segment from the removing vdev's spacemap. This
+ * ensures that we will not attempt to copy this space (if the
+ * removal thread has not yet visited it), and also ensures
+ * that we know what is actually allocated on the new vdevs
+ * (needed if we cancel the removal).
+ *
+ * Note: we must do the metaslab_free_concrete() with the svr_lock
+ * held, so that the remove_thread can not load this metaslab and then
+ * visit this offset between the time that we metaslab_free_concrete()
+ * and when we check to see if it has been visited.
+ *
+ * Note: The checkpoint flag is set to false as having/taking
+ * a checkpoint and removing a device can't happen at the same
+ * time.
+ */
+ ASSERT(!spa_has_checkpoint(spa));
+ metaslab_free_concrete(vd, offset, size, B_FALSE);
+
+ uint64_t synced_size = 0;
+ uint64_t synced_offset = 0;
+ uint64_t max_offset_synced = vdev_indirect_mapping_max_offset(vim);
+ if (offset < max_offset_synced) {
+ /*
+ * The mapping for this offset is already on disk.
+ * Free from the new location.
+ *
+ * Note that we use svr_max_synced_offset because it is
+ * updated atomically with respect to the in-core mapping.
+ * By contrast, vim_max_offset is not.
+ *
+ * This block may be split between a synced entry and an
+ * in-flight or unvisited entry. Only process the synced
+ * portion of it here.
+ */
+ synced_size = MIN(size, max_offset_synced - offset);
+ synced_offset = offset;
+
+ ASSERT3U(max_offset_yet, <=, max_offset_synced);
+ max_offset_yet = max_offset_synced;
+
+ DTRACE_PROBE3(remove__free__synced,
+ spa_t *, spa,
+ uint64_t, offset,
+ uint64_t, synced_size);
+
+ size -= synced_size;
+ offset += synced_size;
+ }
+
+ /*
+ * Look at all in-flight txgs starting from the currently syncing one
+ * and see if a section of this free is being copied. By starting from
+ * this txg and iterating forward, we might find that this region
+ * was copied in two different txgs and handle it appropriately.
+ */
+ for (int i = 0; i < TXG_CONCURRENT_STATES; i++) {
+ int txgoff = (txg + i) & TXG_MASK;
+ if (size > 0 && offset < svr->svr_max_offset_to_sync[txgoff]) {
+ /*
+ * The mapping for this offset is in flight, and
+ * will be synced in txg+i.
+ */
+ uint64_t inflight_size = MIN(size,
+ svr->svr_max_offset_to_sync[txgoff] - offset);
+
+ DTRACE_PROBE4(remove__free__inflight,
+ spa_t *, spa,
+ uint64_t, offset,
+ uint64_t, inflight_size,
+ uint64_t, txg + i);
+
+ /*
+ * We copy data in order of increasing offset.
+ * Therefore the max_offset_to_sync[] must increase
+ * (or be zero, indicating that nothing is being
+ * copied in that txg).
+ */
+ if (svr->svr_max_offset_to_sync[txgoff] != 0) {
+ ASSERT3U(svr->svr_max_offset_to_sync[txgoff],
+ >=, max_offset_yet);
+ max_offset_yet =
+ svr->svr_max_offset_to_sync[txgoff];
+ }
+
+ /*
+ * We've already committed to copying this segment:
+ * we have allocated space elsewhere in the pool for
+ * it and have an IO outstanding to copy the data. We
+ * cannot free the space before the copy has
+ * completed, or else the copy IO might overwrite any
+ * new data. To free that space, we record the
+ * segment in the appropriate svr_frees tree and free
+ * the mapped space later, in the txg where we have
+ * completed the copy and synced the mapping (see
+ * vdev_mapping_sync).
+ */
+ range_tree_add(svr->svr_frees[txgoff],
+ offset, inflight_size);
+ size -= inflight_size;
+ offset += inflight_size;
+
+ /*
+ * This space is already accounted for as being
+ * done, because it is being copied in txg+i.
+ * However, if i!=0, then it is being copied in
+ * a future txg. If we crash after this txg
+ * syncs but before txg+i syncs, then the space
+ * will be free. Therefore we must account
+ * for the space being done in *this* txg
+ * (when it is freed) rather than the future txg
+ * (when it will be copied).
+ */
+ ASSERT3U(svr->svr_bytes_done[txgoff], >=,
+ inflight_size);
+ svr->svr_bytes_done[txgoff] -= inflight_size;
+ svr->svr_bytes_done[txg & TXG_MASK] += inflight_size;
+ }
+ }
+ ASSERT0(svr->svr_max_offset_to_sync[TXG_CLEAN(txg) & TXG_MASK]);
+
+ if (size > 0) {
+ /*
+ * The copy thread has not yet visited this offset. Ensure
+ * that it doesn't.
+ */
+
+ DTRACE_PROBE3(remove__free__unvisited,
+ spa_t *, spa,
+ uint64_t, offset,
+ uint64_t, size);
+
+ if (svr->svr_allocd_segs != NULL)
+ range_tree_clear(svr->svr_allocd_segs, offset, size);
+
+ /*
+ * Since we now do not need to copy this data, for
+ * accounting purposes we have done our job and can count
+ * it as completed.
+ */
+ svr->svr_bytes_done[txg & TXG_MASK] += size;
+ }
+ mutex_exit(&svr->svr_lock);
+
+ /*
+ * Now that we have dropped svr_lock, process the synced portion
+ * of this free.
+ */
+ if (synced_size > 0) {
+ vdev_indirect_mark_obsolete(vd, synced_offset, synced_size);
+
+ /*
+ * Note: this can only be called from syncing context,
+ * and the vdev_indirect_mapping is only changed from the
+ * sync thread, so we don't need svr_lock while doing
+ * metaslab_free_impl_cb.
+ */
+ boolean_t checkpoint = B_FALSE;
+ vdev_indirect_ops.vdev_op_remap(vd, synced_offset, synced_size,
+ metaslab_free_impl_cb, &checkpoint);
+ }
+}
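+
+/*
+ * Worked example (not part of the change itself): a single free can span
+ * all three states handled above. Suppose the mapping is synced up to
+ * offset 1024M, the syncing txg will extend it to 1280M, and a 512M free
+ * arrives at offset 768M:
+ *
+ *   uint64_t off = 768, size = 512;                           // MiB
+ *   uint64_t synced   = MIN(size, 1024 - off);                // 256M
+ *   uint64_t inflight = MIN(size - synced,
+ *       1280 - (off + synced));                               // 256M
+ *   uint64_t unvisited = size - synced - inflight;            // 0
+ *
+ *   // [768M, 1024M): already remapped on disk -> freed from the new
+ *   //   location via vdev_indirect_mark_obsolete().
+ *   // [1024M, 1280M): being copied in the syncing txg -> recorded in
+ *   //   svr_frees[] and freed once the mapping entry syncs.
+ *   // An unvisited tail would simply be cleared from svr_allocd_segs
+ *   //   and counted as done.
+ */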
+
+/*
+ * Stop an active removal and update the spa_removing phys.
+ */
+static void
+spa_finish_removal(spa_t *spa, dsl_scan_state_t state, dmu_tx_t *tx)
+{
+ spa_vdev_removal_t *svr = spa->spa_vdev_removal;
+ ASSERT3U(dmu_tx_get_txg(tx), ==, spa_syncing_txg(spa));
+
+ /* Ensure the removal thread has completed before we free the svr. */
+ spa_vdev_remove_suspend(spa);
+
+ ASSERT(state == DSS_FINISHED || state == DSS_CANCELED);
+
+ if (state == DSS_FINISHED) {
+ spa_removing_phys_t *srp = &spa->spa_removing_phys;
+ vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id);
+ vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
+
+ if (srp->sr_prev_indirect_vdev != -1) {
+ vdev_t *pvd;
+ pvd = vdev_lookup_top(spa,
+ srp->sr_prev_indirect_vdev);
+ ASSERT3P(pvd->vdev_ops, ==, &vdev_indirect_ops);
+ }
+
+ vic->vic_prev_indirect_vdev = srp->sr_prev_indirect_vdev;
+ srp->sr_prev_indirect_vdev = vd->vdev_id;
+ }
+ spa->spa_removing_phys.sr_state = state;
+ spa->spa_removing_phys.sr_end_time = gethrestime_sec();
+
+ spa->spa_vdev_removal = NULL;
+ spa_vdev_removal_destroy(svr);
+
+ spa_sync_removing_state(spa, tx);
+ spa_notify_waiters(spa);
+
+ vdev_config_dirty(spa->spa_root_vdev);
+}
+
+static void
+free_mapped_segment_cb(void *arg, uint64_t offset, uint64_t size)
+{
+ vdev_t *vd = arg;
+ vdev_indirect_mark_obsolete(vd, offset, size);
+ boolean_t checkpoint = B_FALSE;
+ vdev_indirect_ops.vdev_op_remap(vd, offset, size,
+ metaslab_free_impl_cb, &checkpoint);
+}
+
+/*
+ * On behalf of the removal thread, syncs an incremental bit more of
+ * the indirect mapping to disk and updates the in-memory mapping.
+ * Called as a sync task in every txg that the removal thread makes progress.
+ */
+static void
+vdev_mapping_sync(void *arg, dmu_tx_t *tx)
+{
+ spa_vdev_removal_t *svr = arg;
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+ vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id);
+ vdev_indirect_config_t *vic __maybe_unused = &vd->vdev_indirect_config;
+ uint64_t txg = dmu_tx_get_txg(tx);
+ vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
+
+ ASSERT(vic->vic_mapping_object != 0);
+ ASSERT3U(txg, ==, spa_syncing_txg(spa));
+
+ vdev_indirect_mapping_add_entries(vim,
+ &svr->svr_new_segments[txg & TXG_MASK], tx);
+ vdev_indirect_births_add_entry(vd->vdev_indirect_births,
+ vdev_indirect_mapping_max_offset(vim), dmu_tx_get_txg(tx), tx);
+
+ /*
+ * Free the copied data for anything that was freed while the
+ * mapping entries were in flight.
+ */
+ mutex_enter(&svr->svr_lock);
+ range_tree_vacate(svr->svr_frees[txg & TXG_MASK],
+ free_mapped_segment_cb, vd);
+ ASSERT3U(svr->svr_max_offset_to_sync[txg & TXG_MASK], >=,
+ vdev_indirect_mapping_max_offset(vim));
+ svr->svr_max_offset_to_sync[txg & TXG_MASK] = 0;
+ mutex_exit(&svr->svr_lock);
+
+ spa_sync_removing_state(spa, tx);
+}
+
+typedef struct vdev_copy_segment_arg {
+ spa_t *vcsa_spa;
+ dva_t *vcsa_dest_dva;
+ uint64_t vcsa_txg;
+ range_tree_t *vcsa_obsolete_segs;
+} vdev_copy_segment_arg_t;
+
+static void
+unalloc_seg(void *arg, uint64_t start, uint64_t size)
+{
+ vdev_copy_segment_arg_t *vcsa = arg;
+ spa_t *spa = vcsa->vcsa_spa;
+ blkptr_t bp = { { { {0} } } };
+
+ BP_SET_BIRTH(&bp, TXG_INITIAL, TXG_INITIAL);
+ BP_SET_LSIZE(&bp, size);
+ BP_SET_PSIZE(&bp, size);
+ BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF);
+ BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_OFF);
+ BP_SET_TYPE(&bp, DMU_OT_NONE);
+ BP_SET_LEVEL(&bp, 0);
+ BP_SET_DEDUP(&bp, 0);
+ BP_SET_BYTEORDER(&bp, ZFS_HOST_BYTEORDER);
+
+ DVA_SET_VDEV(&bp.blk_dva[0], DVA_GET_VDEV(vcsa->vcsa_dest_dva));
+ DVA_SET_OFFSET(&bp.blk_dva[0],
+ DVA_GET_OFFSET(vcsa->vcsa_dest_dva) + start);
+ DVA_SET_ASIZE(&bp.blk_dva[0], size);
+
+ zio_free(spa, vcsa->vcsa_txg, &bp);
+}
+
+/*
+ * All reads and writes associated with a call to spa_vdev_copy_segment()
+ * are done.
+ */
+static void
+spa_vdev_copy_segment_done(zio_t *zio)
+{
+ vdev_copy_segment_arg_t *vcsa = zio->io_private;
+
+ range_tree_vacate(vcsa->vcsa_obsolete_segs,
+ unalloc_seg, vcsa);
+ range_tree_destroy(vcsa->vcsa_obsolete_segs);
+ kmem_free(vcsa, sizeof (*vcsa));
+
+ spa_config_exit(zio->io_spa, SCL_STATE, zio->io_spa);
+}
+
+/*
+ * The write of the new location is done.
+ */
+static void
+spa_vdev_copy_segment_write_done(zio_t *zio)
+{
+ vdev_copy_arg_t *vca = zio->io_private;
+
+ abd_free(zio->io_abd);
+
+ mutex_enter(&vca->vca_lock);
+ vca->vca_outstanding_bytes -= zio->io_size;
+
+ if (zio->io_error != 0)
+ vca->vca_write_error_bytes += zio->io_size;
+
+ cv_signal(&vca->vca_cv);
+ mutex_exit(&vca->vca_lock);
+}
+
+/*
+ * The read of the old location is done. The parent zio is the write to
+ * the new location. Allow it to start.
+ */
+static void
+spa_vdev_copy_segment_read_done(zio_t *zio)
+{
+ vdev_copy_arg_t *vca = zio->io_private;
+
+ if (zio->io_error != 0) {
+ mutex_enter(&vca->vca_lock);
+ vca->vca_read_error_bytes += zio->io_size;
+ mutex_exit(&vca->vca_lock);
+ }
+
+ zio_nowait(zio_unique_parent(zio));
+}
+
+/*
+ * If the old and new vdevs are mirrors, we will read both sides of the old
+ * mirror, and write each copy to the corresponding side of the new mirror.
+ * If the old and new vdevs have a different number of children, we will do
+ * this as best we can. Since we aren't verifying checksums, this
+ * ensures that as long as there's a good copy of the data, we'll have a
+ * good copy after the removal, even if there's silent damage to one side
+ * of the mirror. If we're removing a mirror that has some silent damage,
+ * we'll have exactly the same damage in the new location (assuming that
+ * the new location is also a mirror).
+ *
+ * We accomplish this by creating a tree of zio_t's, with as many writes as
+ * there are "children" of the new vdev (a non-redundant vdev counts as one
+ * child, a 2-way mirror has 2 children, etc). Each write has an associated
+ * read from a child of the old vdev. Typically there will be the same
+ * number of children of the old and new vdevs. However, if there are more
+ * children of the new vdev, some child(ren) of the old vdev will be issued
+ * multiple reads. If there are more children of the old vdev, some copies
+ * will be dropped.
+ *
+ * For example, the tree of zio_t's for a 2-way mirror is:
+ *
+ * null
+ * / \
+ * write(new vdev, child 0) write(new vdev, child 1)
+ * | |
+ * read(old vdev, child 0) read(old vdev, child 1)
+ *
+ * Child zio's complete before their parents complete. However, zio's
+ * created with zio_vdev_child_io() may be issued before their children
+ * complete. In this case we need to make sure that the children (reads)
+ * complete before the parents (writes) are *issued*. We do this by not
+ * calling zio_nowait() on each write until its corresponding read has
+ * completed.
+ *
+ * The spa_config_lock must be held while zio's created by
+ * zio_vdev_child_io() are in progress, to ensure that the vdev tree does
+ * not change (e.g. due to a concurrent "zpool attach/detach"). The "null"
+ * zio is needed to release the spa_config_lock after all the reads and
+ * writes complete. (Note that we can't grab the config lock for each read,
+ * because it is not reentrant - we could deadlock with a thread waiting
+ * for a write lock.)
+ */
+static void
+spa_vdev_copy_one_child(vdev_copy_arg_t *vca, zio_t *nzio,
+ vdev_t *source_vd, uint64_t source_offset,
+ vdev_t *dest_child_vd, uint64_t dest_offset, int dest_id, uint64_t size)
+{
+ ASSERT3U(spa_config_held(nzio->io_spa, SCL_ALL, RW_READER), !=, 0);
+
+ /*
+ * If the destination child is unwritable then there is no point
+ * in issuing the source reads which cannot be written.
+ */
+ if (!vdev_writeable(dest_child_vd))
+ return;
+
+ mutex_enter(&vca->vca_lock);
+ vca->vca_outstanding_bytes += size;
+ mutex_exit(&vca->vca_lock);
+
+ abd_t *abd = abd_alloc_for_io(size, B_FALSE);
+
+ vdev_t *source_child_vd = NULL;
+ if (source_vd->vdev_ops == &vdev_mirror_ops && dest_id != -1) {
+ /*
+ * Source and dest are both mirrors. Copy from the same
+ * child id as we are copying to (wrapping around if there
+ * are more dest children than source children). If the
+ * preferred source child is unreadable select another.
+ */
+ for (int i = 0; i < source_vd->vdev_children; i++) {
+ source_child_vd = source_vd->vdev_child[
+ (dest_id + i) % source_vd->vdev_children];
+ if (vdev_readable(source_child_vd))
+ break;
+ }
+ } else {
+ source_child_vd = source_vd;
+ }
+
+ /*
+ * There should always be at least one readable source child, or
+ * the pool would be in a suspended state. If we somehow selected an
+ * unreadable child, it would result in IO errors, the removal process
+ * being cancelled, and the pool reverting to its pre-removal state.
+ */
+ ASSERT3P(source_child_vd, !=, NULL);
+
+ zio_t *write_zio = zio_vdev_child_io(nzio, NULL,
+ dest_child_vd, dest_offset, abd, size,
+ ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL,
+ ZIO_FLAG_CANFAIL,
+ spa_vdev_copy_segment_write_done, vca);
+
+ zio_nowait(zio_vdev_child_io(write_zio, NULL,
+ source_child_vd, source_offset, abd, size,
+ ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL,
+ ZIO_FLAG_CANFAIL,
+ spa_vdev_copy_segment_read_done, vca));
+}
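+
+/*
+ * Editor's note: the sketch below is illustrative only and is not part of
+ * the original change.  It isolates the read/write gating pattern used by
+ * spa_vdev_copy_one_child() above; the "example_" names are hypothetical,
+ * and the real code also frees the abd and updates byte counts in its
+ * write-done callback.
+ */
+#if 0
+static void
+example_read_done(zio_t *zio)
+{
+ /* Issue the parent write only once its read has completed. */
+ zio_nowait(zio_unique_parent(zio));
+}
+
+static void
+example_copy_one(zio_t *nzio, vdev_t *source_vd, vdev_t *dest_vd,
+ uint64_t source_offset, uint64_t dest_offset, uint64_t size)
+{
+ abd_t *abd = abd_alloc_for_io(size, B_FALSE);
+
+ /* Create the write as a child of the null zio, but do not issue it. */
+ zio_t *write_zio = zio_vdev_child_io(nzio, NULL, dest_vd, dest_offset,
+ abd, size, ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL, ZIO_FLAG_CANFAIL,
+ NULL, NULL);
+
+ /* Issue the read; example_read_done() then issues the write. */
+ zio_nowait(zio_vdev_child_io(write_zio, NULL, source_vd, source_offset,
+ abd, size, ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL, ZIO_FLAG_CANFAIL,
+ example_read_done, NULL));
+}
+#endif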
+
+/*
+ * Allocate a new location for this segment, and create the zio_t's to
+ * read from the old location and write to the new location.
+ */
+static int
+spa_vdev_copy_segment(vdev_t *vd, range_tree_t *segs,
+ uint64_t maxalloc, uint64_t txg,
+ vdev_copy_arg_t *vca, zio_alloc_list_t *zal)
+{
+ metaslab_group_t *mg = vd->vdev_mg;
+ spa_t *spa = vd->vdev_spa;
+ spa_vdev_removal_t *svr = spa->spa_vdev_removal;
+ vdev_indirect_mapping_entry_t *entry;
+ dva_t dst = {{ 0 }};
+ uint64_t start = range_tree_min(segs);
+ ASSERT0(P2PHASE(start, 1 << spa->spa_min_ashift));
+
+ ASSERT3U(maxalloc, <=, SPA_MAXBLOCKSIZE);
+ ASSERT0(P2PHASE(maxalloc, 1 << spa->spa_min_ashift));
+
+ uint64_t size = range_tree_span(segs);
+ if (range_tree_span(segs) > maxalloc) {
+ /*
+ * We can't allocate all the segments. Prefer to end
+ * the allocation at the end of a segment, thus avoiding
+ * additional split blocks.
+ */
+ range_seg_max_t search;
+ zfs_btree_index_t where;
+ rs_set_start(&search, segs, start + maxalloc);
+ rs_set_end(&search, segs, start + maxalloc);
+ (void) zfs_btree_find(&segs->rt_root, &search, &where);
+ range_seg_t *rs = zfs_btree_prev(&segs->rt_root, &where,
+ &where);
+ if (rs != NULL) {
+ size = rs_get_end(rs, segs) - start;
+ } else {
+ /*
+ * There are no segments that end before maxalloc.
+ * I.e. the first segment is larger than maxalloc,
+ * so we must split it.
+ */
+ size = maxalloc;
+ }
+ }
+ ASSERT3U(size, <=, maxalloc);
+ ASSERT0(P2PHASE(size, 1 << spa->spa_min_ashift));
+
+ /*
+ * An allocation class might not have any remaining vdevs or space.
+ */
+ metaslab_class_t *mc = mg->mg_class;
+ if (mc->mc_groups == 0)
+ mc = spa_normal_class(spa);
+ int error = metaslab_alloc_dva(spa, mc, size, &dst, 0, NULL, txg, 0,
+ zal, 0);
+ if (error == ENOSPC && mc != spa_normal_class(spa)) {
+ error = metaslab_alloc_dva(spa, spa_normal_class(spa), size,
+ &dst, 0, NULL, txg, 0, zal, 0);
+ }
+ if (error != 0)
+ return (error);
+
+ /*
+ * Determine the ranges that are not actually needed. Offsets are
+ * relative to the start of the range to be copied (i.e. relative to the
+ * local variable "start").
+ */
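+ /*
+ * Illustrative example (editor's note, hypothetical values): if the
+ * allocated range being copied covers segments [0, 96K) and
+ * [100K, 160K), the 4K hole [96K, 100K) is recorded in obsolete_segs
+ * at offset 96K (i.e. relative to "start"), and is later freed from
+ * the new location by unalloc_seg().
+ */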
+ range_tree_t *obsolete_segs = range_tree_create(NULL, RANGE_SEG64, NULL,
+ 0, 0);
+
+ zfs_btree_index_t where;
+ range_seg_t *rs = zfs_btree_first(&segs->rt_root, &where);
+ ASSERT3U(rs_get_start(rs, segs), ==, start);
+ uint64_t prev_seg_end = rs_get_end(rs, segs);
+ while ((rs = zfs_btree_next(&segs->rt_root, &where, &where)) != NULL) {
+ if (rs_get_start(rs, segs) >= start + size) {
+ break;
+ } else {
+ range_tree_add(obsolete_segs,
+ prev_seg_end - start,
+ rs_get_start(rs, segs) - prev_seg_end);
+ }
+ prev_seg_end = rs_get_end(rs, segs);
+ }
+ /* We don't end in the middle of an obsolete range */
+ ASSERT3U(start + size, <=, prev_seg_end);
+
+ range_tree_clear(segs, start, size);
+
+ /*
+ * We can't have any padding of the allocated size, otherwise we will
+ * misunderstand what's allocated, and the size of the mapping. We
+ * prevent padding by ensuring that all devices in the pool have the
+ * same ashift, and the allocation size is a multiple of the ashift.
+ */
+ VERIFY3U(DVA_GET_ASIZE(&dst), ==, size);
+
+ entry = kmem_zalloc(sizeof (vdev_indirect_mapping_entry_t), KM_SLEEP);
+ DVA_MAPPING_SET_SRC_OFFSET(&entry->vime_mapping, start);
+ entry->vime_mapping.vimep_dst = dst;
+ if (spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)) {
+ entry->vime_obsolete_count = range_tree_space(obsolete_segs);
+ }
+
+ vdev_copy_segment_arg_t *vcsa = kmem_zalloc(sizeof (*vcsa), KM_SLEEP);
+ vcsa->vcsa_dest_dva = &entry->vime_mapping.vimep_dst;
+ vcsa->vcsa_obsolete_segs = obsolete_segs;
+ vcsa->vcsa_spa = spa;
+ vcsa->vcsa_txg = txg;
+
+ /*
+ * See comment before spa_vdev_copy_one_child().
+ */
+ spa_config_enter(spa, SCL_STATE, spa, RW_READER);
+ zio_t *nzio = zio_null(spa->spa_txg_zio[txg & TXG_MASK], spa, NULL,
+ spa_vdev_copy_segment_done, vcsa, 0);
+ vdev_t *dest_vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dst));
+ if (dest_vd->vdev_ops == &vdev_mirror_ops) {
+ for (int i = 0; i < dest_vd->vdev_children; i++) {
+ vdev_t *child = dest_vd->vdev_child[i];
+ spa_vdev_copy_one_child(vca, nzio, vd, start,
+ child, DVA_GET_OFFSET(&dst), i, size);
+ }
+ } else {
+ spa_vdev_copy_one_child(vca, nzio, vd, start,
+ dest_vd, DVA_GET_OFFSET(&dst), -1, size);
+ }
+ zio_nowait(nzio);
+
+ list_insert_tail(&svr->svr_new_segments[txg & TXG_MASK], entry);
+ ASSERT3U(start + size, <=, vd->vdev_ms_count << vd->vdev_ms_shift);
+ vdev_dirty(vd, 0, NULL, txg);
+
+ return (0);
+}
+
+/*
+ * Complete the removal of a toplevel vdev. This is called as a
+ * synctask in the same txg that we will sync out the new config (to the
+ * MOS object) which indicates that this vdev is indirect.
+ */
+static void
+vdev_remove_complete_sync(void *arg, dmu_tx_t *tx)
+{
+ spa_vdev_removal_t *svr = arg;
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+ vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id);
+
+ ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
+
+ for (int i = 0; i < TXG_SIZE; i++) {
+ ASSERT0(svr->svr_bytes_done[i]);
+ }
+
+ ASSERT3U(spa->spa_removing_phys.sr_copied, ==,
+ spa->spa_removing_phys.sr_to_copy);
+
+ vdev_destroy_spacemaps(vd, tx);
+
+ /* destroy leaf zaps, if any */
+ ASSERT3P(svr->svr_zaplist, !=, NULL);
+ for (nvpair_t *pair = nvlist_next_nvpair(svr->svr_zaplist, NULL);
+ pair != NULL;
+ pair = nvlist_next_nvpair(svr->svr_zaplist, pair)) {
+ vdev_destroy_unlink_zap(vd, fnvpair_value_uint64(pair), tx);
+ }
+ fnvlist_free(svr->svr_zaplist);
+
+ spa_finish_removal(dmu_tx_pool(tx)->dp_spa, DSS_FINISHED, tx);
+ /* vd->vdev_path is not available here */
+ spa_history_log_internal(spa, "vdev remove completed", tx,
+ "%s vdev %llu", spa_name(spa), (u_longlong_t)vd->vdev_id);
+}
+
+static void
+vdev_remove_enlist_zaps(vdev_t *vd, nvlist_t *zlist)
+{
+ ASSERT3P(zlist, !=, NULL);
+ ASSERT0(vdev_get_nparity(vd));
+
+ if (vd->vdev_leaf_zap != 0) {
+ char zkey[32];
+ (void) snprintf(zkey, sizeof (zkey), "%s-%llu",
+ VDEV_REMOVAL_ZAP_OBJS, (u_longlong_t)vd->vdev_leaf_zap);
+ fnvlist_add_uint64(zlist, zkey, vd->vdev_leaf_zap);
+ }
+
+ for (uint64_t id = 0; id < vd->vdev_children; id++) {
+ vdev_remove_enlist_zaps(vd->vdev_child[id], zlist);
+ }
+}
+
+static void
+vdev_remove_replace_with_indirect(vdev_t *vd, uint64_t txg)
+{
+ vdev_t *ivd;
+ dmu_tx_t *tx;
+ spa_t *spa = vd->vdev_spa;
+ spa_vdev_removal_t *svr = spa->spa_vdev_removal;
+
+ /*
+ * First, build a list of leaf zaps to be destroyed.
+ * This is passed to the sync context thread,
+ * which does the actual unlinking.
+ */
+ svr->svr_zaplist = fnvlist_alloc();
+ vdev_remove_enlist_zaps(vd, svr->svr_zaplist);
+
+ ivd = vdev_add_parent(vd, &vdev_indirect_ops);
+ ivd->vdev_removing = 0;
+
+ vd->vdev_leaf_zap = 0;
+
+ vdev_remove_child(ivd, vd);
+ vdev_compact_children(ivd);
+
+ ASSERT(!list_link_active(&vd->vdev_state_dirty_node));
+
+ mutex_enter(&svr->svr_lock);
+ svr->svr_thread = NULL;
+ cv_broadcast(&svr->svr_cv);
+ mutex_exit(&svr->svr_lock);
+
+ /* After this, we can not use svr. */
+ tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
+ dsl_sync_task_nowait(spa->spa_dsl_pool,
+ vdev_remove_complete_sync, svr, tx);
+ dmu_tx_commit(tx);
+}
+
+/*
+ * Complete the removal of a toplevel vdev. This is called in open
+ * context by the removal thread after we have copied all vdev's data.
+ */
+static void
+vdev_remove_complete(spa_t *spa)
+{
+ uint64_t txg;
+
+ /*
+ * Wait for any deferred frees to be synced before we call
+ * vdev_metaslab_fini()
+ */
+ txg_wait_synced(spa->spa_dsl_pool, 0);
+ txg = spa_vdev_enter(spa);
+ vdev_t *vd = vdev_lookup_top(spa, spa->spa_vdev_removal->svr_vdev_id);
+ ASSERT3P(vd->vdev_initialize_thread, ==, NULL);
+ ASSERT3P(vd->vdev_trim_thread, ==, NULL);
+ ASSERT3P(vd->vdev_autotrim_thread, ==, NULL);
+
+ sysevent_t *ev = spa_event_create(spa, vd, NULL,
+ ESC_ZFS_VDEV_REMOVE_DEV);
+
+ zfs_dbgmsg("finishing device removal for vdev %llu in txg %llu",
+ vd->vdev_id, txg);
+
+ /*
+ * Discard allocation state.
+ */
+ if (vd->vdev_mg != NULL) {
+ vdev_metaslab_fini(vd);
+ metaslab_group_destroy(vd->vdev_mg);
+ vd->vdev_mg = NULL;
+ spa_log_sm_set_blocklimit(spa);
+ }
+ if (vd->vdev_log_mg != NULL) {
+ ASSERT0(vd->vdev_ms_count);
+ metaslab_group_destroy(vd->vdev_log_mg);
+ vd->vdev_log_mg = NULL;
+ }
+ ASSERT0(vd->vdev_stat.vs_space);
+ ASSERT0(vd->vdev_stat.vs_dspace);
+
+ vdev_remove_replace_with_indirect(vd, txg);
+
+ /*
+ * We now release the locks, allowing spa_sync to run and finish the
+ * removal via vdev_remove_complete_sync in syncing context.
+ *
+ * Note that we hold on to the vdev_t that has been replaced. Since
+ * it isn't part of the vdev tree any longer, it can't be concurrently
+ * manipulated, even while we don't have the config lock.
+ */
+ (void) spa_vdev_exit(spa, NULL, txg, 0);
+
+ /*
+ * Top ZAP should have been transferred to the indirect vdev in
+ * vdev_remove_replace_with_indirect.
+ */
+ ASSERT0(vd->vdev_top_zap);
+
+ /*
+ * Leaf ZAP should have been moved in vdev_remove_replace_with_indirect.
+ */
+ ASSERT0(vd->vdev_leaf_zap);
+
+ txg = spa_vdev_enter(spa);
+ (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);
+ /*
+ * Request to update the config and the config cachefile.
+ */
+ vdev_config_dirty(spa->spa_root_vdev);
+ (void) spa_vdev_exit(spa, vd, txg, 0);
+
+ if (ev != NULL)
+ spa_event_post(ev);
+}
+
+/*
+ * Evacuates a segment of size at most max_alloc from the vdev
+ * via repeated calls to spa_vdev_copy_segment. If an allocation
+ * fails, the pool is probably too fragmented to handle such a
+ * large size, so decrease max_alloc so that the caller will not try
+ * this size again this txg.
+ */
+static void
+spa_vdev_copy_impl(vdev_t *vd, spa_vdev_removal_t *svr, vdev_copy_arg_t *vca,
+ uint64_t *max_alloc, dmu_tx_t *tx)
+{
+ uint64_t txg = dmu_tx_get_txg(tx);
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+
+ mutex_enter(&svr->svr_lock);
+
+ /*
+ * Determine how big of a chunk to copy. We can allocate up
+ * to max_alloc bytes, and we can span up to vdev_removal_max_span
+ * bytes of unallocated space at a time. "segs" will track the
+ * allocated segments that we are copying. We may also be copying
+ * free segments (of up to vdev_removal_max_span bytes).
+ */
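+ /*
+ * Illustrative scenario (editor's note, values hypothetical): with
+ * allocated segments [0, 96K) and [160K, 224K), max_alloc = 1M and
+ * vdev_removal_max_span = 32K, the 64K hole between the segments
+ * exceeds vdev_removal_max_span, so only [0, 96K) is copied now and
+ * the second segment is left for a later chunk.
+ */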
+ range_tree_t *segs = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0);
+ for (;;) {
+ range_tree_t *rt = svr->svr_allocd_segs;
+ range_seg_t *rs = range_tree_first(rt);
+
+ if (rs == NULL)
+ break;
+
+ uint64_t seg_length;
+
+ if (range_tree_is_empty(segs)) {
+ /* need to truncate the first seg based on max_alloc */
+ seg_length = MIN(rs_get_end(rs, rt) - rs_get_start(rs,
+ rt), *max_alloc);
+ } else {
+ if (rs_get_start(rs, rt) - range_tree_max(segs) >
+ vdev_removal_max_span) {
+ /*
+ * Including this segment would cause us to
+ * copy more unneeded free space than is allowed.
+ */
+ break;
+ } else if (rs_get_end(rs, rt) - range_tree_min(segs) >
+ *max_alloc) {
+ /*
+ * This additional segment would extend past
+ * max_alloc. Rather than splitting this
+ * segment, leave it for the next mapping.
+ */
+ break;
+ } else {
+ seg_length = rs_get_end(rs, rt) -
+ rs_get_start(rs, rt);
+ }
+ }
+
+ range_tree_add(segs, rs_get_start(rs, rt), seg_length);
+ range_tree_remove(svr->svr_allocd_segs,
+ rs_get_start(rs, rt), seg_length);
+ }
+
+ if (range_tree_is_empty(segs)) {
+ mutex_exit(&svr->svr_lock);
+ range_tree_destroy(segs);
+ return;
+ }
+
+ if (svr->svr_max_offset_to_sync[txg & TXG_MASK] == 0) {
+ dsl_sync_task_nowait(dmu_tx_pool(tx), vdev_mapping_sync,
+ svr, tx);
+ }
+
+ svr->svr_max_offset_to_sync[txg & TXG_MASK] = range_tree_max(segs);
+
+ /*
+ * Note: this is the amount of *allocated* space
+ * that we are taking care of each txg.
+ */
+ svr->svr_bytes_done[txg & TXG_MASK] += range_tree_space(segs);
+
+ mutex_exit(&svr->svr_lock);
+
+ zio_alloc_list_t zal;
+ metaslab_trace_init(&zal);
+ uint64_t thismax = SPA_MAXBLOCKSIZE;
+ while (!range_tree_is_empty(segs)) {
+ int error = spa_vdev_copy_segment(vd,
+ segs, thismax, txg, vca, &zal);
+
+ if (error == ENOSPC) {
+ /*
+ * Cut our segment in half, and don't try this
+ * segment size again this txg. Note that the
+ * allocation size must be aligned to the highest
+ * ashift in the pool, so that the allocation will
+ * not be padded out to a multiple of the ashift,
+ * which could cause us to think that this mapping
+ * is larger than we intended.
+ */
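+ /*
+ * Worked example (editor's note, hypothetical values): with a
+ * 4 KiB ashift and attempted = 1 MiB, the next attempt uses
+ * thismax = P2ROUNDUP(512 KiB, 4 KiB) = 512 KiB, and max_alloc
+ * drops to 1 MiB - 4 KiB, so the full 1 MiB size is not retried
+ * this txg.
+ */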
+ ASSERT3U(spa->spa_max_ashift, >=, SPA_MINBLOCKSHIFT);
+ ASSERT3U(spa->spa_max_ashift, ==, spa->spa_min_ashift);
+ uint64_t attempted =
+ MIN(range_tree_span(segs), thismax);
+ thismax = P2ROUNDUP(attempted / 2,
+ 1 << spa->spa_max_ashift);
+ /*
+ * The minimum-size allocation cannot fail.
+ */
+ ASSERT3U(attempted, >, 1 << spa->spa_max_ashift);
+ *max_alloc = attempted - (1 << spa->spa_max_ashift);
+ } else {
+ ASSERT0(error);
+
+ /*
+ * We've performed an allocation, so reset the
+ * alloc trace list.
+ */
+ metaslab_trace_fini(&zal);
+ metaslab_trace_init(&zal);
+ }
+ }
+ metaslab_trace_fini(&zal);
+ range_tree_destroy(segs);
+}
+
+/*
+ * The size of each removal mapping is limited by the tunable
+ * zfs_remove_max_segment, but we must adjust this to be a multiple of the
+ * pool's ashift, so that we don't try to split individual sectors regardless
+ * of the tunable value. (Note that device removal requires that all devices
+ * have the same ashift, so there's no difference between spa_min_ashift and
+ * spa_max_ashift.) The raw tunable should not be used elsewhere.
+ */
+uint64_t
+spa_remove_max_segment(spa_t *spa)
+{
+ return (P2ROUNDUP(zfs_remove_max_segment, 1 << spa->spa_max_ashift));
+}
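+
+/*
+ * Editor's worked example (hypothetical tunable value): with a 4 KiB sector
+ * size (ashift 12), zfs_remove_max_segment = 1000000 would be rounded up to
+ * P2ROUNDUP(1000000, 4096) = 1003520 bytes before use.
+ */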
+
+/*
+ * The removal thread operates in open context. It iterates over all
+ * allocated space in the vdev, by loading each metaslab's spacemap.
+ * For each contiguous segment of allocated space (capping the segment
+ * size at SPA_MAXBLOCKSIZE), we:
+ * - Allocate space for it on another vdev.
+ * - Create a new mapping from the old location to the new location
+ * (as a record in svr_new_segments).
+ * - Initiate a physical read zio to get the data off the removing disk.
+ * - In the read zio's done callback, initiate a physical write zio to
+ * write it to the new vdev.
+ * Note that all of this will take effect when a particular TXG syncs.
+ * The sync thread ensures that all the phys reads and writes for the syncing
+ * TXG have completed (see spa_txg_zio) and writes the new mappings to disk
+ * (see vdev_mapping_sync()).
+ */
+static void
+spa_vdev_remove_thread(void *arg)
+{
+ spa_t *spa = arg;
+ spa_vdev_removal_t *svr = spa->spa_vdev_removal;
+ vdev_copy_arg_t vca;
+ uint64_t max_alloc = spa_remove_max_segment(spa);
+ uint64_t last_txg = 0;
+
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+ vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id);
+ vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
+ uint64_t start_offset = vdev_indirect_mapping_max_offset(vim);
+
+ ASSERT3P(vd->vdev_ops, !=, &vdev_indirect_ops);
+ ASSERT(vdev_is_concrete(vd));
+ ASSERT(vd->vdev_removing);
+ ASSERT(vd->vdev_indirect_config.vic_mapping_object != 0);
+ ASSERT(vim != NULL);
+
+ mutex_init(&vca.vca_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&vca.vca_cv, NULL, CV_DEFAULT, NULL);
+ vca.vca_outstanding_bytes = 0;
+ vca.vca_read_error_bytes = 0;
+ vca.vca_write_error_bytes = 0;
+
+ mutex_enter(&svr->svr_lock);
+
+ /*
+ * Start from vim_max_offset so we pick up where we left off
+ * if we are restarting the removal after opening the pool.
+ */
+ uint64_t msi;
+ for (msi = start_offset >> vd->vdev_ms_shift;
+ msi < vd->vdev_ms_count && !svr->svr_thread_exit; msi++) {
+ metaslab_t *msp = vd->vdev_ms[msi];
+ ASSERT3U(msi, <=, vd->vdev_ms_count);
+
+ ASSERT0(range_tree_space(svr->svr_allocd_segs));
+
+ mutex_enter(&msp->ms_sync_lock);
+ mutex_enter(&msp->ms_lock);
+
+ /*
+ * Assert nothing in flight -- ms_*tree is empty.
+ */
+ for (int i = 0; i < TXG_SIZE; i++) {
+ ASSERT0(range_tree_space(msp->ms_allocating[i]));
+ }
+
+ /*
+ * If the metaslab has ever been allocated from (ms_sm!=NULL),
+ * read the allocated segments from the space map object
+ * into svr_allocd_segs. Since we do this while holding
+ * svr_lock and ms_sync_lock, concurrent frees (which
+ * would have modified the space map) will wait for us
+ * to finish loading the spacemap, and then take the
+ * appropriate action (see free_from_removing_vdev()).
+ */
+ if (msp->ms_sm != NULL) {
+ VERIFY0(space_map_load(msp->ms_sm,
+ svr->svr_allocd_segs, SM_ALLOC));
+
+ range_tree_walk(msp->ms_unflushed_allocs,
+ range_tree_add, svr->svr_allocd_segs);
+ range_tree_walk(msp->ms_unflushed_frees,
+ range_tree_remove, svr->svr_allocd_segs);
+ range_tree_walk(msp->ms_freeing,
+ range_tree_remove, svr->svr_allocd_segs);
+
+ /*
+ * When we are resuming from a paused removal (i.e.
+ * when importing a pool with a removal in progress),
+ * discard any state that we have already processed.
+ */
+ range_tree_clear(svr->svr_allocd_segs, 0, start_offset);
+ }
+ mutex_exit(&msp->ms_lock);
+ mutex_exit(&msp->ms_sync_lock);
+
+ vca.vca_msp = msp;
+ zfs_dbgmsg("copying %llu segments for metaslab %llu",
+ zfs_btree_numnodes(&svr->svr_allocd_segs->rt_root),
+ msp->ms_id);
+
+ while (!svr->svr_thread_exit &&
+ !range_tree_is_empty(svr->svr_allocd_segs)) {
+
+ mutex_exit(&svr->svr_lock);
+
+ /*
+ * We need to periodically drop the config lock so that
+ * writers can get in. Additionally, we can't wait
+ * for a txg to sync while holding a config lock
+ * (since a waiting writer could cause a 3-way deadlock
+ * with the sync thread, which also gets a config
+ * lock for reader). So we can't hold the config lock
+ * while calling dmu_tx_assign().
+ */
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+
+ /*
+ * This delay will pause the removal around the point
+ * specified by zfs_removal_suspend_progress. This is used
+ * solely by the test suite and for debugging.
+ */
+ uint64_t bytes_copied =
+ spa->spa_removing_phys.sr_copied;
+ for (int i = 0; i < TXG_SIZE; i++)
+ bytes_copied += svr->svr_bytes_done[i];
+ while (zfs_removal_suspend_progress &&
+ !svr->svr_thread_exit)
+ delay(hz);
+
+ mutex_enter(&vca.vca_lock);
+ while (vca.vca_outstanding_bytes >
+ zfs_remove_max_copy_bytes) {
+ cv_wait(&vca.vca_cv, &vca.vca_lock);
+ }
+ mutex_exit(&vca.vca_lock);
+
+ dmu_tx_t *tx =
+ dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
+
+ VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
+ uint64_t txg = dmu_tx_get_txg(tx);
+
+ /*
+ * Reacquire the vdev_config lock. The vdev_t
+ * that we're removing may have changed, e.g. due
+ * to a vdev_attach or vdev_detach.
+ */
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+ vd = vdev_lookup_top(spa, svr->svr_vdev_id);
+
+ if (txg != last_txg)
+ max_alloc = spa_remove_max_segment(spa);
+ last_txg = txg;
+
+ spa_vdev_copy_impl(vd, svr, &vca, &max_alloc, tx);
+
+ dmu_tx_commit(tx);
+ mutex_enter(&svr->svr_lock);
+ }
+
+ mutex_enter(&vca.vca_lock);
+ if (zfs_removal_ignore_errors == 0 &&
+ (vca.vca_read_error_bytes > 0 ||
+ vca.vca_write_error_bytes > 0)) {
+ svr->svr_thread_exit = B_TRUE;
+ }
+ mutex_exit(&vca.vca_lock);
+ }
+
+ mutex_exit(&svr->svr_lock);
+
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+
+ /*
+ * Wait for all copies to finish before cleaning up the vca.
+ */
+ txg_wait_synced(spa->spa_dsl_pool, 0);
+ ASSERT0(vca.vca_outstanding_bytes);
+
+ mutex_destroy(&vca.vca_lock);
+ cv_destroy(&vca.vca_cv);
+
+ if (svr->svr_thread_exit) {
+ mutex_enter(&svr->svr_lock);
+ range_tree_vacate(svr->svr_allocd_segs, NULL, NULL);
+ svr->svr_thread = NULL;
+ cv_broadcast(&svr->svr_cv);
+ mutex_exit(&svr->svr_lock);
+
+ /*
+ * During the removal process an unrecoverable read or write
+ * error was encountered. The removal process must be
+ * cancelled or this damage may become permanent.
+ */
+ if (zfs_removal_ignore_errors == 0 &&
+ (vca.vca_read_error_bytes > 0 ||
+ vca.vca_write_error_bytes > 0)) {
+ zfs_dbgmsg("canceling removal due to IO errors: "
+ "[read_error_bytes=%llu] [write_error_bytes=%llu]",
+ vca.vca_read_error_bytes,
+ vca.vca_write_error_bytes);
+ spa_vdev_remove_cancel_impl(spa);
+ }
+ } else {
+ ASSERT0(range_tree_space(svr->svr_allocd_segs));
+ vdev_remove_complete(spa);
+ }
+
+ thread_exit();
+}
+
+void
+spa_vdev_remove_suspend(spa_t *spa)
+{
+ spa_vdev_removal_t *svr = spa->spa_vdev_removal;
+
+ if (svr == NULL)
+ return;
+
+ mutex_enter(&svr->svr_lock);
+ svr->svr_thread_exit = B_TRUE;
+ while (svr->svr_thread != NULL)
+ cv_wait(&svr->svr_cv, &svr->svr_lock);
+ svr->svr_thread_exit = B_FALSE;
+ mutex_exit(&svr->svr_lock);
+}
+
+/* ARGSUSED */
+static int
+spa_vdev_remove_cancel_check(void *arg, dmu_tx_t *tx)
+{
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+
+ if (spa->spa_vdev_removal == NULL)
+ return (ENOTACTIVE);
+ return (0);
+}
+
+/*
+ * Cancel a removal by freeing all entries from the partial mapping
+ * and marking the vdev as no longer being removing.
+ */
+/* ARGSUSED */
+static void
+spa_vdev_remove_cancel_sync(void *arg, dmu_tx_t *tx)
+{
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+ spa_vdev_removal_t *svr = spa->spa_vdev_removal;
+ vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id);
+ vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
+ vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
+ objset_t *mos = spa->spa_meta_objset;
+
+ ASSERT3P(svr->svr_thread, ==, NULL);
+
+ spa_feature_decr(spa, SPA_FEATURE_DEVICE_REMOVAL, tx);
+
+ boolean_t are_precise;
+ VERIFY0(vdev_obsolete_counts_are_precise(vd, &are_precise));
+ if (are_precise) {
+ spa_feature_decr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
+ VERIFY0(zap_remove(spa->spa_meta_objset, vd->vdev_top_zap,
+ VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE, tx));
+ }
+
+ uint64_t obsolete_sm_object;
+ VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_object));
+ if (obsolete_sm_object != 0) {
+ ASSERT(vd->vdev_obsolete_sm != NULL);
+ ASSERT3U(obsolete_sm_object, ==,
+ space_map_object(vd->vdev_obsolete_sm));
+
+ space_map_free(vd->vdev_obsolete_sm, tx);
+ VERIFY0(zap_remove(spa->spa_meta_objset, vd->vdev_top_zap,
+ VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM, tx));
+ space_map_close(vd->vdev_obsolete_sm);
+ vd->vdev_obsolete_sm = NULL;
+ spa_feature_decr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
+ }
+ for (int i = 0; i < TXG_SIZE; i++) {
+ ASSERT(list_is_empty(&svr->svr_new_segments[i]));
+ ASSERT3U(svr->svr_max_offset_to_sync[i], <=,
+ vdev_indirect_mapping_max_offset(vim));
+ }
+
+ for (uint64_t msi = 0; msi < vd->vdev_ms_count; msi++) {
+ metaslab_t *msp = vd->vdev_ms[msi];
+
+ if (msp->ms_start >= vdev_indirect_mapping_max_offset(vim))
+ break;
+
+ ASSERT0(range_tree_space(svr->svr_allocd_segs));
+
+ mutex_enter(&msp->ms_lock);
+
+ /*
+ * Assert nothing in flight -- ms_*tree is empty.
+ */
+ for (int i = 0; i < TXG_SIZE; i++)
+ ASSERT0(range_tree_space(msp->ms_allocating[i]));
+ for (int i = 0; i < TXG_DEFER_SIZE; i++)
+ ASSERT0(range_tree_space(msp->ms_defer[i]));
+ ASSERT0(range_tree_space(msp->ms_freed));
+
+ if (msp->ms_sm != NULL) {
+ mutex_enter(&svr->svr_lock);
+ VERIFY0(space_map_load(msp->ms_sm,
+ svr->svr_allocd_segs, SM_ALLOC));
+
+ range_tree_walk(msp->ms_unflushed_allocs,
+ range_tree_add, svr->svr_allocd_segs);
+ range_tree_walk(msp->ms_unflushed_frees,
+ range_tree_remove, svr->svr_allocd_segs);
+ range_tree_walk(msp->ms_freeing,
+ range_tree_remove, svr->svr_allocd_segs);
+
+ /*
+ * Clear everything past what has been synced,
+ * because we have not allocated mappings for it yet.
+ */
+ uint64_t syncd = vdev_indirect_mapping_max_offset(vim);
+ uint64_t sm_end = msp->ms_sm->sm_start +
+ msp->ms_sm->sm_size;
+ if (sm_end > syncd)
+ range_tree_clear(svr->svr_allocd_segs,
+ syncd, sm_end - syncd);
+
+ mutex_exit(&svr->svr_lock);
+ }
+ mutex_exit(&msp->ms_lock);
+
+ mutex_enter(&svr->svr_lock);
+ range_tree_vacate(svr->svr_allocd_segs,
+ free_mapped_segment_cb, vd);
+ mutex_exit(&svr->svr_lock);
+ }
+
+ /*
+ * Note: this must happen after we invoke free_mapped_segment_cb,
+ * because it adds to the obsolete_segments.
+ */
+ range_tree_vacate(vd->vdev_obsolete_segments, NULL, NULL);
+
+ ASSERT3U(vic->vic_mapping_object, ==,
+ vdev_indirect_mapping_object(vd->vdev_indirect_mapping));
+ vdev_indirect_mapping_close(vd->vdev_indirect_mapping);
+ vd->vdev_indirect_mapping = NULL;
+ vdev_indirect_mapping_free(mos, vic->vic_mapping_object, tx);
+ vic->vic_mapping_object = 0;
+
+ ASSERT3U(vic->vic_births_object, ==,
+ vdev_indirect_births_object(vd->vdev_indirect_births));
+ vdev_indirect_births_close(vd->vdev_indirect_births);
+ vd->vdev_indirect_births = NULL;
+ vdev_indirect_births_free(mos, vic->vic_births_object, tx);
+ vic->vic_births_object = 0;
+
+ /*
+ * We may have processed some frees from the removing vdev in this
+ * txg, thus increasing svr_bytes_done; discard that here to
+ * satisfy the assertions in spa_vdev_removal_destroy().
+ * Note that future txgs cannot have any bytes_done, because
+ * future txgs are only modified from open context, and we have
+ * already shut down the copying thread.
+ */
+ svr->svr_bytes_done[dmu_tx_get_txg(tx) & TXG_MASK] = 0;
+ spa_finish_removal(spa, DSS_CANCELED, tx);
+
+ vd->vdev_removing = B_FALSE;
+ vdev_config_dirty(vd);
+
+ zfs_dbgmsg("canceled device removal for vdev %llu in %llu",
+ vd->vdev_id, dmu_tx_get_txg(tx));
+ spa_history_log_internal(spa, "vdev remove canceled", tx,
+ "%s vdev %llu %s", spa_name(spa),
+ (u_longlong_t)vd->vdev_id,
+ (vd->vdev_path != NULL) ? vd->vdev_path : "-");
+}
+
+static int
+spa_vdev_remove_cancel_impl(spa_t *spa)
+{
+ uint64_t vdid = spa->spa_vdev_removal->svr_vdev_id;
+
+ int error = dsl_sync_task(spa->spa_name, spa_vdev_remove_cancel_check,
+ spa_vdev_remove_cancel_sync, NULL, 0,
+ ZFS_SPACE_CHECK_EXTRA_RESERVED);
+
+ if (error == 0) {
+ spa_config_enter(spa, SCL_ALLOC | SCL_VDEV, FTAG, RW_WRITER);
+ vdev_t *vd = vdev_lookup_top(spa, vdid);
+ metaslab_group_activate(vd->vdev_mg);
+ ASSERT(!vd->vdev_islog);
+ metaslab_group_activate(vd->vdev_log_mg);
+ spa_config_exit(spa, SCL_ALLOC | SCL_VDEV, FTAG);
+ }
+
+ return (error);
+}
+
+int
+spa_vdev_remove_cancel(spa_t *spa)
+{
+ spa_vdev_remove_suspend(spa);
+
+ if (spa->spa_vdev_removal == NULL)
+ return (ENOTACTIVE);
+
+ return (spa_vdev_remove_cancel_impl(spa));
+}
+
+void
+svr_sync(spa_t *spa, dmu_tx_t *tx)
+{
+ spa_vdev_removal_t *svr = spa->spa_vdev_removal;
+ int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
+
+ if (svr == NULL)
+ return;
+
+ /*
+ * This check is necessary so that we do not dirty the
+ * DIRECTORY_OBJECT via spa_sync_removing_state() when there
+ * is nothing to do. Dirtying it every time would prevent us
+ * from syncing-to-convergence.
+ */
+ if (svr->svr_bytes_done[txgoff] == 0)
+ return;
+
+ /*
+ * Update progress accounting.
+ */
+ spa->spa_removing_phys.sr_copied += svr->svr_bytes_done[txgoff];
+ svr->svr_bytes_done[txgoff] = 0;
+
+ spa_sync_removing_state(spa, tx);
+}
+
+static void
+vdev_remove_make_hole_and_free(vdev_t *vd)
+{
+ uint64_t id = vd->vdev_id;
+ spa_t *spa = vd->vdev_spa;
+ vdev_t *rvd = spa->spa_root_vdev;
+
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+ ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
+
+ vdev_free(vd);
+
+ vd = vdev_alloc_common(spa, id, 0, &vdev_hole_ops);
+ vdev_add_child(rvd, vd);
+ vdev_config_dirty(rvd);
+
+ /*
+ * Reassess the health of our root vdev.
+ */
+ vdev_reopen(rvd);
+}
+
+/*
+ * Remove a log device. The config lock is held for the specified TXG.
+ */
+static int
+spa_vdev_remove_log(vdev_t *vd, uint64_t *txg)
+{
+ metaslab_group_t *mg = vd->vdev_mg;
+ spa_t *spa = vd->vdev_spa;
+ int error = 0;
+
+ ASSERT(vd->vdev_islog);
+ ASSERT(vd == vd->vdev_top);
+ ASSERT3P(vd->vdev_log_mg, ==, NULL);
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+ /*
+ * Stop allocating from this vdev.
+ */
+ metaslab_group_passivate(mg);
+
+ /*
+ * Wait for the youngest allocations and frees to sync,
+ * and then wait for the deferral of those frees to finish.
+ */
+ spa_vdev_config_exit(spa, NULL,
+ *txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG);
+
+ /*
+ * Cancel any initialize or TRIM which was in progress.
+ */
+ vdev_initialize_stop_all(vd, VDEV_INITIALIZE_CANCELED);
+ vdev_trim_stop_all(vd, VDEV_TRIM_CANCELED);
+ vdev_autotrim_stop_wait(vd);
+
+ /*
+ * Evacuate the device. We don't hold the config lock as
+ * writer since we need to do I/O but we do keep the
+ * spa_namespace_lock held. Once this completes the device
+ * should no longer have any blocks allocated on it.
+ */
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+ if (vd->vdev_stat.vs_alloc != 0)
+ error = spa_reset_logs(spa);
+
+ *txg = spa_vdev_config_enter(spa);
+
+ if (error != 0) {
+ metaslab_group_activate(mg);
+ ASSERT3P(vd->vdev_log_mg, ==, NULL);
+ return (error);
+ }
+ ASSERT0(vd->vdev_stat.vs_alloc);
+
+ /*
+ * The evacuation succeeded. Remove any remaining MOS metadata
+ * associated with this vdev, and wait for these changes to sync.
+ */
+ vd->vdev_removing = B_TRUE;
+
+ vdev_dirty_leaves(vd, VDD_DTL, *txg);
+ vdev_config_dirty(vd);
+
+ /*
+ * When the log space map feature is enabled we look at
+ * the vdev's top_zap to find the on-disk flush data of
+ * the metaslab we just flushed. Thus, while removing a
+ * log vdev we make sure to call vdev_metaslab_fini()
+ * first, which removes all metaslabs of this vdev from
+ * spa_metaslabs_by_flushed before vdev_remove_empty()
+ * destroys the top_zap of this log vdev.
+ *
+ * This avoids the scenario where we flush a metaslab
+ * from the log vdev being removed that doesn't have a
+ * top_zap and end up failing to lookup its on-disk flush
+ * data.
+ *
+ * We don't call metaslab_group_destroy() right away
+ * though (it will be called in vdev_free() later) as
+ * during metaslab_sync() of metaslabs from other vdevs
+ * we may touch the metaslab group of this vdev through
+ * metaslab_class_histogram_verify()
+ */
+ vdev_metaslab_fini(vd);
+ spa_log_sm_set_blocklimit(spa);
+
+ spa_vdev_config_exit(spa, NULL, *txg, 0, FTAG);
+ *txg = spa_vdev_config_enter(spa);
+
+ sysevent_t *ev = spa_event_create(spa, vd, NULL,
+ ESC_ZFS_VDEV_REMOVE_DEV);
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+ ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
+
+ /* The top ZAP should have been destroyed by vdev_remove_empty. */
+ ASSERT0(vd->vdev_top_zap);
+ /* The leaf ZAP should have been destroyed by vdev_dtl_sync. */
+ ASSERT0(vd->vdev_leaf_zap);
+
+ (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);
+
+ if (list_link_active(&vd->vdev_state_dirty_node))
+ vdev_state_clean(vd);
+ if (list_link_active(&vd->vdev_config_dirty_node))
+ vdev_config_clean(vd);
+
+ ASSERT0(vd->vdev_stat.vs_alloc);
+
+ /*
+ * Clean up the vdev namespace.
+ */
+ vdev_remove_make_hole_and_free(vd);
+
+ if (ev != NULL)
+ spa_event_post(ev);
+
+ return (0);
+}
+
+static int
+spa_vdev_remove_top_check(vdev_t *vd)
+{
+ spa_t *spa = vd->vdev_spa;
+
+ if (vd != vd->vdev_top)
+ return (SET_ERROR(ENOTSUP));
+
+ if (!vdev_is_concrete(vd))
+ return (SET_ERROR(ENOTSUP));
+
+ if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REMOVAL))
+ return (SET_ERROR(ENOTSUP));
+
+ metaslab_class_t *mc = vd->vdev_mg->mg_class;
+ metaslab_class_t *normal = spa_normal_class(spa);
+ if (mc != normal) {
+ /*
+ * Space allocated from the special (or dedup) class is
+ * included in the DMU's space usage, but it's not included
+ * in spa_dspace (or dsl_pool_adjustedsize()). Therefore
+ * there is always at least as much free space in the normal
+ * class, as is allocated from the special (and dedup) class.
+ * As a backup check, we will return ENOSPC if this is
+ * violated. See also spa_update_dspace().
+ */
+ uint64_t available = metaslab_class_get_space(normal) -
+ metaslab_class_get_alloc(normal);
+ ASSERT3U(available, >=, vd->vdev_stat.vs_alloc);
+ if (available < vd->vdev_stat.vs_alloc)
+ return (SET_ERROR(ENOSPC));
+ } else {
+ /* available space in the pool's normal class */
+ uint64_t available = dsl_dir_space_available(
+ spa->spa_dsl_pool->dp_root_dir, NULL, 0, B_TRUE);
+ if (available <
+ vd->vdev_stat.vs_dspace + spa_get_slop_space(spa)) {
+ /*
+ * This is a normal device. There has to be enough free
+ * space to remove the device and leave double the
+ * "slop" space (i.e. we must leave at least 3% of the
+ * pool free, in addition to the normal slop space).
+ */
+ return (SET_ERROR(ENOSPC));
+ }
+ }
+
+ /*
+ * There cannot be a removal in progress.
+ */
+ if (spa->spa_removing_phys.sr_state == DSS_SCANNING)
+ return (SET_ERROR(EBUSY));
+
+ /*
+ * The device must have all its data.
+ */
+ if (!vdev_dtl_empty(vd, DTL_MISSING) ||
+ !vdev_dtl_empty(vd, DTL_OUTAGE))
+ return (SET_ERROR(EBUSY));
+
+ /*
+ * The device must be healthy.
+ */
+ if (!vdev_readable(vd))
+ return (SET_ERROR(EIO));
+
+ /*
+ * All vdevs in normal class must have the same ashift.
+ */
+ if (spa->spa_max_ashift != spa->spa_min_ashift) {
+ return (SET_ERROR(EINVAL));
+ }
+
+ /*
+ * A removed special/dedup vdev must have same ashift as normal class.
+ */
+ ASSERT(!vd->vdev_islog);
+ if (vd->vdev_alloc_bias != VDEV_BIAS_NONE &&
+ vd->vdev_ashift != spa->spa_max_ashift) {
+ return (SET_ERROR(EINVAL));
+ }
+
+ /*
+ * All vdevs in normal class must have the same ashift
+ * and not be raidz or draid.
+ */
+ vdev_t *rvd = spa->spa_root_vdev;
+ int num_indirect = 0;
+ for (uint64_t id = 0; id < rvd->vdev_children; id++) {
+ vdev_t *cvd = rvd->vdev_child[id];
+
+ /*
+ * A removed special/dedup vdev must have the same ashift
+ * across all vdevs in its class.
+ */
+ if (vd->vdev_alloc_bias != VDEV_BIAS_NONE &&
+ cvd->vdev_alloc_bias == vd->vdev_alloc_bias &&
+ cvd->vdev_ashift != vd->vdev_ashift) {
+ return (SET_ERROR(EINVAL));
+ }
+ if (cvd->vdev_ashift != 0 &&
+ cvd->vdev_alloc_bias == VDEV_BIAS_NONE)
+ ASSERT3U(cvd->vdev_ashift, ==, spa->spa_max_ashift);
+ if (cvd->vdev_ops == &vdev_indirect_ops)
+ num_indirect++;
+ if (!vdev_is_concrete(cvd))
+ continue;
+ if (vdev_get_nparity(cvd) != 0)
+ return (SET_ERROR(EINVAL));
+ /*
+ * Need the mirror to be a mirror of leaf vdevs only.
+ */
+ if (cvd->vdev_ops == &vdev_mirror_ops) {
+ for (uint64_t cid = 0;
+ cid < cvd->vdev_children; cid++) {
+ if (!cvd->vdev_child[cid]->vdev_ops->
+ vdev_op_leaf)
+ return (SET_ERROR(EINVAL));
+ }
+ }
+ }
+
+ return (0);
+}
+
+/*
+ * Initiate removal of a top-level vdev, reducing the total space in the pool.
+ * The config lock is held for the specified TXG. Once initiated,
+ * evacuation of all allocated space (copying it to other vdevs) happens
+ * in the background (see spa_vdev_remove_thread()), and can be canceled
+ * (see spa_vdev_remove_cancel()). If successful, the vdev will
+ * be transformed to an indirect vdev (see spa_vdev_remove_complete()).
+ */
+static int
+spa_vdev_remove_top(vdev_t *vd, uint64_t *txg)
+{
+ spa_t *spa = vd->vdev_spa;
+ int error;
+
+ /*
+ * Check for errors up-front, so that we don't waste time
+ * passivating the metaslab group and clearing the ZIL if there
+ * are errors.
+ */
+ error = spa_vdev_remove_top_check(vd);
+ if (error != 0)
+ return (error);
+
+ /*
+ * Stop allocating from this vdev. Note that we must check
+ * that this is not the only device in the pool before
+ * passivating, otherwise we will not be able to make
+ * progress because we can't allocate from any vdevs.
+ * The above check for sufficient free space serves this
+ * purpose.
+ */
+ metaslab_group_t *mg = vd->vdev_mg;
+ metaslab_group_passivate(mg);
+ ASSERT(!vd->vdev_islog);
+ metaslab_group_passivate(vd->vdev_log_mg);
+
+ /*
+ * Wait for the youngest allocations and frees to sync,
+ * and then wait for the deferral of those frees to finish.
+ */
+ spa_vdev_config_exit(spa, NULL,
+ *txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG);
+
+ /*
+ * We must ensure that no "stubby" log blocks are allocated
+ * on the device to be removed. These blocks could be
+ * written at any time, including while we are in the middle
+ * of copying them.
+ */
+ error = spa_reset_logs(spa);
+
+ /*
+ * We stop any initializing and TRIM that is currently in progress
+ * but leave the state as "active". This will allow the process to
+ * resume if the removal is canceled sometime later.
+ */
+ vdev_initialize_stop_all(vd, VDEV_INITIALIZE_ACTIVE);
+ vdev_trim_stop_all(vd, VDEV_TRIM_ACTIVE);
+ vdev_autotrim_stop_wait(vd);
+
+ *txg = spa_vdev_config_enter(spa);
+
+ /*
+ * Things might have changed while the config lock was dropped
+ * (e.g. space usage). Check for errors again.
+ */
+ if (error == 0)
+ error = spa_vdev_remove_top_check(vd);
+
+ if (error != 0) {
+ metaslab_group_activate(mg);
+ ASSERT(!vd->vdev_islog);
+ metaslab_group_activate(vd->vdev_log_mg);
+ spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART);
+ spa_async_request(spa, SPA_ASYNC_TRIM_RESTART);
+ spa_async_request(spa, SPA_ASYNC_AUTOTRIM_RESTART);
+ return (error);
+ }
+
+ vd->vdev_removing = B_TRUE;
+
+ vdev_dirty_leaves(vd, VDD_DTL, *txg);
+ vdev_config_dirty(vd);
+ dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, *txg);
+ dsl_sync_task_nowait(spa->spa_dsl_pool,
+ vdev_remove_initiate_sync, (void *)(uintptr_t)vd->vdev_id, tx);
+ dmu_tx_commit(tx);
+
+ return (0);
+}
+
+/*
+ * Remove a device from the pool.
+ *
+ * Removing a device from the vdev namespace requires several steps
+ * and can take a significant amount of time. As a result we use
+ * the spa_vdev_config_[enter/exit] functions which allow us to
+ * grab and release the spa_config_lock while still holding the namespace
+ * lock. During each step the configuration is synced out.
+ */
+int
+spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
+{
+ vdev_t *vd;
+ nvlist_t **spares, **l2cache, *nv;
+ uint64_t txg = 0;
+ uint_t nspares, nl2cache;
+ int error = 0, error_log;
+ boolean_t locked = MUTEX_HELD(&spa_namespace_lock);
+ sysevent_t *ev = NULL;
+ char *vd_type = NULL, *vd_path = NULL;
+
+ ASSERT(spa_writeable(spa));
+
+ if (!locked)
+ txg = spa_vdev_enter(spa);
+
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+ if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
+ error = (spa_has_checkpoint(spa)) ?
+ ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT;
+
+ if (!locked)
+ return (spa_vdev_exit(spa, NULL, txg, error));
+
+ return (error);
+ }
+
+ vd = spa_lookup_by_guid(spa, guid, B_FALSE);
+
+ if (spa->spa_spares.sav_vdevs != NULL &&
+ nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
+ ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0 &&
+ (nv = spa_nvlist_lookup_by_guid(spares, nspares, guid)) != NULL) {
+ /*
+ * Only remove the hot spare if it's not currently in use
+ * in this pool.
+ */
+ if (vd == NULL || unspare) {
+ char *type;
+ boolean_t draid_spare = B_FALSE;
+
+ if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type)
+ == 0 && strcmp(type, VDEV_TYPE_DRAID_SPARE) == 0)
+ draid_spare = B_TRUE;
+
+ if (vd == NULL && draid_spare) {
+ error = SET_ERROR(ENOTSUP);
+ } else {
+ if (vd == NULL)
+ vd = spa_lookup_by_guid(spa,
+ guid, B_TRUE);
+ ev = spa_event_create(spa, vd, NULL,
+ ESC_ZFS_VDEV_REMOVE_AUX);
+
+ vd_type = VDEV_TYPE_SPARE;
+ vd_path = spa_strdup(fnvlist_lookup_string(
+ nv, ZPOOL_CONFIG_PATH));
+ spa_vdev_remove_aux(spa->spa_spares.sav_config,
+ ZPOOL_CONFIG_SPARES, spares, nspares, nv);
+ spa_load_spares(spa);
+ spa->spa_spares.sav_sync = B_TRUE;
+ }
+ } else {
+ error = SET_ERROR(EBUSY);
+ }
+ } else if (spa->spa_l2cache.sav_vdevs != NULL &&
+ nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
+ ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0 &&
+ (nv = spa_nvlist_lookup_by_guid(l2cache, nl2cache, guid)) != NULL) {
+ vd_type = VDEV_TYPE_L2CACHE;
+ vd_path = spa_strdup(fnvlist_lookup_string(
+ nv, ZPOOL_CONFIG_PATH));
+ /*
+ * Cache devices can always be removed.
+ */
+ vd = spa_lookup_by_guid(spa, guid, B_TRUE);
+
+ /*
+ * Stop trimming the cache device. We need to release the
+ * config lock to allow the syncing of TRIM transactions
+ * without releasing the spa_namespace_lock. The same
+ * strategy is employed in spa_vdev_remove_top().
+ */
+ spa_vdev_config_exit(spa, NULL,
+ txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG);
+ mutex_enter(&vd->vdev_trim_lock);
+ vdev_trim_stop(vd, VDEV_TRIM_CANCELED, NULL);
+ mutex_exit(&vd->vdev_trim_lock);
+ txg = spa_vdev_config_enter(spa);
+
+ ev = spa_event_create(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE_AUX);
+ spa_vdev_remove_aux(spa->spa_l2cache.sav_config,
+ ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv);
+ spa_load_l2cache(spa);
+ spa->spa_l2cache.sav_sync = B_TRUE;
+ } else if (vd != NULL && vd->vdev_islog) {
+ ASSERT(!locked);
+ vd_type = VDEV_TYPE_LOG;
+ vd_path = spa_strdup((vd->vdev_path != NULL) ?
+ vd->vdev_path : "-");
+ error = spa_vdev_remove_log(vd, &txg);
+ } else if (vd != NULL) {
+ ASSERT(!locked);
+ error = spa_vdev_remove_top(vd, &txg);
+ } else {
+ /*
+ * There is no vdev of any kind with the specified guid.
+ */
+ error = SET_ERROR(ENOENT);
+ }
+
+ error_log = error;
+
+ if (!locked)
+ error = spa_vdev_exit(spa, NULL, txg, error);
+
+ /*
+ * Logging must be done outside the spa config lock. Otherwise,
+ * this code path could end up holding the spa config lock while
+ * waiting for a txg_sync so it can write to the internal log.
+ * Doing that would prevent the txg sync from actually happening,
+ * causing a deadlock.
+ */
+ if (error_log == 0 && vd_type != NULL && vd_path != NULL) {
+ spa_history_log_internal(spa, "vdev remove", NULL,
+ "%s vdev (%s) %s", spa_name(spa), vd_type, vd_path);
+ }
+ if (vd_path != NULL)
+ spa_strfree(vd_path);
+
+ if (ev != NULL)
+ spa_event_post(ev);
+
+ return (error);
+}
+
+int
+spa_removal_get_stats(spa_t *spa, pool_removal_stat_t *prs)
+{
+ prs->prs_state = spa->spa_removing_phys.sr_state;
+
+ if (prs->prs_state == DSS_NONE)
+ return (SET_ERROR(ENOENT));
+
+ prs->prs_removing_vdev = spa->spa_removing_phys.sr_removing_vdev;
+ prs->prs_start_time = spa->spa_removing_phys.sr_start_time;
+ prs->prs_end_time = spa->spa_removing_phys.sr_end_time;
+ prs->prs_to_copy = spa->spa_removing_phys.sr_to_copy;
+ prs->prs_copied = spa->spa_removing_phys.sr_copied;
+
+ prs->prs_mapping_memory = 0;
+ uint64_t indirect_vdev_id =
+ spa->spa_removing_phys.sr_prev_indirect_vdev;
+ while (indirect_vdev_id != -1) {
+ vdev_t *vd = spa->spa_root_vdev->vdev_child[indirect_vdev_id];
+ vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
+ vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
+
+ ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
+ prs->prs_mapping_memory += vdev_indirect_mapping_size(vim);
+ indirect_vdev_id = vic->vic_prev_indirect_vdev;
+ }
+
+ return (0);
+}
+
+/* BEGIN CSTYLED */
+ZFS_MODULE_PARAM(zfs_vdev, zfs_, removal_ignore_errors, INT, ZMOD_RW,
+ "Ignore hard IO errors when removing device");
+
+ZFS_MODULE_PARAM(zfs_vdev, zfs_, remove_max_segment, INT, ZMOD_RW,
+ "Largest contiguous segment to allocate when removing device");
+
+ZFS_MODULE_PARAM(zfs_vdev, vdev_, removal_max_span, INT, ZMOD_RW,
+ "Largest span of free chunks a remap segment can span");
+
+ZFS_MODULE_PARAM(zfs_vdev, zfs_, removal_suspend_progress, INT, ZMOD_RW,
+ "Pause device removal after this many bytes are copied "
+ "(debug use only - causes removal to hang)");
+/* END CSTYLED */
+
+EXPORT_SYMBOL(free_from_removing_vdev);
+EXPORT_SYMBOL(spa_removal_get_stats);
+EXPORT_SYMBOL(spa_remove_init);
+EXPORT_SYMBOL(spa_restart_removal);
+EXPORT_SYMBOL(spa_vdev_removal_destroy);
+EXPORT_SYMBOL(spa_vdev_remove);
+EXPORT_SYMBOL(spa_vdev_remove_cancel);
+EXPORT_SYMBOL(spa_vdev_remove_suspend);
+EXPORT_SYMBOL(svr_sync);
diff --git a/sys/contrib/openzfs/module/zfs/vdev_root.c b/sys/contrib/openzfs/module/zfs/vdev_root.c
new file mode 100644
index 000000000000..45ddc2f71927
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/vdev_root.c
@@ -0,0 +1,167 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/vdev_impl.h>
+#include <sys/zio.h>
+#include <sys/fs/zfs.h>
+
+/*
+ * Virtual device vector for the pool's root vdev.
+ */
+
+static uint64_t
+vdev_root_core_tvds(vdev_t *vd)
+{
+ uint64_t tvds = 0;
+
+ for (uint64_t c = 0; c < vd->vdev_children; c++) {
+ vdev_t *cvd = vd->vdev_child[c];
+
+ if (!cvd->vdev_ishole && !cvd->vdev_islog &&
+ cvd->vdev_ops != &vdev_indirect_ops) {
+ tvds++;
+ }
+ }
+
+ return (tvds);
+}
+
+/*
+ * We should be able to tolerate one failure with absolutely no damage
+ * to our metadata. Two failures will take out space maps, a bunch of
+ * indirect block trees, meta dnodes, dnodes, etc. Probably not a happy
+ * place to live. When we get smarter, we can liberalize this policy.
+ * e.g. If we haven't lost two consecutive top-level vdevs, then we are
+ * probably fine. Adding bean counters during alloc/free can make this
+ * future guesswork more accurate.
+ */
+static boolean_t
+too_many_errors(vdev_t *vd, uint64_t numerrors)
+{
+ uint64_t tvds;
+
+ if (numerrors == 0)
+ return (B_FALSE);
+
+ tvds = vdev_root_core_tvds(vd);
+ ASSERT3U(numerrors, <=, tvds);
+
+ if (numerrors == tvds)
+ return (B_TRUE);
+
+ return (numerrors > spa_missing_tvds_allowed(vd->vdev_spa));
+}
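+
+/*
+ * Editor's worked example (hypothetical counts): with five core top-level
+ * vdevs and spa_missing_tvds_allowed() returning 0, a single failed
+ * top-level vdev gives numerrors = 1, which is neither 0 nor equal to
+ * tvds (5), so too_many_errors() returns (1 > 0) == B_TRUE and
+ * vdev_root_open() fails with VDEV_AUX_NO_REPLICAS.
+ */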
+
+static int
+vdev_root_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
+ uint64_t *ashift, uint64_t *pshift)
+{
+ spa_t *spa = vd->vdev_spa;
+ int lasterror = 0;
+ int numerrors = 0;
+
+ if (vd->vdev_children == 0) {
+ vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
+ return (SET_ERROR(EINVAL));
+ }
+
+ vdev_open_children(vd);
+
+ for (int c = 0; c < vd->vdev_children; c++) {
+ vdev_t *cvd = vd->vdev_child[c];
+
+ if (cvd->vdev_open_error && !cvd->vdev_islog &&
+ cvd->vdev_ops != &vdev_indirect_ops) {
+ lasterror = cvd->vdev_open_error;
+ numerrors++;
+ }
+ }
+
+ if (spa_load_state(spa) != SPA_LOAD_NONE)
+ spa_set_missing_tvds(spa, numerrors);
+
+ if (too_many_errors(vd, numerrors)) {
+ vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
+ return (lasterror);
+ }
+
+ *asize = 0;
+ *max_asize = 0;
+ *ashift = 0;
+ *pshift = 0;
+
+ return (0);
+}
+
+static void
+vdev_root_close(vdev_t *vd)
+{
+ for (int c = 0; c < vd->vdev_children; c++)
+ vdev_close(vd->vdev_child[c]);
+}
+
+static void
+vdev_root_state_change(vdev_t *vd, int faulted, int degraded)
+{
+ if (too_many_errors(vd, faulted)) {
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_NO_REPLICAS);
+ } else if (degraded || faulted) {
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
+ } else {
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
+ }
+}
+
+vdev_ops_t vdev_root_ops = {
+ .vdev_op_init = NULL,
+ .vdev_op_fini = NULL,
+ .vdev_op_open = vdev_root_open,
+ .vdev_op_close = vdev_root_close,
+ .vdev_op_asize = vdev_default_asize,
+ .vdev_op_min_asize = vdev_default_min_asize,
+ .vdev_op_min_alloc = NULL,
+ .vdev_op_io_start = NULL, /* not applicable to the root */
+ .vdev_op_io_done = NULL, /* not applicable to the root */
+ .vdev_op_state_change = vdev_root_state_change,
+ .vdev_op_need_resilver = NULL,
+ .vdev_op_hold = NULL,
+ .vdev_op_rele = NULL,
+ .vdev_op_remap = NULL,
+ .vdev_op_xlate = NULL,
+ .vdev_op_rebuild_asize = NULL,
+ .vdev_op_metaslab_init = NULL,
+ .vdev_op_config_generate = NULL,
+ .vdev_op_nparity = NULL,
+ .vdev_op_ndisks = NULL,
+ .vdev_op_type = VDEV_TYPE_ROOT, /* name of this vdev type */
+ .vdev_op_leaf = B_FALSE /* not a leaf vdev */
+};
diff --git a/sys/contrib/openzfs/module/zfs/vdev_trim.c b/sys/contrib/openzfs/module/zfs/vdev_trim.c
new file mode 100644
index 000000000000..895957bda195
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/vdev_trim.c
@@ -0,0 +1,1719 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2019 by Lawrence Livermore National Security, LLC.
+ */
+
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/txg.h>
+#include <sys/vdev_impl.h>
+#include <sys/vdev_trim.h>
+#include <sys/metaslab_impl.h>
+#include <sys/dsl_synctask.h>
+#include <sys/zap.h>
+#include <sys/dmu_tx.h>
+#include <sys/arc_impl.h>
+
+/*
+ * TRIM is a feature which is used to notify an SSD that some previously
+ * written space is no longer allocated by the pool. This is useful because
+ * writes to an SSD must be performed to blocks which have first been erased.
+ * Ensuring the SSD always has a supply of erased blocks for new writes
+ * helps prevent the performance from deteriorating.
+ *
+ * There are two supported TRIM methods; manual and automatic.
+ *
+ * Manual TRIM:
+ *
+ * A manual TRIM is initiated by running the 'zpool trim' command. A single
+ * 'vdev_trim' thread is created for each leaf vdev, and it is responsible for
+ * managing that vdev's TRIM process. This involves iterating over all the
+ * metaslabs, calculating the unallocated space ranges, and then issuing the
+ * required TRIM I/Os.
+ *
+ * While a metaslab is being actively trimmed it is not eligible to perform
+ * new allocations. After traversing all of the metaslabs the thread is
+ * terminated. Finally, both the requested options and current progress of
+ * the TRIM are regularly written to the pool. This allows the TRIM to be
+ * suspended and resumed as needed.
+ *
+ * Automatic TRIM:
+ *
+ * An automatic TRIM is enabled by setting the 'autotrim' pool property
+ * to 'on'. When enabled, a `vdev_autotrim' thread is created for each
+ * top-level (not leaf) vdev in the pool. These threads perform the same
+ * core TRIM process as a manual TRIM, but with a few key differences.
+ *
+ * 1) Automatic TRIM happens continuously in the background and operates
+ * solely on recently freed blocks (ms_trim not ms_allocatable).
+ *
+ * 2) Each thread is associated with a top-level (not leaf) vdev. This has
+ *    the benefit of simplifying the threading model, makes it easier
+ *    to coordinate administrative commands, and ensures only a single
+ * metaslab is disabled at a time. Unlike manual TRIM, this means each
+ * 'vdev_autotrim' thread is responsible for issuing TRIM I/Os for its
+ * children.
+ *
+ * 3) There is no automatic TRIM progress information stored on disk, nor
+ * is it reported by 'zpool status'.
+ *
+ * While the automatic TRIM process is highly effective it is more likely
+ * than a manual TRIM to encounter tiny ranges. Ranges less than or equal to
+ * 'zfs_trim_extent_bytes_min' (32k) are considered too small to efficiently
+ * TRIM and are skipped. This means small amounts of freed space may not
+ * be automatically trimmed.
+ *
+ * Furthermore, devices with attached hot spares and devices being actively
+ * replaced are skipped. This is done to avoid adding additional stress to
+ * a potentially unhealthy device and to minimize the required rebuild time.
+ *
+ * For this reason it may be beneficial to occasionally manually TRIM a pool
+ * even when automatic TRIM is enabled.
+ */
+
+/*
+ * Maximum size of TRIM I/O, ranges will be chunked into 128 MiB lengths.
+ */
+unsigned int zfs_trim_extent_bytes_max = 128 * 1024 * 1024;
+
+/*
+ * Minimum size of TRIM I/O, extents smaller than 32 KiB will be skipped.
+ */
+unsigned int zfs_trim_extent_bytes_min = 32 * 1024;
+
+/*
+ * Skip uninitialized metaslabs during the TRIM process. This option is
+ * useful for pools constructed from large thinly-provisioned devices where
+ * TRIM operations are slow. As a pool ages, an increasing fraction of
+ * the pool's metaslabs will be initialized, progressively degrading the
+ * usefulness of this option. This setting is stored when starting a
+ * manual TRIM and will persist for the duration of the requested TRIM.
+ */
+unsigned int zfs_trim_metaslab_skip = 0;
+
+/*
+ * Maximum number of queued TRIM I/Os per leaf vdev. The number of
+ * concurrent TRIM I/Os issued to the device is controlled by the
+ * zfs_vdev_trim_min_active and zfs_vdev_trim_max_active module options.
+ */
+unsigned int zfs_trim_queue_limit = 10;
+
+/*
+ * The minimum number of transaction groups between automatic trims of a
+ * metaslab. This setting represents a trade-off between issuing more
+ * efficient TRIM operations, by allowing them to be aggregated longer,
+ * and issuing them promptly so the trimmed space is available. Note
+ * that this value is a minimum; metaslabs can be trimmed less frequently
+ * when there are a large number of ranges which need to be trimmed.
+ *
+ * Increasing this value will allow frees to be aggregated for a longer
+ * time. This can result in larger TRIM operations, and increased memory
+ * usage in order to track the ranges to be trimmed. Decreasing this value
+ * has the opposite effect. The default value of 32 was determined through
+ * testing to be a reasonable compromise.
+ */
+unsigned int zfs_trim_txg_batch = 32;
+
+/*
+ * The trim_args are a control structure which describe how a leaf vdev
+ * should be trimmed. The core elements are the vdev, the metaslab being
+ * trimmed and a range tree containing the extents to TRIM. All provided
+ * ranges must be within the metaslab.
+ */
+typedef struct trim_args {
+ /*
+ * These fields are set by the caller of vdev_trim_ranges().
+ */
+ vdev_t *trim_vdev; /* Leaf vdev to TRIM */
+ metaslab_t *trim_msp; /* Disabled metaslab */
+ range_tree_t *trim_tree; /* TRIM ranges (in metaslab) */
+ trim_type_t trim_type; /* Manual or auto TRIM */
+ uint64_t trim_extent_bytes_max; /* Maximum TRIM I/O size */
+ uint64_t trim_extent_bytes_min; /* Minimum TRIM I/O size */
+ enum trim_flag trim_flags; /* TRIM flags (secure) */
+
+ /*
+ * These fields are updated by vdev_trim_ranges().
+ */
+ hrtime_t trim_start_time; /* Start time */
+ uint64_t trim_bytes_done; /* Bytes trimmed */
+} trim_args_t;
+
+/*
+ * Determines whether a vdev_trim_thread() should be stopped.
+ */
+static boolean_t
+vdev_trim_should_stop(vdev_t *vd)
+{
+ return (vd->vdev_trim_exit_wanted || !vdev_writeable(vd) ||
+ vd->vdev_detached || vd->vdev_top->vdev_removing);
+}
+
+/*
+ * Determines whether a vdev_autotrim_thread() should be stopped.
+ */
+static boolean_t
+vdev_autotrim_should_stop(vdev_t *tvd)
+{
+ return (tvd->vdev_autotrim_exit_wanted ||
+ !vdev_writeable(tvd) || tvd->vdev_removing ||
+ spa_get_autotrim(tvd->vdev_spa) == SPA_AUTOTRIM_OFF);
+}
+
+/*
+ * The sync task for updating the on-disk state of a manual TRIM. This
+ * is scheduled by vdev_trim_change_state().
+ */
+static void
+vdev_trim_zap_update_sync(void *arg, dmu_tx_t *tx)
+{
+ /*
+ * We pass in the guid instead of the vdev_t since the vdev may
+ * have been freed prior to the sync task being processed. This
+ * happens when a vdev is detached as we call spa_config_vdev_exit(),
+ * stop the trimming thread, schedule the sync task, and free
+ * the vdev. Later when the scheduled sync task is invoked, it would
+ * find that the vdev has been freed.
+ */
+ uint64_t guid = *(uint64_t *)arg;
+ uint64_t txg = dmu_tx_get_txg(tx);
+ kmem_free(arg, sizeof (uint64_t));
+
+ vdev_t *vd = spa_lookup_by_guid(tx->tx_pool->dp_spa, guid, B_FALSE);
+ if (vd == NULL || vd->vdev_top->vdev_removing || !vdev_is_concrete(vd))
+ return;
+
+ uint64_t last_offset = vd->vdev_trim_offset[txg & TXG_MASK];
+ vd->vdev_trim_offset[txg & TXG_MASK] = 0;
+
+ VERIFY3U(vd->vdev_leaf_zap, !=, 0);
+
+ objset_t *mos = vd->vdev_spa->spa_meta_objset;
+
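+	/*
+	 * A last offset of UINT64_MAX is the sentinel stored when a new TRIM
+	 * is activated; treat it as a request to reset the offset to zero.
+	 */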
+ if (last_offset > 0 || vd->vdev_trim_last_offset == UINT64_MAX) {
+
+ if (vd->vdev_trim_last_offset == UINT64_MAX)
+ last_offset = 0;
+
+ vd->vdev_trim_last_offset = last_offset;
+ VERIFY0(zap_update(mos, vd->vdev_leaf_zap,
+ VDEV_LEAF_ZAP_TRIM_LAST_OFFSET,
+ sizeof (last_offset), 1, &last_offset, tx));
+ }
+
+ if (vd->vdev_trim_action_time > 0) {
+ uint64_t val = (uint64_t)vd->vdev_trim_action_time;
+ VERIFY0(zap_update(mos, vd->vdev_leaf_zap,
+ VDEV_LEAF_ZAP_TRIM_ACTION_TIME, sizeof (val),
+ 1, &val, tx));
+ }
+
+ if (vd->vdev_trim_rate > 0) {
+ uint64_t rate = (uint64_t)vd->vdev_trim_rate;
+
+ if (rate == UINT64_MAX)
+ rate = 0;
+
+ VERIFY0(zap_update(mos, vd->vdev_leaf_zap,
+ VDEV_LEAF_ZAP_TRIM_RATE, sizeof (rate), 1, &rate, tx));
+ }
+
+ uint64_t partial = vd->vdev_trim_partial;
+ if (partial == UINT64_MAX)
+ partial = 0;
+
+ VERIFY0(zap_update(mos, vd->vdev_leaf_zap, VDEV_LEAF_ZAP_TRIM_PARTIAL,
+ sizeof (partial), 1, &partial, tx));
+
+ uint64_t secure = vd->vdev_trim_secure;
+ if (secure == UINT64_MAX)
+ secure = 0;
+
+ VERIFY0(zap_update(mos, vd->vdev_leaf_zap, VDEV_LEAF_ZAP_TRIM_SECURE,
+ sizeof (secure), 1, &secure, tx));
+
+
+ uint64_t trim_state = vd->vdev_trim_state;
+ VERIFY0(zap_update(mos, vd->vdev_leaf_zap, VDEV_LEAF_ZAP_TRIM_STATE,
+ sizeof (trim_state), 1, &trim_state, tx));
+}
+
+/*
+ * Update the on-disk state of a manual TRIM. This is called to request
+ * that a TRIM be started/suspended/canceled, or to change one of the
+ * TRIM options (partial, secure, rate).
+ */
+static void
+vdev_trim_change_state(vdev_t *vd, vdev_trim_state_t new_state,
+ uint64_t rate, boolean_t partial, boolean_t secure)
+{
+ ASSERT(MUTEX_HELD(&vd->vdev_trim_lock));
+ spa_t *spa = vd->vdev_spa;
+
+ if (new_state == vd->vdev_trim_state)
+ return;
+
+ /*
+ * Copy the vd's guid, this will be freed by the sync task.
+ */
+ uint64_t *guid = kmem_zalloc(sizeof (uint64_t), KM_SLEEP);
+ *guid = vd->vdev_guid;
+
+ /*
+ * If we're suspending, then preserve the original start time.
+ */
+ if (vd->vdev_trim_state != VDEV_TRIM_SUSPENDED) {
+ vd->vdev_trim_action_time = gethrestime_sec();
+ }
+
+ /*
+ * If we're activating, then preserve the requested rate and trim
+ * method. Setting the last offset and rate to UINT64_MAX is used
+ * as a sentinel to indicate they should be reset to default values.
+ */
+ if (new_state == VDEV_TRIM_ACTIVE) {
+ if (vd->vdev_trim_state == VDEV_TRIM_COMPLETE ||
+ vd->vdev_trim_state == VDEV_TRIM_CANCELED) {
+ vd->vdev_trim_last_offset = UINT64_MAX;
+ vd->vdev_trim_rate = UINT64_MAX;
+ vd->vdev_trim_partial = UINT64_MAX;
+ vd->vdev_trim_secure = UINT64_MAX;
+ }
+
+ if (rate != 0)
+ vd->vdev_trim_rate = rate;
+
+ if (partial != 0)
+ vd->vdev_trim_partial = partial;
+
+ if (secure != 0)
+ vd->vdev_trim_secure = secure;
+ }
+
+ vdev_trim_state_t old_state = vd->vdev_trim_state;
+ boolean_t resumed = (old_state == VDEV_TRIM_SUSPENDED);
+ vd->vdev_trim_state = new_state;
+
+ dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
+ VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
+ dsl_sync_task_nowait(spa_get_dsl(spa), vdev_trim_zap_update_sync,
+ guid, tx);
+
+ switch (new_state) {
+ case VDEV_TRIM_ACTIVE:
+ spa_event_notify(spa, vd, NULL,
+ resumed ? ESC_ZFS_TRIM_RESUME : ESC_ZFS_TRIM_START);
+ spa_history_log_internal(spa, "trim", tx,
+ "vdev=%s activated", vd->vdev_path);
+ break;
+ case VDEV_TRIM_SUSPENDED:
+ spa_event_notify(spa, vd, NULL, ESC_ZFS_TRIM_SUSPEND);
+ spa_history_log_internal(spa, "trim", tx,
+ "vdev=%s suspended", vd->vdev_path);
+ break;
+ case VDEV_TRIM_CANCELED:
+ if (old_state == VDEV_TRIM_ACTIVE ||
+ old_state == VDEV_TRIM_SUSPENDED) {
+ spa_event_notify(spa, vd, NULL, ESC_ZFS_TRIM_CANCEL);
+ spa_history_log_internal(spa, "trim", tx,
+ "vdev=%s canceled", vd->vdev_path);
+ }
+ break;
+ case VDEV_TRIM_COMPLETE:
+ spa_event_notify(spa, vd, NULL, ESC_ZFS_TRIM_FINISH);
+ spa_history_log_internal(spa, "trim", tx,
+ "vdev=%s complete", vd->vdev_path);
+ break;
+ default:
+ panic("invalid state %llu", (unsigned long long)new_state);
+ }
+
+ dmu_tx_commit(tx);
+
+ if (new_state != VDEV_TRIM_ACTIVE)
+ spa_notify_waiters(spa);
+}
+
+/*
+ * The zio_done_func_t done callback for each manual TRIM issued. It is
+ * responsible for updating the TRIM stats, reissuing failed TRIM I/Os,
+ * and limiting the number of in flight TRIM I/Os.
+ */
+static void
+vdev_trim_cb(zio_t *zio)
+{
+ vdev_t *vd = zio->io_vd;
+
+ mutex_enter(&vd->vdev_trim_io_lock);
+ if (zio->io_error == ENXIO && !vdev_writeable(vd)) {
+ /*
+ * The I/O failed because the vdev was unavailable; roll the
+ * last offset back. (This works because spa_sync waits on
+ * spa_txg_zio before it runs sync tasks.)
+ */
+ uint64_t *offset =
+ &vd->vdev_trim_offset[zio->io_txg & TXG_MASK];
+ *offset = MIN(*offset, zio->io_offset);
+ } else {
+ if (zio->io_error != 0) {
+ vd->vdev_stat.vs_trim_errors++;
+ spa_iostats_trim_add(vd->vdev_spa, TRIM_TYPE_MANUAL,
+ 0, 0, 0, 0, 1, zio->io_orig_size);
+ } else {
+ spa_iostats_trim_add(vd->vdev_spa, TRIM_TYPE_MANUAL,
+ 1, zio->io_orig_size, 0, 0, 0, 0);
+ }
+
+ vd->vdev_trim_bytes_done += zio->io_orig_size;
+ }
+
+ ASSERT3U(vd->vdev_trim_inflight[TRIM_TYPE_MANUAL], >, 0);
+ vd->vdev_trim_inflight[TRIM_TYPE_MANUAL]--;
+ cv_broadcast(&vd->vdev_trim_io_cv);
+ mutex_exit(&vd->vdev_trim_io_lock);
+
+ spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd);
+}
+
+/*
+ * The zio_done_func_t done callback for each automatic TRIM issued. It
+ * is responsible for updating the TRIM stats and limiting the number of
+ * in flight TRIM I/Os. Automatic TRIM I/Os are best effort and are
+ * never reissued on failure.
+ */
+static void
+vdev_autotrim_cb(zio_t *zio)
+{
+ vdev_t *vd = zio->io_vd;
+
+ mutex_enter(&vd->vdev_trim_io_lock);
+
+ if (zio->io_error != 0) {
+ vd->vdev_stat.vs_trim_errors++;
+ spa_iostats_trim_add(vd->vdev_spa, TRIM_TYPE_AUTO,
+ 0, 0, 0, 0, 1, zio->io_orig_size);
+ } else {
+ spa_iostats_trim_add(vd->vdev_spa, TRIM_TYPE_AUTO,
+ 1, zio->io_orig_size, 0, 0, 0, 0);
+ }
+
+ ASSERT3U(vd->vdev_trim_inflight[TRIM_TYPE_AUTO], >, 0);
+ vd->vdev_trim_inflight[TRIM_TYPE_AUTO]--;
+ cv_broadcast(&vd->vdev_trim_io_cv);
+ mutex_exit(&vd->vdev_trim_io_lock);
+
+ spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd);
+}
+
+/*
+ * The zio_done_func_t done callback for each TRIM issued via
+ * vdev_trim_simple(). It is responsible for updating the TRIM stats and
+ * limiting the number of in flight TRIM I/Os. Simple TRIM I/Os are best
+ * effort and are never reissued on failure.
+ */
+static void
+vdev_trim_simple_cb(zio_t *zio)
+{
+ vdev_t *vd = zio->io_vd;
+
+ mutex_enter(&vd->vdev_trim_io_lock);
+
+ if (zio->io_error != 0) {
+ vd->vdev_stat.vs_trim_errors++;
+ spa_iostats_trim_add(vd->vdev_spa, TRIM_TYPE_SIMPLE,
+ 0, 0, 0, 0, 1, zio->io_orig_size);
+ } else {
+ spa_iostats_trim_add(vd->vdev_spa, TRIM_TYPE_SIMPLE,
+ 1, zio->io_orig_size, 0, 0, 0, 0);
+ }
+
+ ASSERT3U(vd->vdev_trim_inflight[TRIM_TYPE_SIMPLE], >, 0);
+ vd->vdev_trim_inflight[TRIM_TYPE_SIMPLE]--;
+ cv_broadcast(&vd->vdev_trim_io_cv);
+ mutex_exit(&vd->vdev_trim_io_lock);
+
+ spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd);
+}
+
+/*
+ * Returns the average trim rate in bytes/sec for the ta->trim_vdev.
+ */
+static uint64_t
+vdev_trim_calculate_rate(trim_args_t *ta)
+{
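+	/*
+	 * bytes * 1000 / elapsed milliseconds yields bytes per second; the
+	 * +1 guards against dividing by zero immediately after the start.
+	 */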
+ return (ta->trim_bytes_done * 1000 /
+ (NSEC2MSEC(gethrtime() - ta->trim_start_time) + 1));
+}
+
+/*
+ * Issues a physical TRIM and takes care of rate limiting (bytes/sec)
+ * and number of concurrent TRIM I/Os.
+ */
+static int
+vdev_trim_range(trim_args_t *ta, uint64_t start, uint64_t size)
+{
+ vdev_t *vd = ta->trim_vdev;
+ spa_t *spa = vd->vdev_spa;
+ void *cb;
+
+ mutex_enter(&vd->vdev_trim_io_lock);
+
+ /*
+ * Limit manual TRIM I/Os to the requested rate. This does not
+ * apply to automatic TRIM since no per vdev rate can be specified.
+ */
+ if (ta->trim_type == TRIM_TYPE_MANUAL) {
+ while (vd->vdev_trim_rate != 0 && !vdev_trim_should_stop(vd) &&
+ vdev_trim_calculate_rate(ta) > vd->vdev_trim_rate) {
+ cv_timedwait_idle(&vd->vdev_trim_io_cv,
+ &vd->vdev_trim_io_lock, ddi_get_lbolt() +
+ MSEC_TO_TICK(10));
+ }
+ }
+ ta->trim_bytes_done += size;
+
+ /* Limit in flight trimming I/Os */
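+	/* (one in-flight counter per TRIM type: manual, automatic, simple) */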
+ while (vd->vdev_trim_inflight[0] + vd->vdev_trim_inflight[1] +
+ vd->vdev_trim_inflight[2] >= zfs_trim_queue_limit) {
+ cv_wait(&vd->vdev_trim_io_cv, &vd->vdev_trim_io_lock);
+ }
+ vd->vdev_trim_inflight[ta->trim_type]++;
+ mutex_exit(&vd->vdev_trim_io_lock);
+
+ dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
+ VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
+ uint64_t txg = dmu_tx_get_txg(tx);
+
+ spa_config_enter(spa, SCL_STATE_ALL, vd, RW_READER);
+ mutex_enter(&vd->vdev_trim_lock);
+
+ if (ta->trim_type == TRIM_TYPE_MANUAL &&
+ vd->vdev_trim_offset[txg & TXG_MASK] == 0) {
+ uint64_t *guid = kmem_zalloc(sizeof (uint64_t), KM_SLEEP);
+ *guid = vd->vdev_guid;
+
+ /* This is the first write of this txg. */
+ dsl_sync_task_nowait(spa_get_dsl(spa),
+ vdev_trim_zap_update_sync, guid, tx);
+ }
+
+ /*
+ * We know the vdev_t will still be around since all consumers of
+ * vdev_free must stop the trimming first.
+ */
+ if ((ta->trim_type == TRIM_TYPE_MANUAL &&
+ vdev_trim_should_stop(vd)) ||
+ (ta->trim_type == TRIM_TYPE_AUTO &&
+ vdev_autotrim_should_stop(vd->vdev_top))) {
+ mutex_enter(&vd->vdev_trim_io_lock);
+ vd->vdev_trim_inflight[ta->trim_type]--;
+ mutex_exit(&vd->vdev_trim_io_lock);
+ spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd);
+ mutex_exit(&vd->vdev_trim_lock);
+ dmu_tx_commit(tx);
+ return (SET_ERROR(EINTR));
+ }
+ mutex_exit(&vd->vdev_trim_lock);
+
+ if (ta->trim_type == TRIM_TYPE_MANUAL)
+ vd->vdev_trim_offset[txg & TXG_MASK] = start + size;
+
+ if (ta->trim_type == TRIM_TYPE_MANUAL) {
+ cb = vdev_trim_cb;
+ } else if (ta->trim_type == TRIM_TYPE_AUTO) {
+ cb = vdev_autotrim_cb;
+ } else {
+ cb = vdev_trim_simple_cb;
+ }
+
+ zio_nowait(zio_trim(spa->spa_txg_zio[txg & TXG_MASK], vd,
+ start, size, cb, NULL, ZIO_PRIORITY_TRIM, ZIO_FLAG_CANFAIL,
+ ta->trim_flags));
+ /* vdev_trim_cb and vdev_autotrim_cb release SCL_STATE_ALL */
+
+ dmu_tx_commit(tx);
+
+ return (0);
+}
+
+/*
+ * Issues TRIM I/Os for all ranges in the provided ta->trim_tree range tree.
+ * Additional parameters describing how the TRIM should be performed must
+ * be set in the trim_args structure. See the trim_args definition for
+ * additional information.
+ */
+static int
+vdev_trim_ranges(trim_args_t *ta)
+{
+ vdev_t *vd = ta->trim_vdev;
+ zfs_btree_t *t = &ta->trim_tree->rt_root;
+ zfs_btree_index_t idx;
+ uint64_t extent_bytes_max = ta->trim_extent_bytes_max;
+ uint64_t extent_bytes_min = ta->trim_extent_bytes_min;
+ spa_t *spa = vd->vdev_spa;
+
+ ta->trim_start_time = gethrtime();
+ ta->trim_bytes_done = 0;
+
+ for (range_seg_t *rs = zfs_btree_first(t, &idx); rs != NULL;
+ rs = zfs_btree_next(t, &idx, &idx)) {
+ uint64_t size = rs_get_end(rs, ta->trim_tree) - rs_get_start(rs,
+ ta->trim_tree);
+
+ if (extent_bytes_min && size < extent_bytes_min) {
+ spa_iostats_trim_add(spa, ta->trim_type,
+ 0, 0, 1, size, 0, 0);
+ continue;
+ }
+
+ /* Split range into legally-sized physical chunks */
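+		/* (i.e. writes_required = ceil(size / extent_bytes_max)) */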
+ uint64_t writes_required = ((size - 1) / extent_bytes_max) + 1;
+
+ for (uint64_t w = 0; w < writes_required; w++) {
+ int error;
+
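+			/*
+			 * Range tree offsets exclude the leading vdev labels;
+			 * add VDEV_LABEL_START_SIZE to form the physical
+			 * device offset.
+			 */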
+ error = vdev_trim_range(ta, VDEV_LABEL_START_SIZE +
+ rs_get_start(rs, ta->trim_tree) +
+			    (w * extent_bytes_max), MIN(size -
+ (w * extent_bytes_max), extent_bytes_max));
+ if (error != 0) {
+ return (error);
+ }
+ }
+ }
+
+ return (0);
+}
+
+static void
+vdev_trim_xlate_last_rs_end(void *arg, range_seg64_t *physical_rs)
+{
+ uint64_t *last_rs_end = (uint64_t *)arg;
+
+ if (physical_rs->rs_end > *last_rs_end)
+ *last_rs_end = physical_rs->rs_end;
+}
+
+static void
+vdev_trim_xlate_progress(void *arg, range_seg64_t *physical_rs)
+{
+ vdev_t *vd = (vdev_t *)arg;
+
+ uint64_t size = physical_rs->rs_end - physical_rs->rs_start;
+ vd->vdev_trim_bytes_est += size;
+
+ if (vd->vdev_trim_last_offset >= physical_rs->rs_end) {
+ vd->vdev_trim_bytes_done += size;
+ } else if (vd->vdev_trim_last_offset > physical_rs->rs_start &&
+ vd->vdev_trim_last_offset <= physical_rs->rs_end) {
+ vd->vdev_trim_bytes_done +=
+ vd->vdev_trim_last_offset - physical_rs->rs_start;
+ }
+}
+
+/*
+ * Calculates the completion percentage of a manual TRIM.
+ */
+static void
+vdev_trim_calculate_progress(vdev_t *vd)
+{
+ ASSERT(spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_READER) ||
+ spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_WRITER));
+ ASSERT(vd->vdev_leaf_zap != 0);
+
+ vd->vdev_trim_bytes_est = 0;
+ vd->vdev_trim_bytes_done = 0;
+
+ for (uint64_t i = 0; i < vd->vdev_top->vdev_ms_count; i++) {
+ metaslab_t *msp = vd->vdev_top->vdev_ms[i];
+ mutex_enter(&msp->ms_lock);
+
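+		/*
+		 * Estimate this leaf's share of the metaslab's free space by
+		 * dividing it evenly across the top-level vdev's disks.
+		 */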
+ uint64_t ms_free = (msp->ms_size -
+ metaslab_allocated_space(msp)) /
+ vdev_get_ndisks(vd->vdev_top);
+
+ /*
+ * Convert the metaslab range to a physical range
+ * on our vdev. We use this to determine if we are
+ * in the middle of this metaslab range.
+ */
+ range_seg64_t logical_rs, physical_rs, remain_rs;
+ logical_rs.rs_start = msp->ms_start;
+ logical_rs.rs_end = msp->ms_start + msp->ms_size;
+
+ /* Metaslab space after this offset has not been trimmed. */
+ vdev_xlate(vd, &logical_rs, &physical_rs, &remain_rs);
+ if (vd->vdev_trim_last_offset <= physical_rs.rs_start) {
+ vd->vdev_trim_bytes_est += ms_free;
+ mutex_exit(&msp->ms_lock);
+ continue;
+ }
+
+ /* Metaslab space before this offset has been trimmed */
+ uint64_t last_rs_end = physical_rs.rs_end;
+ if (!vdev_xlate_is_empty(&remain_rs)) {
+ vdev_xlate_walk(vd, &remain_rs,
+ vdev_trim_xlate_last_rs_end, &last_rs_end);
+ }
+
+ if (vd->vdev_trim_last_offset > last_rs_end) {
+ vd->vdev_trim_bytes_done += ms_free;
+ vd->vdev_trim_bytes_est += ms_free;
+ mutex_exit(&msp->ms_lock);
+ continue;
+ }
+
+ /*
+ * If we get here, we're in the middle of trimming this
+ * metaslab. Load it and walk the free tree for more
+ * accurate progress estimation.
+ */
+ VERIFY0(metaslab_load(msp));
+
+ range_tree_t *rt = msp->ms_allocatable;
+ zfs_btree_t *bt = &rt->rt_root;
+ zfs_btree_index_t idx;
+ for (range_seg_t *rs = zfs_btree_first(bt, &idx);
+ rs != NULL; rs = zfs_btree_next(bt, &idx, &idx)) {
+ logical_rs.rs_start = rs_get_start(rs, rt);
+ logical_rs.rs_end = rs_get_end(rs, rt);
+
+ vdev_xlate_walk(vd, &logical_rs,
+ vdev_trim_xlate_progress, vd);
+ }
+ mutex_exit(&msp->ms_lock);
+ }
+}
+
+/*
+ * Load from disk the vdev's manual TRIM information. This includes the
+ * state, progress, and options provided when initiating the manual TRIM.
+ */
+static int
+vdev_trim_load(vdev_t *vd)
+{
+ int err = 0;
+ ASSERT(spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_READER) ||
+ spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_WRITER));
+ ASSERT(vd->vdev_leaf_zap != 0);
+
+ if (vd->vdev_trim_state == VDEV_TRIM_ACTIVE ||
+ vd->vdev_trim_state == VDEV_TRIM_SUSPENDED) {
+ err = zap_lookup(vd->vdev_spa->spa_meta_objset,
+ vd->vdev_leaf_zap, VDEV_LEAF_ZAP_TRIM_LAST_OFFSET,
+ sizeof (vd->vdev_trim_last_offset), 1,
+ &vd->vdev_trim_last_offset);
+ if (err == ENOENT) {
+ vd->vdev_trim_last_offset = 0;
+ err = 0;
+ }
+
+ if (err == 0) {
+ err = zap_lookup(vd->vdev_spa->spa_meta_objset,
+ vd->vdev_leaf_zap, VDEV_LEAF_ZAP_TRIM_RATE,
+ sizeof (vd->vdev_trim_rate), 1,
+ &vd->vdev_trim_rate);
+ if (err == ENOENT) {
+ vd->vdev_trim_rate = 0;
+ err = 0;
+ }
+ }
+
+ if (err == 0) {
+ err = zap_lookup(vd->vdev_spa->spa_meta_objset,
+ vd->vdev_leaf_zap, VDEV_LEAF_ZAP_TRIM_PARTIAL,
+ sizeof (vd->vdev_trim_partial), 1,
+ &vd->vdev_trim_partial);
+ if (err == ENOENT) {
+ vd->vdev_trim_partial = 0;
+ err = 0;
+ }
+ }
+
+ if (err == 0) {
+ err = zap_lookup(vd->vdev_spa->spa_meta_objset,
+ vd->vdev_leaf_zap, VDEV_LEAF_ZAP_TRIM_SECURE,
+ sizeof (vd->vdev_trim_secure), 1,
+ &vd->vdev_trim_secure);
+ if (err == ENOENT) {
+ vd->vdev_trim_secure = 0;
+ err = 0;
+ }
+ }
+ }
+
+ vdev_trim_calculate_progress(vd);
+
+ return (err);
+}
+
+static void
+vdev_trim_xlate_range_add(void *arg, range_seg64_t *physical_rs)
+{
+ trim_args_t *ta = arg;
+ vdev_t *vd = ta->trim_vdev;
+
+ /*
+ * Only a manual trim will be traversing the vdev sequentially.
+ * For an auto trim all valid ranges should be added.
+ */
+ if (ta->trim_type == TRIM_TYPE_MANUAL) {
+
+ /* Only add segments that we have not visited yet */
+ if (physical_rs->rs_end <= vd->vdev_trim_last_offset)
+ return;
+
+ /* Pick up where we left off mid-range. */
+ if (vd->vdev_trim_last_offset > physical_rs->rs_start) {
+ ASSERT3U(physical_rs->rs_end, >,
+ vd->vdev_trim_last_offset);
+ physical_rs->rs_start = vd->vdev_trim_last_offset;
+ }
+ }
+
+ ASSERT3U(physical_rs->rs_end, >, physical_rs->rs_start);
+
+ range_tree_add(ta->trim_tree, physical_rs->rs_start,
+ physical_rs->rs_end - physical_rs->rs_start);
+}
+
+/*
+ * Convert the logical range into physical ranges and add them to the
+ * range tree passed in the trim_args_t.
+ */
+static void
+vdev_trim_range_add(void *arg, uint64_t start, uint64_t size)
+{
+ trim_args_t *ta = arg;
+ vdev_t *vd = ta->trim_vdev;
+ range_seg64_t logical_rs;
+ logical_rs.rs_start = start;
+ logical_rs.rs_end = start + size;
+
+ /*
+ * Every range to be trimmed must be part of ms_allocatable.
+ * When ZFS_DEBUG_TRIM is set load the metaslab to verify this
+ * is always the case.
+ */
+ if (zfs_flags & ZFS_DEBUG_TRIM) {
+ metaslab_t *msp = ta->trim_msp;
+ VERIFY0(metaslab_load(msp));
+ VERIFY3B(msp->ms_loaded, ==, B_TRUE);
+ VERIFY(range_tree_contains(msp->ms_allocatable, start, size));
+ }
+
+ ASSERT(vd->vdev_ops->vdev_op_leaf);
+ vdev_xlate_walk(vd, &logical_rs, vdev_trim_xlate_range_add, arg);
+}
+
+/*
+ * Each manual TRIM thread is responsible for trimming the unallocated
+ * space for each leaf vdev. This is accomplished by sequentially iterating
+ * over its top-level metaslabs and issuing TRIM I/O for the space described
+ * by its ms_allocatable. While a metaslab is undergoing trimming it is
+ * not eligible for new allocations.
+ */
+static void
+vdev_trim_thread(void *arg)
+{
+ vdev_t *vd = arg;
+ spa_t *spa = vd->vdev_spa;
+ trim_args_t ta;
+ int error = 0;
+
+ /*
+ * The VDEV_LEAF_ZAP_TRIM_* entries may have been updated by
+ * vdev_trim(). Wait for the updated values to be reflected
+ * in the zap in order to start with the requested settings.
+ */
+ txg_wait_synced(spa_get_dsl(vd->vdev_spa), 0);
+
+ ASSERT(vdev_is_concrete(vd));
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+
+ vd->vdev_trim_last_offset = 0;
+ vd->vdev_trim_rate = 0;
+ vd->vdev_trim_partial = 0;
+ vd->vdev_trim_secure = 0;
+
+ VERIFY0(vdev_trim_load(vd));
+
+ ta.trim_vdev = vd;
+ ta.trim_extent_bytes_max = zfs_trim_extent_bytes_max;
+ ta.trim_extent_bytes_min = zfs_trim_extent_bytes_min;
+ ta.trim_tree = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0);
+ ta.trim_type = TRIM_TYPE_MANUAL;
+ ta.trim_flags = 0;
+
+ /*
+ * When a secure TRIM has been requested infer that the intent
+ * is that everything must be trimmed. Override the default
+ * minimum TRIM size to prevent ranges from being skipped.
+ */
+ if (vd->vdev_trim_secure) {
+ ta.trim_flags |= ZIO_TRIM_SECURE;
+ ta.trim_extent_bytes_min = SPA_MINBLOCKSIZE;
+ }
+
+ uint64_t ms_count = 0;
+ for (uint64_t i = 0; !vd->vdev_detached &&
+ i < vd->vdev_top->vdev_ms_count; i++) {
+ metaslab_t *msp = vd->vdev_top->vdev_ms[i];
+
+ /*
+ * If we've expanded the top-level vdev or it's our
+ * first pass, calculate our progress.
+ */
+ if (vd->vdev_top->vdev_ms_count != ms_count) {
+ vdev_trim_calculate_progress(vd);
+ ms_count = vd->vdev_top->vdev_ms_count;
+ }
+
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+ metaslab_disable(msp);
+ mutex_enter(&msp->ms_lock);
+ VERIFY0(metaslab_load(msp));
+
+ /*
+ * If a partial TRIM was requested skip metaslabs which have
+ * never been initialized and thus have never been written.
+ */
+ if (msp->ms_sm == NULL && vd->vdev_trim_partial) {
+ mutex_exit(&msp->ms_lock);
+ metaslab_enable(msp, B_FALSE, B_FALSE);
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+ vdev_trim_calculate_progress(vd);
+ continue;
+ }
+
+ ta.trim_msp = msp;
+ range_tree_walk(msp->ms_allocatable, vdev_trim_range_add, &ta);
+ range_tree_vacate(msp->ms_trim, NULL, NULL);
+ mutex_exit(&msp->ms_lock);
+
+ error = vdev_trim_ranges(&ta);
+ metaslab_enable(msp, B_TRUE, B_FALSE);
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+
+ range_tree_vacate(ta.trim_tree, NULL, NULL);
+ if (error != 0)
+ break;
+ }
+
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+ mutex_enter(&vd->vdev_trim_io_lock);
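+	/* Wait for any remaining manual TRIM I/Os to complete. */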
+ while (vd->vdev_trim_inflight[0] > 0) {
+ cv_wait(&vd->vdev_trim_io_cv, &vd->vdev_trim_io_lock);
+ }
+ mutex_exit(&vd->vdev_trim_io_lock);
+
+ range_tree_destroy(ta.trim_tree);
+
+ mutex_enter(&vd->vdev_trim_lock);
+ if (!vd->vdev_trim_exit_wanted && vdev_writeable(vd)) {
+ vdev_trim_change_state(vd, VDEV_TRIM_COMPLETE,
+ vd->vdev_trim_rate, vd->vdev_trim_partial,
+ vd->vdev_trim_secure);
+ }
+ ASSERT(vd->vdev_trim_thread != NULL || vd->vdev_trim_inflight[0] == 0);
+
+ /*
+ * Drop the vdev_trim_lock while we sync out the txg since it's
+ * possible that a device might be trying to come online and must
+ * check to see if it needs to restart a trim. That thread will be
+ * holding the spa_config_lock which would prevent the txg_wait_synced
+ * from completing.
+ */
+ mutex_exit(&vd->vdev_trim_lock);
+ txg_wait_synced(spa_get_dsl(spa), 0);
+ mutex_enter(&vd->vdev_trim_lock);
+
+ vd->vdev_trim_thread = NULL;
+ cv_broadcast(&vd->vdev_trim_cv);
+ mutex_exit(&vd->vdev_trim_lock);
+
+ thread_exit();
+}
+
+/*
+ * Initiates a manual TRIM for the vdev_t. Callers must hold vdev_trim_lock,
+ * the vdev_t must be a leaf and cannot already be manually trimming.
+ */
+void
+vdev_trim(vdev_t *vd, uint64_t rate, boolean_t partial, boolean_t secure)
+{
+ ASSERT(MUTEX_HELD(&vd->vdev_trim_lock));
+ ASSERT(vd->vdev_ops->vdev_op_leaf);
+ ASSERT(vdev_is_concrete(vd));
+ ASSERT3P(vd->vdev_trim_thread, ==, NULL);
+ ASSERT(!vd->vdev_detached);
+ ASSERT(!vd->vdev_trim_exit_wanted);
+ ASSERT(!vd->vdev_top->vdev_removing);
+
+ vdev_trim_change_state(vd, VDEV_TRIM_ACTIVE, rate, partial, secure);
+ vd->vdev_trim_thread = thread_create(NULL, 0,
+ vdev_trim_thread, vd, 0, &p0, TS_RUN, maxclsyspri);
+}
+
+/*
+ * Wait for the trimming thread to be terminated (canceled or stopped).
+ */
+static void
+vdev_trim_stop_wait_impl(vdev_t *vd)
+{
+ ASSERT(MUTEX_HELD(&vd->vdev_trim_lock));
+
+ while (vd->vdev_trim_thread != NULL)
+ cv_wait(&vd->vdev_trim_cv, &vd->vdev_trim_lock);
+
+ ASSERT3P(vd->vdev_trim_thread, ==, NULL);
+ vd->vdev_trim_exit_wanted = B_FALSE;
+}
+
+/*
+ * Wait for vdev trim threads which were listed to cleanly exit.
+ */
+void
+vdev_trim_stop_wait(spa_t *spa, list_t *vd_list)
+{
+ vdev_t *vd;
+
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+ while ((vd = list_remove_head(vd_list)) != NULL) {
+ mutex_enter(&vd->vdev_trim_lock);
+ vdev_trim_stop_wait_impl(vd);
+ mutex_exit(&vd->vdev_trim_lock);
+ }
+}
+
+/*
+ * Stop trimming a device, with the resultant trimming state being tgt_state.
+ * For blocking behavior pass NULL for vd_list. Otherwise, when a list_t is
+ * provided, the stopping vdev is inserted into the list. Callers are then
+ * required to call vdev_trim_stop_wait() to block for all the trim threads
+ * to exit. The caller must hold vdev_trim_lock and must not be writing to
+ * the spa config, as the trimming thread may try to enter the config as a
+ * reader before exiting.
+ */
+void
+vdev_trim_stop(vdev_t *vd, vdev_trim_state_t tgt_state, list_t *vd_list)
+{
+ ASSERT(!spa_config_held(vd->vdev_spa, SCL_CONFIG|SCL_STATE, RW_WRITER));
+ ASSERT(MUTEX_HELD(&vd->vdev_trim_lock));
+ ASSERT(vd->vdev_ops->vdev_op_leaf);
+ ASSERT(vdev_is_concrete(vd));
+
+ /*
+ * Allow cancel requests to proceed even if the trim thread has
+ * stopped.
+ */
+ if (vd->vdev_trim_thread == NULL && tgt_state != VDEV_TRIM_CANCELED)
+ return;
+
+ vdev_trim_change_state(vd, tgt_state, 0, 0, 0);
+ vd->vdev_trim_exit_wanted = B_TRUE;
+
+ if (vd_list == NULL) {
+ vdev_trim_stop_wait_impl(vd);
+ } else {
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+ list_insert_tail(vd_list, vd);
+ }
+}
+
+/*
+ * Requests that all listed vdevs stop trimming.
+ */
+static void
+vdev_trim_stop_all_impl(vdev_t *vd, vdev_trim_state_t tgt_state,
+ list_t *vd_list)
+{
+ if (vd->vdev_ops->vdev_op_leaf && vdev_is_concrete(vd)) {
+ mutex_enter(&vd->vdev_trim_lock);
+ vdev_trim_stop(vd, tgt_state, vd_list);
+ mutex_exit(&vd->vdev_trim_lock);
+ return;
+ }
+
+ for (uint64_t i = 0; i < vd->vdev_children; i++) {
+ vdev_trim_stop_all_impl(vd->vdev_child[i], tgt_state,
+ vd_list);
+ }
+}
+
+/*
+ * Convenience function to stop trimming of a vdev tree and set all trim
+ * thread pointers to NULL.
+ */
+void
+vdev_trim_stop_all(vdev_t *vd, vdev_trim_state_t tgt_state)
+{
+ spa_t *spa = vd->vdev_spa;
+ list_t vd_list;
+ vdev_t *vd_l2cache;
+
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+ list_create(&vd_list, sizeof (vdev_t),
+ offsetof(vdev_t, vdev_trim_node));
+
+ vdev_trim_stop_all_impl(vd, tgt_state, &vd_list);
+
+ /*
+ * Iterate over cache devices and request stop trimming the
+ * whole device in case we export the pool or remove the cache
+ * device prematurely.
+ */
+ for (int i = 0; i < spa->spa_l2cache.sav_count; i++) {
+ vd_l2cache = spa->spa_l2cache.sav_vdevs[i];
+ vdev_trim_stop_all_impl(vd_l2cache, tgt_state, &vd_list);
+ }
+
+ vdev_trim_stop_wait(spa, &vd_list);
+
+ if (vd->vdev_spa->spa_sync_on) {
+ /* Make sure that our state has been synced to disk */
+ txg_wait_synced(spa_get_dsl(vd->vdev_spa), 0);
+ }
+
+ list_destroy(&vd_list);
+}
+
+/*
+ * Conditionally restarts a manual TRIM given its on-disk state.
+ */
+void
+vdev_trim_restart(vdev_t *vd)
+{
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+ ASSERT(!spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER));
+
+ if (vd->vdev_leaf_zap != 0) {
+ mutex_enter(&vd->vdev_trim_lock);
+ uint64_t trim_state = VDEV_TRIM_NONE;
+ int err = zap_lookup(vd->vdev_spa->spa_meta_objset,
+ vd->vdev_leaf_zap, VDEV_LEAF_ZAP_TRIM_STATE,
+ sizeof (trim_state), 1, &trim_state);
+ ASSERT(err == 0 || err == ENOENT);
+ vd->vdev_trim_state = trim_state;
+
+ uint64_t timestamp = 0;
+ err = zap_lookup(vd->vdev_spa->spa_meta_objset,
+ vd->vdev_leaf_zap, VDEV_LEAF_ZAP_TRIM_ACTION_TIME,
+ sizeof (timestamp), 1, &timestamp);
+ ASSERT(err == 0 || err == ENOENT);
+ vd->vdev_trim_action_time = timestamp;
+
+ if (vd->vdev_trim_state == VDEV_TRIM_SUSPENDED ||
+ vd->vdev_offline) {
+ /* load progress for reporting, but don't resume */
+ VERIFY0(vdev_trim_load(vd));
+ } else if (vd->vdev_trim_state == VDEV_TRIM_ACTIVE &&
+ vdev_writeable(vd) && !vd->vdev_top->vdev_removing &&
+ vd->vdev_trim_thread == NULL) {
+ VERIFY0(vdev_trim_load(vd));
+ vdev_trim(vd, vd->vdev_trim_rate,
+ vd->vdev_trim_partial, vd->vdev_trim_secure);
+ }
+
+ mutex_exit(&vd->vdev_trim_lock);
+ }
+
+ for (uint64_t i = 0; i < vd->vdev_children; i++) {
+ vdev_trim_restart(vd->vdev_child[i]);
+ }
+}
+
+/*
+ * Used by the automatic TRIM when ZFS_DEBUG_TRIM is set to verify that
+ * every TRIM range is contained within ms_allocatable.
+ */
+static void
+vdev_trim_range_verify(void *arg, uint64_t start, uint64_t size)
+{
+ trim_args_t *ta = arg;
+ metaslab_t *msp = ta->trim_msp;
+
+ VERIFY3B(msp->ms_loaded, ==, B_TRUE);
+ VERIFY3U(msp->ms_disabled, >, 0);
+ VERIFY(range_tree_contains(msp->ms_allocatable, start, size));
+}
+
+/*
+ * Each automatic TRIM thread is responsible for managing the trimming of a
+ * top-level vdev in the pool. No automatic TRIM state is maintained on-disk.
+ *
+ * N.B. This behavior is different from a manual TRIM where a thread
+ * is created for each leaf vdev, instead of each top-level vdev.
+ */
+static void
+vdev_autotrim_thread(void *arg)
+{
+ vdev_t *vd = arg;
+ spa_t *spa = vd->vdev_spa;
+ int shift = 0;
+
+ mutex_enter(&vd->vdev_autotrim_lock);
+ ASSERT3P(vd->vdev_top, ==, vd);
+ ASSERT3P(vd->vdev_autotrim_thread, !=, NULL);
+ mutex_exit(&vd->vdev_autotrim_lock);
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+
+ uint64_t extent_bytes_max = zfs_trim_extent_bytes_max;
+ uint64_t extent_bytes_min = zfs_trim_extent_bytes_min;
+
+ while (!vdev_autotrim_should_stop(vd)) {
+ int txgs_per_trim = MAX(zfs_trim_txg_batch, 1);
+ boolean_t issued_trim = B_FALSE;
+
+ /*
+		 * All of the metaslabs are divided into groups of size
+ * num_metaslabs / zfs_trim_txg_batch. Each of these groups
+ * is composed of metaslabs which are spread evenly over the
+ * device.
+ *
+ * For example, when zfs_trim_txg_batch = 32 (default) then
+ * group 0 will contain metaslabs 0, 32, 64, ...;
+ * group 1 will contain metaslabs 1, 33, 65, ...;
+ * group 2 will contain metaslabs 2, 34, 66, ...; and so on.
+ *
+ * On each pass through the while() loop one of these groups
+ * is selected. This is accomplished by using a shift value
+ * to select the starting metaslab, then striding over the
+ * metaslabs using the zfs_trim_txg_batch size. This is
+ * done to accomplish two things.
+ *
+		 * 1) By dividing the metaslabs into groups and making sure
+		 *    that each group takes a minimum of one txg to process,
+		 *    zfs_trim_txg_batch controls the minimum number of
+		 *    txgs which must occur before a metaslab is revisited.
+ *
+ * 2) Selecting non-consecutive metaslabs distributes the
+ * TRIM commands for a group evenly over the entire device.
+ * This can be advantageous for certain types of devices.
+ */
+ for (uint64_t i = shift % txgs_per_trim; i < vd->vdev_ms_count;
+ i += txgs_per_trim) {
+ metaslab_t *msp = vd->vdev_ms[i];
+ range_tree_t *trim_tree;
+
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+ metaslab_disable(msp);
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+
+ mutex_enter(&msp->ms_lock);
+
+ /*
+ * Skip the metaslab when it has never been allocated
+ * or when there are no recent frees to trim.
+ */
+ if (msp->ms_sm == NULL ||
+ range_tree_is_empty(msp->ms_trim)) {
+ mutex_exit(&msp->ms_lock);
+ metaslab_enable(msp, B_FALSE, B_FALSE);
+ continue;
+ }
+
+ /*
+ * Skip the metaslab when it has already been disabled.
+ * This may happen when a manual TRIM or initialize
+ * operation is running concurrently. In the case
+ * of a manual TRIM, the ms_trim tree will have been
+ * vacated. Only ranges added after the manual TRIM
+ * disabled the metaslab will be included in the tree.
+ * These will be processed when the automatic TRIM
+ * next revisits this metaslab.
+ */
+ if (msp->ms_disabled > 1) {
+ mutex_exit(&msp->ms_lock);
+ metaslab_enable(msp, B_FALSE, B_FALSE);
+ continue;
+ }
+
+ /*
+ * Allocate an empty range tree which is swapped in
+ * for the existing ms_trim tree while it is processed.
+ */
+ trim_tree = range_tree_create(NULL, RANGE_SEG64, NULL,
+ 0, 0);
+ range_tree_swap(&msp->ms_trim, &trim_tree);
+ ASSERT(range_tree_is_empty(msp->ms_trim));
+
+ /*
+ * There are two cases when constructing the per-vdev
+ * trim trees for a metaslab. If the top-level vdev
+ * has no children then it is also a leaf and should
+ * be trimmed. Otherwise our children are the leaves
+ * and a trim tree should be constructed for each.
+ */
+ trim_args_t *tap;
+ uint64_t children = vd->vdev_children;
+ if (children == 0) {
+ children = 1;
+ tap = kmem_zalloc(sizeof (trim_args_t) *
+ children, KM_SLEEP);
+ tap[0].trim_vdev = vd;
+ } else {
+ tap = kmem_zalloc(sizeof (trim_args_t) *
+ children, KM_SLEEP);
+
+ for (uint64_t c = 0; c < children; c++) {
+ tap[c].trim_vdev = vd->vdev_child[c];
+ }
+ }
+
+ for (uint64_t c = 0; c < children; c++) {
+ trim_args_t *ta = &tap[c];
+ vdev_t *cvd = ta->trim_vdev;
+
+ ta->trim_msp = msp;
+ ta->trim_extent_bytes_max = extent_bytes_max;
+ ta->trim_extent_bytes_min = extent_bytes_min;
+ ta->trim_type = TRIM_TYPE_AUTO;
+ ta->trim_flags = 0;
+
+ if (cvd->vdev_detached ||
+ !vdev_writeable(cvd) ||
+ !cvd->vdev_has_trim ||
+ cvd->vdev_trim_thread != NULL) {
+ continue;
+ }
+
+ /*
+ * When a device has an attached hot spare, or
+ * is being replaced it will not be trimmed.
+ * This is done to avoid adding additional
+ * stress to a potentially unhealthy device,
+ * and to minimize the required rebuild time.
+ */
+ if (!cvd->vdev_ops->vdev_op_leaf)
+ continue;
+
+ ta->trim_tree = range_tree_create(NULL,
+ RANGE_SEG64, NULL, 0, 0);
+ range_tree_walk(trim_tree,
+ vdev_trim_range_add, ta);
+ }
+
+ mutex_exit(&msp->ms_lock);
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+
+ /*
+ * Issue the TRIM I/Os for all ranges covered by the
+ * TRIM trees. These ranges are safe to TRIM because
+ * no new allocations will be performed until the call
+			 * to metaslab_enable() below.
+ */
+ for (uint64_t c = 0; c < children; c++) {
+ trim_args_t *ta = &tap[c];
+
+ /*
+ * Always yield to a manual TRIM if one has
+ * been started for the child vdev.
+ */
+ if (ta->trim_tree == NULL ||
+ ta->trim_vdev->vdev_trim_thread != NULL) {
+ continue;
+ }
+
+ /*
+ * After this point metaslab_enable() must be
+ * called with the sync flag set. This is done
+ * here because vdev_trim_ranges() is allowed
+ * to be interrupted (EINTR) before issuing all
+ * of the required TRIM I/Os.
+ */
+ issued_trim = B_TRUE;
+
+ int error = vdev_trim_ranges(ta);
+ if (error)
+ break;
+ }
+
+ /*
+ * Verify every range which was trimmed is still
+ * contained within the ms_allocatable tree.
+ */
+ if (zfs_flags & ZFS_DEBUG_TRIM) {
+ mutex_enter(&msp->ms_lock);
+ VERIFY0(metaslab_load(msp));
+ VERIFY3P(tap[0].trim_msp, ==, msp);
+ range_tree_walk(trim_tree,
+ vdev_trim_range_verify, &tap[0]);
+ mutex_exit(&msp->ms_lock);
+ }
+
+ range_tree_vacate(trim_tree, NULL, NULL);
+ range_tree_destroy(trim_tree);
+
+ metaslab_enable(msp, issued_trim, B_FALSE);
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+
+ for (uint64_t c = 0; c < children; c++) {
+ trim_args_t *ta = &tap[c];
+
+ if (ta->trim_tree == NULL)
+ continue;
+
+ range_tree_vacate(ta->trim_tree, NULL, NULL);
+ range_tree_destroy(ta->trim_tree);
+ }
+
+ kmem_free(tap, sizeof (trim_args_t) * children);
+ }
+
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+
+ /*
+ * After completing the group of metaslabs wait for the next
+ * open txg. This is done to make sure that a minimum of
+ * zfs_trim_txg_batch txgs will occur before these metaslabs
+ * are trimmed again.
+ */
+ txg_wait_open(spa_get_dsl(spa), 0, issued_trim);
+
+ shift++;
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+ }
+
+ for (uint64_t c = 0; c < vd->vdev_children; c++) {
+ vdev_t *cvd = vd->vdev_child[c];
+ mutex_enter(&cvd->vdev_trim_io_lock);
+
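+		/* Wait for outstanding automatic TRIM I/Os on this child. */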
+ while (cvd->vdev_trim_inflight[1] > 0) {
+ cv_wait(&cvd->vdev_trim_io_cv,
+ &cvd->vdev_trim_io_lock);
+ }
+ mutex_exit(&cvd->vdev_trim_io_lock);
+ }
+
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+
+ /*
+ * When exiting because the autotrim property was set to off, then
+ * abandon any unprocessed ms_trim ranges to reclaim the memory.
+ */
+ if (spa_get_autotrim(spa) == SPA_AUTOTRIM_OFF) {
+ for (uint64_t i = 0; i < vd->vdev_ms_count; i++) {
+ metaslab_t *msp = vd->vdev_ms[i];
+
+ mutex_enter(&msp->ms_lock);
+ range_tree_vacate(msp->ms_trim, NULL, NULL);
+ mutex_exit(&msp->ms_lock);
+ }
+ }
+
+ mutex_enter(&vd->vdev_autotrim_lock);
+ ASSERT(vd->vdev_autotrim_thread != NULL);
+ vd->vdev_autotrim_thread = NULL;
+ cv_broadcast(&vd->vdev_autotrim_cv);
+ mutex_exit(&vd->vdev_autotrim_lock);
+
+ thread_exit();
+}
+
+/*
+ * Starts an autotrim thread, if needed, for each top-level vdev which can be
+ * trimmed. A top-level vdev which has been evacuated will never be trimmed.
+ */
+void
+vdev_autotrim(spa_t *spa)
+{
+ vdev_t *root_vd = spa->spa_root_vdev;
+
+ for (uint64_t i = 0; i < root_vd->vdev_children; i++) {
+ vdev_t *tvd = root_vd->vdev_child[i];
+
+ mutex_enter(&tvd->vdev_autotrim_lock);
+ if (vdev_writeable(tvd) && !tvd->vdev_removing &&
+ tvd->vdev_autotrim_thread == NULL) {
+ ASSERT3P(tvd->vdev_top, ==, tvd);
+
+ tvd->vdev_autotrim_thread = thread_create(NULL, 0,
+ vdev_autotrim_thread, tvd, 0, &p0, TS_RUN,
+ maxclsyspri);
+ ASSERT(tvd->vdev_autotrim_thread != NULL);
+ }
+ mutex_exit(&tvd->vdev_autotrim_lock);
+ }
+}
+
+/*
+ * Wait for the vdev_autotrim_thread associated with the passed top-level
+ * vdev to be terminated (canceled or stopped).
+ */
+void
+vdev_autotrim_stop_wait(vdev_t *tvd)
+{
+ mutex_enter(&tvd->vdev_autotrim_lock);
+ if (tvd->vdev_autotrim_thread != NULL) {
+ tvd->vdev_autotrim_exit_wanted = B_TRUE;
+
+ while (tvd->vdev_autotrim_thread != NULL) {
+ cv_wait(&tvd->vdev_autotrim_cv,
+ &tvd->vdev_autotrim_lock);
+ }
+
+ ASSERT3P(tvd->vdev_autotrim_thread, ==, NULL);
+ tvd->vdev_autotrim_exit_wanted = B_FALSE;
+ }
+ mutex_exit(&tvd->vdev_autotrim_lock);
+}
+
+/*
+ * Wait for all of the vdev_autotrim threads associated with the pool to
+ * be terminated (canceled or stopped).
+ */
+void
+vdev_autotrim_stop_all(spa_t *spa)
+{
+ vdev_t *root_vd = spa->spa_root_vdev;
+
+ for (uint64_t i = 0; i < root_vd->vdev_children; i++)
+ vdev_autotrim_stop_wait(root_vd->vdev_child[i]);
+}
+
+/*
+ * Conditionally restart all of the vdev_autotrim threads for the pool.
+ */
+void
+vdev_autotrim_restart(spa_t *spa)
+{
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+ if (spa->spa_autotrim)
+ vdev_autotrim(spa);
+}
+
+static void
+vdev_trim_l2arc_thread(void *arg)
+{
+ vdev_t *vd = arg;
+ spa_t *spa = vd->vdev_spa;
+ l2arc_dev_t *dev = l2arc_vdev_get(vd);
+ trim_args_t ta;
+ range_seg64_t physical_rs;
+
+ ASSERT(vdev_is_concrete(vd));
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+
+ vd->vdev_trim_last_offset = 0;
+ vd->vdev_trim_rate = 0;
+ vd->vdev_trim_partial = 0;
+ vd->vdev_trim_secure = 0;
+
+ bzero(&ta, sizeof (ta));
+ ta.trim_vdev = vd;
+ ta.trim_tree = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0);
+ ta.trim_type = TRIM_TYPE_MANUAL;
+ ta.trim_extent_bytes_max = zfs_trim_extent_bytes_max;
+ ta.trim_extent_bytes_min = SPA_MINBLOCKSIZE;
+ ta.trim_flags = 0;
+
+ physical_rs.rs_start = vd->vdev_trim_bytes_done = 0;
+ physical_rs.rs_end = vd->vdev_trim_bytes_est =
+ vdev_get_min_asize(vd);
+
+ range_tree_add(ta.trim_tree, physical_rs.rs_start,
+ physical_rs.rs_end - physical_rs.rs_start);
+
+ mutex_enter(&vd->vdev_trim_lock);
+ vdev_trim_change_state(vd, VDEV_TRIM_ACTIVE, 0, 0, 0);
+ mutex_exit(&vd->vdev_trim_lock);
+
+ (void) vdev_trim_ranges(&ta);
+
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+ mutex_enter(&vd->vdev_trim_io_lock);
+ while (vd->vdev_trim_inflight[TRIM_TYPE_MANUAL] > 0) {
+ cv_wait(&vd->vdev_trim_io_cv, &vd->vdev_trim_io_lock);
+ }
+ mutex_exit(&vd->vdev_trim_io_lock);
+
+ range_tree_vacate(ta.trim_tree, NULL, NULL);
+ range_tree_destroy(ta.trim_tree);
+
+ mutex_enter(&vd->vdev_trim_lock);
+ if (!vd->vdev_trim_exit_wanted && vdev_writeable(vd)) {
+ vdev_trim_change_state(vd, VDEV_TRIM_COMPLETE,
+ vd->vdev_trim_rate, vd->vdev_trim_partial,
+ vd->vdev_trim_secure);
+ }
+ ASSERT(vd->vdev_trim_thread != NULL ||
+ vd->vdev_trim_inflight[TRIM_TYPE_MANUAL] == 0);
+
+ /*
+ * Drop the vdev_trim_lock while we sync out the txg since it's
+ * possible that a device might be trying to come online and
+ * must check to see if it needs to restart a trim. That thread
+ * will be holding the spa_config_lock which would prevent the
+ * txg_wait_synced from completing. Same strategy as in
+ * vdev_trim_thread().
+ */
+ mutex_exit(&vd->vdev_trim_lock);
+ txg_wait_synced(spa_get_dsl(vd->vdev_spa), 0);
+ mutex_enter(&vd->vdev_trim_lock);
+
+ /*
+ * Update the header of the cache device here, before
+ * broadcasting vdev_trim_cv which may lead to the removal
+ * of the device. The same applies for setting l2ad_trim_all to
+ * false.
+ */
+ spa_config_enter(vd->vdev_spa, SCL_L2ARC, vd,
+ RW_READER);
+ bzero(dev->l2ad_dev_hdr, dev->l2ad_dev_hdr_asize);
+ l2arc_dev_hdr_update(dev);
+ spa_config_exit(vd->vdev_spa, SCL_L2ARC, vd);
+
+ vd->vdev_trim_thread = NULL;
+ if (vd->vdev_trim_state == VDEV_TRIM_COMPLETE)
+ dev->l2ad_trim_all = B_FALSE;
+
+ cv_broadcast(&vd->vdev_trim_cv);
+ mutex_exit(&vd->vdev_trim_lock);
+
+ thread_exit();
+}
+
+/*
+ * Punches out TRIM threads for the L2ARC devices in a spa and assigns them
+ * to vd->vdev_trim_thread variable. This facilitates the management of
+ * trimming the whole cache device using TRIM_TYPE_MANUAL upon addition
+ * to a pool or pool creation or when the header of the device is invalid.
+ */
+void
+vdev_trim_l2arc(spa_t *spa)
+{
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+ /*
+ * Locate the spa's l2arc devices and kick off TRIM threads.
+ */
+ for (int i = 0; i < spa->spa_l2cache.sav_count; i++) {
+ vdev_t *vd = spa->spa_l2cache.sav_vdevs[i];
+ l2arc_dev_t *dev = l2arc_vdev_get(vd);
+
+ if (dev == NULL || !dev->l2ad_trim_all) {
+ /*
+ * Don't attempt TRIM if the vdev is UNAVAIL or if the
+ * cache device was not marked for whole device TRIM
+			 * (i.e. l2arc_trim_ahead = 0, or the L2ARC device header
+ * is valid with trim_state = VDEV_TRIM_COMPLETE and
+ * l2ad_log_entries > 0).
+ */
+ continue;
+ }
+
+ mutex_enter(&vd->vdev_trim_lock);
+ ASSERT(vd->vdev_ops->vdev_op_leaf);
+ ASSERT(vdev_is_concrete(vd));
+ ASSERT3P(vd->vdev_trim_thread, ==, NULL);
+ ASSERT(!vd->vdev_detached);
+ ASSERT(!vd->vdev_trim_exit_wanted);
+ ASSERT(!vd->vdev_top->vdev_removing);
+ vdev_trim_change_state(vd, VDEV_TRIM_ACTIVE, 0, 0, 0);
+ vd->vdev_trim_thread = thread_create(NULL, 0,
+ vdev_trim_l2arc_thread, vd, 0, &p0, TS_RUN, maxclsyspri);
+ mutex_exit(&vd->vdev_trim_lock);
+ }
+}
+
+/*
+ * A wrapper which calls vdev_trim_ranges(). It is intended to be called
+ * on leaf vdevs.
+ */
+int
+vdev_trim_simple(vdev_t *vd, uint64_t start, uint64_t size)
+{
+ trim_args_t ta;
+ range_seg64_t physical_rs;
+ int error;
+ physical_rs.rs_start = start;
+ physical_rs.rs_end = start + size;
+
+ ASSERT(vdev_is_concrete(vd));
+ ASSERT(vd->vdev_ops->vdev_op_leaf);
+ ASSERT(!vd->vdev_detached);
+ ASSERT(!vd->vdev_top->vdev_removing);
+
+ bzero(&ta, sizeof (ta));
+ ta.trim_vdev = vd;
+ ta.trim_tree = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0);
+ ta.trim_type = TRIM_TYPE_SIMPLE;
+ ta.trim_extent_bytes_max = zfs_trim_extent_bytes_max;
+ ta.trim_extent_bytes_min = SPA_MINBLOCKSIZE;
+ ta.trim_flags = 0;
+
+ ASSERT3U(physical_rs.rs_end, >=, physical_rs.rs_start);
+
+ if (physical_rs.rs_end > physical_rs.rs_start) {
+ range_tree_add(ta.trim_tree, physical_rs.rs_start,
+ physical_rs.rs_end - physical_rs.rs_start);
+ } else {
+ ASSERT3U(physical_rs.rs_end, ==, physical_rs.rs_start);
+ }
+
+ error = vdev_trim_ranges(&ta);
+
+ mutex_enter(&vd->vdev_trim_io_lock);
+ while (vd->vdev_trim_inflight[TRIM_TYPE_SIMPLE] > 0) {
+ cv_wait(&vd->vdev_trim_io_cv, &vd->vdev_trim_io_lock);
+ }
+ mutex_exit(&vd->vdev_trim_io_lock);
+
+ range_tree_vacate(ta.trim_tree, NULL, NULL);
+ range_tree_destroy(ta.trim_tree);
+
+ return (error);
+}
+
+EXPORT_SYMBOL(vdev_trim);
+EXPORT_SYMBOL(vdev_trim_stop);
+EXPORT_SYMBOL(vdev_trim_stop_all);
+EXPORT_SYMBOL(vdev_trim_stop_wait);
+EXPORT_SYMBOL(vdev_trim_restart);
+EXPORT_SYMBOL(vdev_autotrim);
+EXPORT_SYMBOL(vdev_autotrim_stop_all);
+EXPORT_SYMBOL(vdev_autotrim_stop_wait);
+EXPORT_SYMBOL(vdev_autotrim_restart);
+EXPORT_SYMBOL(vdev_trim_l2arc);
+EXPORT_SYMBOL(vdev_trim_simple);
+
+/* BEGIN CSTYLED */
+ZFS_MODULE_PARAM(zfs_trim, zfs_trim_, extent_bytes_max, UINT, ZMOD_RW,
+ "Max size of TRIM commands, larger will be split");
+
+ZFS_MODULE_PARAM(zfs_trim, zfs_trim_, extent_bytes_min, UINT, ZMOD_RW,
+ "Min size of TRIM commands, smaller will be skipped");
+
+ZFS_MODULE_PARAM(zfs_trim, zfs_trim_, metaslab_skip, UINT, ZMOD_RW,
+ "Skip metaslabs which have never been initialized");
+
+ZFS_MODULE_PARAM(zfs_trim, zfs_trim_, txg_batch, UINT, ZMOD_RW,
+ "Min number of txgs to aggregate frees before issuing TRIM");
+
+ZFS_MODULE_PARAM(zfs_trim, zfs_trim_, queue_limit, UINT, ZMOD_RW,
+ "Max queued TRIMs outstanding per leaf vdev");
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/zfs/zap.c b/sys/contrib/openzfs/module/zfs/zap.c
new file mode 100644
index 000000000000..c0c280c52076
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/zap.c
@@ -0,0 +1,1384 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ */
+
+/*
+ * This file contains the top half of the zfs directory structure
+ * implementation. The bottom half is in zap_leaf.c.
+ *
+ * The zdir is an extendable hash data structure. There is a table of
+ * pointers to buckets (zap_t->zd_data->zd_leafs). The buckets are
+ * each a constant size and hold a variable number of directory entries.
+ * The buckets (aka "leaf nodes") are implemented in zap_leaf.c.
+ *
+ * The pointer table holds a power of 2 number of pointers.
+ * (1<<zap_t->zd_data->zd_phys->zd_prefix_len). The bucket pointed to
+ * by the pointer at index i in the table holds entries whose hash value
+ * has a zd_prefix_len-bit prefix.
+ */
+
+#include <sys/spa.h>
+#include <sys/dmu.h>
+#include <sys/zfs_context.h>
+#include <sys/zfs_znode.h>
+#include <sys/fs/zfs.h>
+#include <sys/zap.h>
+#include <sys/zap_impl.h>
+#include <sys/zap_leaf.h>
+
+/*
+ * If zap_iterate_prefetch is set, we will prefetch the entire ZAP object
+ * (all leaf blocks) when we start iterating over it.
+ *
+ * For zap_cursor_init(), the callers all intend to iterate through all the
+ * entries. There are a few cases where an error (typically i/o error) could
+ * cause it to bail out early.
+ *
+ * For zap_cursor_init_serialized(), there are callers that do the iteration
+ * outside of ZFS. Typically they would iterate over everything, but we
+ * don't have control of that. E.g. zfs_ioc_snapshot_list_next(),
+ * zcp_snapshots_iter(), and other iterators over things in the MOS - these
+ * are called by /sbin/zfs and channel programs. The other example is
+ * zfs_readdir() which iterates over directory entries for the getdents()
+ * syscall. /sbin/ls iterates to the end (unless it receives a signal), but
+ * userland doesn't have to.
+ *
+ * Given that the ZAP entries aren't returned in a specific order, the only
+ * legitimate use cases for partial iteration would be:
+ *
+ * 1. Pagination: e.g. you only want to display 100 entries at a time, so you
+ * get the first 100 and then wait for the user to hit "next page" (which
+ * they may never do).
+ *
+ * 2. You want to know if there are more than X entries, without relying on
+ * the zfs-specific implementation of the directory's st_size (which is
+ * the number of entries).
+ */
+int zap_iterate_prefetch = B_TRUE;
+
+int fzap_default_block_shift = 14; /* 16k blocksize */
+
+extern inline zap_phys_t *zap_f_phys(zap_t *zap);
+
+static uint64_t zap_allocate_blocks(zap_t *zap, int nblocks);
+
+void
+fzap_byteswap(void *vbuf, size_t size)
+{
+ uint64_t block_type = *(uint64_t *)vbuf;
+
+ if (block_type == ZBT_LEAF || block_type == BSWAP_64(ZBT_LEAF))
+ zap_leaf_byteswap(vbuf, size);
+ else {
+ /* it's a ptrtbl block */
+ byteswap_uint64_array(vbuf, size);
+ }
+}
+
+void
+fzap_upgrade(zap_t *zap, dmu_tx_t *tx, zap_flags_t flags)
+{
+ ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
+ zap->zap_ismicro = FALSE;
+
+ zap->zap_dbu.dbu_evict_func_sync = zap_evict_sync;
+ zap->zap_dbu.dbu_evict_func_async = NULL;
+
+ mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, MUTEX_DEFAULT, 0);
+ zap->zap_f.zap_block_shift = highbit64(zap->zap_dbuf->db_size) - 1;
+
+ zap_phys_t *zp = zap_f_phys(zap);
+ /*
+ * explicitly zero it since it might be coming from an
+ * initialized microzap
+ */
+ bzero(zap->zap_dbuf->db_data, zap->zap_dbuf->db_size);
+ zp->zap_block_type = ZBT_HEADER;
+ zp->zap_magic = ZAP_MAGIC;
+
+ zp->zap_ptrtbl.zt_shift = ZAP_EMBEDDED_PTRTBL_SHIFT(zap);
+
+ zp->zap_freeblk = 2; /* block 1 will be the first leaf */
+ zp->zap_num_leafs = 1;
+ zp->zap_num_entries = 0;
+ zp->zap_salt = zap->zap_salt;
+ zp->zap_normflags = zap->zap_normflags;
+ zp->zap_flags = flags;
+
+ /* block 1 will be the first leaf */
+ for (int i = 0; i < (1<<zp->zap_ptrtbl.zt_shift); i++)
+ ZAP_EMBEDDED_PTRTBL_ENT(zap, i) = 1;
+
+ /*
+ * set up block 1 - the first leaf
+ */
+ dmu_buf_t *db;
+ VERIFY0(dmu_buf_hold(zap->zap_objset, zap->zap_object,
+ 1<<FZAP_BLOCK_SHIFT(zap), FTAG, &db, DMU_READ_NO_PREFETCH));
+ dmu_buf_will_dirty(db, tx);
+
+ zap_leaf_t *l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP);
+ l->l_dbuf = db;
+
+ zap_leaf_init(l, zp->zap_normflags != 0);
+
+ kmem_free(l, sizeof (zap_leaf_t));
+ dmu_buf_rele(db, FTAG);
+}
+
+static int
+zap_tryupgradedir(zap_t *zap, dmu_tx_t *tx)
+{
+ if (RW_WRITE_HELD(&zap->zap_rwlock))
+ return (1);
+ if (rw_tryupgrade(&zap->zap_rwlock)) {
+ dmu_buf_will_dirty(zap->zap_dbuf, tx);
+ return (1);
+ }
+ return (0);
+}
+
+/*
+ * Generic routines for dealing with the pointer & cookie tables.
+ */
+
+static int
+zap_table_grow(zap_t *zap, zap_table_phys_t *tbl,
+ void (*transfer_func)(const uint64_t *src, uint64_t *dst, int n),
+ dmu_tx_t *tx)
+{
+ uint64_t newblk;
+ int bs = FZAP_BLOCK_SHIFT(zap);
+ int hepb = 1<<(bs-4);
+ /* hepb = half the number of entries in a block */
+
+ ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
+ ASSERT(tbl->zt_blk != 0);
+ ASSERT(tbl->zt_numblks > 0);
+
+ if (tbl->zt_nextblk != 0) {
+ newblk = tbl->zt_nextblk;
+ } else {
+ newblk = zap_allocate_blocks(zap, tbl->zt_numblks * 2);
+ tbl->zt_nextblk = newblk;
+ ASSERT0(tbl->zt_blks_copied);
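+		/*
+		 * Prefetch the entire old table; it is copied to its new
+		 * location one block per call below.
+		 */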
+ dmu_prefetch(zap->zap_objset, zap->zap_object, 0,
+ tbl->zt_blk << bs, tbl->zt_numblks << bs,
+ ZIO_PRIORITY_SYNC_READ);
+ }
+
+ /*
+ * Copy the ptrtbl from the old to new location.
+ */
+
+ uint64_t b = tbl->zt_blks_copied;
+ dmu_buf_t *db_old;
+ int err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
+ (tbl->zt_blk + b) << bs, FTAG, &db_old, DMU_READ_NO_PREFETCH);
+ if (err != 0)
+ return (err);
+
+ /* first half of entries in old[b] go to new[2*b+0] */
+ dmu_buf_t *db_new;
+ VERIFY0(dmu_buf_hold(zap->zap_objset, zap->zap_object,
+ (newblk + 2*b+0) << bs, FTAG, &db_new, DMU_READ_NO_PREFETCH));
+ dmu_buf_will_dirty(db_new, tx);
+ transfer_func(db_old->db_data, db_new->db_data, hepb);
+ dmu_buf_rele(db_new, FTAG);
+
+ /* second half of entries in old[b] go to new[2*b+1] */
+ VERIFY0(dmu_buf_hold(zap->zap_objset, zap->zap_object,
+ (newblk + 2*b+1) << bs, FTAG, &db_new, DMU_READ_NO_PREFETCH));
+ dmu_buf_will_dirty(db_new, tx);
+ transfer_func((uint64_t *)db_old->db_data + hepb,
+ db_new->db_data, hepb);
+ dmu_buf_rele(db_new, FTAG);
+
+ dmu_buf_rele(db_old, FTAG);
+
+ tbl->zt_blks_copied++;
+
+ dprintf("copied block %llu of %llu\n",
+ tbl->zt_blks_copied, tbl->zt_numblks);
+
+ if (tbl->zt_blks_copied == tbl->zt_numblks) {
+ (void) dmu_free_range(zap->zap_objset, zap->zap_object,
+ tbl->zt_blk << bs, tbl->zt_numblks << bs, tx);
+
+ tbl->zt_blk = newblk;
+ tbl->zt_numblks *= 2;
+ tbl->zt_shift++;
+ tbl->zt_nextblk = 0;
+ tbl->zt_blks_copied = 0;
+
+ dprintf("finished; numblocks now %llu (%uk entries)\n",
+ tbl->zt_numblks, 1<<(tbl->zt_shift-10));
+ }
+
+ return (0);
+}
+
+static int
+zap_table_store(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t val,
+ dmu_tx_t *tx)
+{
+ int bs = FZAP_BLOCK_SHIFT(zap);
+
+ ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
+ ASSERT(tbl->zt_blk != 0);
+
+ dprintf("storing %llx at index %llx\n", val, idx);
+
+ uint64_t blk = idx >> (bs-3);
+ uint64_t off = idx & ((1<<(bs-3))-1);
+
+ dmu_buf_t *db;
+ int err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
+ (tbl->zt_blk + blk) << bs, FTAG, &db, DMU_READ_NO_PREFETCH);
+ if (err != 0)
+ return (err);
+ dmu_buf_will_dirty(db, tx);
+
+ if (tbl->zt_nextblk != 0) {
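+		/*
+		 * A table grow is in progress; also write the value into
+		 * both of the entries which will replace idx in the doubled
+		 * table.
+		 */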
+ uint64_t idx2 = idx * 2;
+ uint64_t blk2 = idx2 >> (bs-3);
+ uint64_t off2 = idx2 & ((1<<(bs-3))-1);
+ dmu_buf_t *db2;
+
+ err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
+ (tbl->zt_nextblk + blk2) << bs, FTAG, &db2,
+ DMU_READ_NO_PREFETCH);
+ if (err != 0) {
+ dmu_buf_rele(db, FTAG);
+ return (err);
+ }
+ dmu_buf_will_dirty(db2, tx);
+ ((uint64_t *)db2->db_data)[off2] = val;
+ ((uint64_t *)db2->db_data)[off2+1] = val;
+ dmu_buf_rele(db2, FTAG);
+ }
+
+ ((uint64_t *)db->db_data)[off] = val;
+ dmu_buf_rele(db, FTAG);
+
+ return (0);
+}
+
+static int
+zap_table_load(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t *valp)
+{
+ int bs = FZAP_BLOCK_SHIFT(zap);
+
+ ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
+
+ uint64_t blk = idx >> (bs-3);
+ uint64_t off = idx & ((1<<(bs-3))-1);
+
+ /*
+ * Note: this is equivalent to dmu_buf_hold(), but we use the
+ * _dnode_enter / _by_dnode variants because they are faster: they
+ * avoid having to hold the dnode.
+ */
+ dnode_t *dn = dmu_buf_dnode_enter(zap->zap_dbuf);
+ dmu_buf_t *db;
+ int err = dmu_buf_hold_by_dnode(dn,
+ (tbl->zt_blk + blk) << bs, FTAG, &db, DMU_READ_NO_PREFETCH);
+ dmu_buf_dnode_exit(zap->zap_dbuf);
+ if (err != 0)
+ return (err);
+ *valp = ((uint64_t *)db->db_data)[off];
+ dmu_buf_rele(db, FTAG);
+
+ if (tbl->zt_nextblk != 0) {
+ /*
+ * Read the nextblk for the sake of i/o error checking, so
+ * that zap_table_load() will catch errors on behalf of
+ * zap_table_store().
+ */
+ blk = (idx*2) >> (bs-3);
+
+ dn = dmu_buf_dnode_enter(zap->zap_dbuf);
+ err = dmu_buf_hold_by_dnode(dn,
+ (tbl->zt_nextblk + blk) << bs, FTAG, &db,
+ DMU_READ_NO_PREFETCH);
+ dmu_buf_dnode_exit(zap->zap_dbuf);
+ if (err == 0)
+ dmu_buf_rele(db, FTAG);
+ }
+ return (err);
+}
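
The blk/off split used here and in zap_table_store() is just fixed-width index arithmetic. A minimal standalone sketch (plain userspace C, not module code), assuming a hypothetical block shift of 14:

    #include <stdio.h>
    #include <stdint.h>

    /*
     * Same index split as zap_table_store()/zap_table_load():
     * each table block holds 2^(bs-3) uint64_t entries.
     */
    static void
    idx_to_blk_off(uint64_t idx, int bs, uint64_t *blk, uint64_t *off)
    {
        *blk = idx >> (bs - 3);
        *off = idx & ((1ULL << (bs - 3)) - 1);
    }

    int
    main(void)
    {
        uint64_t blk, off;
        int bs = 14;    /* assumed FZAP block shift */

        idx_to_blk_off(5000, bs, &blk, &off);
        printf("idx 5000 -> block %llu, offset %llu\n",
            (unsigned long long)blk, (unsigned long long)off);
        return (0);
    }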
+
+/*
+ * Routines for growing the ptrtbl.
+ */
+
+static void
+zap_ptrtbl_transfer(const uint64_t *src, uint64_t *dst, int n)
+{
+ for (int i = 0; i < n; i++) {
+ uint64_t lb = src[i];
+ dst[2 * i + 0] = lb;
+ dst[2 * i + 1] = lb;
+ }
+}
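
A small standalone sketch of what this transfer does, with made-up leaf block numbers: every pointer is duplicated, so once the table has doubled, the two indexes that differ only in the newly added hash bit still point at the same leaf until that leaf itself splits.

    #include <stdio.h>
    #include <stdint.h>

    int
    main(void)
    {
        /* Made-up old pointer table: index -> leaf block number. */
        uint64_t src[4] = { 10, 10, 11, 12 };
        uint64_t dst[8];

        /* The same duplication zap_ptrtbl_transfer() performs. */
        for (int i = 0; i < 4; i++) {
            dst[2 * i + 0] = src[i];
            dst[2 * i + 1] = src[i];
        }

        /*
         * With one more hash bit in use, indexes 2i and 2i+1 still point
         * at the leaf that index i pointed at, so no leaf has to move
         * while the table grows.
         */
        for (int i = 0; i < 8; i++)
            printf("new[%d] = %llu\n", i, (unsigned long long)dst[i]);
        return (0);
    }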
+
+static int
+zap_grow_ptrtbl(zap_t *zap, dmu_tx_t *tx)
+{
+ /*
+ * The pointer table should never use more hash bits than we
+ * have (otherwise we'd be using useless zero bits to index it).
+ * If we are within 2 bits of running out, stop growing, since
+ * this is already an aberrant condition.
+ */
+ if (zap_f_phys(zap)->zap_ptrtbl.zt_shift >= zap_hashbits(zap) - 2)
+ return (SET_ERROR(ENOSPC));
+
+ if (zap_f_phys(zap)->zap_ptrtbl.zt_numblks == 0) {
+ /*
+ * We are outgrowing the "embedded" ptrtbl (the one
+ * stored in the header block). Give it its own entire
+ * block, which will double the size of the ptrtbl.
+ */
+ ASSERT3U(zap_f_phys(zap)->zap_ptrtbl.zt_shift, ==,
+ ZAP_EMBEDDED_PTRTBL_SHIFT(zap));
+ ASSERT0(zap_f_phys(zap)->zap_ptrtbl.zt_blk);
+
+ uint64_t newblk = zap_allocate_blocks(zap, 1);
+ dmu_buf_t *db_new;
+ int err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
+ newblk << FZAP_BLOCK_SHIFT(zap), FTAG, &db_new,
+ DMU_READ_NO_PREFETCH);
+ if (err != 0)
+ return (err);
+ dmu_buf_will_dirty(db_new, tx);
+ zap_ptrtbl_transfer(&ZAP_EMBEDDED_PTRTBL_ENT(zap, 0),
+ db_new->db_data, 1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap));
+ dmu_buf_rele(db_new, FTAG);
+
+ zap_f_phys(zap)->zap_ptrtbl.zt_blk = newblk;
+ zap_f_phys(zap)->zap_ptrtbl.zt_numblks = 1;
+ zap_f_phys(zap)->zap_ptrtbl.zt_shift++;
+
+ ASSERT3U(1ULL << zap_f_phys(zap)->zap_ptrtbl.zt_shift, ==,
+ zap_f_phys(zap)->zap_ptrtbl.zt_numblks <<
+ (FZAP_BLOCK_SHIFT(zap)-3));
+
+ return (0);
+ } else {
+ return (zap_table_grow(zap, &zap_f_phys(zap)->zap_ptrtbl,
+ zap_ptrtbl_transfer, tx));
+ }
+}
+
+static void
+zap_increment_num_entries(zap_t *zap, int delta, dmu_tx_t *tx)
+{
+ dmu_buf_will_dirty(zap->zap_dbuf, tx);
+ mutex_enter(&zap->zap_f.zap_num_entries_mtx);
+ ASSERT(delta > 0 || zap_f_phys(zap)->zap_num_entries >= -delta);
+ zap_f_phys(zap)->zap_num_entries += delta;
+ mutex_exit(&zap->zap_f.zap_num_entries_mtx);
+}
+
+static uint64_t
+zap_allocate_blocks(zap_t *zap, int nblocks)
+{
+ ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
+ uint64_t newblk = zap_f_phys(zap)->zap_freeblk;
+ zap_f_phys(zap)->zap_freeblk += nblocks;
+ return (newblk);
+}
+
+static void
+zap_leaf_evict_sync(void *dbu)
+{
+ zap_leaf_t *l = dbu;
+
+ rw_destroy(&l->l_rwlock);
+ kmem_free(l, sizeof (zap_leaf_t));
+}
+
+static zap_leaf_t *
+zap_create_leaf(zap_t *zap, dmu_tx_t *tx)
+{
+ zap_leaf_t *l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP);
+
+ ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
+
+ rw_init(&l->l_rwlock, NULL, RW_NOLOCKDEP, NULL);
+ rw_enter(&l->l_rwlock, RW_WRITER);
+ l->l_blkid = zap_allocate_blocks(zap, 1);
+ l->l_dbuf = NULL;
+
+ VERIFY0(dmu_buf_hold(zap->zap_objset, zap->zap_object,
+ l->l_blkid << FZAP_BLOCK_SHIFT(zap), NULL, &l->l_dbuf,
+ DMU_READ_NO_PREFETCH));
+ dmu_buf_init_user(&l->l_dbu, zap_leaf_evict_sync, NULL, &l->l_dbuf);
+ VERIFY3P(NULL, ==, dmu_buf_set_user(l->l_dbuf, &l->l_dbu));
+ dmu_buf_will_dirty(l->l_dbuf, tx);
+
+ zap_leaf_init(l, zap->zap_normflags != 0);
+
+ zap_f_phys(zap)->zap_num_leafs++;
+
+ return (l);
+}
+
+int
+fzap_count(zap_t *zap, uint64_t *count)
+{
+ ASSERT(!zap->zap_ismicro);
+ mutex_enter(&zap->zap_f.zap_num_entries_mtx); /* unnecessary */
+ *count = zap_f_phys(zap)->zap_num_entries;
+ mutex_exit(&zap->zap_f.zap_num_entries_mtx);
+ return (0);
+}
+
+/*
+ * Routines for obtaining zap_leaf_t's
+ */
+
+void
+zap_put_leaf(zap_leaf_t *l)
+{
+ rw_exit(&l->l_rwlock);
+ dmu_buf_rele(l->l_dbuf, NULL);
+}
+
+static zap_leaf_t *
+zap_open_leaf(uint64_t blkid, dmu_buf_t *db)
+{
+ ASSERT(blkid != 0);
+
+ zap_leaf_t *l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP);
+ rw_init(&l->l_rwlock, NULL, RW_DEFAULT, NULL);
+ rw_enter(&l->l_rwlock, RW_WRITER);
+ l->l_blkid = blkid;
+ l->l_bs = highbit64(db->db_size) - 1;
+ l->l_dbuf = db;
+
+ dmu_buf_init_user(&l->l_dbu, zap_leaf_evict_sync, NULL, &l->l_dbuf);
+ zap_leaf_t *winner = dmu_buf_set_user(db, &l->l_dbu);
+
+ rw_exit(&l->l_rwlock);
+ if (winner != NULL) {
+ /* someone else set it first */
+ zap_leaf_evict_sync(&l->l_dbu);
+ l = winner;
+ }
+
+ /*
+ * lh_pad1 was previously used for the next leaf in the leaf
+ * chain. There should be no chained leaves (we have removed
+ * support for them).
+ */
+ ASSERT0(zap_leaf_phys(l)->l_hdr.lh_pad1);
+
+ /*
+ * There should be more hash entries than there can be
+ * chunks to put in the hash table
+ */
+ ASSERT3U(ZAP_LEAF_HASH_NUMENTRIES(l), >, ZAP_LEAF_NUMCHUNKS(l) / 3);
+
+ /* The chunks should begin at the end of the hash table */
+ ASSERT3P(&ZAP_LEAF_CHUNK(l, 0), ==, (zap_leaf_chunk_t *)
+ &zap_leaf_phys(l)->l_hash[ZAP_LEAF_HASH_NUMENTRIES(l)]);
+
+ /* The chunks should end at the end of the block */
+ ASSERT3U((uintptr_t)&ZAP_LEAF_CHUNK(l, ZAP_LEAF_NUMCHUNKS(l)) -
+ (uintptr_t)zap_leaf_phys(l), ==, l->l_dbuf->db_size);
+
+ return (l);
+}
+
+static int
+zap_get_leaf_byblk(zap_t *zap, uint64_t blkid, dmu_tx_t *tx, krw_t lt,
+ zap_leaf_t **lp)
+{
+ dmu_buf_t *db;
+
+ ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
+
+ /*
+ * If the system crashed just after dmu_free_long_range() in
+ * zfs_rmnode(), we would be left with an empty xattr dir in the
+ * delete queue, and blkid=0 would then be passed in when doing
+ * zfs_purgedir(). If that's the case, just return immediately;
+ * the underlying objects should already be freed, so this is
+ * perfectly fine.
+ */
+ if (blkid == 0)
+ return (SET_ERROR(ENOENT));
+
+ int bs = FZAP_BLOCK_SHIFT(zap);
+ dnode_t *dn = dmu_buf_dnode_enter(zap->zap_dbuf);
+ int err = dmu_buf_hold_by_dnode(dn,
+ blkid << bs, NULL, &db, DMU_READ_NO_PREFETCH);
+ dmu_buf_dnode_exit(zap->zap_dbuf);
+ if (err != 0)
+ return (err);
+
+ ASSERT3U(db->db_object, ==, zap->zap_object);
+ ASSERT3U(db->db_offset, ==, blkid << bs);
+ ASSERT3U(db->db_size, ==, 1 << bs);
+ ASSERT(blkid != 0);
+
+ zap_leaf_t *l = dmu_buf_get_user(db);
+
+ if (l == NULL)
+ l = zap_open_leaf(blkid, db);
+
+ rw_enter(&l->l_rwlock, lt);
+ /*
+ * Must lock before dirtying, otherwise zap_leaf_phys(l) could change,
+ * causing ASSERT below to fail.
+ */
+ if (lt == RW_WRITER)
+ dmu_buf_will_dirty(db, tx);
+ ASSERT3U(l->l_blkid, ==, blkid);
+ ASSERT3P(l->l_dbuf, ==, db);
+ ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_block_type, ==, ZBT_LEAF);
+ ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC);
+
+ *lp = l;
+ return (0);
+}
+
+static int
+zap_idx_to_blk(zap_t *zap, uint64_t idx, uint64_t *valp)
+{
+ ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
+
+ if (zap_f_phys(zap)->zap_ptrtbl.zt_numblks == 0) {
+ ASSERT3U(idx, <,
+ (1ULL << zap_f_phys(zap)->zap_ptrtbl.zt_shift));
+ *valp = ZAP_EMBEDDED_PTRTBL_ENT(zap, idx);
+ return (0);
+ } else {
+ return (zap_table_load(zap, &zap_f_phys(zap)->zap_ptrtbl,
+ idx, valp));
+ }
+}
+
+static int
+zap_set_idx_to_blk(zap_t *zap, uint64_t idx, uint64_t blk, dmu_tx_t *tx)
+{
+ ASSERT(tx != NULL);
+ ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
+
+ if (zap_f_phys(zap)->zap_ptrtbl.zt_blk == 0) {
+ ZAP_EMBEDDED_PTRTBL_ENT(zap, idx) = blk;
+ return (0);
+ } else {
+ return (zap_table_store(zap, &zap_f_phys(zap)->zap_ptrtbl,
+ idx, blk, tx));
+ }
+}
+
+static int
+zap_deref_leaf(zap_t *zap, uint64_t h, dmu_tx_t *tx, krw_t lt, zap_leaf_t **lp)
+{
+ uint64_t blk;
+
+ ASSERT(zap->zap_dbuf == NULL ||
+ zap_f_phys(zap) == zap->zap_dbuf->db_data);
+
+ /* Reality check for corrupt zap objects (leaf or header). */
+ if ((zap_f_phys(zap)->zap_block_type != ZBT_LEAF &&
+ zap_f_phys(zap)->zap_block_type != ZBT_HEADER) ||
+ zap_f_phys(zap)->zap_magic != ZAP_MAGIC) {
+ return (SET_ERROR(EIO));
+ }
+
+ uint64_t idx = ZAP_HASH_IDX(h, zap_f_phys(zap)->zap_ptrtbl.zt_shift);
+ int err = zap_idx_to_blk(zap, idx, &blk);
+ if (err != 0)
+ return (err);
+ err = zap_get_leaf_byblk(zap, blk, tx, lt, lp);
+
+ ASSERT(err ||
+ ZAP_HASH_IDX(h, zap_leaf_phys(*lp)->l_hdr.lh_prefix_len) ==
+ zap_leaf_phys(*lp)->l_hdr.lh_prefix);
+ return (err);
+}
+
+static int
+zap_expand_leaf(zap_name_t *zn, zap_leaf_t *l,
+ void *tag, dmu_tx_t *tx, zap_leaf_t **lp)
+{
+ zap_t *zap = zn->zn_zap;
+ uint64_t hash = zn->zn_hash;
+ int err;
+ int old_prefix_len = zap_leaf_phys(l)->l_hdr.lh_prefix_len;
+
+ ASSERT3U(old_prefix_len, <=, zap_f_phys(zap)->zap_ptrtbl.zt_shift);
+ ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
+
+ ASSERT3U(ZAP_HASH_IDX(hash, old_prefix_len), ==,
+ zap_leaf_phys(l)->l_hdr.lh_prefix);
+
+ if (zap_tryupgradedir(zap, tx) == 0 ||
+ old_prefix_len == zap_f_phys(zap)->zap_ptrtbl.zt_shift) {
+ /* We failed to upgrade, or need to grow the pointer table */
+ objset_t *os = zap->zap_objset;
+ uint64_t object = zap->zap_object;
+
+ zap_put_leaf(l);
+ zap_unlockdir(zap, tag);
+ err = zap_lockdir(os, object, tx, RW_WRITER,
+ FALSE, FALSE, tag, &zn->zn_zap);
+ zap = zn->zn_zap;
+ if (err != 0)
+ return (err);
+ ASSERT(!zap->zap_ismicro);
+
+ while (old_prefix_len ==
+ zap_f_phys(zap)->zap_ptrtbl.zt_shift) {
+ err = zap_grow_ptrtbl(zap, tx);
+ if (err != 0)
+ return (err);
+ }
+
+ err = zap_deref_leaf(zap, hash, tx, RW_WRITER, &l);
+ if (err != 0)
+ return (err);
+
+ if (zap_leaf_phys(l)->l_hdr.lh_prefix_len != old_prefix_len) {
+ /* it split while our locks were down */
+ *lp = l;
+ return (0);
+ }
+ }
+ ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
+ ASSERT3U(old_prefix_len, <, zap_f_phys(zap)->zap_ptrtbl.zt_shift);
+ ASSERT3U(ZAP_HASH_IDX(hash, old_prefix_len), ==,
+ zap_leaf_phys(l)->l_hdr.lh_prefix);
+
+ int prefix_diff = zap_f_phys(zap)->zap_ptrtbl.zt_shift -
+ (old_prefix_len + 1);
+ uint64_t sibling =
+ (ZAP_HASH_IDX(hash, old_prefix_len + 1) | 1) << prefix_diff;
+
+ /* check for i/o errors before doing zap_leaf_split */
+ for (int i = 0; i < (1ULL << prefix_diff); i++) {
+ uint64_t blk;
+ err = zap_idx_to_blk(zap, sibling + i, &blk);
+ if (err != 0)
+ return (err);
+ ASSERT3U(blk, ==, l->l_blkid);
+ }
+
+ zap_leaf_t *nl = zap_create_leaf(zap, tx);
+ zap_leaf_split(l, nl, zap->zap_normflags != 0);
+
+ /* set sibling pointers */
+ for (int i = 0; i < (1ULL << prefix_diff); i++) {
+ err = zap_set_idx_to_blk(zap, sibling + i, nl->l_blkid, tx);
+ ASSERT0(err); /* we checked for i/o errors above */
+ }
+
+ ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_prefix_len, >, 0);
+
+ if (hash & (1ULL << (64 - zap_leaf_phys(l)->l_hdr.lh_prefix_len))) {
+ /* we want the sibling */
+ zap_put_leaf(l);
+ *lp = nl;
+ } else {
+ zap_put_leaf(nl);
+ *lp = l;
+ }
+
+ return (0);
+}
+
+static void
+zap_put_leaf_maybe_grow_ptrtbl(zap_name_t *zn, zap_leaf_t *l,
+ void *tag, dmu_tx_t *tx)
+{
+ zap_t *zap = zn->zn_zap;
+ int shift = zap_f_phys(zap)->zap_ptrtbl.zt_shift;
+ int leaffull = (zap_leaf_phys(l)->l_hdr.lh_prefix_len == shift &&
+ zap_leaf_phys(l)->l_hdr.lh_nfree < ZAP_LEAF_LOW_WATER);
+
+ zap_put_leaf(l);
+
+ if (leaffull || zap_f_phys(zap)->zap_ptrtbl.zt_nextblk) {
+ /*
+ * We are in the middle of growing the pointer table, or
+ * this leaf will soon make us grow it.
+ */
+ if (zap_tryupgradedir(zap, tx) == 0) {
+ objset_t *os = zap->zap_objset;
+ uint64_t zapobj = zap->zap_object;
+
+ zap_unlockdir(zap, tag);
+ int err = zap_lockdir(os, zapobj, tx,
+ RW_WRITER, FALSE, FALSE, tag, &zn->zn_zap);
+ zap = zn->zn_zap;
+ if (err != 0)
+ return;
+ }
+
+ /* could have finished growing while our locks were down */
+ if (zap_f_phys(zap)->zap_ptrtbl.zt_shift == shift)
+ (void) zap_grow_ptrtbl(zap, tx);
+ }
+}
+
+static int
+fzap_checkname(zap_name_t *zn)
+{
+ if (zn->zn_key_orig_numints * zn->zn_key_intlen > ZAP_MAXNAMELEN)
+ return (SET_ERROR(ENAMETOOLONG));
+ return (0);
+}
+
+static int
+fzap_checksize(uint64_t integer_size, uint64_t num_integers)
+{
+ /* Only integer sizes supported by C */
+ switch (integer_size) {
+ case 1:
+ case 2:
+ case 4:
+ case 8:
+ break;
+ default:
+ return (SET_ERROR(EINVAL));
+ }
+
+ if (integer_size * num_integers > ZAP_MAXVALUELEN)
+ return (SET_ERROR(E2BIG));
+
+ return (0);
+}
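
A standalone sketch of the same size checks, with a placeholder limit (the real constant is ZAP_MAXVALUELEN; 8192 below is only an assumed value for illustration):

    #include <stdio.h>
    #include <stdint.h>
    #include <errno.h>

    #define MAXVALUELEN 8192    /* placeholder for ZAP_MAXVALUELEN */

    static int
    checksize(uint64_t integer_size, uint64_t num_integers)
    {
        /* Same shape as fzap_checksize(): C integer widths only. */
        switch (integer_size) {
        case 1:
        case 2:
        case 4:
        case 8:
            break;
        default:
            return (EINVAL);
        }

        if (integer_size * num_integers > MAXVALUELEN)
            return (E2BIG);
        return (0);
    }

    int
    main(void)
    {
        /* 3-byte integers are rejected; an oversized value returns E2BIG. */
        printf("%d %d %d\n",
            checksize(3, 1), checksize(8, 2000), checksize(8, 1000));
        return (0);
    }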
+
+static int
+fzap_check(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers)
+{
+ int err = fzap_checkname(zn);
+ if (err != 0)
+ return (err);
+ return (fzap_checksize(integer_size, num_integers));
+}
+
+/*
+ * Routines for manipulating attributes.
+ */
+int
+fzap_lookup(zap_name_t *zn,
+ uint64_t integer_size, uint64_t num_integers, void *buf,
+ char *realname, int rn_len, boolean_t *ncp)
+{
+ zap_leaf_t *l;
+ zap_entry_handle_t zeh;
+
+ int err = fzap_checkname(zn);
+ if (err != 0)
+ return (err);
+
+ err = zap_deref_leaf(zn->zn_zap, zn->zn_hash, NULL, RW_READER, &l);
+ if (err != 0)
+ return (err);
+ err = zap_leaf_lookup(l, zn, &zeh);
+ if (err == 0) {
+ if ((err = fzap_checksize(integer_size, num_integers)) != 0) {
+ zap_put_leaf(l);
+ return (err);
+ }
+
+ err = zap_entry_read(&zeh, integer_size, num_integers, buf);
+ (void) zap_entry_read_name(zn->zn_zap, &zeh, rn_len, realname);
+ if (ncp) {
+ *ncp = zap_entry_normalization_conflict(&zeh,
+ zn, NULL, zn->zn_zap);
+ }
+ }
+
+ zap_put_leaf(l);
+ return (err);
+}
+
+int
+fzap_add_cd(zap_name_t *zn,
+ uint64_t integer_size, uint64_t num_integers,
+ const void *val, uint32_t cd, void *tag, dmu_tx_t *tx)
+{
+ zap_leaf_t *l;
+ int err;
+ zap_entry_handle_t zeh;
+ zap_t *zap = zn->zn_zap;
+
+ ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
+ ASSERT(!zap->zap_ismicro);
+ ASSERT(fzap_check(zn, integer_size, num_integers) == 0);
+
+ err = zap_deref_leaf(zap, zn->zn_hash, tx, RW_WRITER, &l);
+ if (err != 0)
+ return (err);
+retry:
+ err = zap_leaf_lookup(l, zn, &zeh);
+ if (err == 0) {
+ err = SET_ERROR(EEXIST);
+ goto out;
+ }
+ if (err != ENOENT)
+ goto out;
+
+ err = zap_entry_create(l, zn, cd,
+ integer_size, num_integers, val, &zeh);
+
+ if (err == 0) {
+ zap_increment_num_entries(zap, 1, tx);
+ } else if (err == EAGAIN) {
+ err = zap_expand_leaf(zn, l, tag, tx, &l);
+ zap = zn->zn_zap; /* zap_expand_leaf() may change zap */
+ if (err == 0) {
+ goto retry;
+ } else if (err == ENOSPC) {
+ /*
+ * If we failed to expand the leaf, then bail out, as
+ * there is no point in trying
+ * zap_put_leaf_maybe_grow_ptrtbl().
+ */
+ return (err);
+ }
+ }
+
+out:
+ if (zap != NULL)
+ zap_put_leaf_maybe_grow_ptrtbl(zn, l, tag, tx);
+ return (err);
+}
+
+int
+fzap_add(zap_name_t *zn,
+ uint64_t integer_size, uint64_t num_integers,
+ const void *val, void *tag, dmu_tx_t *tx)
+{
+ int err = fzap_check(zn, integer_size, num_integers);
+ if (err != 0)
+ return (err);
+
+ return (fzap_add_cd(zn, integer_size, num_integers,
+ val, ZAP_NEED_CD, tag, tx));
+}
+
+int
+fzap_update(zap_name_t *zn,
+ int integer_size, uint64_t num_integers, const void *val,
+ void *tag, dmu_tx_t *tx)
+{
+ zap_leaf_t *l;
+ int err;
+ boolean_t create;
+ zap_entry_handle_t zeh;
+ zap_t *zap = zn->zn_zap;
+
+ ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
+ err = fzap_check(zn, integer_size, num_integers);
+ if (err != 0)
+ return (err);
+
+ err = zap_deref_leaf(zap, zn->zn_hash, tx, RW_WRITER, &l);
+ if (err != 0)
+ return (err);
+retry:
+ err = zap_leaf_lookup(l, zn, &zeh);
+ create = (err == ENOENT);
+ ASSERT(err == 0 || err == ENOENT);
+
+ if (create) {
+ err = zap_entry_create(l, zn, ZAP_NEED_CD,
+ integer_size, num_integers, val, &zeh);
+ if (err == 0)
+ zap_increment_num_entries(zap, 1, tx);
+ } else {
+ err = zap_entry_update(&zeh, integer_size, num_integers, val);
+ }
+
+ if (err == EAGAIN) {
+ err = zap_expand_leaf(zn, l, tag, tx, &l);
+ zap = zn->zn_zap; /* zap_expand_leaf() may change zap */
+ if (err == 0)
+ goto retry;
+ }
+
+ if (zap != NULL)
+ zap_put_leaf_maybe_grow_ptrtbl(zn, l, tag, tx);
+ return (err);
+}
+
+int
+fzap_length(zap_name_t *zn,
+ uint64_t *integer_size, uint64_t *num_integers)
+{
+ zap_leaf_t *l;
+ int err;
+ zap_entry_handle_t zeh;
+
+ err = zap_deref_leaf(zn->zn_zap, zn->zn_hash, NULL, RW_READER, &l);
+ if (err != 0)
+ return (err);
+ err = zap_leaf_lookup(l, zn, &zeh);
+ if (err != 0)
+ goto out;
+
+ if (integer_size != 0)
+ *integer_size = zeh.zeh_integer_size;
+ if (num_integers != 0)
+ *num_integers = zeh.zeh_num_integers;
+out:
+ zap_put_leaf(l);
+ return (err);
+}
+
+int
+fzap_remove(zap_name_t *zn, dmu_tx_t *tx)
+{
+ zap_leaf_t *l;
+ int err;
+ zap_entry_handle_t zeh;
+
+ err = zap_deref_leaf(zn->zn_zap, zn->zn_hash, tx, RW_WRITER, &l);
+ if (err != 0)
+ return (err);
+ err = zap_leaf_lookup(l, zn, &zeh);
+ if (err == 0) {
+ zap_entry_remove(&zeh);
+ zap_increment_num_entries(zn->zn_zap, -1, tx);
+ }
+ zap_put_leaf(l);
+ return (err);
+}
+
+void
+fzap_prefetch(zap_name_t *zn)
+{
+ uint64_t blk;
+ zap_t *zap = zn->zn_zap;
+
+ uint64_t idx = ZAP_HASH_IDX(zn->zn_hash,
+ zap_f_phys(zap)->zap_ptrtbl.zt_shift);
+ if (zap_idx_to_blk(zap, idx, &blk) != 0)
+ return;
+ int bs = FZAP_BLOCK_SHIFT(zap);
+ dmu_prefetch(zap->zap_objset, zap->zap_object, 0, blk << bs, 1 << bs,
+ ZIO_PRIORITY_SYNC_READ);
+}
+
+/*
+ * Helper functions for consumers.
+ */
+
+uint64_t
+zap_create_link(objset_t *os, dmu_object_type_t ot, uint64_t parent_obj,
+ const char *name, dmu_tx_t *tx)
+{
+ return (zap_create_link_dnsize(os, ot, parent_obj, name, 0, tx));
+}
+
+uint64_t
+zap_create_link_dnsize(objset_t *os, dmu_object_type_t ot, uint64_t parent_obj,
+ const char *name, int dnodesize, dmu_tx_t *tx)
+{
+ uint64_t new_obj;
+
+ new_obj = zap_create_dnsize(os, ot, DMU_OT_NONE, 0, dnodesize, tx);
+ VERIFY(new_obj != 0);
+ VERIFY0(zap_add(os, parent_obj, name, sizeof (uint64_t), 1, &new_obj,
+ tx));
+
+ return (new_obj);
+}
+
+int
+zap_value_search(objset_t *os, uint64_t zapobj, uint64_t value, uint64_t mask,
+ char *name)
+{
+ zap_cursor_t zc;
+ int err;
+
+ if (mask == 0)
+ mask = -1ULL;
+
+ zap_attribute_t *za = kmem_alloc(sizeof (*za), KM_SLEEP);
+ for (zap_cursor_init(&zc, os, zapobj);
+ (err = zap_cursor_retrieve(&zc, za)) == 0;
+ zap_cursor_advance(&zc)) {
+ if ((za->za_first_integer & mask) == (value & mask)) {
+ (void) strlcpy(name, za->za_name, MAXNAMELEN);
+ break;
+ }
+ }
+ zap_cursor_fini(&zc);
+ kmem_free(za, sizeof (*za));
+ return (err);
+}
+
+int
+zap_join(objset_t *os, uint64_t fromobj, uint64_t intoobj, dmu_tx_t *tx)
+{
+ zap_cursor_t zc;
+ int err = 0;
+
+ zap_attribute_t *za = kmem_alloc(sizeof (*za), KM_SLEEP);
+ for (zap_cursor_init(&zc, os, fromobj);
+ zap_cursor_retrieve(&zc, za) == 0;
+ (void) zap_cursor_advance(&zc)) {
+ if (za->za_integer_length != 8 || za->za_num_integers != 1) {
+ err = SET_ERROR(EINVAL);
+ break;
+ }
+ err = zap_add(os, intoobj, za->za_name,
+ 8, 1, &za->za_first_integer, tx);
+ if (err != 0)
+ break;
+ }
+ zap_cursor_fini(&zc);
+ kmem_free(za, sizeof (*za));
+ return (err);
+}
+
+int
+zap_join_key(objset_t *os, uint64_t fromobj, uint64_t intoobj,
+ uint64_t value, dmu_tx_t *tx)
+{
+ zap_cursor_t zc;
+ int err = 0;
+
+ zap_attribute_t *za = kmem_alloc(sizeof (*za), KM_SLEEP);
+ for (zap_cursor_init(&zc, os, fromobj);
+ zap_cursor_retrieve(&zc, za) == 0;
+ (void) zap_cursor_advance(&zc)) {
+ if (za->za_integer_length != 8 || za->za_num_integers != 1) {
+ err = SET_ERROR(EINVAL);
+ break;
+ }
+ err = zap_add(os, intoobj, za->za_name,
+ 8, 1, &value, tx);
+ if (err != 0)
+ break;
+ }
+ zap_cursor_fini(&zc);
+ kmem_free(za, sizeof (*za));
+ return (err);
+}
+
+int
+zap_join_increment(objset_t *os, uint64_t fromobj, uint64_t intoobj,
+ dmu_tx_t *tx)
+{
+ zap_cursor_t zc;
+ int err = 0;
+
+ zap_attribute_t *za = kmem_alloc(sizeof (*za), KM_SLEEP);
+ for (zap_cursor_init(&zc, os, fromobj);
+ zap_cursor_retrieve(&zc, za) == 0;
+ (void) zap_cursor_advance(&zc)) {
+ uint64_t delta = 0;
+
+ if (za->za_integer_length != 8 || za->za_num_integers != 1) {
+ err = SET_ERROR(EINVAL);
+ break;
+ }
+
+ err = zap_lookup(os, intoobj, za->za_name, 8, 1, &delta);
+ if (err != 0 && err != ENOENT)
+ break;
+ delta += za->za_first_integer;
+ err = zap_update(os, intoobj, za->za_name, 8, 1, &delta, tx);
+ if (err != 0)
+ break;
+ }
+ zap_cursor_fini(&zc);
+ kmem_free(za, sizeof (*za));
+ return (err);
+}
+
+int
+zap_add_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx)
+{
+ char name[20];
+
+ (void) snprintf(name, sizeof (name), "%llx", (longlong_t)value);
+ return (zap_add(os, obj, name, 8, 1, &value, tx));
+}
+
+int
+zap_remove_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx)
+{
+ char name[20];
+
+ (void) snprintf(name, sizeof (name), "%llx", (longlong_t)value);
+ return (zap_remove(os, obj, name, tx));
+}
+
+int
+zap_lookup_int(objset_t *os, uint64_t obj, uint64_t value)
+{
+ char name[20];
+
+ (void) snprintf(name, sizeof (name), "%llx", (longlong_t)value);
+ return (zap_lookup(os, obj, name, 8, 1, &value));
+}
+
+int
+zap_add_int_key(objset_t *os, uint64_t obj,
+ uint64_t key, uint64_t value, dmu_tx_t *tx)
+{
+ char name[20];
+
+ (void) snprintf(name, sizeof (name), "%llx", (longlong_t)key);
+ return (zap_add(os, obj, name, 8, 1, &value, tx));
+}
+
+int
+zap_update_int_key(objset_t *os, uint64_t obj,
+ uint64_t key, uint64_t value, dmu_tx_t *tx)
+{
+ char name[20];
+
+ (void) snprintf(name, sizeof (name), "%llx", (longlong_t)key);
+ return (zap_update(os, obj, name, 8, 1, &value, tx));
+}
+
+int
+zap_lookup_int_key(objset_t *os, uint64_t obj, uint64_t key, uint64_t *valuep)
+{
+ char name[20];
+
+ (void) snprintf(name, sizeof (name), "%llx", (longlong_t)key);
+ return (zap_lookup(os, obj, name, 8, 1, valuep));
+}
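
All of the *_int and *_int_key helpers above share one convention: the entry name is simply the key (or value) printed as a hexadecimal string. A tiny standalone sketch of that naming, using plain userspace types rather than the kernel's longlong_t:

    #include <stdio.h>
    #include <stdint.h>

    int
    main(void)
    {
        char name[20];
        uint64_t value = 123456789ULL;

        /* The entry's name is the integer printed in hex. */
        (void) snprintf(name, sizeof (name), "%llx",
            (unsigned long long)value);
        printf("zap name for %llu is \"%s\"\n",
            (unsigned long long)value, name);
        return (0);
    }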
+
+int
+zap_increment(objset_t *os, uint64_t obj, const char *name, int64_t delta,
+ dmu_tx_t *tx)
+{
+ uint64_t value = 0;
+
+ if (delta == 0)
+ return (0);
+
+ int err = zap_lookup(os, obj, name, 8, 1, &value);
+ if (err != 0 && err != ENOENT)
+ return (err);
+ value += delta;
+ if (value == 0)
+ err = zap_remove(os, obj, name, tx);
+ else
+ err = zap_update(os, obj, name, 8, 1, &value, tx);
+ return (err);
+}
+
+int
+zap_increment_int(objset_t *os, uint64_t obj, uint64_t key, int64_t delta,
+ dmu_tx_t *tx)
+{
+ char name[20];
+
+ (void) snprintf(name, sizeof (name), "%llx", (longlong_t)key);
+ return (zap_increment(os, obj, name, delta, tx));
+}
+
+/*
+ * Routines for iterating over the attributes.
+ */
+
+int
+fzap_cursor_retrieve(zap_t *zap, zap_cursor_t *zc, zap_attribute_t *za)
+{
+ int err = ENOENT;
+ zap_entry_handle_t zeh;
+ zap_leaf_t *l;
+
+ /* retrieve the next entry at or after zc_hash/zc_cd */
+ /* if no entry, return ENOENT */
+
+ /*
+ * If we are reading from the beginning, we're almost certain to
+ * iterate over the entire ZAP object. If there are multiple leaf
+ * blocks (freeblk > 2), prefetch the whole object (up to
+ * dmu_prefetch_max bytes), so that we read the leaf blocks
+ * concurrently. (Unless noprefetch was requested via
+ * zap_cursor_init_noprefetch()).
+ */
+ if (zc->zc_hash == 0 && zap_iterate_prefetch &&
+ zc->zc_prefetch && zap_f_phys(zap)->zap_freeblk > 2) {
+ dmu_prefetch(zc->zc_objset, zc->zc_zapobj, 0, 0,
+ zap_f_phys(zap)->zap_freeblk << FZAP_BLOCK_SHIFT(zap),
+ ZIO_PRIORITY_ASYNC_READ);
+ }
+
+ if (zc->zc_leaf &&
+ (ZAP_HASH_IDX(zc->zc_hash,
+ zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_prefix_len) !=
+ zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_prefix)) {
+ rw_enter(&zc->zc_leaf->l_rwlock, RW_READER);
+ zap_put_leaf(zc->zc_leaf);
+ zc->zc_leaf = NULL;
+ }
+
+again:
+ if (zc->zc_leaf == NULL) {
+ err = zap_deref_leaf(zap, zc->zc_hash, NULL, RW_READER,
+ &zc->zc_leaf);
+ if (err != 0)
+ return (err);
+ } else {
+ rw_enter(&zc->zc_leaf->l_rwlock, RW_READER);
+ }
+ l = zc->zc_leaf;
+
+ err = zap_leaf_lookup_closest(l, zc->zc_hash, zc->zc_cd, &zeh);
+
+ if (err == ENOENT) {
+ if (zap_leaf_phys(l)->l_hdr.lh_prefix_len == 0) {
+ zc->zc_hash = -1ULL;
+ zc->zc_cd = 0;
+ } else {
+ uint64_t nocare = (1ULL <<
+ (64 - zap_leaf_phys(l)->l_hdr.lh_prefix_len)) - 1;
+
+ zc->zc_hash = (zc->zc_hash & ~nocare) + nocare + 1;
+ zc->zc_cd = 0;
+
+ if (zc->zc_hash == 0) {
+ zc->zc_hash = -1ULL;
+ } else {
+ zap_put_leaf(zc->zc_leaf);
+ zc->zc_leaf = NULL;
+ goto again;
+ }
+ }
+ }
+
+ if (err == 0) {
+ zc->zc_hash = zeh.zeh_hash;
+ zc->zc_cd = zeh.zeh_cd;
+ za->za_integer_length = zeh.zeh_integer_size;
+ za->za_num_integers = zeh.zeh_num_integers;
+ if (zeh.zeh_num_integers == 0) {
+ za->za_first_integer = 0;
+ } else {
+ err = zap_entry_read(&zeh, 8, 1, &za->za_first_integer);
+ ASSERT(err == 0 || err == EOVERFLOW);
+ }
+ err = zap_entry_read_name(zap, &zeh,
+ sizeof (za->za_name), za->za_name);
+ ASSERT(err == 0);
+
+ za->za_normalization_conflict =
+ zap_entry_normalization_conflict(&zeh,
+ NULL, za->za_name, zap);
+ }
+ rw_exit(&zc->zc_leaf->l_rwlock);
+ return (err);
+}
+
+static void
+zap_stats_ptrtbl(zap_t *zap, uint64_t *tbl, int len, zap_stats_t *zs)
+{
+ uint64_t lastblk = 0;
+
+ /*
+ * NB: if a leaf has more pointers than an entire ptrtbl block
+ * can hold, then it will be accounted for more than once, since
+ * lastblk only suppresses duplicates within a single block.
+ */
+ for (int i = 0; i < len; i++) {
+ zap_leaf_t *l;
+
+ if (tbl[i] == lastblk)
+ continue;
+ lastblk = tbl[i];
+
+ int err = zap_get_leaf_byblk(zap, tbl[i], NULL, RW_READER, &l);
+ if (err == 0) {
+ zap_leaf_stats(zap, l, zs);
+ zap_put_leaf(l);
+ }
+ }
+}
+
+void
+fzap_get_stats(zap_t *zap, zap_stats_t *zs)
+{
+ int bs = FZAP_BLOCK_SHIFT(zap);
+ zs->zs_blocksize = 1ULL << bs;
+
+ /*
+ * Set zap_phys_t fields
+ */
+ zs->zs_num_leafs = zap_f_phys(zap)->zap_num_leafs;
+ zs->zs_num_entries = zap_f_phys(zap)->zap_num_entries;
+ zs->zs_num_blocks = zap_f_phys(zap)->zap_freeblk;
+ zs->zs_block_type = zap_f_phys(zap)->zap_block_type;
+ zs->zs_magic = zap_f_phys(zap)->zap_magic;
+ zs->zs_salt = zap_f_phys(zap)->zap_salt;
+
+ /*
+ * Set zap_ptrtbl fields
+ */
+ zs->zs_ptrtbl_len = 1ULL << zap_f_phys(zap)->zap_ptrtbl.zt_shift;
+ zs->zs_ptrtbl_nextblk = zap_f_phys(zap)->zap_ptrtbl.zt_nextblk;
+ zs->zs_ptrtbl_blks_copied =
+ zap_f_phys(zap)->zap_ptrtbl.zt_blks_copied;
+ zs->zs_ptrtbl_zt_blk = zap_f_phys(zap)->zap_ptrtbl.zt_blk;
+ zs->zs_ptrtbl_zt_numblks = zap_f_phys(zap)->zap_ptrtbl.zt_numblks;
+ zs->zs_ptrtbl_zt_shift = zap_f_phys(zap)->zap_ptrtbl.zt_shift;
+
+ if (zap_f_phys(zap)->zap_ptrtbl.zt_numblks == 0) {
+ /* the ptrtbl is entirely in the header block. */
+ zap_stats_ptrtbl(zap, &ZAP_EMBEDDED_PTRTBL_ENT(zap, 0),
+ 1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap), zs);
+ } else {
+ dmu_prefetch(zap->zap_objset, zap->zap_object, 0,
+ zap_f_phys(zap)->zap_ptrtbl.zt_blk << bs,
+ zap_f_phys(zap)->zap_ptrtbl.zt_numblks << bs,
+ ZIO_PRIORITY_SYNC_READ);
+
+ for (int b = 0; b < zap_f_phys(zap)->zap_ptrtbl.zt_numblks;
+ b++) {
+ dmu_buf_t *db;
+ int err;
+
+ err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
+ (zap_f_phys(zap)->zap_ptrtbl.zt_blk + b) << bs,
+ FTAG, &db, DMU_READ_NO_PREFETCH);
+ if (err == 0) {
+ zap_stats_ptrtbl(zap, db->db_data,
+ 1<<(bs-3), zs);
+ dmu_buf_rele(db, FTAG);
+ }
+ }
+ }
+}
+
+/* BEGIN CSTYLED */
+ZFS_MODULE_PARAM(zfs, , zap_iterate_prefetch, INT, ZMOD_RW,
+ "When iterating ZAP object, prefetch it");
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/zfs/zap_leaf.c b/sys/contrib/openzfs/module/zfs/zap_leaf.c
new file mode 100644
index 000000000000..aa6c298c3b4b
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/zap_leaf.c
@@ -0,0 +1,849 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013, 2016 by Delphix. All rights reserved.
+ * Copyright 2017 Nexenta Systems, Inc.
+ */
+
+/*
+ * A ZAP leaf occupies a single block and is broken into fixed-size chunks.
+ * Chunk number n means l_chunk[n], even though the header precedes it.
+ * The names are stored null-terminated.
+ */
+
+#include <sys/zio.h>
+#include <sys/spa.h>
+#include <sys/dmu.h>
+#include <sys/zfs_context.h>
+#include <sys/fs/zfs.h>
+#include <sys/zap.h>
+#include <sys/zap_impl.h>
+#include <sys/zap_leaf.h>
+#include <sys/arc.h>
+
+static uint16_t *zap_leaf_rehash_entry(zap_leaf_t *l, uint16_t entry);
+
+#define CHAIN_END 0xffff /* end of the chunk chain */
+
+#define LEAF_HASH(l, h) \
+ ((ZAP_LEAF_HASH_NUMENTRIES(l)-1) & \
+ ((h) >> \
+ (64 - ZAP_LEAF_HASH_SHIFT(l) - zap_leaf_phys(l)->l_hdr.lh_prefix_len)))
+
+#define LEAF_HASH_ENTPTR(l, h) (&zap_leaf_phys(l)->l_hash[LEAF_HASH(l, h)])
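
A standalone sketch of the bucket selection LEAF_HASH() performs, under the assumption (implied by the macros) that the hash table has 1 << ZAP_LEAF_HASH_SHIFT entries; the table size and prefix length below are made up for illustration:

    #include <stdio.h>
    #include <stdint.h>

    /*
     * Bucket selection as in LEAF_HASH(): skip the lh_prefix_len bits that
     * routed the hash to this leaf, then use the next hash_shift bits.
     */
    static unsigned
    leaf_hash(uint64_t h, int hash_shift, int prefix_len)
    {
        unsigned numentries = 1U << hash_shift;

        return ((numentries - 1) &
            (unsigned)(h >> (64 - hash_shift - prefix_len)));
    }

    int
    main(void)
    {
        uint64_t h = 0xdeadbeefcafef00dULL;

        /* Hypothetical 9-bit hash table in a leaf with a 5-bit prefix. */
        printf("bucket = %u\n", leaf_hash(h, 9, 5));
        return (0);
    }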
+
+extern inline zap_leaf_phys_t *zap_leaf_phys(zap_leaf_t *l);
+
+static void
+zap_memset(void *a, int c, size_t n)
+{
+ char *cp = a;
+ char *cpend = cp + n;
+
+ while (cp < cpend)
+ *cp++ = c;
+}
+
+static void
+stv(int len, void *addr, uint64_t value)
+{
+ switch (len) {
+ case 1:
+ *(uint8_t *)addr = value;
+ return;
+ case 2:
+ *(uint16_t *)addr = value;
+ return;
+ case 4:
+ *(uint32_t *)addr = value;
+ return;
+ case 8:
+ *(uint64_t *)addr = value;
+ return;
+ default:
+ cmn_err(CE_PANIC, "bad int len %d", len);
+ }
+}
+
+static uint64_t
+ldv(int len, const void *addr)
+{
+ switch (len) {
+ case 1:
+ return (*(uint8_t *)addr);
+ case 2:
+ return (*(uint16_t *)addr);
+ case 4:
+ return (*(uint32_t *)addr);
+ case 8:
+ return (*(uint64_t *)addr);
+ default:
+ cmn_err(CE_PANIC, "bad int len %d", len);
+ }
+ return (0xFEEDFACEDEADBEEFULL);
+}
+
+void
+zap_leaf_byteswap(zap_leaf_phys_t *buf, int size)
+{
+ zap_leaf_t l;
+ dmu_buf_t l_dbuf;
+
+ l_dbuf.db_data = buf;
+ l.l_bs = highbit64(size) - 1;
+ l.l_dbuf = &l_dbuf;
+
+ buf->l_hdr.lh_block_type = BSWAP_64(buf->l_hdr.lh_block_type);
+ buf->l_hdr.lh_prefix = BSWAP_64(buf->l_hdr.lh_prefix);
+ buf->l_hdr.lh_magic = BSWAP_32(buf->l_hdr.lh_magic);
+ buf->l_hdr.lh_nfree = BSWAP_16(buf->l_hdr.lh_nfree);
+ buf->l_hdr.lh_nentries = BSWAP_16(buf->l_hdr.lh_nentries);
+ buf->l_hdr.lh_prefix_len = BSWAP_16(buf->l_hdr.lh_prefix_len);
+ buf->l_hdr.lh_freelist = BSWAP_16(buf->l_hdr.lh_freelist);
+
+ for (int i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(&l); i++)
+ buf->l_hash[i] = BSWAP_16(buf->l_hash[i]);
+
+ for (int i = 0; i < ZAP_LEAF_NUMCHUNKS(&l); i++) {
+ zap_leaf_chunk_t *lc = &ZAP_LEAF_CHUNK(&l, i);
+ struct zap_leaf_entry *le;
+
+ switch (lc->l_free.lf_type) {
+ case ZAP_CHUNK_ENTRY:
+ le = &lc->l_entry;
+
+ le->le_type = BSWAP_8(le->le_type);
+ le->le_value_intlen = BSWAP_8(le->le_value_intlen);
+ le->le_next = BSWAP_16(le->le_next);
+ le->le_name_chunk = BSWAP_16(le->le_name_chunk);
+ le->le_name_numints = BSWAP_16(le->le_name_numints);
+ le->le_value_chunk = BSWAP_16(le->le_value_chunk);
+ le->le_value_numints = BSWAP_16(le->le_value_numints);
+ le->le_cd = BSWAP_32(le->le_cd);
+ le->le_hash = BSWAP_64(le->le_hash);
+ break;
+ case ZAP_CHUNK_FREE:
+ lc->l_free.lf_type = BSWAP_8(lc->l_free.lf_type);
+ lc->l_free.lf_next = BSWAP_16(lc->l_free.lf_next);
+ break;
+ case ZAP_CHUNK_ARRAY:
+ lc->l_array.la_type = BSWAP_8(lc->l_array.la_type);
+ lc->l_array.la_next = BSWAP_16(lc->l_array.la_next);
+ /* la_array doesn't need swapping */
+ break;
+ default:
+ cmn_err(CE_PANIC, "bad leaf type %d",
+ lc->l_free.lf_type);
+ }
+ }
+}
+
+void
+zap_leaf_init(zap_leaf_t *l, boolean_t sort)
+{
+ l->l_bs = highbit64(l->l_dbuf->db_size) - 1;
+ zap_memset(&zap_leaf_phys(l)->l_hdr, 0,
+ sizeof (struct zap_leaf_header));
+ zap_memset(zap_leaf_phys(l)->l_hash, CHAIN_END,
+ 2*ZAP_LEAF_HASH_NUMENTRIES(l));
+ for (int i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) {
+ ZAP_LEAF_CHUNK(l, i).l_free.lf_type = ZAP_CHUNK_FREE;
+ ZAP_LEAF_CHUNK(l, i).l_free.lf_next = i+1;
+ }
+ ZAP_LEAF_CHUNK(l, ZAP_LEAF_NUMCHUNKS(l)-1).l_free.lf_next = CHAIN_END;
+ zap_leaf_phys(l)->l_hdr.lh_block_type = ZBT_LEAF;
+ zap_leaf_phys(l)->l_hdr.lh_magic = ZAP_LEAF_MAGIC;
+ zap_leaf_phys(l)->l_hdr.lh_nfree = ZAP_LEAF_NUMCHUNKS(l);
+ if (sort)
+ zap_leaf_phys(l)->l_hdr.lh_flags |= ZLF_ENTRIES_CDSORTED;
+}
+
+/*
+ * Routines which manipulate leaf chunks (l_chunk[]).
+ */
+
+static uint16_t
+zap_leaf_chunk_alloc(zap_leaf_t *l)
+{
+ ASSERT(zap_leaf_phys(l)->l_hdr.lh_nfree > 0);
+
+ int chunk = zap_leaf_phys(l)->l_hdr.lh_freelist;
+ ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
+ ASSERT3U(ZAP_LEAF_CHUNK(l, chunk).l_free.lf_type, ==, ZAP_CHUNK_FREE);
+
+ zap_leaf_phys(l)->l_hdr.lh_freelist =
+ ZAP_LEAF_CHUNK(l, chunk).l_free.lf_next;
+
+ zap_leaf_phys(l)->l_hdr.lh_nfree--;
+
+ return (chunk);
+}
+
+static void
+zap_leaf_chunk_free(zap_leaf_t *l, uint16_t chunk)
+{
+ struct zap_leaf_free *zlf = &ZAP_LEAF_CHUNK(l, chunk).l_free;
+ ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_nfree, <, ZAP_LEAF_NUMCHUNKS(l));
+ ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
+ ASSERT(zlf->lf_type != ZAP_CHUNK_FREE);
+
+ zlf->lf_type = ZAP_CHUNK_FREE;
+ zlf->lf_next = zap_leaf_phys(l)->l_hdr.lh_freelist;
+ bzero(zlf->lf_pad, sizeof (zlf->lf_pad)); /* help it to compress */
+ zap_leaf_phys(l)->l_hdr.lh_freelist = chunk;
+
+ zap_leaf_phys(l)->l_hdr.lh_nfree++;
+}
+
+/*
+ * Routines which manipulate leaf arrays (zap_leaf_array type chunks).
+ */
+
+static uint16_t
+zap_leaf_array_create(zap_leaf_t *l, const char *buf,
+ int integer_size, int num_integers)
+{
+ uint16_t chunk_head;
+ uint16_t *chunkp = &chunk_head;
+ int byten = 0;
+ uint64_t value = 0;
+ int shift = (integer_size - 1) * 8;
+ int len = num_integers;
+
+ ASSERT3U(num_integers * integer_size, <=, ZAP_MAXVALUELEN);
+
+ while (len > 0) {
+ uint16_t chunk = zap_leaf_chunk_alloc(l);
+ struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array;
+
+ la->la_type = ZAP_CHUNK_ARRAY;
+ for (int i = 0; i < ZAP_LEAF_ARRAY_BYTES; i++) {
+ if (byten == 0)
+ value = ldv(integer_size, buf);
+ la->la_array[i] = value >> shift;
+ value <<= 8;
+ if (++byten == integer_size) {
+ byten = 0;
+ buf += integer_size;
+ if (--len == 0)
+ break;
+ }
+ }
+
+ *chunkp = chunk;
+ chunkp = &la->la_next;
+ }
+ *chunkp = CHAIN_END;
+
+ return (chunk_head);
+}
+
+static void
+zap_leaf_array_free(zap_leaf_t *l, uint16_t *chunkp)
+{
+ uint16_t chunk = *chunkp;
+
+ *chunkp = CHAIN_END;
+
+ while (chunk != CHAIN_END) {
+ int nextchunk = ZAP_LEAF_CHUNK(l, chunk).l_array.la_next;
+ ASSERT3U(ZAP_LEAF_CHUNK(l, chunk).l_array.la_type, ==,
+ ZAP_CHUNK_ARRAY);
+ zap_leaf_chunk_free(l, chunk);
+ chunk = nextchunk;
+ }
+}
+
+/* array_len and buf_len are in integers, not bytes */
+static void
+zap_leaf_array_read(zap_leaf_t *l, uint16_t chunk,
+ int array_int_len, int array_len, int buf_int_len, uint64_t buf_len,
+ void *buf)
+{
+ int len = MIN(array_len, buf_len);
+ int byten = 0;
+ uint64_t value = 0;
+ char *p = buf;
+
+ ASSERT3U(array_int_len, <=, buf_int_len);
+
+ /* Fast path for one 8-byte integer */
+ if (array_int_len == 8 && buf_int_len == 8 && len == 1) {
+ struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array;
+ uint8_t *ip = la->la_array;
+ uint64_t *buf64 = buf;
+
+ *buf64 = (uint64_t)ip[0] << 56 | (uint64_t)ip[1] << 48 |
+ (uint64_t)ip[2] << 40 | (uint64_t)ip[3] << 32 |
+ (uint64_t)ip[4] << 24 | (uint64_t)ip[5] << 16 |
+ (uint64_t)ip[6] << 8 | (uint64_t)ip[7];
+ return;
+ }
+
+ /* Fast path for an array of 1-byte integers (e.g. the entry name) */
+ if (array_int_len == 1 && buf_int_len == 1 &&
+ buf_len > array_len + ZAP_LEAF_ARRAY_BYTES) {
+ while (chunk != CHAIN_END) {
+ struct zap_leaf_array *la =
+ &ZAP_LEAF_CHUNK(l, chunk).l_array;
+ bcopy(la->la_array, p, ZAP_LEAF_ARRAY_BYTES);
+ p += ZAP_LEAF_ARRAY_BYTES;
+ chunk = la->la_next;
+ }
+ return;
+ }
+
+ while (len > 0) {
+ struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array;
+
+ ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
+ for (int i = 0; i < ZAP_LEAF_ARRAY_BYTES && len > 0; i++) {
+ value = (value << 8) | la->la_array[i];
+ byten++;
+ if (byten == array_int_len) {
+ stv(buf_int_len, p, value);
+ byten = 0;
+ len--;
+ if (len == 0)
+ return;
+ p += buf_int_len;
+ }
+ }
+ chunk = la->la_next;
+ }
+}
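
Array chunks store their integers big-endian, one byte at a time. The following standalone sketch round-trips a 64-bit value the same way zap_leaf_array_create() packs it and the 8-byte fast path above unpacks it:

    #include <stdio.h>
    #include <stdint.h>

    int
    main(void)
    {
        uint64_t value = 0x0102030405060708ULL;
        uint8_t bytes[8];

        /* Pack most-significant byte first, as zap_leaf_array_create() does. */
        for (int i = 0; i < 8; i++)
            bytes[i] = (uint8_t)(value >> (56 - 8 * i));

        /* Unpack, mirroring the 8-byte fast path in zap_leaf_array_read(). */
        uint64_t out = 0;
        for (int i = 0; i < 8; i++)
            out = (out << 8) | bytes[i];

        printf("round trip %s\n", out == value ? "ok" : "BROKEN");
        return (0);
    }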
+
+static boolean_t
+zap_leaf_array_match(zap_leaf_t *l, zap_name_t *zn,
+ int chunk, int array_numints)
+{
+ int bseen = 0;
+
+ if (zap_getflags(zn->zn_zap) & ZAP_FLAG_UINT64_KEY) {
+ uint64_t *thiskey =
+ kmem_alloc(array_numints * sizeof (*thiskey), KM_SLEEP);
+ ASSERT(zn->zn_key_intlen == sizeof (*thiskey));
+
+ zap_leaf_array_read(l, chunk, sizeof (*thiskey), array_numints,
+ sizeof (*thiskey), array_numints, thiskey);
+ boolean_t match = bcmp(thiskey, zn->zn_key_orig,
+ array_numints * sizeof (*thiskey)) == 0;
+ kmem_free(thiskey, array_numints * sizeof (*thiskey));
+ return (match);
+ }
+
+ ASSERT(zn->zn_key_intlen == 1);
+ if (zn->zn_matchtype & MT_NORMALIZE) {
+ char *thisname = kmem_alloc(array_numints, KM_SLEEP);
+
+ zap_leaf_array_read(l, chunk, sizeof (char), array_numints,
+ sizeof (char), array_numints, thisname);
+ boolean_t match = zap_match(zn, thisname);
+ kmem_free(thisname, array_numints);
+ return (match);
+ }
+
+ /*
+ * Fast path for exact matching.
+ * First check that the lengths match, so that we don't read
+ * past the end of the zn_key_orig array.
+ */
+ if (array_numints != zn->zn_key_orig_numints)
+ return (B_FALSE);
+ while (bseen < array_numints) {
+ struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array;
+ int toread = MIN(array_numints - bseen, ZAP_LEAF_ARRAY_BYTES);
+ ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
+ if (bcmp(la->la_array, (char *)zn->zn_key_orig + bseen, toread))
+ break;
+ chunk = la->la_next;
+ bseen += toread;
+ }
+ return (bseen == array_numints);
+}
+
+/*
+ * Routines which manipulate leaf entries.
+ */
+
+int
+zap_leaf_lookup(zap_leaf_t *l, zap_name_t *zn, zap_entry_handle_t *zeh)
+{
+ struct zap_leaf_entry *le;
+
+ ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC);
+
+ for (uint16_t *chunkp = LEAF_HASH_ENTPTR(l, zn->zn_hash);
+ *chunkp != CHAIN_END; chunkp = &le->le_next) {
+ uint16_t chunk = *chunkp;
+ le = ZAP_LEAF_ENTRY(l, chunk);
+
+ ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
+ ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
+
+ if (le->le_hash != zn->zn_hash)
+ continue;
+
+ /*
+ * NB: the entry chain is always sorted by cd on
+ * normalized zap objects, so this will find the
+ * lowest-cd match for MT_NORMALIZE.
+ */
+ ASSERT((zn->zn_matchtype == 0) ||
+ (zap_leaf_phys(l)->l_hdr.lh_flags & ZLF_ENTRIES_CDSORTED));
+ if (zap_leaf_array_match(l, zn, le->le_name_chunk,
+ le->le_name_numints)) {
+ zeh->zeh_num_integers = le->le_value_numints;
+ zeh->zeh_integer_size = le->le_value_intlen;
+ zeh->zeh_cd = le->le_cd;
+ zeh->zeh_hash = le->le_hash;
+ zeh->zeh_chunkp = chunkp;
+ zeh->zeh_leaf = l;
+ return (0);
+ }
+ }
+
+ return (SET_ERROR(ENOENT));
+}
+
+/* Return (h1,cd1 >= h2,cd2) */
+#define HCD_GTEQ(h1, cd1, h2, cd2) \
+ ((h1 > h2) ? TRUE : ((h1 == h2 && cd1 >= cd2) ? TRUE : FALSE))
+
+int
+zap_leaf_lookup_closest(zap_leaf_t *l,
+ uint64_t h, uint32_t cd, zap_entry_handle_t *zeh)
+{
+ uint64_t besth = -1ULL;
+ uint32_t bestcd = -1U;
+ uint16_t bestlh = ZAP_LEAF_HASH_NUMENTRIES(l)-1;
+ struct zap_leaf_entry *le;
+
+ ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC);
+
+ for (uint16_t lh = LEAF_HASH(l, h); lh <= bestlh; lh++) {
+ for (uint16_t chunk = zap_leaf_phys(l)->l_hash[lh];
+ chunk != CHAIN_END; chunk = le->le_next) {
+ le = ZAP_LEAF_ENTRY(l, chunk);
+
+ ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
+ ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
+
+ if (HCD_GTEQ(le->le_hash, le->le_cd, h, cd) &&
+ HCD_GTEQ(besth, bestcd, le->le_hash, le->le_cd)) {
+ ASSERT3U(bestlh, >=, lh);
+ bestlh = lh;
+ besth = le->le_hash;
+ bestcd = le->le_cd;
+
+ zeh->zeh_num_integers = le->le_value_numints;
+ zeh->zeh_integer_size = le->le_value_intlen;
+ zeh->zeh_cd = le->le_cd;
+ zeh->zeh_hash = le->le_hash;
+ zeh->zeh_fakechunk = chunk;
+ zeh->zeh_chunkp = &zeh->zeh_fakechunk;
+ zeh->zeh_leaf = l;
+ }
+ }
+ }
+
+ return (bestcd == -1U ? SET_ERROR(ENOENT) : 0);
+}
+
+int
+zap_entry_read(const zap_entry_handle_t *zeh,
+ uint8_t integer_size, uint64_t num_integers, void *buf)
+{
+ struct zap_leaf_entry *le =
+ ZAP_LEAF_ENTRY(zeh->zeh_leaf, *zeh->zeh_chunkp);
+ ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
+
+ if (le->le_value_intlen > integer_size)
+ return (SET_ERROR(EINVAL));
+
+ zap_leaf_array_read(zeh->zeh_leaf, le->le_value_chunk,
+ le->le_value_intlen, le->le_value_numints,
+ integer_size, num_integers, buf);
+
+ if (zeh->zeh_num_integers > num_integers)
+ return (SET_ERROR(EOVERFLOW));
+ return (0);
+}
+
+int
+zap_entry_read_name(zap_t *zap, const zap_entry_handle_t *zeh, uint16_t buflen,
+ char *buf)
+{
+ struct zap_leaf_entry *le =
+ ZAP_LEAF_ENTRY(zeh->zeh_leaf, *zeh->zeh_chunkp);
+ ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
+
+ if (zap_getflags(zap) & ZAP_FLAG_UINT64_KEY) {
+ zap_leaf_array_read(zeh->zeh_leaf, le->le_name_chunk, 8,
+ le->le_name_numints, 8, buflen / 8, buf);
+ } else {
+ zap_leaf_array_read(zeh->zeh_leaf, le->le_name_chunk, 1,
+ le->le_name_numints, 1, buflen, buf);
+ }
+ if (le->le_name_numints > buflen)
+ return (SET_ERROR(EOVERFLOW));
+ return (0);
+}
+
+int
+zap_entry_update(zap_entry_handle_t *zeh,
+ uint8_t integer_size, uint64_t num_integers, const void *buf)
+{
+ zap_leaf_t *l = zeh->zeh_leaf;
+ struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, *zeh->zeh_chunkp);
+
+ int delta_chunks = ZAP_LEAF_ARRAY_NCHUNKS(num_integers * integer_size) -
+ ZAP_LEAF_ARRAY_NCHUNKS(le->le_value_numints * le->le_value_intlen);
+
+ if ((int)zap_leaf_phys(l)->l_hdr.lh_nfree < delta_chunks)
+ return (SET_ERROR(EAGAIN));
+
+ zap_leaf_array_free(l, &le->le_value_chunk);
+ le->le_value_chunk =
+ zap_leaf_array_create(l, buf, integer_size, num_integers);
+ le->le_value_numints = num_integers;
+ le->le_value_intlen = integer_size;
+ return (0);
+}
+
+void
+zap_entry_remove(zap_entry_handle_t *zeh)
+{
+ zap_leaf_t *l = zeh->zeh_leaf;
+
+ ASSERT3P(zeh->zeh_chunkp, !=, &zeh->zeh_fakechunk);
+
+ uint16_t entry_chunk = *zeh->zeh_chunkp;
+ struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, entry_chunk);
+ ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
+
+ zap_leaf_array_free(l, &le->le_name_chunk);
+ zap_leaf_array_free(l, &le->le_value_chunk);
+
+ *zeh->zeh_chunkp = le->le_next;
+ zap_leaf_chunk_free(l, entry_chunk);
+
+ zap_leaf_phys(l)->l_hdr.lh_nentries--;
+}
+
+int
+zap_entry_create(zap_leaf_t *l, zap_name_t *zn, uint32_t cd,
+ uint8_t integer_size, uint64_t num_integers, const void *buf,
+ zap_entry_handle_t *zeh)
+{
+ uint16_t chunk;
+ struct zap_leaf_entry *le;
+ uint64_t h = zn->zn_hash;
+
+ uint64_t valuelen = integer_size * num_integers;
+
+ int numchunks = 1 + ZAP_LEAF_ARRAY_NCHUNKS(zn->zn_key_orig_numints *
+ zn->zn_key_intlen) + ZAP_LEAF_ARRAY_NCHUNKS(valuelen);
+ if (numchunks > ZAP_LEAF_NUMCHUNKS(l))
+ return (SET_ERROR(E2BIG));
+
+ if (cd == ZAP_NEED_CD) {
+ /* find the lowest unused cd */
+ if (zap_leaf_phys(l)->l_hdr.lh_flags & ZLF_ENTRIES_CDSORTED) {
+ cd = 0;
+
+ for (chunk = *LEAF_HASH_ENTPTR(l, h);
+ chunk != CHAIN_END; chunk = le->le_next) {
+ le = ZAP_LEAF_ENTRY(l, chunk);
+ if (le->le_cd > cd)
+ break;
+ if (le->le_hash == h) {
+ ASSERT3U(cd, ==, le->le_cd);
+ cd++;
+ }
+ }
+ } else {
+ /* old unsorted format; do it the O(n^2) way */
+ for (cd = 0; ; cd++) {
+ for (chunk = *LEAF_HASH_ENTPTR(l, h);
+ chunk != CHAIN_END; chunk = le->le_next) {
+ le = ZAP_LEAF_ENTRY(l, chunk);
+ if (le->le_hash == h &&
+ le->le_cd == cd) {
+ break;
+ }
+ }
+ /* If this cd is not in use, we are good. */
+ if (chunk == CHAIN_END)
+ break;
+ }
+ }
+ /*
+ * We would run out of space in a block before we could
+ * store enough entries to run out of CD values.
+ */
+ ASSERT3U(cd, <, zap_maxcd(zn->zn_zap));
+ }
+
+ if (zap_leaf_phys(l)->l_hdr.lh_nfree < numchunks)
+ return (SET_ERROR(EAGAIN));
+
+ /* make the entry */
+ chunk = zap_leaf_chunk_alloc(l);
+ le = ZAP_LEAF_ENTRY(l, chunk);
+ le->le_type = ZAP_CHUNK_ENTRY;
+ le->le_name_chunk = zap_leaf_array_create(l, zn->zn_key_orig,
+ zn->zn_key_intlen, zn->zn_key_orig_numints);
+ le->le_name_numints = zn->zn_key_orig_numints;
+ le->le_value_chunk =
+ zap_leaf_array_create(l, buf, integer_size, num_integers);
+ le->le_value_numints = num_integers;
+ le->le_value_intlen = integer_size;
+ le->le_hash = h;
+ le->le_cd = cd;
+
+ /* link it into the hash chain */
+ /* XXX if we did the search above, we could just use that */
+ uint16_t *chunkp = zap_leaf_rehash_entry(l, chunk);
+
+ zap_leaf_phys(l)->l_hdr.lh_nentries++;
+
+ zeh->zeh_leaf = l;
+ zeh->zeh_num_integers = num_integers;
+ zeh->zeh_integer_size = le->le_value_intlen;
+ zeh->zeh_cd = le->le_cd;
+ zeh->zeh_hash = le->le_hash;
+ zeh->zeh_chunkp = chunkp;
+
+ return (0);
+}
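
The cd selection in the ZLF_ENTRIES_CDSORTED branch above amounts to walking a cd-sorted chain and bumping cd while it is taken. A standalone sketch with made-up cd values (all entries assumed to share one hash):

    #include <stdio.h>
    #include <stdint.h>

    int
    main(void)
    {
        /* Hypothetical cd values already in use for this hash. */
        uint32_t used[] = { 0, 1, 3 };
        uint32_t cd = 0;

        /* Walk the cd-sorted chain and bump cd while it is taken. */
        for (int i = 0; i < 3; i++) {
            if (used[i] > cd)
                break;
            if (used[i] == cd)
                cd++;
        }
        printf("lowest unused cd = %u\n", cd);  /* prints 2 */
        return (0);
    }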
+
+/*
+ * Determine if there is another entry with the same normalized form.
+ * For performance purposes, either zn or name must be provided (the
+ * other can be NULL). Note, there usually won't be any hash
+ * conflicts, in which case we don't need the concatenated/normalized
+ * form of the name. But all callers have one of these on hand anyway,
+ * so might as well take advantage. A cleaner but slower interface
+ * would accept neither argument, and compute the normalized name as
+ * needed (using zap_name_alloc(zap_entry_read_name(zeh))).
+ */
+boolean_t
+zap_entry_normalization_conflict(zap_entry_handle_t *zeh, zap_name_t *zn,
+ const char *name, zap_t *zap)
+{
+ struct zap_leaf_entry *le;
+ boolean_t allocdzn = B_FALSE;
+
+ if (zap->zap_normflags == 0)
+ return (B_FALSE);
+
+ for (uint16_t chunk = *LEAF_HASH_ENTPTR(zeh->zeh_leaf, zeh->zeh_hash);
+ chunk != CHAIN_END; chunk = le->le_next) {
+ le = ZAP_LEAF_ENTRY(zeh->zeh_leaf, chunk);
+ if (le->le_hash != zeh->zeh_hash)
+ continue;
+ if (le->le_cd == zeh->zeh_cd)
+ continue;
+
+ if (zn == NULL) {
+ zn = zap_name_alloc(zap, name, MT_NORMALIZE);
+ allocdzn = B_TRUE;
+ }
+ if (zap_leaf_array_match(zeh->zeh_leaf, zn,
+ le->le_name_chunk, le->le_name_numints)) {
+ if (allocdzn)
+ zap_name_free(zn);
+ return (B_TRUE);
+ }
+ }
+ if (allocdzn)
+ zap_name_free(zn);
+ return (B_FALSE);
+}
+
+/*
+ * Routines for transferring entries between leaves.
+ */
+
+static uint16_t *
+zap_leaf_rehash_entry(zap_leaf_t *l, uint16_t entry)
+{
+ struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, entry);
+ struct zap_leaf_entry *le2;
+ uint16_t *chunkp;
+
+ /*
+ * Keep the entry chain sorted by cd.
+ * NB: this will not cause problems for unsorted leaves, though
+ * it is unnecessary there.
+ */
+ for (chunkp = LEAF_HASH_ENTPTR(l, le->le_hash);
+ *chunkp != CHAIN_END; chunkp = &le2->le_next) {
+ le2 = ZAP_LEAF_ENTRY(l, *chunkp);
+ if (le2->le_cd > le->le_cd)
+ break;
+ }
+
+ le->le_next = *chunkp;
+ *chunkp = entry;
+ return (chunkp);
+}
+
+static uint16_t
+zap_leaf_transfer_array(zap_leaf_t *l, uint16_t chunk, zap_leaf_t *nl)
+{
+ uint16_t new_chunk;
+ uint16_t *nchunkp = &new_chunk;
+
+ while (chunk != CHAIN_END) {
+ uint16_t nchunk = zap_leaf_chunk_alloc(nl);
+ struct zap_leaf_array *nla =
+ &ZAP_LEAF_CHUNK(nl, nchunk).l_array;
+ struct zap_leaf_array *la =
+ &ZAP_LEAF_CHUNK(l, chunk).l_array;
+ int nextchunk = la->la_next;
+
+ ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
+ ASSERT3U(nchunk, <, ZAP_LEAF_NUMCHUNKS(l));
+
+ *nla = *la; /* structure assignment */
+
+ zap_leaf_chunk_free(l, chunk);
+ chunk = nextchunk;
+ *nchunkp = nchunk;
+ nchunkp = &nla->la_next;
+ }
+ *nchunkp = CHAIN_END;
+ return (new_chunk);
+}
+
+static void
+zap_leaf_transfer_entry(zap_leaf_t *l, int entry, zap_leaf_t *nl)
+{
+ struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, entry);
+ ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
+
+ uint16_t chunk = zap_leaf_chunk_alloc(nl);
+ struct zap_leaf_entry *nle = ZAP_LEAF_ENTRY(nl, chunk);
+ *nle = *le; /* structure assignment */
+
+ (void) zap_leaf_rehash_entry(nl, chunk);
+
+ nle->le_name_chunk = zap_leaf_transfer_array(l, le->le_name_chunk, nl);
+ nle->le_value_chunk =
+ zap_leaf_transfer_array(l, le->le_value_chunk, nl);
+
+ zap_leaf_chunk_free(l, entry);
+
+ zap_leaf_phys(l)->l_hdr.lh_nentries--;
+ zap_leaf_phys(nl)->l_hdr.lh_nentries++;
+}
+
+/*
+ * Transfer the entries whose hash prefix ends in 1 to the new leaf.
+ */
+void
+zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl, boolean_t sort)
+{
+ int bit = 64 - 1 - zap_leaf_phys(l)->l_hdr.lh_prefix_len;
+
+ /* set new prefix and prefix_len */
+ zap_leaf_phys(l)->l_hdr.lh_prefix <<= 1;
+ zap_leaf_phys(l)->l_hdr.lh_prefix_len++;
+ zap_leaf_phys(nl)->l_hdr.lh_prefix =
+ zap_leaf_phys(l)->l_hdr.lh_prefix | 1;
+ zap_leaf_phys(nl)->l_hdr.lh_prefix_len =
+ zap_leaf_phys(l)->l_hdr.lh_prefix_len;
+
+ /* break existing hash chains */
+ zap_memset(zap_leaf_phys(l)->l_hash, CHAIN_END,
+ 2*ZAP_LEAF_HASH_NUMENTRIES(l));
+
+ if (sort)
+ zap_leaf_phys(l)->l_hdr.lh_flags |= ZLF_ENTRIES_CDSORTED;
+
+ /*
+ * Transfer entries whose hash bit 'bit' is set to nl; rehash
+ * the remaining entries
+ *
+ * NB: We could find entries via the hashtable instead. That
+ * would be O(hashents+numents) rather than O(numblks+numents),
+ * but this accesses memory more sequentially, and when we're
+ * called, the block is usually pretty full.
+ */
+ for (int i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) {
+ struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, i);
+ if (le->le_type != ZAP_CHUNK_ENTRY)
+ continue;
+
+ if (le->le_hash & (1ULL << bit))
+ zap_leaf_transfer_entry(l, i, nl);
+ else
+ (void) zap_leaf_rehash_entry(l, i);
+ }
+}
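
A standalone sketch of how the discriminating bit is chosen, with a made-up prefix length: entries whose hash has that bit set belong to the new (prefix|1) leaf, and the rest stay in the old one.

    #include <stdio.h>
    #include <stdint.h>

    int
    main(void)
    {
        int old_prefix_len = 3;                 /* hypothetical prefix length */
        int bit = 64 - 1 - old_prefix_len;      /* first hash bit below the prefix */
        uint64_t hashes[2] = { 0x2000000000000000ULL, 0x3000000000000000ULL };

        for (int i = 0; i < 2; i++)
            printf("hash %016llx -> %s leaf\n",
                (unsigned long long)hashes[i],
                (hashes[i] & (1ULL << bit)) ? "new" : "old");
        return (0);
    }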
+
+void
+zap_leaf_stats(zap_t *zap, zap_leaf_t *l, zap_stats_t *zs)
+{
+ int n = zap_f_phys(zap)->zap_ptrtbl.zt_shift -
+ zap_leaf_phys(l)->l_hdr.lh_prefix_len;
+ n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
+ zs->zs_leafs_with_2n_pointers[n]++;
+
+ n = zap_leaf_phys(l)->l_hdr.lh_nentries/5;
+ n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
+ zs->zs_blocks_with_n5_entries[n]++;
+
+ n = ((1<<FZAP_BLOCK_SHIFT(zap)) -
+ zap_leaf_phys(l)->l_hdr.lh_nfree * (ZAP_LEAF_ARRAY_BYTES+1))*10 /
+ (1<<FZAP_BLOCK_SHIFT(zap));
+ n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
+ zs->zs_blocks_n_tenths_full[n]++;
+
+ for (int i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(l); i++) {
+ int nentries = 0;
+ int chunk = zap_leaf_phys(l)->l_hash[i];
+
+ while (chunk != CHAIN_END) {
+ struct zap_leaf_entry *le =
+ ZAP_LEAF_ENTRY(l, chunk);
+
+ n = 1 + ZAP_LEAF_ARRAY_NCHUNKS(le->le_name_numints) +
+ ZAP_LEAF_ARRAY_NCHUNKS(le->le_value_numints *
+ le->le_value_intlen);
+ n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
+ zs->zs_entries_using_n_chunks[n]++;
+
+ chunk = le->le_next;
+ nentries++;
+ }
+
+ n = nentries;
+ n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
+ zs->zs_buckets_with_n_entries[n]++;
+ }
+}
diff --git a/sys/contrib/openzfs/module/zfs/zap_micro.c b/sys/contrib/openzfs/module/zfs/zap_micro.c
new file mode 100644
index 000000000000..5d9bc2076068
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/zap_micro.c
@@ -0,0 +1,1697 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright 2017 Nexenta Systems, Inc.
+ */
+
+#include <sys/zio.h>
+#include <sys/spa.h>
+#include <sys/dmu.h>
+#include <sys/zfs_context.h>
+#include <sys/zap.h>
+#include <sys/zap_impl.h>
+#include <sys/zap_leaf.h>
+#include <sys/avl.h>
+#include <sys/arc.h>
+#include <sys/dmu_objset.h>
+
+#ifdef _KERNEL
+#include <sys/sunddi.h>
+#endif
+
+extern inline mzap_phys_t *zap_m_phys(zap_t *zap);
+
+static int mzap_upgrade(zap_t **zapp,
+ void *tag, dmu_tx_t *tx, zap_flags_t flags);
+
+uint64_t
+zap_getflags(zap_t *zap)
+{
+ if (zap->zap_ismicro)
+ return (0);
+ return (zap_f_phys(zap)->zap_flags);
+}
+
+int
+zap_hashbits(zap_t *zap)
+{
+ if (zap_getflags(zap) & ZAP_FLAG_HASH64)
+ return (48);
+ else
+ return (28);
+}
+
+uint32_t
+zap_maxcd(zap_t *zap)
+{
+ if (zap_getflags(zap) & ZAP_FLAG_HASH64)
+ return ((1<<16)-1);
+ else
+ return (-1U);
+}
+
+static uint64_t
+zap_hash(zap_name_t *zn)
+{
+ zap_t *zap = zn->zn_zap;
+ uint64_t h = 0;
+
+ if (zap_getflags(zap) & ZAP_FLAG_PRE_HASHED_KEY) {
+ ASSERT(zap_getflags(zap) & ZAP_FLAG_UINT64_KEY);
+ h = *(uint64_t *)zn->zn_key_orig;
+ } else {
+ h = zap->zap_salt;
+ ASSERT(h != 0);
+ ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
+
+ if (zap_getflags(zap) & ZAP_FLAG_UINT64_KEY) {
+ const uint64_t *wp = zn->zn_key_norm;
+
+ ASSERT(zn->zn_key_intlen == 8);
+ for (int i = 0; i < zn->zn_key_norm_numints;
+ wp++, i++) {
+ uint64_t word = *wp;
+
+ for (int j = 0; j < zn->zn_key_intlen; j++) {
+ h = (h >> 8) ^
+ zfs_crc64_table[(h ^ word) & 0xFF];
+ word >>= NBBY;
+ }
+ }
+ } else {
+ const uint8_t *cp = zn->zn_key_norm;
+
+ /*
+ * We previously stored the terminating null on
+ * disk, but didn't hash it, so we need to
+ * continue to not hash it. (The
+ * zn_key_*_numints includes the terminating
+ * null for non-binary keys.)
+ */
+ int len = zn->zn_key_norm_numints - 1;
+
+ ASSERT(zn->zn_key_intlen == 1);
+ for (int i = 0; i < len; cp++, i++) {
+ h = (h >> 8) ^
+ zfs_crc64_table[(h ^ *cp) & 0xFF];
+ }
+ }
+ }
+ /*
+ * Don't use all 64 bits, since we need some in the cookie for
+ * the collision differentiator. We MUST use the high bits,
+ * since those are the ones that we first pay attention to when
+ * choosing the bucket.
+ */
+ h &= ~((1ULL << (64 - zap_hashbits(zap))) - 1);
+
+ return (h);
+}
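
A standalone sketch of the final masking step, assuming a non-HASH64 zap (28 hash bits, as zap_hashbits() returns above) and a made-up hash value:

    #include <stdio.h>
    #include <stdint.h>

    int
    main(void)
    {
        int bits = 28;                          /* zap_hashbits() for a non-HASH64 zap */
        uint64_t h = 0x0123456789abcdefULL;     /* pretend CRC64 result */

        /* Keep only the top 'bits' bits; the rest is reserved for the cd. */
        h &= ~((1ULL << (64 - bits)) - 1);
        printf("masked hash = %016llx\n", (unsigned long long)h);
        return (0);
    }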
+
+static int
+zap_normalize(zap_t *zap, const char *name, char *namenorm, int normflags)
+{
+ ASSERT(!(zap_getflags(zap) & ZAP_FLAG_UINT64_KEY));
+
+ size_t inlen = strlen(name) + 1;
+ size_t outlen = ZAP_MAXNAMELEN;
+
+ int err = 0;
+ (void) u8_textprep_str((char *)name, &inlen, namenorm, &outlen,
+ normflags | U8_TEXTPREP_IGNORE_NULL | U8_TEXTPREP_IGNORE_INVALID,
+ U8_UNICODE_LATEST, &err);
+
+ return (err);
+}
+
+boolean_t
+zap_match(zap_name_t *zn, const char *matchname)
+{
+ ASSERT(!(zap_getflags(zn->zn_zap) & ZAP_FLAG_UINT64_KEY));
+
+ if (zn->zn_matchtype & MT_NORMALIZE) {
+ char norm[ZAP_MAXNAMELEN];
+
+ if (zap_normalize(zn->zn_zap, matchname, norm,
+ zn->zn_normflags) != 0)
+ return (B_FALSE);
+
+ return (strcmp(zn->zn_key_norm, norm) == 0);
+ } else {
+ return (strcmp(zn->zn_key_orig, matchname) == 0);
+ }
+}
+
+void
+zap_name_free(zap_name_t *zn)
+{
+ kmem_free(zn, sizeof (zap_name_t));
+}
+
+zap_name_t *
+zap_name_alloc(zap_t *zap, const char *key, matchtype_t mt)
+{
+ zap_name_t *zn = kmem_alloc(sizeof (zap_name_t), KM_SLEEP);
+
+ zn->zn_zap = zap;
+ zn->zn_key_intlen = sizeof (*key);
+ zn->zn_key_orig = key;
+ zn->zn_key_orig_numints = strlen(zn->zn_key_orig) + 1;
+ zn->zn_matchtype = mt;
+ zn->zn_normflags = zap->zap_normflags;
+
+ /*
+ * If we're dealing with a case-sensitive lookup on a mixed or
+ * insensitive fs, remove U8_TEXTPREP_TOUPPER or the lookup
+ * will fold case to all caps, overriding the lookup request.
+ */
+ if (mt & MT_MATCH_CASE)
+ zn->zn_normflags &= ~U8_TEXTPREP_TOUPPER;
+
+ if (zap->zap_normflags) {
+ /*
+ * We *must* use zap_normflags because this normalization is
+ * what the hash is computed from.
+ */
+ if (zap_normalize(zap, key, zn->zn_normbuf,
+ zap->zap_normflags) != 0) {
+ zap_name_free(zn);
+ return (NULL);
+ }
+ zn->zn_key_norm = zn->zn_normbuf;
+ zn->zn_key_norm_numints = strlen(zn->zn_key_norm) + 1;
+ } else {
+ if (mt != 0) {
+ zap_name_free(zn);
+ return (NULL);
+ }
+ zn->zn_key_norm = zn->zn_key_orig;
+ zn->zn_key_norm_numints = zn->zn_key_orig_numints;
+ }
+
+ zn->zn_hash = zap_hash(zn);
+
+ if (zap->zap_normflags != zn->zn_normflags) {
+ /*
+ * We *must* use zn_normflags because this normalization is
+ * what the matching is based on. (Not the hash!)
+ */
+ if (zap_normalize(zap, key, zn->zn_normbuf,
+ zn->zn_normflags) != 0) {
+ zap_name_free(zn);
+ return (NULL);
+ }
+ zn->zn_key_norm_numints = strlen(zn->zn_key_norm) + 1;
+ }
+
+ return (zn);
+}
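+
+/*
+ * Worked example of the two normalizations above, offered as illustration:
+ * on a zap whose zap_normflags include U8_TEXTPREP_TOUPPER, a lookup with
+ * MT_NORMALIZE | MT_MATCH_CASE clears TOUPPER from zn_normflags.  The hash
+ * is still computed from the zap_normflags (case-folded) form, so "Foo" and
+ * "foo" land in the same bucket, but the second normalization pass leaves
+ * zap_match() comparing the zn_normflags form, so the match itself stays
+ * case-sensitive as requested.
+ */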
+
+static zap_name_t *
+zap_name_alloc_uint64(zap_t *zap, const uint64_t *key, int numints)
+{
+ zap_name_t *zn = kmem_alloc(sizeof (zap_name_t), KM_SLEEP);
+
+ ASSERT(zap->zap_normflags == 0);
+ zn->zn_zap = zap;
+ zn->zn_key_intlen = sizeof (*key);
+ zn->zn_key_orig = zn->zn_key_norm = key;
+ zn->zn_key_orig_numints = zn->zn_key_norm_numints = numints;
+ zn->zn_matchtype = 0;
+
+ zn->zn_hash = zap_hash(zn);
+ return (zn);
+}
+
+static void
+mzap_byteswap(mzap_phys_t *buf, size_t size)
+{
+ buf->mz_block_type = BSWAP_64(buf->mz_block_type);
+ buf->mz_salt = BSWAP_64(buf->mz_salt);
+ buf->mz_normflags = BSWAP_64(buf->mz_normflags);
+ int max = (size / MZAP_ENT_LEN) - 1;
+ for (int i = 0; i < max; i++) {
+ buf->mz_chunk[i].mze_value =
+ BSWAP_64(buf->mz_chunk[i].mze_value);
+ buf->mz_chunk[i].mze_cd =
+ BSWAP_32(buf->mz_chunk[i].mze_cd);
+ }
+}
+
+void
+zap_byteswap(void *buf, size_t size)
+{
+ uint64_t block_type = *(uint64_t *)buf;
+
+ if (block_type == ZBT_MICRO || block_type == BSWAP_64(ZBT_MICRO)) {
+ /* ASSERT(magic == ZAP_LEAF_MAGIC); */
+ mzap_byteswap(buf, size);
+ } else {
+ fzap_byteswap(buf, size);
+ }
+}
+
+static int
+mze_compare(const void *arg1, const void *arg2)
+{
+ const mzap_ent_t *mze1 = arg1;
+ const mzap_ent_t *mze2 = arg2;
+
+ int cmp = TREE_CMP(mze1->mze_hash, mze2->mze_hash);
+ if (likely(cmp))
+ return (cmp);
+
+ return (TREE_CMP(mze1->mze_cd, mze2->mze_cd));
+}
+
+static void
+mze_insert(zap_t *zap, int chunkid, uint64_t hash)
+{
+ ASSERT(zap->zap_ismicro);
+ ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
+
+ mzap_ent_t *mze = kmem_alloc(sizeof (mzap_ent_t), KM_SLEEP);
+ mze->mze_chunkid = chunkid;
+ mze->mze_hash = hash;
+ mze->mze_cd = MZE_PHYS(zap, mze)->mze_cd;
+ ASSERT(MZE_PHYS(zap, mze)->mze_name[0] != 0);
+ avl_add(&zap->zap_m.zap_avl, mze);
+}
+
+static mzap_ent_t *
+mze_find(zap_name_t *zn)
+{
+ mzap_ent_t mze_tofind;
+ mzap_ent_t *mze;
+ avl_index_t idx;
+ avl_tree_t *avl = &zn->zn_zap->zap_m.zap_avl;
+
+ ASSERT(zn->zn_zap->zap_ismicro);
+ ASSERT(RW_LOCK_HELD(&zn->zn_zap->zap_rwlock));
+
+ mze_tofind.mze_hash = zn->zn_hash;
+ mze_tofind.mze_cd = 0;
+
+ mze = avl_find(avl, &mze_tofind, &idx);
+ if (mze == NULL)
+ mze = avl_nearest(avl, idx, AVL_AFTER);
+ for (; mze && mze->mze_hash == zn->zn_hash; mze = AVL_NEXT(avl, mze)) {
+ ASSERT3U(mze->mze_cd, ==, MZE_PHYS(zn->zn_zap, mze)->mze_cd);
+ if (zap_match(zn, MZE_PHYS(zn->zn_zap, mze)->mze_name))
+ return (mze);
+ }
+
+ return (NULL);
+}
+
+static uint32_t
+mze_find_unused_cd(zap_t *zap, uint64_t hash)
+{
+ mzap_ent_t mze_tofind;
+ avl_index_t idx;
+ avl_tree_t *avl = &zap->zap_m.zap_avl;
+
+ ASSERT(zap->zap_ismicro);
+ ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
+
+ mze_tofind.mze_hash = hash;
+ mze_tofind.mze_cd = 0;
+
+ uint32_t cd = 0;
+ for (mzap_ent_t *mze = avl_find(avl, &mze_tofind, &idx);
+ mze && mze->mze_hash == hash; mze = AVL_NEXT(avl, mze)) {
+ if (mze->mze_cd != cd)
+ break;
+ cd++;
+ }
+
+ return (cd);
+}
+
+/*
+ * Each mzap entry requires at most 4 chunks:
+ * 3 chunks for the name + 1 chunk for the value.
+ */
+#define MZAP_ENT_CHUNKS (1 + ZAP_LEAF_ARRAY_NCHUNKS(MZAP_NAME_LEN) + \
+ ZAP_LEAF_ARRAY_NCHUNKS(sizeof (uint64_t)))
+
+/*
+ * Check if the current entry keeps the colliding entries under the fatzap leaf
+ * size.
+ */
+static boolean_t
+mze_canfit_fzap_leaf(zap_name_t *zn, uint64_t hash)
+{
+ zap_t *zap = zn->zn_zap;
+ mzap_ent_t mze_tofind;
+ mzap_ent_t *mze;
+ avl_index_t idx;
+ avl_tree_t *avl = &zap->zap_m.zap_avl;
+ uint32_t mzap_ents = 0;
+
+ mze_tofind.mze_hash = hash;
+ mze_tofind.mze_cd = 0;
+
+ for (mze = avl_find(avl, &mze_tofind, &idx);
+ mze && mze->mze_hash == hash; mze = AVL_NEXT(avl, mze)) {
+ mzap_ents++;
+ }
+
+ /* Include the new entry being added */
+ mzap_ents++;
+
+ return (ZAP_LEAF_NUMCHUNKS_DEF > (mzap_ents * MZAP_ENT_CHUNKS));
+}
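+
+/*
+ * For illustration: with MZAP_ENT_CHUNKS evaluating to 4 (per the comment
+ * above), the test reads (colliding_entries + 1) * 4 < ZAP_LEAF_NUMCHUNKS_DEF,
+ * i.e. a new microzap entry is refused if its hash collision chain could no
+ * longer be packed into one default-sized fatzap leaf after an upgrade.
+ */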
+
+static void
+mze_remove(zap_t *zap, mzap_ent_t *mze)
+{
+ ASSERT(zap->zap_ismicro);
+ ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
+
+ avl_remove(&zap->zap_m.zap_avl, mze);
+ kmem_free(mze, sizeof (mzap_ent_t));
+}
+
+static void
+mze_destroy(zap_t *zap)
+{
+ mzap_ent_t *mze;
+ void *avlcookie = NULL;
+
+ while ((mze = avl_destroy_nodes(&zap->zap_m.zap_avl, &avlcookie)))
+ kmem_free(mze, sizeof (mzap_ent_t));
+ avl_destroy(&zap->zap_m.zap_avl);
+}
+
+static zap_t *
+mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db)
+{
+ zap_t *winner;
+ uint64_t *zap_hdr = (uint64_t *)db->db_data;
+ uint64_t zap_block_type = zap_hdr[0];
+ uint64_t zap_magic = zap_hdr[1];
+
+ ASSERT3U(MZAP_ENT_LEN, ==, sizeof (mzap_ent_phys_t));
+
+ zap_t *zap = kmem_zalloc(sizeof (zap_t), KM_SLEEP);
+ rw_init(&zap->zap_rwlock, NULL, RW_DEFAULT, NULL);
+ rw_enter(&zap->zap_rwlock, RW_WRITER);
+ zap->zap_objset = os;
+ zap->zap_object = obj;
+ zap->zap_dbuf = db;
+
+ if (zap_block_type != ZBT_MICRO) {
+ mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, MUTEX_DEFAULT,
+ 0);
+ zap->zap_f.zap_block_shift = highbit64(db->db_size) - 1;
+ if (zap_block_type != ZBT_HEADER || zap_magic != ZAP_MAGIC) {
+ winner = NULL; /* No actual winner here... */
+ goto handle_winner;
+ }
+ } else {
+ zap->zap_ismicro = TRUE;
+ }
+
+ /*
+ * Make sure that zap_ismicro is set before we let others see
+ * it, because zap_lockdir() checks zap_ismicro without the lock
+ * held.
+ */
+ dmu_buf_init_user(&zap->zap_dbu, zap_evict_sync, NULL, &zap->zap_dbuf);
+ winner = dmu_buf_set_user(db, &zap->zap_dbu);
+
+ if (winner != NULL)
+ goto handle_winner;
+
+ if (zap->zap_ismicro) {
+ zap->zap_salt = zap_m_phys(zap)->mz_salt;
+ zap->zap_normflags = zap_m_phys(zap)->mz_normflags;
+ zap->zap_m.zap_num_chunks = db->db_size / MZAP_ENT_LEN - 1;
+ avl_create(&zap->zap_m.zap_avl, mze_compare,
+ sizeof (mzap_ent_t), offsetof(mzap_ent_t, mze_node));
+
+ for (int i = 0; i < zap->zap_m.zap_num_chunks; i++) {
+ mzap_ent_phys_t *mze =
+ &zap_m_phys(zap)->mz_chunk[i];
+ if (mze->mze_name[0]) {
+ zap_name_t *zn;
+
+ zap->zap_m.zap_num_entries++;
+ zn = zap_name_alloc(zap, mze->mze_name, 0);
+ mze_insert(zap, i, zn->zn_hash);
+ zap_name_free(zn);
+ }
+ }
+ } else {
+ zap->zap_salt = zap_f_phys(zap)->zap_salt;
+ zap->zap_normflags = zap_f_phys(zap)->zap_normflags;
+
+ ASSERT3U(sizeof (struct zap_leaf_header), ==,
+ 2*ZAP_LEAF_CHUNKSIZE);
+
+ /*
+ * The embedded pointer table should not overlap the
+ * other members.
+ */
+ ASSERT3P(&ZAP_EMBEDDED_PTRTBL_ENT(zap, 0), >,
+ &zap_f_phys(zap)->zap_salt);
+
+ /*
+ * The embedded pointer table should end at the end of
+ * the block
+ */
+ ASSERT3U((uintptr_t)&ZAP_EMBEDDED_PTRTBL_ENT(zap,
+ 1<<ZAP_EMBEDDED_PTRTBL_SHIFT(zap)) -
+ (uintptr_t)zap_f_phys(zap), ==,
+ zap->zap_dbuf->db_size);
+ }
+ rw_exit(&zap->zap_rwlock);
+ return (zap);
+
+handle_winner:
+ rw_exit(&zap->zap_rwlock);
+ rw_destroy(&zap->zap_rwlock);
+ if (!zap->zap_ismicro)
+ mutex_destroy(&zap->zap_f.zap_num_entries_mtx);
+ kmem_free(zap, sizeof (zap_t));
+ return (winner);
+}
+
+/*
+ * This routine "consumes" the caller's hold on the dbuf, which must
+ * have the specified tag.
+ */
+static int
+zap_lockdir_impl(dmu_buf_t *db, void *tag, dmu_tx_t *tx,
+ krw_t lti, boolean_t fatreader, boolean_t adding, zap_t **zapp)
+{
+ ASSERT0(db->db_offset);
+ objset_t *os = dmu_buf_get_objset(db);
+ uint64_t obj = db->db_object;
+ dmu_object_info_t doi;
+
+ *zapp = NULL;
+
+ dmu_object_info_from_db(db, &doi);
+ if (DMU_OT_BYTESWAP(doi.doi_type) != DMU_BSWAP_ZAP)
+ return (SET_ERROR(EINVAL));
+
+ zap_t *zap = dmu_buf_get_user(db);
+ if (zap == NULL) {
+ zap = mzap_open(os, obj, db);
+ if (zap == NULL) {
+ /*
+ * mzap_open() didn't like what it saw on-disk.
+ * Check for corruption!
+ */
+ return (SET_ERROR(EIO));
+ }
+ }
+
+ /*
+ * We're checking zap_ismicro without the lock held, in order to
+ * tell what type of lock we want. Once we have some sort of
+ * lock, see if it really is the right type. In practice this
+ * can only be different if it was upgraded from micro to fat,
+ * and micro wanted WRITER but fat only needs READER.
+ */
+ krw_t lt = (!zap->zap_ismicro && fatreader) ? RW_READER : lti;
+ rw_enter(&zap->zap_rwlock, lt);
+ if (lt != ((!zap->zap_ismicro && fatreader) ? RW_READER : lti)) {
+ /* it was upgraded, now we only need reader */
+ ASSERT(lt == RW_WRITER);
+ ASSERT(RW_READER ==
+ ((!zap->zap_ismicro && fatreader) ? RW_READER : lti));
+ rw_downgrade(&zap->zap_rwlock);
+ lt = RW_READER;
+ }
+
+ zap->zap_objset = os;
+
+ if (lt == RW_WRITER)
+ dmu_buf_will_dirty(db, tx);
+
+ ASSERT3P(zap->zap_dbuf, ==, db);
+
+ ASSERT(!zap->zap_ismicro ||
+ zap->zap_m.zap_num_entries <= zap->zap_m.zap_num_chunks);
+ if (zap->zap_ismicro && tx && adding &&
+ zap->zap_m.zap_num_entries == zap->zap_m.zap_num_chunks) {
+ uint64_t newsz = db->db_size + SPA_MINBLOCKSIZE;
+ if (newsz > MZAP_MAX_BLKSZ) {
+ dprintf("upgrading obj %llu: num_entries=%u\n",
+ obj, zap->zap_m.zap_num_entries);
+ *zapp = zap;
+ int err = mzap_upgrade(zapp, tag, tx, 0);
+ if (err != 0)
+ rw_exit(&zap->zap_rwlock);
+ return (err);
+ }
+ VERIFY0(dmu_object_set_blocksize(os, obj, newsz, 0, tx));
+ zap->zap_m.zap_num_chunks =
+ db->db_size / MZAP_ENT_LEN - 1;
+ }
+
+ *zapp = zap;
+ return (0);
+}
+
+static int
+zap_lockdir_by_dnode(dnode_t *dn, dmu_tx_t *tx,
+ krw_t lti, boolean_t fatreader, boolean_t adding, void *tag, zap_t **zapp)
+{
+ dmu_buf_t *db;
+
+ int err = dmu_buf_hold_by_dnode(dn, 0, tag, &db, DMU_READ_NO_PREFETCH);
+ if (err != 0) {
+ return (err);
+ }
+#ifdef ZFS_DEBUG
+ {
+ dmu_object_info_t doi;
+ dmu_object_info_from_db(db, &doi);
+ ASSERT3U(DMU_OT_BYTESWAP(doi.doi_type), ==, DMU_BSWAP_ZAP);
+ }
+#endif
+
+ err = zap_lockdir_impl(db, tag, tx, lti, fatreader, adding, zapp);
+ if (err != 0) {
+ dmu_buf_rele(db, tag);
+ }
+ return (err);
+}
+
+int
+zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx,
+ krw_t lti, boolean_t fatreader, boolean_t adding, void *tag, zap_t **zapp)
+{
+ dmu_buf_t *db;
+
+ int err = dmu_buf_hold(os, obj, 0, tag, &db, DMU_READ_NO_PREFETCH);
+ if (err != 0)
+ return (err);
+#ifdef ZFS_DEBUG
+ {
+ dmu_object_info_t doi;
+ dmu_object_info_from_db(db, &doi);
+ ASSERT3U(DMU_OT_BYTESWAP(doi.doi_type), ==, DMU_BSWAP_ZAP);
+ }
+#endif
+ err = zap_lockdir_impl(db, tag, tx, lti, fatreader, adding, zapp);
+ if (err != 0)
+ dmu_buf_rele(db, tag);
+ return (err);
+}
+
+void
+zap_unlockdir(zap_t *zap, void *tag)
+{
+ rw_exit(&zap->zap_rwlock);
+ dmu_buf_rele(zap->zap_dbuf, tag);
+}
+
+static int
+mzap_upgrade(zap_t **zapp, void *tag, dmu_tx_t *tx, zap_flags_t flags)
+{
+ int err = 0;
+ zap_t *zap = *zapp;
+
+ ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
+
+ int sz = zap->zap_dbuf->db_size;
+ mzap_phys_t *mzp = vmem_alloc(sz, KM_SLEEP);
+ bcopy(zap->zap_dbuf->db_data, mzp, sz);
+ int nchunks = zap->zap_m.zap_num_chunks;
+
+ if (!flags) {
+ err = dmu_object_set_blocksize(zap->zap_objset, zap->zap_object,
+ 1ULL << fzap_default_block_shift, 0, tx);
+ if (err != 0) {
+ vmem_free(mzp, sz);
+ return (err);
+ }
+ }
+
+ dprintf("upgrading obj=%llu with %u chunks\n",
+ zap->zap_object, nchunks);
+ /* XXX destroy the avl later, so we can use the stored hash value */
+ mze_destroy(zap);
+
+ fzap_upgrade(zap, tx, flags);
+
+ for (int i = 0; i < nchunks; i++) {
+ mzap_ent_phys_t *mze = &mzp->mz_chunk[i];
+ if (mze->mze_name[0] == 0)
+ continue;
+ dprintf("adding %s=%llu\n",
+ mze->mze_name, mze->mze_value);
+ zap_name_t *zn = zap_name_alloc(zap, mze->mze_name, 0);
+ /* If we fail here, we would end up losing entries */
+ VERIFY0(fzap_add_cd(zn, 8, 1, &mze->mze_value, mze->mze_cd,
+ tag, tx));
+ zap = zn->zn_zap; /* fzap_add_cd() may change zap */
+ zap_name_free(zn);
+ }
+ vmem_free(mzp, sz);
+ *zapp = zap;
+ return (0);
+}
+
+/*
+ * The "normflags" determine the behavior of the matchtype_t which is
+ * passed to zap_lookup_norm(). Names which have the same normalized
+ * version will be stored with the same hash value, and therefore we can
+ * perform normalization-insensitive lookups. We can be Unicode form-
+ * insensitive and/or case-insensitive. The following flags are valid for
+ * "normflags":
+ *
+ * U8_TEXTPREP_NFC
+ * U8_TEXTPREP_NFD
+ * U8_TEXTPREP_NFKC
+ * U8_TEXTPREP_NFKD
+ * U8_TEXTPREP_TOUPPER
+ *
+ * The *_NF* (Normalization Form) flags are mutually exclusive; at most one
+ * of them may be supplied.
+ */
+void
+mzap_create_impl(dnode_t *dn, int normflags, zap_flags_t flags, dmu_tx_t *tx)
+{
+ dmu_buf_t *db;
+
+ VERIFY0(dmu_buf_hold_by_dnode(dn, 0, FTAG, &db, DMU_READ_NO_PREFETCH));
+
+ dmu_buf_will_dirty(db, tx);
+ mzap_phys_t *zp = db->db_data;
+ zp->mz_block_type = ZBT_MICRO;
+ zp->mz_salt =
+ ((uintptr_t)db ^ (uintptr_t)tx ^ (dn->dn_object << 1)) | 1ULL;
+ zp->mz_normflags = normflags;
+
+ if (flags != 0) {
+ zap_t *zap;
+ /* Only fat zap supports flags; upgrade immediately. */
+ VERIFY0(zap_lockdir_impl(db, FTAG, tx, RW_WRITER,
+ B_FALSE, B_FALSE, &zap));
+ VERIFY0(mzap_upgrade(&zap, FTAG, tx, flags));
+ zap_unlockdir(zap, FTAG);
+ } else {
+ dmu_buf_rele(db, FTAG);
+ }
+}
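+
+/*
+ * Hypothetical usage sketch of the normflags described above, assuming an
+ * open objset and an assigned tx (zap_create_norm() is defined below):
+ *
+ *     uint64_t obj = zap_create_norm(os,
+ *         U8_TEXTPREP_TOUPPER | U8_TEXTPREP_NFC,
+ *         DMU_OT_DIRECTORY_CONTENTS, DMU_OT_NONE, 0, tx);
+ *
+ * This yields a ZAP that can service normalization- and case-insensitive
+ * lookups (via MT_NORMALIZE) as described above.
+ */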
+
+static uint64_t
+zap_create_impl(objset_t *os, int normflags, zap_flags_t flags,
+ dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift,
+ dmu_object_type_t bonustype, int bonuslen, int dnodesize,
+ dnode_t **allocated_dnode, void *tag, dmu_tx_t *tx)
+{
+ uint64_t obj;
+
+ ASSERT3U(DMU_OT_BYTESWAP(ot), ==, DMU_BSWAP_ZAP);
+
+ if (allocated_dnode == NULL) {
+ dnode_t *dn;
+ obj = dmu_object_alloc_hold(os, ot, 1ULL << leaf_blockshift,
+ indirect_blockshift, bonustype, bonuslen, dnodesize,
+ &dn, FTAG, tx);
+ mzap_create_impl(dn, normflags, flags, tx);
+ dnode_rele(dn, FTAG);
+ } else {
+ obj = dmu_object_alloc_hold(os, ot, 1ULL << leaf_blockshift,
+ indirect_blockshift, bonustype, bonuslen, dnodesize,
+ allocated_dnode, tag, tx);
+ mzap_create_impl(*allocated_dnode, normflags, flags, tx);
+ }
+
+ return (obj);
+}
+
+int
+zap_create_claim(objset_t *os, uint64_t obj, dmu_object_type_t ot,
+ dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
+{
+ return (zap_create_claim_dnsize(os, obj, ot, bonustype, bonuslen,
+ 0, tx));
+}
+
+int
+zap_create_claim_dnsize(objset_t *os, uint64_t obj, dmu_object_type_t ot,
+ dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
+{
+ return (zap_create_claim_norm_dnsize(os, obj,
+ 0, ot, bonustype, bonuslen, dnodesize, tx));
+}
+
+int
+zap_create_claim_norm(objset_t *os, uint64_t obj, int normflags,
+ dmu_object_type_t ot,
+ dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
+{
+ return (zap_create_claim_norm_dnsize(os, obj, normflags, ot, bonustype,
+ bonuslen, 0, tx));
+}
+
+int
+zap_create_claim_norm_dnsize(objset_t *os, uint64_t obj, int normflags,
+ dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen,
+ int dnodesize, dmu_tx_t *tx)
+{
+ dnode_t *dn;
+ int error;
+
+ ASSERT3U(DMU_OT_BYTESWAP(ot), ==, DMU_BSWAP_ZAP);
+ error = dmu_object_claim_dnsize(os, obj, ot, 0, bonustype, bonuslen,
+ dnodesize, tx);
+ if (error != 0)
+ return (error);
+
+ error = dnode_hold(os, obj, FTAG, &dn);
+ if (error != 0)
+ return (error);
+
+ mzap_create_impl(dn, normflags, 0, tx);
+
+ dnode_rele(dn, FTAG);
+
+ return (0);
+}
+
+uint64_t
+zap_create(objset_t *os, dmu_object_type_t ot,
+ dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
+{
+ return (zap_create_norm(os, 0, ot, bonustype, bonuslen, tx));
+}
+
+uint64_t
+zap_create_dnsize(objset_t *os, dmu_object_type_t ot,
+ dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
+{
+ return (zap_create_norm_dnsize(os, 0, ot, bonustype, bonuslen,
+ dnodesize, tx));
+}
+
+uint64_t
+zap_create_norm(objset_t *os, int normflags, dmu_object_type_t ot,
+ dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
+{
+ return (zap_create_norm_dnsize(os, normflags, ot, bonustype, bonuslen,
+ 0, tx));
+}
+
+uint64_t
+zap_create_norm_dnsize(objset_t *os, int normflags, dmu_object_type_t ot,
+ dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
+{
+ return (zap_create_impl(os, normflags, 0, ot, 0, 0,
+ bonustype, bonuslen, dnodesize, NULL, NULL, tx));
+}
+
+uint64_t
+zap_create_flags(objset_t *os, int normflags, zap_flags_t flags,
+ dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift,
+ dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
+{
+ return (zap_create_flags_dnsize(os, normflags, flags, ot,
+ leaf_blockshift, indirect_blockshift, bonustype, bonuslen, 0, tx));
+}
+
+uint64_t
+zap_create_flags_dnsize(objset_t *os, int normflags, zap_flags_t flags,
+ dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift,
+ dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
+{
+ return (zap_create_impl(os, normflags, flags, ot, leaf_blockshift,
+ indirect_blockshift, bonustype, bonuslen, dnodesize, NULL, NULL,
+ tx));
+}
+
+/*
+ * Create a zap object and return a pointer to the newly allocated dnode via
+ * the allocated_dnode argument. The returned dnode will be held and the
+ * caller is responsible for releasing the hold by calling dnode_rele().
+ */
+uint64_t
+zap_create_hold(objset_t *os, int normflags, zap_flags_t flags,
+ dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift,
+ dmu_object_type_t bonustype, int bonuslen, int dnodesize,
+ dnode_t **allocated_dnode, void *tag, dmu_tx_t *tx)
+{
+ return (zap_create_impl(os, normflags, flags, ot, leaf_blockshift,
+ indirect_blockshift, bonustype, bonuslen, dnodesize,
+ allocated_dnode, tag, tx));
+}
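+
+/*
+ * Hypothetical caller sketch (ot and leaf_shift are placeholders): since the
+ * returned dnode is held, the call must be paired with dnode_rele():
+ *
+ *     dnode_t *dn;
+ *     uint64_t obj = zap_create_hold(os, 0, 0, ot, leaf_shift, 0,
+ *         DMU_OT_NONE, 0, 0, &dn, FTAG, tx);
+ *     ... use obj and the held dn ...
+ *     dnode_rele(dn, FTAG);
+ */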
+
+int
+zap_destroy(objset_t *os, uint64_t zapobj, dmu_tx_t *tx)
+{
+ /*
+ * dmu_object_free will free the object number and free the
+ * data. Freeing the data will cause our pageout function to be
+ * called, which will destroy our data (zap_leaf_t's and zap_t).
+ */
+
+ return (dmu_object_free(os, zapobj, tx));
+}
+
+void
+zap_evict_sync(void *dbu)
+{
+ zap_t *zap = dbu;
+
+ rw_destroy(&zap->zap_rwlock);
+
+ if (zap->zap_ismicro)
+ mze_destroy(zap);
+ else
+ mutex_destroy(&zap->zap_f.zap_num_entries_mtx);
+
+ kmem_free(zap, sizeof (zap_t));
+}
+
+int
+zap_count(objset_t *os, uint64_t zapobj, uint64_t *count)
+{
+ zap_t *zap;
+
+ int err =
+ zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
+ if (err != 0)
+ return (err);
+ if (!zap->zap_ismicro) {
+ err = fzap_count(zap, count);
+ } else {
+ *count = zap->zap_m.zap_num_entries;
+ }
+ zap_unlockdir(zap, FTAG);
+ return (err);
+}
+
+/*
+ * zn may be NULL; if not specified, it will be computed if needed.
+ * See also the comment above zap_entry_normalization_conflict().
+ */
+static boolean_t
+mzap_normalization_conflict(zap_t *zap, zap_name_t *zn, mzap_ent_t *mze)
+{
+ int direction = AVL_BEFORE;
+ boolean_t allocdzn = B_FALSE;
+
+ if (zap->zap_normflags == 0)
+ return (B_FALSE);
+
+again:
+ for (mzap_ent_t *other = avl_walk(&zap->zap_m.zap_avl, mze, direction);
+ other && other->mze_hash == mze->mze_hash;
+ other = avl_walk(&zap->zap_m.zap_avl, other, direction)) {
+
+ if (zn == NULL) {
+ zn = zap_name_alloc(zap, MZE_PHYS(zap, mze)->mze_name,
+ MT_NORMALIZE);
+ allocdzn = B_TRUE;
+ }
+ if (zap_match(zn, MZE_PHYS(zap, other)->mze_name)) {
+ if (allocdzn)
+ zap_name_free(zn);
+ return (B_TRUE);
+ }
+ }
+
+ if (direction == AVL_BEFORE) {
+ direction = AVL_AFTER;
+ goto again;
+ }
+
+ if (allocdzn)
+ zap_name_free(zn);
+ return (B_FALSE);
+}
+
+/*
+ * Routines for manipulating attributes.
+ */
+
+int
+zap_lookup(objset_t *os, uint64_t zapobj, const char *name,
+ uint64_t integer_size, uint64_t num_integers, void *buf)
+{
+ return (zap_lookup_norm(os, zapobj, name, integer_size,
+ num_integers, buf, 0, NULL, 0, NULL));
+}
+
+static int
+zap_lookup_impl(zap_t *zap, const char *name,
+ uint64_t integer_size, uint64_t num_integers, void *buf,
+ matchtype_t mt, char *realname, int rn_len,
+ boolean_t *ncp)
+{
+ int err = 0;
+
+ zap_name_t *zn = zap_name_alloc(zap, name, mt);
+ if (zn == NULL)
+ return (SET_ERROR(ENOTSUP));
+
+ if (!zap->zap_ismicro) {
+ err = fzap_lookup(zn, integer_size, num_integers, buf,
+ realname, rn_len, ncp);
+ } else {
+ mzap_ent_t *mze = mze_find(zn);
+ if (mze == NULL) {
+ err = SET_ERROR(ENOENT);
+ } else {
+ if (num_integers < 1) {
+ err = SET_ERROR(EOVERFLOW);
+ } else if (integer_size != 8) {
+ err = SET_ERROR(EINVAL);
+ } else {
+ *(uint64_t *)buf =
+ MZE_PHYS(zap, mze)->mze_value;
+ (void) strlcpy(realname,
+ MZE_PHYS(zap, mze)->mze_name, rn_len);
+ if (ncp) {
+ *ncp = mzap_normalization_conflict(zap,
+ zn, mze);
+ }
+ }
+ }
+ }
+ zap_name_free(zn);
+ return (err);
+}
+
+int
+zap_lookup_norm(objset_t *os, uint64_t zapobj, const char *name,
+ uint64_t integer_size, uint64_t num_integers, void *buf,
+ matchtype_t mt, char *realname, int rn_len,
+ boolean_t *ncp)
+{
+ zap_t *zap;
+
+ int err =
+ zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
+ if (err != 0)
+ return (err);
+ err = zap_lookup_impl(zap, name, integer_size,
+ num_integers, buf, mt, realname, rn_len, ncp);
+ zap_unlockdir(zap, FTAG);
+ return (err);
+}
+
+int
+zap_prefetch(objset_t *os, uint64_t zapobj, const char *name)
+{
+ zap_t *zap;
+ int err;
+ zap_name_t *zn;
+
+ err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
+ if (err)
+ return (err);
+ zn = zap_name_alloc(zap, name, 0);
+ if (zn == NULL) {
+ zap_unlockdir(zap, FTAG);
+ return (SET_ERROR(ENOTSUP));
+ }
+
+ fzap_prefetch(zn);
+ zap_name_free(zn);
+ zap_unlockdir(zap, FTAG);
+ return (err);
+}
+
+int
+zap_lookup_by_dnode(dnode_t *dn, const char *name,
+ uint64_t integer_size, uint64_t num_integers, void *buf)
+{
+ return (zap_lookup_norm_by_dnode(dn, name, integer_size,
+ num_integers, buf, 0, NULL, 0, NULL));
+}
+
+int
+zap_lookup_norm_by_dnode(dnode_t *dn, const char *name,
+ uint64_t integer_size, uint64_t num_integers, void *buf,
+ matchtype_t mt, char *realname, int rn_len,
+ boolean_t *ncp)
+{
+ zap_t *zap;
+
+ int err = zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE,
+ FTAG, &zap);
+ if (err != 0)
+ return (err);
+ err = zap_lookup_impl(zap, name, integer_size,
+ num_integers, buf, mt, realname, rn_len, ncp);
+ zap_unlockdir(zap, FTAG);
+ return (err);
+}
+
+int
+zap_prefetch_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
+ int key_numints)
+{
+ zap_t *zap;
+
+ int err =
+ zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
+ if (err != 0)
+ return (err);
+ zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
+ if (zn == NULL) {
+ zap_unlockdir(zap, FTAG);
+ return (SET_ERROR(ENOTSUP));
+ }
+
+ fzap_prefetch(zn);
+ zap_name_free(zn);
+ zap_unlockdir(zap, FTAG);
+ return (err);
+}
+
+int
+zap_lookup_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
+ int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf)
+{
+ zap_t *zap;
+
+ int err =
+ zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
+ if (err != 0)
+ return (err);
+ zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
+ if (zn == NULL) {
+ zap_unlockdir(zap, FTAG);
+ return (SET_ERROR(ENOTSUP));
+ }
+
+ err = fzap_lookup(zn, integer_size, num_integers, buf,
+ NULL, 0, NULL);
+ zap_name_free(zn);
+ zap_unlockdir(zap, FTAG);
+ return (err);
+}
+
+int
+zap_contains(objset_t *os, uint64_t zapobj, const char *name)
+{
+ int err = zap_lookup_norm(os, zapobj, name, 0,
+ 0, NULL, 0, NULL, 0, NULL);
+ if (err == EOVERFLOW || err == EINVAL)
+ err = 0; /* found, but skipped reading the value */
+ return (err);
+}
+
+int
+zap_length(objset_t *os, uint64_t zapobj, const char *name,
+ uint64_t *integer_size, uint64_t *num_integers)
+{
+ zap_t *zap;
+
+ int err =
+ zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
+ if (err != 0)
+ return (err);
+ zap_name_t *zn = zap_name_alloc(zap, name, 0);
+ if (zn == NULL) {
+ zap_unlockdir(zap, FTAG);
+ return (SET_ERROR(ENOTSUP));
+ }
+ if (!zap->zap_ismicro) {
+ err = fzap_length(zn, integer_size, num_integers);
+ } else {
+ mzap_ent_t *mze = mze_find(zn);
+ if (mze == NULL) {
+ err = SET_ERROR(ENOENT);
+ } else {
+ if (integer_size)
+ *integer_size = 8;
+ if (num_integers)
+ *num_integers = 1;
+ }
+ }
+ zap_name_free(zn);
+ zap_unlockdir(zap, FTAG);
+ return (err);
+}
+
+int
+zap_length_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
+ int key_numints, uint64_t *integer_size, uint64_t *num_integers)
+{
+ zap_t *zap;
+
+ int err =
+ zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
+ if (err != 0)
+ return (err);
+ zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
+ if (zn == NULL) {
+ zap_unlockdir(zap, FTAG);
+ return (SET_ERROR(ENOTSUP));
+ }
+ err = fzap_length(zn, integer_size, num_integers);
+ zap_name_free(zn);
+ zap_unlockdir(zap, FTAG);
+ return (err);
+}
+
+static void
+mzap_addent(zap_name_t *zn, uint64_t value)
+{
+ zap_t *zap = zn->zn_zap;
+ int start = zap->zap_m.zap_alloc_next;
+
+ ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
+
+#ifdef ZFS_DEBUG
+ for (int i = 0; i < zap->zap_m.zap_num_chunks; i++) {
+ mzap_ent_phys_t *mze = &zap_m_phys(zap)->mz_chunk[i];
+ ASSERT(strcmp(zn->zn_key_orig, mze->mze_name) != 0);
+ }
+#endif
+
+ uint32_t cd = mze_find_unused_cd(zap, zn->zn_hash);
+ /* given the limited size of the microzap, this can't happen */
+ ASSERT(cd < zap_maxcd(zap));
+
+again:
+ for (int i = start; i < zap->zap_m.zap_num_chunks; i++) {
+ mzap_ent_phys_t *mze = &zap_m_phys(zap)->mz_chunk[i];
+ if (mze->mze_name[0] == 0) {
+ mze->mze_value = value;
+ mze->mze_cd = cd;
+ (void) strlcpy(mze->mze_name, zn->zn_key_orig,
+ sizeof (mze->mze_name));
+ zap->zap_m.zap_num_entries++;
+ zap->zap_m.zap_alloc_next = i+1;
+ if (zap->zap_m.zap_alloc_next ==
+ zap->zap_m.zap_num_chunks)
+ zap->zap_m.zap_alloc_next = 0;
+ mze_insert(zap, i, zn->zn_hash);
+ return;
+ }
+ }
+ if (start != 0) {
+ start = 0;
+ goto again;
+ }
+ cmn_err(CE_PANIC, "out of entries!");
+}
+
+static int
+zap_add_impl(zap_t *zap, const char *key,
+ int integer_size, uint64_t num_integers,
+ const void *val, dmu_tx_t *tx, void *tag)
+{
+ const uint64_t *intval = val;
+ int err = 0;
+
+ zap_name_t *zn = zap_name_alloc(zap, key, 0);
+ if (zn == NULL) {
+ zap_unlockdir(zap, tag);
+ return (SET_ERROR(ENOTSUP));
+ }
+ if (!zap->zap_ismicro) {
+ err = fzap_add(zn, integer_size, num_integers, val, tag, tx);
+ zap = zn->zn_zap; /* fzap_add() may change zap */
+ } else if (integer_size != 8 || num_integers != 1 ||
+ strlen(key) >= MZAP_NAME_LEN ||
+ !mze_canfit_fzap_leaf(zn, zn->zn_hash)) {
+ err = mzap_upgrade(&zn->zn_zap, tag, tx, 0);
+ if (err == 0) {
+ err = fzap_add(zn, integer_size, num_integers, val,
+ tag, tx);
+ }
+ zap = zn->zn_zap; /* fzap_add() may change zap */
+ } else {
+ if (mze_find(zn) != NULL) {
+ err = SET_ERROR(EEXIST);
+ } else {
+ mzap_addent(zn, *intval);
+ }
+ }
+ ASSERT(zap == zn->zn_zap);
+ zap_name_free(zn);
+ if (zap != NULL) /* may be NULL if fzap_add() failed */
+ zap_unlockdir(zap, tag);
+ return (err);
+}
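+
+/*
+ * For illustration of the upgrade path above: adding a 32-byte value
+ * (integer_size == 1, num_integers == 32) cannot be stored in a microzap's
+ * single 64-bit slot, so zap_add_impl() calls mzap_upgrade() and retries the
+ * insert through fzap_add(); the same happens for a key of MZAP_NAME_LEN or
+ * more characters, or when mze_canfit_fzap_leaf() reports the collision
+ * chain would overflow a fatzap leaf.
+ */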
+
+int
+zap_add(objset_t *os, uint64_t zapobj, const char *key,
+ int integer_size, uint64_t num_integers,
+ const void *val, dmu_tx_t *tx)
+{
+ zap_t *zap;
+ int err;
+
+ err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
+ if (err != 0)
+ return (err);
+ err = zap_add_impl(zap, key, integer_size, num_integers, val, tx, FTAG);
+ /* zap_add_impl() calls zap_unlockdir() */
+ return (err);
+}
+
+int
+zap_add_by_dnode(dnode_t *dn, const char *key,
+ int integer_size, uint64_t num_integers,
+ const void *val, dmu_tx_t *tx)
+{
+ zap_t *zap;
+ int err;
+
+ err = zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
+ if (err != 0)
+ return (err);
+ err = zap_add_impl(zap, key, integer_size, num_integers, val, tx, FTAG);
+ /* zap_add_impl() calls zap_unlockdir() */
+ return (err);
+}
+
+int
+zap_add_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
+ int key_numints, int integer_size, uint64_t num_integers,
+ const void *val, dmu_tx_t *tx)
+{
+ zap_t *zap;
+
+ int err =
+ zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
+ if (err != 0)
+ return (err);
+ zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
+ if (zn == NULL) {
+ zap_unlockdir(zap, FTAG);
+ return (SET_ERROR(ENOTSUP));
+ }
+ err = fzap_add(zn, integer_size, num_integers, val, FTAG, tx);
+ zap = zn->zn_zap; /* fzap_add() may change zap */
+ zap_name_free(zn);
+ if (zap != NULL) /* may be NULL if fzap_add() failed */
+ zap_unlockdir(zap, FTAG);
+ return (err);
+}
+
+int
+zap_update(objset_t *os, uint64_t zapobj, const char *name,
+ int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx)
+{
+ zap_t *zap;
+ const uint64_t *intval = val;
+
+ int err =
+ zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
+ if (err != 0)
+ return (err);
+ zap_name_t *zn = zap_name_alloc(zap, name, 0);
+ if (zn == NULL) {
+ zap_unlockdir(zap, FTAG);
+ return (SET_ERROR(ENOTSUP));
+ }
+ if (!zap->zap_ismicro) {
+ err = fzap_update(zn, integer_size, num_integers, val,
+ FTAG, tx);
+ zap = zn->zn_zap; /* fzap_update() may change zap */
+ } else if (integer_size != 8 || num_integers != 1 ||
+ strlen(name) >= MZAP_NAME_LEN) {
+ dprintf("upgrading obj %llu: intsz=%u numint=%llu name=%s\n",
+ zapobj, integer_size, num_integers, name);
+ err = mzap_upgrade(&zn->zn_zap, FTAG, tx, 0);
+ if (err == 0) {
+ err = fzap_update(zn, integer_size, num_integers,
+ val, FTAG, tx);
+ }
+ zap = zn->zn_zap; /* fzap_update() may change zap */
+ } else {
+ mzap_ent_t *mze = mze_find(zn);
+ if (mze != NULL) {
+ MZE_PHYS(zap, mze)->mze_value = *intval;
+ } else {
+ mzap_addent(zn, *intval);
+ }
+ }
+ ASSERT(zap == zn->zn_zap);
+ zap_name_free(zn);
+ if (zap != NULL) /* may be NULL if fzap_upgrade() failed */
+ zap_unlockdir(zap, FTAG);
+ return (err);
+}
+
+int
+zap_update_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
+ int key_numints,
+ int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx)
+{
+ zap_t *zap;
+
+ int err =
+ zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
+ if (err != 0)
+ return (err);
+ zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
+ if (zn == NULL) {
+ zap_unlockdir(zap, FTAG);
+ return (SET_ERROR(ENOTSUP));
+ }
+ err = fzap_update(zn, integer_size, num_integers, val, FTAG, tx);
+ zap = zn->zn_zap; /* fzap_update() may change zap */
+ zap_name_free(zn);
+ if (zap != NULL) /* may be NULL if fzap_upgrade() failed */
+ zap_unlockdir(zap, FTAG);
+ return (err);
+}
+
+int
+zap_remove(objset_t *os, uint64_t zapobj, const char *name, dmu_tx_t *tx)
+{
+ return (zap_remove_norm(os, zapobj, name, 0, tx));
+}
+
+static int
+zap_remove_impl(zap_t *zap, const char *name,
+ matchtype_t mt, dmu_tx_t *tx)
+{
+ int err = 0;
+
+ zap_name_t *zn = zap_name_alloc(zap, name, mt);
+ if (zn == NULL)
+ return (SET_ERROR(ENOTSUP));
+ if (!zap->zap_ismicro) {
+ err = fzap_remove(zn, tx);
+ } else {
+ mzap_ent_t *mze = mze_find(zn);
+ if (mze == NULL) {
+ err = SET_ERROR(ENOENT);
+ } else {
+ zap->zap_m.zap_num_entries--;
+ bzero(&zap_m_phys(zap)->mz_chunk[mze->mze_chunkid],
+ sizeof (mzap_ent_phys_t));
+ mze_remove(zap, mze);
+ }
+ }
+ zap_name_free(zn);
+ return (err);
+}
+
+int
+zap_remove_norm(objset_t *os, uint64_t zapobj, const char *name,
+ matchtype_t mt, dmu_tx_t *tx)
+{
+ zap_t *zap;
+ int err;
+
+ err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap);
+ if (err)
+ return (err);
+ err = zap_remove_impl(zap, name, mt, tx);
+ zap_unlockdir(zap, FTAG);
+ return (err);
+}
+
+int
+zap_remove_by_dnode(dnode_t *dn, const char *name, dmu_tx_t *tx)
+{
+ zap_t *zap;
+ int err;
+
+ err = zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap);
+ if (err)
+ return (err);
+ err = zap_remove_impl(zap, name, 0, tx);
+ zap_unlockdir(zap, FTAG);
+ return (err);
+}
+
+int
+zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
+ int key_numints, dmu_tx_t *tx)
+{
+ zap_t *zap;
+
+ int err =
+ zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap);
+ if (err != 0)
+ return (err);
+ zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
+ if (zn == NULL) {
+ zap_unlockdir(zap, FTAG);
+ return (SET_ERROR(ENOTSUP));
+ }
+ err = fzap_remove(zn, tx);
+ zap_name_free(zn);
+ zap_unlockdir(zap, FTAG);
+ return (err);
+}
+
+/*
+ * Routines for iterating over the attributes.
+ */
+
+static void
+zap_cursor_init_impl(zap_cursor_t *zc, objset_t *os, uint64_t zapobj,
+ uint64_t serialized, boolean_t prefetch)
+{
+ zc->zc_objset = os;
+ zc->zc_zap = NULL;
+ zc->zc_leaf = NULL;
+ zc->zc_zapobj = zapobj;
+ zc->zc_serialized = serialized;
+ zc->zc_hash = 0;
+ zc->zc_cd = 0;
+ zc->zc_prefetch = prefetch;
+}
+void
+zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *os, uint64_t zapobj,
+ uint64_t serialized)
+{
+ zap_cursor_init_impl(zc, os, zapobj, serialized, B_TRUE);
+}
+
+/*
+ * Initialize a cursor at the beginning of the ZAP object. The entire
+ * ZAP object will be prefetched.
+ */
+void
+zap_cursor_init(zap_cursor_t *zc, objset_t *os, uint64_t zapobj)
+{
+ zap_cursor_init_impl(zc, os, zapobj, 0, B_TRUE);
+}
+
+/*
+ * Initialize a cursor at the beginning, but request that we not prefetch
+ * the entire ZAP object.
+ */
+void
+zap_cursor_init_noprefetch(zap_cursor_t *zc, objset_t *os, uint64_t zapobj)
+{
+ zap_cursor_init_impl(zc, os, zapobj, 0, B_FALSE);
+}
+
+void
+zap_cursor_fini(zap_cursor_t *zc)
+{
+ if (zc->zc_zap) {
+ rw_enter(&zc->zc_zap->zap_rwlock, RW_READER);
+ zap_unlockdir(zc->zc_zap, NULL);
+ zc->zc_zap = NULL;
+ }
+ if (zc->zc_leaf) {
+ rw_enter(&zc->zc_leaf->l_rwlock, RW_READER);
+ zap_put_leaf(zc->zc_leaf);
+ zc->zc_leaf = NULL;
+ }
+ zc->zc_objset = NULL;
+}
+
+uint64_t
+zap_cursor_serialize(zap_cursor_t *zc)
+{
+ if (zc->zc_hash == -1ULL)
+ return (-1ULL);
+ if (zc->zc_zap == NULL)
+ return (zc->zc_serialized);
+ ASSERT((zc->zc_hash & zap_maxcd(zc->zc_zap)) == 0);
+ ASSERT(zc->zc_cd < zap_maxcd(zc->zc_zap));
+
+ /*
+ * We want to keep the high 32 bits of the cursor zero if we can, so
+ * that 32-bit programs can access this. So usually use a small
+ * (28-bit) hash value so we can fit 4 bits of cd into the low 32-bits
+ * of the cursor.
+ *
+ * [ collision differentiator | zap_hashbits()-bit hash value ]
+ */
+ return ((zc->zc_hash >> (64 - zap_hashbits(zc->zc_zap))) |
+ ((uint64_t)zc->zc_cd << zap_hashbits(zc->zc_zap)));
+}
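+
+/*
+ * Worked example of the layout above for a non-HASH64 zap
+ * (zap_hashbits() == 28): a cursor with zc_hash == 0xABCDEF0ULL << 36 and
+ * zc_cd == 3 serializes to 0xABCDEF0 | (3 << 28) == 0x3ABCDEF0, which fits
+ * in 32 bits.  zap_cursor_retrieve() below reverses this: the << (64 - 28)
+ * shift drops the cd bits off the top, restoring zc_hash, and
+ * serialized >> 28 restores the cd of 3.
+ */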
+
+int
+zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za)
+{
+ int err;
+
+ if (zc->zc_hash == -1ULL)
+ return (SET_ERROR(ENOENT));
+
+ if (zc->zc_zap == NULL) {
+ int hb;
+ err = zap_lockdir(zc->zc_objset, zc->zc_zapobj, NULL,
+ RW_READER, TRUE, FALSE, NULL, &zc->zc_zap);
+ if (err != 0)
+ return (err);
+
+ /*
+ * To support zap_cursor_init_serialized, advance, retrieve,
+ * we must add to the existing zc_cd, which may already
+ * be 1 due to the zap_cursor_advance.
+ */
+ ASSERT(zc->zc_hash == 0);
+ hb = zap_hashbits(zc->zc_zap);
+ zc->zc_hash = zc->zc_serialized << (64 - hb);
+ zc->zc_cd += zc->zc_serialized >> hb;
+ if (zc->zc_cd >= zap_maxcd(zc->zc_zap)) /* corrupt serialized */
+ zc->zc_cd = 0;
+ } else {
+ rw_enter(&zc->zc_zap->zap_rwlock, RW_READER);
+ }
+ if (!zc->zc_zap->zap_ismicro) {
+ err = fzap_cursor_retrieve(zc->zc_zap, zc, za);
+ } else {
+ avl_index_t idx;
+ mzap_ent_t mze_tofind;
+
+ mze_tofind.mze_hash = zc->zc_hash;
+ mze_tofind.mze_cd = zc->zc_cd;
+
+ mzap_ent_t *mze =
+ avl_find(&zc->zc_zap->zap_m.zap_avl, &mze_tofind, &idx);
+ if (mze == NULL) {
+ mze = avl_nearest(&zc->zc_zap->zap_m.zap_avl,
+ idx, AVL_AFTER);
+ }
+ if (mze) {
+ mzap_ent_phys_t *mzep = MZE_PHYS(zc->zc_zap, mze);
+ ASSERT3U(mze->mze_cd, ==, mzep->mze_cd);
+ za->za_normalization_conflict =
+ mzap_normalization_conflict(zc->zc_zap, NULL, mze);
+ za->za_integer_length = 8;
+ za->za_num_integers = 1;
+ za->za_first_integer = mzep->mze_value;
+ (void) strlcpy(za->za_name, mzep->mze_name,
+ sizeof (za->za_name));
+ zc->zc_hash = mze->mze_hash;
+ zc->zc_cd = mze->mze_cd;
+ err = 0;
+ } else {
+ zc->zc_hash = -1ULL;
+ err = SET_ERROR(ENOENT);
+ }
+ }
+ rw_exit(&zc->zc_zap->zap_rwlock);
+ return (err);
+}
+
+void
+zap_cursor_advance(zap_cursor_t *zc)
+{
+ if (zc->zc_hash == -1ULL)
+ return;
+ zc->zc_cd++;
+}
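+
+/*
+ * Canonical iteration pattern built from the cursor routines above (a
+ * sketch; the loop ends when zap_cursor_retrieve() returns ENOENT):
+ *
+ *     zap_cursor_t zc;
+ *     zap_attribute_t za;
+ *
+ *     for (zap_cursor_init(&zc, os, zapobj);
+ *         zap_cursor_retrieve(&zc, &za) == 0;
+ *         zap_cursor_advance(&zc)) {
+ *             ... consume za.za_name / za.za_first_integer ...
+ *     }
+ *     zap_cursor_fini(&zc);
+ */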
+
+int
+zap_get_stats(objset_t *os, uint64_t zapobj, zap_stats_t *zs)
+{
+ zap_t *zap;
+
+ int err =
+ zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
+ if (err != 0)
+ return (err);
+
+ bzero(zs, sizeof (zap_stats_t));
+
+ if (zap->zap_ismicro) {
+ zs->zs_blocksize = zap->zap_dbuf->db_size;
+ zs->zs_num_entries = zap->zap_m.zap_num_entries;
+ zs->zs_num_blocks = 1;
+ } else {
+ fzap_get_stats(zap, zs);
+ }
+ zap_unlockdir(zap, FTAG);
+ return (0);
+}
+
+#if defined(_KERNEL)
+EXPORT_SYMBOL(zap_create);
+EXPORT_SYMBOL(zap_create_dnsize);
+EXPORT_SYMBOL(zap_create_norm);
+EXPORT_SYMBOL(zap_create_norm_dnsize);
+EXPORT_SYMBOL(zap_create_flags);
+EXPORT_SYMBOL(zap_create_flags_dnsize);
+EXPORT_SYMBOL(zap_create_claim);
+EXPORT_SYMBOL(zap_create_claim_norm);
+EXPORT_SYMBOL(zap_create_claim_norm_dnsize);
+EXPORT_SYMBOL(zap_create_hold);
+EXPORT_SYMBOL(zap_destroy);
+EXPORT_SYMBOL(zap_lookup);
+EXPORT_SYMBOL(zap_lookup_by_dnode);
+EXPORT_SYMBOL(zap_lookup_norm);
+EXPORT_SYMBOL(zap_lookup_uint64);
+EXPORT_SYMBOL(zap_contains);
+EXPORT_SYMBOL(zap_prefetch);
+EXPORT_SYMBOL(zap_prefetch_uint64);
+EXPORT_SYMBOL(zap_add);
+EXPORT_SYMBOL(zap_add_by_dnode);
+EXPORT_SYMBOL(zap_add_uint64);
+EXPORT_SYMBOL(zap_update);
+EXPORT_SYMBOL(zap_update_uint64);
+EXPORT_SYMBOL(zap_length);
+EXPORT_SYMBOL(zap_length_uint64);
+EXPORT_SYMBOL(zap_remove);
+EXPORT_SYMBOL(zap_remove_by_dnode);
+EXPORT_SYMBOL(zap_remove_norm);
+EXPORT_SYMBOL(zap_remove_uint64);
+EXPORT_SYMBOL(zap_count);
+EXPORT_SYMBOL(zap_value_search);
+EXPORT_SYMBOL(zap_join);
+EXPORT_SYMBOL(zap_join_increment);
+EXPORT_SYMBOL(zap_add_int);
+EXPORT_SYMBOL(zap_remove_int);
+EXPORT_SYMBOL(zap_lookup_int);
+EXPORT_SYMBOL(zap_increment_int);
+EXPORT_SYMBOL(zap_add_int_key);
+EXPORT_SYMBOL(zap_lookup_int_key);
+EXPORT_SYMBOL(zap_increment);
+EXPORT_SYMBOL(zap_cursor_init);
+EXPORT_SYMBOL(zap_cursor_fini);
+EXPORT_SYMBOL(zap_cursor_retrieve);
+EXPORT_SYMBOL(zap_cursor_advance);
+EXPORT_SYMBOL(zap_cursor_serialize);
+EXPORT_SYMBOL(zap_cursor_init_serialized);
+EXPORT_SYMBOL(zap_get_stats);
+#endif
diff --git a/sys/contrib/openzfs/module/zfs/zcp.c b/sys/contrib/openzfs/module/zfs/zcp.c
new file mode 100644
index 000000000000..1ad53eae1eef
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/zcp.c
@@ -0,0 +1,1451 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2016, 2018 by Delphix. All rights reserved.
+ */
+
+/*
+ * ZFS Channel Programs (ZCP)
+ *
+ * The ZCP interface allows various ZFS administrative operations (e.g.
+ * creating and destroying snapshots, typically performed via an ioctl to
+ * /dev/zfs by the zfs(8) command and libzfs/libzfs_core) to be run
+ * programmatically as a Lua script. A ZCP
+ * script is run as a dsl_sync_task and fully executed during one transaction
+ * group sync. This ensures that no other changes can be written concurrently
+ * with a running Lua script. Combining multiple calls to the exposed ZFS
+ * functions into one script gives a number of benefits:
+ *
+ * 1. Atomicity. For some compound or iterative operations, it's useful to be
+ * able to guarantee that the state of a pool has not changed between calls to
+ * ZFS.
+ *
+ * 2. Performance. If a large number of changes need to be made (e.g. deleting
+ * many filesystems), there can be a significant performance penalty as a
+ * result of the need to wait for a transaction group sync to pass for every
+ * single operation. When expressed as a single ZCP script, all these changes
+ * can be performed at once in one txg sync.
+ *
+ * A modified version of the Lua 5.2 interpreter is used to run channel program
+ * scripts. The Lua 5.2 manual can be found at:
+ *
+ * http://www.lua.org/manual/5.2/
+ *
+ * If being run by a user (via an ioctl syscall), executing a ZCP script
+ * requires root privileges in the global zone.
+ *
+ * Scripts are passed to zcp_eval() as a string, then run in a synctask by
+ * zcp_eval_sync(). Arguments can be passed into the Lua script as an nvlist,
+ * which will be converted to a Lua table. Similarly, values returned from
+ * a ZCP script will be converted to an nvlist. See zcp_lua_to_nvlist_impl()
+ * for details on exact allowed types and conversion.
+ *
+ * ZFS functionality is exposed to a ZCP script as a library of function calls.
+ * These calls are sorted into submodules, such as zfs.list and zfs.sync, for
+ * iterators and synctasks, respectively. Each of these submodules resides in
+ * its own source file, with a zcp_*_info structure describing each library
+ * call in the submodule.
+ *
+ * Error handling in ZCP scripts is handled by a number of different methods
+ * based on severity:
+ *
+ * 1. Memory and time limits are in place to prevent a channel program from
+ * consuming excessive system resources or running forever. If one of these
+ * limits is
+ * hit, the channel program will be stopped immediately and return from
+ * zcp_eval() with an error code. No attempt will be made to roll back or undo
+ * any changes made by the channel program before the error occurred.
+ * Consumers invoking zcp_eval() from elsewhere in the kernel may pass a time
+ * limit of 0, disabling the time limit.
+ *
+ * 2. Internal Lua errors can occur as a result of a syntax error, calling a
+ * library function with incorrect arguments, invoking the error() function,
+ * failing an assert(), or other runtime errors. In these cases the channel
+ * program will stop executing and return from zcp_eval() with an error code.
+ * In place of a return value, an error message will also be returned in the
+ * 'result' nvlist containing information about the error. No attempt will be
+ * made to roll back or undo any changes made by the channel program before the
+ * error occurred.
+ *
+ * 3. If an error occurs inside a ZFS library call which returns an error code,
+ * the error is returned to the Lua script to be handled as desired.
+ *
+ * In the first two cases, Lua's error-throwing mechanism is used, which
+ * longjumps out of the script execution with luaL_error() and returns with the
+ * error.
+ *
+ * See zfs-program(8) for more information on high level usage.
+ */
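+
+/*
+ * Hypothetical illustration of the nvlist <-> Lua argument flow described
+ * above, using the fnvlist helpers that appear later in this file (key names
+ * invented):
+ *
+ *     nvlist_t *argnvl = fnvlist_alloc();
+ *     fnvlist_add_string(argnvl, "target", "pool/fs@snap");
+ *     fnvlist_add_int64(argnvl, "limit", 10);
+ *
+ * zcp_nvlist_to_lua() (below) surfaces this to the script as the Lua table
+ * { target = "pool/fs@snap", limit = 10 }, and zcp_lua_to_nvlist() performs
+ * the reverse conversion for the script's return value.
+ */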
+
+#include <sys/lua/lua.h>
+#include <sys/lua/lualib.h>
+#include <sys/lua/lauxlib.h>
+
+#include <sys/dsl_prop.h>
+#include <sys/dsl_synctask.h>
+#include <sys/dsl_dataset.h>
+#include <sys/zcp.h>
+#include <sys/zcp_iter.h>
+#include <sys/zcp_prop.h>
+#include <sys/zcp_global.h>
+#include <sys/zvol.h>
+
+#ifndef KM_NORMALPRI
+#define KM_NORMALPRI 0
+#endif
+
+#define ZCP_NVLIST_MAX_DEPTH 20
+
+uint64_t zfs_lua_check_instrlimit_interval = 100;
+unsigned long zfs_lua_max_instrlimit = ZCP_MAX_INSTRLIMIT;
+unsigned long zfs_lua_max_memlimit = ZCP_MAX_MEMLIMIT;
+
+/*
+ * Forward declarations for mutually recursive functions
+ */
+static int zcp_nvpair_value_to_lua(lua_State *, nvpair_t *, char *, int);
+static int zcp_lua_to_nvlist_impl(lua_State *, int, nvlist_t *, const char *,
+ int);
+
+/*
+ * The outer-most error callback handler for use with lua_pcall(). On
+ * error Lua will call this callback with a single argument that
+ * represents the error value. In most cases this will be a string
+ * containing an error message, but channel programs can use Lua's
+ * error() function to return arbitrary objects as errors. This callback
+ * returns (on the Lua stack) the original error object along with a traceback.
+ *
+ * Fatal Lua errors can occur while resources are held, so we also call any
+ * registered cleanup function here.
+ */
+static int
+zcp_error_handler(lua_State *state)
+{
+ const char *msg;
+
+ zcp_cleanup(state);
+
+ VERIFY3U(1, ==, lua_gettop(state));
+ msg = lua_tostring(state, 1);
+ luaL_traceback(state, state, msg, 1);
+ return (1);
+}
+
+int
+zcp_argerror(lua_State *state, int narg, const char *msg, ...)
+{
+ va_list alist;
+
+ va_start(alist, msg);
+ const char *buf = lua_pushvfstring(state, msg, alist);
+ va_end(alist);
+
+ return (luaL_argerror(state, narg, buf));
+}
+
+/*
+ * Install a new cleanup function, which will be invoked with the given
+ * opaque argument if a fatal error causes the Lua interpreter to longjump out
+ * of a function call.
+ *
+ * If an error occurs, the cleanup function will be invoked exactly once and
+ * then unregistered.
+ *
+ * Returns the registered cleanup handler so the caller can deregister it
+ * if no error occurs.
+ */
+zcp_cleanup_handler_t *
+zcp_register_cleanup(lua_State *state, zcp_cleanup_t cleanfunc, void *cleanarg)
+{
+ zcp_run_info_t *ri = zcp_run_info(state);
+
+ zcp_cleanup_handler_t *zch = kmem_alloc(sizeof (*zch), KM_SLEEP);
+ zch->zch_cleanup_func = cleanfunc;
+ zch->zch_cleanup_arg = cleanarg;
+ list_insert_head(&ri->zri_cleanup_handlers, zch);
+
+ return (zch);
+}
+
+void
+zcp_deregister_cleanup(lua_State *state, zcp_cleanup_handler_t *zch)
+{
+ zcp_run_info_t *ri = zcp_run_info(state);
+ list_remove(&ri->zri_cleanup_handlers, zch);
+ kmem_free(zch, sizeof (*zch));
+}
+
+/*
+ * Execute the currently registered cleanup handlers then free them and
+ * destroy the handler list.
+ */
+void
+zcp_cleanup(lua_State *state)
+{
+ zcp_run_info_t *ri = zcp_run_info(state);
+
+ for (zcp_cleanup_handler_t *zch =
+ list_remove_head(&ri->zri_cleanup_handlers); zch != NULL;
+ zch = list_remove_head(&ri->zri_cleanup_handlers)) {
+ zch->zch_cleanup_func(zch->zch_cleanup_arg);
+ kmem_free(zch, sizeof (*zch));
+ }
+}
+
+/*
+ * Convert the lua table at the given index on the Lua stack to an nvlist
+ * and return it.
+ *
+ * If the table can not be converted for any reason, NULL is returned and
+ * an error message is pushed onto the Lua stack.
+ */
+static nvlist_t *
+zcp_table_to_nvlist(lua_State *state, int index, int depth)
+{
+ nvlist_t *nvl;
+ /*
+ * Converting a Lua table to an nvlist with key uniqueness checking is
+ * O(n^2) in the number of keys in the nvlist, which can take a long
+ * time when we return a large table from a channel program.
+ * Furthermore, Lua's table interface *almost* guarantees unique keys
+ * on its own (details below). Therefore, we don't use fnvlist_alloc()
+ * here to avoid the built-in uniqueness checking.
+ *
+ * The *almost* is because it's possible to have key collisions between
+ * e.g. the string "1" and the number 1, or the string "true" and the
+ * boolean true, so we explicitly check that when we're looking at a
+ * key which is an integer / boolean or a string that can be parsed as
+ * one of those types. In the worst case this could still devolve into
+ * O(n^2), so we only start doing these checks on boolean/integer keys
+ * once we've seen a string key which fits this weird usage pattern.
+ *
+ * Ultimately, we still want callers to know that the keys in this
+ * nvlist are unique, so before we return this we set the nvlist's
+ * flags to reflect that.
+ */
+ VERIFY0(nvlist_alloc(&nvl, 0, KM_SLEEP));
+
+ /*
+ * Push an empty stack slot where lua_next() will store each
+ * table key.
+ */
+ lua_pushnil(state);
+ boolean_t saw_str_could_collide = B_FALSE;
+ while (lua_next(state, index) != 0) {
+ /*
+ * The next key-value pair from the table at index is
+ * now on the stack, with the key at stack slot -2 and
+ * the value at slot -1.
+ */
+ int err = 0;
+ char buf[32];
+ const char *key = NULL;
+ boolean_t key_could_collide = B_FALSE;
+
+ switch (lua_type(state, -2)) {
+ case LUA_TSTRING:
+ key = lua_tostring(state, -2);
+
+ /* check if this could collide with a number or bool */
+ long long tmp;
+ int parselen;
+ if ((sscanf(key, "%lld%n", &tmp, &parselen) > 0 &&
+ parselen == strlen(key)) ||
+ strcmp(key, "true") == 0 ||
+ strcmp(key, "false") == 0) {
+ key_could_collide = B_TRUE;
+ saw_str_could_collide = B_TRUE;
+ }
+ break;
+ case LUA_TBOOLEAN:
+ key = (lua_toboolean(state, -2) == B_TRUE ?
+ "true" : "false");
+ if (saw_str_could_collide) {
+ key_could_collide = B_TRUE;
+ }
+ break;
+ case LUA_TNUMBER:
+ VERIFY3U(sizeof (buf), >,
+ snprintf(buf, sizeof (buf), "%lld",
+ (longlong_t)lua_tonumber(state, -2)));
+ key = buf;
+ if (saw_str_could_collide) {
+ key_could_collide = B_TRUE;
+ }
+ break;
+ default:
+ fnvlist_free(nvl);
+ (void) lua_pushfstring(state, "Invalid key "
+ "type '%s' in table",
+ lua_typename(state, lua_type(state, -2)));
+ return (NULL);
+ }
+ /*
+ * Check for type-mismatched key collisions, and throw an error.
+ */
+ if (key_could_collide && nvlist_exists(nvl, key)) {
+ fnvlist_free(nvl);
+ (void) lua_pushfstring(state, "Collision of "
+ "key '%s' in table", key);
+ return (NULL);
+ }
+ /*
+ * Recursively convert the table value and insert into
+ * the new nvlist with the parsed key. To prevent
+ * stack overflow on circular or heavily nested tables,
+ * we track the current nvlist depth.
+ */
+ if (depth >= ZCP_NVLIST_MAX_DEPTH) {
+ fnvlist_free(nvl);
+ (void) lua_pushfstring(state, "Maximum table "
+ "depth (%d) exceeded for table",
+ ZCP_NVLIST_MAX_DEPTH);
+ return (NULL);
+ }
+ err = zcp_lua_to_nvlist_impl(state, -1, nvl, key,
+ depth + 1);
+ if (err != 0) {
+ fnvlist_free(nvl);
+ /*
+ * Error message has been pushed to the lua
+ * stack by the recursive call.
+ */
+ return (NULL);
+ }
+ /*
+ * Pop the value pushed by lua_next().
+ */
+ lua_pop(state, 1);
+ }
+
+ /*
+ * Mark the nvlist as having unique keys. This is a little ugly, but we
+ * ensured above that there are no duplicate keys in the nvlist.
+ */
+ nvl->nvl_nvflag |= NV_UNIQUE_NAME;
+
+ return (nvl);
+}
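+
+/*
+ * For illustration of the collision handling above: a table containing both
+ * [1] = "a" and ["1"] = "b" produces two Lua keys that stringify to "1";
+ * whichever is seen second trips the nvlist_exists() check and the table is
+ * rejected with the "Collision of key" error rather than silently storing
+ * duplicate names.
+ */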
+
+/*
+ * Convert a value from the given index into the lua stack to an nvpair, adding
+ * it to an nvlist with the given key.
+ *
+ * Values are converted as follows:
+ *
+ * string -> string
+ * number -> int64
+ * boolean -> boolean
+ * nil -> boolean (no value)
+ *
+ * Lua tables are converted to nvlists and then inserted. The table's keys
+ * are converted to strings then used as keys in the nvlist to store each table
+ * element. Keys are converted as follows:
+ *
+ * string -> no change
+ * number -> "%lld"
+ * boolean -> "true" | "false"
+ * nil -> error
+ *
+ * In the case of a key collision, an error is thrown.
+ *
+ * If an error is encountered, a nonzero error code is returned, and an error
+ * string will be pushed onto the Lua stack.
+ */
+static int
+zcp_lua_to_nvlist_impl(lua_State *state, int index, nvlist_t *nvl,
+ const char *key, int depth)
+{
+ /*
+ * Verify that we have enough remaining space in the lua stack to parse
+ * a key-value pair and push an error.
+ */
+ if (!lua_checkstack(state, 3)) {
+ (void) lua_pushstring(state, "Lua stack overflow");
+ return (1);
+ }
+
+ index = lua_absindex(state, index);
+
+ switch (lua_type(state, index)) {
+ case LUA_TNIL:
+ fnvlist_add_boolean(nvl, key);
+ break;
+ case LUA_TBOOLEAN:
+ fnvlist_add_boolean_value(nvl, key,
+ lua_toboolean(state, index));
+ break;
+ case LUA_TNUMBER:
+ fnvlist_add_int64(nvl, key, lua_tonumber(state, index));
+ break;
+ case LUA_TSTRING:
+ fnvlist_add_string(nvl, key, lua_tostring(state, index));
+ break;
+ case LUA_TTABLE: {
+ nvlist_t *value_nvl = zcp_table_to_nvlist(state, index, depth);
+ if (value_nvl == NULL)
+ return (SET_ERROR(EINVAL));
+
+ fnvlist_add_nvlist(nvl, key, value_nvl);
+ fnvlist_free(value_nvl);
+ break;
+ }
+ default:
+ (void) lua_pushfstring(state,
+ "Invalid value type '%s' for key '%s'",
+ lua_typename(state, lua_type(state, index)), key);
+ return (SET_ERROR(EINVAL));
+ }
+
+ return (0);
+}
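+
+/*
+ * Example of the mapping above (hypothetical values): converting the Lua
+ * table {pool = "tank", count = 3, ok = true} yields an nvlist holding the
+ * string "pool" = "tank", the int64 "count" = 3 and the boolean_value
+ * "ok" = true.  A nil value becomes a valueless boolean nvpair, which is
+ * also the form zcp_nvlist_to_lua() treats specially below.
+ */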
+
+/*
+ * Convert a lua value to an nvpair, adding it to an nvlist with the given key.
+ */
+static void
+zcp_lua_to_nvlist(lua_State *state, int index, nvlist_t *nvl, const char *key)
+{
+ /*
+ * On error, zcp_lua_to_nvlist_impl pushes an error string onto the Lua
+ * stack before returning with a nonzero error code. If an error is
+ * returned, throw a fatal lua error with the given string.
+ */
+ if (zcp_lua_to_nvlist_impl(state, index, nvl, key, 0) != 0)
+ (void) lua_error(state);
+}
+
+static int
+zcp_lua_to_nvlist_helper(lua_State *state)
+{
+ nvlist_t *nv = (nvlist_t *)lua_touserdata(state, 2);
+ const char *key = (const char *)lua_touserdata(state, 1);
+ zcp_lua_to_nvlist(state, 3, nv, key);
+ return (0);
+}
+
+static void
+zcp_convert_return_values(lua_State *state, nvlist_t *nvl,
+ const char *key, int *result)
+{
+ int err;
+ VERIFY3U(1, ==, lua_gettop(state));
+ lua_pushcfunction(state, zcp_lua_to_nvlist_helper);
+ lua_pushlightuserdata(state, (char *)key);
+ lua_pushlightuserdata(state, nvl);
+ lua_pushvalue(state, 1);
+ lua_remove(state, 1);
+ err = lua_pcall(state, 3, 0, 0); /* zcp_lua_to_nvlist_helper */
+ if (err != 0) {
+ zcp_lua_to_nvlist(state, 1, nvl, ZCP_RET_ERROR);
+ *result = SET_ERROR(ECHRNG);
+ }
+}
+
+/*
+ * Push a Lua table representing nvl onto the stack. If it can't be
+ * converted, return EINVAL, fill in errbuf, and push nothing. errbuf may
+ * be specified as NULL, in which case no error string will be output.
+ *
+ * Most nvlists are converted as simple key->value Lua tables, but we make
+ * an exception for the case where all nvlist entries are BOOLEANs (a string
+ * key without a value). In Lua, a table key pointing to a value of Nil
+ * (no value) is equivalent to the key not existing, so a BOOLEAN nvlist
+ * entry can't be directly converted to a Lua table entry. Nvlists of entirely
+ * BOOLEAN entries are frequently used to pass around lists of datasets, so for
+ * convenience we check for this case, and convert it to a simple Lua array of
+ * strings.
+ */
+int
+zcp_nvlist_to_lua(lua_State *state, nvlist_t *nvl,
+ char *errbuf, int errbuf_len)
+{
+ nvpair_t *pair;
+ lua_newtable(state);
+ boolean_t has_values = B_FALSE;
+ /*
+ * If the list doesn't have any values, just convert it to a string
+ * array.
+ */
+ for (pair = nvlist_next_nvpair(nvl, NULL);
+ pair != NULL; pair = nvlist_next_nvpair(nvl, pair)) {
+ if (nvpair_type(pair) != DATA_TYPE_BOOLEAN) {
+ has_values = B_TRUE;
+ break;
+ }
+ }
+ if (!has_values) {
+ int i = 1;
+ for (pair = nvlist_next_nvpair(nvl, NULL);
+ pair != NULL; pair = nvlist_next_nvpair(nvl, pair)) {
+ (void) lua_pushinteger(state, i);
+ (void) lua_pushstring(state, nvpair_name(pair));
+ (void) lua_settable(state, -3);
+ i++;
+ }
+ } else {
+ for (pair = nvlist_next_nvpair(nvl, NULL);
+ pair != NULL; pair = nvlist_next_nvpair(nvl, pair)) {
+ int err = zcp_nvpair_value_to_lua(state, pair,
+ errbuf, errbuf_len);
+ if (err != 0) {
+ lua_pop(state, 1);
+ return (err);
+ }
+ (void) lua_setfield(state, -2, nvpair_name(pair));
+ }
+ }
+ return (0);
+}
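+
+/*
+ * For illustration of the special case above: an nvlist whose only entries
+ * are the valueless booleans "rpool/a" and "rpool/b" (a typical dataset
+ * list) is pushed as the Lua array { "rpool/a", "rpool/b" }; adding even one
+ * valued pair, say int64 "count" = 2, switches the conversion back to a
+ * key -> value table.
+ */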
+
+/*
+ * Push a Lua object representing the value of "pair" onto the stack.
+ *
+ * Only understands boolean_value, string, int64, nvlist,
+ * string_array, and int64_array type values. For other
+ * types, returns EINVAL, fills in errbuf, and pushes nothing.
+ */
+static int
+zcp_nvpair_value_to_lua(lua_State *state, nvpair_t *pair,
+ char *errbuf, int errbuf_len)
+{
+ int err = 0;
+
+ if (pair == NULL) {
+ lua_pushnil(state);
+ return (0);
+ }
+
+ switch (nvpair_type(pair)) {
+ case DATA_TYPE_BOOLEAN_VALUE:
+ (void) lua_pushboolean(state,
+ fnvpair_value_boolean_value(pair));
+ break;
+ case DATA_TYPE_STRING:
+ (void) lua_pushstring(state, fnvpair_value_string(pair));
+ break;
+ case DATA_TYPE_INT64:
+ (void) lua_pushinteger(state, fnvpair_value_int64(pair));
+ break;
+ case DATA_TYPE_NVLIST:
+ err = zcp_nvlist_to_lua(state,
+ fnvpair_value_nvlist(pair), errbuf, errbuf_len);
+ break;
+ case DATA_TYPE_STRING_ARRAY: {
+ char **strarr;
+ uint_t nelem;
+ (void) nvpair_value_string_array(pair, &strarr, &nelem);
+ lua_newtable(state);
+ for (int i = 0; i < nelem; i++) {
+ (void) lua_pushinteger(state, i + 1);
+ (void) lua_pushstring(state, strarr[i]);
+ (void) lua_settable(state, -3);
+ }
+ break;
+ }
+ case DATA_TYPE_UINT64_ARRAY: {
+ uint64_t *intarr;
+ uint_t nelem;
+ (void) nvpair_value_uint64_array(pair, &intarr, &nelem);
+ lua_newtable(state);
+ for (int i = 0; i < nelem; i++) {
+ (void) lua_pushinteger(state, i + 1);
+ (void) lua_pushinteger(state, intarr[i]);
+ (void) lua_settable(state, -3);
+ }
+ break;
+ }
+ case DATA_TYPE_INT64_ARRAY: {
+ int64_t *intarr;
+ uint_t nelem;
+ (void) nvpair_value_int64_array(pair, &intarr, &nelem);
+ lua_newtable(state);
+ for (int i = 0; i < nelem; i++) {
+ (void) lua_pushinteger(state, i + 1);
+ (void) lua_pushinteger(state, intarr[i]);
+ (void) lua_settable(state, -3);
+ }
+ break;
+ }
+ default: {
+ if (errbuf != NULL) {
+ (void) snprintf(errbuf, errbuf_len,
+ "Unhandled nvpair type %d for key '%s'",
+ nvpair_type(pair), nvpair_name(pair));
+ }
+ return (SET_ERROR(EINVAL));
+ }
+ }
+ return (err);
+}
+
+int
+zcp_dataset_hold_error(lua_State *state, dsl_pool_t *dp, const char *dsname,
+ int error)
+{
+ if (error == ENOENT) {
+ (void) zcp_argerror(state, 1, "no such dataset '%s'", dsname);
+ return (0); /* not reached; zcp_argerror will longjmp */
+ } else if (error == EXDEV) {
+ (void) zcp_argerror(state, 1,
+ "dataset '%s' is not in the target pool '%s'",
+ dsname, spa_name(dp->dp_spa));
+ return (0); /* not reached; zcp_argerror will longjmp */
+ } else if (error == EIO) {
+ (void) luaL_error(state,
+ "I/O error while accessing dataset '%s'", dsname);
+ return (0); /* not reached; luaL_error will longjmp */
+ } else if (error != 0) {
+ (void) luaL_error(state,
+ "unexpected error %d while accessing dataset '%s'",
+ error, dsname);
+ return (0); /* not reached; luaL_error will longjmp */
+ }
+ return (0);
+}
+
+/*
+ * Note: will longjmp (via lua_error()) on error.
+ * Assumes that the dsname is argument #1 (for error reporting purposes).
+ */
+dsl_dataset_t *
+zcp_dataset_hold(lua_State *state, dsl_pool_t *dp, const char *dsname,
+ void *tag)
+{
+ dsl_dataset_t *ds;
+ int error = dsl_dataset_hold(dp, dsname, tag, &ds);
+ (void) zcp_dataset_hold_error(state, dp, dsname, error);
+ return (ds);
+}
+
+static int zcp_debug(lua_State *);
+static zcp_lib_info_t zcp_debug_info = {
+ .name = "debug",
+ .func = zcp_debug,
+ .pargs = {
+ { .za_name = "debug string", .za_lua_type = LUA_TSTRING},
+ {NULL, 0}
+ },
+ .kwargs = {
+ {NULL, 0}
+ }
+};
+
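+/*
+ * Implementation of the Lua zfs.debug(string) call: write the given message
+ * to the kernel ZFS debug log (zfs_dbgmsg), tagged with the txg of the
+ * currently executing channel program. For example, a script might call
+ * zfs.debug("destroying " .. snap) to leave a trace in the debug log.
+ */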
+static int
+zcp_debug(lua_State *state)
+{
+ const char *dbgstring;
+ zcp_run_info_t *ri = zcp_run_info(state);
+ zcp_lib_info_t *libinfo = &zcp_debug_info;
+
+ zcp_parse_args(state, libinfo->name, libinfo->pargs, libinfo->kwargs);
+
+ dbgstring = lua_tostring(state, 1);
+
+ zfs_dbgmsg("txg %lld ZCP: %s", ri->zri_tx->tx_txg, dbgstring);
+
+ return (0);
+}
+
+static int zcp_exists(lua_State *);
+static zcp_lib_info_t zcp_exists_info = {
+ .name = "exists",
+ .func = zcp_exists,
+ .pargs = {
+ { .za_name = "dataset", .za_lua_type = LUA_TSTRING},
+ {NULL, 0}
+ },
+ .kwargs = {
+ {NULL, 0}
+ }
+};
+
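+/*
+ * Implementation of the Lua zfs.exists(dataset) call: returns true if the
+ * named dataset exists in the pool the program is running against, false if
+ * it does not, and raises a Lua error on any other failure, e.g.:
+ *
+ * if zfs.exists("rpool/home") then ... end
+ */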
+static int
+zcp_exists(lua_State *state)
+{
+ zcp_run_info_t *ri = zcp_run_info(state);
+ dsl_pool_t *dp = ri->zri_pool;
+ zcp_lib_info_t *libinfo = &zcp_exists_info;
+
+ zcp_parse_args(state, libinfo->name, libinfo->pargs, libinfo->kwargs);
+
+ const char *dsname = lua_tostring(state, 1);
+
+ dsl_dataset_t *ds;
+ int error = dsl_dataset_hold(dp, dsname, FTAG, &ds);
+ if (error == 0) {
+ dsl_dataset_rele(ds, FTAG);
+ lua_pushboolean(state, B_TRUE);
+ } else if (error == ENOENT) {
+ lua_pushboolean(state, B_FALSE);
+ } else if (error == EXDEV) {
+ return (luaL_error(state, "dataset '%s' is not in the "
+ "target pool", dsname));
+ } else if (error == EIO) {
+ return (luaL_error(state, "I/O error opening dataset '%s'",
+ dsname));
+ } else if (error != 0) {
+ return (luaL_error(state, "unexpected error %d", error));
+ }
+
+ return (1);
+}
+
+/*
+ * Allocate/realloc/free a buffer for the lua interpreter.
+ *
+ * When nsize is 0, behaves as free() and returns NULL.
+ *
+ * If ptr is NULL, behaves as malloc() and returns an allocated buffer of size
+ * at least nsize.
+ *
+ * Otherwise, behaves as realloc(), changing the allocation from osize to nsize.
+ * Shrinking the buffer size never fails.
+ *
+ * The original allocated buffer size is stored as an int64 at the beginning of
+ * the buffer to avoid actually reallocating when shrinking a buffer, since lua
+ * requires that this operation never fail.
+ */
+static void *
+zcp_lua_alloc(void *ud, void *ptr, size_t osize, size_t nsize)
+{
+ zcp_alloc_arg_t *allocargs = ud;
+
+ if (nsize == 0) {
+ if (ptr != NULL) {
+ int64_t *allocbuf = (int64_t *)ptr - 1;
+ int64_t allocsize = *allocbuf;
+ ASSERT3S(allocsize, >, 0);
+ ASSERT3S(allocargs->aa_alloc_remaining + allocsize, <=,
+ allocargs->aa_alloc_limit);
+ allocargs->aa_alloc_remaining += allocsize;
+ vmem_free(allocbuf, allocsize);
+ }
+ return (NULL);
+ } else if (ptr == NULL) {
+ int64_t *allocbuf;
+ int64_t allocsize = nsize + sizeof (int64_t);
+
+ if (!allocargs->aa_must_succeed &&
+ (allocsize <= 0 ||
+ allocsize > allocargs->aa_alloc_remaining)) {
+ return (NULL);
+ }
+
+ allocbuf = vmem_alloc(allocsize, KM_SLEEP);
+ allocargs->aa_alloc_remaining -= allocsize;
+
+ *allocbuf = allocsize;
+ return (allocbuf + 1);
+ } else if (nsize <= osize) {
+ /*
+ * If shrinking the buffer, lua requires that the reallocation
+ * never fail.
+ */
+ return (ptr);
+ } else {
+ ASSERT3U(nsize, >, osize);
+
+ uint64_t *luabuf = zcp_lua_alloc(ud, NULL, 0, nsize);
+ if (luabuf == NULL) {
+ return (NULL);
+ }
+ (void) memcpy(luabuf, ptr, osize);
+ VERIFY3P(zcp_lua_alloc(ud, ptr, osize, 0), ==, NULL);
+ return (luabuf);
+ }
+}
+
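+/*
+ * Lua debug hook, invoked every zfs_lua_check_instrlimit_interval
+ * instructions: raise a Lua error if the channel program has been canceled
+ * or has exceeded its instruction limit.
+ */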
+/* ARGSUSED */
+static void
+zcp_lua_counthook(lua_State *state, lua_Debug *ar)
+{
+ lua_getfield(state, LUA_REGISTRYINDEX, ZCP_RUN_INFO_KEY);
+ zcp_run_info_t *ri = lua_touserdata(state, -1);
+
+ /*
+	 * Check whether we were canceled while waiting for the txg to sync,
+	 * or whether a signal was delivered to our open-context thread.
+ */
+ if (ri->zri_canceled ||
+ (!ri->zri_sync && issig(JUSTLOOKING) && issig(FORREAL))) {
+ ri->zri_canceled = B_TRUE;
+ (void) lua_pushstring(state, "Channel program was canceled.");
+ (void) lua_error(state);
+ /* Unreachable */
+ }
+
+ /*
+ * Check how many instructions the channel program has
+ * executed so far, and compare against the limit.
+ */
+ ri->zri_curinstrs += zfs_lua_check_instrlimit_interval;
+ if (ri->zri_maxinstrs != 0 && ri->zri_curinstrs > ri->zri_maxinstrs) {
+ ri->zri_timed_out = B_TRUE;
+ (void) lua_pushstring(state,
+ "Channel program timed out.");
+ (void) lua_error(state);
+ /* Unreachable */
+ }
+}
+
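+/*
+ * Lua panic callback. A panic means an error escaped all protected calls,
+ * which should not happen since the program itself runs under lua_pcall();
+ * if it does, crash loudly with the Lua error message.
+ */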
+static int
+zcp_panic_cb(lua_State *state)
+{
+ panic("unprotected error in call to Lua API (%s)\n",
+ lua_tostring(state, -1));
+ return (0);
+}
+
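+/*
+ * Common evaluation path for syncing and open context: install the run info
+ * and the instruction-count hook, invoke the compiled program under
+ * lua_pcall(), and translate its return value or error into zri_result and
+ * zri_outnvl.
+ */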
+static void
+zcp_eval_impl(dmu_tx_t *tx, zcp_run_info_t *ri)
+{
+ int err;
+ lua_State *state = ri->zri_state;
+
+ VERIFY3U(3, ==, lua_gettop(state));
+
+ /* finish initializing our runtime state */
+ ri->zri_pool = dmu_tx_pool(tx);
+ ri->zri_tx = tx;
+ list_create(&ri->zri_cleanup_handlers, sizeof (zcp_cleanup_handler_t),
+ offsetof(zcp_cleanup_handler_t, zch_node));
+
+ /*
+ * Store the zcp_run_info_t struct for this run in the Lua registry.
+ * Registry entries are not directly accessible by the Lua scripts but
+ * can be accessed by our callbacks.
+ */
+ lua_pushlightuserdata(state, ri);
+ lua_setfield(state, LUA_REGISTRYINDEX, ZCP_RUN_INFO_KEY);
+ VERIFY3U(3, ==, lua_gettop(state));
+
+ /*
+	 * Tell the Lua interpreter to call our instruction-count hook every
+	 * zfs_lua_check_instrlimit_interval instructions. Channel programs
+	 * that execute too many instructions should die with ETIME.
+ */
+ (void) lua_sethook(state, zcp_lua_counthook, LUA_MASKCOUNT,
+ zfs_lua_check_instrlimit_interval);
+
+ /*
+ * Tell the Lua memory allocator to stop using KM_SLEEP before handing
+ * off control to the channel program. Channel programs that use too
+ * much memory should die with ENOSPC.
+ */
+ ri->zri_allocargs->aa_must_succeed = B_FALSE;
+
+ /*
+ * Call the Lua function that open-context passed us. This pops the
+ * function and its input from the stack and pushes any return
+ * or error values.
+ */
+ err = lua_pcall(state, 1, LUA_MULTRET, 1);
+
+ /*
+ * Let Lua use KM_SLEEP while we interpret the return values.
+ */
+ ri->zri_allocargs->aa_must_succeed = B_TRUE;
+
+ /*
+ * Remove the error handler callback from the stack. At this point,
+ * there shouldn't be any cleanup handler registered in the handler
+ * list (zri_cleanup_handlers), regardless of whether it ran or not.
+ */
+ list_destroy(&ri->zri_cleanup_handlers);
+ lua_remove(state, 1);
+
+ switch (err) {
+ case LUA_OK: {
+ /*
+ * Lua supports returning multiple values in a single return
+ * statement. Return values will have been pushed onto the
+ * stack:
+ * 1: Return value 1
+ * 2: Return value 2
+ * 3: etc...
+ * To simplify the process of retrieving a return value from a
+ * channel program, we disallow returning more than one value
+ * to ZFS from the Lua script, yielding a singleton return
+ * nvlist of the form { "return": Return value 1 }.
+ */
+ int return_count = lua_gettop(state);
+
+ if (return_count == 1) {
+ ri->zri_result = 0;
+ zcp_convert_return_values(state, ri->zri_outnvl,
+ ZCP_RET_RETURN, &ri->zri_result);
+ } else if (return_count > 1) {
+ ri->zri_result = SET_ERROR(ECHRNG);
+ lua_settop(state, 0);
+ (void) lua_pushfstring(state, "Multiple return "
+ "values not supported");
+ zcp_convert_return_values(state, ri->zri_outnvl,
+ ZCP_RET_ERROR, &ri->zri_result);
+ }
+ break;
+ }
+ case LUA_ERRRUN:
+ case LUA_ERRGCMM: {
+ /*
+ * The channel program encountered a fatal error within the
+ * script, such as failing an assertion, or calling a function
+ * with incompatible arguments. The error value and the
+ * traceback generated by zcp_error_handler() should be on the
+ * stack.
+ */
+ VERIFY3U(1, ==, lua_gettop(state));
+ if (ri->zri_timed_out) {
+ ri->zri_result = SET_ERROR(ETIME);
+ } else if (ri->zri_canceled) {
+ ri->zri_result = SET_ERROR(EINTR);
+ } else {
+ ri->zri_result = SET_ERROR(ECHRNG);
+ }
+
+ zcp_convert_return_values(state, ri->zri_outnvl,
+ ZCP_RET_ERROR, &ri->zri_result);
+
+ if (ri->zri_result == ETIME && ri->zri_outnvl != NULL) {
+ (void) nvlist_add_uint64(ri->zri_outnvl,
+ ZCP_ARG_INSTRLIMIT, ri->zri_curinstrs);
+ }
+ break;
+ }
+ case LUA_ERRERR: {
+ /*
+ * The channel program encountered a fatal error within the
+ * script, and we encountered another error while trying to
+ * compute the traceback in zcp_error_handler(). We can only
+ * return the error message.
+ */
+ VERIFY3U(1, ==, lua_gettop(state));
+ if (ri->zri_timed_out) {
+ ri->zri_result = SET_ERROR(ETIME);
+ } else if (ri->zri_canceled) {
+ ri->zri_result = SET_ERROR(EINTR);
+ } else {
+ ri->zri_result = SET_ERROR(ECHRNG);
+ }
+
+ zcp_convert_return_values(state, ri->zri_outnvl,
+ ZCP_RET_ERROR, &ri->zri_result);
+ break;
+ }
+ case LUA_ERRMEM:
+ /*
+ * Lua ran out of memory while running the channel program.
+ * There's not much we can do.
+ */
+ ri->zri_result = SET_ERROR(ENOSPC);
+ break;
+ default:
+ VERIFY0(err);
+ }
+}
+
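+/*
+ * Record a failure to open the pool: set the result to ECHRNG and convert
+ * the error message into the output nvlist.
+ */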
+static void
+zcp_pool_error(zcp_run_info_t *ri, const char *poolname)
+{
+ ri->zri_result = SET_ERROR(ECHRNG);
+ lua_settop(ri->zri_state, 0);
+ (void) lua_pushfstring(ri->zri_state, "Could not open pool: %s",
+ poolname);
+ zcp_convert_return_values(ri->zri_state, ri->zri_outnvl,
+ ZCP_RET_ERROR, &ri->zri_result);
+}
+
+/*
+ * This callback is called when txg_wait_synced_sig() encounters a signal.
+ * txg_wait_synced_sig() will continue to wait for the txg to complete
+ * after calling this callback.
+ */
+/* ARGSUSED */
+static void
+zcp_eval_sig(void *arg, dmu_tx_t *tx)
+{
+ zcp_run_info_t *ri = arg;
+
+ ri->zri_canceled = B_TRUE;
+}
+
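+/*
+ * Sync-task callback: the channel program runs here, in syncing context,
+ * with the Lua stack already prepared by zcp_eval().
+ */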
+static void
+zcp_eval_sync(void *arg, dmu_tx_t *tx)
+{
+ zcp_run_info_t *ri = arg;
+
+ /*
+ * Open context should have setup the stack to contain:
+ * 1: Error handler callback
+ * 2: Script to run (converted to a Lua function)
+ * 3: nvlist input to function (converted to Lua table or nil)
+ */
+ VERIFY3U(3, ==, lua_gettop(ri->zri_state));
+
+ zcp_eval_impl(tx, ri);
+}
+
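+/*
+ * Open-context evaluation path: run the channel program against a held pool
+ * using a throwaway transaction that is aborted afterwards, so no on-disk
+ * state is modified.
+ */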
+static void
+zcp_eval_open(zcp_run_info_t *ri, const char *poolname)
+{
+ int error;
+ dsl_pool_t *dp;
+ dmu_tx_t *tx;
+
+ /*
+ * See comment from the same assertion in zcp_eval_sync().
+ */
+ VERIFY3U(3, ==, lua_gettop(ri->zri_state));
+
+ error = dsl_pool_hold(poolname, FTAG, &dp);
+ if (error != 0) {
+ zcp_pool_error(ri, poolname);
+ return;
+ }
+
+ /*
+ * As we are running in open-context, we have no transaction associated
+ * with the channel program. At the same time, functions from the
+ * zfs.check submodule need to be associated with a transaction as
+ * they are basically dry-runs of their counterparts in the zfs.sync
+ * submodule. These functions should be able to run in open-context.
+ * Therefore we create a new transaction that we later abort once
+ * the channel program has been evaluated.
+ */
+ tx = dmu_tx_create_dd(dp->dp_mos_dir);
+
+ zcp_eval_impl(tx, ri);
+
+ dmu_tx_abort(tx);
+
+ dsl_pool_rele(dp, FTAG);
+}
+
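+/*
+ * Top-level entry point for running a channel program. Sets up the Lua
+ * interpreter (allocator, core libraries, the zfs.* module, and the error
+ * handler), loads the program text and its nvlist argument, runs it in
+ * either syncing or open context, and returns the result in outnvl.
+ */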
+int
+zcp_eval(const char *poolname, const char *program, boolean_t sync,
+ uint64_t instrlimit, uint64_t memlimit, nvpair_t *nvarg, nvlist_t *outnvl)
+{
+ int err;
+ lua_State *state;
+ zcp_run_info_t runinfo;
+
+ if (instrlimit > zfs_lua_max_instrlimit)
+ return (SET_ERROR(EINVAL));
+ if (memlimit == 0 || memlimit > zfs_lua_max_memlimit)
+ return (SET_ERROR(EINVAL));
+
+ zcp_alloc_arg_t allocargs = {
+ .aa_must_succeed = B_TRUE,
+ .aa_alloc_remaining = (int64_t)memlimit,
+ .aa_alloc_limit = (int64_t)memlimit,
+ };
+
+ /*
+ * Creates a Lua state with a memory allocator that uses KM_SLEEP.
+ * This should never fail.
+ */
+ state = lua_newstate(zcp_lua_alloc, &allocargs);
+ VERIFY(state != NULL);
+ (void) lua_atpanic(state, zcp_panic_cb);
+
+ /*
+ * Load core Lua libraries we want access to.
+ */
+ VERIFY3U(1, ==, luaopen_base(state));
+ lua_pop(state, 1);
+ VERIFY3U(1, ==, luaopen_coroutine(state));
+ lua_setglobal(state, LUA_COLIBNAME);
+ VERIFY0(lua_gettop(state));
+ VERIFY3U(1, ==, luaopen_string(state));
+ lua_setglobal(state, LUA_STRLIBNAME);
+ VERIFY0(lua_gettop(state));
+ VERIFY3U(1, ==, luaopen_table(state));
+ lua_setglobal(state, LUA_TABLIBNAME);
+ VERIFY0(lua_gettop(state));
+
+ /*
+ * Load globally visible variables such as errno aliases.
+ */
+ zcp_load_globals(state);
+ VERIFY0(lua_gettop(state));
+
+ /*
+ * Load ZFS-specific modules.
+ */
+ lua_newtable(state);
+ VERIFY3U(1, ==, zcp_load_list_lib(state));
+ lua_setfield(state, -2, "list");
+ VERIFY3U(1, ==, zcp_load_synctask_lib(state, B_FALSE));
+ lua_setfield(state, -2, "check");
+ VERIFY3U(1, ==, zcp_load_synctask_lib(state, B_TRUE));
+ lua_setfield(state, -2, "sync");
+ VERIFY3U(1, ==, zcp_load_get_lib(state));
+ lua_pushcclosure(state, zcp_debug_info.func, 0);
+ lua_setfield(state, -2, zcp_debug_info.name);
+ lua_pushcclosure(state, zcp_exists_info.func, 0);
+ lua_setfield(state, -2, zcp_exists_info.name);
+ lua_setglobal(state, "zfs");
+ VERIFY0(lua_gettop(state));
+
+ /*
+ * Push the error-callback that calculates Lua stack traces on
+ * unexpected failures.
+ */
+ lua_pushcfunction(state, zcp_error_handler);
+ VERIFY3U(1, ==, lua_gettop(state));
+
+ /*
+ * Load the actual script as a function onto the stack as text ("t").
+ * The only valid error condition is a syntax error in the script.
+ * ERRMEM should not be possible because our allocator is using
+ * KM_SLEEP. ERRGCMM should not be possible because we have not added
+ * any objects with __gc metamethods to the interpreter that could
+ * fail.
+ */
+ err = luaL_loadbufferx(state, program, strlen(program),
+ "channel program", "t");
+ if (err == LUA_ERRSYNTAX) {
+ fnvlist_add_string(outnvl, ZCP_RET_ERROR,
+ lua_tostring(state, -1));
+ lua_close(state);
+ return (SET_ERROR(EINVAL));
+ }
+ VERIFY0(err);
+ VERIFY3U(2, ==, lua_gettop(state));
+
+ /*
+ * Convert the input nvlist to a Lua object and put it on top of the
+ * stack.
+ */
+ char errmsg[128];
+ err = zcp_nvpair_value_to_lua(state, nvarg,
+ errmsg, sizeof (errmsg));
+ if (err != 0) {
+ fnvlist_add_string(outnvl, ZCP_RET_ERROR, errmsg);
+ lua_close(state);
+ return (SET_ERROR(EINVAL));
+ }
+ VERIFY3U(3, ==, lua_gettop(state));
+
+ runinfo.zri_state = state;
+ runinfo.zri_allocargs = &allocargs;
+ runinfo.zri_outnvl = outnvl;
+ runinfo.zri_result = 0;
+ runinfo.zri_cred = CRED();
+ runinfo.zri_proc = curproc;
+ runinfo.zri_timed_out = B_FALSE;
+ runinfo.zri_canceled = B_FALSE;
+ runinfo.zri_sync = sync;
+ runinfo.zri_space_used = 0;
+ runinfo.zri_curinstrs = 0;
+ runinfo.zri_maxinstrs = instrlimit;
+ runinfo.zri_new_zvols = fnvlist_alloc();
+
+ if (sync) {
+ err = dsl_sync_task_sig(poolname, NULL, zcp_eval_sync,
+ zcp_eval_sig, &runinfo, 0, ZFS_SPACE_CHECK_ZCP_EVAL);
+ if (err != 0)
+ zcp_pool_error(&runinfo, poolname);
+ } else {
+ zcp_eval_open(&runinfo, poolname);
+ }
+ lua_close(state);
+
+ /*
+ * Create device minor nodes for any new zvols.
+ */
+ for (nvpair_t *pair = nvlist_next_nvpair(runinfo.zri_new_zvols, NULL);
+ pair != NULL;
+ pair = nvlist_next_nvpair(runinfo.zri_new_zvols, pair)) {
+ zvol_create_minor(nvpair_name(pair));
+ }
+ fnvlist_free(runinfo.zri_new_zvols);
+
+ return (runinfo.zri_result);
+}
+
+/*
+ * Retrieve metadata about the currently running channel program.
+ */
+zcp_run_info_t *
+zcp_run_info(lua_State *state)
+{
+ zcp_run_info_t *ri;
+
+ lua_getfield(state, LUA_REGISTRYINDEX, ZCP_RUN_INFO_KEY);
+ ri = lua_touserdata(state, -1);
+ lua_pop(state, 1);
+ return (ri);
+}
+
+/*
+ * Argument Parsing
+ * ================
+ *
+ * The Lua language allows methods to be called with any number
+ * of arguments of any type. When calling back into ZFS we need to sanitize
+ * arguments from channel programs to make sure unexpected arguments or
+ * arguments of the wrong type result in clear error messages. To do this
+ * in a uniform way, all callbacks from channel programs should use the
+ * zcp_parse_args() function to interpret inputs.
+ *
+ * Positional vs Keyword Arguments
+ * ===============================
+ *
+ * Every callback function takes a fixed set of required positional arguments
+ * and optional keyword arguments. For example, the destroy function takes
+ * a single positional string argument (the name of the dataset to destroy)
+ * and an optional "defer" keyword boolean argument. When calling lua functions
+ * with parentheses, only positional arguments can be used:
+ *
+ * zfs.sync.snapshot("rpool@snap")
+ *
+ * To use keyword arguments, functions should be called with a single argument
+ * that is a lua table containing mappings of integer -> positional arguments
+ * and string -> keyword arguments:
+ *
+ * zfs.sync.snapshot({[1]="rpool@snap", defer=true})
+ *
+ * The lua language allows curly braces to be used in place of parentheses as
+ * syntactic sugar for this calling convention:
+ *
+ * zfs.sync.snapshot{"rpool@snap", defer=true}
+ */
+
+/*
+ * Throw a Lua error whose message describes the expected positional and
+ * keyword arguments. If the argument description does not fit in the output
+ * buffer, only the error format string is output.
+ */
+static void
+zcp_args_error(lua_State *state, const char *fname, const zcp_arg_t *pargs,
+ const zcp_arg_t *kwargs, const char *fmt, ...)
+{
+ int i;
+ char errmsg[512];
+ size_t len = sizeof (errmsg);
+ size_t msglen = 0;
+ va_list argp;
+
+ va_start(argp, fmt);
+ VERIFY3U(len, >, vsnprintf(errmsg, len, fmt, argp));
+ va_end(argp);
+
+ /*
+ * Calculate the total length of the final string, including extra
+ * formatting characters. If the argument dump would be too large,
+ * only print the error string.
+ */
+ msglen = strlen(errmsg);
+ msglen += strlen(fname) + 4; /* : + {} + null terminator */
+ for (i = 0; pargs[i].za_name != NULL; i++) {
+ msglen += strlen(pargs[i].za_name);
+ msglen += strlen(lua_typename(state, pargs[i].za_lua_type));
+ if (pargs[i + 1].za_name != NULL || kwargs[0].za_name != NULL)
+ msglen += 5; /* < + ( + )> + , */
+ else
+ msglen += 4; /* < + ( + )> */
+ }
+ for (i = 0; kwargs[i].za_name != NULL; i++) {
+ msglen += strlen(kwargs[i].za_name);
+ msglen += strlen(lua_typename(state, kwargs[i].za_lua_type));
+ if (kwargs[i + 1].za_name != NULL)
+ msglen += 4; /* =( + ) + , */
+ else
+ msglen += 3; /* =( + ) */
+ }
+
+ if (msglen >= len)
+ (void) luaL_error(state, errmsg);
+
+ VERIFY3U(len, >, strlcat(errmsg, ": ", len));
+ VERIFY3U(len, >, strlcat(errmsg, fname, len));
+ VERIFY3U(len, >, strlcat(errmsg, "{", len));
+ for (i = 0; pargs[i].za_name != NULL; i++) {
+ VERIFY3U(len, >, strlcat(errmsg, "<", len));
+ VERIFY3U(len, >, strlcat(errmsg, pargs[i].za_name, len));
+ VERIFY3U(len, >, strlcat(errmsg, "(", len));
+ VERIFY3U(len, >, strlcat(errmsg,
+ lua_typename(state, pargs[i].za_lua_type), len));
+ VERIFY3U(len, >, strlcat(errmsg, ")>", len));
+ if (pargs[i + 1].za_name != NULL || kwargs[0].za_name != NULL) {
+ VERIFY3U(len, >, strlcat(errmsg, ", ", len));
+ }
+ }
+ for (i = 0; kwargs[i].za_name != NULL; i++) {
+ VERIFY3U(len, >, strlcat(errmsg, kwargs[i].za_name, len));
+ VERIFY3U(len, >, strlcat(errmsg, "=(", len));
+ VERIFY3U(len, >, strlcat(errmsg,
+ lua_typename(state, kwargs[i].za_lua_type), len));
+ VERIFY3U(len, >, strlcat(errmsg, ")", len));
+ if (kwargs[i + 1].za_name != NULL) {
+ VERIFY3U(len, >, strlcat(errmsg, ", ", len));
+ }
+ }
+ VERIFY3U(len, >, strlcat(errmsg, "}", len));
+
+ (void) luaL_error(state, errmsg);
+ panic("unreachable code");
+}
+
+static void
+zcp_parse_table_args(lua_State *state, const char *fname,
+ const zcp_arg_t *pargs, const zcp_arg_t *kwargs)
+{
+ int i;
+ int type;
+
+ for (i = 0; pargs[i].za_name != NULL; i++) {
+ /*
+ * Check the table for this positional argument, leaving it
+ * on the top of the stack once we finish validating it.
+ */
+ lua_pushinteger(state, i + 1);
+ lua_gettable(state, 1);
+
+ type = lua_type(state, -1);
+ if (type == LUA_TNIL) {
+ zcp_args_error(state, fname, pargs, kwargs,
+ "too few arguments");
+ panic("unreachable code");
+ } else if (type != pargs[i].za_lua_type) {
+ zcp_args_error(state, fname, pargs, kwargs,
+ "arg %d wrong type (is '%s', expected '%s')",
+ i + 1, lua_typename(state, type),
+ lua_typename(state, pargs[i].za_lua_type));
+ panic("unreachable code");
+ }
+
+ /*
+ * Remove the positional argument from the table.
+ */
+ lua_pushinteger(state, i + 1);
+ lua_pushnil(state);
+ lua_settable(state, 1);
+ }
+
+ for (i = 0; kwargs[i].za_name != NULL; i++) {
+ /*
+ * Check the table for this keyword argument, which may be
+ * nil if it was omitted. Leave the value on the top of
+ * the stack after validating it.
+ */
+ lua_getfield(state, 1, kwargs[i].za_name);
+
+ type = lua_type(state, -1);
+ if (type != LUA_TNIL && type != kwargs[i].za_lua_type) {
+ zcp_args_error(state, fname, pargs, kwargs,
+ "kwarg '%s' wrong type (is '%s', expected '%s')",
+ kwargs[i].za_name, lua_typename(state, type),
+ lua_typename(state, kwargs[i].za_lua_type));
+ panic("unreachable code");
+ }
+
+ /*
+ * Remove the keyword argument from the table.
+ */
+ lua_pushnil(state);
+ lua_setfield(state, 1, kwargs[i].za_name);
+ }
+
+ /*
+ * Any entries remaining in the table are invalid inputs, print
+ * an error message based on what the entry is.
+ */
+ lua_pushnil(state);
+ if (lua_next(state, 1)) {
+ if (lua_isnumber(state, -2) && lua_tointeger(state, -2) > 0) {
+ zcp_args_error(state, fname, pargs, kwargs,
+ "too many positional arguments");
+ } else if (lua_isstring(state, -2)) {
+ zcp_args_error(state, fname, pargs, kwargs,
+ "invalid kwarg '%s'", lua_tostring(state, -2));
+ } else {
+ zcp_args_error(state, fname, pargs, kwargs,
+ "kwarg keys must be strings");
+ }
+ panic("unreachable code");
+ }
+
+ lua_remove(state, 1);
+}
+
+static void
+zcp_parse_pos_args(lua_State *state, const char *fname, const zcp_arg_t *pargs,
+ const zcp_arg_t *kwargs)
+{
+ int i;
+ int type;
+
+ for (i = 0; pargs[i].za_name != NULL; i++) {
+ type = lua_type(state, i + 1);
+ if (type == LUA_TNONE) {
+ zcp_args_error(state, fname, pargs, kwargs,
+ "too few arguments");
+ panic("unreachable code");
+ } else if (type != pargs[i].za_lua_type) {
+ zcp_args_error(state, fname, pargs, kwargs,
+ "arg %d wrong type (is '%s', expected '%s')",
+ i + 1, lua_typename(state, type),
+ lua_typename(state, pargs[i].za_lua_type));
+ panic("unreachable code");
+ }
+ }
+ if (lua_gettop(state) != i) {
+ zcp_args_error(state, fname, pargs, kwargs,
+ "too many positional arguments");
+ panic("unreachable code");
+ }
+
+ for (i = 0; kwargs[i].za_name != NULL; i++) {
+ lua_pushnil(state);
+ }
+}
+
+/*
+ * Checks the current Lua stack against an expected set of positional and
+ * keyword arguments. If the stack does not match the expected arguments, it
+ * aborts the current channel program with a useful error message; otherwise
+ * it rearranges the stack so that it contains the positional arguments
+ * followed by the keyword argument values in declaration order. Any missing
+ * keyword argument will be represented by a nil value on the stack.
+ *
+ * If the stack contains exactly one argument of type LUA_TTABLE the curly
+ * braces calling convention is assumed, otherwise the stack is parsed for
+ * positional arguments only.
+ *
+ * This function should be used by every function callback. It should be called
+ * before the callback manipulates the Lua stack as it assumes the stack
+ * represents the function arguments.
+ */
+void
+zcp_parse_args(lua_State *state, const char *fname, const zcp_arg_t *pargs,
+ const zcp_arg_t *kwargs)
+{
+ if (lua_gettop(state) == 1 && lua_istable(state, 1)) {
+ zcp_parse_table_args(state, fname, pargs, kwargs);
+ } else {
+ zcp_parse_pos_args(state, fname, pargs, kwargs);
+ }
+}
+
+/* BEGIN CSTYLED */
+ZFS_MODULE_PARAM(zfs_lua, zfs_lua_, max_instrlimit, ULONG, ZMOD_RW,
+ "Max instruction limit that can be specified for a channel program");
+
+ZFS_MODULE_PARAM(zfs_lua, zfs_lua_, max_memlimit, ULONG, ZMOD_RW,
+ "Max memory limit that can be specified for a channel program");
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/zfs/zcp_get.c b/sys/contrib/openzfs/module/zfs/zcp_get.c
new file mode 100644
index 000000000000..7256e4de1915
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/zcp_get.c
@@ -0,0 +1,813 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2016 by Delphix. All rights reserved.
+ */
+
+#include <sys/lua/lua.h>
+#include <sys/lua/lualib.h>
+#include <sys/lua/lauxlib.h>
+
+#include <zfs_prop.h>
+
+#include <sys/dsl_prop.h>
+#include <sys/dsl_synctask.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dmu_objset.h>
+#include <sys/mntent.h>
+#include <sys/sunddi.h>
+#include <sys/zap.h>
+#include <sys/zcp.h>
+#include <sys/zcp_iter.h>
+#include <sys/zcp_global.h>
+#include <sys/zcp_prop.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zfs_znode.h>
+#include <sys/zvol.h>
+
+#ifdef _KERNEL
+#include <sys/zfs_quota.h>
+#include <sys/zfs_vfsops.h>
+#endif
+
+static int
+get_objset_type(dsl_dataset_t *ds, zfs_type_t *type)
+{
+ int error;
+ objset_t *os;
+ error = dmu_objset_from_ds(ds, &os);
+ if (error != 0)
+ return (error);
+ if (ds->ds_is_snapshot) {
+ *type = ZFS_TYPE_SNAPSHOT;
+ } else {
+ switch (os->os_phys->os_type) {
+ case DMU_OST_ZFS:
+ *type = ZFS_TYPE_FILESYSTEM;
+ break;
+ case DMU_OST_ZVOL:
+ *type = ZFS_TYPE_VOLUME;
+ break;
+ default:
+ return (EINVAL);
+ }
+ }
+ return (0);
+}
+
+/*
+ * Returns the string name of ds's type in str (a buffer which should be
+ * at least 12 bytes long).
+ */
+static int
+get_objset_type_name(dsl_dataset_t *ds, char *str)
+{
+ int error;
+ zfs_type_t type;
+ error = get_objset_type(ds, &type);
+ if (error != 0)
+ return (error);
+ switch (type) {
+ case ZFS_TYPE_SNAPSHOT:
+ (void) strlcpy(str, "snapshot", ZAP_MAXVALUELEN);
+ break;
+ case ZFS_TYPE_FILESYSTEM:
+ (void) strlcpy(str, "filesystem", ZAP_MAXVALUELEN);
+ break;
+ case ZFS_TYPE_VOLUME:
+ (void) strlcpy(str, "volume", ZAP_MAXVALUELEN);
+ break;
+ default:
+ return (EINVAL);
+ }
+ return (0);
+}
+
+/*
+ * Determines the source of a property given its setpoint and
+ * property type. It pushes the source to the lua stack.
+ */
+static void
+get_prop_src(lua_State *state, const char *setpoint, zfs_prop_t prop)
+{
+ if (zfs_prop_readonly(prop) || (prop == ZFS_PROP_VERSION)) {
+ lua_pushnil(state);
+ } else {
+ const char *src;
+ if (strcmp("", setpoint) == 0) {
+ src = "default";
+ } else {
+ src = setpoint;
+ }
+ (void) lua_pushstring(state, src);
+ }
+}
+
+/*
+ * Given an error encountered while getting properties, either longjmp's for
+ * a fatal error or pushes nothing to the stack for a non-fatal one.
+ */
+static int
+zcp_handle_error(lua_State *state, const char *dataset_name,
+ const char *property_name, int error)
+{
+ ASSERT3S(error, !=, 0);
+ if (error == ENOENT) {
+ return (0);
+ } else if (error == EINVAL) {
+ return (luaL_error(state,
+ "property '%s' is not a valid property on dataset '%s'",
+ property_name, dataset_name));
+ } else if (error == EIO) {
+ return (luaL_error(state,
+ "I/O error while retrieving property '%s' on dataset '%s'",
+ property_name, dataset_name));
+ } else {
+ return (luaL_error(state, "unexpected error %d while "
+ "retrieving property '%s' on dataset '%s'",
+ error, property_name, dataset_name));
+ }
+}
+
+/*
+ * Look up a user-defined property in the zap object. If it exists, push it
+ * and the setpoint onto the stack; otherwise don't push anything.
+ */
+static int
+zcp_get_user_prop(lua_State *state, dsl_pool_t *dp, const char *dataset_name,
+ const char *property_name)
+{
+ int error;
+ char *buf;
+ char setpoint[ZFS_MAX_DATASET_NAME_LEN];
+ /*
+ * zcp_dataset_hold will either successfully return the requested
+ * dataset or throw a lua error and longjmp out of the zfs.get_prop call
+ * without returning.
+ */
+ dsl_dataset_t *ds = zcp_dataset_hold(state, dp, dataset_name, FTAG);
+ if (ds == NULL)
+ return (1); /* not reached; zcp_dataset_hold() longjmp'd */
+
+ buf = kmem_alloc(ZAP_MAXVALUELEN, KM_SLEEP);
+ error = dsl_prop_get_ds(ds, property_name, 1, ZAP_MAXVALUELEN,
+ buf, setpoint);
+ dsl_dataset_rele(ds, FTAG);
+
+ if (error != 0) {
+ kmem_free(buf, ZAP_MAXVALUELEN);
+ return (zcp_handle_error(state, dataset_name, property_name,
+ error));
+ }
+ (void) lua_pushstring(state, buf);
+ (void) lua_pushstring(state, setpoint);
+ kmem_free(buf, ZAP_MAXVALUELEN);
+ return (2);
+}
+
+/*
+ * Check if the property we're looking for is stored in the ds_dir. If so,
+ * return it in the 'val' argument. Return 0 on success, or ENOENT if
+ * the property is not present.
+ */
+static int
+get_dsl_dir_prop(dsl_dataset_t *ds, zfs_prop_t zfs_prop,
+ uint64_t *val)
+{
+ dsl_dir_t *dd = ds->ds_dir;
+ mutex_enter(&dd->dd_lock);
+ switch (zfs_prop) {
+ case ZFS_PROP_USEDSNAP:
+ *val = dsl_dir_get_usedsnap(dd);
+ break;
+ case ZFS_PROP_USEDCHILD:
+ *val = dsl_dir_get_usedchild(dd);
+ break;
+ case ZFS_PROP_USEDDS:
+ *val = dsl_dir_get_usedds(dd);
+ break;
+ case ZFS_PROP_USEDREFRESERV:
+ *val = dsl_dir_get_usedrefreserv(dd);
+ break;
+ case ZFS_PROP_LOGICALUSED:
+ *val = dsl_dir_get_logicalused(dd);
+ break;
+ default:
+ mutex_exit(&dd->dd_lock);
+ return (SET_ERROR(ENOENT));
+ }
+ mutex_exit(&dd->dd_lock);
+ return (0);
+}
+
+/*
+ * Check if the property we're looking for is stored at the dsl_dataset or
+ * dsl_dir level. If so, push the property value and source onto the lua stack
+ * and return 0. If it is not present or a failure occurs in lookup, return a
+ * non-zero error value.
+ */
+static int
+get_special_prop(lua_State *state, dsl_dataset_t *ds, const char *dsname,
+ zfs_prop_t zfs_prop)
+{
+ int error = 0;
+ objset_t *os;
+ uint64_t numval = 0;
+ char *strval = kmem_alloc(ZAP_MAXVALUELEN, KM_SLEEP);
+ char setpoint[ZFS_MAX_DATASET_NAME_LEN] =
+ "Internal error - setpoint not determined";
+ zfs_type_t ds_type;
+ zprop_type_t prop_type = zfs_prop_get_type(zfs_prop);
+ (void) get_objset_type(ds, &ds_type);
+
+ switch (zfs_prop) {
+ case ZFS_PROP_REFRATIO:
+ numval = dsl_get_refratio(ds);
+ break;
+ case ZFS_PROP_USED:
+ numval = dsl_get_used(ds);
+ break;
+ case ZFS_PROP_CLONES: {
+ nvlist_t *clones = fnvlist_alloc();
+ error = get_clones_stat_impl(ds, clones);
+ if (error == 0) {
+ /* push list to lua stack */
+ VERIFY0(zcp_nvlist_to_lua(state, clones, NULL, 0ULL));
+ /* source */
+ (void) lua_pushnil(state);
+ }
+ nvlist_free(clones);
+ kmem_free(strval, ZAP_MAXVALUELEN);
+ return (error);
+ }
+ case ZFS_PROP_COMPRESSRATIO:
+ numval = dsl_get_compressratio(ds);
+ break;
+ case ZFS_PROP_CREATION:
+ numval = dsl_get_creation(ds);
+ break;
+ case ZFS_PROP_REFERENCED:
+ numval = dsl_get_referenced(ds);
+ break;
+ case ZFS_PROP_AVAILABLE:
+ numval = dsl_get_available(ds);
+ break;
+ case ZFS_PROP_LOGICALREFERENCED:
+ numval = dsl_get_logicalreferenced(ds);
+ break;
+ case ZFS_PROP_CREATETXG:
+ numval = dsl_get_creationtxg(ds);
+ break;
+ case ZFS_PROP_GUID:
+ numval = dsl_get_guid(ds);
+ break;
+ case ZFS_PROP_UNIQUE:
+ numval = dsl_get_unique(ds);
+ break;
+ case ZFS_PROP_OBJSETID:
+ numval = dsl_get_objsetid(ds);
+ break;
+ case ZFS_PROP_ORIGIN:
+ dsl_dir_get_origin(ds->ds_dir, strval);
+ break;
+ case ZFS_PROP_USERACCOUNTING:
+ error = dmu_objset_from_ds(ds, &os);
+ if (error == 0)
+ numval = dmu_objset_userspace_present(os);
+ break;
+ case ZFS_PROP_WRITTEN:
+ error = dsl_get_written(ds, &numval);
+ break;
+ case ZFS_PROP_TYPE:
+ error = get_objset_type_name(ds, strval);
+ break;
+ case ZFS_PROP_PREV_SNAP:
+ error = dsl_get_prev_snap(ds, strval);
+ break;
+ case ZFS_PROP_NAME:
+ dsl_dataset_name(ds, strval);
+ break;
+ case ZFS_PROP_MOUNTPOINT:
+ error = dsl_get_mountpoint(ds, dsname, strval, setpoint);
+ break;
+ case ZFS_PROP_VERSION:
+ /* should be a snapshot or filesystem */
+ ASSERT(ds_type != ZFS_TYPE_VOLUME);
+ error = dmu_objset_from_ds(ds, &os);
+ /* look in the master node for the version */
+ if (error == 0) {
+ error = zap_lookup(os, MASTER_NODE_OBJ, ZPL_VERSION_STR,
+ sizeof (numval), 1, &numval);
+ }
+ break;
+ case ZFS_PROP_DEFER_DESTROY:
+ numval = dsl_get_defer_destroy(ds);
+ break;
+ case ZFS_PROP_USERREFS:
+ numval = dsl_get_userrefs(ds);
+ break;
+ case ZFS_PROP_FILESYSTEM_COUNT:
+ error = dsl_dir_get_filesystem_count(ds->ds_dir, &numval);
+ (void) strlcpy(setpoint, "", ZFS_MAX_DATASET_NAME_LEN);
+ break;
+ case ZFS_PROP_SNAPSHOT_COUNT:
+ error = dsl_dir_get_snapshot_count(ds->ds_dir, &numval);
+ (void) strlcpy(setpoint, "", ZFS_MAX_DATASET_NAME_LEN);
+ break;
+ case ZFS_PROP_NUMCLONES:
+ numval = dsl_get_numclones(ds);
+ break;
+ case ZFS_PROP_INCONSISTENT:
+ numval = dsl_get_inconsistent(ds);
+ break;
+ case ZFS_PROP_IVSET_GUID:
+ if (dsl_dataset_is_zapified(ds)) {
+ error = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset,
+ ds->ds_object, DS_FIELD_IVSET_GUID,
+ sizeof (numval), 1, &numval);
+ } else {
+ error = ENOENT;
+ }
+ break;
+ case ZFS_PROP_RECEIVE_RESUME_TOKEN: {
+ char *token = get_receive_resume_stats_impl(ds);
+
+ (void) strlcpy(strval, token, ZAP_MAXVALUELEN);
+ if (strcmp(strval, "") == 0) {
+ char *childval = get_child_receive_stats(ds);
+
+ (void) strlcpy(strval, childval, ZAP_MAXVALUELEN);
+ if (strcmp(strval, "") == 0)
+ error = ENOENT;
+
+ kmem_strfree(childval);
+ }
+ kmem_strfree(token);
+ break;
+ }
+ case ZFS_PROP_VOLSIZE:
+ ASSERT(ds_type == ZFS_TYPE_VOLUME ||
+ ds_type == ZFS_TYPE_SNAPSHOT);
+ error = dmu_objset_from_ds(ds, &os);
+ if (error == 0) {
+ error = zap_lookup(os, ZVOL_ZAP_OBJ, "size",
+ sizeof (numval), 1, &numval);
+ }
+ if (error == 0)
+ (void) strlcpy(setpoint, dsname,
+ ZFS_MAX_DATASET_NAME_LEN);
+
+ break;
+ case ZFS_PROP_VOLBLOCKSIZE: {
+ ASSERT(ds_type == ZFS_TYPE_VOLUME);
+ dmu_object_info_t doi;
+ error = dmu_objset_from_ds(ds, &os);
+ if (error == 0) {
+ error = dmu_object_info(os, ZVOL_OBJ, &doi);
+ if (error == 0)
+ numval = doi.doi_data_block_size;
+ }
+ break;
+ }
+
+ case ZFS_PROP_KEYSTATUS:
+ case ZFS_PROP_KEYFORMAT: {
+ /* provide defaults in case no crypto obj exists */
+ setpoint[0] = '\0';
+ if (zfs_prop == ZFS_PROP_KEYSTATUS)
+ numval = ZFS_KEYSTATUS_NONE;
+ else
+ numval = ZFS_KEYFORMAT_NONE;
+
+ nvlist_t *nvl, *propval;
+ nvl = fnvlist_alloc();
+ dsl_dataset_crypt_stats(ds, nvl);
+ if (nvlist_lookup_nvlist(nvl, zfs_prop_to_name(zfs_prop),
+ &propval) == 0) {
+ char *source;
+
+ (void) nvlist_lookup_uint64(propval, ZPROP_VALUE,
+ &numval);
+ if (nvlist_lookup_string(propval, ZPROP_SOURCE,
+ &source) == 0)
+ strlcpy(setpoint, source, sizeof (setpoint));
+ }
+ nvlist_free(nvl);
+ break;
+ }
+
+ default:
+ /* Did not match these props, check in the dsl_dir */
+ error = get_dsl_dir_prop(ds, zfs_prop, &numval);
+ }
+ if (error != 0) {
+ kmem_free(strval, ZAP_MAXVALUELEN);
+ return (error);
+ }
+
+ switch (prop_type) {
+ case PROP_TYPE_NUMBER: {
+ (void) lua_pushnumber(state, numval);
+ break;
+ }
+ case PROP_TYPE_STRING: {
+ (void) lua_pushstring(state, strval);
+ break;
+ }
+ case PROP_TYPE_INDEX: {
+ const char *propval;
+ error = zfs_prop_index_to_string(zfs_prop, numval, &propval);
+ if (error != 0) {
+ kmem_free(strval, ZAP_MAXVALUELEN);
+ return (error);
+ }
+ (void) lua_pushstring(state, propval);
+ break;
+ }
+ }
+ kmem_free(strval, ZAP_MAXVALUELEN);
+
+ /* Push the source to the stack */
+ get_prop_src(state, setpoint, zfs_prop);
+ return (0);
+}
+
+/*
+ * Look up a property and its source in the zap object. If the value is
+ * present and successfully retrieved, push the value and source on the
+ * lua stack and return 0. On failure, return a non-zero error value.
+ */
+static int
+get_zap_prop(lua_State *state, dsl_dataset_t *ds, zfs_prop_t zfs_prop)
+{
+ int error = 0;
+ char setpoint[ZFS_MAX_DATASET_NAME_LEN];
+ char *strval = kmem_alloc(ZAP_MAXVALUELEN, KM_SLEEP);
+ uint64_t numval;
+ const char *prop_name = zfs_prop_to_name(zfs_prop);
+ zprop_type_t prop_type = zfs_prop_get_type(zfs_prop);
+
+ if (prop_type == PROP_TYPE_STRING) {
+ /* Push value to lua stack */
+ error = dsl_prop_get_ds(ds, prop_name, 1,
+ ZAP_MAXVALUELEN, strval, setpoint);
+ if (error == 0)
+ (void) lua_pushstring(state, strval);
+ } else {
+ error = dsl_prop_get_ds(ds, prop_name, sizeof (numval),
+ 1, &numval, setpoint);
+
+#ifdef _KERNEL
+ /* Fill in temporary value for prop, if applicable */
+ (void) zfs_get_temporary_prop(ds, zfs_prop, &numval, setpoint);
+#else
+ return (luaL_error(state,
+ "temporary properties only supported in kernel mode",
+ prop_name));
+#endif
+ /* Push value to lua stack */
+ if (prop_type == PROP_TYPE_INDEX) {
+ const char *propval;
+ error = zfs_prop_index_to_string(zfs_prop, numval,
+ &propval);
+ if (error == 0)
+ (void) lua_pushstring(state, propval);
+ } else {
+ if (error == 0)
+ (void) lua_pushnumber(state, numval);
+ }
+ }
+ kmem_free(strval, ZAP_MAXVALUELEN);
+ if (error == 0)
+ get_prop_src(state, setpoint, zfs_prop);
+ return (error);
+}
+
+/*
+ * Determine whether a property is valid for a given dataset.
+ */
+boolean_t
+prop_valid_for_ds(dsl_dataset_t *ds, zfs_prop_t zfs_prop)
+{
+ int error;
+ zfs_type_t zfs_type;
+
+ /* properties not supported */
+ if ((zfs_prop == ZFS_PROP_ISCSIOPTIONS) ||
+ (zfs_prop == ZFS_PROP_MOUNTED))
+ return (B_FALSE);
+
+ /* if we want the origin prop, ds must be a clone */
+ if ((zfs_prop == ZFS_PROP_ORIGIN) && (!dsl_dir_is_clone(ds->ds_dir)))
+ return (B_FALSE);
+
+ error = get_objset_type(ds, &zfs_type);
+ if (error != 0)
+ return (B_FALSE);
+ return (zfs_prop_valid_for_type(zfs_prop, zfs_type, B_FALSE));
+}
+
+/*
+ * Look up a given dataset property. On success return 2, the number of
+ * values pushed to the lua stack (property value and source). On a fatal
+ * error, longjmp. On a non-fatal error, push nothing.
+ */
+static int
+zcp_get_system_prop(lua_State *state, dsl_pool_t *dp, const char *dataset_name,
+ zfs_prop_t zfs_prop)
+{
+ int error;
+ /*
+ * zcp_dataset_hold will either successfully return the requested
+ * dataset or throw a lua error and longjmp out of the zfs.get_prop call
+ * without returning.
+ */
+ dsl_dataset_t *ds = zcp_dataset_hold(state, dp, dataset_name, FTAG);
+ if (ds == NULL)
+ return (1); /* not reached; zcp_dataset_hold() longjmp'd */
+
+ /* Check that the property is valid for the given dataset */
+ const char *prop_name = zfs_prop_to_name(zfs_prop);
+ if (!prop_valid_for_ds(ds, zfs_prop)) {
+ dsl_dataset_rele(ds, FTAG);
+ return (0);
+ }
+
+ /* Check if the property can be accessed directly */
+ error = get_special_prop(state, ds, dataset_name, zfs_prop);
+ if (error == 0) {
+ dsl_dataset_rele(ds, FTAG);
+ /* The value and source have been pushed by get_special_prop */
+ return (2);
+ }
+ if (error != ENOENT) {
+ dsl_dataset_rele(ds, FTAG);
+ return (zcp_handle_error(state, dataset_name,
+ prop_name, error));
+ }
+
+ /* If we were unable to find it, look in the zap object */
+ error = get_zap_prop(state, ds, zfs_prop);
+ dsl_dataset_rele(ds, FTAG);
+ if (error != 0) {
+ return (zcp_handle_error(state, dataset_name,
+ prop_name, error));
+ }
+ /* The value and source have been pushed by get_zap_prop */
+ return (2);
+}
+
+#ifdef _KERNEL
+static zfs_userquota_prop_t
+get_userquota_prop(const char *prop_name)
+{
+ zfs_userquota_prop_t type;
+ /* Figure out the property type ({user|group}{quota|used}) */
+ for (type = 0; type < ZFS_NUM_USERQUOTA_PROPS; type++) {
+ if (strncmp(prop_name, zfs_userquota_prop_prefixes[type],
+ strlen(zfs_userquota_prop_prefixes[type])) == 0)
+ break;
+ }
+ return (type);
+}
+
+/*
+ * Given the name of a zfs_userquota_prop, this function determines the
+ * prop type as well as the numeric group/user ids based on the string
+ * following the '@' in the property name. On success, returns 0. On failure,
+ * returns a non-zero error.
+ * 'domain' must be freed by the caller using kmem_strfree().
+ */
+static int
+parse_userquota_prop(const char *prop_name, zfs_userquota_prop_t *type,
+ char **domain, uint64_t *rid)
+{
+ char *cp, *end, *domain_val;
+
+ *type = get_userquota_prop(prop_name);
+ if (*type >= ZFS_NUM_USERQUOTA_PROPS)
+ return (EINVAL);
+
+ *rid = 0;
+ cp = strchr(prop_name, '@') + 1;
+ if (strncmp(cp, "S-1-", 4) == 0) {
+ /*
+ * It's a numeric SID (eg "S-1-234-567-89") and we want to
+ * separate the domain id and the rid
+ */
+ int domain_len = strrchr(cp, '-') - cp;
+ domain_val = kmem_alloc(domain_len + 1, KM_SLEEP);
+ (void) strncpy(domain_val, cp, domain_len);
+ domain_val[domain_len] = '\0';
+ cp += domain_len + 1;
+
+ (void) ddi_strtoll(cp, &end, 10, (longlong_t *)rid);
+ if (*end != '\0') {
+ kmem_strfree(domain_val);
+ return (EINVAL);
+ }
+ } else {
+ /* It's only a user/group ID (eg "12345"), just get the rid */
+ domain_val = NULL;
+ (void) ddi_strtoll(cp, &end, 10, (longlong_t *)rid);
+ if (*end != '\0')
+ return (EINVAL);
+ }
+ *domain = domain_val;
+ return (0);
+}
+
+/*
+ * Look up a {user|group}{quota|used} property for the given dataset. On
+ * success push the value (quota or used amount) and the setpoint. On a
+ * fatal error throw a lua error; on a non-fatal error push nothing.
+ */
+static int
+zcp_get_userquota_prop(lua_State *state, dsl_pool_t *dp,
+ const char *dataset_name, const char *prop_name)
+{
+ zfsvfs_t *zfvp;
+ zfsvfs_t *zfsvfs;
+ int error;
+ zfs_userquota_prop_t type;
+ char *domain;
+ uint64_t rid, value = 0;
+ objset_t *os;
+
+ dsl_dataset_t *ds = zcp_dataset_hold(state, dp, dataset_name, FTAG);
+ if (ds == NULL)
+ return (1); /* not reached; zcp_dataset_hold() longjmp'd */
+
+ error = parse_userquota_prop(prop_name, &type, &domain, &rid);
+ if (error == 0) {
+ error = dmu_objset_from_ds(ds, &os);
+ if (error == 0) {
+ zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
+ error = zfsvfs_create_impl(&zfvp, zfsvfs, os);
+ if (error == 0) {
+ error = zfs_userspace_one(zfvp, type, domain,
+ rid, &value);
+ zfsvfs_free(zfvp);
+ }
+ }
+ if (domain != NULL)
+ kmem_strfree(domain);
+ }
+ dsl_dataset_rele(ds, FTAG);
+
+ if ((value == 0) && ((type == ZFS_PROP_USERQUOTA) ||
+ (type == ZFS_PROP_GROUPQUOTA)))
+ error = SET_ERROR(ENOENT);
+ if (error != 0) {
+ return (zcp_handle_error(state, dataset_name,
+ prop_name, error));
+ }
+
+ (void) lua_pushnumber(state, value);
+ (void) lua_pushstring(state, dataset_name);
+ return (2);
+}
+#endif
+
+/*
+ * Determines the name of the snapshot referenced in the written property
+ * name. Returns the snapshot name in snap_name, a buffer that must be at
+ * least ZFS_MAX_DATASET_NAME_LEN bytes long.
+ */
+static void
+parse_written_prop(const char *dataset_name, const char *prop_name,
+ char *snap_name)
+{
+ ASSERT(zfs_prop_written(prop_name));
+ const char *name = prop_name + ZFS_WRITTEN_PROP_PREFIX_LEN;
+ if (strchr(name, '@') == NULL) {
+ (void) snprintf(snap_name, ZFS_MAX_DATASET_NAME_LEN, "%s@%s",
+ dataset_name, name);
+ } else {
+ (void) strlcpy(snap_name, name, ZFS_MAX_DATASET_NAME_LEN);
+ }
+}
+
+/*
+ * Look up written@ property for given dataset. On success
+ * push the value and the setpoint. If error is fatal, we will
+ * longjmp, otherwise push nothing.
+ */
+static int
+zcp_get_written_prop(lua_State *state, dsl_pool_t *dp,
+ const char *dataset_name, const char *prop_name)
+{
+ char snap_name[ZFS_MAX_DATASET_NAME_LEN];
+ uint64_t used, comp, uncomp;
+ dsl_dataset_t *old;
+ int error = 0;
+
+ parse_written_prop(dataset_name, prop_name, snap_name);
+ dsl_dataset_t *new = zcp_dataset_hold(state, dp, dataset_name, FTAG);
+ if (new == NULL)
+ return (1); /* not reached; zcp_dataset_hold() longjmp'd */
+
+ error = dsl_dataset_hold(dp, snap_name, FTAG, &old);
+ if (error != 0) {
+ dsl_dataset_rele(new, FTAG);
+ return (zcp_dataset_hold_error(state, dp, snap_name,
+ error));
+ }
+ error = dsl_dataset_space_written(old, new,
+ &used, &comp, &uncomp);
+
+ dsl_dataset_rele(old, FTAG);
+ dsl_dataset_rele(new, FTAG);
+
+ if (error != 0) {
+ return (zcp_handle_error(state, dataset_name,
+ snap_name, error));
+ }
+ (void) lua_pushnumber(state, used);
+ (void) lua_pushstring(state, dataset_name);
+ return (2);
+}
+
+static int zcp_get_prop(lua_State *state);
+static zcp_lib_info_t zcp_get_prop_info = {
+ .name = "get_prop",
+ .func = zcp_get_prop,
+ .pargs = {
+ { .za_name = "dataset", .za_lua_type = LUA_TSTRING},
+ { .za_name = "property", .za_lua_type = LUA_TSTRING},
+ {NULL, 0}
+ },
+ .kwargs = {
+ {NULL, 0}
+ }
+};
+
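+/*
+ * Implementation of the Lua zfs.get_prop(dataset, property) call: looks up
+ * user, userquota@, written@, and system properties, returning the value
+ * followed by its source (which may be nil), e.g.:
+ *
+ * used, src = zfs.get_prop("rpool/home", "used")
+ */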
+static int
+zcp_get_prop(lua_State *state)
+{
+ const char *dataset_name;
+ const char *property_name;
+ dsl_pool_t *dp = zcp_run_info(state)->zri_pool;
+ zcp_lib_info_t *libinfo = &zcp_get_prop_info;
+
+ zcp_parse_args(state, libinfo->name, libinfo->pargs, libinfo->kwargs);
+
+ dataset_name = lua_tostring(state, 1);
+ property_name = lua_tostring(state, 2);
+
+ /* User defined property */
+ if (zfs_prop_user(property_name)) {
+ return (zcp_get_user_prop(state, dp,
+ dataset_name, property_name));
+ }
+ /* userspace property */
+ if (zfs_prop_userquota(property_name)) {
+#ifdef _KERNEL
+ return (zcp_get_userquota_prop(state, dp,
+ dataset_name, property_name));
+#else
+ return (luaL_error(state,
+ "user quota properties only supported in kernel mode",
+ property_name));
+#endif
+ }
+ /* written@ property */
+ if (zfs_prop_written(property_name)) {
+ return (zcp_get_written_prop(state, dp,
+ dataset_name, property_name));
+ }
+
+ zfs_prop_t zfs_prop = zfs_name_to_prop(property_name);
+ /* Valid system property */
+ if (zfs_prop != ZPROP_INVAL) {
+ return (zcp_get_system_prop(state, dp, dataset_name,
+ zfs_prop));
+ }
+
+ /* Invalid property name */
+ return (luaL_error(state,
+ "'%s' is not a valid property", property_name));
+}
+
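+/*
+ * Register zfs.get_prop in the module table that the caller has left on top
+ * of the Lua stack.
+ */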
+int
+zcp_load_get_lib(lua_State *state)
+{
+ lua_pushcclosure(state, zcp_get_prop_info.func, 0);
+ lua_setfield(state, -2, zcp_get_prop_info.name);
+
+ return (1);
+}
diff --git a/sys/contrib/openzfs/module/zfs/zcp_global.c b/sys/contrib/openzfs/module/zfs/zcp_global.c
new file mode 100644
index 000000000000..8e166e0736d6
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/zcp_global.c
@@ -0,0 +1,89 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2016, 2017 by Delphix. All rights reserved.
+ */
+
+#include <sys/zcp_global.h>
+
+#include <sys/lua/lua.h>
+#include <sys/lua/lauxlib.h>
+
+typedef struct zcp_errno_global {
+ const char *zeg_name;
+ int zeg_errno;
+} zcp_errno_global_t;
+
+static const zcp_errno_global_t errno_globals[] = {
+ {"EPERM", EPERM},
+ {"ENOENT", ENOENT},
+ {"ESRCH", ESRCH},
+ {"EINTR", EINTR},
+ {"EIO", EIO},
+ {"ENXIO", ENXIO},
+ {"E2BIG", E2BIG},
+ {"ENOEXEC", ENOEXEC},
+ {"EBADF", EBADF},
+ {"ECHILD", ECHILD},
+ {"EAGAIN", EAGAIN},
+ {"ENOMEM", ENOMEM},
+ {"EACCES", EACCES},
+ {"EFAULT", EFAULT},
+ {"ENOTBLK", ENOTBLK},
+ {"EBUSY", EBUSY},
+ {"EEXIST", EEXIST},
+ {"EXDEV", EXDEV},
+ {"ENODEV", ENODEV},
+ {"ENOTDIR", ENOTDIR},
+ {"EISDIR", EISDIR},
+ {"EINVAL", EINVAL},
+ {"ENFILE", ENFILE},
+ {"EMFILE", EMFILE},
+ {"ENOTTY", ENOTTY},
+ {"ETXTBSY", ETXTBSY},
+ {"EFBIG", EFBIG},
+ {"ENOSPC", ENOSPC},
+ {"ESPIPE", ESPIPE},
+ {"EROFS", EROFS},
+ {"EMLINK", EMLINK},
+ {"EPIPE", EPIPE},
+ {"EDOM", EDOM},
+ {"ERANGE", ERANGE},
+ {"EDEADLK", EDEADLK},
+ {"ENOLCK", ENOLCK},
+ {"ECANCELED", ECANCELED},
+ {"ENOTSUP", ENOTSUP},
+ {"EDQUOT", EDQUOT},
+ {"ENAMETOOLONG", ENAMETOOLONG},
+ {0, 0}
+};
+
+static void
+zcp_load_errno_globals(lua_State *state)
+{
+ const zcp_errno_global_t *global = errno_globals;
+ while (global->zeg_name != NULL) {
+ lua_pushnumber(state, (lua_Number)global->zeg_errno);
+ lua_setglobal(state, global->zeg_name);
+ global++;
+ }
+}
+
+void
+zcp_load_globals(lua_State *state)
+{
+ zcp_load_errno_globals(state);
+}
diff --git a/sys/contrib/openzfs/module/zfs/zcp_iter.c b/sys/contrib/openzfs/module/zfs/zcp_iter.c
new file mode 100644
index 000000000000..f727c56f212d
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/zcp_iter.c
@@ -0,0 +1,751 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2016, 2018 by Delphix. All rights reserved.
+ */
+
+#include <sys/lua/lua.h>
+#include <sys/lua/lauxlib.h>
+
+#include <sys/dmu.h>
+#include <sys/dsl_prop.h>
+#include <sys/dsl_synctask.h>
+#include <sys/dsl_bookmark.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_pool.h>
+#include <sys/dmu_tx.h>
+#include <sys/dmu_objset.h>
+#include <sys/zap.h>
+#include <sys/dsl_dir.h>
+#include <sys/zcp_prop.h>
+
+#include <sys/zcp.h>
+
+#include "zfs_comutil.h"
+
+typedef int (zcp_list_func_t)(lua_State *);
+typedef struct zcp_list_info {
+ const char *name;
+ zcp_list_func_t *func;
+ zcp_list_func_t *gc;
+ const zcp_arg_t pargs[4];
+ const zcp_arg_t kwargs[2];
+} zcp_list_info_t;
+
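+/*
+ * Iterator function returned by zfs.list.clones(). Upvalue 1 is the
+ * snapshot's object number and upvalue 2 is the serialized ZAP cursor into
+ * its next-clones object; each call returns the name of the next clone, or
+ * nothing once the list is exhausted.
+ */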
+static int
+zcp_clones_iter(lua_State *state)
+{
+ int err;
+ char clonename[ZFS_MAX_DATASET_NAME_LEN];
+ uint64_t dsobj = lua_tonumber(state, lua_upvalueindex(1));
+ uint64_t cursor = lua_tonumber(state, lua_upvalueindex(2));
+ dsl_pool_t *dp = zcp_run_info(state)->zri_pool;
+ dsl_dataset_t *ds, *clone;
+ zap_attribute_t za;
+ zap_cursor_t zc;
+
+ err = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds);
+ if (err == ENOENT) {
+ return (0);
+ } else if (err != 0) {
+ return (luaL_error(state,
+ "unexpected error %d from dsl_dataset_hold_obj(dsobj)",
+ err));
+ }
+
+ if (dsl_dataset_phys(ds)->ds_next_clones_obj == 0) {
+ dsl_dataset_rele(ds, FTAG);
+ return (0);
+ }
+
+ zap_cursor_init_serialized(&zc, dp->dp_meta_objset,
+ dsl_dataset_phys(ds)->ds_next_clones_obj, cursor);
+ dsl_dataset_rele(ds, FTAG);
+
+ err = zap_cursor_retrieve(&zc, &za);
+ if (err != 0) {
+ zap_cursor_fini(&zc);
+ if (err != ENOENT) {
+ return (luaL_error(state,
+ "unexpected error %d from zap_cursor_retrieve()",
+ err));
+ }
+ return (0);
+ }
+ zap_cursor_advance(&zc);
+ cursor = zap_cursor_serialize(&zc);
+ zap_cursor_fini(&zc);
+
+ err = dsl_dataset_hold_obj(dp, za.za_first_integer, FTAG, &clone);
+ if (err != 0) {
+ return (luaL_error(state,
+ "unexpected error %d from "
+ "dsl_dataset_hold_obj(za_first_integer)", err));
+ }
+
+ dsl_dir_name(clone->ds_dir, clonename);
+ dsl_dataset_rele(clone, FTAG);
+
+ lua_pushnumber(state, cursor);
+ lua_replace(state, lua_upvalueindex(2));
+
+ (void) lua_pushstring(state, clonename);
+ return (1);
+}
+
+static int zcp_clones_list(lua_State *);
+static zcp_list_info_t zcp_clones_list_info = {
+ .name = "clones",
+ .func = zcp_clones_list,
+ .gc = NULL,
+ .pargs = {
+ { .za_name = "snapshot", .za_lua_type = LUA_TSTRING},
+ {NULL, 0}
+ },
+ .kwargs = {
+ {NULL, 0}
+ }
+};
+
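+/*
+ * Implementation of zfs.list.clones(snapshot): returns an iterator over the
+ * names of the clones of the given snapshot, e.g.:
+ *
+ * for clone in zfs.list.clones("rpool/fs@snap") do ... end
+ */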
+static int
+zcp_clones_list(lua_State *state)
+{
+ const char *snapname = lua_tostring(state, 1);
+ dsl_pool_t *dp = zcp_run_info(state)->zri_pool;
+
+ /*
+ * zcp_dataset_hold will either successfully return the requested
+ * dataset or throw a lua error and longjmp out of the zfs.list.clones
+ * call without returning.
+ */
+ dsl_dataset_t *ds = zcp_dataset_hold(state, dp, snapname, FTAG);
+ if (ds == NULL)
+ return (1); /* not reached; zcp_dataset_hold() longjmp'd */
+ boolean_t issnap = ds->ds_is_snapshot;
+ uint64_t cursor = 0;
+ uint64_t dsobj = ds->ds_object;
+ dsl_dataset_rele(ds, FTAG);
+
+ if (!issnap) {
+ return (zcp_argerror(state, 1, "%s is not a snapshot",
+ snapname));
+ }
+
+ lua_pushnumber(state, dsobj);
+ lua_pushnumber(state, cursor);
+ lua_pushcclosure(state, &zcp_clones_iter, 2);
+ return (1);
+}
+
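+/*
+ * Iterator function returned by zfs.list.snapshots(). Upvalue 1 is the
+ * dataset's object number and upvalue 2 is the snapshot-list cursor; each
+ * call returns the full name of the next snapshot, or nothing once the list
+ * is exhausted.
+ */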
+static int
+zcp_snapshots_iter(lua_State *state)
+{
+ int err;
+ char snapname[ZFS_MAX_DATASET_NAME_LEN];
+ uint64_t dsobj = lua_tonumber(state, lua_upvalueindex(1));
+ uint64_t cursor = lua_tonumber(state, lua_upvalueindex(2));
+ dsl_pool_t *dp = zcp_run_info(state)->zri_pool;
+ dsl_dataset_t *ds;
+ objset_t *os;
+ char *p;
+
+ err = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds);
+ if (err != 0) {
+ return (luaL_error(state,
+ "unexpected error %d from dsl_dataset_hold_obj(dsobj)",
+ err));
+ }
+
+ dsl_dataset_name(ds, snapname);
+ VERIFY3U(sizeof (snapname), >,
+ strlcat(snapname, "@", sizeof (snapname)));
+
+ p = strchr(snapname, '\0');
+ VERIFY0(dmu_objset_from_ds(ds, &os));
+ err = dmu_snapshot_list_next(os,
+ sizeof (snapname) - (p - snapname), p, NULL, &cursor, NULL);
+ dsl_dataset_rele(ds, FTAG);
+
+ if (err == ENOENT) {
+ return (0);
+ } else if (err != 0) {
+ return (luaL_error(state,
+ "unexpected error %d from dmu_snapshot_list_next()", err));
+ }
+
+ lua_pushnumber(state, cursor);
+ lua_replace(state, lua_upvalueindex(2));
+
+ (void) lua_pushstring(state, snapname);
+ return (1);
+}
+
+static int zcp_snapshots_list(lua_State *);
+static zcp_list_info_t zcp_snapshots_list_info = {
+ .name = "snapshots",
+ .func = zcp_snapshots_list,
+ .gc = NULL,
+ .pargs = {
+ { .za_name = "filesystem | volume", .za_lua_type = LUA_TSTRING},
+ {NULL, 0}
+ },
+ .kwargs = {
+ {NULL, 0}
+ }
+};
+
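+/*
+ * Implementation of zfs.list.snapshots(dataset): returns an iterator over
+ * the snapshots of the given filesystem or volume, e.g.:
+ *
+ * for snap in zfs.list.snapshots("rpool/fs") do ... end
+ */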
+static int
+zcp_snapshots_list(lua_State *state)
+{
+ const char *fsname = lua_tostring(state, 1);
+ dsl_pool_t *dp = zcp_run_info(state)->zri_pool;
+ boolean_t issnap;
+ uint64_t dsobj;
+
+ dsl_dataset_t *ds = zcp_dataset_hold(state, dp, fsname, FTAG);
+ if (ds == NULL)
+ return (1); /* not reached; zcp_dataset_hold() longjmp'd */
+ issnap = ds->ds_is_snapshot;
+ dsobj = ds->ds_object;
+ dsl_dataset_rele(ds, FTAG);
+
+ if (issnap) {
+ return (zcp_argerror(state, 1,
+ "argument %s cannot be a snapshot", fsname));
+ }
+
+ lua_pushnumber(state, dsobj);
+ lua_pushnumber(state, 0);
+ lua_pushcclosure(state, &zcp_snapshots_iter, 2);
+ return (1);
+}
+
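+/*
+ * Iterator function returned by zfs.list.children(). Upvalue 1 is the
+ * parent's object number and upvalue 2 is the directory cursor; datasets
+ * whose names are hidden (zfs_dataset_name_hidden()) are skipped. Each call
+ * returns the full name of the next child, or nothing once exhausted.
+ */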
+static int
+zcp_children_iter(lua_State *state)
+{
+ int err;
+ char childname[ZFS_MAX_DATASET_NAME_LEN];
+ uint64_t dsobj = lua_tonumber(state, lua_upvalueindex(1));
+ uint64_t cursor = lua_tonumber(state, lua_upvalueindex(2));
+ zcp_run_info_t *ri = zcp_run_info(state);
+ dsl_pool_t *dp = ri->zri_pool;
+ dsl_dataset_t *ds;
+ objset_t *os;
+ char *p;
+
+ err = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds);
+ if (err != 0) {
+ return (luaL_error(state,
+ "unexpected error %d from dsl_dataset_hold_obj(dsobj)",
+ err));
+ }
+
+ dsl_dataset_name(ds, childname);
+ VERIFY3U(sizeof (childname), >,
+ strlcat(childname, "/", sizeof (childname)));
+ p = strchr(childname, '\0');
+
+ VERIFY0(dmu_objset_from_ds(ds, &os));
+ do {
+ err = dmu_dir_list_next(os,
+ sizeof (childname) - (p - childname), p, NULL, &cursor);
+ } while (err == 0 && zfs_dataset_name_hidden(childname));
+ dsl_dataset_rele(ds, FTAG);
+
+ if (err == ENOENT) {
+ return (0);
+ } else if (err != 0) {
+ return (luaL_error(state,
+ "unexpected error %d from dmu_dir_list_next()",
+ err));
+ }
+
+ lua_pushnumber(state, cursor);
+ lua_replace(state, lua_upvalueindex(2));
+
+ (void) lua_pushstring(state, childname);
+ return (1);
+}
+
+static int zcp_children_list(lua_State *);
+static zcp_list_info_t zcp_children_list_info = {
+ .name = "children",
+ .func = zcp_children_list,
+ .gc = NULL,
+ .pargs = {
+ { .za_name = "filesystem | volume", .za_lua_type = LUA_TSTRING},
+ {NULL, 0}
+ },
+ .kwargs = {
+ {NULL, 0}
+ }
+};
+
+static int
+zcp_children_list(lua_State *state)
+{
+ const char *fsname = lua_tostring(state, 1);
+ dsl_pool_t *dp = zcp_run_info(state)->zri_pool;
+ boolean_t issnap;
+ uint64_t dsobj;
+
+ dsl_dataset_t *ds = zcp_dataset_hold(state, dp, fsname, FTAG);
+ if (ds == NULL)
+ return (1); /* not reached; zcp_dataset_hold() longjmp'd */
+
+ issnap = ds->ds_is_snapshot;
+ dsobj = ds->ds_object;
+ dsl_dataset_rele(ds, FTAG);
+
+ if (issnap) {
+ return (zcp_argerror(state, 1,
+ "argument %s cannot be a snapshot", fsname));
+ }
+
+ lua_pushnumber(state, dsobj);
+ lua_pushnumber(state, 0);
+ lua_pushcclosure(state, &zcp_children_iter, 2);
+ return (1);
+}
+
+static int
+zcp_user_props_list_gc(lua_State *state)
+{
+ nvlist_t **props = lua_touserdata(state, 1);
+ if (*props != NULL)
+ fnvlist_free(*props);
+ return (0);
+}
+
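+/*
+ * Iterator closure for zfs.list.user_properties(dataset). Upvalue 1 is a
+ * userdata holding the nvlist of all properties for the dataset; upvalue 2 is
+ * the last nvpair visited. Non-user properties are skipped, and each call
+ * returns a (property name, value, source) triple. The nvlist is freed when
+ * iteration completes, or by the __gc handler above if the loop is abandoned
+ * early.
+ */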
+static int
+zcp_user_props_iter(lua_State *state)
+{
+ char *source, *val;
+ nvlist_t *nvprop;
+ nvlist_t **props = lua_touserdata(state, lua_upvalueindex(1));
+ nvpair_t *pair = lua_touserdata(state, lua_upvalueindex(2));
+
+ do {
+ pair = nvlist_next_nvpair(*props, pair);
+ if (pair == NULL) {
+ fnvlist_free(*props);
+ *props = NULL;
+ return (0);
+ }
+ } while (!zfs_prop_user(nvpair_name(pair)));
+
+ lua_pushlightuserdata(state, pair);
+ lua_replace(state, lua_upvalueindex(2));
+
+ nvprop = fnvpair_value_nvlist(pair);
+ val = fnvlist_lookup_string(nvprop, ZPROP_VALUE);
+ source = fnvlist_lookup_string(nvprop, ZPROP_SOURCE);
+
+ (void) lua_pushstring(state, nvpair_name(pair));
+ (void) lua_pushstring(state, val);
+ (void) lua_pushstring(state, source);
+ return (3);
+}
+
+static int zcp_user_props_list(lua_State *);
+static zcp_list_info_t zcp_user_props_list_info = {
+ .name = "user_properties",
+ .func = zcp_user_props_list,
+ .gc = zcp_user_props_list_gc,
+ .pargs = {
+ { .za_name = "filesystem | snapshot | volume",
+ .za_lua_type = LUA_TSTRING},
+ {NULL, 0}
+ },
+ .kwargs = {
+ {NULL, 0}
+ }
+};
+
+/*
+ * 'properties' was the initial name for 'user_properties' seen
+ * above. 'user_properties' is a better name as it distinguishes
+ * these properties from 'system_properties' which are different.
+ * In order to avoid breaking compatibility between different
+ * versions of ZFS, we declare 'properties' as an alias for
+ * 'user_properties'.
+ */
+static zcp_list_info_t zcp_props_list_info = {
+ .name = "properties",
+ .func = zcp_user_props_list,
+ .gc = zcp_user_props_list_gc,
+ .pargs = {
+ { .za_name = "filesystem | snapshot | volume",
+ .za_lua_type = LUA_TSTRING},
+ {NULL, 0}
+ },
+ .kwargs = {
+ {NULL, 0}
+ }
+};
+
+static int
+zcp_user_props_list(lua_State *state)
+{
+ const char *dsname = lua_tostring(state, 1);
+ dsl_pool_t *dp = zcp_run_info(state)->zri_pool;
+ objset_t *os;
+ nvlist_t **props = lua_newuserdata(state, sizeof (nvlist_t *));
+
+ dsl_dataset_t *ds = zcp_dataset_hold(state, dp, dsname, FTAG);
+ if (ds == NULL)
+ return (1); /* not reached; zcp_dataset_hold() longjmp'd */
+ VERIFY0(dmu_objset_from_ds(ds, &os));
+ VERIFY0(dsl_prop_get_all(os, props));
+ dsl_dataset_rele(ds, FTAG);
+
+ /*
+ * Set the metatable for the properties list to free it on
+ * completion.
+ */
+ luaL_getmetatable(state, zcp_user_props_list_info.name);
+ (void) lua_setmetatable(state, -2);
+
+ lua_pushlightuserdata(state, NULL);
+ lua_pushcclosure(state, &zcp_user_props_iter, 2);
+ return (1);
+}
+
+
+/*
+ * Populate nv with all valid system properties and their values for the given
+ * dataset.
+ */
+static void
+zcp_dataset_system_props(dsl_dataset_t *ds, nvlist_t *nv)
+{
+ for (int prop = ZFS_PROP_TYPE; prop < ZFS_NUM_PROPS; prop++) {
+ /* Do not display hidden props */
+ if (!zfs_prop_visible(prop))
+ continue;
+ /* Do not display props not valid for this dataset */
+ if (!prop_valid_for_ds(ds, prop))
+ continue;
+ fnvlist_add_boolean(nv, zfs_prop_to_name(prop));
+ }
+}
+
+static int zcp_system_props_list(lua_State *);
+static zcp_list_info_t zcp_system_props_list_info = {
+ .name = "system_properties",
+ .func = zcp_system_props_list,
+ .pargs = {
+ { .za_name = "dataset", .za_lua_type = LUA_TSTRING},
+ {NULL, 0}
+ },
+ .kwargs = {
+ {NULL, 0}
+ }
+};
+
+/*
+ * Get a list of all visible system properties and their values for a given
+ * dataset. Returned on the stack as a Lua table.
+ */
+static int
+zcp_system_props_list(lua_State *state)
+{
+ int error;
+ char errbuf[128];
+ const char *dataset_name;
+ dsl_pool_t *dp = zcp_run_info(state)->zri_pool;
+ zcp_list_info_t *libinfo = &zcp_system_props_list_info;
+ zcp_parse_args(state, libinfo->name, libinfo->pargs, libinfo->kwargs);
+ dataset_name = lua_tostring(state, 1);
+ nvlist_t *nv = fnvlist_alloc();
+
+ dsl_dataset_t *ds = zcp_dataset_hold(state, dp, dataset_name, FTAG);
+ if (ds == NULL)
+ return (1); /* not reached; zcp_dataset_hold() longjmp'd */
+
+ /* Get the names of all valid system properties for this dataset */
+ zcp_dataset_system_props(ds, nv);
+ dsl_dataset_rele(ds, FTAG);
+
+ /* push list as lua table */
+ error = zcp_nvlist_to_lua(state, nv, errbuf, sizeof (errbuf));
+ nvlist_free(nv);
+ if (error != 0) {
+ return (luaL_error(state,
+ "Error returning nvlist: %s", errbuf));
+ }
+ return (1);
+}
+
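+/*
+ * Iterator closure for zfs.list.bookmarks(dataset). The upvalues hold the
+ * dataset's object number and a serialized ZAP cursor into its bookmarks ZAP
+ * object. Each call retrieves the next bookmark, re-serializes the cursor,
+ * and returns the full "pool/fs#bookmark" name.
+ */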
+static int
+zcp_bookmarks_iter(lua_State *state)
+{
+ char ds_name[ZFS_MAX_DATASET_NAME_LEN];
+ char bookmark_name[ZFS_MAX_DATASET_NAME_LEN];
+ uint64_t dsobj = lua_tonumber(state, lua_upvalueindex(1));
+ uint64_t cursor = lua_tonumber(state, lua_upvalueindex(2));
+ dsl_pool_t *dp = zcp_run_info(state)->zri_pool;
+ dsl_dataset_t *ds;
+ zap_attribute_t za;
+ zap_cursor_t zc;
+
+ int err = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds);
+ if (err == ENOENT) {
+ return (0);
+ } else if (err != 0) {
+ return (luaL_error(state,
+ "unexpected error %d from dsl_dataset_hold_obj(dsobj)",
+ err));
+ }
+
+ if (!dsl_dataset_is_zapified(ds)) {
+ dsl_dataset_rele(ds, FTAG);
+ return (0);
+ }
+
+ err = zap_lookup(dp->dp_meta_objset, ds->ds_object,
+ DS_FIELD_BOOKMARK_NAMES, sizeof (ds->ds_bookmarks_obj), 1,
+ &ds->ds_bookmarks_obj);
+ if (err != 0 && err != ENOENT) {
+ dsl_dataset_rele(ds, FTAG);
+ return (luaL_error(state,
+ "unexpected error %d from zap_lookup()", err));
+ }
+ if (ds->ds_bookmarks_obj == 0) {
+ dsl_dataset_rele(ds, FTAG);
+ return (0);
+ }
+
+ /* Store the dataset's name so we can append the bookmark's name */
+ dsl_dataset_name(ds, ds_name);
+
+ zap_cursor_init_serialized(&zc, ds->ds_dir->dd_pool->dp_meta_objset,
+ ds->ds_bookmarks_obj, cursor);
+ dsl_dataset_rele(ds, FTAG);
+
+ err = zap_cursor_retrieve(&zc, &za);
+ if (err != 0) {
+ zap_cursor_fini(&zc);
+ if (err != ENOENT) {
+ return (luaL_error(state,
+ "unexpected error %d from zap_cursor_retrieve()",
+ err));
+ }
+ return (0);
+ }
+ zap_cursor_advance(&zc);
+ cursor = zap_cursor_serialize(&zc);
+ zap_cursor_fini(&zc);
+
+ /* Create the full "pool/fs#bookmark" string to return */
+ int n = snprintf(bookmark_name, ZFS_MAX_DATASET_NAME_LEN, "%s#%s",
+ ds_name, za.za_name);
+ if (n >= ZFS_MAX_DATASET_NAME_LEN) {
+ return (luaL_error(state,
+ "unexpected error %d from snprintf()", ENAMETOOLONG));
+ }
+
+ lua_pushnumber(state, cursor);
+ lua_replace(state, lua_upvalueindex(2));
+
+ (void) lua_pushstring(state, bookmark_name);
+ return (1);
+}
+
+static int zcp_bookmarks_list(lua_State *);
+static zcp_list_info_t zcp_bookmarks_list_info = {
+ .name = "bookmarks",
+ .func = zcp_bookmarks_list,
+ .pargs = {
+ { .za_name = "dataset", .za_lua_type = LUA_TSTRING},
+ {NULL, 0}
+ },
+ .kwargs = {
+ {NULL, 0}
+ }
+};
+
+static int
+zcp_bookmarks_list(lua_State *state)
+{
+ const char *dsname = lua_tostring(state, 1);
+ dsl_pool_t *dp = zcp_run_info(state)->zri_pool;
+
+ dsl_dataset_t *ds = zcp_dataset_hold(state, dp, dsname, FTAG);
+ if (ds == NULL)
+ return (1); /* not reached; zcp_dataset_hold() longjmp'd */
+
+ boolean_t issnap = ds->ds_is_snapshot;
+ uint64_t dsobj = ds->ds_object;
+ uint64_t cursor = 0;
+ dsl_dataset_rele(ds, FTAG);
+
+ if (issnap) {
+ return (zcp_argerror(state, 1, "%s is a snapshot", dsname));
+ }
+
+ lua_pushnumber(state, dsobj);
+ lua_pushnumber(state, cursor);
+ lua_pushcclosure(state, &zcp_bookmarks_iter, 2);
+ return (1);
+}
+
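+/*
+ * Iterator closure for zfs.list.holds(snapshot). Walks the snapshot's
+ * user-hold ZAP object (ds_userrefs_obj) using a serialized cursor kept as an
+ * upvalue, returning each hold's tag and creation time.
+ */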
+static int
+zcp_holds_iter(lua_State *state)
+{
+ uint64_t dsobj = lua_tonumber(state, lua_upvalueindex(1));
+ uint64_t cursor = lua_tonumber(state, lua_upvalueindex(2));
+ dsl_pool_t *dp = zcp_run_info(state)->zri_pool;
+ dsl_dataset_t *ds;
+ zap_attribute_t za;
+ zap_cursor_t zc;
+
+ int err = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds);
+ if (err == ENOENT) {
+ return (0);
+ } else if (err != 0) {
+ return (luaL_error(state,
+ "unexpected error %d from dsl_dataset_hold_obj(dsobj)",
+ err));
+ }
+
+ if (dsl_dataset_phys(ds)->ds_userrefs_obj == 0) {
+ dsl_dataset_rele(ds, FTAG);
+ return (0);
+ }
+
+ zap_cursor_init_serialized(&zc, ds->ds_dir->dd_pool->dp_meta_objset,
+ dsl_dataset_phys(ds)->ds_userrefs_obj, cursor);
+ dsl_dataset_rele(ds, FTAG);
+
+ err = zap_cursor_retrieve(&zc, &za);
+ if (err != 0) {
+ zap_cursor_fini(&zc);
+ if (err != ENOENT) {
+ return (luaL_error(state,
+ "unexpected error %d from zap_cursor_retrieve()",
+ err));
+ }
+ return (0);
+ }
+ zap_cursor_advance(&zc);
+ cursor = zap_cursor_serialize(&zc);
+ zap_cursor_fini(&zc);
+
+ lua_pushnumber(state, cursor);
+ lua_replace(state, lua_upvalueindex(2));
+
+ (void) lua_pushstring(state, za.za_name);
+ (void) lua_pushnumber(state, za.za_first_integer);
+ return (2);
+}
+
+static int zcp_holds_list(lua_State *);
+static zcp_list_info_t zcp_holds_list_info = {
+ .name = "holds",
+ .func = zcp_holds_list,
+ .gc = NULL,
+ .pargs = {
+ { .za_name = "snapshot", .za_lua_type = LUA_TSTRING},
+ {NULL, 0}
+ },
+ .kwargs = {
+ {NULL, 0}
+ }
+};
+
+/*
+ * Iterate over all the holds for a given dataset. Each iteration returns
+ * a hold's tag and its timestamp as an integer.
+ */
+static int
+zcp_holds_list(lua_State *state)
+{
+ const char *snapname = lua_tostring(state, 1);
+ dsl_pool_t *dp = zcp_run_info(state)->zri_pool;
+
+ dsl_dataset_t *ds = zcp_dataset_hold(state, dp, snapname, FTAG);
+ if (ds == NULL)
+ return (1); /* not reached; zcp_dataset_hold() longjmp'd */
+
+ boolean_t issnap = ds->ds_is_snapshot;
+ uint64_t dsobj = ds->ds_object;
+ uint64_t cursor = 0;
+ dsl_dataset_rele(ds, FTAG);
+
+ if (!issnap) {
+ return (zcp_argerror(state, 1, "%s is not a snapshot",
+ snapname));
+ }
+
+ lua_pushnumber(state, dsobj);
+ lua_pushnumber(state, cursor);
+ lua_pushcclosure(state, &zcp_holds_iter, 2);
+ return (1);
+}
+
+static int
+zcp_list_func(lua_State *state)
+{
+ zcp_list_info_t *info = lua_touserdata(state, lua_upvalueindex(1));
+
+ zcp_parse_args(state, info->name, info->pargs, info->kwargs);
+
+ return (info->func(state));
+}
+
+int
+zcp_load_list_lib(lua_State *state)
+{
+ int i;
+ zcp_list_info_t *zcp_list_funcs[] = {
+ &zcp_children_list_info,
+ &zcp_snapshots_list_info,
+ &zcp_user_props_list_info,
+ &zcp_props_list_info,
+ &zcp_clones_list_info,
+ &zcp_system_props_list_info,
+ &zcp_bookmarks_list_info,
+ &zcp_holds_list_info,
+ NULL
+ };
+
+ lua_newtable(state);
+
+ for (i = 0; zcp_list_funcs[i] != NULL; i++) {
+ zcp_list_info_t *info = zcp_list_funcs[i];
+
+ if (info->gc != NULL) {
+ /*
+ * If the function requires garbage collection, create
+ * a metatable with its name and register the __gc
+ * function.
+ */
+ (void) luaL_newmetatable(state, info->name);
+ (void) lua_pushstring(state, "__gc");
+ lua_pushcfunction(state, info->gc);
+ lua_settable(state, -3);
+ lua_pop(state, 1);
+ }
+
+ lua_pushlightuserdata(state, info);
+ lua_pushcclosure(state, &zcp_list_func, 1);
+ lua_setfield(state, -2, info->name);
+ info++;
+ }
+
+ return (1);
+}
diff --git a/sys/contrib/openzfs/module/zfs/zcp_set.c b/sys/contrib/openzfs/module/zfs/zcp_set.c
new file mode 100644
index 000000000000..cebb56a5f181
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/zcp_set.c
@@ -0,0 +1,100 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2016 by Delphix. All rights reserved.
+ * Copyright 2020 Joyent, Inc.
+ */
+
+#include <sys/lua/lua.h>
+#include <sys/lua/lualib.h>
+#include <sys/lua/lauxlib.h>
+
+#include <sys/dsl_prop.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_synctask.h>
+#include <sys/dsl_dataset.h>
+#include <sys/zcp.h>
+#include <sys/zcp_set.h>
+#include <sys/zcp_iter.h>
+#include <sys/zcp_global.h>
+#include <sys/zvol.h>
+
+#include <zfs_prop.h>
+
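+/*
+ * Set a single user property on the named dataset in syncing context by
+ * wrapping it in a one-entry nvlist and handing it to
+ * dsl_props_set_sync_impl().
+ */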
+static void
+zcp_set_user_prop(lua_State *state, dsl_pool_t *dp, const char *dsname,
+ const char *prop_name, const char *prop_val, dmu_tx_t *tx)
+{
+ dsl_dataset_t *ds = zcp_dataset_hold(state, dp, dsname, FTAG);
+ if (ds == NULL)
+ return; /* not reached; zcp_dataset_hold() longjmp'd */
+
+ nvlist_t *nvl = fnvlist_alloc();
+ fnvlist_add_string(nvl, prop_name, prop_val);
+
+ dsl_props_set_sync_impl(ds, ZPROP_SRC_LOCAL, nvl, tx);
+
+ fnvlist_free(nvl);
+ dsl_dataset_rele(ds, FTAG);
+}
+
+int
+zcp_set_prop_check(void *arg, dmu_tx_t *tx)
+{
+ zcp_set_prop_arg_t *args = arg;
+ const char *prop_name = args->prop;
+ dsl_props_set_arg_t dpsa = {
+ .dpsa_dsname = args->dsname,
+ .dpsa_source = ZPROP_SRC_LOCAL,
+ };
+ nvlist_t *nvl = NULL;
+ int ret = 0;
+
+ /*
+ * Only user properties are currently supported. When non-user
+ * properties are supported, we will want to use
+ * zfs_valid_proplist() to verify the properties.
+ */
+ if (!zfs_prop_user(prop_name)) {
+ return (EINVAL);
+ }
+
+ nvl = fnvlist_alloc();
+ fnvlist_add_string(nvl, args->prop, args->val);
+ dpsa.dpsa_props = nvl;
+
+ ret = dsl_props_set_check(&dpsa, tx);
+ nvlist_free(nvl);
+
+ return (ret);
+}
+
+void
+zcp_set_prop_sync(void *arg, dmu_tx_t *tx)
+{
+ zcp_set_prop_arg_t *args = arg;
+ zcp_run_info_t *ri = zcp_run_info(args->state);
+ dsl_pool_t *dp = ri->zri_pool;
+
+ const char *dsname = args->dsname;
+ const char *prop_name = args->prop;
+ const char *prop_val = args->val;
+
+ if (zfs_prop_user(prop_name)) {
+ zcp_set_user_prop(args->state, dp, dsname, prop_name,
+ prop_val, tx);
+ }
+}
diff --git a/sys/contrib/openzfs/module/zfs/zcp_synctask.c b/sys/contrib/openzfs/module/zfs/zcp_synctask.c
new file mode 100644
index 000000000000..4e0fa0d85cbf
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/zcp_synctask.c
@@ -0,0 +1,544 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2016, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2019, 2020 by Christian Schwarz. All rights reserved.
+ * Copyright 2020 Joyent, Inc.
+ */
+
+#include <sys/lua/lua.h>
+#include <sys/lua/lauxlib.h>
+
+#include <sys/zcp.h>
+#include <sys/zcp_set.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_prop.h>
+#include <sys/dsl_synctask.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_bookmark.h>
+#include <sys/dsl_destroy.h>
+#include <sys/dmu_objset.h>
+#include <sys/zfs_znode.h>
+#include <sys/zfeature.h>
+#include <sys/metaslab.h>
+
+#define DST_AVG_BLKSHIFT 14
+
+typedef struct zcp_inherit_prop_arg {
+ lua_State *zipa_state;
+ const char *zipa_prop;
+ dsl_props_set_arg_t zipa_dpsa;
+} zcp_inherit_prop_arg_t;
+
+typedef int (zcp_synctask_func_t)(lua_State *, boolean_t, nvlist_t *);
+typedef struct zcp_synctask_info {
+ const char *name;
+ zcp_synctask_func_t *func;
+ const zcp_arg_t pargs[4];
+ const zcp_arg_t kwargs[2];
+ zfs_space_check_t space_check;
+ int blocks_modified;
+} zcp_synctask_info_t;
+
+/*
+ * Generic synctask interface for channel program syncfuncs.
+ *
+ * To perform some action in syncing context, we'd generally call
+ * dsl_sync_task(), but since the Lua script is already running inside a
+ * synctask we need to leave out some actions (such as acquiring the config
+ * rwlock and performing space checks).
+ *
+ * If 'sync' is false, executes a dry run and returns the error code.
+ *
+ * If we are not running in syncing context and we are not doing a dry run
+ * (meaning we are running a zfs.sync function in open-context) then we
+ * return a Lua error.
+ *
+ * This function also handles common fatal error cases for channel program
+ * library functions. If a fatal error occurs, err_dsname will be the dataset
+ * name reported in error messages, if supplied.
+ */
+static int
+zcp_sync_task(lua_State *state, dsl_checkfunc_t *checkfunc,
+ dsl_syncfunc_t *syncfunc, void *arg, boolean_t sync, const char *err_dsname)
+{
+ int err;
+ zcp_run_info_t *ri = zcp_run_info(state);
+
+ err = checkfunc(arg, ri->zri_tx);
+ if (!sync)
+ return (err);
+
+ if (!ri->zri_sync) {
+ return (luaL_error(state, "running functions from the zfs.sync "
+ "submodule requires passing sync=TRUE to "
+ "lzc_channel_program() (i.e. do not specify the \"-n\" "
+ "command line argument)"));
+ }
+
+ if (err == 0) {
+ syncfunc(arg, ri->zri_tx);
+ } else if (err == EIO) {
+ if (err_dsname != NULL) {
+ return (luaL_error(state,
+ "I/O error while accessing dataset '%s'",
+ err_dsname));
+ } else {
+ return (luaL_error(state,
+ "I/O error while accessing dataset."));
+ }
+ }
+
+ return (err);
+}
+
+
+static int zcp_synctask_destroy(lua_State *, boolean_t, nvlist_t *);
+static zcp_synctask_info_t zcp_synctask_destroy_info = {
+ .name = "destroy",
+ .func = zcp_synctask_destroy,
+ .pargs = {
+ {.za_name = "filesystem | snapshot", .za_lua_type = LUA_TSTRING},
+ {NULL, 0}
+ },
+ .kwargs = {
+ {.za_name = "defer", .za_lua_type = LUA_TBOOLEAN},
+ {NULL, 0}
+ },
+ .space_check = ZFS_SPACE_CHECK_DESTROY,
+ .blocks_modified = 0
+};
+
+/* ARGSUSED */
+static int
+zcp_synctask_destroy(lua_State *state, boolean_t sync, nvlist_t *err_details)
+{
+ int err;
+ const char *dsname = lua_tostring(state, 1);
+
+ boolean_t issnap = (strchr(dsname, '@') != NULL);
+
+ if (!issnap && !lua_isnil(state, 2)) {
+ return (luaL_error(state,
+ "'deferred' kwarg only supported for snapshots: %s",
+ dsname));
+ }
+
+ if (issnap) {
+ dsl_destroy_snapshot_arg_t ddsa = { 0 };
+ ddsa.ddsa_name = dsname;
+ if (!lua_isnil(state, 2)) {
+ ddsa.ddsa_defer = lua_toboolean(state, 2);
+ } else {
+ ddsa.ddsa_defer = B_FALSE;
+ }
+
+ err = zcp_sync_task(state, dsl_destroy_snapshot_check,
+ dsl_destroy_snapshot_sync, &ddsa, sync, dsname);
+ } else {
+ dsl_destroy_head_arg_t ddha = { 0 };
+ ddha.ddha_name = dsname;
+
+ err = zcp_sync_task(state, dsl_destroy_head_check,
+ dsl_destroy_head_sync, &ddha, sync, dsname);
+ }
+
+ return (err);
+}
+
+static int zcp_synctask_promote(lua_State *, boolean_t, nvlist_t *);
+static zcp_synctask_info_t zcp_synctask_promote_info = {
+ .name = "promote",
+ .func = zcp_synctask_promote,
+ .pargs = {
+ {.za_name = "clone", .za_lua_type = LUA_TSTRING},
+ {NULL, 0}
+ },
+ .kwargs = {
+ {NULL, 0}
+ },
+ .space_check = ZFS_SPACE_CHECK_RESERVED,
+ .blocks_modified = 3
+};
+
+static int
+zcp_synctask_promote(lua_State *state, boolean_t sync, nvlist_t *err_details)
+{
+ int err;
+ dsl_dataset_promote_arg_t ddpa = { 0 };
+ const char *dsname = lua_tostring(state, 1);
+ zcp_run_info_t *ri = zcp_run_info(state);
+
+ ddpa.ddpa_clonename = dsname;
+ ddpa.err_ds = err_details;
+ ddpa.cr = ri->zri_cred;
+ ddpa.proc = ri->zri_proc;
+
+ /*
+ * If there was a snapshot name conflict, then err_ds will be filled
+ * with a list of conflicting snapshot names.
+ */
+ err = zcp_sync_task(state, dsl_dataset_promote_check,
+ dsl_dataset_promote_sync, &ddpa, sync, dsname);
+
+ return (err);
+}
+
+static int zcp_synctask_rollback(lua_State *, boolean_t, nvlist_t *err_details);
+static zcp_synctask_info_t zcp_synctask_rollback_info = {
+ .name = "rollback",
+ .func = zcp_synctask_rollback,
+ .space_check = ZFS_SPACE_CHECK_RESERVED,
+ .blocks_modified = 1,
+ .pargs = {
+ {.za_name = "filesystem", .za_lua_type = LUA_TSTRING},
+ {0, 0}
+ },
+ .kwargs = {
+ {0, 0}
+ }
+};
+
+static int
+zcp_synctask_rollback(lua_State *state, boolean_t sync, nvlist_t *err_details)
+{
+ int err;
+ const char *dsname = lua_tostring(state, 1);
+ dsl_dataset_rollback_arg_t ddra = { 0 };
+
+ ddra.ddra_fsname = dsname;
+ ddra.ddra_result = err_details;
+
+ err = zcp_sync_task(state, dsl_dataset_rollback_check,
+ dsl_dataset_rollback_sync, &ddra, sync, dsname);
+
+ return (err);
+}
+
+static int zcp_synctask_snapshot(lua_State *, boolean_t, nvlist_t *);
+static zcp_synctask_info_t zcp_synctask_snapshot_info = {
+ .name = "snapshot",
+ .func = zcp_synctask_snapshot,
+ .pargs = {
+ {.za_name = "filesystem@snapname | volume@snapname",
+ .za_lua_type = LUA_TSTRING},
+ {NULL, 0}
+ },
+ .kwargs = {
+ {NULL, 0}
+ },
+ .space_check = ZFS_SPACE_CHECK_NORMAL,
+ .blocks_modified = 3
+};
+
+/* ARGSUSED */
+static int
+zcp_synctask_snapshot(lua_State *state, boolean_t sync, nvlist_t *err_details)
+{
+ int err;
+ dsl_dataset_snapshot_arg_t ddsa = { 0 };
+ const char *dsname = lua_tostring(state, 1);
+ zcp_run_info_t *ri = zcp_run_info(state);
+
+ /*
+ * On old pools, the ZIL must not be active when a snapshot is created,
+ * but we can't suspend the ZIL because we're already in syncing
+ * context.
+ */
+ if (spa_version(ri->zri_pool->dp_spa) < SPA_VERSION_FAST_SNAP) {
+ return (SET_ERROR(ENOTSUP));
+ }
+
+ /*
+ * We only allow for a single snapshot rather than a list, so the
+ * error list output is unnecessary.
+ */
+ ddsa.ddsa_errors = NULL;
+ ddsa.ddsa_props = NULL;
+ ddsa.ddsa_cr = ri->zri_cred;
+ ddsa.ddsa_proc = ri->zri_proc;
+ ddsa.ddsa_snaps = fnvlist_alloc();
+ fnvlist_add_boolean(ddsa.ddsa_snaps, dsname);
+
+ zcp_cleanup_handler_t *zch = zcp_register_cleanup(state,
+ (zcp_cleanup_t *)&fnvlist_free, ddsa.ddsa_snaps);
+
+ err = zcp_sync_task(state, dsl_dataset_snapshot_check,
+ dsl_dataset_snapshot_sync, &ddsa, sync, dsname);
+
+ if (err == 0) {
+ /*
+ * We may need to create a new device minor node for this
+ * dataset (if it is a zvol and the "snapdev" property is set).
+ * Save it in the nvlist so that it can be processed in open
+ * context.
+ */
+ fnvlist_add_boolean(ri->zri_new_zvols, dsname);
+ }
+
+ zcp_deregister_cleanup(state, zch);
+ fnvlist_free(ddsa.ddsa_snaps);
+
+ return (err);
+}
+
+static int zcp_synctask_inherit_prop(lua_State *, boolean_t,
+ nvlist_t *err_details);
+static zcp_synctask_info_t zcp_synctask_inherit_prop_info = {
+ .name = "inherit",
+ .func = zcp_synctask_inherit_prop,
+ .space_check = ZFS_SPACE_CHECK_RESERVED,
+ .blocks_modified = 2, /* 2 * numprops */
+ .pargs = {
+ { .za_name = "dataset", .za_lua_type = LUA_TSTRING },
+ { .za_name = "property", .za_lua_type = LUA_TSTRING },
+ { NULL, 0 }
+ },
+ .kwargs = {
+ { NULL, 0 }
+ },
+};
+
+static int
+zcp_synctask_inherit_prop_check(void *arg, dmu_tx_t *tx)
+{
+ zcp_inherit_prop_arg_t *args = arg;
+ zfs_prop_t prop = zfs_name_to_prop(args->zipa_prop);
+
+ if (prop == ZPROP_INVAL) {
+ if (zfs_prop_user(args->zipa_prop))
+ return (0);
+
+ return (EINVAL);
+ }
+
+ if (zfs_prop_readonly(prop))
+ return (EINVAL);
+
+ if (!zfs_prop_inheritable(prop))
+ return (EINVAL);
+
+ return (dsl_props_set_check(&args->zipa_dpsa, tx));
+}
+
+static void
+zcp_synctask_inherit_prop_sync(void *arg, dmu_tx_t *tx)
+{
+ zcp_inherit_prop_arg_t *args = arg;
+ dsl_props_set_arg_t *dpsa = &args->zipa_dpsa;
+
+ dsl_props_set_sync(dpsa, tx);
+}
+
+static int
+zcp_synctask_inherit_prop(lua_State *state, boolean_t sync,
+ nvlist_t *err_details)
+{
+ int err;
+ zcp_inherit_prop_arg_t zipa = { 0 };
+ dsl_props_set_arg_t *dpsa = &zipa.zipa_dpsa;
+
+ const char *dsname = lua_tostring(state, 1);
+ const char *prop = lua_tostring(state, 2);
+
+ zipa.zipa_state = state;
+ zipa.zipa_prop = prop;
+ dpsa->dpsa_dsname = dsname;
+ dpsa->dpsa_source = ZPROP_SRC_INHERITED;
+ dpsa->dpsa_props = fnvlist_alloc();
+ fnvlist_add_boolean(dpsa->dpsa_props, prop);
+
+ zcp_cleanup_handler_t *zch = zcp_register_cleanup(state,
+ (zcp_cleanup_t *)&fnvlist_free, dpsa->dpsa_props);
+
+ err = zcp_sync_task(state, zcp_synctask_inherit_prop_check,
+ zcp_synctask_inherit_prop_sync, &zipa, sync, dsname);
+
+ zcp_deregister_cleanup(state, zch);
+ fnvlist_free(dpsa->dpsa_props);
+
+ return (err);
+}
+
+static int zcp_synctask_bookmark(lua_State *, boolean_t, nvlist_t *);
+static zcp_synctask_info_t zcp_synctask_bookmark_info = {
+ .name = "bookmark",
+ .func = zcp_synctask_bookmark,
+ .pargs = {
+ {.za_name = "snapshot | bookmark", .za_lua_type = LUA_TSTRING},
+ {.za_name = "bookmark", .za_lua_type = LUA_TSTRING},
+ {NULL, 0}
+ },
+ .kwargs = {
+ {NULL, 0}
+ },
+ .space_check = ZFS_SPACE_CHECK_NORMAL,
+ .blocks_modified = 1,
+};
+
+/* ARGSUSED */
+static int
+zcp_synctask_bookmark(lua_State *state, boolean_t sync, nvlist_t *err_details)
+{
+ int err;
+ const char *source = lua_tostring(state, 1);
+ const char *new = lua_tostring(state, 2);
+
+ nvlist_t *bmarks = fnvlist_alloc();
+ fnvlist_add_string(bmarks, new, source);
+
+ zcp_cleanup_handler_t *zch = zcp_register_cleanup(state,
+ (zcp_cleanup_t *)&fnvlist_free, bmarks);
+
+ dsl_bookmark_create_arg_t dbca = {
+ .dbca_bmarks = bmarks,
+ .dbca_errors = NULL,
+ };
+ err = zcp_sync_task(state, dsl_bookmark_create_check,
+ dsl_bookmark_create_sync, &dbca, sync, source);
+
+ zcp_deregister_cleanup(state, zch);
+ fnvlist_free(bmarks);
+
+ return (err);
+}
+
+static int zcp_synctask_set_prop(lua_State *, boolean_t, nvlist_t *err_details);
+static zcp_synctask_info_t zcp_synctask_set_prop_info = {
+ .name = "set_prop",
+ .func = zcp_synctask_set_prop,
+ .space_check = ZFS_SPACE_CHECK_RESERVED,
+ .blocks_modified = 2,
+ .pargs = {
+ { .za_name = "dataset", .za_lua_type = LUA_TSTRING},
+ { .za_name = "property", .za_lua_type = LUA_TSTRING},
+ { .za_name = "value", .za_lua_type = LUA_TSTRING},
+ { NULL, 0 }
+ },
+ .kwargs = {
+ { NULL, 0 }
+ }
+};
+
+static int
+zcp_synctask_set_prop(lua_State *state, boolean_t sync, nvlist_t *err_details)
+{
+ int err;
+ zcp_set_prop_arg_t args = { 0 };
+
+ const char *dsname = lua_tostring(state, 1);
+ const char *prop = lua_tostring(state, 2);
+ const char *val = lua_tostring(state, 3);
+
+ args.state = state;
+ args.dsname = dsname;
+ args.prop = prop;
+ args.val = val;
+
+ err = zcp_sync_task(state, zcp_set_prop_check, zcp_set_prop_sync,
+ &args, sync, dsname);
+
+ return (err);
+}
+
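+/*
+ * Common Lua entry point for all synctask functions. Upvalue 1 is the
+ * zcp_synctask_info_t describing the function; upvalue 2 is the boolean
+ * 'sync' flag passed to zcp_load_synctask_lib(), which distinguishes the
+ * syncing submodule from the dry-run one. The wrapper parses the arguments,
+ * checks the estimated space consumption against the pool's remaining
+ * allowance for this program, invokes the function, and returns the error
+ * code plus (if present) a table of error details.
+ */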
+static int
+zcp_synctask_wrapper(lua_State *state)
+{
+ int err;
+ zcp_cleanup_handler_t *zch;
+ int num_ret = 1;
+ nvlist_t *err_details = fnvlist_alloc();
+
+ /*
+ * Make sure err_details is properly freed, even if a fatal error is
+ * thrown during the synctask.
+ */
+ zch = zcp_register_cleanup(state,
+ (zcp_cleanup_t *)&fnvlist_free, err_details);
+
+ zcp_synctask_info_t *info = lua_touserdata(state, lua_upvalueindex(1));
+ boolean_t sync = lua_toboolean(state, lua_upvalueindex(2));
+
+ zcp_run_info_t *ri = zcp_run_info(state);
+ dsl_pool_t *dp = ri->zri_pool;
+
+ /* MOS space is triple-dittoed, so we multiply by 3. */
+ uint64_t funcspace =
+ ((uint64_t)info->blocks_modified << DST_AVG_BLKSHIFT) * 3;
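+ /*
+ * For example, with blocks_modified = 3 the estimate above is
+ * 3 << 14 = 48 KiB, tripled to 144 KiB.
+ */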
+
+ zcp_parse_args(state, info->name, info->pargs, info->kwargs);
+
+ err = 0;
+ if (info->space_check != ZFS_SPACE_CHECK_NONE) {
+ uint64_t quota = dsl_pool_unreserved_space(dp,
+ info->space_check);
+ uint64_t used = dsl_dir_phys(dp->dp_root_dir)->dd_used_bytes +
+ ri->zri_space_used;
+
+ if (used + funcspace > quota) {
+ err = SET_ERROR(ENOSPC);
+ }
+ }
+
+ if (err == 0) {
+ err = info->func(state, sync, err_details);
+ }
+
+ if (err == 0) {
+ ri->zri_space_used += funcspace;
+ }
+
+ lua_pushnumber(state, (lua_Number)err);
+ if (fnvlist_num_pairs(err_details) > 0) {
+ (void) zcp_nvlist_to_lua(state, err_details, NULL, 0);
+ num_ret++;
+ }
+
+ zcp_deregister_cleanup(state, zch);
+ fnvlist_free(err_details);
+
+ return (num_ret);
+}
+
+int
+zcp_load_synctask_lib(lua_State *state, boolean_t sync)
+{
+ int i;
+ zcp_synctask_info_t *zcp_synctask_funcs[] = {
+ &zcp_synctask_destroy_info,
+ &zcp_synctask_promote_info,
+ &zcp_synctask_rollback_info,
+ &zcp_synctask_snapshot_info,
+ &zcp_synctask_inherit_prop_info,
+ &zcp_synctask_bookmark_info,
+ &zcp_synctask_set_prop_info,
+ NULL
+ };
+
+ lua_newtable(state);
+
+ for (i = 0; zcp_synctask_funcs[i] != NULL; i++) {
+ zcp_synctask_info_t *info = zcp_synctask_funcs[i];
+ lua_pushlightuserdata(state, info);
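+/*
+ * Return an iterator over the user properties of the given dataset. For
+ * illustration ("pool/fs" is a placeholder), a channel program typically
+ * consumes it as:
+ *
+ *   for name, value, source in zfs.list.user_properties("pool/fs") do
+ *       ...
+ *   end
+ *
+ * zfs.list.properties() behaves identically via the alias above.
+ */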
+ lua_pushboolean(state, sync);
+ lua_pushcclosure(state, &zcp_synctask_wrapper, 2);
+ lua_setfield(state, -2, info->name);
+ info++;
+ }
+
+ return (1);
+}
diff --git a/sys/contrib/openzfs/module/zfs/zfeature.c b/sys/contrib/openzfs/module/zfs/zfeature.c
new file mode 100644
index 000000000000..9d16fff81d0a
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/zfeature.c
@@ -0,0 +1,526 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/zfeature.h>
+#include <sys/dmu.h>
+#include <sys/nvpair.h>
+#include <sys/zap.h>
+#include <sys/dmu_tx.h>
+#include "zfeature_common.h"
+#include <sys/spa_impl.h>
+
+/*
+ * ZFS Feature Flags
+ * -----------------
+ *
+ * ZFS feature flags are used to provide fine-grained versioning to the ZFS
+ * on-disk format. Once enabled on a pool, feature flags replace the old
+ * spa_version() number.
+ *
+ * Each new on-disk format change will be given a uniquely identifying string
+ * GUID rather than a version number. This avoids the problem of different
+ * organizations creating new on-disk formats with the same version number. To
+ * keep feature GUIDs unique they should consist of the reverse dns name of the
+ * organization which implemented the feature and a short name for the feature,
+ * separated by a colon (e.g. com.delphix:async_destroy).
+ *
+ * Reference Counts
+ * ----------------
+ *
+ * Within each pool features can be in one of three states: disabled, enabled,
+ * or active. These states are differentiated by a reference count stored on
+ * disk for each feature:
+ *
+ * 1) If there is no reference count stored on disk, the feature is disabled.
+ * 2) If the reference count is 0, a system administrator has enabled the
+ * feature, but the feature has not been used yet, so no on-disk
+ * format changes have been made.
+ * 3) If the reference count is greater than 0, the feature is active.
+ * The format changes required by the feature are currently on disk.
+ * Note that if the feature's format changes are reversed the feature
+ * may choose to set its reference count back to 0.
+ *
+ * Feature flags make no differentiation between non-zero reference counts
+ * for an active feature (e.g. a reference count of 1 means the same thing as a
+ * reference count of 27834721), but feature implementations may choose to use
+ * the reference count to store meaningful information. For example, a new RAID
+ * implementation might set the reference count to the number of vdevs using
+ * it. If all those disks are removed from the pool the feature goes back to
+ * having a reference count of 0.
+ *
+ * It is the responsibility of the individual features to maintain a non-zero
+ * reference count as long as the feature's format changes are present on disk.
+ *
+ * Dependencies
+ * ------------
+ *
+ * Each feature may depend on other features. The only effect of this
+ * relationship is that when a feature is enabled all of its dependencies are
+ * automatically enabled as well. Any future work to support disabling of
+ * features would need to ensure that features cannot be disabled if other
+ * enabled features depend on them.
+ *
+ * On-disk Format
+ * --------------
+ *
+ * When feature flags are enabled spa_version() is set to SPA_VERSION_FEATURES
+ * (5000). In order for this to work, the pool is automatically upgraded to
+ * SPA_VERSION_BEFORE_FEATURES (28) first, so that all on-disk format
+ * changes that predate feature flags will be in use.
+ *
+ * Information about features is stored in 3 ZAP objects in the pool's MOS.
+ * These objects are linked to by the following names in the pool directory
+ * object:
+ *
+ * 1) features_for_read: feature GUID -> reference count
+ * Features needed to open the pool for reading.
+ * 2) features_for_write: feature GUID -> reference count
+ * Features needed to open the pool for writing.
+ * 3) feature_descriptions: feature GUID -> descriptive string
+ * A human readable string.
+ *
+ * All enabled features appear in either features_for_read or
+ * features_for_write, but not both.
+ *
+ * To open a pool in read-only mode only the features listed in
+ * features_for_read need to be supported.
+ *
+ * To open the pool in read-write mode features in both features_for_read and
+ * features_for_write need to be supported.
+ *
+ * Some features may be required to read the ZAP objects containing feature
+ * information. To allow software to check for compatibility with these features
+ * before the pool is opened, their names must be stored in the label in a
+ * new "features_for_read" entry (note that features that are only required
+ * to write to a pool never need to be stored in the label since the
+ * features_for_write ZAP object can be read before the pool is written to).
+ * To save space in the label, features must be explicitly marked as needing to
+ * be written to the label. Also, reference counts are not stored in the label;
+ * instead, any feature whose reference count drops to 0 is removed from the
+ * label.
+ *
+ * Adding New Features
+ * -------------------
+ *
+ * Features must be registered in zpool_feature_init() function in
+ * zfeature_common.c using the zfeature_register() function. This function
+ * has arguments to specify if the feature should be stored in the
+ * features_for_read or features_for_write ZAP object and if it needs to be
+ * written to the label when active.
+ *
+ * Once a feature is registered it will appear as a "feature@<feature name>"
+ * property which can be set by an administrator. Feature implementors should
+ * use the spa_feature_is_enabled() and spa_feature_is_active() functions to
+ * query the state of a feature and the spa_feature_incr() and
+ * spa_feature_decr() functions to change an enabled feature's reference count.
+ * Reference counts may only be updated in the syncing context.
+ *
+ * Features may not perform enable-time initialization. Instead, any such
+ * initialization should occur when the feature is first used. This design
+ * enforces that on-disk changes be made only when features are used. Code
+ * should only check if a feature is enabled using spa_feature_is_enabled(),
+ * not by relying on any feature-specific metadata existing. If a feature is
+ * enabled, but the feature's metadata is not on disk yet, then it should be
+ * created as needed.
+ *
+ * As an example, consider the com.delphix:async_destroy feature. This feature
+ * relies on the existence of a bptree in the MOS that stores blocks for
+ * asynchronous freeing. This bptree is not created when async_destroy is
+ * enabled. Instead, when a dataset is destroyed spa_feature_is_enabled() is
+ * called to check if async_destroy is enabled. If it is and the bptree object
+ * does not exist yet, the bptree object is created as part of the dataset
+ * destroy and async_destroy's reference count is incremented to indicate it
+ * has made an on-disk format change. Later, after the destroyed dataset's
+ * blocks have all been asynchronously freed there is no longer any use for the
+ * bptree object, so it is destroyed and async_destroy's reference count is
+ * decremented back to 0 to indicate that it has undone its on-disk format
+ * changes.
+ */
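+/*
+ * For illustration, the pattern described under "Adding New Features"
+ * typically looks like the following in a feature's syncing-context code
+ * (SPA_FEATURE_FOO, foo_create_metadata() and foo_metadata_is_empty() are
+ * placeholders, not real symbols):
+ *
+ *   if (spa_feature_is_enabled(spa, SPA_FEATURE_FOO) &&
+ *       !spa_feature_is_active(spa, SPA_FEATURE_FOO)) {
+ *           foo_create_metadata(spa, tx);
+ *           spa_feature_incr(spa, SPA_FEATURE_FOO, tx);
+ *   }
+ *   ...
+ *   if (foo_metadata_is_empty(spa))
+ *           spa_feature_decr(spa, SPA_FEATURE_FOO, tx);
+ */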
+
+typedef enum {
+ FEATURE_ACTION_INCR,
+ FEATURE_ACTION_DECR,
+} feature_action_t;
+
+/*
+ * Checks that the active features in the pool are supported by
+ * this software. Adds each unsupported feature (name -> description) to
+ * the supplied nvlist.
+ */
+boolean_t
+spa_features_check(spa_t *spa, boolean_t for_write,
+ nvlist_t *unsup_feat, nvlist_t *enabled_feat)
+{
+ objset_t *os = spa->spa_meta_objset;
+ boolean_t supported;
+ zap_cursor_t *zc;
+ zap_attribute_t *za;
+ uint64_t obj = for_write ?
+ spa->spa_feat_for_write_obj : spa->spa_feat_for_read_obj;
+ char *buf;
+
+ zc = kmem_alloc(sizeof (zap_cursor_t), KM_SLEEP);
+ za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
+ buf = kmem_alloc(MAXPATHLEN, KM_SLEEP);
+
+ supported = B_TRUE;
+ for (zap_cursor_init(zc, os, obj);
+ zap_cursor_retrieve(zc, za) == 0;
+ zap_cursor_advance(zc)) {
+ ASSERT(za->za_integer_length == sizeof (uint64_t) &&
+ za->za_num_integers == 1);
+
+ if (NULL != enabled_feat) {
+ fnvlist_add_uint64(enabled_feat, za->za_name,
+ za->za_first_integer);
+ }
+
+ if (za->za_first_integer != 0 &&
+ !zfeature_is_supported(za->za_name)) {
+ supported = B_FALSE;
+
+ if (NULL != unsup_feat) {
+ const char *desc = "";
+
+ if (zap_lookup(os, spa->spa_feat_desc_obj,
+ za->za_name, 1, MAXPATHLEN, buf) == 0)
+ desc = buf;
+
+ VERIFY(nvlist_add_string(unsup_feat,
+ za->za_name, desc) == 0);
+ }
+ }
+ }
+ zap_cursor_fini(zc);
+
+ kmem_free(buf, MAXPATHLEN);
+ kmem_free(za, sizeof (zap_attribute_t));
+ kmem_free(zc, sizeof (zap_cursor_t));
+
+ return (supported);
+}
+
+/*
+ * Use an in-memory cache of feature refcounts for quick retrieval.
+ *
+ * Note: well-designed features will not need to use this; they should
+ * use spa_feature_is_enabled() and spa_feature_is_active() instead.
+ * However, this is non-static for zdb, zhack, and spa_add_feature_stats().
+ */
+int
+feature_get_refcount(spa_t *spa, zfeature_info_t *feature, uint64_t *res)
+{
+ ASSERT(VALID_FEATURE_FID(feature->fi_feature));
+ if (spa->spa_feat_refcount_cache[feature->fi_feature] ==
+ SPA_FEATURE_DISABLED) {
+ return (SET_ERROR(ENOTSUP));
+ }
+ *res = spa->spa_feat_refcount_cache[feature->fi_feature];
+ return (0);
+}
+
+/*
+ * Note: well-designed features will not need to use this; they should
+ * use spa_feature_is_enabled() and spa_feature_is_active() instead.
+ * However, this is non-static for zdb and zhack.
+ */
+int
+feature_get_refcount_from_disk(spa_t *spa, zfeature_info_t *feature,
+ uint64_t *res)
+{
+ int err;
+ uint64_t refcount;
+ uint64_t zapobj = (feature->fi_flags & ZFEATURE_FLAG_READONLY_COMPAT) ?
+ spa->spa_feat_for_write_obj : spa->spa_feat_for_read_obj;
+
+ /*
+ * If the pool is currently being created, the feature objects may not
+ * have been allocated yet. Act as though all features are disabled.
+ */
+ if (zapobj == 0)
+ return (SET_ERROR(ENOTSUP));
+
+ err = zap_lookup(spa->spa_meta_objset, zapobj,
+ feature->fi_guid, sizeof (uint64_t), 1, &refcount);
+ if (err != 0) {
+ if (err == ENOENT)
+ return (SET_ERROR(ENOTSUP));
+ else
+ return (err);
+ }
+ *res = refcount;
+ return (0);
+}
+
+
+static int
+feature_get_enabled_txg(spa_t *spa, zfeature_info_t *feature, uint64_t *res)
+{
+ uint64_t enabled_txg_obj __maybe_unused = spa->spa_feat_enabled_txg_obj;
+
+ ASSERT(zfeature_depends_on(feature->fi_feature,
+ SPA_FEATURE_ENABLED_TXG));
+
+ if (!spa_feature_is_enabled(spa, feature->fi_feature)) {
+ return (SET_ERROR(ENOTSUP));
+ }
+
+ ASSERT(enabled_txg_obj != 0);
+
+ VERIFY0(zap_lookup(spa->spa_meta_objset, spa->spa_feat_enabled_txg_obj,
+ feature->fi_guid, sizeof (uint64_t), 1, res));
+
+ return (0);
+}
+
+/*
+ * This function is non-static for zhack; it should otherwise not be used
+ * outside this file.
+ */
+void
+feature_sync(spa_t *spa, zfeature_info_t *feature, uint64_t refcount,
+ dmu_tx_t *tx)
+{
+ ASSERT(VALID_FEATURE_OR_NONE(feature->fi_feature));
+ uint64_t zapobj = (feature->fi_flags & ZFEATURE_FLAG_READONLY_COMPAT) ?
+ spa->spa_feat_for_write_obj : spa->spa_feat_for_read_obj;
+ VERIFY0(zap_update(spa->spa_meta_objset, zapobj, feature->fi_guid,
+ sizeof (uint64_t), 1, &refcount, tx));
+
+ /*
+ * feature_sync is called directly from zhack, allowing the
+ * creation of arbitrary features whose fi_feature field may
+ * be greater than SPA_FEATURES. When called from zhack, the
+ * zfeature_info_t object's fi_feature field will be set to
+ * SPA_FEATURE_NONE.
+ */
+ if (feature->fi_feature != SPA_FEATURE_NONE) {
+ uint64_t *refcount_cache =
+ &spa->spa_feat_refcount_cache[feature->fi_feature];
+ VERIFY3U(*refcount_cache, ==,
+ atomic_swap_64(refcount_cache, refcount));
+ }
+
+ if (refcount == 0)
+ spa_deactivate_mos_feature(spa, feature->fi_guid);
+ else if (feature->fi_flags & ZFEATURE_FLAG_MOS)
+ spa_activate_mos_feature(spa, feature->fi_guid, tx);
+}
+
+/*
+ * This function is non-static for zhack; it should otherwise not be used
+ * outside this file.
+ */
+void
+feature_enable_sync(spa_t *spa, zfeature_info_t *feature, dmu_tx_t *tx)
+{
+ uint64_t initial_refcount =
+ (feature->fi_flags & ZFEATURE_FLAG_ACTIVATE_ON_ENABLE) ? 1 : 0;
+ uint64_t zapobj = (feature->fi_flags & ZFEATURE_FLAG_READONLY_COMPAT) ?
+ spa->spa_feat_for_write_obj : spa->spa_feat_for_read_obj;
+
+ ASSERT(0 != zapobj);
+ ASSERT(zfeature_is_valid_guid(feature->fi_guid));
+ ASSERT3U(spa_version(spa), >=, SPA_VERSION_FEATURES);
+
+ /*
+ * If the feature is already enabled, ignore the request.
+ */
+ if (zap_contains(spa->spa_meta_objset, zapobj, feature->fi_guid) == 0)
+ return;
+
+ for (int i = 0; feature->fi_depends[i] != SPA_FEATURE_NONE; i++)
+ spa_feature_enable(spa, feature->fi_depends[i], tx);
+
+ VERIFY0(zap_update(spa->spa_meta_objset, spa->spa_feat_desc_obj,
+ feature->fi_guid, 1, strlen(feature->fi_desc) + 1,
+ feature->fi_desc, tx));
+
+ feature_sync(spa, feature, initial_refcount, tx);
+
+ if (spa_feature_is_enabled(spa, SPA_FEATURE_ENABLED_TXG)) {
+ uint64_t enabling_txg = dmu_tx_get_txg(tx);
+
+ if (spa->spa_feat_enabled_txg_obj == 0ULL) {
+ spa->spa_feat_enabled_txg_obj =
+ zap_create_link(spa->spa_meta_objset,
+ DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_FEATURE_ENABLED_TXG, tx);
+ }
+ spa_feature_incr(spa, SPA_FEATURE_ENABLED_TXG, tx);
+
+ VERIFY0(zap_add(spa->spa_meta_objset,
+ spa->spa_feat_enabled_txg_obj, feature->fi_guid,
+ sizeof (uint64_t), 1, &enabling_txg, tx));
+ }
+
+ /*
+ * Errata #4 is mostly a problem with encrypted datasets, but it
+ * is also a problem where the old encryption feature did not
+ * depend on the bookmark_v2 feature. If the pool does not have
+ * any encrypted datasets we can resolve this issue simply by
+ * enabling this dependency.
+ */
+ if (spa->spa_errata == ZPOOL_ERRATA_ZOL_8308_ENCRYPTION &&
+ spa_feature_is_enabled(spa, SPA_FEATURE_ENCRYPTION) &&
+ !spa_feature_is_active(spa, SPA_FEATURE_ENCRYPTION) &&
+ feature->fi_feature == SPA_FEATURE_BOOKMARK_V2)
+ spa->spa_errata = 0;
+}
+
+static void
+feature_do_action(spa_t *spa, spa_feature_t fid, feature_action_t action,
+ dmu_tx_t *tx)
+{
+ uint64_t refcount = 0;
+ zfeature_info_t *feature = &spa_feature_table[fid];
+ uint64_t zapobj __maybe_unused =
+ (feature->fi_flags & ZFEATURE_FLAG_READONLY_COMPAT) ?
+ spa->spa_feat_for_write_obj : spa->spa_feat_for_read_obj;
+
+ ASSERT(VALID_FEATURE_FID(fid));
+ ASSERT(0 != zapobj);
+ ASSERT(zfeature_is_valid_guid(feature->fi_guid));
+
+ ASSERT(dmu_tx_is_syncing(tx));
+ ASSERT3U(spa_version(spa), >=, SPA_VERSION_FEATURES);
+
+ VERIFY3U(feature_get_refcount(spa, feature, &refcount), !=, ENOTSUP);
+
+ switch (action) {
+ case FEATURE_ACTION_INCR:
+ VERIFY3U(refcount, !=, UINT64_MAX);
+ refcount++;
+ break;
+ case FEATURE_ACTION_DECR:
+ VERIFY3U(refcount, !=, 0);
+ refcount--;
+ break;
+ default:
+ ASSERT(0);
+ break;
+ }
+
+ feature_sync(spa, feature, refcount, tx);
+}
+
+void
+spa_feature_create_zap_objects(spa_t *spa, dmu_tx_t *tx)
+{
+ /*
+ * We create feature flags ZAP objects in two instances: during pool
+ * creation and during pool upgrade.
+ */
+ ASSERT((!spa->spa_sync_on && tx->tx_txg == TXG_INITIAL) ||
+ dsl_pool_sync_context(spa_get_dsl(spa)));
+
+ spa->spa_feat_for_read_obj = zap_create_link(spa->spa_meta_objset,
+ DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_FEATURES_FOR_READ, tx);
+ spa->spa_feat_for_write_obj = zap_create_link(spa->spa_meta_objset,
+ DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_FEATURES_FOR_WRITE, tx);
+ spa->spa_feat_desc_obj = zap_create_link(spa->spa_meta_objset,
+ DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_FEATURE_DESCRIPTIONS, tx);
+}
+
+/*
+ * Enable any required dependencies, then enable the requested feature.
+ */
+void
+spa_feature_enable(spa_t *spa, spa_feature_t fid, dmu_tx_t *tx)
+{
+ ASSERT3U(spa_version(spa), >=, SPA_VERSION_FEATURES);
+ ASSERT(VALID_FEATURE_FID(fid));
+ feature_enable_sync(spa, &spa_feature_table[fid], tx);
+}
+
+void
+spa_feature_incr(spa_t *spa, spa_feature_t fid, dmu_tx_t *tx)
+{
+ feature_do_action(spa, fid, FEATURE_ACTION_INCR, tx);
+}
+
+void
+spa_feature_decr(spa_t *spa, spa_feature_t fid, dmu_tx_t *tx)
+{
+ feature_do_action(spa, fid, FEATURE_ACTION_DECR, tx);
+}
+
+boolean_t
+spa_feature_is_enabled(spa_t *spa, spa_feature_t fid)
+{
+ int err;
+ uint64_t refcount = 0;
+
+ ASSERT(VALID_FEATURE_FID(fid));
+ if (spa_version(spa) < SPA_VERSION_FEATURES)
+ return (B_FALSE);
+
+ err = feature_get_refcount(spa, &spa_feature_table[fid], &refcount);
+ ASSERT(err == 0 || err == ENOTSUP);
+ return (err == 0);
+}
+
+boolean_t
+spa_feature_is_active(spa_t *spa, spa_feature_t fid)
+{
+ int err;
+ uint64_t refcount = 0;
+
+ ASSERT(VALID_FEATURE_FID(fid));
+ if (spa_version(spa) < SPA_VERSION_FEATURES)
+ return (B_FALSE);
+
+ err = feature_get_refcount(spa, &spa_feature_table[fid], &refcount);
+ ASSERT(err == 0 || err == ENOTSUP);
+ return (err == 0 && refcount > 0);
+}
+
+/*
+ * For the feature specified by fid (which must depend on
+ * SPA_FEATURE_ENABLED_TXG), return the TXG at which it was enabled in the
+ * OUT txg argument.
+ *
+ * Returns B_TRUE if the feature is enabled, in which case txg will be filled
+ * with the transaction group in which the specified feature was enabled.
+ * Returns B_FALSE otherwise (i.e. if the feature is not enabled).
+ */
+boolean_t
+spa_feature_enabled_txg(spa_t *spa, spa_feature_t fid, uint64_t *txg)
+{
+ int err;
+
+ ASSERT(VALID_FEATURE_FID(fid));
+ if (spa_version(spa) < SPA_VERSION_FEATURES)
+ return (B_FALSE);
+
+ err = feature_get_enabled_txg(spa, &spa_feature_table[fid], txg);
+ ASSERT(err == 0 || err == ENOTSUP);
+
+ return (err == 0);
+}
diff --git a/sys/contrib/openzfs/module/zfs/zfs_byteswap.c b/sys/contrib/openzfs/module/zfs/zfs_byteswap.c
new file mode 100644
index 000000000000..cd35849c3f37
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/zfs_byteswap.c
@@ -0,0 +1,211 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/vfs.h>
+#include <sys/fs/zfs.h>
+#include <sys/zfs_znode.h>
+#include <sys/zfs_sa.h>
+#include <sys/zfs_acl.h>
+
+#ifndef _KERNEL
+static
+#endif
+void
+zfs_oldace_byteswap(ace_t *ace, int ace_cnt)
+{
+ int i;
+
+ for (i = 0; i != ace_cnt; i++, ace++) {
+ ace->a_who = BSWAP_32(ace->a_who);
+ ace->a_access_mask = BSWAP_32(ace->a_access_mask);
+ ace->a_flags = BSWAP_16(ace->a_flags);
+ ace->a_type = BSWAP_16(ace->a_type);
+ }
+}
+
+/*
+ * swap ace_t and ace_object_t
+ */
+#ifndef _KERNEL
+static
+#endif
+void
+zfs_ace_byteswap(void *buf, size_t size, boolean_t zfs_layout)
+{
+ caddr_t end;
+ caddr_t ptr;
+ zfs_ace_t *zacep = NULL;
+ ace_t *acep;
+ uint16_t entry_type;
+ size_t entry_size;
+ int ace_type;
+
+ end = (caddr_t)buf + size;
+ ptr = buf;
+
+ while (ptr < end) {
+ if (zfs_layout) {
+ /*
+ * Avoid overrun. Embedded aces can have one
+ * of several sizes. We don't know exactly
+ * how many are present, only the size of the
+ * buffer containing them. That size may be
+ * larger than needed to hold the aces
+ * present. As long as we do not do any
+ * swapping beyond the end of our block we are
+ * okay. It is safe to swap any non-ace data
+ * within the block since it is just zeros.
+ */
+ if (ptr + sizeof (zfs_ace_hdr_t) > end) {
+ break;
+ }
+ zacep = (zfs_ace_t *)ptr;
+ zacep->z_hdr.z_access_mask =
+ BSWAP_32(zacep->z_hdr.z_access_mask);
+ zacep->z_hdr.z_flags = BSWAP_16(zacep->z_hdr.z_flags);
+ ace_type = zacep->z_hdr.z_type =
+ BSWAP_16(zacep->z_hdr.z_type);
+ entry_type = zacep->z_hdr.z_flags & ACE_TYPE_FLAGS;
+ } else {
+ /* Overrun avoidance */
+ if (ptr + sizeof (ace_t) > end) {
+ break;
+ }
+ acep = (ace_t *)ptr;
+ acep->a_access_mask = BSWAP_32(acep->a_access_mask);
+ acep->a_flags = BSWAP_16(acep->a_flags);
+ ace_type = acep->a_type = BSWAP_16(acep->a_type);
+ acep->a_who = BSWAP_32(acep->a_who);
+ entry_type = acep->a_flags & ACE_TYPE_FLAGS;
+ }
+ switch (entry_type) {
+ case ACE_OWNER:
+ case ACE_EVERYONE:
+ case (ACE_IDENTIFIER_GROUP | ACE_GROUP):
+ entry_size = zfs_layout ?
+ sizeof (zfs_ace_hdr_t) : sizeof (ace_t);
+ break;
+ case ACE_IDENTIFIER_GROUP:
+ default:
+ /* Overrun avoidance */
+ if (zfs_layout) {
+ if (ptr + sizeof (zfs_ace_t) <= end) {
+ zacep->z_fuid = BSWAP_64(zacep->z_fuid);
+ } else {
+ entry_size = sizeof (zfs_ace_t);
+ break;
+ }
+ }
+ switch (ace_type) {
+ case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE:
+ case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE:
+ case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE:
+ case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE:
+ entry_size = zfs_layout ?
+ sizeof (zfs_object_ace_t) :
+ sizeof (ace_object_t);
+ break;
+ default:
+ entry_size = zfs_layout ? sizeof (zfs_ace_t) :
+ sizeof (ace_t);
+ break;
+ }
+ }
+ ptr = ptr + entry_size;
+ }
+}
+
+/* ARGSUSED */
+void
+zfs_oldacl_byteswap(void *buf, size_t size)
+{
+ int cnt;
+
+ /*
+ * Arggh, since we don't know how many ACEs are in
+ * the array, we have to swap the entire block
+ */
+
+ cnt = size / sizeof (ace_t);
+
+ zfs_oldace_byteswap((ace_t *)buf, cnt);
+}
+
+/* ARGSUSED */
+void
+zfs_acl_byteswap(void *buf, size_t size)
+{
+ zfs_ace_byteswap(buf, size, B_TRUE);
+}
+
+void
+zfs_znode_byteswap(void *buf, size_t size)
+{
+ znode_phys_t *zp = buf;
+
+ ASSERT(size >= sizeof (znode_phys_t));
+
+ zp->zp_crtime[0] = BSWAP_64(zp->zp_crtime[0]);
+ zp->zp_crtime[1] = BSWAP_64(zp->zp_crtime[1]);
+ zp->zp_atime[0] = BSWAP_64(zp->zp_atime[0]);
+ zp->zp_atime[1] = BSWAP_64(zp->zp_atime[1]);
+ zp->zp_mtime[0] = BSWAP_64(zp->zp_mtime[0]);
+ zp->zp_mtime[1] = BSWAP_64(zp->zp_mtime[1]);
+ zp->zp_ctime[0] = BSWAP_64(zp->zp_ctime[0]);
+ zp->zp_ctime[1] = BSWAP_64(zp->zp_ctime[1]);
+ zp->zp_gen = BSWAP_64(zp->zp_gen);
+ zp->zp_mode = BSWAP_64(zp->zp_mode);
+ zp->zp_size = BSWAP_64(zp->zp_size);
+ zp->zp_parent = BSWAP_64(zp->zp_parent);
+ zp->zp_links = BSWAP_64(zp->zp_links);
+ zp->zp_xattr = BSWAP_64(zp->zp_xattr);
+ zp->zp_rdev = BSWAP_64(zp->zp_rdev);
+ zp->zp_flags = BSWAP_64(zp->zp_flags);
+ zp->zp_uid = BSWAP_64(zp->zp_uid);
+ zp->zp_gid = BSWAP_64(zp->zp_gid);
+ zp->zp_zap = BSWAP_64(zp->zp_zap);
+ zp->zp_pad[0] = BSWAP_64(zp->zp_pad[0]);
+ zp->zp_pad[1] = BSWAP_64(zp->zp_pad[1]);
+ zp->zp_pad[2] = BSWAP_64(zp->zp_pad[2]);
+
+ zp->zp_acl.z_acl_extern_obj = BSWAP_64(zp->zp_acl.z_acl_extern_obj);
+ zp->zp_acl.z_acl_size = BSWAP_32(zp->zp_acl.z_acl_size);
+ zp->zp_acl.z_acl_version = BSWAP_16(zp->zp_acl.z_acl_version);
+ zp->zp_acl.z_acl_count = BSWAP_16(zp->zp_acl.z_acl_count);
+ if (zp->zp_acl.z_acl_version == ZFS_ACL_VERSION) {
+ zfs_acl_byteswap((void *)&zp->zp_acl.z_ace_data[0],
+ ZFS_ACE_SPACE);
+ } else {
+ zfs_oldace_byteswap((ace_t *)&zp->zp_acl.z_ace_data[0],
+ ACE_SLOT_CNT);
+ }
+}
+
+#if defined(_KERNEL)
+EXPORT_SYMBOL(zfs_oldacl_byteswap);
+EXPORT_SYMBOL(zfs_acl_byteswap);
+EXPORT_SYMBOL(zfs_znode_byteswap);
+#endif
diff --git a/sys/contrib/openzfs/module/zfs/zfs_fm.c b/sys/contrib/openzfs/module/zfs/zfs_fm.c
new file mode 100644
index 000000000000..ea71ef325c89
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/zfs_fm.c
@@ -0,0 +1,1416 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * Copyright (c) 2012,2020 by Delphix. All rights reserved.
+ */
+
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/vdev.h>
+#include <sys/vdev_impl.h>
+#include <sys/zio.h>
+#include <sys/zio_checksum.h>
+
+#include <sys/fm/fs/zfs.h>
+#include <sys/fm/protocol.h>
+#include <sys/fm/util.h>
+#include <sys/sysevent.h>
+
+/*
+ * This general routine is responsible for generating all the different ZFS
+ * ereports. The payload is dependent on the class, and which arguments are
+ * supplied to the function:
+ *
+ * EREPORT POOL VDEV IO
+ * block X X X
+ * data X X
+ * device X X
+ * pool X
+ *
+ * If we are in a loading state, all errors are chained together by the same
+ * SPA-wide ENA (Error Numeric Association).
+ *
+ * For isolated I/O requests, we get the ENA from the zio_t. The propagation
+ * gets very complicated due to RAID-Z, gang blocks, and vdev caching. We want
+ * to chain together all ereports associated with a logical piece of data. For
+ * read I/Os, there are basically three 'types' of I/O, which form a roughly
+ * layered diagram:
+ *
+ * +---------------+
+ * | Aggregate I/O | No associated logical data or device
+ * +---------------+
+ * |
+ * V
+ * +---------------+ Reads associated with a piece of logical data.
+ * | Read I/O | This includes reads on behalf of RAID-Z,
+ * +---------------+ mirrors, gang blocks, retries, etc.
+ * |
+ * V
+ * +---------------+ Reads associated with a particular device, but
+ * | Physical I/O | no logical data. Issued as part of vdev caching
+ * +---------------+ and I/O aggregation.
+ *
+ * Note that 'physical I/O' here is not the same as the terminology used in the
+ * rest of ZIO. Typically, 'physical I/O' simply means that there is no attached
+ * block pointer. But I/O with no associated block pointer can still be related
+ * to a logical piece of data (i.e. RAID-Z requests).
+ *
+ * Purely physical I/Os always have unique ENAs. They are not related to a
+ * particular piece of logical data, and therefore cannot be chained together.
+ * We still generate an ereport, but the DE doesn't correlate it with any
+ * logical piece of data. When such an I/O fails, the delegated I/O requests
+ * will issue a retry, which will trigger the 'real' ereport with the correct
+ * ENA.
+ *
+ * We keep track of the ENA for a ZIO chain through the 'io_logical' member.
+ * When a new logical I/O is issued, we set this to point to itself. Child I/Os
+ * then inherit this pointer, so that once it is first set, subsequent failures
+ * will use the same ENA. For vdev cache fill and queue aggregation I/O,
+ * this pointer is set to NULL, and no ereport will be generated (since it
+ * doesn't actually correspond to any particular device or piece of data,
+ * and the caller will always retry without caching or queueing anyway).
+ *
+ * For checksum errors, we want to include more information about the actual
+ * error which occurs. Accordingly, we build an ereport when the error is
+ * noticed, but instead of sending it in immediately, we hang it off of the
+ * io_cksum_report field of the logical IO. When the logical IO completes
+ * (successfully or not), zfs_ereport_finish_checksum() is called with the
+ * good and bad versions of the buffer (if available), and we annotate the
+ * ereport with information about the differences.
+ */
+
+#ifdef _KERNEL
+/*
+ * Duplicate ereport Detection
+ *
+ * Some ereports are retained momentarily for detecting duplicates. These
+ * are kept in a recent_events_node_t in both a time-ordered list and an AVL
+ * tree of recent unique ereports.
+ *
+ * The lifespan of these recent ereports is bounded (15 mins) and a cleaner
+ * task is used to purge stale entries.
+ */
+static list_t recent_events_list;
+static avl_tree_t recent_events_tree;
+static kmutex_t recent_events_lock;
+static taskqid_t recent_events_cleaner_tqid;
+
+/*
+ * Each node is about 128 bytes so 2,000 would consume 1/4 MiB.
+ *
+ * This setting can be changed dynamically and setting it to zero
+ * disables duplicate detection.
+ */
+unsigned int zfs_zevent_retain_max = 2000;
+
+/*
+ * The lifespan for a recent ereport entry. The default of 15 minutes is
+ * intended to outlive the zfs diagnosis engine's threshold of 10 errors
+ * over a period of 10 minutes.
+ */
+unsigned int zfs_zevent_retain_expire_secs = 900;
+
+typedef enum zfs_subclass {
+ ZSC_IO,
+ ZSC_DATA,
+ ZSC_CHECKSUM
+} zfs_subclass_t;
+
+typedef struct {
+ /* common criteria */
+ uint64_t re_pool_guid;
+ uint64_t re_vdev_guid;
+ int re_io_error;
+ uint64_t re_io_size;
+ uint64_t re_io_offset;
+ zfs_subclass_t re_subclass;
+ zio_priority_t re_io_priority;
+
+ /* logical zio criteria (optional) */
+ zbookmark_phys_t re_io_bookmark;
+
+ /* internal state */
+ avl_node_t re_tree_link;
+ list_node_t re_list_link;
+ uint64_t re_timestamp;
+} recent_events_node_t;
+
+static int
+recent_events_compare(const void *a, const void *b)
+{
+ const recent_events_node_t *node1 = a;
+ const recent_events_node_t *node2 = b;
+ int cmp;
+
+ /*
+ * The comparison order here is somewhat arbitrary.
+ * What's important is that if every criterion matches, then it
+ * is a duplicate (i.e. compare returns 0).
+ */
+ if ((cmp = TREE_CMP(node1->re_subclass, node2->re_subclass)) != 0)
+ return (cmp);
+ if ((cmp = TREE_CMP(node1->re_pool_guid, node2->re_pool_guid)) != 0)
+ return (cmp);
+ if ((cmp = TREE_CMP(node1->re_vdev_guid, node2->re_vdev_guid)) != 0)
+ return (cmp);
+ if ((cmp = TREE_CMP(node1->re_io_error, node2->re_io_error)) != 0)
+ return (cmp);
+ if ((cmp = TREE_CMP(node1->re_io_priority, node2->re_io_priority)) != 0)
+ return (cmp);
+ if ((cmp = TREE_CMP(node1->re_io_size, node2->re_io_size)) != 0)
+ return (cmp);
+ if ((cmp = TREE_CMP(node1->re_io_offset, node2->re_io_offset)) != 0)
+ return (cmp);
+
+ const zbookmark_phys_t *zb1 = &node1->re_io_bookmark;
+ const zbookmark_phys_t *zb2 = &node2->re_io_bookmark;
+
+ if ((cmp = TREE_CMP(zb1->zb_objset, zb2->zb_objset)) != 0)
+ return (cmp);
+ if ((cmp = TREE_CMP(zb1->zb_object, zb2->zb_object)) != 0)
+ return (cmp);
+ if ((cmp = TREE_CMP(zb1->zb_level, zb2->zb_level)) != 0)
+ return (cmp);
+ if ((cmp = TREE_CMP(zb1->zb_blkid, zb2->zb_blkid)) != 0)
+ return (cmp);
+
+ return (0);
+}
+
+static void zfs_ereport_schedule_cleaner(void);
+
+/*
+ * background task to clean stale recent event nodes.
+ */
+/*ARGSUSED*/
+static void
+zfs_ereport_cleaner(void *arg)
+{
+ recent_events_node_t *entry;
+ uint64_t now = gethrtime();
+
+ /*
+ * purge expired entries
+ */
+ mutex_enter(&recent_events_lock);
+ while ((entry = list_tail(&recent_events_list)) != NULL) {
+ uint64_t age = NSEC2SEC(now - entry->re_timestamp);
+ if (age <= zfs_zevent_retain_expire_secs)
+ break;
+
+ /* remove expired node */
+ avl_remove(&recent_events_tree, entry);
+ list_remove(&recent_events_list, entry);
+ kmem_free(entry, sizeof (*entry));
+ }
+
+ /* Restart the cleaner if more entries remain */
+ recent_events_cleaner_tqid = 0;
+ if (!list_is_empty(&recent_events_list))
+ zfs_ereport_schedule_cleaner();
+
+ mutex_exit(&recent_events_lock);
+}
+
+static void
+zfs_ereport_schedule_cleaner(void)
+{
+ ASSERT(MUTEX_HELD(&recent_events_lock));
+
+ uint64_t timeout = SEC2NSEC(zfs_zevent_retain_expire_secs + 1);
+
+ recent_events_cleaner_tqid = taskq_dispatch_delay(
+ system_delay_taskq, zfs_ereport_cleaner, NULL, TQ_SLEEP,
+ ddi_get_lbolt() + NSEC_TO_TICK(timeout));
+}
+
+/*
+ * Check if an ereport would be a duplicate of one recently posted.
+ *
+ * An ereport is considered a duplicate if the set of criteria in
+ * recent_events_node_t all match.
+ *
+ * Only FM_EREPORT_ZFS_IO, FM_EREPORT_ZFS_DATA, and FM_EREPORT_ZFS_CHECKSUM
+ * are candidates for duplicate checking.
+ */
+static boolean_t
+zfs_ereport_is_duplicate(const char *subclass, spa_t *spa, vdev_t *vd,
+ const zbookmark_phys_t *zb, zio_t *zio, uint64_t offset, uint64_t size)
+{
+ recent_events_node_t search = {0}, *entry;
+
+ if (vd == NULL || zio == NULL)
+ return (B_FALSE);
+
+ if (zfs_zevent_retain_max == 0)
+ return (B_FALSE);
+
+ if (strcmp(subclass, FM_EREPORT_ZFS_IO) == 0)
+ search.re_subclass = ZSC_IO;
+ else if (strcmp(subclass, FM_EREPORT_ZFS_DATA) == 0)
+ search.re_subclass = ZSC_DATA;
+ else if (strcmp(subclass, FM_EREPORT_ZFS_CHECKSUM) == 0)
+ search.re_subclass = ZSC_CHECKSUM;
+ else
+ return (B_FALSE);
+
+ search.re_pool_guid = spa_guid(spa);
+ search.re_vdev_guid = vd->vdev_guid;
+ search.re_io_error = zio->io_error;
+ search.re_io_priority = zio->io_priority;
+ /* if a size is supplied, use it over what's in the zio */
+ if (size) {
+ search.re_io_size = size;
+ search.re_io_offset = offset;
+ } else {
+ search.re_io_size = zio->io_size;
+ search.re_io_offset = zio->io_offset;
+ }
+
+ /* grab optional logical zio criteria */
+ if (zb != NULL) {
+ search.re_io_bookmark.zb_objset = zb->zb_objset;
+ search.re_io_bookmark.zb_object = zb->zb_object;
+ search.re_io_bookmark.zb_level = zb->zb_level;
+ search.re_io_bookmark.zb_blkid = zb->zb_blkid;
+ }
+
+ uint64_t now = gethrtime();
+
+ mutex_enter(&recent_events_lock);
+
+ /* check if we have seen this one recently */
+ entry = avl_find(&recent_events_tree, &search, NULL);
+ if (entry != NULL) {
+ uint64_t age = NSEC2SEC(now - entry->re_timestamp);
+
+ /*
+ * There is still an active cleaner (since we're here).
+ * Reset the last seen time for this duplicate entry
+ * so that its lifespan gets extended.
+ */
+ list_remove(&recent_events_list, entry);
+ list_insert_head(&recent_events_list, entry);
+ entry->re_timestamp = now;
+
+ zfs_zevent_track_duplicate();
+ mutex_exit(&recent_events_lock);
+
+ return (age <= zfs_zevent_retain_expire_secs);
+ }
+
+ if (avl_numnodes(&recent_events_tree) >= zfs_zevent_retain_max) {
+ /* recycle oldest node */
+ entry = list_tail(&recent_events_list);
+ ASSERT(entry != NULL);
+ list_remove(&recent_events_list, entry);
+ avl_remove(&recent_events_tree, entry);
+ } else {
+ entry = kmem_alloc(sizeof (recent_events_node_t), KM_SLEEP);
+ }
+
+ /* record this as a recent ereport */
+ *entry = search;
+ avl_add(&recent_events_tree, entry);
+ list_insert_head(&recent_events_list, entry);
+ entry->re_timestamp = now;
+
+ /* Start a cleaner if not already scheduled */
+ if (recent_events_cleaner_tqid == 0)
+ zfs_ereport_schedule_cleaner();
+
+ mutex_exit(&recent_events_lock);
+ return (B_FALSE);
+}
+
+void
+zfs_zevent_post_cb(nvlist_t *nvl, nvlist_t *detector)
+{
+ if (nvl)
+ fm_nvlist_destroy(nvl, FM_NVA_FREE);
+
+ if (detector)
+ fm_nvlist_destroy(detector, FM_NVA_FREE);
+}
+
+/*
+ * We want to rate limit ZIO delay and checksum events so as to not
+ * flood ZED when a disk is acting up.
+ *
+ * Returns 1 if we're ratelimiting, 0 if not.
+ */
+static int
+zfs_is_ratelimiting_event(const char *subclass, vdev_t *vd)
+{
+ int rc = 0;
+ /*
+ * __ratelimit() returns 1 if we're *not* ratelimiting and 0 if we
+ * are. Invert it to get our return value.
+ */
+ if (strcmp(subclass, FM_EREPORT_ZFS_DELAY) == 0) {
+ rc = !zfs_ratelimit(&vd->vdev_delay_rl);
+ } else if (strcmp(subclass, FM_EREPORT_ZFS_CHECKSUM) == 0) {
+ rc = !zfs_ratelimit(&vd->vdev_checksum_rl);
+ }
+
+ if (rc) {
+ /* We're rate limiting */
+ fm_erpt_dropped_increment();
+ }
+
+ return (rc);
+}
+
+/*
+ * Return B_TRUE if the ereport was successfully constructed, B_FALSE if not.
+ */
+static boolean_t
+zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out,
+ const char *subclass, spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb,
+ zio_t *zio, uint64_t stateoroffset, uint64_t size)
+{
+ nvlist_t *ereport, *detector;
+
+ uint64_t ena;
+ char class[64];
+
+ if ((ereport = fm_nvlist_create(NULL)) == NULL)
+ return (B_FALSE);
+
+ if ((detector = fm_nvlist_create(NULL)) == NULL) {
+ fm_nvlist_destroy(ereport, FM_NVA_FREE);
+ return (B_FALSE);
+ }
+
+ /*
+ * Serialize ereport generation
+ */
+ mutex_enter(&spa->spa_errlist_lock);
+
+ /*
+ * Determine the ENA to use for this event. If we are in a loading
+ * state, use a SPA-wide ENA. Otherwise, if we are in an I/O state, use
+ * a root zio-wide ENA. Otherwise, simply use a unique ENA.
+ */
+ if (spa_load_state(spa) != SPA_LOAD_NONE) {
+ if (spa->spa_ena == 0)
+ spa->spa_ena = fm_ena_generate(0, FM_ENA_FMT1);
+ ena = spa->spa_ena;
+ } else if (zio != NULL && zio->io_logical != NULL) {
+ if (zio->io_logical->io_ena == 0)
+ zio->io_logical->io_ena =
+ fm_ena_generate(0, FM_ENA_FMT1);
+ ena = zio->io_logical->io_ena;
+ } else {
+ ena = fm_ena_generate(0, FM_ENA_FMT1);
+ }
+
+ /*
+ * Construct the full class, detector, and other standard FMA fields.
+ */
+ (void) snprintf(class, sizeof (class), "%s.%s",
+ ZFS_ERROR_CLASS, subclass);
+
+ fm_fmri_zfs_set(detector, FM_ZFS_SCHEME_VERSION, spa_guid(spa),
+ vd != NULL ? vd->vdev_guid : 0);
+
+ fm_ereport_set(ereport, FM_EREPORT_VERSION, class, ena, detector, NULL);
+
+ /*
+ * Construct the per-ereport payload, depending on which parameters are
+ * passed in.
+ */
+
+ /*
+ * Generic payload members common to all ereports.
+ */
+ fm_payload_set(ereport,
+ FM_EREPORT_PAYLOAD_ZFS_POOL, DATA_TYPE_STRING, spa_name(spa),
+ FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, DATA_TYPE_UINT64, spa_guid(spa),
+ FM_EREPORT_PAYLOAD_ZFS_POOL_STATE, DATA_TYPE_UINT64,
+ (uint64_t)spa_state(spa),
+ FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT, DATA_TYPE_INT32,
+ (int32_t)spa_load_state(spa), NULL);
+
+ fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_POOL_FAILMODE,
+ DATA_TYPE_STRING,
+ spa_get_failmode(spa) == ZIO_FAILURE_MODE_WAIT ?
+ FM_EREPORT_FAILMODE_WAIT :
+ spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE ?
+ FM_EREPORT_FAILMODE_CONTINUE : FM_EREPORT_FAILMODE_PANIC,
+ NULL);
+
+ if (vd != NULL) {
+ vdev_t *pvd = vd->vdev_parent;
+ vdev_queue_t *vq = &vd->vdev_queue;
+ vdev_stat_t *vs = &vd->vdev_stat;
+ vdev_t *spare_vd;
+ uint64_t *spare_guids;
+ char **spare_paths;
+ int i, spare_count;
+
+ fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID,
+ DATA_TYPE_UINT64, vd->vdev_guid,
+ FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE,
+ DATA_TYPE_STRING, vd->vdev_ops->vdev_op_type, NULL);
+ if (vd->vdev_path != NULL)
+ fm_payload_set(ereport,
+ FM_EREPORT_PAYLOAD_ZFS_VDEV_PATH,
+ DATA_TYPE_STRING, vd->vdev_path, NULL);
+ if (vd->vdev_devid != NULL)
+ fm_payload_set(ereport,
+ FM_EREPORT_PAYLOAD_ZFS_VDEV_DEVID,
+ DATA_TYPE_STRING, vd->vdev_devid, NULL);
+ if (vd->vdev_fru != NULL)
+ fm_payload_set(ereport,
+ FM_EREPORT_PAYLOAD_ZFS_VDEV_FRU,
+ DATA_TYPE_STRING, vd->vdev_fru, NULL);
+ if (vd->vdev_enc_sysfs_path != NULL)
+ fm_payload_set(ereport,
+ FM_EREPORT_PAYLOAD_ZFS_VDEV_ENC_SYSFS_PATH,
+ DATA_TYPE_STRING, vd->vdev_enc_sysfs_path, NULL);
+ if (vd->vdev_ashift)
+ fm_payload_set(ereport,
+ FM_EREPORT_PAYLOAD_ZFS_VDEV_ASHIFT,
+ DATA_TYPE_UINT64, vd->vdev_ashift, NULL);
+
+ if (vq != NULL) {
+ fm_payload_set(ereport,
+ FM_EREPORT_PAYLOAD_ZFS_VDEV_COMP_TS,
+ DATA_TYPE_UINT64, vq->vq_io_complete_ts, NULL);
+ fm_payload_set(ereport,
+ FM_EREPORT_PAYLOAD_ZFS_VDEV_DELTA_TS,
+ DATA_TYPE_UINT64, vq->vq_io_delta_ts, NULL);
+ }
+
+ if (vs != NULL) {
+ fm_payload_set(ereport,
+ FM_EREPORT_PAYLOAD_ZFS_VDEV_READ_ERRORS,
+ DATA_TYPE_UINT64, vs->vs_read_errors,
+ FM_EREPORT_PAYLOAD_ZFS_VDEV_WRITE_ERRORS,
+ DATA_TYPE_UINT64, vs->vs_write_errors,
+ FM_EREPORT_PAYLOAD_ZFS_VDEV_CKSUM_ERRORS,
+ DATA_TYPE_UINT64, vs->vs_checksum_errors,
+ FM_EREPORT_PAYLOAD_ZFS_VDEV_DELAYS,
+ DATA_TYPE_UINT64, vs->vs_slow_ios,
+ NULL);
+ }
+
+ if (pvd != NULL) {
+ fm_payload_set(ereport,
+ FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID,
+ DATA_TYPE_UINT64, pvd->vdev_guid,
+ FM_EREPORT_PAYLOAD_ZFS_PARENT_TYPE,
+ DATA_TYPE_STRING, pvd->vdev_ops->vdev_op_type,
+ NULL);
+ if (pvd->vdev_path)
+ fm_payload_set(ereport,
+ FM_EREPORT_PAYLOAD_ZFS_PARENT_PATH,
+ DATA_TYPE_STRING, pvd->vdev_path, NULL);
+ if (pvd->vdev_devid)
+ fm_payload_set(ereport,
+ FM_EREPORT_PAYLOAD_ZFS_PARENT_DEVID,
+ DATA_TYPE_STRING, pvd->vdev_devid, NULL);
+ }
+
+ spare_count = spa->spa_spares.sav_count;
+ spare_paths = kmem_zalloc(sizeof (char *) * spare_count,
+ KM_SLEEP);
+ spare_guids = kmem_zalloc(sizeof (uint64_t) * spare_count,
+ KM_SLEEP);
+
+ for (i = 0; i < spare_count; i++) {
+ spare_vd = spa->spa_spares.sav_vdevs[i];
+ if (spare_vd) {
+ spare_paths[i] = spare_vd->vdev_path;
+ spare_guids[i] = spare_vd->vdev_guid;
+ }
+ }
+
+ fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_SPARE_PATHS,
+ DATA_TYPE_STRING_ARRAY, spare_count, spare_paths,
+ FM_EREPORT_PAYLOAD_ZFS_VDEV_SPARE_GUIDS,
+ DATA_TYPE_UINT64_ARRAY, spare_count, spare_guids, NULL);
+
+ kmem_free(spare_guids, sizeof (uint64_t) * spare_count);
+ kmem_free(spare_paths, sizeof (char *) * spare_count);
+ }
+
+ if (zio != NULL) {
+ /*
+ * Payload common to all I/Os.
+ */
+ fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_ERR,
+ DATA_TYPE_INT32, zio->io_error, NULL);
+ fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_FLAGS,
+ DATA_TYPE_INT32, zio->io_flags, NULL);
+ fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_STAGE,
+ DATA_TYPE_UINT32, zio->io_stage, NULL);
+ fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_PIPELINE,
+ DATA_TYPE_UINT32, zio->io_pipeline, NULL);
+ fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_DELAY,
+ DATA_TYPE_UINT64, zio->io_delay, NULL);
+ fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_TIMESTAMP,
+ DATA_TYPE_UINT64, zio->io_timestamp, NULL);
+ fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_DELTA,
+ DATA_TYPE_UINT64, zio->io_delta, NULL);
+ fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_PRIORITY,
+ DATA_TYPE_UINT32, zio->io_priority, NULL);
+
+ /*
+ * If the 'size' parameter is non-zero, it indicates this is a
+ * RAID-Z or other I/O where the physical offset and length are
+ * provided for us, instead of within the zio_t.
+ */
+ if (vd != NULL) {
+ if (size)
+ fm_payload_set(ereport,
+ FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET,
+ DATA_TYPE_UINT64, stateoroffset,
+ FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE,
+ DATA_TYPE_UINT64, size, NULL);
+ else
+ fm_payload_set(ereport,
+ FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET,
+ DATA_TYPE_UINT64, zio->io_offset,
+ FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE,
+ DATA_TYPE_UINT64, zio->io_size, NULL);
+ }
+ } else if (vd != NULL) {
+ /*
+ * If we have a vdev but no zio, this is a device fault, and the
+ * 'stateoroffset' parameter indicates the previous state of the
+ * vdev.
+ */
+ fm_payload_set(ereport,
+ FM_EREPORT_PAYLOAD_ZFS_PREV_STATE,
+ DATA_TYPE_UINT64, stateoroffset, NULL);
+ }
+
+ /*
+ * Payload for I/Os with corresponding logical information.
+ */
+ if (zb != NULL && (zio == NULL || zio->io_logical != NULL)) {
+ fm_payload_set(ereport,
+ FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJSET,
+ DATA_TYPE_UINT64, zb->zb_objset,
+ FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJECT,
+ DATA_TYPE_UINT64, zb->zb_object,
+ FM_EREPORT_PAYLOAD_ZFS_ZIO_LEVEL,
+ DATA_TYPE_INT64, zb->zb_level,
+ FM_EREPORT_PAYLOAD_ZFS_ZIO_BLKID,
+ DATA_TYPE_UINT64, zb->zb_blkid, NULL);
+ }
+
+ mutex_exit(&spa->spa_errlist_lock);
+
+ *ereport_out = ereport;
+ *detector_out = detector;
+ return (B_TRUE);
+}
+
+/* if it's <= 128 bytes, save the corruption directly */
+#define ZFM_MAX_INLINE (128 / sizeof (uint64_t))
+
+#define MAX_RANGES 16
+
+typedef struct zfs_ecksum_info {
+ /* histograms of set and cleared bits by bit number in a 64-bit word */
+ uint32_t zei_histogram_set[sizeof (uint64_t) * NBBY];
+ uint32_t zei_histogram_cleared[sizeof (uint64_t) * NBBY];
+
+ /* inline arrays of bits set and cleared. */
+ uint64_t zei_bits_set[ZFM_MAX_INLINE];
+ uint64_t zei_bits_cleared[ZFM_MAX_INLINE];
+
+ /*
+ * for each range, the number of bits set and cleared. The Hamming
+ * distance between the good and bad buffers is the sum of them all.
+ */
+ uint32_t zei_range_sets[MAX_RANGES];
+ uint32_t zei_range_clears[MAX_RANGES];
+
+ struct zei_ranges {
+ uint32_t zr_start;
+ uint32_t zr_end;
+ } zei_ranges[MAX_RANGES];
+
+ size_t zei_range_count;
+ uint32_t zei_mingap;
+ uint32_t zei_allowed_mingap;
+
+} zfs_ecksum_info_t;
+
+static void
+update_histogram(uint64_t value_arg, uint32_t *hist, uint32_t *count)
+{
+ size_t i;
+ size_t bits = 0;
+ uint64_t value = BE_64(value_arg);
+
+ /* We store the bits in big-endian (largest-first) order */
+ for (i = 0; i < 64; i++) {
+ if (value & (1ull << i)) {
+ hist[63 - i]++;
+ ++bits;
+ }
+ }
+ /* update the count of bits changed */
+ *count += bits;
+}
+
+/*
+ * We've now filled up the range array, and need to increase "mingap" and
+ * shrink the range list accordingly. zei_mingap is always the smallest
+ * distance between array entries, so we set the new_allowed_gap to be
+ * one greater than that. We then go through the list, joining together
+ * any ranges which are closer than the new_allowed_gap.
+ *
+ * By construction, at least one pair of ranges will be joined. We also update
+ * zei_mingap to the new smallest gap, to prepare for our next invocation.
+ */
+static void
+zei_shrink_ranges(zfs_ecksum_info_t *eip)
+{
+ uint32_t mingap = UINT32_MAX;
+ uint32_t new_allowed_gap = eip->zei_mingap + 1;
+
+ size_t idx, output;
+ size_t max = eip->zei_range_count;
+
+ struct zei_ranges *r = eip->zei_ranges;
+
+ ASSERT3U(eip->zei_range_count, >, 0);
+ ASSERT3U(eip->zei_range_count, <=, MAX_RANGES);
+
+ output = idx = 0;
+ while (idx < max - 1) {
+ uint32_t start = r[idx].zr_start;
+ uint32_t end = r[idx].zr_end;
+
+ while (idx < max - 1) {
+ idx++;
+
+ uint32_t nstart = r[idx].zr_start;
+ uint32_t nend = r[idx].zr_end;
+
+ uint32_t gap = nstart - end;
+ if (gap < new_allowed_gap) {
+ end = nend;
+ continue;
+ }
+ if (gap < mingap)
+ mingap = gap;
+ break;
+ }
+ r[output].zr_start = start;
+ r[output].zr_end = end;
+ output++;
+ }
+ ASSERT3U(output, <, eip->zei_range_count);
+ eip->zei_range_count = output;
+ eip->zei_mingap = mingap;
+ eip->zei_allowed_mingap = new_allowed_gap;
+}
+
+static void
+zei_add_range(zfs_ecksum_info_t *eip, int start, int end)
+{
+ struct zei_ranges *r = eip->zei_ranges;
+ size_t count = eip->zei_range_count;
+
+ if (count >= MAX_RANGES) {
+ zei_shrink_ranges(eip);
+ count = eip->zei_range_count;
+ }
+ if (count == 0) {
+ eip->zei_mingap = UINT32_MAX;
+ eip->zei_allowed_mingap = 1;
+ } else {
+ int gap = start - r[count - 1].zr_end;
+
+ if (gap < eip->zei_allowed_mingap) {
+ r[count - 1].zr_end = end;
+ return;
+ }
+ if (gap < eip->zei_mingap)
+ eip->zei_mingap = gap;
+ }
+ r[count].zr_start = start;
+ r[count].zr_end = end;
+ eip->zei_range_count++;
+}
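+
+/*
+ * A small worked example of the merging above (illustrative values only;
+ * units are uint64_t word indexes, as used by annotate_ecksum() below):
+ *
+ *     zei_add_range(eip, 0, 2);   ranges: [0,2)
+ *     zei_add_range(eip, 3, 5);   gap 1 >= allowed_mingap (1): [0,2) [3,5)
+ *     zei_add_range(eip, 5, 7);   gap 0 <  allowed_mingap (1): [0,2) [3,7)
+ *
+ * Once MAX_RANGES entries accumulate, zei_shrink_ranges() raises the allowed
+ * gap to zei_mingap + 1 and joins any ranges that are closer than that.
+ */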
+
+static size_t
+zei_range_total_size(zfs_ecksum_info_t *eip)
+{
+ struct zei_ranges *r = eip->zei_ranges;
+ size_t count = eip->zei_range_count;
+ size_t result = 0;
+ size_t idx;
+
+ for (idx = 0; idx < count; idx++)
+ result += (r[idx].zr_end - r[idx].zr_start);
+
+ return (result);
+}
+
+static zfs_ecksum_info_t *
+annotate_ecksum(nvlist_t *ereport, zio_bad_cksum_t *info,
+ const abd_t *goodabd, const abd_t *badabd, size_t size,
+ boolean_t drop_if_identical)
+{
+ const uint64_t *good;
+ const uint64_t *bad;
+
+ uint64_t allset = 0;
+ uint64_t allcleared = 0;
+
+ size_t nui64s = size / sizeof (uint64_t);
+
+ size_t inline_size;
+ int no_inline = 0;
+ size_t idx;
+ size_t range;
+
+ size_t offset = 0;
+ ssize_t start = -1;
+
+ zfs_ecksum_info_t *eip = kmem_zalloc(sizeof (*eip), KM_SLEEP);
+
+ /* don't do any annotation for injected checksum errors */
+ if (info != NULL && info->zbc_injected)
+ return (eip);
+
+ if (info != NULL && info->zbc_has_cksum) {
+ fm_payload_set(ereport,
+ FM_EREPORT_PAYLOAD_ZFS_CKSUM_EXPECTED,
+ DATA_TYPE_UINT64_ARRAY,
+ sizeof (info->zbc_expected) / sizeof (uint64_t),
+ (uint64_t *)&info->zbc_expected,
+ FM_EREPORT_PAYLOAD_ZFS_CKSUM_ACTUAL,
+ DATA_TYPE_UINT64_ARRAY,
+ sizeof (info->zbc_actual) / sizeof (uint64_t),
+ (uint64_t *)&info->zbc_actual,
+ FM_EREPORT_PAYLOAD_ZFS_CKSUM_ALGO,
+ DATA_TYPE_STRING,
+ info->zbc_checksum_name,
+ NULL);
+
+ if (info->zbc_byteswapped) {
+ fm_payload_set(ereport,
+ FM_EREPORT_PAYLOAD_ZFS_CKSUM_BYTESWAP,
+ DATA_TYPE_BOOLEAN, 1,
+ NULL);
+ }
+ }
+
+ if (badabd == NULL || goodabd == NULL)
+ return (eip);
+
+ ASSERT3U(nui64s, <=, UINT32_MAX);
+ ASSERT3U(size, ==, nui64s * sizeof (uint64_t));
+ ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
+ ASSERT3U(size, <=, UINT32_MAX);
+
+ good = (const uint64_t *) abd_borrow_buf_copy((abd_t *)goodabd, size);
+ bad = (const uint64_t *) abd_borrow_buf_copy((abd_t *)badabd, size);
+
+ /* build up the range list by comparing the two buffers. */
+ for (idx = 0; idx < nui64s; idx++) {
+ if (good[idx] == bad[idx]) {
+ if (start == -1)
+ continue;
+
+ zei_add_range(eip, start, idx);
+ start = -1;
+ } else {
+ if (start != -1)
+ continue;
+
+ start = idx;
+ }
+ }
+ if (start != -1)
+ zei_add_range(eip, start, idx);
+
+ /* See if it will fit in our inline buffers */
+ inline_size = zei_range_total_size(eip);
+ if (inline_size > ZFM_MAX_INLINE)
+ no_inline = 1;
+
+ /*
+ * If the two buffers are identical and the caller asked us to drop
+ * identical buffers, do so.
+ */
+ if (inline_size == 0 && drop_if_identical) {
+ kmem_free(eip, sizeof (*eip));
+ abd_return_buf((abd_t *)goodabd, (void *)good, size);
+ abd_return_buf((abd_t *)badabd, (void *)bad, size);
+ return (NULL);
+ }
+
+ /*
+ * Now walk through the ranges, filling in the details of the
+ * differences. Also convert our uint64_t-array offsets to byte
+ * offsets.
+ */
+ for (range = 0; range < eip->zei_range_count; range++) {
+ size_t start = eip->zei_ranges[range].zr_start;
+ size_t end = eip->zei_ranges[range].zr_end;
+
+ for (idx = start; idx < end; idx++) {
+ uint64_t set, cleared;
+
+ // bits set in bad, but not in good
+ set = ((~good[idx]) & bad[idx]);
+ // bits set in good, but not in bad
+ cleared = (good[idx] & (~bad[idx]));
+
+ allset |= set;
+ allcleared |= cleared;
+
+ if (!no_inline) {
+ ASSERT3U(offset, <, inline_size);
+ eip->zei_bits_set[offset] = set;
+ eip->zei_bits_cleared[offset] = cleared;
+ offset++;
+ }
+
+ update_histogram(set, eip->zei_histogram_set,
+ &eip->zei_range_sets[range]);
+ update_histogram(cleared, eip->zei_histogram_cleared,
+ &eip->zei_range_clears[range]);
+ }
+
+ /* convert to byte offsets */
+ eip->zei_ranges[range].zr_start *= sizeof (uint64_t);
+ eip->zei_ranges[range].zr_end *= sizeof (uint64_t);
+ }
+
+ abd_return_buf((abd_t *)goodabd, (void *)good, size);
+ abd_return_buf((abd_t *)badabd, (void *)bad, size);
+
+ eip->zei_allowed_mingap *= sizeof (uint64_t);
+ inline_size *= sizeof (uint64_t);
+
+ /* fill in ereport */
+ fm_payload_set(ereport,
+ FM_EREPORT_PAYLOAD_ZFS_BAD_OFFSET_RANGES,
+ DATA_TYPE_UINT32_ARRAY, 2 * eip->zei_range_count,
+ (uint32_t *)eip->zei_ranges,
+ FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_MIN_GAP,
+ DATA_TYPE_UINT32, eip->zei_allowed_mingap,
+ FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_SETS,
+ DATA_TYPE_UINT32_ARRAY, eip->zei_range_count, eip->zei_range_sets,
+ FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_CLEARS,
+ DATA_TYPE_UINT32_ARRAY, eip->zei_range_count, eip->zei_range_clears,
+ NULL);
+
+ if (!no_inline) {
+ fm_payload_set(ereport,
+ FM_EREPORT_PAYLOAD_ZFS_BAD_SET_BITS,
+ DATA_TYPE_UINT8_ARRAY,
+ inline_size, (uint8_t *)eip->zei_bits_set,
+ FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_BITS,
+ DATA_TYPE_UINT8_ARRAY,
+ inline_size, (uint8_t *)eip->zei_bits_cleared,
+ NULL);
+ } else {
+ fm_payload_set(ereport,
+ FM_EREPORT_PAYLOAD_ZFS_BAD_SET_HISTOGRAM,
+ DATA_TYPE_UINT32_ARRAY,
+ NBBY * sizeof (uint64_t), eip->zei_histogram_set,
+ FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_HISTOGRAM,
+ DATA_TYPE_UINT32_ARRAY,
+ NBBY * sizeof (uint64_t), eip->zei_histogram_cleared,
+ NULL);
+ }
+ return (eip);
+}
+#endif
+
+/*
+ * Make sure our event is still valid for the given zio/vdev/pool. For example,
+ * we don't want to keep logging events for a faulted or missing vdev.
+ */
+boolean_t
+zfs_ereport_is_valid(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio)
+{
+#ifdef _KERNEL
+ /*
+ * If we are doing a spa_tryimport() or in recovery mode,
+ * ignore errors.
+ */
+ if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT ||
+ spa_load_state(spa) == SPA_LOAD_RECOVER)
+ return (B_FALSE);
+
+ /*
+ * If we are in the middle of opening a pool, and the previous attempt
+ * failed, don't bother logging any new ereports - we're just going to
+ * get the same diagnosis anyway.
+ */
+ if (spa_load_state(spa) != SPA_LOAD_NONE &&
+ spa->spa_last_open_failed)
+ return (B_FALSE);
+
+ if (zio != NULL) {
+ /*
+ * If this is not a read or write zio, ignore the error. This
+ * can occur if the DKIOCFLUSHWRITECACHE ioctl fails.
+ */
+ if (zio->io_type != ZIO_TYPE_READ &&
+ zio->io_type != ZIO_TYPE_WRITE)
+ return (B_FALSE);
+
+ if (vd != NULL) {
+ /*
+ * If the vdev has already been marked as failing due
+ * to a failed probe, then ignore any subsequent I/O
+ * errors, as the DE will automatically fault the vdev
+ * on the first such failure. This also catches cases
+ * where vdev_remove_wanted is set and the device has
+ * not yet been asynchronously placed into the REMOVED
+ * state.
+ */
+ if (zio->io_vd == vd && !vdev_accessible(vd, zio))
+ return (B_FALSE);
+
+ /*
+ * Ignore checksum errors for reads from DTL regions of
+ * leaf vdevs.
+ */
+ if (zio->io_type == ZIO_TYPE_READ &&
+ zio->io_error == ECKSUM &&
+ vd->vdev_ops->vdev_op_leaf &&
+ vdev_dtl_contains(vd, DTL_MISSING, zio->io_txg, 1))
+ return (B_FALSE);
+ }
+ }
+
+ /*
+ * For probe failure, we want to avoid posting ereports if we've
+ * already removed the device in the meantime.
+ */
+ if (vd != NULL &&
+ strcmp(subclass, FM_EREPORT_ZFS_PROBE_FAILURE) == 0 &&
+ (vd->vdev_remove_wanted || vd->vdev_state == VDEV_STATE_REMOVED))
+ return (B_FALSE);
+
+ /* Ignore bogus delay events (like from ioctls or unqueued IOs) */
+ if ((strcmp(subclass, FM_EREPORT_ZFS_DELAY) == 0) &&
+ (zio != NULL) && (!zio->io_timestamp)) {
+ return (B_FALSE);
+ }
+#endif
+ return (B_TRUE);
+}
+
+/*
+ * Post an ereport for the given subclass
+ *
+ * Returns
+ * - 0 if an event was posted
+ * - EINVAL if there was a problem posting event
+ * - EBUSY if the event was rate limited
+ * - EALREADY if the event was already posted (duplicate)
+ */
+int
+zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd,
+ const zbookmark_phys_t *zb, zio_t *zio, uint64_t state)
+{
+ int rc = 0;
+#ifdef _KERNEL
+ nvlist_t *ereport = NULL;
+ nvlist_t *detector = NULL;
+
+ if (!zfs_ereport_is_valid(subclass, spa, vd, zio))
+ return (EINVAL);
+
+ if (zfs_ereport_is_duplicate(subclass, spa, vd, zb, zio, 0, 0))
+ return (SET_ERROR(EALREADY));
+
+ if (zfs_is_ratelimiting_event(subclass, vd))
+ return (SET_ERROR(EBUSY));
+
+ if (!zfs_ereport_start(&ereport, &detector, subclass, spa, vd,
+ zb, zio, state, 0))
+ return (SET_ERROR(EINVAL)); /* couldn't post event */
+
+ if (ereport == NULL)
+ return (SET_ERROR(EINVAL));
+
+ /* Cleanup is handled by the callback function */
+ rc = zfs_zevent_post(ereport, detector, zfs_zevent_post_cb);
+#endif
+ return (rc);
+}
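+
+/*
+ * A minimal caller-side sketch (hypothetical caller; the spa, vd and zio
+ * pointers are assumed to come from the surrounding I/O path):
+ *
+ *     int err = zfs_ereport_post(FM_EREPORT_ZFS_IO, spa, vd,
+ *         &zio->io_bookmark, zio, 0);
+ *     if (err == EALREADY || err == EBUSY)
+ *         return;    (duplicate or rate-limited, nothing more to do)
+ *     if (err == EINVAL)
+ *         return;    (event was invalid or could not be posted)
+ *
+ * err == 0 means the event was queued for consumption by ZED.
+ */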
+
+/*
+ * Prepare a checksum ereport
+ *
+ * Returns
+ * - 0 if an event was posted
+ * - EINVAL if there was a problem posting event
+ * - EBUSY if the event was rate limited
+ * - EALREADY if the event was already posted (duplicate)
+ */
+int
+zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb,
+ struct zio *zio, uint64_t offset, uint64_t length, void *arg,
+ zio_bad_cksum_t *info)
+{
+ zio_cksum_report_t *report;
+
+#ifdef _KERNEL
+ if (!zfs_ereport_is_valid(FM_EREPORT_ZFS_CHECKSUM, spa, vd, zio))
+ return (SET_ERROR(EINVAL));
+
+ if (zfs_ereport_is_duplicate(FM_EREPORT_ZFS_CHECKSUM, spa, vd, zb, zio,
+ offset, length))
+ return (SET_ERROR(EALREADY));
+
+ if (zfs_is_ratelimiting_event(FM_EREPORT_ZFS_CHECKSUM, vd))
+ return (SET_ERROR(EBUSY));
+#endif
+
+ report = kmem_zalloc(sizeof (*report), KM_SLEEP);
+
+ if (zio->io_vsd != NULL)
+ zio->io_vsd_ops->vsd_cksum_report(zio, report, arg);
+ else
+ zio_vsd_default_cksum_report(zio, report, arg);
+
+ /* copy the checksum failure information if it was provided */
+ if (info != NULL) {
+ report->zcr_ckinfo = kmem_zalloc(sizeof (*info), KM_SLEEP);
+ bcopy(info, report->zcr_ckinfo, sizeof (*info));
+ }
+
+ report->zcr_sector = 1ULL << vd->vdev_top->vdev_ashift;
+ report->zcr_align =
+ vdev_psize_to_asize(vd->vdev_top, report->zcr_sector);
+ report->zcr_length = length;
+
+#ifdef _KERNEL
+ (void) zfs_ereport_start(&report->zcr_ereport, &report->zcr_detector,
+ FM_EREPORT_ZFS_CHECKSUM, spa, vd, zb, zio, offset, length);
+
+ if (report->zcr_ereport == NULL) {
+ zfs_ereport_free_checksum(report);
+ return (0);
+ }
+#endif
+
+ mutex_enter(&spa->spa_errlist_lock);
+ report->zcr_next = zio->io_logical->io_cksum_report;
+ zio->io_logical->io_cksum_report = report;
+ mutex_exit(&spa->spa_errlist_lock);
+ return (0);
+}
+
+void
+zfs_ereport_finish_checksum(zio_cksum_report_t *report, const abd_t *good_data,
+ const abd_t *bad_data, boolean_t drop_if_identical)
+{
+#ifdef _KERNEL
+ zfs_ecksum_info_t *info;
+
+ info = annotate_ecksum(report->zcr_ereport, report->zcr_ckinfo,
+ good_data, bad_data, report->zcr_length, drop_if_identical);
+ if (info != NULL)
+ zfs_zevent_post(report->zcr_ereport,
+ report->zcr_detector, zfs_zevent_post_cb);
+ else
+ zfs_zevent_post_cb(report->zcr_ereport, report->zcr_detector);
+
+ report->zcr_ereport = report->zcr_detector = NULL;
+ if (info != NULL)
+ kmem_free(info, sizeof (*info));
+#endif
+}
+
+void
+zfs_ereport_free_checksum(zio_cksum_report_t *rpt)
+{
+#ifdef _KERNEL
+ if (rpt->zcr_ereport != NULL) {
+ fm_nvlist_destroy(rpt->zcr_ereport,
+ FM_NVA_FREE);
+ fm_nvlist_destroy(rpt->zcr_detector,
+ FM_NVA_FREE);
+ }
+#endif
+ rpt->zcr_free(rpt->zcr_cbdata, rpt->zcr_cbinfo);
+
+ if (rpt->zcr_ckinfo != NULL)
+ kmem_free(rpt->zcr_ckinfo, sizeof (*rpt->zcr_ckinfo));
+
+ kmem_free(rpt, sizeof (*rpt));
+}
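+
+/*
+ * Sketch of the deferred checksum-ereport flow described at the top of this
+ * file (hypothetical caller in a read completion path):
+ *
+ *     1. On a checksum error, zfs_ereport_start_checksum() builds the ereport
+ *        and hangs it off zio->io_logical->io_cksum_report.
+ *     2. When the logical zio completes, zfs_ereport_finish_checksum() is
+ *        called with the good/bad abd_t buffers, annotates the differences
+ *        via annotate_ecksum(), and posts the event.
+ *     3. If the report is never finished, zfs_ereport_free_checksum()
+ *        releases it without posting.
+ */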
+
+/*
+ * Post a checksum ereport
+ *
+ * Returns
+ * - 0 if an event was posted
+ * - EINVAL if there was a problem posting event
+ * - EBUSY if the event was rate limited
+ * - EALREADY if the event was already posted (duplicate)
+ */
+int
+zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb,
+ struct zio *zio, uint64_t offset, uint64_t length,
+ const abd_t *good_data, const abd_t *bad_data, zio_bad_cksum_t *zbc)
+{
+ int rc = 0;
+#ifdef _KERNEL
+ nvlist_t *ereport = NULL;
+ nvlist_t *detector = NULL;
+ zfs_ecksum_info_t *info;
+
+ if (!zfs_ereport_is_valid(FM_EREPORT_ZFS_CHECKSUM, spa, vd, zio))
+ return (SET_ERROR(EINVAL));
+
+ if (zfs_ereport_is_duplicate(FM_EREPORT_ZFS_CHECKSUM, spa, vd, zb, zio,
+ offset, length))
+ return (SET_ERROR(EALREADY));
+
+ if (zfs_is_ratelimiting_event(FM_EREPORT_ZFS_CHECKSUM, vd))
+ return (SET_ERROR(EBUSY));
+
+ if (!zfs_ereport_start(&ereport, &detector, FM_EREPORT_ZFS_CHECKSUM,
+ spa, vd, zb, zio, offset, length) || (ereport == NULL)) {
+ return (SET_ERROR(EINVAL));
+ }
+
+ info = annotate_ecksum(ereport, zbc, good_data, bad_data, length,
+ B_FALSE);
+
+ if (info != NULL) {
+ rc = zfs_zevent_post(ereport, detector, zfs_zevent_post_cb);
+ kmem_free(info, sizeof (*info));
+ }
+#endif
+ return (rc);
+}
+
+/*
+ * The 'sysevent.fs.zfs.*' events are signals posted to notify user space of a
+ * change in the pool. All sysevents are listed in sys/sysevent/eventdefs.h
+ * and are designed to be consumed by the ZFS Event Daemon (ZED). For
+ * additional details refer to the zed(8) man page.
+ */
+nvlist_t *
+zfs_event_create(spa_t *spa, vdev_t *vd, const char *type, const char *name,
+ nvlist_t *aux)
+{
+ nvlist_t *resource = NULL;
+#ifdef _KERNEL
+ char class[64];
+
+ if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT)
+ return (NULL);
+
+ if ((resource = fm_nvlist_create(NULL)) == NULL)
+ return (NULL);
+
+ (void) snprintf(class, sizeof (class), "%s.%s.%s", type,
+ ZFS_ERROR_CLASS, name);
+ VERIFY0(nvlist_add_uint8(resource, FM_VERSION, FM_RSRC_VERSION));
+ VERIFY0(nvlist_add_string(resource, FM_CLASS, class));
+ VERIFY0(nvlist_add_string(resource,
+ FM_EREPORT_PAYLOAD_ZFS_POOL, spa_name(spa)));
+ VERIFY0(nvlist_add_uint64(resource,
+ FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, spa_guid(spa)));
+ VERIFY0(nvlist_add_uint64(resource,
+ FM_EREPORT_PAYLOAD_ZFS_POOL_STATE, spa_state(spa)));
+ VERIFY0(nvlist_add_int32(resource,
+ FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT, spa_load_state(spa)));
+
+ if (vd) {
+ VERIFY0(nvlist_add_uint64(resource,
+ FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, vd->vdev_guid));
+ VERIFY0(nvlist_add_uint64(resource,
+ FM_EREPORT_PAYLOAD_ZFS_VDEV_STATE, vd->vdev_state));
+ if (vd->vdev_path != NULL)
+ VERIFY0(nvlist_add_string(resource,
+ FM_EREPORT_PAYLOAD_ZFS_VDEV_PATH, vd->vdev_path));
+ if (vd->vdev_devid != NULL)
+ VERIFY0(nvlist_add_string(resource,
+ FM_EREPORT_PAYLOAD_ZFS_VDEV_DEVID, vd->vdev_devid));
+ if (vd->vdev_fru != NULL)
+ VERIFY0(nvlist_add_string(resource,
+ FM_EREPORT_PAYLOAD_ZFS_VDEV_FRU, vd->vdev_fru));
+ if (vd->vdev_enc_sysfs_path != NULL)
+ VERIFY0(nvlist_add_string(resource,
+ FM_EREPORT_PAYLOAD_ZFS_VDEV_ENC_SYSFS_PATH,
+ vd->vdev_enc_sysfs_path));
+ }
+
+ /* also copy any optional payload data */
+ if (aux) {
+ nvpair_t *elem = NULL;
+
+ while ((elem = nvlist_next_nvpair(aux, elem)) != NULL)
+ (void) nvlist_add_nvpair(resource, elem);
+ }
+
+#endif
+ return (resource);
+}
+
+static void
+zfs_post_common(spa_t *spa, vdev_t *vd, const char *type, const char *name,
+ nvlist_t *aux)
+{
+#ifdef _KERNEL
+ nvlist_t *resource;
+
+ resource = zfs_event_create(spa, vd, type, name, aux);
+ if (resource)
+ zfs_zevent_post(resource, NULL, zfs_zevent_post_cb);
+#endif
+}
+
+/*
+ * The 'resource.fs.zfs.removed' event is an internal signal that the given vdev
+ * has been removed from the system. This will cause the DE to ignore any
+ * recent I/O errors, inferring that they are due to the asynchronous device
+ * removal.
+ */
+void
+zfs_post_remove(spa_t *spa, vdev_t *vd)
+{
+ zfs_post_common(spa, vd, FM_RSRC_CLASS, FM_RESOURCE_REMOVED, NULL);
+}
+
+/*
+ * The 'resource.fs.zfs.autoreplace' event is an internal signal that the pool
+ * has the 'autoreplace' property set, and therefore any broken vdevs will be
+ * handled by higher level logic, and no vdev fault should be generated.
+ */
+void
+zfs_post_autoreplace(spa_t *spa, vdev_t *vd)
+{
+ zfs_post_common(spa, vd, FM_RSRC_CLASS, FM_RESOURCE_AUTOREPLACE, NULL);
+}
+
+/*
+ * The 'resource.fs.zfs.statechange' event is an internal signal that the
+ * given vdev has transitioned its state to DEGRADED or HEALTHY. This will
+ * cause the retire agent to repair any outstanding fault management cases
+ * open because the device was not found (fault.fs.zfs.device).
+ */
+void
+zfs_post_state_change(spa_t *spa, vdev_t *vd, uint64_t laststate)
+{
+#ifdef _KERNEL
+ nvlist_t *aux;
+
+ /*
+ * Add optional supplemental keys to payload
+ */
+ aux = fm_nvlist_create(NULL);
+ if (vd && aux) {
+ if (vd->vdev_physpath) {
+ (void) nvlist_add_string(aux,
+ FM_EREPORT_PAYLOAD_ZFS_VDEV_PHYSPATH,
+ vd->vdev_physpath);
+ }
+ if (vd->vdev_enc_sysfs_path) {
+ (void) nvlist_add_string(aux,
+ FM_EREPORT_PAYLOAD_ZFS_VDEV_ENC_SYSFS_PATH,
+ vd->vdev_enc_sysfs_path);
+ }
+
+ (void) nvlist_add_uint64(aux,
+ FM_EREPORT_PAYLOAD_ZFS_VDEV_LASTSTATE, laststate);
+ }
+
+ zfs_post_common(spa, vd, FM_RSRC_CLASS, FM_RESOURCE_STATECHANGE,
+ aux);
+
+ if (aux)
+ fm_nvlist_destroy(aux, FM_NVA_FREE);
+#endif
+}
+
+#ifdef _KERNEL
+void
+zfs_ereport_init(void)
+{
+ mutex_init(&recent_events_lock, NULL, MUTEX_DEFAULT, NULL);
+ list_create(&recent_events_list, sizeof (recent_events_node_t),
+ offsetof(recent_events_node_t, re_list_link));
+ avl_create(&recent_events_tree, recent_events_compare,
+ sizeof (recent_events_node_t), offsetof(recent_events_node_t,
+ re_tree_link));
+}
+
+/*
+ * This 'early' fini needs to run before zfs_fini() which on Linux waits
+ * for the system_delay_taskq to drain.
+ */
+void
+zfs_ereport_taskq_fini(void)
+{
+ mutex_enter(&recent_events_lock);
+ if (recent_events_cleaner_tqid != 0) {
+ taskq_cancel_id(system_delay_taskq, recent_events_cleaner_tqid);
+ recent_events_cleaner_tqid = 0;
+ }
+ mutex_exit(&recent_events_lock);
+}
+
+void
+zfs_ereport_fini(void)
+{
+ recent_events_node_t *entry;
+
+ while ((entry = list_head(&recent_events_list)) != NULL) {
+ avl_remove(&recent_events_tree, entry);
+ list_remove(&recent_events_list, entry);
+ kmem_free(entry, sizeof (*entry));
+ }
+ avl_destroy(&recent_events_tree);
+ list_destroy(&recent_events_list);
+ mutex_destroy(&recent_events_lock);
+}
+
+EXPORT_SYMBOL(zfs_ereport_post);
+EXPORT_SYMBOL(zfs_ereport_is_valid);
+EXPORT_SYMBOL(zfs_ereport_post_checksum);
+EXPORT_SYMBOL(zfs_post_remove);
+EXPORT_SYMBOL(zfs_post_autoreplace);
+EXPORT_SYMBOL(zfs_post_state_change);
+
+ZFS_MODULE_PARAM(zfs_zevent, zfs_zevent_, retain_max, UINT, ZMOD_RW,
+ "Maximum recent zevents records to retain for duplicate checking");
+ZFS_MODULE_PARAM(zfs_zevent, zfs_zevent_, retain_expire_secs, UINT, ZMOD_RW,
+ "Expiration time for recent zevents records");
+#endif /* _KERNEL */
diff --git a/sys/contrib/openzfs/module/zfs/zfs_fuid.c b/sys/contrib/openzfs/module/zfs/zfs_fuid.c
new file mode 100644
index 000000000000..015dde4811e4
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/zfs_fuid.c
@@ -0,0 +1,815 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/dmu.h>
+#include <sys/avl.h>
+#include <sys/zap.h>
+#include <sys/nvpair.h>
+#ifdef _KERNEL
+#include <sys/sid.h>
+#include <sys/zfs_vfsops.h>
+#include <sys/zfs_znode.h>
+#endif
+#include <sys/zfs_fuid.h>
+
+/*
+ * FUID Domain table(s).
+ *
+ * The FUID table is stored as a packed nvlist of an array
+ * of nvlists which contain an index, domain string and offset
+ *
+ * During file system initialization the nvlist(s) are read and
+ * two AVL trees are created. One tree is keyed by the index number
+ * and the other by the domain string. Nodes are never removed from
+ * trees, but new entries may be added. If a new entry is added then
+ * the zfsvfs->z_fuid_dirty flag is set to true and the caller will then
+ * be responsible for calling zfs_fuid_sync() to sync the changes to disk.
+ *
+ */
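+
+/*
+ * Rough shape of the packed nvlist (illustrative only; see zfs_fuid_sync()
+ * below for the authoritative construction):
+ *
+ *     fuid_nvlist = [
+ *         { fuid_idx = 1, fuid_offset = 0, fuid_domain = "S-1-5-21-..." },
+ *         { fuid_idx = 2, fuid_offset = 0, fuid_domain = "..." },
+ *         ...
+ *     ]
+ */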
+
+#define FUID_IDX "fuid_idx"
+#define FUID_DOMAIN "fuid_domain"
+#define FUID_OFFSET "fuid_offset"
+#define FUID_NVP_ARRAY "fuid_nvlist"
+
+typedef struct fuid_domain {
+ avl_node_t f_domnode;
+ avl_node_t f_idxnode;
+ ksiddomain_t *f_ksid;
+ uint64_t f_idx;
+} fuid_domain_t;
+
+static char *nulldomain = "";
+
+/*
+ * Compare two indexes.
+ */
+static int
+idx_compare(const void *arg1, const void *arg2)
+{
+ const fuid_domain_t *node1 = (const fuid_domain_t *)arg1;
+ const fuid_domain_t *node2 = (const fuid_domain_t *)arg2;
+
+ return (TREE_CMP(node1->f_idx, node2->f_idx));
+}
+
+/*
+ * Compare two domain strings.
+ */
+static int
+domain_compare(const void *arg1, const void *arg2)
+{
+ const fuid_domain_t *node1 = (const fuid_domain_t *)arg1;
+ const fuid_domain_t *node2 = (const fuid_domain_t *)arg2;
+ int val;
+
+ val = strcmp(node1->f_ksid->kd_name, node2->f_ksid->kd_name);
+
+ return (TREE_ISIGN(val));
+}
+
+void
+zfs_fuid_avl_tree_create(avl_tree_t *idx_tree, avl_tree_t *domain_tree)
+{
+ avl_create(idx_tree, idx_compare,
+ sizeof (fuid_domain_t), offsetof(fuid_domain_t, f_idxnode));
+ avl_create(domain_tree, domain_compare,
+ sizeof (fuid_domain_t), offsetof(fuid_domain_t, f_domnode));
+}
+
+/*
+ * Load the initial FUID domain and index trees. This function is used by
+ * both the kernel and zdb.
+ */
+uint64_t
+zfs_fuid_table_load(objset_t *os, uint64_t fuid_obj, avl_tree_t *idx_tree,
+ avl_tree_t *domain_tree)
+{
+ dmu_buf_t *db;
+ uint64_t fuid_size;
+
+ ASSERT(fuid_obj != 0);
+ VERIFY(0 == dmu_bonus_hold(os, fuid_obj,
+ FTAG, &db));
+ fuid_size = *(uint64_t *)db->db_data;
+ dmu_buf_rele(db, FTAG);
+
+ if (fuid_size) {
+ nvlist_t **fuidnvp;
+ nvlist_t *nvp = NULL;
+ uint_t count;
+ char *packed;
+ int i;
+
+ packed = kmem_alloc(fuid_size, KM_SLEEP);
+ VERIFY(dmu_read(os, fuid_obj, 0,
+ fuid_size, packed, DMU_READ_PREFETCH) == 0);
+ VERIFY(nvlist_unpack(packed, fuid_size,
+ &nvp, 0) == 0);
+ VERIFY(nvlist_lookup_nvlist_array(nvp, FUID_NVP_ARRAY,
+ &fuidnvp, &count) == 0);
+
+ for (i = 0; i != count; i++) {
+ fuid_domain_t *domnode;
+ char *domain;
+ uint64_t idx;
+
+ VERIFY(nvlist_lookup_string(fuidnvp[i], FUID_DOMAIN,
+ &domain) == 0);
+ VERIFY(nvlist_lookup_uint64(fuidnvp[i], FUID_IDX,
+ &idx) == 0);
+
+ domnode = kmem_alloc(sizeof (fuid_domain_t), KM_SLEEP);
+
+ domnode->f_idx = idx;
+ domnode->f_ksid = ksid_lookupdomain(domain);
+ avl_add(idx_tree, domnode);
+ avl_add(domain_tree, domnode);
+ }
+ nvlist_free(nvp);
+ kmem_free(packed, fuid_size);
+ }
+ return (fuid_size);
+}
+
+void
+zfs_fuid_table_destroy(avl_tree_t *idx_tree, avl_tree_t *domain_tree)
+{
+ fuid_domain_t *domnode;
+ void *cookie;
+
+ cookie = NULL;
+ while ((domnode = avl_destroy_nodes(domain_tree, &cookie)))
+ ksiddomain_rele(domnode->f_ksid);
+
+ avl_destroy(domain_tree);
+ cookie = NULL;
+ while ((domnode = avl_destroy_nodes(idx_tree, &cookie)))
+ kmem_free(domnode, sizeof (fuid_domain_t));
+ avl_destroy(idx_tree);
+}
+
+char *
+zfs_fuid_idx_domain(avl_tree_t *idx_tree, uint32_t idx)
+{
+ fuid_domain_t searchnode, *findnode;
+ avl_index_t loc;
+
+ searchnode.f_idx = idx;
+
+ findnode = avl_find(idx_tree, &searchnode, &loc);
+
+ return (findnode ? findnode->f_ksid->kd_name : nulldomain);
+}
+
+#ifdef _KERNEL
+/*
+ * Load the fuid table(s) into memory.
+ */
+static void
+zfs_fuid_init(zfsvfs_t *zfsvfs)
+{
+ rw_enter(&zfsvfs->z_fuid_lock, RW_WRITER);
+
+ if (zfsvfs->z_fuid_loaded) {
+ rw_exit(&zfsvfs->z_fuid_lock);
+ return;
+ }
+
+ zfs_fuid_avl_tree_create(&zfsvfs->z_fuid_idx, &zfsvfs->z_fuid_domain);
+
+ (void) zap_lookup(zfsvfs->z_os, MASTER_NODE_OBJ,
+ ZFS_FUID_TABLES, 8, 1, &zfsvfs->z_fuid_obj);
+ if (zfsvfs->z_fuid_obj != 0) {
+ zfsvfs->z_fuid_size = zfs_fuid_table_load(zfsvfs->z_os,
+ zfsvfs->z_fuid_obj, &zfsvfs->z_fuid_idx,
+ &zfsvfs->z_fuid_domain);
+ }
+
+ zfsvfs->z_fuid_loaded = B_TRUE;
+ rw_exit(&zfsvfs->z_fuid_lock);
+}
+
+/*
+ * sync out AVL trees to persistent storage.
+ */
+void
+zfs_fuid_sync(zfsvfs_t *zfsvfs, dmu_tx_t *tx)
+{
+ nvlist_t *nvp;
+ nvlist_t **fuids;
+ size_t nvsize = 0;
+ char *packed;
+ dmu_buf_t *db;
+ fuid_domain_t *domnode;
+ int numnodes;
+ int i;
+
+ if (!zfsvfs->z_fuid_dirty) {
+ return;
+ }
+
+ rw_enter(&zfsvfs->z_fuid_lock, RW_WRITER);
+
+ /*
+ * First, see if the table needs to be created.
+ */
+ if (zfsvfs->z_fuid_obj == 0) {
+ zfsvfs->z_fuid_obj = dmu_object_alloc(zfsvfs->z_os,
+ DMU_OT_FUID, 1 << 14, DMU_OT_FUID_SIZE,
+ sizeof (uint64_t), tx);
+ VERIFY(zap_add(zfsvfs->z_os, MASTER_NODE_OBJ,
+ ZFS_FUID_TABLES, sizeof (uint64_t), 1,
+ &zfsvfs->z_fuid_obj, tx) == 0);
+ }
+
+ VERIFY(nvlist_alloc(&nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+
+ numnodes = avl_numnodes(&zfsvfs->z_fuid_idx);
+ fuids = kmem_alloc(numnodes * sizeof (void *), KM_SLEEP);
+ for (i = 0, domnode = avl_first(&zfsvfs->z_fuid_domain); domnode; i++,
+ domnode = AVL_NEXT(&zfsvfs->z_fuid_domain, domnode)) {
+ VERIFY(nvlist_alloc(&fuids[i], NV_UNIQUE_NAME, KM_SLEEP) == 0);
+ VERIFY(nvlist_add_uint64(fuids[i], FUID_IDX,
+ domnode->f_idx) == 0);
+ VERIFY(nvlist_add_uint64(fuids[i], FUID_OFFSET, 0) == 0);
+ VERIFY(nvlist_add_string(fuids[i], FUID_DOMAIN,
+ domnode->f_ksid->kd_name) == 0);
+ }
+ VERIFY(nvlist_add_nvlist_array(nvp, FUID_NVP_ARRAY,
+ fuids, numnodes) == 0);
+ for (i = 0; i != numnodes; i++)
+ nvlist_free(fuids[i]);
+ kmem_free(fuids, numnodes * sizeof (void *));
+ VERIFY(nvlist_size(nvp, &nvsize, NV_ENCODE_XDR) == 0);
+ packed = kmem_alloc(nvsize, KM_SLEEP);
+ VERIFY(nvlist_pack(nvp, &packed, &nvsize,
+ NV_ENCODE_XDR, KM_SLEEP) == 0);
+ nvlist_free(nvp);
+ zfsvfs->z_fuid_size = nvsize;
+ dmu_write(zfsvfs->z_os, zfsvfs->z_fuid_obj, 0,
+ zfsvfs->z_fuid_size, packed, tx);
+ kmem_free(packed, zfsvfs->z_fuid_size);
+ VERIFY(0 == dmu_bonus_hold(zfsvfs->z_os, zfsvfs->z_fuid_obj,
+ FTAG, &db));
+ dmu_buf_will_dirty(db, tx);
+ *(uint64_t *)db->db_data = zfsvfs->z_fuid_size;
+ dmu_buf_rele(db, FTAG);
+
+ zfsvfs->z_fuid_dirty = B_FALSE;
+ rw_exit(&zfsvfs->z_fuid_lock);
+}
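+
+/*
+ * On-disk summary of the sync above: the FUID object's data blocks hold the
+ * XDR-packed nvlist, and its bonus buffer holds a single uint64_t with the
+ * packed size, which zfs_fuid_table_load() reads back when the table is
+ * (re)loaded.
+ */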
+
+/*
+ * Query domain table for a given domain.
+ *
+ * If the domain isn't found and addok is set, it is added to the AVL trees
+ * and the zfsvfs->z_fuid_dirty flag will be set to TRUE. It will then be
+ * necessary for the caller or another thread to detect the dirty table
+ * and sync out the changes.
+ */
+int
+zfs_fuid_find_by_domain(zfsvfs_t *zfsvfs, const char *domain,
+ char **retdomain, boolean_t addok)
+{
+ fuid_domain_t searchnode, *findnode;
+ avl_index_t loc;
+ krw_t rw = RW_READER;
+
+ /*
+ * If this is the dummy "nobody" domain, then return an index of 0
+ * to cause the created FUID to be a standard POSIX id
+ * for the user nobody.
+ */
+ if (domain[0] == '\0') {
+ if (retdomain)
+ *retdomain = nulldomain;
+ return (0);
+ }
+
+ searchnode.f_ksid = ksid_lookupdomain(domain);
+ if (retdomain)
+ *retdomain = searchnode.f_ksid->kd_name;
+ if (!zfsvfs->z_fuid_loaded)
+ zfs_fuid_init(zfsvfs);
+
+retry:
+ rw_enter(&zfsvfs->z_fuid_lock, rw);
+ findnode = avl_find(&zfsvfs->z_fuid_domain, &searchnode, &loc);
+
+ if (findnode) {
+ rw_exit(&zfsvfs->z_fuid_lock);
+ ksiddomain_rele(searchnode.f_ksid);
+ return (findnode->f_idx);
+ } else if (addok) {
+ fuid_domain_t *domnode;
+ uint64_t retidx;
+
+ if (rw == RW_READER && !rw_tryupgrade(&zfsvfs->z_fuid_lock)) {
+ rw_exit(&zfsvfs->z_fuid_lock);
+ rw = RW_WRITER;
+ goto retry;
+ }
+
+ domnode = kmem_alloc(sizeof (fuid_domain_t), KM_SLEEP);
+ domnode->f_ksid = searchnode.f_ksid;
+
+ retidx = domnode->f_idx = avl_numnodes(&zfsvfs->z_fuid_idx) + 1;
+
+ avl_add(&zfsvfs->z_fuid_domain, domnode);
+ avl_add(&zfsvfs->z_fuid_idx, domnode);
+ zfsvfs->z_fuid_dirty = B_TRUE;
+ rw_exit(&zfsvfs->z_fuid_lock);
+ return (retidx);
+ } else {
+ rw_exit(&zfsvfs->z_fuid_lock);
+ return (-1);
+ }
+}
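+
+/*
+ * Illustrative use of the lookup above (hypothetical caller):
+ *
+ *     char *kdomain;
+ *     int idx = zfs_fuid_find_by_domain(zfsvfs, domain, &kdomain, B_TRUE);
+ *
+ * idx == 0 means the empty "nobody" domain, idx > 0 is a table index
+ * (starting at 1), and -1 is returned only when addok is B_FALSE and the
+ * domain is not present.
+ */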
+
+/*
+ * Query the domain table by index, returning the domain string.
+ *
+ * Returns a pointer to the domain string held in the AVL node.
+ *
+ */
+const char *
+zfs_fuid_find_by_idx(zfsvfs_t *zfsvfs, uint32_t idx)
+{
+ char *domain;
+
+ if (idx == 0 || !zfsvfs->z_use_fuids)
+ return (NULL);
+
+ if (!zfsvfs->z_fuid_loaded)
+ zfs_fuid_init(zfsvfs);
+
+ rw_enter(&zfsvfs->z_fuid_lock, RW_READER);
+
+ if (zfsvfs->z_fuid_obj || zfsvfs->z_fuid_dirty)
+ domain = zfs_fuid_idx_domain(&zfsvfs->z_fuid_idx, idx);
+ else
+ domain = nulldomain;
+ rw_exit(&zfsvfs->z_fuid_lock);
+
+ ASSERT(domain);
+ return (domain);
+}
+
+void
+zfs_fuid_map_ids(znode_t *zp, cred_t *cr, uid_t *uidp, uid_t *gidp)
+{
+ *uidp = zfs_fuid_map_id(ZTOZSB(zp), KUID_TO_SUID(ZTOUID(zp)),
+ cr, ZFS_OWNER);
+ *gidp = zfs_fuid_map_id(ZTOZSB(zp), KGID_TO_SGID(ZTOGID(zp)),
+ cr, ZFS_GROUP);
+}
+
+#ifdef __FreeBSD__
+uid_t
+zfs_fuid_map_id(zfsvfs_t *zfsvfs, uint64_t fuid,
+ cred_t *cr, zfs_fuid_type_t type)
+{
+ uint32_t index = FUID_INDEX(fuid);
+
+ if (index == 0)
+ return (fuid);
+
+ return (UID_NOBODY);
+}
+#elif defined(__linux__)
+uid_t
+zfs_fuid_map_id(zfsvfs_t *zfsvfs, uint64_t fuid,
+ cred_t *cr, zfs_fuid_type_t type)
+{
+ /*
+ * The Linux port only supports POSIX IDs, use the passed id.
+ */
+ return (fuid);
+}
+
+#else
+uid_t
+zfs_fuid_map_id(zfsvfs_t *zfsvfs, uint64_t fuid,
+ cred_t *cr, zfs_fuid_type_t type)
+{
+ uint32_t index = FUID_INDEX(fuid);
+ const char *domain;
+ uid_t id;
+
+ if (index == 0)
+ return (fuid);
+
+ domain = zfs_fuid_find_by_idx(zfsvfs, index);
+ ASSERT(domain != NULL);
+
+ if (type == ZFS_OWNER || type == ZFS_ACE_USER) {
+ (void) kidmap_getuidbysid(crgetzone(cr), domain,
+ FUID_RID(fuid), &id);
+ } else {
+ (void) kidmap_getgidbysid(crgetzone(cr), domain,
+ FUID_RID(fuid), &id);
+ }
+ return (id);
+}
+#endif
+
+/*
+ * Add a FUID node to the list of FUIDs being created for this
+ * ACL.
+ *
+ * If the ACL has multiple domains, then keep only one copy of each unique
+ * domain.
+ */
+void
+zfs_fuid_node_add(zfs_fuid_info_t **fuidpp, const char *domain, uint32_t rid,
+ uint64_t idx, uint64_t id, zfs_fuid_type_t type)
+{
+ zfs_fuid_t *fuid;
+ zfs_fuid_domain_t *fuid_domain;
+ zfs_fuid_info_t *fuidp;
+ uint64_t fuididx;
+ boolean_t found = B_FALSE;
+
+ if (*fuidpp == NULL)
+ *fuidpp = zfs_fuid_info_alloc();
+
+ fuidp = *fuidpp;
+ /*
+ * First, find the FUID domain index in the linked list.
+ *
+ * If one isn't found, then create an entry.
+ */
+
+ for (fuididx = 1, fuid_domain = list_head(&fuidp->z_domains);
+ fuid_domain; fuid_domain = list_next(&fuidp->z_domains,
+ fuid_domain), fuididx++) {
+ if (idx == fuid_domain->z_domidx) {
+ found = B_TRUE;
+ break;
+ }
+ }
+
+ if (!found) {
+ fuid_domain = kmem_alloc(sizeof (zfs_fuid_domain_t), KM_SLEEP);
+ fuid_domain->z_domain = domain;
+ fuid_domain->z_domidx = idx;
+ list_insert_tail(&fuidp->z_domains, fuid_domain);
+ fuidp->z_domain_str_sz += strlen(domain) + 1;
+ fuidp->z_domain_cnt++;
+ }
+
+ if (type == ZFS_ACE_USER || type == ZFS_ACE_GROUP) {
+
+ /*
+ * Now allocate fuid entry and add it on the end of the list
+ */
+
+ fuid = kmem_alloc(sizeof (zfs_fuid_t), KM_SLEEP);
+ fuid->z_id = id;
+ fuid->z_domidx = idx;
+ fuid->z_logfuid = FUID_ENCODE(fuididx, rid);
+
+ list_insert_tail(&fuidp->z_fuids, fuid);
+ fuidp->z_fuid_cnt++;
+ } else {
+ if (type == ZFS_OWNER)
+ fuidp->z_fuid_owner = FUID_ENCODE(fuididx, rid);
+ else
+ fuidp->z_fuid_group = FUID_ENCODE(fuididx, rid);
+ }
+}
+
+#ifdef HAVE_KSID
+/*
+ * Create a file system FUID, based on information in the users cred
+ *
+ * If cred contains KSID_OWNER then it should be used to determine
+ * the uid otherwise cred's uid will be used. By default cred's gid
+ * is used unless it's an ephemeral ID in which case KSID_GROUP will
+ * be used if it exists.
+ */
+uint64_t
+zfs_fuid_create_cred(zfsvfs_t *zfsvfs, zfs_fuid_type_t type,
+ cred_t *cr, zfs_fuid_info_t **fuidp)
+{
+ uint64_t idx;
+ ksid_t *ksid;
+ uint32_t rid;
+ char *kdomain;
+ const char *domain;
+ uid_t id;
+
+ VERIFY(type == ZFS_OWNER || type == ZFS_GROUP);
+
+ ksid = crgetsid(cr, (type == ZFS_OWNER) ? KSID_OWNER : KSID_GROUP);
+
+ if (!zfsvfs->z_use_fuids || (ksid == NULL)) {
+ id = (type == ZFS_OWNER) ? crgetuid(cr) : crgetgid(cr);
+
+ if (IS_EPHEMERAL(id))
+ return ((type == ZFS_OWNER) ? UID_NOBODY : GID_NOBODY);
+
+ return ((uint64_t)id);
+ }
+
+ /*
+ * ksid is present and FUID is supported
+ */
+ id = (type == ZFS_OWNER) ? ksid_getid(ksid) : crgetgid(cr);
+
+ if (!IS_EPHEMERAL(id))
+ return ((uint64_t)id);
+
+ if (type == ZFS_GROUP)
+ id = ksid_getid(ksid);
+
+ rid = ksid_getrid(ksid);
+ domain = ksid_getdomain(ksid);
+
+ idx = zfs_fuid_find_by_domain(zfsvfs, domain, &kdomain, B_TRUE);
+
+ zfs_fuid_node_add(fuidp, kdomain, rid, idx, id, type);
+
+ return (FUID_ENCODE(idx, rid));
+}
+#endif /* HAVE_KSID */
+
+/*
+ * Create a file system FUID for an ACL ace
+ * or a chown/chgrp of the file.
+ * This is similar to zfs_fuid_create_cred, except that
+ * we can't find the domain + rid information in the
+ * cred. Instead we have to query Winchester for the
+ * domain and rid.
+ *
+ * During replay operations the domain+rid information is
+ * found in the zfs_fuid_info_t that the replay code has
+ * attached to the zfsvfs of the file system.
+ */
+uint64_t
+zfs_fuid_create(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr,
+ zfs_fuid_type_t type, zfs_fuid_info_t **fuidpp)
+{
+#ifdef HAVE_KSID
+ const char *domain;
+ char *kdomain;
+ uint32_t fuid_idx = FUID_INDEX(id);
+ uint32_t rid = 0;
+ idmap_stat status;
+ uint64_t idx = UID_NOBODY;
+ zfs_fuid_t *zfuid = NULL;
+ zfs_fuid_info_t *fuidp = NULL;
+
+ /*
+ * If POSIX ID, or entry is already a FUID then
+ * just return the id
+ *
+ * We may also be handed an already FUID'ized id via
+ * chmod.
+ */
+
+ if (!zfsvfs->z_use_fuids || !IS_EPHEMERAL(id) || fuid_idx != 0)
+ return (id);
+
+ if (zfsvfs->z_replay) {
+ fuidp = zfsvfs->z_fuid_replay;
+
+ /*
+ * If we are passed an ephemeral id, but no
+ * fuid_info was logged, then return NOBODY.
+ * This is most likely a result of the idmap service
+ * not being available.
+ */
+ if (fuidp == NULL)
+ return (UID_NOBODY);
+
+ VERIFY3U(type, >=, ZFS_OWNER);
+ VERIFY3U(type, <=, ZFS_ACE_GROUP);
+
+ switch (type) {
+ case ZFS_ACE_USER:
+ case ZFS_ACE_GROUP:
+ zfuid = list_head(&fuidp->z_fuids);
+ rid = FUID_RID(zfuid->z_logfuid);
+ idx = FUID_INDEX(zfuid->z_logfuid);
+ break;
+ case ZFS_OWNER:
+ rid = FUID_RID(fuidp->z_fuid_owner);
+ idx = FUID_INDEX(fuidp->z_fuid_owner);
+ break;
+ case ZFS_GROUP:
+ rid = FUID_RID(fuidp->z_fuid_group);
+ idx = FUID_INDEX(fuidp->z_fuid_group);
+ break;
+ };
+ domain = fuidp->z_domain_table[idx - 1];
+ } else {
+ if (type == ZFS_OWNER || type == ZFS_ACE_USER)
+ status = kidmap_getsidbyuid(crgetzone(cr), id,
+ &domain, &rid);
+ else
+ status = kidmap_getsidbygid(crgetzone(cr), id,
+ &domain, &rid);
+
+ if (status != 0) {
+ /*
+ * When returning nobody we will need to
+ * make a dummy fuid table entry for logging
+ * purposes.
+ */
+ rid = UID_NOBODY;
+ domain = nulldomain;
+ }
+ }
+
+ idx = zfs_fuid_find_by_domain(zfsvfs, domain, &kdomain, B_TRUE);
+
+ if (!zfsvfs->z_replay)
+ zfs_fuid_node_add(fuidpp, kdomain,
+ rid, idx, id, type);
+ else if (zfuid != NULL) {
+ list_remove(&fuidp->z_fuids, zfuid);
+ kmem_free(zfuid, sizeof (zfs_fuid_t));
+ }
+ return (FUID_ENCODE(idx, rid));
+#else
+ /*
+ * The Linux port only supports POSIX IDs, use the passed id.
+ */
+ return (id);
+#endif
+}
+
+void
+zfs_fuid_destroy(zfsvfs_t *zfsvfs)
+{
+ rw_enter(&zfsvfs->z_fuid_lock, RW_WRITER);
+ if (!zfsvfs->z_fuid_loaded) {
+ rw_exit(&zfsvfs->z_fuid_lock);
+ return;
+ }
+ zfs_fuid_table_destroy(&zfsvfs->z_fuid_idx, &zfsvfs->z_fuid_domain);
+ rw_exit(&zfsvfs->z_fuid_lock);
+}
+
+/*
+ * Allocate zfs_fuid_info for tracking FUIDs created during
+ * zfs_mknode, VOP_SETATTR() or VOP_SETSECATTR()
+ */
+zfs_fuid_info_t *
+zfs_fuid_info_alloc(void)
+{
+ zfs_fuid_info_t *fuidp;
+
+ fuidp = kmem_zalloc(sizeof (zfs_fuid_info_t), KM_SLEEP);
+ list_create(&fuidp->z_domains, sizeof (zfs_fuid_domain_t),
+ offsetof(zfs_fuid_domain_t, z_next));
+ list_create(&fuidp->z_fuids, sizeof (zfs_fuid_t),
+ offsetof(zfs_fuid_t, z_next));
+ return (fuidp);
+}
+
+/*
+ * Release all memory associated with zfs_fuid_info_t
+ */
+void
+zfs_fuid_info_free(zfs_fuid_info_t *fuidp)
+{
+ zfs_fuid_t *zfuid;
+ zfs_fuid_domain_t *zdomain;
+
+ while ((zfuid = list_head(&fuidp->z_fuids)) != NULL) {
+ list_remove(&fuidp->z_fuids, zfuid);
+ kmem_free(zfuid, sizeof (zfs_fuid_t));
+ }
+
+ if (fuidp->z_domain_table != NULL)
+ kmem_free(fuidp->z_domain_table,
+ (sizeof (char *)) * fuidp->z_domain_cnt);
+
+ while ((zdomain = list_head(&fuidp->z_domains)) != NULL) {
+ list_remove(&fuidp->z_domains, zdomain);
+ kmem_free(zdomain, sizeof (zfs_fuid_domain_t));
+ }
+
+ kmem_free(fuidp, sizeof (zfs_fuid_info_t));
+}
+
+/*
+ * Check to see if id is a groupmember. If the cred
+ * has ksid info, then the sidlist is checked first; if the id is
+ * still not found there, the POSIX groups are checked.
+ *
+ * Will use a straight FUID compare when possible.
+ */
+boolean_t
+zfs_groupmember(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr)
+{
+#ifdef HAVE_KSID
+ uid_t gid;
+
+#ifdef illumos
+ ksid_t *ksid = crgetsid(cr, KSID_GROUP);
+ ksidlist_t *ksidlist = crgetsidlist(cr);
+
+ if (ksid && ksidlist) {
+ int i;
+ ksid_t *ksid_groups;
+ uint32_t idx = FUID_INDEX(id);
+ uint32_t rid = FUID_RID(id);
+
+ ksid_groups = ksidlist->ksl_sids;
+
+ for (i = 0; i != ksidlist->ksl_nsid; i++) {
+ if (idx == 0) {
+ if (id != IDMAP_WK_CREATOR_GROUP_GID &&
+ id == ksid_groups[i].ks_id) {
+ return (B_TRUE);
+ }
+ } else {
+ const char *domain;
+
+ domain = zfs_fuid_find_by_idx(zfsvfs, idx);
+ ASSERT(domain != NULL);
+
+ if (strcmp(domain,
+ IDMAP_WK_CREATOR_SID_AUTHORITY) == 0)
+ return (B_FALSE);
+
+ if ((strcmp(domain,
+ ksid_groups[i].ks_domain->kd_name) == 0) &&
+ rid == ksid_groups[i].ks_rid)
+ return (B_TRUE);
+ }
+ }
+ }
+#endif /* illumos */
+
+ /*
+ * Not found in ksidlist, check posix groups
+ */
+ gid = zfs_fuid_map_id(zfsvfs, id, cr, ZFS_GROUP);
+ return (groupmember(gid, cr));
+#else
+ return (B_TRUE);
+#endif
+}
+
+void
+zfs_fuid_txhold(zfsvfs_t *zfsvfs, dmu_tx_t *tx)
+{
+ if (zfsvfs->z_fuid_obj == 0) {
+ dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
+ dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
+ FUID_SIZE_ESTIMATE(zfsvfs));
+ dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, FALSE, NULL);
+ } else {
+ dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj);
+ dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0,
+ FUID_SIZE_ESTIMATE(zfsvfs));
+ }
+}
+
+/*
+ * buf must be big enough (e.g., 32 bytes)
+ */
+int
+zfs_id_to_fuidstr(zfsvfs_t *zfsvfs, const char *domain, uid_t rid,
+ char *buf, size_t len, boolean_t addok)
+{
+ uint64_t fuid;
+ int domainid = 0;
+
+ if (domain && domain[0]) {
+ domainid = zfs_fuid_find_by_domain(zfsvfs, domain, NULL, addok);
+ if (domainid == -1)
+ return (SET_ERROR(ENOENT));
+ }
+ fuid = FUID_ENCODE(domainid, rid);
+ (void) snprintf(buf, len, "%llx", (longlong_t)fuid);
+ return (0);
+}
+#endif
diff --git a/sys/contrib/openzfs/module/zfs/zfs_ioctl.c b/sys/contrib/openzfs/module/zfs/zfs_ioctl.c
new file mode 100644
index 000000000000..0e35fd069cbb
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/zfs_ioctl.c
@@ -0,0 +1,7688 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Portions Copyright 2011 Martin Matuska
+ * Copyright 2015, OmniTI Computer Consulting, Inc. All rights reserved.
+ * Portions Copyright 2012 Pawel Jakub Dawidek <pawel@dawidek.net>
+ * Copyright (c) 2014, 2016 Joyent, Inc. All rights reserved.
+ * Copyright 2016 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2014, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2011, 2020 by Delphix. All rights reserved.
+ * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2013 Steven Hartland. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
+ * Copyright 2016 Toomas Soome <tsoome@me.com>
+ * Copyright (c) 2016 Actifio, Inc. All rights reserved.
+ * Copyright (c) 2018, loli10K <ezomori.nozomu@gmail.com>. All rights reserved.
+ * Copyright 2017 RackTop Systems.
+ * Copyright (c) 2017 Open-E, Inc. All Rights Reserved.
+ * Copyright (c) 2019 Datto Inc.
+ * Copyright (c) 2019, 2020 by Christian Schwarz. All rights reserved.
+ * Copyright (c) 2019, Klara Inc.
+ * Copyright (c) 2019, Allan Jude
+ */
+
+/*
+ * ZFS ioctls.
+ *
+ * This file handles the ioctls to /dev/zfs, used for configuring ZFS storage
+ * pools and filesystems, e.g. with /sbin/zfs and /sbin/zpool.
+ *
+ * There are two ways that we handle ioctls: the legacy way where almost
+ * all of the logic is in the ioctl callback, and the new way where most
+ * of the marshalling is handled in the common entry point, zfsdev_ioctl().
+ *
+ * Non-legacy ioctls should be registered by calling
+ * zfs_ioctl_register() from zfs_ioctl_init(). The ioctl is invoked
+ * from userland by lzc_ioctl().
+ *
+ * The registration arguments are as follows:
+ *
+ * const char *name
+ * The name of the ioctl. This is used for history logging. If the
+ * ioctl returns successfully (the callback returns 0), and allow_log
+ * is true, then a history log entry will be recorded with the input &
+ * output nvlists. The log entry can be printed with "zpool history -i".
+ *
+ * zfs_ioc_t ioc
+ * The ioctl request number, which userland will pass to ioctl(2).
+ * We want newer versions of libzfs and libzfs_core to run against
+ * existing zfs kernel modules (i.e. a deferred reboot after an update).
+ * Therefore the ioctl numbers cannot change from release to release.
+ *
+ * zfs_secpolicy_func_t *secpolicy
+ * This function will be called before the zfs_ioc_func_t, to
+ * determine if this operation is permitted. It should return EPERM
+ * on failure, and 0 on success. Checks include determining if the
+ * dataset is visible in this zone, and if the user has either all
+ * zfs privileges in the zone (SYS_MOUNT), or has been granted permission
+ * to do this operation on this dataset with "zfs allow".
+ *
+ * zfs_ioc_namecheck_t namecheck
+ * This specifies what to expect in the zfs_cmd_t:zc_name -- a pool
+ * name, a dataset name, or nothing. If the name is not well-formed,
+ * the ioctl will fail and the callback will not be called.
+ * Therefore, the callback can assume that the name is well-formed
+ * (e.g. is null-terminated, doesn't have more than one '@' character,
+ * doesn't have invalid characters).
+ *
+ * zfs_ioc_poolcheck_t pool_check
+ * This specifies requirements on the pool state. If the pool does
+ * not meet them (is suspended or is readonly), the ioctl will fail
+ * and the callback will not be called. If any checks are specified
+ * (i.e. it is not POOL_CHECK_NONE), namecheck must not be NO_NAME.
+ * Multiple checks can be or-ed together (e.g. POOL_CHECK_SUSPENDED |
+ * POOL_CHECK_READONLY).
+ *
+ * zfs_ioc_key_t *nvl_keys
+ * The list of expected/allowable innvl input keys. This list is used
+ * to validate the nvlist input to the ioctl.
+ *
+ * boolean_t smush_outnvlist
+ * If smush_outnvlist is true, then the output is presumed to be a
+ * list of errors, and it will be "smushed" down to fit into the
+ * caller's buffer, by removing some entries and replacing them with a
+ * single "N_MORE_ERRORS" entry indicating how many were removed. See
+ * nvlist_smush() for details. If smush_outnvlist is false, and the
+ * outnvlist does not fit into the userland-provided buffer, then the
+ * ioctl will fail with ENOMEM.
+ *
+ * zfs_ioc_func_t *func
+ * The callback function that will perform the operation.
+ *
+ * The callback should return 0 on success, or an error number on
+ * failure. If the function fails, the userland ioctl will return -1,
+ * and errno will be set to the callback's return value. The callback
+ * will be called with the following arguments:
+ *
+ * const char *name
+ * The name of the pool or dataset to operate on, from
+ * zfs_cmd_t:zc_name. The 'namecheck' argument specifies the
+ * expected type (pool, dataset, or none).
+ *
+ * nvlist_t *innvl
+ * The input nvlist, deserialized from zfs_cmd_t:zc_nvlist_src. Or
+ * NULL if no input nvlist was provided. Changes to this nvlist are
+ * ignored. If the input nvlist could not be deserialized, the
+ * ioctl will fail and the callback will not be called.
+ *
+ * nvlist_t *outnvl
+ * The output nvlist, initially empty. The callback can fill it in,
+ * and it will be returned to userland by serializing it into
+ * zfs_cmd_t:zc_nvlist_dst. If it is non-empty, and serialization
+ * fails (e.g. because the caller didn't supply a large enough
+ * buffer), then the overall ioctl will fail. See the
+ * 'smush_outnvlist' argument above for additional behaviors.
+ *
+ * There are two typical uses of the output nvlist:
+ * - To return state, e.g. property values. In this case,
+ * smush_outnvlist should be false. If the buffer was not large
+ * enough, the caller will reallocate a larger buffer and try
+ * the ioctl again.
+ *
+ * - To return multiple errors from an ioctl which makes on-disk
+ * changes. In this case, smush_outnvlist should be true.
+ * Ioctls which make on-disk modifications should generally not
+ * use the outnvl if they succeed, because the caller cannot
+ * distinguish between the operation failing and
+ * deserialization failing.
+ *
+ * IOCTL Interface Errors
+ *
+ * The following ioctl input errors can be returned:
+ * ZFS_ERR_IOC_CMD_UNAVAIL the ioctl number is not supported by kernel
+ * ZFS_ERR_IOC_ARG_UNAVAIL an input argument is not supported by kernel
+ * ZFS_ERR_IOC_ARG_REQUIRED a required input argument is missing
+ * ZFS_ERR_IOC_ARG_BADTYPE an input argument has an invalid type
+ */
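+
+/*
+ * Editor's illustration, not part of the upstream source: a sketch of how a
+ * non-legacy ioctl is wired up using the arguments described above. The
+ * exact parameter order shown here is an assumption and is fixed by the
+ * zfs_ioctl_register() definition further down in this file; the snapshot
+ * ioctl is used purely as an example.
+ *
+ *	zfs_ioctl_register("snapshot", ZFS_IOC_SNAPSHOT,
+ *	    zfs_ioc_snapshot, zfs_secpolicy_snapshot, POOL_NAME,
+ *	    POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY,
+ *	    B_TRUE, B_TRUE, zfs_keys_snapshot, ARRAY_SIZE(zfs_keys_snapshot));
+ *
+ * i.e. name, request number, callback, secpolicy, namecheck, pool checks,
+ * smush_outnvlist, allow_log, and the expected innvl key list with its
+ * length.
+ */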
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/errno.h>
+#include <sys/uio.h>
+#include <sys/file.h>
+#include <sys/kmem.h>
+#include <sys/cmn_err.h>
+#include <sys/stat.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zfs_quota.h>
+#include <sys/zfs_vfsops.h>
+#include <sys/zfs_znode.h>
+#include <sys/zap.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/vdev.h>
+#include <sys/vdev_impl.h>
+#include <sys/dmu.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_prop.h>
+#include <sys/dsl_deleg.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_impl.h>
+#include <sys/dmu_redact.h>
+#include <sys/dmu_tx.h>
+#include <sys/sunddi.h>
+#include <sys/policy.h>
+#include <sys/zone.h>
+#include <sys/nvpair.h>
+#include <sys/pathname.h>
+#include <sys/fs/zfs.h>
+#include <sys/zfs_ctldir.h>
+#include <sys/zfs_dir.h>
+#include <sys/zfs_onexit.h>
+#include <sys/zvol.h>
+#include <sys/dsl_scan.h>
+#include <sys/fm/util.h>
+#include <sys/dsl_crypt.h>
+#include <sys/rrwlock.h>
+#include <sys/zfs_file.h>
+
+#include <sys/dmu_recv.h>
+#include <sys/dmu_send.h>
+#include <sys/dsl_destroy.h>
+#include <sys/dsl_bookmark.h>
+#include <sys/dsl_userhold.h>
+#include <sys/zfeature.h>
+#include <sys/zcp.h>
+#include <sys/zio_checksum.h>
+#include <sys/vdev_removal.h>
+#include <sys/vdev_initialize.h>
+#include <sys/vdev_trim.h>
+
+#include "zfs_namecheck.h"
+#include "zfs_prop.h"
+#include "zfs_deleg.h"
+#include "zfs_comutil.h"
+
+#include <sys/lua/lua.h>
+#include <sys/lua/lauxlib.h>
+#include <sys/zfs_ioctl_impl.h>
+
+kmutex_t zfsdev_state_lock;
+zfsdev_state_t *zfsdev_state_list;
+
+/*
+ * Limit maximum nvlist size. We don't want users passing in insane values
+ * for zc->zc_nvlist_src_size, since we will need to allocate that much memory.
+ * Defaults to 0=auto which is handled by platform code.
+ */
+unsigned long zfs_max_nvlist_src_size = 0;
+
+/*
+ * When logging the output nvlist of an ioctl in the on-disk history, limit
+ * the logged size to this many bytes. This must be less than DMU_MAX_ACCESS.
+ * This applies primarily to zfs_ioc_channel_program().
+ */
+unsigned long zfs_history_output_max = 1024 * 1024;
+
+uint_t zfs_fsyncer_key;
+uint_t zfs_allow_log_key;
+
+/* DATA_TYPE_ANY is used when zkey_type can vary. */
+#define DATA_TYPE_ANY DATA_TYPE_UNKNOWN
+
+typedef struct zfs_ioc_vec {
+ zfs_ioc_legacy_func_t *zvec_legacy_func;
+ zfs_ioc_func_t *zvec_func;
+ zfs_secpolicy_func_t *zvec_secpolicy;
+ zfs_ioc_namecheck_t zvec_namecheck;
+ boolean_t zvec_allow_log;
+ zfs_ioc_poolcheck_t zvec_pool_check;
+ boolean_t zvec_smush_outnvlist;
+ const char *zvec_name;
+ const zfs_ioc_key_t *zvec_nvl_keys;
+ size_t zvec_nvl_key_count;
+} zfs_ioc_vec_t;
+
+/* This array is indexed by zfs_userquota_prop_t */
+static const char *userquota_perms[] = {
+ ZFS_DELEG_PERM_USERUSED,
+ ZFS_DELEG_PERM_USERQUOTA,
+ ZFS_DELEG_PERM_GROUPUSED,
+ ZFS_DELEG_PERM_GROUPQUOTA,
+ ZFS_DELEG_PERM_USEROBJUSED,
+ ZFS_DELEG_PERM_USEROBJQUOTA,
+ ZFS_DELEG_PERM_GROUPOBJUSED,
+ ZFS_DELEG_PERM_GROUPOBJQUOTA,
+ ZFS_DELEG_PERM_PROJECTUSED,
+ ZFS_DELEG_PERM_PROJECTQUOTA,
+ ZFS_DELEG_PERM_PROJECTOBJUSED,
+ ZFS_DELEG_PERM_PROJECTOBJQUOTA,
+};
+
+static int zfs_ioc_userspace_upgrade(zfs_cmd_t *zc);
+static int zfs_ioc_id_quota_upgrade(zfs_cmd_t *zc);
+static int zfs_check_settable(const char *name, nvpair_t *property,
+ cred_t *cr);
+static int zfs_check_clearable(const char *dataset, nvlist_t *props,
+ nvlist_t **errors);
+static int zfs_fill_zplprops_root(uint64_t, nvlist_t *, nvlist_t *,
+ boolean_t *);
+int zfs_set_prop_nvlist(const char *, zprop_source_t, nvlist_t *, nvlist_t *);
+static int get_nvlist(uint64_t nvl, uint64_t size, int iflag, nvlist_t **nvp);
+
+static void
+history_str_free(char *buf)
+{
+ kmem_free(buf, HIS_MAX_RECORD_LEN);
+}
+
+static char *
+history_str_get(zfs_cmd_t *zc)
+{
+ char *buf;
+
+ if (zc->zc_history == 0)
+ return (NULL);
+
+ buf = kmem_alloc(HIS_MAX_RECORD_LEN, KM_SLEEP);
+ if (copyinstr((void *)(uintptr_t)zc->zc_history,
+ buf, HIS_MAX_RECORD_LEN, NULL) != 0) {
+ history_str_free(buf);
+ return (NULL);
+ }
+
+ buf[HIS_MAX_RECORD_LEN -1] = '\0';
+
+ return (buf);
+}
+
+/*
+ * Return non-zero if the spa version is less than requested version.
+ */
+static int
+zfs_earlier_version(const char *name, int version)
+{
+ spa_t *spa;
+
+ if (spa_open(name, &spa, FTAG) == 0) {
+ if (spa_version(spa) < version) {
+ spa_close(spa, FTAG);
+ return (1);
+ }
+ spa_close(spa, FTAG);
+ }
+ return (0);
+}
+
+/*
+ * Return TRUE if the ZPL version is less than requested version.
+ */
+static boolean_t
+zpl_earlier_version(const char *name, int version)
+{
+ objset_t *os;
+ boolean_t rc = B_TRUE;
+
+ if (dmu_objset_hold(name, FTAG, &os) == 0) {
+ uint64_t zplversion;
+
+ if (dmu_objset_type(os) != DMU_OST_ZFS) {
+ dmu_objset_rele(os, FTAG);
+ return (B_TRUE);
+ }
+ /* XXX reading from non-owned objset */
+ if (zfs_get_zplprop(os, ZFS_PROP_VERSION, &zplversion) == 0)
+ rc = zplversion < version;
+ dmu_objset_rele(os, FTAG);
+ }
+ return (rc);
+}
+
+static void
+zfs_log_history(zfs_cmd_t *zc)
+{
+ spa_t *spa;
+ char *buf;
+
+ if ((buf = history_str_get(zc)) == NULL)
+ return;
+
+ if (spa_open(zc->zc_name, &spa, FTAG) == 0) {
+ if (spa_version(spa) >= SPA_VERSION_ZPOOL_HISTORY)
+ (void) spa_history_log(spa, buf);
+ spa_close(spa, FTAG);
+ }
+ history_str_free(buf);
+}
+
+/*
+ * Policy for top-level read operations (list pools). Requires no privileges,
+ * and can be used in the local zone, as there is no associated dataset.
+ */
+/* ARGSUSED */
+static int
+zfs_secpolicy_none(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
+{
+ return (0);
+}
+
+/*
+ * Policy for dataset read operations (list children, get statistics). Requires
+ * no privileges, but must be visible in the local zone.
+ */
+/* ARGSUSED */
+static int
+zfs_secpolicy_read(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
+{
+ if (INGLOBALZONE(curproc) ||
+ zone_dataset_visible(zc->zc_name, NULL))
+ return (0);
+
+ return (SET_ERROR(ENOENT));
+}
+
+static int
+zfs_dozonecheck_impl(const char *dataset, uint64_t zoned, cred_t *cr)
+{
+ int writable = 1;
+
+ /*
+ * The dataset must be visible by this zone -- check this first
+ * so they don't see EPERM on something they shouldn't know about.
+ */
+ if (!INGLOBALZONE(curproc) &&
+ !zone_dataset_visible(dataset, &writable))
+ return (SET_ERROR(ENOENT));
+
+ if (INGLOBALZONE(curproc)) {
+ /*
+ * If the fs is zoned, only root can access it from the
+ * global zone.
+ */
+ if (secpolicy_zfs(cr) && zoned)
+ return (SET_ERROR(EPERM));
+ } else {
+ /*
+ * If we are in a local zone, the 'zoned' property must be set.
+ */
+ if (!zoned)
+ return (SET_ERROR(EPERM));
+
+ /* must be writable by this zone */
+ if (!writable)
+ return (SET_ERROR(EPERM));
+ }
+ return (0);
+}
+
+static int
+zfs_dozonecheck(const char *dataset, cred_t *cr)
+{
+ uint64_t zoned;
+
+ if (dsl_prop_get_integer(dataset, zfs_prop_to_name(ZFS_PROP_ZONED),
+ &zoned, NULL))
+ return (SET_ERROR(ENOENT));
+
+ return (zfs_dozonecheck_impl(dataset, zoned, cr));
+}
+
+static int
+zfs_dozonecheck_ds(const char *dataset, dsl_dataset_t *ds, cred_t *cr)
+{
+ uint64_t zoned;
+
+ if (dsl_prop_get_int_ds(ds, zfs_prop_to_name(ZFS_PROP_ZONED), &zoned))
+ return (SET_ERROR(ENOENT));
+
+ return (zfs_dozonecheck_impl(dataset, zoned, cr));
+}
+
+static int
+zfs_secpolicy_write_perms_ds(const char *name, dsl_dataset_t *ds,
+ const char *perm, cred_t *cr)
+{
+ int error;
+
+ error = zfs_dozonecheck_ds(name, ds, cr);
+ if (error == 0) {
+ error = secpolicy_zfs(cr);
+ if (error != 0)
+ error = dsl_deleg_access_impl(ds, perm, cr);
+ }
+ return (error);
+}
+
+static int
+zfs_secpolicy_write_perms(const char *name, const char *perm, cred_t *cr)
+{
+ int error;
+ dsl_dataset_t *ds;
+ dsl_pool_t *dp;
+
+ /*
+ * First do a quick check for root in the global zone, which
+ * is allowed to do all write_perms. This ensures that zfs_ioc_*
+ * will get to handle nonexistent datasets.
+ */
+ if (INGLOBALZONE(curproc) && secpolicy_zfs(cr) == 0)
+ return (0);
+
+ error = dsl_pool_hold(name, FTAG, &dp);
+ if (error != 0)
+ return (error);
+
+ error = dsl_dataset_hold(dp, name, FTAG, &ds);
+ if (error != 0) {
+ dsl_pool_rele(dp, FTAG);
+ return (error);
+ }
+
+ error = zfs_secpolicy_write_perms_ds(name, ds, perm, cr);
+
+ dsl_dataset_rele(ds, FTAG);
+ dsl_pool_rele(dp, FTAG);
+ return (error);
+}
+
+/*
+ * Policy for setting the security label property.
+ *
+ * Returns 0 for success, non-zero for access and other errors.
+ */
+static int
+zfs_set_slabel_policy(const char *name, const char *strval, cred_t *cr)
+{
+#ifdef HAVE_MLSLABEL
+ char ds_hexsl[MAXNAMELEN];
+ bslabel_t ds_sl, new_sl;
+ boolean_t new_default = FALSE;
+ uint64_t zoned;
+ int needed_priv = -1;
+ int error;
+
+ /* First get the existing dataset label. */
+ error = dsl_prop_get(name, zfs_prop_to_name(ZFS_PROP_MLSLABEL),
+ 1, sizeof (ds_hexsl), &ds_hexsl, NULL);
+ if (error != 0)
+ return (SET_ERROR(EPERM));
+
+ if (strcasecmp(strval, ZFS_MLSLABEL_DEFAULT) == 0)
+ new_default = TRUE;
+
+ /* The label must be translatable */
+ if (!new_default && (hexstr_to_label(strval, &new_sl) != 0))
+ return (SET_ERROR(EINVAL));
+
+ /*
+ * In a non-global zone, disallow attempts to set a label that
+ * doesn't match that of the zone; otherwise no other checks
+ * are needed.
+ */
+ if (!INGLOBALZONE(curproc)) {
+ if (new_default || !blequal(&new_sl, CR_SL(CRED())))
+ return (SET_ERROR(EPERM));
+ return (0);
+ }
+
+ /*
+ * For global-zone datasets (i.e., those whose zoned property is
+ * "off", verify that the specified new label is valid for the
+ * global zone.
+ */
+ if (dsl_prop_get_integer(name,
+ zfs_prop_to_name(ZFS_PROP_ZONED), &zoned, NULL))
+ return (SET_ERROR(EPERM));
+ if (!zoned) {
+ if (zfs_check_global_label(name, strval) != 0)
+ return (SET_ERROR(EPERM));
+ }
+
+ /*
+ * If the existing dataset label is nondefault, check if the
+ * dataset is mounted (label cannot be changed while mounted).
+ * Get the zfsvfs_t; if there isn't one, then the dataset isn't
+ * mounted (or isn't a dataset, doesn't exist, ...).
+ */
+ if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) != 0) {
+ objset_t *os;
+ static const char *setsl_tag = "setsl_tag";
+
+ /*
+ * Try to own the dataset; abort if there is any error,
+ * (e.g., already mounted, in use, or other error).
+ */
+ error = dmu_objset_own(name, DMU_OST_ZFS, B_TRUE, B_TRUE,
+ setsl_tag, &os);
+ if (error != 0)
+ return (SET_ERROR(EPERM));
+
+ dmu_objset_disown(os, B_TRUE, setsl_tag);
+
+ if (new_default) {
+ needed_priv = PRIV_FILE_DOWNGRADE_SL;
+ goto out_check;
+ }
+
+ if (hexstr_to_label(strval, &new_sl) != 0)
+ return (SET_ERROR(EPERM));
+
+ if (blstrictdom(&ds_sl, &new_sl))
+ needed_priv = PRIV_FILE_DOWNGRADE_SL;
+ else if (blstrictdom(&new_sl, &ds_sl))
+ needed_priv = PRIV_FILE_UPGRADE_SL;
+ } else {
+ /* dataset currently has a default label */
+ if (!new_default)
+ needed_priv = PRIV_FILE_UPGRADE_SL;
+ }
+
+out_check:
+ if (needed_priv != -1)
+ return (PRIV_POLICY(cr, needed_priv, B_FALSE, EPERM, NULL));
+ return (0);
+#else
+ return (SET_ERROR(ENOTSUP));
+#endif /* HAVE_MLSLABEL */
+}
+
+static int
+zfs_secpolicy_setprop(const char *dsname, zfs_prop_t prop, nvpair_t *propval,
+ cred_t *cr)
+{
+ char *strval;
+
+ /*
+ * Check permissions for special properties.
+ */
+ switch (prop) {
+ default:
+ break;
+ case ZFS_PROP_ZONED:
+ /*
+ * Disallow setting of 'zoned' from within a local zone.
+ */
+ if (!INGLOBALZONE(curproc))
+ return (SET_ERROR(EPERM));
+ break;
+
+ case ZFS_PROP_QUOTA:
+ case ZFS_PROP_FILESYSTEM_LIMIT:
+ case ZFS_PROP_SNAPSHOT_LIMIT:
+ if (!INGLOBALZONE(curproc)) {
+ uint64_t zoned;
+ char setpoint[ZFS_MAX_DATASET_NAME_LEN];
+ /*
+ * Unprivileged users are allowed to modify the
+			 * limit on things *under* (i.e. contained by)
+ * the thing they own.
+ */
+ if (dsl_prop_get_integer(dsname,
+ zfs_prop_to_name(ZFS_PROP_ZONED), &zoned, setpoint))
+ return (SET_ERROR(EPERM));
+ if (!zoned || strlen(dsname) <= strlen(setpoint))
+ return (SET_ERROR(EPERM));
+ }
+ break;
+
+ case ZFS_PROP_MLSLABEL:
+ if (!is_system_labeled())
+ return (SET_ERROR(EPERM));
+
+ if (nvpair_value_string(propval, &strval) == 0) {
+ int err;
+
+ err = zfs_set_slabel_policy(dsname, strval, CRED());
+ if (err != 0)
+ return (err);
+ }
+ break;
+ }
+
+ return (zfs_secpolicy_write_perms(dsname, zfs_prop_to_name(prop), cr));
+}
+
+/* ARGSUSED */
+static int
+zfs_secpolicy_set_fsacl(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
+{
+ int error;
+
+ error = zfs_dozonecheck(zc->zc_name, cr);
+ if (error != 0)
+ return (error);
+
+ /*
+ * permission to set permissions will be evaluated later in
+ * dsl_deleg_can_allow()
+ */
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+zfs_secpolicy_rollback(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
+{
+ return (zfs_secpolicy_write_perms(zc->zc_name,
+ ZFS_DELEG_PERM_ROLLBACK, cr));
+}
+
+/* ARGSUSED */
+static int
+zfs_secpolicy_send(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
+{
+ dsl_pool_t *dp;
+ dsl_dataset_t *ds;
+ const char *cp;
+ int error;
+
+ /*
+ * Generate the current snapshot name from the given objsetid, then
+ * use that name for the secpolicy/zone checks.
+ */
+ cp = strchr(zc->zc_name, '@');
+ if (cp == NULL)
+ return (SET_ERROR(EINVAL));
+ error = dsl_pool_hold(zc->zc_name, FTAG, &dp);
+ if (error != 0)
+ return (error);
+
+ error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &ds);
+ if (error != 0) {
+ dsl_pool_rele(dp, FTAG);
+ return (error);
+ }
+
+ dsl_dataset_name(ds, zc->zc_name);
+
+ error = zfs_secpolicy_write_perms_ds(zc->zc_name, ds,
+ ZFS_DELEG_PERM_SEND, cr);
+ dsl_dataset_rele(ds, FTAG);
+ dsl_pool_rele(dp, FTAG);
+
+ return (error);
+}
+
+/* ARGSUSED */
+static int
+zfs_secpolicy_send_new(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
+{
+ return (zfs_secpolicy_write_perms(zc->zc_name,
+ ZFS_DELEG_PERM_SEND, cr));
+}
+
+static int
+zfs_secpolicy_share(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
+{
+ return (SET_ERROR(ENOTSUP));
+}
+
+static int
+zfs_secpolicy_smb_acl(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
+{
+ return (SET_ERROR(ENOTSUP));
+}
+
+static int
+zfs_get_parent(const char *datasetname, char *parent, int parentsize)
+{
+ char *cp;
+
+ /*
+ * Remove the @bla or /bla from the end of the name to get the parent.
+ */
+ (void) strncpy(parent, datasetname, parentsize);
+ cp = strrchr(parent, '@');
+ if (cp != NULL) {
+ cp[0] = '\0';
+ } else {
+ cp = strrchr(parent, '/');
+ if (cp == NULL)
+ return (SET_ERROR(ENOENT));
+ cp[0] = '\0';
+ }
+
+ return (0);
+}
+
+int
+zfs_secpolicy_destroy_perms(const char *name, cred_t *cr)
+{
+ int error;
+
+ if ((error = zfs_secpolicy_write_perms(name,
+ ZFS_DELEG_PERM_MOUNT, cr)) != 0)
+ return (error);
+
+ return (zfs_secpolicy_write_perms(name, ZFS_DELEG_PERM_DESTROY, cr));
+}
+
+/* ARGSUSED */
+static int
+zfs_secpolicy_destroy(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
+{
+ return (zfs_secpolicy_destroy_perms(zc->zc_name, cr));
+}
+
+/*
+ * Destroying snapshots with delegated permissions requires
+ * descendant mount and destroy permissions.
+ */
+/* ARGSUSED */
+static int
+zfs_secpolicy_destroy_snaps(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
+{
+ nvlist_t *snaps;
+ nvpair_t *pair, *nextpair;
+ int error = 0;
+
+ snaps = fnvlist_lookup_nvlist(innvl, "snaps");
+
+ for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL;
+ pair = nextpair) {
+ nextpair = nvlist_next_nvpair(snaps, pair);
+ error = zfs_secpolicy_destroy_perms(nvpair_name(pair), cr);
+ if (error == ENOENT) {
+ /*
+ * Ignore any snapshots that don't exist (we consider
+ * them "already destroyed"). Remove the name from the
+ * nvl here in case the snapshot is created between
+ * now and when we try to destroy it (in which case
+ * we don't want to destroy it since we haven't
+ * checked for permission).
+ */
+ fnvlist_remove_nvpair(snaps, pair);
+ error = 0;
+ }
+ if (error != 0)
+ break;
+ }
+
+ return (error);
+}
+
+int
+zfs_secpolicy_rename_perms(const char *from, const char *to, cred_t *cr)
+{
+ char parentname[ZFS_MAX_DATASET_NAME_LEN];
+ int error;
+
+ if ((error = zfs_secpolicy_write_perms(from,
+ ZFS_DELEG_PERM_RENAME, cr)) != 0)
+ return (error);
+
+ if ((error = zfs_secpolicy_write_perms(from,
+ ZFS_DELEG_PERM_MOUNT, cr)) != 0)
+ return (error);
+
+ if ((error = zfs_get_parent(to, parentname,
+ sizeof (parentname))) != 0)
+ return (error);
+
+ if ((error = zfs_secpolicy_write_perms(parentname,
+ ZFS_DELEG_PERM_CREATE, cr)) != 0)
+ return (error);
+
+ if ((error = zfs_secpolicy_write_perms(parentname,
+ ZFS_DELEG_PERM_MOUNT, cr)) != 0)
+ return (error);
+
+ return (error);
+}
+
+/* ARGSUSED */
+static int
+zfs_secpolicy_rename(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
+{
+ return (zfs_secpolicy_rename_perms(zc->zc_name, zc->zc_value, cr));
+}
+
+/* ARGSUSED */
+static int
+zfs_secpolicy_promote(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
+{
+ dsl_pool_t *dp;
+ dsl_dataset_t *clone;
+ int error;
+
+ error = zfs_secpolicy_write_perms(zc->zc_name,
+ ZFS_DELEG_PERM_PROMOTE, cr);
+ if (error != 0)
+ return (error);
+
+ error = dsl_pool_hold(zc->zc_name, FTAG, &dp);
+ if (error != 0)
+ return (error);
+
+ error = dsl_dataset_hold(dp, zc->zc_name, FTAG, &clone);
+
+ if (error == 0) {
+ char parentname[ZFS_MAX_DATASET_NAME_LEN];
+ dsl_dataset_t *origin = NULL;
+ dsl_dir_t *dd;
+ dd = clone->ds_dir;
+
+ error = dsl_dataset_hold_obj(dd->dd_pool,
+ dsl_dir_phys(dd)->dd_origin_obj, FTAG, &origin);
+ if (error != 0) {
+ dsl_dataset_rele(clone, FTAG);
+ dsl_pool_rele(dp, FTAG);
+ return (error);
+ }
+
+ error = zfs_secpolicy_write_perms_ds(zc->zc_name, clone,
+ ZFS_DELEG_PERM_MOUNT, cr);
+
+ dsl_dataset_name(origin, parentname);
+ if (error == 0) {
+ error = zfs_secpolicy_write_perms_ds(parentname, origin,
+ ZFS_DELEG_PERM_PROMOTE, cr);
+ }
+ dsl_dataset_rele(clone, FTAG);
+ dsl_dataset_rele(origin, FTAG);
+ }
+ dsl_pool_rele(dp, FTAG);
+ return (error);
+}
+
+/* ARGSUSED */
+static int
+zfs_secpolicy_recv(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
+{
+ int error;
+
+ if ((error = zfs_secpolicy_write_perms(zc->zc_name,
+ ZFS_DELEG_PERM_RECEIVE, cr)) != 0)
+ return (error);
+
+ if ((error = zfs_secpolicy_write_perms(zc->zc_name,
+ ZFS_DELEG_PERM_MOUNT, cr)) != 0)
+ return (error);
+
+ return (zfs_secpolicy_write_perms(zc->zc_name,
+ ZFS_DELEG_PERM_CREATE, cr));
+}
+
+/* ARGSUSED */
+static int
+zfs_secpolicy_recv_new(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
+{
+ return (zfs_secpolicy_recv(zc, innvl, cr));
+}
+
+int
+zfs_secpolicy_snapshot_perms(const char *name, cred_t *cr)
+{
+ return (zfs_secpolicy_write_perms(name,
+ ZFS_DELEG_PERM_SNAPSHOT, cr));
+}
+
+/*
+ * Check for permission to create each snapshot in the nvlist.
+ */
+/* ARGSUSED */
+static int
+zfs_secpolicy_snapshot(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
+{
+ nvlist_t *snaps;
+ int error = 0;
+ nvpair_t *pair;
+
+ snaps = fnvlist_lookup_nvlist(innvl, "snaps");
+
+ for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL;
+ pair = nvlist_next_nvpair(snaps, pair)) {
+ char *name = nvpair_name(pair);
+ char *atp = strchr(name, '@');
+
+ if (atp == NULL) {
+ error = SET_ERROR(EINVAL);
+ break;
+ }
+ *atp = '\0';
+ error = zfs_secpolicy_snapshot_perms(name, cr);
+ *atp = '@';
+ if (error != 0)
+ break;
+ }
+ return (error);
+}
+
+/*
+ * Check for permission to create each bookmark in the nvlist.
+ */
+/* ARGSUSED */
+static int
+zfs_secpolicy_bookmark(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
+{
+ int error = 0;
+
+ for (nvpair_t *pair = nvlist_next_nvpair(innvl, NULL);
+ pair != NULL; pair = nvlist_next_nvpair(innvl, pair)) {
+ char *name = nvpair_name(pair);
+ char *hashp = strchr(name, '#');
+
+ if (hashp == NULL) {
+ error = SET_ERROR(EINVAL);
+ break;
+ }
+ *hashp = '\0';
+ error = zfs_secpolicy_write_perms(name,
+ ZFS_DELEG_PERM_BOOKMARK, cr);
+ *hashp = '#';
+ if (error != 0)
+ break;
+ }
+ return (error);
+}
+
+/* ARGSUSED */
+static int
+zfs_secpolicy_destroy_bookmarks(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
+{
+ nvpair_t *pair, *nextpair;
+ int error = 0;
+
+ for (pair = nvlist_next_nvpair(innvl, NULL); pair != NULL;
+ pair = nextpair) {
+ char *name = nvpair_name(pair);
+ char *hashp = strchr(name, '#');
+ nextpair = nvlist_next_nvpair(innvl, pair);
+
+ if (hashp == NULL) {
+ error = SET_ERROR(EINVAL);
+ break;
+ }
+
+ *hashp = '\0';
+ error = zfs_secpolicy_write_perms(name,
+ ZFS_DELEG_PERM_DESTROY, cr);
+ *hashp = '#';
+ if (error == ENOENT) {
+ /*
+ * Ignore any filesystems that don't exist (we consider
+ * their bookmarks "already destroyed"). Remove
+ * the name from the nvl here in case the filesystem
+ * is created between now and when we try to destroy
+ * the bookmark (in which case we don't want to
+ * destroy it since we haven't checked for permission).
+ */
+ fnvlist_remove_nvpair(innvl, pair);
+ error = 0;
+ }
+ if (error != 0)
+ break;
+ }
+
+ return (error);
+}
+
+/* ARGSUSED */
+static int
+zfs_secpolicy_log_history(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
+{
+ /*
+ * Even root must have a proper TSD so that we know what pool
+ * to log to.
+ */
+ if (tsd_get(zfs_allow_log_key) == NULL)
+ return (SET_ERROR(EPERM));
+ return (0);
+}
+
+static int
+zfs_secpolicy_create_clone(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
+{
+ char parentname[ZFS_MAX_DATASET_NAME_LEN];
+ int error;
+ char *origin;
+
+ if ((error = zfs_get_parent(zc->zc_name, parentname,
+ sizeof (parentname))) != 0)
+ return (error);
+
+ if (nvlist_lookup_string(innvl, "origin", &origin) == 0 &&
+ (error = zfs_secpolicy_write_perms(origin,
+ ZFS_DELEG_PERM_CLONE, cr)) != 0)
+ return (error);
+
+ if ((error = zfs_secpolicy_write_perms(parentname,
+ ZFS_DELEG_PERM_CREATE, cr)) != 0)
+ return (error);
+
+ return (zfs_secpolicy_write_perms(parentname,
+ ZFS_DELEG_PERM_MOUNT, cr));
+}
+
+/*
+ * Policy for pool operations - create/destroy pools, add vdevs, etc. Requires
+ * SYS_CONFIG privilege, which is not available in a local zone.
+ */
+/* ARGSUSED */
+int
+zfs_secpolicy_config(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
+{
+ if (secpolicy_sys_config(cr, B_FALSE) != 0)
+ return (SET_ERROR(EPERM));
+
+ return (0);
+}
+
+/*
+ * Policy for object to name lookups.
+ */
+/* ARGSUSED */
+static int
+zfs_secpolicy_diff(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
+{
+ int error;
+
+ if ((error = secpolicy_sys_config(cr, B_FALSE)) == 0)
+ return (0);
+
+ error = zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_DIFF, cr);
+ return (error);
+}
+
+/*
+ * Policy for fault injection. Requires all privileges.
+ */
+/* ARGSUSED */
+static int
+zfs_secpolicy_inject(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
+{
+ return (secpolicy_zinject(cr));
+}
+
+/* ARGSUSED */
+static int
+zfs_secpolicy_inherit_prop(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
+{
+ zfs_prop_t prop = zfs_name_to_prop(zc->zc_value);
+
+ if (prop == ZPROP_INVAL) {
+ if (!zfs_prop_user(zc->zc_value))
+ return (SET_ERROR(EINVAL));
+ return (zfs_secpolicy_write_perms(zc->zc_name,
+ ZFS_DELEG_PERM_USERPROP, cr));
+ } else {
+ return (zfs_secpolicy_setprop(zc->zc_name, prop,
+ NULL, cr));
+ }
+}
+
+static int
+zfs_secpolicy_userspace_one(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
+{
+ int err = zfs_secpolicy_read(zc, innvl, cr);
+ if (err)
+ return (err);
+
+ if (zc->zc_objset_type >= ZFS_NUM_USERQUOTA_PROPS)
+ return (SET_ERROR(EINVAL));
+
+ if (zc->zc_value[0] == 0) {
+ /*
+		 * They are asking about a POSIX uid/gid. If it's
+		 * their own, allow it.
+ */
+ if (zc->zc_objset_type == ZFS_PROP_USERUSED ||
+ zc->zc_objset_type == ZFS_PROP_USERQUOTA ||
+ zc->zc_objset_type == ZFS_PROP_USEROBJUSED ||
+ zc->zc_objset_type == ZFS_PROP_USEROBJQUOTA) {
+ if (zc->zc_guid == crgetuid(cr))
+ return (0);
+ } else if (zc->zc_objset_type == ZFS_PROP_GROUPUSED ||
+ zc->zc_objset_type == ZFS_PROP_GROUPQUOTA ||
+ zc->zc_objset_type == ZFS_PROP_GROUPOBJUSED ||
+ zc->zc_objset_type == ZFS_PROP_GROUPOBJQUOTA) {
+ if (groupmember(zc->zc_guid, cr))
+ return (0);
+ }
+ /* else is for project quota/used */
+ }
+
+ return (zfs_secpolicy_write_perms(zc->zc_name,
+ userquota_perms[zc->zc_objset_type], cr));
+}
+
+static int
+zfs_secpolicy_userspace_many(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
+{
+ int err = zfs_secpolicy_read(zc, innvl, cr);
+ if (err)
+ return (err);
+
+ if (zc->zc_objset_type >= ZFS_NUM_USERQUOTA_PROPS)
+ return (SET_ERROR(EINVAL));
+
+ return (zfs_secpolicy_write_perms(zc->zc_name,
+ userquota_perms[zc->zc_objset_type], cr));
+}
+
+/* ARGSUSED */
+static int
+zfs_secpolicy_userspace_upgrade(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
+{
+ return (zfs_secpolicy_setprop(zc->zc_name, ZFS_PROP_VERSION,
+ NULL, cr));
+}
+
+/* ARGSUSED */
+static int
+zfs_secpolicy_hold(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
+{
+ nvpair_t *pair;
+ nvlist_t *holds;
+ int error;
+
+ holds = fnvlist_lookup_nvlist(innvl, "holds");
+
+ for (pair = nvlist_next_nvpair(holds, NULL); pair != NULL;
+ pair = nvlist_next_nvpair(holds, pair)) {
+ char fsname[ZFS_MAX_DATASET_NAME_LEN];
+ error = dmu_fsname(nvpair_name(pair), fsname);
+ if (error != 0)
+ return (error);
+ error = zfs_secpolicy_write_perms(fsname,
+ ZFS_DELEG_PERM_HOLD, cr);
+ if (error != 0)
+ return (error);
+ }
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+zfs_secpolicy_release(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
+{
+ nvpair_t *pair;
+ int error;
+
+ for (pair = nvlist_next_nvpair(innvl, NULL); pair != NULL;
+ pair = nvlist_next_nvpair(innvl, pair)) {
+ char fsname[ZFS_MAX_DATASET_NAME_LEN];
+ error = dmu_fsname(nvpair_name(pair), fsname);
+ if (error != 0)
+ return (error);
+ error = zfs_secpolicy_write_perms(fsname,
+ ZFS_DELEG_PERM_RELEASE, cr);
+ if (error != 0)
+ return (error);
+ }
+ return (0);
+}
+
+/*
+ * Policy for allowing temporary snapshots to be taken or released
+ */
+static int
+zfs_secpolicy_tmp_snapshot(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
+{
+ /*
+ * A temporary snapshot is the same as a snapshot,
+ * hold, destroy and release all rolled into one.
+	 * Delegated 'diff' permission alone is sufficient to allow this.
+ */
+ int error;
+
+ if ((error = zfs_secpolicy_write_perms(zc->zc_name,
+ ZFS_DELEG_PERM_DIFF, cr)) == 0)
+ return (0);
+
+ error = zfs_secpolicy_snapshot_perms(zc->zc_name, cr);
+
+ if (innvl != NULL) {
+ if (error == 0)
+ error = zfs_secpolicy_hold(zc, innvl, cr);
+ if (error == 0)
+ error = zfs_secpolicy_release(zc, innvl, cr);
+ if (error == 0)
+ error = zfs_secpolicy_destroy(zc, innvl, cr);
+ }
+ return (error);
+}
+
+static int
+zfs_secpolicy_load_key(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
+{
+ return (zfs_secpolicy_write_perms(zc->zc_name,
+ ZFS_DELEG_PERM_LOAD_KEY, cr));
+}
+
+static int
+zfs_secpolicy_change_key(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
+{
+ return (zfs_secpolicy_write_perms(zc->zc_name,
+ ZFS_DELEG_PERM_CHANGE_KEY, cr));
+}
+
+/*
+ * Returns the nvlist as specified by the user in the zfs_cmd_t.
+ */
+static int
+get_nvlist(uint64_t nvl, uint64_t size, int iflag, nvlist_t **nvp)
+{
+ char *packed;
+ int error;
+ nvlist_t *list = NULL;
+
+ /*
+ * Read in and unpack the user-supplied nvlist.
+ */
+ if (size == 0)
+ return (SET_ERROR(EINVAL));
+
+ packed = vmem_alloc(size, KM_SLEEP);
+
+ if ((error = ddi_copyin((void *)(uintptr_t)nvl, packed, size,
+ iflag)) != 0) {
+ vmem_free(packed, size);
+ return (SET_ERROR(EFAULT));
+ }
+
+ if ((error = nvlist_unpack(packed, size, &list, 0)) != 0) {
+ vmem_free(packed, size);
+ return (error);
+ }
+
+ vmem_free(packed, size);
+
+ *nvp = list;
+ return (0);
+}
+
+/*
+ * Reduce the size of this nvlist until it can be serialized in 'max' bytes.
+ * Entries will be removed from the end of the nvlist, and one int32 entry
+ * named "N_MORE_ERRORS" will be added indicating how many entries were
+ * removed.
+ */
+static int
+nvlist_smush(nvlist_t *errors, size_t max)
+{
+ size_t size;
+
+ size = fnvlist_size(errors);
+
+ if (size > max) {
+ nvpair_t *more_errors;
+ int n = 0;
+
+ if (max < 1024)
+ return (SET_ERROR(ENOMEM));
+
+ fnvlist_add_int32(errors, ZPROP_N_MORE_ERRORS, 0);
+ more_errors = nvlist_prev_nvpair(errors, NULL);
+
+ do {
+ nvpair_t *pair = nvlist_prev_nvpair(errors,
+ more_errors);
+ fnvlist_remove_nvpair(errors, pair);
+ n++;
+ size = fnvlist_size(errors);
+ } while (size > max);
+
+ fnvlist_remove_nvpair(errors, more_errors);
+ fnvlist_add_int32(errors, ZPROP_N_MORE_ERRORS, n);
+ ASSERT3U(fnvlist_size(errors), <=, max);
+ }
+
+ return (0);
+}
+
+static int
+put_nvlist(zfs_cmd_t *zc, nvlist_t *nvl)
+{
+ char *packed = NULL;
+ int error = 0;
+ size_t size;
+
+ size = fnvlist_size(nvl);
+
+ if (size > zc->zc_nvlist_dst_size) {
+ error = SET_ERROR(ENOMEM);
+ } else {
+ packed = fnvlist_pack(nvl, &size);
+ if (ddi_copyout(packed, (void *)(uintptr_t)zc->zc_nvlist_dst,
+ size, zc->zc_iflags) != 0)
+ error = SET_ERROR(EFAULT);
+ fnvlist_pack_free(packed, size);
+ }
+
+ zc->zc_nvlist_dst_size = size;
+ zc->zc_nvlist_dst_filled = B_TRUE;
+ return (error);
+}
+
+int
+getzfsvfs_impl(objset_t *os, zfsvfs_t **zfvp)
+{
+ int error = 0;
+ if (dmu_objset_type(os) != DMU_OST_ZFS) {
+ return (SET_ERROR(EINVAL));
+ }
+
+ mutex_enter(&os->os_user_ptr_lock);
+ *zfvp = dmu_objset_get_user(os);
+ /* bump s_active only when non-zero to prevent umount race */
+ error = zfs_vfs_ref(zfvp);
+ mutex_exit(&os->os_user_ptr_lock);
+ return (error);
+}
+
+int
+getzfsvfs(const char *dsname, zfsvfs_t **zfvp)
+{
+ objset_t *os;
+ int error;
+
+ error = dmu_objset_hold(dsname, FTAG, &os);
+ if (error != 0)
+ return (error);
+
+ error = getzfsvfs_impl(os, zfvp);
+ dmu_objset_rele(os, FTAG);
+ return (error);
+}
+
+/*
+ * Find a zfsvfs_t for a mounted filesystem, or create our own, in which
+ * case its z_sb will be NULL, and it will be opened as the owner.
+ * If 'writer' is set, the z_teardown_lock will be held for RW_WRITER,
+ * which prevents all inode ops from running.
+ */
+static int
+zfsvfs_hold(const char *name, void *tag, zfsvfs_t **zfvp, boolean_t writer)
+{
+ int error = 0;
+
+ if (getzfsvfs(name, zfvp) != 0)
+ error = zfsvfs_create(name, B_FALSE, zfvp);
+ if (error == 0) {
+ if (writer)
+ ZFS_TEARDOWN_ENTER_WRITE(*zfvp, tag);
+ else
+ ZFS_TEARDOWN_ENTER_READ(*zfvp, tag);
+ if ((*zfvp)->z_unmounted) {
+ /*
+ * XXX we could probably try again, since the unmounting
+ * thread should be just about to disassociate the
+ * objset from the zfsvfs.
+ */
+ ZFS_TEARDOWN_EXIT(*zfvp, tag);
+ return (SET_ERROR(EBUSY));
+ }
+ }
+ return (error);
+}
+
+static void
+zfsvfs_rele(zfsvfs_t *zfsvfs, void *tag)
+{
+ ZFS_TEARDOWN_EXIT(zfsvfs, tag);
+
+ if (zfs_vfs_held(zfsvfs)) {
+ zfs_vfs_rele(zfsvfs);
+ } else {
+ dmu_objset_disown(zfsvfs->z_os, B_TRUE, zfsvfs);
+ zfsvfs_free(zfsvfs);
+ }
+}
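+
+/*
+ * Editor's illustration, not part of the upstream source: the usual pairing
+ * of the two helpers above, where "dsname" is a placeholder dataset name.
+ *
+ *	zfsvfs_t *zfsvfs;
+ *	int error = zfsvfs_hold(dsname, FTAG, &zfsvfs, B_TRUE);
+ *	if (error == 0) {
+ *		... operate on the filesystem with inode ops quiesced ...
+ *		zfsvfs_rele(zfsvfs, FTAG);
+ *	}
+ *
+ * Passing B_TRUE takes the teardown lock as writer, as described in the
+ * comment above zfsvfs_hold().
+ */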
+
+static int
+zfs_ioc_pool_create(zfs_cmd_t *zc)
+{
+ int error;
+ nvlist_t *config, *props = NULL;
+ nvlist_t *rootprops = NULL;
+ nvlist_t *zplprops = NULL;
+ dsl_crypto_params_t *dcp = NULL;
+ const char *spa_name = zc->zc_name;
+ boolean_t unload_wkey = B_TRUE;
+
+ if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size,
+ zc->zc_iflags, &config)))
+ return (error);
+
+ if (zc->zc_nvlist_src_size != 0 && (error =
+ get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
+ zc->zc_iflags, &props))) {
+ nvlist_free(config);
+ return (error);
+ }
+
+ if (props) {
+ nvlist_t *nvl = NULL;
+ nvlist_t *hidden_args = NULL;
+ uint64_t version = SPA_VERSION;
+ char *tname;
+
+ (void) nvlist_lookup_uint64(props,
+ zpool_prop_to_name(ZPOOL_PROP_VERSION), &version);
+ if (!SPA_VERSION_IS_SUPPORTED(version)) {
+ error = SET_ERROR(EINVAL);
+ goto pool_props_bad;
+ }
+ (void) nvlist_lookup_nvlist(props, ZPOOL_ROOTFS_PROPS, &nvl);
+ if (nvl) {
+ error = nvlist_dup(nvl, &rootprops, KM_SLEEP);
+ if (error != 0)
+ goto pool_props_bad;
+ (void) nvlist_remove_all(props, ZPOOL_ROOTFS_PROPS);
+ }
+
+ (void) nvlist_lookup_nvlist(props, ZPOOL_HIDDEN_ARGS,
+ &hidden_args);
+ error = dsl_crypto_params_create_nvlist(DCP_CMD_NONE,
+ rootprops, hidden_args, &dcp);
+ if (error != 0)
+ goto pool_props_bad;
+ (void) nvlist_remove_all(props, ZPOOL_HIDDEN_ARGS);
+
+ VERIFY(nvlist_alloc(&zplprops, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+ error = zfs_fill_zplprops_root(version, rootprops,
+ zplprops, NULL);
+ if (error != 0)
+ goto pool_props_bad;
+
+ if (nvlist_lookup_string(props,
+ zpool_prop_to_name(ZPOOL_PROP_TNAME), &tname) == 0)
+ spa_name = tname;
+ }
+
+ error = spa_create(zc->zc_name, config, props, zplprops, dcp);
+
+ /*
+ * Set the remaining root properties
+ */
+ if (!error && (error = zfs_set_prop_nvlist(spa_name,
+ ZPROP_SRC_LOCAL, rootprops, NULL)) != 0) {
+ (void) spa_destroy(spa_name);
+ unload_wkey = B_FALSE; /* spa_destroy() unloads wrapping keys */
+ }
+
+pool_props_bad:
+ nvlist_free(rootprops);
+ nvlist_free(zplprops);
+ nvlist_free(config);
+ nvlist_free(props);
+ dsl_crypto_params_free(dcp, unload_wkey && !!error);
+
+ return (error);
+}
+
+static int
+zfs_ioc_pool_destroy(zfs_cmd_t *zc)
+{
+ int error;
+ zfs_log_history(zc);
+ error = spa_destroy(zc->zc_name);
+
+ return (error);
+}
+
+static int
+zfs_ioc_pool_import(zfs_cmd_t *zc)
+{
+ nvlist_t *config, *props = NULL;
+ uint64_t guid;
+ int error;
+
+ if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size,
+ zc->zc_iflags, &config)) != 0)
+ return (error);
+
+ if (zc->zc_nvlist_src_size != 0 && (error =
+ get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
+ zc->zc_iflags, &props))) {
+ nvlist_free(config);
+ return (error);
+ }
+
+ if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &guid) != 0 ||
+ guid != zc->zc_guid)
+ error = SET_ERROR(EINVAL);
+ else
+ error = spa_import(zc->zc_name, config, props, zc->zc_cookie);
+
+ if (zc->zc_nvlist_dst != 0) {
+ int err;
+
+ if ((err = put_nvlist(zc, config)) != 0)
+ error = err;
+ }
+
+ nvlist_free(config);
+ nvlist_free(props);
+
+ return (error);
+}
+
+static int
+zfs_ioc_pool_export(zfs_cmd_t *zc)
+{
+ int error;
+ boolean_t force = (boolean_t)zc->zc_cookie;
+ boolean_t hardforce = (boolean_t)zc->zc_guid;
+
+ zfs_log_history(zc);
+ error = spa_export(zc->zc_name, NULL, force, hardforce);
+
+ return (error);
+}
+
+static int
+zfs_ioc_pool_configs(zfs_cmd_t *zc)
+{
+ nvlist_t *configs;
+ int error;
+
+ if ((configs = spa_all_configs(&zc->zc_cookie)) == NULL)
+ return (SET_ERROR(EEXIST));
+
+ error = put_nvlist(zc, configs);
+
+ nvlist_free(configs);
+
+ return (error);
+}
+
+/*
+ * inputs:
+ * zc_name name of the pool
+ *
+ * outputs:
+ * zc_cookie real errno
+ * zc_nvlist_dst config nvlist
+ * zc_nvlist_dst_size size of config nvlist
+ */
+static int
+zfs_ioc_pool_stats(zfs_cmd_t *zc)
+{
+ nvlist_t *config;
+ int error;
+ int ret = 0;
+
+ error = spa_get_stats(zc->zc_name, &config, zc->zc_value,
+ sizeof (zc->zc_value));
+
+ if (config != NULL) {
+ ret = put_nvlist(zc, config);
+ nvlist_free(config);
+
+ /*
+ * The config may be present even if 'error' is non-zero.
+ * In this case we return success, and preserve the real errno
+ * in 'zc_cookie'.
+ */
+ zc->zc_cookie = error;
+ } else {
+ ret = error;
+ }
+
+ return (ret);
+}
+
+/*
+ * Try to import the given pool, returning pool stats as appropriate so that
+ * userland knows which devices are available and overall pool health.
+ */
+static int
+zfs_ioc_pool_tryimport(zfs_cmd_t *zc)
+{
+ nvlist_t *tryconfig, *config = NULL;
+ int error;
+
+ if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size,
+ zc->zc_iflags, &tryconfig)) != 0)
+ return (error);
+
+ config = spa_tryimport(tryconfig);
+
+ nvlist_free(tryconfig);
+
+ if (config == NULL)
+ return (SET_ERROR(EINVAL));
+
+ error = put_nvlist(zc, config);
+ nvlist_free(config);
+
+ return (error);
+}
+
+/*
+ * inputs:
+ * zc_name name of the pool
+ * zc_cookie scan func (pool_scan_func_t)
+ * zc_flags scrub pause/resume flag (pool_scrub_cmd_t)
+ */
+static int
+zfs_ioc_pool_scan(zfs_cmd_t *zc)
+{
+ spa_t *spa;
+ int error;
+
+ if (zc->zc_flags >= POOL_SCRUB_FLAGS_END)
+ return (SET_ERROR(EINVAL));
+
+ if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
+ return (error);
+
+ if (zc->zc_flags == POOL_SCRUB_PAUSE)
+ error = spa_scrub_pause_resume(spa, POOL_SCRUB_PAUSE);
+ else if (zc->zc_cookie == POOL_SCAN_NONE)
+ error = spa_scan_stop(spa);
+ else
+ error = spa_scan(spa, zc->zc_cookie);
+
+ spa_close(spa, FTAG);
+
+ return (error);
+}
+
+static int
+zfs_ioc_pool_freeze(zfs_cmd_t *zc)
+{
+ spa_t *spa;
+ int error;
+
+ error = spa_open(zc->zc_name, &spa, FTAG);
+ if (error == 0) {
+ spa_freeze(spa);
+ spa_close(spa, FTAG);
+ }
+ return (error);
+}
+
+static int
+zfs_ioc_pool_upgrade(zfs_cmd_t *zc)
+{
+ spa_t *spa;
+ int error;
+
+ if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
+ return (error);
+
+ if (zc->zc_cookie < spa_version(spa) ||
+ !SPA_VERSION_IS_SUPPORTED(zc->zc_cookie)) {
+ spa_close(spa, FTAG);
+ return (SET_ERROR(EINVAL));
+ }
+
+ spa_upgrade(spa, zc->zc_cookie);
+ spa_close(spa, FTAG);
+
+ return (error);
+}
+
+static int
+zfs_ioc_pool_get_history(zfs_cmd_t *zc)
+{
+ spa_t *spa;
+ char *hist_buf;
+ uint64_t size;
+ int error;
+
+ if ((size = zc->zc_history_len) == 0)
+ return (SET_ERROR(EINVAL));
+
+ if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
+ return (error);
+
+ if (spa_version(spa) < SPA_VERSION_ZPOOL_HISTORY) {
+ spa_close(spa, FTAG);
+ return (SET_ERROR(ENOTSUP));
+ }
+
+ hist_buf = vmem_alloc(size, KM_SLEEP);
+ if ((error = spa_history_get(spa, &zc->zc_history_offset,
+ &zc->zc_history_len, hist_buf)) == 0) {
+ error = ddi_copyout(hist_buf,
+ (void *)(uintptr_t)zc->zc_history,
+ zc->zc_history_len, zc->zc_iflags);
+ }
+
+ spa_close(spa, FTAG);
+ vmem_free(hist_buf, size);
+ return (error);
+}
+
+static int
+zfs_ioc_pool_reguid(zfs_cmd_t *zc)
+{
+ spa_t *spa;
+ int error;
+
+ error = spa_open(zc->zc_name, &spa, FTAG);
+ if (error == 0) {
+ error = spa_change_guid(spa);
+ spa_close(spa, FTAG);
+ }
+ return (error);
+}
+
+static int
+zfs_ioc_dsobj_to_dsname(zfs_cmd_t *zc)
+{
+ return (dsl_dsobj_to_dsname(zc->zc_name, zc->zc_obj, zc->zc_value));
+}
+
+/*
+ * inputs:
+ * zc_name name of filesystem
+ * zc_obj object to find
+ *
+ * outputs:
+ * zc_value name of object
+ */
+static int
+zfs_ioc_obj_to_path(zfs_cmd_t *zc)
+{
+ objset_t *os;
+ int error;
+
+ /* XXX reading from objset not owned */
+ if ((error = dmu_objset_hold_flags(zc->zc_name, B_TRUE,
+ FTAG, &os)) != 0)
+ return (error);
+ if (dmu_objset_type(os) != DMU_OST_ZFS) {
+ dmu_objset_rele_flags(os, B_TRUE, FTAG);
+ return (SET_ERROR(EINVAL));
+ }
+ error = zfs_obj_to_path(os, zc->zc_obj, zc->zc_value,
+ sizeof (zc->zc_value));
+ dmu_objset_rele_flags(os, B_TRUE, FTAG);
+
+ return (error);
+}
+
+/*
+ * inputs:
+ * zc_name name of filesystem
+ * zc_obj object to find
+ *
+ * outputs:
+ * zc_stat stats on object
+ * zc_value path to object
+ */
+static int
+zfs_ioc_obj_to_stats(zfs_cmd_t *zc)
+{
+ objset_t *os;
+ int error;
+
+ /* XXX reading from objset not owned */
+ if ((error = dmu_objset_hold_flags(zc->zc_name, B_TRUE,
+ FTAG, &os)) != 0)
+ return (error);
+ if (dmu_objset_type(os) != DMU_OST_ZFS) {
+ dmu_objset_rele_flags(os, B_TRUE, FTAG);
+ return (SET_ERROR(EINVAL));
+ }
+ error = zfs_obj_to_stats(os, zc->zc_obj, &zc->zc_stat, zc->zc_value,
+ sizeof (zc->zc_value));
+ dmu_objset_rele_flags(os, B_TRUE, FTAG);
+
+ return (error);
+}
+
+static int
+zfs_ioc_vdev_add(zfs_cmd_t *zc)
+{
+ spa_t *spa;
+ int error;
+ nvlist_t *config;
+
+ error = spa_open(zc->zc_name, &spa, FTAG);
+ if (error != 0)
+ return (error);
+
+ error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size,
+ zc->zc_iflags, &config);
+ if (error == 0) {
+ error = spa_vdev_add(spa, config);
+ nvlist_free(config);
+ }
+ spa_close(spa, FTAG);
+ return (error);
+}
+
+/*
+ * inputs:
+ * zc_name name of the pool
+ * zc_guid guid of vdev to remove
+ * zc_cookie cancel removal
+ */
+static int
+zfs_ioc_vdev_remove(zfs_cmd_t *zc)
+{
+ spa_t *spa;
+ int error;
+
+ error = spa_open(zc->zc_name, &spa, FTAG);
+ if (error != 0)
+ return (error);
+ if (zc->zc_cookie != 0) {
+ error = spa_vdev_remove_cancel(spa);
+ } else {
+ error = spa_vdev_remove(spa, zc->zc_guid, B_FALSE);
+ }
+ spa_close(spa, FTAG);
+ return (error);
+}
+
+static int
+zfs_ioc_vdev_set_state(zfs_cmd_t *zc)
+{
+ spa_t *spa;
+ int error;
+ vdev_state_t newstate = VDEV_STATE_UNKNOWN;
+
+ if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
+ return (error);
+ switch (zc->zc_cookie) {
+ case VDEV_STATE_ONLINE:
+ error = vdev_online(spa, zc->zc_guid, zc->zc_obj, &newstate);
+ break;
+
+ case VDEV_STATE_OFFLINE:
+ error = vdev_offline(spa, zc->zc_guid, zc->zc_obj);
+ break;
+
+ case VDEV_STATE_FAULTED:
+ if (zc->zc_obj != VDEV_AUX_ERR_EXCEEDED &&
+ zc->zc_obj != VDEV_AUX_EXTERNAL &&
+ zc->zc_obj != VDEV_AUX_EXTERNAL_PERSIST)
+ zc->zc_obj = VDEV_AUX_ERR_EXCEEDED;
+
+ error = vdev_fault(spa, zc->zc_guid, zc->zc_obj);
+ break;
+
+ case VDEV_STATE_DEGRADED:
+ if (zc->zc_obj != VDEV_AUX_ERR_EXCEEDED &&
+ zc->zc_obj != VDEV_AUX_EXTERNAL)
+ zc->zc_obj = VDEV_AUX_ERR_EXCEEDED;
+
+ error = vdev_degrade(spa, zc->zc_guid, zc->zc_obj);
+ break;
+
+ default:
+ error = SET_ERROR(EINVAL);
+ }
+ zc->zc_cookie = newstate;
+ spa_close(spa, FTAG);
+ return (error);
+}
+
+static int
+zfs_ioc_vdev_attach(zfs_cmd_t *zc)
+{
+ spa_t *spa;
+ nvlist_t *config;
+ int replacing = zc->zc_cookie;
+ int rebuild = zc->zc_simple;
+ int error;
+
+ if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
+ return (error);
+
+ if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size,
+ zc->zc_iflags, &config)) == 0) {
+ error = spa_vdev_attach(spa, zc->zc_guid, config, replacing,
+ rebuild);
+ nvlist_free(config);
+ }
+
+ spa_close(spa, FTAG);
+ return (error);
+}
+
+static int
+zfs_ioc_vdev_detach(zfs_cmd_t *zc)
+{
+ spa_t *spa;
+ int error;
+
+ if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
+ return (error);
+
+ error = spa_vdev_detach(spa, zc->zc_guid, 0, B_FALSE);
+
+ spa_close(spa, FTAG);
+ return (error);
+}
+
+static int
+zfs_ioc_vdev_split(zfs_cmd_t *zc)
+{
+ spa_t *spa;
+ nvlist_t *config, *props = NULL;
+ int error;
+ boolean_t exp = !!(zc->zc_cookie & ZPOOL_EXPORT_AFTER_SPLIT);
+
+ if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
+ return (error);
+
+ if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size,
+ zc->zc_iflags, &config))) {
+ spa_close(spa, FTAG);
+ return (error);
+ }
+
+ if (zc->zc_nvlist_src_size != 0 && (error =
+ get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
+ zc->zc_iflags, &props))) {
+ spa_close(spa, FTAG);
+ nvlist_free(config);
+ return (error);
+ }
+
+ error = spa_vdev_split_mirror(spa, zc->zc_string, config, props, exp);
+
+ spa_close(spa, FTAG);
+
+ nvlist_free(config);
+ nvlist_free(props);
+
+ return (error);
+}
+
+static int
+zfs_ioc_vdev_setpath(zfs_cmd_t *zc)
+{
+ spa_t *spa;
+ const char *path = zc->zc_value;
+ uint64_t guid = zc->zc_guid;
+ int error;
+
+ error = spa_open(zc->zc_name, &spa, FTAG);
+ if (error != 0)
+ return (error);
+
+ error = spa_vdev_setpath(spa, guid, path);
+ spa_close(spa, FTAG);
+ return (error);
+}
+
+static int
+zfs_ioc_vdev_setfru(zfs_cmd_t *zc)
+{
+ spa_t *spa;
+ const char *fru = zc->zc_value;
+ uint64_t guid = zc->zc_guid;
+ int error;
+
+ error = spa_open(zc->zc_name, &spa, FTAG);
+ if (error != 0)
+ return (error);
+
+ error = spa_vdev_setfru(spa, guid, fru);
+ spa_close(spa, FTAG);
+ return (error);
+}
+
+static int
+zfs_ioc_objset_stats_impl(zfs_cmd_t *zc, objset_t *os)
+{
+ int error = 0;
+ nvlist_t *nv;
+
+ dmu_objset_fast_stat(os, &zc->zc_objset_stats);
+
+ if (zc->zc_nvlist_dst != 0 &&
+ (error = dsl_prop_get_all(os, &nv)) == 0) {
+ dmu_objset_stats(os, nv);
+ /*
+ * NB: zvol_get_stats() will read the objset contents,
+ * which we aren't supposed to do with a
+ * DS_MODE_USER hold, because it could be
+ * inconsistent. So this is a bit of a workaround...
+ * XXX reading without owning
+ */
+ if (!zc->zc_objset_stats.dds_inconsistent &&
+ dmu_objset_type(os) == DMU_OST_ZVOL) {
+ error = zvol_get_stats(os, nv);
+ if (error == EIO) {
+ nvlist_free(nv);
+ return (error);
+ }
+ VERIFY0(error);
+ }
+ if (error == 0)
+ error = put_nvlist(zc, nv);
+ nvlist_free(nv);
+ }
+
+ return (error);
+}
+
+/*
+ * inputs:
+ * zc_name name of filesystem
+ * zc_nvlist_dst_size size of buffer for property nvlist
+ *
+ * outputs:
+ * zc_objset_stats stats
+ * zc_nvlist_dst property nvlist
+ * zc_nvlist_dst_size size of property nvlist
+ */
+static int
+zfs_ioc_objset_stats(zfs_cmd_t *zc)
+{
+ objset_t *os;
+ int error;
+
+ error = dmu_objset_hold(zc->zc_name, FTAG, &os);
+ if (error == 0) {
+ error = zfs_ioc_objset_stats_impl(zc, os);
+ dmu_objset_rele(os, FTAG);
+ }
+
+ return (error);
+}
+
+/*
+ * inputs:
+ * zc_name name of filesystem
+ * zc_nvlist_dst_size size of buffer for property nvlist
+ *
+ * outputs:
+ * zc_nvlist_dst received property nvlist
+ * zc_nvlist_dst_size size of received property nvlist
+ *
+ * Gets received properties (distinct from local properties on or after
+ * SPA_VERSION_RECVD_PROPS) for callers who want to differentiate received from
+ * local property values.
+ */
+static int
+zfs_ioc_objset_recvd_props(zfs_cmd_t *zc)
+{
+ int error = 0;
+ nvlist_t *nv;
+
+ /*
+ * Without this check, we would return local property values if the
+ * caller has not already received properties on or after
+ * SPA_VERSION_RECVD_PROPS.
+ */
+ if (!dsl_prop_get_hasrecvd(zc->zc_name))
+ return (SET_ERROR(ENOTSUP));
+
+ if (zc->zc_nvlist_dst != 0 &&
+ (error = dsl_prop_get_received(zc->zc_name, &nv)) == 0) {
+ error = put_nvlist(zc, nv);
+ nvlist_free(nv);
+ }
+
+ return (error);
+}
+
+static int
+nvl_add_zplprop(objset_t *os, nvlist_t *props, zfs_prop_t prop)
+{
+ uint64_t value;
+ int error;
+
+ /*
+ * zfs_get_zplprop() will either find a value or give us
+ * the default value (if there is one).
+ */
+ if ((error = zfs_get_zplprop(os, prop, &value)) != 0)
+ return (error);
+ VERIFY(nvlist_add_uint64(props, zfs_prop_to_name(prop), value) == 0);
+ return (0);
+}
+
+/*
+ * inputs:
+ * zc_name name of filesystem
+ * zc_nvlist_dst_size size of buffer for zpl property nvlist
+ *
+ * outputs:
+ * zc_nvlist_dst zpl property nvlist
+ * zc_nvlist_dst_size size of zpl property nvlist
+ */
+static int
+zfs_ioc_objset_zplprops(zfs_cmd_t *zc)
+{
+ objset_t *os;
+ int err;
+
+ /* XXX reading without owning */
+ if ((err = dmu_objset_hold(zc->zc_name, FTAG, &os)))
+ return (err);
+
+ dmu_objset_fast_stat(os, &zc->zc_objset_stats);
+
+ /*
+ * NB: nvl_add_zplprop() will read the objset contents,
+ * which we aren't supposed to do with a DS_MODE_USER
+ * hold, because it could be inconsistent.
+ */
+ if (zc->zc_nvlist_dst != 0 &&
+ !zc->zc_objset_stats.dds_inconsistent &&
+ dmu_objset_type(os) == DMU_OST_ZFS) {
+ nvlist_t *nv;
+
+ VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+ if ((err = nvl_add_zplprop(os, nv, ZFS_PROP_VERSION)) == 0 &&
+ (err = nvl_add_zplprop(os, nv, ZFS_PROP_NORMALIZE)) == 0 &&
+ (err = nvl_add_zplprop(os, nv, ZFS_PROP_UTF8ONLY)) == 0 &&
+ (err = nvl_add_zplprop(os, nv, ZFS_PROP_CASE)) == 0)
+ err = put_nvlist(zc, nv);
+ nvlist_free(nv);
+ } else {
+ err = SET_ERROR(ENOENT);
+ }
+ dmu_objset_rele(os, FTAG);
+ return (err);
+}
+
+/*
+ * inputs:
+ * zc_name name of filesystem
+ * zc_cookie zap cursor
+ * zc_nvlist_dst_size size of buffer for property nvlist
+ *
+ * outputs:
+ * zc_name name of next filesystem
+ * zc_cookie zap cursor
+ * zc_objset_stats stats
+ * zc_nvlist_dst property nvlist
+ * zc_nvlist_dst_size size of property nvlist
+ */
+static int
+zfs_ioc_dataset_list_next(zfs_cmd_t *zc)
+{
+ objset_t *os;
+ int error;
+ char *p;
+ size_t orig_len = strlen(zc->zc_name);
+
+top:
+ if ((error = dmu_objset_hold(zc->zc_name, FTAG, &os))) {
+ if (error == ENOENT)
+ error = SET_ERROR(ESRCH);
+ return (error);
+ }
+
+ p = strrchr(zc->zc_name, '/');
+ if (p == NULL || p[1] != '\0')
+ (void) strlcat(zc->zc_name, "/", sizeof (zc->zc_name));
+ p = zc->zc_name + strlen(zc->zc_name);
+
+ do {
+ error = dmu_dir_list_next(os,
+ sizeof (zc->zc_name) - (p - zc->zc_name), p,
+ NULL, &zc->zc_cookie);
+ if (error == ENOENT)
+ error = SET_ERROR(ESRCH);
+ } while (error == 0 && zfs_dataset_name_hidden(zc->zc_name));
+ dmu_objset_rele(os, FTAG);
+
+ /*
+	 * If it's an internal dataset (i.e. one with a '$' in its name),
+ * don't try to get stats for it, otherwise we'll return ENOENT.
+ */
+ if (error == 0 && strchr(zc->zc_name, '$') == NULL) {
+ error = zfs_ioc_objset_stats(zc); /* fill in the stats */
+ if (error == ENOENT) {
+ /* We lost a race with destroy, get the next one. */
+ zc->zc_name[orig_len] = '\0';
+ goto top;
+ }
+ }
+ return (error);
+}
+
+/*
+ * inputs:
+ * zc_name name of filesystem
+ * zc_cookie zap cursor
+ * zc_nvlist_src iteration range nvlist
+ * zc_nvlist_src_size size of iteration range nvlist
+ *
+ * outputs:
+ * zc_name name of next snapshot
+ * zc_objset_stats stats
+ * zc_nvlist_dst property nvlist
+ * zc_nvlist_dst_size size of property nvlist
+ */
+static int
+zfs_ioc_snapshot_list_next(zfs_cmd_t *zc)
+{
+ int error;
+ objset_t *os, *ossnap;
+ dsl_dataset_t *ds;
+ uint64_t min_txg = 0, max_txg = 0;
+
+ if (zc->zc_nvlist_src_size != 0) {
+ nvlist_t *props = NULL;
+ error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
+ zc->zc_iflags, &props);
+ if (error != 0)
+ return (error);
+ (void) nvlist_lookup_uint64(props, SNAP_ITER_MIN_TXG,
+ &min_txg);
+ (void) nvlist_lookup_uint64(props, SNAP_ITER_MAX_TXG,
+ &max_txg);
+ nvlist_free(props);
+ }
+
+ error = dmu_objset_hold(zc->zc_name, FTAG, &os);
+ if (error != 0) {
+ return (error == ENOENT ? SET_ERROR(ESRCH) : error);
+ }
+
+ /*
+ * A dataset name of maximum length cannot have any snapshots,
+ * so exit immediately.
+ */
+ if (strlcat(zc->zc_name, "@", sizeof (zc->zc_name)) >=
+ ZFS_MAX_DATASET_NAME_LEN) {
+ dmu_objset_rele(os, FTAG);
+ return (SET_ERROR(ESRCH));
+ }
+
+ while (error == 0) {
+ if (issig(JUSTLOOKING) && issig(FORREAL)) {
+ error = SET_ERROR(EINTR);
+ break;
+ }
+
+ error = dmu_snapshot_list_next(os,
+ sizeof (zc->zc_name) - strlen(zc->zc_name),
+ zc->zc_name + strlen(zc->zc_name), &zc->zc_obj,
+ &zc->zc_cookie, NULL);
+ if (error == ENOENT) {
+ error = SET_ERROR(ESRCH);
+ break;
+ } else if (error != 0) {
+ break;
+ }
+
+ error = dsl_dataset_hold_obj(dmu_objset_pool(os), zc->zc_obj,
+ FTAG, &ds);
+ if (error != 0)
+ break;
+
+ if ((min_txg != 0 && dsl_get_creationtxg(ds) < min_txg) ||
+ (max_txg != 0 && dsl_get_creationtxg(ds) > max_txg)) {
+ dsl_dataset_rele(ds, FTAG);
+ /* undo snapshot name append */
+ *(strchr(zc->zc_name, '@') + 1) = '\0';
+ /* skip snapshot */
+ continue;
+ }
+
+ if (zc->zc_simple) {
+ dsl_dataset_rele(ds, FTAG);
+ break;
+ }
+
+ if ((error = dmu_objset_from_ds(ds, &ossnap)) != 0) {
+ dsl_dataset_rele(ds, FTAG);
+ break;
+ }
+ if ((error = zfs_ioc_objset_stats_impl(zc, ossnap)) != 0) {
+ dsl_dataset_rele(ds, FTAG);
+ break;
+ }
+ dsl_dataset_rele(ds, FTAG);
+ break;
+ }
+
+ dmu_objset_rele(os, FTAG);
+ /* if we failed, undo the @ that we tacked on to zc_name */
+ if (error != 0)
+ *strchr(zc->zc_name, '@') = '\0';
+ return (error);
+}
+
+static int
+zfs_prop_set_userquota(const char *dsname, nvpair_t *pair)
+{
+ const char *propname = nvpair_name(pair);
+ uint64_t *valary;
+ unsigned int vallen;
+ const char *dash, *domain;
+ zfs_userquota_prop_t type;
+ uint64_t rid;
+ uint64_t quota;
+ zfsvfs_t *zfsvfs;
+ int err;
+
+ if (nvpair_type(pair) == DATA_TYPE_NVLIST) {
+ nvlist_t *attrs;
+ VERIFY(nvpair_value_nvlist(pair, &attrs) == 0);
+ if (nvlist_lookup_nvpair(attrs, ZPROP_VALUE,
+ &pair) != 0)
+ return (SET_ERROR(EINVAL));
+ }
+
+ /*
+ * A correctly constructed propname is encoded as
+ * userquota@<rid>-<domain>.
+ */
+ if ((dash = strchr(propname, '-')) == NULL ||
+ nvpair_value_uint64_array(pair, &valary, &vallen) != 0 ||
+ vallen != 3)
+ return (SET_ERROR(EINVAL));
+
+ domain = dash + 1;
+ type = valary[0];
+ rid = valary[1];
+ quota = valary[2];
+
+ err = zfsvfs_hold(dsname, FTAG, &zfsvfs, B_FALSE);
+ if (err == 0) {
+ err = zfs_set_userquota(zfsvfs, type, domain, rid, quota);
+ zfsvfs_rele(zfsvfs, FTAG);
+ }
+
+ return (err);
+}
+
+/*
+ * If the named property is one that has a special function to set its value,
+ * return 0 on success and a positive error code on failure; otherwise, if it
+ * is not one of the special properties handled here, return -1.
+ *
+ * XXX: It would be better for callers of the property interface if we handled
+ * these special cases in dsl_prop.c (in the dsl layer).
+ */
+static int
+zfs_prop_set_special(const char *dsname, zprop_source_t source,
+ nvpair_t *pair)
+{
+ const char *propname = nvpair_name(pair);
+ zfs_prop_t prop = zfs_name_to_prop(propname);
+ uint64_t intval = 0;
+ const char *strval = NULL;
+ int err = -1;
+
+ if (prop == ZPROP_INVAL) {
+ if (zfs_prop_userquota(propname))
+ return (zfs_prop_set_userquota(dsname, pair));
+ return (-1);
+ }
+
+ if (nvpair_type(pair) == DATA_TYPE_NVLIST) {
+ nvlist_t *attrs;
+ VERIFY(nvpair_value_nvlist(pair, &attrs) == 0);
+ VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE,
+ &pair) == 0);
+ }
+
+ /* all special properties are numeric except for keylocation */
+ if (zfs_prop_get_type(prop) == PROP_TYPE_STRING) {
+ strval = fnvpair_value_string(pair);
+ } else {
+ intval = fnvpair_value_uint64(pair);
+ }
+
+ switch (prop) {
+ case ZFS_PROP_QUOTA:
+ err = dsl_dir_set_quota(dsname, source, intval);
+ break;
+ case ZFS_PROP_REFQUOTA:
+ err = dsl_dataset_set_refquota(dsname, source, intval);
+ break;
+ case ZFS_PROP_FILESYSTEM_LIMIT:
+ case ZFS_PROP_SNAPSHOT_LIMIT:
+ if (intval == UINT64_MAX) {
+ /* clearing the limit, just do it */
+ err = 0;
+ } else {
+ err = dsl_dir_activate_fs_ss_limit(dsname);
+ }
+ /*
+ * Set err to -1 to force the zfs_set_prop_nvlist code down the
+ * default path to set the value in the nvlist.
+ */
+ if (err == 0)
+ err = -1;
+ break;
+ case ZFS_PROP_KEYLOCATION:
+ err = dsl_crypto_can_set_keylocation(dsname, strval);
+
+ /*
+ * Set err to -1 to force the zfs_set_prop_nvlist code down the
+ * default path to set the value in the nvlist.
+ */
+ if (err == 0)
+ err = -1;
+ break;
+ case ZFS_PROP_RESERVATION:
+ err = dsl_dir_set_reservation(dsname, source, intval);
+ break;
+ case ZFS_PROP_REFRESERVATION:
+ err = dsl_dataset_set_refreservation(dsname, source, intval);
+ break;
+ case ZFS_PROP_COMPRESSION:
+ err = dsl_dataset_set_compression(dsname, source, intval);
+ /*
+ * Set err to -1 to force the zfs_set_prop_nvlist code down the
+ * default path to set the value in the nvlist.
+ */
+ if (err == 0)
+ err = -1;
+ break;
+ case ZFS_PROP_VOLSIZE:
+ err = zvol_set_volsize(dsname, intval);
+ break;
+ case ZFS_PROP_SNAPDEV:
+ err = zvol_set_snapdev(dsname, source, intval);
+ break;
+ case ZFS_PROP_VOLMODE:
+ err = zvol_set_volmode(dsname, source, intval);
+ break;
+ case ZFS_PROP_VERSION:
+ {
+ zfsvfs_t *zfsvfs;
+
+ if ((err = zfsvfs_hold(dsname, FTAG, &zfsvfs, B_TRUE)) != 0)
+ break;
+
+ err = zfs_set_version(zfsvfs, intval);
+ zfsvfs_rele(zfsvfs, FTAG);
+
+ if (err == 0 && intval >= ZPL_VERSION_USERSPACE) {
+ zfs_cmd_t *zc;
+
+ zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP);
+ (void) strlcpy(zc->zc_name, dsname,
+ sizeof (zc->zc_name));
+ (void) zfs_ioc_userspace_upgrade(zc);
+ (void) zfs_ioc_id_quota_upgrade(zc);
+ kmem_free(zc, sizeof (zfs_cmd_t));
+ }
+ break;
+ }
+ default:
+ err = -1;
+ }
+
+ return (err);
+}
+
+/*
+ * This function is best effort. If it fails to set any of the given properties,
+ * it continues to set as many as it can and returns the last error
+ * encountered. If the caller provides a non-NULL errlist, it will be filled in
+ * with the list of names of all the properties that failed along with the
+ * corresponding error numbers.
+ *
+ * If every property is set successfully, zero is returned and errlist is not
+ * modified.
+ */
+int
+zfs_set_prop_nvlist(const char *dsname, zprop_source_t source, nvlist_t *nvl,
+ nvlist_t *errlist)
+{
+ nvpair_t *pair;
+ nvpair_t *propval;
+ int rv = 0;
+ uint64_t intval;
+ const char *strval;
+
+ nvlist_t *genericnvl = fnvlist_alloc();
+ nvlist_t *retrynvl = fnvlist_alloc();
+retry:
+ pair = NULL;
+ while ((pair = nvlist_next_nvpair(nvl, pair)) != NULL) {
+ const char *propname = nvpair_name(pair);
+ zfs_prop_t prop = zfs_name_to_prop(propname);
+ int err = 0;
+
+ /* decode the property value */
+ propval = pair;
+ if (nvpair_type(pair) == DATA_TYPE_NVLIST) {
+ nvlist_t *attrs;
+ attrs = fnvpair_value_nvlist(pair);
+ if (nvlist_lookup_nvpair(attrs, ZPROP_VALUE,
+ &propval) != 0)
+ err = SET_ERROR(EINVAL);
+ }
+
+ /* Validate value type */
+ if (err == 0 && source == ZPROP_SRC_INHERITED) {
+ /* inherited properties are expected to be booleans */
+ if (nvpair_type(propval) != DATA_TYPE_BOOLEAN)
+ err = SET_ERROR(EINVAL);
+ } else if (err == 0 && prop == ZPROP_INVAL) {
+ if (zfs_prop_user(propname)) {
+ if (nvpair_type(propval) != DATA_TYPE_STRING)
+ err = SET_ERROR(EINVAL);
+ } else if (zfs_prop_userquota(propname)) {
+ if (nvpair_type(propval) !=
+ DATA_TYPE_UINT64_ARRAY)
+ err = SET_ERROR(EINVAL);
+ } else {
+ err = SET_ERROR(EINVAL);
+ }
+ } else if (err == 0) {
+ if (nvpair_type(propval) == DATA_TYPE_STRING) {
+ if (zfs_prop_get_type(prop) != PROP_TYPE_STRING)
+ err = SET_ERROR(EINVAL);
+ } else if (nvpair_type(propval) == DATA_TYPE_UINT64) {
+ const char *unused;
+
+ intval = fnvpair_value_uint64(propval);
+
+ switch (zfs_prop_get_type(prop)) {
+ case PROP_TYPE_NUMBER:
+ break;
+ case PROP_TYPE_STRING:
+ err = SET_ERROR(EINVAL);
+ break;
+ case PROP_TYPE_INDEX:
+ if (zfs_prop_index_to_string(prop,
+ intval, &unused) != 0)
+ err =
+ SET_ERROR(ZFS_ERR_BADPROP);
+ break;
+ default:
+ cmn_err(CE_PANIC,
+ "unknown property type");
+ }
+ } else {
+ err = SET_ERROR(EINVAL);
+ }
+ }
+
+ /* Validate permissions */
+ if (err == 0)
+ err = zfs_check_settable(dsname, pair, CRED());
+
+ if (err == 0) {
+ if (source == ZPROP_SRC_INHERITED)
+ err = -1; /* does not need special handling */
+ else
+ err = zfs_prop_set_special(dsname, source,
+ pair);
+ if (err == -1) {
+ /*
+ * For better performance we build up a list of
+ * properties to set in a single transaction.
+ */
+ err = nvlist_add_nvpair(genericnvl, pair);
+ } else if (err != 0 && nvl != retrynvl) {
+ /*
+ * This may be a spurious error caused by
+ * receiving quota and reservation out of order.
+ * Try again in a second pass.
+ */
+ err = nvlist_add_nvpair(retrynvl, pair);
+ }
+ }
+
+ if (err != 0) {
+ if (errlist != NULL)
+ fnvlist_add_int32(errlist, propname, err);
+ rv = err;
+ }
+ }
+
+ if (nvl != retrynvl && !nvlist_empty(retrynvl)) {
+ nvl = retrynvl;
+ goto retry;
+ }
+
+ if (!nvlist_empty(genericnvl) &&
+ dsl_props_set(dsname, source, genericnvl) != 0) {
+ /*
+ * If this fails, we still want to set as many properties as we
+ * can, so try setting them individually.
+ */
+ pair = NULL;
+ while ((pair = nvlist_next_nvpair(genericnvl, pair)) != NULL) {
+ const char *propname = nvpair_name(pair);
+ int err = 0;
+
+ propval = pair;
+ if (nvpair_type(pair) == DATA_TYPE_NVLIST) {
+ nvlist_t *attrs;
+ attrs = fnvpair_value_nvlist(pair);
+ propval = fnvlist_lookup_nvpair(attrs,
+ ZPROP_VALUE);
+ }
+
+ if (nvpair_type(propval) == DATA_TYPE_STRING) {
+ strval = fnvpair_value_string(propval);
+ err = dsl_prop_set_string(dsname, propname,
+ source, strval);
+ } else if (nvpair_type(propval) == DATA_TYPE_BOOLEAN) {
+ err = dsl_prop_inherit(dsname, propname,
+ source);
+ } else {
+ intval = fnvpair_value_uint64(propval);
+ err = dsl_prop_set_int(dsname, propname, source,
+ intval);
+ }
+
+ if (err != 0) {
+ if (errlist != NULL) {
+ fnvlist_add_int32(errlist, propname,
+ err);
+ }
+ rv = err;
+ }
+ }
+ }
+ nvlist_free(genericnvl);
+ nvlist_free(retrynvl);
+
+ return (rv);
+}
+
+/*
+ * Check that all the properties are valid user properties.
+ */
+static int
+zfs_check_userprops(nvlist_t *nvl)
+{
+ nvpair_t *pair = NULL;
+
+ while ((pair = nvlist_next_nvpair(nvl, pair)) != NULL) {
+ const char *propname = nvpair_name(pair);
+
+ if (!zfs_prop_user(propname) ||
+ nvpair_type(pair) != DATA_TYPE_STRING)
+ return (SET_ERROR(EINVAL));
+
+ if (strlen(propname) >= ZAP_MAXNAMELEN)
+ return (SET_ERROR(ENAMETOOLONG));
+
+ if (strlen(fnvpair_value_string(pair)) >= ZAP_MAXVALUELEN)
+ return (SET_ERROR(E2BIG));
+ }
+ return (0);
+}
+
+static void
+props_skip(nvlist_t *props, nvlist_t *skipped, nvlist_t **newprops)
+{
+ nvpair_t *pair;
+
+ VERIFY(nvlist_alloc(newprops, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+
+ pair = NULL;
+ while ((pair = nvlist_next_nvpair(props, pair)) != NULL) {
+ if (nvlist_exists(skipped, nvpair_name(pair)))
+ continue;
+
+ VERIFY(nvlist_add_nvpair(*newprops, pair) == 0);
+ }
+}
+
+static int
+clear_received_props(const char *dsname, nvlist_t *props,
+ nvlist_t *skipped)
+{
+ int err = 0;
+ nvlist_t *cleared_props = NULL;
+ props_skip(props, skipped, &cleared_props);
+ if (!nvlist_empty(cleared_props)) {
+ /*
+ * Acts on local properties until the dataset has received
+ * properties at least once on or after SPA_VERSION_RECVD_PROPS.
+ */
+ zprop_source_t flags = (ZPROP_SRC_NONE |
+ (dsl_prop_get_hasrecvd(dsname) ? ZPROP_SRC_RECEIVED : 0));
+ err = zfs_set_prop_nvlist(dsname, flags, cleared_props, NULL);
+ }
+ nvlist_free(cleared_props);
+ return (err);
+}
+
+/*
+ * inputs:
+ * zc_name name of filesystem
+ * zc_value name of property to set
+ * zc_nvlist_src{_size} nvlist of properties to apply
+ * zc_cookie received properties flag
+ *
+ * outputs:
+ * zc_nvlist_dst{_size} error for each unapplied received property
+ */
+static int
+zfs_ioc_set_prop(zfs_cmd_t *zc)
+{
+ nvlist_t *nvl;
+ boolean_t received = zc->zc_cookie;
+ zprop_source_t source = (received ? ZPROP_SRC_RECEIVED :
+ ZPROP_SRC_LOCAL);
+ nvlist_t *errors;
+ int error;
+
+ if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
+ zc->zc_iflags, &nvl)) != 0)
+ return (error);
+
+ if (received) {
+ nvlist_t *origprops;
+
+ if (dsl_prop_get_received(zc->zc_name, &origprops) == 0) {
+ (void) clear_received_props(zc->zc_name,
+ origprops, nvl);
+ nvlist_free(origprops);
+ }
+
+ error = dsl_prop_set_hasrecvd(zc->zc_name);
+ }
+
+ errors = fnvlist_alloc();
+ if (error == 0)
+ error = zfs_set_prop_nvlist(zc->zc_name, source, nvl, errors);
+
+ if (zc->zc_nvlist_dst != 0 && errors != NULL) {
+ (void) put_nvlist(zc, errors);
+ }
+
+ nvlist_free(errors);
+ nvlist_free(nvl);
+ return (error);
+}
+
+/*
+ * inputs:
+ * zc_name name of filesystem
+ * zc_value name of property to inherit
+ * zc_cookie revert to received value if TRUE
+ *
+ * outputs: none
+ */
+static int
+zfs_ioc_inherit_prop(zfs_cmd_t *zc)
+{
+ const char *propname = zc->zc_value;
+ zfs_prop_t prop = zfs_name_to_prop(propname);
+ boolean_t received = zc->zc_cookie;
+ zprop_source_t source = (received
+ ? ZPROP_SRC_NONE /* revert to received value, if any */
+ : ZPROP_SRC_INHERITED); /* explicitly inherit */
+ nvlist_t *dummy;
+ nvpair_t *pair;
+ zprop_type_t type;
+ int err;
+
+ if (!received) {
+ /*
+		 * Only check this in the non-received case: 'inherit -S' must
+		 * be able to revert non-inheritable properties such as quota
+		 * and reservation to their received or default values.
+ */
+ if (prop != ZPROP_INVAL && !zfs_prop_inheritable(prop))
+ return (SET_ERROR(EINVAL));
+ }
+
+ if (prop == ZPROP_INVAL) {
+ if (!zfs_prop_user(propname))
+ return (SET_ERROR(EINVAL));
+
+ type = PROP_TYPE_STRING;
+ } else if (prop == ZFS_PROP_VOLSIZE || prop == ZFS_PROP_VERSION) {
+ return (SET_ERROR(EINVAL));
+ } else {
+ type = zfs_prop_get_type(prop);
+ }
+
+ /*
+ * zfs_prop_set_special() expects properties in the form of an
+ * nvpair with type info.
+ */
+ dummy = fnvlist_alloc();
+
+ switch (type) {
+ case PROP_TYPE_STRING:
+ VERIFY(0 == nvlist_add_string(dummy, propname, ""));
+ break;
+ case PROP_TYPE_NUMBER:
+ case PROP_TYPE_INDEX:
+ VERIFY(0 == nvlist_add_uint64(dummy, propname, 0));
+ break;
+ default:
+ err = SET_ERROR(EINVAL);
+ goto errout;
+ }
+
+ pair = nvlist_next_nvpair(dummy, NULL);
+ if (pair == NULL) {
+ err = SET_ERROR(EINVAL);
+ } else {
+ err = zfs_prop_set_special(zc->zc_name, source, pair);
+ if (err == -1) /* property is not "special", needs handling */
+ err = dsl_prop_inherit(zc->zc_name, zc->zc_value,
+ source);
+ }
+
+errout:
+ nvlist_free(dummy);
+ return (err);
+}
+
+static int
+zfs_ioc_pool_set_props(zfs_cmd_t *zc)
+{
+ nvlist_t *props;
+ spa_t *spa;
+ int error;
+ nvpair_t *pair;
+
+ if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
+ zc->zc_iflags, &props)))
+ return (error);
+
+ /*
+	 * If the only property is the cachefile, then just do a spa_lookup()
+ * to handle the faulted case.
+ */
+ pair = nvlist_next_nvpair(props, NULL);
+ if (pair != NULL && strcmp(nvpair_name(pair),
+ zpool_prop_to_name(ZPOOL_PROP_CACHEFILE)) == 0 &&
+ nvlist_next_nvpair(props, pair) == NULL) {
+ mutex_enter(&spa_namespace_lock);
+ if ((spa = spa_lookup(zc->zc_name)) != NULL) {
+ spa_configfile_set(spa, props, B_FALSE);
+ spa_write_cachefile(spa, B_FALSE, B_TRUE);
+ }
+ mutex_exit(&spa_namespace_lock);
+ if (spa != NULL) {
+ nvlist_free(props);
+ return (0);
+ }
+ }
+
+ if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) {
+ nvlist_free(props);
+ return (error);
+ }
+
+ error = spa_prop_set(spa, props);
+
+ nvlist_free(props);
+ spa_close(spa, FTAG);
+
+ return (error);
+}
+
+static int
+zfs_ioc_pool_get_props(zfs_cmd_t *zc)
+{
+ spa_t *spa;
+ int error;
+ nvlist_t *nvp = NULL;
+
+ if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) {
+ /*
+ * If the pool is faulted, there may be properties we can still
+ * get (such as altroot and cachefile), so attempt to get them
+ * anyway.
+ */
+ mutex_enter(&spa_namespace_lock);
+ if ((spa = spa_lookup(zc->zc_name)) != NULL)
+ error = spa_prop_get(spa, &nvp);
+ mutex_exit(&spa_namespace_lock);
+ } else {
+ error = spa_prop_get(spa, &nvp);
+ spa_close(spa, FTAG);
+ }
+
+ if (error == 0 && zc->zc_nvlist_dst != 0)
+ error = put_nvlist(zc, nvp);
+ else
+ error = SET_ERROR(EFAULT);
+
+ nvlist_free(nvp);
+ return (error);
+}
+
+/*
+ * inputs:
+ * zc_name name of filesystem
+ * zc_nvlist_src{_size} nvlist of delegated permissions
+ * zc_perm_action allow/unallow flag
+ *
+ * outputs: none
+ */
+static int
+zfs_ioc_set_fsacl(zfs_cmd_t *zc)
+{
+ int error;
+ nvlist_t *fsaclnv = NULL;
+
+ if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
+ zc->zc_iflags, &fsaclnv)) != 0)
+ return (error);
+
+ /*
+ * Verify nvlist is constructed correctly
+ */
+ if ((error = zfs_deleg_verify_nvlist(fsaclnv)) != 0) {
+ nvlist_free(fsaclnv);
+ return (SET_ERROR(EINVAL));
+ }
+
+ /*
+ * If we don't have PRIV_SYS_MOUNT, then validate
+	 * that the user is allowed to hand out each permission in
+	 * the nvlist(s).
+ */
+
+ error = secpolicy_zfs(CRED());
+ if (error != 0) {
+ if (zc->zc_perm_action == B_FALSE) {
+ error = dsl_deleg_can_allow(zc->zc_name,
+ fsaclnv, CRED());
+ } else {
+ error = dsl_deleg_can_unallow(zc->zc_name,
+ fsaclnv, CRED());
+ }
+ }
+
+ if (error == 0)
+ error = dsl_deleg_set(zc->zc_name, fsaclnv, zc->zc_perm_action);
+
+ nvlist_free(fsaclnv);
+ return (error);
+}
+
+/*
+ * inputs:
+ * zc_name name of filesystem
+ *
+ * outputs:
+ * zc_nvlist_src{_size} nvlist of delegated permissions
+ */
+static int
+zfs_ioc_get_fsacl(zfs_cmd_t *zc)
+{
+ nvlist_t *nvp;
+ int error;
+
+ if ((error = dsl_deleg_get(zc->zc_name, &nvp)) == 0) {
+ error = put_nvlist(zc, nvp);
+ nvlist_free(nvp);
+ }
+
+ return (error);
+}
+
+/* ARGSUSED */
+static void
+zfs_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx)
+{
+ zfs_creat_t *zct = arg;
+
+ zfs_create_fs(os, cr, zct->zct_zplprops, tx);
+}
+
+#define ZFS_PROP_UNDEFINED ((uint64_t)-1)
+
+/*
+ * inputs:
+ * os parent objset pointer (NULL if root fs)
+ * fuids_ok fuids allowed in this version of the spa?
+ * sa_ok SAs allowed in this version of the spa?
+ * createprops list of properties requested by creator
+ *
+ * outputs:
+ * zplprops values for the zplprops we attach to the master node object
+ * is_ci true if requested file system will be purely case-insensitive
+ *
+ * Determine the settings for utf8only, normalization and
+ * casesensitivity. Specific values may have been requested by the
+ * creator and/or we can inherit values from the parent dataset. If
+ * the file system is of too early a vintage, a creator cannot
+ * request settings for these properties, even if the requested
+ * setting is the default value. We don't actually want to create dsl
+ * properties for these, so remove them from the source nvlist after
+ * processing.
+ */
+static int
+zfs_fill_zplprops_impl(objset_t *os, uint64_t zplver,
+ boolean_t fuids_ok, boolean_t sa_ok, nvlist_t *createprops,
+ nvlist_t *zplprops, boolean_t *is_ci)
+{
+ uint64_t sense = ZFS_PROP_UNDEFINED;
+ uint64_t norm = ZFS_PROP_UNDEFINED;
+ uint64_t u8 = ZFS_PROP_UNDEFINED;
+ int error;
+
+ ASSERT(zplprops != NULL);
+
+ /* parent dataset must be a filesystem */
+ if (os != NULL && os->os_phys->os_type != DMU_OST_ZFS)
+ return (SET_ERROR(ZFS_ERR_WRONG_PARENT));
+
+ /*
+ * Pull out creator prop choices, if any.
+ */
+ if (createprops) {
+ (void) nvlist_lookup_uint64(createprops,
+ zfs_prop_to_name(ZFS_PROP_VERSION), &zplver);
+ (void) nvlist_lookup_uint64(createprops,
+ zfs_prop_to_name(ZFS_PROP_NORMALIZE), &norm);
+ (void) nvlist_remove_all(createprops,
+ zfs_prop_to_name(ZFS_PROP_NORMALIZE));
+ (void) nvlist_lookup_uint64(createprops,
+ zfs_prop_to_name(ZFS_PROP_UTF8ONLY), &u8);
+ (void) nvlist_remove_all(createprops,
+ zfs_prop_to_name(ZFS_PROP_UTF8ONLY));
+ (void) nvlist_lookup_uint64(createprops,
+ zfs_prop_to_name(ZFS_PROP_CASE), &sense);
+ (void) nvlist_remove_all(createprops,
+ zfs_prop_to_name(ZFS_PROP_CASE));
+ }
+
+ /*
+ * If the zpl version requested is whacky or the file system
+	 * or pool version is too "young" to support normalization
+ * and the creator tried to set a value for one of the props,
+ * error out.
+ */
+ if ((zplver < ZPL_VERSION_INITIAL || zplver > ZPL_VERSION) ||
+ (zplver >= ZPL_VERSION_FUID && !fuids_ok) ||
+ (zplver >= ZPL_VERSION_SA && !sa_ok) ||
+ (zplver < ZPL_VERSION_NORMALIZATION &&
+ (norm != ZFS_PROP_UNDEFINED || u8 != ZFS_PROP_UNDEFINED ||
+ sense != ZFS_PROP_UNDEFINED)))
+ return (SET_ERROR(ENOTSUP));
+
+ /*
+ * Put the version in the zplprops
+ */
+ VERIFY(nvlist_add_uint64(zplprops,
+ zfs_prop_to_name(ZFS_PROP_VERSION), zplver) == 0);
+
+ if (norm == ZFS_PROP_UNDEFINED &&
+ (error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &norm)) != 0)
+ return (error);
+ VERIFY(nvlist_add_uint64(zplprops,
+ zfs_prop_to_name(ZFS_PROP_NORMALIZE), norm) == 0);
+
+ /*
+ * If we're normalizing, names must always be valid UTF-8 strings.
+ */
+ if (norm)
+ u8 = 1;
+ if (u8 == ZFS_PROP_UNDEFINED &&
+ (error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &u8)) != 0)
+ return (error);
+ VERIFY(nvlist_add_uint64(zplprops,
+ zfs_prop_to_name(ZFS_PROP_UTF8ONLY), u8) == 0);
+
+ if (sense == ZFS_PROP_UNDEFINED &&
+ (error = zfs_get_zplprop(os, ZFS_PROP_CASE, &sense)) != 0)
+ return (error);
+ VERIFY(nvlist_add_uint64(zplprops,
+ zfs_prop_to_name(ZFS_PROP_CASE), sense) == 0);
+
+ if (is_ci)
+ *is_ci = (sense == ZFS_CASE_INSENSITIVE);
+
+ return (0);
+}
+
+static int
+zfs_fill_zplprops(const char *dataset, nvlist_t *createprops,
+ nvlist_t *zplprops, boolean_t *is_ci)
+{
+ boolean_t fuids_ok, sa_ok;
+ uint64_t zplver = ZPL_VERSION;
+ objset_t *os = NULL;
+ char parentname[ZFS_MAX_DATASET_NAME_LEN];
+ spa_t *spa;
+ uint64_t spa_vers;
+ int error;
+
+ zfs_get_parent(dataset, parentname, sizeof (parentname));
+
+ if ((error = spa_open(dataset, &spa, FTAG)) != 0)
+ return (error);
+
+ spa_vers = spa_version(spa);
+ spa_close(spa, FTAG);
+
+ zplver = zfs_zpl_version_map(spa_vers);
+ fuids_ok = (zplver >= ZPL_VERSION_FUID);
+ sa_ok = (zplver >= ZPL_VERSION_SA);
+
+ /*
+ * Open parent object set so we can inherit zplprop values.
+ */
+ if ((error = dmu_objset_hold(parentname, FTAG, &os)) != 0)
+ return (error);
+
+ error = zfs_fill_zplprops_impl(os, zplver, fuids_ok, sa_ok, createprops,
+ zplprops, is_ci);
+ dmu_objset_rele(os, FTAG);
+ return (error);
+}
+
+static int
+zfs_fill_zplprops_root(uint64_t spa_vers, nvlist_t *createprops,
+ nvlist_t *zplprops, boolean_t *is_ci)
+{
+ boolean_t fuids_ok;
+ boolean_t sa_ok;
+ uint64_t zplver = ZPL_VERSION;
+ int error;
+
+ zplver = zfs_zpl_version_map(spa_vers);
+ fuids_ok = (zplver >= ZPL_VERSION_FUID);
+ sa_ok = (zplver >= ZPL_VERSION_SA);
+
+ error = zfs_fill_zplprops_impl(NULL, zplver, fuids_ok, sa_ok,
+ createprops, zplprops, is_ci);
+ return (error);
+}
+
+/*
+ * innvl: {
+ * "type" -> dmu_objset_type_t (int32)
+ * (optional) "props" -> { prop -> value }
+ * (optional) "hidden_args" -> { "wkeydata" -> value }
+ * raw uint8_t array of encryption wrapping key data (32 bytes)
+ * }
+ *
+ * outnvl: propname -> error code (int32)
+ */
+
+static const zfs_ioc_key_t zfs_keys_create[] = {
+ {"type", DATA_TYPE_INT32, 0},
+ {"props", DATA_TYPE_NVLIST, ZK_OPTIONAL},
+ {"hidden_args", DATA_TYPE_NVLIST, ZK_OPTIONAL},
+};
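+/*
+ * Illustrative example (a sketch, not part of the interface definition; the
+ * values are placeholders): a caller creating a 1 GiB zvol might pass an
+ * innvl along the lines of
+ *
+ *     "type"  -> DMU_OST_ZVOL (int32)
+ *     "props" -> { "volsize" -> 1073741824, "volblocksize" -> 8192 }
+ *
+ * Any property that cannot be applied after the dataset is created is
+ * reported in outnvl as propname -> error code.
+ */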
+
+static int
+zfs_ioc_create(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl)
+{
+ int error = 0;
+ zfs_creat_t zct = { 0 };
+ nvlist_t *nvprops = NULL;
+ nvlist_t *hidden_args = NULL;
+ void (*cbfunc)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx);
+ dmu_objset_type_t type;
+ boolean_t is_insensitive = B_FALSE;
+ dsl_crypto_params_t *dcp = NULL;
+
+ type = (dmu_objset_type_t)fnvlist_lookup_int32(innvl, "type");
+ (void) nvlist_lookup_nvlist(innvl, "props", &nvprops);
+ (void) nvlist_lookup_nvlist(innvl, ZPOOL_HIDDEN_ARGS, &hidden_args);
+
+ switch (type) {
+ case DMU_OST_ZFS:
+ cbfunc = zfs_create_cb;
+ break;
+
+ case DMU_OST_ZVOL:
+ cbfunc = zvol_create_cb;
+ break;
+
+ default:
+ cbfunc = NULL;
+ break;
+ }
+ if (strchr(fsname, '@') ||
+ strchr(fsname, '%'))
+ return (SET_ERROR(EINVAL));
+
+ zct.zct_props = nvprops;
+
+ if (cbfunc == NULL)
+ return (SET_ERROR(EINVAL));
+
+ if (type == DMU_OST_ZVOL) {
+ uint64_t volsize, volblocksize;
+
+ if (nvprops == NULL)
+ return (SET_ERROR(EINVAL));
+ if (nvlist_lookup_uint64(nvprops,
+ zfs_prop_to_name(ZFS_PROP_VOLSIZE), &volsize) != 0)
+ return (SET_ERROR(EINVAL));
+
+ if ((error = nvlist_lookup_uint64(nvprops,
+ zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE),
+ &volblocksize)) != 0 && error != ENOENT)
+ return (SET_ERROR(EINVAL));
+
+ if (error != 0)
+ volblocksize = zfs_prop_default_numeric(
+ ZFS_PROP_VOLBLOCKSIZE);
+
+ if ((error = zvol_check_volblocksize(fsname,
+ volblocksize)) != 0 ||
+ (error = zvol_check_volsize(volsize,
+ volblocksize)) != 0)
+ return (error);
+ } else if (type == DMU_OST_ZFS) {
+ int error;
+
+ /*
+ * We have to have normalization and
+ * case-folding flags correct when we do the
+ * file system creation, so go figure them out
+ * now.
+ */
+ VERIFY(nvlist_alloc(&zct.zct_zplprops,
+ NV_UNIQUE_NAME, KM_SLEEP) == 0);
+ error = zfs_fill_zplprops(fsname, nvprops,
+ zct.zct_zplprops, &is_insensitive);
+ if (error != 0) {
+ nvlist_free(zct.zct_zplprops);
+ return (error);
+ }
+ }
+
+ error = dsl_crypto_params_create_nvlist(DCP_CMD_NONE, nvprops,
+ hidden_args, &dcp);
+ if (error != 0) {
+ nvlist_free(zct.zct_zplprops);
+ return (error);
+ }
+
+ error = dmu_objset_create(fsname, type,
+ is_insensitive ? DS_FLAG_CI_DATASET : 0, dcp, cbfunc, &zct);
+
+ nvlist_free(zct.zct_zplprops);
+ dsl_crypto_params_free(dcp, !!error);
+
+ /*
+ * It would be nice to do this atomically.
+ */
+ if (error == 0) {
+ error = zfs_set_prop_nvlist(fsname, ZPROP_SRC_LOCAL,
+ nvprops, outnvl);
+ if (error != 0) {
+ spa_t *spa;
+ int error2;
+
+ /*
+ * Volumes will return EBUSY and cannot be destroyed
+ * until all asynchronous minor handling (e.g. from
+ * setting the volmode property) has completed. Wait for
+ * the spa_zvol_taskq to drain then retry.
+ */
+ error2 = dsl_destroy_head(fsname);
+ while ((error2 == EBUSY) && (type == DMU_OST_ZVOL)) {
+ error2 = spa_open(fsname, &spa, FTAG);
+ if (error2 == 0) {
+ taskq_wait(spa->spa_zvol_taskq);
+ spa_close(spa, FTAG);
+ }
+ error2 = dsl_destroy_head(fsname);
+ }
+ }
+ }
+ return (error);
+}
+
+/*
+ * innvl: {
+ * "origin" -> name of origin snapshot
+ * (optional) "props" -> { prop -> value }
+ * (optional) "hidden_args" -> { "wkeydata" -> value }
+ * raw uint8_t array of encryption wrapping key data (32 bytes)
+ * }
+ *
+ * outputs:
+ * outnvl: propname -> error code (int32)
+ */
+static const zfs_ioc_key_t zfs_keys_clone[] = {
+ {"origin", DATA_TYPE_STRING, 0},
+ {"props", DATA_TYPE_NVLIST, ZK_OPTIONAL},
+ {"hidden_args", DATA_TYPE_NVLIST, ZK_OPTIONAL},
+};
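+/*
+ * Illustrative example (a sketch; the dataset and property names are only
+ * placeholders): cloning "pool/fs@snap" into the new dataset given by fsname
+ * might use an innvl such as
+ *
+ *     "origin" -> "pool/fs@snap"
+ *     "props"  -> { "mountpoint" -> "/export/clone" }
+ *
+ * Properties that fail to apply are reported in outnvl as
+ * propname -> error code.
+ */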
+
+static int
+zfs_ioc_clone(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl)
+{
+ int error = 0;
+ nvlist_t *nvprops = NULL;
+ const char *origin_name;
+
+ origin_name = fnvlist_lookup_string(innvl, "origin");
+ (void) nvlist_lookup_nvlist(innvl, "props", &nvprops);
+
+ if (strchr(fsname, '@') ||
+ strchr(fsname, '%'))
+ return (SET_ERROR(EINVAL));
+
+ if (dataset_namecheck(origin_name, NULL, NULL) != 0)
+ return (SET_ERROR(EINVAL));
+
+ error = dmu_objset_clone(fsname, origin_name);
+
+ /*
+ * It would be nice to do this atomically.
+ */
+ if (error == 0) {
+ error = zfs_set_prop_nvlist(fsname, ZPROP_SRC_LOCAL,
+ nvprops, outnvl);
+ if (error != 0)
+ (void) dsl_destroy_head(fsname);
+ }
+ return (error);
+}
+
+static const zfs_ioc_key_t zfs_keys_remap[] = {
+ /* no nvl keys */
+};
+
+/* ARGSUSED */
+static int
+zfs_ioc_remap(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl)
+{
+ /* This IOCTL is no longer supported. */
+ return (0);
+}
+
+/*
+ * innvl: {
+ * "snaps" -> { snapshot1, snapshot2 }
+ * (optional) "props" -> { prop -> value (string) }
+ * }
+ *
+ * outnvl: snapshot -> error code (int32)
+ */
+static const zfs_ioc_key_t zfs_keys_snapshot[] = {
+ {"snaps", DATA_TYPE_NVLIST, 0},
+ {"props", DATA_TYPE_NVLIST, ZK_OPTIONAL},
+};
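+/*
+ * Illustrative example (a sketch; names are placeholders): snapshotting two
+ * filesystems of the pool in one request, tagging them with a user property,
+ * might use an innvl such as
+ *
+ *     "snaps" -> { "pool/fs@nightly", "pool/home@nightly" }
+ *     "props" -> { "com.example:note" -> "nightly run" }
+ *
+ * Note that "props" may contain user properties only, and each filesystem may
+ * appear at most once in "snaps".
+ */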
+
+static int
+zfs_ioc_snapshot(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl)
+{
+ nvlist_t *snaps;
+ nvlist_t *props = NULL;
+ int error, poollen;
+ nvpair_t *pair;
+
+ (void) nvlist_lookup_nvlist(innvl, "props", &props);
+ if (!nvlist_empty(props) &&
+ zfs_earlier_version(poolname, SPA_VERSION_SNAP_PROPS))
+ return (SET_ERROR(ENOTSUP));
+ if ((error = zfs_check_userprops(props)) != 0)
+ return (error);
+
+ snaps = fnvlist_lookup_nvlist(innvl, "snaps");
+ poollen = strlen(poolname);
+ for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL;
+ pair = nvlist_next_nvpair(snaps, pair)) {
+ const char *name = nvpair_name(pair);
+ char *cp = strchr(name, '@');
+
+ /*
+ * The snap name must contain an @, and the part after it must
+ * contain only valid characters.
+ */
+ if (cp == NULL ||
+ zfs_component_namecheck(cp + 1, NULL, NULL) != 0)
+ return (SET_ERROR(EINVAL));
+
+ /*
+ * The snap must be in the specified pool.
+ */
+ if (strncmp(name, poolname, poollen) != 0 ||
+ (name[poollen] != '/' && name[poollen] != '@'))
+ return (SET_ERROR(EXDEV));
+
+ /*
+ * Check for permission to set the properties on the fs.
+ */
+ if (!nvlist_empty(props)) {
+ *cp = '\0';
+ error = zfs_secpolicy_write_perms(name,
+ ZFS_DELEG_PERM_USERPROP, CRED());
+ *cp = '@';
+ if (error != 0)
+ return (error);
+ }
+
+ /* This must be the only snap of this fs. */
+ for (nvpair_t *pair2 = nvlist_next_nvpair(snaps, pair);
+ pair2 != NULL; pair2 = nvlist_next_nvpair(snaps, pair2)) {
+ if (strncmp(name, nvpair_name(pair2), cp - name + 1)
+ == 0) {
+ return (SET_ERROR(EXDEV));
+ }
+ }
+ }
+
+ error = dsl_dataset_snapshot(snaps, props, outnvl);
+
+ return (error);
+}
+
+/*
+ * innvl: "message" -> string
+ */
+static const zfs_ioc_key_t zfs_keys_log_history[] = {
+ {"message", DATA_TYPE_STRING, 0},
+};
+
+/* ARGSUSED */
+static int
+zfs_ioc_log_history(const char *unused, nvlist_t *innvl, nvlist_t *outnvl)
+{
+ const char *message;
+ char *poolname;
+ spa_t *spa;
+ int error;
+
+ /*
+	 * The poolname in the ioctl is not set; we get it from the TSD,
+ * which was set at the end of the last successful ioctl that allows
+ * logging. The secpolicy func already checked that it is set.
+ * Only one log ioctl is allowed after each successful ioctl, so
+ * we clear the TSD here.
+ */
+ poolname = tsd_get(zfs_allow_log_key);
+ if (poolname == NULL)
+ return (SET_ERROR(EINVAL));
+ (void) tsd_set(zfs_allow_log_key, NULL);
+ error = spa_open(poolname, &spa, FTAG);
+ kmem_strfree(poolname);
+ if (error != 0)
+ return (error);
+
+ message = fnvlist_lookup_string(innvl, "message");
+
+ if (spa_version(spa) < SPA_VERSION_ZPOOL_HISTORY) {
+ spa_close(spa, FTAG);
+ return (SET_ERROR(ENOTSUP));
+ }
+
+ error = spa_history_log(spa, message);
+ spa_close(spa, FTAG);
+ return (error);
+}
+
+/*
+ * This ioctl is used to set the bootenv configuration on the current
+ * pool. This configuration is stored in the second padding area of the label,
+ * and it is used by the bootloader(s) to store bootloader- and/or
+ * system-specific data.
+ * The data is stored as an nvlist data stream and is protected by
+ * an embedded checksum.
+ * The version can have two possible values:
+ * VB_RAW: nvlist should have key GRUB_ENVMAP, value DATA_TYPE_STRING.
+ * VB_NVLIST: nvlist with arbitrary <key, value> pairs.
+ */
+static const zfs_ioc_key_t zfs_keys_set_bootenv[] = {
+ {"version", DATA_TYPE_UINT64, 0},
+ {"<keys>", DATA_TYPE_ANY, ZK_OPTIONAL | ZK_WILDCARDLIST},
+};
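+/*
+ * Illustrative example (a sketch; the environment string is a placeholder):
+ * a VB_RAW request might look like
+ *
+ *     "version"   -> VB_RAW (uint64)
+ *     GRUB_ENVMAP -> "<bootloader environment as a single string>"
+ *
+ * whereas a VB_NVLIST request carries arbitrary <key, value> pairs alongside
+ * the "version" key.
+ */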
+
+static int
+zfs_ioc_set_bootenv(const char *name, nvlist_t *innvl, nvlist_t *outnvl)
+{
+ int error;
+ spa_t *spa;
+
+ if ((error = spa_open(name, &spa, FTAG)) != 0)
+ return (error);
+ spa_vdev_state_enter(spa, SCL_ALL);
+ error = vdev_label_write_bootenv(spa->spa_root_vdev, innvl);
+ (void) spa_vdev_state_exit(spa, NULL, 0);
+ spa_close(spa, FTAG);
+ return (error);
+}
+
+static const zfs_ioc_key_t zfs_keys_get_bootenv[] = {
+ /* no nvl keys */
+};
+
+static int
+zfs_ioc_get_bootenv(const char *name, nvlist_t *innvl, nvlist_t *outnvl)
+{
+ spa_t *spa;
+ int error;
+
+ if ((error = spa_open(name, &spa, FTAG)) != 0)
+ return (error);
+ spa_vdev_state_enter(spa, SCL_ALL);
+ error = vdev_label_read_bootenv(spa->spa_root_vdev, outnvl);
+ (void) spa_vdev_state_exit(spa, NULL, 0);
+ spa_close(spa, FTAG);
+ return (error);
+}
+
+/*
+ * The dp_config_rwlock must not be held when calling this, because the
+ * unmount may need to write out data.
+ *
+ * This function is best-effort. Callers must deal gracefully if it
+ * remains mounted (or is remounted after this call).
+ *
+ * The function returns nothing; the result of the underlying unmount attempt
+ * is deliberately ignored, and a name without an '@' is simply a no-op.
+ */
+void
+zfs_unmount_snap(const char *snapname)
+{
+ if (strchr(snapname, '@') == NULL)
+ return;
+
+ (void) zfsctl_snapshot_unmount(snapname, MNT_FORCE);
+}
+
+/* ARGSUSED */
+static int
+zfs_unmount_snap_cb(const char *snapname, void *arg)
+{
+ zfs_unmount_snap(snapname);
+ return (0);
+}
+
+/*
+ * When a clone is destroyed, its origin may also need to be destroyed,
+ * in which case it must be unmounted. This routine will do that unmount
+ * if necessary.
+ */
+void
+zfs_destroy_unmount_origin(const char *fsname)
+{
+ int error;
+ objset_t *os;
+ dsl_dataset_t *ds;
+
+ error = dmu_objset_hold(fsname, FTAG, &os);
+ if (error != 0)
+ return;
+ ds = dmu_objset_ds(os);
+ if (dsl_dir_is_clone(ds->ds_dir) && DS_IS_DEFER_DESTROY(ds->ds_prev)) {
+ char originname[ZFS_MAX_DATASET_NAME_LEN];
+ dsl_dataset_name(ds->ds_prev, originname);
+ dmu_objset_rele(os, FTAG);
+ zfs_unmount_snap(originname);
+ } else {
+ dmu_objset_rele(os, FTAG);
+ }
+}
+
+/*
+ * innvl: {
+ * "snaps" -> { snapshot1, snapshot2 }
+ * (optional boolean) "defer"
+ * }
+ *
+ * outnvl: snapshot -> error code (int32)
+ */
+static const zfs_ioc_key_t zfs_keys_destroy_snaps[] = {
+ {"snaps", DATA_TYPE_NVLIST, 0},
+ {"defer", DATA_TYPE_BOOLEAN, ZK_OPTIONAL},
+};
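+/*
+ * Illustrative example (a sketch; names are placeholders): destroying two
+ * snapshots with deferred destruction might use an innvl such as
+ *
+ *     "snaps" -> { "pool/fs@old1", "pool/fs@old2" }
+ *     "defer" -> (present as a boolean flag)
+ *
+ * Both snapshots must belong to the pool named by poolname.
+ */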
+
+/* ARGSUSED */
+static int
+zfs_ioc_destroy_snaps(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl)
+{
+ int poollen;
+ nvlist_t *snaps;
+ nvpair_t *pair;
+ boolean_t defer;
+ spa_t *spa;
+
+ snaps = fnvlist_lookup_nvlist(innvl, "snaps");
+ defer = nvlist_exists(innvl, "defer");
+
+ poollen = strlen(poolname);
+ for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL;
+ pair = nvlist_next_nvpair(snaps, pair)) {
+ const char *name = nvpair_name(pair);
+
+ /*
+ * The snap must be in the specified pool to prevent the
+ * invalid removal of zvol minors below.
+ */
+ if (strncmp(name, poolname, poollen) != 0 ||
+ (name[poollen] != '/' && name[poollen] != '@'))
+ return (SET_ERROR(EXDEV));
+
+ zfs_unmount_snap(nvpair_name(pair));
+ if (spa_open(name, &spa, FTAG) == 0) {
+ zvol_remove_minors(spa, name, B_TRUE);
+ spa_close(spa, FTAG);
+ }
+ }
+
+ return (dsl_destroy_snapshots_nvl(snaps, defer, outnvl));
+}
+
+/*
+ * Create bookmarks. The bookmark names are of the form <fs>#<bmark>.
+ * All bookmarks and snapshots must be in the same pool.
+ * dsl_bookmark_create_nvl_validate describes the nvlist schema in more detail.
+ *
+ * innvl: {
+ * new_bookmark1 -> existing_snapshot,
+ * new_bookmark2 -> existing_bookmark,
+ * }
+ *
+ * outnvl: bookmark -> error code (int32)
+ *
+ */
+static const zfs_ioc_key_t zfs_keys_bookmark[] = {
+ {"<bookmark>...", DATA_TYPE_STRING, ZK_WILDCARDLIST},
+};
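+/*
+ * Illustrative example (a sketch; names are placeholders): creating one
+ * bookmark from a snapshot and a second one by copying an existing bookmark
+ * might use an innvl such as
+ *
+ *     "pool/fs#monday"      -> "pool/fs@monday"
+ *     "pool/fs#monday-copy" -> "pool/fs#monday"
+ */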
+
+/* ARGSUSED */
+static int
+zfs_ioc_bookmark(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl)
+{
+ return (dsl_bookmark_create(innvl, outnvl));
+}
+
+/*
+ * innvl: {
+ * property 1, property 2, ...
+ * }
+ *
+ * outnvl: {
+ * bookmark name 1 -> { property 1, property 2, ... },
+ * bookmark name 2 -> { property 1, property 2, ... }
+ * }
+ *
+ */
+static const zfs_ioc_key_t zfs_keys_get_bookmarks[] = {
+ {"<property>...", DATA_TYPE_BOOLEAN, ZK_WILDCARDLIST | ZK_OPTIONAL},
+};
+
+static int
+zfs_ioc_get_bookmarks(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl)
+{
+ return (dsl_get_bookmarks(fsname, innvl, outnvl));
+}
+
+/*
+ * innvl is not used.
+ *
+ * outnvl: {
+ * property 1, property 2, ...
+ * }
+ *
+ */
+static const zfs_ioc_key_t zfs_keys_get_bookmark_props[] = {
+ /* no nvl keys */
+};
+
+/* ARGSUSED */
+static int
+zfs_ioc_get_bookmark_props(const char *bookmark, nvlist_t *innvl,
+ nvlist_t *outnvl)
+{
+ char fsname[ZFS_MAX_DATASET_NAME_LEN];
+ char *bmname;
+
+ bmname = strchr(bookmark, '#');
+ if (bmname == NULL)
+ return (SET_ERROR(EINVAL));
+ bmname++;
+
+ (void) strlcpy(fsname, bookmark, sizeof (fsname));
+ *(strchr(fsname, '#')) = '\0';
+
+ return (dsl_get_bookmark_props(fsname, bmname, outnvl));
+}
+
+/*
+ * innvl: {
+ * bookmark name 1, bookmark name 2
+ * }
+ *
+ * outnvl: bookmark -> error code (int32)
+ *
+ */
+static const zfs_ioc_key_t zfs_keys_destroy_bookmarks[] = {
+ {"<bookmark>...", DATA_TYPE_BOOLEAN, ZK_WILDCARDLIST},
+};
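+/*
+ * Illustrative example (a sketch; names are placeholders): destroying two
+ * bookmarks in one request might use an innvl such as
+ *
+ *     "pool/fs#monday"      -> (boolean)
+ *     "pool/fs#monday-copy" -> (boolean)
+ *
+ * Both bookmarks must belong to the pool named by poolname.
+ */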
+
+static int
+zfs_ioc_destroy_bookmarks(const char *poolname, nvlist_t *innvl,
+ nvlist_t *outnvl)
+{
+ int error, poollen;
+
+ poollen = strlen(poolname);
+ for (nvpair_t *pair = nvlist_next_nvpair(innvl, NULL);
+ pair != NULL; pair = nvlist_next_nvpair(innvl, pair)) {
+ const char *name = nvpair_name(pair);
+ const char *cp = strchr(name, '#');
+
+ /*
+		 * The bookmark name must contain a '#', and the part after it
+ * must contain only valid characters.
+ */
+ if (cp == NULL ||
+ zfs_component_namecheck(cp + 1, NULL, NULL) != 0)
+ return (SET_ERROR(EINVAL));
+
+ /*
+ * The bookmark must be in the specified pool.
+ */
+ if (strncmp(name, poolname, poollen) != 0 ||
+ (name[poollen] != '/' && name[poollen] != '#'))
+ return (SET_ERROR(EXDEV));
+ }
+
+ error = dsl_bookmark_destroy(innvl, outnvl);
+ return (error);
+}
+
+static const zfs_ioc_key_t zfs_keys_channel_program[] = {
+ {"program", DATA_TYPE_STRING, 0},
+ {"arg", DATA_TYPE_ANY, 0},
+ {"sync", DATA_TYPE_BOOLEAN_VALUE, ZK_OPTIONAL},
+ {"instrlimit", DATA_TYPE_UINT64, ZK_OPTIONAL},
+ {"memlimit", DATA_TYPE_UINT64, ZK_OPTIONAL},
+};
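+/*
+ * Illustrative example (a sketch; the Lua source and limits are placeholders):
+ * running a trivial synchronous channel program might use an innvl such as
+ *
+ *     "program"    -> "return {}" (Lua source text)
+ *     "arg"        -> { ... }     (arguments made available to the script)
+ *     "sync"       -> B_TRUE      (optional; this is the default)
+ *     "instrlimit" -> 10000000    (optional; bounded by zfs_lua_max_instrlimit)
+ *     "memlimit"   -> 10485760    (optional; bounded by zfs_lua_max_memlimit)
+ */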
+
+static int
+zfs_ioc_channel_program(const char *poolname, nvlist_t *innvl,
+ nvlist_t *outnvl)
+{
+ char *program;
+ uint64_t instrlimit, memlimit;
+ boolean_t sync_flag;
+ nvpair_t *nvarg = NULL;
+
+ program = fnvlist_lookup_string(innvl, ZCP_ARG_PROGRAM);
+ if (0 != nvlist_lookup_boolean_value(innvl, ZCP_ARG_SYNC, &sync_flag)) {
+ sync_flag = B_TRUE;
+ }
+ if (0 != nvlist_lookup_uint64(innvl, ZCP_ARG_INSTRLIMIT, &instrlimit)) {
+ instrlimit = ZCP_DEFAULT_INSTRLIMIT;
+ }
+ if (0 != nvlist_lookup_uint64(innvl, ZCP_ARG_MEMLIMIT, &memlimit)) {
+ memlimit = ZCP_DEFAULT_MEMLIMIT;
+ }
+ nvarg = fnvlist_lookup_nvpair(innvl, ZCP_ARG_ARGLIST);
+
+ if (instrlimit == 0 || instrlimit > zfs_lua_max_instrlimit)
+ return (SET_ERROR(EINVAL));
+ if (memlimit == 0 || memlimit > zfs_lua_max_memlimit)
+ return (SET_ERROR(EINVAL));
+
+ return (zcp_eval(poolname, program, sync_flag, instrlimit, memlimit,
+ nvarg, outnvl));
+}
+
+/*
+ * innvl: unused
+ * outnvl: empty
+ */
+static const zfs_ioc_key_t zfs_keys_pool_checkpoint[] = {
+ /* no nvl keys */
+};
+
+/* ARGSUSED */
+static int
+zfs_ioc_pool_checkpoint(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl)
+{
+ return (spa_checkpoint(poolname));
+}
+
+/*
+ * innvl: unused
+ * outnvl: empty
+ */
+static const zfs_ioc_key_t zfs_keys_pool_discard_checkpoint[] = {
+ /* no nvl keys */
+};
+
+/* ARGSUSED */
+static int
+zfs_ioc_pool_discard_checkpoint(const char *poolname, nvlist_t *innvl,
+ nvlist_t *outnvl)
+{
+ return (spa_checkpoint_discard(poolname));
+}
+
+/*
+ * inputs:
+ * zc_name name of dataset to destroy
+ * zc_defer_destroy mark for deferred destroy
+ *
+ * outputs: none
+ */
+static int
+zfs_ioc_destroy(zfs_cmd_t *zc)
+{
+ objset_t *os;
+ dmu_objset_type_t ost;
+ int err;
+
+ err = dmu_objset_hold(zc->zc_name, FTAG, &os);
+ if (err != 0)
+ return (err);
+ ost = dmu_objset_type(os);
+ dmu_objset_rele(os, FTAG);
+
+ if (ost == DMU_OST_ZFS)
+ zfs_unmount_snap(zc->zc_name);
+
+ if (strchr(zc->zc_name, '@')) {
+ err = dsl_destroy_snapshot(zc->zc_name, zc->zc_defer_destroy);
+ } else {
+ err = dsl_destroy_head(zc->zc_name);
+ if (err == EEXIST) {
+ /*
+			 * The given dataset may have hidden child (%recv)
+			 * datasets - "leftovers" from a previously
+			 * interrupted 'zfs receive'.
+ *
+ * 6 extra bytes for /%recv
+ */
+ char namebuf[ZFS_MAX_DATASET_NAME_LEN + 6];
+
+ if (snprintf(namebuf, sizeof (namebuf), "%s/%s",
+ zc->zc_name, recv_clone_name) >=
+ sizeof (namebuf))
+ return (SET_ERROR(EINVAL));
+
+ /*
+ * Try to remove the hidden child (%recv) and after
+ * that try to remove the target dataset.
+			 * If the hidden child (%recv) does not exist,
+			 * the original error (EEXIST) is returned.
+ */
+ err = dsl_destroy_head(namebuf);
+ if (err == 0)
+ err = dsl_destroy_head(zc->zc_name);
+ else if (err == ENOENT)
+ err = SET_ERROR(EEXIST);
+ }
+ }
+
+ return (err);
+}
+
+/*
+ * innvl: {
+ * "initialize_command" -> POOL_INITIALIZE_{CANCEL|START|SUSPEND} (uint64)
+ * "initialize_vdevs": { -> guids to initialize (nvlist)
+ * "vdev_path_1": vdev_guid_1, (uint64),
+ * "vdev_path_2": vdev_guid_2, (uint64),
+ * ...
+ * },
+ * }
+ *
+ * outnvl: {
+ * "initialize_vdevs": { -> initialization errors (nvlist)
+ * "vdev_path_1": errno, see function body for possible errnos (uint64)
+ * "vdev_path_2": errno, ... (uint64)
+ * ...
+ * }
+ * }
+ *
+ * EINVAL is returned for an unknown command or if any of the provided vdev
+ * guids have been specified with a type other than uint64.
+ */
+static const zfs_ioc_key_t zfs_keys_pool_initialize[] = {
+ {ZPOOL_INITIALIZE_COMMAND, DATA_TYPE_UINT64, 0},
+ {ZPOOL_INITIALIZE_VDEVS, DATA_TYPE_NVLIST, 0}
+};
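+/*
+ * Illustrative example (a sketch; the vdev path and guid are placeholders):
+ * starting initialization on a single vdev might use an innvl such as
+ *
+ *     "initialize_command" -> POOL_INITIALIZE_START (uint64)
+ *     "initialize_vdevs"   -> { "/dev/sda1" -> 0x14d2e0f4ab6b2c7e }
+ *
+ * Per-vdev failures come back in outnvl under "initialize_vdevs".
+ */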
+
+static int
+zfs_ioc_pool_initialize(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl)
+{
+ uint64_t cmd_type;
+ if (nvlist_lookup_uint64(innvl, ZPOOL_INITIALIZE_COMMAND,
+ &cmd_type) != 0) {
+ return (SET_ERROR(EINVAL));
+ }
+
+ if (!(cmd_type == POOL_INITIALIZE_CANCEL ||
+ cmd_type == POOL_INITIALIZE_START ||
+ cmd_type == POOL_INITIALIZE_SUSPEND)) {
+ return (SET_ERROR(EINVAL));
+ }
+
+ nvlist_t *vdev_guids;
+ if (nvlist_lookup_nvlist(innvl, ZPOOL_INITIALIZE_VDEVS,
+ &vdev_guids) != 0) {
+ return (SET_ERROR(EINVAL));
+ }
+
+ for (nvpair_t *pair = nvlist_next_nvpair(vdev_guids, NULL);
+ pair != NULL; pair = nvlist_next_nvpair(vdev_guids, pair)) {
+ uint64_t vdev_guid;
+ if (nvpair_value_uint64(pair, &vdev_guid) != 0) {
+ return (SET_ERROR(EINVAL));
+ }
+ }
+
+ spa_t *spa;
+ int error = spa_open(poolname, &spa, FTAG);
+ if (error != 0)
+ return (error);
+
+ nvlist_t *vdev_errlist = fnvlist_alloc();
+ int total_errors = spa_vdev_initialize(spa, vdev_guids, cmd_type,
+ vdev_errlist);
+
+ if (fnvlist_size(vdev_errlist) > 0) {
+ fnvlist_add_nvlist(outnvl, ZPOOL_INITIALIZE_VDEVS,
+ vdev_errlist);
+ }
+ fnvlist_free(vdev_errlist);
+
+ spa_close(spa, FTAG);
+ return (total_errors > 0 ? EINVAL : 0);
+}
+
+/*
+ * innvl: {
+ * "trim_command" -> POOL_TRIM_{CANCEL|START|SUSPEND} (uint64)
+ * "trim_vdevs": { -> guids to TRIM (nvlist)
+ * "vdev_path_1": vdev_guid_1, (uint64),
+ * "vdev_path_2": vdev_guid_2, (uint64),
+ * ...
+ * },
+ * "trim_rate" -> Target TRIM rate in bytes/sec.
+ * "trim_secure" -> Set to request a secure TRIM.
+ * }
+ *
+ * outnvl: {
+ * "trim_vdevs": { -> TRIM errors (nvlist)
+ * "vdev_path_1": errno, see function body for possible errnos (uint64)
+ * "vdev_path_2": errno, ... (uint64)
+ * ...
+ * }
+ * }
+ *
+ * EINVAL is returned for an unknown command or if any of the provided vdev
+ * guids have been specified with a type other than uint64.
+ */
+static const zfs_ioc_key_t zfs_keys_pool_trim[] = {
+ {ZPOOL_TRIM_COMMAND, DATA_TYPE_UINT64, 0},
+ {ZPOOL_TRIM_VDEVS, DATA_TYPE_NVLIST, 0},
+ {ZPOOL_TRIM_RATE, DATA_TYPE_UINT64, ZK_OPTIONAL},
+ {ZPOOL_TRIM_SECURE, DATA_TYPE_BOOLEAN_VALUE, ZK_OPTIONAL},
+};
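+/*
+ * Illustrative example (a sketch; the vdev path, guid and rate are
+ * placeholders): starting a rate-limited secure TRIM on one vdev might use
+ *
+ *     "trim_command" -> POOL_TRIM_START (uint64)
+ *     "trim_vdevs"   -> { "/dev/sda1" -> 0x14d2e0f4ab6b2c7e }
+ *     "trim_rate"    -> 104857600 (bytes/sec; optional)
+ *     "trim_secure"  -> B_TRUE (optional)
+ *
+ * Per-vdev failures come back in outnvl under "trim_vdevs".
+ */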
+
+static int
+zfs_ioc_pool_trim(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl)
+{
+ uint64_t cmd_type;
+ if (nvlist_lookup_uint64(innvl, ZPOOL_TRIM_COMMAND, &cmd_type) != 0)
+ return (SET_ERROR(EINVAL));
+
+ if (!(cmd_type == POOL_TRIM_CANCEL ||
+ cmd_type == POOL_TRIM_START ||
+ cmd_type == POOL_TRIM_SUSPEND)) {
+ return (SET_ERROR(EINVAL));
+ }
+
+ nvlist_t *vdev_guids;
+ if (nvlist_lookup_nvlist(innvl, ZPOOL_TRIM_VDEVS, &vdev_guids) != 0)
+ return (SET_ERROR(EINVAL));
+
+ for (nvpair_t *pair = nvlist_next_nvpair(vdev_guids, NULL);
+ pair != NULL; pair = nvlist_next_nvpair(vdev_guids, pair)) {
+ uint64_t vdev_guid;
+ if (nvpair_value_uint64(pair, &vdev_guid) != 0) {
+ return (SET_ERROR(EINVAL));
+ }
+ }
+
+ /* Optional, defaults to maximum rate when not provided */
+ uint64_t rate;
+ if (nvlist_lookup_uint64(innvl, ZPOOL_TRIM_RATE, &rate) != 0)
+ rate = 0;
+
+ /* Optional, defaults to standard TRIM when not provided */
+ boolean_t secure;
+ if (nvlist_lookup_boolean_value(innvl, ZPOOL_TRIM_SECURE,
+ &secure) != 0) {
+ secure = B_FALSE;
+ }
+
+ spa_t *spa;
+ int error = spa_open(poolname, &spa, FTAG);
+ if (error != 0)
+ return (error);
+
+ nvlist_t *vdev_errlist = fnvlist_alloc();
+ int total_errors = spa_vdev_trim(spa, vdev_guids, cmd_type,
+ rate, !!zfs_trim_metaslab_skip, secure, vdev_errlist);
+
+ if (fnvlist_size(vdev_errlist) > 0)
+ fnvlist_add_nvlist(outnvl, ZPOOL_TRIM_VDEVS, vdev_errlist);
+
+ fnvlist_free(vdev_errlist);
+
+ spa_close(spa, FTAG);
+ return (total_errors > 0 ? EINVAL : 0);
+}
+
+/*
+ * This ioctl waits for activity of a particular type to complete. If there is
+ * no activity of that type in progress, it returns immediately, and the
+ * returned value "waited" is false. If there is activity in progress, and no
+ * tag is passed in, the ioctl blocks until all activity of that type is
+ * complete, and then returns with "waited" set to true.
+ *
+ * If a tag is provided, it identifies a particular instance of an activity to
+ * wait for. Currently, this is only valid for use with 'initialize', because
+ * that is the only activity for which there can be multiple instances running
+ * concurrently. In the case of 'initialize', the tag corresponds to the guid of
+ * the vdev on which to wait.
+ *
+ * If a thread waiting in the ioctl receives a signal, the call will return
+ * immediately, and the return value will be EINTR.
+ *
+ * innvl: {
+ * "wait_activity" -> int32_t
+ * (optional) "wait_tag" -> uint64_t
+ * }
+ *
+ * outnvl: "waited" -> boolean_t
+ */
+static const zfs_ioc_key_t zfs_keys_pool_wait[] = {
+ {ZPOOL_WAIT_ACTIVITY, DATA_TYPE_INT32, 0},
+ {ZPOOL_WAIT_TAG, DATA_TYPE_UINT64, ZK_OPTIONAL},
+};
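+/*
+ * Illustrative example (a sketch; the guid is a placeholder): waiting for an
+ * initialization pass on a particular vdev to finish might use an innvl such
+ * as
+ *
+ *     "wait_activity" -> ZPOOL_WAIT_INITIALIZE (int32)
+ *     "wait_tag"      -> 0x14d2e0f4ab6b2c7e (guid of the vdev)
+ *
+ * and outnvl then reports "waited" -> B_TRUE if the call actually blocked.
+ */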
+
+static int
+zfs_ioc_wait(const char *name, nvlist_t *innvl, nvlist_t *outnvl)
+{
+ int32_t activity;
+ uint64_t tag;
+ boolean_t waited;
+ int error;
+
+ if (nvlist_lookup_int32(innvl, ZPOOL_WAIT_ACTIVITY, &activity) != 0)
+ return (EINVAL);
+
+ if (nvlist_lookup_uint64(innvl, ZPOOL_WAIT_TAG, &tag) == 0)
+ error = spa_wait_tag(name, activity, tag, &waited);
+ else
+ error = spa_wait(name, activity, &waited);
+
+ if (error == 0)
+ fnvlist_add_boolean_value(outnvl, ZPOOL_WAIT_WAITED, waited);
+
+ return (error);
+}
+
+/*
+ * This ioctl waits for activity of a particular type to complete. If there is
+ * no activity of that type in progress, it returns immediately, and the
+ * returned value "waited" is false. If there is activity in progress, and no
+ * tag is passed in, the ioctl blocks until all activity of that type is
+ * complete, and then returns with "waited" set to true.
+ *
+ * If a thread waiting in the ioctl receives a signal, the call will return
+ * immediately, and the return value will be EINTR.
+ *
+ * innvl: {
+ * "wait_activity" -> int32_t
+ * }
+ *
+ * outnvl: "waited" -> boolean_t
+ */
+static const zfs_ioc_key_t zfs_keys_fs_wait[] = {
+ {ZFS_WAIT_ACTIVITY, DATA_TYPE_INT32, 0},
+};
+
+static int
+zfs_ioc_wait_fs(const char *name, nvlist_t *innvl, nvlist_t *outnvl)
+{
+ int32_t activity;
+ boolean_t waited = B_FALSE;
+ int error;
+ dsl_pool_t *dp;
+ dsl_dir_t *dd;
+ dsl_dataset_t *ds;
+
+ if (nvlist_lookup_int32(innvl, ZFS_WAIT_ACTIVITY, &activity) != 0)
+ return (SET_ERROR(EINVAL));
+
+ if (activity >= ZFS_WAIT_NUM_ACTIVITIES || activity < 0)
+ return (SET_ERROR(EINVAL));
+
+ if ((error = dsl_pool_hold(name, FTAG, &dp)) != 0)
+ return (error);
+
+ if ((error = dsl_dataset_hold(dp, name, FTAG, &ds)) != 0) {
+ dsl_pool_rele(dp, FTAG);
+ return (error);
+ }
+
+ dd = ds->ds_dir;
+ mutex_enter(&dd->dd_activity_lock);
+ dd->dd_activity_waiters++;
+
+ /*
+ * We get a long-hold here so that the dsl_dataset_t and dsl_dir_t
+ * aren't evicted while we're waiting. Normally this is prevented by
+ * holding the pool, but we can't do that while we're waiting since
+ * that would prevent TXGs from syncing out. Some of the functionality
+ * of long-holds (e.g. preventing deletion) is unnecessary for this
+ * case, since we would cancel the waiters before proceeding with a
+ * deletion. An alternative mechanism for keeping the dataset around
+ * could be developed but this is simpler.
+ */
+ dsl_dataset_long_hold(ds, FTAG);
+ dsl_pool_rele(dp, FTAG);
+
+ error = dsl_dir_wait(dd, ds, activity, &waited);
+
+ dsl_dataset_long_rele(ds, FTAG);
+ dd->dd_activity_waiters--;
+ if (dd->dd_activity_waiters == 0)
+ cv_signal(&dd->dd_activity_cv);
+ mutex_exit(&dd->dd_activity_lock);
+
+ dsl_dataset_rele(ds, FTAG);
+
+ if (error == 0)
+ fnvlist_add_boolean_value(outnvl, ZFS_WAIT_WAITED, waited);
+
+ return (error);
+}
+
+/*
+ * fsname is the name of the dataset to roll back (to its most recent
+ * snapshot).
+ *
+ * innvl may contain the name of the expected target snapshot.
+ *
+ * outnvl: "target" -> name of most recent snapshot
+ */
+static const zfs_ioc_key_t zfs_keys_rollback[] = {
+ {"target", DATA_TYPE_STRING, ZK_OPTIONAL},
+};
+
+/* ARGSUSED */
+static int
+zfs_ioc_rollback(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl)
+{
+ zfsvfs_t *zfsvfs;
+ zvol_state_handle_t *zv;
+ char *target = NULL;
+ int error;
+
+ (void) nvlist_lookup_string(innvl, "target", &target);
+ if (target != NULL) {
+ const char *cp = strchr(target, '@');
+
+ /*
+ * The snap name must contain an @, and the part after it must
+ * contain only valid characters.
+ */
+ if (cp == NULL ||
+ zfs_component_namecheck(cp + 1, NULL, NULL) != 0)
+ return (SET_ERROR(EINVAL));
+ }
+
+ if (getzfsvfs(fsname, &zfsvfs) == 0) {
+ dsl_dataset_t *ds;
+
+ ds = dmu_objset_ds(zfsvfs->z_os);
+ error = zfs_suspend_fs(zfsvfs);
+ if (error == 0) {
+ int resume_err;
+
+ error = dsl_dataset_rollback(fsname, target, zfsvfs,
+ outnvl);
+ resume_err = zfs_resume_fs(zfsvfs, ds);
+ error = error ? error : resume_err;
+ }
+ zfs_vfs_rele(zfsvfs);
+ } else if ((zv = zvol_suspend(fsname)) != NULL) {
+ error = dsl_dataset_rollback(fsname, target, zvol_tag(zv),
+ outnvl);
+ zvol_resume(zv);
+ } else {
+ error = dsl_dataset_rollback(fsname, target, NULL, outnvl);
+ }
+ return (error);
+}
+
+static int
+recursive_unmount(const char *fsname, void *arg)
+{
+ const char *snapname = arg;
+ char *fullname;
+
+ fullname = kmem_asprintf("%s@%s", fsname, snapname);
+ zfs_unmount_snap(fullname);
+ kmem_strfree(fullname);
+
+ return (0);
+}
+
+/*
+ *
+ * snapname is the snapshot to redact.
+ * innvl: {
+ * "bookname" -> (string)
+ * shortname of the redaction bookmark to generate
+ * "snapnv" -> (nvlist, values ignored)
+ * snapshots to redact snapname with respect to
+ * }
+ *
+ * outnvl is unused
+ */
+
+/* ARGSUSED */
+static const zfs_ioc_key_t zfs_keys_redact[] = {
+ {"bookname", DATA_TYPE_STRING, 0},
+ {"snapnv", DATA_TYPE_NVLIST, 0},
+};
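+/*
+ * Illustrative example (a sketch; names are placeholders): redacting
+ * "pool/fs@snap" with respect to two clone snapshots might use an innvl such
+ * as
+ *
+ *     "bookname" -> "book1"
+ *     "snapnv"   -> { "pool/clone1@snap" -> (ignored),
+ *                     "pool/clone2@snap" -> (ignored) }
+ */
+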
+static int
+zfs_ioc_redact(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl)
+{
+ nvlist_t *redactnvl = NULL;
+ char *redactbook = NULL;
+
+ if (nvlist_lookup_nvlist(innvl, "snapnv", &redactnvl) != 0)
+ return (SET_ERROR(EINVAL));
+ if (fnvlist_num_pairs(redactnvl) == 0)
+ return (SET_ERROR(ENXIO));
+ if (nvlist_lookup_string(innvl, "bookname", &redactbook) != 0)
+ return (SET_ERROR(EINVAL));
+
+ return (dmu_redact_snap(snapname, redactnvl, redactbook));
+}
+
+/*
+ * inputs:
+ * zc_name old name of dataset
+ * zc_value new name of dataset
+ * zc_cookie recursive flag (only valid for snapshots)
+ *
+ * outputs: none
+ */
+static int
+zfs_ioc_rename(zfs_cmd_t *zc)
+{
+ objset_t *os;
+ dmu_objset_type_t ost;
+ boolean_t recursive = zc->zc_cookie & 1;
+ boolean_t nounmount = !!(zc->zc_cookie & 2);
+ char *at;
+ int err;
+
+ /* "zfs rename" from and to ...%recv datasets should both fail */
+ zc->zc_name[sizeof (zc->zc_name) - 1] = '\0';
+ zc->zc_value[sizeof (zc->zc_value) - 1] = '\0';
+ if (dataset_namecheck(zc->zc_name, NULL, NULL) != 0 ||
+ dataset_namecheck(zc->zc_value, NULL, NULL) != 0 ||
+ strchr(zc->zc_name, '%') || strchr(zc->zc_value, '%'))
+ return (SET_ERROR(EINVAL));
+
+ err = dmu_objset_hold(zc->zc_name, FTAG, &os);
+ if (err != 0)
+ return (err);
+ ost = dmu_objset_type(os);
+ dmu_objset_rele(os, FTAG);
+
+ at = strchr(zc->zc_name, '@');
+ if (at != NULL) {
+ /* snaps must be in same fs */
+ int error;
+
+ if (strncmp(zc->zc_name, zc->zc_value, at - zc->zc_name + 1))
+ return (SET_ERROR(EXDEV));
+ *at = '\0';
+ if (ost == DMU_OST_ZFS && !nounmount) {
+ error = dmu_objset_find(zc->zc_name,
+ recursive_unmount, at + 1,
+ recursive ? DS_FIND_CHILDREN : 0);
+ if (error != 0) {
+ *at = '@';
+ return (error);
+ }
+ }
+ error = dsl_dataset_rename_snapshot(zc->zc_name,
+ at + 1, strchr(zc->zc_value, '@') + 1, recursive);
+ *at = '@';
+
+ return (error);
+ } else {
+ return (dsl_dir_rename(zc->zc_name, zc->zc_value));
+ }
+}
+
+static int
+zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr)
+{
+ const char *propname = nvpair_name(pair);
+ boolean_t issnap = (strchr(dsname, '@') != NULL);
+ zfs_prop_t prop = zfs_name_to_prop(propname);
+ uint64_t intval, compval;
+ int err;
+
+ if (prop == ZPROP_INVAL) {
+ if (zfs_prop_user(propname)) {
+ if ((err = zfs_secpolicy_write_perms(dsname,
+ ZFS_DELEG_PERM_USERPROP, cr)))
+ return (err);
+ return (0);
+ }
+
+ if (!issnap && zfs_prop_userquota(propname)) {
+ const char *perm = NULL;
+ const char *uq_prefix =
+ zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA];
+ const char *gq_prefix =
+ zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA];
+ const char *uiq_prefix =
+ zfs_userquota_prop_prefixes[ZFS_PROP_USEROBJQUOTA];
+ const char *giq_prefix =
+ zfs_userquota_prop_prefixes[ZFS_PROP_GROUPOBJQUOTA];
+ const char *pq_prefix =
+ zfs_userquota_prop_prefixes[ZFS_PROP_PROJECTQUOTA];
+ const char *piq_prefix =
+ zfs_userquota_prop_prefixes[ZFS_PROP_PROJECTOBJQUOTA];
+
+ if (strncmp(propname, uq_prefix,
+ strlen(uq_prefix)) == 0) {
+ perm = ZFS_DELEG_PERM_USERQUOTA;
+ } else if (strncmp(propname, uiq_prefix,
+ strlen(uiq_prefix)) == 0) {
+ perm = ZFS_DELEG_PERM_USEROBJQUOTA;
+ } else if (strncmp(propname, gq_prefix,
+ strlen(gq_prefix)) == 0) {
+ perm = ZFS_DELEG_PERM_GROUPQUOTA;
+ } else if (strncmp(propname, giq_prefix,
+ strlen(giq_prefix)) == 0) {
+ perm = ZFS_DELEG_PERM_GROUPOBJQUOTA;
+ } else if (strncmp(propname, pq_prefix,
+ strlen(pq_prefix)) == 0) {
+ perm = ZFS_DELEG_PERM_PROJECTQUOTA;
+ } else if (strncmp(propname, piq_prefix,
+ strlen(piq_prefix)) == 0) {
+ perm = ZFS_DELEG_PERM_PROJECTOBJQUOTA;
+ } else {
+ /* {USER|GROUP|PROJECT}USED are read-only */
+ return (SET_ERROR(EINVAL));
+ }
+
+ if ((err = zfs_secpolicy_write_perms(dsname, perm, cr)))
+ return (err);
+ return (0);
+ }
+
+ return (SET_ERROR(EINVAL));
+ }
+
+ if (issnap)
+ return (SET_ERROR(EINVAL));
+
+ if (nvpair_type(pair) == DATA_TYPE_NVLIST) {
+ /*
+ * dsl_prop_get_all_impl() returns properties in this
+ * format.
+ */
+ nvlist_t *attrs;
+ VERIFY(nvpair_value_nvlist(pair, &attrs) == 0);
+ VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE,
+ &pair) == 0);
+ }
+
+ /*
+ * Check that this value is valid for this pool version
+ */
+ switch (prop) {
+ case ZFS_PROP_COMPRESSION:
+ /*
+ * If the user specified gzip compression, make sure
+ * the SPA supports it. We ignore any errors here since
+ * we'll catch them later.
+ */
+ if (nvpair_value_uint64(pair, &intval) == 0) {
+ compval = ZIO_COMPRESS_ALGO(intval);
+ if (compval >= ZIO_COMPRESS_GZIP_1 &&
+ compval <= ZIO_COMPRESS_GZIP_9 &&
+ zfs_earlier_version(dsname,
+ SPA_VERSION_GZIP_COMPRESSION)) {
+ return (SET_ERROR(ENOTSUP));
+ }
+
+ if (compval == ZIO_COMPRESS_ZLE &&
+ zfs_earlier_version(dsname,
+ SPA_VERSION_ZLE_COMPRESSION))
+ return (SET_ERROR(ENOTSUP));
+
+ if (compval == ZIO_COMPRESS_LZ4) {
+ spa_t *spa;
+
+ if ((err = spa_open(dsname, &spa, FTAG)) != 0)
+ return (err);
+
+ if (!spa_feature_is_enabled(spa,
+ SPA_FEATURE_LZ4_COMPRESS)) {
+ spa_close(spa, FTAG);
+ return (SET_ERROR(ENOTSUP));
+ }
+ spa_close(spa, FTAG);
+ }
+
+ if (compval == ZIO_COMPRESS_ZSTD) {
+ spa_t *spa;
+
+ if ((err = spa_open(dsname, &spa, FTAG)) != 0)
+ return (err);
+
+ if (!spa_feature_is_enabled(spa,
+ SPA_FEATURE_ZSTD_COMPRESS)) {
+ spa_close(spa, FTAG);
+ return (SET_ERROR(ENOTSUP));
+ }
+ spa_close(spa, FTAG);
+ }
+ }
+ break;
+
+ case ZFS_PROP_COPIES:
+ if (zfs_earlier_version(dsname, SPA_VERSION_DITTO_BLOCKS))
+ return (SET_ERROR(ENOTSUP));
+ break;
+
+ case ZFS_PROP_VOLBLOCKSIZE:
+ case ZFS_PROP_RECORDSIZE:
+ /* Record sizes above 128k need the feature to be enabled */
+ if (nvpair_value_uint64(pair, &intval) == 0 &&
+ intval > SPA_OLD_MAXBLOCKSIZE) {
+ spa_t *spa;
+
+ /*
+ * We don't allow setting the property above 1MB,
+ * unless the tunable has been changed.
+ */
+ if (intval > zfs_max_recordsize ||
+ intval > SPA_MAXBLOCKSIZE)
+ return (SET_ERROR(ERANGE));
+
+ if ((err = spa_open(dsname, &spa, FTAG)) != 0)
+ return (err);
+
+ if (!spa_feature_is_enabled(spa,
+ SPA_FEATURE_LARGE_BLOCKS)) {
+ spa_close(spa, FTAG);
+ return (SET_ERROR(ENOTSUP));
+ }
+ spa_close(spa, FTAG);
+ }
+ break;
+
+ case ZFS_PROP_DNODESIZE:
+ /* Dnode sizes above 512 need the feature to be enabled */
+ if (nvpair_value_uint64(pair, &intval) == 0 &&
+ intval != ZFS_DNSIZE_LEGACY) {
+ spa_t *spa;
+
+ if ((err = spa_open(dsname, &spa, FTAG)) != 0)
+ return (err);
+
+ if (!spa_feature_is_enabled(spa,
+ SPA_FEATURE_LARGE_DNODE)) {
+ spa_close(spa, FTAG);
+ return (SET_ERROR(ENOTSUP));
+ }
+ spa_close(spa, FTAG);
+ }
+ break;
+
+ case ZFS_PROP_SPECIAL_SMALL_BLOCKS:
+ /*
+ * Setting this property could require the allocation
+ * classes feature to be active; however, we allow it so
+ * that tests of settable properties succeed.
+ * The CLI will issue a warning in this case.
+ */
+ break;
+
+ case ZFS_PROP_SHARESMB:
+ if (zpl_earlier_version(dsname, ZPL_VERSION_FUID))
+ return (SET_ERROR(ENOTSUP));
+ break;
+
+ case ZFS_PROP_ACLINHERIT:
+ if (nvpair_type(pair) == DATA_TYPE_UINT64 &&
+ nvpair_value_uint64(pair, &intval) == 0) {
+ if (intval == ZFS_ACL_PASSTHROUGH_X &&
+ zfs_earlier_version(dsname,
+ SPA_VERSION_PASSTHROUGH_X))
+ return (SET_ERROR(ENOTSUP));
+ }
+ break;
+ case ZFS_PROP_CHECKSUM:
+ case ZFS_PROP_DEDUP:
+ {
+ spa_feature_t feature;
+ spa_t *spa;
+ int err;
+
+ /* dedup feature version checks */
+ if (prop == ZFS_PROP_DEDUP &&
+ zfs_earlier_version(dsname, SPA_VERSION_DEDUP))
+ return (SET_ERROR(ENOTSUP));
+
+ if (nvpair_type(pair) == DATA_TYPE_UINT64 &&
+ nvpair_value_uint64(pair, &intval) == 0) {
+ /* check prop value is enabled in features */
+ feature = zio_checksum_to_feature(
+ intval & ZIO_CHECKSUM_MASK);
+ if (feature == SPA_FEATURE_NONE)
+ break;
+
+ if ((err = spa_open(dsname, &spa, FTAG)) != 0)
+ return (err);
+
+ if (!spa_feature_is_enabled(spa, feature)) {
+ spa_close(spa, FTAG);
+ return (SET_ERROR(ENOTSUP));
+ }
+ spa_close(spa, FTAG);
+ }
+ break;
+ }
+
+ default:
+ break;
+ }
+
+ return (zfs_secpolicy_setprop(dsname, prop, pair, CRED()));
+}
+
+/*
+ * Removes properties from the given props list that fail permission checks
+ * needed to clear them and to restore them in case of a receive error. For each
+ * property, make sure we have both set and inherit permissions.
+ *
+ * Returns the first error encountered if any permission checks fail. If the
+ * caller provides a non-NULL errlist, it also gives the complete list of names
+ * of all the properties that failed a permission check along with the
+ * corresponding error numbers. The caller is responsible for freeing the
+ * returned errlist.
+ *
+ * If every property checks out successfully, zero is returned and the list
+ * pointed at by errlist is NULL.
+ */
+static int
+zfs_check_clearable(const char *dataset, nvlist_t *props, nvlist_t **errlist)
+{
+ zfs_cmd_t *zc;
+ nvpair_t *pair, *next_pair;
+ nvlist_t *errors;
+ int err, rv = 0;
+
+ if (props == NULL)
+ return (0);
+
+ VERIFY(nvlist_alloc(&errors, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+
+ zc = kmem_alloc(sizeof (zfs_cmd_t), KM_SLEEP);
+ (void) strlcpy(zc->zc_name, dataset, sizeof (zc->zc_name));
+ pair = nvlist_next_nvpair(props, NULL);
+ while (pair != NULL) {
+ next_pair = nvlist_next_nvpair(props, pair);
+
+ (void) strlcpy(zc->zc_value, nvpair_name(pair),
+ sizeof (zc->zc_value));
+ if ((err = zfs_check_settable(dataset, pair, CRED())) != 0 ||
+ (err = zfs_secpolicy_inherit_prop(zc, NULL, CRED())) != 0) {
+ VERIFY(nvlist_remove_nvpair(props, pair) == 0);
+ VERIFY(nvlist_add_int32(errors,
+ zc->zc_value, err) == 0);
+ }
+ pair = next_pair;
+ }
+ kmem_free(zc, sizeof (zfs_cmd_t));
+
+ if ((pair = nvlist_next_nvpair(errors, NULL)) == NULL) {
+ nvlist_free(errors);
+ errors = NULL;
+ } else {
+ VERIFY(nvpair_value_int32(pair, &rv) == 0);
+ }
+
+ if (errlist == NULL)
+ nvlist_free(errors);
+ else
+ *errlist = errors;
+
+ return (rv);
+}
+
+static boolean_t
+propval_equals(nvpair_t *p1, nvpair_t *p2)
+{
+ if (nvpair_type(p1) == DATA_TYPE_NVLIST) {
+ /* dsl_prop_get_all_impl() format */
+ nvlist_t *attrs;
+ VERIFY(nvpair_value_nvlist(p1, &attrs) == 0);
+ VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE,
+ &p1) == 0);
+ }
+
+ if (nvpair_type(p2) == DATA_TYPE_NVLIST) {
+ nvlist_t *attrs;
+ VERIFY(nvpair_value_nvlist(p2, &attrs) == 0);
+ VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE,
+ &p2) == 0);
+ }
+
+ if (nvpair_type(p1) != nvpair_type(p2))
+ return (B_FALSE);
+
+ if (nvpair_type(p1) == DATA_TYPE_STRING) {
+ char *valstr1, *valstr2;
+
+ VERIFY(nvpair_value_string(p1, &valstr1) == 0);
+ VERIFY(nvpair_value_string(p2, &valstr2) == 0);
+ return (strcmp(valstr1, valstr2) == 0);
+ } else {
+ uint64_t intval1, intval2;
+
+ VERIFY(nvpair_value_uint64(p1, &intval1) == 0);
+ VERIFY(nvpair_value_uint64(p2, &intval2) == 0);
+ return (intval1 == intval2);
+ }
+}
+
+/*
+ * Remove properties from props if they are not going to change (as determined
+ * by comparison with origprops). Remove them from origprops as well, since we
+ * do not need to clear or restore properties that won't change.
+ */
+static void
+props_reduce(nvlist_t *props, nvlist_t *origprops)
+{
+ nvpair_t *pair, *next_pair;
+
+ if (origprops == NULL)
+ return; /* all props need to be received */
+
+ pair = nvlist_next_nvpair(props, NULL);
+ while (pair != NULL) {
+ const char *propname = nvpair_name(pair);
+ nvpair_t *match;
+
+ next_pair = nvlist_next_nvpair(props, pair);
+
+ if ((nvlist_lookup_nvpair(origprops, propname,
+ &match) != 0) || !propval_equals(pair, match))
+ goto next; /* need to set received value */
+
+ /* don't clear the existing received value */
+ (void) nvlist_remove_nvpair(origprops, match);
+ /* don't bother receiving the property */
+ (void) nvlist_remove_nvpair(props, pair);
+next:
+ pair = next_pair;
+ }
+}
+
+/*
+ * Extract properties that cannot be set PRIOR to the receipt of a dataset.
+ * For example, refquota cannot be set until after the receipt of a dataset,
+ * because in replication streams, an older/earlier snapshot may exceed the
+ * refquota. We want to receive the older/earlier snapshot, but setting
+ * refquota pre-receipt will set the dsl's ACTUAL quota, which will prevent
+ * the older/earlier snapshot from being received (with EDQUOT).
+ *
+ * The ZFS test "zfs_receive_011_pos" demonstrates such a scenario.
+ *
+ * libzfs will need to be judicious in handling errors encountered for
+ * properties extracted by this function.
+ */
+static nvlist_t *
+extract_delay_props(nvlist_t *props)
+{
+ nvlist_t *delayprops;
+ nvpair_t *nvp, *tmp;
+ static const zfs_prop_t delayable[] = {
+ ZFS_PROP_REFQUOTA,
+ ZFS_PROP_KEYLOCATION,
+ 0
+ };
+ int i;
+
+ VERIFY(nvlist_alloc(&delayprops, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+
+ for (nvp = nvlist_next_nvpair(props, NULL); nvp != NULL;
+ nvp = nvlist_next_nvpair(props, nvp)) {
+ /*
+ * strcmp() is safe because zfs_prop_to_name() always returns
+ * a bounded string.
+ */
+ for (i = 0; delayable[i] != 0; i++) {
+ if (strcmp(zfs_prop_to_name(delayable[i]),
+ nvpair_name(nvp)) == 0) {
+ break;
+ }
+ }
+ if (delayable[i] != 0) {
+ tmp = nvlist_prev_nvpair(props, nvp);
+ VERIFY(nvlist_add_nvpair(delayprops, nvp) == 0);
+ VERIFY(nvlist_remove_nvpair(props, nvp) == 0);
+ nvp = tmp;
+ }
+ }
+
+ if (nvlist_empty(delayprops)) {
+ nvlist_free(delayprops);
+ delayprops = NULL;
+ }
+ return (delayprops);
+}
+
+static void
+zfs_allow_log_destroy(void *arg)
+{
+ char *poolname = arg;
+
+ if (poolname != NULL)
+ kmem_strfree(poolname);
+}
+
+#ifdef ZFS_DEBUG
+static boolean_t zfs_ioc_recv_inject_err;
+#endif
+
+/*
+ * nvlist 'errors' is always allocated. It will contain descriptions of
+ * encountered errors, if any. It is the caller's responsibility to free.
+ */
+static int
+zfs_ioc_recv_impl(char *tofs, char *tosnap, char *origin, nvlist_t *recvprops,
+ nvlist_t *localprops, nvlist_t *hidden_args, boolean_t force,
+ boolean_t resumable, int input_fd,
+ dmu_replay_record_t *begin_record, uint64_t *read_bytes,
+ uint64_t *errflags, nvlist_t **errors)
+{
+ dmu_recv_cookie_t drc;
+ int error = 0;
+ int props_error = 0;
+ offset_t off, noff;
+ nvlist_t *local_delayprops = NULL;
+ nvlist_t *recv_delayprops = NULL;
+ nvlist_t *origprops = NULL; /* existing properties */
+ nvlist_t *origrecvd = NULL; /* existing received properties */
+ boolean_t first_recvd_props = B_FALSE;
+ boolean_t tofs_was_redacted;
+ zfs_file_t *input_fp;
+
+ *read_bytes = 0;
+ *errflags = 0;
+ *errors = fnvlist_alloc();
+ off = 0;
+
+ if ((error = zfs_file_get(input_fd, &input_fp)))
+ return (error);
+
+ noff = off = zfs_file_off(input_fp);
+ error = dmu_recv_begin(tofs, tosnap, begin_record, force,
+ resumable, localprops, hidden_args, origin, &drc, input_fp,
+ &off);
+ if (error != 0)
+ goto out;
+ tofs_was_redacted = dsl_get_redacted(drc.drc_ds);
+
+ /*
+ * Set properties before we receive the stream so that they are applied
+ * to the new data. Note that we must call dmu_recv_stream() if
+ * dmu_recv_begin() succeeds.
+ */
+ if (recvprops != NULL && !drc.drc_newfs) {
+ if (spa_version(dsl_dataset_get_spa(drc.drc_ds)) >=
+ SPA_VERSION_RECVD_PROPS &&
+ !dsl_prop_get_hasrecvd(tofs))
+ first_recvd_props = B_TRUE;
+
+ /*
+ * If new received properties are supplied, they are to
+ * completely replace the existing received properties,
+ * so stash away the existing ones.
+ */
+ if (dsl_prop_get_received(tofs, &origrecvd) == 0) {
+ nvlist_t *errlist = NULL;
+ /*
+ * Don't bother writing a property if its value won't
+ * change (and avoid the unnecessary security checks).
+ *
+ * The first receive after SPA_VERSION_RECVD_PROPS is a
+ * special case where we blow away all local properties
+ * regardless.
+ */
+ if (!first_recvd_props)
+ props_reduce(recvprops, origrecvd);
+ if (zfs_check_clearable(tofs, origrecvd, &errlist) != 0)
+ (void) nvlist_merge(*errors, errlist, 0);
+ nvlist_free(errlist);
+
+ if (clear_received_props(tofs, origrecvd,
+ first_recvd_props ? NULL : recvprops) != 0)
+ *errflags |= ZPROP_ERR_NOCLEAR;
+ } else {
+ *errflags |= ZPROP_ERR_NOCLEAR;
+ }
+ }
+
+ /*
+ * Stash away existing properties so we can restore them on error unless
+ * we're doing the first receive after SPA_VERSION_RECVD_PROPS, in which
+ * case "origrecvd" will take care of that.
+ */
+ if (localprops != NULL && !drc.drc_newfs && !first_recvd_props) {
+ objset_t *os;
+ if (dmu_objset_hold(tofs, FTAG, &os) == 0) {
+ if (dsl_prop_get_all(os, &origprops) != 0) {
+ *errflags |= ZPROP_ERR_NOCLEAR;
+ }
+ dmu_objset_rele(os, FTAG);
+ } else {
+ *errflags |= ZPROP_ERR_NOCLEAR;
+ }
+ }
+
+ if (recvprops != NULL) {
+ props_error = dsl_prop_set_hasrecvd(tofs);
+
+ if (props_error == 0) {
+ recv_delayprops = extract_delay_props(recvprops);
+ (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_RECEIVED,
+ recvprops, *errors);
+ }
+ }
+
+ if (localprops != NULL) {
+ nvlist_t *oprops = fnvlist_alloc();
+ nvlist_t *xprops = fnvlist_alloc();
+ nvpair_t *nvp = NULL;
+
+ while ((nvp = nvlist_next_nvpair(localprops, nvp)) != NULL) {
+ if (nvpair_type(nvp) == DATA_TYPE_BOOLEAN) {
+ /* -x property */
+ const char *name = nvpair_name(nvp);
+ zfs_prop_t prop = zfs_name_to_prop(name);
+ if (prop != ZPROP_INVAL) {
+ if (!zfs_prop_inheritable(prop))
+ continue;
+ } else if (!zfs_prop_user(name))
+ continue;
+ fnvlist_add_boolean(xprops, name);
+ } else {
+ /* -o property=value */
+ fnvlist_add_nvpair(oprops, nvp);
+ }
+ }
+
+ local_delayprops = extract_delay_props(oprops);
+ (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_LOCAL,
+ oprops, *errors);
+ (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_INHERITED,
+ xprops, *errors);
+
+ nvlist_free(oprops);
+ nvlist_free(xprops);
+ }
+
+ error = dmu_recv_stream(&drc, &off);
+
+ if (error == 0) {
+ zfsvfs_t *zfsvfs = NULL;
+ zvol_state_handle_t *zv = NULL;
+
+ if (getzfsvfs(tofs, &zfsvfs) == 0) {
+ /* online recv */
+ dsl_dataset_t *ds;
+ int end_err;
+ boolean_t stream_is_redacted = DMU_GET_FEATUREFLAGS(
+ begin_record->drr_u.drr_begin.
+ drr_versioninfo) & DMU_BACKUP_FEATURE_REDACTED;
+
+ ds = dmu_objset_ds(zfsvfs->z_os);
+ error = zfs_suspend_fs(zfsvfs);
+ /*
+ * If the suspend fails, then the recv_end will
+ * likely also fail, and clean up after itself.
+ */
+ end_err = dmu_recv_end(&drc, zfsvfs);
+ /*
+ * If the dataset was not redacted, but we received a
+ * redacted stream onto it, we need to unmount the
+ * dataset. Otherwise, resume the filesystem.
+ */
+ if (error == 0 && !drc.drc_newfs &&
+ stream_is_redacted && !tofs_was_redacted) {
+ error = zfs_end_fs(zfsvfs, ds);
+ } else if (error == 0) {
+ error = zfs_resume_fs(zfsvfs, ds);
+ }
+ error = error ? error : end_err;
+ zfs_vfs_rele(zfsvfs);
+ } else if ((zv = zvol_suspend(tofs)) != NULL) {
+ error = dmu_recv_end(&drc, zvol_tag(zv));
+ zvol_resume(zv);
+ } else {
+ error = dmu_recv_end(&drc, NULL);
+ }
+
+ /* Set delayed properties now, after we're done receiving. */
+ if (recv_delayprops != NULL && error == 0) {
+ (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_RECEIVED,
+ recv_delayprops, *errors);
+ }
+ if (local_delayprops != NULL && error == 0) {
+ (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_LOCAL,
+ local_delayprops, *errors);
+ }
+ }
+
+ /*
+ * Merge delayed props back in with initial props, in case this is
+ * a DEBUG kernel and zfs_ioc_recv_inject_err is set (which means
+ * we have to make sure clear_received_props() includes
+ * the delayed properties).
+ *
+ * Since zfs_ioc_recv_inject_err only exists in DEBUG kernels,
+ * using ASSERT() here behaves just like a VERIFY.
+ */
+ if (recv_delayprops != NULL) {
+ ASSERT(nvlist_merge(recvprops, recv_delayprops, 0) == 0);
+ nvlist_free(recv_delayprops);
+ }
+ if (local_delayprops != NULL) {
+ ASSERT(nvlist_merge(localprops, local_delayprops, 0) == 0);
+ nvlist_free(local_delayprops);
+ }
+ *read_bytes = off - noff;
+
+#ifdef ZFS_DEBUG
+ if (zfs_ioc_recv_inject_err) {
+ zfs_ioc_recv_inject_err = B_FALSE;
+ error = 1;
+ }
+#endif
+
+ /*
+ * On error, restore the original props.
+ */
+ if (error != 0 && recvprops != NULL && !drc.drc_newfs) {
+ if (clear_received_props(tofs, recvprops, NULL) != 0) {
+ /*
+ * We failed to clear the received properties.
+ * Since we may have left a $recvd value on the
+ * system, we can't clear the $hasrecvd flag.
+ */
+ *errflags |= ZPROP_ERR_NORESTORE;
+ } else if (first_recvd_props) {
+ dsl_prop_unset_hasrecvd(tofs);
+ }
+
+ if (origrecvd == NULL && !drc.drc_newfs) {
+ /* We failed to stash the original properties. */
+ *errflags |= ZPROP_ERR_NORESTORE;
+ }
+
+ /*
+ * dsl_props_set() will not convert RECEIVED to LOCAL on or
+ * after SPA_VERSION_RECVD_PROPS, so we need to specify LOCAL
+ * explicitly if we're restoring local properties cleared in the
+ * first new-style receive.
+ */
+ if (origrecvd != NULL &&
+ zfs_set_prop_nvlist(tofs, (first_recvd_props ?
+ ZPROP_SRC_LOCAL : ZPROP_SRC_RECEIVED),
+ origrecvd, NULL) != 0) {
+ /*
+ * We stashed the original properties but failed to
+ * restore them.
+ */
+ *errflags |= ZPROP_ERR_NORESTORE;
+ }
+ }
+ if (error != 0 && localprops != NULL && !drc.drc_newfs &&
+ !first_recvd_props) {
+ nvlist_t *setprops;
+ nvlist_t *inheritprops;
+ nvpair_t *nvp;
+
+ if (origprops == NULL) {
+ /* We failed to stash the original properties. */
+ *errflags |= ZPROP_ERR_NORESTORE;
+ goto out;
+ }
+
+ /* Restore original props */
+ setprops = fnvlist_alloc();
+ inheritprops = fnvlist_alloc();
+ nvp = NULL;
+ while ((nvp = nvlist_next_nvpair(localprops, nvp)) != NULL) {
+ const char *name = nvpair_name(nvp);
+ const char *source;
+ nvlist_t *attrs;
+
+ if (!nvlist_exists(origprops, name)) {
+ /*
+ * Property was not present or was explicitly
+ * inherited before the receive, restore this.
+ */
+ fnvlist_add_boolean(inheritprops, name);
+ continue;
+ }
+ attrs = fnvlist_lookup_nvlist(origprops, name);
+ source = fnvlist_lookup_string(attrs, ZPROP_SOURCE);
+
+ /* Skip received properties */
+ if (strcmp(source, ZPROP_SOURCE_VAL_RECVD) == 0)
+ continue;
+
+ if (strcmp(source, tofs) == 0) {
+ /* Property was locally set */
+ fnvlist_add_nvlist(setprops, name, attrs);
+ } else {
+ /* Property was implicitly inherited */
+ fnvlist_add_boolean(inheritprops, name);
+ }
+ }
+
+ if (zfs_set_prop_nvlist(tofs, ZPROP_SRC_LOCAL, setprops,
+ NULL) != 0)
+ *errflags |= ZPROP_ERR_NORESTORE;
+ if (zfs_set_prop_nvlist(tofs, ZPROP_SRC_INHERITED, inheritprops,
+ NULL) != 0)
+ *errflags |= ZPROP_ERR_NORESTORE;
+
+ nvlist_free(setprops);
+ nvlist_free(inheritprops);
+ }
+out:
+ zfs_file_put(input_fd);
+ nvlist_free(origrecvd);
+ nvlist_free(origprops);
+
+ if (error == 0)
+ error = props_error;
+
+ return (error);
+}
+
+/*
+ * inputs:
+ * zc_name name of containing filesystem (unused)
+ * zc_nvlist_src{_size} nvlist of properties to apply
+ * zc_nvlist_conf{_size} nvlist of properties to exclude
+ * (DATA_TYPE_BOOLEAN) and override (everything else)
+ * zc_value name of snapshot to create
+ * zc_string name of clone origin (if DRR_FLAG_CLONE)
+ * zc_cookie file descriptor to recv from
+ * zc_begin_record the BEGIN record of the stream (not byteswapped)
+ * zc_guid force flag
+ *
+ * outputs:
+ * zc_cookie number of bytes read
+ * zc_obj zprop_errflags_t
+ * zc_nvlist_dst{_size} error for each unapplied received property
+ */
+static int
+zfs_ioc_recv(zfs_cmd_t *zc)
+{
+ dmu_replay_record_t begin_record;
+ nvlist_t *errors = NULL;
+ nvlist_t *recvdprops = NULL;
+ nvlist_t *localprops = NULL;
+ char *origin = NULL;
+ char *tosnap;
+ char tofs[ZFS_MAX_DATASET_NAME_LEN];
+ int error = 0;
+
+ if (dataset_namecheck(zc->zc_value, NULL, NULL) != 0 ||
+ strchr(zc->zc_value, '@') == NULL ||
+ strchr(zc->zc_value, '%'))
+ return (SET_ERROR(EINVAL));
+
+ (void) strlcpy(tofs, zc->zc_value, sizeof (tofs));
+ tosnap = strchr(tofs, '@');
+ *tosnap++ = '\0';
+
+ if (zc->zc_nvlist_src != 0 &&
+ (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
+ zc->zc_iflags, &recvdprops)) != 0)
+ return (error);
+
+ if (zc->zc_nvlist_conf != 0 &&
+ (error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size,
+ zc->zc_iflags, &localprops)) != 0)
+ return (error);
+
+ if (zc->zc_string[0])
+ origin = zc->zc_string;
+
+ begin_record.drr_type = DRR_BEGIN;
+ begin_record.drr_payloadlen = 0;
+ begin_record.drr_u.drr_begin = zc->zc_begin_record;
+
+ error = zfs_ioc_recv_impl(tofs, tosnap, origin, recvdprops, localprops,
+ NULL, zc->zc_guid, B_FALSE, zc->zc_cookie, &begin_record,
+ &zc->zc_cookie, &zc->zc_obj, &errors);
+ nvlist_free(recvdprops);
+ nvlist_free(localprops);
+
+ /*
+ * Now that all props, initial and delayed, are set, report the prop
+ * errors to the caller.
+ */
+ if (zc->zc_nvlist_dst_size != 0 && errors != NULL &&
+ (nvlist_smush(errors, zc->zc_nvlist_dst_size) != 0 ||
+ put_nvlist(zc, errors) != 0)) {
+ /*
+ * Caller made zc->zc_nvlist_dst less than the minimum expected
+ * size or supplied an invalid address.
+ */
+ error = SET_ERROR(EINVAL);
+ }
+
+ nvlist_free(errors);
+
+ return (error);
+}
+
+/*
+ * innvl: {
+ * "snapname" -> full name of the snapshot to create
+ * (optional) "props" -> received properties to set (nvlist)
+ * (optional) "localprops" -> override and exclude properties (nvlist)
+ * (optional) "origin" -> name of clone origin (DRR_FLAG_CLONE)
+ * "begin_record" -> non-byteswapped dmu_replay_record_t
+ * "input_fd" -> file descriptor to read stream from (int32)
+ * (optional) "force" -> force flag (value ignored)
+ * (optional) "resumable" -> resumable flag (value ignored)
+ * (optional) "cleanup_fd" -> unused
+ * (optional) "action_handle" -> unused
+ * (optional) "hidden_args" -> { "wkeydata" -> value }
+ * }
+ *
+ * outnvl: {
+ * "read_bytes" -> number of bytes read
+ * "error_flags" -> zprop_errflags_t
+ * "errors" -> error for each unapplied received property (nvlist)
+ * }
+ */
+static const zfs_ioc_key_t zfs_keys_recv_new[] = {
+ {"snapname", DATA_TYPE_STRING, 0},
+ {"props", DATA_TYPE_NVLIST, ZK_OPTIONAL},
+ {"localprops", DATA_TYPE_NVLIST, ZK_OPTIONAL},
+ {"origin", DATA_TYPE_STRING, ZK_OPTIONAL},
+ {"begin_record", DATA_TYPE_BYTE_ARRAY, 0},
+ {"input_fd", DATA_TYPE_INT32, 0},
+ {"force", DATA_TYPE_BOOLEAN, ZK_OPTIONAL},
+ {"resumable", DATA_TYPE_BOOLEAN, ZK_OPTIONAL},
+ {"cleanup_fd", DATA_TYPE_INT32, ZK_OPTIONAL},
+ {"action_handle", DATA_TYPE_UINT64, ZK_OPTIONAL},
+ {"hidden_args", DATA_TYPE_NVLIST, ZK_OPTIONAL},
+};
+
+static int
+zfs_ioc_recv_new(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl)
+{
+ dmu_replay_record_t *begin_record;
+ uint_t begin_record_size;
+ nvlist_t *errors = NULL;
+ nvlist_t *recvprops = NULL;
+ nvlist_t *localprops = NULL;
+ nvlist_t *hidden_args = NULL;
+ char *snapname;
+ char *origin = NULL;
+ char *tosnap;
+ char tofs[ZFS_MAX_DATASET_NAME_LEN];
+ boolean_t force;
+ boolean_t resumable;
+ uint64_t read_bytes = 0;
+ uint64_t errflags = 0;
+ int input_fd = -1;
+ int error;
+
+ snapname = fnvlist_lookup_string(innvl, "snapname");
+
+ if (dataset_namecheck(snapname, NULL, NULL) != 0 ||
+ strchr(snapname, '@') == NULL ||
+ strchr(snapname, '%'))
+ return (SET_ERROR(EINVAL));
+
+ (void) strlcpy(tofs, snapname, sizeof (tofs));
+ tosnap = strchr(tofs, '@');
+ *tosnap++ = '\0';
+
+ error = nvlist_lookup_string(innvl, "origin", &origin);
+ if (error && error != ENOENT)
+ return (error);
+
+ error = nvlist_lookup_byte_array(innvl, "begin_record",
+ (uchar_t **)&begin_record, &begin_record_size);
+ if (error != 0 || begin_record_size != sizeof (*begin_record))
+ return (SET_ERROR(EINVAL));
+
+ input_fd = fnvlist_lookup_int32(innvl, "input_fd");
+
+ force = nvlist_exists(innvl, "force");
+ resumable = nvlist_exists(innvl, "resumable");
+
+ /* we still use "props" here for backwards compatibility */
+ error = nvlist_lookup_nvlist(innvl, "props", &recvprops);
+ if (error && error != ENOENT)
+ return (error);
+
+ error = nvlist_lookup_nvlist(innvl, "localprops", &localprops);
+ if (error && error != ENOENT)
+ return (error);
+
+ error = nvlist_lookup_nvlist(innvl, ZPOOL_HIDDEN_ARGS, &hidden_args);
+ if (error && error != ENOENT)
+ return (error);
+
+ error = zfs_ioc_recv_impl(tofs, tosnap, origin, recvprops, localprops,
+ hidden_args, force, resumable, input_fd, begin_record,
+ &read_bytes, &errflags, &errors);
+
+ fnvlist_add_uint64(outnvl, "read_bytes", read_bytes);
+ fnvlist_add_uint64(outnvl, "error_flags", errflags);
+ fnvlist_add_nvlist(outnvl, "errors", errors);
+
+ nvlist_free(errors);
+ nvlist_free(recvprops);
+ nvlist_free(localprops);
+
+ return (error);
+}
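+
+/*
+ * Illustrative sketch (not part of this change): the innvl consumed by
+ * zfs_ioc_recv_new() could be assembled in userland roughly as follows.
+ * "drr" is the BEGIN record already read from the stream; the snapshot
+ * name and the descriptor "fd" are hypothetical, and the boolean flags
+ * are optional.
+ *
+ * nvlist_t *innvl = fnvlist_alloc();
+ * fnvlist_add_string(innvl, "snapname", "pool/fs@snap");
+ * fnvlist_add_byte_array(innvl, "begin_record",
+ *     (uchar_t *)&drr, sizeof (drr));
+ * fnvlist_add_int32(innvl, "input_fd", fd);
+ * fnvlist_add_boolean(innvl, "force");
+ * fnvlist_add_boolean(innvl, "resumable");
+ * fnvlist_free(innvl);
+ *
+ * On success the outnvl carries "read_bytes", "error_flags" and "errors".
+ */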
+
+typedef struct dump_bytes_io {
+ zfs_file_t *dbi_fp;
+ caddr_t dbi_buf;
+ int dbi_len;
+ int dbi_err;
+} dump_bytes_io_t;
+
+static void
+dump_bytes_cb(void *arg)
+{
+ dump_bytes_io_t *dbi = (dump_bytes_io_t *)arg;
+ zfs_file_t *fp;
+ caddr_t buf;
+
+ fp = dbi->dbi_fp;
+ buf = dbi->dbi_buf;
+
+ dbi->dbi_err = zfs_file_write(fp, buf, dbi->dbi_len, NULL);
+}
+
+static int
+dump_bytes(objset_t *os, void *buf, int len, void *arg)
+{
+ dump_bytes_io_t dbi;
+
+ dbi.dbi_fp = arg;
+ dbi.dbi_buf = buf;
+ dbi.dbi_len = len;
+
+#if defined(HAVE_LARGE_STACKS)
+ dump_bytes_cb(&dbi);
+#else
+ /*
+ * The zfs_file_write() call (via dump_bytes_cb()) is performed in a
+ * taskq to ensure that there is always enough stack space to write
+ * safely to the target filesystem.
+ * The ZIO_TYPE_FREE threads are used because there can be a lot of
+ * them and they are used in vdev_file.c for a similar purpose.
+ */
+ spa_taskq_dispatch_sync(dmu_objset_spa(os), ZIO_TYPE_FREE,
+ ZIO_TASKQ_ISSUE, dump_bytes_cb, &dbi, TQ_SLEEP);
+#endif /* HAVE_LARGE_STACKS */
+
+ return (dbi.dbi_err);
+}
+
+/*
+ * inputs:
+ * zc_name name of snapshot to send
+ * zc_cookie file descriptor to send stream to
+ * zc_obj fromorigin flag (mutually exclusive with zc_fromobj)
+ * zc_sendobj objsetid of snapshot to send
+ * zc_fromobj objsetid of incremental fromsnap (may be zero)
+ * zc_guid if set, estimate size of stream only. zc_cookie is ignored.
+ * output size in zc_objset_type.
+ * zc_flags lzc_send_flags
+ *
+ * outputs:
+ * zc_objset_type estimated size, if zc_guid is set
+ *
+ * NOTE: This is no longer the preferred interface; any new functionality
+ * should be added to zfs_ioc_send_new() instead.
+ */
+static int
+zfs_ioc_send(zfs_cmd_t *zc)
+{
+ int error;
+ offset_t off;
+ boolean_t estimate = (zc->zc_guid != 0);
+ boolean_t embedok = (zc->zc_flags & 0x1);
+ boolean_t large_block_ok = (zc->zc_flags & 0x2);
+ boolean_t compressok = (zc->zc_flags & 0x4);
+ boolean_t rawok = (zc->zc_flags & 0x8);
+ boolean_t savedok = (zc->zc_flags & 0x10);
+
+ if (zc->zc_obj != 0) {
+ dsl_pool_t *dp;
+ dsl_dataset_t *tosnap;
+
+ error = dsl_pool_hold(zc->zc_name, FTAG, &dp);
+ if (error != 0)
+ return (error);
+
+ error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &tosnap);
+ if (error != 0) {
+ dsl_pool_rele(dp, FTAG);
+ return (error);
+ }
+
+ if (dsl_dir_is_clone(tosnap->ds_dir))
+ zc->zc_fromobj =
+ dsl_dir_phys(tosnap->ds_dir)->dd_origin_obj;
+ dsl_dataset_rele(tosnap, FTAG);
+ dsl_pool_rele(dp, FTAG);
+ }
+
+ if (estimate) {
+ dsl_pool_t *dp;
+ dsl_dataset_t *tosnap;
+ dsl_dataset_t *fromsnap = NULL;
+
+ error = dsl_pool_hold(zc->zc_name, FTAG, &dp);
+ if (error != 0)
+ return (error);
+
+ error = dsl_dataset_hold_obj(dp, zc->zc_sendobj,
+ FTAG, &tosnap);
+ if (error != 0) {
+ dsl_pool_rele(dp, FTAG);
+ return (error);
+ }
+
+ if (zc->zc_fromobj != 0) {
+ error = dsl_dataset_hold_obj(dp, zc->zc_fromobj,
+ FTAG, &fromsnap);
+ if (error != 0) {
+ dsl_dataset_rele(tosnap, FTAG);
+ dsl_pool_rele(dp, FTAG);
+ return (error);
+ }
+ }
+
+ error = dmu_send_estimate_fast(tosnap, fromsnap, NULL,
+ compressok || rawok, savedok, &zc->zc_objset_type);
+
+ if (fromsnap != NULL)
+ dsl_dataset_rele(fromsnap, FTAG);
+ dsl_dataset_rele(tosnap, FTAG);
+ dsl_pool_rele(dp, FTAG);
+ } else {
+ zfs_file_t *fp;
+ dmu_send_outparams_t out = {0};
+
+ if ((error = zfs_file_get(zc->zc_cookie, &fp)))
+ return (error);
+
+ off = zfs_file_off(fp);
+ out.dso_outfunc = dump_bytes;
+ out.dso_arg = fp;
+ out.dso_dryrun = B_FALSE;
+ error = dmu_send_obj(zc->zc_name, zc->zc_sendobj,
+ zc->zc_fromobj, embedok, large_block_ok, compressok,
+ rawok, savedok, zc->zc_cookie, &off, &out);
+
+ zfs_file_put(zc->zc_cookie);
+ }
+ return (error);
+}
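+
+/*
+ * Illustrative sketch (assumption, not part of this change): the zc_flags
+ * bits decoded at the top of zfs_ioc_send() map to embedok (0x1),
+ * large_block_ok (0x2), compressok (0x4), rawok (0x8) and savedok (0x10).
+ * A caller asking for a large-block, compressed stream would set, roughly:
+ *
+ * zc.zc_flags = 0x2 | 0x4;
+ * zc.zc_guid = 0;
+ * zc.zc_cookie = fd;
+ *
+ * where zc is a zfs_cmd_t, fd is a hypothetical open descriptor to write
+ * the stream to, and a nonzero zc_guid would request an estimate instead.
+ */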
+
+/*
+ * inputs:
+ * zc_name name of snapshot on which to report progress
+ * zc_cookie file descriptor of send stream
+ *
+ * outputs:
+ * zc_cookie number of bytes written in send stream thus far
+ * zc_objset_type logical size of data traversed by send thus far
+ */
+static int
+zfs_ioc_send_progress(zfs_cmd_t *zc)
+{
+ dsl_pool_t *dp;
+ dsl_dataset_t *ds;
+ dmu_sendstatus_t *dsp = NULL;
+ int error;
+
+ error = dsl_pool_hold(zc->zc_name, FTAG, &dp);
+ if (error != 0)
+ return (error);
+
+ error = dsl_dataset_hold(dp, zc->zc_name, FTAG, &ds);
+ if (error != 0) {
+ dsl_pool_rele(dp, FTAG);
+ return (error);
+ }
+
+ mutex_enter(&ds->ds_sendstream_lock);
+
+ /*
+ * Iterate over all the send streams currently active on this dataset.
+ * If there's one which matches the specified file descriptor _and_ the
+ * stream was started by the current process, return the progress of
+ * that stream.
+ */
+
+ for (dsp = list_head(&ds->ds_sendstreams); dsp != NULL;
+ dsp = list_next(&ds->ds_sendstreams, dsp)) {
+ if (dsp->dss_outfd == zc->zc_cookie &&
+ zfs_proc_is_caller(dsp->dss_proc))
+ break;
+ }
+
+ if (dsp != NULL) {
+ zc->zc_cookie = atomic_cas_64((volatile uint64_t *)dsp->dss_off,
+ 0, 0);
+ /* This is the closest thing we have to atomic_read_64. */
+ zc->zc_objset_type = atomic_cas_64(&dsp->dss_blocks, 0, 0);
+ } else {
+ error = SET_ERROR(ENOENT);
+ }
+
+ mutex_exit(&ds->ds_sendstream_lock);
+ dsl_dataset_rele(ds, FTAG);
+ dsl_pool_rele(dp, FTAG);
+ return (error);
+}
+
+static int
+zfs_ioc_inject_fault(zfs_cmd_t *zc)
+{
+ int id, error;
+
+ error = zio_inject_fault(zc->zc_name, (int)zc->zc_guid, &id,
+ &zc->zc_inject_record);
+
+ if (error == 0)
+ zc->zc_guid = (uint64_t)id;
+
+ return (error);
+}
+
+static int
+zfs_ioc_clear_fault(zfs_cmd_t *zc)
+{
+ return (zio_clear_fault((int)zc->zc_guid));
+}
+
+static int
+zfs_ioc_inject_list_next(zfs_cmd_t *zc)
+{
+ int id = (int)zc->zc_guid;
+ int error;
+
+ error = zio_inject_list_next(&id, zc->zc_name, sizeof (zc->zc_name),
+ &zc->zc_inject_record);
+
+ zc->zc_guid = id;
+
+ return (error);
+}
+
+static int
+zfs_ioc_error_log(zfs_cmd_t *zc)
+{
+ spa_t *spa;
+ int error;
+ size_t count = (size_t)zc->zc_nvlist_dst_size;
+
+ if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
+ return (error);
+
+ error = spa_get_errlog(spa, (void *)(uintptr_t)zc->zc_nvlist_dst,
+ &count);
+ if (error == 0)
+ zc->zc_nvlist_dst_size = count;
+ else
+ zc->zc_nvlist_dst_size = spa_get_errlog_size(spa);
+
+ spa_close(spa, FTAG);
+
+ return (error);
+}
+
+static int
+zfs_ioc_clear(zfs_cmd_t *zc)
+{
+ spa_t *spa;
+ vdev_t *vd;
+ int error;
+
+ /*
+ * On zpool clear we also fix up missing slogs
+ */
+ mutex_enter(&spa_namespace_lock);
+ spa = spa_lookup(zc->zc_name);
+ if (spa == NULL) {
+ mutex_exit(&spa_namespace_lock);
+ return (SET_ERROR(EIO));
+ }
+ if (spa_get_log_state(spa) == SPA_LOG_MISSING) {
+ /* we need to let spa_open/spa_load clear the chains */
+ spa_set_log_state(spa, SPA_LOG_CLEAR);
+ }
+ spa->spa_last_open_failed = 0;
+ mutex_exit(&spa_namespace_lock);
+
+ if (zc->zc_cookie & ZPOOL_NO_REWIND) {
+ error = spa_open(zc->zc_name, &spa, FTAG);
+ } else {
+ nvlist_t *policy;
+ nvlist_t *config = NULL;
+
+ if (zc->zc_nvlist_src == 0)
+ return (SET_ERROR(EINVAL));
+
+ if ((error = get_nvlist(zc->zc_nvlist_src,
+ zc->zc_nvlist_src_size, zc->zc_iflags, &policy)) == 0) {
+ error = spa_open_rewind(zc->zc_name, &spa, FTAG,
+ policy, &config);
+ if (config != NULL) {
+ int err;
+
+ if ((err = put_nvlist(zc, config)) != 0)
+ error = err;
+ nvlist_free(config);
+ }
+ nvlist_free(policy);
+ }
+ }
+
+ if (error != 0)
+ return (error);
+
+ /*
+ * If multihost is enabled, resuming I/O is unsafe as another
+ * host may have imported the pool.
+ */
+ if (spa_multihost(spa) && spa_suspended(spa))
+ return (SET_ERROR(EINVAL));
+
+ spa_vdev_state_enter(spa, SCL_NONE);
+
+ if (zc->zc_guid == 0) {
+ vd = NULL;
+ } else {
+ vd = spa_lookup_by_guid(spa, zc->zc_guid, B_TRUE);
+ if (vd == NULL) {
+ error = SET_ERROR(ENODEV);
+ (void) spa_vdev_state_exit(spa, NULL, error);
+ spa_close(spa, FTAG);
+ return (error);
+ }
+ }
+
+ vdev_clear(spa, vd);
+
+ (void) spa_vdev_state_exit(spa, spa_suspended(spa) ?
+ NULL : spa->spa_root_vdev, 0);
+
+ /*
+ * Resume any suspended I/Os.
+ */
+ if (zio_resume(spa) != 0)
+ error = SET_ERROR(EIO);
+
+ spa_close(spa, FTAG);
+
+ return (error);
+}
+
+/*
+ * Reopen all the vdevs associated with the pool.
+ *
+ * innvl: {
+ * "scrub_restart" -> when true and scrub is running, allow to restart
+ * scrub as the side effect of the reopen (boolean).
+ * }
+ *
+ * outnvl is unused
+ */
+static const zfs_ioc_key_t zfs_keys_pool_reopen[] = {
+ {"scrub_restart", DATA_TYPE_BOOLEAN_VALUE, ZK_OPTIONAL},
+};
+
+/* ARGSUSED */
+static int
+zfs_ioc_pool_reopen(const char *pool, nvlist_t *innvl, nvlist_t *outnvl)
+{
+ spa_t *spa;
+ int error;
+ boolean_t rc, scrub_restart = B_TRUE;
+
+ if (innvl) {
+ error = nvlist_lookup_boolean_value(innvl,
+ "scrub_restart", &rc);
+ if (error == 0)
+ scrub_restart = rc;
+ }
+
+ error = spa_open(pool, &spa, FTAG);
+ if (error != 0)
+ return (error);
+
+ spa_vdev_state_enter(spa, SCL_NONE);
+
+ /*
+ * If the scrub_restart flag is B_FALSE and a scrub is already
+ * in progress then set spa_scrub_reopen flag to B_TRUE so that
+ * we don't restart the scrub as a side effect of the reopen.
+ * Otherwise, let vdev_open() decide if a resilver is required.
+ */
+
+ spa->spa_scrub_reopen = (!scrub_restart &&
+ dsl_scan_scrubbing(spa->spa_dsl_pool));
+ vdev_reopen(spa->spa_root_vdev);
+ spa->spa_scrub_reopen = B_FALSE;
+
+ (void) spa_vdev_state_exit(spa, NULL, 0);
+ spa_close(spa, FTAG);
+ return (0);
+}
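+
+/*
+ * Illustrative sketch (not part of this change): "scrub_restart" is a
+ * DATA_TYPE_BOOLEAN_VALUE, so its value matters; passing B_FALSE keeps an
+ * in-progress scrub from being restarted by the reopen.
+ *
+ * nvlist_t *innvl = fnvlist_alloc();
+ * fnvlist_add_boolean_value(innvl, "scrub_restart", B_FALSE);
+ * fnvlist_free(innvl);
+ */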
+
+/*
+ * inputs:
+ * zc_name name of filesystem
+ *
+ * outputs:
+ * zc_string name of conflicting snapshot, if there is one
+ */
+static int
+zfs_ioc_promote(zfs_cmd_t *zc)
+{
+ dsl_pool_t *dp;
+ dsl_dataset_t *ds, *ods;
+ char origin[ZFS_MAX_DATASET_NAME_LEN];
+ char *cp;
+ int error;
+
+ zc->zc_name[sizeof (zc->zc_name) - 1] = '\0';
+ if (dataset_namecheck(zc->zc_name, NULL, NULL) != 0 ||
+ strchr(zc->zc_name, '%'))
+ return (SET_ERROR(EINVAL));
+
+ error = dsl_pool_hold(zc->zc_name, FTAG, &dp);
+ if (error != 0)
+ return (error);
+
+ error = dsl_dataset_hold(dp, zc->zc_name, FTAG, &ds);
+ if (error != 0) {
+ dsl_pool_rele(dp, FTAG);
+ return (error);
+ }
+
+ if (!dsl_dir_is_clone(ds->ds_dir)) {
+ dsl_dataset_rele(ds, FTAG);
+ dsl_pool_rele(dp, FTAG);
+ return (SET_ERROR(EINVAL));
+ }
+
+ error = dsl_dataset_hold_obj(dp,
+ dsl_dir_phys(ds->ds_dir)->dd_origin_obj, FTAG, &ods);
+ if (error != 0) {
+ dsl_dataset_rele(ds, FTAG);
+ dsl_pool_rele(dp, FTAG);
+ return (error);
+ }
+
+ dsl_dataset_name(ods, origin);
+ dsl_dataset_rele(ods, FTAG);
+ dsl_dataset_rele(ds, FTAG);
+ dsl_pool_rele(dp, FTAG);
+
+ /*
+ * We don't need to unmount *all* the origin fs's snapshots, but
+ * it's easier.
+ */
+ cp = strchr(origin, '@');
+ if (cp)
+ *cp = '\0';
+ (void) dmu_objset_find(origin,
+ zfs_unmount_snap_cb, NULL, DS_FIND_SNAPSHOTS);
+ return (dsl_dataset_promote(zc->zc_name, zc->zc_string));
+}
+
+/*
+ * Retrieve a single {user|group|project}{used|quota}@... property.
+ *
+ * inputs:
+ * zc_name name of filesystem
+ * zc_objset_type zfs_userquota_prop_t
+ * zc_value domain name (e.g. "S-1-234-567-89")
+ * zc_guid RID/UID/GID
+ *
+ * outputs:
+ * zc_cookie property value
+ */
+static int
+zfs_ioc_userspace_one(zfs_cmd_t *zc)
+{
+ zfsvfs_t *zfsvfs;
+ int error;
+
+ if (zc->zc_objset_type >= ZFS_NUM_USERQUOTA_PROPS)
+ return (SET_ERROR(EINVAL));
+
+ error = zfsvfs_hold(zc->zc_name, FTAG, &zfsvfs, B_FALSE);
+ if (error != 0)
+ return (error);
+
+ error = zfs_userspace_one(zfsvfs,
+ zc->zc_objset_type, zc->zc_value, zc->zc_guid, &zc->zc_cookie);
+ zfsvfs_rele(zfsvfs, FTAG);
+
+ return (error);
+}
+
+/*
+ * inputs:
+ * zc_name name of filesystem
+ * zc_cookie zap cursor
+ * zc_objset_type zfs_userquota_prop_t
+ * zc_nvlist_dst[_size] buffer to fill (not really an nvlist)
+ *
+ * outputs:
+ * zc_nvlist_dst[_size] data buffer (array of zfs_useracct_t)
+ * zc_cookie zap cursor
+ */
+static int
+zfs_ioc_userspace_many(zfs_cmd_t *zc)
+{
+ zfsvfs_t *zfsvfs;
+ int bufsize = zc->zc_nvlist_dst_size;
+
+ if (bufsize <= 0)
+ return (SET_ERROR(ENOMEM));
+
+ int error = zfsvfs_hold(zc->zc_name, FTAG, &zfsvfs, B_FALSE);
+ if (error != 0)
+ return (error);
+
+ void *buf = vmem_alloc(bufsize, KM_SLEEP);
+
+ error = zfs_userspace_many(zfsvfs, zc->zc_objset_type, &zc->zc_cookie,
+ buf, &zc->zc_nvlist_dst_size);
+
+ if (error == 0) {
+ error = xcopyout(buf,
+ (void *)(uintptr_t)zc->zc_nvlist_dst,
+ zc->zc_nvlist_dst_size);
+ }
+ vmem_free(buf, bufsize);
+ zfsvfs_rele(zfsvfs, FTAG);
+
+ return (error);
+}
+
+/*
+ * inputs:
+ * zc_name name of filesystem
+ *
+ * outputs:
+ * none
+ */
+static int
+zfs_ioc_userspace_upgrade(zfs_cmd_t *zc)
+{
+ int error = 0;
+ zfsvfs_t *zfsvfs;
+
+ if (getzfsvfs(zc->zc_name, &zfsvfs) == 0) {
+ if (!dmu_objset_userused_enabled(zfsvfs->z_os)) {
+ /*
+ * If userused is not enabled, it may be because the
+ * objset needs to be closed & reopened (to grow the
+ * objset_phys_t). Suspending and resuming the fs will do that.
+ */
+ dsl_dataset_t *ds, *newds;
+
+ ds = dmu_objset_ds(zfsvfs->z_os);
+ error = zfs_suspend_fs(zfsvfs);
+ if (error == 0) {
+ dmu_objset_refresh_ownership(ds, &newds,
+ B_TRUE, zfsvfs);
+ error = zfs_resume_fs(zfsvfs, newds);
+ }
+ }
+ if (error == 0) {
+ mutex_enter(&zfsvfs->z_os->os_upgrade_lock);
+ if (zfsvfs->z_os->os_upgrade_id == 0) {
+ /* clear potential error code and retry */
+ zfsvfs->z_os->os_upgrade_status = 0;
+ mutex_exit(&zfsvfs->z_os->os_upgrade_lock);
+
+ dsl_pool_config_enter(
+ dmu_objset_pool(zfsvfs->z_os), FTAG);
+ dmu_objset_userspace_upgrade(zfsvfs->z_os);
+ dsl_pool_config_exit(
+ dmu_objset_pool(zfsvfs->z_os), FTAG);
+ } else {
+ mutex_exit(&zfsvfs->z_os->os_upgrade_lock);
+ }
+
+ taskq_wait_id(zfsvfs->z_os->os_spa->spa_upgrade_taskq,
+ zfsvfs->z_os->os_upgrade_id);
+ error = zfsvfs->z_os->os_upgrade_status;
+ }
+ zfs_vfs_rele(zfsvfs);
+ } else {
+ objset_t *os;
+
+ /* XXX kind of reading contents without owning */
+ error = dmu_objset_hold_flags(zc->zc_name, B_TRUE, FTAG, &os);
+ if (error != 0)
+ return (error);
+
+ mutex_enter(&os->os_upgrade_lock);
+ if (os->os_upgrade_id == 0) {
+ /* clear potential error code and retry */
+ os->os_upgrade_status = 0;
+ mutex_exit(&os->os_upgrade_lock);
+
+ dmu_objset_userspace_upgrade(os);
+ } else {
+ mutex_exit(&os->os_upgrade_lock);
+ }
+
+ dsl_pool_rele(dmu_objset_pool(os), FTAG);
+
+ taskq_wait_id(os->os_spa->spa_upgrade_taskq, os->os_upgrade_id);
+ error = os->os_upgrade_status;
+
+ dsl_dataset_rele_flags(dmu_objset_ds(os), DS_HOLD_FLAG_DECRYPT,
+ FTAG);
+ }
+ return (error);
+}
+
+/*
+ * inputs:
+ * zc_name name of filesystem
+ *
+ * outputs:
+ * none
+ */
+static int
+zfs_ioc_id_quota_upgrade(zfs_cmd_t *zc)
+{
+ objset_t *os;
+ int error;
+
+ error = dmu_objset_hold_flags(zc->zc_name, B_TRUE, FTAG, &os);
+ if (error != 0)
+ return (error);
+
+ if (dmu_objset_userobjspace_upgradable(os) ||
+ dmu_objset_projectquota_upgradable(os)) {
+ mutex_enter(&os->os_upgrade_lock);
+ if (os->os_upgrade_id == 0) {
+ /* clear potential error code and retry */
+ os->os_upgrade_status = 0;
+ mutex_exit(&os->os_upgrade_lock);
+
+ dmu_objset_id_quota_upgrade(os);
+ } else {
+ mutex_exit(&os->os_upgrade_lock);
+ }
+
+ dsl_pool_rele(dmu_objset_pool(os), FTAG);
+
+ taskq_wait_id(os->os_spa->spa_upgrade_taskq, os->os_upgrade_id);
+ error = os->os_upgrade_status;
+ } else {
+ dsl_pool_rele(dmu_objset_pool(os), FTAG);
+ }
+
+ dsl_dataset_rele_flags(dmu_objset_ds(os), DS_HOLD_FLAG_DECRYPT, FTAG);
+
+ return (error);
+}
+
+static int
+zfs_ioc_share(zfs_cmd_t *zc)
+{
+ return (SET_ERROR(ENOSYS));
+}
+
+ace_t full_access[] = {
+ {(uid_t)-1, ACE_ALL_PERMS, ACE_EVERYONE, 0}
+};
+
+/*
+ * inputs:
+ * zc_name name of containing filesystem
+ * zc_obj object # beyond which we want next in-use object #
+ *
+ * outputs:
+ * zc_obj next in-use object #
+ */
+static int
+zfs_ioc_next_obj(zfs_cmd_t *zc)
+{
+ objset_t *os = NULL;
+ int error;
+
+ error = dmu_objset_hold(zc->zc_name, FTAG, &os);
+ if (error != 0)
+ return (error);
+
+ error = dmu_object_next(os, &zc->zc_obj, B_FALSE, 0);
+
+ dmu_objset_rele(os, FTAG);
+ return (error);
+}
+
+/*
+ * inputs:
+ * zc_name name of filesystem
+ * zc_value prefix name for snapshot
+ * zc_cleanup_fd cleanup-on-exit file descriptor for calling process
+ *
+ * outputs:
+ * zc_value short name of new snapshot
+ */
+static int
+zfs_ioc_tmp_snapshot(zfs_cmd_t *zc)
+{
+ char *snap_name;
+ char *hold_name;
+ int error;
+ minor_t minor;
+
+ error = zfs_onexit_fd_hold(zc->zc_cleanup_fd, &minor);
+ if (error != 0)
+ return (error);
+
+ snap_name = kmem_asprintf("%s-%016llx", zc->zc_value,
+ (u_longlong_t)ddi_get_lbolt64());
+ hold_name = kmem_asprintf("%%%s", zc->zc_value);
+
+ error = dsl_dataset_snapshot_tmp(zc->zc_name, snap_name, minor,
+ hold_name);
+ if (error == 0)
+ (void) strlcpy(zc->zc_value, snap_name,
+ sizeof (zc->zc_value));
+ kmem_strfree(snap_name);
+ kmem_strfree(hold_name);
+ zfs_onexit_fd_rele(zc->zc_cleanup_fd);
+ return (error);
+}
+
+/*
+ * inputs:
+ * zc_name name of "to" snapshot
+ * zc_value name of "from" snapshot
+ * zc_cookie file descriptor to write diff data on
+ *
+ * outputs:
+ * dmu_diff_record_t's to the file descriptor
+ */
+static int
+zfs_ioc_diff(zfs_cmd_t *zc)
+{
+ zfs_file_t *fp;
+ offset_t off;
+ int error;
+
+ if ((error = zfs_file_get(zc->zc_cookie, &fp)))
+ return (error);
+
+ off = zfs_file_off(fp);
+ error = dmu_diff(zc->zc_name, zc->zc_value, fp, &off);
+
+ zfs_file_put(zc->zc_cookie);
+
+ return (error);
+}
+
+static int
+zfs_ioc_smb_acl(zfs_cmd_t *zc)
+{
+ return (SET_ERROR(ENOTSUP));
+}
+
+/*
+ * innvl: {
+ * "holds" -> { snapname -> holdname (string), ... }
+ * (optional) "cleanup_fd" -> fd (int32)
+ * }
+ *
+ * outnvl: {
+ * snapname -> error value (int32)
+ * ...
+ * }
+ */
+static const zfs_ioc_key_t zfs_keys_hold[] = {
+ {"holds", DATA_TYPE_NVLIST, 0},
+ {"cleanup_fd", DATA_TYPE_INT32, ZK_OPTIONAL},
+};
+
+/* ARGSUSED */
+static int
+zfs_ioc_hold(const char *pool, nvlist_t *args, nvlist_t *errlist)
+{
+ nvpair_t *pair;
+ nvlist_t *holds;
+ int cleanup_fd = -1;
+ int error;
+ minor_t minor = 0;
+
+ holds = fnvlist_lookup_nvlist(args, "holds");
+
+ /* make sure the user didn't pass us any invalid (empty) tags */
+ for (pair = nvlist_next_nvpair(holds, NULL); pair != NULL;
+ pair = nvlist_next_nvpair(holds, pair)) {
+ char *htag;
+
+ error = nvpair_value_string(pair, &htag);
+ if (error != 0)
+ return (SET_ERROR(error));
+
+ if (strlen(htag) == 0)
+ return (SET_ERROR(EINVAL));
+ }
+
+ if (nvlist_lookup_int32(args, "cleanup_fd", &cleanup_fd) == 0) {
+ error = zfs_onexit_fd_hold(cleanup_fd, &minor);
+ if (error != 0)
+ return (SET_ERROR(error));
+ }
+
+ error = dsl_dataset_user_hold(holds, minor, errlist);
+ if (minor != 0)
+ zfs_onexit_fd_rele(cleanup_fd);
+ return (SET_ERROR(error));
+}
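+
+/*
+ * Illustrative sketch (not part of this change): the "holds" nvlist checked
+ * above maps full snapshot names to hold tags; the names used here are
+ * hypothetical and "cleanup_fd" is optional.
+ *
+ * nvlist_t *holds = fnvlist_alloc();
+ * nvlist_t *args = fnvlist_alloc();
+ * fnvlist_add_string(holds, "pool/fs@snap", "my-tag");
+ * fnvlist_add_nvlist(args, "holds", holds);
+ * fnvlist_add_int32(args, "cleanup_fd", cleanup_fd);
+ * fnvlist_free(holds);
+ */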
+
+/*
+ * innvl is not used.
+ *
+ * outnvl: {
+ * holdname -> time added (uint64 seconds since epoch)
+ * ...
+ * }
+ */
+static const zfs_ioc_key_t zfs_keys_get_holds[] = {
+ /* no nvl keys */
+};
+
+/* ARGSUSED */
+static int
+zfs_ioc_get_holds(const char *snapname, nvlist_t *args, nvlist_t *outnvl)
+{
+ return (dsl_dataset_get_holds(snapname, outnvl));
+}
+
+/*
+ * innvl: {
+ * snapname -> { holdname, ... }
+ * ...
+ * }
+ *
+ * outnvl: {
+ * snapname -> error value (int32)
+ * ...
+ * }
+ */
+static const zfs_ioc_key_t zfs_keys_release[] = {
+ {"<snapname>...", DATA_TYPE_NVLIST, ZK_WILDCARDLIST},
+};
+
+/* ARGSUSED */
+static int
+zfs_ioc_release(const char *pool, nvlist_t *holds, nvlist_t *errlist)
+{
+ return (dsl_dataset_user_release(holds, errlist));
+}
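+
+/*
+ * Illustrative sketch (not part of this change): the release innvl nests
+ * one nvlist of hold tags per snapshot name (all names hypothetical).
+ *
+ * nvlist_t *tags = fnvlist_alloc();
+ * nvlist_t *holds = fnvlist_alloc();
+ * fnvlist_add_boolean(tags, "my-tag");
+ * fnvlist_add_nvlist(holds, "pool/fs@snap", tags);
+ * fnvlist_free(tags);
+ */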
+
+/*
+ * inputs:
+ * zc_guid flags (ZEVENT_NONBLOCK)
+ * zc_cleanup_fd zevent file descriptor
+ *
+ * outputs:
+ * zc_nvlist_dst next nvlist event
+ * zc_cookie dropped events since last get
+ */
+static int
+zfs_ioc_events_next(zfs_cmd_t *zc)
+{
+ zfs_zevent_t *ze;
+ nvlist_t *event = NULL;
+ minor_t minor;
+ uint64_t dropped = 0;
+ int error;
+
+ error = zfs_zevent_fd_hold(zc->zc_cleanup_fd, &minor, &ze);
+ if (error != 0)
+ return (error);
+
+ do {
+ error = zfs_zevent_next(ze, &event,
+ &zc->zc_nvlist_dst_size, &dropped);
+ if (event != NULL) {
+ zc->zc_cookie = dropped;
+ error = put_nvlist(zc, event);
+ nvlist_free(event);
+ }
+
+ if (zc->zc_guid & ZEVENT_NONBLOCK)
+ break;
+
+ if ((error == 0) || (error != ENOENT))
+ break;
+
+ error = zfs_zevent_wait(ze);
+ if (error != 0)
+ break;
+ } while (1);
+
+ zfs_zevent_fd_rele(zc->zc_cleanup_fd);
+
+ return (error);
+}
+
+/*
+ * outputs:
+ * zc_cookie cleared events count
+ */
+static int
+zfs_ioc_events_clear(zfs_cmd_t *zc)
+{
+ int count;
+
+ zfs_zevent_drain_all(&count);
+ zc->zc_cookie = count;
+
+ return (0);
+}
+
+/*
+ * inputs:
+ * zc_guid eid | ZEVENT_SEEK_START | ZEVENT_SEEK_END
+ * zc_cleanup_fd zevent file descriptor
+ */
+static int
+zfs_ioc_events_seek(zfs_cmd_t *zc)
+{
+ zfs_zevent_t *ze;
+ minor_t minor;
+ int error;
+
+ error = zfs_zevent_fd_hold(zc->zc_cleanup_fd, &minor, &ze);
+ if (error != 0)
+ return (error);
+
+ error = zfs_zevent_seek(ze, zc->zc_guid);
+ zfs_zevent_fd_rele(zc->zc_cleanup_fd);
+
+ return (error);
+}
+
+/*
+ * inputs:
+ * zc_name name of later filesystem or snapshot
+ * zc_value full name of old snapshot or bookmark
+ *
+ * outputs:
+ * zc_cookie space in bytes
+ * zc_objset_type compressed space in bytes
+ * zc_perm_action uncompressed space in bytes
+ */
+static int
+zfs_ioc_space_written(zfs_cmd_t *zc)
+{
+ int error;
+ dsl_pool_t *dp;
+ dsl_dataset_t *new;
+
+ error = dsl_pool_hold(zc->zc_name, FTAG, &dp);
+ if (error != 0)
+ return (error);
+ error = dsl_dataset_hold(dp, zc->zc_name, FTAG, &new);
+ if (error != 0) {
+ dsl_pool_rele(dp, FTAG);
+ return (error);
+ }
+ if (strchr(zc->zc_value, '#') != NULL) {
+ zfs_bookmark_phys_t bmp;
+ error = dsl_bookmark_lookup(dp, zc->zc_value,
+ new, &bmp);
+ if (error == 0) {
+ error = dsl_dataset_space_written_bookmark(&bmp, new,
+ &zc->zc_cookie,
+ &zc->zc_objset_type, &zc->zc_perm_action);
+ }
+ } else {
+ dsl_dataset_t *old;
+ error = dsl_dataset_hold(dp, zc->zc_value, FTAG, &old);
+
+ if (error == 0) {
+ error = dsl_dataset_space_written(old, new,
+ &zc->zc_cookie,
+ &zc->zc_objset_type, &zc->zc_perm_action);
+ dsl_dataset_rele(old, FTAG);
+ }
+ }
+ dsl_dataset_rele(new, FTAG);
+ dsl_pool_rele(dp, FTAG);
+ return (error);
+}
+
+/*
+ * innvl: {
+ * "firstsnap" -> snapshot name
+ * }
+ *
+ * outnvl: {
+ * "used" -> space in bytes
+ * "compressed" -> compressed space in bytes
+ * "uncompressed" -> uncompressed space in bytes
+ * }
+ */
+static const zfs_ioc_key_t zfs_keys_space_snaps[] = {
+ {"firstsnap", DATA_TYPE_STRING, 0},
+};
+
+static int
+zfs_ioc_space_snaps(const char *lastsnap, nvlist_t *innvl, nvlist_t *outnvl)
+{
+ int error;
+ dsl_pool_t *dp;
+ dsl_dataset_t *new, *old;
+ char *firstsnap;
+ uint64_t used, comp, uncomp;
+
+ firstsnap = fnvlist_lookup_string(innvl, "firstsnap");
+
+ error = dsl_pool_hold(lastsnap, FTAG, &dp);
+ if (error != 0)
+ return (error);
+
+ error = dsl_dataset_hold(dp, lastsnap, FTAG, &new);
+ if (error == 0 && !new->ds_is_snapshot) {
+ dsl_dataset_rele(new, FTAG);
+ error = SET_ERROR(EINVAL);
+ }
+ if (error != 0) {
+ dsl_pool_rele(dp, FTAG);
+ return (error);
+ }
+ error = dsl_dataset_hold(dp, firstsnap, FTAG, &old);
+ if (error == 0 && !old->ds_is_snapshot) {
+ dsl_dataset_rele(old, FTAG);
+ error = SET_ERROR(EINVAL);
+ }
+ if (error != 0) {
+ dsl_dataset_rele(new, FTAG);
+ dsl_pool_rele(dp, FTAG);
+ return (error);
+ }
+
+ error = dsl_dataset_space_wouldfree(old, new, &used, &comp, &uncomp);
+ dsl_dataset_rele(old, FTAG);
+ dsl_dataset_rele(new, FTAG);
+ dsl_pool_rele(dp, FTAG);
+ fnvlist_add_uint64(outnvl, "used", used);
+ fnvlist_add_uint64(outnvl, "compressed", comp);
+ fnvlist_add_uint64(outnvl, "uncompressed", uncomp);
+ return (error);
+}
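+
+/*
+ * Illustrative sketch (not part of this change): the caller names the last
+ * snapshot in the ioctl itself and passes the first one in innvl; outnvl
+ * then reports, roughly, the space that would be reclaimed by destroying
+ * the snapshots in that range. The snapshot name is hypothetical.
+ *
+ * nvlist_t *innvl = fnvlist_alloc();
+ * fnvlist_add_string(innvl, "firstsnap", "pool/fs@first");
+ * fnvlist_free(innvl);
+ */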
+
+/*
+ * innvl: {
+ * "fd" -> file descriptor to write stream to (int32)
+ * (optional) "fromsnap" -> full snap name to send an incremental from
+ * (optional) "largeblockok" -> (value ignored)
+ * indicates that blocks > 128KB are permitted
+ * (optional) "embedok" -> (value ignored)
+ * presence indicates DRR_WRITE_EMBEDDED records are permitted
+ * (optional) "compressok" -> (value ignored)
+ * presence indicates compressed DRR_WRITE records are permitted
+ * (optional) "rawok" -> (value ignored)
+ * presence indicates raw encrypted records should be used.
+ * (optional) "savedok" -> (value ignored)
+ * presence indicates we should send a partially received snapshot
+ * (optional) "resume_object" and "resume_offset" -> (uint64)
+ * if present, resume send stream from specified object and offset.
+ * (optional) "redactbook" -> (string)
+ * if present, use this bookmark's redaction list to generate a redacted
+ * send stream
+ * }
+ *
+ * outnvl is unused
+ */
+static const zfs_ioc_key_t zfs_keys_send_new[] = {
+ {"fd", DATA_TYPE_INT32, 0},
+ {"fromsnap", DATA_TYPE_STRING, ZK_OPTIONAL},
+ {"largeblockok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL},
+ {"embedok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL},
+ {"compressok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL},
+ {"rawok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL},
+ {"savedok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL},
+ {"resume_object", DATA_TYPE_UINT64, ZK_OPTIONAL},
+ {"resume_offset", DATA_TYPE_UINT64, ZK_OPTIONAL},
+ {"redactbook", DATA_TYPE_STRING, ZK_OPTIONAL},
+};
+
+/* ARGSUSED */
+static int
+zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl)
+{
+ int error;
+ offset_t off;
+ char *fromname = NULL;
+ int fd;
+ zfs_file_t *fp;
+ boolean_t largeblockok;
+ boolean_t embedok;
+ boolean_t compressok;
+ boolean_t rawok;
+ boolean_t savedok;
+ uint64_t resumeobj = 0;
+ uint64_t resumeoff = 0;
+ char *redactbook = NULL;
+
+ fd = fnvlist_lookup_int32(innvl, "fd");
+
+ (void) nvlist_lookup_string(innvl, "fromsnap", &fromname);
+
+ largeblockok = nvlist_exists(innvl, "largeblockok");
+ embedok = nvlist_exists(innvl, "embedok");
+ compressok = nvlist_exists(innvl, "compressok");
+ rawok = nvlist_exists(innvl, "rawok");
+ savedok = nvlist_exists(innvl, "savedok");
+
+ (void) nvlist_lookup_uint64(innvl, "resume_object", &resumeobj);
+ (void) nvlist_lookup_uint64(innvl, "resume_offset", &resumeoff);
+
+ (void) nvlist_lookup_string(innvl, "redactbook", &redactbook);
+
+ if ((error = zfs_file_get(fd, &fp)))
+ return (error);
+
+ off = zfs_file_off(fp);
+
+ dmu_send_outparams_t out = {0};
+ out.dso_outfunc = dump_bytes;
+ out.dso_arg = fp;
+ out.dso_dryrun = B_FALSE;
+ error = dmu_send(snapname, fromname, embedok, largeblockok,
+ compressok, rawok, savedok, resumeobj, resumeoff,
+ redactbook, fd, &off, &out);
+
+ zfs_file_put(fd);
+ return (error);
+}
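+
+/*
+ * Illustrative sketch (not part of this change): a minimal innvl for the
+ * new-style send ioctl above, assuming "fd" is an open descriptor to write
+ * the stream to; the incremental source name is hypothetical and the
+ * "fromsnap"/"compressok" entries are optional.
+ *
+ * nvlist_t *innvl = fnvlist_alloc();
+ * fnvlist_add_int32(innvl, "fd", fd);
+ * fnvlist_add_string(innvl, "fromsnap", "pool/fs@prev");
+ * fnvlist_add_boolean(innvl, "compressok");
+ * fnvlist_free(innvl);
+ */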
+
+/* ARGSUSED */
+static int
+send_space_sum(objset_t *os, void *buf, int len, void *arg)
+{
+ uint64_t *size = arg;
+ *size += len;
+ return (0);
+}
+
+/*
+ * Determine approximately how large a zfs send stream will be -- the number
+ * of bytes that will be written to the fd supplied to zfs_ioc_send_new().
+ *
+ * innvl: {
+ * (optional) "from" -> full snap or bookmark name to send an incremental
+ * from
+ * (optional) "largeblockok" -> (value ignored)
+ * indicates that blocks > 128KB are permitted
+ * (optional) "embedok" -> (value ignored)
+ * presence indicates DRR_WRITE_EMBEDDED records are permitted
+ * (optional) "compressok" -> (value ignored)
+ * presence indicates compressed DRR_WRITE records are permitted
+ * (optional) "rawok" -> (value ignored)
+ * presence indicates raw encrypted records should be used.
+ * (optional) "resume_object" and "resume_offset" -> (uint64)
+ * if present, resume send stream from specified object and offset.
+ * (optional) "fd" -> file descriptor to use as a cookie for progress
+ * tracking (int32)
+ * }
+ *
+ * outnvl: {
+ * "space" -> bytes of space (uint64)
+ * }
+ */
+static const zfs_ioc_key_t zfs_keys_send_space[] = {
+ {"from", DATA_TYPE_STRING, ZK_OPTIONAL},
+ {"fromsnap", DATA_TYPE_STRING, ZK_OPTIONAL},
+ {"largeblockok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL},
+ {"embedok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL},
+ {"compressok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL},
+ {"rawok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL},
+ {"fd", DATA_TYPE_INT32, ZK_OPTIONAL},
+ {"redactbook", DATA_TYPE_STRING, ZK_OPTIONAL},
+ {"resume_object", DATA_TYPE_UINT64, ZK_OPTIONAL},
+ {"resume_offset", DATA_TYPE_UINT64, ZK_OPTIONAL},
+ {"bytes", DATA_TYPE_UINT64, ZK_OPTIONAL},
+};
+
+static int
+zfs_ioc_send_space(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl)
+{
+ dsl_pool_t *dp;
+ dsl_dataset_t *tosnap;
+ dsl_dataset_t *fromsnap = NULL;
+ int error;
+ char *fromname = NULL;
+ char *redactlist_book = NULL;
+ boolean_t largeblockok;
+ boolean_t embedok;
+ boolean_t compressok;
+ boolean_t rawok;
+ boolean_t savedok;
+ uint64_t space = 0;
+ boolean_t full_estimate = B_FALSE;
+ uint64_t resumeobj = 0;
+ uint64_t resumeoff = 0;
+ uint64_t resume_bytes = 0;
+ int32_t fd = -1;
+ zfs_bookmark_phys_t zbm = {0};
+
+ error = dsl_pool_hold(snapname, FTAG, &dp);
+ if (error != 0)
+ return (error);
+
+ error = dsl_dataset_hold(dp, snapname, FTAG, &tosnap);
+ if (error != 0) {
+ dsl_pool_rele(dp, FTAG);
+ return (error);
+ }
+ (void) nvlist_lookup_int32(innvl, "fd", &fd);
+
+ largeblockok = nvlist_exists(innvl, "largeblockok");
+ embedok = nvlist_exists(innvl, "embedok");
+ compressok = nvlist_exists(innvl, "compressok");
+ rawok = nvlist_exists(innvl, "rawok");
+ savedok = nvlist_exists(innvl, "savedok");
+ boolean_t from = (nvlist_lookup_string(innvl, "from", &fromname) == 0);
+ boolean_t altbook = (nvlist_lookup_string(innvl, "redactbook",
+ &redactlist_book) == 0);
+
+ (void) nvlist_lookup_uint64(innvl, "resume_object", &resumeobj);
+ (void) nvlist_lookup_uint64(innvl, "resume_offset", &resumeoff);
+ (void) nvlist_lookup_uint64(innvl, "bytes", &resume_bytes);
+
+ if (altbook) {
+ full_estimate = B_TRUE;
+ } else if (from) {
+ if (strchr(fromname, '#')) {
+ error = dsl_bookmark_lookup(dp, fromname, tosnap, &zbm);
+
+ /*
+ * dsl_bookmark_lookup() will fail with EXDEV if
+ * the from-bookmark and tosnap are at the same txg.
+ * However, it's valid to do a send (and therefore,
+ * a send estimate) from and to the same time point,
+ * if the bookmark is redacted (the incremental send
+ * can change what's redacted on the target). In
+ * this case, dsl_bookmark_lookup() fills in zbm
+ * but returns EXDEV. Ignore this error.
+ */
+ if (error == EXDEV && zbm.zbm_redaction_obj != 0 &&
+ zbm.zbm_guid ==
+ dsl_dataset_phys(tosnap)->ds_guid)
+ error = 0;
+
+ if (error != 0) {
+ dsl_dataset_rele(tosnap, FTAG);
+ dsl_pool_rele(dp, FTAG);
+ return (error);
+ }
+ if (zbm.zbm_redaction_obj != 0 || !(zbm.zbm_flags &
+ ZBM_FLAG_HAS_FBN)) {
+ full_estimate = B_TRUE;
+ }
+ } else if (strchr(fromname, '@')) {
+ error = dsl_dataset_hold(dp, fromname, FTAG, &fromsnap);
+ if (error != 0) {
+ dsl_dataset_rele(tosnap, FTAG);
+ dsl_pool_rele(dp, FTAG);
+ return (error);
+ }
+
+ if (!dsl_dataset_is_before(tosnap, fromsnap, 0)) {
+ full_estimate = B_TRUE;
+ dsl_dataset_rele(fromsnap, FTAG);
+ }
+ } else {
+ /*
+ * from is not properly formatted as a snapshot or
+ * bookmark
+ */
+ dsl_dataset_rele(tosnap, FTAG);
+ dsl_pool_rele(dp, FTAG);
+ return (SET_ERROR(EINVAL));
+ }
+ }
+
+ if (full_estimate) {
+ dmu_send_outparams_t out = {0};
+ offset_t off = 0;
+ out.dso_outfunc = send_space_sum;
+ out.dso_arg = &space;
+ out.dso_dryrun = B_TRUE;
+ /*
+ * We have to release these holds so dmu_send can take them. It
+ * will do all the error checking we need.
+ */
+ dsl_dataset_rele(tosnap, FTAG);
+ dsl_pool_rele(dp, FTAG);
+ error = dmu_send(snapname, fromname, embedok, largeblockok,
+ compressok, rawok, savedok, resumeobj, resumeoff,
+ redactlist_book, fd, &off, &out);
+ } else {
+ error = dmu_send_estimate_fast(tosnap, fromsnap,
+ (from && strchr(fromname, '#') != NULL ? &zbm : NULL),
+ compressok || rawok, savedok, &space);
+ space -= resume_bytes;
+ if (fromsnap != NULL)
+ dsl_dataset_rele(fromsnap, FTAG);
+ dsl_dataset_rele(tosnap, FTAG);
+ dsl_pool_rele(dp, FTAG);
+ }
+
+ fnvlist_add_uint64(outnvl, "space", space);
+
+ return (error);
+}
+
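+/*
+ * As a rough sketch of how this ioctl is consumed, a userspace caller
+ * normally goes through the libzfs_core wrapper rather than building the
+ * innvl by hand (a sketch only, assuming the lzc_send_space() wrapper and
+ * LZC_SEND_FLAG_COMPRESS; error handling omitted):
+ *
+ *	uint64_t space = 0;
+ *	int err = lzc_send_space("tank/fs@tuesday", "tank/fs@monday",
+ *	    LZC_SEND_FLAG_COMPRESS, &space);
+ *	if (err == 0)
+ *		(void) printf("estimated stream size: %llu bytes\n",
+ *		    (u_longlong_t)space);
+ *
+ * The wrapper fills in keys such as "from" and "compressok" in innvl and
+ * reads back the "space" value that this handler adds to outnvl.
+ */
+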
+/*
+ * Sync the currently open TXG to disk for the specified pool.
+ * This is somewhat similar to 'zfs_sync()'.
+ * For cases that do not result in an error, this ioctl will wait for
+ * the currently open TXG to commit before returning to the caller.
+ *
+ * innvl: {
+ * "force" -> when true, force uberblock update even if there is no dirty data.
+ * In addition, this will cause the vdev configuration to be written
+ * out, including updating the zpool cache file. (boolean_t)
+ * }
+ *
+ * onvl is unused
+ */
+static const zfs_ioc_key_t zfs_keys_pool_sync[] = {
+ {"force", DATA_TYPE_BOOLEAN_VALUE, 0},
+};
+
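+/*
+ * A minimal sketch of the expected innvl, as a caller might build it
+ * (the lzc_sync() wrapper in libzfs_core is assumed here to pass the
+ * nvlist straight through to this ioctl):
+ *
+ *	nvlist_t *innvl = fnvlist_alloc();
+ *	fnvlist_add_boolean_value(innvl, "force", B_TRUE);
+ *	error = lzc_sync("tank", innvl, NULL);
+ *	fnvlist_free(innvl);
+ */
+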
+/* ARGSUSED */
+static int
+zfs_ioc_pool_sync(const char *pool, nvlist_t *innvl, nvlist_t *onvl)
+{
+ int err;
+ boolean_t rc, force = B_FALSE;
+ spa_t *spa;
+
+ if ((err = spa_open(pool, &spa, FTAG)) != 0)
+ return (err);
+
+ if (innvl) {
+ err = nvlist_lookup_boolean_value(innvl, "force", &rc);
+ if (err == 0)
+ force = rc;
+ }
+
+ if (force) {
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_WRITER);
+ vdev_config_dirty(spa->spa_root_vdev);
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+ }
+ txg_wait_synced(spa_get_dsl(spa), 0);
+
+ spa_close(spa, FTAG);
+
+ return (0);
+}
+
+/*
+ * Load a user's wrapping key into the kernel.
+ * innvl: {
+ * "hidden_args" -> { "wkeydata" -> value }
+ * raw uint8_t array of encryption wrapping key data (32 bytes)
+ * (optional) "noop" -> (value ignored)
+ * presence indicates the key should only be verified, not loaded
+ * }
+ */
+static const zfs_ioc_key_t zfs_keys_load_key[] = {
+ {"hidden_args", DATA_TYPE_NVLIST, 0},
+ {"noop", DATA_TYPE_BOOLEAN, ZK_OPTIONAL},
+};
+
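+/*
+ * Sketch of the innvl this handler expects, assuming a raw 32-byte
+ * wrapping key already derived in userspace (WRAPPING_KEY_LEN is assumed
+ * to be 32 here):
+ *
+ *	uint8_t wkeydata[WRAPPING_KEY_LEN];
+ *	nvlist_t *hidden_args = fnvlist_alloc();
+ *	nvlist_t *innvl = fnvlist_alloc();
+ *
+ *	fnvlist_add_uint8_array(hidden_args, "wkeydata", wkeydata,
+ *	    sizeof (wkeydata));
+ *	fnvlist_add_nvlist(innvl, ZPOOL_HIDDEN_ARGS, hidden_args);
+ *	fnvlist_add_boolean(innvl, "noop");	(optional: verify only)
+ */
+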
+/* ARGSUSED */
+static int
+zfs_ioc_load_key(const char *dsname, nvlist_t *innvl, nvlist_t *outnvl)
+{
+ int ret;
+ dsl_crypto_params_t *dcp = NULL;
+ nvlist_t *hidden_args;
+ boolean_t noop = nvlist_exists(innvl, "noop");
+
+ if (strchr(dsname, '@') != NULL || strchr(dsname, '%') != NULL) {
+ ret = SET_ERROR(EINVAL);
+ goto error;
+ }
+
+ hidden_args = fnvlist_lookup_nvlist(innvl, ZPOOL_HIDDEN_ARGS);
+
+ ret = dsl_crypto_params_create_nvlist(DCP_CMD_NONE, NULL,
+ hidden_args, &dcp);
+ if (ret != 0)
+ goto error;
+
+ ret = spa_keystore_load_wkey(dsname, dcp, noop);
+ if (ret != 0)
+ goto error;
+
+ dsl_crypto_params_free(dcp, noop);
+
+ return (0);
+
+error:
+ dsl_crypto_params_free(dcp, B_TRUE);
+ return (ret);
+}
+
+/*
+ * Unload a user's wrapping key from the kernel.
+ * Both innvl and outnvl are unused.
+ */
+static const zfs_ioc_key_t zfs_keys_unload_key[] = {
+ /* no nvl keys */
+};
+
+/* ARGSUSED */
+static int
+zfs_ioc_unload_key(const char *dsname, nvlist_t *innvl, nvlist_t *outnvl)
+{
+ int ret = 0;
+
+ if (strchr(dsname, '@') != NULL || strchr(dsname, '%') != NULL) {
+ ret = (SET_ERROR(EINVAL));
+ goto out;
+ }
+
+ ret = spa_keystore_unload_wkey(dsname);
+ if (ret != 0)
+ goto out;
+
+out:
+ return (ret);
+}
+
+/*
+ * Changes a user's wrapping key used to decrypt a dataset. The keyformat,
+ * keylocation, pbkdf2salt, and pbkdf2iters properties can also be specified
+ * here to change how the key is derived in userspace.
+ *
+ * innvl: {
+ * "hidden_args" (optional) -> { "wkeydata" -> value }
+ * raw uint8_t array of new encryption wrapping key data (32 bytes)
+ * "props" (optional) -> { prop -> value }
+ * }
+ *
+ * outnvl is unused
+ */
+static const zfs_ioc_key_t zfs_keys_change_key[] = {
+ {"crypt_cmd", DATA_TYPE_UINT64, ZK_OPTIONAL},
+ {"hidden_args", DATA_TYPE_NVLIST, ZK_OPTIONAL},
+ {"props", DATA_TYPE_NVLIST, ZK_OPTIONAL},
+};
+
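+/*
+ * For example, rewrapping with a new passphrase-derived key might arrive
+ * as an innvl like the following (a sketch; DCP_CMD_NEW_KEY is assumed
+ * to be what userspace passes for "zfs change-key"):
+ *
+ *	fnvlist_add_uint64(innvl, "crypt_cmd", DCP_CMD_NEW_KEY);
+ *	fnvlist_add_nvlist(innvl, "props", props);
+ *	fnvlist_add_nvlist(innvl, ZPOOL_HIDDEN_ARGS, hidden_args);
+ *
+ * where "props" carries keyformat/keylocation/pbkdf2iters and
+ * "hidden_args" carries the new "wkeydata".
+ */
+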
+/* ARGSUSED */
+static int
+zfs_ioc_change_key(const char *dsname, nvlist_t *innvl, nvlist_t *outnvl)
+{
+ int ret;
+ uint64_t cmd = DCP_CMD_NONE;
+ dsl_crypto_params_t *dcp = NULL;
+ nvlist_t *args = NULL, *hidden_args = NULL;
+
+ if (strchr(dsname, '@') != NULL || strchr(dsname, '%') != NULL) {
+ ret = (SET_ERROR(EINVAL));
+ goto error;
+ }
+
+ (void) nvlist_lookup_uint64(innvl, "crypt_cmd", &cmd);
+ (void) nvlist_lookup_nvlist(innvl, "props", &args);
+ (void) nvlist_lookup_nvlist(innvl, ZPOOL_HIDDEN_ARGS, &hidden_args);
+
+ ret = dsl_crypto_params_create_nvlist(cmd, args, hidden_args, &dcp);
+ if (ret != 0)
+ goto error;
+
+ ret = spa_keystore_change_key(dsname, dcp);
+ if (ret != 0)
+ goto error;
+
+ dsl_crypto_params_free(dcp, B_FALSE);
+
+ return (0);
+
+error:
+ dsl_crypto_params_free(dcp, B_TRUE);
+ return (ret);
+}
+
+static zfs_ioc_vec_t zfs_ioc_vec[ZFS_IOC_LAST - ZFS_IOC_FIRST];
+
+static void
+zfs_ioctl_register_legacy(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func,
+ zfs_secpolicy_func_t *secpolicy, zfs_ioc_namecheck_t namecheck,
+ boolean_t log_history, zfs_ioc_poolcheck_t pool_check)
+{
+ zfs_ioc_vec_t *vec = &zfs_ioc_vec[ioc - ZFS_IOC_FIRST];
+
+ ASSERT3U(ioc, >=, ZFS_IOC_FIRST);
+ ASSERT3U(ioc, <, ZFS_IOC_LAST);
+ ASSERT3P(vec->zvec_legacy_func, ==, NULL);
+ ASSERT3P(vec->zvec_func, ==, NULL);
+
+ vec->zvec_legacy_func = func;
+ vec->zvec_secpolicy = secpolicy;
+ vec->zvec_namecheck = namecheck;
+ vec->zvec_allow_log = log_history;
+ vec->zvec_pool_check = pool_check;
+}
+
+/*
+ * See the block comment at the beginning of this file for details on
+ * each argument to this function.
+ */
+void
+zfs_ioctl_register(const char *name, zfs_ioc_t ioc, zfs_ioc_func_t *func,
+ zfs_secpolicy_func_t *secpolicy, zfs_ioc_namecheck_t namecheck,
+ zfs_ioc_poolcheck_t pool_check, boolean_t smush_outnvlist,
+ boolean_t allow_log, const zfs_ioc_key_t *nvl_keys, size_t num_keys)
+{
+ zfs_ioc_vec_t *vec = &zfs_ioc_vec[ioc - ZFS_IOC_FIRST];
+
+ ASSERT3U(ioc, >=, ZFS_IOC_FIRST);
+ ASSERT3U(ioc, <, ZFS_IOC_LAST);
+ ASSERT3P(vec->zvec_legacy_func, ==, NULL);
+ ASSERT3P(vec->zvec_func, ==, NULL);
+
+ /* if we are logging, the name must be valid */
+ ASSERT(!allow_log || namecheck != NO_NAME);
+
+ vec->zvec_name = name;
+ vec->zvec_func = func;
+ vec->zvec_secpolicy = secpolicy;
+ vec->zvec_namecheck = namecheck;
+ vec->zvec_pool_check = pool_check;
+ vec->zvec_smush_outnvlist = smush_outnvlist;
+ vec->zvec_allow_log = allow_log;
+ vec->zvec_nvl_keys = nvl_keys;
+ vec->zvec_nvl_key_count = num_keys;
+}
+
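+/*
+ * As an illustration, a hypothetical new ioctl would be wired up from
+ * zfs_ioctl_init() with a key table and a registration call like the
+ * following ("frobnicate", ZFS_IOC_FROBNICATE and zfs_ioc_frobnicate are
+ * made-up names, not part of the real interface):
+ *
+ *	static const zfs_ioc_key_t zfs_keys_frobnicate[] = {
+ *		{"level",	DATA_TYPE_UINT64,	0},
+ *		{"dryrun",	DATA_TYPE_BOOLEAN,	ZK_OPTIONAL},
+ *	};
+ *
+ *	zfs_ioctl_register("frobnicate", ZFS_IOC_FROBNICATE,
+ *	    zfs_ioc_frobnicate, zfs_secpolicy_config, DATASET_NAME,
+ *	    POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE,
+ *	    zfs_keys_frobnicate, ARRAY_SIZE(zfs_keys_frobnicate));
+ */
+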
+static void
+zfs_ioctl_register_pool(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func,
+ zfs_secpolicy_func_t *secpolicy, boolean_t log_history,
+ zfs_ioc_poolcheck_t pool_check)
+{
+ zfs_ioctl_register_legacy(ioc, func, secpolicy,
+ POOL_NAME, log_history, pool_check);
+}
+
+void
+zfs_ioctl_register_dataset_nolog(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func,
+ zfs_secpolicy_func_t *secpolicy, zfs_ioc_poolcheck_t pool_check)
+{
+ zfs_ioctl_register_legacy(ioc, func, secpolicy,
+ DATASET_NAME, B_FALSE, pool_check);
+}
+
+static void
+zfs_ioctl_register_pool_modify(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func)
+{
+ zfs_ioctl_register_legacy(ioc, func, zfs_secpolicy_config,
+ POOL_NAME, B_TRUE, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY);
+}
+
+static void
+zfs_ioctl_register_pool_meta(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func,
+ zfs_secpolicy_func_t *secpolicy)
+{
+ zfs_ioctl_register_legacy(ioc, func, secpolicy,
+ NO_NAME, B_FALSE, POOL_CHECK_NONE);
+}
+
+static void
+zfs_ioctl_register_dataset_read_secpolicy(zfs_ioc_t ioc,
+ zfs_ioc_legacy_func_t *func, zfs_secpolicy_func_t *secpolicy)
+{
+ zfs_ioctl_register_legacy(ioc, func, secpolicy,
+ DATASET_NAME, B_FALSE, POOL_CHECK_SUSPENDED);
+}
+
+static void
+zfs_ioctl_register_dataset_read(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func)
+{
+ zfs_ioctl_register_dataset_read_secpolicy(ioc, func,
+ zfs_secpolicy_read);
+}
+
+static void
+zfs_ioctl_register_dataset_modify(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func,
+ zfs_secpolicy_func_t *secpolicy)
+{
+ zfs_ioctl_register_legacy(ioc, func, secpolicy,
+ DATASET_NAME, B_TRUE, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY);
+}
+
+static void
+zfs_ioctl_init(void)
+{
+ zfs_ioctl_register("snapshot", ZFS_IOC_SNAPSHOT,
+ zfs_ioc_snapshot, zfs_secpolicy_snapshot, POOL_NAME,
+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE,
+ zfs_keys_snapshot, ARRAY_SIZE(zfs_keys_snapshot));
+
+ zfs_ioctl_register("log_history", ZFS_IOC_LOG_HISTORY,
+ zfs_ioc_log_history, zfs_secpolicy_log_history, NO_NAME,
+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_FALSE,
+ zfs_keys_log_history, ARRAY_SIZE(zfs_keys_log_history));
+
+ zfs_ioctl_register("space_snaps", ZFS_IOC_SPACE_SNAPS,
+ zfs_ioc_space_snaps, zfs_secpolicy_read, DATASET_NAME,
+ POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE,
+ zfs_keys_space_snaps, ARRAY_SIZE(zfs_keys_space_snaps));
+
+ zfs_ioctl_register("send", ZFS_IOC_SEND_NEW,
+ zfs_ioc_send_new, zfs_secpolicy_send_new, DATASET_NAME,
+ POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE,
+ zfs_keys_send_new, ARRAY_SIZE(zfs_keys_send_new));
+
+ zfs_ioctl_register("send_space", ZFS_IOC_SEND_SPACE,
+ zfs_ioc_send_space, zfs_secpolicy_read, DATASET_NAME,
+ POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE,
+ zfs_keys_send_space, ARRAY_SIZE(zfs_keys_send_space));
+
+ zfs_ioctl_register("create", ZFS_IOC_CREATE,
+ zfs_ioc_create, zfs_secpolicy_create_clone, DATASET_NAME,
+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE,
+ zfs_keys_create, ARRAY_SIZE(zfs_keys_create));
+
+ zfs_ioctl_register("clone", ZFS_IOC_CLONE,
+ zfs_ioc_clone, zfs_secpolicy_create_clone, DATASET_NAME,
+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE,
+ zfs_keys_clone, ARRAY_SIZE(zfs_keys_clone));
+
+ zfs_ioctl_register("remap", ZFS_IOC_REMAP,
+ zfs_ioc_remap, zfs_secpolicy_none, DATASET_NAME,
+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_TRUE,
+ zfs_keys_remap, ARRAY_SIZE(zfs_keys_remap));
+
+ zfs_ioctl_register("destroy_snaps", ZFS_IOC_DESTROY_SNAPS,
+ zfs_ioc_destroy_snaps, zfs_secpolicy_destroy_snaps, POOL_NAME,
+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE,
+ zfs_keys_destroy_snaps, ARRAY_SIZE(zfs_keys_destroy_snaps));
+
+ zfs_ioctl_register("hold", ZFS_IOC_HOLD,
+ zfs_ioc_hold, zfs_secpolicy_hold, POOL_NAME,
+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE,
+ zfs_keys_hold, ARRAY_SIZE(zfs_keys_hold));
+ zfs_ioctl_register("release", ZFS_IOC_RELEASE,
+ zfs_ioc_release, zfs_secpolicy_release, POOL_NAME,
+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE,
+ zfs_keys_release, ARRAY_SIZE(zfs_keys_release));
+
+ zfs_ioctl_register("get_holds", ZFS_IOC_GET_HOLDS,
+ zfs_ioc_get_holds, zfs_secpolicy_read, DATASET_NAME,
+ POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE,
+ zfs_keys_get_holds, ARRAY_SIZE(zfs_keys_get_holds));
+
+ zfs_ioctl_register("rollback", ZFS_IOC_ROLLBACK,
+ zfs_ioc_rollback, zfs_secpolicy_rollback, DATASET_NAME,
+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_TRUE,
+ zfs_keys_rollback, ARRAY_SIZE(zfs_keys_rollback));
+
+ zfs_ioctl_register("bookmark", ZFS_IOC_BOOKMARK,
+ zfs_ioc_bookmark, zfs_secpolicy_bookmark, POOL_NAME,
+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE,
+ zfs_keys_bookmark, ARRAY_SIZE(zfs_keys_bookmark));
+
+ zfs_ioctl_register("get_bookmarks", ZFS_IOC_GET_BOOKMARKS,
+ zfs_ioc_get_bookmarks, zfs_secpolicy_read, DATASET_NAME,
+ POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE,
+ zfs_keys_get_bookmarks, ARRAY_SIZE(zfs_keys_get_bookmarks));
+
+ zfs_ioctl_register("get_bookmark_props", ZFS_IOC_GET_BOOKMARK_PROPS,
+ zfs_ioc_get_bookmark_props, zfs_secpolicy_read, ENTITY_NAME,
+ POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE, zfs_keys_get_bookmark_props,
+ ARRAY_SIZE(zfs_keys_get_bookmark_props));
+
+ zfs_ioctl_register("destroy_bookmarks", ZFS_IOC_DESTROY_BOOKMARKS,
+ zfs_ioc_destroy_bookmarks, zfs_secpolicy_destroy_bookmarks,
+ POOL_NAME,
+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE,
+ zfs_keys_destroy_bookmarks,
+ ARRAY_SIZE(zfs_keys_destroy_bookmarks));
+
+ zfs_ioctl_register("receive", ZFS_IOC_RECV_NEW,
+ zfs_ioc_recv_new, zfs_secpolicy_recv_new, DATASET_NAME,
+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE,
+ zfs_keys_recv_new, ARRAY_SIZE(zfs_keys_recv_new));
+ zfs_ioctl_register("load-key", ZFS_IOC_LOAD_KEY,
+ zfs_ioc_load_key, zfs_secpolicy_load_key,
+ DATASET_NAME, POOL_CHECK_SUSPENDED, B_TRUE, B_TRUE,
+ zfs_keys_load_key, ARRAY_SIZE(zfs_keys_load_key));
+ zfs_ioctl_register("unload-key", ZFS_IOC_UNLOAD_KEY,
+ zfs_ioc_unload_key, zfs_secpolicy_load_key,
+ DATASET_NAME, POOL_CHECK_SUSPENDED, B_TRUE, B_TRUE,
+ zfs_keys_unload_key, ARRAY_SIZE(zfs_keys_unload_key));
+ zfs_ioctl_register("change-key", ZFS_IOC_CHANGE_KEY,
+ zfs_ioc_change_key, zfs_secpolicy_change_key,
+ DATASET_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY,
+ B_TRUE, B_TRUE, zfs_keys_change_key,
+ ARRAY_SIZE(zfs_keys_change_key));
+
+ zfs_ioctl_register("sync", ZFS_IOC_POOL_SYNC,
+ zfs_ioc_pool_sync, zfs_secpolicy_none, POOL_NAME,
+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_FALSE,
+ zfs_keys_pool_sync, ARRAY_SIZE(zfs_keys_pool_sync));
+ zfs_ioctl_register("reopen", ZFS_IOC_POOL_REOPEN, zfs_ioc_pool_reopen,
+ zfs_secpolicy_config, POOL_NAME, POOL_CHECK_SUSPENDED, B_TRUE,
+ B_TRUE, zfs_keys_pool_reopen, ARRAY_SIZE(zfs_keys_pool_reopen));
+
+ zfs_ioctl_register("channel_program", ZFS_IOC_CHANNEL_PROGRAM,
+ zfs_ioc_channel_program, zfs_secpolicy_config,
+ POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE,
+ B_TRUE, zfs_keys_channel_program,
+ ARRAY_SIZE(zfs_keys_channel_program));
+
+ zfs_ioctl_register("redact", ZFS_IOC_REDACT,
+ zfs_ioc_redact, zfs_secpolicy_config, DATASET_NAME,
+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE,
+ zfs_keys_redact, ARRAY_SIZE(zfs_keys_redact));
+
+ zfs_ioctl_register("zpool_checkpoint", ZFS_IOC_POOL_CHECKPOINT,
+ zfs_ioc_pool_checkpoint, zfs_secpolicy_config, POOL_NAME,
+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE,
+ zfs_keys_pool_checkpoint, ARRAY_SIZE(zfs_keys_pool_checkpoint));
+
+ zfs_ioctl_register("zpool_discard_checkpoint",
+ ZFS_IOC_POOL_DISCARD_CHECKPOINT, zfs_ioc_pool_discard_checkpoint,
+ zfs_secpolicy_config, POOL_NAME,
+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE,
+ zfs_keys_pool_discard_checkpoint,
+ ARRAY_SIZE(zfs_keys_pool_discard_checkpoint));
+
+ zfs_ioctl_register("initialize", ZFS_IOC_POOL_INITIALIZE,
+ zfs_ioc_pool_initialize, zfs_secpolicy_config, POOL_NAME,
+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE,
+ zfs_keys_pool_initialize, ARRAY_SIZE(zfs_keys_pool_initialize));
+
+ zfs_ioctl_register("trim", ZFS_IOC_POOL_TRIM,
+ zfs_ioc_pool_trim, zfs_secpolicy_config, POOL_NAME,
+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE,
+ zfs_keys_pool_trim, ARRAY_SIZE(zfs_keys_pool_trim));
+
+ zfs_ioctl_register("wait", ZFS_IOC_WAIT,
+ zfs_ioc_wait, zfs_secpolicy_none, POOL_NAME,
+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_FALSE,
+ zfs_keys_pool_wait, ARRAY_SIZE(zfs_keys_pool_wait));
+
+ zfs_ioctl_register("wait_fs", ZFS_IOC_WAIT_FS,
+ zfs_ioc_wait_fs, zfs_secpolicy_none, DATASET_NAME,
+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_FALSE,
+ zfs_keys_fs_wait, ARRAY_SIZE(zfs_keys_fs_wait));
+
+ zfs_ioctl_register("set_bootenv", ZFS_IOC_SET_BOOTENV,
+ zfs_ioc_set_bootenv, zfs_secpolicy_config, POOL_NAME,
+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_TRUE,
+ zfs_keys_set_bootenv, ARRAY_SIZE(zfs_keys_set_bootenv));
+
+ zfs_ioctl_register("get_bootenv", ZFS_IOC_GET_BOOTENV,
+ zfs_ioc_get_bootenv, zfs_secpolicy_none, POOL_NAME,
+ POOL_CHECK_SUSPENDED, B_FALSE, B_TRUE,
+ zfs_keys_get_bootenv, ARRAY_SIZE(zfs_keys_get_bootenv));
+
+ /* IOCTLS that use the legacy function signature */
+
+ zfs_ioctl_register_legacy(ZFS_IOC_POOL_FREEZE, zfs_ioc_pool_freeze,
+ zfs_secpolicy_config, NO_NAME, B_FALSE, POOL_CHECK_READONLY);
+
+ zfs_ioctl_register_pool(ZFS_IOC_POOL_CREATE, zfs_ioc_pool_create,
+ zfs_secpolicy_config, B_TRUE, POOL_CHECK_NONE);
+ zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_SCAN,
+ zfs_ioc_pool_scan);
+ zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_UPGRADE,
+ zfs_ioc_pool_upgrade);
+ zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_ADD,
+ zfs_ioc_vdev_add);
+ zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_REMOVE,
+ zfs_ioc_vdev_remove);
+ zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SET_STATE,
+ zfs_ioc_vdev_set_state);
+ zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_ATTACH,
+ zfs_ioc_vdev_attach);
+ zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_DETACH,
+ zfs_ioc_vdev_detach);
+ zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SETPATH,
+ zfs_ioc_vdev_setpath);
+ zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SETFRU,
+ zfs_ioc_vdev_setfru);
+ zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_SET_PROPS,
+ zfs_ioc_pool_set_props);
+ zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SPLIT,
+ zfs_ioc_vdev_split);
+ zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_REGUID,
+ zfs_ioc_pool_reguid);
+
+ zfs_ioctl_register_pool_meta(ZFS_IOC_POOL_CONFIGS,
+ zfs_ioc_pool_configs, zfs_secpolicy_none);
+ zfs_ioctl_register_pool_meta(ZFS_IOC_POOL_TRYIMPORT,
+ zfs_ioc_pool_tryimport, zfs_secpolicy_config);
+ zfs_ioctl_register_pool_meta(ZFS_IOC_INJECT_FAULT,
+ zfs_ioc_inject_fault, zfs_secpolicy_inject);
+ zfs_ioctl_register_pool_meta(ZFS_IOC_CLEAR_FAULT,
+ zfs_ioc_clear_fault, zfs_secpolicy_inject);
+ zfs_ioctl_register_pool_meta(ZFS_IOC_INJECT_LIST_NEXT,
+ zfs_ioc_inject_list_next, zfs_secpolicy_inject);
+
+ /*
+ * Pool destroy and export don't log history as part of
+ * zfsdev_ioctl; instead, zfs_ioc_pool_export does the logging
+ * of those commands itself.
+ */
+ zfs_ioctl_register_pool(ZFS_IOC_POOL_DESTROY, zfs_ioc_pool_destroy,
+ zfs_secpolicy_config, B_FALSE, POOL_CHECK_SUSPENDED);
+ zfs_ioctl_register_pool(ZFS_IOC_POOL_EXPORT, zfs_ioc_pool_export,
+ zfs_secpolicy_config, B_FALSE, POOL_CHECK_SUSPENDED);
+
+ zfs_ioctl_register_pool(ZFS_IOC_POOL_STATS, zfs_ioc_pool_stats,
+ zfs_secpolicy_read, B_FALSE, POOL_CHECK_NONE);
+ zfs_ioctl_register_pool(ZFS_IOC_POOL_GET_PROPS, zfs_ioc_pool_get_props,
+ zfs_secpolicy_read, B_FALSE, POOL_CHECK_NONE);
+
+ zfs_ioctl_register_pool(ZFS_IOC_ERROR_LOG, zfs_ioc_error_log,
+ zfs_secpolicy_inject, B_FALSE, POOL_CHECK_SUSPENDED);
+ zfs_ioctl_register_pool(ZFS_IOC_DSOBJ_TO_DSNAME,
+ zfs_ioc_dsobj_to_dsname,
+ zfs_secpolicy_diff, B_FALSE, POOL_CHECK_SUSPENDED);
+ zfs_ioctl_register_pool(ZFS_IOC_POOL_GET_HISTORY,
+ zfs_ioc_pool_get_history,
+ zfs_secpolicy_config, B_FALSE, POOL_CHECK_SUSPENDED);
+
+ zfs_ioctl_register_pool(ZFS_IOC_POOL_IMPORT, zfs_ioc_pool_import,
+ zfs_secpolicy_config, B_TRUE, POOL_CHECK_NONE);
+
+ zfs_ioctl_register_pool(ZFS_IOC_CLEAR, zfs_ioc_clear,
+ zfs_secpolicy_config, B_TRUE, POOL_CHECK_READONLY);
+
+ zfs_ioctl_register_dataset_read(ZFS_IOC_SPACE_WRITTEN,
+ zfs_ioc_space_written);
+ zfs_ioctl_register_dataset_read(ZFS_IOC_OBJSET_RECVD_PROPS,
+ zfs_ioc_objset_recvd_props);
+ zfs_ioctl_register_dataset_read(ZFS_IOC_NEXT_OBJ,
+ zfs_ioc_next_obj);
+ zfs_ioctl_register_dataset_read(ZFS_IOC_GET_FSACL,
+ zfs_ioc_get_fsacl);
+ zfs_ioctl_register_dataset_read(ZFS_IOC_OBJSET_STATS,
+ zfs_ioc_objset_stats);
+ zfs_ioctl_register_dataset_read(ZFS_IOC_OBJSET_ZPLPROPS,
+ zfs_ioc_objset_zplprops);
+ zfs_ioctl_register_dataset_read(ZFS_IOC_DATASET_LIST_NEXT,
+ zfs_ioc_dataset_list_next);
+ zfs_ioctl_register_dataset_read(ZFS_IOC_SNAPSHOT_LIST_NEXT,
+ zfs_ioc_snapshot_list_next);
+ zfs_ioctl_register_dataset_read(ZFS_IOC_SEND_PROGRESS,
+ zfs_ioc_send_progress);
+
+ zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_DIFF,
+ zfs_ioc_diff, zfs_secpolicy_diff);
+ zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_OBJ_TO_STATS,
+ zfs_ioc_obj_to_stats, zfs_secpolicy_diff);
+ zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_OBJ_TO_PATH,
+ zfs_ioc_obj_to_path, zfs_secpolicy_diff);
+ zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_USERSPACE_ONE,
+ zfs_ioc_userspace_one, zfs_secpolicy_userspace_one);
+ zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_USERSPACE_MANY,
+ zfs_ioc_userspace_many, zfs_secpolicy_userspace_many);
+ zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_SEND,
+ zfs_ioc_send, zfs_secpolicy_send);
+
+ zfs_ioctl_register_dataset_modify(ZFS_IOC_SET_PROP, zfs_ioc_set_prop,
+ zfs_secpolicy_none);
+ zfs_ioctl_register_dataset_modify(ZFS_IOC_DESTROY, zfs_ioc_destroy,
+ zfs_secpolicy_destroy);
+ zfs_ioctl_register_dataset_modify(ZFS_IOC_RENAME, zfs_ioc_rename,
+ zfs_secpolicy_rename);
+ zfs_ioctl_register_dataset_modify(ZFS_IOC_RECV, zfs_ioc_recv,
+ zfs_secpolicy_recv);
+ zfs_ioctl_register_dataset_modify(ZFS_IOC_PROMOTE, zfs_ioc_promote,
+ zfs_secpolicy_promote);
+ zfs_ioctl_register_dataset_modify(ZFS_IOC_INHERIT_PROP,
+ zfs_ioc_inherit_prop, zfs_secpolicy_inherit_prop);
+ zfs_ioctl_register_dataset_modify(ZFS_IOC_SET_FSACL, zfs_ioc_set_fsacl,
+ zfs_secpolicy_set_fsacl);
+
+ zfs_ioctl_register_dataset_nolog(ZFS_IOC_SHARE, zfs_ioc_share,
+ zfs_secpolicy_share, POOL_CHECK_NONE);
+ zfs_ioctl_register_dataset_nolog(ZFS_IOC_SMB_ACL, zfs_ioc_smb_acl,
+ zfs_secpolicy_smb_acl, POOL_CHECK_NONE);
+ zfs_ioctl_register_dataset_nolog(ZFS_IOC_USERSPACE_UPGRADE,
+ zfs_ioc_userspace_upgrade, zfs_secpolicy_userspace_upgrade,
+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY);
+ zfs_ioctl_register_dataset_nolog(ZFS_IOC_TMP_SNAPSHOT,
+ zfs_ioc_tmp_snapshot, zfs_secpolicy_tmp_snapshot,
+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY);
+
+ zfs_ioctl_register_legacy(ZFS_IOC_EVENTS_NEXT, zfs_ioc_events_next,
+ zfs_secpolicy_config, NO_NAME, B_FALSE, POOL_CHECK_NONE);
+ zfs_ioctl_register_legacy(ZFS_IOC_EVENTS_CLEAR, zfs_ioc_events_clear,
+ zfs_secpolicy_config, NO_NAME, B_FALSE, POOL_CHECK_NONE);
+ zfs_ioctl_register_legacy(ZFS_IOC_EVENTS_SEEK, zfs_ioc_events_seek,
+ zfs_secpolicy_config, NO_NAME, B_FALSE, POOL_CHECK_NONE);
+
+ zfs_ioctl_init_os();
+}
+
+/*
+ * Verify that for non-legacy ioctls the input nvlist
+ * pairs match against the expected input.
+ *
+ * Possible errors are:
+ * ZFS_ERR_IOC_ARG_UNAVAIL An unrecognized nvpair was encountered
+ * ZFS_ERR_IOC_ARG_REQUIRED A required nvpair is missing
+ * ZFS_ERR_IOC_ARG_BADTYPE Invalid type for nvpair
+ */
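+/*
+ * For example, with zfs_keys_pool_sync above, an innvl that carries
+ * "force" as a string fails with ZFS_ERR_IOC_ARG_BADTYPE, an innvl that
+ * omits "force" fails with ZFS_ERR_IOC_ARG_REQUIRED (it is not
+ * ZK_OPTIONAL), and an innvl that adds an unrecognized pair such as
+ * "bogus" fails with ZFS_ERR_IOC_ARG_UNAVAIL.
+ */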
+static int
+zfs_check_input_nvpairs(nvlist_t *innvl, const zfs_ioc_vec_t *vec)
+{
+ const zfs_ioc_key_t *nvl_keys = vec->zvec_nvl_keys;
+ boolean_t required_keys_found = B_FALSE;
+
+ /*
+ * examine each input pair
+ */
+ for (nvpair_t *pair = nvlist_next_nvpair(innvl, NULL);
+ pair != NULL; pair = nvlist_next_nvpair(innvl, pair)) {
+ char *name = nvpair_name(pair);
+ data_type_t type = nvpair_type(pair);
+ boolean_t identified = B_FALSE;
+
+ /*
+ * check pair against the documented names and type
+ */
+ for (int k = 0; k < vec->zvec_nvl_key_count; k++) {
+ /* if not a wild card name, check for an exact match */
+ if ((nvl_keys[k].zkey_flags & ZK_WILDCARDLIST) == 0 &&
+ strcmp(nvl_keys[k].zkey_name, name) != 0)
+ continue;
+
+ identified = B_TRUE;
+
+ if (nvl_keys[k].zkey_type != DATA_TYPE_ANY &&
+ nvl_keys[k].zkey_type != type) {
+ return (SET_ERROR(ZFS_ERR_IOC_ARG_BADTYPE));
+ }
+
+ if (nvl_keys[k].zkey_flags & ZK_OPTIONAL)
+ continue;
+
+ required_keys_found = B_TRUE;
+ break;
+ }
+
+ /* allow an 'optional' key, everything else is invalid */
+ if (!identified &&
+ (strcmp(name, "optional") != 0 ||
+ type != DATA_TYPE_NVLIST)) {
+ return (SET_ERROR(ZFS_ERR_IOC_ARG_UNAVAIL));
+ }
+ }
+
+ /* verify that all required keys were found */
+ for (int k = 0; k < vec->zvec_nvl_key_count; k++) {
+ if (nvl_keys[k].zkey_flags & ZK_OPTIONAL)
+ continue;
+
+ if (nvl_keys[k].zkey_flags & ZK_WILDCARDLIST) {
+ /* at least one non-optional key is expected here */
+ if (!required_keys_found)
+ return (SET_ERROR(ZFS_ERR_IOC_ARG_REQUIRED));
+ continue;
+ }
+
+ if (!nvlist_exists(innvl, nvl_keys[k].zkey_name))
+ return (SET_ERROR(ZFS_ERR_IOC_ARG_REQUIRED));
+ }
+
+ return (0);
+}
+
+static int
+pool_status_check(const char *name, zfs_ioc_namecheck_t type,
+ zfs_ioc_poolcheck_t check)
+{
+ spa_t *spa;
+ int error;
+
+ ASSERT(type == POOL_NAME || type == DATASET_NAME ||
+ type == ENTITY_NAME);
+
+ if (check & POOL_CHECK_NONE)
+ return (0);
+
+ error = spa_open(name, &spa, FTAG);
+ if (error == 0) {
+ if ((check & POOL_CHECK_SUSPENDED) && spa_suspended(spa))
+ error = SET_ERROR(EAGAIN);
+ else if ((check & POOL_CHECK_READONLY) && !spa_writeable(spa))
+ error = SET_ERROR(EROFS);
+ spa_close(spa, FTAG);
+ }
+ return (error);
+}
+
+int
+zfsdev_getminor(int fd, minor_t *minorp)
+{
+ zfsdev_state_t *zs, *fpd;
+ zfs_file_t *fp;
+ int rc;
+
+ ASSERT(!MUTEX_HELD(&zfsdev_state_lock));
+
+ if ((rc = zfs_file_get(fd, &fp)))
+ return (rc);
+
+ fpd = zfs_file_private(fp);
+ if (fpd == NULL)
+ return (SET_ERROR(EBADF));
+
+ mutex_enter(&zfsdev_state_lock);
+
+ for (zs = zfsdev_state_list; zs != NULL; zs = zs->zs_next) {
+
+ if (zs->zs_minor == -1)
+ continue;
+
+ if (fpd == zs) {
+ *minorp = fpd->zs_minor;
+ mutex_exit(&zfsdev_state_lock);
+ return (0);
+ }
+ }
+
+ mutex_exit(&zfsdev_state_lock);
+
+ return (SET_ERROR(EBADF));
+}
+
+static void *
+zfsdev_get_state_impl(minor_t minor, enum zfsdev_state_type which)
+{
+ zfsdev_state_t *zs;
+
+ for (zs = zfsdev_state_list; zs != NULL; zs = zs->zs_next) {
+ if (zs->zs_minor == minor) {
+ smp_rmb();
+ switch (which) {
+ case ZST_ONEXIT:
+ return (zs->zs_onexit);
+ case ZST_ZEVENT:
+ return (zs->zs_zevent);
+ case ZST_ALL:
+ return (zs);
+ }
+ }
+ }
+
+ return (NULL);
+}
+
+void *
+zfsdev_get_state(minor_t minor, enum zfsdev_state_type which)
+{
+ void *ptr;
+
+ ptr = zfsdev_get_state_impl(minor, which);
+
+ return (ptr);
+}
+
+/*
+ * Find a free minor number. The zfsdev_state_list is expected to
+ * be short since it is only a list of currently open file handles.
+ */
+minor_t
+zfsdev_minor_alloc(void)
+{
+ static minor_t last_minor = 0;
+ minor_t m;
+
+ ASSERT(MUTEX_HELD(&zfsdev_state_lock));
+
+ for (m = last_minor + 1; m != last_minor; m++) {
+ if (m > ZFSDEV_MAX_MINOR)
+ m = 1;
+ if (zfsdev_get_state_impl(m, ZST_ALL) == NULL) {
+ last_minor = m;
+ return (m);
+ }
+ }
+
+ return (0);
+}
+
+long
+zfsdev_ioctl_common(uint_t vecnum, zfs_cmd_t *zc, int flag)
+{
+ int error, cmd;
+ const zfs_ioc_vec_t *vec;
+ char *saved_poolname = NULL;
+ uint64_t max_nvlist_src_size;
+ size_t saved_poolname_len = 0;
+ nvlist_t *innvl = NULL;
+ fstrans_cookie_t cookie;
+ hrtime_t start_time = gethrtime();
+
+ cmd = vecnum;
+ error = 0;
+ if (vecnum >= sizeof (zfs_ioc_vec) / sizeof (zfs_ioc_vec[0]))
+ return (SET_ERROR(ZFS_ERR_IOC_CMD_UNAVAIL));
+
+ vec = &zfs_ioc_vec[vecnum];
+
+ /*
+ * The registered ioctl list may be sparse; verify that either
+ * a normal or a legacy handler is registered.
+ */
+ if (vec->zvec_func == NULL && vec->zvec_legacy_func == NULL)
+ return (SET_ERROR(ZFS_ERR_IOC_CMD_UNAVAIL));
+
+ zc->zc_iflags = flag & FKIOCTL;
+ max_nvlist_src_size = zfs_max_nvlist_src_size_os();
+ if (zc->zc_nvlist_src_size > max_nvlist_src_size) {
+ /*
+ * Make sure the user doesn't pass in an insane value for
+ * zc_nvlist_src_size. We have to check, since we will end
+ * up allocating that much memory inside of get_nvlist(). This
+ * prevents a nefarious user from allocating tons of kernel
+ * memory.
+ *
+ * Also, we return EINVAL instead of ENOMEM here. The reason
+ * being that returning ENOMEM from an ioctl() has a special
+ * connotation; that the user's size value is too small and
+ * needs to be expanded to hold the nvlist. See
+ * zcmd_expand_dst_nvlist() for details.
+ */
+ error = SET_ERROR(EINVAL); /* User's size too big */
+
+ } else if (zc->zc_nvlist_src_size != 0) {
+ error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size,
+ zc->zc_iflags, &innvl);
+ if (error != 0)
+ goto out;
+ }
+
+ /*
+ * Ensure that all pool/dataset names are valid before we pass down to
+ * the lower layers.
+ */
+ zc->zc_name[sizeof (zc->zc_name) - 1] = '\0';
+ switch (vec->zvec_namecheck) {
+ case POOL_NAME:
+ if (pool_namecheck(zc->zc_name, NULL, NULL) != 0)
+ error = SET_ERROR(EINVAL);
+ else
+ error = pool_status_check(zc->zc_name,
+ vec->zvec_namecheck, vec->zvec_pool_check);
+ break;
+
+ case DATASET_NAME:
+ if (dataset_namecheck(zc->zc_name, NULL, NULL) != 0)
+ error = SET_ERROR(EINVAL);
+ else
+ error = pool_status_check(zc->zc_name,
+ vec->zvec_namecheck, vec->zvec_pool_check);
+ break;
+
+ case ENTITY_NAME:
+ if (entity_namecheck(zc->zc_name, NULL, NULL) != 0) {
+ error = SET_ERROR(EINVAL);
+ } else {
+ error = pool_status_check(zc->zc_name,
+ vec->zvec_namecheck, vec->zvec_pool_check);
+ }
+ break;
+
+ case NO_NAME:
+ break;
+ }
+ /*
+ * Ensure that all input pairs are valid before we pass them down
+ * to the lower layers.
+ *
+ * The vectored functions can use fnvlist_lookup_{type} for any
+ * required pairs since zfs_check_input_nvpairs() confirmed that
+ * they exist and are of the correct type.
+ */
+ if (error == 0 && vec->zvec_func != NULL) {
+ error = zfs_check_input_nvpairs(innvl, vec);
+ if (error != 0)
+ goto out;
+ }
+
+ if (error == 0) {
+ cookie = spl_fstrans_mark();
+ error = vec->zvec_secpolicy(zc, innvl, CRED());
+ spl_fstrans_unmark(cookie);
+ }
+
+ if (error != 0)
+ goto out;
+
+ /* legacy ioctls can modify zc_name */
+ /*
+ * Can't use kmem_strdup() as we might truncate the string and
+ * kmem_strfree() would then free with incorrect size.
+ */
+ saved_poolname_len = strlen(zc->zc_name) + 1;
+ saved_poolname = kmem_alloc(saved_poolname_len, KM_SLEEP);
+
+ strlcpy(saved_poolname, zc->zc_name, saved_poolname_len);
+ saved_poolname[strcspn(saved_poolname, "/@#")] = '\0';
+
+ if (vec->zvec_func != NULL) {
+ nvlist_t *outnvl;
+ int puterror = 0;
+ spa_t *spa;
+ nvlist_t *lognv = NULL;
+
+ ASSERT(vec->zvec_legacy_func == NULL);
+
+ /*
+ * Add the innvl to the lognv before calling the func,
+ * in case the func changes the innvl.
+ */
+ if (vec->zvec_allow_log) {
+ lognv = fnvlist_alloc();
+ fnvlist_add_string(lognv, ZPOOL_HIST_IOCTL,
+ vec->zvec_name);
+ if (!nvlist_empty(innvl)) {
+ fnvlist_add_nvlist(lognv, ZPOOL_HIST_INPUT_NVL,
+ innvl);
+ }
+ }
+
+ outnvl = fnvlist_alloc();
+ cookie = spl_fstrans_mark();
+ error = vec->zvec_func(zc->zc_name, innvl, outnvl);
+ spl_fstrans_unmark(cookie);
+
+ /*
+ * Some commands can partially execute, modify state, and still
+ * return an error. In these cases, attempt to record what
+ * was modified.
+ */
+ if ((error == 0 ||
+ (cmd == ZFS_IOC_CHANNEL_PROGRAM && error != EINVAL)) &&
+ vec->zvec_allow_log &&
+ spa_open(zc->zc_name, &spa, FTAG) == 0) {
+ if (!nvlist_empty(outnvl)) {
+ size_t out_size = fnvlist_size(outnvl);
+ if (out_size > zfs_history_output_max) {
+ fnvlist_add_int64(lognv,
+ ZPOOL_HIST_OUTPUT_SIZE, out_size);
+ } else {
+ fnvlist_add_nvlist(lognv,
+ ZPOOL_HIST_OUTPUT_NVL, outnvl);
+ }
+ }
+ if (error != 0) {
+ fnvlist_add_int64(lognv, ZPOOL_HIST_ERRNO,
+ error);
+ }
+ fnvlist_add_int64(lognv, ZPOOL_HIST_ELAPSED_NS,
+ gethrtime() - start_time);
+ (void) spa_history_log_nvl(spa, lognv);
+ spa_close(spa, FTAG);
+ }
+ fnvlist_free(lognv);
+
+ if (!nvlist_empty(outnvl) || zc->zc_nvlist_dst_size != 0) {
+ int smusherror = 0;
+ if (vec->zvec_smush_outnvlist) {
+ smusherror = nvlist_smush(outnvl,
+ zc->zc_nvlist_dst_size);
+ }
+ if (smusherror == 0)
+ puterror = put_nvlist(zc, outnvl);
+ }
+
+ if (puterror != 0)
+ error = puterror;
+
+ nvlist_free(outnvl);
+ } else {
+ cookie = spl_fstrans_mark();
+ error = vec->zvec_legacy_func(zc);
+ spl_fstrans_unmark(cookie);
+ }
+
+out:
+ nvlist_free(innvl);
+ if (error == 0 && vec->zvec_allow_log) {
+ char *s = tsd_get(zfs_allow_log_key);
+ if (s != NULL)
+ kmem_strfree(s);
+ (void) tsd_set(zfs_allow_log_key, kmem_strdup(saved_poolname));
+ }
+ if (saved_poolname != NULL)
+ kmem_free(saved_poolname, saved_poolname_len);
+
+ return (error);
+}
+
+int
+zfs_kmod_init(void)
+{
+ int error;
+
+ if ((error = zvol_init()) != 0)
+ return (error);
+
+ spa_init(SPA_MODE_READ | SPA_MODE_WRITE);
+ zfs_init();
+
+ zfs_ioctl_init();
+
+ mutex_init(&zfsdev_state_lock, NULL, MUTEX_DEFAULT, NULL);
+ zfsdev_state_list = kmem_zalloc(sizeof (zfsdev_state_t), KM_SLEEP);
+ zfsdev_state_list->zs_minor = -1;
+
+ if ((error = zfsdev_attach()) != 0)
+ goto out;
+
+ tsd_create(&zfs_fsyncer_key, NULL);
+ tsd_create(&rrw_tsd_key, rrw_tsd_destroy);
+ tsd_create(&zfs_allow_log_key, zfs_allow_log_destroy);
+
+ return (0);
+out:
+ zfs_fini();
+ spa_fini();
+ zvol_fini();
+
+ return (error);
+}
+
+void
+zfs_kmod_fini(void)
+{
+ zfsdev_state_t *zs, *zsnext = NULL;
+
+ zfsdev_detach();
+
+ mutex_destroy(&zfsdev_state_lock);
+
+ for (zs = zfsdev_state_list; zs != NULL; zs = zsnext) {
+ zsnext = zs->zs_next;
+ if (zs->zs_onexit)
+ zfs_onexit_destroy(zs->zs_onexit);
+ if (zs->zs_zevent)
+ zfs_zevent_destroy(zs->zs_zevent);
+ kmem_free(zs, sizeof (zfsdev_state_t));
+ }
+
+ zfs_ereport_taskq_fini(); /* run before zfs_fini() on Linux */
+ zfs_fini();
+ spa_fini();
+ zvol_fini();
+
+ tsd_destroy(&zfs_fsyncer_key);
+ tsd_destroy(&rrw_tsd_key);
+ tsd_destroy(&zfs_allow_log_key);
+}
+
+/* BEGIN CSTYLED */
+ZFS_MODULE_PARAM(zfs, zfs_, max_nvlist_src_size, ULONG, ZMOD_RW,
+ "Maximum size in bytes allowed for src nvlist passed with ZFS ioctls");
+
+ZFS_MODULE_PARAM(zfs, zfs_, history_output_max, ULONG, ZMOD_RW,
+ "Maximum size in bytes of ZFS ioctl output that will be logged");
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/zfs/zfs_log.c b/sys/contrib/openzfs/module/zfs/zfs_log.c
new file mode 100644
index 000000000000..4bb529f78838
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/zfs_log.c
@@ -0,0 +1,781 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2015, 2018 by Delphix. All rights reserved.
+ */
+
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/sysmacros.h>
+#include <sys/cmn_err.h>
+#include <sys/kmem.h>
+#include <sys/thread.h>
+#include <sys/file.h>
+#include <sys/vfs.h>
+#include <sys/zfs_znode.h>
+#include <sys/zfs_dir.h>
+#include <sys/zil.h>
+#include <sys/zil_impl.h>
+#include <sys/byteorder.h>
+#include <sys/policy.h>
+#include <sys/stat.h>
+#include <sys/acl.h>
+#include <sys/dmu.h>
+#include <sys/dbuf.h>
+#include <sys/spa.h>
+#include <sys/zfs_fuid.h>
+#include <sys/dsl_dataset.h>
+
+/*
+ * These zfs_log_* functions must be called within a dmu tx, in one
+ * of 2 contexts depending on zilog->z_replay:
+ *
+ * Non replay mode
+ * ---------------
+ * We need to record the transaction so that if it is committed to
+ * the Intent Log then it can be replayed. An intent log transaction
+ * structure (itx_t) is allocated and all the information necessary to
+ * possibly replay the transaction is saved in it. The itx is then assigned
+ * a sequence number and inserted in the in-memory list anchored in the zilog.
+ *
+ * Replay mode
+ * -----------
+ * We need to mark the intent log record as replayed in the log header.
+ * This is done in the same transaction as the replay so that they
+ * commit atomically.
+ */
+
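+/*
+ * A condensed sketch of the non-replay calling pattern (taken loosely
+ * from the zfs_create() path; locking, SA setup and error handling are
+ * omitted):
+ *
+ *	tx = dmu_tx_create(os);
+ *	... dmu_tx_hold_*() as needed ...
+ *	error = dmu_tx_assign(tx, TXG_WAIT);
+ *	... create the znode, update SAs ...
+ *	txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
+ *	zfs_log_create(zilog, tx, txtype, dzp, zp, name, vsecp, fuidp, vap);
+ *	dmu_tx_commit(tx);
+ *
+ * zil_commit() is only called later, and only if the operation must be
+ * synchronous (e.g. O_SYNC or an explicit fsync).
+ */
+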
+int
+zfs_log_create_txtype(zil_create_t type, vsecattr_t *vsecp, vattr_t *vap)
+{
+ int isxvattr = (vap->va_mask & ATTR_XVATTR);
+ switch (type) {
+ case Z_FILE:
+ if (vsecp == NULL && !isxvattr)
+ return (TX_CREATE);
+ if (vsecp && isxvattr)
+ return (TX_CREATE_ACL_ATTR);
+ if (vsecp)
+ return (TX_CREATE_ACL);
+ else
+ return (TX_CREATE_ATTR);
+ /*NOTREACHED*/
+ case Z_DIR:
+ if (vsecp == NULL && !isxvattr)
+ return (TX_MKDIR);
+ if (vsecp && isxvattr)
+ return (TX_MKDIR_ACL_ATTR);
+ if (vsecp)
+ return (TX_MKDIR_ACL);
+ else
+ return (TX_MKDIR_ATTR);
+ case Z_XATTRDIR:
+ return (TX_MKXATTR);
+ }
+ ASSERT(0);
+ return (TX_MAX_TYPE);
+}
+
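+/*
+ * For example, creating a regular file with both an explicit ACL and
+ * xvattr attributes (vsecp != NULL and ATTR_XVATTR set) maps to
+ * TX_CREATE_ACL_ATTR, while a plain mkdir with neither maps to TX_MKDIR.
+ */
+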
+/*
+ * Build up the log data necessary for logging an xvattr_t.
+ * First the lr_attr_t is initialized.  Following the lr_attr_t
+ * are the mapsize and the attribute bitmap copied from the xvattr_t.
+ * Following the bitmap and bitmapsize, two 64-bit words are reserved
+ * for the create time, which may be set.  Following the create time
+ * is a single 64-bit integer which holds the attribute bits to set
+ * on replay for the xvattr.
+ */
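+/*
+ * Resulting record layout (a sketch of the description above):
+ *
+ *	lr_attr_t		masksize plus first bitmap word
+ *	bitmap words		xva_mapsize words from xva_reqattrmap[]
+ *	uint64_t attrs		XAT0_* bits to apply on replay
+ *	uint64_t crtime[2]	create time, if XAT_CREATETIME is set
+ *	scanstamp / projid	AV_SCANSTAMP_SZ bytes, shared space
+ */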
+static void
+zfs_log_xvattr(lr_attr_t *lrattr, xvattr_t *xvap)
+{
+ uint32_t *bitmap;
+ uint64_t *attrs;
+ uint64_t *crtime;
+ xoptattr_t *xoap;
+ void *scanstamp;
+ int i;
+
+ xoap = xva_getxoptattr(xvap);
+ ASSERT(xoap);
+
+ lrattr->lr_attr_masksize = xvap->xva_mapsize;
+ bitmap = &lrattr->lr_attr_bitmap;
+ for (i = 0; i != xvap->xva_mapsize; i++, bitmap++) {
+ *bitmap = xvap->xva_reqattrmap[i];
+ }
+
+ /* Now pack the attributes up in a single uint64_t */
+ attrs = (uint64_t *)bitmap;
+ crtime = attrs + 1;
+ scanstamp = (caddr_t)(crtime + 2);
+ *attrs = 0;
+ if (XVA_ISSET_REQ(xvap, XAT_READONLY))
+ *attrs |= (xoap->xoa_readonly == 0) ? 0 :
+ XAT0_READONLY;
+ if (XVA_ISSET_REQ(xvap, XAT_HIDDEN))
+ *attrs |= (xoap->xoa_hidden == 0) ? 0 :
+ XAT0_HIDDEN;
+ if (XVA_ISSET_REQ(xvap, XAT_SYSTEM))
+ *attrs |= (xoap->xoa_system == 0) ? 0 :
+ XAT0_SYSTEM;
+ if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE))
+ *attrs |= (xoap->xoa_archive == 0) ? 0 :
+ XAT0_ARCHIVE;
+ if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE))
+ *attrs |= (xoap->xoa_immutable == 0) ? 0 :
+ XAT0_IMMUTABLE;
+ if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK))
+ *attrs |= (xoap->xoa_nounlink == 0) ? 0 :
+ XAT0_NOUNLINK;
+ if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY))
+ *attrs |= (xoap->xoa_appendonly == 0) ? 0 :
+ XAT0_APPENDONLY;
+ if (XVA_ISSET_REQ(xvap, XAT_OPAQUE))
+ *attrs |= (xoap->xoa_opaque == 0) ? 0 :
+ XAT0_OPAQUE;
+ if (XVA_ISSET_REQ(xvap, XAT_NODUMP))
+ *attrs |= (xoap->xoa_nodump == 0) ? 0 :
+ XAT0_NODUMP;
+ if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED))
+ *attrs |= (xoap->xoa_av_quarantined == 0) ? 0 :
+ XAT0_AV_QUARANTINED;
+ if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED))
+ *attrs |= (xoap->xoa_av_modified == 0) ? 0 :
+ XAT0_AV_MODIFIED;
+ if (XVA_ISSET_REQ(xvap, XAT_CREATETIME))
+ ZFS_TIME_ENCODE(&xoap->xoa_createtime, crtime);
+ if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) {
+ ASSERT(!XVA_ISSET_REQ(xvap, XAT_PROJID));
+
+ bcopy(xoap->xoa_av_scanstamp, scanstamp, AV_SCANSTAMP_SZ);
+ } else if (XVA_ISSET_REQ(xvap, XAT_PROJID)) {
+ /*
+ * XAT_PROJID and XAT_AV_SCANSTAMP will never be valid
+ * at the same time, so we can share the same space.
+ */
+ bcopy(&xoap->xoa_projid, scanstamp, sizeof (uint64_t));
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_REPARSE))
+ *attrs |= (xoap->xoa_reparse == 0) ? 0 :
+ XAT0_REPARSE;
+ if (XVA_ISSET_REQ(xvap, XAT_OFFLINE))
+ *attrs |= (xoap->xoa_offline == 0) ? 0 :
+ XAT0_OFFLINE;
+ if (XVA_ISSET_REQ(xvap, XAT_SPARSE))
+ *attrs |= (xoap->xoa_sparse == 0) ? 0 :
+ XAT0_SPARSE;
+ if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT))
+ *attrs |= (xoap->xoa_projinherit == 0) ? 0 :
+ XAT0_PROJINHERIT;
+}
+
+static void *
+zfs_log_fuid_ids(zfs_fuid_info_t *fuidp, void *start)
+{
+ zfs_fuid_t *zfuid;
+ uint64_t *fuidloc = start;
+
+ /* First copy in the ACE FUIDs */
+ for (zfuid = list_head(&fuidp->z_fuids); zfuid;
+ zfuid = list_next(&fuidp->z_fuids, zfuid)) {
+ *fuidloc++ = zfuid->z_logfuid;
+ }
+ return (fuidloc);
+}
+
+
+static void *
+zfs_log_fuid_domains(zfs_fuid_info_t *fuidp, void *start)
+{
+ zfs_fuid_domain_t *zdomain;
+
+ /* now copy in the domain info, if any */
+ if (fuidp->z_domain_str_sz != 0) {
+ for (zdomain = list_head(&fuidp->z_domains); zdomain;
+ zdomain = list_next(&fuidp->z_domains, zdomain)) {
+ bcopy((void *)zdomain->z_domain, start,
+ strlen(zdomain->z_domain) + 1);
+ start = (caddr_t)start +
+ strlen(zdomain->z_domain) + 1;
+ }
+ }
+ return (start);
+}
+
+/*
+ * If zp is an xattr node, check whether the xattr owner is unlinked.
+ * We don't want to log anything if the owner is unlinked.
+ */
+static int
+zfs_xattr_owner_unlinked(znode_t *zp)
+{
+ int unlinked = 0;
+ znode_t *dzp;
+#ifdef __FreeBSD__
+ znode_t *tzp = zp;
+
+ /*
+ * zrele drops the vnode lock which violates the VOP locking contract
+ * on FreeBSD. See comment at the top of zfs_replay.c for more detail.
+ */
+ /*
+ * If zp is an XATTR node, keep walking up via z_xattr_parent until
+ * we get the owner.
+ */
+ while (tzp->z_pflags & ZFS_XATTR) {
+ ASSERT3U(zp->z_xattr_parent, !=, 0);
+ if (zfs_zget(ZTOZSB(tzp), tzp->z_xattr_parent, &dzp) != 0) {
+ unlinked = 1;
+ break;
+ }
+
+ if (tzp != zp)
+ zrele(tzp);
+ tzp = dzp;
+ unlinked = tzp->z_unlinked;
+ }
+ if (tzp != zp)
+ zrele(tzp);
+#else
+ zhold(zp);
+ /*
+ * If zp is an XATTR node, keep walking up via z_xattr_parent until
+ * we get the owner.
+ */
+ while (zp->z_pflags & ZFS_XATTR) {
+ ASSERT3U(zp->z_xattr_parent, !=, 0);
+ if (zfs_zget(ZTOZSB(zp), zp->z_xattr_parent, &dzp) != 0) {
+ unlinked = 1;
+ break;
+ }
+
+ zrele(zp);
+ zp = dzp;
+ unlinked = zp->z_unlinked;
+ }
+ zrele(zp);
+#endif
+ return (unlinked);
+}
+
+/*
+ * Handles TX_CREATE, TX_CREATE_ATTR, TX_MKDIR, TX_MKDIR_ATTR and
+ * TX_MKXATTR transactions.
+ *
+ * TX_CREATE and TX_MKDIR are standard creates, but they may have FUID
+ * domain information appended prior to the name. In this case the
+ * uid/gid in the log record will be a log-centric FUID.
+ *
+ * TX_CREATE_ACL_ATTR and TX_MKDIR_ACL_ATTR handle special creates that
+ * may contain attributes, an ACL and optional FUID information.
+ *
+ * TX_CREATE_ACL and TX_MKDIR_ACL handle special creates that specify
+ * an ACL and normal users/groups in the ACEs.
+ *
+ * There may be optional xvattr attribute information, similar
+ * to zfs_log_setattr.
+ *
+ * Also, "domain" strings may be appended after the file name.
+ */
+void
+zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
+ znode_t *dzp, znode_t *zp, const char *name, vsecattr_t *vsecp,
+ zfs_fuid_info_t *fuidp, vattr_t *vap)
+{
+ itx_t *itx;
+ lr_create_t *lr;
+ lr_acl_create_t *lracl;
+ size_t aclsize = 0;
+ size_t xvatsize = 0;
+ size_t txsize;
+ xvattr_t *xvap = (xvattr_t *)vap;
+ void *end;
+ size_t lrsize;
+ size_t namesize = strlen(name) + 1;
+ size_t fuidsz = 0;
+
+ if (zil_replaying(zilog, tx) || zfs_xattr_owner_unlinked(dzp))
+ return;
+
+ /*
+ * If FUIDs are present, add in space for the domains and any
+ * ACE FUIDs.
+ */
+ if (fuidp) {
+ fuidsz += fuidp->z_domain_str_sz;
+ fuidsz += fuidp->z_fuid_cnt * sizeof (uint64_t);
+ }
+
+ if (vap->va_mask & ATTR_XVATTR)
+ xvatsize = ZIL_XVAT_SIZE(xvap->xva_mapsize);
+
+ if ((int)txtype == TX_CREATE_ATTR || (int)txtype == TX_MKDIR_ATTR ||
+ (int)txtype == TX_CREATE || (int)txtype == TX_MKDIR ||
+ (int)txtype == TX_MKXATTR) {
+ txsize = sizeof (*lr) + namesize + fuidsz + xvatsize;
+ lrsize = sizeof (*lr);
+ } else {
+ txsize =
+ sizeof (lr_acl_create_t) + namesize + fuidsz +
+ ZIL_ACE_LENGTH(aclsize) + xvatsize;
+ lrsize = sizeof (lr_acl_create_t);
+ }
+
+ itx = zil_itx_create(txtype, txsize);
+
+ lr = (lr_create_t *)&itx->itx_lr;
+ lr->lr_doid = dzp->z_id;
+ lr->lr_foid = zp->z_id;
+ /* Store dnode slot count in 8 bits above object id. */
+ LR_FOID_SET_SLOTS(lr->lr_foid, zp->z_dnodesize >> DNODE_SHIFT);
+ lr->lr_mode = zp->z_mode;
+ if (!IS_EPHEMERAL(KUID_TO_SUID(ZTOUID(zp)))) {
+ lr->lr_uid = (uint64_t)KUID_TO_SUID(ZTOUID(zp));
+ } else {
+ lr->lr_uid = fuidp->z_fuid_owner;
+ }
+ if (!IS_EPHEMERAL(KGID_TO_SGID(ZTOGID(zp)))) {
+ lr->lr_gid = (uint64_t)KGID_TO_SGID(ZTOGID(zp));
+ } else {
+ lr->lr_gid = fuidp->z_fuid_group;
+ }
+ (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(ZTOZSB(zp)), &lr->lr_gen,
+ sizeof (uint64_t));
+ (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_CRTIME(ZTOZSB(zp)),
+ lr->lr_crtime, sizeof (uint64_t) * 2);
+
+ if (sa_lookup(zp->z_sa_hdl, SA_ZPL_RDEV(ZTOZSB(zp)), &lr->lr_rdev,
+ sizeof (lr->lr_rdev)) != 0)
+ lr->lr_rdev = 0;
+
+ /*
+ * Fill in xvattr info if any
+ */
+ if (vap->va_mask & ATTR_XVATTR) {
+ zfs_log_xvattr((lr_attr_t *)((caddr_t)lr + lrsize), xvap);
+ end = (caddr_t)lr + lrsize + xvatsize;
+ } else {
+ end = (caddr_t)lr + lrsize;
+ }
+
+ /* Now fill in any ACL info */
+
+ if (vsecp) {
+ lracl = (lr_acl_create_t *)&itx->itx_lr;
+ lracl->lr_aclcnt = vsecp->vsa_aclcnt;
+ lracl->lr_acl_bytes = aclsize;
+ lracl->lr_domcnt = fuidp ? fuidp->z_domain_cnt : 0;
+ lracl->lr_fuidcnt = fuidp ? fuidp->z_fuid_cnt : 0;
+ if (vsecp->vsa_aclflags & VSA_ACE_ACLFLAGS)
+ lracl->lr_acl_flags = (uint64_t)vsecp->vsa_aclflags;
+ else
+ lracl->lr_acl_flags = 0;
+
+ bcopy(vsecp->vsa_aclentp, end, aclsize);
+ end = (caddr_t)end + ZIL_ACE_LENGTH(aclsize);
+ }
+
+ /* drop in FUID info */
+ if (fuidp) {
+ end = zfs_log_fuid_ids(fuidp, end);
+ end = zfs_log_fuid_domains(fuidp, end);
+ }
+ /*
+ * Now place file name in log record
+ */
+ bcopy(name, end, namesize);
+
+ zil_itx_assign(zilog, itx, tx);
+}
+
+/*
+ * Handles both TX_REMOVE and TX_RMDIR transactions.
+ */
+void
+zfs_log_remove(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
+ znode_t *dzp, const char *name, uint64_t foid, boolean_t unlinked)
+{
+ itx_t *itx;
+ lr_remove_t *lr;
+ size_t namesize = strlen(name) + 1;
+
+ if (zil_replaying(zilog, tx) || zfs_xattr_owner_unlinked(dzp))
+ return;
+
+ itx = zil_itx_create(txtype, sizeof (*lr) + namesize);
+ lr = (lr_remove_t *)&itx->itx_lr;
+ lr->lr_doid = dzp->z_id;
+ bcopy(name, (char *)(lr + 1), namesize);
+
+ itx->itx_oid = foid;
+
+ /*
+ * Object ids can be re-instantiated in the next txg so
+ * remove any async transactions to avoid future leaks.
+ * This can happen if an fsync occurs on the re-instantiated
+ * object for a WR_INDIRECT or WR_NEED_COPY write, which gets
+ * the new file data and flushes a write record for the old object.
+ */
+ if (unlinked) {
+ ASSERT((txtype & ~TX_CI) == TX_REMOVE);
+ zil_remove_async(zilog, foid);
+ }
+ zil_itx_assign(zilog, itx, tx);
+}
+
+/*
+ * Handles TX_LINK transactions.
+ */
+void
+zfs_log_link(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
+ znode_t *dzp, znode_t *zp, const char *name)
+{
+ itx_t *itx;
+ lr_link_t *lr;
+ size_t namesize = strlen(name) + 1;
+
+ if (zil_replaying(zilog, tx))
+ return;
+
+ itx = zil_itx_create(txtype, sizeof (*lr) + namesize);
+ lr = (lr_link_t *)&itx->itx_lr;
+ lr->lr_doid = dzp->z_id;
+ lr->lr_link_obj = zp->z_id;
+ bcopy(name, (char *)(lr + 1), namesize);
+
+ zil_itx_assign(zilog, itx, tx);
+}
+
+/*
+ * Handles TX_SYMLINK transactions.
+ */
+void
+zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
+ znode_t *dzp, znode_t *zp, const char *name, const char *link)
+{
+ itx_t *itx;
+ lr_create_t *lr;
+ size_t namesize = strlen(name) + 1;
+ size_t linksize = strlen(link) + 1;
+
+ if (zil_replaying(zilog, tx))
+ return;
+
+ itx = zil_itx_create(txtype, sizeof (*lr) + namesize + linksize);
+ lr = (lr_create_t *)&itx->itx_lr;
+ lr->lr_doid = dzp->z_id;
+ lr->lr_foid = zp->z_id;
+ lr->lr_uid = KUID_TO_SUID(ZTOUID(zp));
+ lr->lr_gid = KGID_TO_SGID(ZTOGID(zp));
+ lr->lr_mode = zp->z_mode;
+ (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(ZTOZSB(zp)), &lr->lr_gen,
+ sizeof (uint64_t));
+ (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_CRTIME(ZTOZSB(zp)),
+ lr->lr_crtime, sizeof (uint64_t) * 2);
+ bcopy(name, (char *)(lr + 1), namesize);
+ bcopy(link, (char *)(lr + 1) + namesize, linksize);
+
+ zil_itx_assign(zilog, itx, tx);
+}
+
+/*
+ * Handles TX_RENAME transactions.
+ */
+void
+zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, znode_t *sdzp,
+ const char *sname, znode_t *tdzp, const char *dname, znode_t *szp)
+{
+ itx_t *itx;
+ lr_rename_t *lr;
+ size_t snamesize = strlen(sname) + 1;
+ size_t dnamesize = strlen(dname) + 1;
+
+ if (zil_replaying(zilog, tx))
+ return;
+
+ itx = zil_itx_create(txtype, sizeof (*lr) + snamesize + dnamesize);
+ lr = (lr_rename_t *)&itx->itx_lr;
+ lr->lr_sdoid = sdzp->z_id;
+ lr->lr_tdoid = tdzp->z_id;
+ bcopy(sname, (char *)(lr + 1), snamesize);
+ bcopy(dname, (char *)(lr + 1) + snamesize, dnamesize);
+ itx->itx_oid = szp->z_id;
+
+ zil_itx_assign(zilog, itx, tx);
+}
+
+/*
+ * zfs_log_write() handles TX_WRITE transactions. The specified callback is
+ * called as soon as the write is on stable storage (be it via a DMU sync or a
+ * ZIL commit).
+ */
+long zfs_immediate_write_sz = 32768;
+
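+/*
+ * Roughly, the write record type is chosen as follows (a summary of the
+ * logic below, with the default zfs_immediate_write_sz of 32 KiB):
+ *
+ *	logbias=throughput		-> WR_INDIRECT
+ *	no slog, resid >= 32 KiB	-> WR_INDIRECT (data stays in place;
+ *					   the ZIL points at the block)
+ *	O_SYNC/O_DSYNC write		-> WR_COPIED (data copied into the
+ *					   log record now)
+ *	otherwise			-> WR_NEED_COPY (copied only if and
+ *					   when the itx must be committed)
+ */
+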
+void
+zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
+ znode_t *zp, offset_t off, ssize_t resid, int ioflag,
+ zil_callback_t callback, void *callback_data)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)sa_get_db(zp->z_sa_hdl);
+ uint32_t blocksize = zp->z_blksz;
+ itx_wr_state_t write_state;
+ uintptr_t fsync_cnt;
+
+ if (zil_replaying(zilog, tx) || zp->z_unlinked ||
+ zfs_xattr_owner_unlinked(zp)) {
+ if (callback != NULL)
+ callback(callback_data);
+ return;
+ }
+
+ if (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
+ write_state = WR_INDIRECT;
+ else if (!spa_has_slogs(zilog->zl_spa) &&
+ resid >= zfs_immediate_write_sz)
+ write_state = WR_INDIRECT;
+ else if (ioflag & (O_SYNC | O_DSYNC))
+ write_state = WR_COPIED;
+ else
+ write_state = WR_NEED_COPY;
+
+ if ((fsync_cnt = (uintptr_t)tsd_get(zfs_fsyncer_key)) != 0) {
+ (void) tsd_set(zfs_fsyncer_key, (void *)(fsync_cnt - 1));
+ }
+
+ while (resid) {
+ itx_t *itx;
+ lr_write_t *lr;
+ itx_wr_state_t wr_state = write_state;
+ ssize_t len = resid;
+
+ /*
+ * A WR_COPIED record must fit entirely in one log block.
+ * Large writes can use WR_NEED_COPY, which the ZIL will
+ * split into multiple records across several log blocks
+ * if necessary.
+ */
+ if (wr_state == WR_COPIED &&
+ resid > zil_max_copied_data(zilog))
+ wr_state = WR_NEED_COPY;
+ else if (wr_state == WR_INDIRECT)
+ len = MIN(blocksize - P2PHASE(off, blocksize), resid);
+
+ itx = zil_itx_create(txtype, sizeof (*lr) +
+ (wr_state == WR_COPIED ? len : 0));
+ lr = (lr_write_t *)&itx->itx_lr;
+
+ /*
+ * For WR_COPIED records, copy the data into the lr_write_t.
+ */
+ if (wr_state == WR_COPIED) {
+ int err;
+ DB_DNODE_ENTER(db);
+ err = dmu_read_by_dnode(DB_DNODE(db), off, len, lr + 1,
+ DMU_READ_NO_PREFETCH);
+ if (err != 0) {
+ zil_itx_destroy(itx);
+ itx = zil_itx_create(txtype, sizeof (*lr));
+ lr = (lr_write_t *)&itx->itx_lr;
+ wr_state = WR_NEED_COPY;
+ }
+ DB_DNODE_EXIT(db);
+ }
+
+ itx->itx_wr_state = wr_state;
+ lr->lr_foid = zp->z_id;
+ lr->lr_offset = off;
+ lr->lr_length = len;
+ lr->lr_blkoff = 0;
+ BP_ZERO(&lr->lr_blkptr);
+
+ itx->itx_private = ZTOZSB(zp);
+
+ if (!(ioflag & (O_SYNC | O_DSYNC)) && (zp->z_sync_cnt == 0) &&
+ (fsync_cnt == 0))
+ itx->itx_sync = B_FALSE;
+
+ itx->itx_callback = callback;
+ itx->itx_callback_data = callback_data;
+ zil_itx_assign(zilog, itx, tx);
+
+ off += len;
+ resid -= len;
+ }
+}
+
+/*
+ * Handles TX_TRUNCATE transactions.
+ */
+void
+zfs_log_truncate(zilog_t *zilog, dmu_tx_t *tx, int txtype,
+ znode_t *zp, uint64_t off, uint64_t len)
+{
+ itx_t *itx;
+ lr_truncate_t *lr;
+
+ if (zil_replaying(zilog, tx) || zp->z_unlinked ||
+ zfs_xattr_owner_unlinked(zp))
+ return;
+
+ itx = zil_itx_create(txtype, sizeof (*lr));
+ lr = (lr_truncate_t *)&itx->itx_lr;
+ lr->lr_foid = zp->z_id;
+ lr->lr_offset = off;
+ lr->lr_length = len;
+
+ itx->itx_sync = (zp->z_sync_cnt != 0);
+ zil_itx_assign(zilog, itx, tx);
+}
+
+/*
+ * Handles TX_SETATTR transactions.
+ */
+void
+zfs_log_setattr(zilog_t *zilog, dmu_tx_t *tx, int txtype,
+ znode_t *zp, vattr_t *vap, uint_t mask_applied, zfs_fuid_info_t *fuidp)
+{
+ itx_t *itx;
+ lr_setattr_t *lr;
+ xvattr_t *xvap = (xvattr_t *)vap;
+ size_t recsize = sizeof (lr_setattr_t);
+ void *start;
+
+ if (zil_replaying(zilog, tx) || zp->z_unlinked)
+ return;
+
+ /*
+ * If XVATTR set, then log record size needs to allow
+ * for lr_attr_t + xvattr mask, mapsize and create time
+ * plus actual attribute values
+ */
+ if (vap->va_mask & ATTR_XVATTR)
+ recsize = sizeof (*lr) + ZIL_XVAT_SIZE(xvap->xva_mapsize);
+
+ if (fuidp)
+ recsize += fuidp->z_domain_str_sz;
+
+ itx = zil_itx_create(txtype, recsize);
+ lr = (lr_setattr_t *)&itx->itx_lr;
+ lr->lr_foid = zp->z_id;
+ lr->lr_mask = (uint64_t)mask_applied;
+ lr->lr_mode = (uint64_t)vap->va_mode;
+ if ((mask_applied & ATTR_UID) && IS_EPHEMERAL(vap->va_uid))
+ lr->lr_uid = fuidp->z_fuid_owner;
+ else
+ lr->lr_uid = (uint64_t)vap->va_uid;
+
+ if ((mask_applied & ATTR_GID) && IS_EPHEMERAL(vap->va_gid))
+ lr->lr_gid = fuidp->z_fuid_group;
+ else
+ lr->lr_gid = (uint64_t)vap->va_gid;
+
+ lr->lr_size = (uint64_t)vap->va_size;
+ ZFS_TIME_ENCODE(&vap->va_atime, lr->lr_atime);
+ ZFS_TIME_ENCODE(&vap->va_mtime, lr->lr_mtime);
+ start = (lr_setattr_t *)(lr + 1);
+ if (vap->va_mask & ATTR_XVATTR) {
+ zfs_log_xvattr((lr_attr_t *)start, xvap);
+ start = (caddr_t)start + ZIL_XVAT_SIZE(xvap->xva_mapsize);
+ }
+
+ /*
+ * Now stick on domain information if any on end
+ */
+
+ if (fuidp)
+ (void) zfs_log_fuid_domains(fuidp, start);
+
+ itx->itx_sync = (zp->z_sync_cnt != 0);
+ zil_itx_assign(zilog, itx, tx);
+}
+
+/*
+ * Handles TX_ACL transactions.
+ */
+void
+zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, znode_t *zp,
+ vsecattr_t *vsecp, zfs_fuid_info_t *fuidp)
+{
+ itx_t *itx;
+ lr_acl_v0_t *lrv0;
+ lr_acl_t *lr;
+ int txtype;
+ int lrsize;
+ size_t txsize;
+ size_t aclbytes = vsecp->vsa_aclentsz;
+
+ if (zil_replaying(zilog, tx) || zp->z_unlinked)
+ return;
+
+ txtype = (ZTOZSB(zp)->z_version < ZPL_VERSION_FUID) ?
+ TX_ACL_V0 : TX_ACL;
+
+ if (txtype == TX_ACL)
+ lrsize = sizeof (*lr);
+ else
+ lrsize = sizeof (*lrv0);
+
+ txsize = lrsize +
+ ((txtype == TX_ACL) ? ZIL_ACE_LENGTH(aclbytes) : aclbytes) +
+ (fuidp ? fuidp->z_domain_str_sz : 0) +
+ sizeof (uint64_t) * (fuidp ? fuidp->z_fuid_cnt : 0);
+
+ itx = zil_itx_create(txtype, txsize);
+
+ lr = (lr_acl_t *)&itx->itx_lr;
+ lr->lr_foid = zp->z_id;
+ if (txtype == TX_ACL) {
+ lr->lr_acl_bytes = aclbytes;
+ lr->lr_domcnt = fuidp ? fuidp->z_domain_cnt : 0;
+ lr->lr_fuidcnt = fuidp ? fuidp->z_fuid_cnt : 0;
+ if (vsecp->vsa_mask & VSA_ACE_ACLFLAGS)
+ lr->lr_acl_flags = (uint64_t)vsecp->vsa_aclflags;
+ else
+ lr->lr_acl_flags = 0;
+ }
+ lr->lr_aclcnt = (uint64_t)vsecp->vsa_aclcnt;
+
+ if (txtype == TX_ACL_V0) {
+ lrv0 = (lr_acl_v0_t *)lr;
+ bcopy(vsecp->vsa_aclentp, (ace_t *)(lrv0 + 1), aclbytes);
+ } else {
+ void *start = (ace_t *)(lr + 1);
+
+ bcopy(vsecp->vsa_aclentp, start, aclbytes);
+
+ start = (caddr_t)start + ZIL_ACE_LENGTH(aclbytes);
+
+ if (fuidp) {
+ start = zfs_log_fuid_ids(fuidp, start);
+ (void) zfs_log_fuid_domains(fuidp, start);
+ }
+ }
+
+ itx->itx_sync = (zp->z_sync_cnt != 0);
+ zil_itx_assign(zilog, itx, tx);
+}
+
+/* BEGIN CSTYLED */
+ZFS_MODULE_PARAM(zfs, zfs_, immediate_write_sz, LONG, ZMOD_RW,
+ "Largest data block to write to zil");
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/zfs/zfs_onexit.c b/sys/contrib/openzfs/module/zfs/zfs_onexit.c
new file mode 100644
index 000000000000..2a1332e715ee
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/zfs_onexit.c
@@ -0,0 +1,173 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013, 2020 by Delphix. All rights reserved.
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/errno.h>
+#include <sys/kmem.h>
+#include <sys/sunddi.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zfs_onexit.h>
+#include <sys/zvol.h>
+
+/*
+ * ZFS kernel routines may add/delete callback routines to be invoked
+ * upon process exit (triggered via the close operation from the /dev/zfs
+ * driver).
+ *
+ * These cleanup callbacks are intended to allow for the accumulation
+ * of kernel state across multiple ioctls. User processes participate
+ * simply by opening ZFS_DEV. This causes the ZFS driver to create
+ * some private data for the file descriptor and to generate a unique
+ * minor number. The process then passes along that file descriptor to
+ * each ioctl that might have a cleanup operation.
+ *
+ * Consumers of the onexit routines should call zfs_onexit_fd_hold() early
+ * on to validate the given fd and add a reference to its file table entry.
+ * This allows the consumer to do its work and then add a callback, knowing
+ * that zfs_onexit_add_cb() won't fail with EBADF. When finished, consumers
+ * should call zfs_onexit_fd_rele().
+ *
+ * A simple example is zfs_ioc_recv(), where we might create an AVL tree
+ * with dataset/GUID mappings and then reuse that tree on subsequent
+ * zfs_ioc_recv() calls.
+ *
+ * On the first zfs_ioc_recv() call, dmu_recv_stream() will kmem_alloc()
+ * the AVL tree and pass it along with a callback function to
+ * zfs_onexit_add_cb(). The zfs_onexit_add_cb() routine will register the
+ * callback and return an action handle.
+ *
+ * The action handle is then passed from user space to subsequent
+ * zfs_ioc_recv() calls, so that dmu_recv_stream() can fetch its AVL tree
+ * by calling zfs_onexit_cb_data() with the device minor number and
+ * action handle.
+ *
+ * If the user process exits abnormally, the callback is invoked implicitly
+ * as part of the driver close operation. Once the user space process is
+ * finished with the accumulated kernel state, it can also just call close(2)
+ * on the cleanup fd to trigger the cleanup callback.
+ */
+
+void
+zfs_onexit_init(zfs_onexit_t **zop)
+{
+ zfs_onexit_t *zo;
+
+ zo = *zop = kmem_zalloc(sizeof (zfs_onexit_t), KM_SLEEP);
+ mutex_init(&zo->zo_lock, NULL, MUTEX_DEFAULT, NULL);
+ list_create(&zo->zo_actions, sizeof (zfs_onexit_action_node_t),
+ offsetof(zfs_onexit_action_node_t, za_link));
+}
+
+void
+zfs_onexit_destroy(zfs_onexit_t *zo)
+{
+ zfs_onexit_action_node_t *ap;
+
+ mutex_enter(&zo->zo_lock);
+ while ((ap = list_head(&zo->zo_actions)) != NULL) {
+ list_remove(&zo->zo_actions, ap);
+ mutex_exit(&zo->zo_lock);
+ ap->za_func(ap->za_data);
+ kmem_free(ap, sizeof (zfs_onexit_action_node_t));
+ mutex_enter(&zo->zo_lock);
+ }
+ mutex_exit(&zo->zo_lock);
+
+ list_destroy(&zo->zo_actions);
+ mutex_destroy(&zo->zo_lock);
+ kmem_free(zo, sizeof (zfs_onexit_t));
+}
+
+/*
+ * Consumers might need to operate by minor number instead of fd, since
+ * they might be running in another thread (e.g. txg_sync_thread). Callers
+ * of this function must call zfs_onexit_fd_rele() when they're finished
+ * using the minor number.
+ */
+int
+zfs_onexit_fd_hold(int fd, minor_t *minorp)
+{
+ zfs_onexit_t *zo = NULL;
+ int error;
+
+ error = zfsdev_getminor(fd, minorp);
+ if (error) {
+ zfs_onexit_fd_rele(fd);
+ return (error);
+ }
+
+ zo = zfsdev_get_state(*minorp, ZST_ONEXIT);
+ if (zo == NULL) {
+ zfs_onexit_fd_rele(fd);
+ return (SET_ERROR(EBADF));
+ }
+ return (0);
+}
+
+void
+zfs_onexit_fd_rele(int fd)
+{
+ zfs_file_put(fd);
+}
+
+static int
+zfs_onexit_minor_to_state(minor_t minor, zfs_onexit_t **zo)
+{
+ *zo = zfsdev_get_state(minor, ZST_ONEXIT);
+ if (*zo == NULL)
+ return (SET_ERROR(EBADF));
+
+ return (0);
+}
+
+/*
+ * Add a callback to be invoked when the calling process exits.
+ */
+int
+zfs_onexit_add_cb(minor_t minor, void (*func)(void *), void *data,
+ uint64_t *action_handle)
+{
+ zfs_onexit_t *zo;
+ zfs_onexit_action_node_t *ap;
+ int error;
+
+ error = zfs_onexit_minor_to_state(minor, &zo);
+ if (error)
+ return (error);
+
+ ap = kmem_alloc(sizeof (zfs_onexit_action_node_t), KM_SLEEP);
+ list_link_init(&ap->za_link);
+ ap->za_func = func;
+ ap->za_data = data;
+
+ mutex_enter(&zo->zo_lock);
+ list_insert_tail(&zo->zo_actions, ap);
+ mutex_exit(&zo->zo_lock);
+ if (action_handle)
+ *action_handle = (uint64_t)(uintptr_t)ap;
+
+ return (0);
+}
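
The contract described in the comment at the top of this file (hold the cleanup fd, register a callback, then release the fd and let the driver close path run the callback) can be sketched as follows. This is a hypothetical in-kernel consumer written only against the zfs_onexit_fd_hold(), zfs_onexit_add_cb(), and zfs_onexit_fd_rele() signatures above; the state type and function names are invented for illustration, it compiles only in-tree, and error handling is trimmed.

/*
 * Hypothetical consumer sketch (not part of this patch): accumulate some
 * per-process kernel state on the first ioctl and let the driver close
 * path clean it up.
 */
#include <sys/types.h>
#include <sys/kmem.h>
#include <sys/zfs_onexit.h>

typedef struct my_state { int ms_dummy; } my_state_t;	/* assumed type */

static void
my_state_cleanup(void *arg)
{
	kmem_free(arg, sizeof (my_state_t));
}

static int
my_ioctl_first_call(int cleanup_fd, uint64_t *action_handle)
{
	minor_t minor;
	my_state_t *ms;
	int error;

	/* Validate the fd and take a reference on its file table entry. */
	error = zfs_onexit_fd_hold(cleanup_fd, &minor);
	if (error != 0)
		return (error);

	ms = kmem_zalloc(sizeof (my_state_t), KM_SLEEP);

	/* Run my_state_cleanup(ms) when the process closes ZFS_DEV. */
	error = zfs_onexit_add_cb(minor, my_state_cleanup, ms, action_handle);
	if (error != 0)
		kmem_free(ms, sizeof (my_state_t));

	zfs_onexit_fd_rele(cleanup_fd);
	return (error);
}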
diff --git a/sys/contrib/openzfs/module/zfs/zfs_quota.c b/sys/contrib/openzfs/module/zfs/zfs_quota.c
new file mode 100644
index 000000000000..e61db5c7ab83
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/zfs_quota.c
@@ -0,0 +1,476 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011 Pawel Jakub Dawidek <pawel@dawidek.net>.
+ * All rights reserved.
+ * Copyright (c) 2012, 2015, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
+ * Copyright 2016 Nexenta Systems, Inc. All rights reserved.
+ */
+
+/* Portions Copyright 2010 Robert Milkowski */
+
+#include <sys/avl.h>
+#include <sys/dmu_objset.h>
+#include <sys/sa.h>
+#include <sys/sa_impl.h>
+#include <sys/zap.h>
+#include <sys/zfs_project.h>
+#include <sys/zfs_quota.h>
+#include <sys/zfs_znode.h>
+
+int
+zpl_get_file_info(dmu_object_type_t bonustype, const void *data,
+ zfs_file_info_t *zoi)
+{
+ /*
+ * Is it a valid type of object to track?
+ */
+ if (bonustype != DMU_OT_ZNODE && bonustype != DMU_OT_SA)
+ return (SET_ERROR(ENOENT));
+
+ zoi->zfi_project = ZFS_DEFAULT_PROJID;
+
+ /*
+ * If we have a NULL data pointer
+	 * then assume the IDs aren't changing and
+	 * return EEXIST to the dmu to let it know to
+	 * use the same IDs.
+ */
+ if (data == NULL)
+ return (SET_ERROR(EEXIST));
+
+ if (bonustype == DMU_OT_ZNODE) {
+ const znode_phys_t *znp = data;
+ zoi->zfi_user = znp->zp_uid;
+ zoi->zfi_group = znp->zp_gid;
+ zoi->zfi_generation = znp->zp_gen;
+ return (0);
+ }
+
+ const sa_hdr_phys_t *sap = data;
+ if (sap->sa_magic == 0) {
+ /*
+ * This should only happen for newly created files
+ * that haven't had the znode data filled in yet.
+ */
+ zoi->zfi_user = 0;
+ zoi->zfi_group = 0;
+ zoi->zfi_generation = 0;
+ return (0);
+ }
+
+ sa_hdr_phys_t sa = *sap;
+ boolean_t swap = B_FALSE;
+ if (sa.sa_magic == BSWAP_32(SA_MAGIC)) {
+ sa.sa_magic = SA_MAGIC;
+ sa.sa_layout_info = BSWAP_16(sa.sa_layout_info);
+ swap = B_TRUE;
+ }
+ VERIFY3U(sa.sa_magic, ==, SA_MAGIC);
+
+ int hdrsize = sa_hdrsize(&sa);
+ VERIFY3U(hdrsize, >=, sizeof (sa_hdr_phys_t));
+
+ uintptr_t data_after_hdr = (uintptr_t)data + hdrsize;
+ zoi->zfi_user = *((uint64_t *)(data_after_hdr + SA_UID_OFFSET));
+ zoi->zfi_group = *((uint64_t *)(data_after_hdr + SA_GID_OFFSET));
+ zoi->zfi_generation = *((uint64_t *)(data_after_hdr + SA_GEN_OFFSET));
+ uint64_t flags = *((uint64_t *)(data_after_hdr + SA_FLAGS_OFFSET));
+ if (swap)
+ flags = BSWAP_64(flags);
+
+ if (flags & ZFS_PROJID) {
+ zoi->zfi_project =
+ *((uint64_t *)(data_after_hdr + SA_PROJID_OFFSET));
+ }
+
+ if (swap) {
+ zoi->zfi_user = BSWAP_64(zoi->zfi_user);
+ zoi->zfi_group = BSWAP_64(zoi->zfi_group);
+ zoi->zfi_project = BSWAP_64(zoi->zfi_project);
+ zoi->zfi_generation = BSWAP_64(zoi->zfi_generation);
+ }
+ return (0);
+}
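
zpl_get_file_info() above detects a byte-swapped SA header by comparing sa_magic against the byte-swapped SA_MAGIC and, if they match, swapping every field it pulls out of the record. The standalone sketch below shows the same magic-number endianness check in isolation; the magic value and the bswap helpers are placeholders, since the real SA_MAGIC and BSWAP_32/BSWAP_64 macros come from the ZFS headers.

/* Standalone illustration of magic-number based endianness detection. */
#include <stdint.h>
#include <stdio.h>

#define	EXAMPLE_MAGIC	0x2F505A55u	/* placeholder, not the real SA_MAGIC */

static uint32_t
bswap32(uint32_t x)
{
	return (((x & 0x000000ffu) << 24) | ((x & 0x0000ff00u) << 8) |
	    ((x & 0x00ff0000u) >> 8) | ((x & 0xff000000u) >> 24));
}

static uint64_t
bswap64(uint64_t x)
{
	return (((uint64_t)bswap32((uint32_t)x) << 32) | bswap32(x >> 32));
}

int
main(void)
{
	/* Simulate a record written with the opposite endianness. */
	struct { uint32_t magic; uint64_t uid; } hdr =
	    { bswap32(EXAMPLE_MAGIC), bswap64(1000) };
	int swap = 0;

	if (hdr.magic == bswap32(EXAMPLE_MAGIC))
		swap = 1;		/* fields must be swapped on extraction */
	else if (hdr.magic != EXAMPLE_MAGIC)
		return (1);		/* neither byte order matches: corrupt */

	uint64_t uid = swap ? bswap64(hdr.uid) : hdr.uid;
	printf("swap=%d uid=%llu\n", swap, (unsigned long long)uid);
	return (0);
}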
+
+static void
+fuidstr_to_sid(zfsvfs_t *zfsvfs, const char *fuidstr,
+ char *domainbuf, int buflen, uid_t *ridp)
+{
+ uint64_t fuid;
+ const char *domain;
+
+ fuid = zfs_strtonum(fuidstr, NULL);
+
+ domain = zfs_fuid_find_by_idx(zfsvfs, FUID_INDEX(fuid));
+ if (domain)
+ (void) strlcpy(domainbuf, domain, buflen);
+ else
+ domainbuf[0] = '\0';
+ *ridp = FUID_RID(fuid);
+}
+
+static uint64_t
+zfs_userquota_prop_to_obj(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type)
+{
+ switch (type) {
+ case ZFS_PROP_USERUSED:
+ case ZFS_PROP_USEROBJUSED:
+ return (DMU_USERUSED_OBJECT);
+ case ZFS_PROP_GROUPUSED:
+ case ZFS_PROP_GROUPOBJUSED:
+ return (DMU_GROUPUSED_OBJECT);
+ case ZFS_PROP_PROJECTUSED:
+ case ZFS_PROP_PROJECTOBJUSED:
+ return (DMU_PROJECTUSED_OBJECT);
+ case ZFS_PROP_USERQUOTA:
+ return (zfsvfs->z_userquota_obj);
+ case ZFS_PROP_GROUPQUOTA:
+ return (zfsvfs->z_groupquota_obj);
+ case ZFS_PROP_USEROBJQUOTA:
+ return (zfsvfs->z_userobjquota_obj);
+ case ZFS_PROP_GROUPOBJQUOTA:
+ return (zfsvfs->z_groupobjquota_obj);
+ case ZFS_PROP_PROJECTQUOTA:
+ return (zfsvfs->z_projectquota_obj);
+ case ZFS_PROP_PROJECTOBJQUOTA:
+ return (zfsvfs->z_projectobjquota_obj);
+ default:
+ return (ZFS_NO_OBJECT);
+ }
+}
+
+int
+zfs_userspace_many(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
+ uint64_t *cookiep, void *vbuf, uint64_t *bufsizep)
+{
+ int error;
+ zap_cursor_t zc;
+ zap_attribute_t za;
+ zfs_useracct_t *buf = vbuf;
+ uint64_t obj;
+ int offset = 0;
+
+ if (!dmu_objset_userspace_present(zfsvfs->z_os))
+ return (SET_ERROR(ENOTSUP));
+
+ if ((type == ZFS_PROP_PROJECTQUOTA || type == ZFS_PROP_PROJECTUSED ||
+ type == ZFS_PROP_PROJECTOBJQUOTA ||
+ type == ZFS_PROP_PROJECTOBJUSED) &&
+ !dmu_objset_projectquota_present(zfsvfs->z_os))
+ return (SET_ERROR(ENOTSUP));
+
+ if ((type == ZFS_PROP_USEROBJUSED || type == ZFS_PROP_GROUPOBJUSED ||
+ type == ZFS_PROP_USEROBJQUOTA || type == ZFS_PROP_GROUPOBJQUOTA ||
+ type == ZFS_PROP_PROJECTOBJUSED ||
+ type == ZFS_PROP_PROJECTOBJQUOTA) &&
+ !dmu_objset_userobjspace_present(zfsvfs->z_os))
+ return (SET_ERROR(ENOTSUP));
+
+ obj = zfs_userquota_prop_to_obj(zfsvfs, type);
+ if (obj == ZFS_NO_OBJECT) {
+ *bufsizep = 0;
+ return (0);
+ }
+
+ if (type == ZFS_PROP_USEROBJUSED || type == ZFS_PROP_GROUPOBJUSED ||
+ type == ZFS_PROP_PROJECTOBJUSED)
+ offset = DMU_OBJACCT_PREFIX_LEN;
+
+ for (zap_cursor_init_serialized(&zc, zfsvfs->z_os, obj, *cookiep);
+ (error = zap_cursor_retrieve(&zc, &za)) == 0;
+ zap_cursor_advance(&zc)) {
+ if ((uintptr_t)buf - (uintptr_t)vbuf + sizeof (zfs_useracct_t) >
+ *bufsizep)
+ break;
+
+ /*
+ * skip object quota (with zap name prefix DMU_OBJACCT_PREFIX)
+ * when dealing with block quota and vice versa.
+ */
+ if ((offset > 0) != (strncmp(za.za_name, DMU_OBJACCT_PREFIX,
+ DMU_OBJACCT_PREFIX_LEN) == 0))
+ continue;
+
+ fuidstr_to_sid(zfsvfs, za.za_name + offset,
+ buf->zu_domain, sizeof (buf->zu_domain), &buf->zu_rid);
+
+ buf->zu_space = za.za_first_integer;
+ buf++;
+ }
+ if (error == ENOENT)
+ error = 0;
+
+ ASSERT3U((uintptr_t)buf - (uintptr_t)vbuf, <=, *bufsizep);
+ *bufsizep = (uintptr_t)buf - (uintptr_t)vbuf;
+ *cookiep = zap_cursor_serialize(&zc);
+ zap_cursor_fini(&zc);
+ return (error);
+}
+
+int
+zfs_userspace_one(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
+ const char *domain, uint64_t rid, uint64_t *valp)
+{
+ char buf[20 + DMU_OBJACCT_PREFIX_LEN];
+ int offset = 0;
+ int err;
+ uint64_t obj;
+
+ *valp = 0;
+
+ if (!dmu_objset_userspace_present(zfsvfs->z_os))
+ return (SET_ERROR(ENOTSUP));
+
+ if ((type == ZFS_PROP_USEROBJUSED || type == ZFS_PROP_GROUPOBJUSED ||
+ type == ZFS_PROP_USEROBJQUOTA || type == ZFS_PROP_GROUPOBJQUOTA ||
+ type == ZFS_PROP_PROJECTOBJUSED ||
+ type == ZFS_PROP_PROJECTOBJQUOTA) &&
+ !dmu_objset_userobjspace_present(zfsvfs->z_os))
+ return (SET_ERROR(ENOTSUP));
+
+ if (type == ZFS_PROP_PROJECTQUOTA || type == ZFS_PROP_PROJECTUSED ||
+ type == ZFS_PROP_PROJECTOBJQUOTA ||
+ type == ZFS_PROP_PROJECTOBJUSED) {
+ if (!dmu_objset_projectquota_present(zfsvfs->z_os))
+ return (SET_ERROR(ENOTSUP));
+ if (!zpl_is_valid_projid(rid))
+ return (SET_ERROR(EINVAL));
+ }
+
+ obj = zfs_userquota_prop_to_obj(zfsvfs, type);
+ if (obj == ZFS_NO_OBJECT)
+ return (0);
+
+ if (type == ZFS_PROP_USEROBJUSED || type == ZFS_PROP_GROUPOBJUSED ||
+ type == ZFS_PROP_PROJECTOBJUSED) {
+ strlcpy(buf, DMU_OBJACCT_PREFIX, DMU_OBJACCT_PREFIX_LEN + 1);
+ offset = DMU_OBJACCT_PREFIX_LEN;
+ }
+
+ err = zfs_id_to_fuidstr(zfsvfs, domain, rid, buf + offset,
+ sizeof (buf) - offset, B_FALSE);
+ if (err)
+ return (err);
+
+ err = zap_lookup(zfsvfs->z_os, obj, buf, 8, 1, valp);
+ if (err == ENOENT)
+ err = 0;
+ return (err);
+}
+
+int
+zfs_set_userquota(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
+ const char *domain, uint64_t rid, uint64_t quota)
+{
+ char buf[32];
+ int err;
+ dmu_tx_t *tx;
+ uint64_t *objp;
+ boolean_t fuid_dirtied;
+
+ if (zfsvfs->z_version < ZPL_VERSION_USERSPACE)
+ return (SET_ERROR(ENOTSUP));
+
+ switch (type) {
+ case ZFS_PROP_USERQUOTA:
+ objp = &zfsvfs->z_userquota_obj;
+ break;
+ case ZFS_PROP_GROUPQUOTA:
+ objp = &zfsvfs->z_groupquota_obj;
+ break;
+ case ZFS_PROP_USEROBJQUOTA:
+ objp = &zfsvfs->z_userobjquota_obj;
+ break;
+ case ZFS_PROP_GROUPOBJQUOTA:
+ objp = &zfsvfs->z_groupobjquota_obj;
+ break;
+ case ZFS_PROP_PROJECTQUOTA:
+ if (!dmu_objset_projectquota_enabled(zfsvfs->z_os))
+ return (SET_ERROR(ENOTSUP));
+ if (!zpl_is_valid_projid(rid))
+ return (SET_ERROR(EINVAL));
+
+ objp = &zfsvfs->z_projectquota_obj;
+ break;
+ case ZFS_PROP_PROJECTOBJQUOTA:
+ if (!dmu_objset_projectquota_enabled(zfsvfs->z_os))
+ return (SET_ERROR(ENOTSUP));
+ if (!zpl_is_valid_projid(rid))
+ return (SET_ERROR(EINVAL));
+
+ objp = &zfsvfs->z_projectobjquota_obj;
+ break;
+ default:
+ return (SET_ERROR(EINVAL));
+ }
+
+ err = zfs_id_to_fuidstr(zfsvfs, domain, rid, buf, sizeof (buf), B_TRUE);
+ if (err)
+ return (err);
+ fuid_dirtied = zfsvfs->z_fuid_dirty;
+
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_zap(tx, *objp ? *objp : DMU_NEW_OBJECT, B_TRUE, NULL);
+ if (*objp == 0) {
+ dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE,
+ zfs_userquota_prop_prefixes[type]);
+ }
+ if (fuid_dirtied)
+ zfs_fuid_txhold(zfsvfs, tx);
+ err = dmu_tx_assign(tx, TXG_WAIT);
+ if (err) {
+ dmu_tx_abort(tx);
+ return (err);
+ }
+
+ mutex_enter(&zfsvfs->z_lock);
+ if (*objp == 0) {
+ *objp = zap_create(zfsvfs->z_os, DMU_OT_USERGROUP_QUOTA,
+ DMU_OT_NONE, 0, tx);
+ VERIFY(0 == zap_add(zfsvfs->z_os, MASTER_NODE_OBJ,
+ zfs_userquota_prop_prefixes[type], 8, 1, objp, tx));
+ }
+ mutex_exit(&zfsvfs->z_lock);
+
+ if (quota == 0) {
+ err = zap_remove(zfsvfs->z_os, *objp, buf, tx);
+ if (err == ENOENT)
+ err = 0;
+ } else {
+ err = zap_update(zfsvfs->z_os, *objp, buf, 8, 1, &quota, tx);
+ }
+ ASSERT(err == 0);
+ if (fuid_dirtied)
+ zfs_fuid_sync(zfsvfs, tx);
+ dmu_tx_commit(tx);
+ return (err);
+}
+
+boolean_t
+zfs_id_overobjquota(zfsvfs_t *zfsvfs, uint64_t usedobj, uint64_t id)
+{
+ char buf[20 + DMU_OBJACCT_PREFIX_LEN];
+ uint64_t used, quota, quotaobj;
+ int err;
+
+ if (!dmu_objset_userobjspace_present(zfsvfs->z_os)) {
+ if (dmu_objset_userobjspace_upgradable(zfsvfs->z_os)) {
+ dsl_pool_config_enter(
+ dmu_objset_pool(zfsvfs->z_os), FTAG);
+ dmu_objset_id_quota_upgrade(zfsvfs->z_os);
+ dsl_pool_config_exit(
+ dmu_objset_pool(zfsvfs->z_os), FTAG);
+ }
+ return (B_FALSE);
+ }
+
+ if (usedobj == DMU_PROJECTUSED_OBJECT) {
+ if (!dmu_objset_projectquota_present(zfsvfs->z_os)) {
+ if (dmu_objset_projectquota_upgradable(zfsvfs->z_os)) {
+ dsl_pool_config_enter(
+ dmu_objset_pool(zfsvfs->z_os), FTAG);
+ dmu_objset_id_quota_upgrade(zfsvfs->z_os);
+ dsl_pool_config_exit(
+ dmu_objset_pool(zfsvfs->z_os), FTAG);
+ }
+ return (B_FALSE);
+ }
+ quotaobj = zfsvfs->z_projectobjquota_obj;
+ } else if (usedobj == DMU_USERUSED_OBJECT) {
+ quotaobj = zfsvfs->z_userobjquota_obj;
+ } else if (usedobj == DMU_GROUPUSED_OBJECT) {
+ quotaobj = zfsvfs->z_groupobjquota_obj;
+ } else {
+ return (B_FALSE);
+ }
+ if (quotaobj == 0 || zfsvfs->z_replay)
+ return (B_FALSE);
+
+ (void) snprintf(buf, sizeof (buf), "%llx", (longlong_t)id);
+ err = zap_lookup(zfsvfs->z_os, quotaobj, buf, 8, 1, &quota);
+ if (err != 0)
+ return (B_FALSE);
+
+ (void) snprintf(buf, sizeof (buf), DMU_OBJACCT_PREFIX "%llx",
+ (longlong_t)id);
+ err = zap_lookup(zfsvfs->z_os, usedobj, buf, 8, 1, &used);
+ if (err != 0)
+ return (B_FALSE);
+ return (used >= quota);
+}
+
+boolean_t
+zfs_id_overblockquota(zfsvfs_t *zfsvfs, uint64_t usedobj, uint64_t id)
+{
+ char buf[20];
+ uint64_t used, quota, quotaobj;
+ int err;
+
+ if (usedobj == DMU_PROJECTUSED_OBJECT) {
+ if (!dmu_objset_projectquota_present(zfsvfs->z_os)) {
+ if (dmu_objset_projectquota_upgradable(zfsvfs->z_os)) {
+ dsl_pool_config_enter(
+ dmu_objset_pool(zfsvfs->z_os), FTAG);
+ dmu_objset_id_quota_upgrade(zfsvfs->z_os);
+ dsl_pool_config_exit(
+ dmu_objset_pool(zfsvfs->z_os), FTAG);
+ }
+ return (B_FALSE);
+ }
+ quotaobj = zfsvfs->z_projectquota_obj;
+ } else if (usedobj == DMU_USERUSED_OBJECT) {
+ quotaobj = zfsvfs->z_userquota_obj;
+ } else if (usedobj == DMU_GROUPUSED_OBJECT) {
+ quotaobj = zfsvfs->z_groupquota_obj;
+ } else {
+ return (B_FALSE);
+ }
+ if (quotaobj == 0 || zfsvfs->z_replay)
+ return (B_FALSE);
+
+ (void) snprintf(buf, sizeof (buf), "%llx", (longlong_t)id);
+ err = zap_lookup(zfsvfs->z_os, quotaobj, buf, 8, 1, &quota);
+ if (err != 0)
+ return (B_FALSE);
+
+ err = zap_lookup(zfsvfs->z_os, usedobj, buf, 8, 1, &used);
+ if (err != 0)
+ return (B_FALSE);
+ return (used >= quota);
+}
+
+boolean_t
+zfs_id_overquota(zfsvfs_t *zfsvfs, uint64_t usedobj, uint64_t id)
+{
+ return (zfs_id_overblockquota(zfsvfs, usedobj, id) ||
+ zfs_id_overobjquota(zfsvfs, usedobj, id));
+}
+
+EXPORT_SYMBOL(zpl_get_file_info);
+EXPORT_SYMBOL(zfs_userspace_one);
+EXPORT_SYMBOL(zfs_userspace_many);
+EXPORT_SYMBOL(zfs_set_userquota);
+EXPORT_SYMBOL(zfs_id_overblockquota);
+EXPORT_SYMBOL(zfs_id_overobjquota);
+EXPORT_SYMBOL(zfs_id_overquota);
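
zfs_id_overblockquota() and zfs_id_overobjquota() above look an ID up in the quota and usage ZAP objects under two key forms: the ID printed as "%llx" for block accounting, and the same string prefixed with DMU_OBJACCT_PREFIX for object-count accounting. The standalone sketch below builds both keys; the prefix string used here is a stand-in, since the real DMU_OBJACCT_PREFIX is defined in the DMU headers rather than in this file.

/* Standalone sketch of the two ZAP key forms used for quota lookups. */
#include <stdio.h>
#include <stdint.h>

/* Placeholder: the real prefix is DMU_OBJACCT_PREFIX from the DMU headers. */
#define	OBJACCT_PREFIX		"obj-"
#define	OBJACCT_PREFIX_LEN	(sizeof (OBJACCT_PREFIX) - 1)

int
main(void)
{
	uint64_t id = 1000;			/* uid, gid, or project id */
	char blockkey[20];			/* as in zfs_id_overblockquota() */
	char objkey[20 + OBJACCT_PREFIX_LEN];	/* as in zfs_id_overobjquota() */

	(void) snprintf(blockkey, sizeof (blockkey), "%llx",
	    (unsigned long long)id);
	(void) snprintf(objkey, sizeof (objkey), OBJACCT_PREFIX "%llx",
	    (unsigned long long)id);

	/* Each key would be looked up in the quota and used ZAP objects. */
	printf("block quota key:  %s\n", blockkey);
	printf("object quota key: %s\n", objkey);
	return (0);
}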
diff --git a/sys/contrib/openzfs/module/zfs/zfs_ratelimit.c b/sys/contrib/openzfs/module/zfs/zfs_ratelimit.c
new file mode 100644
index 000000000000..b18b480ce527
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/zfs_ratelimit.c
@@ -0,0 +1,99 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2017, Lawrence Livermore National Security, LLC.
+ */
+
+#include <sys/zfs_ratelimit.h>
+
+/*
+ * Initialize rate limit struct
+ *
+ * rl: zfs_ratelimit_t struct
+ * burst: Pointer to the number to allow in an interval before rate limiting
+ * interval: Interval time in seconds
+ */
+void
+zfs_ratelimit_init(zfs_ratelimit_t *rl, unsigned int *burst,
+ unsigned int interval)
+{
+ rl->count = 0;
+ rl->start = 0;
+ rl->interval = interval;
+ rl->burst = burst;
+ mutex_init(&rl->lock, NULL, MUTEX_DEFAULT, NULL);
+}
+
+/*
+ * Finalize rate limit struct
+ *
+ * rl: zfs_ratelimit_t struct
+ */
+void
+zfs_ratelimit_fini(zfs_ratelimit_t *rl)
+{
+ mutex_destroy(&rl->lock);
+}
+
+/*
+ * Re-implementation of the kernel's __ratelimit() function
+ *
+ * We had to write our own rate limiter because the kernel's __ratelimit()
+ * function annoyingly prints out how many times it rate limited to the kernel
+ * logs (and there's no way to turn it off):
+ *
+ * __ratelimit: 59 callbacks suppressed
+ *
+ * If the kernel ever allows us to disable these prints, we should go back to
+ * using __ratelimit() instead.
+ *
+ * Return values are the same as __ratelimit():
+ *
+ * 0: If we're rate limiting
+ * 1: If we're not rate limiting.
+ */
+int
+zfs_ratelimit(zfs_ratelimit_t *rl)
+{
+ hrtime_t now;
+
+ hrtime_t elapsed;
+ int error = 1;
+
+ mutex_enter(&rl->lock);
+
+ now = gethrtime();
+ elapsed = now - rl->start;
+
+ rl->count++;
+ if (NSEC2SEC(elapsed) >= rl->interval) {
+ rl->start = now;
+ rl->count = 0;
+ } else {
+ if (rl->count >= *rl->burst) {
+ error = 0; /* We're ratelimiting */
+ }
+ }
+ mutex_exit(&rl->lock);
+
+ return (error);
+}
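
The limiter above admits up to the burst count of events per interval seconds (the burst value is read through a pointer so it can track a tunable) and tells the caller whether it is currently being rate limited. Below is a minimal userspace analog under the assumption that clock_gettime(CLOCK_MONOTONIC) stands in for gethrtime(); the struct and function names are hypothetical and the reset logic is slightly simplified relative to the kernel version.

/* Userspace analog of the burst/interval limiter above (illustrative only). */
#include <stdio.h>
#include <time.h>

struct ratelimit {
	unsigned int count;	/* events seen in the current interval */
	time_t start;		/* start of the current interval */
	unsigned int interval;	/* interval length in seconds */
	unsigned int burst;	/* events allowed per interval */
};

/* Returns 1 if the event is allowed, 0 if it should be rate limited. */
static int
ratelimit_check(struct ratelimit *rl)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	if ((unsigned int)(ts.tv_sec - rl->start) >= rl->interval) {
		rl->start = ts.tv_sec;	/* new interval: reset the counter */
		rl->count = 0;
	}
	rl->count++;
	return (rl->count <= rl->burst);
}

int
main(void)
{
	struct timespec ts;
	struct ratelimit rl = { .interval = 5, .burst = 3 };

	clock_gettime(CLOCK_MONOTONIC, &ts);
	rl.start = ts.tv_sec;		/* begin the first interval now */

	for (int i = 0; i < 5; i++)
		printf("event %d: %s\n", i,
		    ratelimit_check(&rl) ? "allowed" : "rate limited");
	return (0);
}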
diff --git a/sys/contrib/openzfs/module/zfs/zfs_replay.c b/sys/contrib/openzfs/module/zfs/zfs_replay.c
new file mode 100644
index 000000000000..53c7dbd5df43
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/zfs_replay.c
@@ -0,0 +1,997 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012 Cyril Plisko. All rights reserved.
+ * Copyright (c) 2013, 2017 by Delphix. All rights reserved.
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/sysmacros.h>
+#include <sys/cmn_err.h>
+#include <sys/kmem.h>
+#include <sys/thread.h>
+#include <sys/file.h>
+#include <sys/fcntl.h>
+#include <sys/vfs.h>
+#include <sys/fs/zfs.h>
+#include <sys/zfs_znode.h>
+#include <sys/zfs_dir.h>
+#include <sys/zfs_acl.h>
+#include <sys/zfs_fuid.h>
+#include <sys/zfs_vnops.h>
+#include <sys/spa.h>
+#include <sys/zil.h>
+#include <sys/byteorder.h>
+#include <sys/stat.h>
+#include <sys/acl.h>
+#include <sys/atomic.h>
+#include <sys/cred.h>
+#include <sys/zpl.h>
+
+/*
+ * NB: FreeBSD expects to be able to do vnode locking in lookup and
+ * hold the locks across all subsequent VOPs until vput is called.
+ * This means that its zfs vnops routines can't do any internal locking.
+ * In order to have the same contract as the Linux vnops there would
+ * need to be duplicate locked vnops. If the vnops were used more widely
+ * in common code this would likely be preferable. However, currently
+ * this is the only file where this is the case.
+ */
+
+/*
+ * Functions to replay ZFS intent log (ZIL) records
+ * The functions are called through a function vector (zfs_replay_vector)
+ * which is indexed by the transaction type.
+ */
+
+static void
+zfs_init_vattr(vattr_t *vap, uint64_t mask, uint64_t mode,
+ uint64_t uid, uint64_t gid, uint64_t rdev, uint64_t nodeid)
+{
+ bzero(vap, sizeof (*vap));
+ vap->va_mask = (uint_t)mask;
+ vap->va_mode = mode;
+#ifdef __FreeBSD__
+ vap->va_type = IFTOVT(mode);
+#endif
+ vap->va_uid = (uid_t)(IS_EPHEMERAL(uid)) ? -1 : uid;
+ vap->va_gid = (gid_t)(IS_EPHEMERAL(gid)) ? -1 : gid;
+ vap->va_rdev = zfs_cmpldev(rdev);
+ vap->va_nodeid = nodeid;
+}
+
+/* ARGSUSED */
+static int
+zfs_replay_error(void *arg1, void *arg2, boolean_t byteswap)
+{
+ return (SET_ERROR(ENOTSUP));
+}
+
+static void
+zfs_replay_xvattr(lr_attr_t *lrattr, xvattr_t *xvap)
+{
+ xoptattr_t *xoap = NULL;
+ uint64_t *attrs;
+ uint64_t *crtime;
+ uint32_t *bitmap;
+ void *scanstamp;
+ int i;
+
+ xvap->xva_vattr.va_mask |= ATTR_XVATTR;
+ if ((xoap = xva_getxoptattr(xvap)) == NULL) {
+ xvap->xva_vattr.va_mask &= ~ATTR_XVATTR; /* shouldn't happen */
+ return;
+ }
+
+ ASSERT(lrattr->lr_attr_masksize == xvap->xva_mapsize);
+
+ bitmap = &lrattr->lr_attr_bitmap;
+ for (i = 0; i != lrattr->lr_attr_masksize; i++, bitmap++)
+ xvap->xva_reqattrmap[i] = *bitmap;
+
+ attrs = (uint64_t *)(lrattr + lrattr->lr_attr_masksize - 1);
+ crtime = attrs + 1;
+ scanstamp = (caddr_t)(crtime + 2);
+
+ if (XVA_ISSET_REQ(xvap, XAT_HIDDEN))
+ xoap->xoa_hidden = ((*attrs & XAT0_HIDDEN) != 0);
+ if (XVA_ISSET_REQ(xvap, XAT_SYSTEM))
+ xoap->xoa_system = ((*attrs & XAT0_SYSTEM) != 0);
+ if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE))
+ xoap->xoa_archive = ((*attrs & XAT0_ARCHIVE) != 0);
+ if (XVA_ISSET_REQ(xvap, XAT_READONLY))
+ xoap->xoa_readonly = ((*attrs & XAT0_READONLY) != 0);
+ if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE))
+ xoap->xoa_immutable = ((*attrs & XAT0_IMMUTABLE) != 0);
+ if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK))
+ xoap->xoa_nounlink = ((*attrs & XAT0_NOUNLINK) != 0);
+ if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY))
+ xoap->xoa_appendonly = ((*attrs & XAT0_APPENDONLY) != 0);
+ if (XVA_ISSET_REQ(xvap, XAT_NODUMP))
+ xoap->xoa_nodump = ((*attrs & XAT0_NODUMP) != 0);
+ if (XVA_ISSET_REQ(xvap, XAT_OPAQUE))
+ xoap->xoa_opaque = ((*attrs & XAT0_OPAQUE) != 0);
+ if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED))
+ xoap->xoa_av_modified = ((*attrs & XAT0_AV_MODIFIED) != 0);
+ if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED))
+ xoap->xoa_av_quarantined =
+ ((*attrs & XAT0_AV_QUARANTINED) != 0);
+ if (XVA_ISSET_REQ(xvap, XAT_CREATETIME))
+ ZFS_TIME_DECODE(&xoap->xoa_createtime, crtime);
+ if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) {
+ ASSERT(!XVA_ISSET_REQ(xvap, XAT_PROJID));
+
+ bcopy(scanstamp, xoap->xoa_av_scanstamp, AV_SCANSTAMP_SZ);
+ } else if (XVA_ISSET_REQ(xvap, XAT_PROJID)) {
+ /*
+ * XAT_PROJID and XAT_AV_SCANSTAMP will never be valid
+ * at the same time, so we can share the same space.
+ */
+ bcopy(scanstamp, &xoap->xoa_projid, sizeof (uint64_t));
+ }
+ if (XVA_ISSET_REQ(xvap, XAT_REPARSE))
+ xoap->xoa_reparse = ((*attrs & XAT0_REPARSE) != 0);
+ if (XVA_ISSET_REQ(xvap, XAT_OFFLINE))
+ xoap->xoa_offline = ((*attrs & XAT0_OFFLINE) != 0);
+ if (XVA_ISSET_REQ(xvap, XAT_SPARSE))
+ xoap->xoa_sparse = ((*attrs & XAT0_SPARSE) != 0);
+ if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT))
+ xoap->xoa_projinherit = ((*attrs & XAT0_PROJINHERIT) != 0);
+}
+
+static int
+zfs_replay_domain_cnt(uint64_t uid, uint64_t gid)
+{
+ uint64_t uid_idx;
+ uint64_t gid_idx;
+ int domcnt = 0;
+
+ uid_idx = FUID_INDEX(uid);
+ gid_idx = FUID_INDEX(gid);
+ if (uid_idx)
+ domcnt++;
+ if (gid_idx > 0 && gid_idx != uid_idx)
+ domcnt++;
+
+ return (domcnt);
+}
+
+static void *
+zfs_replay_fuid_domain_common(zfs_fuid_info_t *fuid_infop, void *start,
+ int domcnt)
+{
+ int i;
+
+ for (i = 0; i != domcnt; i++) {
+ fuid_infop->z_domain_table[i] = start;
+ start = (caddr_t)start + strlen(start) + 1;
+ }
+
+ return (start);
+}
+
+/*
+ * Set the uid/gid in the fuid_info structure.
+ */
+static void
+zfs_replay_fuid_ugid(zfs_fuid_info_t *fuid_infop, uint64_t uid, uint64_t gid)
+{
+ /*
+ * If owner or group are log specific FUIDs then slurp up
+ * domain information and build zfs_fuid_info_t
+ */
+ if (IS_EPHEMERAL(uid))
+ fuid_infop->z_fuid_owner = uid;
+
+ if (IS_EPHEMERAL(gid))
+ fuid_infop->z_fuid_group = gid;
+}
+
+/*
+ * Load fuid domains into fuid_info_t
+ */
+static zfs_fuid_info_t *
+zfs_replay_fuid_domain(void *buf, void **end, uint64_t uid, uint64_t gid)
+{
+ int domcnt;
+
+ zfs_fuid_info_t *fuid_infop;
+
+ fuid_infop = zfs_fuid_info_alloc();
+
+ domcnt = zfs_replay_domain_cnt(uid, gid);
+
+ if (domcnt == 0)
+ return (fuid_infop);
+
+ fuid_infop->z_domain_table =
+ kmem_zalloc(domcnt * sizeof (char *), KM_SLEEP);
+
+ zfs_replay_fuid_ugid(fuid_infop, uid, gid);
+
+ fuid_infop->z_domain_cnt = domcnt;
+ *end = zfs_replay_fuid_domain_common(fuid_infop, buf, domcnt);
+ return (fuid_infop);
+}
+
+/*
+ * load zfs_fuid_t's and fuid_domains into fuid_info_t
+ */
+static zfs_fuid_info_t *
+zfs_replay_fuids(void *start, void **end, int idcnt, int domcnt, uint64_t uid,
+ uint64_t gid)
+{
+ uint64_t *log_fuid = (uint64_t *)start;
+ zfs_fuid_info_t *fuid_infop;
+ int i;
+
+ fuid_infop = zfs_fuid_info_alloc();
+ fuid_infop->z_domain_cnt = domcnt;
+
+ fuid_infop->z_domain_table =
+ kmem_zalloc(domcnt * sizeof (char *), KM_SLEEP);
+
+ for (i = 0; i != idcnt; i++) {
+ zfs_fuid_t *zfuid;
+
+ zfuid = kmem_alloc(sizeof (zfs_fuid_t), KM_SLEEP);
+ zfuid->z_logfuid = *log_fuid;
+ zfuid->z_id = -1;
+ zfuid->z_domidx = 0;
+ list_insert_tail(&fuid_infop->z_fuids, zfuid);
+ log_fuid++;
+ }
+
+ zfs_replay_fuid_ugid(fuid_infop, uid, gid);
+
+ *end = zfs_replay_fuid_domain_common(fuid_infop, log_fuid, domcnt);
+ return (fuid_infop);
+}
+
+static void
+zfs_replay_swap_attrs(lr_attr_t *lrattr)
+{
+ /* swap the lr_attr structure */
+ byteswap_uint32_array(lrattr, sizeof (*lrattr));
+ /* swap the bitmap */
+ byteswap_uint32_array(lrattr + 1, (lrattr->lr_attr_masksize - 1) *
+ sizeof (uint32_t));
+ /* swap the attributes, create time + 64 bit word for attributes */
+ byteswap_uint64_array((caddr_t)(lrattr + 1) + (sizeof (uint32_t) *
+ (lrattr->lr_attr_masksize - 1)), 3 * sizeof (uint64_t));
+}
+
+/*
+ * Replay file create with optional ACL, xvattr information as well
+ * as optional FUID information.
+ */
+static int
+zfs_replay_create_acl(void *arg1, void *arg2, boolean_t byteswap)
+{
+ zfsvfs_t *zfsvfs = arg1;
+ lr_acl_create_t *lracl = arg2;
+ char *name = NULL; /* location determined later */
+ lr_create_t *lr = (lr_create_t *)lracl;
+ znode_t *dzp;
+ znode_t *zp;
+ xvattr_t xva;
+ int vflg = 0;
+ vsecattr_t vsec = { 0 };
+ lr_attr_t *lrattr;
+ void *aclstart;
+ void *fuidstart;
+ size_t xvatlen = 0;
+ uint64_t txtype;
+ uint64_t objid;
+ uint64_t dnodesize;
+ int error;
+
+ txtype = (lr->lr_common.lrc_txtype & ~TX_CI);
+ if (byteswap) {
+ byteswap_uint64_array(lracl, sizeof (*lracl));
+ if (txtype == TX_CREATE_ACL_ATTR ||
+ txtype == TX_MKDIR_ACL_ATTR) {
+ lrattr = (lr_attr_t *)(caddr_t)(lracl + 1);
+ zfs_replay_swap_attrs(lrattr);
+ xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize);
+ }
+
+ aclstart = (caddr_t)(lracl + 1) + xvatlen;
+ zfs_ace_byteswap(aclstart, lracl->lr_acl_bytes, B_FALSE);
+ /* swap fuids */
+ if (lracl->lr_fuidcnt) {
+ byteswap_uint64_array((caddr_t)aclstart +
+ ZIL_ACE_LENGTH(lracl->lr_acl_bytes),
+ lracl->lr_fuidcnt * sizeof (uint64_t));
+ }
+ }
+
+ if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0)
+ return (error);
+
+ objid = LR_FOID_GET_OBJ(lr->lr_foid);
+ dnodesize = LR_FOID_GET_SLOTS(lr->lr_foid) << DNODE_SHIFT;
+
+ xva_init(&xva);
+ zfs_init_vattr(&xva.xva_vattr, ATTR_MODE | ATTR_UID | ATTR_GID,
+ lr->lr_mode, lr->lr_uid, lr->lr_gid, lr->lr_rdev, objid);
+
+ /*
+ * All forms of zfs create (create, mkdir, mkxattrdir, symlink)
+ * eventually end up in zfs_mknode(), which assigns the object's
+ * creation time, generation number, and dnode size. The generic
+ * zfs_create() has no concept of these attributes, so we smuggle
+ * the values inside the vattr's otherwise unused va_ctime,
+ * va_nblocks, and va_fsid fields.
+ */
+ ZFS_TIME_DECODE(&xva.xva_vattr.va_ctime, lr->lr_crtime);
+ xva.xva_vattr.va_nblocks = lr->lr_gen;
+ xva.xva_vattr.va_fsid = dnodesize;
+
+ error = dnode_try_claim(zfsvfs->z_os, objid, dnodesize >> DNODE_SHIFT);
+ if (error)
+ goto bail;
+
+ if (lr->lr_common.lrc_txtype & TX_CI)
+ vflg |= FIGNORECASE;
+ switch (txtype) {
+ case TX_CREATE_ACL:
+ aclstart = (caddr_t)(lracl + 1);
+ fuidstart = (caddr_t)aclstart +
+ ZIL_ACE_LENGTH(lracl->lr_acl_bytes);
+ zfsvfs->z_fuid_replay = zfs_replay_fuids(fuidstart,
+ (void *)&name, lracl->lr_fuidcnt, lracl->lr_domcnt,
+ lr->lr_uid, lr->lr_gid);
+ /*FALLTHROUGH*/
+ case TX_CREATE_ACL_ATTR:
+ if (name == NULL) {
+ lrattr = (lr_attr_t *)(caddr_t)(lracl + 1);
+ xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize);
+ xva.xva_vattr.va_mask |= ATTR_XVATTR;
+ zfs_replay_xvattr(lrattr, &xva);
+ }
+ vsec.vsa_mask = VSA_ACE | VSA_ACE_ACLFLAGS;
+ vsec.vsa_aclentp = (caddr_t)(lracl + 1) + xvatlen;
+ vsec.vsa_aclcnt = lracl->lr_aclcnt;
+ vsec.vsa_aclentsz = lracl->lr_acl_bytes;
+ vsec.vsa_aclflags = lracl->lr_acl_flags;
+ if (zfsvfs->z_fuid_replay == NULL) {
+ fuidstart = (caddr_t)(lracl + 1) + xvatlen +
+ ZIL_ACE_LENGTH(lracl->lr_acl_bytes);
+ zfsvfs->z_fuid_replay =
+ zfs_replay_fuids(fuidstart,
+ (void *)&name, lracl->lr_fuidcnt, lracl->lr_domcnt,
+ lr->lr_uid, lr->lr_gid);
+ }
+
+ error = zfs_create(dzp, name, &xva.xva_vattr,
+ 0, 0, &zp, kcred, vflg, &vsec);
+ break;
+ case TX_MKDIR_ACL:
+ aclstart = (caddr_t)(lracl + 1);
+ fuidstart = (caddr_t)aclstart +
+ ZIL_ACE_LENGTH(lracl->lr_acl_bytes);
+ zfsvfs->z_fuid_replay = zfs_replay_fuids(fuidstart,
+ (void *)&name, lracl->lr_fuidcnt, lracl->lr_domcnt,
+ lr->lr_uid, lr->lr_gid);
+ /*FALLTHROUGH*/
+ case TX_MKDIR_ACL_ATTR:
+ if (name == NULL) {
+ lrattr = (lr_attr_t *)(caddr_t)(lracl + 1);
+ xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize);
+ zfs_replay_xvattr(lrattr, &xva);
+ }
+ vsec.vsa_mask = VSA_ACE | VSA_ACE_ACLFLAGS;
+ vsec.vsa_aclentp = (caddr_t)(lracl + 1) + xvatlen;
+ vsec.vsa_aclcnt = lracl->lr_aclcnt;
+ vsec.vsa_aclentsz = lracl->lr_acl_bytes;
+ vsec.vsa_aclflags = lracl->lr_acl_flags;
+ if (zfsvfs->z_fuid_replay == NULL) {
+ fuidstart = (caddr_t)(lracl + 1) + xvatlen +
+ ZIL_ACE_LENGTH(lracl->lr_acl_bytes);
+ zfsvfs->z_fuid_replay =
+ zfs_replay_fuids(fuidstart,
+ (void *)&name, lracl->lr_fuidcnt, lracl->lr_domcnt,
+ lr->lr_uid, lr->lr_gid);
+ }
+ error = zfs_mkdir(dzp, name, &xva.xva_vattr,
+ &zp, kcred, vflg, &vsec);
+ break;
+ default:
+ error = SET_ERROR(ENOTSUP);
+ }
+
+bail:
+ if (error == 0 && zp != NULL) {
+#ifdef __FreeBSD__
+ VOP_UNLOCK1(ZTOV(zp));
+#endif
+ zrele(zp);
+ }
+ zrele(dzp);
+
+ if (zfsvfs->z_fuid_replay)
+ zfs_fuid_info_free(zfsvfs->z_fuid_replay);
+ zfsvfs->z_fuid_replay = NULL;
+
+ return (error);
+}
+
+static int
+zfs_replay_create(void *arg1, void *arg2, boolean_t byteswap)
+{
+ zfsvfs_t *zfsvfs = arg1;
+ lr_create_t *lr = arg2;
+ char *name = NULL; /* location determined later */
+ char *link; /* symlink content follows name */
+ znode_t *dzp;
+ znode_t *zp = NULL;
+ xvattr_t xva;
+ int vflg = 0;
+ size_t lrsize = sizeof (lr_create_t);
+ lr_attr_t *lrattr;
+ void *start;
+ size_t xvatlen;
+ uint64_t txtype;
+ uint64_t objid;
+ uint64_t dnodesize;
+ int error;
+
+ txtype = (lr->lr_common.lrc_txtype & ~TX_CI);
+ if (byteswap) {
+ byteswap_uint64_array(lr, sizeof (*lr));
+ if (txtype == TX_CREATE_ATTR || txtype == TX_MKDIR_ATTR)
+ zfs_replay_swap_attrs((lr_attr_t *)(lr + 1));
+ }
+
+
+ if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0)
+ return (error);
+
+ objid = LR_FOID_GET_OBJ(lr->lr_foid);
+ dnodesize = LR_FOID_GET_SLOTS(lr->lr_foid) << DNODE_SHIFT;
+
+ xva_init(&xva);
+ zfs_init_vattr(&xva.xva_vattr, ATTR_MODE | ATTR_UID | ATTR_GID,
+ lr->lr_mode, lr->lr_uid, lr->lr_gid, lr->lr_rdev, objid);
+
+ /*
+ * All forms of zfs create (create, mkdir, mkxattrdir, symlink)
+ * eventually end up in zfs_mknode(), which assigns the object's
+ * creation time, generation number, and dnode slot count. The
+ * generic zfs_create() has no concept of these attributes, so
+ * we smuggle the values inside the vattr's otherwise unused
+ * va_ctime, va_nblocks, and va_fsid fields.
+ */
+ ZFS_TIME_DECODE(&xva.xva_vattr.va_ctime, lr->lr_crtime);
+ xva.xva_vattr.va_nblocks = lr->lr_gen;
+ xva.xva_vattr.va_fsid = dnodesize;
+
+ error = dnode_try_claim(zfsvfs->z_os, objid, dnodesize >> DNODE_SHIFT);
+ if (error)
+ goto out;
+
+ if (lr->lr_common.lrc_txtype & TX_CI)
+ vflg |= FIGNORECASE;
+
+ /*
+ * Symlinks don't have fuid info, and CIFS never creates
+ * symlinks.
+ *
+ * The _ATTR versions will grab the fuid info in their subcases.
+ */
+ if ((int)lr->lr_common.lrc_txtype != TX_SYMLINK &&
+ (int)lr->lr_common.lrc_txtype != TX_MKDIR_ATTR &&
+ (int)lr->lr_common.lrc_txtype != TX_CREATE_ATTR) {
+ start = (lr + 1);
+ zfsvfs->z_fuid_replay =
+ zfs_replay_fuid_domain(start, &start,
+ lr->lr_uid, lr->lr_gid);
+ }
+
+ switch (txtype) {
+ case TX_CREATE_ATTR:
+ lrattr = (lr_attr_t *)(caddr_t)(lr + 1);
+ xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize);
+ zfs_replay_xvattr((lr_attr_t *)((caddr_t)lr + lrsize), &xva);
+ start = (caddr_t)(lr + 1) + xvatlen;
+ zfsvfs->z_fuid_replay =
+ zfs_replay_fuid_domain(start, &start,
+ lr->lr_uid, lr->lr_gid);
+ name = (char *)start;
+
+ /*FALLTHROUGH*/
+ case TX_CREATE:
+ if (name == NULL)
+ name = (char *)start;
+
+ error = zfs_create(dzp, name, &xva.xva_vattr,
+ 0, 0, &zp, kcred, vflg, NULL);
+ break;
+ case TX_MKDIR_ATTR:
+ lrattr = (lr_attr_t *)(caddr_t)(lr + 1);
+ xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize);
+ zfs_replay_xvattr((lr_attr_t *)((caddr_t)lr + lrsize), &xva);
+ start = (caddr_t)(lr + 1) + xvatlen;
+ zfsvfs->z_fuid_replay =
+ zfs_replay_fuid_domain(start, &start,
+ lr->lr_uid, lr->lr_gid);
+ name = (char *)start;
+
+ /*FALLTHROUGH*/
+ case TX_MKDIR:
+ if (name == NULL)
+ name = (char *)(lr + 1);
+
+ error = zfs_mkdir(dzp, name, &xva.xva_vattr,
+ &zp, kcred, vflg, NULL);
+ break;
+ case TX_MKXATTR:
+ error = zfs_make_xattrdir(dzp, &xva.xva_vattr, &zp, kcred);
+ break;
+ case TX_SYMLINK:
+ name = (char *)(lr + 1);
+ link = name + strlen(name) + 1;
+ error = zfs_symlink(dzp, name, &xva.xva_vattr,
+ link, &zp, kcred, vflg);
+ break;
+ default:
+ error = SET_ERROR(ENOTSUP);
+ }
+
+out:
+ if (error == 0 && zp != NULL) {
+#ifdef __FreeBSD__
+ VOP_UNLOCK1(ZTOV(zp));
+#endif
+ zrele(zp);
+ }
+ zrele(dzp);
+
+ if (zfsvfs->z_fuid_replay)
+ zfs_fuid_info_free(zfsvfs->z_fuid_replay);
+ zfsvfs->z_fuid_replay = NULL;
+ return (error);
+}
+
+static int
+zfs_replay_remove(void *arg1, void *arg2, boolean_t byteswap)
+{
+ zfsvfs_t *zfsvfs = arg1;
+ lr_remove_t *lr = arg2;
+ char *name = (char *)(lr + 1); /* name follows lr_remove_t */
+ znode_t *dzp;
+ int error;
+ int vflg = 0;
+
+ if (byteswap)
+ byteswap_uint64_array(lr, sizeof (*lr));
+
+ if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0)
+ return (error);
+
+ if (lr->lr_common.lrc_txtype & TX_CI)
+ vflg |= FIGNORECASE;
+
+ switch ((int)lr->lr_common.lrc_txtype) {
+ case TX_REMOVE:
+ error = zfs_remove(dzp, name, kcred, vflg);
+ break;
+ case TX_RMDIR:
+ error = zfs_rmdir(dzp, name, NULL, kcred, vflg);
+ break;
+ default:
+ error = SET_ERROR(ENOTSUP);
+ }
+
+ zrele(dzp);
+
+ return (error);
+}
+
+static int
+zfs_replay_link(void *arg1, void *arg2, boolean_t byteswap)
+{
+ zfsvfs_t *zfsvfs = arg1;
+ lr_link_t *lr = arg2;
+ char *name = (char *)(lr + 1); /* name follows lr_link_t */
+ znode_t *dzp, *zp;
+ int error;
+ int vflg = 0;
+
+ if (byteswap)
+ byteswap_uint64_array(lr, sizeof (*lr));
+
+ if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0)
+ return (error);
+
+ if ((error = zfs_zget(zfsvfs, lr->lr_link_obj, &zp)) != 0) {
+ zrele(dzp);
+ return (error);
+ }
+
+ if (lr->lr_common.lrc_txtype & TX_CI)
+ vflg |= FIGNORECASE;
+
+ error = zfs_link(dzp, zp, name, kcred, vflg);
+ zrele(zp);
+ zrele(dzp);
+
+ return (error);
+}
+
+static int
+zfs_replay_rename(void *arg1, void *arg2, boolean_t byteswap)
+{
+ zfsvfs_t *zfsvfs = arg1;
+ lr_rename_t *lr = arg2;
+ char *sname = (char *)(lr + 1); /* sname and tname follow lr_rename_t */
+ char *tname = sname + strlen(sname) + 1;
+ znode_t *sdzp, *tdzp;
+ int error;
+ int vflg = 0;
+
+ if (byteswap)
+ byteswap_uint64_array(lr, sizeof (*lr));
+
+ if ((error = zfs_zget(zfsvfs, lr->lr_sdoid, &sdzp)) != 0)
+ return (error);
+
+ if ((error = zfs_zget(zfsvfs, lr->lr_tdoid, &tdzp)) != 0) {
+ zrele(sdzp);
+ return (error);
+ }
+
+ if (lr->lr_common.lrc_txtype & TX_CI)
+ vflg |= FIGNORECASE;
+
+ error = zfs_rename(sdzp, sname, tdzp, tname, kcred, vflg);
+
+ zrele(tdzp);
+ zrele(sdzp);
+ return (error);
+}
+
+static int
+zfs_replay_write(void *arg1, void *arg2, boolean_t byteswap)
+{
+ zfsvfs_t *zfsvfs = arg1;
+ lr_write_t *lr = arg2;
+ char *data = (char *)(lr + 1); /* data follows lr_write_t */
+ znode_t *zp;
+ int error;
+ uint64_t eod, offset, length;
+
+ if (byteswap)
+ byteswap_uint64_array(lr, sizeof (*lr));
+
+ if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) {
+ /*
+ * As we can log writes out of order, it's possible the
+ * file has been removed. In this case just drop the write
+ * and return success.
+ */
+ if (error == ENOENT)
+ error = 0;
+ return (error);
+ }
+
+ offset = lr->lr_offset;
+ length = lr->lr_length;
+ eod = offset + length; /* end of data for this write */
+
+ /*
+ * This may be a write from a dmu_sync() for a whole block,
+ * and may extend beyond the current end of the file.
+ * We can't just replay what was written for this TX_WRITE as
+ * a future TX_WRITE2 may extend the eof and the data for that
+ * write needs to be there. So we write the whole block and
+ * reduce the eof. This needs to be done within the single dmu
+ * transaction created within vn_rdwr -> zfs_write. So a possible
+ * new end of file is passed through in zfsvfs->z_replay_eof
+ */
+
+ zfsvfs->z_replay_eof = 0; /* 0 means don't change end of file */
+
+ /* If it's a dmu_sync() block, write the whole block */
+ if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) {
+ uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr);
+ if (length < blocksize) {
+ offset -= offset % blocksize;
+ length = blocksize;
+ }
+ if (zp->z_size < eod)
+ zfsvfs->z_replay_eof = eod;
+ }
+ error = zfs_write_simple(zp, data, length, offset, NULL);
+ zrele(zp);
+ zfsvfs->z_replay_eof = 0; /* safety */
+
+ return (error);
+}
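
As the comment inside zfs_replay_write() explains, a record produced by dmu_sync() is replayed as a whole block: the offset is rounded down to a block boundary, the length is widened to the block size, and the real end of data is carried separately through z_replay_eof so the file size can be clamped afterwards. The following standalone sketch shows just that rounding arithmetic, with example numbers.

/* Standalone sketch of rounding a partial write to a whole block. */
#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	uint64_t blocksize = 131072;		/* 128K record, for example */
	uint64_t offset = 131072 * 3 + 4096;	/* write starts mid-block */
	uint64_t length = 512;
	uint64_t eod = offset + length;		/* logical end of this write */

	if (length < blocksize) {
		offset -= offset % blocksize;	/* back up to the block start */
		length = blocksize;		/* replay the whole block */
	}

	printf("replay offset=%llu length=%llu, file size clamped to %llu\n",
	    (unsigned long long)offset, (unsigned long long)length,
	    (unsigned long long)eod);
	return (0);
}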
+
+/*
+ * TX_WRITE2 records are only generated when dmu_sync() returns EALREADY,
+ * meaning the pool block is already being synced. So now that we always write
+ * out full blocks, all we have to do is expand the eof if
+ * the file is grown.
+ */
+static int
+zfs_replay_write2(void *arg1, void *arg2, boolean_t byteswap)
+{
+ zfsvfs_t *zfsvfs = arg1;
+ lr_write_t *lr = arg2;
+ znode_t *zp;
+ int error;
+ uint64_t end;
+
+ if (byteswap)
+ byteswap_uint64_array(lr, sizeof (*lr));
+
+ if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0)
+ return (error);
+
+top:
+ end = lr->lr_offset + lr->lr_length;
+ if (end > zp->z_size) {
+ dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
+
+ zp->z_size = end;
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ zrele(zp);
+ if (error == ERESTART) {
+ dmu_tx_wait(tx);
+ dmu_tx_abort(tx);
+ goto top;
+ }
+ dmu_tx_abort(tx);
+ return (error);
+ }
+ (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
+ (void *)&zp->z_size, sizeof (uint64_t), tx);
+
+ /* Ensure the replayed seq is updated */
+ (void) zil_replaying(zfsvfs->z_log, tx);
+
+ dmu_tx_commit(tx);
+ }
+
+ zrele(zp);
+
+ return (error);
+}
+
+static int
+zfs_replay_truncate(void *arg1, void *arg2, boolean_t byteswap)
+{
+ zfsvfs_t *zfsvfs = arg1;
+ lr_truncate_t *lr = arg2;
+ znode_t *zp;
+ flock64_t fl;
+ int error;
+
+ if (byteswap)
+ byteswap_uint64_array(lr, sizeof (*lr));
+
+ if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0)
+ return (error);
+
+ bzero(&fl, sizeof (fl));
+ fl.l_type = F_WRLCK;
+ fl.l_whence = SEEK_SET;
+ fl.l_start = lr->lr_offset;
+ fl.l_len = lr->lr_length;
+
+ error = zfs_space(zp, F_FREESP, &fl, O_RDWR | O_LARGEFILE,
+ lr->lr_offset, kcred);
+
+ zrele(zp);
+
+ return (error);
+}
+
+static int
+zfs_replay_setattr(void *arg1, void *arg2, boolean_t byteswap)
+{
+ zfsvfs_t *zfsvfs = arg1;
+ lr_setattr_t *lr = arg2;
+ znode_t *zp;
+ xvattr_t xva;
+ vattr_t *vap = &xva.xva_vattr;
+ int error;
+ void *start;
+
+ xva_init(&xva);
+ if (byteswap) {
+ byteswap_uint64_array(lr, sizeof (*lr));
+
+ if ((lr->lr_mask & ATTR_XVATTR) &&
+ zfsvfs->z_version >= ZPL_VERSION_INITIAL)
+ zfs_replay_swap_attrs((lr_attr_t *)(lr + 1));
+ }
+
+ if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0)
+ return (error);
+
+ zfs_init_vattr(vap, lr->lr_mask, lr->lr_mode,
+ lr->lr_uid, lr->lr_gid, 0, lr->lr_foid);
+
+ vap->va_size = lr->lr_size;
+ ZFS_TIME_DECODE(&vap->va_atime, lr->lr_atime);
+ ZFS_TIME_DECODE(&vap->va_mtime, lr->lr_mtime);
+ gethrestime(&vap->va_ctime);
+ vap->va_mask |= ATTR_CTIME;
+
+ /*
+ * Fill in xvattr_t portions if necessary.
+ */
+
+ start = (lr_setattr_t *)(lr + 1);
+ if (vap->va_mask & ATTR_XVATTR) {
+ zfs_replay_xvattr((lr_attr_t *)start, &xva);
+ start = (caddr_t)start +
+ ZIL_XVAT_SIZE(((lr_attr_t *)start)->lr_attr_masksize);
+ } else
+ xva.xva_vattr.va_mask &= ~ATTR_XVATTR;
+
+ zfsvfs->z_fuid_replay = zfs_replay_fuid_domain(start, &start,
+ lr->lr_uid, lr->lr_gid);
+
+ /*
+ * Satisfy assertions.
+ */
+ vn_seqc_write_begin(ZTOV(zp));
+ error = zfs_setattr(zp, vap, 0, kcred);
+ vn_seqc_write_end(ZTOV(zp));
+
+ zfs_fuid_info_free(zfsvfs->z_fuid_replay);
+ zfsvfs->z_fuid_replay = NULL;
+ zrele(zp);
+
+ return (error);
+}
+
+static int
+zfs_replay_acl_v0(void *arg1, void *arg2, boolean_t byteswap)
+{
+ zfsvfs_t *zfsvfs = arg1;
+ lr_acl_v0_t *lr = arg2;
+ ace_t *ace = (ace_t *)(lr + 1); /* ace array follows lr_acl_t */
+ vsecattr_t vsa;
+ znode_t *zp;
+ int error;
+
+ if (byteswap) {
+ byteswap_uint64_array(lr, sizeof (*lr));
+ zfs_oldace_byteswap(ace, lr->lr_aclcnt);
+ }
+
+ if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0)
+ return (error);
+
+ bzero(&vsa, sizeof (vsa));
+ vsa.vsa_mask = VSA_ACE | VSA_ACECNT;
+ vsa.vsa_aclcnt = lr->lr_aclcnt;
+ vsa.vsa_aclentsz = sizeof (ace_t) * vsa.vsa_aclcnt;
+ vsa.vsa_aclflags = 0;
+ vsa.vsa_aclentp = ace;
+
+ error = zfs_setsecattr(zp, &vsa, 0, kcred);
+
+ zrele(zp);
+
+ return (error);
+}
+
+/*
+ * Replaying ACLs is complicated by FUID support.
+ * The log record may contain some optional data
+ * to be used for replaying FUIDs. These pieces
+ * are the actual FUIDs that were created initially.
+ * The FUID table index may no longer be valid and
+ * during zfs_create() a new index may be assigned.
+ * Because of this the log will contain the original
+ * domain+rid in order to create a new FUID.
+ *
+ * The individual ACEs may contain an ephemeral uid/gid which is no
+ * longer valid and will need to be replaced with an actual FUID.
+ *
+ */
+static int
+zfs_replay_acl(void *arg1, void *arg2, boolean_t byteswap)
+{
+ zfsvfs_t *zfsvfs = arg1;
+ lr_acl_t *lr = arg2;
+ ace_t *ace = (ace_t *)(lr + 1);
+ vsecattr_t vsa;
+ znode_t *zp;
+ int error;
+
+ if (byteswap) {
+ byteswap_uint64_array(lr, sizeof (*lr));
+ zfs_ace_byteswap(ace, lr->lr_acl_bytes, B_FALSE);
+ if (lr->lr_fuidcnt) {
+ byteswap_uint64_array((caddr_t)ace +
+ ZIL_ACE_LENGTH(lr->lr_acl_bytes),
+ lr->lr_fuidcnt * sizeof (uint64_t));
+ }
+ }
+
+ if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0)
+ return (error);
+
+ bzero(&vsa, sizeof (vsa));
+ vsa.vsa_mask = VSA_ACE | VSA_ACECNT | VSA_ACE_ACLFLAGS;
+ vsa.vsa_aclcnt = lr->lr_aclcnt;
+ vsa.vsa_aclentp = ace;
+ vsa.vsa_aclentsz = lr->lr_acl_bytes;
+ vsa.vsa_aclflags = lr->lr_acl_flags;
+
+ if (lr->lr_fuidcnt) {
+ void *fuidstart = (caddr_t)ace +
+ ZIL_ACE_LENGTH(lr->lr_acl_bytes);
+
+ zfsvfs->z_fuid_replay =
+ zfs_replay_fuids(fuidstart, &fuidstart,
+ lr->lr_fuidcnt, lr->lr_domcnt, 0, 0);
+ }
+
+ error = zfs_setsecattr(zp, &vsa, 0, kcred);
+
+ if (zfsvfs->z_fuid_replay)
+ zfs_fuid_info_free(zfsvfs->z_fuid_replay);
+
+ zfsvfs->z_fuid_replay = NULL;
+ zrele(zp);
+
+ return (error);
+}
+
+/*
+ * Callback vectors for replaying records
+ */
+zil_replay_func_t *zfs_replay_vector[TX_MAX_TYPE] = {
+ zfs_replay_error, /* no such type */
+ zfs_replay_create, /* TX_CREATE */
+ zfs_replay_create, /* TX_MKDIR */
+ zfs_replay_create, /* TX_MKXATTR */
+ zfs_replay_create, /* TX_SYMLINK */
+ zfs_replay_remove, /* TX_REMOVE */
+ zfs_replay_remove, /* TX_RMDIR */
+ zfs_replay_link, /* TX_LINK */
+ zfs_replay_rename, /* TX_RENAME */
+ zfs_replay_write, /* TX_WRITE */
+ zfs_replay_truncate, /* TX_TRUNCATE */
+ zfs_replay_setattr, /* TX_SETATTR */
+ zfs_replay_acl_v0, /* TX_ACL_V0 */
+ zfs_replay_acl, /* TX_ACL */
+ zfs_replay_create_acl, /* TX_CREATE_ACL */
+ zfs_replay_create, /* TX_CREATE_ATTR */
+ zfs_replay_create_acl, /* TX_CREATE_ACL_ATTR */
+ zfs_replay_create_acl, /* TX_MKDIR_ACL */
+ zfs_replay_create, /* TX_MKDIR_ATTR */
+ zfs_replay_create_acl, /* TX_MKDIR_ACL_ATTR */
+ zfs_replay_write2, /* TX_WRITE2 */
+};
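
The vector above dispatches each ZIL record to its handler by indexing with the transaction type. The standalone sketch below shows the same function-vector pattern with a made-up, much smaller set of record types; names such as EX_TX_CREATE are hypothetical and stand in for the real TX_* constants.

/* Standalone sketch of dispatch through a function vector indexed by type. */
#include <stdio.h>

enum { EX_TX_NONE, EX_TX_CREATE, EX_TX_REMOVE, EX_TX_MAX };

typedef int (*replay_func_t)(void *arg, int byteswap);

static int
replay_error(void *arg, int byteswap)
{
	(void) arg; (void) byteswap;
	printf("unknown record type\n");
	return (-1);
}

static int
replay_create(void *arg, int byteswap)
{
	printf("replaying create of \"%s\" (byteswap=%d)\n",
	    (const char *)arg, byteswap);
	return (0);
}

static int
replay_remove(void *arg, int byteswap)
{
	printf("replaying remove of \"%s\" (byteswap=%d)\n",
	    (const char *)arg, byteswap);
	return (0);
}

static replay_func_t replay_vector[EX_TX_MAX] = {
	replay_error,	/* EX_TX_NONE: no such type */
	replay_create,	/* EX_TX_CREATE */
	replay_remove,	/* EX_TX_REMOVE */
};

int
main(void)
{
	int txtype = EX_TX_CREATE;	/* would come from the record's type field */
	return (replay_vector[txtype]("file.txt", 0));
}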
diff --git a/sys/contrib/openzfs/module/zfs/zfs_rlock.c b/sys/contrib/openzfs/module/zfs/zfs_rlock.c
new file mode 100644
index 000000000000..06a5e031a7df
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/zfs_rlock.c
@@ -0,0 +1,691 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
+ */
+
+/*
+ * This file contains the code to implement file range locking in
+ * ZFS, although there isn't much specific to ZFS (all that comes to mind is
+ * support for growing the blocksize).
+ *
+ * Interface
+ * ---------
+ * Defined in zfs_rlock.h but essentially:
+ * lr = rangelock_enter(zp, off, len, lock_type);
+ * rangelock_reduce(lr, off, len); // optional
+ * rangelock_exit(lr);
+ *
+ * Range locking rules
+ * --------------------
+ * 1. When truncating a file (zfs_create, zfs_setattr, zfs_space) the whole
+ * file range needs to be locked as RL_WRITER. Only then can the pages be
+ * freed etc and zp_size reset. zp_size must be set within range lock.
+ * 2. For writes and punching holes (zfs_write & zfs_space) just the range
+ * being written or freed needs to be locked as RL_WRITER.
+ * Multiple writes at the end of the file must coordinate zp_size updates
+ * to ensure data isn't lost. A compare and swap loop is currently used
+ * to ensure the file size is at least the offset last written.
+ * 3. For reads (zfs_read, zfs_get_data & zfs_putapage) just the range being
+ * read needs to be locked as RL_READER. A check against zp_size can then
+ * be made for reading beyond end of file.
+ *
+ * AVL tree
+ * --------
+ * An AVL tree is used to maintain the state of the existing ranges
+ * that are locked for exclusive (writer) or shared (reader) use.
+ * The starting range offset is used for searching and sorting the tree.
+ *
+ * Common case
+ * -----------
+ * The (hopefully) usual case is of no overlaps or contention for locks. On
+ * entry to rangelock_enter(), a locked_range_t is allocated; the tree is
+ * searched and no overlap is found, so *this* locked_range_t is placed in
+ * the tree.
+ *
+ * Overlaps/Reference counting/Proxy locks
+ * ---------------------------------------
+ * The avl code only allows one node at a particular offset. Also it's very
+ * inefficient to search through all previous entries looking for overlaps
+ * (because the very first in the ordered list might be at offset 0 but
+ * cover the whole file).
+ * So this implementation uses reference counts and proxy range locks.
+ * Firstly, only reader locks use reference counts and proxy locks,
+ * because writer locks are exclusive.
+ * When a reader lock overlaps with another then a proxy lock is created
+ * for that range and replaces the original lock. If the overlap
+ * is exact then the reference count of the proxy is simply incremented.
+ * Otherwise, the proxy lock is split into smaller lock ranges and
+ * new proxy locks created for non overlapping ranges.
+ * The reference counts are adjusted accordingly.
+ * Meanwhile, the original lock is kept around (this is the caller's handle)
+ * and its offset and length are used when releasing the lock.
+ *
+ * Thread coordination
+ * -------------------
+ * In order to make wakeups efficient and to ensure multiple continuous
+ * readers on a range don't starve a writer for the same range lock,
+ * two condition variables are allocated in each rl_t.
+ * If a writer (or reader) can't get a range it initialises the writer
+ * (or reader) cv; sets a flag saying there's a writer (or reader) waiting;
+ * and waits on that cv. When a thread unlocks that range it wakes up all
+ * writers then all readers before destroying the lock.
+ *
+ * Append mode writes
+ * ------------------
+ * Append mode writes need to lock a range at the end of a file.
+ * The offset of the end of the file is determined under the
+ * range locking mutex, and the lock type converted from RL_APPEND to
+ * RL_WRITER and the range locked.
+ *
+ * Grow block handling
+ * -------------------
+ * ZFS supports multiple block sizes, up to 16MB. The smallest
+ * block size is used for the file, which is grown as needed. During this
+ * growth all other writers and readers must be excluded.
+ * So if the block size needs to be grown then the whole file is
+ * exclusively locked, then later the caller will reduce the lock
+ * range to just the range to be written using rangelock_reduce().
+ */
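
zfs_rangelock_enter_writer() further down implements the writer conflict test by probing at most three tree entries: one at the same offset, the nearest entry after, and the nearest entry before. The standalone sketch below shows the underlying half-open interval overlap test, using a plain sorted array in place of the AVL tree; it is a simplified analog, not the locking code itself.

/* Standalone sketch: does [off, off+len) overlap any existing locked range? */
#include <stdio.h>
#include <stdint.h>

struct range { uint64_t off, len; };

static int
ranges_overlap(const struct range *a, const struct range *b)
{
	/* Half-open intervals overlap iff each starts before the other ends. */
	return (a->off < b->off + b->len && b->off < a->off + a->len);
}

static int
writer_conflicts(const struct range *locked, int n, const struct range *new)
{
	for (int i = 0; i < n; i++)
		if (ranges_overlap(&locked[i], new))
			return (1);
	return (0);
}

int
main(void)
{
	struct range locked[] = { { 0, 4096 }, { 8192, 4096 } };
	struct range new1 = { 4096, 4096 };	/* fits in the gap */
	struct range new2 = { 6144, 4096 };	/* overlaps the second range */

	printf("new1 conflicts: %d\n", writer_conflicts(locked, 2, &new1));
	printf("new2 conflicts: %d\n", writer_conflicts(locked, 2, &new2));
	return (0);
}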
+
+#include <sys/zfs_context.h>
+#include <sys/zfs_rlock.h>
+
+
+/*
+ * AVL comparison function used to order range locks
+ * Locks are ordered on the start offset of the range.
+ */
+static int
+zfs_rangelock_compare(const void *arg1, const void *arg2)
+{
+ const zfs_locked_range_t *rl1 = (const zfs_locked_range_t *)arg1;
+ const zfs_locked_range_t *rl2 = (const zfs_locked_range_t *)arg2;
+
+ return (TREE_CMP(rl1->lr_offset, rl2->lr_offset));
+}
+
+/*
+ * The callback is invoked when acquiring a RL_WRITER or RL_APPEND lock.
+ * It must convert RL_APPEND to RL_WRITER (starting at the end of the file),
+ * and may increase the range that's locked for RL_WRITER.
+ */
+void
+zfs_rangelock_init(zfs_rangelock_t *rl, zfs_rangelock_cb_t *cb, void *arg)
+{
+ mutex_init(&rl->rl_lock, NULL, MUTEX_DEFAULT, NULL);
+ avl_create(&rl->rl_tree, zfs_rangelock_compare,
+ sizeof (zfs_locked_range_t), offsetof(zfs_locked_range_t, lr_node));
+ rl->rl_cb = cb;
+ rl->rl_arg = arg;
+}
+
+void
+zfs_rangelock_fini(zfs_rangelock_t *rl)
+{
+ mutex_destroy(&rl->rl_lock);
+ avl_destroy(&rl->rl_tree);
+}
+
+/*
+ * Check if a write lock can be grabbed. If not, fail immediately or sleep and
+ * recheck until available, depending on the value of the "nonblock" parameter.
+ */
+static boolean_t
+zfs_rangelock_enter_writer(zfs_rangelock_t *rl, zfs_locked_range_t *new,
+ boolean_t nonblock)
+{
+ avl_tree_t *tree = &rl->rl_tree;
+ zfs_locked_range_t *lr;
+ avl_index_t where;
+ uint64_t orig_off = new->lr_offset;
+ uint64_t orig_len = new->lr_length;
+ zfs_rangelock_type_t orig_type = new->lr_type;
+
+ for (;;) {
+ /*
+		 * Call the callback, which may modify new's offset, length, and type.
+ * Note, the callback is used by the ZPL to handle appending
+ * and changing blocksizes. It isn't needed for zvols.
+ */
+ if (rl->rl_cb != NULL) {
+ rl->rl_cb(new, rl->rl_arg);
+ }
+
+ /*
+ * If the type was APPEND, the callback must convert it to
+ * WRITER.
+ */
+ ASSERT3U(new->lr_type, ==, RL_WRITER);
+
+ /*
+ * First check for the usual case of no locks
+ */
+ if (avl_numnodes(tree) == 0) {
+ avl_add(tree, new);
+ return (B_TRUE);
+ }
+
+ /*
+ * Look for any locks in the range.
+ */
+ lr = avl_find(tree, new, &where);
+ if (lr != NULL)
+ goto wait; /* already locked at same offset */
+
+ lr = avl_nearest(tree, where, AVL_AFTER);
+ if (lr != NULL &&
+ lr->lr_offset < new->lr_offset + new->lr_length)
+ goto wait;
+
+ lr = avl_nearest(tree, where, AVL_BEFORE);
+ if (lr != NULL &&
+ lr->lr_offset + lr->lr_length > new->lr_offset)
+ goto wait;
+
+ avl_insert(tree, new, where);
+ return (B_TRUE);
+wait:
+ if (nonblock)
+ return (B_FALSE);
+ if (!lr->lr_write_wanted) {
+ cv_init(&lr->lr_write_cv, NULL, CV_DEFAULT, NULL);
+ lr->lr_write_wanted = B_TRUE;
+ }
+ cv_wait(&lr->lr_write_cv, &rl->rl_lock);
+
+ /* reset to original */
+ new->lr_offset = orig_off;
+ new->lr_length = orig_len;
+ new->lr_type = orig_type;
+ }
+}
+
+/*
+ * If this is an original (non-proxy) lock then replace it by
+ * a proxy and return the proxy.
+ */
+static zfs_locked_range_t *
+zfs_rangelock_proxify(avl_tree_t *tree, zfs_locked_range_t *lr)
+{
+ zfs_locked_range_t *proxy;
+
+ if (lr->lr_proxy)
+ return (lr); /* already a proxy */
+
+ ASSERT3U(lr->lr_count, ==, 1);
+ ASSERT(lr->lr_write_wanted == B_FALSE);
+ ASSERT(lr->lr_read_wanted == B_FALSE);
+ avl_remove(tree, lr);
+ lr->lr_count = 0;
+
+ /* create a proxy range lock */
+ proxy = kmem_alloc(sizeof (zfs_locked_range_t), KM_SLEEP);
+ proxy->lr_offset = lr->lr_offset;
+ proxy->lr_length = lr->lr_length;
+ proxy->lr_count = 1;
+ proxy->lr_type = RL_READER;
+ proxy->lr_proxy = B_TRUE;
+ proxy->lr_write_wanted = B_FALSE;
+ proxy->lr_read_wanted = B_FALSE;
+ avl_add(tree, proxy);
+
+ return (proxy);
+}
+
+/*
+ * Split the range lock at the supplied offset
+ * returning the *front* proxy.
+ */
+static zfs_locked_range_t *
+zfs_rangelock_split(avl_tree_t *tree, zfs_locked_range_t *lr, uint64_t off)
+{
+ zfs_locked_range_t *rear;
+
+ ASSERT3U(lr->lr_length, >, 1);
+ ASSERT3U(off, >, lr->lr_offset);
+ ASSERT3U(off, <, lr->lr_offset + lr->lr_length);
+ ASSERT(lr->lr_write_wanted == B_FALSE);
+ ASSERT(lr->lr_read_wanted == B_FALSE);
+
+ /* create the rear proxy range lock */
+ rear = kmem_alloc(sizeof (zfs_locked_range_t), KM_SLEEP);
+ rear->lr_offset = off;
+ rear->lr_length = lr->lr_offset + lr->lr_length - off;
+ rear->lr_count = lr->lr_count;
+ rear->lr_type = RL_READER;
+ rear->lr_proxy = B_TRUE;
+ rear->lr_write_wanted = B_FALSE;
+ rear->lr_read_wanted = B_FALSE;
+
+ zfs_locked_range_t *front = zfs_rangelock_proxify(tree, lr);
+ front->lr_length = off - lr->lr_offset;
+
+ avl_insert_here(tree, rear, front, AVL_AFTER);
+ return (front);
+}
+
+/*
+ * Create and add a new proxy range lock for the supplied range.
+ */
+static void
+zfs_rangelock_new_proxy(avl_tree_t *tree, uint64_t off, uint64_t len)
+{
+ zfs_locked_range_t *lr;
+
+ ASSERT(len != 0);
+ lr = kmem_alloc(sizeof (zfs_locked_range_t), KM_SLEEP);
+ lr->lr_offset = off;
+ lr->lr_length = len;
+ lr->lr_count = 1;
+ lr->lr_type = RL_READER;
+ lr->lr_proxy = B_TRUE;
+ lr->lr_write_wanted = B_FALSE;
+ lr->lr_read_wanted = B_FALSE;
+ avl_add(tree, lr);
+}
+
+static void
+zfs_rangelock_add_reader(avl_tree_t *tree, zfs_locked_range_t *new,
+ zfs_locked_range_t *prev, avl_index_t where)
+{
+ zfs_locked_range_t *next;
+ uint64_t off = new->lr_offset;
+ uint64_t len = new->lr_length;
+
+ /*
+ * prev arrives either:
+ * - pointing to an entry at the same offset
+ * - pointing to the entry with the closest previous offset whose
+ * range may overlap with the new range
+ * - null, if there were no ranges starting before the new one
+ */
+ if (prev != NULL) {
+ if (prev->lr_offset + prev->lr_length <= off) {
+ prev = NULL;
+ } else if (prev->lr_offset != off) {
+ /*
+ * convert to proxy if needed then
+ * split this entry and bump ref count
+ */
+ prev = zfs_rangelock_split(tree, prev, off);
+ prev = AVL_NEXT(tree, prev); /* move to rear range */
+ }
+ }
+ ASSERT((prev == NULL) || (prev->lr_offset == off));
+
+ if (prev != NULL)
+ next = prev;
+ else
+ next = avl_nearest(tree, where, AVL_AFTER);
+
+ if (next == NULL || off + len <= next->lr_offset) {
+ /* no overlaps, use the original new lock in the tree */
+ avl_insert(tree, new, where);
+ return;
+ }
+
+ if (off < next->lr_offset) {
+ /* Add a proxy for initial range before the overlap */
+ zfs_rangelock_new_proxy(tree, off, next->lr_offset - off);
+ }
+
+ new->lr_count = 0; /* will use proxies in tree */
+ /*
+ * We now search forward through the ranges, until we go past the end
+ * of the new range.  For each entry we make it a proxy if it
+ * isn't already, then bump its reference count.  If there are any
+ * gaps between the ranges then we create a new proxy range.
+ */
+ for (prev = NULL; next; prev = next, next = AVL_NEXT(tree, next)) {
+ if (off + len <= next->lr_offset)
+ break;
+ if (prev != NULL && prev->lr_offset + prev->lr_length <
+ next->lr_offset) {
+ /* there's a gap */
+ ASSERT3U(next->lr_offset, >,
+ prev->lr_offset + prev->lr_length);
+ zfs_rangelock_new_proxy(tree,
+ prev->lr_offset + prev->lr_length,
+ next->lr_offset -
+ (prev->lr_offset + prev->lr_length));
+ }
+ if (off + len == next->lr_offset + next->lr_length) {
+ /* exact overlap with end */
+ next = zfs_rangelock_proxify(tree, next);
+ next->lr_count++;
+ return;
+ }
+ if (off + len < next->lr_offset + next->lr_length) {
+ /* new range ends in the middle of this block */
+ next = zfs_rangelock_split(tree, next, off + len);
+ next->lr_count++;
+ return;
+ }
+ ASSERT3U(off + len, >, next->lr_offset + next->lr_length);
+ next = zfs_rangelock_proxify(tree, next);
+ next->lr_count++;
+ }
+
+ /* Add the remaining end range. */
+ zfs_rangelock_new_proxy(tree, prev->lr_offset + prev->lr_length,
+ (off + len) - (prev->lr_offset + prev->lr_length));
+}
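+
+/*
+ * Worked example of the reader bookkeeping above (illustration only):
+ * suppose the tree holds one original reader lock R = [0, 100) with
+ * lr_count == 1 and a new reader N = [50, 150) arrives.  R is split at
+ * offset 50 into a front proxy [0, 50) and a rear proxy [50, 100); the
+ * rear proxy's lr_count is bumped to 2 for the overlap with N, and a new
+ * proxy [100, 150) with lr_count == 1 covers the remainder.  N itself is
+ * kept out of the tree with lr_count == 0, which tells
+ * zfs_rangelock_exit_reader() to walk the proxies rather than remove a
+ * single node.
+ */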
+
+/*
+ * Check if a reader lock can be grabbed. If not, fail immediately or sleep and
+ * recheck until available, depending on the value of the "nonblock" parameter.
+ */
+static boolean_t
+zfs_rangelock_enter_reader(zfs_rangelock_t *rl, zfs_locked_range_t *new,
+ boolean_t nonblock)
+{
+ avl_tree_t *tree = &rl->rl_tree;
+ zfs_locked_range_t *prev, *next;
+ avl_index_t where;
+ uint64_t off = new->lr_offset;
+ uint64_t len = new->lr_length;
+
+ /*
+ * Look for any writer locks in the range.
+ */
+retry:
+ prev = avl_find(tree, new, &where);
+ if (prev == NULL)
+ prev = avl_nearest(tree, where, AVL_BEFORE);
+
+ /*
+ * Check the previous range for a writer lock overlap.
+ */
+ if (prev && (off < prev->lr_offset + prev->lr_length)) {
+ if ((prev->lr_type == RL_WRITER) || (prev->lr_write_wanted)) {
+ if (nonblock)
+ return (B_FALSE);
+ if (!prev->lr_read_wanted) {
+ cv_init(&prev->lr_read_cv,
+ NULL, CV_DEFAULT, NULL);
+ prev->lr_read_wanted = B_TRUE;
+ }
+ cv_wait(&prev->lr_read_cv, &rl->rl_lock);
+ goto retry;
+ }
+ if (off + len < prev->lr_offset + prev->lr_length)
+ goto got_lock;
+ }
+
+ /*
+ * Search through the following ranges to see if there's any
+ * overlapping write lock.
+ */
+ if (prev != NULL)
+ next = AVL_NEXT(tree, prev);
+ else
+ next = avl_nearest(tree, where, AVL_AFTER);
+ for (; next != NULL; next = AVL_NEXT(tree, next)) {
+ if (off + len <= next->lr_offset)
+ goto got_lock;
+ if ((next->lr_type == RL_WRITER) || (next->lr_write_wanted)) {
+ if (nonblock)
+ return (B_FALSE);
+ if (!next->lr_read_wanted) {
+ cv_init(&next->lr_read_cv,
+ NULL, CV_DEFAULT, NULL);
+ next->lr_read_wanted = B_TRUE;
+ }
+ cv_wait(&next->lr_read_cv, &rl->rl_lock);
+ goto retry;
+ }
+ if (off + len <= next->lr_offset + next->lr_length)
+ goto got_lock;
+ }
+
+got_lock:
+ /*
+ * Add the read lock, which may involve splitting existing
+ * locks and bumping ref counts (lr_count).
+ */
+ zfs_rangelock_add_reader(tree, new, prev, where);
+ return (B_TRUE);
+}
+
+/*
+ * Lock a range (offset, length) as either shared (RL_READER) or exclusive
+ * (RL_WRITER or RL_APPEND). If RL_APPEND is specified, rl_cb() will convert
+ * it to a RL_WRITER lock (with the offset at the end of the file). Returns
+ * the range lock structure for later unlocking (or reduce range if the
+ * entire file is locked as RL_WRITER), or NULL if nonblock is true and the
+ * lock could not be acquired immediately.
+ */
+static zfs_locked_range_t *
+zfs_rangelock_enter_impl(zfs_rangelock_t *rl, uint64_t off, uint64_t len,
+ zfs_rangelock_type_t type, boolean_t nonblock)
+{
+ zfs_locked_range_t *new;
+
+ ASSERT(type == RL_READER || type == RL_WRITER || type == RL_APPEND);
+
+ new = kmem_alloc(sizeof (zfs_locked_range_t), KM_SLEEP);
+ new->lr_rangelock = rl;
+ new->lr_offset = off;
+ if (len + off < off) /* overflow */
+ len = UINT64_MAX - off;
+ new->lr_length = len;
+ new->lr_count = 1; /* assume it's going to be in the tree */
+ new->lr_type = type;
+ new->lr_proxy = B_FALSE;
+ new->lr_write_wanted = B_FALSE;
+ new->lr_read_wanted = B_FALSE;
+
+ mutex_enter(&rl->rl_lock);
+ if (type == RL_READER) {
+ /*
+ * First check for the usual case of no locks
+ */
+ if (avl_numnodes(&rl->rl_tree) == 0) {
+ avl_add(&rl->rl_tree, new);
+ } else if (!zfs_rangelock_enter_reader(rl, new, nonblock)) {
+ kmem_free(new, sizeof (*new));
+ new = NULL;
+ }
+ } else if (!zfs_rangelock_enter_writer(rl, new, nonblock)) {
+ kmem_free(new, sizeof (*new));
+ new = NULL;
+ }
+ mutex_exit(&rl->rl_lock);
+ return (new);
+}
+
+zfs_locked_range_t *
+zfs_rangelock_enter(zfs_rangelock_t *rl, uint64_t off, uint64_t len,
+ zfs_rangelock_type_t type)
+{
+ return (zfs_rangelock_enter_impl(rl, off, len, type, B_FALSE));
+}
+
+zfs_locked_range_t *
+zfs_rangelock_tryenter(zfs_rangelock_t *rl, uint64_t off, uint64_t len,
+ zfs_rangelock_type_t type)
+{
+ return (zfs_rangelock_enter_impl(rl, off, len, type, B_TRUE));
+}
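+
+/*
+ * Minimal usage sketch (illustration only), assuming in-kernel context and
+ * a hypothetical znode-like object "zp" that embeds a zfs_rangelock_t as
+ * z_rangelock, as the ZPL and zvol callers do:
+ *
+ *	zfs_locked_range_t *lr;
+ *
+ *	lr = zfs_rangelock_enter(&zp->z_rangelock, off, len, RL_READER);
+ *	... read the byte range [off, off + len) ...
+ *	zfs_rangelock_exit(lr);
+ *
+ *	lr = zfs_rangelock_tryenter(&zp->z_rangelock, off, len, RL_WRITER);
+ *	if (lr == NULL)
+ *		... lock unavailable; caller decides whether to retry ...
+ *	else
+ *		zfs_rangelock_exit(lr);	... after modifying the range ...
+ */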
+
+/*
+ * Safely free the zfs_locked_range_t.
+ */
+static void
+zfs_rangelock_free(zfs_locked_range_t *lr)
+{
+ if (lr->lr_write_wanted)
+ cv_destroy(&lr->lr_write_cv);
+
+ if (lr->lr_read_wanted)
+ cv_destroy(&lr->lr_read_cv);
+
+ kmem_free(lr, sizeof (zfs_locked_range_t));
+}
+
+/*
+ * Unlock a reader lock
+ */
+static void
+zfs_rangelock_exit_reader(zfs_rangelock_t *rl, zfs_locked_range_t *remove,
+ list_t *free_list)
+{
+ avl_tree_t *tree = &rl->rl_tree;
+ uint64_t len;
+
+ /*
+ * The common case is when the entry being removed is still in the
+ * tree (lr_count == 1), meaning there have been no other reader
+ * locks overlapping with this one.  Otherwise the entry will have
+ * been removed from the tree and replaced by proxies (one or
+ * more ranges covering the entire locked range).
+ */
+ if (remove->lr_count == 1) {
+ avl_remove(tree, remove);
+ if (remove->lr_write_wanted)
+ cv_broadcast(&remove->lr_write_cv);
+ if (remove->lr_read_wanted)
+ cv_broadcast(&remove->lr_read_cv);
+ list_insert_tail(free_list, remove);
+ } else {
+ ASSERT0(remove->lr_count);
+ ASSERT0(remove->lr_write_wanted);
+ ASSERT0(remove->lr_read_wanted);
+ /*
+ * Find start proxy representing this reader lock,
+ * then decrement ref count on all proxies
+ * that make up this range, freeing them as needed.
+ */
+ zfs_locked_range_t *lr = avl_find(tree, remove, NULL);
+ ASSERT3P(lr, !=, NULL);
+ ASSERT3U(lr->lr_count, !=, 0);
+ ASSERT3U(lr->lr_type, ==, RL_READER);
+ zfs_locked_range_t *next = NULL;
+ for (len = remove->lr_length; len != 0; lr = next) {
+ len -= lr->lr_length;
+ if (len != 0) {
+ next = AVL_NEXT(tree, lr);
+ ASSERT3P(next, !=, NULL);
+ ASSERT3U(lr->lr_offset + lr->lr_length, ==,
+ next->lr_offset);
+ ASSERT3U(next->lr_count, !=, 0);
+ ASSERT3U(next->lr_type, ==, RL_READER);
+ }
+ lr->lr_count--;
+ if (lr->lr_count == 0) {
+ avl_remove(tree, lr);
+ if (lr->lr_write_wanted)
+ cv_broadcast(&lr->lr_write_cv);
+ if (lr->lr_read_wanted)
+ cv_broadcast(&lr->lr_read_cv);
+ list_insert_tail(free_list, lr);
+ }
+ }
+ kmem_free(remove, sizeof (zfs_locked_range_t));
+ }
+}
+
+/*
+ * Unlock range and destroy range lock structure.
+ */
+void
+zfs_rangelock_exit(zfs_locked_range_t *lr)
+{
+ zfs_rangelock_t *rl = lr->lr_rangelock;
+ list_t free_list;
+ zfs_locked_range_t *free_lr;
+
+ ASSERT(lr->lr_type == RL_WRITER || lr->lr_type == RL_READER);
+ ASSERT(lr->lr_count == 1 || lr->lr_count == 0);
+ ASSERT(!lr->lr_proxy);
+
+ /*
+ * The free list is used to defer the cv_destroy() and
+ * subsequent kmem_free until after the mutex is dropped.
+ */
+ list_create(&free_list, sizeof (zfs_locked_range_t),
+ offsetof(zfs_locked_range_t, lr_node));
+
+ mutex_enter(&rl->rl_lock);
+ if (lr->lr_type == RL_WRITER) {
+ /* writer locks can't be shared or split */
+ avl_remove(&rl->rl_tree, lr);
+ if (lr->lr_write_wanted)
+ cv_broadcast(&lr->lr_write_cv);
+ if (lr->lr_read_wanted)
+ cv_broadcast(&lr->lr_read_cv);
+ list_insert_tail(&free_list, lr);
+ } else {
+ /*
+ * lock may be shared, let rangelock_exit_reader()
+ * release the lock and free the zfs_locked_range_t.
+ */
+ zfs_rangelock_exit_reader(rl, lr, &free_list);
+ }
+ mutex_exit(&rl->rl_lock);
+
+ while ((free_lr = list_remove_head(&free_list)) != NULL)
+ zfs_rangelock_free(free_lr);
+
+ list_destroy(&free_list);
+}
+
+/*
+ * Reduce range locked as RL_WRITER from whole file to specified range.
+ * Asserts the whole file is exclusively locked and so there's only one
+ * entry in the tree.
+ */
+void
+zfs_rangelock_reduce(zfs_locked_range_t *lr, uint64_t off, uint64_t len)
+{
+ zfs_rangelock_t *rl = lr->lr_rangelock;
+
+ /* Ensure there are no other locks */
+ ASSERT3U(avl_numnodes(&rl->rl_tree), ==, 1);
+ ASSERT3U(lr->lr_offset, ==, 0);
+ ASSERT3U(lr->lr_type, ==, RL_WRITER);
+ ASSERT(!lr->lr_proxy);
+ ASSERT3U(lr->lr_length, ==, UINT64_MAX);
+ ASSERT3U(lr->lr_count, ==, 1);
+
+ mutex_enter(&rl->rl_lock);
+ lr->lr_offset = off;
+ lr->lr_length = len;
+ mutex_exit(&rl->rl_lock);
+ if (lr->lr_write_wanted)
+ cv_broadcast(&lr->lr_write_cv);
+ if (lr->lr_read_wanted)
+ cv_broadcast(&lr->lr_read_cv);
+}
+
+#if defined(_KERNEL)
+EXPORT_SYMBOL(zfs_rangelock_init);
+EXPORT_SYMBOL(zfs_rangelock_fini);
+EXPORT_SYMBOL(zfs_rangelock_enter);
+EXPORT_SYMBOL(zfs_rangelock_tryenter);
+EXPORT_SYMBOL(zfs_rangelock_exit);
+EXPORT_SYMBOL(zfs_rangelock_reduce);
+#endif
diff --git a/sys/contrib/openzfs/module/zfs/zfs_sa.c b/sys/contrib/openzfs/module/zfs/zfs_sa.c
new file mode 100644
index 000000000000..67be131da63b
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/zfs_sa.c
@@ -0,0 +1,446 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/vnode.h>
+#include <sys/sa.h>
+#include <sys/zfs_acl.h>
+#include <sys/zfs_sa.h>
+#include <sys/dmu_objset.h>
+#include <sys/sa_impl.h>
+
+/*
+ * ZPL attribute registration table.
+ * The order of the attributes doesn't matter; a unique value will be
+ * assigned for each attribute that is file system specific.
+ *
+ * This is just the set of ZPL attributes that this version of ZFS
+ * deals with natively.  The file system could have other attributes
+ * stored in files, but they will be ignored.  The SA framework will
+ * preserve them, but this version of ZFS won't change or delete them.
+ */
+
+sa_attr_reg_t zfs_attr_table[ZPL_END+1] = {
+ {"ZPL_ATIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 0},
+ {"ZPL_MTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 1},
+ {"ZPL_CTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 2},
+ {"ZPL_CRTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 3},
+ {"ZPL_GEN", sizeof (uint64_t), SA_UINT64_ARRAY, 4},
+ {"ZPL_MODE", sizeof (uint64_t), SA_UINT64_ARRAY, 5},
+ {"ZPL_SIZE", sizeof (uint64_t), SA_UINT64_ARRAY, 6},
+ {"ZPL_PARENT", sizeof (uint64_t), SA_UINT64_ARRAY, 7},
+ {"ZPL_LINKS", sizeof (uint64_t), SA_UINT64_ARRAY, 8},
+ {"ZPL_XATTR", sizeof (uint64_t), SA_UINT64_ARRAY, 9},
+ {"ZPL_RDEV", sizeof (uint64_t), SA_UINT64_ARRAY, 10},
+ {"ZPL_FLAGS", sizeof (uint64_t), SA_UINT64_ARRAY, 11},
+ {"ZPL_UID", sizeof (uint64_t), SA_UINT64_ARRAY, 12},
+ {"ZPL_GID", sizeof (uint64_t), SA_UINT64_ARRAY, 13},
+ {"ZPL_PAD", sizeof (uint64_t) * 4, SA_UINT64_ARRAY, 14},
+ {"ZPL_ZNODE_ACL", 88, SA_UINT8_ARRAY, 15},
+ {"ZPL_DACL_COUNT", sizeof (uint64_t), SA_UINT64_ARRAY, 0},
+ {"ZPL_SYMLINK", 0, SA_UINT8_ARRAY, 0},
+ {"ZPL_SCANSTAMP", 32, SA_UINT8_ARRAY, 0},
+ {"ZPL_DACL_ACES", 0, SA_ACL, 0},
+ {"ZPL_DXATTR", 0, SA_UINT8_ARRAY, 0},
+ {"ZPL_PROJID", sizeof (uint64_t), SA_UINT64_ARRAY, 0},
+ {NULL, 0, 0, 0}
+};
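+
+/*
+ * Illustrative sketch (not part of this file): the table above is consumed
+ * by sa_setup() at mount time, which maps each registered attribute to a
+ * per-objset sa_attr_type_t handle.  The variables os, sa_obj, and
+ * attr_table below are assumptions made for the example.
+ *
+ *	sa_attr_type_t *attr_table;
+ *	int error;
+ *
+ *	error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END, &attr_table);
+ *	if (error != 0)
+ *		return (error);
+ *	... attr_table[ZPL_SIZE] etc. may now be passed to sa_lookup() ...
+ */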
+
+#ifdef _KERNEL
+int
+zfs_sa_readlink(znode_t *zp, zfs_uio_t *uio)
+{
+ dmu_buf_t *db = sa_get_db(zp->z_sa_hdl);
+ size_t bufsz;
+ int error;
+
+ bufsz = zp->z_size;
+ if (bufsz + ZFS_OLD_ZNODE_PHYS_SIZE <= db->db_size) {
+ error = zfs_uiomove((caddr_t)db->db_data +
+ ZFS_OLD_ZNODE_PHYS_SIZE,
+ MIN((size_t)bufsz, zfs_uio_resid(uio)), UIO_READ, uio);
+ } else {
+ dmu_buf_t *dbp;
+ if ((error = dmu_buf_hold(ZTOZSB(zp)->z_os, zp->z_id,
+ 0, FTAG, &dbp, DMU_READ_NO_PREFETCH)) == 0) {
+ error = zfs_uiomove(dbp->db_data,
+ MIN((size_t)bufsz, zfs_uio_resid(uio)), UIO_READ,
+ uio);
+ dmu_buf_rele(dbp, FTAG);
+ }
+ }
+ return (error);
+}
+
+void
+zfs_sa_symlink(znode_t *zp, char *link, int len, dmu_tx_t *tx)
+{
+ dmu_buf_t *db = sa_get_db(zp->z_sa_hdl);
+
+ if (ZFS_OLD_ZNODE_PHYS_SIZE + len <= dmu_bonus_max()) {
+ VERIFY0(dmu_set_bonus(db, len + ZFS_OLD_ZNODE_PHYS_SIZE, tx));
+ if (len) {
+ bcopy(link, (caddr_t)db->db_data +
+ ZFS_OLD_ZNODE_PHYS_SIZE, len);
+ }
+ } else {
+ dmu_buf_t *dbp;
+
+ zfs_grow_blocksize(zp, len, tx);
+ VERIFY0(dmu_buf_hold(ZTOZSB(zp)->z_os, zp->z_id, 0, FTAG, &dbp,
+ DMU_READ_NO_PREFETCH));
+
+ dmu_buf_will_dirty(dbp, tx);
+
+ ASSERT3U(len, <=, dbp->db_size);
+ bcopy(link, dbp->db_data, len);
+ dmu_buf_rele(dbp, FTAG);
+ }
+}
+
+void
+zfs_sa_get_scanstamp(znode_t *zp, xvattr_t *xvap)
+{
+ zfsvfs_t *zfsvfs = ZTOZSB(zp);
+ xoptattr_t *xoap;
+
+ ASSERT(MUTEX_HELD(&zp->z_lock));
+ VERIFY((xoap = xva_getxoptattr(xvap)) != NULL);
+ if (zp->z_is_sa) {
+ if (sa_lookup(zp->z_sa_hdl, SA_ZPL_SCANSTAMP(zfsvfs),
+ &xoap->xoa_av_scanstamp,
+ sizeof (xoap->xoa_av_scanstamp)) != 0)
+ return;
+ } else {
+ dmu_object_info_t doi;
+ dmu_buf_t *db = sa_get_db(zp->z_sa_hdl);
+ int len;
+
+ if (!(zp->z_pflags & ZFS_BONUS_SCANSTAMP))
+ return;
+
+ sa_object_info(zp->z_sa_hdl, &doi);
+ len = sizeof (xoap->xoa_av_scanstamp) +
+ ZFS_OLD_ZNODE_PHYS_SIZE;
+
+ if (len <= doi.doi_bonus_size) {
+ (void) memcpy(xoap->xoa_av_scanstamp,
+ (caddr_t)db->db_data + ZFS_OLD_ZNODE_PHYS_SIZE,
+ sizeof (xoap->xoa_av_scanstamp));
+ }
+ }
+ XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP);
+}
+
+void
+zfs_sa_set_scanstamp(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx)
+{
+ zfsvfs_t *zfsvfs = ZTOZSB(zp);
+ xoptattr_t *xoap;
+
+ ASSERT(MUTEX_HELD(&zp->z_lock));
+ VERIFY((xoap = xva_getxoptattr(xvap)) != NULL);
+ if (zp->z_is_sa)
+ VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_SCANSTAMP(zfsvfs),
+ &xoap->xoa_av_scanstamp,
+ sizeof (xoap->xoa_av_scanstamp), tx));
+ else {
+ dmu_object_info_t doi;
+ dmu_buf_t *db = sa_get_db(zp->z_sa_hdl);
+ int len;
+
+ sa_object_info(zp->z_sa_hdl, &doi);
+ len = sizeof (xoap->xoa_av_scanstamp) +
+ ZFS_OLD_ZNODE_PHYS_SIZE;
+ if (len > doi.doi_bonus_size)
+ VERIFY(dmu_set_bonus(db, len, tx) == 0);
+ (void) memcpy((caddr_t)db->db_data + ZFS_OLD_ZNODE_PHYS_SIZE,
+ xoap->xoa_av_scanstamp, sizeof (xoap->xoa_av_scanstamp));
+
+ zp->z_pflags |= ZFS_BONUS_SCANSTAMP;
+ VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs),
+ &zp->z_pflags, sizeof (uint64_t), tx));
+ }
+}
+
+int
+zfs_sa_get_xattr(znode_t *zp)
+{
+ zfsvfs_t *zfsvfs = ZTOZSB(zp);
+ char *obj;
+ int size;
+ int error;
+
+ ASSERT(RW_LOCK_HELD(&zp->z_xattr_lock));
+ ASSERT(!zp->z_xattr_cached);
+ ASSERT(zp->z_is_sa);
+
+ error = sa_size(zp->z_sa_hdl, SA_ZPL_DXATTR(zfsvfs), &size);
+ if (error) {
+ if (error == ENOENT)
+ return nvlist_alloc(&zp->z_xattr_cached,
+ NV_UNIQUE_NAME, KM_SLEEP);
+ else
+ return (error);
+ }
+
+ obj = vmem_alloc(size, KM_SLEEP);
+
+ error = sa_lookup(zp->z_sa_hdl, SA_ZPL_DXATTR(zfsvfs), obj, size);
+ if (error == 0)
+ error = nvlist_unpack(obj, size, &zp->z_xattr_cached, KM_SLEEP);
+
+ vmem_free(obj, size);
+
+ return (error);
+}
+
+int
+zfs_sa_set_xattr(znode_t *zp)
+{
+ zfsvfs_t *zfsvfs = ZTOZSB(zp);
+ dmu_tx_t *tx;
+ char *obj;
+ size_t size;
+ int error;
+
+ ASSERT(RW_WRITE_HELD(&zp->z_xattr_lock));
+ ASSERT(zp->z_xattr_cached);
+ ASSERT(zp->z_is_sa);
+
+ error = nvlist_size(zp->z_xattr_cached, &size, NV_ENCODE_XDR);
+ if ((error == 0) && (size > SA_ATTR_MAX_LEN))
+ error = SET_ERROR(EFBIG);
+ if (error)
+ goto out;
+
+ obj = vmem_alloc(size, KM_SLEEP);
+
+ error = nvlist_pack(zp->z_xattr_cached, &obj, &size,
+ NV_ENCODE_XDR, KM_SLEEP);
+ if (error)
+ goto out_free;
+
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_sa_create(tx, size);
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
+
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ dmu_tx_abort(tx);
+ } else {
+ int count = 0;
+ sa_bulk_attr_t bulk[2];
+ uint64_t ctime[2];
+
+ zfs_tstamp_update_setup(zp, STATE_CHANGED, NULL, ctime);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_DXATTR(zfsvfs),
+ NULL, obj, size);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs),
+ NULL, &ctime, 16);
+ VERIFY0(sa_bulk_update(zp->z_sa_hdl, bulk, count, tx));
+
+ dmu_tx_commit(tx);
+ }
+out_free:
+ vmem_free(obj, size);
+out:
+ return (error);
+}
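+
+/*
+ * Illustrative sketch (not part of this module): the DXATTR attribute used
+ * above is simply an XDR-packed nvlist.  The standalone userspace program
+ * below (link against libnvpair) models the same pack/unpack round trip;
+ * the "key"/"value" pair is arbitrary.
+ */
+#include <libnvpair.h>
+#include <assert.h>
+#include <stdlib.h>
+
+int
+main(void)
+{
+    nvlist_t *nvl, *copy;
+    char *buf;
+    size_t size;
+
+    /* Build an nvlist, much as z_xattr_cached is built in-kernel. */
+    assert(nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) == 0);
+    assert(nvlist_add_string(nvl, "key", "value") == 0);
+
+    /* Pack it with XDR encoding, as zfs_sa_set_xattr() does. */
+    assert(nvlist_size(nvl, &size, NV_ENCODE_XDR) == 0);
+    buf = malloc(size);
+    assert(buf != NULL);
+    assert(nvlist_pack(nvl, &buf, &size, NV_ENCODE_XDR, 0) == 0);
+
+    /* Unpack it again, as zfs_sa_get_xattr() does after sa_lookup(). */
+    assert(nvlist_unpack(buf, size, &copy, 0) == 0);
+
+    free(buf);
+    nvlist_free(nvl);
+    nvlist_free(copy);
+    return (0);
+}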
+
+/*
+ * I'm not convinced we should do any of this upgrade,
+ * since the SA code can read both the old and new znode formats
+ * with probably little to no performance difference.
+ *
+ * All new files will be created with the new format.
+ */
+
+void
+zfs_sa_upgrade(sa_handle_t *hdl, dmu_tx_t *tx)
+{
+ dmu_buf_t *db = sa_get_db(hdl);
+ znode_t *zp = sa_get_userdata(hdl);
+ zfsvfs_t *zfsvfs = ZTOZSB(zp);
+ int count = 0;
+ sa_bulk_attr_t *bulk, *sa_attrs;
+ zfs_acl_locator_cb_t locate = { 0 };
+ uint64_t uid, gid, mode, rdev, xattr, parent, tmp_gen;
+ uint64_t crtime[2], mtime[2], ctime[2], atime[2];
+ uint64_t links;
+ zfs_acl_phys_t znode_acl;
+ char scanstamp[AV_SCANSTAMP_SZ];
+ boolean_t drop_lock = B_FALSE;
+
+ /*
+ * No upgrade if the ACL isn't cached, since we won't know
+ * which locks are held, and reading the ACL would require
+ * special "locked" interfaces that would be messy.
+ */
+ if (zp->z_acl_cached == NULL || Z_ISLNK(ZTOTYPE(zp)))
+ return;
+
+ /*
+ * If z_lock is held and we aren't the owner, then just
+ * return, since we don't want to deadlock trying to update
+ * the status of z_is_sa.  This file can then be upgraded at
+ * a later time.
+ *
+ * Otherwise, we know we are doing the
+ * sa_update() that caused us to enter this function.
+ */
+ if (MUTEX_NOT_HELD(&zp->z_lock)) {
+ if (mutex_tryenter(&zp->z_lock) == 0)
+ return;
+ else
+ drop_lock = B_TRUE;
+ }
+
+ /* First do a bulk query of the attributes that aren't cached */
+ bulk = kmem_alloc(sizeof (sa_bulk_attr_t) * ZPL_END, KM_SLEEP);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL, &parent, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_XATTR(zfsvfs), NULL, &xattr, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_RDEV(zfsvfs), NULL, &rdev, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, &uid, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, &gid, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL, &tmp_gen, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ZNODE_ACL(zfsvfs), NULL,
+ &znode_acl, 88);
+
+ if (sa_bulk_lookup_locked(hdl, bulk, count) != 0)
+ goto done;
+
+ if (dmu_objset_projectquota_enabled(hdl->sa_os) &&
+ !(zp->z_pflags & ZFS_PROJID)) {
+ zp->z_pflags |= ZFS_PROJID;
+ zp->z_projid = ZFS_DEFAULT_PROJID;
+ }
+
+ /*
+ * While the order here doesn't matter, it's best to try to organize
+ * it in such a way as to pick up an already existing layout number.
+ */
+ count = 0;
+ sa_attrs = kmem_zalloc(sizeof (sa_bulk_attr_t) * ZPL_END, KM_SLEEP);
+ SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_SIZE(zfsvfs), NULL,
+ &zp->z_size, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_GEN(zfsvfs),
+ NULL, &tmp_gen, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_UID(zfsvfs), NULL, &uid, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_GID(zfsvfs), NULL, &gid, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_PARENT(zfsvfs),
+ NULL, &parent, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+ &zp->z_pflags, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_ATIME(zfsvfs), NULL,
+ &atime, 16);
+ SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_MTIME(zfsvfs), NULL,
+ &mtime, 16);
+ SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_CTIME(zfsvfs), NULL,
+ &ctime, 16);
+ SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_CRTIME(zfsvfs), NULL,
+ &crtime, 16);
+ links = ZTONLNK(zp);
+ SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_LINKS(zfsvfs), NULL,
+ &links, 8);
+ if (dmu_objset_projectquota_enabled(hdl->sa_os))
+ SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_PROJID(zfsvfs), NULL,
+ &zp->z_projid, 8);
+ if (Z_ISBLK(ZTOTYPE(zp)) || Z_ISCHR(ZTOTYPE(zp)))
+ SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_RDEV(zfsvfs), NULL,
+ &rdev, 8);
+ SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_DACL_COUNT(zfsvfs), NULL,
+ &zp->z_acl_cached->z_acl_count, 8);
+
+ if (zp->z_acl_cached->z_version < ZFS_ACL_VERSION_FUID)
+ zfs_acl_xform(zp, zp->z_acl_cached, CRED());
+
+ locate.cb_aclp = zp->z_acl_cached;
+ SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_DACL_ACES(zfsvfs),
+ zfs_acl_data_locator, &locate, zp->z_acl_cached->z_acl_bytes);
+
+ if (xattr)
+ SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_XATTR(zfsvfs),
+ NULL, &xattr, 8);
+
+ /* If a scanstamp is stored in the bonus buffer, migrate it. */
+
+ if (zp->z_pflags & ZFS_BONUS_SCANSTAMP) {
+ bcopy((caddr_t)db->db_data + ZFS_OLD_ZNODE_PHYS_SIZE,
+ scanstamp, AV_SCANSTAMP_SZ);
+ SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_SCANSTAMP(zfsvfs),
+ NULL, scanstamp, AV_SCANSTAMP_SZ);
+ zp->z_pflags &= ~ZFS_BONUS_SCANSTAMP;
+ }
+
+ VERIFY(dmu_set_bonustype(db, DMU_OT_SA, tx) == 0);
+ VERIFY(sa_replace_all_by_template_locked(hdl, sa_attrs,
+ count, tx) == 0);
+ if (znode_acl.z_acl_extern_obj)
+ VERIFY(0 == dmu_object_free(zfsvfs->z_os,
+ znode_acl.z_acl_extern_obj, tx));
+
+ zp->z_is_sa = B_TRUE;
+ kmem_free(sa_attrs, sizeof (sa_bulk_attr_t) * ZPL_END);
+done:
+ kmem_free(bulk, sizeof (sa_bulk_attr_t) * ZPL_END);
+ if (drop_lock)
+ mutex_exit(&zp->z_lock);
+}
+
+void
+zfs_sa_upgrade_txholds(dmu_tx_t *tx, znode_t *zp)
+{
+ if (!ZTOZSB(zp)->z_use_sa || zp->z_is_sa)
+ return;
+
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
+
+ if (zfs_external_acl(zp)) {
+ dmu_tx_hold_free(tx, zfs_external_acl(zp), 0,
+ DMU_OBJECT_END);
+ }
+}
+
+EXPORT_SYMBOL(zfs_attr_table);
+EXPORT_SYMBOL(zfs_sa_readlink);
+EXPORT_SYMBOL(zfs_sa_symlink);
+EXPORT_SYMBOL(zfs_sa_get_scanstamp);
+EXPORT_SYMBOL(zfs_sa_set_scanstamp);
+EXPORT_SYMBOL(zfs_sa_get_xattr);
+EXPORT_SYMBOL(zfs_sa_set_xattr);
+EXPORT_SYMBOL(zfs_sa_upgrade);
+EXPORT_SYMBOL(zfs_sa_upgrade_txholds);
+
+#endif
diff --git a/sys/contrib/openzfs/module/zfs/zfs_vnops.c b/sys/contrib/openzfs/module/zfs/zfs_vnops.c
new file mode 100644
index 000000000000..61d5f06c6455
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/zfs_vnops.c
@@ -0,0 +1,897 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2015 by Chunwei Chen. All rights reserved.
+ * Copyright 2017 Nexenta Systems, Inc.
+ */
+
+/* Portions Copyright 2007 Jeremy Teo */
+/* Portions Copyright 2010 Robert Milkowski */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/time.h>
+#include <sys/sysmacros.h>
+#include <sys/vfs.h>
+#include <sys/uio.h>
+#include <sys/file.h>
+#include <sys/stat.h>
+#include <sys/kmem.h>
+#include <sys/cmn_err.h>
+#include <sys/errno.h>
+#include <sys/zfs_dir.h>
+#include <sys/zfs_acl.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/fs/zfs.h>
+#include <sys/dmu.h>
+#include <sys/dmu_objset.h>
+#include <sys/spa.h>
+#include <sys/txg.h>
+#include <sys/dbuf.h>
+#include <sys/policy.h>
+#include <sys/zfs_vnops.h>
+#include <sys/zfs_quota.h>
+#include <sys/zfs_vfsops.h>
+#include <sys/zfs_znode.h>
+
+
+static ulong_t zfs_fsync_sync_cnt = 4;
+
+int
+zfs_fsync(znode_t *zp, int syncflag, cred_t *cr)
+{
+ zfsvfs_t *zfsvfs = ZTOZSB(zp);
+
+ (void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt);
+
+ if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) {
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+ zil_commit(zfsvfs->z_log, zp->z_id);
+ ZFS_EXIT(zfsvfs);
+ }
+ tsd_set(zfs_fsyncer_key, NULL);
+
+ return (0);
+}
+
+
+#if defined(SEEK_HOLE) && defined(SEEK_DATA)
+/*
+ * Lseek support for finding holes (cmd == SEEK_HOLE) and
+ * data (cmd == SEEK_DATA). "off" is an in/out parameter.
+ */
+static int
+zfs_holey_common(znode_t *zp, ulong_t cmd, loff_t *off)
+{
+ uint64_t noff = (uint64_t)*off; /* new offset */
+ uint64_t file_sz;
+ int error;
+ boolean_t hole;
+
+ file_sz = zp->z_size;
+ if (noff >= file_sz) {
+ return (SET_ERROR(ENXIO));
+ }
+
+ if (cmd == F_SEEK_HOLE)
+ hole = B_TRUE;
+ else
+ hole = B_FALSE;
+
+ error = dmu_offset_next(ZTOZSB(zp)->z_os, zp->z_id, hole, &noff);
+
+ if (error == ESRCH)
+ return (SET_ERROR(ENXIO));
+
+ /* file was dirty, so fall back to using generic logic */
+ if (error == EBUSY) {
+ if (hole)
+ *off = file_sz;
+
+ return (0);
+ }
+
+ /*
+ * We could find a hole that begins after the logical end-of-file,
+ * because dmu_offset_next() only works on whole blocks. If the
+ * EOF falls mid-block, then indicate that the "virtual hole"
+ * at the end of the file begins at the logical EOF, rather than
+ * at the end of the last block.
+ */
+ if (noff > file_sz) {
+ ASSERT(hole);
+ noff = file_sz;
+ }
+
+ if (noff < *off)
+ return (error);
+ *off = noff;
+ return (error);
+}
+
+int
+zfs_holey(znode_t *zp, ulong_t cmd, loff_t *off)
+{
+ zfsvfs_t *zfsvfs = ZTOZSB(zp);
+ int error;
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ error = zfs_holey_common(zp, cmd, off);
+
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+#endif /* SEEK_HOLE && SEEK_DATA */
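+
+/*
+ * Illustrative sketch (not part of this module): from userspace the
+ * SEEK_HOLE/SEEK_DATA support above is reached through plain lseek(2).
+ * The hypothetical program below prints the first hole at or after offset
+ * 0 of the file named on the command line.
+ */
+#define _GNU_SOURCE		/* for SEEK_HOLE/SEEK_DATA on Linux */
+#include <sys/types.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <unistd.h>
+
+int
+main(int argc, char **argv)
+{
+    off_t hole;
+    int fd;
+
+    if (argc != 2 || (fd = open(argv[1], O_RDONLY)) == -1)
+        return (1);
+
+    hole = lseek(fd, 0, SEEK_HOLE);
+    if (hole == -1)
+        perror("lseek");	/* e.g. ENXIO when the offset is past EOF */
+    else
+        printf("first hole at %lld\n", (long long)hole);
+
+    close(fd);
+    return (0);
+}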
+
+/*ARGSUSED*/
+int
+zfs_access(znode_t *zp, int mode, int flag, cred_t *cr)
+{
+ zfsvfs_t *zfsvfs = ZTOZSB(zp);
+ int error;
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ if (flag & V_ACE_MASK)
+ error = zfs_zaccess(zp, mode, flag, B_FALSE, cr);
+ else
+ error = zfs_zaccess_rwx(zp, mode, flag, cr);
+
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+static unsigned long zfs_vnops_read_chunk_size = 1024 * 1024; /* Tunable */
+
+/*
+ * Read bytes from specified file into supplied buffer.
+ *
+ * IN: zp - inode of file to be read from.
+ * uio - structure supplying read location, range info,
+ * and return buffer.
+ * ioflag - O_SYNC flags; used to provide FRSYNC semantics.
+ * O_DIRECT flag; used to bypass page cache.
+ * cr - credentials of caller.
+ *
+ * OUT: uio - updated offset and range, buffer filled.
+ *
+ * RETURN: 0 on success, error code on failure.
+ *
+ * Side Effects:
+ * inode - atime updated if byte count > 0
+ */
+/* ARGSUSED */
+int
+zfs_read(struct znode *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
+{
+ int error = 0;
+ boolean_t frsync = B_FALSE;
+
+ zfsvfs_t *zfsvfs = ZTOZSB(zp);
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ if (zp->z_pflags & ZFS_AV_QUARANTINED) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EACCES));
+ }
+
+ /* We don't copy out anything useful for directories. */
+ if (Z_ISDIR(ZTOTYPE(zp))) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EISDIR));
+ }
+
+ /*
+ * Validate file offset
+ */
+ if (zfs_uio_offset(uio) < (offset_t)0) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EINVAL));
+ }
+
+ /*
+ * Fasttrack empty reads
+ */
+ if (zfs_uio_resid(uio) == 0) {
+ ZFS_EXIT(zfsvfs);
+ return (0);
+ }
+
+#ifdef FRSYNC
+ /*
+ * If we're in FRSYNC mode, sync out this znode before reading it.
+ * Only do this for non-snapshots.
+ *
+ * Some platforms do not support FRSYNC and instead map it
+ * to O_SYNC, which results in unnecessary calls to zil_commit. We
+ * only honor FRSYNC requests on platforms which support it.
+ */
+ frsync = !!(ioflag & FRSYNC);
+#endif
+ if (zfsvfs->z_log &&
+ (frsync || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS))
+ zil_commit(zfsvfs->z_log, zp->z_id);
+
+ /*
+ * Lock the range against changes.
+ */
+ zfs_locked_range_t *lr = zfs_rangelock_enter(&zp->z_rangelock,
+ zfs_uio_offset(uio), zfs_uio_resid(uio), RL_READER);
+
+ /*
+ * If we are reading past end-of-file we can skip
+ * to the end; but we might still need to set atime.
+ */
+ if (zfs_uio_offset(uio) >= zp->z_size) {
+ error = 0;
+ goto out;
+ }
+
+ ASSERT(zfs_uio_offset(uio) < zp->z_size);
+ ssize_t n = MIN(zfs_uio_resid(uio), zp->z_size - zfs_uio_offset(uio));
+ ssize_t start_resid = n;
+
+ while (n > 0) {
+ ssize_t nbytes = MIN(n, zfs_vnops_read_chunk_size -
+ P2PHASE(zfs_uio_offset(uio), zfs_vnops_read_chunk_size));
+#ifdef UIO_NOCOPY
+ if (zfs_uio_segflg(uio) == UIO_NOCOPY)
+ error = mappedread_sf(zp, nbytes, uio);
+ else
+#endif
+ if (zn_has_cached_data(zp) && !(ioflag & O_DIRECT)) {
+ error = mappedread(zp, nbytes, uio);
+ } else {
+ error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
+ uio, nbytes);
+ }
+
+ if (error) {
+ /* convert checksum errors into IO errors */
+ if (error == ECKSUM)
+ error = SET_ERROR(EIO);
+ break;
+ }
+
+ n -= nbytes;
+ }
+
+ int64_t nread = start_resid - n;
+ dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, nread);
+ task_io_account_read(nread);
+out:
+ zfs_rangelock_exit(lr);
+
+ ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
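+
+/*
+ * Illustrative sketch (not part of this module): the read loop above slices
+ * the request so each iteration stops at the next chunk boundary, where
+ * P2PHASE(off, chunk) is the offset within the current power-of-two sized
+ * chunk.  A standalone model of that arithmetic:
+ */
+#include <stdint.h>
+#include <stdio.h>
+
+#define CHUNK   (1024 * 1024)   /* mirrors zfs_vnops_read_chunk_size */
+#define MIN(a, b)   ((a) < (b) ? (a) : (b))
+
+int
+main(void)
+{
+    uint64_t off = 1048000;     /* arbitrary starting offset */
+    uint64_t n = 3 * CHUNK;     /* bytes remaining to read */
+
+    while (n > 0) {
+        /* P2PHASE(off, CHUNK) == (off & (CHUNK - 1)) for power-of-two CHUNK */
+        uint64_t nbytes = MIN(n, CHUNK - (off & (CHUNK - 1)));
+
+        printf("read %llu bytes at offset %llu\n",
+            (unsigned long long)nbytes, (unsigned long long)off);
+        off += nbytes;
+        n -= nbytes;
+    }
+    return (0);
+}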
+
+/*
+ * Write the bytes to a file.
+ *
+ * IN: zp - znode of file to be written to.
+ * uio - structure supplying write location, range info,
+ * and data buffer.
+ * ioflag - O_APPEND flag set if in append mode.
+ * O_DIRECT flag; used to bypass page cache.
+ * cr - credentials of caller.
+ *
+ * OUT: uio - updated offset and range.
+ *
+ * RETURN: 0 if success
+ * error code if failure
+ *
+ * Timestamps:
+ * ip - ctime|mtime updated if byte count > 0
+ */
+
+/* ARGSUSED */
+int
+zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
+{
+ int error = 0;
+ ssize_t start_resid = zfs_uio_resid(uio);
+
+ /*
+ * Fasttrack empty write
+ */
+ ssize_t n = start_resid;
+ if (n == 0)
+ return (0);
+
+ zfsvfs_t *zfsvfs = ZTOZSB(zp);
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ sa_bulk_attr_t bulk[4];
+ int count = 0;
+ uint64_t mtime[2], ctime[2];
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
+ &zp->z_size, 8);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+ &zp->z_pflags, 8);
+
+ /*
+ * Callers might not be able to detect properly that we are read-only,
+ * so check it explicitly here.
+ */
+ if (zfs_is_readonly(zfsvfs)) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EROFS));
+ }
+
+ /*
+ * If the file is immutable or read-only, or is append-only and
+ * this write does not append to the end of the file, return EPERM.
+ */
+ if ((zp->z_pflags & (ZFS_IMMUTABLE | ZFS_READONLY)) ||
+ ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & O_APPEND) &&
+ (zfs_uio_offset(uio) < zp->z_size))) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EPERM));
+ }
+
+ /*
+ * Validate file offset
+ */
+ offset_t woff = ioflag & O_APPEND ? zp->z_size : zfs_uio_offset(uio);
+ if (woff < 0) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EINVAL));
+ }
+
+ const uint64_t max_blksz = zfsvfs->z_max_blksz;
+
+ /*
+ * Pre-fault the pages to ensure slow (e.g. NFS) pages
+ * don't hold up the txg.
+ * Skip this if the uio contains a loaned arc_buf.
+ */
+ if (zfs_uio_prefaultpages(MIN(n, max_blksz), uio)) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EFAULT));
+ }
+
+ /*
+ * If in append mode, set the io offset pointer to eof.
+ */
+ zfs_locked_range_t *lr;
+ if (ioflag & O_APPEND) {
+ /*
+ * Obtain an appending range lock to guarantee file append
+ * semantics. We reset the write offset once we have the lock.
+ */
+ lr = zfs_rangelock_enter(&zp->z_rangelock, 0, n, RL_APPEND);
+ woff = lr->lr_offset;
+ if (lr->lr_length == UINT64_MAX) {
+ /*
+ * We overlocked the file because this write will cause
+ * the file block size to increase.
+ * Note that zp_size cannot change with this lock held.
+ */
+ woff = zp->z_size;
+ }
+ zfs_uio_setoffset(uio, woff);
+ } else {
+ /*
+ * Note that if the file block size will change as a result of
+ * this write, then this range lock will lock the entire file
+ * so that we can re-write the block safely.
+ */
+ lr = zfs_rangelock_enter(&zp->z_rangelock, woff, n, RL_WRITER);
+ }
+
+ if (zn_rlimit_fsize(zp, uio)) {
+ zfs_rangelock_exit(lr);
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EFBIG));
+ }
+
+ const rlim64_t limit = MAXOFFSET_T;
+
+ if (woff >= limit) {
+ zfs_rangelock_exit(lr);
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EFBIG));
+ }
+
+ if (n > limit - woff)
+ n = limit - woff;
+
+ uint64_t end_size = MAX(zp->z_size, woff + n);
+ zilog_t *zilog = zfsvfs->z_log;
+
+ const uint64_t uid = KUID_TO_SUID(ZTOUID(zp));
+ const uint64_t gid = KGID_TO_SGID(ZTOGID(zp));
+ const uint64_t projid = zp->z_projid;
+
+ /*
+ * Write the file in reasonable size chunks. Each chunk is written
+ * in a separate transaction; this keeps the intent log records small
+ * and allows us to do more fine-grained space accounting.
+ */
+ while (n > 0) {
+ woff = zfs_uio_offset(uio);
+
+ if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT, uid) ||
+ zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT, gid) ||
+ (projid != ZFS_DEFAULT_PROJID &&
+ zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT,
+ projid))) {
+ error = SET_ERROR(EDQUOT);
+ break;
+ }
+
+ arc_buf_t *abuf = NULL;
+ if (n >= max_blksz && woff >= zp->z_size &&
+ P2PHASE(woff, max_blksz) == 0 &&
+ zp->z_blksz == max_blksz) {
+ /*
+ * This write covers a full block. "Borrow" a buffer
+ * from the dmu so that we can fill it before we enter
+ * a transaction. This avoids the possibility of
+ * holding up the transaction if the data copy hangs
+ * up on a pagefault (e.g., from an NFS server mapping).
+ */
+ size_t cbytes;
+
+ abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
+ max_blksz);
+ ASSERT(abuf != NULL);
+ ASSERT(arc_buf_size(abuf) == max_blksz);
+ if ((error = zfs_uiocopy(abuf->b_data, max_blksz,
+ UIO_WRITE, uio, &cbytes))) {
+ dmu_return_arcbuf(abuf);
+ break;
+ }
+ ASSERT3S(cbytes, ==, max_blksz);
+ }
+
+ /*
+ * Start a transaction.
+ */
+ dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)sa_get_db(zp->z_sa_hdl);
+ DB_DNODE_ENTER(db);
+ dmu_tx_hold_write_by_dnode(tx, DB_DNODE(db), woff,
+ MIN(n, max_blksz));
+ DB_DNODE_EXIT(db);
+ zfs_sa_upgrade_txholds(tx, zp);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ dmu_tx_abort(tx);
+ if (abuf != NULL)
+ dmu_return_arcbuf(abuf);
+ break;
+ }
+
+ /*
+ * If rangelock_enter() over-locked we grow the blocksize
+ * and then reduce the lock range. This will only happen
+ * on the first iteration since rangelock_reduce() will
+ * shrink down lr_length to the appropriate size.
+ */
+ if (lr->lr_length == UINT64_MAX) {
+ uint64_t new_blksz;
+
+ if (zp->z_blksz > max_blksz) {
+ /*
+ * File's blocksize is already larger than the
+ * "recordsize" property. Only let it grow to
+ * the next power of 2.
+ */
+ ASSERT(!ISP2(zp->z_blksz));
+ new_blksz = MIN(end_size,
+ 1 << highbit64(zp->z_blksz));
+ } else {
+ new_blksz = MIN(end_size, max_blksz);
+ }
+ zfs_grow_blocksize(zp, new_blksz, tx);
+ zfs_rangelock_reduce(lr, woff, n);
+ }
+
+ /*
+ * XXX - should we really limit each write to z_max_blksz?
+ * Perhaps we should use SPA_MAXBLOCKSIZE chunks?
+ */
+ const ssize_t nbytes =
+ MIN(n, max_blksz - P2PHASE(woff, max_blksz));
+
+ ssize_t tx_bytes;
+ if (abuf == NULL) {
+ tx_bytes = zfs_uio_resid(uio);
+ zfs_uio_fault_disable(uio, B_TRUE);
+ error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl),
+ uio, nbytes, tx);
+ zfs_uio_fault_disable(uio, B_FALSE);
+#ifdef __linux__
+ if (error == EFAULT) {
+ dmu_tx_commit(tx);
+ /*
+ * Account for partial writes before
+ * continuing the loop.
+ * Update needs to occur before the next
+ * zfs_uio_prefaultpages, or prefaultpages may
+ * error, and we may break the loop early.
+ */
+ if (tx_bytes != zfs_uio_resid(uio))
+ n -= tx_bytes - zfs_uio_resid(uio);
+ if (zfs_uio_prefaultpages(MIN(n, max_blksz),
+ uio)) {
+ break;
+ }
+ continue;
+ }
+#endif
+ if (error != 0) {
+ dmu_tx_commit(tx);
+ break;
+ }
+ tx_bytes -= zfs_uio_resid(uio);
+ } else {
+ /* Implied by abuf != NULL: */
+ ASSERT3S(n, >=, max_blksz);
+ ASSERT0(P2PHASE(woff, max_blksz));
+ /*
+ * We can simplify nbytes to MIN(n, max_blksz) since
+ * P2PHASE(woff, max_blksz) is 0, and knowing
+ * n >= max_blksz lets us simplify further:
+ */
+ ASSERT3S(nbytes, ==, max_blksz);
+ /*
+ * Thus, we're writing a full block at a block-aligned
+ * offset and extending the file past EOF.
+ *
+ * dmu_assign_arcbuf_by_dbuf() will directly assign the
+ * arc buffer to a dbuf.
+ */
+ error = dmu_assign_arcbuf_by_dbuf(
+ sa_get_db(zp->z_sa_hdl), woff, abuf, tx);
+ if (error != 0) {
+ dmu_return_arcbuf(abuf);
+ dmu_tx_commit(tx);
+ break;
+ }
+ ASSERT3S(nbytes, <=, zfs_uio_resid(uio));
+ zfs_uioskip(uio, nbytes);
+ tx_bytes = nbytes;
+ }
+ if (tx_bytes && zn_has_cached_data(zp) &&
+ !(ioflag & O_DIRECT)) {
+ update_pages(zp, woff, tx_bytes, zfsvfs->z_os);
+ }
+
+ /*
+ * If we made no progress, we're done. If we made even
+ * partial progress, update the znode and ZIL accordingly.
+ */
+ if (tx_bytes == 0) {
+ (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
+ (void *)&zp->z_size, sizeof (uint64_t), tx);
+ dmu_tx_commit(tx);
+ ASSERT(error != 0);
+ break;
+ }
+
+ /*
+ * Clear Set-UID/Set-GID bits on successful write if not
+ * privileged and at least one of the execute bits is set.
+ *
+ * It would be nice to do this after all writes have
+ * been done, but that would still expose the ISUID/ISGID
+ * to another app after the partial write is committed.
+ *
+ * Note: we don't call zfs_fuid_map_id() here because
+ * user 0 is not an ephemeral uid.
+ */
+ mutex_enter(&zp->z_acl_lock);
+ if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) |
+ (S_IXUSR >> 6))) != 0 &&
+ (zp->z_mode & (S_ISUID | S_ISGID)) != 0 &&
+ secpolicy_vnode_setid_retain(zp, cr,
+ ((zp->z_mode & S_ISUID) != 0 && uid == 0)) != 0) {
+ uint64_t newmode;
+ zp->z_mode &= ~(S_ISUID | S_ISGID);
+ newmode = zp->z_mode;
+ (void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs),
+ (void *)&newmode, sizeof (uint64_t), tx);
+ }
+ mutex_exit(&zp->z_acl_lock);
+
+ zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime);
+
+ /*
+ * Update the file size (zp_size) if it has changed;
+ * account for possible concurrent updates.
+ */
+ while ((end_size = zp->z_size) < zfs_uio_offset(uio)) {
+ (void) atomic_cas_64(&zp->z_size, end_size,
+ zfs_uio_offset(uio));
+ ASSERT(error == 0);
+ }
+ /*
+ * If we are replaying and eof is non-zero then force
+ * the file size to the specified eof. Note, there's no
+ * concurrency during replay.
+ */
+ if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0)
+ zp->z_size = zfsvfs->z_replay_eof;
+
+ error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
+
+ zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag,
+ NULL, NULL);
+ dmu_tx_commit(tx);
+
+ if (error != 0)
+ break;
+ ASSERT3S(tx_bytes, ==, nbytes);
+ n -= nbytes;
+
+ if (n > 0) {
+ if (zfs_uio_prefaultpages(MIN(n, max_blksz), uio)) {
+ error = SET_ERROR(EFAULT);
+ break;
+ }
+ }
+ }
+
+ zfs_znode_update_vfs(zp);
+ zfs_rangelock_exit(lr);
+
+ /*
+ * If we're in replay mode, or we made no progress, or the
+ * uio data is inaccessible, return an error.  Otherwise, it's
+ * at least a partial write, so it's successful.
+ */
+ if (zfsvfs->z_replay || zfs_uio_resid(uio) == start_resid ||
+ error == EFAULT) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ if (ioflag & (O_SYNC | O_DSYNC) ||
+ zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ zil_commit(zilog, zp->z_id);
+
+ const int64_t nwritten = start_resid - zfs_uio_resid(uio);
+ dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, nwritten);
+ task_io_account_write(nwritten);
+
+ ZFS_EXIT(zfsvfs);
+ return (0);
+}
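+
+/*
+ * Illustrative sketch (not part of this module): the block-size growth rule
+ * used in the write loop above.  When the file's block size already exceeds
+ * the dataset's recordsize it may only grow to the next power of two (and
+ * no further than the new end of file); otherwise it grows toward the
+ * recordsize.  A standalone model, with a local highbit64() standing in for
+ * the kernel's:
+ */
+#include <stdint.h>
+#include <stdio.h>
+
+#define MIN(a, b)   ((a) < (b) ? (a) : (b))
+
+static int
+highbit64(uint64_t i)   /* index of the highest set bit, 1-based */
+{
+    int h = 0;
+
+    while (i != 0) {
+        h++;
+        i >>= 1;
+    }
+    return (h);
+}
+
+static uint64_t
+new_blocksize(uint64_t blksz, uint64_t max_blksz, uint64_t end_size)
+{
+    if (blksz > max_blksz)  /* already beyond recordsize: next power of 2 */
+        return (MIN(end_size, 1ULL << highbit64(blksz)));
+    return (MIN(end_size, max_blksz));
+}
+
+int
+main(void)
+{
+    /* 192K block, 128K recordsize, file growing to 1M: grows to 256K. */
+    printf("%llu\n", (unsigned long long)new_blocksize(196608, 131072,
+        1048576));
+    /* Small file growing to 4K with 128K recordsize: grows to 4K. */
+    printf("%llu\n", (unsigned long long)new_blocksize(512, 131072, 4096));
+    return (0);
+}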
+
+/*ARGSUSED*/
+int
+zfs_getsecattr(znode_t *zp, vsecattr_t *vsecp, int flag, cred_t *cr)
+{
+ zfsvfs_t *zfsvfs = ZTOZSB(zp);
+ int error;
+ boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+ error = zfs_getacl(zp, vsecp, skipaclchk, cr);
+ ZFS_EXIT(zfsvfs);
+
+ return (error);
+}
+
+/*ARGSUSED*/
+int
+zfs_setsecattr(znode_t *zp, vsecattr_t *vsecp, int flag, cred_t *cr)
+{
+ zfsvfs_t *zfsvfs = ZTOZSB(zp);
+ int error;
+ boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
+ zilog_t *zilog = zfsvfs->z_log;
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ error = zfs_setacl(zp, vsecp, skipaclchk, cr);
+
+ if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ zil_commit(zilog, 0);
+
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+#ifdef ZFS_DEBUG
+static int zil_fault_io = 0;
+#endif
+
+static void zfs_get_done(zgd_t *zgd, int error);
+
+/*
+ * Get data to generate a TX_WRITE intent log record.
+ */
+int
+zfs_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio)
+{
+ zfsvfs_t *zfsvfs = arg;
+ objset_t *os = zfsvfs->z_os;
+ znode_t *zp;
+ uint64_t object = lr->lr_foid;
+ uint64_t offset = lr->lr_offset;
+ uint64_t size = lr->lr_length;
+ dmu_buf_t *db;
+ zgd_t *zgd;
+ int error = 0;
+
+ ASSERT3P(lwb, !=, NULL);
+ ASSERT3P(zio, !=, NULL);
+ ASSERT3U(size, !=, 0);
+
+ /*
+ * Nothing to do if the file has been removed
+ */
+ if (zfs_zget(zfsvfs, object, &zp) != 0)
+ return (SET_ERROR(ENOENT));
+ if (zp->z_unlinked) {
+ /*
+ * Release the vnode asynchronously as we currently have the
+ * txg stopped from syncing.
+ */
+ zfs_zrele_async(zp);
+ return (SET_ERROR(ENOENT));
+ }
+
+ zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
+ zgd->zgd_lwb = lwb;
+ zgd->zgd_private = zp;
+
+ /*
+ * Write records come in two flavors: immediate and indirect.
+ * For small writes it's cheaper to store the data with the
+ * log record (immediate); for large writes it's cheaper to
+ * sync the data and get a pointer to it (indirect) so that
+ * we don't have to write the data twice.
+ */
+ if (buf != NULL) { /* immediate write */
+ zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock,
+ offset, size, RL_READER);
+ /* test for truncation needs to be done while range locked */
+ if (offset >= zp->z_size) {
+ error = SET_ERROR(ENOENT);
+ } else {
+ error = dmu_read(os, object, offset, size, buf,
+ DMU_READ_NO_PREFETCH);
+ }
+ ASSERT(error == 0 || error == ENOENT);
+ } else { /* indirect write */
+ /*
+ * We have to lock the whole block to ensure that no one can
+ * change the data while it's being written out and its
+ * checksum is being calculated.  We need to re-check the
+ * blocksize after we get the lock, in case it has changed!
+ */
+ for (;;) {
+ uint64_t blkoff;
+ size = zp->z_blksz;
+ blkoff = ISP2(size) ? P2PHASE(offset, size) : offset;
+ offset -= blkoff;
+ zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock,
+ offset, size, RL_READER);
+ if (zp->z_blksz == size)
+ break;
+ offset += blkoff;
+ zfs_rangelock_exit(zgd->zgd_lr);
+ }
+ /* test for truncation needs to be done while range locked */
+ if (lr->lr_offset >= zp->z_size)
+ error = SET_ERROR(ENOENT);
+#ifdef ZFS_DEBUG
+ if (zil_fault_io) {
+ error = SET_ERROR(EIO);
+ zil_fault_io = 0;
+ }
+#endif
+ if (error == 0)
+ error = dmu_buf_hold(os, object, offset, zgd, &db,
+ DMU_READ_NO_PREFETCH);
+
+ if (error == 0) {
+ blkptr_t *bp = &lr->lr_blkptr;
+
+ zgd->zgd_db = db;
+ zgd->zgd_bp = bp;
+
+ ASSERT(db->db_offset == offset);
+ ASSERT(db->db_size == size);
+
+ error = dmu_sync(zio, lr->lr_common.lrc_txg,
+ zfs_get_done, zgd);
+ ASSERT(error || lr->lr_length <= size);
+
+ /*
+ * On success, we need to wait for the write I/O
+ * initiated by dmu_sync() to complete before we can
+ * release this dbuf. We will finish everything up
+ * in the zfs_get_done() callback.
+ */
+ if (error == 0)
+ return (0);
+
+ if (error == EALREADY) {
+ lr->lr_common.lrc_txtype = TX_WRITE2;
+ /*
+ * TX_WRITE2 relies on the data previously
+ * written by the TX_WRITE that caused
+ * EALREADY. We zero out the BP because
+ * it is the old, currently-on-disk BP.
+ */
+ zgd->zgd_bp = NULL;
+ BP_ZERO(bp);
+ error = 0;
+ }
+ }
+ }
+
+ zfs_get_done(zgd, error);
+
+ return (error);
+}
+
+
+/* ARGSUSED */
+static void
+zfs_get_done(zgd_t *zgd, int error)
+{
+ znode_t *zp = zgd->zgd_private;
+
+ if (zgd->zgd_db)
+ dmu_buf_rele(zgd->zgd_db, zgd);
+
+ zfs_rangelock_exit(zgd->zgd_lr);
+
+ /*
+ * Release the vnode asynchronously as we currently have the
+ * txg stopped from syncing.
+ */
+ zfs_zrele_async(zp);
+
+ kmem_free(zgd, sizeof (zgd_t));
+}
+
+EXPORT_SYMBOL(zfs_access);
+EXPORT_SYMBOL(zfs_fsync);
+EXPORT_SYMBOL(zfs_holey);
+EXPORT_SYMBOL(zfs_read);
+EXPORT_SYMBOL(zfs_write);
+EXPORT_SYMBOL(zfs_getsecattr);
+EXPORT_SYMBOL(zfs_setsecattr);
+
+ZFS_MODULE_PARAM(zfs_vnops, zfs_vnops_, read_chunk_size, ULONG, ZMOD_RW,
+ "Bytes to read per chunk");
diff --git a/sys/contrib/openzfs/module/zfs/zil.c b/sys/contrib/openzfs/module/zfs/zil.c
new file mode 100644
index 000000000000..7b52f9249298
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/zil.c
@@ -0,0 +1,3695 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
+ * Copyright (c) 2018 Datto Inc.
+ */
+
+/* Portions Copyright 2010 Robert Milkowski */
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/dmu.h>
+#include <sys/zap.h>
+#include <sys/arc.h>
+#include <sys/stat.h>
+#include <sys/zil.h>
+#include <sys/zil_impl.h>
+#include <sys/dsl_dataset.h>
+#include <sys/vdev_impl.h>
+#include <sys/dmu_tx.h>
+#include <sys/dsl_pool.h>
+#include <sys/metaslab.h>
+#include <sys/trace_zfs.h>
+#include <sys/abd.h>
+
+/*
+ * The ZFS Intent Log (ZIL) saves "transaction records" (itxs) of system
+ * calls that change the file system. Each itx has enough information to
+ * be able to replay them after a system crash, power loss, or
+ * equivalent failure mode. These are stored in memory until either:
+ *
+ * 1. they are committed to the pool by the DMU transaction group
+ * (txg), at which point they can be discarded; or
+ * 2. they are committed to the on-disk ZIL for the dataset being
+ * modified (e.g. due to an fsync, O_DSYNC, or other synchronous
+ * requirement).
+ *
+ * In the event of a crash or power loss, the itxs contained by each
+ * dataset's on-disk ZIL will be replayed when that dataset is first
+ * instantiated (e.g. if the dataset is a normal filesystem, when it is
+ * first mounted).
+ *
+ * As hinted at above, there is one ZIL per dataset (both the in-memory
+ * representation, and the on-disk representation). The on-disk format
+ * consists of 3 parts:
+ *
+ * - a single, per-dataset, ZIL header; which points to a chain of
+ * - zero or more ZIL blocks; each of which contains
+ * - zero or more ZIL records
+ *
+ * A ZIL record holds the information necessary to replay a single
+ * system call transaction. A ZIL block can hold many ZIL records, and
+ * the blocks are chained together, similarly to a singly linked list.
+ *
+ * Each ZIL block contains a block pointer (blkptr_t) to the next ZIL
+ * block in the chain, and the ZIL header points to the first block in
+ * the chain.
+ *
+ * Note, there is not a fixed place in the pool to hold these ZIL
+ * blocks; they are dynamically allocated and freed as needed from the
+ * blocks available on the pool, though they can be preferentially
+ * allocated from a dedicated "log" vdev.
+ */
+
+/*
+ * This controls the amount of time that a ZIL block (lwb) will remain
+ * "open" when it isn't "full", and it has a thread waiting for it to be
+ * committed to stable storage. Please refer to the zil_commit_waiter()
+ * function (and the comments within it) for more details.
+ */
+int zfs_commit_timeout_pct = 5;
+
+/*
+ * See zil.h for more information about these fields.
+ */
+zil_stats_t zil_stats = {
+ { "zil_commit_count", KSTAT_DATA_UINT64 },
+ { "zil_commit_writer_count", KSTAT_DATA_UINT64 },
+ { "zil_itx_count", KSTAT_DATA_UINT64 },
+ { "zil_itx_indirect_count", KSTAT_DATA_UINT64 },
+ { "zil_itx_indirect_bytes", KSTAT_DATA_UINT64 },
+ { "zil_itx_copied_count", KSTAT_DATA_UINT64 },
+ { "zil_itx_copied_bytes", KSTAT_DATA_UINT64 },
+ { "zil_itx_needcopy_count", KSTAT_DATA_UINT64 },
+ { "zil_itx_needcopy_bytes", KSTAT_DATA_UINT64 },
+ { "zil_itx_metaslab_normal_count", KSTAT_DATA_UINT64 },
+ { "zil_itx_metaslab_normal_bytes", KSTAT_DATA_UINT64 },
+ { "zil_itx_metaslab_slog_count", KSTAT_DATA_UINT64 },
+ { "zil_itx_metaslab_slog_bytes", KSTAT_DATA_UINT64 },
+};
+
+static kstat_t *zil_ksp;
+
+/*
+ * Disable intent logging replay. This global ZIL switch affects all pools.
+ */
+int zil_replay_disable = 0;
+
+/*
+ * Disable the DKIOCFLUSHWRITECACHE commands that are normally sent to
+ * the disk(s) by the ZIL after an LWB write has completed. Setting this
+ * will cause ZIL corruption on power loss if a volatile out-of-order
+ * write cache is enabled.
+ */
+int zil_nocacheflush = 0;
+
+/*
+ * Limit SLOG write size per commit executed with synchronous priority.
+ * Any writes above that will be executed with lower (asynchronous) priority
+ * to limit potential SLOG device abuse by a single active ZIL writer.
+ */
+unsigned long zil_slog_bulk = 768 * 1024;
+
+static kmem_cache_t *zil_lwb_cache;
+static kmem_cache_t *zil_zcw_cache;
+
+#define LWB_EMPTY(lwb) ((BP_GET_LSIZE(&lwb->lwb_blk) - \
+ sizeof (zil_chain_t)) == (lwb->lwb_sz - lwb->lwb_nused))
+
+static int
+zil_bp_compare(const void *x1, const void *x2)
+{
+ const dva_t *dva1 = &((zil_bp_node_t *)x1)->zn_dva;
+ const dva_t *dva2 = &((zil_bp_node_t *)x2)->zn_dva;
+
+ int cmp = TREE_CMP(DVA_GET_VDEV(dva1), DVA_GET_VDEV(dva2));
+ if (likely(cmp))
+ return (cmp);
+
+ return (TREE_CMP(DVA_GET_OFFSET(dva1), DVA_GET_OFFSET(dva2)));
+}
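+
+/*
+ * Illustrative sketch (not part of this module): zil_bp_compare() above is
+ * a standard two-key AVL comparator, ordering first by vdev and then by
+ * offset, and returning strictly -1, 0, or 1.  A standalone model with
+ * TREE_CMP written out directly:
+ */
+#include <stdint.h>
+#include <stdio.h>
+
+#define CMP(a, b)   (((a) > (b)) - ((a) < (b)))     /* models TREE_CMP */
+
+struct key {
+    uint64_t vdev;
+    uint64_t offset;
+};
+
+static int
+key_compare(const struct key *k1, const struct key *k2)
+{
+    int cmp = CMP(k1->vdev, k2->vdev);
+
+    if (cmp != 0)       /* the primary key decides */
+        return (cmp);
+    return (CMP(k1->offset, k2->offset));   /* tie-break on offset */
+}
+
+int
+main(void)
+{
+    struct key a = { 1, 4096 }, b = { 1, 8192 }, c = { 0, 8192 };
+
+    /* prints "-1 0 1" */
+    printf("%d %d %d\n", key_compare(&a, &b), key_compare(&a, &a),
+        key_compare(&a, &c));
+    return (0);
+}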
+
+static void
+zil_bp_tree_init(zilog_t *zilog)
+{
+ avl_create(&zilog->zl_bp_tree, zil_bp_compare,
+ sizeof (zil_bp_node_t), offsetof(zil_bp_node_t, zn_node));
+}
+
+static void
+zil_bp_tree_fini(zilog_t *zilog)
+{
+ avl_tree_t *t = &zilog->zl_bp_tree;
+ zil_bp_node_t *zn;
+ void *cookie = NULL;
+
+ while ((zn = avl_destroy_nodes(t, &cookie)) != NULL)
+ kmem_free(zn, sizeof (zil_bp_node_t));
+
+ avl_destroy(t);
+}
+
+int
+zil_bp_tree_add(zilog_t *zilog, const blkptr_t *bp)
+{
+ avl_tree_t *t = &zilog->zl_bp_tree;
+ const dva_t *dva;
+ zil_bp_node_t *zn;
+ avl_index_t where;
+
+ if (BP_IS_EMBEDDED(bp))
+ return (0);
+
+ dva = BP_IDENTITY(bp);
+
+ if (avl_find(t, dva, &where) != NULL)
+ return (SET_ERROR(EEXIST));
+
+ zn = kmem_alloc(sizeof (zil_bp_node_t), KM_SLEEP);
+ zn->zn_dva = *dva;
+ avl_insert(t, zn, where);
+
+ return (0);
+}
+
+static zil_header_t *
+zil_header_in_syncing_context(zilog_t *zilog)
+{
+ return ((zil_header_t *)zilog->zl_header);
+}
+
+static void
+zil_init_log_chain(zilog_t *zilog, blkptr_t *bp)
+{
+ zio_cksum_t *zc = &bp->blk_cksum;
+
+ zc->zc_word[ZIL_ZC_GUID_0] = spa_get_random(-1ULL);
+ zc->zc_word[ZIL_ZC_GUID_1] = spa_get_random(-1ULL);
+ zc->zc_word[ZIL_ZC_OBJSET] = dmu_objset_id(zilog->zl_os);
+ zc->zc_word[ZIL_ZC_SEQ] = 1ULL;
+}
+
+/*
+ * Read a log block and make sure it's valid.
+ */
+static int
+zil_read_log_block(zilog_t *zilog, boolean_t decrypt, const blkptr_t *bp,
+ blkptr_t *nbp, void *dst, char **end)
+{
+ enum zio_flag zio_flags = ZIO_FLAG_CANFAIL;
+ arc_flags_t aflags = ARC_FLAG_WAIT;
+ arc_buf_t *abuf = NULL;
+ zbookmark_phys_t zb;
+ int error;
+
+ if (zilog->zl_header->zh_claim_txg == 0)
+ zio_flags |= ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB;
+
+ if (!(zilog->zl_header->zh_flags & ZIL_CLAIM_LR_SEQ_VALID))
+ zio_flags |= ZIO_FLAG_SPECULATIVE;
+
+ if (!decrypt)
+ zio_flags |= ZIO_FLAG_RAW;
+
+ SET_BOOKMARK(&zb, bp->blk_cksum.zc_word[ZIL_ZC_OBJSET],
+ ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]);
+
+ error = arc_read(NULL, zilog->zl_spa, bp, arc_getbuf_func,
+ &abuf, ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb);
+
+ if (error == 0) {
+ zio_cksum_t cksum = bp->blk_cksum;
+
+ /*
+ * Validate the checksummed log block.
+ *
+ * Sequence numbers should be... sequential. The checksum
+ * verifier for the next block should be bp's checksum plus 1.
+ *
+ * Also check the log chain linkage and size used.
+ */
+ cksum.zc_word[ZIL_ZC_SEQ]++;
+
+ if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) {
+ zil_chain_t *zilc = abuf->b_data;
+ char *lr = (char *)(zilc + 1);
+ uint64_t len = zilc->zc_nused - sizeof (zil_chain_t);
+
+ if (bcmp(&cksum, &zilc->zc_next_blk.blk_cksum,
+ sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk)) {
+ error = SET_ERROR(ECKSUM);
+ } else {
+ ASSERT3U(len, <=, SPA_OLD_MAXBLOCKSIZE);
+ bcopy(lr, dst, len);
+ *end = (char *)dst + len;
+ *nbp = zilc->zc_next_blk;
+ }
+ } else {
+ char *lr = abuf->b_data;
+ uint64_t size = BP_GET_LSIZE(bp);
+ zil_chain_t *zilc = (zil_chain_t *)(lr + size) - 1;
+
+ if (bcmp(&cksum, &zilc->zc_next_blk.blk_cksum,
+ sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk) ||
+ (zilc->zc_nused > (size - sizeof (*zilc)))) {
+ error = SET_ERROR(ECKSUM);
+ } else {
+ ASSERT3U(zilc->zc_nused, <=,
+ SPA_OLD_MAXBLOCKSIZE);
+ bcopy(lr, dst, zilc->zc_nused);
+ *end = (char *)dst + zilc->zc_nused;
+ *nbp = zilc->zc_next_blk;
+ }
+ }
+
+ arc_buf_destroy(abuf, &abuf);
+ }
+
+ return (error);
+}
+
+/*
+ * Read a TX_WRITE log data block.
+ */
+static int
+zil_read_log_data(zilog_t *zilog, const lr_write_t *lr, void *wbuf)
+{
+ enum zio_flag zio_flags = ZIO_FLAG_CANFAIL;
+ const blkptr_t *bp = &lr->lr_blkptr;
+ arc_flags_t aflags = ARC_FLAG_WAIT;
+ arc_buf_t *abuf = NULL;
+ zbookmark_phys_t zb;
+ int error;
+
+ if (BP_IS_HOLE(bp)) {
+ if (wbuf != NULL)
+ bzero(wbuf, MAX(BP_GET_LSIZE(bp), lr->lr_length));
+ return (0);
+ }
+
+ if (zilog->zl_header->zh_claim_txg == 0)
+ zio_flags |= ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB;
+
+ /*
+	 * If we are not using the resulting data (we are just checking that
+	 * it hasn't been corrupted), we don't need to waste CPU time
+	 * decompressing and decrypting it.
+ */
+ if (wbuf == NULL)
+ zio_flags |= ZIO_FLAG_RAW;
+
+ SET_BOOKMARK(&zb, dmu_objset_id(zilog->zl_os), lr->lr_foid,
+ ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp));
+
+ error = arc_read(NULL, zilog->zl_spa, bp, arc_getbuf_func, &abuf,
+ ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb);
+
+ if (error == 0) {
+ if (wbuf != NULL)
+ bcopy(abuf->b_data, wbuf, arc_buf_size(abuf));
+ arc_buf_destroy(abuf, &abuf);
+ }
+
+ return (error);
+}
+
+/*
+ * Parse the intent log, and call parse_func for each valid record within.
+ */
+int
+zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
+ zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg,
+ boolean_t decrypt)
+{
+ const zil_header_t *zh = zilog->zl_header;
+ boolean_t claimed = !!zh->zh_claim_txg;
+ uint64_t claim_blk_seq = claimed ? zh->zh_claim_blk_seq : UINT64_MAX;
+ uint64_t claim_lr_seq = claimed ? zh->zh_claim_lr_seq : UINT64_MAX;
+ uint64_t max_blk_seq = 0;
+ uint64_t max_lr_seq = 0;
+ uint64_t blk_count = 0;
+ uint64_t lr_count = 0;
+ blkptr_t blk, next_blk;
+ char *lrbuf, *lrp;
+ int error = 0;
+
+ bzero(&next_blk, sizeof (blkptr_t));
+
+ /*
+ * Old logs didn't record the maximum zh_claim_lr_seq.
+ */
+ if (!(zh->zh_flags & ZIL_CLAIM_LR_SEQ_VALID))
+ claim_lr_seq = UINT64_MAX;
+
+ /*
+ * Starting at the block pointed to by zh_log we read the log chain.
+ * For each block in the chain we strongly check that block to
+ * ensure its validity. We stop when an invalid block is found.
+ * For each block pointer in the chain we call parse_blk_func().
+ * For each record in each valid block we call parse_lr_func().
+ * If the log has been claimed, stop if we encounter a sequence
+ * number greater than the highest claimed sequence number.
+ */
+ lrbuf = zio_buf_alloc(SPA_OLD_MAXBLOCKSIZE);
+ zil_bp_tree_init(zilog);
+
+ for (blk = zh->zh_log; !BP_IS_HOLE(&blk); blk = next_blk) {
+ uint64_t blk_seq = blk.blk_cksum.zc_word[ZIL_ZC_SEQ];
+ int reclen;
+ char *end = NULL;
+
+ if (blk_seq > claim_blk_seq)
+ break;
+
+ error = parse_blk_func(zilog, &blk, arg, txg);
+ if (error != 0)
+ break;
+ ASSERT3U(max_blk_seq, <, blk_seq);
+ max_blk_seq = blk_seq;
+ blk_count++;
+
+ if (max_lr_seq == claim_lr_seq && max_blk_seq == claim_blk_seq)
+ break;
+
+ error = zil_read_log_block(zilog, decrypt, &blk, &next_blk,
+ lrbuf, &end);
+ if (error != 0)
+ break;
+
+ for (lrp = lrbuf; lrp < end; lrp += reclen) {
+ lr_t *lr = (lr_t *)lrp;
+ reclen = lr->lrc_reclen;
+ ASSERT3U(reclen, >=, sizeof (lr_t));
+ if (lr->lrc_seq > claim_lr_seq)
+ goto done;
+
+ error = parse_lr_func(zilog, lr, arg, txg);
+ if (error != 0)
+ goto done;
+ ASSERT3U(max_lr_seq, <, lr->lrc_seq);
+ max_lr_seq = lr->lrc_seq;
+ lr_count++;
+ }
+ }
+done:
+ zilog->zl_parse_error = error;
+ zilog->zl_parse_blk_seq = max_blk_seq;
+ zilog->zl_parse_lr_seq = max_lr_seq;
+ zilog->zl_parse_blk_count = blk_count;
+ zilog->zl_parse_lr_count = lr_count;
+
+ ASSERT(!claimed || !(zh->zh_flags & ZIL_CLAIM_LR_SEQ_VALID) ||
+ (max_blk_seq == claim_blk_seq && max_lr_seq == claim_lr_seq) ||
+ (decrypt && error == EIO));
+
+ zil_bp_tree_fini(zilog);
+ zio_buf_free(lrbuf, SPA_OLD_MAXBLOCKSIZE);
+
+ return (error);
+}
+
+/* ARGSUSED */
+static int
+zil_clear_log_block(zilog_t *zilog, const blkptr_t *bp, void *tx,
+ uint64_t first_txg)
+{
+ ASSERT(!BP_IS_HOLE(bp));
+
+ /*
+ * As we call this function from the context of a rewind to a
+ * checkpoint, each ZIL block whose txg is later than the txg
+ * that we rewind to is invalid. Thus, we return -1 so
+ * zil_parse() doesn't attempt to read it.
+ */
+ if (bp->blk_birth >= first_txg)
+ return (-1);
+
+ if (zil_bp_tree_add(zilog, bp) != 0)
+ return (0);
+
+ zio_free(zilog->zl_spa, first_txg, bp);
+ return (0);
+}
+
+/* ARGSUSED */
+static int
+zil_noop_log_record(zilog_t *zilog, const lr_t *lrc, void *tx,
+ uint64_t first_txg)
+{
+ return (0);
+}
+
+static int
+zil_claim_log_block(zilog_t *zilog, const blkptr_t *bp, void *tx,
+ uint64_t first_txg)
+{
+ /*
+ * Claim log block if not already committed and not already claimed.
+ * If tx == NULL, just verify that the block is claimable.
+ */
+ if (BP_IS_HOLE(bp) || bp->blk_birth < first_txg ||
+ zil_bp_tree_add(zilog, bp) != 0)
+ return (0);
+
+ return (zio_wait(zio_claim(NULL, zilog->zl_spa,
+ tx == NULL ? 0 : first_txg, bp, spa_claim_notify, NULL,
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB)));
+}
+
+static int
+zil_claim_log_record(zilog_t *zilog, const lr_t *lrc, void *tx,
+ uint64_t first_txg)
+{
+ lr_write_t *lr = (lr_write_t *)lrc;
+ int error;
+
+ if (lrc->lrc_txtype != TX_WRITE)
+ return (0);
+
+ /*
+ * If the block is not readable, don't claim it. This can happen
+ * in normal operation when a log block is written to disk before
+ * some of the dmu_sync() blocks it points to. In this case, the
+ * transaction cannot have been committed to anyone (we would have
+ * waited for all writes to be stable first), so it is semantically
+ * correct to declare this the end of the log.
+ */
+ if (lr->lr_blkptr.blk_birth >= first_txg) {
+ error = zil_read_log_data(zilog, lr, NULL);
+ if (error != 0)
+ return (error);
+ }
+
+ return (zil_claim_log_block(zilog, &lr->lr_blkptr, tx, first_txg));
+}
+
+/* ARGSUSED */
+static int
+zil_free_log_block(zilog_t *zilog, const blkptr_t *bp, void *tx,
+ uint64_t claim_txg)
+{
+ zio_free(zilog->zl_spa, dmu_tx_get_txg(tx), bp);
+
+ return (0);
+}
+
+static int
+zil_free_log_record(zilog_t *zilog, const lr_t *lrc, void *tx,
+ uint64_t claim_txg)
+{
+ lr_write_t *lr = (lr_write_t *)lrc;
+ blkptr_t *bp = &lr->lr_blkptr;
+
+ /*
+ * If we previously claimed it, we need to free it.
+ */
+ if (claim_txg != 0 && lrc->lrc_txtype == TX_WRITE &&
+ bp->blk_birth >= claim_txg && zil_bp_tree_add(zilog, bp) == 0 &&
+ !BP_IS_HOLE(bp))
+ zio_free(zilog->zl_spa, dmu_tx_get_txg(tx), bp);
+
+ return (0);
+}
+
+static int
+zil_lwb_vdev_compare(const void *x1, const void *x2)
+{
+ const uint64_t v1 = ((zil_vdev_node_t *)x1)->zv_vdev;
+ const uint64_t v2 = ((zil_vdev_node_t *)x2)->zv_vdev;
+
+ return (TREE_CMP(v1, v2));
+}
+
+static lwb_t *
+zil_alloc_lwb(zilog_t *zilog, blkptr_t *bp, boolean_t slog, uint64_t txg,
+ boolean_t fastwrite)
+{
+ lwb_t *lwb;
+
+ lwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP);
+ lwb->lwb_zilog = zilog;
+ lwb->lwb_blk = *bp;
+ lwb->lwb_fastwrite = fastwrite;
+ lwb->lwb_slog = slog;
+ lwb->lwb_state = LWB_STATE_CLOSED;
+ lwb->lwb_buf = zio_buf_alloc(BP_GET_LSIZE(bp));
+ lwb->lwb_max_txg = txg;
+ lwb->lwb_write_zio = NULL;
+ lwb->lwb_root_zio = NULL;
+ lwb->lwb_tx = NULL;
+ lwb->lwb_issued_timestamp = 0;
+ if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) {
+ lwb->lwb_nused = sizeof (zil_chain_t);
+ lwb->lwb_sz = BP_GET_LSIZE(bp);
+ } else {
+ lwb->lwb_nused = 0;
+ lwb->lwb_sz = BP_GET_LSIZE(bp) - sizeof (zil_chain_t);
+ }
+
+ mutex_enter(&zilog->zl_lock);
+ list_insert_tail(&zilog->zl_lwb_list, lwb);
+ mutex_exit(&zilog->zl_lock);
+
+ ASSERT(!MUTEX_HELD(&lwb->lwb_vdev_lock));
+ ASSERT(avl_is_empty(&lwb->lwb_vdev_tree));
+ VERIFY(list_is_empty(&lwb->lwb_waiters));
+ VERIFY(list_is_empty(&lwb->lwb_itxs));
+
+ return (lwb);
+}
+
+static void
+zil_free_lwb(zilog_t *zilog, lwb_t *lwb)
+{
+ ASSERT(MUTEX_HELD(&zilog->zl_lock));
+ ASSERT(!MUTEX_HELD(&lwb->lwb_vdev_lock));
+ VERIFY(list_is_empty(&lwb->lwb_waiters));
+ VERIFY(list_is_empty(&lwb->lwb_itxs));
+ ASSERT(avl_is_empty(&lwb->lwb_vdev_tree));
+ ASSERT3P(lwb->lwb_write_zio, ==, NULL);
+ ASSERT3P(lwb->lwb_root_zio, ==, NULL);
+ ASSERT3U(lwb->lwb_max_txg, <=, spa_syncing_txg(zilog->zl_spa));
+ ASSERT(lwb->lwb_state == LWB_STATE_CLOSED ||
+ lwb->lwb_state == LWB_STATE_FLUSH_DONE);
+
+ /*
+ * Clear the zilog's field to indicate this lwb is no longer
+ * valid, and prevent use-after-free errors.
+ */
+ if (zilog->zl_last_lwb_opened == lwb)
+ zilog->zl_last_lwb_opened = NULL;
+
+ kmem_cache_free(zil_lwb_cache, lwb);
+}
+
+/*
+ * Called when we create in-memory log transactions so that we know
+ * to clean up the itxs at the end of spa_sync().
+ */
+static void
+zilog_dirty(zilog_t *zilog, uint64_t txg)
+{
+ dsl_pool_t *dp = zilog->zl_dmu_pool;
+ dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os);
+
+ ASSERT(spa_writeable(zilog->zl_spa));
+
+ if (ds->ds_is_snapshot)
+ panic("dirtying snapshot!");
+
+ if (txg_list_add(&dp->dp_dirty_zilogs, zilog, txg)) {
+ /* up the hold count until we can be written out */
+ dmu_buf_add_ref(ds->ds_dbuf, zilog);
+
+ zilog->zl_dirty_max_txg = MAX(txg, zilog->zl_dirty_max_txg);
+ }
+}
+
+/*
+ * Determine if the zil is dirty in the specified txg. Callers wanting to
+ * ensure that the dirty state does not change must hold the itxg_lock for
+ * the specified txg. Holding the lock will ensure that the zil cannot be
+ * dirtied (zil_itx_assign) or cleaned (zil_clean) while we check its current
+ * state.
+ */
+static boolean_t __maybe_unused
+zilog_is_dirty_in_txg(zilog_t *zilog, uint64_t txg)
+{
+ dsl_pool_t *dp = zilog->zl_dmu_pool;
+
+ if (txg_list_member(&dp->dp_dirty_zilogs, zilog, txg & TXG_MASK))
+ return (B_TRUE);
+ return (B_FALSE);
+}
+
+/*
+ * Determine if the zil is dirty. The zil is considered dirty if it has
+ * any pending itx records that have not been cleaned by zil_clean().
+ */
+static boolean_t
+zilog_is_dirty(zilog_t *zilog)
+{
+ dsl_pool_t *dp = zilog->zl_dmu_pool;
+
+ for (int t = 0; t < TXG_SIZE; t++) {
+ if (txg_list_member(&dp->dp_dirty_zilogs, zilog, t))
+ return (B_TRUE);
+ }
+ return (B_FALSE);
+}
+
+/*
+ * Create an on-disk intent log.
+ */
+static lwb_t *
+zil_create(zilog_t *zilog)
+{
+ const zil_header_t *zh = zilog->zl_header;
+ lwb_t *lwb = NULL;
+ uint64_t txg = 0;
+ dmu_tx_t *tx = NULL;
+ blkptr_t blk;
+ int error = 0;
+ boolean_t fastwrite = FALSE;
+ boolean_t slog = FALSE;
+
+ /*
+ * Wait for any previous destroy to complete.
+ */
+ txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg);
+
+ ASSERT(zh->zh_claim_txg == 0);
+ ASSERT(zh->zh_replay_seq == 0);
+
+ blk = zh->zh_log;
+
+ /*
+ * Allocate an initial log block if:
+ * - there isn't one already
+ * - the existing block is the wrong endianness
+ */
+ if (BP_IS_HOLE(&blk) || BP_SHOULD_BYTESWAP(&blk)) {
+ tx = dmu_tx_create(zilog->zl_os);
+ VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
+ dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
+ txg = dmu_tx_get_txg(tx);
+
+ if (!BP_IS_HOLE(&blk)) {
+ zio_free(zilog->zl_spa, txg, &blk);
+ BP_ZERO(&blk);
+ }
+
+ error = zio_alloc_zil(zilog->zl_spa, zilog->zl_os, txg, &blk,
+ ZIL_MIN_BLKSZ, &slog);
+ fastwrite = TRUE;
+
+ if (error == 0)
+ zil_init_log_chain(zilog, &blk);
+ }
+
+ /*
+ * Allocate a log write block (lwb) for the first log block.
+ */
+ if (error == 0)
+ lwb = zil_alloc_lwb(zilog, &blk, slog, txg, fastwrite);
+
+ /*
+ * If we just allocated the first log block, commit our transaction
+ * and wait for zil_sync() to stuff the block pointer into zh_log.
+ * (zh is part of the MOS, so we cannot modify it in open context.)
+ */
+ if (tx != NULL) {
+ dmu_tx_commit(tx);
+ txg_wait_synced(zilog->zl_dmu_pool, txg);
+ }
+
+ ASSERT(error != 0 || bcmp(&blk, &zh->zh_log, sizeof (blk)) == 0);
+ IMPLY(error == 0, lwb != NULL);
+
+ return (lwb);
+}
+
+/*
+ * In one tx, free all log blocks and clear the log header. If keep_first
+ * is set, then we're replaying a log with no content. We want to keep the
+ * first block, however, so that the first synchronous transaction doesn't
+ * require a txg_wait_synced() in zil_create(). We don't need to
+ * txg_wait_synced() here either when keep_first is set, because both
+ * zil_create() and zil_destroy() will wait for any in-progress destroys
+ * to complete.
+ */
+void
+zil_destroy(zilog_t *zilog, boolean_t keep_first)
+{
+ const zil_header_t *zh = zilog->zl_header;
+ lwb_t *lwb;
+ dmu_tx_t *tx;
+ uint64_t txg;
+
+ /*
+ * Wait for any previous destroy to complete.
+ */
+ txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg);
+
+ zilog->zl_old_header = *zh; /* debugging aid */
+
+ if (BP_IS_HOLE(&zh->zh_log))
+ return;
+
+ tx = dmu_tx_create(zilog->zl_os);
+ VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
+ dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
+ txg = dmu_tx_get_txg(tx);
+
+ mutex_enter(&zilog->zl_lock);
+
+ ASSERT3U(zilog->zl_destroy_txg, <, txg);
+ zilog->zl_destroy_txg = txg;
+ zilog->zl_keep_first = keep_first;
+
+ if (!list_is_empty(&zilog->zl_lwb_list)) {
+ ASSERT(zh->zh_claim_txg == 0);
+ VERIFY(!keep_first);
+ while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) {
+ if (lwb->lwb_fastwrite)
+ metaslab_fastwrite_unmark(zilog->zl_spa,
+ &lwb->lwb_blk);
+
+ list_remove(&zilog->zl_lwb_list, lwb);
+ if (lwb->lwb_buf != NULL)
+ zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
+ zio_free(zilog->zl_spa, txg, &lwb->lwb_blk);
+ zil_free_lwb(zilog, lwb);
+ }
+ } else if (!keep_first) {
+ zil_destroy_sync(zilog, tx);
+ }
+ mutex_exit(&zilog->zl_lock);
+
+ dmu_tx_commit(tx);
+}
+
+void
+zil_destroy_sync(zilog_t *zilog, dmu_tx_t *tx)
+{
+ ASSERT(list_is_empty(&zilog->zl_lwb_list));
+ (void) zil_parse(zilog, zil_free_log_block,
+ zil_free_log_record, tx, zilog->zl_header->zh_claim_txg, B_FALSE);
+}
+
+int
+zil_claim(dsl_pool_t *dp, dsl_dataset_t *ds, void *txarg)
+{
+ dmu_tx_t *tx = txarg;
+ zilog_t *zilog;
+ uint64_t first_txg;
+ zil_header_t *zh;
+ objset_t *os;
+ int error;
+
+ error = dmu_objset_own_obj(dp, ds->ds_object,
+ DMU_OST_ANY, B_FALSE, B_FALSE, FTAG, &os);
+ if (error != 0) {
+ /*
+ * EBUSY indicates that the objset is inconsistent, in which
+		 * case it cannot have a ZIL.
+ */
+ if (error != EBUSY) {
+ cmn_err(CE_WARN, "can't open objset for %llu, error %u",
+ (unsigned long long)ds->ds_object, error);
+ }
+
+ return (0);
+ }
+
+ zilog = dmu_objset_zil(os);
+ zh = zil_header_in_syncing_context(zilog);
+ ASSERT3U(tx->tx_txg, ==, spa_first_txg(zilog->zl_spa));
+ first_txg = spa_min_claim_txg(zilog->zl_spa);
+
+ /*
+ * If the spa_log_state is not set to be cleared, check whether
+ * the current uberblock is a checkpoint one and if the current
+ * header has been claimed before moving on.
+ *
+ * If the current uberblock is a checkpointed uberblock then
+ * one of the following scenarios took place:
+ *
+ * 1] We are currently rewinding to the checkpoint of the pool.
+ * 2] We crashed in the middle of a checkpoint rewind but we
+ * did manage to write the checkpointed uberblock to the
+ * vdev labels, so when we tried to import the pool again
+ * the checkpointed uberblock was selected from the import
+ * procedure.
+ *
+ * In both cases we want to zero out all the ZIL blocks, except
+ * the ones that have been claimed at the time of the checkpoint
+ * (their zh_claim_txg != 0). The reason is that these blocks
+ * may be corrupted since we may have reused their locations on
+ * disk after we took the checkpoint.
+ *
+ * We could try to set spa_log_state to SPA_LOG_CLEAR earlier
+ * when we first figure out whether the current uberblock is
+ * checkpointed or not. Unfortunately, that would discard all
+ * the logs, including the ones that are claimed, and we would
+ * leak space.
+ */
+ if (spa_get_log_state(zilog->zl_spa) == SPA_LOG_CLEAR ||
+ (zilog->zl_spa->spa_uberblock.ub_checkpoint_txg != 0 &&
+ zh->zh_claim_txg == 0)) {
+ if (!BP_IS_HOLE(&zh->zh_log)) {
+ (void) zil_parse(zilog, zil_clear_log_block,
+ zil_noop_log_record, tx, first_txg, B_FALSE);
+ }
+ BP_ZERO(&zh->zh_log);
+ if (os->os_encrypted)
+ os->os_next_write_raw[tx->tx_txg & TXG_MASK] = B_TRUE;
+ dsl_dataset_dirty(dmu_objset_ds(os), tx);
+ dmu_objset_disown(os, B_FALSE, FTAG);
+ return (0);
+ }
+
+ /*
+ * If we are not rewinding and opening the pool normally, then
+ * the min_claim_txg should be equal to the first txg of the pool.
+ */
+ ASSERT3U(first_txg, ==, spa_first_txg(zilog->zl_spa));
+
+ /*
+ * Claim all log blocks if we haven't already done so, and remember
+ * the highest claimed sequence number. This ensures that if we can
+ * read only part of the log now (e.g. due to a missing device),
+ * but we can read the entire log later, we will not try to replay
+ * or destroy beyond the last block we successfully claimed.
+ */
+ ASSERT3U(zh->zh_claim_txg, <=, first_txg);
+ if (zh->zh_claim_txg == 0 && !BP_IS_HOLE(&zh->zh_log)) {
+ (void) zil_parse(zilog, zil_claim_log_block,
+ zil_claim_log_record, tx, first_txg, B_FALSE);
+ zh->zh_claim_txg = first_txg;
+ zh->zh_claim_blk_seq = zilog->zl_parse_blk_seq;
+ zh->zh_claim_lr_seq = zilog->zl_parse_lr_seq;
+ if (zilog->zl_parse_lr_count || zilog->zl_parse_blk_count > 1)
+ zh->zh_flags |= ZIL_REPLAY_NEEDED;
+ zh->zh_flags |= ZIL_CLAIM_LR_SEQ_VALID;
+ if (os->os_encrypted)
+ os->os_next_write_raw[tx->tx_txg & TXG_MASK] = B_TRUE;
+ dsl_dataset_dirty(dmu_objset_ds(os), tx);
+ }
+
+ ASSERT3U(first_txg, ==, (spa_last_synced_txg(zilog->zl_spa) + 1));
+ dmu_objset_disown(os, B_FALSE, FTAG);
+ return (0);
+}
+
+/*
+ * Check the log by walking the log chain.
+ * Checksum errors are ok as they indicate the end of the chain.
+ * Any other error (no device or read failure) returns an error.
+ */
+/* ARGSUSED */
+int
+zil_check_log_chain(dsl_pool_t *dp, dsl_dataset_t *ds, void *tx)
+{
+ zilog_t *zilog;
+ objset_t *os;
+ blkptr_t *bp;
+ int error;
+
+ ASSERT(tx == NULL);
+
+ error = dmu_objset_from_ds(ds, &os);
+ if (error != 0) {
+ cmn_err(CE_WARN, "can't open objset %llu, error %d",
+ (unsigned long long)ds->ds_object, error);
+ return (0);
+ }
+
+ zilog = dmu_objset_zil(os);
+ bp = (blkptr_t *)&zilog->zl_header->zh_log;
+
+ if (!BP_IS_HOLE(bp)) {
+ vdev_t *vd;
+ boolean_t valid = B_TRUE;
+
+ /*
+ * Check the first block and determine if it's on a log device
+ * which may have been removed or faulted prior to loading this
+ * pool. If so, there's no point in checking the rest of the
+ * log as its content should have already been synced to the
+ * pool.
+ */
+ spa_config_enter(os->os_spa, SCL_STATE, FTAG, RW_READER);
+ vd = vdev_lookup_top(os->os_spa, DVA_GET_VDEV(&bp->blk_dva[0]));
+ if (vd->vdev_islog && vdev_is_dead(vd))
+ valid = vdev_log_state_valid(vd);
+ spa_config_exit(os->os_spa, SCL_STATE, FTAG);
+
+ if (!valid)
+ return (0);
+
+ /*
+ * Check whether the current uberblock is checkpointed (e.g.
+ * we are rewinding) and whether the current header has been
+ * claimed or not. If it hasn't then skip verifying it. We
+ * do this because its ZIL blocks may be part of the pool's
+ * state before the rewind, which is no longer valid.
+ */
+ zil_header_t *zh = zil_header_in_syncing_context(zilog);
+ if (zilog->zl_spa->spa_uberblock.ub_checkpoint_txg != 0 &&
+ zh->zh_claim_txg == 0)
+ return (0);
+ }
+
+ /*
+ * Because tx == NULL, zil_claim_log_block() will not actually claim
+ * any blocks, but just determine whether it is possible to do so.
+ * In addition to checking the log chain, zil_claim_log_block()
+ * will invoke zio_claim() with a done func of spa_claim_notify(),
+ * which will update spa_max_claim_txg. See spa_load() for details.
+ */
+ error = zil_parse(zilog, zil_claim_log_block, zil_claim_log_record, tx,
+ zilog->zl_header->zh_claim_txg ? -1ULL :
+ spa_min_claim_txg(os->os_spa), B_FALSE);
+
+ return ((error == ECKSUM || error == ENOENT) ? 0 : error);
+}
+
+/*
+ * When an itx is "skipped", this function is used to properly mark the
+ * waiter as "done", and signal any thread(s) waiting on it. An itx can
+ * be skipped (and not committed to an lwb) for a variety of reasons,
+ * one of them being that the itx was committed via spa_sync(), prior to
+ * it being committed to an lwb; this can happen if a thread calling
+ * zil_commit() is racing with spa_sync().
+ */
+static void
+zil_commit_waiter_skip(zil_commit_waiter_t *zcw)
+{
+ mutex_enter(&zcw->zcw_lock);
+ ASSERT3B(zcw->zcw_done, ==, B_FALSE);
+ zcw->zcw_done = B_TRUE;
+ cv_broadcast(&zcw->zcw_cv);
+ mutex_exit(&zcw->zcw_lock);
+}
+
+/*
+ * This function is used when the given waiter is to be linked into an
+ * lwb's "lwb_waiters" list; i.e. when the itx is committed to the lwb.
+ * At this point, the waiter will no longer be referenced by the itx,
+ * and instead, will be referenced by the lwb.
+ */
+static void
+zil_commit_waiter_link_lwb(zil_commit_waiter_t *zcw, lwb_t *lwb)
+{
+ /*
+ * The lwb_waiters field of the lwb is protected by the zilog's
+ * zl_lock, thus it must be held when calling this function.
+ */
+ ASSERT(MUTEX_HELD(&lwb->lwb_zilog->zl_lock));
+
+ mutex_enter(&zcw->zcw_lock);
+ ASSERT(!list_link_active(&zcw->zcw_node));
+ ASSERT3P(zcw->zcw_lwb, ==, NULL);
+ ASSERT3P(lwb, !=, NULL);
+ ASSERT(lwb->lwb_state == LWB_STATE_OPENED ||
+ lwb->lwb_state == LWB_STATE_ISSUED ||
+ lwb->lwb_state == LWB_STATE_WRITE_DONE);
+
+ list_insert_tail(&lwb->lwb_waiters, zcw);
+ zcw->zcw_lwb = lwb;
+ mutex_exit(&zcw->zcw_lock);
+}
+
+/*
+ * This function is used when zio_alloc_zil() fails to allocate a ZIL
+ * block, and the given waiter must be linked to the "nolwb waiters"
+ * list inside of zil_process_commit_list().
+ */
+static void
+zil_commit_waiter_link_nolwb(zil_commit_waiter_t *zcw, list_t *nolwb)
+{
+ mutex_enter(&zcw->zcw_lock);
+ ASSERT(!list_link_active(&zcw->zcw_node));
+ ASSERT3P(zcw->zcw_lwb, ==, NULL);
+ list_insert_tail(nolwb, zcw);
+ mutex_exit(&zcw->zcw_lock);
+}
+
+void
+zil_lwb_add_block(lwb_t *lwb, const blkptr_t *bp)
+{
+ avl_tree_t *t = &lwb->lwb_vdev_tree;
+ avl_index_t where;
+ zil_vdev_node_t *zv, zvsearch;
+ int ndvas = BP_GET_NDVAS(bp);
+ int i;
+
+ if (zil_nocacheflush)
+ return;
+
+ mutex_enter(&lwb->lwb_vdev_lock);
+ for (i = 0; i < ndvas; i++) {
+ zvsearch.zv_vdev = DVA_GET_VDEV(&bp->blk_dva[i]);
+ if (avl_find(t, &zvsearch, &where) == NULL) {
+ zv = kmem_alloc(sizeof (*zv), KM_SLEEP);
+ zv->zv_vdev = zvsearch.zv_vdev;
+ avl_insert(t, zv, where);
+ }
+ }
+ mutex_exit(&lwb->lwb_vdev_lock);
+}
+
+static void
+zil_lwb_flush_defer(lwb_t *lwb, lwb_t *nlwb)
+{
+ avl_tree_t *src = &lwb->lwb_vdev_tree;
+ avl_tree_t *dst = &nlwb->lwb_vdev_tree;
+ void *cookie = NULL;
+ zil_vdev_node_t *zv;
+
+ ASSERT3S(lwb->lwb_state, ==, LWB_STATE_WRITE_DONE);
+ ASSERT3S(nlwb->lwb_state, !=, LWB_STATE_WRITE_DONE);
+ ASSERT3S(nlwb->lwb_state, !=, LWB_STATE_FLUSH_DONE);
+
+ /*
+ * While 'lwb' is at a point in its lifetime where lwb_vdev_tree does
+ * not need the protection of lwb_vdev_lock (it will only be modified
+ * while holding zilog->zl_lock) as its writes and those of its
+ * children have all completed. The younger 'nlwb' may be waiting on
+ * future writes to additional vdevs.
+ */
+ mutex_enter(&nlwb->lwb_vdev_lock);
+ /*
+ * Tear down the 'lwb' vdev tree, ensuring that entries which do not
+ * exist in 'nlwb' are moved to it, freeing any would-be duplicates.
+ */
+ while ((zv = avl_destroy_nodes(src, &cookie)) != NULL) {
+ avl_index_t where;
+
+ if (avl_find(dst, zv, &where) == NULL) {
+ avl_insert(dst, zv, where);
+ } else {
+ kmem_free(zv, sizeof (*zv));
+ }
+ }
+ mutex_exit(&nlwb->lwb_vdev_lock);
+}
+
+void
+zil_lwb_add_txg(lwb_t *lwb, uint64_t txg)
+{
+ lwb->lwb_max_txg = MAX(lwb->lwb_max_txg, txg);
+}
+
+/*
+ * This function is called after all vdevs associated with a given lwb
+ * write have completed their DKIOCFLUSHWRITECACHE command; or as soon
+ * as the lwb write completes, if "zil_nocacheflush" is set. Further,
+ * all "previous" lwb's will have completed before this function is
+ * called; i.e. this function is called for all previous lwbs before
+ * it's called for "this" lwb (enforced via the zio dependencies
+ * configured in zil_lwb_set_zio_dependency()).
+ *
+ * The intention is for this function to be called as soon as the
+ * contents of an lwb are considered "stable" on disk, and will survive
+ * any sudden loss of power. At this point, any threads waiting for the
+ * lwb to reach this state are signalled, and the "waiter" structures
+ * are marked "done".
+ */
+static void
+zil_lwb_flush_vdevs_done(zio_t *zio)
+{
+ lwb_t *lwb = zio->io_private;
+ zilog_t *zilog = lwb->lwb_zilog;
+ dmu_tx_t *tx = lwb->lwb_tx;
+ zil_commit_waiter_t *zcw;
+ itx_t *itx;
+
+ spa_config_exit(zilog->zl_spa, SCL_STATE, lwb);
+
+ zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
+
+ mutex_enter(&zilog->zl_lock);
+
+ /*
+ * Ensure the lwb buffer pointer is cleared before releasing the
+ * txg. If we have had an allocation failure and the txg is
+ * waiting to sync then we want zil_sync() to remove the lwb so
+ * that it's not picked up as the next new one in
+ * zil_process_commit_list(). zil_sync() will only remove the
+ * lwb if lwb_buf is null.
+ */
+ lwb->lwb_buf = NULL;
+ lwb->lwb_tx = NULL;
+
+ ASSERT3U(lwb->lwb_issued_timestamp, >, 0);
+ zilog->zl_last_lwb_latency = gethrtime() - lwb->lwb_issued_timestamp;
+
+ lwb->lwb_root_zio = NULL;
+
+ ASSERT3S(lwb->lwb_state, ==, LWB_STATE_WRITE_DONE);
+ lwb->lwb_state = LWB_STATE_FLUSH_DONE;
+
+ if (zilog->zl_last_lwb_opened == lwb) {
+ /*
+ * Remember the highest committed log sequence number
+ * for ztest. We only update this value when all the log
+ * writes succeeded, because ztest wants to ASSERT that
+ * it got the whole log chain.
+ */
+ zilog->zl_commit_lr_seq = zilog->zl_lr_seq;
+ }
+
+ while ((itx = list_head(&lwb->lwb_itxs)) != NULL) {
+ list_remove(&lwb->lwb_itxs, itx);
+ zil_itx_destroy(itx);
+ }
+
+ while ((zcw = list_head(&lwb->lwb_waiters)) != NULL) {
+ mutex_enter(&zcw->zcw_lock);
+
+ ASSERT(list_link_active(&zcw->zcw_node));
+ list_remove(&lwb->lwb_waiters, zcw);
+
+ ASSERT3P(zcw->zcw_lwb, ==, lwb);
+ zcw->zcw_lwb = NULL;
+
+ zcw->zcw_zio_error = zio->io_error;
+
+ ASSERT3B(zcw->zcw_done, ==, B_FALSE);
+ zcw->zcw_done = B_TRUE;
+ cv_broadcast(&zcw->zcw_cv);
+
+ mutex_exit(&zcw->zcw_lock);
+ }
+
+ mutex_exit(&zilog->zl_lock);
+
+ /*
+ * Now that we've written this log block, we have a stable pointer
+ * to the next block in the chain, so it's OK to let the txg in
+ * which we allocated the next block sync.
+ */
+ dmu_tx_commit(tx);
+}
+
+/*
+ * This is called when an lwb's write zio completes. The callback's
+ * purpose is to issue the DKIOCFLUSHWRITECACHE commands for the vdevs
+ * in the lwb's lwb_vdev_tree. The tree will contain the vdevs involved
+ * in writing out this specific lwb's data, and in the case that cache
+ * flushes have been deferred, vdevs involved in writing the data for
+ * previous lwbs. The writes corresponding to all the vdevs in the
+ * lwb_vdev_tree will have completed by the time this is called, due to
+ * the zio dependencies configured in zil_lwb_set_zio_dependency(),
+ * which takes deferred flushes into account. The lwb will be "done"
+ * once zil_lwb_flush_vdevs_done() is called, which occurs in the zio
+ * completion callback for the lwb's root zio.
+ */
+static void
+zil_lwb_write_done(zio_t *zio)
+{
+ lwb_t *lwb = zio->io_private;
+ spa_t *spa = zio->io_spa;
+ zilog_t *zilog = lwb->lwb_zilog;
+ avl_tree_t *t = &lwb->lwb_vdev_tree;
+ void *cookie = NULL;
+ zil_vdev_node_t *zv;
+ lwb_t *nlwb;
+
+ ASSERT3S(spa_config_held(spa, SCL_STATE, RW_READER), !=, 0);
+
+ ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF);
+ ASSERT(BP_GET_TYPE(zio->io_bp) == DMU_OT_INTENT_LOG);
+ ASSERT(BP_GET_LEVEL(zio->io_bp) == 0);
+ ASSERT(BP_GET_BYTEORDER(zio->io_bp) == ZFS_HOST_BYTEORDER);
+ ASSERT(!BP_IS_GANG(zio->io_bp));
+ ASSERT(!BP_IS_HOLE(zio->io_bp));
+ ASSERT(BP_GET_FILL(zio->io_bp) == 0);
+
+ abd_free(zio->io_abd);
+
+ mutex_enter(&zilog->zl_lock);
+ ASSERT3S(lwb->lwb_state, ==, LWB_STATE_ISSUED);
+ lwb->lwb_state = LWB_STATE_WRITE_DONE;
+ lwb->lwb_write_zio = NULL;
+ lwb->lwb_fastwrite = FALSE;
+ nlwb = list_next(&zilog->zl_lwb_list, lwb);
+ mutex_exit(&zilog->zl_lock);
+
+ if (avl_numnodes(t) == 0)
+ return;
+
+ /*
+ * If there was an IO error, we're not going to call zio_flush()
+ * on these vdevs, so we simply empty the tree and free the
+ * nodes. We avoid calling zio_flush() since there isn't any
+ * good reason for doing so, after the lwb block failed to be
+ * written out.
+ */
+ if (zio->io_error != 0) {
+ while ((zv = avl_destroy_nodes(t, &cookie)) != NULL)
+ kmem_free(zv, sizeof (*zv));
+ return;
+ }
+
+ /*
+ * If this lwb does not have any threads waiting for it to
+ * complete, we want to defer issuing the DKIOCFLUSHWRITECACHE
+ * command to the vdevs written to by "this" lwb, and instead
+ * rely on the "next" lwb to handle the DKIOCFLUSHWRITECACHE
+ * command for those vdevs. Thus, we merge the vdev tree of
+ * "this" lwb with the vdev tree of the "next" lwb in the list,
+ * and assume the "next" lwb will handle flushing the vdevs (or
+	 * deferring the flush(es) again).
+ *
+ * This is a useful performance optimization, especially for
+ * workloads with lots of async write activity and few sync
+ * write and/or fsync activity, as it has the potential to
+ * coalesce multiple flush commands to a vdev into one.
+ */
+ if (list_head(&lwb->lwb_waiters) == NULL && nlwb != NULL) {
+ zil_lwb_flush_defer(lwb, nlwb);
+ ASSERT(avl_is_empty(&lwb->lwb_vdev_tree));
+ return;
+ }
+
+ while ((zv = avl_destroy_nodes(t, &cookie)) != NULL) {
+ vdev_t *vd = vdev_lookup_top(spa, zv->zv_vdev);
+ if (vd != NULL)
+ zio_flush(lwb->lwb_root_zio, vd);
+ kmem_free(zv, sizeof (*zv));
+ }
+}
+
+static void
+zil_lwb_set_zio_dependency(zilog_t *zilog, lwb_t *lwb)
+{
+ lwb_t *last_lwb_opened = zilog->zl_last_lwb_opened;
+
+ ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
+ ASSERT(MUTEX_HELD(&zilog->zl_lock));
+
+ /*
+ * The zilog's "zl_last_lwb_opened" field is used to build the
+ * lwb/zio dependency chain, which is used to preserve the
+ * ordering of lwb completions that is required by the semantics
+ * of the ZIL. Each new lwb zio becomes a parent of the
+ * "previous" lwb zio, such that the new lwb's zio cannot
+ * complete until the "previous" lwb's zio completes.
+ *
+ * This is required by the semantics of zil_commit(); the commit
+ * waiters attached to the lwbs will be woken in the lwb zio's
+ * completion callback, so this zio dependency graph ensures the
+ * waiters are woken in the correct order (the same order the
+ * lwbs were created).
+ */
+ if (last_lwb_opened != NULL &&
+ last_lwb_opened->lwb_state != LWB_STATE_FLUSH_DONE) {
+ ASSERT(last_lwb_opened->lwb_state == LWB_STATE_OPENED ||
+ last_lwb_opened->lwb_state == LWB_STATE_ISSUED ||
+ last_lwb_opened->lwb_state == LWB_STATE_WRITE_DONE);
+
+ ASSERT3P(last_lwb_opened->lwb_root_zio, !=, NULL);
+ zio_add_child(lwb->lwb_root_zio,
+ last_lwb_opened->lwb_root_zio);
+
+ /*
+ * If the previous lwb's write hasn't already completed,
+ * we also want to order the completion of the lwb write
+ * zios (above, we only order the completion of the lwb
+ * root zios). This is required because of how we can
+ * defer the DKIOCFLUSHWRITECACHE commands for each lwb.
+ *
+ * When the DKIOCFLUSHWRITECACHE commands are deferred,
+ * the previous lwb will rely on this lwb to flush the
+ * vdevs written to by that previous lwb. Thus, we need
+ * to ensure this lwb doesn't issue the flush until
+ * after the previous lwb's write completes. We ensure
+ * this ordering by setting the zio parent/child
+ * relationship here.
+ *
+ * Without this relationship on the lwb's write zio,
+ * it's possible for this lwb's write to complete prior
+ * to the previous lwb's write completing; and thus, the
+ * vdevs for the previous lwb would be flushed prior to
+ * that lwb's data being written to those vdevs (the
+ * vdevs are flushed in the lwb write zio's completion
+ * handler, zil_lwb_write_done()).
+ */
+ if (last_lwb_opened->lwb_state != LWB_STATE_WRITE_DONE) {
+ ASSERT(last_lwb_opened->lwb_state == LWB_STATE_OPENED ||
+ last_lwb_opened->lwb_state == LWB_STATE_ISSUED);
+
+ ASSERT3P(last_lwb_opened->lwb_write_zio, !=, NULL);
+ zio_add_child(lwb->lwb_write_zio,
+ last_lwb_opened->lwb_write_zio);
+ }
+ }
+}
+
+
+/*
+ * This function's purpose is to "open" an lwb such that it is ready to
+ * accept new itxs being committed to it. To do this, the lwb's zio
+ * structures are created, and linked to the lwb. This function is
+ * idempotent; if the passed in lwb has already been opened, this
+ * function is essentially a no-op.
+ */
+static void
+zil_lwb_write_open(zilog_t *zilog, lwb_t *lwb)
+{
+ zbookmark_phys_t zb;
+ zio_priority_t prio;
+
+ ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
+ ASSERT3P(lwb, !=, NULL);
+ EQUIV(lwb->lwb_root_zio == NULL, lwb->lwb_state == LWB_STATE_CLOSED);
+ EQUIV(lwb->lwb_root_zio != NULL, lwb->lwb_state == LWB_STATE_OPENED);
+
+ SET_BOOKMARK(&zb, lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_OBJSET],
+ ZB_ZIL_OBJECT, ZB_ZIL_LEVEL,
+ lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_SEQ]);
+
+ /* Lock so zil_sync() doesn't fastwrite_unmark after zio is created */
+ mutex_enter(&zilog->zl_lock);
+ if (lwb->lwb_root_zio == NULL) {
+ abd_t *lwb_abd = abd_get_from_buf(lwb->lwb_buf,
+ BP_GET_LSIZE(&lwb->lwb_blk));
+
+ if (!lwb->lwb_fastwrite) {
+ metaslab_fastwrite_mark(zilog->zl_spa, &lwb->lwb_blk);
+ lwb->lwb_fastwrite = 1;
+ }
+
+ if (!lwb->lwb_slog || zilog->zl_cur_used <= zil_slog_bulk)
+ prio = ZIO_PRIORITY_SYNC_WRITE;
+ else
+ prio = ZIO_PRIORITY_ASYNC_WRITE;
+
+ lwb->lwb_root_zio = zio_root(zilog->zl_spa,
+ zil_lwb_flush_vdevs_done, lwb, ZIO_FLAG_CANFAIL);
+ ASSERT3P(lwb->lwb_root_zio, !=, NULL);
+
+ lwb->lwb_write_zio = zio_rewrite(lwb->lwb_root_zio,
+ zilog->zl_spa, 0, &lwb->lwb_blk, lwb_abd,
+ BP_GET_LSIZE(&lwb->lwb_blk), zil_lwb_write_done, lwb,
+ prio, ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE |
+ ZIO_FLAG_FASTWRITE, &zb);
+ ASSERT3P(lwb->lwb_write_zio, !=, NULL);
+
+ lwb->lwb_state = LWB_STATE_OPENED;
+
+ zil_lwb_set_zio_dependency(zilog, lwb);
+ zilog->zl_last_lwb_opened = lwb;
+ }
+ mutex_exit(&zilog->zl_lock);
+
+ ASSERT3P(lwb->lwb_root_zio, !=, NULL);
+ ASSERT3P(lwb->lwb_write_zio, !=, NULL);
+ ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED);
+}
+
+/*
+ * Define a limited set of intent log block sizes.
+ *
+ * These must be a multiple of 4KB. Note that only the amount used (again
+ * aligned to 4KB) actually gets written. However, we can't always just
+ * allocate SPA_OLD_MAXBLOCKSIZE, as the slog space could be exhausted.
+ */
+struct {
+ uint64_t limit;
+ uint64_t blksz;
+} zil_block_buckets[] = {
+ { 4096, 4096 }, /* non TX_WRITE */
+ { 8192 + 4096, 8192 + 4096 }, /* database */
+ { 32768 + 4096, 32768 + 4096 }, /* NFS writes */
+ { 65536 + 4096, 65536 + 4096 }, /* 64KB writes */
+ { 131072, 131072 }, /* < 128KB writes */
+	{ 131072 + 4096, 65536 + 4096 },	/* 128KB writes */
+	{ UINT64_MAX, SPA_OLD_MAXBLOCKSIZE },	/* > 128KB writes */
+};
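+
+/*
+ * Illustrative example (not an exhaustive description): if zl_cur_used is
+ * roughly 20KB, zil_lwb_write_issue() computes zil_blksz as that amount
+ * plus sizeof (zil_chain_t) and walks the table until the limit is no
+ * longer exceeded, landing on the 32768 + 4096 bucket; the next log block
+ * is then allocated at 36KB, subject to zl_max_block_size and the recent
+ * sizes remembered in zl_prev_blks.
+ */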
+
+/*
+ * Maximum block size used by the ZIL. This is picked up when the ZIL is
+ * initialized. Otherwise this should not be used directly; see
+ * zl_max_block_size instead.
+ */
+int zil_maxblocksize = SPA_OLD_MAXBLOCKSIZE;
+
+/*
+ * Start a log block write and advance to the next log block.
+ * Calls are serialized.
+ */
+static lwb_t *
+zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb)
+{
+ lwb_t *nlwb = NULL;
+ zil_chain_t *zilc;
+ spa_t *spa = zilog->zl_spa;
+ blkptr_t *bp;
+ dmu_tx_t *tx;
+ uint64_t txg;
+ uint64_t zil_blksz, wsz;
+ int i, error;
+ boolean_t slog;
+
+ ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
+ ASSERT3P(lwb->lwb_root_zio, !=, NULL);
+ ASSERT3P(lwb->lwb_write_zio, !=, NULL);
+ ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED);
+
+ if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) {
+ zilc = (zil_chain_t *)lwb->lwb_buf;
+ bp = &zilc->zc_next_blk;
+ } else {
+ zilc = (zil_chain_t *)(lwb->lwb_buf + lwb->lwb_sz);
+ bp = &zilc->zc_next_blk;
+ }
+
+ ASSERT(lwb->lwb_nused <= lwb->lwb_sz);
+
+ /*
+ * Allocate the next block and save its address in this block
+ * before writing it in order to establish the log chain.
+ * Note that if the allocation of nlwb synced before we wrote
+ * the block that points at it (lwb), we'd leak it if we crashed.
+ * Therefore, we don't do dmu_tx_commit() until zil_lwb_write_done().
+ * We dirty the dataset to ensure that zil_sync() will be called
+ * to clean up in the event of allocation failure or I/O failure.
+ */
+
+ tx = dmu_tx_create(zilog->zl_os);
+
+ /*
+ * Since we are not going to create any new dirty data, and we
+ * can even help with clearing the existing dirty data, we
+ * should not be subject to the dirty data based delays. We
+ * use TXG_NOTHROTTLE to bypass the delay mechanism.
+ */
+ VERIFY0(dmu_tx_assign(tx, TXG_WAIT | TXG_NOTHROTTLE));
+
+ dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
+ txg = dmu_tx_get_txg(tx);
+
+ lwb->lwb_tx = tx;
+
+ /*
+ * Log blocks are pre-allocated. Here we select the size of the next
+ * block, based on size used in the last block.
+ * - first find the smallest bucket that will fit the block from a
+ * limited set of block sizes. This is because it's faster to write
+ * blocks allocated from the same metaslab as they are adjacent or
+ * close.
+ * - next find the maximum from the new suggested size and an array of
+ * previous sizes. This lessens a picket fence effect of wrongly
+ * guessing the size if we have a stream of say 2k, 64k, 2k, 64k
+ * requests.
+ *
+ * Note we only write what is used, but we can't just allocate
+ * the maximum block size because we can exhaust the available
+ * pool log space.
+ */
+ zil_blksz = zilog->zl_cur_used + sizeof (zil_chain_t);
+ for (i = 0; zil_blksz > zil_block_buckets[i].limit; i++)
+ continue;
+ zil_blksz = MIN(zil_block_buckets[i].blksz, zilog->zl_max_block_size);
+ zilog->zl_prev_blks[zilog->zl_prev_rotor] = zil_blksz;
+ for (i = 0; i < ZIL_PREV_BLKS; i++)
+ zil_blksz = MAX(zil_blksz, zilog->zl_prev_blks[i]);
+ zilog->zl_prev_rotor = (zilog->zl_prev_rotor + 1) & (ZIL_PREV_BLKS - 1);
+
+ BP_ZERO(bp);
+ error = zio_alloc_zil(spa, zilog->zl_os, txg, bp, zil_blksz, &slog);
+ if (slog) {
+ ZIL_STAT_BUMP(zil_itx_metaslab_slog_count);
+ ZIL_STAT_INCR(zil_itx_metaslab_slog_bytes, lwb->lwb_nused);
+ } else {
+ ZIL_STAT_BUMP(zil_itx_metaslab_normal_count);
+ ZIL_STAT_INCR(zil_itx_metaslab_normal_bytes, lwb->lwb_nused);
+ }
+ if (error == 0) {
+ ASSERT3U(bp->blk_birth, ==, txg);
+ bp->blk_cksum = lwb->lwb_blk.blk_cksum;
+ bp->blk_cksum.zc_word[ZIL_ZC_SEQ]++;
+
+ /*
+ * Allocate a new log write block (lwb).
+ */
+ nlwb = zil_alloc_lwb(zilog, bp, slog, txg, TRUE);
+ }
+
+ if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) {
+ /* For Slim ZIL only write what is used. */
+ wsz = P2ROUNDUP_TYPED(lwb->lwb_nused, ZIL_MIN_BLKSZ, uint64_t);
+ ASSERT3U(wsz, <=, lwb->lwb_sz);
+ zio_shrink(lwb->lwb_write_zio, wsz);
+
+ } else {
+ wsz = lwb->lwb_sz;
+ }
+
+ zilc->zc_pad = 0;
+ zilc->zc_nused = lwb->lwb_nused;
+ zilc->zc_eck.zec_cksum = lwb->lwb_blk.blk_cksum;
+
+ /*
+ * clear unused data for security
+ */
+ bzero(lwb->lwb_buf + lwb->lwb_nused, wsz - lwb->lwb_nused);
+
+ spa_config_enter(zilog->zl_spa, SCL_STATE, lwb, RW_READER);
+
+ zil_lwb_add_block(lwb, &lwb->lwb_blk);
+ lwb->lwb_issued_timestamp = gethrtime();
+ lwb->lwb_state = LWB_STATE_ISSUED;
+
+ zio_nowait(lwb->lwb_root_zio);
+ zio_nowait(lwb->lwb_write_zio);
+
+ /*
+ * If there was an allocation failure then nlwb will be null which
+ * forces a txg_wait_synced().
+ */
+ return (nlwb);
+}
+
+/*
+ * Maximum amount of write data that can be put into single log block.
+ */
+uint64_t
+zil_max_log_data(zilog_t *zilog)
+{
+ return (zilog->zl_max_block_size -
+ sizeof (zil_chain_t) - sizeof (lr_write_t));
+}
+
+/*
+ * Maximum amount of log space we agree to waste in order to reduce the
+ * number of WR_NEED_COPY chunks and thereby the zl_get_data() overhead (~12%).
+ */
+static inline uint64_t
+zil_max_waste_space(zilog_t *zilog)
+{
+ return (zil_max_log_data(zilog) / 8);
+}
+
+/*
+ * Maximum amount of write data for WR_COPIED. For correctness, consumers
+ * must fall back to WR_NEED_COPY if we can't fit the entire record into one
+ * maximum sized log block, because each WR_COPIED record must fit in a
+ * single log block. For space efficiency, we want to fit two records into a
+ * max-sized log block.
+ */
+uint64_t
+zil_max_copied_data(zilog_t *zilog)
+{
+ return ((zilog->zl_max_block_size - sizeof (zil_chain_t)) / 2 -
+ sizeof (lr_write_t));
+}
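+
+/*
+ * A rough sketch of how these limits relate (symbolic, not exact byte
+ * counts): with maximum block size B, zil_max_log_data() is
+ * B - sizeof (zil_chain_t) - sizeof (lr_write_t), the largest write payload
+ * a single log block can hold; zil_max_waste_space() is one eighth of that;
+ * and zil_max_copied_data() is sized so that two complete WR_COPIED records
+ * (header plus payload) fit in one maximum-sized block.
+ */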
+
+static lwb_t *
+zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb)
+{
+ lr_t *lrcb, *lrc;
+ lr_write_t *lrwb, *lrw;
+ char *lr_buf;
+ uint64_t dlen, dnow, lwb_sp, reclen, txg, max_log_data;
+
+ ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
+ ASSERT3P(lwb, !=, NULL);
+ ASSERT3P(lwb->lwb_buf, !=, NULL);
+
+ zil_lwb_write_open(zilog, lwb);
+
+ lrc = &itx->itx_lr;
+ lrw = (lr_write_t *)lrc;
+
+ /*
+ * A commit itx doesn't represent any on-disk state; instead
+	 * it's simply used as a placeholder on the commit list, and
+ * provides a mechanism for attaching a "commit waiter" onto the
+ * correct lwb (such that the waiter can be signalled upon
+ * completion of that lwb). Thus, we don't process this itx's
+ * log record if it's a commit itx (these itx's don't have log
+ * records), and instead link the itx's waiter onto the lwb's
+ * list of waiters.
+ *
+ * For more details, see the comment above zil_commit().
+ */
+ if (lrc->lrc_txtype == TX_COMMIT) {
+ mutex_enter(&zilog->zl_lock);
+ zil_commit_waiter_link_lwb(itx->itx_private, lwb);
+ itx->itx_private = NULL;
+ mutex_exit(&zilog->zl_lock);
+ return (lwb);
+ }
+
+ if (lrc->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY) {
+ dlen = P2ROUNDUP_TYPED(
+ lrw->lr_length, sizeof (uint64_t), uint64_t);
+ } else {
+ dlen = 0;
+ }
+ reclen = lrc->lrc_reclen;
+ zilog->zl_cur_used += (reclen + dlen);
+ txg = lrc->lrc_txg;
+
+ ASSERT3U(zilog->zl_cur_used, <, UINT64_MAX - (reclen + dlen));
+
+cont:
+ /*
+ * If this record won't fit in the current log block, start a new one.
+ * For WR_NEED_COPY optimize layout for minimal number of chunks.
+ */
+ lwb_sp = lwb->lwb_sz - lwb->lwb_nused;
+ max_log_data = zil_max_log_data(zilog);
+ if (reclen > lwb_sp || (reclen + dlen > lwb_sp &&
+ lwb_sp < zil_max_waste_space(zilog) &&
+ (dlen % max_log_data == 0 ||
+ lwb_sp < reclen + dlen % max_log_data))) {
+ lwb = zil_lwb_write_issue(zilog, lwb);
+ if (lwb == NULL)
+ return (NULL);
+ zil_lwb_write_open(zilog, lwb);
+ ASSERT(LWB_EMPTY(lwb));
+ lwb_sp = lwb->lwb_sz - lwb->lwb_nused;
+
+ /*
+ * There must be enough space in the new, empty log block to
+ * hold reclen. For WR_COPIED, we need to fit the whole
+ * record in one block, and reclen is the header size + the
+ * data size. For WR_NEED_COPY, we can create multiple
+ * records, splitting the data into multiple blocks, so we
+ * only need to fit one word of data per block; in this case
+ * reclen is just the header size (no data).
+ */
+ ASSERT3U(reclen + MIN(dlen, sizeof (uint64_t)), <=, lwb_sp);
+ }
+
+ dnow = MIN(dlen, lwb_sp - reclen);
+ lr_buf = lwb->lwb_buf + lwb->lwb_nused;
+ bcopy(lrc, lr_buf, reclen);
+ lrcb = (lr_t *)lr_buf; /* Like lrc, but inside lwb. */
+ lrwb = (lr_write_t *)lrcb; /* Like lrw, but inside lwb. */
+
+ ZIL_STAT_BUMP(zil_itx_count);
+
+ /*
+ * If it's a write, fetch the data or get its blkptr as appropriate.
+ */
+ if (lrc->lrc_txtype == TX_WRITE) {
+ if (txg > spa_freeze_txg(zilog->zl_spa))
+ txg_wait_synced(zilog->zl_dmu_pool, txg);
+ if (itx->itx_wr_state == WR_COPIED) {
+ ZIL_STAT_BUMP(zil_itx_copied_count);
+ ZIL_STAT_INCR(zil_itx_copied_bytes, lrw->lr_length);
+ } else {
+ char *dbuf;
+ int error;
+
+ if (itx->itx_wr_state == WR_NEED_COPY) {
+ dbuf = lr_buf + reclen;
+ lrcb->lrc_reclen += dnow;
+ if (lrwb->lr_length > dnow)
+ lrwb->lr_length = dnow;
+ lrw->lr_offset += dnow;
+ lrw->lr_length -= dnow;
+ ZIL_STAT_BUMP(zil_itx_needcopy_count);
+ ZIL_STAT_INCR(zil_itx_needcopy_bytes, dnow);
+ } else {
+ ASSERT3S(itx->itx_wr_state, ==, WR_INDIRECT);
+ dbuf = NULL;
+ ZIL_STAT_BUMP(zil_itx_indirect_count);
+ ZIL_STAT_INCR(zil_itx_indirect_bytes,
+ lrw->lr_length);
+ }
+
+ /*
+ * We pass in the "lwb_write_zio" rather than
+ * "lwb_root_zio" so that the "lwb_write_zio"
+ * becomes the parent of any zio's created by
+ * the "zl_get_data" callback. The vdevs are
+ * flushed after the "lwb_write_zio" completes,
+ * so we want to make sure that completion
+ * callback waits for these additional zio's,
+ * such that the vdevs used by those zio's will
+ * be included in the lwb's vdev tree, and those
+ * vdevs will be properly flushed. If we passed
+ * in "lwb_root_zio" here, then these additional
+ * vdevs may not be flushed; e.g. if these zio's
+ * completed after "lwb_write_zio" completed.
+ */
+ error = zilog->zl_get_data(itx->itx_private,
+ lrwb, dbuf, lwb, lwb->lwb_write_zio);
+
+ if (error == EIO) {
+ txg_wait_synced(zilog->zl_dmu_pool, txg);
+ return (lwb);
+ }
+ if (error != 0) {
+ ASSERT(error == ENOENT || error == EEXIST ||
+ error == EALREADY);
+ return (lwb);
+ }
+ }
+ }
+
+ /*
+ * We're actually making an entry, so update lrc_seq to be the
+ * log record sequence number. Note that this is generally not
+ * equal to the itx sequence number because not all transactions
+ * are synchronous, and sometimes spa_sync() gets there first.
+ */
+ lrcb->lrc_seq = ++zilog->zl_lr_seq;
+ lwb->lwb_nused += reclen + dnow;
+
+ zil_lwb_add_txg(lwb, txg);
+
+ ASSERT3U(lwb->lwb_nused, <=, lwb->lwb_sz);
+ ASSERT0(P2PHASE(lwb->lwb_nused, sizeof (uint64_t)));
+
+ dlen -= dnow;
+ if (dlen > 0) {
+ zilog->zl_cur_used += reclen;
+ goto cont;
+ }
+
+ return (lwb);
+}
+
+itx_t *
+zil_itx_create(uint64_t txtype, size_t lrsize)
+{
+ size_t itxsize;
+ itx_t *itx;
+
+ lrsize = P2ROUNDUP_TYPED(lrsize, sizeof (uint64_t), size_t);
+ itxsize = offsetof(itx_t, itx_lr) + lrsize;
+
+ itx = zio_data_buf_alloc(itxsize);
+ itx->itx_lr.lrc_txtype = txtype;
+ itx->itx_lr.lrc_reclen = lrsize;
+ itx->itx_lr.lrc_seq = 0; /* defensive */
+ itx->itx_sync = B_TRUE; /* default is synchronous */
+ itx->itx_callback = NULL;
+ itx->itx_callback_data = NULL;
+ itx->itx_size = itxsize;
+
+ return (itx);
+}
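+
+/*
+ * A minimal usage sketch (the caller shown is illustrative, modeled on the
+ * ZPL logging code rather than defined in this file): a consumer allocates
+ * an itx sized for its log record, fills in the type-specific fields that
+ * follow the common lr_t header, and assigns it to the zilog under the
+ * same transaction:
+ *
+ *	itx = zil_itx_create(txtype, sizeof (lr_create_t) + namesize);
+ *	lr = (lr_create_t *)&itx->itx_lr;
+ *	... fill in lr and the name that follows it ...
+ *	zil_itx_assign(zilog, itx, tx);
+ */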
+
+void
+zil_itx_destroy(itx_t *itx)
+{
+ IMPLY(itx->itx_lr.lrc_txtype == TX_COMMIT, itx->itx_callback == NULL);
+ IMPLY(itx->itx_callback != NULL, itx->itx_lr.lrc_txtype != TX_COMMIT);
+
+ if (itx->itx_callback != NULL)
+ itx->itx_callback(itx->itx_callback_data);
+
+ zio_data_buf_free(itx, itx->itx_size);
+}
+
+/*
+ * Free up the sync and async itxs. The itxs_t has already been detached
+ * so no locks are needed.
+ */
+static void
+zil_itxg_clean(itxs_t *itxs)
+{
+ itx_t *itx;
+ list_t *list;
+ avl_tree_t *t;
+ void *cookie;
+ itx_async_node_t *ian;
+
+ list = &itxs->i_sync_list;
+ while ((itx = list_head(list)) != NULL) {
+ /*
+ * In the general case, commit itxs will not be found
+ * here, as they'll be committed to an lwb via
+ * zil_lwb_commit(), and free'd in that function. Having
+ * said that, it is still possible for commit itxs to be
+ * found here, due to the following race:
+ *
+ * - a thread calls zil_commit() which assigns the
+ * commit itx to a per-txg i_sync_list
+ * - zil_itxg_clean() is called (e.g. via spa_sync())
+ * while the waiter is still on the i_sync_list
+ *
+ * There's nothing to prevent syncing the txg while the
+ * waiter is on the i_sync_list. This normally doesn't
+ * happen because spa_sync() is slower than zil_commit(),
+ * but if zil_commit() calls txg_wait_synced() (e.g.
+ * because zil_create() or zil_commit_writer_stall() is
+ * called) we will hit this case.
+ */
+ if (itx->itx_lr.lrc_txtype == TX_COMMIT)
+ zil_commit_waiter_skip(itx->itx_private);
+
+ list_remove(list, itx);
+ zil_itx_destroy(itx);
+ }
+
+ cookie = NULL;
+ t = &itxs->i_async_tree;
+ while ((ian = avl_destroy_nodes(t, &cookie)) != NULL) {
+ list = &ian->ia_list;
+ while ((itx = list_head(list)) != NULL) {
+ list_remove(list, itx);
+ /* commit itxs should never be on the async lists. */
+ ASSERT3U(itx->itx_lr.lrc_txtype, !=, TX_COMMIT);
+ zil_itx_destroy(itx);
+ }
+ list_destroy(list);
+ kmem_free(ian, sizeof (itx_async_node_t));
+ }
+ avl_destroy(t);
+
+ kmem_free(itxs, sizeof (itxs_t));
+}
+
+static int
+zil_aitx_compare(const void *x1, const void *x2)
+{
+ const uint64_t o1 = ((itx_async_node_t *)x1)->ia_foid;
+ const uint64_t o2 = ((itx_async_node_t *)x2)->ia_foid;
+
+ return (TREE_CMP(o1, o2));
+}
+
+/*
+ * Remove all async itxs with the given oid.
+ */
+void
+zil_remove_async(zilog_t *zilog, uint64_t oid)
+{
+ uint64_t otxg, txg;
+ itx_async_node_t *ian;
+ avl_tree_t *t;
+ avl_index_t where;
+ list_t clean_list;
+ itx_t *itx;
+
+ ASSERT(oid != 0);
+ list_create(&clean_list, sizeof (itx_t), offsetof(itx_t, itx_node));
+
+ if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */
+ otxg = ZILTEST_TXG;
+ else
+ otxg = spa_last_synced_txg(zilog->zl_spa) + 1;
+
+ for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) {
+ itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK];
+
+ mutex_enter(&itxg->itxg_lock);
+ if (itxg->itxg_txg != txg) {
+ mutex_exit(&itxg->itxg_lock);
+ continue;
+ }
+
+ /*
+ * Locate the object node and append its list.
+ */
+ t = &itxg->itxg_itxs->i_async_tree;
+ ian = avl_find(t, &oid, &where);
+ if (ian != NULL)
+ list_move_tail(&clean_list, &ian->ia_list);
+ mutex_exit(&itxg->itxg_lock);
+ }
+ while ((itx = list_head(&clean_list)) != NULL) {
+ list_remove(&clean_list, itx);
+ /* commit itxs should never be on the async lists. */
+ ASSERT3U(itx->itx_lr.lrc_txtype, !=, TX_COMMIT);
+ zil_itx_destroy(itx);
+ }
+ list_destroy(&clean_list);
+}
+
+void
+zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx)
+{
+ uint64_t txg;
+ itxg_t *itxg;
+ itxs_t *itxs, *clean = NULL;
+
+ /*
+ * Ensure the data of a renamed file is committed before the rename.
+ */
+ if ((itx->itx_lr.lrc_txtype & ~TX_CI) == TX_RENAME)
+ zil_async_to_sync(zilog, itx->itx_oid);
+
+ if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX)
+ txg = ZILTEST_TXG;
+ else
+ txg = dmu_tx_get_txg(tx);
+
+ itxg = &zilog->zl_itxg[txg & TXG_MASK];
+ mutex_enter(&itxg->itxg_lock);
+ itxs = itxg->itxg_itxs;
+ if (itxg->itxg_txg != txg) {
+ if (itxs != NULL) {
+ /*
+ * The zil_clean callback hasn't got around to cleaning
+ * this itxg. Save the itxs for release below.
+ * This should be rare.
+ */
+ zfs_dbgmsg("zil_itx_assign: missed itx cleanup for "
+ "txg %llu", itxg->itxg_txg);
+ clean = itxg->itxg_itxs;
+ }
+ itxg->itxg_txg = txg;
+ itxs = itxg->itxg_itxs = kmem_zalloc(sizeof (itxs_t),
+ KM_SLEEP);
+
+ list_create(&itxs->i_sync_list, sizeof (itx_t),
+ offsetof(itx_t, itx_node));
+ avl_create(&itxs->i_async_tree, zil_aitx_compare,
+ sizeof (itx_async_node_t),
+ offsetof(itx_async_node_t, ia_node));
+ }
+ if (itx->itx_sync) {
+ list_insert_tail(&itxs->i_sync_list, itx);
+ } else {
+ avl_tree_t *t = &itxs->i_async_tree;
+ uint64_t foid =
+ LR_FOID_GET_OBJ(((lr_ooo_t *)&itx->itx_lr)->lr_foid);
+ itx_async_node_t *ian;
+ avl_index_t where;
+
+ ian = avl_find(t, &foid, &where);
+ if (ian == NULL) {
+ ian = kmem_alloc(sizeof (itx_async_node_t),
+ KM_SLEEP);
+ list_create(&ian->ia_list, sizeof (itx_t),
+ offsetof(itx_t, itx_node));
+ ian->ia_foid = foid;
+ avl_insert(t, ian, where);
+ }
+ list_insert_tail(&ian->ia_list, itx);
+ }
+
+ itx->itx_lr.lrc_txg = dmu_tx_get_txg(tx);
+
+ /*
+ * We don't want to dirty the ZIL using ZILTEST_TXG, because
+ * zil_clean() will never be called using ZILTEST_TXG. Thus, we
+ * need to be careful to always dirty the ZIL using the "real"
+ * TXG (not itxg_txg) even when the SPA is frozen.
+ */
+ zilog_dirty(zilog, dmu_tx_get_txg(tx));
+ mutex_exit(&itxg->itxg_lock);
+
+ /* Release the old itxs now we've dropped the lock */
+ if (clean != NULL)
+ zil_itxg_clean(clean);
+}
+
+/*
+ * If there are any in-memory intent log transactions which have now been
+ * synced then start up a taskq to free them. We should only do this after we
+ * have written out the uberblocks (i.e. txg has been committed) so that
+ * don't inadvertently clean out in-memory log records that would be required
+ * by zil_commit().
+ */
+void
+zil_clean(zilog_t *zilog, uint64_t synced_txg)
+{
+ itxg_t *itxg = &zilog->zl_itxg[synced_txg & TXG_MASK];
+ itxs_t *clean_me;
+
+ ASSERT3U(synced_txg, <, ZILTEST_TXG);
+
+ mutex_enter(&itxg->itxg_lock);
+ if (itxg->itxg_itxs == NULL || itxg->itxg_txg == ZILTEST_TXG) {
+ mutex_exit(&itxg->itxg_lock);
+ return;
+ }
+ ASSERT3U(itxg->itxg_txg, <=, synced_txg);
+ ASSERT3U(itxg->itxg_txg, !=, 0);
+ clean_me = itxg->itxg_itxs;
+ itxg->itxg_itxs = NULL;
+ itxg->itxg_txg = 0;
+ mutex_exit(&itxg->itxg_lock);
+ /*
+	 * Preferably start a task queue to free up the old itxs, but
+	 * if taskq_dispatch can't allocate resources to do that then
+	 * free them in-line. This should be rare. Note that using TQ_SLEEP
+	 * created a bad performance problem.
+ */
+ ASSERT3P(zilog->zl_dmu_pool, !=, NULL);
+ ASSERT3P(zilog->zl_dmu_pool->dp_zil_clean_taskq, !=, NULL);
+ taskqid_t id = taskq_dispatch(zilog->zl_dmu_pool->dp_zil_clean_taskq,
+ (void (*)(void *))zil_itxg_clean, clean_me, TQ_NOSLEEP);
+ if (id == TASKQID_INVALID)
+ zil_itxg_clean(clean_me);
+}
+
+/*
+ * This function will traverse the queue of itxs that need to be
+ * committed, and move them onto the ZIL's zl_itx_commit_list.
+ */
+static void
+zil_get_commit_list(zilog_t *zilog)
+{
+ uint64_t otxg, txg;
+ list_t *commit_list = &zilog->zl_itx_commit_list;
+
+ ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
+
+ if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */
+ otxg = ZILTEST_TXG;
+ else
+ otxg = spa_last_synced_txg(zilog->zl_spa) + 1;
+
+ /*
+ * This is inherently racy, since there is nothing to prevent
+ * the last synced txg from changing. That's okay since we'll
+ * only commit things in the future.
+ */
+ for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) {
+ itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK];
+
+ mutex_enter(&itxg->itxg_lock);
+ if (itxg->itxg_txg != txg) {
+ mutex_exit(&itxg->itxg_lock);
+ continue;
+ }
+
+ /*
+ * If we're adding itx records to the zl_itx_commit_list,
+ * then the zil better be dirty in this "txg". We can assert
+ * that here since we're holding the itxg_lock which will
+ * prevent spa_sync from cleaning it. Once we add the itxs
+ * to the zl_itx_commit_list we must commit it to disk even
+ * if it's unnecessary (i.e. the txg was synced).
+ */
+ ASSERT(zilog_is_dirty_in_txg(zilog, txg) ||
+ spa_freeze_txg(zilog->zl_spa) != UINT64_MAX);
+ list_move_tail(commit_list, &itxg->itxg_itxs->i_sync_list);
+
+ mutex_exit(&itxg->itxg_lock);
+ }
+}
+
+/*
+ * Move the async itxs for a specified object to commit into sync lists.
+ */
+void
+zil_async_to_sync(zilog_t *zilog, uint64_t foid)
+{
+ uint64_t otxg, txg;
+ itx_async_node_t *ian;
+ avl_tree_t *t;
+ avl_index_t where;
+
+ if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */
+ otxg = ZILTEST_TXG;
+ else
+ otxg = spa_last_synced_txg(zilog->zl_spa) + 1;
+
+ /*
+ * This is inherently racy, since there is nothing to prevent
+ * the last synced txg from changing.
+ */
+ for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) {
+ itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK];
+
+ mutex_enter(&itxg->itxg_lock);
+ if (itxg->itxg_txg != txg) {
+ mutex_exit(&itxg->itxg_lock);
+ continue;
+ }
+
+ /*
+ * If a foid is specified then find that node and append its
+ * list. Otherwise walk the tree appending all the lists
+ * to the sync list. We add to the end rather than the
+ * beginning to ensure the create has happened.
+ */
+ t = &itxg->itxg_itxs->i_async_tree;
+ if (foid != 0) {
+ ian = avl_find(t, &foid, &where);
+ if (ian != NULL) {
+ list_move_tail(&itxg->itxg_itxs->i_sync_list,
+ &ian->ia_list);
+ }
+ } else {
+ void *cookie = NULL;
+
+ while ((ian = avl_destroy_nodes(t, &cookie)) != NULL) {
+ list_move_tail(&itxg->itxg_itxs->i_sync_list,
+ &ian->ia_list);
+ list_destroy(&ian->ia_list);
+ kmem_free(ian, sizeof (itx_async_node_t));
+ }
+ }
+ mutex_exit(&itxg->itxg_lock);
+ }
+}
+
+/*
+ * This function will prune commit itxs that are at the head of the
+ * commit list (it won't prune past the first non-commit itx), and
+ * either: a) attach them to the last lwb that's still pending
+ * completion, or b) skip them altogether.
+ *
+ * This is used as a performance optimization to prevent commit itxs
+ * from generating new lwbs when it's unnecessary to do so.
+ */
+static void
+zil_prune_commit_list(zilog_t *zilog)
+{
+ itx_t *itx;
+
+ ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
+
+ while ((itx = list_head(&zilog->zl_itx_commit_list)) != NULL) {
+ lr_t *lrc = &itx->itx_lr;
+ if (lrc->lrc_txtype != TX_COMMIT)
+ break;
+
+ mutex_enter(&zilog->zl_lock);
+
+ lwb_t *last_lwb = zilog->zl_last_lwb_opened;
+ if (last_lwb == NULL ||
+ last_lwb->lwb_state == LWB_STATE_FLUSH_DONE) {
+ /*
+ * All of the itxs this waiter was waiting on
+ * must have already completed (or there were
+ * never any itx's for it to wait on), so it's
+ * safe to skip this waiter and mark it done.
+ */
+ zil_commit_waiter_skip(itx->itx_private);
+ } else {
+ zil_commit_waiter_link_lwb(itx->itx_private, last_lwb);
+ itx->itx_private = NULL;
+ }
+
+ mutex_exit(&zilog->zl_lock);
+
+ list_remove(&zilog->zl_itx_commit_list, itx);
+ zil_itx_destroy(itx);
+ }
+
+ IMPLY(itx != NULL, itx->itx_lr.lrc_txtype != TX_COMMIT);
+}
+
+static void
+zil_commit_writer_stall(zilog_t *zilog)
+{
+ /*
+ * When zio_alloc_zil() fails to allocate the next lwb block on
+ * disk, we must call txg_wait_synced() to ensure all of the
+ * lwbs in the zilog's zl_lwb_list are synced and then freed (in
+ * zil_sync()), such that any subsequent ZIL writer (i.e. a call
+ * to zil_process_commit_list()) will have to call zil_create(),
+ * and start a new ZIL chain.
+ *
+ * Since zio_alloc_zil() failed, the lwb that was previously
+ * issued does not have a pointer to the "next" lwb on disk.
+ * Thus, if another ZIL writer thread were to allocate the "next"
+ * on-disk lwb, that block could be leaked in the event of a
+ * crash (because the previous lwb on-disk would not point to
+ * it).
+ *
+ * We must hold the zilog's zl_issuer_lock while we do this, to
+ * ensure no new threads enter zil_process_commit_list() until
+ * all lwb's in the zl_lwb_list have been synced and freed
+ * (which is achieved via the txg_wait_synced() call).
+ */
+ ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
+ txg_wait_synced(zilog->zl_dmu_pool, 0);
+ ASSERT3P(list_tail(&zilog->zl_lwb_list), ==, NULL);
+}
+
+/*
+ * This function will traverse the commit list, creating new lwbs as
+ * needed, and committing the itxs from the commit list to these newly
+ * created lwbs. Additionally, as a new lwb is created, the previous
+ * lwb will be issued to the zio layer to be written to disk.
+ */
+static void
+zil_process_commit_list(zilog_t *zilog)
+{
+ spa_t *spa = zilog->zl_spa;
+ list_t nolwb_itxs;
+ list_t nolwb_waiters;
+ lwb_t *lwb;
+ itx_t *itx;
+
+ ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
+
+ /*
+ * Return if there's nothing to commit before we dirty the fs by
+ * calling zil_create().
+ */
+ if (list_head(&zilog->zl_itx_commit_list) == NULL)
+ return;
+
+ list_create(&nolwb_itxs, sizeof (itx_t), offsetof(itx_t, itx_node));
+ list_create(&nolwb_waiters, sizeof (zil_commit_waiter_t),
+ offsetof(zil_commit_waiter_t, zcw_node));
+
+ lwb = list_tail(&zilog->zl_lwb_list);
+ if (lwb == NULL) {
+ lwb = zil_create(zilog);
+ } else {
+ ASSERT3S(lwb->lwb_state, !=, LWB_STATE_ISSUED);
+ ASSERT3S(lwb->lwb_state, !=, LWB_STATE_WRITE_DONE);
+ ASSERT3S(lwb->lwb_state, !=, LWB_STATE_FLUSH_DONE);
+ }
+
+ while ((itx = list_head(&zilog->zl_itx_commit_list)) != NULL) {
+ lr_t *lrc = &itx->itx_lr;
+ uint64_t txg = lrc->lrc_txg;
+
+ ASSERT3U(txg, !=, 0);
+
+ if (lrc->lrc_txtype == TX_COMMIT) {
+ DTRACE_PROBE2(zil__process__commit__itx,
+ zilog_t *, zilog, itx_t *, itx);
+ } else {
+ DTRACE_PROBE2(zil__process__normal__itx,
+ zilog_t *, zilog, itx_t *, itx);
+ }
+
+ list_remove(&zilog->zl_itx_commit_list, itx);
+
+ boolean_t synced = txg <= spa_last_synced_txg(spa);
+ boolean_t frozen = txg > spa_freeze_txg(spa);
+
+ /*
+ * If the txg of this itx has already been synced out, then
+ * we don't need to commit this itx to an lwb. This is
+ * because the data of this itx will have already been
+ * written to the main pool. This is inherently racy, and
+ * it's still ok to commit an itx whose txg has already
+ * been synced; this will result in a write that's
+ * unnecessary, but will do no harm.
+ *
+ * With that said, we always want to commit TX_COMMIT itxs
+ * to an lwb, regardless of whether or not that itx's txg
+ * has been synced out. We do this to ensure any OPENED lwb
+ * will always have at least one zil_commit_waiter_t linked
+ * to the lwb.
+ *
+ * As a counter-example, if we skipped TX_COMMIT itx's
+ * whose txg had already been synced, the following
+ * situation could occur if we happened to be racing with
+ * spa_sync:
+ *
+ * 1. We commit a non-TX_COMMIT itx to an lwb, where the
+ * itx's txg is 10 and the last synced txg is 9.
+ * 2. spa_sync finishes syncing out txg 10.
+ * 3. We move to the next itx in the list, it's a TX_COMMIT
+ * whose txg is 10, so we skip it rather than committing
+ * it to the lwb used in (1).
+ *
+ * If the itx that is skipped in (3) is the last TX_COMMIT
+ * itx in the commit list, then it's possible for the lwb
+ * used in (1) to remain in the OPENED state indefinitely.
+ *
+ * To prevent the above scenario from occurring, and to ensure
+ * that once an lwb is OPENED it will transition to ISSUED and
+ * eventually DONE, we always commit TX_COMMIT itx's to an lwb
+ * here, even if that itx's txg has already been synced.
+ *
+ * Finally, if the pool is frozen, we _always_ commit the
+ * itx. The point of freezing the pool is to prevent data
+ * from being written to the main pool via spa_sync, and
+ * instead rely solely on the ZIL to persistently store the
+ * data; i.e. when the pool is frozen, the last synced txg
+ * value can't be trusted.
+ */
+ if (frozen || !synced || lrc->lrc_txtype == TX_COMMIT) {
+ if (lwb != NULL) {
+ lwb = zil_lwb_commit(zilog, itx, lwb);
+
+ if (lwb == NULL)
+ list_insert_tail(&nolwb_itxs, itx);
+ else
+ list_insert_tail(&lwb->lwb_itxs, itx);
+ } else {
+ if (lrc->lrc_txtype == TX_COMMIT) {
+ zil_commit_waiter_link_nolwb(
+ itx->itx_private, &nolwb_waiters);
+ }
+
+ list_insert_tail(&nolwb_itxs, itx);
+ }
+ } else {
+ ASSERT3S(lrc->lrc_txtype, !=, TX_COMMIT);
+ zil_itx_destroy(itx);
+ }
+ }
+
+ if (lwb == NULL) {
+ /*
+ * This indicates zio_alloc_zil() failed to allocate the
+ * "next" lwb on-disk. When this happens, we must stall
+ * the ZIL write pipeline; see the comment within
+ * zil_commit_writer_stall() for more details.
+ */
+ zil_commit_writer_stall(zilog);
+
+ /*
+ * Additionally, we have to signal and mark the "nolwb"
+ * waiters as "done" here, since without an lwb, we
+ * can't do this via zil_lwb_flush_vdevs_done() like
+ * normal.
+ */
+ zil_commit_waiter_t *zcw;
+ while ((zcw = list_head(&nolwb_waiters)) != NULL) {
+ zil_commit_waiter_skip(zcw);
+ list_remove(&nolwb_waiters, zcw);
+ }
+
+ /*
+ * And finally, we have to destroy the itx's that
+ * couldn't be committed to an lwb; this will also call
+ * the itx's callback if one exists for the itx.
+ */
+ while ((itx = list_head(&nolwb_itxs)) != NULL) {
+ list_remove(&nolwb_itxs, itx);
+ zil_itx_destroy(itx);
+ }
+ } else {
+ ASSERT(list_is_empty(&nolwb_waiters));
+ ASSERT3P(lwb, !=, NULL);
+ ASSERT3S(lwb->lwb_state, !=, LWB_STATE_ISSUED);
+ ASSERT3S(lwb->lwb_state, !=, LWB_STATE_WRITE_DONE);
+ ASSERT3S(lwb->lwb_state, !=, LWB_STATE_FLUSH_DONE);
+
+ /*
+ * At this point, the ZIL block pointed at by the "lwb"
+ * variable is in one of the following states: "closed"
+ * or "open".
+ *
+ * If it's "closed", then no itxs have been committed to
+ * it, so there's no point in issuing its zio (i.e. it's
+ * "empty").
+ *
+ * If it's "open", then it contains one or more itxs that
+ * eventually need to be committed to stable storage. In
+ * this case we intentionally do not issue the lwb's zio
+ * to disk yet, and instead rely on one of the following
+ * two mechanisms for issuing the zio:
+ *
+ * 1. Ideally, there will be more ZIL activity occurring
+ * on the system, such that this function will be
+ * immediately called again (not necessarily by the same
+ * thread) and this lwb's zio will be issued via
+ * zil_lwb_commit(). This way, the lwb is guaranteed to
+ * be "full" when it is issued to disk, and we'll make
+ * use of the lwb's size the best we can.
+ *
+ * 2. If there isn't sufficient ZIL activity occurring on
+ * the system, such that this lwb's zio isn't issued via
+ * zil_lwb_commit(), zil_commit_waiter() will issue the
+ * lwb's zio. If this occurs, the lwb is not guaranteed
+ * to be "full" by the time its zio is issued, which means
+ * the size of the lwb was "too large" given the amount
+ * of ZIL activity occurring on the system at that time.
+ *
+ * We do this for a couple of reasons:
+ *
+ * 1. To try and reduce the number of IOPs needed to
+ * write the same number of itxs. If an lwb has space
+ * available in its buffer for more itxs, and more itxs
+ * will be committed relatively soon (relative to the
+ * latency of performing a write), then it's beneficial
+ * to wait for these "next" itxs. This way, more itxs
+ * can be committed to stable storage with fewer writes.
+ *
+ * 2. To try and use the largest lwb block size that the
+ * incoming rate of itxs can support. Again, this is to
+ * try and pack as many itxs into as few lwbs as
+ * possible, without significantly impacting the latency
+ * of each individual itx.
+ */
+ }
+}
+
+/*
+ * This function is responsible for ensuring the passed in commit waiter
+ * (and associated commit itx) is committed to an lwb. If the waiter is
+ * not already committed to an lwb, all itxs in the zilog's queue of
+ * itxs will be processed. The assumption is the passed in waiter's
+ * commit itx will found in the queue just like the other non-commit
+ * itxs, such that when the entire queue is processed, the waiter will
+ * have been committed to an lwb.
+ *
+ * The lwb associated with the passed in waiter is not guaranteed to
+ * have been issued by the time this function completes. If the lwb is
+ * not issued, we rely on future calls to zil_commit_writer() to issue
+ * the lwb, or the timeout mechanism found in zil_commit_waiter().
+ */
+static void
+zil_commit_writer(zilog_t *zilog, zil_commit_waiter_t *zcw)
+{
+ ASSERT(!MUTEX_HELD(&zilog->zl_lock));
+ ASSERT(spa_writeable(zilog->zl_spa));
+
+ mutex_enter(&zilog->zl_issuer_lock);
+
+ if (zcw->zcw_lwb != NULL || zcw->zcw_done) {
+ /*
+ * It's possible that, while we were waiting to acquire
+ * the "zl_issuer_lock", another thread committed this
+ * waiter to an lwb. If that occurs, we bail out early,
+ * without processing any of the zilog's queue of itxs.
+ *
+ * On certain workloads and system configurations, the
+ * "zl_issuer_lock" can become highly contended. In an
+ * attempt to reduce this contention, we immediately drop
+ * the lock if the waiter has already been processed.
+ *
+ * We've measured this optimization to reduce CPU spent
+ * contending on this lock by up to 5%, using a system
+ * with 32 CPUs, low latency storage (~50 usec writes),
+ * and 1024 threads performing sync writes.
+ */
+ goto out;
+ }
+
+ ZIL_STAT_BUMP(zil_commit_writer_count);
+
+ zil_get_commit_list(zilog);
+ zil_prune_commit_list(zilog);
+ zil_process_commit_list(zilog);
+
+out:
+ mutex_exit(&zilog->zl_issuer_lock);
+}
+
+static void
+zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw)
+{
+ ASSERT(!MUTEX_HELD(&zilog->zl_issuer_lock));
+ ASSERT(MUTEX_HELD(&zcw->zcw_lock));
+ ASSERT3B(zcw->zcw_done, ==, B_FALSE);
+
+ lwb_t *lwb = zcw->zcw_lwb;
+ ASSERT3P(lwb, !=, NULL);
+ ASSERT3S(lwb->lwb_state, !=, LWB_STATE_CLOSED);
+
+ /*
+ * If the lwb has already been issued by another thread, we can
+ * immediately return since there's no work to be done (the
+ * point of this function is to issue the lwb). Additionally, we
+ * do this prior to acquiring the zl_issuer_lock, to avoid
+ * acquiring it when it's not necessary to do so.
+ */
+ if (lwb->lwb_state == LWB_STATE_ISSUED ||
+ lwb->lwb_state == LWB_STATE_WRITE_DONE ||
+ lwb->lwb_state == LWB_STATE_FLUSH_DONE)
+ return;
+
+ /*
+ * In order to call zil_lwb_write_issue() we must hold the
+ * zilog's "zl_issuer_lock". We can't simply acquire that lock,
+ * since we're already holding the commit waiter's "zcw_lock",
+ * and those two locks are acquired in the opposite order
+ * elsewhere.
+ */
+ mutex_exit(&zcw->zcw_lock);
+ mutex_enter(&zilog->zl_issuer_lock);
+ mutex_enter(&zcw->zcw_lock);
+
+ /*
+ * Since we just dropped and re-acquired the commit waiter's
+ * lock, we have to re-check to see if the waiter was marked
+ * "done" during that process. If the waiter was marked "done",
+ * the "lwb" pointer is no longer valid (it can be free'd after
+ * the waiter is marked "done"), so without this check we could
+ * wind up with a use-after-free error below.
+ */
+ if (zcw->zcw_done)
+ goto out;
+
+ ASSERT3P(lwb, ==, zcw->zcw_lwb);
+
+ /*
+ * We've already checked this above, but since we hadn't acquired
+ * the zilog's zl_issuer_lock, we have to perform this check a
+ * second time while holding the lock.
+ *
+ * We don't need to hold the zl_lock since the lwb cannot transition
+ * from OPENED to ISSUED while we hold the zl_issuer_lock. The lwb
+ * _can_ transition from ISSUED to DONE, but it's OK to race with
+ * that transition since we treat the lwb the same, whether it's in
+ * the ISSUED or DONE states.
+ *
+ * The important thing is that we treat the lwb differently depending on
+ * whether it's ISSUED or OPENED, and block any other threads that might
+ * attempt to issue this lwb. For that reason we hold the
+ * zl_issuer_lock when checking the lwb_state; we must not call
+ * zil_lwb_write_issue() if the lwb had already been issued.
+ *
+ * See the comment above the lwb_state_t structure definition for
+ * more details on the lwb states, and locking requirements.
+ */
+ if (lwb->lwb_state == LWB_STATE_ISSUED ||
+ lwb->lwb_state == LWB_STATE_WRITE_DONE ||
+ lwb->lwb_state == LWB_STATE_FLUSH_DONE)
+ goto out;
+
+ ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED);
+
+ /*
+ * As described in the comments above zil_commit_waiter() and
+ * zil_process_commit_list(), we need to issue this lwb's zio
+ * since we've reached the commit waiter's timeout and it still
+ * hasn't been issued.
+ */
+ lwb_t *nlwb = zil_lwb_write_issue(zilog, lwb);
+
+ IMPLY(nlwb != NULL, lwb->lwb_state != LWB_STATE_OPENED);
+
+ /*
+ * Since the lwb's zio hadn't been issued by the time this thread
+ * reached its timeout, we reset the zilog's "zl_cur_used" field
+ * to influence the zil block size selection algorithm.
+ *
+ * By having to issue the lwb's zio here, it means the size of the
+ * lwb was too large, given the incoming throughput of itxs. By
+ * setting "zl_cur_used" to zero, we communicate this fact to the
+ * block size selection algorithm, so it can take this information
+ * into account, and potentially select a smaller size for the
+ * next lwb block that is allocated.
+ */
+ zilog->zl_cur_used = 0;
+
+ if (nlwb == NULL) {
+ /*
+ * When zil_lwb_write_issue() returns NULL, this
+ * indicates zio_alloc_zil() failed to allocate the
+ * "next" lwb on-disk. When this occurs, the ZIL write
+ * pipeline must be stalled; see the comment within the
+ * zil_commit_writer_stall() function for more details.
+ *
+ * We must drop the commit waiter's lock prior to
+ * calling zil_commit_writer_stall() or else we can wind
+ * up with the following deadlock:
+ *
+ * - This thread is waiting for the txg to sync while
+ * holding the waiter's lock; txg_wait_synced() is
+ * used within zil_commit_writer_stall().
+ *
+ * - The txg can't sync because it is waiting for this
+ * lwb's zio callback to call dmu_tx_commit().
+ *
+ * - The lwb's zio callback can't call dmu_tx_commit()
+ * because it's blocked trying to acquire the waiter's
+ * lock, which occurs prior to calling dmu_tx_commit().
+ */
+ mutex_exit(&zcw->zcw_lock);
+ zil_commit_writer_stall(zilog);
+ mutex_enter(&zcw->zcw_lock);
+ }
+
+out:
+ mutex_exit(&zilog->zl_issuer_lock);
+ ASSERT(MUTEX_HELD(&zcw->zcw_lock));
+}
+
+/*
+ * This function is responsible for performing the following two tasks:
+ *
+ * 1. its primary responsibility is to block until the given "commit
+ * waiter" is considered "done".
+ *
+ * 2. its secondary responsibility is to issue the zio for the lwb that
+ * the given "commit waiter" is waiting on, if this function has
+ * waited "long enough" and the lwb is still in the "open" state.
+ *
+ * Given a sufficient amount of itxs being generated and written using
+ * the ZIL, the lwb's zio will be issued via the zil_lwb_commit()
+ * function. If this does not occur, this secondary responsibility will
+ * ensure the lwb is issued even if there is no other synchronous
+ * activity on the system.
+ *
+ * For more details, see zil_process_commit_list(); more specifically,
+ * the comment at the bottom of that function.
+ */
+static void
+zil_commit_waiter(zilog_t *zilog, zil_commit_waiter_t *zcw)
+{
+ ASSERT(!MUTEX_HELD(&zilog->zl_lock));
+ ASSERT(!MUTEX_HELD(&zilog->zl_issuer_lock));
+ ASSERT(spa_writeable(zilog->zl_spa));
+
+ mutex_enter(&zcw->zcw_lock);
+
+ /*
+ * The timeout is scaled based on the lwb latency to avoid
+ * significantly impacting the latency of each individual itx.
+ * For more details, see the comment at the bottom of the
+ * zil_process_commit_list() function.
+ */
+ int pct = MAX(zfs_commit_timeout_pct, 1);
+ hrtime_t sleep = (zilog->zl_last_lwb_latency * pct) / 100;
+ hrtime_t wakeup = gethrtime() + sleep;
+ boolean_t timedout = B_FALSE;
+
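+	/*
+	 * Note: "wakeup" is an absolute deadline (CALLOUT_FLAG_ABSOLUTE is
+	 * passed to cv_timedwait_hires() below), so re-entering the wait
+	 * after a spurious wakeup does not extend the overall timeout.
+	 */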
+ while (!zcw->zcw_done) {
+ ASSERT(MUTEX_HELD(&zcw->zcw_lock));
+
+ lwb_t *lwb = zcw->zcw_lwb;
+
+ /*
+ * Usually, the waiter will have a non-NULL lwb field here,
+ * but it's possible for it to be NULL as a result of
+ * zil_commit() racing with spa_sync().
+ *
+ * When zil_clean() is called, it's possible for the itxg
+ * list (which may be cleaned via a taskq) to contain
+ * commit itxs. When this occurs, the commit waiters linked
+ * off of these commit itxs will not be committed to an
+ * lwb. Additionally, these commit waiters will not be
+ * marked done until zil_commit_waiter_skip() is called via
+ * zil_itxg_clean().
+ *
+ * Thus, it's possible for this commit waiter (i.e. the
+ * "zcw" variable) to be found in this "in between" state;
+ * where its "zcw_lwb" field is NULL, and it hasn't yet
+ * been skipped, so its "zcw_done" field is still B_FALSE.
+ */
+ IMPLY(lwb != NULL, lwb->lwb_state != LWB_STATE_CLOSED);
+
+ if (lwb != NULL && lwb->lwb_state == LWB_STATE_OPENED) {
+ ASSERT3B(timedout, ==, B_FALSE);
+
+ /*
+ * If the lwb hasn't been issued yet, then we
+ * need to wait with a timeout, in case this
+ * function needs to issue the lwb after the
+ * timeout is reached; responsibility (2) from
+ * the comment above this function.
+ */
+ int rc = cv_timedwait_hires(&zcw->zcw_cv,
+ &zcw->zcw_lock, wakeup, USEC2NSEC(1),
+ CALLOUT_FLAG_ABSOLUTE);
+
+ if (rc != -1 || zcw->zcw_done)
+ continue;
+
+ timedout = B_TRUE;
+ zil_commit_waiter_timeout(zilog, zcw);
+
+ if (!zcw->zcw_done) {
+ /*
+ * If the commit waiter has already been
+ * marked "done", it's possible for the
+ * waiter's lwb structure to have already
+ * been freed. Thus, we can only reliably
+ * make these assertions if the waiter
+ * isn't done.
+ */
+ ASSERT3P(lwb, ==, zcw->zcw_lwb);
+ ASSERT3S(lwb->lwb_state, !=, LWB_STATE_OPENED);
+ }
+ } else {
+ /*
+ * If the lwb isn't open, then it must have already
+ * been issued. In that case, there's no need to
+ * use a timeout when waiting for the lwb to
+ * complete.
+ *
+ * Additionally, if the lwb is NULL, the waiter
+ * will soon be signaled and marked done via
+ * zil_clean() and zil_itxg_clean(), so no timeout
+ * is required.
+ */
+
+ IMPLY(lwb != NULL,
+ lwb->lwb_state == LWB_STATE_ISSUED ||
+ lwb->lwb_state == LWB_STATE_WRITE_DONE ||
+ lwb->lwb_state == LWB_STATE_FLUSH_DONE);
+ cv_wait(&zcw->zcw_cv, &zcw->zcw_lock);
+ }
+ }
+
+ mutex_exit(&zcw->zcw_lock);
+}
+
+static zil_commit_waiter_t *
+zil_alloc_commit_waiter(void)
+{
+ zil_commit_waiter_t *zcw = kmem_cache_alloc(zil_zcw_cache, KM_SLEEP);
+
+ cv_init(&zcw->zcw_cv, NULL, CV_DEFAULT, NULL);
+ mutex_init(&zcw->zcw_lock, NULL, MUTEX_DEFAULT, NULL);
+ list_link_init(&zcw->zcw_node);
+ zcw->zcw_lwb = NULL;
+ zcw->zcw_done = B_FALSE;
+ zcw->zcw_zio_error = 0;
+
+ return (zcw);
+}
+
+static void
+zil_free_commit_waiter(zil_commit_waiter_t *zcw)
+{
+ ASSERT(!list_link_active(&zcw->zcw_node));
+ ASSERT3P(zcw->zcw_lwb, ==, NULL);
+ ASSERT3B(zcw->zcw_done, ==, B_TRUE);
+ mutex_destroy(&zcw->zcw_lock);
+ cv_destroy(&zcw->zcw_cv);
+ kmem_cache_free(zil_zcw_cache, zcw);
+}
+
+/*
+ * This function is used to create a TX_COMMIT itx and assign it. This
+ * way, it will be linked into the ZIL's list of synchronous itxs, and
+ * then later committed to an lwb (or skipped) when
+ * zil_process_commit_list() is called.
+ */
+static void
+zil_commit_itx_assign(zilog_t *zilog, zil_commit_waiter_t *zcw)
+{
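+	/*
+	 * The transaction created here carries no changes; it is assigned
+	 * (TXG_WAIT) to obtain the currently open txg, and zil_itx_assign()
+	 * uses that txg to file the commit itx on the matching in-memory
+	 * itxg list.
+	 */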
+ dmu_tx_t *tx = dmu_tx_create(zilog->zl_os);
+ VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
+
+ itx_t *itx = zil_itx_create(TX_COMMIT, sizeof (lr_t));
+ itx->itx_sync = B_TRUE;
+ itx->itx_private = zcw;
+
+ zil_itx_assign(zilog, itx, tx);
+
+ dmu_tx_commit(tx);
+}
+
+/*
+ * Commit ZFS Intent Log transactions (itxs) to stable storage.
+ *
+ * When writing ZIL transactions to the on-disk representation of the
+ * ZIL, the itxs are committed to a Log Write Block (lwb). Multiple
+ * itxs can be committed to a single lwb. Once a lwb is written and
+ * committed to stable storage (i.e. the lwb is written, and vdevs have
+ * been flushed), each itx that was committed to that lwb is also
+ * considered to be committed to stable storage.
+ *
+ * When an itx is committed to an lwb, the log record (lr_t) contained
+ * by the itx is copied into the lwb's zio buffer, and once this buffer
+ * is written to disk, it becomes an on-disk ZIL block.
+ *
+ * As itxs are generated, they're inserted into the ZIL's queue of
+ * uncommitted itxs. The semantics of zil_commit() are such that it will
+ * block until all itxs that were in the queue when it was called are
+ * committed to stable storage.
+ *
+ * If "foid" is zero, this means all "synchronous" and "asynchronous"
+ * itxs, for all objects in the dataset, will be committed to stable
+ * storage prior to zil_commit() returning. If "foid" is non-zero, all
+ * "synchronous" itxs for all objects, but only "asynchronous" itxs
+ * that correspond to the foid passed in, will be committed to stable
+ * storage prior to zil_commit() returning.
+ *
+ * Generally speaking, when zil_commit() is called, the consumer doesn't
+ * actually care about _all_ of the uncommitted itxs. Instead, they're
+ * simply trying to wait for a specific itx to be committed to disk,
+ * but the interface(s) for interacting with the ZIL don't allow such
+ * fine-grained communication. A better interface would allow a consumer
+ * to create and assign an itx, and then pass a reference to this itx to
+ * zil_commit(); such that zil_commit() would return as soon as that
+ * specific itx was committed to disk (instead of waiting for _all_
+ * itxs to be committed).
+ *
+ * When a thread calls zil_commit() a special "commit itx" will be
+ * generated, along with a corresponding "waiter" for this commit itx.
+ * zil_commit() will wait on this waiter's CV, such that when the waiter
+ * is marked done, and signaled, zil_commit() will return.
+ *
+ * This commit itx is inserted into the queue of uncommitted itxs. This
+ * provides an easy mechanism for determining which itxs were in the
+ * queue prior to zil_commit() having been called, and which itxs were
+ * added after zil_commit() was called.
+ *
+ * The commit itx is special; it doesn't have any on-disk representation.
+ * When a commit itx is "committed" to an lwb, the waiter associated
+ * with it is linked onto the lwb's list of waiters. Then, when that lwb
+ * completes, each waiter on the lwb's list is marked done and signaled
+ * -- allowing the thread waiting on the waiter to return from zil_commit().
+ *
+ * It's important to point out a few critical factors that allow us
+ * to make use of the commit itxs, commit waiters, per-lwb lists of
+ * commit waiters, and zio completion callbacks like we're doing:
+ *
+ * 1. The list of waiters for each lwb is traversed, and each commit
+ * waiter is marked "done" and signaled, in the zio completion
+ * callback of the lwb's zio[*].
+ *
+ * * Actually, the waiters are signaled in the zio completion
+ * callback of the root zio for the DKIOCFLUSHWRITECACHE commands
+ * that are sent to the vdevs upon completion of the lwb zio.
+ *
+ * 2. When the itxs are inserted into the ZIL's queue of uncommitted
+ * itxs, the order in which they are inserted is preserved[*]; as
+ * itxs are added to the queue, they are added to the tail of
+ * in-memory linked lists.
+ *
+ * When committing the itxs to lwbs (to be written to disk), they
+ * are committed in the same order in which the itxs were added to
+ * the uncommitted queue's linked list(s); i.e. the linked list of
+ * itxs to commit is traversed from head to tail, and each itx is
+ * committed to an lwb in that order.
+ *
+ * * To clarify:
+ *
+ * - the order of "sync" itxs is preserved w.r.t. other
+ * "sync" itxs, regardless of the corresponding objects.
+ * - the order of "async" itxs is preserved w.r.t. other
+ * "async" itxs corresponding to the same object.
+ * - the order of "async" itxs is *not* preserved w.r.t. other
+ * "async" itxs corresponding to different objects.
+ * - the order of "sync" itxs w.r.t. "async" itxs (or vice
+ * versa) is *not* preserved, even for itxs that correspond
+ * to the same object.
+ *
+ * For more details, see: zil_itx_assign(), zil_async_to_sync(),
+ * zil_get_commit_list(), and zil_process_commit_list().
+ *
+ * 3. The lwbs represent a linked list of blocks on disk. Thus, an
+ * lwb cannot be considered committed to stable storage until its
+ * "previous" lwb is also committed to stable storage. This fact,
+ * coupled with the fact described above, means that itxs are
+ * committed in (roughly) the order in which they were generated.
+ * This is essential because itxs are dependent on prior itxs.
+ * Thus, we *must not* deem an itx as being committed to stable
+ * storage, until *all* prior itxs have also been committed to
+ * stable storage.
+ *
+ * To enforce this ordering of lwb zio's, while still leveraging as
+ * much of the underlying storage performance as possible, we rely
+ * on two fundamental concepts:
+ *
+ * 1. The creation and issuance of lwb zio's is protected by
+ * the zilog's "zl_issuer_lock", which ensures only a single
+ * thread is creating and/or issuing lwb's at a time
+ * 2. The "previous" lwb is a child of the "current" lwb
+ * (leveraging the zio parent-child dependency graph)
+ *
+ * By relying on this parent-child zio relationship, we can have
+ * many lwb zio's concurrently issued to the underlying storage,
+ * but the order in which they complete will be the same order in
+ * which they were created.
+ */
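+/*
+ * Illustrative call pattern (see, e.g., zfs_fsync()): after logging the
+ * itxs for a synchronous operation, a consumer forces them out with
+ *
+ *	zil_commit(zilog, object_id);
+ *
+ * where "object_id" (the foid) restricts which async itxs must also be
+ * written out, while all sync itxs are always committed.
+ */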
+void
+zil_commit(zilog_t *zilog, uint64_t foid)
+{
+ /*
+ * We should never attempt to call zil_commit on a snapshot for
+ * a couple of reasons:
+ *
+ * 1. A snapshot may never be modified, thus it cannot have any
+ * in-flight itxs that would have modified the dataset.
+ *
+ * 2. By design, when zil_commit() is called, a commit itx will
+ * be assigned to this zilog; as a result, the zilog will be
+ * dirtied. We must not dirty the zilog of a snapshot; there are
+ * checks in the code that enforce this invariant and will
+ * cause a panic if it's not upheld.
+ */
+ ASSERT3B(dmu_objset_is_snapshot(zilog->zl_os), ==, B_FALSE);
+
+ if (zilog->zl_sync == ZFS_SYNC_DISABLED)
+ return;
+
+ if (!spa_writeable(zilog->zl_spa)) {
+ /*
+ * If the SPA is not writable, there should never be any
+ * pending itxs waiting to be committed to disk. If that
+ * weren't true, we'd skip writing those itxs out, and
+ * would break the semantics of zil_commit(); thus, we're
+ * verifying that truth before we return to the caller.
+ */
+ ASSERT(list_is_empty(&zilog->zl_lwb_list));
+ ASSERT3P(zilog->zl_last_lwb_opened, ==, NULL);
+ for (int i = 0; i < TXG_SIZE; i++)
+ ASSERT3P(zilog->zl_itxg[i].itxg_itxs, ==, NULL);
+ return;
+ }
+
+ /*
+ * If the ZIL is suspended, we don't want to dirty it by calling
+ * zil_commit_itx_assign() below, nor can we write out
+ * lwbs as would be done in zil_process_commit_list(). Thus, we
+ * simply rely on txg_wait_synced() to maintain the necessary
+ * semantics, and avoid calling those functions altogether.
+ */
+ if (zilog->zl_suspend > 0) {
+ txg_wait_synced(zilog->zl_dmu_pool, 0);
+ return;
+ }
+
+ zil_commit_impl(zilog, foid);
+}
+
+void
+zil_commit_impl(zilog_t *zilog, uint64_t foid)
+{
+ ZIL_STAT_BUMP(zil_commit_count);
+
+ /*
+ * Move the "async" itxs for the specified foid to the "sync"
+ * queues, such that they will later be committed to an lwb
+ * (or skipped) when zil_process_commit_list() is called.
+ *
+ * Since these "async" itxs must be committed prior to this
+ * call to zil_commit() returning, we must perform this operation
+ * before we call zil_commit_itx_assign().
+ */
+ zil_async_to_sync(zilog, foid);
+
+ /*
+ * We allocate a new "waiter" structure which will initially be
+ * linked to the commit itx using the itx's "itx_private" field.
+ * Since the commit itx doesn't represent any on-disk state,
+ * when it's committed to an lwb, rather than copying its
+ * lr_t into the lwb's buffer, the commit itx's "waiter" will be
+ * added to the lwb's list of waiters. Then, when the lwb is
+ * committed to stable storage, each waiter in the lwb's list of
+ * waiters will be marked "done" and signaled.
+ *
+ * We must create the waiter and assign the commit itx prior to
+ * calling zil_commit_writer(), or else our specific commit itx
+ * is not guaranteed to be committed to an lwb prior to calling
+ * zil_commit_waiter().
+ */
+ zil_commit_waiter_t *zcw = zil_alloc_commit_waiter();
+ zil_commit_itx_assign(zilog, zcw);
+
+ zil_commit_writer(zilog, zcw);
+ zil_commit_waiter(zilog, zcw);
+
+ if (zcw->zcw_zio_error != 0) {
+ /*
+ * If there was an error writing out the ZIL blocks that
+ * this thread is waiting on, then we fall back to
+ * relying on spa_sync() to write out the data this
+ * thread is waiting on. Obviously this has performance
+ * implications, but the expectation is for this to be
+ * an exceptional case, and shouldn't occur often.
+ */
+ DTRACE_PROBE2(zil__commit__io__error,
+ zilog_t *, zilog, zil_commit_waiter_t *, zcw);
+ txg_wait_synced(zilog->zl_dmu_pool, 0);
+ }
+
+ zil_free_commit_waiter(zcw);
+}
+
+/*
+ * Called in syncing context to free committed log blocks and update log header.
+ */
+void
+zil_sync(zilog_t *zilog, dmu_tx_t *tx)
+{
+ zil_header_t *zh = zil_header_in_syncing_context(zilog);
+ uint64_t txg = dmu_tx_get_txg(tx);
+ spa_t *spa = zilog->zl_spa;
+ uint64_t *replayed_seq = &zilog->zl_replayed_seq[txg & TXG_MASK];
+ lwb_t *lwb;
+
+ /*
+ * We don't zero out zl_destroy_txg, so make sure we don't try
+ * to destroy it twice.
+ */
+ if (spa_sync_pass(spa) != 1)
+ return;
+
+ mutex_enter(&zilog->zl_lock);
+
+ ASSERT(zilog->zl_stop_sync == 0);
+
+ if (*replayed_seq != 0) {
+ ASSERT(zh->zh_replay_seq < *replayed_seq);
+ zh->zh_replay_seq = *replayed_seq;
+ *replayed_seq = 0;
+ }
+
+ if (zilog->zl_destroy_txg == txg) {
+ blkptr_t blk = zh->zh_log;
+
+ ASSERT(list_head(&zilog->zl_lwb_list) == NULL);
+
+ bzero(zh, sizeof (zil_header_t));
+ bzero(zilog->zl_replayed_seq, sizeof (zilog->zl_replayed_seq));
+
+ if (zilog->zl_keep_first) {
+ /*
+ * If this block was part of a log chain that couldn't
+ * be claimed because a device was missing during
+ * zil_claim(), but that device later returns,
+ * then this block could erroneously appear valid.
+ * To guard against this, assign a new GUID to the new
+ * log chain so it doesn't matter what blk points to.
+ */
+ zil_init_log_chain(zilog, &blk);
+ zh->zh_log = blk;
+ }
+ }
+
+ while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) {
+ zh->zh_log = lwb->lwb_blk;
+ if (lwb->lwb_buf != NULL || lwb->lwb_max_txg > txg)
+ break;
+ list_remove(&zilog->zl_lwb_list, lwb);
+ zio_free(spa, txg, &lwb->lwb_blk);
+ zil_free_lwb(zilog, lwb);
+
+ /*
+ * If we don't have anything left in the lwb list then
+ * we've had an allocation failure and we need to zero
+ * out the zil_header blkptr so that we don't end
+ * up freeing the same block twice.
+ */
+ if (list_head(&zilog->zl_lwb_list) == NULL)
+ BP_ZERO(&zh->zh_log);
+ }
+
+ /*
+ * Remove fastwrite on any blocks that have been pre-allocated for
+ * the next commit. This prevents fastwrite counter pollution by
+ * unused, long-lived LWBs.
+ */
+ for (; lwb != NULL; lwb = list_next(&zilog->zl_lwb_list, lwb)) {
+ if (lwb->lwb_fastwrite && !lwb->lwb_write_zio) {
+ metaslab_fastwrite_unmark(zilog->zl_spa, &lwb->lwb_blk);
+ lwb->lwb_fastwrite = 0;
+ }
+ }
+
+ mutex_exit(&zilog->zl_lock);
+}
+
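+/*
+ * kmem cache constructor/destructor for lwb_t: initialize and tear down
+ * the embedded itx and waiter lists, the per-lwb vdev AVL tree, and the
+ * vdev lock, so lwbs allocated from zil_lwb_cache are ready for use.
+ */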
+/* ARGSUSED */
+static int
+zil_lwb_cons(void *vbuf, void *unused, int kmflag)
+{
+ lwb_t *lwb = vbuf;
+ list_create(&lwb->lwb_itxs, sizeof (itx_t), offsetof(itx_t, itx_node));
+ list_create(&lwb->lwb_waiters, sizeof (zil_commit_waiter_t),
+ offsetof(zil_commit_waiter_t, zcw_node));
+ avl_create(&lwb->lwb_vdev_tree, zil_lwb_vdev_compare,
+ sizeof (zil_vdev_node_t), offsetof(zil_vdev_node_t, zv_node));
+ mutex_init(&lwb->lwb_vdev_lock, NULL, MUTEX_DEFAULT, NULL);
+ return (0);
+}
+
+/* ARGSUSED */
+static void
+zil_lwb_dest(void *vbuf, void *unused)
+{
+ lwb_t *lwb = vbuf;
+ mutex_destroy(&lwb->lwb_vdev_lock);
+ avl_destroy(&lwb->lwb_vdev_tree);
+ list_destroy(&lwb->lwb_waiters);
+ list_destroy(&lwb->lwb_itxs);
+}
+
+void
+zil_init(void)
+{
+ zil_lwb_cache = kmem_cache_create("zil_lwb_cache",
+ sizeof (lwb_t), 0, zil_lwb_cons, zil_lwb_dest, NULL, NULL, NULL, 0);
+
+ zil_zcw_cache = kmem_cache_create("zil_zcw_cache",
+ sizeof (zil_commit_waiter_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
+
+ zil_ksp = kstat_create("zfs", 0, "zil", "misc",
+ KSTAT_TYPE_NAMED, sizeof (zil_stats) / sizeof (kstat_named_t),
+ KSTAT_FLAG_VIRTUAL);
+
+ if (zil_ksp != NULL) {
+ zil_ksp->ks_data = &zil_stats;
+ kstat_install(zil_ksp);
+ }
+}
+
+void
+zil_fini(void)
+{
+ kmem_cache_destroy(zil_zcw_cache);
+ kmem_cache_destroy(zil_lwb_cache);
+
+ if (zil_ksp != NULL) {
+ kstat_delete(zil_ksp);
+ zil_ksp = NULL;
+ }
+}
+
+void
+zil_set_sync(zilog_t *zilog, uint64_t sync)
+{
+ zilog->zl_sync = sync;
+}
+
+void
+zil_set_logbias(zilog_t *zilog, uint64_t logbias)
+{
+ zilog->zl_logbias = logbias;
+}
+
+zilog_t *
+zil_alloc(objset_t *os, zil_header_t *zh_phys)
+{
+ zilog_t *zilog;
+
+ zilog = kmem_zalloc(sizeof (zilog_t), KM_SLEEP);
+
+ zilog->zl_header = zh_phys;
+ zilog->zl_os = os;
+ zilog->zl_spa = dmu_objset_spa(os);
+ zilog->zl_dmu_pool = dmu_objset_pool(os);
+ zilog->zl_destroy_txg = TXG_INITIAL - 1;
+ zilog->zl_logbias = dmu_objset_logbias(os);
+ zilog->zl_sync = dmu_objset_syncprop(os);
+ zilog->zl_dirty_max_txg = 0;
+ zilog->zl_last_lwb_opened = NULL;
+ zilog->zl_last_lwb_latency = 0;
+ zilog->zl_max_block_size = zil_maxblocksize;
+
+ mutex_init(&zilog->zl_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&zilog->zl_issuer_lock, NULL, MUTEX_DEFAULT, NULL);
+
+ for (int i = 0; i < TXG_SIZE; i++) {
+ mutex_init(&zilog->zl_itxg[i].itxg_lock, NULL,
+ MUTEX_DEFAULT, NULL);
+ }
+
+ list_create(&zilog->zl_lwb_list, sizeof (lwb_t),
+ offsetof(lwb_t, lwb_node));
+
+ list_create(&zilog->zl_itx_commit_list, sizeof (itx_t),
+ offsetof(itx_t, itx_node));
+
+ cv_init(&zilog->zl_cv_suspend, NULL, CV_DEFAULT, NULL);
+
+ return (zilog);
+}
+
+void
+zil_free(zilog_t *zilog)
+{
+ int i;
+
+ zilog->zl_stop_sync = 1;
+
+ ASSERT0(zilog->zl_suspend);
+ ASSERT0(zilog->zl_suspending);
+
+ ASSERT(list_is_empty(&zilog->zl_lwb_list));
+ list_destroy(&zilog->zl_lwb_list);
+
+ ASSERT(list_is_empty(&zilog->zl_itx_commit_list));
+ list_destroy(&zilog->zl_itx_commit_list);
+
+ for (i = 0; i < TXG_SIZE; i++) {
+ /*
+ * It's possible for an itx to be generated that doesn't dirty
+ * a txg (e.g. ztest TX_TRUNCATE). So there's no zil_clean()
+ * callback to remove the entry. We remove those here.
+ *
+ * Also free up the ziltest itxs.
+ */
+ if (zilog->zl_itxg[i].itxg_itxs)
+ zil_itxg_clean(zilog->zl_itxg[i].itxg_itxs);
+ mutex_destroy(&zilog->zl_itxg[i].itxg_lock);
+ }
+
+ mutex_destroy(&zilog->zl_issuer_lock);
+ mutex_destroy(&zilog->zl_lock);
+
+ cv_destroy(&zilog->zl_cv_suspend);
+
+ kmem_free(zilog, sizeof (zilog_t));
+}
+
+/*
+ * Open an intent log.
+ */
+zilog_t *
+zil_open(objset_t *os, zil_get_data_t *get_data)
+{
+ zilog_t *zilog = dmu_objset_zil(os);
+
+ ASSERT3P(zilog->zl_get_data, ==, NULL);
+ ASSERT3P(zilog->zl_last_lwb_opened, ==, NULL);
+ ASSERT(list_is_empty(&zilog->zl_lwb_list));
+
+ zilog->zl_get_data = get_data;
+
+ return (zilog);
+}
+
+/*
+ * Close an intent log.
+ */
+void
+zil_close(zilog_t *zilog)
+{
+ lwb_t *lwb;
+ uint64_t txg;
+
+ if (!dmu_objset_is_snapshot(zilog->zl_os)) {
+ zil_commit(zilog, 0);
+ } else {
+ ASSERT3P(list_tail(&zilog->zl_lwb_list), ==, NULL);
+ ASSERT0(zilog->zl_dirty_max_txg);
+ ASSERT3B(zilog_is_dirty(zilog), ==, B_FALSE);
+ }
+
+ mutex_enter(&zilog->zl_lock);
+ lwb = list_tail(&zilog->zl_lwb_list);
+ if (lwb == NULL)
+ txg = zilog->zl_dirty_max_txg;
+ else
+ txg = MAX(zilog->zl_dirty_max_txg, lwb->lwb_max_txg);
+ mutex_exit(&zilog->zl_lock);
+
+ /*
+ * We need to use txg_wait_synced() to wait long enough for the
+ * ZIL to be clean, and to wait for all pending lwbs to be
+ * written out.
+ */
+ if (txg != 0)
+ txg_wait_synced(zilog->zl_dmu_pool, txg);
+
+ if (zilog_is_dirty(zilog))
+ zfs_dbgmsg("zil (%px) is dirty, txg %llu", zilog, txg);
+ if (txg < spa_freeze_txg(zilog->zl_spa))
+ VERIFY(!zilog_is_dirty(zilog));
+
+ zilog->zl_get_data = NULL;
+
+ /*
+ * We should have only one lwb left on the list; remove it now.
+ */
+ mutex_enter(&zilog->zl_lock);
+ lwb = list_head(&zilog->zl_lwb_list);
+ if (lwb != NULL) {
+ ASSERT3P(lwb, ==, list_tail(&zilog->zl_lwb_list));
+ ASSERT3S(lwb->lwb_state, !=, LWB_STATE_ISSUED);
+
+ if (lwb->lwb_fastwrite)
+ metaslab_fastwrite_unmark(zilog->zl_spa, &lwb->lwb_blk);
+
+ list_remove(&zilog->zl_lwb_list, lwb);
+ zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
+ zil_free_lwb(zilog, lwb);
+ }
+ mutex_exit(&zilog->zl_lock);
+}
+
+static char *suspend_tag = "zil suspending";
+
+/*
+ * Suspend an intent log. While in suspended mode, we still honor
+ * synchronous semantics, but we rely on txg_wait_synced() to do it.
+ * On old version pools, we suspend the log briefly when taking a
+ * snapshot so that it will have an empty intent log.
+ *
+ * Long holds are not really intended to be used the way we do here --
+ * held for such a short time. A concurrent caller of dsl_dataset_long_held()
+ * could fail. Therefore we take pains to only put a long hold if it is
+ * actually necessary. Fortunately, it will only be necessary if the
+ * objset is currently mounted (or the ZVOL equivalent). In that case it
+ * will already have a long hold, so we are not really making things any worse.
+ *
+ * Ideally, we would locate the existing long-holder (i.e. the zfsvfs_t or
+ * zvol_state_t), and use their mechanism to prevent their hold from being
+ * dropped (e.g. VFS_HOLD()). However, that would be even more pain for
+ * very little gain.
+ *
+ * If cookiep == NULL, this does both the suspend and resume.
+ * Otherwise, it returns with the dataset "long held", and the cookie
+ * should be passed into zil_resume().
+ */
+int
+zil_suspend(const char *osname, void **cookiep)
+{
+ objset_t *os;
+ zilog_t *zilog;
+ const zil_header_t *zh;
+ int error;
+
+ error = dmu_objset_hold(osname, suspend_tag, &os);
+ if (error != 0)
+ return (error);
+ zilog = dmu_objset_zil(os);
+
+ mutex_enter(&zilog->zl_lock);
+ zh = zilog->zl_header;
+
+ if (zh->zh_flags & ZIL_REPLAY_NEEDED) { /* unplayed log */
+ mutex_exit(&zilog->zl_lock);
+ dmu_objset_rele(os, suspend_tag);
+ return (SET_ERROR(EBUSY));
+ }
+
+ /*
+ * Don't put a long hold in the cases where we can avoid it. This
+ * is when there is no cookie so we are doing a suspend & resume
+ * (i.e. called from zil_vdev_offline()), and there's nothing to do
+ * for the suspend because it's already suspended, or there's no ZIL.
+ */
+ if (cookiep == NULL && !zilog->zl_suspending &&
+ (zilog->zl_suspend > 0 || BP_IS_HOLE(&zh->zh_log))) {
+ mutex_exit(&zilog->zl_lock);
+ dmu_objset_rele(os, suspend_tag);
+ return (0);
+ }
+
+ dsl_dataset_long_hold(dmu_objset_ds(os), suspend_tag);
+ dsl_pool_rele(dmu_objset_pool(os), suspend_tag);
+
+ zilog->zl_suspend++;
+
+ if (zilog->zl_suspend > 1) {
+ /*
+ * Someone else is already suspending it.
+ * Just wait for them to finish.
+ */
+
+ while (zilog->zl_suspending)
+ cv_wait(&zilog->zl_cv_suspend, &zilog->zl_lock);
+ mutex_exit(&zilog->zl_lock);
+
+ if (cookiep == NULL)
+ zil_resume(os);
+ else
+ *cookiep = os;
+ return (0);
+ }
+
+ /*
+ * If there is no pointer to an on-disk block, this ZIL must not
+ * be active (e.g. filesystem not mounted), so there's nothing
+ * to clean up.
+ */
+ if (BP_IS_HOLE(&zh->zh_log)) {
+ ASSERT(cookiep != NULL); /* fast path already handled */
+
+ *cookiep = os;
+ mutex_exit(&zilog->zl_lock);
+ return (0);
+ }
+
+ /*
+ * The ZIL has work to do. Ensure that the associated encryption
+ * key will remain mapped while we are committing the log by
+ * grabbing a reference to it. If the key isn't loaded we have no
+ * choice but to return an error until the wrapping key is loaded.
+ */
+ if (os->os_encrypted &&
+ dsl_dataset_create_key_mapping(dmu_objset_ds(os)) != 0) {
+ zilog->zl_suspend--;
+ mutex_exit(&zilog->zl_lock);
+ dsl_dataset_long_rele(dmu_objset_ds(os), suspend_tag);
+ dsl_dataset_rele(dmu_objset_ds(os), suspend_tag);
+ return (SET_ERROR(EACCES));
+ }
+
+ zilog->zl_suspending = B_TRUE;
+ mutex_exit(&zilog->zl_lock);
+
+ /*
+ * We need to use zil_commit_impl to ensure we wait for all
+ * LWB_STATE_OPENED and LWB_STATE_ISSUED lwbs to be committed
+ * to disk before proceeding. If we used zil_commit instead, it
+ * would just call txg_wait_synced(), because zl_suspend is set.
+ * txg_wait_synced() doesn't wait for these lwb's to be
+ * LWB_STATE_FLUSH_DONE before returning.
+ */
+ zil_commit_impl(zilog, 0);
+
+ /*
+ * Now that we've ensured all lwb's are LWB_STATE_FLUSH_DONE, we
+ * use txg_wait_synced() to ensure the data from the zilog has
+ * migrated to the main pool before calling zil_destroy().
+ */
+ txg_wait_synced(zilog->zl_dmu_pool, 0);
+
+ zil_destroy(zilog, B_FALSE);
+
+ mutex_enter(&zilog->zl_lock);
+ zilog->zl_suspending = B_FALSE;
+ cv_broadcast(&zilog->zl_cv_suspend);
+ mutex_exit(&zilog->zl_lock);
+
+ if (os->os_encrypted)
+ dsl_dataset_remove_key_mapping(dmu_objset_ds(os));
+
+ if (cookiep == NULL)
+ zil_resume(os);
+ else
+ *cookiep = os;
+ return (0);
+}
+
+void
+zil_resume(void *cookie)
+{
+ objset_t *os = cookie;
+ zilog_t *zilog = dmu_objset_zil(os);
+
+ mutex_enter(&zilog->zl_lock);
+ ASSERT(zilog->zl_suspend != 0);
+ zilog->zl_suspend--;
+ mutex_exit(&zilog->zl_lock);
+ dsl_dataset_long_rele(dmu_objset_ds(os), suspend_tag);
+ dsl_dataset_rele(dmu_objset_ds(os), suspend_tag);
+}
+
+typedef struct zil_replay_arg {
+ zil_replay_func_t **zr_replay;
+ void *zr_arg;
+ boolean_t zr_byteswap;
+ char *zr_lr;
+} zil_replay_arg_t;
+
+static int
+zil_replay_error(zilog_t *zilog, const lr_t *lr, int error)
+{
+ char name[ZFS_MAX_DATASET_NAME_LEN];
+
+ zilog->zl_replaying_seq--; /* didn't actually replay this one */
+
+ dmu_objset_name(zilog->zl_os, name);
+
+ cmn_err(CE_WARN, "ZFS replay transaction error %d, "
+ "dataset %s, seq 0x%llx, txtype %llu %s\n", error, name,
+ (u_longlong_t)lr->lrc_seq,
+ (u_longlong_t)(lr->lrc_txtype & ~TX_CI),
+ (lr->lrc_txtype & TX_CI) ? "CI" : "");
+
+ return (error);
+}
+
+static int
+zil_replay_log_record(zilog_t *zilog, const lr_t *lr, void *zra,
+ uint64_t claim_txg)
+{
+ zil_replay_arg_t *zr = zra;
+ const zil_header_t *zh = zilog->zl_header;
+ uint64_t reclen = lr->lrc_reclen;
+ uint64_t txtype = lr->lrc_txtype;
+ int error = 0;
+
+ zilog->zl_replaying_seq = lr->lrc_seq;
+
+ if (lr->lrc_seq <= zh->zh_replay_seq) /* already replayed */
+ return (0);
+
+ if (lr->lrc_txg < claim_txg) /* already committed */
+ return (0);
+
+ /* Strip case-insensitive bit, still present in log record */
+ txtype &= ~TX_CI;
+
+ if (txtype == 0 || txtype >= TX_MAX_TYPE)
+ return (zil_replay_error(zilog, lr, EINVAL));
+
+ /*
+ * If this record type can be logged out of order, the object
+ * (lr_foid) may no longer exist. That's legitimate, not an error.
+ */
+ if (TX_OOO(txtype)) {
+ error = dmu_object_info(zilog->zl_os,
+ LR_FOID_GET_OBJ(((lr_ooo_t *)lr)->lr_foid), NULL);
+ if (error == ENOENT || error == EEXIST)
+ return (0);
+ }
+
+ /*
+ * Make a copy of the data so we can revise and extend it.
+ */
+ bcopy(lr, zr->zr_lr, reclen);
+
+ /*
+ * If this is a TX_WRITE with a blkptr, suck in the data.
+ */
+ if (txtype == TX_WRITE && reclen == sizeof (lr_write_t)) {
+ error = zil_read_log_data(zilog, (lr_write_t *)lr,
+ zr->zr_lr + reclen);
+ if (error != 0)
+ return (zil_replay_error(zilog, lr, error));
+ }
+
+ /*
+ * The log block containing this lr may have been byteswapped
+ * so that we can easily examine common fields like lrc_txtype.
+ * However, the log is a mix of different record types, and only the
+ * replay vectors know how to byteswap their records. Therefore, if
+ * the lr was byteswapped, undo it before invoking the replay vector.
+ */
+ if (zr->zr_byteswap)
+ byteswap_uint64_array(zr->zr_lr, reclen);
+
+ /*
+ * We must now do two things atomically: replay this log record,
+ * and update the log header sequence number to reflect the fact that
+ * we did so. At the end of each replay function the sequence number
+ * is updated if we are in replay mode.
+ */
+ error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lr, zr->zr_byteswap);
+ if (error != 0) {
+ /*
+ * The DMU's dnode layer doesn't see removes until the txg
+ * commits, so a subsequent claim can spuriously fail with
+ * EEXIST. So if we receive any error we try syncing out
+ * any removes, then retry the transaction. Note that we
+ * specify B_FALSE for byteswap now, so we don't do it twice.
+ */
+ txg_wait_synced(spa_get_dsl(zilog->zl_spa), 0);
+ error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lr, B_FALSE);
+ if (error != 0)
+ return (zil_replay_error(zilog, lr, error));
+ }
+ return (0);
+}
+
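+/* zil_parse() block callback used during replay; it just counts blocks. */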
+/* ARGSUSED */
+static int
+zil_incr_blks(zilog_t *zilog, const blkptr_t *bp, void *arg, uint64_t claim_txg)
+{
+ zilog->zl_replay_blks++;
+
+ return (0);
+}
+
+/*
+ * If this dataset has a non-empty intent log, replay it and destroy it.
+ */
+void
+zil_replay(objset_t *os, void *arg, zil_replay_func_t *replay_func[TX_MAX_TYPE])
+{
+ zilog_t *zilog = dmu_objset_zil(os);
+ const zil_header_t *zh = zilog->zl_header;
+ zil_replay_arg_t zr;
+
+ if ((zh->zh_flags & ZIL_REPLAY_NEEDED) == 0) {
+ zil_destroy(zilog, B_TRUE);
+ return;
+ }
+
+ zr.zr_replay = replay_func;
+ zr.zr_arg = arg;
+ zr.zr_byteswap = BP_SHOULD_BYTESWAP(&zh->zh_log);
+ zr.zr_lr = vmem_alloc(2 * SPA_MAXBLOCKSIZE, KM_SLEEP);
+
+ /*
+ * Wait for in-progress removes to sync before starting replay.
+ */
+ txg_wait_synced(zilog->zl_dmu_pool, 0);
+
+ zilog->zl_replay = B_TRUE;
+ zilog->zl_replay_time = ddi_get_lbolt();
+ ASSERT(zilog->zl_replay_blks == 0);
+ (void) zil_parse(zilog, zil_incr_blks, zil_replay_log_record, &zr,
+ zh->zh_claim_txg, B_TRUE);
+ vmem_free(zr.zr_lr, 2 * SPA_MAXBLOCKSIZE);
+
+ zil_destroy(zilog, B_FALSE);
+ txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg);
+ zilog->zl_replay = B_FALSE;
+}
+
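+/*
+ * Returns B_TRUE when the caller should not write new log records:
+ * either sync is disabled for this dataset, or we are in the middle of
+ * replay, in which case the sequence number being replayed is recorded
+ * so zil_sync() can update the on-disk replay sequence.
+ */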
+boolean_t
+zil_replaying(zilog_t *zilog, dmu_tx_t *tx)
+{
+ if (zilog->zl_sync == ZFS_SYNC_DISABLED)
+ return (B_TRUE);
+
+ if (zilog->zl_replay) {
+ dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
+ zilog->zl_replayed_seq[dmu_tx_get_txg(tx) & TXG_MASK] =
+ zilog->zl_replaying_seq;
+ return (B_TRUE);
+ }
+
+ return (B_FALSE);
+}
+
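+/*
+ * Empty out the ZIL for "osname" via a paired suspend/resume (cookiep ==
+ * NULL). EACCES (crypto key not loaded) and EBUSY are passed through;
+ * any other failure is reported as EEXIST.
+ */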
+/* ARGSUSED */
+int
+zil_reset(const char *osname, void *arg)
+{
+ int error;
+
+ error = zil_suspend(osname, NULL);
+ /* EACCES means crypto key not loaded */
+ if ((error == EACCES) || (error == EBUSY))
+ return (SET_ERROR(error));
+ if (error != 0)
+ return (SET_ERROR(EEXIST));
+ return (0);
+}
+
+EXPORT_SYMBOL(zil_alloc);
+EXPORT_SYMBOL(zil_free);
+EXPORT_SYMBOL(zil_open);
+EXPORT_SYMBOL(zil_close);
+EXPORT_SYMBOL(zil_replay);
+EXPORT_SYMBOL(zil_replaying);
+EXPORT_SYMBOL(zil_destroy);
+EXPORT_SYMBOL(zil_destroy_sync);
+EXPORT_SYMBOL(zil_itx_create);
+EXPORT_SYMBOL(zil_itx_destroy);
+EXPORT_SYMBOL(zil_itx_assign);
+EXPORT_SYMBOL(zil_commit);
+EXPORT_SYMBOL(zil_claim);
+EXPORT_SYMBOL(zil_check_log_chain);
+EXPORT_SYMBOL(zil_sync);
+EXPORT_SYMBOL(zil_clean);
+EXPORT_SYMBOL(zil_suspend);
+EXPORT_SYMBOL(zil_resume);
+EXPORT_SYMBOL(zil_lwb_add_block);
+EXPORT_SYMBOL(zil_bp_tree_add);
+EXPORT_SYMBOL(zil_set_sync);
+EXPORT_SYMBOL(zil_set_logbias);
+
+/* BEGIN CSTYLED */
+ZFS_MODULE_PARAM(zfs, zfs_, commit_timeout_pct, INT, ZMOD_RW,
+ "ZIL block open timeout percentage");
+
+ZFS_MODULE_PARAM(zfs_zil, zil_, replay_disable, INT, ZMOD_RW,
+ "Disable intent logging replay");
+
+ZFS_MODULE_PARAM(zfs_zil, zil_, nocacheflush, INT, ZMOD_RW,
+ "Disable ZIL cache flushes");
+
+ZFS_MODULE_PARAM(zfs_zil, zil_, slog_bulk, ULONG, ZMOD_RW,
+ "Limit in bytes slog sync writes per commit");
+
+ZFS_MODULE_PARAM(zfs_zil, zil_, maxblocksize, INT, ZMOD_RW,
+ "Limit in bytes of ZIL log block size");
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/zfs/zio.c b/sys/contrib/openzfs/module/zfs/zio.c
new file mode 100644
index 000000000000..7f3cb19d46db
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/zio.c
@@ -0,0 +1,5039 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2020 by Delphix. All rights reserved.
+ * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2017, Intel Corporation.
+ * Copyright (c) 2019, Klara Inc.
+ * Copyright (c) 2019, Allan Jude
+ */
+
+#include <sys/sysmacros.h>
+#include <sys/zfs_context.h>
+#include <sys/fm/fs/zfs.h>
+#include <sys/spa.h>
+#include <sys/txg.h>
+#include <sys/spa_impl.h>
+#include <sys/vdev_impl.h>
+#include <sys/vdev_trim.h>
+#include <sys/zio_impl.h>
+#include <sys/zio_compress.h>
+#include <sys/zio_checksum.h>
+#include <sys/dmu_objset.h>
+#include <sys/arc.h>
+#include <sys/ddt.h>
+#include <sys/blkptr.h>
+#include <sys/zfeature.h>
+#include <sys/dsl_scan.h>
+#include <sys/metaslab_impl.h>
+#include <sys/time.h>
+#include <sys/trace_zfs.h>
+#include <sys/abd.h>
+#include <sys/dsl_crypt.h>
+#include <cityhash.h>
+
+/*
+ * ==========================================================================
+ * I/O type descriptions
+ * ==========================================================================
+ */
+const char *zio_type_name[ZIO_TYPES] = {
+ /*
+ * Note: Linux kernel thread name length is limited
+ * so these names will differ from upstream open zfs.
+ */
+ "z_null", "z_rd", "z_wr", "z_fr", "z_cl", "z_ioctl", "z_trim"
+};
+
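+/*
+ * zio_dva_throttle_enabled gates the block allocation throttle applied
+ * during the DVA allocation stage; zio_deadman_log_all makes the zio
+ * deadman report every slow zio rather than only those bound to a vdev.
+ */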
+int zio_dva_throttle_enabled = B_TRUE;
+int zio_deadman_log_all = B_FALSE;
+
+/*
+ * ==========================================================================
+ * I/O kmem caches
+ * ==========================================================================
+ */
+kmem_cache_t *zio_cache;
+kmem_cache_t *zio_link_cache;
+kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
+kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
+#if defined(ZFS_DEBUG) && !defined(_KERNEL)
+uint64_t zio_buf_cache_allocs[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
+uint64_t zio_buf_cache_frees[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
+#endif
+
+/* Mark IOs as "slow" if they take longer than 30 seconds */
+int zio_slow_io_ms = (30 * MILLISEC);
+
+#define BP_SPANB(indblkshift, level) \
+ (((uint64_t)1) << ((level) * ((indblkshift) - SPA_BLKPTRSHIFT)))
+#define COMPARE_META_LEVEL 0x80000000ul
+/*
+ * The following actions directly affect the spa's sync-to-convergence logic.
+ * The values below define the sync pass when we start performing the action.
+ * Care should be taken when changing these values as they directly impact
+ * spa_sync() performance. Tuning these values may introduce subtle performance
+ * pathologies and should only be done in the context of performance analysis.
+ * These tunables will eventually be removed and replaced with #defines once
+ * enough analysis has been done to determine optimal values.
+ *
+ * The 'zfs_sync_pass_deferred_free' pass must be greater than 1 to ensure that
+ * regular blocks are not deferred.
+ *
+ * Starting in sync pass 8 (zfs_sync_pass_dont_compress), we disable
+ * compression (including of metadata). In practice, we don't have this
+ * many sync passes, so this has no effect.
+ *
+ * The original intent was that disabling compression would help the sync
+ * passes to converge. However, in practice disabling compression increases
+ * the average number of sync passes, because when we turn compression off, a
+ * lot of blocks' sizes will change and thus we have to re-allocate (not
+ * overwrite) them. It also increases the number of 128KB allocations (e.g.
+ * for indirect blocks and spacemaps) because these will not be compressed.
+ * The 128K allocations are especially detrimental to performance on highly
+ * fragmented systems, which may have very few free segments of this size,
+ * and may need to load new metaslabs to satisfy 128K allocations.
+ */
+int zfs_sync_pass_deferred_free = 2; /* defer frees starting in this pass */
+int zfs_sync_pass_dont_compress = 8; /* don't compress starting in this pass */
+int zfs_sync_pass_rewrite = 2; /* rewrite new bps starting in this pass */
+
+/*
+ * An allocating zio is one that either currently has the DVA allocate
+ * stage set or will have it later in its lifetime.
+ */
+#define IO_IS_ALLOCATING(zio) ((zio)->io_orig_pipeline & ZIO_STAGE_DVA_ALLOCATE)
+
+/*
+ * Keep kernel crash dumps ("cores") smaller by excluding metadata
+ * allocations from them as well.
+ */
+int zio_exclude_metadata = 0;
+int zio_requeue_io_start_cut_in_line = 1;
+
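+/*
+ * In debug builds, zio buffer caches at or below this size keep full kmem
+ * debugging enabled (unless zio_exclude_metadata is set); larger caches
+ * are created with KMC_NODEBUG. See the cflags computation in zio_init().
+ */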
+#ifdef ZFS_DEBUG
+int zio_buf_debug_limit = 16384;
+#else
+int zio_buf_debug_limit = 0;
+#endif
+
+static inline void __zio_execute(zio_t *zio);
+
+static void zio_taskq_dispatch(zio_t *, zio_taskq_type_t, boolean_t);
+
+void
+zio_init(void)
+{
+ size_t c;
+
+ zio_cache = kmem_cache_create("zio_cache",
+ sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
+ zio_link_cache = kmem_cache_create("zio_link_cache",
+ sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
+
+ /*
+ * For small buffers, we want a cache for each multiple of
+ * SPA_MINBLOCKSIZE. For larger buffers, we want a cache
+ * for each quarter-power of 2.
+ */
+ for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
+ size_t size = (c + 1) << SPA_MINBLOCKSHIFT;
+ size_t p2 = size;
+ size_t align = 0;
+ size_t data_cflags, cflags;
+
+ data_cflags = KMC_NODEBUG;
+ cflags = (zio_exclude_metadata || size > zio_buf_debug_limit) ?
+ KMC_NODEBUG : 0;
+
+#if defined(_ILP32) && defined(_KERNEL)
+ /*
+ * Cache size limited to 1M on 32-bit platforms until ARC
+ * buffers no longer require virtual address space.
+ */
+ if (size > zfs_max_recordsize)
+ break;
+#endif
+
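+		/* Round p2 down to the largest power of two <= size. */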
+ while (!ISP2(p2))
+ p2 &= p2 - 1;
+
+#ifndef _KERNEL
+ /*
+ * If we are using watchpoints, put each buffer on its own page,
+ * to eliminate the performance overhead of trapping to the
+ * kernel when modifying a non-watched buffer that shares the
+ * page with a watched buffer.
+ */
+ if (arc_watch && !IS_P2ALIGNED(size, PAGESIZE))
+ continue;
+ /*
+ * Here's the problem - on 4K native devices in userland on
+ * Linux using O_DIRECT, buffers must be 4K aligned or I/O
+ * will fail with EINVAL, causing zdb (and others) to coredump.
+ * Since userland probably doesn't need optimized buffer caches,
+ * we just force 4K alignment on everything.
+ */
+ align = 8 * SPA_MINBLOCKSIZE;
+#else
+ if (size < PAGESIZE) {
+ align = SPA_MINBLOCKSIZE;
+ } else if (IS_P2ALIGNED(size, p2 >> 2)) {
+ align = PAGESIZE;
+ }
+#endif
+
+ if (align != 0) {
+ char name[36];
+ if (cflags == data_cflags) {
+ /*
+ * Resulting kmem caches would be identical.
+ * Save memory by creating only one.
+ */
+ (void) snprintf(name, sizeof (name),
+ "zio_buf_comb_%lu", (ulong_t)size);
+ zio_buf_cache[c] = kmem_cache_create(name,
+ size, align, NULL, NULL, NULL, NULL, NULL,
+ cflags);
+ zio_data_buf_cache[c] = zio_buf_cache[c];
+ continue;
+ }
+ (void) snprintf(name, sizeof (name), "zio_buf_%lu",
+ (ulong_t)size);
+ zio_buf_cache[c] = kmem_cache_create(name, size,
+ align, NULL, NULL, NULL, NULL, NULL, cflags);
+
+ (void) snprintf(name, sizeof (name), "zio_data_buf_%lu",
+ (ulong_t)size);
+ zio_data_buf_cache[c] = kmem_cache_create(name, size,
+ align, NULL, NULL, NULL, NULL, NULL, data_cflags);
+ }
+ }
+
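+ /*
+ * Any size class that did not get its own cache above shares
+ * the next larger cache that was created.
+ */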
+ while (--c != 0) {
+ ASSERT(zio_buf_cache[c] != NULL);
+ if (zio_buf_cache[c - 1] == NULL)
+ zio_buf_cache[c - 1] = zio_buf_cache[c];
+
+ ASSERT(zio_data_buf_cache[c] != NULL);
+ if (zio_data_buf_cache[c - 1] == NULL)
+ zio_data_buf_cache[c - 1] = zio_data_buf_cache[c];
+ }
+
+ zio_inject_init();
+
+ lz4_init();
+}
+
+void
+zio_fini(void)
+{
+ size_t i, j, n;
+ kmem_cache_t *cache;
+
+ n = SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT;
+
+#if defined(ZFS_DEBUG) && !defined(_KERNEL)
+ for (i = 0; i < n; i++) {
+ if (zio_buf_cache_allocs[i] != zio_buf_cache_frees[i])
+ (void) printf("zio_fini: [%d] %llu != %llu\n",
+ (int)((i + 1) << SPA_MINBLOCKSHIFT),
+ (long long unsigned)zio_buf_cache_allocs[i],
+ (long long unsigned)zio_buf_cache_frees[i]);
+ }
+#endif
+
+ /*
+ * The same kmem cache can show up multiple times in both zio_buf_cache
+ * and zio_data_buf_cache. Do a wasteful but trivially correct scan to
+ * sort it out.
+ */
+ for (i = 0; i < n; i++) {
+ cache = zio_buf_cache[i];
+ if (cache == NULL)
+ continue;
+ for (j = i; j < n; j++) {
+ if (cache == zio_buf_cache[j])
+ zio_buf_cache[j] = NULL;
+ if (cache == zio_data_buf_cache[j])
+ zio_data_buf_cache[j] = NULL;
+ }
+ kmem_cache_destroy(cache);
+ }
+
+ for (i = 0; i < n; i++) {
+ cache = zio_data_buf_cache[i];
+ if (cache == NULL)
+ continue;
+ for (j = i; j < n; j++) {
+ if (cache == zio_data_buf_cache[j])
+ zio_data_buf_cache[j] = NULL;
+ }
+ kmem_cache_destroy(cache);
+ }
+
+ for (i = 0; i < n; i++) {
+ if (zio_buf_cache[i] != NULL)
+ panic("zio_fini: zio_buf_cache[%d] != NULL", (int)i);
+ if (zio_data_buf_cache[i] != NULL)
+ panic("zio_fini: zio_data_buf_cache[%d] != NULL", (int)i);
+ }
+
+ kmem_cache_destroy(zio_link_cache);
+ kmem_cache_destroy(zio_cache);
+
+ zio_inject_fini();
+
+ lz4_fini();
+}
+
+/*
+ * ==========================================================================
+ * Allocate and free I/O buffers
+ * ==========================================================================
+ */
+
+/*
+ * Use zio_buf_alloc to allocate ZFS metadata. This data will appear in a
+ * crashdump if the kernel panics, so use it judiciously. Obviously, it's
+ * useful to inspect ZFS metadata, but if possible, we should avoid keeping
+ * excess / transient data in-core during a crashdump.
+ */
+void *
+zio_buf_alloc(size_t size)
+{
+ size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
+
+ VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
+#if defined(ZFS_DEBUG) && !defined(_KERNEL)
+ atomic_add_64(&zio_buf_cache_allocs[c], 1);
+#endif
+
+ return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE));
+}
+
+/*
+ * Use zio_data_buf_alloc to allocate data. The data will not appear in a
+ * crashdump if the kernel panics. This exists so that we limit the amount
+ * of ZFS data that shows up in a kernel crashdump, thus reducing the amount
+ * of kernel heap dumped to disk when the kernel panics.
+ */
+void *
+zio_data_buf_alloc(size_t size)
+{
+ size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
+
+ VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
+
+ return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE));
+}
+
+void
+zio_buf_free(void *buf, size_t size)
+{
+ size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
+
+ VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
+#if defined(ZFS_DEBUG) && !defined(_KERNEL)
+ atomic_add_64(&zio_buf_cache_frees[c], 1);
+#endif
+
+ kmem_cache_free(zio_buf_cache[c], buf);
+}
+
+void
+zio_data_buf_free(void *buf, size_t size)
+{
+ size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
+
+ VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
+
+ kmem_cache_free(zio_data_buf_cache[c], buf);
+}
+
+static void
+zio_abd_free(void *abd, size_t size)
+{
+ abd_free((abd_t *)abd);
+}
+
+/*
+ * ==========================================================================
+ * Push and pop I/O transform buffers
+ * ==========================================================================
+ */
+void
+zio_push_transform(zio_t *zio, abd_t *data, uint64_t size, uint64_t bufsize,
+ zio_transform_func_t *transform)
+{
+ zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP);
+
+ zt->zt_orig_abd = zio->io_abd;
+ zt->zt_orig_size = zio->io_size;
+ zt->zt_bufsize = bufsize;
+ zt->zt_transform = transform;
+
+ zt->zt_next = zio->io_transform_stack;
+ zio->io_transform_stack = zt;
+
+ zio->io_abd = data;
+ zio->io_size = size;
+}
+
+void
+zio_pop_transforms(zio_t *zio)
+{
+ zio_transform_t *zt;
+
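+ /*
+ * Unwind the transform stack in LIFO order: run each transform's
+ * callback against the original buffer, free the pushed abd when
+ * zt_bufsize is non-zero, and restore the original abd and size.
+ */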
+ while ((zt = zio->io_transform_stack) != NULL) {
+ if (zt->zt_transform != NULL)
+ zt->zt_transform(zio,
+ zt->zt_orig_abd, zt->zt_orig_size);
+
+ if (zt->zt_bufsize != 0)
+ abd_free(zio->io_abd);
+
+ zio->io_abd = zt->zt_orig_abd;
+ zio->io_size = zt->zt_orig_size;
+ zio->io_transform_stack = zt->zt_next;
+
+ kmem_free(zt, sizeof (zio_transform_t));
+ }
+}
+
+/*
+ * ==========================================================================
+ * I/O transform callbacks for subblocks, decompression, and decryption
+ * ==========================================================================
+ */
+static void
+zio_subblock(zio_t *zio, abd_t *data, uint64_t size)
+{
+ ASSERT(zio->io_size > size);
+
+ if (zio->io_type == ZIO_TYPE_READ)
+ abd_copy(data, zio->io_abd, size);
+}
+
+static void
+zio_decompress(zio_t *zio, abd_t *data, uint64_t size)
+{
+ if (zio->io_error == 0) {
+ void *tmp = abd_borrow_buf(data, size);
+ int ret = zio_decompress_data(BP_GET_COMPRESS(zio->io_bp),
+ zio->io_abd, tmp, zio->io_size, size,
+ &zio->io_prop.zp_complevel);
+ abd_return_buf_copy(data, tmp, size);
+
+ if (zio_injection_enabled && ret == 0)
+ ret = zio_handle_fault_injection(zio, EINVAL);
+
+ if (ret != 0)
+ zio->io_error = SET_ERROR(EIO);
+ }
+}
+
+static void
+zio_decrypt(zio_t *zio, abd_t *data, uint64_t size)
+{
+ int ret;
+ void *tmp;
+ blkptr_t *bp = zio->io_bp;
+ spa_t *spa = zio->io_spa;
+ uint64_t dsobj = zio->io_bookmark.zb_objset;
+ uint64_t lsize = BP_GET_LSIZE(bp);
+ dmu_object_type_t ot = BP_GET_TYPE(bp);
+ uint8_t salt[ZIO_DATA_SALT_LEN];
+ uint8_t iv[ZIO_DATA_IV_LEN];
+ uint8_t mac[ZIO_DATA_MAC_LEN];
+ boolean_t no_crypt = B_FALSE;
+
+ ASSERT(BP_USES_CRYPT(bp));
+ ASSERT3U(size, !=, 0);
+
+ if (zio->io_error != 0)
+ return;
+
+ /*
+ * Verify the cksum of MACs stored in an indirect bp. It will always
+ * be possible to verify this since it does not require an encryption
+ * key.
+ */
+ if (BP_HAS_INDIRECT_MAC_CKSUM(bp)) {
+ zio_crypt_decode_mac_bp(bp, mac);
+
+ if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF) {
+ /*
+ * We haven't decompressed the data yet, but
+ * zio_crypt_do_indirect_mac_checksum() requires
+ * decompressed data to be able to parse out the MACs
+ * from the indirect block. We decompress it now and
+ * throw away the result after we are finished.
+ */
+ tmp = zio_buf_alloc(lsize);
+ ret = zio_decompress_data(BP_GET_COMPRESS(bp),
+ zio->io_abd, tmp, zio->io_size, lsize,
+ &zio->io_prop.zp_complevel);
+ if (ret != 0) {
+ ret = SET_ERROR(EIO);
+ goto error;
+ }
+ ret = zio_crypt_do_indirect_mac_checksum(B_FALSE,
+ tmp, lsize, BP_SHOULD_BYTESWAP(bp), mac);
+ zio_buf_free(tmp, lsize);
+ } else {
+ ret = zio_crypt_do_indirect_mac_checksum_abd(B_FALSE,
+ zio->io_abd, size, BP_SHOULD_BYTESWAP(bp), mac);
+ }
+ abd_copy(data, zio->io_abd, size);
+
+ if (zio_injection_enabled && ot != DMU_OT_DNODE && ret == 0) {
+ ret = zio_handle_decrypt_injection(spa,
+ &zio->io_bookmark, ot, ECKSUM);
+ }
+ if (ret != 0)
+ goto error;
+
+ return;
+ }
+
+ /*
+ * If this is an authenticated block, just check the MAC. It would be
+ * nice to separate this out into its own flag, but for the moment
+ * enum zio_flag is out of bits.
+ */
+ if (BP_IS_AUTHENTICATED(bp)) {
+ if (ot == DMU_OT_OBJSET) {
+ ret = spa_do_crypt_objset_mac_abd(B_FALSE, spa,
+ dsobj, zio->io_abd, size, BP_SHOULD_BYTESWAP(bp));
+ } else {
+ zio_crypt_decode_mac_bp(bp, mac);
+ ret = spa_do_crypt_mac_abd(B_FALSE, spa, dsobj,
+ zio->io_abd, size, mac);
+ if (zio_injection_enabled && ret == 0) {
+ ret = zio_handle_decrypt_injection(spa,
+ &zio->io_bookmark, ot, ECKSUM);
+ }
+ }
+ abd_copy(data, zio->io_abd, size);
+
+ if (ret != 0)
+ goto error;
+
+ return;
+ }
+
+ zio_crypt_decode_params_bp(bp, salt, iv);
+
+ if (ot == DMU_OT_INTENT_LOG) {
+ tmp = abd_borrow_buf_copy(zio->io_abd, sizeof (zil_chain_t));
+ zio_crypt_decode_mac_zil(tmp, mac);
+ abd_return_buf(zio->io_abd, tmp, sizeof (zil_chain_t));
+ } else {
+ zio_crypt_decode_mac_bp(bp, mac);
+ }
+
+ ret = spa_do_crypt_abd(B_FALSE, spa, &zio->io_bookmark, BP_GET_TYPE(bp),
+ BP_GET_DEDUP(bp), BP_SHOULD_BYTESWAP(bp), salt, iv, mac, size, data,
+ zio->io_abd, &no_crypt);
+ if (no_crypt)
+ abd_copy(data, zio->io_abd, size);
+
+ if (ret != 0)
+ goto error;
+
+ return;
+
+error:
+ /* assert that the key was found unless this was speculative */
+ ASSERT(ret != EACCES || (zio->io_flags & ZIO_FLAG_SPECULATIVE));
+
+ /*
+ * If there was a decryption / authentication error return EIO as
+ * the io_error. If this was not a speculative zio, create an ereport.
+ */
+ if (ret == ECKSUM) {
+ zio->io_error = SET_ERROR(EIO);
+ if ((zio->io_flags & ZIO_FLAG_SPECULATIVE) == 0) {
+ spa_log_error(spa, &zio->io_bookmark);
+ (void) zfs_ereport_post(FM_EREPORT_ZFS_AUTHENTICATION,
+ spa, NULL, &zio->io_bookmark, zio, 0);
+ }
+ } else {
+ zio->io_error = ret;
+ }
+}
+
+/*
+ * ==========================================================================
+ * I/O parent/child relationships and pipeline interlocks
+ * ==========================================================================
+ */
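+/*
+ * A minimal usage sketch for the iterators below: start with a NULL
+ * cursor and call repeatedly until NULL is returned, e.g.
+ *
+ *	zio_link_t *zl = NULL;
+ *	zio_t *pio;
+ *	while ((pio = zio_walk_parents(cio, &zl)) != NULL)
+ *		handle(pio);
+ *
+ * where handle() is a placeholder for the caller's per-parent work.
+ */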
+zio_t *
+zio_walk_parents(zio_t *cio, zio_link_t **zl)
+{
+ list_t *pl = &cio->io_parent_list;
+
+ *zl = (*zl == NULL) ? list_head(pl) : list_next(pl, *zl);
+ if (*zl == NULL)
+ return (NULL);
+
+ ASSERT((*zl)->zl_child == cio);
+ return ((*zl)->zl_parent);
+}
+
+zio_t *
+zio_walk_children(zio_t *pio, zio_link_t **zl)
+{
+ list_t *cl = &pio->io_child_list;
+
+ ASSERT(MUTEX_HELD(&pio->io_lock));
+
+ *zl = (*zl == NULL) ? list_head(cl) : list_next(cl, *zl);
+ if (*zl == NULL)
+ return (NULL);
+
+ ASSERT((*zl)->zl_parent == pio);
+ return ((*zl)->zl_child);
+}
+
+zio_t *
+zio_unique_parent(zio_t *cio)
+{
+ zio_link_t *zl = NULL;
+ zio_t *pio = zio_walk_parents(cio, &zl);
+
+ VERIFY3P(zio_walk_parents(cio, &zl), ==, NULL);
+ return (pio);
+}
+
+void
+zio_add_child(zio_t *pio, zio_t *cio)
+{
+ zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP);
+
+ /*
+ * Logical I/Os can have logical, gang, or vdev children.
+ * Gang I/Os can have gang or vdev children.
+ * Vdev I/Os can only have vdev children.
+ * The following ASSERT captures all of these constraints.
+ */
+ ASSERT3S(cio->io_child_type, <=, pio->io_child_type);
+
+ zl->zl_parent = pio;
+ zl->zl_child = cio;
+
+ mutex_enter(&pio->io_lock);
+ mutex_enter(&cio->io_lock);
+
+ ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0);
+
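+ /*
+ * For each wait type the child has not yet passed, count it
+ * as an outstanding child of the parent.
+ */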
+ for (int w = 0; w < ZIO_WAIT_TYPES; w++)
+ pio->io_children[cio->io_child_type][w] += !cio->io_state[w];
+
+ list_insert_head(&pio->io_child_list, zl);
+ list_insert_head(&cio->io_parent_list, zl);
+
+ pio->io_child_count++;
+ cio->io_parent_count++;
+
+ mutex_exit(&cio->io_lock);
+ mutex_exit(&pio->io_lock);
+}
+
+static void
+zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl)
+{
+ ASSERT(zl->zl_parent == pio);
+ ASSERT(zl->zl_child == cio);
+
+ mutex_enter(&pio->io_lock);
+ mutex_enter(&cio->io_lock);
+
+ list_remove(&pio->io_child_list, zl);
+ list_remove(&cio->io_parent_list, zl);
+
+ pio->io_child_count--;
+ cio->io_parent_count--;
+
+ mutex_exit(&cio->io_lock);
+ mutex_exit(&pio->io_lock);
+ kmem_cache_free(zio_link_cache, zl);
+}
+
+static boolean_t
+zio_wait_for_children(zio_t *zio, uint8_t childbits, enum zio_wait_type wait)
+{
+ boolean_t waiting = B_FALSE;
+
+ mutex_enter(&zio->io_lock);
+ ASSERT(zio->io_stall == NULL);
+ for (int c = 0; c < ZIO_CHILD_TYPES; c++) {
+ if (!(ZIO_CHILD_BIT_IS_SET(childbits, c)))
+ continue;
+
+ uint64_t *countp = &zio->io_children[c][wait];
+ if (*countp != 0) {
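+ /*
+ * Back the stage up one bit so that the pipeline
+ * resumes at this same stage once the children we
+ * are waiting on have completed.
+ */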
+ zio->io_stage >>= 1;
+ ASSERT3U(zio->io_stage, !=, ZIO_STAGE_OPEN);
+ zio->io_stall = countp;
+ waiting = B_TRUE;
+ break;
+ }
+ }
+ mutex_exit(&zio->io_lock);
+ return (waiting);
+}
+
+__attribute__((always_inline))
+static inline void
+zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait,
+ zio_t **next_to_executep)
+{
+ uint64_t *countp = &pio->io_children[zio->io_child_type][wait];
+ int *errorp = &pio->io_child_error[zio->io_child_type];
+
+ mutex_enter(&pio->io_lock);
+ if (zio->io_error && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
+ *errorp = zio_worst_error(*errorp, zio->io_error);
+ pio->io_reexecute |= zio->io_reexecute;
+ ASSERT3U(*countp, >, 0);
+
+ (*countp)--;
+
+ if (*countp == 0 && pio->io_stall == countp) {
+ zio_taskq_type_t type =
+ pio->io_stage < ZIO_STAGE_VDEV_IO_START ? ZIO_TASKQ_ISSUE :
+ ZIO_TASKQ_INTERRUPT;
+ pio->io_stall = NULL;
+ mutex_exit(&pio->io_lock);
+
+ /*
+ * If we can tell the caller to execute this parent next, do
+ * so. Otherwise dispatch the parent zio as its own task.
+ *
+ * Having the caller execute the parent when possible reduces
+ * locking on the zio taskqs, reduces context switch
+ * overhead, and has no recursion penalty. Note that one
+ * read from disk typically causes at least 3 zios: a
+ * zio_null(), the logical zio_read(), and then a physical
+ * zio. When the physical zio completes, we are able to call
+ * zio_done() on all 3 of these zios from one invocation of
+ * zio_execute() by returning the parent back to
+ * zio_execute(). Since the parent isn't executed until this
+ * thread returns back to zio_execute(), the caller should do
+ * so promptly.
+ *
+ * In other cases, dispatching the parent prevents
+ * overflowing the stack when we have deeply nested
+ * parent-child relationships, as we do with the "mega zio"
+ * of writes for spa_sync(), and the chain of ZIL blocks.
+ */
+ if (next_to_executep != NULL && *next_to_executep == NULL) {
+ *next_to_executep = pio;
+ } else {
+ zio_taskq_dispatch(pio, type, B_FALSE);
+ }
+ } else {
+ mutex_exit(&pio->io_lock);
+ }
+}
+
+static void
+zio_inherit_child_errors(zio_t *zio, enum zio_child c)
+{
+ if (zio->io_child_error[c] != 0 && zio->io_error == 0)
+ zio->io_error = zio->io_child_error[c];
+}
+
+int
+zio_bookmark_compare(const void *x1, const void *x2)
+{
+ const zio_t *z1 = x1;
+ const zio_t *z2 = x2;
+
+ if (z1->io_bookmark.zb_objset < z2->io_bookmark.zb_objset)
+ return (-1);
+ if (z1->io_bookmark.zb_objset > z2->io_bookmark.zb_objset)
+ return (1);
+
+ if (z1->io_bookmark.zb_object < z2->io_bookmark.zb_object)
+ return (-1);
+ if (z1->io_bookmark.zb_object > z2->io_bookmark.zb_object)
+ return (1);
+
+ if (z1->io_bookmark.zb_level < z2->io_bookmark.zb_level)
+ return (-1);
+ if (z1->io_bookmark.zb_level > z2->io_bookmark.zb_level)
+ return (1);
+
+ if (z1->io_bookmark.zb_blkid < z2->io_bookmark.zb_blkid)
+ return (-1);
+ if (z1->io_bookmark.zb_blkid > z2->io_bookmark.zb_blkid)
+ return (1);
+
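+ /* Equal bookmarks: fall back to the zio addresses for a total order. */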
+ if (z1 < z2)
+ return (-1);
+ if (z1 > z2)
+ return (1);
+
+ return (0);
+}
+
+/*
+ * ==========================================================================
+ * Create the various types of I/O (read, write, free, etc)
+ * ==========================================================================
+ */
+static zio_t *
+zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
+ abd_t *data, uint64_t lsize, uint64_t psize, zio_done_func_t *done,
+ void *private, zio_type_t type, zio_priority_t priority,
+ enum zio_flag flags, vdev_t *vd, uint64_t offset,
+ const zbookmark_phys_t *zb, enum zio_stage stage,
+ enum zio_stage pipeline)
+{
+ zio_t *zio;
+
+ IMPLY(type != ZIO_TYPE_TRIM, psize <= SPA_MAXBLOCKSIZE);
+ ASSERT(P2PHASE(psize, SPA_MINBLOCKSIZE) == 0);
+ ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0);
+
+ ASSERT(!vd || spa_config_held(spa, SCL_STATE_ALL, RW_READER));
+ ASSERT(!bp || !(flags & ZIO_FLAG_CONFIG_WRITER));
+ ASSERT(vd || stage == ZIO_STAGE_OPEN);
+
+ IMPLY(lsize != psize, (flags & ZIO_FLAG_RAW_COMPRESS) != 0);
+
+ zio = kmem_cache_alloc(zio_cache, KM_SLEEP);
+ bzero(zio, sizeof (zio_t));
+
+ mutex_init(&zio->io_lock, NULL, MUTEX_NOLOCKDEP, NULL);
+ cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL);
+
+ list_create(&zio->io_parent_list, sizeof (zio_link_t),
+ offsetof(zio_link_t, zl_parent_node));
+ list_create(&zio->io_child_list, sizeof (zio_link_t),
+ offsetof(zio_link_t, zl_child_node));
+ metaslab_trace_init(&zio->io_alloc_list);
+
+ if (vd != NULL)
+ zio->io_child_type = ZIO_CHILD_VDEV;
+ else if (flags & ZIO_FLAG_GANG_CHILD)
+ zio->io_child_type = ZIO_CHILD_GANG;
+ else if (flags & ZIO_FLAG_DDT_CHILD)
+ zio->io_child_type = ZIO_CHILD_DDT;
+ else
+ zio->io_child_type = ZIO_CHILD_LOGICAL;
+
+ if (bp != NULL) {
+ zio->io_bp = (blkptr_t *)bp;
+ zio->io_bp_copy = *bp;
+ zio->io_bp_orig = *bp;
+ if (type != ZIO_TYPE_WRITE ||
+ zio->io_child_type == ZIO_CHILD_DDT)
+ zio->io_bp = &zio->io_bp_copy; /* so caller can free */
+ if (zio->io_child_type == ZIO_CHILD_LOGICAL)
+ zio->io_logical = zio;
+ if (zio->io_child_type > ZIO_CHILD_GANG && BP_IS_GANG(bp))
+ pipeline |= ZIO_GANG_STAGES;
+ }
+
+ zio->io_spa = spa;
+ zio->io_txg = txg;
+ zio->io_done = done;
+ zio->io_private = private;
+ zio->io_type = type;
+ zio->io_priority = priority;
+ zio->io_vd = vd;
+ zio->io_offset = offset;
+ zio->io_orig_abd = zio->io_abd = data;
+ zio->io_orig_size = zio->io_size = psize;
+ zio->io_lsize = lsize;
+ zio->io_orig_flags = zio->io_flags = flags;
+ zio->io_orig_stage = zio->io_stage = stage;
+ zio->io_orig_pipeline = zio->io_pipeline = pipeline;
+ zio->io_pipeline_trace = ZIO_STAGE_OPEN;
+
+ zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY);
+ zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE);
+
+ if (zb != NULL)
+ zio->io_bookmark = *zb;
+
+ if (pio != NULL) {
+ if (zio->io_metaslab_class == NULL)
+ zio->io_metaslab_class = pio->io_metaslab_class;
+ if (zio->io_logical == NULL)
+ zio->io_logical = pio->io_logical;
+ if (zio->io_child_type == ZIO_CHILD_GANG)
+ zio->io_gang_leader = pio->io_gang_leader;
+ zio_add_child(pio, zio);
+ }
+
+ taskq_init_ent(&zio->io_tqent);
+
+ return (zio);
+}
+
+static void
+zio_destroy(zio_t *zio)
+{
+ metaslab_trace_fini(&zio->io_alloc_list);
+ list_destroy(&zio->io_parent_list);
+ list_destroy(&zio->io_child_list);
+ mutex_destroy(&zio->io_lock);
+ cv_destroy(&zio->io_cv);
+ kmem_cache_free(zio_cache, zio);
+}
+
+zio_t *
+zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done,
+ void *private, enum zio_flag flags)
+{
+ zio_t *zio;
+
+ zio = zio_create(pio, spa, 0, NULL, NULL, 0, 0, done, private,
+ ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL,
+ ZIO_STAGE_OPEN, ZIO_INTERLOCK_PIPELINE);
+
+ return (zio);
+}
+
+zio_t *
+zio_root(spa_t *spa, zio_done_func_t *done, void *private, enum zio_flag flags)
+{
+ return (zio_null(NULL, spa, NULL, done, private, flags));
+}
+
+static int
+zfs_blkptr_verify_log(spa_t *spa, const blkptr_t *bp,
+ enum blk_verify_flag blk_verify, const char *fmt, ...)
+{
+ va_list adx;
+ char buf[256];
+
+ va_start(adx, fmt);
+ (void) vsnprintf(buf, sizeof (buf), fmt, adx);
+ va_end(adx);
+
+ switch (blk_verify) {
+ case BLK_VERIFY_HALT:
+ dprintf_bp(bp, "blkptr at %p dprintf_bp():", bp);
+ zfs_panic_recover("%s: %s", spa_name(spa), buf);
+ break;
+ case BLK_VERIFY_LOG:
+ zfs_dbgmsg("%s: %s", spa_name(spa), buf);
+ break;
+ case BLK_VERIFY_ONLY:
+ break;
+ }
+
+ return (1);
+}
+
+/*
+ * Verify that the block pointer fields contain reasonable values. This
+ * means the bp only contains known object types, checksum/compression
+ * identifiers, block sizes within the maximum allowed limits, valid DVAs,
+ * etc.
+ *
+ * If everything checks out B_TRUE is returned. The blk_verify argument
+ * controls the behavior when an invalid field is detected.
+ *
+ * Modes for zfs_blkptr_verify:
+ * 1) BLK_VERIFY_ONLY (evaluate the block)
+ * 2) BLK_VERIFY_LOG (evaluate the block and log problems)
+ * 3) BLK_VERIFY_HALT (call zfs_panic_recover on error)
+ */
+boolean_t
+zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp, boolean_t config_held,
+ enum blk_verify_flag blk_verify)
+{
+ int errors = 0;
+
+ if (!DMU_OT_IS_VALID(BP_GET_TYPE(bp))) {
+ errors += zfs_blkptr_verify_log(spa, bp, blk_verify,
+ "blkptr at %p has invalid TYPE %llu",
+ bp, (longlong_t)BP_GET_TYPE(bp));
+ }
+ if (BP_GET_CHECKSUM(bp) >= ZIO_CHECKSUM_FUNCTIONS ||
+ BP_GET_CHECKSUM(bp) <= ZIO_CHECKSUM_ON) {
+ errors += zfs_blkptr_verify_log(spa, bp, blk_verify,
+ "blkptr at %p has invalid CHECKSUM %llu",
+ bp, (longlong_t)BP_GET_CHECKSUM(bp));
+ }
+ if (BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_FUNCTIONS ||
+ BP_GET_COMPRESS(bp) <= ZIO_COMPRESS_ON) {
+ errors += zfs_blkptr_verify_log(spa, bp, blk_verify,
+ "blkptr at %p has invalid COMPRESS %llu",
+ bp, (longlong_t)BP_GET_COMPRESS(bp));
+ }
+ if (BP_GET_LSIZE(bp) > SPA_MAXBLOCKSIZE) {
+ errors += zfs_blkptr_verify_log(spa, bp, blk_verify,
+ "blkptr at %p has invalid LSIZE %llu",
+ bp, (longlong_t)BP_GET_LSIZE(bp));
+ }
+ if (BP_GET_PSIZE(bp) > SPA_MAXBLOCKSIZE) {
+ errors += zfs_blkptr_verify_log(spa, bp, blk_verify,
+ "blkptr at %p has invalid PSIZE %llu",
+ bp, (longlong_t)BP_GET_PSIZE(bp));
+ }
+
+ if (BP_IS_EMBEDDED(bp)) {
+ if (BPE_GET_ETYPE(bp) >= NUM_BP_EMBEDDED_TYPES) {
+ errors += zfs_blkptr_verify_log(spa, bp, blk_verify,
+ "blkptr at %p has invalid ETYPE %llu",
+ bp, (longlong_t)BPE_GET_ETYPE(bp));
+ }
+ }
+
+ /*
+ * Do not verify individual DVAs if the config is not trusted. This
+ * will be done once the zio is executed in vdev_mirror_map_alloc.
+ */
+ if (!spa->spa_trust_config)
+ return (B_TRUE);
+
+ if (!config_held)
+ spa_config_enter(spa, SCL_VDEV, bp, RW_READER);
+ else
+ ASSERT(spa_config_held(spa, SCL_VDEV, RW_WRITER));
+ /*
+ * Pool-specific checks.
+ *
+ * Note: it would be nice to verify that the blk_birth and
+ * BP_PHYSICAL_BIRTH() are not too large. However, spa_freeze()
+ * allows the birth time of log blocks (and dmu_sync()-ed blocks
+ * that are in the log) to be arbitrarily large.
+ */
+ for (int i = 0; i < BP_GET_NDVAS(bp); i++) {
+ uint64_t vdevid = DVA_GET_VDEV(&bp->blk_dva[i]);
+
+ if (vdevid >= spa->spa_root_vdev->vdev_children) {
+ errors += zfs_blkptr_verify_log(spa, bp, blk_verify,
+ "blkptr at %p DVA %u has invalid VDEV %llu",
+ bp, i, (longlong_t)vdevid);
+ continue;
+ }
+ vdev_t *vd = spa->spa_root_vdev->vdev_child[vdevid];
+ if (vd == NULL) {
+ errors += zfs_blkptr_verify_log(spa, bp, blk_verify,
+ "blkptr at %p DVA %u has invalid VDEV %llu",
+ bp, i, (longlong_t)vdevid);
+ continue;
+ }
+ if (vd->vdev_ops == &vdev_hole_ops) {
+ errors += zfs_blkptr_verify_log(spa, bp, blk_verify,
+ "blkptr at %p DVA %u has hole VDEV %llu",
+ bp, i, (longlong_t)vdevid);
+ continue;
+ }
+ if (vd->vdev_ops == &vdev_missing_ops) {
+ /*
+ * "missing" vdevs are valid during import, but we
+ * don't have their detailed info (e.g. asize), so
+ * we can't perform any more checks on them.
+ */
+ continue;
+ }
+ uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]);
+ uint64_t asize = DVA_GET_ASIZE(&bp->blk_dva[i]);
+ if (BP_IS_GANG(bp))
+ asize = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
+ if (offset + asize > vd->vdev_asize) {
+ errors += zfs_blkptr_verify_log(spa, bp, blk_verify,
+ "blkptr at %p DVA %u has invalid OFFSET %llu",
+ bp, i, (longlong_t)offset);
+ }
+ }
+ if (errors > 0)
+ dprintf_bp(bp, "blkptr at %p dprintf_bp():", bp);
+ if (!config_held)
+ spa_config_exit(spa, SCL_VDEV, bp);
+
+ return (errors == 0);
+}
+
+boolean_t
+zfs_dva_valid(spa_t *spa, const dva_t *dva, const blkptr_t *bp)
+{
+ uint64_t vdevid = DVA_GET_VDEV(dva);
+
+ if (vdevid >= spa->spa_root_vdev->vdev_children)
+ return (B_FALSE);
+
+ vdev_t *vd = spa->spa_root_vdev->vdev_child[vdevid];
+ if (vd == NULL)
+ return (B_FALSE);
+
+ if (vd->vdev_ops == &vdev_hole_ops)
+ return (B_FALSE);
+
+ if (vd->vdev_ops == &vdev_missing_ops) {
+ return (B_FALSE);
+ }
+
+ uint64_t offset = DVA_GET_OFFSET(dva);
+ uint64_t asize = DVA_GET_ASIZE(dva);
+
+ if (BP_IS_GANG(bp))
+ asize = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
+ if (offset + asize > vd->vdev_asize)
+ return (B_FALSE);
+
+ return (B_TRUE);
+}
+
+zio_t *
+zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
+ abd_t *data, uint64_t size, zio_done_func_t *done, void *private,
+ zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb)
+{
+ zio_t *zio;
+
+ (void) zfs_blkptr_verify(spa, bp, flags & ZIO_FLAG_CONFIG_WRITER,
+ BLK_VERIFY_HALT);
+
+ zio = zio_create(pio, spa, BP_PHYSICAL_BIRTH(bp), bp,
+ data, size, size, done, private,
+ ZIO_TYPE_READ, priority, flags, NULL, 0, zb,
+ ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
+ ZIO_DDT_CHILD_READ_PIPELINE : ZIO_READ_PIPELINE);
+
+ return (zio);
+}
+
+zio_t *
+zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
+ abd_t *data, uint64_t lsize, uint64_t psize, const zio_prop_t *zp,
+ zio_done_func_t *ready, zio_done_func_t *children_ready,
+ zio_done_func_t *physdone, zio_done_func_t *done,
+ void *private, zio_priority_t priority, enum zio_flag flags,
+ const zbookmark_phys_t *zb)
+{
+ zio_t *zio;
+
+ ASSERT(zp->zp_checksum >= ZIO_CHECKSUM_OFF &&
+ zp->zp_checksum < ZIO_CHECKSUM_FUNCTIONS &&
+ zp->zp_compress >= ZIO_COMPRESS_OFF &&
+ zp->zp_compress < ZIO_COMPRESS_FUNCTIONS &&
+ DMU_OT_IS_VALID(zp->zp_type) &&
+ zp->zp_level < 32 &&
+ zp->zp_copies > 0 &&
+ zp->zp_copies <= spa_max_replication(spa));
+
+ zio = zio_create(pio, spa, txg, bp, data, lsize, psize, done, private,
+ ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
+ ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
+ ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE);
+
+ zio->io_ready = ready;
+ zio->io_children_ready = children_ready;
+ zio->io_physdone = physdone;
+ zio->io_prop = *zp;
+
+ /*
+ * Data can be NULL if we are going to call zio_write_override() to
+ * provide the already-allocated BP. But we may need the data to
+ * verify a dedup hit (if requested). In this case, don't try to
+ * dedup (just take the already-allocated BP verbatim). Encrypted
+ * dedup blocks need data as well so we also disable dedup in this
+ * case.
+ */
+ if (data == NULL &&
+ (zio->io_prop.zp_dedup_verify || zio->io_prop.zp_encrypt)) {
+ zio->io_prop.zp_dedup = zio->io_prop.zp_dedup_verify = B_FALSE;
+ }
+
+ return (zio);
+}
+
+zio_t *
+zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, abd_t *data,
+ uint64_t size, zio_done_func_t *done, void *private,
+ zio_priority_t priority, enum zio_flag flags, zbookmark_phys_t *zb)
+{
+ zio_t *zio;
+
+ zio = zio_create(pio, spa, txg, bp, data, size, size, done, private,
+ ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_IO_REWRITE, NULL, 0, zb,
+ ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE);
+
+ return (zio);
+}
+
+void
+zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite)
+{
+ ASSERT(zio->io_type == ZIO_TYPE_WRITE);
+ ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
+ ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
+ ASSERT(zio->io_txg == spa_syncing_txg(zio->io_spa));
+
+ /*
+ * We must reset the io_prop to match the values that existed
+ * when the bp was first written by dmu_sync(), keeping in mind
+ * that nopwrite and dedup are mutually exclusive.
+ */
+ zio->io_prop.zp_dedup = nopwrite ? B_FALSE : zio->io_prop.zp_dedup;
+ zio->io_prop.zp_nopwrite = nopwrite;
+ zio->io_prop.zp_copies = copies;
+ zio->io_bp_override = bp;
+}
+
+void
+zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp)
+{
+
+ (void) zfs_blkptr_verify(spa, bp, B_FALSE, BLK_VERIFY_HALT);
+
+ /*
+ * The check for EMBEDDED is a performance optimization. We
+ * process the free here (by ignoring it) rather than
+ * putting it on the list and then processing it in zio_free_sync().
+ */
+ if (BP_IS_EMBEDDED(bp))
+ return;
+ metaslab_check_free(spa, bp);
+
+ /*
+ * Frees that are for the currently-syncing txg, are not going to be
+ * deferred, and will not need to do a read (i.e. not GANG or
+ * DEDUP), can be processed immediately. Otherwise, put them on the
+ * in-memory list for later processing.
+ *
+ * Note that we only defer frees after zfs_sync_pass_deferred_free
+ * when the log space map feature is disabled. [see relevant comment
+ * in spa_sync_iterate_to_convergence()]
+ */
+ if (BP_IS_GANG(bp) ||
+ BP_GET_DEDUP(bp) ||
+ txg != spa->spa_syncing_txg ||
+ (spa_sync_pass(spa) >= zfs_sync_pass_deferred_free &&
+ !spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP))) {
+ bplist_append(&spa->spa_free_bplist[txg & TXG_MASK], bp);
+ } else {
+ VERIFY3P(zio_free_sync(NULL, spa, txg, bp, 0), ==, NULL);
+ }
+}
+
+/*
+ * To improve performance, this function may return NULL if we were able
+ * to do the free immediately. This avoids the cost of creating a zio
+ * (and linking it to the parent, etc).
+ */
+zio_t *
+zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
+ enum zio_flag flags)
+{
+ ASSERT(!BP_IS_HOLE(bp));
+ ASSERT(spa_syncing_txg(spa) == txg);
+
+ if (BP_IS_EMBEDDED(bp))
+ return (NULL);
+
+ metaslab_check_free(spa, bp);
+ arc_freed(spa, bp);
+ dsl_scan_freed(spa, bp);
+
+ if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp)) {
+ /*
+ * GANG and DEDUP blocks can induce a read (for the gang block
+ * header, or the DDT), so issue them asynchronously so that
+ * this thread is not tied up.
+ */
+ enum zio_stage stage =
+ ZIO_FREE_PIPELINE | ZIO_STAGE_ISSUE_ASYNC;
+
+ return (zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
+ BP_GET_PSIZE(bp), NULL, NULL,
+ ZIO_TYPE_FREE, ZIO_PRIORITY_NOW,
+ flags, NULL, 0, NULL, ZIO_STAGE_OPEN, stage));
+ } else {
+ metaslab_free(spa, bp, txg, B_FALSE);
+ return (NULL);
+ }
+}
+
+zio_t *
+zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
+ zio_done_func_t *done, void *private, enum zio_flag flags)
+{
+ zio_t *zio;
+
+ (void) zfs_blkptr_verify(spa, bp, flags & ZIO_FLAG_CONFIG_WRITER,
+ BLK_VERIFY_HALT);
+
+ if (BP_IS_EMBEDDED(bp))
+ return (zio_null(pio, spa, NULL, NULL, NULL, 0));
+
+ /*
+ * A claim is an allocation of a specific block. Claims are needed
+ * to support immediate writes in the intent log. The issue is that
+ * immediate writes contain committed data, but in a txg that was
+ * *not* committed. Upon opening the pool after an unclean shutdown,
+ * the intent log claims all blocks that contain immediate write data
+ * so that the SPA knows they're in use.
+ *
+ * All claims *must* be resolved in the first txg -- before the SPA
+ * starts allocating blocks -- so that nothing is allocated twice.
+ * If txg == 0 we just verify that the block is claimable.
+ */
+ ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <,
+ spa_min_claim_txg(spa));
+ ASSERT(txg == spa_min_claim_txg(spa) || txg == 0);
+ ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa)); /* zdb(8) */
+
+ zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
+ BP_GET_PSIZE(bp), done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW,
+ flags, NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE);
+ ASSERT0(zio->io_queued_timestamp);
+
+ return (zio);
+}
+
+zio_t *
+zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
+ zio_done_func_t *done, void *private, enum zio_flag flags)
+{
+ zio_t *zio;
+ int c;
+
+ if (vd->vdev_children == 0) {
+ zio = zio_create(pio, spa, 0, NULL, NULL, 0, 0, done, private,
+ ZIO_TYPE_IOCTL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL,
+ ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE);
+
+ zio->io_cmd = cmd;
+ } else {
+ zio = zio_null(pio, spa, NULL, NULL, NULL, flags);
+
+ for (c = 0; c < vd->vdev_children; c++)
+ zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd,
+ done, private, flags));
+ }
+
+ return (zio);
+}
+
+zio_t *
+zio_trim(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
+ zio_done_func_t *done, void *private, zio_priority_t priority,
+ enum zio_flag flags, enum trim_flag trim_flags)
+{
+ zio_t *zio;
+
+ ASSERT0(vd->vdev_children);
+ ASSERT0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
+ ASSERT0(P2PHASE(size, 1ULL << vd->vdev_ashift));
+ ASSERT3U(size, !=, 0);
+
+ zio = zio_create(pio, vd->vdev_spa, 0, NULL, NULL, size, size, done,
+ private, ZIO_TYPE_TRIM, priority, flags | ZIO_FLAG_PHYSICAL,
+ vd, offset, NULL, ZIO_STAGE_OPEN, ZIO_TRIM_PIPELINE);
+ zio->io_trim_flags = trim_flags;
+
+ return (zio);
+}
+
+zio_t *
+zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
+ abd_t *data, int checksum, zio_done_func_t *done, void *private,
+ zio_priority_t priority, enum zio_flag flags, boolean_t labels)
+{
+ zio_t *zio;
+
+ ASSERT(vd->vdev_children == 0);
+ ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
+ offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
+ ASSERT3U(offset + size, <=, vd->vdev_psize);
+
+ zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, size, done,
+ private, ZIO_TYPE_READ, priority, flags | ZIO_FLAG_PHYSICAL, vd,
+ offset, NULL, ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE);
+
+ zio->io_prop.zp_checksum = checksum;
+
+ return (zio);
+}
+
+zio_t *
+zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
+ abd_t *data, int checksum, zio_done_func_t *done, void *private,
+ zio_priority_t priority, enum zio_flag flags, boolean_t labels)
+{
+ zio_t *zio;
+
+ ASSERT(vd->vdev_children == 0);
+ ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
+ offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
+ ASSERT3U(offset + size, <=, vd->vdev_psize);
+
+ zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, size, done,
+ private, ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_PHYSICAL, vd,
+ offset, NULL, ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE);
+
+ zio->io_prop.zp_checksum = checksum;
+
+ if (zio_checksum_table[checksum].ci_flags & ZCHECKSUM_FLAG_EMBEDDED) {
+ /*
+ * zec checksums are necessarily destructive -- they modify
+ * the end of the write buffer to hold the verifier/checksum.
+ * Therefore, we must make a local copy in case the data is
+ * being written to multiple places in parallel.
+ */
+ abd_t *wbuf = abd_alloc_sametype(data, size);
+ abd_copy(wbuf, data, size);
+
+ zio_push_transform(zio, wbuf, size, size, NULL);
+ }
+
+ return (zio);
+}
+
+/*
+ * Create a child I/O to do some work for us.
+ */
+zio_t *
+zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
+ abd_t *data, uint64_t size, int type, zio_priority_t priority,
+ enum zio_flag flags, zio_done_func_t *done, void *private)
+{
+ enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE;
+ zio_t *zio;
+
+ /*
+ * vdev child I/Os do not propagate their error to the parent.
+ * Therefore, for correct operation the caller *must* check for
+ * and handle the error in the child i/o's done callback.
+ * The only exceptions are i/os that we don't care about
+ * (OPTIONAL or REPAIR).
+ */
+ ASSERT((flags & ZIO_FLAG_OPTIONAL) || (flags & ZIO_FLAG_IO_REPAIR) ||
+ done != NULL);
+
+ if (type == ZIO_TYPE_READ && bp != NULL) {
+ /*
+ * If we have the bp, then the child should perform the
+ * checksum and the parent need not. This pushes error
+ * detection as close to the leaves as possible and
+ * eliminates redundant checksums in the interior nodes.
+ */
+ pipeline |= ZIO_STAGE_CHECKSUM_VERIFY;
+ pio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
+ }
+
+ if (vd->vdev_ops->vdev_op_leaf) {
+ ASSERT0(vd->vdev_children);
+ offset += VDEV_LABEL_START_SIZE;
+ }
+
+ flags |= ZIO_VDEV_CHILD_FLAGS(pio);
+
+ /*
+ * If we've decided to do a repair, the write is not speculative --
+ * even if the original read was.
+ */
+ if (flags & ZIO_FLAG_IO_REPAIR)
+ flags &= ~ZIO_FLAG_SPECULATIVE;
+
+ /*
+ * If we're creating a child I/O that is not associated with a
+ * top-level vdev, then the child zio is not an allocating I/O.
+ * If this is a retried I/O then we ignore it since we will
+ * have already processed the original allocating I/O.
+ */
+ if (flags & ZIO_FLAG_IO_ALLOCATING &&
+ (vd != vd->vdev_top || (flags & ZIO_FLAG_IO_RETRY))) {
+ ASSERT(pio->io_metaslab_class != NULL);
+ ASSERT(pio->io_metaslab_class->mc_alloc_throttle_enabled);
+ ASSERT(type == ZIO_TYPE_WRITE);
+ ASSERT(priority == ZIO_PRIORITY_ASYNC_WRITE);
+ ASSERT(!(flags & ZIO_FLAG_IO_REPAIR));
+ ASSERT(!(pio->io_flags & ZIO_FLAG_IO_REWRITE) ||
+ pio->io_child_type == ZIO_CHILD_GANG);
+
+ flags &= ~ZIO_FLAG_IO_ALLOCATING;
+ }
+
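+ /*
+ * Start the child one stage before VDEV_IO_START so that the
+ * first pipeline stage it executes is the vdev I/O start stage.
+ */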
+ zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size, size,
+ done, private, type, priority, flags, vd, offset, &pio->io_bookmark,
+ ZIO_STAGE_VDEV_IO_START >> 1, pipeline);
+ ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV);
+
+ zio->io_physdone = pio->io_physdone;
+ if (vd->vdev_ops->vdev_op_leaf && zio->io_logical != NULL)
+ zio->io_logical->io_phys_children++;
+
+ return (zio);
+}
+
+zio_t *
+zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, abd_t *data, uint64_t size,
+ zio_type_t type, zio_priority_t priority, enum zio_flag flags,
+ zio_done_func_t *done, void *private)
+{
+ zio_t *zio;
+
+ ASSERT(vd->vdev_ops->vdev_op_leaf);
+
+ zio = zio_create(NULL, vd->vdev_spa, 0, NULL,
+ data, size, size, done, private, type, priority,
+ flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_DELEGATED,
+ vd, offset, NULL,
+ ZIO_STAGE_VDEV_IO_START >> 1, ZIO_VDEV_CHILD_PIPELINE);
+
+ return (zio);
+}
+
+void
+zio_flush(zio_t *zio, vdev_t *vd)
+{
+ zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE,
+ NULL, NULL,
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY));
+}
+
+void
+zio_shrink(zio_t *zio, uint64_t size)
+{
+ ASSERT3P(zio->io_executor, ==, NULL);
+ ASSERT3U(zio->io_orig_size, ==, zio->io_size);
+ ASSERT3U(size, <=, zio->io_size);
+
+ /*
+ * We don't shrink for raidz because of problems with the
+ * reconstruction when reading back less than the block size.
+ * Note, BP_IS_RAIDZ() assumes no compression.
+ */
+ ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF);
+ if (!BP_IS_RAIDZ(zio->io_bp)) {
+ /* we are not doing a raw write */
+ ASSERT3U(zio->io_size, ==, zio->io_lsize);
+ zio->io_orig_size = zio->io_size = zio->io_lsize = size;
+ }
+}
+
+/*
+ * ==========================================================================
+ * Prepare to read and write logical blocks
+ * ==========================================================================
+ */
+
+static zio_t *
+zio_read_bp_init(zio_t *zio)
+{
+ blkptr_t *bp = zio->io_bp;
+ uint64_t psize =
+ BP_IS_EMBEDDED(bp) ? BPE_GET_PSIZE(bp) : BP_GET_PSIZE(bp);
+
+ ASSERT3P(zio->io_bp, ==, &zio->io_bp_copy);
+
+ if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF &&
+ zio->io_child_type == ZIO_CHILD_LOGICAL &&
+ !(zio->io_flags & ZIO_FLAG_RAW_COMPRESS)) {
+ zio_push_transform(zio, abd_alloc_sametype(zio->io_abd, psize),
+ psize, psize, zio_decompress);
+ }
+
+ if (((BP_IS_PROTECTED(bp) && !(zio->io_flags & ZIO_FLAG_RAW_ENCRYPT)) ||
+ BP_HAS_INDIRECT_MAC_CKSUM(bp)) &&
+ zio->io_child_type == ZIO_CHILD_LOGICAL) {
+ zio_push_transform(zio, abd_alloc_sametype(zio->io_abd, psize),
+ psize, psize, zio_decrypt);
+ }
+
+ if (BP_IS_EMBEDDED(bp) && BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA) {
+ int psize = BPE_GET_PSIZE(bp);
+ void *data = abd_borrow_buf(zio->io_abd, psize);
+
+ zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
+ decode_embedded_bp_compressed(bp, data);
+ abd_return_buf_copy(zio->io_abd, data, psize);
+ } else {
+ ASSERT(!BP_IS_EMBEDDED(bp));
+ ASSERT3P(zio->io_bp, ==, &zio->io_bp_copy);
+ }
+
+ if (!DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) && BP_GET_LEVEL(bp) == 0)
+ zio->io_flags |= ZIO_FLAG_DONT_CACHE;
+
+ if (BP_GET_TYPE(bp) == DMU_OT_DDT_ZAP)
+ zio->io_flags |= ZIO_FLAG_DONT_CACHE;
+
+ if (BP_GET_DEDUP(bp) && zio->io_child_type == ZIO_CHILD_LOGICAL)
+ zio->io_pipeline = ZIO_DDT_READ_PIPELINE;
+
+ return (zio);
+}
+
+static zio_t *
+zio_write_bp_init(zio_t *zio)
+{
+ if (!IO_IS_ALLOCATING(zio))
+ return (zio);
+
+ ASSERT(zio->io_child_type != ZIO_CHILD_DDT);
+
+ if (zio->io_bp_override) {
+ blkptr_t *bp = zio->io_bp;
+ zio_prop_t *zp = &zio->io_prop;
+
+ ASSERT(bp->blk_birth != zio->io_txg);
+ ASSERT(BP_GET_DEDUP(zio->io_bp_override) == 0);
+
+ *bp = *zio->io_bp_override;
+ zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
+
+ if (BP_IS_EMBEDDED(bp))
+ return (zio);
+
+ /*
+ * If we've been overridden and nopwrite is set then
+ * set the flag accordingly to indicate that a nopwrite
+ * has already occurred.
+ */
+ if (!BP_IS_HOLE(bp) && zp->zp_nopwrite) {
+ ASSERT(!zp->zp_dedup);
+ ASSERT3U(BP_GET_CHECKSUM(bp), ==, zp->zp_checksum);
+ zio->io_flags |= ZIO_FLAG_NOPWRITE;
+ return (zio);
+ }
+
+ ASSERT(!zp->zp_nopwrite);
+
+ if (BP_IS_HOLE(bp) || !zp->zp_dedup)
+ return (zio);
+
+ ASSERT((zio_checksum_table[zp->zp_checksum].ci_flags &
+ ZCHECKSUM_FLAG_DEDUP) || zp->zp_dedup_verify);
+
+ if (BP_GET_CHECKSUM(bp) == zp->zp_checksum &&
+ !zp->zp_encrypt) {
+ BP_SET_DEDUP(bp, 1);
+ zio->io_pipeline |= ZIO_STAGE_DDT_WRITE;
+ return (zio);
+ }
+
+ /*
+ * We were unable to handle this as an override bp, treat
+ * it as a regular write I/O.
+ */
+ zio->io_bp_override = NULL;
+ *bp = zio->io_bp_orig;
+ zio->io_pipeline = zio->io_orig_pipeline;
+ }
+
+ return (zio);
+}
+
+static zio_t *
+zio_write_compress(zio_t *zio)
+{
+ spa_t *spa = zio->io_spa;
+ zio_prop_t *zp = &zio->io_prop;
+ enum zio_compress compress = zp->zp_compress;
+ blkptr_t *bp = zio->io_bp;
+ uint64_t lsize = zio->io_lsize;
+ uint64_t psize = zio->io_size;
+ int pass = 1;
+
+ /*
+ * If our children haven't all reached the ready stage,
+ * wait for them and then repeat this pipeline stage.
+ */
+ if (zio_wait_for_children(zio, ZIO_CHILD_LOGICAL_BIT |
+ ZIO_CHILD_GANG_BIT, ZIO_WAIT_READY)) {
+ return (NULL);
+ }
+
+ if (!IO_IS_ALLOCATING(zio))
+ return (zio);
+
+ if (zio->io_children_ready != NULL) {
+ /*
+ * Now that all our children are ready, run the callback
+ * associated with this zio in case it wants to modify the
+ * data to be written.
+ */
+ ASSERT3U(zp->zp_level, >, 0);
+ zio->io_children_ready(zio);
+ }
+
+ ASSERT(zio->io_child_type != ZIO_CHILD_DDT);
+ ASSERT(zio->io_bp_override == NULL);
+
+ if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg) {
+ /*
+ * We're rewriting an existing block, which means we're
+ * working on behalf of spa_sync(). For spa_sync() to
+ * converge, it must eventually be the case that we don't
+ * have to allocate new blocks. But compression changes
+ * the blocksize, which forces a reallocate, and makes
+ * convergence take longer. Therefore, after the first
+ * few passes, stop compressing to ensure convergence.
+ */
+ pass = spa_sync_pass(spa);
+
+ ASSERT(zio->io_txg == spa_syncing_txg(spa));
+ ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
+ ASSERT(!BP_GET_DEDUP(bp));
+
+ if (pass >= zfs_sync_pass_dont_compress)
+ compress = ZIO_COMPRESS_OFF;
+
+ /* Make sure someone doesn't change their mind on overwrites */
+ ASSERT(BP_IS_EMBEDDED(bp) || MIN(zp->zp_copies + BP_IS_GANG(bp),
+ spa_max_replication(spa)) == BP_GET_NDVAS(bp));
+ }
+
+ /* If it's a compressed write that is not raw, compress the buffer. */
+ if (compress != ZIO_COMPRESS_OFF &&
+ !(zio->io_flags & ZIO_FLAG_RAW_COMPRESS)) {
+ void *cbuf = zio_buf_alloc(lsize);
+ psize = zio_compress_data(compress, zio->io_abd, cbuf, lsize,
+ zp->zp_complevel);
+ if (psize == 0 || psize >= lsize) {
+ compress = ZIO_COMPRESS_OFF;
+ zio_buf_free(cbuf, lsize);
+ } else if (!zp->zp_dedup && !zp->zp_encrypt &&
+ psize <= BPE_PAYLOAD_SIZE &&
+ zp->zp_level == 0 && !DMU_OT_HAS_FILL(zp->zp_type) &&
+ spa_feature_is_enabled(spa, SPA_FEATURE_EMBEDDED_DATA)) {
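+ /*
+ * The compressed payload is small enough to be stored
+ * directly in the block pointer (embedded_data).
+ */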
+ encode_embedded_bp_compressed(bp,
+ cbuf, compress, lsize, psize);
+ BPE_SET_ETYPE(bp, BP_EMBEDDED_TYPE_DATA);
+ BP_SET_TYPE(bp, zio->io_prop.zp_type);
+ BP_SET_LEVEL(bp, zio->io_prop.zp_level);
+ zio_buf_free(cbuf, lsize);
+ bp->blk_birth = zio->io_txg;
+ zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
+ ASSERT(spa_feature_is_active(spa,
+ SPA_FEATURE_EMBEDDED_DATA));
+ return (zio);
+ } else {
+ /*
+ * Round compressed size up to the minimum allocation
+ * size of the smallest-ashift device, and zero the
+ * tail. This ensures that the compressed size of the
+ * BP (and thus compressratio property) are correct,
+ * in that we charge for the padding used to fill out
+ * the last sector.
+ */
+ ASSERT3U(spa->spa_min_alloc, >=, SPA_MINBLOCKSHIFT);
+ size_t rounded = (size_t)roundup(psize,
+ spa->spa_min_alloc);
+ if (rounded >= lsize) {
+ compress = ZIO_COMPRESS_OFF;
+ zio_buf_free(cbuf, lsize);
+ psize = lsize;
+ } else {
+ abd_t *cdata = abd_get_from_buf(cbuf, lsize);
+ abd_take_ownership_of_buf(cdata, B_TRUE);
+ abd_zero_off(cdata, psize, rounded - psize);
+ psize = rounded;
+ zio_push_transform(zio, cdata,
+ psize, lsize, NULL);
+ }
+ }
+
+ /*
+ * We were unable to handle this as an override bp, treat
+ * it as a regular write I/O.
+ */
+ zio->io_bp_override = NULL;
+ *bp = zio->io_bp_orig;
+ zio->io_pipeline = zio->io_orig_pipeline;
+
+ } else if ((zio->io_flags & ZIO_FLAG_RAW_ENCRYPT) != 0 &&
+ zp->zp_type == DMU_OT_DNODE) {
+ /*
+ * The DMU actually relies on the zio layer's compression
+ * to free metadnode blocks that have had all contained
+ * dnodes freed. As a result, even when doing a raw
+ * receive, we must check whether the block can be compressed
+ * to a hole.
+ */
+ psize = zio_compress_data(ZIO_COMPRESS_EMPTY,
+ zio->io_abd, NULL, lsize, zp->zp_complevel);
+ if (psize == 0 || psize >= lsize)
+ compress = ZIO_COMPRESS_OFF;
+ } else {
+ ASSERT3U(psize, !=, 0);
+ }
+
+ /*
+ * The final pass of spa_sync() must be all rewrites, but the first
+ * few passes offer a trade-off: allocating blocks defers convergence,
+ * but newly allocated blocks are sequential, so they can be written
+ * to disk faster. Therefore, we allow the first few passes of
+ * spa_sync() to allocate new blocks, but force rewrites after that.
+ * There should only be a handful of blocks after pass 1 in any case.
+ */
+ if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg &&
+ BP_GET_PSIZE(bp) == psize &&
+ pass >= zfs_sync_pass_rewrite) {
+ VERIFY3U(psize, !=, 0);
+ enum zio_stage gang_stages = zio->io_pipeline & ZIO_GANG_STAGES;
+
+ zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages;
+ zio->io_flags |= ZIO_FLAG_IO_REWRITE;
+ } else {
+ BP_ZERO(bp);
+ zio->io_pipeline = ZIO_WRITE_PIPELINE;
+ }
+
+ if (psize == 0) {
+ if (zio->io_bp_orig.blk_birth != 0 &&
+ spa_feature_is_active(spa, SPA_FEATURE_HOLE_BIRTH)) {
+ BP_SET_LSIZE(bp, lsize);
+ BP_SET_TYPE(bp, zp->zp_type);
+ BP_SET_LEVEL(bp, zp->zp_level);
+ BP_SET_BIRTH(bp, zio->io_txg, 0);
+ }
+ zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
+ } else {
+ ASSERT(zp->zp_checksum != ZIO_CHECKSUM_GANG_HEADER);
+ BP_SET_LSIZE(bp, lsize);
+ BP_SET_TYPE(bp, zp->zp_type);
+ BP_SET_LEVEL(bp, zp->zp_level);
+ BP_SET_PSIZE(bp, psize);
+ BP_SET_COMPRESS(bp, compress);
+ BP_SET_CHECKSUM(bp, zp->zp_checksum);
+ BP_SET_DEDUP(bp, zp->zp_dedup);
+ BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
+ if (zp->zp_dedup) {
+ ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
+ ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
+ ASSERT(!zp->zp_encrypt ||
+ DMU_OT_IS_ENCRYPTED(zp->zp_type));
+ zio->io_pipeline = ZIO_DDT_WRITE_PIPELINE;
+ }
+ if (zp->zp_nopwrite) {
+ ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
+ ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
+ zio->io_pipeline |= ZIO_STAGE_NOP_WRITE;
+ }
+ }
+ return (zio);
+}
+
+static zio_t *
+zio_free_bp_init(zio_t *zio)
+{
+ blkptr_t *bp = zio->io_bp;
+
+ if (zio->io_child_type == ZIO_CHILD_LOGICAL) {
+ if (BP_GET_DEDUP(bp))
+ zio->io_pipeline = ZIO_DDT_FREE_PIPELINE;
+ }
+
+ ASSERT3P(zio->io_bp, ==, &zio->io_bp_copy);
+
+ return (zio);
+}
+
+/*
+ * ==========================================================================
+ * Execute the I/O pipeline
+ * ==========================================================================
+ */
+
+static void
+zio_taskq_dispatch(zio_t *zio, zio_taskq_type_t q, boolean_t cutinline)
+{
+ spa_t *spa = zio->io_spa;
+ zio_type_t t = zio->io_type;
+ int flags = (cutinline ? TQ_FRONT : 0);
+
+ /*
+ * If we're a config writer or a probe, the normal issue and
+ * interrupt threads may all be blocked waiting for the config lock.
+ * In this case, select the otherwise-unused taskq for ZIO_TYPE_NULL.
+ */
+ if (zio->io_flags & (ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_PROBE))
+ t = ZIO_TYPE_NULL;
+
+ /*
+ * A similar issue exists for the L2ARC write thread until L2ARC 2.0.
+ */
+ if (t == ZIO_TYPE_WRITE && zio->io_vd && zio->io_vd->vdev_aux)
+ t = ZIO_TYPE_NULL;
+
+ /*
+ * If this is a high priority I/O, then use the high priority taskq if
+ * available.
+ */
+ if ((zio->io_priority == ZIO_PRIORITY_NOW ||
+ zio->io_priority == ZIO_PRIORITY_SYNC_WRITE) &&
+ spa->spa_zio_taskq[t][q + 1].stqs_count != 0)
+ q++;
+
+ ASSERT3U(q, <, ZIO_TASKQ_TYPES);
+
+ /*
+ * NB: We are assuming that the zio can only be dispatched
+ * to a single taskq at a time. It would be a grievous error
+ * to dispatch the zio to another taskq at the same time.
+ */
+ ASSERT(taskq_empty_ent(&zio->io_tqent));
+ spa_taskq_dispatch_ent(spa, t, q, (task_func_t *)zio_execute, zio,
+ flags, &zio->io_tqent);
+}
+
+static boolean_t
+zio_taskq_member(zio_t *zio, zio_taskq_type_t q)
+{
+ spa_t *spa = zio->io_spa;
+
+ taskq_t *tq = taskq_of_curthread();
+
+ for (zio_type_t t = 0; t < ZIO_TYPES; t++) {
+ spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
+ uint_t i;
+ for (i = 0; i < tqs->stqs_count; i++) {
+ if (tqs->stqs_taskq[i] == tq)
+ return (B_TRUE);
+ }
+ }
+
+ return (B_FALSE);
+}
+
+static zio_t *
+zio_issue_async(zio_t *zio)
+{
+ zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);
+
+ return (NULL);
+}
+
+void
+zio_interrupt(zio_t *zio)
+{
+ zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT, B_FALSE);
+}
+
+void
+zio_delay_interrupt(zio_t *zio)
+{
+ /*
+ * The timeout_generic() function isn't defined in userspace, so
+ * rather than trying to implement the function, the zio delay
+ * functionality has been disabled for userspace builds.
+ */
+
+#ifdef _KERNEL
+ /*
+ * If io_target_timestamp is zero, then no delay has been registered
+ * for this IO, so jump to the end of this function and "skip" the
+ * delay, issuing it directly to the zio layer.
+ */
+ if (zio->io_target_timestamp != 0) {
+ hrtime_t now = gethrtime();
+
+ if (now >= zio->io_target_timestamp) {
+ /*
+ * This IO has already taken longer than the target
+ * delay to complete, so we don't want to delay it
+ * any longer; we "miss" the delay and issue it
+ * directly to the zio layer. This is likely due to
+ * the target latency being set to a value less than
+ * the underlying hardware can satisfy (e.g. delay
+ * set to 1ms, but the disks take 10ms to complete an
+ * IO request).
+ */
+
+ DTRACE_PROBE2(zio__delay__miss, zio_t *, zio,
+ hrtime_t, now);
+
+ zio_interrupt(zio);
+ } else {
+ taskqid_t tid;
+ hrtime_t diff = zio->io_target_timestamp - now;
+ clock_t expire_at_tick = ddi_get_lbolt() +
+ NSEC_TO_TICK(diff);
+
+ DTRACE_PROBE3(zio__delay__hit, zio_t *, zio,
+ hrtime_t, now, hrtime_t, diff);
+
+ if (NSEC_TO_TICK(diff) == 0) {
+ /* Our delay is less than a jiffy - just spin */
+ zfs_sleep_until(zio->io_target_timestamp);
+ zio_interrupt(zio);
+ } else {
+ /*
+ * Use taskq_dispatch_delay() in place of
+ * OpenZFS's timeout_generic().
+ */
+ tid = taskq_dispatch_delay(system_taskq,
+ (task_func_t *)zio_interrupt,
+ zio, TQ_NOSLEEP, expire_at_tick);
+ if (tid == TASKQID_INVALID) {
+ /*
+ * Couldn't allocate a task. Just
+ * finish the zio without a delay.
+ */
+ zio_interrupt(zio);
+ }
+ }
+ }
+ return;
+ }
+#endif
+ DTRACE_PROBE1(zio__delay__skip, zio_t *, zio);
+ zio_interrupt(zio);
+}
+
+static void
+zio_deadman_impl(zio_t *pio, int ziodepth)
+{
+ zio_t *cio, *cio_next;
+ zio_link_t *zl = NULL;
+ vdev_t *vd = pio->io_vd;
+
+ if (zio_deadman_log_all || (vd != NULL && vd->vdev_ops->vdev_op_leaf)) {
+ vdev_queue_t *vq = vd ? &vd->vdev_queue : NULL;
+ zbookmark_phys_t *zb = &pio->io_bookmark;
+ uint64_t delta = gethrtime() - pio->io_timestamp;
+ uint64_t failmode = spa_get_deadman_failmode(pio->io_spa);
+
+ zfs_dbgmsg("slow zio[%d]: zio=%px timestamp=%llu "
+ "delta=%llu queued=%llu io=%llu "
+ "path=%s last=%llu "
+ "type=%d priority=%d flags=0x%x "
+ "stage=0x%x pipeline=0x%x pipeline-trace=0x%x "
+ "objset=%llu object=%llu level=%llu blkid=%llu "
+ "offset=%llu size=%llu error=%d",
+ ziodepth, pio, pio->io_timestamp,
+ delta, pio->io_delta, pio->io_delay,
+ vd ? vd->vdev_path : "NULL", vq ? vq->vq_io_complete_ts : 0,
+ pio->io_type, pio->io_priority, pio->io_flags,
+ pio->io_stage, pio->io_pipeline, pio->io_pipeline_trace,
+ zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid,
+ pio->io_offset, pio->io_size, pio->io_error);
+ (void) zfs_ereport_post(FM_EREPORT_ZFS_DEADMAN,
+ pio->io_spa, vd, zb, pio, 0);
+
+ if (failmode == ZIO_FAILURE_MODE_CONTINUE &&
+ taskq_empty_ent(&pio->io_tqent)) {
+ zio_interrupt(pio);
+ }
+ }
+
+ mutex_enter(&pio->io_lock);
+ for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) {
+ cio_next = zio_walk_children(pio, &zl);
+ zio_deadman_impl(cio, ziodepth + 1);
+ }
+ mutex_exit(&pio->io_lock);
+}
+
+/*
+ * Log the critical information describing this zio and all of its children
+ * using the zfs_dbgmsg() interface, then post a deadman event for the ZED.
+ */
+void
+zio_deadman(zio_t *pio, char *tag)
+{
+ spa_t *spa = pio->io_spa;
+ char *name = spa_name(spa);
+
+ if (!zfs_deadman_enabled || spa_suspended(spa))
+ return;
+
+ zio_deadman_impl(pio, 0);
+
+ switch (spa_get_deadman_failmode(spa)) {
+ case ZIO_FAILURE_MODE_WAIT:
+ zfs_dbgmsg("%s waiting for hung I/O to pool '%s'", tag, name);
+ break;
+
+ case ZIO_FAILURE_MODE_CONTINUE:
+ zfs_dbgmsg("%s restarting hung I/O for pool '%s'", tag, name);
+ break;
+
+ case ZIO_FAILURE_MODE_PANIC:
+ fm_panic("%s determined I/O to pool '%s' is hung.", tag, name);
+ break;
+ }
+}
+
+/*
+ * Execute the I/O pipeline until one of the following occurs:
+ * (1) the I/O completes; (2) the pipeline stalls waiting for
+ * dependent child I/Os; (3) the I/O issues, so we're waiting
+ * for an I/O completion interrupt; (4) the I/O is delegated by
+ * vdev-level caching or aggregation; (5) the I/O is deferred
+ * due to vdev-level queueing; (6) the I/O is handed off to
+ * another thread. In all cases, the pipeline stops whenever
+ * there's no CPU work; it never burns a thread in cv_wait_io().
+ *
+ * There's no locking on io_stage because there's no legitimate way
+ * for multiple threads to be attempting to process the same I/O.
+ */
+static zio_pipe_stage_t *zio_pipeline[];
+
+/*
+ * zio_execute() is a wrapper around the static function
+ * __zio_execute() so that we can force __zio_execute() to be
+ * inlined. This reduces stack overhead which is important
+ * because __zio_execute() is called recursively in several zio
+ * code paths. zio_execute() itself cannot be inlined because
+ * it is externally visible.
+ */
+void
+zio_execute(zio_t *zio)
+{
+ fstrans_cookie_t cookie;
+
+ cookie = spl_fstrans_mark();
+ __zio_execute(zio);
+ spl_fstrans_unmark(cookie);
+}
+
+/*
+ * Used to determine whether the stack in the current context is large
+ * enough to allow zio_execute() to be called recursively. A minimum
+ * stack size of 16K is required to avoid needing to re-dispatch the zio.
+ */
+static boolean_t
+zio_execute_stack_check(zio_t *zio)
+{
+#if !defined(HAVE_LARGE_STACKS)
+ dsl_pool_t *dp = spa_get_dsl(zio->io_spa);
+
+ /* Executing in txg_sync_thread() context. */
+ if (dp && curthread == dp->dp_tx.tx_sync_thread)
+ return (B_TRUE);
+
+ /* Pool initialization outside of zio_taskq context. */
+ if (dp && spa_is_initializing(dp->dp_spa) &&
+ !zio_taskq_member(zio, ZIO_TASKQ_ISSUE) &&
+ !zio_taskq_member(zio, ZIO_TASKQ_ISSUE_HIGH))
+ return (B_TRUE);
+#endif /* HAVE_LARGE_STACKS */
+
+ return (B_FALSE);
+}
+
+__attribute__((always_inline))
+static inline void
+__zio_execute(zio_t *zio)
+{
+ ASSERT3U(zio->io_queued_timestamp, >, 0);
+
+ while (zio->io_stage < ZIO_STAGE_DONE) {
+ enum zio_stage pipeline = zio->io_pipeline;
+ enum zio_stage stage = zio->io_stage;
+
+ zio->io_executor = curthread;
+
+ ASSERT(!MUTEX_HELD(&zio->io_lock));
+ ASSERT(ISP2(stage));
+ ASSERT(zio->io_stall == NULL);
+
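+ /* Advance to the next stage that is present in this zio's pipeline. */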
+ do {
+ stage <<= 1;
+ } while ((stage & pipeline) == 0);
+
+ ASSERT(stage <= ZIO_STAGE_DONE);
+
+ /*
+ * If we are in interrupt context and this pipeline stage
+ * will grab a config lock that is held across I/O,
+ * or may wait for an I/O that needs an interrupt thread
+ * to complete, issue async to avoid deadlock.
+ *
+ * For VDEV_IO_START, we cut in line so that the io will
+ * be sent to disk promptly.
+ */
+ if ((stage & ZIO_BLOCKING_STAGES) && zio->io_vd == NULL &&
+ zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) {
+ boolean_t cut = (stage == ZIO_STAGE_VDEV_IO_START) ?
+ zio_requeue_io_start_cut_in_line : B_FALSE;
+ zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut);
+ return;
+ }
+
+ /*
+ * If the current context doesn't have a large enough stack, the zio
+ * must be issued asynchronously to prevent overflow.
+ */
+ if (zio_execute_stack_check(zio)) {
+ boolean_t cut = (stage == ZIO_STAGE_VDEV_IO_START) ?
+ zio_requeue_io_start_cut_in_line : B_FALSE;
+ zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut);
+ return;
+ }
+
+ zio->io_stage = stage;
+ zio->io_pipeline_trace |= zio->io_stage;
+
+ /*
+ * The zio pipeline stage returns the next zio to execute
+ * (typically the same as this one), or NULL if we should
+ * stop.
+ */
+ zio = zio_pipeline[highbit64(stage) - 1](zio);
+
+ if (zio == NULL)
+ return;
+ }
+}
+
+
+/*
+ * ==========================================================================
+ * Initiate I/O, either sync or async
+ * ==========================================================================
+ */
+int
+zio_wait(zio_t *zio)
+{
+ /*
+ * Some routines, like zio_free_sync(), may return a NULL zio
+ * to avoid the performance overhead of creating and then destroying
+ * an unneeded zio. For the callers' simplicity, we accept a NULL
+ * zio and ignore it.
+ */
+ if (zio == NULL)
+ return (0);
+
+ long timeout = MSEC_TO_TICK(zfs_deadman_ziotime_ms);
+ int error;
+
+ ASSERT3S(zio->io_stage, ==, ZIO_STAGE_OPEN);
+ ASSERT3P(zio->io_executor, ==, NULL);
+
+ zio->io_waiter = curthread;
+ ASSERT0(zio->io_queued_timestamp);
+ zio->io_queued_timestamp = gethrtime();
+
+ __zio_execute(zio);
+
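+ /*
+ * Wait for the zio to complete. If the deadman is enabled and the
+ * timed wait expires after the zio has been outstanding longer than
+ * the deadman threshold, log it via zio_deadman() and keep waiting,
+ * rechecking at the deadman check interval.
+ */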
+ mutex_enter(&zio->io_lock);
+ while (zio->io_executor != NULL) {
+ error = cv_timedwait_io(&zio->io_cv, &zio->io_lock,
+ ddi_get_lbolt() + timeout);
+
+ if (zfs_deadman_enabled && error == -1 &&
+ gethrtime() - zio->io_queued_timestamp >
+ spa_deadman_ziotime(zio->io_spa)) {
+ mutex_exit(&zio->io_lock);
+ timeout = MSEC_TO_TICK(zfs_deadman_checktime_ms);
+ zio_deadman(zio, FTAG);
+ mutex_enter(&zio->io_lock);
+ }
+ }
+ mutex_exit(&zio->io_lock);
+
+ error = zio->io_error;
+ zio_destroy(zio);
+
+ return (error);
+}
+
+void
+zio_nowait(zio_t *zio)
+{
+ /*
+ * See comment in zio_wait().
+ */
+ if (zio == NULL)
+ return;
+
+ ASSERT3P(zio->io_executor, ==, NULL);
+
+ if (zio->io_child_type == ZIO_CHILD_LOGICAL &&
+ zio_unique_parent(zio) == NULL) {
+ zio_t *pio;
+
+ /*
+ * This is a logical async I/O with no parent to wait for it.
+ * We add it to the spa_async_root_zio "Godfather" I/O, which
+ * will ensure it completes prior to unloading the pool.
+ */
+ spa_t *spa = zio->io_spa;
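+ /* There is one "Godfather" root zio per CPU; spread the load across them. */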
+ pio = spa->spa_async_zio_root[CPU_SEQID_UNSTABLE];
+
+ zio_add_child(pio, zio);
+ }
+
+ ASSERT0(zio->io_queued_timestamp);
+ zio->io_queued_timestamp = gethrtime();
+ __zio_execute(zio);
+}
+
+/*
+ * ==========================================================================
+ * Reexecute, cancel, or suspend/resume failed I/O
+ * ==========================================================================
+ */
+
+static void
+zio_reexecute(zio_t *pio)
+{
+ zio_t *cio, *cio_next;
+
+ ASSERT(pio->io_child_type == ZIO_CHILD_LOGICAL);
+ ASSERT(pio->io_orig_stage == ZIO_STAGE_OPEN);
+ ASSERT(pio->io_gang_leader == NULL);
+ ASSERT(pio->io_gang_tree == NULL);
+
+ pio->io_flags = pio->io_orig_flags;
+ pio->io_stage = pio->io_orig_stage;
+ pio->io_pipeline = pio->io_orig_pipeline;
+ pio->io_reexecute = 0;
+ pio->io_flags |= ZIO_FLAG_REEXECUTED;
+ pio->io_pipeline_trace = 0;
+ pio->io_error = 0;
+ for (int w = 0; w < ZIO_WAIT_TYPES; w++)
+ pio->io_state[w] = 0;
+ for (int c = 0; c < ZIO_CHILD_TYPES; c++)
+ pio->io_child_error[c] = 0;
+
+ if (IO_IS_ALLOCATING(pio))
+ BP_ZERO(pio->io_bp);
+
+ /*
+ * As we reexecute pio's children, new children could be created.
+ * New children go to the head of pio's io_child_list, however,
+ * so we will (correctly) not reexecute them. The key is that
+ * the remainder of pio's io_child_list, from 'cio_next' onward,
+ * cannot be affected by any side effects of reexecuting 'cio'.
+ */
+ zio_link_t *zl = NULL;
+ mutex_enter(&pio->io_lock);
+ for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) {
+ cio_next = zio_walk_children(pio, &zl);
+ for (int w = 0; w < ZIO_WAIT_TYPES; w++)
+ pio->io_children[cio->io_child_type][w]++;
+ mutex_exit(&pio->io_lock);
+ zio_reexecute(cio);
+ mutex_enter(&pio->io_lock);
+ }
+ mutex_exit(&pio->io_lock);
+
+ /*
+ * Now that all children have been reexecuted, execute the parent.
+ * We don't reexecute "The Godfather" I/O here as it's the
+ * responsibility of the caller to wait on it.
+ */
+ if (!(pio->io_flags & ZIO_FLAG_GODFATHER)) {
+ pio->io_queued_timestamp = gethrtime();
+ __zio_execute(pio);
+ }
+}
+
+void
+zio_suspend(spa_t *spa, zio_t *zio, zio_suspend_reason_t reason)
+{
+ if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC)
+ fm_panic("Pool '%s' has encountered an uncorrectable I/O "
+ "failure and the failure mode property for this pool "
+ "is set to panic.", spa_name(spa));
+
+ cmn_err(CE_WARN, "Pool '%s' has encountered an uncorrectable I/O "
+ "failure and has been suspended.\n", spa_name(spa));
+
+ (void) zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL,
+ NULL, NULL, 0);
+
+ mutex_enter(&spa->spa_suspend_lock);
+
+ if (spa->spa_suspend_zio_root == NULL)
+ spa->spa_suspend_zio_root = zio_root(spa, NULL, NULL,
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
+ ZIO_FLAG_GODFATHER);
+
+ spa->spa_suspended = reason;
+
+ if (zio != NULL) {
+ ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER));
+ ASSERT(zio != spa->spa_suspend_zio_root);
+ ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
+ ASSERT(zio_unique_parent(zio) == NULL);
+ ASSERT(zio->io_stage == ZIO_STAGE_DONE);
+ zio_add_child(spa->spa_suspend_zio_root, zio);
+ }
+
+ mutex_exit(&spa->spa_suspend_lock);
+}
+
+int
+zio_resume(spa_t *spa)
+{
+ zio_t *pio;
+
+ /*
+ * Reexecute all previously suspended i/o.
+ */
+ mutex_enter(&spa->spa_suspend_lock);
+ spa->spa_suspended = ZIO_SUSPEND_NONE;
+ cv_broadcast(&spa->spa_suspend_cv);
+ pio = spa->spa_suspend_zio_root;
+ spa->spa_suspend_zio_root = NULL;
+ mutex_exit(&spa->spa_suspend_lock);
+
+ if (pio == NULL)
+ return (0);
+
+ zio_reexecute(pio);
+ return (zio_wait(pio));
+}
+
+void
+zio_resume_wait(spa_t *spa)
+{
+ mutex_enter(&spa->spa_suspend_lock);
+ while (spa_suspended(spa))
+ cv_wait(&spa->spa_suspend_cv, &spa->spa_suspend_lock);
+ mutex_exit(&spa->spa_suspend_lock);
+}
+
+/*
+ * ==========================================================================
+ * Gang blocks.
+ *
+ * A gang block is a collection of small blocks that looks to the DMU
+ * like one large block. When zio_dva_allocate() cannot find a block
+ * of the requested size, due to either severe fragmentation or the pool
+ * being nearly full, it calls zio_write_gang_block() to construct the
+ * block from smaller fragments.
+ *
+ * A gang block consists of a gang header (zio_gbh_phys_t) and up to
+ * three (SPA_GBH_NBLKPTRS) gang members. The gang header is just like
+ * an indirect block: it's an array of block pointers. It consumes
+ * only one sector and hence is allocatable regardless of fragmentation.
+ * The gang header's bps point to its gang members, which hold the data.
+ *
+ * Gang blocks are self-checksumming, using the bp's <vdev, offset, txg>
+ * as the verifier to ensure uniqueness of the SHA256 checksum.
+ * Critically, the gang block bp's blk_cksum is the checksum of the data,
+ * not the gang header. This ensures that data block signatures (needed for
+ * deduplication) are independent of how the block is physically stored.
+ *
+ * Gang blocks can be nested: a gang member may itself be a gang block.
+ * Thus every gang block is a tree in which the root and all interior nodes
+ * are
+ * gang headers, and the leaves are normal blocks that contain user data.
+ * The root of the gang tree is called the gang leader.
+ *
+ * To perform any operation (read, rewrite, free, claim) on a gang block,
+ * zio_gang_assemble() first assembles the gang tree (minus data leaves)
+ * in the io_gang_tree field of the original logical i/o by recursively
+ * reading the gang leader and all gang headers below it. This yields
+ * an in-core tree containing the contents of every gang header and the
+ * bps for every constituent of the gang block.
+ *
+ * With the gang tree now assembled, zio_gang_issue() just walks the gang tree
+ * and invokes a callback on each bp. To free a gang block, zio_gang_issue()
+ * calls zio_free_gang() -- a trivial wrapper around zio_free() -- for each bp.
+ * zio_claim_gang() provides a similarly trivial wrapper for zio_claim().
+ * zio_read_gang() is a wrapper around zio_read() that omits reading gang
+ * headers, since we already have those in io_gang_tree. zio_rewrite_gang()
+ * performs a zio_rewrite() of the data or, for gang headers, a zio_rewrite()
+ * of the gang header plus zio_checksum_compute() of the data to update the
+ * gang header's blk_cksum as described above.
+ *
+ * The two-phase assemble/issue model solves the problem of partial failure --
+ * what if you'd freed part of a gang block but then couldn't read the
+ * gang header for another part? Assembling the entire gang tree first
+ * ensures that all the necessary gang header I/O has succeeded before
+ * starting the actual work of free, claim, or write. Once the gang tree
+ * is assembled, free and claim are in-memory operations that cannot fail.
+ *
+ * In the event that a gang write fails, zio_dva_unallocate() walks the
+ * gang tree to immediately free (i.e. insert back into the space map)
+ * everything we've allocated. This ensures that we don't get ENOSPC
+ * errors during repeated suspend/resume cycles due to a flaky device.
+ *
+ * Gang rewrites only happen during sync-to-convergence. If we can't assemble
+ * the gang tree, we won't modify the block, so we can safely defer the free
+ * (knowing that the block is still intact). If we *can* assemble the gang
+ * tree, then even if some of the rewrites fail, zio_dva_unallocate() will free
+ * each constituent bp and we can allocate a new block on the next sync pass.
+ *
+ * In all cases, the gang tree allows complete recovery from partial failure.
+ * ==========================================================================
+ */
+
+static void
+zio_gang_issue_func_done(zio_t *zio)
+{
+ abd_free(zio->io_abd);
+}
+
+static zio_t *
+zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data,
+ uint64_t offset)
+{
+ if (gn != NULL)
+ return (pio);
+
+ return (zio_read(pio, pio->io_spa, bp, abd_get_offset(data, offset),
+ BP_GET_PSIZE(bp), zio_gang_issue_func_done,
+ NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
+ &pio->io_bookmark));
+}
+
+static zio_t *
+zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data,
+ uint64_t offset)
+{
+ zio_t *zio;
+
+ if (gn != NULL) {
+ abd_t *gbh_abd =
+ abd_get_from_buf(gn->gn_gbh, SPA_GANGBLOCKSIZE);
+ zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
+ gbh_abd, SPA_GANGBLOCKSIZE, zio_gang_issue_func_done, NULL,
+ pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
+ &pio->io_bookmark);
+ /*
+ * As we rewrite each gang header, the pipeline will compute
+ * a new gang block header checksum for it; but no one will
+ * compute a new data checksum, so we do that here. The one
+ * exception is the gang leader: the pipeline already computed
+ * its data checksum because that stage precedes gang assembly.
+ * (Presently, nothing actually uses interior data checksums;
+ * this is just good hygiene.)
+ */
+ if (gn != pio->io_gang_leader->io_gang_tree) {
+ abd_t *buf = abd_get_offset(data, offset);
+
+ zio_checksum_compute(zio, BP_GET_CHECKSUM(bp),
+ buf, BP_GET_PSIZE(bp));
+
+ abd_free(buf);
+ }
+ /*
+ * If we are here to damage data for testing purposes,
+ * leave the GBH alone so that we can detect the damage.
+ */
+ if (pio->io_gang_leader->io_flags & ZIO_FLAG_INDUCE_DAMAGE)
+ zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
+ } else {
+ zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
+ abd_get_offset(data, offset), BP_GET_PSIZE(bp),
+ zio_gang_issue_func_done, NULL, pio->io_priority,
+ ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
+ }
+
+ return (zio);
+}
+
+/* ARGSUSED */
+static zio_t *
+zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data,
+ uint64_t offset)
+{
+ zio_t *zio = zio_free_sync(pio, pio->io_spa, pio->io_txg, bp,
+ ZIO_GANG_CHILD_FLAGS(pio));
+ if (zio == NULL) {
+ zio = zio_null(pio, pio->io_spa,
+ NULL, NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio));
+ }
+ return (zio);
+}
+
+/* ARGSUSED */
+static zio_t *
+zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data,
+ uint64_t offset)
+{
+ return (zio_claim(pio, pio->io_spa, pio->io_txg, bp,
+ NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio)));
+}
+
+static zio_gang_issue_func_t *zio_gang_issue_func[ZIO_TYPES] = {
+ NULL,
+ zio_read_gang,
+ zio_rewrite_gang,
+ zio_free_gang,
+ zio_claim_gang,
+ NULL
+};
+
+static void zio_gang_tree_assemble_done(zio_t *zio);
+
+static zio_gang_node_t *
+zio_gang_node_alloc(zio_gang_node_t **gnpp)
+{
+ zio_gang_node_t *gn;
+
+ ASSERT(*gnpp == NULL);
+
+ gn = kmem_zalloc(sizeof (*gn), KM_SLEEP);
+ gn->gn_gbh = zio_buf_alloc(SPA_GANGBLOCKSIZE);
+ *gnpp = gn;
+
+ return (gn);
+}
+
+static void
+zio_gang_node_free(zio_gang_node_t **gnpp)
+{
+ zio_gang_node_t *gn = *gnpp;
+
+ for (int g = 0; g < SPA_GBH_NBLKPTRS; g++)
+ ASSERT(gn->gn_child[g] == NULL);
+
+ zio_buf_free(gn->gn_gbh, SPA_GANGBLOCKSIZE);
+ kmem_free(gn, sizeof (*gn));
+ *gnpp = NULL;
+}
+
+static void
+zio_gang_tree_free(zio_gang_node_t **gnpp)
+{
+ zio_gang_node_t *gn = *gnpp;
+
+ if (gn == NULL)
+ return;
+
+ for (int g = 0; g < SPA_GBH_NBLKPTRS; g++)
+ zio_gang_tree_free(&gn->gn_child[g]);
+
+ zio_gang_node_free(gnpp);
+}
+
+static void
+zio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp)
+{
+ zio_gang_node_t *gn = zio_gang_node_alloc(gnpp);
+ abd_t *gbh_abd = abd_get_from_buf(gn->gn_gbh, SPA_GANGBLOCKSIZE);
+
+ ASSERT(gio->io_gang_leader == gio);
+ ASSERT(BP_IS_GANG(bp));
+
+ zio_nowait(zio_read(gio, gio->io_spa, bp, gbh_abd, SPA_GANGBLOCKSIZE,
+ zio_gang_tree_assemble_done, gn, gio->io_priority,
+ ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark));
+}
+
+static void
+zio_gang_tree_assemble_done(zio_t *zio)
+{
+ zio_t *gio = zio->io_gang_leader;
+ zio_gang_node_t *gn = zio->io_private;
+ blkptr_t *bp = zio->io_bp;
+
+ ASSERT(gio == zio_unique_parent(zio));
+ ASSERT(zio->io_child_count == 0);
+
+ if (zio->io_error)
+ return;
+
+ /* this ABD was created from a linear buf in zio_gang_tree_assemble */
+ if (BP_SHOULD_BYTESWAP(bp))
+ byteswap_uint64_array(abd_to_buf(zio->io_abd), zio->io_size);
+
+ ASSERT3P(abd_to_buf(zio->io_abd), ==, gn->gn_gbh);
+ ASSERT(zio->io_size == SPA_GANGBLOCKSIZE);
+ ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);
+
+ abd_free(zio->io_abd);
+
+ for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
+ blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
+ if (!BP_IS_GANG(gbp))
+ continue;
+ zio_gang_tree_assemble(gio, gbp, &gn->gn_child[g]);
+ }
+}
+
+static void
+zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, abd_t *data,
+ uint64_t offset)
+{
+ zio_t *gio = pio->io_gang_leader;
+ zio_t *zio;
+
+ ASSERT(BP_IS_GANG(bp) == !!gn);
+ ASSERT(BP_GET_CHECKSUM(bp) == BP_GET_CHECKSUM(gio->io_bp));
+ ASSERT(BP_GET_LSIZE(bp) == BP_GET_PSIZE(bp) || gn == gio->io_gang_tree);
+
+ /*
+ * If you're a gang header, your data is in gn->gn_gbh.
+ * If you're a gang member, your data is in 'data' and gn == NULL.
+ */
+ zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data, offset);
+
+ if (gn != NULL) {
+ ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);
+
+ for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
+ blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
+ if (BP_IS_HOLE(gbp))
+ continue;
+ zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data,
+ offset);
+ offset += BP_GET_PSIZE(gbp);
+ }
+ }
+
+ if (gn == gio->io_gang_tree)
+ ASSERT3U(gio->io_size, ==, offset);
+
+ if (zio != pio)
+ zio_nowait(zio);
+}
+
+static zio_t *
+zio_gang_assemble(zio_t *zio)
+{
+ blkptr_t *bp = zio->io_bp;
+
+ ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == NULL);
+ ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
+
+ zio->io_gang_leader = zio;
+
+ zio_gang_tree_assemble(zio, bp, &zio->io_gang_tree);
+
+ return (zio);
+}
+
+static zio_t *
+zio_gang_issue(zio_t *zio)
+{
+ blkptr_t *bp = zio->io_bp;
+
+ if (zio_wait_for_children(zio, ZIO_CHILD_GANG_BIT, ZIO_WAIT_DONE)) {
+ return (NULL);
+ }
+
+ ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == zio);
+ ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
+
+ if (zio->io_child_error[ZIO_CHILD_GANG] == 0)
+ zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_abd,
+ 0);
+ else
+ zio_gang_tree_free(&zio->io_gang_tree);
+
+ zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
+
+ return (zio);
+}
+
+static void
+zio_write_gang_member_ready(zio_t *zio)
+{
+ zio_t *pio = zio_unique_parent(zio);
+ dva_t *cdva = zio->io_bp->blk_dva;
+ dva_t *pdva = pio->io_bp->blk_dva;
+ uint64_t asize;
+ zio_t *gio __maybe_unused = zio->io_gang_leader;
+
+ if (BP_IS_HOLE(zio->io_bp))
+ return;
+
+ ASSERT(BP_IS_HOLE(&zio->io_bp_orig));
+
+ ASSERT(zio->io_child_type == ZIO_CHILD_GANG);
+ ASSERT3U(zio->io_prop.zp_copies, ==, gio->io_prop.zp_copies);
+ ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(zio->io_bp));
+ ASSERT3U(pio->io_prop.zp_copies, <=, BP_GET_NDVAS(pio->io_bp));
+ ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp));
+
+ mutex_enter(&pio->io_lock);
+ for (int d = 0; d < BP_GET_NDVAS(zio->io_bp); d++) {
+ ASSERT(DVA_GET_GANG(&pdva[d]));
+ asize = DVA_GET_ASIZE(&pdva[d]);
+ asize += DVA_GET_ASIZE(&cdva[d]);
+ DVA_SET_ASIZE(&pdva[d], asize);
+ }
+ mutex_exit(&pio->io_lock);
+}
+
+static void
+zio_write_gang_done(zio_t *zio)
+{
+ /*
+ * The io_abd field will be NULL for a zio with no data. The io_flags
+ * will initially have the ZIO_FLAG_NODATA bit flag set, but we can't
+ * check for it here as it is cleared in zio_ready.
+ */
+ if (zio->io_abd != NULL)
+ abd_free(zio->io_abd);
+}
+
+static zio_t *
+zio_write_gang_block(zio_t *pio, metaslab_class_t *mc)
+{
+ spa_t *spa = pio->io_spa;
+ blkptr_t *bp = pio->io_bp;
+ zio_t *gio = pio->io_gang_leader;
+ zio_t *zio;
+ zio_gang_node_t *gn, **gnpp;
+ zio_gbh_phys_t *gbh;
+ abd_t *gbh_abd;
+ uint64_t txg = pio->io_txg;
+ uint64_t resid = pio->io_size;
+ uint64_t lsize;
+ int copies = gio->io_prop.zp_copies;
+ int gbh_copies;
+ zio_prop_t zp;
+ int error;
+ boolean_t has_data = !(pio->io_flags & ZIO_FLAG_NODATA);
+
+ /*
+ * Encrypted blocks need DVA[2] free to hold their encryption parameters,
+ * so encrypted gang headers can't have a third copy.
+ */
+ gbh_copies = MIN(copies + 1, spa_max_replication(spa));
+ if (gio->io_prop.zp_encrypt && gbh_copies >= SPA_DVAS_PER_BP)
+ gbh_copies = SPA_DVAS_PER_BP - 1;
+
+ int flags = METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER;
+ if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
+ ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
+ ASSERT(has_data);
+
+ flags |= METASLAB_ASYNC_ALLOC;
+ VERIFY(zfs_refcount_held(&mc->mc_allocator[pio->io_allocator].
+ mca_alloc_slots, pio));
+
+ /*
+ * The logical zio has already placed a reservation for
+ * 'copies' allocation slots but gang blocks may require
+ * additional copies. These additional copies
+ * (i.e. gbh_copies - copies) are guaranteed to succeed
+ * since metaslab_class_throttle_reserve() always allows
+ * additional reservations for gang blocks.
+ */
+ VERIFY(metaslab_class_throttle_reserve(mc, gbh_copies - copies,
+ pio->io_allocator, pio, flags));
+ }
+
+ error = metaslab_alloc(spa, mc, SPA_GANGBLOCKSIZE,
+ bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp, flags,
+ &pio->io_alloc_list, pio, pio->io_allocator);
+ if (error) {
+ if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
+ ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
+ ASSERT(has_data);
+
+ /*
+ * If we failed to allocate the gang block header then
+ * we remove any additional allocation reservations that
+ * we placed here. The original reservation will
+ * be removed when the logical I/O goes to the ready
+ * stage.
+ */
+ metaslab_class_throttle_unreserve(mc,
+ gbh_copies - copies, pio->io_allocator, pio);
+ }
+
+ pio->io_error = error;
+ return (pio);
+ }
+
+ if (pio == gio) {
+ gnpp = &gio->io_gang_tree;
+ } else {
+ gnpp = pio->io_private;
+ ASSERT(pio->io_ready == zio_write_gang_member_ready);
+ }
+
+ gn = zio_gang_node_alloc(gnpp);
+ gbh = gn->gn_gbh;
+ bzero(gbh, SPA_GANGBLOCKSIZE);
+ gbh_abd = abd_get_from_buf(gbh, SPA_GANGBLOCKSIZE);
+
+ /*
+ * Create the gang header.
+ */
+ zio = zio_rewrite(pio, spa, txg, bp, gbh_abd, SPA_GANGBLOCKSIZE,
+ zio_write_gang_done, NULL, pio->io_priority,
+ ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
+
+ /*
+ * Create and nowait the gang children.
+ */
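+ /*
+ * Each child gets an even share of the remaining size,
+ * rounded up to SPA_MINBLOCKSIZE.
+ */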
+ for (int g = 0; resid != 0; resid -= lsize, g++) {
+ lsize = P2ROUNDUP(resid / (SPA_GBH_NBLKPTRS - g),
+ SPA_MINBLOCKSIZE);
+ ASSERT(lsize >= SPA_MINBLOCKSIZE && lsize <= resid);
+
+ zp.zp_checksum = gio->io_prop.zp_checksum;
+ zp.zp_compress = ZIO_COMPRESS_OFF;
+ zp.zp_complevel = gio->io_prop.zp_complevel;
+ zp.zp_type = DMU_OT_NONE;
+ zp.zp_level = 0;
+ zp.zp_copies = gio->io_prop.zp_copies;
+ zp.zp_dedup = B_FALSE;
+ zp.zp_dedup_verify = B_FALSE;
+ zp.zp_nopwrite = B_FALSE;
+ zp.zp_encrypt = gio->io_prop.zp_encrypt;
+ zp.zp_byteorder = gio->io_prop.zp_byteorder;
+ bzero(zp.zp_salt, ZIO_DATA_SALT_LEN);
+ bzero(zp.zp_iv, ZIO_DATA_IV_LEN);
+ bzero(zp.zp_mac, ZIO_DATA_MAC_LEN);
+
+ zio_t *cio = zio_write(zio, spa, txg, &gbh->zg_blkptr[g],
+ has_data ? abd_get_offset(pio->io_abd, pio->io_size -
+ resid) : NULL, lsize, lsize, &zp,
+ zio_write_gang_member_ready, NULL, NULL,
+ zio_write_gang_done, &gn->gn_child[g], pio->io_priority,
+ ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
+
+ if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
+ ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
+ ASSERT(has_data);
+
+ /*
+ * Gang children won't throttle but we should
+ * account for their work, so reserve an allocation
+ * slot for them here.
+ */
+ VERIFY(metaslab_class_throttle_reserve(mc,
+ zp.zp_copies, cio->io_allocator, cio, flags));
+ }
+ zio_nowait(cio);
+ }
+
+ /*
+ * Set pio's pipeline to just wait for zio to finish.
+ */
+ pio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
+
+ /*
+ * We didn't allocate this bp, so make sure it doesn't get unmarked.
+ */
+ pio->io_flags &= ~ZIO_FLAG_FASTWRITE;
+
+ zio_nowait(zio);
+
+ return (pio);
+}
+
+/*
+ * The zio_nop_write stage in the pipeline determines if allocating a
+ * new bp is necessary. The nopwrite feature can handle writes in
+ * either syncing or open context (i.e. zil writes) and as a result is
+ * mutually exclusive with dedup.
+ *
+ * By leveraging a cryptographically secure checksum, such as SHA256, we
+ * can compare the checksums of the new data and the old to determine if
+ * allocating a new block is required. Note that our requirements for
+ * cryptographic strength are fairly weak: there can't be any accidental
+ * hash collisions, but we don't need to be secure against intentional
+ * (malicious) collisions. To trigger a nopwrite, you have to be able
+ * to write the file to begin with, and triggering an incorrect (hash
+ * collision) nopwrite is no worse than simply writing to the file.
+ * That said, there are no known attacks against the checksum algorithms
+ * used for nopwrite, assuming that the salt and the checksums
+ * themselves remain secret.
+ */
+static zio_t *
+zio_nop_write(zio_t *zio)
+{
+ blkptr_t *bp = zio->io_bp;
+ blkptr_t *bp_orig = &zio->io_bp_orig;
+ zio_prop_t *zp = &zio->io_prop;
+
+ ASSERT(BP_GET_LEVEL(bp) == 0);
+ ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
+ ASSERT(zp->zp_nopwrite);
+ ASSERT(!zp->zp_dedup);
+ ASSERT(zio->io_bp_override == NULL);
+ ASSERT(IO_IS_ALLOCATING(zio));
+
+ /*
+ * Check to see if the original bp and the new bp have matching
+ * characteristics (i.e. same checksum, compression algorithms, etc).
+ * If they don't then just continue with the pipeline which will
+ * allocate a new bp.
+ */
+ if (BP_IS_HOLE(bp_orig) ||
+ !(zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_flags &
+ ZCHECKSUM_FLAG_NOPWRITE) ||
+ BP_IS_ENCRYPTED(bp) || BP_IS_ENCRYPTED(bp_orig) ||
+ BP_GET_CHECKSUM(bp) != BP_GET_CHECKSUM(bp_orig) ||
+ BP_GET_COMPRESS(bp) != BP_GET_COMPRESS(bp_orig) ||
+ BP_GET_DEDUP(bp) != BP_GET_DEDUP(bp_orig) ||
+ zp->zp_copies != BP_GET_NDVAS(bp_orig))
+ return (zio);
+
+ /*
+ * If the checksums match then reset the pipeline so that we
+ * avoid allocating a new bp and issuing any I/O.
+ */
+ if (ZIO_CHECKSUM_EQUAL(bp->blk_cksum, bp_orig->blk_cksum)) {
+ ASSERT(zio_checksum_table[zp->zp_checksum].ci_flags &
+ ZCHECKSUM_FLAG_NOPWRITE);
+ ASSERT3U(BP_GET_PSIZE(bp), ==, BP_GET_PSIZE(bp_orig));
+ ASSERT3U(BP_GET_LSIZE(bp), ==, BP_GET_LSIZE(bp_orig));
+ ASSERT(zp->zp_compress != ZIO_COMPRESS_OFF);
+ ASSERT(bcmp(&bp->blk_prop, &bp_orig->blk_prop,
+ sizeof (uint64_t)) == 0);
+
+ /*
+ * If we're overwriting a block that is currently on an
+ * indirect vdev, then ignore the nopwrite request and
+ * allow a new block to be allocated on a concrete vdev.
+ */
+ spa_config_enter(zio->io_spa, SCL_VDEV, FTAG, RW_READER);
+ vdev_t *tvd = vdev_lookup_top(zio->io_spa,
+ DVA_GET_VDEV(&bp->blk_dva[0]));
+ if (tvd->vdev_ops == &vdev_indirect_ops) {
+ spa_config_exit(zio->io_spa, SCL_VDEV, FTAG);
+ return (zio);
+ }
+ spa_config_exit(zio->io_spa, SCL_VDEV, FTAG);
+
+ *bp = *bp_orig;
+ zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
+ zio->io_flags |= ZIO_FLAG_NOPWRITE;
+ }
+
+ return (zio);
+}
+
+/*
+ * ==========================================================================
+ * Dedup
+ * ==========================================================================
+ */
+static void
+zio_ddt_child_read_done(zio_t *zio)
+{
+ blkptr_t *bp = zio->io_bp;
+ ddt_entry_t *dde = zio->io_private;
+ ddt_phys_t *ddp;
+ zio_t *pio = zio_unique_parent(zio);
+
+ mutex_enter(&pio->io_lock);
+ ddp = ddt_phys_select(dde, bp);
+ if (zio->io_error == 0)
+ ddt_phys_clear(ddp); /* this ddp doesn't need repair */
+
+ if (zio->io_error == 0 && dde->dde_repair_abd == NULL)
+ dde->dde_repair_abd = zio->io_abd;
+ else
+ abd_free(zio->io_abd);
+ mutex_exit(&pio->io_lock);
+}
+
+static zio_t *
+zio_ddt_read_start(zio_t *zio)
+{
+ blkptr_t *bp = zio->io_bp;
+
+ ASSERT(BP_GET_DEDUP(bp));
+ ASSERT(BP_GET_PSIZE(bp) == zio->io_size);
+ ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
+
+ if (zio->io_child_error[ZIO_CHILD_DDT]) {
+ ddt_t *ddt = ddt_select(zio->io_spa, bp);
+ ddt_entry_t *dde = ddt_repair_start(ddt, bp);
+ ddt_phys_t *ddp = dde->dde_phys;
+ ddt_phys_t *ddp_self = ddt_phys_select(dde, bp);
+ blkptr_t blk;
+
+ ASSERT(zio->io_vsd == NULL);
+ zio->io_vsd = dde;
+
+ if (ddp_self == NULL)
+ return (zio);
+
+ for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
+ if (ddp->ddp_phys_birth == 0 || ddp == ddp_self)
+ continue;
+ ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp,
+ &blk);
+ zio_nowait(zio_read(zio, zio->io_spa, &blk,
+ abd_alloc_for_io(zio->io_size, B_TRUE),
+ zio->io_size, zio_ddt_child_read_done, dde,
+ zio->io_priority, ZIO_DDT_CHILD_FLAGS(zio) |
+ ZIO_FLAG_DONT_PROPAGATE, &zio->io_bookmark));
+ }
+ return (zio);
+ }
+
+ zio_nowait(zio_read(zio, zio->io_spa, bp,
+ zio->io_abd, zio->io_size, NULL, NULL, zio->io_priority,
+ ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark));
+
+ return (zio);
+}
+
+static zio_t *
+zio_ddt_read_done(zio_t *zio)
+{
+ blkptr_t *bp = zio->io_bp;
+
+ if (zio_wait_for_children(zio, ZIO_CHILD_DDT_BIT, ZIO_WAIT_DONE)) {
+ return (NULL);
+ }
+
+ ASSERT(BP_GET_DEDUP(bp));
+ ASSERT(BP_GET_PSIZE(bp) == zio->io_size);
+ ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
+
+ if (zio->io_child_error[ZIO_CHILD_DDT]) {
+ ddt_t *ddt = ddt_select(zio->io_spa, bp);
+ ddt_entry_t *dde = zio->io_vsd;
+ if (ddt == NULL) {
+ ASSERT(spa_load_state(zio->io_spa) != SPA_LOAD_NONE);
+ return (zio);
+ }
+ if (dde == NULL) {
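+ /* Re-run the DDT_READ_START stage so it can set up a repair read. */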
+ zio->io_stage = ZIO_STAGE_DDT_READ_START >> 1;
+ zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);
+ return (NULL);
+ }
+ if (dde->dde_repair_abd != NULL) {
+ abd_copy(zio->io_abd, dde->dde_repair_abd,
+ zio->io_size);
+ zio->io_child_error[ZIO_CHILD_DDT] = 0;
+ }
+ ddt_repair_done(ddt, dde);
+ zio->io_vsd = NULL;
+ }
+
+ ASSERT(zio->io_vsd == NULL);
+
+ return (zio);
+}
+
+static boolean_t
+zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde)
+{
+ spa_t *spa = zio->io_spa;
+ boolean_t do_raw = !!(zio->io_flags & ZIO_FLAG_RAW);
+
+ ASSERT(!(zio->io_bp_override && do_raw));
+
+ /*
+ * Note: we compare the original data, not the transformed data,
+ * because when zio->io_bp is an override bp, we will not have
+ * pushed the I/O transforms. That's an important optimization
+ * because otherwise we'd compress/encrypt all dmu_sync() data twice.
+ * However, we should never get a raw, override zio so in these
+ * cases we can compare the io_abd directly. This is useful because
+ * it allows us to do dedup verification even if we don't have access
+ * to the original data (for instance, if the encryption keys aren't
+ * loaded).
+ */
+
+ for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
+ zio_t *lio = dde->dde_lead_zio[p];
+
+ if (lio != NULL && do_raw) {
+ return (lio->io_size != zio->io_size ||
+ abd_cmp(zio->io_abd, lio->io_abd) != 0);
+ } else if (lio != NULL) {
+ return (lio->io_orig_size != zio->io_orig_size ||
+ abd_cmp(zio->io_orig_abd, lio->io_orig_abd) != 0);
+ }
+ }
+
+ for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
+ ddt_phys_t *ddp = &dde->dde_phys[p];
+
+ if (ddp->ddp_phys_birth != 0 && do_raw) {
+ blkptr_t blk = *zio->io_bp;
+ uint64_t psize;
+ abd_t *tmpabd;
+ int error;
+
+ ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth);
+ psize = BP_GET_PSIZE(&blk);
+
+ if (psize != zio->io_size)
+ return (B_TRUE);
+
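+ /* Don't hold the DDT lock across the verification read. */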
+ ddt_exit(ddt);
+
+ tmpabd = abd_alloc_for_io(psize, B_TRUE);
+
+ error = zio_wait(zio_read(NULL, spa, &blk, tmpabd,
+ psize, NULL, NULL, ZIO_PRIORITY_SYNC_READ,
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
+ ZIO_FLAG_RAW, &zio->io_bookmark));
+
+ if (error == 0) {
+ if (abd_cmp(tmpabd, zio->io_abd) != 0)
+ error = SET_ERROR(ENOENT);
+ }
+
+ abd_free(tmpabd);
+ ddt_enter(ddt);
+ return (error != 0);
+ } else if (ddp->ddp_phys_birth != 0) {
+ arc_buf_t *abuf = NULL;
+ arc_flags_t aflags = ARC_FLAG_WAIT;
+ blkptr_t blk = *zio->io_bp;
+ int error;
+
+ ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth);
+
+ if (BP_GET_LSIZE(&blk) != zio->io_orig_size)
+ return (B_TRUE);
+
+ ddt_exit(ddt);
+
+ error = arc_read(NULL, spa, &blk,
+ arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ,
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
+ &aflags, &zio->io_bookmark);
+
+ if (error == 0) {
+ if (abd_cmp_buf(zio->io_orig_abd, abuf->b_data,
+ zio->io_orig_size) != 0)
+ error = SET_ERROR(ENOENT);
+ arc_buf_destroy(abuf, &abuf);
+ }
+
+ ddt_enter(ddt);
+ return (error != 0);
+ }
+ }
+
+ return (B_FALSE);
+}
+
+static void
+zio_ddt_child_write_ready(zio_t *zio)
+{
+ int p = zio->io_prop.zp_copies;
+ ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
+ ddt_entry_t *dde = zio->io_private;
+ ddt_phys_t *ddp = &dde->dde_phys[p];
+ zio_t *pio;
+
+ if (zio->io_error)
+ return;
+
+ ddt_enter(ddt);
+
+ ASSERT(dde->dde_lead_zio[p] == zio);
+
+ ddt_phys_fill(ddp, zio->io_bp);
+
+ zio_link_t *zl = NULL;
+ while ((pio = zio_walk_parents(zio, &zl)) != NULL)
+ ddt_bp_fill(ddp, pio->io_bp, zio->io_txg);
+
+ ddt_exit(ddt);
+}
+
+static void
+zio_ddt_child_write_done(zio_t *zio)
+{
+ int p = zio->io_prop.zp_copies;
+ ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
+ ddt_entry_t *dde = zio->io_private;
+ ddt_phys_t *ddp = &dde->dde_phys[p];
+
+ ddt_enter(ddt);
+
+ ASSERT(ddp->ddp_refcnt == 0);
+ ASSERT(dde->dde_lead_zio[p] == zio);
+ dde->dde_lead_zio[p] = NULL;
+
+ if (zio->io_error == 0) {
+ zio_link_t *zl = NULL;
+ while (zio_walk_parents(zio, &zl) != NULL)
+ ddt_phys_addref(ddp);
+ } else {
+ ddt_phys_clear(ddp);
+ }
+
+ ddt_exit(ddt);
+}
+
+static zio_t *
+zio_ddt_write(zio_t *zio)
+{
+ spa_t *spa = zio->io_spa;
+ blkptr_t *bp = zio->io_bp;
+ uint64_t txg = zio->io_txg;
+ zio_prop_t *zp = &zio->io_prop;
+ int p = zp->zp_copies;
+ zio_t *cio = NULL;
+ ddt_t *ddt = ddt_select(spa, bp);
+ ddt_entry_t *dde;
+ ddt_phys_t *ddp;
+
+ ASSERT(BP_GET_DEDUP(bp));
+ ASSERT(BP_GET_CHECKSUM(bp) == zp->zp_checksum);
+ ASSERT(BP_IS_HOLE(bp) || zio->io_bp_override);
+ ASSERT(!(zio->io_bp_override && (zio->io_flags & ZIO_FLAG_RAW)));
+
+ ddt_enter(ddt);
+ dde = ddt_lookup(ddt, bp, B_TRUE);
+ ddp = &dde->dde_phys[p];
+
+ if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) {
+ /*
+ * If we're using a weak checksum, upgrade to a strong checksum
+ * and try again. If we're already using a strong checksum,
+ * we can't resolve it, so just convert to an ordinary write.
+ * (And automatically e-mail a paper to Nature?)
+ */
+ if (!(zio_checksum_table[zp->zp_checksum].ci_flags &
+ ZCHECKSUM_FLAG_DEDUP)) {
+ zp->zp_checksum = spa_dedup_checksum(spa);
+ zio_pop_transforms(zio);
+ zio->io_stage = ZIO_STAGE_OPEN;
+ BP_ZERO(bp);
+ } else {
+ zp->zp_dedup = B_FALSE;
+ BP_SET_DEDUP(bp, B_FALSE);
+ }
+ ASSERT(!BP_GET_DEDUP(bp));
+ zio->io_pipeline = ZIO_WRITE_PIPELINE;
+ ddt_exit(ddt);
+ return (zio);
+ }
+
+ if (ddp->ddp_phys_birth != 0 || dde->dde_lead_zio[p] != NULL) {
+ if (ddp->ddp_phys_birth != 0)
+ ddt_bp_fill(ddp, bp, txg);
+ if (dde->dde_lead_zio[p] != NULL)
+ zio_add_child(zio, dde->dde_lead_zio[p]);
+ else
+ ddt_phys_addref(ddp);
+ } else if (zio->io_bp_override) {
+ ASSERT(bp->blk_birth == txg);
+ ASSERT(BP_EQUAL(bp, zio->io_bp_override));
+ ddt_phys_fill(ddp, bp);
+ ddt_phys_addref(ddp);
+ } else {
+ cio = zio_write(zio, spa, txg, bp, zio->io_orig_abd,
+ zio->io_orig_size, zio->io_orig_size, zp,
+ zio_ddt_child_write_ready, NULL, NULL,
+ zio_ddt_child_write_done, dde, zio->io_priority,
+ ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);
+
+ zio_push_transform(cio, zio->io_abd, zio->io_size, 0, NULL);
+ dde->dde_lead_zio[p] = cio;
+ }
+
+ ddt_exit(ddt);
+
+ zio_nowait(cio);
+
+ return (zio);
+}
+
+ddt_entry_t *freedde; /* for debugging */
+
+static zio_t *
+zio_ddt_free(zio_t *zio)
+{
+ spa_t *spa = zio->io_spa;
+ blkptr_t *bp = zio->io_bp;
+ ddt_t *ddt = ddt_select(spa, bp);
+ ddt_entry_t *dde;
+ ddt_phys_t *ddp;
+
+ ASSERT(BP_GET_DEDUP(bp));
+ ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
+
+ ddt_enter(ddt);
+ freedde = dde = ddt_lookup(ddt, bp, B_TRUE);
+ if (dde) {
+ ddp = ddt_phys_select(dde, bp);
+ if (ddp)
+ ddt_phys_decref(ddp);
+ }
+ ddt_exit(ddt);
+
+ return (zio);
+}
+
+/*
+ * ==========================================================================
+ * Allocate and free blocks
+ * ==========================================================================
+ */
+
+static zio_t *
+zio_io_to_allocate(spa_t *spa, int allocator)
+{
+ zio_t *zio;
+
+ ASSERT(MUTEX_HELD(&spa->spa_alloc_locks[allocator]));
+
+ zio = avl_first(&spa->spa_alloc_trees[allocator]);
+ if (zio == NULL)
+ return (NULL);
+
+ ASSERT(IO_IS_ALLOCATING(zio));
+
+ /*
+ * Try to place a reservation for this zio. If we're unable to
+ * reserve then we throttle.
+ */
+ ASSERT3U(zio->io_allocator, ==, allocator);
+ if (!metaslab_class_throttle_reserve(zio->io_metaslab_class,
+ zio->io_prop.zp_copies, zio->io_allocator, zio, 0)) {
+ return (NULL);
+ }
+
+ avl_remove(&spa->spa_alloc_trees[allocator], zio);
+ ASSERT3U(zio->io_stage, <, ZIO_STAGE_DVA_ALLOCATE);
+
+ return (zio);
+}
+
+static zio_t *
+zio_dva_throttle(zio_t *zio)
+{
+ spa_t *spa = zio->io_spa;
+ zio_t *nio;
+ metaslab_class_t *mc;
+
+ /* locate an appropriate allocation class */
+ mc = spa_preferred_class(spa, zio->io_size, zio->io_prop.zp_type,
+ zio->io_prop.zp_level, zio->io_prop.zp_zpl_smallblk);
+
+ if (zio->io_priority == ZIO_PRIORITY_SYNC_WRITE ||
+ !mc->mc_alloc_throttle_enabled ||
+ zio->io_child_type == ZIO_CHILD_GANG ||
+ zio->io_flags & ZIO_FLAG_NODATA) {
+ return (zio);
+ }
+
+ ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
+
+ ASSERT3U(zio->io_queued_timestamp, >, 0);
+ ASSERT(zio->io_stage == ZIO_STAGE_DVA_THROTTLE);
+
+ zbookmark_phys_t *bm = &zio->io_bookmark;
+ /*
+ * We want to try to use as many allocators as possible to help improve
+ * performance, but we also want logically adjacent IOs to be physically
+ * adjacent to improve sequential read performance. We chunk each object
+ * into 2^20 block regions, and then hash based on the objset, object,
+ * level, and region to accomplish both of these goals.
+ */
+ zio->io_allocator = cityhash4(bm->zb_objset, bm->zb_object,
+ bm->zb_level, bm->zb_blkid >> 20) % spa->spa_alloc_count;
+ mutex_enter(&spa->spa_alloc_locks[zio->io_allocator]);
+ ASSERT(zio->io_type == ZIO_TYPE_WRITE);
+ zio->io_metaslab_class = mc;
+ avl_add(&spa->spa_alloc_trees[zio->io_allocator], zio);
+ nio = zio_io_to_allocate(spa, zio->io_allocator);
+ mutex_exit(&spa->spa_alloc_locks[zio->io_allocator]);
+ return (nio);
+}
+
+static void
+zio_allocate_dispatch(spa_t *spa, int allocator)
+{
+ zio_t *zio;
+
+ mutex_enter(&spa->spa_alloc_locks[allocator]);
+ zio = zio_io_to_allocate(spa, allocator);
+ mutex_exit(&spa->spa_alloc_locks[allocator]);
+ if (zio == NULL)
+ return;
+
+ ASSERT3U(zio->io_stage, ==, ZIO_STAGE_DVA_THROTTLE);
+ ASSERT0(zio->io_error);
+ zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_TRUE);
+}
+
+static zio_t *
+zio_dva_allocate(zio_t *zio)
+{
+ spa_t *spa = zio->io_spa;
+ metaslab_class_t *mc;
+ blkptr_t *bp = zio->io_bp;
+ int error;
+ int flags = 0;
+
+ if (zio->io_gang_leader == NULL) {
+ ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
+ zio->io_gang_leader = zio;
+ }
+
+ ASSERT(BP_IS_HOLE(bp));
+ ASSERT0(BP_GET_NDVAS(bp));
+ ASSERT3U(zio->io_prop.zp_copies, >, 0);
+ ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa));
+ ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));
+
+ flags |= (zio->io_flags & ZIO_FLAG_FASTWRITE) ? METASLAB_FASTWRITE : 0;
+ if (zio->io_flags & ZIO_FLAG_NODATA)
+ flags |= METASLAB_DONT_THROTTLE;
+ if (zio->io_flags & ZIO_FLAG_GANG_CHILD)
+ flags |= METASLAB_GANG_CHILD;
+ if (zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE)
+ flags |= METASLAB_ASYNC_ALLOC;
+
+ /*
+ * if not already chosen, locate an appropriate allocation class
+ */
+ mc = zio->io_metaslab_class;
+ if (mc == NULL) {
+ mc = spa_preferred_class(spa, zio->io_size,
+ zio->io_prop.zp_type, zio->io_prop.zp_level,
+ zio->io_prop.zp_zpl_smallblk);
+ zio->io_metaslab_class = mc;
+ }
+
+ /*
+ * Try allocating the block in the usual metaslab class.
+ * If that's full, allocate it in the normal class.
+ * If that's full, allocate as a gang block,
+ * and if all are full, the allocation fails (which shouldn't happen).
+ *
+ * Note that we do not fall back on embedded slog (ZIL) space, to
+ * preserve unfragmented slog space, which is critical for decent
+ * sync write performance. If a log allocation fails, we will fall
+ * back to spa_sync() which is abysmal for performance.
+ */
+ error = metaslab_alloc(spa, mc, zio->io_size, bp,
+ zio->io_prop.zp_copies, zio->io_txg, NULL, flags,
+ &zio->io_alloc_list, zio, zio->io_allocator);
+
+ /*
+ * Fall back to the normal class when an alloc class is full
+ */
+ if (error == ENOSPC && mc != spa_normal_class(spa)) {
+ /*
+ * If throttling, transfer reservation over to normal class.
+ * The io_allocator slot can remain the same even though we
+ * are switching classes.
+ */
+ if (mc->mc_alloc_throttle_enabled &&
+ (zio->io_flags & ZIO_FLAG_IO_ALLOCATING)) {
+ metaslab_class_throttle_unreserve(mc,
+ zio->io_prop.zp_copies, zio->io_allocator, zio);
+ zio->io_flags &= ~ZIO_FLAG_IO_ALLOCATING;
+
+ VERIFY(metaslab_class_throttle_reserve(
+ spa_normal_class(spa),
+ zio->io_prop.zp_copies, zio->io_allocator, zio,
+ flags | METASLAB_MUST_RESERVE));
+ }
+ zio->io_metaslab_class = mc = spa_normal_class(spa);
+ if (zfs_flags & ZFS_DEBUG_METASLAB_ALLOC) {
+ zfs_dbgmsg("%s: metaslab allocation failure, "
+ "trying normal class: zio %px, size %llu, error %d",
+ spa_name(spa), zio, zio->io_size, error);
+ }
+
+ error = metaslab_alloc(spa, mc, zio->io_size, bp,
+ zio->io_prop.zp_copies, zio->io_txg, NULL, flags,
+ &zio->io_alloc_list, zio, zio->io_allocator);
+ }
+
+ if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE) {
+ if (zfs_flags & ZFS_DEBUG_METASLAB_ALLOC) {
+ zfs_dbgmsg("%s: metaslab allocation failure, "
+ "trying ganging: zio %px, size %llu, error %d",
+ spa_name(spa), zio, zio->io_size, error);
+ }
+ return (zio_write_gang_block(zio, mc));
+ }
+ if (error != 0) {
+ if (error != ENOSPC ||
+ (zfs_flags & ZFS_DEBUG_METASLAB_ALLOC)) {
+ zfs_dbgmsg("%s: metaslab allocation failure: zio %px, "
+ "size %llu, error %d",
+ spa_name(spa), zio, zio->io_size, error);
+ }
+ zio->io_error = error;
+ }
+
+ return (zio);
+}
+
+static zio_t *
+zio_dva_free(zio_t *zio)
+{
+ metaslab_free(zio->io_spa, zio->io_bp, zio->io_txg, B_FALSE);
+
+ return (zio);
+}
+
+static zio_t *
+zio_dva_claim(zio_t *zio)
+{
+ int error;
+
+ error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg);
+ if (error)
+ zio->io_error = error;
+
+ return (zio);
+}
+
+/*
+ * Undo an allocation. This is used by zio_done() when an I/O fails
+ * and we want to give back the block we just allocated.
+ * This handles both normal blocks and gang blocks.
+ */
+static void
+zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp)
+{
+ ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp));
+ ASSERT(zio->io_bp_override == NULL);
+
+ if (!BP_IS_HOLE(bp))
+ metaslab_free(zio->io_spa, bp, bp->blk_birth, B_TRUE);
+
+ if (gn != NULL) {
+ for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
+ zio_dva_unallocate(zio, gn->gn_child[g],
+ &gn->gn_gbh->zg_blkptr[g]);
+ }
+ }
+}
+
+/*
+ * Try to allocate an intent log block. Return 0 on success, errno on failure.
+ */
+int
+zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp,
+ uint64_t size, boolean_t *slog)
+{
+ int error = 1;
+ zio_alloc_list_t io_alloc_list;
+
+ ASSERT(txg > spa_syncing_txg(spa));
+
+ metaslab_trace_init(&io_alloc_list);
+
+ /*
+ * Block pointer fields are useful to metaslabs for stats and debugging.
+ * Fill in the obvious ones before calling into metaslab_alloc().
+ */
+ BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG);
+ BP_SET_PSIZE(new_bp, size);
+ BP_SET_LEVEL(new_bp, 0);
+
+ /*
+ * When allocating a zil block, we don't have information about
+ * the final destination of the block except the objset it's part
+ * of, so we just hash the objset ID to pick the allocator to get
+ * some parallelism.
+ */
+ int flags = METASLAB_FASTWRITE | METASLAB_ZIL;
+ int allocator = cityhash4(0, 0, 0, os->os_dsl_dataset->ds_object) %
+ spa->spa_alloc_count;
+ error = metaslab_alloc(spa, spa_log_class(spa), size, new_bp, 1,
+ txg, NULL, flags, &io_alloc_list, NULL, allocator);
+ *slog = (error == 0);
+ if (error != 0) {
+ error = metaslab_alloc(spa, spa_embedded_log_class(spa), size,
+ new_bp, 1, txg, NULL, flags,
+ &io_alloc_list, NULL, allocator);
+ }
+ if (error != 0) {
+ error = metaslab_alloc(spa, spa_normal_class(spa), size,
+ new_bp, 1, txg, NULL, flags,
+ &io_alloc_list, NULL, allocator);
+ }
+ metaslab_trace_fini(&io_alloc_list);
+
+ if (error == 0) {
+ BP_SET_LSIZE(new_bp, size);
+ BP_SET_PSIZE(new_bp, size);
+ BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF);
+ BP_SET_CHECKSUM(new_bp,
+ spa_version(spa) >= SPA_VERSION_SLIM_ZIL
+ ? ZIO_CHECKSUM_ZILOG2 : ZIO_CHECKSUM_ZILOG);
+ BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG);
+ BP_SET_LEVEL(new_bp, 0);
+ BP_SET_DEDUP(new_bp, 0);
+ BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER);
+
+ /*
+ * encrypted blocks will require an IV and salt. We generate
+ * these now since we will not be rewriting the bp at
+ * rewrite time.
+ */
+ if (os->os_encrypted) {
+ uint8_t iv[ZIO_DATA_IV_LEN];
+ uint8_t salt[ZIO_DATA_SALT_LEN];
+
+ BP_SET_CRYPT(new_bp, B_TRUE);
+ VERIFY0(spa_crypt_get_salt(spa,
+ dmu_objset_id(os), salt));
+ VERIFY0(zio_crypt_generate_iv(iv));
+
+ zio_crypt_encode_params_bp(new_bp, salt, iv);
+ }
+ } else {
+ zfs_dbgmsg("%s: zil block allocation failure: "
+ "size %llu, error %d", spa_name(spa), size, error);
+ }
+
+ return (error);
+}
+
+/*
+ * ==========================================================================
+ * Read and write to physical devices
+ * ==========================================================================
+ */
+
+/*
+ * Issue an I/O to the underlying vdev. Typically the issue pipeline
+ * stops after this stage and will resume upon I/O completion.
+ * However, there are instances where the vdev layer may need to
+ * continue the pipeline when an I/O was not issued. Since the I/O
+ * that was sent to the vdev layer might be different from the one
+ * currently active in the pipeline (see vdev_queue_io()), we explicitly
+ * force the underlying vdev layers to call either zio_execute() or
+ * zio_interrupt() to ensure that the pipeline continues with the correct I/O.
+ */
+static zio_t *
+zio_vdev_io_start(zio_t *zio)
+{
+ vdev_t *vd = zio->io_vd;
+ uint64_t align;
+ spa_t *spa = zio->io_spa;
+
+ zio->io_delay = 0;
+
+ ASSERT(zio->io_error == 0);
+ ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0);
+
+ if (vd == NULL) {
+ if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER))
+ spa_config_enter(spa, SCL_ZIO, zio, RW_READER);
+
+ /*
+ * The mirror_ops handle multiple DVAs in a single BP.
+ */
+ vdev_mirror_ops.vdev_op_io_start(zio);
+ return (NULL);
+ }
+
+ ASSERT3P(zio->io_logical, !=, zio);
+ if (zio->io_type == ZIO_TYPE_WRITE) {
+ ASSERT(spa->spa_trust_config);
+
+ /*
+ * Note: the code can handle other kinds of writes,
+ * but we don't expect them.
+ */
+ if (zio->io_vd->vdev_removing) {
+ ASSERT(zio->io_flags &
+ (ZIO_FLAG_PHYSICAL | ZIO_FLAG_SELF_HEAL |
+ ZIO_FLAG_RESILVER | ZIO_FLAG_INDUCE_DAMAGE));
+ }
+ }
+
+ align = 1ULL << vd->vdev_top->vdev_ashift;
+
+ if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) &&
+ P2PHASE(zio->io_size, align) != 0) {
+ /* Transform logical writes to be a full physical block size. */
+ uint64_t asize = P2ROUNDUP(zio->io_size, align);
+ abd_t *abuf = abd_alloc_sametype(zio->io_abd, asize);
+ ASSERT(vd == vd->vdev_top);
+ if (zio->io_type == ZIO_TYPE_WRITE) {
+ abd_copy(abuf, zio->io_abd, zio->io_size);
+ abd_zero_off(abuf, zio->io_size, asize - zio->io_size);
+ }
+ zio_push_transform(zio, abuf, asize, asize, zio_subblock);
+ }
+
+ /*
+ * If this is not a physical io, make sure that it is properly aligned
+ * before proceeding.
+ */
+ if (!(zio->io_flags & ZIO_FLAG_PHYSICAL)) {
+ ASSERT0(P2PHASE(zio->io_offset, align));
+ ASSERT0(P2PHASE(zio->io_size, align));
+ } else {
+ /*
+ * For physical writes, we allow 512b aligned writes and assume
+ * the device will perform a read-modify-write as necessary.
+ */
+ ASSERT0(P2PHASE(zio->io_offset, SPA_MINBLOCKSIZE));
+ ASSERT0(P2PHASE(zio->io_size, SPA_MINBLOCKSIZE));
+ }
+
+ VERIFY(zio->io_type != ZIO_TYPE_WRITE || spa_writeable(spa));
+
+ /*
+ * If this is a repair I/O, and there's no self-healing involved --
+ * that is, we're just resilvering what we expect to resilver --
+ * then don't do the I/O unless zio's txg is actually in vd's DTL.
+ * This prevents spurious resilvering.
+ *
+ * There are a few ways that we can end up creating these spurious
+ * resilver i/os:
+ *
+ * 1. A resilver i/o will be issued if any DVA in the BP has a
+ * dirty DTL. The mirror code will issue resilver writes to
+ * each DVA, including the one(s) that are not on vdevs with dirty
+ * DTLs.
+ *
+ * 2. With nested replication, which happens when we have a
+ * "replacing" or "spare" vdev that's a child of a mirror or raidz.
+ * For example, given mirror(replacing(A+B), C), it's likely that
+ * only A is out of date (it's the new device). In this case, we'll
+ * read from C, then use the data to resilver A+B -- but we don't
+ * actually want to resilver B, just A. The top-level mirror has no
+ * way to know this, so instead we just discard unnecessary repairs
+ * as we work our way down the vdev tree.
+ *
+ * 3. ZTEST also creates mirrors of mirrors, mirrors of raidz, etc.
+ * The same logic applies to any form of nested replication: ditto
+ * + mirror, RAID-Z + replacing, etc.
+ *
+ * However, indirect vdevs point off to other vdevs which may have
+ * DTL's, so we never bypass them. The child i/os on concrete vdevs
+ * will be properly bypassed instead.
+ *
+ * Leaf DTL_PARTIAL can be empty when a legitimate write comes from
+ * a dRAID spare vdev. For example, when a dRAID spare is first
+ * used, its spare blocks need to be written to, but the leaf vdevs
+ * of such blocks can have an empty DTL_PARTIAL.
+ *
+ * There seemed to be no clean way to allow such writes while bypassing
+ * spurious ones. At this point, just avoid all bypassing for dRAID
+ * for correctness.
+ */
+ if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) &&
+ !(zio->io_flags & ZIO_FLAG_SELF_HEAL) &&
+ zio->io_txg != 0 && /* not a delegated i/o */
+ vd->vdev_ops != &vdev_indirect_ops &&
+ vd->vdev_top->vdev_ops != &vdev_draid_ops &&
+ !vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) {
+ ASSERT(zio->io_type == ZIO_TYPE_WRITE);
+ zio_vdev_io_bypass(zio);
+ return (zio);
+ }
+
+ /*
+ * Select the next best leaf I/O to process. Distributed spares are
+ * excluded since they dispatch the I/O directly to a leaf vdev after
+ * applying the dRAID mapping.
+ */
+ if (vd->vdev_ops->vdev_op_leaf &&
+ vd->vdev_ops != &vdev_draid_spare_ops &&
+ (zio->io_type == ZIO_TYPE_READ ||
+ zio->io_type == ZIO_TYPE_WRITE ||
+ zio->io_type == ZIO_TYPE_TRIM)) {
+
+ if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio))
+ return (zio);
+
+ if ((zio = vdev_queue_io(zio)) == NULL)
+ return (NULL);
+
+ if (!vdev_accessible(vd, zio)) {
+ zio->io_error = SET_ERROR(ENXIO);
+ zio_interrupt(zio);
+ return (NULL);
+ }
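+ /* Record the issue time; zio_vdev_io_done() converts it to a latency. */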
+ zio->io_delay = gethrtime();
+ }
+
+ vd->vdev_ops->vdev_op_io_start(zio);
+ return (NULL);
+}
+
+static zio_t *
+zio_vdev_io_done(zio_t *zio)
+{
+ vdev_t *vd = zio->io_vd;
+ vdev_ops_t *ops = vd ? vd->vdev_ops : &vdev_mirror_ops;
+ boolean_t unexpected_error = B_FALSE;
+
+ if (zio_wait_for_children(zio, ZIO_CHILD_VDEV_BIT, ZIO_WAIT_DONE)) {
+ return (NULL);
+ }
+
+ ASSERT(zio->io_type == ZIO_TYPE_READ ||
+ zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_TRIM);
+
+ if (zio->io_delay)
+ zio->io_delay = gethrtime() - zio->io_delay;
+
+ if (vd != NULL && vd->vdev_ops->vdev_op_leaf &&
+ vd->vdev_ops != &vdev_draid_spare_ops) {
+ vdev_queue_io_done(zio);
+
+ if (zio->io_type == ZIO_TYPE_WRITE)
+ vdev_cache_write(zio);
+
+ if (zio_injection_enabled && zio->io_error == 0)
+ zio->io_error = zio_handle_device_injections(vd, zio,
+ EIO, EILSEQ);
+
+ if (zio_injection_enabled && zio->io_error == 0)
+ zio->io_error = zio_handle_label_injection(zio, EIO);
+
+ if (zio->io_error && zio->io_type != ZIO_TYPE_TRIM) {
+ if (!vdev_accessible(vd, zio)) {
+ zio->io_error = SET_ERROR(ENXIO);
+ } else {
+ unexpected_error = B_TRUE;
+ }
+ }
+ }
+
+ ops->vdev_op_io_done(zio);
+
+ if (unexpected_error)
+ VERIFY(vdev_probe(vd, zio) == NULL);
+
+ return (zio);
+}
+
+/*
+ * This function is used to change the priority of an existing zio that is
+ * currently in-flight. This is used by the arc to upgrade priority in the
+ * event that a demand read is made for a block that is currently queued
+ * as a scrub or async read IO. Otherwise, the high priority read request
+ * would end up having to wait for the lower priority IO.
+ */
+void
+zio_change_priority(zio_t *pio, zio_priority_t priority)
+{
+ zio_t *cio, *cio_next;
+ zio_link_t *zl = NULL;
+
+ ASSERT3U(priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
+
+ if (pio->io_vd != NULL && pio->io_vd->vdev_ops->vdev_op_leaf) {
+ vdev_queue_change_io_priority(pio, priority);
+ } else {
+ pio->io_priority = priority;
+ }
+
+ mutex_enter(&pio->io_lock);
+ for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) {
+ cio_next = zio_walk_children(pio, &zl);
+ zio_change_priority(cio, priority);
+ }
+ mutex_exit(&pio->io_lock);
+}
+
+/*
+ * For non-raidz ZIOs, we can just copy aside the bad data read from the
+ * disk, and use that to finish the checksum ereport later.
+ */
+static void
+zio_vsd_default_cksum_finish(zio_cksum_report_t *zcr,
+ const abd_t *good_buf)
+{
+ /* no processing needed */
+ zfs_ereport_finish_checksum(zcr, good_buf, zcr->zcr_cbdata, B_FALSE);
+}
+
+/*ARGSUSED*/
+void
+zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *ignored)
+{
+ void *abd = abd_alloc_sametype(zio->io_abd, zio->io_size);
+
+ abd_copy(abd, zio->io_abd, zio->io_size);
+
+ zcr->zcr_cbinfo = zio->io_size;
+ zcr->zcr_cbdata = abd;
+ zcr->zcr_finish = zio_vsd_default_cksum_finish;
+ zcr->zcr_free = zio_abd_free;
+}
+
+static zio_t *
+zio_vdev_io_assess(zio_t *zio)
+{
+ vdev_t *vd = zio->io_vd;
+
+ if (zio_wait_for_children(zio, ZIO_CHILD_VDEV_BIT, ZIO_WAIT_DONE)) {
+ return (NULL);
+ }
+
+ if (vd == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_WRITER))
+ spa_config_exit(zio->io_spa, SCL_ZIO, zio);
+
+ if (zio->io_vsd != NULL) {
+ zio->io_vsd_ops->vsd_free(zio);
+ zio->io_vsd = NULL;
+ }
+
+ if (zio_injection_enabled && zio->io_error == 0)
+ zio->io_error = zio_handle_fault_injection(zio, EIO);
+
+ /*
+ * If the I/O failed, determine whether we should attempt to retry it.
+ *
+ * On retry, we cut in line in the issue queue, since we don't want
+ * compression/checksumming/etc. work to prevent our (cheap) IO reissue.
+ */
+ if (zio->io_error && vd == NULL &&
+ !(zio->io_flags & (ZIO_FLAG_DONT_RETRY | ZIO_FLAG_IO_RETRY))) {
+ ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_QUEUE)); /* not a leaf */
+ ASSERT(!(zio->io_flags & ZIO_FLAG_IO_BYPASS)); /* not a leaf */
+ zio->io_error = 0;
+ zio->io_flags |= ZIO_FLAG_IO_RETRY |
+ ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE;
+ zio->io_stage = ZIO_STAGE_VDEV_IO_START >> 1;
+ zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE,
+ zio_requeue_io_start_cut_in_line);
+ return (NULL);
+ }
+
+ /*
+ * If we got an error on a leaf device, convert it to ENXIO
+ * if the device is not accessible at all.
+ */
+ if (zio->io_error && vd != NULL && vd->vdev_ops->vdev_op_leaf &&
+ !vdev_accessible(vd, zio))
+ zio->io_error = SET_ERROR(ENXIO);
+
+ /*
+ * If we can't write to an interior vdev (mirror or RAID-Z),
+ * set vdev_cant_write so that we stop trying to allocate from it.
+ */
+ if (zio->io_error == ENXIO && zio->io_type == ZIO_TYPE_WRITE &&
+ vd != NULL && !vd->vdev_ops->vdev_op_leaf) {
+ vd->vdev_cant_write = B_TRUE;
+ }
+
+ /*
+ * If a cache flush returns ENOTSUP or ENOTTY, we know that no future
+ * attempts will ever succeed. In this case we set a persistent
+ * boolean flag so that we don't bother with it in the future.
+ */
+ if ((zio->io_error == ENOTSUP || zio->io_error == ENOTTY) &&
+ zio->io_type == ZIO_TYPE_IOCTL &&
+ zio->io_cmd == DKIOCFLUSHWRITECACHE && vd != NULL)
+ vd->vdev_nowritecache = B_TRUE;
+
+ if (zio->io_error)
+ zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
+
+ if (vd != NULL && vd->vdev_ops->vdev_op_leaf &&
+ zio->io_physdone != NULL) {
+ ASSERT(!(zio->io_flags & ZIO_FLAG_DELEGATED));
+ ASSERT(zio->io_child_type == ZIO_CHILD_VDEV);
+ zio->io_physdone(zio->io_logical);
+ }
+
+ return (zio);
+}
+
+void
+zio_vdev_io_reissue(zio_t *zio)
+{
+ ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
+ ASSERT(zio->io_error == 0);
+
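+ /* Back up one stage so the pipeline re-executes VDEV_IO_START. */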
+ zio->io_stage >>= 1;
+}
+
+void
+zio_vdev_io_redone(zio_t *zio)
+{
+ ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE);
+
+ zio->io_stage >>= 1;
+}
+
+void
+zio_vdev_io_bypass(zio_t *zio)
+{
+ ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
+ ASSERT(zio->io_error == 0);
+
+ zio->io_flags |= ZIO_FLAG_IO_BYPASS;
+ zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS >> 1;
+}
+
+/*
+ * ==========================================================================
+ * Encrypt and store encryption parameters
+ * ==========================================================================
+ */
+
+
+/*
+ * This function is used for ZIO_STAGE_ENCRYPT. It is responsible for
+ * managing the storage of encryption parameters and passing them to the
+ * lower-level encryption functions.
+ */
+static zio_t *
+zio_encrypt(zio_t *zio)
+{
+ zio_prop_t *zp = &zio->io_prop;
+ spa_t *spa = zio->io_spa;
+ blkptr_t *bp = zio->io_bp;
+ uint64_t psize = BP_GET_PSIZE(bp);
+ uint64_t dsobj = zio->io_bookmark.zb_objset;
+ dmu_object_type_t ot = BP_GET_TYPE(bp);
+ void *enc_buf = NULL;
+ abd_t *eabd = NULL;
+ uint8_t salt[ZIO_DATA_SALT_LEN];
+ uint8_t iv[ZIO_DATA_IV_LEN];
+ uint8_t mac[ZIO_DATA_MAC_LEN];
+ boolean_t no_crypt = B_FALSE;
+
+ /* the root zio already encrypted the data */
+ if (zio->io_child_type == ZIO_CHILD_GANG)
+ return (zio);
+
+ /* only ZIL blocks are re-encrypted on rewrite */
+ if (!IO_IS_ALLOCATING(zio) && ot != DMU_OT_INTENT_LOG)
+ return (zio);
+
+ if (!(zp->zp_encrypt || BP_IS_ENCRYPTED(bp))) {
+ BP_SET_CRYPT(bp, B_FALSE);
+ return (zio);
+ }
+
+ /* if we are doing raw encryption set the provided encryption params */
+ if (zio->io_flags & ZIO_FLAG_RAW_ENCRYPT) {
+ ASSERT0(BP_GET_LEVEL(bp));
+ BP_SET_CRYPT(bp, B_TRUE);
+ BP_SET_BYTEORDER(bp, zp->zp_byteorder);
+ if (ot != DMU_OT_OBJSET)
+ zio_crypt_encode_mac_bp(bp, zp->zp_mac);
+
+ /* dnode blocks must be written out in the provided byteorder */
+ if (zp->zp_byteorder != ZFS_HOST_BYTEORDER &&
+ ot == DMU_OT_DNODE) {
+ void *bswap_buf = zio_buf_alloc(psize);
+ abd_t *babd = abd_get_from_buf(bswap_buf, psize);
+
+ ASSERT3U(BP_GET_COMPRESS(bp), ==, ZIO_COMPRESS_OFF);
+ abd_copy_to_buf(bswap_buf, zio->io_abd, psize);
+ dmu_ot_byteswap[DMU_OT_BYTESWAP(ot)].ob_func(bswap_buf,
+ psize);
+
+ abd_take_ownership_of_buf(babd, B_TRUE);
+ zio_push_transform(zio, babd, psize, psize, NULL);
+ }
+
+ if (DMU_OT_IS_ENCRYPTED(ot))
+ zio_crypt_encode_params_bp(bp, zp->zp_salt, zp->zp_iv);
+ return (zio);
+ }
+
+ /* indirect blocks only maintain a cksum of the lower level MACs */
+ if (BP_GET_LEVEL(bp) > 0) {
+ BP_SET_CRYPT(bp, B_TRUE);
+ VERIFY0(zio_crypt_do_indirect_mac_checksum_abd(B_TRUE,
+ zio->io_orig_abd, BP_GET_LSIZE(bp), BP_SHOULD_BYTESWAP(bp),
+ mac));
+ zio_crypt_encode_mac_bp(bp, mac);
+ return (zio);
+ }
+
+ /*
+	 * Objset blocks are a special case since they have two 256-bit MACs
+ * embedded within them.
+ */
+ if (ot == DMU_OT_OBJSET) {
+ ASSERT0(DMU_OT_IS_ENCRYPTED(ot));
+ ASSERT3U(BP_GET_COMPRESS(bp), ==, ZIO_COMPRESS_OFF);
+ BP_SET_CRYPT(bp, B_TRUE);
+ VERIFY0(spa_do_crypt_objset_mac_abd(B_TRUE, spa, dsobj,
+ zio->io_abd, psize, BP_SHOULD_BYTESWAP(bp)));
+ return (zio);
+ }
+
+ /* unencrypted object types are only authenticated with a MAC */
+ if (!DMU_OT_IS_ENCRYPTED(ot)) {
+ BP_SET_CRYPT(bp, B_TRUE);
+ VERIFY0(spa_do_crypt_mac_abd(B_TRUE, spa, dsobj,
+ zio->io_abd, psize, mac));
+ zio_crypt_encode_mac_bp(bp, mac);
+ return (zio);
+ }
+
+ /*
+ * Later passes of sync-to-convergence may decide to rewrite data
+ * in place to avoid more disk reallocations. This presents a problem
+ * for encryption because this constitutes rewriting the new data with
+ * the same encryption key and IV. However, this only applies to blocks
+ * in the MOS (particularly the spacemaps) and we do not encrypt the
+ * MOS. We assert that the zio is allocating or an intent log write
+ * to enforce this.
+ */
+ ASSERT(IO_IS_ALLOCATING(zio) || ot == DMU_OT_INTENT_LOG);
+ ASSERT(BP_GET_LEVEL(bp) == 0 || ot == DMU_OT_INTENT_LOG);
+ ASSERT(spa_feature_is_active(spa, SPA_FEATURE_ENCRYPTION));
+ ASSERT3U(psize, !=, 0);
+
+ enc_buf = zio_buf_alloc(psize);
+ eabd = abd_get_from_buf(enc_buf, psize);
+ abd_take_ownership_of_buf(eabd, B_TRUE);
+
+ /*
+ * For an explanation of what encryption parameters are stored
+ * where, see the block comment in zio_crypt.c.
+ */
+ if (ot == DMU_OT_INTENT_LOG) {
+ zio_crypt_decode_params_bp(bp, salt, iv);
+ } else {
+ BP_SET_CRYPT(bp, B_TRUE);
+ }
+
+ /* Perform the encryption. This should not fail */
+ VERIFY0(spa_do_crypt_abd(B_TRUE, spa, &zio->io_bookmark,
+ BP_GET_TYPE(bp), BP_GET_DEDUP(bp), BP_SHOULD_BYTESWAP(bp),
+ salt, iv, mac, psize, zio->io_abd, eabd, &no_crypt));
+
+ /* encode encryption metadata into the bp */
+ if (ot == DMU_OT_INTENT_LOG) {
+ /*
+ * ZIL blocks store the MAC in the embedded checksum, so the
+ * transform must always be applied.
+ */
+ zio_crypt_encode_mac_zil(enc_buf, mac);
+ zio_push_transform(zio, eabd, psize, psize, NULL);
+ } else {
+ BP_SET_CRYPT(bp, B_TRUE);
+ zio_crypt_encode_params_bp(bp, salt, iv);
+ zio_crypt_encode_mac_bp(bp, mac);
+
+ if (no_crypt) {
+ ASSERT3U(ot, ==, DMU_OT_DNODE);
+ abd_free(eabd);
+ } else {
+ zio_push_transform(zio, eabd, psize, psize, NULL);
+ }
+ }
+
+ return (zio);
+}
+
+/*
+ * ==========================================================================
+ * Generate and verify checksums
+ * ==========================================================================
+ */
+static zio_t *
+zio_checksum_generate(zio_t *zio)
+{
+ blkptr_t *bp = zio->io_bp;
+ enum zio_checksum checksum;
+
+ if (bp == NULL) {
+ /*
+ * This is zio_write_phys().
+ * We're either generating a label checksum, or none at all.
+ */
+ checksum = zio->io_prop.zp_checksum;
+
+ if (checksum == ZIO_CHECKSUM_OFF)
+ return (zio);
+
+ ASSERT(checksum == ZIO_CHECKSUM_LABEL);
+ } else {
+ if (BP_IS_GANG(bp) && zio->io_child_type == ZIO_CHILD_GANG) {
+ ASSERT(!IO_IS_ALLOCATING(zio));
+ checksum = ZIO_CHECKSUM_GANG_HEADER;
+ } else {
+ checksum = BP_GET_CHECKSUM(bp);
+ }
+ }
+
+ zio_checksum_compute(zio, checksum, zio->io_abd, zio->io_size);
+
+ return (zio);
+}
+
+static zio_t *
+zio_checksum_verify(zio_t *zio)
+{
+ zio_bad_cksum_t info;
+ blkptr_t *bp = zio->io_bp;
+ int error;
+
+ ASSERT(zio->io_vd != NULL);
+
+ if (bp == NULL) {
+ /*
+ * This is zio_read_phys().
+ * We're either verifying a label checksum, or nothing at all.
+ */
+ if (zio->io_prop.zp_checksum == ZIO_CHECKSUM_OFF)
+ return (zio);
+
+ ASSERT3U(zio->io_prop.zp_checksum, ==, ZIO_CHECKSUM_LABEL);
+ }
+
+ if ((error = zio_checksum_error(zio, &info)) != 0) {
+ zio->io_error = error;
+ if (error == ECKSUM &&
+ !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
+ int ret = zfs_ereport_start_checksum(zio->io_spa,
+ zio->io_vd, &zio->io_bookmark, zio,
+ zio->io_offset, zio->io_size, NULL, &info);
+
+ if (ret != EALREADY) {
+ mutex_enter(&zio->io_vd->vdev_stat_lock);
+ zio->io_vd->vdev_stat.vs_checksum_errors++;
+ mutex_exit(&zio->io_vd->vdev_stat_lock);
+ }
+ }
+ }
+
+ return (zio);
+}
+
+/*
+ * Called by RAID-Z to ensure we don't compute the checksum twice.
+ */
+void
+zio_checksum_verified(zio_t *zio)
+{
+ zio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
+}
+
+/*
+ * ==========================================================================
+ * Error rank. Errors are ranked in the order 0, ENXIO, ECKSUM, EIO, other.
+ * An error of 0 indicates success. ENXIO indicates whole-device failure,
+ * which may be transient (e.g. unplugged) or permanent. ECKSUM and EIO
+ * indicate errors that are specific to one I/O, and most likely permanent.
+ * Any other error is presumed to be worse because we weren't expecting it.
+ * ==========================================================================
+ */
+int
+zio_worst_error(int e1, int e2)
+{
+ static int zio_error_rank[] = { 0, ENXIO, ECKSUM, EIO };
+ int r1, r2;
+
+ for (r1 = 0; r1 < sizeof (zio_error_rank) / sizeof (int); r1++)
+ if (e1 == zio_error_rank[r1])
+ break;
+
+ for (r2 = 0; r2 < sizeof (zio_error_rank) / sizeof (int); r2++)
+ if (e2 == zio_error_rank[r2])
+ break;
+
+ return (r1 > r2 ? e1 : e2);
+}
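+
+/*
+ * Worked example (illustrative only), using the rank table
+ * { 0, ENXIO, ECKSUM, EIO } above:
+ *
+ *	zio_worst_error(ENXIO, ECKSUM) returns ECKSUM  (rank 2 beats rank 1)
+ *	zio_worst_error(EIO, 0)        returns EIO     (rank 3 beats rank 0)
+ *	zio_worst_error(EINVAL, EIO)   returns EINVAL  (an error missing from
+ *	    the table falls off the end with the highest rank, so it is
+ *	    presumed worst)
+ */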
+
+/*
+ * ==========================================================================
+ * I/O completion
+ * ==========================================================================
+ */
+static zio_t *
+zio_ready(zio_t *zio)
+{
+ blkptr_t *bp = zio->io_bp;
+ zio_t *pio, *pio_next;
+ zio_link_t *zl = NULL;
+
+ if (zio_wait_for_children(zio, ZIO_CHILD_GANG_BIT | ZIO_CHILD_DDT_BIT,
+ ZIO_WAIT_READY)) {
+ return (NULL);
+ }
+
+ if (zio->io_ready) {
+ ASSERT(IO_IS_ALLOCATING(zio));
+ ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp) ||
+ (zio->io_flags & ZIO_FLAG_NOPWRITE));
+ ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0);
+
+ zio->io_ready(zio);
+ }
+
+ if (bp != NULL && bp != &zio->io_bp_copy)
+ zio->io_bp_copy = *bp;
+
+ if (zio->io_error != 0) {
+ zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
+
+ if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
+ ASSERT(IO_IS_ALLOCATING(zio));
+ ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
+ ASSERT(zio->io_metaslab_class != NULL);
+
+ /*
+			 * We were unable to allocate anything, so unreserve and
+ * issue the next I/O to allocate.
+ */
+ metaslab_class_throttle_unreserve(
+ zio->io_metaslab_class, zio->io_prop.zp_copies,
+ zio->io_allocator, zio);
+ zio_allocate_dispatch(zio->io_spa, zio->io_allocator);
+ }
+ }
+
+ mutex_enter(&zio->io_lock);
+ zio->io_state[ZIO_WAIT_READY] = 1;
+ pio = zio_walk_parents(zio, &zl);
+ mutex_exit(&zio->io_lock);
+
+ /*
+ * As we notify zio's parents, new parents could be added.
+ * New parents go to the head of zio's io_parent_list, however,
+ * so we will (correctly) not notify them. The remainder of zio's
+ * io_parent_list, from 'pio_next' onward, cannot change because
+ * all parents must wait for us to be done before they can be done.
+ */
+ for (; pio != NULL; pio = pio_next) {
+ pio_next = zio_walk_parents(zio, &zl);
+ zio_notify_parent(pio, zio, ZIO_WAIT_READY, NULL);
+ }
+
+ if (zio->io_flags & ZIO_FLAG_NODATA) {
+ if (BP_IS_GANG(bp)) {
+ zio->io_flags &= ~ZIO_FLAG_NODATA;
+ } else {
+ ASSERT((uintptr_t)zio->io_abd < SPA_MAXBLOCKSIZE);
+ zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
+ }
+ }
+
+ if (zio_injection_enabled &&
+ zio->io_spa->spa_syncing_txg == zio->io_txg)
+ zio_handle_ignored_writes(zio);
+
+ return (zio);
+}
+
+/*
+ * Update the allocation throttle accounting.
+ */
+static void
+zio_dva_throttle_done(zio_t *zio)
+{
+ zio_t *lio __maybe_unused = zio->io_logical;
+ zio_t *pio = zio_unique_parent(zio);
+ vdev_t *vd = zio->io_vd;
+ int flags = METASLAB_ASYNC_ALLOC;
+
+ ASSERT3P(zio->io_bp, !=, NULL);
+ ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
+ ASSERT3U(zio->io_priority, ==, ZIO_PRIORITY_ASYNC_WRITE);
+ ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV);
+ ASSERT(vd != NULL);
+ ASSERT3P(vd, ==, vd->vdev_top);
+ ASSERT(zio_injection_enabled || !(zio->io_flags & ZIO_FLAG_IO_RETRY));
+ ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REPAIR));
+ ASSERT(zio->io_flags & ZIO_FLAG_IO_ALLOCATING);
+ ASSERT(!(lio->io_flags & ZIO_FLAG_IO_REWRITE));
+ ASSERT(!(lio->io_orig_flags & ZIO_FLAG_NODATA));
+
+ /*
+ * Parents of gang children can have two flavors -- ones that
+ * allocated the gang header (will have ZIO_FLAG_IO_REWRITE set)
+ * and ones that allocated the constituent blocks. The allocation
+ * throttle needs to know the allocating parent zio so we must find
+ * it here.
+ */
+ if (pio->io_child_type == ZIO_CHILD_GANG) {
+ /*
+ * If our parent is a rewrite gang child then our grandparent
+ * would have been the one that performed the allocation.
+ */
+ if (pio->io_flags & ZIO_FLAG_IO_REWRITE)
+ pio = zio_unique_parent(pio);
+ flags |= METASLAB_GANG_CHILD;
+ }
+
+ ASSERT(IO_IS_ALLOCATING(pio));
+ ASSERT3P(zio, !=, zio->io_logical);
+ ASSERT(zio->io_logical != NULL);
+ ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REPAIR));
+ ASSERT0(zio->io_flags & ZIO_FLAG_NOPWRITE);
+ ASSERT(zio->io_metaslab_class != NULL);
+
+ mutex_enter(&pio->io_lock);
+ metaslab_group_alloc_decrement(zio->io_spa, vd->vdev_id, pio, flags,
+ pio->io_allocator, B_TRUE);
+ mutex_exit(&pio->io_lock);
+
+ metaslab_class_throttle_unreserve(zio->io_metaslab_class, 1,
+ pio->io_allocator, pio);
+
+ /*
+ * Call into the pipeline to see if there is more work that
+ * needs to be done. If there is work to be done it will be
+ * dispatched to another taskq thread.
+ */
+ zio_allocate_dispatch(zio->io_spa, pio->io_allocator);
+}
+
+static zio_t *
+zio_done(zio_t *zio)
+{
+ /*
+ * Always attempt to keep stack usage minimal here since
+ * we can be called recursively up to 19 levels deep.
+ */
+ const uint64_t psize = zio->io_size;
+ zio_t *pio, *pio_next;
+ zio_link_t *zl = NULL;
+
+ /*
+ * If our children haven't all completed,
+ * wait for them and then repeat this pipeline stage.
+ */
+ if (zio_wait_for_children(zio, ZIO_CHILD_ALL_BITS, ZIO_WAIT_DONE)) {
+ return (NULL);
+ }
+
+ /*
+ * If the allocation throttle is enabled, then update the accounting.
+ * We only track child I/Os that are part of an allocating async
+ * write. We must do this since the allocation is performed
+ * by the logical I/O but the actual write is done by child I/Os.
+ */
+ if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING &&
+ zio->io_child_type == ZIO_CHILD_VDEV) {
+ ASSERT(zio->io_metaslab_class != NULL);
+ ASSERT(zio->io_metaslab_class->mc_alloc_throttle_enabled);
+ zio_dva_throttle_done(zio);
+ }
+
+ /*
+ * If the allocation throttle is enabled, verify that
+ * we have decremented the refcounts for every I/O that was throttled.
+ */
+ if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
+ ASSERT(zio->io_type == ZIO_TYPE_WRITE);
+ ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
+ ASSERT(zio->io_bp != NULL);
+
+ metaslab_group_alloc_verify(zio->io_spa, zio->io_bp, zio,
+ zio->io_allocator);
+ VERIFY(zfs_refcount_not_held(&zio->io_metaslab_class->
+ mc_allocator[zio->io_allocator].mca_alloc_slots, zio));
+ }
+
+ for (int c = 0; c < ZIO_CHILD_TYPES; c++)
+ for (int w = 0; w < ZIO_WAIT_TYPES; w++)
+ ASSERT(zio->io_children[c][w] == 0);
+
+ if (zio->io_bp != NULL && !BP_IS_EMBEDDED(zio->io_bp)) {
+ ASSERT(zio->io_bp->blk_pad[0] == 0);
+ ASSERT(zio->io_bp->blk_pad[1] == 0);
+ ASSERT(bcmp(zio->io_bp, &zio->io_bp_copy,
+ sizeof (blkptr_t)) == 0 ||
+ (zio->io_bp == zio_unique_parent(zio)->io_bp));
+ if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(zio->io_bp) &&
+ zio->io_bp_override == NULL &&
+ !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) {
+ ASSERT3U(zio->io_prop.zp_copies, <=,
+ BP_GET_NDVAS(zio->io_bp));
+ ASSERT(BP_COUNT_GANG(zio->io_bp) == 0 ||
+ (BP_COUNT_GANG(zio->io_bp) ==
+ BP_GET_NDVAS(zio->io_bp)));
+ }
+ if (zio->io_flags & ZIO_FLAG_NOPWRITE)
+ VERIFY(BP_EQUAL(zio->io_bp, &zio->io_bp_orig));
+ }
+
+ /*
+ * If there were child vdev/gang/ddt errors, they apply to us now.
+ */
+ zio_inherit_child_errors(zio, ZIO_CHILD_VDEV);
+ zio_inherit_child_errors(zio, ZIO_CHILD_GANG);
+ zio_inherit_child_errors(zio, ZIO_CHILD_DDT);
+
+ /*
+ * If the I/O on the transformed data was successful, generate any
+ * checksum reports now while we still have the transformed data.
+ */
+ if (zio->io_error == 0) {
+ while (zio->io_cksum_report != NULL) {
+ zio_cksum_report_t *zcr = zio->io_cksum_report;
+ uint64_t align = zcr->zcr_align;
+ uint64_t asize = P2ROUNDUP(psize, align);
+ abd_t *adata = zio->io_abd;
+
+ if (asize != psize) {
+ adata = abd_alloc(asize, B_TRUE);
+ abd_copy(adata, zio->io_abd, psize);
+ abd_zero_off(adata, psize, asize - psize);
+ }
+
+ zio->io_cksum_report = zcr->zcr_next;
+ zcr->zcr_next = NULL;
+ zcr->zcr_finish(zcr, adata);
+ zfs_ereport_free_checksum(zcr);
+
+ if (asize != psize)
+ abd_free(adata);
+ }
+ }
+
+ zio_pop_transforms(zio); /* note: may set zio->io_error */
+
+ vdev_stat_update(zio, psize);
+
+ /*
+	 * If this I/O is attached to a particular vdev and was slow to
+	 * complete, exceeding zio_slow_io_ms milliseconds, post an error
+	 * describing the I/O delay.
+ * We ignore these errors if the device is currently unavailable.
+ */
+ if (zio->io_delay >= MSEC2NSEC(zio_slow_io_ms)) {
+ if (zio->io_vd != NULL && !vdev_is_dead(zio->io_vd)) {
+ /*
+ * We want to only increment our slow IO counters if
+ * the IO is valid (i.e. not if the drive is removed).
+ *
+ * zfs_ereport_post() will also do these checks, but
+ * it can also ratelimit and have other failures, so we
+ * need to increment the slow_io counters independent
+ * of it.
+ */
+ if (zfs_ereport_is_valid(FM_EREPORT_ZFS_DELAY,
+ zio->io_spa, zio->io_vd, zio)) {
+ mutex_enter(&zio->io_vd->vdev_stat_lock);
+ zio->io_vd->vdev_stat.vs_slow_ios++;
+ mutex_exit(&zio->io_vd->vdev_stat_lock);
+
+ (void) zfs_ereport_post(FM_EREPORT_ZFS_DELAY,
+ zio->io_spa, zio->io_vd, &zio->io_bookmark,
+ zio, 0);
+ }
+ }
+ }
+
+ if (zio->io_error) {
+ /*
+ * If this I/O is attached to a particular vdev,
+ * generate an error message describing the I/O failure
+ * at the block level. We ignore these errors if the
+ * device is currently unavailable.
+ */
+ if (zio->io_error != ECKSUM && zio->io_vd != NULL &&
+ !vdev_is_dead(zio->io_vd)) {
+ int ret = zfs_ereport_post(FM_EREPORT_ZFS_IO,
+ zio->io_spa, zio->io_vd, &zio->io_bookmark, zio, 0);
+ if (ret != EALREADY) {
+ mutex_enter(&zio->io_vd->vdev_stat_lock);
+ if (zio->io_type == ZIO_TYPE_READ)
+ zio->io_vd->vdev_stat.vs_read_errors++;
+ else if (zio->io_type == ZIO_TYPE_WRITE)
+ zio->io_vd->vdev_stat.vs_write_errors++;
+ mutex_exit(&zio->io_vd->vdev_stat_lock);
+ }
+ }
+
+ if ((zio->io_error == EIO || !(zio->io_flags &
+ (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) &&
+ zio == zio->io_logical) {
+ /*
+ * For logical I/O requests, tell the SPA to log the
+ * error and generate a logical data ereport.
+ */
+ spa_log_error(zio->io_spa, &zio->io_bookmark);
+ (void) zfs_ereport_post(FM_EREPORT_ZFS_DATA,
+ zio->io_spa, NULL, &zio->io_bookmark, zio, 0);
+ }
+ }
+
+ if (zio->io_error && zio == zio->io_logical) {
+ /*
+ * Determine whether zio should be reexecuted. This will
+ * propagate all the way to the root via zio_notify_parent().
+ */
+ ASSERT(zio->io_vd == NULL && zio->io_bp != NULL);
+ ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
+
+ if (IO_IS_ALLOCATING(zio) &&
+ !(zio->io_flags & ZIO_FLAG_CANFAIL)) {
+ if (zio->io_error != ENOSPC)
+ zio->io_reexecute |= ZIO_REEXECUTE_NOW;
+ else
+ zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
+ }
+
+ if ((zio->io_type == ZIO_TYPE_READ ||
+ zio->io_type == ZIO_TYPE_FREE) &&
+ !(zio->io_flags & ZIO_FLAG_SCAN_THREAD) &&
+ zio->io_error == ENXIO &&
+ spa_load_state(zio->io_spa) == SPA_LOAD_NONE &&
+ spa_get_failmode(zio->io_spa) != ZIO_FAILURE_MODE_CONTINUE)
+ zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
+
+ if (!(zio->io_flags & ZIO_FLAG_CANFAIL) && !zio->io_reexecute)
+ zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
+
+ /*
+ * Here is a possibly good place to attempt to do
+ * either combinatorial reconstruction or error correction
+ * based on checksums. It also might be a good place
+ * to send out preliminary ereports before we suspend
+ * processing.
+ */
+ }
+
+ /*
+ * If there were logical child errors, they apply to us now.
+ * We defer this until now to avoid conflating logical child
+ * errors with errors that happened to the zio itself when
+ * updating vdev stats and reporting FMA events above.
+ */
+ zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL);
+
+ if ((zio->io_error || zio->io_reexecute) &&
+ IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio &&
+ !(zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)))
+ zio_dva_unallocate(zio, zio->io_gang_tree, zio->io_bp);
+
+ zio_gang_tree_free(&zio->io_gang_tree);
+
+ /*
+ * Godfather I/Os should never suspend.
+ */
+ if ((zio->io_flags & ZIO_FLAG_GODFATHER) &&
+ (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND))
+ zio->io_reexecute &= ~ZIO_REEXECUTE_SUSPEND;
+
+ if (zio->io_reexecute) {
+ /*
+ * This is a logical I/O that wants to reexecute.
+ *
+ * Reexecute is top-down. When an i/o fails, if it's not
+ * the root, it simply notifies its parent and sticks around.
+ * The parent, seeing that it still has children in zio_done(),
+ * does the same. This percolates all the way up to the root.
+ * The root i/o will reexecute or suspend the entire tree.
+ *
+ * This approach ensures that zio_reexecute() honors
+ * all the original i/o dependency relationships, e.g.
+ * parents not executing until children are ready.
+ */
+ ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
+
+ zio->io_gang_leader = NULL;
+
+ mutex_enter(&zio->io_lock);
+ zio->io_state[ZIO_WAIT_DONE] = 1;
+ mutex_exit(&zio->io_lock);
+
+ /*
+ * "The Godfather" I/O monitors its children but is
+ * not a true parent to them. It will track them through
+ * the pipeline but severs its ties whenever they get into
+ * trouble (e.g. suspended). This allows "The Godfather"
+ * I/O to return status without blocking.
+ */
+ zl = NULL;
+ for (pio = zio_walk_parents(zio, &zl); pio != NULL;
+ pio = pio_next) {
+ zio_link_t *remove_zl = zl;
+ pio_next = zio_walk_parents(zio, &zl);
+
+ if ((pio->io_flags & ZIO_FLAG_GODFATHER) &&
+ (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) {
+ zio_remove_child(pio, zio, remove_zl);
+ /*
+ * This is a rare code path, so we don't
+ * bother with "next_to_execute".
+ */
+ zio_notify_parent(pio, zio, ZIO_WAIT_DONE,
+ NULL);
+ }
+ }
+
+ if ((pio = zio_unique_parent(zio)) != NULL) {
+ /*
+ * We're not a root i/o, so there's nothing to do
+ * but notify our parent. Don't propagate errors
+ * upward since we haven't permanently failed yet.
+ */
+ ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER));
+ zio->io_flags |= ZIO_FLAG_DONT_PROPAGATE;
+ /*
+ * This is a rare code path, so we don't bother with
+ * "next_to_execute".
+ */
+ zio_notify_parent(pio, zio, ZIO_WAIT_DONE, NULL);
+ } else if (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND) {
+ /*
+ * We'd fail again if we reexecuted now, so suspend
+ * until conditions improve (e.g. device comes online).
+ */
+ zio_suspend(zio->io_spa, zio, ZIO_SUSPEND_IOERR);
+ } else {
+ /*
+ * Reexecution is potentially a huge amount of work.
+ * Hand it off to the otherwise-unused claim taskq.
+ */
+ ASSERT(taskq_empty_ent(&zio->io_tqent));
+ spa_taskq_dispatch_ent(zio->io_spa,
+ ZIO_TYPE_CLAIM, ZIO_TASKQ_ISSUE,
+ (task_func_t *)zio_reexecute, zio, 0,
+ &zio->io_tqent);
+ }
+ return (NULL);
+ }
+
+ ASSERT(zio->io_child_count == 0);
+ ASSERT(zio->io_reexecute == 0);
+ ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL));
+
+ /*
+ * Report any checksum errors, since the I/O is complete.
+ */
+ while (zio->io_cksum_report != NULL) {
+ zio_cksum_report_t *zcr = zio->io_cksum_report;
+ zio->io_cksum_report = zcr->zcr_next;
+ zcr->zcr_next = NULL;
+ zcr->zcr_finish(zcr, NULL);
+ zfs_ereport_free_checksum(zcr);
+ }
+
+ if (zio->io_flags & ZIO_FLAG_FASTWRITE && zio->io_bp &&
+ !BP_IS_HOLE(zio->io_bp) && !BP_IS_EMBEDDED(zio->io_bp) &&
+ !(zio->io_flags & ZIO_FLAG_NOPWRITE)) {
+ metaslab_fastwrite_unmark(zio->io_spa, zio->io_bp);
+ }
+
+ /*
+ * It is the responsibility of the done callback to ensure that this
+ * particular zio is no longer discoverable for adoption, and as
+ * such, cannot acquire any new parents.
+ */
+ if (zio->io_done)
+ zio->io_done(zio);
+
+ mutex_enter(&zio->io_lock);
+ zio->io_state[ZIO_WAIT_DONE] = 1;
+ mutex_exit(&zio->io_lock);
+
+ /*
+ * We are done executing this zio. We may want to execute a parent
+ * next. See the comment in zio_notify_parent().
+ */
+ zio_t *next_to_execute = NULL;
+ zl = NULL;
+ for (pio = zio_walk_parents(zio, &zl); pio != NULL; pio = pio_next) {
+ zio_link_t *remove_zl = zl;
+ pio_next = zio_walk_parents(zio, &zl);
+ zio_remove_child(pio, zio, remove_zl);
+ zio_notify_parent(pio, zio, ZIO_WAIT_DONE, &next_to_execute);
+ }
+
+ if (zio->io_waiter != NULL) {
+ mutex_enter(&zio->io_lock);
+ zio->io_executor = NULL;
+ cv_broadcast(&zio->io_cv);
+ mutex_exit(&zio->io_lock);
+ } else {
+ zio_destroy(zio);
+ }
+
+ return (next_to_execute);
+}
+
+/*
+ * ==========================================================================
+ * I/O pipeline definition
+ * ==========================================================================
+ */
+static zio_pipe_stage_t *zio_pipeline[] = {
+ NULL,
+ zio_read_bp_init,
+ zio_write_bp_init,
+ zio_free_bp_init,
+ zio_issue_async,
+ zio_write_compress,
+ zio_encrypt,
+ zio_checksum_generate,
+ zio_nop_write,
+ zio_ddt_read_start,
+ zio_ddt_read_done,
+ zio_ddt_write,
+ zio_ddt_free,
+ zio_gang_assemble,
+ zio_gang_issue,
+ zio_dva_throttle,
+ zio_dva_allocate,
+ zio_dva_free,
+ zio_dva_claim,
+ zio_ready,
+ zio_vdev_io_start,
+ zio_vdev_io_done,
+ zio_vdev_io_assess,
+ zio_checksum_verify,
+ zio_done
+};
+
+
+/*
+ * Compare two zbookmark_phys_t's to see which we would reach first in a
+ * pre-order traversal of the object tree.
+ *
+ * This is simple in every case aside from the meta-dnode object. For all other
+ * objects, we traverse them in order (object 1 before object 2, and so on).
+ * However, all of these objects are traversed while traversing object 0, since
+ * the data it points to is the list of objects. Thus, we need to convert to a
+ * canonical representation so we can compare meta-dnode bookmarks to
+ * non-meta-dnode bookmarks.
+ *
+ * We do this by calculating "equivalents" for each field of the zbookmark.
+ * zbookmarks outside of the meta-dnode use their own object and level, and
+ * calculate the level 0 equivalent (the first L0 blkid that is contained in the
+ * blocks this bookmark refers to) by multiplying their blkid by their span
+ * (the number of L0 blocks contained within one block at their level).
+ * zbookmarks inside the meta-dnode calculate their object equivalent
+ * (which is L0equiv * dnodes per data block), use 0 for their L0equiv, and use
+ * level + 1<<31 (a value larger than any level could ever be) for their level.
+ * This causes them to always compare before a bookmark in their object
+ * equivalent, compare appropriately to bookmarks in other objects, and to
+ * compare appropriately to other bookmarks in the meta-dnode.
+ */
+int
+zbookmark_compare(uint16_t dbss1, uint8_t ibs1, uint16_t dbss2, uint8_t ibs2,
+ const zbookmark_phys_t *zb1, const zbookmark_phys_t *zb2)
+{
+ /*
+ * These variables represent the "equivalent" values for the zbookmark,
+ * after converting zbookmarks inside the meta dnode to their
+ * normal-object equivalents.
+ */
+ uint64_t zb1obj, zb2obj;
+ uint64_t zb1L0, zb2L0;
+ uint64_t zb1level, zb2level;
+
+ if (zb1->zb_object == zb2->zb_object &&
+ zb1->zb_level == zb2->zb_level &&
+ zb1->zb_blkid == zb2->zb_blkid)
+ return (0);
+
+ IMPLY(zb1->zb_level > 0, ibs1 >= SPA_MINBLOCKSHIFT);
+ IMPLY(zb2->zb_level > 0, ibs2 >= SPA_MINBLOCKSHIFT);
+
+ /*
+ * BP_SPANB calculates the span in blocks.
+ */
+ zb1L0 = (zb1->zb_blkid) * BP_SPANB(ibs1, zb1->zb_level);
+ zb2L0 = (zb2->zb_blkid) * BP_SPANB(ibs2, zb2->zb_level);
+
+ if (zb1->zb_object == DMU_META_DNODE_OBJECT) {
+ zb1obj = zb1L0 * (dbss1 << (SPA_MINBLOCKSHIFT - DNODE_SHIFT));
+ zb1L0 = 0;
+ zb1level = zb1->zb_level + COMPARE_META_LEVEL;
+ } else {
+ zb1obj = zb1->zb_object;
+ zb1level = zb1->zb_level;
+ }
+
+ if (zb2->zb_object == DMU_META_DNODE_OBJECT) {
+ zb2obj = zb2L0 * (dbss2 << (SPA_MINBLOCKSHIFT - DNODE_SHIFT));
+ zb2L0 = 0;
+ zb2level = zb2->zb_level + COMPARE_META_LEVEL;
+ } else {
+ zb2obj = zb2->zb_object;
+ zb2level = zb2->zb_level;
+ }
+
+ /* Now that we have a canonical representation, do the comparison. */
+ if (zb1obj != zb2obj)
+ return (zb1obj < zb2obj ? -1 : 1);
+ else if (zb1L0 != zb2L0)
+ return (zb1L0 < zb2L0 ? -1 : 1);
+ else if (zb1level != zb2level)
+ return (zb1level > zb2level ? -1 : 1);
+ /*
+ * This can (theoretically) happen if the bookmarks have the same object
+	 * and level but different blkids, which is only possible if the block
+	 * sizes are not the same. There is presently no way to change the
+	 * indirect block sizes.
+ */
+ return (0);
+}
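+
+/*
+ * Worked example (illustrative; the block sizes below are assumptions, not
+ * taken from the code): compare a meta-dnode bookmark
+ * zb1 = { DMU_META_DNODE_OBJECT, level 0, blkid 4 } with an ordinary bookmark
+ * zb2 = { object 100, level 0, blkid 7 }, assuming 16K dnode blocks
+ * (dbss1 == 32 sectors). zb1 canonicalizes to zb1obj = 4 * 32 = 128,
+ * zb1L0 = 0, zb1level = COMPARE_META_LEVEL; zb2 is already canonical with
+ * zb2obj = 100, zb2L0 = 7, zb2level = 0. Since 128 > 100 the function
+ * returns 1: object 100 is reached before the dnode block covering objects
+ * 128-159, exactly as a pre-order traversal would visit them.
+ */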
+
+/*
+ * This function checks the following: given that last_block is the place that
+ * our traversal stopped last time, does that guarantee that we've visited
+ * every node under subtree_root? Answering that takes more than the raw
+ * output of zbookmark_compare: we have to pass in a modified version of
+ * subtree_root; by incrementing the block id, and then checking whether
+ * last_block is before or equal to that, we can tell whether or not having
+ * visited last_block implies that all of subtree_root's children have been
+ * visited.
+ */
+boolean_t
+zbookmark_subtree_completed(const dnode_phys_t *dnp,
+ const zbookmark_phys_t *subtree_root, const zbookmark_phys_t *last_block)
+{
+ zbookmark_phys_t mod_zb = *subtree_root;
+ mod_zb.zb_blkid++;
+ ASSERT(last_block->zb_level == 0);
+
+ /* The objset_phys_t isn't before anything. */
+ if (dnp == NULL)
+ return (B_FALSE);
+
+ /*
+ * We pass in 1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT) for the
+ * data block size in sectors, because that variable is only used if
+ * the bookmark refers to a block in the meta-dnode. Since we don't
+ * know without examining it what object it refers to, and there's no
+ * harm in passing in this value in other cases, we always pass it in.
+ *
+ * We pass in 0 for the indirect block size shift because zb2 must be
+ * level 0. The indirect block size is only used to calculate the span
+ * of the bookmark, but since the bookmark must be level 0, the span is
+ * always 1, so the math works out.
+ *
+	 * If you make changes to how the zbookmark_compare code works, be sure
+	 * that this code still works afterwards.
+ */
+ return (zbookmark_compare(dnp->dn_datablkszsec, dnp->dn_indblkshift,
+ 1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT), 0, &mod_zb,
+ last_block) <= 0);
+}
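+
+/*
+ * Worked example (illustrative; the block sizes below are assumptions): let
+ * subtree_root be { object 5, level 1, blkid 3 } with 128K indirect blocks
+ * (dn_indblkshift == 17, i.e. 1024 block pointers per indirect block), so
+ * the subtree covers L0 blkids 3072-4095. mod_zb bumps the blkid to 4, whose
+ * L0 equivalent is 4096. If last_block is { object 5, level 0, blkid 4096 }
+ * or anything visited later, zbookmark_compare() returns <= 0 and the
+ * subtree counts as fully visited; with last_block at blkid 4000 it returns
+ * > 0 and the subtree is not yet complete.
+ */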
+
+EXPORT_SYMBOL(zio_type_name);
+EXPORT_SYMBOL(zio_buf_alloc);
+EXPORT_SYMBOL(zio_data_buf_alloc);
+EXPORT_SYMBOL(zio_buf_free);
+EXPORT_SYMBOL(zio_data_buf_free);
+
+/* BEGIN CSTYLED */
+ZFS_MODULE_PARAM(zfs_zio, zio_, slow_io_ms, INT, ZMOD_RW,
+ "Max I/O completion time (milliseconds) before marking it as slow");
+
+ZFS_MODULE_PARAM(zfs_zio, zio_, requeue_io_start_cut_in_line, INT, ZMOD_RW,
+ "Prioritize requeued I/O");
+
+ZFS_MODULE_PARAM(zfs, zfs_, sync_pass_deferred_free, INT, ZMOD_RW,
+ "Defer frees starting in this pass");
+
+ZFS_MODULE_PARAM(zfs, zfs_, sync_pass_dont_compress, INT, ZMOD_RW,
+ "Don't compress starting in this pass");
+
+ZFS_MODULE_PARAM(zfs, zfs_, sync_pass_rewrite, INT, ZMOD_RW,
+ "Rewrite new bps starting in this pass");
+
+ZFS_MODULE_PARAM(zfs_zio, zio_, dva_throttle_enabled, INT, ZMOD_RW,
+ "Throttle block allocations in the ZIO pipeline");
+
+ZFS_MODULE_PARAM(zfs_zio, zio_, deadman_log_all, INT, ZMOD_RW,
+ "Log all slow ZIOs, not just those with vdevs");
+/* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/zfs/zio_checksum.c b/sys/contrib/openzfs/module/zfs/zio_checksum.c
new file mode 100644
index 000000000000..f8fee78c6068
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/zio_checksum.c
@@ -0,0 +1,570 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013, 2016 by Delphix. All rights reserved.
+ * Copyright 2013 Saso Kiselkov. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/zio.h>
+#include <sys/zio_checksum.h>
+#include <sys/zil.h>
+#include <sys/abd.h>
+#include <zfs_fletcher.h>
+
+/*
+ * Checksum vectors.
+ *
+ * In the SPA, everything is checksummed. We support checksum vectors
+ * for three distinct reasons:
+ *
+ * 1. Different kinds of data need different levels of protection.
+ * For SPA metadata, we always want a very strong checksum.
+ * For user data, we let users make the trade-off between speed
+ * and checksum strength.
+ *
+ * 2. Cryptographic hash and MAC algorithms are an area of active research.
+ *    It is likely that future hash functions will be at least as strong
+ * as current best-of-breed, and may be substantially faster as well.
+ * We want the ability to take advantage of these new hashes as soon as
+ * they become available.
+ *
+ * 3. If someone develops hardware that can compute a strong hash quickly,
+ * we want the ability to take advantage of that hardware.
+ *
+ * Of course, we don't want a checksum upgrade to invalidate existing
+ * data, so we store the checksum *function* in eight bits of the bp.
+ * This gives us room for up to 256 different checksum functions.
+ *
+ * When writing a block, we always checksum it with the latest-and-greatest
+ * checksum function of the appropriate strength. When reading a block,
+ * we compare the expected checksum against the actual checksum, which we
+ * compute via the checksum function specified by BP_GET_CHECKSUM(bp).
+ *
+ * SALTED CHECKSUMS
+ *
+ * To enable the use of less secure hash algorithms with dedup, we
+ * introduce the notion of salted checksums (MACs, really). A salted
+ * checksum is fed both a random 256-bit value (the salt) and the data
+ * to be checksummed. This salt is kept secret (stored on the pool, but
+ * never shown to the user). Thus even if an attacker knew of collision
+ * weaknesses in the hash algorithm, they won't be able to mount a
+ * known-plaintext attack on the DDT, since the actual hash value cannot be
+ * known ahead of time. How the salt is used is algorithm-specific
+ * (some might simply prefix it to the data block, others might need to
+ * utilize a full-blown HMAC). On disk the salt is stored in a ZAP
+ * object in the MOS (DMU_POOL_CHECKSUM_SALT).
+ *
+ * CONTEXT TEMPLATES
+ *
+ * Some hashing algorithms need to perform a substantial amount of
+ * initialization work (e.g. salted checksums above may need to pre-hash
+ * the salt) before being able to process data. Performing this
+ * redundant work for each block would be wasteful, so we instead allow
+ * a checksum algorithm to do the work once (the first time it's used)
+ * and then keep this pre-initialized context as a template inside the
+ * spa_t (spa_cksum_tmpls). If the zio_checksum_info_t contains
+ * non-NULL ci_tmpl_init and ci_tmpl_free callbacks, they are used to
+ * construct and destruct the pre-initialized checksum context. The
+ * pre-initialized context is then reused during each checksum
+ * invocation and passed to the checksum function.
+ */
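+
+/*
+ * For example (an illustrative summary of the code below): the "skein" entry
+ * in zio_checksum_table[] supplies abd_checksum_skein_tmpl_init() and
+ * abd_checksum_skein_tmpl_free(). The first time that checksum is used,
+ * zio_checksum_template_init() builds a context from the pool's
+ * spa_cksum_salt and caches it in spa_cksum_tmpls[ZIO_CHECKSUM_SKEIN];
+ * every later zio_checksum_compute() or zio_checksum_error_impl() call then
+ * passes that cached template as ctx_template instead of redoing the salt
+ * setup for each block.
+ */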
+
+/*ARGSUSED*/
+static void
+abd_checksum_off(abd_t *abd, uint64_t size,
+ const void *ctx_template, zio_cksum_t *zcp)
+{
+ ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
+}
+
+/*ARGSUSED*/
+static void
+abd_fletcher_2_native(abd_t *abd, uint64_t size,
+ const void *ctx_template, zio_cksum_t *zcp)
+{
+ fletcher_init(zcp);
+ (void) abd_iterate_func(abd, 0, size,
+ fletcher_2_incremental_native, zcp);
+}
+
+/*ARGSUSED*/
+static void
+abd_fletcher_2_byteswap(abd_t *abd, uint64_t size,
+ const void *ctx_template, zio_cksum_t *zcp)
+{
+ fletcher_init(zcp);
+ (void) abd_iterate_func(abd, 0, size,
+ fletcher_2_incremental_byteswap, zcp);
+}
+
+static inline void
+abd_fletcher_4_impl(abd_t *abd, uint64_t size, zio_abd_checksum_data_t *acdp)
+{
+ fletcher_4_abd_ops.acf_init(acdp);
+ abd_iterate_func(abd, 0, size, fletcher_4_abd_ops.acf_iter, acdp);
+ fletcher_4_abd_ops.acf_fini(acdp);
+}
+
+/*ARGSUSED*/
+void
+abd_fletcher_4_native(abd_t *abd, uint64_t size,
+ const void *ctx_template, zio_cksum_t *zcp)
+{
+ fletcher_4_ctx_t ctx;
+
+ zio_abd_checksum_data_t acd = {
+ .acd_byteorder = ZIO_CHECKSUM_NATIVE,
+ .acd_zcp = zcp,
+ .acd_ctx = &ctx
+ };
+
+ abd_fletcher_4_impl(abd, size, &acd);
+}
+
+/*ARGSUSED*/
+void
+abd_fletcher_4_byteswap(abd_t *abd, uint64_t size,
+ const void *ctx_template, zio_cksum_t *zcp)
+{
+ fletcher_4_ctx_t ctx;
+
+ zio_abd_checksum_data_t acd = {
+ .acd_byteorder = ZIO_CHECKSUM_BYTESWAP,
+ .acd_zcp = zcp,
+ .acd_ctx = &ctx
+ };
+
+ abd_fletcher_4_impl(abd, size, &acd);
+}
+
+zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
+ {{NULL, NULL}, NULL, NULL, 0, "inherit"},
+ {{NULL, NULL}, NULL, NULL, 0, "on"},
+ {{abd_checksum_off, abd_checksum_off},
+ NULL, NULL, 0, "off"},
+ {{abd_checksum_SHA256, abd_checksum_SHA256},
+ NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED,
+ "label"},
+ {{abd_checksum_SHA256, abd_checksum_SHA256},
+ NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED,
+ "gang_header"},
+ {{abd_fletcher_2_native, abd_fletcher_2_byteswap},
+ NULL, NULL, ZCHECKSUM_FLAG_EMBEDDED, "zilog"},
+ {{abd_fletcher_2_native, abd_fletcher_2_byteswap},
+ NULL, NULL, 0, "fletcher2"},
+ {{abd_fletcher_4_native, abd_fletcher_4_byteswap},
+ NULL, NULL, ZCHECKSUM_FLAG_METADATA, "fletcher4"},
+ {{abd_checksum_SHA256, abd_checksum_SHA256},
+ NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
+ ZCHECKSUM_FLAG_NOPWRITE, "sha256"},
+ {{abd_fletcher_4_native, abd_fletcher_4_byteswap},
+ NULL, NULL, ZCHECKSUM_FLAG_EMBEDDED, "zilog2"},
+ {{abd_checksum_off, abd_checksum_off},
+ NULL, NULL, 0, "noparity"},
+ {{abd_checksum_SHA512_native, abd_checksum_SHA512_byteswap},
+ NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
+ ZCHECKSUM_FLAG_NOPWRITE, "sha512"},
+ {{abd_checksum_skein_native, abd_checksum_skein_byteswap},
+ abd_checksum_skein_tmpl_init, abd_checksum_skein_tmpl_free,
+ ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
+ ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "skein"},
+#if !defined(__FreeBSD__)
+ {{abd_checksum_edonr_native, abd_checksum_edonr_byteswap},
+ abd_checksum_edonr_tmpl_init, abd_checksum_edonr_tmpl_free,
+ ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_SALTED |
+ ZCHECKSUM_FLAG_NOPWRITE, "edonr"},
+#endif
+};
+
+/*
+ * The flag corresponding to the "verify" in dedup=[checksum,]verify
+ * must be cleared first, so callers should use ZIO_CHECKSUM_MASK.
+ */
+spa_feature_t
+zio_checksum_to_feature(enum zio_checksum cksum)
+{
+ VERIFY((cksum & ~ZIO_CHECKSUM_MASK) == 0);
+
+ switch (cksum) {
+ case ZIO_CHECKSUM_SHA512:
+ return (SPA_FEATURE_SHA512);
+ case ZIO_CHECKSUM_SKEIN:
+ return (SPA_FEATURE_SKEIN);
+#if !defined(__FreeBSD__)
+ case ZIO_CHECKSUM_EDONR:
+ return (SPA_FEATURE_EDONR);
+#endif
+ default:
+ return (SPA_FEATURE_NONE);
+ }
+}
+
+enum zio_checksum
+zio_checksum_select(enum zio_checksum child, enum zio_checksum parent)
+{
+ ASSERT(child < ZIO_CHECKSUM_FUNCTIONS);
+ ASSERT(parent < ZIO_CHECKSUM_FUNCTIONS);
+ ASSERT(parent != ZIO_CHECKSUM_INHERIT && parent != ZIO_CHECKSUM_ON);
+
+ if (child == ZIO_CHECKSUM_INHERIT)
+ return (parent);
+
+ if (child == ZIO_CHECKSUM_ON)
+ return (ZIO_CHECKSUM_ON_VALUE);
+
+ return (child);
+}
+
+enum zio_checksum
+zio_checksum_dedup_select(spa_t *spa, enum zio_checksum child,
+ enum zio_checksum parent)
+{
+ ASSERT((child & ZIO_CHECKSUM_MASK) < ZIO_CHECKSUM_FUNCTIONS);
+ ASSERT((parent & ZIO_CHECKSUM_MASK) < ZIO_CHECKSUM_FUNCTIONS);
+ ASSERT(parent != ZIO_CHECKSUM_INHERIT && parent != ZIO_CHECKSUM_ON);
+
+ if (child == ZIO_CHECKSUM_INHERIT)
+ return (parent);
+
+ if (child == ZIO_CHECKSUM_ON)
+ return (spa_dedup_checksum(spa));
+
+ if (child == (ZIO_CHECKSUM_ON | ZIO_CHECKSUM_VERIFY))
+ return (spa_dedup_checksum(spa) | ZIO_CHECKSUM_VERIFY);
+
+ ASSERT((zio_checksum_table[child & ZIO_CHECKSUM_MASK].ci_flags &
+ ZCHECKSUM_FLAG_DEDUP) ||
+ (child & ZIO_CHECKSUM_VERIFY) || child == ZIO_CHECKSUM_OFF);
+
+ return (child);
+}
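+
+/*
+ * For example (illustrative): a child value of ZIO_CHECKSUM_ON resolves to
+ * spa_dedup_checksum(spa), the pool's default dedup checksum, while
+ * (ZIO_CHECKSUM_ON | ZIO_CHECKSUM_VERIFY) resolves to that same default with
+ * the verify bit preserved.
+ */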
+
+/*
+ * Set the external verifier for a gang block based on <vdev, offset, txg>,
+ * a tuple which is guaranteed to be unique for the life of the pool.
+ */
+static void
+zio_checksum_gang_verifier(zio_cksum_t *zcp, const blkptr_t *bp)
+{
+ const dva_t *dva = BP_IDENTITY(bp);
+ uint64_t txg = BP_PHYSICAL_BIRTH(bp);
+
+ ASSERT(BP_IS_GANG(bp));
+
+ ZIO_SET_CHECKSUM(zcp, DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva), txg, 0);
+}
+
+/*
+ * Set the external verifier for a label block based on its offset.
+ * The vdev is implicit, and the txg is unknowable at pool open time --
+ * hence the logic in vdev_uberblock_load() to find the most recent copy.
+ */
+static void
+zio_checksum_label_verifier(zio_cksum_t *zcp, uint64_t offset)
+{
+ ZIO_SET_CHECKSUM(zcp, offset, 0, 0, 0);
+}
+
+/*
+ * Calls the template init function of a checksum which supports context
+ * templates and installs the template into the spa_t.
+ */
+static void
+zio_checksum_template_init(enum zio_checksum checksum, spa_t *spa)
+{
+ zio_checksum_info_t *ci = &zio_checksum_table[checksum];
+
+ if (ci->ci_tmpl_init == NULL)
+ return;
+ if (spa->spa_cksum_tmpls[checksum] != NULL)
+ return;
+
+ VERIFY(ci->ci_tmpl_free != NULL);
+ mutex_enter(&spa->spa_cksum_tmpls_lock);
+ if (spa->spa_cksum_tmpls[checksum] == NULL) {
+ spa->spa_cksum_tmpls[checksum] =
+ ci->ci_tmpl_init(&spa->spa_cksum_salt);
+ VERIFY(spa->spa_cksum_tmpls[checksum] != NULL);
+ }
+ mutex_exit(&spa->spa_cksum_tmpls_lock);
+}
+
+/* convenience function to update a checksum to accommodate an encryption MAC */
+static void
+zio_checksum_handle_crypt(zio_cksum_t *cksum, zio_cksum_t *saved, boolean_t xor)
+{
+ /*
+ * Weak checksums do not have their entropy spread evenly
+ * across the bits of the checksum. Therefore, when truncating
+ * a weak checksum we XOR the first 2 words with the last 2 so
+ * that we don't "lose" any entropy unnecessarily.
+ */
+ if (xor) {
+ cksum->zc_word[0] ^= cksum->zc_word[2];
+ cksum->zc_word[1] ^= cksum->zc_word[3];
+ }
+
+ cksum->zc_word[2] = saved->zc_word[2];
+ cksum->zc_word[3] = saved->zc_word[3];
+}
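+
+/*
+ * Worked example (illustrative): when truncating a fletcher_4 checksum
+ * { w0, w1, w2, w3 } to make room for the saved MAC halves { m2, m3 }, the
+ * result is { w0 ^ w2, w1 ^ w3, m2, m3 }, because fletcher_4 lacks
+ * ZCHECKSUM_FLAG_DEDUP and therefore gets the XOR treatment; a sha256
+ * checksum would simply become { w0, w1, m2, m3 }.
+ */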
+
+/*
+ * Generate the checksum.
+ */
+void
+zio_checksum_compute(zio_t *zio, enum zio_checksum checksum,
+ abd_t *abd, uint64_t size)
+{
+ static const uint64_t zec_magic = ZEC_MAGIC;
+ blkptr_t *bp = zio->io_bp;
+ uint64_t offset = zio->io_offset;
+ zio_checksum_info_t *ci = &zio_checksum_table[checksum];
+ zio_cksum_t cksum, saved;
+ spa_t *spa = zio->io_spa;
+ boolean_t insecure = (ci->ci_flags & ZCHECKSUM_FLAG_DEDUP) == 0;
+
+ ASSERT((uint_t)checksum < ZIO_CHECKSUM_FUNCTIONS);
+ ASSERT(ci->ci_func[0] != NULL);
+
+ zio_checksum_template_init(checksum, spa);
+
+ if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) {
+ zio_eck_t eck;
+ size_t eck_offset;
+
+ bzero(&saved, sizeof (zio_cksum_t));
+
+ if (checksum == ZIO_CHECKSUM_ZILOG2) {
+ zil_chain_t zilc;
+ abd_copy_to_buf(&zilc, abd, sizeof (zil_chain_t));
+
+ size = P2ROUNDUP_TYPED(zilc.zc_nused, ZIL_MIN_BLKSZ,
+ uint64_t);
+ eck = zilc.zc_eck;
+ eck_offset = offsetof(zil_chain_t, zc_eck);
+ } else {
+ eck_offset = size - sizeof (zio_eck_t);
+ abd_copy_to_buf_off(&eck, abd, eck_offset,
+ sizeof (zio_eck_t));
+ }
+
+ if (checksum == ZIO_CHECKSUM_GANG_HEADER) {
+ zio_checksum_gang_verifier(&eck.zec_cksum, bp);
+ } else if (checksum == ZIO_CHECKSUM_LABEL) {
+ zio_checksum_label_verifier(&eck.zec_cksum, offset);
+ } else {
+ saved = eck.zec_cksum;
+ eck.zec_cksum = bp->blk_cksum;
+ }
+
+ abd_copy_from_buf_off(abd, &zec_magic,
+ eck_offset + offsetof(zio_eck_t, zec_magic),
+ sizeof (zec_magic));
+ abd_copy_from_buf_off(abd, &eck.zec_cksum,
+ eck_offset + offsetof(zio_eck_t, zec_cksum),
+ sizeof (zio_cksum_t));
+
+ ci->ci_func[0](abd, size, spa->spa_cksum_tmpls[checksum],
+ &cksum);
+ if (bp != NULL && BP_USES_CRYPT(bp) &&
+ BP_GET_TYPE(bp) != DMU_OT_OBJSET)
+ zio_checksum_handle_crypt(&cksum, &saved, insecure);
+
+ abd_copy_from_buf_off(abd, &cksum,
+ eck_offset + offsetof(zio_eck_t, zec_cksum),
+ sizeof (zio_cksum_t));
+ } else {
+ saved = bp->blk_cksum;
+ ci->ci_func[0](abd, size, spa->spa_cksum_tmpls[checksum],
+ &cksum);
+ if (BP_USES_CRYPT(bp) && BP_GET_TYPE(bp) != DMU_OT_OBJSET)
+ zio_checksum_handle_crypt(&cksum, &saved, insecure);
+ bp->blk_cksum = cksum;
+ }
+}
+
+int
+zio_checksum_error_impl(spa_t *spa, const blkptr_t *bp,
+ enum zio_checksum checksum, abd_t *abd, uint64_t size, uint64_t offset,
+ zio_bad_cksum_t *info)
+{
+ zio_checksum_info_t *ci = &zio_checksum_table[checksum];
+ zio_cksum_t actual_cksum, expected_cksum;
+ zio_eck_t eck;
+ int byteswap;
+
+ if (checksum >= ZIO_CHECKSUM_FUNCTIONS || ci->ci_func[0] == NULL)
+ return (SET_ERROR(EINVAL));
+
+ zio_checksum_template_init(checksum, spa);
+
+ if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) {
+ zio_cksum_t verifier;
+ size_t eck_offset;
+
+ if (checksum == ZIO_CHECKSUM_ZILOG2) {
+ zil_chain_t zilc;
+ uint64_t nused;
+
+ abd_copy_to_buf(&zilc, abd, sizeof (zil_chain_t));
+
+ eck = zilc.zc_eck;
+ eck_offset = offsetof(zil_chain_t, zc_eck) +
+ offsetof(zio_eck_t, zec_cksum);
+
+ if (eck.zec_magic == ZEC_MAGIC) {
+ nused = zilc.zc_nused;
+ } else if (eck.zec_magic == BSWAP_64(ZEC_MAGIC)) {
+ nused = BSWAP_64(zilc.zc_nused);
+ } else {
+ return (SET_ERROR(ECKSUM));
+ }
+
+ if (nused > size) {
+ return (SET_ERROR(ECKSUM));
+ }
+
+ size = P2ROUNDUP_TYPED(nused, ZIL_MIN_BLKSZ, uint64_t);
+ } else {
+ eck_offset = size - sizeof (zio_eck_t);
+ abd_copy_to_buf_off(&eck, abd, eck_offset,
+ sizeof (zio_eck_t));
+ eck_offset += offsetof(zio_eck_t, zec_cksum);
+ }
+
+ if (checksum == ZIO_CHECKSUM_GANG_HEADER)
+ zio_checksum_gang_verifier(&verifier, bp);
+ else if (checksum == ZIO_CHECKSUM_LABEL)
+ zio_checksum_label_verifier(&verifier, offset);
+ else
+ verifier = bp->blk_cksum;
+
+ byteswap = (eck.zec_magic == BSWAP_64(ZEC_MAGIC));
+
+ if (byteswap)
+ byteswap_uint64_array(&verifier, sizeof (zio_cksum_t));
+
+ expected_cksum = eck.zec_cksum;
+
+ abd_copy_from_buf_off(abd, &verifier, eck_offset,
+ sizeof (zio_cksum_t));
+
+ ci->ci_func[byteswap](abd, size,
+ spa->spa_cksum_tmpls[checksum], &actual_cksum);
+
+ abd_copy_from_buf_off(abd, &expected_cksum, eck_offset,
+ sizeof (zio_cksum_t));
+
+ if (byteswap) {
+ byteswap_uint64_array(&expected_cksum,
+ sizeof (zio_cksum_t));
+ }
+ } else {
+ byteswap = BP_SHOULD_BYTESWAP(bp);
+ expected_cksum = bp->blk_cksum;
+ ci->ci_func[byteswap](abd, size,
+ spa->spa_cksum_tmpls[checksum], &actual_cksum);
+ }
+
+ /*
+ * MAC checksums are a special case since half of this checksum will
+ * actually be the encryption MAC. This will be verified by the
+ * decryption process, so we just check the truncated checksum now.
+ * Objset blocks use embedded MACs so we don't truncate the checksum
+ * for them.
+ */
+ if (bp != NULL && BP_USES_CRYPT(bp) &&
+ BP_GET_TYPE(bp) != DMU_OT_OBJSET) {
+ if (!(ci->ci_flags & ZCHECKSUM_FLAG_DEDUP)) {
+ actual_cksum.zc_word[0] ^= actual_cksum.zc_word[2];
+ actual_cksum.zc_word[1] ^= actual_cksum.zc_word[3];
+ }
+
+ actual_cksum.zc_word[2] = 0;
+ actual_cksum.zc_word[3] = 0;
+ expected_cksum.zc_word[2] = 0;
+ expected_cksum.zc_word[3] = 0;
+ }
+
+ if (info != NULL) {
+ info->zbc_expected = expected_cksum;
+ info->zbc_actual = actual_cksum;
+ info->zbc_checksum_name = ci->ci_name;
+ info->zbc_byteswapped = byteswap;
+ info->zbc_injected = 0;
+ info->zbc_has_cksum = 1;
+ }
+
+ if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum))
+ return (SET_ERROR(ECKSUM));
+
+ return (0);
+}
+
+int
+zio_checksum_error(zio_t *zio, zio_bad_cksum_t *info)
+{
+ blkptr_t *bp = zio->io_bp;
+ uint_t checksum = (bp == NULL ? zio->io_prop.zp_checksum :
+ (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));
+ int error;
+ uint64_t size = (bp == NULL ? zio->io_size :
+ (BP_IS_GANG(bp) ? SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp)));
+ uint64_t offset = zio->io_offset;
+ abd_t *data = zio->io_abd;
+ spa_t *spa = zio->io_spa;
+
+ error = zio_checksum_error_impl(spa, bp, checksum, data, size,
+ offset, info);
+
+ if (zio_injection_enabled && error == 0 && zio->io_error == 0) {
+ error = zio_handle_fault_injection(zio, ECKSUM);
+ if (error != 0)
+ info->zbc_injected = 1;
+ }
+
+ return (error);
+}
+
+/*
+ * Called by a spa_t that's about to be deallocated. This steps through
+ * all of the checksum context templates and deallocates any that were
+ * initialized using the algorithm-specific template init function.
+ */
+void
+zio_checksum_templates_free(spa_t *spa)
+{
+ for (enum zio_checksum checksum = 0;
+ checksum < ZIO_CHECKSUM_FUNCTIONS; checksum++) {
+ if (spa->spa_cksum_tmpls[checksum] != NULL) {
+ zio_checksum_info_t *ci = &zio_checksum_table[checksum];
+
+ VERIFY(ci->ci_tmpl_free != NULL);
+ ci->ci_tmpl_free(spa->spa_cksum_tmpls[checksum]);
+ spa->spa_cksum_tmpls[checksum] = NULL;
+ }
+ }
+}
diff --git a/sys/contrib/openzfs/module/zfs/zio_compress.c b/sys/contrib/openzfs/module/zfs/zio_compress.c
new file mode 100644
index 000000000000..2db3cec35d5d
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/zio_compress.c
@@ -0,0 +1,220 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
+ */
+
+/*
+ * Copyright (c) 2013, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2019, Klara Inc.
+ * Copyright (c) 2019, Allan Jude
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/zfeature.h>
+#include <sys/zio.h>
+#include <sys/zio_compress.h>
+#include <sys/zstd/zstd.h>
+
+/*
+ * If nonzero, roughly one in every X decompression attempts will fail,
+ * simulating an undetected memory error.
+ */
+unsigned long zio_decompress_fail_fraction = 0;
+
+/*
+ * Compression vectors.
+ */
+zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = {
+ {"inherit", 0, NULL, NULL, NULL},
+ {"on", 0, NULL, NULL, NULL},
+ {"uncompressed", 0, NULL, NULL, NULL},
+ {"lzjb", 0, lzjb_compress, lzjb_decompress, NULL},
+ {"empty", 0, NULL, NULL, NULL},
+ {"gzip-1", 1, gzip_compress, gzip_decompress, NULL},
+ {"gzip-2", 2, gzip_compress, gzip_decompress, NULL},
+ {"gzip-3", 3, gzip_compress, gzip_decompress, NULL},
+ {"gzip-4", 4, gzip_compress, gzip_decompress, NULL},
+ {"gzip-5", 5, gzip_compress, gzip_decompress, NULL},
+ {"gzip-6", 6, gzip_compress, gzip_decompress, NULL},
+ {"gzip-7", 7, gzip_compress, gzip_decompress, NULL},
+ {"gzip-8", 8, gzip_compress, gzip_decompress, NULL},
+ {"gzip-9", 9, gzip_compress, gzip_decompress, NULL},
+ {"zle", 64, zle_compress, zle_decompress, NULL},
+ {"lz4", 0, lz4_compress_zfs, lz4_decompress_zfs, NULL},
+ {"zstd", ZIO_ZSTD_LEVEL_DEFAULT, zfs_zstd_compress,
+ zfs_zstd_decompress, zfs_zstd_decompress_level},
+};
+
+uint8_t
+zio_complevel_select(spa_t *spa, enum zio_compress compress, uint8_t child,
+ uint8_t parent)
+{
+ uint8_t result;
+
+ if (!ZIO_COMPRESS_HASLEVEL(compress))
+ return (0);
+
+ result = child;
+ if (result == ZIO_COMPLEVEL_INHERIT)
+ result = parent;
+
+ return (result);
+}
+
+enum zio_compress
+zio_compress_select(spa_t *spa, enum zio_compress child,
+ enum zio_compress parent)
+{
+ enum zio_compress result;
+
+ ASSERT(child < ZIO_COMPRESS_FUNCTIONS);
+ ASSERT(parent < ZIO_COMPRESS_FUNCTIONS);
+ ASSERT(parent != ZIO_COMPRESS_INHERIT);
+
+ result = child;
+ if (result == ZIO_COMPRESS_INHERIT)
+ result = parent;
+
+ if (result == ZIO_COMPRESS_ON) {
+ if (spa_feature_is_active(spa, SPA_FEATURE_LZ4_COMPRESS))
+ result = ZIO_COMPRESS_LZ4_ON_VALUE;
+ else
+ result = ZIO_COMPRESS_LEGACY_ON_VALUE;
+ }
+
+ return (result);
+}
+
+/*ARGSUSED*/
+static int
+zio_compress_zeroed_cb(void *data, size_t len, void *private)
+{
+ uint64_t *end = (uint64_t *)((char *)data + len);
+ for (uint64_t *word = (uint64_t *)data; word < end; word++)
+ if (*word != 0)
+ return (1);
+
+ return (0);
+}
+
+size_t
+zio_compress_data(enum zio_compress c, abd_t *src, void *dst, size_t s_len,
+ uint8_t level)
+{
+ size_t c_len, d_len;
+ uint8_t complevel;
+ zio_compress_info_t *ci = &zio_compress_table[c];
+
+ ASSERT((uint_t)c < ZIO_COMPRESS_FUNCTIONS);
+ ASSERT((uint_t)c == ZIO_COMPRESS_EMPTY || ci->ci_compress != NULL);
+
+ /*
+ * If the data is all zeroes, we don't even need to allocate
+ * a block for it. We indicate this by returning zero size.
+ */
+ if (abd_iterate_func(src, 0, s_len, zio_compress_zeroed_cb, NULL) == 0)
+ return (0);
+
+ if (c == ZIO_COMPRESS_EMPTY)
+ return (s_len);
+
+ /* Compress at least 12.5% */
+ d_len = s_len - (s_len >> 3);
+
+ complevel = ci->ci_level;
+
+ if (c == ZIO_COMPRESS_ZSTD) {
+ /* If we don't know the level, we can't compress it */
+ if (level == ZIO_COMPLEVEL_INHERIT)
+ return (s_len);
+
+ if (level == ZIO_COMPLEVEL_DEFAULT)
+ complevel = ZIO_ZSTD_LEVEL_DEFAULT;
+ else
+ complevel = level;
+
+ ASSERT3U(complevel, !=, ZIO_COMPLEVEL_INHERIT);
+ }
+
+ /* No compression algorithms can read from ABDs directly */
+ void *tmp = abd_borrow_buf_copy(src, s_len);
+ c_len = ci->ci_compress(tmp, dst, s_len, d_len, complevel);
+ abd_return_buf(src, tmp, s_len);
+
+ if (c_len > d_len)
+ return (s_len);
+
+ ASSERT3U(c_len, <=, d_len);
+ return (c_len);
+}
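+
+/*
+ * Worked example (illustrative): for a 128K (131072-byte) source buffer,
+ * d_len = 131072 - (131072 >> 3) = 114688, so the compressed result must be
+ * 114688 bytes or smaller (a saving of at least 12.5%) or the caller gets
+ * s_len back and writes the block uncompressed. An all-zero buffer
+ * short-circuits to a return value of 0 and no block is allocated at all.
+ */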
+
+int
+zio_decompress_data_buf(enum zio_compress c, void *src, void *dst,
+ size_t s_len, size_t d_len, uint8_t *level)
+{
+ zio_compress_info_t *ci = &zio_compress_table[c];
+ if ((uint_t)c >= ZIO_COMPRESS_FUNCTIONS || ci->ci_decompress == NULL)
+ return (SET_ERROR(EINVAL));
+
+ if (ci->ci_decompress_level != NULL && level != NULL)
+ return (ci->ci_decompress_level(src, dst, s_len, d_len, level));
+
+ return (ci->ci_decompress(src, dst, s_len, d_len, ci->ci_level));
+}
+
+int
+zio_decompress_data(enum zio_compress c, abd_t *src, void *dst,
+ size_t s_len, size_t d_len, uint8_t *level)
+{
+ void *tmp = abd_borrow_buf_copy(src, s_len);
+ int ret = zio_decompress_data_buf(c, tmp, dst, s_len, d_len, level);
+ abd_return_buf(src, tmp, s_len);
+
+ /*
+ * Decompression shouldn't fail, because we've already verified
+ * the checksum. However, for extra protection (e.g. against bitflips
+ * in non-ECC RAM), we handle this error (and test it).
+ */
+ if (zio_decompress_fail_fraction != 0 &&
+ spa_get_random(zio_decompress_fail_fraction) == 0)
+ ret = SET_ERROR(EINVAL);
+
+ return (ret);
+}
+
+int
+zio_compress_to_feature(enum zio_compress comp)
+{
+ switch (comp) {
+ case ZIO_COMPRESS_ZSTD:
+ return (SPA_FEATURE_ZSTD_COMPRESS);
+ default:
+ /* fallthru */;
+ }
+ return (SPA_FEATURE_NONE);
+}
diff --git a/sys/contrib/openzfs/module/zfs/zio_inject.c b/sys/contrib/openzfs/module/zfs/zio_inject.c
new file mode 100644
index 000000000000..e56ea88682ff
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/zio_inject.c
@@ -0,0 +1,972 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2017, Intel Corporation.
+ */
+
+/*
+ * ZFS fault injection
+ *
+ * To handle fault injection, we keep track of a series of zinject_record_t
+ * structures which describe which logical block(s) should be injected with a
+ * fault. These are kept in a global list. Each record corresponds to a given
+ * spa_t and maintains a special hold on the spa_t so that it cannot be deleted
+ * or exported while the injection record exists.
+ *
+ * Device level injection is done using the 'zi_guid' field. If this is set, it
+ * means that the error is destined for a particular device, not a piece of
+ * data.
+ *
+ * This is a rather poor data structure and algorithm, but we don't expect more
+ * than a few faults at any one time, so it should be sufficient for our needs.
+ */
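+
+/*
+ * For example (illustrative): a record with zi_guid set targets a particular
+ * device rather than specific data, while a record that leaves zi_guid clear
+ * and fills in zi_objset, zi_object, zi_level and the zi_start..zi_end blkid
+ * range (plus zi_error, e.g. EIO, and an optional zi_freq) fires only for
+ * matching logical blocks, as implemented by zio_match_handler() below.
+ */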
+
+#include <sys/arc.h>
+#include <sys/zio.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/vdev_impl.h>
+#include <sys/dmu_objset.h>
+#include <sys/dsl_dataset.h>
+#include <sys/fs/zfs.h>
+
+uint32_t zio_injection_enabled = 0;
+
+/*
+ * Data describing each zinject handler registered on the system, and
+ * contains the list node linking the handler in the global zinject
+ * handler list.
+ */
+typedef struct inject_handler {
+ int zi_id;
+ spa_t *zi_spa;
+ zinject_record_t zi_record;
+ uint64_t *zi_lanes;
+ int zi_next_lane;
+ list_node_t zi_link;
+} inject_handler_t;
+
+/*
+ * List of all zinject handlers registered on the system, protected by
+ * the inject_lock defined below.
+ */
+static list_t inject_handlers;
+
+/*
+ * This protects insertion into, and traversal of, the inject handler
+ * list defined above, as well as the inject_delay_count. Any time a
+ * handler is inserted or removed from the list, this lock should be
+ * taken as a RW_WRITER; and any time traversal is done over the list
+ * (without modification to it) this lock should be taken as a RW_READER.
+ */
+static krwlock_t inject_lock;
+
+/*
+ * This holds the number of zinject delay handlers that have been
+ * registered on the system. It is protected by the inject_lock defined
+ * above. Thus modifications to this count must be a RW_WRITER of the
+ * inject_lock, and reads of this count must be (at least) a RW_READER
+ * of the lock.
+ */
+static int inject_delay_count = 0;
+
+/*
+ * This lock is used only in zio_handle_io_delay(), refer to the comment
+ * in that function for more details.
+ */
+static kmutex_t inject_delay_mtx;
+
+/*
+ * Used to assign unique identifying numbers to each new zinject handler.
+ */
+static int inject_next_id = 1;
+
+/*
+ * Test if the requested frequency was triggered
+ */
+static boolean_t
+freq_triggered(uint32_t frequency)
+{
+ /*
+ * zero implies always (100%)
+ */
+ if (frequency == 0)
+ return (B_TRUE);
+
+ /*
+ * Note: we still handle legacy (unscaled) frequency values
+ */
+ uint32_t maximum = (frequency <= 100) ? 100 : ZI_PERCENTAGE_MAX;
+
+ return (spa_get_random(maximum) < frequency);
+}
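
To make the frequency semantics above concrete, here is a minimal standalone C model (illustrative only, not from the OpenZFS sources): values of 100 or less are treated as legacy whole-number percentages, larger values are scaled against a maximum. SCALE_MAX and rand() stand in for ZI_PERCENTAGE_MAX and spa_get_random(), whose exact value and behavior are assumptions here.

#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <stdint.h>

#define SCALE_MAX 1000000u	/* stand-in for ZI_PERCENTAGE_MAX */

static bool
freq_triggered_model(uint32_t frequency)
{
	if (frequency == 0)	/* zero means always trigger */
		return (true);
	/* values <= 100 are legacy percentages; larger values are scaled */
	uint32_t maximum = (frequency <= 100) ? 100 : SCALE_MAX;
	return ((uint32_t)(rand() % maximum) < frequency);
}

int
main(void)
{
	int hits = 0;
	for (int i = 0; i < 100000; i++)
		hits += freq_triggered_model(25);	/* legacy 25% */
	printf("triggered ~%d%% of the time\n", hits / 1000);
	return (0);
}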
+
+/*
+ * Returns true if the given record matches the I/O in progress.
+ */
+static boolean_t
+zio_match_handler(const zbookmark_phys_t *zb, uint64_t type, int dva,
+ zinject_record_t *record, int error)
+{
+ /*
+ * Check for a match against the MOS, which is based on type
+ */
+ if (zb->zb_objset == DMU_META_OBJSET &&
+ record->zi_objset == DMU_META_OBJSET &&
+ record->zi_object == DMU_META_DNODE_OBJECT) {
+ if (record->zi_type == DMU_OT_NONE ||
+ type == record->zi_type)
+ return (freq_triggered(record->zi_freq));
+ else
+ return (B_FALSE);
+ }
+
+ /*
+ * Check for an exact match.
+ */
+ if (zb->zb_objset == record->zi_objset &&
+ zb->zb_object == record->zi_object &&
+ zb->zb_level == record->zi_level &&
+ zb->zb_blkid >= record->zi_start &&
+ zb->zb_blkid <= record->zi_end &&
+ (record->zi_dvas == 0 || (record->zi_dvas & (1ULL << dva))) &&
+ error == record->zi_error) {
+ return (freq_triggered(record->zi_freq));
+ }
+
+ return (B_FALSE);
+}
+
+/*
+ * Panic the system when a config change happens in the function
+ * specified by tag.
+ */
+void
+zio_handle_panic_injection(spa_t *spa, char *tag, uint64_t type)
+{
+ inject_handler_t *handler;
+
+ rw_enter(&inject_lock, RW_READER);
+
+ for (handler = list_head(&inject_handlers); handler != NULL;
+ handler = list_next(&inject_handlers, handler)) {
+
+ if (spa != handler->zi_spa)
+ continue;
+
+ if (handler->zi_record.zi_type == type &&
+ strcmp(tag, handler->zi_record.zi_func) == 0)
+ panic("Panic requested in function %s\n", tag);
+ }
+
+ rw_exit(&inject_lock);
+}
+
+/*
+ * Inject a decryption failure. Decryption failures can occur in
+ * both the ARC and the ZIO layers.
+ */
+int
+zio_handle_decrypt_injection(spa_t *spa, const zbookmark_phys_t *zb,
+ uint64_t type, int error)
+{
+ int ret = 0;
+ inject_handler_t *handler;
+
+ rw_enter(&inject_lock, RW_READER);
+
+ for (handler = list_head(&inject_handlers); handler != NULL;
+ handler = list_next(&inject_handlers, handler)) {
+
+ if (spa != handler->zi_spa ||
+ handler->zi_record.zi_cmd != ZINJECT_DECRYPT_FAULT)
+ continue;
+
+ if (zio_match_handler(zb, type, ZI_NO_DVA,
+ &handler->zi_record, error)) {
+ ret = error;
+ break;
+ }
+ }
+
+ rw_exit(&inject_lock);
+ return (ret);
+}
+
+/*
+ * If this is a physical I/O for a vdev child, determine which DVA it is
+ * for. We iterate backwards through the DVAs matching on the offset so
+ * that we end up with ZI_NO_DVA (-1) if we don't find a match.
+ */
+static int
+zio_match_dva(zio_t *zio)
+{
+ int i = ZI_NO_DVA;
+
+ if (zio->io_bp != NULL && zio->io_vd != NULL &&
+ zio->io_child_type == ZIO_CHILD_VDEV) {
+ for (i = BP_GET_NDVAS(zio->io_bp) - 1; i >= 0; i--) {
+ dva_t *dva = &zio->io_bp->blk_dva[i];
+ uint64_t off = DVA_GET_OFFSET(dva);
+ vdev_t *vd = vdev_lookup_top(zio->io_spa,
+ DVA_GET_VDEV(dva));
+
+ /* Compensate for vdev label added to leaves */
+ if (zio->io_vd->vdev_ops->vdev_op_leaf)
+ off += VDEV_LABEL_START_SIZE;
+
+ if (zio->io_vd == vd && zio->io_offset == off)
+ break;
+ }
+ }
+
+ return (i);
+}
+
+
+/*
+ * Determine if the I/O in question should return failure. Returns the errno
+ * to be returned to the caller.
+ */
+int
+zio_handle_fault_injection(zio_t *zio, int error)
+{
+ int ret = 0;
+ inject_handler_t *handler;
+
+ /*
+ * Ignore I/O not associated with any logical data.
+ */
+ if (zio->io_logical == NULL)
+ return (0);
+
+ /*
+ * Currently, we only support fault injection on reads.
+ */
+ if (zio->io_type != ZIO_TYPE_READ)
+ return (0);
+
+ /*
+ * A rebuild I/O has no checksum to verify.
+ */
+ if (zio->io_priority == ZIO_PRIORITY_REBUILD && error == ECKSUM)
+ return (0);
+
+ rw_enter(&inject_lock, RW_READER);
+
+ for (handler = list_head(&inject_handlers); handler != NULL;
+ handler = list_next(&inject_handlers, handler)) {
+ if (zio->io_spa != handler->zi_spa ||
+ handler->zi_record.zi_cmd != ZINJECT_DATA_FAULT)
+ continue;
+
+ /* If this handler matches, return the specified error */
+ if (zio_match_handler(&zio->io_logical->io_bookmark,
+ zio->io_bp ? BP_GET_TYPE(zio->io_bp) : DMU_OT_NONE,
+ zio_match_dva(zio), &handler->zi_record, error)) {
+ ret = error;
+ break;
+ }
+ }
+
+ rw_exit(&inject_lock);
+
+ return (ret);
+}
+
+/*
+ * Determine if the zio is part of a label update and has an injection
+ * handler associated with that portion of the label. Currently, we
+ * allow error injection in either the nvlist or the uberblock region
+ * of the vdev label.
+ */
+int
+zio_handle_label_injection(zio_t *zio, int error)
+{
+ inject_handler_t *handler;
+ vdev_t *vd = zio->io_vd;
+ uint64_t offset = zio->io_offset;
+ int label;
+ int ret = 0;
+
+ if (offset >= VDEV_LABEL_START_SIZE &&
+ offset < vd->vdev_psize - VDEV_LABEL_END_SIZE)
+ return (0);
+
+ rw_enter(&inject_lock, RW_READER);
+
+ for (handler = list_head(&inject_handlers); handler != NULL;
+ handler = list_next(&inject_handlers, handler)) {
+ uint64_t start = handler->zi_record.zi_start;
+ uint64_t end = handler->zi_record.zi_end;
+
+ if (handler->zi_record.zi_cmd != ZINJECT_LABEL_FAULT)
+ continue;
+
+ /*
+ * The injection region is the relative offsets within a
+ * vdev label. We must determine the label which is being
+ * updated and adjust our region accordingly.
+ */
+ label = vdev_label_number(vd->vdev_psize, offset);
+ start = vdev_label_offset(vd->vdev_psize, label, start);
+ end = vdev_label_offset(vd->vdev_psize, label, end);
+
+ if (zio->io_vd->vdev_guid == handler->zi_record.zi_guid &&
+ (offset >= start && offset <= end)) {
+ ret = error;
+ break;
+ }
+ }
+ rw_exit(&inject_lock);
+ return (ret);
+}
+
+/*ARGSUSED*/
+static int
+zio_inject_bitflip_cb(void *data, size_t len, void *private)
+{
+ zio_t *zio __maybe_unused = private;
+ uint8_t *buffer = data;
+ uint_t byte = spa_get_random(len);
+
+ ASSERT(zio->io_type == ZIO_TYPE_READ);
+
+ /* flip a single random bit in an abd data buffer */
+ buffer[byte] ^= 1 << spa_get_random(8);
+
+ return (1); /* stop after first flip */
+}
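
A standalone model of the bit-flip callback, for illustration only: rand() stands in for spa_get_random() and the buffer is a plain array rather than an abd.

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

/* Flip a single randomly chosen bit in a randomly chosen byte. */
static void
flip_one_bit(unsigned char *buf, size_t len)
{
	size_t byte = (size_t)rand() % len;
	buf[byte] ^= (unsigned char)(1 << (rand() % 8));
}

int
main(void)
{
	unsigned char buf[8] = { 0 };

	srand((unsigned)time(NULL));
	flip_one_bit(buf, sizeof (buf));
	for (size_t i = 0; i < sizeof (buf); i++)
		printf("%02x ", buf[i]);
	printf("\n");	/* exactly one bit is set */
	return (0);
}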
+
+static int
+zio_handle_device_injection_impl(vdev_t *vd, zio_t *zio, int err1, int err2)
+{
+ inject_handler_t *handler;
+ int ret = 0;
+
+ /*
+ * We skip over faults in the labels unless it's during
+ * device open (i.e. zio == NULL).
+ */
+ if (zio != NULL) {
+ uint64_t offset = zio->io_offset;
+
+ if (offset < VDEV_LABEL_START_SIZE ||
+ offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE)
+ return (0);
+ }
+
+ rw_enter(&inject_lock, RW_READER);
+
+ for (handler = list_head(&inject_handlers); handler != NULL;
+ handler = list_next(&inject_handlers, handler)) {
+
+ if (handler->zi_record.zi_cmd != ZINJECT_DEVICE_FAULT)
+ continue;
+
+ if (vd->vdev_guid == handler->zi_record.zi_guid) {
+ if (handler->zi_record.zi_failfast &&
+ (zio == NULL || (zio->io_flags &
+ (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)))) {
+ continue;
+ }
+
+ /* Handle type specific I/O failures */
+ if (zio != NULL &&
+ handler->zi_record.zi_iotype != ZIO_TYPES &&
+ handler->zi_record.zi_iotype != zio->io_type)
+ continue;
+
+ if (handler->zi_record.zi_error == err1 ||
+ handler->zi_record.zi_error == err2) {
+ /*
+ * limit error injection if requested
+ */
+ if (!freq_triggered(handler->zi_record.zi_freq))
+ continue;
+
+ /*
+ * For a failed open, pretend like the device
+ * has gone away.
+ */
+ if (err1 == ENXIO)
+ vd->vdev_stat.vs_aux =
+ VDEV_AUX_OPEN_FAILED;
+
+ /*
+ * Treat these errors as if they had been
+ * retried so that all the appropriate stats
+ * and FMA events are generated.
+ */
+ if (!handler->zi_record.zi_failfast &&
+ zio != NULL)
+ zio->io_flags |= ZIO_FLAG_IO_RETRY;
+
+ /*
+ * EILSEQ means flip a bit after a read
+ */
+ if (handler->zi_record.zi_error == EILSEQ) {
+ if (zio == NULL)
+ break;
+
+ /* locate buffer data and flip a bit */
+ (void) abd_iterate_func(zio->io_abd, 0,
+ zio->io_size, zio_inject_bitflip_cb,
+ zio);
+ break;
+ }
+
+ ret = handler->zi_record.zi_error;
+ break;
+ }
+ if (handler->zi_record.zi_error == ENXIO) {
+ ret = SET_ERROR(EIO);
+ break;
+ }
+ }
+ }
+
+ rw_exit(&inject_lock);
+
+ return (ret);
+}
+
+int
+zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error)
+{
+ return (zio_handle_device_injection_impl(vd, zio, error, INT_MAX));
+}
+
+int
+zio_handle_device_injections(vdev_t *vd, zio_t *zio, int err1, int err2)
+{
+ return (zio_handle_device_injection_impl(vd, zio, err1, err2));
+}
+
+/*
+ * Simulate hardware that ignores cache flushes. For the requested number
+ * of seconds, skip the actual writing to disk.
+ */
+void
+zio_handle_ignored_writes(zio_t *zio)
+{
+ inject_handler_t *handler;
+
+ rw_enter(&inject_lock, RW_READER);
+
+ for (handler = list_head(&inject_handlers); handler != NULL;
+ handler = list_next(&inject_handlers, handler)) {
+
+ /* Ignore errors not destined for this pool */
+ if (zio->io_spa != handler->zi_spa ||
+ handler->zi_record.zi_cmd != ZINJECT_IGNORED_WRITES)
+ continue;
+
+ /*
+ * Positive duration implies # of seconds, negative
+ * a number of txgs
+ */
+ if (handler->zi_record.zi_timer == 0) {
+ if (handler->zi_record.zi_duration > 0)
+ handler->zi_record.zi_timer = ddi_get_lbolt64();
+ else
+ handler->zi_record.zi_timer = zio->io_txg;
+ }
+
+ /* Have a "problem" writing 60% of the time */
+ if (spa_get_random(100) < 60)
+ zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
+ break;
+ }
+
+ rw_exit(&inject_lock);
+}
+
+void
+spa_handle_ignored_writes(spa_t *spa)
+{
+ inject_handler_t *handler;
+
+ if (zio_injection_enabled == 0)
+ return;
+
+ rw_enter(&inject_lock, RW_READER);
+
+ for (handler = list_head(&inject_handlers); handler != NULL;
+ handler = list_next(&inject_handlers, handler)) {
+
+ if (spa != handler->zi_spa ||
+ handler->zi_record.zi_cmd != ZINJECT_IGNORED_WRITES)
+ continue;
+
+ if (handler->zi_record.zi_duration > 0) {
+ VERIFY(handler->zi_record.zi_timer == 0 ||
+ ddi_time_after64(
+ (int64_t)handler->zi_record.zi_timer +
+ handler->zi_record.zi_duration * hz,
+ ddi_get_lbolt64()));
+ } else {
+ /* duration is negative so the subtraction here adds */
+ VERIFY(handler->zi_record.zi_timer == 0 ||
+ handler->zi_record.zi_timer -
+ handler->zi_record.zi_duration >=
+ spa_syncing_txg(spa));
+ }
+ }
+
+ rw_exit(&inject_lock);
+}
+
+hrtime_t
+zio_handle_io_delay(zio_t *zio)
+{
+ vdev_t *vd = zio->io_vd;
+ inject_handler_t *min_handler = NULL;
+ hrtime_t min_target = 0;
+
+ rw_enter(&inject_lock, RW_READER);
+
+ /*
+ * inject_delay_count is a subset of zio_injection_enabled that
+ * is only incremented for delay handlers. These checks are
+ * mainly added to remind the reader why we're not explicitly
+ * checking zio_injection_enabled like the other functions.
+ */
+ IMPLY(inject_delay_count > 0, zio_injection_enabled > 0);
+ IMPLY(zio_injection_enabled == 0, inject_delay_count == 0);
+
+ /*
+ * If there aren't any inject delay handlers registered, then we
+ * can short circuit and simply return 0 here. A value of zero
+ * informs zio_delay_interrupt() that this request should not be
+ * delayed. This short circuit keeps us from acquiring the
+	 * inject_delay_mtx unnecessarily.
+ */
+ if (inject_delay_count == 0) {
+ rw_exit(&inject_lock);
+ return (0);
+ }
+
+ /*
+ * Each inject handler has a number of "lanes" associated with
+ * it. Each lane is able to handle requests independently of one
+ * another, and at a latency defined by the inject handler
+	 * record's zi_timer field. Thus if a handler is configured with
+ * a single lane with a 10ms latency, it will delay requests
+ * such that only a single request is completed every 10ms. So,
+ * if more than one request is attempted per each 10ms interval,
+ * the average latency of the requests will be greater than
+ * 10ms; but if only a single request is submitted each 10ms
+ * interval the average latency will be 10ms.
+ *
+ * We need to acquire this mutex to prevent multiple concurrent
+ * threads being assigned to the same lane of a given inject
+ * handler. The mutex allows us to perform the following two
+ * operations atomically:
+ *
+ * 1. determine the minimum handler and minimum target
+ * value of all the possible handlers
+ * 2. update that minimum handler's lane array
+ *
+ * Without atomicity, two (or more) threads could pick the same
+ * lane in step (1), and then conflict with each other in step
+ * (2). This could allow a single lane handler to process
+ * multiple requests simultaneously, which shouldn't be possible.
+ */
+ mutex_enter(&inject_delay_mtx);
+
+ for (inject_handler_t *handler = list_head(&inject_handlers);
+ handler != NULL; handler = list_next(&inject_handlers, handler)) {
+ if (handler->zi_record.zi_cmd != ZINJECT_DELAY_IO)
+ continue;
+
+ if (!freq_triggered(handler->zi_record.zi_freq))
+ continue;
+
+ if (vd->vdev_guid != handler->zi_record.zi_guid)
+ continue;
+
+ /*
+ * Defensive; should never happen as the array allocation
+ * occurs prior to inserting this handler on the list.
+ */
+ ASSERT3P(handler->zi_lanes, !=, NULL);
+
+ /*
+ * This should never happen, the zinject command should
+ * prevent a user from setting an IO delay with zero lanes.
+ */
+ ASSERT3U(handler->zi_record.zi_nlanes, !=, 0);
+
+ ASSERT3U(handler->zi_record.zi_nlanes, >,
+ handler->zi_next_lane);
+
+ /*
+ * We want to issue this IO to the lane that will become
+ * idle the soonest, so we compare the soonest this
+ * specific handler can complete the IO with all other
+ * handlers, to find the lowest value of all possible
+ * lanes. We then use this lane to submit the request.
+ *
+ * Since each handler has a constant value for its
+ * delay, we can just use the "next" lane for that
+ * handler; as it will always be the lane with the
+ * lowest value for that particular handler (i.e. the
+ * lane that will become idle the soonest). This saves a
+ * scan of each handler's lanes array.
+ *
+ * There's two cases to consider when determining when
+ * this specific IO request should complete. If this
+ * lane is idle, we want to "submit" the request now so
+ * it will complete after zi_timer milliseconds. Thus,
+ * we set the target to now + zi_timer.
+ *
+ * If the lane is busy, we want this request to complete
+ * zi_timer milliseconds after the lane becomes idle.
+ * Since the 'zi_lanes' array holds the time at which
+ * each lane will become idle, we use that value to
+ * determine when this request should complete.
+ */
+ hrtime_t idle = handler->zi_record.zi_timer + gethrtime();
+ hrtime_t busy = handler->zi_record.zi_timer +
+ handler->zi_lanes[handler->zi_next_lane];
+ hrtime_t target = MAX(idle, busy);
+
+ if (min_handler == NULL) {
+ min_handler = handler;
+ min_target = target;
+ continue;
+ }
+
+ ASSERT3P(min_handler, !=, NULL);
+ ASSERT3U(min_target, !=, 0);
+
+ /*
+ * We don't yet increment the "next lane" variable since
+ * we still might find a lower value lane in another
+ * handler during any remaining iterations. Once we're
+ * sure we've selected the absolute minimum, we'll claim
+ * the lane and increment the handler's "next lane"
+ * field below.
+ */
+
+ if (target < min_target) {
+ min_handler = handler;
+ min_target = target;
+ }
+ }
+
+ /*
+ * 'min_handler' will be NULL if no IO delays are registered for
+ * this vdev, otherwise it will point to the handler containing
+ * the lane that will become idle the soonest.
+ */
+ if (min_handler != NULL) {
+ ASSERT3U(min_target, !=, 0);
+ min_handler->zi_lanes[min_handler->zi_next_lane] = min_target;
+
+ /*
+ * If we've used all possible lanes for this handler,
+ * loop back and start using the first lane again;
+ * otherwise, just increment the lane index.
+ */
+ min_handler->zi_next_lane = (min_handler->zi_next_lane + 1) %
+ min_handler->zi_record.zi_nlanes;
+ }
+
+ mutex_exit(&inject_delay_mtx);
+ rw_exit(&inject_lock);
+
+ return (min_target);
+}
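
The lane bookkeeping described in the comments above can be modeled in a few lines of standalone C. This sketch covers a single handler only (the kernel code additionally picks the minimum target across all matching handlers); handler_t and the plain integer timestamps are simplifications for the demo, not ZFS types.

#include <stdio.h>
#include <stdint.h>

#define NLANES 2

typedef struct {
	int64_t delay;		/* per-request latency (zi_timer analogue) */
	int64_t lanes[NLANES];	/* time at which each lane becomes idle */
	int next_lane;
} handler_t;

/* Pick the completion target for one request and claim the lane. */
static int64_t
submit(handler_t *h, int64_t now)
{
	int64_t idle = now + h->delay;
	int64_t busy = h->lanes[h->next_lane] + h->delay;
	int64_t target = (idle > busy) ? idle : busy;

	h->lanes[h->next_lane] = target;
	h->next_lane = (h->next_lane + 1) % NLANES;
	return (target);
}

int
main(void)
{
	handler_t h = { .delay = 10, .lanes = { 0, 0 }, .next_lane = 0 };

	/*
	 * Three back-to-back requests at t=0: two lanes finish at t=10,
	 * the third must wait for a lane and finishes at t=20.
	 */
	for (int i = 0; i < 3; i++)
		printf("request %d completes at t=%lld\n", i,
		    (long long)submit(&h, 0));
	return (0);
}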
+
+static int
+zio_calculate_range(const char *pool, zinject_record_t *record)
+{
+ dsl_pool_t *dp;
+ dsl_dataset_t *ds;
+ objset_t *os = NULL;
+ dnode_t *dn = NULL;
+ int error;
+
+ /*
+ * Obtain the dnode for object using pool, objset, and object
+ */
+ error = dsl_pool_hold(pool, FTAG, &dp);
+ if (error)
+ return (error);
+
+ error = dsl_dataset_hold_obj(dp, record->zi_objset, FTAG, &ds);
+ dsl_pool_rele(dp, FTAG);
+ if (error)
+ return (error);
+
+ error = dmu_objset_from_ds(ds, &os);
+ dsl_dataset_rele(ds, FTAG);
+ if (error)
+ return (error);
+
+ error = dnode_hold(os, record->zi_object, FTAG, &dn);
+ if (error)
+ return (error);
+
+ /*
+ * Translate the range into block IDs
+ */
+ if (record->zi_start != 0 || record->zi_end != -1ULL) {
+ record->zi_start >>= dn->dn_datablkshift;
+ record->zi_end >>= dn->dn_datablkshift;
+ }
+ if (record->zi_level > 0) {
+ if (record->zi_level >= dn->dn_nlevels) {
+ dnode_rele(dn, FTAG);
+ return (SET_ERROR(EDOM));
+ }
+
+ if (record->zi_start != 0 || record->zi_end != 0) {
+ int shift = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+
+ for (int level = record->zi_level; level > 0; level--) {
+ record->zi_start >>= shift;
+ record->zi_end >>= shift;
+ }
+ }
+ }
+
+ dnode_rele(dn, FTAG);
+ return (0);
+}
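
A standalone model of the byte-range translation above, for illustration only. The shift values (17 for a 128K data block, and 17 - 7 = 10 block pointers per 128K indirect block, assuming 128-byte block pointers) are example assumptions rather than values read from any pool.

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	uint64_t start = 1 << 20, end = 4 << 20;	/* byte range 1M-4M */
	int datablkshift = 17;		/* 128K data blocks (assumed) */
	int indshift = 17 - 7;		/* blkptrs per indirect block (assumed) */
	int level = 1;			/* inject at L1 indirect blocks */

	/* bytes -> L0 block IDs */
	start >>= datablkshift;
	end >>= datablkshift;

	/* L0 block IDs -> block IDs at the requested indirect level */
	for (int l = level; l > 0; l--) {
		start >>= indshift;
		end >>= indshift;
	}
	/* the whole 1M-4M range falls under L1 block 0 in this example */
	printf("L%d blkid range: %llu-%llu\n", level,
	    (unsigned long long)start, (unsigned long long)end);
	return (0);
}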
+
+/*
+ * Create a new handler for the given record. We add it to the list, adding
+ * a reference to the spa_t in the process. We increment zio_injection_enabled,
+ * which is the switch to trigger all fault injection.
+ */
+int
+zio_inject_fault(char *name, int flags, int *id, zinject_record_t *record)
+{
+ inject_handler_t *handler;
+ int error;
+ spa_t *spa;
+
+ /*
+ * If this is pool-wide metadata, make sure we unload the corresponding
+ * spa_t, so that the next attempt to load it will trigger the fault.
+ * We call spa_reset() to unload the pool appropriately.
+ */
+ if (flags & ZINJECT_UNLOAD_SPA)
+ if ((error = spa_reset(name)) != 0)
+ return (error);
+
+ if (record->zi_cmd == ZINJECT_DELAY_IO) {
+ /*
+ * A value of zero for the number of lanes or for the
+ * delay time doesn't make sense.
+ */
+ if (record->zi_timer == 0 || record->zi_nlanes == 0)
+ return (SET_ERROR(EINVAL));
+
+ /*
+ * The number of lanes is directly mapped to the size of
+ * an array used by the handler. Thus, to ensure the
+ * user doesn't trigger an allocation that's "too large"
+ * we cap the number of lanes here.
+ */
+ if (record->zi_nlanes >= UINT16_MAX)
+ return (SET_ERROR(EINVAL));
+ }
+
+ /*
+ * If the supplied range was in bytes -- calculate the actual blkid
+ */
+ if (flags & ZINJECT_CALC_RANGE) {
+ error = zio_calculate_range(name, record);
+ if (error != 0)
+ return (error);
+ }
+
+ if (!(flags & ZINJECT_NULL)) {
+ /*
+		 * spa_inject_addref() will add an injection reference, which will
+ * prevent the pool from being removed from the namespace while
+ * still allowing it to be unloaded.
+ */
+ if ((spa = spa_inject_addref(name)) == NULL)
+ return (SET_ERROR(ENOENT));
+
+ handler = kmem_alloc(sizeof (inject_handler_t), KM_SLEEP);
+
+ handler->zi_spa = spa;
+ handler->zi_record = *record;
+
+ if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) {
+ handler->zi_lanes = kmem_zalloc(
+ sizeof (*handler->zi_lanes) *
+ handler->zi_record.zi_nlanes, KM_SLEEP);
+ handler->zi_next_lane = 0;
+ } else {
+ handler->zi_lanes = NULL;
+ handler->zi_next_lane = 0;
+ }
+
+ rw_enter(&inject_lock, RW_WRITER);
+
+ /*
+ * We can't move this increment into the conditional
+ * above because we need to hold the RW_WRITER lock of
+ * inject_lock, and we don't want to hold that while
+ * allocating the handler's zi_lanes array.
+ */
+ if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) {
+ ASSERT3S(inject_delay_count, >=, 0);
+ inject_delay_count++;
+ ASSERT3S(inject_delay_count, >, 0);
+ }
+
+ *id = handler->zi_id = inject_next_id++;
+ list_insert_tail(&inject_handlers, handler);
+ atomic_inc_32(&zio_injection_enabled);
+
+ rw_exit(&inject_lock);
+ }
+
+ /*
+ * Flush the ARC, so that any attempts to read this data will end up
+ * going to the ZIO layer. Note that this is a little overkill, but
+ * we don't have the necessary ARC interfaces to do anything else, and
+ * fault injection isn't a performance critical path.
+ */
+ if (flags & ZINJECT_FLUSH_ARC)
+ /*
+ * We must use FALSE to ensure arc_flush returns, since
+ * we're not preventing concurrent ARC insertions.
+ */
+ arc_flush(NULL, FALSE);
+
+ return (0);
+}
+
+/*
+ * Returns the next record with an ID greater than that supplied to the
+ * function. Used to iterate over all handlers in the system.
+ */
+int
+zio_inject_list_next(int *id, char *name, size_t buflen,
+ zinject_record_t *record)
+{
+ inject_handler_t *handler;
+ int ret;
+
+ mutex_enter(&spa_namespace_lock);
+ rw_enter(&inject_lock, RW_READER);
+
+ for (handler = list_head(&inject_handlers); handler != NULL;
+ handler = list_next(&inject_handlers, handler))
+ if (handler->zi_id > *id)
+ break;
+
+ if (handler) {
+ *record = handler->zi_record;
+ *id = handler->zi_id;
+ (void) strncpy(name, spa_name(handler->zi_spa), buflen);
+ ret = 0;
+ } else {
+ ret = SET_ERROR(ENOENT);
+ }
+
+ rw_exit(&inject_lock);
+ mutex_exit(&spa_namespace_lock);
+
+ return (ret);
+}
+
+/*
+ * Clear the fault handler with the given identifier, or return ENOENT if none
+ * exists.
+ */
+int
+zio_clear_fault(int id)
+{
+ inject_handler_t *handler;
+
+ rw_enter(&inject_lock, RW_WRITER);
+
+ for (handler = list_head(&inject_handlers); handler != NULL;
+ handler = list_next(&inject_handlers, handler))
+ if (handler->zi_id == id)
+ break;
+
+ if (handler == NULL) {
+ rw_exit(&inject_lock);
+ return (SET_ERROR(ENOENT));
+ }
+
+ if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) {
+ ASSERT3S(inject_delay_count, >, 0);
+ inject_delay_count--;
+ ASSERT3S(inject_delay_count, >=, 0);
+ }
+
+ list_remove(&inject_handlers, handler);
+ rw_exit(&inject_lock);
+
+ if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) {
+ ASSERT3P(handler->zi_lanes, !=, NULL);
+ kmem_free(handler->zi_lanes, sizeof (*handler->zi_lanes) *
+ handler->zi_record.zi_nlanes);
+ } else {
+ ASSERT3P(handler->zi_lanes, ==, NULL);
+ }
+
+ spa_inject_delref(handler->zi_spa);
+ kmem_free(handler, sizeof (inject_handler_t));
+ atomic_dec_32(&zio_injection_enabled);
+
+ return (0);
+}
+
+void
+zio_inject_init(void)
+{
+ rw_init(&inject_lock, NULL, RW_DEFAULT, NULL);
+ mutex_init(&inject_delay_mtx, NULL, MUTEX_DEFAULT, NULL);
+ list_create(&inject_handlers, sizeof (inject_handler_t),
+ offsetof(inject_handler_t, zi_link));
+}
+
+void
+zio_inject_fini(void)
+{
+ list_destroy(&inject_handlers);
+ mutex_destroy(&inject_delay_mtx);
+ rw_destroy(&inject_lock);
+}
+
+#if defined(_KERNEL)
+EXPORT_SYMBOL(zio_injection_enabled);
+EXPORT_SYMBOL(zio_inject_fault);
+EXPORT_SYMBOL(zio_inject_list_next);
+EXPORT_SYMBOL(zio_clear_fault);
+EXPORT_SYMBOL(zio_handle_fault_injection);
+EXPORT_SYMBOL(zio_handle_device_injection);
+EXPORT_SYMBOL(zio_handle_label_injection);
+#endif
diff --git a/sys/contrib/openzfs/module/zfs/zle.c b/sys/contrib/openzfs/module/zfs/zle.c
new file mode 100644
index 000000000000..0decebb13ca7
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/zle.c
@@ -0,0 +1,91 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * Zero-length encoding. This is a fast and simple algorithm to eliminate
+ * runs of zeroes. Each chunk of compressed data begins with a length byte, b.
+ * If b < n (where n is the compression parameter) then the next b + 1 bytes
+ * are literal values. If b >= n then the next (b - n + 1) bytes are zero.
+ */
+#include <sys/types.h>
+#include <sys/sysmacros.h>
+#include <sys/zio_compress.h>
+
+size_t
+zle_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
+{
+ uchar_t *src = s_start;
+ uchar_t *dst = d_start;
+ uchar_t *s_end = src + s_len;
+ uchar_t *d_end = dst + d_len;
+
+ while (src < s_end && dst < d_end - 1) {
+ uchar_t *first = src;
+ uchar_t *len = dst++;
+ if (src[0] == 0) {
+ uchar_t *last = src + (256 - n);
+ while (src < MIN(last, s_end) && src[0] == 0)
+ src++;
+ *len = src - first - 1 + n;
+ } else {
+ uchar_t *last = src + n;
+ if (d_end - dst < n)
+ break;
+ while (src < MIN(last, s_end) - 1 && (src[0] | src[1]))
+ *dst++ = *src++;
+ if (src[0])
+ *dst++ = *src++;
+ *len = src - first - 1;
+ }
+ }
+ return (src == s_end ? dst - (uchar_t *)d_start : s_len);
+}
+
+int
+zle_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
+{
+ uchar_t *src = s_start;
+ uchar_t *dst = d_start;
+ uchar_t *s_end = src + s_len;
+ uchar_t *d_end = dst + d_len;
+
+ while (src < s_end && dst < d_end) {
+ int len = 1 + *src++;
+ if (len <= n) {
+ if (src + len > s_end || dst + len > d_end)
+ return (-1);
+ while (len-- != 0)
+ *dst++ = *src++;
+ } else {
+ len -= n;
+ if (dst + len > d_end)
+ return (-1);
+ while (len-- != 0)
+ *dst++ = 0;
+ }
+ }
+ return (dst == d_end ? 0 : -1);
+}
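
To illustrate the length-byte scheme, here is a standalone decoder fed a hand-built buffer. The parameter n = 64 matches the level ZFS is believed to pass for ZLE elsewhere, but that value is an assumption here; the kernel functions above remain the authoritative implementation.

#include <stdio.h>
#include <string.h>

#define N 64

/* Decode: b < N means "b + 1 literals follow"; b >= N means "b - N + 1 zeros". */
static size_t
zle_decode(const unsigned char *src, size_t s_len, unsigned char *dst)
{
	size_t di = 0;
	for (size_t si = 0; si < s_len; ) {
		int len = 1 + src[si++];
		if (len <= N) {			/* literal run */
			memcpy(&dst[di], &src[si], len);
			si += len;
			di += len;
		} else {			/* zero run */
			memset(&dst[di], 0, len - N);
			di += len - N;
		}
	}
	return (di);
}

int
main(void)
{
	/* "ab", then 5 zeros (length byte 64 + 5 - 1 = 68), then "c" */
	const unsigned char comp[] = { 1, 'a', 'b', 68, 0, 'c' };
	unsigned char out[32];
	size_t n = zle_decode(comp, sizeof (comp), out);

	for (size_t i = 0; i < n; i++)
		printf("%02x ", out[i]);
	printf("\n");	/* expect: 61 62 00 00 00 00 00 63 */
	return (0);
}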
diff --git a/sys/contrib/openzfs/module/zfs/zrlock.c b/sys/contrib/openzfs/module/zfs/zrlock.c
new file mode 100644
index 000000000000..a4def6053622
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/zrlock.c
@@ -0,0 +1,188 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2014, 2015 by Delphix. All rights reserved.
+ * Copyright 2016 The MathWorks, Inc. All rights reserved.
+ */
+
+/*
+ * A Zero Reference Lock (ZRL) is a reference count that can lock out new
+ * references only when the count is zero and only without waiting if the count
+ * is not already zero. It is similar to a read-write lock in that it allows
+ * multiple readers and only a single writer, but it does not allow a writer to
+ * block while waiting for readers to exit, and therefore the question of
+ * reader/writer priority is moot (no WRWANT bit). Since the equivalent of
+ * rw_enter(&lock, RW_WRITER) is disallowed and only tryenter() is allowed, it
+ * is perfectly safe for the same reader to acquire the same lock multiple
+ * times. The fact that a ZRL is reentrant for readers (through multiple calls
+ * to zrl_add()) makes it convenient for determining whether something is
+ * actively referenced without the fuss of flagging lock ownership across
+ * function calls.
+ */
+#include <sys/zrlock.h>
+#include <sys/trace_zfs.h>
+
+/*
+ * A ZRL can be locked only while there are zero references, so ZRL_LOCKED is
+ * treated as zero references.
+ */
+#define ZRL_LOCKED -1
+#define ZRL_DESTROYED -2
+
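A minimal userland model of the zero-reference-lock semantics using C11 atomics, for illustration only. It mirrors the add/remove/tryenter behavior described above but omits the mutex/condvar sleep path that zrl_add_impl() uses when the lock is held; all model_* names are hypothetical.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define MODEL_LOCKED -1

typedef struct { atomic_int refcount; } model_zrl_t;

/* Add a reference; returns false where the kernel version would sleep. */
static bool
model_zrl_add(model_zrl_t *z)
{
	int n = atomic_load(&z->refcount);
	while (n != MODEL_LOCKED) {
		if (atomic_compare_exchange_weak(&z->refcount, &n, n + 1))
			return (true);
	}
	return (false);
}

static void
model_zrl_remove(model_zrl_t *z)
{
	atomic_fetch_sub(&z->refcount, 1);
}

/* Lock out new references, but only if the count is exactly zero. */
static bool
model_zrl_tryenter(model_zrl_t *z)
{
	int expected = 0;
	return (atomic_compare_exchange_strong(&z->refcount, &expected,
	    MODEL_LOCKED));
}

int
main(void)
{
	model_zrl_t z = { 0 };

	(void) model_zrl_add(&z);
	printf("tryenter with 1 ref: %d\n", model_zrl_tryenter(&z));	/* 0 */
	model_zrl_remove(&z);
	printf("tryenter with 0 refs: %d\n", model_zrl_tryenter(&z));	/* 1 */
	printf("add while locked: %d\n", model_zrl_add(&z));		/* 0 */
	return (0);
}
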
+void
+zrl_init(zrlock_t *zrl)
+{
+ mutex_init(&zrl->zr_mtx, NULL, MUTEX_DEFAULT, NULL);
+ zrl->zr_refcount = 0;
+ cv_init(&zrl->zr_cv, NULL, CV_DEFAULT, NULL);
+#ifdef ZFS_DEBUG
+ zrl->zr_owner = NULL;
+ zrl->zr_caller = NULL;
+#endif
+}
+
+void
+zrl_destroy(zrlock_t *zrl)
+{
+ ASSERT0(zrl->zr_refcount);
+
+ mutex_destroy(&zrl->zr_mtx);
+ zrl->zr_refcount = ZRL_DESTROYED;
+ cv_destroy(&zrl->zr_cv);
+}
+
+void
+zrl_add_impl(zrlock_t *zrl, const char *zc)
+{
+ for (;;) {
+ uint32_t n = (uint32_t)zrl->zr_refcount;
+ while (n != ZRL_LOCKED) {
+ uint32_t cas = atomic_cas_32(
+ (uint32_t *)&zrl->zr_refcount, n, n + 1);
+ if (cas == n) {
+ ASSERT3S((int32_t)n, >=, 0);
+#ifdef ZFS_DEBUG
+ if (zrl->zr_owner == curthread) {
+ DTRACE_PROBE3(zrlock__reentry,
+ zrlock_t *, zrl,
+ kthread_t *, curthread,
+ uint32_t, n);
+ }
+ zrl->zr_owner = curthread;
+ zrl->zr_caller = zc;
+#endif
+ return;
+ }
+ n = cas;
+ }
+
+ mutex_enter(&zrl->zr_mtx);
+ while (zrl->zr_refcount == ZRL_LOCKED) {
+ cv_wait(&zrl->zr_cv, &zrl->zr_mtx);
+ }
+ mutex_exit(&zrl->zr_mtx);
+ }
+}
+
+void
+zrl_remove(zrlock_t *zrl)
+{
+ uint32_t n;
+
+#ifdef ZFS_DEBUG
+ if (zrl->zr_owner == curthread) {
+ zrl->zr_owner = NULL;
+ zrl->zr_caller = NULL;
+ }
+#endif
+ n = atomic_dec_32_nv((uint32_t *)&zrl->zr_refcount);
+ ASSERT3S((int32_t)n, >=, 0);
+}
+
+int
+zrl_tryenter(zrlock_t *zrl)
+{
+ uint32_t n = (uint32_t)zrl->zr_refcount;
+
+ if (n == 0) {
+ uint32_t cas = atomic_cas_32(
+ (uint32_t *)&zrl->zr_refcount, 0, ZRL_LOCKED);
+ if (cas == 0) {
+#ifdef ZFS_DEBUG
+ ASSERT3P(zrl->zr_owner, ==, NULL);
+ zrl->zr_owner = curthread;
+#endif
+ return (1);
+ }
+ }
+
+ ASSERT3S((int32_t)n, >, ZRL_DESTROYED);
+
+ return (0);
+}
+
+void
+zrl_exit(zrlock_t *zrl)
+{
+ ASSERT3S(zrl->zr_refcount, ==, ZRL_LOCKED);
+
+ mutex_enter(&zrl->zr_mtx);
+#ifdef ZFS_DEBUG
+ ASSERT3P(zrl->zr_owner, ==, curthread);
+ zrl->zr_owner = NULL;
+ membar_producer(); /* make sure the owner store happens first */
+#endif
+ zrl->zr_refcount = 0;
+ cv_broadcast(&zrl->zr_cv);
+ mutex_exit(&zrl->zr_mtx);
+}
+
+int
+zrl_is_zero(zrlock_t *zrl)
+{
+ ASSERT3S(zrl->zr_refcount, >, ZRL_DESTROYED);
+
+ return (zrl->zr_refcount <= 0);
+}
+
+int
+zrl_is_locked(zrlock_t *zrl)
+{
+ ASSERT3S(zrl->zr_refcount, >, ZRL_DESTROYED);
+
+ return (zrl->zr_refcount == ZRL_LOCKED);
+}
+
+#ifdef ZFS_DEBUG
+kthread_t *
+zrl_owner(zrlock_t *zrl)
+{
+ return (zrl->zr_owner);
+}
+#endif
+
+#if defined(_KERNEL)
+
+EXPORT_SYMBOL(zrl_add_impl);
+EXPORT_SYMBOL(zrl_remove);
+
+#endif
diff --git a/sys/contrib/openzfs/module/zfs/zthr.c b/sys/contrib/openzfs/module/zfs/zthr.c
new file mode 100644
index 000000000000..5ac2e30467e3
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/zthr.c
@@ -0,0 +1,536 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2017, 2020 by Delphix. All rights reserved.
+ */
+
+/*
+ * ZTHR Infrastructure
+ * ===================
+ *
+ * ZTHR threads are used for isolated operations that span multiple txgs
+ * within a SPA. They generally exist from SPA creation/loading and until
+ * the SPA is exported/destroyed. The ideal requirements for an operation
+ * to be modeled with a zthr are the following:
+ *
+ * 1] The operation needs to run over multiple txgs.
+ * 2] There is a single point of reference in memory or on disk that
+ * indicates whether the operation should run/is running or has
+ * stopped.
+ *
+ * If the operation satisfies the above then the following rules guarantee
+ * a certain level of correctness:
+ *
+ * 1] Any thread EXCEPT the zthr changes the work indicator from stopped
+ * to running but not the opposite.
+ * 2] Only the zthr can change the work indicator from running to stopped
+ * (e.g. when it is done) but not the opposite.
+ *
+ * This way a normal zthr cycle should go like this:
+ *
+ * 1] An external thread changes the work indicator from stopped to
+ * running and wakes up the zthr.
+ * 2] The zthr wakes up, checks the indicator and starts working.
+ * 3] When the zthr is done, it changes the indicator to stopped, allowing
+ * a new cycle to start.
+ *
+ * Besides being awakened by other threads, a zthr can be configured
+ * during creation to wake up on its own after a specified interval
+ * [see zthr_create_timer()].
+ *
+ * Note: ZTHR threads are NOT a replacement for generic threads! Please
+ * ensure that they fit your use-case well before using them.
+ *
+ * == ZTHR creation
+ *
+ * Every zthr needs four inputs to start running:
+ *
+ * 1] A user-defined checker function (checkfunc) that decides whether
+ * the zthr should start working or go to sleep. The function should
+ * return TRUE when the zthr needs to work or FALSE to let it sleep,
+ * and should adhere to the following signature:
+ * boolean_t checkfunc_name(void *args, zthr_t *t);
+ *
+ * 2] A user-defined ZTHR function (func) which the zthr executes when
+ * it is not sleeping. The function should adhere to the following
+ * signature type:
+ * void func_name(void *args, zthr_t *t);
+ *
+ * 3] A void args pointer that will be passed to checkfunc and func
+ * implicitly by the infrastructure.
+ *
+ * 4] A name for the thread. This string must be valid for the lifetime
+ * of the zthr.
+ *
+ * The reason why the above API needs two different functions,
+ * instead of one that both checks and does the work, has to do with
+ * the zthr's internal state lock (zthr_state_lock) and the allowed
+ * cancellation windows. We want to hold the zthr_state_lock while
+ * running checkfunc but not while running func. This way the zthr
+ * can be cancelled while doing work and not while checking for work.
+ *
+ * To start a zthr:
+ * zthr_t *zthr_pointer = zthr_create(checkfunc, func, args);
+ * or
+ * zthr_t *zthr_pointer = zthr_create_timer(checkfunc, func,
+ * args, max_sleep);
+ *
+ * After that you should be able to wakeup, cancel, and resume the
+ * zthr from another thread using the zthr_pointer.
+ *
+ * NOTE: ZTHR threads could potentially wake up spuriously and the
+ * user should take this into account when writing a checkfunc.
+ * [see ZTHR state transitions]
+ *
+ * == ZTHR wakeup
+ *
+ * ZTHR wakeup should be used when new work is added for the zthr. The
+ * sleeping zthr will wakeup, see that it has more work to complete
+ * and proceed. This can be invoked from open or syncing context.
+ *
+ * To wakeup a zthr:
+ * zthr_wakeup(zthr_t *t)
+ *
+ * == ZTHR cancellation and resumption
+ *
+ * ZTHR threads must be cancelled when their SPA is being exported
+ * or when they need to be paused so they don't interfere with other
+ * operations.
+ *
+ * To cancel a zthr:
+ * zthr_cancel(zthr_pointer);
+ *
+ * To resume it:
+ * zthr_resume(zthr_pointer);
+ *
+ * ZTHR cancel and resume should be invoked in open context during the
+ * lifecycle of the pool as it is imported, exported or destroyed.
+ *
+ * A zthr will implicitly check if it has received a cancellation
+ * signal every time func returns and every time it wakes up [see
+ * ZTHR state transitions below].
+ *
+ * At times, waiting for the zthr's func to finish its job may take
+ * time. This may be very time-consuming for some operations that
+ * need to cancel the SPA's zthrs (e.g spa_export). For this scenario
+ * the user can explicitly make their ZTHR function aware of incoming
+ * cancellation signals using zthr_iscancelled(). A common pattern for
+ * that looks like this:
+ *
+ * int
+ * func_name(void *args, zthr_t *t)
+ * {
+ * ... <unpack args> ...
+ * while (!work_done && !zthr_iscancelled(t)) {
+ * ... <do more work> ...
+ * }
+ * }
+ *
+ * == ZTHR cleanup
+ *
+ * Cancelling a zthr doesn't clean up its metadata (internal locks,
+ * function pointers to func and checkfunc, etc..). This is because
+ * we want to keep them around in case we want to resume the execution
+ * of the zthr later. Similarly for zthrs that exit themselves.
+ *
+ * To completely cleanup a zthr, cancel it first to ensure that it
+ * is not running and then use zthr_destroy().
+ *
+ * == ZTHR state transitions
+ *
+ * zthr creation
+ * +
+ * |
+ * | woke up
+ * | +--------------+ sleep
+ * | | ^
+ * | | |
+ * | | | FALSE
+ * | | |
+ * v v FALSE +
+ * cancelled? +---------> checkfunc?
+ * + ^ +
+ * | | |
+ * | | | TRUE
+ * | | |
+ * | | func returned v
+ * | +---------------+ func
+ * |
+ * | TRUE
+ * |
+ * v
+ * zthr stopped running
+ *
+ * == Implementation of ZTHR requests
+ *
+ * ZTHR cancel and resume are requests on a zthr to change its
+ * internal state. These requests are serialized using the
+ * zthr_request_lock, while changes in its internal state are
+ * protected by the zthr_state_lock. A request will first acquire
+ * the zthr_request_lock and then immediately acquire the
+ * zthr_state_lock. We do this so that incoming requests are
+ * serialized using the request lock, while still allowing us
+ * to use the state lock for thread communication via zthr_cv.
+ *
+ * ZTHR wakeup broadcasts to zthr_cv, causing sleeping threads
+ * to wakeup. It acquires the zthr_state_lock but not the
+ * zthr_request_lock, so that a wakeup on a zthr in the middle
+ * of being cancelled will not block.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/zthr.h>
+
+struct zthr {
+ /* running thread doing the work */
+ kthread_t *zthr_thread;
+
+ /* lock protecting internal data & invariants */
+ kmutex_t zthr_state_lock;
+
+ /* mutex that serializes external requests */
+ kmutex_t zthr_request_lock;
+
+ /* notification mechanism for requests */
+ kcondvar_t zthr_cv;
+
+ /* flag set to true if we are canceling the zthr */
+ boolean_t zthr_cancel;
+
+ /* flag set to true if we are waiting for the zthr to finish */
+ boolean_t zthr_haswaiters;
+ kcondvar_t zthr_wait_cv;
+ /*
+	 * maximum amount of time that the zthr spends sleeping;
+ * if this is 0, the thread doesn't wake up until it gets
+ * signaled.
+ */
+ hrtime_t zthr_sleep_timeout;
+
+ /* consumer-provided callbacks & data */
+ zthr_checkfunc_t *zthr_checkfunc;
+ zthr_func_t *zthr_func;
+ void *zthr_arg;
+ const char *zthr_name;
+};
+
+static void
+zthr_procedure(void *arg)
+{
+ zthr_t *t = arg;
+
+ mutex_enter(&t->zthr_state_lock);
+ ASSERT3P(t->zthr_thread, ==, curthread);
+
+ while (!t->zthr_cancel) {
+ if (t->zthr_checkfunc(t->zthr_arg, t)) {
+ mutex_exit(&t->zthr_state_lock);
+ t->zthr_func(t->zthr_arg, t);
+ mutex_enter(&t->zthr_state_lock);
+ } else {
+ if (t->zthr_sleep_timeout == 0) {
+ cv_wait_idle(&t->zthr_cv, &t->zthr_state_lock);
+ } else {
+ (void) cv_timedwait_idle_hires(&t->zthr_cv,
+ &t->zthr_state_lock, t->zthr_sleep_timeout,
+ MSEC2NSEC(1), 0);
+ }
+ }
+ if (t->zthr_haswaiters) {
+ t->zthr_haswaiters = B_FALSE;
+ cv_broadcast(&t->zthr_wait_cv);
+ }
+ }
+
+ /*
+ * Clear out the kernel thread metadata and notify the
+ * zthr_cancel() thread that we've stopped running.
+ */
+ t->zthr_thread = NULL;
+ t->zthr_cancel = B_FALSE;
+ cv_broadcast(&t->zthr_cv);
+
+ mutex_exit(&t->zthr_state_lock);
+ thread_exit();
+}
+
+zthr_t *
+zthr_create(const char *zthr_name, zthr_checkfunc_t *checkfunc,
+ zthr_func_t *func, void *arg)
+{
+ return (zthr_create_timer(zthr_name, checkfunc,
+ func, arg, (hrtime_t)0));
+}
+
+/*
+ * Create a zthr with a specified maximum sleep time. If the time spent
+ * sleeping exceeds max_sleep, a wakeup is triggered (the check is done and
+ * work starts if required).
+ */
+zthr_t *
+zthr_create_timer(const char *zthr_name, zthr_checkfunc_t *checkfunc,
+ zthr_func_t *func, void *arg, hrtime_t max_sleep)
+{
+ zthr_t *t = kmem_zalloc(sizeof (*t), KM_SLEEP);
+ mutex_init(&t->zthr_state_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&t->zthr_request_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&t->zthr_cv, NULL, CV_DEFAULT, NULL);
+ cv_init(&t->zthr_wait_cv, NULL, CV_DEFAULT, NULL);
+
+ mutex_enter(&t->zthr_state_lock);
+ t->zthr_checkfunc = checkfunc;
+ t->zthr_func = func;
+ t->zthr_arg = arg;
+ t->zthr_sleep_timeout = max_sleep;
+ t->zthr_name = zthr_name;
+
+ t->zthr_thread = thread_create_named(zthr_name, NULL, 0,
+ zthr_procedure, t, 0, &p0, TS_RUN, minclsyspri);
+
+ mutex_exit(&t->zthr_state_lock);
+
+ return (t);
+}
+
+void
+zthr_destroy(zthr_t *t)
+{
+ ASSERT(!MUTEX_HELD(&t->zthr_state_lock));
+ ASSERT(!MUTEX_HELD(&t->zthr_request_lock));
+ VERIFY3P(t->zthr_thread, ==, NULL);
+ mutex_destroy(&t->zthr_request_lock);
+ mutex_destroy(&t->zthr_state_lock);
+ cv_destroy(&t->zthr_cv);
+ cv_destroy(&t->zthr_wait_cv);
+ kmem_free(t, sizeof (*t));
+}
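
A hedged sketch of how a consumer might wire up the API above. Only the zthr_* calls and the callback signatures come from this file; the example_* names, the work list, and the locking (omitted) are illustrative assumptions, and list_is_empty() is assumed to be available via the usual ZFS list API.

#include <sys/zfs_context.h>
#include <sys/zthr.h>

typedef struct example_state {
	list_t	es_work;	/* work indicator: non-empty => keep running */
	zthr_t	*es_zthr;
} example_state_t;

/* checkfunc: returning TRUE makes zthr_procedure() call example_func() */
static boolean_t
example_check(void *arg, zthr_t *t)
{
	example_state_t *es = arg;
	return (!list_is_empty(&es->es_work));
}

/* func: drain the work list, bailing out early if a cancel is pending */
static void
example_func(void *arg, zthr_t *t)
{
	example_state_t *es = arg;

	while (!list_is_empty(&es->es_work) && !zthr_iscancelled(t)) {
		/* ... remove one item and process it (locking omitted) ... */
	}
}

/* typically called at pool load */
static void
example_start(example_state_t *es)
{
	es->es_zthr = zthr_create("z_example", example_check,
	    example_func, es);
}

/* typically called at pool export: stop the thread, then free its metadata */
static void
example_stop(example_state_t *es)
{
	zthr_cancel(es->es_zthr);
	zthr_destroy(es->es_zthr);
	es->es_zthr = NULL;
}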
+
+/*
+ * Wake up the zthr if it is sleeping. If the thread has been cancelled
+ * or is in the process of being cancelled, this is a no-op.
+ */
+void
+zthr_wakeup(zthr_t *t)
+{
+ mutex_enter(&t->zthr_state_lock);
+
+ /*
+ * There are 5 states that we can find the zthr when issuing
+ * this broadcast:
+ *
+ * [1] The common case of the thread being asleep, at which
+ * point the broadcast will wake it up.
+ * [2] The thread has been cancelled. Waking up a cancelled
+ * thread is a no-op. Any work that is still left to be
+ * done should be handled the next time the thread is
+ * resumed.
+ * [3] The thread is doing work and is already up, so this
+ * is basically a no-op.
+ * [4] The thread was just created/resumed, in which case the
+ * behavior is similar to [3].
+ * [5] The thread is in the middle of being cancelled, which
+ * will be a no-op.
+ */
+ cv_broadcast(&t->zthr_cv);
+
+ mutex_exit(&t->zthr_state_lock);
+}
+
+/*
+ * Sends a cancel request to the zthr and blocks until the zthr is
+ * cancelled. If the zthr is not running (e.g. has been cancelled
+ * already), this is a no-op. Note that this function should not be
+ * called from syncing context as it could deadlock with the zthr_func.
+ */
+void
+zthr_cancel(zthr_t *t)
+{
+ mutex_enter(&t->zthr_request_lock);
+ mutex_enter(&t->zthr_state_lock);
+
+ /*
+ * Since we are holding the zthr_state_lock at this point
+ * we can find the state in one of the following 4 states:
+ *
+ * [1] The thread has already been cancelled, therefore
+ * there is nothing for us to do.
+ * [2] The thread is sleeping so we set the flag, broadcast
+ * the CV and wait for it to exit.
+ * [3] The thread is doing work, in which case we just set
+ * the flag and wait for it to finish.
+ * [4] The thread was just created/resumed, in which case
+ * the behavior is similar to [3].
+ *
+ * Since requests are serialized, by the time that we get
+ * control back we expect that the zthr is cancelled and
+ * not running anymore.
+ */
+ if (t->zthr_thread != NULL) {
+ t->zthr_cancel = B_TRUE;
+
+ /* broadcast in case the zthr is sleeping */
+ cv_broadcast(&t->zthr_cv);
+
+ while (t->zthr_thread != NULL)
+ cv_wait(&t->zthr_cv, &t->zthr_state_lock);
+
+ ASSERT(!t->zthr_cancel);
+ }
+
+ mutex_exit(&t->zthr_state_lock);
+ mutex_exit(&t->zthr_request_lock);
+}
+
+/*
+ * Sends a resume request to the supplied zthr. If the zthr is already
+ * running this is a no-op. Note that this function should not be
+ * called from syncing context as it could deadlock with the zthr_func.
+ */
+void
+zthr_resume(zthr_t *t)
+{
+ mutex_enter(&t->zthr_request_lock);
+ mutex_enter(&t->zthr_state_lock);
+
+ ASSERT3P(&t->zthr_checkfunc, !=, NULL);
+ ASSERT3P(&t->zthr_func, !=, NULL);
+ ASSERT(!t->zthr_cancel);
+ ASSERT(!t->zthr_haswaiters);
+
+ /*
+ * There are 4 states that we find the zthr in at this point
+ * given the locks that we hold:
+ *
+ * [1] The zthr was cancelled, so we spawn a new thread for
+ * the zthr (common case).
+ * [2] The zthr is running at which point this is a no-op.
+ * [3] The zthr is sleeping at which point this is a no-op.
+ * [4] The zthr was just spawned at which point this is a
+ * no-op.
+ */
+ if (t->zthr_thread == NULL) {
+ t->zthr_thread = thread_create_named(t->zthr_name, NULL, 0,
+ zthr_procedure, t, 0, &p0, TS_RUN, minclsyspri);
+ }
+
+ mutex_exit(&t->zthr_state_lock);
+ mutex_exit(&t->zthr_request_lock);
+}
+
+/*
+ * This function is intended to be used by the zthr itself
+ * (specifically the zthr_func callback provided) to check
+ * if another thread has signaled it to stop running before
+ * doing some expensive operation.
+ *
+ * returns TRUE if we are in the middle of trying to cancel
+ * this thread.
+ *
+ * returns FALSE otherwise.
+ */
+boolean_t
+zthr_iscancelled(zthr_t *t)
+{
+ ASSERT3P(t->zthr_thread, ==, curthread);
+
+ /*
+ * The majority of the functions here grab zthr_request_lock
+ * first and then zthr_state_lock. This function only grabs
+ * the zthr_state_lock. That is because this function should
+ * only be called from the zthr_func to check if someone has
+ * issued a zthr_cancel() on the thread. If there is a zthr_cancel()
+ * happening concurrently, attempting to grab the request lock
+ * here would result in a deadlock.
+ *
+ * By grabbing only the zthr_state_lock this function is allowed
+ * to run concurrently with a zthr_cancel() request.
+ */
+ mutex_enter(&t->zthr_state_lock);
+ boolean_t cancelled = t->zthr_cancel;
+ mutex_exit(&t->zthr_state_lock);
+ return (cancelled);
+}
+
+/*
+ * Wait for the zthr to finish its current function. Similar to
+ * zthr_iscancelled, you can use zthr_has_waiters to have the zthr_func end
+ * early. Unlike zthr_cancel, the thread is not destroyed. If the zthr was
+ * sleeping or cancelled, return immediately.
+ */
+void
+zthr_wait_cycle_done(zthr_t *t)
+{
+ mutex_enter(&t->zthr_state_lock);
+
+ /*
+ * Since we are holding the zthr_state_lock at this point
+ * we can find the state in one of the following 5 states:
+ *
+	 * [1] The thread has already been cancelled, therefore
+ * there is nothing for us to do.
+ * [2] The thread is sleeping so we set the flag, broadcast
+ * the CV and wait for it to exit.
+ * [3] The thread is doing work, in which case we just set
+ * the flag and wait for it to finish.
+ * [4] The thread was just created/resumed, in which case
+ * the behavior is similar to [3].
+	 * [5] The thread is in the middle of being cancelled, which is
+ * similar to [3]. We'll wait for the cancel, which is
+ * waiting for the zthr func.
+ *
+ * Since requests are serialized, by the time that we get
+	 * control back we expect that the zthr has completed its
+ * zthr_func.
+ */
+ if (t->zthr_thread != NULL) {
+ t->zthr_haswaiters = B_TRUE;
+
+ /* broadcast in case the zthr is sleeping */
+ cv_broadcast(&t->zthr_cv);
+
+ while ((t->zthr_haswaiters) && (t->zthr_thread != NULL))
+ cv_wait(&t->zthr_wait_cv, &t->zthr_state_lock);
+
+ ASSERT(!t->zthr_haswaiters);
+ }
+
+ mutex_exit(&t->zthr_state_lock);
+}
+
+/*
+ * This function is intended to be used by the zthr itself
+ * to check if another thread is waiting on it to finish
+ *
+ * returns TRUE if we have been asked to finish.
+ *
+ * returns FALSE otherwise.
+ */
+boolean_t
+zthr_has_waiters(zthr_t *t)
+{
+ ASSERT3P(t->zthr_thread, ==, curthread);
+
+ mutex_enter(&t->zthr_state_lock);
+
+ /*
+ * Similarly to zthr_iscancelled(), we only grab the
+ * zthr_state_lock so that the zthr itself can use this
+ * to check for the request.
+ */
+ boolean_t has_waiters = t->zthr_haswaiters;
+ mutex_exit(&t->zthr_state_lock);
+ return (has_waiters);
+}
diff --git a/sys/contrib/openzfs/module/zfs/zvol.c b/sys/contrib/openzfs/module/zfs/zvol.c
new file mode 100644
index 000000000000..7c6dae8650c7
--- /dev/null
+++ b/sys/contrib/openzfs/module/zfs/zvol.c
@@ -0,0 +1,1739 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (C) 2008-2010 Lawrence Livermore National Security, LLC.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Rewritten for Linux by Brian Behlendorf <behlendorf1@llnl.gov>.
+ * LLNL-CODE-403049.
+ *
+ * ZFS volume emulation driver.
+ *
+ * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes.
+ * Volumes are accessed through the symbolic links named:
+ *
+ * /dev/<pool_name>/<dataset_name>
+ *
+ * Volumes are persistent through reboot and module load. No user command
+ * needs to be run before opening and using a device.
+ *
+ * Copyright 2014 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2016 Actifio, Inc. All rights reserved.
+ * Copyright (c) 2012, 2019 by Delphix. All rights reserved.
+ */
+
+/*
+ * Note on locking of zvol state structures.
+ *
+ * These structures are used to maintain internal state used to emulate block
+ * devices on top of zvols. In particular, management of device minor number
+ * operations - create, remove, rename, and set_snapdev - involves access to
+ * these structures. The zvol_state_lock is primarily used to protect the
+ * zvol_state_list. The zv->zv_state_lock is used to protect the contents
+ * of the zvol_state_t structures, as well as to make sure that when the
+ * time comes to remove the structure from the list, it is not in use, and
+ * therefore, it can be taken off zvol_state_list and freed.
+ *
+ * The zv_suspend_lock was introduced to allow for suspending I/O to a zvol,
+ * e.g. for the duration of receive and rollback operations. This lock can be
+ * held for significant periods of time. Given that it is undesirable to hold
+ * mutexes for long periods of time, the following lock ordering applies:
+ * - take zvol_state_lock if necessary, to protect zvol_state_list
+ * - take zv_suspend_lock if necessary, by the code path in question
+ * - take zv_state_lock to protect zvol_state_t
+ *
+ * The minor operations are issued to spa->spa_zvol_taskq queues, which are
+ * single-threaded (to preserve the order of minor operations), and are
+ * executed through the zvol_task_cb that dispatches the specific operations.
+ * Therefore, these operations are serialized per pool. Consequently, we can
+ * be certain that for a given zvol there is only one operation at a time in
+ * progress. That is why one can be sure that the zvol_state_t for a given
+ * zvol is first allocated and placed on zvol_state_list, and that subsequent
+ * minor operations for this zvol proceed in the order they were issued.
+ *
+ */
+
+#include <sys/dataset_kstats.h>
+#include <sys/dbuf.h>
+#include <sys/dmu_traverse.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_prop.h>
+#include <sys/dsl_dir.h>
+#include <sys/zap.h>
+#include <sys/zfeature.h>
+#include <sys/zil_impl.h>
+#include <sys/dmu_tx.h>
+#include <sys/zio.h>
+#include <sys/zfs_rlock.h>
+#include <sys/spa_impl.h>
+#include <sys/zvol.h>
+
+#include <sys/zvol_impl.h>
+
+
+unsigned int zvol_inhibit_dev = 0;
+unsigned int zvol_volmode = ZFS_VOLMODE_GEOM;
+
+struct hlist_head *zvol_htable;
+list_t zvol_state_list;
+krwlock_t zvol_state_lock;
+const zvol_platform_ops_t *ops;
+
+typedef enum {
+ ZVOL_ASYNC_REMOVE_MINORS,
+ ZVOL_ASYNC_RENAME_MINORS,
+ ZVOL_ASYNC_SET_SNAPDEV,
+ ZVOL_ASYNC_SET_VOLMODE,
+ ZVOL_ASYNC_MAX
+} zvol_async_op_t;
+
+typedef struct {
+ zvol_async_op_t op;
+ char pool[MAXNAMELEN];
+ char name1[MAXNAMELEN];
+ char name2[MAXNAMELEN];
+ zprop_source_t source;
+ uint64_t value;
+} zvol_task_t;
+
+uint64_t
+zvol_name_hash(const char *name)
+{
+ int i;
+ uint64_t crc = -1ULL;
+ const uint8_t *p = (const uint8_t *)name;
+ ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
+ for (i = 0; i < MAXNAMELEN - 1 && *p; i++, p++) {
+ crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (*p)) & 0xFF];
+ }
+ return (crc);
+}
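
A standalone model of the name hash, for illustration only. The table construction and polynomial mirror what the kernel is assumed to set up for zfs_crc64_table (reflected ECMA-182, 0xC96C5795D7870F42); treat those exact values as assumptions, since they are not shown in this diff.

#include <stdio.h>
#include <stdint.h>

#define POLY 0xC96C5795D7870F42ULL	/* assumed ZFS_CRC64_POLY value */

static uint64_t crc64_table[256];

/* Build the byte-at-a-time table the same way spa_init() is assumed to. */
static void
crc64_init(void)
{
	for (int i = 0; i < 256; i++) {
		uint64_t ct = i;
		for (int j = 0; j < 8; j++)
			ct = (ct >> 1) ^ (-(ct & 1) & POLY);
		crc64_table[i] = ct;
	}
}

static uint64_t
name_hash(const char *name)
{
	uint64_t crc = -1ULL;	/* same seed as zvol_name_hash() */
	for (const uint8_t *p = (const uint8_t *)name; *p != '\0'; p++)
		crc = (crc >> 8) ^ crc64_table[(crc ^ *p) & 0xFF];
	return (crc);
}

int
main(void)
{
	crc64_init();
	printf("hash(tank/vol) = %016llx\n",
	    (unsigned long long)name_hash("tank/vol"));
	return (0);
}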
+
+/*
+ * Find a zvol_state_t given the name and hash generated by zvol_name_hash.
+ * If found, return with zv_suspend_lock and zv_state_lock taken, otherwise,
+ * return (NULL) without taking the locks. The zv_suspend_lock is always taken
+ * before zv_state_lock. The mode argument indicates the mode (including none)
+ * for zv_suspend_lock to be taken.
+ */
+zvol_state_t *
+zvol_find_by_name_hash(const char *name, uint64_t hash, int mode)
+{
+ zvol_state_t *zv;
+ struct hlist_node *p = NULL;
+
+ rw_enter(&zvol_state_lock, RW_READER);
+ hlist_for_each(p, ZVOL_HT_HEAD(hash)) {
+ zv = hlist_entry(p, zvol_state_t, zv_hlink);
+ mutex_enter(&zv->zv_state_lock);
+ if (zv->zv_hash == hash &&
+ strncmp(zv->zv_name, name, MAXNAMELEN) == 0) {
+ /*
+ * this is the right zvol, take the locks in the
+ * right order
+ */
+ if (mode != RW_NONE &&
+ !rw_tryenter(&zv->zv_suspend_lock, mode)) {
+ mutex_exit(&zv->zv_state_lock);
+ rw_enter(&zv->zv_suspend_lock, mode);
+ mutex_enter(&zv->zv_state_lock);
+ /*
+ * zvol cannot be renamed as we continue
+ * to hold zvol_state_lock
+ */
+ ASSERT(zv->zv_hash == hash &&
+ strncmp(zv->zv_name, name, MAXNAMELEN)
+ == 0);
+ }
+ rw_exit(&zvol_state_lock);
+ return (zv);
+ }
+ mutex_exit(&zv->zv_state_lock);
+ }
+ rw_exit(&zvol_state_lock);
+
+ return (NULL);
+}
+
+/*
+ * Find a zvol_state_t given the name.
+ * If found, return with zv_suspend_lock and zv_state_lock taken, otherwise,
+ * return (NULL) without taking the locks. The zv_suspend_lock is always taken
+ * before zv_state_lock. The mode argument indicates the mode (including none)
+ * for zv_suspend_lock to be taken.
+ */
+static zvol_state_t *
+zvol_find_by_name(const char *name, int mode)
+{
+ return (zvol_find_by_name_hash(name, zvol_name_hash(name), mode));
+}
+
+/*
+ * ZFS_IOC_CREATE callback handles dmu zvol and zap object creation.
+ */
+void
+zvol_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx)
+{
+ zfs_creat_t *zct = arg;
+ nvlist_t *nvprops = zct->zct_props;
+ int error;
+ uint64_t volblocksize, volsize;
+
+ VERIFY(nvlist_lookup_uint64(nvprops,
+ zfs_prop_to_name(ZFS_PROP_VOLSIZE), &volsize) == 0);
+ if (nvlist_lookup_uint64(nvprops,
+ zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &volblocksize) != 0)
+ volblocksize = zfs_prop_default_numeric(ZFS_PROP_VOLBLOCKSIZE);
+
+ /*
+ * These properties must be removed from the list so the generic
+ * property setting step won't apply to them.
+ */
+ VERIFY(nvlist_remove_all(nvprops,
+ zfs_prop_to_name(ZFS_PROP_VOLSIZE)) == 0);
+ (void) nvlist_remove_all(nvprops,
+ zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE));
+
+ error = dmu_object_claim(os, ZVOL_OBJ, DMU_OT_ZVOL, volblocksize,
+ DMU_OT_NONE, 0, tx);
+ ASSERT(error == 0);
+
+ error = zap_create_claim(os, ZVOL_ZAP_OBJ, DMU_OT_ZVOL_PROP,
+ DMU_OT_NONE, 0, tx);
+ ASSERT(error == 0);
+
+ error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize, tx);
+ ASSERT(error == 0);
+}
+
+/*
+ * ZFS_IOC_OBJSET_STATS entry point.
+ */
+int
+zvol_get_stats(objset_t *os, nvlist_t *nv)
+{
+ int error;
+ dmu_object_info_t *doi;
+ uint64_t val;
+
+ error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &val);
+ if (error)
+ return (SET_ERROR(error));
+
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLSIZE, val);
+ doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP);
+ error = dmu_object_info(os, ZVOL_OBJ, doi);
+
+ if (error == 0) {
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLBLOCKSIZE,
+ doi->doi_data_block_size);
+ }
+
+ kmem_free(doi, sizeof (dmu_object_info_t));
+
+ return (SET_ERROR(error));
+}
+
+/*
+ * Sanity check volume size.
+ */
+int
+zvol_check_volsize(uint64_t volsize, uint64_t blocksize)
+{
+ if (volsize == 0)
+ return (SET_ERROR(EINVAL));
+
+ if (volsize % blocksize != 0)
+ return (SET_ERROR(EINVAL));
+
+#ifdef _ILP32
+ if (volsize - 1 > SPEC_MAXOFFSET_T)
+ return (SET_ERROR(EOVERFLOW));
+#endif
+ return (0);
+}
+
+/*
+ * Ensure the zap is flushed then inform the VFS of the capacity change.
+ */
+static int
+zvol_update_volsize(uint64_t volsize, objset_t *os)
+{
+ dmu_tx_t *tx;
+ int error;
+ uint64_t txg;
+
+ tx = dmu_tx_create(os);
+ dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
+ dmu_tx_mark_netfree(tx);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ dmu_tx_abort(tx);
+ return (SET_ERROR(error));
+ }
+ txg = dmu_tx_get_txg(tx);
+
+ error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1,
+ &volsize, tx);
+ dmu_tx_commit(tx);
+
+ txg_wait_synced(dmu_objset_pool(os), txg);
+
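+ /* Free any blocks beyond the new end of the volume (when shrinking). */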
+ if (error == 0)
+ error = dmu_free_long_range(os,
+ ZVOL_OBJ, volsize, DMU_OBJECT_END);
+
+ return (error);
+}
+
+/*
+ * Entry point for setting ZFS_PROP_VOLSIZE. Note that modifying the volume
+ * size will result in a udev "change" event being generated.
+ */
+int
+zvol_set_volsize(const char *name, uint64_t volsize)
+{
+ objset_t *os = NULL;
+ uint64_t readonly;
+ int error;
+ boolean_t owned = B_FALSE;
+
+ error = dsl_prop_get_integer(name,
+ zfs_prop_to_name(ZFS_PROP_READONLY), &readonly, NULL);
+ if (error != 0)
+ return (SET_ERROR(error));
+ if (readonly)
+ return (SET_ERROR(EROFS));
+
+ zvol_state_t *zv = zvol_find_by_name(name, RW_READER);
+
+ ASSERT(zv == NULL || (MUTEX_HELD(&zv->zv_state_lock) &&
+ RW_READ_HELD(&zv->zv_suspend_lock)));
+
+ if (zv == NULL || zv->zv_objset == NULL) {
+ if (zv != NULL)
+ rw_exit(&zv->zv_suspend_lock);
+ if ((error = dmu_objset_own(name, DMU_OST_ZVOL, B_FALSE, B_TRUE,
+ FTAG, &os)) != 0) {
+ if (zv != NULL)
+ mutex_exit(&zv->zv_state_lock);
+ return (SET_ERROR(error));
+ }
+ owned = B_TRUE;
+ if (zv != NULL)
+ zv->zv_objset = os;
+ } else {
+ os = zv->zv_objset;
+ }
+
+ dmu_object_info_t *doi = kmem_alloc(sizeof (*doi), KM_SLEEP);
+
+ if ((error = dmu_object_info(os, ZVOL_OBJ, doi)) ||
+ (error = zvol_check_volsize(volsize, doi->doi_data_block_size)))
+ goto out;
+
+ error = zvol_update_volsize(volsize, os);
+ if (error == 0 && zv != NULL) {
+ zv->zv_volsize = volsize;
+ zv->zv_changed = 1;
+ }
+out:
+ kmem_free(doi, sizeof (dmu_object_info_t));
+
+ if (owned) {
+ dmu_objset_disown(os, B_TRUE, FTAG);
+ if (zv != NULL)
+ zv->zv_objset = NULL;
+ } else {
+ rw_exit(&zv->zv_suspend_lock);
+ }
+
+ if (zv != NULL)
+ mutex_exit(&zv->zv_state_lock);
+
+ if (error == 0 && zv != NULL)
+ ops->zv_update_volsize(zv, volsize);
+
+ return (SET_ERROR(error));
+}
+
+/*
+ * Sanity check volume block size.
+ */
+int
+zvol_check_volblocksize(const char *name, uint64_t volblocksize)
+{
+ /* Record sizes above 128k need the feature to be enabled */
+ if (volblocksize > SPA_OLD_MAXBLOCKSIZE) {
+ spa_t *spa;
+ int error;
+
+ if ((error = spa_open(name, &spa, FTAG)) != 0)
+ return (error);
+
+ if (!spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) {
+ spa_close(spa, FTAG);
+ return (SET_ERROR(ENOTSUP));
+ }
+
+ /*
+ * We don't allow setting the property above 1MB,
+ * unless the tunable has been changed.
+ */
+ if (volblocksize > zfs_max_recordsize) {
+ spa_close(spa, FTAG);
+ return (SET_ERROR(EDOM));
+ }
+
+ spa_close(spa, FTAG);
+ }
+
+ if (volblocksize < SPA_MINBLOCKSIZE ||
+ volblocksize > SPA_MAXBLOCKSIZE ||
+ !ISP2(volblocksize))
+ return (SET_ERROR(EDOM));
+
+ return (0);
+}
+
+/*
+ * Entry point for setting ZFS_PROP_VOLBLOCKSIZE.
+ */
+int
+zvol_set_volblocksize(const char *name, uint64_t volblocksize)
+{
+ zvol_state_t *zv;
+ dmu_tx_t *tx;
+ int error;
+
+ zv = zvol_find_by_name(name, RW_READER);
+
+ if (zv == NULL)
+ return (SET_ERROR(ENXIO));
+
+ ASSERT(MUTEX_HELD(&zv->zv_state_lock));
+ ASSERT(RW_READ_HELD(&zv->zv_suspend_lock));
+
+ if (zv->zv_flags & ZVOL_RDONLY) {
+ mutex_exit(&zv->zv_state_lock);
+ rw_exit(&zv->zv_suspend_lock);
+ return (SET_ERROR(EROFS));
+ }
+
+ tx = dmu_tx_create(zv->zv_objset);
+ dmu_tx_hold_bonus(tx, ZVOL_OBJ);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ dmu_tx_abort(tx);
+ } else {
+ error = dmu_object_set_blocksize(zv->zv_objset, ZVOL_OBJ,
+ volblocksize, 0, tx);
+ if (error == ENOTSUP)
+ error = SET_ERROR(EBUSY);
+ dmu_tx_commit(tx);
+ if (error == 0)
+ zv->zv_volblocksize = volblocksize;
+ }
+
+ mutex_exit(&zv->zv_state_lock);
+ rw_exit(&zv->zv_suspend_lock);
+
+ return (SET_ERROR(error));
+}
+
+/*
+ * Replay a TX_TRUNCATE ZIL transaction if asked. TX_TRUNCATE is how we
+ * implement DKIOCFREE/free-long-range.
+ */
+static int
+zvol_replay_truncate(void *arg1, void *arg2, boolean_t byteswap)
+{
+ zvol_state_t *zv = arg1;
+ lr_truncate_t *lr = arg2;
+ uint64_t offset, length;
+
+ if (byteswap)
+ byteswap_uint64_array(lr, sizeof (*lr));
+
+ offset = lr->lr_offset;
+ length = lr->lr_length;
+
+ return (dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, offset, length));
+}
+
+/*
+ * Replay a TX_WRITE ZIL transaction that didn't get committed
+ * after a system failure
+ */
+static int
+zvol_replay_write(void *arg1, void *arg2, boolean_t byteswap)
+{
+ zvol_state_t *zv = arg1;
+ lr_write_t *lr = arg2;
+ objset_t *os = zv->zv_objset;
+ char *data = (char *)(lr + 1); /* data follows lr_write_t */
+ uint64_t offset, length;
+ dmu_tx_t *tx;
+ int error;
+
+ if (byteswap)
+ byteswap_uint64_array(lr, sizeof (*lr));
+
+ offset = lr->lr_offset;
+ length = lr->lr_length;
+
+ /* If it's a dmu_sync() block, write the whole block */
+ if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) {
+ uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr);
+ if (length < blocksize) {
+ offset -= offset % blocksize;
+ length = blocksize;
+ }
+ }
+
+ tx = dmu_tx_create(os);
+ dmu_tx_hold_write(tx, ZVOL_OBJ, offset, length);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ dmu_tx_abort(tx);
+ } else {
+ dmu_write(os, ZVOL_OBJ, offset, length, data, tx);
+ dmu_tx_commit(tx);
+ }
+
+ return (error);
+}
+
+static int
+zvol_replay_err(void *arg1, void *arg2, boolean_t byteswap)
+{
+ return (SET_ERROR(ENOTSUP));
+}
+
+/*
+ * Callback vectors for replaying records.
+ * Only TX_WRITE and TX_TRUNCATE are needed for zvol.
+ */
+zil_replay_func_t *zvol_replay_vector[TX_MAX_TYPE] = {
+ zvol_replay_err, /* no such transaction type */
+ zvol_replay_err, /* TX_CREATE */
+ zvol_replay_err, /* TX_MKDIR */
+ zvol_replay_err, /* TX_MKXATTR */
+ zvol_replay_err, /* TX_SYMLINK */
+ zvol_replay_err, /* TX_REMOVE */
+ zvol_replay_err, /* TX_RMDIR */
+ zvol_replay_err, /* TX_LINK */
+ zvol_replay_err, /* TX_RENAME */
+ zvol_replay_write, /* TX_WRITE */
+ zvol_replay_truncate, /* TX_TRUNCATE */
+ zvol_replay_err, /* TX_SETATTR */
+ zvol_replay_err, /* TX_ACL */
+ zvol_replay_err, /* TX_CREATE_ATTR */
+ zvol_replay_err, /* TX_CREATE_ACL_ATTR */
+ zvol_replay_err, /* TX_MKDIR_ACL */
+ zvol_replay_err, /* TX_MKDIR_ATTR */
+ zvol_replay_err, /* TX_MKDIR_ACL_ATTR */
+ zvol_replay_err, /* TX_WRITE2 */
+};
+
+/*
+ * zvol_log_write() handles synchronous writes using TX_WRITE ZIL transactions.
+ *
+ * We store data in the log buffers if it's small enough.
+ * Otherwise we will later flush the data out via dmu_sync().
+ */
+ssize_t zvol_immediate_write_sz = 32768;
+
+void
+zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, uint64_t offset,
+ uint64_t size, int sync)
+{
+ uint32_t blocksize = zv->zv_volblocksize;
+ zilog_t *zilog = zv->zv_zilog;
+ itx_wr_state_t write_state;
+
+ if (zil_replaying(zilog, tx))
+ return;
+
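+ /*
+ * Choose how the data reaches the ZIL: WR_INDIRECT syncs the data
+ * via dmu_sync() and logs a block pointer, WR_COPIED embeds the
+ * data in the log record immediately, and WR_NEED_COPY defers the
+ * copy until the log record is committed.
+ */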
+ if (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
+ write_state = WR_INDIRECT;
+ else if (!spa_has_slogs(zilog->zl_spa) &&
+ size >= blocksize && blocksize > zvol_immediate_write_sz)
+ write_state = WR_INDIRECT;
+ else if (sync)
+ write_state = WR_COPIED;
+ else
+ write_state = WR_NEED_COPY;
+
+ while (size) {
+ itx_t *itx;
+ lr_write_t *lr;
+ itx_wr_state_t wr_state = write_state;
+ ssize_t len = size;
+
+ if (wr_state == WR_COPIED && size > zil_max_copied_data(zilog))
+ wr_state = WR_NEED_COPY;
+ else if (wr_state == WR_INDIRECT)
+ len = MIN(blocksize - P2PHASE(offset, blocksize), size);
+
+ itx = zil_itx_create(TX_WRITE, sizeof (*lr) +
+ (wr_state == WR_COPIED ? len : 0));
+ lr = (lr_write_t *)&itx->itx_lr;
+ if (wr_state == WR_COPIED && dmu_read_by_dnode(zv->zv_dn,
+ offset, len, lr+1, DMU_READ_NO_PREFETCH) != 0) {
+ zil_itx_destroy(itx);
+ itx = zil_itx_create(TX_WRITE, sizeof (*lr));
+ lr = (lr_write_t *)&itx->itx_lr;
+ wr_state = WR_NEED_COPY;
+ }
+
+ itx->itx_wr_state = wr_state;
+ lr->lr_foid = ZVOL_OBJ;
+ lr->lr_offset = offset;
+ lr->lr_length = len;
+ lr->lr_blkoff = 0;
+ BP_ZERO(&lr->lr_blkptr);
+
+ itx->itx_private = zv;
+ itx->itx_sync = sync;
+
+ (void) zil_itx_assign(zilog, itx, tx);
+
+ offset += len;
+ size -= len;
+ }
+}
+
+/*
+ * Log a DKIOCFREE/free-long-range to the ZIL with TX_TRUNCATE.
+ */
+void
+zvol_log_truncate(zvol_state_t *zv, dmu_tx_t *tx, uint64_t off, uint64_t len,
+ boolean_t sync)
+{
+ itx_t *itx;
+ lr_truncate_t *lr;
+ zilog_t *zilog = zv->zv_zilog;
+
+ if (zil_replaying(zilog, tx))
+ return;
+
+ itx = zil_itx_create(TX_TRUNCATE, sizeof (*lr));
+ lr = (lr_truncate_t *)&itx->itx_lr;
+ lr->lr_foid = ZVOL_OBJ;
+ lr->lr_offset = off;
+ lr->lr_length = len;
+
+ itx->itx_sync = sync;
+ zil_itx_assign(zilog, itx, tx);
+}
+
+
+/* ARGSUSED */
+static void
+zvol_get_done(zgd_t *zgd, int error)
+{
+ if (zgd->zgd_db)
+ dmu_buf_rele(zgd->zgd_db, zgd);
+
+ zfs_rangelock_exit(zgd->zgd_lr);
+
+ kmem_free(zgd, sizeof (zgd_t));
+}
+
+/*
+ * Get data to generate a TX_WRITE intent log record.
+ */
+int
+zvol_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio)
+{
+ zvol_state_t *zv = arg;
+ uint64_t offset = lr->lr_offset;
+ uint64_t size = lr->lr_length;
+ dmu_buf_t *db;
+ zgd_t *zgd;
+ int error;
+
+ ASSERT3P(lwb, !=, NULL);
+ ASSERT3P(zio, !=, NULL);
+ ASSERT3U(size, !=, 0);
+
+ zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
+ zgd->zgd_lwb = lwb;
+
+ /*
+ * Write records come in two flavors: immediate and indirect.
+ * For small writes it's cheaper to store the data with the
+ * log record (immediate); for large writes it's cheaper to
+ * sync the data and get a pointer to it (indirect) so that
+ * we don't have to write the data twice.
+ */
+ if (buf != NULL) { /* immediate write */
+ zgd->zgd_lr = zfs_rangelock_enter(&zv->zv_rangelock, offset,
+ size, RL_READER);
+ error = dmu_read_by_dnode(zv->zv_dn, offset, size, buf,
+ DMU_READ_NO_PREFETCH);
+ } else { /* indirect write */
+ /*
+ * We have to lock the whole block to ensure that no one can change
+ * the data while it is written out and its checksum is calculated.
+ * Unlike zfs_get_data(), we need not re-check the blocksize after we
+ * get the lock because it cannot be changed.
+ */
+ size = zv->zv_volblocksize;
+ offset = P2ALIGN_TYPED(offset, size, uint64_t);
+ zgd->zgd_lr = zfs_rangelock_enter(&zv->zv_rangelock, offset,
+ size, RL_READER);
+ error = dmu_buf_hold_by_dnode(zv->zv_dn, offset, zgd, &db,
+ DMU_READ_NO_PREFETCH);
+ if (error == 0) {
+ blkptr_t *bp = &lr->lr_blkptr;
+
+ zgd->zgd_db = db;
+ zgd->zgd_bp = bp;
+
+ ASSERT(db != NULL);
+ ASSERT(db->db_offset == offset);
+ ASSERT(db->db_size == size);
+
+ error = dmu_sync(zio, lr->lr_common.lrc_txg,
+ zvol_get_done, zgd);
+
+ if (error == 0)
+ return (0);
+ }
+ }
+
+ zvol_get_done(zgd, error);
+
+ return (SET_ERROR(error));
+}
+
+/*
+ * The zvol_state_t's are inserted into zvol_state_list and zvol_htable.
+ */
+
+void
+zvol_insert(zvol_state_t *zv)
+{
+ ASSERT(RW_WRITE_HELD(&zvol_state_lock));
+ list_insert_head(&zvol_state_list, zv);
+ hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash));
+}
+
+/*
+ * Simply remove the zvol from the list of zvols.
+ */
+static void
+zvol_remove(zvol_state_t *zv)
+{
+ ASSERT(RW_WRITE_HELD(&zvol_state_lock));
+ list_remove(&zvol_state_list, zv);
+ hlist_del(&zv->zv_hlink);
+}
+
+/*
+ * Set up zv after we have just taken ownership of zv->zv_objset.
+ */
+static int
+zvol_setup_zv(zvol_state_t *zv)
+{
+ uint64_t volsize;
+ int error;
+ uint64_t ro;
+ objset_t *os = zv->zv_objset;
+
+ ASSERT(MUTEX_HELD(&zv->zv_state_lock));
+ ASSERT(RW_LOCK_HELD(&zv->zv_suspend_lock));
+
+ zv->zv_zilog = NULL;
+ zv->zv_flags &= ~ZVOL_WRITTEN_TO;
+
+ error = dsl_prop_get_integer(zv->zv_name, "readonly", &ro, NULL);
+ if (error)
+ return (SET_ERROR(error));
+
+ error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
+ if (error)
+ return (SET_ERROR(error));
+
+ error = dnode_hold(os, ZVOL_OBJ, zv, &zv->zv_dn);
+ if (error)
+ return (SET_ERROR(error));
+
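+ /* The platform capacity callback expects the size in 512-byte sectors. */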
+ ops->zv_set_capacity(zv, volsize >> 9);
+ zv->zv_volsize = volsize;
+
+ if (ro || dmu_objset_is_snapshot(os) ||
+ !spa_writeable(dmu_objset_spa(os))) {
+ ops->zv_set_disk_ro(zv, 1);
+ zv->zv_flags |= ZVOL_RDONLY;
+ } else {
+ ops->zv_set_disk_ro(zv, 0);
+ zv->zv_flags &= ~ZVOL_RDONLY;
+ }
+ return (0);
+}
+
+/*
+ * Shut down everything related to zv_objset except zv_objset itself.
+ * This is the reverse of zvol_setup_zv.
+ */
+static void
+zvol_shutdown_zv(zvol_state_t *zv)
+{
+ ASSERT(MUTEX_HELD(&zv->zv_state_lock) &&
+ RW_LOCK_HELD(&zv->zv_suspend_lock));
+
+ if (zv->zv_flags & ZVOL_WRITTEN_TO) {
+ ASSERT(zv->zv_zilog != NULL);
+ zil_close(zv->zv_zilog);
+ }
+
+ zv->zv_zilog = NULL;
+
+ dnode_rele(zv->zv_dn, zv);
+ zv->zv_dn = NULL;
+
+ /*
+ * Evict cached data. We must write out any dirty data before
+ * disowning the dataset.
+ */
+ if (zv->zv_flags & ZVOL_WRITTEN_TO)
+ txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);
+ (void) dmu_objset_evict_dbufs(zv->zv_objset);
+}
+
+/*
+ * return the proper tag for rollback and recv
+ */
+void *
+zvol_tag(zvol_state_t *zv)
+{
+ ASSERT(RW_WRITE_HELD(&zv->zv_suspend_lock));
+ return (zv->zv_open_count > 0 ? zv : NULL);
+}
+
+/*
+ * Suspend the zvol for recv and rollback.
+ */
+zvol_state_t *
+zvol_suspend(const char *name)
+{
+ zvol_state_t *zv;
+
+ zv = zvol_find_by_name(name, RW_WRITER);
+
+ if (zv == NULL)
+ return (NULL);
+
+ /* block all I/O, release in zvol_resume. */
+ ASSERT(MUTEX_HELD(&zv->zv_state_lock));
+ ASSERT(RW_WRITE_HELD(&zv->zv_suspend_lock));
+
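+ /* Take a suspend reference so the zvol is not freed while suspended. */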
+ atomic_inc(&zv->zv_suspend_ref);
+
+ if (zv->zv_open_count > 0)
+ zvol_shutdown_zv(zv);
+
+ /*
+ * do not hold zv_state_lock across suspend/resume to
+ * avoid locking up zvol lookups
+ */
+ mutex_exit(&zv->zv_state_lock);
+
+ /* zv_suspend_lock is released in zvol_resume() */
+ return (zv);
+}
+
+int
+zvol_resume(zvol_state_t *zv)
+{
+ int error = 0;
+
+ ASSERT(RW_WRITE_HELD(&zv->zv_suspend_lock));
+
+ mutex_enter(&zv->zv_state_lock);
+
+ if (zv->zv_open_count > 0) {
+ VERIFY0(dmu_objset_hold(zv->zv_name, zv, &zv->zv_objset));
+ VERIFY3P(zv->zv_objset->os_dsl_dataset->ds_owner, ==, zv);
+ VERIFY(dsl_dataset_long_held(zv->zv_objset->os_dsl_dataset));
+ dmu_objset_rele(zv->zv_objset, zv);
+
+ error = zvol_setup_zv(zv);
+ }
+
+ mutex_exit(&zv->zv_state_lock);
+
+ rw_exit(&zv->zv_suspend_lock);
+ /*
+ * We need this because we don't hold zvol_state_lock while releasing
+ * zv_suspend_lock. zvol_remove_minors_impl thus cannot check
+ * zv_suspend_lock to determine whether it is safe to free the zvol,
+ * because the rwlock is not inherently atomic.
+ */
+ atomic_dec(&zv->zv_suspend_ref);
+
+ return (SET_ERROR(error));
+}
+
+int
+zvol_first_open(zvol_state_t *zv, boolean_t readonly)
+{
+ objset_t *os;
+ int error, locked = 0;
+ boolean_t ro;
+
+ ASSERT(RW_READ_HELD(&zv->zv_suspend_lock));
+ ASSERT(MUTEX_HELD(&zv->zv_state_lock));
+
+ /*
+ * In all other cases the spa_namespace_lock is taken before the
+ * bdev->bd_mutex lock. But in this case the Linux __blkdev_get()
+ * function calls fops->open() with the bdev->bd_mutex lock held.
+ * This deadlock can be easily observed with zvols used as vdevs.
+ *
+ * To avoid a potential lock inversion deadlock we preemptively
+ * try to take the spa_namespace_lock(). Normally it will not
+ * be contended and this is safe because spa_open_common() handles
+ * the case where the caller already holds the spa_namespace_lock.
+ *
+ * When it is contended we risk a lock inversion if we were to
+ * block waiting for the lock. Luckily, the __blkdev_get()
+ * function allows us to return -ERESTARTSYS which will result in
+ * bdev->bd_mutex being dropped, reacquired, and fops->open() being
+ * called again. This process can be repeated safely until both
+ * locks are acquired.
+ */
+ if (!mutex_owned(&spa_namespace_lock)) {
+ locked = mutex_tryenter(&spa_namespace_lock);
+ if (!locked)
+ return (SET_ERROR(EINTR));
+ }
+
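+ /* Snapshots (names containing '@') are always opened read-only. */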
+ ro = (readonly || (strchr(zv->zv_name, '@') != NULL));
+ error = dmu_objset_own(zv->zv_name, DMU_OST_ZVOL, ro, B_TRUE, zv, &os);
+ if (error)
+ goto out_mutex;
+
+ zv->zv_objset = os;
+
+ error = zvol_setup_zv(zv);
+
+ if (error) {
+ dmu_objset_disown(os, 1, zv);
+ zv->zv_objset = NULL;
+ }
+
+out_mutex:
+ if (locked)
+ mutex_exit(&spa_namespace_lock);
+ return (SET_ERROR(error));
+}
+
+void
+zvol_last_close(zvol_state_t *zv)
+{
+ ASSERT(RW_READ_HELD(&zv->zv_suspend_lock));
+ ASSERT(MUTEX_HELD(&zv->zv_state_lock));
+
+ zvol_shutdown_zv(zv);
+
+ dmu_objset_disown(zv->zv_objset, 1, zv);
+ zv->zv_objset = NULL;
+}
+
+typedef struct minors_job {
+ list_t *list;
+ list_node_t link;
+ /* input */
+ char *name;
+ /* output */
+ int error;
+} minors_job_t;
+
+/*
+ * Prefetch zvol dnodes for the minors_job
+ */
+static void
+zvol_prefetch_minors_impl(void *arg)
+{
+ minors_job_t *job = arg;
+ char *dsname = job->name;
+ objset_t *os = NULL;
+
+ job->error = dmu_objset_own(dsname, DMU_OST_ZVOL, B_TRUE, B_TRUE,
+ FTAG, &os);
+ if (job->error == 0) {
+ dmu_prefetch(os, ZVOL_OBJ, 0, 0, 0, ZIO_PRIORITY_SYNC_READ);
+ dmu_objset_disown(os, B_TRUE, FTAG);
+ }
+}
+
+/*
+ * Mask errors to continue dmu_objset_find() traversal
+ */
+static int
+zvol_create_snap_minor_cb(const char *dsname, void *arg)
+{
+ minors_job_t *j = arg;
+ list_t *minors_list = j->list;
+ const char *name = j->name;
+
+ ASSERT0(MUTEX_HELD(&spa_namespace_lock));
+
+ /* skip the designated dataset */
+ if (name && strcmp(dsname, name) == 0)
+ return (0);
+
+ /* at this point, the dsname should name a snapshot */
+ if (strchr(dsname, '@') == 0) {
+ dprintf("zvol_create_snap_minor_cb(): "
+ "%s is not a snapshot name\n", dsname);
+ } else {
+ minors_job_t *job;
+ char *n = kmem_strdup(dsname);
+ if (n == NULL)
+ return (0);
+
+ job = kmem_alloc(sizeof (minors_job_t), KM_SLEEP);
+ job->name = n;
+ job->list = minors_list;
+ job->error = 0;
+ list_insert_tail(minors_list, job);
+ /* don't care if dispatch fails, because job->error is 0 */
+ taskq_dispatch(system_taskq, zvol_prefetch_minors_impl, job,
+ TQ_SLEEP);
+ }
+
+ return (0);
+}
+
+/*
+ * Mask errors to continue dmu_objset_find() traversal
+ */
+static int
+zvol_create_minors_cb(const char *dsname, void *arg)
+{
+ uint64_t snapdev;
+ int error;
+ list_t *minors_list = arg;
+
+ ASSERT0(MUTEX_HELD(&spa_namespace_lock));
+
+ error = dsl_prop_get_integer(dsname, "snapdev", &snapdev, NULL);
+ if (error)
+ return (0);
+
+ /*
+ * Given the name and the 'snapdev' property, create device minor nodes
+ * with the linkages to zvols/snapshots as needed.
+ * If the name represents a zvol, create a minor node for the zvol, then
+ * check if its snapshots are 'visible', and if so, iterate over the
+ * snapshots and create device minor nodes for those.
+ */
+ if (strchr(dsname, '@') == 0) {
+ minors_job_t *job;
+ char *n = kmem_strdup(dsname);
+ if (n == NULL)
+ return (0);
+
+ job = kmem_alloc(sizeof (minors_job_t), KM_SLEEP);
+ job->name = n;
+ job->list = minors_list;
+ job->error = 0;
+ list_insert_tail(minors_list, job);
+ /* don't care if dispatch fails, because job->error is 0 */
+ taskq_dispatch(system_taskq, zvol_prefetch_minors_impl, job,
+ TQ_SLEEP);
+
+ if (snapdev == ZFS_SNAPDEV_VISIBLE) {
+ /*
+ * traverse snapshots only, do not traverse children,
+ * and skip the 'dsname'
+ */
+ error = dmu_objset_find(dsname,
+ zvol_create_snap_minor_cb, (void *)job,
+ DS_FIND_SNAPSHOTS);
+ }
+ } else {
+ dprintf("zvol_create_minors_cb(): %s is not a zvol name\n",
+ dsname);
+ }
+
+ return (0);
+}
+
+/*
+ * Create minors for the specified dataset, including children and snapshots.
+ * Pay attention to the 'snapdev' property and iterate over the snapshots
+ * only if they are 'visible'. This approach ensures that the snapshot
+ * metadata is read from disk only when it is needed.
+ *
+ * The name can represent a dataset to be recursively scanned for zvols and
+ * their snapshots, or a single zvol snapshot. If the name represents a
+ * dataset, the scan is performed in two nested stages:
+ * - scan the dataset for zvols, and
+ * - for each zvol, create a minor node, then check if the zvol's snapshots
+ * are 'visible', and only then iterate over the snapshots if needed
+ *
+ * If the name represents a snapshot, a check is performed if the snapshot is
+ * 'visible' (which also verifies that the parent is a zvol), and if so,
+ * a minor node for that snapshot is created.
+ */
+void
+zvol_create_minors_recursive(const char *name)
+{
+ list_t minors_list;
+ minors_job_t *job;
+
+ if (zvol_inhibit_dev)
+ return;
+
+ /*
+ * This is the list of prefetch jobs. Whenever we find a match during
+ * dmu_objset_find(), we insert a minors_job into the list and dispatch
+ * a taskq job to prefetch the zvol dnodes in parallel. Note that we
+ * don't need any lock because all list operations are done on the
+ * current thread.
+ *
+ * We will use this list to do zvol_create_minor_impl after the prefetch
+ * completes, so we don't have to traverse using dmu_objset_find() again.
+ */
+ list_create(&minors_list, sizeof (minors_job_t),
+ offsetof(minors_job_t, link));
+
+
+ if (strchr(name, '@') != NULL) {
+ uint64_t snapdev;
+
+ int error = dsl_prop_get_integer(name, "snapdev",
+ &snapdev, NULL);
+
+ if (error == 0 && snapdev == ZFS_SNAPDEV_VISIBLE)
+ (void) ops->zv_create_minor(name);
+ } else {
+ fstrans_cookie_t cookie = spl_fstrans_mark();
+ (void) dmu_objset_find(name, zvol_create_minors_cb,
+ &minors_list, DS_FIND_CHILDREN);
+ spl_fstrans_unmark(cookie);
+ }
+
+ taskq_wait_outstanding(system_taskq, 0);
+
+ /*
+ * Prefetch is completed, so we can now create the minor
+ * nodes sequentially.
+ */
+ while ((job = list_head(&minors_list)) != NULL) {
+ list_remove(&minors_list, job);
+ if (!job->error)
+ (void) ops->zv_create_minor(job->name);
+ kmem_strfree(job->name);
+ kmem_free(job, sizeof (minors_job_t));
+ }
+
+ list_destroy(&minors_list);
+}
+
+void
+zvol_create_minor(const char *name)
+{
+ /*
+ * Note: the dsl_pool_config_lock must not be held.
+ * Minor node creation needs to obtain the zvol_state_lock.
+ * zvol_open() obtains the zvol_state_lock and then the dsl pool
+ * config lock. Therefore, we can't have the config lock now if
+ * we are going to wait for the zvol_state_lock, because it
+ * would be a lock order inversion which could lead to deadlock.
+ */
+
+ if (zvol_inhibit_dev)
+ return;
+
+ if (strchr(name, '@') != NULL) {
+ uint64_t snapdev;
+
+ int error = dsl_prop_get_integer(name,
+ "snapdev", &snapdev, NULL);
+
+ if (error == 0 && snapdev == ZFS_SNAPDEV_VISIBLE)
+ (void) ops->zv_create_minor(name);
+ } else {
+ (void) ops->zv_create_minor(name);
+ }
+}
+
+/*
+ * Remove minors for the specified dataset, including children and snapshots.
+ */
+
+void
+zvol_remove_minors_impl(const char *name)
+{
+ zvol_state_t *zv, *zv_next;
+ int namelen = ((name) ? strlen(name) : 0);
+ taskqid_t t;
+ list_t free_list;
+
+ if (zvol_inhibit_dev)
+ return;
+
+ list_create(&free_list, sizeof (zvol_state_t),
+ offsetof(zvol_state_t, zv_next));
+
+ rw_enter(&zvol_state_lock, RW_WRITER);
+
+ for (zv = list_head(&zvol_state_list); zv != NULL; zv = zv_next) {
+ zv_next = list_next(&zvol_state_list, zv);
+
+ mutex_enter(&zv->zv_state_lock);
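+ /*
+ * A NULL name matches every zvol; otherwise match the dataset
+ * itself or any child ('/') or snapshot ('@') beneath it.
+ */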
+ if (name == NULL || strcmp(zv->zv_name, name) == 0 ||
+ (strncmp(zv->zv_name, name, namelen) == 0 &&
+ (zv->zv_name[namelen] == '/' ||
+ zv->zv_name[namelen] == '@'))) {
+ /*
+ * By holding zv_state_lock here, we guarantee that no
+ * one is currently using this zv
+ */
+
+ /* If in use, leave alone */
+ if (zv->zv_open_count > 0 ||
+ atomic_read(&zv->zv_suspend_ref)) {
+ mutex_exit(&zv->zv_state_lock);
+ continue;
+ }
+
+ zvol_remove(zv);
+
+ /*
+ * Cleared while holding zvol_state_lock as a writer
+ * which will prevent zvol_open() from opening it.
+ */
+ ops->zv_clear_private(zv);
+
+ /* Drop zv_state_lock before zvol_free() */
+ mutex_exit(&zv->zv_state_lock);
+
+ /* Try parallel zv_free; if that fails, do it in place */
+ t = taskq_dispatch(system_taskq,
+ (task_func_t *)ops->zv_free, zv, TQ_SLEEP);
+ if (t == TASKQID_INVALID)
+ list_insert_head(&free_list, zv);
+ } else {
+ mutex_exit(&zv->zv_state_lock);
+ }
+ }
+ rw_exit(&zvol_state_lock);
+
+ /* Drop zvol_state_lock before calling zvol_free() */
+ while ((zv = list_head(&free_list)) != NULL) {
+ list_remove(&free_list, zv);
+ ops->zv_free(zv);
+ }
+}
+
+/* Remove minor for this specific volume only */
+static void
+zvol_remove_minor_impl(const char *name)
+{
+ zvol_state_t *zv = NULL, *zv_next;
+
+ if (zvol_inhibit_dev)
+ return;
+
+ rw_enter(&zvol_state_lock, RW_WRITER);
+
+ for (zv = list_head(&zvol_state_list); zv != NULL; zv = zv_next) {
+ zv_next = list_next(&zvol_state_list, zv);
+
+ mutex_enter(&zv->zv_state_lock);
+ if (strcmp(zv->zv_name, name) == 0) {
+ /*
+ * By holding zv_state_lock here, we guarantee that no
+ * one is currently using this zv
+ */
+
+ /* If in use, leave alone */
+ if (zv->zv_open_count > 0 ||
+ atomic_read(&zv->zv_suspend_ref)) {
+ mutex_exit(&zv->zv_state_lock);
+ continue;
+ }
+ zvol_remove(zv);
+
+ ops->zv_clear_private(zv);
+ mutex_exit(&zv->zv_state_lock);
+ break;
+ } else {
+ mutex_exit(&zv->zv_state_lock);
+ }
+ }
+
+ /* Drop zvol_state_lock before calling zvol_free() */
+ rw_exit(&zvol_state_lock);
+
+ if (zv != NULL)
+ ops->zv_free(zv);
+}
+
+/*
+ * Rename minors for the specified dataset, including children and snapshots.
+ */
+static void
+zvol_rename_minors_impl(const char *oldname, const char *newname)
+{
+ zvol_state_t *zv, *zv_next;
+ int oldnamelen, newnamelen;
+
+ if (zvol_inhibit_dev)
+ return;
+
+ oldnamelen = strlen(oldname);
+ newnamelen = strlen(newname);
+
+ rw_enter(&zvol_state_lock, RW_READER);
+
+ for (zv = list_head(&zvol_state_list); zv != NULL; zv = zv_next) {
+ zv_next = list_next(&zvol_state_list, zv);
+
+ mutex_enter(&zv->zv_state_lock);
+
+ if (strcmp(zv->zv_name, oldname) == 0) {
+ ops->zv_rename_minor(zv, newname);
+ } else if (strncmp(zv->zv_name, oldname, oldnamelen) == 0 &&
+ (zv->zv_name[oldnamelen] == '/' ||
+ zv->zv_name[oldnamelen] == '@')) {
+ char *name = kmem_asprintf("%s%c%s", newname,
+ zv->zv_name[oldnamelen],
+ zv->zv_name + oldnamelen + 1);
+ ops->zv_rename_minor(zv, name);
+ kmem_strfree(name);
+ }
+
+ mutex_exit(&zv->zv_state_lock);
+ }
+
+ rw_exit(&zvol_state_lock);
+}
+
+typedef struct zvol_snapdev_cb_arg {
+ uint64_t snapdev;
+} zvol_snapdev_cb_arg_t;
+
+static int
+zvol_set_snapdev_cb(const char *dsname, void *param)
+{
+ zvol_snapdev_cb_arg_t *arg = param;
+
+ if (strchr(dsname, '@') == NULL)
+ return (0);
+
+ switch (arg->snapdev) {
+ case ZFS_SNAPDEV_VISIBLE:
+ (void) ops->zv_create_minor(dsname);
+ break;
+ case ZFS_SNAPDEV_HIDDEN:
+ (void) zvol_remove_minor_impl(dsname);
+ break;
+ }
+
+ return (0);
+}
+
+static void
+zvol_set_snapdev_impl(char *name, uint64_t snapdev)
+{
+ zvol_snapdev_cb_arg_t arg = {snapdev};
+ fstrans_cookie_t cookie = spl_fstrans_mark();
+ /*
+ * The zvol_set_snapdev_sync() sets snapdev appropriately
+ * in the dataset hierarchy. Here, we only scan snapshots.
+ */
+ dmu_objset_find(name, zvol_set_snapdev_cb, &arg, DS_FIND_SNAPSHOTS);
+ spl_fstrans_unmark(cookie);
+}
+
+typedef struct zvol_volmode_cb_arg {
+ uint64_t volmode;
+} zvol_volmode_cb_arg_t;
+
+static void
+zvol_set_volmode_impl(char *name, uint64_t volmode)
+{
+ fstrans_cookie_t cookie;
+ uint64_t old_volmode;
+ zvol_state_t *zv;
+
+ if (strchr(name, '@') != NULL)
+ return;
+
+ /*
+ * It's unfortunate we need to remove minors before we create new ones:
+ * this is necessary because our backing gendisk (zvol_state->zv_disk)
+ * could be different when we set, for instance, volmode from "geom"
+ * to "dev" (or vice versa).
+ */
+ zv = zvol_find_by_name(name, RW_NONE);
+ if (zv == NULL && volmode == ZFS_VOLMODE_NONE)
+ return;
+ if (zv != NULL) {
+ old_volmode = zv->zv_volmode;
+ mutex_exit(&zv->zv_state_lock);
+ if (old_volmode == volmode)
+ return;
+ zvol_wait_close(zv);
+ }
+ cookie = spl_fstrans_mark();
+ switch (volmode) {
+ case ZFS_VOLMODE_NONE:
+ (void) zvol_remove_minor_impl(name);
+ break;
+ case ZFS_VOLMODE_GEOM:
+ case ZFS_VOLMODE_DEV:
+ (void) zvol_remove_minor_impl(name);
+ (void) ops->zv_create_minor(name);
+ break;
+ case ZFS_VOLMODE_DEFAULT:
+ (void) zvol_remove_minor_impl(name);
+ if (zvol_volmode == ZFS_VOLMODE_NONE)
+ break;
+ else /* if zvol_volmode is invalid defaults to "geom" */
+ (void) ops->zv_create_minor(name);
+ break;
+ }
+ spl_fstrans_unmark(cookie);
+}
+
+static zvol_task_t *
+zvol_task_alloc(zvol_async_op_t op, const char *name1, const char *name2,
+ uint64_t value)
+{
+ zvol_task_t *task;
+ char *delim;
+
+ /* Never allow tasks on hidden names. */
+ if (name1[0] == '$')
+ return (NULL);
+
+ task = kmem_zalloc(sizeof (zvol_task_t), KM_SLEEP);
+ task->op = op;
+ task->value = value;
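+ /* The pool name is the portion of name1 before the first '/'. */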
+ delim = strchr(name1, '/');
+ strlcpy(task->pool, name1, delim ? (delim - name1 + 1) : MAXNAMELEN);
+
+ strlcpy(task->name1, name1, MAXNAMELEN);
+ if (name2 != NULL)
+ strlcpy(task->name2, name2, MAXNAMELEN);
+
+ return (task);
+}
+
+static void
+zvol_task_free(zvol_task_t *task)
+{
+ kmem_free(task, sizeof (zvol_task_t));
+}
+
+/*
+ * The worker function, executed asynchronously on the spa_zvol_taskq.
+ */
+static void
+zvol_task_cb(void *arg)
+{
+ zvol_task_t *task = arg;
+
+ switch (task->op) {
+ case ZVOL_ASYNC_REMOVE_MINORS:
+ zvol_remove_minors_impl(task->name1);
+ break;
+ case ZVOL_ASYNC_RENAME_MINORS:
+ zvol_rename_minors_impl(task->name1, task->name2);
+ break;
+ case ZVOL_ASYNC_SET_SNAPDEV:
+ zvol_set_snapdev_impl(task->name1, task->value);
+ break;
+ case ZVOL_ASYNC_SET_VOLMODE:
+ zvol_set_volmode_impl(task->name1, task->value);
+ break;
+ default:
+ VERIFY(0);
+ break;
+ }
+
+ zvol_task_free(task);
+}
+
+typedef struct zvol_set_prop_int_arg {
+ const char *zsda_name;
+ uint64_t zsda_value;
+ zprop_source_t zsda_source;
+ dmu_tx_t *zsda_tx;
+} zvol_set_prop_int_arg_t;
+
+/*
+ * Sanity check the dataset for safe use by the sync task. No additional
+ * conditions are imposed.
+ */
+static int
+zvol_set_snapdev_check(void *arg, dmu_tx_t *tx)
+{
+ zvol_set_prop_int_arg_t *zsda = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ dsl_dir_t *dd;
+ int error;
+
+ error = dsl_dir_hold(dp, zsda->zsda_name, FTAG, &dd, NULL);
+ if (error != 0)
+ return (error);
+
+ dsl_dir_rele(dd, FTAG);
+
+ return (error);
+}
+
+/* ARGSUSED */
+static int
+zvol_set_snapdev_sync_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
+{
+ char dsname[MAXNAMELEN];
+ zvol_task_t *task;
+ uint64_t snapdev;
+
+ dsl_dataset_name(ds, dsname);
+ if (dsl_prop_get_int_ds(ds, "snapdev", &snapdev) != 0)
+ return (0);
+ task = zvol_task_alloc(ZVOL_ASYNC_SET_SNAPDEV, dsname, NULL, snapdev);
+ if (task == NULL)
+ return (0);
+
+ (void) taskq_dispatch(dp->dp_spa->spa_zvol_taskq, zvol_task_cb,
+ task, TQ_SLEEP);
+ return (0);
+}
+
+/*
+ * Traverse all child datasets and apply snapdev appropriately.
+ * We call dsl_prop_set_sync_impl() here to set the value only on the toplevel
+ * dataset and read the effective "snapdev" on every child in the callback
+ * function: this is because the value is not guaranteed to be the same in the
+ * whole dataset hierarchy.
+ */
+static void
+zvol_set_snapdev_sync(void *arg, dmu_tx_t *tx)
+{
+ zvol_set_prop_int_arg_t *zsda = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ dsl_dir_t *dd;
+ dsl_dataset_t *ds;
+ int error;
+
+ VERIFY0(dsl_dir_hold(dp, zsda->zsda_name, FTAG, &dd, NULL));
+ zsda->zsda_tx = tx;
+
+ error = dsl_dataset_hold(dp, zsda->zsda_name, FTAG, &ds);
+ if (error == 0) {
+ dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_SNAPDEV),
+ zsda->zsda_source, sizeof (zsda->zsda_value), 1,
+ &zsda->zsda_value, zsda->zsda_tx);
+ dsl_dataset_rele(ds, FTAG);
+ }
+ dmu_objset_find_dp(dp, dd->dd_object, zvol_set_snapdev_sync_cb,
+ zsda, DS_FIND_CHILDREN);
+
+ dsl_dir_rele(dd, FTAG);
+}
+
+int
+zvol_set_snapdev(const char *ddname, zprop_source_t source, uint64_t snapdev)
+{
+ zvol_set_prop_int_arg_t zsda;
+
+ zsda.zsda_name = ddname;
+ zsda.zsda_source = source;
+ zsda.zsda_value = snapdev;
+
+ return (dsl_sync_task(ddname, zvol_set_snapdev_check,
+ zvol_set_snapdev_sync, &zsda, 0, ZFS_SPACE_CHECK_NONE));
+}
+
+/*
+ * Sanity check the dataset for safe use by the sync task. No additional
+ * conditions are imposed.
+ */
+static int
+zvol_set_volmode_check(void *arg, dmu_tx_t *tx)
+{
+ zvol_set_prop_int_arg_t *zsda = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ dsl_dir_t *dd;
+ int error;
+
+ error = dsl_dir_hold(dp, zsda->zsda_name, FTAG, &dd, NULL);
+ if (error != 0)
+ return (error);
+
+ dsl_dir_rele(dd, FTAG);
+
+ return (error);
+}
+
+/* ARGSUSED */
+static int
+zvol_set_volmode_sync_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
+{
+ char dsname[MAXNAMELEN];
+ zvol_task_t *task;
+ uint64_t volmode;
+
+ dsl_dataset_name(ds, dsname);
+ if (dsl_prop_get_int_ds(ds, "volmode", &volmode) != 0)
+ return (0);
+ task = zvol_task_alloc(ZVOL_ASYNC_SET_VOLMODE, dsname, NULL, volmode);
+ if (task == NULL)
+ return (0);
+
+ (void) taskq_dispatch(dp->dp_spa->spa_zvol_taskq, zvol_task_cb,
+ task, TQ_SLEEP);
+ return (0);
+}
+
+/*
+ * Traverse all child datasets and apply volmode appropriately.
+ * We call dsl_prop_set_sync_impl() here to set the value only on the toplevel
+ * dataset and read the effective "volmode" on every child in the callback
+ * function: this is because the value is not guaranteed to be the same in the
+ * whole dataset hierarchy.
+ */
+static void
+zvol_set_volmode_sync(void *arg, dmu_tx_t *tx)
+{
+ zvol_set_prop_int_arg_t *zsda = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ dsl_dir_t *dd;
+ dsl_dataset_t *ds;
+ int error;
+
+ VERIFY0(dsl_dir_hold(dp, zsda->zsda_name, FTAG, &dd, NULL));
+ zsda->zsda_tx = tx;
+
+ error = dsl_dataset_hold(dp, zsda->zsda_name, FTAG, &ds);
+ if (error == 0) {
+ dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_VOLMODE),
+ zsda->zsda_source, sizeof (zsda->zsda_value), 1,
+ &zsda->zsda_value, zsda->zsda_tx);
+ dsl_dataset_rele(ds, FTAG);
+ }
+
+ dmu_objset_find_dp(dp, dd->dd_object, zvol_set_volmode_sync_cb,
+ zsda, DS_FIND_CHILDREN);
+
+ dsl_dir_rele(dd, FTAG);
+}
+
+int
+zvol_set_volmode(const char *ddname, zprop_source_t source, uint64_t volmode)
+{
+ zvol_set_prop_int_arg_t zsda;
+
+ zsda.zsda_name = ddname;
+ zsda.zsda_source = source;
+ zsda.zsda_value = volmode;
+
+ return (dsl_sync_task(ddname, zvol_set_volmode_check,
+ zvol_set_volmode_sync, &zsda, 0, ZFS_SPACE_CHECK_NONE));
+}
+
+void
+zvol_remove_minors(spa_t *spa, const char *name, boolean_t async)
+{
+ zvol_task_t *task;
+ taskqid_t id;
+
+ task = zvol_task_alloc(ZVOL_ASYNC_REMOVE_MINORS, name, NULL, ~0ULL);
+ if (task == NULL)
+ return;
+
+ id = taskq_dispatch(spa->spa_zvol_taskq, zvol_task_cb, task, TQ_SLEEP);
+ if ((async == B_FALSE) && (id != TASKQID_INVALID))
+ taskq_wait_id(spa->spa_zvol_taskq, id);
+}
+
+void
+zvol_rename_minors(spa_t *spa, const char *name1, const char *name2,
+ boolean_t async)
+{
+ zvol_task_t *task;
+ taskqid_t id;
+
+ task = zvol_task_alloc(ZVOL_ASYNC_RENAME_MINORS, name1, name2, ~0ULL);
+ if (task == NULL)
+ return;
+
+ id = taskq_dispatch(spa->spa_zvol_taskq, zvol_task_cb, task, TQ_SLEEP);
+ if ((async == B_FALSE) && (id != TASKQID_INVALID))
+ taskq_wait_id(spa->spa_zvol_taskq, id);
+}
+
+boolean_t
+zvol_is_zvol(const char *name)
+{
+ return (ops->zv_is_zvol(name));
+}
+
+void
+zvol_register_ops(const zvol_platform_ops_t *zvol_ops)
+{
+ ops = zvol_ops;
+}
+
+int
+zvol_init_impl(void)
+{
+ int i;
+
+ list_create(&zvol_state_list, sizeof (zvol_state_t),
+ offsetof(zvol_state_t, zv_next));
+ rw_init(&zvol_state_lock, NULL, RW_DEFAULT, NULL);
+
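+ /* Hash table used to look up zvols by name (see zvol_find_by_name_hash). */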
+ zvol_htable = kmem_alloc(ZVOL_HT_SIZE * sizeof (struct hlist_head),
+ KM_SLEEP);
+ for (i = 0; i < ZVOL_HT_SIZE; i++)
+ INIT_HLIST_HEAD(&zvol_htable[i]);
+
+ return (0);
+}
+
+void
+zvol_fini_impl(void)
+{
+ zvol_remove_minors_impl(NULL);
+
+ /*
+ * The call to "zvol_remove_minors_impl" may dispatch entries to
+ * the system_taskq, but it doesn't wait for those entries to
+ * complete before it returns. Thus, we must wait for all of the
+ * removals to finish before we can continue.
+ */
+ taskq_wait_outstanding(system_taskq, 0);
+
+ kmem_free(zvol_htable, ZVOL_HT_SIZE * sizeof (struct hlist_head));
+ list_destroy(&zvol_state_list);
+ rw_destroy(&zvol_state_lock);
+}
diff --git a/sys/contrib/openzfs/module/zstd/Makefile.in b/sys/contrib/openzfs/module/zstd/Makefile.in
new file mode 100644
index 000000000000..f67db710f097
--- /dev/null
+++ b/sys/contrib/openzfs/module/zstd/Makefile.in
@@ -0,0 +1,38 @@
+ifneq ($(KBUILD_EXTMOD),)
+src = @abs_srcdir@
+obj = @abs_builddir@
+zstd_include = $(src)/include
+else
+zstd_include = $(srctree)/$(src)/include
+endif
+
+MODULE := zzstd
+
+obj-$(CONFIG_ZFS) := $(MODULE).o
+
+asflags-y := -I$(zstd_include)
+ccflags-y := -I$(zstd_include)
+
+# Zstd uses -O3 by default, so we should follow
+ccflags-y += -O3
+
+# -fno-tree-vectorize gets set for gcc in zstd/common/compiler.h
+# Set it for other compilers, too.
+$(obj)/lib/zstd.o: c_flags += -fno-tree-vectorize
+
+# SSE register return with SSE disabled if -march=znverX is passed
+$(obj)/lib/zstd.o: c_flags += -U__BMI__
+
+# Quiet warnings about frame size due to unused code in unmodified zstd lib
+$(obj)/lib/zstd.o: c_flags += -Wframe-larger-than=20480
+
+# Disable aarch64 neon SIMD instructions for kernel mode
+$(obj)/lib/zstd.o: c_flags += -include $(zstd_include)/aarch64_compat.h -include $(zstd_include)/zstd_compat_wrapper.h -Wp,-w
+
+$(obj)/zfs_zstd.o: c_flags += -include $(zstd_include)/zstd_compat_wrapper.h
+
+$(MODULE)-objs += zfs_zstd.o
+$(MODULE)-objs += lib/zstd.o
+
+all:
+ mkdir -p lib
diff --git a/sys/contrib/openzfs/module/zstd/README.md b/sys/contrib/openzfs/module/zstd/README.md
new file mode 100644
index 000000000000..f8e127736aac
--- /dev/null
+++ b/sys/contrib/openzfs/module/zstd/README.md
@@ -0,0 +1,65 @@
+# ZSTD-On-ZFS Library Manual
+
+## Introduction
+
+This subtree contains the ZSTD library used in ZFS. It is heavily cut down by
+dropping any unneeded files, and combined into a single file, but otherwise is
+intentionally unmodified. Please do not alter the file containing the zstd
+library, besides upgrading to a newer ZSTD release.
+
+Tree structure:
+
+* `zfs_zstd.c` is the actual `zzstd` kernel module.
+* `lib/` contains the unmodified, [_"amalgamated"_](https://github.com/facebook/zstd/blob/dev/contrib/single_file_libs/README.md)
+ version of the `Zstandard` library, generated from our template file.
+* `zstd-in.c` is our template file for generating the library.
+* `include/`: This directory contains supplemental includes for platform
+ compatibility, which are not expected to be used by ZFS elsewhere in the
+ future. Thus we keep them private to ZSTD.
+
+## Updating ZSTD
+
+To update ZSTD the following steps need to be taken:
+
+1. Grab the latest release of [ZSTD](https://github.com/facebook/zstd/releases).
+2. Update `module/zstd/zstd-in.c` if required. (see
+ `zstd/contrib/single_file_libs/zstd-in.c` in the zstd repository)
+3. Generate the "single-file-library" and put it in `module/zstd/lib/`.
+4. Copy the following files to `module/zstd/lib/`:
+ - `zstd/lib/zstd.h`
+ - `zstd/lib/common/zstd_errors.h`
+
+This can be done using a few shell commands from inside the zfs repo:
+
+~~~sh
+cd PATH/TO/ZFS
+
+url="https://github.com/facebook/zstd"
+release="$(curl -s "${url}"/releases/latest | grep -oP '(?<=v)[\d\.]+')"
+zstd="/tmp/zstd-${release}/"
+
+wget -O /tmp/zstd.tar.gz \
+ "${url}/releases/download/v${release}/zstd-${release}.tar.gz"
+tar -C /tmp -xzf /tmp/zstd.tar.gz
+
+cp ${zstd}/lib/zstd.h module/zstd/lib/
+cp ${zstd}/lib/common/zstd_errors.h module/zstd/lib/
+${zstd}/contrib/single_file_libs/combine.sh \
+ -r ${zstd}/lib -o module/zstd/lib/zstd.c module/zstd/zstd-in.c
+~~~
+
+Note: if the zstd library for zfs is updated to a newer version, the macro
+list in `include/zstd_compat_wrapper.h` usually needs to be updated. This can
+be done with some hand crafting of the output of the following script:
+`nm zstd.o | awk '{print "#define "$3 " zfs_" $3}' > macrotable`
+
+
+## Altering ZSTD and breaking changes
+
+If ZSTD makes changes that break compatibility, or you need to make breaking
+changes to the way we handle ZSTD, backwards compatibility must be
+maintained.
+
+We already save the ZSTD version number within the block header so that it
+can be used for future compatibility checks and/or fixes. However, it is not
+currently used in such a way.
diff --git a/sys/contrib/openzfs/module/zstd/include/aarch64_compat.h b/sys/contrib/openzfs/module/zstd/include/aarch64_compat.h
new file mode 100644
index 000000000000..088517d3d23b
--- /dev/null
+++ b/sys/contrib/openzfs/module/zstd/include/aarch64_compat.h
@@ -0,0 +1,37 @@
+/*
+ * BSD 3-Clause New License (https://spdx.org/licenses/BSD-3-Clause.html)
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * Copyright (c) 2018-2020, Sebastian Gottschall
+ */
+
+#ifdef _KERNEL
+#undef __aarch64__
+#endif
diff --git a/sys/contrib/openzfs/module/zstd/include/limits.h b/sys/contrib/openzfs/module/zstd/include/limits.h
new file mode 100644
index 000000000000..3bf5b67765ae
--- /dev/null
+++ b/sys/contrib/openzfs/module/zstd/include/limits.h
@@ -0,0 +1,63 @@
+/*
+ * BSD 3-Clause New License (https://spdx.org/licenses/BSD-3-Clause.html)
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * Copyright (c) 2014-2019, Allan Jude
+ * Copyright (c) 2020, Brian Behlendorf
+ * Copyright (c) 2020, Michael Niewöhner
+ */
+
+#ifndef _ZSTD_LIMITS_H
+#define _ZSTD_LIMITS_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef _KERNEL
+
+#if defined(__FreeBSD__)
+#include <sys/limits.h>
+#elif defined(__linux__)
+#include <linux/limits.h>
+#include <linux/kernel.h>
+#else
+#error "Unsupported platform"
+#endif
+
+#else /* !_KERNEL */
+#include_next <limits.h>
+#endif /* _KERNEL */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _ZSTD_LIMITS_H */
diff --git a/sys/contrib/openzfs/module/zstd/include/stddef.h b/sys/contrib/openzfs/module/zstd/include/stddef.h
new file mode 100644
index 000000000000..3f46fb8b033e
--- /dev/null
+++ b/sys/contrib/openzfs/module/zstd/include/stddef.h
@@ -0,0 +1,62 @@
+/*
+ * BSD 3-Clause New License (https://spdx.org/licenses/BSD-3-Clause.html)
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * Copyright (c) 2014-2019, Allan Jude
+ * Copyright (c) 2020, Brian Behlendorf
+ * Copyright (c) 2020, Michael Niewöhner
+ */
+
+#ifndef _ZSTD_STDDEF_H
+#define _ZSTD_STDDEF_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef _KERNEL
+
+#if defined(__FreeBSD__)
+#include <sys/types.h>
+#elif defined(__linux__)
+#include <linux/types.h>
+#else
+#error "Unsupported platform"
+#endif
+
+#else /* !_KERNEL */
+#include_next <stddef.h>
+#endif /* _KERNEL */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _ZSTD_STDDEF_H */
diff --git a/sys/contrib/openzfs/module/zstd/include/stdint.h b/sys/contrib/openzfs/module/zstd/include/stdint.h
new file mode 100644
index 000000000000..2d98a556c23e
--- /dev/null
+++ b/sys/contrib/openzfs/module/zstd/include/stdint.h
@@ -0,0 +1,62 @@
+/*
+ * BSD 3-Clause New License (https://spdx.org/licenses/BSD-3-Clause.html)
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * Copyright (c) 2014-2019, Allan Jude
+ * Copyright (c) 2020, Brian Behlendorf
+ * Copyright (c) 2020, Michael Niewöhner
+ */
+
+#ifndef _ZSTD_STDINT_H
+#define _ZSTD_STDINT_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef _KERNEL
+
+#if defined(__FreeBSD__)
+#include <sys/stdint.h>
+#elif defined(__linux__)
+#include <linux/types.h>
+#else
+#error "Unsupported platform"
+#endif
+
+#else /* !_KERNEL */
+#include_next <stdint.h>
+#endif /* _KERNEL */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _ZSTD_STDINT_H */
diff --git a/sys/contrib/openzfs/module/zstd/include/stdio.h b/sys/contrib/openzfs/module/zstd/include/stdio.h
new file mode 100644
index 000000000000..5a7c6ec69916
--- /dev/null
+++ b/sys/contrib/openzfs/module/zstd/include/stdio.h
@@ -0,0 +1,54 @@
+/*
+ * BSD 3-Clause New License (https://spdx.org/licenses/BSD-3-Clause.html)
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * Copyright (c) 2014-2019, Allan Jude
+ * Copyright (c) 2020, Brian Behlendorf
+ * Copyright (c) 2020, Michael Niewöhner
+ */
+
+#ifndef _ZSTD_STDIO_H
+#define _ZSTD_STDIO_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef _KERNEL
+
+#include_next <stdio.h>
+
+#endif /* _KERNEL */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _ZSTD_STDIO_H */
diff --git a/sys/contrib/openzfs/module/zstd/include/stdlib.h b/sys/contrib/openzfs/module/zstd/include/stdlib.h
new file mode 100644
index 000000000000..c341a0c84884
--- /dev/null
+++ b/sys/contrib/openzfs/module/zstd/include/stdlib.h
@@ -0,0 +1,58 @@
+/*
+ * BSD 3-Clause New License (https://spdx.org/licenses/BSD-3-Clause.html)
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * Copyright (c) 2014-2019, Allan Jude
+ * Copyright (c) 2020, Brian Behlendorf
+ * Copyright (c) 2020, Michael Niewöhner
+ */
+
+#ifndef _ZSTD_STDLIB_H
+#define _ZSTD_STDLIB_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#undef GCC_VERSION
+
+/*
+ * Define calloc, malloc, free to make building work. They are never really used
+ * in zstdlib.c since allocation is done in zstd.c.
+ */
+#define calloc(n, sz) NULL
+#define malloc(sz) NULL
+#define free(ptr)
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _ZSTD_STDLIB_H */
diff --git a/sys/contrib/openzfs/module/zstd/include/string.h b/sys/contrib/openzfs/module/zstd/include/string.h
new file mode 100644
index 000000000000..78998d3c4655
--- /dev/null
+++ b/sys/contrib/openzfs/module/zstd/include/string.h
@@ -0,0 +1,62 @@
+/*
+ * BSD 3-Clause New License (https://spdx.org/licenses/BSD-3-Clause.html)
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * Copyright (c) 2014-2019, Allan Jude
+ * Copyright (c) 2020, Brian Behlendorf
+ * Copyright (c) 2020, Michael Niewöhner
+ */
+
+#ifndef _ZSTD_STRING_H
+#define _ZSTD_STRING_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef _KERNEL
+
+#if defined(__FreeBSD__)
+#include <sys/systm.h> /* memcpy, memset */
+#elif defined(__linux__)
+#include <linux/string.h> /* memcpy, memset */
+#else
+#error "Unsupported platform"
+#endif
+
+#else /* !_KERNEL */
+#include_next <string.h>
+#endif /* _KERNEL */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _ZSTD_STRING_H */
diff --git a/sys/contrib/openzfs/module/zstd/include/zstd_compat_wrapper.h b/sys/contrib/openzfs/module/zstd/include/zstd_compat_wrapper.h
new file mode 100644
index 000000000000..5cca517b5508
--- /dev/null
+++ b/sys/contrib/openzfs/module/zstd/include/zstd_compat_wrapper.h
@@ -0,0 +1,460 @@
+/*
+ * BSD 3-Clause New License (https://spdx.org/licenses/BSD-3-Clause.html)
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * Copyright (c) 2020, Sebastian Gottschall
+ */
+
+/*
+ * This wrapper works around a problem that arises when the ZFS filesystem
+ * driver is compiled statically into the kernel: the build would otherwise
+ * collide with the symbols of the older in-kernel zstd library. The macros
+ * below simply rename all local zstd symbols and references with a zfs_
+ * prefix (a brief usage sketch follows the macro table below).
+ *
+ * Note: whenever the zstd library bundled with zfs is updated to a newer
+ * version, this macro list usually needs to be regenerated. That can be done
+ * with some hand-crafting of the output of the following script:
+ * nm zstd.o | awk '{print "#define "$3 " zfs_" $3}' > macrotable
+ */
+
+#define BIT_initDStream zfs_BIT_initDStream
+#define BIT_mask zfs_BIT_mask
+#define BIT_reloadDStream zfs_BIT_reloadDStream
+#define ERR_getErrorString zfs_ERR_getErrorString
+#define FSE_NCountWriteBound zfs_FSE_NCountWriteBound
+#define FSE_buildCTable zfs_FSE_buildCTable
+#define FSE_buildCTable_raw zfs_FSE_buildCTable_raw
+#define FSE_buildCTable_rle zfs_FSE_buildCTable_rle
+#define FSE_buildCTable_wksp zfs_FSE_buildCTable_wksp
+#define FSE_buildDTable zfs_FSE_buildDTable
+#define FSE_buildDTable_raw zfs_FSE_buildDTable_raw
+#define FSE_buildDTable_rle zfs_FSE_buildDTable_rle
+#define FSE_compress zfs_FSE_compress
+#define FSE_compress2 zfs_FSE_compress2
+#define FSE_compressBound zfs_FSE_compressBound
+#define FSE_compress_usingCTable zfs_FSE_compress_usingCTable
+#define FSE_compress_usingCTable_generic zfs_FSE_compress_usingCTable_generic
+#define FSE_compress_wksp zfs_FSE_compress_wksp
+#define FSE_createCTable zfs_FSE_createCTable
+#define FSE_createDTable zfs_FSE_createDTable
+#define FSE_decompress zfs_FSE_decompress
+#define FSE_decompress_usingDTable zfs_FSE_decompress_usingDTable
+#define FSE_decompress_wksp zfs_FSE_decompress_wksp
+#define FSE_freeCTable zfs_FSE_freeCTable
+#define FSE_freeDTable zfs_FSE_freeDTable
+#define FSE_getErrorName zfs_FSE_getErrorName
+#define FSE_normalizeCount zfs_FSE_normalizeCount
+#define FSE_optimalTableLog zfs_FSE_optimalTableLog
+#define FSE_optimalTableLog_internal zfs_FSE_optimalTableLog_internal
+#define FSE_readNCount zfs_FSE_readNCount
+#define FSE_versionNumber zfs_FSE_versionNumber
+#define FSE_writeNCount zfs_FSE_writeNCount
+#define HIST_count zfs_HIST_count
+#define HIST_countFast zfs_HIST_countFast
+#define HIST_countFast_wksp zfs_HIST_countFast_wksp
+#define HIST_count_parallel_wksp zfs_HIST_count_parallel_wksp
+#define HIST_count_simple zfs_HIST_count_simple
+#define HIST_count_wksp zfs_HIST_count_wksp
+#define HUF_buildCTable zfs_HUF_buildCTable
+#define HUF_buildCTable_wksp zfs_HUF_buildCTable_wksp
+#define HUF_compress zfs_HUF_compress
+#define HUF_compress1X zfs_HUF_compress1X
+#define HUF_compress1X_repeat zfs_HUF_compress1X_repeat
+#define HUF_compress1X_usingCTable zfs_HUF_compress1X_usingCTable
+#define HUF_compress1X_wksp zfs_HUF_compress1X_wksp
+#define HUF_compress2 zfs_HUF_compress2
+#define HUF_compress4X_repeat zfs_HUF_compress4X_repeat
+#define HUF_compress4X_usingCTable zfs_HUF_compress4X_usingCTable
+#define HUF_compress4X_wksp zfs_HUF_compress4X_wksp
+#define HUF_compressBound zfs_HUF_compressBound
+#define HUF_compressWeights zfs_HUF_compressWeights
+#define HUF_decompress zfs_HUF_decompress
+#define HUF_decompress1X1 zfs_HUF_decompress1X1
+#define HUF_decompress1X1_DCtx zfs_HUF_decompress1X1_DCtx
+#define HUF_decompress1X1_DCtx_wksp zfs_HUF_decompress1X1_DCtx_wksp
+#define HUF_decompress1X1_DCtx_wksp_bmi2 zfs_HUF_decompress1X1_DCtx_wksp_bmi2
+#define HUF_decompress1X1_usingDTable zfs_HUF_decompress1X1_usingDTable
+#define HUF_decompress1X2 zfs_HUF_decompress1X2
+#define HUF_decompress1X2_DCtx zfs_HUF_decompress1X2_DCtx
+#define HUF_decompress1X2_DCtx_wksp zfs_HUF_decompress1X2_DCtx_wksp
+#define HUF_decompress1X2_usingDTable zfs_HUF_decompress1X2_usingDTable
+#define HUF_decompress1X_DCtx zfs_HUF_decompress1X_DCtx
+#define HUF_decompress1X_DCtx_wksp zfs_HUF_decompress1X_DCtx_wksp
+#define HUF_decompress1X_usingDTable zfs_HUF_decompress1X_usingDTable
+#define HUF_decompress1X_usingDTable_bmi2 zfs_HUF_decompress1X_usingDTable_bmi2
+#define HUF_decompress4X1 zfs_HUF_decompress4X1
+#define HUF_decompress4X1_DCtx zfs_HUF_decompress4X1_DCtx
+#define HUF_decompress4X1_DCtx_wksp zfs_HUF_decompress4X1_DCtx_wksp
+#define HUF_decompress4X1_usingDTable zfs_HUF_decompress4X1_usingDTable
+#define HUF_decompress4X2 zfs_HUF_decompress4X2
+#define HUF_decompress4X2_DCtx zfs_HUF_decompress4X2_DCtx
+#define HUF_decompress4X2_DCtx_wksp zfs_HUF_decompress4X2_DCtx_wksp
+#define HUF_decompress4X2_usingDTable zfs_HUF_decompress4X2_usingDTable
+#define HUF_decompress4X_DCtx zfs_HUF_decompress4X_DCtx
+#define HUF_decompress4X_hufOnly zfs_HUF_decompress4X_hufOnly
+#define HUF_decompress4X_hufOnly_wksp zfs_HUF_decompress4X_hufOnly_wksp
+#define HUF_decompress4X_hufOnly_wksp_bmi2 \
+ zfs_HUF_decompress4X_hufOnly_wksp_bmi2
+#define HUF_decompress4X_usingDTable zfs_HUF_decompress4X_usingDTable
+#define HUF_decompress4X_usingDTable_bmi2 zfs_HUF_decompress4X_usingDTable_bmi2
+#define HUF_estimateCompressedSize zfs_HUF_estimateCompressedSize
+#define HUF_fillDTableX2Level2 zfs_HUF_fillDTableX2Level2
+#define HUF_getErrorName zfs_HUF_getErrorName
+#define HUF_getNbBits zfs_HUF_getNbBits
+#define HUF_optimalTableLog zfs_HUF_optimalTableLog
+#define HUF_readCTable zfs_HUF_readCTable
+#define HUF_readDTableX1 zfs_HUF_readDTableX1
+#define HUF_readDTableX1_wksp zfs_HUF_readDTableX1_wksp
+#define HUF_readDTableX2 zfs_HUF_readDTableX2
+#define HUF_readDTableX2_wksp zfs_HUF_readDTableX2_wksp
+#define HUF_readStats zfs_HUF_readStats
+#define HUF_selectDecoder zfs_HUF_selectDecoder
+#define HUF_setMaxHeight zfs_HUF_setMaxHeight
+#define HUF_validateCTable zfs_HUF_validateCTable
+#define HUF_writeCTable zfs_HUF_writeCTable
+#define LL_base zfs_LL_base
+#define LL_bits zfs_LL_bits
+#define LL_defaultDTable zfs_LL_defaultDTable
+#define LL_defaultNorm zfs_LL_defaultNorm
+#define ML_base zfs_ML_base
+#define ML_bits zfs_ML_bits
+#define ML_defaultDTable zfs_ML_defaultDTable
+#define ML_defaultNorm zfs_ML_defaultNorm
+#define OF_base zfs_OF_base
+#define OF_bits zfs_OF_bits
+#define OF_defaultDTable zfs_OF_defaultDTable
+#define OF_defaultNorm zfs_OF_defaultNorm
+#define POOL_add zfs_POOL_add
+#define POOL_create zfs_POOL_create
+#define POOL_create_advanced zfs_POOL_create_advanced
+#define POOL_free zfs_POOL_free
+#define POOL_resize zfs_POOL_resize
+#define POOL_sizeof zfs_POOL_sizeof
+#define POOL_tryAdd zfs_POOL_tryAdd
+#define ZSTD_CCtxParams_getParameter zfs_ZSTD_CCtxParams_getParameter
+#define ZSTD_CCtxParams_init zfs_ZSTD_CCtxParams_init
+#define ZSTD_CCtxParams_init_advanced zfs_ZSTD_CCtxParams_init_advanced
+#define ZSTD_CCtxParams_reset zfs_ZSTD_CCtxParams_reset
+#define ZSTD_CCtxParams_setParameter zfs_ZSTD_CCtxParams_setParameter
+#define ZSTD_CCtx_getParameter zfs_ZSTD_CCtx_getParameter
+#define ZSTD_CCtx_loadDictionary zfs_ZSTD_CCtx_loadDictionary
+#define ZSTD_CCtx_loadDictionary_advanced zfs_ZSTD_CCtx_loadDictionary_advanced
+#define ZSTD_CCtx_loadDictionary_byReference \
+ zfs_ZSTD_CCtx_loadDictionary_byReference
+#define ZSTD_CCtx_refCDict zfs_ZSTD_CCtx_refCDict
+#define ZSTD_CCtx_refPrefix zfs_ZSTD_CCtx_refPrefix
+#define ZSTD_CCtx_refPrefix_advanced zfs_ZSTD_CCtx_refPrefix_advanced
+#define ZSTD_CCtx_reset zfs_ZSTD_CCtx_reset
+#define ZSTD_CCtx_setParameter zfs_ZSTD_CCtx_setParameter
+#define ZSTD_CCtx_setParametersUsingCCtxParams \
+ zfs_ZSTD_CCtx_setParametersUsingCCtxParams
+#define ZSTD_CCtx_setPledgedSrcSize zfs_ZSTD_CCtx_setPledgedSrcSize
+#define ZSTD_CStreamInSize zfs_ZSTD_CStreamInSize
+#define ZSTD_CStreamOutSize zfs_ZSTD_CStreamOutSize
+#define ZSTD_DCtx_loadDictionary zfs_ZSTD_DCtx_loadDictionary
+#define ZSTD_DCtx_loadDictionary_advanced zfs_ZSTD_DCtx_loadDictionary_advanced
+#define ZSTD_DCtx_loadDictionary_byReference \
+ zfs_ZSTD_DCtx_loadDictionary_byReference
+#define ZSTD_DCtx_refDDict zfs_ZSTD_DCtx_refDDict
+#define ZSTD_DCtx_refPrefix zfs_ZSTD_DCtx_refPrefix
+#define ZSTD_DCtx_refPrefix_advanced zfs_ZSTD_DCtx_refPrefix_advanced
+#define ZSTD_DCtx_reset zfs_ZSTD_DCtx_reset
+#define ZSTD_DCtx_setFormat zfs_ZSTD_DCtx_setFormat
+#define ZSTD_DCtx_setMaxWindowSize zfs_ZSTD_DCtx_setMaxWindowSize
+#define ZSTD_DCtx_setParameter zfs_ZSTD_DCtx_setParameter
+#define ZSTD_DDict_dictContent zfs_ZSTD_DDict_dictContent
+#define ZSTD_DDict_dictSize zfs_ZSTD_DDict_dictSize
+#define ZSTD_DStreamInSize zfs_ZSTD_DStreamInSize
+#define ZSTD_DStreamOutSize zfs_ZSTD_DStreamOutSize
+#define ZSTD_DUBT_findBestMatch zfs_ZSTD_DUBT_findBestMatch
+#define ZSTD_NCountCost zfs_ZSTD_NCountCost
+#define ZSTD_XXH64_digest zfs_ZSTD_XXH64_digest
+#define ZSTD_adjustCParams zfs_ZSTD_adjustCParams
+#define ZSTD_assignParamsToCCtxParams zfs_ZSTD_assignParamsToCCtxParams
+#define ZSTD_buildCTable zfs_ZSTD_buildCTable
+#define ZSTD_buildFSETable zfs_ZSTD_buildFSETable
+#define ZSTD_buildSeqStore zfs_ZSTD_buildSeqStore
+#define ZSTD_buildSeqTable zfs_ZSTD_buildSeqTable
+#define ZSTD_cParam_getBounds zfs_ZSTD_cParam_getBounds
+#define ZSTD_cParam_withinBounds zfs_ZSTD_cParam_withinBounds
+#define ZSTD_calloc zfs_ZSTD_calloc
+#define ZSTD_checkCParams zfs_ZSTD_checkCParams
+#define ZSTD_checkContinuity zfs_ZSTD_checkContinuity
+#define ZSTD_compress zfs_ZSTD_compress
+#define ZSTD_compress2 zfs_ZSTD_compress2
+#define ZSTD_compressBegin zfs_ZSTD_compressBegin
+#define ZSTD_compressBegin_advanced zfs_ZSTD_compressBegin_advanced
+#define ZSTD_compressBegin_advanced_internal \
+ zfs_ZSTD_compressBegin_advanced_internal
+#define ZSTD_compressBegin_usingCDict zfs_ZSTD_compressBegin_usingCDict
+#define ZSTD_compressBegin_usingCDict_advanced \
+ zfs_ZSTD_compressBegin_usingCDict_advanced
+#define ZSTD_compressBegin_usingDict zfs_ZSTD_compressBegin_usingDict
+#define ZSTD_compressBlock zfs_ZSTD_compressBlock
+#define ZSTD_compressBlock_btlazy2 zfs_ZSTD_compressBlock_btlazy2
+#define ZSTD_compressBlock_btlazy2_dictMatchState \
+ zfs_ZSTD_compressBlock_btlazy2_dictMatchState
+#define ZSTD_compressBlock_btlazy2_extDict \
+ zfs_ZSTD_compressBlock_btlazy2_extDict
+#define ZSTD_compressBlock_btopt zfs_ZSTD_compressBlock_btopt
+#define ZSTD_compressBlock_btopt_dictMatchState \
+ zfs_ZSTD_compressBlock_btopt_dictMatchState
+#define ZSTD_compressBlock_btopt_extDict zfs_ZSTD_compressBlock_btopt_extDict
+#define ZSTD_compressBlock_btultra zfs_ZSTD_compressBlock_btultra
+#define ZSTD_compressBlock_btultra2 zfs_ZSTD_compressBlock_btultra2
+#define ZSTD_compressBlock_btultra_dictMatchState \
+ zfs_ZSTD_compressBlock_btultra_dictMatchState
+#define ZSTD_compressBlock_btultra_extDict \
+ zfs_ZSTD_compressBlock_btultra_extDict
+#define ZSTD_compressBlock_doubleFast zfs_ZSTD_compressBlock_doubleFast
+#define ZSTD_compressBlock_doubleFast_dictMatchState \
+ zfs_ZSTD_compressBlock_doubleFast_dictMatchState
+#define ZSTD_compressBlock_doubleFast_extDict \
+ zfs_ZSTD_compressBlock_doubleFast_extDict
+#define ZSTD_compressBlock_doubleFast_extDict_generic \
+ zfs_ZSTD_compressBlock_doubleFast_extDict_generic
+#define ZSTD_compressBlock_fast zfs_ZSTD_compressBlock_fast
+#define ZSTD_compressBlock_fast_dictMatchState \
+ zfs_ZSTD_compressBlock_fast_dictMatchState
+#define ZSTD_compressBlock_fast_extDict zfs_ZSTD_compressBlock_fast_extDict
+#define ZSTD_compressBlock_fast_extDict_generic \
+ zfs_ZSTD_compressBlock_fast_extDict_generic
+#define ZSTD_compressBlock_greedy zfs_ZSTD_compressBlock_greedy
+#define ZSTD_compressBlock_greedy_dictMatchState \
+ zfs_ZSTD_compressBlock_greedy_dictMatchState
+#define ZSTD_compressBlock_greedy_extDict zfs_ZSTD_compressBlock_greedy_extDict
+#define ZSTD_compressBlock_internal zfs_ZSTD_compressBlock_internal
+#define ZSTD_compressBlock_lazy zfs_ZSTD_compressBlock_lazy
+#define ZSTD_compressBlock_lazy2 zfs_ZSTD_compressBlock_lazy2
+#define ZSTD_compressBlock_lazy2_dictMatchState \
+ zfs_ZSTD_compressBlock_lazy2_dictMatchState
+#define ZSTD_compressBlock_lazy2_extDict zfs_ZSTD_compressBlock_lazy2_extDict
+#define ZSTD_compressBlock_lazy_dictMatchState \
+ zfs_ZSTD_compressBlock_lazy_dictMatchState
+#define ZSTD_compressBlock_lazy_extDict zfs_ZSTD_compressBlock_lazy_extDict
+#define ZSTD_compressBound zfs_ZSTD_compressBound
+#define ZSTD_compressCCtx zfs_ZSTD_compressCCtx
+#define ZSTD_compressContinue zfs_ZSTD_compressContinue
+#define ZSTD_compressContinue_internal zfs_ZSTD_compressContinue_internal
+#define ZSTD_compressEnd zfs_ZSTD_compressEnd
+#define ZSTD_compressLiterals zfs_ZSTD_compressLiterals
+#define ZSTD_compressRleLiteralsBlock zfs_ZSTD_compressRleLiteralsBlock
+#define ZSTD_compressStream zfs_ZSTD_compressStream
+#define ZSTD_compressStream2 zfs_ZSTD_compressStream2
+#define ZSTD_compressStream2_simpleArgs zfs_ZSTD_compressStream2_simpleArgs
+#define ZSTD_compressSuperBlock zfs_ZSTD_compressSuperBlock
+#define ZSTD_compress_advanced zfs_ZSTD_compress_advanced
+#define ZSTD_compress_advanced_internal zfs_ZSTD_compress_advanced_internal
+#define ZSTD_compress_internal zfs_ZSTD_compress_internal
+#define ZSTD_compress_usingCDict zfs_ZSTD_compress_usingCDict
+#define ZSTD_compress_usingCDict_advanced zfs_ZSTD_compress_usingCDict_advanced
+#define ZSTD_compress_usingDict zfs_ZSTD_compress_usingDict
+#define ZSTD_copyCCtx zfs_ZSTD_copyCCtx
+#define ZSTD_copyDCtx zfs_ZSTD_copyDCtx
+#define ZSTD_copyDDictParameters zfs_ZSTD_copyDDictParameters
+#define ZSTD_count zfs_ZSTD_count
+#define ZSTD_count_2segments zfs_ZSTD_count_2segments
+#define ZSTD_createCCtx zfs_ZSTD_createCCtx
+#define ZSTD_createCCtxParams zfs_ZSTD_createCCtxParams
+#define ZSTD_createCCtx_advanced zfs_ZSTD_createCCtx_advanced
+#define ZSTD_createCDict zfs_ZSTD_createCDict
+#define ZSTD_createCDict_advanced zfs_ZSTD_createCDict_advanced
+#define ZSTD_createCDict_byReference zfs_ZSTD_createCDict_byReference
+#define ZSTD_createCStream zfs_ZSTD_createCStream
+#define ZSTD_createCStream_advanced zfs_ZSTD_createCStream_advanced
+#define ZSTD_createDCtx zfs_ZSTD_createDCtx
+#define ZSTD_createDCtx_advanced zfs_ZSTD_createDCtx_advanced
+#define ZSTD_createDDict zfs_ZSTD_createDDict
+#define ZSTD_createDDict_advanced zfs_ZSTD_createDDict_advanced
+#define ZSTD_createDDict_byReference zfs_ZSTD_createDDict_byReference
+#define ZSTD_createDStream zfs_ZSTD_createDStream
+#define ZSTD_createDStream_advanced zfs_ZSTD_createDStream_advanced
+#define ZSTD_crossEntropyCost zfs_ZSTD_crossEntropyCost
+#define ZSTD_cycleLog zfs_ZSTD_cycleLog
+#define ZSTD_dParam_getBounds zfs_ZSTD_dParam_getBounds
+#define ZSTD_decodeLiteralsBlock zfs_ZSTD_decodeLiteralsBlock
+#define ZSTD_decodeSeqHeaders zfs_ZSTD_decodeSeqHeaders
+#define ZSTD_decodingBufferSize_min zfs_ZSTD_decodingBufferSize_min
+#define ZSTD_decompress zfs_ZSTD_decompress
+#define ZSTD_decompressBegin zfs_ZSTD_decompressBegin
+#define ZSTD_decompressBegin_usingDDict zfs_ZSTD_decompressBegin_usingDDict
+#define ZSTD_decompressBegin_usingDict zfs_ZSTD_decompressBegin_usingDict
+#define ZSTD_decompressBlock zfs_ZSTD_decompressBlock
+#define ZSTD_decompressBlock_internal zfs_ZSTD_decompressBlock_internal
+#define ZSTD_decompressBound zfs_ZSTD_decompressBound
+#define ZSTD_decompressContinue zfs_ZSTD_decompressContinue
+#define ZSTD_decompressContinueStream zfs_ZSTD_decompressContinueStream
+#define ZSTD_decompressDCtx zfs_ZSTD_decompressDCtx
+#define ZSTD_decompressMultiFrame zfs_ZSTD_decompressMultiFrame
+#define ZSTD_decompressStream zfs_ZSTD_decompressStream
+#define ZSTD_decompressStream_simpleArgs zfs_ZSTD_decompressStream_simpleArgs
+#define ZSTD_decompress_usingDDict zfs_ZSTD_decompress_usingDDict
+#define ZSTD_decompress_usingDict zfs_ZSTD_decompress_usingDict
+#define ZSTD_defaultCParameters zfs_ZSTD_defaultCParameters
+#define ZSTD_did_fieldSize zfs_ZSTD_did_fieldSize
+#define ZSTD_encodeSequences zfs_ZSTD_encodeSequences
+#define ZSTD_encodeSequences_default zfs_ZSTD_encodeSequences_default
+#define ZSTD_endStream zfs_ZSTD_endStream
+#define ZSTD_estimateCCtxSize zfs_ZSTD_estimateCCtxSize
+#define ZSTD_estimateCCtxSize_usingCCtxParams \
+ zfs_ZSTD_estimateCCtxSize_usingCCtxParams
+#define ZSTD_estimateCCtxSize_usingCParams \
+ zfs_ZSTD_estimateCCtxSize_usingCParams
+#define ZSTD_estimateCDictSize zfs_ZSTD_estimateCDictSize
+#define ZSTD_estimateCDictSize_advanced zfs_ZSTD_estimateCDictSize_advanced
+#define ZSTD_estimateCStreamSize zfs_ZSTD_estimateCStreamSize
+#define ZSTD_estimateCStreamSize_usingCCtxParams \
+ zfs_ZSTD_estimateCStreamSize_usingCCtxParams
+#define ZSTD_estimateCStreamSize_usingCParams \
+ zfs_ZSTD_estimateCStreamSize_usingCParams
+#define ZSTD_estimateDCtxSize zfs_ZSTD_estimateDCtxSize
+#define ZSTD_estimateDDictSize zfs_ZSTD_estimateDDictSize
+#define ZSTD_estimateDStreamSize zfs_ZSTD_estimateDStreamSize
+#define ZSTD_estimateDStreamSize_fromFrame \
+ zfs_ZSTD_estimateDStreamSize_fromFrame
+#define ZSTD_fcs_fieldSize zfs_ZSTD_fcs_fieldSize
+#define ZSTD_fillDoubleHashTable zfs_ZSTD_fillDoubleHashTable
+#define ZSTD_fillHashTable zfs_ZSTD_fillHashTable
+#define ZSTD_findDecompressedSize zfs_ZSTD_findDecompressedSize
+#define ZSTD_findFrameCompressedSize zfs_ZSTD_findFrameCompressedSize
+#define ZSTD_findFrameSizeInfo zfs_ZSTD_findFrameSizeInfo
+#define ZSTD_flushStream zfs_ZSTD_flushStream
+#define ZSTD_frameHeaderSize zfs_ZSTD_frameHeaderSize
+#define ZSTD_free zfs_ZSTD_free
+#define ZSTD_freeCCtx zfs_ZSTD_freeCCtx
+#define ZSTD_freeCCtxParams zfs_ZSTD_freeCCtxParams
+#define ZSTD_freeCDict zfs_ZSTD_freeCDict
+#define ZSTD_freeCStream zfs_ZSTD_freeCStream
+#define ZSTD_freeDCtx zfs_ZSTD_freeDCtx
+#define ZSTD_freeDDict zfs_ZSTD_freeDDict
+#define ZSTD_freeDStream zfs_ZSTD_freeDStream
+#define ZSTD_fseBitCost zfs_ZSTD_fseBitCost
+#define ZSTD_getBlockSize zfs_ZSTD_getBlockSize
+#define ZSTD_getCParams zfs_ZSTD_getCParams
+#define ZSTD_getCParamsFromCCtxParams zfs_ZSTD_getCParamsFromCCtxParams
+#define ZSTD_getCParamsFromCDict zfs_ZSTD_getCParamsFromCDict
+#define ZSTD_getCParams_internal zfs_ZSTD_getCParams_internal
+#define ZSTD_getDDict zfs_ZSTD_getDDict
+#define ZSTD_getDecompressedSize zfs_ZSTD_getDecompressedSize
+#define ZSTD_getDictID_fromDDict zfs_ZSTD_getDictID_fromDDict
+#define ZSTD_getDictID_fromDict zfs_ZSTD_getDictID_fromDict
+#define ZSTD_getDictID_fromFrame zfs_ZSTD_getDictID_fromFrame
+#define ZSTD_getErrorCode zfs_ZSTD_getErrorCode
+#define ZSTD_getErrorName zfs_ZSTD_getErrorName
+#define ZSTD_getErrorString zfs_ZSTD_getErrorString
+#define ZSTD_getFrameContentSize zfs_ZSTD_getFrameContentSize
+#define ZSTD_getFrameHeader zfs_ZSTD_getFrameHeader
+#define ZSTD_getFrameHeader_advanced zfs_ZSTD_getFrameHeader_advanced
+#define ZSTD_getFrameProgression zfs_ZSTD_getFrameProgression
+#define ZSTD_getParams zfs_ZSTD_getParams
+#define ZSTD_getSeqStore zfs_ZSTD_getSeqStore
+#define ZSTD_getSequences zfs_ZSTD_getSequences
+#define ZSTD_getcBlockSize zfs_ZSTD_getcBlockSize
+#define ZSTD_hashPtr zfs_ZSTD_hashPtr
+#define ZSTD_initCDict_internal zfs_ZSTD_initCDict_internal
+#define ZSTD_initCStream zfs_ZSTD_initCStream
+#define ZSTD_initCStream_advanced zfs_ZSTD_initCStream_advanced
+#define ZSTD_initCStream_internal zfs_ZSTD_initCStream_internal
+#define ZSTD_initCStream_srcSize zfs_ZSTD_initCStream_srcSize
+#define ZSTD_initCStream_usingCDict zfs_ZSTD_initCStream_usingCDict
+#define ZSTD_initCStream_usingCDict_advanced \
+ zfs_ZSTD_initCStream_usingCDict_advanced
+#define ZSTD_initCStream_usingDict zfs_ZSTD_initCStream_usingDict
+#define ZSTD_initDDict_internal zfs_ZSTD_initDDict_internal
+#define ZSTD_initDStream zfs_ZSTD_initDStream
+#define ZSTD_initDStream_usingDDict zfs_ZSTD_initDStream_usingDDict
+#define ZSTD_initDStream_usingDict zfs_ZSTD_initDStream_usingDict
+#define ZSTD_initFseState zfs_ZSTD_initFseState
+#define ZSTD_initStaticCCtx zfs_ZSTD_initStaticCCtx
+#define ZSTD_initStaticCDict zfs_ZSTD_initStaticCDict
+#define ZSTD_initStaticCStream zfs_ZSTD_initStaticCStream
+#define ZSTD_initStaticDCtx zfs_ZSTD_initStaticDCtx
+#define ZSTD_initStaticDDict zfs_ZSTD_initStaticDDict
+#define ZSTD_initStaticDStream zfs_ZSTD_initStaticDStream
+#define ZSTD_initStats_ultra zfs_ZSTD_initStats_ultra
+#define ZSTD_insertAndFindFirstIndex zfs_ZSTD_insertAndFindFirstIndex
+#define ZSTD_insertAndFindFirstIndexHash3 zfs_ZSTD_insertAndFindFirstIndexHash3
+#define ZSTD_insertAndFindFirstIndex_internal \
+ zfs_ZSTD_insertAndFindFirstIndex_internal
+#define ZSTD_insertBlock zfs_ZSTD_insertBlock
+#define ZSTD_invalidateRepCodes zfs_ZSTD_invalidateRepCodes
+#define ZSTD_isFrame zfs_ZSTD_isFrame
+#define ZSTD_ldm_adjustParameters zfs_ZSTD_ldm_adjustParameters
+#define ZSTD_ldm_blockCompress zfs_ZSTD_ldm_blockCompress
+#define ZSTD_ldm_fillHashTable zfs_ZSTD_ldm_fillHashTable
+#define ZSTD_ldm_generateSequences zfs_ZSTD_ldm_generateSequences
+#define ZSTD_ldm_getMaxNbSeq zfs_ZSTD_ldm_getMaxNbSeq
+#define ZSTD_ldm_getTableSize zfs_ZSTD_ldm_getTableSize
+#define ZSTD_ldm_skipSequences zfs_ZSTD_ldm_skipSequences
+#define ZSTD_loadCEntropy zfs_ZSTD_loadCEntropy
+#define ZSTD_loadDEntropy zfs_ZSTD_loadDEntropy
+#define ZSTD_loadDictionaryContent zfs_ZSTD_loadDictionaryContent
+#define ZSTD_makeCCtxParamsFromCParams zfs_ZSTD_makeCCtxParamsFromCParams
+#define ZSTD_malloc zfs_ZSTD_malloc
+#define ZSTD_maxCLevel zfs_ZSTD_maxCLevel
+#define ZSTD_minCLevel zfs_ZSTD_minCLevel
+#define ZSTD_nextInputType zfs_ZSTD_nextInputType
+#define ZSTD_nextSrcSizeToDecompress zfs_ZSTD_nextSrcSizeToDecompress
+#define ZSTD_noCompressLiterals zfs_ZSTD_noCompressLiterals
+#define ZSTD_referenceExternalSequences zfs_ZSTD_referenceExternalSequences
+#define ZSTD_rescaleFreqs zfs_ZSTD_rescaleFreqs
+#define ZSTD_resetCCtx_internal zfs_ZSTD_resetCCtx_internal
+#define ZSTD_resetCCtx_usingCDict zfs_ZSTD_resetCCtx_usingCDict
+#define ZSTD_resetCStream zfs_ZSTD_resetCStream
+#define ZSTD_resetDStream zfs_ZSTD_resetDStream
+#define ZSTD_resetSeqStore zfs_ZSTD_resetSeqStore
+#define ZSTD_reset_compressedBlockState zfs_ZSTD_reset_compressedBlockState
+#define ZSTD_safecopy zfs_ZSTD_safecopy
+#define ZSTD_selectBlockCompressor zfs_ZSTD_selectBlockCompressor
+#define ZSTD_selectEncodingType zfs_ZSTD_selectEncodingType
+#define ZSTD_seqToCodes zfs_ZSTD_seqToCodes
+#define ZSTD_sizeof_CCtx zfs_ZSTD_sizeof_CCtx
+#define ZSTD_sizeof_CDict zfs_ZSTD_sizeof_CDict
+#define ZSTD_sizeof_CStream zfs_ZSTD_sizeof_CStream
+#define ZSTD_sizeof_DCtx zfs_ZSTD_sizeof_DCtx
+#define ZSTD_sizeof_DDict zfs_ZSTD_sizeof_DDict
+#define ZSTD_sizeof_DStream zfs_ZSTD_sizeof_DStream
+#define ZSTD_toFlushNow zfs_ZSTD_toFlushNow
+#define ZSTD_updateRep zfs_ZSTD_updateRep
+#define ZSTD_updateStats zfs_ZSTD_updateStats
+#define ZSTD_updateTree zfs_ZSTD_updateTree
+#define ZSTD_versionNumber zfs_ZSTD_versionNumber
+#define ZSTD_versionString zfs_ZSTD_versionString
+#define ZSTD_writeFrameHeader zfs_ZSTD_writeFrameHeader
+#define ZSTD_writeLastEmptyBlock zfs_ZSTD_writeLastEmptyBlock
+#define algoTime zfs_algoTime
+#define attachDictSizeCutoffs zfs_attachDictSizeCutoffs
+#define g_ctx zfs_g_ctx
+#define g_debuglevel zfs_g_debuglevel
+#define kInverseProbabilityLog256 zfs_kInverseProbabilityLog256
+#define repStartValue zfs_repStartValue
+#define FSE_isError zfs_FSE_isError
+#define HUF_isError zfs_HUF_isError
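
Editor's note: as the header comment above explains, the wrapper works purely at the preprocessor level. Any translation unit that includes zstd_compat_wrapper.h before the zstd headers ends up referencing only zfs_-prefixed symbols, so a statically linked kernel that already contains its own zstd cannot collide with this copy. A minimal consumer sketch (illustrative only, not taken from the tree):

/* Illustrative consumer of the renamed symbols. */
#include "zstd_compat_wrapper.h"   /* must come before any zstd header */
#include "zstd.h"

unsigned
zfs_zstd_lib_version(void)
{
	/*
	 * The preprocessor rewrites this call to zfs_ZSTD_versionNumber(),
	 * so it links against the bundled library even when the kernel
	 * already exports an (older) ZSTD_versionNumber().
	 */
	return (ZSTD_versionNumber());
}
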
diff --git a/sys/contrib/openzfs/module/zstd/lib/zstd.c b/sys/contrib/openzfs/module/zstd/lib/zstd.c
new file mode 100644
index 000000000000..acdd4d9dac9d
--- /dev/null
+++ b/sys/contrib/openzfs/module/zstd/lib/zstd.c
@@ -0,0 +1,27826 @@
+/*
+ * BSD 3-Clause Clear License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. All rights reserved.
+ * Copyright (c) 2019-2020, Michael Niewöhner. All rights reserved.
+ */
+
+#define MEM_MODULE
+#define XXH_NAMESPACE ZSTD_
+#define XXH_PRIVATE_API
+#define XXH_INLINE_ALL
+#define ZSTD_LEGACY_SUPPORT 0
+#define ZSTD_LIB_DICTBUILDER 0
+#define ZSTD_LIB_DEPRECATED 0
+#define ZSTD_NOBENCH
+
+/**** start inlining common/debug.c ****/
+/* ******************************************************************
+ * debug
+ * Part of FSE library
+ * Copyright (c) 2013-2020, Yann Collet, Facebook, Inc.
+ *
+ * You can contact the author at :
+ * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+****************************************************************** */
+
+
+/*
+ * This module only hosts one global variable
+ * which can be used to dynamically influence the verbosity of traces,
+ * such as DEBUGLOG and RAWLOG
+ */
+
+/**** start inlining debug.h ****/
+/* ******************************************************************
+ * debug
+ * Part of FSE library
+ * Copyright (c) 2013-2020, Yann Collet, Facebook, Inc.
+ *
+ * You can contact the author at :
+ * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+****************************************************************** */
+
+
+/*
+ * The purpose of this header is to enable debug functions.
+ * They regroup assert(), DEBUGLOG() and RAWLOG() for run-time,
+ * and DEBUG_STATIC_ASSERT() for compile-time.
+ *
+ * By default, DEBUGLEVEL==0, which means run-time debug is disabled.
+ *
+ * Level 1 enables assert() only.
+ * Starting level 2, traces can be generated and pushed to stderr.
+ * The higher the level, the more verbose the traces.
+ *
+ * It's possible to dynamically adjust the level using the variable
+ * g_debuglevel, which is only declared if DEBUGLEVEL>=2. It is a global
+ * variable and is not multi-thread protected (use with care).
+ */
+
+#ifndef DEBUG_H_12987983217
+#define DEBUG_H_12987983217
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+
+/* static assert is triggered at compile time, leaving no runtime artefact.
+ * static assert only works with compile-time constants.
+ * Also, this variant can only be used inside a function. */
+#define DEBUG_STATIC_ASSERT(c) (void)sizeof(char[(c) ? 1 : -1])
+
+
+/* DEBUGLEVEL is expected to be defined externally,
+ * typically through compiler command line.
+ * Value must be a number. */
+#ifndef DEBUGLEVEL
+# define DEBUGLEVEL 0
+#endif
+
+
+/* DEBUGFILE can be defined externally,
+ * typically through compiler command line.
+ * note : currently useless.
+ * Value must be stderr or stdout */
+#ifndef DEBUGFILE
+# define DEBUGFILE stderr
+#endif
+
+
+/* recommended values for DEBUGLEVEL :
+ * 0 : release mode, no debug, all run-time checks disabled
+ * 1 : enables assert() only, no display
+ * 2 : reserved, for currently active debug path
+ * 3 : events once per object lifetime (CCtx, CDict, etc.)
+ * 4 : events once per frame
+ * 5 : events once per block
+ * 6 : events once per sequence (verbose)
+ * 7+: events at every position (*very* verbose)
+ *
+ * It's generally inconvenient to output traces > 5.
+ * In that case, high verbosity levels can be triggered selectively
+ * by modifying g_debuglevel.
+ */
+
+#if (DEBUGLEVEL>=1)
+# include <assert.h>
+#else
+# ifndef assert /* assert may be already defined, due to prior #include <assert.h> */
+# define assert(condition) ((void)0) /* disable assert (default) */
+# endif
+#endif
+
+#if (DEBUGLEVEL>=2)
+# include <stdio.h>
+extern int g_debuglevel; /* the variable is only declared,
+ it actually lives in debug.c,
+ and is shared by the whole process.
+ It's not thread-safe.
+ It's useful when enabling very verbose levels
+ on selective conditions (such as position in src) */
+
+# define RAWLOG(l, ...) { \
+ if (l<=g_debuglevel) { \
+ fprintf(stderr, __VA_ARGS__); \
+ } }
+# define DEBUGLOG(l, ...) { \
+ if (l<=g_debuglevel) { \
+ fprintf(stderr, __FILE__ ": " __VA_ARGS__); \
+ fprintf(stderr, " \n"); \
+ } }
+#else
+# define RAWLOG(l, ...) {} /* disabled */
+# define DEBUGLOG(l, ...) {} /* disabled */
+#endif
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* DEBUG_H_12987983217 */
+/**** ended inlining debug.h ****/
+
+int g_debuglevel = DEBUGLEVEL;
+/**** ended inlining common/debug.c ****/
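
Editor's note: the debug machinery above is entirely compile-time gated. With the default DEBUGLEVEL of 0 both assert() and the log macros compile away; builds made with e.g. -DDEBUGLEVEL=2 or higher can also raise g_debuglevel at run time to make traces more verbose. A hedged example of typical usage, assuming the debug.h definitions above are in scope (the function itself is hypothetical):

/* Hypothetical example; compile with -DDEBUGLEVEL=5 or higher to see traces. */
#include <string.h>   /* memcpy */

static size_t
copy_block(void *dst, const void *src, size_t size)
{
	DEBUG_STATIC_ASSERT(sizeof (size_t) >= 4);      /* compile-time check, no runtime cost */
	assert(dst != NULL && src != NULL);             /* active only when DEBUGLEVEL >= 1 */
	DEBUGLOG(5, "copy_block: %u bytes", (unsigned)size);  /* level 5: once per block */
	memcpy(dst, src, size);
	return (size);
}

/* g_debuglevel = 7;  -- optionally raise verbosity at run time (DEBUGLEVEL >= 2 builds only) */
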
+/**** start inlining common/entropy_common.c ****/
+/* ******************************************************************
+ * Common functions of New Generation Entropy library
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
+ *
+ * You can contact the author at :
+ * - FSE+HUF source repository : https://github.com/Cyan4973/FiniteStateEntropy
+ * - Public forum : https://groups.google.com/forum/#!forum/lz4c
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+****************************************************************** */
+
+/* *************************************
+* Dependencies
+***************************************/
+/**** start inlining mem.h ****/
+/*
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef MEM_H_MODULE
+#define MEM_H_MODULE
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/*-****************************************
+* Dependencies
+******************************************/
+#include <stddef.h> /* size_t, ptrdiff_t */
+#include <string.h> /* memcpy */
+
+
+/*-****************************************
+* Compiler specifics
+******************************************/
+#if defined(_MSC_VER) /* Visual Studio */
+# include <stdlib.h> /* _byteswap_ulong */
+# include <intrin.h> /* _byteswap_* */
+#endif
+#if defined(__GNUC__)
+# define MEM_STATIC static __inline __attribute__((unused))
+#elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)
+# define MEM_STATIC static inline
+#elif defined(_MSC_VER)
+# define MEM_STATIC static __inline
+#else
+# define MEM_STATIC static /* this version may generate warnings for unused static functions; disable the relevant warning */
+#endif
+
+#ifndef __has_builtin
+# define __has_builtin(x) 0 /* compat. with non-clang compilers */
+#endif
+
+/* code only tested on 32 and 64 bits systems */
+#define MEM_STATIC_ASSERT(c) { enum { MEM_static_assert = 1/(int)(!!(c)) }; }
+MEM_STATIC void MEM_check(void) { MEM_STATIC_ASSERT((sizeof(size_t)==4) || (sizeof(size_t)==8)); }
+
+/* detects whether we are being compiled under msan */
+#if defined (__has_feature)
+# if __has_feature(memory_sanitizer)
+# define MEMORY_SANITIZER 1
+# endif
+#endif
+
+#if defined (MEMORY_SANITIZER)
+/* Not all platforms that support msan provide sanitizers/msan_interface.h.
+ * We therefore declare the functions we need ourselves, rather than trying to
+ * include the header file... */
+
+#include <stdint.h> /* intptr_t */
+
+/* Make memory region fully initialized (without changing its contents). */
+void __msan_unpoison(const volatile void *a, size_t size);
+
+/* Make memory region fully uninitialized (without changing its contents).
+ This is a legacy interface that does not update origin information. Use
+ __msan_allocated_memory() instead. */
+void __msan_poison(const volatile void *a, size_t size);
+
+/* Returns the offset of the first (at least partially) poisoned byte in the
+ memory range, or -1 if the whole range is good. */
+intptr_t __msan_test_shadow(const volatile void *x, size_t size);
+#endif
+
+/* detects whether we are being compiled under asan */
+#if defined (__has_feature)
+# if __has_feature(address_sanitizer)
+# define ADDRESS_SANITIZER 1
+# endif
+#elif defined(__SANITIZE_ADDRESS__)
+# define ADDRESS_SANITIZER 1
+#endif
+
+#if defined (ADDRESS_SANITIZER)
+/* Not all platforms that support asan provide sanitizers/asan_interface.h.
+ * We therefore declare the functions we need ourselves, rather than trying to
+ * include the header file... */
+
+/**
+ * Marks a memory region (<c>[addr, addr+size)</c>) as unaddressable.
+ *
+ * This memory must be previously allocated by your program. Instrumented
+ * code is forbidden from accessing addresses in this region until it is
+ * unpoisoned. This function is not guaranteed to poison the entire region -
+ * it could poison only a subregion of <c>[addr, addr+size)</c> due to ASan
+ * alignment restrictions.
+ *
+ * \note This function is not thread-safe because no two threads can poison or
+ * unpoison memory in the same memory region simultaneously.
+ *
+ * \param addr Start of memory region.
+ * \param size Size of memory region. */
+void __asan_poison_memory_region(void const volatile *addr, size_t size);
+
+/**
+ * Marks a memory region (<c>[addr, addr+size)</c>) as addressable.
+ *
+ * This memory must be previously allocated by your program. Accessing
+ * addresses in this region is allowed until this region is poisoned again.
+ * This function could unpoison a super-region of <c>[addr, addr+size)</c> due
+ * to ASan alignment restrictions.
+ *
+ * \note This function is not thread-safe because no two threads can
+ * poison or unpoison memory in the same memory region simultaneously.
+ *
+ * \param addr Start of memory region.
+ * \param size Size of memory region. */
+void __asan_unpoison_memory_region(void const volatile *addr, size_t size);
+#endif
+
+
+/*-**************************************************************
+* Basic Types
+*****************************************************************/
+#if !defined (__VMS) && (defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
+# include <stdint.h>
+ typedef uint8_t BYTE;
+ typedef uint16_t U16;
+ typedef int16_t S16;
+ typedef uint32_t U32;
+ typedef int32_t S32;
+ typedef uint64_t U64;
+ typedef int64_t S64;
+#else
+# include <limits.h>
+#if CHAR_BIT != 8
+# error "this implementation requires char to be exactly 8-bit type"
+#endif
+ typedef unsigned char BYTE;
+#if USHRT_MAX != 65535
+# error "this implementation requires short to be exactly 16-bit type"
+#endif
+ typedef unsigned short U16;
+ typedef signed short S16;
+#if UINT_MAX != 4294967295
+# error "this implementation requires int to be exactly 32-bit type"
+#endif
+ typedef unsigned int U32;
+ typedef signed int S32;
+/* note : there are no limits defined for long long type in C90.
+ * limits exist in C99, however, in such case, <stdint.h> is preferred */
+ typedef unsigned long long U64;
+ typedef signed long long S64;
+#endif
+
+
+/*-**************************************************************
+* Memory I/O
+*****************************************************************/
+/* MEM_FORCE_MEMORY_ACCESS :
+ * By default, access to unaligned memory is controlled by `memcpy()`, which is safe and portable.
+ * Unfortunately, on some target/compiler combinations, the generated assembly is sub-optimal.
+ * The switch below allows selecting a different access method for improved performance.
+ * Method 0 (default) : use `memcpy()`. Safe and portable.
+ * Method 1 : `__packed` statement. It relies on a compiler extension (i.e., not portable).
+ * This method is safe if your compiler supports it, and *generally* as fast or faster than `memcpy`.
+ * Method 2 : direct access. This method is portable but violates the C standard.
+ * It can generate buggy code on targets that depend on alignment.
+ * In some circumstances, it's the only known way to get the best performance (e.g. GCC + ARMv6).
+ * See http://fastcompression.blogspot.fr/2015/08/accessing-unaligned-memory.html for details.
+ * Prefer these methods in priority order (0 > 1 > 2)
+ */
+#ifndef MEM_FORCE_MEMORY_ACCESS /* can be defined externally, on command line for example */
+# if defined(__GNUC__) && ( defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6T2__) )
+# define MEM_FORCE_MEMORY_ACCESS 2
+# elif defined(__INTEL_COMPILER) || defined(__GNUC__) || defined(__ICCARM__)
+# define MEM_FORCE_MEMORY_ACCESS 1
+# endif
+#endif
+
+MEM_STATIC unsigned MEM_32bits(void) { return sizeof(size_t)==4; }
+MEM_STATIC unsigned MEM_64bits(void) { return sizeof(size_t)==8; }
+
+MEM_STATIC unsigned MEM_isLittleEndian(void)
+{
+ const union { U32 u; BYTE c[4]; } one = { 1 }; /* don't use static : performance detrimental */
+ return one.c[0];
+}
+
+#if defined(MEM_FORCE_MEMORY_ACCESS) && (MEM_FORCE_MEMORY_ACCESS==2)
+
+/* Violates the C standard by lying about structure alignment.
+Only use this if there is no other way to achieve best performance on the target platform. */
+MEM_STATIC U16 MEM_read16(const void* memPtr) { return *(const U16*) memPtr; }
+MEM_STATIC U32 MEM_read32(const void* memPtr) { return *(const U32*) memPtr; }
+MEM_STATIC U64 MEM_read64(const void* memPtr) { return *(const U64*) memPtr; }
+MEM_STATIC size_t MEM_readST(const void* memPtr) { return *(const size_t*) memPtr; }
+
+MEM_STATIC void MEM_write16(void* memPtr, U16 value) { *(U16*)memPtr = value; }
+MEM_STATIC void MEM_write32(void* memPtr, U32 value) { *(U32*)memPtr = value; }
+MEM_STATIC void MEM_write64(void* memPtr, U64 value) { *(U64*)memPtr = value; }
+
+#elif defined(MEM_FORCE_MEMORY_ACCESS) && (MEM_FORCE_MEMORY_ACCESS==1)
+
+/* __pack instructions are safer, but compiler specific, hence potentially problematic for some compilers */
+/* currently only defined for gcc and icc */
+#if defined(_MSC_VER) || (defined(__INTEL_COMPILER) && defined(WIN32))
+ __pragma( pack(push, 1) )
+ typedef struct { U16 v; } unalign16;
+ typedef struct { U32 v; } unalign32;
+ typedef struct { U64 v; } unalign64;
+ typedef struct { size_t v; } unalignArch;
+ __pragma( pack(pop) )
+#else
+ typedef struct { U16 v; } __attribute__((packed)) unalign16;
+ typedef struct { U32 v; } __attribute__((packed)) unalign32;
+ typedef struct { U64 v; } __attribute__((packed)) unalign64;
+ typedef struct { size_t v; } __attribute__((packed)) unalignArch;
+#endif
+
+MEM_STATIC U16 MEM_read16(const void* ptr) { return ((const unalign16*)ptr)->v; }
+MEM_STATIC U32 MEM_read32(const void* ptr) { return ((const unalign32*)ptr)->v; }
+MEM_STATIC U64 MEM_read64(const void* ptr) { return ((const unalign64*)ptr)->v; }
+MEM_STATIC size_t MEM_readST(const void* ptr) { return ((const unalignArch*)ptr)->v; }
+
+MEM_STATIC void MEM_write16(void* memPtr, U16 value) { ((unalign16*)memPtr)->v = value; }
+MEM_STATIC void MEM_write32(void* memPtr, U32 value) { ((unalign32*)memPtr)->v = value; }
+MEM_STATIC void MEM_write64(void* memPtr, U64 value) { ((unalign64*)memPtr)->v = value; }
+
+#else
+
+/* default method, safe and standard.
+ can sometimes prove slower */
+
+MEM_STATIC U16 MEM_read16(const void* memPtr)
+{
+ U16 val; memcpy(&val, memPtr, sizeof(val)); return val;
+}
+
+MEM_STATIC U32 MEM_read32(const void* memPtr)
+{
+ U32 val; memcpy(&val, memPtr, sizeof(val)); return val;
+}
+
+MEM_STATIC U64 MEM_read64(const void* memPtr)
+{
+ U64 val; memcpy(&val, memPtr, sizeof(val)); return val;
+}
+
+MEM_STATIC size_t MEM_readST(const void* memPtr)
+{
+ size_t val; memcpy(&val, memPtr, sizeof(val)); return val;
+}
+
+MEM_STATIC void MEM_write16(void* memPtr, U16 value)
+{
+ memcpy(memPtr, &value, sizeof(value));
+}
+
+MEM_STATIC void MEM_write32(void* memPtr, U32 value)
+{
+ memcpy(memPtr, &value, sizeof(value));
+}
+
+MEM_STATIC void MEM_write64(void* memPtr, U64 value)
+{
+ memcpy(memPtr, &value, sizeof(value));
+}
+
+#endif /* MEM_FORCE_MEMORY_ACCESS */
+
+MEM_STATIC U32 MEM_swap32(U32 in)
+{
+#if defined(_MSC_VER) /* Visual Studio */
+ return _byteswap_ulong(in);
+#elif (defined (__GNUC__) && (__GNUC__ * 100 + __GNUC_MINOR__ >= 403)) \
+ || (defined(__clang__) && __has_builtin(__builtin_bswap32))
+ return __builtin_bswap32(in);
+#else
+ return ((in << 24) & 0xff000000 ) |
+ ((in << 8) & 0x00ff0000 ) |
+ ((in >> 8) & 0x0000ff00 ) |
+ ((in >> 24) & 0x000000ff );
+#endif
+}
+
+MEM_STATIC U64 MEM_swap64(U64 in)
+{
+#if defined(_MSC_VER) /* Visual Studio */
+ return _byteswap_uint64(in);
+#elif (defined (__GNUC__) && (__GNUC__ * 100 + __GNUC_MINOR__ >= 403)) \
+ || (defined(__clang__) && __has_builtin(__builtin_bswap64))
+ return __builtin_bswap64(in);
+#else
+ return ((in << 56) & 0xff00000000000000ULL) |
+ ((in << 40) & 0x00ff000000000000ULL) |
+ ((in << 24) & 0x0000ff0000000000ULL) |
+ ((in << 8) & 0x000000ff00000000ULL) |
+ ((in >> 8) & 0x00000000ff000000ULL) |
+ ((in >> 24) & 0x0000000000ff0000ULL) |
+ ((in >> 40) & 0x000000000000ff00ULL) |
+ ((in >> 56) & 0x00000000000000ffULL);
+#endif
+}
+
+MEM_STATIC size_t MEM_swapST(size_t in)
+{
+ if (MEM_32bits())
+ return (size_t)MEM_swap32((U32)in);
+ else
+ return (size_t)MEM_swap64((U64)in);
+}
+
+/*=== Little endian r/w ===*/
+
+MEM_STATIC U16 MEM_readLE16(const void* memPtr)
+{
+ if (MEM_isLittleEndian())
+ return MEM_read16(memPtr);
+ else {
+ const BYTE* p = (const BYTE*)memPtr;
+ return (U16)(p[0] + (p[1]<<8));
+ }
+}
+
+MEM_STATIC void MEM_writeLE16(void* memPtr, U16 val)
+{
+ if (MEM_isLittleEndian()) {
+ MEM_write16(memPtr, val);
+ } else {
+ BYTE* p = (BYTE*)memPtr;
+ p[0] = (BYTE)val;
+ p[1] = (BYTE)(val>>8);
+ }
+}
+
+MEM_STATIC U32 MEM_readLE24(const void* memPtr)
+{
+ return MEM_readLE16(memPtr) + (((const BYTE*)memPtr)[2] << 16);
+}
+
+MEM_STATIC void MEM_writeLE24(void* memPtr, U32 val)
+{
+ MEM_writeLE16(memPtr, (U16)val);
+ ((BYTE*)memPtr)[2] = (BYTE)(val>>16);
+}
+
+MEM_STATIC U32 MEM_readLE32(const void* memPtr)
+{
+ if (MEM_isLittleEndian())
+ return MEM_read32(memPtr);
+ else
+ return MEM_swap32(MEM_read32(memPtr));
+}
+
+MEM_STATIC void MEM_writeLE32(void* memPtr, U32 val32)
+{
+ if (MEM_isLittleEndian())
+ MEM_write32(memPtr, val32);
+ else
+ MEM_write32(memPtr, MEM_swap32(val32));
+}
+
+MEM_STATIC U64 MEM_readLE64(const void* memPtr)
+{
+ if (MEM_isLittleEndian())
+ return MEM_read64(memPtr);
+ else
+ return MEM_swap64(MEM_read64(memPtr));
+}
+
+MEM_STATIC void MEM_writeLE64(void* memPtr, U64 val64)
+{
+ if (MEM_isLittleEndian())
+ MEM_write64(memPtr, val64);
+ else
+ MEM_write64(memPtr, MEM_swap64(val64));
+}
+
+MEM_STATIC size_t MEM_readLEST(const void* memPtr)
+{
+ if (MEM_32bits())
+ return (size_t)MEM_readLE32(memPtr);
+ else
+ return (size_t)MEM_readLE64(memPtr);
+}
+
+MEM_STATIC void MEM_writeLEST(void* memPtr, size_t val)
+{
+ if (MEM_32bits())
+ MEM_writeLE32(memPtr, (U32)val);
+ else
+ MEM_writeLE64(memPtr, (U64)val);
+}
+
+/*=== Big endian r/w ===*/
+
+MEM_STATIC U32 MEM_readBE32(const void* memPtr)
+{
+ if (MEM_isLittleEndian())
+ return MEM_swap32(MEM_read32(memPtr));
+ else
+ return MEM_read32(memPtr);
+}
+
+MEM_STATIC void MEM_writeBE32(void* memPtr, U32 val32)
+{
+ if (MEM_isLittleEndian())
+ MEM_write32(memPtr, MEM_swap32(val32));
+ else
+ MEM_write32(memPtr, val32);
+}
+
+MEM_STATIC U64 MEM_readBE64(const void* memPtr)
+{
+ if (MEM_isLittleEndian())
+ return MEM_swap64(MEM_read64(memPtr));
+ else
+ return MEM_read64(memPtr);
+}
+
+MEM_STATIC void MEM_writeBE64(void* memPtr, U64 val64)
+{
+ if (MEM_isLittleEndian())
+ MEM_write64(memPtr, MEM_swap64(val64));
+ else
+ MEM_write64(memPtr, val64);
+}
+
+MEM_STATIC size_t MEM_readBEST(const void* memPtr)
+{
+ if (MEM_32bits())
+ return (size_t)MEM_readBE32(memPtr);
+ else
+ return (size_t)MEM_readBE64(memPtr);
+}
+
+MEM_STATIC void MEM_writeBEST(void* memPtr, size_t val)
+{
+ if (MEM_32bits())
+ MEM_writeBE32(memPtr, (U32)val);
+ else
+ MEM_writeBE64(memPtr, (U64)val);
+}
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* MEM_H_MODULE */
+/**** ended inlining mem.h ****/
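
Editor's note: the helpers above give zstd alignment- and endianness-safe access to raw buffers. MEM_read*/MEM_write* hide the unaligned-access strategy selected by MEM_FORCE_MEMORY_ACCESS, and the LE/BE variants byte-swap only when the host byte order requires it. A short illustration, assuming the mem.h definitions above are in scope (the function is hypothetical):

/* Hypothetical illustration of the mem.h accessors defined above. */
static void
demo_mem_accessors(void)
{
	BYTE buf[8];

	/* Store 0x11223344 in little-endian order, whatever the host endianness. */
	MEM_writeLE32(buf, 0x11223344U);

	/* Reads back the same value on both little- and big-endian hosts. */
	U32 v = MEM_readLE32(buf);
	(void) v;

	/* An unaligned access at an odd offset is safe with any MEM_FORCE_MEMORY_ACCESS method. */
	MEM_writeLE32(buf + 1, 0xAABBCCDDU);
	U32 w = MEM_readLE32(buf + 1);
	(void) w;
}
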
+/**** start inlining error_private.h ****/
+/*
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+/* Note : this module is expected to remain private, do not expose it */
+
+#ifndef ERROR_H_MODULE
+#define ERROR_H_MODULE
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+
+/* ****************************************
+* Dependencies
+******************************************/
+#include <stddef.h> /* size_t */
+/**** start inlining zstd_errors.h ****/
+/*
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTD_ERRORS_H_398273423
+#define ZSTD_ERRORS_H_398273423
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/*===== dependency =====*/
+#include <stddef.h> /* size_t */
+
+
+/* ===== ZSTDERRORLIB_API : control library symbols visibility ===== */
+#ifndef ZSTDERRORLIB_VISIBILITY
+# if defined(__GNUC__) && (__GNUC__ >= 4)
+# define ZSTDERRORLIB_VISIBILITY __attribute__ ((visibility ("default")))
+# else
+# define ZSTDERRORLIB_VISIBILITY
+# endif
+#endif
+#if defined(ZSTD_DLL_EXPORT) && (ZSTD_DLL_EXPORT==1)
+# define ZSTDERRORLIB_API __declspec(dllexport) ZSTDERRORLIB_VISIBILITY
+#elif defined(ZSTD_DLL_IMPORT) && (ZSTD_DLL_IMPORT==1)
+# define ZSTDERRORLIB_API __declspec(dllimport) ZSTDERRORLIB_VISIBILITY /* It isn't required but allows generating better code, saving a function pointer load from the IAT and an indirect jump. */
+#else
+# define ZSTDERRORLIB_API ZSTDERRORLIB_VISIBILITY
+#endif
+
+/*-*********************************************
+ * Error codes list
+ *-*********************************************
+ * Error codes _values_ are pinned down since v1.3.1 only.
+ * Therefore, don't rely on values if you may link to any version < v1.3.1.
+ *
+ * Only values < 100 are considered stable.
+ *
+ * note 1 : this API shall be used with static linking only.
+ * dynamic linking is not yet officially supported.
+ * note 2 : Prefer relying on the enum rather than on its value whenever possible.
+ * This is the only supported way to use the error list < v1.3.1.
+ * note 3 : ZSTD_isError() is always correct, whatever the library version.
+ **********************************************/
+typedef enum {
+ ZSTD_error_no_error = 0,
+ ZSTD_error_GENERIC = 1,
+ ZSTD_error_prefix_unknown = 10,
+ ZSTD_error_version_unsupported = 12,
+ ZSTD_error_frameParameter_unsupported = 14,
+ ZSTD_error_frameParameter_windowTooLarge = 16,
+ ZSTD_error_corruption_detected = 20,
+ ZSTD_error_checksum_wrong = 22,
+ ZSTD_error_dictionary_corrupted = 30,
+ ZSTD_error_dictionary_wrong = 32,
+ ZSTD_error_dictionaryCreation_failed = 34,
+ ZSTD_error_parameter_unsupported = 40,
+ ZSTD_error_parameter_outOfBound = 42,
+ ZSTD_error_tableLog_tooLarge = 44,
+ ZSTD_error_maxSymbolValue_tooLarge = 46,
+ ZSTD_error_maxSymbolValue_tooSmall = 48,
+ ZSTD_error_stage_wrong = 60,
+ ZSTD_error_init_missing = 62,
+ ZSTD_error_memory_allocation = 64,
+ ZSTD_error_workSpace_tooSmall= 66,
+ ZSTD_error_dstSize_tooSmall = 70,
+ ZSTD_error_srcSize_wrong = 72,
+ ZSTD_error_dstBuffer_null = 74,
+ /* following error codes are __NOT STABLE__, they can be removed or changed in future versions */
+ ZSTD_error_frameIndex_tooLarge = 100,
+ ZSTD_error_seekableIO = 102,
+ ZSTD_error_dstBuffer_wrong = 104,
+ ZSTD_error_maxCode = 120 /* never EVER use this value directly, it can change in future versions! Use ZSTD_isError() instead */
+} ZSTD_ErrorCode;
+
+/*! ZSTD_getErrorCode() :
+ convert a `size_t` function result into a `ZSTD_ErrorCode` enum type,
+ which can be used to compare with enum list published above */
+ZSTDERRORLIB_API ZSTD_ErrorCode ZSTD_getErrorCode(size_t functionResult);
+ZSTDERRORLIB_API const char* ZSTD_getErrorString(ZSTD_ErrorCode code); /**< Same as ZSTD_getErrorName, but using a `ZSTD_ErrorCode` enum argument */
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* ZSTD_ERRORS_H_398273423 */
+/**** ended inlining zstd_errors.h ****/
+
+
+/* ****************************************
+* Compiler-specific
+******************************************/
+#if defined(__GNUC__)
+# define ERR_STATIC static __attribute__((unused))
+#elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)
+# define ERR_STATIC static inline
+#elif defined(_MSC_VER)
+# define ERR_STATIC static __inline
+#else
+# define ERR_STATIC static /* this version may generate warnings for unused static functions; disable the relevant warning */
+#endif
+
+
+/*-****************************************
+* Customization (error_public.h)
+******************************************/
+typedef ZSTD_ErrorCode ERR_enum;
+#define PREFIX(name) ZSTD_error_##name
+
+
+/*-****************************************
+* Error codes handling
+******************************************/
+#undef ERROR /* already defined on Visual Studio */
+#define ERROR(name) ZSTD_ERROR(name)
+#define ZSTD_ERROR(name) ((size_t)-PREFIX(name))
+
+ERR_STATIC unsigned ERR_isError(size_t code) { return (code > ERROR(maxCode)); }
+
+ERR_STATIC ERR_enum ERR_getErrorCode(size_t code) { if (!ERR_isError(code)) return (ERR_enum)0; return (ERR_enum) (0-code); }
+
+/* check and forward error code */
+#define CHECK_V_F(e, f) size_t const e = f; if (ERR_isError(e)) return e
+#define CHECK_F(f) { CHECK_V_F(_var_err__, f); }
+
+
+/*-****************************************
+* Error Strings
+******************************************/
+
+const char* ERR_getErrorString(ERR_enum code); /* error_private.c */
+
+ERR_STATIC const char* ERR_getErrorName(size_t code)
+{
+ return ERR_getErrorString(ERR_getErrorCode(code));
+}
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* ERROR_H_MODULE */
+/**** ended inlining error_private.h ****/
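
Editor's note: internally, zstd reports failures by returning a size_t whose value is the negated error code; that is what ERROR(), ERR_isError() and CHECK_V_F()/CHECK_F() above manipulate, and the public ZSTD_isError()/ZSTD_getErrorCode() wrappers expose the same encoding. A hedged sketch of the calling convention (the function names are hypothetical):

/* Hypothetical sketch of the error convention used throughout this file. */
static size_t
write_header(void *dst, size_t dstCapacity)
{
	(void) dst;
	if (dstCapacity < 4)
		return (ERROR(dstSize_tooSmall));   /* (size_t)-ZSTD_error_dstSize_tooSmall */
	/* ... write 4 header bytes ... */
	return (4);                                 /* success: number of bytes written */
}

static size_t
caller(void *dst, size_t dstCapacity)
{
	CHECK_V_F(hSize, write_header(dst, dstCapacity));   /* forwards the error code, if any */
	return (hSize);
}

/* Outside the library, a user would test the same result with ZSTD_isError()
 * and map it back to an enum via ZSTD_getErrorCode(). */
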
+#define FSE_STATIC_LINKING_ONLY /* FSE_MIN_TABLELOG */
+/**** start inlining fse.h ****/
+/* ******************************************************************
+ * FSE : Finite State Entropy codec
+ * Public Prototypes declaration
+ * Copyright (c) 2013-2020, Yann Collet, Facebook, Inc.
+ *
+ * You can contact the author at :
+ * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+****************************************************************** */
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+#ifndef FSE_H
+#define FSE_H
+
+
+/*-*****************************************
+* Dependencies
+******************************************/
+#include <stddef.h> /* size_t, ptrdiff_t */
+
+
+/*-*****************************************
+* FSE_PUBLIC_API : control library symbols visibility
+******************************************/
+#if defined(FSE_DLL_EXPORT) && (FSE_DLL_EXPORT==1) && defined(__GNUC__) && (__GNUC__ >= 4)
+# define FSE_PUBLIC_API __attribute__ ((visibility ("default")))
+#elif defined(FSE_DLL_EXPORT) && (FSE_DLL_EXPORT==1) /* Visual expected */
+# define FSE_PUBLIC_API __declspec(dllexport)
+#elif defined(FSE_DLL_IMPORT) && (FSE_DLL_IMPORT==1)
+# define FSE_PUBLIC_API __declspec(dllimport) /* It isn't required but allows generating better code, saving a function pointer load from the IAT and an indirect jump. */
+#else
+# define FSE_PUBLIC_API
+#endif
+
+/*------ Version ------*/
+#define FSE_VERSION_MAJOR 0
+#define FSE_VERSION_MINOR 9
+#define FSE_VERSION_RELEASE 0
+
+#define FSE_LIB_VERSION FSE_VERSION_MAJOR.FSE_VERSION_MINOR.FSE_VERSION_RELEASE
+#define FSE_QUOTE(str) #str
+#define FSE_EXPAND_AND_QUOTE(str) FSE_QUOTE(str)
+#define FSE_VERSION_STRING FSE_EXPAND_AND_QUOTE(FSE_LIB_VERSION)
+
+#define FSE_VERSION_NUMBER (FSE_VERSION_MAJOR *100*100 + FSE_VERSION_MINOR *100 + FSE_VERSION_RELEASE)
+FSE_PUBLIC_API unsigned FSE_versionNumber(void); /**< library version number; to be used when checking dll version */
+
+
+/*-****************************************
+* FSE simple functions
+******************************************/
+/*! FSE_compress() :
+ Compress content of buffer 'src', of size 'srcSize', into destination buffer 'dst'.
+ 'dst' buffer must be already allocated. Compression runs faster if dstCapacity >= FSE_compressBound(srcSize).
+ @return : size of compressed data (<= dstCapacity).
+ Special values : if return == 0, srcData is not compressible => Nothing is stored within dst !!!
+ if return == 1, srcData is a single byte symbol * srcSize times. Use RLE compression instead.
+ if FSE_isError(return), compression failed (more details using FSE_getErrorName())
+*/
+FSE_PUBLIC_API size_t FSE_compress(void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize);
+
+/*! FSE_decompress():
+ Decompress FSE data from buffer 'cSrc', of size 'cSrcSize',
+ into already allocated destination buffer 'dst', of size 'dstCapacity'.
+ @return : size of regenerated data (<= dstCapacity),
+ or an error code, which can be tested using FSE_isError().
+
+ ** Important ** : FSE_decompress() does not decompress non-compressible or RLE data !!!
+ Why ? : making this distinction requires a header.
+ Header management is intentionally delegated to the user layer, which can better manage special cases.
+*/
+FSE_PUBLIC_API size_t FSE_decompress(void* dst, size_t dstCapacity,
+ const void* cSrc, size_t cSrcSize);
+
+
+/*-*****************************************
+* Tool functions
+******************************************/
+FSE_PUBLIC_API size_t FSE_compressBound(size_t size); /* maximum compressed size */
+
+/* Error Management */
+FSE_PUBLIC_API unsigned FSE_isError(size_t code); /* tells if a return value is an error code */
+FSE_PUBLIC_API const char* FSE_getErrorName(size_t code); /* provides error code string (useful for debugging) */
+
+
+/*-*****************************************
+* FSE advanced functions
+******************************************/
+/*! FSE_compress2() :
+ Same as FSE_compress(), but allows the selection of 'maxSymbolValue' and 'tableLog'
+ Both parameters can be defined as '0' to mean : use default value
+ @return : size of compressed data
+ Special values : if return == 0, srcData is not compressible => Nothing is stored within dst !!!
+ if return == 1, srcData is a single byte symbol * srcSize times. Use RLE compression.
+ if FSE_isError(return), it's an error code.
+*/
+FSE_PUBLIC_API size_t FSE_compress2 (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog);
+
+
+/*-*****************************************
+* FSE detailed API
+******************************************/
+/*!
+FSE_compress() does the following:
+1. count symbol occurrences from source[] into table count[] (see hist.h)
+2. normalize counters so that sum(count[]) == Power_of_2 (2^tableLog)
+3. save normalized counters to memory buffer using writeNCount()
+4. build encoding table 'CTable' from normalized counters
+5. encode the data stream using encoding table 'CTable'
+
+FSE_decompress() does the following:
+1. read normalized counters with readNCount()
+2. build decoding table 'DTable' from normalized counters
+3. decode the data stream using decoding table 'DTable'
+
+The following API allows targeting specific sub-functions for advanced tasks.
+For example, it's possible to compress several blocks using the same 'CTable',
+or to save and provide normalized distribution using external method.
+*/
+
+/* *** COMPRESSION *** */
+
+/*! FSE_optimalTableLog():
+ dynamically downsize 'tableLog' when conditions are met.
+ It saves CPU time, by using smaller tables, while preserving or even improving compression ratio.
+ @return : recommended tableLog (necessarily <= 'maxTableLog') */
+FSE_PUBLIC_API unsigned FSE_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue);
+
+/*! FSE_normalizeCount():
+ normalize counts so that sum(count[]) == Power_of_2 (2^tableLog)
+ 'normalizedCounter' is a table of short, of minimum size (maxSymbolValue+1).
+ @return : tableLog,
+ or an errorCode, which can be tested using FSE_isError() */
+FSE_PUBLIC_API size_t FSE_normalizeCount(short* normalizedCounter, unsigned tableLog,
+ const unsigned* count, size_t srcSize, unsigned maxSymbolValue);
+
+/*! FSE_NCountWriteBound():
+ Provides the maximum possible size of an FSE normalized table, given 'maxSymbolValue' and 'tableLog'.
+ Typically useful for allocation purposes. */
+FSE_PUBLIC_API size_t FSE_NCountWriteBound(unsigned maxSymbolValue, unsigned tableLog);
+
+/*! FSE_writeNCount():
+ Compactly save 'normalizedCounter' into 'buffer'.
+ @return : size of the compressed table,
+ or an errorCode, which can be tested using FSE_isError(). */
+FSE_PUBLIC_API size_t FSE_writeNCount (void* buffer, size_t bufferSize,
+ const short* normalizedCounter,
+ unsigned maxSymbolValue, unsigned tableLog);
+
+/*! Constructor and Destructor of FSE_CTable.
+ Note that FSE_CTable size depends on 'tableLog' and 'maxSymbolValue' */
+typedef unsigned FSE_CTable; /* don't allocate that. It's only meant to be more restrictive than void* */
+FSE_PUBLIC_API FSE_CTable* FSE_createCTable (unsigned maxSymbolValue, unsigned tableLog);
+FSE_PUBLIC_API void FSE_freeCTable (FSE_CTable* ct);
+
+/*! FSE_buildCTable():
+ Builds `ct`, which must be already allocated, using FSE_createCTable().
+ @return : 0, or an errorCode, which can be tested using FSE_isError() */
+FSE_PUBLIC_API size_t FSE_buildCTable(FSE_CTable* ct, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog);
+
+/*! FSE_compress_usingCTable():
+ Compress `src` using `ct` into `dst` which must be already allocated.
+ @return : size of compressed data (<= `dstCapacity`),
+ or 0 if compressed data could not fit into `dst`,
+ or an errorCode, which can be tested using FSE_isError() */
+FSE_PUBLIC_API size_t FSE_compress_usingCTable (void* dst, size_t dstCapacity, const void* src, size_t srcSize, const FSE_CTable* ct);
+
+/*!
+Tutorial :
+----------
+The first step is to count all symbols. HIST_count(), declared in "hist.h", does this job very fast.
+Result will be saved into 'count', a table of unsigned int, which must be already allocated, and have 'maxSymbolValuePtr[0]+1' cells.
+'src' is a table of bytes of size 'srcSize'. All values within 'src' MUST be <= maxSymbolValuePtr[0].
+maxSymbolValuePtr[0] will be updated, with its real value (necessarily <= original value).
+HIST_count() will return the number of occurrences of the most frequent symbol.
+This can be used to know if there is a single symbol within 'src', and to quickly evaluate its compressibility.
+If there is an error, the function will return an ErrorCode (which can be tested using FSE_isError()).
+
+The next step is to normalize the frequencies.
+FSE_normalizeCount() will ensure that sum of frequencies is == 2 ^'tableLog'.
+It also guarantees a minimum of 1 to any Symbol with frequency >= 1.
+You can use 'tableLog'==0 to mean "use default tableLog value".
+If you are unsure of which tableLog value to use, you can ask FSE_optimalTableLog(),
+which will provide the optimal valid tableLog given sourceSize, maxSymbolValue, and a user-defined maximum (0 means "default").
+
+The result of FSE_normalizeCount() will be saved into a table,
+called 'normalizedCounter', which is a table of signed short.
+'normalizedCounter' must be already allocated, and have at least 'maxSymbolValue+1' cells.
+The return value is tableLog if everything proceeded as expected.
+It is 0 if there is a single symbol within the distribution.
+If there is an error (ex: invalid tableLog value), the function will return an ErrorCode (which can be tested using FSE_isError()).
+
+'normalizedCounter' can be saved in a compact manner to a memory area using FSE_writeNCount().
+'buffer' must be already allocated.
+For guaranteed success, buffer size must be at least FSE_NCountWriteBound().
+The result of the function is the number of bytes written into 'buffer'.
+If there is an error, the function will return an ErrorCode (which can be tested using FSE_isError(); ex : buffer size too small).
+
+'normalizedCounter' can then be used to create the compression table 'CTable'.
+The space required by 'CTable' must be already allocated, using FSE_createCTable().
+You can then use FSE_buildCTable() to fill 'CTable'.
+If there is an error, both functions will return an ErrorCode (which can be tested using FSE_isError()).
+
+'CTable' can then be used to compress 'src', with FSE_compress_usingCTable().
+Similar to HIST_count(), the convention is that 'src' is assumed to be a table of char of size 'srcSize'.
+The function returns the size of compressed data (without header), necessarily <= `dstCapacity`.
+If it returns '0', compressed data could not fit into 'dst'.
+If there is an error, the function will return an ErrorCode (which can be tested using FSE_isError()).
+*/
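+
+/* A sketch of the compression flow described in the tutorial above (illustration
+ * only). The histogram is computed with a plain loop rather than the helpers in
+ * hist.h, buffer sizes assume byte symbols, and error handling is reduced to
+ * early returns. */
+#if 0
+static size_t FSE_example_detailed_compress(void* dst, size_t dstCapacity,
+                                            const unsigned char* src, size_t srcSize)
+{
+    unsigned count[256] = { 0 };
+    short    norm[256];
+    unsigned maxSymbolValue = 255;
+    unsigned tableLog;
+    size_t   hSize, cSize, i;
+    FSE_CTable* ct;
+
+    if (srcSize < 2) return 0;                                /* too small : treat as not compressible */
+    for (i = 0; i < srcSize; i++) count[src[i]]++;            /* 1. count symbol occurrences */
+    while (maxSymbolValue && count[maxSymbolValue] == 0) maxSymbolValue--;
+
+    tableLog = FSE_optimalTableLog(0 /* default max */, srcSize, maxSymbolValue);
+    {   size_t const nc = FSE_normalizeCount(norm, tableLog, count, srcSize, maxSymbolValue);
+        if (FSE_isError(nc)) return nc;                       /* 2. normalize counters */
+        if (nc == 0) return 1;                                /* single-symbol input : RLE case, per the convention above */
+        tableLog = (unsigned)nc;
+    }
+
+    hSize = FSE_writeNCount(dst, dstCapacity, norm, maxSymbolValue, tableLog);
+    if (FSE_isError(hSize)) return hSize;                     /* 3. save normalized counters */
+
+    ct = FSE_createCTable(maxSymbolValue, tableLog);          /* 4. build encoding table */
+    if (ct == NULL) return (size_t)-1;                        /* allocation failure, reported as a generic failure in this sketch */
+    {   size_t const e = FSE_buildCTable(ct, norm, maxSymbolValue, tableLog);
+        if (FSE_isError(e)) { FSE_freeCTable(ct); return e; }
+    }
+
+    cSize = FSE_compress_usingCTable((char*)dst + hSize, dstCapacity - hSize,
+                                     src, srcSize, ct);       /* 5. encode the stream */
+    FSE_freeCTable(ct);
+    if (FSE_isError(cSize) || cSize == 0) return cSize;       /* error, or did not fit into dst */
+    return hSize + cSize;
+}
+#endif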
+
+
+/* *** DECOMPRESSION *** */
+
+/*! FSE_readNCount():
+ Read compactly saved 'normalizedCounter' from 'rBuffer'.
+ @return : size read from 'rBuffer',
+ or an errorCode, which can be tested using FSE_isError().
+ maxSymbolValuePtr[0] and tableLogPtr[0] will also be updated with their respective values */
+FSE_PUBLIC_API size_t FSE_readNCount (short* normalizedCounter,
+ unsigned* maxSymbolValuePtr, unsigned* tableLogPtr,
+ const void* rBuffer, size_t rBuffSize);
+
+/*! Constructor and Destructor of FSE_DTable.
+ Note that its size depends on 'tableLog' */
+typedef unsigned FSE_DTable; /* don't allocate that. It's just a way to be more restrictive than void* */
+FSE_PUBLIC_API FSE_DTable* FSE_createDTable(unsigned tableLog);
+FSE_PUBLIC_API void FSE_freeDTable(FSE_DTable* dt);
+
+/*! FSE_buildDTable():
+ Builds 'dt', which must be already allocated, using FSE_createDTable().
+ @return : 0, or an errorCode, which can be tested using FSE_isError() */
+FSE_PUBLIC_API size_t FSE_buildDTable (FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog);
+
+/*! FSE_decompress_usingDTable():
+ Decompress compressed source `cSrc` of size `cSrcSize` using `dt`
+ into `dst` which must be already allocated.
+ @return : size of regenerated data (necessarily <= `dstCapacity`),
+ or an errorCode, which can be tested using FSE_isError() */
+FSE_PUBLIC_API size_t FSE_decompress_usingDTable(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, const FSE_DTable* dt);
+
+/*!
+Tutorial :
+----------
+(Note : these functions only decompress FSE-compressed blocks.
+ If block is uncompressed, use memcpy() instead
+ If block is a single repeated byte, use memset() instead )
+
+The first step is to obtain the normalized frequencies of symbols.
+This can be performed by FSE_readNCount() if it was saved using FSE_writeNCount().
+'normalizedCounter' must be already allocated, and have at least 'maxSymbolValuePtr[0]+1' cells of signed short.
+In practice, that means it's necessary to know 'maxSymbolValue' beforehand,
+or size the table to handle worst case situations (typically 256).
+FSE_readNCount() will provide 'tableLog' and 'maxSymbolValue'.
+The result of FSE_readNCount() is the number of bytes read from 'rBuffer'.
+Note that 'rBuffSize' must be at least 4 bytes, even if useful information is less than that.
+If there is an error, the function will return an error code, which can be tested using FSE_isError().
+
+The next step is to build the decompression tables 'FSE_DTable' from 'normalizedCounter'.
+This is performed by the function FSE_buildDTable().
+The space required by 'FSE_DTable' must be already allocated using FSE_createDTable().
+If there is an error, the function will return an error code, which can be tested using FSE_isError().
+
+`FSE_DTable` can then be used to decompress `cSrc`, with FSE_decompress_usingDTable().
+`cSrcSize` must be strictly correct, otherwise decompression will fail.
+FSE_decompress_usingDTable() result will tell how many bytes were regenerated (<=`dstCapacity`).
+If there is an error, the function will return an error code, which can be tested using FSE_isError(). (ex: dst buffer too small)
+*/
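+
+/* A sketch of the decompression flow described in the tutorial above (illustration
+ * only). The worst-case 256-symbol counter table is assumed, and 'originalSize'
+ * must be known by the caller since FSE itself stores no header. */
+#if 0
+static size_t FSE_example_detailed_decompress(void* dst, size_t originalSize,
+                                              const void* cSrc, size_t cSrcSize)
+{
+    short    norm[256];
+    unsigned maxSymbolValue = 255;
+    unsigned tableLog;
+    size_t   hSize, dSize;
+    FSE_DTable* dt;
+
+    hSize = FSE_readNCount(norm, &maxSymbolValue, &tableLog, cSrc, cSrcSize);
+    if (FSE_isError(hSize)) return hSize;                     /* 1. read normalized counters */
+
+    dt = FSE_createDTable(tableLog);                          /* 2. build decoding table */
+    if (dt == NULL) return (size_t)-1;                        /* allocation failure */
+    {   size_t const e = FSE_buildDTable(dt, norm, maxSymbolValue, tableLog);
+        if (FSE_isError(e)) { FSE_freeDTable(dt); return e; }
+    }
+
+    dSize = FSE_decompress_usingDTable(dst, originalSize,
+                                       (const char*)cSrc + hSize,
+                                       cSrcSize - hSize, dt); /* 3. decode the stream */
+    FSE_freeDTable(dt);
+    return dSize;
+}
+#endif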
+
+#endif /* FSE_H */
+
+#if defined(FSE_STATIC_LINKING_ONLY) && !defined(FSE_H_FSE_STATIC_LINKING_ONLY)
+#define FSE_H_FSE_STATIC_LINKING_ONLY
+
+/* *** Dependency *** */
+/**** start inlining bitstream.h ****/
+/* ******************************************************************
+ * bitstream
+ * Part of FSE library
+ * Copyright (c) 2013-2020, Yann Collet, Facebook, Inc.
+ *
+ * You can contact the author at :
+ * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+****************************************************************** */
+#ifndef BITSTREAM_H_MODULE
+#define BITSTREAM_H_MODULE
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/*
+* This API consists of small unitary functions, which must be inlined for best performance.
+* Since link-time-optimization is not available for all compilers,
+* these functions are defined into a .h to be included.
+*/
+
+/*-****************************************
+* Dependencies
+******************************************/
+/**** skipping file: mem.h ****/
+/**** start inlining compiler.h ****/
+/*
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTD_COMPILER_H
+#define ZSTD_COMPILER_H
+
+/*-*******************************************************
+* Compiler specifics
+*********************************************************/
+/* force inlining */
+
+#if !defined(ZSTD_NO_INLINE)
+#if (defined(__GNUC__) && !defined(__STRICT_ANSI__)) || defined(__cplusplus) || defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* C99 */
+# define INLINE_KEYWORD inline
+#else
+# define INLINE_KEYWORD
+#endif
+
+#if defined(__GNUC__) || defined(__ICCARM__)
+# define FORCE_INLINE_ATTR __attribute__((always_inline))
+#elif defined(_MSC_VER)
+# define FORCE_INLINE_ATTR __forceinline
+#else
+# define FORCE_INLINE_ATTR
+#endif
+
+#else
+
+#define INLINE_KEYWORD
+#define FORCE_INLINE_ATTR
+
+#endif
+
+/**
+ * FORCE_INLINE_TEMPLATE is used to define C "templates", which take constant
+ * parameters. They must be inlined for the compiler to eliminate the constant
+ * branches.
+ */
+#define FORCE_INLINE_TEMPLATE static INLINE_KEYWORD FORCE_INLINE_ATTR
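+
+/* A minimal sketch of the "template" idiom described above (illustration only,
+ * names are arbitrary) : the constant 'useFast' parameter disappears once the
+ * body is force-inlined into each thin wrapper. */
+#if 0
+FORCE_INLINE_TEMPLATE int example_sum_body(const int* data, size_t n, int useFast)
+{
+    int acc = 0;
+    size_t i;
+    for (i = 0; i < n; i++) {
+        if (useFast) acc += data[i];        /* constant branch : eliminated after inlining */
+        else         acc += data[i] & 0xFF;
+    }
+    return acc;
+}
+static int example_sum_fast(const int* data, size_t n) { return example_sum_body(data, n, 1); }
+static int example_sum_safe(const int* data, size_t n) { return example_sum_body(data, n, 0); }
+#endif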
+/**
+ * HINT_INLINE is used to help the compiler generate better code. It is *not*
+ * used for "templates", so it can be tweaked based on the compiler's
+ * performance.
+ *
+ * gcc-4.8 and gcc-4.9 have been shown to benefit from leaving off the
+ * always_inline attribute.
+ *
+ * clang up to 5.0.0 (trunk) benefits tremendously from the always_inline
+ * attribute.
+ */
+#if !defined(__clang__) && defined(__GNUC__) && __GNUC__ >= 4 && __GNUC_MINOR__ >= 8 && __GNUC__ < 5
+# define HINT_INLINE static INLINE_KEYWORD
+#else
+# define HINT_INLINE static INLINE_KEYWORD FORCE_INLINE_ATTR
+#endif
+
+/* UNUSED_ATTR tells the compiler it is okay if the function is unused. */
+#if defined(__GNUC__)
+# define UNUSED_ATTR __attribute__((unused))
+#else
+# define UNUSED_ATTR
+#endif
+
+/* force no inlining */
+#ifdef _MSC_VER
+# define FORCE_NOINLINE static __declspec(noinline)
+#else
+# if defined(__GNUC__) || defined(__ICCARM__)
+# define FORCE_NOINLINE static __attribute__((__noinline__))
+# else
+# define FORCE_NOINLINE static
+# endif
+#endif
+
+/* target attribute */
+#ifndef __has_attribute
+ #define __has_attribute(x) 0 /* Compatibility with non-clang compilers. */
+#endif
+#if defined(__GNUC__) || defined(__ICCARM__)
+# define TARGET_ATTRIBUTE(target) __attribute__((__target__(target)))
+#else
+# define TARGET_ATTRIBUTE(target)
+#endif
+
+/* Enable runtime BMI2 dispatch based on the CPU.
+ * Enabled for clang & gcc >=4.8 on x86 when BMI2 isn't enabled by default.
+ */
+#ifndef DYNAMIC_BMI2
+ #if ((defined(__clang__) && __has_attribute(__target__)) \
+ || (defined(__GNUC__) \
+ && (__GNUC__ >= 5 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)))) \
+ && (defined(__x86_64__) || defined(_M_X86)) \
+ && !defined(__BMI2__)
+ # define DYNAMIC_BMI2 1
+ #else
+ # define DYNAMIC_BMI2 0
+ #endif
+#endif
+
+/* prefetch
+ * can be disabled, by declaring NO_PREFETCH build macro */
+#if defined(NO_PREFETCH)
+# define PREFETCH_L1(ptr) (void)(ptr) /* disabled */
+# define PREFETCH_L2(ptr) (void)(ptr) /* disabled */
+#else
+# if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_I86)) /* _mm_prefetch() is not defined outside of x86/x64 */
+# include <mmintrin.h> /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */
+# define PREFETCH_L1(ptr) _mm_prefetch((const char*)(ptr), _MM_HINT_T0)
+# define PREFETCH_L2(ptr) _mm_prefetch((const char*)(ptr), _MM_HINT_T1)
+# elif defined(__aarch64__)
+# define PREFETCH_L1(ptr) __asm__ __volatile__("prfm pldl1keep, %0" ::"Q"(*(ptr)))
+# define PREFETCH_L2(ptr) __asm__ __volatile__("prfm pldl2keep, %0" ::"Q"(*(ptr)))
+# elif defined(__GNUC__) && ( (__GNUC__ >= 4) || ( (__GNUC__ == 3) && (__GNUC_MINOR__ >= 1) ) )
+# define PREFETCH_L1(ptr) __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */)
+# define PREFETCH_L2(ptr) __builtin_prefetch((ptr), 0 /* rw==read */, 2 /* locality */)
+# else
+# define PREFETCH_L1(ptr) (void)(ptr) /* disabled */
+# define PREFETCH_L2(ptr) (void)(ptr) /* disabled */
+# endif
+#endif /* NO_PREFETCH */
+
+#define CACHELINE_SIZE 64
+
+#define PREFETCH_AREA(p, s) { \
+ const char* const _ptr = (const char*)(p); \
+ size_t const _size = (size_t)(s); \
+ size_t _pos; \
+ for (_pos=0; _pos<_size; _pos+=CACHELINE_SIZE) { \
+ PREFETCH_L2(_ptr + _pos); \
+ } \
+}
+
+/* vectorization
+ * older GCC (pre gcc-4.3 picked as the cutoff) uses a different syntax */
+#if !defined(__INTEL_COMPILER) && !defined(__clang__) && defined(__GNUC__)
+# if (__GNUC__ == 4 && __GNUC_MINOR__ > 3) || (__GNUC__ >= 5)
+# define DONT_VECTORIZE __attribute__((optimize("no-tree-vectorize")))
+# else
+# define DONT_VECTORIZE _Pragma("GCC optimize(\"no-tree-vectorize\")")
+# endif
+#else
+# define DONT_VECTORIZE
+#endif
+
+/* Tell the compiler that a branch is likely or unlikely.
+ * Only use these macros if it causes the compiler to generate better code.
+ * If you can remove a LIKELY/UNLIKELY annotation without speed changes in gcc
+ * and clang, please do.
+ */
+#if defined(__GNUC__)
+#define LIKELY(x) (__builtin_expect((x), 1))
+#define UNLIKELY(x) (__builtin_expect((x), 0))
+#else
+#define LIKELY(x) (x)
+#define UNLIKELY(x) (x)
+#endif
+
+/* disable warnings */
+#ifdef _MSC_VER /* Visual Studio */
+# include <intrin.h> /* For Visual 2005 */
+# pragma warning(disable : 4100) /* disable: C4100: unreferenced formal parameter */
+# pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */
+# pragma warning(disable : 4204) /* disable: C4204: non-constant aggregate initializer */
+# pragma warning(disable : 4214) /* disable: C4214: non-int bitfields */
+# pragma warning(disable : 4324) /* disable: C4324: padded structure */
+#endif
+
+#endif /* ZSTD_COMPILER_H */
+/**** ended inlining compiler.h ****/
+/**** skipping file: debug.h ****/
+/**** skipping file: error_private.h ****/
+
+
+/*=========================================
+* Target specific
+=========================================*/
+#if defined(__BMI__) && defined(__GNUC__)
+# include <immintrin.h> /* support for bextr (experimental) */
+#elif defined(__ICCARM__)
+# include <intrinsics.h>
+#endif
+
+#define STREAM_ACCUMULATOR_MIN_32 25
+#define STREAM_ACCUMULATOR_MIN_64 57
+#define STREAM_ACCUMULATOR_MIN ((U32)(MEM_32bits() ? STREAM_ACCUMULATOR_MIN_32 : STREAM_ACCUMULATOR_MIN_64))
+
+
+/*-******************************************
+* bitStream encoding API (write forward)
+********************************************/
+/* bitStream can mix input from multiple sources.
+ * A critical property of these streams is that they encode and decode in **reverse** direction.
+ * So the first bit sequence you add will be the last to be read, like a LIFO stack.
+ */
+typedef struct {
+ size_t bitContainer;
+ unsigned bitPos;
+ char* startPtr;
+ char* ptr;
+ char* endPtr;
+} BIT_CStream_t;
+
+MEM_STATIC size_t BIT_initCStream(BIT_CStream_t* bitC, void* dstBuffer, size_t dstCapacity);
+MEM_STATIC void BIT_addBits(BIT_CStream_t* bitC, size_t value, unsigned nbBits);
+MEM_STATIC void BIT_flushBits(BIT_CStream_t* bitC);
+MEM_STATIC size_t BIT_closeCStream(BIT_CStream_t* bitC);
+
+/* Start with initCStream, providing the size of buffer to write into.
+* bitStream will never write outside of this buffer.
+* `dstCapacity` must be > sizeof(bitC->bitContainer), otherwise @return will be an error code.
+*
+* bits are first added to a local register.
+* Local register is size_t, hence 64-bits on 64-bits systems, or 32-bits on 32-bits systems.
+* Writing data into memory is an explicit operation, performed by the flushBits function.
+* Hence keep track of how many bits are potentially stored into the local register, to avoid register overflow.
+* After a flushBits, a maximum of 7 bits might still be stored into local register.
+*
+* Avoid storing elements of more than 24 bits if you want compatibility with 32-bits bitstream readers.
+*
+* Last operation is to close the bitStream.
+* The function returns the final size of CStream in bytes.
+* If data couldn't fit into `dstBuffer`, it will return a 0 ( == not storable)
+*/
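+
+/* A small write-path sketch following the description above (illustration only,
+ * field values and widths are arbitrary) : init, add a few bit fields, flush,
+ * then close and return the byte size. */
+#if 0
+MEM_STATIC size_t BIT_example_write(void* dstBuffer, size_t dstCapacity)
+{
+    BIT_CStream_t bitC;
+    size_t const initError = BIT_initCStream(&bitC, dstBuffer, dstCapacity);
+    if (ERR_isError(initError)) return initError;   /* dstCapacity too small */
+    BIT_addBits(&bitC, 5, 3);        /* value 5, stored on 3 bits */
+    BIT_addBits(&bitC, 17, 6);       /* value 17, stored on 6 bits */
+    BIT_flushBits(&bitC);            /* commit the local register to memory */
+    BIT_addBits(&bitC, 1, 1);        /* one more 1-bit field */
+    BIT_flushBits(&bitC);
+    return BIT_closeCStream(&bitC);  /* final size in bytes, or 0 if it did not fit */
+}
+#endif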
+
+
+/*-********************************************
+* bitStream decoding API (read backward)
+**********************************************/
+typedef struct {
+ size_t bitContainer;
+ unsigned bitsConsumed;
+ const char* ptr;
+ const char* start;
+ const char* limitPtr;
+} BIT_DStream_t;
+
+typedef enum { BIT_DStream_unfinished = 0,
+ BIT_DStream_endOfBuffer = 1,
+ BIT_DStream_completed = 2,
+ BIT_DStream_overflow = 3 } BIT_DStream_status; /* result of BIT_reloadDStream() */
+ /* 1,2,4,8 would be better for bitmap combinations, but slows down performance a bit ... :( */
+
+MEM_STATIC size_t BIT_initDStream(BIT_DStream_t* bitD, const void* srcBuffer, size_t srcSize);
+MEM_STATIC size_t BIT_readBits(BIT_DStream_t* bitD, unsigned nbBits);
+MEM_STATIC BIT_DStream_status BIT_reloadDStream(BIT_DStream_t* bitD);
+MEM_STATIC unsigned BIT_endOfDStream(const BIT_DStream_t* bitD);
+
+
+/* Start by invoking BIT_initDStream().
+* A chunk of the bitStream is then stored into a local register.
+* Local register size is 64-bits on 64-bits systems, 32-bits on 32-bits systems (size_t).
+* You can then retrieve bitFields stored into the local register, **in reverse order**.
+* Local register is explicitly reloaded from memory by the BIT_reloadDStream() method.
+* A reload guarantees a minimum of ((8*sizeof(bitD->bitContainer))-7) bits when its result is BIT_DStream_unfinished.
+* Otherwise, it can be less than that, so proceed accordingly.
+* Checking if DStream has reached its end can be performed with BIT_endOfDStream().
+*/
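+
+/* The matching read-path sketch (illustration only) : fields are retrieved in
+ * the reverse of the order they were added by the write sketch above. */
+#if 0
+MEM_STATIC unsigned BIT_example_read(const void* srcBuffer, size_t srcSize)
+{
+    BIT_DStream_t bitD;
+    size_t last1, mid6, first3;
+    size_t const initError = BIT_initDStream(&bitD, srcBuffer, srcSize);
+    if (ERR_isError(initError)) return 0;  /* init failed */
+    last1  = BIT_readBits(&bitD, 1);   /* the last field written is read first */
+    mid6   = BIT_readBits(&bitD, 6);
+    first3 = BIT_readBits(&bitD, 3);
+    (void)last1; (void)mid6; (void)first3;
+    BIT_reloadDStream(&bitD);          /* refill the local register / query stream status */
+    return BIT_endOfDStream(&bitD);    /* 1 if every bit has been consumed */
+}
+#endif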
+
+
+/*-****************************************
+* unsafe API
+******************************************/
+MEM_STATIC void BIT_addBitsFast(BIT_CStream_t* bitC, size_t value, unsigned nbBits);
+/* faster, but works only if value is "clean", meaning all high bits above nbBits are 0 */
+
+MEM_STATIC void BIT_flushBitsFast(BIT_CStream_t* bitC);
+/* unsafe version; does not check buffer overflow */
+
+MEM_STATIC size_t BIT_readBitsFast(BIT_DStream_t* bitD, unsigned nbBits);
+/* faster, but works only if nbBits >= 1 */
+
+
+
+/*-**************************************************************
+* Internal functions
+****************************************************************/
+MEM_STATIC unsigned BIT_highbit32 (U32 val)
+{
+ assert(val != 0);
+ {
+# if defined(_MSC_VER) /* Visual */
+ unsigned long r=0;
+ return _BitScanReverse ( &r, val ) ? (unsigned)r : 0;
+# elif defined(__GNUC__) && (__GNUC__ >= 3) /* Use GCC Intrinsic */
+ return __builtin_clz (val) ^ 31;
+# elif defined(__ICCARM__) /* IAR Intrinsic */
+ return 31 - __CLZ(val);
+# else /* Software version */
+ static const unsigned DeBruijnClz[32] = { 0, 9, 1, 10, 13, 21, 2, 29,
+ 11, 14, 16, 18, 22, 25, 3, 30,
+ 8, 12, 20, 28, 15, 17, 24, 7,
+ 19, 27, 23, 6, 26, 5, 4, 31 };
+ U32 v = val;
+ v |= v >> 1;
+ v |= v >> 2;
+ v |= v >> 4;
+ v |= v >> 8;
+ v |= v >> 16;
+ return DeBruijnClz[ (U32) (v * 0x07C4ACDDU) >> 27];
+# endif
+ }
+}
+
+/*===== Local Constants =====*/
+static const unsigned BIT_mask[] = {
+ 0, 1, 3, 7, 0xF, 0x1F,
+ 0x3F, 0x7F, 0xFF, 0x1FF, 0x3FF, 0x7FF,
+ 0xFFF, 0x1FFF, 0x3FFF, 0x7FFF, 0xFFFF, 0x1FFFF,
+ 0x3FFFF, 0x7FFFF, 0xFFFFF, 0x1FFFFF, 0x3FFFFF, 0x7FFFFF,
+ 0xFFFFFF, 0x1FFFFFF, 0x3FFFFFF, 0x7FFFFFF, 0xFFFFFFF, 0x1FFFFFFF,
+ 0x3FFFFFFF, 0x7FFFFFFF}; /* up to 31 bits */
+#define BIT_MASK_SIZE (sizeof(BIT_mask) / sizeof(BIT_mask[0]))
+
+/*-**************************************************************
+* bitStream encoding
+****************************************************************/
+/*! BIT_initCStream() :
+ * `dstCapacity` must be > sizeof(size_t)
+ * @return : 0 if success,
+ * otherwise an error code (can be tested using ERR_isError()) */
+MEM_STATIC size_t BIT_initCStream(BIT_CStream_t* bitC,
+ void* startPtr, size_t dstCapacity)
+{
+ bitC->bitContainer = 0;
+ bitC->bitPos = 0;
+ bitC->startPtr = (char*)startPtr;
+ bitC->ptr = bitC->startPtr;
+ bitC->endPtr = bitC->startPtr + dstCapacity - sizeof(bitC->bitContainer);
+ if (dstCapacity <= sizeof(bitC->bitContainer)) return ERROR(dstSize_tooSmall);
+ return 0;
+}
+
+/*! BIT_addBits() :
+ * can add up to 31 bits into `bitC`.
+ * Note : does not check for register overflow ! */
+MEM_STATIC void BIT_addBits(BIT_CStream_t* bitC,
+ size_t value, unsigned nbBits)
+{
+ MEM_STATIC_ASSERT(BIT_MASK_SIZE == 32);
+ assert(nbBits < BIT_MASK_SIZE);
+ assert(nbBits + bitC->bitPos < sizeof(bitC->bitContainer) * 8);
+ bitC->bitContainer |= (value & BIT_mask[nbBits]) << bitC->bitPos;
+ bitC->bitPos += nbBits;
+}
+
+/*! BIT_addBitsFast() :
+ * works only if `value` is _clean_,
+ * meaning all high bits above nbBits are 0 */
+MEM_STATIC void BIT_addBitsFast(BIT_CStream_t* bitC,
+ size_t value, unsigned nbBits)
+{
+ assert((value>>nbBits) == 0);
+ assert(nbBits + bitC->bitPos < sizeof(bitC->bitContainer) * 8);
+ bitC->bitContainer |= value << bitC->bitPos;
+ bitC->bitPos += nbBits;
+}
+
+/*! BIT_flushBitsFast() :
+ * assumption : bitContainer has not overflowed
+ * unsafe version; does not check buffer overflow */
+MEM_STATIC void BIT_flushBitsFast(BIT_CStream_t* bitC)
+{
+ size_t const nbBytes = bitC->bitPos >> 3;
+ assert(bitC->bitPos < sizeof(bitC->bitContainer) * 8);
+ assert(bitC->ptr <= bitC->endPtr);
+ MEM_writeLEST(bitC->ptr, bitC->bitContainer);
+ bitC->ptr += nbBytes;
+ bitC->bitPos &= 7;
+ bitC->bitContainer >>= nbBytes*8;
+}
+
+/*! BIT_flushBits() :
+ * assumption : bitContainer has not overflowed
+ * safe version; checks for buffer overflow, and prevents it.
+ * note : does not signal buffer overflow.
+ * overflow will be revealed later on using BIT_closeCStream() */
+MEM_STATIC void BIT_flushBits(BIT_CStream_t* bitC)
+{
+ size_t const nbBytes = bitC->bitPos >> 3;
+ assert(bitC->bitPos < sizeof(bitC->bitContainer) * 8);
+ assert(bitC->ptr <= bitC->endPtr);
+ MEM_writeLEST(bitC->ptr, bitC->bitContainer);
+ bitC->ptr += nbBytes;
+ if (bitC->ptr > bitC->endPtr) bitC->ptr = bitC->endPtr;
+ bitC->bitPos &= 7;
+ bitC->bitContainer >>= nbBytes*8;
+}
+
+/*! BIT_closeCStream() :
+ * @return : size of CStream, in bytes,
+ * or 0 if it could not fit into dstBuffer */
+MEM_STATIC size_t BIT_closeCStream(BIT_CStream_t* bitC)
+{
+ BIT_addBitsFast(bitC, 1, 1); /* endMark */
+ BIT_flushBits(bitC);
+ if (bitC->ptr >= bitC->endPtr) return 0; /* overflow detected */
+ return (bitC->ptr - bitC->startPtr) + (bitC->bitPos > 0);
+}
+
+
+/*-********************************************************
+* bitStream decoding
+**********************************************************/
+/*! BIT_initDStream() :
+ * Initialize a BIT_DStream_t.
+ * `bitD` : a pointer to an already allocated BIT_DStream_t structure.
+ * `srcSize` must be the *exact* size of the bitStream, in bytes.
+ * @return : size of stream (== srcSize), or an errorCode if a problem is detected
+ */
+MEM_STATIC size_t BIT_initDStream(BIT_DStream_t* bitD, const void* srcBuffer, size_t srcSize)
+{
+ if (srcSize < 1) { memset(bitD, 0, sizeof(*bitD)); return ERROR(srcSize_wrong); }
+
+ bitD->start = (const char*)srcBuffer;
+ bitD->limitPtr = bitD->start + sizeof(bitD->bitContainer);
+
+ if (srcSize >= sizeof(bitD->bitContainer)) { /* normal case */
+ bitD->ptr = (const char*)srcBuffer + srcSize - sizeof(bitD->bitContainer);
+ bitD->bitContainer = MEM_readLEST(bitD->ptr);
+ { BYTE const lastByte = ((const BYTE*)srcBuffer)[srcSize-1];
+ bitD->bitsConsumed = lastByte ? 8 - BIT_highbit32(lastByte) : 0; /* ensures bitsConsumed is always set */
+ if (lastByte == 0) return ERROR(GENERIC); /* endMark not present */ }
+ } else {
+ bitD->ptr = bitD->start;
+ bitD->bitContainer = *(const BYTE*)(bitD->start);
+ switch(srcSize)
+ {
+ case 7: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[6]) << (sizeof(bitD->bitContainer)*8 - 16);
+ /* fall-through */
+
+ case 6: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[5]) << (sizeof(bitD->bitContainer)*8 - 24);
+ /* fall-through */
+
+ case 5: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[4]) << (sizeof(bitD->bitContainer)*8 - 32);
+ /* fall-through */
+
+ case 4: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[3]) << 24;
+ /* fall-through */
+
+ case 3: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[2]) << 16;
+ /* fall-through */
+
+ case 2: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[1]) << 8;
+ /* fall-through */
+
+ default: break;
+ }
+ { BYTE const lastByte = ((const BYTE*)srcBuffer)[srcSize-1];
+ bitD->bitsConsumed = lastByte ? 8 - BIT_highbit32(lastByte) : 0;
+ if (lastByte == 0) return ERROR(corruption_detected); /* endMark not present */
+ }
+ bitD->bitsConsumed += (U32)(sizeof(bitD->bitContainer) - srcSize)*8;
+ }
+
+ return srcSize;
+}
+
+MEM_STATIC size_t BIT_getUpperBits(size_t bitContainer, U32 const start)
+{
+ return bitContainer >> start;
+}
+
+MEM_STATIC size_t BIT_getMiddleBits(size_t bitContainer, U32 const start, U32 const nbBits)
+{
+ U32 const regMask = sizeof(bitContainer)*8 - 1;
+ /* if start > regMask, bitstream is corrupted, and result is undefined */
+ assert(nbBits < BIT_MASK_SIZE);
+ return (bitContainer >> (start & regMask)) & BIT_mask[nbBits];
+}
+
+MEM_STATIC size_t BIT_getLowerBits(size_t bitContainer, U32 const nbBits)
+{
+ assert(nbBits < BIT_MASK_SIZE);
+ return bitContainer & BIT_mask[nbBits];
+}
+
+/*! BIT_lookBits() :
+ * Provides next n bits from local register.
+ * local register is not modified.
+ * On 32-bits, maxNbBits==24.
+ * On 64-bits, maxNbBits==56.
+ * @return : value extracted */
+MEM_STATIC size_t BIT_lookBits(const BIT_DStream_t* bitD, U32 nbBits)
+{
+ /* arbitrate between double-shift and shift+mask */
+#if 1
+ /* if bitD->bitsConsumed + nbBits > sizeof(bitD->bitContainer)*8,
+ * bitstream is likely corrupted, and result is undefined */
+ return BIT_getMiddleBits(bitD->bitContainer, (sizeof(bitD->bitContainer)*8) - bitD->bitsConsumed - nbBits, nbBits);
+#else
+ /* this code path is slower on my os-x laptop */
+ U32 const regMask = sizeof(bitD->bitContainer)*8 - 1;
+ return ((bitD->bitContainer << (bitD->bitsConsumed & regMask)) >> 1) >> ((regMask-nbBits) & regMask);
+#endif
+}
+
+/*! BIT_lookBitsFast() :
+ * unsafe version; only works if nbBits >= 1 */
+MEM_STATIC size_t BIT_lookBitsFast(const BIT_DStream_t* bitD, U32 nbBits)
+{
+ U32 const regMask = sizeof(bitD->bitContainer)*8 - 1;
+ assert(nbBits >= 1);
+ return (bitD->bitContainer << (bitD->bitsConsumed & regMask)) >> (((regMask+1)-nbBits) & regMask);
+}
+
+MEM_STATIC void BIT_skipBits(BIT_DStream_t* bitD, U32 nbBits)
+{
+ bitD->bitsConsumed += nbBits;
+}
+
+/*! BIT_readBits() :
+ * Read (consume) next n bits from local register and update.
+ * Pay attention not to read more bits than the local register currently holds.
+ * @return : extracted value. */
+MEM_STATIC size_t BIT_readBits(BIT_DStream_t* bitD, unsigned nbBits)
+{
+ size_t const value = BIT_lookBits(bitD, nbBits);
+ BIT_skipBits(bitD, nbBits);
+ return value;
+}
+
+/*! BIT_readBitsFast() :
+ * unsafe version; only works if nbBits >= 1 */
+MEM_STATIC size_t BIT_readBitsFast(BIT_DStream_t* bitD, unsigned nbBits)
+{
+ size_t const value = BIT_lookBitsFast(bitD, nbBits);
+ assert(nbBits >= 1);
+ BIT_skipBits(bitD, nbBits);
+ return value;
+}
+
+/*! BIT_reloadDStreamFast() :
+ * Similar to BIT_reloadDStream(), but with two differences:
+ * 1. bitsConsumed <= sizeof(bitD->bitContainer)*8 must hold!
+ * 2. Returns BIT_DStream_overflow when bitD->ptr < bitD->limitPtr, at this
+ * point you must use BIT_reloadDStream() to reload.
+ */
+MEM_STATIC BIT_DStream_status BIT_reloadDStreamFast(BIT_DStream_t* bitD)
+{
+ if (UNLIKELY(bitD->ptr < bitD->limitPtr))
+ return BIT_DStream_overflow;
+ assert(bitD->bitsConsumed <= sizeof(bitD->bitContainer)*8);
+ bitD->ptr -= bitD->bitsConsumed >> 3;
+ bitD->bitsConsumed &= 7;
+ bitD->bitContainer = MEM_readLEST(bitD->ptr);
+ return BIT_DStream_unfinished;
+}
+
+/*! BIT_reloadDStream() :
+ * Refill `bitD` from the buffer previously set in BIT_initDStream().
+ * This function is safe : it guarantees it will not read beyond the src buffer.
+ * @return : status of `BIT_DStream_t` internal register.
+ * when status == BIT_DStream_unfinished, internal register is filled with at least 25 or 57 bits */
+MEM_STATIC BIT_DStream_status BIT_reloadDStream(BIT_DStream_t* bitD)
+{
+ if (bitD->bitsConsumed > (sizeof(bitD->bitContainer)*8)) /* overflow detected, like end of stream */
+ return BIT_DStream_overflow;
+
+ if (bitD->ptr >= bitD->limitPtr) {
+ return BIT_reloadDStreamFast(bitD);
+ }
+ if (bitD->ptr == bitD->start) {
+ if (bitD->bitsConsumed < sizeof(bitD->bitContainer)*8) return BIT_DStream_endOfBuffer;
+ return BIT_DStream_completed;
+ }
+ /* start < ptr < limitPtr */
+ { U32 nbBytes = bitD->bitsConsumed >> 3;
+ BIT_DStream_status result = BIT_DStream_unfinished;
+ if (bitD->ptr - nbBytes < bitD->start) {
+ nbBytes = (U32)(bitD->ptr - bitD->start); /* ptr > start */
+ result = BIT_DStream_endOfBuffer;
+ }
+ bitD->ptr -= nbBytes;
+ bitD->bitsConsumed -= nbBytes*8;
+ bitD->bitContainer = MEM_readLEST(bitD->ptr); /* reminder : srcSize > sizeof(bitD->bitContainer), otherwise bitD->ptr == bitD->start */
+ return result;
+ }
+}
+
+/*! BIT_endOfDStream() :
+ * @return : 1 if DStream has _exactly_ reached its end (all bits consumed).
+ */
+MEM_STATIC unsigned BIT_endOfDStream(const BIT_DStream_t* DStream)
+{
+ return ((DStream->ptr == DStream->start) && (DStream->bitsConsumed == sizeof(DStream->bitContainer)*8));
+}
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* BITSTREAM_H_MODULE */
+/**** ended inlining bitstream.h ****/
+
+
+/* *****************************************
+* Static allocation
+*******************************************/
+/* FSE buffer bounds */
+#define FSE_NCOUNTBOUND 512
+#define FSE_BLOCKBOUND(size) (size + (size>>7) + 4 /* fse states */ + sizeof(size_t) /* bitContainer */)
+#define FSE_COMPRESSBOUND(size) (FSE_NCOUNTBOUND + FSE_BLOCKBOUND(size)) /* Macro version, useful for static allocation */
+
+/* It is possible to statically allocate an FSE CTable/DTable as a table of FSE_CTable/FSE_DTable, using the macros below */
+#define FSE_CTABLE_SIZE_U32(maxTableLog, maxSymbolValue) (1 + (1<<(maxTableLog-1)) + ((maxSymbolValue+1)*2))
+#define FSE_DTABLE_SIZE_U32(maxTableLog) (1 + (1<<maxTableLog))
+
+/* or use the size to malloc() space directly. Pay attention to alignment restrictions though */
+#define FSE_CTABLE_SIZE(maxTableLog, maxSymbolValue) (FSE_CTABLE_SIZE_U32(maxTableLog, maxSymbolValue) * sizeof(FSE_CTable))
+#define FSE_DTABLE_SIZE(maxTableLog) (FSE_DTABLE_SIZE_U32(maxTableLog) * sizeof(FSE_DTable))
+
+
+/* *****************************************
+ * FSE advanced API
+ ***************************************** */
+
+unsigned FSE_optimalTableLog_internal(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue, unsigned minus);
+/**< same as FSE_optimalTableLog(), which uses `minus==2` */
+
+/* FSE_compress_wksp() :
+ * Same as FSE_compress2(), but using an externally allocated scratch buffer (`workSpace`).
+ * FSE_WKSP_SIZE_U32() provides the minimum size required for `workSpace` as a table of FSE_CTable.
+ */
+#define FSE_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) ( FSE_CTABLE_SIZE_U32(maxTableLog, maxSymbolValue) + ((maxTableLog > 12) ? (1 << (maxTableLog - 2)) : 1024) )
+size_t FSE_compress_wksp (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize);
+
+size_t FSE_buildCTable_raw (FSE_CTable* ct, unsigned nbBits);
+/**< build a fake FSE_CTable, designed for a flat distribution, where each symbol uses nbBits */
+
+size_t FSE_buildCTable_rle (FSE_CTable* ct, unsigned char symbolValue);
+/**< build a fake FSE_CTable, designed to compress always the same symbolValue */
+
+/* FSE_buildCTable_wksp() :
+ * Same as FSE_buildCTable(), but using an externally allocated scratch buffer (`workSpace`).
+ * `wkspSize` must be >= `(1<<tableLog)`.
+ */
+size_t FSE_buildCTable_wksp(FSE_CTable* ct, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize);
+
+size_t FSE_buildDTable_raw (FSE_DTable* dt, unsigned nbBits);
+/**< build a fake FSE_DTable, designed to read a flat distribution where each symbol uses nbBits */
+
+size_t FSE_buildDTable_rle (FSE_DTable* dt, unsigned char symbolValue);
+/**< build a fake FSE_DTable, designed to always generate the same symbolValue */
+
+size_t FSE_decompress_wksp(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, FSE_DTable* workSpace, unsigned maxLog);
+/**< same as FSE_decompress(), using an externally allocated `workSpace` produced with `FSE_DTABLE_SIZE_U32(maxLog)` */
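+
+/* A sketch of heap-free decompression (illustration only) : the workspace is a
+ * stack table sized with FSE_DTABLE_SIZE_U32(), for an accepted tableLog of up
+ * to 12 in this example. */
+#if 0
+static size_t FSE_example_decompress_noalloc(void* dst, size_t dstCapacity,
+                                             const void* cSrc, size_t cSrcSize)
+{
+    FSE_DTable wksp[FSE_DTABLE_SIZE_U32(12)];
+    return FSE_decompress_wksp(dst, dstCapacity, cSrc, cSrcSize, wksp, 12 /* maxLog */);
+}
+#endif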
+
+typedef enum {
+ FSE_repeat_none, /**< Cannot use the previous table */
+ FSE_repeat_check, /**< Can use the previous table but it must be checked */
+ FSE_repeat_valid /**< Can use the previous table and it is assumed to be valid */
+ } FSE_repeat;
+
+/* *****************************************
+* FSE symbol compression API
+*******************************************/
+/*!
+ This API consists of small unitary functions, which highly benefit from being inlined.
+ Hence their bodies are included in the next section.
+*/
+typedef struct {
+ ptrdiff_t value;
+ const void* stateTable;
+ const void* symbolTT;
+ unsigned stateLog;
+} FSE_CState_t;
+
+static void FSE_initCState(FSE_CState_t* CStatePtr, const FSE_CTable* ct);
+
+static void FSE_encodeSymbol(BIT_CStream_t* bitC, FSE_CState_t* CStatePtr, unsigned symbol);
+
+static void FSE_flushCState(BIT_CStream_t* bitC, const FSE_CState_t* CStatePtr);
+
+/**<
+These functions are inner components of FSE_compress_usingCTable().
+They allow the creation of custom streams, mixing multiple tables and bit sources.
+
+A key property to keep in mind is that encoding and decoding are done **in reverse direction**.
+So the first symbol you will encode is the last you will decode, like a LIFO stack.
+
+You will need a few variables to track your CStream. They are :
+
+FSE_CTable ct; // Provided by FSE_buildCTable()
+BIT_CStream_t bitStream; // bitStream tracking structure
+FSE_CState_t state; // State tracking structure (can have several)
+
+
+The first thing to do is to init bitStream and state.
+ size_t errorCode = BIT_initCStream(&bitStream, dstBuffer, maxDstSize);
+ FSE_initCState(&state, ct);
+
+Note that BIT_initCStream() can produce an error code, so its result should be tested, using FSE_isError();
+You can then encode your input data, byte after byte.
+FSE_encodeSymbol() outputs a maximum of 'tableLog' bits at a time.
+Remember decoding will be done in reverse direction.
+    FSE_encodeSymbol(&bitStream, &state, symbol);
+
+At any time, you can also add any bit sequence.
+Note : maximum allowed nbBits is 25, for compatibility with 32-bits decoders
+ BIT_addBits(&bitStream, bitField, nbBits);
+
+The above methods don't commit data to memory, they just store it into local register, for speed.
+Local register size is 64-bits on 64-bits systems, 32-bits on 32-bits systems (size_t).
+Writing data to memory is a manual operation, performed by the flushBits function.
+ BIT_flushBits(&bitStream);
+
+Your last FSE encoding operation shall be to flush your last state value(s).
+    FSE_flushCState(&bitStream, &state);
+
+Finally, you must close the bitStream.
+The function returns the size of CStream in bytes.
+If data couldn't fit into dstBuffer, it will return a 0 ( == not compressible)
+If there is an error, it returns an errorCode (which can be tested using FSE_isError()).
+ size_t size = BIT_closeCStream(&bitStream);
+*/
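+
+/* A sketch of the encoding flow above (illustration only) : a single state,
+ * symbols fed from last to first so that the matching decoder regenerates them
+ * in forward order, and a flush after every symbol for simplicity.
+ * 'ct' is assumed to come from FSE_buildCTable(). */
+#if 0
+static size_t FSE_example_encode_symbols(void* dstBuffer, size_t dstCapacity,
+                                         const unsigned char* src, size_t srcSize,
+                                         const FSE_CTable* ct)
+{
+    BIT_CStream_t bitStream;
+    FSE_CState_t  state;
+    size_t i;
+    size_t const initError = BIT_initCStream(&bitStream, dstBuffer, dstCapacity);
+    if (FSE_isError(initError)) return initError;
+    FSE_initCState(&state, ct);
+    for (i = srcSize; i > 0; i--) {
+        FSE_encodeSymbol(&bitStream, &state, src[i-1]);
+        BIT_flushBits(&bitStream);        /* real code flushes only every few symbols */
+    }
+    FSE_flushCState(&bitStream, &state);  /* last : flush the final state value */
+    return BIT_closeCStream(&bitStream);  /* size in bytes, or 0 if it did not fit */
+}
+#endif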
+
+
+/* *****************************************
+* FSE symbol decompression API
+*******************************************/
+typedef struct {
+ size_t state;
+ const void* table; /* precise table may vary, depending on U16 */
+} FSE_DState_t;
+
+
+static void FSE_initDState(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD, const FSE_DTable* dt);
+
+static unsigned char FSE_decodeSymbol(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD);
+
+static unsigned FSE_endOfDState(const FSE_DState_t* DStatePtr);
+
+/**<
+Let's now decompose FSE_decompress_usingDTable() into its unitary components.
+You will decode FSE-encoded symbols from the bitStream,
+and also any other bitFields you put in, **in reverse order**.
+
+You will need a few variables to track your bitStream. They are :
+
+BIT_DStream_t DStream; // Stream context
+FSE_DState_t DState; // State context. Multiple ones are possible
+FSE_DTable* DTablePtr; // Decoding table, provided by FSE_buildDTable()
+
+The first thing to do is to init the bitStream.
+ errorCode = BIT_initDStream(&DStream, srcBuffer, srcSize);
+
+You should then retrieve your initial state(s)
+(in reverse flushing order if you have several ones) :
+ errorCode = FSE_initDState(&DState, &DStream, DTablePtr);
+
+You can then decode your data, symbol after symbol.
+For information, the maximum number of bits read by FSE_decodeSymbol() is 'tableLog'.
+Keep in mind that symbols are decoded in reverse order, like a LIFO stack (last in, first out).
+ unsigned char symbol = FSE_decodeSymbol(&DState, &DStream);
+
+You can retrieve any bitfield you may have stored into the bitStream (in reverse order).
+Note : maximum allowed nbBits is 25, for 32-bits compatibility
+ size_t bitField = BIT_readBits(&DStream, nbBits);
+
+All above operations only read from local register (which size depends on size_t).
+Refueling the register from memory is manually performed by the reload method.
+    endSignal = BIT_reloadDStream(&DStream);
+
+BIT_reloadDStream() result tells if there is still some more data to read from DStream.
+BIT_DStream_unfinished : there is still some data left in the DStream.
+BIT_DStream_endOfBuffer : DStream reached the end of its buffer. Its container may no longer be completely filled.
+BIT_DStream_completed : DStream reached its exact end, corresponding in general to decompression completed.
+BIT_DStream_overflow : DStream went too far. Decompression result is corrupted.
+
+When reaching end of buffer (BIT_DStream_endOfBuffer), progress slowly, notably if you decode multiple symbols per loop,
+to properly detect the exact end of stream.
+After each decoded symbol, check if DStream is fully consumed using this simple test :
+ BIT_reloadDStream(&DStream) >= BIT_DStream_completed
+
+When it's done, verify decompression is fully completed, by checking both DStream and the relevant states.
+Checking if DStream has reached its end is performed by :
+ BIT_endOfDStream(&DStream);
+Check also the states. There might be some symbols left there, if some high probability ones (>50%) are possible.
+ FSE_endOfDState(&DState);
+*/
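+
+/* The matching decoding sketch (illustration only) : one state, 'dstSize'
+ * symbols regenerated, register reloaded after each symbol for simplicity,
+ * then the end-of-stream check described above.
+ * 'dt' is assumed to come from FSE_buildDTable(). */
+#if 0
+static size_t FSE_example_decode_symbols(unsigned char* dst, size_t dstSize,
+                                         const void* cSrc, size_t cSrcSize,
+                                         const FSE_DTable* dt)
+{
+    BIT_DStream_t DStream;
+    FSE_DState_t  DState;
+    size_t i;
+    size_t const initError = BIT_initDStream(&DStream, cSrc, cSrcSize);
+    if (FSE_isError(initError)) return initError;
+    FSE_initDState(&DState, &DStream, dt);
+    for (i = 0; i < dstSize; i++) {
+        dst[i] = FSE_decodeSymbol(&DState, &DStream);
+        BIT_reloadDStream(&DStream);
+    }
+    /* FSE_endOfDState(&DState) can be checked as well, as noted in the tutorial above */
+    if (!BIT_endOfDStream(&DStream)) return (size_t)-1;   /* stream not fully consumed */
+    return dstSize;
+}
+#endif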
+
+
+/* *****************************************
+* FSE unsafe API
+*******************************************/
+static unsigned char FSE_decodeSymbolFast(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD);
+/* faster, but works only if nbBits is always >= 1 (otherwise, result will be corrupted) */
+
+
+/* *****************************************
+* Implementation of inlined functions
+*******************************************/
+typedef struct {
+ int deltaFindState;
+ U32 deltaNbBits;
+} FSE_symbolCompressionTransform; /* total 8 bytes */
+
+MEM_STATIC void FSE_initCState(FSE_CState_t* statePtr, const FSE_CTable* ct)
+{
+ const void* ptr = ct;
+ const U16* u16ptr = (const U16*) ptr;
+ const U32 tableLog = MEM_read16(ptr);
+ statePtr->value = (ptrdiff_t)1<<tableLog;
+ statePtr->stateTable = u16ptr+2;
+ statePtr->symbolTT = ct + 1 + (tableLog ? (1<<(tableLog-1)) : 1);
+ statePtr->stateLog = tableLog;
+}
+
+
+/*! FSE_initCState2() :
+* Same as FSE_initCState(), but the first symbol to include (which will be the last to be read)
+* uses the smallest state value possible, saving the cost of this symbol */
+MEM_STATIC void FSE_initCState2(FSE_CState_t* statePtr, const FSE_CTable* ct, U32 symbol)
+{
+ FSE_initCState(statePtr, ct);
+ { const FSE_symbolCompressionTransform symbolTT = ((const FSE_symbolCompressionTransform*)(statePtr->symbolTT))[symbol];
+ const U16* stateTable = (const U16*)(statePtr->stateTable);
+ U32 nbBitsOut = (U32)((symbolTT.deltaNbBits + (1<<15)) >> 16);
+ statePtr->value = (nbBitsOut << 16) - symbolTT.deltaNbBits;
+ statePtr->value = stateTable[(statePtr->value >> nbBitsOut) + symbolTT.deltaFindState];
+ }
+}
+
+MEM_STATIC void FSE_encodeSymbol(BIT_CStream_t* bitC, FSE_CState_t* statePtr, unsigned symbol)
+{
+ FSE_symbolCompressionTransform const symbolTT = ((const FSE_symbolCompressionTransform*)(statePtr->symbolTT))[symbol];
+ const U16* const stateTable = (const U16*)(statePtr->stateTable);
+ U32 const nbBitsOut = (U32)((statePtr->value + symbolTT.deltaNbBits) >> 16);
+ BIT_addBits(bitC, statePtr->value, nbBitsOut);
+ statePtr->value = stateTable[ (statePtr->value >> nbBitsOut) + symbolTT.deltaFindState];
+}
+
+MEM_STATIC void FSE_flushCState(BIT_CStream_t* bitC, const FSE_CState_t* statePtr)
+{
+ BIT_addBits(bitC, statePtr->value, statePtr->stateLog);
+ BIT_flushBits(bitC);
+}
+
+
+/* FSE_getMaxNbBits() :
+ * Approximate maximum cost of a symbol, in bits.
+ * Fractional values get rounded up (i.e. a symbol with a normalized frequency of 3 gives the same result as a frequency of 2)
+ * note 1 : assume symbolValue is valid (<= maxSymbolValue)
+ * note 2 : if freq[symbolValue]==0, @return a fake cost of tableLog+1 bits */
+MEM_STATIC U32 FSE_getMaxNbBits(const void* symbolTTPtr, U32 symbolValue)
+{
+ const FSE_symbolCompressionTransform* symbolTT = (const FSE_symbolCompressionTransform*) symbolTTPtr;
+ return (symbolTT[symbolValue].deltaNbBits + ((1<<16)-1)) >> 16;
+}
+
+/* FSE_bitCost() :
+ * Approximate symbol cost, as fractional value, using fixed-point format (accuracyLog fractional bits)
+ * note 1 : assume symbolValue is valid (<= maxSymbolValue)
+ * note 2 : if freq[symbolValue]==0, @return a fake cost of tableLog+1 bits */
+MEM_STATIC U32 FSE_bitCost(const void* symbolTTPtr, U32 tableLog, U32 symbolValue, U32 accuracyLog)
+{
+ const FSE_symbolCompressionTransform* symbolTT = (const FSE_symbolCompressionTransform*) symbolTTPtr;
+ U32 const minNbBits = symbolTT[symbolValue].deltaNbBits >> 16;
+ U32 const threshold = (minNbBits+1) << 16;
+ assert(tableLog < 16);
+ assert(accuracyLog < 31-tableLog); /* ensure enough room for renormalization double shift */
+ { U32 const tableSize = 1 << tableLog;
+ U32 const deltaFromThreshold = threshold - (symbolTT[symbolValue].deltaNbBits + tableSize);
+ U32 const normalizedDeltaFromThreshold = (deltaFromThreshold << accuracyLog) >> tableLog; /* linear interpolation (very approximate) */
+ U32 const bitMultiplier = 1 << accuracyLog;
+ assert(symbolTT[symbolValue].deltaNbBits + tableSize <= threshold);
+ assert(normalizedDeltaFromThreshold <= bitMultiplier);
+ return (minNbBits+1)*bitMultiplier - normalizedDeltaFromThreshold;
+ }
+}
+
+
+/* ====== Decompression ====== */
+
+typedef struct {
+ U16 tableLog;
+ U16 fastMode;
+} FSE_DTableHeader; /* sizeof U32 */
+
+typedef struct
+{
+ unsigned short newState;
+ unsigned char symbol;
+ unsigned char nbBits;
+} FSE_decode_t; /* size == U32 */
+
+MEM_STATIC void FSE_initDState(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD, const FSE_DTable* dt)
+{
+ const void* ptr = dt;
+ const FSE_DTableHeader* const DTableH = (const FSE_DTableHeader*)ptr;
+ DStatePtr->state = BIT_readBits(bitD, DTableH->tableLog);
+ BIT_reloadDStream(bitD);
+ DStatePtr->table = dt + 1;
+}
+
+MEM_STATIC BYTE FSE_peekSymbol(const FSE_DState_t* DStatePtr)
+{
+ FSE_decode_t const DInfo = ((const FSE_decode_t*)(DStatePtr->table))[DStatePtr->state];
+ return DInfo.symbol;
+}
+
+MEM_STATIC void FSE_updateState(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD)
+{
+ FSE_decode_t const DInfo = ((const FSE_decode_t*)(DStatePtr->table))[DStatePtr->state];
+ U32 const nbBits = DInfo.nbBits;
+ size_t const lowBits = BIT_readBits(bitD, nbBits);
+ DStatePtr->state = DInfo.newState + lowBits;
+}
+
+MEM_STATIC BYTE FSE_decodeSymbol(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD)
+{
+ FSE_decode_t const DInfo = ((const FSE_decode_t*)(DStatePtr->table))[DStatePtr->state];
+ U32 const nbBits = DInfo.nbBits;
+ BYTE const symbol = DInfo.symbol;
+ size_t const lowBits = BIT_readBits(bitD, nbBits);
+
+ DStatePtr->state = DInfo.newState + lowBits;
+ return symbol;
+}
+
+/*! FSE_decodeSymbolFast() :
+ unsafe, only works if no symbol has a probability > 50% */
+MEM_STATIC BYTE FSE_decodeSymbolFast(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD)
+{
+ FSE_decode_t const DInfo = ((const FSE_decode_t*)(DStatePtr->table))[DStatePtr->state];
+ U32 const nbBits = DInfo.nbBits;
+ BYTE const symbol = DInfo.symbol;
+ size_t const lowBits = BIT_readBitsFast(bitD, nbBits);
+
+ DStatePtr->state = DInfo.newState + lowBits;
+ return symbol;
+}
+
+MEM_STATIC unsigned FSE_endOfDState(const FSE_DState_t* DStatePtr)
+{
+ return DStatePtr->state == 0;
+}
+
+
+
+#ifndef FSE_COMMONDEFS_ONLY
+
+/* **************************************************************
+* Tuning parameters
+****************************************************************/
+/*!MEMORY_USAGE :
+* Memory usage formula : N->2^N Bytes (examples : 10 -> 1KB; 12 -> 4KB ; 16 -> 64KB; 20 -> 1MB; etc.)
+* Increasing memory usage improves compression ratio
+* Reduced memory usage can improve speed, due to cache effect
+* Recommended max value is 14, for 16KB, which nicely fits into Intel x86 L1 cache */
+#ifndef FSE_MAX_MEMORY_USAGE
+# define FSE_MAX_MEMORY_USAGE 14
+#endif
+#ifndef FSE_DEFAULT_MEMORY_USAGE
+# define FSE_DEFAULT_MEMORY_USAGE 13
+#endif
+
+/*!FSE_MAX_SYMBOL_VALUE :
+* Maximum symbol value authorized.
+* Required for proper stack allocation */
+#ifndef FSE_MAX_SYMBOL_VALUE
+# define FSE_MAX_SYMBOL_VALUE 255
+#endif
+
+/* **************************************************************
+* template functions type & suffix
+****************************************************************/
+#define FSE_FUNCTION_TYPE BYTE
+#define FSE_FUNCTION_EXTENSION
+#define FSE_DECODE_TYPE FSE_decode_t
+
+
+#endif /* !FSE_COMMONDEFS_ONLY */
+
+
+/* ***************************************************************
+* Constants
+*****************************************************************/
+#define FSE_MAX_TABLELOG (FSE_MAX_MEMORY_USAGE-2)
+#define FSE_MAX_TABLESIZE (1U<<FSE_MAX_TABLELOG)
+#define FSE_MAXTABLESIZE_MASK (FSE_MAX_TABLESIZE-1)
+#define FSE_DEFAULT_TABLELOG (FSE_DEFAULT_MEMORY_USAGE-2)
+#define FSE_MIN_TABLELOG 5
+
+#define FSE_TABLELOG_ABSOLUTE_MAX 15
+#if FSE_MAX_TABLELOG > FSE_TABLELOG_ABSOLUTE_MAX
+# error "FSE_MAX_TABLELOG > FSE_TABLELOG_ABSOLUTE_MAX is not supported"
+#endif
+
+#define FSE_TABLESTEP(tableSize) ((tableSize>>1) + (tableSize>>3) + 3)
+
+
+#endif /* FSE_STATIC_LINKING_ONLY */
+
+
+#if defined (__cplusplus)
+}
+#endif
+/**** ended inlining fse.h ****/
+#define HUF_STATIC_LINKING_ONLY /* HUF_TABLELOG_ABSOLUTEMAX */
+/**** start inlining huf.h ****/
+/* ******************************************************************
+ * huff0 huffman codec,
+ * part of Finite State Entropy library
+ * Copyright (c) 2013-2020, Yann Collet, Facebook, Inc.
+ *
+ * You can contact the author at :
+ * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+****************************************************************** */
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+#ifndef HUF_H_298734234
+#define HUF_H_298734234
+
+/* *** Dependencies *** */
+#include <stddef.h> /* size_t */
+
+
+/* *** library symbols visibility *** */
+/* Note : when linking with -fvisibility=hidden on gcc, or by default on Visual,
+ * HUF symbols remain "private" (internal symbols for library only).
+ * Set macro FSE_DLL_EXPORT to 1 if you want HUF symbols visible on DLL interface */
+#if defined(FSE_DLL_EXPORT) && (FSE_DLL_EXPORT==1) && defined(__GNUC__) && (__GNUC__ >= 4)
+# define HUF_PUBLIC_API __attribute__ ((visibility ("default")))
+#elif defined(FSE_DLL_EXPORT) && (FSE_DLL_EXPORT==1) /* Visual expected */
+# define HUF_PUBLIC_API __declspec(dllexport)
+#elif defined(FSE_DLL_IMPORT) && (FSE_DLL_IMPORT==1)
+# define HUF_PUBLIC_API __declspec(dllimport) /* not required, just to generate faster code (saves a function pointer load from IAT and an indirect jump) */
+#else
+# define HUF_PUBLIC_API
+#endif
+
+
+/* ========================== */
+/* *** simple functions *** */
+/* ========================== */
+
+/** HUF_compress() :
+ * Compress content from buffer 'src', of size 'srcSize', into buffer 'dst'.
+ * 'dst' buffer must be already allocated.
+ * Compression runs faster if `dstCapacity` >= HUF_compressBound(srcSize).
+ * `srcSize` must be <= `HUF_BLOCKSIZE_MAX` == 128 KB.
+ * @return : size of compressed data (<= `dstCapacity`).
+ * Special values : if return == 0, srcData is not compressible => Nothing is stored within dst !!!
+ * if HUF_isError(return), compression failed (more details using HUF_getErrorName())
+ */
+HUF_PUBLIC_API size_t HUF_compress(void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize);
+
+/** HUF_decompress() :
+ * Decompress HUF data from buffer 'cSrc', of size 'cSrcSize',
+ * into already allocated buffer 'dst', of minimum size 'originalSize'.
+ * `originalSize` : **must** be the ***exact*** size of original (uncompressed) data.
+ * Note : in contrast with FSE, HUF_decompress can regenerate
+ * RLE (cSrcSize==1) and uncompressed (cSrcSize==dstSize) data,
+ * because it knows size to regenerate (originalSize).
+ * @return : size of regenerated data (== originalSize),
+ * or an error code, which can be tested using HUF_isError()
+ */
+HUF_PUBLIC_API size_t HUF_decompress(void* dst, size_t originalSize,
+ const void* cSrc, size_t cSrcSize);
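+
+/* A minimal HUF round-trip sketch (illustration only; the fixed buffer sizes are
+ * arbitrary and srcSize is assumed <= 1024). A 0 return from HUF_compress()
+ * still has to be handled by storing the input raw, but an RLE block
+ * (cSize == 1) can be regenerated by HUF_decompress(), as noted above. */
+#if 0
+static int HUF_example_roundtrip(const unsigned char* src, size_t srcSize)
+{
+    unsigned char compressed[2048];     /* comfortably above HUF_compressBound(1024) */
+    unsigned char regenerated[1024];
+    size_t const cSize = HUF_compress(compressed, sizeof(compressed), src, srcSize);
+    if (HUF_isError(cSize)) return -1;
+    if (cSize == 0) return 1;           /* not compressible : caller stores the input raw */
+    {   size_t const rSize = HUF_decompress(regenerated, srcSize, compressed, cSize);
+        if (HUF_isError(rSize) || rSize != srcSize) return -1;
+    }
+    return 0;
+}
+#endif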
+
+
+/* *** Tool functions *** */
+#define HUF_BLOCKSIZE_MAX (128 * 1024) /**< maximum input size for a single block compressed with HUF_compress */
+HUF_PUBLIC_API size_t HUF_compressBound(size_t size); /**< maximum compressed size (worst case) */
+
+/* Error Management */
+HUF_PUBLIC_API unsigned HUF_isError(size_t code); /**< tells if a return value is an error code */
+HUF_PUBLIC_API const char* HUF_getErrorName(size_t code); /**< provides error code string (useful for debugging) */
+
+
+/* *** Advanced function *** */
+
+/** HUF_compress2() :
+ * Same as HUF_compress(), but offers control over `maxSymbolValue` and `tableLog`.
+ * `maxSymbolValue` must be <= HUF_SYMBOLVALUE_MAX .
+ * `tableLog` must be `<= HUF_TABLELOG_MAX` . */
+HUF_PUBLIC_API size_t HUF_compress2 (void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize,
+ unsigned maxSymbolValue, unsigned tableLog);
+
+/** HUF_compress4X_wksp() :
+ * Same as HUF_compress2(), but uses externally allocated `workSpace`.
+ * `workSpace` must have a minimum alignment of 4, and be at least as large as HUF_WORKSPACE_SIZE */
+#define HUF_WORKSPACE_SIZE ((6 << 10) + 256)
+#define HUF_WORKSPACE_SIZE_U32 (HUF_WORKSPACE_SIZE / sizeof(U32))
+HUF_PUBLIC_API size_t HUF_compress4X_wksp (void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize,
+ unsigned maxSymbolValue, unsigned tableLog,
+ void* workSpace, size_t wkspSize);
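+
+/* A sketch of workspace-based compression (illustration only) : the scratch
+ * space is a stack table of HUF_WORKSPACE_SIZE_U32 words of U32, which also
+ * satisfies the alignment requirement. The literal parameters 255 and 11
+ * correspond to HUF_SYMBOLVALUE_MAX and HUF_TABLELOG_DEFAULT from the static
+ * section below; srcSize is assumed <= HUF_BLOCKSIZE_MAX. */
+#if 0
+static size_t HUF_example_compress_wksp(void* dst, size_t dstCapacity,
+                                        const void* src, size_t srcSize)
+{
+    U32 workSpace[HUF_WORKSPACE_SIZE_U32];
+    return HUF_compress4X_wksp(dst, dstCapacity, src, srcSize,
+                               255 /* maxSymbolValue */, 11 /* tableLog */,
+                               workSpace, sizeof(workSpace));
+}
+#endif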
+
+#endif /* HUF_H_298734234 */
+
+/* ******************************************************************
+ * WARNING !!
+ * The following section contains advanced and experimental definitions
+ * which shall never be used in the context of a dynamic library,
+ * because they are not guaranteed to remain stable in the future.
+ * Only consider them in association with static linking.
+ * *****************************************************************/
+#if defined(HUF_STATIC_LINKING_ONLY) && !defined(HUF_H_HUF_STATIC_LINKING_ONLY)
+#define HUF_H_HUF_STATIC_LINKING_ONLY
+
+/* *** Dependencies *** */
+/**** skipping file: mem.h ****/
+
+
+/* *** Constants *** */
+#define HUF_TABLELOG_MAX      12      /* max runtime value of tableLog (due to static allocation); can be modified up to HUF_TABLELOG_ABSOLUTEMAX */
+#define HUF_TABLELOG_DEFAULT 11 /* default tableLog value when none specified */
+#define HUF_SYMBOLVALUE_MAX 255
+
+#define HUF_TABLELOG_ABSOLUTEMAX  15   /* absolute limit of HUF_TABLELOG_MAX. Beyond that value, code does not work */
+#if (HUF_TABLELOG_MAX > HUF_TABLELOG_ABSOLUTEMAX)
+# error "HUF_TABLELOG_MAX is too large !"
+#endif
+
+
+/* ****************************************
+* Static allocation
+******************************************/
+/* HUF buffer bounds */
+#define HUF_CTABLEBOUND 129
+#define HUF_BLOCKBOUND(size) (size + (size>>8) + 8) /* only true when incompressible is pre-filtered with fast heuristic */
+#define HUF_COMPRESSBOUND(size) (HUF_CTABLEBOUND + HUF_BLOCKBOUND(size)) /* Macro version, useful for static allocation */
+
+/* static allocation of HUF's Compression Table */
+#define HUF_CTABLE_SIZE_U32(maxSymbolValue) ((maxSymbolValue)+1) /* Use tables of U32, for proper alignment */
+#define HUF_CTABLE_SIZE(maxSymbolValue) (HUF_CTABLE_SIZE_U32(maxSymbolValue) * sizeof(U32))
+#define HUF_CREATE_STATIC_CTABLE(name, maxSymbolValue) \
+ U32 name##hb[HUF_CTABLE_SIZE_U32(maxSymbolValue)]; \
+ void* name##hv = &(name##hb); \
+ HUF_CElt* name = (HUF_CElt*)(name##hv) /* no final ; */
+
+/* static allocation of HUF's DTable */
+typedef U32 HUF_DTable;
+#define HUF_DTABLE_SIZE(maxTableLog) (1 + (1<<(maxTableLog)))
+#define HUF_CREATE_STATIC_DTABLEX1(DTable, maxTableLog) \
+ HUF_DTable DTable[HUF_DTABLE_SIZE((maxTableLog)-1)] = { ((U32)((maxTableLog)-1) * 0x01000001) }
+#define HUF_CREATE_STATIC_DTABLEX2(DTable, maxTableLog) \
+ HUF_DTable DTable[HUF_DTABLE_SIZE(maxTableLog)] = { ((U32)(maxTableLog) * 0x01000001) }
+
+
+/* ****************************************
+* Advanced decompression functions
+******************************************/
+size_t HUF_decompress4X1 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /**< single-symbol decoder */
+#ifndef HUF_FORCE_DECOMPRESS_X1
+size_t HUF_decompress4X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /**< double-symbols decoder */
+#endif
+
+size_t HUF_decompress4X_DCtx (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /**< decodes RLE and uncompressed */
+size_t HUF_decompress4X_hufOnly(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /**< considers RLE and uncompressed as errors */
+size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /**< considers RLE and uncompressed as errors */
+size_t HUF_decompress4X1_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /**< single-symbol decoder */
+size_t HUF_decompress4X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /**< single-symbol decoder */
+#ifndef HUF_FORCE_DECOMPRESS_X1
+size_t HUF_decompress4X2_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /**< double-symbols decoder */
+size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /**< double-symbols decoder */
+#endif
+
+
+/* ****************************************
+ * HUF detailed API
+ * ****************************************/
+
+/*! HUF_compress() does the following:
+ * 1. count symbol occurrence from source[] into table count[] using FSE_count() (exposed within "fse.h")
+ * 2. (optional) refine tableLog using HUF_optimalTableLog()
+ * 3. build Huffman table from count using HUF_buildCTable()
+ * 4. save Huffman table to memory buffer using HUF_writeCTable()
+ * 5. encode the data stream using HUF_compress4X_usingCTable()
+ *
+ * The following API allows targeting specific sub-functions for advanced tasks.
+ * For example, it's possible to compress several blocks using the same 'CTable',
+ * or to save and regenerate 'CTable' using external methods.
+ */
+unsigned HUF_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue);
+typedef struct HUF_CElt_s HUF_CElt; /* incomplete type */
+size_t HUF_buildCTable (HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue, unsigned maxNbBits); /* @return : maxNbBits; CTable and count can overlap. In which case, CTable will overwrite count content */
+size_t HUF_writeCTable (void* dst, size_t maxDstSize, const HUF_CElt* CTable, unsigned maxSymbolValue, unsigned huffLog);
+size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable);
+size_t HUF_estimateCompressedSize(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue);
+int HUF_validateCTable(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue);
+
+typedef enum {
+ HUF_repeat_none, /**< Cannot use the previous table */
+ HUF_repeat_check, /**< Can use the previous table but it must be checked. Note : The previous table must have been constructed by HUF_compress{1, 4}X_repeat */
+ HUF_repeat_valid /**< Can use the previous table and it is assumed to be valid */
+ } HUF_repeat;
+/** HUF_compress4X_repeat() :
+ * Same as HUF_compress4X_wksp(), but considers using hufTable if *repeat != HUF_repeat_none.
+ * If it uses hufTable it does not modify hufTable or repeat.
+ * If it doesn't, it sets *repeat = HUF_repeat_none, and it sets hufTable to the table used.
+ * If preferRepeat then the old table will always be used if valid. */
+size_t HUF_compress4X_repeat(void* dst, size_t dstSize,
+ const void* src, size_t srcSize,
+ unsigned maxSymbolValue, unsigned tableLog,
+ void* workSpace, size_t wkspSize, /**< `workSpace` must be aligned on 4-bytes boundaries, `wkspSize` must be >= HUF_WORKSPACE_SIZE */
+ HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2);
+
+/** HUF_buildCTable_wksp() :
+ * Same as HUF_buildCTable(), but using externally allocated scratch buffer.
+ * `workSpace` must be aligned on 4-bytes boundaries, and its size must be >= HUF_CTABLE_WORKSPACE_SIZE.
+ */
+#define HUF_CTABLE_WORKSPACE_SIZE_U32 (2*HUF_SYMBOLVALUE_MAX +1 +1)
+#define HUF_CTABLE_WORKSPACE_SIZE (HUF_CTABLE_WORKSPACE_SIZE_U32 * sizeof(unsigned))
+size_t HUF_buildCTable_wksp (HUF_CElt* tree,
+ const unsigned* count, U32 maxSymbolValue, U32 maxNbBits,
+ void* workSpace, size_t wkspSize);
+
+/*! HUF_readStats() :
+ * Read compact Huffman tree, saved by HUF_writeCTable().
+ * `huffWeight` is destination buffer.
+ * @return : size read from `src` , or an error Code .
+ * Note : Needed by HUF_readCTable() and HUF_readDTableXn() . */
+size_t HUF_readStats(BYTE* huffWeight, size_t hwSize,
+ U32* rankStats, U32* nbSymbolsPtr, U32* tableLogPtr,
+ const void* src, size_t srcSize);
+
+/** HUF_readCTable() :
+ * Loading a CTable saved with HUF_writeCTable() */
+size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void* src, size_t srcSize, unsigned *hasZeroWeights);
+
+/** HUF_getNbBits() :
+ * Read nbBits from CTable symbolTable, for symbol `symbolValue` presumed <= HUF_SYMBOLVALUE_MAX
+ * Note 1 : is not inlined, as HUF_CElt definition is private
+ * Note 2 : const void* used, so that it can provide a statically allocated table as argument (which uses type U32) */
+U32 HUF_getNbBits(const void* symbolTable, U32 symbolValue);
+
+/*
+ * HUF_decompress() does the following:
+ * 1. select the decompression algorithm (X1, X2) based on pre-computed heuristics
+ * 2. build Huffman table from save, using HUF_readDTableX?()
+ * 3. decode 1 or 4 segments in parallel using HUF_decompress?X?_usingDTable()
+ */
+
+/** HUF_selectDecoder() :
+ * Tells which decoder is likely to decode faster,
+ * based on a set of pre-computed metrics.
+ * @return : 0==HUF_decompress4X1, 1==HUF_decompress4X2 .
+ * Assumption : 0 < dstSize <= 128 KB */
+U32 HUF_selectDecoder (size_t dstSize, size_t cSrcSize);
+
+/**
+ * The minimum workspace size for the `workSpace` used in
+ * HUF_readDTableX1_wksp() and HUF_readDTableX2_wksp().
+ *
+ * The space used depends on HUF_TABLELOG_MAX, ranging from ~1500 bytes when
+ * HUF_TABLE_LOG_MAX=12 to ~1850 bytes when HUF_TABLE_LOG_MAX=15.
+ * Buffer overflow errors may potentially occur if code modifications result in
+ * a required workspace size greater than that specified in the following
+ * macro.
+ */
+#define HUF_DECOMPRESS_WORKSPACE_SIZE (2 << 10)
+#define HUF_DECOMPRESS_WORKSPACE_SIZE_U32 (HUF_DECOMPRESS_WORKSPACE_SIZE / sizeof(U32))
+
+#ifndef HUF_FORCE_DECOMPRESS_X2
+size_t HUF_readDTableX1 (HUF_DTable* DTable, const void* src, size_t srcSize);
+size_t HUF_readDTableX1_wksp (HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize);
+#endif
+#ifndef HUF_FORCE_DECOMPRESS_X1
+size_t HUF_readDTableX2 (HUF_DTable* DTable, const void* src, size_t srcSize);
+size_t HUF_readDTableX2_wksp (HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize);
+#endif
+
+size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable);
+#ifndef HUF_FORCE_DECOMPRESS_X2
+size_t HUF_decompress4X1_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable);
+#endif
+#ifndef HUF_FORCE_DECOMPRESS_X1
+size_t HUF_decompress4X2_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable);
+#endif
+
+
+/* ====================== */
+/* single stream variants */
+/* ====================== */
+
+size_t HUF_compress1X (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog);
+size_t HUF_compress1X_wksp (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize); /**< `workSpace` must be a table of at least HUF_WORKSPACE_SIZE_U32 unsigned */
+size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable);
+/** HUF_compress1X_repeat() :
+ * Same as HUF_compress1X_wksp(), but considers using hufTable if *repeat != HUF_repeat_none.
+ * If it uses hufTable it does not modify hufTable or repeat.
+ * If it doesn't, it sets *repeat = HUF_repeat_none, and it sets hufTable to the table used.
+ * If preferRepeat then the old table will always be used if valid. */
+size_t HUF_compress1X_repeat(void* dst, size_t dstSize,
+ const void* src, size_t srcSize,
+ unsigned maxSymbolValue, unsigned tableLog,
+ void* workSpace, size_t wkspSize, /**< `workSpace` must be aligned on 4-bytes boundaries, `wkspSize` must be >= HUF_WORKSPACE_SIZE */
+ HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2);
+
+size_t HUF_decompress1X1 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /* single-symbol decoder */
+#ifndef HUF_FORCE_DECOMPRESS_X1
+size_t HUF_decompress1X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /* double-symbol decoder */
+#endif
+
+size_t HUF_decompress1X_DCtx (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);
+size_t HUF_decompress1X_DCtx_wksp (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize);
+#ifndef HUF_FORCE_DECOMPRESS_X2
+size_t HUF_decompress1X1_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /**< single-symbol decoder */
+size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /**< single-symbol decoder */
+#endif
+#ifndef HUF_FORCE_DECOMPRESS_X1
+size_t HUF_decompress1X2_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /**< double-symbols decoder */
+size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /**< double-symbols decoder */
+#endif
+
+size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); /**< automatic selection of sing or double symbol decoder, based on DTable */
+#ifndef HUF_FORCE_DECOMPRESS_X2
+size_t HUF_decompress1X1_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable);
+#endif
+#ifndef HUF_FORCE_DECOMPRESS_X1
+size_t HUF_decompress1X2_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable);
+#endif
+
+/* BMI2 variants.
+ * If the CPU has BMI2 support, pass bmi2=1, otherwise pass bmi2=0.
+ */
+size_t HUF_decompress1X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2);
+#ifndef HUF_FORCE_DECOMPRESS_X2
+size_t HUF_decompress1X1_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2);
+#endif
+size_t HUF_decompress4X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2);
+size_t HUF_decompress4X_hufOnly_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2);
+
+#endif /* HUF_STATIC_LINKING_ONLY */
+
+#if defined (__cplusplus)
+}
+#endif
+/**** ended inlining huf.h ****/
+
+
+/*=== Version ===*/
+unsigned FSE_versionNumber(void) { return FSE_VERSION_NUMBER; }
+
+
+/*=== Error Management ===*/
+unsigned FSE_isError(size_t code) { return ERR_isError(code); }
+const char* FSE_getErrorName(size_t code) { return ERR_getErrorName(code); }
+
+unsigned HUF_isError(size_t code) { return ERR_isError(code); }
+const char* HUF_getErrorName(size_t code) { return ERR_getErrorName(code); }
+
+
+/*-**************************************************************
+* FSE NCount encoding-decoding
+****************************************************************/
+size_t FSE_readNCount (short* normalizedCounter, unsigned* maxSVPtr, unsigned* tableLogPtr,
+ const void* headerBuffer, size_t hbSize)
+{
+ const BYTE* const istart = (const BYTE*) headerBuffer;
+ const BYTE* const iend = istart + hbSize;
+ const BYTE* ip = istart;
+ int nbBits;
+ int remaining;
+ int threshold;
+ U32 bitStream;
+ int bitCount;
+ unsigned charnum = 0;
+ int previous0 = 0;
+
+ if (hbSize < 4) {
+ /* This function only works when hbSize >= 4 */
+ char buffer[4];
+ memset(buffer, 0, sizeof(buffer));
+ memcpy(buffer, headerBuffer, hbSize);
+ { size_t const countSize = FSE_readNCount(normalizedCounter, maxSVPtr, tableLogPtr,
+ buffer, sizeof(buffer));
+ if (FSE_isError(countSize)) return countSize;
+ if (countSize > hbSize) return ERROR(corruption_detected);
+ return countSize;
+ } }
+ assert(hbSize >= 4);
+
+ /* init */
+ memset(normalizedCounter, 0, (*maxSVPtr+1) * sizeof(normalizedCounter[0])); /* all symbols not present in NCount have a frequency of 0 */
+ bitStream = MEM_readLE32(ip);
+ nbBits = (bitStream & 0xF) + FSE_MIN_TABLELOG; /* extract tableLog */
+ if (nbBits > FSE_TABLELOG_ABSOLUTE_MAX) return ERROR(tableLog_tooLarge);
+ bitStream >>= 4;
+ bitCount = 4;
+ *tableLogPtr = nbBits;
+ remaining = (1<<nbBits)+1;
+ threshold = 1<<nbBits;
+ nbBits++;
+
+ while ((remaining>1) & (charnum<=*maxSVPtr)) {
+ if (previous0) {
+ unsigned n0 = charnum;
+ while ((bitStream & 0xFFFF) == 0xFFFF) {
+ n0 += 24;
+ if (ip < iend-5) {
+ ip += 2;
+ bitStream = MEM_readLE32(ip) >> bitCount;
+ } else {
+ bitStream >>= 16;
+ bitCount += 16;
+ } }
+ while ((bitStream & 3) == 3) {
+ n0 += 3;
+ bitStream >>= 2;
+ bitCount += 2;
+ }
+ n0 += bitStream & 3;
+ bitCount += 2;
+ if (n0 > *maxSVPtr) return ERROR(maxSymbolValue_tooSmall);
+ while (charnum < n0) normalizedCounter[charnum++] = 0;
+ if ((ip <= iend-7) || (ip + (bitCount>>3) <= iend-4)) {
+ assert((bitCount >> 3) <= 3); /* For first condition to work */
+ ip += bitCount>>3;
+ bitCount &= 7;
+ bitStream = MEM_readLE32(ip) >> bitCount;
+ } else {
+ bitStream >>= 2;
+ } }
+ { int const max = (2*threshold-1) - remaining;
+ int count;
+
+ if ((bitStream & (threshold-1)) < (U32)max) {
+ count = bitStream & (threshold-1);
+ bitCount += nbBits-1;
+ } else {
+ count = bitStream & (2*threshold-1);
+ if (count >= threshold) count -= max;
+ bitCount += nbBits;
+ }
+
+ count--; /* extra accuracy */
+ remaining -= count < 0 ? -count : count; /* -1 means +1 */
+ normalizedCounter[charnum++] = (short)count;
+ previous0 = !count;
+ while (remaining < threshold) {
+ nbBits--;
+ threshold >>= 1;
+ }
+
+ if ((ip <= iend-7) || (ip + (bitCount>>3) <= iend-4)) {
+ ip += bitCount>>3;
+ bitCount &= 7;
+ } else {
+ bitCount -= (int)(8 * (iend - 4 - ip));
+ ip = iend - 4;
+ }
+ bitStream = MEM_readLE32(ip) >> (bitCount & 31);
+ } } /* while ((remaining>1) & (charnum<=*maxSVPtr)) */
+ if (remaining != 1) return ERROR(corruption_detected);
+ if (bitCount > 32) return ERROR(corruption_detected);
+ *maxSVPtr = charnum-1;
+
+ ip += (bitCount+7)>>3;
+ return ip-istart;
+}
+
+
+/*! HUF_readStats() :
+ Read compact Huffman tree, saved by HUF_writeCTable().
+ `huffWeight` is destination buffer.
+ `rankStats` is assumed to be a table of at least HUF_TABLELOG_MAX U32.
+ @return : size read from `src` , or an error Code .
+ Note : Needed by HUF_readCTable() and HUF_readDTableX?() .
+*/
+size_t HUF_readStats(BYTE* huffWeight, size_t hwSize, U32* rankStats,
+ U32* nbSymbolsPtr, U32* tableLogPtr,
+ const void* src, size_t srcSize)
+{
+ U32 weightTotal;
+ const BYTE* ip = (const BYTE*) src;
+ size_t iSize;
+ size_t oSize;
+
+ if (!srcSize) return ERROR(srcSize_wrong);
+ iSize = ip[0];
+ /* memset(huffWeight, 0, hwSize); *//* is not necessary, even though some analyzer complain ... */
+
+ if (iSize >= 128) { /* special header */
+ oSize = iSize - 127;
+ iSize = ((oSize+1)/2);
+ if (iSize+1 > srcSize) return ERROR(srcSize_wrong);
+ if (oSize >= hwSize) return ERROR(corruption_detected);
+ ip += 1;
+ { U32 n;
+ for (n=0; n<oSize; n+=2) {
+ huffWeight[n] = ip[n/2] >> 4;
+ huffWeight[n+1] = ip[n/2] & 15;
+ } } }
+ else { /* header compressed with FSE (normal case) */
+ FSE_DTable fseWorkspace[FSE_DTABLE_SIZE_U32(6)]; /* 6 is max possible tableLog for HUF header (maybe even 5, to be tested) */
+ if (iSize+1 > srcSize) return ERROR(srcSize_wrong);
+ oSize = FSE_decompress_wksp(huffWeight, hwSize-1, ip+1, iSize, fseWorkspace, 6); /* max (hwSize-1) values decoded, as last one is implied */
+ if (FSE_isError(oSize)) return oSize;
+ }
+
+ /* collect weight stats */
+ memset(rankStats, 0, (HUF_TABLELOG_MAX + 1) * sizeof(U32));
+ weightTotal = 0;
+ { U32 n; for (n=0; n<oSize; n++) {
+ if (huffWeight[n] >= HUF_TABLELOG_MAX) return ERROR(corruption_detected);
+ rankStats[huffWeight[n]]++;
+ weightTotal += (1 << huffWeight[n]) >> 1;
+ } }
+ if (weightTotal == 0) return ERROR(corruption_detected);
+
+ /* get last non-null symbol weight (implied, total must be 2^n) */
+ { U32 const tableLog = BIT_highbit32(weightTotal) + 1;
+ if (tableLog > HUF_TABLELOG_MAX) return ERROR(corruption_detected);
+ *tableLogPtr = tableLog;
+ /* determine last weight */
+ { U32 const total = 1 << tableLog;
+ U32 const rest = total - weightTotal;
+ U32 const verif = 1 << BIT_highbit32(rest);
+ U32 const lastWeight = BIT_highbit32(rest) + 1;
+ if (verif != rest) return ERROR(corruption_detected); /* last value must be a clean power of 2 */
+ huffWeight[oSize] = (BYTE)lastWeight;
+ rankStats[lastWeight]++;
+ } }
+
+ /* check tree construction validity */
+ if ((rankStats[1] < 2) || (rankStats[1] & 1)) return ERROR(corruption_detected); /* by construction : at least 2 elts of rank 1, must be even */
+
+ /* results */
+ *nbSymbolsPtr = (U32)(oSize+1);
+ return iSize+1;
+}
+/**** ended inlining common/entropy_common.c ****/
+/**** start inlining common/error_private.c ****/
+/*
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+/* The purpose of this file is to have a single list of error strings embedded in binary */
+
+/**** skipping file: error_private.h ****/
+
+const char* ERR_getErrorString(ERR_enum code)
+{
+#ifdef ZSTD_STRIP_ERROR_STRINGS
+ (void)code;
+ return "Error strings stripped";
+#else
+ static const char* const notErrorCode = "Unspecified error code";
+ switch( code )
+ {
+ case PREFIX(no_error): return "No error detected";
+ case PREFIX(GENERIC): return "Error (generic)";
+ case PREFIX(prefix_unknown): return "Unknown frame descriptor";
+ case PREFIX(version_unsupported): return "Version not supported";
+ case PREFIX(frameParameter_unsupported): return "Unsupported frame parameter";
+ case PREFIX(frameParameter_windowTooLarge): return "Frame requires too much memory for decoding";
+ case PREFIX(corruption_detected): return "Corrupted block detected";
+ case PREFIX(checksum_wrong): return "Restored data doesn't match checksum";
+ case PREFIX(parameter_unsupported): return "Unsupported parameter";
+ case PREFIX(parameter_outOfBound): return "Parameter is out of bound";
+ case PREFIX(init_missing): return "Context should be init first";
+ case PREFIX(memory_allocation): return "Allocation error : not enough memory";
+ case PREFIX(workSpace_tooSmall): return "workSpace buffer is not large enough";
+ case PREFIX(stage_wrong): return "Operation not authorized at current processing stage";
+ case PREFIX(tableLog_tooLarge): return "tableLog requires too much memory : unsupported";
+ case PREFIX(maxSymbolValue_tooLarge): return "Unsupported max Symbol Value : too large";
+ case PREFIX(maxSymbolValue_tooSmall): return "Specified maxSymbolValue is too small";
+ case PREFIX(dictionary_corrupted): return "Dictionary is corrupted";
+ case PREFIX(dictionary_wrong): return "Dictionary mismatch";
+ case PREFIX(dictionaryCreation_failed): return "Cannot create Dictionary from provided samples";
+ case PREFIX(dstSize_tooSmall): return "Destination buffer is too small";
+ case PREFIX(srcSize_wrong): return "Src size is incorrect";
+ case PREFIX(dstBuffer_null): return "Operation on NULL destination buffer";
+ /* following error codes are not stable and may be removed or changed in a future version */
+ case PREFIX(frameIndex_tooLarge): return "Frame index is too large";
+ case PREFIX(seekableIO): return "An I/O error occurred when reading/seeking";
+ case PREFIX(dstBuffer_wrong): return "Destination buffer is wrong";
+ case PREFIX(maxCode):
+ default: return notErrorCode;
+ }
+#endif
+}
+/**** ended inlining common/error_private.c ****/
+/**** start inlining common/fse_decompress.c ****/
+/* ******************************************************************
+ * FSE : Finite State Entropy decoder
+ * Copyright (c) 2013-2020, Yann Collet, Facebook, Inc.
+ *
+ * You can contact the author at :
+ * - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy
+ * - Public forum : https://groups.google.com/forum/#!forum/lz4c
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+****************************************************************** */
+
+
+/* **************************************************************
+* Includes
+****************************************************************/
+#include <stdlib.h> /* malloc, free, qsort */
+#include <string.h> /* memcpy, memset */
+/**** skipping file: bitstream.h ****/
+/**** skipping file: compiler.h ****/
+#define FSE_STATIC_LINKING_ONLY
+/**** skipping file: fse.h ****/
+/**** skipping file: error_private.h ****/
+
+
+/* **************************************************************
+* Error Management
+****************************************************************/
+#define FSE_isError ERR_isError
+#define FSE_STATIC_ASSERT(c) DEBUG_STATIC_ASSERT(c) /* use only *after* variable declarations */
+
+
+/* **************************************************************
+* Templates
+****************************************************************/
+/*
+ designed to be included
+ for type-specific functions (template emulation in C)
+ Objective is to write these functions only once, for improved maintenance
+*/
+
+/* safety checks */
+#ifndef FSE_FUNCTION_EXTENSION
+# error "FSE_FUNCTION_EXTENSION must be defined"
+#endif
+#ifndef FSE_FUNCTION_TYPE
+# error "FSE_FUNCTION_TYPE must be defined"
+#endif
+
+/* Function names */
+#define FSE_CAT(X,Y) X##Y
+#define FSE_FUNCTION_NAME(X,Y) FSE_CAT(X,Y)
+#define FSE_TYPE_NAME(X,Y) FSE_CAT(X,Y)
+
+
+/* Function templates */
+FSE_DTable* FSE_createDTable (unsigned tableLog)
+{
+ if (tableLog > FSE_TABLELOG_ABSOLUTE_MAX) tableLog = FSE_TABLELOG_ABSOLUTE_MAX;
+ return (FSE_DTable*)malloc( FSE_DTABLE_SIZE_U32(tableLog) * sizeof (U32) );
+}
+
+void FSE_freeDTable (FSE_DTable* dt)
+{
+ free(dt);
+}
+
+size_t FSE_buildDTable(FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog)
+{
+ void* const tdPtr = dt+1; /* because *dt is unsigned, 32-bits aligned on 32-bits */
+ FSE_DECODE_TYPE* const tableDecode = (FSE_DECODE_TYPE*) (tdPtr);
+ U16 symbolNext[FSE_MAX_SYMBOL_VALUE+1];
+
+ U32 const maxSV1 = maxSymbolValue + 1;
+ U32 const tableSize = 1 << tableLog;
+ U32 highThreshold = tableSize-1;
+
+ /* Sanity Checks */
+ if (maxSymbolValue > FSE_MAX_SYMBOL_VALUE) return ERROR(maxSymbolValue_tooLarge);
+ if (tableLog > FSE_MAX_TABLELOG) return ERROR(tableLog_tooLarge);
+
+ /* Init, lay down lowprob symbols */
+ { FSE_DTableHeader DTableH;
+ DTableH.tableLog = (U16)tableLog;
+ DTableH.fastMode = 1;
+ { S16 const largeLimit= (S16)(1 << (tableLog-1));
+ U32 s;
+ for (s=0; s<maxSV1; s++) {
+ if (normalizedCounter[s]==-1) {
+ tableDecode[highThreshold--].symbol = (FSE_FUNCTION_TYPE)s;
+ symbolNext[s] = 1;
+ } else {
+ if (normalizedCounter[s] >= largeLimit) DTableH.fastMode=0;
+ symbolNext[s] = normalizedCounter[s];
+ } } }
+ memcpy(dt, &DTableH, sizeof(DTableH));
+ }
+
+ /* Spread symbols */
+ { U32 const tableMask = tableSize-1;
+ U32 const step = FSE_TABLESTEP(tableSize);
+ U32 s, position = 0;
+ for (s=0; s<maxSV1; s++) {
+ int i;
+ for (i=0; i<normalizedCounter[s]; i++) {
+ tableDecode[position].symbol = (FSE_FUNCTION_TYPE)s;
+ position = (position + step) & tableMask;
+ while (position > highThreshold) position = (position + step) & tableMask; /* lowprob area */
+ } }
+ if (position!=0) return ERROR(GENERIC); /* position must reach all cells once, otherwise normalizedCounter is incorrect */
+ }
+
+ /* Build Decoding table */
+ { U32 u;
+ for (u=0; u<tableSize; u++) {
+ FSE_FUNCTION_TYPE const symbol = (FSE_FUNCTION_TYPE)(tableDecode[u].symbol);
+ U32 const nextState = symbolNext[symbol]++;
+ tableDecode[u].nbBits = (BYTE) (tableLog - BIT_highbit32(nextState) );
+ tableDecode[u].newState = (U16) ( (nextState << tableDecode[u].nbBits) - tableSize);
+ } }
+
+ return 0;
+}
+
+
+#ifndef FSE_COMMONDEFS_ONLY
+
+/*-*******************************************************
+* Decompression (Byte symbols)
+*********************************************************/
+size_t FSE_buildDTable_rle (FSE_DTable* dt, BYTE symbolValue)
+{
+ void* ptr = dt;
+ FSE_DTableHeader* const DTableH = (FSE_DTableHeader*)ptr;
+ void* dPtr = dt + 1;
+ FSE_decode_t* const cell = (FSE_decode_t*)dPtr;
+
+ DTableH->tableLog = 0;
+ DTableH->fastMode = 0;
+
+ cell->newState = 0;
+ cell->symbol = symbolValue;
+ cell->nbBits = 0;
+
+ return 0;
+}
+
+
+size_t FSE_buildDTable_raw (FSE_DTable* dt, unsigned nbBits)
+{
+ void* ptr = dt;
+ FSE_DTableHeader* const DTableH = (FSE_DTableHeader*)ptr;
+ void* dPtr = dt + 1;
+ FSE_decode_t* const dinfo = (FSE_decode_t*)dPtr;
+ const unsigned tableSize = 1 << nbBits;
+ const unsigned tableMask = tableSize - 1;
+ const unsigned maxSV1 = tableMask+1;
+ unsigned s;
+
+ /* Sanity checks */
+ if (nbBits < 1) return ERROR(GENERIC); /* min size */
+
+ /* Build Decoding Table */
+ DTableH->tableLog = (U16)nbBits;
+ DTableH->fastMode = 1;
+ for (s=0; s<maxSV1; s++) {
+ dinfo[s].newState = 0;
+ dinfo[s].symbol = (BYTE)s;
+ dinfo[s].nbBits = (BYTE)nbBits;
+ }
+
+ return 0;
+}
+
+FORCE_INLINE_TEMPLATE size_t FSE_decompress_usingDTable_generic(
+ void* dst, size_t maxDstSize,
+ const void* cSrc, size_t cSrcSize,
+ const FSE_DTable* dt, const unsigned fast)
+{
+ BYTE* const ostart = (BYTE*) dst;
+ BYTE* op = ostart;
+ BYTE* const omax = op + maxDstSize;
+ BYTE* const olimit = omax-3;
+
+ BIT_DStream_t bitD;
+ FSE_DState_t state1;
+ FSE_DState_t state2;
+
+ /* Init */
+ CHECK_F(BIT_initDStream(&bitD, cSrc, cSrcSize));
+
+ FSE_initDState(&state1, &bitD, dt);
+ FSE_initDState(&state2, &bitD, dt);
+
+#define FSE_GETSYMBOL(statePtr) fast ? FSE_decodeSymbolFast(statePtr, &bitD) : FSE_decodeSymbol(statePtr, &bitD)
+
+ /* 4 symbols per loop */
+ for ( ; (BIT_reloadDStream(&bitD)==BIT_DStream_unfinished) & (op<olimit) ; op+=4) {
+ op[0] = FSE_GETSYMBOL(&state1);
+
+ if (FSE_MAX_TABLELOG*2+7 > sizeof(bitD.bitContainer)*8) /* This test must be static */
+ BIT_reloadDStream(&bitD);
+
+ op[1] = FSE_GETSYMBOL(&state2);
+
+ if (FSE_MAX_TABLELOG*4+7 > sizeof(bitD.bitContainer)*8) /* This test must be static */
+ { if (BIT_reloadDStream(&bitD) > BIT_DStream_unfinished) { op+=2; break; } }
+
+ op[2] = FSE_GETSYMBOL(&state1);
+
+ if (FSE_MAX_TABLELOG*2+7 > sizeof(bitD.bitContainer)*8) /* This test must be static */
+ BIT_reloadDStream(&bitD);
+
+ op[3] = FSE_GETSYMBOL(&state2);
+ }
+
+ /* tail */
+ /* note : BIT_reloadDStream(&bitD) >= FSE_DStream_partiallyFilled; Ends at exactly BIT_DStream_completed */
+ while (1) {
+ if (op>(omax-2)) return ERROR(dstSize_tooSmall);
+ *op++ = FSE_GETSYMBOL(&state1);
+ if (BIT_reloadDStream(&bitD)==BIT_DStream_overflow) {
+ *op++ = FSE_GETSYMBOL(&state2);
+ break;
+ }
+
+ if (op>(omax-2)) return ERROR(dstSize_tooSmall);
+ *op++ = FSE_GETSYMBOL(&state2);
+ if (BIT_reloadDStream(&bitD)==BIT_DStream_overflow) {
+ *op++ = FSE_GETSYMBOL(&state1);
+ break;
+ } }
+
+ return op-ostart;
+}
+
+
+size_t FSE_decompress_usingDTable(void* dst, size_t originalSize,
+ const void* cSrc, size_t cSrcSize,
+ const FSE_DTable* dt)
+{
+ const void* ptr = dt;
+ const FSE_DTableHeader* DTableH = (const FSE_DTableHeader*)ptr;
+ const U32 fastMode = DTableH->fastMode;
+
+ /* select fast mode (static) */
+ if (fastMode) return FSE_decompress_usingDTable_generic(dst, originalSize, cSrc, cSrcSize, dt, 1);
+ return FSE_decompress_usingDTable_generic(dst, originalSize, cSrc, cSrcSize, dt, 0);
+}
+
+
+size_t FSE_decompress_wksp(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, FSE_DTable* workSpace, unsigned maxLog)
+{
+ const BYTE* const istart = (const BYTE*)cSrc;
+ const BYTE* ip = istart;
+ short counting[FSE_MAX_SYMBOL_VALUE+1];
+ unsigned tableLog;
+ unsigned maxSymbolValue = FSE_MAX_SYMBOL_VALUE;
+
+ /* normal FSE decoding mode */
+ size_t const NCountLength = FSE_readNCount (counting, &maxSymbolValue, &tableLog, istart, cSrcSize);
+ if (FSE_isError(NCountLength)) return NCountLength;
+ /* if (NCountLength >= cSrcSize) return ERROR(srcSize_wrong); */ /* too small input size; supposed to be already checked in NCountLength, only remaining case : NCountLength==cSrcSize */
+ if (tableLog > maxLog) return ERROR(tableLog_tooLarge);
+ ip += NCountLength;
+ cSrcSize -= NCountLength;
+
+ CHECK_F( FSE_buildDTable (workSpace, counting, maxSymbolValue, tableLog) );
+
+ return FSE_decompress_usingDTable (dst, dstCapacity, ip, cSrcSize, workSpace); /* always return, even if it is an error code */
+}
+
+
+typedef FSE_DTable DTable_max_t[FSE_DTABLE_SIZE_U32(FSE_MAX_TABLELOG)];
+
+size_t FSE_decompress(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize)
+{
+ DTable_max_t dt; /* Static analyzer seems unable to understand this table will be properly initialized later */
+ return FSE_decompress_wksp(dst, dstCapacity, cSrc, cSrcSize, dt, FSE_MAX_TABLELOG);
+}
+
+
+
+#endif /* FSE_COMMONDEFS_ONLY */
+/**** ended inlining common/fse_decompress.c ****/
+/**** start inlining common/pool.c ****/
+/*
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+
+/* ====== Dependencies ======= */
+#include <stddef.h> /* size_t */
+/**** skipping file: debug.h ****/
+/**** start inlining zstd_internal.h ****/
+/*
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTD_CCOMMON_H_MODULE
+#define ZSTD_CCOMMON_H_MODULE
+
+/* this module contains definitions which must be identical
+ * across compression, decompression and dictBuilder.
+ * It also contains a few functions useful to at least 2 of them
+ * and which benefit from being inlined */
+
+/*-*************************************
+* Dependencies
+***************************************/
+#if !defined(ZSTD_NO_INTRINSICS) && defined(__ARM_NEON)
+#include <arm_neon.h>
+#endif
+/**** skipping file: compiler.h ****/
+/**** skipping file: mem.h ****/
+/**** skipping file: debug.h ****/
+/**** skipping file: error_private.h ****/
+#define ZSTD_STATIC_LINKING_ONLY
+/**** start inlining ../zstd.h ****/
+/*
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+#ifndef ZSTD_H_235446
+#define ZSTD_H_235446
+
+/* ====== Dependency ======*/
+#include <limits.h> /* INT_MAX */
+#include <stddef.h> /* size_t */
+
+
+/* ===== ZSTDLIB_API : control library symbols visibility ===== */
+#ifndef ZSTDLIB_VISIBILITY
+# if defined(__GNUC__) && (__GNUC__ >= 4)
+# define ZSTDLIB_VISIBILITY __attribute__ ((visibility ("default")))
+# else
+# define ZSTDLIB_VISIBILITY
+# endif
+#endif
+#if defined(ZSTD_DLL_EXPORT) && (ZSTD_DLL_EXPORT==1)
+# define ZSTDLIB_API __declspec(dllexport) ZSTDLIB_VISIBILITY
+#elif defined(ZSTD_DLL_IMPORT) && (ZSTD_DLL_IMPORT==1)
+# define ZSTDLIB_API __declspec(dllimport) ZSTDLIB_VISIBILITY /* It isn't required but allows to generate better code, saving a function pointer load from the IAT and an indirect jump.*/
+#else
+# define ZSTDLIB_API ZSTDLIB_VISIBILITY
+#endif
+
+
+/*******************************************************************************
+ Introduction
+
+ zstd, short for Zstandard, is a fast lossless compression algorithm, targeting
+ real-time compression scenarios at zlib-level and better compression ratios.
+ The zstd compression library provides in-memory compression and decompression
+ functions.
+
+ The library supports regular compression levels from 1 up to ZSTD_maxCLevel(),
+ which is currently 22. Levels >= 20, labeled `--ultra`, should be used with
+ caution, as they require more memory. The library also offers negative
+ compression levels, which extend the range of speed vs. ratio preferences.
+ The lower the level, the faster the speed (at the cost of compression).
+
+ Compression can be done in:
+ - a single step (described as Simple API)
+ - a single step, reusing a context (described as Explicit context)
+ - unbounded multiple steps (described as Streaming compression)
+
+ The compression ratio achievable on small data can be highly improved using
+ a dictionary. Dictionary compression can be performed in:
+ - a single step (described as Simple dictionary API)
+ - a single step, reusing a dictionary (described as Bulk-processing
+ dictionary API)
+
+ Advanced experimental functions can be accessed using
+ `#define ZSTD_STATIC_LINKING_ONLY` before including zstd.h.
+
+ Advanced experimental APIs should never be used with a dynamically-linked
+ library. They are not "stable"; their definitions or signatures may change in
+ the future. Only static linking is allowed.
+*******************************************************************************/
+
+/*------ Version ------*/
+#define ZSTD_VERSION_MAJOR 1
+#define ZSTD_VERSION_MINOR 4
+#define ZSTD_VERSION_RELEASE 5
+
+#define ZSTD_VERSION_NUMBER (ZSTD_VERSION_MAJOR *100*100 + ZSTD_VERSION_MINOR *100 + ZSTD_VERSION_RELEASE)
+ZSTDLIB_API unsigned ZSTD_versionNumber(void); /**< to check runtime library version */
+
+#define ZSTD_LIB_VERSION ZSTD_VERSION_MAJOR.ZSTD_VERSION_MINOR.ZSTD_VERSION_RELEASE
+#define ZSTD_QUOTE(str) #str
+#define ZSTD_EXPAND_AND_QUOTE(str) ZSTD_QUOTE(str)
+#define ZSTD_VERSION_STRING ZSTD_EXPAND_AND_QUOTE(ZSTD_LIB_VERSION)
+ZSTDLIB_API const char* ZSTD_versionString(void); /* requires v1.3.0+ */
+
+/* *************************************
+ * Default constant
+ ***************************************/
+#ifndef ZSTD_CLEVEL_DEFAULT
+# define ZSTD_CLEVEL_DEFAULT 3
+#endif
+
+/* *************************************
+ * Constants
+ ***************************************/
+
+/* All magic numbers are supposed read/written to/from files/memory using little-endian convention */
+#define ZSTD_MAGICNUMBER 0xFD2FB528 /* valid since v0.8.0 */
+#define ZSTD_MAGIC_DICTIONARY 0xEC30A437 /* valid since v0.7.0 */
+#define ZSTD_MAGIC_SKIPPABLE_START 0x184D2A50 /* all 16 values, from 0x184D2A50 to 0x184D2A5F, signal the beginning of a skippable frame */
+#define ZSTD_MAGIC_SKIPPABLE_MASK 0xFFFFFFF0
+
+#define ZSTD_BLOCKSIZELOG_MAX 17
+#define ZSTD_BLOCKSIZE_MAX (1<<ZSTD_BLOCKSIZELOG_MAX)
+
+
+
+/***************************************
+* Simple API
+***************************************/
+/*! ZSTD_compress() :
+ * Compresses `src` content as a single zstd compressed frame into already allocated `dst`.
+ * Hint : compression runs faster if `dstCapacity` >= `ZSTD_compressBound(srcSize)`.
+ * @return : compressed size written into `dst` (<= `dstCapacity),
+ * or an error code if it fails (which can be tested using ZSTD_isError()). */
+ZSTDLIB_API size_t ZSTD_compress( void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize,
+ int compressionLevel);
+
+/*! ZSTD_decompress() :
+ * `compressedSize` : must be the _exact_ size of some number of compressed and/or skippable frames.
+ * `dstCapacity` is an upper bound of originalSize to regenerate.
+ * If user cannot imply a maximum upper bound, it's better to use streaming mode to decompress data.
+ * @return : the number of bytes decompressed into `dst` (<= `dstCapacity`),
+ * or an errorCode if it fails (which can be tested using ZSTD_isError()). */
+ZSTDLIB_API size_t ZSTD_decompress( void* dst, size_t dstCapacity,
+ const void* src, size_t compressedSize);
+
+/*! ZSTD_getFrameContentSize() : requires v1.3.0+
+ * `src` should point to the start of a ZSTD encoded frame.
+ * `srcSize` must be at least as large as the frame header.
+ * hint : any size >= `ZSTD_frameHeaderSize_max` is large enough.
+ * @return : - decompressed size of `src` frame content, if known
+ * - ZSTD_CONTENTSIZE_UNKNOWN if the size cannot be determined
+ * - ZSTD_CONTENTSIZE_ERROR if an error occurred (e.g. invalid magic number, srcSize too small)
+ * note 1 : a 0 return value means the frame is valid but "empty".
+ * note 2 : decompressed size is an optional field, it may not be present, typically in streaming mode.
+ * When `return==ZSTD_CONTENTSIZE_UNKNOWN`, data to decompress could be any size.
+ * In which case, it's necessary to use streaming mode to decompress data.
+ * Optionally, application can rely on some implicit limit,
+ * as ZSTD_decompress() only needs an upper bound of decompressed size.
+ * (For example, data could be necessarily cut into blocks <= 16 KB).
+ * note 3 : decompressed size is always present when compression is completed using single-pass functions,
+ * such as ZSTD_compress(), ZSTD_compressCCtx() ZSTD_compress_usingDict() or ZSTD_compress_usingCDict().
+ * note 4 : decompressed size can be very large (64-bits value),
+ * potentially larger than what local system can handle as a single memory segment.
+ * In which case, it's necessary to use streaming mode to decompress data.
+ * note 5 : If source is untrusted, decompressed size could be wrong or intentionally modified.
+ * Always ensure return value fits within application's authorized limits.
+ * Each application can set its own limits.
+ * note 6 : This function replaces ZSTD_getDecompressedSize() */
+#define ZSTD_CONTENTSIZE_UNKNOWN (0ULL - 1)
+#define ZSTD_CONTENTSIZE_ERROR (0ULL - 2)
+ZSTDLIB_API unsigned long long ZSTD_getFrameContentSize(const void *src, size_t srcSize);
+
+/*! ZSTD_getDecompressedSize() :
+ * NOTE: This function is now obsolete, in favor of ZSTD_getFrameContentSize().
+ * Both functions work the same way, but ZSTD_getDecompressedSize() blends
+ * "empty", "unknown" and "error" results to the same return value (0),
+ * while ZSTD_getFrameContentSize() gives them separate return values.
+ * @return : decompressed size of `src` frame content _if known and not empty_, 0 otherwise. */
+ZSTDLIB_API unsigned long long ZSTD_getDecompressedSize(const void* src, size_t srcSize);
+
+/*! ZSTD_findFrameCompressedSize() :
+ * `src` should point to the start of a ZSTD frame or skippable frame.
+ * `srcSize` must be >= first frame size
+ * @return : the compressed size of the first frame starting at `src`,
+ * suitable to pass as `srcSize` to `ZSTD_decompress` or similar,
+ * or an error code if input is invalid */
+ZSTDLIB_API size_t ZSTD_findFrameCompressedSize(const void* src, size_t srcSize);
+
+
+/*====== Helper functions ======*/
+#define ZSTD_COMPRESSBOUND(srcSize) ((srcSize) + ((srcSize)>>8) + (((srcSize) < (128<<10)) ? (((128<<10) - (srcSize)) >> 11) /* margin, from 64 to 0 */ : 0)) /* this formula ensures that bound(A) + bound(B) <= bound(A+B) as long as A and B >= 128 KB */
+ZSTDLIB_API size_t ZSTD_compressBound(size_t srcSize); /*!< maximum compressed size in worst case single-pass scenario */
+ZSTDLIB_API unsigned ZSTD_isError(size_t code); /*!< tells if a `size_t` function result is an error code */
+ZSTDLIB_API const char* ZSTD_getErrorName(size_t code); /*!< provides readable string from an error code */
+ZSTDLIB_API int ZSTD_minCLevel(void); /*!< minimum negative compression level allowed */
+ZSTDLIB_API int ZSTD_maxCLevel(void); /*!< maximum compression level available */
+
+
+/***************************************
+* Explicit context
+***************************************/
+/*= Compression context
+ * When compressing many times,
+ * it is recommended to allocate a context just once,
+ * and re-use it for each successive compression operation.
+ * This will make workload friendlier for system's memory.
+ * Note : re-using context is just a speed / resource optimization.
+ * It doesn't change the compression ratio, which remains identical.
+ * Note 2 : In multi-threaded environments,
+ * use one different context per thread for parallel execution.
+ */
+typedef struct ZSTD_CCtx_s ZSTD_CCtx;
+ZSTDLIB_API ZSTD_CCtx* ZSTD_createCCtx(void);
+ZSTDLIB_API size_t ZSTD_freeCCtx(ZSTD_CCtx* cctx);
+
+/*! ZSTD_compressCCtx() :
+ * Same as ZSTD_compress(), using an explicit ZSTD_CCtx.
+ * Important : in order to behave similarly to `ZSTD_compress()`,
+ * this function compresses at requested compression level,
+ * __ignoring any other parameter__ .
+ * If any advanced parameter was set using the advanced API,
+ * they will all be reset. Only `compressionLevel` remains.
+ */
+ZSTDLIB_API size_t ZSTD_compressCCtx(ZSTD_CCtx* cctx,
+ void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize,
+ int compressionLevel);
+
+/*= Decompression context
+ * When decompressing many times,
+ * it is recommended to allocate a context only once,
+ * and re-use it for each successive compression operation.
+ * This will make workload friendlier for system's memory.
+ * Use one context per thread for parallel execution. */
+typedef struct ZSTD_DCtx_s ZSTD_DCtx;
+ZSTDLIB_API ZSTD_DCtx* ZSTD_createDCtx(void);
+ZSTDLIB_API size_t ZSTD_freeDCtx(ZSTD_DCtx* dctx);
+
+/*! ZSTD_decompressDCtx() :
+ * Same as ZSTD_decompress(),
+ * requires an allocated ZSTD_DCtx.
+ * Compatible with sticky parameters.
+ */
+ZSTDLIB_API size_t ZSTD_decompressDCtx(ZSTD_DCtx* dctx,
+ void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize);
+
+
+/***************************************
+* Advanced compression API
+***************************************/
+
+/* API design :
+ * Parameters are pushed one by one into an existing context,
+ * using ZSTD_CCtx_set*() functions.
+ * Pushed parameters are sticky : they are valid for next compressed frame, and any subsequent frame.
+ * "sticky" parameters are applicable to `ZSTD_compress2()` and `ZSTD_compressStream*()` !
+ * __They do not apply to "simple" one-shot variants such as ZSTD_compressCCtx()__ .
+ *
+ * It's possible to reset all parameters to "default" using ZSTD_CCtx_reset().
+ *
+ * This API supercedes all other "advanced" API entry points in the experimental section.
+ * In the future, we expect to remove from experimental API entry points which are redundant with this API.
+ */
+
+
+/* Compression strategies, listed from fastest to strongest */
+typedef enum { ZSTD_fast=1,
+ ZSTD_dfast=2,
+ ZSTD_greedy=3,
+ ZSTD_lazy=4,
+ ZSTD_lazy2=5,
+ ZSTD_btlazy2=6,
+ ZSTD_btopt=7,
+ ZSTD_btultra=8,
+ ZSTD_btultra2=9
+ /* note : new strategies _might_ be added in the future.
+ Only the order (from fast to strong) is guaranteed */
+} ZSTD_strategy;
+
+
+typedef enum {
+
+ /* compression parameters
+ * Note: When compressing with a ZSTD_CDict these parameters are superseded
+ * by the parameters used to construct the ZSTD_CDict.
+ * See ZSTD_CCtx_refCDict() for more info (superseded-by-cdict). */
+ ZSTD_c_compressionLevel=100, /* Set compression parameters according to pre-defined cLevel table.
+ * Note that exact compression parameters are dynamically determined,
+ * depending on both compression level and srcSize (when known).
+ * Default level is ZSTD_CLEVEL_DEFAULT==3.
+ * Special: value 0 means default, which is controlled by ZSTD_CLEVEL_DEFAULT.
+ * Note 1 : it's possible to pass a negative compression level.
+ * Note 2 : setting a level does not automatically set all other compression parameters
+ * to default. Setting this will however eventually dynamically impact the compression
+ * parameters which have not been manually set. The manually set
+ * ones will 'stick'. */
+ /* Advanced compression parameters :
+ * It's possible to pin down compression parameters to some specific values.
+ * In which case, these values are no longer dynamically selected by the compressor */
+ ZSTD_c_windowLog=101, /* Maximum allowed back-reference distance, expressed as power of 2.
+ * This will set a memory budget for streaming decompression,
+ * with larger values requiring more memory
+ * and typically compressing more.
+ * Must be clamped between ZSTD_WINDOWLOG_MIN and ZSTD_WINDOWLOG_MAX.
+ * Special: value 0 means "use default windowLog".
+ * Note: Using a windowLog greater than ZSTD_WINDOWLOG_LIMIT_DEFAULT
+ * requires explicitly allowing such size at streaming decompression stage. */
+ ZSTD_c_hashLog=102, /* Size of the initial probe table, as a power of 2.
+ * Resulting memory usage is (1 << (hashLog+2)).
+ * Must be clamped between ZSTD_HASHLOG_MIN and ZSTD_HASHLOG_MAX.
+ * Larger tables improve compression ratio of strategies <= dFast,
+ * and improve speed of strategies > dFast.
+ * Special: value 0 means "use default hashLog". */
+ ZSTD_c_chainLog=103, /* Size of the multi-probe search table, as a power of 2.
+ * Resulting memory usage is (1 << (chainLog+2)).
+ * Must be clamped between ZSTD_CHAINLOG_MIN and ZSTD_CHAINLOG_MAX.
+ * Larger tables result in better and slower compression.
+ * This parameter is useless for "fast" strategy.
+ * It's still useful when using "dfast" strategy,
+ * in which case it defines a secondary probe table.
+ * Special: value 0 means "use default chainLog". */
+ ZSTD_c_searchLog=104, /* Number of search attempts, as a power of 2.
+ * More attempts result in better and slower compression.
+ * This parameter is useless for "fast" and "dFast" strategies.
+ * Special: value 0 means "use default searchLog". */
+ ZSTD_c_minMatch=105, /* Minimum size of searched matches.
+ * Note that Zstandard can still find matches of smaller size,
+ * it just tweaks its search algorithm to look for this size and larger.
+ * Larger values increase compression and decompression speed, but decrease ratio.
+ * Must be clamped between ZSTD_MINMATCH_MIN and ZSTD_MINMATCH_MAX.
+ * Note that currently, for all strategies < btopt, effective minimum is 4.
+ * , for all strategies > fast, effective maximum is 6.
+ * Special: value 0 means "use default minMatchLength". */
+ ZSTD_c_targetLength=106, /* Impact of this field depends on strategy.
+ * For strategies btopt, btultra & btultra2:
+ * Length of Match considered "good enough" to stop search.
+ * Larger values make compression stronger, and slower.
+ * For strategy fast:
+ * Distance between match sampling.
+ * Larger values make compression faster, and weaker.
+ * Special: value 0 means "use default targetLength". */
+ ZSTD_c_strategy=107, /* See ZSTD_strategy enum definition.
+ * The higher the value of selected strategy, the more complex it is,
+ * resulting in stronger and slower compression.
+ * Special: value 0 means "use default strategy". */
+
+ /* LDM mode parameters */
+ ZSTD_c_enableLongDistanceMatching=160, /* Enable long distance matching.
+ * This parameter is designed to improve compression ratio
+ * for large inputs, by finding large matches at long distance.
+ * It increases memory usage and window size.
+ * Note: enabling this parameter increases default ZSTD_c_windowLog to 128 MB
+ * except when expressly set to a different value. */
+ ZSTD_c_ldmHashLog=161, /* Size of the table for long distance matching, as a power of 2.
+ * Larger values increase memory usage and compression ratio,
+ * but decrease compression speed.
+ * Must be clamped between ZSTD_HASHLOG_MIN and ZSTD_HASHLOG_MAX
+ * default: windowlog - 7.
+ * Special: value 0 means "automatically determine hashlog". */
+ ZSTD_c_ldmMinMatch=162, /* Minimum match size for long distance matcher.
+ * Larger/too small values usually decrease compression ratio.
+ * Must be clamped between ZSTD_LDM_MINMATCH_MIN and ZSTD_LDM_MINMATCH_MAX.
+ * Special: value 0 means "use default value" (default: 64). */
+ ZSTD_c_ldmBucketSizeLog=163, /* Log size of each bucket in the LDM hash table for collision resolution.
+ * Larger values improve collision resolution but decrease compression speed.
+ * The maximum value is ZSTD_LDM_BUCKETSIZELOG_MAX.
+ * Special: value 0 means "use default value" (default: 3). */
+ ZSTD_c_ldmHashRateLog=164, /* Frequency of inserting/looking up entries into the LDM hash table.
+ * Must be clamped between 0 and (ZSTD_WINDOWLOG_MAX - ZSTD_HASHLOG_MIN).
+ * Default is MAX(0, (windowLog - ldmHashLog)), optimizing hash table usage.
+ * Larger values improve compression speed.
+ * Deviating far from default value will likely result in a compression ratio decrease.
+ * Special: value 0 means "automatically determine hashRateLog". */
+
+ /* frame parameters */
+ ZSTD_c_contentSizeFlag=200, /* Content size will be written into frame header _whenever known_ (default:1)
+ * Content size must be known at the beginning of compression.
+ * This is automatically the case when using ZSTD_compress2(),
+ * For streaming scenarios, content size must be provided with ZSTD_CCtx_setPledgedSrcSize() */
+ ZSTD_c_checksumFlag=201, /* A 32-bits checksum of content is written at end of frame (default:0) */
+ ZSTD_c_dictIDFlag=202, /* When applicable, dictionary's ID is written into frame header (default:1) */
+
+ /* multi-threading parameters */
+ /* These parameters are only useful if multi-threading is enabled (compiled with build macro ZSTD_MULTITHREAD).
+ * They return an error otherwise. */
+ ZSTD_c_nbWorkers=400, /* Select how many threads will be spawned to compress in parallel.
+ * When nbWorkers >= 1, triggers asynchronous mode when used with ZSTD_compressStream*() :
+ * ZSTD_compressStream*() consumes input and flush output if possible, but immediately gives back control to caller,
+ * while compression work is performed in parallel, within worker threads.
+ * (note : a strong exception to this rule is when first invocation of ZSTD_compressStream2() sets ZSTD_e_end :
+ * in which case, ZSTD_compressStream2() delegates to ZSTD_compress2(), which is always a blocking call).
+ * More workers improve speed, but also increase memory usage.
+ * Default value is `0`, aka "single-threaded mode" : no worker is spawned, compression is performed inside Caller's thread, all invocations are blocking */
+ ZSTD_c_jobSize=401, /* Size of a compression job. This value is enforced only when nbWorkers >= 1.
+ * Each compression job is completed in parallel, so this value can indirectly impact the nb of active threads.
+ * 0 means default, which is dynamically determined based on compression parameters.
+ * Job size must be a minimum of overlap size, or 1 MB, whichever is largest.
+ * The minimum size is automatically and transparently enforced. */
+ ZSTD_c_overlapLog=402, /* Control the overlap size, as a fraction of window size.
+ * The overlap size is an amount of data reloaded from previous job at the beginning of a new job.
+ * It helps preserve compression ratio, while each job is compressed in parallel.
+ * This value is enforced only when nbWorkers >= 1.
+ * Larger values increase compression ratio, but decrease speed.
+ * Possible values range from 0 to 9 :
+ * - 0 means "default" : value will be determined by the library, depending on strategy
+ * - 1 means "no overlap"
+ * - 9 means "full overlap", using a full window size.
+ * Each intermediate rank increases/decreases load size by a factor 2 :
+ * 9: full window; 8: w/2; 7: w/4; 6: w/8; 5:w/16; 4: w/32; 3:w/64; 2:w/128; 1:no overlap; 0:default
+ * default value varies between 6 and 9, depending on strategy */
+
+ /* note : additional experimental parameters are also available
+ * within the experimental section of the API.
+ * At the time of this writing, they include :
+ * ZSTD_c_rsyncable
+ * ZSTD_c_format
+ * ZSTD_c_forceMaxWindow
+ * ZSTD_c_forceAttachDict
+ * ZSTD_c_literalCompressionMode
+ * ZSTD_c_targetCBlockSize
+ * ZSTD_c_srcSizeHint
+ * Because they are not stable, it's necessary to define ZSTD_STATIC_LINKING_ONLY to access them.
+ * note : never ever use experimentalParam? names directly;
+ * also, the enums values themselves are unstable and can still change.
+ */
+ ZSTD_c_experimentalParam1=500,
+ ZSTD_c_experimentalParam2=10,
+ ZSTD_c_experimentalParam3=1000,
+ ZSTD_c_experimentalParam4=1001,
+ ZSTD_c_experimentalParam5=1002,
+ ZSTD_c_experimentalParam6=1003,
+ ZSTD_c_experimentalParam7=1004
+} ZSTD_cParameter;
+
+typedef struct {
+ size_t error;
+ int lowerBound;
+ int upperBound;
+} ZSTD_bounds;
+
+/*! ZSTD_cParam_getBounds() :
+ * All parameters must belong to an interval with lower and upper bounds,
+ * otherwise they will either trigger an error or be automatically clamped.
+ * @return : a structure, ZSTD_bounds, which contains
+ * - an error status field, which must be tested using ZSTD_isError()
+ * - lower and upper bounds, both inclusive
+ */
+ZSTDLIB_API ZSTD_bounds ZSTD_cParam_getBounds(ZSTD_cParameter cParam);
+
+/*! ZSTD_CCtx_setParameter() :
+ * Set one compression parameter, selected by enum ZSTD_cParameter.
+ * All parameters have valid bounds. Bounds can be queried using ZSTD_cParam_getBounds().
+ * Providing a value beyond bound will either clamp it, or trigger an error (depending on parameter).
+ * Setting a parameter is generally only possible during frame initialization (before starting compression).
+ * Exception : when using multi-threading mode (nbWorkers >= 1),
+ * the following parameters can be updated _during_ compression (within same frame):
+ * => compressionLevel, hashLog, chainLog, searchLog, minMatch, targetLength and strategy.
+ * new parameters will be active for next job only (after a flush()).
+ * @return : an error code (which can be tested using ZSTD_isError()).
+ */
+ZSTDLIB_API size_t ZSTD_CCtx_setParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, int value);
+
+/*! ZSTD_CCtx_setPledgedSrcSize() :
+ * Total input data size to be compressed as a single frame.
+ * Value will be written in frame header, unless explicitly forbidden using ZSTD_c_contentSizeFlag.
+ * This value will also be controlled at end of frame, and trigger an error if not respected.
+ * @result : 0, or an error code (which can be tested with ZSTD_isError()).
+ * Note 1 : pledgedSrcSize==0 actually means zero, aka an empty frame.
+ * In order to mean "unknown content size", pass constant ZSTD_CONTENTSIZE_UNKNOWN.
+ * ZSTD_CONTENTSIZE_UNKNOWN is default value for any new frame.
+ * Note 2 : pledgedSrcSize is only valid once, for the next frame.
+ * It's discarded at the end of the frame, and replaced by ZSTD_CONTENTSIZE_UNKNOWN.
+ * Note 3 : Whenever all input data is provided and consumed in a single round,
+ * for example with ZSTD_compress2(),
+ * or invoking immediately ZSTD_compressStream2(,,,ZSTD_e_end),
+ * this value is automatically overridden by srcSize instead.
+ */
+ZSTDLIB_API size_t ZSTD_CCtx_setPledgedSrcSize(ZSTD_CCtx* cctx, unsigned long long pledgedSrcSize);
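+
+/* Illustrative sketch : announce the total size of a streamed frame up-front,
+ * so it gets recorded in the frame header. `cctx` and `totalSize` are hypothetical names.
+ *
+ *   ZSTD_CCtx_reset(cctx, ZSTD_reset_session_only);
+ *   ZSTD_CCtx_setPledgedSrcSize(cctx, (unsigned long long)totalSize);
+ *   // then feed exactly totalSize bytes through ZSTD_compressStream2(),
+ *   // finishing with ZSTD_e_end; a mismatch triggers an error at end of frame.
+ */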
+
+typedef enum {
+ ZSTD_reset_session_only = 1,
+ ZSTD_reset_parameters = 2,
+ ZSTD_reset_session_and_parameters = 3
+} ZSTD_ResetDirective;
+
+/*! ZSTD_CCtx_reset() :
+ * There are 2 different things that can be reset, independently or jointly :
+ * - The session : will stop compressing current frame, and make CCtx ready to start a new one.
+ * Useful after an error, or to interrupt any ongoing compression.
+ * Any internal data not yet flushed is cancelled.
+ * Compression parameters and dictionary remain unchanged.
+ * They will be used to compress next frame.
+ * Resetting session never fails.
+ * - The parameters : changes all parameters back to "default".
+ * This removes any reference to any dictionary too.
+ * Parameters can only be changed between 2 sessions (i.e. no compression is currently ongoing)
+ * otherwise the reset fails, and function returns an error value (which can be tested using ZSTD_isError())
+ * - Both : similar to resetting the session, followed by resetting parameters.
+ */
+ZSTDLIB_API size_t ZSTD_CCtx_reset(ZSTD_CCtx* cctx, ZSTD_ResetDirective reset);
+
+/*! ZSTD_compress2() :
+ * Behave the same as ZSTD_compressCCtx(), but compression parameters are set using the advanced API.
+ * ZSTD_compress2() always starts a new frame.
+ * Should cctx hold data from a previously unfinished frame, everything about it is forgotten.
+ * - Compression parameters are pushed into CCtx before starting compression, using ZSTD_CCtx_set*()
+ * - The function is always blocking, returns when compression is completed.
+ * Hint : compression runs faster if `dstCapacity` >= `ZSTD_compressBound(srcSize)`.
+ * @return : compressed size written into `dst` (<= `dstCapacity`),
+ * or an error code if it fails (which can be tested using ZSTD_isError()).
+ */
+ZSTDLIB_API size_t ZSTD_compress2( ZSTD_CCtx* cctx,
+ void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize);
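+
+/* Illustrative sketch : one-shot compression through the advanced API.
+ * Allocation failures are ignored for brevity; buffer names are hypothetical.
+ *
+ *   size_t compress_once(void* dst, size_t dstCapacity,
+ *                        const void* src, size_t srcSize)
+ *   {
+ *       size_t result;
+ *       ZSTD_CCtx* const cctx = ZSTD_createCCtx();
+ *       // sticky parameters : they remain set for every following frame on this cctx
+ *       ZSTD_CCtx_setParameter(cctx, ZSTD_c_compressionLevel, 19);
+ *       ZSTD_CCtx_setParameter(cctx, ZSTD_c_checksumFlag, 1);
+ *       result = ZSTD_compress2(cctx, dst, dstCapacity, src, srcSize);
+ *       ZSTD_freeCCtx(cctx);
+ *       return result;   // compressed size, or an error code (test with ZSTD_isError())
+ *   }
+ */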
+
+
+/***************************************
+* Advanced decompression API
+***************************************/
+
+/* The advanced API pushes parameters one by one into an existing DCtx context.
+ * Parameters are sticky, and remain valid for all following frames
+ * using the same DCtx context.
+ * It's possible to reset parameters to default values using ZSTD_DCtx_reset().
+ * Note : This API is compatible with existing ZSTD_decompressDCtx() and ZSTD_decompressStream().
+ * Therefore, no new decompression function is necessary.
+ */
+
+typedef enum {
+
+ ZSTD_d_windowLogMax=100, /* Select a size limit (in power of 2) beyond which
+ * the streaming API will refuse to allocate memory buffer
+ * in order to protect the host from unreasonable memory requirements.
+ * This parameter is only useful in streaming mode, since no internal buffer is allocated in single-pass mode.
+ * By default, a decompression context accepts window sizes <= (1 << ZSTD_WINDOWLOG_LIMIT_DEFAULT).
+ * Special: value 0 means "use default maximum windowLog". */
+
+ /* note : additional experimental parameters are also available
+ * within the experimental section of the API.
+ * At the time of this writing, they include :
+ * ZSTD_d_format
+ * ZSTD_d_stableOutBuffer
+ * Because they are not stable, it's necessary to define ZSTD_STATIC_LINKING_ONLY to access them.
+ * note : never ever use experimentalParam? names directly
+ */
+ ZSTD_d_experimentalParam1=1000,
+ ZSTD_d_experimentalParam2=1001
+
+} ZSTD_dParameter;
+
+/*! ZSTD_dParam_getBounds() :
+ * All parameters must belong to an interval with lower and upper bounds,
+ * otherwise they will either trigger an error or be automatically clamped.
+ * @return : a structure, ZSTD_bounds, which contains
+ * - an error status field, which must be tested using ZSTD_isError()
+ * - both lower and upper bounds, inclusive
+ */
+ZSTDLIB_API ZSTD_bounds ZSTD_dParam_getBounds(ZSTD_dParameter dParam);
+
+/*! ZSTD_DCtx_setParameter() :
+ * Set one compression parameter, selected by enum ZSTD_dParameter.
+ * All parameters have valid bounds. Bounds can be queried using ZSTD_dParam_getBounds().
+ * Providing a value beyond bound will either clamp it, or trigger an error (depending on parameter).
+ * Setting a parameter is only possible during frame initialization (before starting decompression).
+ * @return : 0, or an error code (which can be tested using ZSTD_isError()).
+ */
+ZSTDLIB_API size_t ZSTD_DCtx_setParameter(ZSTD_DCtx* dctx, ZSTD_dParameter param, int value);
+
+/*! ZSTD_DCtx_reset() :
+ * Return a DCtx to clean state.
+ * Session and parameters can be reset jointly or separately.
+ * Parameters can only be reset when no active frame is being decompressed.
+ * @return : 0, or an error code, which can be tested with ZSTD_isError()
+ */
+ZSTDLIB_API size_t ZSTD_DCtx_reset(ZSTD_DCtx* dctx, ZSTD_ResetDirective reset);
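+
+/* Illustrative sketch : cap the window size a streaming decoder will accept,
+ * then return the context to defaults. Error handling is omitted for brevity.
+ *
+ *   ZSTD_DCtx* const dctx = ZSTD_createDCtx();
+ *   ZSTD_DCtx_setParameter(dctx, ZSTD_d_windowLogMax, 27);   // refuse frames needing > 128 MB windows
+ *   // ... decompress frames with ZSTD_decompressStream() or ZSTD_decompressDCtx() ...
+ *   ZSTD_DCtx_reset(dctx, ZSTD_reset_session_and_parameters); // back to default parameters
+ *   ZSTD_freeDCtx(dctx);
+ */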
+
+
+/****************************
+* Streaming
+****************************/
+
+typedef struct ZSTD_inBuffer_s {
+ const void* src; /**< start of input buffer */
+ size_t size; /**< size of input buffer */
+ size_t pos; /**< position where reading stopped. Will be updated. Necessarily 0 <= pos <= size */
+} ZSTD_inBuffer;
+
+typedef struct ZSTD_outBuffer_s {
+ void* dst; /**< start of output buffer */
+ size_t size; /**< size of output buffer */
+ size_t pos; /**< position where writing stopped. Will be updated. Necessarily 0 <= pos <= size */
+} ZSTD_outBuffer;
+
+
+
+/*-***********************************************************************
+* Streaming compression - HowTo
+*
+* A ZSTD_CStream object is required to track streaming operation.
+* Use ZSTD_createCStream() and ZSTD_freeCStream() to create/release resources.
+* ZSTD_CStream objects can be reused multiple times on consecutive compression operations.
+* It is recommended to re-use ZSTD_CStream since it will play nicer with system's memory, by re-using already allocated memory.
+*
+* For parallel execution, use one separate ZSTD_CStream per thread.
+*
+* note : since v1.3.0, ZSTD_CStream and ZSTD_CCtx are the same thing.
+*
+* Parameters are sticky : when starting a new compression on the same context,
+* it will re-use the same sticky parameters as previous compression session.
+* When in doubt, it's recommended to fully initialize the context before usage.
+* Use ZSTD_CCtx_reset() to reset the context and ZSTD_CCtx_setParameter(),
+* ZSTD_CCtx_setPledgedSrcSize(), or ZSTD_CCtx_loadDictionary() and friends to
+* set more specific parameters, the pledged source size, or load a dictionary.
+*
+* Use ZSTD_compressStream2() with ZSTD_e_continue as many times as necessary to
+* consume input stream. The function will automatically update both `pos`
+* fields within `input` and `output`.
+* Note that the function may not consume the entire input, for example, because
+* the output buffer is already full, in which case `input.pos < input.size`.
+* The caller must check if input has been entirely consumed.
+* If not, the caller must make some room to receive more compressed data,
+* and then present again remaining input data.
+* note: ZSTD_e_continue is guaranteed to make some forward progress when called,
+* but doesn't guarantee maximal forward progress. This is especially relevant
+* when compressing with multiple threads. The call won't block if it can
+* consume some input, but if it can't it will wait for some, but not all,
+* output to be flushed.
+* @return : provides a minimum amount of data remaining to be flushed from internal buffers
+* or an error code, which can be tested using ZSTD_isError().
+*
+* At any moment, it's possible to flush whatever data might remain stuck within internal buffer,
+* using ZSTD_compressStream2() with ZSTD_e_flush. `output->pos` will be updated.
+* Note that, if `output->size` is too small, a single invocation with ZSTD_e_flush might not be enough (return code > 0).
+* In which case, make some room to receive more compressed data, and call again ZSTD_compressStream2() with ZSTD_e_flush.
+* You must continue calling ZSTD_compressStream2() with ZSTD_e_flush until it returns 0, at which point you can change the
+* operation.
+* note: ZSTD_e_flush will flush as much output as possible, meaning when compressing with multiple threads, it will
+* block until the flush is complete or the output buffer is full.
+* @return : 0 if internal buffers are entirely flushed,
+* >0 if some data still present within internal buffer (the value is minimal estimation of remaining size),
+* or an error code, which can be tested using ZSTD_isError().
+*
+* Calling ZSTD_compressStream2() with ZSTD_e_end instructs to finish a frame.
+* It will perform a flush and write frame epilogue.
+* The epilogue is required for decoders to consider a frame completed.
+* flush operation is the same, and follows same rules as calling ZSTD_compressStream2() with ZSTD_e_flush.
+* You must continue calling ZSTD_compressStream2() with ZSTD_e_end until it returns 0, at which point you are free to
+* start a new frame.
+* note: ZSTD_e_end will flush as much output as possible, meaning when compressing with multiple threads, it will
+* block until the flush is complete or the output buffer is full.
+* @return : 0 if frame fully completed and fully flushed,
+* >0 if some data still present within internal buffer (the value is minimal estimation of remaining size),
+* or an error code, which can be tested using ZSTD_isError().
+*
+* *******************************************************************/
+
+typedef ZSTD_CCtx ZSTD_CStream; /**< CCtx and CStream are now effectively same object (>= v1.3.0) */
+ /* Continue to distinguish them for compatibility with older versions <= v1.2.0 */
+/*===== ZSTD_CStream management functions =====*/
+ZSTDLIB_API ZSTD_CStream* ZSTD_createCStream(void);
+ZSTDLIB_API size_t ZSTD_freeCStream(ZSTD_CStream* zcs);
+
+/*===== Streaming compression functions =====*/
+typedef enum {
+ ZSTD_e_continue=0, /* collect more data, encoder decides when to output compressed result, for optimal compression ratio */
+ ZSTD_e_flush=1, /* flush any data provided so far,
+ * it creates (at least) one new block, that can be decoded immediately on reception;
+ * frame will continue: any future data can still reference previously compressed data, improving compression.
+ * note : multithreaded compression will block to flush as much output as possible. */
+ ZSTD_e_end=2 /* flush any remaining data _and_ close current frame.
+ * note that frame is only closed after compressed data is fully flushed (return value == 0).
+ * After that point, any additional data starts a new frame.
+ * note : each frame is independent (does not reference any content from previous frame).
+ * note : multithreaded compression will block to flush as much output as possible. */
+} ZSTD_EndDirective;
+
+/*! ZSTD_compressStream2() :
+ * Behaves about the same as ZSTD_compressStream, with additional control on end directive.
+ * - Compression parameters are pushed into CCtx before starting compression, using ZSTD_CCtx_set*()
+ * - Compression parameters cannot be changed once compression is started (save a list of exceptions in multi-threading mode)
+ * - output->pos must be <= dstCapacity, input->pos must be <= srcSize
+ * - output->pos and input->pos will be updated. They are guaranteed to remain below their respective limit.
+ * - When nbWorkers==0 (default), function is blocking : it completes its job before returning to caller.
+ * - When nbWorkers>=1, function is non-blocking : it just acquires a copy of input, distributes jobs to internal worker threads, flushes whatever output is available,
+ * and then immediately returns, just indicating that there is some data remaining to be flushed.
+ * The function nonetheless guarantees forward progress : it will return only after it reads or writes at least one byte.
+ * - Exception : if the first call requests a ZSTD_e_end directive and provides enough dstCapacity, the function delegates to ZSTD_compress2() which is always blocking.
+ * - @return provides a minimum amount of data remaining to be flushed from internal buffers
+ * or an error code, which can be tested using ZSTD_isError().
+ * if @return != 0, flush is not fully completed, there is still some data left within internal buffers.
+ * This is useful for ZSTD_e_flush, since in this case more flushes are necessary to empty all buffers.
+ * For ZSTD_e_end, @return == 0 when internal buffers are fully flushed and frame is completed.
+ * - after a ZSTD_e_end directive, if internal buffer is not fully flushed (@return != 0),
+ * only ZSTD_e_end or ZSTD_e_flush operations are allowed.
+ * Before starting a new compression job, or changing compression parameters,
+ * it is required to fully flush internal buffers.
+ */
+ZSTDLIB_API size_t ZSTD_compressStream2( ZSTD_CCtx* cctx,
+ ZSTD_outBuffer* output,
+ ZSTD_inBuffer* input,
+ ZSTD_EndDirective endOp);
+
+
+/* These buffer sizes are softly recommended.
+ * They are not required : ZSTD_compressStream*() happily accepts any buffer size, for both input and output.
+ * Respecting the recommended size just makes it a bit easier for ZSTD_compressStream*(),
+ * reducing the amount of memory shuffling and buffering, resulting in minor performance savings.
+ *
+ * However, note that these recommendations are from the perspective of a C caller program.
+ * If the streaming interface is invoked from some other language,
+ * especially managed ones such as Java or Go, through a foreign function interface such as jni or cgo,
+ * a major performance rule is to reduce crossing such interface to an absolute minimum.
+ * It's not rare for more time to end up spent crossing the interface than in compression itself.
+ * In such cases, prefer using buffers as large as practical,
+ * for both input and output, to reduce the number of round trips.
+ */
+ZSTDLIB_API size_t ZSTD_CStreamInSize(void); /**< recommended size for input buffer */
+ZSTDLIB_API size_t ZSTD_CStreamOutSize(void); /**< recommended size for output buffer. Guarantee to successfully flush at least one complete compressed block. */
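+
+/* Illustrative sketch : file-to-file streaming compression using the recommended
+ * buffer sizes. Assumes <stdio.h>, <stdlib.h> and <zstd.h>; allocation and I/O
+ * error checks are omitted for brevity.
+ *
+ *   static void compress_file(FILE* fin, FILE* fout, int level)
+ *   {
+ *       size_t const inSize  = ZSTD_CStreamInSize();
+ *       size_t const outSize = ZSTD_CStreamOutSize();
+ *       void* const inBuf  = malloc(inSize);
+ *       void* const outBuf = malloc(outSize);
+ *       ZSTD_CCtx* const cctx = ZSTD_createCCtx();
+ *       ZSTD_CCtx_setParameter(cctx, ZSTD_c_compressionLevel, level);
+ *       for (;;) {
+ *           size_t const read = fread(inBuf, 1, inSize, fin);
+ *           int const lastChunk = (read < inSize);
+ *           ZSTD_EndDirective const mode = lastChunk ? ZSTD_e_end : ZSTD_e_continue;
+ *           ZSTD_inBuffer input = { inBuf, read, 0 };
+ *           int finished;
+ *           do {
+ *               ZSTD_outBuffer output = { outBuf, outSize, 0 };
+ *               size_t const remaining = ZSTD_compressStream2(cctx, &output, &input, mode);
+ *               fwrite(outBuf, 1, output.pos, fout);
+ *               // keep calling until input is consumed (ZSTD_e_continue)
+ *               // or the frame is fully flushed (ZSTD_e_end returns 0)
+ *               finished = lastChunk ? (remaining == 0) : (input.pos == input.size);
+ *           } while (!finished);
+ *           if (lastChunk) break;
+ *       }
+ *       ZSTD_freeCCtx(cctx);
+ *       free(inBuf); free(outBuf);
+ *   }
+ */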
+
+
+/* *****************************************************************************
+ * This following is a legacy streaming API.
+ * It can be replaced by ZSTD_CCtx_reset() and ZSTD_compressStream2().
+ * It is redundant, but remains fully supported.
+ * Advanced parameters and dictionary compression can only be used through the
+ * new API.
+ ******************************************************************************/
+
+/*!
+ * Equivalent to:
+ *
+ * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only);
+ * ZSTD_CCtx_refCDict(zcs, NULL); // clear the dictionary (if any)
+ * ZSTD_CCtx_setParameter(zcs, ZSTD_c_compressionLevel, compressionLevel);
+ */
+ZSTDLIB_API size_t ZSTD_initCStream(ZSTD_CStream* zcs, int compressionLevel);
+/*!
+ * Alternative for ZSTD_compressStream2(zcs, output, input, ZSTD_e_continue).
+ * NOTE: The return value is different. ZSTD_compressStream() returns a hint for
+ * the next read size (if non-zero and not an error). ZSTD_compressStream2()
+ * returns the minimum nb of bytes left to flush (if non-zero and not an error).
+ */
+ZSTDLIB_API size_t ZSTD_compressStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output, ZSTD_inBuffer* input);
+/*! Equivalent to ZSTD_compressStream2(zcs, output, &emptyInput, ZSTD_e_flush). */
+ZSTDLIB_API size_t ZSTD_flushStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output);
+/*! Equivalent to ZSTD_compressStream2(zcs, output, &emptyInput, ZSTD_e_end). */
+ZSTDLIB_API size_t ZSTD_endStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output);
+
+
+/*-***************************************************************************
+* Streaming decompression - HowTo
+*
+* A ZSTD_DStream object is required to track streaming operations.
+* Use ZSTD_createDStream() and ZSTD_freeDStream() to create/release resources.
+* ZSTD_DStream objects can be re-used multiple times.
+*
+* Use ZSTD_initDStream() to start a new decompression operation.
+* @return : recommended first input size
+* Alternatively, use advanced API to set specific properties.
+*
+* Use ZSTD_decompressStream() repetitively to consume your input.
+* The function will update both `pos` fields.
+* If `input.pos < input.size`, some input has not been consumed.
+* It's up to the caller to present again remaining data.
+* The function tries to flush all data decoded immediately, respecting output buffer size.
+* If `output.pos < output.size`, decoder has flushed everything it could.
+* But if `output.pos == output.size`, there might be some data left within internal buffers.
+* In which case, call ZSTD_decompressStream() again to flush whatever remains in the buffer.
+* Note : with no additional input provided, amount of data flushed is necessarily <= ZSTD_BLOCKSIZE_MAX.
+* @return : 0 when a frame is completely decoded and fully flushed,
+* or an error code, which can be tested using ZSTD_isError(),
+* or any other value > 0, which means there is still some decoding or flushing to do to complete current frame :
+* the return value is a suggested next input size (just a hint for better latency)
+* that will never request more than the remaining frame size.
+* *******************************************************************************/
+
+typedef ZSTD_DCtx ZSTD_DStream; /**< DCtx and DStream are now effectively same object (>= v1.3.0) */
+ /* For compatibility with versions <= v1.2.0, prefer differentiating them. */
+/*===== ZSTD_DStream management functions =====*/
+ZSTDLIB_API ZSTD_DStream* ZSTD_createDStream(void);
+ZSTDLIB_API size_t ZSTD_freeDStream(ZSTD_DStream* zds);
+
+/*===== Streaming decompression functions =====*/
+
+/* This function is redundant with the advanced API and equivalent to:
+ *
+ * ZSTD_DCtx_reset(zds, ZSTD_reset_session_only);
+ * ZSTD_DCtx_refDDict(zds, NULL);
+ */
+ZSTDLIB_API size_t ZSTD_initDStream(ZSTD_DStream* zds);
+
+ZSTDLIB_API size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inBuffer* input);
+
+ZSTDLIB_API size_t ZSTD_DStreamInSize(void); /*!< recommended size for input buffer */
+ZSTDLIB_API size_t ZSTD_DStreamOutSize(void); /*!< recommended size for output buffer. Guarantee to successfully flush at least one complete block in all circumstances. */
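+
+/* Illustrative sketch : file-to-file streaming decompression using the recommended
+ * buffer sizes. Assumes <stdio.h>, <stdlib.h> and <zstd.h>; error reporting is
+ * reduced to an early exit for brevity.
+ *
+ *   static void decompress_file(FILE* fin, FILE* fout)
+ *   {
+ *       size_t const inSize  = ZSTD_DStreamInSize();
+ *       size_t const outSize = ZSTD_DStreamOutSize();
+ *       void* const inBuf  = malloc(inSize);
+ *       void* const outBuf = malloc(outSize);
+ *       ZSTD_DCtx* const dctx = ZSTD_createDCtx();
+ *       size_t read;
+ *       while ( (read = fread(inBuf, 1, inSize, fin)) > 0 ) {
+ *           ZSTD_inBuffer input = { inBuf, read, 0 };
+ *           while (input.pos < input.size) {
+ *               ZSTD_outBuffer output = { outBuf, outSize, 0 };
+ *               size_t const ret = ZSTD_decompressStream(dctx, &output, &input);
+ *               if (ZSTD_isError(ret)) goto cleanup;      // report the error in real code
+ *               fwrite(outBuf, 1, output.pos, fout);
+ *           }
+ *       }
+ *   cleanup:
+ *       ZSTD_freeDCtx(dctx);
+ *       free(inBuf); free(outBuf);
+ *   }
+ */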
+
+
+/**************************
+* Simple dictionary API
+***************************/
+/*! ZSTD_compress_usingDict() :
+ * Compression at an explicit compression level using a Dictionary.
+ * A dictionary can be any arbitrary data segment (also called a prefix),
+ * or a buffer with specified information (see dictBuilder/zdict.h).
+ * Note : This function loads the dictionary, resulting in significant startup delay.
+ * It's intended for a dictionary used only once.
+ * Note 2 : When `dict == NULL || dictSize < 8` no dictionary is used. */
+ZSTDLIB_API size_t ZSTD_compress_usingDict(ZSTD_CCtx* ctx,
+ void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize,
+ const void* dict,size_t dictSize,
+ int compressionLevel);
+
+/*! ZSTD_decompress_usingDict() :
+ * Decompression using a known Dictionary.
+ * Dictionary must be identical to the one used during compression.
+ * Note : This function loads the dictionary, resulting in significant startup delay.
+ * It's intended for a dictionary used only once.
+ * Note : When `dict == NULL || dictSize < 8` no dictionary is used. */
+ZSTDLIB_API size_t ZSTD_decompress_usingDict(ZSTD_DCtx* dctx,
+ void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize,
+ const void* dict,size_t dictSize);
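+
+/* Illustrative sketch : round trip with the simple dictionary API. The same `dict`
+ * buffer must be provided on both sides. Buffer names are hypothetical and error
+ * checks are omitted for brevity.
+ *
+ *   size_t const cSize = ZSTD_compress_usingDict(cctx, dst, dstCapacity,
+ *                                                src, srcSize,
+ *                                                dict, dictSize, 3);
+ *   size_t const dSize = ZSTD_decompress_usingDict(dctx, out, outCapacity,
+ *                                                  dst, cSize,
+ *                                                  dict, dictSize);
+ */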
+
+
+/***********************************
+ * Bulk processing dictionary API
+ **********************************/
+typedef struct ZSTD_CDict_s ZSTD_CDict;
+
+/*! ZSTD_createCDict() :
+ * When compressing multiple messages or blocks using the same dictionary,
+ * it's recommended to digest the dictionary only once, since it's a costly operation.
+ * ZSTD_createCDict() will create a state from digesting a dictionary.
+ * The resulting state can be used for future compression operations with very limited startup cost.
+ * ZSTD_CDict can be created once and shared by multiple threads concurrently, since its usage is read-only.
+ * @dictBuffer can be released after ZSTD_CDict creation, because its content is copied within CDict.
+ * Note 1 : Consider experimental function `ZSTD_createCDict_byReference()` if you prefer to not duplicate @dictBuffer content.
+ * Note 2 : A ZSTD_CDict can be created from an empty @dictBuffer,
+ * in which case the only thing that it transports is the @compressionLevel.
+ * This can be useful in a pipeline featuring ZSTD_compress_usingCDict() exclusively,
+ * expecting a ZSTD_CDict parameter with any data, including those without a known dictionary. */
+ZSTDLIB_API ZSTD_CDict* ZSTD_createCDict(const void* dictBuffer, size_t dictSize,
+ int compressionLevel);
+
+/*! ZSTD_freeCDict() :
+ * Function frees memory allocated by ZSTD_createCDict(). */
+ZSTDLIB_API size_t ZSTD_freeCDict(ZSTD_CDict* CDict);
+
+/*! ZSTD_compress_usingCDict() :
+ * Compression using a digested Dictionary.
+ * Recommended when same dictionary is used multiple times.
+ * Note : compression level is _decided at dictionary creation time_,
+ * and frame parameters are hardcoded (dictID=yes, contentSize=yes, checksum=no) */
+ZSTDLIB_API size_t ZSTD_compress_usingCDict(ZSTD_CCtx* cctx,
+ void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize,
+ const ZSTD_CDict* cdict);
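+
+/* Illustrative sketch : digest the dictionary once, then reuse it for many
+ * compression calls. Names are hypothetical; in real code the CDict would be
+ * kept alive for as long as the dictionary is needed.
+ *
+ *   ZSTD_CDict* const cdict = ZSTD_createCDict(dictBuf, dictSize, 3);
+ *   size_t const c1 = ZSTD_compress_usingCDict(cctx, dst1, cap1, src1, srcSize1, cdict);
+ *   size_t const c2 = ZSTD_compress_usingCDict(cctx, dst2, cap2, src2, srcSize2, cdict);
+ *   ZSTD_freeCDict(cdict);
+ */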
+
+
+typedef struct ZSTD_DDict_s ZSTD_DDict;
+
+/*! ZSTD_createDDict() :
+ * Create a digested dictionary, ready to start decompression operation without startup delay.
+ * dictBuffer can be released after DDict creation, as its content is copied inside DDict. */
+ZSTDLIB_API ZSTD_DDict* ZSTD_createDDict(const void* dictBuffer, size_t dictSize);
+
+/*! ZSTD_freeDDict() :
+ * Function frees memory allocated with ZSTD_createDDict() */
+ZSTDLIB_API size_t ZSTD_freeDDict(ZSTD_DDict* ddict);
+
+/*! ZSTD_decompress_usingDDict() :
+ * Decompression using a digested Dictionary.
+ * Recommended when same dictionary is used multiple times. */
+ZSTDLIB_API size_t ZSTD_decompress_usingDDict(ZSTD_DCtx* dctx,
+ void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize,
+ const ZSTD_DDict* ddict);
+
+
+/********************************
+ * Dictionary helper functions
+ *******************************/
+
+/*! ZSTD_getDictID_fromDict() :
+ * Provides the dictID stored within dictionary.
+ * if @return == 0, the dictionary is not conformant with Zstandard specification.
+ * It can still be loaded, but as a content-only dictionary. */
+ZSTDLIB_API unsigned ZSTD_getDictID_fromDict(const void* dict, size_t dictSize);
+
+/*! ZSTD_getDictID_fromDDict() :
+ * Provides the dictID of the dictionary loaded into `ddict`.
+ * If @return == 0, the dictionary is not conformant to Zstandard specification, or empty.
+ * Non-conformant dictionaries can still be loaded, but as content-only dictionaries. */
+ZSTDLIB_API unsigned ZSTD_getDictID_fromDDict(const ZSTD_DDict* ddict);
+
+/*! ZSTD_getDictID_fromFrame() :
+ * Provides the dictID required to decompress the frame stored within `src`.
+ * If @return == 0, the dictID could not be decoded.
+ * This could be for one of the following reasons :
+ * - The frame does not require a dictionary to be decoded (most common case).
+ * - The frame was built with dictID intentionally removed. Whatever dictionary is necessary is a hidden information.
+ * Note : this use case also happens when using a non-conformant dictionary.
+ * - `srcSize` is too small, and as a result, the frame header could not be decoded (only possible if `srcSize < ZSTD_FRAMEHEADERSIZE_MAX`).
+ * - This is not a Zstandard frame.
+ * When identifying the exact failure cause, it's possible to use ZSTD_getFrameHeader(), which will provide a more precise error code. */
+ZSTDLIB_API unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize);
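+
+/* Illustrative sketch : pick the right digested dictionary based on the dictID
+ * announced by the frame. `lookup_ddict_by_id()` is a hypothetical application
+ * function returning a previously created ZSTD_DDict*.
+ *
+ *   unsigned const id = ZSTD_getDictID_fromFrame(src, srcSize);
+ *   size_t dSize;
+ *   if (id != 0) {
+ *       const ZSTD_DDict* const ddict = lookup_ddict_by_id(id);
+ *       dSize = ZSTD_decompress_usingDDict(dctx, dst, dstCapacity, src, srcSize, ddict);
+ *   } else {
+ *       dSize = ZSTD_decompressDCtx(dctx, dst, dstCapacity, src, srcSize);
+ *   }
+ */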
+
+
+/*******************************************************************************
+ * Advanced dictionary and prefix API
+ *
+ * This API allows dictionaries to be used with ZSTD_compress2(),
+ * ZSTD_compressStream2(), and ZSTD_decompress(). Dictionaries are sticky, and
+ * only reset when the context is reset with ZSTD_reset_parameters or
+ * ZSTD_reset_session_and_parameters. Prefixes are single-use.
+ ******************************************************************************/
+
+
+/*! ZSTD_CCtx_loadDictionary() :
+ * Create an internal CDict from `dict` buffer.
+ * Decompression will have to use same dictionary.
+ * @result : 0, or an error code (which can be tested with ZSTD_isError()).
+ * Special: Loading a NULL (or 0-size) dictionary invalidates previous dictionary,
+ * meaning "return to no-dictionary mode".
+ * Note 1 : Dictionary is sticky, it will be used for all future compressed frames.
+ * To return to "no-dictionary" situation, load a NULL dictionary (or reset parameters).
+ * Note 2 : Loading a dictionary involves building tables.
+ * It's also a CPU consuming operation, with non-negligible impact on latency.
+ * Tables are dependent on compression parameters, and for this reason,
+ * compression parameters can no longer be changed after loading a dictionary.
+ * Note 3 :`dict` content will be copied internally.
+ * Use experimental ZSTD_CCtx_loadDictionary_byReference() to reference content instead.
+ * In such a case, dictionary buffer must outlive its users.
+ * Note 4 : Use ZSTD_CCtx_loadDictionary_advanced()
+ * to precisely select how dictionary content must be interpreted. */
+ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary(ZSTD_CCtx* cctx, const void* dict, size_t dictSize);
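+
+/* Illustrative sketch : load a dictionary once, then compress several frames with it.
+ * Parameters are set before loading, since they can no longer change afterwards.
+ * Buffer names are hypothetical and error checks are omitted.
+ *
+ *   ZSTD_CCtx_reset(cctx, ZSTD_reset_session_and_parameters);
+ *   ZSTD_CCtx_setParameter(cctx, ZSTD_c_compressionLevel, 5);
+ *   ZSTD_CCtx_loadDictionary(cctx, dictBuf, dictSize);   // sticky : reused for every following frame
+ *   size_t const c1 = ZSTD_compress2(cctx, dst1, cap1, src1, srcSize1);
+ *   size_t const c2 = ZSTD_compress2(cctx, dst2, cap2, src2, srcSize2);  // same dictionary, no reload
+ */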
+
+/*! ZSTD_CCtx_refCDict() :
+ * Reference a prepared dictionary, to be used for all next compressed frames.
+ * Note that compression parameters are enforced from within CDict,
+ * and supersede any compression parameter previously set within CCtx.
+ * The parameters ignored are labeled as "superseded-by-cdict" in the ZSTD_cParameter enum docs.
+ * The ignored parameters will be used again if the CCtx is returned to no-dictionary mode.
+ * The dictionary will remain valid for future compressed frames using same CCtx.
+ * @result : 0, or an error code (which can be tested with ZSTD_isError()).
+ * Special : Referencing a NULL CDict means "return to no-dictionary mode".
+ * Note 1 : Currently, only one dictionary can be managed.
+ * Referencing a new dictionary effectively "discards" any previous one.
+ * Note 2 : CDict is just referenced, its lifetime must outlive its usage within CCtx. */
+ZSTDLIB_API size_t ZSTD_CCtx_refCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict);
+
+/*! ZSTD_CCtx_refPrefix() :
+ * Reference a prefix (single-usage dictionary) for next compressed frame.
+ * A prefix is **only used once**. Tables are discarded at end of frame (ZSTD_e_end).
+ * Decompression will need same prefix to properly regenerate data.
+ * Compressing with a prefix is similar in outcome as performing a diff and compressing it,
+ * but performs much faster, especially during decompression (compression speed is tunable with compression level).
+ * @result : 0, or an error code (which can be tested with ZSTD_isError()).
+ * Special: Adding any prefix (including NULL) invalidates any previous prefix or dictionary
+ * Note 1 : Prefix buffer is referenced. It **must** outlive compression.
+ * Its content must remain unmodified during compression.
+ * Note 2 : If the intention is to diff some large src data blob with some prior version of itself,
+ * ensure that the window size is large enough to contain the entire source.
+ * See ZSTD_c_windowLog.
+ * Note 3 : Referencing a prefix involves building tables, which are dependent on compression parameters.
+ * It's a CPU consuming operation, with non-negligible impact on latency.
+ * If there is a need to use the same prefix multiple times, consider loadDictionary instead.
+ * Note 4 : By default, the prefix is interpreted as raw content (ZSTD_dct_rawContent).
+ * Use experimental ZSTD_CCtx_refPrefix_advanced() to alter dictionary interpretation. */
+ZSTDLIB_API size_t ZSTD_CCtx_refPrefix(ZSTD_CCtx* cctx,
+ const void* prefix, size_t prefixSize);
+
+/*! ZSTD_DCtx_loadDictionary() :
+ * Create an internal DDict from dict buffer,
+ * to be used to decompress next frames.
+ * The dictionary remains valid for all future frames, until explicitly invalidated.
+ * @result : 0, or an error code (which can be tested with ZSTD_isError()).
+ * Special : Adding a NULL (or 0-size) dictionary invalidates any previous dictionary,
+ * meaning "return to no-dictionary mode".
+ * Note 1 : Loading a dictionary involves building tables,
+ * which has a non-negligible impact on CPU usage and latency.
+ * It's recommended to "load once, use many times", to amortize the cost
+ * Note 2 :`dict` content will be copied internally, so `dict` can be released after loading.
+ * Use ZSTD_DCtx_loadDictionary_byReference() to reference dictionary content instead.
+ * Note 3 : Use ZSTD_DCtx_loadDictionary_advanced() to take control of
+ * how dictionary content is loaded and interpreted.
+ */
+ZSTDLIB_API size_t ZSTD_DCtx_loadDictionary(ZSTD_DCtx* dctx, const void* dict, size_t dictSize);
+
+/*! ZSTD_DCtx_refDDict() :
+ * Reference a prepared dictionary, to be used to decompress next frames.
+ * The dictionary remains active for decompression of future frames using same DCtx.
+ * @result : 0, or an error code (which can be tested with ZSTD_isError()).
+ * Note 1 : Currently, only one dictionary can be managed.
+ * Referencing a new dictionary effectively "discards" any previous one.
+ * Special: referencing a NULL DDict means "return to no-dictionary mode".
+ * Note 2 : DDict is just referenced, its lifetime must outlive its usage from DCtx.
+ */
+ZSTDLIB_API size_t ZSTD_DCtx_refDDict(ZSTD_DCtx* dctx, const ZSTD_DDict* ddict);
+
+/*! ZSTD_DCtx_refPrefix() :
+ * Reference a prefix (single-usage dictionary) to decompress next frame.
+ * This is the reverse operation of ZSTD_CCtx_refPrefix(),
+ * and must use the same prefix as the one used during compression.
+ * Prefix is **only used once**. Reference is discarded at end of frame.
+ * End of frame is reached when ZSTD_decompressStream() returns 0.
+ * @result : 0, or an error code (which can be tested with ZSTD_isError()).
+ * Note 1 : Adding any prefix (including NULL) invalidates any previously set prefix or dictionary
+ * Note 2 : Prefix buffer is referenced. It **must** outlive decompression.
+ * Prefix buffer must remain unmodified up to the end of frame,
+ * reached when ZSTD_decompressStream() returns 0.
+ * Note 3 : By default, the prefix is treated as raw content (ZSTD_dct_rawContent).
+ * Use ZSTD_CCtx_refPrefix_advanced() to alter dictMode (Experimental section)
+ * Note 4 : Referencing a raw content prefix has almost no cpu nor memory cost.
+ * A full dictionary is more costly, as it requires building tables.
+ */
+ZSTDLIB_API size_t ZSTD_DCtx_refPrefix(ZSTD_DCtx* dctx,
+ const void* prefix, size_t prefixSize);
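+
+/* Illustrative sketch : use a previous version of a document as a single-use prefix,
+ * on both the compression and decompression sides. Buffer names are hypothetical;
+ * both prefix references must stay valid and unmodified until their frame completes.
+ *
+ *   // compression side
+ *   ZSTD_CCtx_refPrefix(cctx, prevVersion, prevSize);
+ *   size_t const cSize = ZSTD_compress2(cctx, dst, dstCapacity, newVersion, newSize);
+ *   // decompression side : must reference the exact same prefix
+ *   ZSTD_DCtx_refPrefix(dctx, prevVersion, prevSize);
+ *   size_t const rSize = ZSTD_decompressDCtx(dctx, out, outCapacity, dst, cSize);
+ */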
+
+/* === Memory management === */
+
+/*! ZSTD_sizeof_*() :
+ * These functions give the _current_ memory usage of selected object.
+ * Note that object memory usage can evolve (increase or decrease) over time. */
+ZSTDLIB_API size_t ZSTD_sizeof_CCtx(const ZSTD_CCtx* cctx);
+ZSTDLIB_API size_t ZSTD_sizeof_DCtx(const ZSTD_DCtx* dctx);
+ZSTDLIB_API size_t ZSTD_sizeof_CStream(const ZSTD_CStream* zcs);
+ZSTDLIB_API size_t ZSTD_sizeof_DStream(const ZSTD_DStream* zds);
+ZSTDLIB_API size_t ZSTD_sizeof_CDict(const ZSTD_CDict* cdict);
+ZSTDLIB_API size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict);
+
+#endif /* ZSTD_H_235446 */
+
+
+/* **************************************************************************************
+ * ADVANCED AND EXPERIMENTAL FUNCTIONS
+ ****************************************************************************************
+ * The definitions in the following section are considered experimental.
+ * They are provided for advanced scenarios.
+ * They should never be used with a dynamic library, as prototypes may change in the future.
+ * Use them only in association with static linking.
+ * ***************************************************************************************/
+
+#if defined(ZSTD_STATIC_LINKING_ONLY) && !defined(ZSTD_H_ZSTD_STATIC_LINKING_ONLY)
+#define ZSTD_H_ZSTD_STATIC_LINKING_ONLY
+
+/****************************************************************************************
+ * experimental API (static linking only)
+ ****************************************************************************************
+ * The following symbols and constants
+ * are not planned to join "stable API" status in the near future.
+ * They can still change in future versions.
+ * Some of them are planned to remain in the static_only section indefinitely.
+ * Some of them might be removed in the future (especially when redundant with existing stable functions)
+ * ***************************************************************************************/
+
+#define ZSTD_FRAMEHEADERSIZE_PREFIX(format) ((format) == ZSTD_f_zstd1 ? 5 : 1) /* minimum input size required to query frame header size */
+#define ZSTD_FRAMEHEADERSIZE_MIN(format) ((format) == ZSTD_f_zstd1 ? 6 : 2)
+#define ZSTD_FRAMEHEADERSIZE_MAX 18 /* can be useful for static allocation */
+#define ZSTD_SKIPPABLEHEADERSIZE 8
+
+/* compression parameter bounds */
+#define ZSTD_WINDOWLOG_MAX_32 30
+#define ZSTD_WINDOWLOG_MAX_64 31
+#define ZSTD_WINDOWLOG_MAX ((int)(sizeof(size_t) == 4 ? ZSTD_WINDOWLOG_MAX_32 : ZSTD_WINDOWLOG_MAX_64))
+#define ZSTD_WINDOWLOG_MIN 10
+#define ZSTD_HASHLOG_MAX ((ZSTD_WINDOWLOG_MAX < 30) ? ZSTD_WINDOWLOG_MAX : 30)
+#define ZSTD_HASHLOG_MIN 6
+#define ZSTD_CHAINLOG_MAX_32 29
+#define ZSTD_CHAINLOG_MAX_64 30
+#define ZSTD_CHAINLOG_MAX ((int)(sizeof(size_t) == 4 ? ZSTD_CHAINLOG_MAX_32 : ZSTD_CHAINLOG_MAX_64))
+#define ZSTD_CHAINLOG_MIN ZSTD_HASHLOG_MIN
+#define ZSTD_SEARCHLOG_MAX (ZSTD_WINDOWLOG_MAX-1)
+#define ZSTD_SEARCHLOG_MIN 1
+#define ZSTD_MINMATCH_MAX 7 /* only for ZSTD_fast, other strategies are limited to 6 */
+#define ZSTD_MINMATCH_MIN 3 /* only for ZSTD_btopt+, faster strategies are limited to 4 */
+#define ZSTD_TARGETLENGTH_MAX ZSTD_BLOCKSIZE_MAX
+#define ZSTD_TARGETLENGTH_MIN 0 /* note : comparing this constant to an unsigned results in a tautological test */
+#define ZSTD_STRATEGY_MIN ZSTD_fast
+#define ZSTD_STRATEGY_MAX ZSTD_btultra2
+
+
+#define ZSTD_OVERLAPLOG_MIN 0
+#define ZSTD_OVERLAPLOG_MAX 9
+
+#define ZSTD_WINDOWLOG_LIMIT_DEFAULT 27 /* by default, the streaming decoder will refuse any frame
+ * requiring larger than (1<<ZSTD_WINDOWLOG_LIMIT_DEFAULT) window size,
+ * to preserve host's memory from unreasonable requirements.
+ * This limit can be overridden using ZSTD_DCtx_setParameter(,ZSTD_d_windowLogMax,).
+ * The limit does not apply for one-pass decoders (such as ZSTD_decompress()), since no additional memory is allocated */
+
+
+/* LDM parameter bounds */
+#define ZSTD_LDM_HASHLOG_MIN ZSTD_HASHLOG_MIN
+#define ZSTD_LDM_HASHLOG_MAX ZSTD_HASHLOG_MAX
+#define ZSTD_LDM_MINMATCH_MIN 4
+#define ZSTD_LDM_MINMATCH_MAX 4096
+#define ZSTD_LDM_BUCKETSIZELOG_MIN 1
+#define ZSTD_LDM_BUCKETSIZELOG_MAX 8
+#define ZSTD_LDM_HASHRATELOG_MIN 0
+#define ZSTD_LDM_HASHRATELOG_MAX (ZSTD_WINDOWLOG_MAX - ZSTD_HASHLOG_MIN)
+
+/* Advanced parameter bounds */
+#define ZSTD_TARGETCBLOCKSIZE_MIN 64
+#define ZSTD_TARGETCBLOCKSIZE_MAX ZSTD_BLOCKSIZE_MAX
+#define ZSTD_SRCSIZEHINT_MIN 0
+#define ZSTD_SRCSIZEHINT_MAX INT_MAX
+
+/* internal */
+#define ZSTD_HASHLOG3_MAX 17
+
+
+/* --- Advanced types --- */
+
+typedef struct ZSTD_CCtx_params_s ZSTD_CCtx_params;
+
+typedef struct {
+ unsigned int matchPos; /* Match pos in dst */
+ /* If seqDef.offset > 3, then this is seqDef.offset - 3
+ * If seqDef.offset < 3, then this is the corresponding repeat offset
+ * But if seqDef.offset < 3 and litLength == 0, this is the
+ * repeat offset before the corresponding repeat offset
+ * And if seqDef.offset == 3 and litLength == 0, this is the
+ * most recent repeat offset - 1
+ */
+ unsigned int offset;
+ unsigned int litLength; /* Literal length */
+ unsigned int matchLength; /* Match length */
+ /* 0 when seq not rep and seqDef.offset otherwise
+ * when litLength == 0 this will be <= 4, otherwise <= 3 like normal
+ */
+ unsigned int rep;
+} ZSTD_Sequence;
+
+typedef struct {
+ unsigned windowLog; /**< largest match distance : larger == more compression, more memory needed during decompression */
+ unsigned chainLog; /**< fully searched segment : larger == more compression, slower, more memory (useless for fast) */
+ unsigned hashLog; /**< dispatch table : larger == faster, more memory */
+ unsigned searchLog; /**< nb of searches : larger == more compression, slower */
+ unsigned minMatch; /**< match length searched : larger == faster decompression, sometimes less compression */
+ unsigned targetLength; /**< acceptable match size for optimal parser (only) : larger == more compression, slower */
+ ZSTD_strategy strategy; /**< see ZSTD_strategy definition above */
+} ZSTD_compressionParameters;
+
+typedef struct {
+ int contentSizeFlag; /**< 1: content size will be in frame header (when known) */
+ int checksumFlag; /**< 1: generate a 32-bits checksum using XXH64 algorithm at end of frame, for error detection */
+ int noDictIDFlag; /**< 1: no dictID will be saved into frame header (dictID is only useful for dictionary compression) */
+} ZSTD_frameParameters;
+
+typedef struct {
+ ZSTD_compressionParameters cParams;
+ ZSTD_frameParameters fParams;
+} ZSTD_parameters;
+
+typedef enum {
+ ZSTD_dct_auto = 0, /* dictionary is "full" when starting with ZSTD_MAGIC_DICTIONARY, otherwise it is "rawContent" */
+ ZSTD_dct_rawContent = 1, /* ensures dictionary is always loaded as rawContent, even if it starts with ZSTD_MAGIC_DICTIONARY */
+ ZSTD_dct_fullDict = 2 /* refuses to load a dictionary if it does not respect Zstandard's specification, starting with ZSTD_MAGIC_DICTIONARY */
+} ZSTD_dictContentType_e;
+
+typedef enum {
+ ZSTD_dlm_byCopy = 0, /**< Copy dictionary content internally */
+ ZSTD_dlm_byRef = 1 /**< Reference dictionary content -- the dictionary buffer must outlive its users. */
+} ZSTD_dictLoadMethod_e;
+
+typedef enum {
+ ZSTD_f_zstd1 = 0, /* zstd frame format, specified in zstd_compression_format.md (default) */
+ ZSTD_f_zstd1_magicless = 1 /* Variant of zstd frame format, without initial 4-bytes magic number.
+ * Useful to save 4 bytes per generated frame.
+ * Decoder cannot automatically recognise this format; it must be explicitly instructed to expect it. */
+} ZSTD_format_e;
+
+typedef enum {
+ /* Note: this enum and the behavior it controls are effectively internal
+ * implementation details of the compressor. They are expected to continue
+ * to evolve and should be considered only in the context of extremely
+ * advanced performance tuning.
+ *
+ * Zstd currently supports the use of a CDict in three ways:
+ *
+ * - The contents of the CDict can be copied into the working context. This
+ * means that the compression can search both the dictionary and input
+ * while operating on a single set of internal tables. This makes
+ * the compression faster per-byte of input. However, the initial copy of
+ * the CDict's tables incurs a fixed cost at the beginning of the
+ * compression. For small compressions (< 8 KB), that copy can dominate
+ * the cost of the compression.
+ *
+ * - The CDict's tables can be used in-place. In this model, compression is
+ * slower per input byte, because the compressor has to search two sets of
+ * tables. However, this model incurs no start-up cost (as long as the
+ * working context's tables can be reused). For small inputs, this can be
+ * faster than copying the CDict's tables.
+ *
+ * - The CDict's tables are not used at all, and instead we use the working
+ * context alone to reload the dictionary and use params based on the source
+ * size. See ZSTD_compress_insertDictionary() and ZSTD_compress_usingDict().
+ * This method is effective when the dictionary sizes are very small relative
+ * to the input size, and the input size is fairly large to begin with.
+ *
+ * Zstd has a simple internal heuristic that selects which strategy to use
+ * at the beginning of a compression. However, if experimentation shows that
+ * Zstd is making poor choices, it is possible to override that choice with
+ * this enum.
+ */
+ ZSTD_dictDefaultAttach = 0, /* Use the default heuristic. */
+ ZSTD_dictForceAttach = 1, /* Never copy the dictionary. */
+ ZSTD_dictForceCopy = 2, /* Always copy the dictionary. */
+ ZSTD_dictForceLoad = 3 /* Always reload the dictionary */
+} ZSTD_dictAttachPref_e;
+
+typedef enum {
+ ZSTD_lcm_auto = 0, /**< Automatically determine the compression mode based on the compression level.
+ * Negative compression levels will be uncompressed, and positive compression
+ * levels will be compressed. */
+ ZSTD_lcm_huffman = 1, /**< Always attempt Huffman compression. Uncompressed literals will still be
+ * emitted if Huffman compression is not profitable. */
+ ZSTD_lcm_uncompressed = 2 /**< Always emit uncompressed literals. */
+} ZSTD_literalCompressionMode_e;
+
+
+/***************************************
+* Frame size functions
+***************************************/
+
+/*! ZSTD_findDecompressedSize() :
+ * `src` should point to the start of a series of ZSTD encoded and/or skippable frames
+ * `srcSize` must be the _exact_ size of this series
+ * (i.e. there should be a frame boundary at `src + srcSize`)
+ * @return : - decompressed size of all data in all successive frames
+ * - if the decompressed size cannot be determined: ZSTD_CONTENTSIZE_UNKNOWN
+ * - if an error occurred: ZSTD_CONTENTSIZE_ERROR
+ *
+ * note 1 : decompressed size is an optional field, that may not be present, especially in streaming mode.
+ * When `return==ZSTD_CONTENTSIZE_UNKNOWN`, data to decompress could be any size.
+ * In which case, it's necessary to use streaming mode to decompress data.
+ * note 2 : decompressed size is always present when compression is done with ZSTD_compress()
+ * note 3 : decompressed size can be very large (64-bits value),
+ * potentially larger than what local system can handle as a single memory segment.
+ * In which case, it's necessary to use streaming mode to decompress data.
+ * note 4 : If source is untrusted, decompressed size could be wrong or intentionally modified.
+ * Always ensure result fits within application's authorized limits.
+ * Each application can set its own limits.
+ * note 5 : ZSTD_findDecompressedSize handles multiple frames, and so it must traverse the input to
+ * read each contained frame header. This is fast as most of the data is skipped,
+ * however it does mean that all frame data must be present and valid. */
+ZSTDLIB_API unsigned long long ZSTD_findDecompressedSize(const void* src, size_t srcSize);
+
+/*! ZSTD_decompressBound() :
+ * `src` should point to the start of a series of ZSTD encoded and/or skippable frames
+ * `srcSize` must be the _exact_ size of this series
+ * (i.e. there should be a frame boundary at `src + srcSize`)
+ * @return : - upper-bound for the decompressed size of all data in all successive frames
+ * - if an error occurred: ZSTD_CONTENTSIZE_ERROR
+ *
+ * note 1 : an error can occur if `src` contains an invalid or incorrectly formatted frame.
+ * note 2 : the upper-bound is exact when the decompressed size field is available in every ZSTD encoded frame of `src`.
+ * in this case, `ZSTD_findDecompressedSize` and `ZSTD_decompressBound` return the same value.
+ * note 3 : when the decompressed size field isn't available, the upper-bound for that frame is calculated by:
+ * upper-bound = # blocks * min(128 KB, Window_Size)
+ */
+ZSTDLIB_API unsigned long long ZSTD_decompressBound(const void* src, size_t srcSize);
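+
+/* Illustrative sketch : size a destination buffer from the decompression upper bound,
+ * while still enforcing an application-defined limit. `MY_MAX_OUTPUT` is a hypothetical
+ * application constant; allocation checks are omitted.
+ *
+ *   unsigned long long const bound = ZSTD_decompressBound(src, srcSize);
+ *   if (bound != ZSTD_CONTENTSIZE_ERROR && bound <= MY_MAX_OUTPUT) {
+ *       void* const dst = malloc((size_t)bound);
+ *       size_t const dSize = ZSTD_decompress(dst, (size_t)bound, src, srcSize);
+ *       // use dst[0..dSize) if !ZSTD_isError(dSize)
+ *       free(dst);
+ *   }
+ */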
+
+/*! ZSTD_frameHeaderSize() :
+ * srcSize must be >= ZSTD_FRAMEHEADERSIZE_PREFIX.
+ * @return : size of the Frame Header,
+ * or an error code (if srcSize is too small) */
+ZSTDLIB_API size_t ZSTD_frameHeaderSize(const void* src, size_t srcSize);
+
+/*! ZSTD_getSequences() :
+ * Extract sequences from the sequence store
+ * zc can be used to insert custom compression params.
+ * This function invokes ZSTD_compress2
+ * @return : number of sequences extracted
+ */
+ZSTDLIB_API size_t ZSTD_getSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs,
+ size_t outSeqsSize, const void* src, size_t srcSize);
+
+
+/***************************************
+* Memory management
+***************************************/
+
+/*! ZSTD_estimate*() :
+ * These functions make it possible to estimate memory usage
+ * of a future {D,C}Ctx, before its creation.
+ *
+ * ZSTD_estimateCCtxSize() will provide a memory budget large enough
+ * for any compression level up to selected one.
+ * Note : Unlike ZSTD_estimateCStreamSize*(), this estimate
+ * does not include space for a window buffer.
+ * Therefore, the estimation is only guaranteed for single-shot compressions, not streaming.
+ * The estimate will assume the input may be arbitrarily large,
+ * which is the worst case.
+ *
+ * When srcSize can be bound by a known and rather "small" value,
+ * this fact can be used to provide a tighter estimation
+ * because the CCtx compression context will need less memory.
+ * This tighter estimation can be provided by more advanced functions
+ * ZSTD_estimateCCtxSize_usingCParams(), which can be used in tandem with ZSTD_getCParams(),
+ * and ZSTD_estimateCCtxSize_usingCCtxParams(), which can be used in tandem with ZSTD_CCtxParams_setParameter().
+ * Both can be used to estimate memory using custom compression parameters and arbitrary srcSize limits.
+ *
+ * Note 2 : only single-threaded compression is supported.
+ * ZSTD_estimateCCtxSize_usingCCtxParams() will return an error code if ZSTD_c_nbWorkers is >= 1.
+ */
+ZSTDLIB_API size_t ZSTD_estimateCCtxSize(int compressionLevel);
+ZSTDLIB_API size_t ZSTD_estimateCCtxSize_usingCParams(ZSTD_compressionParameters cParams);
+ZSTDLIB_API size_t ZSTD_estimateCCtxSize_usingCCtxParams(const ZSTD_CCtx_params* params);
+ZSTDLIB_API size_t ZSTD_estimateDCtxSize(void);
+
+/*! ZSTD_estimateCStreamSize() :
+ * ZSTD_estimateCStreamSize() will provide a budget large enough for any compression level up to selected one.
+ * It will also consider src size to be arbitrarily "large", which is worst case.
+ * If srcSize is known to always be small, ZSTD_estimateCStreamSize_usingCParams() can provide a tighter estimation.
+ * ZSTD_estimateCStreamSize_usingCParams() can be used in tandem with ZSTD_getCParams() to create cParams from compressionLevel.
+ * ZSTD_estimateCStreamSize_usingCCtxParams() can be used in tandem with ZSTD_CCtxParams_setParameter().
+ * Only single-threaded compression is supported; this function will return an error code if ZSTD_c_nbWorkers is >= 1.
+ * Note : CStream size estimation is only correct for single-threaded compression.
+ * ZSTD_DStream memory budget depends on window Size.
+ * This information can be passed manually, using ZSTD_estimateDStreamSize,
+ * or deducted from a valid frame Header, using ZSTD_estimateDStreamSize_fromFrame();
+ * Note : if streaming is init with function ZSTD_init?Stream_usingDict(),
+ * an internal ?Dict will be created, whose additional size is not estimated here.
+ * In this case, get total size by adding ZSTD_estimate?DictSize */
+ZSTDLIB_API size_t ZSTD_estimateCStreamSize(int compressionLevel);
+ZSTDLIB_API size_t ZSTD_estimateCStreamSize_usingCParams(ZSTD_compressionParameters cParams);
+ZSTDLIB_API size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCtx_params* params);
+ZSTDLIB_API size_t ZSTD_estimateDStreamSize(size_t windowSize);
+ZSTDLIB_API size_t ZSTD_estimateDStreamSize_fromFrame(const void* src, size_t srcSize);
+
+/*! ZSTD_estimate?DictSize() :
+ * ZSTD_estimateCDictSize() will bet that src size is relatively "small", and content is copied, like ZSTD_createCDict().
+ * ZSTD_estimateCDictSize_advanced() makes it possible to control compression parameters precisely, like ZSTD_createCDict_advanced().
+ * Note : dictionaries created by reference (`ZSTD_dlm_byRef`) are logically smaller.
+ */
+ZSTDLIB_API size_t ZSTD_estimateCDictSize(size_t dictSize, int compressionLevel);
+ZSTDLIB_API size_t ZSTD_estimateCDictSize_advanced(size_t dictSize, ZSTD_compressionParameters cParams, ZSTD_dictLoadMethod_e dictLoadMethod);
+ZSTDLIB_API size_t ZSTD_estimateDDictSize(size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod);
+
+/*! ZSTD_initStatic*() :
+ * Initialize an object using a pre-allocated fixed-size buffer.
+ * workspace: The memory area to emplace the object into.
+ * Provided pointer *must be 8-bytes aligned*.
+ * Buffer must outlive object.
+ * workspaceSize: Use ZSTD_estimate*Size() to determine
+ * how large workspace must be to support target scenario.
+ * @return : pointer to object (same address as workspace, just different type),
+ * or NULL if error (size too small, incorrect alignment, etc.)
+ * Note : zstd will never resize nor malloc() when using a static buffer.
+ * If the object requires more memory than available,
+ * zstd will just error out (typically ZSTD_error_memory_allocation).
+ * Note 2 : there is no corresponding "free" function.
+ * Since workspace is allocated externally, it must be freed externally too.
+ * Note 3 : cParams : use ZSTD_getCParams() to convert a compression level
+ * into its associated cParams.
+ * Limitation 1 : currently not compatible with internal dictionary creation, triggered by
+ * ZSTD_CCtx_loadDictionary(), ZSTD_initCStream_usingDict() or ZSTD_initDStream_usingDict().
+ * Limitation 2 : static cctx currently not compatible with multi-threading.
+ * Limitation 3 : static dctx is incompatible with legacy support.
+ */
+ZSTDLIB_API ZSTD_CCtx* ZSTD_initStaticCCtx(void* workspace, size_t workspaceSize);
+ZSTDLIB_API ZSTD_CStream* ZSTD_initStaticCStream(void* workspace, size_t workspaceSize); /**< same as ZSTD_initStaticCCtx() */
+
+ZSTDLIB_API ZSTD_DCtx* ZSTD_initStaticDCtx(void* workspace, size_t workspaceSize);
+ZSTDLIB_API ZSTD_DStream* ZSTD_initStaticDStream(void* workspace, size_t workspaceSize); /**< same as ZSTD_initStaticDCtx() */
+
+ZSTDLIB_API const ZSTD_CDict* ZSTD_initStaticCDict(
+ void* workspace, size_t workspaceSize,
+ const void* dict, size_t dictSize,
+ ZSTD_dictLoadMethod_e dictLoadMethod,
+ ZSTD_dictContentType_e dictContentType,
+ ZSTD_compressionParameters cParams);
+
+ZSTDLIB_API const ZSTD_DDict* ZSTD_initStaticDDict(
+ void* workspace, size_t workspaceSize,
+ const void* dict, size_t dictSize,
+ ZSTD_dictLoadMethod_e dictLoadMethod,
+ ZSTD_dictContentType_e dictContentType);
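+
+/* Illustrative sketch : run a compression context inside a caller-provided workspace.
+ * `malloc()` stands in for any suitably aligned allocator; buffer names are
+ * hypothetical and error checks are omitted.
+ *
+ *   size_t const wkspSize = ZSTD_estimateCCtxSize(3);        // budget for level 3, single-shot
+ *   void* const wksp = malloc(wkspSize);                     // must be 8-bytes aligned
+ *   ZSTD_CCtx* const cctx = ZSTD_initStaticCCtx(wksp, wkspSize);
+ *   if (cctx != NULL) {
+ *       size_t const cSize = ZSTD_compressCCtx(cctx, dst, dstCapacity, src, srcSize, 3);
+ *       // no ZSTD_freeCCtx() : the caller owns the workspace
+ *   }
+ *   free(wksp);
+ */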
+
+
+/*! Custom memory allocation :
+ * These prototypes make it possible to pass your own allocation/free functions.
+ * ZSTD_customMem is provided at creation time, using ZSTD_create*_advanced() variants listed below.
+ * All allocation/free operations will be completed using these custom variants instead of regular <stdlib.h> ones.
+ */
+typedef void* (*ZSTD_allocFunction) (void* opaque, size_t size);
+typedef void (*ZSTD_freeFunction) (void* opaque, void* address);
+typedef struct { ZSTD_allocFunction customAlloc; ZSTD_freeFunction customFree; void* opaque; } ZSTD_customMem;
+static ZSTD_customMem const ZSTD_defaultCMem = { NULL, NULL, NULL }; /**< this constant defers to stdlib's functions */
+
+ZSTDLIB_API ZSTD_CCtx* ZSTD_createCCtx_advanced(ZSTD_customMem customMem);
+ZSTDLIB_API ZSTD_CStream* ZSTD_createCStream_advanced(ZSTD_customMem customMem);
+ZSTDLIB_API ZSTD_DCtx* ZSTD_createDCtx_advanced(ZSTD_customMem customMem);
+ZSTDLIB_API ZSTD_DStream* ZSTD_createDStream_advanced(ZSTD_customMem customMem);
+
+ZSTDLIB_API ZSTD_CDict* ZSTD_createCDict_advanced(const void* dict, size_t dictSize,
+ ZSTD_dictLoadMethod_e dictLoadMethod,
+ ZSTD_dictContentType_e dictContentType,
+ ZSTD_compressionParameters cParams,
+ ZSTD_customMem customMem);
+
+ZSTDLIB_API ZSTD_DDict* ZSTD_createDDict_advanced(const void* dict, size_t dictSize,
+ ZSTD_dictLoadMethod_e dictLoadMethod,
+ ZSTD_dictContentType_e dictContentType,
+ ZSTD_customMem customMem);
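+
+/* Illustrative sketch : route all internal allocations through custom functions.
+ * The wrappers below simply forward to stdlib, but could target an arena or pool.
+ *
+ *   static void* my_alloc(void* opaque, size_t size) { (void)opaque; return malloc(size); }
+ *   static void  my_free (void* opaque, void* addr)  { (void)opaque; free(addr); }
+ *
+ *   ZSTD_customMem const cmem = { my_alloc, my_free, NULL };
+ *   ZSTD_CCtx* const cctx = ZSTD_createCCtx_advanced(cmem);
+ *   // ... use cctx as usual ...
+ *   ZSTD_freeCCtx(cctx);   // releases memory through my_free()
+ */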
+
+
+
+/***************************************
+* Advanced compression functions
+***************************************/
+
+/*! ZSTD_createCDict_byReference() :
+ * Create a digested dictionary for compression
+ * Dictionary content is just referenced, not duplicated.
+ * As a consequence, `dictBuffer` **must** outlive CDict,
+ * and its content must remain unmodified throughout the lifetime of CDict.
+ * note: equivalent to ZSTD_createCDict_advanced(), with dictLoadMethod==ZSTD_dlm_byRef */
+ZSTDLIB_API ZSTD_CDict* ZSTD_createCDict_byReference(const void* dictBuffer, size_t dictSize, int compressionLevel);
+
+/*! ZSTD_getCParams() :
+ * @return ZSTD_compressionParameters structure for a selected compression level and estimated srcSize.
+ * `estimatedSrcSize` value is optional, select 0 if not known */
+ZSTDLIB_API ZSTD_compressionParameters ZSTD_getCParams(int compressionLevel, unsigned long long estimatedSrcSize, size_t dictSize);
+
+/*! ZSTD_getParams() :
+ * same as ZSTD_getCParams(), but @return a full `ZSTD_parameters` object instead of sub-component `ZSTD_compressionParameters`.
+ * All fields of `ZSTD_frameParameters` are set to default : contentSize=1, checksum=0, noDictID=0 */
+ZSTDLIB_API ZSTD_parameters ZSTD_getParams(int compressionLevel, unsigned long long estimatedSrcSize, size_t dictSize);
+
+/*! ZSTD_checkCParams() :
+ * Ensure param values remain within authorized range.
+ * @return 0 on success, or an error code (can be checked with ZSTD_isError()) */
+ZSTDLIB_API size_t ZSTD_checkCParams(ZSTD_compressionParameters params);
+
+/*! ZSTD_adjustCParams() :
+ * optimize params for a given `srcSize` and `dictSize`.
+ * `srcSize` can be unknown, in which case use ZSTD_CONTENTSIZE_UNKNOWN.
+ * `dictSize` must be `0` when there is no dictionary.
+ * cPar can be invalid : all parameters will be clamped within valid range in the @return struct.
+ * This function never fails (wide contract) */
+ZSTDLIB_API ZSTD_compressionParameters ZSTD_adjustCParams(ZSTD_compressionParameters cPar, unsigned long long srcSize, size_t dictSize);
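+
+/* Illustrative sketch : derive compression parameters for a known input profile,
+ * clamp them, and build a CDict with them. `srcSizeEstimate`, `dictBuf` and
+ * `dictSize` are hypothetical caller-provided values.
+ *
+ *   ZSTD_compressionParameters cParams = ZSTD_getCParams(3, srcSizeEstimate, dictSize);
+ *   cParams = ZSTD_adjustCParams(cParams, srcSizeEstimate, dictSize);
+ *   ZSTD_CDict* const cdict = ZSTD_createCDict_advanced(dictBuf, dictSize,
+ *                                                       ZSTD_dlm_byCopy, ZSTD_dct_auto,
+ *                                                       cParams, ZSTD_defaultCMem);
+ */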
+
+/*! ZSTD_compress_advanced() :
+ * Note : this function is now DEPRECATED.
+ * It can be replaced by ZSTD_compress2(), in combination with ZSTD_CCtx_setParameter() and other parameter setters.
+ * This prototype will be marked as deprecated and generate compilation warning on reaching v1.5.x */
+ZSTDLIB_API size_t ZSTD_compress_advanced(ZSTD_CCtx* cctx,
+ void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize,
+ const void* dict,size_t dictSize,
+ ZSTD_parameters params);
+
+/*! ZSTD_compress_usingCDict_advanced() :
+ * Note : this function is now REDUNDANT.
+ * It can be replaced by ZSTD_compress2(), in combination with ZSTD_CCtx_loadDictionary() and other parameter setters.
+ * This prototype will be marked as deprecated and generate compilation warning in some future version */
+ZSTDLIB_API size_t ZSTD_compress_usingCDict_advanced(ZSTD_CCtx* cctx,
+ void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize,
+ const ZSTD_CDict* cdict,
+ ZSTD_frameParameters fParams);
+
+
+/*! ZSTD_CCtx_loadDictionary_byReference() :
+ * Same as ZSTD_CCtx_loadDictionary(), but dictionary content is referenced, instead of being copied into CCtx.
+ * It saves some memory, but also requires that `dict` outlives its usage within `cctx` */
+ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary_byReference(ZSTD_CCtx* cctx, const void* dict, size_t dictSize);
+
+/*! ZSTD_CCtx_loadDictionary_advanced() :
+ * Same as ZSTD_CCtx_loadDictionary(), but gives finer control over
+ * how to load the dictionary (by copy ? by reference ?)
+ * and how to interpret it (automatic ? force raw mode ? full mode only ?) */
+ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary_advanced(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_dictContentType_e dictContentType);
+
+/*! ZSTD_CCtx_refPrefix_advanced() :
+ * Same as ZSTD_CCtx_refPrefix(), but gives finer control over
+ * how to interpret prefix content (automatic ? force raw mode (default) ? full mode only ?) */
+ZSTDLIB_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const void* prefix, size_t prefixSize, ZSTD_dictContentType_e dictContentType);
+
+/* === experimental parameters === */
+/* these parameters can be used with ZSTD_setParameter()
+ * they are not guaranteed to remain supported in the future */
+
+ /* Enables rsyncable mode,
+ * which makes compressed files more rsync friendly
+ * by adding periodic synchronization points to the compressed data.
+ * The target average block size is ZSTD_c_jobSize / 2.
+ * It's possible to modify the job size to increase or decrease
+ * the granularity of the synchronization point.
+ * Once the jobSize is smaller than the window size,
+ * it will result in compression ratio degradation.
+ * NOTE 1: rsyncable mode only works when multithreading is enabled.
+ * NOTE 2: rsyncable performs poorly in combination with long range mode,
+ * since it will decrease the effectiveness of synchronization points,
+ * though mileage may vary.
+ * NOTE 3: Rsyncable mode limits maximum compression speed to ~400 MB/s.
+ * If the selected compression level is already running significantly slower,
+ * the overall speed won't be significantly impacted.
+ */
+ #define ZSTD_c_rsyncable ZSTD_c_experimentalParam1
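+/* Illustrative sketch (not part of the upstream documentation) :
+ * enabling rsyncable mode. nbWorkers must be >= 1 (see NOTE 1 above); the value 4 is arbitrary.
+ *
+ *   ZSTD_CCtx* const cctx = ZSTD_createCCtx();
+ *   ZSTD_CCtx_setParameter(cctx, ZSTD_c_nbWorkers, 4);   // multithreading required
+ *   ZSTD_CCtx_setParameter(cctx, ZSTD_c_rsyncable, 1);
+ *   // ... compress with ZSTD_compressStream2() as usual, then ZSTD_freeCCtx(cctx) ...
+ */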
+
+/* Select a compression format.
+ * The value must be of type ZSTD_format_e.
+ * See ZSTD_format_e enum definition for details */
+#define ZSTD_c_format ZSTD_c_experimentalParam2
+
+/* Force back-reference distances to remain < windowSize,
+ * even when referencing into Dictionary content (default:0) */
+#define ZSTD_c_forceMaxWindow ZSTD_c_experimentalParam3
+
+/* Controls whether the contents of a CDict
+ * are used in place, or copied into the working context.
+ * Accepts values from the ZSTD_dictAttachPref_e enum.
+ * See the comments on that enum for an explanation of the feature. */
+#define ZSTD_c_forceAttachDict ZSTD_c_experimentalParam4
+
+/* Controls how the literals are compressed (default is auto).
+ * The value must be of type ZSTD_literalCompressionMode_e.
+ * See ZSTD_literalCompressionMode_e enum definition for details.
+ */
+#define ZSTD_c_literalCompressionMode ZSTD_c_experimentalParam5
+
+/* Tries to fit compressed block size to be around targetCBlockSize.
+ * No target when targetCBlockSize == 0.
+ * There is no guarantee on compressed block size (default:0) */
+#define ZSTD_c_targetCBlockSize ZSTD_c_experimentalParam6
+
+/* User's best guess of source size.
+ * Hint is not valid when srcSizeHint == 0.
+ * There is no guarantee that hint is close to actual source size,
+ * but compression ratio may regress significantly if the guess considerably underestimates the actual source size */
+#define ZSTD_c_srcSizeHint ZSTD_c_experimentalParam7
+
+/*! ZSTD_CCtx_getParameter() :
+ * Get the requested compression parameter value, selected by enum ZSTD_cParameter,
+ * and store it into int* value.
+ * @return : 0, or an error code (which can be tested with ZSTD_isError()).
+ */
+ZSTDLIB_API size_t ZSTD_CCtx_getParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, int* value);
+
+
+/*! ZSTD_CCtx_params :
+ * Quick howto :
+ * - ZSTD_createCCtxParams() : Create a ZSTD_CCtx_params structure
+ * - ZSTD_CCtxParams_setParameter() : Push parameters one by one into
+ * an existing ZSTD_CCtx_params structure.
+ * This is similar to
+ * ZSTD_CCtx_setParameter().
+ * - ZSTD_CCtx_setParametersUsingCCtxParams() : Apply parameters to
+ * an existing CCtx.
+ * These parameters will be applied to
+ * all subsequent frames.
+ * - ZSTD_compressStream2() : Do compression using the CCtx.
+ * - ZSTD_freeCCtxParams() : Free the memory.
+ *
+ * This can be used with ZSTD_estimateCCtxSize_advanced_usingCCtxParams()
+ * for static allocation of CCtx for single-threaded compression.
+ */
+ZSTDLIB_API ZSTD_CCtx_params* ZSTD_createCCtxParams(void);
+ZSTDLIB_API size_t ZSTD_freeCCtxParams(ZSTD_CCtx_params* params);
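+/* Illustrative sketch (not part of the upstream documentation) of the quick howto above ;
+ * `cctx` is assumed to be an existing ZSTD_CCtx*, and error checks are omitted.
+ *
+ *   ZSTD_CCtx_params* const cctxParams = ZSTD_createCCtxParams();
+ *   ZSTD_CCtxParams_setParameter(cctxParams, ZSTD_c_compressionLevel, 19);
+ *   ZSTD_CCtxParams_setParameter(cctxParams, ZSTD_c_checksumFlag, 1);
+ *   ZSTD_CCtx_setParametersUsingCCtxParams(cctx, cctxParams);
+ *   // ... drive compression with ZSTD_compressStream2(cctx, ...) ...
+ *   ZSTD_freeCCtxParams(cctxParams);
+ */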
+
+/*! ZSTD_CCtxParams_reset() :
+ * Reset params to default values.
+ */
+ZSTDLIB_API size_t ZSTD_CCtxParams_reset(ZSTD_CCtx_params* params);
+
+/*! ZSTD_CCtxParams_init() :
+ * Initializes the compression parameters of cctxParams according to
+ * compression level. All other parameters are reset to their default values.
+ */
+ZSTDLIB_API size_t ZSTD_CCtxParams_init(ZSTD_CCtx_params* cctxParams, int compressionLevel);
+
+/*! ZSTD_CCtxParams_init_advanced() :
+ * Initializes the compression and frame parameters of cctxParams according to
+ * params. All other parameters are reset to their default values.
+ */
+ZSTDLIB_API size_t ZSTD_CCtxParams_init_advanced(ZSTD_CCtx_params* cctxParams, ZSTD_parameters params);
+
+/*! ZSTD_CCtxParams_setParameter() :
+ * Similar to ZSTD_CCtx_setParameter.
+ * Set one compression parameter, selected by enum ZSTD_cParameter.
+ * Parameters must be applied to a ZSTD_CCtx using ZSTD_CCtx_setParametersUsingCCtxParams().
+ * @result : 0, or an error code (which can be tested with ZSTD_isError()).
+ */
+ZSTDLIB_API size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* params, ZSTD_cParameter param, int value);
+
+/*! ZSTD_CCtxParams_getParameter() :
+ * Similar to ZSTD_CCtx_getParameter.
+ * Get the requested value of one compression parameter, selected by enum ZSTD_cParameter.
+ * @result : 0, or an error code (which can be tested with ZSTD_isError()).
+ */
+ZSTDLIB_API size_t ZSTD_CCtxParams_getParameter(ZSTD_CCtx_params* params, ZSTD_cParameter param, int* value);
+
+/*! ZSTD_CCtx_setParametersUsingCCtxParams() :
+ * Apply a set of ZSTD_CCtx_params to the compression context.
+ * This can be done even after compression is started.
+ * If nbWorkers==0, this will have no impact until a new compression is started.
+ * If nbWorkers>=1, new parameters will be picked up at next job,
+ * with a few restrictions (windowLog, pledgedSrcSize, nbWorkers, jobSize, and overlapLog are not updated).
+ */
+ZSTDLIB_API size_t ZSTD_CCtx_setParametersUsingCCtxParams(
+ ZSTD_CCtx* cctx, const ZSTD_CCtx_params* params);
+
+/*! ZSTD_compressStream2_simpleArgs() :
+ * Same as ZSTD_compressStream2(),
+ * but using only integral types as arguments.
+ * This variant might be helpful for binders from dynamic languages
+ * which have trouble handling structures containing memory pointers.
+ */
+ZSTDLIB_API size_t ZSTD_compressStream2_simpleArgs (
+ ZSTD_CCtx* cctx,
+ void* dst, size_t dstCapacity, size_t* dstPos,
+ const void* src, size_t srcSize, size_t* srcPos,
+ ZSTD_EndDirective endOp);
+
+
+/***************************************
+* Advanced decompression functions
+***************************************/
+
+/*! ZSTD_isFrame() :
+ * Tells if the content of `buffer` starts with a valid Frame Identifier.
+ * Note : Frame Identifier is 4 bytes. If `size < 4`, @return will always be 0.
+ * Note 2 : Legacy Frame Identifiers are considered valid only if Legacy Support is enabled.
+ * Note 3 : Skippable Frame Identifiers are considered valid. */
+ZSTDLIB_API unsigned ZSTD_isFrame(const void* buffer, size_t size);
+
+/*! ZSTD_createDDict_byReference() :
+ * Create a digested dictionary, ready to start decompression operation without startup delay.
+ * Dictionary content is referenced, and therefore stays in dictBuffer.
+ * It is important that dictBuffer outlives DDict,
+ * it must remain read accessible throughout the lifetime of DDict */
+ZSTDLIB_API ZSTD_DDict* ZSTD_createDDict_byReference(const void* dictBuffer, size_t dictSize);
+
+/*! ZSTD_DCtx_loadDictionary_byReference() :
+ * Same as ZSTD_DCtx_loadDictionary(),
+ * but references `dict` content instead of copying it into `dctx`.
+ * This saves memory if `dict` remains around.
+ * However, it's imperative that `dict` remains accessible (and unmodified) while being used, so it must outlive decompression. */
+ZSTDLIB_API size_t ZSTD_DCtx_loadDictionary_byReference(ZSTD_DCtx* dctx, const void* dict, size_t dictSize);
+
+/*! ZSTD_DCtx_loadDictionary_advanced() :
+ * Same as ZSTD_DCtx_loadDictionary(),
+ * but gives direct control over
+ * how to load the dictionary (by copy ? by reference ?)
+ * and how to interpret it (automatic ? force raw mode ? full mode only ?). */
+ZSTDLIB_API size_t ZSTD_DCtx_loadDictionary_advanced(ZSTD_DCtx* dctx, const void* dict, size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_dictContentType_e dictContentType);
+
+/*! ZSTD_DCtx_refPrefix_advanced() :
+ * Same as ZSTD_DCtx_refPrefix(), but gives finer control over
+ * how to interpret prefix content (automatic ? force raw mode (default) ? full mode only ?) */
+ZSTDLIB_API size_t ZSTD_DCtx_refPrefix_advanced(ZSTD_DCtx* dctx, const void* prefix, size_t prefixSize, ZSTD_dictContentType_e dictContentType);
+
+/*! ZSTD_DCtx_setMaxWindowSize() :
+ * Refuses allocating internal buffers for frames requiring a window size larger than provided limit.
+ * This protects a decoder context from reserving too much memory for itself (potential attack scenario).
+ * This parameter is only useful in streaming mode, since no internal buffer is allocated in single-pass mode.
+ * By default, a decompression context accepts all window sizes <= (1 << ZSTD_WINDOWLOG_LIMIT_DEFAULT)
+ * @return : 0, or an error code (which can be tested using ZSTD_isError()).
+ */
+ZSTDLIB_API size_t ZSTD_DCtx_setMaxWindowSize(ZSTD_DCtx* dctx, size_t maxWindowSize);
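+/* Illustrative sketch (not part of the upstream documentation) :
+ * capping decoder memory when streaming untrusted input; the 128 MB limit is an arbitrary example.
+ *
+ *   ZSTD_DCtx* const dctx = ZSTD_createDCtx();
+ *   size_t const err = ZSTD_DCtx_setMaxWindowSize(dctx, (size_t)128 << 20);
+ *   if (ZSTD_isError(err)) { ... }
+ *   // frames requiring a larger window now fail cleanly instead of allocating huge buffers
+ */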
+
+/* ZSTD_d_format
+ * experimental parameter,
+ * allowing selection between ZSTD_format_e input compression formats
+ */
+#define ZSTD_d_format ZSTD_d_experimentalParam1
+/* ZSTD_d_stableOutBuffer
+ * Experimental parameter.
+ * Default is 0 == disabled. Set to 1 to enable.
+ *
+ * Tells the decompressor that the ZSTD_outBuffer will ALWAYS be the same
+ * between calls, except for the modifications that zstd makes to pos (the
+ * caller must not modify pos). This is checked by the decompressor, and
+ * decompression will fail if it ever changes. Therefore the ZSTD_outBuffer
+ * MUST be large enough to fit the entire decompressed frame. This will be
+ * checked when the frame content size is known. The data in the ZSTD_outBuffer
+ * in the range [dst, dst + pos) MUST not be modified during decompression
+ * or you will get data corruption.
+ *
+ * When this flag is enabled zstd won't allocate an output buffer, because
+ * it can write directly to the ZSTD_outBuffer, but it will still allocate
+ * an input buffer large enough to fit any compressed block. This will also
+ * avoid the memcpy() from the internal output buffer to the ZSTD_outBuffer.
+ * If you need to avoid the input buffer allocation use the buffer-less
+ * streaming API.
+ *
+ * NOTE: So long as the ZSTD_outBuffer always points to valid memory, using
+ * this flag is ALWAYS memory safe, and will never access out-of-bounds
+ * memory. However, decompression WILL fail if you violate the preconditions.
+ *
+ * WARNING: The data in the ZSTD_outBuffer in the range [dst, dst + pos) MUST
+ * not be modified during decompression or you will get data corruption. This
+ * is because zstd needs to reference data in the ZSTD_outBuffer to regenerate
+ * matches. Normally zstd maintains its own buffer for this purpose, but passing
+ * this flag tells zstd to use the user provided buffer.
+ */
+#define ZSTD_d_stableOutBuffer ZSTD_d_experimentalParam2
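+/* Illustrative sketch (not part of the upstream documentation) :
+ * streaming decompression into a single caller-owned buffer with ZSTD_d_stableOutBuffer.
+ * `dst`/`dstCapacity` must hold the entire decompressed frame, and `out` must stay identical
+ * across calls (only `pos` moves). Error handling is abbreviated.
+ *
+ *   ZSTD_DCtx_setParameter(dctx, ZSTD_d_stableOutBuffer, 1);
+ *   ZSTD_outBuffer out = { dst, dstCapacity, 0 };
+ *   ZSTD_inBuffer in = { src, srcSize, 0 };
+ *   size_t ret;
+ *   do {
+ *       ret = ZSTD_decompressStream(dctx, &out, &in);
+ *       if (ZSTD_isError(ret)) { ... }
+ *   } while (ret != 0);   // 0 means the frame is fully decoded and flushed
+ */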
+
+/*! ZSTD_DCtx_setFormat() :
+ * Instruct the decoder context about what kind of data to decode next.
+ * This instruction is mandatory to decode data without a fully-formed header,
+ * such as ZSTD_f_zstd1_magicless for example.
+ * @return : 0, or an error code (which can be tested using ZSTD_isError()). */
+ZSTDLIB_API size_t ZSTD_DCtx_setFormat(ZSTD_DCtx* dctx, ZSTD_format_e format);
+
+/*! ZSTD_decompressStream_simpleArgs() :
+ * Same as ZSTD_decompressStream(),
+ * but using only integral types as arguments.
+ * This can be helpful for binders from dynamic languages
+ * which have trouble handling structures containing memory pointers.
+ */
+ZSTDLIB_API size_t ZSTD_decompressStream_simpleArgs (
+ ZSTD_DCtx* dctx,
+ void* dst, size_t dstCapacity, size_t* dstPos,
+ const void* src, size_t srcSize, size_t* srcPos);
+
+
+/********************************************************************
+* Advanced streaming functions
+* Warning : most of these functions are now redundant with the Advanced API.
+* Once Advanced API reaches "stable" status,
+* redundant functions will be deprecated, and then at some point removed.
+********************************************************************/
+
+/*===== Advanced Streaming compression functions =====*/
+/**! ZSTD_initCStream_srcSize() :
+ * This function is deprecated, and equivalent to:
+ * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only);
+ * ZSTD_CCtx_refCDict(zcs, NULL); // clear the dictionary (if any)
+ * ZSTD_CCtx_setParameter(zcs, ZSTD_c_compressionLevel, compressionLevel);
+ * ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize);
+ *
+ * pledgedSrcSize must be correct. If it is not known at init time, use
+ * ZSTD_CONTENTSIZE_UNKNOWN. Note that, for compatibility with older programs,
+ * "0" also disables frame content size field. It may be enabled in the future.
+ * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x
+ */
+ZSTDLIB_API size_t
+ZSTD_initCStream_srcSize(ZSTD_CStream* zcs,
+ int compressionLevel,
+ unsigned long long pledgedSrcSize);
+
+/**! ZSTD_initCStream_usingDict() :
+ * This function is deprecated, and is equivalent to:
+ * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only);
+ * ZSTD_CCtx_setParameter(zcs, ZSTD_c_compressionLevel, compressionLevel);
+ * ZSTD_CCtx_loadDictionary(zcs, dict, dictSize);
+ *
+ * Creates an internal CDict (incompatible with static CCtx), except if
+ * dict == NULL or dictSize < 8, in which case no dict is used.
+ * Note: dict is loaded with ZSTD_dct_auto (treated as a full zstd dictionary if
+ * it begins with ZSTD_MAGIC_DICTIONARY, else as raw content) and ZSTD_dlm_byCopy.
+ * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x
+ */
+ZSTDLIB_API size_t
+ZSTD_initCStream_usingDict(ZSTD_CStream* zcs,
+ const void* dict, size_t dictSize,
+ int compressionLevel);
+
+/**! ZSTD_initCStream_advanced() :
+ * This function is deprecated, and is approximately equivalent to:
+ * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only);
+ * // Pseudocode: Set each zstd parameter and leave the rest as-is.
+ * for ((param, value) : params) {
+ * ZSTD_CCtx_setParameter(zcs, param, value);
+ * }
+ * ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize);
+ * ZSTD_CCtx_loadDictionary(zcs, dict, dictSize);
+ *
+ * dict is loaded with ZSTD_dct_auto and ZSTD_dlm_byCopy.
+ * pledgedSrcSize must be correct.
+ * If srcSize is not known at init time, use value ZSTD_CONTENTSIZE_UNKNOWN.
+ * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x
+ */
+ZSTDLIB_API size_t
+ZSTD_initCStream_advanced(ZSTD_CStream* zcs,
+ const void* dict, size_t dictSize,
+ ZSTD_parameters params,
+ unsigned long long pledgedSrcSize);
+
+/**! ZSTD_initCStream_usingCDict() :
+ * This function is deprecated, and equivalent to:
+ * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only);
+ * ZSTD_CCtx_refCDict(zcs, cdict);
+ *
+ * note : cdict will just be referenced, and must outlive compression session
+ * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x
+ */
+ZSTDLIB_API size_t ZSTD_initCStream_usingCDict(ZSTD_CStream* zcs, const ZSTD_CDict* cdict);
+
+/**! ZSTD_initCStream_usingCDict_advanced() :
+ * This function is DEPRECATED, and is approximately equivalent to:
+ * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only);
+ * // Pseudocode: Set each zstd frame parameter and leave the rest as-is.
+ * for ((fParam, value) : fParams) {
+ * ZSTD_CCtx_setParameter(zcs, fParam, value);
+ * }
+ * ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize);
+ * ZSTD_CCtx_refCDict(zcs, cdict);
+ *
+ * same as ZSTD_initCStream_usingCDict(), with control over frame parameters.
+ * pledgedSrcSize must be correct. If srcSize is not known at init time, use
+ * value ZSTD_CONTENTSIZE_UNKNOWN.
+ * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x
+ */
+ZSTDLIB_API size_t
+ZSTD_initCStream_usingCDict_advanced(ZSTD_CStream* zcs,
+ const ZSTD_CDict* cdict,
+ ZSTD_frameParameters fParams,
+ unsigned long long pledgedSrcSize);
+
+/*! ZSTD_resetCStream() :
+ * This function is deprecated, and is equivalent to:
+ * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only);
+ * ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize);
+ *
+ * start a new frame, using the same parameters as the previous frame.
+ * This is typically useful to skip the dictionary loading stage, since it will re-use the dictionary in place.
+ * Note that zcs must be initialized at least once before using ZSTD_resetCStream().
+ * If pledgedSrcSize is not known at reset time, use macro ZSTD_CONTENTSIZE_UNKNOWN.
+ * If pledgedSrcSize > 0, its value must be correct, as it will be written in the header, and checked at the end.
+ * For the time being, pledgedSrcSize==0 is interpreted as "srcSize unknown" for compatibility with older programs,
+ * but it will change to mean "empty" in a future version, so use macro ZSTD_CONTENTSIZE_UNKNOWN instead.
+ * @return : 0, or an error code (which can be tested using ZSTD_isError())
+ * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x
+ */
+ZSTDLIB_API size_t ZSTD_resetCStream(ZSTD_CStream* zcs, unsigned long long pledgedSrcSize);
+
+
+typedef struct {
+ unsigned long long ingested; /* nb input bytes read and buffered */
+ unsigned long long consumed; /* nb input bytes actually compressed */
+ unsigned long long produced; /* nb of compressed bytes generated and buffered */
+ unsigned long long flushed; /* nb of compressed bytes flushed : not provided; can be tracked from caller side */
+ unsigned currentJobID; /* MT only : latest started job nb */
+ unsigned nbActiveWorkers; /* MT only : nb of workers actively compressing at probe time */
+} ZSTD_frameProgression;
+
+/* ZSTD_getFrameProgression() :
+ * tells how much data has been ingested (read from input),
+ * consumed (input actually compressed) and produced (output) for current frame.
+ * Note : (ingested - consumed) is the amount of input data buffered internally, not yet compressed.
+ * Aggregates progression inside active worker threads.
+ */
+ZSTDLIB_API ZSTD_frameProgression ZSTD_getFrameProgression(const ZSTD_CCtx* cctx);
+
+/*! ZSTD_toFlushNow() :
+ * Tell how many bytes are ready to be flushed immediately.
+ * Useful for multithreading scenarios (nbWorkers >= 1).
+ * Probe the oldest active job, defined as oldest job not yet entirely flushed,
+ * and check its output buffer.
+ * @return : amount of data stored in oldest job and ready to be flushed immediately.
+ * if @return == 0, it means either :
+ * + there is no active job (could be checked with ZSTD_frameProgression()), or
+ * + oldest job is still actively compressing data,
+ * but everything it has produced has also been flushed so far,
+ * therefore flush speed is limited by production speed of oldest job
+ * irrespective of the speed of concurrent (and newer) jobs.
+ */
+ZSTDLIB_API size_t ZSTD_toFlushNow(ZSTD_CCtx* cctx);
+
+
+/*===== Advanced Streaming decompression functions =====*/
+/**
+ * This function is deprecated, and is equivalent to:
+ *
+ * ZSTD_DCtx_reset(zds, ZSTD_reset_session_only);
+ * ZSTD_DCtx_loadDictionary(zds, dict, dictSize);
+ *
+ * note: no dictionary will be used if dict == NULL or dictSize < 8
+ * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x
+ */
+ZSTDLIB_API size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const void* dict, size_t dictSize);
+
+/**
+ * This function is deprecated, and is equivalent to:
+ *
+ * ZSTD_DCtx_reset(zds, ZSTD_reset_session_only);
+ * ZSTD_DCtx_refDDict(zds, ddict);
+ *
+ * note : ddict is referenced, it must outlive decompression session
+ * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x
+ */
+ZSTDLIB_API size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* zds, const ZSTD_DDict* ddict);
+
+/**
+ * This function is deprecated, and is equivalent to:
+ *
+ * ZSTD_DCtx_reset(zds, ZSTD_reset_session_only);
+ *
+ * re-use decompression parameters from previous init; saves dictionary loading
+ * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x
+ */
+ZSTDLIB_API size_t ZSTD_resetDStream(ZSTD_DStream* zds);
+
+
+/*********************************************************************
+* Buffer-less and synchronous inner streaming functions
+*
+* This is an advanced API, giving full control over buffer management, for users who need direct control over memory.
+* But it's also a complex one, with several restrictions, documented below.
+* Prefer normal streaming API for an easier experience.
+********************************************************************* */
+
+/**
+ Buffer-less streaming compression (synchronous mode)
+
+ A ZSTD_CCtx object is required to track streaming operations.
+ Use ZSTD_createCCtx() / ZSTD_freeCCtx() to manage resource.
+ ZSTD_CCtx object can be re-used multiple times within successive compression operations.
+
+ Start by initializing a context.
+ Use ZSTD_compressBegin(), or ZSTD_compressBegin_usingDict() for dictionary compression,
+ or ZSTD_compressBegin_advanced(), for finer parameter control.
+ It's also possible to duplicate a reference context which has already been initialized, using ZSTD_copyCCtx()
+
+ Then, consume your input using ZSTD_compressContinue().
+ There are some important considerations to keep in mind when using this advanced function :
+ - ZSTD_compressContinue() has no internal buffer. It uses externally provided buffers only.
+ - Interface is synchronous : input is consumed entirely and produces 1+ compressed blocks.
+ - Caller must ensure there is enough space in `dst` to store compressed data under worst case scenario.
+ Worst case evaluation is provided by ZSTD_compressBound().
+ ZSTD_compressContinue() doesn't guarantee recovery after a failed compression.
+ - ZSTD_compressContinue() presumes prior input ***is still accessible and unmodified*** (up to maximum distance size, see WindowLog).
+ It remembers all previous contiguous blocks, plus one separated memory segment (which can itself consist of multiple contiguous blocks)
+ - ZSTD_compressContinue() detects that prior input has been overwritten when `src` buffer overlaps,
+ in which case it will "discard" the relevant memory section from its history.
+
+ Finish a frame with ZSTD_compressEnd(), which will write the last block(s) and optional checksum.
+ It's possible to use srcSize==0, in which case, it will write a final empty block to end the frame.
+ Without last block mark, frames are considered unfinished (hence corrupted) by compliant decoders.
+
+ `ZSTD_CCtx` object can be re-used (ZSTD_compressBegin()) to compress again.
+*/
+
+/*===== Buffer-less streaming compression functions =====*/
+ZSTDLIB_API size_t ZSTD_compressBegin(ZSTD_CCtx* cctx, int compressionLevel);
+ZSTDLIB_API size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel);
+ZSTDLIB_API size_t ZSTD_compressBegin_advanced(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, ZSTD_parameters params, unsigned long long pledgedSrcSize); /**< pledgedSrcSize : If srcSize is not known at init time, use ZSTD_CONTENTSIZE_UNKNOWN */
+ZSTDLIB_API size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict); /**< note: fails if cdict==NULL */
+ZSTDLIB_API size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_CDict* const cdict, ZSTD_frameParameters const fParams, unsigned long long const pledgedSrcSize); /* compression parameters are already set within cdict. pledgedSrcSize must be correct. If srcSize is not known, use macro ZSTD_CONTENTSIZE_UNKNOWN */
+ZSTDLIB_API size_t ZSTD_copyCCtx(ZSTD_CCtx* cctx, const ZSTD_CCtx* preparedCCtx, unsigned long long pledgedSrcSize); /**< note: if pledgedSrcSize is not known, use ZSTD_CONTENTSIZE_UNKNOWN */
+
+ZSTDLIB_API size_t ZSTD_compressContinue(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
+ZSTDLIB_API size_t ZSTD_compressEnd(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
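+/* Illustrative sketch (not part of the upstream documentation) of the buffer-less
+ * compression sequence described above; the chunk iteration is pseudocode and error
+ * handling is abbreviated.
+ *
+ *   ZSTD_compressBegin(cctx, 3);   // or _usingDict() / _advanced()
+ *   size_t dstPos = 0;
+ *   for (each input chunk `chunk` of size `chunkSize`) {   // pseudocode
+ *       size_t const ret = ZSTD_compressContinue(cctx, dst + dstPos, dstCapacity - dstPos, chunk, chunkSize);
+ *       if (ZSTD_isError(ret)) { ... }
+ *       dstPos += ret;
+ *   }
+ *   dstPos += ZSTD_compressEnd(cctx, dst + dstPos, dstCapacity - dstPos, NULL, 0);   // last block + optional checksum
+ */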
+
+
+/*-
+ Buffer-less streaming decompression (synchronous mode)
+
+ A ZSTD_DCtx object is required to track streaming operations.
+ Use ZSTD_createDCtx() / ZSTD_freeDCtx() to manage it.
+ A ZSTD_DCtx object can be re-used multiple times.
+
+ First typical operation is to retrieve frame parameters, using ZSTD_getFrameHeader().
+ Frame header is extracted from the beginning of compressed frame, so providing only the frame's beginning is enough.
+ Data fragment must be large enough to ensure successful decoding.
+ `ZSTD_frameHeaderSize_max` bytes is guaranteed to always be large enough.
+ @result : 0 : successful decoding, the `ZSTD_frameHeader` structure is correctly filled.
+ >0 : `srcSize` is too small, please provide at least @result bytes on next attempt.
+ errorCode, which can be tested using ZSTD_isError().
+
+ It fills a ZSTD_frameHeader structure with important information to correctly decode the frame,
+ such as the dictionary ID, content size, or maximum back-reference distance (`windowSize`).
+ Note that these values could be wrong, either because of data corruption, or because a 3rd party deliberately spoofs false information.
+ As a consequence, check that values remain within valid application range.
+ For example, do not allocate memory blindly, check that `windowSize` is within expectation.
+ Each application can set its own limits, depending on local restrictions.
+ For extended interoperability, it is recommended to support `windowSize` of at least 8 MB.
+
+ ZSTD_decompressContinue() needs previous data blocks during decompression, up to `windowSize` bytes.
+ ZSTD_decompressContinue() is very sensitive to contiguity,
+ if 2 blocks don't follow each other, make sure that either the compressor breaks contiguity at the same place,
+ or that previous contiguous segment is large enough to properly handle maximum back-reference distance.
+ There are multiple ways to guarantee this condition.
+
+ The most memory efficient way is to use a round buffer of sufficient size.
+ Sufficient size is determined by invoking ZSTD_decodingBufferSize_min(),
+ which can @return an error code if required value is too large for current system (in 32-bits mode).
+ In a round buffer methodology, ZSTD_decompressContinue() decompresses each block next to previous one,
+ up to the moment there is not enough room left in the buffer to guarantee decoding another full block,
+ whose maximum size is provided in `ZSTD_frameHeader` structure, field `blockSizeMax`.
+ At that point, decoding can resume from the beginning of the buffer.
+ Note that already decoded data stored in the buffer should be flushed before being overwritten.
+
+ There are alternatives possible, for example using two or more buffers of size `windowSize` each, though they consume more memory.
+
+ Finally, if you control the compression process, you can also ignore all buffer size rules,
+ as long as the encoder and decoder progress in "lock-step",
+ aka use exactly the same buffer sizes, break contiguity at the same place, etc.
+
+ Once buffers are set up, start decompression with ZSTD_decompressBegin().
+ If decompression requires a dictionary, use ZSTD_decompressBegin_usingDict() or ZSTD_decompressBegin_usingDDict().
+
+ Then use ZSTD_nextSrcSizeToDecompress() and ZSTD_decompressContinue() alternately.
+ ZSTD_nextSrcSizeToDecompress() tells how many bytes to provide as 'srcSize' to ZSTD_decompressContinue().
+ ZSTD_decompressContinue() requires this _exact_ amount of bytes, or it will fail.
+
+ @result of ZSTD_decompressContinue() is the number of bytes regenerated within 'dst' (necessarily <= dstCapacity).
+ It can be zero : it just means ZSTD_decompressContinue() has decoded some metadata item.
+ It can also be an error code, which can be tested with ZSTD_isError().
+
+ A frame is fully decoded when ZSTD_nextSrcSizeToDecompress() returns zero.
+ Context can then be reset to start a new decompression.
+
+ Note : it's possible to know if next input to present is a header or a block, using ZSTD_nextInputType().
+ This information is not required to properly decode a frame.
+
+ == Special case : skippable frames ==
+
+ Skippable frames allow integration of user-defined data into a flow of concatenated frames.
+ Skippable frames will be ignored (skipped) by decompressor.
+ The format of skippable frames is as follows :
+ a) Skippable frame ID - 4 Bytes, Little endian format, any value from 0x184D2A50 to 0x184D2A5F
+ b) Frame Size - 4 Bytes, Little endian format, unsigned 32-bits
+ c) Frame Content - any content (User Data) of length equal to Frame Size
+ For skippable frames ZSTD_getFrameHeader() returns zfhPtr->frameType==ZSTD_skippableFrame.
+ For skippable frames ZSTD_decompressContinue() always returns 0 : it only skips the content.
+*/
+
+/*===== Buffer-less streaming decompression functions =====*/
+typedef enum { ZSTD_frame, ZSTD_skippableFrame } ZSTD_frameType_e;
+typedef struct {
+ unsigned long long frameContentSize; /* if == ZSTD_CONTENTSIZE_UNKNOWN, it means this field is not available. 0 means "empty" */
+ unsigned long long windowSize; /* can be very large, up to <= frameContentSize */
+ unsigned blockSizeMax;
+ ZSTD_frameType_e frameType; /* if == ZSTD_skippableFrame, frameContentSize is the size of skippable content */
+ unsigned headerSize;
+ unsigned dictID;
+ unsigned checksumFlag;
+} ZSTD_frameHeader;
+
+/*! ZSTD_getFrameHeader() :
+ * decode Frame Header, or requires larger `srcSize`.
+ * @return : 0, `zfhPtr` is correctly filled,
+ * >0, `srcSize` is too small, value is wanted `srcSize` amount,
+ * or an error code, which can be tested using ZSTD_isError() */
+ZSTDLIB_API size_t ZSTD_getFrameHeader(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize); /**< doesn't consume input */
+/*! ZSTD_getFrameHeader_advanced() :
+ * same as ZSTD_getFrameHeader(),
+ * with added capability to select a format (like ZSTD_f_zstd1_magicless) */
+ZSTDLIB_API size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize, ZSTD_format_e format);
+ZSTDLIB_API size_t ZSTD_decodingBufferSize_min(unsigned long long windowSize, unsigned long long frameContentSize); /**< when frame content size is not known, pass in frameContentSize == ZSTD_CONTENTSIZE_UNKNOWN */
+
+ZSTDLIB_API size_t ZSTD_decompressBegin(ZSTD_DCtx* dctx);
+ZSTDLIB_API size_t ZSTD_decompressBegin_usingDict(ZSTD_DCtx* dctx, const void* dict, size_t dictSize);
+ZSTDLIB_API size_t ZSTD_decompressBegin_usingDDict(ZSTD_DCtx* dctx, const ZSTD_DDict* ddict);
+
+ZSTDLIB_API size_t ZSTD_nextSrcSizeToDecompress(ZSTD_DCtx* dctx);
+ZSTDLIB_API size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
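+/* Illustrative sketch (not part of the upstream documentation) of the buffer-less
+ * decompression loop described above, using one output buffer large enough for the
+ * whole frame (round-buffer management and most error handling omitted).
+ *
+ *   ZSTD_frameHeader zfh;
+ *   if (ZSTD_getFrameHeader(&zfh, src, srcSize) != 0) { ... }   // needs more input, or error
+ *   // inspect zfh.windowSize / zfh.frameContentSize before sizing dst
+ *   ZSTD_decompressBegin(dctx);                                  // or _usingDict() / _usingDDict()
+ *   const char* ip = (const char*)src;
+ *   char* op = (char*)dst;
+ *   size_t next;
+ *   while ((next = ZSTD_nextSrcSizeToDecompress(dctx)) != 0) {
+ *       size_t const ret = ZSTD_decompressContinue(dctx, op, dstCapacity - (size_t)(op - (char*)dst), ip, next);
+ *       if (ZSTD_isError(ret)) { ... }
+ *       ip += next; op += ret;   // ret can be 0 when only metadata was decoded
+ *   }
+ */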
+
+/* misc */
+ZSTDLIB_API void ZSTD_copyDCtx(ZSTD_DCtx* dctx, const ZSTD_DCtx* preparedDCtx);
+typedef enum { ZSTDnit_frameHeader, ZSTDnit_blockHeader, ZSTDnit_block, ZSTDnit_lastBlock, ZSTDnit_checksum, ZSTDnit_skippableFrame } ZSTD_nextInputType_e;
+ZSTDLIB_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx);
+
+
+
+
+/* ============================ */
+/** Block level API */
+/* ============================ */
+
+/*!
+ Block functions produce and decode raw zstd blocks, without frame metadata.
+ Frame metadata cost is typically ~12 bytes, which can be non-negligible for very small blocks (< 100 bytes).
+ But users will have to manage the metadata needed to regenerate the data themselves, such as compressed and content sizes.
+
+ A few rules to respect :
+ - Compressing and decompressing require a context structure
+ + Use ZSTD_createCCtx() and ZSTD_createDCtx()
+ - It is necessary to init context before starting
+ + compression : any ZSTD_compressBegin*() variant, including with dictionary
+ + decompression : any ZSTD_decompressBegin*() variant, including with dictionary
+ + copyCCtx() and copyDCtx() can be used too
+ - Block size is limited, it must be <= ZSTD_getBlockSize() <= ZSTD_BLOCKSIZE_MAX == 128 KB
+ + If input is larger than a block size, it's necessary to split input data into multiple blocks
+ + For inputs larger than a single block, consider using regular ZSTD_compress() instead.
+ Frame metadata is not that costly, and quickly becomes negligible as source size grows larger than a block.
+ - When a block is considered not compressible enough, ZSTD_compressBlock() result will be 0 (zero) !
+ ===> In which case, nothing is produced into `dst` !
+ + User __must__ test for such outcome and deal directly with uncompressed data
+ + A block cannot be declared incompressible if ZSTD_compressBlock() return value was != 0.
+ Doing so would mess up the statistics history, leading to potential data corruption.
+ + ZSTD_decompressBlock() _doesn't accept uncompressed data as input_ !!
+ + In case of multiple successive blocks, should some of them be uncompressed,
+ decoder must be informed of their existence in order to follow proper history.
+ Use ZSTD_insertBlock() for such a case.
+*/
+
+/*===== Raw zstd block functions =====*/
+ZSTDLIB_API size_t ZSTD_getBlockSize (const ZSTD_CCtx* cctx);
+ZSTDLIB_API size_t ZSTD_compressBlock (ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
+ZSTDLIB_API size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
+ZSTDLIB_API size_t ZSTD_insertBlock (ZSTD_DCtx* dctx, const void* blockStart, size_t blockSize); /**< insert uncompressed block into `dctx` history. Useful for multi-blocks decompression. */
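+/* Illustrative sketch (not part of the upstream documentation) :
+ * compressing a single block and handling the "not compressible" case described above.
+ *
+ *   size_t const cSize = ZSTD_compressBlock(cctx, dst, dstCapacity, src, srcSize);
+ *   if (ZSTD_isError(cSize)) { ... }
+ *   if (cSize == 0) {
+ *       // nothing was written into dst : store/transmit the raw block instead,
+ *       // and have the decoder call ZSTD_insertBlock(dctx, src, srcSize)
+ *       // so its history stays in sync.
+ *   }
+ */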
+
+
+#endif /* ZSTD_H_ZSTD_STATIC_LINKING_ONLY */
+
+#if defined (__cplusplus)
+}
+#endif
+/**** ended inlining ../zstd.h ****/
+#define FSE_STATIC_LINKING_ONLY
+/**** skipping file: fse.h ****/
+#define HUF_STATIC_LINKING_ONLY
+/**** skipping file: huf.h ****/
+#ifndef XXH_STATIC_LINKING_ONLY
+# define XXH_STATIC_LINKING_ONLY /* XXH64_state_t */
+#endif
+/**** start inlining xxhash.h ****/
+/*
+ * xxHash - Extremely Fast Hash algorithm
+ * Header File
+ * Copyright (c) 2012-2020, Yann Collet, Facebook, Inc.
+ *
+ * You can contact the author at :
+ * - xxHash source repository : https://github.com/Cyan4973/xxHash
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+*/
+
+/* Notice extracted from xxHash homepage :
+
+xxHash is an extremely fast Hash algorithm, running at RAM speed limits.
+It also successfully passes all tests from the SMHasher suite.
+
+Comparison (single thread, Windows Seven 32 bits, using SMHasher on a Core 2 Duo @3GHz)
+
+Name Speed Q.Score Author
+xxHash 5.4 GB/s 10
+CrapWow 3.2 GB/s 2 Andrew
+MurmurHash 3a 2.7 GB/s 10 Austin Appleby
+SpookyHash 2.0 GB/s 10 Bob Jenkins
+SBox 1.4 GB/s 9 Bret Mulvey
+Lookup3 1.2 GB/s 9 Bob Jenkins
+SuperFastHash 1.2 GB/s 1 Paul Hsieh
+CityHash64 1.05 GB/s 10 Pike & Alakuijala
+FNV 0.55 GB/s 5 Fowler, Noll, Vo
+CRC32 0.43 GB/s 9
+MD5-32 0.33 GB/s 10 Ronald L. Rivest
+SHA1-32 0.28 GB/s 10
+
+Q.Score is a measure of quality of the hash function.
+It depends on successfully passing SMHasher test set.
+10 is a perfect score.
+
+A 64-bits version, named XXH64, is available since r35.
+It offers much better speed, but for 64-bits applications only.
+Name Speed on 64 bits Speed on 32 bits
+XXH64 13.8 GB/s 1.9 GB/s
+XXH32 6.8 GB/s 6.0 GB/s
+*/
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+#ifndef XXHASH_H_5627135585666179
+#define XXHASH_H_5627135585666179 1
+
+
+/* ****************************
+* Definitions
+******************************/
+#include <stddef.h> /* size_t */
+typedef enum { XXH_OK=0, XXH_ERROR } XXH_errorcode;
+
+
+/* ****************************
+* API modifier
+******************************/
+/** XXH_PRIVATE_API
+* This is useful if you want to include xxhash functions in `static` mode
+* in order to inline them, and remove their symbol from the public list.
+* Methodology :
+* #define XXH_PRIVATE_API
+* #include "xxhash.h"
+* `xxhash.c` is automatically included.
+* It's not useful to compile and link it as a separate module anymore.
+*/
+#ifdef XXH_PRIVATE_API
+# ifndef XXH_STATIC_LINKING_ONLY
+# define XXH_STATIC_LINKING_ONLY
+# endif
+# if defined(__GNUC__)
+# define XXH_PUBLIC_API static __inline __attribute__((unused))
+# elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)
+# define XXH_PUBLIC_API static inline
+# elif defined(_MSC_VER)
+# define XXH_PUBLIC_API static __inline
+# else
+# define XXH_PUBLIC_API static /* this version may generate warnings for unused static functions; disable the relevant warning */
+# endif
+#else
+# define XXH_PUBLIC_API /* do nothing */
+#endif /* XXH_PRIVATE_API */
+
+/*!XXH_NAMESPACE, aka Namespace Emulation :
+
+If you want to include _and expose_ xxHash functions from within your own library,
+but also want to avoid symbol collisions with another library which also includes xxHash,
+
+you can use XXH_NAMESPACE, to automatically prefix any public symbol from xxhash library
+with the value of XXH_NAMESPACE (so avoid keeping it NULL, and avoid numeric values).
+
+Note that no change is required within the calling program as long as it includes `xxhash.h` :
+regular symbol name will be automatically translated by this header.
+*/
+#ifdef XXH_NAMESPACE
+# define XXH_CAT(A,B) A##B
+# define XXH_NAME2(A,B) XXH_CAT(A,B)
+# define XXH32 XXH_NAME2(XXH_NAMESPACE, XXH32)
+# define XXH64 XXH_NAME2(XXH_NAMESPACE, XXH64)
+# define XXH_versionNumber XXH_NAME2(XXH_NAMESPACE, XXH_versionNumber)
+# define XXH32_createState XXH_NAME2(XXH_NAMESPACE, XXH32_createState)
+# define XXH64_createState XXH_NAME2(XXH_NAMESPACE, XXH64_createState)
+# define XXH32_freeState XXH_NAME2(XXH_NAMESPACE, XXH32_freeState)
+# define XXH64_freeState XXH_NAME2(XXH_NAMESPACE, XXH64_freeState)
+# define XXH32_reset XXH_NAME2(XXH_NAMESPACE, XXH32_reset)
+# define XXH64_reset XXH_NAME2(XXH_NAMESPACE, XXH64_reset)
+# define XXH32_update XXH_NAME2(XXH_NAMESPACE, XXH32_update)
+# define XXH64_update XXH_NAME2(XXH_NAMESPACE, XXH64_update)
+# define XXH32_digest XXH_NAME2(XXH_NAMESPACE, XXH32_digest)
+# define XXH64_digest XXH_NAME2(XXH_NAMESPACE, XXH64_digest)
+# define XXH32_copyState XXH_NAME2(XXH_NAMESPACE, XXH32_copyState)
+# define XXH64_copyState XXH_NAME2(XXH_NAMESPACE, XXH64_copyState)
+# define XXH32_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH32_canonicalFromHash)
+# define XXH64_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH64_canonicalFromHash)
+# define XXH32_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH32_hashFromCanonical)
+# define XXH64_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH64_hashFromCanonical)
+#endif
+
+
+/* *************************************
+* Version
+***************************************/
+#define XXH_VERSION_MAJOR 0
+#define XXH_VERSION_MINOR 6
+#define XXH_VERSION_RELEASE 2
+#define XXH_VERSION_NUMBER (XXH_VERSION_MAJOR *100*100 + XXH_VERSION_MINOR *100 + XXH_VERSION_RELEASE)
+XXH_PUBLIC_API unsigned XXH_versionNumber (void);
+
+
+/* ****************************
+* Simple Hash Functions
+******************************/
+typedef unsigned int XXH32_hash_t;
+typedef unsigned long long XXH64_hash_t;
+
+XXH_PUBLIC_API XXH32_hash_t XXH32 (const void* input, size_t length, unsigned int seed);
+XXH_PUBLIC_API XXH64_hash_t XXH64 (const void* input, size_t length, unsigned long long seed);
+
+/*!
+XXH32() :
+ Calculate the 32-bits hash of a sequence of "length" bytes stored at memory address "input".
+ The memory between input & input+length must be valid (allocated and read-accessible).
+ "seed" can be used to alter the result predictably.
+ Speed on Core 2 Duo @ 3 GHz (single thread, SMHasher benchmark) : 5.4 GB/s
+XXH64() :
+ Calculate the 64-bits hash of a sequence of "length" bytes stored at memory address "input".
+ "seed" can be used to alter the result predictably.
+ This function runs 2x faster on 64-bits systems, but slower on 32-bits systems (see benchmark).
+*/
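+/* Illustrative sketch (not part of the original notice) : one-shot hashing of a buffer,
+ * where `buffer` and `bufferSize` are placeholders.
+ *
+ *   unsigned long long const h64 = XXH64(buffer, bufferSize, 0);   // seed = 0
+ *   unsigned int const h32 = XXH32(buffer, bufferSize, 0);
+ */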
+
+
+/* ****************************
+* Streaming Hash Functions
+******************************/
+typedef struct XXH32_state_s XXH32_state_t; /* incomplete type */
+typedef struct XXH64_state_s XXH64_state_t; /* incomplete type */
+
+/*! State allocation, compatible with dynamic libraries */
+
+XXH_PUBLIC_API XXH32_state_t* XXH32_createState(void);
+XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr);
+
+XXH_PUBLIC_API XXH64_state_t* XXH64_createState(void);
+XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr);
+
+
+/* hash streaming */
+
+XXH_PUBLIC_API XXH_errorcode XXH32_reset (XXH32_state_t* statePtr, unsigned int seed);
+XXH_PUBLIC_API XXH_errorcode XXH32_update (XXH32_state_t* statePtr, const void* input, size_t length);
+XXH_PUBLIC_API XXH32_hash_t XXH32_digest (const XXH32_state_t* statePtr);
+
+XXH_PUBLIC_API XXH_errorcode XXH64_reset (XXH64_state_t* statePtr, unsigned long long seed);
+XXH_PUBLIC_API XXH_errorcode XXH64_update (XXH64_state_t* statePtr, const void* input, size_t length);
+XXH_PUBLIC_API XXH64_hash_t XXH64_digest (const XXH64_state_t* statePtr);
+
+/*
+These functions generate the xxHash of an input provided in multiple segments.
+Note that, for small input, they are slower than single-call functions, due to state management.
+For small input, prefer `XXH32()` and `XXH64()` .
+
+XXH state must first be allocated, using XXH*_createState() .
+
+Start a new hash by initializing state with a seed, using XXH*_reset().
+
+Then, feed the hash state by calling XXH*_update() as many times as necessary.
+Obviously, input must be allocated and read accessible.
+The function returns an error code, with 0 meaning OK, and any other value meaning there is an error.
+
+Finally, a hash value can be produced anytime, by using XXH*_digest().
+This function returns the nn-bits hash as an int or long long.
+
+It's still possible to continue inserting input into the hash state after a digest,
+and generate some new hashes later on, by calling XXH*_digest() again.
+
+When done, free XXH state space if it was allocated dynamically.
+*/
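+/* Illustrative sketch (not part of the original notice) of the streaming sequence above,
+ * hashing two segments with seed 0; error codes from reset/update are ignored for brevity.
+ *
+ *   XXH64_state_t* const state = XXH64_createState();
+ *   XXH64_reset(state, 0);
+ *   XXH64_update(state, seg1, seg1Size);
+ *   XXH64_update(state, seg2, seg2Size);
+ *   unsigned long long const hash = XXH64_digest(state);
+ *   XXH64_freeState(state);
+ */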
+
+
+/* **************************
+* Utils
+****************************/
+#if !(defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)) /* ! C99 */
+# define restrict /* disable restrict */
+#endif
+
+XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* restrict dst_state, const XXH32_state_t* restrict src_state);
+XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t* restrict dst_state, const XXH64_state_t* restrict src_state);
+
+
+/* **************************
+* Canonical representation
+****************************/
+/* Default result type for XXH functions are primitive unsigned 32 and 64 bits.
+* The canonical representation uses human-readable write convention, aka big-endian (large digits first).
+* These functions allow transformation of hash result into and from its canonical format.
+* This way, hash values can be written into a file / memory, and remain comparable on different systems and programs.
+*/
+typedef struct { unsigned char digest[4]; } XXH32_canonical_t;
+typedef struct { unsigned char digest[8]; } XXH64_canonical_t;
+
+XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash);
+XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t hash);
+
+XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src);
+XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src);
+
+#endif /* XXHASH_H_5627135585666179 */
+
+
+
+/* ================================================================================================
+ This section contains definitions which are not guaranteed to remain stable.
+ They may change in future versions, becoming incompatible with a different version of the library.
+ They shall only be used with static linking.
+ Never use these definitions in association with dynamic linking !
+=================================================================================================== */
+#if defined(XXH_STATIC_LINKING_ONLY) && !defined(XXH_STATIC_H_3543687687345)
+#define XXH_STATIC_H_3543687687345
+
+/* These definitions are only meant to allow allocation of XXH state
+ statically, on stack, or in a struct for example.
+ Do not use members directly. */
+
+ struct XXH32_state_s {
+ unsigned total_len_32;
+ unsigned large_len;
+ unsigned v1;
+ unsigned v2;
+ unsigned v3;
+ unsigned v4;
+ unsigned mem32[4]; /* buffer defined as U32 for alignment */
+ unsigned memsize;
+ unsigned reserved; /* never read nor write, will be removed in a future version */
+ }; /* typedef'd to XXH32_state_t */
+
+ struct XXH64_state_s {
+ unsigned long long total_len;
+ unsigned long long v1;
+ unsigned long long v2;
+ unsigned long long v3;
+ unsigned long long v4;
+ unsigned long long mem64[4]; /* buffer defined as U64 for alignment */
+ unsigned memsize;
+ unsigned reserved[2]; /* never read nor write, will be removed in a future version */
+ }; /* typedef'd to XXH64_state_t */
+
+
+# ifdef XXH_PRIVATE_API
+/**** start inlining xxhash.c ****/
+/*
+ * xxHash - Fast Hash algorithm
+ * Copyright (c) 2012-2020, Yann Collet, Facebook, Inc.
+ *
+ * You can contact the author at :
+ * - xxHash homepage: http://www.xxhash.com
+ * - xxHash source repository : https://github.com/Cyan4973/xxHash
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+*/
+
+
+/* *************************************
+* Tuning parameters
+***************************************/
+/*!XXH_FORCE_MEMORY_ACCESS :
+ * By default, access to unaligned memory is controlled by `memcpy()`, which is safe and portable.
+ * Unfortunately, on some target/compiler combinations, the generated assembly is sub-optimal.
+ * The switch below allows selecting a different access method for improved performance.
+ * Method 0 (default) : use `memcpy()`. Safe and portable.
+ * Method 1 : `__packed` statement. It depends on compiler extension (ie, not portable).
+ * This method is safe if your compiler supports it, and *generally* as fast or faster than `memcpy`.
+ * Method 2 : direct access. This method doesn't depend on compiler but violates the C standard.
+ * It can generate buggy code on targets which do not support unaligned memory accesses.
+ * But in some circumstances, it's the only known way to get the most performance (ie GCC + ARMv6)
+ * See http://stackoverflow.com/a/32095106/646947 for details.
+ * Prefer these methods in priority order (0 > 1 > 2)
+ */
+#ifndef XXH_FORCE_MEMORY_ACCESS /* can be defined externally, on command line for example */
+# if defined(__GNUC__) && ( defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6T2__) )
+# define XXH_FORCE_MEMORY_ACCESS 2
+# elif (defined(__INTEL_COMPILER) && !defined(WIN32)) || \
+ (defined(__GNUC__) && ( defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__) || defined(__ARM_ARCH_7S__) )) || \
+ defined(__ICCARM__)
+# define XXH_FORCE_MEMORY_ACCESS 1
+# endif
+#endif
+
+/*!XXH_ACCEPT_NULL_INPUT_POINTER :
+ * If the input pointer is a null pointer, xxHash default behavior is to trigger a memory access error, since it is a bad pointer.
+ * When this option is enabled, xxHash output for null input pointers will be the same as a null-length input.
+ * By default, this option is disabled. To enable it, uncomment the define below :
+ */
+/* #define XXH_ACCEPT_NULL_INPUT_POINTER 1 */
+
+/*!XXH_FORCE_NATIVE_FORMAT :
+ * By default, xxHash library provides endian-independent Hash values, based on little-endian convention.
+ * Results are therefore identical for little-endian and big-endian CPU.
+ * This comes at a performance cost for big-endian CPU, since some swapping is required to emulate little-endian format.
+ * Should endian-independence be of no importance for your application, you may set the #define below to 1,
+ * to improve speed for Big-endian CPU.
+ * This option has no impact on Little_Endian CPU.
+ */
+#ifndef XXH_FORCE_NATIVE_FORMAT /* can be defined externally */
+# define XXH_FORCE_NATIVE_FORMAT 0
+#endif
+
+/*!XXH_FORCE_ALIGN_CHECK :
+ * This is a minor performance trick, only useful with lots of very small keys.
+ * It means : check for aligned/unaligned input.
+ * The check costs one initial branch per hash; set to 0 when the input data
+ * is guaranteed to be aligned.
+ */
+#ifndef XXH_FORCE_ALIGN_CHECK /* can be defined externally */
+# if defined(__i386) || defined(_M_IX86) || defined(__x86_64__) || defined(_M_X64)
+# define XXH_FORCE_ALIGN_CHECK 0
+# else
+# define XXH_FORCE_ALIGN_CHECK 1
+# endif
+#endif
+
+
+/* *************************************
+* Includes & Memory related functions
+***************************************/
+/* Modify the local functions below should you wish to use some other memory routines */
+/* for malloc(), free() */
+#include <stdlib.h>
+#include <stddef.h> /* size_t */
+static void* XXH_malloc(size_t s) { return malloc(s); }
+static void XXH_free (void* p) { free(p); }
+/* for memcpy() */
+#include <string.h>
+static void* XXH_memcpy(void* dest, const void* src, size_t size) { return memcpy(dest,src,size); }
+
+#ifndef XXH_STATIC_LINKING_ONLY
+# define XXH_STATIC_LINKING_ONLY
+#endif
+/**** skipping file: xxhash.h ****/
+
+
+/* *************************************
+* Compiler Specific Options
+***************************************/
+#if (defined(__GNUC__) && !defined(__STRICT_ANSI__)) || defined(__cplusplus) || defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* C99 */
+# define INLINE_KEYWORD inline
+#else
+# define INLINE_KEYWORD
+#endif
+
+#if defined(__GNUC__) || defined(__ICCARM__)
+# define FORCE_INLINE_ATTR __attribute__((always_inline))
+#elif defined(_MSC_VER)
+# define FORCE_INLINE_ATTR __forceinline
+#else
+# define FORCE_INLINE_ATTR
+#endif
+
+#define FORCE_INLINE_TEMPLATE static INLINE_KEYWORD FORCE_INLINE_ATTR
+
+
+#ifdef _MSC_VER
+# pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */
+#endif
+
+
+/* *************************************
+* Basic Types
+***************************************/
+#ifndef MEM_MODULE
+# define MEM_MODULE
+# if !defined (__VMS) && (defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
+# include <stdint.h>
+ typedef uint8_t BYTE;
+ typedef uint16_t U16;
+ typedef uint32_t U32;
+ typedef int32_t S32;
+ typedef uint64_t U64;
+# else
+ typedef unsigned char BYTE;
+ typedef unsigned short U16;
+ typedef unsigned int U32;
+ typedef signed int S32;
+ typedef unsigned long long U64; /* if your compiler doesn't support unsigned long long, replace by another 64-bit type here. Note that xxhash.h will also need to be updated. */
+# endif
+#endif
+
+
+#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2))
+
+/* Force direct memory access. Only works on CPU which support unaligned memory access in hardware */
+static U32 XXH_read32(const void* memPtr) { return *(const U32*) memPtr; }
+static U64 XXH_read64(const void* memPtr) { return *(const U64*) memPtr; }
+
+#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1))
+
+/* __pack instructions are safer, but compiler specific, hence potentially problematic for some compilers */
+/* currently only defined for gcc and icc */
+typedef union { U32 u32; U64 u64; } __attribute__((packed)) unalign;
+
+static U32 XXH_read32(const void* ptr) { return ((const unalign*)ptr)->u32; }
+static U64 XXH_read64(const void* ptr) { return ((const unalign*)ptr)->u64; }
+
+#else
+
+/* portable and safe solution. Generally efficient.
+ * see : http://stackoverflow.com/a/32095106/646947
+ */
+
+static U32 XXH_read32(const void* memPtr)
+{
+ U32 val;
+ memcpy(&val, memPtr, sizeof(val));
+ return val;
+}
+
+static U64 XXH_read64(const void* memPtr)
+{
+ U64 val;
+ memcpy(&val, memPtr, sizeof(val));
+ return val;
+}
+
+#endif /* XXH_FORCE_MEMORY_ACCESS */
+
+
+/* ****************************************
+* Compiler-specific Functions and Macros
+******************************************/
+#define GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)
+
+/* Note : although _rotl exists for minGW (GCC under windows), performance seems poor */
+#if defined(_MSC_VER)
+# define XXH_rotl32(x,r) _rotl(x,r)
+# define XXH_rotl64(x,r) _rotl64(x,r)
+#else
+#if defined(__ICCARM__)
+# include <intrinsics.h>
+# define XXH_rotl32(x,r) __ROR(x,(32 - r))
+#else
+# define XXH_rotl32(x,r) ((x << r) | (x >> (32 - r)))
+#endif
+# define XXH_rotl64(x,r) ((x << r) | (x >> (64 - r)))
+#endif
+
+#if defined(_MSC_VER) /* Visual Studio */
+# define XXH_swap32 _byteswap_ulong
+# define XXH_swap64 _byteswap_uint64
+#elif GCC_VERSION >= 403
+# define XXH_swap32 __builtin_bswap32
+# define XXH_swap64 __builtin_bswap64
+#else
+static U32 XXH_swap32 (U32 x)
+{
+ return ((x << 24) & 0xff000000 ) |
+ ((x << 8) & 0x00ff0000 ) |
+ ((x >> 8) & 0x0000ff00 ) |
+ ((x >> 24) & 0x000000ff );
+}
+static U64 XXH_swap64 (U64 x)
+{
+ return ((x << 56) & 0xff00000000000000ULL) |
+ ((x << 40) & 0x00ff000000000000ULL) |
+ ((x << 24) & 0x0000ff0000000000ULL) |
+ ((x << 8) & 0x000000ff00000000ULL) |
+ ((x >> 8) & 0x00000000ff000000ULL) |
+ ((x >> 24) & 0x0000000000ff0000ULL) |
+ ((x >> 40) & 0x000000000000ff00ULL) |
+ ((x >> 56) & 0x00000000000000ffULL);
+}
+#endif
+
+
+/* *************************************
+* Architecture Macros
+***************************************/
+typedef enum { XXH_bigEndian=0, XXH_littleEndian=1 } XXH_endianess;
+
+/* XXH_CPU_LITTLE_ENDIAN can be defined externally, for example on the compiler command line */
+#ifndef XXH_CPU_LITTLE_ENDIAN
+ static const int g_one = 1;
+# define XXH_CPU_LITTLE_ENDIAN (*(const char*)(&g_one))
+#endif
+
+
+/* ***************************
+* Memory reads
+*****************************/
+typedef enum { XXH_aligned, XXH_unaligned } XXH_alignment;
+
+FORCE_INLINE_TEMPLATE U32 XXH_readLE32_align(const void* ptr, XXH_endianess endian, XXH_alignment align)
+{
+ if (align==XXH_unaligned)
+ return endian==XXH_littleEndian ? XXH_read32(ptr) : XXH_swap32(XXH_read32(ptr));
+ else
+ return endian==XXH_littleEndian ? *(const U32*)ptr : XXH_swap32(*(const U32*)ptr);
+}
+
+FORCE_INLINE_TEMPLATE U32 XXH_readLE32(const void* ptr, XXH_endianess endian)
+{
+ return XXH_readLE32_align(ptr, endian, XXH_unaligned);
+}
+
+static U32 XXH_readBE32(const void* ptr)
+{
+ return XXH_CPU_LITTLE_ENDIAN ? XXH_swap32(XXH_read32(ptr)) : XXH_read32(ptr);
+}
+
+FORCE_INLINE_TEMPLATE U64 XXH_readLE64_align(const void* ptr, XXH_endianess endian, XXH_alignment align)
+{
+ if (align==XXH_unaligned)
+ return endian==XXH_littleEndian ? XXH_read64(ptr) : XXH_swap64(XXH_read64(ptr));
+ else
+ return endian==XXH_littleEndian ? *(const U64*)ptr : XXH_swap64(*(const U64*)ptr);
+}
+
+FORCE_INLINE_TEMPLATE U64 XXH_readLE64(const void* ptr, XXH_endianess endian)
+{
+ return XXH_readLE64_align(ptr, endian, XXH_unaligned);
+}
+
+static U64 XXH_readBE64(const void* ptr)
+{
+ return XXH_CPU_LITTLE_ENDIAN ? XXH_swap64(XXH_read64(ptr)) : XXH_read64(ptr);
+}
+
+
+/* *************************************
+* Macros
+***************************************/
+#define XXH_STATIC_ASSERT(c) { enum { XXH_static_assert = 1/(int)(!!(c)) }; } /* use only *after* variable declarations */
+
+
+/* *************************************
+* Constants
+***************************************/
+static const U32 PRIME32_1 = 2654435761U;
+static const U32 PRIME32_2 = 2246822519U;
+static const U32 PRIME32_3 = 3266489917U;
+static const U32 PRIME32_4 = 668265263U;
+static const U32 PRIME32_5 = 374761393U;
+
+static const U64 PRIME64_1 = 11400714785074694791ULL;
+static const U64 PRIME64_2 = 14029467366897019727ULL;
+static const U64 PRIME64_3 = 1609587929392839161ULL;
+static const U64 PRIME64_4 = 9650029242287828579ULL;
+static const U64 PRIME64_5 = 2870177450012600261ULL;
+
+XXH_PUBLIC_API unsigned XXH_versionNumber (void) { return XXH_VERSION_NUMBER; }
+
+
+/* **************************
+* Utils
+****************************/
+XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* restrict dstState, const XXH32_state_t* restrict srcState)
+{
+ memcpy(dstState, srcState, sizeof(*dstState));
+}
+
+XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t* restrict dstState, const XXH64_state_t* restrict srcState)
+{
+ memcpy(dstState, srcState, sizeof(*dstState));
+}
+
+
+/* ***************************
+* Simple Hash Functions
+*****************************/
+
+static U32 XXH32_round(U32 seed, U32 input)
+{
+ seed += input * PRIME32_2;
+ seed = XXH_rotl32(seed, 13);
+ seed *= PRIME32_1;
+ return seed;
+}
+
+FORCE_INLINE_TEMPLATE U32 XXH32_endian_align(const void* input, size_t len, U32 seed, XXH_endianess endian, XXH_alignment align)
+{
+ const BYTE* p = (const BYTE*)input;
+ const BYTE* bEnd = p + len;
+ U32 h32;
+#define XXH_get32bits(p) XXH_readLE32_align(p, endian, align)
+
+#ifdef XXH_ACCEPT_NULL_INPUT_POINTER
+ if (p==NULL) {
+ len=0;
+ bEnd=p=(const BYTE*)(size_t)16;
+ }
+#endif
+
+ if (len>=16) {
+ const BYTE* const limit = bEnd - 16;
+ U32 v1 = seed + PRIME32_1 + PRIME32_2;
+ U32 v2 = seed + PRIME32_2;
+ U32 v3 = seed + 0;
+ U32 v4 = seed - PRIME32_1;
+
+ do {
+ v1 = XXH32_round(v1, XXH_get32bits(p)); p+=4;
+ v2 = XXH32_round(v2, XXH_get32bits(p)); p+=4;
+ v3 = XXH32_round(v3, XXH_get32bits(p)); p+=4;
+ v4 = XXH32_round(v4, XXH_get32bits(p)); p+=4;
+ } while (p<=limit);
+
+ h32 = XXH_rotl32(v1, 1) + XXH_rotl32(v2, 7) + XXH_rotl32(v3, 12) + XXH_rotl32(v4, 18);
+ } else {
+ h32 = seed + PRIME32_5;
+ }
+
+ h32 += (U32) len;
+
+ while (p+4<=bEnd) {
+ h32 += XXH_get32bits(p) * PRIME32_3;
+ h32 = XXH_rotl32(h32, 17) * PRIME32_4 ;
+ p+=4;
+ }
+
+ while (p<bEnd) {
+ h32 += (*p) * PRIME32_5;
+ h32 = XXH_rotl32(h32, 11) * PRIME32_1 ;
+ p++;
+ }
+
+ h32 ^= h32 >> 15;
+ h32 *= PRIME32_2;
+ h32 ^= h32 >> 13;
+ h32 *= PRIME32_3;
+ h32 ^= h32 >> 16;
+
+ return h32;
+}
+
+
+XXH_PUBLIC_API unsigned int XXH32 (const void* input, size_t len, unsigned int seed)
+{
+#if 0
+ /* Simple version, good for code maintenance, but unfortunately slow for small inputs */
+ XXH32_CREATESTATE_STATIC(state);
+ XXH32_reset(state, seed);
+ XXH32_update(state, input, len);
+ return XXH32_digest(state);
+#else
+ XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
+
+ if (XXH_FORCE_ALIGN_CHECK) {
+ if ((((size_t)input) & 3) == 0) { /* Input is 4-bytes aligned, leverage the speed benefit */
+ if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
+ return XXH32_endian_align(input, len, seed, XXH_littleEndian, XXH_aligned);
+ else
+ return XXH32_endian_align(input, len, seed, XXH_bigEndian, XXH_aligned);
+ } }
+
+ if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
+ return XXH32_endian_align(input, len, seed, XXH_littleEndian, XXH_unaligned);
+ else
+ return XXH32_endian_align(input, len, seed, XXH_bigEndian, XXH_unaligned);
+#endif
+}
+
+
+static U64 XXH64_round(U64 acc, U64 input)
+{
+ acc += input * PRIME64_2;
+ acc = XXH_rotl64(acc, 31);
+ acc *= PRIME64_1;
+ return acc;
+}
+
+static U64 XXH64_mergeRound(U64 acc, U64 val)
+{
+ val = XXH64_round(0, val);
+ acc ^= val;
+ acc = acc * PRIME64_1 + PRIME64_4;
+ return acc;
+}
+
+FORCE_INLINE_TEMPLATE U64 XXH64_endian_align(const void* input, size_t len, U64 seed, XXH_endianess endian, XXH_alignment align)
+{
+ const BYTE* p = (const BYTE*)input;
+ const BYTE* const bEnd = p + len;
+ U64 h64;
+#define XXH_get64bits(p) XXH_readLE64_align(p, endian, align)
+
+#ifdef XXH_ACCEPT_NULL_INPUT_POINTER
+ if (p==NULL) {
+ len=0;
+ bEnd=p=(const BYTE*)(size_t)32;
+ }
+#endif
+
+ if (len>=32) {
+ const BYTE* const limit = bEnd - 32;
+ U64 v1 = seed + PRIME64_1 + PRIME64_2;
+ U64 v2 = seed + PRIME64_2;
+ U64 v3 = seed + 0;
+ U64 v4 = seed - PRIME64_1;
+
+ do {
+ v1 = XXH64_round(v1, XXH_get64bits(p)); p+=8;
+ v2 = XXH64_round(v2, XXH_get64bits(p)); p+=8;
+ v3 = XXH64_round(v3, XXH_get64bits(p)); p+=8;
+ v4 = XXH64_round(v4, XXH_get64bits(p)); p+=8;
+ } while (p<=limit);
+
+ h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + XXH_rotl64(v4, 18);
+ h64 = XXH64_mergeRound(h64, v1);
+ h64 = XXH64_mergeRound(h64, v2);
+ h64 = XXH64_mergeRound(h64, v3);
+ h64 = XXH64_mergeRound(h64, v4);
+
+ } else {
+ h64 = seed + PRIME64_5;
+ }
+
+ h64 += (U64) len;
+
+ while (p+8<=bEnd) {
+ U64 const k1 = XXH64_round(0, XXH_get64bits(p));
+ h64 ^= k1;
+ h64 = XXH_rotl64(h64,27) * PRIME64_1 + PRIME64_4;
+ p+=8;
+ }
+
+ if (p+4<=bEnd) {
+ h64 ^= (U64)(XXH_get32bits(p)) * PRIME64_1;
+ h64 = XXH_rotl64(h64, 23) * PRIME64_2 + PRIME64_3;
+ p+=4;
+ }
+
+ while (p<bEnd) {
+ h64 ^= (*p) * PRIME64_5;
+ h64 = XXH_rotl64(h64, 11) * PRIME64_1;
+ p++;
+ }
+
+ h64 ^= h64 >> 33;
+ h64 *= PRIME64_2;
+ h64 ^= h64 >> 29;
+ h64 *= PRIME64_3;
+ h64 ^= h64 >> 32;
+
+ return h64;
+}
+
+
+XXH_PUBLIC_API unsigned long long XXH64 (const void* input, size_t len, unsigned long long seed)
+{
+#if 0
+ /* Simple version, good for code maintenance, but unfortunately slow for small inputs */
+ XXH64_CREATESTATE_STATIC(state);
+ XXH64_reset(state, seed);
+ XXH64_update(state, input, len);
+ return XXH64_digest(state);
+#else
+ XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
+
+ if (XXH_FORCE_ALIGN_CHECK) {
+ if ((((size_t)input) & 7)==0) { /* Input is aligned, let's leverage the speed advantage */
+ if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
+ return XXH64_endian_align(input, len, seed, XXH_littleEndian, XXH_aligned);
+ else
+ return XXH64_endian_align(input, len, seed, XXH_bigEndian, XXH_aligned);
+ } }
+
+ if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
+ return XXH64_endian_align(input, len, seed, XXH_littleEndian, XXH_unaligned);
+ else
+ return XXH64_endian_align(input, len, seed, XXH_bigEndian, XXH_unaligned);
+#endif
+}
+
+
+/* **************************************************
+* Advanced Hash Functions
+****************************************************/
+
+XXH_PUBLIC_API XXH32_state_t* XXH32_createState(void)
+{
+ return (XXH32_state_t*)XXH_malloc(sizeof(XXH32_state_t));
+}
+XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr)
+{
+ XXH_free(statePtr);
+ return XXH_OK;
+}
+
+XXH_PUBLIC_API XXH64_state_t* XXH64_createState(void)
+{
+ return (XXH64_state_t*)XXH_malloc(sizeof(XXH64_state_t));
+}
+XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr)
+{
+ XXH_free(statePtr);
+ return XXH_OK;
+}
+
+
+/*** Hash feed ***/
+
+XXH_PUBLIC_API XXH_errorcode XXH32_reset(XXH32_state_t* statePtr, unsigned int seed)
+{
+ XXH32_state_t state; /* using a local state to memcpy() in order to avoid strict-aliasing warnings */
+ memset(&state, 0, sizeof(state)-4); /* do not write into reserved, for future removal */
+ state.v1 = seed + PRIME32_1 + PRIME32_2;
+ state.v2 = seed + PRIME32_2;
+ state.v3 = seed + 0;
+ state.v4 = seed - PRIME32_1;
+ memcpy(statePtr, &state, sizeof(state));
+ return XXH_OK;
+}
+
+
+XXH_PUBLIC_API XXH_errorcode XXH64_reset(XXH64_state_t* statePtr, unsigned long long seed)
+{
+ XXH64_state_t state; /* using a local state to memcpy() in order to avoid strict-aliasing warnings */
+ memset(&state, 0, sizeof(state)-8); /* do not write into reserved, for future removal */
+ state.v1 = seed + PRIME64_1 + PRIME64_2;
+ state.v2 = seed + PRIME64_2;
+ state.v3 = seed + 0;
+ state.v4 = seed - PRIME64_1;
+ memcpy(statePtr, &state, sizeof(state));
+ return XXH_OK;
+}
+
+
+FORCE_INLINE_TEMPLATE XXH_errorcode XXH32_update_endian (XXH32_state_t* state, const void* input, size_t len, XXH_endianess endian)
+{
+ const BYTE* p = (const BYTE*)input;
+ const BYTE* const bEnd = p + len;
+
+#ifdef XXH_ACCEPT_NULL_INPUT_POINTER
+ if (input==NULL) return XXH_ERROR;
+#endif
+
+ state->total_len_32 += (unsigned)len;
+ state->large_len |= (len>=16) | (state->total_len_32>=16);
+
+ if (state->memsize + len < 16) { /* fill in tmp buffer */
+ XXH_memcpy((BYTE*)(state->mem32) + state->memsize, input, len);
+ state->memsize += (unsigned)len;
+ return XXH_OK;
+ }
+
+ if (state->memsize) { /* some data left from previous update */
+ XXH_memcpy((BYTE*)(state->mem32) + state->memsize, input, 16-state->memsize);
+ { const U32* p32 = state->mem32;
+ state->v1 = XXH32_round(state->v1, XXH_readLE32(p32, endian)); p32++;
+ state->v2 = XXH32_round(state->v2, XXH_readLE32(p32, endian)); p32++;
+ state->v3 = XXH32_round(state->v3, XXH_readLE32(p32, endian)); p32++;
+ state->v4 = XXH32_round(state->v4, XXH_readLE32(p32, endian)); p32++;
+ }
+ p += 16-state->memsize;
+ state->memsize = 0;
+ }
+
+ if (p <= bEnd-16) {
+ const BYTE* const limit = bEnd - 16;
+ U32 v1 = state->v1;
+ U32 v2 = state->v2;
+ U32 v3 = state->v3;
+ U32 v4 = state->v4;
+
+ do {
+ v1 = XXH32_round(v1, XXH_readLE32(p, endian)); p+=4;
+ v2 = XXH32_round(v2, XXH_readLE32(p, endian)); p+=4;
+ v3 = XXH32_round(v3, XXH_readLE32(p, endian)); p+=4;
+ v4 = XXH32_round(v4, XXH_readLE32(p, endian)); p+=4;
+ } while (p<=limit);
+
+ state->v1 = v1;
+ state->v2 = v2;
+ state->v3 = v3;
+ state->v4 = v4;
+ }
+
+ if (p < bEnd) {
+ XXH_memcpy(state->mem32, p, (size_t)(bEnd-p));
+ state->memsize = (unsigned)(bEnd-p);
+ }
+
+ return XXH_OK;
+}
+
+XXH_PUBLIC_API XXH_errorcode XXH32_update (XXH32_state_t* state_in, const void* input, size_t len)
+{
+ XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
+
+ if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
+ return XXH32_update_endian(state_in, input, len, XXH_littleEndian);
+ else
+ return XXH32_update_endian(state_in, input, len, XXH_bigEndian);
+}
+
+
+
+FORCE_INLINE_TEMPLATE U32 XXH32_digest_endian (const XXH32_state_t* state, XXH_endianess endian)
+{
+ const BYTE * p = (const BYTE*)state->mem32;
+ const BYTE* const bEnd = (const BYTE*)(state->mem32) + state->memsize;
+ U32 h32;
+
+ if (state->large_len) {
+ h32 = XXH_rotl32(state->v1, 1) + XXH_rotl32(state->v2, 7) + XXH_rotl32(state->v3, 12) + XXH_rotl32(state->v4, 18);
+ } else {
+ h32 = state->v3 /* == seed */ + PRIME32_5;
+ }
+
+ h32 += state->total_len_32;
+
+ while (p+4<=bEnd) {
+ h32 += XXH_readLE32(p, endian) * PRIME32_3;
+ h32 = XXH_rotl32(h32, 17) * PRIME32_4;
+ p+=4;
+ }
+
+ while (p<bEnd) {
+ h32 += (*p) * PRIME32_5;
+ h32 = XXH_rotl32(h32, 11) * PRIME32_1;
+ p++;
+ }
+
+ h32 ^= h32 >> 15;
+ h32 *= PRIME32_2;
+ h32 ^= h32 >> 13;
+ h32 *= PRIME32_3;
+ h32 ^= h32 >> 16;
+
+ return h32;
+}
+
+
+XXH_PUBLIC_API unsigned int XXH32_digest (const XXH32_state_t* state_in)
+{
+ XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
+
+ if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
+ return XXH32_digest_endian(state_in, XXH_littleEndian);
+ else
+ return XXH32_digest_endian(state_in, XXH_bigEndian);
+}
+
+
+
+/* **** XXH64 **** */
+
+FORCE_INLINE_TEMPLATE XXH_errorcode XXH64_update_endian (XXH64_state_t* state, const void* input, size_t len, XXH_endianess endian)
+{
+ const BYTE* p = (const BYTE*)input;
+ const BYTE* const bEnd = p + len;
+
+#ifdef XXH_ACCEPT_NULL_INPUT_POINTER
+ if (input==NULL) return XXH_ERROR;
+#endif
+
+ state->total_len += len;
+
+ if (state->memsize + len < 32) { /* fill in tmp buffer */
+ if (input != NULL) {
+ XXH_memcpy(((BYTE*)state->mem64) + state->memsize, input, len);
+ }
+ state->memsize += (U32)len;
+ return XXH_OK;
+ }
+
+ if (state->memsize) { /* tmp buffer is full */
+ XXH_memcpy(((BYTE*)state->mem64) + state->memsize, input, 32-state->memsize);
+ state->v1 = XXH64_round(state->v1, XXH_readLE64(state->mem64+0, endian));
+ state->v2 = XXH64_round(state->v2, XXH_readLE64(state->mem64+1, endian));
+ state->v3 = XXH64_round(state->v3, XXH_readLE64(state->mem64+2, endian));
+ state->v4 = XXH64_round(state->v4, XXH_readLE64(state->mem64+3, endian));
+ p += 32-state->memsize;
+ state->memsize = 0;
+ }
+
+ if (p+32 <= bEnd) {
+ const BYTE* const limit = bEnd - 32;
+ U64 v1 = state->v1;
+ U64 v2 = state->v2;
+ U64 v3 = state->v3;
+ U64 v4 = state->v4;
+
+ do {
+ v1 = XXH64_round(v1, XXH_readLE64(p, endian)); p+=8;
+ v2 = XXH64_round(v2, XXH_readLE64(p, endian)); p+=8;
+ v3 = XXH64_round(v3, XXH_readLE64(p, endian)); p+=8;
+ v4 = XXH64_round(v4, XXH_readLE64(p, endian)); p+=8;
+ } while (p<=limit);
+
+ state->v1 = v1;
+ state->v2 = v2;
+ state->v3 = v3;
+ state->v4 = v4;
+ }
+
+ if (p < bEnd) {
+ XXH_memcpy(state->mem64, p, (size_t)(bEnd-p));
+ state->memsize = (unsigned)(bEnd-p);
+ }
+
+ return XXH_OK;
+}
+
+XXH_PUBLIC_API XXH_errorcode XXH64_update (XXH64_state_t* state_in, const void* input, size_t len)
+{
+ XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
+
+ if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
+ return XXH64_update_endian(state_in, input, len, XXH_littleEndian);
+ else
+ return XXH64_update_endian(state_in, input, len, XXH_bigEndian);
+}
+
+
+
+FORCE_INLINE_TEMPLATE U64 XXH64_digest_endian (const XXH64_state_t* state, XXH_endianess endian)
+{
+ const BYTE * p = (const BYTE*)state->mem64;
+ const BYTE* const bEnd = (const BYTE*)state->mem64 + state->memsize;
+ U64 h64;
+
+ if (state->total_len >= 32) {
+ U64 const v1 = state->v1;
+ U64 const v2 = state->v2;
+ U64 const v3 = state->v3;
+ U64 const v4 = state->v4;
+
+ h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + XXH_rotl64(v4, 18);
+ h64 = XXH64_mergeRound(h64, v1);
+ h64 = XXH64_mergeRound(h64, v2);
+ h64 = XXH64_mergeRound(h64, v3);
+ h64 = XXH64_mergeRound(h64, v4);
+ } else {
+ h64 = state->v3 + PRIME64_5;
+ }
+
+ h64 += (U64) state->total_len;
+
+ while (p+8<=bEnd) {
+ U64 const k1 = XXH64_round(0, XXH_readLE64(p, endian));
+ h64 ^= k1;
+ h64 = XXH_rotl64(h64,27) * PRIME64_1 + PRIME64_4;
+ p+=8;
+ }
+
+ if (p+4<=bEnd) {
+ h64 ^= (U64)(XXH_readLE32(p, endian)) * PRIME64_1;
+ h64 = XXH_rotl64(h64, 23) * PRIME64_2 + PRIME64_3;
+ p+=4;
+ }
+
+ while (p<bEnd) {
+ h64 ^= (*p) * PRIME64_5;
+ h64 = XXH_rotl64(h64, 11) * PRIME64_1;
+ p++;
+ }
+
+ h64 ^= h64 >> 33;
+ h64 *= PRIME64_2;
+ h64 ^= h64 >> 29;
+ h64 *= PRIME64_3;
+ h64 ^= h64 >> 32;
+
+ return h64;
+}
+
+
+XXH_PUBLIC_API unsigned long long XXH64_digest (const XXH64_state_t* state_in)
+{
+ XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
+
+ if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
+ return XXH64_digest_endian(state_in, XXH_littleEndian);
+ else
+ return XXH64_digest_endian(state_in, XXH_bigEndian);
+}
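+
+/* Usage sketch (illustrative only, not part of upstream xxHash; the function
+ * name and the chunk parameters are hypothetical): hashing data delivered in
+ * several pieces with the streaming state API defined above. */
+#if 0
+static unsigned long long xxh64_streaming_example(const void* chunk1, size_t len1,
+                                                  const void* chunk2, size_t len2)
+{
+    unsigned long long h;
+    XXH64_state_t* const st = XXH64_createState();
+    XXH64_reset(st, 0 /* seed */);
+    XXH64_update(st, chunk1, len1);
+    XXH64_update(st, chunk2, len2);
+    h = XXH64_digest(st);    /* equals one-shot XXH64() over the concatenated input */
+    XXH64_freeState(st);
+    return h;
+}
+#endif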
+
+
+/* **************************
+* Canonical representation
+****************************/
+
+/*! Default XXH result types are basic unsigned 32 and 64 bits.
+* The canonical representation follows the human-readable write convention, i.e. big-endian (most significant byte first).
+* These functions allow transformation of hash result into and from its canonical format.
+* This way, hash values can be written into a file or buffer, and remain comparable across different systems and programs.
+*/
+
+XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash)
+{
+ XXH_STATIC_ASSERT(sizeof(XXH32_canonical_t) == sizeof(XXH32_hash_t));
+ if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap32(hash);
+ memcpy(dst, &hash, sizeof(*dst));
+}
+
+XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t hash)
+{
+ XXH_STATIC_ASSERT(sizeof(XXH64_canonical_t) == sizeof(XXH64_hash_t));
+ if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap64(hash);
+ memcpy(dst, &hash, sizeof(*dst));
+}
+
+XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src)
+{
+ return XXH_readBE32(src);
+}
+
+XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src)
+{
+ return XXH_readBE64(src);
+}
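+
+/* Usage sketch (illustrative only, not part of upstream xxHash; the function
+ * name is hypothetical and <assert.h> is assumed): writing a hash in its
+ * canonical big-endian form and reading it back, which round-trips identically
+ * on any platform. */
+#if 0
+static void xxh64_canonical_example(void)
+{
+    XXH64_canonical_t canon;
+    unsigned long long const h = XXH64("abc", 3, 0);
+    XXH64_canonicalFromHash(&canon, h);            /* canon holds big-endian bytes */
+    assert(XXH64_hashFromCanonical(&canon) == h);  /* same value everywhere */
+}
+#endif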
+/**** ended inlining xxhash.c ****/
+# endif
+
+#endif /* XXH_STATIC_LINKING_ONLY && XXH_STATIC_H_3543687687345 */
+
+
+#if defined (__cplusplus)
+}
+#endif
+/**** ended inlining xxhash.h ****/
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/* ---- static assert (debug) --- */
+#define ZSTD_STATIC_ASSERT(c) DEBUG_STATIC_ASSERT(c)
+#define ZSTD_isError ERR_isError /* for inlining */
+#define FSE_isError ERR_isError
+#define HUF_isError ERR_isError
+
+
+/*-*************************************
+* shared macros
+***************************************/
+#undef MIN
+#undef MAX
+#define MIN(a,b) ((a)<(b) ? (a) : (b))
+#define MAX(a,b) ((a)>(b) ? (a) : (b))
+
+/**
+ * Ignore: this is an internal helper.
+ *
+ * This is a helper function to help force C99-correctness during compilation.
+ * Under strict compilation modes, variadic macro arguments can't be empty.
+ * However, variadic function arguments can be. Using a function therefore lets
+ * us statically check that at least one (string) argument was passed,
+ * independent of the compilation flags.
+ */
+static INLINE_KEYWORD UNUSED_ATTR
+void _force_has_format_string(const char *format, ...) {
+ (void)format;
+}
+
+/**
+ * Ignore: this is an internal helper.
+ *
+ * We want to force this function invocation to be syntactically correct, but
+ * we don't want to force runtime evaluation of its arguments.
+ */
+#define _FORCE_HAS_FORMAT_STRING(...) \
+ if (0) { \
+ _force_has_format_string(__VA_ARGS__); \
+ }
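+
+/* Illustrative note (sketch, not part of the zstd sources): if a caller of the
+ * error macros below omitted the format string, __VA_ARGS__ would expand to
+ * nothing and _FORCE_HAS_FORMAT_STRING(__VA_ARGS__) would become the call
+ * _force_has_format_string(), which fails to compile (too few arguments).
+ * Every caller is therefore forced to pass at least a format string, while the
+ * `if (0)` wrapper guarantees the call never executes at runtime. */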
+
+/**
+ * Return the specified error if the condition evaluates to true.
+ *
+ * In debug modes, prints additional information.
+ * In order to do that (particularly, printing the conditional that failed),
+ * this can't just wrap RETURN_ERROR().
+ */
+#define RETURN_ERROR_IF(cond, err, ...) \
+ if (cond) { \
+ RAWLOG(3, "%s:%d: ERROR!: check %s failed, returning %s", \
+ __FILE__, __LINE__, ZSTD_QUOTE(cond), ZSTD_QUOTE(ERROR(err))); \
+ _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \
+ RAWLOG(3, ": " __VA_ARGS__); \
+ RAWLOG(3, "\n"); \
+ return ERROR(err); \
+ }
+
+/**
+ * Unconditionally return the specified error.
+ *
+ * In debug modes, prints additional information.
+ */
+#define RETURN_ERROR(err, ...) \
+ do { \
+ RAWLOG(3, "%s:%d: ERROR!: unconditional check failed, returning %s", \
+ __FILE__, __LINE__, ZSTD_QUOTE(ERROR(err))); \
+ _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \
+ RAWLOG(3, ": " __VA_ARGS__); \
+ RAWLOG(3, "\n"); \
+ return ERROR(err); \
+ } while(0);
+
+/**
+ * If the provided expression evaluates to an error code, returns that error code.
+ *
+ * In debug modes, prints additional information.
+ */
+#define FORWARD_IF_ERROR(err, ...) \
+ do { \
+ size_t const err_code = (err); \
+ if (ERR_isError(err_code)) { \
+ RAWLOG(3, "%s:%d: ERROR!: forwarding error in %s: %s", \
+ __FILE__, __LINE__, ZSTD_QUOTE(err), ERR_getErrorName(err_code)); \
+ _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \
+ RAWLOG(3, ": " __VA_ARGS__); \
+ RAWLOG(3, "\n"); \
+ return err_code; \
+ } \
+ } while(0);
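+
+/* Usage sketch (illustrative only, not from the zstd sources; exampleCopy and
+ * exampleCaller are hypothetical helpers): how RETURN_ERROR_IF() and
+ * FORWARD_IF_ERROR() are typically combined in size_t-returning functions. */
+#if 0
+static size_t exampleCopy(void* dst, size_t dstCapacity, const void* src, size_t srcSize)
+{
+    RETURN_ERROR_IF(srcSize > dstCapacity, dstSize_tooSmall,
+                    "need %u bytes, have %u", (unsigned)srcSize, (unsigned)dstCapacity);
+    memcpy(dst, src, srcSize);
+    return srcSize;
+}
+
+static size_t exampleCaller(void* dst, size_t dstCapacity, const void* src, size_t srcSize)
+{
+    FORWARD_IF_ERROR(exampleCopy(dst, dstCapacity, src, srcSize), "copy failed");
+    return 0;
+}
+#endif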
+
+
+/*-*************************************
+* Common constants
+***************************************/
+#define ZSTD_OPT_NUM (1<<12)
+
+#define ZSTD_REP_NUM 3 /* number of repcodes */
+#define ZSTD_REP_MOVE (ZSTD_REP_NUM-1)
+static const U32 repStartValue[ZSTD_REP_NUM] = { 1, 4, 8 };
+
+#define KB *(1 <<10)
+#define MB *(1 <<20)
+#define GB *(1U<<30)
+
+#define BIT7 128
+#define BIT6 64
+#define BIT5 32
+#define BIT4 16
+#define BIT1 2
+#define BIT0 1
+
+#define ZSTD_WINDOWLOG_ABSOLUTEMIN 10
+static const size_t ZSTD_fcs_fieldSize[4] = { 0, 2, 4, 8 };
+static const size_t ZSTD_did_fieldSize[4] = { 0, 1, 2, 4 };
+
+#define ZSTD_FRAMEIDSIZE 4 /* magic number size */
+
+#define ZSTD_BLOCKHEADERSIZE 3   /* the C standard doesn't allow a `static const` variable to be initialized from another `static const` variable */
+static const size_t ZSTD_blockHeaderSize = ZSTD_BLOCKHEADERSIZE;
+typedef enum { bt_raw, bt_rle, bt_compressed, bt_reserved } blockType_e;
+
+#define ZSTD_FRAMECHECKSUMSIZE 4
+
+#define MIN_SEQUENCES_SIZE 1 /* nbSeq==0 */
+#define MIN_CBLOCK_SIZE (1 /*litCSize*/ + 1 /* RLE or RAW */ + MIN_SEQUENCES_SIZE /* nbSeq==0 */) /* for a non-null block */
+
+#define HufLog 12
+typedef enum { set_basic, set_rle, set_compressed, set_repeat } symbolEncodingType_e;
+
+#define LONGNBSEQ 0x7F00
+
+#define MINMATCH 3
+
+#define Litbits 8
+#define MaxLit ((1<<Litbits) - 1)
+#define MaxML 52
+#define MaxLL 35
+#define DefaultMaxOff 28
+#define MaxOff 31
+#define MaxSeq MAX(MaxLL, MaxML) /* Assumption : MaxOff < MaxLL,MaxML */
+#define MLFSELog 9
+#define LLFSELog 9
+#define OffFSELog 8
+#define MaxFSELog MAX(MAX(MLFSELog, LLFSELog), OffFSELog)
+
+static const U32 LL_bits[MaxLL+1] = { 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 1, 1, 1, 1, 2, 2, 3, 3,
+ 4, 6, 7, 8, 9,10,11,12,
+ 13,14,15,16 };
+static const S16 LL_defaultNorm[MaxLL+1] = { 4, 3, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 1, 1, 1,
+ 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 3, 2, 1, 1, 1, 1, 1,
+ -1,-1,-1,-1 };
+#define LL_DEFAULTNORMLOG 6 /* for static allocation */
+static const U32 LL_defaultNormLog = LL_DEFAULTNORMLOG;
+
+static const U32 ML_bits[MaxML+1] = { 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 1, 1, 1, 1, 2, 2, 3, 3,
+ 4, 4, 5, 7, 8, 9,10,11,
+ 12,13,14,15,16 };
+static const S16 ML_defaultNorm[MaxML+1] = { 1, 4, 3, 2, 2, 2, 2, 2,
+ 2, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1,-1,-1,
+ -1,-1,-1,-1,-1 };
+#define ML_DEFAULTNORMLOG 6 /* for static allocation */
+static const U32 ML_defaultNormLog = ML_DEFAULTNORMLOG;
+
+static const S16 OF_defaultNorm[DefaultMaxOff+1] = { 1, 1, 1, 1, 1, 1, 2, 2,
+ 2, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1,
+ -1,-1,-1,-1,-1 };
+#define OF_DEFAULTNORMLOG 5 /* for static allocation */
+static const U32 OF_defaultNormLog = OF_DEFAULTNORMLOG;
+
+
+/*-*******************************************
+* Shared functions to include for inlining
+*********************************************/
+static void ZSTD_copy8(void* dst, const void* src) {
+#if !defined(ZSTD_NO_INTRINSICS) && defined(__ARM_NEON)
+ vst1_u8((uint8_t*)dst, vld1_u8((const uint8_t*)src));
+#else
+ memcpy(dst, src, 8);
+#endif
+}
+
+#define COPY8(d,s) { ZSTD_copy8(d,s); d+=8; s+=8; }
+static void ZSTD_copy16(void* dst, const void* src) {
+#if !defined(ZSTD_NO_INTRINSICS) && defined(__ARM_NEON)
+ vst1q_u8((uint8_t*)dst, vld1q_u8((const uint8_t*)src));
+#else
+ memcpy(dst, src, 16);
+#endif
+}
+#define COPY16(d,s) { ZSTD_copy16(d,s); d+=16; s+=16; }
+
+#define WILDCOPY_OVERLENGTH 32
+#define WILDCOPY_VECLEN 16
+
+typedef enum {
+ ZSTD_no_overlap,
+ ZSTD_overlap_src_before_dst
+ /* ZSTD_overlap_dst_before_src, */
+} ZSTD_overlap_e;
+
+/*! ZSTD_wildcopy() :
+ * Custom version of memcpy(); may over-read/over-write up to WILDCOPY_OVERLENGTH bytes beyond the requested length (it copies at least one chunk even when length==0)
+ * @param ovtype controls the overlap detection
+ * - ZSTD_no_overlap: The source and destination are guaranteed to be at least WILDCOPY_VECLEN bytes apart.
+ * - ZSTD_overlap_src_before_dst: The src and dst may overlap, but they MUST be at least 8 bytes apart.
+ * The src buffer must be before the dst buffer.
+ */
+MEM_STATIC FORCE_INLINE_ATTR
+void ZSTD_wildcopy(void* dst, const void* src, ptrdiff_t length, ZSTD_overlap_e const ovtype)
+{
+ ptrdiff_t diff = (BYTE*)dst - (const BYTE*)src;
+ const BYTE* ip = (const BYTE*)src;
+ BYTE* op = (BYTE*)dst;
+ BYTE* const oend = op + length;
+
+ assert(diff >= 8 || (ovtype == ZSTD_no_overlap && diff <= -WILDCOPY_VECLEN));
+
+ if (ovtype == ZSTD_overlap_src_before_dst && diff < WILDCOPY_VECLEN) {
+ /* Handle short offset copies. */
+ do {
+ COPY8(op, ip)
+ } while (op < oend);
+ } else {
+ assert(diff >= WILDCOPY_VECLEN || diff <= -WILDCOPY_VECLEN);
+ /* Separate out the first COPY16() call because the copy length is
+ * almost certain to be short, so the branches have different
+ * probabilities. Since it is almost certain to be short, only do
+ * one COPY16() in the first call. Then, do two calls per loop since
+ * at that point it is more likely to have a high trip count.
+ */
+#ifndef __aarch64__
+ do {
+ COPY16(op, ip);
+ }
+ while (op < oend);
+#else
+ COPY16(op, ip);
+ if (op >= oend) return;
+ do {
+ COPY16(op, ip);
+ COPY16(op, ip);
+ }
+ while (op < oend);
+#endif
+ }
+}
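+
+/* Usage sketch (illustrative only, not part of the zstd sources; the function
+ * name and buffers are hypothetical): the destination is assumed to provide
+ * WILDCOPY_OVERLENGTH bytes of slack past `litLength`, because the copy above
+ * proceeds in 16-byte steps and may run past the requested length. */
+#if 0
+static void wildcopy_example(BYTE* dst, const BYTE* lit, size_t litLength)
+{
+    /* dst must have litLength + WILDCOPY_OVERLENGTH writable bytes,
+     * and lit must sit at least WILDCOPY_VECLEN bytes away from dst. */
+    ZSTD_wildcopy(dst, lit, (ptrdiff_t)litLength, ZSTD_no_overlap);
+}
+#endif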
+
+MEM_STATIC size_t ZSTD_limitCopy(void* dst, size_t dstCapacity, const void* src, size_t srcSize)
+{
+ size_t const length = MIN(dstCapacity, srcSize);
+ if (length > 0) {
+ memcpy(dst, src, length);
+ }
+ return length;
+}
+
+/* define "workspace is too large" as this number of times larger than needed */
+#define ZSTD_WORKSPACETOOLARGE_FACTOR 3
+
+/* when the workspace has been too large
+ * for at least this many consecutive uses,
+ * the context's memory usage is considered wasteful,
+ * because it is sized for a worst-case scenario that rarely happens;
+ * in that case, resize it down to free some memory */
+#define ZSTD_WORKSPACETOOLARGE_MAXDURATION 128
+
+
+/*-*******************************************
+* Private declarations
+*********************************************/
+typedef struct seqDef_s {
+ U32 offset;
+ U16 litLength;
+ U16 matchLength;
+} seqDef;
+
+typedef struct {
+ seqDef* sequencesStart;
+ seqDef* sequences;
+ BYTE* litStart;
+ BYTE* lit;
+ BYTE* llCode;
+ BYTE* mlCode;
+ BYTE* ofCode;
+ size_t maxNbSeq;
+ size_t maxNbLit;
+ U32 longLengthID; /* 0 == no longLength; 1 == Lit.longLength; 2 == Match.longLength; */
+ U32 longLengthPos;
+} seqStore_t;
+
+typedef struct {
+ U32 litLength;
+ U32 matchLength;
+} ZSTD_sequenceLength;
+
+/**
+ * Returns the ZSTD_sequenceLength for the given sequences. It handles the decoding of long sequences
+ * indicated by longLengthPos and longLengthID, and adds MINMATCH back to matchLength.
+ */
+MEM_STATIC ZSTD_sequenceLength ZSTD_getSequenceLength(seqStore_t const* seqStore, seqDef const* seq)
+{
+ ZSTD_sequenceLength seqLen;
+ seqLen.litLength = seq->litLength;
+ seqLen.matchLength = seq->matchLength + MINMATCH;
+ if (seqStore->longLengthPos == (U32)(seq - seqStore->sequencesStart)) {
+ if (seqStore->longLengthID == 1) {
+ seqLen.litLength += 0xFFFF;
+ }
+ if (seqStore->longLengthID == 2) {
+ seqLen.matchLength += 0xFFFF;
+ }
+ }
+ return seqLen;
+}
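+
+/* Worked example (illustrative only): if seq->matchLength == 120 and this is
+ * the sequence flagged by longLengthPos with longLengthID == 2, the decoded
+ * match length is 120 + MINMATCH + 0xFFFF = 120 + 3 + 65535 = 65658. */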
+
+/**
+ * Contains the compressed frame size and an upper-bound for the decompressed frame size.
+ * Note: before using `compressedSize`, check for errors using ZSTD_isError().
+ *       Similarly, before using `decompressedBound`, check that
+ *       `decompressedBound != ZSTD_CONTENTSIZE_ERROR`.
+ */
+typedef struct {
+ size_t compressedSize;
+ unsigned long long decompressedBound;
+} ZSTD_frameSizeInfo; /* decompress & legacy */
+
+const seqStore_t* ZSTD_getSeqStore(const ZSTD_CCtx* ctx); /* compress & dictBuilder */
+void ZSTD_seqToCodes(const seqStore_t* seqStorePtr); /* compress, dictBuilder, decodeCorpus (shouldn't get its definition from here) */
+
+/* custom memory allocation functions */
+void* ZSTD_malloc(size_t size, ZSTD_customMem customMem);
+void* ZSTD_calloc(size_t size, ZSTD_customMem customMem);
+void ZSTD_free(void* ptr, ZSTD_customMem customMem);
+
+
+MEM_STATIC U32 ZSTD_highbit32(U32 val) /* compress, dictBuilder, decodeCorpus */
+{
+ assert(val != 0);
+ {
+# if defined(_MSC_VER) /* Visual */
+ unsigned long r=0;
+ return _BitScanReverse(&r, val) ? (unsigned)r : 0;
+# elif defined(__GNUC__) && (__GNUC__ >= 3) /* GCC Intrinsic */
+ return __builtin_clz (val) ^ 31;
+# elif defined(__ICCARM__) /* IAR Intrinsic */
+ return 31 - __CLZ(val);
+# else /* Software version */
+ static const U32 DeBruijnClz[32] = { 0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30, 8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31 };
+ U32 v = val;
+ v |= v >> 1;
+ v |= v >> 2;
+ v |= v >> 4;
+ v |= v >> 8;
+ v |= v >> 16;
+ return DeBruijnClz[(v * 0x07C4ACDDU) >> 27];
+# endif
+ }
+}
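+
+/* Worked example (illustrative only): ZSTD_highbit32(1) == 0,
+ * ZSTD_highbit32(32) == 5, ZSTD_highbit32(0xFFFFFFFF) == 31,
+ * i.e. the zero-based position of the highest set bit. */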
+
+
+/* ZSTD_invalidateRepCodes() :
+ * ensures next compression will not use repcodes from previous block.
+ * Note : only works with regular variant;
+ * do not use with extDict variant ! */
+void ZSTD_invalidateRepCodes(ZSTD_CCtx* cctx); /* zstdmt, adaptive_compression (shouldn't get this definition from here) */
+
+
+typedef struct {
+ blockType_e blockType;
+ U32 lastBlock;
+ U32 origSize;
+} blockProperties_t; /* declared here for decompress and fullbench */
+
+/*! ZSTD_getcBlockSize() :
+ * Provides the size of compressed block from block header `src` */
+/* Used by: decompress, fullbench (does not get its definition from here) */
+size_t ZSTD_getcBlockSize(const void* src, size_t srcSize,
+ blockProperties_t* bpPtr);
+
+/*! ZSTD_decodeSeqHeaders() :
+ * decode sequence header from src */
+/* Used by: decompress, fullbench (does not get its definition from here) */
+size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,
+ const void* src, size_t srcSize);
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* ZSTD_CCOMMON_H_MODULE */
+/**** ended inlining zstd_internal.h ****/
+/**** start inlining pool.h ****/
+/*
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef POOL_H
+#define POOL_H
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+
+#include <stddef.h> /* size_t */
+#define ZSTD_STATIC_LINKING_ONLY /* ZSTD_customMem */
+/**** skipping file: ../zstd.h ****/
+
+typedef struct POOL_ctx_s POOL_ctx;
+
+/*! POOL_create() :
+ * Create a thread pool with at most `numThreads` threads.
+ * `numThreads` must be at least 1.
+ * The maximum number of queued jobs before blocking is `queueSize`.
+ * @return : POOL_ctx pointer on success, else NULL.
+*/
+POOL_ctx* POOL_create(size_t numThreads, size_t queueSize);
+
+POOL_ctx* POOL_create_advanced(size_t numThreads, size_t queueSize,
+ ZSTD_customMem customMem);
+
+/*! POOL_free() :
+ * Free a thread pool returned by POOL_create().
+ */
+void POOL_free(POOL_ctx* ctx);
+
+/*! POOL_resize() :
+ * Expands or shrinks pool's number of threads.
+ * This is more efficient than releasing + creating a new context,
+ * since it tries to preserve and re-use existing threads.
+ * `numThreads` must be at least 1.
+ * @return : 0 when resize was successful,
+ * !0 (typically 1) if there is an error.
+ * note : only numThreads can be resized, queueSize remains unchanged.
+ */
+int POOL_resize(POOL_ctx* ctx, size_t numThreads);
+
+/*! POOL_sizeof() :
+ * @return threadpool memory usage
+ * note : compatible with NULL (returns 0 in this case)
+ */
+size_t POOL_sizeof(POOL_ctx* ctx);
+
+/*! POOL_function :
+ * The function type that can be added to a thread pool.
+ */
+typedef void (*POOL_function)(void*);
+
+/*! POOL_add() :
+ * Add the job `function(opaque)` to the thread pool. `ctx` must be valid.
+ * Possibly blocks until there is room in the queue.
+ * Note : The function may be executed asynchronously,
+ *        therefore `opaque` must remain valid until the function has completed.
+ */
+void POOL_add(POOL_ctx* ctx, POOL_function function, void* opaque);
+
+
+/*! POOL_tryAdd() :
+ * Add the job `function(opaque)` to thread pool _if_ a worker is available.
+ * Returns immediately even if not (does not block).
+ * @return : 1 if successful, 0 if not.
+ */
+int POOL_tryAdd(POOL_ctx* ctx, POOL_function function, void* opaque);
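+
+/* Usage sketch (illustrative only, not part of the zstd sources; pool_job and
+ * pool_example are hypothetical): creating a pool, queuing work through the
+ * blocking and non-blocking entry points, then tearing the pool down. */
+#if 0
+static void pool_job(void* opaque) { (void)opaque; /* do some work */ }
+
+static int pool_example(void)
+{
+    POOL_ctx* const pool = POOL_create(4 /* threads */, 8 /* max queued jobs */);
+    if (pool == NULL) return 1;
+    POOL_add(pool, pool_job, NULL);            /* may block while the queue is full */
+    (void)POOL_tryAdd(pool, pool_job, NULL);   /* returns 0 immediately if no room */
+    POOL_free(pool);                           /* joins the worker threads */
+    return 0;
+}
+#endif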
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif
+/**** ended inlining pool.h ****/
+
+/* ====== Compiler specifics ====== */
+#if defined(_MSC_VER)
+# pragma warning(disable : 4204) /* disable: C4204: non-constant aggregate initializer */
+#endif
+
+
+#ifdef ZSTD_MULTITHREAD
+
+/**** start inlining threading.h ****/
+/**
+ * Copyright (c) 2016 Tino Reichardt
+ * All rights reserved.
+ *
+ * You can contact the author at:
+ * - zstdmt source repository: https://github.com/mcmilk/zstdmt
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef THREADING_H_938743
+#define THREADING_H_938743
+
+/**** skipping file: debug.h ****/
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+#if defined(ZSTD_MULTITHREAD) && defined(_WIN32)
+
+/**
+ * Windows minimalist Pthread Wrapper, based on :
+ * http://www.cse.wustl.edu/~schmidt/win32-cv-1.html
+ */
+#ifdef WINVER
+# undef WINVER
+#endif
+#define WINVER 0x0600
+
+#ifdef _WIN32_WINNT
+# undef _WIN32_WINNT
+#endif
+#define _WIN32_WINNT 0x0600
+
+#ifndef WIN32_LEAN_AND_MEAN
+# define WIN32_LEAN_AND_MEAN
+#endif
+
+#undef ERROR /* reported already defined on VS 2015 (Rich Geldreich) */
+#include <windows.h>
+#undef ERROR
+#define ERROR(name) ZSTD_ERROR(name)
+
+
+/* mutex */
+#define ZSTD_pthread_mutex_t CRITICAL_SECTION
+#define ZSTD_pthread_mutex_init(a, b) ((void)(b), InitializeCriticalSection((a)), 0)
+#define ZSTD_pthread_mutex_destroy(a) DeleteCriticalSection((a))
+#define ZSTD_pthread_mutex_lock(a) EnterCriticalSection((a))
+#define ZSTD_pthread_mutex_unlock(a) LeaveCriticalSection((a))
+
+/* condition variable */
+#define ZSTD_pthread_cond_t CONDITION_VARIABLE
+#define ZSTD_pthread_cond_init(a, b) ((void)(b), InitializeConditionVariable((a)), 0)
+#define ZSTD_pthread_cond_destroy(a) ((void)(a))
+#define ZSTD_pthread_cond_wait(a, b) SleepConditionVariableCS((a), (b), INFINITE)
+#define ZSTD_pthread_cond_signal(a) WakeConditionVariable((a))
+#define ZSTD_pthread_cond_broadcast(a) WakeAllConditionVariable((a))
+
+/* ZSTD_pthread_create() and ZSTD_pthread_join() */
+typedef struct {
+ HANDLE handle;
+ void* (*start_routine)(void*);
+ void* arg;
+} ZSTD_pthread_t;
+
+int ZSTD_pthread_create(ZSTD_pthread_t* thread, const void* unused,
+ void* (*start_routine) (void*), void* arg);
+
+int ZSTD_pthread_join(ZSTD_pthread_t thread, void** value_ptr);
+
+/**
+ * add here more wrappers as required
+ */
+
+
+#elif defined(ZSTD_MULTITHREAD) /* posix assumed ; need a better detection method */
+/* === POSIX Systems === */
+# include <pthread.h>
+
+#if DEBUGLEVEL < 1
+
+#define ZSTD_pthread_mutex_t pthread_mutex_t
+#define ZSTD_pthread_mutex_init(a, b) pthread_mutex_init((a), (b))
+#define ZSTD_pthread_mutex_destroy(a) pthread_mutex_destroy((a))
+#define ZSTD_pthread_mutex_lock(a) pthread_mutex_lock((a))
+#define ZSTD_pthread_mutex_unlock(a) pthread_mutex_unlock((a))
+
+#define ZSTD_pthread_cond_t pthread_cond_t
+#define ZSTD_pthread_cond_init(a, b) pthread_cond_init((a), (b))
+#define ZSTD_pthread_cond_destroy(a) pthread_cond_destroy((a))
+#define ZSTD_pthread_cond_wait(a, b) pthread_cond_wait((a), (b))
+#define ZSTD_pthread_cond_signal(a) pthread_cond_signal((a))
+#define ZSTD_pthread_cond_broadcast(a) pthread_cond_broadcast((a))
+
+#define ZSTD_pthread_t pthread_t
+#define ZSTD_pthread_create(a, b, c, d) pthread_create((a), (b), (c), (d))
+#define ZSTD_pthread_join(a, b) pthread_join((a),(b))
+
+#else /* DEBUGLEVEL >= 1 */
+
+/* Debug implementation of threading.
+ * In this implementation we use pointers for mutexes and condition variables.
+ * This way, if we forget to init/destroy them the program will crash or ASAN
+ * will report leaks.
+ */
+
+#define ZSTD_pthread_mutex_t pthread_mutex_t*
+int ZSTD_pthread_mutex_init(ZSTD_pthread_mutex_t* mutex, pthread_mutexattr_t const* attr);
+int ZSTD_pthread_mutex_destroy(ZSTD_pthread_mutex_t* mutex);
+#define ZSTD_pthread_mutex_lock(a) pthread_mutex_lock(*(a))
+#define ZSTD_pthread_mutex_unlock(a) pthread_mutex_unlock(*(a))
+
+#define ZSTD_pthread_cond_t pthread_cond_t*
+int ZSTD_pthread_cond_init(ZSTD_pthread_cond_t* cond, pthread_condattr_t const* attr);
+int ZSTD_pthread_cond_destroy(ZSTD_pthread_cond_t* cond);
+#define ZSTD_pthread_cond_wait(a, b) pthread_cond_wait(*(a), *(b))
+#define ZSTD_pthread_cond_signal(a) pthread_cond_signal(*(a))
+#define ZSTD_pthread_cond_broadcast(a) pthread_cond_broadcast(*(a))
+
+#define ZSTD_pthread_t pthread_t
+#define ZSTD_pthread_create(a, b, c, d) pthread_create((a), (b), (c), (d))
+#define ZSTD_pthread_join(a, b) pthread_join((a),(b))
+
+#endif
+
+#else /* ZSTD_MULTITHREAD not defined */
+/* No multithreading support */
+
+typedef int ZSTD_pthread_mutex_t;
+#define ZSTD_pthread_mutex_init(a, b) ((void)(a), (void)(b), 0)
+#define ZSTD_pthread_mutex_destroy(a) ((void)(a))
+#define ZSTD_pthread_mutex_lock(a) ((void)(a))
+#define ZSTD_pthread_mutex_unlock(a) ((void)(a))
+
+typedef int ZSTD_pthread_cond_t;
+#define ZSTD_pthread_cond_init(a, b) ((void)(a), (void)(b), 0)
+#define ZSTD_pthread_cond_destroy(a) ((void)(a))
+#define ZSTD_pthread_cond_wait(a, b) ((void)(a), (void)(b))
+#define ZSTD_pthread_cond_signal(a) ((void)(a))
+#define ZSTD_pthread_cond_broadcast(a) ((void)(a))
+
+/* do not use ZSTD_pthread_t */
+
+#endif /* ZSTD_MULTITHREAD */
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* THREADING_H_938743 */
+/**** ended inlining threading.h ****/
+
+/* A job is a function and an opaque argument */
+typedef struct POOL_job_s {
+ POOL_function function;
+ void *opaque;
+} POOL_job;
+
+struct POOL_ctx_s {
+ ZSTD_customMem customMem;
+ /* Keep track of the threads */
+ ZSTD_pthread_t* threads;
+ size_t threadCapacity;
+ size_t threadLimit;
+
+ /* The queue is a circular buffer */
+ POOL_job *queue;
+ size_t queueHead;
+ size_t queueTail;
+ size_t queueSize;
+
+ /* The number of threads working on jobs */
+ size_t numThreadsBusy;
+ /* Indicates if the queue is empty */
+ int queueEmpty;
+
+ /* The mutex protects the queue */
+ ZSTD_pthread_mutex_t queueMutex;
+ /* Condition variable for pushers to wait on when the queue is full */
+ ZSTD_pthread_cond_t queuePushCond;
+ /* Condition variables for poppers to wait on when the queue is empty */
+ ZSTD_pthread_cond_t queuePopCond;
+ /* Indicates if the queue is shutting down */
+ int shutdown;
+};
+
+/* POOL_thread() :
+ * Work thread for the thread pool.
+ * Waits for jobs and executes them.
+ * @returns : NULL on failure else non-null.
+ */
+static void* POOL_thread(void* opaque) {
+ POOL_ctx* const ctx = (POOL_ctx*)opaque;
+ if (!ctx) { return NULL; }
+ for (;;) {
+ /* Lock the mutex and wait for a non-empty queue or until shutdown */
+ ZSTD_pthread_mutex_lock(&ctx->queueMutex);
+
+ while ( ctx->queueEmpty
+ || (ctx->numThreadsBusy >= ctx->threadLimit) ) {
+ if (ctx->shutdown) {
+            /* even if !queueEmpty (possible when numThreadsBusy >= threadLimit),
+             * a few threads will shut down while the queue is still non-empty,
+             * but enough threads will remain active to finish the queue */
+ ZSTD_pthread_mutex_unlock(&ctx->queueMutex);
+ return opaque;
+ }
+ ZSTD_pthread_cond_wait(&ctx->queuePopCond, &ctx->queueMutex);
+ }
+ /* Pop a job off the queue */
+ { POOL_job const job = ctx->queue[ctx->queueHead];
+ ctx->queueHead = (ctx->queueHead + 1) % ctx->queueSize;
+ ctx->numThreadsBusy++;
+ ctx->queueEmpty = ctx->queueHead == ctx->queueTail;
+ /* Unlock the mutex, signal a pusher, and run the job */
+ ZSTD_pthread_cond_signal(&ctx->queuePushCond);
+ ZSTD_pthread_mutex_unlock(&ctx->queueMutex);
+
+ job.function(job.opaque);
+
+ /* If the intended queue size was 0, signal after finishing job */
+ ZSTD_pthread_mutex_lock(&ctx->queueMutex);
+ ctx->numThreadsBusy--;
+ if (ctx->queueSize == 1) {
+ ZSTD_pthread_cond_signal(&ctx->queuePushCond);
+ }
+ ZSTD_pthread_mutex_unlock(&ctx->queueMutex);
+ }
+ } /* for (;;) */
+ assert(0); /* Unreachable */
+}
+
+POOL_ctx* POOL_create(size_t numThreads, size_t queueSize) {
+ return POOL_create_advanced(numThreads, queueSize, ZSTD_defaultCMem);
+}
+
+POOL_ctx* POOL_create_advanced(size_t numThreads, size_t queueSize,
+ ZSTD_customMem customMem) {
+ POOL_ctx* ctx;
+ /* Check parameters */
+ if (!numThreads) { return NULL; }
+ /* Allocate the context and zero initialize */
+ ctx = (POOL_ctx*)ZSTD_calloc(sizeof(POOL_ctx), customMem);
+ if (!ctx) { return NULL; }
+ /* Initialize the job queue.
+ * It needs one extra space since one space is wasted to differentiate
+ * empty and full queues.
+ */
+ ctx->queueSize = queueSize + 1;
+ ctx->queue = (POOL_job*)ZSTD_malloc(ctx->queueSize * sizeof(POOL_job), customMem);
+ ctx->queueHead = 0;
+ ctx->queueTail = 0;
+ ctx->numThreadsBusy = 0;
+ ctx->queueEmpty = 1;
+ {
+ int error = 0;
+ error |= ZSTD_pthread_mutex_init(&ctx->queueMutex, NULL);
+ error |= ZSTD_pthread_cond_init(&ctx->queuePushCond, NULL);
+ error |= ZSTD_pthread_cond_init(&ctx->queuePopCond, NULL);
+ if (error) { POOL_free(ctx); return NULL; }
+ }
+ ctx->shutdown = 0;
+ /* Allocate space for the thread handles */
+ ctx->threads = (ZSTD_pthread_t*)ZSTD_malloc(numThreads * sizeof(ZSTD_pthread_t), customMem);
+ ctx->threadCapacity = 0;
+ ctx->customMem = customMem;
+ /* Check for errors */
+ if (!ctx->threads || !ctx->queue) { POOL_free(ctx); return NULL; }
+ /* Initialize the threads */
+ { size_t i;
+ for (i = 0; i < numThreads; ++i) {
+ if (ZSTD_pthread_create(&ctx->threads[i], NULL, &POOL_thread, ctx)) {
+ ctx->threadCapacity = i;
+ POOL_free(ctx);
+ return NULL;
+ } }
+ ctx->threadCapacity = numThreads;
+ ctx->threadLimit = numThreads;
+ }
+ return ctx;
+}
+
+/*! POOL_join() :
+ Shutdown the queue, wake any sleeping threads, and join all of the threads.
+*/
+static void POOL_join(POOL_ctx* ctx) {
+ /* Shut down the queue */
+ ZSTD_pthread_mutex_lock(&ctx->queueMutex);
+ ctx->shutdown = 1;
+ ZSTD_pthread_mutex_unlock(&ctx->queueMutex);
+ /* Wake up sleeping threads */
+ ZSTD_pthread_cond_broadcast(&ctx->queuePushCond);
+ ZSTD_pthread_cond_broadcast(&ctx->queuePopCond);
+ /* Join all of the threads */
+ { size_t i;
+ for (i = 0; i < ctx->threadCapacity; ++i) {
+ ZSTD_pthread_join(ctx->threads[i], NULL); /* note : could fail */
+ } }
+}
+
+void POOL_free(POOL_ctx *ctx) {
+ if (!ctx) { return; }
+ POOL_join(ctx);
+ ZSTD_pthread_mutex_destroy(&ctx->queueMutex);
+ ZSTD_pthread_cond_destroy(&ctx->queuePushCond);
+ ZSTD_pthread_cond_destroy(&ctx->queuePopCond);
+ ZSTD_free(ctx->queue, ctx->customMem);
+ ZSTD_free(ctx->threads, ctx->customMem);
+ ZSTD_free(ctx, ctx->customMem);
+}
+
+
+
+size_t POOL_sizeof(POOL_ctx *ctx) {
+ if (ctx==NULL) return 0; /* supports sizeof NULL */
+ return sizeof(*ctx)
+ + ctx->queueSize * sizeof(POOL_job)
+ + ctx->threadCapacity * sizeof(ZSTD_pthread_t);
+}
+
+
+/* @return : 0 on success, 1 on error */
+static int POOL_resize_internal(POOL_ctx* ctx, size_t numThreads)
+{
+ if (numThreads <= ctx->threadCapacity) {
+ if (!numThreads) return 1;
+ ctx->threadLimit = numThreads;
+ return 0;
+ }
+ /* numThreads > threadCapacity */
+ { ZSTD_pthread_t* const threadPool = (ZSTD_pthread_t*)ZSTD_malloc(numThreads * sizeof(ZSTD_pthread_t), ctx->customMem);
+ if (!threadPool) return 1;
+ /* replace existing thread pool */
+ memcpy(threadPool, ctx->threads, ctx->threadCapacity * sizeof(*threadPool));
+ ZSTD_free(ctx->threads, ctx->customMem);
+ ctx->threads = threadPool;
+ /* Initialize additional threads */
+ { size_t threadId;
+ for (threadId = ctx->threadCapacity; threadId < numThreads; ++threadId) {
+ if (ZSTD_pthread_create(&threadPool[threadId], NULL, &POOL_thread, ctx)) {
+ ctx->threadCapacity = threadId;
+ return 1;
+ } }
+ } }
+ /* successfully expanded */
+ ctx->threadCapacity = numThreads;
+ ctx->threadLimit = numThreads;
+ return 0;
+}
+
+/* @return : 0 on success, 1 on error */
+int POOL_resize(POOL_ctx* ctx, size_t numThreads)
+{
+ int result;
+ if (ctx==NULL) return 1;
+ ZSTD_pthread_mutex_lock(&ctx->queueMutex);
+ result = POOL_resize_internal(ctx, numThreads);
+ ZSTD_pthread_cond_broadcast(&ctx->queuePopCond);
+ ZSTD_pthread_mutex_unlock(&ctx->queueMutex);
+ return result;
+}
+
+/**
+ * Returns 1 if the queue is full and 0 otherwise.
+ *
+ * When queueSize is 1 (pool was created with an intended queueSize of 0),
+ * then a queue is empty if there is a thread free _and_ no job is waiting.
+ */
+static int isQueueFull(POOL_ctx const* ctx) {
+ if (ctx->queueSize > 1) {
+ return ctx->queueHead == ((ctx->queueTail + 1) % ctx->queueSize);
+ } else {
+ return (ctx->numThreadsBusy == ctx->threadLimit) ||
+ !ctx->queueEmpty;
+ }
+}
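+
+/* Worked example (illustrative only): with queueSize == 4 (created with an
+ * intended queueSize of 3), queueHead == 0 and queueTail == 3 give
+ * (queueTail + 1) % queueSize == 0 == queueHead, so the queue reports full
+ * while holding 3 jobs; the extra slot is what distinguishes a full queue
+ * from an empty one (queueHead == queueTail). */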
+
+
+static void POOL_add_internal(POOL_ctx* ctx, POOL_function function, void *opaque)
+{
+ POOL_job const job = {function, opaque};
+ assert(ctx != NULL);
+ if (ctx->shutdown) return;
+
+ ctx->queueEmpty = 0;
+ ctx->queue[ctx->queueTail] = job;
+ ctx->queueTail = (ctx->queueTail + 1) % ctx->queueSize;
+ ZSTD_pthread_cond_signal(&ctx->queuePopCond);
+}
+
+void POOL_add(POOL_ctx* ctx, POOL_function function, void* opaque)
+{
+ assert(ctx != NULL);
+ ZSTD_pthread_mutex_lock(&ctx->queueMutex);
+ /* Wait until there is space in the queue for the new job */
+ while (isQueueFull(ctx) && (!ctx->shutdown)) {
+ ZSTD_pthread_cond_wait(&ctx->queuePushCond, &ctx->queueMutex);
+ }
+ POOL_add_internal(ctx, function, opaque);
+ ZSTD_pthread_mutex_unlock(&ctx->queueMutex);
+}
+
+
+int POOL_tryAdd(POOL_ctx* ctx, POOL_function function, void* opaque)
+{
+ assert(ctx != NULL);
+ ZSTD_pthread_mutex_lock(&ctx->queueMutex);
+ if (isQueueFull(ctx)) {
+ ZSTD_pthread_mutex_unlock(&ctx->queueMutex);
+ return 0;
+ }
+ POOL_add_internal(ctx, function, opaque);
+ ZSTD_pthread_mutex_unlock(&ctx->queueMutex);
+ return 1;
+}
+
+
+#else /* ZSTD_MULTITHREAD not defined */
+
+/* ========================== */
+/* No multi-threading support */
+/* ========================== */
+
+
+/* We don't need any data, but if it is empty, malloc() might return NULL. */
+struct POOL_ctx_s {
+ int dummy;
+};
+static POOL_ctx g_ctx;
+
+POOL_ctx* POOL_create(size_t numThreads, size_t queueSize) {
+ return POOL_create_advanced(numThreads, queueSize, ZSTD_defaultCMem);
+}
+
+POOL_ctx* POOL_create_advanced(size_t numThreads, size_t queueSize, ZSTD_customMem customMem) {
+ (void)numThreads;
+ (void)queueSize;
+ (void)customMem;
+ return &g_ctx;
+}
+
+void POOL_free(POOL_ctx* ctx) {
+ assert(!ctx || ctx == &g_ctx);
+ (void)ctx;
+}
+
+int POOL_resize(POOL_ctx* ctx, size_t numThreads) {
+ (void)ctx; (void)numThreads;
+ return 0;
+}
+
+void POOL_add(POOL_ctx* ctx, POOL_function function, void* opaque) {
+ (void)ctx;
+ function(opaque);
+}
+
+int POOL_tryAdd(POOL_ctx* ctx, POOL_function function, void* opaque) {
+ (void)ctx;
+ function(opaque);
+ return 1;
+}
+
+size_t POOL_sizeof(POOL_ctx* ctx) {
+ if (ctx==NULL) return 0; /* supports sizeof NULL */
+ assert(ctx == &g_ctx);
+ return sizeof(*ctx);
+}
+
+#endif /* ZSTD_MULTITHREAD */
+/**** ended inlining common/pool.c ****/
+/**** start inlining common/zstd_common.c ****/
+/*
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+
+
+/*-*************************************
+* Dependencies
+***************************************/
+#include <stdlib.h> /* malloc, calloc, free */
+#include <string.h> /* memset */
+/**** skipping file: error_private.h ****/
+/**** skipping file: zstd_internal.h ****/
+
+
+/*-****************************************
+* Version
+******************************************/
+unsigned ZSTD_versionNumber(void) { return ZSTD_VERSION_NUMBER; }
+
+const char* ZSTD_versionString(void) { return ZSTD_VERSION_STRING; }
+
+
+/*-****************************************
+* ZSTD Error Management
+******************************************/
+#undef ZSTD_isError /* defined within zstd_internal.h */
+/*! ZSTD_isError() :
+ * tells if a return value is an error code
+ * symbol is required for external callers */
+unsigned ZSTD_isError(size_t code) { return ERR_isError(code); }
+
+/*! ZSTD_getErrorName() :
+ * provides error code string from function result (useful for debugging) */
+const char* ZSTD_getErrorName(size_t code) { return ERR_getErrorName(code); }
+
+/*! ZSTD_getError() :
+ * convert a `size_t` function result into a proper ZSTD_errorCode enum */
+ZSTD_ErrorCode ZSTD_getErrorCode(size_t code) { return ERR_getErrorCode(code); }
+
+/*! ZSTD_getErrorString() :
+ * provides error code string from enum */
+const char* ZSTD_getErrorString(ZSTD_ErrorCode code) { return ERR_getErrorString(code); }
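+
+/* Usage sketch (illustrative only, not from the zstd sources; the function name
+ * is hypothetical and <stdio.h> is assumed): the standard pattern for checking
+ * any size_t-returning zstd API with the helpers above. */
+#if 0
+static void zstd_error_example(void* dst, size_t dstCapacity, const void* src, size_t srcSize)
+{
+    size_t const ret = ZSTD_compress(dst, dstCapacity, src, srcSize, 3 /* level */);
+    if (ZSTD_isError(ret))
+        printf("zstd error: %s\n", ZSTD_getErrorName(ret));
+}
+#endif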
+
+
+
+/*=**************************************************************
+* Custom allocator
+****************************************************************/
+void* ZSTD_malloc(size_t size, ZSTD_customMem customMem)
+{
+ if (customMem.customAlloc)
+ return customMem.customAlloc(customMem.opaque, size);
+ return malloc(size);
+}
+
+void* ZSTD_calloc(size_t size, ZSTD_customMem customMem)
+{
+ if (customMem.customAlloc) {
+ /* calloc implemented as malloc+memset;
+ * not as efficient as calloc, but next best guess for custom malloc */
+ void* const ptr = customMem.customAlloc(customMem.opaque, size);
+ memset(ptr, 0, size);
+ return ptr;
+ }
+ return calloc(1, size);
+}
+
+void ZSTD_free(void* ptr, ZSTD_customMem customMem)
+{
+ if (ptr!=NULL) {
+ if (customMem.customFree)
+ customMem.customFree(customMem.opaque, ptr);
+ else
+ free(ptr);
+ }
+}
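+
+/* Usage sketch (illustrative only, not part of the zstd sources; myAlloc, myFree
+ * and myMem are hypothetical): routing allocations through a caller-supplied
+ * allocator by filling in a ZSTD_customMem. */
+#if 0
+static void* myAlloc(void* opaque, size_t size) { (void)opaque; return malloc(size); }
+static void  myFree (void* opaque, void* ptr)   { (void)opaque; free(ptr); }
+static const ZSTD_customMem myMem = { myAlloc, myFree, NULL };
+/* ZSTD_malloc(64, myMem) then calls myAlloc(NULL, 64) instead of malloc(64). */
+#endif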
+/**** ended inlining common/zstd_common.c ****/
+
+/**** start inlining compress/fse_compress.c ****/
+/* ******************************************************************
+ * FSE : Finite State Entropy encoder
+ * Copyright (c) 2013-2020, Yann Collet, Facebook, Inc.
+ *
+ * You can contact the author at :
+ * - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy
+ * - Public forum : https://groups.google.com/forum/#!forum/lz4c
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+****************************************************************** */
+
+/* **************************************************************
+* Includes
+****************************************************************/
+#include <stdlib.h> /* malloc, free, qsort */
+#include <string.h> /* memcpy, memset */
+/**** skipping file: ../common/compiler.h ****/
+/**** skipping file: ../common/mem.h ****/
+/**** skipping file: ../common/debug.h ****/
+/**** start inlining hist.h ****/
+/* ******************************************************************
+ * hist : Histogram functions
+ * part of Finite State Entropy project
+ * Copyright (c) 2013-2020, Yann Collet, Facebook, Inc.
+ *
+ * You can contact the author at :
+ * - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy
+ * - Public forum : https://groups.google.com/forum/#!forum/lz4c
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+****************************************************************** */
+
+/* --- dependencies --- */
+#include <stddef.h> /* size_t */
+
+
+/* --- simple histogram functions --- */
+
+/*! HIST_count():
+ * Provides the precise count of each byte within a table 'count'.
+ * 'count' is a table of unsigned int, of minimum size (*maxSymbolValuePtr+1).
+ * Updates *maxSymbolValuePtr with actual largest symbol value detected.
+ * @return : count of the most frequent symbol (which isn't identified).
+ * or an error code, which can be tested using HIST_isError().
+ * note : if return == srcSize, there is only one symbol.
+ */
+size_t HIST_count(unsigned* count, unsigned* maxSymbolValuePtr,
+ const void* src, size_t srcSize);
+
+unsigned HIST_isError(size_t code); /**< tells if a return value is an error code */
+
+
+/* --- advanced histogram functions --- */
+
+#define HIST_WKSP_SIZE_U32 1024
+#define HIST_WKSP_SIZE (HIST_WKSP_SIZE_U32 * sizeof(unsigned))
+/** HIST_count_wksp() :
+ * Same as HIST_count(), but using an externally provided scratch buffer.
+ * Benefit is this function will use very little stack space.
+ * `workSpace` is a writable buffer which must be 4-bytes aligned,
+ * `workSpaceSize` must be >= HIST_WKSP_SIZE
+ */
+size_t HIST_count_wksp(unsigned* count, unsigned* maxSymbolValuePtr,
+ const void* src, size_t srcSize,
+ void* workSpace, size_t workSpaceSize);
+
+/** HIST_countFast() :
+ * same as HIST_count(), but blindly trusts that all byte values within src are <= *maxSymbolValuePtr.
+ * This function is unsafe, and will segfault if any value within `src` is `> *maxSymbolValuePtr`
+ */
+size_t HIST_countFast(unsigned* count, unsigned* maxSymbolValuePtr,
+ const void* src, size_t srcSize);
+
+/** HIST_countFast_wksp() :
+ * Same as HIST_countFast(), but using an externally provided scratch buffer.
+ * `workSpace` is a writable buffer which must be 4-bytes aligned,
+ * `workSpaceSize` must be >= HIST_WKSP_SIZE
+ */
+size_t HIST_countFast_wksp(unsigned* count, unsigned* maxSymbolValuePtr,
+ const void* src, size_t srcSize,
+ void* workSpace, size_t workSpaceSize);
+
+/*! HIST_count_simple() :
+ * Same as HIST_countFast(), this function is unsafe,
+ * and will segfault if any value within `src` is `> *maxSymbolValuePtr`.
+ * It is also a bit slower for large inputs.
+ * However, it does not need any additional memory (not even on stack).
+ * @return : count of the most frequent symbol.
+ * Note this function doesn't produce any error (i.e. it must succeed).
+ */
+unsigned HIST_count_simple(unsigned* count, unsigned* maxSymbolValuePtr,
+ const void* src, size_t srcSize);
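+
+/* Usage sketch (illustrative only, not from the zstd sources; the function name
+ * and sample buffer are hypothetical): counting byte frequencies with
+ * HIST_count() and checking the result. */
+#if 0
+static void hist_example(const void* sample, size_t sampleSize)
+{
+    unsigned count[256];
+    unsigned maxSymbolValue = 255;  /* in: upper bound; out: largest symbol actually seen */
+    size_t const largest = HIST_count(count, &maxSymbolValue, sample, sampleSize);
+    if (HIST_isError(largest)) { /* handle error */ }
+}
+#endif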
+/**** ended inlining hist.h ****/
+/**** skipping file: ../common/bitstream.h ****/
+#define FSE_STATIC_LINKING_ONLY
+/**** skipping file: ../common/fse.h ****/
+/**** skipping file: ../common/error_private.h ****/
+
+
+/* **************************************************************
+* Error Management
+****************************************************************/
+#define FSE_isError ERR_isError
+
+
+/* **************************************************************
+* Templates
+****************************************************************/
+/*
+ designed to be included
+ for type-specific functions (template emulation in C)
+ Objective is to write these functions only once, for improved maintenance
+*/
+
+/* safety checks */
+#ifndef FSE_FUNCTION_EXTENSION
+# error "FSE_FUNCTION_EXTENSION must be defined"
+#endif
+#ifndef FSE_FUNCTION_TYPE
+# error "FSE_FUNCTION_TYPE must be defined"
+#endif
+
+/* Function names */
+#define FSE_CAT(X,Y) X##Y
+#define FSE_FUNCTION_NAME(X,Y) FSE_CAT(X,Y)
+#define FSE_TYPE_NAME(X,Y) FSE_CAT(X,Y)
+
+
+/* Function templates */
+
+/* FSE_buildCTable_wksp() :
+ * Same as FSE_buildCTable(), but using an externally allocated scratch buffer (`workSpace`).
+ * wkspSize should be sized to handle the worst case, which is `(1<<max_tableLog) * sizeof(FSE_FUNCTION_TYPE)`
+ * workSpace must also be properly aligned with FSE_FUNCTION_TYPE requirements
+ */
+size_t FSE_buildCTable_wksp(FSE_CTable* ct,
+ const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog,
+ void* workSpace, size_t wkspSize)
+{
+ U32 const tableSize = 1 << tableLog;
+ U32 const tableMask = tableSize - 1;
+ void* const ptr = ct;
+ U16* const tableU16 = ( (U16*) ptr) + 2;
+ void* const FSCT = ((U32*)ptr) + 1 /* header */ + (tableLog ? tableSize>>1 : 1) ;
+ FSE_symbolCompressionTransform* const symbolTT = (FSE_symbolCompressionTransform*) (FSCT);
+ U32 const step = FSE_TABLESTEP(tableSize);
+ U32 cumul[FSE_MAX_SYMBOL_VALUE+2];
+
+ FSE_FUNCTION_TYPE* const tableSymbol = (FSE_FUNCTION_TYPE*)workSpace;
+ U32 highThreshold = tableSize-1;
+
+ /* CTable header */
+ if (((size_t)1 << tableLog) * sizeof(FSE_FUNCTION_TYPE) > wkspSize) return ERROR(tableLog_tooLarge);
+ tableU16[-2] = (U16) tableLog;
+ tableU16[-1] = (U16) maxSymbolValue;
+ assert(tableLog < 16); /* required for threshold strategy to work */
+
+ /* For explanations on how to distribute symbol values over the table :
+ * http://fastcompression.blogspot.fr/2014/02/fse-distributing-symbol-values.html */
+
+ #ifdef __clang_analyzer__
+ memset(tableSymbol, 0, sizeof(*tableSymbol) * tableSize); /* useless initialization, just to keep scan-build happy */
+ #endif
+
+ /* symbol start positions */
+ { U32 u;
+ cumul[0] = 0;
+ for (u=1; u <= maxSymbolValue+1; u++) {
+ if (normalizedCounter[u-1]==-1) { /* Low proba symbol */
+ cumul[u] = cumul[u-1] + 1;
+ tableSymbol[highThreshold--] = (FSE_FUNCTION_TYPE)(u-1);
+ } else {
+ cumul[u] = cumul[u-1] + normalizedCounter[u-1];
+ } }
+ cumul[maxSymbolValue+1] = tableSize+1;
+ }
+
+ /* Spread symbols */
+ { U32 position = 0;
+ U32 symbol;
+ for (symbol=0; symbol<=maxSymbolValue; symbol++) {
+ int nbOccurrences;
+ int const freq = normalizedCounter[symbol];
+ for (nbOccurrences=0; nbOccurrences<freq; nbOccurrences++) {
+ tableSymbol[position] = (FSE_FUNCTION_TYPE)symbol;
+ position = (position + step) & tableMask;
+ while (position > highThreshold)
+ position = (position + step) & tableMask; /* Low proba area */
+ } }
+
+ assert(position==0); /* Must have initialized all positions */
+ }
+
+ /* Build table */
+ { U32 u; for (u=0; u<tableSize; u++) {
+ FSE_FUNCTION_TYPE s = tableSymbol[u]; /* note : static analyzer may not understand tableSymbol is properly initialized */
+ tableU16[cumul[s]++] = (U16) (tableSize+u); /* TableU16 : sorted by symbol order; gives next state value */
+ } }
+
+ /* Build Symbol Transformation Table */
+ { unsigned total = 0;
+ unsigned s;
+ for (s=0; s<=maxSymbolValue; s++) {
+ switch (normalizedCounter[s])
+ {
+ case 0:
+ /* filling nonetheless, for compatibility with FSE_getMaxNbBits() */
+ symbolTT[s].deltaNbBits = ((tableLog+1) << 16) - (1<<tableLog);
+ break;
+
+ case -1:
+ case 1:
+ symbolTT[s].deltaNbBits = (tableLog << 16) - (1<<tableLog);
+ symbolTT[s].deltaFindState = total - 1;
+ total ++;
+ break;
+ default :
+ {
+ U32 const maxBitsOut = tableLog - BIT_highbit32 (normalizedCounter[s]-1);
+ U32 const minStatePlus = normalizedCounter[s] << maxBitsOut;
+ symbolTT[s].deltaNbBits = (maxBitsOut << 16) - minStatePlus;
+ symbolTT[s].deltaFindState = total - normalizedCounter[s];
+ total += normalizedCounter[s];
+ } } } }
+
+#if 0 /* debug : symbol costs */
+ DEBUGLOG(5, "\n --- table statistics : ");
+ { U32 symbol;
+ for (symbol=0; symbol<=maxSymbolValue; symbol++) {
+ DEBUGLOG(5, "%3u: w=%3i, maxBits=%u, fracBits=%.2f",
+ symbol, normalizedCounter[symbol],
+ FSE_getMaxNbBits(symbolTT, symbol),
+ (double)FSE_bitCost(symbolTT, tableLog, symbol, 8) / 256);
+ }
+ }
+#endif
+
+ return 0;
+}
+
+
+size_t FSE_buildCTable(FSE_CTable* ct, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog)
+{
+    FSE_FUNCTION_TYPE tableSymbol[FSE_MAX_TABLESIZE];   /* memset() is not necessary, even if the static analyzer complains about it */
+ return FSE_buildCTable_wksp(ct, normalizedCounter, maxSymbolValue, tableLog, tableSymbol, sizeof(tableSymbol));
+}
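+
+/* Illustrative sketch (kept out of the build, like the debug blocks in this
+ * file) : using the workspace variant with statically-sized storage, so no
+ * heap allocation is needed. The example function name is hypothetical. */
+#if 0
+static size_t example_buildCTable_noMalloc(const short* norm,
+                                           unsigned maxSymbolValue, unsigned tableLog)
+{
+    /* CTable and scratch dimensioned for the worst case */
+    FSE_CTable ct[FSE_CTABLE_SIZE_U32(FSE_MAX_TABLELOG, FSE_MAX_SYMBOL_VALUE)];
+    FSE_FUNCTION_TYPE scratch[FSE_MAX_TABLESIZE];   /* >= (1<<tableLog) entries, suitably aligned */
+    size_t const err = FSE_buildCTable_wksp(ct, norm, maxSymbolValue, tableLog,
+                                            scratch, sizeof(scratch));
+    if (FSE_isError(err)) return err;
+    /* ct is now ready for FSE_compress_usingCTable() */
+    return 0;
+}
+#endif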
+
+
+
+#ifndef FSE_COMMONDEFS_ONLY
+
+
+/*-**************************************************************
+* FSE NCount encoding
+****************************************************************/
+size_t FSE_NCountWriteBound(unsigned maxSymbolValue, unsigned tableLog)
+{
+ size_t const maxHeaderSize = (((maxSymbolValue+1) * tableLog) >> 3) + 3;
+ return maxSymbolValue ? maxHeaderSize : FSE_NCOUNTBOUND; /* maxSymbolValue==0 ? use default */
+}
+
+static size_t
+FSE_writeNCount_generic (void* header, size_t headerBufferSize,
+ const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog,
+ unsigned writeIsSafe)
+{
+ BYTE* const ostart = (BYTE*) header;
+ BYTE* out = ostart;
+ BYTE* const oend = ostart + headerBufferSize;
+ int nbBits;
+ const int tableSize = 1 << tableLog;
+ int remaining;
+ int threshold;
+ U32 bitStream = 0;
+ int bitCount = 0;
+ unsigned symbol = 0;
+ unsigned const alphabetSize = maxSymbolValue + 1;
+ int previousIs0 = 0;
+
+ /* Table Size */
+ bitStream += (tableLog-FSE_MIN_TABLELOG) << bitCount;
+ bitCount += 4;
+
+ /* Init */
+ remaining = tableSize+1; /* +1 for extra accuracy */
+ threshold = tableSize;
+ nbBits = tableLog+1;
+
+ while ((symbol < alphabetSize) && (remaining>1)) { /* stops at 1 */
+ if (previousIs0) {
+ unsigned start = symbol;
+ while ((symbol < alphabetSize) && !normalizedCounter[symbol]) symbol++;
+ if (symbol == alphabetSize) break; /* incorrect distribution */
+ while (symbol >= start+24) {
+ start+=24;
+ bitStream += 0xFFFFU << bitCount;
+ if ((!writeIsSafe) && (out > oend-2))
+ return ERROR(dstSize_tooSmall); /* Buffer overflow */
+ out[0] = (BYTE) bitStream;
+ out[1] = (BYTE)(bitStream>>8);
+ out+=2;
+ bitStream>>=16;
+ }
+ while (symbol >= start+3) {
+ start+=3;
+ bitStream += 3 << bitCount;
+ bitCount += 2;
+ }
+ bitStream += (symbol-start) << bitCount;
+ bitCount += 2;
+ if (bitCount>16) {
+ if ((!writeIsSafe) && (out > oend - 2))
+ return ERROR(dstSize_tooSmall); /* Buffer overflow */
+ out[0] = (BYTE)bitStream;
+ out[1] = (BYTE)(bitStream>>8);
+ out += 2;
+ bitStream >>= 16;
+ bitCount -= 16;
+ } }
+ { int count = normalizedCounter[symbol++];
+ int const max = (2*threshold-1) - remaining;
+ remaining -= count < 0 ? -count : count;
+ count++; /* +1 for extra accuracy */
+ if (count>=threshold)
+ count += max; /* [0..max[ [max..threshold[ (...) [threshold+max 2*threshold[ */
+ bitStream += count << bitCount;
+ bitCount += nbBits;
+ bitCount -= (count<max);
+ previousIs0 = (count==1);
+ if (remaining<1) return ERROR(GENERIC);
+ while (remaining<threshold) { nbBits--; threshold>>=1; }
+ }
+ if (bitCount>16) {
+ if ((!writeIsSafe) && (out > oend - 2))
+ return ERROR(dstSize_tooSmall); /* Buffer overflow */
+ out[0] = (BYTE)bitStream;
+ out[1] = (BYTE)(bitStream>>8);
+ out += 2;
+ bitStream >>= 16;
+ bitCount -= 16;
+ } }
+
+ if (remaining != 1)
+ return ERROR(GENERIC); /* incorrect normalized distribution */
+ assert(symbol <= alphabetSize);
+
+ /* flush remaining bitStream */
+ if ((!writeIsSafe) && (out > oend - 2))
+ return ERROR(dstSize_tooSmall); /* Buffer overflow */
+ out[0] = (BYTE)bitStream;
+ out[1] = (BYTE)(bitStream>>8);
+ out+= (bitCount+7) /8;
+
+ return (out-ostart);
+}
+
+
+size_t FSE_writeNCount (void* buffer, size_t bufferSize,
+ const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog)
+{
+ if (tableLog > FSE_MAX_TABLELOG) return ERROR(tableLog_tooLarge); /* Unsupported */
+ if (tableLog < FSE_MIN_TABLELOG) return ERROR(GENERIC); /* Unsupported */
+
+ if (bufferSize < FSE_NCountWriteBound(maxSymbolValue, tableLog))
+ return FSE_writeNCount_generic(buffer, bufferSize, normalizedCounter, maxSymbolValue, tableLog, 0);
+
+ return FSE_writeNCount_generic(buffer, bufferSize, normalizedCounter, maxSymbolValue, tableLog, 1 /* write in buffer is safe */);
+}
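+
+/* Illustrative sketch (not compiled) : emitting the normalized-count header
+ * in front of the compressed payload. Sizing the destination with
+ * FSE_NCountWriteBound() guarantees success and lets FSE_writeNCount() take
+ * its unchecked (writeIsSafe) path. Names are hypothetical. */
+#if 0
+static size_t example_writeHeader(void* dst, size_t dstCapacity,
+                                  const short* norm,
+                                  unsigned maxSymbolValue, unsigned tableLog)
+{
+    size_t const hSize = FSE_writeNCount(dst, dstCapacity, norm, maxSymbolValue, tableLog);
+    if (FSE_isError(hSize)) return hSize;   /* e.g. dstSize_tooSmall */
+    /* the FSE-compressed payload is then appended at (BYTE*)dst + hSize,
+     * so the decoder can rebuild the same table before decoding */
+    return hSize;
+}
+#endif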
+
+
+/*-**************************************************************
+* FSE Compression Code
+****************************************************************/
+
+FSE_CTable* FSE_createCTable (unsigned maxSymbolValue, unsigned tableLog)
+{
+ size_t size;
+ if (tableLog > FSE_TABLELOG_ABSOLUTE_MAX) tableLog = FSE_TABLELOG_ABSOLUTE_MAX;
+ size = FSE_CTABLE_SIZE_U32 (tableLog, maxSymbolValue) * sizeof(U32);
+ return (FSE_CTable*)malloc(size);
+}
+
+void FSE_freeCTable (FSE_CTable* ct) { free(ct); }
+
+/* provides the minimum logSize to safely represent a distribution */
+static unsigned FSE_minTableLog(size_t srcSize, unsigned maxSymbolValue)
+{
+ U32 minBitsSrc = BIT_highbit32((U32)(srcSize)) + 1;
+ U32 minBitsSymbols = BIT_highbit32(maxSymbolValue) + 2;
+ U32 minBits = minBitsSrc < minBitsSymbols ? minBitsSrc : minBitsSymbols;
+ assert(srcSize > 1); /* Not supported, RLE should be used instead */
+ return minBits;
+}
+
+unsigned FSE_optimalTableLog_internal(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue, unsigned minus)
+{
+ U32 maxBitsSrc = BIT_highbit32((U32)(srcSize - 1)) - minus;
+ U32 tableLog = maxTableLog;
+ U32 minBits = FSE_minTableLog(srcSize, maxSymbolValue);
+ assert(srcSize > 1); /* Not supported, RLE should be used instead */
+ if (tableLog==0) tableLog = FSE_DEFAULT_TABLELOG;
+ if (maxBitsSrc < tableLog) tableLog = maxBitsSrc; /* Accuracy can be reduced */
+ if (minBits > tableLog) tableLog = minBits; /* Need a minimum to safely represent all symbol values */
+ if (tableLog < FSE_MIN_TABLELOG) tableLog = FSE_MIN_TABLELOG;
+ if (tableLog > FSE_MAX_TABLELOG) tableLog = FSE_MAX_TABLELOG;
+ return tableLog;
+}
+
+unsigned FSE_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue)
+{
+ return FSE_optimalTableLog_internal(maxTableLog, srcSize, maxSymbolValue, 2);
+}
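+
+/* Worked example (illustrative, not from the original source) :
+ * srcSize=1000, maxSymbolValue=255, maxTableLog=12.
+ * maxBitsSrc = highbit32(999) - 2 = 7 ;
+ * minBits    = min(highbit32(1000)+1, highbit32(255)+2) = min(10,9) = 9.
+ * tableLog starts at 12, drops to 7 (accuracy reduced for a small input),
+ * is raised back to 9 so all symbol values stay representable, and is
+ * finally clamped into [FSE_MIN_TABLELOG, FSE_MAX_TABLELOG] => 9. */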
+
+
+/* Secondary normalization method.
+ To be used when primary method fails. */
+
+static size_t FSE_normalizeM2(short* norm, U32 tableLog, const unsigned* count, size_t total, U32 maxSymbolValue)
+{
+ short const NOT_YET_ASSIGNED = -2;
+ U32 s;
+ U32 distributed = 0;
+ U32 ToDistribute;
+
+ /* Init */
+ U32 const lowThreshold = (U32)(total >> tableLog);
+ U32 lowOne = (U32)((total * 3) >> (tableLog + 1));
+
+ for (s=0; s<=maxSymbolValue; s++) {
+ if (count[s] == 0) {
+ norm[s]=0;
+ continue;
+ }
+ if (count[s] <= lowThreshold) {
+ norm[s] = -1;
+ distributed++;
+ total -= count[s];
+ continue;
+ }
+ if (count[s] <= lowOne) {
+ norm[s] = 1;
+ distributed++;
+ total -= count[s];
+ continue;
+ }
+
+ norm[s]=NOT_YET_ASSIGNED;
+ }
+ ToDistribute = (1 << tableLog) - distributed;
+
+ if (ToDistribute == 0)
+ return 0;
+
+ if ((total / ToDistribute) > lowOne) {
+ /* risk of rounding to zero */
+ lowOne = (U32)((total * 3) / (ToDistribute * 2));
+ for (s=0; s<=maxSymbolValue; s++) {
+ if ((norm[s] == NOT_YET_ASSIGNED) && (count[s] <= lowOne)) {
+ norm[s] = 1;
+ distributed++;
+ total -= count[s];
+ continue;
+ } }
+ ToDistribute = (1 << tableLog) - distributed;
+ }
+
+ if (distributed == maxSymbolValue+1) {
+ /* all values are pretty poor;
+ probably incompressible data (should have already been detected);
+ find max, then give all remaining points to max */
+ U32 maxV = 0, maxC = 0;
+ for (s=0; s<=maxSymbolValue; s++)
+ if (count[s] > maxC) { maxV=s; maxC=count[s]; }
+ norm[maxV] += (short)ToDistribute;
+ return 0;
+ }
+
+ if (total == 0) {
+ /* all of the symbols were low enough for the lowOne or lowThreshold */
+ for (s=0; ToDistribute > 0; s = (s+1)%(maxSymbolValue+1))
+ if (norm[s] > 0) { ToDistribute--; norm[s]++; }
+ return 0;
+ }
+
+ { U64 const vStepLog = 62 - tableLog;
+ U64 const mid = (1ULL << (vStepLog-1)) - 1;
+ U64 const rStep = ((((U64)1<<vStepLog) * ToDistribute) + mid) / total; /* scale on remaining */
+ U64 tmpTotal = mid;
+ for (s=0; s<=maxSymbolValue; s++) {
+ if (norm[s]==NOT_YET_ASSIGNED) {
+ U64 const end = tmpTotal + (count[s] * rStep);
+ U32 const sStart = (U32)(tmpTotal >> vStepLog);
+ U32 const sEnd = (U32)(end >> vStepLog);
+ U32 const weight = sEnd - sStart;
+ if (weight < 1)
+ return ERROR(GENERIC);
+ norm[s] = (short)weight;
+ tmpTotal = end;
+ } } }
+
+ return 0;
+}
+
+
+size_t FSE_normalizeCount (short* normalizedCounter, unsigned tableLog,
+ const unsigned* count, size_t total,
+ unsigned maxSymbolValue)
+{
+ /* Sanity checks */
+ if (tableLog==0) tableLog = FSE_DEFAULT_TABLELOG;
+ if (tableLog < FSE_MIN_TABLELOG) return ERROR(GENERIC); /* Unsupported size */
+ if (tableLog > FSE_MAX_TABLELOG) return ERROR(tableLog_tooLarge); /* Unsupported size */
+ if (tableLog < FSE_minTableLog(total, maxSymbolValue)) return ERROR(GENERIC); /* Too small tableLog, compression potentially impossible */
+
+ { static U32 const rtbTable[] = { 0, 473195, 504333, 520860, 550000, 700000, 750000, 830000 };
+ U64 const scale = 62 - tableLog;
+ U64 const step = ((U64)1<<62) / total; /* <== here, one division ! */
+ U64 const vStep = 1ULL<<(scale-20);
+ int stillToDistribute = 1<<tableLog;
+ unsigned s;
+ unsigned largest=0;
+ short largestP=0;
+ U32 lowThreshold = (U32)(total >> tableLog);
+
+ for (s=0; s<=maxSymbolValue; s++) {
+ if (count[s] == total) return 0; /* rle special case */
+ if (count[s] == 0) { normalizedCounter[s]=0; continue; }
+ if (count[s] <= lowThreshold) {
+ normalizedCounter[s] = -1;
+ stillToDistribute--;
+ } else {
+ short proba = (short)((count[s]*step) >> scale);
+ if (proba<8) {
+ U64 restToBeat = vStep * rtbTable[proba];
+ proba += (count[s]*step) - ((U64)proba<<scale) > restToBeat;
+ }
+ if (proba > largestP) { largestP=proba; largest=s; }
+ normalizedCounter[s] = proba;
+ stillToDistribute -= proba;
+ } }
+ if (-stillToDistribute >= (normalizedCounter[largest] >> 1)) {
+ /* corner case, need another normalization method */
+ size_t const errorCode = FSE_normalizeM2(normalizedCounter, tableLog, count, total, maxSymbolValue);
+ if (FSE_isError(errorCode)) return errorCode;
+ }
+ else normalizedCounter[largest] += (short)stillToDistribute;
+ }
+
+#if 0
+ { /* Print Table (debug) */
+ U32 s;
+ U32 nTotal = 0;
+ for (s=0; s<=maxSymbolValue; s++)
+ RAWLOG(2, "%3i: %4i \n", s, normalizedCounter[s]);
+ for (s=0; s<=maxSymbolValue; s++)
+ nTotal += abs(normalizedCounter[s]);
+ if (nTotal != (1U<<tableLog))
+ RAWLOG(2, "Warning !!! Total == %u != %u !!!", nTotal, 1U<<tableLog);
+ getchar();
+ }
+#endif
+
+ return tableLog;
+}
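+
+/* Illustrative sketch (not compiled) : the usual steps from histogram to a
+ * normalized distribution, as also performed by FSE_compress_wksp() below.
+ * The example function name is hypothetical. */
+#if 0
+static size_t example_normalize(short* norm, unsigned* tableLogPtr,
+                                const void* src, size_t srcSize)
+{
+    unsigned count[FSE_MAX_SYMBOL_VALUE+1];
+    unsigned maxSymbolValue = FSE_MAX_SYMBOL_VALUE;
+    size_t const maxCount = HIST_count(count, &maxSymbolValue, src, srcSize);
+    if (HIST_isError(maxCount)) return maxCount;
+    /* requires srcSize > 1 ; single-symbol (RLE) inputs are handled separately */
+    *tableLogPtr = FSE_optimalTableLog(0 /* default */, srcSize, maxSymbolValue);
+    /* on success, norm[] sums to 1<<tableLog, each -1 entry counting as 1 */
+    return FSE_normalizeCount(norm, *tableLogPtr, count, srcSize, maxSymbolValue);
+}
+#endif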
+
+
+/* fake FSE_CTable, for raw (uncompressed) input */
+size_t FSE_buildCTable_raw (FSE_CTable* ct, unsigned nbBits)
+{
+ const unsigned tableSize = 1 << nbBits;
+ const unsigned tableMask = tableSize - 1;
+ const unsigned maxSymbolValue = tableMask;
+ void* const ptr = ct;
+ U16* const tableU16 = ( (U16*) ptr) + 2;
+ void* const FSCT = ((U32*)ptr) + 1 /* header */ + (tableSize>>1); /* assumption : tableLog >= 1 */
+ FSE_symbolCompressionTransform* const symbolTT = (FSE_symbolCompressionTransform*) (FSCT);
+ unsigned s;
+
+ /* Sanity checks */
+ if (nbBits < 1) return ERROR(GENERIC); /* min size */
+
+ /* header */
+ tableU16[-2] = (U16) nbBits;
+ tableU16[-1] = (U16) maxSymbolValue;
+
+ /* Build table */
+ for (s=0; s<tableSize; s++)
+ tableU16[s] = (U16)(tableSize + s);
+
+ /* Build Symbol Transformation Table */
+ { const U32 deltaNbBits = (nbBits << 16) - (1 << nbBits);
+ for (s=0; s<=maxSymbolValue; s++) {
+ symbolTT[s].deltaNbBits = deltaNbBits;
+ symbolTT[s].deltaFindState = s-1;
+ } }
+
+ return 0;
+}
+
+/* fake FSE_CTable, for rle input (always same symbol) */
+size_t FSE_buildCTable_rle (FSE_CTable* ct, BYTE symbolValue)
+{
+ void* ptr = ct;
+ U16* tableU16 = ( (U16*) ptr) + 2;
+ void* FSCTptr = (U32*)ptr + 2;
+ FSE_symbolCompressionTransform* symbolTT = (FSE_symbolCompressionTransform*) FSCTptr;
+
+ /* header */
+ tableU16[-2] = (U16) 0;
+ tableU16[-1] = (U16) symbolValue;
+
+ /* Build table */
+ tableU16[0] = 0;
+ tableU16[1] = 0; /* just in case */
+
+ /* Build Symbol Transformation Table */
+ symbolTT[symbolValue].deltaNbBits = 0;
+ symbolTT[symbolValue].deltaFindState = 0;
+
+ return 0;
+}
+
+
+static size_t FSE_compress_usingCTable_generic (void* dst, size_t dstSize,
+ const void* src, size_t srcSize,
+ const FSE_CTable* ct, const unsigned fast)
+{
+ const BYTE* const istart = (const BYTE*) src;
+ const BYTE* const iend = istart + srcSize;
+ const BYTE* ip=iend;
+
+ BIT_CStream_t bitC;
+ FSE_CState_t CState1, CState2;
+
+ /* init */
+ if (srcSize <= 2) return 0;
+ { size_t const initError = BIT_initCStream(&bitC, dst, dstSize);
+ if (FSE_isError(initError)) return 0; /* not enough space available to write a bitstream */ }
+
+#define FSE_FLUSHBITS(s) (fast ? BIT_flushBitsFast(s) : BIT_flushBits(s))
+
+ if (srcSize & 1) {
+ FSE_initCState2(&CState1, ct, *--ip);
+ FSE_initCState2(&CState2, ct, *--ip);
+ FSE_encodeSymbol(&bitC, &CState1, *--ip);
+ FSE_FLUSHBITS(&bitC);
+ } else {
+ FSE_initCState2(&CState2, ct, *--ip);
+ FSE_initCState2(&CState1, ct, *--ip);
+ }
+
+ /* join to mod 4 */
+ srcSize -= 2;
+ if ((sizeof(bitC.bitContainer)*8 > FSE_MAX_TABLELOG*4+7 ) && (srcSize & 2)) { /* test bit 2 */
+ FSE_encodeSymbol(&bitC, &CState2, *--ip);
+ FSE_encodeSymbol(&bitC, &CState1, *--ip);
+ FSE_FLUSHBITS(&bitC);
+ }
+
+ /* 2 or 4 encoding per loop */
+ while ( ip>istart ) {
+
+ FSE_encodeSymbol(&bitC, &CState2, *--ip);
+
+ if (sizeof(bitC.bitContainer)*8 < FSE_MAX_TABLELOG*2+7 ) /* this test must be static */
+ FSE_FLUSHBITS(&bitC);
+
+ FSE_encodeSymbol(&bitC, &CState1, *--ip);
+
+ if (sizeof(bitC.bitContainer)*8 > FSE_MAX_TABLELOG*4+7 ) { /* this test must be static */
+ FSE_encodeSymbol(&bitC, &CState2, *--ip);
+ FSE_encodeSymbol(&bitC, &CState1, *--ip);
+ }
+
+ FSE_FLUSHBITS(&bitC);
+ }
+
+ FSE_flushCState(&bitC, &CState2);
+ FSE_flushCState(&bitC, &CState1);
+ return BIT_closeCStream(&bitC);
+}
+
+size_t FSE_compress_usingCTable (void* dst, size_t dstSize,
+ const void* src, size_t srcSize,
+ const FSE_CTable* ct)
+{
+ unsigned const fast = (dstSize >= FSE_BLOCKBOUND(srcSize));
+
+ if (fast)
+ return FSE_compress_usingCTable_generic(dst, dstSize, src, srcSize, ct, 1);
+ else
+ return FSE_compress_usingCTable_generic(dst, dstSize, src, srcSize, ct, 0);
+}
+
+
+size_t FSE_compressBound(size_t size) { return FSE_COMPRESSBOUND(size); }
+
+/* FSE_compress_wksp() :
+ * Same as FSE_compress2(), but using an externally allocated scratch buffer (`workSpace`).
+ * `wkspSize` must be >= `FSE_WKSP_SIZE_U32(tableLog, maxSymbolValue)` (see the check below).
+ */
+size_t FSE_compress_wksp (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize)
+{
+ BYTE* const ostart = (BYTE*) dst;
+ BYTE* op = ostart;
+ BYTE* const oend = ostart + dstSize;
+
+ unsigned count[FSE_MAX_SYMBOL_VALUE+1];
+ S16 norm[FSE_MAX_SYMBOL_VALUE+1];
+ FSE_CTable* CTable = (FSE_CTable*)workSpace;
+ size_t const CTableSize = FSE_CTABLE_SIZE_U32(tableLog, maxSymbolValue);
+ void* scratchBuffer = (void*)(CTable + CTableSize);
+ size_t const scratchBufferSize = wkspSize - (CTableSize * sizeof(FSE_CTable));
+
+ /* init conditions */
+ if (wkspSize < FSE_WKSP_SIZE_U32(tableLog, maxSymbolValue)) return ERROR(tableLog_tooLarge);
+ if (srcSize <= 1) return 0; /* Not compressible */
+ if (!maxSymbolValue) maxSymbolValue = FSE_MAX_SYMBOL_VALUE;
+ if (!tableLog) tableLog = FSE_DEFAULT_TABLELOG;
+
+ /* Scan input and build symbol stats */
+ { CHECK_V_F(maxCount, HIST_count_wksp(count, &maxSymbolValue, src, srcSize, scratchBuffer, scratchBufferSize) );
+ if (maxCount == srcSize) return 1; /* only a single symbol in src : rle */
+ if (maxCount == 1) return 0; /* each symbol present maximum once => not compressible */
+ if (maxCount < (srcSize >> 7)) return 0; /* Heuristic : not compressible enough */
+ }
+
+ tableLog = FSE_optimalTableLog(tableLog, srcSize, maxSymbolValue);
+ CHECK_F( FSE_normalizeCount(norm, tableLog, count, srcSize, maxSymbolValue) );
+
+ /* Write table description header */
+ { CHECK_V_F(nc_err, FSE_writeNCount(op, oend-op, norm, maxSymbolValue, tableLog) );
+ op += nc_err;
+ }
+
+ /* Compress */
+ CHECK_F( FSE_buildCTable_wksp(CTable, norm, maxSymbolValue, tableLog, scratchBuffer, scratchBufferSize) );
+ { CHECK_V_F(cSize, FSE_compress_usingCTable(op, oend - op, src, srcSize, CTable) );
+ if (cSize == 0) return 0; /* not enough space for compressed data */
+ op += cSize;
+ }
+
+ /* check compressibility */
+ if ( (size_t)(op-ostart) >= srcSize-1 ) return 0;
+
+ return op-ostart;
+}
+
+typedef struct {
+ FSE_CTable CTable_max[FSE_CTABLE_SIZE_U32(FSE_MAX_TABLELOG, FSE_MAX_SYMBOL_VALUE)];
+ BYTE scratchBuffer[1 << FSE_MAX_TABLELOG];
+} fseWkspMax_t;
+
+size_t FSE_compress2 (void* dst, size_t dstCapacity, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog)
+{
+ fseWkspMax_t scratchBuffer;
+ DEBUG_STATIC_ASSERT(sizeof(scratchBuffer) >= FSE_WKSP_SIZE_U32(FSE_MAX_TABLELOG, FSE_MAX_SYMBOL_VALUE)); /* compilation failures here means scratchBuffer is not large enough */
+ if (tableLog > FSE_MAX_TABLELOG) return ERROR(tableLog_tooLarge);
+ return FSE_compress_wksp(dst, dstCapacity, src, srcSize, maxSymbolValue, tableLog, &scratchBuffer, sizeof(scratchBuffer));
+}
+
+size_t FSE_compress (void* dst, size_t dstCapacity, const void* src, size_t srcSize)
+{
+ return FSE_compress2(dst, dstCapacity, src, srcSize, FSE_MAX_SYMBOL_VALUE, FSE_DEFAULT_TABLELOG);
+}
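+
+/* Illustrative sketch (not compiled) : one-shot FSE compression of a block.
+ * Return-value conventions : 0 means "not compressible", 1 means a single
+ * repeated symbol (RLE); the caller then stores the block raw or as RLE.
+ * The example function name is hypothetical. */
+#if 0
+static size_t example_fse_compress(void* dst, size_t dstCapacity,
+                                   const void* src, size_t srcSize)
+{
+    size_t const cSize = FSE_compress(dst, dstCapacity, src, srcSize);
+    if (FSE_isError(cSize)) return cSize;
+    if (cSize <= 1) { /* store src uncompressed (0) or as RLE (1) */ }
+    /* dstCapacity >= FSE_compressBound(srcSize) guarantees the destination
+     * is never the limiting factor */
+    return cSize;
+}
+#endif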
+
+
+#endif /* FSE_COMMONDEFS_ONLY */
+/**** ended inlining compress/fse_compress.c ****/
+/**** start inlining compress/hist.c ****/
+/* ******************************************************************
+ * hist : Histogram functions
+ * part of Finite State Entropy project
+ * Copyright (c) 2013-2020, Yann Collet, Facebook, Inc.
+ *
+ * You can contact the author at :
+ * - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy
+ * - Public forum : https://groups.google.com/forum/#!forum/lz4c
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+****************************************************************** */
+
+/* --- dependencies --- */
+/**** skipping file: ../common/mem.h ****/
+/**** skipping file: ../common/debug.h ****/
+/**** skipping file: ../common/error_private.h ****/
+/**** skipping file: hist.h ****/
+
+
+/* --- Error management --- */
+unsigned HIST_isError(size_t code) { return ERR_isError(code); }
+
+/*-**************************************************************
+ * Histogram functions
+ ****************************************************************/
+unsigned HIST_count_simple(unsigned* count, unsigned* maxSymbolValuePtr,
+ const void* src, size_t srcSize)
+{
+ const BYTE* ip = (const BYTE*)src;
+ const BYTE* const end = ip + srcSize;
+ unsigned maxSymbolValue = *maxSymbolValuePtr;
+ unsigned largestCount=0;
+
+ memset(count, 0, (maxSymbolValue+1) * sizeof(*count));
+ if (srcSize==0) { *maxSymbolValuePtr = 0; return 0; }
+
+ while (ip<end) {
+ assert(*ip <= maxSymbolValue);
+ count[*ip++]++;
+ }
+
+ while (!count[maxSymbolValue]) maxSymbolValue--;
+ *maxSymbolValuePtr = maxSymbolValue;
+
+ { U32 s;
+ for (s=0; s<=maxSymbolValue; s++)
+ if (count[s] > largestCount) largestCount = count[s];
+ }
+
+ return largestCount;
+}
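+
+/* Illustrative sketch (not compiled) : counting byte frequencies with the
+ * simple variant. *maxSymbolValuePtr is in/out : on input, the largest
+ * value the caller guarantees can appear (255 for arbitrary bytes); on
+ * output, the largest value actually present. Names are hypothetical. */
+#if 0
+static unsigned example_histogram(unsigned count[256],
+                                  const void* src, size_t srcSize)
+{
+    unsigned maxSymbolValue = 255;
+    unsigned const largest = HIST_count_simple(count, &maxSymbolValue, src, srcSize);
+    /* largest == srcSize : src is a single repeated byte (RLE candidate)
+     * largest == 1       : every present byte value appears exactly once  */
+    return largest;
+}
+#endif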
+
+typedef enum { trustInput, checkMaxSymbolValue } HIST_checkInput_e;
+
+/* HIST_count_parallel_wksp() :
+ * store histogram into 4 intermediate tables, recombined at the end.
+ * this design makes better use of OoO cpus,
+ * and is noticeably faster when some values are heavily repeated.
+ * But it needs some additional workspace for intermediate tables.
+ * `workSpace` must be a table of at least HIST_WKSP_SIZE_U32 unsigned.
+ * @return : largest histogram frequency,
+ * or an error code (notably when a symbol value larger than *maxSymbolValuePtr is present). */
+static size_t HIST_count_parallel_wksp(
+ unsigned* count, unsigned* maxSymbolValuePtr,
+ const void* source, size_t sourceSize,
+ HIST_checkInput_e check,
+ U32* const workSpace)
+{
+ const BYTE* ip = (const BYTE*)source;
+ const BYTE* const iend = ip+sourceSize;
+ unsigned maxSymbolValue = *maxSymbolValuePtr;
+ unsigned max=0;
+ U32* const Counting1 = workSpace;
+ U32* const Counting2 = Counting1 + 256;
+ U32* const Counting3 = Counting2 + 256;
+ U32* const Counting4 = Counting3 + 256;
+
+ memset(workSpace, 0, 4*256*sizeof(unsigned));
+
+ /* safety checks */
+ if (!sourceSize) {
+ memset(count, 0, maxSymbolValue + 1);
+ *maxSymbolValuePtr = 0;
+ return 0;
+ }
+ if (!maxSymbolValue) maxSymbolValue = 255; /* 0 == default */
+
+ /* by stripes of 16 bytes */
+ { U32 cached = MEM_read32(ip); ip += 4;
+ while (ip < iend-15) {
+ U32 c = cached; cached = MEM_read32(ip); ip += 4;
+ Counting1[(BYTE) c ]++;
+ Counting2[(BYTE)(c>>8) ]++;
+ Counting3[(BYTE)(c>>16)]++;
+ Counting4[ c>>24 ]++;
+ c = cached; cached = MEM_read32(ip); ip += 4;
+ Counting1[(BYTE) c ]++;
+ Counting2[(BYTE)(c>>8) ]++;
+ Counting3[(BYTE)(c>>16)]++;
+ Counting4[ c>>24 ]++;
+ c = cached; cached = MEM_read32(ip); ip += 4;
+ Counting1[(BYTE) c ]++;
+ Counting2[(BYTE)(c>>8) ]++;
+ Counting3[(BYTE)(c>>16)]++;
+ Counting4[ c>>24 ]++;
+ c = cached; cached = MEM_read32(ip); ip += 4;
+ Counting1[(BYTE) c ]++;
+ Counting2[(BYTE)(c>>8) ]++;
+ Counting3[(BYTE)(c>>16)]++;
+ Counting4[ c>>24 ]++;
+ }
+ ip-=4;
+ }
+
+ /* finish last symbols */
+ while (ip<iend) Counting1[*ip++]++;
+
+ if (check) { /* verify stats will fit into destination table */
+ U32 s; for (s=255; s>maxSymbolValue; s--) {
+ Counting1[s] += Counting2[s] + Counting3[s] + Counting4[s];
+ if (Counting1[s]) return ERROR(maxSymbolValue_tooSmall);
+ } }
+
+ { U32 s;
+ if (maxSymbolValue > 255) maxSymbolValue = 255;
+ for (s=0; s<=maxSymbolValue; s++) {
+ count[s] = Counting1[s] + Counting2[s] + Counting3[s] + Counting4[s];
+ if (count[s] > max) max = count[s];
+ } }
+
+ while (!count[maxSymbolValue]) maxSymbolValue--;
+ *maxSymbolValuePtr = maxSymbolValue;
+ return (size_t)max;
+}
+
+/* HIST_countFast_wksp() :
+ * Same as HIST_countFast(), but using an externally provided scratch buffer.
+ * `workSpace` is a writable buffer which must be 4-bytes aligned,
+ * `workSpaceSize` must be >= HIST_WKSP_SIZE
+ */
+size_t HIST_countFast_wksp(unsigned* count, unsigned* maxSymbolValuePtr,
+ const void* source, size_t sourceSize,
+ void* workSpace, size_t workSpaceSize)
+{
+ if (sourceSize < 1500) /* heuristic threshold */
+ return HIST_count_simple(count, maxSymbolValuePtr, source, sourceSize);
+ if ((size_t)workSpace & 3) return ERROR(GENERIC); /* must be aligned on 4-bytes boundaries */
+ if (workSpaceSize < HIST_WKSP_SIZE) return ERROR(workSpace_tooSmall);
+ return HIST_count_parallel_wksp(count, maxSymbolValuePtr, source, sourceSize, trustInput, (U32*)workSpace);
+}
+
+/* fast variant (unsafe : won't check if src contains values beyond count[] limit) */
+size_t HIST_countFast(unsigned* count, unsigned* maxSymbolValuePtr,
+ const void* source, size_t sourceSize)
+{
+ unsigned tmpCounters[HIST_WKSP_SIZE_U32];
+ return HIST_countFast_wksp(count, maxSymbolValuePtr, source, sourceSize, tmpCounters, sizeof(tmpCounters));
+}
+
+/* HIST_count_wksp() :
+ * Same as HIST_count(), but using an externally provided scratch buffer.
+ * `workSpace` must be a table of >= HIST_WKSP_SIZE_U32 unsigned */
+size_t HIST_count_wksp(unsigned* count, unsigned* maxSymbolValuePtr,
+ const void* source, size_t sourceSize,
+ void* workSpace, size_t workSpaceSize)
+{
+ if ((size_t)workSpace & 3) return ERROR(GENERIC); /* must be aligned on 4-bytes boundaries */
+ if (workSpaceSize < HIST_WKSP_SIZE) return ERROR(workSpace_tooSmall);
+ if (*maxSymbolValuePtr < 255)
+ return HIST_count_parallel_wksp(count, maxSymbolValuePtr, source, sourceSize, checkMaxSymbolValue, (U32*)workSpace);
+ *maxSymbolValuePtr = 255;
+ return HIST_countFast_wksp(count, maxSymbolValuePtr, source, sourceSize, workSpace, workSpaceSize);
+}
+
+size_t HIST_count(unsigned* count, unsigned* maxSymbolValuePtr,
+ const void* src, size_t srcSize)
+{
+ unsigned tmpCounters[HIST_WKSP_SIZE_U32];
+ return HIST_count_wksp(count, maxSymbolValuePtr, src, srcSize, tmpCounters, sizeof(tmpCounters));
+}
+/**** ended inlining compress/hist.c ****/
+/**** start inlining compress/huf_compress.c ****/
+/* ******************************************************************
+ * Huffman encoder, part of New Generation Entropy library
+ * Copyright (c) 2013-2020, Yann Collet, Facebook, Inc.
+ *
+ * You can contact the author at :
+ * - FSE+HUF source repository : https://github.com/Cyan4973/FiniteStateEntropy
+ * - Public forum : https://groups.google.com/forum/#!forum/lz4c
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+****************************************************************** */
+
+/* **************************************************************
+* Compiler specifics
+****************************************************************/
+#ifdef _MSC_VER /* Visual Studio */
+# pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */
+#endif
+
+
+/* **************************************************************
+* Includes
+****************************************************************/
+#include <string.h> /* memcpy, memset */
+#include <stdio.h> /* printf (debug) */
+/**** skipping file: ../common/compiler.h ****/
+/**** skipping file: ../common/bitstream.h ****/
+/**** skipping file: hist.h ****/
+#define FSE_STATIC_LINKING_ONLY /* FSE_optimalTableLog_internal */
+/**** skipping file: ../common/fse.h ****/
+#define HUF_STATIC_LINKING_ONLY
+/**** skipping file: ../common/huf.h ****/
+/**** skipping file: ../common/error_private.h ****/
+
+
+/* **************************************************************
+* Error Management
+****************************************************************/
+#define HUF_isError ERR_isError
+#define HUF_STATIC_ASSERT(c) DEBUG_STATIC_ASSERT(c) /* use only *after* variable declarations */
+
+
+/* **************************************************************
+* Utils
+****************************************************************/
+unsigned HUF_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue)
+{
+ return FSE_optimalTableLog_internal(maxTableLog, srcSize, maxSymbolValue, 1);
+}
+
+
+/* *******************************************************
+* HUF : Huffman block compression
+*********************************************************/
+/* HUF_compressWeights() :
+ * Same as FSE_compress(), but dedicated to huff0's weights compression.
+ * The use case needs much less stack memory.
+ * Note : all elements within weightTable are supposed to be <= HUF_TABLELOG_MAX.
+ */
+#define MAX_FSE_TABLELOG_FOR_HUFF_HEADER 6
+static size_t HUF_compressWeights (void* dst, size_t dstSize, const void* weightTable, size_t wtSize)
+{
+ BYTE* const ostart = (BYTE*) dst;
+ BYTE* op = ostart;
+ BYTE* const oend = ostart + dstSize;
+
+ unsigned maxSymbolValue = HUF_TABLELOG_MAX;
+ U32 tableLog = MAX_FSE_TABLELOG_FOR_HUFF_HEADER;
+
+ FSE_CTable CTable[FSE_CTABLE_SIZE_U32(MAX_FSE_TABLELOG_FOR_HUFF_HEADER, HUF_TABLELOG_MAX)];
+ BYTE scratchBuffer[1<<MAX_FSE_TABLELOG_FOR_HUFF_HEADER];
+
+ unsigned count[HUF_TABLELOG_MAX+1];
+ S16 norm[HUF_TABLELOG_MAX+1];
+
+ /* init conditions */
+ if (wtSize <= 1) return 0; /* Not compressible */
+
+ /* Scan input and build symbol stats */
+ { unsigned const maxCount = HIST_count_simple(count, &maxSymbolValue, weightTable, wtSize); /* never fails */
+ if (maxCount == wtSize) return 1; /* only a single symbol in src : rle */
+ if (maxCount == 1) return 0; /* each symbol present maximum once => not compressible */
+ }
+
+ tableLog = FSE_optimalTableLog(tableLog, wtSize, maxSymbolValue);
+ CHECK_F( FSE_normalizeCount(norm, tableLog, count, wtSize, maxSymbolValue) );
+
+ /* Write table description header */
+ { CHECK_V_F(hSize, FSE_writeNCount(op, (size_t)(oend-op), norm, maxSymbolValue, tableLog) );
+ op += hSize;
+ }
+
+ /* Compress */
+ CHECK_F( FSE_buildCTable_wksp(CTable, norm, maxSymbolValue, tableLog, scratchBuffer, sizeof(scratchBuffer)) );
+ { CHECK_V_F(cSize, FSE_compress_usingCTable(op, (size_t)(oend - op), weightTable, wtSize, CTable) );
+ if (cSize == 0) return 0; /* not enough space for compressed data */
+ op += cSize;
+ }
+
+ return (size_t)(op-ostart);
+}
+
+
+struct HUF_CElt_s {
+ U16 val;
+ BYTE nbBits;
+}; /* typedef'd to HUF_CElt within "huf.h" */
+
+/*! HUF_writeCTable() :
+ `CTable` : Huffman tree to save, using huf representation.
+ @return : size of saved CTable */
+size_t HUF_writeCTable (void* dst, size_t maxDstSize,
+ const HUF_CElt* CTable, unsigned maxSymbolValue, unsigned huffLog)
+{
+ BYTE bitsToWeight[HUF_TABLELOG_MAX + 1]; /* precomputed conversion table */
+ BYTE huffWeight[HUF_SYMBOLVALUE_MAX];
+ BYTE* op = (BYTE*)dst;
+ U32 n;
+
+ /* check conditions */
+ if (maxSymbolValue > HUF_SYMBOLVALUE_MAX) return ERROR(maxSymbolValue_tooLarge);
+
+ /* convert to weight */
+ bitsToWeight[0] = 0;
+ for (n=1; n<huffLog+1; n++)
+ bitsToWeight[n] = (BYTE)(huffLog + 1 - n);
+ for (n=0; n<maxSymbolValue; n++)
+ huffWeight[n] = bitsToWeight[CTable[n].nbBits];
+
+ /* attempt weights compression by FSE */
+ { CHECK_V_F(hSize, HUF_compressWeights(op+1, maxDstSize-1, huffWeight, maxSymbolValue) );
+ if ((hSize>1) & (hSize < maxSymbolValue/2)) { /* FSE compressed */
+ op[0] = (BYTE)hSize;
+ return hSize+1;
+ } }
+
+ /* write raw values as 4-bits (max : 15) */
+ if (maxSymbolValue > (256-128)) return ERROR(GENERIC); /* should not happen : likely means source cannot be compressed */
+ if (((maxSymbolValue+1)/2) + 1 > maxDstSize) return ERROR(dstSize_tooSmall); /* not enough space within dst buffer */
+ op[0] = (BYTE)(128 /*special case*/ + (maxSymbolValue-1));
+ huffWeight[maxSymbolValue] = 0; /* to be sure it doesn't cause msan issue in final combination */
+ for (n=0; n<maxSymbolValue; n+=2)
+ op[(n/2)+1] = (BYTE)((huffWeight[n] << 4) + huffWeight[n+1]);
+ return ((maxSymbolValue+1)/2) + 1;
+}
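+
+/* Format note (derived from the code above, informal) : the saved CTable
+ * starts with one header byte. A value < 128 is the size of an
+ * FSE-compressed weight stream that follows; a value >= 128 means the
+ * weights are stored raw, two 4-bit weights per byte, with
+ * (headerByte - 127) weights present and the last symbol's weight left
+ * implicit for the decoder to deduce. */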
+
+
+size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void* src, size_t srcSize, unsigned* hasZeroWeights)
+{
+ BYTE huffWeight[HUF_SYMBOLVALUE_MAX + 1]; /* init not required, even though some static analyzer may complain */
+ U32 rankVal[HUF_TABLELOG_ABSOLUTEMAX + 1]; /* large enough for values from 0 to 16 */
+ U32 tableLog = 0;
+ U32 nbSymbols = 0;
+
+ /* get symbol weights */
+ CHECK_V_F(readSize, HUF_readStats(huffWeight, HUF_SYMBOLVALUE_MAX+1, rankVal, &nbSymbols, &tableLog, src, srcSize));
+
+ /* check result */
+ if (tableLog > HUF_TABLELOG_MAX) return ERROR(tableLog_tooLarge);
+ if (nbSymbols > *maxSymbolValuePtr+1) return ERROR(maxSymbolValue_tooSmall);
+
+ /* Prepare base value per rank */
+ { U32 n, nextRankStart = 0;
+ for (n=1; n<=tableLog; n++) {
+ U32 current = nextRankStart;
+ nextRankStart += (rankVal[n] << (n-1));
+ rankVal[n] = current;
+ } }
+
+ /* fill nbBits */
+ *hasZeroWeights = 0;
+ { U32 n; for (n=0; n<nbSymbols; n++) {
+ const U32 w = huffWeight[n];
+ *hasZeroWeights |= (w == 0);
+ CTable[n].nbBits = (BYTE)(tableLog + 1 - w) & -(w != 0);
+ } }
+
+ /* fill val */
+ { U16 nbPerRank[HUF_TABLELOG_MAX+2] = {0}; /* support w=0=>n=tableLog+1 */
+ U16 valPerRank[HUF_TABLELOG_MAX+2] = {0};
+ { U32 n; for (n=0; n<nbSymbols; n++) nbPerRank[CTable[n].nbBits]++; }
+    /* determine starting value per rank */
+ valPerRank[tableLog+1] = 0; /* for w==0 */
+ { U16 min = 0;
+ U32 n; for (n=tableLog; n>0; n--) { /* start at n=tablelog <-> w=1 */
+ valPerRank[n] = min; /* get starting value within each rank */
+ min += nbPerRank[n];
+ min >>= 1;
+ } }
+ /* assign value within rank, symbol order */
+ { U32 n; for (n=0; n<nbSymbols; n++) CTable[n].val = valPerRank[CTable[n].nbBits]++; }
+ }
+
+ *maxSymbolValuePtr = nbSymbols - 1;
+ return readSize;
+}
+
+U32 HUF_getNbBits(const void* symbolTable, U32 symbolValue)
+{
+ const HUF_CElt* table = (const HUF_CElt*)symbolTable;
+ assert(symbolValue <= HUF_SYMBOLVALUE_MAX);
+ return table[symbolValue].nbBits;
+}
+
+
+typedef struct nodeElt_s {
+ U32 count;
+ U16 parent;
+ BYTE byte;
+ BYTE nbBits;
+} nodeElt;
+
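+/* Informal note (not from the original source) on the accounting in
+ * HUF_setMaxHeight() below : measured in units of 2^-maxNbBits, a code of
+ * length L contributes 2^(maxNbBits-L) to the Kraft sum. Truncating every
+ * over-long code to maxNbBits makes that sum exceed its budget by
+ * `totalCost` units; lengthening some other symbol from (maxNbBits-k) to
+ * (maxNbBits-k+1) gives back 2^(k-1) units, which is exactly what the
+ * rankLast[] loop repays until totalCost reaches 0. */
+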
+static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits)
+{
+ const U32 largestBits = huffNode[lastNonNull].nbBits;
+ if (largestBits <= maxNbBits) return largestBits; /* early exit : no elt > maxNbBits */
+
+ /* there are several too large elements (at least >= 2) */
+ { int totalCost = 0;
+ const U32 baseCost = 1 << (largestBits - maxNbBits);
+ int n = (int)lastNonNull;
+
+ while (huffNode[n].nbBits > maxNbBits) {
+ totalCost += baseCost - (1 << (largestBits - huffNode[n].nbBits));
+ huffNode[n].nbBits = (BYTE)maxNbBits;
+ n --;
+ } /* n stops at huffNode[n].nbBits <= maxNbBits */
+        while (huffNode[n].nbBits == maxNbBits) n--;  /* n ends at the index of the smallest symbol using < maxNbBits */
+
+ /* renorm totalCost */
+ totalCost >>= (largestBits - maxNbBits); /* note : totalCost is necessarily a multiple of baseCost */
+
+ /* repay normalized cost */
+ { U32 const noSymbol = 0xF0F0F0F0;
+ U32 rankLast[HUF_TABLELOG_MAX+2];
+
+ /* Get pos of last (smallest) symbol per rank */
+ memset(rankLast, 0xF0, sizeof(rankLast));
+ { U32 currentNbBits = maxNbBits;
+ int pos;
+ for (pos=n ; pos >= 0; pos--) {
+ if (huffNode[pos].nbBits >= currentNbBits) continue;
+ currentNbBits = huffNode[pos].nbBits; /* < maxNbBits */
+ rankLast[maxNbBits-currentNbBits] = (U32)pos;
+ } }
+
+ while (totalCost > 0) {
+ U32 nBitsToDecrease = BIT_highbit32((U32)totalCost) + 1;
+ for ( ; nBitsToDecrease > 1; nBitsToDecrease--) {
+ U32 const highPos = rankLast[nBitsToDecrease];
+ U32 const lowPos = rankLast[nBitsToDecrease-1];
+ if (highPos == noSymbol) continue;
+ if (lowPos == noSymbol) break;
+ { U32 const highTotal = huffNode[highPos].count;
+ U32 const lowTotal = 2 * huffNode[lowPos].count;
+ if (highTotal <= lowTotal) break;
+ } }
+ /* only triggered when no more rank 1 symbol left => find closest one (note : there is necessarily at least one !) */
+ /* HUF_MAX_TABLELOG test just to please gcc 5+; but it should not be necessary */
+ while ((nBitsToDecrease<=HUF_TABLELOG_MAX) && (rankLast[nBitsToDecrease] == noSymbol))
+ nBitsToDecrease ++;
+ totalCost -= 1 << (nBitsToDecrease-1);
+ if (rankLast[nBitsToDecrease-1] == noSymbol)
+ rankLast[nBitsToDecrease-1] = rankLast[nBitsToDecrease]; /* this rank is no longer empty */
+ huffNode[rankLast[nBitsToDecrease]].nbBits ++;
+ if (rankLast[nBitsToDecrease] == 0) /* special case, reached largest symbol */
+ rankLast[nBitsToDecrease] = noSymbol;
+ else {
+ rankLast[nBitsToDecrease]--;
+ if (huffNode[rankLast[nBitsToDecrease]].nbBits != maxNbBits-nBitsToDecrease)
+ rankLast[nBitsToDecrease] = noSymbol; /* this rank is now empty */
+ } } /* while (totalCost > 0) */
+
+ while (totalCost < 0) { /* Sometimes, cost correction overshoot */
+ if (rankLast[1] == noSymbol) { /* special case : no rank 1 symbol (using maxNbBits-1); let's create one from largest rank 0 (using maxNbBits) */
+ while (huffNode[n].nbBits == maxNbBits) n--;
+ huffNode[n+1].nbBits--;
+ assert(n >= 0);
+ rankLast[1] = (U32)(n+1);
+ totalCost++;
+ continue;
+ }
+ huffNode[ rankLast[1] + 1 ].nbBits--;
+ rankLast[1]++;
+ totalCost ++;
+ } } } /* there are several too large elements (at least >= 2) */
+
+ return maxNbBits;
+}
+
+typedef struct {
+ U32 base;
+ U32 current;
+} rankPos;
+
+typedef nodeElt huffNodeTable[HUF_CTABLE_WORKSPACE_SIZE_U32];
+
+#define RANK_POSITION_TABLE_SIZE 32
+
+typedef struct {
+ huffNodeTable huffNodeTbl;
+ rankPos rankPosition[RANK_POSITION_TABLE_SIZE];
+} HUF_buildCTable_wksp_tables;
+
+static void HUF_sort(nodeElt* huffNode, const unsigned* count, U32 maxSymbolValue, rankPos* rankPosition)
+{
+ U32 n;
+
+ memset(rankPosition, 0, sizeof(*rankPosition) * RANK_POSITION_TABLE_SIZE);
+ for (n=0; n<=maxSymbolValue; n++) {
+ U32 r = BIT_highbit32(count[n] + 1);
+ rankPosition[r].base ++;
+ }
+ for (n=30; n>0; n--) rankPosition[n-1].base += rankPosition[n].base;
+ for (n=0; n<32; n++) rankPosition[n].current = rankPosition[n].base;
+ for (n=0; n<=maxSymbolValue; n++) {
+ U32 const c = count[n];
+ U32 const r = BIT_highbit32(c+1) + 1;
+ U32 pos = rankPosition[r].current++;
+ while ((pos > rankPosition[r].base) && (c > huffNode[pos-1].count)) {
+ huffNode[pos] = huffNode[pos-1];
+ pos--;
+ }
+ huffNode[pos].count = c;
+ huffNode[pos].byte = (BYTE)n;
+ }
+}
+
+
+/** HUF_buildCTable_wksp() :
+ * Same as HUF_buildCTable(), but using externally allocated scratch buffer.
+ * `workSpace` must be aligned on 4-bytes boundaries, and be at least as large as sizeof(HUF_buildCTable_wksp_tables).
+ */
+#define STARTNODE (HUF_SYMBOLVALUE_MAX+1)
+
+size_t HUF_buildCTable_wksp (HUF_CElt* tree, const unsigned* count, U32 maxSymbolValue, U32 maxNbBits, void* workSpace, size_t wkspSize)
+{
+ HUF_buildCTable_wksp_tables* const wksp_tables = (HUF_buildCTable_wksp_tables*)workSpace;
+ nodeElt* const huffNode0 = wksp_tables->huffNodeTbl;
+ nodeElt* const huffNode = huffNode0+1;
+ int nonNullRank;
+ int lowS, lowN;
+ int nodeNb = STARTNODE;
+ int n, nodeRoot;
+
+ /* safety checks */
+ if (((size_t)workSpace & 3) != 0) return ERROR(GENERIC); /* must be aligned on 4-bytes boundaries */
+ if (wkspSize < sizeof(HUF_buildCTable_wksp_tables))
+ return ERROR(workSpace_tooSmall);
+ if (maxNbBits == 0) maxNbBits = HUF_TABLELOG_DEFAULT;
+ if (maxSymbolValue > HUF_SYMBOLVALUE_MAX)
+ return ERROR(maxSymbolValue_tooLarge);
+ memset(huffNode0, 0, sizeof(huffNodeTable));
+
+ /* sort, decreasing order */
+ HUF_sort(huffNode, count, maxSymbolValue, wksp_tables->rankPosition);
+
+ /* init for parents */
+ nonNullRank = (int)maxSymbolValue;
+ while(huffNode[nonNullRank].count == 0) nonNullRank--;
+ lowS = nonNullRank; nodeRoot = nodeNb + lowS - 1; lowN = nodeNb;
+ huffNode[nodeNb].count = huffNode[lowS].count + huffNode[lowS-1].count;
+ huffNode[lowS].parent = huffNode[lowS-1].parent = (U16)nodeNb;
+ nodeNb++; lowS-=2;
+ for (n=nodeNb; n<=nodeRoot; n++) huffNode[n].count = (U32)(1U<<30);
+ huffNode0[0].count = (U32)(1U<<31); /* fake entry, strong barrier */
+
+ /* create parents */
+ while (nodeNb <= nodeRoot) {
+ int const n1 = (huffNode[lowS].count < huffNode[lowN].count) ? lowS-- : lowN++;
+ int const n2 = (huffNode[lowS].count < huffNode[lowN].count) ? lowS-- : lowN++;
+ huffNode[nodeNb].count = huffNode[n1].count + huffNode[n2].count;
+ huffNode[n1].parent = huffNode[n2].parent = (U16)nodeNb;
+ nodeNb++;
+ }
+
+ /* distribute weights (unlimited tree height) */
+ huffNode[nodeRoot].nbBits = 0;
+ for (n=nodeRoot-1; n>=STARTNODE; n--)
+ huffNode[n].nbBits = huffNode[ huffNode[n].parent ].nbBits + 1;
+ for (n=0; n<=nonNullRank; n++)
+ huffNode[n].nbBits = huffNode[ huffNode[n].parent ].nbBits + 1;
+
+ /* enforce maxTableLog */
+ maxNbBits = HUF_setMaxHeight(huffNode, (U32)nonNullRank, maxNbBits);
+
+ /* fill result into tree (val, nbBits) */
+ { U16 nbPerRank[HUF_TABLELOG_MAX+1] = {0};
+ U16 valPerRank[HUF_TABLELOG_MAX+1] = {0};
+ int const alphabetSize = (int)(maxSymbolValue + 1);
+ if (maxNbBits > HUF_TABLELOG_MAX) return ERROR(GENERIC); /* check fit into table */
+ for (n=0; n<=nonNullRank; n++)
+ nbPerRank[huffNode[n].nbBits]++;
+        /* determine starting value per rank */
+ { U16 min = 0;
+ for (n=(int)maxNbBits; n>0; n--) {
+ valPerRank[n] = min; /* get starting value within each rank */
+ min += nbPerRank[n];
+ min >>= 1;
+ } }
+ for (n=0; n<alphabetSize; n++)
+ tree[huffNode[n].byte].nbBits = huffNode[n].nbBits; /* push nbBits per symbol, symbol order */
+ for (n=0; n<alphabetSize; n++)
+ tree[n].val = valPerRank[tree[n].nbBits]++; /* assign value within rank, symbol order */
+ }
+
+ return maxNbBits;
+}
+
+/** HUF_buildCTable() :
+ * @return : maxNbBits
+ * Note : count is used before tree is written, so they can safely overlap
+ */
+size_t HUF_buildCTable (HUF_CElt* tree, const unsigned* count, unsigned maxSymbolValue, unsigned maxNbBits)
+{
+ HUF_buildCTable_wksp_tables workspace;
+ return HUF_buildCTable_wksp(tree, count, maxSymbolValue, maxNbBits, &workspace, sizeof(workspace));
+}
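+
+/* Illustrative sketch (not compiled) : building a Huffman table from a
+ * histogram and using it to encode a single stream. In a real frame the
+ * table would also be serialized with HUF_writeCTable(). The example
+ * function name is hypothetical. */
+#if 0
+static size_t example_huf_single_stream(void* dst, size_t dstCapacity,
+                                        const void* src, size_t srcSize)
+{
+    unsigned count[HUF_SYMBOLVALUE_MAX+1];
+    HUF_CElt cTable[HUF_SYMBOLVALUE_MAX+1];
+    unsigned maxSymbolValue = HUF_SYMBOLVALUE_MAX;
+    size_t maxNbBits;
+    HIST_count_simple(count, &maxSymbolValue, src, srcSize);
+    maxNbBits = HUF_buildCTable(cTable, count, maxSymbolValue, 0 /* default limit */);
+    if (HUF_isError(maxNbBits)) return maxNbBits;
+    /* a return of 0 below means dst was too small for this stream */
+    return HUF_compress1X_usingCTable(dst, dstCapacity, src, srcSize, cTable);
+}
+#endif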
+
+size_t HUF_estimateCompressedSize(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue)
+{
+ size_t nbBits = 0;
+ int s;
+ for (s = 0; s <= (int)maxSymbolValue; ++s) {
+ nbBits += CTable[s].nbBits * count[s];
+ }
+ return nbBits >> 3;
+}
+
+int HUF_validateCTable(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue) {
+ int bad = 0;
+ int s;
+ for (s = 0; s <= (int)maxSymbolValue; ++s) {
+ bad |= (count[s] != 0) & (CTable[s].nbBits == 0);
+ }
+ return !bad;
+}
+
+size_t HUF_compressBound(size_t size) { return HUF_COMPRESSBOUND(size); }
+
+FORCE_INLINE_TEMPLATE void
+HUF_encodeSymbol(BIT_CStream_t* bitCPtr, U32 symbol, const HUF_CElt* CTable)
+{
+ BIT_addBitsFast(bitCPtr, CTable[symbol].val, CTable[symbol].nbBits);
+}
+
+#define HUF_FLUSHBITS(s) BIT_flushBits(s)
+
+#define HUF_FLUSHBITS_1(stream) \
+ if (sizeof((stream)->bitContainer)*8 < HUF_TABLELOG_MAX*2+7) HUF_FLUSHBITS(stream)
+
+#define HUF_FLUSHBITS_2(stream) \
+ if (sizeof((stream)->bitContainer)*8 < HUF_TABLELOG_MAX*4+7) HUF_FLUSHBITS(stream)
+
+FORCE_INLINE_TEMPLATE size_t
+HUF_compress1X_usingCTable_internal_body(void* dst, size_t dstSize,
+ const void* src, size_t srcSize,
+ const HUF_CElt* CTable)
+{
+ const BYTE* ip = (const BYTE*) src;
+ BYTE* const ostart = (BYTE*)dst;
+ BYTE* const oend = ostart + dstSize;
+ BYTE* op = ostart;
+ size_t n;
+ BIT_CStream_t bitC;
+
+ /* init */
+ if (dstSize < 8) return 0; /* not enough space to compress */
+ { size_t const initErr = BIT_initCStream(&bitC, op, (size_t)(oend-op));
+ if (HUF_isError(initErr)) return 0; }
+
+ n = srcSize & ~3; /* join to mod 4 */
+ switch (srcSize & 3)
+ {
+ case 3 : HUF_encodeSymbol(&bitC, ip[n+ 2], CTable);
+ HUF_FLUSHBITS_2(&bitC);
+ /* fall-through */
+ case 2 : HUF_encodeSymbol(&bitC, ip[n+ 1], CTable);
+ HUF_FLUSHBITS_1(&bitC);
+ /* fall-through */
+ case 1 : HUF_encodeSymbol(&bitC, ip[n+ 0], CTable);
+ HUF_FLUSHBITS(&bitC);
+ /* fall-through */
+ case 0 : /* fall-through */
+ default: break;
+ }
+
+ for (; n>0; n-=4) { /* note : n&3==0 at this stage */
+ HUF_encodeSymbol(&bitC, ip[n- 1], CTable);
+ HUF_FLUSHBITS_1(&bitC);
+ HUF_encodeSymbol(&bitC, ip[n- 2], CTable);
+ HUF_FLUSHBITS_2(&bitC);
+ HUF_encodeSymbol(&bitC, ip[n- 3], CTable);
+ HUF_FLUSHBITS_1(&bitC);
+ HUF_encodeSymbol(&bitC, ip[n- 4], CTable);
+ HUF_FLUSHBITS(&bitC);
+ }
+
+ return BIT_closeCStream(&bitC);
+}
+
+#if DYNAMIC_BMI2
+
+static TARGET_ATTRIBUTE("bmi2") size_t
+HUF_compress1X_usingCTable_internal_bmi2(void* dst, size_t dstSize,
+ const void* src, size_t srcSize,
+ const HUF_CElt* CTable)
+{
+ return HUF_compress1X_usingCTable_internal_body(dst, dstSize, src, srcSize, CTable);
+}
+
+static size_t
+HUF_compress1X_usingCTable_internal_default(void* dst, size_t dstSize,
+ const void* src, size_t srcSize,
+ const HUF_CElt* CTable)
+{
+ return HUF_compress1X_usingCTable_internal_body(dst, dstSize, src, srcSize, CTable);
+}
+
+static size_t
+HUF_compress1X_usingCTable_internal(void* dst, size_t dstSize,
+ const void* src, size_t srcSize,
+ const HUF_CElt* CTable, const int bmi2)
+{
+ if (bmi2) {
+ return HUF_compress1X_usingCTable_internal_bmi2(dst, dstSize, src, srcSize, CTable);
+ }
+ return HUF_compress1X_usingCTable_internal_default(dst, dstSize, src, srcSize, CTable);
+}
+
+#else
+
+static size_t
+HUF_compress1X_usingCTable_internal(void* dst, size_t dstSize,
+ const void* src, size_t srcSize,
+ const HUF_CElt* CTable, const int bmi2)
+{
+ (void)bmi2;
+ return HUF_compress1X_usingCTable_internal_body(dst, dstSize, src, srcSize, CTable);
+}
+
+#endif
+
+size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable)
+{
+ return HUF_compress1X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, /* bmi2 */ 0);
+}
+
+
+static size_t
+HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize,
+ const void* src, size_t srcSize,
+ const HUF_CElt* CTable, int bmi2)
+{
+ size_t const segmentSize = (srcSize+3)/4; /* first 3 segments */
+ const BYTE* ip = (const BYTE*) src;
+ const BYTE* const iend = ip + srcSize;
+ BYTE* const ostart = (BYTE*) dst;
+ BYTE* const oend = ostart + dstSize;
+ BYTE* op = ostart;
+
+ if (dstSize < 6 + 1 + 1 + 1 + 8) return 0; /* minimum space to compress successfully */
+ if (srcSize < 12) return 0; /* no saving possible : too small input */
+ op += 6; /* jumpTable */
+
+ assert(op <= oend);
+ { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, bmi2) );
+ if (cSize==0) return 0;
+ assert(cSize <= 65535);
+ MEM_writeLE16(ostart, (U16)cSize);
+ op += cSize;
+ }
+
+ ip += segmentSize;
+ assert(op <= oend);
+ { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, bmi2) );
+ if (cSize==0) return 0;
+ assert(cSize <= 65535);
+ MEM_writeLE16(ostart+2, (U16)cSize);
+ op += cSize;
+ }
+
+ ip += segmentSize;
+ assert(op <= oend);
+ { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, bmi2) );
+ if (cSize==0) return 0;
+ assert(cSize <= 65535);
+ MEM_writeLE16(ostart+4, (U16)cSize);
+ op += cSize;
+ }
+
+ ip += segmentSize;
+ assert(op <= oend);
+ assert(ip <= iend);
+ { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, (size_t)(iend-ip), CTable, bmi2) );
+ if (cSize==0) return 0;
+ op += cSize;
+ }
+
+ return (size_t)(op-ostart);
+}
+
+size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable)
+{
+ return HUF_compress4X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, /* bmi2 */ 0);
+}
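+
+/* Layout note (derived from the code above, informal) : the 4-stream output
+ * begins with a 6-byte jump table holding the little-endian 16-bit
+ * compressed sizes of the first three streams; the fourth size is implied
+ * by the total. Each stream encodes (srcSize+3)/4 input bytes, the last
+ * one taking the remainder. */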
+
+typedef enum { HUF_singleStream, HUF_fourStreams } HUF_nbStreams_e;
+
+static size_t HUF_compressCTable_internal(
+ BYTE* const ostart, BYTE* op, BYTE* const oend,
+ const void* src, size_t srcSize,
+ HUF_nbStreams_e nbStreams, const HUF_CElt* CTable, const int bmi2)
+{
+ size_t const cSize = (nbStreams==HUF_singleStream) ?
+ HUF_compress1X_usingCTable_internal(op, (size_t)(oend - op), src, srcSize, CTable, bmi2) :
+ HUF_compress4X_usingCTable_internal(op, (size_t)(oend - op), src, srcSize, CTable, bmi2);
+ if (HUF_isError(cSize)) { return cSize; }
+    if (cSize==0) { return 0; }  /* not compressible */
+ op += cSize;
+ /* check compressibility */
+ assert(op >= ostart);
+ if ((size_t)(op-ostart) >= srcSize-1) { return 0; }
+ return (size_t)(op-ostart);
+}
+
+typedef struct {
+ unsigned count[HUF_SYMBOLVALUE_MAX + 1];
+ HUF_CElt CTable[HUF_SYMBOLVALUE_MAX + 1];
+ HUF_buildCTable_wksp_tables buildCTable_wksp;
+} HUF_compress_tables_t;
+
+/* HUF_compress_internal() :
+ * `workSpace` must be a table of at least HUF_WORKSPACE_SIZE_U32 unsigned */
+static size_t
+HUF_compress_internal (void* dst, size_t dstSize,
+ const void* src, size_t srcSize,
+ unsigned maxSymbolValue, unsigned huffLog,
+ HUF_nbStreams_e nbStreams,
+ void* workSpace, size_t wkspSize,
+ HUF_CElt* oldHufTable, HUF_repeat* repeat, int preferRepeat,
+ const int bmi2)
+{
+ HUF_compress_tables_t* const table = (HUF_compress_tables_t*)workSpace;
+ BYTE* const ostart = (BYTE*)dst;
+ BYTE* const oend = ostart + dstSize;
+ BYTE* op = ostart;
+
+ HUF_STATIC_ASSERT(sizeof(*table) <= HUF_WORKSPACE_SIZE);
+
+ /* checks & inits */
+ if (((size_t)workSpace & 3) != 0) return ERROR(GENERIC); /* must be aligned on 4-bytes boundaries */
+ if (wkspSize < HUF_WORKSPACE_SIZE) return ERROR(workSpace_tooSmall);
+ if (!srcSize) return 0; /* Uncompressed */
+ if (!dstSize) return 0; /* cannot fit anything within dst budget */
+ if (srcSize > HUF_BLOCKSIZE_MAX) return ERROR(srcSize_wrong); /* current block size limit */
+ if (huffLog > HUF_TABLELOG_MAX) return ERROR(tableLog_tooLarge);
+ if (maxSymbolValue > HUF_SYMBOLVALUE_MAX) return ERROR(maxSymbolValue_tooLarge);
+ if (!maxSymbolValue) maxSymbolValue = HUF_SYMBOLVALUE_MAX;
+ if (!huffLog) huffLog = HUF_TABLELOG_DEFAULT;
+
+ /* Heuristic : If old table is valid, use it for small inputs */
+ if (preferRepeat && repeat && *repeat == HUF_repeat_valid) {
+ return HUF_compressCTable_internal(ostart, op, oend,
+ src, srcSize,
+ nbStreams, oldHufTable, bmi2);
+ }
+
+ /* Scan input and build symbol stats */
+ { CHECK_V_F(largest, HIST_count_wksp (table->count, &maxSymbolValue, (const BYTE*)src, srcSize, workSpace, wkspSize) );
+ if (largest == srcSize) { *ostart = ((const BYTE*)src)[0]; return 1; } /* single symbol, rle */
+ if (largest <= (srcSize >> 7)+4) return 0; /* heuristic : probably not compressible enough */
+ }
+
+ /* Check validity of previous table */
+ if ( repeat
+ && *repeat == HUF_repeat_check
+ && !HUF_validateCTable(oldHufTable, table->count, maxSymbolValue)) {
+ *repeat = HUF_repeat_none;
+ }
+ /* Heuristic : use existing table for small inputs */
+ if (preferRepeat && repeat && *repeat != HUF_repeat_none) {
+ return HUF_compressCTable_internal(ostart, op, oend,
+ src, srcSize,
+ nbStreams, oldHufTable, bmi2);
+ }
+
+ /* Build Huffman Tree */
+ huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue);
+ { size_t const maxBits = HUF_buildCTable_wksp(table->CTable, table->count,
+ maxSymbolValue, huffLog,
+ &table->buildCTable_wksp, sizeof(table->buildCTable_wksp));
+ CHECK_F(maxBits);
+ huffLog = (U32)maxBits;
+ /* Zero unused symbols in CTable, so we can check it for validity */
+ memset(table->CTable + (maxSymbolValue + 1), 0,
+ sizeof(table->CTable) - ((maxSymbolValue + 1) * sizeof(HUF_CElt)));
+ }
+
+ /* Write table description header */
+ { CHECK_V_F(hSize, HUF_writeCTable (op, dstSize, table->CTable, maxSymbolValue, huffLog) );
+ /* Check if using previous huffman table is beneficial */
+ if (repeat && *repeat != HUF_repeat_none) {
+ size_t const oldSize = HUF_estimateCompressedSize(oldHufTable, table->count, maxSymbolValue);
+ size_t const newSize = HUF_estimateCompressedSize(table->CTable, table->count, maxSymbolValue);
+ if (oldSize <= hSize + newSize || hSize + 12 >= srcSize) {
+ return HUF_compressCTable_internal(ostart, op, oend,
+ src, srcSize,
+ nbStreams, oldHufTable, bmi2);
+ } }
+
+ /* Use the new huffman table */
+ if (hSize + 12ul >= srcSize) { return 0; }
+ op += hSize;
+ if (repeat) { *repeat = HUF_repeat_none; }
+ if (oldHufTable)
+ memcpy(oldHufTable, table->CTable, sizeof(table->CTable)); /* Save new table */
+ }
+ return HUF_compressCTable_internal(ostart, op, oend,
+ src, srcSize,
+ nbStreams, table->CTable, bmi2);
+}
+
+
+size_t HUF_compress1X_wksp (void* dst, size_t dstSize,
+ const void* src, size_t srcSize,
+ unsigned maxSymbolValue, unsigned huffLog,
+ void* workSpace, size_t wkspSize)
+{
+ return HUF_compress_internal(dst, dstSize, src, srcSize,
+ maxSymbolValue, huffLog, HUF_singleStream,
+ workSpace, wkspSize,
+ NULL, NULL, 0, 0 /*bmi2*/);
+}
+
+size_t HUF_compress1X_repeat (void* dst, size_t dstSize,
+ const void* src, size_t srcSize,
+ unsigned maxSymbolValue, unsigned huffLog,
+ void* workSpace, size_t wkspSize,
+ HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2)
+{
+ return HUF_compress_internal(dst, dstSize, src, srcSize,
+ maxSymbolValue, huffLog, HUF_singleStream,
+ workSpace, wkspSize, hufTable,
+ repeat, preferRepeat, bmi2);
+}
+
+size_t HUF_compress1X (void* dst, size_t dstSize,
+ const void* src, size_t srcSize,
+ unsigned maxSymbolValue, unsigned huffLog)
+{
+ unsigned workSpace[HUF_WORKSPACE_SIZE_U32];
+ return HUF_compress1X_wksp(dst, dstSize, src, srcSize, maxSymbolValue, huffLog, workSpace, sizeof(workSpace));
+}
+
+/* HUF_compress4X_wksp():
+ * compress input using 4 streams.
+ * provide workspace to generate compression tables */
+size_t HUF_compress4X_wksp (void* dst, size_t dstSize,
+ const void* src, size_t srcSize,
+ unsigned maxSymbolValue, unsigned huffLog,
+ void* workSpace, size_t wkspSize)
+{
+ return HUF_compress_internal(dst, dstSize, src, srcSize,
+ maxSymbolValue, huffLog, HUF_fourStreams,
+ workSpace, wkspSize,
+ NULL, NULL, 0, 0 /*bmi2*/);
+}
+
+/* HUF_compress4X_repeat():
+ * compress input using 4 streams.
+ * re-use an existing huffman compression table */
+size_t HUF_compress4X_repeat (void* dst, size_t dstSize,
+ const void* src, size_t srcSize,
+ unsigned maxSymbolValue, unsigned huffLog,
+ void* workSpace, size_t wkspSize,
+ HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2)
+{
+ return HUF_compress_internal(dst, dstSize, src, srcSize,
+ maxSymbolValue, huffLog, HUF_fourStreams,
+ workSpace, wkspSize,
+ hufTable, repeat, preferRepeat, bmi2);
+}
+
+size_t HUF_compress2 (void* dst, size_t dstSize,
+ const void* src, size_t srcSize,
+ unsigned maxSymbolValue, unsigned huffLog)
+{
+ unsigned workSpace[HUF_WORKSPACE_SIZE_U32];
+ return HUF_compress4X_wksp(dst, dstSize, src, srcSize, maxSymbolValue, huffLog, workSpace, sizeof(workSpace));
+}
+
+size_t HUF_compress (void* dst, size_t maxDstSize, const void* src, size_t srcSize)
+{
+ return HUF_compress2(dst, maxDstSize, src, srcSize, 255, HUF_TABLELOG_DEFAULT);
+}
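+
+/* Illustrative sketch (not compiled) : one-shot Huffman compression of a
+ * block, mirroring the FSE example earlier in this file. Return-value
+ * conventions match FSE_compress() : 0 = not compressible, 1 = single
+ * repeated symbol. The example function name is hypothetical. */
+#if 0
+static size_t example_huf_compress(void* dst, size_t dstCapacity,
+                                   const void* src, size_t srcSize)
+{
+    size_t const cSize = HUF_compress(dst, dstCapacity, src, srcSize);
+    if (HUF_isError(cSize)) return cSize;   /* e.g. srcSize_wrong if srcSize > HUF_BLOCKSIZE_MAX */
+    /* dstCapacity >= HUF_compressBound(srcSize) avoids "not compressible"
+     * results that are only due to lack of room in dst */
+    return cSize;
+}
+#endif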
+/**** ended inlining compress/huf_compress.c ****/
+/**** start inlining compress/zstd_compress_literals.c ****/
+/*
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+ /*-*************************************
+ * Dependencies
+ ***************************************/
+/**** start inlining zstd_compress_literals.h ****/
+/*
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTD_COMPRESS_LITERALS_H
+#define ZSTD_COMPRESS_LITERALS_H
+
+/**** start inlining zstd_compress_internal.h ****/
+/*
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+/* This header contains definitions
+ * that shall **only** be used by modules within lib/compress.
+ */
+
+#ifndef ZSTD_COMPRESS_H
+#define ZSTD_COMPRESS_H
+
+/*-*************************************
+* Dependencies
+***************************************/
+/**** skipping file: ../common/zstd_internal.h ****/
+/**** start inlining zstd_cwksp.h ****/
+/*
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTD_CWKSP_H
+#define ZSTD_CWKSP_H
+
+/*-*************************************
+* Dependencies
+***************************************/
+/**** skipping file: ../common/zstd_internal.h ****/
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/*-*************************************
+* Constants
+***************************************/
+
+/* Since the workspace is effectively its own little malloc implementation /
+ * arena, when we run under ASAN, we should similarly insert redzones between
+ * each internal element of the workspace, so ASAN will catch overruns that
+ * reach outside an object but that stay inside the workspace.
+ *
+ * This defines the size of that redzone.
+ */
+#ifndef ZSTD_CWKSP_ASAN_REDZONE_SIZE
+#define ZSTD_CWKSP_ASAN_REDZONE_SIZE 128
+#endif
+
+/*-*************************************
+* Structures
+***************************************/
+typedef enum {
+ ZSTD_cwksp_alloc_objects,
+ ZSTD_cwksp_alloc_buffers,
+ ZSTD_cwksp_alloc_aligned
+} ZSTD_cwksp_alloc_phase_e;
+
+/**
+ * Zstd fits all its internal datastructures into a single contiguous buffer,
+ * so that it only needs to perform a single OS allocation (or so that a buffer
+ * can be provided to it and it can perform no allocations at all). This buffer
+ * is called the workspace.
+ *
+ * Several optimizations complicate that process of allocating memory ranges
+ * from this workspace for each internal datastructure:
+ *
+ * - These different internal datastructures have different setup requirements:
+ *
+ * - The static objects need to be cleared once and can then be trivially
+ * reused for each compression.
+ *
+ * - Various buffers don't need to be initialized at all--they are always
+ * written into before they're read.
+ *
+ * - The matchstate tables have a unique requirement that they don't need
+ * their memory to be totally cleared, but they do need the memory to have
+ * some bound, i.e., a guarantee that all values in the memory they've been
+ * allocated are less than some maximum value (which is the starting value
+ * for the indices that they will then use for compression). When this
+ * guarantee is provided to them, they can use the memory without any setup
+ * work. When it can't, they have to clear the area.
+ *
+ * - These buffers also have different alignment requirements.
+ *
+ * - We would like to reuse the objects in the workspace for multiple
+ * compressions without having to perform any expensive reallocation or
+ * reinitialization work.
+ *
+ * - We would like to be able to efficiently reuse the workspace across
+ * multiple compressions **even when the compression parameters change** and
+ * we need to resize some of the objects (where possible).
+ *
+ * To attempt to manage this buffer, given these constraints, the ZSTD_cwksp
+ * abstraction was created. It works as follows:
+ *
+ * Workspace Layout:
+ *
+ * [ ... workspace ... ]
+ * [objects][tables ... ->] free space [<- ... aligned][<- ... buffers]
+ *
+ * The various objects that live in the workspace are divided into the
+ * following categories, and are allocated separately:
+ *
+ * - Static objects: this is optionally the enclosing ZSTD_CCtx or ZSTD_CDict,
+ * so that literally everything fits in a single buffer. Note: if present,
+ * this must be the first object in the workspace, since ZSTD_free{CCtx,
+ * CDict}() rely on a pointer comparison to see whether one or two frees are
+ * required.
+ *
+ * - Fixed size objects: these are fixed-size, fixed-count objects that are
+ * nonetheless "dynamically" allocated in the workspace so that we can
+ * control how they're initialized separately from the broader ZSTD_CCtx.
+ * Examples:
+ * - Entropy Workspace
+ * - 2 x ZSTD_compressedBlockState_t
+ * - CDict dictionary contents
+ *
+ * - Tables: these are any of several different datastructures (hash tables,
+ * chain tables, binary trees) that all respect a common format: they are
+ * uint32_t arrays, all of whose values are between 0 and (nextSrc - base).
+ * Their sizes depend on the cparams.
+ *
+ * - Aligned: these buffers are used for various purposes that require 4 byte
+ * alignment, but don't require any initialization before they're used.
+ *
+ * - Buffers: these buffers are used for various purposes that don't require
+ * any alignment or initialization before they're used. This means they can
+ * be moved around at no cost for a new compression.
+ *
+ * Allocating Memory:
+ *
+ * The various types of objects must be allocated in order, so they can be
+ * correctly packed into the workspace buffer. That order is:
+ *
+ * 1. Objects
+ * 2. Buffers
+ * 3. Aligned
+ * 4. Tables
+ *
+ * Attempts to reserve objects of different types out of order will fail.
+ */
+typedef struct {
+ void* workspace;
+ void* workspaceEnd;
+
+ void* objectEnd;
+ void* tableEnd;
+ void* tableValidEnd;
+ void* allocStart;
+
+ int allocFailed;
+ int workspaceOversizedDuration;
+ ZSTD_cwksp_alloc_phase_e phase;
+} ZSTD_cwksp;
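+
+/* Illustrative usage sketch (not part of the library), assuming the caller
+ * owns the backing buffer: reservations must follow the documented phase
+ * order (objects, then buffers, then aligned/tables), and sizes must respect
+ * the alignment asserted by each reserve function. All names below other
+ * than the ZSTD_cwksp API are hypothetical.
+ *
+ *   ZSTD_cwksp ws;
+ *   size_t const wkspSize = 1 << 20;
+ *   void* const mem = malloc(wkspSize);                          // caller-owned memory
+ *   ZSTD_cwksp_init(&ws, mem, wkspSize);
+ *   void* const obj   = ZSTD_cwksp_reserve_object(&ws, 64);      // multiple of sizeof(void*)
+ *   BYTE* const buf   = ZSTD_cwksp_reserve_buffer(&ws, 1000);    // no alignment requirement
+ *   void* const algn  = ZSTD_cwksp_reserve_aligned(&ws, 256);    // multiple of sizeof(U32)
+ *   void* const table = ZSTD_cwksp_reserve_table(&ws, 1024);     // multiple of sizeof(U32)
+ *   if (ZSTD_cwksp_reserve_failed(&ws)) { ... }                  // some reservation did not fit
+ *
+ *   // ZSTD_cwksp_clear(&ws) later invalidates buffers/aligned/tables for the
+ *   // next compression; objects stay valid. The caller still frees `mem`.
+ */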
+
+/*-*************************************
+* Functions
+***************************************/
+
+MEM_STATIC size_t ZSTD_cwksp_available_space(ZSTD_cwksp* ws);
+
+MEM_STATIC void ZSTD_cwksp_assert_internal_consistency(ZSTD_cwksp* ws) {
+ (void)ws;
+ assert(ws->workspace <= ws->objectEnd);
+ assert(ws->objectEnd <= ws->tableEnd);
+ assert(ws->objectEnd <= ws->tableValidEnd);
+ assert(ws->tableEnd <= ws->allocStart);
+ assert(ws->tableValidEnd <= ws->allocStart);
+ assert(ws->allocStart <= ws->workspaceEnd);
+}
+
+/**
+ * Align must be a power of 2.
+ */
+MEM_STATIC size_t ZSTD_cwksp_align(size_t size, size_t const align) {
+ size_t const mask = align - 1;
+ assert((align & mask) == 0);
+ return (size + mask) & ~mask;
+}
+
+/**
+ * Use this to determine how much space in the workspace we will consume to
+ * allocate this object. (Normally it should be exactly the size of the object,
+ * but under special conditions, like ASAN, where we pad each object, it might
+ * be larger.)
+ *
+ * Since tables aren't currently redzoned, you don't need to call through this
+ * to figure out how much space you need for the matchState tables. Everything
+ * else is though.
+ */
+MEM_STATIC size_t ZSTD_cwksp_alloc_size(size_t size) {
+#if defined (ADDRESS_SANITIZER) && !defined (ZSTD_ASAN_DONT_POISON_WORKSPACE)
+ return size + 2 * ZSTD_CWKSP_ASAN_REDZONE_SIZE;
+#else
+ return size;
+#endif
+}
+
+MEM_STATIC void ZSTD_cwksp_internal_advance_phase(
+ ZSTD_cwksp* ws, ZSTD_cwksp_alloc_phase_e phase) {
+ assert(phase >= ws->phase);
+ if (phase > ws->phase) {
+ if (ws->phase < ZSTD_cwksp_alloc_buffers &&
+ phase >= ZSTD_cwksp_alloc_buffers) {
+ ws->tableValidEnd = ws->objectEnd;
+ }
+ if (ws->phase < ZSTD_cwksp_alloc_aligned &&
+ phase >= ZSTD_cwksp_alloc_aligned) {
+ /* If unaligned allocations down from a too-large top have left us
+ * unaligned, we need to realign our alloc ptr. Technically, this
+ * can consume space that is unaccounted for in the neededSpace
+ * calculation. However, I believe this can only happen when the
+ * workspace is too large, and specifically when it is too large
+ * by a larger margin than the space that will be consumed. */
+ /* TODO: cleaner, compiler warning friendly way to do this??? */
+ ws->allocStart = (BYTE*)ws->allocStart - ((size_t)ws->allocStart & (sizeof(U32)-1));
+ if (ws->allocStart < ws->tableValidEnd) {
+ ws->tableValidEnd = ws->allocStart;
+ }
+ }
+ ws->phase = phase;
+ }
+}
+
+/**
+ * Returns whether this object/buffer/etc was allocated in this workspace.
+ */
+MEM_STATIC int ZSTD_cwksp_owns_buffer(const ZSTD_cwksp* ws, const void* ptr) {
+ return (ptr != NULL) && (ws->workspace <= ptr) && (ptr <= ws->workspaceEnd);
+}
+
+/**
+ * Internal function. Do not use directly.
+ */
+MEM_STATIC void* ZSTD_cwksp_reserve_internal(
+ ZSTD_cwksp* ws, size_t bytes, ZSTD_cwksp_alloc_phase_e phase) {
+ void* alloc;
+ void* bottom = ws->tableEnd;
+ ZSTD_cwksp_internal_advance_phase(ws, phase);
+ alloc = (BYTE *)ws->allocStart - bytes;
+
+#if defined (ADDRESS_SANITIZER) && !defined (ZSTD_ASAN_DONT_POISON_WORKSPACE)
+ /* over-reserve space */
+ alloc = (BYTE *)alloc - 2 * ZSTD_CWKSP_ASAN_REDZONE_SIZE;
+#endif
+
+ DEBUGLOG(5, "cwksp: reserving %p %zd bytes, %zd bytes remaining",
+ alloc, bytes, ZSTD_cwksp_available_space(ws) - bytes);
+ ZSTD_cwksp_assert_internal_consistency(ws);
+ assert(alloc >= bottom);
+ if (alloc < bottom) {
+ DEBUGLOG(4, "cwksp: alloc failed!");
+ ws->allocFailed = 1;
+ return NULL;
+ }
+ if (alloc < ws->tableValidEnd) {
+ ws->tableValidEnd = alloc;
+ }
+ ws->allocStart = alloc;
+
+#if defined (ADDRESS_SANITIZER) && !defined (ZSTD_ASAN_DONT_POISON_WORKSPACE)
+ /* Move alloc so there's ZSTD_CWKSP_ASAN_REDZONE_SIZE unused space on
+ * either side. */
+ alloc = (BYTE *)alloc + ZSTD_CWKSP_ASAN_REDZONE_SIZE;
+ __asan_unpoison_memory_region(alloc, bytes);
+#endif
+
+ return alloc;
+}
+
+/**
+ * Reserves and returns unaligned memory.
+ */
+MEM_STATIC BYTE* ZSTD_cwksp_reserve_buffer(ZSTD_cwksp* ws, size_t bytes) {
+ return (BYTE*)ZSTD_cwksp_reserve_internal(ws, bytes, ZSTD_cwksp_alloc_buffers);
+}
+
+/**
+ * Reserves and returns memory sized and aligned on sizeof(unsigned).
+ */
+MEM_STATIC void* ZSTD_cwksp_reserve_aligned(ZSTD_cwksp* ws, size_t bytes) {
+ assert((bytes & (sizeof(U32)-1)) == 0);
+ return ZSTD_cwksp_reserve_internal(ws, ZSTD_cwksp_align(bytes, sizeof(U32)), ZSTD_cwksp_alloc_aligned);
+}
+
+/**
+ * Aligned on sizeof(unsigned). These buffers have the special property that
+ * their values remain constrained, allowing us to re-use them without
+ * memset()-ing them.
+ */
+MEM_STATIC void* ZSTD_cwksp_reserve_table(ZSTD_cwksp* ws, size_t bytes) {
+ const ZSTD_cwksp_alloc_phase_e phase = ZSTD_cwksp_alloc_aligned;
+ void* alloc = ws->tableEnd;
+ void* end = (BYTE *)alloc + bytes;
+ void* top = ws->allocStart;
+
+ DEBUGLOG(5, "cwksp: reserving %p table %zd bytes, %zd bytes remaining",
+ alloc, bytes, ZSTD_cwksp_available_space(ws) - bytes);
+ assert((bytes & (sizeof(U32)-1)) == 0);
+ ZSTD_cwksp_internal_advance_phase(ws, phase);
+ ZSTD_cwksp_assert_internal_consistency(ws);
+ assert(end <= top);
+ if (end > top) {
+ DEBUGLOG(4, "cwksp: table alloc failed!");
+ ws->allocFailed = 1;
+ return NULL;
+ }
+ ws->tableEnd = end;
+
+#if defined (ADDRESS_SANITIZER) && !defined (ZSTD_ASAN_DONT_POISON_WORKSPACE)
+ __asan_unpoison_memory_region(alloc, bytes);
+#endif
+
+ return alloc;
+}
+
+/**
+ * Aligned on sizeof(void*).
+ */
+MEM_STATIC void* ZSTD_cwksp_reserve_object(ZSTD_cwksp* ws, size_t bytes) {
+ size_t roundedBytes = ZSTD_cwksp_align(bytes, sizeof(void*));
+ void* alloc = ws->objectEnd;
+ void* end = (BYTE*)alloc + roundedBytes;
+
+#if defined (ADDRESS_SANITIZER) && !defined (ZSTD_ASAN_DONT_POISON_WORKSPACE)
+ /* over-reserve space */
+ end = (BYTE *)end + 2 * ZSTD_CWKSP_ASAN_REDZONE_SIZE;
+#endif
+
+ DEBUGLOG(5,
+ "cwksp: reserving %p object %zd bytes (rounded to %zd), %zd bytes remaining",
+ alloc, bytes, roundedBytes, ZSTD_cwksp_available_space(ws) - roundedBytes);
+ assert(((size_t)alloc & (sizeof(void*)-1)) == 0);
+ assert((bytes & (sizeof(void*)-1)) == 0);
+ ZSTD_cwksp_assert_internal_consistency(ws);
+ /* we must be in the first phase, no advance is possible */
+ if (ws->phase != ZSTD_cwksp_alloc_objects || end > ws->workspaceEnd) {
+ DEBUGLOG(4, "cwksp: object alloc failed!");
+ ws->allocFailed = 1;
+ return NULL;
+ }
+ ws->objectEnd = end;
+ ws->tableEnd = end;
+ ws->tableValidEnd = end;
+
+#if defined (ADDRESS_SANITIZER) && !defined (ZSTD_ASAN_DONT_POISON_WORKSPACE)
+ /* Move alloc so there's ZSTD_CWKSP_ASAN_REDZONE_SIZE unused space on
+ * either side. */
+ alloc = (BYTE *)alloc + ZSTD_CWKSP_ASAN_REDZONE_SIZE;
+ __asan_unpoison_memory_region(alloc, bytes);
+#endif
+
+ return alloc;
+}
+
+MEM_STATIC void ZSTD_cwksp_mark_tables_dirty(ZSTD_cwksp* ws) {
+ DEBUGLOG(4, "cwksp: ZSTD_cwksp_mark_tables_dirty");
+
+#if defined (MEMORY_SANITIZER) && !defined (ZSTD_MSAN_DONT_POISON_WORKSPACE)
+ /* To validate that the table re-use logic is sound, and that we don't
+ * access table space that we haven't cleaned, we re-"poison" the table
+ * space every time we mark it dirty. */
+ {
+ size_t size = (BYTE*)ws->tableValidEnd - (BYTE*)ws->objectEnd;
+ assert(__msan_test_shadow(ws->objectEnd, size) == -1);
+ __msan_poison(ws->objectEnd, size);
+ }
+#endif
+
+ assert(ws->tableValidEnd >= ws->objectEnd);
+ assert(ws->tableValidEnd <= ws->allocStart);
+ ws->tableValidEnd = ws->objectEnd;
+ ZSTD_cwksp_assert_internal_consistency(ws);
+}
+
+MEM_STATIC void ZSTD_cwksp_mark_tables_clean(ZSTD_cwksp* ws) {
+ DEBUGLOG(4, "cwksp: ZSTD_cwksp_mark_tables_clean");
+ assert(ws->tableValidEnd >= ws->objectEnd);
+ assert(ws->tableValidEnd <= ws->allocStart);
+ if (ws->tableValidEnd < ws->tableEnd) {
+ ws->tableValidEnd = ws->tableEnd;
+ }
+ ZSTD_cwksp_assert_internal_consistency(ws);
+}
+
+/**
+ * Zero the part of the allocated tables not already marked clean.
+ */
+MEM_STATIC void ZSTD_cwksp_clean_tables(ZSTD_cwksp* ws) {
+ DEBUGLOG(4, "cwksp: ZSTD_cwksp_clean_tables");
+ assert(ws->tableValidEnd >= ws->objectEnd);
+ assert(ws->tableValidEnd <= ws->allocStart);
+ if (ws->tableValidEnd < ws->tableEnd) {
+ memset(ws->tableValidEnd, 0, (BYTE*)ws->tableEnd - (BYTE*)ws->tableValidEnd);
+ }
+ ZSTD_cwksp_mark_tables_clean(ws);
+}
+
+/**
+ * Invalidates table allocations.
+ * All other allocations remain valid.
+ */
+MEM_STATIC void ZSTD_cwksp_clear_tables(ZSTD_cwksp* ws) {
+ DEBUGLOG(4, "cwksp: clearing tables!");
+
+#if defined (ADDRESS_SANITIZER) && !defined (ZSTD_ASAN_DONT_POISON_WORKSPACE)
+ {
+ size_t size = (BYTE*)ws->tableValidEnd - (BYTE*)ws->objectEnd;
+ __asan_poison_memory_region(ws->objectEnd, size);
+ }
+#endif
+
+ ws->tableEnd = ws->objectEnd;
+ ZSTD_cwksp_assert_internal_consistency(ws);
+}
+
+/**
+ * Invalidates all buffer, aligned, and table allocations.
+ * Object allocations remain valid.
+ */
+MEM_STATIC void ZSTD_cwksp_clear(ZSTD_cwksp* ws) {
+ DEBUGLOG(4, "cwksp: clearing!");
+
+#if defined (MEMORY_SANITIZER) && !defined (ZSTD_MSAN_DONT_POISON_WORKSPACE)
+ /* To validate that the context re-use logic is sound, and that we don't
+ * access stuff that this compression hasn't initialized, we re-"poison"
+ * the workspace (or at least the non-static, non-table parts of it)
+ * every time we start a new compression. */
+ {
+ size_t size = (BYTE*)ws->workspaceEnd - (BYTE*)ws->tableValidEnd;
+ __msan_poison(ws->tableValidEnd, size);
+ }
+#endif
+
+#if defined (ADDRESS_SANITIZER) && !defined (ZSTD_ASAN_DONT_POISON_WORKSPACE)
+ {
+ size_t size = (BYTE*)ws->workspaceEnd - (BYTE*)ws->objectEnd;
+ __asan_poison_memory_region(ws->objectEnd, size);
+ }
+#endif
+
+ ws->tableEnd = ws->objectEnd;
+ ws->allocStart = ws->workspaceEnd;
+ ws->allocFailed = 0;
+ if (ws->phase > ZSTD_cwksp_alloc_buffers) {
+ ws->phase = ZSTD_cwksp_alloc_buffers;
+ }
+ ZSTD_cwksp_assert_internal_consistency(ws);
+}
+
+/**
+ * The provided workspace takes ownership of the buffer [start, start+size).
+ * Any existing values in the workspace are ignored (the previously managed
+ * buffer, if present, must be separately freed).
+ */
+MEM_STATIC void ZSTD_cwksp_init(ZSTD_cwksp* ws, void* start, size_t size) {
+ DEBUGLOG(4, "cwksp: init'ing workspace with %zd bytes", size);
+ assert(((size_t)start & (sizeof(void*)-1)) == 0); /* ensure correct alignment */
+ ws->workspace = start;
+ ws->workspaceEnd = (BYTE*)start + size;
+ ws->objectEnd = ws->workspace;
+ ws->tableValidEnd = ws->objectEnd;
+ ws->phase = ZSTD_cwksp_alloc_objects;
+ ZSTD_cwksp_clear(ws);
+ ws->workspaceOversizedDuration = 0;
+ ZSTD_cwksp_assert_internal_consistency(ws);
+}
+
+MEM_STATIC size_t ZSTD_cwksp_create(ZSTD_cwksp* ws, size_t size, ZSTD_customMem customMem) {
+ void* workspace = ZSTD_malloc(size, customMem);
+ DEBUGLOG(4, "cwksp: creating new workspace with %zd bytes", size);
+ RETURN_ERROR_IF(workspace == NULL, memory_allocation, "NULL pointer!");
+ ZSTD_cwksp_init(ws, workspace, size);
+ return 0;
+}
+
+MEM_STATIC void ZSTD_cwksp_free(ZSTD_cwksp* ws, ZSTD_customMem customMem) {
+ void *ptr = ws->workspace;
+ DEBUGLOG(4, "cwksp: freeing workspace");
+ memset(ws, 0, sizeof(ZSTD_cwksp));
+ ZSTD_free(ptr, customMem);
+}
+
+/**
+ * Moves the management of a workspace from one cwksp to another. The src cwksp
+ * is left in an invalid state (src must be re-init()'ed before it's used again).
+ */
+MEM_STATIC void ZSTD_cwksp_move(ZSTD_cwksp* dst, ZSTD_cwksp* src) {
+ *dst = *src;
+ memset(src, 0, sizeof(ZSTD_cwksp));
+}
+
+MEM_STATIC size_t ZSTD_cwksp_sizeof(const ZSTD_cwksp* ws) {
+ return (size_t)((BYTE*)ws->workspaceEnd - (BYTE*)ws->workspace);
+}
+
+MEM_STATIC int ZSTD_cwksp_reserve_failed(const ZSTD_cwksp* ws) {
+ return ws->allocFailed;
+}
+
+/*-*************************************
+* Functions Checking Free Space
+***************************************/
+
+MEM_STATIC size_t ZSTD_cwksp_available_space(ZSTD_cwksp* ws) {
+ return (size_t)((BYTE*)ws->allocStart - (BYTE*)ws->tableEnd);
+}
+
+MEM_STATIC int ZSTD_cwksp_check_available(ZSTD_cwksp* ws, size_t additionalNeededSpace) {
+ return ZSTD_cwksp_available_space(ws) >= additionalNeededSpace;
+}
+
+MEM_STATIC int ZSTD_cwksp_check_too_large(ZSTD_cwksp* ws, size_t additionalNeededSpace) {
+ return ZSTD_cwksp_check_available(
+ ws, additionalNeededSpace * ZSTD_WORKSPACETOOLARGE_FACTOR);
+}
+
+MEM_STATIC int ZSTD_cwksp_check_wasteful(ZSTD_cwksp* ws, size_t additionalNeededSpace) {
+ return ZSTD_cwksp_check_too_large(ws, additionalNeededSpace)
+ && ws->workspaceOversizedDuration > ZSTD_WORKSPACETOOLARGE_MAXDURATION;
+}
+
+MEM_STATIC void ZSTD_cwksp_bump_oversized_duration(
+ ZSTD_cwksp* ws, size_t additionalNeededSpace) {
+ if (ZSTD_cwksp_check_too_large(ws, additionalNeededSpace)) {
+ ws->workspaceOversizedDuration++;
+ } else {
+ ws->workspaceOversizedDuration = 0;
+ }
+}
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* ZSTD_CWKSP_H */
+/**** ended inlining zstd_cwksp.h ****/
+#ifdef ZSTD_MULTITHREAD
+/**** start inlining zstdmt_compress.h ****/
+/*
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+ #ifndef ZSTDMT_COMPRESS_H
+ #define ZSTDMT_COMPRESS_H
+
+ #if defined (__cplusplus)
+ extern "C" {
+ #endif
+
+
+/* Note : This is an internal API.
+ * These APIs used to be exposed with ZSTDLIB_API,
+ * because they used to be the only way to invoke MT compression.
+ * Now, it's recommended to use ZSTD_compress2 and ZSTD_compressStream2()
+ * instead.
+ *
+ * If you depend on these APIs and can't switch, then define
+ * ZSTD_LEGACY_MULTITHREADED_API when making the dynamic library.
+ * However, we may completely remove these functions in a future
+ * release, so please switch soon.
+ *
+ * This API requires ZSTD_MULTITHREAD to be defined during compilation,
+ * otherwise ZSTDMT_createCCtx*() will fail.
+ */
+
+#ifdef ZSTD_LEGACY_MULTITHREADED_API
+# define ZSTDMT_API ZSTDLIB_API
+#else
+# define ZSTDMT_API
+#endif
+
+/* === Dependencies === */
+#include <stddef.h> /* size_t */
+#define ZSTD_STATIC_LINKING_ONLY /* ZSTD_parameters */
+/**** skipping file: ../zstd.h ****/
+
+
+/* === Constants === */
+#ifndef ZSTDMT_NBWORKERS_MAX
+# define ZSTDMT_NBWORKERS_MAX 200
+#endif
+#ifndef ZSTDMT_JOBSIZE_MIN
+# define ZSTDMT_JOBSIZE_MIN (1 MB)
+#endif
+#define ZSTDMT_JOBLOG_MAX (MEM_32bits() ? 29 : 30)
+#define ZSTDMT_JOBSIZE_MAX (MEM_32bits() ? (512 MB) : (1024 MB))
+
+
+/* === Memory management === */
+typedef struct ZSTDMT_CCtx_s ZSTDMT_CCtx;
+/* Requires ZSTD_MULTITHREAD to be defined during compilation, otherwise it will return NULL. */
+ZSTDMT_API ZSTDMT_CCtx* ZSTDMT_createCCtx(unsigned nbWorkers);
+/* Requires ZSTD_MULTITHREAD to be defined during compilation, otherwise it will return NULL. */
+ZSTDMT_API ZSTDMT_CCtx* ZSTDMT_createCCtx_advanced(unsigned nbWorkers,
+ ZSTD_customMem cMem);
+ZSTDMT_API size_t ZSTDMT_freeCCtx(ZSTDMT_CCtx* mtctx);
+
+ZSTDMT_API size_t ZSTDMT_sizeof_CCtx(ZSTDMT_CCtx* mtctx);
+
+
+/* === Simple one-pass compression function === */
+
+ZSTDMT_API size_t ZSTDMT_compressCCtx(ZSTDMT_CCtx* mtctx,
+ void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize,
+ int compressionLevel);
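+
+/* Illustrative usage sketch (not part of the library); as noted above, this
+ * internal API is superseded by ZSTD_compress2(). Buffer names are
+ * hypothetical; ZSTD_compressBound() comes from the public zstd.h:
+ *
+ *   ZSTDMT_CCtx* const mtctx = ZSTDMT_createCCtx(4);        // 4 worker threads
+ *   if (mtctx != NULL) {                                    // NULL when !ZSTD_MULTITHREAD
+ *       size_t const cSize = ZSTDMT_compressCCtx(mtctx,
+ *                                  dstBuf, ZSTD_compressBound(srcSize),
+ *                                  srcBuf, srcSize,
+ *                                  3);                      // compression level 3
+ *       ZSTDMT_freeCCtx(mtctx);                             // cSize may be an error code
+ *   }
+ */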
+
+
+
+/* === Streaming functions === */
+
+ZSTDMT_API size_t ZSTDMT_initCStream(ZSTDMT_CCtx* mtctx, int compressionLevel);
+ZSTDMT_API size_t ZSTDMT_resetCStream(ZSTDMT_CCtx* mtctx, unsigned long long pledgedSrcSize); /**< if srcSize is not known at reset time, use ZSTD_CONTENTSIZE_UNKNOWN. Note: for compatibility with older programs, 0 means the same as ZSTD_CONTENTSIZE_UNKNOWN, but it will change in the future to mean "empty" */
+
+ZSTDMT_API size_t ZSTDMT_nextInputSizeHint(const ZSTDMT_CCtx* mtctx);
+ZSTDMT_API size_t ZSTDMT_compressStream(ZSTDMT_CCtx* mtctx, ZSTD_outBuffer* output, ZSTD_inBuffer* input);
+
+ZSTDMT_API size_t ZSTDMT_flushStream(ZSTDMT_CCtx* mtctx, ZSTD_outBuffer* output); /**< @return : 0 == all flushed; >0 : still some data to be flushed; or an error code (ZSTD_isError()) */
+ZSTDMT_API size_t ZSTDMT_endStream(ZSTDMT_CCtx* mtctx, ZSTD_outBuffer* output); /**< @return : 0 == all flushed; >0 : still some data to be flushed; or an error code (ZSTD_isError()) */
+
+
+/* === Advanced functions and parameters === */
+
+ZSTDMT_API size_t ZSTDMT_compress_advanced(ZSTDMT_CCtx* mtctx,
+ void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize,
+ const ZSTD_CDict* cdict,
+ ZSTD_parameters params,
+ int overlapLog);
+
+ZSTDMT_API size_t ZSTDMT_initCStream_advanced(ZSTDMT_CCtx* mtctx,
+ const void* dict, size_t dictSize, /* dict can be released after init, a local copy is preserved within zcs */
+ ZSTD_parameters params,
+ unsigned long long pledgedSrcSize); /* pledgedSrcSize is optional and can be zero == unknown */
+
+ZSTDMT_API size_t ZSTDMT_initCStream_usingCDict(ZSTDMT_CCtx* mtctx,
+ const ZSTD_CDict* cdict,
+ ZSTD_frameParameters fparams,
+ unsigned long long pledgedSrcSize); /* note : zero means empty */
+
+/* ZSTDMT_parameter :
+ * List of parameters that can be set using ZSTDMT_setMTCtxParameter() */
+typedef enum {
+ ZSTDMT_p_jobSize, /* Each job is compressed in parallel. By default, this value is dynamically determined depending on compression parameters. Can be set explicitly here. */
+ ZSTDMT_p_overlapLog, /* Each job may reload a part of previous job to enhance compression ratio; 0 == no overlap, 6(default) == use 1/8th of window, >=9 == use full window. This is a "sticky" parameter : its value will be re-used on next compression job */
+ ZSTDMT_p_rsyncable /* Enables rsyncable mode. */
+} ZSTDMT_parameter;
+
+/* ZSTDMT_setMTCtxParameter() :
+ * allow setting individual parameters, one at a time, among a list of enums defined in ZSTDMT_parameter.
+ * The function must be called typically after ZSTD_createCCtx() but __before ZSTDMT_init*() !__
+ * Parameters not explicitly reset by ZSTDMT_init*() remain the same in consecutive compression sessions.
+ * @return : 0, or an error code (which can be tested using ZSTD_isError()) */
+ZSTDMT_API size_t ZSTDMT_setMTCtxParameter(ZSTDMT_CCtx* mtctx, ZSTDMT_parameter parameter, int value);
+
+/* ZSTDMT_getMTCtxParameter() :
+ * Query the ZSTDMT_CCtx for a parameter value.
+ * @return : 0, or an error code (which can be tested using ZSTD_isError()) */
+ZSTDMT_API size_t ZSTDMT_getMTCtxParameter(ZSTDMT_CCtx* mtctx, ZSTDMT_parameter parameter, int* value);
+
+
+/*! ZSTDMT_compressStream_generic() :
+ * Combines ZSTDMT_compressStream() with optional ZSTDMT_flushStream() or ZSTDMT_endStream()
+ * depending on flush directive.
+ * @return : minimum amount of data still to be flushed
+ * 0 if fully flushed
+ * or an error code
+ * note : needs to be initialized using any ZSTDMT_initCStream*() variant */
+ZSTDMT_API size_t ZSTDMT_compressStream_generic(ZSTDMT_CCtx* mtctx,
+ ZSTD_outBuffer* output,
+ ZSTD_inBuffer* input,
+ ZSTD_EndDirective endOp);
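+
+/* Illustrative streaming sketch (not part of the library): compress a whole
+ * buffer in one session by driving ZSTDMT_compressStream_generic() with
+ * ZSTD_e_end until nothing remains to be flushed. Buffer names are
+ * hypothetical; ZSTD_isError(), ZSTD_e_end and the buffer structs come from
+ * the public zstd.h, and dstCapacity is assumed large enough (e.g.
+ * ZSTD_compressBound(srcSize)):
+ *
+ *   ZSTD_inBuffer  input  = { srcBuf, srcSize, 0 };
+ *   ZSTD_outBuffer output = { dstBuf, dstCapacity, 0 };
+ *   size_t remaining = ZSTDMT_initCStream(mtctx, 3);
+ *   if (!ZSTD_isError(remaining)) {
+ *       do {
+ *           remaining = ZSTDMT_compressStream_generic(mtctx, &output, &input, ZSTD_e_end);
+ *       } while (remaining != 0 && !ZSTD_isError(remaining));
+ *   }
+ *   // on success, output.pos holds the size of the compressed frame
+ */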
+
+
+/* ========================================================
+ * === Private interface, for use by ZSTD_compress.c ===
+ * === Not exposed in libzstd. Never invoke directly ===
+ * ======================================================== */
+
+ /*! ZSTDMT_toFlushNow()
+ * Tell how many bytes are ready to be flushed immediately.
+ * Probe the oldest active job (not yet entirely flushed) and check its output buffer.
+ * If it returns 0, either there is no active job,
+ * or the oldest job is still active but everything it has produced has already been flushed,
+ * in which case flushing is limited by the speed of the oldest job. */
+size_t ZSTDMT_toFlushNow(ZSTDMT_CCtx* mtctx);
+
+/*! ZSTDMT_CCtxParam_setMTCtxParameter()
+ * like ZSTDMT_setMTCtxParameter(), but into a ZSTD_CCtx_Params */
+size_t ZSTDMT_CCtxParam_setMTCtxParameter(ZSTD_CCtx_params* params, ZSTDMT_parameter parameter, int value);
+
+/*! ZSTDMT_CCtxParam_setNbWorkers()
+ * Set nbWorkers, and clamp it.
+ * Also reset jobSize and overlapLog */
+size_t ZSTDMT_CCtxParam_setNbWorkers(ZSTD_CCtx_params* params, unsigned nbWorkers);
+
+/*! ZSTDMT_updateCParams_whileCompressing() :
+ * Updates only a selected set of compression parameters, to remain compatible with current frame.
+ * New parameters will be applied to next compression job. */
+void ZSTDMT_updateCParams_whileCompressing(ZSTDMT_CCtx* mtctx, const ZSTD_CCtx_params* cctxParams);
+
+/*! ZSTDMT_getFrameProgression():
+ * tells how much data has been consumed (input) and produced (output) for current frame.
+ * able to count progression inside worker threads.
+ */
+ZSTD_frameProgression ZSTDMT_getFrameProgression(ZSTDMT_CCtx* mtctx);
+
+
+/*! ZSTDMT_initCStream_internal() :
+ * Private use only. Init streaming operation.
+ * expects params to be valid.
+ * must receive dict, or cdict, or none, but not both.
+ * @return : 0, or an error code */
+size_t ZSTDMT_initCStream_internal(ZSTDMT_CCtx* zcs,
+ const void* dict, size_t dictSize, ZSTD_dictContentType_e dictContentType,
+ const ZSTD_CDict* cdict,
+ ZSTD_CCtx_params params, unsigned long long pledgedSrcSize);
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* ZSTDMT_COMPRESS_H */
+/**** ended inlining zstdmt_compress.h ****/
+#endif
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+
+/*-*************************************
+* Constants
+***************************************/
+#define kSearchStrength 8
+#define HASH_READ_SIZE 8
+#define ZSTD_DUBT_UNSORTED_MARK 1 /* For btlazy2 strategy, index ZSTD_DUBT_UNSORTED_MARK==1 means "unsorted".
+ It could be confused for a real successor at index "1", if sorted as larger than its predecessor.
+ It's not a big deal though : candidate will just be sorted again.
+ Additionally, candidate position 1 will be lost.
+ But candidate 1 cannot hide a large tree of candidates, so it's a minimal loss.
+ The benefit is that ZSTD_DUBT_UNSORTED_MARK cannot be mishandled after table re-use with a different strategy.
+ This constant is required by ZSTD_compressBlock_btlazy2() and ZSTD_reduceTable_internal() */
+
+
+/*-*************************************
+* Context memory management
+***************************************/
+typedef enum { ZSTDcs_created=0, ZSTDcs_init, ZSTDcs_ongoing, ZSTDcs_ending } ZSTD_compressionStage_e;
+typedef enum { zcss_init=0, zcss_load, zcss_flush } ZSTD_cStreamStage;
+
+typedef struct ZSTD_prefixDict_s {
+ const void* dict;
+ size_t dictSize;
+ ZSTD_dictContentType_e dictContentType;
+} ZSTD_prefixDict;
+
+typedef struct {
+ void* dictBuffer;
+ void const* dict;
+ size_t dictSize;
+ ZSTD_dictContentType_e dictContentType;
+ ZSTD_CDict* cdict;
+} ZSTD_localDict;
+
+typedef struct {
+ U32 CTable[HUF_CTABLE_SIZE_U32(255)];
+ HUF_repeat repeatMode;
+} ZSTD_hufCTables_t;
+
+typedef struct {
+ FSE_CTable offcodeCTable[FSE_CTABLE_SIZE_U32(OffFSELog, MaxOff)];
+ FSE_CTable matchlengthCTable[FSE_CTABLE_SIZE_U32(MLFSELog, MaxML)];
+ FSE_CTable litlengthCTable[FSE_CTABLE_SIZE_U32(LLFSELog, MaxLL)];
+ FSE_repeat offcode_repeatMode;
+ FSE_repeat matchlength_repeatMode;
+ FSE_repeat litlength_repeatMode;
+} ZSTD_fseCTables_t;
+
+typedef struct {
+ ZSTD_hufCTables_t huf;
+ ZSTD_fseCTables_t fse;
+} ZSTD_entropyCTables_t;
+
+typedef struct {
+ U32 off;
+ U32 len;
+} ZSTD_match_t;
+
+typedef struct {
+ int price;
+ U32 off;
+ U32 mlen;
+ U32 litlen;
+ U32 rep[ZSTD_REP_NUM];
+} ZSTD_optimal_t;
+
+typedef enum { zop_dynamic=0, zop_predef } ZSTD_OptPrice_e;
+
+typedef struct {
+ /* All tables are allocated inside cctx->workspace by ZSTD_resetCCtx_internal() */
+ unsigned* litFreq; /* table of literals statistics, of size 256 */
+ unsigned* litLengthFreq; /* table of litLength statistics, of size (MaxLL+1) */
+ unsigned* matchLengthFreq; /* table of matchLength statistics, of size (MaxML+1) */
+ unsigned* offCodeFreq; /* table of offCode statistics, of size (MaxOff+1) */
+ ZSTD_match_t* matchTable; /* list of found matches, of size ZSTD_OPT_NUM+1 */
+ ZSTD_optimal_t* priceTable; /* All positions tracked by optimal parser, of size ZSTD_OPT_NUM+1 */
+
+ U32 litSum; /* nb of literals */
+ U32 litLengthSum; /* nb of litLength codes */
+ U32 matchLengthSum; /* nb of matchLength codes */
+ U32 offCodeSum; /* nb of offset codes */
+ U32 litSumBasePrice; /* to compare to log2(litfreq) */
+ U32 litLengthSumBasePrice; /* to compare to log2(llfreq) */
+ U32 matchLengthSumBasePrice;/* to compare to log2(mlfreq) */
+ U32 offCodeSumBasePrice; /* to compare to log2(offreq) */
+ ZSTD_OptPrice_e priceType; /* prices can be determined dynamically, or follow a pre-defined cost structure */
+ const ZSTD_entropyCTables_t* symbolCosts; /* pre-calculated dictionary statistics */
+ ZSTD_literalCompressionMode_e literalCompressionMode;
+} optState_t;
+
+typedef struct {
+ ZSTD_entropyCTables_t entropy;
+ U32 rep[ZSTD_REP_NUM];
+} ZSTD_compressedBlockState_t;
+
+typedef struct {
+ BYTE const* nextSrc; /* next block here to continue on current prefix */
+ BYTE const* base; /* All regular indexes relative to this position */
+ BYTE const* dictBase; /* extDict indexes relative to this position */
+ U32 dictLimit; /* below that point, need extDict */
+ U32 lowLimit; /* below that point, no more valid data */
+} ZSTD_window_t;
+
+typedef struct ZSTD_matchState_t ZSTD_matchState_t;
+struct ZSTD_matchState_t {
+ ZSTD_window_t window; /* State for window round buffer management */
+ U32 loadedDictEnd; /* index of end of dictionary, within context's referential.
+ * When loadedDictEnd != 0, a dictionary is in use, and still valid.
+ * This relies on a mechanism to set loadedDictEnd=0 when dictionary is no longer within distance.
+ * Such mechanism is provided within ZSTD_window_enforceMaxDist() and ZSTD_checkDictValidity().
+ * When dict referential is copied into active context (i.e. not attached),
+ * loadedDictEnd == dictSize, since referential starts from zero.
+ */
+ U32 nextToUpdate; /* index from which to continue table update */
+ U32 hashLog3; /* dispatch table for matches of len==3 : larger == faster, more memory */
+ U32* hashTable;
+ U32* hashTable3;
+ U32* chainTable;
+ optState_t opt; /* optimal parser state */
+ const ZSTD_matchState_t* dictMatchState;
+ ZSTD_compressionParameters cParams;
+};
+
+typedef struct {
+ ZSTD_compressedBlockState_t* prevCBlock;
+ ZSTD_compressedBlockState_t* nextCBlock;
+ ZSTD_matchState_t matchState;
+} ZSTD_blockState_t;
+
+typedef struct {
+ U32 offset;
+ U32 checksum;
+} ldmEntry_t;
+
+typedef struct {
+ ZSTD_window_t window; /* State for the window round buffer management */
+ ldmEntry_t* hashTable;
+ U32 loadedDictEnd;
+ BYTE* bucketOffsets; /* Next position in bucket to insert entry */
+ U64 hashPower; /* Used to compute the rolling hash.
+ * Depends on ldmParams.minMatchLength */
+} ldmState_t;
+
+typedef struct {
+ U32 enableLdm; /* 1 if enable long distance matching */
+ U32 hashLog; /* Log size of hashTable */
+ U32 bucketSizeLog; /* Log bucket size for collision resolution, at most 8 */
+ U32 minMatchLength; /* Minimum match length */
+ U32 hashRateLog; /* Log number of entries to skip */
+ U32 windowLog; /* Window log for the LDM */
+} ldmParams_t;
+
+typedef struct {
+ U32 offset;
+ U32 litLength;
+ U32 matchLength;
+} rawSeq;
+
+typedef struct {
+ rawSeq* seq; /* The start of the sequences */
+ size_t pos; /* The position where reading stopped. <= size. */
+ size_t size; /* The number of sequences. <= capacity. */
+ size_t capacity; /* The capacity starting from `seq` pointer */
+} rawSeqStore_t;
+
+typedef struct {
+ int collectSequences;
+ ZSTD_Sequence* seqStart;
+ size_t seqIndex;
+ size_t maxSequences;
+} SeqCollector;
+
+struct ZSTD_CCtx_params_s {
+ ZSTD_format_e format;
+ ZSTD_compressionParameters cParams;
+ ZSTD_frameParameters fParams;
+
+ int compressionLevel;
+ int forceWindow; /* force back-references to respect limit of
+ * 1<<wLog, even for dictionary */
+ size_t targetCBlockSize; /* Tries to fit compressed block size to be around targetCBlockSize.
+ * No target when targetCBlockSize == 0.
+ * There is no guarantee on compressed block size */
+ int srcSizeHint; /* User's best guess of source size.
+ * Hint is not valid when srcSizeHint == 0.
+ * There is no guarantee that hint is close to actual source size */
+
+ ZSTD_dictAttachPref_e attachDictPref;
+ ZSTD_literalCompressionMode_e literalCompressionMode;
+
+ /* Multithreading: used to pass parameters to mtctx */
+ int nbWorkers;
+ size_t jobSize;
+ int overlapLog;
+ int rsyncable;
+
+ /* Long distance matching parameters */
+ ldmParams_t ldmParams;
+
+ /* Internal use, for createCCtxParams() and freeCCtxParams() only */
+ ZSTD_customMem customMem;
+}; /* typedef'd to ZSTD_CCtx_params within "zstd.h" */
+
+struct ZSTD_CCtx_s {
+ ZSTD_compressionStage_e stage;
+ int cParamsChanged; /* == 1 if cParams(except wlog) or compression level are changed in requestedParams. Triggers transmission of new params to ZSTDMT (if available) then reset to 0. */
+ int bmi2; /* == 1 if the CPU supports BMI2 and 0 otherwise. CPU support is determined dynamically once per context lifetime. */
+ ZSTD_CCtx_params requestedParams;
+ ZSTD_CCtx_params appliedParams;
+ U32 dictID;
+
+ ZSTD_cwksp workspace; /* manages buffer for dynamic allocations */
+ size_t blockSize;
+ unsigned long long pledgedSrcSizePlusOne; /* this way, 0 (default) == unknown */
+ unsigned long long consumedSrcSize;
+ unsigned long long producedCSize;
+ XXH64_state_t xxhState;
+ ZSTD_customMem customMem;
+ size_t staticSize;
+ SeqCollector seqCollector;
+ int isFirstBlock;
+ int initialized;
+
+ seqStore_t seqStore; /* sequences storage ptrs */
+ ldmState_t ldmState; /* long distance matching state */
+ rawSeq* ldmSequences; /* Storage for the ldm output sequences */
+ size_t maxNbLdmSequences;
+ rawSeqStore_t externSeqStore; /* Mutable reference to external sequences */
+ ZSTD_blockState_t blockState;
+ U32* entropyWorkspace; /* entropy workspace of HUF_WORKSPACE_SIZE bytes */
+
+ /* streaming */
+ char* inBuff;
+ size_t inBuffSize;
+ size_t inToCompress;
+ size_t inBuffPos;
+ size_t inBuffTarget;
+ char* outBuff;
+ size_t outBuffSize;
+ size_t outBuffContentSize;
+ size_t outBuffFlushedSize;
+ ZSTD_cStreamStage streamStage;
+ U32 frameEnded;
+
+ /* Dictionary */
+ ZSTD_localDict localDict;
+ const ZSTD_CDict* cdict;
+ ZSTD_prefixDict prefixDict; /* single-usage dictionary */
+
+ /* Multi-threading */
+#ifdef ZSTD_MULTITHREAD
+ ZSTDMT_CCtx* mtctx;
+#endif
+};
+
+typedef enum { ZSTD_dtlm_fast, ZSTD_dtlm_full } ZSTD_dictTableLoadMethod_e;
+
+typedef enum { ZSTD_noDict = 0, ZSTD_extDict = 1, ZSTD_dictMatchState = 2 } ZSTD_dictMode_e;
+
+
+typedef size_t (*ZSTD_blockCompressor) (
+ ZSTD_matchState_t* bs, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize);
+ZSTD_blockCompressor ZSTD_selectBlockCompressor(ZSTD_strategy strat, ZSTD_dictMode_e dictMode);
+
+
+MEM_STATIC U32 ZSTD_LLcode(U32 litLength)
+{
+ static const BYTE LL_Code[64] = { 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 16, 17, 17, 18, 18, 19, 19,
+ 20, 20, 20, 20, 21, 21, 21, 21,
+ 22, 22, 22, 22, 22, 22, 22, 22,
+ 23, 23, 23, 23, 23, 23, 23, 23,
+ 24, 24, 24, 24, 24, 24, 24, 24,
+ 24, 24, 24, 24, 24, 24, 24, 24 };
+ static const U32 LL_deltaCode = 19;
+ return (litLength > 63) ? ZSTD_highbit32(litLength) + LL_deltaCode : LL_Code[litLength];
+}
+
+/* ZSTD_MLcode() :
+ * note : mlBase = matchLength - MINMATCH;
+ * because that's the format in which it's stored in seqStore->sequences */
+MEM_STATIC U32 ZSTD_MLcode(U32 mlBase)
+{
+ static const BYTE ML_Code[128] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 32, 33, 33, 34, 34, 35, 35, 36, 36, 36, 36, 37, 37, 37, 37,
+ 38, 38, 38, 38, 38, 38, 38, 38, 39, 39, 39, 39, 39, 39, 39, 39,
+ 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40,
+ 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41,
+ 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42,
+ 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42 };
+ static const U32 ML_deltaCode = 36;
+ return (mlBase > 127) ? ZSTD_highbit32(mlBase) + ML_deltaCode : ML_Code[mlBase];
+}
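+
+/* Worked examples (illustrative): small lengths map directly through the
+ * tables above, large lengths fall into logarithmic buckets.
+ *   ZSTD_LLcode(7)   == 7    (direct entry)
+ *   ZSTD_LLcode(32)  == 22   (LL_Code[32])
+ *   ZSTD_LLcode(100) == 25   (100 > 63 : ZSTD_highbit32(100) = 6, plus LL_deltaCode = 19)
+ *   ZSTD_MLcode(50)  == 38   (ML_Code[50])
+ *   ZSTD_MLcode(200) == 43   (200 > 127 : ZSTD_highbit32(200) = 7, plus ML_deltaCode = 36)
+ */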
+
+typedef struct repcodes_s {
+ U32 rep[3];
+} repcodes_t;
+
+MEM_STATIC repcodes_t ZSTD_updateRep(U32 const rep[3], U32 const offset, U32 const ll0)
+{
+ repcodes_t newReps;
+ if (offset >= ZSTD_REP_NUM) { /* full offset */
+ newReps.rep[2] = rep[1];
+ newReps.rep[1] = rep[0];
+ newReps.rep[0] = offset - ZSTD_REP_MOVE;
+ } else { /* repcode */
+ U32 const repCode = offset + ll0;
+ if (repCode > 0) { /* note : if repCode==0, no change */
+ U32 const currentOffset = (repCode==ZSTD_REP_NUM) ? (rep[0] - 1) : rep[repCode];
+ newReps.rep[2] = (repCode >= 2) ? rep[1] : rep[2];
+ newReps.rep[1] = rep[0];
+ newReps.rep[0] = currentOffset;
+ } else { /* repCode == 0 */
+ memcpy(&newReps, rep, sizeof(newReps));
+ }
+ }
+ return newReps;
+}
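+
+/* Worked example (illustrative), with rep = {r0, r1, r2} and ZSTD_REP_NUM == 3:
+ *   full offset o (o >= ZSTD_REP_NUM), any ll0 : new rep = { o - ZSTD_REP_MOVE, r0, r1 }
+ *   offset 0, ll0 == 0 (repCode 0)             : rep unchanged
+ *   offset 1, ll0 == 0 (repCode 1)             : new rep = { r1, r0, r2 }
+ *   offset 2, ll0 == 1 (repCode 3, "rep0 - 1") : new rep = { r0 - 1, r0, r1 }
+ */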
+
+/* ZSTD_cParam_withinBounds:
+ * @return 1 if value is within cParam bounds,
+ * 0 otherwise */
+MEM_STATIC int ZSTD_cParam_withinBounds(ZSTD_cParameter cParam, int value)
+{
+ ZSTD_bounds const bounds = ZSTD_cParam_getBounds(cParam);
+ if (ZSTD_isError(bounds.error)) return 0;
+ if (value < bounds.lowerBound) return 0;
+ if (value > bounds.upperBound) return 0;
+ return 1;
+}
+
+/* ZSTD_noCompressBlock() :
+ * Writes uncompressed block to dst buffer from given src.
+ * Returns the size of the block */
+MEM_STATIC size_t ZSTD_noCompressBlock (void* dst, size_t dstCapacity, const void* src, size_t srcSize, U32 lastBlock)
+{
+ U32 const cBlockHeader24 = lastBlock + (((U32)bt_raw)<<1) + (U32)(srcSize << 3);
+ RETURN_ERROR_IF(srcSize + ZSTD_blockHeaderSize > dstCapacity,
+ dstSize_tooSmall, "dst buf too small for uncompressed block");
+ MEM_writeLE24(dst, cBlockHeader24);
+ memcpy((BYTE*)dst + ZSTD_blockHeaderSize, src, srcSize);
+ return ZSTD_blockHeaderSize + srcSize;
+}
+
+MEM_STATIC size_t ZSTD_rleCompressBlock (void* dst, size_t dstCapacity, BYTE src, size_t srcSize, U32 lastBlock)
+{
+ BYTE* const op = (BYTE*)dst;
+ U32 const cBlockHeader = lastBlock + (((U32)bt_rle)<<1) + (U32)(srcSize << 3);
+ RETURN_ERROR_IF(dstCapacity < 4, dstSize_tooSmall, "");
+ MEM_writeLE24(op, cBlockHeader);
+ op[3] = src;
+ return 4;
+}
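+
+/* Worked example (illustrative), assuming bt_raw == 0 and bt_rle == 1 as
+ * defined in zstd_internal.h: an RLE block of 100 bytes of 0xAA that is also
+ * the last block produces
+ *   cBlockHeader = 1 + (1 << 1) + (100 << 3) = 803 = 0x000323
+ * written little-endian as 23 03 00, followed by the single byte AA,
+ * for a total of 4 bytes (the value returned by ZSTD_rleCompressBlock()).
+ */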
+
+
+/* ZSTD_minGain() :
+ * minimum compression required
+ * to generate a compressed block or a compressed literals section.
+ * note : use same formula for both situations */
+MEM_STATIC size_t ZSTD_minGain(size_t srcSize, ZSTD_strategy strat)
+{
+ U32 const minlog = (strat>=ZSTD_btultra) ? (U32)(strat) - 1 : 6;
+ ZSTD_STATIC_ASSERT(ZSTD_btultra == 8);
+ assert(ZSTD_cParam_withinBounds(ZSTD_c_strategy, strat));
+ return (srcSize >> minlog) + 2;
+}
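+
+/* Worked example (illustrative): for srcSize == 4096,
+ *   any strategy below ZSTD_btultra : minlog = 6, minGain = (4096 >> 6) + 2 = 66 bytes
+ *   ZSTD_btultra2 (== 9)            : minlog = 8, minGain = (4096 >> 8) + 2 = 18 bytes
+ * i.e. stronger strategies accept a smaller saving before emitting a compressed
+ * block or compressed literals.
+ */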
+
+MEM_STATIC int ZSTD_disableLiteralsCompression(const ZSTD_CCtx_params* cctxParams)
+{
+ switch (cctxParams->literalCompressionMode) {
+ case ZSTD_lcm_huffman:
+ return 0;
+ case ZSTD_lcm_uncompressed:
+ return 1;
+ default:
+ assert(0 /* impossible: pre-validated */);
+ /* fall-through */
+ case ZSTD_lcm_auto:
+ return (cctxParams->cParams.strategy == ZSTD_fast) && (cctxParams->cParams.targetLength > 0);
+ }
+}
+
+/*! ZSTD_safecopyLiterals() :
+ * memcpy() variant that won't read more than WILDCOPY_OVERLENGTH bytes past ilimit_w.
+ * Only called when the sequence ends past ilimit_w, so it only needs to be optimized for single
+ * large copies.
+ */
+static void ZSTD_safecopyLiterals(BYTE* op, BYTE const* ip, BYTE const* const iend, BYTE const* ilimit_w) {
+ assert(iend > ilimit_w);
+ if (ip <= ilimit_w) {
+ ZSTD_wildcopy(op, ip, ilimit_w - ip, ZSTD_no_overlap);
+ op += ilimit_w - ip;
+ ip = ilimit_w;
+ }
+ while (ip < iend) *op++ = *ip++;
+}
+
+/*! ZSTD_storeSeq() :
+ * Store a sequence (litlen, litPtr, offCode and mlBase) into seqStore_t.
+ * `offCode` : distance to match + ZSTD_REP_MOVE (values <= ZSTD_REP_MOVE are repCodes).
+ * `mlBase` : matchLength - MINMATCH
+ * Allowed to overread literals up to litLimit.
+*/
+HINT_INLINE UNUSED_ATTR
+void ZSTD_storeSeq(seqStore_t* seqStorePtr, size_t litLength, const BYTE* literals, const BYTE* litLimit, U32 offCode, size_t mlBase)
+{
+ BYTE const* const litLimit_w = litLimit - WILDCOPY_OVERLENGTH;
+ BYTE const* const litEnd = literals + litLength;
+#if defined(DEBUGLEVEL) && (DEBUGLEVEL >= 6)
+ static const BYTE* g_start = NULL;
+ if (g_start==NULL) g_start = (const BYTE*)literals; /* note : index only works for compression within a single segment */
+ { U32 const pos = (U32)((const BYTE*)literals - g_start);
+ DEBUGLOG(6, "Cpos%7u :%3u literals, match%4u bytes at offCode%7u",
+ pos, (U32)litLength, (U32)mlBase+MINMATCH, (U32)offCode);
+ }
+#endif
+ assert((size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart) < seqStorePtr->maxNbSeq);
+ /* copy Literals */
+ assert(seqStorePtr->maxNbLit <= 128 KB);
+ assert(seqStorePtr->lit + litLength <= seqStorePtr->litStart + seqStorePtr->maxNbLit);
+ assert(literals + litLength <= litLimit);
+ if (litEnd <= litLimit_w) {
+ /* Common case we can use wildcopy.
+ * First copy 16 bytes, because literals are likely short.
+ */
+ assert(WILDCOPY_OVERLENGTH >= 16);
+ ZSTD_copy16(seqStorePtr->lit, literals);
+ if (litLength > 16) {
+ ZSTD_wildcopy(seqStorePtr->lit+16, literals+16, (ptrdiff_t)litLength-16, ZSTD_no_overlap);
+ }
+ } else {
+ ZSTD_safecopyLiterals(seqStorePtr->lit, literals, litEnd, litLimit_w);
+ }
+ seqStorePtr->lit += litLength;
+
+ /* literal Length */
+ if (litLength>0xFFFF) {
+ assert(seqStorePtr->longLengthID == 0); /* there can only be a single long length */
+ seqStorePtr->longLengthID = 1;
+ seqStorePtr->longLengthPos = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart);
+ }
+ seqStorePtr->sequences[0].litLength = (U16)litLength;
+
+ /* match offset */
+ seqStorePtr->sequences[0].offset = offCode + 1;
+
+ /* match Length */
+ if (mlBase>0xFFFF) {
+ assert(seqStorePtr->longLengthID == 0); /* there can only be a single long length */
+ seqStorePtr->longLengthID = 2;
+ seqStorePtr->longLengthPos = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart);
+ }
+ seqStorePtr->sequences[0].matchLength = (U16)mlBase;
+
+ seqStorePtr->sequences++;
+}
+
+
+/*-*************************************
+* Match length counter
+***************************************/
+static unsigned ZSTD_NbCommonBytes (size_t val)
+{
+ if (MEM_isLittleEndian()) {
+ if (MEM_64bits()) {
+# if defined(_MSC_VER) && defined(_WIN64)
+ unsigned long r = 0;
+ return _BitScanForward64( &r, (U64)val ) ? (unsigned)(r >> 3) : 0;
+# elif defined(__GNUC__) && (__GNUC__ >= 4)
+ return (__builtin_ctzll((U64)val) >> 3);
+# else
+ static const int DeBruijnBytePos[64] = { 0, 0, 0, 0, 0, 1, 1, 2,
+ 0, 3, 1, 3, 1, 4, 2, 7,
+ 0, 2, 3, 6, 1, 5, 3, 5,
+ 1, 3, 4, 4, 2, 5, 6, 7,
+ 7, 0, 1, 2, 3, 3, 4, 6,
+ 2, 6, 5, 5, 3, 4, 5, 6,
+ 7, 1, 2, 4, 6, 4, 4, 5,
+ 7, 2, 6, 5, 7, 6, 7, 7 };
+ return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58];
+# endif
+ } else { /* 32 bits */
+# if defined(_MSC_VER)
+ unsigned long r=0;
+ return _BitScanForward( &r, (U32)val ) ? (unsigned)(r >> 3) : 0;
+# elif defined(__GNUC__) && (__GNUC__ >= 3)
+ return (__builtin_ctz((U32)val) >> 3);
+# else
+ static const int DeBruijnBytePos[32] = { 0, 0, 3, 0, 3, 1, 3, 0,
+ 3, 2, 2, 1, 3, 2, 0, 1,
+ 3, 3, 1, 2, 2, 2, 2, 0,
+ 3, 1, 2, 0, 1, 0, 1, 1 };
+ return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27];
+# endif
+ }
+ } else { /* Big Endian CPU */
+ if (MEM_64bits()) {
+# if defined(_MSC_VER) && defined(_WIN64)
+ unsigned long r = 0;
+ return _BitScanReverse64( &r, val ) ? (unsigned)(r >> 3) : 0;
+# elif defined(__GNUC__) && (__GNUC__ >= 4)
+ return (__builtin_clzll(val) >> 3);
+# else
+ unsigned r;
+ const unsigned n32 = sizeof(size_t)*4; /* calculate this way due to compiler complaining in 32-bits mode */
+ if (!(val>>n32)) { r=4; } else { r=0; val>>=n32; }
+ if (!(val>>16)) { r+=2; val>>=8; } else { val>>=24; }
+ r += (!val);
+ return r;
+# endif
+ } else { /* 32 bits */
+# if defined(_MSC_VER)
+ unsigned long r = 0;
+ return _BitScanReverse( &r, (unsigned long)val ) ? (unsigned)(r >> 3) : 0;
+# elif defined(__GNUC__) && (__GNUC__ >= 3)
+ return (__builtin_clz((U32)val) >> 3);
+# else
+ unsigned r;
+ if (!(val>>16)) { r=2; val>>=8; } else { r=0; val>>=24; }
+ r += (!val);
+ return r;
+# endif
+ } }
+}
+
+
+MEM_STATIC size_t ZSTD_count(const BYTE* pIn, const BYTE* pMatch, const BYTE* const pInLimit)
+{
+ const BYTE* const pStart = pIn;
+ const BYTE* const pInLoopLimit = pInLimit - (sizeof(size_t)-1);
+
+ if (pIn < pInLoopLimit) {
+ { size_t const diff = MEM_readST(pMatch) ^ MEM_readST(pIn);
+ if (diff) return ZSTD_NbCommonBytes(diff); }
+ pIn+=sizeof(size_t); pMatch+=sizeof(size_t);
+ while (pIn < pInLoopLimit) {
+ size_t const diff = MEM_readST(pMatch) ^ MEM_readST(pIn);
+ if (!diff) { pIn+=sizeof(size_t); pMatch+=sizeof(size_t); continue; }
+ pIn += ZSTD_NbCommonBytes(diff);
+ return (size_t)(pIn - pStart);
+ } }
+ if (MEM_64bits() && (pIn<(pInLimit-3)) && (MEM_read32(pMatch) == MEM_read32(pIn))) { pIn+=4; pMatch+=4; }
+ if ((pIn<(pInLimit-1)) && (MEM_read16(pMatch) == MEM_read16(pIn))) { pIn+=2; pMatch+=2; }
+ if ((pIn<pInLimit) && (*pMatch == *pIn)) pIn++;
+ return (size_t)(pIn - pStart);
+}
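+
+/* Illustrative example (not part of the library): ZSTD_count() returns the
+ * length of the common prefix of two buffers, never reading at or beyond pInLimit.
+ *
+ *   const BYTE a[] = "abcdefgh12345678XYZ";
+ *   const BYTE b[] = "abcdefgh12345678ABC";
+ *   size_t const len = ZSTD_count(a, b, a + 19);   // == 16
+ *
+ * The XOR-then-ZSTD_NbCommonBytes() trick compares sizeof(size_t) bytes per
+ * iteration; the first non-zero XOR word pinpoints the mismatching byte.
+ */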
+
+/** ZSTD_count_2segments() :
+ * can count match length with `ip` & `match` in 2 different segments.
+ * convention : on reaching mEnd, match count continues starting from iStart
+ */
+MEM_STATIC size_t
+ZSTD_count_2segments(const BYTE* ip, const BYTE* match,
+ const BYTE* iEnd, const BYTE* mEnd, const BYTE* iStart)
+{
+ const BYTE* const vEnd = MIN( ip + (mEnd - match), iEnd);
+ size_t const matchLength = ZSTD_count(ip, match, vEnd);
+ if (match + matchLength != mEnd) return matchLength;
+ DEBUGLOG(7, "ZSTD_count_2segments: found a 2-parts match (current length==%zu)", matchLength);
+ DEBUGLOG(7, "distance from match beginning to end dictionary = %zi", mEnd - match);
+ DEBUGLOG(7, "distance from current pos to end buffer = %zi", iEnd - ip);
+ DEBUGLOG(7, "next byte : ip==%02X, istart==%02X", ip[matchLength], *iStart);
+ DEBUGLOG(7, "final match length = %zu", matchLength + ZSTD_count(ip+matchLength, iStart, iEnd));
+ return matchLength + ZSTD_count(ip+matchLength, iStart, iEnd);
+}
+
+
+/*-*************************************
+ * Hashes
+ ***************************************/
+static const U32 prime3bytes = 506832829U;
+static U32 ZSTD_hash3(U32 u, U32 h) { return ((u << (32-24)) * prime3bytes) >> (32-h) ; }
+MEM_STATIC size_t ZSTD_hash3Ptr(const void* ptr, U32 h) { return ZSTD_hash3(MEM_readLE32(ptr), h); } /* only in zstd_opt.h */
+
+static const U32 prime4bytes = 2654435761U;
+static U32 ZSTD_hash4(U32 u, U32 h) { return (u * prime4bytes) >> (32-h) ; }
+static size_t ZSTD_hash4Ptr(const void* ptr, U32 h) { return ZSTD_hash4(MEM_read32(ptr), h); }
+
+static const U64 prime5bytes = 889523592379ULL;
+static size_t ZSTD_hash5(U64 u, U32 h) { return (size_t)(((u << (64-40)) * prime5bytes) >> (64-h)) ; }
+static size_t ZSTD_hash5Ptr(const void* p, U32 h) { return ZSTD_hash5(MEM_readLE64(p), h); }
+
+static const U64 prime6bytes = 227718039650203ULL;
+static size_t ZSTD_hash6(U64 u, U32 h) { return (size_t)(((u << (64-48)) * prime6bytes) >> (64-h)) ; }
+static size_t ZSTD_hash6Ptr(const void* p, U32 h) { return ZSTD_hash6(MEM_readLE64(p), h); }
+
+static const U64 prime7bytes = 58295818150454627ULL;
+static size_t ZSTD_hash7(U64 u, U32 h) { return (size_t)(((u << (64-56)) * prime7bytes) >> (64-h)) ; }
+static size_t ZSTD_hash7Ptr(const void* p, U32 h) { return ZSTD_hash7(MEM_readLE64(p), h); }
+
+static const U64 prime8bytes = 0xCF1BBCDCB7A56463ULL;
+static size_t ZSTD_hash8(U64 u, U32 h) { return (size_t)(((u) * prime8bytes) >> (64-h)) ; }
+static size_t ZSTD_hash8Ptr(const void* p, U32 h) { return ZSTD_hash8(MEM_readLE64(p), h); }
+
+MEM_STATIC size_t ZSTD_hashPtr(const void* p, U32 hBits, U32 mls)
+{
+ switch(mls)
+ {
+ default:
+ case 4: return ZSTD_hash4Ptr(p, hBits);
+ case 5: return ZSTD_hash5Ptr(p, hBits);
+ case 6: return ZSTD_hash6Ptr(p, hBits);
+ case 7: return ZSTD_hash7Ptr(p, hBits);
+ case 8: return ZSTD_hash8Ptr(p, hBits);
+ }
+}
+
+/** ZSTD_ipow() :
+ * Return base^exponent.
+ */
+static U64 ZSTD_ipow(U64 base, U64 exponent)
+{
+ U64 power = 1;
+ while (exponent) {
+ if (exponent & 1) power *= base;
+ exponent >>= 1;
+ base *= base;
+ }
+ return power;
+}
+
+#define ZSTD_ROLL_HASH_CHAR_OFFSET 10
+
+/** ZSTD_rollingHash_append() :
+ * Add the buffer to the hash value.
+ */
+static U64 ZSTD_rollingHash_append(U64 hash, void const* buf, size_t size)
+{
+ BYTE const* istart = (BYTE const*)buf;
+ size_t pos;
+ for (pos = 0; pos < size; ++pos) {
+ hash *= prime8bytes;
+ hash += istart[pos] + ZSTD_ROLL_HASH_CHAR_OFFSET;
+ }
+ return hash;
+}
+
+/** ZSTD_rollingHash_compute() :
+ * Compute the rolling hash value of the buffer.
+ */
+MEM_STATIC U64 ZSTD_rollingHash_compute(void const* buf, size_t size)
+{
+ return ZSTD_rollingHash_append(0, buf, size);
+}
+
+/** ZSTD_rollingHash_primePower() :
+ * Compute the primePower to be passed to ZSTD_rollingHash_rotate() for a hash
+ * over a window of length bytes.
+ */
+MEM_STATIC U64 ZSTD_rollingHash_primePower(U32 length)
+{
+ return ZSTD_ipow(prime8bytes, length - 1);
+}
+
+/** ZSTD_rollingHash_rotate() :
+ * Rotate the rolling hash by one byte.
+ */
+MEM_STATIC U64 ZSTD_rollingHash_rotate(U64 hash, BYTE toRemove, BYTE toAdd, U64 primePower)
+{
+ hash -= (toRemove + ZSTD_ROLL_HASH_CHAR_OFFSET) * primePower;
+ hash *= prime8bytes;
+ hash += toAdd + ZSTD_ROLL_HASH_CHAR_OFFSET;
+ return hash;
+}
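+
+/* Illustrative example (not part of the library): maintaining a rolling hash
+ * over a 16-byte window of a hypothetical buffer `buf`.
+ *
+ *   U64 const primePower = ZSTD_rollingHash_primePower(16);
+ *   U64 hash = ZSTD_rollingHash_compute(buf, 16);                    // hash of buf[0..15]
+ *   hash = ZSTD_rollingHash_rotate(hash, buf[0], buf[16], primePower);
+ *   // hash now equals ZSTD_rollingHash_compute(buf + 1, 16),
+ *   // without rehashing the whole window.
+ */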
+
+/*-*************************************
+* Round buffer management
+***************************************/
+#if (ZSTD_WINDOWLOG_MAX_64 > 31)
+# error "ZSTD_WINDOWLOG_MAX is too large : would overflow ZSTD_CURRENT_MAX"
+#endif
+/* Max current allowed */
+#define ZSTD_CURRENT_MAX ((3U << 29) + (1U << ZSTD_WINDOWLOG_MAX))
+/* Maximum chunk size before overflow correction needs to be called again */
+#define ZSTD_CHUNKSIZE_MAX \
+ ( ((U32)-1) /* Maximum ending current index */ \
+ - ZSTD_CURRENT_MAX) /* Maximum beginning lowLimit */
+
+/**
+ * ZSTD_window_clear():
+ * Clears the window containing the history by simply setting it to empty.
+ */
+MEM_STATIC void ZSTD_window_clear(ZSTD_window_t* window)
+{
+ size_t const endT = (size_t)(window->nextSrc - window->base);
+ U32 const end = (U32)endT;
+
+ window->lowLimit = end;
+ window->dictLimit = end;
+}
+
+/**
+ * ZSTD_window_hasExtDict():
+ * Returns non-zero if the window has a non-empty extDict.
+ */
+MEM_STATIC U32 ZSTD_window_hasExtDict(ZSTD_window_t const window)
+{
+ return window.lowLimit < window.dictLimit;
+}
+
+/**
+ * ZSTD_matchState_dictMode():
+ * Inspects the provided matchState and figures out what dictMode should be
+ * passed to the compressor.
+ */
+MEM_STATIC ZSTD_dictMode_e ZSTD_matchState_dictMode(const ZSTD_matchState_t *ms)
+{
+ return ZSTD_window_hasExtDict(ms->window) ?
+ ZSTD_extDict :
+ ms->dictMatchState != NULL ?
+ ZSTD_dictMatchState :
+ ZSTD_noDict;
+}
+
+/**
+ * ZSTD_window_needOverflowCorrection():
+ * Returns non-zero if the indices are getting too large and need overflow
+ * protection.
+ */
+MEM_STATIC U32 ZSTD_window_needOverflowCorrection(ZSTD_window_t const window,
+ void const* srcEnd)
+{
+ U32 const current = (U32)((BYTE const*)srcEnd - window.base);
+ return current > ZSTD_CURRENT_MAX;
+}
+
+/**
+ * ZSTD_window_correctOverflow():
+ * Reduces the indices to protect from index overflow.
+ * Returns the correction made to the indices, which must be applied to every
+ * stored index.
+ *
+ * The least significant cycleLog bits of the indices must remain the same,
+ * which may be 0. Every index up to maxDist in the past must be valid.
+ * NOTE: (maxDist & cycleMask) must be zero.
+ */
+MEM_STATIC U32 ZSTD_window_correctOverflow(ZSTD_window_t* window, U32 cycleLog,
+ U32 maxDist, void const* src)
+{
+ /* preemptive overflow correction:
+ * 1. correction is large enough:
+ * lowLimit > (3<<29) ==> current > 3<<29 + 1<<windowLog
+ * 1<<windowLog <= newCurrent < 1<<chainLog + 1<<windowLog
+ *
+ * current - newCurrent
+ * > (3<<29 + 1<<windowLog) - (1<<windowLog + 1<<chainLog)
+ * > (3<<29) - (1<<chainLog)
+ * > (3<<29) - (1<<30) (NOTE: chainLog <= 30)
+ * > 1<<29
+ *
+ * 2. (ip+ZSTD_CHUNKSIZE_MAX - cctx->base) doesn't overflow:
+ * After correction, current is less than (1<<chainLog + 1<<windowLog).
+ * In 64-bit mode we are safe, because we have 64-bit ptrdiff_t.
+ * In 32-bit mode we are safe, because (chainLog <= 29), so
+ * ip+ZSTD_CHUNKSIZE_MAX - cctx->base < 1<<32.
+ * 3. (cctx->lowLimit + 1<<windowLog) < 1<<32:
+ * windowLog <= 31 ==> 3<<29 + 1<<windowLog < 7<<29 < 1<<32.
+ */
+ U32 const cycleMask = (1U << cycleLog) - 1;
+ U32 const current = (U32)((BYTE const*)src - window->base);
+ U32 const currentCycle0 = current & cycleMask;
+ /* Exclude zero so that newCurrent - maxDist >= 1. */
+ U32 const currentCycle1 = currentCycle0 == 0 ? (1U << cycleLog) : currentCycle0;
+ U32 const newCurrent = currentCycle1 + maxDist;
+ U32 const correction = current - newCurrent;
+ assert((maxDist & cycleMask) == 0);
+ assert(current > newCurrent);
+ /* Loose bound, should be around 1<<29 (see above) */
+ assert(correction > 1<<28);
+
+ window->base += correction;
+ window->dictBase += correction;
+ if (window->lowLimit <= correction) window->lowLimit = 1;
+ else window->lowLimit -= correction;
+ if (window->dictLimit <= correction) window->dictLimit = 1;
+ else window->dictLimit -= correction;
+
+ /* Ensure we can still reference the full window. */
+ assert(newCurrent >= maxDist);
+ assert(newCurrent - maxDist >= 1);
+ /* Ensure that lowLimit and dictLimit didn't underflow. */
+ assert(window->lowLimit <= newCurrent);
+ assert(window->dictLimit <= newCurrent);
+
+ DEBUGLOG(4, "Correction of 0x%x bytes to lowLimit=0x%x", correction,
+ window->lowLimit);
+ return correction;
+}
+
+/**
+ * ZSTD_window_enforceMaxDist():
+ * Updates lowLimit so that:
+ * (srcEnd - base) - lowLimit == maxDist + loadedDictEnd
+ *
+ * It ensures index is valid as long as index >= lowLimit.
+ * This must be called before a block compression call.
+ *
+ * loadedDictEnd is only defined if a dictionary is in use for current compression.
+ * As the name implies, loadedDictEnd represents the index at end of dictionary.
+ * The value lies within context's referential, it can be directly compared to blockEndIdx.
+ *
+ * If loadedDictEndPtr is NULL, no dictionary is in use, and we use loadedDictEnd == 0.
+ * If loadedDictEndPtr is not NULL, we set it to zero after updating lowLimit.
+ * This is because dictionaries are allowed to be referenced fully
+ * as long as the last byte of the dictionary is in the window.
+ * Once input has progressed beyond window size, dictionary cannot be referenced anymore.
+ *
+ * In normal dict mode, the dictionary lies between lowLimit and dictLimit.
+ * In dictMatchState mode, lowLimit and dictLimit are the same,
+ * and the dictionary is below them.
+ * forceWindow and dictMatchState are therefore incompatible.
+ */
+MEM_STATIC void
+ZSTD_window_enforceMaxDist(ZSTD_window_t* window,
+ const void* blockEnd,
+ U32 maxDist,
+ U32* loadedDictEndPtr,
+ const ZSTD_matchState_t** dictMatchStatePtr)
+{
+ U32 const blockEndIdx = (U32)((BYTE const*)blockEnd - window->base);
+ U32 const loadedDictEnd = (loadedDictEndPtr != NULL) ? *loadedDictEndPtr : 0;
+ DEBUGLOG(5, "ZSTD_window_enforceMaxDist: blockEndIdx=%u, maxDist=%u, loadedDictEnd=%u",
+ (unsigned)blockEndIdx, (unsigned)maxDist, (unsigned)loadedDictEnd);
+
+ /* - When there is no dictionary : loadedDictEnd == 0.
+ In which case, the test (blockEndIdx > maxDist) is merely to avoid
+         overflowing the next operation `newLowLimit = blockEndIdx - maxDist`.
+ - When there is a standard dictionary :
+ Index referential is copied from the dictionary,
+ which means it starts from 0.
+ In which case, loadedDictEnd == dictSize,
+ and it makes sense to compare `blockEndIdx > maxDist + dictSize`
+ since `blockEndIdx` also starts from zero.
+ - When there is an attached dictionary :
+ loadedDictEnd is expressed within the referential of the context,
+ so it can be directly compared against blockEndIdx.
+ */
+ if (blockEndIdx > maxDist + loadedDictEnd) {
+ U32 const newLowLimit = blockEndIdx - maxDist;
+ if (window->lowLimit < newLowLimit) window->lowLimit = newLowLimit;
+ if (window->dictLimit < window->lowLimit) {
+ DEBUGLOG(5, "Update dictLimit to match lowLimit, from %u to %u",
+ (unsigned)window->dictLimit, (unsigned)window->lowLimit);
+ window->dictLimit = window->lowLimit;
+ }
+ /* On reaching window size, dictionaries are invalidated */
+ if (loadedDictEndPtr) *loadedDictEndPtr = 0;
+ if (dictMatchStatePtr) *dictMatchStatePtr = NULL;
+ }
+}
+
+/* Similar to ZSTD_window_enforceMaxDist(),
+ * but only invalidates dictionary
+ * when input progresses beyond window size.
+ * assumption : loadedDictEndPtr and dictMatchStatePtr are valid (non-NULL)
+ * loadedDictEnd uses same referential as window->base
+ * maxDist is the window size */
+MEM_STATIC void
+ZSTD_checkDictValidity(const ZSTD_window_t* window,
+ const void* blockEnd,
+ U32 maxDist,
+ U32* loadedDictEndPtr,
+ const ZSTD_matchState_t** dictMatchStatePtr)
+{
+ assert(loadedDictEndPtr != NULL);
+ assert(dictMatchStatePtr != NULL);
+ { U32 const blockEndIdx = (U32)((BYTE const*)blockEnd - window->base);
+ U32 const loadedDictEnd = *loadedDictEndPtr;
+ DEBUGLOG(5, "ZSTD_checkDictValidity: blockEndIdx=%u, maxDist=%u, loadedDictEnd=%u",
+ (unsigned)blockEndIdx, (unsigned)maxDist, (unsigned)loadedDictEnd);
+ assert(blockEndIdx >= loadedDictEnd);
+
+ if (blockEndIdx > loadedDictEnd + maxDist) {
+ /* On reaching window size, dictionaries are invalidated.
+             * For simplification, if the window size is reached anywhere within the next block,
+ * the dictionary is invalidated for the full block.
+ */
+ DEBUGLOG(6, "invalidating dictionary for current block (distance > windowSize)");
+ *loadedDictEndPtr = 0;
+ *dictMatchStatePtr = NULL;
+ } else {
+ if (*loadedDictEndPtr != 0) {
+ DEBUGLOG(6, "dictionary considered valid for current block");
+ } } }
+}
+
+MEM_STATIC void ZSTD_window_init(ZSTD_window_t* window) {
+ memset(window, 0, sizeof(*window));
+ window->base = (BYTE const*)"";
+ window->dictBase = (BYTE const*)"";
+ window->dictLimit = 1; /* start from 1, so that 1st position is valid */
+ window->lowLimit = 1; /* it ensures first and later CCtx usages compress the same */
+ window->nextSrc = window->base + 1; /* see issue #1241 */
+}
+
+/**
+ * ZSTD_window_update():
+ * Updates the window by appending [src, src + srcSize) to the window.
+ * If it is not contiguous, the current prefix becomes the extDict, and we
+ * forget about the old extDict. Handles overlap of the prefix and extDict.
+ * Returns non-zero if the segment is contiguous.
+ */
+MEM_STATIC U32 ZSTD_window_update(ZSTD_window_t* window,
+ void const* src, size_t srcSize)
+{
+ BYTE const* const ip = (BYTE const*)src;
+ U32 contiguous = 1;
+ DEBUGLOG(5, "ZSTD_window_update");
+ if (srcSize == 0)
+ return contiguous;
+ assert(window->base != NULL);
+ assert(window->dictBase != NULL);
+ /* Check if blocks follow each other */
+ if (src != window->nextSrc) {
+ /* not contiguous */
+ size_t const distanceFromBase = (size_t)(window->nextSrc - window->base);
+ DEBUGLOG(5, "Non contiguous blocks, new segment starts at %u", window->dictLimit);
+ window->lowLimit = window->dictLimit;
+ assert(distanceFromBase == (size_t)(U32)distanceFromBase); /* should never overflow */
+ window->dictLimit = (U32)distanceFromBase;
+ window->dictBase = window->base;
+ window->base = ip - distanceFromBase;
+ /* ms->nextToUpdate = window->dictLimit; */
+ if (window->dictLimit - window->lowLimit < HASH_READ_SIZE) window->lowLimit = window->dictLimit; /* too small extDict */
+ contiguous = 0;
+ }
+ window->nextSrc = ip + srcSize;
+ /* if input and dictionary overlap : reduce dictionary (area presumed modified by input) */
+ if ( (ip+srcSize > window->dictBase + window->lowLimit)
+ & (ip < window->dictBase + window->dictLimit)) {
+ ptrdiff_t const highInputIdx = (ip + srcSize) - window->dictBase;
+ U32 const lowLimitMax = (highInputIdx > (ptrdiff_t)window->dictLimit) ? window->dictLimit : (U32)highInputIdx;
+ window->lowLimit = lowLimitMax;
+ DEBUGLOG(5, "Overlapping extDict and input : new lowLimit = %u", window->lowLimit);
+ }
+ return contiguous;
+}
+
+/**
+ * Returns the lowest allowed match index. It may either be in the ext-dict or the prefix.
+ */
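+/* Illustrative note (added, not upstream): with windowLog=20, current=5000000,
+ * lowLimit=1 and no loaded dictionary, current - lowestValid exceeds 1<<20, so the
+ * lowest allowed match index is current - (1<<20) = 3951424. When a dictionary is
+ * loaded (loadedDictEnd != 0), the full range down to lowLimit stays referenceable. */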
+MEM_STATIC U32 ZSTD_getLowestMatchIndex(const ZSTD_matchState_t* ms, U32 current, unsigned windowLog)
+{
+ U32 const maxDistance = 1U << windowLog;
+ U32 const lowestValid = ms->window.lowLimit;
+ U32 const withinWindow = (current - lowestValid > maxDistance) ? current - maxDistance : lowestValid;
+ U32 const isDictionary = (ms->loadedDictEnd != 0);
+ U32 const matchLowest = isDictionary ? lowestValid : withinWindow;
+ return matchLowest;
+}
+
+/**
+ * Returns the lowest allowed match index in the prefix.
+ */
+MEM_STATIC U32 ZSTD_getLowestPrefixIndex(const ZSTD_matchState_t* ms, U32 current, unsigned windowLog)
+{
+ U32 const maxDistance = 1U << windowLog;
+ U32 const lowestValid = ms->window.dictLimit;
+ U32 const withinWindow = (current - lowestValid > maxDistance) ? current - maxDistance : lowestValid;
+ U32 const isDictionary = (ms->loadedDictEnd != 0);
+ U32 const matchLowest = isDictionary ? lowestValid : withinWindow;
+ return matchLowest;
+}
+
+
+
+/* debug functions */
+#if (DEBUGLEVEL>=2)
+
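+/* Added explanatory note (not upstream): ZSTD_fWeight(x) approximates log2(x+1),
+ * up to a constant offset, in 1/256 steps: the high bit gives the integer part and a
+ * linear interpolation gives the fraction. ZSTD_debugTable() below uses the difference
+ * ZSTD_fWeight(sum) - ZSTD_fWeight(count) as an estimate of the per-symbol bit cost
+ * -log2(count/sum). */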
+MEM_STATIC double ZSTD_fWeight(U32 rawStat)
+{
+ U32 const fp_accuracy = 8;
+ U32 const fp_multiplier = (1 << fp_accuracy);
+ U32 const newStat = rawStat + 1;
+ U32 const hb = ZSTD_highbit32(newStat);
+ U32 const BWeight = hb * fp_multiplier;
+ U32 const FWeight = (newStat << fp_accuracy) >> hb;
+ U32 const weight = BWeight + FWeight;
+ assert(hb + fp_accuracy < 31);
+ return (double)weight / fp_multiplier;
+}
+
+/* display a table content,
+ * listing each element, its frequency, and its predicted bit cost */
+MEM_STATIC void ZSTD_debugTable(const U32* table, U32 max)
+{
+ unsigned u, sum;
+ for (u=0, sum=0; u<=max; u++) sum += table[u];
+ DEBUGLOG(2, "total nb elts: %u", sum);
+ for (u=0; u<=max; u++) {
+ DEBUGLOG(2, "%2u: %5u (%.2f)",
+ u, table[u], ZSTD_fWeight(sum) - ZSTD_fWeight(table[u]) );
+ }
+}
+
+#endif
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+/* ===============================================================
+ * Shared internal declarations
+ * These prototypes may be called from sources not in lib/compress
+ * =============================================================== */
+
+/* ZSTD_loadCEntropy() :
+ * dict : must point at the beginning of a valid zstd dictionary.
+ * return : size of dictionary header (size of magic number + dict ID + entropy tables)
+ * assumptions : the magic number is assumed to have been checked already
+ * and dictSize >= 8 */
+size_t ZSTD_loadCEntropy(ZSTD_compressedBlockState_t* bs, void* workspace,
+ short* offcodeNCount, unsigned* offcodeMaxValue,
+ const void* const dict, size_t dictSize);
+
+void ZSTD_reset_compressedBlockState(ZSTD_compressedBlockState_t* bs);
+
+/* ==============================================================
+ * Private declarations
+ * These prototypes shall only be called from within lib/compress
+ * ============================================================== */
+
+/* ZSTD_getCParamsFromCCtxParams() :
+ * cParams are built depending on compressionLevel, src size hints,
+ * LDM and manually set compression parameters.
+ * Note: srcSizeHint == 0 means a source size of exactly 0 (it is not interpreted as "unknown")!
+ */
+ZSTD_compressionParameters ZSTD_getCParamsFromCCtxParams(
+ const ZSTD_CCtx_params* CCtxParams, U64 srcSizeHint, size_t dictSize);
+
+/*! ZSTD_initCStream_internal() :
+ * Private use only. Init streaming operation.
+ * expects params to be valid.
+ * must receive dict, or cdict, or none, but not both.
+ * @return : 0, or an error code */
+size_t ZSTD_initCStream_internal(ZSTD_CStream* zcs,
+ const void* dict, size_t dictSize,
+ const ZSTD_CDict* cdict,
+ const ZSTD_CCtx_params* params, unsigned long long pledgedSrcSize);
+
+void ZSTD_resetSeqStore(seqStore_t* ssPtr);
+
+/*! ZSTD_getCParamsFromCDict() :
+ * as the name implies */
+ZSTD_compressionParameters ZSTD_getCParamsFromCDict(const ZSTD_CDict* cdict);
+
+/* ZSTD_compressBegin_advanced_internal() :
+ * Private use only. To be called from zstdmt_compress.c. */
+size_t ZSTD_compressBegin_advanced_internal(ZSTD_CCtx* cctx,
+ const void* dict, size_t dictSize,
+ ZSTD_dictContentType_e dictContentType,
+ ZSTD_dictTableLoadMethod_e dtlm,
+ const ZSTD_CDict* cdict,
+ const ZSTD_CCtx_params* params,
+ unsigned long long pledgedSrcSize);
+
+/* ZSTD_compress_advanced_internal() :
+ * Private use only. To be called from zstdmt_compress.c. */
+size_t ZSTD_compress_advanced_internal(ZSTD_CCtx* cctx,
+ void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize,
+ const void* dict,size_t dictSize,
+ const ZSTD_CCtx_params* params);
+
+
+/* ZSTD_writeLastEmptyBlock() :
+ * output an empty Block with end-of-frame mark to complete a frame
+ * @return : size of data written into `dst` (== ZSTD_blockHeaderSize (defined in zstd_internal.h))
+ * or an error code if `dstCapacity` is too small (<ZSTD_blockHeaderSize)
+ */
+size_t ZSTD_writeLastEmptyBlock(void* dst, size_t dstCapacity);
+
+
+/* ZSTD_referenceExternalSequences() :
+ * Must be called before starting a compression operation.
+ * seqs must parse a prefix of the source.
+ * This cannot be used when long range matching is enabled.
+ * Zstd will use these sequences, and pass the literals to a secondary block
+ * compressor.
+ * @return : An error code on failure.
+ * NOTE: seqs are not verified! Invalid sequences can cause out-of-bounds memory
+ * access and data corruption.
+ */
+size_t ZSTD_referenceExternalSequences(ZSTD_CCtx* cctx, rawSeq* seq, size_t nbSeq);
+
+/** ZSTD_cycleLog() :
+ * condition for correct operation : hashLog > 1 */
+U32 ZSTD_cycleLog(U32 hashLog, ZSTD_strategy strat);
+
+#endif /* ZSTD_COMPRESS_H */
+/**** ended inlining zstd_compress_internal.h ****/
+
+
+size_t ZSTD_noCompressLiterals (void* dst, size_t dstCapacity, const void* src, size_t srcSize);
+
+size_t ZSTD_compressRleLiteralsBlock (void* dst, size_t dstCapacity, const void* src, size_t srcSize);
+
+size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf,
+ ZSTD_hufCTables_t* nextHuf,
+ ZSTD_strategy strategy, int disableLiteralCompression,
+ void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize,
+ void* entropyWorkspace, size_t entropyWorkspaceSize,
+ const int bmi2);
+
+#endif /* ZSTD_COMPRESS_LITERALS_H */
+/**** ended inlining zstd_compress_literals.h ****/
+
+size_t ZSTD_noCompressLiterals (void* dst, size_t dstCapacity, const void* src, size_t srcSize)
+{
+ BYTE* const ostart = (BYTE* const)dst;
+ U32 const flSize = 1 + (srcSize>31) + (srcSize>4095);
+
+ RETURN_ERROR_IF(srcSize + flSize > dstCapacity, dstSize_tooSmall, "");
+
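+    /* Added note (not upstream): the "2 - 1 - 5" style comments below give the header
+     * bit layout: 2 bits for the literals block type, 1 or 2 bits for the size format,
+     * and the remaining 5/12/20 bits for the regenerated size, written little-endian. */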
+ switch(flSize)
+ {
+ case 1: /* 2 - 1 - 5 */
+ ostart[0] = (BYTE)((U32)set_basic + (srcSize<<3));
+ break;
+ case 2: /* 2 - 2 - 12 */
+ MEM_writeLE16(ostart, (U16)((U32)set_basic + (1<<2) + (srcSize<<4)));
+ break;
+ case 3: /* 2 - 2 - 20 */
+ MEM_writeLE32(ostart, (U32)((U32)set_basic + (3<<2) + (srcSize<<4)));
+ break;
+ default: /* not necessary : flSize is {1,2,3} */
+ assert(0);
+ }
+
+ memcpy(ostart + flSize, src, srcSize);
+ DEBUGLOG(5, "Raw literals: %u -> %u", (U32)srcSize, (U32)(srcSize + flSize));
+ return srcSize + flSize;
+}
+
+size_t ZSTD_compressRleLiteralsBlock (void* dst, size_t dstCapacity, const void* src, size_t srcSize)
+{
+ BYTE* const ostart = (BYTE* const)dst;
+ U32 const flSize = 1 + (srcSize>31) + (srcSize>4095);
+
+ (void)dstCapacity; /* dstCapacity already guaranteed to be >=4, hence large enough */
+
+ switch(flSize)
+ {
+ case 1: /* 2 - 1 - 5 */
+ ostart[0] = (BYTE)((U32)set_rle + (srcSize<<3));
+ break;
+ case 2: /* 2 - 2 - 12 */
+ MEM_writeLE16(ostart, (U16)((U32)set_rle + (1<<2) + (srcSize<<4)));
+ break;
+ case 3: /* 2 - 2 - 20 */
+ MEM_writeLE32(ostart, (U32)((U32)set_rle + (3<<2) + (srcSize<<4)));
+ break;
+ default: /* not necessary : flSize is {1,2,3} */
+ assert(0);
+ }
+
+ ostart[flSize] = *(const BYTE*)src;
+ DEBUGLOG(5, "RLE literals: %u -> %u", (U32)srcSize, (U32)flSize + 1);
+ return flSize+1;
+}
+
+size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf,
+ ZSTD_hufCTables_t* nextHuf,
+ ZSTD_strategy strategy, int disableLiteralCompression,
+ void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize,
+ void* entropyWorkspace, size_t entropyWorkspaceSize,
+ const int bmi2)
+{
+ size_t const minGain = ZSTD_minGain(srcSize, strategy);
+ size_t const lhSize = 3 + (srcSize >= 1 KB) + (srcSize >= 16 KB);
+ BYTE* const ostart = (BYTE*)dst;
+ U32 singleStream = srcSize < 256;
+ symbolEncodingType_e hType = set_compressed;
+ size_t cLitSize;
+
+ DEBUGLOG(5,"ZSTD_compressLiterals (disableLiteralCompression=%i srcSize=%u)",
+ disableLiteralCompression, (U32)srcSize);
+
+ /* Prepare nextEntropy assuming reusing the existing table */
+ memcpy(nextHuf, prevHuf, sizeof(*prevHuf));
+
+ if (disableLiteralCompression)
+ return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize);
+
+ /* small ? don't even attempt compression (speed opt) */
+# define COMPRESS_LITERALS_SIZE_MIN 63
+ { size_t const minLitSize = (prevHuf->repeatMode == HUF_repeat_valid) ? 6 : COMPRESS_LITERALS_SIZE_MIN;
+ if (srcSize <= minLitSize) return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize);
+ }
+
+ RETURN_ERROR_IF(dstCapacity < lhSize+1, dstSize_tooSmall, "not enough space for compression");
+ { HUF_repeat repeat = prevHuf->repeatMode;
+ int const preferRepeat = strategy < ZSTD_lazy ? srcSize <= 1024 : 0;
+ if (repeat == HUF_repeat_valid && lhSize == 3) singleStream = 1;
+ cLitSize = singleStream ?
+ HUF_compress1X_repeat(
+ ostart+lhSize, dstCapacity-lhSize, src, srcSize,
+ HUF_SYMBOLVALUE_MAX, HUF_TABLELOG_DEFAULT, entropyWorkspace, entropyWorkspaceSize,
+ (HUF_CElt*)nextHuf->CTable, &repeat, preferRepeat, bmi2) :
+ HUF_compress4X_repeat(
+ ostart+lhSize, dstCapacity-lhSize, src, srcSize,
+ HUF_SYMBOLVALUE_MAX, HUF_TABLELOG_DEFAULT, entropyWorkspace, entropyWorkspaceSize,
+ (HUF_CElt*)nextHuf->CTable, &repeat, preferRepeat, bmi2);
+ if (repeat != HUF_repeat_none) {
+ /* reused the existing table */
+ DEBUGLOG(5, "Reusing previous huffman table");
+ hType = set_repeat;
+ }
+ }
+
+ if ((cLitSize==0) | (cLitSize >= srcSize - minGain) | ERR_isError(cLitSize)) {
+ memcpy(nextHuf, prevHuf, sizeof(*prevHuf));
+ return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize);
+ }
+ if (cLitSize==1) {
+ memcpy(nextHuf, prevHuf, sizeof(*prevHuf));
+ return ZSTD_compressRleLiteralsBlock(dst, dstCapacity, src, srcSize);
+ }
+
+ if (hType == set_compressed) {
+ /* using a newly constructed table */
+ nextHuf->repeatMode = HUF_repeat_check;
+ }
+
+ /* Build header */
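+    /* Added note (not upstream): for compressed literals the "2 - 2 - 10 - 10" style
+     * comments give the header layout: 2 bits block type (hType), 2 bits size format,
+     * then the regenerated size and the compressed size, each in the stated number of
+     * bits, packed little-endian (3, 4 or 5 bytes total). */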
+ switch(lhSize)
+ {
+ case 3: /* 2 - 2 - 10 - 10 */
+ { U32 const lhc = hType + ((!singleStream) << 2) + ((U32)srcSize<<4) + ((U32)cLitSize<<14);
+ MEM_writeLE24(ostart, lhc);
+ break;
+ }
+ case 4: /* 2 - 2 - 14 - 14 */
+ { U32 const lhc = hType + (2 << 2) + ((U32)srcSize<<4) + ((U32)cLitSize<<18);
+ MEM_writeLE32(ostart, lhc);
+ break;
+ }
+ case 5: /* 2 - 2 - 18 - 18 */
+ { U32 const lhc = hType + (3 << 2) + ((U32)srcSize<<4) + ((U32)cLitSize<<22);
+ MEM_writeLE32(ostart, lhc);
+ ostart[4] = (BYTE)(cLitSize >> 10);
+ break;
+ }
+ default: /* not possible : lhSize is {3,4,5} */
+ assert(0);
+ }
+ DEBUGLOG(5, "Compressed literals: %u -> %u", (U32)srcSize, (U32)(lhSize+cLitSize));
+ return lhSize+cLitSize;
+}
+/**** ended inlining compress/zstd_compress_literals.c ****/
+/**** start inlining compress/zstd_compress_sequences.c ****/
+/*
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+ /*-*************************************
+ * Dependencies
+ ***************************************/
+/**** start inlining zstd_compress_sequences.h ****/
+/*
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTD_COMPRESS_SEQUENCES_H
+#define ZSTD_COMPRESS_SEQUENCES_H
+
+/**** skipping file: ../common/fse.h ****/
+/**** skipping file: ../common/zstd_internal.h ****/
+
+typedef enum {
+ ZSTD_defaultDisallowed = 0,
+ ZSTD_defaultAllowed = 1
+} ZSTD_defaultPolicy_e;
+
+symbolEncodingType_e
+ZSTD_selectEncodingType(
+ FSE_repeat* repeatMode, unsigned const* count, unsigned const max,
+ size_t const mostFrequent, size_t nbSeq, unsigned const FSELog,
+ FSE_CTable const* prevCTable,
+ short const* defaultNorm, U32 defaultNormLog,
+ ZSTD_defaultPolicy_e const isDefaultAllowed,
+ ZSTD_strategy const strategy);
+
+size_t
+ZSTD_buildCTable(void* dst, size_t dstCapacity,
+ FSE_CTable* nextCTable, U32 FSELog, symbolEncodingType_e type,
+ unsigned* count, U32 max,
+ const BYTE* codeTable, size_t nbSeq,
+ const S16* defaultNorm, U32 defaultNormLog, U32 defaultMax,
+ const FSE_CTable* prevCTable, size_t prevCTableSize,
+ void* entropyWorkspace, size_t entropyWorkspaceSize);
+
+size_t ZSTD_encodeSequences(
+ void* dst, size_t dstCapacity,
+ FSE_CTable const* CTable_MatchLength, BYTE const* mlCodeTable,
+ FSE_CTable const* CTable_OffsetBits, BYTE const* ofCodeTable,
+ FSE_CTable const* CTable_LitLength, BYTE const* llCodeTable,
+ seqDef const* sequences, size_t nbSeq, int longOffsets, int bmi2);
+
+size_t ZSTD_fseBitCost(
+ FSE_CTable const* ctable,
+ unsigned const* count,
+ unsigned const max);
+
+size_t ZSTD_crossEntropyCost(short const* norm, unsigned accuracyLog,
+ unsigned const* count, unsigned const max);
+#endif /* ZSTD_COMPRESS_SEQUENCES_H */
+/**** ended inlining zstd_compress_sequences.h ****/
+
+/**
+ * -log2(x / 256) lookup table for x in [0, 256).
+ * If x == 0: Return 0
+ * Else: Return floor(-log2(x / 256) * 256)
+ */
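+/* Worked examples (added note, not upstream): x = 128 -> floor(-log2(0.5) * 256) = 256,
+ * and x = 64 -> floor(-log2(0.25) * 256) = 512, matching entries [128] and [64] below. */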
+static unsigned const kInverseProbabilityLog256[256] = {
+ 0, 2048, 1792, 1642, 1536, 1453, 1386, 1329, 1280, 1236, 1197, 1162,
+ 1130, 1100, 1073, 1047, 1024, 1001, 980, 960, 941, 923, 906, 889,
+ 874, 859, 844, 830, 817, 804, 791, 779, 768, 756, 745, 734,
+ 724, 714, 704, 694, 685, 676, 667, 658, 650, 642, 633, 626,
+ 618, 610, 603, 595, 588, 581, 574, 567, 561, 554, 548, 542,
+ 535, 529, 523, 517, 512, 506, 500, 495, 489, 484, 478, 473,
+ 468, 463, 458, 453, 448, 443, 438, 434, 429, 424, 420, 415,
+ 411, 407, 402, 398, 394, 390, 386, 382, 377, 373, 370, 366,
+ 362, 358, 354, 350, 347, 343, 339, 336, 332, 329, 325, 322,
+ 318, 315, 311, 308, 305, 302, 298, 295, 292, 289, 286, 282,
+ 279, 276, 273, 270, 267, 264, 261, 258, 256, 253, 250, 247,
+ 244, 241, 239, 236, 233, 230, 228, 225, 222, 220, 217, 215,
+ 212, 209, 207, 204, 202, 199, 197, 194, 192, 190, 187, 185,
+ 182, 180, 178, 175, 173, 171, 168, 166, 164, 162, 159, 157,
+ 155, 153, 151, 149, 146, 144, 142, 140, 138, 136, 134, 132,
+ 130, 128, 126, 123, 121, 119, 117, 115, 114, 112, 110, 108,
+ 106, 104, 102, 100, 98, 96, 94, 93, 91, 89, 87, 85,
+ 83, 82, 80, 78, 76, 74, 73, 71, 69, 67, 66, 64,
+ 62, 61, 59, 57, 55, 54, 52, 50, 49, 47, 46, 44,
+ 42, 41, 39, 37, 36, 34, 33, 31, 30, 28, 26, 25,
+ 23, 22, 20, 19, 17, 16, 14, 13, 11, 10, 8, 7,
+ 5, 4, 2, 1,
+};
+
+static unsigned ZSTD_getFSEMaxSymbolValue(FSE_CTable const* ctable) {
+ void const* ptr = ctable;
+ U16 const* u16ptr = (U16 const*)ptr;
+ U32 const maxSymbolValue = MEM_read16(u16ptr + 1);
+ return maxSymbolValue;
+}
+
+/**
+ * Returns the cost in bytes of encoding the normalized count header.
+ * Returns an error if any of the helper functions return an error.
+ */
+static size_t ZSTD_NCountCost(unsigned const* count, unsigned const max,
+ size_t const nbSeq, unsigned const FSELog)
+{
+ BYTE wksp[FSE_NCOUNTBOUND];
+ S16 norm[MaxSeq + 1];
+ const U32 tableLog = FSE_optimalTableLog(FSELog, nbSeq, max);
+ FORWARD_IF_ERROR(FSE_normalizeCount(norm, tableLog, count, nbSeq, max), "");
+ return FSE_writeNCount(wksp, sizeof(wksp), norm, max, tableLog);
+}
+
+/**
+ * Returns the cost in bits of encoding the distribution described by count
+ * using the entropy bound.
+ */
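+/* Added note (not upstream): this implements the entropy bound
+ * sum_s count[s] * -log2(count[s]/total), using the 8-bit fixed-point table above
+ * (norm is count[s] rescaled to /256, clamped to at least 1), hence the final >> 8. */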
+static size_t ZSTD_entropyCost(unsigned const* count, unsigned const max, size_t const total)
+{
+ unsigned cost = 0;
+ unsigned s;
+ for (s = 0; s <= max; ++s) {
+ unsigned norm = (unsigned)((256 * count[s]) / total);
+ if (count[s] != 0 && norm == 0)
+ norm = 1;
+ assert(count[s] < total);
+ cost += count[s] * kInverseProbabilityLog256[norm];
+ }
+ return cost >> 8;
+}
+
+/**
+ * Returns the cost in bits of encoding the distribution in count using ctable.
+ * Returns an error if ctable cannot represent all the symbols in count.
+ */
+size_t ZSTD_fseBitCost(
+ FSE_CTable const* ctable,
+ unsigned const* count,
+ unsigned const max)
+{
+ unsigned const kAccuracyLog = 8;
+ size_t cost = 0;
+ unsigned s;
+ FSE_CState_t cstate;
+ FSE_initCState(&cstate, ctable);
+ if (ZSTD_getFSEMaxSymbolValue(ctable) < max) {
+ DEBUGLOG(5, "Repeat FSE_CTable has maxSymbolValue %u < %u",
+ ZSTD_getFSEMaxSymbolValue(ctable), max);
+ return ERROR(GENERIC);
+ }
+ for (s = 0; s <= max; ++s) {
+ unsigned const tableLog = cstate.stateLog;
+ unsigned const badCost = (tableLog + 1) << kAccuracyLog;
+ unsigned const bitCost = FSE_bitCost(cstate.symbolTT, tableLog, s, kAccuracyLog);
+ if (count[s] == 0)
+ continue;
+ if (bitCost >= badCost) {
+ DEBUGLOG(5, "Repeat FSE_CTable has Prob[%u] == 0", s);
+ return ERROR(GENERIC);
+ }
+ cost += (size_t)count[s] * bitCost;
+ }
+ return cost >> kAccuracyLog;
+}
+
+/**
+ * Returns the cost in bits of encoding the distribution in count using the
+ * table described by norm. The max symbol supported by norm is assumed >= max.
+ * norm must be valid for every symbol with non-zero probability in count.
+ */
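+/* Added note (not upstream): this computes sum_s count[s] * -log2(norm[s] / 2^accuracyLog),
+ * treating the "less than 1" probability marker norm[s] == -1 as a probability of
+ * 1 / 2^accuracyLog, again in 1/256-bit fixed point (hence the final >> 8). */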
+size_t ZSTD_crossEntropyCost(short const* norm, unsigned accuracyLog,
+ unsigned const* count, unsigned const max)
+{
+ unsigned const shift = 8 - accuracyLog;
+ size_t cost = 0;
+ unsigned s;
+ assert(accuracyLog <= 8);
+ for (s = 0; s <= max; ++s) {
+ unsigned const normAcc = (norm[s] != -1) ? (unsigned)norm[s] : 1;
+ unsigned const norm256 = normAcc << shift;
+ assert(norm256 > 0);
+ assert(norm256 < 256);
+ cost += count[s] * kInverseProbabilityLog256[norm256];
+ }
+ return cost >> 8;
+}
+
+symbolEncodingType_e
+ZSTD_selectEncodingType(
+ FSE_repeat* repeatMode, unsigned const* count, unsigned const max,
+ size_t const mostFrequent, size_t nbSeq, unsigned const FSELog,
+ FSE_CTable const* prevCTable,
+ short const* defaultNorm, U32 defaultNormLog,
+ ZSTD_defaultPolicy_e const isDefaultAllowed,
+ ZSTD_strategy const strategy)
+{
+ ZSTD_STATIC_ASSERT(ZSTD_defaultDisallowed == 0 && ZSTD_defaultAllowed != 0);
+ if (mostFrequent == nbSeq) {
+ *repeatMode = FSE_repeat_none;
+ if (isDefaultAllowed && nbSeq <= 2) {
+            /* Prefer set_basic over set_rle when there are 2 or fewer symbols,
+ * since RLE uses 1 byte, but set_basic uses 5-6 bits per symbol.
+ * If basic encoding isn't possible, always choose RLE.
+ */
+ DEBUGLOG(5, "Selected set_basic");
+ return set_basic;
+ }
+ DEBUGLOG(5, "Selected set_rle");
+ return set_rle;
+ }
+ if (strategy < ZSTD_lazy) {
+ if (isDefaultAllowed) {
+ size_t const staticFse_nbSeq_max = 1000;
+ size_t const mult = 10 - strategy;
+ size_t const baseLog = 3;
+ size_t const dynamicFse_nbSeq_min = (((size_t)1 << defaultNormLog) * mult) >> baseLog; /* 28-36 for offset, 56-72 for lengths */
+ assert(defaultNormLog >= 5 && defaultNormLog <= 6); /* xx_DEFAULTNORMLOG */
+ assert(mult <= 9 && mult >= 7);
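+            /* Added sanity note (not upstream): with defaultNormLog=5 and mult=7..9,
+             * (1<<5)*mult >> 3 = 28..36; with defaultNormLog=6, it is 56..72,
+             * matching the ranges quoted above. */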
+ if ( (*repeatMode == FSE_repeat_valid)
+ && (nbSeq < staticFse_nbSeq_max) ) {
+ DEBUGLOG(5, "Selected set_repeat");
+ return set_repeat;
+ }
+ if ( (nbSeq < dynamicFse_nbSeq_min)
+ || (mostFrequent < (nbSeq >> (defaultNormLog-1))) ) {
+ DEBUGLOG(5, "Selected set_basic");
+ /* The format allows default tables to be repeated, but it isn't useful.
+ * When using simple heuristics to select encoding type, we don't want
+ * to confuse these tables with dictionaries. When running more careful
+ * analysis, we don't need to waste time checking both repeating tables
+ * and default tables.
+ */
+ *repeatMode = FSE_repeat_none;
+ return set_basic;
+ }
+ }
+ } else {
+ size_t const basicCost = isDefaultAllowed ? ZSTD_crossEntropyCost(defaultNorm, defaultNormLog, count, max) : ERROR(GENERIC);
+ size_t const repeatCost = *repeatMode != FSE_repeat_none ? ZSTD_fseBitCost(prevCTable, count, max) : ERROR(GENERIC);
+ size_t const NCountCost = ZSTD_NCountCost(count, max, nbSeq, FSELog);
+ size_t const compressedCost = (NCountCost << 3) + ZSTD_entropyCost(count, max, nbSeq);
+
+ if (isDefaultAllowed) {
+ assert(!ZSTD_isError(basicCost));
+ assert(!(*repeatMode == FSE_repeat_valid && ZSTD_isError(repeatCost)));
+ }
+ assert(!ZSTD_isError(NCountCost));
+ assert(compressedCost < ERROR(maxCode));
+ DEBUGLOG(5, "Estimated bit costs: basic=%u\trepeat=%u\tcompressed=%u",
+ (unsigned)basicCost, (unsigned)repeatCost, (unsigned)compressedCost);
+ if (basicCost <= repeatCost && basicCost <= compressedCost) {
+ DEBUGLOG(5, "Selected set_basic");
+ assert(isDefaultAllowed);
+ *repeatMode = FSE_repeat_none;
+ return set_basic;
+ }
+ if (repeatCost <= compressedCost) {
+ DEBUGLOG(5, "Selected set_repeat");
+ assert(!ZSTD_isError(repeatCost));
+ return set_repeat;
+ }
+ assert(compressedCost < basicCost && compressedCost < repeatCost);
+ }
+ DEBUGLOG(5, "Selected set_compressed");
+ *repeatMode = FSE_repeat_check;
+ return set_compressed;
+}
+
+size_t
+ZSTD_buildCTable(void* dst, size_t dstCapacity,
+ FSE_CTable* nextCTable, U32 FSELog, symbolEncodingType_e type,
+ unsigned* count, U32 max,
+ const BYTE* codeTable, size_t nbSeq,
+ const S16* defaultNorm, U32 defaultNormLog, U32 defaultMax,
+ const FSE_CTable* prevCTable, size_t prevCTableSize,
+ void* entropyWorkspace, size_t entropyWorkspaceSize)
+{
+ BYTE* op = (BYTE*)dst;
+ const BYTE* const oend = op + dstCapacity;
+ DEBUGLOG(6, "ZSTD_buildCTable (dstCapacity=%u)", (unsigned)dstCapacity);
+
+ switch (type) {
+ case set_rle:
+ FORWARD_IF_ERROR(FSE_buildCTable_rle(nextCTable, (BYTE)max), "");
+ RETURN_ERROR_IF(dstCapacity==0, dstSize_tooSmall, "not enough space");
+ *op = codeTable[0];
+ return 1;
+ case set_repeat:
+ memcpy(nextCTable, prevCTable, prevCTableSize);
+ return 0;
+ case set_basic:
+ FORWARD_IF_ERROR(FSE_buildCTable_wksp(nextCTable, defaultNorm, defaultMax, defaultNormLog, entropyWorkspace, entropyWorkspaceSize), ""); /* note : could be pre-calculated */
+ return 0;
+ case set_compressed: {
+ S16 norm[MaxSeq + 1];
+ size_t nbSeq_1 = nbSeq;
+ const U32 tableLog = FSE_optimalTableLog(FSELog, nbSeq, max);
+ if (count[codeTable[nbSeq-1]] > 1) {
+ count[codeTable[nbSeq-1]]--;
+ nbSeq_1--;
+ }
+ assert(nbSeq_1 > 1);
+ FORWARD_IF_ERROR(FSE_normalizeCount(norm, tableLog, count, nbSeq_1, max), "");
+ { size_t const NCountSize = FSE_writeNCount(op, oend - op, norm, max, tableLog); /* overflow protected */
+ FORWARD_IF_ERROR(NCountSize, "FSE_writeNCount failed");
+ FORWARD_IF_ERROR(FSE_buildCTable_wksp(nextCTable, norm, max, tableLog, entropyWorkspace, entropyWorkspaceSize), "");
+ return NCountSize;
+ }
+ }
+ default: assert(0); RETURN_ERROR(GENERIC, "impossible to reach");
+ }
+}
+
+FORCE_INLINE_TEMPLATE size_t
+ZSTD_encodeSequences_body(
+ void* dst, size_t dstCapacity,
+ FSE_CTable const* CTable_MatchLength, BYTE const* mlCodeTable,
+ FSE_CTable const* CTable_OffsetBits, BYTE const* ofCodeTable,
+ FSE_CTable const* CTable_LitLength, BYTE const* llCodeTable,
+ seqDef const* sequences, size_t nbSeq, int longOffsets)
+{
+ BIT_CStream_t blockStream;
+ FSE_CState_t stateMatchLength;
+ FSE_CState_t stateOffsetBits;
+ FSE_CState_t stateLitLength;
+
+ RETURN_ERROR_IF(
+ ERR_isError(BIT_initCStream(&blockStream, dst, dstCapacity)),
+ dstSize_tooSmall, "not enough space remaining");
+ DEBUGLOG(6, "available space for bitstream : %i (dstCapacity=%u)",
+ (int)(blockStream.endPtr - blockStream.startPtr),
+ (unsigned)dstCapacity);
+
+ /* first symbols */
+ FSE_initCState2(&stateMatchLength, CTable_MatchLength, mlCodeTable[nbSeq-1]);
+ FSE_initCState2(&stateOffsetBits, CTable_OffsetBits, ofCodeTable[nbSeq-1]);
+ FSE_initCState2(&stateLitLength, CTable_LitLength, llCodeTable[nbSeq-1]);
+ BIT_addBits(&blockStream, sequences[nbSeq-1].litLength, LL_bits[llCodeTable[nbSeq-1]]);
+ if (MEM_32bits()) BIT_flushBits(&blockStream);
+ BIT_addBits(&blockStream, sequences[nbSeq-1].matchLength, ML_bits[mlCodeTable[nbSeq-1]]);
+ if (MEM_32bits()) BIT_flushBits(&blockStream);
+ if (longOffsets) {
+ U32 const ofBits = ofCodeTable[nbSeq-1];
+ unsigned const extraBits = ofBits - MIN(ofBits, STREAM_ACCUMULATOR_MIN-1);
+ if (extraBits) {
+ BIT_addBits(&blockStream, sequences[nbSeq-1].offset, extraBits);
+ BIT_flushBits(&blockStream);
+ }
+ BIT_addBits(&blockStream, sequences[nbSeq-1].offset >> extraBits,
+ ofBits - extraBits);
+ } else {
+ BIT_addBits(&blockStream, sequences[nbSeq-1].offset, ofCodeTable[nbSeq-1]);
+ }
+ BIT_flushBits(&blockStream);
+
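+    /* Added note (not upstream): the numeric trailing comments in the loop below
+     * (7, 15, 24, 33, 31, ...) track the worst-case number of bits accumulated in the
+     * bit container between flushes, for 32-bit and 64-bit targets respectively.
+     * The test against 64-7-(LLFSELog+MLFSELog+OffFSELog) flushes early enough that the
+     * 64-bit accumulator, which may carry up to 7 bits across a flush, never overflows. */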
+ { size_t n;
+ for (n=nbSeq-2 ; n<nbSeq ; n--) { /* intentional underflow */
+ BYTE const llCode = llCodeTable[n];
+ BYTE const ofCode = ofCodeTable[n];
+ BYTE const mlCode = mlCodeTable[n];
+ U32 const llBits = LL_bits[llCode];
+ U32 const ofBits = ofCode;
+ U32 const mlBits = ML_bits[mlCode];
+ DEBUGLOG(6, "encoding: litlen:%2u - matchlen:%2u - offCode:%7u",
+ (unsigned)sequences[n].litLength,
+ (unsigned)sequences[n].matchLength + MINMATCH,
+ (unsigned)sequences[n].offset);
+ /* 32b*/ /* 64b*/
+ /* (7)*/ /* (7)*/
+ FSE_encodeSymbol(&blockStream, &stateOffsetBits, ofCode); /* 15 */ /* 15 */
+ FSE_encodeSymbol(&blockStream, &stateMatchLength, mlCode); /* 24 */ /* 24 */
+ if (MEM_32bits()) BIT_flushBits(&blockStream); /* (7)*/
+ FSE_encodeSymbol(&blockStream, &stateLitLength, llCode); /* 16 */ /* 33 */
+ if (MEM_32bits() || (ofBits+mlBits+llBits >= 64-7-(LLFSELog+MLFSELog+OffFSELog)))
+ BIT_flushBits(&blockStream); /* (7)*/
+ BIT_addBits(&blockStream, sequences[n].litLength, llBits);
+ if (MEM_32bits() && ((llBits+mlBits)>24)) BIT_flushBits(&blockStream);
+ BIT_addBits(&blockStream, sequences[n].matchLength, mlBits);
+ if (MEM_32bits() || (ofBits+mlBits+llBits > 56)) BIT_flushBits(&blockStream);
+ if (longOffsets) {
+ unsigned const extraBits = ofBits - MIN(ofBits, STREAM_ACCUMULATOR_MIN-1);
+ if (extraBits) {
+ BIT_addBits(&blockStream, sequences[n].offset, extraBits);
+ BIT_flushBits(&blockStream); /* (7)*/
+ }
+ BIT_addBits(&blockStream, sequences[n].offset >> extraBits,
+ ofBits - extraBits); /* 31 */
+ } else {
+ BIT_addBits(&blockStream, sequences[n].offset, ofBits); /* 31 */
+ }
+ BIT_flushBits(&blockStream); /* (7)*/
+ DEBUGLOG(7, "remaining space : %i", (int)(blockStream.endPtr - blockStream.ptr));
+ } }
+
+ DEBUGLOG(6, "ZSTD_encodeSequences: flushing ML state with %u bits", stateMatchLength.stateLog);
+ FSE_flushCState(&blockStream, &stateMatchLength);
+ DEBUGLOG(6, "ZSTD_encodeSequences: flushing Off state with %u bits", stateOffsetBits.stateLog);
+ FSE_flushCState(&blockStream, &stateOffsetBits);
+ DEBUGLOG(6, "ZSTD_encodeSequences: flushing LL state with %u bits", stateLitLength.stateLog);
+ FSE_flushCState(&blockStream, &stateLitLength);
+
+ { size_t const streamSize = BIT_closeCStream(&blockStream);
+ RETURN_ERROR_IF(streamSize==0, dstSize_tooSmall, "not enough space");
+ return streamSize;
+ }
+}
+
+static size_t
+ZSTD_encodeSequences_default(
+ void* dst, size_t dstCapacity,
+ FSE_CTable const* CTable_MatchLength, BYTE const* mlCodeTable,
+ FSE_CTable const* CTable_OffsetBits, BYTE const* ofCodeTable,
+ FSE_CTable const* CTable_LitLength, BYTE const* llCodeTable,
+ seqDef const* sequences, size_t nbSeq, int longOffsets)
+{
+ return ZSTD_encodeSequences_body(dst, dstCapacity,
+ CTable_MatchLength, mlCodeTable,
+ CTable_OffsetBits, ofCodeTable,
+ CTable_LitLength, llCodeTable,
+ sequences, nbSeq, longOffsets);
+}
+
+
+#if DYNAMIC_BMI2
+
+static TARGET_ATTRIBUTE("bmi2") size_t
+ZSTD_encodeSequences_bmi2(
+ void* dst, size_t dstCapacity,
+ FSE_CTable const* CTable_MatchLength, BYTE const* mlCodeTable,
+ FSE_CTable const* CTable_OffsetBits, BYTE const* ofCodeTable,
+ FSE_CTable const* CTable_LitLength, BYTE const* llCodeTable,
+ seqDef const* sequences, size_t nbSeq, int longOffsets)
+{
+ return ZSTD_encodeSequences_body(dst, dstCapacity,
+ CTable_MatchLength, mlCodeTable,
+ CTable_OffsetBits, ofCodeTable,
+ CTable_LitLength, llCodeTable,
+ sequences, nbSeq, longOffsets);
+}
+
+#endif
+
+size_t ZSTD_encodeSequences(
+ void* dst, size_t dstCapacity,
+ FSE_CTable const* CTable_MatchLength, BYTE const* mlCodeTable,
+ FSE_CTable const* CTable_OffsetBits, BYTE const* ofCodeTable,
+ FSE_CTable const* CTable_LitLength, BYTE const* llCodeTable,
+ seqDef const* sequences, size_t nbSeq, int longOffsets, int bmi2)
+{
+ DEBUGLOG(5, "ZSTD_encodeSequences: dstCapacity = %u", (unsigned)dstCapacity);
+#if DYNAMIC_BMI2
+ if (bmi2) {
+ return ZSTD_encodeSequences_bmi2(dst, dstCapacity,
+ CTable_MatchLength, mlCodeTable,
+ CTable_OffsetBits, ofCodeTable,
+ CTable_LitLength, llCodeTable,
+ sequences, nbSeq, longOffsets);
+ }
+#endif
+ (void)bmi2;
+ return ZSTD_encodeSequences_default(dst, dstCapacity,
+ CTable_MatchLength, mlCodeTable,
+ CTable_OffsetBits, ofCodeTable,
+ CTable_LitLength, llCodeTable,
+ sequences, nbSeq, longOffsets);
+}
+/**** ended inlining compress/zstd_compress_sequences.c ****/
+/**** start inlining compress/zstd_compress_superblock.c ****/
+/*
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+ /*-*************************************
+ * Dependencies
+ ***************************************/
+/**** start inlining zstd_compress_superblock.h ****/
+/*
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTD_COMPRESS_ADVANCED_H
+#define ZSTD_COMPRESS_ADVANCED_H
+
+/*-*************************************
+* Dependencies
+***************************************/
+
+/**** skipping file: ../zstd.h ****/
+
+/*-*************************************
+* Target Compressed Block Size
+***************************************/
+
+/* ZSTD_compressSuperBlock() :
+ * Used to compress a super block when targetCBlockSize is being used.
+ * The given block will be compressed into multiple sub blocks that are around targetCBlockSize. */
+size_t ZSTD_compressSuperBlock(ZSTD_CCtx* zc,
+ void* dst, size_t dstCapacity,
+ void const* src, size_t srcSize,
+ unsigned lastBlock);
+
+#endif /* ZSTD_COMPRESS_ADVANCED_H */
+/**** ended inlining zstd_compress_superblock.h ****/
+
+/**** skipping file: ../common/zstd_internal.h ****/
+/**** skipping file: hist.h ****/
+/**** skipping file: zstd_compress_internal.h ****/
+/**** skipping file: zstd_compress_sequences.h ****/
+/**** skipping file: zstd_compress_literals.h ****/
+
+/*-*************************************
+* Superblock entropy buffer structs
+***************************************/
+/** ZSTD_hufCTablesMetadata_t :
+ * Stores Literals Block Type for a super-block in hType, and
+ * the huffman tree description in hufDesBuffer.
+ * hufDesSize refers to the size of the huffman tree description in bytes.
+ * This metadata is populated in ZSTD_buildSuperBlockEntropy_literal() */
+typedef struct {
+ symbolEncodingType_e hType;
+ BYTE hufDesBuffer[500]; /* TODO give name to this value */
+ size_t hufDesSize;
+} ZSTD_hufCTablesMetadata_t;
+
+/** ZSTD_fseCTablesMetadata_t :
+ * Stores symbol compression modes for a super-block in {ll, ol, ml}Type, and
+ * fse tables in fseTablesBuffer.
+ * fseTablesSize refers to the size of fse tables in bytes.
+ * This metadata is populated in ZSTD_buildSuperBlockEntropy_sequences() */
+typedef struct {
+ symbolEncodingType_e llType;
+ symbolEncodingType_e ofType;
+ symbolEncodingType_e mlType;
+ BYTE fseTablesBuffer[500]; /* TODO give name to this value */
+ size_t fseTablesSize;
+ size_t lastCountSize; /* This is to account for bug in 1.3.4. More detail in ZSTD_compressSubBlock_sequences() */
+} ZSTD_fseCTablesMetadata_t;
+
+typedef struct {
+ ZSTD_hufCTablesMetadata_t hufMetadata;
+ ZSTD_fseCTablesMetadata_t fseMetadata;
+} ZSTD_entropyCTablesMetadata_t;
+
+
+/** ZSTD_buildSuperBlockEntropy_literal() :
+ * Builds entropy for the super-block literals.
+ * Stores literals block type (raw, rle, compressed, repeat) and
+ * huffman description table to hufMetadata.
+ * @return : size of huffman description table or error code */
+static size_t ZSTD_buildSuperBlockEntropy_literal(void* const src, size_t srcSize,
+ const ZSTD_hufCTables_t* prevHuf,
+ ZSTD_hufCTables_t* nextHuf,
+ ZSTD_hufCTablesMetadata_t* hufMetadata,
+ const int disableLiteralsCompression,
+ void* workspace, size_t wkspSize)
+{
+ BYTE* const wkspStart = (BYTE*)workspace;
+ BYTE* const wkspEnd = wkspStart + wkspSize;
+ BYTE* const countWkspStart = wkspStart;
+ unsigned* const countWksp = (unsigned*)workspace;
+ const size_t countWkspSize = (HUF_SYMBOLVALUE_MAX + 1) * sizeof(unsigned);
+ BYTE* const nodeWksp = countWkspStart + countWkspSize;
+ const size_t nodeWkspSize = wkspEnd-nodeWksp;
+ unsigned maxSymbolValue = 255;
+ unsigned huffLog = HUF_TABLELOG_DEFAULT;
+ HUF_repeat repeat = prevHuf->repeatMode;
+
+ DEBUGLOG(5, "ZSTD_buildSuperBlockEntropy_literal (srcSize=%zu)", srcSize);
+
+ /* Prepare nextEntropy assuming reusing the existing table */
+ memcpy(nextHuf, prevHuf, sizeof(*prevHuf));
+
+ if (disableLiteralsCompression) {
+ DEBUGLOG(5, "set_basic - disabled");
+ hufMetadata->hType = set_basic;
+ return 0;
+ }
+
+ /* small ? don't even attempt compression (speed opt) */
+# define COMPRESS_LITERALS_SIZE_MIN 63
+ { size_t const minLitSize = (prevHuf->repeatMode == HUF_repeat_valid) ? 6 : COMPRESS_LITERALS_SIZE_MIN;
+ if (srcSize <= minLitSize) {
+ DEBUGLOG(5, "set_basic - too small");
+ hufMetadata->hType = set_basic;
+ return 0;
+ }
+ }
+
+ /* Scan input and build symbol stats */
+ { size_t const largest = HIST_count_wksp (countWksp, &maxSymbolValue, (const BYTE*)src, srcSize, workspace, wkspSize);
+ FORWARD_IF_ERROR(largest, "HIST_count_wksp failed");
+ if (largest == srcSize) {
+ DEBUGLOG(5, "set_rle");
+ hufMetadata->hType = set_rle;
+ return 0;
+ }
+ if (largest <= (srcSize >> 7)+4) {
+ DEBUGLOG(5, "set_basic - no gain");
+ hufMetadata->hType = set_basic;
+ return 0;
+ }
+ }
+
+ /* Validate the previous Huffman table */
+ if (repeat == HUF_repeat_check && !HUF_validateCTable((HUF_CElt const*)prevHuf->CTable, countWksp, maxSymbolValue)) {
+ repeat = HUF_repeat_none;
+ }
+
+ /* Build Huffman Tree */
+ memset(nextHuf->CTable, 0, sizeof(nextHuf->CTable));
+ huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue);
+ { size_t const maxBits = HUF_buildCTable_wksp((HUF_CElt*)nextHuf->CTable, countWksp,
+ maxSymbolValue, huffLog,
+ nodeWksp, nodeWkspSize);
+ FORWARD_IF_ERROR(maxBits, "HUF_buildCTable_wksp");
+ huffLog = (U32)maxBits;
+ { /* Build and write the CTable */
+ size_t const newCSize = HUF_estimateCompressedSize(
+ (HUF_CElt*)nextHuf->CTable, countWksp, maxSymbolValue);
+ size_t const hSize = HUF_writeCTable(
+ hufMetadata->hufDesBuffer, sizeof(hufMetadata->hufDesBuffer),
+ (HUF_CElt*)nextHuf->CTable, maxSymbolValue, huffLog);
+ /* Check against repeating the previous CTable */
+ if (repeat != HUF_repeat_none) {
+ size_t const oldCSize = HUF_estimateCompressedSize(
+ (HUF_CElt const*)prevHuf->CTable, countWksp, maxSymbolValue);
+ if (oldCSize < srcSize && (oldCSize <= hSize + newCSize || hSize + 12 >= srcSize)) {
+ DEBUGLOG(5, "set_repeat - smaller");
+ memcpy(nextHuf, prevHuf, sizeof(*prevHuf));
+ hufMetadata->hType = set_repeat;
+ return 0;
+ }
+ }
+ if (newCSize + hSize >= srcSize) {
+ DEBUGLOG(5, "set_basic - no gains");
+ memcpy(nextHuf, prevHuf, sizeof(*prevHuf));
+ hufMetadata->hType = set_basic;
+ return 0;
+ }
+ DEBUGLOG(5, "set_compressed (hSize=%u)", (U32)hSize);
+ hufMetadata->hType = set_compressed;
+ nextHuf->repeatMode = HUF_repeat_check;
+ return hSize;
+ }
+ }
+}
+
+/** ZSTD_buildSuperBlockEntropy_sequences() :
+ * Builds entropy for the super-block sequences.
+ * Stores symbol compression modes and fse table to fseMetadata.
+ * @return : size of fse tables or error code */
+static size_t ZSTD_buildSuperBlockEntropy_sequences(seqStore_t* seqStorePtr,
+ const ZSTD_fseCTables_t* prevEntropy,
+ ZSTD_fseCTables_t* nextEntropy,
+ const ZSTD_CCtx_params* cctxParams,
+ ZSTD_fseCTablesMetadata_t* fseMetadata,
+ void* workspace, size_t wkspSize)
+{
+ BYTE* const wkspStart = (BYTE*)workspace;
+ BYTE* const wkspEnd = wkspStart + wkspSize;
+ BYTE* const countWkspStart = wkspStart;
+ unsigned* const countWksp = (unsigned*)workspace;
+ const size_t countWkspSize = (MaxSeq + 1) * sizeof(unsigned);
+ BYTE* const cTableWksp = countWkspStart + countWkspSize;
+ const size_t cTableWkspSize = wkspEnd-cTableWksp;
+ ZSTD_strategy const strategy = cctxParams->cParams.strategy;
+ FSE_CTable* CTable_LitLength = nextEntropy->litlengthCTable;
+ FSE_CTable* CTable_OffsetBits = nextEntropy->offcodeCTable;
+ FSE_CTable* CTable_MatchLength = nextEntropy->matchlengthCTable;
+ const BYTE* const ofCodeTable = seqStorePtr->ofCode;
+ const BYTE* const llCodeTable = seqStorePtr->llCode;
+ const BYTE* const mlCodeTable = seqStorePtr->mlCode;
+ size_t const nbSeq = seqStorePtr->sequences - seqStorePtr->sequencesStart;
+ BYTE* const ostart = fseMetadata->fseTablesBuffer;
+ BYTE* const oend = ostart + sizeof(fseMetadata->fseTablesBuffer);
+ BYTE* op = ostart;
+
+ assert(cTableWkspSize >= (1 << MaxFSELog) * sizeof(FSE_FUNCTION_TYPE));
+ DEBUGLOG(5, "ZSTD_buildSuperBlockEntropy_sequences (nbSeq=%zu)", nbSeq);
+ memset(workspace, 0, wkspSize);
+
+ fseMetadata->lastCountSize = 0;
+ /* convert length/distances into codes */
+ ZSTD_seqToCodes(seqStorePtr);
+ /* build CTable for Literal Lengths */
+ { U32 LLtype;
+ unsigned max = MaxLL;
+ size_t const mostFrequent = HIST_countFast_wksp(countWksp, &max, llCodeTable, nbSeq, workspace, wkspSize); /* can't fail */
+ DEBUGLOG(5, "Building LL table");
+ nextEntropy->litlength_repeatMode = prevEntropy->litlength_repeatMode;
+ LLtype = ZSTD_selectEncodingType(&nextEntropy->litlength_repeatMode,
+ countWksp, max, mostFrequent, nbSeq,
+ LLFSELog, prevEntropy->litlengthCTable,
+ LL_defaultNorm, LL_defaultNormLog,
+ ZSTD_defaultAllowed, strategy);
+ assert(set_basic < set_compressed && set_rle < set_compressed);
+ assert(!(LLtype < set_compressed && nextEntropy->litlength_repeatMode != FSE_repeat_none)); /* We don't copy tables */
+ { size_t const countSize = ZSTD_buildCTable(op, oend - op, CTable_LitLength, LLFSELog, (symbolEncodingType_e)LLtype,
+ countWksp, max, llCodeTable, nbSeq, LL_defaultNorm, LL_defaultNormLog, MaxLL,
+ prevEntropy->litlengthCTable, sizeof(prevEntropy->litlengthCTable),
+ cTableWksp, cTableWkspSize);
+ FORWARD_IF_ERROR(countSize, "ZSTD_buildCTable for LitLens failed");
+ if (LLtype == set_compressed)
+ fseMetadata->lastCountSize = countSize;
+ op += countSize;
+ fseMetadata->llType = (symbolEncodingType_e) LLtype;
+ } }
+ /* build CTable for Offsets */
+ { U32 Offtype;
+ unsigned max = MaxOff;
+ size_t const mostFrequent = HIST_countFast_wksp(countWksp, &max, ofCodeTable, nbSeq, workspace, wkspSize); /* can't fail */
+ /* We can only use the basic table if max <= DefaultMaxOff, otherwise the offsets are too large */
+ ZSTD_defaultPolicy_e const defaultPolicy = (max <= DefaultMaxOff) ? ZSTD_defaultAllowed : ZSTD_defaultDisallowed;
+ DEBUGLOG(5, "Building OF table");
+ nextEntropy->offcode_repeatMode = prevEntropy->offcode_repeatMode;
+ Offtype = ZSTD_selectEncodingType(&nextEntropy->offcode_repeatMode,
+ countWksp, max, mostFrequent, nbSeq,
+ OffFSELog, prevEntropy->offcodeCTable,
+ OF_defaultNorm, OF_defaultNormLog,
+ defaultPolicy, strategy);
+ assert(!(Offtype < set_compressed && nextEntropy->offcode_repeatMode != FSE_repeat_none)); /* We don't copy tables */
+ { size_t const countSize = ZSTD_buildCTable(op, oend - op, CTable_OffsetBits, OffFSELog, (symbolEncodingType_e)Offtype,
+ countWksp, max, ofCodeTable, nbSeq, OF_defaultNorm, OF_defaultNormLog, DefaultMaxOff,
+ prevEntropy->offcodeCTable, sizeof(prevEntropy->offcodeCTable),
+ cTableWksp, cTableWkspSize);
+ FORWARD_IF_ERROR(countSize, "ZSTD_buildCTable for Offsets failed");
+ if (Offtype == set_compressed)
+ fseMetadata->lastCountSize = countSize;
+ op += countSize;
+ fseMetadata->ofType = (symbolEncodingType_e) Offtype;
+ } }
+ /* build CTable for MatchLengths */
+ { U32 MLtype;
+ unsigned max = MaxML;
+ size_t const mostFrequent = HIST_countFast_wksp(countWksp, &max, mlCodeTable, nbSeq, workspace, wkspSize); /* can't fail */
+ DEBUGLOG(5, "Building ML table (remaining space : %i)", (int)(oend-op));
+ nextEntropy->matchlength_repeatMode = prevEntropy->matchlength_repeatMode;
+ MLtype = ZSTD_selectEncodingType(&nextEntropy->matchlength_repeatMode,
+ countWksp, max, mostFrequent, nbSeq,
+ MLFSELog, prevEntropy->matchlengthCTable,
+ ML_defaultNorm, ML_defaultNormLog,
+ ZSTD_defaultAllowed, strategy);
+ assert(!(MLtype < set_compressed && nextEntropy->matchlength_repeatMode != FSE_repeat_none)); /* We don't copy tables */
+ { size_t const countSize = ZSTD_buildCTable(op, oend - op, CTable_MatchLength, MLFSELog, (symbolEncodingType_e)MLtype,
+ countWksp, max, mlCodeTable, nbSeq, ML_defaultNorm, ML_defaultNormLog, MaxML,
+ prevEntropy->matchlengthCTable, sizeof(prevEntropy->matchlengthCTable),
+ cTableWksp, cTableWkspSize);
+ FORWARD_IF_ERROR(countSize, "ZSTD_buildCTable for MatchLengths failed");
+ if (MLtype == set_compressed)
+ fseMetadata->lastCountSize = countSize;
+ op += countSize;
+ fseMetadata->mlType = (symbolEncodingType_e) MLtype;
+ } }
+ assert((size_t) (op-ostart) <= sizeof(fseMetadata->fseTablesBuffer));
+ return op-ostart;
+}
+
+
+/** ZSTD_buildSuperBlockEntropy() :
+ * Builds entropy for the super-block.
+ * @return : 0 on success or error code */
+static size_t
+ZSTD_buildSuperBlockEntropy(seqStore_t* seqStorePtr,
+ const ZSTD_entropyCTables_t* prevEntropy,
+ ZSTD_entropyCTables_t* nextEntropy,
+ const ZSTD_CCtx_params* cctxParams,
+ ZSTD_entropyCTablesMetadata_t* entropyMetadata,
+ void* workspace, size_t wkspSize)
+{
+ size_t const litSize = seqStorePtr->lit - seqStorePtr->litStart;
+ DEBUGLOG(5, "ZSTD_buildSuperBlockEntropy");
+ entropyMetadata->hufMetadata.hufDesSize =
+ ZSTD_buildSuperBlockEntropy_literal(seqStorePtr->litStart, litSize,
+ &prevEntropy->huf, &nextEntropy->huf,
+ &entropyMetadata->hufMetadata,
+ ZSTD_disableLiteralsCompression(cctxParams),
+ workspace, wkspSize);
+ FORWARD_IF_ERROR(entropyMetadata->hufMetadata.hufDesSize, "ZSTD_buildSuperBlockEntropy_literal failed");
+ entropyMetadata->fseMetadata.fseTablesSize =
+ ZSTD_buildSuperBlockEntropy_sequences(seqStorePtr,
+ &prevEntropy->fse, &nextEntropy->fse,
+ cctxParams,
+ &entropyMetadata->fseMetadata,
+ workspace, wkspSize);
+ FORWARD_IF_ERROR(entropyMetadata->fseMetadata.fseTablesSize, "ZSTD_buildSuperBlockEntropy_sequences failed");
+ return 0;
+}
+
+/** ZSTD_compressSubBlock_literal() :
+ * Compresses literals section for a sub-block.
+ * When we have to write the Huffman table we will sometimes choose a header
+ * size larger than necessary. This is because we have to pick the header size
+ * before we know the table size + compressed size, so we have a bound on the
+ * table size. If we guessed incorrectly, we fall back to uncompressed literals.
+ *
+ * We write the header when writeEntropy=1 and set entropyWritten=1 when we succeed
+ * in writing the header; otherwise it is set to 0.
+ *
+ * hufMetadata->hType has literals block type info.
+ * If it is set_basic, all sub-blocks' literals sections will be Raw_Literals_Block.
+ * If it is set_rle, all sub-blocks' literals sections will be RLE_Literals_Block.
+ * If it is set_compressed, the first sub-block's literals section will be Compressed_Literals_Block,
+ * and the following sub-blocks' literals sections will be Treeless_Literals_Block.
+ * @return : compressed size of literals section of a sub-block
+ * Or 0 if it is unable to compress.
+ * Or error code */
+static size_t ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable,
+ const ZSTD_hufCTablesMetadata_t* hufMetadata,
+ const BYTE* literals, size_t litSize,
+ void* dst, size_t dstSize,
+ const int bmi2, int writeEntropy, int* entropyWritten)
+{
+ size_t const header = writeEntropy ? 200 : 0;
+ size_t const lhSize = 3 + (litSize >= (1 KB - header)) + (litSize >= (16 KB - header));
+ BYTE* const ostart = (BYTE*)dst;
+ BYTE* const oend = ostart + dstSize;
+ BYTE* op = ostart + lhSize;
+ U32 const singleStream = lhSize == 3;
+ symbolEncodingType_e hType = writeEntropy ? hufMetadata->hType : set_repeat;
+ size_t cLitSize = 0;
+
+ (void)bmi2; /* TODO bmi2... */
+
+ DEBUGLOG(5, "ZSTD_compressSubBlock_literal (litSize=%zu, lhSize=%zu, writeEntropy=%d)", litSize, lhSize, writeEntropy);
+
+ *entropyWritten = 0;
+ if (litSize == 0 || hufMetadata->hType == set_basic) {
+ DEBUGLOG(5, "ZSTD_compressSubBlock_literal using raw literal");
+ return ZSTD_noCompressLiterals(dst, dstSize, literals, litSize);
+ } else if (hufMetadata->hType == set_rle) {
+ DEBUGLOG(5, "ZSTD_compressSubBlock_literal using rle literal");
+ return ZSTD_compressRleLiteralsBlock(dst, dstSize, literals, litSize);
+ }
+
+ assert(litSize > 0);
+ assert(hufMetadata->hType == set_compressed || hufMetadata->hType == set_repeat);
+
+ if (writeEntropy && hufMetadata->hType == set_compressed) {
+ memcpy(op, hufMetadata->hufDesBuffer, hufMetadata->hufDesSize);
+ op += hufMetadata->hufDesSize;
+ cLitSize += hufMetadata->hufDesSize;
+ DEBUGLOG(5, "ZSTD_compressSubBlock_literal (hSize=%zu)", hufMetadata->hufDesSize);
+ }
+
+ /* TODO bmi2 */
+ { const size_t cSize = singleStream ? HUF_compress1X_usingCTable(op, oend-op, literals, litSize, hufTable)
+ : HUF_compress4X_usingCTable(op, oend-op, literals, litSize, hufTable);
+ op += cSize;
+ cLitSize += cSize;
+ if (cSize == 0 || ERR_isError(cSize)) {
+ DEBUGLOG(5, "Failed to write entropy tables %s", ZSTD_getErrorName(cSize));
+ return 0;
+ }
+ /* If we expand and we aren't writing a header then emit uncompressed */
+ if (!writeEntropy && cLitSize >= litSize) {
+ DEBUGLOG(5, "ZSTD_compressSubBlock_literal using raw literal because uncompressible");
+ return ZSTD_noCompressLiterals(dst, dstSize, literals, litSize);
+ }
+ /* If we are writing headers then allow expansion that doesn't change our header size. */
+ if (lhSize < (size_t)(3 + (cLitSize >= 1 KB) + (cLitSize >= 16 KB))) {
+ assert(cLitSize > litSize);
+ DEBUGLOG(5, "Literals expanded beyond allowed header size");
+ return ZSTD_noCompressLiterals(dst, dstSize, literals, litSize);
+ }
+ DEBUGLOG(5, "ZSTD_compressSubBlock_literal (cSize=%zu)", cSize);
+ }
+
+ /* Build header */
+ switch(lhSize)
+ {
+ case 3: /* 2 - 2 - 10 - 10 */
+ { U32 const lhc = hType + ((!singleStream) << 2) + ((U32)litSize<<4) + ((U32)cLitSize<<14);
+ MEM_writeLE24(ostart, lhc);
+ break;
+ }
+ case 4: /* 2 - 2 - 14 - 14 */
+ { U32 const lhc = hType + (2 << 2) + ((U32)litSize<<4) + ((U32)cLitSize<<18);
+ MEM_writeLE32(ostart, lhc);
+ break;
+ }
+ case 5: /* 2 - 2 - 18 - 18 */
+ { U32 const lhc = hType + (3 << 2) + ((U32)litSize<<4) + ((U32)cLitSize<<22);
+ MEM_writeLE32(ostart, lhc);
+ ostart[4] = (BYTE)(cLitSize >> 10);
+ break;
+ }
+ default: /* not possible : lhSize is {3,4,5} */
+ assert(0);
+ }
+ *entropyWritten = 1;
+ DEBUGLOG(5, "Compressed literals: %u -> %u", (U32)litSize, (U32)(op-ostart));
+ return op-ostart;
+}
+
+static size_t ZSTD_seqDecompressedSize(seqStore_t const* seqStore, const seqDef* sequences, size_t nbSeq, size_t litSize, int lastSequence) {
+ const seqDef* const sstart = sequences;
+ const seqDef* const send = sequences + nbSeq;
+ const seqDef* sp = sstart;
+ size_t matchLengthSum = 0;
+ size_t litLengthSum = 0;
+ while (send-sp > 0) {
+ ZSTD_sequenceLength const seqLen = ZSTD_getSequenceLength(seqStore, sp);
+ litLengthSum += seqLen.litLength;
+ matchLengthSum += seqLen.matchLength;
+ sp++;
+ }
+ assert(litLengthSum <= litSize);
+ if (!lastSequence) {
+ assert(litLengthSum == litSize);
+ }
+ return matchLengthSum + litSize;
+}
+
+/** ZSTD_compressSubBlock_sequences() :
+ * Compresses sequences section for a sub-block.
+ * fseMetadata->llType, fseMetadata->ofType, and fseMetadata->mlType have
+ * symbol compression modes for the super-block.
+ * The first successfully compressed block will have these in its header.
+ * We set entropyWritten=1 when we succeed in compressing the sequences.
+ * The following sub-blocks will always have repeat mode.
+ * @return : compressed size of sequences section of a sub-block
+ * Or 0 if it is unable to compress
+ * Or error code. */
+static size_t ZSTD_compressSubBlock_sequences(const ZSTD_fseCTables_t* fseTables,
+ const ZSTD_fseCTablesMetadata_t* fseMetadata,
+ const seqDef* sequences, size_t nbSeq,
+ const BYTE* llCode, const BYTE* mlCode, const BYTE* ofCode,
+ const ZSTD_CCtx_params* cctxParams,
+ void* dst, size_t dstCapacity,
+ const int bmi2, int writeEntropy, int* entropyWritten)
+{
+ const int longOffsets = cctxParams->cParams.windowLog > STREAM_ACCUMULATOR_MIN;
+ BYTE* const ostart = (BYTE*)dst;
+ BYTE* const oend = ostart + dstCapacity;
+ BYTE* op = ostart;
+ BYTE* seqHead;
+
+ DEBUGLOG(5, "ZSTD_compressSubBlock_sequences (nbSeq=%zu, writeEntropy=%d, longOffsets=%d)", nbSeq, writeEntropy, longOffsets);
+
+ *entropyWritten = 0;
+ /* Sequences Header */
+ RETURN_ERROR_IF((oend-op) < 3 /*max nbSeq Size*/ + 1 /*seqHead*/,
+ dstSize_tooSmall, "");
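+    /* Added note (not upstream): Number_of_Sequences is written in 1 byte when
+     * nbSeq < 0x7F, in 2 bytes (first byte has its high bit set) when nbSeq < LONGNBSEQ,
+     * and otherwise as the marker byte 0xFF followed by LE16(nbSeq - LONGNBSEQ). */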
+ if (nbSeq < 0x7F)
+ *op++ = (BYTE)nbSeq;
+ else if (nbSeq < LONGNBSEQ)
+ op[0] = (BYTE)((nbSeq>>8) + 0x80), op[1] = (BYTE)nbSeq, op+=2;
+ else
+ op[0]=0xFF, MEM_writeLE16(op+1, (U16)(nbSeq - LONGNBSEQ)), op+=3;
+ if (nbSeq==0) {
+ return op - ostart;
+ }
+
+ /* seqHead : flags for FSE encoding type */
+ seqHead = op++;
+
+ DEBUGLOG(5, "ZSTD_compressSubBlock_sequences (seqHeadSize=%u)", (unsigned)(op-ostart));
+
+ if (writeEntropy) {
+ const U32 LLtype = fseMetadata->llType;
+ const U32 Offtype = fseMetadata->ofType;
+ const U32 MLtype = fseMetadata->mlType;
+ DEBUGLOG(5, "ZSTD_compressSubBlock_sequences (fseTablesSize=%zu)", fseMetadata->fseTablesSize);
+ *seqHead = (BYTE)((LLtype<<6) + (Offtype<<4) + (MLtype<<2));
+ memcpy(op, fseMetadata->fseTablesBuffer, fseMetadata->fseTablesSize);
+ op += fseMetadata->fseTablesSize;
+ } else {
+ const U32 repeat = set_repeat;
+ *seqHead = (BYTE)((repeat<<6) + (repeat<<4) + (repeat<<2));
+ }
+
+ { size_t const bitstreamSize = ZSTD_encodeSequences(
+ op, oend - op,
+ fseTables->matchlengthCTable, mlCode,
+ fseTables->offcodeCTable, ofCode,
+ fseTables->litlengthCTable, llCode,
+ sequences, nbSeq,
+ longOffsets, bmi2);
+ FORWARD_IF_ERROR(bitstreamSize, "ZSTD_encodeSequences failed");
+ op += bitstreamSize;
+ /* zstd versions <= 1.3.4 mistakenly report corruption when
+ * FSE_readNCount() receives a buffer < 4 bytes.
+ * Fixed by https://github.com/facebook/zstd/pull/1146.
+ * This can happen when the last set_compressed table present is 2
+ * bytes and the bitstream is only one byte.
+ * In this exceedingly rare case, we will simply emit an uncompressed
+ * block, since it isn't worth optimizing.
+ */
+#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
+ if (writeEntropy && fseMetadata->lastCountSize && fseMetadata->lastCountSize + bitstreamSize < 4) {
+ /* NCountSize >= 2 && bitstreamSize > 0 ==> lastCountSize == 3 */
+ assert(fseMetadata->lastCountSize + bitstreamSize == 3);
+ DEBUGLOG(5, "Avoiding bug in zstd decoder in versions <= 1.3.4 by "
+ "emitting an uncompressed block.");
+ return 0;
+ }
+#endif
+ DEBUGLOG(5, "ZSTD_compressSubBlock_sequences (bitstreamSize=%zu)", bitstreamSize);
+ }
+
+ /* zstd versions <= 1.4.0 mistakenly report error when
+ * sequences section body size is less than 3 bytes.
+ * Fixed by https://github.com/facebook/zstd/pull/1664.
+ * This can happen when the previous sequences section block is compressed
+ * with rle mode and the current block's sequences section is compressed
+ * with repeat mode where sequences section body size can be 1 byte.
+ */
+#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
+ if (op-seqHead < 4) {
+ DEBUGLOG(5, "Avoiding bug in zstd decoder in versions <= 1.4.0 by emitting "
+ "an uncompressed block when sequences are < 4 bytes");
+ return 0;
+ }
+#endif
+
+ *entropyWritten = 1;
+ return op - ostart;
+}
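+
+/* Illustration (not from the upstream sources): the sequence-count header
+ * written at the top of ZSTD_compressSubBlock_sequences() above uses a 1-, 2-
+ * or 3-byte layout. The guarded sketch below restates that layout as a
+ * standalone helper; the name example_writeNbSeqHeader is hypothetical and
+ * 0x7F00 stands in for LONGNBSEQ. */
+#if 0 /* illustrative sketch only, never compiled */
+#include <stddef.h>
+#include <stdint.h>
+
+static size_t example_writeNbSeqHeader(uint8_t* dst, size_t dstCapacity, size_t nbSeq)
+{
+ if (nbSeq < 0x7F) { /* 1 byte : value stored directly */
+ if (dstCapacity < 1) return 0;
+ dst[0] = (uint8_t)nbSeq;
+ return 1;
+ }
+ if (nbSeq < 0x7F00) { /* 2 bytes : high byte carries the 0x80 marker */
+ if (dstCapacity < 2) return 0;
+ dst[0] = (uint8_t)((nbSeq >> 8) + 0x80);
+ dst[1] = (uint8_t)nbSeq;
+ return 2;
+ }
+ if (dstCapacity < 3) return 0; /* 3 bytes : 0xFF then LE16(nbSeq - 0x7F00) */
+ dst[0] = 0xFF;
+ dst[1] = (uint8_t)((nbSeq - 0x7F00) & 0xFF);
+ dst[2] = (uint8_t)(((nbSeq - 0x7F00) >> 8) & 0xFF);
+ return 3;
+}
+#endif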
+
+/** ZSTD_compressSubBlock() :
+ * Compresses a single sub-block.
+ * @return : compressed size of the sub-block
+ * Or 0 if it failed to compress. */
+static size_t ZSTD_compressSubBlock(const ZSTD_entropyCTables_t* entropy,
+ const ZSTD_entropyCTablesMetadata_t* entropyMetadata,
+ const seqDef* sequences, size_t nbSeq,
+ const BYTE* literals, size_t litSize,
+ const BYTE* llCode, const BYTE* mlCode, const BYTE* ofCode,
+ const ZSTD_CCtx_params* cctxParams,
+ void* dst, size_t dstCapacity,
+ const int bmi2,
+ int writeLitEntropy, int writeSeqEntropy,
+ int* litEntropyWritten, int* seqEntropyWritten,
+ U32 lastBlock)
+{
+ BYTE* const ostart = (BYTE*)dst;
+ BYTE* const oend = ostart + dstCapacity;
+ BYTE* op = ostart + ZSTD_blockHeaderSize;
+ DEBUGLOG(5, "ZSTD_compressSubBlock (litSize=%zu, nbSeq=%zu, writeLitEntropy=%d, writeSeqEntropy=%d, lastBlock=%d)",
+ litSize, nbSeq, writeLitEntropy, writeSeqEntropy, lastBlock);
+ { size_t cLitSize = ZSTD_compressSubBlock_literal((const HUF_CElt*)entropy->huf.CTable,
+ &entropyMetadata->hufMetadata, literals, litSize,
+ op, oend-op, bmi2, writeLitEntropy, litEntropyWritten);
+ FORWARD_IF_ERROR(cLitSize, "ZSTD_compressSubBlock_literal failed");
+ if (cLitSize == 0) return 0;
+ op += cLitSize;
+ }
+ { size_t cSeqSize = ZSTD_compressSubBlock_sequences(&entropy->fse,
+ &entropyMetadata->fseMetadata,
+ sequences, nbSeq,
+ llCode, mlCode, ofCode,
+ cctxParams,
+ op, oend-op,
+ bmi2, writeSeqEntropy, seqEntropyWritten);
+ FORWARD_IF_ERROR(cSeqSize, "ZSTD_compressSubBlock_sequences failed");
+ if (cSeqSize == 0) return 0;
+ op += cSeqSize;
+ }
+ /* Write block header */
+ { size_t cSize = (op-ostart)-ZSTD_blockHeaderSize;
+ U32 const cBlockHeader24 = lastBlock + (((U32)bt_compressed)<<1) + (U32)(cSize << 3);
+ MEM_writeLE24(ostart, cBlockHeader24);
+ }
+ return op-ostart;
+}
+
+static size_t ZSTD_estimateSubBlockSize_literal(const BYTE* literals, size_t litSize,
+ const ZSTD_hufCTables_t* huf,
+ const ZSTD_hufCTablesMetadata_t* hufMetadata,
+ void* workspace, size_t wkspSize,
+ int writeEntropy)
+{
+ unsigned* const countWksp = (unsigned*)workspace;
+ unsigned maxSymbolValue = 255;
+ size_t literalSectionHeaderSize = 3; /* Use hard coded size of 3 bytes */
+
+ if (hufMetadata->hType == set_basic) return litSize;
+ else if (hufMetadata->hType == set_rle) return 1;
+ else if (hufMetadata->hType == set_compressed || hufMetadata->hType == set_repeat) {
+ size_t const largest = HIST_count_wksp (countWksp, &maxSymbolValue, (const BYTE*)literals, litSize, workspace, wkspSize);
+ if (ZSTD_isError(largest)) return litSize;
+ { size_t cLitSizeEstimate = HUF_estimateCompressedSize((const HUF_CElt*)huf->CTable, countWksp, maxSymbolValue);
+ if (writeEntropy) cLitSizeEstimate += hufMetadata->hufDesSize;
+ return cLitSizeEstimate + literalSectionHeaderSize;
+ } }
+ assert(0); /* impossible */
+ return 0;
+}
+
+static size_t ZSTD_estimateSubBlockSize_symbolType(symbolEncodingType_e type,
+ const BYTE* codeTable, unsigned maxCode,
+ size_t nbSeq, const FSE_CTable* fseCTable,
+ const U32* additionalBits,
+ short const* defaultNorm, U32 defaultNormLog,
+ void* workspace, size_t wkspSize)
+{
+ unsigned* const countWksp = (unsigned*)workspace;
+ const BYTE* ctp = codeTable;
+ const BYTE* const ctStart = ctp;
+ const BYTE* const ctEnd = ctStart + nbSeq;
+ size_t cSymbolTypeSizeEstimateInBits = 0;
+ unsigned max = maxCode;
+
+ HIST_countFast_wksp(countWksp, &max, codeTable, nbSeq, workspace, wkspSize); /* can't fail */
+ if (type == set_basic) {
+ cSymbolTypeSizeEstimateInBits = ZSTD_crossEntropyCost(defaultNorm, defaultNormLog, countWksp, max);
+ } else if (type == set_rle) {
+ cSymbolTypeSizeEstimateInBits = 0;
+ } else if (type == set_compressed || type == set_repeat) {
+ cSymbolTypeSizeEstimateInBits = ZSTD_fseBitCost(fseCTable, countWksp, max);
+ }
+ if (ZSTD_isError(cSymbolTypeSizeEstimateInBits)) return nbSeq * 10;
+ while (ctp < ctEnd) {
+ if (additionalBits) cSymbolTypeSizeEstimateInBits += additionalBits[*ctp];
+ else cSymbolTypeSizeEstimateInBits += *ctp; /* for offset, offset code is also the number of additional bits */
+ ctp++;
+ }
+ return cSymbolTypeSizeEstimateInBits / 8;
+}
+
+static size_t ZSTD_estimateSubBlockSize_sequences(const BYTE* ofCodeTable,
+ const BYTE* llCodeTable,
+ const BYTE* mlCodeTable,
+ size_t nbSeq,
+ const ZSTD_fseCTables_t* fseTables,
+ const ZSTD_fseCTablesMetadata_t* fseMetadata,
+ void* workspace, size_t wkspSize,
+ int writeEntropy)
+{
+ size_t sequencesSectionHeaderSize = 3; /* Use hard coded size of 3 bytes */
+ size_t cSeqSizeEstimate = 0;
+ cSeqSizeEstimate += ZSTD_estimateSubBlockSize_symbolType(fseMetadata->ofType, ofCodeTable, MaxOff,
+ nbSeq, fseTables->offcodeCTable, NULL,
+ OF_defaultNorm, OF_defaultNormLog,
+ workspace, wkspSize);
+ cSeqSizeEstimate += ZSTD_estimateSubBlockSize_symbolType(fseMetadata->llType, llCodeTable, MaxLL,
+ nbSeq, fseTables->litlengthCTable, LL_bits,
+ LL_defaultNorm, LL_defaultNormLog,
+ workspace, wkspSize);
+ cSeqSizeEstimate += ZSTD_estimateSubBlockSize_symbolType(fseMetadata->mlType, mlCodeTable, MaxML,
+ nbSeq, fseTables->matchlengthCTable, ML_bits,
+ ML_defaultNorm, ML_defaultNormLog,
+ workspace, wkspSize);
+ if (writeEntropy) cSeqSizeEstimate += fseMetadata->fseTablesSize;
+ return cSeqSizeEstimate + sequencesSectionHeaderSize;
+}
+
+static size_t ZSTD_estimateSubBlockSize(const BYTE* literals, size_t litSize,
+ const BYTE* ofCodeTable,
+ const BYTE* llCodeTable,
+ const BYTE* mlCodeTable,
+ size_t nbSeq,
+ const ZSTD_entropyCTables_t* entropy,
+ const ZSTD_entropyCTablesMetadata_t* entropyMetadata,
+ void* workspace, size_t wkspSize,
+ int writeLitEntropy, int writeSeqEntropy) {
+ size_t cSizeEstimate = 0;
+ cSizeEstimate += ZSTD_estimateSubBlockSize_literal(literals, litSize,
+ &entropy->huf, &entropyMetadata->hufMetadata,
+ workspace, wkspSize, writeLitEntropy);
+ cSizeEstimate += ZSTD_estimateSubBlockSize_sequences(ofCodeTable, llCodeTable, mlCodeTable,
+ nbSeq, &entropy->fse, &entropyMetadata->fseMetadata,
+ workspace, wkspSize, writeSeqEntropy);
+ return cSizeEstimate + ZSTD_blockHeaderSize;
+}
+
+static int ZSTD_needSequenceEntropyTables(ZSTD_fseCTablesMetadata_t const* fseMetadata)
+{
+ if (fseMetadata->llType == set_compressed || fseMetadata->llType == set_rle)
+ return 1;
+ if (fseMetadata->mlType == set_compressed || fseMetadata->mlType == set_rle)
+ return 1;
+ if (fseMetadata->ofType == set_compressed || fseMetadata->ofType == set_rle)
+ return 1;
+ return 0;
+}
+
+/** ZSTD_compressSubBlock_multi() :
+ * Breaks super-block into multiple sub-blocks and compresses them.
+ * Entropy will be written to the first block.
+ * The following blocks will use repeat mode to compress.
+ * All sub-blocks are compressed blocks (no raw or rle blocks).
+ * @return : compressed size of the super block (which is multiple ZSTD blocks)
+ * Or 0 if it failed to compress. */
+static size_t ZSTD_compressSubBlock_multi(const seqStore_t* seqStorePtr,
+ const ZSTD_compressedBlockState_t* prevCBlock,
+ ZSTD_compressedBlockState_t* nextCBlock,
+ const ZSTD_entropyCTablesMetadata_t* entropyMetadata,
+ const ZSTD_CCtx_params* cctxParams,
+ void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize,
+ const int bmi2, U32 lastBlock,
+ void* workspace, size_t wkspSize)
+{
+ const seqDef* const sstart = seqStorePtr->sequencesStart;
+ const seqDef* const send = seqStorePtr->sequences;
+ const seqDef* sp = sstart;
+ const BYTE* const lstart = seqStorePtr->litStart;
+ const BYTE* const lend = seqStorePtr->lit;
+ const BYTE* lp = lstart;
+ BYTE const* ip = (BYTE const*)src;
+ BYTE const* const iend = ip + srcSize;
+ BYTE* const ostart = (BYTE*)dst;
+ BYTE* const oend = ostart + dstCapacity;
+ BYTE* op = ostart;
+ const BYTE* llCodePtr = seqStorePtr->llCode;
+ const BYTE* mlCodePtr = seqStorePtr->mlCode;
+ const BYTE* ofCodePtr = seqStorePtr->ofCode;
+ size_t targetCBlockSize = cctxParams->targetCBlockSize;
+ size_t litSize, seqCount;
+ int writeLitEntropy = entropyMetadata->hufMetadata.hType == set_compressed;
+ int writeSeqEntropy = 1;
+ int lastSequence = 0;
+
+ DEBUGLOG(5, "ZSTD_compressSubBlock_multi (litSize=%u, nbSeq=%u)",
+ (unsigned)(lend-lp), (unsigned)(send-sstart));
+
+ litSize = 0;
+ seqCount = 0;
+ do {
+ size_t cBlockSizeEstimate = 0;
+ if (sstart == send) {
+ lastSequence = 1;
+ } else {
+ const seqDef* const sequence = sp + seqCount;
+ lastSequence = sequence == send - 1;
+ litSize += ZSTD_getSequenceLength(seqStorePtr, sequence).litLength;
+ seqCount++;
+ }
+ if (lastSequence) {
+ assert(lp <= lend);
+ assert(litSize <= (size_t)(lend - lp));
+ litSize = (size_t)(lend - lp);
+ }
+ /* I think there is an optimization opportunity here.
+ * Calling ZSTD_estimateSubBlockSize for every sequence can be wasteful,
+ * since it recalculates the estimate from scratch.
+ * For example, it would recount the literal distribution and symbol codes every time.
+ */
+ cBlockSizeEstimate = ZSTD_estimateSubBlockSize(lp, litSize, ofCodePtr, llCodePtr, mlCodePtr, seqCount,
+ &nextCBlock->entropy, entropyMetadata,
+ workspace, wkspSize, writeLitEntropy, writeSeqEntropy);
+ if (cBlockSizeEstimate > targetCBlockSize || lastSequence) {
+ int litEntropyWritten = 0;
+ int seqEntropyWritten = 0;
+ const size_t decompressedSize = ZSTD_seqDecompressedSize(seqStorePtr, sp, seqCount, litSize, lastSequence);
+ const size_t cSize = ZSTD_compressSubBlock(&nextCBlock->entropy, entropyMetadata,
+ sp, seqCount,
+ lp, litSize,
+ llCodePtr, mlCodePtr, ofCodePtr,
+ cctxParams,
+ op, oend-op,
+ bmi2, writeLitEntropy, writeSeqEntropy,
+ &litEntropyWritten, &seqEntropyWritten,
+ lastBlock && lastSequence);
+ FORWARD_IF_ERROR(cSize, "ZSTD_compressSubBlock failed");
+ if (cSize > 0 && cSize < decompressedSize) {
+ DEBUGLOG(5, "Committed the sub-block");
+ assert(ip + decompressedSize <= iend);
+ ip += decompressedSize;
+ sp += seqCount;
+ lp += litSize;
+ op += cSize;
+ llCodePtr += seqCount;
+ mlCodePtr += seqCount;
+ ofCodePtr += seqCount;
+ litSize = 0;
+ seqCount = 0;
+ /* Entropy only needs to be written once */
+ if (litEntropyWritten) {
+ writeLitEntropy = 0;
+ }
+ if (seqEntropyWritten) {
+ writeSeqEntropy = 0;
+ }
+ }
+ }
+ } while (!lastSequence);
+ if (writeLitEntropy) {
+ DEBUGLOG(5, "ZSTD_compressSubBlock_multi has literal entropy tables unwritten");
+ memcpy(&nextCBlock->entropy.huf, &prevCBlock->entropy.huf, sizeof(prevCBlock->entropy.huf));
+ }
+ if (writeSeqEntropy && ZSTD_needSequenceEntropyTables(&entropyMetadata->fseMetadata)) {
+ /* If we haven't written our entropy tables, then we've violated our contract and
+ * must emit an uncompressed block.
+ */
+ DEBUGLOG(5, "ZSTD_compressSubBlock_multi has sequence entropy tables unwritten");
+ return 0;
+ }
+ if (ip < iend) {
+ size_t const cSize = ZSTD_noCompressBlock(op, oend - op, ip, iend - ip, lastBlock);
+ DEBUGLOG(5, "ZSTD_compressSubBlock_multi last sub-block uncompressed, %zu bytes", (size_t)(iend - ip));
+ FORWARD_IF_ERROR(cSize, "ZSTD_noCompressBlock failed");
+ assert(cSize != 0);
+ op += cSize;
+ /* We have to regenerate the repcodes because we've skipped some sequences */
+ if (sp < send) {
+ seqDef const* seq;
+ repcodes_t rep;
+ memcpy(&rep, prevCBlock->rep, sizeof(rep));
+ for (seq = sstart; seq < sp; ++seq) {
+ rep = ZSTD_updateRep(rep.rep, seq->offset - 1, ZSTD_getSequenceLength(seqStorePtr, seq).litLength == 0);
+ }
+ memcpy(nextCBlock->rep, &rep, sizeof(rep));
+ }
+ }
+ DEBUGLOG(5, "ZSTD_compressSubBlock_multi compressed");
+ return op-ostart;
+}
+
+size_t ZSTD_compressSuperBlock(ZSTD_CCtx* zc,
+ void* dst, size_t dstCapacity,
+ void const* src, size_t srcSize,
+ unsigned lastBlock) {
+ ZSTD_entropyCTablesMetadata_t entropyMetadata;
+
+ FORWARD_IF_ERROR(ZSTD_buildSuperBlockEntropy(&zc->seqStore,
+ &zc->blockState.prevCBlock->entropy,
+ &zc->blockState.nextCBlock->entropy,
+ &zc->appliedParams,
+ &entropyMetadata,
+ zc->entropyWorkspace, HUF_WORKSPACE_SIZE /* statically allocated in resetCCtx */), "");
+
+ return ZSTD_compressSubBlock_multi(&zc->seqStore,
+ zc->blockState.prevCBlock,
+ zc->blockState.nextCBlock,
+ &entropyMetadata,
+ &zc->appliedParams,
+ dst, dstCapacity,
+ src, srcSize,
+ zc->bmi2, lastBlock,
+ zc->entropyWorkspace, HUF_WORKSPACE_SIZE /* statically allocated in resetCCtx */);
+}
+/**** ended inlining compress/zstd_compress_superblock.c ****/
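+
+/* Illustration (not from the upstream sources): the super-block path above is
+ * normally reached by asking for small compressed blocks through the public
+ * parameter API. This is a hedged sketch: ZSTD_c_targetCBlockSize lives in the
+ * advanced/experimental part of zstd.h for this version, so treat its exact
+ * availability as an assumption; example_compressWithSmallBlocks is a made-up name. */
+#if 0 /* illustrative sketch only, never compiled */
+#define ZSTD_STATIC_LINKING_ONLY
+#include <zstd.h>
+
+static size_t example_compressWithSmallBlocks(void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize)
+{
+ ZSTD_CCtx* const cctx = ZSTD_createCCtx();
+ size_t cSize = 0;
+ if (cctx == NULL) return 0; /* treat as failure */
+ ZSTD_CCtx_setParameter(cctx, ZSTD_c_compressionLevel, 3);
+ ZSTD_CCtx_setParameter(cctx, ZSTD_c_targetCBlockSize, 4096); /* aim for ~4 KB compressed blocks */
+ cSize = ZSTD_compress2(cctx, dst, dstCapacity, src, srcSize);
+ ZSTD_freeCCtx(cctx);
+ return cSize; /* check with ZSTD_isError() */
+}
+#endif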
+/**** start inlining compress/zstd_compress.c ****/
+/*
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+/*-*************************************
+* Dependencies
+***************************************/
+#include <limits.h> /* INT_MAX */
+#include <string.h> /* memset */
+/**** start inlining ../common/cpu.h ****/
+/*
+ * Copyright (c) 2018-2020, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTD_COMMON_CPU_H
+#define ZSTD_COMMON_CPU_H
+
+/**
+ * Implementation taken from folly/CpuId.h
+ * https://github.com/facebook/folly/blob/master/folly/CpuId.h
+ */
+
+#include <string.h>
+
+/**** skipping file: mem.h ****/
+
+#ifdef _MSC_VER
+#include <intrin.h>
+#endif
+
+typedef struct {
+ U32 f1c;
+ U32 f1d;
+ U32 f7b;
+ U32 f7c;
+} ZSTD_cpuid_t;
+
+MEM_STATIC ZSTD_cpuid_t ZSTD_cpuid(void) {
+ U32 f1c = 0;
+ U32 f1d = 0;
+ U32 f7b = 0;
+ U32 f7c = 0;
+#if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86))
+ int reg[4];
+ __cpuid((int*)reg, 0);
+ {
+ int const n = reg[0];
+ if (n >= 1) {
+ __cpuid((int*)reg, 1);
+ f1c = (U32)reg[2];
+ f1d = (U32)reg[3];
+ }
+ if (n >= 7) {
+ __cpuidex((int*)reg, 7, 0);
+ f7b = (U32)reg[1];
+ f7c = (U32)reg[2];
+ }
+ }
+#elif defined(__i386__) && defined(__PIC__) && !defined(__clang__) && defined(__GNUC__)
+ /* The following block works like the normal cpuid branch below, but gcc
+ * reserves ebx for use as its PIC register, so we must specially
+ * handle the save and restore to avoid clobbering the register.
+ */
+ U32 n;
+ __asm__(
+ "pushl %%ebx\n\t"
+ "cpuid\n\t"
+ "popl %%ebx\n\t"
+ : "=a"(n)
+ : "a"(0)
+ : "ecx", "edx");
+ if (n >= 1) {
+ U32 f1a;
+ __asm__(
+ "pushl %%ebx\n\t"
+ "cpuid\n\t"
+ "popl %%ebx\n\t"
+ : "=a"(f1a), "=c"(f1c), "=d"(f1d)
+ : "a"(1));
+ }
+ if (n >= 7) {
+ __asm__(
+ "pushl %%ebx\n\t"
+ "cpuid\n\t"
+ "movl %%ebx, %%eax\n\t"
+ "popl %%ebx"
+ : "=a"(f7b), "=c"(f7c)
+ : "a"(7), "c"(0)
+ : "edx");
+ }
+#elif defined(__x86_64__) || defined(_M_X64) || defined(__i386__)
+ U32 n;
+ __asm__("cpuid" : "=a"(n) : "a"(0) : "ebx", "ecx", "edx");
+ if (n >= 1) {
+ U32 f1a;
+ __asm__("cpuid" : "=a"(f1a), "=c"(f1c), "=d"(f1d) : "a"(1) : "ebx");
+ }
+ if (n >= 7) {
+ U32 f7a;
+ __asm__("cpuid"
+ : "=a"(f7a), "=b"(f7b), "=c"(f7c)
+ : "a"(7), "c"(0)
+ : "edx");
+ }
+#endif
+ {
+ ZSTD_cpuid_t cpuid;
+ cpuid.f1c = f1c;
+ cpuid.f1d = f1d;
+ cpuid.f7b = f7b;
+ cpuid.f7c = f7c;
+ return cpuid;
+ }
+}
+
+#define X(name, r, bit) \
+ MEM_STATIC int ZSTD_cpuid_##name(ZSTD_cpuid_t const cpuid) { \
+ return ((cpuid.r) & (1U << bit)) != 0; \
+ }
+
+/* cpuid(1): Processor Info and Feature Bits. */
+#define C(name, bit) X(name, f1c, bit)
+ C(sse3, 0)
+ C(pclmuldq, 1)
+ C(dtes64, 2)
+ C(monitor, 3)
+ C(dscpl, 4)
+ C(vmx, 5)
+ C(smx, 6)
+ C(eist, 7)
+ C(tm2, 8)
+ C(ssse3, 9)
+ C(cnxtid, 10)
+ C(fma, 12)
+ C(cx16, 13)
+ C(xtpr, 14)
+ C(pdcm, 15)
+ C(pcid, 17)
+ C(dca, 18)
+ C(sse41, 19)
+ C(sse42, 20)
+ C(x2apic, 21)
+ C(movbe, 22)
+ C(popcnt, 23)
+ C(tscdeadline, 24)
+ C(aes, 25)
+ C(xsave, 26)
+ C(osxsave, 27)
+ C(avx, 28)
+ C(f16c, 29)
+ C(rdrand, 30)
+#undef C
+#define D(name, bit) X(name, f1d, bit)
+ D(fpu, 0)
+ D(vme, 1)
+ D(de, 2)
+ D(pse, 3)
+ D(tsc, 4)
+ D(msr, 5)
+ D(pae, 6)
+ D(mce, 7)
+ D(cx8, 8)
+ D(apic, 9)
+ D(sep, 11)
+ D(mtrr, 12)
+ D(pge, 13)
+ D(mca, 14)
+ D(cmov, 15)
+ D(pat, 16)
+ D(pse36, 17)
+ D(psn, 18)
+ D(clfsh, 19)
+ D(ds, 21)
+ D(acpi, 22)
+ D(mmx, 23)
+ D(fxsr, 24)
+ D(sse, 25)
+ D(sse2, 26)
+ D(ss, 27)
+ D(htt, 28)
+ D(tm, 29)
+ D(pbe, 31)
+#undef D
+
+/* cpuid(7): Extended Features. */
+#define B(name, bit) X(name, f7b, bit)
+ B(bmi1, 3)
+ B(hle, 4)
+ B(avx2, 5)
+ B(smep, 7)
+ B(bmi2, 8)
+ B(erms, 9)
+ B(invpcid, 10)
+ B(rtm, 11)
+ B(mpx, 14)
+ B(avx512f, 16)
+ B(avx512dq, 17)
+ B(rdseed, 18)
+ B(adx, 19)
+ B(smap, 20)
+ B(avx512ifma, 21)
+ B(pcommit, 22)
+ B(clflushopt, 23)
+ B(clwb, 24)
+ B(avx512pf, 26)
+ B(avx512er, 27)
+ B(avx512cd, 28)
+ B(sha, 29)
+ B(avx512bw, 30)
+ B(avx512vl, 31)
+#undef B
+#define C(name, bit) X(name, f7c, bit)
+ C(prefetchwt1, 0)
+ C(avx512vbmi, 1)
+#undef C
+
+#undef X
+
+#endif /* ZSTD_COMMON_CPU_H */
+/**** ended inlining ../common/cpu.h ****/
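+
+/* Illustration (not from the upstream sources): the X()/B()/C()/D() macros above
+ * expand to tiny predicates such as ZSTD_cpuid_bmi2() and ZSTD_cpuid_avx2().
+ * A minimal dispatch sketch (example_useFastPath is a hypothetical name): */
+#if 0 /* illustrative sketch only, never compiled */
+static int example_useFastPath(void)
+{
+ ZSTD_cpuid_t const cpuid = ZSTD_cpuid();
+ /* query the cached feature bits instead of re-running CPUID at every call site */
+ return ZSTD_cpuid_bmi2(cpuid) && ZSTD_cpuid_avx2(cpuid);
+}
+#endif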
+/**** skipping file: ../common/mem.h ****/
+/**** skipping file: hist.h ****/
+#define FSE_STATIC_LINKING_ONLY /* FSE_encodeSymbol */
+/**** skipping file: ../common/fse.h ****/
+#define HUF_STATIC_LINKING_ONLY
+/**** skipping file: ../common/huf.h ****/
+/**** skipping file: zstd_compress_internal.h ****/
+/**** skipping file: zstd_compress_sequences.h ****/
+/**** skipping file: zstd_compress_literals.h ****/
+/**** start inlining zstd_fast.h ****/
+/*
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTD_FAST_H
+#define ZSTD_FAST_H
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/**** skipping file: ../common/mem.h ****/
+/**** skipping file: zstd_compress_internal.h ****/
+
+void ZSTD_fillHashTable(ZSTD_matchState_t* ms,
+ void const* end, ZSTD_dictTableLoadMethod_e dtlm);
+size_t ZSTD_compressBlock_fast(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_fast_dictMatchState(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_fast_extDict(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize);
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* ZSTD_FAST_H */
+/**** ended inlining zstd_fast.h ****/
+/**** start inlining zstd_double_fast.h ****/
+/*
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTD_DOUBLE_FAST_H
+#define ZSTD_DOUBLE_FAST_H
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/**** skipping file: ../common/mem.h ****/
+/**** skipping file: zstd_compress_internal.h ****/
+
+void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms,
+ void const* end, ZSTD_dictTableLoadMethod_e dtlm);
+size_t ZSTD_compressBlock_doubleFast(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_doubleFast_dictMatchState(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_doubleFast_extDict(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize);
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* ZSTD_DOUBLE_FAST_H */
+/**** ended inlining zstd_double_fast.h ****/
+/**** start inlining zstd_lazy.h ****/
+/*
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTD_LAZY_H
+#define ZSTD_LAZY_H
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/**** skipping file: zstd_compress_internal.h ****/
+
+U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip);
+
+void ZSTD_preserveUnsortedMark (U32* const table, U32 const size, U32 const reducerValue);  /*! used in ZSTD_reduceIndex(): pre-emptively increases the value of ZSTD_DUBT_UNSORTED_MARK */
+
+size_t ZSTD_compressBlock_btlazy2(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_lazy2(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_lazy(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_greedy(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize);
+
+size_t ZSTD_compressBlock_btlazy2_dictMatchState(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_lazy2_dictMatchState(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_lazy_dictMatchState(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_greedy_dictMatchState(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize);
+
+size_t ZSTD_compressBlock_greedy_extDict(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_lazy_extDict(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_lazy2_extDict(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_btlazy2_extDict(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize);
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* ZSTD_LAZY_H */
+/**** ended inlining zstd_lazy.h ****/
+/**** start inlining zstd_opt.h ****/
+/*
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTD_OPT_H
+#define ZSTD_OPT_H
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/**** skipping file: zstd_compress_internal.h ****/
+
+/* used in ZSTD_loadDictionaryContent() */
+void ZSTD_updateTree(ZSTD_matchState_t* ms, const BYTE* ip, const BYTE* iend);
+
+size_t ZSTD_compressBlock_btopt(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_btultra(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_btultra2(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize);
+
+
+size_t ZSTD_compressBlock_btopt_dictMatchState(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_btultra_dictMatchState(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize);
+
+size_t ZSTD_compressBlock_btopt_extDict(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize);
+size_t ZSTD_compressBlock_btultra_extDict(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize);
+
+ /* note : no btultra2 variant for extDict nor dictMatchState,
+ * because btultra2 is not meant to work with dictionaries
+ * and is specific to the first block (which has no prefix) */
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* ZSTD_OPT_H */
+/**** ended inlining zstd_opt.h ****/
+/**** start inlining zstd_ldm.h ****/
+/*
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTD_LDM_H
+#define ZSTD_LDM_H
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/**** skipping file: zstd_compress_internal.h ****/
+/**** skipping file: ../zstd.h ****/
+
+/*-*************************************
+* Long distance matching
+***************************************/
+
+#define ZSTD_LDM_DEFAULT_WINDOW_LOG ZSTD_WINDOWLOG_LIMIT_DEFAULT
+
+void ZSTD_ldm_fillHashTable(
+ ldmState_t* state, const BYTE* ip,
+ const BYTE* iend, ldmParams_t const* params);
+
+/**
+ * ZSTD_ldm_generateSequences():
+ *
+ * Generates the sequences using the long distance match finder.
+ * Generates long range matching sequences in `sequences`, which parse a prefix
+ * of the source. `sequences` must be large enough to store every sequence,
+ * which can be checked with `ZSTD_ldm_getMaxNbSeq()`.
+ * @returns 0 or an error code.
+ *
+ * NOTE: The user must have called ZSTD_window_update() for all of the input
+ * they have, even if they pass it to ZSTD_ldm_generateSequences() in chunks.
+ * NOTE: This function returns an error if it runs out of space to store
+ * sequences.
+ */
+size_t ZSTD_ldm_generateSequences(
+ ldmState_t* ldms, rawSeqStore_t* sequences,
+ ldmParams_t const* params, void const* src, size_t srcSize);
+
+/**
+ * ZSTD_ldm_blockCompress():
+ *
+ * Compresses a block using the predefined sequences, along with a secondary
+ * block compressor. The literals section of every sequence is passed to the
+ * secondary block compressor, and those sequences are interspersed with the
+ * predefined sequences. Returns the length of the last literals.
+ * Updates `rawSeqStore.pos` to indicate how many sequences have been consumed.
+ * `rawSeqStore.seq` may also be updated to split the last sequence between two
+ * blocks.
+ * @return The length of the last literals.
+ *
+ * NOTE: The source must be at most the maximum block size, but the predefined
+ * sequences can be any size, and may be longer than the block. In the case that
+ * they are longer than the block, the last sequences may need to be split into
+ * two. We handle that case correctly, and update `rawSeqStore` appropriately.
+ * NOTE: This function does not return any errors.
+ */
+size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore,
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize);
+
+/**
+ * ZSTD_ldm_skipSequences():
+ *
+ * Skip past `srcSize` bytes worth of sequences in `rawSeqStore`.
+ * Avoids emitting matches less than `minMatch` bytes.
+ * Must be called for data which is not passed to ZSTD_ldm_blockCompress().
+ */
+void ZSTD_ldm_skipSequences(rawSeqStore_t* rawSeqStore, size_t srcSize,
+ U32 const minMatch);
+
+
+/** ZSTD_ldm_getTableSize() :
+ * Estimate the space needed for long distance matching tables or 0 if LDM is
+ * disabled.
+ */
+size_t ZSTD_ldm_getTableSize(ldmParams_t params);
+
+/** ZSTD_ldm_getMaxNbSeq() :
+ * Return an upper bound on the number of sequences that can be produced by
+ * the long distance matcher, or 0 if LDM is disabled.
+ */
+size_t ZSTD_ldm_getMaxNbSeq(ldmParams_t params, size_t maxChunkSize);
+
+/** ZSTD_ldm_adjustParameters() :
+ * If the params->hashRateLog is not set, set it to its default value based on
+ * windowLog and params->hashLog.
+ *
+ * Ensures that params->bucketSizeLog is <= params->hashLog (setting it to
+ * params->hashLog if it is not).
+ *
+ * Ensures that the minMatchLength >= targetLength during optimal parsing.
+ */
+void ZSTD_ldm_adjustParameters(ldmParams_t* params,
+ ZSTD_compressionParameters const* cParams);
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* ZSTD_LDM_H */
+/**** ended inlining zstd_ldm.h ****/
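+
+/* Illustration (not from the upstream sources): long-distance matching is
+ * normally switched on through the public parameter API rather than by calling
+ * the functions declared above directly. A hedged sketch; the parameter names
+ * are taken from zstd.h and example_enableLdm is a made-up helper: */
+#if 0 /* illustrative sketch only, never compiled */
+#include <zstd.h>
+
+static void example_enableLdm(ZSTD_CCtx* cctx)
+{
+ /* enable the long-distance matcher and widen the window so far matches exist */
+ ZSTD_CCtx_setParameter(cctx, ZSTD_c_enableLongDistanceMatching, 1);
+ ZSTD_CCtx_setParameter(cctx, ZSTD_c_windowLog, 27); /* 128 MB window (example value) */
+}
+#endif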
+/**** skipping file: zstd_compress_superblock.h ****/
+
+
+/*-*************************************
+* Helper functions
+***************************************/
+/* ZSTD_compressBound()
+ * Note that the result from this function is only compatible with the "normal"
+ * full-block strategy.
+ * When there are a lot of small blocks due to frequent flushes in streaming mode,
+ * the overhead of block headers can make the compressed data larger than the
+ * return value of ZSTD_compressBound().
+ */
+size_t ZSTD_compressBound(size_t srcSize) {
+ return ZSTD_COMPRESSBOUND(srcSize);
+}
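+
+/* Illustration (not from the upstream sources): the usual caller-side pattern is
+ * to size the destination buffer with ZSTD_compressBound() so a single-frame
+ * ZSTD_compress() cannot fail for lack of space (the streaming caveat above still
+ * applies). example_compressAlloc is a hypothetical helper name. */
+#if 0 /* illustrative sketch only, never compiled */
+#include <stdlib.h>
+#include <zstd.h>
+
+static void* example_compressAlloc(const void* src, size_t srcSize, size_t* cSizePtr)
+{
+ size_t const dstCapacity = ZSTD_compressBound(srcSize);
+ void* const dst = malloc(dstCapacity);
+ if (dst == NULL) return NULL;
+ { size_t const cSize = ZSTD_compress(dst, dstCapacity, src, srcSize, 3 /* level */);
+ if (ZSTD_isError(cSize)) { free(dst); return NULL; }
+ *cSizePtr = cSize; /* actual compressed size, <= dstCapacity */
+ }
+ return dst;
+}
+#endif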
+
+
+/*-*************************************
+* Context memory management
+***************************************/
+struct ZSTD_CDict_s {
+ const void* dictContent;
+ size_t dictContentSize;
+ U32* entropyWorkspace; /* entropy workspace of HUF_WORKSPACE_SIZE bytes */
+ ZSTD_cwksp workspace;
+ ZSTD_matchState_t matchState;
+ ZSTD_compressedBlockState_t cBlockState;
+ ZSTD_customMem customMem;
+ U32 dictID;
+ int compressionLevel; /* 0 indicates that advanced API was used to select CDict params */
+}; /* typedef'd to ZSTD_CDict within "zstd.h" */
+
+ZSTD_CCtx* ZSTD_createCCtx(void)
+{
+ return ZSTD_createCCtx_advanced(ZSTD_defaultCMem);
+}
+
+static void ZSTD_initCCtx(ZSTD_CCtx* cctx, ZSTD_customMem memManager)
+{
+ assert(cctx != NULL);
+ memset(cctx, 0, sizeof(*cctx));
+ cctx->customMem = memManager;
+ cctx->bmi2 = ZSTD_cpuid_bmi2(ZSTD_cpuid());
+ { size_t const err = ZSTD_CCtx_reset(cctx, ZSTD_reset_parameters);
+ assert(!ZSTD_isError(err));
+ (void)err;
+ }
+}
+
+ZSTD_CCtx* ZSTD_createCCtx_advanced(ZSTD_customMem customMem)
+{
+ ZSTD_STATIC_ASSERT(zcss_init==0);
+ ZSTD_STATIC_ASSERT(ZSTD_CONTENTSIZE_UNKNOWN==(0ULL - 1));
+ if (!customMem.customAlloc ^ !customMem.customFree) return NULL;
+ { ZSTD_CCtx* const cctx = (ZSTD_CCtx*)ZSTD_malloc(sizeof(ZSTD_CCtx), customMem);
+ if (!cctx) return NULL;
+ ZSTD_initCCtx(cctx, customMem);
+ return cctx;
+ }
+}
+
+ZSTD_CCtx* ZSTD_initStaticCCtx(void* workspace, size_t workspaceSize)
+{
+ ZSTD_cwksp ws;
+ ZSTD_CCtx* cctx;
+ if (workspaceSize <= sizeof(ZSTD_CCtx)) return NULL; /* minimum size */
+ if ((size_t)workspace & 7) return NULL; /* must be 8-aligned */
+ ZSTD_cwksp_init(&ws, workspace, workspaceSize);
+
+ cctx = (ZSTD_CCtx*)ZSTD_cwksp_reserve_object(&ws, sizeof(ZSTD_CCtx));
+ if (cctx == NULL) return NULL;
+
+ memset(cctx, 0, sizeof(ZSTD_CCtx));
+ ZSTD_cwksp_move(&cctx->workspace, &ws);
+ cctx->staticSize = workspaceSize;
+
+ /* statically sized space. entropyWorkspace never moves (but prev/next block swap places) */
+ if (!ZSTD_cwksp_check_available(&cctx->workspace, HUF_WORKSPACE_SIZE + 2 * sizeof(ZSTD_compressedBlockState_t))) return NULL;
+ cctx->blockState.prevCBlock = (ZSTD_compressedBlockState_t*)ZSTD_cwksp_reserve_object(&cctx->workspace, sizeof(ZSTD_compressedBlockState_t));
+ cctx->blockState.nextCBlock = (ZSTD_compressedBlockState_t*)ZSTD_cwksp_reserve_object(&cctx->workspace, sizeof(ZSTD_compressedBlockState_t));
+ cctx->entropyWorkspace = (U32*)ZSTD_cwksp_reserve_object(&cctx->workspace, HUF_WORKSPACE_SIZE);
+ cctx->bmi2 = ZSTD_cpuid_bmi2(ZSTD_cpuid());
+ return cctx;
+}
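+
+/* Illustration (not from the upstream sources): ZSTD_initStaticCCtx() expects a
+ * caller-provided, 8-byte-aligned workspace that outlives the context. This
+ * hedged sketch sizes it with ZSTD_estimateCCtxSize() (a static-linking-only
+ * API); example_staticCCtxCompress is a made-up name. */
+#if 0 /* illustrative sketch only, never compiled */
+#define ZSTD_STATIC_LINKING_ONLY
+#include <stdlib.h>
+#include <zstd.h>
+
+static size_t example_staticCCtxCompress(void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize)
+{
+ size_t const wkspSize = ZSTD_estimateCCtxSize(3 /* compression level */);
+ void* const wksp = malloc(wkspSize); /* malloc() is at least 8-byte aligned */
+ ZSTD_CCtx* const cctx = wksp ? ZSTD_initStaticCCtx(wksp, wkspSize) : NULL;
+ size_t cSize = 0;
+ if (cctx != NULL)
+ cSize = ZSTD_compressCCtx(cctx, dst, dstCapacity, src, srcSize, 3);
+ free(wksp); /* a static cctx is never passed to ZSTD_freeCCtx() */
+ return cSize; /* 0 or ZSTD_isError() means failure */
+}
+#endif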
+
+/**
+ * Clears and frees all of the dictionaries in the CCtx.
+ */
+static void ZSTD_clearAllDicts(ZSTD_CCtx* cctx)
+{
+ ZSTD_free(cctx->localDict.dictBuffer, cctx->customMem);
+ ZSTD_freeCDict(cctx->localDict.cdict);
+ memset(&cctx->localDict, 0, sizeof(cctx->localDict));
+ memset(&cctx->prefixDict, 0, sizeof(cctx->prefixDict));
+ cctx->cdict = NULL;
+}
+
+static size_t ZSTD_sizeof_localDict(ZSTD_localDict dict)
+{
+ size_t const bufferSize = dict.dictBuffer != NULL ? dict.dictSize : 0;
+ size_t const cdictSize = ZSTD_sizeof_CDict(dict.cdict);
+ return bufferSize + cdictSize;
+}
+
+static void ZSTD_freeCCtxContent(ZSTD_CCtx* cctx)
+{
+ assert(cctx != NULL);
+ assert(cctx->staticSize == 0);
+ ZSTD_clearAllDicts(cctx);
+#ifdef ZSTD_MULTITHREAD
+ ZSTDMT_freeCCtx(cctx->mtctx); cctx->mtctx = NULL;
+#endif
+ ZSTD_cwksp_free(&cctx->workspace, cctx->customMem);
+}
+
+size_t ZSTD_freeCCtx(ZSTD_CCtx* cctx)
+{
+ if (cctx==NULL) return 0; /* support free on NULL */
+ RETURN_ERROR_IF(cctx->staticSize, memory_allocation,
+ "not compatible with static CCtx");
+ {
+ int cctxInWorkspace = ZSTD_cwksp_owns_buffer(&cctx->workspace, cctx);
+ ZSTD_freeCCtxContent(cctx);
+ if (!cctxInWorkspace) {
+ ZSTD_free(cctx, cctx->customMem);
+ }
+ }
+ return 0;
+}
+
+
+static size_t ZSTD_sizeof_mtctx(const ZSTD_CCtx* cctx)
+{
+#ifdef ZSTD_MULTITHREAD
+ return ZSTDMT_sizeof_CCtx(cctx->mtctx);
+#else
+ (void)cctx;
+ return 0;
+#endif
+}
+
+
+size_t ZSTD_sizeof_CCtx(const ZSTD_CCtx* cctx)
+{
+ if (cctx==NULL) return 0; /* support sizeof on NULL */
+ /* cctx may be in the workspace */
+ return (cctx->workspace.workspace == cctx ? 0 : sizeof(*cctx))
+ + ZSTD_cwksp_sizeof(&cctx->workspace)
+ + ZSTD_sizeof_localDict(cctx->localDict)
+ + ZSTD_sizeof_mtctx(cctx);
+}
+
+size_t ZSTD_sizeof_CStream(const ZSTD_CStream* zcs)
+{
+ return ZSTD_sizeof_CCtx(zcs); /* same object */
+}
+
+/* private API call, for dictBuilder only */
+const seqStore_t* ZSTD_getSeqStore(const ZSTD_CCtx* ctx) { return &(ctx->seqStore); }
+
+static ZSTD_CCtx_params ZSTD_makeCCtxParamsFromCParams(
+ ZSTD_compressionParameters cParams)
+{
+ ZSTD_CCtx_params cctxParams;
+ memset(&cctxParams, 0, sizeof(cctxParams));
+ cctxParams.cParams = cParams;
+ cctxParams.compressionLevel = ZSTD_CLEVEL_DEFAULT; /* should not matter, as all cParams are presumed properly defined */
+ assert(!ZSTD_checkCParams(cParams));
+ cctxParams.fParams.contentSizeFlag = 1;
+ return cctxParams;
+}
+
+static ZSTD_CCtx_params* ZSTD_createCCtxParams_advanced(
+ ZSTD_customMem customMem)
+{
+ ZSTD_CCtx_params* params;
+ if (!customMem.customAlloc ^ !customMem.customFree) return NULL;
+ params = (ZSTD_CCtx_params*)ZSTD_calloc(
+ sizeof(ZSTD_CCtx_params), customMem);
+ if (!params) { return NULL; }
+ params->customMem = customMem;
+ params->compressionLevel = ZSTD_CLEVEL_DEFAULT;
+ params->fParams.contentSizeFlag = 1;
+ return params;
+}
+
+ZSTD_CCtx_params* ZSTD_createCCtxParams(void)
+{
+ return ZSTD_createCCtxParams_advanced(ZSTD_defaultCMem);
+}
+
+size_t ZSTD_freeCCtxParams(ZSTD_CCtx_params* params)
+{
+ if (params == NULL) { return 0; }
+ ZSTD_free(params, params->customMem);
+ return 0;
+}
+
+size_t ZSTD_CCtxParams_reset(ZSTD_CCtx_params* params)
+{
+ return ZSTD_CCtxParams_init(params, ZSTD_CLEVEL_DEFAULT);
+}
+
+size_t ZSTD_CCtxParams_init(ZSTD_CCtx_params* cctxParams, int compressionLevel) {
+ RETURN_ERROR_IF(!cctxParams, GENERIC, "NULL pointer!");
+ memset(cctxParams, 0, sizeof(*cctxParams));
+ cctxParams->compressionLevel = compressionLevel;
+ cctxParams->fParams.contentSizeFlag = 1;
+ return 0;
+}
+
+size_t ZSTD_CCtxParams_init_advanced(ZSTD_CCtx_params* cctxParams, ZSTD_parameters params)
+{
+ RETURN_ERROR_IF(!cctxParams, GENERIC, "NULL pointer!");
+ FORWARD_IF_ERROR( ZSTD_checkCParams(params.cParams) , "");
+ memset(cctxParams, 0, sizeof(*cctxParams));
+ assert(!ZSTD_checkCParams(params.cParams));
+ cctxParams->cParams = params.cParams;
+ cctxParams->fParams = params.fParams;
+ cctxParams->compressionLevel = ZSTD_CLEVEL_DEFAULT; /* should not matter, as all cParams are presumed properly defined */
+ return 0;
+}
+
+/* ZSTD_assignParamsToCCtxParams() :
+ * params is presumed valid at this stage */
+static ZSTD_CCtx_params ZSTD_assignParamsToCCtxParams(
+ const ZSTD_CCtx_params* cctxParams, const ZSTD_parameters* params)
+{
+ ZSTD_CCtx_params ret = *cctxParams;
+ assert(!ZSTD_checkCParams(params->cParams));
+ ret.cParams = params->cParams;
+ ret.fParams = params->fParams;
+ ret.compressionLevel = ZSTD_CLEVEL_DEFAULT; /* should not matter, as all cParams are presumed properly defined */
+ return ret;
+}
+
+ZSTD_bounds ZSTD_cParam_getBounds(ZSTD_cParameter param)
+{
+ ZSTD_bounds bounds = { 0, 0, 0 };
+
+ switch(param)
+ {
+ case ZSTD_c_compressionLevel:
+ bounds.lowerBound = ZSTD_minCLevel();
+ bounds.upperBound = ZSTD_maxCLevel();
+ return bounds;
+
+ case ZSTD_c_windowLog:
+ bounds.lowerBound = ZSTD_WINDOWLOG_MIN;
+ bounds.upperBound = ZSTD_WINDOWLOG_MAX;
+ return bounds;
+
+ case ZSTD_c_hashLog:
+ bounds.lowerBound = ZSTD_HASHLOG_MIN;
+ bounds.upperBound = ZSTD_HASHLOG_MAX;
+ return bounds;
+
+ case ZSTD_c_chainLog:
+ bounds.lowerBound = ZSTD_CHAINLOG_MIN;
+ bounds.upperBound = ZSTD_CHAINLOG_MAX;
+ return bounds;
+
+ case ZSTD_c_searchLog:
+ bounds.lowerBound = ZSTD_SEARCHLOG_MIN;
+ bounds.upperBound = ZSTD_SEARCHLOG_MAX;
+ return bounds;
+
+ case ZSTD_c_minMatch:
+ bounds.lowerBound = ZSTD_MINMATCH_MIN;
+ bounds.upperBound = ZSTD_MINMATCH_MAX;
+ return bounds;
+
+ case ZSTD_c_targetLength:
+ bounds.lowerBound = ZSTD_TARGETLENGTH_MIN;
+ bounds.upperBound = ZSTD_TARGETLENGTH_MAX;
+ return bounds;
+
+ case ZSTD_c_strategy:
+ bounds.lowerBound = ZSTD_STRATEGY_MIN;
+ bounds.upperBound = ZSTD_STRATEGY_MAX;
+ return bounds;
+
+ case ZSTD_c_contentSizeFlag:
+ bounds.lowerBound = 0;
+ bounds.upperBound = 1;
+ return bounds;
+
+ case ZSTD_c_checksumFlag:
+ bounds.lowerBound = 0;
+ bounds.upperBound = 1;
+ return bounds;
+
+ case ZSTD_c_dictIDFlag:
+ bounds.lowerBound = 0;
+ bounds.upperBound = 1;
+ return bounds;
+
+ case ZSTD_c_nbWorkers:
+ bounds.lowerBound = 0;
+#ifdef ZSTD_MULTITHREAD
+ bounds.upperBound = ZSTDMT_NBWORKERS_MAX;
+#else
+ bounds.upperBound = 0;
+#endif
+ return bounds;
+
+ case ZSTD_c_jobSize:
+ bounds.lowerBound = 0;
+#ifdef ZSTD_MULTITHREAD
+ bounds.upperBound = ZSTDMT_JOBSIZE_MAX;
+#else
+ bounds.upperBound = 0;
+#endif
+ return bounds;
+
+ case ZSTD_c_overlapLog:
+#ifdef ZSTD_MULTITHREAD
+ bounds.lowerBound = ZSTD_OVERLAPLOG_MIN;
+ bounds.upperBound = ZSTD_OVERLAPLOG_MAX;
+#else
+ bounds.lowerBound = 0;
+ bounds.upperBound = 0;
+#endif
+ return bounds;
+
+ case ZSTD_c_enableLongDistanceMatching:
+ bounds.lowerBound = 0;
+ bounds.upperBound = 1;
+ return bounds;
+
+ case ZSTD_c_ldmHashLog:
+ bounds.lowerBound = ZSTD_LDM_HASHLOG_MIN;
+ bounds.upperBound = ZSTD_LDM_HASHLOG_MAX;
+ return bounds;
+
+ case ZSTD_c_ldmMinMatch:
+ bounds.lowerBound = ZSTD_LDM_MINMATCH_MIN;
+ bounds.upperBound = ZSTD_LDM_MINMATCH_MAX;
+ return bounds;
+
+ case ZSTD_c_ldmBucketSizeLog:
+ bounds.lowerBound = ZSTD_LDM_BUCKETSIZELOG_MIN;
+ bounds.upperBound = ZSTD_LDM_BUCKETSIZELOG_MAX;
+ return bounds;
+
+ case ZSTD_c_ldmHashRateLog:
+ bounds.lowerBound = ZSTD_LDM_HASHRATELOG_MIN;
+ bounds.upperBound = ZSTD_LDM_HASHRATELOG_MAX;
+ return bounds;
+
+ /* experimental parameters */
+ case ZSTD_c_rsyncable:
+ bounds.lowerBound = 0;
+ bounds.upperBound = 1;
+ return bounds;
+
+ case ZSTD_c_forceMaxWindow :
+ bounds.lowerBound = 0;
+ bounds.upperBound = 1;
+ return bounds;
+
+ case ZSTD_c_format:
+ ZSTD_STATIC_ASSERT(ZSTD_f_zstd1 < ZSTD_f_zstd1_magicless);
+ bounds.lowerBound = ZSTD_f_zstd1;
+ bounds.upperBound = ZSTD_f_zstd1_magicless; /* note : how to ensure at compile time that this is the highest value enum ? */
+ return bounds;
+
+ case ZSTD_c_forceAttachDict:
+ ZSTD_STATIC_ASSERT(ZSTD_dictDefaultAttach < ZSTD_dictForceCopy);
+ bounds.lowerBound = ZSTD_dictDefaultAttach;
+ bounds.upperBound = ZSTD_dictForceLoad; /* note : how to ensure at compile time that this is the highest value enum ? */
+ return bounds;
+
+ case ZSTD_c_literalCompressionMode:
+ ZSTD_STATIC_ASSERT(ZSTD_lcm_auto < ZSTD_lcm_huffman && ZSTD_lcm_huffman < ZSTD_lcm_uncompressed);
+ bounds.lowerBound = ZSTD_lcm_auto;
+ bounds.upperBound = ZSTD_lcm_uncompressed;
+ return bounds;
+
+ case ZSTD_c_targetCBlockSize:
+ bounds.lowerBound = ZSTD_TARGETCBLOCKSIZE_MIN;
+ bounds.upperBound = ZSTD_TARGETCBLOCKSIZE_MAX;
+ return bounds;
+
+ case ZSTD_c_srcSizeHint:
+ bounds.lowerBound = ZSTD_SRCSIZEHINT_MIN;
+ bounds.upperBound = ZSTD_SRCSIZEHINT_MAX;
+ return bounds;
+
+ default:
+ bounds.error = ERROR(parameter_unsupported);
+ return bounds;
+ }
+}
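+
+/* Illustration (not from the upstream sources): ZSTD_cParam_getBounds() is also
+ * part of the public API, so callers can validate a value before handing it to
+ * ZSTD_CCtx_setParameter(). example_levelInBounds is a hypothetical helper. */
+#if 0 /* illustrative sketch only, never compiled */
+#include <zstd.h>
+
+static int example_levelInBounds(int level)
+{
+ ZSTD_bounds const b = ZSTD_cParam_getBounds(ZSTD_c_compressionLevel);
+ return !ZSTD_isError(b.error) && level >= b.lowerBound && level <= b.upperBound;
+}
+#endif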
+
+/* ZSTD_cParam_clampBounds:
+ * Clamps the value into the bounded range.
+ */
+static size_t ZSTD_cParam_clampBounds(ZSTD_cParameter cParam, int* value)
+{
+ ZSTD_bounds const bounds = ZSTD_cParam_getBounds(cParam);
+ if (ZSTD_isError(bounds.error)) return bounds.error;
+ if (*value < bounds.lowerBound) *value = bounds.lowerBound;
+ if (*value > bounds.upperBound) *value = bounds.upperBound;
+ return 0;
+}
+
+#define BOUNDCHECK(cParam, val) { \
+ RETURN_ERROR_IF(!ZSTD_cParam_withinBounds(cParam,val), \
+ parameter_outOfBound, "Param out of bounds"); \
+}
+
+
+static int ZSTD_isUpdateAuthorized(ZSTD_cParameter param)
+{
+ switch(param)
+ {
+ case ZSTD_c_compressionLevel:
+ case ZSTD_c_hashLog:
+ case ZSTD_c_chainLog:
+ case ZSTD_c_searchLog:
+ case ZSTD_c_minMatch:
+ case ZSTD_c_targetLength:
+ case ZSTD_c_strategy:
+ return 1;
+
+ case ZSTD_c_format:
+ case ZSTD_c_windowLog:
+ case ZSTD_c_contentSizeFlag:
+ case ZSTD_c_checksumFlag:
+ case ZSTD_c_dictIDFlag:
+ case ZSTD_c_forceMaxWindow :
+ case ZSTD_c_nbWorkers:
+ case ZSTD_c_jobSize:
+ case ZSTD_c_overlapLog:
+ case ZSTD_c_rsyncable:
+ case ZSTD_c_enableLongDistanceMatching:
+ case ZSTD_c_ldmHashLog:
+ case ZSTD_c_ldmMinMatch:
+ case ZSTD_c_ldmBucketSizeLog:
+ case ZSTD_c_ldmHashRateLog:
+ case ZSTD_c_forceAttachDict:
+ case ZSTD_c_literalCompressionMode:
+ case ZSTD_c_targetCBlockSize:
+ case ZSTD_c_srcSizeHint:
+ default:
+ return 0;
+ }
+}
+
+size_t ZSTD_CCtx_setParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, int value)
+{
+ DEBUGLOG(4, "ZSTD_CCtx_setParameter (%i, %i)", (int)param, value);
+ if (cctx->streamStage != zcss_init) {
+ if (ZSTD_isUpdateAuthorized(param)) {
+ cctx->cParamsChanged = 1;
+ } else {
+ RETURN_ERROR(stage_wrong, "can only set params in ctx init stage");
+ } }
+
+ switch(param)
+ {
+ case ZSTD_c_nbWorkers:
+ RETURN_ERROR_IF((value!=0) && cctx->staticSize, parameter_unsupported,
+ "MT not compatible with static alloc");
+ break;
+
+ case ZSTD_c_compressionLevel:
+ case ZSTD_c_windowLog:
+ case ZSTD_c_hashLog:
+ case ZSTD_c_chainLog:
+ case ZSTD_c_searchLog:
+ case ZSTD_c_minMatch:
+ case ZSTD_c_targetLength:
+ case ZSTD_c_strategy:
+ case ZSTD_c_ldmHashRateLog:
+ case ZSTD_c_format:
+ case ZSTD_c_contentSizeFlag:
+ case ZSTD_c_checksumFlag:
+ case ZSTD_c_dictIDFlag:
+ case ZSTD_c_forceMaxWindow:
+ case ZSTD_c_forceAttachDict:
+ case ZSTD_c_literalCompressionMode:
+ case ZSTD_c_jobSize:
+ case ZSTD_c_overlapLog:
+ case ZSTD_c_rsyncable:
+ case ZSTD_c_enableLongDistanceMatching:
+ case ZSTD_c_ldmHashLog:
+ case ZSTD_c_ldmMinMatch:
+ case ZSTD_c_ldmBucketSizeLog:
+ case ZSTD_c_targetCBlockSize:
+ case ZSTD_c_srcSizeHint:
+ break;
+
+ default: RETURN_ERROR(parameter_unsupported, "unknown parameter");
+ }
+ return ZSTD_CCtxParams_setParameter(&cctx->requestedParams, param, value);
+}
+
+size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams,
+ ZSTD_cParameter param, int value)
+{
+ DEBUGLOG(4, "ZSTD_CCtxParams_setParameter (%i, %i)", (int)param, value);
+ switch(param)
+ {
+ case ZSTD_c_format :
+ BOUNDCHECK(ZSTD_c_format, value);
+ CCtxParams->format = (ZSTD_format_e)value;
+ return (size_t)CCtxParams->format;
+
+ case ZSTD_c_compressionLevel : {
+ FORWARD_IF_ERROR(ZSTD_cParam_clampBounds(param, &value), "");
+ if (value) { /* 0 : does not change current level */
+ CCtxParams->compressionLevel = value;
+ }
+ if (CCtxParams->compressionLevel >= 0) return (size_t)CCtxParams->compressionLevel;
+ return 0; /* return type (size_t) cannot represent negative values */
+ }
+
+ case ZSTD_c_windowLog :
+ if (value!=0) /* 0 => use default */
+ BOUNDCHECK(ZSTD_c_windowLog, value);
+ CCtxParams->cParams.windowLog = (U32)value;
+ return CCtxParams->cParams.windowLog;
+
+ case ZSTD_c_hashLog :
+ if (value!=0) /* 0 => use default */
+ BOUNDCHECK(ZSTD_c_hashLog, value);
+ CCtxParams->cParams.hashLog = (U32)value;
+ return CCtxParams->cParams.hashLog;
+
+ case ZSTD_c_chainLog :
+ if (value!=0) /* 0 => use default */
+ BOUNDCHECK(ZSTD_c_chainLog, value);
+ CCtxParams->cParams.chainLog = (U32)value;
+ return CCtxParams->cParams.chainLog;
+
+ case ZSTD_c_searchLog :
+ if (value!=0) /* 0 => use default */
+ BOUNDCHECK(ZSTD_c_searchLog, value);
+ CCtxParams->cParams.searchLog = (U32)value;
+ return (size_t)value;
+
+ case ZSTD_c_minMatch :
+ if (value!=0) /* 0 => use default */
+ BOUNDCHECK(ZSTD_c_minMatch, value);
+ CCtxParams->cParams.minMatch = value;
+ return CCtxParams->cParams.minMatch;
+
+ case ZSTD_c_targetLength :
+ BOUNDCHECK(ZSTD_c_targetLength, value);
+ CCtxParams->cParams.targetLength = value;
+ return CCtxParams->cParams.targetLength;
+
+ case ZSTD_c_strategy :
+ if (value!=0) /* 0 => use default */
+ BOUNDCHECK(ZSTD_c_strategy, value);
+ CCtxParams->cParams.strategy = (ZSTD_strategy)value;
+ return (size_t)CCtxParams->cParams.strategy;
+
+ case ZSTD_c_contentSizeFlag :
+ /* Content size written in frame header _when known_ (default:1) */
+ DEBUGLOG(4, "set content size flag = %u", (value!=0));
+ CCtxParams->fParams.contentSizeFlag = value != 0;
+ return CCtxParams->fParams.contentSizeFlag;
+
+ case ZSTD_c_checksumFlag :
+ /* A 32-bits content checksum will be calculated and written at end of frame (default:0) */
+ CCtxParams->fParams.checksumFlag = value != 0;
+ return CCtxParams->fParams.checksumFlag;
+
+ case ZSTD_c_dictIDFlag : /* When applicable, dictionary's dictID is provided in frame header (default:1) */
+ DEBUGLOG(4, "set dictIDFlag = %u", (value!=0));
+ CCtxParams->fParams.noDictIDFlag = !value;
+ return !CCtxParams->fParams.noDictIDFlag;
+
+ case ZSTD_c_forceMaxWindow :
+ CCtxParams->forceWindow = (value != 0);
+ return CCtxParams->forceWindow;
+
+ case ZSTD_c_forceAttachDict : {
+ const ZSTD_dictAttachPref_e pref = (ZSTD_dictAttachPref_e)value;
+ BOUNDCHECK(ZSTD_c_forceAttachDict, pref);
+ CCtxParams->attachDictPref = pref;
+ return CCtxParams->attachDictPref;
+ }
+
+ case ZSTD_c_literalCompressionMode : {
+ const ZSTD_literalCompressionMode_e lcm = (ZSTD_literalCompressionMode_e)value;
+ BOUNDCHECK(ZSTD_c_literalCompressionMode, lcm);
+ CCtxParams->literalCompressionMode = lcm;
+ return CCtxParams->literalCompressionMode;
+ }
+
+ case ZSTD_c_nbWorkers :
+#ifndef ZSTD_MULTITHREAD
+ RETURN_ERROR_IF(value!=0, parameter_unsupported, "not compiled with multithreading");
+ return 0;
+#else
+ FORWARD_IF_ERROR(ZSTD_cParam_clampBounds(param, &value), "");
+ CCtxParams->nbWorkers = value;
+ return CCtxParams->nbWorkers;
+#endif
+
+ case ZSTD_c_jobSize :
+#ifndef ZSTD_MULTITHREAD
+ RETURN_ERROR_IF(value!=0, parameter_unsupported, "not compiled with multithreading");
+ return 0;
+#else
+ /* Adjust to the minimum non-default value. */
+ if (value != 0 && value < ZSTDMT_JOBSIZE_MIN)
+ value = ZSTDMT_JOBSIZE_MIN;
+ FORWARD_IF_ERROR(ZSTD_cParam_clampBounds(param, &value), "");
+ assert(value >= 0);
+ CCtxParams->jobSize = value;
+ return CCtxParams->jobSize;
+#endif
+
+ case ZSTD_c_overlapLog :
+#ifndef ZSTD_MULTITHREAD
+ RETURN_ERROR_IF(value!=0, parameter_unsupported, "not compiled with multithreading");
+ return 0;
+#else
+ FORWARD_IF_ERROR(ZSTD_cParam_clampBounds(ZSTD_c_overlapLog, &value), "");
+ CCtxParams->overlapLog = value;
+ return CCtxParams->overlapLog;
+#endif
+
+ case ZSTD_c_rsyncable :
+#ifndef ZSTD_MULTITHREAD
+ RETURN_ERROR_IF(value!=0, parameter_unsupported, "not compiled with multithreading");
+ return 0;
+#else
+ FORWARD_IF_ERROR(ZSTD_cParam_clampBounds(ZSTD_c_overlapLog, &value), "");
+ CCtxParams->rsyncable = value;
+ return CCtxParams->rsyncable;
+#endif
+
+ case ZSTD_c_enableLongDistanceMatching :
+ CCtxParams->ldmParams.enableLdm = (value!=0);
+ return CCtxParams->ldmParams.enableLdm;
+
+ case ZSTD_c_ldmHashLog :
+ if (value!=0) /* 0 ==> auto */
+ BOUNDCHECK(ZSTD_c_ldmHashLog, value);
+ CCtxParams->ldmParams.hashLog = value;
+ return CCtxParams->ldmParams.hashLog;
+
+ case ZSTD_c_ldmMinMatch :
+ if (value!=0) /* 0 ==> default */
+ BOUNDCHECK(ZSTD_c_ldmMinMatch, value);
+ CCtxParams->ldmParams.minMatchLength = value;
+ return CCtxParams->ldmParams.minMatchLength;
+
+ case ZSTD_c_ldmBucketSizeLog :
+ if (value!=0) /* 0 ==> default */
+ BOUNDCHECK(ZSTD_c_ldmBucketSizeLog, value);
+ CCtxParams->ldmParams.bucketSizeLog = value;
+ return CCtxParams->ldmParams.bucketSizeLog;
+
+ case ZSTD_c_ldmHashRateLog :
+ RETURN_ERROR_IF(value > ZSTD_WINDOWLOG_MAX - ZSTD_HASHLOG_MIN,
+ parameter_outOfBound, "Param out of bounds!");
+ CCtxParams->ldmParams.hashRateLog = value;
+ return CCtxParams->ldmParams.hashRateLog;
+
+ case ZSTD_c_targetCBlockSize :
+ if (value!=0) /* 0 ==> default */
+ BOUNDCHECK(ZSTD_c_targetCBlockSize, value);
+ CCtxParams->targetCBlockSize = value;
+ return CCtxParams->targetCBlockSize;
+
+ case ZSTD_c_srcSizeHint :
+ if (value!=0) /* 0 ==> default */
+ BOUNDCHECK(ZSTD_c_srcSizeHint, value);
+ CCtxParams->srcSizeHint = value;
+ return CCtxParams->srcSizeHint;
+
+ default: RETURN_ERROR(parameter_unsupported, "unknown parameter");
+ }
+}
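+
+/* Illustration (not from the upstream sources): the setter above backs the public
+ * ZSTD_CCtx_setParameter() entry point. A minimal end-to-end use of the advanced
+ * one-shot API; example_compressChecksummed is a made-up name. */
+#if 0 /* illustrative sketch only, never compiled */
+#include <zstd.h>
+
+static size_t example_compressChecksummed(void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize)
+{
+ ZSTD_CCtx* const cctx = ZSTD_createCCtx();
+ size_t cSize = 0;
+ if (cctx == NULL) return 0; /* treat as failure */
+ ZSTD_CCtx_setParameter(cctx, ZSTD_c_compressionLevel, 19);
+ ZSTD_CCtx_setParameter(cctx, ZSTD_c_checksumFlag, 1); /* append a content checksum */
+ cSize = ZSTD_compress2(cctx, dst, dstCapacity, src, srcSize);
+ ZSTD_freeCCtx(cctx);
+ return cSize; /* check with ZSTD_isError() */
+}
+#endif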
+
+size_t ZSTD_CCtx_getParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, int* value)
+{
+ return ZSTD_CCtxParams_getParameter(&cctx->requestedParams, param, value);
+}
+
+size_t ZSTD_CCtxParams_getParameter(
+ ZSTD_CCtx_params* CCtxParams, ZSTD_cParameter param, int* value)
+{
+ switch(param)
+ {
+ case ZSTD_c_format :
+ *value = CCtxParams->format;
+ break;
+ case ZSTD_c_compressionLevel :
+ *value = CCtxParams->compressionLevel;
+ break;
+ case ZSTD_c_windowLog :
+ *value = (int)CCtxParams->cParams.windowLog;
+ break;
+ case ZSTD_c_hashLog :
+ *value = (int)CCtxParams->cParams.hashLog;
+ break;
+ case ZSTD_c_chainLog :
+ *value = (int)CCtxParams->cParams.chainLog;
+ break;
+ case ZSTD_c_searchLog :
+ *value = CCtxParams->cParams.searchLog;
+ break;
+ case ZSTD_c_minMatch :
+ *value = CCtxParams->cParams.minMatch;
+ break;
+ case ZSTD_c_targetLength :
+ *value = CCtxParams->cParams.targetLength;
+ break;
+ case ZSTD_c_strategy :
+ *value = (unsigned)CCtxParams->cParams.strategy;
+ break;
+ case ZSTD_c_contentSizeFlag :
+ *value = CCtxParams->fParams.contentSizeFlag;
+ break;
+ case ZSTD_c_checksumFlag :
+ *value = CCtxParams->fParams.checksumFlag;
+ break;
+ case ZSTD_c_dictIDFlag :
+ *value = !CCtxParams->fParams.noDictIDFlag;
+ break;
+ case ZSTD_c_forceMaxWindow :
+ *value = CCtxParams->forceWindow;
+ break;
+ case ZSTD_c_forceAttachDict :
+ *value = CCtxParams->attachDictPref;
+ break;
+ case ZSTD_c_literalCompressionMode :
+ *value = CCtxParams->literalCompressionMode;
+ break;
+ case ZSTD_c_nbWorkers :
+#ifndef ZSTD_MULTITHREAD
+ assert(CCtxParams->nbWorkers == 0);
+#endif
+ *value = CCtxParams->nbWorkers;
+ break;
+ case ZSTD_c_jobSize :
+#ifndef ZSTD_MULTITHREAD
+ RETURN_ERROR(parameter_unsupported, "not compiled with multithreading");
+#else
+ assert(CCtxParams->jobSize <= INT_MAX);
+ *value = (int)CCtxParams->jobSize;
+ break;
+#endif
+ case ZSTD_c_overlapLog :
+#ifndef ZSTD_MULTITHREAD
+ RETURN_ERROR(parameter_unsupported, "not compiled with multithreading");
+#else
+ *value = CCtxParams->overlapLog;
+ break;
+#endif
+ case ZSTD_c_rsyncable :
+#ifndef ZSTD_MULTITHREAD
+ RETURN_ERROR(parameter_unsupported, "not compiled with multithreading");
+#else
+ *value = CCtxParams->rsyncable;
+ break;
+#endif
+ case ZSTD_c_enableLongDistanceMatching :
+ *value = CCtxParams->ldmParams.enableLdm;
+ break;
+ case ZSTD_c_ldmHashLog :
+ *value = CCtxParams->ldmParams.hashLog;
+ break;
+ case ZSTD_c_ldmMinMatch :
+ *value = CCtxParams->ldmParams.minMatchLength;
+ break;
+ case ZSTD_c_ldmBucketSizeLog :
+ *value = CCtxParams->ldmParams.bucketSizeLog;
+ break;
+ case ZSTD_c_ldmHashRateLog :
+ *value = CCtxParams->ldmParams.hashRateLog;
+ break;
+ case ZSTD_c_targetCBlockSize :
+ *value = (int)CCtxParams->targetCBlockSize;
+ break;
+ case ZSTD_c_srcSizeHint :
+ *value = (int)CCtxParams->srcSizeHint;
+ break;
+ default: RETURN_ERROR(parameter_unsupported, "unknown parameter");
+ }
+ return 0;
+}
+
+/** ZSTD_CCtx_setParametersUsingCCtxParams() :
+ * just copies `params` into `cctx`;
+ * no action is performed, parameters are merely stored.
+ * If ZSTDMT is enabled, parameters are pushed to cctx->mtctx.
+ * This is possible even if a compression is ongoing,
+ * in which case the new parameters will be applied on the fly, starting with the next compression job.
+ */
+size_t ZSTD_CCtx_setParametersUsingCCtxParams(
+ ZSTD_CCtx* cctx, const ZSTD_CCtx_params* params)
+{
+ DEBUGLOG(4, "ZSTD_CCtx_setParametersUsingCCtxParams");
+ RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong,
+ "The context is in the wrong stage!");
+ RETURN_ERROR_IF(cctx->cdict, stage_wrong,
+ "Can't override parameters with cdict attached (some must "
+ "be inherited from the cdict).");
+
+ cctx->requestedParams = *params;
+ return 0;
+}
+
+ZSTDLIB_API size_t ZSTD_CCtx_setPledgedSrcSize(ZSTD_CCtx* cctx, unsigned long long pledgedSrcSize)
+{
+ DEBUGLOG(4, "ZSTD_CCtx_setPledgedSrcSize to %u bytes", (U32)pledgedSrcSize);
+ RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong,
+ "Can't set pledgedSrcSize when not in init stage.");
+ cctx->pledgedSrcSizePlusOne = pledgedSrcSize+1;
+ return 0;
+}
+
+/**
+ * Initializes the local dict using the requested parameters.
+ * NOTE: This does not use the pledged src size, because it may be used for more
+ * than one compression.
+ */
+static size_t ZSTD_initLocalDict(ZSTD_CCtx* cctx)
+{
+ ZSTD_localDict* const dl = &cctx->localDict;
+ ZSTD_compressionParameters const cParams = ZSTD_getCParamsFromCCtxParams(
+ &cctx->requestedParams, ZSTD_CONTENTSIZE_UNKNOWN, dl->dictSize);
+ if (dl->dict == NULL) {
+ /* No local dictionary. */
+ assert(dl->dictBuffer == NULL);
+ assert(dl->cdict == NULL);
+ assert(dl->dictSize == 0);
+ return 0;
+ }
+ if (dl->cdict != NULL) {
+ assert(cctx->cdict == dl->cdict);
+ /* Local dictionary already initialized. */
+ return 0;
+ }
+ assert(dl->dictSize > 0);
+ assert(cctx->cdict == NULL);
+ assert(cctx->prefixDict.dict == NULL);
+
+ dl->cdict = ZSTD_createCDict_advanced(
+ dl->dict,
+ dl->dictSize,
+ ZSTD_dlm_byRef,
+ dl->dictContentType,
+ cParams,
+ cctx->customMem);
+ RETURN_ERROR_IF(!dl->cdict, memory_allocation, "ZSTD_createCDict_advanced failed");
+ cctx->cdict = dl->cdict;
+ return 0;
+}
+
+size_t ZSTD_CCtx_loadDictionary_advanced(
+ ZSTD_CCtx* cctx, const void* dict, size_t dictSize,
+ ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_dictContentType_e dictContentType)
+{
+ RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong,
+ "Can't load a dictionary when ctx is not in init stage.");
+ RETURN_ERROR_IF(cctx->staticSize, memory_allocation,
+ "no malloc for static CCtx");
+ DEBUGLOG(4, "ZSTD_CCtx_loadDictionary_advanced (size: %u)", (U32)dictSize);
+ ZSTD_clearAllDicts(cctx); /* in case one already exists */
+ if (dict == NULL || dictSize == 0) /* no dictionary mode */
+ return 0;
+ if (dictLoadMethod == ZSTD_dlm_byRef) {
+ cctx->localDict.dict = dict;
+ } else {
+ void* dictBuffer = ZSTD_malloc(dictSize, cctx->customMem);
+ RETURN_ERROR_IF(!dictBuffer, memory_allocation, "NULL pointer!");
+ memcpy(dictBuffer, dict, dictSize);
+ cctx->localDict.dictBuffer = dictBuffer;
+ cctx->localDict.dict = dictBuffer;
+ }
+ cctx->localDict.dictSize = dictSize;
+ cctx->localDict.dictContentType = dictContentType;
+ return 0;
+}
+
+ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary_byReference(
+ ZSTD_CCtx* cctx, const void* dict, size_t dictSize)
+{
+ return ZSTD_CCtx_loadDictionary_advanced(
+ cctx, dict, dictSize, ZSTD_dlm_byRef, ZSTD_dct_auto);
+}
+
+ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary(ZSTD_CCtx* cctx, const void* dict, size_t dictSize)
+{
+ return ZSTD_CCtx_loadDictionary_advanced(
+ cctx, dict, dictSize, ZSTD_dlm_byCopy, ZSTD_dct_auto);
+}
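+
+/* Editor's illustrative sketch (not part of upstream zstd) :
+ * the two load methods trade a copy for a lifetime requirement :
+ * ZSTD_dlm_byCopy duplicates `dict` into the cctx, while ZSTD_dlm_byRef only
+ * keeps a pointer, so the caller must keep `dict` valid while it is in use :
+ *
+ *     ZSTD_CCtx_loadDictionary_byReference(cctx, dictBuf, dictBufSize);
+ *     // dictBuf must outlive every compression that may use it
+ */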
+
+
+size_t ZSTD_CCtx_refCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict)
+{
+ RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong,
+ "Can't ref a dict when ctx not in init stage.");
+ /* Free the existing local cdict (if any) to save memory. */
+ ZSTD_clearAllDicts(cctx);
+ cctx->cdict = cdict;
+ return 0;
+}
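+
+/* Editor's illustrative sketch (not part of upstream zstd) :
+ * a CDict is typically digested once, then referenced for many compressions :
+ *
+ *     ZSTD_CDict* const cdict = ZSTD_createCDict(dictBuf, dictBufSize, level);
+ *     ZSTD_CCtx_refCDict(cctx, cdict);
+ *     ZSTD_compress2(cctx, dst, dstCapacity, src, srcSize);
+ *     // cdict stays referenced for subsequent compressions until cleared
+ */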
+
+size_t ZSTD_CCtx_refPrefix(ZSTD_CCtx* cctx, const void* prefix, size_t prefixSize)
+{
+ return ZSTD_CCtx_refPrefix_advanced(cctx, prefix, prefixSize, ZSTD_dct_rawContent);
+}
+
+size_t ZSTD_CCtx_refPrefix_advanced(
+ ZSTD_CCtx* cctx, const void* prefix, size_t prefixSize, ZSTD_dictContentType_e dictContentType)
+{
+ RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong,
+ "Can't ref a prefix when ctx not in init stage.");
+ ZSTD_clearAllDicts(cctx);
+ if (prefix != NULL && prefixSize > 0) {
+ cctx->prefixDict.dict = prefix;
+ cctx->prefixDict.dictSize = prefixSize;
+ cctx->prefixDict.dictContentType = dictContentType;
+ }
+ return 0;
+}
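+
+/* Editor's illustrative sketch (not part of upstream zstd) :
+ * a prefix is a single-use dictionary : it only applies to the next frame and
+ * must be referenced again (and remain valid in memory) for each compression :
+ *
+ *     ZSTD_CCtx_refPrefix(cctx, prevChunk, prevChunkSize);
+ *     ZSTD_compress2(cctx, dst, dstCapacity, src, srcSize);
+ *     // decompression must reference the same prefix to regenerate the data
+ */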
+
+/*! ZSTD_CCtx_reset() :
+ *  Also discards any loaded dictionary (when parameters are reset) */
+size_t ZSTD_CCtx_reset(ZSTD_CCtx* cctx, ZSTD_ResetDirective reset)
+{
+ if ( (reset == ZSTD_reset_session_only)
+ || (reset == ZSTD_reset_session_and_parameters) ) {
+ cctx->streamStage = zcss_init;
+ cctx->pledgedSrcSizePlusOne = 0;
+ }
+ if ( (reset == ZSTD_reset_parameters)
+ || (reset == ZSTD_reset_session_and_parameters) ) {
+ RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong,
+ "Can't reset parameters only when not in init stage.");
+ ZSTD_clearAllDicts(cctx);
+ return ZSTD_CCtxParams_reset(&cctx->requestedParams);
+ }
+ return 0;
+}
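+
+/* Editor's illustrative sketch (not part of upstream zstd) :
+ * typical use when recycling a context between unrelated compressions :
+ *
+ *     ZSTD_CCtx_reset(cctx, ZSTD_reset_session_and_parameters);
+ *     ZSTD_CCtx_setParameter(cctx, ZSTD_c_compressionLevel, 3);
+ *     ZSTD_compress2(cctx, dst, dstCapacity, src, srcSize);
+ *
+ * ZSTD_reset_session_only keeps parameters and any referenced dictionary,
+ * and only aborts the current frame.
+ */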
+
+
+/** ZSTD_checkCParams() :
+    checks that CParam values remain within the authorized range.
+ @return : 0, or an error code if one value is beyond authorized range */
+size_t ZSTD_checkCParams(ZSTD_compressionParameters cParams)
+{
+ BOUNDCHECK(ZSTD_c_windowLog, (int)cParams.windowLog);
+ BOUNDCHECK(ZSTD_c_chainLog, (int)cParams.chainLog);
+ BOUNDCHECK(ZSTD_c_hashLog, (int)cParams.hashLog);
+ BOUNDCHECK(ZSTD_c_searchLog, (int)cParams.searchLog);
+ BOUNDCHECK(ZSTD_c_minMatch, (int)cParams.minMatch);
+ BOUNDCHECK(ZSTD_c_targetLength,(int)cParams.targetLength);
+ BOUNDCHECK(ZSTD_c_strategy, cParams.strategy);
+ return 0;
+}
+
+/** ZSTD_clampCParams() :
+ *  clamp CParam values into the valid range.
+ * @return : valid CParams */
+static ZSTD_compressionParameters
+ZSTD_clampCParams(ZSTD_compressionParameters cParams)
+{
+# define CLAMP_TYPE(cParam, val, type) { \
+ ZSTD_bounds const bounds = ZSTD_cParam_getBounds(cParam); \
+ if ((int)val<bounds.lowerBound) val=(type)bounds.lowerBound; \
+ else if ((int)val>bounds.upperBound) val=(type)bounds.upperBound; \
+ }
+# define CLAMP(cParam, val) CLAMP_TYPE(cParam, val, unsigned)
+ CLAMP(ZSTD_c_windowLog, cParams.windowLog);
+ CLAMP(ZSTD_c_chainLog, cParams.chainLog);
+ CLAMP(ZSTD_c_hashLog, cParams.hashLog);
+ CLAMP(ZSTD_c_searchLog, cParams.searchLog);
+ CLAMP(ZSTD_c_minMatch, cParams.minMatch);
+ CLAMP(ZSTD_c_targetLength,cParams.targetLength);
+ CLAMP_TYPE(ZSTD_c_strategy,cParams.strategy, ZSTD_strategy);
+ return cParams;
+}
+
+/** ZSTD_cycleLog() :
+ * condition for correct operation : hashLog > 1 */
+U32 ZSTD_cycleLog(U32 hashLog, ZSTD_strategy strat)
+{
+ U32 const btScale = ((U32)strat >= (U32)ZSTD_btlazy2);
+ return hashLog - btScale;
+}
+
+/** ZSTD_adjustCParams_internal() :
+ * optimize `cPar` for a specified input (`srcSize` and `dictSize`).
+ * mostly downsize to reduce memory consumption and initialization latency.
+ * `srcSize` can be ZSTD_CONTENTSIZE_UNKNOWN when not known.
+ * note : `srcSize==0` means 0!
+ * condition : cPar is presumed validated (can be checked using ZSTD_checkCParams()). */
+static ZSTD_compressionParameters
+ZSTD_adjustCParams_internal(ZSTD_compressionParameters cPar,
+ unsigned long long srcSize,
+ size_t dictSize)
+{
+ static const U64 minSrcSize = 513; /* (1<<9) + 1 */
+ static const U64 maxWindowResize = 1ULL << (ZSTD_WINDOWLOG_MAX-1);
+ assert(ZSTD_checkCParams(cPar)==0);
+
+ if (dictSize && srcSize == ZSTD_CONTENTSIZE_UNKNOWN)
+ srcSize = minSrcSize;
+
+ /* resize windowLog if input is small enough, to use less memory */
+ if ( (srcSize < maxWindowResize)
+ && (dictSize < maxWindowResize) ) {
+ U32 const tSize = (U32)(srcSize + dictSize);
+ static U32 const hashSizeMin = 1 << ZSTD_HASHLOG_MIN;
+ U32 const srcLog = (tSize < hashSizeMin) ? ZSTD_HASHLOG_MIN :
+ ZSTD_highbit32(tSize-1) + 1;
+ if (cPar.windowLog > srcLog) cPar.windowLog = srcLog;
+ }
+ if (cPar.hashLog > cPar.windowLog+1) cPar.hashLog = cPar.windowLog+1;
+ { U32 const cycleLog = ZSTD_cycleLog(cPar.chainLog, cPar.strategy);
+ if (cycleLog > cPar.windowLog)
+ cPar.chainLog -= (cycleLog - cPar.windowLog);
+ }
+
+ if (cPar.windowLog < ZSTD_WINDOWLOG_ABSOLUTEMIN)
+ cPar.windowLog = ZSTD_WINDOWLOG_ABSOLUTEMIN; /* minimum wlog required for valid frame header */
+
+ return cPar;
+}
+
+ZSTD_compressionParameters
+ZSTD_adjustCParams(ZSTD_compressionParameters cPar,
+ unsigned long long srcSize,
+ size_t dictSize)
+{
+ cPar = ZSTD_clampCParams(cPar); /* resulting cPar is necessarily valid (all parameters within range) */
+ if (srcSize == 0) srcSize = ZSTD_CONTENTSIZE_UNKNOWN;
+ return ZSTD_adjustCParams_internal(cPar, srcSize, dictSize);
+}
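+
+/* Editor's illustrative sketch (not part of upstream zstd) :
+ * this pairs naturally with ZSTD_getCParams(), e.g. to downsize generic
+ * parameters when the input is known to be small :
+ *
+ *     ZSTD_compressionParameters cp = ZSTD_getCParams(level, srcSize, dictSize);
+ *     cp = ZSTD_adjustCParams(cp, srcSize, dictSize);
+ */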
+
+static ZSTD_compressionParameters ZSTD_getCParams_internal(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize);
+static ZSTD_parameters ZSTD_getParams_internal(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize);
+
+ZSTD_compressionParameters ZSTD_getCParamsFromCCtxParams(
+ const ZSTD_CCtx_params* CCtxParams, U64 srcSizeHint, size_t dictSize)
+{
+ ZSTD_compressionParameters cParams;
+ if (srcSizeHint == ZSTD_CONTENTSIZE_UNKNOWN && CCtxParams->srcSizeHint > 0) {
+ srcSizeHint = CCtxParams->srcSizeHint;
+ }
+ cParams = ZSTD_getCParams_internal(CCtxParams->compressionLevel, srcSizeHint, dictSize);
+ if (CCtxParams->ldmParams.enableLdm) cParams.windowLog = ZSTD_LDM_DEFAULT_WINDOW_LOG;
+ if (CCtxParams->cParams.windowLog) cParams.windowLog = CCtxParams->cParams.windowLog;
+ if (CCtxParams->cParams.hashLog) cParams.hashLog = CCtxParams->cParams.hashLog;
+ if (CCtxParams->cParams.chainLog) cParams.chainLog = CCtxParams->cParams.chainLog;
+ if (CCtxParams->cParams.searchLog) cParams.searchLog = CCtxParams->cParams.searchLog;
+ if (CCtxParams->cParams.minMatch) cParams.minMatch = CCtxParams->cParams.minMatch;
+ if (CCtxParams->cParams.targetLength) cParams.targetLength = CCtxParams->cParams.targetLength;
+ if (CCtxParams->cParams.strategy) cParams.strategy = CCtxParams->cParams.strategy;
+ assert(!ZSTD_checkCParams(cParams));
+ /* srcSizeHint == 0 means 0 */
+ return ZSTD_adjustCParams_internal(cParams, srcSizeHint, dictSize);
+}
+
+static size_t
+ZSTD_sizeof_matchState(const ZSTD_compressionParameters* const cParams,
+ const U32 forCCtx)
+{
+ size_t const chainSize = (cParams->strategy == ZSTD_fast) ? 0 : ((size_t)1 << cParams->chainLog);
+ size_t const hSize = ((size_t)1) << cParams->hashLog;
+ U32 const hashLog3 = (forCCtx && cParams->minMatch==3) ? MIN(ZSTD_HASHLOG3_MAX, cParams->windowLog) : 0;
+ size_t const h3Size = hashLog3 ? ((size_t)1) << hashLog3 : 0;
+ /* We don't use ZSTD_cwksp_alloc_size() here because the tables aren't
+ * surrounded by redzones in ASAN. */
+ size_t const tableSpace = chainSize * sizeof(U32)
+ + hSize * sizeof(U32)
+ + h3Size * sizeof(U32);
+ size_t const optPotentialSpace =
+ ZSTD_cwksp_alloc_size((MaxML+1) * sizeof(U32))
+ + ZSTD_cwksp_alloc_size((MaxLL+1) * sizeof(U32))
+ + ZSTD_cwksp_alloc_size((MaxOff+1) * sizeof(U32))
+ + ZSTD_cwksp_alloc_size((1<<Litbits) * sizeof(U32))
+ + ZSTD_cwksp_alloc_size((ZSTD_OPT_NUM+1) * sizeof(ZSTD_match_t))
+ + ZSTD_cwksp_alloc_size((ZSTD_OPT_NUM+1) * sizeof(ZSTD_optimal_t));
+ size_t const optSpace = (forCCtx && (cParams->strategy >= ZSTD_btopt))
+ ? optPotentialSpace
+ : 0;
+ DEBUGLOG(4, "chainSize: %u - hSize: %u - h3Size: %u",
+ (U32)chainSize, (U32)hSize, (U32)h3Size);
+ return tableSpace + optSpace;
+}
+
+size_t ZSTD_estimateCCtxSize_usingCCtxParams(const ZSTD_CCtx_params* params)
+{
+ RETURN_ERROR_IF(params->nbWorkers > 0, GENERIC, "Estimate CCtx size is supported for single-threaded compression only.");
+ { ZSTD_compressionParameters const cParams =
+ ZSTD_getCParamsFromCCtxParams(params, ZSTD_CONTENTSIZE_UNKNOWN, 0);
+ size_t const blockSize = MIN(ZSTD_BLOCKSIZE_MAX, (size_t)1 << cParams.windowLog);
+ U32 const divider = (cParams.minMatch==3) ? 3 : 4;
+ size_t const maxNbSeq = blockSize / divider;
+ size_t const tokenSpace = ZSTD_cwksp_alloc_size(WILDCOPY_OVERLENGTH + blockSize)
+ + ZSTD_cwksp_alloc_size(maxNbSeq * sizeof(seqDef))
+ + 3 * ZSTD_cwksp_alloc_size(maxNbSeq * sizeof(BYTE));
+ size_t const entropySpace = ZSTD_cwksp_alloc_size(HUF_WORKSPACE_SIZE);
+ size_t const blockStateSpace = 2 * ZSTD_cwksp_alloc_size(sizeof(ZSTD_compressedBlockState_t));
+ size_t const matchStateSize = ZSTD_sizeof_matchState(&cParams, /* forCCtx */ 1);
+
+ size_t const ldmSpace = ZSTD_ldm_getTableSize(params->ldmParams);
+ size_t const ldmSeqSpace = ZSTD_cwksp_alloc_size(ZSTD_ldm_getMaxNbSeq(params->ldmParams, blockSize) * sizeof(rawSeq));
+
+ /* estimateCCtxSize is for one-shot compression. So no buffers should
+ * be needed. However, we still allocate two 0-sized buffers, which can
+ * take space under ASAN. */
+ size_t const bufferSpace = ZSTD_cwksp_alloc_size(0)
+ + ZSTD_cwksp_alloc_size(0);
+
+ size_t const cctxSpace = ZSTD_cwksp_alloc_size(sizeof(ZSTD_CCtx));
+
+ size_t const neededSpace =
+ cctxSpace +
+ entropySpace +
+ blockStateSpace +
+ ldmSpace +
+ ldmSeqSpace +
+ matchStateSize +
+ tokenSpace +
+ bufferSpace;
+
+ DEBUGLOG(5, "estimate workspace : %u", (U32)neededSpace);
+ return neededSpace;
+ }
+}
+
+size_t ZSTD_estimateCCtxSize_usingCParams(ZSTD_compressionParameters cParams)
+{
+ ZSTD_CCtx_params const params = ZSTD_makeCCtxParamsFromCParams(cParams);
+ return ZSTD_estimateCCtxSize_usingCCtxParams(&params);
+}
+
+static size_t ZSTD_estimateCCtxSize_internal(int compressionLevel)
+{
+ ZSTD_compressionParameters const cParams = ZSTD_getCParams_internal(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, 0);
+ return ZSTD_estimateCCtxSize_usingCParams(cParams);
+}
+
+size_t ZSTD_estimateCCtxSize(int compressionLevel)
+{
+ int level;
+ size_t memBudget = 0;
+ for (level=MIN(compressionLevel, 1); level<=compressionLevel; level++) {
+ size_t const newMB = ZSTD_estimateCCtxSize_internal(level);
+ if (newMB > memBudget) memBudget = newMB;
+ }
+ return memBudget;
+}
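+
+/* Editor's illustrative sketch (not part of upstream zstd) :
+ * the estimate is the size the static-allocation API expects, e.g. :
+ *
+ *     size_t const wkspSize = ZSTD_estimateCCtxSize(level);
+ *     void* const wksp = malloc(wkspSize);
+ *     ZSTD_CCtx* const cctx = ZSTD_initStaticCCtx(wksp, wkspSize);
+ *
+ * The loop above scans all levels up to `compressionLevel` because memory
+ * usage is not strictly monotonic with level.
+ */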
+
+size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCtx_params* params)
+{
+ RETURN_ERROR_IF(params->nbWorkers > 0, GENERIC, "Estimate CCtx size is supported for single-threaded compression only.");
+ { ZSTD_compressionParameters const cParams =
+ ZSTD_getCParamsFromCCtxParams(params, ZSTD_CONTENTSIZE_UNKNOWN, 0);
+ size_t const CCtxSize = ZSTD_estimateCCtxSize_usingCCtxParams(params);
+ size_t const blockSize = MIN(ZSTD_BLOCKSIZE_MAX, (size_t)1 << cParams.windowLog);
+ size_t const inBuffSize = ((size_t)1 << cParams.windowLog) + blockSize;
+ size_t const outBuffSize = ZSTD_compressBound(blockSize) + 1;
+ size_t const streamingSize = ZSTD_cwksp_alloc_size(inBuffSize)
+ + ZSTD_cwksp_alloc_size(outBuffSize);
+
+ return CCtxSize + streamingSize;
+ }
+}
+
+size_t ZSTD_estimateCStreamSize_usingCParams(ZSTD_compressionParameters cParams)
+{
+ ZSTD_CCtx_params const params = ZSTD_makeCCtxParamsFromCParams(cParams);
+ return ZSTD_estimateCStreamSize_usingCCtxParams(&params);
+}
+
+static size_t ZSTD_estimateCStreamSize_internal(int compressionLevel)
+{
+ ZSTD_compressionParameters const cParams = ZSTD_getCParams_internal(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, 0);
+ return ZSTD_estimateCStreamSize_usingCParams(cParams);
+}
+
+size_t ZSTD_estimateCStreamSize(int compressionLevel)
+{
+ int level;
+ size_t memBudget = 0;
+ for (level=MIN(compressionLevel, 1); level<=compressionLevel; level++) {
+ size_t const newMB = ZSTD_estimateCStreamSize_internal(level);
+ if (newMB > memBudget) memBudget = newMB;
+ }
+ return memBudget;
+}
+
+/* ZSTD_getFrameProgression():
+ * tells how much data has been consumed (input) and produced (output) for the current frame.
+ * It is able to count progression inside worker threads (non-blocking mode).
+ */
+ZSTD_frameProgression ZSTD_getFrameProgression(const ZSTD_CCtx* cctx)
+{
+#ifdef ZSTD_MULTITHREAD
+ if (cctx->appliedParams.nbWorkers > 0) {
+ return ZSTDMT_getFrameProgression(cctx->mtctx);
+ }
+#endif
+ { ZSTD_frameProgression fp;
+ size_t const buffered = (cctx->inBuff == NULL) ? 0 :
+ cctx->inBuffPos - cctx->inToCompress;
+ if (buffered) assert(cctx->inBuffPos >= cctx->inToCompress);
+ assert(buffered <= ZSTD_BLOCKSIZE_MAX);
+ fp.ingested = cctx->consumedSrcSize + buffered;
+ fp.consumed = cctx->consumedSrcSize;
+ fp.produced = cctx->producedCSize;
+ fp.flushed = cctx->producedCSize; /* simplified; some data might still be left within streaming output buffer */
+ fp.currentJobID = 0;
+ fp.nbActiveWorkers = 0;
+ return fp;
+} }
+
+/*! ZSTD_toFlushNow()
+ * Only useful for multithreading scenarios currently (nbWorkers >= 1).
+ */
+size_t ZSTD_toFlushNow(ZSTD_CCtx* cctx)
+{
+#ifdef ZSTD_MULTITHREAD
+ if (cctx->appliedParams.nbWorkers > 0) {
+ return ZSTDMT_toFlushNow(cctx->mtctx);
+ }
+#endif
+ (void)cctx;
+    return 0;   /* over-simplification; could also check whether the context is currently running in streaming mode and, if so, report how many bytes are left to be flushed within the output buffer */
+}
+
+static void ZSTD_assertEqualCParams(ZSTD_compressionParameters cParams1,
+ ZSTD_compressionParameters cParams2)
+{
+ (void)cParams1;
+ (void)cParams2;
+ assert(cParams1.windowLog == cParams2.windowLog);
+ assert(cParams1.chainLog == cParams2.chainLog);
+ assert(cParams1.hashLog == cParams2.hashLog);
+ assert(cParams1.searchLog == cParams2.searchLog);
+ assert(cParams1.minMatch == cParams2.minMatch);
+ assert(cParams1.targetLength == cParams2.targetLength);
+ assert(cParams1.strategy == cParams2.strategy);
+}
+
+void ZSTD_reset_compressedBlockState(ZSTD_compressedBlockState_t* bs)
+{
+ int i;
+ for (i = 0; i < ZSTD_REP_NUM; ++i)
+ bs->rep[i] = repStartValue[i];
+ bs->entropy.huf.repeatMode = HUF_repeat_none;
+ bs->entropy.fse.offcode_repeatMode = FSE_repeat_none;
+ bs->entropy.fse.matchlength_repeatMode = FSE_repeat_none;
+ bs->entropy.fse.litlength_repeatMode = FSE_repeat_none;
+}
+
+/*! ZSTD_invalidateMatchState()
+ * Invalidate all the matches in the match finder tables.
+ * Requires nextSrc and base to be set (can be NULL).
+ */
+static void ZSTD_invalidateMatchState(ZSTD_matchState_t* ms)
+{
+ ZSTD_window_clear(&ms->window);
+
+ ms->nextToUpdate = ms->window.dictLimit;
+ ms->loadedDictEnd = 0;
+ ms->opt.litLengthSum = 0; /* force reset of btopt stats */
+ ms->dictMatchState = NULL;
+}
+
+/**
+ * Indicates whether this compression proceeds directly from user-provided
+ * source buffer to user-provided destination buffer (ZSTDb_not_buffered), or
+ * whether the context needs to buffer the input/output (ZSTDb_buffered).
+ */
+typedef enum {
+ ZSTDb_not_buffered,
+ ZSTDb_buffered
+} ZSTD_buffered_policy_e;
+
+/**
+ * Controls, for this matchState reset, whether the tables need to be cleared /
+ * prepared for the coming compression (ZSTDcrp_makeClean), or whether the
+ * tables can be left unclean (ZSTDcrp_leaveDirty), because we know that a
+ * subsequent operation will overwrite the table space anyways (e.g., copying
+ * the matchState contents in from a CDict).
+ */
+typedef enum {
+ ZSTDcrp_makeClean,
+ ZSTDcrp_leaveDirty
+} ZSTD_compResetPolicy_e;
+
+/**
+ * Controls, for this matchState reset, whether indexing can continue where it
+ * left off (ZSTDirp_continue), or whether it needs to be restarted from zero
+ * (ZSTDirp_reset).
+ */
+typedef enum {
+ ZSTDirp_continue,
+ ZSTDirp_reset
+} ZSTD_indexResetPolicy_e;
+
+typedef enum {
+ ZSTD_resetTarget_CDict,
+ ZSTD_resetTarget_CCtx
+} ZSTD_resetTarget_e;
+
+static size_t
+ZSTD_reset_matchState(ZSTD_matchState_t* ms,
+ ZSTD_cwksp* ws,
+ const ZSTD_compressionParameters* cParams,
+ const ZSTD_compResetPolicy_e crp,
+ const ZSTD_indexResetPolicy_e forceResetIndex,
+ const ZSTD_resetTarget_e forWho)
+{
+ size_t const chainSize = (cParams->strategy == ZSTD_fast) ? 0 : ((size_t)1 << cParams->chainLog);
+ size_t const hSize = ((size_t)1) << cParams->hashLog;
+ U32 const hashLog3 = ((forWho == ZSTD_resetTarget_CCtx) && cParams->minMatch==3) ? MIN(ZSTD_HASHLOG3_MAX, cParams->windowLog) : 0;
+ size_t const h3Size = hashLog3 ? ((size_t)1) << hashLog3 : 0;
+
+ DEBUGLOG(4, "reset indices : %u", forceResetIndex == ZSTDirp_reset);
+ if (forceResetIndex == ZSTDirp_reset) {
+ ZSTD_window_init(&ms->window);
+ ZSTD_cwksp_mark_tables_dirty(ws);
+ }
+
+ ms->hashLog3 = hashLog3;
+
+ ZSTD_invalidateMatchState(ms);
+
+ assert(!ZSTD_cwksp_reserve_failed(ws)); /* check that allocation hasn't already failed */
+
+ ZSTD_cwksp_clear_tables(ws);
+
+ DEBUGLOG(5, "reserving table space");
+ /* table Space */
+ ms->hashTable = (U32*)ZSTD_cwksp_reserve_table(ws, hSize * sizeof(U32));
+ ms->chainTable = (U32*)ZSTD_cwksp_reserve_table(ws, chainSize * sizeof(U32));
+ ms->hashTable3 = (U32*)ZSTD_cwksp_reserve_table(ws, h3Size * sizeof(U32));
+ RETURN_ERROR_IF(ZSTD_cwksp_reserve_failed(ws), memory_allocation,
+ "failed a workspace allocation in ZSTD_reset_matchState");
+
+ DEBUGLOG(4, "reset table : %u", crp!=ZSTDcrp_leaveDirty);
+ if (crp!=ZSTDcrp_leaveDirty) {
+ /* reset tables only */
+ ZSTD_cwksp_clean_tables(ws);
+ }
+
+ /* opt parser space */
+ if ((forWho == ZSTD_resetTarget_CCtx) && (cParams->strategy >= ZSTD_btopt)) {
+ DEBUGLOG(4, "reserving optimal parser space");
+ ms->opt.litFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (1<<Litbits) * sizeof(unsigned));
+ ms->opt.litLengthFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (MaxLL+1) * sizeof(unsigned));
+ ms->opt.matchLengthFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (MaxML+1) * sizeof(unsigned));
+ ms->opt.offCodeFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (MaxOff+1) * sizeof(unsigned));
+ ms->opt.matchTable = (ZSTD_match_t*)ZSTD_cwksp_reserve_aligned(ws, (ZSTD_OPT_NUM+1) * sizeof(ZSTD_match_t));
+ ms->opt.priceTable = (ZSTD_optimal_t*)ZSTD_cwksp_reserve_aligned(ws, (ZSTD_OPT_NUM+1) * sizeof(ZSTD_optimal_t));
+ }
+
+ ms->cParams = *cParams;
+
+ RETURN_ERROR_IF(ZSTD_cwksp_reserve_failed(ws), memory_allocation,
+ "failed a workspace allocation in ZSTD_reset_matchState");
+
+ return 0;
+}
+
+/* ZSTD_indexTooCloseToMax() :
+ * minor optimization : prefer memset() rather than reduceIndex()
+ * which is measurably slow in some circumstances (reported for Visual Studio).
+ * Works when re-using a context for a lot of smallish inputs :
+ * if all inputs are smaller than ZSTD_INDEXOVERFLOW_MARGIN,
+ * memset() will be triggered before reduceIndex().
+ */
+#define ZSTD_INDEXOVERFLOW_MARGIN (16 MB)
+static int ZSTD_indexTooCloseToMax(ZSTD_window_t w)
+{
+ return (size_t)(w.nextSrc - w.base) > (ZSTD_CURRENT_MAX - ZSTD_INDEXOVERFLOW_MARGIN);
+}
+
+/*! ZSTD_resetCCtx_internal() :
+ note : `params` are assumed fully validated at this stage */
+static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc,
+ ZSTD_CCtx_params params,
+ U64 const pledgedSrcSize,
+ ZSTD_compResetPolicy_e const crp,
+ ZSTD_buffered_policy_e const zbuff)
+{
+ ZSTD_cwksp* const ws = &zc->workspace;
+ DEBUGLOG(4, "ZSTD_resetCCtx_internal: pledgedSrcSize=%u, wlog=%u",
+ (U32)pledgedSrcSize, params.cParams.windowLog);
+ assert(!ZSTD_isError(ZSTD_checkCParams(params.cParams)));
+
+ zc->isFirstBlock = 1;
+
+ if (params.ldmParams.enableLdm) {
+ /* Adjust long distance matching parameters */
+ ZSTD_ldm_adjustParameters(&params.ldmParams, &params.cParams);
+ assert(params.ldmParams.hashLog >= params.ldmParams.bucketSizeLog);
+ assert(params.ldmParams.hashRateLog < 32);
+ zc->ldmState.hashPower = ZSTD_rollingHash_primePower(params.ldmParams.minMatchLength);
+ }
+
+ { size_t const windowSize = MAX(1, (size_t)MIN(((U64)1 << params.cParams.windowLog), pledgedSrcSize));
+ size_t const blockSize = MIN(ZSTD_BLOCKSIZE_MAX, windowSize);
+ U32 const divider = (params.cParams.minMatch==3) ? 3 : 4;
+ size_t const maxNbSeq = blockSize / divider;
+ size_t const tokenSpace = ZSTD_cwksp_alloc_size(WILDCOPY_OVERLENGTH + blockSize)
+ + ZSTD_cwksp_alloc_size(maxNbSeq * sizeof(seqDef))
+ + 3 * ZSTD_cwksp_alloc_size(maxNbSeq * sizeof(BYTE));
+ size_t const buffOutSize = (zbuff==ZSTDb_buffered) ? ZSTD_compressBound(blockSize)+1 : 0;
+ size_t const buffInSize = (zbuff==ZSTDb_buffered) ? windowSize + blockSize : 0;
+ size_t const matchStateSize = ZSTD_sizeof_matchState(&params.cParams, /* forCCtx */ 1);
+ size_t const maxNbLdmSeq = ZSTD_ldm_getMaxNbSeq(params.ldmParams, blockSize);
+
+ ZSTD_indexResetPolicy_e needsIndexReset = zc->initialized ? ZSTDirp_continue : ZSTDirp_reset;
+
+ if (ZSTD_indexTooCloseToMax(zc->blockState.matchState.window)) {
+ needsIndexReset = ZSTDirp_reset;
+ }
+
+ if (!zc->staticSize) ZSTD_cwksp_bump_oversized_duration(ws, 0);
+
+ /* Check if workspace is large enough, alloc a new one if needed */
+ { size_t const cctxSpace = zc->staticSize ? ZSTD_cwksp_alloc_size(sizeof(ZSTD_CCtx)) : 0;
+ size_t const entropySpace = ZSTD_cwksp_alloc_size(HUF_WORKSPACE_SIZE);
+ size_t const blockStateSpace = 2 * ZSTD_cwksp_alloc_size(sizeof(ZSTD_compressedBlockState_t));
+ size_t const bufferSpace = ZSTD_cwksp_alloc_size(buffInSize) + ZSTD_cwksp_alloc_size(buffOutSize);
+ size_t const ldmSpace = ZSTD_ldm_getTableSize(params.ldmParams);
+ size_t const ldmSeqSpace = ZSTD_cwksp_alloc_size(maxNbLdmSeq * sizeof(rawSeq));
+
+ size_t const neededSpace =
+ cctxSpace +
+ entropySpace +
+ blockStateSpace +
+ ldmSpace +
+ ldmSeqSpace +
+ matchStateSize +
+ tokenSpace +
+ bufferSpace;
+
+ int const workspaceTooSmall = ZSTD_cwksp_sizeof(ws) < neededSpace;
+ int const workspaceWasteful = ZSTD_cwksp_check_wasteful(ws, neededSpace);
+
+ DEBUGLOG(4, "Need %zuKB workspace, including %zuKB for match state, and %zuKB for buffers",
+ neededSpace>>10, matchStateSize>>10, bufferSpace>>10);
+ DEBUGLOG(4, "windowSize: %zu - blockSize: %zu", windowSize, blockSize);
+
+ if (workspaceTooSmall || workspaceWasteful) {
+ DEBUGLOG(4, "Resize workspaceSize from %zuKB to %zuKB",
+ ZSTD_cwksp_sizeof(ws) >> 10,
+ neededSpace >> 10);
+
+ RETURN_ERROR_IF(zc->staticSize, memory_allocation, "static cctx : no resize");
+
+ needsIndexReset = ZSTDirp_reset;
+
+ ZSTD_cwksp_free(ws, zc->customMem);
+ FORWARD_IF_ERROR(ZSTD_cwksp_create(ws, neededSpace, zc->customMem), "");
+
+ DEBUGLOG(5, "reserving object space");
+ /* Statically sized space.
+ * entropyWorkspace never moves,
+ * though prev/next block swap places */
+ assert(ZSTD_cwksp_check_available(ws, 2 * sizeof(ZSTD_compressedBlockState_t)));
+ zc->blockState.prevCBlock = (ZSTD_compressedBlockState_t*) ZSTD_cwksp_reserve_object(ws, sizeof(ZSTD_compressedBlockState_t));
+ RETURN_ERROR_IF(zc->blockState.prevCBlock == NULL, memory_allocation, "couldn't allocate prevCBlock");
+ zc->blockState.nextCBlock = (ZSTD_compressedBlockState_t*) ZSTD_cwksp_reserve_object(ws, sizeof(ZSTD_compressedBlockState_t));
+ RETURN_ERROR_IF(zc->blockState.nextCBlock == NULL, memory_allocation, "couldn't allocate nextCBlock");
+ zc->entropyWorkspace = (U32*) ZSTD_cwksp_reserve_object(ws, HUF_WORKSPACE_SIZE);
+            RETURN_ERROR_IF(zc->entropyWorkspace == NULL, memory_allocation, "couldn't allocate entropyWorkspace");
+ } }
+
+ ZSTD_cwksp_clear(ws);
+
+ /* init params */
+ zc->appliedParams = params;
+ zc->blockState.matchState.cParams = params.cParams;
+ zc->pledgedSrcSizePlusOne = pledgedSrcSize+1;
+ zc->consumedSrcSize = 0;
+ zc->producedCSize = 0;
+ if (pledgedSrcSize == ZSTD_CONTENTSIZE_UNKNOWN)
+ zc->appliedParams.fParams.contentSizeFlag = 0;
+ DEBUGLOG(4, "pledged content size : %u ; flag : %u",
+ (unsigned)pledgedSrcSize, zc->appliedParams.fParams.contentSizeFlag);
+ zc->blockSize = blockSize;
+
+ XXH64_reset(&zc->xxhState, 0);
+ zc->stage = ZSTDcs_init;
+ zc->dictID = 0;
+
+ ZSTD_reset_compressedBlockState(zc->blockState.prevCBlock);
+
+ /* ZSTD_wildcopy() is used to copy into the literals buffer,
+ * so we have to oversize the buffer by WILDCOPY_OVERLENGTH bytes.
+ */
+ zc->seqStore.litStart = ZSTD_cwksp_reserve_buffer(ws, blockSize + WILDCOPY_OVERLENGTH);
+ zc->seqStore.maxNbLit = blockSize;
+
+ /* buffers */
+ zc->inBuffSize = buffInSize;
+ zc->inBuff = (char*)ZSTD_cwksp_reserve_buffer(ws, buffInSize);
+ zc->outBuffSize = buffOutSize;
+ zc->outBuff = (char*)ZSTD_cwksp_reserve_buffer(ws, buffOutSize);
+
+ /* ldm bucketOffsets table */
+ if (params.ldmParams.enableLdm) {
+ /* TODO: avoid memset? */
+ size_t const ldmBucketSize =
+ ((size_t)1) << (params.ldmParams.hashLog -
+ params.ldmParams.bucketSizeLog);
+ zc->ldmState.bucketOffsets = ZSTD_cwksp_reserve_buffer(ws, ldmBucketSize);
+ memset(zc->ldmState.bucketOffsets, 0, ldmBucketSize);
+ }
+
+ /* sequences storage */
+ ZSTD_referenceExternalSequences(zc, NULL, 0);
+ zc->seqStore.maxNbSeq = maxNbSeq;
+ zc->seqStore.llCode = ZSTD_cwksp_reserve_buffer(ws, maxNbSeq * sizeof(BYTE));
+ zc->seqStore.mlCode = ZSTD_cwksp_reserve_buffer(ws, maxNbSeq * sizeof(BYTE));
+ zc->seqStore.ofCode = ZSTD_cwksp_reserve_buffer(ws, maxNbSeq * sizeof(BYTE));
+ zc->seqStore.sequencesStart = (seqDef*)ZSTD_cwksp_reserve_aligned(ws, maxNbSeq * sizeof(seqDef));
+
+ FORWARD_IF_ERROR(ZSTD_reset_matchState(
+ &zc->blockState.matchState,
+ ws,
+ &params.cParams,
+ crp,
+ needsIndexReset,
+ ZSTD_resetTarget_CCtx), "");
+
+ /* ldm hash table */
+ if (params.ldmParams.enableLdm) {
+ /* TODO: avoid memset? */
+ size_t const ldmHSize = ((size_t)1) << params.ldmParams.hashLog;
+ zc->ldmState.hashTable = (ldmEntry_t*)ZSTD_cwksp_reserve_aligned(ws, ldmHSize * sizeof(ldmEntry_t));
+ memset(zc->ldmState.hashTable, 0, ldmHSize * sizeof(ldmEntry_t));
+ zc->ldmSequences = (rawSeq*)ZSTD_cwksp_reserve_aligned(ws, maxNbLdmSeq * sizeof(rawSeq));
+ zc->maxNbLdmSequences = maxNbLdmSeq;
+
+ ZSTD_window_init(&zc->ldmState.window);
+ ZSTD_window_clear(&zc->ldmState.window);
+ zc->ldmState.loadedDictEnd = 0;
+ }
+
+ DEBUGLOG(3, "wksp: finished allocating, %zd bytes remain available", ZSTD_cwksp_available_space(ws));
+ zc->initialized = 1;
+
+ return 0;
+ }
+}
+
+/* ZSTD_invalidateRepCodes() :
+ * ensures next compression will not use repcodes from previous block.
+ * Note : only works with regular variant;
+ * do not use with extDict variant ! */
+void ZSTD_invalidateRepCodes(ZSTD_CCtx* cctx) {
+ int i;
+ for (i=0; i<ZSTD_REP_NUM; i++) cctx->blockState.prevCBlock->rep[i] = 0;
+ assert(!ZSTD_window_hasExtDict(cctx->blockState.matchState.window));
+}
+
+/* These are the approximate sizes for each strategy past which copying the
+ * dictionary tables into the working context is faster than using them
+ * in-place.
+ */
+static const size_t attachDictSizeCutoffs[ZSTD_STRATEGY_MAX+1] = {
+ 8 KB, /* unused */
+ 8 KB, /* ZSTD_fast */
+ 16 KB, /* ZSTD_dfast */
+ 32 KB, /* ZSTD_greedy */
+ 32 KB, /* ZSTD_lazy */
+ 32 KB, /* ZSTD_lazy2 */
+ 32 KB, /* ZSTD_btlazy2 */
+ 32 KB, /* ZSTD_btopt */
+ 8 KB, /* ZSTD_btultra */
+ 8 KB /* ZSTD_btultra2 */
+};
+
+static int ZSTD_shouldAttachDict(const ZSTD_CDict* cdict,
+ const ZSTD_CCtx_params* params,
+ U64 pledgedSrcSize)
+{
+ size_t cutoff = attachDictSizeCutoffs[cdict->matchState.cParams.strategy];
+ return ( pledgedSrcSize <= cutoff
+ || pledgedSrcSize == ZSTD_CONTENTSIZE_UNKNOWN
+ || params->attachDictPref == ZSTD_dictForceAttach )
+ && params->attachDictPref != ZSTD_dictForceCopy
+ && !params->forceWindow; /* dictMatchState isn't correctly
+ * handled in _enforceMaxDist */
+}
+
+static size_t
+ZSTD_resetCCtx_byAttachingCDict(ZSTD_CCtx* cctx,
+ const ZSTD_CDict* cdict,
+ ZSTD_CCtx_params params,
+ U64 pledgedSrcSize,
+ ZSTD_buffered_policy_e zbuff)
+{
+ { const ZSTD_compressionParameters* const cdict_cParams = &cdict->matchState.cParams;
+ unsigned const windowLog = params.cParams.windowLog;
+ assert(windowLog != 0);
+ /* Resize working context table params for input only, since the dict
+ * has its own tables. */
+ /* pledgeSrcSize == 0 means 0! */
+ params.cParams = ZSTD_adjustCParams_internal(*cdict_cParams, pledgedSrcSize, 0);
+ params.cParams.windowLog = windowLog;
+ FORWARD_IF_ERROR(ZSTD_resetCCtx_internal(cctx, params, pledgedSrcSize,
+ ZSTDcrp_makeClean, zbuff), "");
+ assert(cctx->appliedParams.cParams.strategy == cdict_cParams->strategy);
+ }
+
+ { const U32 cdictEnd = (U32)( cdict->matchState.window.nextSrc
+ - cdict->matchState.window.base);
+ const U32 cdictLen = cdictEnd - cdict->matchState.window.dictLimit;
+ if (cdictLen == 0) {
+ /* don't even attach dictionaries with no contents */
+ DEBUGLOG(4, "skipping attaching empty dictionary");
+ } else {
+ DEBUGLOG(4, "attaching dictionary into context");
+ cctx->blockState.matchState.dictMatchState = &cdict->matchState;
+
+ /* prep working match state so dict matches never have negative indices
+ * when they are translated to the working context's index space. */
+ if (cctx->blockState.matchState.window.dictLimit < cdictEnd) {
+ cctx->blockState.matchState.window.nextSrc =
+ cctx->blockState.matchState.window.base + cdictEnd;
+ ZSTD_window_clear(&cctx->blockState.matchState.window);
+ }
+ /* loadedDictEnd is expressed within the referential of the active context */
+ cctx->blockState.matchState.loadedDictEnd = cctx->blockState.matchState.window.dictLimit;
+ } }
+
+ cctx->dictID = cdict->dictID;
+
+ /* copy block state */
+ memcpy(cctx->blockState.prevCBlock, &cdict->cBlockState, sizeof(cdict->cBlockState));
+
+ return 0;
+}
+
+static size_t ZSTD_resetCCtx_byCopyingCDict(ZSTD_CCtx* cctx,
+ const ZSTD_CDict* cdict,
+ ZSTD_CCtx_params params,
+ U64 pledgedSrcSize,
+ ZSTD_buffered_policy_e zbuff)
+{
+ const ZSTD_compressionParameters *cdict_cParams = &cdict->matchState.cParams;
+
+ DEBUGLOG(4, "copying dictionary into context");
+
+ { unsigned const windowLog = params.cParams.windowLog;
+ assert(windowLog != 0);
+ /* Copy only compression parameters related to tables. */
+ params.cParams = *cdict_cParams;
+ params.cParams.windowLog = windowLog;
+ FORWARD_IF_ERROR(ZSTD_resetCCtx_internal(cctx, params, pledgedSrcSize,
+ ZSTDcrp_leaveDirty, zbuff), "");
+ assert(cctx->appliedParams.cParams.strategy == cdict_cParams->strategy);
+ assert(cctx->appliedParams.cParams.hashLog == cdict_cParams->hashLog);
+ assert(cctx->appliedParams.cParams.chainLog == cdict_cParams->chainLog);
+ }
+
+ ZSTD_cwksp_mark_tables_dirty(&cctx->workspace);
+
+ /* copy tables */
+ { size_t const chainSize = (cdict_cParams->strategy == ZSTD_fast) ? 0 : ((size_t)1 << cdict_cParams->chainLog);
+ size_t const hSize = (size_t)1 << cdict_cParams->hashLog;
+
+ memcpy(cctx->blockState.matchState.hashTable,
+ cdict->matchState.hashTable,
+ hSize * sizeof(U32));
+ memcpy(cctx->blockState.matchState.chainTable,
+ cdict->matchState.chainTable,
+ chainSize * sizeof(U32));
+ }
+
+ /* Zero the hashTable3, since the cdict never fills it */
+ { int const h3log = cctx->blockState.matchState.hashLog3;
+ size_t const h3Size = h3log ? ((size_t)1 << h3log) : 0;
+ assert(cdict->matchState.hashLog3 == 0);
+ memset(cctx->blockState.matchState.hashTable3, 0, h3Size * sizeof(U32));
+ }
+
+ ZSTD_cwksp_mark_tables_clean(&cctx->workspace);
+
+ /* copy dictionary offsets */
+ { ZSTD_matchState_t const* srcMatchState = &cdict->matchState;
+ ZSTD_matchState_t* dstMatchState = &cctx->blockState.matchState;
+ dstMatchState->window = srcMatchState->window;
+ dstMatchState->nextToUpdate = srcMatchState->nextToUpdate;
+ dstMatchState->loadedDictEnd= srcMatchState->loadedDictEnd;
+ }
+
+ cctx->dictID = cdict->dictID;
+
+ /* copy block state */
+ memcpy(cctx->blockState.prevCBlock, &cdict->cBlockState, sizeof(cdict->cBlockState));
+
+ return 0;
+}
+
+/* We have a choice between copying the dictionary context into the working
+ * context, or referencing the dictionary context from the working context
+ * in-place. We decide here which strategy to use. */
+static size_t ZSTD_resetCCtx_usingCDict(ZSTD_CCtx* cctx,
+ const ZSTD_CDict* cdict,
+ const ZSTD_CCtx_params* params,
+ U64 pledgedSrcSize,
+ ZSTD_buffered_policy_e zbuff)
+{
+
+ DEBUGLOG(4, "ZSTD_resetCCtx_usingCDict (pledgedSrcSize=%u)",
+ (unsigned)pledgedSrcSize);
+
+ if (ZSTD_shouldAttachDict(cdict, params, pledgedSrcSize)) {
+ return ZSTD_resetCCtx_byAttachingCDict(
+ cctx, cdict, *params, pledgedSrcSize, zbuff);
+ } else {
+ return ZSTD_resetCCtx_byCopyingCDict(
+ cctx, cdict, *params, pledgedSrcSize, zbuff);
+ }
+}
+
+/*! ZSTD_copyCCtx_internal() :
+ * Duplicate an existing context `srcCCtx` into another one `dstCCtx`.
+ * Only works during stage ZSTDcs_init (i.e. after creation, but before first call to ZSTD_compressContinue()).
+ * The "context", in this case, refers to the hash and chain tables,
+ * entropy tables, and dictionary references.
+ * `windowLog` value is enforced if != 0, otherwise value is copied from srcCCtx.
+ * @return : 0, or an error code */
+static size_t ZSTD_copyCCtx_internal(ZSTD_CCtx* dstCCtx,
+ const ZSTD_CCtx* srcCCtx,
+ ZSTD_frameParameters fParams,
+ U64 pledgedSrcSize,
+ ZSTD_buffered_policy_e zbuff)
+{
+ DEBUGLOG(5, "ZSTD_copyCCtx_internal");
+ RETURN_ERROR_IF(srcCCtx->stage!=ZSTDcs_init, stage_wrong,
+ "Can't copy a ctx that's not in init stage.");
+
+ memcpy(&dstCCtx->customMem, &srcCCtx->customMem, sizeof(ZSTD_customMem));
+ { ZSTD_CCtx_params params = dstCCtx->requestedParams;
+ /* Copy only compression parameters related to tables. */
+ params.cParams = srcCCtx->appliedParams.cParams;
+ params.fParams = fParams;
+ ZSTD_resetCCtx_internal(dstCCtx, params, pledgedSrcSize,
+ ZSTDcrp_leaveDirty, zbuff);
+ assert(dstCCtx->appliedParams.cParams.windowLog == srcCCtx->appliedParams.cParams.windowLog);
+ assert(dstCCtx->appliedParams.cParams.strategy == srcCCtx->appliedParams.cParams.strategy);
+ assert(dstCCtx->appliedParams.cParams.hashLog == srcCCtx->appliedParams.cParams.hashLog);
+ assert(dstCCtx->appliedParams.cParams.chainLog == srcCCtx->appliedParams.cParams.chainLog);
+ assert(dstCCtx->blockState.matchState.hashLog3 == srcCCtx->blockState.matchState.hashLog3);
+ }
+
+ ZSTD_cwksp_mark_tables_dirty(&dstCCtx->workspace);
+
+ /* copy tables */
+ { size_t const chainSize = (srcCCtx->appliedParams.cParams.strategy == ZSTD_fast) ? 0 : ((size_t)1 << srcCCtx->appliedParams.cParams.chainLog);
+ size_t const hSize = (size_t)1 << srcCCtx->appliedParams.cParams.hashLog;
+ int const h3log = srcCCtx->blockState.matchState.hashLog3;
+ size_t const h3Size = h3log ? ((size_t)1 << h3log) : 0;
+
+ memcpy(dstCCtx->blockState.matchState.hashTable,
+ srcCCtx->blockState.matchState.hashTable,
+ hSize * sizeof(U32));
+ memcpy(dstCCtx->blockState.matchState.chainTable,
+ srcCCtx->blockState.matchState.chainTable,
+ chainSize * sizeof(U32));
+ memcpy(dstCCtx->blockState.matchState.hashTable3,
+ srcCCtx->blockState.matchState.hashTable3,
+ h3Size * sizeof(U32));
+ }
+
+ ZSTD_cwksp_mark_tables_clean(&dstCCtx->workspace);
+
+ /* copy dictionary offsets */
+ {
+ const ZSTD_matchState_t* srcMatchState = &srcCCtx->blockState.matchState;
+ ZSTD_matchState_t* dstMatchState = &dstCCtx->blockState.matchState;
+ dstMatchState->window = srcMatchState->window;
+ dstMatchState->nextToUpdate = srcMatchState->nextToUpdate;
+ dstMatchState->loadedDictEnd= srcMatchState->loadedDictEnd;
+ }
+ dstCCtx->dictID = srcCCtx->dictID;
+
+ /* copy block state */
+ memcpy(dstCCtx->blockState.prevCBlock, srcCCtx->blockState.prevCBlock, sizeof(*srcCCtx->blockState.prevCBlock));
+
+ return 0;
+}
+
+/*! ZSTD_copyCCtx() :
+ * Duplicate an existing context `srcCCtx` into another one `dstCCtx`.
+ * Only works during stage ZSTDcs_init (i.e. after creation, but before first call to ZSTD_compressContinue()).
+ * pledgedSrcSize==0 means "unknown".
+ *  @return : 0, or an error code */
+size_t ZSTD_copyCCtx(ZSTD_CCtx* dstCCtx, const ZSTD_CCtx* srcCCtx, unsigned long long pledgedSrcSize)
+{
+ ZSTD_frameParameters fParams = { 1 /*content*/, 0 /*checksum*/, 0 /*noDictID*/ };
+ ZSTD_buffered_policy_e const zbuff = (ZSTD_buffered_policy_e)(srcCCtx->inBuffSize>0);
+ ZSTD_STATIC_ASSERT((U32)ZSTDb_buffered==1);
+ if (pledgedSrcSize==0) pledgedSrcSize = ZSTD_CONTENTSIZE_UNKNOWN;
+ fParams.contentSizeFlag = (pledgedSrcSize != ZSTD_CONTENTSIZE_UNKNOWN);
+
+ return ZSTD_copyCCtx_internal(dstCCtx, srcCCtx,
+ fParams, pledgedSrcSize,
+ zbuff);
+}
+
+
+#define ZSTD_ROWSIZE 16
+/*! ZSTD_reduceTable() :
+ * reduce table indexes by `reducerValue`, or squash to zero.
+ * PreserveMark preserves "unsorted mark" for btlazy2 strategy.
+ *  It must be set to a clear 0/1 value, to remove the branch during inlining.
+ *  The table size is presumed to be a multiple of ZSTD_ROWSIZE,
+ *  to help auto-vectorization */
+FORCE_INLINE_TEMPLATE void
+ZSTD_reduceTable_internal (U32* const table, U32 const size, U32 const reducerValue, int const preserveMark)
+{
+ int const nbRows = (int)size / ZSTD_ROWSIZE;
+ int cellNb = 0;
+ int rowNb;
+ assert((size & (ZSTD_ROWSIZE-1)) == 0); /* multiple of ZSTD_ROWSIZE */
+    assert(size < (1U<<31));   /* can be cast to int */
+
+#if defined (MEMORY_SANITIZER) && !defined (ZSTD_MSAN_DONT_POISON_WORKSPACE)
+ /* To validate that the table re-use logic is sound, and that we don't
+ * access table space that we haven't cleaned, we re-"poison" the table
+ * space every time we mark it dirty.
+ *
+ * This function however is intended to operate on those dirty tables and
+ * re-clean them. So when this function is used correctly, we can unpoison
+ * the memory it operated on. This introduces a blind spot though, since
+ * if we now try to operate on __actually__ poisoned memory, we will not
+ * detect that. */
+ __msan_unpoison(table, size * sizeof(U32));
+#endif
+
+ for (rowNb=0 ; rowNb < nbRows ; rowNb++) {
+ int column;
+ for (column=0; column<ZSTD_ROWSIZE; column++) {
+ if (preserveMark) {
+ U32 const adder = (table[cellNb] == ZSTD_DUBT_UNSORTED_MARK) ? reducerValue : 0;
+ table[cellNb] += adder;
+ }
+ if (table[cellNb] < reducerValue) table[cellNb] = 0;
+ else table[cellNb] -= reducerValue;
+ cellNb++;
+ } }
+}
+
+static void ZSTD_reduceTable(U32* const table, U32 const size, U32 const reducerValue)
+{
+ ZSTD_reduceTable_internal(table, size, reducerValue, 0);
+}
+
+static void ZSTD_reduceTable_btlazy2(U32* const table, U32 const size, U32 const reducerValue)
+{
+ ZSTD_reduceTable_internal(table, size, reducerValue, 1);
+}
+
+/*! ZSTD_reduceIndex() :
+ *  rescale all indexes to avoid future overflow (indexes are U32) */
+static void ZSTD_reduceIndex (ZSTD_matchState_t* ms, ZSTD_CCtx_params const* params, const U32 reducerValue)
+{
+ { U32 const hSize = (U32)1 << params->cParams.hashLog;
+ ZSTD_reduceTable(ms->hashTable, hSize, reducerValue);
+ }
+
+ if (params->cParams.strategy != ZSTD_fast) {
+ U32 const chainSize = (U32)1 << params->cParams.chainLog;
+ if (params->cParams.strategy == ZSTD_btlazy2)
+ ZSTD_reduceTable_btlazy2(ms->chainTable, chainSize, reducerValue);
+ else
+ ZSTD_reduceTable(ms->chainTable, chainSize, reducerValue);
+ }
+
+ if (ms->hashLog3) {
+ U32 const h3Size = (U32)1 << ms->hashLog3;
+ ZSTD_reduceTable(ms->hashTable3, h3Size, reducerValue);
+ }
+}
+
+
+/*-*******************************************************
+* Block entropic compression
+*********************************************************/
+
+/* See doc/zstd_compression_format.md for detailed format description */
+
+void ZSTD_seqToCodes(const seqStore_t* seqStorePtr)
+{
+ const seqDef* const sequences = seqStorePtr->sequencesStart;
+ BYTE* const llCodeTable = seqStorePtr->llCode;
+ BYTE* const ofCodeTable = seqStorePtr->ofCode;
+ BYTE* const mlCodeTable = seqStorePtr->mlCode;
+ U32 const nbSeq = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart);
+ U32 u;
+ assert(nbSeq <= seqStorePtr->maxNbSeq);
+ for (u=0; u<nbSeq; u++) {
+ U32 const llv = sequences[u].litLength;
+ U32 const mlv = sequences[u].matchLength;
+ llCodeTable[u] = (BYTE)ZSTD_LLcode(llv);
+ ofCodeTable[u] = (BYTE)ZSTD_highbit32(sequences[u].offset);
+ mlCodeTable[u] = (BYTE)ZSTD_MLcode(mlv);
+ }
+ if (seqStorePtr->longLengthID==1)
+ llCodeTable[seqStorePtr->longLengthPos] = MaxLL;
+ if (seqStorePtr->longLengthID==2)
+ mlCodeTable[seqStorePtr->longLengthPos] = MaxML;
+}
+
+/* ZSTD_useTargetCBlockSize():
+ * Returns whether the target compressed block size parameter is being used.
+ * If used, compression makes a best effort to produce compressed blocks of around targetCBlockSize bytes.
+ * Returns 1 if true, 0 otherwise. */
+static int ZSTD_useTargetCBlockSize(const ZSTD_CCtx_params* cctxParams)
+{
+ DEBUGLOG(5, "ZSTD_useTargetCBlockSize (targetCBlockSize=%zu)", cctxParams->targetCBlockSize);
+ return (cctxParams->targetCBlockSize != 0);
+}
+
+/* ZSTD_compressSequences_internal():
+ * actually compresses both literals and sequences */
+MEM_STATIC size_t
+ZSTD_compressSequences_internal(seqStore_t* seqStorePtr,
+ const ZSTD_entropyCTables_t* prevEntropy,
+ ZSTD_entropyCTables_t* nextEntropy,
+ const ZSTD_CCtx_params* cctxParams,
+ void* dst, size_t dstCapacity,
+ void* entropyWorkspace, size_t entropyWkspSize,
+ const int bmi2)
+{
+ const int longOffsets = cctxParams->cParams.windowLog > STREAM_ACCUMULATOR_MIN;
+ ZSTD_strategy const strategy = cctxParams->cParams.strategy;
+ unsigned count[MaxSeq+1];
+ FSE_CTable* CTable_LitLength = nextEntropy->fse.litlengthCTable;
+ FSE_CTable* CTable_OffsetBits = nextEntropy->fse.offcodeCTable;
+ FSE_CTable* CTable_MatchLength = nextEntropy->fse.matchlengthCTable;
+ U32 LLtype, Offtype, MLtype; /* compressed, raw or rle */
+ const seqDef* const sequences = seqStorePtr->sequencesStart;
+ const BYTE* const ofCodeTable = seqStorePtr->ofCode;
+ const BYTE* const llCodeTable = seqStorePtr->llCode;
+ const BYTE* const mlCodeTable = seqStorePtr->mlCode;
+ BYTE* const ostart = (BYTE*)dst;
+ BYTE* const oend = ostart + dstCapacity;
+ BYTE* op = ostart;
+ size_t const nbSeq = (size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart);
+ BYTE* seqHead;
+ BYTE* lastNCount = NULL;
+
+ DEBUGLOG(5, "ZSTD_compressSequences_internal (nbSeq=%zu)", nbSeq);
+ ZSTD_STATIC_ASSERT(HUF_WORKSPACE_SIZE >= (1<<MAX(MLFSELog,LLFSELog)));
+
+ /* Compress literals */
+ { const BYTE* const literals = seqStorePtr->litStart;
+ size_t const litSize = (size_t)(seqStorePtr->lit - literals);
+ size_t const cSize = ZSTD_compressLiterals(
+ &prevEntropy->huf, &nextEntropy->huf,
+ cctxParams->cParams.strategy,
+ ZSTD_disableLiteralsCompression(cctxParams),
+ op, dstCapacity,
+ literals, litSize,
+ entropyWorkspace, entropyWkspSize,
+ bmi2);
+ FORWARD_IF_ERROR(cSize, "ZSTD_compressLiterals failed");
+ assert(cSize <= dstCapacity);
+ op += cSize;
+ }
+
+ /* Sequences Header */
+ RETURN_ERROR_IF((oend-op) < 3 /*max nbSeq Size*/ + 1 /*seqHead*/,
+ dstSize_tooSmall, "Can't fit seq hdr in output buf!");
+ if (nbSeq < 128) {
+ *op++ = (BYTE)nbSeq;
+ } else if (nbSeq < LONGNBSEQ) {
+ op[0] = (BYTE)((nbSeq>>8) + 0x80);
+ op[1] = (BYTE)nbSeq;
+ op+=2;
+ } else {
+ op[0]=0xFF;
+ MEM_writeLE16(op+1, (U16)(nbSeq - LONGNBSEQ));
+ op+=3;
+ }
+ assert(op <= oend);
+ if (nbSeq==0) {
+ /* Copy the old tables over as if we repeated them */
+ memcpy(&nextEntropy->fse, &prevEntropy->fse, sizeof(prevEntropy->fse));
+ return (size_t)(op - ostart);
+ }
+
+ /* seqHead : flags for FSE encoding type */
+ seqHead = op++;
+ assert(op <= oend);
+
+ /* convert length/distances into codes */
+ ZSTD_seqToCodes(seqStorePtr);
+ /* build CTable for Literal Lengths */
+ { unsigned max = MaxLL;
+ size_t const mostFrequent = HIST_countFast_wksp(count, &max, llCodeTable, nbSeq, entropyWorkspace, entropyWkspSize); /* can't fail */
+ DEBUGLOG(5, "Building LL table");
+ nextEntropy->fse.litlength_repeatMode = prevEntropy->fse.litlength_repeatMode;
+ LLtype = ZSTD_selectEncodingType(&nextEntropy->fse.litlength_repeatMode,
+ count, max, mostFrequent, nbSeq,
+ LLFSELog, prevEntropy->fse.litlengthCTable,
+ LL_defaultNorm, LL_defaultNormLog,
+ ZSTD_defaultAllowed, strategy);
+ assert(set_basic < set_compressed && set_rle < set_compressed);
+ assert(!(LLtype < set_compressed && nextEntropy->fse.litlength_repeatMode != FSE_repeat_none)); /* We don't copy tables */
+ { size_t const countSize = ZSTD_buildCTable(
+ op, (size_t)(oend - op),
+ CTable_LitLength, LLFSELog, (symbolEncodingType_e)LLtype,
+ count, max, llCodeTable, nbSeq,
+ LL_defaultNorm, LL_defaultNormLog, MaxLL,
+ prevEntropy->fse.litlengthCTable,
+ sizeof(prevEntropy->fse.litlengthCTable),
+ entropyWorkspace, entropyWkspSize);
+ FORWARD_IF_ERROR(countSize, "ZSTD_buildCTable for LitLens failed");
+ if (LLtype == set_compressed)
+ lastNCount = op;
+ op += countSize;
+ assert(op <= oend);
+ } }
+ /* build CTable for Offsets */
+ { unsigned max = MaxOff;
+ size_t const mostFrequent = HIST_countFast_wksp(
+ count, &max, ofCodeTable, nbSeq, entropyWorkspace, entropyWkspSize); /* can't fail */
+ /* We can only use the basic table if max <= DefaultMaxOff, otherwise the offsets are too large */
+ ZSTD_defaultPolicy_e const defaultPolicy = (max <= DefaultMaxOff) ? ZSTD_defaultAllowed : ZSTD_defaultDisallowed;
+ DEBUGLOG(5, "Building OF table");
+ nextEntropy->fse.offcode_repeatMode = prevEntropy->fse.offcode_repeatMode;
+ Offtype = ZSTD_selectEncodingType(&nextEntropy->fse.offcode_repeatMode,
+ count, max, mostFrequent, nbSeq,
+ OffFSELog, prevEntropy->fse.offcodeCTable,
+ OF_defaultNorm, OF_defaultNormLog,
+ defaultPolicy, strategy);
+ assert(!(Offtype < set_compressed && nextEntropy->fse.offcode_repeatMode != FSE_repeat_none)); /* We don't copy tables */
+ { size_t const countSize = ZSTD_buildCTable(
+ op, (size_t)(oend - op),
+ CTable_OffsetBits, OffFSELog, (symbolEncodingType_e)Offtype,
+ count, max, ofCodeTable, nbSeq,
+ OF_defaultNorm, OF_defaultNormLog, DefaultMaxOff,
+ prevEntropy->fse.offcodeCTable,
+ sizeof(prevEntropy->fse.offcodeCTable),
+ entropyWorkspace, entropyWkspSize);
+ FORWARD_IF_ERROR(countSize, "ZSTD_buildCTable for Offsets failed");
+ if (Offtype == set_compressed)
+ lastNCount = op;
+ op += countSize;
+ assert(op <= oend);
+ } }
+ /* build CTable for MatchLengths */
+ { unsigned max = MaxML;
+ size_t const mostFrequent = HIST_countFast_wksp(
+ count, &max, mlCodeTable, nbSeq, entropyWorkspace, entropyWkspSize); /* can't fail */
+ DEBUGLOG(5, "Building ML table (remaining space : %i)", (int)(oend-op));
+ nextEntropy->fse.matchlength_repeatMode = prevEntropy->fse.matchlength_repeatMode;
+ MLtype = ZSTD_selectEncodingType(&nextEntropy->fse.matchlength_repeatMode,
+ count, max, mostFrequent, nbSeq,
+ MLFSELog, prevEntropy->fse.matchlengthCTable,
+ ML_defaultNorm, ML_defaultNormLog,
+ ZSTD_defaultAllowed, strategy);
+ assert(!(MLtype < set_compressed && nextEntropy->fse.matchlength_repeatMode != FSE_repeat_none)); /* We don't copy tables */
+ { size_t const countSize = ZSTD_buildCTable(
+ op, (size_t)(oend - op),
+ CTable_MatchLength, MLFSELog, (symbolEncodingType_e)MLtype,
+ count, max, mlCodeTable, nbSeq,
+ ML_defaultNorm, ML_defaultNormLog, MaxML,
+ prevEntropy->fse.matchlengthCTable,
+ sizeof(prevEntropy->fse.matchlengthCTable),
+ entropyWorkspace, entropyWkspSize);
+ FORWARD_IF_ERROR(countSize, "ZSTD_buildCTable for MatchLengths failed");
+ if (MLtype == set_compressed)
+ lastNCount = op;
+ op += countSize;
+ assert(op <= oend);
+ } }
+
+ *seqHead = (BYTE)((LLtype<<6) + (Offtype<<4) + (MLtype<<2));
+
+ { size_t const bitstreamSize = ZSTD_encodeSequences(
+ op, (size_t)(oend - op),
+ CTable_MatchLength, mlCodeTable,
+ CTable_OffsetBits, ofCodeTable,
+ CTable_LitLength, llCodeTable,
+ sequences, nbSeq,
+ longOffsets, bmi2);
+ FORWARD_IF_ERROR(bitstreamSize, "ZSTD_encodeSequences failed");
+ op += bitstreamSize;
+ assert(op <= oend);
+ /* zstd versions <= 1.3.4 mistakenly report corruption when
+ * FSE_readNCount() receives a buffer < 4 bytes.
+ * Fixed by https://github.com/facebook/zstd/pull/1146.
+ * This can happen when the last set_compressed table present is 2
+ * bytes and the bitstream is only one byte.
+ * In this exceedingly rare case, we will simply emit an uncompressed
+ * block, since it isn't worth optimizing.
+ */
+ if (lastNCount && (op - lastNCount) < 4) {
+ /* NCountSize >= 2 && bitstreamSize > 0 ==> lastCountSize == 3 */
+ assert(op - lastNCount == 3);
+ DEBUGLOG(5, "Avoiding bug in zstd decoder in versions <= 1.3.4 by "
+ "emitting an uncompressed block.");
+ return 0;
+ }
+ }
+
+ DEBUGLOG(5, "compressed block size : %u", (unsigned)(op - ostart));
+ return (size_t)(op - ostart);
+}
+
+MEM_STATIC size_t
+ZSTD_compressSequences(seqStore_t* seqStorePtr,
+ const ZSTD_entropyCTables_t* prevEntropy,
+ ZSTD_entropyCTables_t* nextEntropy,
+ const ZSTD_CCtx_params* cctxParams,
+ void* dst, size_t dstCapacity,
+ size_t srcSize,
+ void* entropyWorkspace, size_t entropyWkspSize,
+ int bmi2)
+{
+ size_t const cSize = ZSTD_compressSequences_internal(
+ seqStorePtr, prevEntropy, nextEntropy, cctxParams,
+ dst, dstCapacity,
+ entropyWorkspace, entropyWkspSize, bmi2);
+ if (cSize == 0) return 0;
+ /* When srcSize <= dstCapacity, there is enough space to write a raw uncompressed block.
+     * Since we ran out of space, the block must not be compressible, so fall back to a raw uncompressed block.
+ */
+ if ((cSize == ERROR(dstSize_tooSmall)) & (srcSize <= dstCapacity))
+ return 0; /* block not compressed */
+ FORWARD_IF_ERROR(cSize, "ZSTD_compressSequences_internal failed");
+
+ /* Check compressibility */
+ { size_t const maxCSize = srcSize - ZSTD_minGain(srcSize, cctxParams->cParams.strategy);
+ if (cSize >= maxCSize) return 0; /* block not compressed */
+ }
+
+ return cSize;
+}
+
+/* ZSTD_selectBlockCompressor() :
+ * Not static, but internal use only (used by long distance matcher)
+ * assumption : strat is a valid strategy */
+ZSTD_blockCompressor ZSTD_selectBlockCompressor(ZSTD_strategy strat, ZSTD_dictMode_e dictMode)
+{
+ static const ZSTD_blockCompressor blockCompressor[3][ZSTD_STRATEGY_MAX+1] = {
+ { ZSTD_compressBlock_fast /* default for 0 */,
+ ZSTD_compressBlock_fast,
+ ZSTD_compressBlock_doubleFast,
+ ZSTD_compressBlock_greedy,
+ ZSTD_compressBlock_lazy,
+ ZSTD_compressBlock_lazy2,
+ ZSTD_compressBlock_btlazy2,
+ ZSTD_compressBlock_btopt,
+ ZSTD_compressBlock_btultra,
+ ZSTD_compressBlock_btultra2 },
+ { ZSTD_compressBlock_fast_extDict /* default for 0 */,
+ ZSTD_compressBlock_fast_extDict,
+ ZSTD_compressBlock_doubleFast_extDict,
+ ZSTD_compressBlock_greedy_extDict,
+ ZSTD_compressBlock_lazy_extDict,
+ ZSTD_compressBlock_lazy2_extDict,
+ ZSTD_compressBlock_btlazy2_extDict,
+ ZSTD_compressBlock_btopt_extDict,
+ ZSTD_compressBlock_btultra_extDict,
+ ZSTD_compressBlock_btultra_extDict },
+ { ZSTD_compressBlock_fast_dictMatchState /* default for 0 */,
+ ZSTD_compressBlock_fast_dictMatchState,
+ ZSTD_compressBlock_doubleFast_dictMatchState,
+ ZSTD_compressBlock_greedy_dictMatchState,
+ ZSTD_compressBlock_lazy_dictMatchState,
+ ZSTD_compressBlock_lazy2_dictMatchState,
+ ZSTD_compressBlock_btlazy2_dictMatchState,
+ ZSTD_compressBlock_btopt_dictMatchState,
+ ZSTD_compressBlock_btultra_dictMatchState,
+ ZSTD_compressBlock_btultra_dictMatchState }
+ };
+ ZSTD_blockCompressor selectedCompressor;
+ ZSTD_STATIC_ASSERT((unsigned)ZSTD_fast == 1);
+
+ assert(ZSTD_cParam_withinBounds(ZSTD_c_strategy, strat));
+ selectedCompressor = blockCompressor[(int)dictMode][(int)strat];
+ assert(selectedCompressor != NULL);
+ return selectedCompressor;
+}
+
+static void ZSTD_storeLastLiterals(seqStore_t* seqStorePtr,
+ const BYTE* anchor, size_t lastLLSize)
+{
+ memcpy(seqStorePtr->lit, anchor, lastLLSize);
+ seqStorePtr->lit += lastLLSize;
+}
+
+void ZSTD_resetSeqStore(seqStore_t* ssPtr)
+{
+ ssPtr->lit = ssPtr->litStart;
+ ssPtr->sequences = ssPtr->sequencesStart;
+ ssPtr->longLengthID = 0;
+}
+
+typedef enum { ZSTDbss_compress, ZSTDbss_noCompress } ZSTD_buildSeqStore_e;
+
+static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize)
+{
+ ZSTD_matchState_t* const ms = &zc->blockState.matchState;
+ DEBUGLOG(5, "ZSTD_buildSeqStore (srcSize=%zu)", srcSize);
+ assert(srcSize <= ZSTD_BLOCKSIZE_MAX);
+ /* Assert that we have correctly flushed the ctx params into the ms's copy */
+ ZSTD_assertEqualCParams(zc->appliedParams.cParams, ms->cParams);
+ if (srcSize < MIN_CBLOCK_SIZE+ZSTD_blockHeaderSize+1) {
+ ZSTD_ldm_skipSequences(&zc->externSeqStore, srcSize, zc->appliedParams.cParams.minMatch);
+ return ZSTDbss_noCompress; /* don't even attempt compression below a certain srcSize */
+ }
+ ZSTD_resetSeqStore(&(zc->seqStore));
+ /* required for optimal parser to read stats from dictionary */
+ ms->opt.symbolCosts = &zc->blockState.prevCBlock->entropy;
+ /* tell the optimal parser how we expect to compress literals */
+ ms->opt.literalCompressionMode = zc->appliedParams.literalCompressionMode;
+ /* a gap between an attached dict and the current window is not safe,
+ * they must remain adjacent,
+ * and when that stops being the case, the dict must be unset */
+ assert(ms->dictMatchState == NULL || ms->loadedDictEnd == ms->window.dictLimit);
+
+ /* limited update after a very long match */
+ { const BYTE* const base = ms->window.base;
+ const BYTE* const istart = (const BYTE*)src;
+ const U32 current = (U32)(istart-base);
+ if (sizeof(ptrdiff_t)==8) assert(istart - base < (ptrdiff_t)(U32)(-1)); /* ensure no overflow */
+ if (current > ms->nextToUpdate + 384)
+ ms->nextToUpdate = current - MIN(192, (U32)(current - ms->nextToUpdate - 384));
+ }
+
+ /* select and store sequences */
+ { ZSTD_dictMode_e const dictMode = ZSTD_matchState_dictMode(ms);
+ size_t lastLLSize;
+ { int i;
+ for (i = 0; i < ZSTD_REP_NUM; ++i)
+ zc->blockState.nextCBlock->rep[i] = zc->blockState.prevCBlock->rep[i];
+ }
+ if (zc->externSeqStore.pos < zc->externSeqStore.size) {
+ assert(!zc->appliedParams.ldmParams.enableLdm);
+ /* Updates ldmSeqStore.pos */
+ lastLLSize =
+ ZSTD_ldm_blockCompress(&zc->externSeqStore,
+ ms, &zc->seqStore,
+ zc->blockState.nextCBlock->rep,
+ src, srcSize);
+ assert(zc->externSeqStore.pos <= zc->externSeqStore.size);
+ } else if (zc->appliedParams.ldmParams.enableLdm) {
+ rawSeqStore_t ldmSeqStore = {NULL, 0, 0, 0};
+
+ ldmSeqStore.seq = zc->ldmSequences;
+ ldmSeqStore.capacity = zc->maxNbLdmSequences;
+ /* Updates ldmSeqStore.size */
+ FORWARD_IF_ERROR(ZSTD_ldm_generateSequences(&zc->ldmState, &ldmSeqStore,
+ &zc->appliedParams.ldmParams,
+ src, srcSize), "");
+ /* Updates ldmSeqStore.pos */
+ lastLLSize =
+ ZSTD_ldm_blockCompress(&ldmSeqStore,
+ ms, &zc->seqStore,
+ zc->blockState.nextCBlock->rep,
+ src, srcSize);
+ assert(ldmSeqStore.pos == ldmSeqStore.size);
+ } else { /* not long range mode */
+ ZSTD_blockCompressor const blockCompressor = ZSTD_selectBlockCompressor(zc->appliedParams.cParams.strategy, dictMode);
+ lastLLSize = blockCompressor(ms, &zc->seqStore, zc->blockState.nextCBlock->rep, src, srcSize);
+ }
+ { const BYTE* const lastLiterals = (const BYTE*)src + srcSize - lastLLSize;
+ ZSTD_storeLastLiterals(&zc->seqStore, lastLiterals, lastLLSize);
+ } }
+ return ZSTDbss_compress;
+}
+
+static void ZSTD_copyBlockSequences(ZSTD_CCtx* zc)
+{
+ const seqStore_t* seqStore = ZSTD_getSeqStore(zc);
+ const seqDef* seqs = seqStore->sequencesStart;
+ size_t seqsSize = seqStore->sequences - seqs;
+
+ ZSTD_Sequence* outSeqs = &zc->seqCollector.seqStart[zc->seqCollector.seqIndex];
+ size_t i; size_t position; int repIdx;
+
+ assert(zc->seqCollector.seqIndex + 1 < zc->seqCollector.maxSequences);
+ for (i = 0, position = 0; i < seqsSize; ++i) {
+ outSeqs[i].offset = seqs[i].offset;
+ outSeqs[i].litLength = seqs[i].litLength;
+ outSeqs[i].matchLength = seqs[i].matchLength + MINMATCH;
+
+ if (i == seqStore->longLengthPos) {
+ if (seqStore->longLengthID == 1) {
+ outSeqs[i].litLength += 0x10000;
+ } else if (seqStore->longLengthID == 2) {
+ outSeqs[i].matchLength += 0x10000;
+ }
+ }
+
+ if (outSeqs[i].offset <= ZSTD_REP_NUM) {
+ outSeqs[i].rep = outSeqs[i].offset;
+ repIdx = (unsigned int)i - outSeqs[i].offset;
+
+ if (outSeqs[i].litLength == 0) {
+ if (outSeqs[i].offset < 3) {
+ --repIdx;
+ } else {
+ repIdx = (unsigned int)i - 1;
+ }
+ ++outSeqs[i].rep;
+ }
+ assert(repIdx >= -3);
+ outSeqs[i].offset = repIdx >= 0 ? outSeqs[repIdx].offset : repStartValue[-repIdx - 1];
+ if (outSeqs[i].rep == 4) {
+ --outSeqs[i].offset;
+ }
+ } else {
+ outSeqs[i].offset -= ZSTD_REP_NUM;
+ }
+
+ position += outSeqs[i].litLength;
+ outSeqs[i].matchPos = (unsigned int)position;
+ position += outSeqs[i].matchLength;
+ }
+ zc->seqCollector.seqIndex += seqsSize;
+}
+
+size_t ZSTD_getSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs,
+ size_t outSeqsSize, const void* src, size_t srcSize)
+{
+ const size_t dstCapacity = ZSTD_compressBound(srcSize);
+ void* dst = ZSTD_malloc(dstCapacity, ZSTD_defaultCMem);
+ SeqCollector seqCollector;
+
+ RETURN_ERROR_IF(dst == NULL, memory_allocation, "NULL pointer!");
+
+ seqCollector.collectSequences = 1;
+ seqCollector.seqStart = outSeqs;
+ seqCollector.seqIndex = 0;
+ seqCollector.maxSequences = outSeqsSize;
+ zc->seqCollector = seqCollector;
+
+ ZSTD_compress2(zc, dst, dstCapacity, src, srcSize);
+ ZSTD_free(dst, ZSTD_defaultCMem);
+ return zc->seqCollector.seqIndex;
+}
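+
+/* Illustrative usage sketch for ZSTD_getSequences() (error handling and
+ * includes omitted; buffer names are placeholders). Since every emitted
+ * sequence consumes at least MINMATCH input bytes, an outSeqs array of
+ * srcSize entries is more than enough :
+ *
+ *   ZSTD_CCtx* const cctx = ZSTD_createCCtx();
+ *   ZSTD_Sequence* const seqs = malloc(srcSize * sizeof(ZSTD_Sequence));
+ *   size_t const nbSeqs = ZSTD_getSequences(cctx, seqs, srcSize, src, srcSize);
+ *   // inspect seqs[0 .. nbSeqs-1], then release resources
+ *   free(seqs);
+ *   ZSTD_freeCCtx(cctx);
+ */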
+
+/* Returns true if the given block is an RLE block */
+static int ZSTD_isRLE(const BYTE *ip, size_t length) {
+ size_t i;
+ if (length < 2) return 1;
+ for (i = 1; i < length; ++i) {
+ if (ip[0] != ip[i]) return 0;
+ }
+ return 1;
+}
+
+/* Returns true if the given block may be RLE.
+ * This is just a heuristic based on the compressibility.
+ * It may return both false positives and false negatives.
+ */
+static int ZSTD_maybeRLE(seqStore_t const* seqStore)
+{
+ size_t const nbSeqs = (size_t)(seqStore->sequences - seqStore->sequencesStart);
+ size_t const nbLits = (size_t)(seqStore->lit - seqStore->litStart);
+
+ return nbSeqs < 4 && nbLits < 10;
+}
+
+static void ZSTD_confirmRepcodesAndEntropyTables(ZSTD_CCtx* zc)
+{
+ ZSTD_compressedBlockState_t* const tmp = zc->blockState.prevCBlock;
+ zc->blockState.prevCBlock = zc->blockState.nextCBlock;
+ zc->blockState.nextCBlock = tmp;
+}
+
+static size_t ZSTD_compressBlock_internal(ZSTD_CCtx* zc,
+ void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize, U32 frame)
+{
+ /* This is the upper bound used for the length of an RLE block.
+ * It isn't the actual upper bound; finding the real threshold
+ * needs further investigation.
+ */
+ const U32 rleMaxLength = 25;
+ size_t cSize;
+ const BYTE* ip = (const BYTE*)src;
+ BYTE* op = (BYTE*)dst;
+ DEBUGLOG(5, "ZSTD_compressBlock_internal (dstCapacity=%u, dictLimit=%u, nextToUpdate=%u)",
+ (unsigned)dstCapacity, (unsigned)zc->blockState.matchState.window.dictLimit,
+ (unsigned)zc->blockState.matchState.nextToUpdate);
+
+ { const size_t bss = ZSTD_buildSeqStore(zc, src, srcSize);
+ FORWARD_IF_ERROR(bss, "ZSTD_buildSeqStore failed");
+ if (bss == ZSTDbss_noCompress) { cSize = 0; goto out; }
+ }
+
+ if (zc->seqCollector.collectSequences) {
+ ZSTD_copyBlockSequences(zc);
+ return 0;
+ }
+
+ /* encode sequences and literals */
+ cSize = ZSTD_compressSequences(&zc->seqStore,
+ &zc->blockState.prevCBlock->entropy, &zc->blockState.nextCBlock->entropy,
+ &zc->appliedParams,
+ dst, dstCapacity,
+ srcSize,
+ zc->entropyWorkspace, HUF_WORKSPACE_SIZE /* statically allocated in resetCCtx */,
+ zc->bmi2);
+
+ if (frame &&
+ /* We don't want to emit our first block as an RLE block even if it qualifies, because
+ * doing so will cause the decoder (cli only) to throw a "should consume all input" error.
+ * This is only an issue for zstd <= v1.4.3.
+ */
+ !zc->isFirstBlock &&
+ cSize < rleMaxLength &&
+ ZSTD_isRLE(ip, srcSize))
+ {
+ cSize = 1;
+ op[0] = ip[0];
+ }
+
+out:
+ if (!ZSTD_isError(cSize) && cSize > 1) {
+ ZSTD_confirmRepcodesAndEntropyTables(zc);
+ }
+ /* We check that dictionaries have offset codes available for the first
+ * block. After the first block, the offcode table might not have large
+ * enough codes to represent the offsets in the data.
+ */
+ if (zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode == FSE_repeat_valid)
+ zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode = FSE_repeat_check;
+
+ return cSize;
+}
+
+static size_t ZSTD_compressBlock_targetCBlockSize_body(ZSTD_CCtx* zc,
+ void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize,
+ const size_t bss, U32 lastBlock)
+{
+ DEBUGLOG(6, "Attempting ZSTD_compressSuperBlock()");
+ if (bss == ZSTDbss_compress) {
+ if (/* We don't want to emit our first block as an RLE block even if it qualifies, because
+ * doing so will cause the decoder (cli only) to throw a "should consume all input" error.
+ * This is only an issue for zstd <= v1.4.3.
+ */
+ !zc->isFirstBlock &&
+ ZSTD_maybeRLE(&zc->seqStore) &&
+ ZSTD_isRLE((BYTE const*)src, srcSize))
+ {
+ return ZSTD_rleCompressBlock(dst, dstCapacity, *(BYTE const*)src, srcSize, lastBlock);
+ }
+ /* Attempt superblock compression.
+ *
+ * Note that compressed size of ZSTD_compressSuperBlock() is not bound by the
+ * standard ZSTD_compressBound(). This is a problem, because even if we have
+ * space now, taking an extra byte now could cause us to run out of space later
+ * and violate ZSTD_compressBound().
+ *
+ * Define blockBound(blockSize) = blockSize + ZSTD_blockHeaderSize.
+ *
+ * In order to respect ZSTD_compressBound() we must attempt to emit a raw
+ * uncompressed block in these cases:
+ * * cSize == 0: Return code for an uncompressed block.
+ * * cSize == dstSize_tooSmall: We may have expanded beyond blockBound(srcSize).
+ * ZSTD_noCompressBlock() will return dstSize_tooSmall if we are really out of
+ * output space.
+ * * cSize >= blockBound(srcSize): We have expanded the block too much so
+ * emit an uncompressed block.
+ */
+ {
+ size_t const cSize = ZSTD_compressSuperBlock(zc, dst, dstCapacity, src, srcSize, lastBlock);
+ if (cSize != ERROR(dstSize_tooSmall)) {
+ size_t const maxCSize = srcSize - ZSTD_minGain(srcSize, zc->appliedParams.cParams.strategy);
+ FORWARD_IF_ERROR(cSize, "ZSTD_compressSuperBlock failed");
+ if (cSize != 0 && cSize < maxCSize + ZSTD_blockHeaderSize) {
+ ZSTD_confirmRepcodesAndEntropyTables(zc);
+ return cSize;
+ }
+ }
+ }
+ }
+
+ DEBUGLOG(6, "Resorting to ZSTD_noCompressBlock()");
+ /* Superblock compression failed; attempt to emit a single uncompressed (no-compress) block.
+ * The decoder will be able to stream this block since it is uncompressed.
+ */
+ return ZSTD_noCompressBlock(dst, dstCapacity, src, srcSize, lastBlock);
+}
+
+static size_t ZSTD_compressBlock_targetCBlockSize(ZSTD_CCtx* zc,
+ void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize,
+ U32 lastBlock)
+{
+ size_t cSize = 0;
+ const size_t bss = ZSTD_buildSeqStore(zc, src, srcSize);
+ DEBUGLOG(5, "ZSTD_compressBlock_targetCBlockSize (dstCapacity=%u, dictLimit=%u, nextToUpdate=%u, srcSize=%zu)",
+ (unsigned)dstCapacity, (unsigned)zc->blockState.matchState.window.dictLimit, (unsigned)zc->blockState.matchState.nextToUpdate, srcSize);
+ FORWARD_IF_ERROR(bss, "ZSTD_buildSeqStore failed");
+
+ cSize = ZSTD_compressBlock_targetCBlockSize_body(zc, dst, dstCapacity, src, srcSize, bss, lastBlock);
+ FORWARD_IF_ERROR(cSize, "ZSTD_compressBlock_targetCBlockSize_body failed");
+
+ if (zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode == FSE_repeat_valid)
+ zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode = FSE_repeat_check;
+
+ return cSize;
+}
+
+static void ZSTD_overflowCorrectIfNeeded(ZSTD_matchState_t* ms,
+ ZSTD_cwksp* ws,
+ ZSTD_CCtx_params const* params,
+ void const* ip,
+ void const* iend)
+{
+ if (ZSTD_window_needOverflowCorrection(ms->window, iend)) {
+ U32 const maxDist = (U32)1 << params->cParams.windowLog;
+ U32 const cycleLog = ZSTD_cycleLog(params->cParams.chainLog, params->cParams.strategy);
+ U32 const correction = ZSTD_window_correctOverflow(&ms->window, cycleLog, maxDist, ip);
+ ZSTD_STATIC_ASSERT(ZSTD_CHAINLOG_MAX <= 30);
+ ZSTD_STATIC_ASSERT(ZSTD_WINDOWLOG_MAX_32 <= 30);
+ ZSTD_STATIC_ASSERT(ZSTD_WINDOWLOG_MAX <= 31);
+ ZSTD_cwksp_mark_tables_dirty(ws);
+ ZSTD_reduceIndex(ms, params, correction);
+ ZSTD_cwksp_mark_tables_clean(ws);
+ if (ms->nextToUpdate < correction) ms->nextToUpdate = 0;
+ else ms->nextToUpdate -= correction;
+ /* invalidate dictionaries on overflow correction */
+ ms->loadedDictEnd = 0;
+ ms->dictMatchState = NULL;
+ }
+}
+
+/*! ZSTD_compress_frameChunk() :
+* Compress a chunk of data into one or multiple blocks.
+* All blocks will be terminated, all input will be consumed.
+* The function will issue an error if there is not enough `dstCapacity` to hold the compressed content.
+* The frame is supposed to be already started (header already produced).
+* @return : compressed size, or an error code
+*/
+static size_t ZSTD_compress_frameChunk (ZSTD_CCtx* cctx,
+ void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize,
+ U32 lastFrameChunk)
+{
+ size_t blockSize = cctx->blockSize;
+ size_t remaining = srcSize;
+ const BYTE* ip = (const BYTE*)src;
+ BYTE* const ostart = (BYTE*)dst;
+ BYTE* op = ostart;
+ U32 const maxDist = (U32)1 << cctx->appliedParams.cParams.windowLog;
+
+ assert(cctx->appliedParams.cParams.windowLog <= ZSTD_WINDOWLOG_MAX);
+
+ DEBUGLOG(5, "ZSTD_compress_frameChunk (blockSize=%u)", (unsigned)blockSize);
+ if (cctx->appliedParams.fParams.checksumFlag && srcSize)
+ XXH64_update(&cctx->xxhState, src, srcSize);
+
+ while (remaining) {
+ ZSTD_matchState_t* const ms = &cctx->blockState.matchState;
+ U32 const lastBlock = lastFrameChunk & (blockSize >= remaining);
+
+ RETURN_ERROR_IF(dstCapacity < ZSTD_blockHeaderSize + MIN_CBLOCK_SIZE,
+ dstSize_tooSmall,
+ "not enough space to store compressed block");
+ if (remaining < blockSize) blockSize = remaining;
+
+ ZSTD_overflowCorrectIfNeeded(
+ ms, &cctx->workspace, &cctx->appliedParams, ip, ip + blockSize);
+ ZSTD_checkDictValidity(&ms->window, ip + blockSize, maxDist, &ms->loadedDictEnd, &ms->dictMatchState);
+
+ /* Ensure hash/chain table insertion resumes no sooner than lowLimit */
+ if (ms->nextToUpdate < ms->window.lowLimit) ms->nextToUpdate = ms->window.lowLimit;
+
+ { size_t cSize;
+ if (ZSTD_useTargetCBlockSize(&cctx->appliedParams)) {
+ cSize = ZSTD_compressBlock_targetCBlockSize(cctx, op, dstCapacity, ip, blockSize, lastBlock);
+ FORWARD_IF_ERROR(cSize, "ZSTD_compressBlock_targetCBlockSize failed");
+ assert(cSize > 0);
+ assert(cSize <= blockSize + ZSTD_blockHeaderSize);
+ } else {
+ cSize = ZSTD_compressBlock_internal(cctx,
+ op+ZSTD_blockHeaderSize, dstCapacity-ZSTD_blockHeaderSize,
+ ip, blockSize, 1 /* frame */);
+ FORWARD_IF_ERROR(cSize, "ZSTD_compressBlock_internal failed");
+
+ if (cSize == 0) { /* block is not compressible */
+ cSize = ZSTD_noCompressBlock(op, dstCapacity, ip, blockSize, lastBlock);
+ FORWARD_IF_ERROR(cSize, "ZSTD_noCompressBlock failed");
+ } else {
+ U32 const cBlockHeader = cSize == 1 ?
+ lastBlock + (((U32)bt_rle)<<1) + (U32)(blockSize << 3) :
+ lastBlock + (((U32)bt_compressed)<<1) + (U32)(cSize << 3);
+ MEM_writeLE24(op, cBlockHeader);
+ cSize += ZSTD_blockHeaderSize;
+ }
+ }
+
+
+ ip += blockSize;
+ assert(remaining >= blockSize);
+ remaining -= blockSize;
+ op += cSize;
+ assert(dstCapacity >= cSize);
+ dstCapacity -= cSize;
+ cctx->isFirstBlock = 0;
+ DEBUGLOG(5, "ZSTD_compress_frameChunk: adding a block of size %u",
+ (unsigned)cSize);
+ } }
+
+ if (lastFrameChunk && (op>ostart)) cctx->stage = ZSTDcs_ending;
+ return (size_t)(op-ostart);
+}
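+
+/* Block header layout, as assembled above :
+ *   cBlockHeader = lastBlock + (blockType << 1) + (blockSize_or_cSize << 3)
+ * written as 3 little-endian bytes via MEM_writeLE24().
+ * Worked example (non-last compressed block, cSize = 100) :
+ *   0 + (bt_compressed<<1) + (100<<3) = 804 = 0x000324  =>  bytes 24 03 00
+ */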
+
+
+static size_t ZSTD_writeFrameHeader(void* dst, size_t dstCapacity,
+ const ZSTD_CCtx_params* params, U64 pledgedSrcSize, U32 dictID)
+{ BYTE* const op = (BYTE*)dst;
+ U32 const dictIDSizeCodeLength = (dictID>0) + (dictID>=256) + (dictID>=65536); /* 0-3 */
+ U32 const dictIDSizeCode = params->fParams.noDictIDFlag ? 0 : dictIDSizeCodeLength; /* 0-3 */
+ U32 const checksumFlag = params->fParams.checksumFlag>0;
+ U32 const windowSize = (U32)1 << params->cParams.windowLog;
+ U32 const singleSegment = params->fParams.contentSizeFlag && (windowSize >= pledgedSrcSize);
+ BYTE const windowLogByte = (BYTE)((params->cParams.windowLog - ZSTD_WINDOWLOG_ABSOLUTEMIN) << 3);
+ U32 const fcsCode = params->fParams.contentSizeFlag ?
+ (pledgedSrcSize>=256) + (pledgedSrcSize>=65536+256) + (pledgedSrcSize>=0xFFFFFFFFU) : 0; /* 0-3 */
+ BYTE const frameHeaderDescriptionByte = (BYTE)(dictIDSizeCode + (checksumFlag<<2) + (singleSegment<<5) + (fcsCode<<6) );
+ size_t pos=0;
+
+ assert(!(params->fParams.contentSizeFlag && pledgedSrcSize == ZSTD_CONTENTSIZE_UNKNOWN));
+ RETURN_ERROR_IF(dstCapacity < ZSTD_FRAMEHEADERSIZE_MAX, dstSize_tooSmall,
+ "dst buf is too small to fit worst-case frame header size.");
+ DEBUGLOG(4, "ZSTD_writeFrameHeader : dictIDFlag : %u ; dictID : %u ; dictIDSizeCode : %u",
+ !params->fParams.noDictIDFlag, (unsigned)dictID, (unsigned)dictIDSizeCode);
+
+ if (params->format == ZSTD_f_zstd1) {
+ MEM_writeLE32(dst, ZSTD_MAGICNUMBER);
+ pos = 4;
+ }
+ op[pos++] = frameHeaderDescriptionByte;
+ if (!singleSegment) op[pos++] = windowLogByte;
+ switch(dictIDSizeCode)
+ {
+ default: assert(0); /* impossible */
+ case 0 : break;
+ case 1 : op[pos] = (BYTE)(dictID); pos++; break;
+ case 2 : MEM_writeLE16(op+pos, (U16)dictID); pos+=2; break;
+ case 3 : MEM_writeLE32(op+pos, dictID); pos+=4; break;
+ }
+ switch(fcsCode)
+ {
+ default: assert(0); /* impossible */
+ case 0 : if (singleSegment) op[pos++] = (BYTE)(pledgedSrcSize); break;
+ case 1 : MEM_writeLE16(op+pos, (U16)(pledgedSrcSize-256)); pos+=2; break;
+ case 2 : MEM_writeLE32(op+pos, (U32)(pledgedSrcSize)); pos+=4; break;
+ case 3 : MEM_writeLE64(op+pos, (U64)(pledgedSrcSize)); pos+=8; break;
+ }
+ return pos;
+}
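+
+/* Frame_Header_Descriptor byte, as assembled above :
+ *   bits 0-1 : dictIDSizeCode   (0-3, size of the dictID field)
+ *   bit  2   : checksumFlag
+ *   bit  5   : singleSegment    (no separate windowLog byte when set)
+ *   bits 6-7 : fcsCode          (0-3, size of the frame content size field)
+ * e.g. checksum enabled, no dictID, not single-segment, fcsCode==2 :
+ *   0 + (1<<2) + (0<<5) + (2<<6) = 0x84
+ */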
+
+/* ZSTD_writeLastEmptyBlock() :
+ * output an empty Block with end-of-frame mark to complete a frame
+ * @return : size of data written into `dst` (== ZSTD_blockHeaderSize (defined in zstd_internal.h))
+ * or an error code if `dstCapacity` is too small (<ZSTD_blockHeaderSize)
+ */
+size_t ZSTD_writeLastEmptyBlock(void* dst, size_t dstCapacity)
+{
+ RETURN_ERROR_IF(dstCapacity < ZSTD_blockHeaderSize, dstSize_tooSmall,
+ "dst buf is too small to write frame trailer empty block.");
+ { U32 const cBlockHeader24 = 1 /*lastBlock*/ + (((U32)bt_raw)<<1); /* 0 size */
+ MEM_writeLE24(dst, cBlockHeader24);
+ return ZSTD_blockHeaderSize;
+ }
+}
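+
+/* Worked example : with bt_raw==0 and a zero size field, cBlockHeader24 == 1,
+ * so the three bytes emitted by MEM_writeLE24() are 01 00 00
+ * (last-block bit set, raw block type, zero length). */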
+
+size_t ZSTD_referenceExternalSequences(ZSTD_CCtx* cctx, rawSeq* seq, size_t nbSeq)
+{
+ RETURN_ERROR_IF(cctx->stage != ZSTDcs_init, stage_wrong,
+ "wrong cctx stage");
+ RETURN_ERROR_IF(cctx->appliedParams.ldmParams.enableLdm,
+ parameter_unsupported,
+ "incompatible with ldm");
+ cctx->externSeqStore.seq = seq;
+ cctx->externSeqStore.size = nbSeq;
+ cctx->externSeqStore.capacity = nbSeq;
+ cctx->externSeqStore.pos = 0;
+ return 0;
+}
+
+
+static size_t ZSTD_compressContinue_internal (ZSTD_CCtx* cctx,
+ void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize,
+ U32 frame, U32 lastFrameChunk)
+{
+ ZSTD_matchState_t* const ms = &cctx->blockState.matchState;
+ size_t fhSize = 0;
+
+ DEBUGLOG(5, "ZSTD_compressContinue_internal, stage: %u, srcSize: %u",
+ cctx->stage, (unsigned)srcSize);
+ RETURN_ERROR_IF(cctx->stage==ZSTDcs_created, stage_wrong,
+ "missing init (ZSTD_compressBegin)");
+
+ if (frame && (cctx->stage==ZSTDcs_init)) {
+ fhSize = ZSTD_writeFrameHeader(dst, dstCapacity, &cctx->appliedParams,
+ cctx->pledgedSrcSizePlusOne-1, cctx->dictID);
+ FORWARD_IF_ERROR(fhSize, "ZSTD_writeFrameHeader failed");
+ assert(fhSize <= dstCapacity);
+ dstCapacity -= fhSize;
+ dst = (char*)dst + fhSize;
+ cctx->stage = ZSTDcs_ongoing;
+ }
+
+ if (!srcSize) return fhSize; /* do not generate an empty block if no input */
+
+ if (!ZSTD_window_update(&ms->window, src, srcSize)) {
+ ms->nextToUpdate = ms->window.dictLimit;
+ }
+ if (cctx->appliedParams.ldmParams.enableLdm) {
+ ZSTD_window_update(&cctx->ldmState.window, src, srcSize);
+ }
+
+ if (!frame) {
+ /* overflow check and correction for block mode */
+ ZSTD_overflowCorrectIfNeeded(
+ ms, &cctx->workspace, &cctx->appliedParams,
+ src, (BYTE const*)src + srcSize);
+ }
+
+ DEBUGLOG(5, "ZSTD_compressContinue_internal (blockSize=%u)", (unsigned)cctx->blockSize);
+ { size_t const cSize = frame ?
+ ZSTD_compress_frameChunk (cctx, dst, dstCapacity, src, srcSize, lastFrameChunk) :
+ ZSTD_compressBlock_internal (cctx, dst, dstCapacity, src, srcSize, 0 /* frame */);
+ FORWARD_IF_ERROR(cSize, "%s", frame ? "ZSTD_compress_frameChunk failed" : "ZSTD_compressBlock_internal failed");
+ cctx->consumedSrcSize += srcSize;
+ cctx->producedCSize += (cSize + fhSize);
+ assert(!(cctx->appliedParams.fParams.contentSizeFlag && cctx->pledgedSrcSizePlusOne == 0));
+ if (cctx->pledgedSrcSizePlusOne != 0) { /* control src size */
+ ZSTD_STATIC_ASSERT(ZSTD_CONTENTSIZE_UNKNOWN == (unsigned long long)-1);
+ RETURN_ERROR_IF(
+ cctx->consumedSrcSize+1 > cctx->pledgedSrcSizePlusOne,
+ srcSize_wrong,
+ "error : pledgedSrcSize = %u, while realSrcSize >= %u",
+ (unsigned)cctx->pledgedSrcSizePlusOne-1,
+ (unsigned)cctx->consumedSrcSize);
+ }
+ return cSize + fhSize;
+ }
+}
+
+size_t ZSTD_compressContinue (ZSTD_CCtx* cctx,
+ void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize)
+{
+ DEBUGLOG(5, "ZSTD_compressContinue (srcSize=%u)", (unsigned)srcSize);
+ return ZSTD_compressContinue_internal(cctx, dst, dstCapacity, src, srcSize, 1 /* frame mode */, 0 /* last chunk */);
+}
+
+
+size_t ZSTD_getBlockSize(const ZSTD_CCtx* cctx)
+{
+ ZSTD_compressionParameters const cParams = cctx->appliedParams.cParams;
+ assert(!ZSTD_checkCParams(cParams));
+ return MIN (ZSTD_BLOCKSIZE_MAX, (U32)1 << cParams.windowLog);
+}
+
+size_t ZSTD_compressBlock(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize)
+{
+ DEBUGLOG(5, "ZSTD_compressBlock: srcSize = %u", (unsigned)srcSize);
+ { size_t const blockSizeMax = ZSTD_getBlockSize(cctx);
+ RETURN_ERROR_IF(srcSize > blockSizeMax, srcSize_wrong, "input is larger than a block"); }
+
+ return ZSTD_compressContinue_internal(cctx, dst, dstCapacity, src, srcSize, 0 /* frame mode */, 0 /* last chunk */);
+}
+
+/*! ZSTD_loadDictionaryContent() :
+ * @return : 0, or an error code
+ */
+static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms,
+ ldmState_t* ls,
+ ZSTD_cwksp* ws,
+ ZSTD_CCtx_params const* params,
+ const void* src, size_t srcSize,
+ ZSTD_dictTableLoadMethod_e dtlm)
+{
+ const BYTE* ip = (const BYTE*) src;
+ const BYTE* const iend = ip + srcSize;
+
+ ZSTD_window_update(&ms->window, src, srcSize);
+ ms->loadedDictEnd = params->forceWindow ? 0 : (U32)(iend - ms->window.base);
+
+ if (params->ldmParams.enableLdm && ls != NULL) {
+ ZSTD_window_update(&ls->window, src, srcSize);
+ ls->loadedDictEnd = params->forceWindow ? 0 : (U32)(iend - ls->window.base);
+ }
+
+ /* Assert that the ms params match the params we're being given */
+ ZSTD_assertEqualCParams(params->cParams, ms->cParams);
+
+ if (srcSize <= HASH_READ_SIZE) return 0;
+
+ while (iend - ip > HASH_READ_SIZE) {
+ size_t const remaining = (size_t)(iend - ip);
+ size_t const chunk = MIN(remaining, ZSTD_CHUNKSIZE_MAX);
+ const BYTE* const ichunk = ip + chunk;
+
+ ZSTD_overflowCorrectIfNeeded(ms, ws, params, ip, ichunk);
+
+ if (params->ldmParams.enableLdm && ls != NULL)
+ ZSTD_ldm_fillHashTable(ls, (const BYTE*)src, (const BYTE*)src + srcSize, &params->ldmParams);
+
+ switch(params->cParams.strategy)
+ {
+ case ZSTD_fast:
+ ZSTD_fillHashTable(ms, ichunk, dtlm);
+ break;
+ case ZSTD_dfast:
+ ZSTD_fillDoubleHashTable(ms, ichunk, dtlm);
+ break;
+
+ case ZSTD_greedy:
+ case ZSTD_lazy:
+ case ZSTD_lazy2:
+ if (chunk >= HASH_READ_SIZE)
+ ZSTD_insertAndFindFirstIndex(ms, ichunk-HASH_READ_SIZE);
+ break;
+
+ case ZSTD_btlazy2: /* we want the dictionary table fully sorted */
+ case ZSTD_btopt:
+ case ZSTD_btultra:
+ case ZSTD_btultra2:
+ if (chunk >= HASH_READ_SIZE)
+ ZSTD_updateTree(ms, ichunk-HASH_READ_SIZE, ichunk);
+ break;
+
+ default:
+ assert(0); /* not possible : not a valid strategy id */
+ }
+
+ ip = ichunk;
+ }
+
+ ms->nextToUpdate = (U32)(iend - ms->window.base);
+ return 0;
+}
+
+
+/* Dictionaries that assign zero probability to symbols that show up cause problems
+ during FSE encoding. Refuse dictionaries that assign zero probability to symbols
+ that we may encounter during compression.
+ NOTE: This behavior is not standard and could be improved in the future. */
+static size_t ZSTD_checkDictNCount(short* normalizedCounter, unsigned dictMaxSymbolValue, unsigned maxSymbolValue) {
+ U32 s;
+ RETURN_ERROR_IF(dictMaxSymbolValue < maxSymbolValue, dictionary_corrupted, "dict fse tables don't have all symbols");
+ for (s = 0; s <= maxSymbolValue; ++s) {
+ RETURN_ERROR_IF(normalizedCounter[s] == 0, dictionary_corrupted, "dict fse tables don't have all symbols");
+ }
+ return 0;
+}
+
+size_t ZSTD_loadCEntropy(ZSTD_compressedBlockState_t* bs, void* workspace,
+ short* offcodeNCount, unsigned* offcodeMaxValue,
+ const void* const dict, size_t dictSize)
+{
+ const BYTE* dictPtr = (const BYTE*)dict; /* skip magic num and dict ID */
+ const BYTE* const dictEnd = dictPtr + dictSize;
+ dictPtr += 8;
+ bs->entropy.huf.repeatMode = HUF_repeat_check;
+
+ { unsigned maxSymbolValue = 255;
+ unsigned hasZeroWeights = 1;
+ size_t const hufHeaderSize = HUF_readCTable((HUF_CElt*)bs->entropy.huf.CTable, &maxSymbolValue, dictPtr,
+ dictEnd-dictPtr, &hasZeroWeights);
+
+ /* We only set the loaded table as valid if it contains all non-zero
+ * weights. Otherwise, we leave it in "check" mode. */
+ if (!hasZeroWeights)
+ bs->entropy.huf.repeatMode = HUF_repeat_valid;
+
+ RETURN_ERROR_IF(HUF_isError(hufHeaderSize), dictionary_corrupted, "");
+ RETURN_ERROR_IF(maxSymbolValue < 255, dictionary_corrupted, "");
+ dictPtr += hufHeaderSize;
+ }
+
+ { unsigned offcodeLog;
+ size_t const offcodeHeaderSize = FSE_readNCount(offcodeNCount, offcodeMaxValue, &offcodeLog, dictPtr, dictEnd-dictPtr);
+ RETURN_ERROR_IF(FSE_isError(offcodeHeaderSize), dictionary_corrupted, "");
+ RETURN_ERROR_IF(offcodeLog > OffFSELog, dictionary_corrupted, "");
+ /* Defer checking offcodeMaxValue because we need to know the size of the dictionary content */
+ /* fill all offset symbols to avoid garbage at end of table */
+ RETURN_ERROR_IF(FSE_isError(FSE_buildCTable_wksp(
+ bs->entropy.fse.offcodeCTable,
+ offcodeNCount, MaxOff, offcodeLog,
+ workspace, HUF_WORKSPACE_SIZE)),
+ dictionary_corrupted, "");
+ dictPtr += offcodeHeaderSize;
+ }
+
+ { short matchlengthNCount[MaxML+1];
+ unsigned matchlengthMaxValue = MaxML, matchlengthLog;
+ size_t const matchlengthHeaderSize = FSE_readNCount(matchlengthNCount, &matchlengthMaxValue, &matchlengthLog, dictPtr, dictEnd-dictPtr);
+ RETURN_ERROR_IF(FSE_isError(matchlengthHeaderSize), dictionary_corrupted, "");
+ RETURN_ERROR_IF(matchlengthLog > MLFSELog, dictionary_corrupted, "");
+ /* Every match length code must have non-zero probability */
+ FORWARD_IF_ERROR( ZSTD_checkDictNCount(matchlengthNCount, matchlengthMaxValue, MaxML), "");
+ RETURN_ERROR_IF(FSE_isError(FSE_buildCTable_wksp(
+ bs->entropy.fse.matchlengthCTable,
+ matchlengthNCount, matchlengthMaxValue, matchlengthLog,
+ workspace, HUF_WORKSPACE_SIZE)),
+ dictionary_corrupted, "");
+ dictPtr += matchlengthHeaderSize;
+ }
+
+ { short litlengthNCount[MaxLL+1];
+ unsigned litlengthMaxValue = MaxLL, litlengthLog;
+ size_t const litlengthHeaderSize = FSE_readNCount(litlengthNCount, &litlengthMaxValue, &litlengthLog, dictPtr, dictEnd-dictPtr);
+ RETURN_ERROR_IF(FSE_isError(litlengthHeaderSize), dictionary_corrupted, "");
+ RETURN_ERROR_IF(litlengthLog > LLFSELog, dictionary_corrupted, "");
+ /* Every literal length code must have non-zero probability */
+ FORWARD_IF_ERROR( ZSTD_checkDictNCount(litlengthNCount, litlengthMaxValue, MaxLL), "");
+ RETURN_ERROR_IF(FSE_isError(FSE_buildCTable_wksp(
+ bs->entropy.fse.litlengthCTable,
+ litlengthNCount, litlengthMaxValue, litlengthLog,
+ workspace, HUF_WORKSPACE_SIZE)),
+ dictionary_corrupted, "");
+ dictPtr += litlengthHeaderSize;
+ }
+
+ RETURN_ERROR_IF(dictPtr+12 > dictEnd, dictionary_corrupted, "");
+ bs->rep[0] = MEM_readLE32(dictPtr+0);
+ bs->rep[1] = MEM_readLE32(dictPtr+4);
+ bs->rep[2] = MEM_readLE32(dictPtr+8);
+ dictPtr += 12;
+
+ return dictPtr - (const BYTE*)dict;
+}
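+
+/* Layout consumed by ZSTD_loadCEntropy() above, after the 8 bytes of magic
+ * number + dictID are skipped :
+ *   [ Huffman literals table (HUF_readCTable) ]
+ *   [ FSE offcode table header ]
+ *   [ FSE matchlength table header ]
+ *   [ FSE litlength table header ]
+ *   [ 3 x 4-byte little-endian repcodes ]
+ * Whatever follows is the raw dictionary content, loaded separately by
+ * ZSTD_loadZstdDictionary() below. */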
+
+/* Dictionary format :
+ * See :
+ * https://github.com/facebook/zstd/blob/master/doc/zstd_compression_format.md#dictionary-format
+ */
+/*! ZSTD_loadZstdDictionary() :
+ * @return : dictID, or an error code
+ * assumptions : magic number is supposed to be already checked
+ * dictSize is supposed to be >= 8
+ */
+static size_t ZSTD_loadZstdDictionary(ZSTD_compressedBlockState_t* bs,
+ ZSTD_matchState_t* ms,
+ ZSTD_cwksp* ws,
+ ZSTD_CCtx_params const* params,
+ const void* dict, size_t dictSize,
+ ZSTD_dictTableLoadMethod_e dtlm,
+ void* workspace)
+{
+ const BYTE* dictPtr = (const BYTE*)dict;
+ const BYTE* const dictEnd = dictPtr + dictSize;
+ short offcodeNCount[MaxOff+1];
+ unsigned offcodeMaxValue = MaxOff;
+ size_t dictID;
+ size_t eSize;
+
+ ZSTD_STATIC_ASSERT(HUF_WORKSPACE_SIZE >= (1<<MAX(MLFSELog,LLFSELog)));
+ assert(dictSize >= 8);
+ assert(MEM_readLE32(dictPtr) == ZSTD_MAGIC_DICTIONARY);
+
+ dictID = params->fParams.noDictIDFlag ? 0 : MEM_readLE32(dictPtr + 4 /* skip magic number */ );
+ eSize = ZSTD_loadCEntropy(bs, workspace, offcodeNCount, &offcodeMaxValue, dict, dictSize);
+ FORWARD_IF_ERROR(eSize, "ZSTD_loadCEntropy failed");
+ dictPtr += eSize;
+
+ { size_t const dictContentSize = (size_t)(dictEnd - dictPtr);
+ U32 offcodeMax = MaxOff;
+ if (dictContentSize <= ((U32)-1) - 128 KB) {
+ U32 const maxOffset = (U32)dictContentSize + 128 KB; /* The maximum offset that must be supported */
+ offcodeMax = ZSTD_highbit32(maxOffset); /* Calculate minimum offset code required to represent maxOffset */
+ }
+ /* All offset values <= dictContentSize + 128 KB must be representable */
+ FORWARD_IF_ERROR(ZSTD_checkDictNCount(offcodeNCount, offcodeMaxValue, MIN(offcodeMax, MaxOff)), "");
+ /* All repCodes must be <= dictContentSize and != 0*/
+ { U32 u;
+ for (u=0; u<3; u++) {
+ RETURN_ERROR_IF(bs->rep[u] == 0, dictionary_corrupted, "");
+ RETURN_ERROR_IF(bs->rep[u] > dictContentSize, dictionary_corrupted, "");
+ } }
+
+ bs->entropy.fse.offcode_repeatMode = FSE_repeat_valid;
+ bs->entropy.fse.matchlength_repeatMode = FSE_repeat_valid;
+ bs->entropy.fse.litlength_repeatMode = FSE_repeat_valid;
+ FORWARD_IF_ERROR(ZSTD_loadDictionaryContent(
+ ms, NULL, ws, params, dictPtr, dictContentSize, dtlm), "");
+ return dictID;
+ }
+}
+
+/** ZSTD_compress_insertDictionary() :
+* @return : dictID, or an error code */
+static size_t
+ZSTD_compress_insertDictionary(ZSTD_compressedBlockState_t* bs,
+ ZSTD_matchState_t* ms,
+ ldmState_t* ls,
+ ZSTD_cwksp* ws,
+ const ZSTD_CCtx_params* params,
+ const void* dict, size_t dictSize,
+ ZSTD_dictContentType_e dictContentType,
+ ZSTD_dictTableLoadMethod_e dtlm,
+ void* workspace)
+{
+ DEBUGLOG(4, "ZSTD_compress_insertDictionary (dictSize=%u)", (U32)dictSize);
+ if ((dict==NULL) || (dictSize<8)) {
+ RETURN_ERROR_IF(dictContentType == ZSTD_dct_fullDict, dictionary_wrong, "");
+ return 0;
+ }
+
+ ZSTD_reset_compressedBlockState(bs);
+
+ /* dict restricted modes */
+ if (dictContentType == ZSTD_dct_rawContent)
+ return ZSTD_loadDictionaryContent(ms, ls, ws, params, dict, dictSize, dtlm);
+
+ if (MEM_readLE32(dict) != ZSTD_MAGIC_DICTIONARY) {
+ if (dictContentType == ZSTD_dct_auto) {
+ DEBUGLOG(4, "raw content dictionary detected");
+ return ZSTD_loadDictionaryContent(
+ ms, ls, ws, params, dict, dictSize, dtlm);
+ }
+ RETURN_ERROR_IF(dictContentType == ZSTD_dct_fullDict, dictionary_wrong, "");
+ assert(0); /* impossible */
+ }
+
+ /* dict as full zstd dictionary */
+ return ZSTD_loadZstdDictionary(
+ bs, ms, ws, params, dict, dictSize, dtlm, workspace);
+}
+
+#define ZSTD_USE_CDICT_PARAMS_SRCSIZE_CUTOFF (128 KB)
+#define ZSTD_USE_CDICT_PARAMS_DICTSIZE_MULTIPLIER (6)
+
+/*! ZSTD_compressBegin_internal() :
+ * @return : 0, or an error code */
+static size_t ZSTD_compressBegin_internal(ZSTD_CCtx* cctx,
+ const void* dict, size_t dictSize,
+ ZSTD_dictContentType_e dictContentType,
+ ZSTD_dictTableLoadMethod_e dtlm,
+ const ZSTD_CDict* cdict,
+ const ZSTD_CCtx_params* params, U64 pledgedSrcSize,
+ ZSTD_buffered_policy_e zbuff)
+{
+ DEBUGLOG(4, "ZSTD_compressBegin_internal: wlog=%u", params->cParams.windowLog);
+ /* params are supposed to be fully validated at this point */
+ assert(!ZSTD_isError(ZSTD_checkCParams(params->cParams)));
+ assert(!((dict) && (cdict))); /* either dict or cdict, not both */
+ if ( (cdict)
+ && (cdict->dictContentSize > 0)
+ && ( pledgedSrcSize < ZSTD_USE_CDICT_PARAMS_SRCSIZE_CUTOFF
+ || pledgedSrcSize < cdict->dictContentSize * ZSTD_USE_CDICT_PARAMS_DICTSIZE_MULTIPLIER
+ || pledgedSrcSize == ZSTD_CONTENTSIZE_UNKNOWN
+ || cdict->compressionLevel == 0)
+ && (params->attachDictPref != ZSTD_dictForceLoad) ) {
+ return ZSTD_resetCCtx_usingCDict(cctx, cdict, params, pledgedSrcSize, zbuff);
+ }
+
+ FORWARD_IF_ERROR( ZSTD_resetCCtx_internal(cctx, *params, pledgedSrcSize,
+ ZSTDcrp_makeClean, zbuff) , "");
+ { size_t const dictID = cdict ?
+ ZSTD_compress_insertDictionary(
+ cctx->blockState.prevCBlock, &cctx->blockState.matchState,
+ &cctx->ldmState, &cctx->workspace, &cctx->appliedParams, cdict->dictContent,
+ cdict->dictContentSize, dictContentType, dtlm,
+ cctx->entropyWorkspace)
+ : ZSTD_compress_insertDictionary(
+ cctx->blockState.prevCBlock, &cctx->blockState.matchState,
+ &cctx->ldmState, &cctx->workspace, &cctx->appliedParams, dict, dictSize,
+ dictContentType, dtlm, cctx->entropyWorkspace);
+ FORWARD_IF_ERROR(dictID, "ZSTD_compress_insertDictionary failed");
+ assert(dictID <= UINT_MAX);
+ cctx->dictID = (U32)dictID;
+ }
+ return 0;
+}
+
+size_t ZSTD_compressBegin_advanced_internal(ZSTD_CCtx* cctx,
+ const void* dict, size_t dictSize,
+ ZSTD_dictContentType_e dictContentType,
+ ZSTD_dictTableLoadMethod_e dtlm,
+ const ZSTD_CDict* cdict,
+ const ZSTD_CCtx_params* params,
+ unsigned long long pledgedSrcSize)
+{
+ DEBUGLOG(4, "ZSTD_compressBegin_advanced_internal: wlog=%u", params->cParams.windowLog);
+ /* compression parameters verification and optimization */
+ FORWARD_IF_ERROR( ZSTD_checkCParams(params->cParams) , "");
+ return ZSTD_compressBegin_internal(cctx,
+ dict, dictSize, dictContentType, dtlm,
+ cdict,
+ params, pledgedSrcSize,
+ ZSTDb_not_buffered);
+}
+
+/*! ZSTD_compressBegin_advanced() :
+* @return : 0, or an error code */
+size_t ZSTD_compressBegin_advanced(ZSTD_CCtx* cctx,
+ const void* dict, size_t dictSize,
+ ZSTD_parameters params, unsigned long long pledgedSrcSize)
+{
+ ZSTD_CCtx_params const cctxParams =
+ ZSTD_assignParamsToCCtxParams(&cctx->requestedParams, &params);
+ return ZSTD_compressBegin_advanced_internal(cctx,
+ dict, dictSize, ZSTD_dct_auto, ZSTD_dtlm_fast,
+ NULL /*cdict*/,
+ &cctxParams, pledgedSrcSize);
+}
+
+size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel)
+{
+ ZSTD_parameters const params = ZSTD_getParams_internal(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, dictSize);
+ ZSTD_CCtx_params const cctxParams =
+ ZSTD_assignParamsToCCtxParams(&cctx->requestedParams, &params);
+ DEBUGLOG(4, "ZSTD_compressBegin_usingDict (dictSize=%u)", (unsigned)dictSize);
+ return ZSTD_compressBegin_internal(cctx, dict, dictSize, ZSTD_dct_auto, ZSTD_dtlm_fast, NULL,
+ &cctxParams, ZSTD_CONTENTSIZE_UNKNOWN, ZSTDb_not_buffered);
+}
+
+size_t ZSTD_compressBegin(ZSTD_CCtx* cctx, int compressionLevel)
+{
+ return ZSTD_compressBegin_usingDict(cctx, NULL, 0, compressionLevel);
+}
+
+
+/*! ZSTD_writeEpilogue() :
+* Ends a frame.
+* @return : nb of bytes written into dst (or an error code) */
+static size_t ZSTD_writeEpilogue(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity)
+{
+ BYTE* const ostart = (BYTE*)dst;
+ BYTE* op = ostart;
+ size_t fhSize = 0;
+
+ DEBUGLOG(4, "ZSTD_writeEpilogue");
+ RETURN_ERROR_IF(cctx->stage == ZSTDcs_created, stage_wrong, "init missing");
+
+ /* special case : empty frame */
+ if (cctx->stage == ZSTDcs_init) {
+ fhSize = ZSTD_writeFrameHeader(dst, dstCapacity, &cctx->appliedParams, 0, 0);
+ FORWARD_IF_ERROR(fhSize, "ZSTD_writeFrameHeader failed");
+ dstCapacity -= fhSize;
+ op += fhSize;
+ cctx->stage = ZSTDcs_ongoing;
+ }
+
+ if (cctx->stage != ZSTDcs_ending) {
+ /* write one last empty block, make it the "last" block */
+ U32 const cBlockHeader24 = 1 /* last block */ + (((U32)bt_raw)<<1) + 0;
+ RETURN_ERROR_IF(dstCapacity<4, dstSize_tooSmall, "no room for epilogue");
+ MEM_writeLE32(op, cBlockHeader24);
+ op += ZSTD_blockHeaderSize;
+ dstCapacity -= ZSTD_blockHeaderSize;
+ }
+
+ if (cctx->appliedParams.fParams.checksumFlag) {
+ U32 const checksum = (U32) XXH64_digest(&cctx->xxhState);
+ RETURN_ERROR_IF(dstCapacity<4, dstSize_tooSmall, "no room for checksum");
+ DEBUGLOG(4, "ZSTD_writeEpilogue: write checksum : %08X", (unsigned)checksum);
+ MEM_writeLE32(op, checksum);
+ op += 4;
+ }
+
+ cctx->stage = ZSTDcs_created; /* return to "created but no init" status */
+ return op-ostart;
+}
+
+size_t ZSTD_compressEnd (ZSTD_CCtx* cctx,
+ void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize)
+{
+ size_t endResult;
+ size_t const cSize = ZSTD_compressContinue_internal(cctx,
+ dst, dstCapacity, src, srcSize,
+ 1 /* frame mode */, 1 /* last chunk */);
+ FORWARD_IF_ERROR(cSize, "ZSTD_compressContinue_internal failed");
+ endResult = ZSTD_writeEpilogue(cctx, (char*)dst + cSize, dstCapacity-cSize);
+ FORWARD_IF_ERROR(endResult, "ZSTD_writeEpilogue failed");
+ assert(!(cctx->appliedParams.fParams.contentSizeFlag && cctx->pledgedSrcSizePlusOne == 0));
+ if (cctx->pledgedSrcSizePlusOne != 0) { /* control src size */
+ ZSTD_STATIC_ASSERT(ZSTD_CONTENTSIZE_UNKNOWN == (unsigned long long)-1);
+ DEBUGLOG(4, "end of frame : controlling src size");
+ RETURN_ERROR_IF(
+ cctx->pledgedSrcSizePlusOne != cctx->consumedSrcSize+1,
+ srcSize_wrong,
+ "error : pledgedSrcSize = %u, while realSrcSize = %u",
+ (unsigned)cctx->pledgedSrcSizePlusOne-1,
+ (unsigned)cctx->consumedSrcSize);
+ }
+ return cSize + endResult;
+}
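+
+/* Minimal sketch of a buffer-less compression round built from the functions
+ * above (error checks and the exact buffer-size contract from zstd.h are
+ * omitted; `chunk`/`chunkSize` are placeholder names) :
+ *
+ *   ZSTD_compressBegin(cctx, compressionLevel);
+ *   while (has_more_chunks)
+ *       dstPos += ZSTD_compressContinue(cctx, dst+dstPos, dstCap-dstPos, chunk, chunkSize);
+ *   dstPos += ZSTD_compressEnd(cctx, dst+dstPos, dstCap-dstPos, lastChunk, lastChunkSize);
+ */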
+
+
+static size_t ZSTD_compress_internal (ZSTD_CCtx* cctx,
+ void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize,
+ const void* dict,size_t dictSize,
+ const ZSTD_parameters* params)
+{
+ ZSTD_CCtx_params const cctxParams =
+ ZSTD_assignParamsToCCtxParams(&cctx->requestedParams, params);
+ DEBUGLOG(4, "ZSTD_compress_internal");
+ return ZSTD_compress_advanced_internal(cctx,
+ dst, dstCapacity,
+ src, srcSize,
+ dict, dictSize,
+ &cctxParams);
+}
+
+size_t ZSTD_compress_advanced (ZSTD_CCtx* cctx,
+ void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize,
+ const void* dict,size_t dictSize,
+ ZSTD_parameters params)
+{
+ DEBUGLOG(4, "ZSTD_compress_advanced");
+ FORWARD_IF_ERROR(ZSTD_checkCParams(params.cParams), "");
+ return ZSTD_compress_internal(cctx,
+ dst, dstCapacity,
+ src, srcSize,
+ dict, dictSize,
+ &params);
+}
+
+/* Internal */
+size_t ZSTD_compress_advanced_internal(
+ ZSTD_CCtx* cctx,
+ void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize,
+ const void* dict,size_t dictSize,
+ const ZSTD_CCtx_params* params)
+{
+ DEBUGLOG(4, "ZSTD_compress_advanced_internal (srcSize:%u)", (unsigned)srcSize);
+ FORWARD_IF_ERROR( ZSTD_compressBegin_internal(cctx,
+ dict, dictSize, ZSTD_dct_auto, ZSTD_dtlm_fast, NULL,
+ params, srcSize, ZSTDb_not_buffered) , "");
+ return ZSTD_compressEnd(cctx, dst, dstCapacity, src, srcSize);
+}
+
+size_t ZSTD_compress_usingDict(ZSTD_CCtx* cctx,
+ void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize,
+ const void* dict, size_t dictSize,
+ int compressionLevel)
+{
+ ZSTD_parameters const params = ZSTD_getParams_internal(compressionLevel, srcSize, dict ? dictSize : 0);
+ ZSTD_CCtx_params cctxParams = ZSTD_assignParamsToCCtxParams(&cctx->requestedParams, &params);
+ DEBUGLOG(4, "ZSTD_compress_usingDict (srcSize=%u)", (unsigned)srcSize);
+ assert(params.fParams.contentSizeFlag == 1);
+ return ZSTD_compress_advanced_internal(cctx, dst, dstCapacity, src, srcSize, dict, dictSize, &cctxParams);
+}
+
+size_t ZSTD_compressCCtx(ZSTD_CCtx* cctx,
+ void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize,
+ int compressionLevel)
+{
+ DEBUGLOG(4, "ZSTD_compressCCtx (srcSize=%u)", (unsigned)srcSize);
+ assert(cctx != NULL);
+ return ZSTD_compress_usingDict(cctx, dst, dstCapacity, src, srcSize, NULL, 0, compressionLevel);
+}
+
+size_t ZSTD_compress(void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize,
+ int compressionLevel)
+{
+ size_t result;
+ ZSTD_CCtx ctxBody;
+ ZSTD_initCCtx(&ctxBody, ZSTD_defaultCMem);
+ result = ZSTD_compressCCtx(&ctxBody, dst, dstCapacity, src, srcSize, compressionLevel);
+ ZSTD_freeCCtxContent(&ctxBody); /* can't free ctxBody itself, as it's on stack; free only heap content */
+ return result;
+}
+
+
+/* ===== Dictionary API ===== */
+
+/*! ZSTD_estimateCDictSize_advanced() :
+ * Estimate the amount of memory that will be needed to create a dictionary with the following arguments */
+size_t ZSTD_estimateCDictSize_advanced(
+ size_t dictSize, ZSTD_compressionParameters cParams,
+ ZSTD_dictLoadMethod_e dictLoadMethod)
+{
+ DEBUGLOG(5, "sizeof(ZSTD_CDict) : %u", (unsigned)sizeof(ZSTD_CDict));
+ return ZSTD_cwksp_alloc_size(sizeof(ZSTD_CDict))
+ + ZSTD_cwksp_alloc_size(HUF_WORKSPACE_SIZE)
+ + ZSTD_sizeof_matchState(&cParams, /* forCCtx */ 0)
+ + (dictLoadMethod == ZSTD_dlm_byRef ? 0
+ : ZSTD_cwksp_alloc_size(ZSTD_cwksp_align(dictSize, sizeof(void *))));
+}
+
+size_t ZSTD_estimateCDictSize(size_t dictSize, int compressionLevel)
+{
+ ZSTD_compressionParameters const cParams = ZSTD_getCParams_internal(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, dictSize);
+ return ZSTD_estimateCDictSize_advanced(dictSize, cParams, ZSTD_dlm_byCopy);
+}
+
+size_t ZSTD_sizeof_CDict(const ZSTD_CDict* cdict)
+{
+ if (cdict==NULL) return 0; /* support sizeof on NULL */
+ DEBUGLOG(5, "sizeof(*cdict) : %u", (unsigned)sizeof(*cdict));
+ /* cdict may be in the workspace */
+ return (cdict->workspace.workspace == cdict ? 0 : sizeof(*cdict))
+ + ZSTD_cwksp_sizeof(&cdict->workspace);
+}
+
+static size_t ZSTD_initCDict_internal(
+ ZSTD_CDict* cdict,
+ const void* dictBuffer, size_t dictSize,
+ ZSTD_dictLoadMethod_e dictLoadMethod,
+ ZSTD_dictContentType_e dictContentType,
+ ZSTD_compressionParameters cParams)
+{
+ DEBUGLOG(3, "ZSTD_initCDict_internal (dictContentType:%u)", (unsigned)dictContentType);
+ assert(!ZSTD_checkCParams(cParams));
+ cdict->matchState.cParams = cParams;
+ if ((dictLoadMethod == ZSTD_dlm_byRef) || (!dictBuffer) || (!dictSize)) {
+ cdict->dictContent = dictBuffer;
+ } else {
+ void *internalBuffer = ZSTD_cwksp_reserve_object(&cdict->workspace, ZSTD_cwksp_align(dictSize, sizeof(void*)));
+ RETURN_ERROR_IF(!internalBuffer, memory_allocation, "NULL pointer!");
+ cdict->dictContent = internalBuffer;
+ memcpy(internalBuffer, dictBuffer, dictSize);
+ }
+ cdict->dictContentSize = dictSize;
+
+ cdict->entropyWorkspace = (U32*)ZSTD_cwksp_reserve_object(&cdict->workspace, HUF_WORKSPACE_SIZE);
+
+
+ /* Reset the state to no dictionary */
+ ZSTD_reset_compressedBlockState(&cdict->cBlockState);
+ FORWARD_IF_ERROR(ZSTD_reset_matchState(
+ &cdict->matchState,
+ &cdict->workspace,
+ &cParams,
+ ZSTDcrp_makeClean,
+ ZSTDirp_reset,
+ ZSTD_resetTarget_CDict), "");
+ /* (Maybe) load the dictionary
+ * Skips loading the dictionary if it is < 8 bytes.
+ */
+ { ZSTD_CCtx_params params;
+ memset(&params, 0, sizeof(params));
+ params.compressionLevel = ZSTD_CLEVEL_DEFAULT;
+ params.fParams.contentSizeFlag = 1;
+ params.cParams = cParams;
+ { size_t const dictID = ZSTD_compress_insertDictionary(
+ &cdict->cBlockState, &cdict->matchState, NULL, &cdict->workspace,
+ &params, cdict->dictContent, cdict->dictContentSize,
+ dictContentType, ZSTD_dtlm_full, cdict->entropyWorkspace);
+ FORWARD_IF_ERROR(dictID, "ZSTD_compress_insertDictionary failed");
+ assert(dictID <= (size_t)(U32)-1);
+ cdict->dictID = (U32)dictID;
+ }
+ }
+
+ return 0;
+}
+
+ZSTD_CDict* ZSTD_createCDict_advanced(const void* dictBuffer, size_t dictSize,
+ ZSTD_dictLoadMethod_e dictLoadMethod,
+ ZSTD_dictContentType_e dictContentType,
+ ZSTD_compressionParameters cParams, ZSTD_customMem customMem)
+{
+ DEBUGLOG(3, "ZSTD_createCDict_advanced, mode %u", (unsigned)dictContentType);
+ if (!customMem.customAlloc ^ !customMem.customFree) return NULL;
+
+ { size_t const workspaceSize =
+ ZSTD_cwksp_alloc_size(sizeof(ZSTD_CDict)) +
+ ZSTD_cwksp_alloc_size(HUF_WORKSPACE_SIZE) +
+ ZSTD_sizeof_matchState(&cParams, /* forCCtx */ 0) +
+ (dictLoadMethod == ZSTD_dlm_byRef ? 0
+ : ZSTD_cwksp_alloc_size(ZSTD_cwksp_align(dictSize, sizeof(void*))));
+ void* const workspace = ZSTD_malloc(workspaceSize, customMem);
+ ZSTD_cwksp ws;
+ ZSTD_CDict* cdict;
+
+ if (!workspace) {
+ ZSTD_free(workspace, customMem);
+ return NULL;
+ }
+
+ ZSTD_cwksp_init(&ws, workspace, workspaceSize);
+
+ cdict = (ZSTD_CDict*)ZSTD_cwksp_reserve_object(&ws, sizeof(ZSTD_CDict));
+ assert(cdict != NULL);
+ ZSTD_cwksp_move(&cdict->workspace, &ws);
+ cdict->customMem = customMem;
+ cdict->compressionLevel = 0; /* signals advanced API usage */
+
+ if (ZSTD_isError( ZSTD_initCDict_internal(cdict,
+ dictBuffer, dictSize,
+ dictLoadMethod, dictContentType,
+ cParams) )) {
+ ZSTD_freeCDict(cdict);
+ return NULL;
+ }
+
+ return cdict;
+ }
+}
+
+ZSTD_CDict* ZSTD_createCDict(const void* dict, size_t dictSize, int compressionLevel)
+{
+ ZSTD_compressionParameters cParams = ZSTD_getCParams_internal(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, dictSize);
+ ZSTD_CDict* cdict = ZSTD_createCDict_advanced(dict, dictSize,
+ ZSTD_dlm_byCopy, ZSTD_dct_auto,
+ cParams, ZSTD_defaultCMem);
+ if (cdict)
+ cdict->compressionLevel = compressionLevel == 0 ? ZSTD_CLEVEL_DEFAULT : compressionLevel;
+ return cdict;
+}
+
+ZSTD_CDict* ZSTD_createCDict_byReference(const void* dict, size_t dictSize, int compressionLevel)
+{
+ ZSTD_compressionParameters cParams = ZSTD_getCParams_internal(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, dictSize);
+ return ZSTD_createCDict_advanced(dict, dictSize,
+ ZSTD_dlm_byRef, ZSTD_dct_auto,
+ cParams, ZSTD_defaultCMem);
+}
+
+size_t ZSTD_freeCDict(ZSTD_CDict* cdict)
+{
+ if (cdict==NULL) return 0; /* support free on NULL */
+ { ZSTD_customMem const cMem = cdict->customMem;
+ int cdictInWorkspace = ZSTD_cwksp_owns_buffer(&cdict->workspace, cdict);
+ ZSTD_cwksp_free(&cdict->workspace, cMem);
+ if (!cdictInWorkspace) {
+ ZSTD_free(cdict, cMem);
+ }
+ return 0;
+ }
+}
+
+/*! ZSTD_initStaticCDict() :
+ * Generate a digested dictionary in the provided memory area.
+ * workspace: The memory area to emplace the dictionary into.
+ * The provided pointer must be 8-byte aligned.
+ * It must outlive dictionary usage.
+ * workspaceSize: Use ZSTD_estimateCDictSize()
+ * to determine how large the workspace must be.
+ * cParams : use ZSTD_getCParams() to transform a compression level
+ * into its relevant cParams.
+ * @return : pointer to the ZSTD_CDict, or NULL if error (size too small)
+ * Note : there is no corresponding "free" function.
+ * Since the workspace was allocated externally, it must be freed externally.
+ */
+const ZSTD_CDict* ZSTD_initStaticCDict(
+ void* workspace, size_t workspaceSize,
+ const void* dict, size_t dictSize,
+ ZSTD_dictLoadMethod_e dictLoadMethod,
+ ZSTD_dictContentType_e dictContentType,
+ ZSTD_compressionParameters cParams)
+{
+ size_t const matchStateSize = ZSTD_sizeof_matchState(&cParams, /* forCCtx */ 0);
+ size_t const neededSize = ZSTD_cwksp_alloc_size(sizeof(ZSTD_CDict))
+ + (dictLoadMethod == ZSTD_dlm_byRef ? 0
+ : ZSTD_cwksp_alloc_size(ZSTD_cwksp_align(dictSize, sizeof(void*))))
+ + ZSTD_cwksp_alloc_size(HUF_WORKSPACE_SIZE)
+ + matchStateSize;
+ ZSTD_CDict* cdict;
+
+ if ((size_t)workspace & 7) return NULL; /* 8-aligned */
+
+ {
+ ZSTD_cwksp ws;
+ ZSTD_cwksp_init(&ws, workspace, workspaceSize);
+ cdict = (ZSTD_CDict*)ZSTD_cwksp_reserve_object(&ws, sizeof(ZSTD_CDict));
+ if (cdict == NULL) return NULL;
+ ZSTD_cwksp_move(&cdict->workspace, &ws);
+ }
+
+ DEBUGLOG(4, "(workspaceSize < neededSize) : (%u < %u) => %u",
+ (unsigned)workspaceSize, (unsigned)neededSize, (unsigned)(workspaceSize < neededSize));
+ if (workspaceSize < neededSize) return NULL;
+
+ if (ZSTD_isError( ZSTD_initCDict_internal(cdict,
+ dict, dictSize,
+ dictLoadMethod, dictContentType,
+ cParams) ))
+ return NULL;
+
+ return cdict;
+}
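+
+/* Illustrative static-CDict setup sketch (assumes a large-enough, 8-byte
+ * aligned `wksp` buffer; names and sizes are placeholders, error checks
+ * omitted) :
+ *
+ *   size_t const wkspSize = ZSTD_estimateCDictSize(dictLen, compressionLevel);
+ *   const ZSTD_CDict* const cdict = ZSTD_initStaticCDict(wksp, wkspSize,
+ *                         dictBuf, dictLen, ZSTD_dlm_byCopy, ZSTD_dct_auto,
+ *                         ZSTD_getCParams(compressionLevel, 0, dictLen));
+ *   // no free needed : the caller owns `wksp`
+ */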
+
+ZSTD_compressionParameters ZSTD_getCParamsFromCDict(const ZSTD_CDict* cdict)
+{
+ assert(cdict != NULL);
+ return cdict->matchState.cParams;
+}
+
+/* ZSTD_compressBegin_usingCDict_advanced() :
+ * cdict must be != NULL */
+size_t ZSTD_compressBegin_usingCDict_advanced(
+ ZSTD_CCtx* const cctx, const ZSTD_CDict* const cdict,
+ ZSTD_frameParameters const fParams, unsigned long long const pledgedSrcSize)
+{
+ DEBUGLOG(4, "ZSTD_compressBegin_usingCDict_advanced");
+ RETURN_ERROR_IF(cdict==NULL, dictionary_wrong, "NULL pointer!");
+ { ZSTD_CCtx_params params = cctx->requestedParams;
+ params.cParams = ( pledgedSrcSize < ZSTD_USE_CDICT_PARAMS_SRCSIZE_CUTOFF
+ || pledgedSrcSize < cdict->dictContentSize * ZSTD_USE_CDICT_PARAMS_DICTSIZE_MULTIPLIER
+ || pledgedSrcSize == ZSTD_CONTENTSIZE_UNKNOWN
+ || cdict->compressionLevel == 0 )
+ && (params.attachDictPref != ZSTD_dictForceLoad) ?
+ ZSTD_getCParamsFromCDict(cdict)
+ : ZSTD_getCParams(cdict->compressionLevel,
+ pledgedSrcSize,
+ cdict->dictContentSize);
+ /* Increase window log to fit the entire dictionary and source if the
+ * source size is known. Limit the increase to 19, which is the
+ * window log for compression level 1 with the largest source size.
+ */
+ if (pledgedSrcSize != ZSTD_CONTENTSIZE_UNKNOWN) {
+ U32 const limitedSrcSize = (U32)MIN(pledgedSrcSize, 1U << 19);
+ U32 const limitedSrcLog = limitedSrcSize > 1 ? ZSTD_highbit32(limitedSrcSize - 1) + 1 : 1;
+ params.cParams.windowLog = MAX(params.cParams.windowLog, limitedSrcLog);
+ }
+ params.fParams = fParams;
+ return ZSTD_compressBegin_internal(cctx,
+ NULL, 0, ZSTD_dct_auto, ZSTD_dtlm_fast,
+ cdict,
+ &params, pledgedSrcSize,
+ ZSTDb_not_buffered);
+ }
+}
+
+/* ZSTD_compressBegin_usingCDict() :
+ * pledgedSrcSize=0 means "unknown"
+ * if pledgedSrcSize>0, it will enable contentSizeFlag */
+size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict)
+{
+ ZSTD_frameParameters const fParams = { 0 /*content*/, 0 /*checksum*/, 0 /*noDictID*/ };
+ DEBUGLOG(4, "ZSTD_compressBegin_usingCDict : dictIDFlag == %u", !fParams.noDictIDFlag);
+ return ZSTD_compressBegin_usingCDict_advanced(cctx, cdict, fParams, ZSTD_CONTENTSIZE_UNKNOWN);
+}
+
+size_t ZSTD_compress_usingCDict_advanced(ZSTD_CCtx* cctx,
+ void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize,
+ const ZSTD_CDict* cdict, ZSTD_frameParameters fParams)
+{
+ FORWARD_IF_ERROR(ZSTD_compressBegin_usingCDict_advanced(cctx, cdict, fParams, srcSize), ""); /* will check if cdict != NULL */
+ return ZSTD_compressEnd(cctx, dst, dstCapacity, src, srcSize);
+}
+
+/*! ZSTD_compress_usingCDict() :
+ * Compression using a digested Dictionary.
+ * Faster startup than ZSTD_compress_usingDict(), recommended when the same dictionary is used multiple times.
+ * Note that compression parameters are decided at CDict creation time
+ * while frame parameters are hardcoded */
+size_t ZSTD_compress_usingCDict(ZSTD_CCtx* cctx,
+ void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize,
+ const ZSTD_CDict* cdict)
+{
+ ZSTD_frameParameters const fParams = { 1 /*content*/, 0 /*checksum*/, 0 /*noDictID*/ };
+ return ZSTD_compress_usingCDict_advanced(cctx, dst, dstCapacity, src, srcSize, cdict, fParams);
+}
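+
+/* Minimal CDict compression sketch using the functions above (error handling
+ * omitted; `dictBuf`/`dictLen` are placeholder names) :
+ *
+ *   ZSTD_CDict* const cdict = ZSTD_createCDict(dictBuf, dictLen, compressionLevel);
+ *   size_t const cSize = ZSTD_compress_usingCDict(cctx, dst, dstCap, src, srcSize, cdict);
+ *   ZSTD_freeCDict(cdict);
+ */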
+
+
+
+/* ******************************************************************
+* Streaming
+********************************************************************/
+
+ZSTD_CStream* ZSTD_createCStream(void)
+{
+ DEBUGLOG(3, "ZSTD_createCStream");
+ return ZSTD_createCStream_advanced(ZSTD_defaultCMem);
+}
+
+ZSTD_CStream* ZSTD_initStaticCStream(void *workspace, size_t workspaceSize)
+{
+ return ZSTD_initStaticCCtx(workspace, workspaceSize);
+}
+
+ZSTD_CStream* ZSTD_createCStream_advanced(ZSTD_customMem customMem)
+{ /* CStream and CCtx are now the same object */
+ return ZSTD_createCCtx_advanced(customMem);
+}
+
+size_t ZSTD_freeCStream(ZSTD_CStream* zcs)
+{
+ return ZSTD_freeCCtx(zcs); /* same object */
+}
+
+
+
+/*====== Initialization ======*/
+
+size_t ZSTD_CStreamInSize(void) { return ZSTD_BLOCKSIZE_MAX; }
+
+size_t ZSTD_CStreamOutSize(void)
+{
+ return ZSTD_compressBound(ZSTD_BLOCKSIZE_MAX) + ZSTD_blockHeaderSize + 4 /* 32-bits hash */ ;
+}
+
+static size_t ZSTD_resetCStream_internal(ZSTD_CStream* cctx,
+ const void* const dict, size_t const dictSize, ZSTD_dictContentType_e const dictContentType,
+ const ZSTD_CDict* const cdict,
+ ZSTD_CCtx_params params, unsigned long long const pledgedSrcSize)
+{
+ DEBUGLOG(4, "ZSTD_resetCStream_internal");
+ /* Finalize the compression parameters */
+ params.cParams = ZSTD_getCParamsFromCCtxParams(&params, pledgedSrcSize, dictSize);
+ /* params are supposed to be fully validated at this point */
+ assert(!ZSTD_isError(ZSTD_checkCParams(params.cParams)));
+ assert(!((dict) && (cdict))); /* either dict or cdict, not both */
+
+ FORWARD_IF_ERROR( ZSTD_compressBegin_internal(cctx,
+ dict, dictSize, dictContentType, ZSTD_dtlm_fast,
+ cdict,
+ &params, pledgedSrcSize,
+ ZSTDb_buffered) , "");
+
+ cctx->inToCompress = 0;
+ cctx->inBuffPos = 0;
+ cctx->inBuffTarget = cctx->blockSize
+ + (cctx->blockSize == pledgedSrcSize); /* for small input: avoid automatic flush on reaching end of block, since it would require adding a 3-byte null block to end the frame */
+ cctx->outBuffContentSize = cctx->outBuffFlushedSize = 0;
+ cctx->streamStage = zcss_load;
+ cctx->frameEnded = 0;
+ return 0; /* ready to go */
+}
+
+/* ZSTD_resetCStream():
+ * pledgedSrcSize == 0 means "unknown" */
+size_t ZSTD_resetCStream(ZSTD_CStream* zcs, unsigned long long pss)
+{
+ /* temporary : 0 interpreted as "unknown" during transition period.
+ * Users wishing to specify "unknown" **must** use ZSTD_CONTENTSIZE_UNKNOWN.
+ * 0 will be interpreted as "empty" in the future.
+ */
+ U64 const pledgedSrcSize = (pss==0) ? ZSTD_CONTENTSIZE_UNKNOWN : pss;
+ DEBUGLOG(4, "ZSTD_resetCStream: pledgedSrcSize = %u", (unsigned)pledgedSrcSize);
+ FORWARD_IF_ERROR( ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only) , "");
+ FORWARD_IF_ERROR( ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize) , "");
+ return 0;
+}
+
+/*! ZSTD_initCStream_internal() :
+ * Note : for lib/compress only. Used by zstdmt_compress.c.
+ * Assumption 1 : params are valid
+ * Assumption 2 : either dict, or cdict, is defined, not both */
+size_t ZSTD_initCStream_internal(ZSTD_CStream* zcs,
+ const void* dict, size_t dictSize, const ZSTD_CDict* cdict,
+ const ZSTD_CCtx_params* params,
+ unsigned long long pledgedSrcSize)
+{
+ DEBUGLOG(4, "ZSTD_initCStream_internal");
+ FORWARD_IF_ERROR( ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only) , "");
+ FORWARD_IF_ERROR( ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize) , "");
+ assert(!ZSTD_isError(ZSTD_checkCParams(params->cParams)));
+ zcs->requestedParams = *params;
+ assert(!((dict) && (cdict))); /* either dict or cdict, not both */
+ if (dict) {
+ FORWARD_IF_ERROR( ZSTD_CCtx_loadDictionary(zcs, dict, dictSize) , "");
+ } else {
+ /* Dictionary is cleared if !cdict */
+ FORWARD_IF_ERROR( ZSTD_CCtx_refCDict(zcs, cdict) , "");
+ }
+ return 0;
+}
+
+/* ZSTD_initCStream_usingCDict_advanced() :
+ * same as ZSTD_initCStream_usingCDict(), with control over frame parameters */
+size_t ZSTD_initCStream_usingCDict_advanced(ZSTD_CStream* zcs,
+ const ZSTD_CDict* cdict,
+ ZSTD_frameParameters fParams,
+ unsigned long long pledgedSrcSize)
+{
+ DEBUGLOG(4, "ZSTD_initCStream_usingCDict_advanced");
+ FORWARD_IF_ERROR( ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only) , "");
+ FORWARD_IF_ERROR( ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize) , "");
+ zcs->requestedParams.fParams = fParams;
+ FORWARD_IF_ERROR( ZSTD_CCtx_refCDict(zcs, cdict) , "");
+ return 0;
+}
+
+/* note : cdict must outlive compression session */
+size_t ZSTD_initCStream_usingCDict(ZSTD_CStream* zcs, const ZSTD_CDict* cdict)
+{
+ DEBUGLOG(4, "ZSTD_initCStream_usingCDict");
+ FORWARD_IF_ERROR( ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only) , "");
+ FORWARD_IF_ERROR( ZSTD_CCtx_refCDict(zcs, cdict) , "");
+ return 0;
+}
+
+
+/* ZSTD_initCStream_advanced() :
+ * pledgedSrcSize must be exact.
+ * if srcSize is not known at init time, use value ZSTD_CONTENTSIZE_UNKNOWN.
+ * dict is loaded with default parameters ZSTD_dct_auto and ZSTD_dlm_byCopy. */
+size_t ZSTD_initCStream_advanced(ZSTD_CStream* zcs,
+ const void* dict, size_t dictSize,
+ ZSTD_parameters params, unsigned long long pss)
+{
+ /* for compatibility with older programs relying on this behavior.
+ * Users should now specify ZSTD_CONTENTSIZE_UNKNOWN.
+ * This line will be removed in the future.
+ */
+ U64 const pledgedSrcSize = (pss==0 && params.fParams.contentSizeFlag==0) ? ZSTD_CONTENTSIZE_UNKNOWN : pss;
+ DEBUGLOG(4, "ZSTD_initCStream_advanced");
+ FORWARD_IF_ERROR( ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only) , "");
+ FORWARD_IF_ERROR( ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize) , "");
+ FORWARD_IF_ERROR( ZSTD_checkCParams(params.cParams) , "");
+ zcs->requestedParams = ZSTD_assignParamsToCCtxParams(&zcs->requestedParams, &params);
+ FORWARD_IF_ERROR( ZSTD_CCtx_loadDictionary(zcs, dict, dictSize) , "");
+ return 0;
+}
+
+size_t ZSTD_initCStream_usingDict(ZSTD_CStream* zcs, const void* dict, size_t dictSize, int compressionLevel)
+{
+ DEBUGLOG(4, "ZSTD_initCStream_usingDict");
+ FORWARD_IF_ERROR( ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only) , "");
+ FORWARD_IF_ERROR( ZSTD_CCtx_setParameter(zcs, ZSTD_c_compressionLevel, compressionLevel) , "");
+ FORWARD_IF_ERROR( ZSTD_CCtx_loadDictionary(zcs, dict, dictSize) , "");
+ return 0;
+}
+
+size_t ZSTD_initCStream_srcSize(ZSTD_CStream* zcs, int compressionLevel, unsigned long long pss)
+{
+ /* temporary : 0 interpreted as "unknown" during transition period.
+ * Users wishing to specify "unknown" **must** use ZSTD_CONTENTSIZE_UNKNOWN.
+ * 0 will be interpreted as "empty" in the future.
+ */
+ U64 const pledgedSrcSize = (pss==0) ? ZSTD_CONTENTSIZE_UNKNOWN : pss;
+ DEBUGLOG(4, "ZSTD_initCStream_srcSize");
+ FORWARD_IF_ERROR( ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only) , "");
+ FORWARD_IF_ERROR( ZSTD_CCtx_refCDict(zcs, NULL) , "");
+ FORWARD_IF_ERROR( ZSTD_CCtx_setParameter(zcs, ZSTD_c_compressionLevel, compressionLevel) , "");
+ FORWARD_IF_ERROR( ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize) , "");
+ return 0;
+}
+
+size_t ZSTD_initCStream(ZSTD_CStream* zcs, int compressionLevel)
+{
+ DEBUGLOG(4, "ZSTD_initCStream");
+ FORWARD_IF_ERROR( ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only) , "");
+ FORWARD_IF_ERROR( ZSTD_CCtx_refCDict(zcs, NULL) , "");
+ FORWARD_IF_ERROR( ZSTD_CCtx_setParameter(zcs, ZSTD_c_compressionLevel, compressionLevel) , "");
+ return 0;
+}
+
+/*====== Compression ======*/
+
+static size_t ZSTD_nextInputSizeHint(const ZSTD_CCtx* cctx)
+{
+ size_t hintInSize = cctx->inBuffTarget - cctx->inBuffPos;
+ if (hintInSize==0) hintInSize = cctx->blockSize;
+ return hintInSize;
+}
+
+/** ZSTD_compressStream_generic():
+ * internal function for all *compressStream*() variants
+ * declared static : only reachable through the *compressStream*() entry points in this file
+ * @return : hint size for next input */
+static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs,
+ ZSTD_outBuffer* output,
+ ZSTD_inBuffer* input,
+ ZSTD_EndDirective const flushMode)
+{
+ const char* const istart = (const char*)input->src;
+ const char* const iend = input->size != 0 ? istart + input->size : istart;
+ const char* ip = input->pos != 0 ? istart + input->pos : istart;
+ char* const ostart = (char*)output->dst;
+ char* const oend = output->size != 0 ? ostart + output->size : ostart;
+ char* op = output->pos != 0 ? ostart + output->pos : ostart;
+ U32 someMoreWork = 1;
+
+ /* check expectations */
+ DEBUGLOG(5, "ZSTD_compressStream_generic, flush=%u", (unsigned)flushMode);
+ assert(zcs->inBuff != NULL);
+ assert(zcs->inBuffSize > 0);
+ assert(zcs->outBuff != NULL);
+ assert(zcs->outBuffSize > 0);
+ assert(output->pos <= output->size);
+ assert(input->pos <= input->size);
+
+ while (someMoreWork) {
+ switch(zcs->streamStage)
+ {
+ case zcss_init:
+ RETURN_ERROR(init_missing, "call ZSTD_initCStream() first!");
+
+ case zcss_load:
+ if ( (flushMode == ZSTD_e_end)
+ && ((size_t)(oend-op) >= ZSTD_compressBound(iend-ip)) /* enough dstCapacity */
+ && (zcs->inBuffPos == 0) ) {
+ /* shortcut to compression pass directly into output buffer */
+ size_t const cSize = ZSTD_compressEnd(zcs,
+ op, oend-op, ip, iend-ip);
+ DEBUGLOG(4, "ZSTD_compressEnd : cSize=%u", (unsigned)cSize);
+ FORWARD_IF_ERROR(cSize, "ZSTD_compressEnd failed");
+ ip = iend;
+ op += cSize;
+ zcs->frameEnded = 1;
+ ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only);
+ someMoreWork = 0; break;
+ }
+ /* complete loading into inBuffer */
+ { size_t const toLoad = zcs->inBuffTarget - zcs->inBuffPos;
+ size_t const loaded = ZSTD_limitCopy(
+ zcs->inBuff + zcs->inBuffPos, toLoad,
+ ip, iend-ip);
+ zcs->inBuffPos += loaded;
+ if (loaded != 0)
+ ip += loaded;
+ if ( (flushMode == ZSTD_e_continue)
+ && (zcs->inBuffPos < zcs->inBuffTarget) ) {
+ /* not enough input to fill full block : stop here */
+ someMoreWork = 0; break;
+ }
+ if ( (flushMode == ZSTD_e_flush)
+ && (zcs->inBuffPos == zcs->inToCompress) ) {
+ /* empty */
+ someMoreWork = 0; break;
+ }
+ }
+ /* compress current block (note : this stage cannot be stopped in the middle) */
+ DEBUGLOG(5, "stream compression stage (flushMode==%u)", flushMode);
+ { void* cDst;
+ size_t cSize;
+ size_t const iSize = zcs->inBuffPos - zcs->inToCompress;
+ size_t oSize = oend-op;
+ unsigned const lastBlock = (flushMode == ZSTD_e_end) && (ip==iend);
+ if (oSize >= ZSTD_compressBound(iSize))
+ cDst = op; /* compress into output buffer, to skip flush stage */
+ else
+ cDst = zcs->outBuff, oSize = zcs->outBuffSize;
+ cSize = lastBlock ?
+ ZSTD_compressEnd(zcs, cDst, oSize,
+ zcs->inBuff + zcs->inToCompress, iSize) :
+ ZSTD_compressContinue(zcs, cDst, oSize,
+ zcs->inBuff + zcs->inToCompress, iSize);
+ FORWARD_IF_ERROR(cSize, "%s", lastBlock ? "ZSTD_compressEnd failed" : "ZSTD_compressContinue failed");
+ zcs->frameEnded = lastBlock;
+ /* prepare next block */
+ zcs->inBuffTarget = zcs->inBuffPos + zcs->blockSize;
+ if (zcs->inBuffTarget > zcs->inBuffSize)
+ zcs->inBuffPos = 0, zcs->inBuffTarget = zcs->blockSize;
+ DEBUGLOG(5, "inBuffTarget:%u / inBuffSize:%u",
+ (unsigned)zcs->inBuffTarget, (unsigned)zcs->inBuffSize);
+ if (!lastBlock)
+ assert(zcs->inBuffTarget <= zcs->inBuffSize);
+ zcs->inToCompress = zcs->inBuffPos;
+ if (cDst == op) { /* no need to flush */
+ op += cSize;
+ if (zcs->frameEnded) {
+ DEBUGLOG(5, "Frame completed directly in outBuffer");
+ someMoreWork = 0;
+ ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only);
+ }
+ break;
+ }
+ zcs->outBuffContentSize = cSize;
+ zcs->outBuffFlushedSize = 0;
+ zcs->streamStage = zcss_flush; /* pass-through to flush stage */
+ }
+ /* fall-through */
+ case zcss_flush:
+ DEBUGLOG(5, "flush stage");
+ { size_t const toFlush = zcs->outBuffContentSize - zcs->outBuffFlushedSize;
+ size_t const flushed = ZSTD_limitCopy(op, (size_t)(oend-op),
+ zcs->outBuff + zcs->outBuffFlushedSize, toFlush);
+ DEBUGLOG(5, "toFlush: %u into %u ==> flushed: %u",
+ (unsigned)toFlush, (unsigned)(oend-op), (unsigned)flushed);
+ if (flushed)
+ op += flushed;
+ zcs->outBuffFlushedSize += flushed;
+ if (toFlush!=flushed) {
+ /* flush not fully completed, presumably because dst is too small */
+ assert(op==oend);
+ someMoreWork = 0;
+ break;
+ }
+ zcs->outBuffContentSize = zcs->outBuffFlushedSize = 0;
+ if (zcs->frameEnded) {
+ DEBUGLOG(5, "Frame completed on flush");
+ someMoreWork = 0;
+ ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only);
+ break;
+ }
+ zcs->streamStage = zcss_load;
+ break;
+ }
+
+ default: /* impossible */
+ assert(0);
+ }
+ }
+
+ input->pos = ip - istart;
+ output->pos = op - ostart;
+ if (zcs->frameEnded) return 0;
+ return ZSTD_nextInputSizeHint(zcs);
+}
+
+static size_t ZSTD_nextInputSizeHint_MTorST(const ZSTD_CCtx* cctx)
+{
+#ifdef ZSTD_MULTITHREAD
+ if (cctx->appliedParams.nbWorkers >= 1) {
+ assert(cctx->mtctx != NULL);
+ return ZSTDMT_nextInputSizeHint(cctx->mtctx);
+ }
+#endif
+ return ZSTD_nextInputSizeHint(cctx);
+
+}
+
+size_t ZSTD_compressStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output, ZSTD_inBuffer* input)
+{
+ FORWARD_IF_ERROR( ZSTD_compressStream2(zcs, output, input, ZSTD_e_continue) , "");
+ return ZSTD_nextInputSizeHint_MTorST(zcs);
+}
+
+
+size_t ZSTD_compressStream2( ZSTD_CCtx* cctx,
+ ZSTD_outBuffer* output,
+ ZSTD_inBuffer* input,
+ ZSTD_EndDirective endOp)
+{
+ DEBUGLOG(5, "ZSTD_compressStream2, endOp=%u ", (unsigned)endOp);
+ /* check conditions */
+ RETURN_ERROR_IF(output->pos > output->size, GENERIC, "invalid buffer");
+ RETURN_ERROR_IF(input->pos > input->size, GENERIC, "invalid buffer");
+ assert(cctx!=NULL);
+
+ /* transparent initialization stage */
+ if (cctx->streamStage == zcss_init) {
+ ZSTD_CCtx_params params = cctx->requestedParams;
+ ZSTD_prefixDict const prefixDict = cctx->prefixDict;
+ FORWARD_IF_ERROR( ZSTD_initLocalDict(cctx) , ""); /* Init the local dict if present. */
+ memset(&cctx->prefixDict, 0, sizeof(cctx->prefixDict)); /* single usage */
+ assert(prefixDict.dict==NULL || cctx->cdict==NULL); /* only one can be set */
+ DEBUGLOG(4, "ZSTD_compressStream2 : transparent init stage");
+ if (endOp == ZSTD_e_end) cctx->pledgedSrcSizePlusOne = input->size + 1; /* auto-fix pledgedSrcSize */
+ params.cParams = ZSTD_getCParamsFromCCtxParams(
+ &cctx->requestedParams, cctx->pledgedSrcSizePlusOne-1, 0 /*dictSize*/);
+
+
+#ifdef ZSTD_MULTITHREAD
+ if ((cctx->pledgedSrcSizePlusOne-1) <= ZSTDMT_JOBSIZE_MIN) {
+ params.nbWorkers = 0; /* do not invoke multi-threading when src size is too small */
+ }
+ if (params.nbWorkers > 0) {
+ /* mt context creation */
+ if (cctx->mtctx == NULL) {
+ DEBUGLOG(4, "ZSTD_compressStream2: creating new mtctx for nbWorkers=%u",
+ params.nbWorkers);
+ cctx->mtctx = ZSTDMT_createCCtx_advanced((U32)params.nbWorkers, cctx->customMem);
+ RETURN_ERROR_IF(cctx->mtctx == NULL, memory_allocation, "NULL pointer!");
+ }
+ /* mt compression */
+ DEBUGLOG(4, "call ZSTDMT_initCStream_internal as nbWorkers=%u", params.nbWorkers);
+ FORWARD_IF_ERROR( ZSTDMT_initCStream_internal(
+ cctx->mtctx,
+ prefixDict.dict, prefixDict.dictSize, prefixDict.dictContentType,
+ cctx->cdict, params, cctx->pledgedSrcSizePlusOne-1) , "");
+ cctx->streamStage = zcss_load;
+ cctx->appliedParams.nbWorkers = params.nbWorkers;
+ } else
+#endif
+ { FORWARD_IF_ERROR( ZSTD_resetCStream_internal(cctx,
+ prefixDict.dict, prefixDict.dictSize, prefixDict.dictContentType,
+ cctx->cdict,
+ params, cctx->pledgedSrcSizePlusOne-1) , "");
+ assert(cctx->streamStage == zcss_load);
+ assert(cctx->appliedParams.nbWorkers == 0);
+ } }
+ /* end of transparent initialization stage */
+
+ /* compression stage */
+#ifdef ZSTD_MULTITHREAD
+ if (cctx->appliedParams.nbWorkers > 0) {
+ int const forceMaxProgress = (endOp == ZSTD_e_flush || endOp == ZSTD_e_end);
+ size_t flushMin;
+ assert(forceMaxProgress || endOp == ZSTD_e_continue /* Protection for a new flush type */);
+ if (cctx->cParamsChanged) {
+ ZSTDMT_updateCParams_whileCompressing(cctx->mtctx, &cctx->requestedParams);
+ cctx->cParamsChanged = 0;
+ }
+ do {
+ flushMin = ZSTDMT_compressStream_generic(cctx->mtctx, output, input, endOp);
+ if ( ZSTD_isError(flushMin)
+ || (endOp == ZSTD_e_end && flushMin == 0) ) { /* compression completed */
+ ZSTD_CCtx_reset(cctx, ZSTD_reset_session_only);
+ }
+ FORWARD_IF_ERROR(flushMin, "ZSTDMT_compressStream_generic failed");
+ } while (forceMaxProgress && flushMin != 0 && output->pos < output->size);
+ DEBUGLOG(5, "completed ZSTD_compressStream2 delegating to ZSTDMT_compressStream_generic");
+ /* Either we don't require maximum forward progress, we've finished the
+ * flush, or we are out of output space.
+ */
+ assert(!forceMaxProgress || flushMin == 0 || output->pos == output->size);
+ return flushMin;
+ }
+#endif
+ FORWARD_IF_ERROR( ZSTD_compressStream_generic(cctx, output, input, endOp) , "");
+ DEBUGLOG(5, "completed ZSTD_compressStream2");
+ return cctx->outBuffContentSize - cctx->outBuffFlushedSize; /* remaining to flush */
+}
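+
+/* Illustrative sketch (not part of upstream zstd; kept under #if 0) of the
+ * caller-side contract implemented by ZSTD_compressStream2() above: feed each
+ * chunk with ZSTD_e_continue, finish with ZSTD_e_end, and keep calling while
+ * the return value (bytes still buffered internally) is non-zero.
+ * The writeFn callback and all example_* names are hypothetical. */
+#if 0
+typedef size_t (*example_write_fn)(const void* buf, size_t len, void* opaque);
+
+static size_t example_stream_chunk(ZSTD_CCtx* cctx,
+                                   const void* chunk, size_t chunkSize, int isLastChunk,
+                                   void* outBuf, size_t outCapacity,
+                                   example_write_fn writeFn, void* opaque)
+{
+    ZSTD_EndDirective const mode = isLastChunk ? ZSTD_e_end : ZSTD_e_continue;
+    ZSTD_inBuffer in = { chunk, chunkSize, 0 };
+    int finished;
+    do {
+        ZSTD_outBuffer out = { outBuf, outCapacity, 0 };
+        size_t const remaining = ZSTD_compressStream2(cctx, &out, &in, mode);
+        if (ZSTD_isError(remaining)) return remaining;
+        if (out.pos) writeFn(outBuf, out.pos, opaque);
+        /* done when the chunk is consumed and, on the last chunk, the frame is fully flushed */
+        finished = isLastChunk ? (remaining == 0) : (in.pos == in.size);
+    } while (!finished);
+    return 0;
+}
+#endif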
+
+size_t ZSTD_compressStream2_simpleArgs (
+ ZSTD_CCtx* cctx,
+ void* dst, size_t dstCapacity, size_t* dstPos,
+ const void* src, size_t srcSize, size_t* srcPos,
+ ZSTD_EndDirective endOp)
+{
+ ZSTD_outBuffer output = { dst, dstCapacity, *dstPos };
+ ZSTD_inBuffer input = { src, srcSize, *srcPos };
+ /* ZSTD_compressStream2() will check validity of dstPos and srcPos */
+ size_t const cErr = ZSTD_compressStream2(cctx, &output, &input, endOp);
+ *dstPos = output.pos;
+ *srcPos = input.pos;
+ return cErr;
+}
+
+size_t ZSTD_compress2(ZSTD_CCtx* cctx,
+ void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize)
+{
+ DEBUGLOG(4, "ZSTD_compress2 (srcSize=%u)", (unsigned)srcSize);
+ ZSTD_CCtx_reset(cctx, ZSTD_reset_session_only);
+ { size_t oPos = 0;
+ size_t iPos = 0;
+ size_t const result = ZSTD_compressStream2_simpleArgs(cctx,
+ dst, dstCapacity, &oPos,
+ src, srcSize, &iPos,
+ ZSTD_e_end);
+ FORWARD_IF_ERROR(result, "ZSTD_compressStream2_simpleArgs failed");
+ if (result != 0) { /* compression not completed, due to lack of output space */
+ assert(oPos == dstCapacity);
+ RETURN_ERROR(dstSize_tooSmall, "");
+ }
+ assert(iPos == srcSize); /* all input is expected to be consumed */
+ return oPos;
+ }
+}
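+
+/* Illustrative sketch (not part of upstream zstd; kept under #if 0): since
+ * ZSTD_compress2() above reports dstSize_tooSmall when the frame does not fit,
+ * sizing dst with ZSTD_compressBound(srcSize) guarantees the single call never
+ * fails for lack of output space. The example_* name is hypothetical. */
+#if 0
+/* dst must provide at least ZSTD_compressBound(srcSize) bytes */
+static size_t example_compress2_bound(ZSTD_CCtx* cctx,
+                                      void* dst, const void* src, size_t srcSize, int level)
+{
+    size_t const dstCapacity = ZSTD_compressBound(srcSize);   /* worst-case compressed size */
+    ZSTD_CCtx_setParameter(cctx, ZSTD_c_compressionLevel, level);
+    /* with a bound-sized dst, failures can only come from parameters or allocation */
+    return ZSTD_compress2(cctx, dst, dstCapacity, src, srcSize);
+}
+#endif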
+
+/*====== Finalize ======*/
+
+/*! ZSTD_flushStream() :
+ * @return : amount of data remaining to flush */
+size_t ZSTD_flushStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output)
+{
+ ZSTD_inBuffer input = { NULL, 0, 0 };
+ return ZSTD_compressStream2(zcs, output, &input, ZSTD_e_flush);
+}
+
+
+size_t ZSTD_endStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output)
+{
+ ZSTD_inBuffer input = { NULL, 0, 0 };
+ size_t const remainingToFlush = ZSTD_compressStream2(zcs, output, &input, ZSTD_e_end);
+ FORWARD_IF_ERROR( remainingToFlush , "ZSTD_compressStream2 failed");
+ if (zcs->appliedParams.nbWorkers > 0) return remainingToFlush; /* minimal estimation */
+ /* single thread mode : attempt to calculate remaining to flush more precisely */
+ { size_t const lastBlockSize = zcs->frameEnded ? 0 : ZSTD_BLOCKHEADERSIZE;
+ size_t const checksumSize = (size_t)(zcs->frameEnded ? 0 : zcs->appliedParams.fParams.checksumFlag * 4);
+ size_t const toFlush = remainingToFlush + lastBlockSize + checksumSize;
+ DEBUGLOG(4, "ZSTD_endStream : remaining to flush : %u", (unsigned)toFlush);
+ return toFlush;
+ }
+}
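+
+/* Illustrative sketch (not part of upstream zstd; kept under #if 0): both
+ * ZSTD_flushStream() and ZSTD_endStream() above return the number of bytes
+ * still held in internal buffers, so callers drain in a loop, providing fresh
+ * output space each round, until the return value reaches 0. */
+#if 0
+static size_t example_finish_frame(ZSTD_CStream* zcs, void* outBuf, size_t outCapacity)
+{
+    size_t totalFlushed = 0;
+    size_t remaining;
+    do {
+        ZSTD_outBuffer out = { outBuf, outCapacity, 0 };
+        remaining = ZSTD_endStream(zcs, &out);   /* 0 => frame epilogue fully written */
+        if (ZSTD_isError(remaining)) return remaining;
+        /* out.pos bytes are now ready in outBuf; a real caller would write them out here */
+        totalFlushed += out.pos;
+    } while (remaining != 0);
+    return totalFlushed;
+}
+#endif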
+
+
+/*-===== Pre-defined compression levels =====-*/
+
+#define ZSTD_MAX_CLEVEL 22
+int ZSTD_maxCLevel(void) { return ZSTD_MAX_CLEVEL; }
+int ZSTD_minCLevel(void) { return (int)-ZSTD_TARGETLENGTH_MAX; }
+
+static const ZSTD_compressionParameters ZSTD_defaultCParameters[4][ZSTD_MAX_CLEVEL+1] = {
+{ /* "default" - for any srcSize > 256 KB */
+ /* W, C, H, S, L, TL, strat */
+ { 19, 12, 13, 1, 6, 1, ZSTD_fast }, /* base for negative levels */
+ { 19, 13, 14, 1, 7, 0, ZSTD_fast }, /* level 1 */
+ { 20, 15, 16, 1, 6, 0, ZSTD_fast }, /* level 2 */
+ { 21, 16, 17, 1, 5, 0, ZSTD_dfast }, /* level 3 */
+ { 21, 18, 18, 1, 5, 0, ZSTD_dfast }, /* level 4 */
+ { 21, 18, 19, 2, 5, 2, ZSTD_greedy }, /* level 5 */
+ { 21, 19, 19, 3, 5, 4, ZSTD_greedy }, /* level 6 */
+ { 21, 19, 19, 3, 5, 8, ZSTD_lazy }, /* level 7 */
+ { 21, 19, 19, 3, 5, 16, ZSTD_lazy2 }, /* level 8 */
+ { 21, 19, 20, 4, 5, 16, ZSTD_lazy2 }, /* level 9 */
+ { 22, 20, 21, 4, 5, 16, ZSTD_lazy2 }, /* level 10 */
+ { 22, 21, 22, 4, 5, 16, ZSTD_lazy2 }, /* level 11 */
+ { 22, 21, 22, 5, 5, 16, ZSTD_lazy2 }, /* level 12 */
+ { 22, 21, 22, 5, 5, 32, ZSTD_btlazy2 }, /* level 13 */
+ { 22, 22, 23, 5, 5, 32, ZSTD_btlazy2 }, /* level 14 */
+ { 22, 23, 23, 6, 5, 32, ZSTD_btlazy2 }, /* level 15 */
+ { 22, 22, 22, 5, 5, 48, ZSTD_btopt }, /* level 16 */
+ { 23, 23, 22, 5, 4, 64, ZSTD_btopt }, /* level 17 */
+ { 23, 23, 22, 6, 3, 64, ZSTD_btultra }, /* level 18 */
+ { 23, 24, 22, 7, 3,256, ZSTD_btultra2}, /* level 19 */
+ { 25, 25, 23, 7, 3,256, ZSTD_btultra2}, /* level 20 */
+ { 26, 26, 24, 7, 3,512, ZSTD_btultra2}, /* level 21 */
+ { 27, 27, 25, 9, 3,999, ZSTD_btultra2}, /* level 22 */
+},
+{ /* for srcSize <= 256 KB */
+ /* W, C, H, S, L, TL, strat */
+ { 18, 12, 13, 1, 5, 1, ZSTD_fast }, /* base for negative levels */
+ { 18, 13, 14, 1, 6, 0, ZSTD_fast }, /* level 1 */
+ { 18, 14, 14, 1, 5, 0, ZSTD_dfast }, /* level 2 */
+ { 18, 16, 16, 1, 4, 0, ZSTD_dfast }, /* level 3 */
+ { 18, 16, 17, 2, 5, 2, ZSTD_greedy }, /* level 4.*/
+ { 18, 18, 18, 3, 5, 2, ZSTD_greedy }, /* level 5.*/
+ { 18, 18, 19, 3, 5, 4, ZSTD_lazy }, /* level 6.*/
+ { 18, 18, 19, 4, 4, 4, ZSTD_lazy }, /* level 7 */
+ { 18, 18, 19, 4, 4, 8, ZSTD_lazy2 }, /* level 8 */
+ { 18, 18, 19, 5, 4, 8, ZSTD_lazy2 }, /* level 9 */
+ { 18, 18, 19, 6, 4, 8, ZSTD_lazy2 }, /* level 10 */
+ { 18, 18, 19, 5, 4, 12, ZSTD_btlazy2 }, /* level 11.*/
+ { 18, 19, 19, 7, 4, 12, ZSTD_btlazy2 }, /* level 12.*/
+ { 18, 18, 19, 4, 4, 16, ZSTD_btopt }, /* level 13 */
+ { 18, 18, 19, 4, 3, 32, ZSTD_btopt }, /* level 14.*/
+ { 18, 18, 19, 6, 3,128, ZSTD_btopt }, /* level 15.*/
+ { 18, 19, 19, 6, 3,128, ZSTD_btultra }, /* level 16.*/
+ { 18, 19, 19, 8, 3,256, ZSTD_btultra }, /* level 17.*/
+ { 18, 19, 19, 6, 3,128, ZSTD_btultra2}, /* level 18.*/
+ { 18, 19, 19, 8, 3,256, ZSTD_btultra2}, /* level 19.*/
+ { 18, 19, 19, 10, 3,512, ZSTD_btultra2}, /* level 20.*/
+ { 18, 19, 19, 12, 3,512, ZSTD_btultra2}, /* level 21.*/
+ { 18, 19, 19, 13, 3,999, ZSTD_btultra2}, /* level 22.*/
+},
+{ /* for srcSize <= 128 KB */
+ /* W, C, H, S, L, TL, strat */
+ { 17, 12, 12, 1, 5, 1, ZSTD_fast }, /* base for negative levels */
+ { 17, 12, 13, 1, 6, 0, ZSTD_fast }, /* level 1 */
+ { 17, 13, 15, 1, 5, 0, ZSTD_fast }, /* level 2 */
+ { 17, 15, 16, 2, 5, 0, ZSTD_dfast }, /* level 3 */
+ { 17, 17, 17, 2, 4, 0, ZSTD_dfast }, /* level 4 */
+ { 17, 16, 17, 3, 4, 2, ZSTD_greedy }, /* level 5 */
+ { 17, 17, 17, 3, 4, 4, ZSTD_lazy }, /* level 6 */
+ { 17, 17, 17, 3, 4, 8, ZSTD_lazy2 }, /* level 7 */
+ { 17, 17, 17, 4, 4, 8, ZSTD_lazy2 }, /* level 8 */
+ { 17, 17, 17, 5, 4, 8, ZSTD_lazy2 }, /* level 9 */
+ { 17, 17, 17, 6, 4, 8, ZSTD_lazy2 }, /* level 10 */
+ { 17, 17, 17, 5, 4, 8, ZSTD_btlazy2 }, /* level 11 */
+ { 17, 18, 17, 7, 4, 12, ZSTD_btlazy2 }, /* level 12 */
+ { 17, 18, 17, 3, 4, 12, ZSTD_btopt }, /* level 13.*/
+ { 17, 18, 17, 4, 3, 32, ZSTD_btopt }, /* level 14.*/
+ { 17, 18, 17, 6, 3,256, ZSTD_btopt }, /* level 15.*/
+ { 17, 18, 17, 6, 3,128, ZSTD_btultra }, /* level 16.*/
+ { 17, 18, 17, 8, 3,256, ZSTD_btultra }, /* level 17.*/
+ { 17, 18, 17, 10, 3,512, ZSTD_btultra }, /* level 18.*/
+ { 17, 18, 17, 5, 3,256, ZSTD_btultra2}, /* level 19.*/
+ { 17, 18, 17, 7, 3,512, ZSTD_btultra2}, /* level 20.*/
+ { 17, 18, 17, 9, 3,512, ZSTD_btultra2}, /* level 21.*/
+ { 17, 18, 17, 11, 3,999, ZSTD_btultra2}, /* level 22.*/
+},
+{ /* for srcSize <= 16 KB */
+ /* W, C, H, S, L, TL, strat */
+ { 14, 12, 13, 1, 5, 1, ZSTD_fast }, /* base for negative levels */
+ { 14, 14, 15, 1, 5, 0, ZSTD_fast }, /* level 1 */
+ { 14, 14, 15, 1, 4, 0, ZSTD_fast }, /* level 2 */
+ { 14, 14, 15, 2, 4, 0, ZSTD_dfast }, /* level 3 */
+ { 14, 14, 14, 4, 4, 2, ZSTD_greedy }, /* level 4 */
+ { 14, 14, 14, 3, 4, 4, ZSTD_lazy }, /* level 5.*/
+ { 14, 14, 14, 4, 4, 8, ZSTD_lazy2 }, /* level 6 */
+ { 14, 14, 14, 6, 4, 8, ZSTD_lazy2 }, /* level 7 */
+ { 14, 14, 14, 8, 4, 8, ZSTD_lazy2 }, /* level 8.*/
+ { 14, 15, 14, 5, 4, 8, ZSTD_btlazy2 }, /* level 9.*/
+ { 14, 15, 14, 9, 4, 8, ZSTD_btlazy2 }, /* level 10.*/
+ { 14, 15, 14, 3, 4, 12, ZSTD_btopt }, /* level 11.*/
+ { 14, 15, 14, 4, 3, 24, ZSTD_btopt }, /* level 12.*/
+ { 14, 15, 14, 5, 3, 32, ZSTD_btultra }, /* level 13.*/
+ { 14, 15, 15, 6, 3, 64, ZSTD_btultra }, /* level 14.*/
+ { 14, 15, 15, 7, 3,256, ZSTD_btultra }, /* level 15.*/
+ { 14, 15, 15, 5, 3, 48, ZSTD_btultra2}, /* level 16.*/
+ { 14, 15, 15, 6, 3,128, ZSTD_btultra2}, /* level 17.*/
+ { 14, 15, 15, 7, 3,256, ZSTD_btultra2}, /* level 18.*/
+ { 14, 15, 15, 8, 3,256, ZSTD_btultra2}, /* level 19.*/
+ { 14, 15, 15, 8, 3,512, ZSTD_btultra2}, /* level 20.*/
+ { 14, 15, 15, 9, 3,512, ZSTD_btultra2}, /* level 21.*/
+ { 14, 15, 15, 10, 3,999, ZSTD_btultra2}, /* level 22.*/
+},
+};
+
+/*! ZSTD_getCParams_internal() :
+ * @return ZSTD_compressionParameters structure for a selected compression level, srcSize and dictSize.
+ * Note: srcSizeHint 0 means 0, use ZSTD_CONTENTSIZE_UNKNOWN for unknown.
+ * Use dictSize == 0 for unknown or unused. */
+static ZSTD_compressionParameters ZSTD_getCParams_internal(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize)
+{
+ int const unknown = srcSizeHint == ZSTD_CONTENTSIZE_UNKNOWN;
+ size_t const addedSize = unknown && dictSize > 0 ? 500 : 0;
+ U64 const rSize = unknown && dictSize == 0 ? ZSTD_CONTENTSIZE_UNKNOWN : srcSizeHint+dictSize+addedSize;
+ U32 const tableID = (rSize <= 256 KB) + (rSize <= 128 KB) + (rSize <= 16 KB);
+ int row = compressionLevel;
+ DEBUGLOG(5, "ZSTD_getCParams_internal (cLevel=%i)", compressionLevel);
+ if (compressionLevel == 0) row = ZSTD_CLEVEL_DEFAULT; /* 0 == default */
+ if (compressionLevel < 0) row = 0; /* entry 0 is baseline for fast mode */
+ if (compressionLevel > ZSTD_MAX_CLEVEL) row = ZSTD_MAX_CLEVEL;
+ { ZSTD_compressionParameters cp = ZSTD_defaultCParameters[tableID][row];
+ if (compressionLevel < 0) cp.targetLength = (unsigned)(-compressionLevel); /* acceleration factor */
+ /* refine parameters based on srcSize & dictSize */
+ return ZSTD_adjustCParams_internal(cp, srcSizeHint, dictSize);
+ }
+}
+
+/*! ZSTD_getCParams() :
+ * @return ZSTD_compressionParameters structure for a selected compression level, srcSize and dictSize.
+ * Size values are optional, provide 0 if not known or unused */
+ZSTD_compressionParameters ZSTD_getCParams(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize)
+{
+ if (srcSizeHint == 0) srcSizeHint = ZSTD_CONTENTSIZE_UNKNOWN;
+ return ZSTD_getCParams_internal(compressionLevel, srcSizeHint, dictSize);
+}
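+
+/* Illustrative sketch (not part of upstream zstd; kept under #if 0) of how the
+ * tables above are selected: the srcSize hint picks one of the four tables,
+ * the level picks the row, and negative levels reuse row 0 with targetLength
+ * carrying the acceleration factor. Exact parameter values may change between
+ * versions; the comments only state the expected trend. */
+#if 0
+static void example_inspect_cparams(void)
+{
+    /* same level, different srcSize hints => different tables, hence different windowLog */
+    ZSTD_compressionParameters const big   = ZSTD_getCParams(3, ZSTD_CONTENTSIZE_UNKNOWN, 0);
+    ZSTD_compressionParameters const small = ZSTD_getCParams(3, 16 * 1024, 0);
+    /* negative level : row 0 of the table, with targetLength set to the acceleration amount */
+    ZSTD_compressionParameters const fast  = ZSTD_getCParams(-5, ZSTD_CONTENTSIZE_UNKNOWN, 0);
+    (void)big; (void)small; (void)fast;
+    /* expected: big.windowLog >= small.windowLog, and fast.targetLength == 5 */
+}
+#endif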
+
+/*! ZSTD_getParams_internal() :
+ * same idea as ZSTD_getCParams_internal()
+ * @return a `ZSTD_parameters` structure (instead of `ZSTD_compressionParameters`).
+ * Fields of `ZSTD_frameParameters` are set to default values */
+static ZSTD_parameters ZSTD_getParams_internal(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize) {
+ ZSTD_parameters params;
+ ZSTD_compressionParameters const cParams = ZSTD_getCParams_internal(compressionLevel, srcSizeHint, dictSize);
+ DEBUGLOG(5, "ZSTD_getParams (cLevel=%i)", compressionLevel);
+ memset(&params, 0, sizeof(params));
+ params.cParams = cParams;
+ params.fParams.contentSizeFlag = 1;
+ return params;
+}
+
+/*! ZSTD_getParams() :
+ * same idea as ZSTD_getCParams()
+ * @return a `ZSTD_parameters` structure (instead of `ZSTD_compressionParameters`).
+ * Fields of `ZSTD_frameParameters` are set to default values */
+ZSTD_parameters ZSTD_getParams(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize) {
+ if (srcSizeHint == 0) srcSizeHint = ZSTD_CONTENTSIZE_UNKNOWN;
+ return ZSTD_getParams_internal(compressionLevel, srcSizeHint, dictSize);
+}
+/**** ended inlining compress/zstd_compress.c ****/
+/**** start inlining compress/zstd_double_fast.c ****/
+/*
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+/**** skipping file: zstd_compress_internal.h ****/
+/**** skipping file: zstd_double_fast.h ****/
+
+
+void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms,
+ void const* end, ZSTD_dictTableLoadMethod_e dtlm)
+{
+ const ZSTD_compressionParameters* const cParams = &ms->cParams;
+ U32* const hashLarge = ms->hashTable;
+ U32 const hBitsL = cParams->hashLog;
+ U32 const mls = cParams->minMatch;
+ U32* const hashSmall = ms->chainTable;
+ U32 const hBitsS = cParams->chainLog;
+ const BYTE* const base = ms->window.base;
+ const BYTE* ip = base + ms->nextToUpdate;
+ const BYTE* const iend = ((const BYTE*)end) - HASH_READ_SIZE;
+ const U32 fastHashFillStep = 3;
+
+ /* Always insert every fastHashFillStep-th position into both hash tables.
+ * Insert the intermediate positions into the large hash table only if
+ * their entry is empty.
+ */
+ for (; ip + fastHashFillStep - 1 <= iend; ip += fastHashFillStep) {
+ U32 const current = (U32)(ip - base);
+ U32 i;
+ for (i = 0; i < fastHashFillStep; ++i) {
+ size_t const smHash = ZSTD_hashPtr(ip + i, hBitsS, mls);
+ size_t const lgHash = ZSTD_hashPtr(ip + i, hBitsL, 8);
+ if (i == 0)
+ hashSmall[smHash] = current + i;
+ if (i == 0 || hashLarge[lgHash] == 0)
+ hashLarge[lgHash] = current + i;
+ /* Only load extra positions for ZSTD_dtlm_full */
+ if (dtlm == ZSTD_dtlm_fast)
+ break;
+ } }
+}
+
+
+FORCE_INLINE_TEMPLATE
+size_t ZSTD_compressBlock_doubleFast_generic(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize,
+ U32 const mls /* template */, ZSTD_dictMode_e const dictMode)
+{
+ ZSTD_compressionParameters const* cParams = &ms->cParams;
+ U32* const hashLong = ms->hashTable;
+ const U32 hBitsL = cParams->hashLog;
+ U32* const hashSmall = ms->chainTable;
+ const U32 hBitsS = cParams->chainLog;
+ const BYTE* const base = ms->window.base;
+ const BYTE* const istart = (const BYTE*)src;
+ const BYTE* ip = istart;
+ const BYTE* anchor = istart;
+ const U32 endIndex = (U32)((size_t)(istart - base) + srcSize);
+ /* presumes that, if there is a dictionary, it must be using Attach mode */
+ const U32 prefixLowestIndex = ZSTD_getLowestPrefixIndex(ms, endIndex, cParams->windowLog);
+ const BYTE* const prefixLowest = base + prefixLowestIndex;
+ const BYTE* const iend = istart + srcSize;
+ const BYTE* const ilimit = iend - HASH_READ_SIZE;
+ U32 offset_1=rep[0], offset_2=rep[1];
+ U32 offsetSaved = 0;
+
+ const ZSTD_matchState_t* const dms = ms->dictMatchState;
+ const ZSTD_compressionParameters* const dictCParams =
+ dictMode == ZSTD_dictMatchState ?
+ &dms->cParams : NULL;
+ const U32* const dictHashLong = dictMode == ZSTD_dictMatchState ?
+ dms->hashTable : NULL;
+ const U32* const dictHashSmall = dictMode == ZSTD_dictMatchState ?
+ dms->chainTable : NULL;
+ const U32 dictStartIndex = dictMode == ZSTD_dictMatchState ?
+ dms->window.dictLimit : 0;
+ const BYTE* const dictBase = dictMode == ZSTD_dictMatchState ?
+ dms->window.base : NULL;
+ const BYTE* const dictStart = dictMode == ZSTD_dictMatchState ?
+ dictBase + dictStartIndex : NULL;
+ const BYTE* const dictEnd = dictMode == ZSTD_dictMatchState ?
+ dms->window.nextSrc : NULL;
+ const U32 dictIndexDelta = dictMode == ZSTD_dictMatchState ?
+ prefixLowestIndex - (U32)(dictEnd - dictBase) :
+ 0;
+ const U32 dictHBitsL = dictMode == ZSTD_dictMatchState ?
+ dictCParams->hashLog : hBitsL;
+ const U32 dictHBitsS = dictMode == ZSTD_dictMatchState ?
+ dictCParams->chainLog : hBitsS;
+ const U32 dictAndPrefixLength = (U32)((ip - prefixLowest) + (dictEnd - dictStart));
+
+ DEBUGLOG(5, "ZSTD_compressBlock_doubleFast_generic");
+
+ assert(dictMode == ZSTD_noDict || dictMode == ZSTD_dictMatchState);
+
+ /* if a dictionary is attached, it must be within window range */
+ if (dictMode == ZSTD_dictMatchState) {
+ assert(ms->window.dictLimit + (1U << cParams->windowLog) >= endIndex);
+ }
+
+ /* init */
+ ip += (dictAndPrefixLength == 0);
+ if (dictMode == ZSTD_noDict) {
+ U32 const current = (U32)(ip - base);
+ U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, current, cParams->windowLog);
+ U32 const maxRep = current - windowLow;
+ if (offset_2 > maxRep) offsetSaved = offset_2, offset_2 = 0;
+ if (offset_1 > maxRep) offsetSaved = offset_1, offset_1 = 0;
+ }
+ if (dictMode == ZSTD_dictMatchState) {
+ /* dictMatchState repCode checks don't currently handle repCode == 0
+ * disabling. */
+ assert(offset_1 <= dictAndPrefixLength);
+ assert(offset_2 <= dictAndPrefixLength);
+ }
+
+ /* Main Search Loop */
+ while (ip < ilimit) { /* < instead of <=, because repcode check at (ip+1) */
+ size_t mLength;
+ U32 offset;
+ size_t const h2 = ZSTD_hashPtr(ip, hBitsL, 8);
+ size_t const h = ZSTD_hashPtr(ip, hBitsS, mls);
+ size_t const dictHL = ZSTD_hashPtr(ip, dictHBitsL, 8);
+ size_t const dictHS = ZSTD_hashPtr(ip, dictHBitsS, mls);
+ U32 const current = (U32)(ip-base);
+ U32 const matchIndexL = hashLong[h2];
+ U32 matchIndexS = hashSmall[h];
+ const BYTE* matchLong = base + matchIndexL;
+ const BYTE* match = base + matchIndexS;
+ const U32 repIndex = current + 1 - offset_1;
+ const BYTE* repMatch = (dictMode == ZSTD_dictMatchState
+ && repIndex < prefixLowestIndex) ?
+ dictBase + (repIndex - dictIndexDelta) :
+ base + repIndex;
+ hashLong[h2] = hashSmall[h] = current; /* update hash tables */
+
+ /* check dictMatchState repcode */
+ if (dictMode == ZSTD_dictMatchState
+ && ((U32)((prefixLowestIndex-1) - repIndex) >= 3 /* intentional underflow */)
+ && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) {
+ const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend;
+ mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4;
+ ip++;
+ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, 0, mLength-MINMATCH);
+ goto _match_stored;
+ }
+
+ /* check noDict repcode */
+ if ( dictMode == ZSTD_noDict
+ && ((offset_1 > 0) & (MEM_read32(ip+1-offset_1) == MEM_read32(ip+1)))) {
+ mLength = ZSTD_count(ip+1+4, ip+1+4-offset_1, iend) + 4;
+ ip++;
+ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, 0, mLength-MINMATCH);
+ goto _match_stored;
+ }
+
+ if (matchIndexL > prefixLowestIndex) {
+ /* check prefix long match */
+ if (MEM_read64(matchLong) == MEM_read64(ip)) {
+ mLength = ZSTD_count(ip+8, matchLong+8, iend) + 8;
+ offset = (U32)(ip-matchLong);
+ while (((ip>anchor) & (matchLong>prefixLowest)) && (ip[-1] == matchLong[-1])) { ip--; matchLong--; mLength++; } /* catch up */
+ goto _match_found;
+ }
+ } else if (dictMode == ZSTD_dictMatchState) {
+ /* check dictMatchState long match */
+ U32 const dictMatchIndexL = dictHashLong[dictHL];
+ const BYTE* dictMatchL = dictBase + dictMatchIndexL;
+ assert(dictMatchL < dictEnd);
+
+ if (dictMatchL > dictStart && MEM_read64(dictMatchL) == MEM_read64(ip)) {
+ mLength = ZSTD_count_2segments(ip+8, dictMatchL+8, iend, dictEnd, prefixLowest) + 8;
+ offset = (U32)(current - dictMatchIndexL - dictIndexDelta);
+ while (((ip>anchor) & (dictMatchL>dictStart)) && (ip[-1] == dictMatchL[-1])) { ip--; dictMatchL--; mLength++; } /* catch up */
+ goto _match_found;
+ } }
+
+ if (matchIndexS > prefixLowestIndex) {
+ /* check prefix short match */
+ if (MEM_read32(match) == MEM_read32(ip)) {
+ goto _search_next_long;
+ }
+ } else if (dictMode == ZSTD_dictMatchState) {
+ /* check dictMatchState short match */
+ U32 const dictMatchIndexS = dictHashSmall[dictHS];
+ match = dictBase + dictMatchIndexS;
+ matchIndexS = dictMatchIndexS + dictIndexDelta;
+
+ if (match > dictStart && MEM_read32(match) == MEM_read32(ip)) {
+ goto _search_next_long;
+ } }
+
+ ip += ((ip-anchor) >> kSearchStrength) + 1;
+#if defined(__aarch64__)
+ PREFETCH_L1(ip+256);
+#endif
+ continue;
+
+_search_next_long:
+
+ { size_t const hl3 = ZSTD_hashPtr(ip+1, hBitsL, 8);
+ size_t const dictHLNext = ZSTD_hashPtr(ip+1, dictHBitsL, 8);
+ U32 const matchIndexL3 = hashLong[hl3];
+ const BYTE* matchL3 = base + matchIndexL3;
+ hashLong[hl3] = current + 1;
+
+ /* check prefix long +1 match */
+ if (matchIndexL3 > prefixLowestIndex) {
+ if (MEM_read64(matchL3) == MEM_read64(ip+1)) {
+ mLength = ZSTD_count(ip+9, matchL3+8, iend) + 8;
+ ip++;
+ offset = (U32)(ip-matchL3);
+ while (((ip>anchor) & (matchL3>prefixLowest)) && (ip[-1] == matchL3[-1])) { ip--; matchL3--; mLength++; } /* catch up */
+ goto _match_found;
+ }
+ } else if (dictMode == ZSTD_dictMatchState) {
+ /* check dict long +1 match */
+ U32 const dictMatchIndexL3 = dictHashLong[dictHLNext];
+ const BYTE* dictMatchL3 = dictBase + dictMatchIndexL3;
+ assert(dictMatchL3 < dictEnd);
+ if (dictMatchL3 > dictStart && MEM_read64(dictMatchL3) == MEM_read64(ip+1)) {
+ mLength = ZSTD_count_2segments(ip+1+8, dictMatchL3+8, iend, dictEnd, prefixLowest) + 8;
+ ip++;
+ offset = (U32)(current + 1 - dictMatchIndexL3 - dictIndexDelta);
+ while (((ip>anchor) & (dictMatchL3>dictStart)) && (ip[-1] == dictMatchL3[-1])) { ip--; dictMatchL3--; mLength++; } /* catch up */
+ goto _match_found;
+ } } }
+
+ /* if no long +1 match, explore the short match we found */
+ if (dictMode == ZSTD_dictMatchState && matchIndexS < prefixLowestIndex) {
+ mLength = ZSTD_count_2segments(ip+4, match+4, iend, dictEnd, prefixLowest) + 4;
+ offset = (U32)(current - matchIndexS);
+ while (((ip>anchor) & (match>dictStart)) && (ip[-1] == match[-1])) { ip--; match--; mLength++; } /* catch up */
+ } else {
+ mLength = ZSTD_count(ip+4, match+4, iend) + 4;
+ offset = (U32)(ip - match);
+ while (((ip>anchor) & (match>prefixLowest)) && (ip[-1] == match[-1])) { ip--; match--; mLength++; } /* catch up */
+ }
+
+ /* fall-through */
+
+_match_found:
+ offset_2 = offset_1;
+ offset_1 = offset;
+
+ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, offset + ZSTD_REP_MOVE, mLength-MINMATCH);
+
+_match_stored:
+ /* match found */
+ ip += mLength;
+ anchor = ip;
+
+ if (ip <= ilimit) {
+ /* Complementary insertion */
+ /* done after iLimit test, as candidates could be > iend-8 */
+ { U32 const indexToInsert = current+2;
+ hashLong[ZSTD_hashPtr(base+indexToInsert, hBitsL, 8)] = indexToInsert;
+ hashLong[ZSTD_hashPtr(ip-2, hBitsL, 8)] = (U32)(ip-2-base);
+ hashSmall[ZSTD_hashPtr(base+indexToInsert, hBitsS, mls)] = indexToInsert;
+ hashSmall[ZSTD_hashPtr(ip-1, hBitsS, mls)] = (U32)(ip-1-base);
+ }
+
+ /* check immediate repcode */
+ if (dictMode == ZSTD_dictMatchState) {
+ while (ip <= ilimit) {
+ U32 const current2 = (U32)(ip-base);
+ U32 const repIndex2 = current2 - offset_2;
+ const BYTE* repMatch2 = dictMode == ZSTD_dictMatchState
+ && repIndex2 < prefixLowestIndex ?
+ dictBase + repIndex2 - dictIndexDelta :
+ base + repIndex2;
+ if ( ((U32)((prefixLowestIndex-1) - (U32)repIndex2) >= 3 /* intentional overflow */)
+ && (MEM_read32(repMatch2) == MEM_read32(ip)) ) {
+ const BYTE* const repEnd2 = repIndex2 < prefixLowestIndex ? dictEnd : iend;
+ size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixLowest) + 4;
+ U32 tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; /* swap offset_2 <=> offset_1 */
+ ZSTD_storeSeq(seqStore, 0, anchor, iend, 0, repLength2-MINMATCH);
+ hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = current2;
+ hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = current2;
+ ip += repLength2;
+ anchor = ip;
+ continue;
+ }
+ break;
+ } }
+
+ if (dictMode == ZSTD_noDict) {
+ while ( (ip <= ilimit)
+ && ( (offset_2>0)
+ & (MEM_read32(ip) == MEM_read32(ip - offset_2)) )) {
+ /* store sequence */
+ size_t const rLength = ZSTD_count(ip+4, ip+4-offset_2, iend) + 4;
+ U32 const tmpOff = offset_2; offset_2 = offset_1; offset_1 = tmpOff; /* swap offset_2 <=> offset_1 */
+ hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = (U32)(ip-base);
+ hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = (U32)(ip-base);
+ ZSTD_storeSeq(seqStore, 0, anchor, iend, 0, rLength-MINMATCH);
+ ip += rLength;
+ anchor = ip;
+ continue; /* faster when present ... (?) */
+ } } }
+ } /* while (ip < ilimit) */
+
+ /* save reps for next block */
+ rep[0] = offset_1 ? offset_1 : offsetSaved;
+ rep[1] = offset_2 ? offset_2 : offsetSaved;
+
+ /* Return the last literals size */
+ return (size_t)(iend - anchor);
+}
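+
+/* Illustrative sketch (not part of upstream zstd; kept under #if 0) of the
+ * two-slot repeat-offset history manipulated above: storing a freshly found
+ * offset pushes the previous offset_1 into offset_2, while reusing the second
+ * repeat offset simply swaps the two slots. The example_* names are hypothetical. */
+#if 0
+typedef struct { U32 offset_1, offset_2; } example_rep_history;
+
+static void example_push_new_offset(example_rep_history* h, U32 newOffset)
+{
+    h->offset_2 = h->offset_1;   /* previous most-recent offset becomes the second choice */
+    h->offset_1 = newOffset;
+}
+
+static void example_use_second_rep(example_rep_history* h)
+{
+    U32 const tmp = h->offset_2; /* the offset just reused becomes the most recent one */
+    h->offset_2 = h->offset_1;
+    h->offset_1 = tmp;
+}
+#endif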
+
+
+size_t ZSTD_compressBlock_doubleFast(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize)
+{
+ const U32 mls = ms->cParams.minMatch;
+ switch(mls)
+ {
+ default: /* includes case 3 */
+ case 4 :
+ return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, src, srcSize, 4, ZSTD_noDict);
+ case 5 :
+ return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, src, srcSize, 5, ZSTD_noDict);
+ case 6 :
+ return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, src, srcSize, 6, ZSTD_noDict);
+ case 7 :
+ return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, src, srcSize, 7, ZSTD_noDict);
+ }
+}
+
+
+size_t ZSTD_compressBlock_doubleFast_dictMatchState(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize)
+{
+ const U32 mls = ms->cParams.minMatch;
+ switch(mls)
+ {
+ default: /* includes case 3 */
+ case 4 :
+ return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, src, srcSize, 4, ZSTD_dictMatchState);
+ case 5 :
+ return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, src, srcSize, 5, ZSTD_dictMatchState);
+ case 6 :
+ return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, src, srcSize, 6, ZSTD_dictMatchState);
+ case 7 :
+ return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, src, srcSize, 7, ZSTD_dictMatchState);
+ }
+}
+
+
+static size_t ZSTD_compressBlock_doubleFast_extDict_generic(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize,
+ U32 const mls /* template */)
+{
+ ZSTD_compressionParameters const* cParams = &ms->cParams;
+ U32* const hashLong = ms->hashTable;
+ U32 const hBitsL = cParams->hashLog;
+ U32* const hashSmall = ms->chainTable;
+ U32 const hBitsS = cParams->chainLog;
+ const BYTE* const istart = (const BYTE*)src;
+ const BYTE* ip = istart;
+ const BYTE* anchor = istart;
+ const BYTE* const iend = istart + srcSize;
+ const BYTE* const ilimit = iend - 8;
+ const BYTE* const base = ms->window.base;
+ const U32 endIndex = (U32)((size_t)(istart - base) + srcSize);
+ const U32 lowLimit = ZSTD_getLowestMatchIndex(ms, endIndex, cParams->windowLog);
+ const U32 dictStartIndex = lowLimit;
+ const U32 dictLimit = ms->window.dictLimit;
+ const U32 prefixStartIndex = (dictLimit > lowLimit) ? dictLimit : lowLimit;
+ const BYTE* const prefixStart = base + prefixStartIndex;
+ const BYTE* const dictBase = ms->window.dictBase;
+ const BYTE* const dictStart = dictBase + dictStartIndex;
+ const BYTE* const dictEnd = dictBase + prefixStartIndex;
+ U32 offset_1=rep[0], offset_2=rep[1];
+
+ DEBUGLOG(5, "ZSTD_compressBlock_doubleFast_extDict_generic (srcSize=%zu)", srcSize);
+
+ /* if extDict is invalidated due to maxDistance, switch to "regular" variant */
+ if (prefixStartIndex == dictStartIndex)
+ return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, src, srcSize, mls, ZSTD_noDict);
+
+ /* Search Loop */
+ while (ip < ilimit) { /* < instead of <=, because repcode check at (ip+1) */
+ const size_t hSmall = ZSTD_hashPtr(ip, hBitsS, mls);
+ const U32 matchIndex = hashSmall[hSmall];
+ const BYTE* const matchBase = matchIndex < prefixStartIndex ? dictBase : base;
+ const BYTE* match = matchBase + matchIndex;
+
+ const size_t hLong = ZSTD_hashPtr(ip, hBitsL, 8);
+ const U32 matchLongIndex = hashLong[hLong];
+ const BYTE* const matchLongBase = matchLongIndex < prefixStartIndex ? dictBase : base;
+ const BYTE* matchLong = matchLongBase + matchLongIndex;
+
+ const U32 current = (U32)(ip-base);
+ const U32 repIndex = current + 1 - offset_1; /* offset_1 expected <= current +1 */
+ const BYTE* const repBase = repIndex < prefixStartIndex ? dictBase : base;
+ const BYTE* const repMatch = repBase + repIndex;
+ size_t mLength;
+ hashSmall[hSmall] = hashLong[hLong] = current; /* update hash table */
+
+ if ((((U32)((prefixStartIndex-1) - repIndex) >= 3) /* intentional underflow : ensure repIndex doesn't overlap dict + prefix */
+ & (repIndex > dictStartIndex))
+ && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) {
+ const BYTE* repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend;
+ mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixStart) + 4;
+ ip++;
+ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, 0, mLength-MINMATCH);
+ } else {
+ if ((matchLongIndex > dictStartIndex) && (MEM_read64(matchLong) == MEM_read64(ip))) {
+ const BYTE* const matchEnd = matchLongIndex < prefixStartIndex ? dictEnd : iend;
+ const BYTE* const lowMatchPtr = matchLongIndex < prefixStartIndex ? dictStart : prefixStart;
+ U32 offset;
+ mLength = ZSTD_count_2segments(ip+8, matchLong+8, iend, matchEnd, prefixStart) + 8;
+ offset = current - matchLongIndex;
+ while (((ip>anchor) & (matchLong>lowMatchPtr)) && (ip[-1] == matchLong[-1])) { ip--; matchLong--; mLength++; } /* catch up */
+ offset_2 = offset_1;
+ offset_1 = offset;
+ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, offset + ZSTD_REP_MOVE, mLength-MINMATCH);
+
+ } else if ((matchIndex > dictStartIndex) && (MEM_read32(match) == MEM_read32(ip))) {
+ size_t const h3 = ZSTD_hashPtr(ip+1, hBitsL, 8);
+ U32 const matchIndex3 = hashLong[h3];
+ const BYTE* const match3Base = matchIndex3 < prefixStartIndex ? dictBase : base;
+ const BYTE* match3 = match3Base + matchIndex3;
+ U32 offset;
+ hashLong[h3] = current + 1;
+ if ( (matchIndex3 > dictStartIndex) && (MEM_read64(match3) == MEM_read64(ip+1)) ) {
+ const BYTE* const matchEnd = matchIndex3 < prefixStartIndex ? dictEnd : iend;
+ const BYTE* const lowMatchPtr = matchIndex3 < prefixStartIndex ? dictStart : prefixStart;
+ mLength = ZSTD_count_2segments(ip+9, match3+8, iend, matchEnd, prefixStart) + 8;
+ ip++;
+ offset = current+1 - matchIndex3;
+ while (((ip>anchor) & (match3>lowMatchPtr)) && (ip[-1] == match3[-1])) { ip--; match3--; mLength++; } /* catch up */
+ } else {
+ const BYTE* const matchEnd = matchIndex < prefixStartIndex ? dictEnd : iend;
+ const BYTE* const lowMatchPtr = matchIndex < prefixStartIndex ? dictStart : prefixStart;
+ mLength = ZSTD_count_2segments(ip+4, match+4, iend, matchEnd, prefixStart) + 4;
+ offset = current - matchIndex;
+ while (((ip>anchor) & (match>lowMatchPtr)) && (ip[-1] == match[-1])) { ip--; match--; mLength++; } /* catch up */
+ }
+ offset_2 = offset_1;
+ offset_1 = offset;
+ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, offset + ZSTD_REP_MOVE, mLength-MINMATCH);
+
+ } else {
+ ip += ((ip-anchor) >> kSearchStrength) + 1;
+ continue;
+ } }
+
+ /* move to next sequence start */
+ ip += mLength;
+ anchor = ip;
+
+ if (ip <= ilimit) {
+ /* Complementary insertion */
+ /* done after iLimit test, as candidates could be > iend-8 */
+ { U32 const indexToInsert = current+2;
+ hashLong[ZSTD_hashPtr(base+indexToInsert, hBitsL, 8)] = indexToInsert;
+ hashLong[ZSTD_hashPtr(ip-2, hBitsL, 8)] = (U32)(ip-2-base);
+ hashSmall[ZSTD_hashPtr(base+indexToInsert, hBitsS, mls)] = indexToInsert;
+ hashSmall[ZSTD_hashPtr(ip-1, hBitsS, mls)] = (U32)(ip-1-base);
+ }
+
+ /* check immediate repcode */
+ while (ip <= ilimit) {
+ U32 const current2 = (U32)(ip-base);
+ U32 const repIndex2 = current2 - offset_2;
+ const BYTE* repMatch2 = repIndex2 < prefixStartIndex ? dictBase + repIndex2 : base + repIndex2;
+ if ( (((U32)((prefixStartIndex-1) - repIndex2) >= 3) /* intentional overflow : ensure repIndex2 doesn't overlap dict + prefix */
+ & (repIndex2 > dictStartIndex))
+ && (MEM_read32(repMatch2) == MEM_read32(ip)) ) {
+ const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend;
+ size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixStart) + 4;
+ U32 const tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; /* swap offset_2 <=> offset_1 */
+ ZSTD_storeSeq(seqStore, 0, anchor, iend, 0, repLength2-MINMATCH);
+ hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = current2;
+ hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = current2;
+ ip += repLength2;
+ anchor = ip;
+ continue;
+ }
+ break;
+ } } }
+
+ /* save reps for next block */
+ rep[0] = offset_1;
+ rep[1] = offset_2;
+
+ /* Return the last literals size */
+ return (size_t)(iend - anchor);
+}
+
+
+size_t ZSTD_compressBlock_doubleFast_extDict(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize)
+{
+ U32 const mls = ms->cParams.minMatch;
+ switch(mls)
+ {
+ default: /* includes case 3 */
+ case 4 :
+ return ZSTD_compressBlock_doubleFast_extDict_generic(ms, seqStore, rep, src, srcSize, 4);
+ case 5 :
+ return ZSTD_compressBlock_doubleFast_extDict_generic(ms, seqStore, rep, src, srcSize, 5);
+ case 6 :
+ return ZSTD_compressBlock_doubleFast_extDict_generic(ms, seqStore, rep, src, srcSize, 6);
+ case 7 :
+ return ZSTD_compressBlock_doubleFast_extDict_generic(ms, seqStore, rep, src, srcSize, 7);
+ }
+}
+/**** ended inlining compress/zstd_double_fast.c ****/
+/**** start inlining compress/zstd_fast.c ****/
+/*
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+/**** skipping file: zstd_compress_internal.h ****/
+/**** skipping file: zstd_fast.h ****/
+
+
+void ZSTD_fillHashTable(ZSTD_matchState_t* ms,
+ const void* const end,
+ ZSTD_dictTableLoadMethod_e dtlm)
+{
+ const ZSTD_compressionParameters* const cParams = &ms->cParams;
+ U32* const hashTable = ms->hashTable;
+ U32 const hBits = cParams->hashLog;
+ U32 const mls = cParams->minMatch;
+ const BYTE* const base = ms->window.base;
+ const BYTE* ip = base + ms->nextToUpdate;
+ const BYTE* const iend = ((const BYTE*)end) - HASH_READ_SIZE;
+ const U32 fastHashFillStep = 3;
+
+ /* Always insert every fastHashFillStep-th position into the hash table.
+ * Insert the intermediate positions only if their hash entry is empty.
+ */
+ for ( ; ip + fastHashFillStep < iend + 2; ip += fastHashFillStep) {
+ U32 const current = (U32)(ip - base);
+ size_t const hash0 = ZSTD_hashPtr(ip, hBits, mls);
+ hashTable[hash0] = current;
+ if (dtlm == ZSTD_dtlm_fast) continue;
+ /* Only load extra positions for ZSTD_dtlm_full */
+ { U32 p;
+ for (p = 1; p < fastHashFillStep; ++p) {
+ size_t const hash = ZSTD_hashPtr(ip + p, hBits, mls);
+ if (hashTable[hash] == 0) { /* not yet filled */
+ hashTable[hash] = current + p;
+ } } } }
+}
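+
+/* Illustrative toy sketch (not part of upstream zstd; kept under #if 0) of the
+ * fill pattern used by ZSTD_fillHashTable() above, on a plain array with a toy
+ * hash: every fastHashFillStep-th position is always inserted, the positions
+ * in between only when their slot is still empty and a full dictionary load
+ * was requested. All names and the hash function are hypothetical. */
+#if 0
+static void example_fill_pattern(U32* table, size_t tableSize,
+                                 const BYTE* buf, size_t bufSize, int loadFull)
+{
+    size_t const step = 3;   /* mirrors fastHashFillStep */
+    size_t pos;
+    for (pos = 0; pos + step <= bufSize; pos += step) {
+        size_t i;
+        for (i = 0; i < step; i++) {
+            size_t const slot = (buf[pos + i] * 2654435761u) % tableSize;   /* toy hash */
+            if (i == 0)
+                table[slot] = (U32)(pos + i);        /* anchor positions : always inserted */
+            else if (loadFull && table[slot] == 0)
+                table[slot] = (U32)(pos + i);        /* extra positions : only if slot empty */
+        }
+    }
+}
+#endif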
+
+
+FORCE_INLINE_TEMPLATE size_t
+ZSTD_compressBlock_fast_generic(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize,
+ U32 const mls)
+{
+ const ZSTD_compressionParameters* const cParams = &ms->cParams;
+ U32* const hashTable = ms->hashTable;
+ U32 const hlog = cParams->hashLog;
+ /* support stepSize of 0 */
+ size_t const stepSize = cParams->targetLength + !(cParams->targetLength) + 1;
+ const BYTE* const base = ms->window.base;
+ const BYTE* const istart = (const BYTE*)src;
+ /* We check ip0 (ip + 0) and ip1 (ip + 1) each loop */
+ const BYTE* ip0 = istart;
+ const BYTE* ip1;
+ const BYTE* anchor = istart;
+ const U32 endIndex = (U32)((size_t)(istart - base) + srcSize);
+ const U32 prefixStartIndex = ZSTD_getLowestPrefixIndex(ms, endIndex, cParams->windowLog);
+ const BYTE* const prefixStart = base + prefixStartIndex;
+ const BYTE* const iend = istart + srcSize;
+ const BYTE* const ilimit = iend - HASH_READ_SIZE;
+ U32 offset_1=rep[0], offset_2=rep[1];
+ U32 offsetSaved = 0;
+
+ /* init */
+ DEBUGLOG(5, "ZSTD_compressBlock_fast_generic");
+ ip0 += (ip0 == prefixStart);
+ ip1 = ip0 + 1;
+ { U32 const current = (U32)(ip0 - base);
+ U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, current, cParams->windowLog);
+ U32 const maxRep = current - windowLow;
+ if (offset_2 > maxRep) offsetSaved = offset_2, offset_2 = 0;
+ if (offset_1 > maxRep) offsetSaved = offset_1, offset_1 = 0;
+ }
+
+ /* Main Search Loop */
+#ifdef __INTEL_COMPILER
+ /* From Intel: 'The vector pragma indicates that the loop should be
+ * vectorized if it is legal to do so'. It can be used together with
+ * #pragma ivdep, but we have opted to exclude that because Intel
+ * warns against using it. */
+ #pragma vector always
+#endif
+ while (ip1 < ilimit) { /* < instead of <=, because check at ip0+2 */
+ size_t mLength;
+ BYTE const* ip2 = ip0 + 2;
+ size_t const h0 = ZSTD_hashPtr(ip0, hlog, mls);
+ U32 const val0 = MEM_read32(ip0);
+ size_t const h1 = ZSTD_hashPtr(ip1, hlog, mls);
+ U32 const val1 = MEM_read32(ip1);
+ U32 const current0 = (U32)(ip0-base);
+ U32 const current1 = (U32)(ip1-base);
+ U32 const matchIndex0 = hashTable[h0];
+ U32 const matchIndex1 = hashTable[h1];
+ BYTE const* repMatch = ip2 - offset_1;
+ const BYTE* match0 = base + matchIndex0;
+ const BYTE* match1 = base + matchIndex1;
+ U32 offcode;
+
+#if defined(__aarch64__)
+ PREFETCH_L1(ip0+256);
+#endif
+
+ hashTable[h0] = current0; /* update hash table */
+ hashTable[h1] = current1; /* update hash table */
+
+ assert(ip0 + 1 == ip1);
+
+ if ((offset_1 > 0) & (MEM_read32(repMatch) == MEM_read32(ip2))) {
+ mLength = (ip2[-1] == repMatch[-1]) ? 1 : 0;
+ ip0 = ip2 - mLength;
+ match0 = repMatch - mLength;
+ mLength += 4;
+ offcode = 0;
+ goto _match;
+ }
+ if ((matchIndex0 > prefixStartIndex) && MEM_read32(match0) == val0) {
+ /* found a regular match */
+ goto _offset;
+ }
+ if ((matchIndex1 > prefixStartIndex) && MEM_read32(match1) == val1) {
+ /* found a regular match after one literal */
+ ip0 = ip1;
+ match0 = match1;
+ goto _offset;
+ }
+ { size_t const step = ((size_t)(ip0-anchor) >> (kSearchStrength - 1)) + stepSize;
+ assert(step >= 2);
+ ip0 += step;
+ ip1 += step;
+ continue;
+ }
+_offset: /* Requires: ip0, match0 */
+ /* Compute the offset code */
+ offset_2 = offset_1;
+ offset_1 = (U32)(ip0-match0);
+ offcode = offset_1 + ZSTD_REP_MOVE;
+ mLength = 4;
+ /* Count the backwards match length */
+ while (((ip0>anchor) & (match0>prefixStart))
+ && (ip0[-1] == match0[-1])) { ip0--; match0--; mLength++; } /* catch up */
+
+_match: /* Requires: ip0, match0, offcode */
+ /* Count the forward length */
+ mLength += ZSTD_count(ip0+mLength, match0+mLength, iend);
+ ZSTD_storeSeq(seqStore, (size_t)(ip0-anchor), anchor, iend, offcode, mLength-MINMATCH);
+ /* match found */
+ ip0 += mLength;
+ anchor = ip0;
+
+ if (ip0 <= ilimit) {
+ /* Fill Table */
+ assert(base+current0+2 > istart); /* check base overflow */
+ hashTable[ZSTD_hashPtr(base+current0+2, hlog, mls)] = current0+2; /* here because current+2 could be > iend-8 */
+ hashTable[ZSTD_hashPtr(ip0-2, hlog, mls)] = (U32)(ip0-2-base);
+
+ if (offset_2 > 0) { /* offset_2==0 means offset_2 is invalidated */
+ while ( (ip0 <= ilimit) && (MEM_read32(ip0) == MEM_read32(ip0 - offset_2)) ) {
+ /* store sequence */
+ size_t const rLength = ZSTD_count(ip0+4, ip0+4-offset_2, iend) + 4;
+ { U32 const tmpOff = offset_2; offset_2 = offset_1; offset_1 = tmpOff; } /* swap offset_2 <=> offset_1 */
+ hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = (U32)(ip0-base);
+ ip0 += rLength;
+ ZSTD_storeSeq(seqStore, 0 /*litLen*/, anchor, iend, 0 /*offCode*/, rLength-MINMATCH);
+ anchor = ip0;
+ continue; /* faster when present (confirmed on gcc-8) ... (?) */
+ } } }
+ ip1 = ip0 + 1;
+ }
+
+ /* save reps for next block */
+ rep[0] = offset_1 ? offset_1 : offsetSaved;
+ rep[1] = offset_2 ? offset_2 : offsetSaved;
+
+ /* Return the last literals size */
+ return (size_t)(iend - anchor);
+}
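+
+/* Illustrative sketch (not part of upstream zstd; kept under #if 0) of the
+ * skip-ahead heuristic in the search loop above: the step grows with the
+ * distance already scanned since the last match (anchor), so long literal runs
+ * are skimmed increasingly fast. The worked values assume a search strength of
+ * 8 and the minimum stepSize of 2 computed above. */
+#if 0
+static size_t example_search_step(size_t distanceFromAnchor, size_t searchStrength, size_t stepSize)
+{
+    /* mirrors: ((ip0 - anchor) >> (kSearchStrength - 1)) + stepSize */
+    return (distanceFromAnchor >> (searchStrength - 1)) + stepSize;
+}
+/* e.g. with searchStrength == 8 and stepSize == 2 :
+ *   distance   0  ->  step 2
+ *   distance 128  ->  step 3
+ *   distance 512  ->  step 6
+ */
+#endif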
+
+
+size_t ZSTD_compressBlock_fast(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize)
+{
+ U32 const mls = ms->cParams.minMatch;
+ assert(ms->dictMatchState == NULL);
+ switch(mls)
+ {
+ default: /* includes case 3 */
+ case 4 :
+ return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, 4);
+ case 5 :
+ return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, 5);
+ case 6 :
+ return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, 6);
+ case 7 :
+ return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, 7);
+ }
+}
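+
+/* Illustrative sketch (not part of upstream zstd; kept under #if 0) of the
+ * template-by-switch idiom used by the dispatchers above: the generic worker
+ * is force-inlined with its `mls`-like parameter as a compile-time constant,
+ * and the entry point switches on the runtime value so each case instantiates
+ * a specialised copy. The example_* names are hypothetical. */
+#if 0
+FORCE_INLINE_TEMPLATE size_t example_generic(const BYTE* p, size_t n, U32 const width)
+{
+    size_t acc = 0, i;
+    for (i = 0; i + width <= n; i += width) acc += p[i];   /* body specialised per width */
+    return acc;
+}
+
+static size_t example_dispatch(const BYTE* p, size_t n, U32 width)
+{
+    switch(width)
+    {
+    default: /* includes out-of-range values */
+    case 4 : return example_generic(p, n, 4);
+    case 5 : return example_generic(p, n, 5);
+    case 6 : return example_generic(p, n, 6);
+    case 7 : return example_generic(p, n, 7);
+    }
+}
+#endif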
+
+FORCE_INLINE_TEMPLATE
+size_t ZSTD_compressBlock_fast_dictMatchState_generic(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize, U32 const mls)
+{
+ const ZSTD_compressionParameters* const cParams = &ms->cParams;
+ U32* const hashTable = ms->hashTable;
+ U32 const hlog = cParams->hashLog;
+ /* support stepSize of 0 */
+ U32 const stepSize = cParams->targetLength + !(cParams->targetLength);
+ const BYTE* const base = ms->window.base;
+ const BYTE* const istart = (const BYTE*)src;
+ const BYTE* ip = istart;
+ const BYTE* anchor = istart;
+ const U32 prefixStartIndex = ms->window.dictLimit;
+ const BYTE* const prefixStart = base + prefixStartIndex;
+ const BYTE* const iend = istart + srcSize;
+ const BYTE* const ilimit = iend - HASH_READ_SIZE;
+ U32 offset_1=rep[0], offset_2=rep[1];
+ U32 offsetSaved = 0;
+
+ const ZSTD_matchState_t* const dms = ms->dictMatchState;
+ const ZSTD_compressionParameters* const dictCParams = &dms->cParams ;
+ const U32* const dictHashTable = dms->hashTable;
+ const U32 dictStartIndex = dms->window.dictLimit;
+ const BYTE* const dictBase = dms->window.base;
+ const BYTE* const dictStart = dictBase + dictStartIndex;
+ const BYTE* const dictEnd = dms->window.nextSrc;
+ const U32 dictIndexDelta = prefixStartIndex - (U32)(dictEnd - dictBase);
+ const U32 dictAndPrefixLength = (U32)(ip - prefixStart + dictEnd - dictStart);
+ const U32 dictHLog = dictCParams->hashLog;
+
+ /* if a dictionary is still attached, it necessarily means that
+ * it is within the window size, so we simply assert it below. */
+ const U32 maxDistance = 1U << cParams->windowLog;
+ const U32 endIndex = (U32)((size_t)(ip - base) + srcSize);
+ assert(endIndex - prefixStartIndex <= maxDistance);
+ (void)maxDistance; (void)endIndex; /* these variables are not used when assert() is disabled */
+
+ /* ensure there will be no underflow
+ * when translating a dict index into a local index */
+ assert(prefixStartIndex >= (U32)(dictEnd - dictBase));
+
+ /* init */
+ DEBUGLOG(5, "ZSTD_compressBlock_fast_dictMatchState_generic");
+ ip += (dictAndPrefixLength == 0);
+ /* dictMatchState repCode checks don't currently handle repCode == 0
+ * disabling. */
+ assert(offset_1 <= dictAndPrefixLength);
+ assert(offset_2 <= dictAndPrefixLength);
+
+ /* Main Search Loop */
+ while (ip < ilimit) { /* < instead of <=, because repcode check at (ip+1) */
+ size_t mLength;
+ size_t const h = ZSTD_hashPtr(ip, hlog, mls);
+ U32 const current = (U32)(ip-base);
+ U32 const matchIndex = hashTable[h];
+ const BYTE* match = base + matchIndex;
+ const U32 repIndex = current + 1 - offset_1;
+ const BYTE* repMatch = (repIndex < prefixStartIndex) ?
+ dictBase + (repIndex - dictIndexDelta) :
+ base + repIndex;
+ hashTable[h] = current; /* update hash table */
+
+ if ( ((U32)((prefixStartIndex-1) - repIndex) >= 3) /* intentional underflow : ensure repIndex isn't overlapping dict + prefix */
+ && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) {
+ const BYTE* const repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend;
+ mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixStart) + 4;
+ ip++;
+ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, 0, mLength-MINMATCH);
+ } else if ( (matchIndex <= prefixStartIndex) ) {
+ size_t const dictHash = ZSTD_hashPtr(ip, dictHLog, mls);
+ U32 const dictMatchIndex = dictHashTable[dictHash];
+ const BYTE* dictMatch = dictBase + dictMatchIndex;
+ if (dictMatchIndex <= dictStartIndex ||
+ MEM_read32(dictMatch) != MEM_read32(ip)) {
+ assert(stepSize >= 1);
+ ip += ((ip-anchor) >> kSearchStrength) + stepSize;
+ continue;
+ } else {
+ /* found a dict match */
+ U32 const offset = (U32)(current-dictMatchIndex-dictIndexDelta);
+ mLength = ZSTD_count_2segments(ip+4, dictMatch+4, iend, dictEnd, prefixStart) + 4;
+ while (((ip>anchor) & (dictMatch>dictStart))
+ && (ip[-1] == dictMatch[-1])) {
+ ip--; dictMatch--; mLength++;
+ } /* catch up */
+ offset_2 = offset_1;
+ offset_1 = offset;
+ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, offset + ZSTD_REP_MOVE, mLength-MINMATCH);
+ }
+ } else if (MEM_read32(match) != MEM_read32(ip)) {
+ /* it's not a match, and we're not going to check the dictionary */
+ assert(stepSize >= 1);
+ ip += ((ip-anchor) >> kSearchStrength) + stepSize;
+ continue;
+ } else {
+ /* found a regular match */
+ U32 const offset = (U32)(ip-match);
+ mLength = ZSTD_count(ip+4, match+4, iend) + 4;
+ while (((ip>anchor) & (match>prefixStart))
+ && (ip[-1] == match[-1])) { ip--; match--; mLength++; } /* catch up */
+ offset_2 = offset_1;
+ offset_1 = offset;
+ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, offset + ZSTD_REP_MOVE, mLength-MINMATCH);
+ }
+
+ /* match found */
+ ip += mLength;
+ anchor = ip;
+
+ if (ip <= ilimit) {
+ /* Fill Table */
+ assert(base+current+2 > istart); /* check base overflow */
+ hashTable[ZSTD_hashPtr(base+current+2, hlog, mls)] = current+2; /* here because current+2 could be > iend-8 */
+ hashTable[ZSTD_hashPtr(ip-2, hlog, mls)] = (U32)(ip-2-base);
+
+ /* check immediate repcode */
+ while (ip <= ilimit) {
+ U32 const current2 = (U32)(ip-base);
+ U32 const repIndex2 = current2 - offset_2;
+ const BYTE* repMatch2 = repIndex2 < prefixStartIndex ?
+ dictBase - dictIndexDelta + repIndex2 :
+ base + repIndex2;
+ if ( ((U32)((prefixStartIndex-1) - (U32)repIndex2) >= 3 /* intentional overflow */)
+ && (MEM_read32(repMatch2) == MEM_read32(ip)) ) {
+ const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend;
+ size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixStart) + 4;
+ U32 tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; /* swap offset_2 <=> offset_1 */
+ ZSTD_storeSeq(seqStore, 0, anchor, iend, 0, repLength2-MINMATCH);
+ hashTable[ZSTD_hashPtr(ip, hlog, mls)] = current2;
+ ip += repLength2;
+ anchor = ip;
+ continue;
+ }
+ break;
+ }
+ }
+ }
+
+ /* save reps for next block */
+ rep[0] = offset_1 ? offset_1 : offsetSaved;
+ rep[1] = offset_2 ? offset_2 : offsetSaved;
+
+ /* Return the last literals size */
+ return (size_t)(iend - anchor);
+}
+
+size_t ZSTD_compressBlock_fast_dictMatchState(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize)
+{
+ U32 const mls = ms->cParams.minMatch;
+ assert(ms->dictMatchState != NULL);
+ switch(mls)
+ {
+ default: /* includes case 3 */
+ case 4 :
+ return ZSTD_compressBlock_fast_dictMatchState_generic(ms, seqStore, rep, src, srcSize, 4);
+ case 5 :
+ return ZSTD_compressBlock_fast_dictMatchState_generic(ms, seqStore, rep, src, srcSize, 5);
+ case 6 :
+ return ZSTD_compressBlock_fast_dictMatchState_generic(ms, seqStore, rep, src, srcSize, 6);
+ case 7 :
+ return ZSTD_compressBlock_fast_dictMatchState_generic(ms, seqStore, rep, src, srcSize, 7);
+ }
+}
+
+
+static size_t ZSTD_compressBlock_fast_extDict_generic(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize, U32 const mls)
+{
+ const ZSTD_compressionParameters* const cParams = &ms->cParams;
+ U32* const hashTable = ms->hashTable;
+ U32 const hlog = cParams->hashLog;
+ /* support stepSize of 0 */
+ U32 const stepSize = cParams->targetLength + !(cParams->targetLength);
+ const BYTE* const base = ms->window.base;
+ const BYTE* const dictBase = ms->window.dictBase;
+ const BYTE* const istart = (const BYTE*)src;
+ const BYTE* ip = istart;
+ const BYTE* anchor = istart;
+ const U32 endIndex = (U32)((size_t)(istart - base) + srcSize);
+ const U32 lowLimit = ZSTD_getLowestMatchIndex(ms, endIndex, cParams->windowLog);
+ const U32 dictStartIndex = lowLimit;
+ const BYTE* const dictStart = dictBase + dictStartIndex;
+ const U32 dictLimit = ms->window.dictLimit;
+ const U32 prefixStartIndex = dictLimit < lowLimit ? lowLimit : dictLimit;
+ const BYTE* const prefixStart = base + prefixStartIndex;
+ const BYTE* const dictEnd = dictBase + prefixStartIndex;
+ const BYTE* const iend = istart + srcSize;
+ const BYTE* const ilimit = iend - 8;
+ U32 offset_1=rep[0], offset_2=rep[1];
+
+ DEBUGLOG(5, "ZSTD_compressBlock_fast_extDict_generic (offset_1=%u)", offset_1);
+
+ /* switch to "regular" variant if extDict is invalidated due to maxDistance */
+ if (prefixStartIndex == dictStartIndex)
+ return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, mls);
+
+ /* Search Loop */
+ while (ip < ilimit) { /* < instead of <=, because repcode check at (ip+1) */
+ const size_t h = ZSTD_hashPtr(ip, hlog, mls);
+ const U32 matchIndex = hashTable[h];
+ const BYTE* const matchBase = matchIndex < prefixStartIndex ? dictBase : base;
+ const BYTE* match = matchBase + matchIndex;
+ const U32 current = (U32)(ip-base);
+ const U32 repIndex = current + 1 - offset_1;
+ const BYTE* const repBase = repIndex < prefixStartIndex ? dictBase : base;
+ const BYTE* const repMatch = repBase + repIndex;
+ hashTable[h] = current; /* update hash table */
+ DEBUGLOG(7, "offset_1 = %u , current = %u", offset_1, current);
+ assert(offset_1 <= current +1); /* check repIndex */
+
+ if ( (((U32)((prefixStartIndex-1) - repIndex) >= 3) /* intentional underflow */ & (repIndex > dictStartIndex))
+ && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) {
+ const BYTE* const repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend;
+ size_t const rLength = ZSTD_count_2segments(ip+1 +4, repMatch +4, iend, repMatchEnd, prefixStart) + 4;
+ ip++;
+ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, 0, rLength-MINMATCH);
+ ip += rLength;
+ anchor = ip;
+ } else {
+ if ( (matchIndex < dictStartIndex) ||
+ (MEM_read32(match) != MEM_read32(ip)) ) {
+ assert(stepSize >= 1);
+ ip += ((ip-anchor) >> kSearchStrength) + stepSize;
+ continue;
+ }
+ { const BYTE* const matchEnd = matchIndex < prefixStartIndex ? dictEnd : iend;
+ const BYTE* const lowMatchPtr = matchIndex < prefixStartIndex ? dictStart : prefixStart;
+ U32 const offset = current - matchIndex;
+ size_t mLength = ZSTD_count_2segments(ip+4, match+4, iend, matchEnd, prefixStart) + 4;
+ while (((ip>anchor) & (match>lowMatchPtr)) && (ip[-1] == match[-1])) { ip--; match--; mLength++; } /* catch up */
+ offset_2 = offset_1; offset_1 = offset; /* update offset history */
+ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, offset + ZSTD_REP_MOVE, mLength-MINMATCH);
+ ip += mLength;
+ anchor = ip;
+ } }
+
+ if (ip <= ilimit) {
+ /* Fill Table */
+ hashTable[ZSTD_hashPtr(base+current+2, hlog, mls)] = current+2;
+ hashTable[ZSTD_hashPtr(ip-2, hlog, mls)] = (U32)(ip-2-base);
+ /* check immediate repcode */
+ while (ip <= ilimit) {
+ U32 const current2 = (U32)(ip-base);
+ U32 const repIndex2 = current2 - offset_2;
+ const BYTE* const repMatch2 = repIndex2 < prefixStartIndex ? dictBase + repIndex2 : base + repIndex2;
+ if ( (((U32)((prefixStartIndex-1) - repIndex2) >= 3) & (repIndex2 > dictStartIndex)) /* intentional overflow */
+ && (MEM_read32(repMatch2) == MEM_read32(ip)) ) {
+ const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend;
+ size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixStart) + 4;
+ { U32 const tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; } /* swap offset_2 <=> offset_1 */
+ ZSTD_storeSeq(seqStore, 0 /*litlen*/, anchor, iend, 0 /*offcode*/, repLength2-MINMATCH);
+ hashTable[ZSTD_hashPtr(ip, hlog, mls)] = current2;
+ ip += repLength2;
+ anchor = ip;
+ continue;
+ }
+ break;
+ } } }
+
+ /* save reps for next block */
+ rep[0] = offset_1;
+ rep[1] = offset_2;
+
+ /* Return the last literals size */
+ return (size_t)(iend - anchor);
+}
+
+
+size_t ZSTD_compressBlock_fast_extDict(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize)
+{
+ U32 const mls = ms->cParams.minMatch;
+ switch(mls)
+ {
+ default: /* includes case 3 */
+ case 4 :
+ return ZSTD_compressBlock_fast_extDict_generic(ms, seqStore, rep, src, srcSize, 4);
+ case 5 :
+ return ZSTD_compressBlock_fast_extDict_generic(ms, seqStore, rep, src, srcSize, 5);
+ case 6 :
+ return ZSTD_compressBlock_fast_extDict_generic(ms, seqStore, rep, src, srcSize, 6);
+ case 7 :
+ return ZSTD_compressBlock_fast_extDict_generic(ms, seqStore, rep, src, srcSize, 7);
+ }
+}
+/**** ended inlining compress/zstd_fast.c ****/
+/**** start inlining compress/zstd_lazy.c ****/
+/*
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+/**** skipping file: zstd_compress_internal.h ****/
+/**** skipping file: zstd_lazy.h ****/
+
+
+/*-*************************************
+* Binary Tree search
+***************************************/
+
+static void
+ZSTD_updateDUBT(ZSTD_matchState_t* ms,
+ const BYTE* ip, const BYTE* iend,
+ U32 mls)
+{
+ const ZSTD_compressionParameters* const cParams = &ms->cParams;
+ U32* const hashTable = ms->hashTable;
+ U32 const hashLog = cParams->hashLog;
+
+ U32* const bt = ms->chainTable;
+ U32 const btLog = cParams->chainLog - 1;
+ U32 const btMask = (1 << btLog) - 1;
+
+ const BYTE* const base = ms->window.base;
+ U32 const target = (U32)(ip - base);
+ U32 idx = ms->nextToUpdate;
+
+ if (idx != target)
+ DEBUGLOG(7, "ZSTD_updateDUBT, from %u to %u (dictLimit:%u)",
+ idx, target, ms->window.dictLimit);
+ assert(ip + 8 <= iend); /* condition for ZSTD_hashPtr */
+ (void)iend;
+
+ assert(idx >= ms->window.dictLimit); /* condition for valid base+idx */
+ for ( ; idx < target ; idx++) {
+ size_t const h = ZSTD_hashPtr(base + idx, hashLog, mls); /* assumption : ip + 8 <= iend */
+ U32 const matchIndex = hashTable[h];
+
+ U32* const nextCandidatePtr = bt + 2*(idx&btMask);
+ U32* const sortMarkPtr = nextCandidatePtr + 1;
+
+ DEBUGLOG(8, "ZSTD_updateDUBT: insert %u", idx);
+ hashTable[h] = idx; /* Update Hash Table */
+ *nextCandidatePtr = matchIndex; /* update BT like a chain */
+ *sortMarkPtr = ZSTD_DUBT_UNSORTED_MARK;
+ }
+ ms->nextToUpdate = target;
+}
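+
+/* Layout note (for illustration): the DUBT stores, for each position idx, a pair of U32s at
+ * bt[2*(idx&btMask)] and bt[2*(idx&btMask)+1]. Right after the insertion above, slot 0 holds
+ * the previous hash-table candidate (a plain chain link) and slot 1 holds
+ * ZSTD_DUBT_UNSORTED_MARK; once ZSTD_insertDUBT1() sorts the position, the two slots become
+ * the indices of its smaller and larger children in the binary tree. */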
+
+
+/** ZSTD_insertDUBT1() :
+ * sort one already inserted but unsorted position
+ * assumption : current >= btLow == (current - btMask)
+ * doesn't fail */
+static void
+ZSTD_insertDUBT1(ZSTD_matchState_t* ms,
+ U32 current, const BYTE* inputEnd,
+ U32 nbCompares, U32 btLow,
+ const ZSTD_dictMode_e dictMode)
+{
+ const ZSTD_compressionParameters* const cParams = &ms->cParams;
+ U32* const bt = ms->chainTable;
+ U32 const btLog = cParams->chainLog - 1;
+ U32 const btMask = (1 << btLog) - 1;
+ size_t commonLengthSmaller=0, commonLengthLarger=0;
+ const BYTE* const base = ms->window.base;
+ const BYTE* const dictBase = ms->window.dictBase;
+ const U32 dictLimit = ms->window.dictLimit;
+ const BYTE* const ip = (current>=dictLimit) ? base + current : dictBase + current;
+ const BYTE* const iend = (current>=dictLimit) ? inputEnd : dictBase + dictLimit;
+ const BYTE* const dictEnd = dictBase + dictLimit;
+ const BYTE* const prefixStart = base + dictLimit;
+ const BYTE* match;
+ U32* smallerPtr = bt + 2*(current&btMask);
+ U32* largerPtr = smallerPtr + 1;
+ U32 matchIndex = *smallerPtr; /* this candidate is unsorted : next sorted candidate is reached through *smallerPtr, while *largerPtr contains previous unsorted candidate (which is already saved and can be overwritten) */
+ U32 dummy32; /* to be nullified at the end */
+ U32 const windowValid = ms->window.lowLimit;
+ U32 const maxDistance = 1U << cParams->windowLog;
+ U32 const windowLow = (current - windowValid > maxDistance) ? current - maxDistance : windowValid;
+
+
+ DEBUGLOG(8, "ZSTD_insertDUBT1(%u) (dictLimit=%u, lowLimit=%u)",
+ current, dictLimit, windowLow);
+ assert(current >= btLow);
+ assert(ip < iend); /* condition for ZSTD_count */
+
+ while (nbCompares-- && (matchIndex > windowLow)) {
+ U32* const nextPtr = bt + 2*(matchIndex & btMask);
+ size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger); /* guaranteed minimum nb of common bytes */
+ assert(matchIndex < current);
+ /* note : all candidates are now presumed sorted,
+ * but it's still possible to have nextPtr[1] == ZSTD_DUBT_UNSORTED_MARK
+ * when a real index has the same value as ZSTD_DUBT_UNSORTED_MARK */
+
+ if ( (dictMode != ZSTD_extDict)
+ || (matchIndex+matchLength >= dictLimit) /* both in current segment*/
+ || (current < dictLimit) /* both in extDict */) {
+ const BYTE* const mBase = ( (dictMode != ZSTD_extDict)
+ || (matchIndex+matchLength >= dictLimit)) ?
+ base : dictBase;
+ assert( (matchIndex+matchLength >= dictLimit) /* might be wrong if extDict is incorrectly set to 0 */
+ || (current < dictLimit) );
+ match = mBase + matchIndex;
+ matchLength += ZSTD_count(ip+matchLength, match+matchLength, iend);
+ } else {
+ match = dictBase + matchIndex;
+ matchLength += ZSTD_count_2segments(ip+matchLength, match+matchLength, iend, dictEnd, prefixStart);
+ if (matchIndex+matchLength >= dictLimit)
+ match = base + matchIndex; /* preparation for next read of match[matchLength] */
+ }
+
+ DEBUGLOG(8, "ZSTD_insertDUBT1: comparing %u with %u : found %u common bytes ",
+ current, matchIndex, (U32)matchLength);
+
+ if (ip+matchLength == iend) { /* equal : no way to know if inf or sup */
+ break; /* drop, to guarantee consistency; misses a bit of compression, but other solutions can corrupt the tree */
+ }
+
+ if (match[matchLength] < ip[matchLength]) { /* necessarily within buffer */
+ /* match is smaller than current */
+ *smallerPtr = matchIndex; /* update smaller idx */
+ commonLengthSmaller = matchLength; /* all smaller will now have at least this guaranteed common length */
+ if (matchIndex <= btLow) { smallerPtr=&dummy32; break; } /* beyond tree size, stop searching */
+ DEBUGLOG(8, "ZSTD_insertDUBT1: %u (>btLow=%u) is smaller : next => %u",
+ matchIndex, btLow, nextPtr[1]);
+ smallerPtr = nextPtr+1; /* new "candidate" => larger than match, which was smaller than target */
+ matchIndex = nextPtr[1]; /* new matchIndex, larger than previous and closer to current */
+ } else {
+ /* match is larger than current */
+ *largerPtr = matchIndex;
+ commonLengthLarger = matchLength;
+ if (matchIndex <= btLow) { largerPtr=&dummy32; break; } /* beyond tree size, stop searching */
+ DEBUGLOG(8, "ZSTD_insertDUBT1: %u (>btLow=%u) is larger => %u",
+ matchIndex, btLow, nextPtr[0]);
+ largerPtr = nextPtr;
+ matchIndex = nextPtr[0];
+ } }
+
+ *smallerPtr = *largerPtr = 0;
+}
+
+
+static size_t
+ZSTD_DUBT_findBetterDictMatch (
+ ZSTD_matchState_t* ms,
+ const BYTE* const ip, const BYTE* const iend,
+ size_t* offsetPtr,
+ size_t bestLength,
+ U32 nbCompares,
+ U32 const mls,
+ const ZSTD_dictMode_e dictMode)
+{
+ const ZSTD_matchState_t * const dms = ms->dictMatchState;
+ const ZSTD_compressionParameters* const dmsCParams = &dms->cParams;
+ const U32 * const dictHashTable = dms->hashTable;
+ U32 const hashLog = dmsCParams->hashLog;
+ size_t const h = ZSTD_hashPtr(ip, hashLog, mls);
+ U32 dictMatchIndex = dictHashTable[h];
+
+ const BYTE* const base = ms->window.base;
+ const BYTE* const prefixStart = base + ms->window.dictLimit;
+ U32 const current = (U32)(ip-base);
+ const BYTE* const dictBase = dms->window.base;
+ const BYTE* const dictEnd = dms->window.nextSrc;
+ U32 const dictHighLimit = (U32)(dms->window.nextSrc - dms->window.base);
+ U32 const dictLowLimit = dms->window.lowLimit;
+ U32 const dictIndexDelta = ms->window.lowLimit - dictHighLimit;
+
+ U32* const dictBt = dms->chainTable;
+ U32 const btLog = dmsCParams->chainLog - 1;
+ U32 const btMask = (1 << btLog) - 1;
+ U32 const btLow = (btMask >= dictHighLimit - dictLowLimit) ? dictLowLimit : dictHighLimit - btMask;
+
+ size_t commonLengthSmaller=0, commonLengthLarger=0;
+
+ (void)dictMode;
+ assert(dictMode == ZSTD_dictMatchState);
+
+ while (nbCompares-- && (dictMatchIndex > dictLowLimit)) {
+ U32* const nextPtr = dictBt + 2*(dictMatchIndex & btMask);
+ size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger); /* guaranteed minimum nb of common bytes */
+ const BYTE* match = dictBase + dictMatchIndex;
+ matchLength += ZSTD_count_2segments(ip+matchLength, match+matchLength, iend, dictEnd, prefixStart);
+ if (dictMatchIndex+matchLength >= dictHighLimit)
+ match = base + dictMatchIndex + dictIndexDelta; /* to prepare for next usage of match[matchLength] */
+
+ if (matchLength > bestLength) {
+ U32 matchIndex = dictMatchIndex + dictIndexDelta;
+ if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(current-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) ) {
+ DEBUGLOG(9, "ZSTD_DUBT_findBetterDictMatch(%u) : found better match length %u -> %u and offsetCode %u -> %u (dictMatchIndex %u, matchIndex %u)",
+ current, (U32)bestLength, (U32)matchLength, (U32)*offsetPtr, ZSTD_REP_MOVE + current - matchIndex, dictMatchIndex, matchIndex);
+ bestLength = matchLength, *offsetPtr = ZSTD_REP_MOVE + current - matchIndex;
+ }
+ if (ip+matchLength == iend) { /* reached end of input : ip[matchLength] is not valid, no way to know if it's larger or smaller than match */
+ break; /* drop, to guarantee consistency (miss a little bit of compression) */
+ }
+ }
+
+ if (match[matchLength] < ip[matchLength]) {
+ if (dictMatchIndex <= btLow) { break; } /* beyond tree size, stop the search */
+ commonLengthSmaller = matchLength; /* all smaller will now have at least this guaranteed common length */
+ dictMatchIndex = nextPtr[1]; /* new matchIndex larger than previous (closer to current) */
+ } else {
+ /* match is larger than current */
+ if (dictMatchIndex <= btLow) { break; } /* beyond tree size, stop the search */
+ commonLengthLarger = matchLength;
+ dictMatchIndex = nextPtr[0];
+ }
+ }
+
+ if (bestLength >= MINMATCH) {
+ U32 const mIndex = current - ((U32)*offsetPtr - ZSTD_REP_MOVE); (void)mIndex;
+ DEBUGLOG(8, "ZSTD_DUBT_findBetterDictMatch(%u) : found match of length %u and offsetCode %u (pos %u)",
+ current, (U32)bestLength, (U32)*offsetPtr, mIndex);
+ }
+ return bestLength;
+
+}
+
+
+static size_t
+ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms,
+ const BYTE* const ip, const BYTE* const iend,
+ size_t* offsetPtr,
+ U32 const mls,
+ const ZSTD_dictMode_e dictMode)
+{
+ const ZSTD_compressionParameters* const cParams = &ms->cParams;
+ U32* const hashTable = ms->hashTable;
+ U32 const hashLog = cParams->hashLog;
+ size_t const h = ZSTD_hashPtr(ip, hashLog, mls);
+ U32 matchIndex = hashTable[h];
+
+ const BYTE* const base = ms->window.base;
+ U32 const current = (U32)(ip-base);
+ U32 const windowLow = ZSTD_getLowestMatchIndex(ms, current, cParams->windowLog);
+
+ U32* const bt = ms->chainTable;
+ U32 const btLog = cParams->chainLog - 1;
+ U32 const btMask = (1 << btLog) - 1;
+ U32 const btLow = (btMask >= current) ? 0 : current - btMask;
+ U32 const unsortLimit = MAX(btLow, windowLow);
+
+ U32* nextCandidate = bt + 2*(matchIndex&btMask);
+ U32* unsortedMark = bt + 2*(matchIndex&btMask) + 1;
+ U32 nbCompares = 1U << cParams->searchLog;
+ U32 nbCandidates = nbCompares;
+ U32 previousCandidate = 0;
+
+ DEBUGLOG(7, "ZSTD_DUBT_findBestMatch (%u) ", current);
+ assert(ip <= iend-8); /* required for h calculation */
+
+ /* reach end of unsorted candidates list */
+ while ( (matchIndex > unsortLimit)
+ && (*unsortedMark == ZSTD_DUBT_UNSORTED_MARK)
+ && (nbCandidates > 1) ) {
+ DEBUGLOG(8, "ZSTD_DUBT_findBestMatch: candidate %u is unsorted",
+ matchIndex);
+ *unsortedMark = previousCandidate; /* the unsortedMark becomes a reversed chain, to walk back up to the original position */
+ previousCandidate = matchIndex;
+ matchIndex = *nextCandidate;
+ nextCandidate = bt + 2*(matchIndex&btMask);
+ unsortedMark = bt + 2*(matchIndex&btMask) + 1;
+ nbCandidates --;
+ }
+
+ /* nullify last candidate if it's still unsorted
+ * simplification, detrimental to compression ratio, beneficial for speed */
+ if ( (matchIndex > unsortLimit)
+ && (*unsortedMark==ZSTD_DUBT_UNSORTED_MARK) ) {
+ DEBUGLOG(7, "ZSTD_DUBT_findBestMatch: nullify last unsorted candidate %u",
+ matchIndex);
+ *nextCandidate = *unsortedMark = 0;
+ }
+
+ /* batch sort stacked candidates */
+ matchIndex = previousCandidate;
+ while (matchIndex) { /* will end on matchIndex == 0 */
+ U32* const nextCandidateIdxPtr = bt + 2*(matchIndex&btMask) + 1;
+ U32 const nextCandidateIdx = *nextCandidateIdxPtr;
+ ZSTD_insertDUBT1(ms, matchIndex, iend,
+ nbCandidates, unsortLimit, dictMode);
+ matchIndex = nextCandidateIdx;
+ nbCandidates++;
+ }
+
+ /* find longest match */
+ { size_t commonLengthSmaller = 0, commonLengthLarger = 0;
+ const BYTE* const dictBase = ms->window.dictBase;
+ const U32 dictLimit = ms->window.dictLimit;
+ const BYTE* const dictEnd = dictBase + dictLimit;
+ const BYTE* const prefixStart = base + dictLimit;
+ U32* smallerPtr = bt + 2*(current&btMask);
+ U32* largerPtr = bt + 2*(current&btMask) + 1;
+ U32 matchEndIdx = current + 8 + 1;
+ U32 dummy32; /* to be nullified at the end */
+ size_t bestLength = 0;
+
+ matchIndex = hashTable[h];
+ hashTable[h] = current; /* Update Hash Table */
+
+ while (nbCompares-- && (matchIndex > windowLow)) {
+ U32* const nextPtr = bt + 2*(matchIndex & btMask);
+ size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger); /* guaranteed minimum nb of common bytes */
+ const BYTE* match;
+
+ if ((dictMode != ZSTD_extDict) || (matchIndex+matchLength >= dictLimit)) {
+ match = base + matchIndex;
+ matchLength += ZSTD_count(ip+matchLength, match+matchLength, iend);
+ } else {
+ match = dictBase + matchIndex;
+ matchLength += ZSTD_count_2segments(ip+matchLength, match+matchLength, iend, dictEnd, prefixStart);
+ if (matchIndex+matchLength >= dictLimit)
+ match = base + matchIndex; /* to prepare for next usage of match[matchLength] */
+ }
+
+ if (matchLength > bestLength) {
+ if (matchLength > matchEndIdx - matchIndex)
+ matchEndIdx = matchIndex + (U32)matchLength;
+ if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(current-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) )
+ bestLength = matchLength, *offsetPtr = ZSTD_REP_MOVE + current - matchIndex;
+ if (ip+matchLength == iend) { /* equal : no way to know if inf or sup */
+ if (dictMode == ZSTD_dictMatchState) {
+ nbCompares = 0; /* in addition to avoiding checking any
+ * further in this loop, make sure we
+ * skip checking in the dictionary. */
+ }
+ break; /* drop, to guarantee consistency (miss a little bit of compression) */
+ }
+ }
+
+ if (match[matchLength] < ip[matchLength]) {
+ /* match is smaller than current */
+ *smallerPtr = matchIndex; /* update smaller idx */
+ commonLengthSmaller = matchLength; /* all smaller will now have at least this guaranteed common length */
+ if (matchIndex <= btLow) { smallerPtr=&dummy32; break; } /* beyond tree size, stop the search */
+ smallerPtr = nextPtr+1; /* new candidate => larger than match, which was smaller than current */
+ matchIndex = nextPtr[1]; /* new matchIndex larger than previous (closer to current) */
+ } else {
+ /* match is larger than current */
+ *largerPtr = matchIndex;
+ commonLengthLarger = matchLength;
+ if (matchIndex <= btLow) { largerPtr=&dummy32; break; } /* beyond tree size, stop the search */
+ largerPtr = nextPtr;
+ matchIndex = nextPtr[0];
+ } }
+
+ *smallerPtr = *largerPtr = 0;
+
+ if (dictMode == ZSTD_dictMatchState && nbCompares) {
+ bestLength = ZSTD_DUBT_findBetterDictMatch(
+ ms, ip, iend,
+ offsetPtr, bestLength, nbCompares,
+ mls, dictMode);
+ }
+
+ assert(matchEndIdx > current+8); /* ensure nextToUpdate is increased */
+ ms->nextToUpdate = matchEndIdx - 8; /* skip repetitive patterns */
+ if (bestLength >= MINMATCH) {
+ U32 const mIndex = current - ((U32)*offsetPtr - ZSTD_REP_MOVE); (void)mIndex;
+ DEBUGLOG(8, "ZSTD_DUBT_findBestMatch(%u) : found match of length %u and offsetCode %u (pos %u)",
+ current, (U32)bestLength, (U32)*offsetPtr, mIndex);
+ }
+ return bestLength;
+ }
+}
+
+
+/** ZSTD_BtFindBestMatch() : Tree updater, providing best match */
+FORCE_INLINE_TEMPLATE size_t
+ZSTD_BtFindBestMatch( ZSTD_matchState_t* ms,
+ const BYTE* const ip, const BYTE* const iLimit,
+ size_t* offsetPtr,
+ const U32 mls /* template */,
+ const ZSTD_dictMode_e dictMode)
+{
+ DEBUGLOG(7, "ZSTD_BtFindBestMatch");
+ if (ip < ms->window.base + ms->nextToUpdate) return 0; /* skipped area */
+ ZSTD_updateDUBT(ms, ip, iLimit, mls);
+ return ZSTD_DUBT_findBestMatch(ms, ip, iLimit, offsetPtr, mls, dictMode);
+}
+
+
+static size_t
+ZSTD_BtFindBestMatch_selectMLS ( ZSTD_matchState_t* ms,
+ const BYTE* ip, const BYTE* const iLimit,
+ size_t* offsetPtr)
+{
+ switch(ms->cParams.minMatch)
+ {
+ default : /* includes case 3 */
+ case 4 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 4, ZSTD_noDict);
+ case 5 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 5, ZSTD_noDict);
+ case 7 :
+ case 6 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 6, ZSTD_noDict);
+ }
+}
+
+
+static size_t ZSTD_BtFindBestMatch_dictMatchState_selectMLS (
+ ZSTD_matchState_t* ms,
+ const BYTE* ip, const BYTE* const iLimit,
+ size_t* offsetPtr)
+{
+ switch(ms->cParams.minMatch)
+ {
+ default : /* includes case 3 */
+ case 4 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 4, ZSTD_dictMatchState);
+ case 5 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 5, ZSTD_dictMatchState);
+ case 7 :
+ case 6 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 6, ZSTD_dictMatchState);
+ }
+}
+
+
+static size_t ZSTD_BtFindBestMatch_extDict_selectMLS (
+ ZSTD_matchState_t* ms,
+ const BYTE* ip, const BYTE* const iLimit,
+ size_t* offsetPtr)
+{
+ switch(ms->cParams.minMatch)
+ {
+ default : /* includes case 3 */
+ case 4 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 4, ZSTD_extDict);
+ case 5 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 5, ZSTD_extDict);
+ case 7 :
+ case 6 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 6, ZSTD_extDict);
+ }
+}
+
+
+
+/* *********************************
+* Hash Chain
+***********************************/
+#define NEXT_IN_CHAIN(d, mask) chainTable[(d) & (mask)]
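+/* For illustration: with chainLog==16 (mask 0xFFFF), position 0x12345 maps to chainTable[0x2345];
+ * the chain table acts as a circular buffer over the most recent (1<<chainLog) positions. */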
+
+/* Update chains up to ip (excluded)
+ Assumption : always within prefix (i.e. not within extDict) */
+static U32 ZSTD_insertAndFindFirstIndex_internal(
+ ZSTD_matchState_t* ms,
+ const ZSTD_compressionParameters* const cParams,
+ const BYTE* ip, U32 const mls)
+{
+ U32* const hashTable = ms->hashTable;
+ const U32 hashLog = cParams->hashLog;
+ U32* const chainTable = ms->chainTable;
+ const U32 chainMask = (1 << cParams->chainLog) - 1;
+ const BYTE* const base = ms->window.base;
+ const U32 target = (U32)(ip - base);
+ U32 idx = ms->nextToUpdate;
+
+ while(idx < target) { /* catch up */
+ size_t const h = ZSTD_hashPtr(base+idx, hashLog, mls);
+ NEXT_IN_CHAIN(idx, chainMask) = hashTable[h];
+ hashTable[h] = idx;
+ idx++;
+ }
+
+ ms->nextToUpdate = target;
+ return hashTable[ZSTD_hashPtr(ip, hashLog, mls)];
+}
+
+U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip) {
+ const ZSTD_compressionParameters* const cParams = &ms->cParams;
+ return ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, ms->cParams.minMatch);
+}
+
+
+/* inlining is important to hardwire a hot branch (template emulation) */
+FORCE_INLINE_TEMPLATE
+size_t ZSTD_HcFindBestMatch_generic (
+ ZSTD_matchState_t* ms,
+ const BYTE* const ip, const BYTE* const iLimit,
+ size_t* offsetPtr,
+ const U32 mls, const ZSTD_dictMode_e dictMode)
+{
+ const ZSTD_compressionParameters* const cParams = &ms->cParams;
+ U32* const chainTable = ms->chainTable;
+ const U32 chainSize = (1 << cParams->chainLog);
+ const U32 chainMask = chainSize-1;
+ const BYTE* const base = ms->window.base;
+ const BYTE* const dictBase = ms->window.dictBase;
+ const U32 dictLimit = ms->window.dictLimit;
+ const BYTE* const prefixStart = base + dictLimit;
+ const BYTE* const dictEnd = dictBase + dictLimit;
+ const U32 current = (U32)(ip-base);
+ const U32 maxDistance = 1U << cParams->windowLog;
+ const U32 lowestValid = ms->window.lowLimit;
+ const U32 withinMaxDistance = (current - lowestValid > maxDistance) ? current - maxDistance : lowestValid;
+ const U32 isDictionary = (ms->loadedDictEnd != 0);
+ const U32 lowLimit = isDictionary ? lowestValid : withinMaxDistance;
+ const U32 minChain = current > chainSize ? current - chainSize : 0;
+ U32 nbAttempts = 1U << cParams->searchLog;
+ size_t ml=4-1;
+
+ /* HC4 match finder */
+ U32 matchIndex = ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, mls);
+
+ for ( ; (matchIndex>lowLimit) & (nbAttempts>0) ; nbAttempts--) {
+ size_t currentMl=0;
+ if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) {
+ const BYTE* const match = base + matchIndex;
+ assert(matchIndex >= dictLimit); /* ensures this is true if dictMode != ZSTD_extDict */
+ if (match[ml] == ip[ml]) /* potentially better */
+ currentMl = ZSTD_count(ip, match, iLimit);
+ } else {
+ const BYTE* const match = dictBase + matchIndex;
+ assert(match+4 <= dictEnd);
+ if (MEM_read32(match) == MEM_read32(ip)) /* assumption : matchIndex <= dictLimit-4 (by table construction) */
+ currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, dictEnd, prefixStart) + 4;
+ }
+
+ /* save best solution */
+ if (currentMl > ml) {
+ ml = currentMl;
+ *offsetPtr = current - matchIndex + ZSTD_REP_MOVE;
+ if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
+ }
+
+ if (matchIndex <= minChain) break;
+ matchIndex = NEXT_IN_CHAIN(matchIndex, chainMask);
+ }
+
+ if (dictMode == ZSTD_dictMatchState) {
+ const ZSTD_matchState_t* const dms = ms->dictMatchState;
+ const U32* const dmsChainTable = dms->chainTable;
+ const U32 dmsChainSize = (1 << dms->cParams.chainLog);
+ const U32 dmsChainMask = dmsChainSize - 1;
+ const U32 dmsLowestIndex = dms->window.dictLimit;
+ const BYTE* const dmsBase = dms->window.base;
+ const BYTE* const dmsEnd = dms->window.nextSrc;
+ const U32 dmsSize = (U32)(dmsEnd - dmsBase);
+ const U32 dmsIndexDelta = dictLimit - dmsSize;
+ const U32 dmsMinChain = dmsSize > dmsChainSize ? dmsSize - dmsChainSize : 0;
+
+ matchIndex = dms->hashTable[ZSTD_hashPtr(ip, dms->cParams.hashLog, mls)];
+
+ for ( ; (matchIndex>dmsLowestIndex) & (nbAttempts>0) ; nbAttempts--) {
+ size_t currentMl=0;
+ const BYTE* const match = dmsBase + matchIndex;
+ assert(match+4 <= dmsEnd);
+ if (MEM_read32(match) == MEM_read32(ip)) /* assumption : matchIndex <= dictLimit-4 (by table construction) */
+ currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, dmsEnd, prefixStart) + 4;
+
+ /* save best solution */
+ if (currentMl > ml) {
+ ml = currentMl;
+ *offsetPtr = current - (matchIndex + dmsIndexDelta) + ZSTD_REP_MOVE;
+ if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
+ }
+
+ if (matchIndex <= dmsMinChain) break;
+ matchIndex = dmsChainTable[matchIndex & dmsChainMask];
+ }
+ }
+
+ return ml;
+}
+
+
+FORCE_INLINE_TEMPLATE size_t ZSTD_HcFindBestMatch_selectMLS (
+ ZSTD_matchState_t* ms,
+ const BYTE* ip, const BYTE* const iLimit,
+ size_t* offsetPtr)
+{
+ switch(ms->cParams.minMatch)
+ {
+ default : /* includes case 3 */
+ case 4 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 4, ZSTD_noDict);
+ case 5 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 5, ZSTD_noDict);
+ case 7 :
+ case 6 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 6, ZSTD_noDict);
+ }
+}
+
+
+static size_t ZSTD_HcFindBestMatch_dictMatchState_selectMLS (
+ ZSTD_matchState_t* ms,
+ const BYTE* ip, const BYTE* const iLimit,
+ size_t* offsetPtr)
+{
+ switch(ms->cParams.minMatch)
+ {
+ default : /* includes case 3 */
+ case 4 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 4, ZSTD_dictMatchState);
+ case 5 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 5, ZSTD_dictMatchState);
+ case 7 :
+ case 6 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 6, ZSTD_dictMatchState);
+ }
+}
+
+
+FORCE_INLINE_TEMPLATE size_t ZSTD_HcFindBestMatch_extDict_selectMLS (
+ ZSTD_matchState_t* ms,
+ const BYTE* ip, const BYTE* const iLimit,
+ size_t* offsetPtr)
+{
+ switch(ms->cParams.minMatch)
+ {
+ default : /* includes case 3 */
+ case 4 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 4, ZSTD_extDict);
+ case 5 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 5, ZSTD_extDict);
+ case 7 :
+ case 6 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 6, ZSTD_extDict);
+ }
+}
+
+
+/* *******************************
+* Common parser - lazy strategy
+*********************************/
+typedef enum { search_hashChain, search_binaryTree } searchMethod_e;
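+
+/* Note: the depth parameter below selects the parser aggressiveness: depth 0 is used by the
+ * greedy block compressors, 1 by lazy, and 2 by lazy2/btlazy2 (see the ZSTD_compressBlock_*
+ * wrappers further down in this file). */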
+
+FORCE_INLINE_TEMPLATE size_t
+ZSTD_compressBlock_lazy_generic(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore,
+ U32 rep[ZSTD_REP_NUM],
+ const void* src, size_t srcSize,
+ const searchMethod_e searchMethod, const U32 depth,
+ ZSTD_dictMode_e const dictMode)
+{
+ const BYTE* const istart = (const BYTE*)src;
+ const BYTE* ip = istart;
+ const BYTE* anchor = istart;
+ const BYTE* const iend = istart + srcSize;
+ const BYTE* const ilimit = iend - 8;
+ const BYTE* const base = ms->window.base;
+ const U32 prefixLowestIndex = ms->window.dictLimit;
+ const BYTE* const prefixLowest = base + prefixLowestIndex;
+
+ typedef size_t (*searchMax_f)(
+ ZSTD_matchState_t* ms,
+ const BYTE* ip, const BYTE* iLimit, size_t* offsetPtr);
+ searchMax_f const searchMax = dictMode == ZSTD_dictMatchState ?
+ (searchMethod==search_binaryTree ? ZSTD_BtFindBestMatch_dictMatchState_selectMLS
+ : ZSTD_HcFindBestMatch_dictMatchState_selectMLS) :
+ (searchMethod==search_binaryTree ? ZSTD_BtFindBestMatch_selectMLS
+ : ZSTD_HcFindBestMatch_selectMLS);
+ U32 offset_1 = rep[0], offset_2 = rep[1], savedOffset=0;
+
+ const ZSTD_matchState_t* const dms = ms->dictMatchState;
+ const U32 dictLowestIndex = dictMode == ZSTD_dictMatchState ?
+ dms->window.dictLimit : 0;
+ const BYTE* const dictBase = dictMode == ZSTD_dictMatchState ?
+ dms->window.base : NULL;
+ const BYTE* const dictLowest = dictMode == ZSTD_dictMatchState ?
+ dictBase + dictLowestIndex : NULL;
+ const BYTE* const dictEnd = dictMode == ZSTD_dictMatchState ?
+ dms->window.nextSrc : NULL;
+ const U32 dictIndexDelta = dictMode == ZSTD_dictMatchState ?
+ prefixLowestIndex - (U32)(dictEnd - dictBase) :
+ 0;
+ const U32 dictAndPrefixLength = (U32)((ip - prefixLowest) + (dictEnd - dictLowest));
+
+ DEBUGLOG(5, "ZSTD_compressBlock_lazy_generic (dictMode=%u)", (U32)dictMode);
+
+ /* init */
+ ip += (dictAndPrefixLength == 0);
+ if (dictMode == ZSTD_noDict) {
+ U32 const current = (U32)(ip - base);
+ U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, current, ms->cParams.windowLog);
+ U32 const maxRep = current - windowLow;
+ if (offset_2 > maxRep) savedOffset = offset_2, offset_2 = 0;
+ if (offset_1 > maxRep) savedOffset = offset_1, offset_1 = 0;
+ }
+ if (dictMode == ZSTD_dictMatchState) {
+ /* dictMatchState repCode checks don't currently handle repCode == 0
+ * disabling. */
+ assert(offset_1 <= dictAndPrefixLength);
+ assert(offset_2 <= dictAndPrefixLength);
+ }
+
+ /* Match Loop */
+#if defined(__GNUC__) && defined(__x86_64__)
+ /* I've measured a random 5% speed loss on levels 5 & 6 (greedy) when the
+ * code alignment is perturbed. To fix the instability, align the loop on 32 bytes.
+ */
+ __asm__(".p2align 5");
+#endif
+ while (ip < ilimit) {
+ size_t matchLength=0;
+ size_t offset=0;
+ const BYTE* start=ip+1;
+
+ /* check repCode */
+ if (dictMode == ZSTD_dictMatchState) {
+ const U32 repIndex = (U32)(ip - base) + 1 - offset_1;
+ const BYTE* repMatch = (dictMode == ZSTD_dictMatchState
+ && repIndex < prefixLowestIndex) ?
+ dictBase + (repIndex - dictIndexDelta) :
+ base + repIndex;
+ if (((U32)((prefixLowestIndex-1) - repIndex) >= 3 /* intentional underflow */)
+ && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) {
+ const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend;
+ matchLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4;
+ if (depth==0) goto _storeSequence;
+ }
+ }
+ if ( dictMode == ZSTD_noDict
+ && ((offset_1 > 0) & (MEM_read32(ip+1-offset_1) == MEM_read32(ip+1)))) {
+ matchLength = ZSTD_count(ip+1+4, ip+1+4-offset_1, iend) + 4;
+ if (depth==0) goto _storeSequence;
+ }
+
+ /* first search (depth 0) */
+ { size_t offsetFound = 999999999;
+ size_t const ml2 = searchMax(ms, ip, iend, &offsetFound);
+ if (ml2 > matchLength)
+ matchLength = ml2, start = ip, offset=offsetFound;
+ }
+
+ if (matchLength < 4) {
+ ip += ((ip-anchor) >> kSearchStrength) + 1; /* jump faster over incompressible sections */
+ continue;
+ }
+
+ /* let's try to find a better solution */
+ if (depth>=1)
+ while (ip<ilimit) {
+ ip ++;
+ if ( (dictMode == ZSTD_noDict)
+ && (offset) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) {
+ size_t const mlRep = ZSTD_count(ip+4, ip+4-offset_1, iend) + 4;
+ int const gain2 = (int)(mlRep * 3);
+ int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offset+1) + 1);
+ if ((mlRep >= 4) && (gain2 > gain1))
+ matchLength = mlRep, offset = 0, start = ip;
+ }
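+ /* For illustration (hypothetical values) of the gain1/gain2 comparison used here:
+ * matchLength==6 with offset code 1000 gives gain1 == 6*3 - ZSTD_highbit32(1001) + 1 == 10,
+ * while a 5-byte repeat match gives gain2 == 15, so the cheaper-to-encode repeat wins
+ * despite being shorter. */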
+ if (dictMode == ZSTD_dictMatchState) {
+ const U32 repIndex = (U32)(ip - base) - offset_1;
+ const BYTE* repMatch = repIndex < prefixLowestIndex ?
+ dictBase + (repIndex - dictIndexDelta) :
+ base + repIndex;
+ if (((U32)((prefixLowestIndex-1) - repIndex) >= 3 /* intentional underflow */)
+ && (MEM_read32(repMatch) == MEM_read32(ip)) ) {
+ const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend;
+ size_t const mlRep = ZSTD_count_2segments(ip+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4;
+ int const gain2 = (int)(mlRep * 3);
+ int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offset+1) + 1);
+ if ((mlRep >= 4) && (gain2 > gain1))
+ matchLength = mlRep, offset = 0, start = ip;
+ }
+ }
+ { size_t offset2=999999999;
+ size_t const ml2 = searchMax(ms, ip, iend, &offset2);
+ int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)offset2+1)); /* raw approx */
+ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 4);
+ if ((ml2 >= 4) && (gain2 > gain1)) {
+ matchLength = ml2, offset = offset2, start = ip;
+ continue; /* search a better one */
+ } }
+
+ /* let's find an even better one */
+ if ((depth==2) && (ip<ilimit)) {
+ ip ++;
+ if ( (dictMode == ZSTD_noDict)
+ && (offset) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) {
+ size_t const mlRep = ZSTD_count(ip+4, ip+4-offset_1, iend) + 4;
+ int const gain2 = (int)(mlRep * 4);
+ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 1);
+ if ((mlRep >= 4) && (gain2 > gain1))
+ matchLength = mlRep, offset = 0, start = ip;
+ }
+ if (dictMode == ZSTD_dictMatchState) {
+ const U32 repIndex = (U32)(ip - base) - offset_1;
+ const BYTE* repMatch = repIndex < prefixLowestIndex ?
+ dictBase + (repIndex - dictIndexDelta) :
+ base + repIndex;
+ if (((U32)((prefixLowestIndex-1) - repIndex) >= 3 /* intentional underflow */)
+ && (MEM_read32(repMatch) == MEM_read32(ip)) ) {
+ const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend;
+ size_t const mlRep = ZSTD_count_2segments(ip+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4;
+ int const gain2 = (int)(mlRep * 4);
+ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 1);
+ if ((mlRep >= 4) && (gain2 > gain1))
+ matchLength = mlRep, offset = 0, start = ip;
+ }
+ }
+ { size_t offset2=999999999;
+ size_t const ml2 = searchMax(ms, ip, iend, &offset2);
+ int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)offset2+1)); /* raw approx */
+ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 7);
+ if ((ml2 >= 4) && (gain2 > gain1)) {
+ matchLength = ml2, offset = offset2, start = ip;
+ continue;
+ } } }
+ break; /* nothing found : store previous solution */
+ }
+
+ /* NOTE:
+ * start[-offset+ZSTD_REP_MOVE-1] is undefined behavior.
+ * (-offset+ZSTD_REP_MOVE-1) is unsigned; adding it to start overflows
+ * the pointer, which is undefined behavior.
+ */
+ /* catch up */
+ if (offset) {
+ if (dictMode == ZSTD_noDict) {
+ while ( ((start > anchor) & (start - (offset-ZSTD_REP_MOVE) > prefixLowest))
+ && (start[-1] == (start-(offset-ZSTD_REP_MOVE))[-1]) ) /* only search for offset within prefix */
+ { start--; matchLength++; }
+ }
+ if (dictMode == ZSTD_dictMatchState) {
+ U32 const matchIndex = (U32)((start-base) - (offset - ZSTD_REP_MOVE));
+ const BYTE* match = (matchIndex < prefixLowestIndex) ? dictBase + matchIndex - dictIndexDelta : base + matchIndex;
+ const BYTE* const mStart = (matchIndex < prefixLowestIndex) ? dictLowest : prefixLowest;
+ while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; } /* catch up */
+ }
+ offset_2 = offset_1; offset_1 = (U32)(offset - ZSTD_REP_MOVE);
+ }
+ /* store sequence */
+_storeSequence:
+ { size_t const litLength = start - anchor;
+ ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offset, matchLength-MINMATCH);
+ anchor = ip = start + matchLength;
+ }
+
+ /* check immediate repcode */
+ if (dictMode == ZSTD_dictMatchState) {
+ while (ip <= ilimit) {
+ U32 const current2 = (U32)(ip-base);
+ U32 const repIndex = current2 - offset_2;
+ const BYTE* repMatch = dictMode == ZSTD_dictMatchState
+ && repIndex < prefixLowestIndex ?
+ dictBase - dictIndexDelta + repIndex :
+ base + repIndex;
+ if ( ((U32)((prefixLowestIndex-1) - (U32)repIndex) >= 3 /* intentional overflow */)
+ && (MEM_read32(repMatch) == MEM_read32(ip)) ) {
+ const BYTE* const repEnd2 = repIndex < prefixLowestIndex ? dictEnd : iend;
+ matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd2, prefixLowest) + 4;
+ offset = offset_2; offset_2 = offset_1; offset_1 = (U32)offset; /* swap offset_2 <=> offset_1 */
+ ZSTD_storeSeq(seqStore, 0, anchor, iend, 0, matchLength-MINMATCH);
+ ip += matchLength;
+ anchor = ip;
+ continue;
+ }
+ break;
+ }
+ }
+
+ if (dictMode == ZSTD_noDict) {
+ while ( ((ip <= ilimit) & (offset_2>0))
+ && (MEM_read32(ip) == MEM_read32(ip - offset_2)) ) {
+ /* store sequence */
+ matchLength = ZSTD_count(ip+4, ip+4-offset_2, iend) + 4;
+ offset = offset_2; offset_2 = offset_1; offset_1 = (U32)offset; /* swap repcodes */
+ ZSTD_storeSeq(seqStore, 0, anchor, iend, 0, matchLength-MINMATCH);
+ ip += matchLength;
+ anchor = ip;
+ continue; /* faster when present ... (?) */
+ } } }
+
+ /* Save reps for next block */
+ rep[0] = offset_1 ? offset_1 : savedOffset;
+ rep[1] = offset_2 ? offset_2 : savedOffset;
+
+ /* Return the last literals size */
+ return (size_t)(iend - anchor);
+}
+
+
+size_t ZSTD_compressBlock_btlazy2(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize)
+{
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2, ZSTD_noDict);
+}
+
+size_t ZSTD_compressBlock_lazy2(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize)
+{
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_noDict);
+}
+
+size_t ZSTD_compressBlock_lazy(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize)
+{
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_noDict);
+}
+
+size_t ZSTD_compressBlock_greedy(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize)
+{
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_noDict);
+}
+
+size_t ZSTD_compressBlock_btlazy2_dictMatchState(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize)
+{
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2, ZSTD_dictMatchState);
+}
+
+size_t ZSTD_compressBlock_lazy2_dictMatchState(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize)
+{
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_dictMatchState);
+}
+
+size_t ZSTD_compressBlock_lazy_dictMatchState(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize)
+{
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_dictMatchState);
+}
+
+size_t ZSTD_compressBlock_greedy_dictMatchState(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize)
+{
+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_dictMatchState);
+}
+
+
+FORCE_INLINE_TEMPLATE
+size_t ZSTD_compressBlock_lazy_extDict_generic(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore,
+ U32 rep[ZSTD_REP_NUM],
+ const void* src, size_t srcSize,
+ const searchMethod_e searchMethod, const U32 depth)
+{
+ const BYTE* const istart = (const BYTE*)src;
+ const BYTE* ip = istart;
+ const BYTE* anchor = istart;
+ const BYTE* const iend = istart + srcSize;
+ const BYTE* const ilimit = iend - 8;
+ const BYTE* const base = ms->window.base;
+ const U32 dictLimit = ms->window.dictLimit;
+ const BYTE* const prefixStart = base + dictLimit;
+ const BYTE* const dictBase = ms->window.dictBase;
+ const BYTE* const dictEnd = dictBase + dictLimit;
+ const BYTE* const dictStart = dictBase + ms->window.lowLimit;
+ const U32 windowLog = ms->cParams.windowLog;
+
+ typedef size_t (*searchMax_f)(
+ ZSTD_matchState_t* ms,
+ const BYTE* ip, const BYTE* iLimit, size_t* offsetPtr);
+ searchMax_f searchMax = searchMethod==search_binaryTree ? ZSTD_BtFindBestMatch_extDict_selectMLS : ZSTD_HcFindBestMatch_extDict_selectMLS;
+
+ U32 offset_1 = rep[0], offset_2 = rep[1];
+
+ DEBUGLOG(5, "ZSTD_compressBlock_lazy_extDict_generic");
+
+ /* init */
+ ip += (ip == prefixStart);
+
+ /* Match Loop */
+#if defined(__GNUC__) && defined(__x86_64__)
+ /* I've measured a random 5% speed loss on levels 5 & 6 (greedy) when the
+ * code alignment is perturbed. To fix the instability, align the loop on 32 bytes.
+ */
+ __asm__(".p2align 5");
+#endif
+ while (ip < ilimit) {
+ size_t matchLength=0;
+ size_t offset=0;
+ const BYTE* start=ip+1;
+ U32 current = (U32)(ip-base);
+
+ /* check repCode */
+ { const U32 windowLow = ZSTD_getLowestMatchIndex(ms, current+1, windowLog);
+ const U32 repIndex = (U32)(current+1 - offset_1);
+ const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
+ const BYTE* const repMatch = repBase + repIndex;
+ if (((U32)((dictLimit-1) - repIndex) >= 3) & (repIndex > windowLow)) /* intentional overflow */
+ if (MEM_read32(ip+1) == MEM_read32(repMatch)) {
+ /* repcode detected, we should take it */
+ const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
+ matchLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repEnd, prefixStart) + 4;
+ if (depth==0) goto _storeSequence;
+ } }
+
+ /* first search (depth 0) */
+ { size_t offsetFound = 999999999;
+ size_t const ml2 = searchMax(ms, ip, iend, &offsetFound);
+ if (ml2 > matchLength)
+ matchLength = ml2, start = ip, offset=offsetFound;
+ }
+
+ if (matchLength < 4) {
+ ip += ((ip-anchor) >> kSearchStrength) + 1; /* jump faster over incompressible sections */
+ continue;
+ }
+
+ /* let's try to find a better solution */
+ if (depth>=1)
+ while (ip<ilimit) {
+ ip ++;
+ current++;
+ /* check repCode */
+ if (offset) {
+ const U32 windowLow = ZSTD_getLowestMatchIndex(ms, current, windowLog);
+ const U32 repIndex = (U32)(current - offset_1);
+ const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
+ const BYTE* const repMatch = repBase + repIndex;
+ if (((U32)((dictLimit-1) - repIndex) >= 3) & (repIndex > windowLow)) /* intentional overflow */
+ if (MEM_read32(ip) == MEM_read32(repMatch)) {
+ /* repcode detected */
+ const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
+ size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4;
+ int const gain2 = (int)(repLength * 3);
+ int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offset+1) + 1);
+ if ((repLength >= 4) && (gain2 > gain1))
+ matchLength = repLength, offset = 0, start = ip;
+ } }
+
+ /* search match, depth 1 */
+ { size_t offset2=999999999;
+ size_t const ml2 = searchMax(ms, ip, iend, &offset2);
+ int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)offset2+1)); /* raw approx */
+ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 4);
+ if ((ml2 >= 4) && (gain2 > gain1)) {
+ matchLength = ml2, offset = offset2, start = ip;
+ continue; /* search a better one */
+ } }
+
+ /* let's find an even better one */
+ if ((depth==2) && (ip<ilimit)) {
+ ip ++;
+ current++;
+ /* check repCode */
+ if (offset) {
+ const U32 windowLow = ZSTD_getLowestMatchIndex(ms, current, windowLog);
+ const U32 repIndex = (U32)(current - offset_1);
+ const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
+ const BYTE* const repMatch = repBase + repIndex;
+ if (((U32)((dictLimit-1) - repIndex) >= 3) & (repIndex > windowLow)) /* intentional overflow */
+ if (MEM_read32(ip) == MEM_read32(repMatch)) {
+ /* repcode detected */
+ const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
+ size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4;
+ int const gain2 = (int)(repLength * 4);
+ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 1);
+ if ((repLength >= 4) && (gain2 > gain1))
+ matchLength = repLength, offset = 0, start = ip;
+ } }
+
+ /* search match, depth 2 */
+ { size_t offset2=999999999;
+ size_t const ml2 = searchMax(ms, ip, iend, &offset2);
+ int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)offset2+1)); /* raw approx */
+ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 7);
+ if ((ml2 >= 4) && (gain2 > gain1)) {
+ matchLength = ml2, offset = offset2, start = ip;
+ continue;
+ } } }
+ break; /* nothing found : store previous solution */
+ }
+
+ /* catch up */
+ if (offset) {
+ U32 const matchIndex = (U32)((start-base) - (offset - ZSTD_REP_MOVE));
+ const BYTE* match = (matchIndex < dictLimit) ? dictBase + matchIndex : base + matchIndex;
+ const BYTE* const mStart = (matchIndex < dictLimit) ? dictStart : prefixStart;
+ while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; } /* catch up */
+ offset_2 = offset_1; offset_1 = (U32)(offset - ZSTD_REP_MOVE);
+ }
+
+ /* store sequence */
+_storeSequence:
+ { size_t const litLength = start - anchor;
+ ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offset, matchLength-MINMATCH);
+ anchor = ip = start + matchLength;
+ }
+
+ /* check immediate repcode */
+ while (ip <= ilimit) {
+ const U32 repCurrent = (U32)(ip-base);
+ const U32 windowLow = ZSTD_getLowestMatchIndex(ms, repCurrent, windowLog);
+ const U32 repIndex = repCurrent - offset_2;
+ const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
+ const BYTE* const repMatch = repBase + repIndex;
+ if (((U32)((dictLimit-1) - repIndex) >= 3) & (repIndex > windowLow)) /* intentional overflow */
+ if (MEM_read32(ip) == MEM_read32(repMatch)) {
+ /* repcode detected, we should take it */
+ const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
+ matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4;
+ offset = offset_2; offset_2 = offset_1; offset_1 = (U32)offset; /* swap offset history */
+ ZSTD_storeSeq(seqStore, 0, anchor, iend, 0, matchLength-MINMATCH);
+ ip += matchLength;
+ anchor = ip;
+ continue; /* faster when present ... (?) */
+ }
+ break;
+ } }
+
+ /* Save reps for next block */
+ rep[0] = offset_1;
+ rep[1] = offset_2;
+
+ /* Return the last literals size */
+ return (size_t)(iend - anchor);
+}
+
+
+size_t ZSTD_compressBlock_greedy_extDict(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize)
+{
+ return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0);
+}
+
+size_t ZSTD_compressBlock_lazy_extDict(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize)
+
+{
+ return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1);
+}
+
+size_t ZSTD_compressBlock_lazy2_extDict(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize)
+
+{
+ return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2);
+}
+
+size_t ZSTD_compressBlock_btlazy2_extDict(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize)
+
+{
+ return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2);
+}
+/**** ended inlining compress/zstd_lazy.c ****/
+/**** start inlining compress/zstd_ldm.c ****/
+/*
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+/**** skipping file: zstd_ldm.h ****/
+
+/**** skipping file: ../common/debug.h ****/
+/**** skipping file: zstd_fast.h ****/
+/**** skipping file: zstd_double_fast.h ****/
+
+#define LDM_BUCKET_SIZE_LOG 3
+#define LDM_MIN_MATCH_LENGTH 64
+#define LDM_HASH_RLOG 7
+#define LDM_HASH_CHAR_OFFSET 10
+
+void ZSTD_ldm_adjustParameters(ldmParams_t* params,
+ ZSTD_compressionParameters const* cParams)
+{
+ params->windowLog = cParams->windowLog;
+ ZSTD_STATIC_ASSERT(LDM_BUCKET_SIZE_LOG <= ZSTD_LDM_BUCKETSIZELOG_MAX);
+ DEBUGLOG(4, "ZSTD_ldm_adjustParameters");
+ if (!params->bucketSizeLog) params->bucketSizeLog = LDM_BUCKET_SIZE_LOG;
+ if (!params->minMatchLength) params->minMatchLength = LDM_MIN_MATCH_LENGTH;
+ if (cParams->strategy >= ZSTD_btopt) {
+ /* Get out of the way of the optimal parser */
+ U32 const minMatch = MAX(cParams->targetLength, params->minMatchLength);
+ assert(minMatch >= ZSTD_LDM_MINMATCH_MIN);
+ assert(minMatch <= ZSTD_LDM_MINMATCH_MAX);
+ params->minMatchLength = minMatch;
+ }
+ if (params->hashLog == 0) {
+ params->hashLog = MAX(ZSTD_HASHLOG_MIN, params->windowLog - LDM_HASH_RLOG);
+ assert(params->hashLog <= ZSTD_HASHLOG_MAX);
+ }
+ if (params->hashRateLog == 0) {
+ params->hashRateLog = params->windowLog < params->hashLog
+ ? 0
+ : params->windowLog - params->hashLog;
+ }
+ params->bucketSizeLog = MIN(params->bucketSizeLog, params->hashLog);
+}
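+
+/* Worked example (hypothetical inputs): with windowLog==27 and all ldmParams fields left at 0,
+ * the defaults above give bucketSizeLog==3, minMatchLength==64, hashLog = 27 - 7 = 20 and
+ * hashRateLog = 27 - 20 = 7 (assuming ZSTD_HASHLOG_MIN <= 20). */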
+
+size_t ZSTD_ldm_getTableSize(ldmParams_t params)
+{
+ size_t const ldmHSize = ((size_t)1) << params.hashLog;
+ size_t const ldmBucketSizeLog = MIN(params.bucketSizeLog, params.hashLog);
+ size_t const ldmBucketSize = ((size_t)1) << (params.hashLog - ldmBucketSizeLog);
+ size_t const totalSize = ZSTD_cwksp_alloc_size(ldmBucketSize)
+ + ZSTD_cwksp_alloc_size(ldmHSize * sizeof(ldmEntry_t));
+ return params.enableLdm ? totalSize : 0;
+}
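+
+/* For illustration: with hashLog==20 and bucketSizeLog==3, ldmHSize == (1<<20) entries and
+ * ldmBucketSize == (1<<17), before the ZSTD_cwksp_alloc_size() rounding applied above. */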
+
+size_t ZSTD_ldm_getMaxNbSeq(ldmParams_t params, size_t maxChunkSize)
+{
+ return params.enableLdm ? (maxChunkSize / params.minMatchLength) : 0;
+}
+
+/** ZSTD_ldm_getSmallHash() :
+ * numBits should be <= 32
+ * If numBits==0, returns 0.
+ * @return : the most significant numBits of value. */
+static U32 ZSTD_ldm_getSmallHash(U64 value, U32 numBits)
+{
+ assert(numBits <= 32);
+ return numBits == 0 ? 0 : (U32)(value >> (64 - numBits));
+}
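+
+/* For illustration: ZSTD_ldm_getSmallHash(0xF123456789ABCDEFULL, 8) == 0xF1,
+ * i.e. the top 8 bits of the 64-bit rolling hash. */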
+
+/** ZSTD_ldm_getChecksum() :
+ * numBitsToDiscard should be <= 32
+ * @return : the next most significant 32 bits after numBitsToDiscard */
+static U32 ZSTD_ldm_getChecksum(U64 hash, U32 numBitsToDiscard)
+{
+ assert(numBitsToDiscard <= 32);
+ return (hash >> (64 - 32 - numBitsToDiscard)) & 0xFFFFFFFF;
+}
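+
+/* For illustration: ZSTD_ldm_getChecksum(0xF123456789ABCDEFULL, 8) == 0x23456789,
+ * i.e. the 32 bits immediately below the 8 discarded top bits. */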
+
+/** ZSTD_ldm_getTag() :
+ * Given the hash, returns the most significant numTagBits bits
+ * after (32 + hbits) bits.
+ *
+ * If there are not enough bits remaining, return the last
+ * numTagBits bits. */
+static U32 ZSTD_ldm_getTag(U64 hash, U32 hbits, U32 numTagBits)
+{
+ assert(numTagBits < 32 && hbits <= 32);
+ if (32 - hbits < numTagBits) {
+ return hash & (((U32)1 << numTagBits) - 1);
+ } else {
+ return (hash >> (32 - hbits - numTagBits)) & (((U32)1 << numTagBits) - 1);
+ }
+}
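+
+/* For illustration: with hbits==8 and numTagBits==7, the tag is bits [23:17] of the 64-bit
+ * rolling hash, i.e. the bits immediately below the 32-bit checksum extracted above. */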
+
+/** ZSTD_ldm_getBucket() :
+ * Returns a pointer to the start of the bucket associated with hash. */
+static ldmEntry_t* ZSTD_ldm_getBucket(
+ ldmState_t* ldmState, size_t hash, ldmParams_t const ldmParams)
+{
+ return ldmState->hashTable + (hash << ldmParams.bucketSizeLog);
+}
+
+/** ZSTD_ldm_insertEntry() :
+ * Insert the entry with corresponding hash into the hash table */
+static void ZSTD_ldm_insertEntry(ldmState_t* ldmState,
+ size_t const hash, const ldmEntry_t entry,
+ ldmParams_t const ldmParams)
+{
+ BYTE* const bucketOffsets = ldmState->bucketOffsets;
+ *(ZSTD_ldm_getBucket(ldmState, hash, ldmParams) + bucketOffsets[hash]) = entry;
+ bucketOffsets[hash]++;
+ bucketOffsets[hash] &= ((U32)1 << ldmParams.bucketSizeLog) - 1;
+}
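+
+/* Note: bucketOffsets[hash] above cycles through 0..(1<<bucketSizeLog)-1, so each bucket is
+ * filled round-robin and, once full, the oldest entry in the bucket is overwritten first. */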
+
+/** ZSTD_ldm_makeEntryAndInsertByTag() :
+ *
+ * Gets the small hash, checksum, and tag from the rollingHash.
+ *
+ * If the tag matches (1 << ldmParams.hashRateLog)-1, then
+ * creates an ldmEntry from the offset, and inserts it into the hash table.
+ *
+ * hBits is the length of the small hash, which is the most significant hBits
+ * of rollingHash. The checksum is the next 32 most significant bits, followed
+ * by ldmParams.hashRateLog bits that make up the tag. */
+static void ZSTD_ldm_makeEntryAndInsertByTag(ldmState_t* ldmState,
+ U64 const rollingHash,
+ U32 const hBits,
+ U32 const offset,
+ ldmParams_t const ldmParams)
+{
+ U32 const tag = ZSTD_ldm_getTag(rollingHash, hBits, ldmParams.hashRateLog);
+ U32 const tagMask = ((U32)1 << ldmParams.hashRateLog) - 1;
+ if (tag == tagMask) {
+ U32 const hash = ZSTD_ldm_getSmallHash(rollingHash, hBits);
+ U32 const checksum = ZSTD_ldm_getChecksum(rollingHash, hBits);
+ ldmEntry_t entry;
+ entry.offset = offset;
+ entry.checksum = checksum;
+ ZSTD_ldm_insertEntry(ldmState, hash, entry, ldmParams);
+ }
+}
+
+/** ZSTD_ldm_countBackwardsMatch() :
+ * Returns the number of bytes that match backwards before pIn and pMatch.
+ *
+ * We count only bytes where pMatch >= pBase and pIn >= pAnchor. */
+static size_t ZSTD_ldm_countBackwardsMatch(
+ const BYTE* pIn, const BYTE* pAnchor,
+ const BYTE* pMatch, const BYTE* pBase)
+{
+ size_t matchLength = 0;
+ while (pIn > pAnchor && pMatch > pBase && pIn[-1] == pMatch[-1]) {
+ pIn--;
+ pMatch--;
+ matchLength++;
+ }
+ return matchLength;
+}
+
+/** ZSTD_ldm_fillFastTables() :
+ *
+ * Fills the relevant tables for the ZSTD_fast and ZSTD_dfast strategies.
+ * This is similar to ZSTD_loadDictionaryContent.
+ *
+ * The tables for the other strategies are filled within their
+ * block compressors. */
+static size_t ZSTD_ldm_fillFastTables(ZSTD_matchState_t* ms,
+ void const* end)
+{
+ const BYTE* const iend = (const BYTE*)end;
+
+ switch(ms->cParams.strategy)
+ {
+ case ZSTD_fast:
+ ZSTD_fillHashTable(ms, iend, ZSTD_dtlm_fast);
+ break;
+
+ case ZSTD_dfast:
+ ZSTD_fillDoubleHashTable(ms, iend, ZSTD_dtlm_fast);
+ break;
+
+ case ZSTD_greedy:
+ case ZSTD_lazy:
+ case ZSTD_lazy2:
+ case ZSTD_btlazy2:
+ case ZSTD_btopt:
+ case ZSTD_btultra:
+ case ZSTD_btultra2:
+ break;
+ default:
+ assert(0); /* not possible : not a valid strategy id */
+ }
+
+ return 0;
+}
+
+/** ZSTD_ldm_fillLdmHashTable() :
+ *
+ * Fills hashTable from (lastHashed + 1) to iend (non-inclusive).
+ * lastHash is the rolling hash that corresponds to lastHashed.
+ *
+ * Returns the rolling hash corresponding to position iend-1. */
+static U64 ZSTD_ldm_fillLdmHashTable(ldmState_t* state,
+ U64 lastHash, const BYTE* lastHashed,
+ const BYTE* iend, const BYTE* base,
+ U32 hBits, ldmParams_t const ldmParams)
+{
+ U64 rollingHash = lastHash;
+ const BYTE* cur = lastHashed + 1;
+
+ while (cur < iend) {
+ rollingHash = ZSTD_rollingHash_rotate(rollingHash, cur[-1],
+ cur[ldmParams.minMatchLength-1],
+ state->hashPower);
+ ZSTD_ldm_makeEntryAndInsertByTag(state,
+ rollingHash, hBits,
+ (U32)(cur - base), ldmParams);
+ ++cur;
+ }
+ return rollingHash;
+}
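+
+/* Invariant (for illustration): on each iteration above, after the rotate the rolling hash
+ * covers the minMatchLength bytes starting at cur, matching the entry inserted for offset
+ * (cur - base). */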
+
+void ZSTD_ldm_fillHashTable(
+ ldmState_t* state, const BYTE* ip,
+ const BYTE* iend, ldmParams_t const* params)
+{
+ DEBUGLOG(5, "ZSTD_ldm_fillHashTable");
+ if ((size_t)(iend - ip) >= params->minMatchLength) {
+ U64 startingHash = ZSTD_rollingHash_compute(ip, params->minMatchLength);
+ ZSTD_ldm_fillLdmHashTable(
+ state, startingHash, ip, iend - params->minMatchLength, state->window.base,
+ params->hashLog - params->bucketSizeLog,
+ *params);
+ }
+}
+
+
+/** ZSTD_ldm_limitTableUpdate() :
+ *
+ * Sets ms->nextToUpdate to a position closer to anchor
+ * if it is far away
+ * (after a long match, only update tables a limited amount). */
+static void ZSTD_ldm_limitTableUpdate(ZSTD_matchState_t* ms, const BYTE* anchor)
+{
+ U32 const current = (U32)(anchor - ms->window.base);
+ if (current > ms->nextToUpdate + 1024) {
+ ms->nextToUpdate =
+ current - MIN(512, current - ms->nextToUpdate - 1024);
+ }
+}
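+
+/* Worked example (hypothetical values): with ms->nextToUpdate==0 and anchor at index 10000,
+ * the gap (10000 - 0 - 1024) exceeds 512, so nextToUpdate becomes 10000 - 512 = 9488,
+ * i.e. at most 512 positions behind anchor are left for the next table update. */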
+
+static size_t ZSTD_ldm_generateSequences_internal(
+ ldmState_t* ldmState, rawSeqStore_t* rawSeqStore,
+ ldmParams_t const* params, void const* src, size_t srcSize)
+{
+ /* LDM parameters */
+ int const extDict = ZSTD_window_hasExtDict(ldmState->window);
+ U32 const minMatchLength = params->minMatchLength;
+ U64 const hashPower = ldmState->hashPower;
+ U32 const hBits = params->hashLog - params->bucketSizeLog;
+ U32 const ldmBucketSize = 1U << params->bucketSizeLog;
+ U32 const hashRateLog = params->hashRateLog;
+ U32 const ldmTagMask = (1U << params->hashRateLog) - 1;
+ /* Prefix and extDict parameters */
+ U32 const dictLimit = ldmState->window.dictLimit;
+ U32 const lowestIndex = extDict ? ldmState->window.lowLimit : dictLimit;
+ BYTE const* const base = ldmState->window.base;
+ BYTE const* const dictBase = extDict ? ldmState->window.dictBase : NULL;
+ BYTE const* const dictStart = extDict ? dictBase + lowestIndex : NULL;
+ BYTE const* const dictEnd = extDict ? dictBase + dictLimit : NULL;
+ BYTE const* const lowPrefixPtr = base + dictLimit;
+ /* Input bounds */
+ BYTE const* const istart = (BYTE const*)src;
+ BYTE const* const iend = istart + srcSize;
+ BYTE const* const ilimit = iend - MAX(minMatchLength, HASH_READ_SIZE);
+ /* Input positions */
+ BYTE const* anchor = istart;
+ BYTE const* ip = istart;
+ /* Rolling hash */
+ BYTE const* lastHashed = NULL;
+ U64 rollingHash = 0;
+
+ while (ip <= ilimit) {
+ size_t mLength;
+ U32 const current = (U32)(ip - base);
+ size_t forwardMatchLength = 0, backwardMatchLength = 0;
+ ldmEntry_t* bestEntry = NULL;
+ if (ip != istart) {
+ rollingHash = ZSTD_rollingHash_rotate(rollingHash, lastHashed[0],
+ lastHashed[minMatchLength],
+ hashPower);
+ } else {
+ rollingHash = ZSTD_rollingHash_compute(ip, minMatchLength);
+ }
+ lastHashed = ip;
+
+ /* Do not insert and do not look for a match */
+ if (ZSTD_ldm_getTag(rollingHash, hBits, hashRateLog) != ldmTagMask) {
+ ip++;
+ continue;
+ }
+
+ /* Get the best entry and compute the match lengths */
+ {
+ ldmEntry_t* const bucket =
+ ZSTD_ldm_getBucket(ldmState,
+ ZSTD_ldm_getSmallHash(rollingHash, hBits),
+ *params);
+ ldmEntry_t* cur;
+ size_t bestMatchLength = 0;
+ U32 const checksum = ZSTD_ldm_getChecksum(rollingHash, hBits);
+
+ for (cur = bucket; cur < bucket + ldmBucketSize; ++cur) {
+ size_t curForwardMatchLength, curBackwardMatchLength,
+ curTotalMatchLength;
+ if (cur->checksum != checksum || cur->offset <= lowestIndex) {
+ continue;
+ }
+ if (extDict) {
+ BYTE const* const curMatchBase =
+ cur->offset < dictLimit ? dictBase : base;
+ BYTE const* const pMatch = curMatchBase + cur->offset;
+ BYTE const* const matchEnd =
+ cur->offset < dictLimit ? dictEnd : iend;
+ BYTE const* const lowMatchPtr =
+ cur->offset < dictLimit ? dictStart : lowPrefixPtr;
+
+ curForwardMatchLength = ZSTD_count_2segments(
+ ip, pMatch, iend,
+ matchEnd, lowPrefixPtr);
+ if (curForwardMatchLength < minMatchLength) {
+ continue;
+ }
+ curBackwardMatchLength =
+ ZSTD_ldm_countBackwardsMatch(ip, anchor, pMatch,
+ lowMatchPtr);
+ curTotalMatchLength = curForwardMatchLength +
+ curBackwardMatchLength;
+ } else { /* !extDict */
+ BYTE const* const pMatch = base + cur->offset;
+ curForwardMatchLength = ZSTD_count(ip, pMatch, iend);
+ if (curForwardMatchLength < minMatchLength) {
+ continue;
+ }
+ curBackwardMatchLength =
+ ZSTD_ldm_countBackwardsMatch(ip, anchor, pMatch,
+ lowPrefixPtr);
+ curTotalMatchLength = curForwardMatchLength +
+ curBackwardMatchLength;
+ }
+
+ if (curTotalMatchLength > bestMatchLength) {
+ bestMatchLength = curTotalMatchLength;
+ forwardMatchLength = curForwardMatchLength;
+ backwardMatchLength = curBackwardMatchLength;
+ bestEntry = cur;
+ }
+ }
+ }
+
+ /* No match found -- continue searching */
+ if (bestEntry == NULL) {
+ ZSTD_ldm_makeEntryAndInsertByTag(ldmState, rollingHash,
+ hBits, current,
+ *params);
+ ip++;
+ continue;
+ }
+
+ /* Match found */
+ mLength = forwardMatchLength + backwardMatchLength;
+ ip -= backwardMatchLength;
+
+ {
+ /* Store the sequence:
+ * ip = current - backwardMatchLength
+ * The match is at (bestEntry->offset - backwardMatchLength)
+ */
+ U32 const matchIndex = bestEntry->offset;
+ U32 const offset = current - matchIndex;
+ rawSeq* const seq = rawSeqStore->seq + rawSeqStore->size;
+
+ /* Out of space in the sequence store */
+ if (rawSeqStore->size == rawSeqStore->capacity)
+ return ERROR(dstSize_tooSmall);
+ seq->litLength = (U32)(ip - anchor);
+ seq->matchLength = (U32)mLength;
+ seq->offset = offset;
+ rawSeqStore->size++;
+ }
+
+ /* Insert the current entry into the hash table */
+ ZSTD_ldm_makeEntryAndInsertByTag(ldmState, rollingHash, hBits,
+ (U32)(lastHashed - base),
+ *params);
+
+ assert(ip + backwardMatchLength == lastHashed);
+
+ /* Fill the hash table from lastHashed+1 to ip+mLength */
+ /* Heuristic: don't need to fill the entire table at end of block */
+ if (ip + mLength <= ilimit) {
+ rollingHash = ZSTD_ldm_fillLdmHashTable(
+ ldmState, rollingHash, lastHashed,
+ ip + mLength, base, hBits, *params);
+ lastHashed = ip + mLength - 1;
+ }
+ ip += mLength;
+ anchor = ip;
+ }
+ return iend - anchor;
+}
+
+/*! ZSTD_ldm_reduceTable() :
+ * reduce table indexes by `reducerValue` */
+static void ZSTD_ldm_reduceTable(ldmEntry_t* const table, U32 const size,
+ U32 const reducerValue)
+{
+ U32 u;
+ for (u = 0; u < size; u++) {
+ if (table[u].offset < reducerValue) table[u].offset = 0;
+ else table[u].offset -= reducerValue;
+ }
+}
+
+size_t ZSTD_ldm_generateSequences(
+ ldmState_t* ldmState, rawSeqStore_t* sequences,
+ ldmParams_t const* params, void const* src, size_t srcSize)
+{
+ U32 const maxDist = 1U << params->windowLog;
+ BYTE const* const istart = (BYTE const*)src;
+ BYTE const* const iend = istart + srcSize;
+ size_t const kMaxChunkSize = 1 << 20;
+ size_t const nbChunks = (srcSize / kMaxChunkSize) + ((srcSize % kMaxChunkSize) != 0);
+ size_t chunk;
+ size_t leftoverSize = 0;
+
+ assert(ZSTD_CHUNKSIZE_MAX >= kMaxChunkSize);
+ /* Check that ZSTD_window_update() has been called for this chunk prior
+ * to passing it to this function.
+ */
+ assert(ldmState->window.nextSrc >= (BYTE const*)src + srcSize);
+ /* The input could be very large (in zstdmt), so it must be broken up into
+ * chunks to enforce the maximum distance and handle overflow correction.
+ */
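+ /* e.g. (illustrative) : srcSize == 2.5 MiB gives nbChunks == 3 :
+ * two full 1 MiB (kMaxChunkSize) chunks plus a 0.5 MiB tail. */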
+ assert(sequences->pos <= sequences->size);
+ assert(sequences->size <= sequences->capacity);
+ for (chunk = 0; chunk < nbChunks && sequences->size < sequences->capacity; ++chunk) {
+ BYTE const* const chunkStart = istart + chunk * kMaxChunkSize;
+ size_t const remaining = (size_t)(iend - chunkStart);
+ BYTE const *const chunkEnd =
+ (remaining < kMaxChunkSize) ? iend : chunkStart + kMaxChunkSize;
+ size_t const chunkSize = chunkEnd - chunkStart;
+ size_t newLeftoverSize;
+ size_t const prevSize = sequences->size;
+
+ assert(chunkStart < iend);
+ /* 1. Perform overflow correction if necessary. */
+ if (ZSTD_window_needOverflowCorrection(ldmState->window, chunkEnd)) {
+ U32 const ldmHSize = 1U << params->hashLog;
+ U32 const correction = ZSTD_window_correctOverflow(
+ &ldmState->window, /* cycleLog */ 0, maxDist, chunkStart);
+ ZSTD_ldm_reduceTable(ldmState->hashTable, ldmHSize, correction);
+ /* invalidate dictionaries on overflow correction */
+ ldmState->loadedDictEnd = 0;
+ }
+ /* 2. We enforce the maximum offset allowed.
+ *
+ * kMaxChunkSize should be small enough that we don't lose too much of
+ * the window through early invalidation.
+ * TODO: * Test the chunk size.
+ * * Try invalidation after the sequence generation and test
+ * the offset against maxDist directly.
+ *
+ * NOTE: Because of dictionaries + sequence splitting we MUST make sure
+ * that any offset used is valid at the END of the sequence, since it may
+ * be split into two sequences. This condition holds when using
+ * ZSTD_window_enforceMaxDist(), but if we move to checking offsets
+ * against maxDist directly, we'll have to carefully handle that case.
+ */
+ ZSTD_window_enforceMaxDist(&ldmState->window, chunkEnd, maxDist, &ldmState->loadedDictEnd, NULL);
+ /* 3. Generate the sequences for the chunk, and get newLeftoverSize. */
+ newLeftoverSize = ZSTD_ldm_generateSequences_internal(
+ ldmState, sequences, params, chunkStart, chunkSize);
+ if (ZSTD_isError(newLeftoverSize))
+ return newLeftoverSize;
+ /* 4. We add the leftover literals from previous iterations to the first
+ * newly generated sequence, or add the `newLeftoverSize` if none are
+ * generated.
+ */
+ /* Prepend the leftover literals from the last call */
+ if (prevSize < sequences->size) {
+ sequences->seq[prevSize].litLength += (U32)leftoverSize;
+ leftoverSize = newLeftoverSize;
+ } else {
+ assert(newLeftoverSize == chunkSize);
+ leftoverSize += chunkSize;
+ }
+ }
+ return 0;
+}
+
+void ZSTD_ldm_skipSequences(rawSeqStore_t* rawSeqStore, size_t srcSize, U32 const minMatch) {
+ while (srcSize > 0 && rawSeqStore->pos < rawSeqStore->size) {
+ rawSeq* seq = rawSeqStore->seq + rawSeqStore->pos;
+ if (srcSize <= seq->litLength) {
+ /* Skip past srcSize literals */
+ seq->litLength -= (U32)srcSize;
+ return;
+ }
+ srcSize -= seq->litLength;
+ seq->litLength = 0;
+ if (srcSize < seq->matchLength) {
+ /* Skip past the first srcSize of the match */
+ seq->matchLength -= (U32)srcSize;
+ if (seq->matchLength < minMatch) {
+ /* The match is too short, omit it */
+ if (rawSeqStore->pos + 1 < rawSeqStore->size) {
+ seq[1].litLength += seq[0].matchLength;
+ }
+ rawSeqStore->pos++;
+ }
+ return;
+ }
+ srcSize -= seq->matchLength;
+ seq->matchLength = 0;
+ rawSeqStore->pos++;
+ }
+}
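+
+/* Worked example for ZSTD_ldm_skipSequences() (illustrative) : skipping
+ * srcSize == 8 over a stored sequence {litLength:5, matchLength:10} with
+ * minMatch == 4 first consumes the 5 literals, then trims the match by the
+ * remaining 3 bytes to matchLength == 7 ; since 7 >= minMatch the (shortened)
+ * sequence stays in the store. */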
+
+/**
+ * If the sequence length is longer than remaining then the sequence is split
+ * between this block and the next.
+ *
+ * Returns the current sequence to handle, or if the rest of the block should
+ * be literals, it returns a sequence with offset == 0.
+ */
+static rawSeq maybeSplitSequence(rawSeqStore_t* rawSeqStore,
+ U32 const remaining, U32 const minMatch)
+{
+ rawSeq sequence = rawSeqStore->seq[rawSeqStore->pos];
+ assert(sequence.offset > 0);
+ /* Likely: No partial sequence */
+ if (remaining >= sequence.litLength + sequence.matchLength) {
+ rawSeqStore->pos++;
+ return sequence;
+ }
+ /* Cut the sequence short (offset == 0 ==> rest is literals). */
+ if (remaining <= sequence.litLength) {
+ sequence.offset = 0;
+ } else if (remaining < sequence.litLength + sequence.matchLength) {
+ sequence.matchLength = remaining - sequence.litLength;
+ if (sequence.matchLength < minMatch) {
+ sequence.offset = 0;
+ }
+ }
+ /* Skip past `remaining` bytes for the future sequences. */
+ ZSTD_ldm_skipSequences(rawSeqStore, remaining, minMatch);
+ return sequence;
+}
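+
+/* Worked example for maybeSplitSequence() (illustrative) : with remaining == 12,
+ * a stored sequence {litLength:5, matchLength:10} and minMatch == 4, the
+ * returned sequence is truncated to matchLength == 7 (12 - 5) for this block ;
+ * the skip call then leaves {litLength:0, matchLength:3} behind, and since
+ * 3 < minMatch those 3 bytes are folded into the next sequence's literals
+ * (when one exists). */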
+
+size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore,
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize)
+{
+ const ZSTD_compressionParameters* const cParams = &ms->cParams;
+ unsigned const minMatch = cParams->minMatch;
+ ZSTD_blockCompressor const blockCompressor =
+ ZSTD_selectBlockCompressor(cParams->strategy, ZSTD_matchState_dictMode(ms));
+ /* Input bounds */
+ BYTE const* const istart = (BYTE const*)src;
+ BYTE const* const iend = istart + srcSize;
+ /* Input positions */
+ BYTE const* ip = istart;
+
+ DEBUGLOG(5, "ZSTD_ldm_blockCompress: srcSize=%zu", srcSize);
+ assert(rawSeqStore->pos <= rawSeqStore->size);
+ assert(rawSeqStore->size <= rawSeqStore->capacity);
+ /* Loop through each sequence and apply the block compressor to the lits */
+ while (rawSeqStore->pos < rawSeqStore->size && ip < iend) {
+ /* maybeSplitSequence updates rawSeqStore->pos */
+ rawSeq const sequence = maybeSplitSequence(rawSeqStore,
+ (U32)(iend - ip), minMatch);
+ int i;
+ /* End signal */
+ if (sequence.offset == 0)
+ break;
+
+ assert(ip + sequence.litLength + sequence.matchLength <= iend);
+
+ /* Fill tables for block compressor */
+ ZSTD_ldm_limitTableUpdate(ms, ip);
+ ZSTD_ldm_fillFastTables(ms, ip);
+ /* Run the block compressor */
+ DEBUGLOG(5, "pos %u : calling block compressor on segment of size %u", (unsigned)(ip-istart), sequence.litLength);
+ {
+ size_t const newLitLength =
+ blockCompressor(ms, seqStore, rep, ip, sequence.litLength);
+ ip += sequence.litLength;
+ /* Update the repcodes */
+ for (i = ZSTD_REP_NUM - 1; i > 0; i--)
+ rep[i] = rep[i-1];
+ rep[0] = sequence.offset;
+ /* Store the sequence */
+ ZSTD_storeSeq(seqStore, newLitLength, ip - newLitLength, iend,
+ sequence.offset + ZSTD_REP_MOVE,
+ sequence.matchLength - MINMATCH);
+ ip += sequence.matchLength;
+ }
+ }
+ /* Fill the tables for the block compressor */
+ ZSTD_ldm_limitTableUpdate(ms, ip);
+ ZSTD_ldm_fillFastTables(ms, ip);
+ /* Compress the last literals */
+ return blockCompressor(ms, seqStore, rep, ip, iend - ip);
+}
+/**** ended inlining compress/zstd_ldm.c ****/
+/**** start inlining compress/zstd_opt.c ****/
+/*
+ * Copyright (c) 2016-2020, Przemyslaw Skibinski, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+/**** skipping file: zstd_compress_internal.h ****/
+/**** skipping file: hist.h ****/
+/**** skipping file: zstd_opt.h ****/
+
+
+#define ZSTD_LITFREQ_ADD 2 /* scaling factor for litFreq, so that frequencies adapt faster to new stats */
+#define ZSTD_FREQ_DIV 4 /* log factor when using previous stats to init next stats */
+#define ZSTD_MAX_PRICE (1<<30)
+
+#define ZSTD_PREDEF_THRESHOLD 1024 /* if srcSize < ZSTD_PREDEF_THRESHOLD, symbols' cost is assumed static, directly determined by pre-defined distributions */
+
+
+/*-*************************************
+* Price functions for optimal parser
+***************************************/
+
+#if 0 /* approximation at bit level */
+# define BITCOST_ACCURACY 0
+# define BITCOST_MULTIPLIER (1 << BITCOST_ACCURACY)
+# define WEIGHT(stat,opt) ((void)opt, ZSTD_bitWeight(stat))
+#elif 0 /* fractional bit accuracy */
+# define BITCOST_ACCURACY 8
+# define BITCOST_MULTIPLIER (1 << BITCOST_ACCURACY)
+# define WEIGHT(stat,opt) ((void)opt, ZSTD_fracWeight(stat))
+#else /* opt==approx, ultra==accurate */
+# define BITCOST_ACCURACY 8
+# define BITCOST_MULTIPLIER (1 << BITCOST_ACCURACY)
+# define WEIGHT(stat,opt) (opt ? ZSTD_fracWeight(stat) : ZSTD_bitWeight(stat))
+#endif
+
+MEM_STATIC U32 ZSTD_bitWeight(U32 stat)
+{
+ return (ZSTD_highbit32(stat+1) * BITCOST_MULTIPLIER);
+}
+
+MEM_STATIC U32 ZSTD_fracWeight(U32 rawStat)
+{
+ U32 const stat = rawStat + 1;
+ U32 const hb = ZSTD_highbit32(stat);
+ U32 const BWeight = hb * BITCOST_MULTIPLIER;
+ U32 const FWeight = (stat << BITCOST_ACCURACY) >> hb;
+ U32 const weight = BWeight + FWeight;
+ assert(hb + BITCOST_ACCURACY < 31);
+ return weight;
+}
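+
+/* Worked example for ZSTD_fracWeight() (illustrative, BITCOST_ACCURACY == 8) :
+ * rawStat == 10 gives stat == 11, hb == 3, BWeight == 3*256 == 768,
+ * FWeight == (11<<8)>>3 == 352, so the fractional weight is 1120,
+ * i.e. 1120/256 == 4.375 "bits", whereas ZSTD_bitWeight(10) would simply
+ * return 768 (3.0). */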
+
+#if (DEBUGLEVEL>=2)
+/* debugging function,
+ * @return price in bytes as fractional value
+ * for debug messages only */
+MEM_STATIC double ZSTD_fCost(U32 price)
+{
+ return (double)price / (BITCOST_MULTIPLIER*8);
+}
+#endif
+
+static int ZSTD_compressedLiterals(optState_t const* const optPtr)
+{
+ return optPtr->literalCompressionMode != ZSTD_lcm_uncompressed;
+}
+
+static void ZSTD_setBasePrices(optState_t* optPtr, int optLevel)
+{
+ if (ZSTD_compressedLiterals(optPtr))
+ optPtr->litSumBasePrice = WEIGHT(optPtr->litSum, optLevel);
+ optPtr->litLengthSumBasePrice = WEIGHT(optPtr->litLengthSum, optLevel);
+ optPtr->matchLengthSumBasePrice = WEIGHT(optPtr->matchLengthSum, optLevel);
+ optPtr->offCodeSumBasePrice = WEIGHT(optPtr->offCodeSum, optLevel);
+}
+
+
+/* ZSTD_downscaleStat() :
+ * reduce all elements in table by a factor 2^(ZSTD_FREQ_DIV+malus)
+ * return the resulting sum of elements */
+static U32 ZSTD_downscaleStat(unsigned* table, U32 lastEltIndex, int malus)
+{
+ U32 s, sum=0;
+ DEBUGLOG(5, "ZSTD_downscaleStat (nbElts=%u)", (unsigned)lastEltIndex+1);
+ assert(ZSTD_FREQ_DIV+malus > 0 && ZSTD_FREQ_DIV+malus < 31);
+ for (s=0; s<lastEltIndex+1; s++) {
+ table[s] = 1 + (table[s] >> (ZSTD_FREQ_DIV+malus));
+ sum += table[s];
+ }
+ return sum;
+}
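+
+/* Worked example for ZSTD_downscaleStat() (illustrative) : with
+ * ZSTD_FREQ_DIV == 4 and malus == 0, a count of 100 becomes 1 + (100 >> 4) == 7
+ * and a count of 3 becomes 1, so no symbol ever drops to a frequency of zero. */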
+
+/* ZSTD_rescaleFreqs() :
+ * if first block (detected by optPtr->litLengthSum == 0) : init statistics,
+ * taking hints from the dictionary if there is one,
+ * or init from scratch, using src for literal stats and flat 1 for match symbols ;
+ * otherwise downscale existing stats, to be used as seed for the next block.
+ */
+static void
+ZSTD_rescaleFreqs(optState_t* const optPtr,
+ const BYTE* const src, size_t const srcSize,
+ int const optLevel)
+{
+ int const compressedLiterals = ZSTD_compressedLiterals(optPtr);
+ DEBUGLOG(5, "ZSTD_rescaleFreqs (srcSize=%u)", (unsigned)srcSize);
+ optPtr->priceType = zop_dynamic;
+
+ if (optPtr->litLengthSum == 0) { /* first block : init */
+ if (srcSize <= ZSTD_PREDEF_THRESHOLD) { /* heuristic */
+ DEBUGLOG(5, "(srcSize <= ZSTD_PREDEF_THRESHOLD) => zop_predef");
+ optPtr->priceType = zop_predef;
+ }
+
+ assert(optPtr->symbolCosts != NULL);
+ if (optPtr->symbolCosts->huf.repeatMode == HUF_repeat_valid) {
+ /* huffman table presumed generated by dictionary */
+ optPtr->priceType = zop_dynamic;
+
+ if (compressedLiterals) {
+ unsigned lit;
+ assert(optPtr->litFreq != NULL);
+ optPtr->litSum = 0;
+ for (lit=0; lit<=MaxLit; lit++) {
+ U32 const scaleLog = 11; /* scale to 2K */
+ U32 const bitCost = HUF_getNbBits(optPtr->symbolCosts->huf.CTable, lit);
+ assert(bitCost <= scaleLog);
+ optPtr->litFreq[lit] = bitCost ? 1 << (scaleLog-bitCost) : 1 /*minimum to calculate cost*/;
+ optPtr->litSum += optPtr->litFreq[lit];
+ } }
+
+ { unsigned ll;
+ FSE_CState_t llstate;
+ FSE_initCState(&llstate, optPtr->symbolCosts->fse.litlengthCTable);
+ optPtr->litLengthSum = 0;
+ for (ll=0; ll<=MaxLL; ll++) {
+ U32 const scaleLog = 10; /* scale to 1K */
+ U32 const bitCost = FSE_getMaxNbBits(llstate.symbolTT, ll);
+ assert(bitCost < scaleLog);
+ optPtr->litLengthFreq[ll] = bitCost ? 1 << (scaleLog-bitCost) : 1 /*minimum to calculate cost*/;
+ optPtr->litLengthSum += optPtr->litLengthFreq[ll];
+ } }
+
+ { unsigned ml;
+ FSE_CState_t mlstate;
+ FSE_initCState(&mlstate, optPtr->symbolCosts->fse.matchlengthCTable);
+ optPtr->matchLengthSum = 0;
+ for (ml=0; ml<=MaxML; ml++) {
+ U32 const scaleLog = 10;
+ U32 const bitCost = FSE_getMaxNbBits(mlstate.symbolTT, ml);
+ assert(bitCost < scaleLog);
+ optPtr->matchLengthFreq[ml] = bitCost ? 1 << (scaleLog-bitCost) : 1 /*minimum to calculate cost*/;
+ optPtr->matchLengthSum += optPtr->matchLengthFreq[ml];
+ } }
+
+ { unsigned of;
+ FSE_CState_t ofstate;
+ FSE_initCState(&ofstate, optPtr->symbolCosts->fse.offcodeCTable);
+ optPtr->offCodeSum = 0;
+ for (of=0; of<=MaxOff; of++) {
+ U32 const scaleLog = 10;
+ U32 const bitCost = FSE_getMaxNbBits(ofstate.symbolTT, of);
+ assert(bitCost < scaleLog);
+ optPtr->offCodeFreq[of] = bitCost ? 1 << (scaleLog-bitCost) : 1 /*minimum to calculate cost*/;
+ optPtr->offCodeSum += optPtr->offCodeFreq[of];
+ } }
+
+ } else { /* not a dictionary */
+
+ assert(optPtr->litFreq != NULL);
+ if (compressedLiterals) {
+ unsigned lit = MaxLit;
+ HIST_count_simple(optPtr->litFreq, &lit, src, srcSize); /* use raw first block to init statistics */
+ optPtr->litSum = ZSTD_downscaleStat(optPtr->litFreq, MaxLit, 1);
+ }
+
+ { unsigned ll;
+ for (ll=0; ll<=MaxLL; ll++)
+ optPtr->litLengthFreq[ll] = 1;
+ }
+ optPtr->litLengthSum = MaxLL+1;
+
+ { unsigned ml;
+ for (ml=0; ml<=MaxML; ml++)
+ optPtr->matchLengthFreq[ml] = 1;
+ }
+ optPtr->matchLengthSum = MaxML+1;
+
+ { unsigned of;
+ for (of=0; of<=MaxOff; of++)
+ optPtr->offCodeFreq[of] = 1;
+ }
+ optPtr->offCodeSum = MaxOff+1;
+
+ }
+
+ } else { /* new block : re-use previous statistics, scaled down */
+
+ if (compressedLiterals)
+ optPtr->litSum = ZSTD_downscaleStat(optPtr->litFreq, MaxLit, 1);
+ optPtr->litLengthSum = ZSTD_downscaleStat(optPtr->litLengthFreq, MaxLL, 0);
+ optPtr->matchLengthSum = ZSTD_downscaleStat(optPtr->matchLengthFreq, MaxML, 0);
+ optPtr->offCodeSum = ZSTD_downscaleStat(optPtr->offCodeFreq, MaxOff, 0);
+ }
+
+ ZSTD_setBasePrices(optPtr, optLevel);
+}
+
+/* ZSTD_rawLiteralsCost() :
+ * price of literals (only) in the specified segment (whose length can be 0).
+ * does not include the price of the literalLength symbol */
+static U32 ZSTD_rawLiteralsCost(const BYTE* const literals, U32 const litLength,
+ const optState_t* const optPtr,
+ int optLevel)
+{
+ if (litLength == 0) return 0;
+
+ if (!ZSTD_compressedLiterals(optPtr))
+ return (litLength << 3) * BITCOST_MULTIPLIER; /* Uncompressed - 8 bits per literal. */
+
+ if (optPtr->priceType == zop_predef)
+ return (litLength*6) * BITCOST_MULTIPLIER; /* 6 bits per literal - no statistics used */
+
+ /* dynamic statistics */
+ { U32 price = litLength * optPtr->litSumBasePrice;
+ U32 u;
+ for (u=0; u < litLength; u++) {
+ assert(WEIGHT(optPtr->litFreq[literals[u]], optLevel) <= optPtr->litSumBasePrice); /* literal cost should never be negative */
+ price -= WEIGHT(optPtr->litFreq[literals[u]], optLevel);
+ }
+ return price;
+ }
+}
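+
+/* Rough intuition for ZSTD_rawLiteralsCost() (illustrative) : in the dynamic
+ * branch each literal costs about WEIGHT(litSum) - WEIGHT(litFreq[lit])
+ * ~= log2(litSum / litFreq[lit]) ; e.g. with litSum == 2048 and
+ * litFreq[lit] == 64 that is roughly 5 "bits" (times BITCOST_MULTIPLIER). */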
+
+/* ZSTD_litLengthPrice() :
+ * cost of literalLength symbol */
+static U32 ZSTD_litLengthPrice(U32 const litLength, const optState_t* const optPtr, int optLevel)
+{
+ if (optPtr->priceType == zop_predef) return WEIGHT(litLength, optLevel);
+
+ /* dynamic statistics */
+ { U32 const llCode = ZSTD_LLcode(litLength);
+ return (LL_bits[llCode] * BITCOST_MULTIPLIER)
+ + optPtr->litLengthSumBasePrice
+ - WEIGHT(optPtr->litLengthFreq[llCode], optLevel);
+ }
+}
+
+/* ZSTD_getMatchPrice() :
+ * Provides the cost of the match part (offset + matchLength) of a sequence.
+ * Must be combined with the literals cost (see ZSTD_rawLiteralsCost() and
+ * ZSTD_litLengthPrice()) to get the full cost of a sequence.
+ * optLevel: when <2, favors small offsets for decompression speed (improved cache efficiency) */
+FORCE_INLINE_TEMPLATE U32
+ZSTD_getMatchPrice(U32 const offset,
+ U32 const matchLength,
+ const optState_t* const optPtr,
+ int const optLevel)
+{
+ U32 price;
+ U32 const offCode = ZSTD_highbit32(offset+1);
+ U32 const mlBase = matchLength - MINMATCH;
+ assert(matchLength >= MINMATCH);
+
+ if (optPtr->priceType == zop_predef) /* fixed scheme, do not use statistics */
+ return WEIGHT(mlBase, optLevel) + ((16 + offCode) * BITCOST_MULTIPLIER);
+
+ /* dynamic statistics */
+ price = (offCode * BITCOST_MULTIPLIER) + (optPtr->offCodeSumBasePrice - WEIGHT(optPtr->offCodeFreq[offCode], optLevel));
+ if ((optLevel<2) /*static*/ && offCode >= 20)
+ price += (offCode-19)*2 * BITCOST_MULTIPLIER; /* handicap for long distance offsets, favor decompression speed */
+
+ /* match Length */
+ { U32 const mlCode = ZSTD_MLcode(mlBase);
+ price += (ML_bits[mlCode] * BITCOST_MULTIPLIER) + (optPtr->matchLengthSumBasePrice - WEIGHT(optPtr->matchLengthFreq[mlCode], optLevel));
+ }
+
+ price += BITCOST_MULTIPLIER / 5; /* heuristic : make matches a bit more costly to favor fewer sequences -> faster decompression speed */
+
+ DEBUGLOG(8, "ZSTD_getMatchPrice(ml:%u) = %u", matchLength, price);
+ return price;
+}
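+
+/* Worked example for ZSTD_getMatchPrice() (illustrative, zop_predef path,
+ * optLevel 0) : offset == 2048 gives offCode == ZSTD_highbit32(2049) == 11 ;
+ * matchLength == 18 gives mlBase == 15, so the price is
+ * WEIGHT(15) + (16+11)*BITCOST_MULTIPLIER == 4*256 + 27*256 == 7936,
+ * i.e. about 31 "bits". */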
+
+/* ZSTD_updateStats() :
+ * assumption : literals + litLength <= iend */
+static void ZSTD_updateStats(optState_t* const optPtr,
+ U32 litLength, const BYTE* literals,
+ U32 offsetCode, U32 matchLength)
+{
+ /* literals */
+ if (ZSTD_compressedLiterals(optPtr)) {
+ U32 u;
+ for (u=0; u < litLength; u++)
+ optPtr->litFreq[literals[u]] += ZSTD_LITFREQ_ADD;
+ optPtr->litSum += litLength*ZSTD_LITFREQ_ADD;
+ }
+
+ /* literal Length */
+ { U32 const llCode = ZSTD_LLcode(litLength);
+ optPtr->litLengthFreq[llCode]++;
+ optPtr->litLengthSum++;
+ }
+
+ /* match offset code (0-2=>repCode; 3+=>offset+2) */
+ { U32 const offCode = ZSTD_highbit32(offsetCode+1);
+ assert(offCode <= MaxOff);
+ optPtr->offCodeFreq[offCode]++;
+ optPtr->offCodeSum++;
+ }
+
+ /* match Length */
+ { U32 const mlBase = matchLength - MINMATCH;
+ U32 const mlCode = ZSTD_MLcode(mlBase);
+ optPtr->matchLengthFreq[mlCode]++;
+ optPtr->matchLengthSum++;
+ }
+}
+
+
+/* ZSTD_readMINMATCH() :
+ * function safe only for comparisons
+ * assumption : memPtr must be at least 4 bytes before end of buffer */
+MEM_STATIC U32 ZSTD_readMINMATCH(const void* memPtr, U32 length)
+{
+ switch (length)
+ {
+ default :
+ case 4 : return MEM_read32(memPtr);
+ case 3 : if (MEM_isLittleEndian())
+ return MEM_read32(memPtr)<<8;
+ else
+ return MEM_read32(memPtr)>>8;
+ }
+}
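+
+/* Worked example for ZSTD_readMINMATCH() (illustrative) : on a little-endian
+ * machine, bytes {0x11,0x22,0x33,0x44} read as 0x44332211 ; for length == 3
+ * the shift <<8 yields 0x33221100, which depends only on the first three
+ * bytes, so two pointers compare equal iff their 3-byte prefixes match. */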
+
+
+/* Update hashTable3 up to ip (excluded)
+ Assumption : always within prefix (i.e. not within extDict) */
+static U32 ZSTD_insertAndFindFirstIndexHash3 (ZSTD_matchState_t* ms,
+ U32* nextToUpdate3,
+ const BYTE* const ip)
+{
+ U32* const hashTable3 = ms->hashTable3;
+ U32 const hashLog3 = ms->hashLog3;
+ const BYTE* const base = ms->window.base;
+ U32 idx = *nextToUpdate3;
+ U32 const target = (U32)(ip - base);
+ size_t const hash3 = ZSTD_hash3Ptr(ip, hashLog3);
+ assert(hashLog3 > 0);
+
+ while(idx < target) {
+ hashTable3[ZSTD_hash3Ptr(base+idx, hashLog3)] = idx;
+ idx++;
+ }
+
+ *nextToUpdate3 = target;
+ return hashTable3[hash3];
+}
+
+
+/*-*************************************
+* Binary Tree search
+***************************************/
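+/* Note (informal sketch of the layout) : the "binary tree" lives in
+ * ms->chainTable : each position keeps two links, bt[2*(pos & btMask)]
+ * pointing to the best lexicographically-smaller candidate and
+ * bt[2*(pos & btMask) + 1] to the best larger one, so candidates are visited
+ * in sorted order while commonLengthSmaller / commonLengthLarger track prefix
+ * lengths already known to match. */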
+/** ZSTD_insertBt1() : add one or multiple positions to tree.
+ * ip : assumed <= iend-8 .
+ * @return : nb of positions added */
+static U32 ZSTD_insertBt1(
+ ZSTD_matchState_t* ms,
+ const BYTE* const ip, const BYTE* const iend,
+ U32 const mls, const int extDict)
+{
+ const ZSTD_compressionParameters* const cParams = &ms->cParams;
+ U32* const hashTable = ms->hashTable;
+ U32 const hashLog = cParams->hashLog;
+ size_t const h = ZSTD_hashPtr(ip, hashLog, mls);
+ U32* const bt = ms->chainTable;
+ U32 const btLog = cParams->chainLog - 1;
+ U32 const btMask = (1 << btLog) - 1;
+ U32 matchIndex = hashTable[h];
+ size_t commonLengthSmaller=0, commonLengthLarger=0;
+ const BYTE* const base = ms->window.base;
+ const BYTE* const dictBase = ms->window.dictBase;
+ const U32 dictLimit = ms->window.dictLimit;
+ const BYTE* const dictEnd = dictBase + dictLimit;
+ const BYTE* const prefixStart = base + dictLimit;
+ const BYTE* match;
+ const U32 current = (U32)(ip-base);
+ const U32 btLow = btMask >= current ? 0 : current - btMask;
+ U32* smallerPtr = bt + 2*(current&btMask);
+ U32* largerPtr = smallerPtr + 1;
+ U32 dummy32; /* to be nullified at the end */
+ U32 const windowLow = ms->window.lowLimit;
+ U32 matchEndIdx = current+8+1;
+ size_t bestLength = 8;
+ U32 nbCompares = 1U << cParams->searchLog;
+#ifdef ZSTD_C_PREDICT
+ U32 predictedSmall = *(bt + 2*((current-1)&btMask) + 0);
+ U32 predictedLarge = *(bt + 2*((current-1)&btMask) + 1);
+ predictedSmall += (predictedSmall>0);
+ predictedLarge += (predictedLarge>0);
+#endif /* ZSTD_C_PREDICT */
+
+ DEBUGLOG(8, "ZSTD_insertBt1 (%u)", current);
+
+ assert(ip <= iend-8); /* required for h calculation */
+ hashTable[h] = current; /* Update Hash Table */
+
+ assert(windowLow > 0);
+ while (nbCompares-- && (matchIndex >= windowLow)) {
+ U32* const nextPtr = bt + 2*(matchIndex & btMask);
+ size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger); /* guaranteed minimum nb of common bytes */
+ assert(matchIndex < current);
+
+#ifdef ZSTD_C_PREDICT /* note : can create issues when hashLog is small (<= 11) */
+ const U32* predictPtr = bt + 2*((matchIndex-1) & btMask); /* written this way, as bt is a roll buffer */
+ if (matchIndex == predictedSmall) {
+ /* no need to check length, result known */
+ *smallerPtr = matchIndex;
+ if (matchIndex <= btLow) { smallerPtr=&dummy32; break; } /* beyond tree size, stop the search */
+ smallerPtr = nextPtr+1; /* new "smaller" => larger of match */
+ matchIndex = nextPtr[1]; /* new matchIndex larger than previous (closer to current) */
+ predictedSmall = predictPtr[1] + (predictPtr[1]>0);
+ continue;
+ }
+ if (matchIndex == predictedLarge) {
+ *largerPtr = matchIndex;
+ if (matchIndex <= btLow) { largerPtr=&dummy32; break; } /* beyond tree size, stop the search */
+ largerPtr = nextPtr;
+ matchIndex = nextPtr[0];
+ predictedLarge = predictPtr[0] + (predictPtr[0]>0);
+ continue;
+ }
+#endif
+
+ if (!extDict || (matchIndex+matchLength >= dictLimit)) {
+ assert(matchIndex+matchLength >= dictLimit); /* might be wrong if actually extDict */
+ match = base + matchIndex;
+ matchLength += ZSTD_count(ip+matchLength, match+matchLength, iend);
+ } else {
+ match = dictBase + matchIndex;
+ matchLength += ZSTD_count_2segments(ip+matchLength, match+matchLength, iend, dictEnd, prefixStart);
+ if (matchIndex+matchLength >= dictLimit)
+ match = base + matchIndex; /* to prepare for next usage of match[matchLength] */
+ }
+
+ if (matchLength > bestLength) {
+ bestLength = matchLength;
+ if (matchLength > matchEndIdx - matchIndex)
+ matchEndIdx = matchIndex + (U32)matchLength;
+ }
+
+ if (ip+matchLength == iend) { /* equal : no way to know whether the match is smaller or larger */
+ break; /* drop, to guarantee consistency ; misses a bit of compression, but other solutions could corrupt the tree */
+ }
+
+ if (match[matchLength] < ip[matchLength]) { /* necessarily within buffer */
+ /* match is smaller than current */
+ *smallerPtr = matchIndex; /* update smaller idx */
+ commonLengthSmaller = matchLength; /* all smaller will now have at least this guaranteed common length */
+ if (matchIndex <= btLow) { smallerPtr=&dummy32; break; } /* beyond tree size, stop searching */
+ smallerPtr = nextPtr+1; /* new "candidate" => larger than match, which was smaller than target */
+ matchIndex = nextPtr[1]; /* new matchIndex, larger than previous and closer to current */
+ } else {
+ /* match is larger than current */
+ *largerPtr = matchIndex;
+ commonLengthLarger = matchLength;
+ if (matchIndex <= btLow) { largerPtr=&dummy32; break; } /* beyond tree size, stop searching */
+ largerPtr = nextPtr;
+ matchIndex = nextPtr[0];
+ } }
+
+ *smallerPtr = *largerPtr = 0;
+ { U32 positions = 0;
+ if (bestLength > 384) positions = MIN(192, (U32)(bestLength - 384)); /* speed optimization */
+ assert(matchEndIdx > current + 8);
+ return MAX(positions, matchEndIdx - (current + 8));
+ }
+}
+
+FORCE_INLINE_TEMPLATE
+void ZSTD_updateTree_internal(
+ ZSTD_matchState_t* ms,
+ const BYTE* const ip, const BYTE* const iend,
+ const U32 mls, const ZSTD_dictMode_e dictMode)
+{
+ const BYTE* const base = ms->window.base;
+ U32 const target = (U32)(ip - base);
+ U32 idx = ms->nextToUpdate;
+ DEBUGLOG(6, "ZSTD_updateTree_internal, from %u to %u (dictMode:%u)",
+ idx, target, dictMode);
+
+ while(idx < target) {
+ U32 const forward = ZSTD_insertBt1(ms, base+idx, iend, mls, dictMode == ZSTD_extDict);
+ assert(idx < (U32)(idx + forward));
+ idx += forward;
+ }
+ assert((size_t)(ip - base) <= (size_t)(U32)(-1));
+ assert((size_t)(iend - base) <= (size_t)(U32)(-1));
+ ms->nextToUpdate = target;
+}
+
+void ZSTD_updateTree(ZSTD_matchState_t* ms, const BYTE* ip, const BYTE* iend) {
+ ZSTD_updateTree_internal(ms, ip, iend, ms->cParams.minMatch, ZSTD_noDict);
+}
+
+FORCE_INLINE_TEMPLATE
+U32 ZSTD_insertBtAndGetAllMatches (
+ ZSTD_match_t* matches, /* store result (found matches) in this table (presumed large enough) */
+ ZSTD_matchState_t* ms,
+ U32* nextToUpdate3,
+ const BYTE* const ip, const BYTE* const iLimit, const ZSTD_dictMode_e dictMode,
+ const U32 rep[ZSTD_REP_NUM],
+ U32 const ll0, /* tells if associated literal length is 0 or not. This value must be 0 or 1 */
+ const U32 lengthToBeat,
+ U32 const mls /* template */)
+{
+ const ZSTD_compressionParameters* const cParams = &ms->cParams;
+ U32 const sufficient_len = MIN(cParams->targetLength, ZSTD_OPT_NUM -1);
+ const BYTE* const base = ms->window.base;
+ U32 const current = (U32)(ip-base);
+ U32 const hashLog = cParams->hashLog;
+ U32 const minMatch = (mls==3) ? 3 : 4;
+ U32* const hashTable = ms->hashTable;
+ size_t const h = ZSTD_hashPtr(ip, hashLog, mls);
+ U32 matchIndex = hashTable[h];
+ U32* const bt = ms->chainTable;
+ U32 const btLog = cParams->chainLog - 1;
+ U32 const btMask= (1U << btLog) - 1;
+ size_t commonLengthSmaller=0, commonLengthLarger=0;
+ const BYTE* const dictBase = ms->window.dictBase;
+ U32 const dictLimit = ms->window.dictLimit;
+ const BYTE* const dictEnd = dictBase + dictLimit;
+ const BYTE* const prefixStart = base + dictLimit;
+ U32 const btLow = (btMask >= current) ? 0 : current - btMask;
+ U32 const windowLow = ZSTD_getLowestMatchIndex(ms, current, cParams->windowLog);
+ U32 const matchLow = windowLow ? windowLow : 1;
+ U32* smallerPtr = bt + 2*(current&btMask);
+ U32* largerPtr = bt + 2*(current&btMask) + 1;
+ U32 matchEndIdx = current+8+1; /* farthest referenced position of any match => detects repetitive patterns */
+ U32 dummy32; /* to be nullified at the end */
+ U32 mnum = 0;
+ U32 nbCompares = 1U << cParams->searchLog;
+
+ const ZSTD_matchState_t* dms = dictMode == ZSTD_dictMatchState ? ms->dictMatchState : NULL;
+ const ZSTD_compressionParameters* const dmsCParams =
+ dictMode == ZSTD_dictMatchState ? &dms->cParams : NULL;
+ const BYTE* const dmsBase = dictMode == ZSTD_dictMatchState ? dms->window.base : NULL;
+ const BYTE* const dmsEnd = dictMode == ZSTD_dictMatchState ? dms->window.nextSrc : NULL;
+ U32 const dmsHighLimit = dictMode == ZSTD_dictMatchState ? (U32)(dmsEnd - dmsBase) : 0;
+ U32 const dmsLowLimit = dictMode == ZSTD_dictMatchState ? dms->window.lowLimit : 0;
+ U32 const dmsIndexDelta = dictMode == ZSTD_dictMatchState ? windowLow - dmsHighLimit : 0;
+ U32 const dmsHashLog = dictMode == ZSTD_dictMatchState ? dmsCParams->hashLog : hashLog;
+ U32 const dmsBtLog = dictMode == ZSTD_dictMatchState ? dmsCParams->chainLog - 1 : btLog;
+ U32 const dmsBtMask = dictMode == ZSTD_dictMatchState ? (1U << dmsBtLog) - 1 : 0;
+ U32 const dmsBtLow = dictMode == ZSTD_dictMatchState && dmsBtMask < dmsHighLimit - dmsLowLimit ? dmsHighLimit - dmsBtMask : dmsLowLimit;
+
+ size_t bestLength = lengthToBeat-1;
+ DEBUGLOG(8, "ZSTD_insertBtAndGetAllMatches: current=%u", current);
+
+ /* check repCode */
+ assert(ll0 <= 1); /* necessarily 1 or 0 */
+ { U32 const lastR = ZSTD_REP_NUM + ll0;
+ U32 repCode;
+ for (repCode = ll0; repCode < lastR; repCode++) {
+ U32 const repOffset = (repCode==ZSTD_REP_NUM) ? (rep[0] - 1) : rep[repCode];
+ U32 const repIndex = current - repOffset;
+ U32 repLen = 0;
+ assert(current >= dictLimit);
+ if (repOffset-1 /* intentional overflow, discards 0 and -1 */ < current-dictLimit) { /* equivalent to `current > repIndex >= dictLimit` */
+ /* We must validate the repcode offset because when we're using a dictionary the
+ * valid offset range shrinks when the dictionary goes out of bounds.
+ */
+ if ((repIndex >= windowLow) & (ZSTD_readMINMATCH(ip, minMatch) == ZSTD_readMINMATCH(ip - repOffset, minMatch))) {
+ repLen = (U32)ZSTD_count(ip+minMatch, ip+minMatch-repOffset, iLimit) + minMatch;
+ }
+ } else { /* repIndex < dictLimit || repIndex >= current */
+ const BYTE* const repMatch = dictMode == ZSTD_dictMatchState ?
+ dmsBase + repIndex - dmsIndexDelta :
+ dictBase + repIndex;
+ assert(current >= windowLow);
+ if ( dictMode == ZSTD_extDict
+ && ( ((repOffset-1) /*intentional overflow*/ < current - windowLow) /* equivalent to `current > repIndex >= windowLow` */
+ & (((U32)((dictLimit-1) - repIndex) >= 3) ) /* intentional overflow : do not test positions overlapping 2 memory segments */)
+ && (ZSTD_readMINMATCH(ip, minMatch) == ZSTD_readMINMATCH(repMatch, minMatch)) ) {
+ repLen = (U32)ZSTD_count_2segments(ip+minMatch, repMatch+minMatch, iLimit, dictEnd, prefixStart) + minMatch;
+ }
+ if (dictMode == ZSTD_dictMatchState
+ && ( ((repOffset-1) /*intentional overflow*/ < current - (dmsLowLimit + dmsIndexDelta)) /* equivalent to `current > repIndex >= dmsLowLimit` */
+ & ((U32)((dictLimit-1) - repIndex) >= 3) ) /* intentional overflow : do not test positions overlapping 2 memory segments */
+ && (ZSTD_readMINMATCH(ip, minMatch) == ZSTD_readMINMATCH(repMatch, minMatch)) ) {
+ repLen = (U32)ZSTD_count_2segments(ip+minMatch, repMatch+minMatch, iLimit, dmsEnd, prefixStart) + minMatch;
+ } }
+ /* save longer solution */
+ if (repLen > bestLength) {
+ DEBUGLOG(8, "found repCode %u (ll0:%u, offset:%u) of length %u",
+ repCode, ll0, repOffset, repLen);
+ bestLength = repLen;
+ matches[mnum].off = repCode - ll0;
+ matches[mnum].len = (U32)repLen;
+ mnum++;
+ if ( (repLen > sufficient_len)
+ | (ip+repLen == iLimit) ) { /* best possible */
+ return mnum;
+ } } } }
+
+ /* HC3 match finder */
+ if ((mls == 3) /*static*/ && (bestLength < mls)) {
+ U32 const matchIndex3 = ZSTD_insertAndFindFirstIndexHash3(ms, nextToUpdate3, ip);
+ if ((matchIndex3 >= matchLow)
+ & (current - matchIndex3 < (1<<18)) /*heuristic : longer distance likely too expensive*/ ) {
+ size_t mlen;
+ if ((dictMode == ZSTD_noDict) /*static*/ || (dictMode == ZSTD_dictMatchState) /*static*/ || (matchIndex3 >= dictLimit)) {
+ const BYTE* const match = base + matchIndex3;
+ mlen = ZSTD_count(ip, match, iLimit);
+ } else {
+ const BYTE* const match = dictBase + matchIndex3;
+ mlen = ZSTD_count_2segments(ip, match, iLimit, dictEnd, prefixStart);
+ }
+
+ /* save best solution */
+ if (mlen >= mls /* == 3 > bestLength */) {
+ DEBUGLOG(8, "found small match with hlog3, of length %u",
+ (U32)mlen);
+ bestLength = mlen;
+ assert(current > matchIndex3);
+ assert(mnum==0); /* no prior solution */
+ matches[0].off = (current - matchIndex3) + ZSTD_REP_MOVE;
+ matches[0].len = (U32)mlen;
+ mnum = 1;
+ if ( (mlen > sufficient_len) |
+ (ip+mlen == iLimit) ) { /* best possible length */
+ ms->nextToUpdate = current+1; /* skip insertion */
+ return 1;
+ } } }
+ /* no dictMatchState lookup: dicts don't have a populated HC3 table */
+ }
+
+ hashTable[h] = current; /* Update Hash Table */
+
+ while (nbCompares-- && (matchIndex >= matchLow)) {
+ U32* const nextPtr = bt + 2*(matchIndex & btMask);
+ const BYTE* match;
+ size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger); /* guaranteed minimum nb of common bytes */
+ assert(current > matchIndex);
+
+ if ((dictMode == ZSTD_noDict) || (dictMode == ZSTD_dictMatchState) || (matchIndex+matchLength >= dictLimit)) {
+ assert(matchIndex+matchLength >= dictLimit); /* ensure the condition is correct when !extDict */
+ match = base + matchIndex;
+ if (matchIndex >= dictLimit) assert(memcmp(match, ip, matchLength) == 0); /* ensure early section of match is equal as expected */
+ matchLength += ZSTD_count(ip+matchLength, match+matchLength, iLimit);
+ } else {
+ match = dictBase + matchIndex;
+ assert(memcmp(match, ip, matchLength) == 0); /* ensure early section of match is equal as expected */
+ matchLength += ZSTD_count_2segments(ip+matchLength, match+matchLength, iLimit, dictEnd, prefixStart);
+ if (matchIndex+matchLength >= dictLimit)
+ match = base + matchIndex; /* prepare for match[matchLength] read */
+ }
+
+ if (matchLength > bestLength) {
+ DEBUGLOG(8, "found match of length %u at distance %u (offCode=%u)",
+ (U32)matchLength, current - matchIndex, current - matchIndex + ZSTD_REP_MOVE);
+ assert(matchEndIdx > matchIndex);
+ if (matchLength > matchEndIdx - matchIndex)
+ matchEndIdx = matchIndex + (U32)matchLength;
+ bestLength = matchLength;
+ matches[mnum].off = (current - matchIndex) + ZSTD_REP_MOVE;
+ matches[mnum].len = (U32)matchLength;
+ mnum++;
+ if ( (matchLength > ZSTD_OPT_NUM)
+ | (ip+matchLength == iLimit) /* equal : no way to know whether smaller or larger */) {
+ if (dictMode == ZSTD_dictMatchState) nbCompares = 0; /* break should also skip searching dms */
+ break; /* drop, to preserve bt consistency (miss a little bit of compression) */
+ }
+ }
+
+ if (match[matchLength] < ip[matchLength]) {
+ /* match smaller than current */
+ *smallerPtr = matchIndex; /* update smaller idx */
+ commonLengthSmaller = matchLength; /* all smaller will now have at least this guaranteed common length */
+ if (matchIndex <= btLow) { smallerPtr=&dummy32; break; } /* beyond tree size, stop the search */
+ smallerPtr = nextPtr+1; /* new candidate => larger than match, which was smaller than current */
+ matchIndex = nextPtr[1]; /* new matchIndex, larger than previous, closer to current */
+ } else {
+ *largerPtr = matchIndex;
+ commonLengthLarger = matchLength;
+ if (matchIndex <= btLow) { largerPtr=&dummy32; break; } /* beyond tree size, stop the search */
+ largerPtr = nextPtr;
+ matchIndex = nextPtr[0];
+ } }
+
+ *smallerPtr = *largerPtr = 0;
+
+ if (dictMode == ZSTD_dictMatchState && nbCompares) {
+ size_t const dmsH = ZSTD_hashPtr(ip, dmsHashLog, mls);
+ U32 dictMatchIndex = dms->hashTable[dmsH];
+ const U32* const dmsBt = dms->chainTable;
+ commonLengthSmaller = commonLengthLarger = 0;
+ while (nbCompares-- && (dictMatchIndex > dmsLowLimit)) {
+ const U32* const nextPtr = dmsBt + 2*(dictMatchIndex & dmsBtMask);
+ size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger); /* guaranteed minimum nb of common bytes */
+ const BYTE* match = dmsBase + dictMatchIndex;
+ matchLength += ZSTD_count_2segments(ip+matchLength, match+matchLength, iLimit, dmsEnd, prefixStart);
+ if (dictMatchIndex+matchLength >= dmsHighLimit)
+ match = base + dictMatchIndex + dmsIndexDelta; /* to prepare for next usage of match[matchLength] */
+
+ if (matchLength > bestLength) {
+ matchIndex = dictMatchIndex + dmsIndexDelta;
+ DEBUGLOG(8, "found dms match of length %u at distance %u (offCode=%u)",
+ (U32)matchLength, current - matchIndex, current - matchIndex + ZSTD_REP_MOVE);
+ if (matchLength > matchEndIdx - matchIndex)
+ matchEndIdx = matchIndex + (U32)matchLength;
+ bestLength = matchLength;
+ matches[mnum].off = (current - matchIndex) + ZSTD_REP_MOVE;
+ matches[mnum].len = (U32)matchLength;
+ mnum++;
+ if ( (matchLength > ZSTD_OPT_NUM)
+ | (ip+matchLength == iLimit) /* equal : no way to know whether smaller or larger */) {
+ break; /* drop, to guarantee consistency (miss a little bit of compression) */
+ }
+ }
+
+ if (dictMatchIndex <= dmsBtLow) { break; } /* beyond tree size, stop the search */
+ if (match[matchLength] < ip[matchLength]) {
+ commonLengthSmaller = matchLength; /* all smaller will now have at least this guaranteed common length */
+ dictMatchIndex = nextPtr[1]; /* new matchIndex larger than previous (closer to current) */
+ } else {
+ /* match is larger than current */
+ commonLengthLarger = matchLength;
+ dictMatchIndex = nextPtr[0];
+ }
+ }
+ }
+
+ assert(matchEndIdx > current+8);
+ ms->nextToUpdate = matchEndIdx - 8; /* skip repetitive patterns */
+ return mnum;
+}
+
+
+FORCE_INLINE_TEMPLATE U32 ZSTD_BtGetAllMatches (
+ ZSTD_match_t* matches, /* store result (match found, increasing size) in this table */
+ ZSTD_matchState_t* ms,
+ U32* nextToUpdate3,
+ const BYTE* ip, const BYTE* const iHighLimit, const ZSTD_dictMode_e dictMode,
+ const U32 rep[ZSTD_REP_NUM],
+ U32 const ll0,
+ U32 const lengthToBeat)
+{
+ const ZSTD_compressionParameters* const cParams = &ms->cParams;
+ U32 const matchLengthSearch = cParams->minMatch;
+ DEBUGLOG(8, "ZSTD_BtGetAllMatches");
+ if (ip < ms->window.base + ms->nextToUpdate) return 0; /* skipped area */
+ ZSTD_updateTree_internal(ms, ip, iHighLimit, matchLengthSearch, dictMode);
+ switch(matchLengthSearch)
+ {
+ case 3 : return ZSTD_insertBtAndGetAllMatches(matches, ms, nextToUpdate3, ip, iHighLimit, dictMode, rep, ll0, lengthToBeat, 3);
+ default :
+ case 4 : return ZSTD_insertBtAndGetAllMatches(matches, ms, nextToUpdate3, ip, iHighLimit, dictMode, rep, ll0, lengthToBeat, 4);
+ case 5 : return ZSTD_insertBtAndGetAllMatches(matches, ms, nextToUpdate3, ip, iHighLimit, dictMode, rep, ll0, lengthToBeat, 5);
+ case 7 :
+ case 6 : return ZSTD_insertBtAndGetAllMatches(matches, ms, nextToUpdate3, ip, iHighLimit, dictMode, rep, ll0, lengthToBeat, 6);
+ }
+}
+
+
+/*-*******************************
+* Optimal parser
+*********************************/
+
+
+static U32 ZSTD_totalLen(ZSTD_optimal_t sol)
+{
+ return sol.litlen + sol.mlen;
+}
+
+#if 0 /* debug */
+
+static void
+listStats(const U32* table, int lastEltID)
+{
+ int const nbElts = lastEltID + 1;
+ int enb;
+ for (enb=0; enb < nbElts; enb++) {
+ (void)table;
+ /* RAWLOG(2, "%3i:%3i, ", enb, table[enb]); */
+ RAWLOG(2, "%4i,", table[enb]);
+ }
+ RAWLOG(2, " \n");
+}
+
+#endif
+
+FORCE_INLINE_TEMPLATE size_t
+ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms,
+ seqStore_t* seqStore,
+ U32 rep[ZSTD_REP_NUM],
+ const void* src, size_t srcSize,
+ const int optLevel,
+ const ZSTD_dictMode_e dictMode)
+{
+ optState_t* const optStatePtr = &ms->opt;
+ const BYTE* const istart = (const BYTE*)src;
+ const BYTE* ip = istart;
+ const BYTE* anchor = istart;
+ const BYTE* const iend = istart + srcSize;
+ const BYTE* const ilimit = iend - 8;
+ const BYTE* const base = ms->window.base;
+ const BYTE* const prefixStart = base + ms->window.dictLimit;
+ const ZSTD_compressionParameters* const cParams = &ms->cParams;
+
+ U32 const sufficient_len = MIN(cParams->targetLength, ZSTD_OPT_NUM -1);
+ U32 const minMatch = (cParams->minMatch == 3) ? 3 : 4;
+ U32 nextToUpdate3 = ms->nextToUpdate;
+
+ ZSTD_optimal_t* const opt = optStatePtr->priceTable;
+ ZSTD_match_t* const matches = optStatePtr->matchTable;
+ ZSTD_optimal_t lastSequence;
+
+ /* init */
+ DEBUGLOG(5, "ZSTD_compressBlock_opt_generic: current=%u, prefix=%u, nextToUpdate=%u",
+ (U32)(ip - base), ms->window.dictLimit, ms->nextToUpdate);
+ assert(optLevel <= 2);
+ ZSTD_rescaleFreqs(optStatePtr, (const BYTE*)src, srcSize, optLevel);
+ ip += (ip==prefixStart);
+
+ /* Match Loop */
+ while (ip < ilimit) {
+ U32 cur, last_pos = 0;
+
+ /* find first match */
+ { U32 const litlen = (U32)(ip - anchor);
+ U32 const ll0 = !litlen;
+ U32 const nbMatches = ZSTD_BtGetAllMatches(matches, ms, &nextToUpdate3, ip, iend, dictMode, rep, ll0, minMatch);
+ if (!nbMatches) { ip++; continue; }
+
+ /* initialize opt[0] */
+ { U32 i ; for (i=0; i<ZSTD_REP_NUM; i++) opt[0].rep[i] = rep[i]; }
+ opt[0].mlen = 0; /* means is_a_literal */
+ opt[0].litlen = litlen;
+ /* We don't need to include the actual price of the literals because
+ * it is static for the duration of the forward pass, and is included
+ * in every price. We include the literal length to avoid negative
+ * prices when we subtract the previous literal length.
+ */
+ opt[0].price = ZSTD_litLengthPrice(litlen, optStatePtr, optLevel);
+
+ /* large match -> immediate encoding */
+ { U32 const maxML = matches[nbMatches-1].len;
+ U32 const maxOffset = matches[nbMatches-1].off;
+ DEBUGLOG(6, "found %u matches of maxLength=%u and maxOffCode=%u at cPos=%u => start new series",
+ nbMatches, maxML, maxOffset, (U32)(ip-prefixStart));
+
+ if (maxML > sufficient_len) {
+ lastSequence.litlen = litlen;
+ lastSequence.mlen = maxML;
+ lastSequence.off = maxOffset;
+ DEBUGLOG(6, "large match (%u>%u), immediate encoding",
+ maxML, sufficient_len);
+ cur = 0;
+ last_pos = ZSTD_totalLen(lastSequence);
+ goto _shortestPath;
+ } }
+
+ /* set prices for first matches starting position == 0 */
+ { U32 const literalsPrice = opt[0].price + ZSTD_litLengthPrice(0, optStatePtr, optLevel);
+ U32 pos;
+ U32 matchNb;
+ for (pos = 1; pos < minMatch; pos++) {
+ opt[pos].price = ZSTD_MAX_PRICE; /* mlen, litlen and price will be fixed during forward scanning */
+ }
+ for (matchNb = 0; matchNb < nbMatches; matchNb++) {
+ U32 const offset = matches[matchNb].off;
+ U32 const end = matches[matchNb].len;
+ for ( ; pos <= end ; pos++ ) {
+ U32 const matchPrice = ZSTD_getMatchPrice(offset, pos, optStatePtr, optLevel);
+ U32 const sequencePrice = literalsPrice + matchPrice;
+ DEBUGLOG(7, "rPos:%u => set initial price : %.2f",
+ pos, ZSTD_fCost(sequencePrice));
+ opt[pos].mlen = pos;
+ opt[pos].off = offset;
+ opt[pos].litlen = litlen;
+ opt[pos].price = sequencePrice;
+ } }
+ last_pos = pos-1;
+ }
+ }
+
+ /* check further positions */
+ for (cur = 1; cur <= last_pos; cur++) {
+ const BYTE* const inr = ip + cur;
+ assert(cur < ZSTD_OPT_NUM);
+ DEBUGLOG(7, "cPos:%zi==rPos:%u", inr-istart, cur)
+
+ /* Fix current position with one literal if cheaper */
+ { U32 const litlen = (opt[cur-1].mlen == 0) ? opt[cur-1].litlen + 1 : 1;
+ int const price = opt[cur-1].price
+ + ZSTD_rawLiteralsCost(ip+cur-1, 1, optStatePtr, optLevel)
+ + ZSTD_litLengthPrice(litlen, optStatePtr, optLevel)
+ - ZSTD_litLengthPrice(litlen-1, optStatePtr, optLevel);
+ assert(price < 1000000000); /* overflow check */
+ if (price <= opt[cur].price) {
+ DEBUGLOG(7, "cPos:%zi==rPos:%u : better price (%.2f<=%.2f) using literal (ll==%u) (hist:%u,%u,%u)",
+ inr-istart, cur, ZSTD_fCost(price), ZSTD_fCost(opt[cur].price), litlen,
+ opt[cur-1].rep[0], opt[cur-1].rep[1], opt[cur-1].rep[2]);
+ opt[cur].mlen = 0;
+ opt[cur].off = 0;
+ opt[cur].litlen = litlen;
+ opt[cur].price = price;
+ } else {
+ DEBUGLOG(7, "cPos:%zi==rPos:%u : literal would cost more (%.2f>%.2f) (hist:%u,%u,%u)",
+ inr-istart, cur, ZSTD_fCost(price), ZSTD_fCost(opt[cur].price),
+ opt[cur].rep[0], opt[cur].rep[1], opt[cur].rep[2]);
+ }
+ }
+
+ /* Set the repcodes of the current position. We must do it here
+ * because we rely on the repcodes of the 2nd to last sequence being
+ * correct to set the next chunks repcodes during the backward
+ * traversal.
+ */
+ ZSTD_STATIC_ASSERT(sizeof(opt[cur].rep) == sizeof(repcodes_t));
+ assert(cur >= opt[cur].mlen);
+ if (opt[cur].mlen != 0) {
+ U32 const prev = cur - opt[cur].mlen;
+ repcodes_t newReps = ZSTD_updateRep(opt[prev].rep, opt[cur].off, opt[cur].litlen==0);
+ memcpy(opt[cur].rep, &newReps, sizeof(repcodes_t));
+ } else {
+ memcpy(opt[cur].rep, opt[cur - 1].rep, sizeof(repcodes_t));
+ }
+
+ /* last match must start at a minimum distance of 8 from oend */
+ if (inr > ilimit) continue;
+
+ if (cur == last_pos) break;
+
+ if ( (optLevel==0) /*static_test*/
+ && (opt[cur+1].price <= opt[cur].price + (BITCOST_MULTIPLIER/2)) ) {
+ DEBUGLOG(7, "move to next rPos:%u : price is <=", cur+1);
+ continue; /* skip unpromising positions; about ~+6% speed, -0.01 ratio */
+ }
+
+ { U32 const ll0 = (opt[cur].mlen != 0);
+ U32 const litlen = (opt[cur].mlen == 0) ? opt[cur].litlen : 0;
+ U32 const previousPrice = opt[cur].price;
+ U32 const basePrice = previousPrice + ZSTD_litLengthPrice(0, optStatePtr, optLevel);
+ U32 const nbMatches = ZSTD_BtGetAllMatches(matches, ms, &nextToUpdate3, inr, iend, dictMode, opt[cur].rep, ll0, minMatch);
+ U32 matchNb;
+ if (!nbMatches) {
+ DEBUGLOG(7, "rPos:%u : no match found", cur);
+ continue;
+ }
+
+ { U32 const maxML = matches[nbMatches-1].len;
+ DEBUGLOG(7, "cPos:%zi==rPos:%u, found %u matches, of maxLength=%u",
+ inr-istart, cur, nbMatches, maxML);
+
+ if ( (maxML > sufficient_len)
+ || (cur + maxML >= ZSTD_OPT_NUM) ) {
+ lastSequence.mlen = maxML;
+ lastSequence.off = matches[nbMatches-1].off;
+ lastSequence.litlen = litlen;
+ cur -= (opt[cur].mlen==0) ? opt[cur].litlen : 0; /* last sequence is actually only literals, fix cur to last match - note : may underflow, in which case, it's first sequence, and it's okay */
+ last_pos = cur + ZSTD_totalLen(lastSequence);
+ if (cur > ZSTD_OPT_NUM) cur = 0; /* underflow => first match */
+ goto _shortestPath;
+ } }
+
+ /* set prices using matches found at position == cur */
+ for (matchNb = 0; matchNb < nbMatches; matchNb++) {
+ U32 const offset = matches[matchNb].off;
+ U32 const lastML = matches[matchNb].len;
+ U32 const startML = (matchNb>0) ? matches[matchNb-1].len+1 : minMatch;
+ U32 mlen;
+
+ DEBUGLOG(7, "testing match %u => offCode=%4u, mlen=%2u, llen=%2u",
+ matchNb, matches[matchNb].off, lastML, litlen);
+
+ for (mlen = lastML; mlen >= startML; mlen--) { /* scan downward */
+ U32 const pos = cur + mlen;
+ int const price = basePrice + ZSTD_getMatchPrice(offset, mlen, optStatePtr, optLevel);
+
+ if ((pos > last_pos) || (price < opt[pos].price)) {
+ DEBUGLOG(7, "rPos:%u (ml=%2u) => new better price (%.2f<%.2f)",
+ pos, mlen, ZSTD_fCost(price), ZSTD_fCost(opt[pos].price));
+ while (last_pos < pos) { opt[last_pos+1].price = ZSTD_MAX_PRICE; last_pos++; } /* fill empty positions */
+ opt[pos].mlen = mlen;
+ opt[pos].off = offset;
+ opt[pos].litlen = litlen;
+ opt[pos].price = price;
+ } else {
+ DEBUGLOG(7, "rPos:%u (ml=%2u) => new price is worse (%.2f>=%.2f)",
+ pos, mlen, ZSTD_fCost(price), ZSTD_fCost(opt[pos].price));
+ if (optLevel==0) break; /* early update abort; gets ~+10% speed for about -0.01 ratio loss */
+ }
+ } } }
+ } /* for (cur = 1; cur <= last_pos; cur++) */
+
+ lastSequence = opt[last_pos];
+ cur = last_pos > ZSTD_totalLen(lastSequence) ? last_pos - ZSTD_totalLen(lastSequence) : 0; /* single sequence, and it starts before `ip` */
+ assert(cur < ZSTD_OPT_NUM); /* control overflow*/
+
+_shortestPath: /* cur, last_pos and lastSequence have to be set */
+ assert(opt[0].mlen == 0);
+
+ /* Set the next chunk's repcodes based on the repcodes of the beginning
+ * of the last match, and the last sequence. This avoids us having to
+ * update them while traversing the sequences.
+ */
+ if (lastSequence.mlen != 0) {
+ repcodes_t reps = ZSTD_updateRep(opt[cur].rep, lastSequence.off, lastSequence.litlen==0);
+ memcpy(rep, &reps, sizeof(reps));
+ } else {
+ memcpy(rep, opt[cur].rep, sizeof(repcodes_t));
+ }
+
+ { U32 const storeEnd = cur + 1;
+ U32 storeStart = storeEnd;
+ U32 seqPos = cur;
+
+ DEBUGLOG(6, "start reverse traversal (last_pos:%u, cur:%u)",
+ last_pos, cur); (void)last_pos;
+ assert(storeEnd < ZSTD_OPT_NUM);
+ DEBUGLOG(6, "last sequence copied into pos=%u (llen=%u,mlen=%u,ofc=%u)",
+ storeEnd, lastSequence.litlen, lastSequence.mlen, lastSequence.off);
+ opt[storeEnd] = lastSequence;
+ while (seqPos > 0) {
+ U32 const backDist = ZSTD_totalLen(opt[seqPos]);
+ storeStart--;
+ DEBUGLOG(6, "sequence from rPos=%u copied into pos=%u (llen=%u,mlen=%u,ofc=%u)",
+ seqPos, storeStart, opt[seqPos].litlen, opt[seqPos].mlen, opt[seqPos].off);
+ opt[storeStart] = opt[seqPos];
+ seqPos = (seqPos > backDist) ? seqPos - backDist : 0;
+ }
+
+ /* save sequences */
+ DEBUGLOG(6, "sending selected sequences into seqStore")
+ { U32 storePos;
+ for (storePos=storeStart; storePos <= storeEnd; storePos++) {
+ U32 const llen = opt[storePos].litlen;
+ U32 const mlen = opt[storePos].mlen;
+ U32 const offCode = opt[storePos].off;
+ U32 const advance = llen + mlen;
+ DEBUGLOG(6, "considering seq starting at %zi, llen=%u, mlen=%u",
+ anchor - istart, (unsigned)llen, (unsigned)mlen);
+
+ if (mlen==0) { /* only literals => must be last "sequence", actually starting a new stream of sequences */
+ assert(storePos == storeEnd); /* must be last sequence */
+ ip = anchor + llen; /* last "sequence" is a bunch of literals => don't progress anchor */
+ continue; /* will finish */
+ }
+
+ assert(anchor + llen <= iend);
+ ZSTD_updateStats(optStatePtr, llen, anchor, offCode, mlen);
+ ZSTD_storeSeq(seqStore, llen, anchor, iend, offCode, mlen-MINMATCH);
+ anchor += advance;
+ ip = anchor;
+ } }
+ ZSTD_setBasePrices(optStatePtr, optLevel);
+ }
+ } /* while (ip < ilimit) */
+
+ /* Return the last literals size */
+ return (size_t)(iend - anchor);
+}
+
+
+size_t ZSTD_compressBlock_btopt(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ const void* src, size_t srcSize)
+{
+ DEBUGLOG(5, "ZSTD_compressBlock_btopt");
+ return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 0 /*optLevel*/, ZSTD_noDict);
+}
+
+
+/* used in 2-pass strategy */
+static U32 ZSTD_upscaleStat(unsigned* table, U32 lastEltIndex, int bonus)
+{
+ U32 s, sum=0;
+ assert(ZSTD_FREQ_DIV+bonus >= 0);
+ for (s=0; s<lastEltIndex+1; s++) {
+ table[s] <<= ZSTD_FREQ_DIV+bonus;
+ table[s]--;
+ sum += table[s];
+ }
+ return sum;
+}
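+
+/* Worked example for ZSTD_upscaleStat() (illustrative) : with
+ * ZSTD_FREQ_DIV == 4 and bonus == 0, a count of 7 becomes (7 << 4) - 1 == 111,
+ * roughly undoing the downscale in ZSTD_downscaleStat() (where 100 had become 7). */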
+
+/* used in 2-pass strategy */
+MEM_STATIC void ZSTD_upscaleStats(optState_t* optPtr)
+{
+ if (ZSTD_compressedLiterals(optPtr))
+ optPtr->litSum = ZSTD_upscaleStat(optPtr->litFreq, MaxLit, 0);
+ optPtr->litLengthSum = ZSTD_upscaleStat(optPtr->litLengthFreq, MaxLL, 0);
+ optPtr->matchLengthSum = ZSTD_upscaleStat(optPtr->matchLengthFreq, MaxML, 0);
+ optPtr->offCodeSum = ZSTD_upscaleStat(optPtr->offCodeFreq, MaxOff, 0);
+}
+
+/* ZSTD_initStats_ultra():
+ * make a first compression pass, just to seed stats with more accurate starting values.
+ * only works on first block, with no dictionary and no ldm.
+ * this function cannot error, hence its contract must be respected.
+ */
+static void
+ZSTD_initStats_ultra(ZSTD_matchState_t* ms,
+ seqStore_t* seqStore,
+ U32 rep[ZSTD_REP_NUM],
+ const void* src, size_t srcSize)
+{
+ U32 tmpRep[ZSTD_REP_NUM]; /* updated rep codes will sink here */
+ memcpy(tmpRep, rep, sizeof(tmpRep));
+
+ DEBUGLOG(4, "ZSTD_initStats_ultra (srcSize=%zu)", srcSize);
+ assert(ms->opt.litLengthSum == 0); /* first block */
+ assert(seqStore->sequences == seqStore->sequencesStart); /* no ldm */
+ assert(ms->window.dictLimit == ms->window.lowLimit); /* no dictionary */
+ assert(ms->window.dictLimit - ms->nextToUpdate <= 1); /* no prefix (note: intentional overflow, defined in two's complement) */
+
+ ZSTD_compressBlock_opt_generic(ms, seqStore, tmpRep, src, srcSize, 2 /*optLevel*/, ZSTD_noDict); /* generate stats into ms->opt*/
+
+ /* invalidate first scan from history */
+ ZSTD_resetSeqStore(seqStore);
+ ms->window.base -= srcSize;
+ ms->window.dictLimit += (U32)srcSize;
+ ms->window.lowLimit = ms->window.dictLimit;
+ ms->nextToUpdate = ms->window.dictLimit;
+
+ /* reinforce the weight of collected statistics */
+ ZSTD_upscaleStats(&ms->opt);
+}
+
+size_t ZSTD_compressBlock_btultra(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ const void* src, size_t srcSize)
+{
+ DEBUGLOG(5, "ZSTD_compressBlock_btultra (srcSize=%zu)", srcSize);
+ return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 2 /*optLevel*/, ZSTD_noDict);
+}
+
+size_t ZSTD_compressBlock_btultra2(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ const void* src, size_t srcSize)
+{
+ U32 const current = (U32)((const BYTE*)src - ms->window.base);
+ DEBUGLOG(5, "ZSTD_compressBlock_btultra2 (srcSize=%zu)", srcSize);
+
+ /* 2-pass strategy:
+ * this strategy makes a first pass over the first block to collect statistics
+ * and seed the next round's statistics with them.
+ * After the 1st pass, the function forgets everything and starts a new block.
+ * Consequently, this can only work if no data has been previously loaded into the tables,
+ * i.e. no dictionary, no prefix, no ldm preprocessing.
+ * The compression ratio gain is generally small (~0.5% on the first block),
+ * at a cost of 2x cpu time on the first block. */
+ assert(srcSize <= ZSTD_BLOCKSIZE_MAX);
+ if ( (ms->opt.litLengthSum==0) /* first block */
+ && (seqStore->sequences == seqStore->sequencesStart) /* no ldm */
+ && (ms->window.dictLimit == ms->window.lowLimit) /* no dictionary */
+ && (current == ms->window.dictLimit) /* start of frame, nothing already loaded nor skipped */
+ && (srcSize > ZSTD_PREDEF_THRESHOLD)
+ ) {
+ ZSTD_initStats_ultra(ms, seqStore, rep, src, srcSize);
+ }
+
+ return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 2 /*optLevel*/, ZSTD_noDict);
+}
+
+size_t ZSTD_compressBlock_btopt_dictMatchState(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ const void* src, size_t srcSize)
+{
+ return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 0 /*optLevel*/, ZSTD_dictMatchState);
+}
+
+size_t ZSTD_compressBlock_btultra_dictMatchState(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ const void* src, size_t srcSize)
+{
+ return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 2 /*optLevel*/, ZSTD_dictMatchState);
+}
+
+size_t ZSTD_compressBlock_btopt_extDict(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ const void* src, size_t srcSize)
+{
+ return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 0 /*optLevel*/, ZSTD_extDict);
+}
+
+size_t ZSTD_compressBlock_btultra_extDict(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ const void* src, size_t srcSize)
+{
+ return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 2 /*optLevel*/, ZSTD_extDict);
+}
+
+/* note : there is no btultra2 variant for extDict or dictMatchState,
+ * because btultra2 is not meant to work with dictionaries
+ * and only applies to the first block (no prefix) */
+/**** ended inlining compress/zstd_opt.c ****/
+
+/**** start inlining decompress/huf_decompress.c ****/
+/* ******************************************************************
+ * huff0 huffman decoder,
+ * part of Finite State Entropy library
+ * Copyright (c) 2013-2020, Yann Collet, Facebook, Inc.
+ *
+ * You can contact the author at :
+ * - FSE+HUF source repository : https://github.com/Cyan4973/FiniteStateEntropy
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+****************************************************************** */
+
+/* **************************************************************
+* Dependencies
+****************************************************************/
+#include <string.h> /* memcpy, memset */
+/**** skipping file: ../common/compiler.h ****/
+/**** skipping file: ../common/bitstream.h ****/
+/**** skipping file: ../common/fse.h ****/
+#define HUF_STATIC_LINKING_ONLY
+/**** skipping file: ../common/huf.h ****/
+/**** skipping file: ../common/error_private.h ****/
+
+/* **************************************************************
+* Macros
+****************************************************************/
+
+/* These two optional macros force the use of one or the other of the two
+ * Huffman decompression implementations. They cannot both be defined
+ * at the same time.
+ */
+#if defined(HUF_FORCE_DECOMPRESS_X1) && \
+ defined(HUF_FORCE_DECOMPRESS_X2)
+#error "Cannot force the use of the X1 and X2 decoders at the same time!"
+#endif
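+/* Illustrative note (not part of upstream huf_decompress.c): a build selects a
+ * single decoder by defining exactly one of these macros on the compiler
+ * command line, e.g. -DHUF_FORCE_DECOMPRESS_X1 to keep only the single-symbol
+ * (X1) tables, or -DHUF_FORCE_DECOMPRESS_X2 to keep only the double-symbol
+ * (X2) tables; HUF_selectDecoder() below then returns a constant accordingly. */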
+
+
+/* **************************************************************
+* Error Management
+****************************************************************/
+#define HUF_isError ERR_isError
+
+
+/* **************************************************************
+* Byte alignment for workSpace management
+****************************************************************/
+#define HUF_ALIGN(x, a) HUF_ALIGN_MASK((x), (a) - 1)
+#define HUF_ALIGN_MASK(x, mask) (((x) + (mask)) & ~(mask))
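+/* Worked example (illustrative, not part of the original source): with a
+ * power-of-2 alignment `a`, HUF_ALIGN rounds x up to the next multiple of a,
+ *     HUF_ALIGN(13, sizeof(U32)) = (13 + 3) & ~3 = 16
+ *     HUF_ALIGN(16, sizeof(U32)) = (16 + 3) & ~3 = 16
+ * which is how the workSpace carving below keeps each sub-array U32-aligned. */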
+
+
+/* **************************************************************
+* BMI2 Variant Wrappers
+****************************************************************/
+#if DYNAMIC_BMI2
+
+#define HUF_DGEN(fn) \
+ \
+ static size_t fn##_default( \
+ void* dst, size_t dstSize, \
+ const void* cSrc, size_t cSrcSize, \
+ const HUF_DTable* DTable) \
+ { \
+ return fn##_body(dst, dstSize, cSrc, cSrcSize, DTable); \
+ } \
+ \
+ static TARGET_ATTRIBUTE("bmi2") size_t fn##_bmi2( \
+ void* dst, size_t dstSize, \
+ const void* cSrc, size_t cSrcSize, \
+ const HUF_DTable* DTable) \
+ { \
+ return fn##_body(dst, dstSize, cSrc, cSrcSize, DTable); \
+ } \
+ \
+ static size_t fn(void* dst, size_t dstSize, void const* cSrc, \
+ size_t cSrcSize, HUF_DTable const* DTable, int bmi2) \
+ { \
+ if (bmi2) { \
+ return fn##_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); \
+ } \
+ return fn##_default(dst, dstSize, cSrc, cSrcSize, DTable); \
+ }
+
+#else
+
+#define HUF_DGEN(fn) \
+ static size_t fn(void* dst, size_t dstSize, void const* cSrc, \
+ size_t cSrcSize, HUF_DTable const* DTable, int bmi2) \
+ { \
+ (void)bmi2; \
+ return fn##_body(dst, dstSize, cSrc, cSrcSize, DTable); \
+ }
+
+#endif
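+/* Usage sketch (illustrative, not part of upstream huf_decompress.c): given a
+ * FORCE_INLINE_TEMPLATE body named fn##_body, an instantiation such as
+ *
+ *     HUF_DGEN(HUF_decompress1X1_usingDTable_internal)
+ *
+ * used further below emits HUF_decompress1X1_usingDTable_internal(dst, dstSize,
+ * cSrc, cSrcSize, DTable, bmi2) : on DYNAMIC_BMI2 builds it dispatches at run
+ * time between a plain and a TARGET_ATTRIBUTE("bmi2") copy of the body,
+ * otherwise it simply ignores the bmi2 flag. */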
+
+
+/*-***************************/
+/* generic DTableDesc */
+/*-***************************/
+typedef struct { BYTE maxTableLog; BYTE tableType; BYTE tableLog; BYTE reserved; } DTableDesc;
+
+static DTableDesc HUF_getDTableDesc(const HUF_DTable* table)
+{
+ DTableDesc dtd;
+ memcpy(&dtd, table, sizeof(dtd));
+ return dtd;
+}
+
+
+#ifndef HUF_FORCE_DECOMPRESS_X2
+
+/*-***************************/
+/* single-symbol decoding */
+/*-***************************/
+typedef struct { BYTE byte; BYTE nbBits; } HUF_DEltX1; /* single-symbol decoding */
+
+size_t HUF_readDTableX1_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize)
+{
+ U32 tableLog = 0;
+ U32 nbSymbols = 0;
+ size_t iSize;
+ void* const dtPtr = DTable + 1;
+ HUF_DEltX1* const dt = (HUF_DEltX1*)dtPtr;
+
+ U32* rankVal;
+ BYTE* huffWeight;
+ size_t spaceUsed32 = 0;
+
+ rankVal = (U32 *)workSpace + spaceUsed32;
+ spaceUsed32 += HUF_TABLELOG_ABSOLUTEMAX + 1;
+ huffWeight = (BYTE *)((U32 *)workSpace + spaceUsed32);
+ spaceUsed32 += HUF_ALIGN(HUF_SYMBOLVALUE_MAX + 1, sizeof(U32)) >> 2;
+
+ if ((spaceUsed32 << 2) > wkspSize) return ERROR(tableLog_tooLarge);
+
+ DEBUG_STATIC_ASSERT(sizeof(DTableDesc) == sizeof(HUF_DTable));
+ /* memset(huffWeight, 0, sizeof(huffWeight)); */ /* is not necessary, even though some analyzers complain ... */
+
+ iSize = HUF_readStats(huffWeight, HUF_SYMBOLVALUE_MAX + 1, rankVal, &nbSymbols, &tableLog, src, srcSize);
+ if (HUF_isError(iSize)) return iSize;
+
+ /* Table header */
+ { DTableDesc dtd = HUF_getDTableDesc(DTable);
+ if (tableLog > (U32)(dtd.maxTableLog+1)) return ERROR(tableLog_tooLarge); /* DTable is too small; the Huffman tree cannot fit in it */
+ dtd.tableType = 0;
+ dtd.tableLog = (BYTE)tableLog;
+ memcpy(DTable, &dtd, sizeof(dtd));
+ }
+
+ /* Calculate starting value for each rank */
+ { U32 n, nextRankStart = 0;
+ for (n=1; n<tableLog+1; n++) {
+ U32 const current = nextRankStart;
+ nextRankStart += (rankVal[n] << (n-1));
+ rankVal[n] = current;
+ } }
+
+ /* fill DTable */
+ { U32 n;
+ size_t const nEnd = nbSymbols;
+ for (n=0; n<nEnd; n++) {
+ size_t const w = huffWeight[n];
+ size_t const length = (1 << w) >> 1;
+ size_t const uStart = rankVal[w];
+ size_t const uEnd = uStart + length;
+ size_t u;
+ HUF_DEltX1 D;
+ D.byte = (BYTE)n;
+ D.nbBits = (BYTE)(tableLog + 1 - w);
+ rankVal[w] = (U32)uEnd;
+ if (length < 4) {
+ /* Use length in the loop bound so the compiler knows it is short. */
+ for (u = 0; u < length; ++u)
+ dt[uStart + u] = D;
+ } else {
+ /* Unroll the loop 4 times; the length is a power of 2 (and >= 4), so it is a multiple of 4. */
+ for (u = uStart; u < uEnd; u += 4) {
+ dt[u + 0] = D;
+ dt[u + 1] = D;
+ dt[u + 2] = D;
+ dt[u + 3] = D;
+ } } } }
+ return iSize;
+}
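+/* Worked example (illustrative): for a table built with tableLog == 3, a
+ * symbol of weight w occupies (1 << w) >> 1 consecutive DTable cells and is
+ * decoded in (tableLog + 1 - w) bits :
+ *     w == 3  ->  4 cells, 1 bit        w == 1  ->  1 cell, 3 bits
+ * so more frequent (heavier) symbols cover more of the 2^tableLog table and
+ * cost fewer bits per lookup, exactly as the fill loop above arranges. */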
+
+size_t HUF_readDTableX1(HUF_DTable* DTable, const void* src, size_t srcSize)
+{
+ U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
+ return HUF_readDTableX1_wksp(DTable, src, srcSize,
+ workSpace, sizeof(workSpace));
+}
+
+FORCE_INLINE_TEMPLATE BYTE
+HUF_decodeSymbolX1(BIT_DStream_t* Dstream, const HUF_DEltX1* dt, const U32 dtLog)
+{
+ size_t const val = BIT_lookBitsFast(Dstream, dtLog); /* note : dtLog >= 1 */
+ BYTE const c = dt[val].byte;
+ BIT_skipBits(Dstream, dt[val].nbBits);
+ return c;
+}
+
+#define HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr) \
+ *ptr++ = HUF_decodeSymbolX1(DStreamPtr, dt, dtLog)
+
+#define HUF_DECODE_SYMBOLX1_1(ptr, DStreamPtr) \
+ if (MEM_64bits() || (HUF_TABLELOG_MAX<=12)) \
+ HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr)
+
+#define HUF_DECODE_SYMBOLX1_2(ptr, DStreamPtr) \
+ if (MEM_64bits()) \
+ HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr)
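+/* Cadence sketch (illustrative commentary, not from upstream): between two
+ * bitstream reloads the bit container must hold enough bits for every lookup
+ * performed. The _2 variant therefore decodes only on 64-bit targets, the _1
+ * variant also on 32-bit targets when HUF_TABLELOG_MAX <= 12 (two lookups of
+ * at most 12 bits still fit in what a reload guarantees), and _0 always
+ * decodes. The loops below interleave them as _2,_1,_2,_0, so 64-bit builds
+ * get up to 4 symbols per reload while 32-bit builds still progress safely. */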
+
+HINT_INLINE size_t
+HUF_decodeStreamX1(BYTE* p, BIT_DStream_t* const bitDPtr, BYTE* const pEnd, const HUF_DEltX1* const dt, const U32 dtLog)
+{
+ BYTE* const pStart = p;
+
+ /* up to 4 symbols at a time */
+ while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd-3)) {
+ HUF_DECODE_SYMBOLX1_2(p, bitDPtr);
+ HUF_DECODE_SYMBOLX1_1(p, bitDPtr);
+ HUF_DECODE_SYMBOLX1_2(p, bitDPtr);
+ HUF_DECODE_SYMBOLX1_0(p, bitDPtr);
+ }
+
+ /* [0-3] symbols remaining */
+ if (MEM_32bits())
+ while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd))
+ HUF_DECODE_SYMBOLX1_0(p, bitDPtr);
+
+ /* no more data to retrieve from bitstream, no need to reload */
+ while (p < pEnd)
+ HUF_DECODE_SYMBOLX1_0(p, bitDPtr);
+
+ return pEnd-pStart;
+}
+
+FORCE_INLINE_TEMPLATE size_t
+HUF_decompress1X1_usingDTable_internal_body(
+ void* dst, size_t dstSize,
+ const void* cSrc, size_t cSrcSize,
+ const HUF_DTable* DTable)
+{
+ BYTE* op = (BYTE*)dst;
+ BYTE* const oend = op + dstSize;
+ const void* dtPtr = DTable + 1;
+ const HUF_DEltX1* const dt = (const HUF_DEltX1*)dtPtr;
+ BIT_DStream_t bitD;
+ DTableDesc const dtd = HUF_getDTableDesc(DTable);
+ U32 const dtLog = dtd.tableLog;
+
+ CHECK_F( BIT_initDStream(&bitD, cSrc, cSrcSize) );
+
+ HUF_decodeStreamX1(op, &bitD, oend, dt, dtLog);
+
+ if (!BIT_endOfDStream(&bitD)) return ERROR(corruption_detected);
+
+ return dstSize;
+}
+
+FORCE_INLINE_TEMPLATE size_t
+HUF_decompress4X1_usingDTable_internal_body(
+ void* dst, size_t dstSize,
+ const void* cSrc, size_t cSrcSize,
+ const HUF_DTable* DTable)
+{
+ /* Check */
+ if (cSrcSize < 10) return ERROR(corruption_detected); /* strict minimum : jump table + 1 byte per stream */
+
+ { const BYTE* const istart = (const BYTE*) cSrc;
+ BYTE* const ostart = (BYTE*) dst;
+ BYTE* const oend = ostart + dstSize;
+ BYTE* const olimit = oend - 3;
+ const void* const dtPtr = DTable + 1;
+ const HUF_DEltX1* const dt = (const HUF_DEltX1*)dtPtr;
+
+ /* Init */
+ BIT_DStream_t bitD1;
+ BIT_DStream_t bitD2;
+ BIT_DStream_t bitD3;
+ BIT_DStream_t bitD4;
+ size_t const length1 = MEM_readLE16(istart);
+ size_t const length2 = MEM_readLE16(istart+2);
+ size_t const length3 = MEM_readLE16(istart+4);
+ size_t const length4 = cSrcSize - (length1 + length2 + length3 + 6);
+ const BYTE* const istart1 = istart + 6; /* jumpTable */
+ const BYTE* const istart2 = istart1 + length1;
+ const BYTE* const istart3 = istart2 + length2;
+ const BYTE* const istart4 = istart3 + length3;
+ const size_t segmentSize = (dstSize+3) / 4;
+ BYTE* const opStart2 = ostart + segmentSize;
+ BYTE* const opStart3 = opStart2 + segmentSize;
+ BYTE* const opStart4 = opStart3 + segmentSize;
+ BYTE* op1 = ostart;
+ BYTE* op2 = opStart2;
+ BYTE* op3 = opStart3;
+ BYTE* op4 = opStart4;
+ DTableDesc const dtd = HUF_getDTableDesc(DTable);
+ U32 const dtLog = dtd.tableLog;
+ U32 endSignal = 1;
+
+ if (length4 > cSrcSize) return ERROR(corruption_detected); /* overflow */
+ CHECK_F( BIT_initDStream(&bitD1, istart1, length1) );
+ CHECK_F( BIT_initDStream(&bitD2, istart2, length2) );
+ CHECK_F( BIT_initDStream(&bitD3, istart3, length3) );
+ CHECK_F( BIT_initDStream(&bitD4, istart4, length4) );
+
+ /* up to 16 symbols per loop (4 symbols per stream) in 64-bit mode */
+ for ( ; (endSignal) & (op4 < olimit) ; ) {
+ HUF_DECODE_SYMBOLX1_2(op1, &bitD1);
+ HUF_DECODE_SYMBOLX1_2(op2, &bitD2);
+ HUF_DECODE_SYMBOLX1_2(op3, &bitD3);
+ HUF_DECODE_SYMBOLX1_2(op4, &bitD4);
+ HUF_DECODE_SYMBOLX1_1(op1, &bitD1);
+ HUF_DECODE_SYMBOLX1_1(op2, &bitD2);
+ HUF_DECODE_SYMBOLX1_1(op3, &bitD3);
+ HUF_DECODE_SYMBOLX1_1(op4, &bitD4);
+ HUF_DECODE_SYMBOLX1_2(op1, &bitD1);
+ HUF_DECODE_SYMBOLX1_2(op2, &bitD2);
+ HUF_DECODE_SYMBOLX1_2(op3, &bitD3);
+ HUF_DECODE_SYMBOLX1_2(op4, &bitD4);
+ HUF_DECODE_SYMBOLX1_0(op1, &bitD1);
+ HUF_DECODE_SYMBOLX1_0(op2, &bitD2);
+ HUF_DECODE_SYMBOLX1_0(op3, &bitD3);
+ HUF_DECODE_SYMBOLX1_0(op4, &bitD4);
+ endSignal &= BIT_reloadDStreamFast(&bitD1) == BIT_DStream_unfinished;
+ endSignal &= BIT_reloadDStreamFast(&bitD2) == BIT_DStream_unfinished;
+ endSignal &= BIT_reloadDStreamFast(&bitD3) == BIT_DStream_unfinished;
+ endSignal &= BIT_reloadDStreamFast(&bitD4) == BIT_DStream_unfinished;
+ }
+
+ /* check corruption */
+ /* note : these checks should not be necessary : the op# pointers advance in lock step, and op4 is controlled by the loop.
+ *        Curiously though, binaries generated by gcc 7.2 & 7.3 with -mbmi2 run faster when at least one test is present */
+ if (op1 > opStart2) return ERROR(corruption_detected);
+ if (op2 > opStart3) return ERROR(corruption_detected);
+ if (op3 > opStart4) return ERROR(corruption_detected);
+ /* note : op4 is assumed to have been verified within the main loop already */
+
+ /* finish bitStreams one by one */
+ HUF_decodeStreamX1(op1, &bitD1, opStart2, dt, dtLog);
+ HUF_decodeStreamX1(op2, &bitD2, opStart3, dt, dtLog);
+ HUF_decodeStreamX1(op3, &bitD3, opStart4, dt, dtLog);
+ HUF_decodeStreamX1(op4, &bitD4, oend, dt, dtLog);
+
+ /* check */
+ { U32 const endCheck = BIT_endOfDStream(&bitD1) & BIT_endOfDStream(&bitD2) & BIT_endOfDStream(&bitD3) & BIT_endOfDStream(&bitD4);
+ if (!endCheck) return ERROR(corruption_detected); }
+
+ /* decoded size */
+ return dstSize;
+ }
+}
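+/* Framing sketch for the 4-stream layout decoded above (illustrative, derived
+ * from the code rather than quoted from a normative spec):
+ *
+ *     bytes 0-1 : length1 (little-endian U16)
+ *     bytes 2-3 : length2 (little-endian U16)
+ *     bytes 4-5 : length3 (little-endian U16)
+ *     bytes 6-  : stream1 | stream2 | stream3 | stream4
+ *
+ * length4 is implicit (cSrcSize - 6 - length1 - length2 - length3), each stream
+ * regenerates one quarter of dst (segmentSize = (dstSize+3)/4), and the strict
+ * minimum input of 10 bytes is the 6-byte jump table plus 1 byte per stream. */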
+
+
+typedef size_t (*HUF_decompress_usingDTable_t)(void *dst, size_t dstSize,
+ const void *cSrc,
+ size_t cSrcSize,
+ const HUF_DTable *DTable);
+
+HUF_DGEN(HUF_decompress1X1_usingDTable_internal)
+HUF_DGEN(HUF_decompress4X1_usingDTable_internal)
+
+
+
+size_t HUF_decompress1X1_usingDTable(
+ void* dst, size_t dstSize,
+ const void* cSrc, size_t cSrcSize,
+ const HUF_DTable* DTable)
+{
+ DTableDesc dtd = HUF_getDTableDesc(DTable);
+ if (dtd.tableType != 0) return ERROR(GENERIC);
+ return HUF_decompress1X1_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
+}
+
+size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* DCtx, void* dst, size_t dstSize,
+ const void* cSrc, size_t cSrcSize,
+ void* workSpace, size_t wkspSize)
+{
+ const BYTE* ip = (const BYTE*) cSrc;
+
+ size_t const hSize = HUF_readDTableX1_wksp(DCtx, cSrc, cSrcSize, workSpace, wkspSize);
+ if (HUF_isError(hSize)) return hSize;
+ if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
+ ip += hSize; cSrcSize -= hSize;
+
+ return HUF_decompress1X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx, /* bmi2 */ 0);
+}
+
+
+size_t HUF_decompress1X1_DCtx(HUF_DTable* DCtx, void* dst, size_t dstSize,
+ const void* cSrc, size_t cSrcSize)
+{
+ U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
+ return HUF_decompress1X1_DCtx_wksp(DCtx, dst, dstSize, cSrc, cSrcSize,
+ workSpace, sizeof(workSpace));
+}
+
+size_t HUF_decompress1X1 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+{
+ HUF_CREATE_STATIC_DTABLEX1(DTable, HUF_TABLELOG_MAX);
+ return HUF_decompress1X1_DCtx (DTable, dst, dstSize, cSrc, cSrcSize);
+}
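+#if 0
+/* Usage sketch (illustrative only, compiled out, not part of upstream code):
+ * one-shot single-stream decoding with a caller-provided workspace, mirroring
+ * the wrappers above. dstSize must be the exact regenerated size; huff0 cannot
+ * recover it from the compressed data alone. */
+static size_t example_decompress1X1(void* dst, size_t dstSize,
+                                    const void* cSrc, size_t cSrcSize)
+{
+    HUF_CREATE_STATIC_DTABLEX1(DTable, HUF_TABLELOG_MAX);
+    U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
+    return HUF_decompress1X1_DCtx_wksp(DTable, dst, dstSize, cSrc, cSrcSize,
+                                       workSpace, sizeof(workSpace));
+}
+#endif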
+
+size_t HUF_decompress4X1_usingDTable(
+ void* dst, size_t dstSize,
+ const void* cSrc, size_t cSrcSize,
+ const HUF_DTable* DTable)
+{
+ DTableDesc dtd = HUF_getDTableDesc(DTable);
+ if (dtd.tableType != 0) return ERROR(GENERIC);
+ return HUF_decompress4X1_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
+}
+
+static size_t HUF_decompress4X1_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize,
+ const void* cSrc, size_t cSrcSize,
+ void* workSpace, size_t wkspSize, int bmi2)
+{
+ const BYTE* ip = (const BYTE*) cSrc;
+
+ size_t const hSize = HUF_readDTableX1_wksp (dctx, cSrc, cSrcSize,
+ workSpace, wkspSize);
+ if (HUF_isError(hSize)) return hSize;
+ if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
+ ip += hSize; cSrcSize -= hSize;
+
+ return HUF_decompress4X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2);
+}
+
+size_t HUF_decompress4X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
+ const void* cSrc, size_t cSrcSize,
+ void* workSpace, size_t wkspSize)
+{
+ return HUF_decompress4X1_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, 0);
+}
+
+
+size_t HUF_decompress4X1_DCtx (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+{
+ U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
+ return HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize,
+ workSpace, sizeof(workSpace));
+}
+size_t HUF_decompress4X1 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+{
+ HUF_CREATE_STATIC_DTABLEX1(DTable, HUF_TABLELOG_MAX);
+ return HUF_decompress4X1_DCtx(DTable, dst, dstSize, cSrc, cSrcSize);
+}
+
+#endif /* HUF_FORCE_DECOMPRESS_X2 */
+
+
+#ifndef HUF_FORCE_DECOMPRESS_X1
+
+/* *************************/
+/* double-symbols decoding */
+/* *************************/
+
+typedef struct { U16 sequence; BYTE nbBits; BYTE length; } HUF_DEltX2; /* double-symbols decoding */
+typedef struct { BYTE symbol; BYTE weight; } sortedSymbol_t;
+typedef U32 rankValCol_t[HUF_TABLELOG_MAX + 1];
+typedef rankValCol_t rankVal_t[HUF_TABLELOG_MAX];
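+/* Cell layout sketch (illustrative): each HUF_DEltX2 packs up to two decoded
+ * symbols little-endian in `sequence`, the total number of bits the pair
+ * consumes in `nbBits`, and the number of bytes to emit (1 or 2) in `length`.
+ * HUF_decodeSymbolX2() below memcpy's the 2-byte `sequence` to the output and
+ * advances by `length`, which is what makes the double-symbol variant fast. */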
+
+
+/* HUF_fillDTableX2Level2() :
+ * `rankValOrigin` must be a table of at least (HUF_TABLELOG_MAX + 1) U32 */
+static void HUF_fillDTableX2Level2(HUF_DEltX2* DTable, U32 sizeLog, const U32 consumed,
+ const U32* rankValOrigin, const int minWeight,
+ const sortedSymbol_t* sortedSymbols, const U32 sortedListSize,
+ U32 nbBitsBaseline, U16 baseSeq)
+{
+ HUF_DEltX2 DElt;
+ U32 rankVal[HUF_TABLELOG_MAX + 1];
+
+ /* get pre-calculated rankVal */
+ memcpy(rankVal, rankValOrigin, sizeof(rankVal));
+
+ /* fill skipped values */
+ if (minWeight>1) {
+ U32 i, skipSize = rankVal[minWeight];
+ MEM_writeLE16(&(DElt.sequence), baseSeq);
+ DElt.nbBits = (BYTE)(consumed);
+ DElt.length = 1;
+ for (i = 0; i < skipSize; i++)
+ DTable[i] = DElt;
+ }
+
+ /* fill DTable */
+ { U32 s; for (s=0; s<sortedListSize; s++) { /* note : sortedSymbols already skipped */
+ const U32 symbol = sortedSymbols[s].symbol;
+ const U32 weight = sortedSymbols[s].weight;
+ const U32 nbBits = nbBitsBaseline - weight;
+ const U32 length = 1 << (sizeLog-nbBits);
+ const U32 start = rankVal[weight];
+ U32 i = start;
+ const U32 end = start + length;
+
+ MEM_writeLE16(&(DElt.sequence), (U16)(baseSeq + (symbol << 8)));
+ DElt.nbBits = (BYTE)(nbBits + consumed);
+ DElt.length = 2;
+ do { DTable[i++] = DElt; } while (i<end); /* since length >= 1 */
+
+ rankVal[weight] += length;
+ } }
+}
+
+
+static void HUF_fillDTableX2(HUF_DEltX2* DTable, const U32 targetLog,
+ const sortedSymbol_t* sortedList, const U32 sortedListSize,
+ const U32* rankStart, rankVal_t rankValOrigin, const U32 maxWeight,
+ const U32 nbBitsBaseline)
+{
+ U32 rankVal[HUF_TABLELOG_MAX + 1];
+ const int scaleLog = nbBitsBaseline - targetLog; /* note : targetLog >= srcLog, hence scaleLog <= 1 */
+ const U32 minBits = nbBitsBaseline - maxWeight;
+ U32 s;
+
+ memcpy(rankVal, rankValOrigin, sizeof(rankVal));
+
+ /* fill DTable */
+ for (s=0; s<sortedListSize; s++) {
+ const U16 symbol = sortedList[s].symbol;
+ const U32 weight = sortedList[s].weight;
+ const U32 nbBits = nbBitsBaseline - weight;
+ const U32 start = rankVal[weight];
+ const U32 length = 1 << (targetLog-nbBits);
+
+ if (targetLog-nbBits >= minBits) { /* enough room for a second symbol */
+ U32 sortedRank;
+ int minWeight = nbBits + scaleLog;
+ if (minWeight < 1) minWeight = 1;
+ sortedRank = rankStart[minWeight];
+ HUF_fillDTableX2Level2(DTable+start, targetLog-nbBits, nbBits,
+ rankValOrigin[nbBits], minWeight,
+ sortedList+sortedRank, sortedListSize-sortedRank,
+ nbBitsBaseline, symbol);
+ } else {
+ HUF_DEltX2 DElt;
+ MEM_writeLE16(&(DElt.sequence), symbol);
+ DElt.nbBits = (BYTE)(nbBits);
+ DElt.length = 1;
+ { U32 const end = start + length;
+ U32 u;
+ for (u = start; u < end; u++) DTable[u] = DElt;
+ } }
+ rankVal[weight] += length;
+ }
+}
+
+size_t HUF_readDTableX2_wksp(HUF_DTable* DTable,
+ const void* src, size_t srcSize,
+ void* workSpace, size_t wkspSize)
+{
+ U32 tableLog, maxW, sizeOfSort, nbSymbols;
+ DTableDesc dtd = HUF_getDTableDesc(DTable);
+ U32 const maxTableLog = dtd.maxTableLog;
+ size_t iSize;
+ void* dtPtr = DTable+1; /* force compiler to avoid strict-aliasing */
+ HUF_DEltX2* const dt = (HUF_DEltX2*)dtPtr;
+ U32 *rankStart;
+
+ rankValCol_t* rankVal;
+ U32* rankStats;
+ U32* rankStart0;
+ sortedSymbol_t* sortedSymbol;
+ BYTE* weightList;
+ size_t spaceUsed32 = 0;
+
+ rankVal = (rankValCol_t *)((U32 *)workSpace + spaceUsed32);
+ spaceUsed32 += (sizeof(rankValCol_t) * HUF_TABLELOG_MAX) >> 2;
+ rankStats = (U32 *)workSpace + spaceUsed32;
+ spaceUsed32 += HUF_TABLELOG_MAX + 1;
+ rankStart0 = (U32 *)workSpace + spaceUsed32;
+ spaceUsed32 += HUF_TABLELOG_MAX + 2;
+ sortedSymbol = (sortedSymbol_t *)workSpace + (spaceUsed32 * sizeof(U32)) / sizeof(sortedSymbol_t);
+ spaceUsed32 += HUF_ALIGN(sizeof(sortedSymbol_t) * (HUF_SYMBOLVALUE_MAX + 1), sizeof(U32)) >> 2;
+ weightList = (BYTE *)((U32 *)workSpace + spaceUsed32);
+ spaceUsed32 += HUF_ALIGN(HUF_SYMBOLVALUE_MAX + 1, sizeof(U32)) >> 2;
+
+ if ((spaceUsed32 << 2) > wkspSize) return ERROR(tableLog_tooLarge);
+
+ rankStart = rankStart0 + 1;
+ memset(rankStats, 0, sizeof(U32) * (2 * HUF_TABLELOG_MAX + 2 + 1));
+
+ DEBUG_STATIC_ASSERT(sizeof(HUF_DEltX2) == sizeof(HUF_DTable)); /* if compiler fails here, assertion is wrong */
+ if (maxTableLog > HUF_TABLELOG_MAX) return ERROR(tableLog_tooLarge);
+ /* memset(weightList, 0, sizeof(weightList)); */ /* is not necessary, even though some analyzers complain ... */
+
+ iSize = HUF_readStats(weightList, HUF_SYMBOLVALUE_MAX + 1, rankStats, &nbSymbols, &tableLog, src, srcSize);
+ if (HUF_isError(iSize)) return iSize;
+
+ /* check result */
+ if (tableLog > maxTableLog) return ERROR(tableLog_tooLarge); /* DTable can't fit code depth */
+
+ /* find maxWeight */
+ for (maxW = tableLog; rankStats[maxW]==0; maxW--) {} /* necessarily finds a solution before 0 */
+
+ /* Get start index of each weight */
+ { U32 w, nextRankStart = 0;
+ for (w=1; w<maxW+1; w++) {
+ U32 current = nextRankStart;
+ nextRankStart += rankStats[w];
+ rankStart[w] = current;
+ }
+ rankStart[0] = nextRankStart; /* put all 0w symbols at the end of sorted list*/
+ sizeOfSort = nextRankStart;
+ }
+
+ /* sort symbols by weight */
+ { U32 s;
+ for (s=0; s<nbSymbols; s++) {
+ U32 const w = weightList[s];
+ U32 const r = rankStart[w]++;
+ sortedSymbol[r].symbol = (BYTE)s;
+ sortedSymbol[r].weight = (BYTE)w;
+ }
+ rankStart[0] = 0; /* forget 0w symbols; this is beginning of weight(1) */
+ }
+
+ /* Build rankVal */
+ { U32* const rankVal0 = rankVal[0];
+ { int const rescale = (maxTableLog-tableLog) - 1; /* tableLog <= maxTableLog */
+ U32 nextRankVal = 0;
+ U32 w;
+ for (w=1; w<maxW+1; w++) {
+ U32 current = nextRankVal;
+ nextRankVal += rankStats[w] << (w+rescale);
+ rankVal0[w] = current;
+ } }
+ { U32 const minBits = tableLog+1 - maxW;
+ U32 consumed;
+ for (consumed = minBits; consumed < maxTableLog - minBits + 1; consumed++) {
+ U32* const rankValPtr = rankVal[consumed];
+ U32 w;
+ for (w = 1; w < maxW+1; w++) {
+ rankValPtr[w] = rankVal0[w] >> consumed;
+ } } } }
+
+ HUF_fillDTableX2(dt, maxTableLog,
+ sortedSymbol, sizeOfSort,
+ rankStart0, rankVal, maxW,
+ tableLog+1);
+
+ dtd.tableLog = (BYTE)maxTableLog;
+ dtd.tableType = 1;
+ memcpy(DTable, &dtd, sizeof(dtd));
+ return iSize;
+}
+
+size_t HUF_readDTableX2(HUF_DTable* DTable, const void* src, size_t srcSize)
+{
+ U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
+ return HUF_readDTableX2_wksp(DTable, src, srcSize,
+ workSpace, sizeof(workSpace));
+}
+
+
+FORCE_INLINE_TEMPLATE U32
+HUF_decodeSymbolX2(void* op, BIT_DStream_t* DStream, const HUF_DEltX2* dt, const U32 dtLog)
+{
+ size_t const val = BIT_lookBitsFast(DStream, dtLog); /* note : dtLog >= 1 */
+ memcpy(op, dt+val, 2);
+ BIT_skipBits(DStream, dt[val].nbBits);
+ return dt[val].length;
+}
+
+FORCE_INLINE_TEMPLATE U32
+HUF_decodeLastSymbolX2(void* op, BIT_DStream_t* DStream, const HUF_DEltX2* dt, const U32 dtLog)
+{
+ size_t const val = BIT_lookBitsFast(DStream, dtLog); /* note : dtLog >= 1 */
+ memcpy(op, dt+val, 1);
+ if (dt[val].length==1) BIT_skipBits(DStream, dt[val].nbBits);
+ else {
+ if (DStream->bitsConsumed < (sizeof(DStream->bitContainer)*8)) {
+ BIT_skipBits(DStream, dt[val].nbBits);
+ if (DStream->bitsConsumed > (sizeof(DStream->bitContainer)*8))
+ /* ugly hack; works only because it's the last symbol. Note : can't easily extract nbBits from just this symbol */
+ DStream->bitsConsumed = (sizeof(DStream->bitContainer)*8);
+ } }
+ return 1;
+}
+
+#define HUF_DECODE_SYMBOLX2_0(ptr, DStreamPtr) \
+ ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog)
+
+#define HUF_DECODE_SYMBOLX2_1(ptr, DStreamPtr) \
+ if (MEM_64bits() || (HUF_TABLELOG_MAX<=12)) \
+ ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog)
+
+#define HUF_DECODE_SYMBOLX2_2(ptr, DStreamPtr) \
+ if (MEM_64bits()) \
+ ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog)
+
+HINT_INLINE size_t
+HUF_decodeStreamX2(BYTE* p, BIT_DStream_t* bitDPtr, BYTE* const pEnd,
+ const HUF_DEltX2* const dt, const U32 dtLog)
+{
+ BYTE* const pStart = p;
+
+ /* up to 8 symbols at a time */
+ while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd-(sizeof(bitDPtr->bitContainer)-1))) {
+ HUF_DECODE_SYMBOLX2_2(p, bitDPtr);
+ HUF_DECODE_SYMBOLX2_1(p, bitDPtr);
+ HUF_DECODE_SYMBOLX2_2(p, bitDPtr);
+ HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
+ }
+
+ /* closer to end : up to 2 symbols at a time */
+ while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p <= pEnd-2))
+ HUF_DECODE_SYMBOLX2_0(p, bitDPtr);
+
+ while (p <= pEnd-2)
+ HUF_DECODE_SYMBOLX2_0(p, bitDPtr); /* no need to reload : reached the end of DStream */
+
+ if (p < pEnd)
+ p += HUF_decodeLastSymbolX2(p, bitDPtr, dt, dtLog);
+
+ return p-pStart;
+}
+
+FORCE_INLINE_TEMPLATE size_t
+HUF_decompress1X2_usingDTable_internal_body(
+ void* dst, size_t dstSize,
+ const void* cSrc, size_t cSrcSize,
+ const HUF_DTable* DTable)
+{
+ BIT_DStream_t bitD;
+
+ /* Init */
+ CHECK_F( BIT_initDStream(&bitD, cSrc, cSrcSize) );
+
+ /* decode */
+ { BYTE* const ostart = (BYTE*) dst;
+ BYTE* const oend = ostart + dstSize;
+ const void* const dtPtr = DTable+1; /* force compiler to not use strict-aliasing */
+ const HUF_DEltX2* const dt = (const HUF_DEltX2*)dtPtr;
+ DTableDesc const dtd = HUF_getDTableDesc(DTable);
+ HUF_decodeStreamX2(ostart, &bitD, oend, dt, dtd.tableLog);
+ }
+
+ /* check */
+ if (!BIT_endOfDStream(&bitD)) return ERROR(corruption_detected);
+
+ /* decoded size */
+ return dstSize;
+}
+
+FORCE_INLINE_TEMPLATE size_t
+HUF_decompress4X2_usingDTable_internal_body(
+ void* dst, size_t dstSize,
+ const void* cSrc, size_t cSrcSize,
+ const HUF_DTable* DTable)
+{
+ if (cSrcSize < 10) return ERROR(corruption_detected); /* strict minimum : jump table + 1 byte per stream */
+
+ { const BYTE* const istart = (const BYTE*) cSrc;
+ BYTE* const ostart = (BYTE*) dst;
+ BYTE* const oend = ostart + dstSize;
+ BYTE* const olimit = oend - (sizeof(size_t)-1);
+ const void* const dtPtr = DTable+1;
+ const HUF_DEltX2* const dt = (const HUF_DEltX2*)dtPtr;
+
+ /* Init */
+ BIT_DStream_t bitD1;
+ BIT_DStream_t bitD2;
+ BIT_DStream_t bitD3;
+ BIT_DStream_t bitD4;
+ size_t const length1 = MEM_readLE16(istart);
+ size_t const length2 = MEM_readLE16(istart+2);
+ size_t const length3 = MEM_readLE16(istart+4);
+ size_t const length4 = cSrcSize - (length1 + length2 + length3 + 6);
+ const BYTE* const istart1 = istart + 6; /* jumpTable */
+ const BYTE* const istart2 = istart1 + length1;
+ const BYTE* const istart3 = istart2 + length2;
+ const BYTE* const istart4 = istart3 + length3;
+ size_t const segmentSize = (dstSize+3) / 4;
+ BYTE* const opStart2 = ostart + segmentSize;
+ BYTE* const opStart3 = opStart2 + segmentSize;
+ BYTE* const opStart4 = opStart3 + segmentSize;
+ BYTE* op1 = ostart;
+ BYTE* op2 = opStart2;
+ BYTE* op3 = opStart3;
+ BYTE* op4 = opStart4;
+ U32 endSignal = 1;
+ DTableDesc const dtd = HUF_getDTableDesc(DTable);
+ U32 const dtLog = dtd.tableLog;
+
+ if (length4 > cSrcSize) return ERROR(corruption_detected); /* overflow */
+ CHECK_F( BIT_initDStream(&bitD1, istart1, length1) );
+ CHECK_F( BIT_initDStream(&bitD2, istart2, length2) );
+ CHECK_F( BIT_initDStream(&bitD3, istart3, length3) );
+ CHECK_F( BIT_initDStream(&bitD4, istart4, length4) );
+
+ /* 16-32 symbols per loop (4-8 symbols per stream) */
+ for ( ; (endSignal) & (op4 < olimit); ) {
+#if defined(__clang__) && (defined(__x86_64__) || defined(__i386__))
+ HUF_DECODE_SYMBOLX2_2(op1, &bitD1);
+ HUF_DECODE_SYMBOLX2_1(op1, &bitD1);
+ HUF_DECODE_SYMBOLX2_2(op1, &bitD1);
+ HUF_DECODE_SYMBOLX2_0(op1, &bitD1);
+ HUF_DECODE_SYMBOLX2_2(op2, &bitD2);
+ HUF_DECODE_SYMBOLX2_1(op2, &bitD2);
+ HUF_DECODE_SYMBOLX2_2(op2, &bitD2);
+ HUF_DECODE_SYMBOLX2_0(op2, &bitD2);
+ endSignal &= BIT_reloadDStreamFast(&bitD1) == BIT_DStream_unfinished;
+ endSignal &= BIT_reloadDStreamFast(&bitD2) == BIT_DStream_unfinished;
+ HUF_DECODE_SYMBOLX2_2(op3, &bitD3);
+ HUF_DECODE_SYMBOLX2_1(op3, &bitD3);
+ HUF_DECODE_SYMBOLX2_2(op3, &bitD3);
+ HUF_DECODE_SYMBOLX2_0(op3, &bitD3);
+ HUF_DECODE_SYMBOLX2_2(op4, &bitD4);
+ HUF_DECODE_SYMBOLX2_1(op4, &bitD4);
+ HUF_DECODE_SYMBOLX2_2(op4, &bitD4);
+ HUF_DECODE_SYMBOLX2_0(op4, &bitD4);
+ endSignal &= BIT_reloadDStreamFast(&bitD3) == BIT_DStream_unfinished;
+ endSignal &= BIT_reloadDStreamFast(&bitD4) == BIT_DStream_unfinished;
+#else
+ HUF_DECODE_SYMBOLX2_2(op1, &bitD1);
+ HUF_DECODE_SYMBOLX2_2(op2, &bitD2);
+ HUF_DECODE_SYMBOLX2_2(op3, &bitD3);
+ HUF_DECODE_SYMBOLX2_2(op4, &bitD4);
+ HUF_DECODE_SYMBOLX2_1(op1, &bitD1);
+ HUF_DECODE_SYMBOLX2_1(op2, &bitD2);
+ HUF_DECODE_SYMBOLX2_1(op3, &bitD3);
+ HUF_DECODE_SYMBOLX2_1(op4, &bitD4);
+ HUF_DECODE_SYMBOLX2_2(op1, &bitD1);
+ HUF_DECODE_SYMBOLX2_2(op2, &bitD2);
+ HUF_DECODE_SYMBOLX2_2(op3, &bitD3);
+ HUF_DECODE_SYMBOLX2_2(op4, &bitD4);
+ HUF_DECODE_SYMBOLX2_0(op1, &bitD1);
+ HUF_DECODE_SYMBOLX2_0(op2, &bitD2);
+ HUF_DECODE_SYMBOLX2_0(op3, &bitD3);
+ HUF_DECODE_SYMBOLX2_0(op4, &bitD4);
+ endSignal = (U32)LIKELY(
+ (BIT_reloadDStreamFast(&bitD1) == BIT_DStream_unfinished)
+ & (BIT_reloadDStreamFast(&bitD2) == BIT_DStream_unfinished)
+ & (BIT_reloadDStreamFast(&bitD3) == BIT_DStream_unfinished)
+ & (BIT_reloadDStreamFast(&bitD4) == BIT_DStream_unfinished));
+#endif
+ }
+
+ /* check corruption */
+ if (op1 > opStart2) return ERROR(corruption_detected);
+ if (op2 > opStart3) return ERROR(corruption_detected);
+ if (op3 > opStart4) return ERROR(corruption_detected);
+ /* note : op4 already verified within main loop */
+
+ /* finish bitStreams one by one */
+ HUF_decodeStreamX2(op1, &bitD1, opStart2, dt, dtLog);
+ HUF_decodeStreamX2(op2, &bitD2, opStart3, dt, dtLog);
+ HUF_decodeStreamX2(op3, &bitD3, opStart4, dt, dtLog);
+ HUF_decodeStreamX2(op4, &bitD4, oend, dt, dtLog);
+
+ /* check */
+ { U32 const endCheck = BIT_endOfDStream(&bitD1) & BIT_endOfDStream(&bitD2) & BIT_endOfDStream(&bitD3) & BIT_endOfDStream(&bitD4);
+ if (!endCheck) return ERROR(corruption_detected); }
+
+ /* decoded size */
+ return dstSize;
+ }
+}
+
+HUF_DGEN(HUF_decompress1X2_usingDTable_internal)
+HUF_DGEN(HUF_decompress4X2_usingDTable_internal)
+
+size_t HUF_decompress1X2_usingDTable(
+ void* dst, size_t dstSize,
+ const void* cSrc, size_t cSrcSize,
+ const HUF_DTable* DTable)
+{
+ DTableDesc dtd = HUF_getDTableDesc(DTable);
+ if (dtd.tableType != 1) return ERROR(GENERIC);
+ return HUF_decompress1X2_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
+}
+
+size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable* DCtx, void* dst, size_t dstSize,
+ const void* cSrc, size_t cSrcSize,
+ void* workSpace, size_t wkspSize)
+{
+ const BYTE* ip = (const BYTE*) cSrc;
+
+ size_t const hSize = HUF_readDTableX2_wksp(DCtx, cSrc, cSrcSize,
+ workSpace, wkspSize);
+ if (HUF_isError(hSize)) return hSize;
+ if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
+ ip += hSize; cSrcSize -= hSize;
+
+ return HUF_decompress1X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx, /* bmi2 */ 0);
+}
+
+
+size_t HUF_decompress1X2_DCtx(HUF_DTable* DCtx, void* dst, size_t dstSize,
+ const void* cSrc, size_t cSrcSize)
+{
+ U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
+ return HUF_decompress1X2_DCtx_wksp(DCtx, dst, dstSize, cSrc, cSrcSize,
+ workSpace, sizeof(workSpace));
+}
+
+size_t HUF_decompress1X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+{
+ HUF_CREATE_STATIC_DTABLEX2(DTable, HUF_TABLELOG_MAX);
+ return HUF_decompress1X2_DCtx(DTable, dst, dstSize, cSrc, cSrcSize);
+}
+
+size_t HUF_decompress4X2_usingDTable(
+ void* dst, size_t dstSize,
+ const void* cSrc, size_t cSrcSize,
+ const HUF_DTable* DTable)
+{
+ DTableDesc dtd = HUF_getDTableDesc(DTable);
+ if (dtd.tableType != 1) return ERROR(GENERIC);
+ return HUF_decompress4X2_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
+}
+
+static size_t HUF_decompress4X2_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize,
+ const void* cSrc, size_t cSrcSize,
+ void* workSpace, size_t wkspSize, int bmi2)
+{
+ const BYTE* ip = (const BYTE*) cSrc;
+
+ size_t hSize = HUF_readDTableX2_wksp(dctx, cSrc, cSrcSize,
+ workSpace, wkspSize);
+ if (HUF_isError(hSize)) return hSize;
+ if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
+ ip += hSize; cSrcSize -= hSize;
+
+ return HUF_decompress4X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2);
+}
+
+size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
+ const void* cSrc, size_t cSrcSize,
+ void* workSpace, size_t wkspSize)
+{
+ return HUF_decompress4X2_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, /* bmi2 */ 0);
+}
+
+
+size_t HUF_decompress4X2_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize,
+ const void* cSrc, size_t cSrcSize)
+{
+ U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
+ return HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize,
+ workSpace, sizeof(workSpace));
+}
+
+size_t HUF_decompress4X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+{
+ HUF_CREATE_STATIC_DTABLEX2(DTable, HUF_TABLELOG_MAX);
+ return HUF_decompress4X2_DCtx(DTable, dst, dstSize, cSrc, cSrcSize);
+}
+
+#endif /* HUF_FORCE_DECOMPRESS_X1 */
+
+
+/* ***********************************/
+/* Universal decompression selectors */
+/* ***********************************/
+
+size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize,
+ const void* cSrc, size_t cSrcSize,
+ const HUF_DTable* DTable)
+{
+ DTableDesc const dtd = HUF_getDTableDesc(DTable);
+#if defined(HUF_FORCE_DECOMPRESS_X1)
+ (void)dtd;
+ assert(dtd.tableType == 0);
+ return HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
+#elif defined(HUF_FORCE_DECOMPRESS_X2)
+ (void)dtd;
+ assert(dtd.tableType == 1);
+ return HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
+#else
+ return dtd.tableType ? HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0) :
+ HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
+#endif
+}
+
+size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize,
+ const void* cSrc, size_t cSrcSize,
+ const HUF_DTable* DTable)
+{
+ DTableDesc const dtd = HUF_getDTableDesc(DTable);
+#if defined(HUF_FORCE_DECOMPRESS_X1)
+ (void)dtd;
+ assert(dtd.tableType == 0);
+ return HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
+#elif defined(HUF_FORCE_DECOMPRESS_X2)
+ (void)dtd;
+ assert(dtd.tableType == 1);
+ return HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
+#else
+ return dtd.tableType ? HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0) :
+ HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0);
+#endif
+}
+
+
+#if !defined(HUF_FORCE_DECOMPRESS_X1) && !defined(HUF_FORCE_DECOMPRESS_X2)
+typedef struct { U32 tableTime; U32 decode256Time; } algo_time_t;
+static const algo_time_t algoTime[16 /* Quantization */][3 /* single, double, quad */] =
+{
+ /* single, double, quad */
+ {{0,0}, {1,1}, {2,2}}, /* Q==0 : impossible */
+ {{0,0}, {1,1}, {2,2}}, /* Q==1 : impossible */
+ {{ 38,130}, {1313, 74}, {2151, 38}}, /* Q == 2 : 12-18% */
+ {{ 448,128}, {1353, 74}, {2238, 41}}, /* Q == 3 : 18-25% */
+ {{ 556,128}, {1353, 74}, {2238, 47}}, /* Q == 4 : 25-32% */
+ {{ 714,128}, {1418, 74}, {2436, 53}}, /* Q == 5 : 32-38% */
+ {{ 883,128}, {1437, 74}, {2464, 61}}, /* Q == 6 : 38-44% */
+ {{ 897,128}, {1515, 75}, {2622, 68}}, /* Q == 7 : 44-50% */
+ {{ 926,128}, {1613, 75}, {2730, 75}}, /* Q == 8 : 50-56% */
+ {{ 947,128}, {1729, 77}, {3359, 77}}, /* Q == 9 : 56-62% */
+ {{1107,128}, {2083, 81}, {4006, 84}}, /* Q ==10 : 62-69% */
+ {{1177,128}, {2379, 87}, {4785, 88}}, /* Q ==11 : 69-75% */
+ {{1242,128}, {2415, 93}, {5155, 84}}, /* Q ==12 : 75-81% */
+ {{1349,128}, {2644,106}, {5260,106}}, /* Q ==13 : 81-87% */
+ {{1455,128}, {2422,124}, {4174,124}}, /* Q ==14 : 87-93% */
+ {{ 722,128}, {1891,145}, {1936,146}}, /* Q ==15 : 93-99% */
+};
+#endif
+
+/** HUF_selectDecoder() :
+ * Tells which decoder is likely to decode faster,
+ * based on a set of pre-computed metrics.
+ * @return : 0==HUF_decompress4X1, 1==HUF_decompress4X2 .
+ * Assumption : 0 < dstSize <= 128 KB */
+U32 HUF_selectDecoder (size_t dstSize, size_t cSrcSize)
+{
+ assert(dstSize > 0);
+ assert(dstSize <= 128*1024);
+#if defined(HUF_FORCE_DECOMPRESS_X1)
+ (void)dstSize;
+ (void)cSrcSize;
+ return 0;
+#elif defined(HUF_FORCE_DECOMPRESS_X2)
+ (void)dstSize;
+ (void)cSrcSize;
+ return 1;
+#else
+ /* decoder timing evaluation */
+ { U32 const Q = (cSrcSize >= dstSize) ? 15 : (U32)(cSrcSize * 16 / dstSize); /* Q < 16 */
+ U32 const D256 = (U32)(dstSize >> 8);
+ U32 const DTime0 = algoTime[Q][0].tableTime + (algoTime[Q][0].decode256Time * D256);
+ U32 DTime1 = algoTime[Q][1].tableTime + (algoTime[Q][1].decode256Time * D256);
+ DTime1 += DTime1 >> 3; /* advantage to algorithm using less memory, to reduce cache eviction */
+ return DTime1 < DTime0;
+ }
+#endif
+}
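+/* Worked example (illustrative): dstSize = 64 KB and cSrcSize = 32 KB give
+ * Q = 32768*16/65536 = 8 and D256 = 256, so
+ *     DTime0 = 926  + 128*256 = 33694              (single-symbol, X1)
+ *     DTime1 = 1613 +  75*256 = 20813, +1/8 -> 23414   (double-symbol, X2)
+ * and the double-symbol decoder (return value 1) is selected. */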
+
+
+typedef size_t (*decompressionAlgo)(void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);
+
+size_t HUF_decompress (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+{
+#if !defined(HUF_FORCE_DECOMPRESS_X1) && !defined(HUF_FORCE_DECOMPRESS_X2)
+ static const decompressionAlgo decompress[2] = { HUF_decompress4X1, HUF_decompress4X2 };
+#endif
+
+ /* validation checks */
+ if (dstSize == 0) return ERROR(dstSize_tooSmall);
+ if (cSrcSize > dstSize) return ERROR(corruption_detected); /* invalid */
+ if (cSrcSize == dstSize) { memcpy(dst, cSrc, dstSize); return dstSize; } /* not compressed */
+ if (cSrcSize == 1) { memset(dst, *(const BYTE*)cSrc, dstSize); return dstSize; } /* RLE */
+
+ { U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize);
+#if defined(HUF_FORCE_DECOMPRESS_X1)
+ (void)algoNb;
+ assert(algoNb == 0);
+ return HUF_decompress4X1(dst, dstSize, cSrc, cSrcSize);
+#elif defined(HUF_FORCE_DECOMPRESS_X2)
+ (void)algoNb;
+ assert(algoNb == 1);
+ return HUF_decompress4X2(dst, dstSize, cSrc, cSrcSize);
+#else
+ return decompress[algoNb](dst, dstSize, cSrc, cSrcSize);
+#endif
+ }
+}
+
+size_t HUF_decompress4X_DCtx (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+{
+ /* validation checks */
+ if (dstSize == 0) return ERROR(dstSize_tooSmall);
+ if (cSrcSize > dstSize) return ERROR(corruption_detected); /* invalid */
+ if (cSrcSize == dstSize) { memcpy(dst, cSrc, dstSize); return dstSize; } /* not compressed */
+ if (cSrcSize == 1) { memset(dst, *(const BYTE*)cSrc, dstSize); return dstSize; } /* RLE */
+
+ { U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize);
+#if defined(HUF_FORCE_DECOMPRESS_X1)
+ (void)algoNb;
+ assert(algoNb == 0);
+ return HUF_decompress4X1_DCtx(dctx, dst, dstSize, cSrc, cSrcSize);
+#elif defined(HUF_FORCE_DECOMPRESS_X2)
+ (void)algoNb;
+ assert(algoNb == 1);
+ return HUF_decompress4X2_DCtx(dctx, dst, dstSize, cSrc, cSrcSize);
+#else
+ return algoNb ? HUF_decompress4X2_DCtx(dctx, dst, dstSize, cSrc, cSrcSize) :
+ HUF_decompress4X1_DCtx(dctx, dst, dstSize, cSrc, cSrcSize) ;
+#endif
+ }
+}
+
+size_t HUF_decompress4X_hufOnly(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize)
+{
+ U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
+ return HUF_decompress4X_hufOnly_wksp(dctx, dst, dstSize, cSrc, cSrcSize,
+ workSpace, sizeof(workSpace));
+}
+
+
+size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst,
+ size_t dstSize, const void* cSrc,
+ size_t cSrcSize, void* workSpace,
+ size_t wkspSize)
+{
+ /* validation checks */
+ if (dstSize == 0) return ERROR(dstSize_tooSmall);
+ if (cSrcSize == 0) return ERROR(corruption_detected);
+
+ { U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize);
+#if defined(HUF_FORCE_DECOMPRESS_X1)
+ (void)algoNb;
+ assert(algoNb == 0);
+ return HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize);
+#elif defined(HUF_FORCE_DECOMPRESS_X2)
+ (void)algoNb;
+ assert(algoNb == 1);
+ return HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize);
+#else
+ return algoNb ? HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc,
+ cSrcSize, workSpace, wkspSize):
+ HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize);
+#endif
+ }
+}
+
+size_t HUF_decompress1X_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize,
+ const void* cSrc, size_t cSrcSize,
+ void* workSpace, size_t wkspSize)
+{
+ /* validation checks */
+ if (dstSize == 0) return ERROR(dstSize_tooSmall);
+ if (cSrcSize > dstSize) return ERROR(corruption_detected); /* invalid */
+ if (cSrcSize == dstSize) { memcpy(dst, cSrc, dstSize); return dstSize; } /* not compressed */
+ if (cSrcSize == 1) { memset(dst, *(const BYTE*)cSrc, dstSize); return dstSize; } /* RLE */
+
+ { U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize);
+#if defined(HUF_FORCE_DECOMPRESS_X1)
+ (void)algoNb;
+ assert(algoNb == 0);
+ return HUF_decompress1X1_DCtx_wksp(dctx, dst, dstSize, cSrc,
+ cSrcSize, workSpace, wkspSize);
+#elif defined(HUF_FORCE_DECOMPRESS_X2)
+ (void)algoNb;
+ assert(algoNb == 1);
+ return HUF_decompress1X2_DCtx_wksp(dctx, dst, dstSize, cSrc,
+ cSrcSize, workSpace, wkspSize);
+#else
+ return algoNb ? HUF_decompress1X2_DCtx_wksp(dctx, dst, dstSize, cSrc,
+ cSrcSize, workSpace, wkspSize):
+ HUF_decompress1X1_DCtx_wksp(dctx, dst, dstSize, cSrc,
+ cSrcSize, workSpace, wkspSize);
+#endif
+ }
+}
+
+size_t HUF_decompress1X_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize,
+ const void* cSrc, size_t cSrcSize)
+{
+ U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
+ return HUF_decompress1X_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize,
+ workSpace, sizeof(workSpace));
+}
+
+
+size_t HUF_decompress1X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2)
+{
+ DTableDesc const dtd = HUF_getDTableDesc(DTable);
+#if defined(HUF_FORCE_DECOMPRESS_X1)
+ (void)dtd;
+ assert(dtd.tableType == 0);
+ return HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2);
+#elif defined(HUF_FORCE_DECOMPRESS_X2)
+ (void)dtd;
+ assert(dtd.tableType == 1);
+ return HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2);
+#else
+ return dtd.tableType ? HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2) :
+ HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2);
+#endif
+}
+
+#ifndef HUF_FORCE_DECOMPRESS_X2
+size_t HUF_decompress1X1_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2)
+{
+ const BYTE* ip = (const BYTE*) cSrc;
+
+ size_t const hSize = HUF_readDTableX1_wksp(dctx, cSrc, cSrcSize, workSpace, wkspSize);
+ if (HUF_isError(hSize)) return hSize;
+ if (hSize >= cSrcSize) return ERROR(srcSize_wrong);
+ ip += hSize; cSrcSize -= hSize;
+
+ return HUF_decompress1X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2);
+}
+#endif
+
+size_t HUF_decompress4X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2)
+{
+ DTableDesc const dtd = HUF_getDTableDesc(DTable);
+#if defined(HUF_FORCE_DECOMPRESS_X1)
+ (void)dtd;
+ assert(dtd.tableType == 0);
+ return HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2);
+#elif defined(HUF_FORCE_DECOMPRESS_X2)
+ (void)dtd;
+ assert(dtd.tableType == 1);
+ return HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2);
+#else
+ return dtd.tableType ? HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2) :
+ HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2);
+#endif
+}
+
+size_t HUF_decompress4X_hufOnly_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2)
+{
+ /* validation checks */
+ if (dstSize == 0) return ERROR(dstSize_tooSmall);
+ if (cSrcSize == 0) return ERROR(corruption_detected);
+
+ { U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize);
+#if defined(HUF_FORCE_DECOMPRESS_X1)
+ (void)algoNb;
+ assert(algoNb == 0);
+ return HUF_decompress4X1_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2);
+#elif defined(HUF_FORCE_DECOMPRESS_X2)
+ (void)algoNb;
+ assert(algoNb == 1);
+ return HUF_decompress4X2_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2);
+#else
+ return algoNb ? HUF_decompress4X2_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2) :
+ HUF_decompress4X1_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2);
+#endif
+ }
+}
+/**** ended inlining decompress/huf_decompress.c ****/
+/**** start inlining decompress/zstd_ddict.c ****/
+/*
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+/* zstd_ddict.c :
+ * concentrates all logic that needs to know the internals of ZSTD_DDict object */
+
+/*-*******************************************************
+* Dependencies
+*********************************************************/
+#include <string.h> /* memcpy, memmove, memset */
+/**** skipping file: ../common/cpu.h ****/
+/**** skipping file: ../common/mem.h ****/
+#define FSE_STATIC_LINKING_ONLY
+/**** skipping file: ../common/fse.h ****/
+#define HUF_STATIC_LINKING_ONLY
+/**** skipping file: ../common/huf.h ****/
+/**** start inlining zstd_decompress_internal.h ****/
+/*
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+
+/* zstd_decompress_internal:
+ * objects and definitions shared within lib/decompress modules */
+
+#ifndef ZSTD_DECOMPRESS_INTERNAL_H
+#define ZSTD_DECOMPRESS_INTERNAL_H
+
+
+/*-*******************************************************
+ * Dependencies
+ *********************************************************/
+/**** skipping file: ../common/mem.h ****/
+/**** skipping file: ../common/zstd_internal.h ****/
+
+
+
+/*-*******************************************************
+ * Constants
+ *********************************************************/
+static const U32 LL_base[MaxLL+1] = {
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 18, 20, 22, 24, 28, 32, 40,
+ 48, 64, 0x80, 0x100, 0x200, 0x400, 0x800, 0x1000,
+ 0x2000, 0x4000, 0x8000, 0x10000 };
+
+static const U32 OF_base[MaxOff+1] = {
+ 0, 1, 1, 5, 0xD, 0x1D, 0x3D, 0x7D,
+ 0xFD, 0x1FD, 0x3FD, 0x7FD, 0xFFD, 0x1FFD, 0x3FFD, 0x7FFD,
+ 0xFFFD, 0x1FFFD, 0x3FFFD, 0x7FFFD, 0xFFFFD, 0x1FFFFD, 0x3FFFFD, 0x7FFFFD,
+ 0xFFFFFD, 0x1FFFFFD, 0x3FFFFFD, 0x7FFFFFD, 0xFFFFFFD, 0x1FFFFFFD, 0x3FFFFFFD, 0x7FFFFFFD };
+
+static const U32 OF_bits[MaxOff+1] = {
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 30, 31 };
+
+static const U32 ML_base[MaxML+1] = {
+ 3, 4, 5, 6, 7, 8, 9, 10,
+ 11, 12, 13, 14, 15, 16, 17, 18,
+ 19, 20, 21, 22, 23, 24, 25, 26,
+ 27, 28, 29, 30, 31, 32, 33, 34,
+ 35, 37, 39, 41, 43, 47, 51, 59,
+ 67, 83, 99, 0x83, 0x103, 0x203, 0x403, 0x803,
+ 0x1003, 0x2003, 0x4003, 0x8003, 0x10003 };
+
+
+/*-*******************************************************
+ * Decompression types
+ *********************************************************/
+typedef struct {
+    U32 fastMode;
+    U32 tableLog;
+} ZSTD_seqSymbol_header;
+
+typedef struct {
+    U16  nextState;
+    BYTE nbAdditionalBits;
+    BYTE nbBits;
+    U32  baseValue;
+} ZSTD_seqSymbol;
+
+#define SEQSYMBOL_TABLE_SIZE(log)   (1 + (1 << (log)))
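+/* Worked example (illustrative, assuming the usual zstd values LLFSELog == 9,
+ * OffFSELog == 8 and MLFSELog == 9): SEQSYMBOL_TABLE_SIZE(9) = 1 + (1 << 9)
+ * = 513, so LLTable and MLTable below each hold 513 ZSTD_seqSymbol entries
+ * (one header cell plus the 2^tableLog decoding cells) and OFTable holds 257. */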
+
+typedef struct {
+ ZSTD_seqSymbol LLTable[SEQSYMBOL_TABLE_SIZE(LLFSELog)]; /* Note : Space reserved for FSE Tables */
+ ZSTD_seqSymbol OFTable[SEQSYMBOL_TABLE_SIZE(OffFSELog)]; /* is also used as temporary workspace while building hufTable during DDict creation */
+ ZSTD_seqSymbol MLTable[SEQSYMBOL_TABLE_SIZE(MLFSELog)]; /* and therefore must be at least HUF_DECOMPRESS_WORKSPACE_SIZE large */
+ HUF_DTable hufTable[HUF_DTABLE_SIZE(HufLog)]; /* can accommodate HUF_decompress4X */
+ U32 rep[ZSTD_REP_NUM];
+} ZSTD_entropyDTables_t;
+
+typedef enum { ZSTDds_getFrameHeaderSize, ZSTDds_decodeFrameHeader,
+ ZSTDds_decodeBlockHeader, ZSTDds_decompressBlock,
+ ZSTDds_decompressLastBlock, ZSTDds_checkChecksum,
+ ZSTDds_decodeSkippableHeader, ZSTDds_skipFrame } ZSTD_dStage;
+
+typedef enum { zdss_init=0, zdss_loadHeader,
+ zdss_read, zdss_load, zdss_flush } ZSTD_dStreamStage;
+
+typedef enum {
+ ZSTD_use_indefinitely = -1, /* Use the dictionary indefinitely */
+ ZSTD_dont_use = 0, /* Do not use the dictionary (if one exists free it) */
+ ZSTD_use_once = 1 /* Use the dictionary once and set to ZSTD_dont_use */
+} ZSTD_dictUses_e;
+
+typedef enum {
+ ZSTD_obm_buffered = 0, /* Buffer the output */
+ ZSTD_obm_stable = 1 /* ZSTD_outBuffer is stable */
+} ZSTD_outBufferMode_e;
+
+struct ZSTD_DCtx_s
+{
+ const ZSTD_seqSymbol* LLTptr;
+ const ZSTD_seqSymbol* MLTptr;
+ const ZSTD_seqSymbol* OFTptr;
+ const HUF_DTable* HUFptr;
+ ZSTD_entropyDTables_t entropy;
+ U32 workspace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32]; /* space needed when building huffman tables */
+ const void* previousDstEnd; /* detect continuity */
+ const void* prefixStart; /* start of current segment */
+ const void* virtualStart; /* virtual start of previous segment if it was just before current one */
+ const void* dictEnd; /* end of previous segment */
+ size_t expected;
+ ZSTD_frameHeader fParams;
+ U64 decodedSize;
+ blockType_e bType; /* used in ZSTD_decompressContinue(), store blockType between block header decoding and block decompression stages */
+ ZSTD_dStage stage;
+ U32 litEntropy;
+ U32 fseEntropy;
+ XXH64_state_t xxhState;
+ size_t headerSize;
+ ZSTD_format_e format;
+ const BYTE* litPtr;
+ ZSTD_customMem customMem;
+ size_t litSize;
+ size_t rleSize;
+ size_t staticSize;
+ int bmi2; /* == 1 if the CPU supports BMI2 and 0 otherwise. CPU support is determined dynamically once per context lifetime. */
+
+ /* dictionary */
+ ZSTD_DDict* ddictLocal;
+ const ZSTD_DDict* ddict; /* set by ZSTD_initDStream_usingDDict(), or ZSTD_DCtx_refDDict() */
+ U32 dictID;
+ int ddictIsCold; /* if == 1 : dictionary is "new" for working context, and presumed "cold" (not in cpu cache) */
+ ZSTD_dictUses_e dictUses;
+
+ /* streaming */
+ ZSTD_dStreamStage streamStage;
+ char* inBuff;
+ size_t inBuffSize;
+ size_t inPos;
+ size_t maxWindowSize;
+ char* outBuff;
+ size_t outBuffSize;
+ size_t outStart;
+ size_t outEnd;
+ size_t lhSize;
+ void* legacyContext;
+ U32 previousLegacyVersion;
+ U32 legacyVersion;
+ U32 hostageByte;
+ int noForwardProgress;
+ ZSTD_outBufferMode_e outBufferMode;
+ ZSTD_outBuffer expectedOutBuffer;
+
+ /* workspace */
+ BYTE litBuffer[ZSTD_BLOCKSIZE_MAX + WILDCOPY_OVERLENGTH];
+ BYTE headerBuffer[ZSTD_FRAMEHEADERSIZE_MAX];
+
+ size_t oversizedDuration;
+
+#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
+ void const* dictContentBeginForFuzzing;
+ void const* dictContentEndForFuzzing;
+#endif
+}; /* typedef'd to ZSTD_DCtx within "zstd.h" */
+
+
+/*-*******************************************************
+ * Shared internal functions
+ *********************************************************/
+
+/*! ZSTD_loadDEntropy() :
+ * dict : must point at beginning of a valid zstd dictionary.
+ * @return : size of dictionary header (size of magic number + dict ID + entropy tables) */
+size_t ZSTD_loadDEntropy(ZSTD_entropyDTables_t* entropy,
+ const void* const dict, size_t const dictSize);
+
+/*! ZSTD_checkContinuity() :
+ * check if next `dst` follows previous position, where decompression ended.
+ * If yes, do nothing (continue on current segment).
+ * If not, classify previous segment as "external dictionary", and start a new segment.
+ * This function cannot fail. */
+void ZSTD_checkContinuity(ZSTD_DCtx* dctx, const void* dst);
+
+
+#endif /* ZSTD_DECOMPRESS_INTERNAL_H */
+/**** ended inlining zstd_decompress_internal.h ****/
+/**** start inlining zstd_ddict.h ****/
+/*
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+
+#ifndef ZSTD_DDICT_H
+#define ZSTD_DDICT_H
+
+/*-*******************************************************
+ * Dependencies
+ *********************************************************/
+#include <stddef.h> /* size_t */
+/**** skipping file: ../zstd.h ****/
+
+
+/*-*******************************************************
+ * Interface
+ *********************************************************/
+
+/* note: several prototypes are already published in `zstd.h` :
+ * ZSTD_createDDict()
+ * ZSTD_createDDict_byReference()
+ * ZSTD_createDDict_advanced()
+ * ZSTD_freeDDict()
+ * ZSTD_initStaticDDict()
+ * ZSTD_sizeof_DDict()
+ * ZSTD_estimateDDictSize()
+ * ZSTD_getDictID_fromDict()
+ */
+
+const void* ZSTD_DDict_dictContent(const ZSTD_DDict* ddict);
+size_t ZSTD_DDict_dictSize(const ZSTD_DDict* ddict);
+
+void ZSTD_copyDDictParameters(ZSTD_DCtx* dctx, const ZSTD_DDict* ddict);
+
+
+
+#endif /* ZSTD_DDICT_H */
+/**** ended inlining zstd_ddict.h ****/
+
+#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT>=1)
+/**** start inlining ../legacy/zstd_legacy.h ****/
+/*
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTD_LEGACY_H
+#define ZSTD_LEGACY_H
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/* *************************************
+* Includes
+***************************************/
+/**** skipping file: ../common/mem.h ****/
+/**** skipping file: ../common/error_private.h ****/
+/**** skipping file: ../common/zstd_internal.h ****/
+
+#if !defined (ZSTD_LEGACY_SUPPORT) || (ZSTD_LEGACY_SUPPORT == 0)
+# undef ZSTD_LEGACY_SUPPORT
+# define ZSTD_LEGACY_SUPPORT 8
+#endif
+
+#if (ZSTD_LEGACY_SUPPORT <= 1)
+/**** start inlining zstd_v01.h ****/
+/*
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTD_V01_H_28739879432
+#define ZSTD_V01_H_28739879432
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/* *************************************
+* Includes
+***************************************/
+#include <stddef.h> /* size_t */
+
+
+/* *************************************
+* Simple one-step function
+***************************************/
+/**
+ZSTDv01_decompress() : decompress ZSTD frames compliant with v0.1.x format
+ compressedSize : is the exact source size
+ maxOriginalSize : is the size of the 'dst' buffer, which must be already allocated.
+ It must be equal or larger than originalSize, otherwise decompression will fail.
+ return : the number of bytes decompressed into destination buffer (originalSize)
+ or an errorCode if it fails (which can be tested using ZSTDv01_isError())
+*/
+size_t ZSTDv01_decompress( void* dst, size_t maxOriginalSize,
+ const void* src, size_t compressedSize);
+
+ /**
+ ZSTDv01_findFrameSizeInfoLegacy() : get the source length and decompressed bound of a ZSTD frame compliant with v0.1.x format
+ srcSize : The size of the 'src' buffer, at least as large as the frame pointed to by 'src'
+ cSize (output parameter) : the number of bytes that would be read to decompress this frame
+ or an error code if it fails (which can be tested using ZSTDv01_isError())
+ dBound (output parameter) : an upper-bound for the decompressed size of the data in the frame
+ or ZSTD_CONTENTSIZE_ERROR if an error occurs
+
+ note : assumes `cSize` and `dBound` are _not_ NULL.
+ */
+void ZSTDv01_findFrameSizeInfoLegacy(const void *src, size_t srcSize,
+ size_t* cSize, unsigned long long* dBound);
+
+/**
+ZSTDv01_isError() : tells if the result of ZSTDv01_decompress() is an error
+*/
+unsigned ZSTDv01_isError(size_t code);
+
+
+/* *************************************
+* Advanced functions
+***************************************/
+typedef struct ZSTDv01_Dctx_s ZSTDv01_Dctx;
+ZSTDv01_Dctx* ZSTDv01_createDCtx(void);
+size_t ZSTDv01_freeDCtx(ZSTDv01_Dctx* dctx);
+
+size_t ZSTDv01_decompressDCtx(void* ctx,
+ void* dst, size_t maxOriginalSize,
+ const void* src, size_t compressedSize);
+
+/* *************************************
+* Streaming functions
+***************************************/
+size_t ZSTDv01_resetDCtx(ZSTDv01_Dctx* dctx);
+
+size_t ZSTDv01_nextSrcSizeToDecompress(ZSTDv01_Dctx* dctx);
+size_t ZSTDv01_decompressContinue(ZSTDv01_Dctx* dctx, void* dst, size_t maxDstSize, const void* src, size_t srcSize);
+/**
+ Use the above functions in alternation.
+ ZSTD_nextSrcSizeToDecompress() tells how many bytes to provide as 'srcSize' to ZSTD_decompressContinue().
+ ZSTD_decompressContinue() will use previously decoded data blocks when they are located just prior to the current block.
+ Result is the number of bytes regenerated within 'dst'.
+ It can be zero, which is not an error; it just means ZSTD_decompressContinue() has decoded some header.
+*/
+
+/* *************************************
+* Prefix - version detection
+***************************************/
+#define ZSTDv01_magicNumber 0xFD2FB51E /* Big Endian version */
+#define ZSTDv01_magicNumberLE 0x1EB52FFD /* Little Endian version */
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* ZSTD_V01_H_28739879432 */
+/**** ended inlining zstd_v01.h ****/
+#endif
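+
+/* Illustrative sketch (not upstream zstd code) of the alternating
+ * ZSTDv01_nextSrcSizeToDecompress() / ZSTDv01_decompressContinue() loop documented in
+ * the streaming section of zstd_v01.h above. The helper name and local variables are
+ * assumptions; the block is guarded by `#if 0` so it is never compiled. */
+#if 0
+static size_t example_v01_streamDecompress(void* dst, size_t dstCapacity,
+                                           const char* src, size_t srcSize)
+{
+    ZSTDv01_Dctx* const dctx = ZSTDv01_createDCtx();
+    size_t srcPos = 0, dstPos = 0;
+    if (dctx == NULL) return (size_t)(-1);             /* allocation failure; not a zstd error code */
+    ZSTDv01_resetDCtx(dctx);
+    while (1) {
+        size_t produced;
+        size_t const toRead = ZSTDv01_nextSrcSizeToDecompress(dctx);  /* exact nb of bytes to feed next */
+        if (toRead == 0) break;                        /* frame fully decoded */
+        if (srcPos + toRead > srcSize) break;          /* truncated input */
+        produced = ZSTDv01_decompressContinue(dctx, (char*)dst + dstPos, dstCapacity - dstPos,
+                                              src + srcPos, toRead);
+        if (ZSTDv01_isError(produced)) { ZSTDv01_freeDCtx(dctx); return produced; }
+        srcPos += toRead;
+        dstPos += produced;                            /* may be 0 when only a header was decoded */
+    }
+    ZSTDv01_freeDCtx(dctx);
+    return dstPos;                                     /* total bytes regenerated into dst */
+}
+#endif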
+#if (ZSTD_LEGACY_SUPPORT <= 2)
+/**** start inlining zstd_v02.h ****/
+/*
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTD_V02_H_4174539423
+#define ZSTD_V02_H_4174539423
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/* *************************************
+* Includes
+***************************************/
+#include <stddef.h> /* size_t */
+
+
+/* *************************************
+* Simple one-step function
+***************************************/
+/**
+ZSTDv02_decompress() : decompress ZSTD frames compliant with v0.2.x format
+ compressedSize : is the exact source size
+ maxOriginalSize : is the size of the 'dst' buffer, which must be already allocated.
+ It must be equal or larger than originalSize, otherwise decompression will fail.
+ return : the number of bytes decompressed into destination buffer (originalSize)
+ or an errorCode if it fails (which can be tested using ZSTDv02_isError())
+*/
+size_t ZSTDv02_decompress( void* dst, size_t maxOriginalSize,
+ const void* src, size_t compressedSize);
+
+ /**
+ ZSTDv02_findFrameSizeInfoLegacy() : get the source length and decompressed bound of a ZSTD frame compliant with v0.2.x format
+ srcSize : The size of the 'src' buffer, at least as large as the frame pointed to by 'src'
+ cSize (output parameter) : the number of bytes that would be read to decompress this frame
+ or an error code if it fails (which can be tested using ZSTDv02_isError())
+ dBound (output parameter) : an upper-bound for the decompressed size of the data in the frame
+ or ZSTD_CONTENTSIZE_ERROR if an error occurs
+
+ note : assumes `cSize` and `dBound` are _not_ NULL.
+ */
+void ZSTDv02_findFrameSizeInfoLegacy(const void *src, size_t srcSize,
+ size_t* cSize, unsigned long long* dBound);
+
+/**
+ZSTDv02_isError() : tells if the result of ZSTDv02_decompress() is an error
+*/
+unsigned ZSTDv02_isError(size_t code);
+
+
+/* *************************************
+* Advanced functions
+***************************************/
+typedef struct ZSTDv02_Dctx_s ZSTDv02_Dctx;
+ZSTDv02_Dctx* ZSTDv02_createDCtx(void);
+size_t ZSTDv02_freeDCtx(ZSTDv02_Dctx* dctx);
+
+size_t ZSTDv02_decompressDCtx(void* ctx,
+ void* dst, size_t maxOriginalSize,
+ const void* src, size_t compressedSize);
+
+/* *************************************
+* Streaming functions
+***************************************/
+size_t ZSTDv02_resetDCtx(ZSTDv02_Dctx* dctx);
+
+size_t ZSTDv02_nextSrcSizeToDecompress(ZSTDv02_Dctx* dctx);
+size_t ZSTDv02_decompressContinue(ZSTDv02_Dctx* dctx, void* dst, size_t maxDstSize, const void* src, size_t srcSize);
+/**
+ Use the above functions in alternation.
+ ZSTD_nextSrcSizeToDecompress() tells how many bytes to provide as 'srcSize' to ZSTD_decompressContinue().
+ ZSTD_decompressContinue() will use previously decoded data blocks when they are located just prior to the current block.
+ Result is the number of bytes regenerated within 'dst'.
+ It can be zero, which is not an error; it just means ZSTD_decompressContinue() has decoded some header.
+*/
+
+/* *************************************
+* Prefix - version detection
+***************************************/
+#define ZSTDv02_magicNumber 0xFD2FB522 /* v0.2 */
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* ZSTD_V02_H_4174539423 */
+/**** ended inlining zstd_v02.h ****/
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 3)
+/**** start inlining zstd_v03.h ****/
+/*
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTD_V03_H_298734209782
+#define ZSTD_V03_H_298734209782
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/* *************************************
+* Includes
+***************************************/
+#include <stddef.h> /* size_t */
+
+
+/* *************************************
+* Simple one-step function
+***************************************/
+/**
+ZSTDv03_decompress() : decompress ZSTD frames compliant with v0.3.x format
+ compressedSize : is the exact source size
+ maxOriginalSize : is the size of the 'dst' buffer, which must be already allocated.
+ It must be equal or larger than originalSize, otherwise decompression will fail.
+ return : the number of bytes decompressed into destination buffer (originalSize)
+ or an errorCode if it fails (which can be tested using ZSTDv03_isError())
+*/
+size_t ZSTDv03_decompress( void* dst, size_t maxOriginalSize,
+ const void* src, size_t compressedSize);
+
+ /**
+ ZSTDv03_findFrameSizeInfoLegacy() : get the source length and decompressed bound of a ZSTD frame compliant with v0.3.x format
+ srcSize : The size of the 'src' buffer, at least as large as the frame pointed to by 'src'
+ cSize (output parameter) : the number of bytes that would be read to decompress this frame
+ or an error code if it fails (which can be tested using ZSTDv03_isError())
+ dBound (output parameter) : an upper-bound for the decompressed size of the data in the frame
+ or ZSTD_CONTENTSIZE_ERROR if an error occurs
+
+ note : assumes `cSize` and `dBound` are _not_ NULL.
+ */
+ void ZSTDv03_findFrameSizeInfoLegacy(const void *src, size_t srcSize,
+ size_t* cSize, unsigned long long* dBound);
+
+ /**
+ZSTDv03_isError() : tells if the result of ZSTDv03_decompress() is an error
+*/
+unsigned ZSTDv03_isError(size_t code);
+
+
+/* *************************************
+* Advanced functions
+***************************************/
+typedef struct ZSTDv03_Dctx_s ZSTDv03_Dctx;
+ZSTDv03_Dctx* ZSTDv03_createDCtx(void);
+size_t ZSTDv03_freeDCtx(ZSTDv03_Dctx* dctx);
+
+size_t ZSTDv03_decompressDCtx(void* ctx,
+ void* dst, size_t maxOriginalSize,
+ const void* src, size_t compressedSize);
+
+/* *************************************
+* Streaming functions
+***************************************/
+size_t ZSTDv03_resetDCtx(ZSTDv03_Dctx* dctx);
+
+size_t ZSTDv03_nextSrcSizeToDecompress(ZSTDv03_Dctx* dctx);
+size_t ZSTDv03_decompressContinue(ZSTDv03_Dctx* dctx, void* dst, size_t maxDstSize, const void* src, size_t srcSize);
+/**
+ Use the above functions in alternation.
+ ZSTD_nextSrcSizeToDecompress() tells how many bytes to provide as 'srcSize' to ZSTD_decompressContinue().
+ ZSTD_decompressContinue() will use previously decoded data blocks when they are located just prior to the current block.
+ Result is the number of bytes regenerated within 'dst'.
+ It can be zero, which is not an error; it just means ZSTD_decompressContinue() has decoded some header.
+*/
+
+/* *************************************
+* Prefix - version detection
+***************************************/
+#define ZSTDv03_magicNumber 0xFD2FB523 /* v0.3 */
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* ZSTD_V03_H_298734209782 */
+/**** ended inlining zstd_v03.h ****/
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 4)
+/**** start inlining zstd_v04.h ****/
+/*
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTD_V04_H_91868324769238
+#define ZSTD_V04_H_91868324769238
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/* *************************************
+* Includes
+***************************************/
+#include <stddef.h> /* size_t */
+
+
+/* *************************************
+* Simple one-step function
+***************************************/
+/**
+ZSTDv04_decompress() : decompress ZSTD frames compliant with v0.4.x format
+ compressedSize : is the exact source size
+ maxOriginalSize : is the size of the 'dst' buffer, which must be already allocated.
+ It must be equal or larger than originalSize, otherwise decompression will fail.
+ return : the number of bytes decompressed into destination buffer (originalSize)
+ or an errorCode if it fails (which can be tested using ZSTDv04_isError())
+*/
+size_t ZSTDv04_decompress( void* dst, size_t maxOriginalSize,
+ const void* src, size_t compressedSize);
+
+ /**
+ ZSTDv04_findFrameSizeInfoLegacy() : get the source length and decompressed bound of a ZSTD frame compliant with v0.4.x format
+ srcSize : The size of the 'src' buffer, at least as large as the frame pointed to by 'src'
+ cSize (output parameter) : the number of bytes that would be read to decompress this frame
+ or an error code if it fails (which can be tested using ZSTDv04_isError())
+ dBound (output parameter) : an upper-bound for the decompressed size of the data in the frame
+ or ZSTD_CONTENTSIZE_ERROR if an error occurs
+
+ note : assumes `cSize` and `dBound` are _not_ NULL.
+ */
+ void ZSTDv04_findFrameSizeInfoLegacy(const void *src, size_t srcSize,
+ size_t* cSize, unsigned long long* dBound);
+
+/**
+ZSTDv04_isError() : tells if the result of ZSTDv04_decompress() is an error
+*/
+unsigned ZSTDv04_isError(size_t code);
+
+
+/* *************************************
+* Advanced functions
+***************************************/
+typedef struct ZSTDv04_Dctx_s ZSTDv04_Dctx;
+ZSTDv04_Dctx* ZSTDv04_createDCtx(void);
+size_t ZSTDv04_freeDCtx(ZSTDv04_Dctx* dctx);
+
+size_t ZSTDv04_decompressDCtx(ZSTDv04_Dctx* dctx,
+ void* dst, size_t maxOriginalSize,
+ const void* src, size_t compressedSize);
+
+
+/* *************************************
+* Direct Streaming
+***************************************/
+size_t ZSTDv04_resetDCtx(ZSTDv04_Dctx* dctx);
+
+size_t ZSTDv04_nextSrcSizeToDecompress(ZSTDv04_Dctx* dctx);
+size_t ZSTDv04_decompressContinue(ZSTDv04_Dctx* dctx, void* dst, size_t maxDstSize, const void* src, size_t srcSize);
+/**
+ Use the above functions in alternation.
+ ZSTD_nextSrcSizeToDecompress() tells how many bytes to provide as 'srcSize' to ZSTD_decompressContinue().
+ ZSTD_decompressContinue() will use previously decoded data blocks when they are located just prior to the current block.
+ Result is the number of bytes regenerated within 'dst'.
+ It can be zero, which is not an error; it just means ZSTD_decompressContinue() has decoded some header.
+*/
+
+
+/* *************************************
+* Buffered Streaming
+***************************************/
+typedef struct ZBUFFv04_DCtx_s ZBUFFv04_DCtx;
+ZBUFFv04_DCtx* ZBUFFv04_createDCtx(void);
+size_t ZBUFFv04_freeDCtx(ZBUFFv04_DCtx* dctx);
+
+size_t ZBUFFv04_decompressInit(ZBUFFv04_DCtx* dctx);
+size_t ZBUFFv04_decompressWithDictionary(ZBUFFv04_DCtx* dctx, const void* dict, size_t dictSize);
+
+size_t ZBUFFv04_decompressContinue(ZBUFFv04_DCtx* dctx, void* dst, size_t* maxDstSizePtr, const void* src, size_t* srcSizePtr);
+
+/** ************************************************
+* Streaming decompression
+*
+* A ZBUFF_DCtx object is required to track streaming operation.
+* Use ZBUFF_createDCtx() and ZBUFF_freeDCtx() to create/release resources.
+* Use ZBUFF_decompressInit() to start a new decompression operation.
+* ZBUFF_DCtx objects can be reused multiple times.
+*
+* Optionally, a reference to a static dictionary can be set, using ZBUFF_decompressWithDictionary()
+* It must be the same content as the one set during compression phase.
+* Dictionary content must remain accessible during the decompression process.
+*
+* Use ZBUFF_decompressContinue() repetitively to consume your input.
+* *srcSizePtr and *maxDstSizePtr can be any size.
+* The function will report how many bytes were read or written by modifying *srcSizePtr and *maxDstSizePtr.
+* Note that it may not consume the entire input, in which case it's up to the caller to present remaining input again.
+* The content of dst will be overwritten (up to *maxDstSizePtr) at each function call, so save its content if it matters or change dst.
+* @return : a hint to preferred nb of bytes to use as input for next function call (it's only a hint, to improve latency)
+* or 0 when a frame is completely decoded
+* or an error code, which can be tested using ZBUFF_isError().
+*
+* Hint : recommended buffer sizes (not compulsory) : ZBUFF_recommendedDInSize / ZBUFF_recommendedDOutSize
+* output : ZBUFF_recommendedDOutSize==128 KB; the block size is the internal unit, so it is always possible to write a full block once it is decoded.
+* input : ZBUFF_recommendedDInSize==128 KB + 3; just follow indications from ZBUFF_decompressContinue() to minimize latency. It should always be <= 128 KB + 3.
+* **************************************************/
+unsigned ZBUFFv04_isError(size_t errorCode);
+const char* ZBUFFv04_getErrorName(size_t errorCode);
+
+
+/** The below functions provide recommended buffer sizes for Compression or Decompression operations.
+* These sizes are not compulsory; they just tend to offer better latency */
+size_t ZBUFFv04_recommendedDInSize(void);
+size_t ZBUFFv04_recommendedDOutSize(void);
+
+
+/* *************************************
+* Prefix - version detection
+***************************************/
+#define ZSTDv04_magicNumber 0xFD2FB524 /* v0.4 */
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* ZSTD_V04_H_91868324769238 */
+/**** ended inlining zstd_v04.h ****/
+#endif
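+
+/* Illustrative sketch (not upstream zstd code) of the ZBUFFv04 buffered-streaming loop
+ * documented in zstd_v04.h above: feed chunks of any size, let the context report how
+ * much it actually read and wrote, and stop once the hint reaches 0. Names are
+ * illustrative only; guarded by `#if 0` so it is never compiled. */
+#if 0
+static size_t example_v04_bufferedDecompress(void* dst, size_t dstCapacity,
+                                             const char* src, size_t srcSize)
+{
+    ZBUFFv04_DCtx* const zbd = ZBUFFv04_createDCtx();
+    size_t srcPos = 0, dstPos = 0;
+    if (zbd == NULL) return (size_t)(-1);              /* allocation failure */
+    ZBUFFv04_decompressInit(zbd);
+    while (srcPos < srcSize) {
+        size_t readSize = srcSize - srcPos;            /* any chunk size is accepted */
+        size_t writeSize = dstCapacity - dstPos;
+        size_t const hint = ZBUFFv04_decompressContinue(zbd, (char*)dst + dstPos, &writeSize,
+                                                        src + srcPos, &readSize);
+        if (ZBUFFv04_isError(hint)) { ZBUFFv04_freeDCtx(zbd); return hint; }
+        srcPos += readSize;                            /* bytes actually consumed */
+        dstPos += writeSize;                           /* bytes actually flushed */
+        if (hint == 0) break;                          /* frame completely decoded */
+    }
+    ZBUFFv04_freeDCtx(zbd);
+    return dstPos;
+}
+#endif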
+#if (ZSTD_LEGACY_SUPPORT <= 5)
+/**** start inlining zstd_v05.h ****/
+/*
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTDv05_H
+#define ZSTDv05_H
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/*-*************************************
+* Dependencies
+***************************************/
+#include <stddef.h> /* size_t */
+/**** skipping file: ../common/mem.h ****/
+
+
+/* *************************************
+* Simple functions
+***************************************/
+/*! ZSTDv05_decompress() :
+ `compressedSize` : is the _exact_ size of the compressed blob, otherwise decompression will fail.
+ `dstCapacity` must be large enough, equal or larger than originalSize.
+ @return : the number of bytes decompressed into `dst` (<= `dstCapacity`),
+ or an errorCode if it fails (which can be tested using ZSTDv05_isError()) */
+size_t ZSTDv05_decompress( void* dst, size_t dstCapacity,
+ const void* src, size_t compressedSize);
+
+ /**
+ ZSTDv05_findFrameSizeInfoLegacy() : get the source length and decompressed bound of a ZSTD frame compliant with v0.5.x format
+ srcSize : The size of the 'src' buffer, at least as large as the frame pointed to by 'src'
+ cSize (output parameter) : the number of bytes that would be read to decompress this frame
+ or an error code if it fails (which can be tested using ZSTDv05_isError())
+ dBound (output parameter) : an upper-bound for the decompressed size of the data in the frame
+ or ZSTD_CONTENTSIZE_ERROR if an error occurs
+
+ note : assumes `cSize` and `dBound` are _not_ NULL.
+ */
+void ZSTDv05_findFrameSizeInfoLegacy(const void *src, size_t srcSize,
+ size_t* cSize, unsigned long long* dBound);
+
+/* *************************************
+* Helper functions
+***************************************/
+/* Error Management */
+unsigned ZSTDv05_isError(size_t code); /*!< tells if a `size_t` function result is an error code */
+const char* ZSTDv05_getErrorName(size_t code); /*!< provides readable string for an error code */
+
+
+/* *************************************
+* Explicit memory management
+***************************************/
+/** Decompression context */
+typedef struct ZSTDv05_DCtx_s ZSTDv05_DCtx;
+ZSTDv05_DCtx* ZSTDv05_createDCtx(void);
+size_t ZSTDv05_freeDCtx(ZSTDv05_DCtx* dctx); /*!< @return : errorCode */
+
+/** ZSTDv05_decompressDCtx() :
+* Same as ZSTDv05_decompress(), but requires an already allocated ZSTDv05_DCtx (see ZSTDv05_createDCtx()) */
+size_t ZSTDv05_decompressDCtx(ZSTDv05_DCtx* ctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
+
+
+/*-***********************
+* Simple Dictionary API
+*************************/
+/*! ZSTDv05_decompress_usingDict() :
+* Decompression using a pre-defined Dictionary content (see dictBuilder).
+* Dictionary must be identical to the one used during compression, otherwise regenerated data will be corrupted.
+* Note : dict can be NULL, in which case, it's equivalent to ZSTDv05_decompressDCtx() */
+size_t ZSTDv05_decompress_usingDict(ZSTDv05_DCtx* dctx,
+ void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize,
+ const void* dict,size_t dictSize);
+
+/*-************************
+* Advanced Streaming API
+***************************/
+typedef enum { ZSTDv05_fast, ZSTDv05_greedy, ZSTDv05_lazy, ZSTDv05_lazy2, ZSTDv05_btlazy2, ZSTDv05_opt, ZSTDv05_btopt } ZSTDv05_strategy;
+typedef struct {
+ U64 srcSize;
+ U32 windowLog; /* the only useful information to retrieve */
+ U32 contentLog; U32 hashLog; U32 searchLog; U32 searchLength; U32 targetLength; ZSTDv05_strategy strategy;
+} ZSTDv05_parameters;
+size_t ZSTDv05_getFrameParams(ZSTDv05_parameters* params, const void* src, size_t srcSize);
+
+size_t ZSTDv05_decompressBegin_usingDict(ZSTDv05_DCtx* dctx, const void* dict, size_t dictSize);
+void ZSTDv05_copyDCtx(ZSTDv05_DCtx* dstDCtx, const ZSTDv05_DCtx* srcDCtx);
+size_t ZSTDv05_nextSrcSizeToDecompress(ZSTDv05_DCtx* dctx);
+size_t ZSTDv05_decompressContinue(ZSTDv05_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
+
+
+/*-***********************
+* ZBUFF API
+*************************/
+typedef struct ZBUFFv05_DCtx_s ZBUFFv05_DCtx;
+ZBUFFv05_DCtx* ZBUFFv05_createDCtx(void);
+size_t ZBUFFv05_freeDCtx(ZBUFFv05_DCtx* dctx);
+
+size_t ZBUFFv05_decompressInit(ZBUFFv05_DCtx* dctx);
+size_t ZBUFFv05_decompressInitDictionary(ZBUFFv05_DCtx* dctx, const void* dict, size_t dictSize);
+
+size_t ZBUFFv05_decompressContinue(ZBUFFv05_DCtx* dctx,
+ void* dst, size_t* dstCapacityPtr,
+ const void* src, size_t* srcSizePtr);
+
+/*-***************************************************************************
+* Streaming decompression
+*
+* A ZBUFFv05_DCtx object is required to track streaming operations.
+* Use ZBUFFv05_createDCtx() and ZBUFFv05_freeDCtx() to create/release resources.
+* Use ZBUFFv05_decompressInit() to start a new decompression operation,
+* or ZBUFFv05_decompressInitDictionary() if decompression requires a dictionary.
+* Note that ZBUFFv05_DCtx objects can be reused multiple times.
+*
+* Use ZBUFFv05_decompressContinue() repetitively to consume your input.
+* *srcSizePtr and *dstCapacityPtr can be any size.
+* The function will report how many bytes were read or written by modifying *srcSizePtr and *dstCapacityPtr.
+* Note that it may not consume the entire input, in which case it's up to the caller to present remaining input again.
+* The content of @dst will be overwritten (up to *dstCapacityPtr) at each function call, so save its content if it matters or change @dst.
+* @return : a hint to preferred nb of bytes to use as input for next function call (it's only a hint, to help latency)
+* or 0 when a frame is completely decoded
+* or an error code, which can be tested using ZBUFFv05_isError().
+*
+* Hint : recommended buffer sizes (not compulsory) : ZBUFFv05_recommendedDInSize() / ZBUFFv05_recommendedDOutSize()
+* output : ZBUFFv05_recommendedDOutSize==128 KB; the block size is the internal unit, so it is always possible to write a full block once decoded.
+* input : ZBUFFv05_recommendedDInSize==128 KB + 3; just follow indications from ZBUFFv05_decompressContinue() to minimize latency. It should always be <= 128 KB + 3.
+* *******************************************************************************/
+
+
+/* *************************************
+* Tool functions
+***************************************/
+unsigned ZBUFFv05_isError(size_t errorCode);
+const char* ZBUFFv05_getErrorName(size_t errorCode);
+
+/** Functions below provide recommended buffer sizes for Compression or Decompression operations.
+* These sizes are just hints, and tend to offer better latency */
+size_t ZBUFFv05_recommendedDInSize(void);
+size_t ZBUFFv05_recommendedDOutSize(void);
+
+
+
+/*-*************************************
+* Constants
+***************************************/
+#define ZSTDv05_MAGICNUMBER 0xFD2FB525 /* v0.5 */
+
+
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* ZSTDv05_H */
+/**** ended inlining zstd_v05.h ****/
+#endif
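+
+/* Illustrative sketch (not upstream zstd code) of the one-shot v0.5 dictionary API
+ * documented above; a NULL dict makes it equivalent to ZSTDv05_decompressDCtx(). This is
+ * the same pattern ZSTD_decompressLegacy() uses for case 5 further below. Guarded by
+ * `#if 0` so it is never compiled. */
+#if 0
+static size_t example_v05_decompressWithDict(void* dst, size_t dstCapacity,
+                                             const void* src, size_t srcSize,
+                                             const void* dict, size_t dictSize)
+{
+    ZSTDv05_DCtx* const dctx = ZSTDv05_createDCtx();
+    size_t result;
+    if (dctx == NULL) return (size_t)(-1);             /* allocation failure */
+    result = ZSTDv05_decompress_usingDict(dctx, dst, dstCapacity, src, srcSize, dict, dictSize);
+    ZSTDv05_freeDCtx(dctx);
+    return result;                                     /* test with ZSTDv05_isError() */
+}
+#endif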
+#if (ZSTD_LEGACY_SUPPORT <= 6)
+/**** start inlining zstd_v06.h ****/
+/*
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTDv06_H
+#define ZSTDv06_H
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/*====== Dependency ======*/
+#include <stddef.h> /* size_t */
+
+
+/*====== Export for Windows ======*/
+/*!
+* ZSTDv06_DLL_EXPORT :
+* Enable exporting of functions when building a Windows DLL
+*/
+#if defined(_WIN32) && defined(ZSTDv06_DLL_EXPORT) && (ZSTDv06_DLL_EXPORT==1)
+# define ZSTDLIBv06_API __declspec(dllexport)
+#else
+# define ZSTDLIBv06_API
+#endif
+
+
+/* *************************************
+* Simple functions
+***************************************/
+/*! ZSTDv06_decompress() :
+ `compressedSize` : is the _exact_ size of the compressed blob, otherwise decompression will fail.
+ `dstCapacity` must be large enough, equal or larger than originalSize.
+ @return : the number of bytes decompressed into `dst` (<= `dstCapacity`),
+ or an errorCode if it fails (which can be tested using ZSTDv06_isError()) */
+ZSTDLIBv06_API size_t ZSTDv06_decompress( void* dst, size_t dstCapacity,
+ const void* src, size_t compressedSize);
+
+/**
+ZSTDv06_findFrameSizeInfoLegacy() : get the source length and decompressed bound of a ZSTD frame compliant with v0.6.x format
+ srcSize : The size of the 'src' buffer, at least as large as the frame pointed to by 'src'
+ cSize (output parameter) : the number of bytes that would be read to decompress this frame
+ or an error code if it fails (which can be tested using ZSTDv06_isError())
+ dBound (output parameter) : an upper-bound for the decompressed size of the data in the frame
+ or ZSTD_CONTENTSIZE_ERROR if an error occurs
+
+ note : assumes `cSize` and `dBound` are _not_ NULL.
+*/
+void ZSTDv06_findFrameSizeInfoLegacy(const void *src, size_t srcSize,
+ size_t* cSize, unsigned long long* dBound);
+
+/* *************************************
+* Helper functions
+***************************************/
+ZSTDLIBv06_API size_t ZSTDv06_compressBound(size_t srcSize); /*!< maximum compressed size (worst case scenario) */
+
+/* Error Management */
+ZSTDLIBv06_API unsigned ZSTDv06_isError(size_t code); /*!< tells if a `size_t` function result is an error code */
+ZSTDLIBv06_API const char* ZSTDv06_getErrorName(size_t code); /*!< provides readable string for an error code */
+
+
+/* *************************************
+* Explicit memory management
+***************************************/
+/** Decompression context */
+typedef struct ZSTDv06_DCtx_s ZSTDv06_DCtx;
+ZSTDLIBv06_API ZSTDv06_DCtx* ZSTDv06_createDCtx(void);
+ZSTDLIBv06_API size_t ZSTDv06_freeDCtx(ZSTDv06_DCtx* dctx); /*!< @return : errorCode */
+
+/** ZSTDv06_decompressDCtx() :
+* Same as ZSTDv06_decompress(), but requires an already allocated ZSTDv06_DCtx (see ZSTDv06_createDCtx()) */
+ZSTDLIBv06_API size_t ZSTDv06_decompressDCtx(ZSTDv06_DCtx* ctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
+
+
+/*-***********************
+* Dictionary API
+*************************/
+/*! ZSTDv06_decompress_usingDict() :
+* Decompression using a pre-defined Dictionary content (see dictBuilder).
+* Dictionary must be identical to the one used during compression, otherwise regenerated data will be corrupted.
+* Note : dict can be NULL, in which case, it's equivalent to ZSTDv06_decompressDCtx() */
+ZSTDLIBv06_API size_t ZSTDv06_decompress_usingDict(ZSTDv06_DCtx* dctx,
+ void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize,
+ const void* dict,size_t dictSize);
+
+
+/*-************************
+* Advanced Streaming API
+***************************/
+struct ZSTDv06_frameParams_s { unsigned long long frameContentSize; unsigned windowLog; };
+typedef struct ZSTDv06_frameParams_s ZSTDv06_frameParams;
+
+ZSTDLIBv06_API size_t ZSTDv06_getFrameParams(ZSTDv06_frameParams* fparamsPtr, const void* src, size_t srcSize); /**< doesn't consume input */
+ZSTDLIBv06_API size_t ZSTDv06_decompressBegin_usingDict(ZSTDv06_DCtx* dctx, const void* dict, size_t dictSize);
+ZSTDLIBv06_API void ZSTDv06_copyDCtx(ZSTDv06_DCtx* dctx, const ZSTDv06_DCtx* preparedDCtx);
+
+ZSTDLIBv06_API size_t ZSTDv06_nextSrcSizeToDecompress(ZSTDv06_DCtx* dctx);
+ZSTDLIBv06_API size_t ZSTDv06_decompressContinue(ZSTDv06_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
+
+
+
+/* *************************************
+* ZBUFF API
+***************************************/
+
+typedef struct ZBUFFv06_DCtx_s ZBUFFv06_DCtx;
+ZSTDLIBv06_API ZBUFFv06_DCtx* ZBUFFv06_createDCtx(void);
+ZSTDLIBv06_API size_t ZBUFFv06_freeDCtx(ZBUFFv06_DCtx* dctx);
+
+ZSTDLIBv06_API size_t ZBUFFv06_decompressInit(ZBUFFv06_DCtx* dctx);
+ZSTDLIBv06_API size_t ZBUFFv06_decompressInitDictionary(ZBUFFv06_DCtx* dctx, const void* dict, size_t dictSize);
+
+ZSTDLIBv06_API size_t ZBUFFv06_decompressContinue(ZBUFFv06_DCtx* dctx,
+ void* dst, size_t* dstCapacityPtr,
+ const void* src, size_t* srcSizePtr);
+
+/*-***************************************************************************
+* Streaming decompression howto
+*
+* A ZBUFFv06_DCtx object is required to track streaming operations.
+* Use ZBUFFv06_createDCtx() and ZBUFFv06_freeDCtx() to create/release resources.
+* Use ZBUFFv06_decompressInit() to start a new decompression operation,
+* or ZBUFFv06_decompressInitDictionary() if decompression requires a dictionary.
+* Note that ZBUFFv06_DCtx objects can be re-init multiple times.
+*
+* Use ZBUFFv06_decompressContinue() repetitively to consume your input.
+* *srcSizePtr and *dstCapacityPtr can be any size.
+* The function will report how many bytes were read or written by modifying *srcSizePtr and *dstCapacityPtr.
+* Note that it may not consume the entire input, in which case it's up to the caller to present remaining input again.
+* The content of `dst` will be overwritten (up to *dstCapacityPtr) at each function call, so save its content if it matters, or change `dst`.
+* @return : a hint to preferred nb of bytes to use as input for next function call (it's only a hint, to help latency),
+* or 0 when a frame is completely decoded,
+* or an error code, which can be tested using ZBUFFv06_isError().
+*
+* Hint : recommended buffer sizes (not compulsory) : ZBUFFv06_recommendedDInSize() and ZBUFFv06_recommendedDOutSize()
+* output : ZBUFFv06_recommendedDOutSize== 128 KB; the block size is the internal unit, so it is always possible to write a full block once decoded.
+* input : ZBUFFv06_recommendedDInSize == 128KB + 3;
+* just follow indications from ZBUFFv06_decompressContinue() to minimize latency. It should always be <= 128 KB + 3 .
+* *******************************************************************************/
+
+
+/* *************************************
+* Tool functions
+***************************************/
+ZSTDLIBv06_API unsigned ZBUFFv06_isError(size_t errorCode);
+ZSTDLIBv06_API const char* ZBUFFv06_getErrorName(size_t errorCode);
+
+/** Functions below provide recommended buffer sizes for Compression or Decompression operations.
+* These sizes are just hints; they tend to offer better latency */
+ZSTDLIBv06_API size_t ZBUFFv06_recommendedDInSize(void);
+ZSTDLIBv06_API size_t ZBUFFv06_recommendedDOutSize(void);
+
+
+/*-*************************************
+* Constants
+***************************************/
+#define ZSTDv06_MAGICNUMBER 0xFD2FB526 /* v0.6 */
+
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* ZSTDv06_H */
+/**** ended inlining zstd_v06.h ****/
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 7)
+/**** start inlining zstd_v07.h ****/
+/*
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTDv07_H_235446
+#define ZSTDv07_H_235446
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/*====== Dependency ======*/
+#include <stddef.h> /* size_t */
+
+
+/*====== Export for Windows ======*/
+/*!
+* ZSTDv07_DLL_EXPORT :
+* Enable exporting of functions when building a Windows DLL
+*/
+#if defined(_WIN32) && defined(ZSTDv07_DLL_EXPORT) && (ZSTDv07_DLL_EXPORT==1)
+# define ZSTDLIBv07_API __declspec(dllexport)
+#else
+# define ZSTDLIBv07_API
+#endif
+
+
+/* *************************************
+* Simple API
+***************************************/
+/*! ZSTDv07_getDecompressedSize() :
+* @return : decompressed size if known, 0 otherwise.
+ note 1 : if `0`, follow up with ZSTDv07_getFrameParams() to know precise failure cause.
+ note 2 : decompressed size could be wrong or intentionally modified !
+ always ensure results fit within application's authorized limits */
+unsigned long long ZSTDv07_getDecompressedSize(const void* src, size_t srcSize);
+
+/*! ZSTDv07_decompress() :
+ `compressedSize` : must be _exact_ size of compressed input, otherwise decompression will fail.
+ `dstCapacity` must be equal or larger than originalSize.
+ @return : the number of bytes decompressed into `dst` (<= `dstCapacity`),
+ or an errorCode if it fails (which can be tested using ZSTDv07_isError()) */
+ZSTDLIBv07_API size_t ZSTDv07_decompress( void* dst, size_t dstCapacity,
+ const void* src, size_t compressedSize);
+
+/**
+ZSTDv07_findFrameSizeInfoLegacy() : get the source length and decompressed bound of a ZSTD frame compliant with v0.7.x format
+ srcSize : The size of the 'src' buffer, at least as large as the frame pointed to by 'src'
+ cSize (output parameter) : the number of bytes that would be read to decompress this frame
+ or an error code if it fails (which can be tested using ZSTDv07_isError())
+ dBound (output parameter) : an upper-bound for the decompressed size of the data in the frame
+ or ZSTD_CONTENTSIZE_ERROR if an error occurs
+
+ note : assumes `cSize` and `dBound` are _not_ NULL.
+*/
+void ZSTDv07_findFrameSizeInfoLegacy(const void *src, size_t srcSize,
+ size_t* cSize, unsigned long long* dBound);
+
+/*====== Helper functions ======*/
+ZSTDLIBv07_API unsigned ZSTDv07_isError(size_t code); /*!< tells if a `size_t` function result is an error code */
+ZSTDLIBv07_API const char* ZSTDv07_getErrorName(size_t code); /*!< provides readable string from an error code */
+
+
+/*-*************************************
+* Explicit memory management
+***************************************/
+/** Decompression context */
+typedef struct ZSTDv07_DCtx_s ZSTDv07_DCtx;
+ZSTDLIBv07_API ZSTDv07_DCtx* ZSTDv07_createDCtx(void);
+ZSTDLIBv07_API size_t ZSTDv07_freeDCtx(ZSTDv07_DCtx* dctx); /*!< @return : errorCode */
+
+/** ZSTDv07_decompressDCtx() :
+* Same as ZSTDv07_decompress(), requires an allocated ZSTDv07_DCtx (see ZSTDv07_createDCtx()) */
+ZSTDLIBv07_API size_t ZSTDv07_decompressDCtx(ZSTDv07_DCtx* ctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
+
+
+/*-************************
+* Simple dictionary API
+***************************/
+/*! ZSTDv07_decompress_usingDict() :
+* Decompression using a pre-defined Dictionary content (see dictBuilder).
+* Dictionary must be identical to the one used during compression.
+* Note : This function loads the dictionary, resulting in a significant startup time */
+ZSTDLIBv07_API size_t ZSTDv07_decompress_usingDict(ZSTDv07_DCtx* dctx,
+ void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize,
+ const void* dict,size_t dictSize);
+
+
+/*-**************************
+* Advanced Dictionary API
+****************************/
+/*! ZSTDv07_createDDict() :
+* Create a digested dictionary, ready to start decompression operation without startup delay.
+* `dict` can be released after creation */
+typedef struct ZSTDv07_DDict_s ZSTDv07_DDict;
+ZSTDLIBv07_API ZSTDv07_DDict* ZSTDv07_createDDict(const void* dict, size_t dictSize);
+ZSTDLIBv07_API size_t ZSTDv07_freeDDict(ZSTDv07_DDict* ddict);
+
+/*! ZSTDv07_decompress_usingDDict() :
+* Decompression using a pre-digested Dictionary
+* Faster startup than ZSTDv07_decompress_usingDict(), recommended when same dictionary is used multiple times. */
+ZSTDLIBv07_API size_t ZSTDv07_decompress_usingDDict(ZSTDv07_DCtx* dctx,
+ void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize,
+ const ZSTDv07_DDict* ddict);
+
+typedef struct {
+ unsigned long long frameContentSize;
+ unsigned windowSize;
+ unsigned dictID;
+ unsigned checksumFlag;
+} ZSTDv07_frameParams;
+
+ZSTDLIBv07_API size_t ZSTDv07_getFrameParams(ZSTDv07_frameParams* fparamsPtr, const void* src, size_t srcSize); /**< doesn't consume input */
+
+
+
+
+/* *************************************
+* Streaming functions
+***************************************/
+typedef struct ZBUFFv07_DCtx_s ZBUFFv07_DCtx;
+ZSTDLIBv07_API ZBUFFv07_DCtx* ZBUFFv07_createDCtx(void);
+ZSTDLIBv07_API size_t ZBUFFv07_freeDCtx(ZBUFFv07_DCtx* dctx);
+
+ZSTDLIBv07_API size_t ZBUFFv07_decompressInit(ZBUFFv07_DCtx* dctx);
+ZSTDLIBv07_API size_t ZBUFFv07_decompressInitDictionary(ZBUFFv07_DCtx* dctx, const void* dict, size_t dictSize);
+
+ZSTDLIBv07_API size_t ZBUFFv07_decompressContinue(ZBUFFv07_DCtx* dctx,
+ void* dst, size_t* dstCapacityPtr,
+ const void* src, size_t* srcSizePtr);
+
+/*-***************************************************************************
+* Streaming decompression howto
+*
+* A ZBUFFv07_DCtx object is required to track streaming operations.
+* Use ZBUFFv07_createDCtx() and ZBUFFv07_freeDCtx() to create/release resources.
+* Use ZBUFFv07_decompressInit() to start a new decompression operation,
+* or ZBUFFv07_decompressInitDictionary() if decompression requires a dictionary.
+* Note that ZBUFFv07_DCtx objects can be re-init multiple times.
+*
+* Use ZBUFFv07_decompressContinue() repetitively to consume your input.
+* *srcSizePtr and *dstCapacityPtr can be any size.
+* The function will report how many bytes were read or written by modifying *srcSizePtr and *dstCapacityPtr.
+* Note that it may not consume the entire input, in which case it's up to the caller to present remaining input again.
+* The content of `dst` will be overwritten (up to *dstCapacityPtr) at each function call, so save its content if it matters, or change `dst`.
+* @return : a hint to preferred nb of bytes to use as input for next function call (it's only a hint, to help latency),
+* or 0 when a frame is completely decoded,
+* or an error code, which can be tested using ZBUFFv07_isError().
+*
+* Hint : recommended buffer sizes (not compulsory) : ZBUFFv07_recommendedDInSize() and ZBUFFv07_recommendedDOutSize()
+* output : ZBUFFv07_recommendedDOutSize== 128 KB; the block size is the internal unit, so it is always possible to write a full block once decoded.
+* input : ZBUFFv07_recommendedDInSize == 128KB + 3;
+* just follow indications from ZBUFFv07_decompressContinue() to minimize latency. It should always be <= 128 KB + 3 .
+* *******************************************************************************/
+
+
+/* *************************************
+* Tool functions
+***************************************/
+ZSTDLIBv07_API unsigned ZBUFFv07_isError(size_t errorCode);
+ZSTDLIBv07_API const char* ZBUFFv07_getErrorName(size_t errorCode);
+
+/** Functions below provide recommended buffer sizes for Compression or Decompression operations.
+* These sizes are just hints; they tend to offer better latency */
+ZSTDLIBv07_API size_t ZBUFFv07_recommendedDInSize(void);
+ZSTDLIBv07_API size_t ZBUFFv07_recommendedDOutSize(void);
+
+
+/*-*************************************
+* Constants
+***************************************/
+#define ZSTDv07_MAGICNUMBER 0xFD2FB527 /* v0.7 */
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* ZSTDv07_H_235446 */
+/**** ended inlining zstd_v07.h ****/
+#endif
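+
+/* Illustrative sketch (not upstream zstd code) of the v0.7 pre-digested dictionary flow
+ * documented above: digest the dictionary once into a ZSTDv07_DDict, reuse it for as many
+ * frames as needed, then release both objects. Names are illustrative only; guarded by
+ * `#if 0` so it is never compiled. */
+#if 0
+static size_t example_v07_decompressWithDDict(void* dst, size_t dstCapacity,
+                                              const void* src, size_t srcSize,
+                                              const void* dict, size_t dictSize)
+{
+    ZSTDv07_DCtx* const dctx = ZSTDv07_createDCtx();
+    ZSTDv07_DDict* const ddict = ZSTDv07_createDDict(dict, dictSize);   /* digest once, reuse many times */
+    size_t result = (size_t)(-1);                      /* allocation failure by default */
+    if (dctx != NULL && ddict != NULL)
+        result = ZSTDv07_decompress_usingDDict(dctx, dst, dstCapacity, src, srcSize, ddict);
+    if (ddict != NULL) ZSTDv07_freeDDict(ddict);
+    if (dctx != NULL) ZSTDv07_freeDCtx(dctx);
+    return result;                                     /* test with ZSTDv07_isError() */
+}
+#endif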
+
+/** ZSTD_isLegacy() :
+   @return : the legacy format version (> 0) if the frame uses a supported legacy format,
+             0 otherwise.
+*/
+MEM_STATIC unsigned ZSTD_isLegacy(const void* src, size_t srcSize)
+{
+ U32 magicNumberLE;
+ if (srcSize<4) return 0;
+ magicNumberLE = MEM_readLE32(src);
+ switch(magicNumberLE)
+ {
+#if (ZSTD_LEGACY_SUPPORT <= 1)
+ case ZSTDv01_magicNumberLE:return 1;
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 2)
+ case ZSTDv02_magicNumber : return 2;
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 3)
+ case ZSTDv03_magicNumber : return 3;
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 4)
+ case ZSTDv04_magicNumber : return 4;
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 5)
+ case ZSTDv05_MAGICNUMBER : return 5;
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 6)
+ case ZSTDv06_MAGICNUMBER : return 6;
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 7)
+ case ZSTDv07_MAGICNUMBER : return 7;
+#endif
+ default : return 0;
+ }
+}
+
+
+MEM_STATIC unsigned long long ZSTD_getDecompressedSize_legacy(const void* src, size_t srcSize)
+{
+ U32 const version = ZSTD_isLegacy(src, srcSize);
+ if (version < 5) return 0; /* no decompressed size in frame header, or not a legacy format */
+#if (ZSTD_LEGACY_SUPPORT <= 5)
+ if (version==5) {
+ ZSTDv05_parameters fParams;
+ size_t const frResult = ZSTDv05_getFrameParams(&fParams, src, srcSize);
+ if (frResult != 0) return 0;
+ return fParams.srcSize;
+ }
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 6)
+ if (version==6) {
+ ZSTDv06_frameParams fParams;
+ size_t const frResult = ZSTDv06_getFrameParams(&fParams, src, srcSize);
+ if (frResult != 0) return 0;
+ return fParams.frameContentSize;
+ }
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 7)
+ if (version==7) {
+ ZSTDv07_frameParams fParams;
+ size_t const frResult = ZSTDv07_getFrameParams(&fParams, src, srcSize);
+ if (frResult != 0) return 0;
+ return fParams.frameContentSize;
+ }
+#endif
+ return 0; /* should not be possible */
+}
+
+
+MEM_STATIC size_t ZSTD_decompressLegacy(
+ void* dst, size_t dstCapacity,
+ const void* src, size_t compressedSize,
+ const void* dict,size_t dictSize)
+{
+ U32 const version = ZSTD_isLegacy(src, compressedSize);
+ (void)dst; (void)dstCapacity; (void)dict; (void)dictSize; /* unused when ZSTD_LEGACY_SUPPORT >= 8 */
+ switch(version)
+ {
+#if (ZSTD_LEGACY_SUPPORT <= 1)
+ case 1 :
+ return ZSTDv01_decompress(dst, dstCapacity, src, compressedSize);
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 2)
+ case 2 :
+ return ZSTDv02_decompress(dst, dstCapacity, src, compressedSize);
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 3)
+ case 3 :
+ return ZSTDv03_decompress(dst, dstCapacity, src, compressedSize);
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 4)
+ case 4 :
+ return ZSTDv04_decompress(dst, dstCapacity, src, compressedSize);
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 5)
+ case 5 :
+ { size_t result;
+ ZSTDv05_DCtx* const zd = ZSTDv05_createDCtx();
+ if (zd==NULL) return ERROR(memory_allocation);
+ result = ZSTDv05_decompress_usingDict(zd, dst, dstCapacity, src, compressedSize, dict, dictSize);
+ ZSTDv05_freeDCtx(zd);
+ return result;
+ }
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 6)
+ case 6 :
+ { size_t result;
+ ZSTDv06_DCtx* const zd = ZSTDv06_createDCtx();
+ if (zd==NULL) return ERROR(memory_allocation);
+ result = ZSTDv06_decompress_usingDict(zd, dst, dstCapacity, src, compressedSize, dict, dictSize);
+ ZSTDv06_freeDCtx(zd);
+ return result;
+ }
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 7)
+ case 7 :
+ { size_t result;
+ ZSTDv07_DCtx* const zd = ZSTDv07_createDCtx();
+ if (zd==NULL) return ERROR(memory_allocation);
+ result = ZSTDv07_decompress_usingDict(zd, dst, dstCapacity, src, compressedSize, dict, dictSize);
+ ZSTDv07_freeDCtx(zd);
+ return result;
+ }
+#endif
+ default :
+ return ERROR(prefix_unknown);
+ }
+}
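+
+/* Illustrative sketch (not upstream zstd code) tying the two helpers above together:
+ * probe the magic number with ZSTD_isLegacy(), send legacy frames to
+ * ZSTD_decompressLegacy(), and hand everything else to a caller-supplied modern path.
+ * `decompressModern` is a hypothetical callback; guarded by `#if 0` so it is never
+ * compiled. */
+#if 0
+static size_t example_dispatchLegacy(void* dst, size_t dstCapacity,
+                                     const void* src, size_t srcSize,
+                                     size_t (*decompressModern)(void*, size_t, const void*, size_t))
+{
+    unsigned const legacyVersion = ZSTD_isLegacy(src, srcSize);    /* 0, or 1..7 */
+    if (legacyVersion)
+        return ZSTD_decompressLegacy(dst, dstCapacity, src, srcSize, NULL, 0);  /* no dictionary */
+    return decompressModern(dst, dstCapacity, src, srcSize);
+}
+#endif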
+
+MEM_STATIC ZSTD_frameSizeInfo ZSTD_findFrameSizeInfoLegacy(const void *src, size_t srcSize)
+{
+ ZSTD_frameSizeInfo frameSizeInfo;
+ U32 const version = ZSTD_isLegacy(src, srcSize);
+ switch(version)
+ {
+#if (ZSTD_LEGACY_SUPPORT <= 1)
+ case 1 :
+ ZSTDv01_findFrameSizeInfoLegacy(src, srcSize,
+ &frameSizeInfo.compressedSize,
+ &frameSizeInfo.decompressedBound);
+ break;
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 2)
+ case 2 :
+ ZSTDv02_findFrameSizeInfoLegacy(src, srcSize,
+ &frameSizeInfo.compressedSize,
+ &frameSizeInfo.decompressedBound);
+ break;
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 3)
+ case 3 :
+ ZSTDv03_findFrameSizeInfoLegacy(src, srcSize,
+ &frameSizeInfo.compressedSize,
+ &frameSizeInfo.decompressedBound);
+ break;
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 4)
+ case 4 :
+ ZSTDv04_findFrameSizeInfoLegacy(src, srcSize,
+ &frameSizeInfo.compressedSize,
+ &frameSizeInfo.decompressedBound);
+ break;
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 5)
+ case 5 :
+ ZSTDv05_findFrameSizeInfoLegacy(src, srcSize,
+ &frameSizeInfo.compressedSize,
+ &frameSizeInfo.decompressedBound);
+ break;
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 6)
+ case 6 :
+ ZSTDv06_findFrameSizeInfoLegacy(src, srcSize,
+ &frameSizeInfo.compressedSize,
+ &frameSizeInfo.decompressedBound);
+ break;
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 7)
+ case 7 :
+ ZSTDv07_findFrameSizeInfoLegacy(src, srcSize,
+ &frameSizeInfo.compressedSize,
+ &frameSizeInfo.decompressedBound);
+ break;
+#endif
+ default :
+ frameSizeInfo.compressedSize = ERROR(prefix_unknown);
+ frameSizeInfo.decompressedBound = ZSTD_CONTENTSIZE_ERROR;
+ break;
+ }
+ if (!ZSTD_isError(frameSizeInfo.compressedSize) && frameSizeInfo.compressedSize > srcSize) {
+ frameSizeInfo.compressedSize = ERROR(srcSize_wrong);
+ frameSizeInfo.decompressedBound = ZSTD_CONTENTSIZE_ERROR;
+ }
+ return frameSizeInfo;
+}
+
+MEM_STATIC size_t ZSTD_findFrameCompressedSizeLegacy(const void *src, size_t srcSize)
+{
+ ZSTD_frameSizeInfo frameSizeInfo = ZSTD_findFrameSizeInfoLegacy(src, srcSize);
+ return frameSizeInfo.compressedSize;
+}
+
+MEM_STATIC size_t ZSTD_freeLegacyStreamContext(void* legacyContext, U32 version)
+{
+ switch(version)
+ {
+ default :
+ case 1 :
+ case 2 :
+ case 3 :
+ (void)legacyContext;
+ return ERROR(version_unsupported);
+#if (ZSTD_LEGACY_SUPPORT <= 4)
+ case 4 : return ZBUFFv04_freeDCtx((ZBUFFv04_DCtx*)legacyContext);
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 5)
+ case 5 : return ZBUFFv05_freeDCtx((ZBUFFv05_DCtx*)legacyContext);
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 6)
+ case 6 : return ZBUFFv06_freeDCtx((ZBUFFv06_DCtx*)legacyContext);
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 7)
+ case 7 : return ZBUFFv07_freeDCtx((ZBUFFv07_DCtx*)legacyContext);
+#endif
+ }
+}
+
+
+MEM_STATIC size_t ZSTD_initLegacyStream(void** legacyContext, U32 prevVersion, U32 newVersion,
+ const void* dict, size_t dictSize)
+{
+ DEBUGLOG(5, "ZSTD_initLegacyStream for v0.%u", newVersion);
+ if (prevVersion != newVersion) ZSTD_freeLegacyStreamContext(*legacyContext, prevVersion);
+ switch(newVersion)
+ {
+ default :
+ case 1 :
+ case 2 :
+ case 3 :
+ (void)dict; (void)dictSize;
+ return 0;
+#if (ZSTD_LEGACY_SUPPORT <= 4)
+ case 4 :
+ {
+ ZBUFFv04_DCtx* dctx = (prevVersion != newVersion) ? ZBUFFv04_createDCtx() : (ZBUFFv04_DCtx*)*legacyContext;
+ if (dctx==NULL) return ERROR(memory_allocation);
+ ZBUFFv04_decompressInit(dctx);
+ ZBUFFv04_decompressWithDictionary(dctx, dict, dictSize);
+ *legacyContext = dctx;
+ return 0;
+ }
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 5)
+ case 5 :
+ {
+ ZBUFFv05_DCtx* dctx = (prevVersion != newVersion) ? ZBUFFv05_createDCtx() : (ZBUFFv05_DCtx*)*legacyContext;
+ if (dctx==NULL) return ERROR(memory_allocation);
+ ZBUFFv05_decompressInitDictionary(dctx, dict, dictSize);
+ *legacyContext = dctx;
+ return 0;
+ }
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 6)
+ case 6 :
+ {
+ ZBUFFv06_DCtx* dctx = (prevVersion != newVersion) ? ZBUFFv06_createDCtx() : (ZBUFFv06_DCtx*)*legacyContext;
+ if (dctx==NULL) return ERROR(memory_allocation);
+ ZBUFFv06_decompressInitDictionary(dctx, dict, dictSize);
+ *legacyContext = dctx;
+ return 0;
+ }
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 7)
+ case 7 :
+ {
+ ZBUFFv07_DCtx* dctx = (prevVersion != newVersion) ? ZBUFFv07_createDCtx() : (ZBUFFv07_DCtx*)*legacyContext;
+ if (dctx==NULL) return ERROR(memory_allocation);
+ ZBUFFv07_decompressInitDictionary(dctx, dict, dictSize);
+ *legacyContext = dctx;
+ return 0;
+ }
+#endif
+ }
+}
+
+
+
+MEM_STATIC size_t ZSTD_decompressLegacyStream(void* legacyContext, U32 version,
+ ZSTD_outBuffer* output, ZSTD_inBuffer* input)
+{
+ DEBUGLOG(5, "ZSTD_decompressLegacyStream for v0.%u", version);
+ switch(version)
+ {
+ default :
+ case 1 :
+ case 2 :
+ case 3 :
+ (void)legacyContext; (void)output; (void)input;
+ return ERROR(version_unsupported);
+#if (ZSTD_LEGACY_SUPPORT <= 4)
+ case 4 :
+ {
+ ZBUFFv04_DCtx* dctx = (ZBUFFv04_DCtx*) legacyContext;
+ const void* src = (const char*)input->src + input->pos;
+ size_t readSize = input->size - input->pos;
+ void* dst = (char*)output->dst + output->pos;
+ size_t decodedSize = output->size - output->pos;
+ size_t const hintSize = ZBUFFv04_decompressContinue(dctx, dst, &decodedSize, src, &readSize);
+ output->pos += decodedSize;
+ input->pos += readSize;
+ return hintSize;
+ }
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 5)
+ case 5 :
+ {
+ ZBUFFv05_DCtx* dctx = (ZBUFFv05_DCtx*) legacyContext;
+ const void* src = (const char*)input->src + input->pos;
+ size_t readSize = input->size - input->pos;
+ void* dst = (char*)output->dst + output->pos;
+ size_t decodedSize = output->size - output->pos;
+ size_t const hintSize = ZBUFFv05_decompressContinue(dctx, dst, &decodedSize, src, &readSize);
+ output->pos += decodedSize;
+ input->pos += readSize;
+ return hintSize;
+ }
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 6)
+ case 6 :
+ {
+ ZBUFFv06_DCtx* dctx = (ZBUFFv06_DCtx*) legacyContext;
+ const void* src = (const char*)input->src + input->pos;
+ size_t readSize = input->size - input->pos;
+ void* dst = (char*)output->dst + output->pos;
+ size_t decodedSize = output->size - output->pos;
+ size_t const hintSize = ZBUFFv06_decompressContinue(dctx, dst, &decodedSize, src, &readSize);
+ output->pos += decodedSize;
+ input->pos += readSize;
+ return hintSize;
+ }
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 7)
+ case 7 :
+ {
+ ZBUFFv07_DCtx* dctx = (ZBUFFv07_DCtx*) legacyContext;
+ const void* src = (const char*)input->src + input->pos;
+ size_t readSize = input->size - input->pos;
+ void* dst = (char*)output->dst + output->pos;
+ size_t decodedSize = output->size - output->pos;
+ size_t const hintSize = ZBUFFv07_decompressContinue(dctx, dst, &decodedSize, src, &readSize);
+ output->pos += decodedSize;
+ input->pos += readSize;
+ return hintSize;
+ }
+#endif
+ }
+}
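+
+/* Illustrative caller-side loop (not upstream zstd code) for the two stream helpers above:
+ * initialize a legacy context for the detected version, then pump ZSTD_inBuffer /
+ * ZSTD_outBuffer pairs through ZSTD_decompressLegacyStream() until the hint reaches 0.
+ * Error handling and no-progress protection are intentionally minimal; guarded by `#if 0`
+ * so it is never compiled. */
+#if 0
+static size_t example_legacyStreamDecompress(void* dst, size_t dstCapacity,
+                                             const void* src, size_t srcSize)
+{
+    void* legacyContext = NULL;
+    U32 const version = ZSTD_isLegacy(src, srcSize);
+    ZSTD_inBuffer input = { src, srcSize, 0 };
+    ZSTD_outBuffer output = { dst, dstCapacity, 0 };
+    if (version < 4) return ERROR(version_unsupported);             /* streaming needs v0.4+ */
+    {   size_t const initResult = ZSTD_initLegacyStream(&legacyContext, 0, version, NULL, 0);
+        if (ZSTD_isError(initResult)) return initResult;
+    }
+    while (input.pos < input.size) {
+        size_t const hint = ZSTD_decompressLegacyStream(legacyContext, version, &output, &input);
+        if (ZSTD_isError(hint)) { ZSTD_freeLegacyStreamContext(legacyContext, version); return hint; }
+        if (hint == 0) break;                                       /* frame completely decoded */
+    }
+    {   size_t const written = output.pos;
+        ZSTD_freeLegacyStreamContext(legacyContext, version);
+        return written;
+    }
+}
+#endif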
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* ZSTD_LEGACY_H */
+/**** ended inlining ../legacy/zstd_legacy.h ****/
+#endif
+
+
+
+/*-*******************************************************
+* Types
+*********************************************************/
+struct ZSTD_DDict_s {
+ void* dictBuffer;
+ const void* dictContent;
+ size_t dictSize;
+ ZSTD_entropyDTables_t entropy;
+ U32 dictID;
+ U32 entropyPresent;
+ ZSTD_customMem cMem;
+}; /* typedef'd to ZSTD_DDict within "zstd.h" */
+
+const void* ZSTD_DDict_dictContent(const ZSTD_DDict* ddict)
+{
+ assert(ddict != NULL);
+ return ddict->dictContent;
+}
+
+size_t ZSTD_DDict_dictSize(const ZSTD_DDict* ddict)
+{
+ assert(ddict != NULL);
+ return ddict->dictSize;
+}
+
+void ZSTD_copyDDictParameters(ZSTD_DCtx* dctx, const ZSTD_DDict* ddict)
+{
+ DEBUGLOG(4, "ZSTD_copyDDictParameters");
+ assert(dctx != NULL);
+ assert(ddict != NULL);
+ dctx->dictID = ddict->dictID;
+ dctx->prefixStart = ddict->dictContent;
+ dctx->virtualStart = ddict->dictContent;
+ dctx->dictEnd = (const BYTE*)ddict->dictContent + ddict->dictSize;
+ dctx->previousDstEnd = dctx->dictEnd;
+#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
+ dctx->dictContentBeginForFuzzing = dctx->prefixStart;
+ dctx->dictContentEndForFuzzing = dctx->previousDstEnd;
+#endif
+ if (ddict->entropyPresent) {
+ dctx->litEntropy = 1;
+ dctx->fseEntropy = 1;
+ dctx->LLTptr = ddict->entropy.LLTable;
+ dctx->MLTptr = ddict->entropy.MLTable;
+ dctx->OFTptr = ddict->entropy.OFTable;
+ dctx->HUFptr = ddict->entropy.hufTable;
+ dctx->entropy.rep[0] = ddict->entropy.rep[0];
+ dctx->entropy.rep[1] = ddict->entropy.rep[1];
+ dctx->entropy.rep[2] = ddict->entropy.rep[2];
+ } else {
+ dctx->litEntropy = 0;
+ dctx->fseEntropy = 0;
+ }
+}
+
+
+static size_t
+ZSTD_loadEntropy_intoDDict(ZSTD_DDict* ddict,
+ ZSTD_dictContentType_e dictContentType)
+{
+ ddict->dictID = 0;
+ ddict->entropyPresent = 0;
+ if (dictContentType == ZSTD_dct_rawContent) return 0;
+
+ if (ddict->dictSize < 8) {
+ if (dictContentType == ZSTD_dct_fullDict)
+ return ERROR(dictionary_corrupted); /* only accept specified dictionaries */
+ return 0; /* pure content mode */
+ }
+ { U32 const magic = MEM_readLE32(ddict->dictContent);
+ if (magic != ZSTD_MAGIC_DICTIONARY) {
+ if (dictContentType == ZSTD_dct_fullDict)
+ return ERROR(dictionary_corrupted); /* only accept specified dictionaries */
+ return 0; /* pure content mode */
+ }
+ }
+ ddict->dictID = MEM_readLE32((const char*)ddict->dictContent + ZSTD_FRAMEIDSIZE);
+
+ /* load entropy tables */
+ RETURN_ERROR_IF(ZSTD_isError(ZSTD_loadDEntropy(
+ &ddict->entropy, ddict->dictContent, ddict->dictSize)),
+ dictionary_corrupted, "");
+ ddict->entropyPresent = 1;
+ return 0;
+}
+
+
+static size_t ZSTD_initDDict_internal(ZSTD_DDict* ddict,
+ const void* dict, size_t dictSize,
+ ZSTD_dictLoadMethod_e dictLoadMethod,
+ ZSTD_dictContentType_e dictContentType)
+{
+ if ((dictLoadMethod == ZSTD_dlm_byRef) || (!dict) || (!dictSize)) {
+ ddict->dictBuffer = NULL;
+ ddict->dictContent = dict;
+ if (!dict) dictSize = 0;
+ } else {
+ void* const internalBuffer = ZSTD_malloc(dictSize, ddict->cMem);
+ ddict->dictBuffer = internalBuffer;
+ ddict->dictContent = internalBuffer;
+ if (!internalBuffer) return ERROR(memory_allocation);
+ memcpy(internalBuffer, dict, dictSize);
+ }
+ ddict->dictSize = dictSize;
+ ddict->entropy.hufTable[0] = (HUF_DTable)((HufLog)*0x1000001); /* cover both little and big endian */
+
+ /* parse dictionary content */
+ FORWARD_IF_ERROR( ZSTD_loadEntropy_intoDDict(ddict, dictContentType) , "");
+
+ return 0;
+}
+
+ZSTD_DDict* ZSTD_createDDict_advanced(const void* dict, size_t dictSize,
+ ZSTD_dictLoadMethod_e dictLoadMethod,
+ ZSTD_dictContentType_e dictContentType,
+ ZSTD_customMem customMem)
+{
+ if (!customMem.customAlloc ^ !customMem.customFree) return NULL;
+
+ { ZSTD_DDict* const ddict = (ZSTD_DDict*) ZSTD_malloc(sizeof(ZSTD_DDict), customMem);
+ if (ddict == NULL) return NULL;
+ ddict->cMem = customMem;
+ { size_t const initResult = ZSTD_initDDict_internal(ddict,
+ dict, dictSize,
+ dictLoadMethod, dictContentType);
+ if (ZSTD_isError(initResult)) {
+ ZSTD_freeDDict(ddict);
+ return NULL;
+ } }
+ return ddict;
+ }
+}
+
+/*! ZSTD_createDDict() :
+* Create a digested dictionary, to start decompression without startup delay.
+* `dict` content is copied inside DDict.
+* Consequently, `dict` can be released after `ZSTD_DDict` creation */
+ZSTD_DDict* ZSTD_createDDict(const void* dict, size_t dictSize)
+{
+ ZSTD_customMem const allocator = { NULL, NULL, NULL };
+ return ZSTD_createDDict_advanced(dict, dictSize, ZSTD_dlm_byCopy, ZSTD_dct_auto, allocator);
+}
+
+/*! ZSTD_createDDict_byReference() :
+ * Create a digested dictionary, to start decompression without startup delay.
+ * Dictionary content is simply referenced; it will be accessed during decompression.
+ * Warning : dictBuffer must outlive DDict (DDict must be freed before dictBuffer) */
+ZSTD_DDict* ZSTD_createDDict_byReference(const void* dictBuffer, size_t dictSize)
+{
+ ZSTD_customMem const allocator = { NULL, NULL, NULL };
+ return ZSTD_createDDict_advanced(dictBuffer, dictSize, ZSTD_dlm_byRef, ZSTD_dct_auto, allocator);
+}
+
+
+const ZSTD_DDict* ZSTD_initStaticDDict(
+ void* sBuffer, size_t sBufferSize,
+ const void* dict, size_t dictSize,
+ ZSTD_dictLoadMethod_e dictLoadMethod,
+ ZSTD_dictContentType_e dictContentType)
+{
+ size_t const neededSpace = sizeof(ZSTD_DDict)
+ + (dictLoadMethod == ZSTD_dlm_byRef ? 0 : dictSize);
+ ZSTD_DDict* const ddict = (ZSTD_DDict*)sBuffer;
+ assert(sBuffer != NULL);
+ assert(dict != NULL);
+ if ((size_t)sBuffer & 7) return NULL; /* 8-aligned */
+ if (sBufferSize < neededSpace) return NULL;
+ if (dictLoadMethod == ZSTD_dlm_byCopy) {
+ memcpy(ddict+1, dict, dictSize); /* local copy */
+ dict = ddict+1;
+ }
+ if (ZSTD_isError( ZSTD_initDDict_internal(ddict,
+ dict, dictSize,
+ ZSTD_dlm_byRef, dictContentType) ))
+ return NULL;
+ return ddict;
+}
+
+
+size_t ZSTD_freeDDict(ZSTD_DDict* ddict)
+{
+ if (ddict==NULL) return 0; /* support free on NULL */
+ { ZSTD_customMem const cMem = ddict->cMem;
+ ZSTD_free(ddict->dictBuffer, cMem);
+ ZSTD_free(ddict, cMem);
+ return 0;
+ }
+}
+
+/*! ZSTD_estimateDDictSize() :
+ * Estimate amount of memory that will be needed to create a dictionary for decompression.
+ * Note : dictionaries created by reference using ZSTD_dlm_byRef are smaller */
+size_t ZSTD_estimateDDictSize(size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod)
+{
+ return sizeof(ZSTD_DDict) + (dictLoadMethod == ZSTD_dlm_byRef ? 0 : dictSize);
+}
+
+size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict)
+{
+ if (ddict==NULL) return 0; /* support sizeof on NULL */
+ return sizeof(*ddict) + (ddict->dictBuffer ? ddict->dictSize : 0) ;
+}
+
+/*! ZSTD_getDictID_fromDDict() :
+ * Provides the dictID of the dictionary loaded into `ddict`.
+ * If @return == 0, the dictionary is not conformant to Zstandard specification, or empty.
+ * Non-conformant dictionaries can still be loaded, but as content-only dictionaries. */
+unsigned ZSTD_getDictID_fromDDict(const ZSTD_DDict* ddict)
+{
+ if (ddict==NULL) return 0;
+ return ZSTD_getDictID_fromDict(ddict->dictContent, ddict->dictSize);
+}
+/**** ended inlining decompress/zstd_ddict.c ****/
+/**** start inlining decompress/zstd_decompress.c ****/
+/*
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+
+/* ***************************************************************
+* Tuning parameters
+*****************************************************************/
+/*!
+ * HEAPMODE :
+ * Select how default decompression function ZSTD_decompress() allocates its context,
+ * on stack (0), or into heap (1, default; requires malloc()).
+ * Note that functions with explicit context such as ZSTD_decompressDCtx() are unaffected.
+ */
+#ifndef ZSTD_HEAPMODE
+# define ZSTD_HEAPMODE 1
+#endif
+
+/*!
+* LEGACY_SUPPORT :
+* if set to 1+, ZSTD_decompress() can decode older formats (v0.1+)
+*/
+#ifndef ZSTD_LEGACY_SUPPORT
+# define ZSTD_LEGACY_SUPPORT 0
+#endif
+
+/*!
+ * MAXWINDOWSIZE_DEFAULT :
+ * maximum window size accepted by DStream __by default__.
+ * Frames requiring more memory will be rejected.
+ * It's possible to set a different limit using ZSTD_DCtx_setMaxWindowSize().
+ */
+#ifndef ZSTD_MAXWINDOWSIZE_DEFAULT
+# define ZSTD_MAXWINDOWSIZE_DEFAULT (((U32)1 << ZSTD_WINDOWLOG_LIMIT_DEFAULT) + 1)
+#endif
+
+/*!
+ * NO_FORWARD_PROGRESS_MAX :
+ * maximum allowed nb of calls to ZSTD_decompressStream()
+ * without any forward progress
+ * (defined as: no byte read from input, and no byte flushed to output)
+ * before triggering an error.
+ */
+#ifndef ZSTD_NO_FORWARD_PROGRESS_MAX
+# define ZSTD_NO_FORWARD_PROGRESS_MAX 16
+#endif
+
+
+/*-*******************************************************
+* Dependencies
+*********************************************************/
+#include <string.h> /* memcpy, memmove, memset */
+/**** skipping file: ../common/cpu.h ****/
+/**** skipping file: ../common/mem.h ****/
+#define FSE_STATIC_LINKING_ONLY
+/**** skipping file: ../common/fse.h ****/
+#define HUF_STATIC_LINKING_ONLY
+/**** skipping file: ../common/huf.h ****/
+/**** skipping file: ../common/zstd_internal.h ****/
+/**** skipping file: zstd_decompress_internal.h ****/
+/**** skipping file: zstd_ddict.h ****/
+/**** start inlining zstd_decompress_block.h ****/
+/*
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+
+#ifndef ZSTD_DEC_BLOCK_H
+#define ZSTD_DEC_BLOCK_H
+
+/*-*******************************************************
+ * Dependencies
+ *********************************************************/
+#include <stddef.h> /* size_t */
+/**** skipping file: ../zstd.h ****/
+/**** skipping file: ../common/zstd_internal.h ****/
+/**** skipping file: zstd_decompress_internal.h ****/
+
+
+/* === Prototypes === */
+
+/* note: prototypes already published within `zstd.h` :
+ * ZSTD_decompressBlock()
+ */
+
+/* note: prototypes already published within `zstd_internal.h` :
+ * ZSTD_getcBlockSize()
+ * ZSTD_decodeSeqHeaders()
+ */
+
+
+/* ZSTD_decompressBlock_internal() :
+ * decompress block, starting at `src`,
+ * into destination buffer `dst`.
+ * @return : decompressed block size,
+ * or an error code (which can be tested using ZSTD_isError())
+ */
+size_t ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
+ void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize, const int frame);
+
+/* ZSTD_buildFSETable() :
+ * generate FSE decoding table for one symbol (ll, ml or off)
+ * this function must be called with valid parameters only
+ * (dt is large enough, normalizedCounter distribution total is a power of 2, max is within range, etc.)
+ * in which case it cannot fail.
+ * Internal use only.
+ */
+void ZSTD_buildFSETable(ZSTD_seqSymbol* dt,
+ const short* normalizedCounter, unsigned maxSymbolValue,
+ const U32* baseValue, const U32* nbAdditionalBits,
+ unsigned tableLog);
+
+
+#endif /* ZSTD_DEC_BLOCK_H */
+/**** ended inlining zstd_decompress_block.h ****/
+
+#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT>=1)
+/**** skipping file: ../legacy/zstd_legacy.h ****/
+#endif
+
+
+/*-*************************************************************
+* Context management
+***************************************************************/
+size_t ZSTD_sizeof_DCtx (const ZSTD_DCtx* dctx)
+{
+ if (dctx==NULL) return 0; /* support sizeof NULL */
+ return sizeof(*dctx)
+ + ZSTD_sizeof_DDict(dctx->ddictLocal)
+ + dctx->inBuffSize + dctx->outBuffSize;
+}
+
+size_t ZSTD_estimateDCtxSize(void) { return sizeof(ZSTD_DCtx); }
+
+
+static size_t ZSTD_startingInputLength(ZSTD_format_e format)
+{
+ size_t const startingInputLength = ZSTD_FRAMEHEADERSIZE_PREFIX(format);
+ /* only supports formats ZSTD_f_zstd1 and ZSTD_f_zstd1_magicless */
+ assert( (format == ZSTD_f_zstd1) || (format == ZSTD_f_zstd1_magicless) );
+ return startingInputLength;
+}
+
+static void ZSTD_initDCtx_internal(ZSTD_DCtx* dctx)
+{
+ dctx->format = ZSTD_f_zstd1; /* ZSTD_decompressBegin() invokes ZSTD_startingInputLength() with argument dctx->format */
+ dctx->staticSize = 0;
+ dctx->maxWindowSize = ZSTD_MAXWINDOWSIZE_DEFAULT;
+ dctx->ddict = NULL;
+ dctx->ddictLocal = NULL;
+ dctx->dictEnd = NULL;
+ dctx->ddictIsCold = 0;
+ dctx->dictUses = ZSTD_dont_use;
+ dctx->inBuff = NULL;
+ dctx->inBuffSize = 0;
+ dctx->outBuffSize = 0;
+ dctx->streamStage = zdss_init;
+ dctx->legacyContext = NULL;
+ dctx->previousLegacyVersion = 0;
+ dctx->noForwardProgress = 0;
+ dctx->oversizedDuration = 0;
+ dctx->bmi2 = ZSTD_cpuid_bmi2(ZSTD_cpuid());
+ dctx->outBufferMode = ZSTD_obm_buffered;
+#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
+ dctx->dictContentEndForFuzzing = NULL;
+#endif
+}
+
+ZSTD_DCtx* ZSTD_initStaticDCtx(void *workspace, size_t workspaceSize)
+{
+ ZSTD_DCtx* const dctx = (ZSTD_DCtx*) workspace;
+
+ if ((size_t)workspace & 7) return NULL; /* 8-aligned */
+ if (workspaceSize < sizeof(ZSTD_DCtx)) return NULL; /* minimum size */
+
+ ZSTD_initDCtx_internal(dctx);
+ dctx->staticSize = workspaceSize;
+ dctx->inBuff = (char*)(dctx+1);
+ return dctx;
+}
+
+ZSTD_DCtx* ZSTD_createDCtx_advanced(ZSTD_customMem customMem)
+{
+ if (!customMem.customAlloc ^ !customMem.customFree) return NULL;
+
+ { ZSTD_DCtx* const dctx = (ZSTD_DCtx*)ZSTD_malloc(sizeof(*dctx), customMem);
+ if (!dctx) return NULL;
+ dctx->customMem = customMem;
+ ZSTD_initDCtx_internal(dctx);
+ return dctx;
+ }
+}
+
+ZSTD_DCtx* ZSTD_createDCtx(void)
+{
+ DEBUGLOG(3, "ZSTD_createDCtx");
+ return ZSTD_createDCtx_advanced(ZSTD_defaultCMem);
+}
+
+static void ZSTD_clearDict(ZSTD_DCtx* dctx)
+{
+ ZSTD_freeDDict(dctx->ddictLocal);
+ dctx->ddictLocal = NULL;
+ dctx->ddict = NULL;
+ dctx->dictUses = ZSTD_dont_use;
+}
+
+size_t ZSTD_freeDCtx(ZSTD_DCtx* dctx)
+{
+ if (dctx==NULL) return 0; /* support free on NULL */
+ RETURN_ERROR_IF(dctx->staticSize, memory_allocation, "not compatible with static DCtx");
+ { ZSTD_customMem const cMem = dctx->customMem;
+ ZSTD_clearDict(dctx);
+ ZSTD_free(dctx->inBuff, cMem);
+ dctx->inBuff = NULL;
+#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT >= 1)
+ if (dctx->legacyContext)
+ ZSTD_freeLegacyStreamContext(dctx->legacyContext, dctx->previousLegacyVersion);
+#endif
+ ZSTD_free(dctx, cMem);
+ return 0;
+ }
+}
+
+/* no longer useful */
+void ZSTD_copyDCtx(ZSTD_DCtx* dstDCtx, const ZSTD_DCtx* srcDCtx)
+{
+ size_t const toCopy = (size_t)((char*)(&dstDCtx->inBuff) - (char*)dstDCtx);
+ memcpy(dstDCtx, srcDCtx, toCopy); /* no need to copy workspace */
+}
+
+
+/*-*************************************************************
+ * Frame header decoding
+ ***************************************************************/
+
+/*! ZSTD_isFrame() :
+ * Tells if the content of `buffer` starts with a valid Frame Identifier.
+ * Note : Frame Identifier is 4 bytes. If `size < 4`, @return will always be 0.
+ * Note 2 : Legacy Frame Identifiers are considered valid only if Legacy Support is enabled.
+ * Note 3 : Skippable Frame Identifiers are considered valid. */
+unsigned ZSTD_isFrame(const void* buffer, size_t size)
+{
+ if (size < ZSTD_FRAMEIDSIZE) return 0;
+ { U32 const magic = MEM_readLE32(buffer);
+ if (magic == ZSTD_MAGICNUMBER) return 1;
+ if ((magic & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) return 1;
+ }
+#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT >= 1)
+ if (ZSTD_isLegacy(buffer, size)) return 1;
+#endif
+ return 0;
+}
+
+/** ZSTD_frameHeaderSize_internal() :
+ * srcSize must be large enough to reach header size fields.
+ * note : only works for formats ZSTD_f_zstd1 and ZSTD_f_zstd1_magicless.
+ * @return : size of the Frame Header
+ * or an error code, which can be tested with ZSTD_isError() */
+static size_t ZSTD_frameHeaderSize_internal(const void* src, size_t srcSize, ZSTD_format_e format)
+{
+ size_t const minInputSize = ZSTD_startingInputLength(format);
+ RETURN_ERROR_IF(srcSize < minInputSize, srcSize_wrong, "");
+
+ { BYTE const fhd = ((const BYTE*)src)[minInputSize-1];
+ U32 const dictID= fhd & 3;
+ U32 const singleSegment = (fhd >> 5) & 1;
+ U32 const fcsId = fhd >> 6;
+ return minInputSize + !singleSegment
+ + ZSTD_did_fieldSize[dictID] + ZSTD_fcs_fieldSize[fcsId]
+ + (singleSegment && !fcsId);
+ }
+}
+
+/** ZSTD_frameHeaderSize() :
+ * srcSize must be >= ZSTD_frameHeaderSize_prefix.
+ * @return : size of the Frame Header,
+ * or an error code (if srcSize is too small) */
+size_t ZSTD_frameHeaderSize(const void* src, size_t srcSize)
+{
+ return ZSTD_frameHeaderSize_internal(src, srcSize, ZSTD_f_zstd1);
+}
+
+
+/** ZSTD_getFrameHeader_advanced() :
+ * decode Frame Header, or require larger `srcSize`.
+ * note : only works for formats ZSTD_f_zstd1 and ZSTD_f_zstd1_magicless
+ * @return : 0, `zfhPtr` is correctly filled,
+ * >0, `srcSize` is too small, value is wanted `srcSize` amount,
+ * or an error code, which can be tested using ZSTD_isError() */
+size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize, ZSTD_format_e format)
+{
+ const BYTE* ip = (const BYTE*)src;
+ size_t const minInputSize = ZSTD_startingInputLength(format);
+
+    memset(zfhPtr, 0, sizeof(*zfhPtr));   /* not strictly necessary, but static analyzers do not understand that zfhPtr will only be read if the return value is zero, since these are 2 different signals */
+ if (srcSize < minInputSize) return minInputSize;
+ RETURN_ERROR_IF(src==NULL, GENERIC, "invalid parameter");
+
+ if ( (format != ZSTD_f_zstd1_magicless)
+ && (MEM_readLE32(src) != ZSTD_MAGICNUMBER) ) {
+ if ((MEM_readLE32(src) & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) {
+ /* skippable frame */
+ if (srcSize < ZSTD_SKIPPABLEHEADERSIZE)
+ return ZSTD_SKIPPABLEHEADERSIZE; /* magic number + frame length */
+ memset(zfhPtr, 0, sizeof(*zfhPtr));
+ zfhPtr->frameContentSize = MEM_readLE32((const char *)src + ZSTD_FRAMEIDSIZE);
+ zfhPtr->frameType = ZSTD_skippableFrame;
+ return 0;
+ }
+ RETURN_ERROR(prefix_unknown, "");
+ }
+
+ /* ensure there is enough `srcSize` to fully read/decode frame header */
+ { size_t const fhsize = ZSTD_frameHeaderSize_internal(src, srcSize, format);
+ if (srcSize < fhsize) return fhsize;
+ zfhPtr->headerSize = (U32)fhsize;
+ }
+
+ { BYTE const fhdByte = ip[minInputSize-1];
+ size_t pos = minInputSize;
+ U32 const dictIDSizeCode = fhdByte&3;
+ U32 const checksumFlag = (fhdByte>>2)&1;
+ U32 const singleSegment = (fhdByte>>5)&1;
+ U32 const fcsID = fhdByte>>6;
+ U64 windowSize = 0;
+ U32 dictID = 0;
+ U64 frameContentSize = ZSTD_CONTENTSIZE_UNKNOWN;
+ RETURN_ERROR_IF((fhdByte & 0x08) != 0, frameParameter_unsupported,
+ "reserved bits, must be zero");
+
+ if (!singleSegment) {
+ BYTE const wlByte = ip[pos++];
+ U32 const windowLog = (wlByte >> 3) + ZSTD_WINDOWLOG_ABSOLUTEMIN;
+ RETURN_ERROR_IF(windowLog > ZSTD_WINDOWLOG_MAX, frameParameter_windowTooLarge, "");
+ windowSize = (1ULL << windowLog);
+ windowSize += (windowSize >> 3) * (wlByte&7);
+ }
+ switch(dictIDSizeCode)
+ {
+ default: assert(0); /* impossible */
+ case 0 : break;
+ case 1 : dictID = ip[pos]; pos++; break;
+ case 2 : dictID = MEM_readLE16(ip+pos); pos+=2; break;
+ case 3 : dictID = MEM_readLE32(ip+pos); pos+=4; break;
+ }
+ switch(fcsID)
+ {
+ default: assert(0); /* impossible */
+ case 0 : if (singleSegment) frameContentSize = ip[pos]; break;
+ case 1 : frameContentSize = MEM_readLE16(ip+pos)+256; break;
+ case 2 : frameContentSize = MEM_readLE32(ip+pos); break;
+ case 3 : frameContentSize = MEM_readLE64(ip+pos); break;
+ }
+ if (singleSegment) windowSize = frameContentSize;
+
+ zfhPtr->frameType = ZSTD_frame;
+ zfhPtr->frameContentSize = frameContentSize;
+ zfhPtr->windowSize = windowSize;
+ zfhPtr->blockSizeMax = (unsigned) MIN(windowSize, ZSTD_BLOCKSIZE_MAX);
+ zfhPtr->dictID = dictID;
+ zfhPtr->checksumFlag = checksumFlag;
+ }
+ return 0;
+}
+
+/** ZSTD_getFrameHeader() :
+ * decode Frame Header, or require larger `srcSize`.
+ * note : this function does not consume input, it only reads it.
+ * @return : 0, `zfhPtr` is correctly filled,
+ * >0, `srcSize` is too small, value is wanted `srcSize` amount,
+ * or an error code, which can be tested using ZSTD_isError() */
+size_t ZSTD_getFrameHeader(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize)
+{
+ return ZSTD_getFrameHeader_advanced(zfhPtr, src, srcSize, ZSTD_f_zstd1);
+}
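+
+/*
+ * Illustrative sketch (editor's addition, not upstream code): the intended calling
+ * pattern for ZSTD_getFrameHeader(), separating "need more input" from real errors.
+ * Assumes the advanced (ZSTD_STATIC_LINKING_ONLY) userland declarations.
+ */
+#if 0   /* example only, not compiled */
+#define ZSTD_STATIC_LINKING_ONLY
+#include <zstd.h>
+
+/* Returns 0 on success, the number of additional input bytes needed, or (size_t)-1 on error. */
+static size_t probe_frame_header(ZSTD_frameHeader* zfh, const void* src, size_t srcSize)
+{
+    size_t const ret = ZSTD_getFrameHeader(zfh, src, srcSize);
+    if (ZSTD_isError(ret)) return (size_t)-1;  /* not a zstd frame, or corrupted header */
+    if (ret > 0) return ret - srcSize;         /* header incomplete: ret is the wanted total srcSize */
+    return 0;                                  /* zfh now holds windowSize, frameContentSize, dictID, ... */
+}
+#endif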
+
+
+/** ZSTD_getFrameContentSize() :
+ * compatible with legacy mode
+ *  @return : decompressed size of the single frame pointed to by `src` if known, otherwise
+ * - ZSTD_CONTENTSIZE_UNKNOWN if the size cannot be determined
+ * - ZSTD_CONTENTSIZE_ERROR if an error occurred (e.g. invalid magic number, srcSize too small) */
+unsigned long long ZSTD_getFrameContentSize(const void *src, size_t srcSize)
+{
+#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT >= 1)
+ if (ZSTD_isLegacy(src, srcSize)) {
+ unsigned long long const ret = ZSTD_getDecompressedSize_legacy(src, srcSize);
+ return ret == 0 ? ZSTD_CONTENTSIZE_UNKNOWN : ret;
+ }
+#endif
+ { ZSTD_frameHeader zfh;
+ if (ZSTD_getFrameHeader(&zfh, src, srcSize) != 0)
+ return ZSTD_CONTENTSIZE_ERROR;
+ if (zfh.frameType == ZSTD_skippableFrame) {
+ return 0;
+ } else {
+ return zfh.frameContentSize;
+ } }
+}
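+
+/*
+ * Illustrative sketch (editor's addition, not upstream code): using
+ * ZSTD_getFrameContentSize() to size the destination buffer for a one-shot
+ * decompression. Assumes the public userland <zstd.h> API and a single-frame input.
+ */
+#if 0   /* example only, not compiled */
+#include <stdlib.h>
+#include <zstd.h>
+
+static void* decompress_whole_frame(const void* src, size_t srcSize, size_t* dstSizeOut)
+{
+    unsigned long long const contentSize = ZSTD_getFrameContentSize(src, srcSize);
+    void* dst;
+    if (contentSize == ZSTD_CONTENTSIZE_ERROR) return NULL;        /* not a valid frame */
+    if (contentSize == ZSTD_CONTENTSIZE_UNKNOWN) return NULL;      /* size not stored: use streaming instead */
+    if (contentSize > (unsigned long long)(size_t)-1) return NULL; /* would not fit in a size_t */
+    dst = malloc(contentSize ? (size_t)contentSize : 1);
+    if (dst == NULL) return NULL;
+    { size_t const dSize = ZSTD_decompress(dst, (size_t)contentSize, src, srcSize);
+      if (ZSTD_isError(dSize)) { free(dst); return NULL; }
+      *dstSizeOut = dSize;
+    }
+    return dst;
+}
+#endif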
+
+static size_t readSkippableFrameSize(void const* src, size_t srcSize)
+{
+ size_t const skippableHeaderSize = ZSTD_SKIPPABLEHEADERSIZE;
+ U32 sizeU32;
+
+ RETURN_ERROR_IF(srcSize < ZSTD_SKIPPABLEHEADERSIZE, srcSize_wrong, "");
+
+ sizeU32 = MEM_readLE32((BYTE const*)src + ZSTD_FRAMEIDSIZE);
+ RETURN_ERROR_IF((U32)(sizeU32 + ZSTD_SKIPPABLEHEADERSIZE) < sizeU32,
+ frameParameter_unsupported, "");
+ {
+ size_t const skippableSize = skippableHeaderSize + sizeU32;
+ RETURN_ERROR_IF(skippableSize > srcSize, srcSize_wrong, "");
+ return skippableSize;
+ }
+}
+
+/** ZSTD_findDecompressedSize() :
+ * compatible with legacy mode
+ * `srcSize` must be the exact length of some number of ZSTD compressed and/or
+ * skippable frames
+ * @return : decompressed size of the frames contained */
+unsigned long long ZSTD_findDecompressedSize(const void* src, size_t srcSize)
+{
+ unsigned long long totalDstSize = 0;
+
+ while (srcSize >= ZSTD_startingInputLength(ZSTD_f_zstd1)) {
+ U32 const magicNumber = MEM_readLE32(src);
+
+ if ((magicNumber & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) {
+ size_t const skippableSize = readSkippableFrameSize(src, srcSize);
+ if (ZSTD_isError(skippableSize)) {
+ return ZSTD_CONTENTSIZE_ERROR;
+ }
+ assert(skippableSize <= srcSize);
+
+ src = (const BYTE *)src + skippableSize;
+ srcSize -= skippableSize;
+ continue;
+ }
+
+ { unsigned long long const ret = ZSTD_getFrameContentSize(src, srcSize);
+ if (ret >= ZSTD_CONTENTSIZE_ERROR) return ret;
+
+ /* check for overflow */
+ if (totalDstSize + ret < totalDstSize) return ZSTD_CONTENTSIZE_ERROR;
+ totalDstSize += ret;
+ }
+ { size_t const frameSrcSize = ZSTD_findFrameCompressedSize(src, srcSize);
+ if (ZSTD_isError(frameSrcSize)) {
+ return ZSTD_CONTENTSIZE_ERROR;
+ }
+
+ src = (const BYTE *)src + frameSrcSize;
+ srcSize -= frameSrcSize;
+ }
+ } /* while (srcSize >= ZSTD_frameHeaderSize_prefix) */
+
+ if (srcSize) return ZSTD_CONTENTSIZE_ERROR;
+
+ return totalDstSize;
+}
+
+/** ZSTD_getDecompressedSize() :
+ * compatible with legacy mode
+ * @return : decompressed size if known, 0 otherwise
+ note : 0 can mean any of the following :
+ - frame content is empty
+ - decompressed size field is not present in frame header
+ - frame header unknown / not supported
+ - frame header not complete (`srcSize` too small) */
+unsigned long long ZSTD_getDecompressedSize(const void* src, size_t srcSize)
+{
+ unsigned long long const ret = ZSTD_getFrameContentSize(src, srcSize);
+ ZSTD_STATIC_ASSERT(ZSTD_CONTENTSIZE_ERROR < ZSTD_CONTENTSIZE_UNKNOWN);
+ return (ret >= ZSTD_CONTENTSIZE_ERROR) ? 0 : ret;
+}
+
+
+/** ZSTD_decodeFrameHeader() :
+ * `headerSize` must be the size provided by ZSTD_frameHeaderSize().
+ * @return : 0 if success, or an error code, which can be tested using ZSTD_isError() */
+static size_t ZSTD_decodeFrameHeader(ZSTD_DCtx* dctx, const void* src, size_t headerSize)
+{
+ size_t const result = ZSTD_getFrameHeader_advanced(&(dctx->fParams), src, headerSize, dctx->format);
+ if (ZSTD_isError(result)) return result; /* invalid header */
+ RETURN_ERROR_IF(result>0, srcSize_wrong, "headerSize too small");
+#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
+ /* Skip the dictID check in fuzzing mode, because it makes the search
+ * harder.
+ */
+ RETURN_ERROR_IF(dctx->fParams.dictID && (dctx->dictID != dctx->fParams.dictID),
+ dictionary_wrong, "");
+#endif
+ if (dctx->fParams.checksumFlag) XXH64_reset(&dctx->xxhState, 0);
+ return 0;
+}
+
+static ZSTD_frameSizeInfo ZSTD_errorFrameSizeInfo(size_t ret)
+{
+ ZSTD_frameSizeInfo frameSizeInfo;
+ frameSizeInfo.compressedSize = ret;
+ frameSizeInfo.decompressedBound = ZSTD_CONTENTSIZE_ERROR;
+ return frameSizeInfo;
+}
+
+static ZSTD_frameSizeInfo ZSTD_findFrameSizeInfo(const void* src, size_t srcSize)
+{
+ ZSTD_frameSizeInfo frameSizeInfo;
+ memset(&frameSizeInfo, 0, sizeof(ZSTD_frameSizeInfo));
+
+#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT >= 1)
+ if (ZSTD_isLegacy(src, srcSize))
+ return ZSTD_findFrameSizeInfoLegacy(src, srcSize);
+#endif
+
+ if ((srcSize >= ZSTD_SKIPPABLEHEADERSIZE)
+ && (MEM_readLE32(src) & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) {
+ frameSizeInfo.compressedSize = readSkippableFrameSize(src, srcSize);
+ assert(ZSTD_isError(frameSizeInfo.compressedSize) ||
+ frameSizeInfo.compressedSize <= srcSize);
+ return frameSizeInfo;
+ } else {
+ const BYTE* ip = (const BYTE*)src;
+ const BYTE* const ipstart = ip;
+ size_t remainingSize = srcSize;
+ size_t nbBlocks = 0;
+ ZSTD_frameHeader zfh;
+
+ /* Extract Frame Header */
+ { size_t const ret = ZSTD_getFrameHeader(&zfh, src, srcSize);
+ if (ZSTD_isError(ret))
+ return ZSTD_errorFrameSizeInfo(ret);
+ if (ret > 0)
+ return ZSTD_errorFrameSizeInfo(ERROR(srcSize_wrong));
+ }
+
+ ip += zfh.headerSize;
+ remainingSize -= zfh.headerSize;
+
+ /* Iterate over each block */
+ while (1) {
+ blockProperties_t blockProperties;
+ size_t const cBlockSize = ZSTD_getcBlockSize(ip, remainingSize, &blockProperties);
+ if (ZSTD_isError(cBlockSize))
+ return ZSTD_errorFrameSizeInfo(cBlockSize);
+
+ if (ZSTD_blockHeaderSize + cBlockSize > remainingSize)
+ return ZSTD_errorFrameSizeInfo(ERROR(srcSize_wrong));
+
+ ip += ZSTD_blockHeaderSize + cBlockSize;
+ remainingSize -= ZSTD_blockHeaderSize + cBlockSize;
+ nbBlocks++;
+
+ if (blockProperties.lastBlock) break;
+ }
+
+ /* Final frame content checksum */
+ if (zfh.checksumFlag) {
+ if (remainingSize < 4)
+ return ZSTD_errorFrameSizeInfo(ERROR(srcSize_wrong));
+ ip += 4;
+ }
+
+ frameSizeInfo.compressedSize = ip - ipstart;
+ frameSizeInfo.decompressedBound = (zfh.frameContentSize != ZSTD_CONTENTSIZE_UNKNOWN)
+ ? zfh.frameContentSize
+ : nbBlocks * zfh.blockSizeMax;
+ return frameSizeInfo;
+ }
+}
+
+/** ZSTD_findFrameCompressedSize() :
+ * compatible with legacy mode
+ * `src` must point to the start of a ZSTD frame, ZSTD legacy frame, or skippable frame
+ * `srcSize` must be at least as large as the frame contained
+ * @return : the compressed size of the frame starting at `src` */
+size_t ZSTD_findFrameCompressedSize(const void *src, size_t srcSize)
+{
+ ZSTD_frameSizeInfo const frameSizeInfo = ZSTD_findFrameSizeInfo(src, srcSize);
+ return frameSizeInfo.compressedSize;
+}
+
+/** ZSTD_decompressBound() :
+ * compatible with legacy mode
+ *  `src` must point to the start of a ZSTD frame or a skippable frame
+ * `srcSize` must be at least as large as the frame contained
+ * @return : the maximum decompressed size of the compressed source
+ */
+unsigned long long ZSTD_decompressBound(const void* src, size_t srcSize)
+{
+ unsigned long long bound = 0;
+ /* Iterate over each frame */
+ while (srcSize > 0) {
+ ZSTD_frameSizeInfo const frameSizeInfo = ZSTD_findFrameSizeInfo(src, srcSize);
+ size_t const compressedSize = frameSizeInfo.compressedSize;
+ unsigned long long const decompressedBound = frameSizeInfo.decompressedBound;
+ if (ZSTD_isError(compressedSize) || decompressedBound == ZSTD_CONTENTSIZE_ERROR)
+ return ZSTD_CONTENTSIZE_ERROR;
+ assert(srcSize >= compressedSize);
+ src = (const BYTE*)src + compressedSize;
+ srcSize -= compressedSize;
+ bound += decompressedBound;
+ }
+ return bound;
+}
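+
+/*
+ * Illustrative sketch (editor's addition, not upstream code): ZSTD_decompressBound()
+ * covers inputs that concatenate several frames and frames without a stored content
+ * size, at the cost of possibly over-allocating (the bound is >= the exact size).
+ * Assumes the advanced (ZSTD_STATIC_LINKING_ONLY) userland declarations.
+ */
+#if 0   /* example only, not compiled */
+#define ZSTD_STATIC_LINKING_ONLY
+#include <stdlib.h>
+#include <zstd.h>
+
+static size_t decompress_concatenated(void** dstOut, const void* src, size_t srcSize)
+{
+    unsigned long long const bound = ZSTD_decompressBound(src, srcSize);
+    void* dst;
+    if (bound == ZSTD_CONTENTSIZE_ERROR) return (size_t)-1;
+    if (bound > (unsigned long long)(size_t)-1) return (size_t)-1;  /* bound must fit in a size_t */
+    dst = malloc(bound ? (size_t)bound : 1);
+    if (dst == NULL) return (size_t)-1;
+    { size_t const dSize = ZSTD_decompress(dst, (size_t)bound, src, srcSize); /* handles back-to-back frames */
+      if (ZSTD_isError(dSize)) { free(dst); return dSize; }
+      *dstOut = dst;
+      return dSize;
+    }
+}
+#endif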
+
+
+/*-*************************************************************
+ * Frame decoding
+ ***************************************************************/
+
+/** ZSTD_insertBlock() :
+ * insert `src` block into `dctx` history. Useful to track uncompressed blocks. */
+size_t ZSTD_insertBlock(ZSTD_DCtx* dctx, const void* blockStart, size_t blockSize)
+{
+ DEBUGLOG(5, "ZSTD_insertBlock: %u bytes", (unsigned)blockSize);
+ ZSTD_checkContinuity(dctx, blockStart);
+ dctx->previousDstEnd = (const char*)blockStart + blockSize;
+ return blockSize;
+}
+
+
+static size_t ZSTD_copyRawBlock(void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize)
+{
+ DEBUGLOG(5, "ZSTD_copyRawBlock");
+ if (dst == NULL) {
+ if (srcSize == 0) return 0;
+ RETURN_ERROR(dstBuffer_null, "");
+ }
+ RETURN_ERROR_IF(srcSize > dstCapacity, dstSize_tooSmall, "");
+ memcpy(dst, src, srcSize);
+ return srcSize;
+}
+
+static size_t ZSTD_setRleBlock(void* dst, size_t dstCapacity,
+ BYTE b,
+ size_t regenSize)
+{
+ if (dst == NULL) {
+ if (regenSize == 0) return 0;
+ RETURN_ERROR(dstBuffer_null, "");
+ }
+ RETURN_ERROR_IF(regenSize > dstCapacity, dstSize_tooSmall, "");
+ memset(dst, b, regenSize);
+ return regenSize;
+}
+
+
+/*! ZSTD_decompressFrame() :
+ * @dctx must be properly initialized
+ * will update *srcPtr and *srcSizePtr,
+ * to make *srcPtr progress by one frame. */
+static size_t ZSTD_decompressFrame(ZSTD_DCtx* dctx,
+ void* dst, size_t dstCapacity,
+ const void** srcPtr, size_t *srcSizePtr)
+{
+ const BYTE* ip = (const BYTE*)(*srcPtr);
+ BYTE* const ostart = (BYTE* const)dst;
+ BYTE* const oend = dstCapacity != 0 ? ostart + dstCapacity : ostart;
+ BYTE* op = ostart;
+ size_t remainingSrcSize = *srcSizePtr;
+
+ DEBUGLOG(4, "ZSTD_decompressFrame (srcSize:%i)", (int)*srcSizePtr);
+
+ /* check */
+ RETURN_ERROR_IF(
+ remainingSrcSize < ZSTD_FRAMEHEADERSIZE_MIN(dctx->format)+ZSTD_blockHeaderSize,
+ srcSize_wrong, "");
+
+ /* Frame Header */
+ { size_t const frameHeaderSize = ZSTD_frameHeaderSize_internal(
+ ip, ZSTD_FRAMEHEADERSIZE_PREFIX(dctx->format), dctx->format);
+ if (ZSTD_isError(frameHeaderSize)) return frameHeaderSize;
+ RETURN_ERROR_IF(remainingSrcSize < frameHeaderSize+ZSTD_blockHeaderSize,
+ srcSize_wrong, "");
+ FORWARD_IF_ERROR( ZSTD_decodeFrameHeader(dctx, ip, frameHeaderSize) , "");
+ ip += frameHeaderSize; remainingSrcSize -= frameHeaderSize;
+ }
+
+ /* Loop on each block */
+ while (1) {
+ size_t decodedSize;
+ blockProperties_t blockProperties;
+ size_t const cBlockSize = ZSTD_getcBlockSize(ip, remainingSrcSize, &blockProperties);
+ if (ZSTD_isError(cBlockSize)) return cBlockSize;
+
+ ip += ZSTD_blockHeaderSize;
+ remainingSrcSize -= ZSTD_blockHeaderSize;
+ RETURN_ERROR_IF(cBlockSize > remainingSrcSize, srcSize_wrong, "");
+
+ switch(blockProperties.blockType)
+ {
+ case bt_compressed:
+ decodedSize = ZSTD_decompressBlock_internal(dctx, op, oend-op, ip, cBlockSize, /* frame */ 1);
+ break;
+ case bt_raw :
+ decodedSize = ZSTD_copyRawBlock(op, oend-op, ip, cBlockSize);
+ break;
+ case bt_rle :
+ decodedSize = ZSTD_setRleBlock(op, oend-op, *ip, blockProperties.origSize);
+ break;
+ case bt_reserved :
+ default:
+ RETURN_ERROR(corruption_detected, "invalid block type");
+ }
+
+ if (ZSTD_isError(decodedSize)) return decodedSize;
+ if (dctx->fParams.checksumFlag)
+ XXH64_update(&dctx->xxhState, op, decodedSize);
+ if (decodedSize != 0)
+ op += decodedSize;
+ assert(ip != NULL);
+ ip += cBlockSize;
+ remainingSrcSize -= cBlockSize;
+ if (blockProperties.lastBlock) break;
+ }
+
+ if (dctx->fParams.frameContentSize != ZSTD_CONTENTSIZE_UNKNOWN) {
+ RETURN_ERROR_IF((U64)(op-ostart) != dctx->fParams.frameContentSize,
+ corruption_detected, "");
+ }
+ if (dctx->fParams.checksumFlag) { /* Frame content checksum verification */
+ U32 const checkCalc = (U32)XXH64_digest(&dctx->xxhState);
+ U32 checkRead;
+ RETURN_ERROR_IF(remainingSrcSize<4, checksum_wrong, "");
+ checkRead = MEM_readLE32(ip);
+ RETURN_ERROR_IF(checkRead != checkCalc, checksum_wrong, "");
+ ip += 4;
+ remainingSrcSize -= 4;
+ }
+
+ /* Allow caller to get size read */
+ *srcPtr = ip;
+ *srcSizePtr = remainingSrcSize;
+ return op-ostart;
+}
+
+static size_t ZSTD_decompressMultiFrame(ZSTD_DCtx* dctx,
+ void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize,
+ const void* dict, size_t dictSize,
+ const ZSTD_DDict* ddict)
+{
+ void* const dststart = dst;
+ int moreThan1Frame = 0;
+
+ DEBUGLOG(5, "ZSTD_decompressMultiFrame");
+ assert(dict==NULL || ddict==NULL); /* either dict or ddict set, not both */
+
+ if (ddict) {
+ dict = ZSTD_DDict_dictContent(ddict);
+ dictSize = ZSTD_DDict_dictSize(ddict);
+ }
+
+ while (srcSize >= ZSTD_startingInputLength(dctx->format)) {
+
+#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT >= 1)
+ if (ZSTD_isLegacy(src, srcSize)) {
+ size_t decodedSize;
+ size_t const frameSize = ZSTD_findFrameCompressedSizeLegacy(src, srcSize);
+ if (ZSTD_isError(frameSize)) return frameSize;
+ RETURN_ERROR_IF(dctx->staticSize, memory_allocation,
+ "legacy support is not compatible with static dctx");
+
+ decodedSize = ZSTD_decompressLegacy(dst, dstCapacity, src, frameSize, dict, dictSize);
+ if (ZSTD_isError(decodedSize)) return decodedSize;
+
+            assert(decodedSize <= dstCapacity);
+ dst = (BYTE*)dst + decodedSize;
+ dstCapacity -= decodedSize;
+
+ src = (const BYTE*)src + frameSize;
+ srcSize -= frameSize;
+
+ continue;
+ }
+#endif
+
+ { U32 const magicNumber = MEM_readLE32(src);
+ DEBUGLOG(4, "reading magic number %08X (expecting %08X)",
+ (unsigned)magicNumber, ZSTD_MAGICNUMBER);
+ if ((magicNumber & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) {
+ size_t const skippableSize = readSkippableFrameSize(src, srcSize);
+ FORWARD_IF_ERROR(skippableSize, "readSkippableFrameSize failed");
+ assert(skippableSize <= srcSize);
+
+ src = (const BYTE *)src + skippableSize;
+ srcSize -= skippableSize;
+ continue;
+ } }
+
+ if (ddict) {
+ /* we were called from ZSTD_decompress_usingDDict */
+ FORWARD_IF_ERROR(ZSTD_decompressBegin_usingDDict(dctx, ddict), "");
+ } else {
+ /* this will initialize correctly with no dict if dict == NULL, so
+ * use this in all cases but ddict */
+ FORWARD_IF_ERROR(ZSTD_decompressBegin_usingDict(dctx, dict, dictSize), "");
+ }
+ ZSTD_checkContinuity(dctx, dst);
+
+ { const size_t res = ZSTD_decompressFrame(dctx, dst, dstCapacity,
+ &src, &srcSize);
+ RETURN_ERROR_IF(
+ (ZSTD_getErrorCode(res) == ZSTD_error_prefix_unknown)
+ && (moreThan1Frame==1),
+ srcSize_wrong,
+ "at least one frame successfully completed, but following "
+ "bytes are garbage: it's more likely to be a srcSize error, "
+ "specifying more bytes than compressed size of frame(s). This "
+ "error message replaces ERROR(prefix_unknown), which would be "
+ "confusing, as the first header is actually correct. Note that "
+ "one could be unlucky, it might be a corruption error instead, "
+ "happening right at the place where we expect zstd magic "
+ "bytes. But this is _much_ less likely than a srcSize field "
+ "error.");
+ if (ZSTD_isError(res)) return res;
+ assert(res <= dstCapacity);
+ if (res != 0)
+ dst = (BYTE*)dst + res;
+ dstCapacity -= res;
+ }
+ moreThan1Frame = 1;
+ } /* while (srcSize >= ZSTD_frameHeaderSize_prefix) */
+
+ RETURN_ERROR_IF(srcSize, srcSize_wrong, "input not entirely consumed");
+
+ return (BYTE*)dst - (BYTE*)dststart;
+}
+
+size_t ZSTD_decompress_usingDict(ZSTD_DCtx* dctx,
+ void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize,
+ const void* dict, size_t dictSize)
+{
+ return ZSTD_decompressMultiFrame(dctx, dst, dstCapacity, src, srcSize, dict, dictSize, NULL);
+}
+
+
+static ZSTD_DDict const* ZSTD_getDDict(ZSTD_DCtx* dctx)
+{
+ switch (dctx->dictUses) {
+ default:
+ assert(0 /* Impossible */);
+ /* fall-through */
+ case ZSTD_dont_use:
+ ZSTD_clearDict(dctx);
+ return NULL;
+ case ZSTD_use_indefinitely:
+ return dctx->ddict;
+ case ZSTD_use_once:
+ dctx->dictUses = ZSTD_dont_use;
+ return dctx->ddict;
+ }
+}
+
+size_t ZSTD_decompressDCtx(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize)
+{
+ return ZSTD_decompress_usingDDict(dctx, dst, dstCapacity, src, srcSize, ZSTD_getDDict(dctx));
+}
+
+
+size_t ZSTD_decompress(void* dst, size_t dstCapacity, const void* src, size_t srcSize)
+{
+#if defined(ZSTD_HEAPMODE) && (ZSTD_HEAPMODE>=1)
+ size_t regenSize;
+ ZSTD_DCtx* const dctx = ZSTD_createDCtx();
+ RETURN_ERROR_IF(dctx==NULL, memory_allocation, "NULL pointer!");
+ regenSize = ZSTD_decompressDCtx(dctx, dst, dstCapacity, src, srcSize);
+ ZSTD_freeDCtx(dctx);
+ return regenSize;
+#else /* stack mode */
+ ZSTD_DCtx dctx;
+ ZSTD_initDCtx_internal(&dctx);
+ return ZSTD_decompressDCtx(&dctx, dst, dstCapacity, src, srcSize);
+#endif
+}
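+
+/*
+ * Illustrative sketch (editor's addition, not upstream code): when many buffers are
+ * decompressed, reusing one ZSTD_DCtx with ZSTD_decompressDCtx() avoids the per-call
+ * context allocation that ZSTD_decompress() performs in heap mode. Public userland
+ * <zstd.h> API assumed; the helper and its parameters are made up for the example.
+ */
+#if 0   /* example only, not compiled */
+#include <zstd.h>
+
+static size_t decompress_many(ZSTD_DCtx* dctx,                 /* created once with ZSTD_createDCtx() */
+                              void* dst, size_t dstCapacity,
+                              const void* const* srcs, const size_t* srcSizes, size_t n)
+{
+    size_t i, total = 0;
+    for (i = 0; i < n; i++) {
+        size_t const dSize = ZSTD_decompressDCtx(dctx, dst, dstCapacity, srcs[i], srcSizes[i]);
+        if (ZSTD_isError(dSize)) return dSize;  /* the context can still be reused afterwards */
+        total += dSize;                         /* this example only accumulates output sizes */
+    }
+    return total;
+}
+#endif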
+
+
+/*-**************************************
+* Advanced Streaming Decompression API
+* Bufferless and synchronous
+****************************************/
+size_t ZSTD_nextSrcSizeToDecompress(ZSTD_DCtx* dctx) { return dctx->expected; }
+
+/**
+ * Similar to ZSTD_nextSrcSizeToDecompress(), but when a block input can be streamed,
+ * we allow taking a partial block as the input. Currently only raw uncompressed blocks can
+ * be streamed.
+ *
+ * For blocks that can be streamed, this allows us to reduce the latency until we produce
+ * output, and avoid copying the input.
+ *
+ * @param inputSize - The total amount of input that the caller currently has.
+ */
+static size_t ZSTD_nextSrcSizeToDecompressWithInputSize(ZSTD_DCtx* dctx, size_t inputSize) {
+ if (!(dctx->stage == ZSTDds_decompressBlock || dctx->stage == ZSTDds_decompressLastBlock))
+ return dctx->expected;
+ if (dctx->bType != bt_raw)
+ return dctx->expected;
+ return MIN(MAX(inputSize, 1), dctx->expected);
+}
+
+ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx) {
+ switch(dctx->stage)
+ {
+ default: /* should not happen */
+ assert(0);
+ case ZSTDds_getFrameHeaderSize:
+ case ZSTDds_decodeFrameHeader:
+ return ZSTDnit_frameHeader;
+ case ZSTDds_decodeBlockHeader:
+ return ZSTDnit_blockHeader;
+ case ZSTDds_decompressBlock:
+ return ZSTDnit_block;
+ case ZSTDds_decompressLastBlock:
+ return ZSTDnit_lastBlock;
+ case ZSTDds_checkChecksum:
+ return ZSTDnit_checksum;
+ case ZSTDds_decodeSkippableHeader:
+ case ZSTDds_skipFrame:
+ return ZSTDnit_skippableFrame;
+ }
+}
+
+static int ZSTD_isSkipFrame(ZSTD_DCtx* dctx) { return dctx->stage == ZSTDds_skipFrame; }
+
+/** ZSTD_decompressContinue() :
+ * srcSize : must be the exact nb of bytes expected (see ZSTD_nextSrcSizeToDecompress())
+ * @return : nb of bytes generated into `dst` (necessarily <= `dstCapacity`)
+ * or an error code, which can be tested using ZSTD_isError() */
+size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize)
+{
+ DEBUGLOG(5, "ZSTD_decompressContinue (srcSize:%u)", (unsigned)srcSize);
+ /* Sanity check */
+ RETURN_ERROR_IF(srcSize != ZSTD_nextSrcSizeToDecompressWithInputSize(dctx, srcSize), srcSize_wrong, "not allowed");
+ if (dstCapacity) ZSTD_checkContinuity(dctx, dst);
+
+ switch (dctx->stage)
+ {
+ case ZSTDds_getFrameHeaderSize :
+ assert(src != NULL);
+ if (dctx->format == ZSTD_f_zstd1) { /* allows header */
+ assert(srcSize >= ZSTD_FRAMEIDSIZE); /* to read skippable magic number */
+ if ((MEM_readLE32(src) & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { /* skippable frame */
+ memcpy(dctx->headerBuffer, src, srcSize);
+ dctx->expected = ZSTD_SKIPPABLEHEADERSIZE - srcSize; /* remaining to load to get full skippable frame header */
+ dctx->stage = ZSTDds_decodeSkippableHeader;
+ return 0;
+ } }
+ dctx->headerSize = ZSTD_frameHeaderSize_internal(src, srcSize, dctx->format);
+ if (ZSTD_isError(dctx->headerSize)) return dctx->headerSize;
+ memcpy(dctx->headerBuffer, src, srcSize);
+ dctx->expected = dctx->headerSize - srcSize;
+ dctx->stage = ZSTDds_decodeFrameHeader;
+ return 0;
+
+ case ZSTDds_decodeFrameHeader:
+ assert(src != NULL);
+ memcpy(dctx->headerBuffer + (dctx->headerSize - srcSize), src, srcSize);
+ FORWARD_IF_ERROR(ZSTD_decodeFrameHeader(dctx, dctx->headerBuffer, dctx->headerSize), "");
+ dctx->expected = ZSTD_blockHeaderSize;
+ dctx->stage = ZSTDds_decodeBlockHeader;
+ return 0;
+
+ case ZSTDds_decodeBlockHeader:
+ { blockProperties_t bp;
+ size_t const cBlockSize = ZSTD_getcBlockSize(src, ZSTD_blockHeaderSize, &bp);
+ if (ZSTD_isError(cBlockSize)) return cBlockSize;
+ RETURN_ERROR_IF(cBlockSize > dctx->fParams.blockSizeMax, corruption_detected, "Block Size Exceeds Maximum");
+ dctx->expected = cBlockSize;
+ dctx->bType = bp.blockType;
+ dctx->rleSize = bp.origSize;
+ if (cBlockSize) {
+ dctx->stage = bp.lastBlock ? ZSTDds_decompressLastBlock : ZSTDds_decompressBlock;
+ return 0;
+ }
+ /* empty block */
+ if (bp.lastBlock) {
+ if (dctx->fParams.checksumFlag) {
+ dctx->expected = 4;
+ dctx->stage = ZSTDds_checkChecksum;
+ } else {
+ dctx->expected = 0; /* end of frame */
+ dctx->stage = ZSTDds_getFrameHeaderSize;
+ }
+ } else {
+ dctx->expected = ZSTD_blockHeaderSize; /* jump to next header */
+ dctx->stage = ZSTDds_decodeBlockHeader;
+ }
+ return 0;
+ }
+
+ case ZSTDds_decompressLastBlock:
+ case ZSTDds_decompressBlock:
+ DEBUGLOG(5, "ZSTD_decompressContinue: case ZSTDds_decompressBlock");
+ { size_t rSize;
+ switch(dctx->bType)
+ {
+ case bt_compressed:
+ DEBUGLOG(5, "ZSTD_decompressContinue: case bt_compressed");
+ rSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, /* frame */ 1);
+ dctx->expected = 0; /* Streaming not supported */
+ break;
+ case bt_raw :
+ assert(srcSize <= dctx->expected);
+ rSize = ZSTD_copyRawBlock(dst, dstCapacity, src, srcSize);
+ FORWARD_IF_ERROR(rSize, "ZSTD_copyRawBlock failed");
+ assert(rSize == srcSize);
+ dctx->expected -= rSize;
+ break;
+ case bt_rle :
+ rSize = ZSTD_setRleBlock(dst, dstCapacity, *(const BYTE*)src, dctx->rleSize);
+ dctx->expected = 0; /* Streaming not supported */
+ break;
+ case bt_reserved : /* should never happen */
+ default:
+ RETURN_ERROR(corruption_detected, "invalid block type");
+ }
+ FORWARD_IF_ERROR(rSize, "");
+ RETURN_ERROR_IF(rSize > dctx->fParams.blockSizeMax, corruption_detected, "Decompressed Block Size Exceeds Maximum");
+ DEBUGLOG(5, "ZSTD_decompressContinue: decoded size from block : %u", (unsigned)rSize);
+ dctx->decodedSize += rSize;
+ if (dctx->fParams.checksumFlag) XXH64_update(&dctx->xxhState, dst, rSize);
+ dctx->previousDstEnd = (char*)dst + rSize;
+
+ /* Stay on the same stage until we are finished streaming the block. */
+ if (dctx->expected > 0) {
+ return rSize;
+ }
+
+ if (dctx->stage == ZSTDds_decompressLastBlock) { /* end of frame */
+ DEBUGLOG(4, "ZSTD_decompressContinue: decoded size from frame : %u", (unsigned)dctx->decodedSize);
+ RETURN_ERROR_IF(
+ dctx->fParams.frameContentSize != ZSTD_CONTENTSIZE_UNKNOWN
+ && dctx->decodedSize != dctx->fParams.frameContentSize,
+ corruption_detected, "");
+ if (dctx->fParams.checksumFlag) { /* another round for frame checksum */
+ dctx->expected = 4;
+ dctx->stage = ZSTDds_checkChecksum;
+ } else {
+ dctx->expected = 0; /* ends here */
+ dctx->stage = ZSTDds_getFrameHeaderSize;
+ }
+ } else {
+ dctx->stage = ZSTDds_decodeBlockHeader;
+ dctx->expected = ZSTD_blockHeaderSize;
+ }
+ return rSize;
+ }
+
+ case ZSTDds_checkChecksum:
+ assert(srcSize == 4); /* guaranteed by dctx->expected */
+ { U32 const h32 = (U32)XXH64_digest(&dctx->xxhState);
+ U32 const check32 = MEM_readLE32(src);
+ DEBUGLOG(4, "ZSTD_decompressContinue: checksum : calculated %08X :: %08X read", (unsigned)h32, (unsigned)check32);
+ RETURN_ERROR_IF(check32 != h32, checksum_wrong, "");
+ dctx->expected = 0;
+ dctx->stage = ZSTDds_getFrameHeaderSize;
+ return 0;
+ }
+
+ case ZSTDds_decodeSkippableHeader:
+ assert(src != NULL);
+ assert(srcSize <= ZSTD_SKIPPABLEHEADERSIZE);
+ memcpy(dctx->headerBuffer + (ZSTD_SKIPPABLEHEADERSIZE - srcSize), src, srcSize); /* complete skippable header */
+ dctx->expected = MEM_readLE32(dctx->headerBuffer + ZSTD_FRAMEIDSIZE); /* note : dctx->expected can grow seriously large, beyond local buffer size */
+ dctx->stage = ZSTDds_skipFrame;
+ return 0;
+
+ case ZSTDds_skipFrame:
+ dctx->expected = 0;
+ dctx->stage = ZSTDds_getFrameHeaderSize;
+ return 0;
+
+ default:
+ assert(0); /* impossible */
+        RETURN_ERROR(GENERIC, "impossible to reach");   /* some compilers require default to do something */
+ }
+}
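+
+/*
+ * Illustrative sketch (editor's addition, not upstream code): the bufferless calling
+ * loop that the state machine above expects -- feed exactly
+ * ZSTD_nextSrcSizeToDecompress() bytes per call, writing output contiguously.
+ * Assumes the advanced (ZSTD_STATIC_LINKING_ONLY) userland declarations.
+ */
+#if 0   /* example only, not compiled */
+#define ZSTD_STATIC_LINKING_ONLY
+#include <zstd.h>
+
+static size_t bufferless_decompress(ZSTD_DCtx* dctx,
+                                    void* dst, size_t dstCapacity,
+                                    const void* src, size_t srcSize)
+{
+    const char* ip = (const char*)src;
+    char* op = (char*)dst;
+    size_t toRead;
+    { size_t const b = ZSTD_decompressBegin(dctx);
+      if (ZSTD_isError(b)) return b; }
+    while ((toRead = ZSTD_nextSrcSizeToDecompress(dctx)) != 0) {
+        size_t produced;
+        if (toRead > srcSize - (size_t)(ip - (const char*)src)) return (size_t)-1;  /* truncated input */
+        produced = ZSTD_decompressContinue(dctx, op,
+                                           dstCapacity - (size_t)(op - (char*)dst), ip, toRead);
+        if (ZSTD_isError(produced)) return produced;
+        ip += toRead;    /* each call consumes exactly toRead bytes */
+        op += produced;  /* blocks are written contiguously into dst */
+    }
+    return (size_t)(op - (char*)dst);  /* toRead == 0 : end of frame reached */
+}
+#endif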
+
+
+static size_t ZSTD_refDictContent(ZSTD_DCtx* dctx, const void* dict, size_t dictSize)
+{
+ dctx->dictEnd = dctx->previousDstEnd;
+ dctx->virtualStart = (const char*)dict - ((const char*)(dctx->previousDstEnd) - (const char*)(dctx->prefixStart));
+ dctx->prefixStart = dict;
+ dctx->previousDstEnd = (const char*)dict + dictSize;
+#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
+ dctx->dictContentBeginForFuzzing = dctx->prefixStart;
+ dctx->dictContentEndForFuzzing = dctx->previousDstEnd;
+#endif
+ return 0;
+}
+
+/*! ZSTD_loadDEntropy() :
+ * dict : must point at beginning of a valid zstd dictionary.
+ * @return : size of entropy tables read */
+size_t
+ZSTD_loadDEntropy(ZSTD_entropyDTables_t* entropy,
+ const void* const dict, size_t const dictSize)
+{
+ const BYTE* dictPtr = (const BYTE*)dict;
+ const BYTE* const dictEnd = dictPtr + dictSize;
+
+ RETURN_ERROR_IF(dictSize <= 8, dictionary_corrupted, "dict is too small");
+ assert(MEM_readLE32(dict) == ZSTD_MAGIC_DICTIONARY); /* dict must be valid */
+ dictPtr += 8; /* skip header = magic + dictID */
+
+ ZSTD_STATIC_ASSERT(offsetof(ZSTD_entropyDTables_t, OFTable) == offsetof(ZSTD_entropyDTables_t, LLTable) + sizeof(entropy->LLTable));
+ ZSTD_STATIC_ASSERT(offsetof(ZSTD_entropyDTables_t, MLTable) == offsetof(ZSTD_entropyDTables_t, OFTable) + sizeof(entropy->OFTable));
+ ZSTD_STATIC_ASSERT(sizeof(entropy->LLTable) + sizeof(entropy->OFTable) + sizeof(entropy->MLTable) >= HUF_DECOMPRESS_WORKSPACE_SIZE);
+ { void* const workspace = &entropy->LLTable; /* use fse tables as temporary workspace; implies fse tables are grouped together */
+ size_t const workspaceSize = sizeof(entropy->LLTable) + sizeof(entropy->OFTable) + sizeof(entropy->MLTable);
+#ifdef HUF_FORCE_DECOMPRESS_X1
+ /* in minimal huffman, we always use X1 variants */
+ size_t const hSize = HUF_readDTableX1_wksp(entropy->hufTable,
+ dictPtr, dictEnd - dictPtr,
+ workspace, workspaceSize);
+#else
+ size_t const hSize = HUF_readDTableX2_wksp(entropy->hufTable,
+ dictPtr, dictEnd - dictPtr,
+ workspace, workspaceSize);
+#endif
+ RETURN_ERROR_IF(HUF_isError(hSize), dictionary_corrupted, "");
+ dictPtr += hSize;
+ }
+
+ { short offcodeNCount[MaxOff+1];
+ unsigned offcodeMaxValue = MaxOff, offcodeLog;
+ size_t const offcodeHeaderSize = FSE_readNCount(offcodeNCount, &offcodeMaxValue, &offcodeLog, dictPtr, dictEnd-dictPtr);
+ RETURN_ERROR_IF(FSE_isError(offcodeHeaderSize), dictionary_corrupted, "");
+ RETURN_ERROR_IF(offcodeMaxValue > MaxOff, dictionary_corrupted, "");
+ RETURN_ERROR_IF(offcodeLog > OffFSELog, dictionary_corrupted, "");
+ ZSTD_buildFSETable( entropy->OFTable,
+ offcodeNCount, offcodeMaxValue,
+ OF_base, OF_bits,
+ offcodeLog);
+ dictPtr += offcodeHeaderSize;
+ }
+
+ { short matchlengthNCount[MaxML+1];
+ unsigned matchlengthMaxValue = MaxML, matchlengthLog;
+ size_t const matchlengthHeaderSize = FSE_readNCount(matchlengthNCount, &matchlengthMaxValue, &matchlengthLog, dictPtr, dictEnd-dictPtr);
+ RETURN_ERROR_IF(FSE_isError(matchlengthHeaderSize), dictionary_corrupted, "");
+ RETURN_ERROR_IF(matchlengthMaxValue > MaxML, dictionary_corrupted, "");
+ RETURN_ERROR_IF(matchlengthLog > MLFSELog, dictionary_corrupted, "");
+ ZSTD_buildFSETable( entropy->MLTable,
+ matchlengthNCount, matchlengthMaxValue,
+ ML_base, ML_bits,
+ matchlengthLog);
+ dictPtr += matchlengthHeaderSize;
+ }
+
+ { short litlengthNCount[MaxLL+1];
+ unsigned litlengthMaxValue = MaxLL, litlengthLog;
+ size_t const litlengthHeaderSize = FSE_readNCount(litlengthNCount, &litlengthMaxValue, &litlengthLog, dictPtr, dictEnd-dictPtr);
+ RETURN_ERROR_IF(FSE_isError(litlengthHeaderSize), dictionary_corrupted, "");
+ RETURN_ERROR_IF(litlengthMaxValue > MaxLL, dictionary_corrupted, "");
+ RETURN_ERROR_IF(litlengthLog > LLFSELog, dictionary_corrupted, "");
+ ZSTD_buildFSETable( entropy->LLTable,
+ litlengthNCount, litlengthMaxValue,
+ LL_base, LL_bits,
+ litlengthLog);
+ dictPtr += litlengthHeaderSize;
+ }
+
+ RETURN_ERROR_IF(dictPtr+12 > dictEnd, dictionary_corrupted, "");
+ { int i;
+ size_t const dictContentSize = (size_t)(dictEnd - (dictPtr+12));
+ for (i=0; i<3; i++) {
+ U32 const rep = MEM_readLE32(dictPtr); dictPtr += 4;
+ RETURN_ERROR_IF(rep==0 || rep > dictContentSize,
+ dictionary_corrupted, "");
+ entropy->rep[i] = rep;
+ } }
+
+ return dictPtr - (const BYTE*)dict;
+}
+
+static size_t ZSTD_decompress_insertDictionary(ZSTD_DCtx* dctx, const void* dict, size_t dictSize)
+{
+ if (dictSize < 8) return ZSTD_refDictContent(dctx, dict, dictSize);
+ { U32 const magic = MEM_readLE32(dict);
+ if (magic != ZSTD_MAGIC_DICTIONARY) {
+ return ZSTD_refDictContent(dctx, dict, dictSize); /* pure content mode */
+ } }
+ dctx->dictID = MEM_readLE32((const char*)dict + ZSTD_FRAMEIDSIZE);
+
+ /* load entropy tables */
+ { size_t const eSize = ZSTD_loadDEntropy(&dctx->entropy, dict, dictSize);
+ RETURN_ERROR_IF(ZSTD_isError(eSize), dictionary_corrupted, "");
+ dict = (const char*)dict + eSize;
+ dictSize -= eSize;
+ }
+ dctx->litEntropy = dctx->fseEntropy = 1;
+
+ /* reference dictionary content */
+ return ZSTD_refDictContent(dctx, dict, dictSize);
+}
+
+size_t ZSTD_decompressBegin(ZSTD_DCtx* dctx)
+{
+ assert(dctx != NULL);
+ dctx->expected = ZSTD_startingInputLength(dctx->format); /* dctx->format must be properly set */
+ dctx->stage = ZSTDds_getFrameHeaderSize;
+ dctx->decodedSize = 0;
+ dctx->previousDstEnd = NULL;
+ dctx->prefixStart = NULL;
+ dctx->virtualStart = NULL;
+ dctx->dictEnd = NULL;
+ dctx->entropy.hufTable[0] = (HUF_DTable)((HufLog)*0x1000001); /* cover both little and big endian */
+ dctx->litEntropy = dctx->fseEntropy = 0;
+ dctx->dictID = 0;
+ dctx->bType = bt_reserved;
+ ZSTD_STATIC_ASSERT(sizeof(dctx->entropy.rep) == sizeof(repStartValue));
+ memcpy(dctx->entropy.rep, repStartValue, sizeof(repStartValue)); /* initial repcodes */
+ dctx->LLTptr = dctx->entropy.LLTable;
+ dctx->MLTptr = dctx->entropy.MLTable;
+ dctx->OFTptr = dctx->entropy.OFTable;
+ dctx->HUFptr = dctx->entropy.hufTable;
+ return 0;
+}
+
+size_t ZSTD_decompressBegin_usingDict(ZSTD_DCtx* dctx, const void* dict, size_t dictSize)
+{
+ FORWARD_IF_ERROR( ZSTD_decompressBegin(dctx) , "");
+ if (dict && dictSize)
+ RETURN_ERROR_IF(
+ ZSTD_isError(ZSTD_decompress_insertDictionary(dctx, dict, dictSize)),
+ dictionary_corrupted, "");
+ return 0;
+}
+
+
+/* ====== ZSTD_DDict ====== */
+
+size_t ZSTD_decompressBegin_usingDDict(ZSTD_DCtx* dctx, const ZSTD_DDict* ddict)
+{
+ DEBUGLOG(4, "ZSTD_decompressBegin_usingDDict");
+ assert(dctx != NULL);
+ if (ddict) {
+ const char* const dictStart = (const char*)ZSTD_DDict_dictContent(ddict);
+ size_t const dictSize = ZSTD_DDict_dictSize(ddict);
+ const void* const dictEnd = dictStart + dictSize;
+ dctx->ddictIsCold = (dctx->dictEnd != dictEnd);
+ DEBUGLOG(4, "DDict is %s",
+ dctx->ddictIsCold ? "~cold~" : "hot!");
+ }
+ FORWARD_IF_ERROR( ZSTD_decompressBegin(dctx) , "");
+ if (ddict) { /* NULL ddict is equivalent to no dictionary */
+ ZSTD_copyDDictParameters(dctx, ddict);
+ }
+ return 0;
+}
+
+/*! ZSTD_getDictID_fromDict() :
+ * Provides the dictID stored within dictionary.
+ * if @return == 0, the dictionary is not conformant with Zstandard specification.
+ * It can still be loaded, but as a content-only dictionary. */
+unsigned ZSTD_getDictID_fromDict(const void* dict, size_t dictSize)
+{
+ if (dictSize < 8) return 0;
+ if (MEM_readLE32(dict) != ZSTD_MAGIC_DICTIONARY) return 0;
+ return MEM_readLE32((const char*)dict + ZSTD_FRAMEIDSIZE);
+}
+
+/*! ZSTD_getDictID_fromFrame() :
+ * Provides the dictID required to decompress frame stored within `src`.
+ * If @return == 0, the dictID could not be decoded.
+ *  This could be for one of the following reasons :
+ * - The frame does not require a dictionary (most common case).
+ * - The frame was built with dictID intentionally removed.
+ *    The needed dictionary is hidden information.
+ * Note : this use case also happens when using a non-conformant dictionary.
+ * - `srcSize` is too small, and as a result, frame header could not be decoded.
+ * Note : possible if `srcSize < ZSTD_FRAMEHEADERSIZE_MAX`.
+ * - This is not a Zstandard frame.
+ * When identifying the exact failure cause, it's possible to use
+ * ZSTD_getFrameHeader(), which will provide a more precise error code. */
+unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize)
+{
+ ZSTD_frameHeader zfp = { 0, 0, 0, ZSTD_frame, 0, 0, 0 };
+ size_t const hError = ZSTD_getFrameHeader(&zfp, src, srcSize);
+ if (ZSTD_isError(hError)) return 0;
+ return zfp.dictID;
+}
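+
+/*
+ * Illustrative sketch (editor's addition, not upstream code): using the two dictID
+ * helpers above to check, before decompressing, whether a candidate dictionary can
+ * match a frame. A dictID of 0 on either side means "unknown", not a mismatch.
+ */
+#if 0   /* example only, not compiled */
+#include <zstd.h>
+
+static int dictionary_may_match(const void* frame, size_t frameSize,
+                                const void* dict, size_t dictSize)
+{
+    unsigned const frameDictID = ZSTD_getDictID_fromFrame(frame, frameSize);
+    unsigned const dictDictID  = ZSTD_getDictID_fromDict(dict, dictSize);
+    if (frameDictID == 0 || dictDictID == 0) return 1;  /* cannot tell: attempt decompression anyway */
+    return frameDictID == dictDictID;
+}
+#endif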
+
+
+/*! ZSTD_decompress_usingDDict() :
+* Decompression using a pre-digested Dictionary
+* Use dictionary without significant overhead. */
+size_t ZSTD_decompress_usingDDict(ZSTD_DCtx* dctx,
+ void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize,
+ const ZSTD_DDict* ddict)
+{
+ /* pass content and size in case legacy frames are encountered */
+ return ZSTD_decompressMultiFrame(dctx, dst, dstCapacity, src, srcSize,
+ NULL, 0,
+ ddict);
+}
+
+
+/*=====================================
+* Streaming decompression
+*====================================*/
+
+ZSTD_DStream* ZSTD_createDStream(void)
+{
+ DEBUGLOG(3, "ZSTD_createDStream");
+ return ZSTD_createDStream_advanced(ZSTD_defaultCMem);
+}
+
+ZSTD_DStream* ZSTD_initStaticDStream(void *workspace, size_t workspaceSize)
+{
+ return ZSTD_initStaticDCtx(workspace, workspaceSize);
+}
+
+ZSTD_DStream* ZSTD_createDStream_advanced(ZSTD_customMem customMem)
+{
+ return ZSTD_createDCtx_advanced(customMem);
+}
+
+size_t ZSTD_freeDStream(ZSTD_DStream* zds)
+{
+ return ZSTD_freeDCtx(zds);
+}
+
+
+/* *** Initialization *** */
+
+size_t ZSTD_DStreamInSize(void) { return ZSTD_BLOCKSIZE_MAX + ZSTD_blockHeaderSize; }
+size_t ZSTD_DStreamOutSize(void) { return ZSTD_BLOCKSIZE_MAX; }
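+
+/*
+ * Illustrative sketch (editor's addition, not upstream code): the standard
+ * ZSTD_decompressStream() loop using the recommended buffer sizes above. Assumes the
+ * public userland streaming API; read_input() and write_output() are hypothetical
+ * I/O callbacks supplied by the caller.
+ */
+#if 0   /* example only, not compiled */
+#include <stdlib.h>
+#include <zstd.h>
+
+static int stream_decompress(ZSTD_DStream* zds,
+                             size_t (*read_input)(void* buf, size_t cap),       /* hypothetical */
+                             int (*write_output)(const void* buf, size_t len))  /* hypothetical */
+{
+    size_t const inCap  = ZSTD_DStreamInSize();   /* one block + block header */
+    size_t const outCap = ZSTD_DStreamOutSize();  /* one full block */
+    void* const inBuf   = malloc(inCap);
+    void* const outBuf  = malloc(outCap);
+    size_t readBytes, ret = 0;
+    int ok = (inBuf != NULL && outBuf != NULL);
+    if (ok) (void)ZSTD_initDStream(zds);
+    while (ok && (readBytes = read_input(inBuf, inCap)) != 0) {
+        ZSTD_inBuffer input = { inBuf, readBytes, 0 };
+        while (ok && input.pos < input.size) {
+            ZSTD_outBuffer output = { outBuf, outCap, 0 };
+            ret = ZSTD_decompressStream(zds, &output, &input);  /* 0 means a frame just completed */
+            ok = !ZSTD_isError(ret) && write_output(outBuf, output.pos);
+        }
+    }
+    free(inBuf); free(outBuf);
+    return (ok && ret == 0) ? 0 : -1;  /* ret != 0 : input ended inside an unfinished frame */
+}
+#endif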
+
+size_t ZSTD_DCtx_loadDictionary_advanced(ZSTD_DCtx* dctx,
+ const void* dict, size_t dictSize,
+ ZSTD_dictLoadMethod_e dictLoadMethod,
+ ZSTD_dictContentType_e dictContentType)
+{
+ RETURN_ERROR_IF(dctx->streamStage != zdss_init, stage_wrong, "");
+ ZSTD_clearDict(dctx);
+ if (dict && dictSize != 0) {
+ dctx->ddictLocal = ZSTD_createDDict_advanced(dict, dictSize, dictLoadMethod, dictContentType, dctx->customMem);
+ RETURN_ERROR_IF(dctx->ddictLocal == NULL, memory_allocation, "NULL pointer!");
+ dctx->ddict = dctx->ddictLocal;
+ dctx->dictUses = ZSTD_use_indefinitely;
+ }
+ return 0;
+}
+
+size_t ZSTD_DCtx_loadDictionary_byReference(ZSTD_DCtx* dctx, const void* dict, size_t dictSize)
+{
+ return ZSTD_DCtx_loadDictionary_advanced(dctx, dict, dictSize, ZSTD_dlm_byRef, ZSTD_dct_auto);
+}
+
+size_t ZSTD_DCtx_loadDictionary(ZSTD_DCtx* dctx, const void* dict, size_t dictSize)
+{
+ return ZSTD_DCtx_loadDictionary_advanced(dctx, dict, dictSize, ZSTD_dlm_byCopy, ZSTD_dct_auto);
+}
+
+size_t ZSTD_DCtx_refPrefix_advanced(ZSTD_DCtx* dctx, const void* prefix, size_t prefixSize, ZSTD_dictContentType_e dictContentType)
+{
+ FORWARD_IF_ERROR(ZSTD_DCtx_loadDictionary_advanced(dctx, prefix, prefixSize, ZSTD_dlm_byRef, dictContentType), "");
+ dctx->dictUses = ZSTD_use_once;
+ return 0;
+}
+
+size_t ZSTD_DCtx_refPrefix(ZSTD_DCtx* dctx, const void* prefix, size_t prefixSize)
+{
+ return ZSTD_DCtx_refPrefix_advanced(dctx, prefix, prefixSize, ZSTD_dct_rawContent);
+}
+
+
+/* ZSTD_initDStream_usingDict() :
+ * return : expected size, aka ZSTD_startingInputLength().
+ * this function cannot fail */
+size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const void* dict, size_t dictSize)
+{
+ DEBUGLOG(4, "ZSTD_initDStream_usingDict");
+ FORWARD_IF_ERROR( ZSTD_DCtx_reset(zds, ZSTD_reset_session_only) , "");
+ FORWARD_IF_ERROR( ZSTD_DCtx_loadDictionary(zds, dict, dictSize) , "");
+ return ZSTD_startingInputLength(zds->format);
+}
+
+/* note : this variant can't fail */
+size_t ZSTD_initDStream(ZSTD_DStream* zds)
+{
+ DEBUGLOG(4, "ZSTD_initDStream");
+ return ZSTD_initDStream_usingDDict(zds, NULL);
+}
+
+/* ZSTD_initDStream_usingDDict() :
+ * ddict will just be referenced, and must outlive the decompression session
+ * this function cannot fail */
+size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* dctx, const ZSTD_DDict* ddict)
+{
+ FORWARD_IF_ERROR( ZSTD_DCtx_reset(dctx, ZSTD_reset_session_only) , "");
+ FORWARD_IF_ERROR( ZSTD_DCtx_refDDict(dctx, ddict) , "");
+ return ZSTD_startingInputLength(dctx->format);
+}
+
+/* ZSTD_resetDStream() :
+ * return : expected size, aka ZSTD_startingInputLength().
+ * this function cannot fail */
+size_t ZSTD_resetDStream(ZSTD_DStream* dctx)
+{
+ FORWARD_IF_ERROR(ZSTD_DCtx_reset(dctx, ZSTD_reset_session_only), "");
+ return ZSTD_startingInputLength(dctx->format);
+}
+
+
+size_t ZSTD_DCtx_refDDict(ZSTD_DCtx* dctx, const ZSTD_DDict* ddict)
+{
+ RETURN_ERROR_IF(dctx->streamStage != zdss_init, stage_wrong, "");
+ ZSTD_clearDict(dctx);
+ if (ddict) {
+ dctx->ddict = ddict;
+ dctx->dictUses = ZSTD_use_indefinitely;
+ }
+ return 0;
+}
+
+/* ZSTD_DCtx_setMaxWindowSize() :
+ * note : no direct equivalence in ZSTD_DCtx_setParameter,
+ * since this version sets windowSize, and the other sets windowLog */
+size_t ZSTD_DCtx_setMaxWindowSize(ZSTD_DCtx* dctx, size_t maxWindowSize)
+{
+ ZSTD_bounds const bounds = ZSTD_dParam_getBounds(ZSTD_d_windowLogMax);
+ size_t const min = (size_t)1 << bounds.lowerBound;
+ size_t const max = (size_t)1 << bounds.upperBound;
+ RETURN_ERROR_IF(dctx->streamStage != zdss_init, stage_wrong, "");
+ RETURN_ERROR_IF(maxWindowSize < min, parameter_outOfBound, "");
+ RETURN_ERROR_IF(maxWindowSize > max, parameter_outOfBound, "");
+ dctx->maxWindowSize = maxWindowSize;
+ return 0;
+}
+
+size_t ZSTD_DCtx_setFormat(ZSTD_DCtx* dctx, ZSTD_format_e format)
+{
+ return ZSTD_DCtx_setParameter(dctx, ZSTD_d_format, format);
+}
+
+ZSTD_bounds ZSTD_dParam_getBounds(ZSTD_dParameter dParam)
+{
+ ZSTD_bounds bounds = { 0, 0, 0 };
+ switch(dParam) {
+ case ZSTD_d_windowLogMax:
+ bounds.lowerBound = ZSTD_WINDOWLOG_ABSOLUTEMIN;
+ bounds.upperBound = ZSTD_WINDOWLOG_MAX;
+ return bounds;
+ case ZSTD_d_format:
+ bounds.lowerBound = (int)ZSTD_f_zstd1;
+ bounds.upperBound = (int)ZSTD_f_zstd1_magicless;
+ ZSTD_STATIC_ASSERT(ZSTD_f_zstd1 < ZSTD_f_zstd1_magicless);
+ return bounds;
+ case ZSTD_d_stableOutBuffer:
+ bounds.lowerBound = (int)ZSTD_obm_buffered;
+ bounds.upperBound = (int)ZSTD_obm_stable;
+ return bounds;
+ default:;
+ }
+ bounds.error = ERROR(parameter_unsupported);
+ return bounds;
+}
+
+/* ZSTD_dParam_withinBounds:
+ * @return 1 if value is within dParam bounds,
+ * 0 otherwise */
+static int ZSTD_dParam_withinBounds(ZSTD_dParameter dParam, int value)
+{
+ ZSTD_bounds const bounds = ZSTD_dParam_getBounds(dParam);
+ if (ZSTD_isError(bounds.error)) return 0;
+ if (value < bounds.lowerBound) return 0;
+ if (value > bounds.upperBound) return 0;
+ return 1;
+}
+
+#define CHECK_DBOUNDS(p,v) { \
+ RETURN_ERROR_IF(!ZSTD_dParam_withinBounds(p, v), parameter_outOfBound, ""); \
+}
+
+size_t ZSTD_DCtx_setParameter(ZSTD_DCtx* dctx, ZSTD_dParameter dParam, int value)
+{
+ RETURN_ERROR_IF(dctx->streamStage != zdss_init, stage_wrong, "");
+ switch(dParam) {
+ case ZSTD_d_windowLogMax:
+ if (value == 0) value = ZSTD_WINDOWLOG_LIMIT_DEFAULT;
+ CHECK_DBOUNDS(ZSTD_d_windowLogMax, value);
+ dctx->maxWindowSize = ((size_t)1) << value;
+ return 0;
+ case ZSTD_d_format:
+ CHECK_DBOUNDS(ZSTD_d_format, value);
+ dctx->format = (ZSTD_format_e)value;
+ return 0;
+ case ZSTD_d_stableOutBuffer:
+ CHECK_DBOUNDS(ZSTD_d_stableOutBuffer, value);
+ dctx->outBufferMode = (ZSTD_outBufferMode_e)value;
+ return 0;
+ default:;
+ }
+ RETURN_ERROR(parameter_unsupported, "");
+}
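+
+/*
+ * Illustrative sketch (editor's addition, not upstream code): raising the window
+ * limit for trusted input via ZSTD_DCtx_setParameter(); this is the log2 counterpart
+ * of ZSTD_DCtx_setMaxWindowSize() above. Public userland <zstd.h> API assumed.
+ */
+#if 0   /* example only, not compiled */
+#include <zstd.h>
+
+static size_t allow_large_windows(ZSTD_DCtx* dctx)
+{
+    /* Accept frames requiring up to a 1 GiB window (2^30); the default limit is
+     * ZSTD_WINDOWLOG_LIMIT_DEFAULT (27, i.e. 128 MiB). Larger windows cost memory,
+     * so only raise this for trusted inputs. */
+    return ZSTD_DCtx_setParameter(dctx, ZSTD_d_windowLogMax, 30);  /* check with ZSTD_isError() */
+}
+#endif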
+
+size_t ZSTD_DCtx_reset(ZSTD_DCtx* dctx, ZSTD_ResetDirective reset)
+{
+ if ( (reset == ZSTD_reset_session_only)
+ || (reset == ZSTD_reset_session_and_parameters) ) {
+ dctx->streamStage = zdss_init;
+ dctx->noForwardProgress = 0;
+ }
+ if ( (reset == ZSTD_reset_parameters)
+ || (reset == ZSTD_reset_session_and_parameters) ) {
+ RETURN_ERROR_IF(dctx->streamStage != zdss_init, stage_wrong, "");
+ ZSTD_clearDict(dctx);
+ dctx->format = ZSTD_f_zstd1;
+ dctx->maxWindowSize = ZSTD_MAXWINDOWSIZE_DEFAULT;
+ }
+ return 0;
+}
+
+
+size_t ZSTD_sizeof_DStream(const ZSTD_DStream* dctx)
+{
+ return ZSTD_sizeof_DCtx(dctx);
+}
+
+size_t ZSTD_decodingBufferSize_min(unsigned long long windowSize, unsigned long long frameContentSize)
+{
+ size_t const blockSize = (size_t) MIN(windowSize, ZSTD_BLOCKSIZE_MAX);
+ unsigned long long const neededRBSize = windowSize + blockSize + (WILDCOPY_OVERLENGTH * 2);
+ unsigned long long const neededSize = MIN(frameContentSize, neededRBSize);
+ size_t const minRBSize = (size_t) neededSize;
+ RETURN_ERROR_IF((unsigned long long)minRBSize != neededSize,
+ frameParameter_windowTooLarge, "");
+ return minRBSize;
+}
+
+size_t ZSTD_estimateDStreamSize(size_t windowSize)
+{
+ size_t const blockSize = MIN(windowSize, ZSTD_BLOCKSIZE_MAX);
+ size_t const inBuffSize = blockSize; /* no block can be larger */
+ size_t const outBuffSize = ZSTD_decodingBufferSize_min(windowSize, ZSTD_CONTENTSIZE_UNKNOWN);
+ return ZSTD_estimateDCtxSize() + inBuffSize + outBuffSize;
+}
+
+size_t ZSTD_estimateDStreamSize_fromFrame(const void* src, size_t srcSize)
+{
+ U32 const windowSizeMax = 1U << ZSTD_WINDOWLOG_MAX; /* note : should be user-selectable, but requires an additional parameter (or a dctx) */
+ ZSTD_frameHeader zfh;
+ size_t const err = ZSTD_getFrameHeader(&zfh, src, srcSize);
+ if (ZSTD_isError(err)) return err;
+ RETURN_ERROR_IF(err>0, srcSize_wrong, "");
+ RETURN_ERROR_IF(zfh.windowSize > windowSizeMax,
+ frameParameter_windowTooLarge, "");
+ return ZSTD_estimateDStreamSize((size_t)zfh.windowSize);
+}
+
+
+/* ***** Decompression ***** */
+
+static int ZSTD_DCtx_isOverflow(ZSTD_DStream* zds, size_t const neededInBuffSize, size_t const neededOutBuffSize)
+{
+ return (zds->inBuffSize + zds->outBuffSize) >= (neededInBuffSize + neededOutBuffSize) * ZSTD_WORKSPACETOOLARGE_FACTOR;
+}
+
+static void ZSTD_DCtx_updateOversizedDuration(ZSTD_DStream* zds, size_t const neededInBuffSize, size_t const neededOutBuffSize)
+{
+ if (ZSTD_DCtx_isOverflow(zds, neededInBuffSize, neededOutBuffSize))
+ zds->oversizedDuration++;
+ else
+ zds->oversizedDuration = 0;
+}
+
+static int ZSTD_DCtx_isOversizedTooLong(ZSTD_DStream* zds)
+{
+ return zds->oversizedDuration >= ZSTD_WORKSPACETOOLARGE_MAXDURATION;
+}
+
+/* Checks that the output buffer hasn't changed if ZSTD_obm_stable is used. */
+static size_t ZSTD_checkOutBuffer(ZSTD_DStream const* zds, ZSTD_outBuffer const* output)
+{
+ ZSTD_outBuffer const expect = zds->expectedOutBuffer;
+ /* No requirement when ZSTD_obm_stable is not enabled. */
+ if (zds->outBufferMode != ZSTD_obm_stable)
+ return 0;
+    /* Any buffer is allowed in zdss_init; after that, the same buffer must be
+     * provided for every call until the context is reset.
+     */
+ if (zds->streamStage == zdss_init)
+ return 0;
+ /* The buffer must match our expectation exactly. */
+ if (expect.dst == output->dst && expect.pos == output->pos && expect.size == output->size)
+ return 0;
+ RETURN_ERROR(dstBuffer_wrong, "ZSTD_obm_stable enabled but output differs!");
+}
+
+/* Calls ZSTD_decompressContinue() with the right parameters for ZSTD_decompressStream()
+ * and updates the stage and the output buffer state. This call is extracted so it can be
+ * used both when reading directly from the ZSTD_inBuffer, and in buffered input mode.
+ * NOTE: You must break after calling this function since the streamStage is modified.
+ */
+static size_t ZSTD_decompressContinueStream(
+ ZSTD_DStream* zds, char** op, char* oend,
+ void const* src, size_t srcSize) {
+ int const isSkipFrame = ZSTD_isSkipFrame(zds);
+ if (zds->outBufferMode == ZSTD_obm_buffered) {
+ size_t const dstSize = isSkipFrame ? 0 : zds->outBuffSize - zds->outStart;
+ size_t const decodedSize = ZSTD_decompressContinue(zds,
+ zds->outBuff + zds->outStart, dstSize, src, srcSize);
+ FORWARD_IF_ERROR(decodedSize, "");
+ if (!decodedSize && !isSkipFrame) {
+ zds->streamStage = zdss_read;
+ } else {
+ zds->outEnd = zds->outStart + decodedSize;
+ zds->streamStage = zdss_flush;
+ }
+ } else {
+ /* Write directly into the output buffer */
+ size_t const dstSize = isSkipFrame ? 0 : oend - *op;
+ size_t const decodedSize = ZSTD_decompressContinue(zds, *op, dstSize, src, srcSize);
+ FORWARD_IF_ERROR(decodedSize, "");
+ *op += decodedSize;
+ /* Flushing is not needed. */
+ zds->streamStage = zdss_read;
+ assert(*op <= oend);
+ assert(zds->outBufferMode == ZSTD_obm_stable);
+ }
+ return 0;
+}
+
+size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inBuffer* input)
+{
+ const char* const src = (const char*)input->src;
+ const char* const istart = input->pos != 0 ? src + input->pos : src;
+ const char* const iend = input->size != 0 ? src + input->size : src;
+ const char* ip = istart;
+ char* const dst = (char*)output->dst;
+ char* const ostart = output->pos != 0 ? dst + output->pos : dst;
+ char* const oend = output->size != 0 ? dst + output->size : dst;
+ char* op = ostart;
+ U32 someMoreWork = 1;
+
+ DEBUGLOG(5, "ZSTD_decompressStream");
+ RETURN_ERROR_IF(
+ input->pos > input->size,
+ srcSize_wrong,
+ "forbidden. in: pos: %u vs size: %u",
+ (U32)input->pos, (U32)input->size);
+ RETURN_ERROR_IF(
+ output->pos > output->size,
+ dstSize_tooSmall,
+ "forbidden. out: pos: %u vs size: %u",
+ (U32)output->pos, (U32)output->size);
+ DEBUGLOG(5, "input size : %u", (U32)(input->size - input->pos));
+ FORWARD_IF_ERROR(ZSTD_checkOutBuffer(zds, output), "");
+
+ while (someMoreWork) {
+ switch(zds->streamStage)
+ {
+ case zdss_init :
+ DEBUGLOG(5, "stage zdss_init => transparent reset ");
+ zds->streamStage = zdss_loadHeader;
+ zds->lhSize = zds->inPos = zds->outStart = zds->outEnd = 0;
+ zds->legacyVersion = 0;
+ zds->hostageByte = 0;
+ zds->expectedOutBuffer = *output;
+ /* fall-through */
+
+ case zdss_loadHeader :
+ DEBUGLOG(5, "stage zdss_loadHeader (srcSize : %u)", (U32)(iend - ip));
+#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT>=1)
+ if (zds->legacyVersion) {
+ RETURN_ERROR_IF(zds->staticSize, memory_allocation,
+ "legacy support is incompatible with static dctx");
+ { size_t const hint = ZSTD_decompressLegacyStream(zds->legacyContext, zds->legacyVersion, output, input);
+ if (hint==0) zds->streamStage = zdss_init;
+ return hint;
+ } }
+#endif
+ { size_t const hSize = ZSTD_getFrameHeader_advanced(&zds->fParams, zds->headerBuffer, zds->lhSize, zds->format);
+ DEBUGLOG(5, "header size : %u", (U32)hSize);
+ if (ZSTD_isError(hSize)) {
+#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT>=1)
+ U32 const legacyVersion = ZSTD_isLegacy(istart, iend-istart);
+ if (legacyVersion) {
+ ZSTD_DDict const* const ddict = ZSTD_getDDict(zds);
+ const void* const dict = ddict ? ZSTD_DDict_dictContent(ddict) : NULL;
+ size_t const dictSize = ddict ? ZSTD_DDict_dictSize(ddict) : 0;
+ DEBUGLOG(5, "ZSTD_decompressStream: detected legacy version v0.%u", legacyVersion);
+ RETURN_ERROR_IF(zds->staticSize, memory_allocation,
+ "legacy support is incompatible with static dctx");
+ FORWARD_IF_ERROR(ZSTD_initLegacyStream(&zds->legacyContext,
+ zds->previousLegacyVersion, legacyVersion,
+ dict, dictSize), "");
+ zds->legacyVersion = zds->previousLegacyVersion = legacyVersion;
+ { size_t const hint = ZSTD_decompressLegacyStream(zds->legacyContext, legacyVersion, output, input);
+ if (hint==0) zds->streamStage = zdss_init; /* or stay in stage zdss_loadHeader */
+ return hint;
+ } }
+#endif
+ return hSize; /* error */
+ }
+ if (hSize != 0) { /* need more input */
+ size_t const toLoad = hSize - zds->lhSize; /* if hSize!=0, hSize > zds->lhSize */
+ size_t const remainingInput = (size_t)(iend-ip);
+ assert(iend >= ip);
+ if (toLoad > remainingInput) { /* not enough input to load full header */
+ if (remainingInput > 0) {
+ memcpy(zds->headerBuffer + zds->lhSize, ip, remainingInput);
+ zds->lhSize += remainingInput;
+ }
+ input->pos = input->size;
+ return (MAX((size_t)ZSTD_FRAMEHEADERSIZE_MIN(zds->format), hSize) - zds->lhSize) + ZSTD_blockHeaderSize; /* remaining header bytes + next block header */
+ }
+ assert(ip != NULL);
+ memcpy(zds->headerBuffer + zds->lhSize, ip, toLoad); zds->lhSize = hSize; ip += toLoad;
+ break;
+ } }
+
+ /* check for single-pass mode opportunity */
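+            /* If the entire frame content fits in the remaining output buffer and the
+             * complete compressed frame is already present in the input, decompress it
+             * in one shot, bypassing the internal streaming buffers. */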
+ if (zds->fParams.frameContentSize != ZSTD_CONTENTSIZE_UNKNOWN
+ && zds->fParams.frameType != ZSTD_skippableFrame
+ && (U64)(size_t)(oend-op) >= zds->fParams.frameContentSize) {
+ size_t const cSize = ZSTD_findFrameCompressedSize(istart, iend-istart);
+ if (cSize <= (size_t)(iend-istart)) {
+ /* shortcut : using single-pass mode */
+ size_t const decompressedSize = ZSTD_decompress_usingDDict(zds, op, oend-op, istart, cSize, ZSTD_getDDict(zds));
+ if (ZSTD_isError(decompressedSize)) return decompressedSize;
+ DEBUGLOG(4, "shortcut to single-pass ZSTD_decompress_usingDDict()")
+ ip = istart + cSize;
+ op += decompressedSize;
+ zds->expected = 0;
+ zds->streamStage = zdss_init;
+ someMoreWork = 0;
+ break;
+ } }
+
+            /* Check that the output buffer is large enough for ZSTD_obm_stable. */
+ if (zds->outBufferMode == ZSTD_obm_stable
+ && zds->fParams.frameType != ZSTD_skippableFrame
+ && zds->fParams.frameContentSize != ZSTD_CONTENTSIZE_UNKNOWN
+ && (U64)(size_t)(oend-op) < zds->fParams.frameContentSize) {
+ RETURN_ERROR(dstSize_tooSmall, "ZSTD_obm_stable passed but ZSTD_outBuffer is too small");
+ }
+
+ /* Consume header (see ZSTDds_decodeFrameHeader) */
+ DEBUGLOG(4, "Consume header");
+ FORWARD_IF_ERROR(ZSTD_decompressBegin_usingDDict(zds, ZSTD_getDDict(zds)), "");
+
+ if ((MEM_readLE32(zds->headerBuffer) & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { /* skippable frame */
+ zds->expected = MEM_readLE32(zds->headerBuffer + ZSTD_FRAMEIDSIZE);
+ zds->stage = ZSTDds_skipFrame;
+ } else {
+ FORWARD_IF_ERROR(ZSTD_decodeFrameHeader(zds, zds->headerBuffer, zds->lhSize), "");
+ zds->expected = ZSTD_blockHeaderSize;
+ zds->stage = ZSTDds_decodeBlockHeader;
+ }
+
+ /* control buffer memory usage */
+ DEBUGLOG(4, "Control max memory usage (%u KB <= max %u KB)",
+ (U32)(zds->fParams.windowSize >>10),
+ (U32)(zds->maxWindowSize >> 10) );
+ zds->fParams.windowSize = MAX(zds->fParams.windowSize, 1U << ZSTD_WINDOWLOG_ABSOLUTEMIN);
+ RETURN_ERROR_IF(zds->fParams.windowSize > zds->maxWindowSize,
+ frameParameter_windowTooLarge, "");
+
+ /* Adapt buffer sizes to frame header instructions */
+ { size_t const neededInBuffSize = MAX(zds->fParams.blockSizeMax, 4 /* frame checksum */);
+ size_t const neededOutBuffSize = zds->outBufferMode == ZSTD_obm_buffered
+ ? ZSTD_decodingBufferSize_min(zds->fParams.windowSize, zds->fParams.frameContentSize)
+ : 0;
+
+ ZSTD_DCtx_updateOversizedDuration(zds, neededInBuffSize, neededOutBuffSize);
+
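+                /* Reallocate when the current buffers are too small for this frame,
+                 * or when they have stayed oversized for too many consecutive frames,
+                 * so that a single unusually large frame does not pin memory forever. */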
+ { int const tooSmall = (zds->inBuffSize < neededInBuffSize) || (zds->outBuffSize < neededOutBuffSize);
+ int const tooLarge = ZSTD_DCtx_isOversizedTooLong(zds);
+
+ if (tooSmall || tooLarge) {
+ size_t const bufferSize = neededInBuffSize + neededOutBuffSize;
+ DEBUGLOG(4, "inBuff : from %u to %u",
+ (U32)zds->inBuffSize, (U32)neededInBuffSize);
+ DEBUGLOG(4, "outBuff : from %u to %u",
+ (U32)zds->outBuffSize, (U32)neededOutBuffSize);
+ if (zds->staticSize) { /* static DCtx */
+ DEBUGLOG(4, "staticSize : %u", (U32)zds->staticSize);
+ assert(zds->staticSize >= sizeof(ZSTD_DCtx)); /* controlled at init */
+ RETURN_ERROR_IF(
+ bufferSize > zds->staticSize - sizeof(ZSTD_DCtx),
+ memory_allocation, "");
+ } else {
+ ZSTD_free(zds->inBuff, zds->customMem);
+ zds->inBuffSize = 0;
+ zds->outBuffSize = 0;
+ zds->inBuff = (char*)ZSTD_malloc(bufferSize, zds->customMem);
+ RETURN_ERROR_IF(zds->inBuff == NULL, memory_allocation, "");
+ }
+ zds->inBuffSize = neededInBuffSize;
+ zds->outBuff = zds->inBuff + zds->inBuffSize;
+ zds->outBuffSize = neededOutBuffSize;
+ } } }
+ zds->streamStage = zdss_read;
+ /* fall-through */
+
+ case zdss_read:
+ DEBUGLOG(5, "stage zdss_read");
+ { size_t const neededInSize = ZSTD_nextSrcSizeToDecompressWithInputSize(zds, iend - ip);
+ DEBUGLOG(5, "neededInSize = %u", (U32)neededInSize);
+ if (neededInSize==0) { /* end of frame */
+ zds->streamStage = zdss_init;
+ someMoreWork = 0;
+ break;
+ }
+ if ((size_t)(iend-ip) >= neededInSize) { /* decode directly from src */
+ FORWARD_IF_ERROR(ZSTD_decompressContinueStream(zds, &op, oend, ip, neededInSize), "");
+ ip += neededInSize;
+ /* Function modifies the stage so we must break */
+ break;
+ } }
+ if (ip==iend) { someMoreWork = 0; break; } /* no more input */
+ zds->streamStage = zdss_load;
+ /* fall-through */
+
+ case zdss_load:
+ { size_t const neededInSize = ZSTD_nextSrcSizeToDecompress(zds);
+ size_t const toLoad = neededInSize - zds->inPos;
+ int const isSkipFrame = ZSTD_isSkipFrame(zds);
+ size_t loadedSize;
+ /* At this point we shouldn't be decompressing a block that we can stream. */
+ assert(neededInSize == ZSTD_nextSrcSizeToDecompressWithInputSize(zds, iend - ip));
+ if (isSkipFrame) {
+ loadedSize = MIN(toLoad, (size_t)(iend-ip));
+ } else {
+ RETURN_ERROR_IF(toLoad > zds->inBuffSize - zds->inPos,
+ corruption_detected,
+ "should never happen");
+ loadedSize = ZSTD_limitCopy(zds->inBuff + zds->inPos, toLoad, ip, iend-ip);
+ }
+ ip += loadedSize;
+ zds->inPos += loadedSize;
+ if (loadedSize < toLoad) { someMoreWork = 0; break; } /* not enough input, wait for more */
+
+ /* decode loaded input */
+ zds->inPos = 0; /* input is consumed */
+ FORWARD_IF_ERROR(ZSTD_decompressContinueStream(zds, &op, oend, zds->inBuff, neededInSize), "");
+ /* Function modifies the stage so we must break */
+ break;
+ }
+ case zdss_flush:
+ { size_t const toFlushSize = zds->outEnd - zds->outStart;
+ size_t const flushedSize = ZSTD_limitCopy(op, oend-op, zds->outBuff + zds->outStart, toFlushSize);
+ op += flushedSize;
+ zds->outStart += flushedSize;
+ if (flushedSize == toFlushSize) { /* flush completed */
+ zds->streamStage = zdss_read;
+ if ( (zds->outBuffSize < zds->fParams.frameContentSize)
+ && (zds->outStart + zds->fParams.blockSizeMax > zds->outBuffSize) ) {
+ DEBUGLOG(5, "restart filling outBuff from beginning (left:%i, needed:%u)",
+ (int)(zds->outBuffSize - zds->outStart),
+ (U32)zds->fParams.blockSizeMax);
+ zds->outStart = zds->outEnd = 0;
+ }
+ break;
+ } }
+ /* cannot complete flush */
+ someMoreWork = 0;
+ break;
+
+ default:
+ assert(0); /* impossible */
+            RETURN_ERROR(GENERIC, "impossible to reach");   /* some compilers require default to do something */
+ } }
+
+ /* result */
+ input->pos = (size_t)(ip - (const char*)(input->src));
+ output->pos = (size_t)(op - (char*)(output->dst));
+
+ /* Update the expected output buffer for ZSTD_obm_stable. */
+ zds->expectedOutBuffer = *output;
+
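+    /* Guard against callers looping without providing more input or more output room :
+     * after too many consecutive calls with no forward progress, return an explicit error. */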
+ if ((ip==istart) && (op==ostart)) { /* no forward progress */
+ zds->noForwardProgress ++;
+ if (zds->noForwardProgress >= ZSTD_NO_FORWARD_PROGRESS_MAX) {
+ RETURN_ERROR_IF(op==oend, dstSize_tooSmall, "");
+ RETURN_ERROR_IF(ip==iend, srcSize_wrong, "");
+ assert(0);
+ }
+ } else {
+ zds->noForwardProgress = 0;
+ }
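+    /* Return a hint for the next source size. When the frame is fully decoded but some
+     * buffered output has not been flushed yet, the last input byte is kept "hostage"
+     * (input->pos is decremented) so that the caller keeps invoking
+     * ZSTD_decompressStream() until the remaining output has been delivered. */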
+ { size_t nextSrcSizeHint = ZSTD_nextSrcSizeToDecompress(zds);
+ if (!nextSrcSizeHint) { /* frame fully decoded */
+ if (zds->outEnd == zds->outStart) { /* output fully flushed */
+ if (zds->hostageByte) {
+ if (input->pos >= input->size) {
+ /* can't release hostage (not present) */
+ zds->streamStage = zdss_read;
+ return 1;
+ }
+ input->pos++; /* release hostage */
+ } /* zds->hostageByte */
+ return 0;
+ } /* zds->outEnd == zds->outStart */
+ if (!zds->hostageByte) { /* output not fully flushed; keep last byte as hostage; will be released when all output is flushed */
+ input->pos--; /* note : pos > 0, otherwise, impossible to finish reading last block */
+ zds->hostageByte=1;
+ }
+ return 1;
+ } /* nextSrcSizeHint==0 */
+ nextSrcSizeHint += ZSTD_blockHeaderSize * (ZSTD_nextInputType(zds) == ZSTDnit_block); /* preload header of next block */
+ assert(zds->inPos <= nextSrcSizeHint);
+ nextSrcSizeHint -= zds->inPos; /* part already loaded*/
+ return nextSrcSizeHint;
+ }
+}
+
+size_t ZSTD_decompressStream_simpleArgs (
+ ZSTD_DCtx* dctx,
+ void* dst, size_t dstCapacity, size_t* dstPos,
+ const void* src, size_t srcSize, size_t* srcPos)
+{
+ ZSTD_outBuffer output = { dst, dstCapacity, *dstPos };
+ ZSTD_inBuffer input = { src, srcSize, *srcPos };
+ /* ZSTD_compress_generic() will check validity of dstPos and srcPos */
+ size_t const cErr = ZSTD_decompressStream(dctx, &output, &input);
+ *dstPos = output.pos;
+ *srcPos = input.pos;
+ return cErr;
+}
+/**** ended inlining decompress/zstd_decompress.c ****/
+/**** start inlining decompress/zstd_decompress_block.c ****/
+/*
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+/* zstd_decompress_block :
+ * this module takes care of decompressing _compressed_ blocks */
+
+/*-*******************************************************
+* Dependencies
+*********************************************************/
+#include <string.h> /* memcpy, memmove, memset */
+/**** skipping file: ../common/compiler.h ****/
+/**** skipping file: ../common/cpu.h ****/
+/**** skipping file: ../common/mem.h ****/
+#define FSE_STATIC_LINKING_ONLY
+/**** skipping file: ../common/fse.h ****/
+#define HUF_STATIC_LINKING_ONLY
+/**** skipping file: ../common/huf.h ****/
+/**** skipping file: ../common/zstd_internal.h ****/
+/**** skipping file: zstd_decompress_internal.h ****/
+/**** skipping file: zstd_ddict.h ****/
+/**** skipping file: zstd_decompress_block.h ****/
+
+/*_*******************************************************
+* Macros
+**********************************************************/
+
+/* These two optional macros each force the use of one of the two
+ * ZSTD_decompressSequences implementations. They cannot both be defined
+ * at the same time.
+ */
+#if defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \
+ defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG)
+#error "Cannot force the use of the short and the long ZSTD_decompressSequences variants!"
+#endif
+
+
+/*_*******************************************************
+* Memory operations
+**********************************************************/
+static void ZSTD_copy4(void* dst, const void* src) { memcpy(dst, src, 4); }
+
+
+/*-*************************************************************
+ * Block decoding
+ ***************************************************************/
+
+/*! ZSTD_getcBlockSize() :
+ * Provides the size of compressed block from block header `src` */
+size_t ZSTD_getcBlockSize(const void* src, size_t srcSize,
+ blockProperties_t* bpPtr)
+{
+ RETURN_ERROR_IF(srcSize < ZSTD_blockHeaderSize, srcSize_wrong, "");
+
+ { U32 const cBlockHeader = MEM_readLE24(src);
+ U32 const cSize = cBlockHeader >> 3;
+ bpPtr->lastBlock = cBlockHeader & 1;
+ bpPtr->blockType = (blockType_e)((cBlockHeader >> 1) & 3);
+ bpPtr->origSize = cSize; /* only useful for RLE */
+ if (bpPtr->blockType == bt_rle) return 1;
+ RETURN_ERROR_IF(bpPtr->blockType == bt_reserved, corruption_detected, "");
+ return cSize;
+ }
+}
+
+
+/* Hidden declaration for fullbench */
+size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
+ const void* src, size_t srcSize);
+/*! ZSTD_decodeLiteralsBlock() :
+ * @return : nb of bytes read from src (< srcSize )
+ * note : symbol not declared but exposed for fullbench */
+size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx,
+ const void* src, size_t srcSize) /* note : srcSize < BLOCKSIZE */
+{
+ DEBUGLOG(5, "ZSTD_decodeLiteralsBlock");
+ RETURN_ERROR_IF(srcSize < MIN_CBLOCK_SIZE, corruption_detected, "");
+
+ { const BYTE* const istart = (const BYTE*) src;
+ symbolEncodingType_e const litEncType = (symbolEncodingType_e)(istart[0] & 3);
+
+ switch(litEncType)
+ {
+ case set_repeat:
+ DEBUGLOG(5, "set_repeat flag : re-using stats from previous compressed literals block");
+ RETURN_ERROR_IF(dctx->litEntropy==0, dictionary_corrupted, "");
+ /* fall-through */
+
+ case set_compressed:
+ RETURN_ERROR_IF(srcSize < 5, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 3; here we need up to 5 for case 3");
+ { size_t lhSize, litSize, litCSize;
+ U32 singleStream=0;
+ U32 const lhlCode = (istart[0] >> 2) & 3;
+ U32 const lhc = MEM_readLE32(istart);
+ size_t hufSuccess;
+ switch(lhlCode)
+ {
+            case 0: case 1: default:   /* note : default is impossible, since lhlCode is in [0..3] */
+ /* 2 - 2 - 10 - 10 */
+ singleStream = !lhlCode;
+ lhSize = 3;
+ litSize = (lhc >> 4) & 0x3FF;
+ litCSize = (lhc >> 14) & 0x3FF;
+ break;
+ case 2:
+ /* 2 - 2 - 14 - 14 */
+ lhSize = 4;
+ litSize = (lhc >> 4) & 0x3FFF;
+ litCSize = lhc >> 18;
+ break;
+ case 3:
+ /* 2 - 2 - 18 - 18 */
+ lhSize = 5;
+ litSize = (lhc >> 4) & 0x3FFFF;
+ litCSize = (lhc >> 22) + ((size_t)istart[4] << 10);
+ break;
+ }
+ RETURN_ERROR_IF(litSize > ZSTD_BLOCKSIZE_MAX, corruption_detected, "");
+ RETURN_ERROR_IF(litCSize + lhSize > srcSize, corruption_detected, "");
+
+ /* prefetch huffman table if cold */
+ if (dctx->ddictIsCold && (litSize > 768 /* heuristic */)) {
+ PREFETCH_AREA(dctx->HUFptr, sizeof(dctx->entropy.hufTable));
+ }
+
+ if (litEncType==set_repeat) {
+ if (singleStream) {
+ hufSuccess = HUF_decompress1X_usingDTable_bmi2(
+ dctx->litBuffer, litSize, istart+lhSize, litCSize,
+ dctx->HUFptr, dctx->bmi2);
+ } else {
+ hufSuccess = HUF_decompress4X_usingDTable_bmi2(
+ dctx->litBuffer, litSize, istart+lhSize, litCSize,
+ dctx->HUFptr, dctx->bmi2);
+ }
+ } else {
+ if (singleStream) {
+#if defined(HUF_FORCE_DECOMPRESS_X2)
+ hufSuccess = HUF_decompress1X_DCtx_wksp(
+ dctx->entropy.hufTable, dctx->litBuffer, litSize,
+ istart+lhSize, litCSize, dctx->workspace,
+ sizeof(dctx->workspace));
+#else
+ hufSuccess = HUF_decompress1X1_DCtx_wksp_bmi2(
+ dctx->entropy.hufTable, dctx->litBuffer, litSize,
+ istart+lhSize, litCSize, dctx->workspace,
+ sizeof(dctx->workspace), dctx->bmi2);
+#endif
+ } else {
+ hufSuccess = HUF_decompress4X_hufOnly_wksp_bmi2(
+ dctx->entropy.hufTable, dctx->litBuffer, litSize,
+ istart+lhSize, litCSize, dctx->workspace,
+ sizeof(dctx->workspace), dctx->bmi2);
+ }
+ }
+
+ RETURN_ERROR_IF(HUF_isError(hufSuccess), corruption_detected, "");
+
+ dctx->litPtr = dctx->litBuffer;
+ dctx->litSize = litSize;
+ dctx->litEntropy = 1;
+ if (litEncType==set_compressed) dctx->HUFptr = dctx->entropy.hufTable;
+ memset(dctx->litBuffer + dctx->litSize, 0, WILDCOPY_OVERLENGTH);
+ return litCSize + lhSize;
+ }
+
+ case set_basic:
+ { size_t litSize, lhSize;
+ U32 const lhlCode = ((istart[0]) >> 2) & 3;
+ switch(lhlCode)
+ {
+            case 0: case 2: default:   /* note : default is impossible, since lhlCode is in [0..3] */
+ lhSize = 1;
+ litSize = istart[0] >> 3;
+ break;
+ case 1:
+ lhSize = 2;
+ litSize = MEM_readLE16(istart) >> 4;
+ break;
+ case 3:
+ lhSize = 3;
+ litSize = MEM_readLE24(istart) >> 4;
+ break;
+ }
+
+ if (lhSize+litSize+WILDCOPY_OVERLENGTH > srcSize) { /* risk reading beyond src buffer with wildcopy */
+ RETURN_ERROR_IF(litSize+lhSize > srcSize, corruption_detected, "");
+ memcpy(dctx->litBuffer, istart+lhSize, litSize);
+ dctx->litPtr = dctx->litBuffer;
+ dctx->litSize = litSize;
+ memset(dctx->litBuffer + dctx->litSize, 0, WILDCOPY_OVERLENGTH);
+ return lhSize+litSize;
+ }
+ /* direct reference into compressed stream */
+ dctx->litPtr = istart+lhSize;
+ dctx->litSize = litSize;
+ return lhSize+litSize;
+ }
+
+ case set_rle:
+ { U32 const lhlCode = ((istart[0]) >> 2) & 3;
+ size_t litSize, lhSize;
+ switch(lhlCode)
+ {
+            case 0: case 2: default:   /* note : default is impossible, since lhlCode is in [0..3] */
+ lhSize = 1;
+ litSize = istart[0] >> 3;
+ break;
+ case 1:
+ lhSize = 2;
+ litSize = MEM_readLE16(istart) >> 4;
+ break;
+ case 3:
+ lhSize = 3;
+ litSize = MEM_readLE24(istart) >> 4;
+ RETURN_ERROR_IF(srcSize<4, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 3; here we need lhSize+1 = 4");
+ break;
+ }
+ RETURN_ERROR_IF(litSize > ZSTD_BLOCKSIZE_MAX, corruption_detected, "");
+ memset(dctx->litBuffer, istart[lhSize], litSize + WILDCOPY_OVERLENGTH);
+ dctx->litPtr = dctx->litBuffer;
+ dctx->litSize = litSize;
+ return lhSize+1;
+ }
+ default:
+ RETURN_ERROR(corruption_detected, "impossible");
+ }
+ }
+}
+
+/* Default FSE distribution tables.
+ * These are pre-calculated FSE decoding tables, using the default distributions as defined in the specification :
+ * https://github.com/facebook/zstd/blob/master/doc/zstd_compression_format.md#default-distributions
+ * They were generated programmatically with the following method :
+ * - start from the default distributions, present in /lib/common/zstd_internal.h
+ * - generate the tables normally, using ZSTD_buildFSETable()
+ * - print out the content of the tables
+ * - prettify the output, reported below, and test with a fuzzer to ensure correctness */
+
+/* Default FSE distribution table for Literal Lengths */
+static const ZSTD_seqSymbol LL_defaultDTable[(1<<LL_DEFAULTNORMLOG)+1] = {
+ { 1, 1, 1, LL_DEFAULTNORMLOG}, /* header : fastMode, tableLog */
+ /* nextState, nbAddBits, nbBits, baseVal */
+ { 0, 0, 4, 0}, { 16, 0, 4, 0},
+ { 32, 0, 5, 1}, { 0, 0, 5, 3},
+ { 0, 0, 5, 4}, { 0, 0, 5, 6},
+ { 0, 0, 5, 7}, { 0, 0, 5, 9},
+ { 0, 0, 5, 10}, { 0, 0, 5, 12},
+ { 0, 0, 6, 14}, { 0, 1, 5, 16},
+ { 0, 1, 5, 20}, { 0, 1, 5, 22},
+ { 0, 2, 5, 28}, { 0, 3, 5, 32},
+ { 0, 4, 5, 48}, { 32, 6, 5, 64},
+ { 0, 7, 5, 128}, { 0, 8, 6, 256},
+ { 0, 10, 6, 1024}, { 0, 12, 6, 4096},
+ { 32, 0, 4, 0}, { 0, 0, 4, 1},
+ { 0, 0, 5, 2}, { 32, 0, 5, 4},
+ { 0, 0, 5, 5}, { 32, 0, 5, 7},
+ { 0, 0, 5, 8}, { 32, 0, 5, 10},
+ { 0, 0, 5, 11}, { 0, 0, 6, 13},
+ { 32, 1, 5, 16}, { 0, 1, 5, 18},
+ { 32, 1, 5, 22}, { 0, 2, 5, 24},
+ { 32, 3, 5, 32}, { 0, 3, 5, 40},
+ { 0, 6, 4, 64}, { 16, 6, 4, 64},
+ { 32, 7, 5, 128}, { 0, 9, 6, 512},
+ { 0, 11, 6, 2048}, { 48, 0, 4, 0},
+ { 16, 0, 4, 1}, { 32, 0, 5, 2},
+ { 32, 0, 5, 3}, { 32, 0, 5, 5},
+ { 32, 0, 5, 6}, { 32, 0, 5, 8},
+ { 32, 0, 5, 9}, { 32, 0, 5, 11},
+ { 32, 0, 5, 12}, { 0, 0, 6, 15},
+ { 32, 1, 5, 18}, { 32, 1, 5, 20},
+ { 32, 2, 5, 24}, { 32, 2, 5, 28},
+ { 32, 3, 5, 40}, { 32, 4, 5, 48},
+ { 0, 16, 6,65536}, { 0, 15, 6,32768},
+ { 0, 14, 6,16384}, { 0, 13, 6, 8192},
+}; /* LL_defaultDTable */
+
+/* Default FSE distribution table for Offset Codes */
+static const ZSTD_seqSymbol OF_defaultDTable[(1<<OF_DEFAULTNORMLOG)+1] = {
+ { 1, 1, 1, OF_DEFAULTNORMLOG}, /* header : fastMode, tableLog */
+ /* nextState, nbAddBits, nbBits, baseVal */
+ { 0, 0, 5, 0}, { 0, 6, 4, 61},
+ { 0, 9, 5, 509}, { 0, 15, 5,32765},
+ { 0, 21, 5,2097149}, { 0, 3, 5, 5},
+ { 0, 7, 4, 125}, { 0, 12, 5, 4093},
+ { 0, 18, 5,262141}, { 0, 23, 5,8388605},
+ { 0, 5, 5, 29}, { 0, 8, 4, 253},
+ { 0, 14, 5,16381}, { 0, 20, 5,1048573},
+ { 0, 2, 5, 1}, { 16, 7, 4, 125},
+ { 0, 11, 5, 2045}, { 0, 17, 5,131069},
+ { 0, 22, 5,4194301}, { 0, 4, 5, 13},
+ { 16, 8, 4, 253}, { 0, 13, 5, 8189},
+ { 0, 19, 5,524285}, { 0, 1, 5, 1},
+ { 16, 6, 4, 61}, { 0, 10, 5, 1021},
+ { 0, 16, 5,65533}, { 0, 28, 5,268435453},
+ { 0, 27, 5,134217725}, { 0, 26, 5,67108861},
+ { 0, 25, 5,33554429}, { 0, 24, 5,16777213},
+}; /* OF_defaultDTable */
+
+
+/* Default FSE distribution table for Match Lengths */
+static const ZSTD_seqSymbol ML_defaultDTable[(1<<ML_DEFAULTNORMLOG)+1] = {
+ { 1, 1, 1, ML_DEFAULTNORMLOG}, /* header : fastMode, tableLog */
+ /* nextState, nbAddBits, nbBits, baseVal */
+ { 0, 0, 6, 3}, { 0, 0, 4, 4},
+ { 32, 0, 5, 5}, { 0, 0, 5, 6},
+ { 0, 0, 5, 8}, { 0, 0, 5, 9},
+ { 0, 0, 5, 11}, { 0, 0, 6, 13},
+ { 0, 0, 6, 16}, { 0, 0, 6, 19},
+ { 0, 0, 6, 22}, { 0, 0, 6, 25},
+ { 0, 0, 6, 28}, { 0, 0, 6, 31},
+ { 0, 0, 6, 34}, { 0, 1, 6, 37},
+ { 0, 1, 6, 41}, { 0, 2, 6, 47},
+ { 0, 3, 6, 59}, { 0, 4, 6, 83},
+ { 0, 7, 6, 131}, { 0, 9, 6, 515},
+ { 16, 0, 4, 4}, { 0, 0, 4, 5},
+ { 32, 0, 5, 6}, { 0, 0, 5, 7},
+ { 32, 0, 5, 9}, { 0, 0, 5, 10},
+ { 0, 0, 6, 12}, { 0, 0, 6, 15},
+ { 0, 0, 6, 18}, { 0, 0, 6, 21},
+ { 0, 0, 6, 24}, { 0, 0, 6, 27},
+ { 0, 0, 6, 30}, { 0, 0, 6, 33},
+ { 0, 1, 6, 35}, { 0, 1, 6, 39},
+ { 0, 2, 6, 43}, { 0, 3, 6, 51},
+ { 0, 4, 6, 67}, { 0, 5, 6, 99},
+ { 0, 8, 6, 259}, { 32, 0, 4, 4},
+ { 48, 0, 4, 4}, { 16, 0, 4, 5},
+ { 32, 0, 5, 7}, { 32, 0, 5, 8},
+ { 32, 0, 5, 10}, { 32, 0, 5, 11},
+ { 0, 0, 6, 14}, { 0, 0, 6, 17},
+ { 0, 0, 6, 20}, { 0, 0, 6, 23},
+ { 0, 0, 6, 26}, { 0, 0, 6, 29},
+ { 0, 0, 6, 32}, { 0, 16, 6,65539},
+ { 0, 15, 6,32771}, { 0, 14, 6,16387},
+ { 0, 13, 6, 8195}, { 0, 12, 6, 4099},
+ { 0, 11, 6, 2051}, { 0, 10, 6, 1027},
+}; /* ML_defaultDTable */
+
+
+static void ZSTD_buildSeqTable_rle(ZSTD_seqSymbol* dt, U32 baseValue, U32 nbAddBits)
+{
+ void* ptr = dt;
+ ZSTD_seqSymbol_header* const DTableH = (ZSTD_seqSymbol_header*)ptr;
+ ZSTD_seqSymbol* const cell = dt + 1;
+
+ DTableH->tableLog = 0;
+ DTableH->fastMode = 0;
+
+ cell->nbBits = 0;
+ cell->nextState = 0;
+ assert(nbAddBits < 255);
+ cell->nbAdditionalBits = (BYTE)nbAddBits;
+ cell->baseValue = baseValue;
+}
+
+
+/* ZSTD_buildFSETable() :
+ * generate FSE decoding table for one symbol (ll, ml or off)
+ * cannot fail if input is valid =>
+ * all inputs are presumed validated at this stage */
+void
+ZSTD_buildFSETable(ZSTD_seqSymbol* dt,
+ const short* normalizedCounter, unsigned maxSymbolValue,
+ const U32* baseValue, const U32* nbAdditionalBits,
+ unsigned tableLog)
+{
+ ZSTD_seqSymbol* const tableDecode = dt+1;
+ U16 symbolNext[MaxSeq+1];
+
+ U32 const maxSV1 = maxSymbolValue + 1;
+ U32 const tableSize = 1 << tableLog;
+ U32 highThreshold = tableSize-1;
+
+ /* Sanity Checks */
+ assert(maxSymbolValue <= MaxSeq);
+ assert(tableLog <= MaxFSELog);
+
+ /* Init, lay down lowprob symbols */
+ { ZSTD_seqSymbol_header DTableH;
+ DTableH.tableLog = tableLog;
+ DTableH.fastMode = 1;
+ { S16 const largeLimit= (S16)(1 << (tableLog-1));
+ U32 s;
+ for (s=0; s<maxSV1; s++) {
+ if (normalizedCounter[s]==-1) {
+ tableDecode[highThreshold--].baseValue = s;
+ symbolNext[s] = 1;
+ } else {
+ if (normalizedCounter[s] >= largeLimit) DTableH.fastMode=0;
+ assert(normalizedCounter[s]>=0);
+ symbolNext[s] = (U16)normalizedCounter[s];
+ } } }
+ memcpy(dt, &DTableH, sizeof(DTableH));
+ }
+
+ /* Spread symbols */
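+    /* Each symbol is placed normalizedCounter[s] times, stepping through the table with
+     * a fixed stride (FSE_TABLESTEP) ; cells above highThreshold were already assigned
+     * to low-probability symbols during init and are skipped here. */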
+ { U32 const tableMask = tableSize-1;
+ U32 const step = FSE_TABLESTEP(tableSize);
+ U32 s, position = 0;
+ for (s=0; s<maxSV1; s++) {
+ int i;
+ for (i=0; i<normalizedCounter[s]; i++) {
+ tableDecode[position].baseValue = s;
+ position = (position + step) & tableMask;
+ while (position > highThreshold) position = (position + step) & tableMask; /* lowprob area */
+ } }
+ assert(position == 0); /* position must reach all cells once, otherwise normalizedCounter is incorrect */
+ }
+
+ /* Build Decoding table */
+ { U32 u;
+ for (u=0; u<tableSize; u++) {
+ U32 const symbol = tableDecode[u].baseValue;
+ U32 const nextState = symbolNext[symbol]++;
+ tableDecode[u].nbBits = (BYTE) (tableLog - BIT_highbit32(nextState) );
+ tableDecode[u].nextState = (U16) ( (nextState << tableDecode[u].nbBits) - tableSize);
+ assert(nbAdditionalBits[symbol] < 255);
+ tableDecode[u].nbAdditionalBits = (BYTE)nbAdditionalBits[symbol];
+ tableDecode[u].baseValue = baseValue[symbol];
+ } }
+}
+
+
+/*! ZSTD_buildSeqTable() :
+ * @return : nb bytes read from src,
+ * or an error code if it fails */
+static size_t ZSTD_buildSeqTable(ZSTD_seqSymbol* DTableSpace, const ZSTD_seqSymbol** DTablePtr,
+ symbolEncodingType_e type, unsigned max, U32 maxLog,
+ const void* src, size_t srcSize,
+ const U32* baseValue, const U32* nbAdditionalBits,
+ const ZSTD_seqSymbol* defaultTable, U32 flagRepeatTable,
+ int ddictIsCold, int nbSeq)
+{
+ switch(type)
+ {
+ case set_rle :
+ RETURN_ERROR_IF(!srcSize, srcSize_wrong, "");
+ RETURN_ERROR_IF((*(const BYTE*)src) > max, corruption_detected, "");
+ { U32 const symbol = *(const BYTE*)src;
+ U32 const baseline = baseValue[symbol];
+ U32 const nbBits = nbAdditionalBits[symbol];
+ ZSTD_buildSeqTable_rle(DTableSpace, baseline, nbBits);
+ }
+ *DTablePtr = DTableSpace;
+ return 1;
+ case set_basic :
+ *DTablePtr = defaultTable;
+ return 0;
+ case set_repeat:
+ RETURN_ERROR_IF(!flagRepeatTable, corruption_detected, "");
+ /* prefetch FSE table if used */
+ if (ddictIsCold && (nbSeq > 24 /* heuristic */)) {
+ const void* const pStart = *DTablePtr;
+ size_t const pSize = sizeof(ZSTD_seqSymbol) * (SEQSYMBOL_TABLE_SIZE(maxLog));
+ PREFETCH_AREA(pStart, pSize);
+ }
+ return 0;
+ case set_compressed :
+ { unsigned tableLog;
+ S16 norm[MaxSeq+1];
+ size_t const headerSize = FSE_readNCount(norm, &max, &tableLog, src, srcSize);
+ RETURN_ERROR_IF(FSE_isError(headerSize), corruption_detected, "");
+ RETURN_ERROR_IF(tableLog > maxLog, corruption_detected, "");
+ ZSTD_buildFSETable(DTableSpace, norm, max, baseValue, nbAdditionalBits, tableLog);
+ *DTablePtr = DTableSpace;
+ return headerSize;
+ }
+ default :
+ assert(0);
+ RETURN_ERROR(GENERIC, "impossible");
+ }
+}
+
+size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,
+ const void* src, size_t srcSize)
+{
+ const BYTE* const istart = (const BYTE* const)src;
+ const BYTE* const iend = istart + srcSize;
+ const BYTE* ip = istart;
+ int nbSeq;
+ DEBUGLOG(5, "ZSTD_decodeSeqHeaders");
+
+ /* check */
+ RETURN_ERROR_IF(srcSize < MIN_SEQUENCES_SIZE, srcSize_wrong, "");
+
+ /* SeqHead */
+ nbSeq = *ip++;
+ if (!nbSeq) {
+ *nbSeqPtr=0;
+ RETURN_ERROR_IF(srcSize != 1, srcSize_wrong, "");
+ return 1;
+ }
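+    /* nbSeq is stored on 1, 2 or 3 bytes : a first byte < 0x80 is the count itself,
+     * 0x80-0xFE introduces a 2-byte form, and the escape value 0xFF is followed by a
+     * little-endian 16-bit count offset by LONGNBSEQ. */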
+ if (nbSeq > 0x7F) {
+ if (nbSeq == 0xFF) {
+ RETURN_ERROR_IF(ip+2 > iend, srcSize_wrong, "");
+ nbSeq = MEM_readLE16(ip) + LONGNBSEQ, ip+=2;
+ } else {
+ RETURN_ERROR_IF(ip >= iend, srcSize_wrong, "");
+ nbSeq = ((nbSeq-0x80)<<8) + *ip++;
+ }
+ }
+ *nbSeqPtr = nbSeq;
+
+ /* FSE table descriptors */
+ RETURN_ERROR_IF(ip+1 > iend, srcSize_wrong, ""); /* minimum possible size: 1 byte for symbol encoding types */
+ { symbolEncodingType_e const LLtype = (symbolEncodingType_e)(*ip >> 6);
+ symbolEncodingType_e const OFtype = (symbolEncodingType_e)((*ip >> 4) & 3);
+ symbolEncodingType_e const MLtype = (symbolEncodingType_e)((*ip >> 2) & 3);
+ ip++;
+
+ /* Build DTables */
+ { size_t const llhSize = ZSTD_buildSeqTable(dctx->entropy.LLTable, &dctx->LLTptr,
+ LLtype, MaxLL, LLFSELog,
+ ip, iend-ip,
+ LL_base, LL_bits,
+ LL_defaultDTable, dctx->fseEntropy,
+ dctx->ddictIsCold, nbSeq);
+ RETURN_ERROR_IF(ZSTD_isError(llhSize), corruption_detected, "ZSTD_buildSeqTable failed");
+ ip += llhSize;
+ }
+
+ { size_t const ofhSize = ZSTD_buildSeqTable(dctx->entropy.OFTable, &dctx->OFTptr,
+ OFtype, MaxOff, OffFSELog,
+ ip, iend-ip,
+ OF_base, OF_bits,
+ OF_defaultDTable, dctx->fseEntropy,
+ dctx->ddictIsCold, nbSeq);
+ RETURN_ERROR_IF(ZSTD_isError(ofhSize), corruption_detected, "ZSTD_buildSeqTable failed");
+ ip += ofhSize;
+ }
+
+ { size_t const mlhSize = ZSTD_buildSeqTable(dctx->entropy.MLTable, &dctx->MLTptr,
+ MLtype, MaxML, MLFSELog,
+ ip, iend-ip,
+ ML_base, ML_bits,
+ ML_defaultDTable, dctx->fseEntropy,
+ dctx->ddictIsCold, nbSeq);
+ RETURN_ERROR_IF(ZSTD_isError(mlhSize), corruption_detected, "ZSTD_buildSeqTable failed");
+ ip += mlhSize;
+ }
+ }
+
+ return ip-istart;
+}
+
+
+typedef struct {
+ size_t litLength;
+ size_t matchLength;
+ size_t offset;
+ const BYTE* match;
+} seq_t;
+
+typedef struct {
+ size_t state;
+ const ZSTD_seqSymbol* table;
+} ZSTD_fseState;
+
+typedef struct {
+ BIT_DStream_t DStream;
+ ZSTD_fseState stateLL;
+ ZSTD_fseState stateOffb;
+ ZSTD_fseState stateML;
+ size_t prevOffset[ZSTD_REP_NUM];
+ const BYTE* prefixStart;
+ const BYTE* dictEnd;
+ size_t pos;
+} seqState_t;
+
+/*! ZSTD_overlapCopy8() :
+ * Copies 8 bytes from ip to op and updates op and ip where ip <= op.
+ * If the offset is < 8 then the offset is spread to at least 8 bytes.
+ *
+ * Precondition: *ip <= *op
+ * Postcondition: *op - *ip >= 8
+ */
+HINT_INLINE void ZSTD_overlapCopy8(BYTE** op, BYTE const** ip, size_t offset) {
+ assert(*ip <= *op);
+ if (offset < 8) {
+ /* close range match, overlap */
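+        /* For offsets < 8 the match pattern repeats with a short period : copy the first
+         * 4 bytes one at a time, copy 4 more after advancing ip (dec32table), then rewind
+         * ip (dec64table) so the distance op - ip becomes >= 8 and later copies can
+         * proceed 8 bytes at a time. */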
+ static const U32 dec32table[] = { 0, 1, 2, 1, 4, 4, 4, 4 }; /* added */
+ static const int dec64table[] = { 8, 8, 8, 7, 8, 9,10,11 }; /* subtracted */
+ int const sub2 = dec64table[offset];
+ (*op)[0] = (*ip)[0];
+ (*op)[1] = (*ip)[1];
+ (*op)[2] = (*ip)[2];
+ (*op)[3] = (*ip)[3];
+ *ip += dec32table[offset];
+ ZSTD_copy4(*op+4, *ip);
+ *ip -= sub2;
+ } else {
+ ZSTD_copy8(*op, *ip);
+ }
+ *ip += 8;
+ *op += 8;
+ assert(*op - *ip >= 8);
+}
+
+/*! ZSTD_safecopy() :
+ * Specialized version of memcpy() that is allowed to READ up to WILDCOPY_OVERLENGTH past the input buffer
+ * and write up to 16 bytes past oend_w (op >= oend_w is allowed).
+ * This function is only called in the uncommon case where the sequence is near the end of the block. It
+ * should be fast for a single long sequence, but can be slow for several short sequences.
+ *
+ * @param ovtype controls the overlap detection
+ * - ZSTD_no_overlap: The source and destination are guaranteed to be at least WILDCOPY_VECLEN bytes apart.
+ * - ZSTD_overlap_src_before_dst: The src and dst may overlap and may be any distance apart.
+ * The src buffer must be before the dst buffer.
+ */
+static void ZSTD_safecopy(BYTE* op, BYTE* const oend_w, BYTE const* ip, ptrdiff_t length, ZSTD_overlap_e ovtype) {
+ ptrdiff_t const diff = op - ip;
+ BYTE* const oend = op + length;
+
+ assert((ovtype == ZSTD_no_overlap && (diff <= -8 || diff >= 8 || op >= oend_w)) ||
+ (ovtype == ZSTD_overlap_src_before_dst && diff >= 0));
+
+ if (length < 8) {
+ /* Handle short lengths. */
+ while (op < oend) *op++ = *ip++;
+ return;
+ }
+ if (ovtype == ZSTD_overlap_src_before_dst) {
+ /* Copy 8 bytes and ensure the offset >= 8 when there can be overlap. */
+ assert(length >= 8);
+ ZSTD_overlapCopy8(&op, &ip, diff);
+ assert(op - ip >= 8);
+ assert(op <= oend);
+ }
+
+ if (oend <= oend_w) {
+ /* No risk of overwrite. */
+ ZSTD_wildcopy(op, ip, length, ovtype);
+ return;
+ }
+ if (op <= oend_w) {
+ /* Wildcopy until we get close to the end. */
+ assert(oend > oend_w);
+ ZSTD_wildcopy(op, ip, oend_w - op, ovtype);
+ ip += oend_w - op;
+ op = oend_w;
+ }
+ /* Handle the leftovers. */
+ while (op < oend) *op++ = *ip++;
+}
+
+/* ZSTD_execSequenceEnd():
+ * This version handles cases that are near the end of the output buffer. It requires
+ * more careful checks to make sure there is no overflow. By separating out these hard
+ * and unlikely cases, we can speed up the common cases.
+ *
+ * NOTE: This function needs to be fast for a single long sequence, but doesn't need
+ * to be optimized for many small sequences, since those fall into ZSTD_execSequence().
+ */
+FORCE_NOINLINE
+size_t ZSTD_execSequenceEnd(BYTE* op,
+ BYTE* const oend, seq_t sequence,
+ const BYTE** litPtr, const BYTE* const litLimit,
+ const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd)
+{
+ BYTE* const oLitEnd = op + sequence.litLength;
+ size_t const sequenceLength = sequence.litLength + sequence.matchLength;
+ const BYTE* const iLitEnd = *litPtr + sequence.litLength;
+ const BYTE* match = oLitEnd - sequence.offset;
+ BYTE* const oend_w = oend - WILDCOPY_OVERLENGTH;
+
+ /* bounds checks : careful of address space overflow in 32-bit mode */
+ RETURN_ERROR_IF(sequenceLength > (size_t)(oend - op), dstSize_tooSmall, "last match must fit within dstBuffer");
+ RETURN_ERROR_IF(sequence.litLength > (size_t)(litLimit - *litPtr), corruption_detected, "try to read beyond literal buffer");
+ assert(op < op + sequenceLength);
+ assert(oLitEnd < op + sequenceLength);
+
+ /* copy literals */
+ ZSTD_safecopy(op, oend_w, *litPtr, sequence.litLength, ZSTD_no_overlap);
+ op = oLitEnd;
+ *litPtr = iLitEnd;
+
+ /* copy Match */
+ if (sequence.offset > (size_t)(oLitEnd - prefixStart)) {
+ /* offset beyond prefix */
+ RETURN_ERROR_IF(sequence.offset > (size_t)(oLitEnd - virtualStart), corruption_detected, "");
+ match = dictEnd - (prefixStart-match);
+ if (match + sequence.matchLength <= dictEnd) {
+ memmove(oLitEnd, match, sequence.matchLength);
+ return sequenceLength;
+ }
+ /* span extDict & currentPrefixSegment */
+ { size_t const length1 = dictEnd - match;
+ memmove(oLitEnd, match, length1);
+ op = oLitEnd + length1;
+ sequence.matchLength -= length1;
+ match = prefixStart;
+ } }
+ ZSTD_safecopy(op, oend_w, match, sequence.matchLength, ZSTD_overlap_src_before_dst);
+ return sequenceLength;
+}
+
+HINT_INLINE
+size_t ZSTD_execSequence(BYTE* op,
+ BYTE* const oend, seq_t sequence,
+ const BYTE** litPtr, const BYTE* const litLimit,
+ const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd)
+{
+ BYTE* const oLitEnd = op + sequence.litLength;
+ size_t const sequenceLength = sequence.litLength + sequence.matchLength;
+ BYTE* const oMatchEnd = op + sequenceLength; /* risk : address space overflow (32-bits) */
+ BYTE* const oend_w = oend - WILDCOPY_OVERLENGTH; /* risk : address space underflow on oend=NULL */
+ const BYTE* const iLitEnd = *litPtr + sequence.litLength;
+ const BYTE* match = oLitEnd - sequence.offset;
+
+ assert(op != NULL /* Precondition */);
+ assert(oend_w < oend /* No underflow */);
+ /* Handle edge cases in a slow path:
+ * - Read beyond end of literals
+     * - Match end is within WILDCOPY_OVERLENGTH of oend
+ * - 32-bit mode and the match length overflows
+ */
+ if (UNLIKELY(
+ iLitEnd > litLimit ||
+ oMatchEnd > oend_w ||
+ (MEM_32bits() && (size_t)(oend - op) < sequenceLength + WILDCOPY_OVERLENGTH)))
+ return ZSTD_execSequenceEnd(op, oend, sequence, litPtr, litLimit, prefixStart, virtualStart, dictEnd);
+
+ /* Assumptions (everything else goes into ZSTD_execSequenceEnd()) */
+ assert(op <= oLitEnd /* No overflow */);
+ assert(oLitEnd < oMatchEnd /* Non-zero match & no overflow */);
+ assert(oMatchEnd <= oend /* No underflow */);
+ assert(iLitEnd <= litLimit /* Literal length is in bounds */);
+ assert(oLitEnd <= oend_w /* Can wildcopy literals */);
+ assert(oMatchEnd <= oend_w /* Can wildcopy matches */);
+
+ /* Copy Literals:
+ * Split out litLength <= 16 since it is nearly always true. +1.6% on gcc-9.
+ * We likely don't need the full 32-byte wildcopy.
+ */
+ assert(WILDCOPY_OVERLENGTH >= 16);
+ ZSTD_copy16(op, (*litPtr));
+ if (UNLIKELY(sequence.litLength > 16)) {
+ ZSTD_wildcopy(op+16, (*litPtr)+16, sequence.litLength-16, ZSTD_no_overlap);
+ }
+ op = oLitEnd;
+ *litPtr = iLitEnd; /* update for next sequence */
+
+ /* Copy Match */
+ if (sequence.offset > (size_t)(oLitEnd - prefixStart)) {
+ /* offset beyond prefix -> go into extDict */
+ RETURN_ERROR_IF(UNLIKELY(sequence.offset > (size_t)(oLitEnd - virtualStart)), corruption_detected, "");
+ match = dictEnd + (match - prefixStart);
+ if (match + sequence.matchLength <= dictEnd) {
+ memmove(oLitEnd, match, sequence.matchLength);
+ return sequenceLength;
+ }
+ /* span extDict & currentPrefixSegment */
+ { size_t const length1 = dictEnd - match;
+ memmove(oLitEnd, match, length1);
+ op = oLitEnd + length1;
+ sequence.matchLength -= length1;
+ match = prefixStart;
+ } }
+ /* Match within prefix of 1 or more bytes */
+ assert(op <= oMatchEnd);
+ assert(oMatchEnd <= oend_w);
+ assert(match >= prefixStart);
+ assert(sequence.matchLength >= 1);
+
+ /* Nearly all offsets are >= WILDCOPY_VECLEN bytes, which means we can use wildcopy
+ * without overlap checking.
+ */
+ if (LIKELY(sequence.offset >= WILDCOPY_VECLEN)) {
+ /* We bet on a full wildcopy for matches, since we expect matches to be
+ * longer than literals (in general). In silesia, ~10% of matches are longer
+ * than 16 bytes.
+ */
+ ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength, ZSTD_no_overlap);
+ return sequenceLength;
+ }
+ assert(sequence.offset < WILDCOPY_VECLEN);
+
+ /* Copy 8 bytes and spread the offset to be >= 8. */
+ ZSTD_overlapCopy8(&op, &match, sequence.offset);
+
+ /* If the match length is > 8 bytes, then continue with the wildcopy. */
+ if (sequence.matchLength > 8) {
+ assert(op < oMatchEnd);
+ ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength-8, ZSTD_overlap_src_before_dst);
+ }
+ return sequenceLength;
+}
+
+static void
+ZSTD_initFseState(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, const ZSTD_seqSymbol* dt)
+{
+ const void* ptr = dt;
+ const ZSTD_seqSymbol_header* const DTableH = (const ZSTD_seqSymbol_header*)ptr;
+ DStatePtr->state = BIT_readBits(bitD, DTableH->tableLog);
+ DEBUGLOG(6, "ZSTD_initFseState : val=%u using %u bits",
+ (U32)DStatePtr->state, DTableH->tableLog);
+ BIT_reloadDStream(bitD);
+ DStatePtr->table = dt + 1;
+}
+
+FORCE_INLINE_TEMPLATE void
+ZSTD_updateFseState(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD)
+{
+ ZSTD_seqSymbol const DInfo = DStatePtr->table[DStatePtr->state];
+ U32 const nbBits = DInfo.nbBits;
+ size_t const lowBits = BIT_readBits(bitD, nbBits);
+ DStatePtr->state = DInfo.nextState + lowBits;
+}
+
+FORCE_INLINE_TEMPLATE void
+ZSTD_updateFseStateWithDInfo(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, ZSTD_seqSymbol const DInfo)
+{
+ U32 const nbBits = DInfo.nbBits;
+ size_t const lowBits = BIT_readBits(bitD, nbBits);
+ DStatePtr->state = DInfo.nextState + lowBits;
+}
+
+/* We need to add at most (ZSTD_WINDOWLOG_MAX_32 - 1) bits to read the maximum
+ * offset bits. But we can only read at most (STREAM_ACCUMULATOR_MIN_32 - 1)
+ * bits before reloading. This value is the maximum number of bits we read
+ * after reloading when we are decoding long offsets.
+ */
+#define LONG_OFFSETS_MAX_EXTRA_BITS_32 \
+ (ZSTD_WINDOWLOG_MAX_32 > STREAM_ACCUMULATOR_MIN_32 \
+ ? ZSTD_WINDOWLOG_MAX_32 - STREAM_ACCUMULATOR_MIN_32 \
+ : 0)
+
+typedef enum { ZSTD_lo_isRegularOffset, ZSTD_lo_isLongOffset=1 } ZSTD_longOffset_e;
+typedef enum { ZSTD_p_noPrefetch=0, ZSTD_p_prefetch=1 } ZSTD_prefetch_e;
+
+FORCE_INLINE_TEMPLATE seq_t
+ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets, const ZSTD_prefetch_e prefetch)
+{
+ seq_t seq;
+ ZSTD_seqSymbol const llDInfo = seqState->stateLL.table[seqState->stateLL.state];
+ ZSTD_seqSymbol const mlDInfo = seqState->stateML.table[seqState->stateML.state];
+ ZSTD_seqSymbol const ofDInfo = seqState->stateOffb.table[seqState->stateOffb.state];
+ U32 const llBase = llDInfo.baseValue;
+ U32 const mlBase = mlDInfo.baseValue;
+ U32 const ofBase = ofDInfo.baseValue;
+ BYTE const llBits = llDInfo.nbAdditionalBits;
+ BYTE const mlBits = mlDInfo.nbAdditionalBits;
+ BYTE const ofBits = ofDInfo.nbAdditionalBits;
+ BYTE const totalBits = llBits+mlBits+ofBits;
+
+ /* sequence */
+ { size_t offset;
+ if (ofBits > 1) {
+ ZSTD_STATIC_ASSERT(ZSTD_lo_isLongOffset == 1);
+ ZSTD_STATIC_ASSERT(LONG_OFFSETS_MAX_EXTRA_BITS_32 == 5);
+ assert(ofBits <= MaxOff);
+ if (MEM_32bits() && longOffsets && (ofBits >= STREAM_ACCUMULATOR_MIN_32)) {
+ U32 const extraBits = ofBits - MIN(ofBits, 32 - seqState->DStream.bitsConsumed);
+ offset = ofBase + (BIT_readBitsFast(&seqState->DStream, ofBits - extraBits) << extraBits);
+ BIT_reloadDStream(&seqState->DStream);
+ if (extraBits) offset += BIT_readBitsFast(&seqState->DStream, extraBits);
+ assert(extraBits <= LONG_OFFSETS_MAX_EXTRA_BITS_32); /* to avoid another reload */
+ } else {
+ offset = ofBase + BIT_readBitsFast(&seqState->DStream, ofBits/*>0*/); /* <= (ZSTD_WINDOWLOG_MAX-1) bits */
+ if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream);
+ }
+ seqState->prevOffset[2] = seqState->prevOffset[1];
+ seqState->prevOffset[1] = seqState->prevOffset[0];
+ seqState->prevOffset[0] = offset;
+ } else {
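+            /* Small offset codes reference the repeat-offset history (prevOffset[]) rather
+             * than an explicit distance : a literal length of 0 (ll0) shifts which previous
+             * offset is selected, and the value 3 maps to prevOffset[0] - 1. */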
+ U32 const ll0 = (llBase == 0);
+ if (LIKELY((ofBits == 0))) {
+ if (LIKELY(!ll0))
+ offset = seqState->prevOffset[0];
+ else {
+ offset = seqState->prevOffset[1];
+ seqState->prevOffset[1] = seqState->prevOffset[0];
+ seqState->prevOffset[0] = offset;
+ }
+ } else {
+ offset = ofBase + ll0 + BIT_readBitsFast(&seqState->DStream, 1);
+ { size_t temp = (offset==3) ? seqState->prevOffset[0] - 1 : seqState->prevOffset[offset];
+ temp += !temp; /* 0 is not valid; input is corrupted; force offset to 1 */
+ if (offset != 1) seqState->prevOffset[2] = seqState->prevOffset[1];
+ seqState->prevOffset[1] = seqState->prevOffset[0];
+ seqState->prevOffset[0] = offset = temp;
+ } } }
+ seq.offset = offset;
+ }
+
+ seq.matchLength = mlBase;
+ if (mlBits > 0)
+ seq.matchLength += BIT_readBitsFast(&seqState->DStream, mlBits/*>0*/);
+
+ if (MEM_32bits() && (mlBits+llBits >= STREAM_ACCUMULATOR_MIN_32-LONG_OFFSETS_MAX_EXTRA_BITS_32))
+ BIT_reloadDStream(&seqState->DStream);
+ if (MEM_64bits() && UNLIKELY(totalBits >= STREAM_ACCUMULATOR_MIN_64-(LLFSELog+MLFSELog+OffFSELog)))
+ BIT_reloadDStream(&seqState->DStream);
+ /* Ensure there are enough bits to read the rest of data in 64-bit mode. */
+ ZSTD_STATIC_ASSERT(16+LLFSELog+MLFSELog+OffFSELog < STREAM_ACCUMULATOR_MIN_64);
+
+ seq.litLength = llBase;
+ if (llBits > 0)
+ seq.litLength += BIT_readBitsFast(&seqState->DStream, llBits/*>0*/);
+
+ if (MEM_32bits())
+ BIT_reloadDStream(&seqState->DStream);
+
+ DEBUGLOG(6, "seq: litL=%u, matchL=%u, offset=%u",
+ (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset);
+
+ if (prefetch == ZSTD_p_prefetch) {
+ size_t const pos = seqState->pos + seq.litLength;
+ const BYTE* const matchBase = (seq.offset > pos) ? seqState->dictEnd : seqState->prefixStart;
+ seq.match = matchBase + pos - seq.offset; /* note : this operation can overflow when seq.offset is really too large, which can only happen when input is corrupted.
+ * No consequence though : no memory access will occur, offset is only used for prefetching */
+ seqState->pos = pos + seq.matchLength;
+ }
+
+ /* ANS state update
+ * gcc-9.0.0 does 2.5% worse with ZSTD_updateFseStateWithDInfo().
+ * clang-9.2.0 does 7% worse with ZSTD_updateFseState().
+ * Naturally it seems like ZSTD_updateFseStateWithDInfo() should be the
+ * better option, so it is the default for other compilers. But, if you
+ * measure that it is worse, please put up a pull request.
+ */
+ {
+#if defined(__GNUC__) && !defined(__clang__)
+ const int kUseUpdateFseState = 1;
+#else
+ const int kUseUpdateFseState = 0;
+#endif
+ if (kUseUpdateFseState) {
+ ZSTD_updateFseState(&seqState->stateLL, &seqState->DStream); /* <= 9 bits */
+ ZSTD_updateFseState(&seqState->stateML, &seqState->DStream); /* <= 9 bits */
+ if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); /* <= 18 bits */
+ ZSTD_updateFseState(&seqState->stateOffb, &seqState->DStream); /* <= 8 bits */
+ } else {
+ ZSTD_updateFseStateWithDInfo(&seqState->stateLL, &seqState->DStream, llDInfo); /* <= 9 bits */
+ ZSTD_updateFseStateWithDInfo(&seqState->stateML, &seqState->DStream, mlDInfo); /* <= 9 bits */
+ if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); /* <= 18 bits */
+ ZSTD_updateFseStateWithDInfo(&seqState->stateOffb, &seqState->DStream, ofDInfo); /* <= 8 bits */
+ }
+ }
+
+ return seq;
+}
+
+#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
+static int ZSTD_dictionaryIsActive(ZSTD_DCtx const* dctx, BYTE const* prefixStart, BYTE const* oLitEnd)
+{
+ size_t const windowSize = dctx->fParams.windowSize;
+ /* No dictionary used. */
+ if (dctx->dictContentEndForFuzzing == NULL) return 0;
+ /* Dictionary is our prefix. */
+ if (prefixStart == dctx->dictContentBeginForFuzzing) return 1;
+ /* Dictionary is not our ext-dict. */
+ if (dctx->dictEnd != dctx->dictContentEndForFuzzing) return 0;
+ /* Dictionary is not within our window size. */
+ if ((size_t)(oLitEnd - prefixStart) >= windowSize) return 0;
+ /* Dictionary is active. */
+ return 1;
+}
+
+MEM_STATIC void ZSTD_assertValidSequence(
+ ZSTD_DCtx const* dctx,
+ BYTE const* op, BYTE const* oend,
+ seq_t const seq,
+ BYTE const* prefixStart, BYTE const* virtualStart)
+{
+ size_t const windowSize = dctx->fParams.windowSize;
+ size_t const sequenceSize = seq.litLength + seq.matchLength;
+ BYTE const* const oLitEnd = op + seq.litLength;
+ DEBUGLOG(6, "Checking sequence: litL=%u matchL=%u offset=%u",
+ (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset);
+ assert(op <= oend);
+ assert((size_t)(oend - op) >= sequenceSize);
+ assert(sequenceSize <= ZSTD_BLOCKSIZE_MAX);
+ if (ZSTD_dictionaryIsActive(dctx, prefixStart, oLitEnd)) {
+ size_t const dictSize = (size_t)((char const*)dctx->dictContentEndForFuzzing - (char const*)dctx->dictContentBeginForFuzzing);
+ /* Offset must be within the dictionary. */
+ assert(seq.offset <= (size_t)(oLitEnd - virtualStart));
+ assert(seq.offset <= windowSize + dictSize);
+ } else {
+ /* Offset must be within our window. */
+ assert(seq.offset <= windowSize);
+ }
+}
+#endif
+
+#ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG
+FORCE_INLINE_TEMPLATE size_t
+DONT_VECTORIZE
+ZSTD_decompressSequences_body( ZSTD_DCtx* dctx,
+ void* dst, size_t maxDstSize,
+ const void* seqStart, size_t seqSize, int nbSeq,
+ const ZSTD_longOffset_e isLongOffset,
+ const int frame)
+{
+ const BYTE* ip = (const BYTE*)seqStart;
+ const BYTE* const iend = ip + seqSize;
+ BYTE* const ostart = (BYTE* const)dst;
+ BYTE* const oend = ostart + maxDstSize;
+ BYTE* op = ostart;
+ const BYTE* litPtr = dctx->litPtr;
+ const BYTE* const litEnd = litPtr + dctx->litSize;
+ const BYTE* const prefixStart = (const BYTE*) (dctx->prefixStart);
+ const BYTE* const vBase = (const BYTE*) (dctx->virtualStart);
+ const BYTE* const dictEnd = (const BYTE*) (dctx->dictEnd);
+ DEBUGLOG(5, "ZSTD_decompressSequences_body");
+ (void)frame;
+
+ /* Regen sequences */
+ if (nbSeq) {
+ seqState_t seqState;
+ size_t error = 0;
+ dctx->fseEntropy = 1;
+ { U32 i; for (i=0; i<ZSTD_REP_NUM; i++) seqState.prevOffset[i] = dctx->entropy.rep[i]; }
+ RETURN_ERROR_IF(
+ ERR_isError(BIT_initDStream(&seqState.DStream, ip, iend-ip)),
+ corruption_detected, "");
+ ZSTD_initFseState(&seqState.stateLL, &seqState.DStream, dctx->LLTptr);
+ ZSTD_initFseState(&seqState.stateOffb, &seqState.DStream, dctx->OFTptr);
+ ZSTD_initFseState(&seqState.stateML, &seqState.DStream, dctx->MLTptr);
+ assert(dst != NULL);
+
+ ZSTD_STATIC_ASSERT(
+ BIT_DStream_unfinished < BIT_DStream_completed &&
+ BIT_DStream_endOfBuffer < BIT_DStream_completed &&
+ BIT_DStream_completed < BIT_DStream_overflow);
+
+#if defined(__GNUC__) && defined(__x86_64__)
+ /* Align the decompression loop to 32 + 16 bytes.
+ *
+ * zstd compiled with gcc-9 on an Intel i9-9900k shows 10% decompression
+ * speed swings based on the alignment of the decompression loop. This
+ * performance swing is caused by parts of the decompression loop falling
+ * out of the DSB. The entire decompression loop should fit in the DSB,
+ * when it can't we get much worse performance. You can measure if you've
+ * hit the good case or the bad case with this perf command for some
+ * compressed file test.zst:
+ *
+ * perf stat -e cycles -e instructions -e idq.all_dsb_cycles_any_uops \
+ * -e idq.all_mite_cycles_any_uops -- ./zstd -tq test.zst
+ *
+ * If you see most cycles served out of the MITE you've hit the bad case.
+ * If you see most cycles served out of the DSB you've hit the good case.
+ * If it is pretty even then you may be in an okay case.
+ *
+ * I've been able to reproduce this issue on the following CPUs:
+ * - Kabylake: Macbook Pro (15-inch, 2019) 2.4 GHz Intel Core i9
+ * Use Instruments->Counters to get DSB/MITE cycles.
+ * I never got performance swings, but I was able to
+ * go from the good case of mostly DSB to half of the
+ * cycles served from MITE.
+ * - Coffeelake: Intel i9-9900k
+ *
+ * I haven't been able to reproduce the instability or DSB misses on any
+             * of the following CPUs:
+             * - Haswell
+             * - Broadwell: Intel(R) Xeon(R) CPU E5-2680 v4 @ 2.40GHz
+ * - Skylake
+ *
+             * If you are seeing performance instability, this script can help test.
+ * It tests on 4 commits in zstd where I saw performance change.
+ *
+ * https://gist.github.com/terrelln/9889fc06a423fd5ca6e99351564473f4
+ */
+ __asm__(".p2align 5");
+ __asm__("nop");
+ __asm__(".p2align 4");
+#endif
+ for ( ; ; ) {
+ seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset, ZSTD_p_noPrefetch);
+ size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litEnd, prefixStart, vBase, dictEnd);
+#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
+ assert(!ZSTD_isError(oneSeqSize));
+ if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase);
+#endif
+ DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize);
+ BIT_reloadDStream(&(seqState.DStream));
+ /* gcc and clang both don't like early returns in this loop.
+ * gcc doesn't like early breaks either.
+ * Instead save an error and report it at the end.
+ * When there is an error, don't increment op, so we don't
+ * overwrite.
+ */
+ if (UNLIKELY(ZSTD_isError(oneSeqSize))) error = oneSeqSize;
+ else op += oneSeqSize;
+ if (UNLIKELY(!--nbSeq)) break;
+ }
+
+ /* check if reached exact end */
+ DEBUGLOG(5, "ZSTD_decompressSequences_body: after decode loop, remaining nbSeq : %i", nbSeq);
+ if (ZSTD_isError(error)) return error;
+ RETURN_ERROR_IF(nbSeq, corruption_detected, "");
+ RETURN_ERROR_IF(BIT_reloadDStream(&seqState.DStream) < BIT_DStream_completed, corruption_detected, "");
+ /* save reps for next block */
+ { U32 i; for (i=0; i<ZSTD_REP_NUM; i++) dctx->entropy.rep[i] = (U32)(seqState.prevOffset[i]); }
+ }
+
+ /* last literal segment */
+ { size_t const lastLLSize = litEnd - litPtr;
+ RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall, "");
+ if (op != NULL) {
+ memcpy(op, litPtr, lastLLSize);
+ op += lastLLSize;
+ }
+ }
+
+ return op-ostart;
+}
+
+static size_t
+ZSTD_decompressSequences_default(ZSTD_DCtx* dctx,
+ void* dst, size_t maxDstSize,
+ const void* seqStart, size_t seqSize, int nbSeq,
+ const ZSTD_longOffset_e isLongOffset,
+ const int frame)
+{
+ return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
+}
+#endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */
+
+#ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT
+FORCE_INLINE_TEMPLATE size_t
+ZSTD_decompressSequencesLong_body(
+ ZSTD_DCtx* dctx,
+ void* dst, size_t maxDstSize,
+ const void* seqStart, size_t seqSize, int nbSeq,
+ const ZSTD_longOffset_e isLongOffset,
+ const int frame)
+{
+ const BYTE* ip = (const BYTE*)seqStart;
+ const BYTE* const iend = ip + seqSize;
+ BYTE* const ostart = (BYTE* const)dst;
+ BYTE* const oend = ostart + maxDstSize;
+ BYTE* op = ostart;
+ const BYTE* litPtr = dctx->litPtr;
+ const BYTE* const litEnd = litPtr + dctx->litSize;
+ const BYTE* const prefixStart = (const BYTE*) (dctx->prefixStart);
+ const BYTE* const dictStart = (const BYTE*) (dctx->virtualStart);
+ const BYTE* const dictEnd = (const BYTE*) (dctx->dictEnd);
+ (void)frame;
+
+ /* Regen sequences */
+ if (nbSeq) {
+#define STORED_SEQS 4
+#define STORED_SEQS_MASK (STORED_SEQS-1)
+#define ADVANCED_SEQS 4
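+        /* Sequences are decoded ADVANCED_SEQS ahead of their execution and kept in a small
+         * ring buffer, so that each match source can be prefetched into cache before it is
+         * actually copied. */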
+ seq_t sequences[STORED_SEQS];
+ int const seqAdvance = MIN(nbSeq, ADVANCED_SEQS);
+ seqState_t seqState;
+ int seqNb;
+ dctx->fseEntropy = 1;
+ { int i; for (i=0; i<ZSTD_REP_NUM; i++) seqState.prevOffset[i] = dctx->entropy.rep[i]; }
+ seqState.prefixStart = prefixStart;
+ seqState.pos = (size_t)(op-prefixStart);
+ seqState.dictEnd = dictEnd;
+ assert(dst != NULL);
+ assert(iend >= ip);
+ RETURN_ERROR_IF(
+ ERR_isError(BIT_initDStream(&seqState.DStream, ip, iend-ip)),
+ corruption_detected, "");
+ ZSTD_initFseState(&seqState.stateLL, &seqState.DStream, dctx->LLTptr);
+ ZSTD_initFseState(&seqState.stateOffb, &seqState.DStream, dctx->OFTptr);
+ ZSTD_initFseState(&seqState.stateML, &seqState.DStream, dctx->MLTptr);
+
+ /* prepare in advance */
+ for (seqNb=0; (BIT_reloadDStream(&seqState.DStream) <= BIT_DStream_completed) && (seqNb<seqAdvance); seqNb++) {
+ sequences[seqNb] = ZSTD_decodeSequence(&seqState, isLongOffset, ZSTD_p_prefetch);
+ PREFETCH_L1(sequences[seqNb].match); PREFETCH_L1(sequences[seqNb].match + sequences[seqNb].matchLength - 1); /* note : it's safe to invoke PREFETCH() on any memory address, including invalid ones */
+ }
+ RETURN_ERROR_IF(seqNb<seqAdvance, corruption_detected, "");
+
+ /* decode and decompress */
+ for ( ; (BIT_reloadDStream(&(seqState.DStream)) <= BIT_DStream_completed) && (seqNb<nbSeq) ; seqNb++) {
+ seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset, ZSTD_p_prefetch);
+ size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequences[(seqNb-ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litEnd, prefixStart, dictStart, dictEnd);
+#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
+ assert(!ZSTD_isError(oneSeqSize));
+ if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb-ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart);
+#endif
+ if (ZSTD_isError(oneSeqSize)) return oneSeqSize;
+ PREFETCH_L1(sequence.match); PREFETCH_L1(sequence.match + sequence.matchLength - 1); /* note : it's safe to invoke PREFETCH() on any memory address, including invalid ones */
+ sequences[seqNb & STORED_SEQS_MASK] = sequence;
+ op += oneSeqSize;
+ }
+ RETURN_ERROR_IF(seqNb<nbSeq, corruption_detected, "");
+
+ /* finish queue */
+ seqNb -= seqAdvance;
+ for ( ; seqNb<nbSeq ; seqNb++) {
+ size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequences[seqNb&STORED_SEQS_MASK], &litPtr, litEnd, prefixStart, dictStart, dictEnd);
+#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE)
+ assert(!ZSTD_isError(oneSeqSize));
+ if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart);
+#endif
+ if (ZSTD_isError(oneSeqSize)) return oneSeqSize;
+ op += oneSeqSize;
+ }
+
+ /* save reps for next block */
+ { U32 i; for (i=0; i<ZSTD_REP_NUM; i++) dctx->entropy.rep[i] = (U32)(seqState.prevOffset[i]); }
+ }
+
+ /* last literal segment */
+ { size_t const lastLLSize = litEnd - litPtr;
+ RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall, "");
+ if (op != NULL) {
+ memcpy(op, litPtr, lastLLSize);
+ op += lastLLSize;
+ }
+ }
+
+ return op-ostart;
+}
+
+static size_t
+ZSTD_decompressSequencesLong_default(ZSTD_DCtx* dctx,
+ void* dst, size_t maxDstSize,
+ const void* seqStart, size_t seqSize, int nbSeq,
+ const ZSTD_longOffset_e isLongOffset,
+ const int frame)
+{
+ return ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
+}
+#endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */
+
+
+
+#if DYNAMIC_BMI2
+
+#ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG
+static TARGET_ATTRIBUTE("bmi2") size_t
+DONT_VECTORIZE
+ZSTD_decompressSequences_bmi2(ZSTD_DCtx* dctx,
+ void* dst, size_t maxDstSize,
+ const void* seqStart, size_t seqSize, int nbSeq,
+ const ZSTD_longOffset_e isLongOffset,
+ const int frame)
+{
+ return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
+}
+#endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */
+
+#ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT
+static TARGET_ATTRIBUTE("bmi2") size_t
+ZSTD_decompressSequencesLong_bmi2(ZSTD_DCtx* dctx,
+ void* dst, size_t maxDstSize,
+ const void* seqStart, size_t seqSize, int nbSeq,
+ const ZSTD_longOffset_e isLongOffset,
+ const int frame)
+{
+ return ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
+}
+#endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */
+
+#endif /* DYNAMIC_BMI2 */
+
+typedef size_t (*ZSTD_decompressSequences_t)(
+ ZSTD_DCtx* dctx,
+ void* dst, size_t maxDstSize,
+ const void* seqStart, size_t seqSize, int nbSeq,
+ const ZSTD_longOffset_e isLongOffset,
+ const int frame);
+
+#ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG
+static size_t
+ZSTD_decompressSequences(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize,
+ const void* seqStart, size_t seqSize, int nbSeq,
+ const ZSTD_longOffset_e isLongOffset,
+ const int frame)
+{
+ DEBUGLOG(5, "ZSTD_decompressSequences");
+#if DYNAMIC_BMI2
+ if (dctx->bmi2) {
+ return ZSTD_decompressSequences_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
+ }
+#endif
+ return ZSTD_decompressSequences_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
+}
+#endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */
+
+
+#ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT
+/* ZSTD_decompressSequencesLong() :
+ * decompression function triggered when a minimum share of offsets is considered "long",
+ * aka out of cache.
+ * note : "long" definition seems overloaded here, sometimes meaning "wider than bitstream register", and sometimes meaning "farther than memory cache distance".
+ * This function will try to mitigate main memory latency through the use of prefetching */
+static size_t
+ZSTD_decompressSequencesLong(ZSTD_DCtx* dctx,
+ void* dst, size_t maxDstSize,
+ const void* seqStart, size_t seqSize, int nbSeq,
+ const ZSTD_longOffset_e isLongOffset,
+ const int frame)
+{
+ DEBUGLOG(5, "ZSTD_decompressSequencesLong");
+#if DYNAMIC_BMI2
+ if (dctx->bmi2) {
+ return ZSTD_decompressSequencesLong_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
+ }
+#endif
+ return ZSTD_decompressSequencesLong_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame);
+}
+#endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */
+
+
+
+#if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \
+ !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG)
+/* ZSTD_getLongOffsetsShare() :
+ * condition : offTable must be valid
+ * @return : "share" of long offsets (arbitrarily defined as > (1<<23))
+ * compared to maximum possible of (1<<OffFSELog) */
+static unsigned
+ZSTD_getLongOffsetsShare(const ZSTD_seqSymbol* offTable)
+{
+ const void* ptr = offTable;
+ U32 const tableLog = ((const ZSTD_seqSymbol_header*)ptr)[0].tableLog;
+ const ZSTD_seqSymbol* table = offTable + 1;
+ U32 const max = 1 << tableLog;
+ U32 u, total = 0;
+ DEBUGLOG(5, "ZSTD_getLongOffsetsShare: (tableLog=%u)", tableLog);
+
+ assert(max <= (1 << OffFSELog)); /* max not too large */
+ for (u=0; u<max; u++) {
+ if (table[u].nbAdditionalBits > 22) total += 1;
+ }
+
+ assert(tableLog <= OffFSELog);
+ total <<= (OffFSELog - tableLog); /* scale to OffFSELog */
+
+ return total;
+}
+#endif
+
+size_t
+ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
+ void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize, const int frame)
+{ /* blockType == blockCompressed */
+ const BYTE* ip = (const BYTE*)src;
+ /* isLongOffset must be true if there are long offsets.
+ * Offsets are long if they are larger than 2^STREAM_ACCUMULATOR_MIN.
+ * We don't expect that to be the case in 64-bit mode.
+ * In block mode, window size is not known, so we have to be conservative.
+ * (note: but it could be evaluated from current-lowLimit)
+ */
+ ZSTD_longOffset_e const isLongOffset = (ZSTD_longOffset_e)(MEM_32bits() && (!frame || (dctx->fParams.windowSize > (1ULL << STREAM_ACCUMULATOR_MIN))));
+ DEBUGLOG(5, "ZSTD_decompressBlock_internal (size : %u)", (U32)srcSize);
+
+ RETURN_ERROR_IF(srcSize >= ZSTD_BLOCKSIZE_MAX, srcSize_wrong, "");
+
+ /* Decode literals section */
+ { size_t const litCSize = ZSTD_decodeLiteralsBlock(dctx, src, srcSize);
+ DEBUGLOG(5, "ZSTD_decodeLiteralsBlock : %u", (U32)litCSize);
+ if (ZSTD_isError(litCSize)) return litCSize;
+ ip += litCSize;
+ srcSize -= litCSize;
+ }
+
+ /* Build Decoding Tables */
+ {
+ /* These macros control at build-time which decompressor implementation
+ * we use. If neither is defined, we do some inspection and dispatch at
+ * runtime.
+ */
+#if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \
+ !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG)
+ int usePrefetchDecoder = dctx->ddictIsCold;
+#endif
+ int nbSeq;
+ size_t const seqHSize = ZSTD_decodeSeqHeaders(dctx, &nbSeq, ip, srcSize);
+ if (ZSTD_isError(seqHSize)) return seqHSize;
+ ip += seqHSize;
+ srcSize -= seqHSize;
+
+ RETURN_ERROR_IF(dst == NULL && nbSeq > 0, dstSize_tooSmall, "NULL not handled");
+
+#if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \
+ !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG)
+ if ( !usePrefetchDecoder
+ && (!frame || (dctx->fParams.windowSize > (1<<24)))
+ && (nbSeq>ADVANCED_SEQS) ) { /* could probably use a larger nbSeq limit */
+ U32 const shareLongOffsets = ZSTD_getLongOffsetsShare(dctx->OFTptr);
+ U32 const minShare = MEM_64bits() ? 7 : 20; /* heuristic values, correspond to 2.73% and 7.81% */
+ usePrefetchDecoder = (shareLongOffsets >= minShare);
+ }
+#endif
+
+ dctx->ddictIsCold = 0;
+
+#if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \
+ !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG)
+ if (usePrefetchDecoder)
+#endif
+#ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT
+ return ZSTD_decompressSequencesLong(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset, frame);
+#endif
+
+#ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG
+ /* else */
+ return ZSTD_decompressSequences(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset, frame);
+#endif
+ }
+}
+
+
+void ZSTD_checkContinuity(ZSTD_DCtx* dctx, const void* dst)
+{
+ if (dst != dctx->previousDstEnd) { /* not contiguous */
+ dctx->dictEnd = dctx->previousDstEnd;
+ dctx->virtualStart = (const char*)dst - ((const char*)(dctx->previousDstEnd) - (const char*)(dctx->prefixStart));
+ dctx->prefixStart = dst;
+ dctx->previousDstEnd = dst;
+ }
+}
+
+
+size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx,
+ void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize)
+{
+ size_t dSize;
+ ZSTD_checkContinuity(dctx, dst);
+ dSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, /* frame */ 0);
+ dctx->previousDstEnd = (char*)dst + dSize;
+ return dSize;
+}
+/**** ended inlining decompress/zstd_decompress_block.c ****/
diff --git a/sys/contrib/openzfs/module/zstd/lib/zstd.h b/sys/contrib/openzfs/module/zstd/lib/zstd.h
new file mode 100644
index 000000000000..b6772f8818a7
--- /dev/null
+++ b/sys/contrib/openzfs/module/zstd/lib/zstd.h
@@ -0,0 +1,2115 @@
+/*
+ * BSD 3-Clause Clear License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. All rights reserved.
+ */
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+#ifndef ZSTD_H_235446
+#define ZSTD_H_235446
+
+/* ====== Dependency ======*/
+#include <limits.h> /* INT_MAX */
+#include <stddef.h> /* size_t */
+
+
+/* ===== ZSTDLIB_API : control library symbols visibility ===== */
+#ifndef ZSTDLIB_VISIBILITY
+# if defined(__GNUC__) && (__GNUC__ >= 4)
+# define ZSTDLIB_VISIBILITY __attribute__ ((visibility ("default")))
+# else
+# define ZSTDLIB_VISIBILITY
+# endif
+#endif
+#if defined(ZSTD_DLL_EXPORT) && (ZSTD_DLL_EXPORT==1)
+# define ZSTDLIB_API __declspec(dllexport) ZSTDLIB_VISIBILITY
+#elif defined(ZSTD_DLL_IMPORT) && (ZSTD_DLL_IMPORT==1)
+# define ZSTDLIB_API __declspec(dllimport) ZSTDLIB_VISIBILITY /* It isn't required, but it allows the compiler to generate better code, saving a function pointer load from the IAT and an indirect jump.*/
+#else
+# define ZSTDLIB_API ZSTDLIB_VISIBILITY
+#endif
+
+
+/*******************************************************************************
+ Introduction
+
+ zstd, short for Zstandard, is a fast lossless compression algorithm, targeting
+ real-time compression scenarios at zlib-level and better compression ratios.
+ The zstd compression library provides in-memory compression and decompression
+ functions.
+
+ The library supports regular compression levels from 1 up to ZSTD_maxCLevel(),
+ which is currently 22. Levels >= 20, labeled `--ultra`, should be used with
+ caution, as they require more memory. The library also offers negative
+ compression levels, which extend the range of speed vs. ratio preferences.
+ The lower the level, the faster the speed (at the cost of compression).
+
+ Compression can be done in:
+ - a single step (described as Simple API)
+ - a single step, reusing a context (described as Explicit context)
+ - unbounded multiple steps (described as Streaming compression)
+
+ The compression ratio achievable on small data can be greatly improved by using
+ a dictionary. Dictionary compression can be performed in:
+ - a single step (described as Simple dictionary API)
+ - a single step, reusing a dictionary (described as Bulk-processing
+ dictionary API)
+
+ Advanced experimental functions can be accessed using
+ `#define ZSTD_STATIC_LINKING_ONLY` before including zstd.h.
+
+ Advanced experimental APIs should never be used with a dynamically-linked
+ library. They are not "stable"; their definitions or signatures may change in
+ the future. Only static linking is allowed.
+*******************************************************************************/
+
+/*------ Version ------*/
+#define ZSTD_VERSION_MAJOR 1
+#define ZSTD_VERSION_MINOR 4
+#define ZSTD_VERSION_RELEASE 5
+
+#define ZSTD_VERSION_NUMBER (ZSTD_VERSION_MAJOR *100*100 + ZSTD_VERSION_MINOR *100 + ZSTD_VERSION_RELEASE)
+ZSTDLIB_API unsigned ZSTD_versionNumber(void); /**< to check runtime library version */
+
+#define ZSTD_LIB_VERSION ZSTD_VERSION_MAJOR.ZSTD_VERSION_MINOR.ZSTD_VERSION_RELEASE
+#define ZSTD_QUOTE(str) #str
+#define ZSTD_EXPAND_AND_QUOTE(str) ZSTD_QUOTE(str)
+#define ZSTD_VERSION_STRING ZSTD_EXPAND_AND_QUOTE(ZSTD_LIB_VERSION)
+ZSTDLIB_API const char* ZSTD_versionString(void); /* requires v1.3.0+ */
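+
+/* Usage sketch (illustrative only, kept out of the build by #if 0) :
+ * comparing the compile-time version macros above with the version of the
+ * library actually linked. The printf-based reporting is an illustrative choice. */
+#if 0
+#include <stdio.h>
+#include <zstd.h>
+
+static void report_zstd_version(void)
+{
+    /* the runtime value may differ from ZSTD_VERSION_NUMBER when a different
+     * library build is loaded than the one this unit was compiled against */
+    unsigned const runtime = ZSTD_versionNumber();
+    printf("compiled against zstd %s (%u), running with %u\n",
+           ZSTD_VERSION_STRING, (unsigned)ZSTD_VERSION_NUMBER, runtime);
+}
+#endif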
+
+/* *************************************
+ * Default constant
+ ***************************************/
+#ifndef ZSTD_CLEVEL_DEFAULT
+# define ZSTD_CLEVEL_DEFAULT 3
+#endif
+
+/* *************************************
+ * Constants
+ ***************************************/
+
+/* All magic numbers are supposed read/written to/from files/memory using little-endian convention */
+#define ZSTD_MAGICNUMBER 0xFD2FB528 /* valid since v0.8.0 */
+#define ZSTD_MAGIC_DICTIONARY 0xEC30A437 /* valid since v0.7.0 */
+#define ZSTD_MAGIC_SKIPPABLE_START 0x184D2A50 /* all 16 values, from 0x184D2A50 to 0x184D2A5F, signal the beginning of a skippable frame */
+#define ZSTD_MAGIC_SKIPPABLE_MASK 0xFFFFFFF0
+
+#define ZSTD_BLOCKSIZELOG_MAX 17
+#define ZSTD_BLOCKSIZE_MAX (1<<ZSTD_BLOCKSIZELOG_MAX)
+
+
+
+/***************************************
+* Simple API
+***************************************/
+/*! ZSTD_compress() :
+ * Compresses `src` content as a single zstd compressed frame into already allocated `dst`.
+ * Hint : compression runs faster if `dstCapacity` >= `ZSTD_compressBound(srcSize)`.
+ * @return : compressed size written into `dst` (<= `dstCapacity`),
+ * or an error code if it fails (which can be tested using ZSTD_isError()). */
+ZSTDLIB_API size_t ZSTD_compress( void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize,
+ int compressionLevel);
+
+/*! ZSTD_decompress() :
+ * `compressedSize` : must be the _exact_ size of some number of compressed and/or skippable frames.
+ * `dstCapacity` is an upper bound of the original size to regenerate.
+ * If the user cannot determine such an upper bound, it's better to use streaming mode to decompress the data.
+ * @return : the number of bytes decompressed into `dst` (<= `dstCapacity`),
+ * or an errorCode if it fails (which can be tested using ZSTD_isError()). */
+ZSTDLIB_API size_t ZSTD_decompress( void* dst, size_t dstCapacity,
+ const void* src, size_t compressedSize);
+
+/*! ZSTD_getFrameContentSize() : requires v1.3.0+
+ * `src` should point to the start of a ZSTD encoded frame.
+ * `srcSize` must be at least as large as the frame header.
+ * hint : any size >= `ZSTD_frameHeaderSize_max` is large enough.
+ * @return : - decompressed size of `src` frame content, if known
+ * - ZSTD_CONTENTSIZE_UNKNOWN if the size cannot be determined
+ * - ZSTD_CONTENTSIZE_ERROR if an error occurred (e.g. invalid magic number, srcSize too small)
+ * note 1 : a 0 return value means the frame is valid but "empty".
+ * note 2 : decompressed size is an optional field, it may not be present, typically in streaming mode.
+ * When `return==ZSTD_CONTENTSIZE_UNKNOWN`, data to decompress could be any size.
+ * In which case, it's necessary to use streaming mode to decompress data.
+ * Optionally, application can rely on some implicit limit,
+ * as ZSTD_decompress() only needs an upper bound of decompressed size.
+ * (For example, the data may be known to be cut into blocks <= 16 KB.)
+ * note 3 : decompressed size is always present when compression is completed using single-pass functions,
+ * such as ZSTD_compress(), ZSTD_compressCCtx(), ZSTD_compress_usingDict() or ZSTD_compress_usingCDict().
+ * note 4 : decompressed size can be very large (64-bits value),
+ * potentially larger than what local system can handle as a single memory segment.
+ * In which case, it's necessary to use streaming mode to decompress data.
+ * note 5 : If source is untrusted, decompressed size could be wrong or intentionally modified.
+ * Always ensure return value fits within application's authorized limits.
+ * Each application can set its own limits.
+ * note 6 : This function replaces ZSTD_getDecompressedSize() */
+#define ZSTD_CONTENTSIZE_UNKNOWN (0ULL - 1)
+#define ZSTD_CONTENTSIZE_ERROR (0ULL - 2)
+ZSTDLIB_API unsigned long long ZSTD_getFrameContentSize(const void *src, size_t srcSize);
+
+/*! ZSTD_getDecompressedSize() :
+ * NOTE: This function is now obsolete, in favor of ZSTD_getFrameContentSize().
+ * Both functions work the same way, but ZSTD_getDecompressedSize() blends
+ * "empty", "unknown" and "error" results to the same return value (0),
+ * while ZSTD_getFrameContentSize() gives them separate return values.
+ * @return : decompressed size of `src` frame content _if known and not empty_, 0 otherwise. */
+ZSTDLIB_API unsigned long long ZSTD_getDecompressedSize(const void* src, size_t srcSize);
+
+/*! ZSTD_findFrameCompressedSize() :
+ * `src` should point to the start of a ZSTD frame or skippable frame.
+ * `srcSize` must be >= first frame size
+ * @return : the compressed size of the first frame starting at `src`,
+ * suitable to pass as `srcSize` to `ZSTD_decompress` or similar,
+ * or an error code if input is invalid */
+ZSTDLIB_API size_t ZSTD_findFrameCompressedSize(const void* src, size_t srcSize);
+
+
+/*====== Helper functions ======*/
+#define ZSTD_COMPRESSBOUND(srcSize) ((srcSize) + ((srcSize)>>8) + (((srcSize) < (128<<10)) ? (((128<<10) - (srcSize)) >> 11) /* margin, from 64 to 0 */ : 0)) /* this formula ensures that bound(A) + bound(B) <= bound(A+B) as long as A and B >= 128 KB */
+ZSTDLIB_API size_t ZSTD_compressBound(size_t srcSize); /*!< maximum compressed size in worst case single-pass scenario */
+ZSTDLIB_API unsigned ZSTD_isError(size_t code); /*!< tells if a `size_t` function result is an error code */
+ZSTDLIB_API const char* ZSTD_getErrorName(size_t code); /*!< provides readable string from an error code */
+ZSTDLIB_API int ZSTD_minCLevel(void); /*!< minimum negative compression level allowed */
+ZSTDLIB_API int ZSTD_maxCLevel(void); /*!< maximum compression level available */
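+
+/* Usage sketch (illustrative only, kept out of the build by #if 0) :
+ * a minimal one-shot roundtrip with the simple API and helpers above.
+ * Allocation-failure handling is omitted, and the abort-on-error policy is an
+ * illustrative choice. */
+#if 0
+#include <stdlib.h>   /* malloc, free, abort */
+#include <string.h>   /* memcmp */
+#include <zstd.h>
+
+static void simple_roundtrip(const void* src, size_t srcSize)
+{
+    /* ZSTD_compressBound() gives the worst-case size for a single-pass compression */
+    size_t const dstCapacity = ZSTD_compressBound(srcSize);
+    void* const dst = malloc(dstCapacity);
+    void* regen;
+    size_t cSize, dSize;
+    unsigned long long rSize;
+
+    cSize = ZSTD_compress(dst, dstCapacity, src, srcSize, ZSTD_CLEVEL_DEFAULT);
+    if (ZSTD_isError(cSize)) abort();   /* see ZSTD_getErrorName(cSize) for details */
+
+    /* single-pass compression always records the content size in the frame header */
+    rSize = ZSTD_getFrameContentSize(dst, cSize);
+    if (rSize != (unsigned long long)srcSize) abort();
+
+    regen = malloc(srcSize);
+    dSize = ZSTD_decompress(regen, srcSize, dst, cSize);
+    if (ZSTD_isError(dSize) || dSize != srcSize || memcmp(src, regen, srcSize)) abort();
+
+    free(regen);
+    free(dst);
+}
+#endif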
+
+
+/***************************************
+* Explicit context
+***************************************/
+/*= Compression context
+ * When compressing many times,
+ * it is recommended to allocate a context just once,
+ * and re-use it for each successive compression operation.
+ * This makes the workload friendlier for the system's memory.
+ * Note : re-using context is just a speed / resource optimization.
+ * It doesn't change the compression ratio, which remains identical.
+ * Note 2 : In multi-threaded environments,
+ * use one different context per thread for parallel execution.
+ */
+typedef struct ZSTD_CCtx_s ZSTD_CCtx;
+ZSTDLIB_API ZSTD_CCtx* ZSTD_createCCtx(void);
+ZSTDLIB_API size_t ZSTD_freeCCtx(ZSTD_CCtx* cctx);
+
+/*! ZSTD_compressCCtx() :
+ * Same as ZSTD_compress(), using an explicit ZSTD_CCtx.
+ * Important : in order to behave similarly to `ZSTD_compress()`,
+ * this function compresses at requested compression level,
+ * __ignoring any other parameter__ .
+ * If any advanced parameter was set using the advanced API,
+ * they will all be reset. Only `compressionLevel` remains.
+ */
+ZSTDLIB_API size_t ZSTD_compressCCtx(ZSTD_CCtx* cctx,
+ void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize,
+ int compressionLevel);
+
+/*= Decompression context
+ * When decompressing many times,
+ * it is recommended to allocate a context only once,
+ * and re-use it for each successive decompression operation.
+ * This makes the workload friendlier for the system's memory.
+ * Use one context per thread for parallel execution. */
+typedef struct ZSTD_DCtx_s ZSTD_DCtx;
+ZSTDLIB_API ZSTD_DCtx* ZSTD_createDCtx(void);
+ZSTDLIB_API size_t ZSTD_freeDCtx(ZSTD_DCtx* dctx);
+
+/*! ZSTD_decompressDCtx() :
+ * Same as ZSTD_decompress(),
+ * requires an allocated ZSTD_DCtx.
+ * Compatible with sticky parameters.
+ */
+ZSTDLIB_API size_t ZSTD_decompressDCtx(ZSTD_DCtx* dctx,
+ void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize);
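+
+/* Usage sketch (illustrative only, kept out of the build by #if 0) :
+ * allocating one explicit ZSTD_CCtx and re-using it for several independent
+ * buffers, as recommended above. The batch-of-buffers representation is an
+ * illustrative assumption. */
+#if 0
+#include <stdlib.h>
+#include <zstd.h>
+
+static void compress_batch(const void* const* srcs, const size_t* srcSizes,
+                           void* const* dsts, const size_t* dstCapacities,
+                           size_t* cSizes, size_t nbBuffers)
+{
+    ZSTD_CCtx* const cctx = ZSTD_createCCtx();   /* allocated once ... */
+    size_t n;
+    if (cctx == NULL) abort();
+    for (n = 0; n < nbBuffers; n++) {
+        /* ... and re-used for each frame; only the requested level applies here */
+        cSizes[n] = ZSTD_compressCCtx(cctx, dsts[n], dstCapacities[n],
+                                      srcs[n], srcSizes[n], ZSTD_CLEVEL_DEFAULT);
+        if (ZSTD_isError(cSizes[n])) abort();
+    }
+    ZSTD_freeCCtx(cctx);                         /* released once at the end */
+}
+#endif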
+
+
+/***************************************
+* Advanced compression API
+***************************************/
+
+/* API design :
+ * Parameters are pushed one by one into an existing context,
+ * using ZSTD_CCtx_set*() functions.
+ * Pushed parameters are sticky : they are valid for next compressed frame, and any subsequent frame.
+ * "sticky" parameters are applicable to `ZSTD_compress2()` and `ZSTD_compressStream*()` !
+ * __They do not apply to "simple" one-shot variants such as ZSTD_compressCCtx()__ .
+ *
+ * It's possible to reset all parameters to "default" using ZSTD_CCtx_reset().
+ *
+ * This API supersedes all other "advanced" API entry points in the experimental section.
+ * In the future, we expect to remove from the experimental section those API entry points which are redundant with this one.
+ */
+
+
+/* Compression strategies, listed from fastest to strongest */
+typedef enum { ZSTD_fast=1,
+ ZSTD_dfast=2,
+ ZSTD_greedy=3,
+ ZSTD_lazy=4,
+ ZSTD_lazy2=5,
+ ZSTD_btlazy2=6,
+ ZSTD_btopt=7,
+ ZSTD_btultra=8,
+ ZSTD_btultra2=9
+ /* note : new strategies _might_ be added in the future.
+ Only the order (from fast to strong) is guaranteed */
+} ZSTD_strategy;
+
+
+typedef enum {
+
+ /* compression parameters
+ * Note: When compressing with a ZSTD_CDict these parameters are superseded
+ * by the parameters used to construct the ZSTD_CDict.
+ * See ZSTD_CCtx_refCDict() for more info (superseded-by-cdict). */
+ ZSTD_c_compressionLevel=100, /* Set compression parameters according to pre-defined cLevel table.
+ * Note that exact compression parameters are dynamically determined,
+ * depending on both compression level and srcSize (when known).
+ * Default level is ZSTD_CLEVEL_DEFAULT==3.
+ * Special: value 0 means default, which is controlled by ZSTD_CLEVEL_DEFAULT.
+ * Note 1 : it's possible to pass a negative compression level.
+ * Note 2 : setting a level does not automatically set all other compression parameters
+ * to default. Setting this will, however, dynamically influence the compression
+ * parameters which have not been manually set; the manually set
+ * ones will 'stick'. */
+ /* Advanced compression parameters :
+ * It's possible to pin down compression parameters to some specific values.
+ * In which case, these values are no longer dynamically selected by the compressor */
+ ZSTD_c_windowLog=101, /* Maximum allowed back-reference distance, expressed as power of 2.
+ * This will set a memory budget for streaming decompression,
+ * with larger values requiring more memory
+ * and typically compressing more.
+ * Must be clamped between ZSTD_WINDOWLOG_MIN and ZSTD_WINDOWLOG_MAX.
+ * Special: value 0 means "use default windowLog".
+ * Note: Using a windowLog greater than ZSTD_WINDOWLOG_LIMIT_DEFAULT
+ * requires explicitly allowing such size at streaming decompression stage. */
+ ZSTD_c_hashLog=102, /* Size of the initial probe table, as a power of 2.
+ * Resulting memory usage is (1 << (hashLog+2)).
+ * Must be clamped between ZSTD_HASHLOG_MIN and ZSTD_HASHLOG_MAX.
+ * Larger tables improve compression ratio of strategies <= dFast,
+ * and improve speed of strategies > dFast.
+ * Special: value 0 means "use default hashLog". */
+ ZSTD_c_chainLog=103, /* Size of the multi-probe search table, as a power of 2.
+ * Resulting memory usage is (1 << (chainLog+2)).
+ * Must be clamped between ZSTD_CHAINLOG_MIN and ZSTD_CHAINLOG_MAX.
+ * Larger tables result in better and slower compression.
+ * This parameter is useless for "fast" strategy.
+ * It's still useful when using "dfast" strategy,
+ * in which case it defines a secondary probe table.
+ * Special: value 0 means "use default chainLog". */
+ ZSTD_c_searchLog=104, /* Number of search attempts, as a power of 2.
+ * More attempts result in better and slower compression.
+ * This parameter is useless for "fast" and "dFast" strategies.
+ * Special: value 0 means "use default searchLog". */
+ ZSTD_c_minMatch=105, /* Minimum size of searched matches.
+ * Note that Zstandard can still find matches of smaller size,
+ * it just tweaks its search algorithm to look for this size and larger.
+ * Larger values increase compression and decompression speed, but decrease ratio.
+ * Must be clamped between ZSTD_MINMATCH_MIN and ZSTD_MINMATCH_MAX.
+ * Note that currently, for all strategies < btopt, the effective minimum is 4,
+ * and for all strategies > fast, the effective maximum is 6.
+ * Special: value 0 means "use default minMatchLength". */
+ ZSTD_c_targetLength=106, /* Impact of this field depends on strategy.
+ * For strategies btopt, btultra & btultra2:
+ * Length of Match considered "good enough" to stop search.
+ * Larger values make compression stronger, and slower.
+ * For strategy fast:
+ * Distance between match sampling.
+ * Larger values make compression faster, and weaker.
+ * Special: value 0 means "use default targetLength". */
+ ZSTD_c_strategy=107, /* See ZSTD_strategy enum definition.
+ * The higher the value of selected strategy, the more complex it is,
+ * resulting in stronger and slower compression.
+ * Special: value 0 means "use default strategy". */
+
+ /* LDM mode parameters */
+ ZSTD_c_enableLongDistanceMatching=160, /* Enable long distance matching.
+ * This parameter is designed to improve compression ratio
+ * for large inputs, by finding large matches at long distance.
+ * It increases memory usage and window size.
+ * Note: enabling this parameter increases default ZSTD_c_windowLog to 128 MB
+ * except when expressly set to a different value. */
+ ZSTD_c_ldmHashLog=161, /* Size of the table for long distance matching, as a power of 2.
+ * Larger values increase memory usage and compression ratio,
+ * but decrease compression speed.
+ * Must be clamped between ZSTD_HASHLOG_MIN and ZSTD_HASHLOG_MAX
+ * default: windowlog - 7.
+ * Special: value 0 means "automatically determine hashlog". */
+ ZSTD_c_ldmMinMatch=162, /* Minimum match size for long distance matcher.
+ * Values that are too large or too small usually decrease compression ratio.
+ * Must be clamped between ZSTD_LDM_MINMATCH_MIN and ZSTD_LDM_MINMATCH_MAX.
+ * Special: value 0 means "use default value" (default: 64). */
+ ZSTD_c_ldmBucketSizeLog=163, /* Log size of each bucket in the LDM hash table for collision resolution.
+ * Larger values improve collision resolution but decrease compression speed.
+ * The maximum value is ZSTD_LDM_BUCKETSIZELOG_MAX.
+ * Special: value 0 means "use default value" (default: 3). */
+ ZSTD_c_ldmHashRateLog=164, /* Frequency of inserting/looking up entries into the LDM hash table.
+ * Must be clamped between 0 and (ZSTD_WINDOWLOG_MAX - ZSTD_HASHLOG_MIN).
+ * Default is MAX(0, (windowLog - ldmHashLog)), optimizing hash table usage.
+ * Larger values improve compression speed.
+ * Deviating far from default value will likely result in a compression ratio decrease.
+ * Special: value 0 means "automatically determine hashRateLog". */
+
+ /* frame parameters */
+ ZSTD_c_contentSizeFlag=200, /* Content size will be written into frame header _whenever known_ (default:1)
+ * Content size must be known at the beginning of compression.
+ * This is automatically the case when using ZSTD_compress2(),
+ * For streaming scenarios, content size must be provided with ZSTD_CCtx_setPledgedSrcSize() */
+ ZSTD_c_checksumFlag=201, /* A 32-bit checksum of content is written at end of frame (default:0) */
+ ZSTD_c_dictIDFlag=202, /* When applicable, dictionary's ID is written into frame header (default:1) */
+
+ /* multi-threading parameters */
+ /* These parameters are only useful if multi-threading is enabled (compiled with build macro ZSTD_MULTITHREAD).
+ * They return an error otherwise. */
+ ZSTD_c_nbWorkers=400, /* Select how many threads will be spawned to compress in parallel.
+ * When nbWorkers >= 1, triggers asynchronous mode when used with ZSTD_compressStream*() :
+ * ZSTD_compressStream*() consumes input and flush output if possible, but immediately gives back control to caller,
+ * while compression work is performed in parallel, within worker threads.
+ * (note : a strong exception to this rule is when first invocation of ZSTD_compressStream2() sets ZSTD_e_end :
+ * in which case, ZSTD_compressStream2() delegates to ZSTD_compress2(), which is always a blocking call).
+ * More workers improve speed, but also increase memory usage.
+ * Default value is `0`, aka "single-threaded mode" : no worker is spawned, compression is performed inside Caller's thread, all invocations are blocking */
+ ZSTD_c_jobSize=401, /* Size of a compression job. This value is enforced only when nbWorkers >= 1.
+ * Each compression job is completed in parallel, so this value can indirectly impact the nb of active threads.
+ * 0 means default, which is dynamically determined based on compression parameters.
+ * Job size must be a minimum of overlap size, or 1 MB, whichever is largest.
+ * The minimum size is automatically and transparently enforced. */
+ ZSTD_c_overlapLog=402, /* Control the overlap size, as a fraction of window size.
+ * The overlap size is an amount of data reloaded from previous job at the beginning of a new job.
+ * It helps preserve compression ratio, while each job is compressed in parallel.
+ * This value is enforced only when nbWorkers >= 1.
+ * Larger values increase compression ratio, but decrease speed.
+ * Possible values range from 0 to 9 :
+ * - 0 means "default" : value will be determined by the library, depending on strategy
+ * - 1 means "no overlap"
+ * - 9 means "full overlap", using a full window size.
+ * Each intermediate rank increases/decreases load size by a factor 2 :
+ * 9: full window; 8: w/2; 7: w/4; 6: w/8; 5:w/16; 4: w/32; 3:w/64; 2:w/128; 1:no overlap; 0:default
+ * default value varies between 6 and 9, depending on strategy */
+
+ /* note : additional experimental parameters are also available
+ * within the experimental section of the API.
+ * At the time of this writing, they include :
+ * ZSTD_c_rsyncable
+ * ZSTD_c_format
+ * ZSTD_c_forceMaxWindow
+ * ZSTD_c_forceAttachDict
+ * ZSTD_c_literalCompressionMode
+ * ZSTD_c_targetCBlockSize
+ * ZSTD_c_srcSizeHint
+ * Because they are not stable, it's necessary to define ZSTD_STATIC_LINKING_ONLY to access them.
+ * note : never ever use experimentalParam? names directly;
+ * also, the enum values themselves are unstable and can still change.
+ */
+ ZSTD_c_experimentalParam1=500,
+ ZSTD_c_experimentalParam2=10,
+ ZSTD_c_experimentalParam3=1000,
+ ZSTD_c_experimentalParam4=1001,
+ ZSTD_c_experimentalParam5=1002,
+ ZSTD_c_experimentalParam6=1003,
+ ZSTD_c_experimentalParam7=1004
+} ZSTD_cParameter;
+
+typedef struct {
+ size_t error;
+ int lowerBound;
+ int upperBound;
+} ZSTD_bounds;
+
+/*! ZSTD_cParam_getBounds() :
+ * All parameters must belong to an interval with lower and upper bounds,
+ * otherwise they will either trigger an error or be automatically clamped.
+ * @return : a structure, ZSTD_bounds, which contains
+ * - an error status field, which must be tested using ZSTD_isError()
+ * - lower and upper bounds, both inclusive
+ */
+ZSTDLIB_API ZSTD_bounds ZSTD_cParam_getBounds(ZSTD_cParameter cParam);
+
+/*! ZSTD_CCtx_setParameter() :
+ * Set one compression parameter, selected by enum ZSTD_cParameter.
+ * All parameters have valid bounds. Bounds can be queried using ZSTD_cParam_getBounds().
+ * Providing a value beyond bound will either clamp it, or trigger an error (depending on parameter).
+ * Setting a parameter is generally only possible during frame initialization (before starting compression).
+ * Exception : when using multi-threading mode (nbWorkers >= 1),
+ * the following parameters can be updated _during_ compression (within same frame):
+ * => compressionLevel, hashLog, chainLog, searchLog, minMatch, targetLength and strategy.
+ * new parameters will be active for next job only (after a flush()).
+ * @return : an error code (which can be tested using ZSTD_isError()).
+ */
+ZSTDLIB_API size_t ZSTD_CCtx_setParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, int value);
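+
+/* Usage sketch (illustrative only, kept out of the build by #if 0) :
+ * querying a parameter's bounds before setting it, as described above.
+ * Clamping to the bound is an illustrative policy; letting
+ * ZSTD_CCtx_setParameter() report the error is equally valid. */
+#if 0
+#include <zstd.h>
+
+static size_t set_cparam_clamped(ZSTD_CCtx* cctx, ZSTD_cParameter param, int value)
+{
+    ZSTD_bounds const bounds = ZSTD_cParam_getBounds(param);
+    if (ZSTD_isError(bounds.error)) return bounds.error;
+    if (value < bounds.lowerBound) value = bounds.lowerBound;
+    if (value > bounds.upperBound) value = bounds.upperBound;
+    return ZSTD_CCtx_setParameter(cctx, param, value);
+}
+#endif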
+
+/*! ZSTD_CCtx_setPledgedSrcSize() :
+ * Total input data size to be compressed as a single frame.
+ * Value will be written into the frame header, unless explicitly forbidden using ZSTD_c_contentSizeFlag.
+ * This value will also be checked at end of frame, and will trigger an error if not respected.
+ * @result : 0, or an error code (which can be tested with ZSTD_isError()).
+ * Note 1 : pledgedSrcSize==0 actually means zero, aka an empty frame.
+ * In order to mean "unknown content size", pass constant ZSTD_CONTENTSIZE_UNKNOWN.
+ * ZSTD_CONTENTSIZE_UNKNOWN is default value for any new frame.
+ * Note 2 : pledgedSrcSize is only valid once, for the next frame.
+ * It's discarded at the end of the frame, and replaced by ZSTD_CONTENTSIZE_UNKNOWN.
+ * Note 3 : Whenever all input data is provided and consumed in a single round,
+ * for example with ZSTD_compress2(),
+ * or invoking immediately ZSTD_compressStream2(,,,ZSTD_e_end),
+ * this value is automatically overridden by srcSize instead.
+ */
+ZSTDLIB_API size_t ZSTD_CCtx_setPledgedSrcSize(ZSTD_CCtx* cctx, unsigned long long pledgedSrcSize);
+
+typedef enum {
+ ZSTD_reset_session_only = 1,
+ ZSTD_reset_parameters = 2,
+ ZSTD_reset_session_and_parameters = 3
+} ZSTD_ResetDirective;
+
+/*! ZSTD_CCtx_reset() :
+ * There are 2 different things that can be reset, independently or jointly :
+ * - The session : will stop compressing current frame, and make CCtx ready to start a new one.
+ * Useful after an error, or to interrupt any ongoing compression.
+ * Any internal data not yet flushed is cancelled.
+ * Compression parameters and dictionary remain unchanged.
+ * They will be used to compress next frame.
+ * Resetting session never fails.
+ * - The parameters : changes all parameters back to "default".
+ * This removes any reference to any dictionary too.
+ * Parameters can only be changed between 2 sessions (i.e. when no compression is currently ongoing);
+ * otherwise the reset fails, and the function returns an error value (which can be tested using ZSTD_isError()).
+ * - Both : similar to resetting the session, followed by resetting parameters.
+ */
+ZSTDLIB_API size_t ZSTD_CCtx_reset(ZSTD_CCtx* cctx, ZSTD_ResetDirective reset);
+
+/*! ZSTD_compress2() :
+ * Behave the same as ZSTD_compressCCtx(), but compression parameters are set using the advanced API.
+ * ZSTD_compress2() always starts a new frame.
+ * Should cctx hold data from a previously unfinished frame, everything about it is forgotten.
+ * - Compression parameters are pushed into CCtx before starting compression, using ZSTD_CCtx_set*()
+ * - The function is always blocking, returns when compression is completed.
+ * Hint : compression runs faster if `dstCapacity` >= `ZSTD_compressBound(srcSize)`.
+ * @return : compressed size written into `dst` (<= `dstCapacity`),
+ * or an error code if it fails (which can be tested using ZSTD_isError()).
+ */
+ZSTDLIB_API size_t ZSTD_compress2( ZSTD_CCtx* cctx,
+ void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize);
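+
+/* Usage sketch (illustrative only, kept out of the build by #if 0) :
+ * the advanced one-shot path : sticky parameters pushed with ZSTD_CCtx_set*(),
+ * then ZSTD_compress2(). The selected parameter values are illustrative. */
+#if 0
+#include <zstd.h>
+
+static size_t compress_advanced(ZSTD_CCtx* cctx,
+                                void* dst, size_t dstCapacity,
+                                const void* src, size_t srcSize)
+{
+    /* sticky : these settings also apply to any later frame on this cctx,
+     * until ZSTD_CCtx_reset(cctx, ZSTD_reset_parameters) */
+    size_t err;
+    err = ZSTD_CCtx_setParameter(cctx, ZSTD_c_compressionLevel, 19);
+    if (ZSTD_isError(err)) return err;
+    err = ZSTD_CCtx_setParameter(cctx, ZSTD_c_checksumFlag, 1);
+    if (ZSTD_isError(err)) return err;
+    err = ZSTD_CCtx_setParameter(cctx, ZSTD_c_enableLongDistanceMatching, 1);
+    if (ZSTD_isError(err)) return err;
+    /* ZSTD_compress2() always starts a new frame and is always blocking */
+    return ZSTD_compress2(cctx, dst, dstCapacity, src, srcSize);
+}
+#endif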
+
+
+/***************************************
+* Advanced decompression API
+***************************************/
+
+/* The advanced API pushes parameters one by one into an existing DCtx context.
+ * Parameters are sticky, and remain valid for all following frames
+ * using the same DCtx context.
+ * It's possible to reset parameters to default values using ZSTD_DCtx_reset().
+ * Note : This API is compatible with existing ZSTD_decompressDCtx() and ZSTD_decompressStream().
+ * Therefore, no new decompression function is necessary.
+ */
+
+typedef enum {
+
+ ZSTD_d_windowLogMax=100, /* Select a size limit (in power of 2) beyond which
+ * the streaming API will refuse to allocate memory buffer
+ * in order to protect the host from unreasonable memory requirements.
+ * This parameter is only useful in streaming mode, since no internal buffer is allocated in single-pass mode.
+ * By default, a decompression context accepts window sizes <= (1 << ZSTD_WINDOWLOG_LIMIT_DEFAULT).
+ * Special: value 0 means "use default maximum windowLog". */
+
+ /* note : additional experimental parameters are also available
+ * within the experimental section of the API.
+ * At the time of this writing, they include :
+ * ZSTD_d_format
+ * ZSTD_d_stableOutBuffer
+ * Because they are not stable, it's necessary to define ZSTD_STATIC_LINKING_ONLY to access them.
+ * note : never ever use experimentalParam? names directly
+ */
+ ZSTD_d_experimentalParam1=1000,
+ ZSTD_d_experimentalParam2=1001
+
+} ZSTD_dParameter;
+
+/*! ZSTD_dParam_getBounds() :
+ * All parameters must belong to an interval with lower and upper bounds,
+ * otherwise they will either trigger an error or be automatically clamped.
+ * @return : a structure, ZSTD_bounds, which contains
+ * - an error status field, which must be tested using ZSTD_isError()
+ * - both lower and upper bounds, inclusive
+ */
+ZSTDLIB_API ZSTD_bounds ZSTD_dParam_getBounds(ZSTD_dParameter dParam);
+
+/*! ZSTD_DCtx_setParameter() :
+ * Set one decompression parameter, selected by enum ZSTD_dParameter.
+ * All parameters have valid bounds. Bounds can be queried using ZSTD_dParam_getBounds().
+ * Providing a value beyond bound will either clamp it, or trigger an error (depending on parameter).
+ * Setting a parameter is only possible during frame initialization (before starting decompression).
+ * @return : 0, or an error code (which can be tested using ZSTD_isError()).
+ */
+ZSTDLIB_API size_t ZSTD_DCtx_setParameter(ZSTD_DCtx* dctx, ZSTD_dParameter param, int value);
+
+/*! ZSTD_DCtx_reset() :
+ * Return a DCtx to a clean state.
+ * Session and parameters can be reset jointly or separately.
+ * Parameters can only be reset when no active frame is being decompressed.
+ * @return : 0, or an error code, which can be tested with ZSTD_isError()
+ */
+ZSTDLIB_API size_t ZSTD_DCtx_reset(ZSTD_DCtx* dctx, ZSTD_ResetDirective reset);
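+
+/* Usage sketch (illustrative only, kept out of the build by #if 0) :
+ * setting a sticky decompression parameter and resetting it later.
+ * The 2^27 (128 MB) window limit is an illustrative choice; it mainly matters
+ * for streaming decompression, which allocates the window buffer internally. */
+#if 0
+#include <stdlib.h>
+#include <zstd.h>
+
+static void cap_window(ZSTD_DCtx* dctx)
+{
+    /* sticky for every following frame decoded with this dctx */
+    size_t const err = ZSTD_DCtx_setParameter(dctx, ZSTD_d_windowLogMax, 27);
+    if (ZSTD_isError(err)) abort();
+}
+
+static void forget_parameters(ZSTD_DCtx* dctx)
+{
+    /* drop sticky parameters (and any referenced dictionary) back to defaults */
+    ZSTD_DCtx_reset(dctx, ZSTD_reset_session_and_parameters);
+}
+#endif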
+
+
+/****************************
+* Streaming
+****************************/
+
+typedef struct ZSTD_inBuffer_s {
+ const void* src; /**< start of input buffer */
+ size_t size; /**< size of input buffer */
+ size_t pos; /**< position where reading stopped. Will be updated. Necessarily 0 <= pos <= size */
+} ZSTD_inBuffer;
+
+typedef struct ZSTD_outBuffer_s {
+ void* dst; /**< start of output buffer */
+ size_t size; /**< size of output buffer */
+ size_t pos; /**< position where writing stopped. Will be updated. Necessarily 0 <= pos <= size */
+} ZSTD_outBuffer;
+
+
+
+/*-***********************************************************************
+* Streaming compression - HowTo
+*
+* A ZSTD_CStream object is required to track streaming operation.
+* Use ZSTD_createCStream() and ZSTD_freeCStream() to create/release resources.
+* ZSTD_CStream objects can be reused multiple times on consecutive compression operations.
+* It is recommended to re-use a ZSTD_CStream, since it plays nicer with the system's memory by re-using already allocated memory.
+*
+* For parallel execution, use one separate ZSTD_CStream per thread.
+*
+* note : since v1.3.0, ZSTD_CStream and ZSTD_CCtx are the same thing.
+*
+* Parameters are sticky : when starting a new compression on the same context,
+* it will re-use the same sticky parameters as previous compression session.
+* When in doubt, it's recommended to fully initialize the context before usage.
+* Use ZSTD_CCtx_reset() to reset the context and ZSTD_CCtx_setParameter(),
+* ZSTD_CCtx_setPledgedSrcSize(), or ZSTD_CCtx_loadDictionary() and friends to
+* set more specific parameters, the pledged source size, or load a dictionary.
+*
+* Use ZSTD_compressStream2() with ZSTD_e_continue as many times as necessary to
+* consume input stream. The function will automatically update both `pos`
+* fields within `input` and `output`.
+* Note that the function may not consume the entire input, for example, because
+* the output buffer is already full, in which case `input.pos < input.size`.
+* The caller must check whether the input has been entirely consumed.
+* If not, the caller must make some room to receive more compressed data,
+* and then present the remaining input data again.
+* note: ZSTD_e_continue is guaranteed to make some forward progress when called,
+* but doesn't guarantee maximal forward progress. This is especially relevant
+* when compressing with multiple threads. The call won't block if it can
+* consume some input, but if it can't it will wait for some, but not all,
+* output to be flushed.
+* @return : provides a minimum amount of data remaining to be flushed from internal buffers
+* or an error code, which can be tested using ZSTD_isError().
+*
+* At any moment, it's possible to flush whatever data might remain stuck within internal buffer,
+* using ZSTD_compressStream2() with ZSTD_e_flush. `output->pos` will be updated.
+* Note that, if `output->size` is too small, a single invocation with ZSTD_e_flush might not be enough (return code > 0).
+* In which case, make some room to receive more compressed data, and call again ZSTD_compressStream2() with ZSTD_e_flush.
+* You must continue calling ZSTD_compressStream2() with ZSTD_e_flush until it returns 0, at which point you can change the
+* operation.
+* note: ZSTD_e_flush will flush as much output as possible, meaning when compressing with multiple threads, it will
+* block until the flush is complete or the output buffer is full.
+* @return : 0 if internal buffers are entirely flushed,
+* >0 if some data still present within internal buffer (the value is minimal estimation of remaining size),
+* or an error code, which can be tested using ZSTD_isError().
+*
+* Calling ZSTD_compressStream2() with ZSTD_e_end instructs to finish a frame.
+* It will perform a flush and write frame epilogue.
+* The epilogue is required for decoders to consider a frame completed.
+* flush operation is the same, and follows same rules as calling ZSTD_compressStream2() with ZSTD_e_flush.
+* You must continue calling ZSTD_compressStream2() with ZSTD_e_end until it returns 0, at which point you are free to
+* start a new frame.
+* note: ZSTD_e_end will flush as much output as possible, meaning when compressing with multiple threads, it will
+* block until the flush is complete or the output buffer is full.
+* @return : 0 if frame fully completed and fully flushed,
+* >0 if some data still present within internal buffer (the value is minimal estimation of remaining size),
+* or an error code, which can be tested using ZSTD_isError().
+*
+* *******************************************************************/
+
+typedef ZSTD_CCtx ZSTD_CStream; /**< CCtx and CStream are now effectively same object (>= v1.3.0) */
+ /* Continue to distinguish them for compatibility with older versions <= v1.2.0 */
+/*===== ZSTD_CStream management functions =====*/
+ZSTDLIB_API ZSTD_CStream* ZSTD_createCStream(void);
+ZSTDLIB_API size_t ZSTD_freeCStream(ZSTD_CStream* zcs);
+
+/*===== Streaming compression functions =====*/
+typedef enum {
+ ZSTD_e_continue=0, /* collect more data, encoder decides when to output compressed result, for optimal compression ratio */
+ ZSTD_e_flush=1, /* flush any data provided so far,
+ * it creates (at least) one new block, that can be decoded immediately on reception;
+ * frame will continue: any future data can still reference previously compressed data, improving compression.
+ * note : multithreaded compression will block to flush as much output as possible. */
+ ZSTD_e_end=2 /* flush any remaining data _and_ close current frame.
+ * note that frame is only closed after compressed data is fully flushed (return value == 0).
+ * After that point, any additional data starts a new frame.
+ * note : each frame is independent (does not reference any content from previous frame).
+ * note : multithreaded compression will block to flush as much output as possible. */
+} ZSTD_EndDirective;
+
+/*! ZSTD_compressStream2() :
+ * Behaves about the same as ZSTD_compressStream, with additional control on end directive.
+ * - Compression parameters are pushed into CCtx before starting compression, using ZSTD_CCtx_set*()
+ * - Compression parameters cannot be changed once compression is started (save a list of exceptions in multi-threading mode)
+ * - output->pos must be <= dstCapacity, input->pos must be <= srcSize
+ * - output->pos and input->pos will be updated. They are guaranteed to remain below their respective limit.
+ * - When nbWorkers==0 (default), function is blocking : it completes its job before returning to caller.
+ * - When nbWorkers>=1, function is non-blocking : it just acquires a copy of input, distributes jobs to internal worker threads, flushes whatever is available,
+ * and then immediately returns, just indicating that there is some data remaining to be flushed.
+ * The function nonetheless guarantees forward progress : it will return only after it has read or written at least 1 byte.
+ * - Exception : if the first call requests a ZSTD_e_end directive and provides enough dstCapacity, the function delegates to ZSTD_compress2() which is always blocking.
+ * - @return provides a minimum amount of data remaining to be flushed from internal buffers
+ * or an error code, which can be tested using ZSTD_isError().
+ * if @return != 0, flush is not fully completed, there is still some data left within internal buffers.
+ * This is useful for ZSTD_e_flush, since in this case more flushes are necessary to empty all buffers.
+ * For ZSTD_e_end, @return == 0 when internal buffers are fully flushed and frame is completed.
+ * - after a ZSTD_e_end directive, if internal buffer is not fully flushed (@return != 0),
+ * only ZSTD_e_end or ZSTD_e_flush operations are allowed.
+ * Before starting a new compression job, or changing compression parameters,
+ * it is required to fully flush internal buffers.
+ */
+ZSTDLIB_API size_t ZSTD_compressStream2( ZSTD_CCtx* cctx,
+ ZSTD_outBuffer* output,
+ ZSTD_inBuffer* input,
+ ZSTD_EndDirective endOp);
+
+
+/* These buffer sizes are softly recommended.
+ * They are not required : ZSTD_compressStream*() happily accepts any buffer size, for both input and output.
+ * Respecting the recommended size just makes it a bit easier for ZSTD_compressStream*(),
+ * reducing the amount of memory shuffling and buffering, resulting in minor performance savings.
+ *
+ * However, note that these recommendations are from the perspective of a C caller program.
+ * If the streaming interface is invoked from some other language,
+ * especially managed ones such as Java or Go, through a foreign function interface such as jni or cgo,
+ * a major performance rule is to reduce crossing such interface to an absolute minimum.
+ * It's not rare for more time to end up being spent crossing the interface than on compression itself.
+ * In such cases, prefer using large buffers, as large as practical,
+ * for both input and output, to reduce the number of roundtrips.
+ */
+ZSTDLIB_API size_t ZSTD_CStreamInSize(void); /**< recommended size for input buffer */
+ZSTDLIB_API size_t ZSTD_CStreamOutSize(void); /**< recommended size for output buffer. Guarantee to successfully flush at least one complete compressed block. */
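+
+/* Usage sketch (illustrative only, kept out of the build by #if 0) :
+ * streaming compression of one FILE* into another, following the HowTo above.
+ * The FILE*-based I/O, the chosen parameters and the abort-on-error policy are
+ * illustrative assumptions. */
+#if 0
+#include <stdio.h>
+#include <stdlib.h>
+#include <zstd.h>
+
+static void stream_compress_file(FILE* fin, FILE* fout, int level)
+{
+    size_t const inSize  = ZSTD_CStreamInSize();   /* recommended input buffer size */
+    size_t const outSize = ZSTD_CStreamOutSize();  /* can always flush at least one full block */
+    void* const inBuff  = malloc(inSize);
+    void* const outBuff = malloc(outSize);
+    ZSTD_CCtx* const cctx = ZSTD_createCCtx();
+    if (inBuff == NULL || outBuff == NULL || cctx == NULL) abort();
+
+    ZSTD_CCtx_setParameter(cctx, ZSTD_c_compressionLevel, level);
+    ZSTD_CCtx_setParameter(cctx, ZSTD_c_checksumFlag, 1);
+
+    for (;;) {
+        size_t const read = fread(inBuff, 1, inSize, fin);
+        int const lastChunk = (read < inSize);     /* short read => end of input */
+        ZSTD_EndDirective const mode = lastChunk ? ZSTD_e_end : ZSTD_e_continue;
+        ZSTD_inBuffer input = { inBuff, read, 0 };
+        int finished;
+        do {
+            ZSTD_outBuffer output = { outBuff, outSize, 0 };
+            size_t const remaining = ZSTD_compressStream2(cctx, &output, &input, mode);
+            if (ZSTD_isError(remaining)) abort();
+            fwrite(outBuff, 1, output.pos, fout);
+            /* with ZSTD_e_end, loop until the frame epilogue is fully flushed (remaining == 0);
+             * with ZSTD_e_continue, loop until all of this chunk has been consumed */
+            finished = lastChunk ? (remaining == 0) : (input.pos == input.size);
+        } while (!finished);
+        if (lastChunk) break;
+    }
+
+    ZSTD_freeCCtx(cctx);
+    free(inBuff);
+    free(outBuff);
+}
+#endif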
+
+
+/* *****************************************************************************
+ * This following is a legacy streaming API.
+ * It can be replaced by ZSTD_CCtx_reset() and ZSTD_compressStream2().
+ * It is redundant, but remains fully supported.
+ * Advanced parameters and dictionary compression can only be used through the
+ * new API.
+ ******************************************************************************/
+
+/*!
+ * Equivalent to:
+ *
+ * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only);
+ * ZSTD_CCtx_refCDict(zcs, NULL); // clear the dictionary (if any)
+ * ZSTD_CCtx_setParameter(zcs, ZSTD_c_compressionLevel, compressionLevel);
+ */
+ZSTDLIB_API size_t ZSTD_initCStream(ZSTD_CStream* zcs, int compressionLevel);
+/*!
+ * Alternative for ZSTD_compressStream2(zcs, output, input, ZSTD_e_continue).
+ * NOTE: The return value is different. ZSTD_compressStream() returns a hint for
+ * the next read size (if non-zero and not an error). ZSTD_compressStream2()
+ * returns the minimum nb of bytes left to flush (if non-zero and not an error).
+ */
+ZSTDLIB_API size_t ZSTD_compressStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output, ZSTD_inBuffer* input);
+/*! Equivalent to ZSTD_compressStream2(zcs, output, &emptyInput, ZSTD_e_flush). */
+ZSTDLIB_API size_t ZSTD_flushStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output);
+/*! Equivalent to ZSTD_compressStream2(zcs, output, &emptyInput, ZSTD_e_end). */
+ZSTDLIB_API size_t ZSTD_endStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output);
+
+
+/*-***************************************************************************
+* Streaming decompression - HowTo
+*
+* A ZSTD_DStream object is required to track streaming operations.
+* Use ZSTD_createDStream() and ZSTD_freeDStream() to create/release resources.
+* ZSTD_DStream objects can be re-used multiple times.
+*
+* Use ZSTD_initDStream() to start a new decompression operation.
+* @return : recommended first input size
+* Alternatively, use advanced API to set specific properties.
+*
+* Use ZSTD_decompressStream() repetitively to consume your input.
+* The function will update both `pos` fields.
+* If `input.pos < input.size`, some input has not been consumed.
+* It's up to the caller to present again remaining data.
+* The function tries to flush all data decoded immediately, respecting output buffer size.
+* If `output.pos < output.size`, decoder has flushed everything it could.
+* But if `output.pos == output.size`, there might be some data left within internal buffers.
+* In which case, call ZSTD_decompressStream() again to flush whatever remains in the buffer.
+* Note : with no additional input provided, amount of data flushed is necessarily <= ZSTD_BLOCKSIZE_MAX.
+* @return : 0 when a frame is completely decoded and fully flushed,
+* or an error code, which can be tested using ZSTD_isError(),
+* or any other value > 0, which means there is still some decoding or flushing to do to complete current frame :
+* the return value is a suggested next input size (just a hint for better latency)
+* that will never request more than the remaining frame size.
+* *******************************************************************************/
+
+typedef ZSTD_DCtx ZSTD_DStream; /**< DCtx and DStream are now effectively same object (>= v1.3.0) */
+ /* For compatibility with versions <= v1.2.0, prefer differentiating them. */
+/*===== ZSTD_DStream management functions =====*/
+ZSTDLIB_API ZSTD_DStream* ZSTD_createDStream(void);
+ZSTDLIB_API size_t ZSTD_freeDStream(ZSTD_DStream* zds);
+
+/*===== Streaming decompression functions =====*/
+
+/* This function is redundant with the advanced API and equivalent to:
+ *
+ * ZSTD_DCtx_reset(zds, ZSTD_reset_session_only);
+ * ZSTD_DCtx_refDDict(zds, NULL);
+ */
+ZSTDLIB_API size_t ZSTD_initDStream(ZSTD_DStream* zds);
+
+ZSTDLIB_API size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inBuffer* input);
+
+ZSTDLIB_API size_t ZSTD_DStreamInSize(void); /*!< recommended size for input buffer */
+ZSTDLIB_API size_t ZSTD_DStreamOutSize(void); /*!< recommended size for output buffer. Guarantee to successfully flush at least one complete block in all circumstances. */
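+
+/* Usage sketch (illustrative only, kept out of the build by #if 0) :
+ * streaming decompression of one FILE* into another, following the HowTo above.
+ * The FILE*-based I/O and the abort-on-error policy are illustrative assumptions. */
+#if 0
+#include <stdio.h>
+#include <stdlib.h>
+#include <zstd.h>
+
+static void stream_decompress_file(FILE* fin, FILE* fout)
+{
+    size_t const inSize  = ZSTD_DStreamInSize();
+    size_t const outSize = ZSTD_DStreamOutSize();
+    void* const inBuff  = malloc(inSize);
+    void* const outBuff = malloc(outSize);
+    ZSTD_DCtx* const dctx = ZSTD_createDCtx();
+    size_t lastRet = 0;    /* return value of the last ZSTD_decompressStream() call */
+    size_t read;
+    if (inBuff == NULL || outBuff == NULL || dctx == NULL) abort();
+
+    while ((read = fread(inBuff, 1, inSize, fin)) != 0) {
+        ZSTD_inBuffer input = { inBuff, read, 0 };
+        while (input.pos < input.size) {
+            ZSTD_outBuffer output = { outBuff, outSize, 0 };
+            size_t const ret = ZSTD_decompressStream(dctx, &output, &input);
+            if (ZSTD_isError(ret)) abort();
+            fwrite(outBuff, 1, output.pos, fout);
+            lastRet = ret;
+        }
+    }
+    /* lastRet != 0 means the input ended inside a frame : truncated or corrupted data */
+    if (lastRet != 0) abort();
+
+    ZSTD_freeDCtx(dctx);
+    free(inBuff);
+    free(outBuff);
+}
+#endif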
+
+
+/**************************
+* Simple dictionary API
+***************************/
+/*! ZSTD_compress_usingDict() :
+ * Compression at an explicit compression level using a Dictionary.
+ * A dictionary can be any arbitrary data segment (also called a prefix),
+ * or a buffer with specified information (see dictBuilder/zdict.h).
+ * Note : This function loads the dictionary, resulting in significant startup delay.
+ * It's intended for a dictionary used only once.
+ * Note 2 : When `dict == NULL || dictSize < 8` no dictionary is used. */
+ZSTDLIB_API size_t ZSTD_compress_usingDict(ZSTD_CCtx* ctx,
+ void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize,
+ const void* dict,size_t dictSize,
+ int compressionLevel);
+
+/*! ZSTD_decompress_usingDict() :
+ * Decompression using a known Dictionary.
+ * Dictionary must be identical to the one used during compression.
+ * Note : This function loads the dictionary, resulting in significant startup delay.
+ * It's intended for a dictionary used only once.
+ * Note : When `dict == NULL || dictSize < 8` no dictionary is used. */
+ZSTDLIB_API size_t ZSTD_decompress_usingDict(ZSTD_DCtx* dctx,
+ void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize,
+ const void* dict,size_t dictSize);
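+
+/* Usage sketch (illustrative only, kept out of the build by #if 0) :
+ * one-shot compression and decompression with the same dictionary buffer.
+ * The dictionary is assumed to already exist in memory (e.g. built with the
+ * zdict.h trainer); its origin is outside the scope of this sketch. */
+#if 0
+#include <stdlib.h>
+#include <zstd.h>
+
+static size_t roundtrip_with_dict(void* dst, size_t dstCapacity,
+                                  void* regen, size_t regenCapacity,
+                                  const void* src, size_t srcSize,
+                                  const void* dict, size_t dictSize)
+{
+    ZSTD_CCtx* const cctx = ZSTD_createCCtx();
+    ZSTD_DCtx* const dctx = ZSTD_createDCtx();
+    size_t cSize, dSize;
+    if (cctx == NULL || dctx == NULL) abort();
+    cSize = ZSTD_compress_usingDict(cctx, dst, dstCapacity,
+                                    src, srcSize, dict, dictSize, ZSTD_CLEVEL_DEFAULT);
+    dSize = cSize;
+    if (!ZSTD_isError(cSize)) {
+        /* decompression must use the exact same dictionary */
+        dSize = ZSTD_decompress_usingDict(dctx, regen, regenCapacity,
+                                          dst, cSize, dict, dictSize);
+    }
+    ZSTD_freeCCtx(cctx);
+    ZSTD_freeDCtx(dctx);
+    return dSize;   /* regenerated size, or an error code */
+}
+#endif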
+
+
+/***********************************
+ * Bulk processing dictionary API
+ **********************************/
+typedef struct ZSTD_CDict_s ZSTD_CDict;
+
+/*! ZSTD_createCDict() :
+ * When compressing multiple messages or blocks using the same dictionary,
+ * it's recommended to digest the dictionary only once, since it's a costly operation.
+ * ZSTD_createCDict() will create a state from digesting a dictionary.
+ * The resulting state can be used for future compression operations with very limited startup cost.
+ * ZSTD_CDict can be created once and shared by multiple threads concurrently, since its usage is read-only.
+ * @dictBuffer can be released after ZSTD_CDict creation, because its content is copied within CDict.
+ * Note 1 : Consider experimental function `ZSTD_createCDict_byReference()` if you prefer to not duplicate @dictBuffer content.
+ * Note 2 : A ZSTD_CDict can be created from an empty @dictBuffer,
+ * in which case the only thing that it transports is the @compressionLevel.
+ * This can be useful in a pipeline featuring ZSTD_compress_usingCDict() exclusively,
+ * expecting a ZSTD_CDict parameter with any data, including those without a known dictionary. */
+ZSTDLIB_API ZSTD_CDict* ZSTD_createCDict(const void* dictBuffer, size_t dictSize,
+ int compressionLevel);
+
+/*! ZSTD_freeCDict() :
+ * Function frees memory allocated by ZSTD_createCDict(). */
+ZSTDLIB_API size_t ZSTD_freeCDict(ZSTD_CDict* CDict);
+
+/*! ZSTD_compress_usingCDict() :
+ * Compression using a digested Dictionary.
+ * Recommended when same dictionary is used multiple times.
+ * Note : compression level is _decided at dictionary creation time_,
+ * and frame parameters are hardcoded (dictID=yes, contentSize=yes, checksum=no) */
+ZSTDLIB_API size_t ZSTD_compress_usingCDict(ZSTD_CCtx* cctx,
+ void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize,
+ const ZSTD_CDict* cdict);
+
+
+typedef struct ZSTD_DDict_s ZSTD_DDict;
+
+/*! ZSTD_createDDict() :
+ * Create a digested dictionary, ready to start decompression operation without startup delay.
+ * dictBuffer can be released after DDict creation, as its content is copied inside DDict. */
+ZSTDLIB_API ZSTD_DDict* ZSTD_createDDict(const void* dictBuffer, size_t dictSize);
+
+/*! ZSTD_freeDDict() :
+ * Function frees memory allocated with ZSTD_createDDict() */
+ZSTDLIB_API size_t ZSTD_freeDDict(ZSTD_DDict* ddict);
+
+/*! ZSTD_decompress_usingDDict() :
+ * Decompression using a digested Dictionary.
+ * Recommended when same dictionary is used multiple times. */
+ZSTDLIB_API size_t ZSTD_decompress_usingDDict(ZSTD_DCtx* dctx,
+ void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize,
+ const ZSTD_DDict* ddict);
+
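+/* Example (illustrative sketch) : digest the dictionary once, then reuse it
+ * for many messages. `msgs[i]`/`msgSizes[i]` and the output buffers are
+ * application-provided placeholders.
+ *
+ *     ZSTD_CDict* const cdict = ZSTD_createCDict(dictBuf, dictLen, 3);
+ *     ZSTD_DDict* const ddict = ZSTD_createDDict(dictBuf, dictLen);
+ *     ZSTD_CCtx*  const cctx  = ZSTD_createCCtx();
+ *     ZSTD_DCtx*  const dctx  = ZSTD_createDCtx();
+ *     for (size_t i = 0; i < nbMsgs; i++) {
+ *         size_t const cSize = ZSTD_compress_usingCDict(cctx, dst, dstCapacity,
+ *                                                       msgs[i], msgSizes[i], cdict);
+ *         size_t const rSize = ZSTD_decompress_usingDDict(dctx, rdst, rdstCapacity,
+ *                                                         dst, cSize, ddict);
+ *     }
+ *     ZSTD_freeCDict(cdict); ZSTD_freeDDict(ddict);
+ *     ZSTD_freeCCtx(cctx);   ZSTD_freeDCtx(dctx);
+ */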
+
+/********************************
+ * Dictionary helper functions
+ *******************************/
+
+/*! ZSTD_getDictID_fromDict() :
+ * Provides the dictID stored within dictionary.
+ * If @return == 0, the dictionary is not conformant with Zstandard specification.
+ * It can still be loaded, but as a content-only dictionary. */
+ZSTDLIB_API unsigned ZSTD_getDictID_fromDict(const void* dict, size_t dictSize);
+
+/*! ZSTD_getDictID_fromDDict() :
+ * Provides the dictID of the dictionary loaded into `ddict`.
+ * If @return == 0, the dictionary is not conformant to Zstandard specification, or empty.
+ * Non-conformant dictionaries can still be loaded, but as content-only dictionaries. */
+ZSTDLIB_API unsigned ZSTD_getDictID_fromDDict(const ZSTD_DDict* ddict);
+
+/*! ZSTD_getDictID_fromFrame() :
+ * Provides the dictID required to decompress the frame stored within `src`.
+ * If @return == 0, the dictID could not be decoded.
+ * This could be for one of the following reasons :
+ * - The frame does not require a dictionary to be decoded (most common case).
+ * - The frame was built with dictID intentionally removed. Whatever dictionary is necessary is hidden information.
+ * Note : this use case also happens when using a non-conformant dictionary.
+ * - `srcSize` is too small, and as a result, the frame header could not be decoded (only possible if `srcSize < ZSTD_FRAMEHEADERSIZE_MAX`).
+ * - This is not a Zstandard frame.
+ * When identifying the exact failure cause, it's possible to use ZSTD_getFrameHeader(), which will provide a more precise error code. */
+ZSTDLIB_API unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize);
+
+
+/*******************************************************************************
+ * Advanced dictionary and prefix API
+ *
+ * This API allows dictionaries to be used with ZSTD_compress2(),
+ * ZSTD_compressStream2(), and ZSTD_decompress(). Dictionaries are sticky, and
+ * are only cleared when the context is reset with ZSTD_reset_parameters or
+ * ZSTD_reset_session_and_parameters. Prefixes are single-use.
+ ******************************************************************************/
+
+
+/*! ZSTD_CCtx_loadDictionary() :
+ * Create an internal CDict from `dict` buffer.
+ * Decompression will have to use the same dictionary.
+ * @result : 0, or an error code (which can be tested with ZSTD_isError()).
+ * Special: Loading a NULL (or 0-size) dictionary invalidates previous dictionary,
+ * meaning "return to no-dictionary mode".
+ * Note 1 : Dictionary is sticky, it will be used for all future compressed frames.
+ * To return to "no-dictionary" situation, load a NULL dictionary (or reset parameters).
+ * Note 2 : Loading a dictionary involves building tables.
+ * It's also a CPU consuming operation, with non-negligible impact on latency.
+ * Tables are dependent on compression parameters, and for this reason,
+ * compression parameters can no longer be changed after loading a dictionary.
+ * Note 3 :`dict` content will be copied internally.
+ * Use experimental ZSTD_CCtx_loadDictionary_byReference() to reference content instead.
+ * In such a case, dictionary buffer must outlive its users.
+ * Note 4 : Use ZSTD_CCtx_loadDictionary_advanced()
+ * to precisely select how dictionary content must be interpreted. */
+ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary(ZSTD_CCtx* cctx, const void* dict, size_t dictSize);
+
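+/* Example (illustrative sketch) : load a dictionary into the context once,
+ * then compress several frames with ZSTD_compress2(). Buffer names are
+ * placeholders; note that parameters are set before loading the dictionary,
+ * since they can no longer change afterwards.
+ *
+ *     ZSTD_CCtx* const cctx = ZSTD_createCCtx();
+ *     ZSTD_CCtx_setParameter(cctx, ZSTD_c_compressionLevel, 5);
+ *     ZSTD_CCtx_loadDictionary(cctx, dictBuf, dictLen);       // sticky
+ *     size_t const cSize1 = ZSTD_compress2(cctx, dst1, dst1Cap, src1, src1Size);
+ *     size_t const cSize2 = ZSTD_compress2(cctx, dst2, dst2Cap, src2, src2Size);
+ *     // decompression must load the same dictionary, e.g. via ZSTD_DCtx_loadDictionary()
+ */
+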
+/*! ZSTD_CCtx_refCDict() :
+ * Reference a prepared dictionary, to be used for all next compressed frames.
+ * Note that compression parameters are enforced from within CDict,
+ * and supersede any compression parameter previously set within CCtx.
+ * The parameters ignored are labeled as "superseded-by-cdict" in the ZSTD_cParameter enum docs.
+ * The ignored parameters will be used again if the CCtx is returned to no-dictionary mode.
+ * The dictionary will remain valid for future compressed frames using same CCtx.
+ * @result : 0, or an error code (which can be tested with ZSTD_isError()).
+ * Special : Referencing a NULL CDict means "return to no-dictionary mode".
+ * Note 1 : Currently, only one dictionary can be managed.
+ * Referencing a new dictionary effectively "discards" any previous one.
+ * Note 2 : CDict is just referenced, its lifetime must outlive its usage within CCtx. */
+ZSTDLIB_API size_t ZSTD_CCtx_refCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict);
+
+/*! ZSTD_CCtx_refPrefix() :
+ * Reference a prefix (single-usage dictionary) for next compressed frame.
+ * A prefix is **only used once**. Tables are discarded at end of frame (ZSTD_e_end).
+ * Decompression will need same prefix to properly regenerate data.
+ * Compressing with a prefix is similar in outcome as performing a diff and compressing it,
+ * but performs much faster, especially during decompression (compression speed is tunable with compression level).
+ * @result : 0, or an error code (which can be tested with ZSTD_isError()).
+ * Special: Adding any prefix (including NULL) invalidates any previous prefix or dictionary
+ * Note 1 : Prefix buffer is referenced. It **must** outlive compression.
+ * Its content must remain unmodified during compression.
+ * Note 2 : If the intention is to diff some large src data blob with some prior version of itself,
+ * ensure that the window size is large enough to contain the entire source.
+ * See ZSTD_c_windowLog.
+ * Note 3 : Referencing a prefix involves building tables, which are dependent on compression parameters.
+ * It's a CPU consuming operation, with non-negligible impact on latency.
+ * If there is a need to use the same prefix multiple times, consider loadDictionary instead.
+ * Note 4 : By default, the prefix is interpreted as raw content (ZSTD_dct_rawContent).
+ * Use experimental ZSTD_CCtx_refPrefix_advanced() to alter dictionary interpretation. */
+ZSTDLIB_API size_t ZSTD_CCtx_refPrefix(ZSTD_CCtx* cctx,
+ const void* prefix, size_t prefixSize);
+
+/*! ZSTD_DCtx_loadDictionary() :
+ * Create an internal DDict from dict buffer,
+ * to be used to decompress next frames.
+ * The dictionary remains valid for all future frames, until explicitly invalidated.
+ * @result : 0, or an error code (which can be tested with ZSTD_isError()).
+ * Special : Adding a NULL (or 0-size) dictionary invalidates any previous dictionary,
+ * meaning "return to no-dictionary mode".
+ * Note 1 : Loading a dictionary involves building tables,
+ * which has a non-negligible impact on CPU usage and latency.
+ * It's recommended to "load once, use many times", to amortize the cost.
+ * Note 2 :`dict` content will be copied internally, so `dict` can be released after loading.
+ * Use ZSTD_DCtx_loadDictionary_byReference() to reference dictionary content instead.
+ * Note 3 : Use ZSTD_DCtx_loadDictionary_advanced() to take control of
+ * how dictionary content is loaded and interpreted.
+ */
+ZSTDLIB_API size_t ZSTD_DCtx_loadDictionary(ZSTD_DCtx* dctx, const void* dict, size_t dictSize);
+
+/*! ZSTD_DCtx_refDDict() :
+ * Reference a prepared dictionary, to be used to decompress next frames.
+ * The dictionary remains active for decompression of future frames using same DCtx.
+ * @result : 0, or an error code (which can be tested with ZSTD_isError()).
+ * Note 1 : Currently, only one dictionary can be managed.
+ * Referencing a new dictionary effectively "discards" any previous one.
+ * Special: referencing a NULL DDict means "return to no-dictionary mode".
+ * Note 2 : DDict is just referenced, its lifetime must outlive its usage from DCtx.
+ */
+ZSTDLIB_API size_t ZSTD_DCtx_refDDict(ZSTD_DCtx* dctx, const ZSTD_DDict* ddict);
+
+/*! ZSTD_DCtx_refPrefix() :
+ * Reference a prefix (single-usage dictionary) to decompress next frame.
+ * This is the reverse operation of ZSTD_CCtx_refPrefix(),
+ * and must use the same prefix as the one used during compression.
+ * Prefix is **only used once**. Reference is discarded at end of frame.
+ * End of frame is reached when ZSTD_decompressStream() returns 0.
+ * @result : 0, or an error code (which can be tested with ZSTD_isError()).
+ * Note 1 : Adding any prefix (including NULL) invalidates any previously set prefix or dictionary
+ * Note 2 : Prefix buffer is referenced. It **must** outlive decompression.
+ * Prefix buffer must remain unmodified up to the end of frame,
+ * reached when ZSTD_decompressStream() returns 0.
+ * Note 3 : By default, the prefix is treated as raw content (ZSTD_dct_rawContent).
+ * Use ZSTD_CCtx_refPrefix_advanced() to alter dictMode (Experimental section)
+ * Note 4 : Referencing a raw content prefix has almost no cpu nor memory cost.
+ * A full dictionary is more costly, as it requires building tables.
+ */
+ZSTDLIB_API size_t ZSTD_DCtx_refPrefix(ZSTD_DCtx* dctx,
+ const void* prefix, size_t prefixSize);
+
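+/* Example (illustrative sketch) : diff-like compression against a previous
+ * version of the data. `prevBuf`/`prevSize` is the prior version, referenced
+ * as a prefix on both sides; it must stay valid and unmodified for the
+ * duration of each call.
+ *
+ *     ZSTD_CCtx_refPrefix(cctx, prevBuf, prevSize);
+ *     size_t const cSize = ZSTD_compress2(cctx, dst, dstCapacity, src, srcSize);
+ *
+ *     ZSTD_DCtx_refPrefix(dctx, prevBuf, prevSize);
+ *     size_t const rSize = ZSTD_decompressDCtx(dctx, rdst, rdstCapacity, dst, cSize);
+ *     // the prefix reference is consumed by each frame; re-reference it for the next one
+ */
+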
+/* === Memory management === */
+
+/*! ZSTD_sizeof_*() :
+ * These functions give the _current_ memory usage of selected object.
+ * Note that object memory usage can evolve (increase or decrease) over time. */
+ZSTDLIB_API size_t ZSTD_sizeof_CCtx(const ZSTD_CCtx* cctx);
+ZSTDLIB_API size_t ZSTD_sizeof_DCtx(const ZSTD_DCtx* dctx);
+ZSTDLIB_API size_t ZSTD_sizeof_CStream(const ZSTD_CStream* zcs);
+ZSTDLIB_API size_t ZSTD_sizeof_DStream(const ZSTD_DStream* zds);
+ZSTDLIB_API size_t ZSTD_sizeof_CDict(const ZSTD_CDict* cdict);
+ZSTDLIB_API size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict);
+
+#endif /* ZSTD_H_235446 */
+
+
+/* **************************************************************************************
+ * ADVANCED AND EXPERIMENTAL FUNCTIONS
+ ****************************************************************************************
+ * The definitions in the following section are considered experimental.
+ * They are provided for advanced scenarios.
+ * They should never be used with a dynamic library, as prototypes may change in the future.
+ * Use them only in association with static linking.
+ * ***************************************************************************************/
+
+#if defined(ZSTD_STATIC_LINKING_ONLY) && !defined(ZSTD_H_ZSTD_STATIC_LINKING_ONLY)
+#define ZSTD_H_ZSTD_STATIC_LINKING_ONLY
+
+/****************************************************************************************
+ * experimental API (static linking only)
+ ****************************************************************************************
+ * The following symbols and constants
+ * are not planned to join "stable API" status in the near future.
+ * They can still change in future versions.
+ * Some of them are planned to remain in the static_only section indefinitely.
+ * Some of them might be removed in the future (especially when redundant with existing stable functions)
+ * ***************************************************************************************/
+
+#define ZSTD_FRAMEHEADERSIZE_PREFIX(format) ((format) == ZSTD_f_zstd1 ? 5 : 1) /* minimum input size required to query frame header size */
+#define ZSTD_FRAMEHEADERSIZE_MIN(format) ((format) == ZSTD_f_zstd1 ? 6 : 2)
+#define ZSTD_FRAMEHEADERSIZE_MAX 18 /* can be useful for static allocation */
+#define ZSTD_SKIPPABLEHEADERSIZE 8
+
+/* compression parameter bounds */
+#define ZSTD_WINDOWLOG_MAX_32 30
+#define ZSTD_WINDOWLOG_MAX_64 31
+#define ZSTD_WINDOWLOG_MAX ((int)(sizeof(size_t) == 4 ? ZSTD_WINDOWLOG_MAX_32 : ZSTD_WINDOWLOG_MAX_64))
+#define ZSTD_WINDOWLOG_MIN 10
+#define ZSTD_HASHLOG_MAX ((ZSTD_WINDOWLOG_MAX < 30) ? ZSTD_WINDOWLOG_MAX : 30)
+#define ZSTD_HASHLOG_MIN 6
+#define ZSTD_CHAINLOG_MAX_32 29
+#define ZSTD_CHAINLOG_MAX_64 30
+#define ZSTD_CHAINLOG_MAX ((int)(sizeof(size_t) == 4 ? ZSTD_CHAINLOG_MAX_32 : ZSTD_CHAINLOG_MAX_64))
+#define ZSTD_CHAINLOG_MIN ZSTD_HASHLOG_MIN
+#define ZSTD_SEARCHLOG_MAX (ZSTD_WINDOWLOG_MAX-1)
+#define ZSTD_SEARCHLOG_MIN 1
+#define ZSTD_MINMATCH_MAX 7 /* only for ZSTD_fast, other strategies are limited to 6 */
+#define ZSTD_MINMATCH_MIN 3 /* only for ZSTD_btopt+, faster strategies are limited to 4 */
+#define ZSTD_TARGETLENGTH_MAX ZSTD_BLOCKSIZE_MAX
+#define ZSTD_TARGETLENGTH_MIN 0 /* note : comparing this constant to an unsigned results in a tautological test */
+#define ZSTD_STRATEGY_MIN ZSTD_fast
+#define ZSTD_STRATEGY_MAX ZSTD_btultra2
+
+
+#define ZSTD_OVERLAPLOG_MIN 0
+#define ZSTD_OVERLAPLOG_MAX 9
+
+#define ZSTD_WINDOWLOG_LIMIT_DEFAULT 27 /* by default, the streaming decoder will refuse any frame
+ * requiring larger than (1<<ZSTD_WINDOWLOG_LIMIT_DEFAULT) window size,
+ * to preserve host's memory from unreasonable requirements.
+ * This limit can be overridden using ZSTD_DCtx_setParameter(,ZSTD_d_windowLogMax,).
+ * The limit does not apply for one-pass decoders (such as ZSTD_decompress()), since no additional memory is allocated */
+
+
+/* LDM parameter bounds */
+#define ZSTD_LDM_HASHLOG_MIN ZSTD_HASHLOG_MIN
+#define ZSTD_LDM_HASHLOG_MAX ZSTD_HASHLOG_MAX
+#define ZSTD_LDM_MINMATCH_MIN 4
+#define ZSTD_LDM_MINMATCH_MAX 4096
+#define ZSTD_LDM_BUCKETSIZELOG_MIN 1
+#define ZSTD_LDM_BUCKETSIZELOG_MAX 8
+#define ZSTD_LDM_HASHRATELOG_MIN 0
+#define ZSTD_LDM_HASHRATELOG_MAX (ZSTD_WINDOWLOG_MAX - ZSTD_HASHLOG_MIN)
+
+/* Advanced parameter bounds */
+#define ZSTD_TARGETCBLOCKSIZE_MIN 64
+#define ZSTD_TARGETCBLOCKSIZE_MAX ZSTD_BLOCKSIZE_MAX
+#define ZSTD_SRCSIZEHINT_MIN 0
+#define ZSTD_SRCSIZEHINT_MAX INT_MAX
+
+/* internal */
+#define ZSTD_HASHLOG3_MAX 17
+
+
+/* --- Advanced types --- */
+
+typedef struct ZSTD_CCtx_params_s ZSTD_CCtx_params;
+
+typedef struct {
+ unsigned int matchPos; /* Match pos in dst */
+ /* If seqDef.offset > 3, then this is seqDef.offset - 3
+ * If seqDef.offset < 3, then this is the corresponding repeat offset
+ * But if seqDef.offset < 3 and litLength == 0, this is the
+ * repeat offset before the corresponding repeat offset
+ * And if seqDef.offset == 3 and litLength == 0, this is the
+ * most recent repeat offset - 1
+ */
+ unsigned int offset;
+ unsigned int litLength; /* Literal length */
+ unsigned int matchLength; /* Match length */
+ /* 0 when seq not rep and seqDef.offset otherwise
+ * when litLength == 0 this will be <= 4, otherwise <= 3 like normal
+ */
+ unsigned int rep;
+} ZSTD_Sequence;
+
+typedef struct {
+ unsigned windowLog; /**< largest match distance : larger == more compression, more memory needed during decompression */
+ unsigned chainLog; /**< fully searched segment : larger == more compression, slower, more memory (useless for fast) */
+ unsigned hashLog; /**< dispatch table : larger == faster, more memory */
+ unsigned searchLog; /**< nb of searches : larger == more compression, slower */
+ unsigned minMatch; /**< match length searched : larger == faster decompression, sometimes less compression */
+ unsigned targetLength; /**< acceptable match size for optimal parser (only) : larger == more compression, slower */
+ ZSTD_strategy strategy; /**< see ZSTD_strategy definition above */
+} ZSTD_compressionParameters;
+
+typedef struct {
+ int contentSizeFlag; /**< 1: content size will be in frame header (when known) */
+ int checksumFlag; /**< 1: generate a 32-bits checksum using XXH64 algorithm at end of frame, for error detection */
+ int noDictIDFlag; /**< 1: no dictID will be saved into frame header (dictID is only useful for dictionary compression) */
+} ZSTD_frameParameters;
+
+typedef struct {
+ ZSTD_compressionParameters cParams;
+ ZSTD_frameParameters fParams;
+} ZSTD_parameters;
+
+typedef enum {
+ ZSTD_dct_auto = 0, /* dictionary is "full" when starting with ZSTD_MAGIC_DICTIONARY, otherwise it is "rawContent" */
+ ZSTD_dct_rawContent = 1, /* ensures dictionary is always loaded as rawContent, even if it starts with ZSTD_MAGIC_DICTIONARY */
+ ZSTD_dct_fullDict = 2 /* refuses to load a dictionary if it does not respect Zstandard's specification, starting with ZSTD_MAGIC_DICTIONARY */
+} ZSTD_dictContentType_e;
+
+typedef enum {
+ ZSTD_dlm_byCopy = 0, /**< Copy dictionary content internally */
+ ZSTD_dlm_byRef = 1 /**< Reference dictionary content -- the dictionary buffer must outlive its users. */
+} ZSTD_dictLoadMethod_e;
+
+typedef enum {
+ ZSTD_f_zstd1 = 0, /* zstd frame format, specified in zstd_compression_format.md (default) */
+ ZSTD_f_zstd1_magicless = 1 /* Variant of zstd frame format, without initial 4-bytes magic number.
+ * Useful to save 4 bytes per generated frame.
+ * Decoder cannot recognise automatically this format, requiring this instruction. */
+} ZSTD_format_e;
+
+typedef enum {
+ /* Note: this enum and the behavior it controls are effectively internal
+ * implementation details of the compressor. They are expected to continue
+ * to evolve and should be considered only in the context of extremely
+ * advanced performance tuning.
+ *
+ * Zstd currently supports the use of a CDict in three ways:
+ *
+ * - The contents of the CDict can be copied into the working context. This
+ * means that the compression can search both the dictionary and input
+ * while operating on a single set of internal tables. This makes
+ * the compression faster per-byte of input. However, the initial copy of
+ * the CDict's tables incurs a fixed cost at the beginning of the
+ * compression. For small compressions (< 8 KB), that copy can dominate
+ * the cost of the compression.
+ *
+ * - The CDict's tables can be used in-place. In this model, compression is
+ * slower per input byte, because the compressor has to search two sets of
+ * tables. However, this model incurs no start-up cost (as long as the
+ * working context's tables can be reused). For small inputs, this can be
+ * faster than copying the CDict's tables.
+ *
+ * - The CDict's tables are not used at all, and instead we use the working
+ * context alone to reload the dictionary and use params based on the source
+ * size. See ZSTD_compress_insertDictionary() and ZSTD_compress_usingDict().
+ * This method is effective when the dictionary sizes are very small relative
+ * to the input size, and the input size is fairly large to begin with.
+ *
+ * Zstd has a simple internal heuristic that selects which strategy to use
+ * at the beginning of a compression. However, if experimentation shows that
+ * Zstd is making poor choices, it is possible to override that choice with
+ * this enum.
+ */
+ ZSTD_dictDefaultAttach = 0, /* Use the default heuristic. */
+ ZSTD_dictForceAttach = 1, /* Never copy the dictionary. */
+ ZSTD_dictForceCopy = 2, /* Always copy the dictionary. */
+ ZSTD_dictForceLoad = 3 /* Always reload the dictionary */
+} ZSTD_dictAttachPref_e;
+
+typedef enum {
+ ZSTD_lcm_auto = 0, /**< Automatically determine the compression mode based on the compression level.
+ * Negative compression levels will be uncompressed, and positive compression
+ * levels will be compressed. */
+ ZSTD_lcm_huffman = 1, /**< Always attempt Huffman compression. Uncompressed literals will still be
+ * emitted if Huffman compression is not profitable. */
+ ZSTD_lcm_uncompressed = 2 /**< Always emit uncompressed literals. */
+} ZSTD_literalCompressionMode_e;
+
+
+/***************************************
+* Frame size functions
+***************************************/
+
+/*! ZSTD_findDecompressedSize() :
+ * `src` should point to the start of a series of ZSTD encoded and/or skippable frames
+ * `srcSize` must be the _exact_ size of this series
+ * (i.e. there should be a frame boundary at `src + srcSize`)
+ * @return : - decompressed size of all data in all successive frames
+ * - if the decompressed size cannot be determined: ZSTD_CONTENTSIZE_UNKNOWN
+ * - if an error occurred: ZSTD_CONTENTSIZE_ERROR
+ *
+ * note 1 : decompressed size is an optional field, that may not be present, especially in streaming mode.
+ * When `return==ZSTD_CONTENTSIZE_UNKNOWN`, data to decompress could be any size.
+ * In which case, it's necessary to use streaming mode to decompress data.
+ * note 2 : decompressed size is always present when compression is done with ZSTD_compress()
+ * note 3 : decompressed size can be very large (64-bits value),
+ * potentially larger than what local system can handle as a single memory segment.
+ * In which case, it's necessary to use streaming mode to decompress data.
+ * note 4 : If source is untrusted, decompressed size could be wrong or intentionally modified.
+ * Always ensure result fits within application's authorized limits.
+ * Each application can set its own limits.
+ * note 5 : ZSTD_findDecompressedSize handles multiple frames, and so it must traverse the input to
+ * read each contained frame header. This is fast as most of the data is skipped,
+ * however it does mean that all frame data must be present and valid. */
+ZSTDLIB_API unsigned long long ZSTD_findDecompressedSize(const void* src, size_t srcSize);
+
+/*! ZSTD_decompressBound() :
+ * `src` should point to the start of a series of ZSTD encoded and/or skippable frames
+ * `srcSize` must be the _exact_ size of this series
+ * (i.e. there should be a frame boundary at `src + srcSize`)
+ * @return : - upper-bound for the decompressed size of all data in all successive frames
+ *           - if an error occurred: ZSTD_CONTENTSIZE_ERROR
+ *
+ * note 1 : an error can occur if `src` contains an invalid or incorrectly formatted frame.
+ * note 2 : the upper-bound is exact when the decompressed size field is available in every ZSTD encoded frame of `src`.
+ * in this case, `ZSTD_findDecompressedSize` and `ZSTD_decompressBound` return the same value.
+ * note 3 : when the decompressed size field isn't available, the upper-bound for that frame is calculated by:
+ * upper-bound = # blocks * min(128 KB, Window_Size)
+ */
+ZSTDLIB_API unsigned long long ZSTD_decompressBound(const void* src, size_t srcSize);
+
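+/* Example (illustrative sketch) : size a destination buffer from the bound
+ * before one-shot decompression. `MY_MAX_ALLOC` is an application-defined
+ * limit; the bound can be much larger than the real content size.
+ *
+ *     unsigned long long const bound = ZSTD_decompressBound(cSrc, cSrcSize);
+ *     if (bound == ZSTD_CONTENTSIZE_ERROR) { ... }   // invalid frame(s)
+ *     if (bound > MY_MAX_ALLOC) { ... }              // refuse unreasonable sizes
+ *     void* const dst = malloc((size_t)bound);
+ *     size_t const rSize = ZSTD_decompress(dst, (size_t)bound, cSrc, cSrcSize);
+ */
+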
+/*! ZSTD_frameHeaderSize() :
+ * srcSize must be >= ZSTD_FRAMEHEADERSIZE_PREFIX.
+ * @return : size of the Frame Header,
+ * or an error code (if srcSize is too small) */
+ZSTDLIB_API size_t ZSTD_frameHeaderSize(const void* src, size_t srcSize);
+
+/*! ZSTD_getSequences() :
+ * Extract sequences from the sequence store
+ * zc can be used to insert custom compression params.
+ * This function invokes ZSTD_compress2
+ * @return : number of sequences extracted
+ */
+ZSTDLIB_API size_t ZSTD_getSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs,
+ size_t outSeqsSize, const void* src, size_t srcSize);
+
+
+/***************************************
+* Memory management
+***************************************/
+
+/*! ZSTD_estimate*() :
+ * These functions make it possible to estimate memory usage
+ * of a future {D,C}Ctx, before its creation.
+ *
+ * ZSTD_estimateCCtxSize() will provide a memory budget large enough
+ * for any compression level up to selected one.
+ * Note : Unlike ZSTD_estimateCStreamSize*(), this estimate
+ * does not include space for a window buffer.
+ * Therefore, the estimation is only guaranteed for single-shot compressions, not streaming.
+ * The estimate will assume the input may be arbitrarily large,
+ * which is the worst case.
+ *
+ * When srcSize can be bound by a known and rather "small" value,
+ * this fact can be used to provide a tighter estimation
+ * because the CCtx compression context will need less memory.
+ * This tighter estimation can be provided by more advanced functions
+ * ZSTD_estimateCCtxSize_usingCParams(), which can be used in tandem with ZSTD_getCParams(),
+ * and ZSTD_estimateCCtxSize_usingCCtxParams(), which can be used in tandem with ZSTD_CCtxParams_setParameter().
+ * Both can be used to estimate memory using custom compression parameters and arbitrary srcSize limits.
+ *
+ * Note 2 : only single-threaded compression is supported.
+ * ZSTD_estimateCCtxSize_usingCCtxParams() will return an error code if ZSTD_c_nbWorkers is >= 1.
+ */
+ZSTDLIB_API size_t ZSTD_estimateCCtxSize(int compressionLevel);
+ZSTDLIB_API size_t ZSTD_estimateCCtxSize_usingCParams(ZSTD_compressionParameters cParams);
+ZSTDLIB_API size_t ZSTD_estimateCCtxSize_usingCCtxParams(const ZSTD_CCtx_params* params);
+ZSTDLIB_API size_t ZSTD_estimateDCtxSize(void);
+
+/*! ZSTD_estimateCStreamSize() :
+ * ZSTD_estimateCStreamSize() will provide a budget large enough for any compression level up to selected one.
+ * It will also consider src size to be arbitrarily "large", which is worst case.
+ * If srcSize is known to always be small, ZSTD_estimateCStreamSize_usingCParams() can provide a tighter estimation.
+ * ZSTD_estimateCStreamSize_usingCParams() can be used in tandem with ZSTD_getCParams() to create cParams from compressionLevel.
+ * ZSTD_estimateCStreamSize_usingCCtxParams() can be used in tandem with ZSTD_CCtxParams_setParameter().
+ * Only single-threaded compression is supported; this function will return an error code if ZSTD_c_nbWorkers is >= 1.
+ * Note : CStream size estimation is only correct for single-threaded compression.
+ * ZSTD_DStream memory budget depends on window Size.
+ * This information can be passed manually, using ZSTD_estimateDStreamSize,
+ * or deduced from a valid frame Header, using ZSTD_estimateDStreamSize_fromFrame();
+ * Note : if streaming is initialized with function ZSTD_init?Stream_usingDict(),
+ * an internal ?Dict will be created, whose additional size is not estimated here.
+ * In this case, get total size by adding ZSTD_estimate?DictSize */
+ZSTDLIB_API size_t ZSTD_estimateCStreamSize(int compressionLevel);
+ZSTDLIB_API size_t ZSTD_estimateCStreamSize_usingCParams(ZSTD_compressionParameters cParams);
+ZSTDLIB_API size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCtx_params* params);
+ZSTDLIB_API size_t ZSTD_estimateDStreamSize(size_t windowSize);
+ZSTDLIB_API size_t ZSTD_estimateDStreamSize_fromFrame(const void* src, size_t srcSize);
+
+/*! ZSTD_estimate?DictSize() :
+ * ZSTD_estimateCDictSize() will bet that src size is relatively "small", and content is copied, like ZSTD_createCDict().
+ * ZSTD_estimateCDictSize_advanced() makes it possible to control compression parameters precisely, like ZSTD_createCDict_advanced().
+ * Note : dictionaries created by reference (`ZSTD_dlm_byRef`) are logically smaller.
+ */
+ZSTDLIB_API size_t ZSTD_estimateCDictSize(size_t dictSize, int compressionLevel);
+ZSTDLIB_API size_t ZSTD_estimateCDictSize_advanced(size_t dictSize, ZSTD_compressionParameters cParams, ZSTD_dictLoadMethod_e dictLoadMethod);
+ZSTDLIB_API size_t ZSTD_estimateDDictSize(size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod);
+
+/*! ZSTD_initStatic*() :
+ * Initialize an object using a pre-allocated fixed-size buffer.
+ * workspace: The memory area to emplace the object into.
+ * Provided pointer *must be 8-bytes aligned*.
+ * Buffer must outlive object.
+ * workspaceSize: Use ZSTD_estimate*Size() to determine
+ * how large workspace must be to support target scenario.
+ * @return : pointer to object (same address as workspace, just different type),
+ * or NULL if error (size too small, incorrect alignment, etc.)
+ * Note : zstd will never resize nor malloc() when using a static buffer.
+ * If the object requires more memory than available,
+ * zstd will just error out (typically ZSTD_error_memory_allocation).
+ * Note 2 : there is no corresponding "free" function.
+ * Since workspace is allocated externally, it must be freed externally too.
+ * Note 3 : cParams : use ZSTD_getCParams() to convert a compression level
+ * into its associated cParams.
+ * Limitation 1 : currently not compatible with internal dictionary creation, triggered by
+ * ZSTD_CCtx_loadDictionary(), ZSTD_initCStream_usingDict() or ZSTD_initDStream_usingDict().
+ * Limitation 2 : static cctx currently not compatible with multi-threading.
+ * Limitation 3 : static dctx is incompatible with legacy support.
+ */
+ZSTDLIB_API ZSTD_CCtx* ZSTD_initStaticCCtx(void* workspace, size_t workspaceSize);
+ZSTDLIB_API ZSTD_CStream* ZSTD_initStaticCStream(void* workspace, size_t workspaceSize); /**< same as ZSTD_initStaticCCtx() */
+
+ZSTDLIB_API ZSTD_DCtx* ZSTD_initStaticDCtx(void* workspace, size_t workspaceSize);
+ZSTDLIB_API ZSTD_DStream* ZSTD_initStaticDStream(void* workspace, size_t workspaceSize); /**< same as ZSTD_initStaticDCtx() */
+
+ZSTDLIB_API const ZSTD_CDict* ZSTD_initStaticCDict(
+ void* workspace, size_t workspaceSize,
+ const void* dict, size_t dictSize,
+ ZSTD_dictLoadMethod_e dictLoadMethod,
+ ZSTD_dictContentType_e dictContentType,
+ ZSTD_compressionParameters cParams);
+
+ZSTDLIB_API const ZSTD_DDict* ZSTD_initStaticDDict(
+ void* workspace, size_t workspaceSize,
+ const void* dict, size_t dictSize,
+ ZSTD_dictLoadMethod_e dictLoadMethod,
+ ZSTD_dictContentType_e dictContentType);
+
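+/* Example (illustrative sketch) : place a CCtx into a caller-provided
+ * workspace. `level` is the target compression level; error handling is
+ * abbreviated.
+ *
+ *     size_t const wkspSize = ZSTD_estimateCCtxSize(level);
+ *     void* const wksp = malloc(wkspSize);                   // must be 8-bytes aligned
+ *     ZSTD_CCtx* const cctx = ZSTD_initStaticCCtx(wksp, wkspSize);
+ *     if (cctx == NULL) { ... }                              // size too small, bad alignment, ...
+ *     size_t const cSize = ZSTD_compressCCtx(cctx, dst, dstCapacity, src, srcSize, level);
+ *     // no ZSTD_freeCCtx() : just release the workspace when done
+ *     free(wksp);
+ */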
+
+/*! Custom memory allocation :
+ * These prototypes make it possible to pass your own allocation/free functions.
+ * ZSTD_customMem is provided at creation time, using ZSTD_create*_advanced() variants listed below.
+ * All allocation/free operations will be completed using these custom variants instead of regular <stdlib.h> ones.
+ */
+typedef void* (*ZSTD_allocFunction) (void* opaque, size_t size);
+typedef void (*ZSTD_freeFunction) (void* opaque, void* address);
+typedef struct { ZSTD_allocFunction customAlloc; ZSTD_freeFunction customFree; void* opaque; } ZSTD_customMem;
+static ZSTD_customMem const ZSTD_defaultCMem = { NULL, NULL, NULL }; /**< this constant defers to stdlib's functions */
+
+ZSTDLIB_API ZSTD_CCtx* ZSTD_createCCtx_advanced(ZSTD_customMem customMem);
+ZSTDLIB_API ZSTD_CStream* ZSTD_createCStream_advanced(ZSTD_customMem customMem);
+ZSTDLIB_API ZSTD_DCtx* ZSTD_createDCtx_advanced(ZSTD_customMem customMem);
+ZSTDLIB_API ZSTD_DStream* ZSTD_createDStream_advanced(ZSTD_customMem customMem);
+
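+/* Example (illustrative sketch) : route all allocations through custom
+ * functions. `poolAlloc()`/`poolFree()` and `myState` are application-provided;
+ * the `opaque` pointer is passed back verbatim on every call.
+ *
+ *     static void* myAlloc(void* opaque, size_t size) { return poolAlloc(opaque, size); }
+ *     static void  myFree (void* opaque, void* addr)  { poolFree(opaque, addr); }
+ *
+ *     ZSTD_customMem const cmem = { myAlloc, myFree, myState };
+ *     ZSTD_CCtx* const cctx = ZSTD_createCCtx_advanced(cmem);
+ *     // ... use cctx as usual ...
+ *     ZSTD_freeCCtx(cctx);   // releases memory through myFree()
+ */
+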
+ZSTDLIB_API ZSTD_CDict* ZSTD_createCDict_advanced(const void* dict, size_t dictSize,
+ ZSTD_dictLoadMethod_e dictLoadMethod,
+ ZSTD_dictContentType_e dictContentType,
+ ZSTD_compressionParameters cParams,
+ ZSTD_customMem customMem);
+
+ZSTDLIB_API ZSTD_DDict* ZSTD_createDDict_advanced(const void* dict, size_t dictSize,
+ ZSTD_dictLoadMethod_e dictLoadMethod,
+ ZSTD_dictContentType_e dictContentType,
+ ZSTD_customMem customMem);
+
+
+
+/***************************************
+* Advanced compression functions
+***************************************/
+
+/*! ZSTD_createCDict_byReference() :
+ * Create a digested dictionary for compression
+ * Dictionary content is just referenced, not duplicated.
+ * As a consequence, `dictBuffer` **must** outlive CDict,
+ * and its content must remain unmodified throughout the lifetime of CDict.
+ * note: equivalent to ZSTD_createCDict_advanced(), with dictLoadMethod==ZSTD_dlm_byRef */
+ZSTDLIB_API ZSTD_CDict* ZSTD_createCDict_byReference(const void* dictBuffer, size_t dictSize, int compressionLevel);
+
+/*! ZSTD_getCParams() :
+ * @return ZSTD_compressionParameters structure for a selected compression level and estimated srcSize.
+ * `estimatedSrcSize` value is optional, select 0 if not known */
+ZSTDLIB_API ZSTD_compressionParameters ZSTD_getCParams(int compressionLevel, unsigned long long estimatedSrcSize, size_t dictSize);
+
+/*! ZSTD_getParams() :
+ * same as ZSTD_getCParams(), but @return a full `ZSTD_parameters` object instead of sub-component `ZSTD_compressionParameters`.
+ * All fields of `ZSTD_frameParameters` are set to default : contentSize=1, checksum=0, noDictID=0 */
+ZSTDLIB_API ZSTD_parameters ZSTD_getParams(int compressionLevel, unsigned long long estimatedSrcSize, size_t dictSize);
+
+/*! ZSTD_checkCParams() :
+ * Ensure param values remain within authorized range.
+ * @return 0 on success, or an error code (can be checked with ZSTD_isError()) */
+ZSTDLIB_API size_t ZSTD_checkCParams(ZSTD_compressionParameters params);
+
+/*! ZSTD_adjustCParams() :
+ * optimize params for a given `srcSize` and `dictSize`.
+ * `srcSize` can be unknown, in which case use ZSTD_CONTENTSIZE_UNKNOWN.
+ * `dictSize` must be `0` when there is no dictionary.
+ * cPar can be invalid : all parameters will be clamped within valid range in the @return struct.
+ * This function never fails (wide contract) */
+ZSTDLIB_API ZSTD_compressionParameters ZSTD_adjustCParams(ZSTD_compressionParameters cPar, unsigned long long srcSize, size_t dictSize);
+
+/*! ZSTD_compress_advanced() :
+ * Note : this function is now DEPRECATED.
+ * It can be replaced by ZSTD_compress2(), in combination with ZSTD_CCtx_setParameter() and other parameter setters.
+ * This prototype will be marked as deprecated and generate compilation warning on reaching v1.5.x */
+ZSTDLIB_API size_t ZSTD_compress_advanced(ZSTD_CCtx* cctx,
+ void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize,
+ const void* dict,size_t dictSize,
+ ZSTD_parameters params);
+
+/*! ZSTD_compress_usingCDict_advanced() :
+ * Note : this function is now REDUNDANT.
+ * It can be replaced by ZSTD_compress2(), in combination with ZSTD_CCtx_loadDictionary() and other parameter setters.
+ * This prototype will be marked as deprecated and generate compilation warning in some future version */
+ZSTDLIB_API size_t ZSTD_compress_usingCDict_advanced(ZSTD_CCtx* cctx,
+ void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize,
+ const ZSTD_CDict* cdict,
+ ZSTD_frameParameters fParams);
+
+
+/*! ZSTD_CCtx_loadDictionary_byReference() :
+ * Same as ZSTD_CCtx_loadDictionary(), but dictionary content is referenced, instead of being copied into CCtx.
+ * It saves some memory, but also requires that `dict` outlives its usage within `cctx` */
+ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary_byReference(ZSTD_CCtx* cctx, const void* dict, size_t dictSize);
+
+/*! ZSTD_CCtx_loadDictionary_advanced() :
+ * Same as ZSTD_CCtx_loadDictionary(), but gives finer control over
+ * how to load the dictionary (by copy ? by reference ?)
+ * and how to interpret it (automatic ? force raw mode ? full mode only ?) */
+ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary_advanced(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_dictContentType_e dictContentType);
+
+/*! ZSTD_CCtx_refPrefix_advanced() :
+ * Same as ZSTD_CCtx_refPrefix(), but gives finer control over
+ * how to interpret prefix content (automatic ? force raw mode (default) ? full mode only ?) */
+ZSTDLIB_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const void* prefix, size_t prefixSize, ZSTD_dictContentType_e dictContentType);
+
+/* === experimental parameters === */
+/* these parameters can be used with ZSTD_CCtx_setParameter()
+ * they are not guaranteed to remain supported in the future */
+
+ /* Enables rsyncable mode,
+ * which makes compressed files more rsync friendly
+ * by adding periodic synchronization points to the compressed data.
+ * The target average block size is ZSTD_c_jobSize / 2.
+ * It's possible to modify the job size to increase or decrease
+ * the granularity of the synchronization point.
+ * Once the jobSize is smaller than the window size,
+ * it will result in compression ratio degradation.
+ * NOTE 1: rsyncable mode only works when multithreading is enabled.
+ * NOTE 2: rsyncable performs poorly in combination with long range mode,
+ * since it will decrease the effectiveness of synchronization points,
+ * though mileage may vary.
+ * NOTE 3: Rsyncable mode limits maximum compression speed to ~400 MB/s.
+ * If the selected compression level is already running significantly slower,
+ * the overall speed won't be significantly impacted.
+ */
+ #define ZSTD_c_rsyncable ZSTD_c_experimentalParam1
+
+/* Select a compression format.
+ * The value must be of type ZSTD_format_e.
+ * See ZSTD_format_e enum definition for details */
+#define ZSTD_c_format ZSTD_c_experimentalParam2
+
+/* Force back-reference distances to remain < windowSize,
+ * even when referencing into Dictionary content (default:0) */
+#define ZSTD_c_forceMaxWindow ZSTD_c_experimentalParam3
+
+/* Controls whether the contents of a CDict
+ * are used in place, or copied into the working context.
+ * Accepts values from the ZSTD_dictAttachPref_e enum.
+ * See the comments on that enum for an explanation of the feature. */
+#define ZSTD_c_forceAttachDict ZSTD_c_experimentalParam4
+
+/* Controls how the literals are compressed (default is auto).
+ * The value must be of type ZSTD_literalCompressionMode_e.
+ * See ZSTD_literalCompressionMode_e enum definition for details.
+ */
+#define ZSTD_c_literalCompressionMode ZSTD_c_experimentalParam5
+
+/* Tries to fit compressed block size to be around targetCBlockSize.
+ * No target when targetCBlockSize == 0.
+ * There is no guarantee on compressed block size (default:0) */
+#define ZSTD_c_targetCBlockSize ZSTD_c_experimentalParam6
+
+/* User's best guess of source size.
+ * Hint is not valid when srcSizeHint == 0.
+ * There is no guarantee that hint is close to actual source size,
+ * but compression ratio may regress significantly if the guess considerably underestimates the actual source size */
+#define ZSTD_c_srcSizeHint ZSTD_c_experimentalParam7
+
+/*! ZSTD_CCtx_getParameter() :
+ * Get the requested compression parameter value, selected by enum ZSTD_cParameter,
+ * and store it into int* value.
+ * @return : 0, or an error code (which can be tested with ZSTD_isError()).
+ */
+ZSTDLIB_API size_t ZSTD_CCtx_getParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, int* value);
+
+
+/*! ZSTD_CCtx_params :
+ * Quick howto :
+ * - ZSTD_createCCtxParams() : Create a ZSTD_CCtx_params structure
+ * - ZSTD_CCtxParams_setParameter() : Push parameters one by one into
+ * an existing ZSTD_CCtx_params structure.
+ * This is similar to
+ * ZSTD_CCtx_setParameter().
+ * - ZSTD_CCtx_setParametersUsingCCtxParams() : Apply parameters to
+ * an existing CCtx.
+ * These parameters will be applied to
+ * all subsequent frames.
+ * - ZSTD_compressStream2() : Do compression using the CCtx.
+ * - ZSTD_freeCCtxParams() : Free the memory.
+ *
+ * This can be used with ZSTD_estimateCCtxSize_usingCCtxParams()
+ * for static allocation of CCtx for single-threaded compression.
+ */
+ZSTDLIB_API ZSTD_CCtx_params* ZSTD_createCCtxParams(void);
+ZSTDLIB_API size_t ZSTD_freeCCtxParams(ZSTD_CCtx_params* params);
+
+/*! ZSTD_CCtxParams_reset() :
+ * Reset params to default values.
+ */
+ZSTDLIB_API size_t ZSTD_CCtxParams_reset(ZSTD_CCtx_params* params);
+
+/*! ZSTD_CCtxParams_init() :
+ * Initializes the compression parameters of cctxParams according to
+ * compression level. All other parameters are reset to their default values.
+ */
+ZSTDLIB_API size_t ZSTD_CCtxParams_init(ZSTD_CCtx_params* cctxParams, int compressionLevel);
+
+/*! ZSTD_CCtxParams_init_advanced() :
+ * Initializes the compression and frame parameters of cctxParams according to
+ * params. All other parameters are reset to their default values.
+ */
+ZSTDLIB_API size_t ZSTD_CCtxParams_init_advanced(ZSTD_CCtx_params* cctxParams, ZSTD_parameters params);
+
+/*! ZSTD_CCtxParams_setParameter() :
+ * Similar to ZSTD_CCtx_setParameter.
+ * Set one compression parameter, selected by enum ZSTD_cParameter.
+ * Parameters must be applied to a ZSTD_CCtx using ZSTD_CCtx_setParametersUsingCCtxParams().
+ * @result : 0, or an error code (which can be tested with ZSTD_isError()).
+ */
+ZSTDLIB_API size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* params, ZSTD_cParameter param, int value);
+
+/*! ZSTD_CCtxParams_getParameter() :
+ * Similar to ZSTD_CCtx_getParameter.
+ * Get the requested value of one compression parameter, selected by enum ZSTD_cParameter.
+ * @result : 0, or an error code (which can be tested with ZSTD_isError()).
+ */
+ZSTDLIB_API size_t ZSTD_CCtxParams_getParameter(ZSTD_CCtx_params* params, ZSTD_cParameter param, int* value);
+
+/*! ZSTD_CCtx_setParametersUsingCCtxParams() :
+ * Apply a set of ZSTD_CCtx_params to the compression context.
+ * This can be done even after compression is started.
+ * If nbWorkers==0, this will have no impact until a new compression is started.
+ * If nbWorkers>=1, new parameters will be picked up at next job,
+ * with a few restrictions (windowLog, pledgedSrcSize, nbWorkers, jobSize, and overlapLog are not updated).
+ */
+ZSTDLIB_API size_t ZSTD_CCtx_setParametersUsingCCtxParams(
+ ZSTD_CCtx* cctx, const ZSTD_CCtx_params* params);
+
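+/* Example (illustrative sketch) : build a reusable parameter set and apply it
+ * to an existing `cctx`.
+ *
+ *     ZSTD_CCtx_params* const params = ZSTD_createCCtxParams();
+ *     ZSTD_CCtxParams_init(params, 3);                        // start from level 3 defaults
+ *     ZSTD_CCtxParams_setParameter(params, ZSTD_c_checksumFlag, 1);
+ *     ZSTD_CCtx_setParametersUsingCCtxParams(cctx, params);   // applies to subsequent frames
+ *     ZSTD_freeCCtxParams(params);
+ */
+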
+/*! ZSTD_compressStream2_simpleArgs() :
+ * Same as ZSTD_compressStream2(),
+ * but using only integral types as arguments.
+ * This variant might be helpful for binders from dynamic languages
+ * which have trouble handling structures containing memory pointers.
+ */
+ZSTDLIB_API size_t ZSTD_compressStream2_simpleArgs (
+ ZSTD_CCtx* cctx,
+ void* dst, size_t dstCapacity, size_t* dstPos,
+ const void* src, size_t srcSize, size_t* srcPos,
+ ZSTD_EndDirective endOp);
+
+
+/***************************************
+* Advanced decompression functions
+***************************************/
+
+/*! ZSTD_isFrame() :
+ * Tells if the content of `buffer` starts with a valid Frame Identifier.
+ * Note : Frame Identifier is 4 bytes. If `size < 4`, @return will always be 0.
+ * Note 2 : Legacy Frame Identifiers are considered valid only if Legacy Support is enabled.
+ * Note 3 : Skippable Frame Identifiers are considered valid. */
+ZSTDLIB_API unsigned ZSTD_isFrame(const void* buffer, size_t size);
+
+/*! ZSTD_createDDict_byReference() :
+ * Create a digested dictionary, ready to start decompression operation without startup delay.
+ * Dictionary content is referenced, and therefore stays in dictBuffer.
+ * It is important that dictBuffer outlives DDict,
+ * it must remain read accessible throughout the lifetime of DDict */
+ZSTDLIB_API ZSTD_DDict* ZSTD_createDDict_byReference(const void* dictBuffer, size_t dictSize);
+
+/*! ZSTD_DCtx_loadDictionary_byReference() :
+ * Same as ZSTD_DCtx_loadDictionary(),
+ * but references `dict` content instead of copying it into `dctx`.
+ * This saves memory if `dict` remains around.
+ * However, it's imperative that `dict` remains accessible (and unmodified) while being used, so it must outlive decompression. */
+ZSTDLIB_API size_t ZSTD_DCtx_loadDictionary_byReference(ZSTD_DCtx* dctx, const void* dict, size_t dictSize);
+
+/*! ZSTD_DCtx_loadDictionary_advanced() :
+ * Same as ZSTD_DCtx_loadDictionary(),
+ * but gives direct control over
+ * how to load the dictionary (by copy ? by reference ?)
+ * and how to interpret it (automatic ? force raw mode ? full mode only ?). */
+ZSTDLIB_API size_t ZSTD_DCtx_loadDictionary_advanced(ZSTD_DCtx* dctx, const void* dict, size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_dictContentType_e dictContentType);
+
+/*! ZSTD_DCtx_refPrefix_advanced() :
+ * Same as ZSTD_DCtx_refPrefix(), but gives finer control over
+ * how to interpret prefix content (automatic ? force raw mode (default) ? full mode only ?) */
+ZSTDLIB_API size_t ZSTD_DCtx_refPrefix_advanced(ZSTD_DCtx* dctx, const void* prefix, size_t prefixSize, ZSTD_dictContentType_e dictContentType);
+
+/*! ZSTD_DCtx_setMaxWindowSize() :
+ * Refuses to allocate internal buffers for frames requiring a window size larger than the provided limit.
+ * This protects a decoder context from reserving too much memory for itself (potential attack scenario).
+ * This parameter is only useful in streaming mode, since no internal buffer is allocated in single-pass mode.
+ * By default, a decompression context accepts all window sizes <= (1 << ZSTD_WINDOWLOG_LIMIT_DEFAULT)
+ * @return : 0, or an error code (which can be tested using ZSTD_isError()).
+ */
+ZSTDLIB_API size_t ZSTD_DCtx_setMaxWindowSize(ZSTD_DCtx* dctx, size_t maxWindowSize);
+
+/* ZSTD_d_format
+ * experimental parameter,
+ * allowing selection between ZSTD_format_e input compression formats
+ */
+#define ZSTD_d_format ZSTD_d_experimentalParam1
+/* ZSTD_d_stableOutBuffer
+ * Experimental parameter.
+ * Default is 0 == disabled. Set to 1 to enable.
+ *
+ * Tells the decompressor that the ZSTD_outBuffer will ALWAYS be the same
+ * between calls, except for the modifications that zstd makes to pos (the
+ * caller must not modify pos). This is checked by the decompressor, and
+ * decompression will fail if it ever changes. Therefore the ZSTD_outBuffer
+ * MUST be large enough to fit the entire decompressed frame. This will be
+ * checked when the frame content size is known. The data in the ZSTD_outBuffer
+ * in the range [dst, dst + pos) MUST not be modified during decompression
+ * or you will get data corruption.
+ *
+ * When this flag is enabled zstd won't allocate an output buffer, because
+ * it can write directly to the ZSTD_outBuffer, but it will still allocate
+ * an input buffer large enough to fit any compressed block. This will also
+ * avoid the memcpy() from the internal output buffer to the ZSTD_outBuffer.
+ * If you need to avoid the input buffer allocation use the buffer-less
+ * streaming API.
+ *
+ * NOTE: So long as the ZSTD_outBuffer always points to valid memory, using
+ * this flag is ALWAYS memory safe, and will never access out-of-bounds
+ * memory. However, decompression WILL fail if you violate the preconditions.
+ *
+ * WARNING: The data in the ZSTD_outBuffer in the range [dst, dst + pos) MUST
+ * not be modified during decompression or you will get data corruption. This
+ * is because zstd needs to reference data in the ZSTD_outBuffer to regenerate
+ * matches. Normally zstd maintains its own buffer for this purpose, but passing
+ * this flag tells zstd to use the user provided buffer.
+ */
+#define ZSTD_d_stableOutBuffer ZSTD_d_experimentalParam2
+
+/*! ZSTD_DCtx_setFormat() :
+ * Instruct the decoder context about what kind of data to decode next.
+ * This instruction is mandatory to decode data without a fully-formed header,
+ * such as ZSTD_f_zstd1_magicless for example.
+ * @return : 0, or an error code (which can be tested using ZSTD_isError()). */
+ZSTDLIB_API size_t ZSTD_DCtx_setFormat(ZSTD_DCtx* dctx, ZSTD_format_e format);
+
+/*! ZSTD_decompressStream_simpleArgs() :
+ * Same as ZSTD_decompressStream(),
+ * but using only integral types as arguments.
+ * This can be helpful for binders from dynamic languages
+ * which have trouble handling structures containing memory pointers.
+ */
+ZSTDLIB_API size_t ZSTD_decompressStream_simpleArgs (
+ ZSTD_DCtx* dctx,
+ void* dst, size_t dstCapacity, size_t* dstPos,
+ const void* src, size_t srcSize, size_t* srcPos);
+
+
+/********************************************************************
+* Advanced streaming functions
+* Warning : most of these functions are now redundant with the Advanced API.
+* Once Advanced API reaches "stable" status,
+* redundant functions will be deprecated, and then at some point removed.
+********************************************************************/
+
+/*===== Advanced Streaming compression functions =====*/
+/**! ZSTD_initCStream_srcSize() :
+ * This function is deprecated, and equivalent to:
+ * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only);
+ * ZSTD_CCtx_refCDict(zcs, NULL); // clear the dictionary (if any)
+ * ZSTD_CCtx_setParameter(zcs, ZSTD_c_compressionLevel, compressionLevel);
+ * ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize);
+ *
+ * pledgedSrcSize must be correct. If it is not known at init time, use
+ * ZSTD_CONTENTSIZE_UNKNOWN. Note that, for compatibility with older programs,
+ * "0" also disables frame content size field. It may be enabled in the future.
+ * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x
+ */
+ZSTDLIB_API size_t
+ZSTD_initCStream_srcSize(ZSTD_CStream* zcs,
+ int compressionLevel,
+ unsigned long long pledgedSrcSize);
+
+/**! ZSTD_initCStream_usingDict() :
+ * This function is deprecated, and is equivalent to:
+ * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only);
+ * ZSTD_CCtx_setParameter(zcs, ZSTD_c_compressionLevel, compressionLevel);
+ * ZSTD_CCtx_loadDictionary(zcs, dict, dictSize);
+ *
+ * Creates an internal CDict (incompatible with static CCtx), except if
+ * dict == NULL or dictSize < 8, in which case no dict is used.
+ * Note: dict is loaded with ZSTD_dct_auto (treated as a full zstd dictionary if
+ * it begins with ZSTD_MAGIC_DICTIONARY, else as raw content) and ZSTD_dlm_byCopy.
+ * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x
+ */
+ZSTDLIB_API size_t
+ZSTD_initCStream_usingDict(ZSTD_CStream* zcs,
+ const void* dict, size_t dictSize,
+ int compressionLevel);
+
+/**! ZSTD_initCStream_advanced() :
+ * This function is deprecated, and is approximately equivalent to:
+ * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only);
+ * // Pseudocode: Set each zstd parameter and leave the rest as-is.
+ * for ((param, value) : params) {
+ * ZSTD_CCtx_setParameter(zcs, param, value);
+ * }
+ * ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize);
+ * ZSTD_CCtx_loadDictionary(zcs, dict, dictSize);
+ *
+ * dict is loaded with ZSTD_dct_auto and ZSTD_dlm_byCopy.
+ * pledgedSrcSize must be correct.
+ * If srcSize is not known at init time, use value ZSTD_CONTENTSIZE_UNKNOWN.
+ * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x
+ */
+ZSTDLIB_API size_t
+ZSTD_initCStream_advanced(ZSTD_CStream* zcs,
+ const void* dict, size_t dictSize,
+ ZSTD_parameters params,
+ unsigned long long pledgedSrcSize);
+
+/**! ZSTD_initCStream_usingCDict() :
+ * This function is deprecated, and equivalent to:
+ * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only);
+ * ZSTD_CCtx_refCDict(zcs, cdict);
+ *
+ * note : cdict will just be referenced, and must outlive the compression session
+ * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x
+ */
+ZSTDLIB_API size_t ZSTD_initCStream_usingCDict(ZSTD_CStream* zcs, const ZSTD_CDict* cdict);
+
+/**! ZSTD_initCStream_usingCDict_advanced() :
+ * This function is DEPRECATED, and is approximately equivalent to:
+ * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only);
+ * // Pseudocode: Set each zstd frame parameter and leave the rest as-is.
+ * for ((fParam, value) : fParams) {
+ * ZSTD_CCtx_setParameter(zcs, fParam, value);
+ * }
+ * ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize);
+ * ZSTD_CCtx_refCDict(zcs, cdict);
+ *
+ * same as ZSTD_initCStream_usingCDict(), with control over frame parameters.
+ * pledgedSrcSize must be correct. If srcSize is not known at init time, use
+ * value ZSTD_CONTENTSIZE_UNKNOWN.
+ * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x
+ */
+ZSTDLIB_API size_t
+ZSTD_initCStream_usingCDict_advanced(ZSTD_CStream* zcs,
+ const ZSTD_CDict* cdict,
+ ZSTD_frameParameters fParams,
+ unsigned long long pledgedSrcSize);
+
+/*! ZSTD_resetCStream() :
+ * This function is deprecated, and is equivalent to:
+ * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only);
+ * ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize);
+ *
+ * start a new frame, using same parameters from previous frame.
+ * This is typically useful to skip dictionary loading stage, since it will re-use it in-place.
+ * Note that zcs must be init at least once before using ZSTD_resetCStream().
+ * If pledgedSrcSize is not known at reset time, use macro ZSTD_CONTENTSIZE_UNKNOWN.
+ * If pledgedSrcSize > 0, its value must be correct, as it will be written in header, and controlled at the end.
+ * For the time being, pledgedSrcSize==0 is interpreted as "srcSize unknown" for compatibility with older programs,
+ * but it will change to mean "empty" in a future version, so use macro ZSTD_CONTENTSIZE_UNKNOWN instead.
+ * @return : 0, or an error code (which can be tested using ZSTD_isError())
+ * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x
+ */
+ZSTDLIB_API size_t ZSTD_resetCStream(ZSTD_CStream* zcs, unsigned long long pledgedSrcSize);
+
+
+typedef struct {
+ unsigned long long ingested; /* nb input bytes read and buffered */
+ unsigned long long consumed; /* nb input bytes actually compressed */
+ unsigned long long produced; /* nb of compressed bytes generated and buffered */
+ unsigned long long flushed; /* nb of compressed bytes flushed : not provided; can be tracked from caller side */
+ unsigned currentJobID; /* MT only : latest started job nb */
+ unsigned nbActiveWorkers; /* MT only : nb of workers actively compressing at probe time */
+} ZSTD_frameProgression;
+
+/* ZSTD_getFrameProgression() :
+ * tells how much data has been ingested (read from input),
+ * consumed (input actually compressed) and produced (output) for the current frame.
+ * Note : (ingested - consumed) is amount of input data buffered internally, not yet compressed.
+ * Aggregates progression inside active worker threads.
+ */
+ZSTDLIB_API ZSTD_frameProgression ZSTD_getFrameProgression(const ZSTD_CCtx* cctx);
+
+/*! ZSTD_toFlushNow() :
+ * Tell how many bytes are ready to be flushed immediately.
+ * Useful for multithreading scenarios (nbWorkers >= 1).
+ * Probe the oldest active job, defined as oldest job not yet entirely flushed,
+ * and check its output buffer.
+ * @return : amount of data stored in oldest job and ready to be flushed immediately.
+ * if @return == 0, it means either :
+ * + there is no active job (could be checked with ZSTD_frameProgression()), or
+ * + oldest job is still actively compressing data,
+ * but everything it has produced has also been flushed so far,
+ * therefore flush speed is limited by production speed of oldest job
+ * irrespective of the speed of concurrent (and newer) jobs.
+ */
+ZSTDLIB_API size_t ZSTD_toFlushNow(ZSTD_CCtx* cctx);
+
+
+/*===== Advanced Streaming decompression functions =====*/
+/**
+ * This function is deprecated, and is equivalent to:
+ *
+ * ZSTD_DCtx_reset(zds, ZSTD_reset_session_only);
+ * ZSTD_DCtx_loadDictionary(zds, dict, dictSize);
+ *
+ * note: no dictionary will be used if dict == NULL or dictSize < 8
+ * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x
+ */
+ZSTDLIB_API size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const void* dict, size_t dictSize);
+
+/**
+ * This function is deprecated, and is equivalent to:
+ *
+ * ZSTD_DCtx_reset(zds, ZSTD_reset_session_only);
+ * ZSTD_DCtx_refDDict(zds, ddict);
+ *
+ * note : ddict is referenced, it must outlive the decompression session
+ * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x
+ */
+ZSTDLIB_API size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* zds, const ZSTD_DDict* ddict);
+
+/**
+ * This function is deprecated, and is equivalent to:
+ *
+ * ZSTD_DCtx_reset(zds, ZSTD_reset_session_only);
+ *
+ * re-use decompression parameters from previous init; saves dictionary loading
+ * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x
+ */
+ZSTDLIB_API size_t ZSTD_resetDStream(ZSTD_DStream* zds);
+
+
+/*********************************************************************
+* Buffer-less and synchronous inner streaming functions
+*
+* This is an advanced API, giving full control over buffer management, for users who need direct control over memory.
+* But it's also a complex one, with several restrictions, documented below.
+* Prefer normal streaming API for an easier experience.
+********************************************************************* */
+
+/**
+ Buffer-less streaming compression (synchronous mode)
+
+ A ZSTD_CCtx object is required to track streaming operations.
+ Use ZSTD_createCCtx() / ZSTD_freeCCtx() to manage resource.
+ ZSTD_CCtx object can be re-used multiple times within successive compression operations.
+
+ Start by initializing a context.
+ Use ZSTD_compressBegin(), or ZSTD_compressBegin_usingDict() for dictionary compression,
+ or ZSTD_compressBegin_advanced(), for finer parameter control.
+  It's also possible to duplicate a reference context which has already been initialized, using ZSTD_copyCCtx().
+
+ Then, consume your input using ZSTD_compressContinue().
+ There are some important considerations to keep in mind when using this advanced function :
+ - ZSTD_compressContinue() has no internal buffer. It uses externally provided buffers only.
+ - Interface is synchronous : input is consumed entirely and produces 1+ compressed blocks.
+ - Caller must ensure there is enough space in `dst` to store compressed data under worst case scenario.
+ Worst case evaluation is provided by ZSTD_compressBound().
+     ZSTD_compressContinue() doesn't guarantee recovery after a failed compression.
+ - ZSTD_compressContinue() presumes prior input ***is still accessible and unmodified*** (up to maximum distance size, see WindowLog).
+    It remembers all previous contiguous blocks, plus one separated memory segment (which can itself consist of multiple contiguous blocks).
+  - ZSTD_compressContinue() detects that prior input has been overwritten when the `src` buffer overlaps,
+    in which case it will "discard" the relevant memory section from its history.
+
+ Finish a frame with ZSTD_compressEnd(), which will write the last block(s) and optional checksum.
+ It's possible to use srcSize==0, in which case, it will write a final empty block to end the frame.
+ Without last block mark, frames are considered unfinished (hence corrupted) by compliant decoders.
+
+ `ZSTD_CCtx` object can be re-used (ZSTD_compressBegin()) to compress again.
+*/
+
+/*===== Buffer-less streaming compression functions =====*/
+ZSTDLIB_API size_t ZSTD_compressBegin(ZSTD_CCtx* cctx, int compressionLevel);
+ZSTDLIB_API size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel);
+ZSTDLIB_API size_t ZSTD_compressBegin_advanced(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, ZSTD_parameters params, unsigned long long pledgedSrcSize); /**< pledgedSrcSize : If srcSize is not known at init time, use ZSTD_CONTENTSIZE_UNKNOWN */
+ZSTDLIB_API size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict); /**< note: fails if cdict==NULL */
+ZSTDLIB_API size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_CDict* const cdict, ZSTD_frameParameters const fParams, unsigned long long const pledgedSrcSize); /* compression parameters are already set within cdict. pledgedSrcSize must be correct. If srcSize is not known, use macro ZSTD_CONTENTSIZE_UNKNOWN */
+ZSTDLIB_API size_t ZSTD_copyCCtx(ZSTD_CCtx* cctx, const ZSTD_CCtx* preparedCCtx, unsigned long long pledgedSrcSize); /**< note: if pledgedSrcSize is not known, use ZSTD_CONTENTSIZE_UNKNOWN */
+
+ZSTDLIB_API size_t ZSTD_compressContinue(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
+ZSTDLIB_API size_t ZSTD_compressEnd(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
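+/* Illustrative sketch (not part of the upstream header) : compressing two
+ * chunks with the buffer-less API. `dst` must be large enough for the worst
+ * case of each call (ZSTD_compressBound() of the chunk size); ZSTD_isError()
+ * checks are elided for brevity.
+ *
+ *     ZSTD_CCtx* const cctx = ZSTD_createCCtx();
+ *     size_t pos = 0;
+ *     ZSTD_compressBegin(cctx, 3);
+ *     pos += ZSTD_compressContinue(cctx, dst + pos, dstCapacity - pos, chunk1, chunk1Size);
+ *     pos += ZSTD_compressContinue(cctx, dst + pos, dstCapacity - pos, chunk2, chunk2Size);
+ *     pos += ZSTD_compressEnd(cctx, dst + pos, dstCapacity - pos, NULL, 0);   // writes the final empty block, ending the frame
+ *     ZSTD_freeCCtx(cctx);
+ */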
+
+
+/*-
+ Buffer-less streaming decompression (synchronous mode)
+
+ A ZSTD_DCtx object is required to track streaming operations.
+ Use ZSTD_createDCtx() / ZSTD_freeDCtx() to manage it.
+ A ZSTD_DCtx object can be re-used multiple times.
+
+ First typical operation is to retrieve frame parameters, using ZSTD_getFrameHeader().
+ Frame header is extracted from the beginning of compressed frame, so providing only the frame's beginning is enough.
+ Data fragment must be large enough to ensure successful decoding.
+ `ZSTD_frameHeaderSize_max` bytes is guaranteed to always be large enough.
+ @result : 0 : successful decoding, the `ZSTD_frameHeader` structure is correctly filled.
+ >0 : `srcSize` is too small, please provide at least @result bytes on next attempt.
+           or an error code, which can be tested using ZSTD_isError().
+
+ It fills a ZSTD_frameHeader structure with important information to correctly decode the frame,
+ such as the dictionary ID, content size, or maximum back-reference distance (`windowSize`).
+ Note that these values could be wrong, either because of data corruption, or because a 3rd party deliberately spoofs false information.
+ As a consequence, check that values remain within valid application range.
+ For example, do not allocate memory blindly, check that `windowSize` is within expectation.
+ Each application can set its own limits, depending on local restrictions.
+ For extended interoperability, it is recommended to support `windowSize` of at least 8 MB.
+
+ ZSTD_decompressContinue() needs previous data blocks during decompression, up to `windowSize` bytes.
+  ZSTD_decompressContinue() is very sensitive to contiguity:
+ if 2 blocks don't follow each other, make sure that either the compressor breaks contiguity at the same place,
+ or that previous contiguous segment is large enough to properly handle maximum back-reference distance.
+ There are multiple ways to guarantee this condition.
+
+ The most memory efficient way is to use a round buffer of sufficient size.
+ Sufficient size is determined by invoking ZSTD_decodingBufferSize_min(),
+ which can @return an error code if required value is too large for current system (in 32-bits mode).
+ In a round buffer methodology, ZSTD_decompressContinue() decompresses each block next to previous one,
+ up to the moment there is not enough room left in the buffer to guarantee decoding another full block,
+  whose maximum size is provided in the `ZSTD_frameHeader` structure, field `blockSizeMax`.
+ At which point, decoding can resume from the beginning of the buffer.
+ Note that already decoded data stored in the buffer should be flushed before being overwritten.
+
+ There are alternatives possible, for example using two or more buffers of size `windowSize` each, though they consume more memory.
+
+ Finally, if you control the compression process, you can also ignore all buffer size rules,
+ as long as the encoder and decoder progress in "lock-step",
+ aka use exactly the same buffer sizes, break contiguity at the same place, etc.
+
+  Once buffers are set up, start decompression with ZSTD_decompressBegin().
+ If decompression requires a dictionary, use ZSTD_decompressBegin_usingDict() or ZSTD_decompressBegin_usingDDict().
+
+  Then use ZSTD_nextSrcSizeToDecompress() and ZSTD_decompressContinue() alternately.
+ ZSTD_nextSrcSizeToDecompress() tells how many bytes to provide as 'srcSize' to ZSTD_decompressContinue().
+ ZSTD_decompressContinue() requires this _exact_ amount of bytes, or it will fail.
+
+ @result of ZSTD_decompressContinue() is the number of bytes regenerated within 'dst' (necessarily <= dstCapacity).
+ It can be zero : it just means ZSTD_decompressContinue() has decoded some metadata item.
+ It can also be an error code, which can be tested with ZSTD_isError().
+
+ A frame is fully decoded when ZSTD_nextSrcSizeToDecompress() returns zero.
+ Context can then be reset to start a new decompression.
+
+ Note : it's possible to know if next input to present is a header or a block, using ZSTD_nextInputType().
+ This information is not required to properly decode a frame.
+
+ == Special case : skippable frames ==
+
+ Skippable frames allow integration of user-defined data into a flow of concatenated frames.
+ Skippable frames will be ignored (skipped) by decompressor.
+ The format of skippable frames is as follows :
+ a) Skippable frame ID - 4 Bytes, Little endian format, any value from 0x184D2A50 to 0x184D2A5F
+ b) Frame Size - 4 Bytes, Little endian format, unsigned 32-bits
+ c) Frame Content - any content (User Data) of length equal to Frame Size
+ For skippable frames ZSTD_getFrameHeader() returns zfhPtr->frameType==ZSTD_skippableFrame.
+ For skippable frames ZSTD_decompressContinue() always returns 0 : it only skips the content.
+*/
+
+/*===== Buffer-less streaming decompression functions =====*/
+typedef enum { ZSTD_frame, ZSTD_skippableFrame } ZSTD_frameType_e;
+typedef struct {
+ unsigned long long frameContentSize; /* if == ZSTD_CONTENTSIZE_UNKNOWN, it means this field is not available. 0 means "empty" */
+ unsigned long long windowSize; /* can be very large, up to <= frameContentSize */
+ unsigned blockSizeMax;
+ ZSTD_frameType_e frameType; /* if == ZSTD_skippableFrame, frameContentSize is the size of skippable content */
+ unsigned headerSize;
+ unsigned dictID;
+ unsigned checksumFlag;
+} ZSTD_frameHeader;
+
+/*! ZSTD_getFrameHeader() :
+ * decode Frame Header, or requires larger `srcSize`.
+ * @return : 0, `zfhPtr` is correctly filled,
+ * >0, `srcSize` is too small, value is wanted `srcSize` amount,
+ * or an error code, which can be tested using ZSTD_isError() */
+ZSTDLIB_API size_t ZSTD_getFrameHeader(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize); /**< doesn't consume input */
+/*! ZSTD_getFrameHeader_advanced() :
+ * same as ZSTD_getFrameHeader(),
+ * with added capability to select a format (like ZSTD_f_zstd1_magicless) */
+ZSTDLIB_API size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize, ZSTD_format_e format);
+ZSTDLIB_API size_t ZSTD_decodingBufferSize_min(unsigned long long windowSize, unsigned long long frameContentSize); /**< when frame content size is not known, pass in frameContentSize == ZSTD_CONTENTSIZE_UNKNOWN */
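+/* Illustrative sketch (not part of the upstream header) : probing the frame
+ * header and sizing a round buffer, per the discussion above. `src`/`srcSize`
+ * hold at least the beginning of the compressed frame.
+ *
+ *     ZSTD_frameHeader zfh;
+ *     size_t const hret = ZSTD_getFrameHeader(&zfh, src, srcSize);
+ *     if (hret != 0) { }   // >0 : need hret bytes ; error : ZSTD_isError(hret)
+ *     size_t const bufSize = ZSTD_decodingBufferSize_min(zfh.windowSize, zfh.frameContentSize);
+ *     if (ZSTD_isError(bufSize)) { }   // window too large for this system
+ */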
+
+ZSTDLIB_API size_t ZSTD_decompressBegin(ZSTD_DCtx* dctx);
+ZSTDLIB_API size_t ZSTD_decompressBegin_usingDict(ZSTD_DCtx* dctx, const void* dict, size_t dictSize);
+ZSTDLIB_API size_t ZSTD_decompressBegin_usingDDict(ZSTD_DCtx* dctx, const ZSTD_DDict* ddict);
+
+ZSTDLIB_API size_t ZSTD_nextSrcSizeToDecompress(ZSTD_DCtx* dctx);
+ZSTDLIB_API size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
+
+/* misc */
+ZSTDLIB_API void ZSTD_copyDCtx(ZSTD_DCtx* dctx, const ZSTD_DCtx* preparedDCtx);
+typedef enum { ZSTDnit_frameHeader, ZSTDnit_blockHeader, ZSTDnit_block, ZSTDnit_lastBlock, ZSTDnit_checksum, ZSTDnit_skippableFrame } ZSTD_nextInputType_e;
+ZSTDLIB_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx);
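+/* Illustrative sketch (not part of the upstream header) : minimal decode loop
+ * for one frame, after the header has been probed as sketched above, assuming
+ * the whole compressed frame is in memory and `dst` can hold the full output.
+ * ZSTD_isError() checks are elided for brevity.
+ *
+ *     ZSTD_DCtx* const dctx = ZSTD_createDCtx();
+ *     ZSTD_decompressBegin(dctx);
+ *     size_t srcPos = 0, dstPos = 0, next;
+ *     while ((next = ZSTD_nextSrcSizeToDecompress(dctx)) != 0) {
+ *         size_t const out = ZSTD_decompressContinue(dctx,
+ *             (char*)dst + dstPos, dstCapacity - dstPos,
+ *             (const char*)src + srcPos, next);
+ *         srcPos += next;
+ *         dstPos += out;   // out may be 0 when a metadata item was decoded
+ *     }
+ *     ZSTD_freeDCtx(dctx);
+ */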
+
+
+
+
+/* ============================ */
+/** Block level API */
+/* ============================ */
+
+/*!
+ Block functions produce and decode raw zstd blocks, without frame metadata.
+ Frame metadata cost is typically ~12 bytes, which can be non-negligible for very small blocks (< 100 bytes).
+  But users will have to take charge of the metadata needed to regenerate data, such as compressed and content sizes.
+
+ A few rules to respect :
+ - Compressing and decompressing require a context structure
+ + Use ZSTD_createCCtx() and ZSTD_createDCtx()
+ - It is necessary to init context before starting
+ + compression : any ZSTD_compressBegin*() variant, including with dictionary
+ + decompression : any ZSTD_decompressBegin*() variant, including with dictionary
+ + copyCCtx() and copyDCtx() can be used too
+ - Block size is limited, it must be <= ZSTD_getBlockSize() <= ZSTD_BLOCKSIZE_MAX == 128 KB
+ + If input is larger than a block size, it's necessary to split input data into multiple blocks
+ + For inputs larger than a single block, consider using regular ZSTD_compress() instead.
+ Frame metadata is not that costly, and quickly becomes negligible as source size grows larger than a block.
+ - When a block is considered not compressible enough, ZSTD_compressBlock() result will be 0 (zero) !
+ ===> In which case, nothing is produced into `dst` !
+ + User __must__ test for such outcome and deal directly with uncompressed data
+      + A block cannot be declared incompressible if the ZSTD_compressBlock() return value was != 0.
+        Doing so would mess up the statistics history, leading to potential data corruption.
+ + ZSTD_decompressBlock() _doesn't accept uncompressed data as input_ !!
+ + In case of multiple successive blocks, should some of them be uncompressed,
+ decoder must be informed of their existence in order to follow proper history.
+ Use ZSTD_insertBlock() for such a case.
+*/
+
+/*===== Raw zstd block functions =====*/
+ZSTDLIB_API size_t ZSTD_getBlockSize (const ZSTD_CCtx* cctx);
+ZSTDLIB_API size_t ZSTD_compressBlock (ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
+ZSTDLIB_API size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
+ZSTDLIB_API size_t ZSTD_insertBlock (ZSTD_DCtx* dctx, const void* blockStart, size_t blockSize); /**< insert uncompressed block into `dctx` history. Useful for multi-blocks decompression. */
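+/* Illustrative sketch (not part of the upstream header) : compressing a single
+ * block and handling the incompressible case, which the raw block format
+ * cannot describe on its own (the caller must store and signal it itself).
+ *
+ *     ZSTD_compressBegin(cctx, 3);   // context init is mandatory, see above
+ *     size_t const cSize = ZSTD_compressBlock(cctx, dst, dstCapacity, src, srcSize);   // srcSize <= ZSTD_getBlockSize(cctx)
+ *     if (cSize == 0) {
+ *         // store `src` uncompressed; on the decoding side, pass it to
+ *         // ZSTD_insertBlock(dctx, src, srcSize) so history stays in sync
+ *     }
+ */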
+
+
+#endif /* ZSTD_H_ZSTD_STATIC_LINKING_ONLY */
+
+#if defined (__cplusplus)
+}
+#endif
diff --git a/sys/contrib/openzfs/module/zstd/lib/zstd_errors.h b/sys/contrib/openzfs/module/zstd/lib/zstd_errors.h
new file mode 100644
index 000000000000..998398e7e57f
--- /dev/null
+++ b/sys/contrib/openzfs/module/zstd/lib/zstd_errors.h
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTD_ERRORS_H_398273423
+#define ZSTD_ERRORS_H_398273423
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/*===== dependency =====*/
+#include <stddef.h> /* size_t */
+
+
+/* ===== ZSTDERRORLIB_API : control library symbols visibility ===== */
+#ifndef ZSTDERRORLIB_VISIBILITY
+# if defined(__GNUC__) && (__GNUC__ >= 4)
+# define ZSTDERRORLIB_VISIBILITY __attribute__ ((visibility ("default")))
+# else
+# define ZSTDERRORLIB_VISIBILITY
+# endif
+#endif
+#if defined(ZSTD_DLL_EXPORT) && (ZSTD_DLL_EXPORT==1)
+# define ZSTDERRORLIB_API __declspec(dllexport) ZSTDERRORLIB_VISIBILITY
+#elif defined(ZSTD_DLL_IMPORT) && (ZSTD_DLL_IMPORT==1)
+#  define ZSTDERRORLIB_API __declspec(dllimport) ZSTDERRORLIB_VISIBILITY /* It isn't required but allows generating better code, saving a function pointer load from the IAT and an indirect jump. */
+#else
+# define ZSTDERRORLIB_API ZSTDERRORLIB_VISIBILITY
+#endif
+
+/*-*********************************************
+ * Error codes list
+ *-*********************************************
+ * Error codes _values_ are pinned down since v1.3.1 only.
+ * Therefore, don't rely on values if you may link to any version < v1.3.1.
+ *
+ * Only values < 100 are considered stable.
+ *
+ * note 1 : this API shall be used with static linking only.
+ * dynamic linking is not yet officially supported.
+ * note 2 : Prefer relying on the enum rather than on its value whenever possible
+ * This is the only supported way to use the error list < v1.3.1
+ * note 3 : ZSTD_isError() is always correct, whatever the library version.
+ **********************************************/
+typedef enum {
+ ZSTD_error_no_error = 0,
+ ZSTD_error_GENERIC = 1,
+ ZSTD_error_prefix_unknown = 10,
+ ZSTD_error_version_unsupported = 12,
+ ZSTD_error_frameParameter_unsupported = 14,
+ ZSTD_error_frameParameter_windowTooLarge = 16,
+ ZSTD_error_corruption_detected = 20,
+ ZSTD_error_checksum_wrong = 22,
+ ZSTD_error_dictionary_corrupted = 30,
+ ZSTD_error_dictionary_wrong = 32,
+ ZSTD_error_dictionaryCreation_failed = 34,
+ ZSTD_error_parameter_unsupported = 40,
+ ZSTD_error_parameter_outOfBound = 42,
+ ZSTD_error_tableLog_tooLarge = 44,
+ ZSTD_error_maxSymbolValue_tooLarge = 46,
+ ZSTD_error_maxSymbolValue_tooSmall = 48,
+ ZSTD_error_stage_wrong = 60,
+ ZSTD_error_init_missing = 62,
+ ZSTD_error_memory_allocation = 64,
+ ZSTD_error_workSpace_tooSmall= 66,
+ ZSTD_error_dstSize_tooSmall = 70,
+ ZSTD_error_srcSize_wrong = 72,
+ ZSTD_error_dstBuffer_null = 74,
+ /* following error codes are __NOT STABLE__, they can be removed or changed in future versions */
+ ZSTD_error_frameIndex_tooLarge = 100,
+ ZSTD_error_seekableIO = 102,
+ ZSTD_error_dstBuffer_wrong = 104,
+ ZSTD_error_maxCode = 120 /* never EVER use this value directly, it can change in future versions! Use ZSTD_isError() instead */
+} ZSTD_ErrorCode;
+
+/*! ZSTD_getErrorCode() :
+ convert a `size_t` function result into a `ZSTD_ErrorCode` enum type,
+ which can be used to compare with enum list published above */
+ZSTDERRORLIB_API ZSTD_ErrorCode ZSTD_getErrorCode(size_t functionResult);
+ZSTDERRORLIB_API const char* ZSTD_getErrorString(ZSTD_ErrorCode code); /**< Same as ZSTD_getErrorName, but using a `ZSTD_ErrorCode` enum argument */
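+/* Illustrative sketch (not part of this header) : translating a failed call's
+ * return value into the stable enum and a printable message. Assumes zstd.h
+ * is also included, for ZSTD_compress() and ZSTD_isError().
+ *
+ *     size_t const r = ZSTD_compress(dst, dstCapacity, src, srcSize, 3);
+ *     if (ZSTD_isError(r)) {
+ *         ZSTD_ErrorCode const ec = ZSTD_getErrorCode(r);
+ *         if (ec == ZSTD_error_dstSize_tooSmall) { }   // e.g. retry with ZSTD_compressBound(srcSize)
+ *         printf("zstd error: %s\n", ZSTD_getErrorString(ec));
+ *     }
+ */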
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* ZSTD_ERRORS_H_398273423 */
diff --git a/sys/contrib/openzfs/module/zstd/zfs_zstd.c b/sys/contrib/openzfs/module/zstd/zfs_zstd.c
new file mode 100644
index 000000000000..69ebf252d1ba
--- /dev/null
+++ b/sys/contrib/openzfs/module/zstd/zfs_zstd.c
@@ -0,0 +1,780 @@
+/*
+ * BSD 3-Clause New License (https://spdx.org/licenses/BSD-3-Clause.html)
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * Copyright (c) 2016-2018, Klara Inc.
+ * Copyright (c) 2016-2018, Allan Jude
+ * Copyright (c) 2018-2020, Sebastian Gottschall
+ * Copyright (c) 2019-2020, Michael Niewöhner
+ * Copyright (c) 2020, The FreeBSD Foundation [1]
+ *
+ * [1] Portions of this software were developed by Allan Jude
+ * under sponsorship from the FreeBSD Foundation.
+ */
+
+#include <sys/param.h>
+#include <sys/sysmacros.h>
+#include <sys/zfs_context.h>
+#include <sys/zio_compress.h>
+#include <sys/spa.h>
+#include <sys/zstd/zstd.h>
+
+#define ZSTD_STATIC_LINKING_ONLY
+#include "lib/zstd.h"
+#include "lib/zstd_errors.h"
+
+kstat_t *zstd_ksp = NULL;
+
+typedef struct zstd_stats {
+ kstat_named_t zstd_stat_alloc_fail;
+ kstat_named_t zstd_stat_alloc_fallback;
+ kstat_named_t zstd_stat_com_alloc_fail;
+ kstat_named_t zstd_stat_dec_alloc_fail;
+ kstat_named_t zstd_stat_com_inval;
+ kstat_named_t zstd_stat_dec_inval;
+ kstat_named_t zstd_stat_dec_header_inval;
+ kstat_named_t zstd_stat_com_fail;
+ kstat_named_t zstd_stat_dec_fail;
+ kstat_named_t zstd_stat_buffers;
+ kstat_named_t zstd_stat_size;
+} zstd_stats_t;
+
+static zstd_stats_t zstd_stats = {
+ { "alloc_fail", KSTAT_DATA_UINT64 },
+ { "alloc_fallback", KSTAT_DATA_UINT64 },
+ { "compress_alloc_fail", KSTAT_DATA_UINT64 },
+ { "decompress_alloc_fail", KSTAT_DATA_UINT64 },
+ { "compress_level_invalid", KSTAT_DATA_UINT64 },
+ { "decompress_level_invalid", KSTAT_DATA_UINT64 },
+ { "decompress_header_invalid", KSTAT_DATA_UINT64 },
+ { "compress_failed", KSTAT_DATA_UINT64 },
+ { "decompress_failed", KSTAT_DATA_UINT64 },
+ { "buffers", KSTAT_DATA_UINT64 },
+ { "size", KSTAT_DATA_UINT64 },
+};
+
+/* Enums describing the allocator type specified by kmem_type in zstd_kmem */
+enum zstd_kmem_type {
+ ZSTD_KMEM_UNKNOWN = 0,
+ /* Allocation type using kmem_vmalloc */
+ ZSTD_KMEM_DEFAULT,
+ /* Pool based allocation using mempool_alloc */
+ ZSTD_KMEM_POOL,
+ /* Reserved fallback memory for decompression only */
+ ZSTD_KMEM_DCTX,
+ ZSTD_KMEM_COUNT,
+};
+
+/* Structure for pooled memory objects */
+struct zstd_pool {
+ void *mem;
+ size_t size;
+ kmutex_t barrier;
+ hrtime_t timeout;
+};
+
+/* Global structure for handling memory allocations */
+struct zstd_kmem {
+ enum zstd_kmem_type kmem_type;
+ size_t kmem_size;
+ struct zstd_pool *pool;
+};
+
+/* Fallback memory structure used for decompression only if memory runs out */
+struct zstd_fallback_mem {
+ size_t mem_size;
+ void *mem;
+ kmutex_t barrier;
+};
+
+struct zstd_levelmap {
+ int16_t zstd_level;
+ enum zio_zstd_levels level;
+};
+
+/*
+ * ZSTD memory handlers
+ *
+ * For decompression we use a different handler which also provides fallback
+ * memory allocation in case memory runs out.
+ *
+ * The ZSTD handlers were split up for the most simplified implementation.
+ */
+static void *zstd_alloc(void *opaque, size_t size);
+static void *zstd_dctx_alloc(void *opaque, size_t size);
+static void zstd_free(void *opaque, void *ptr);
+
+/* Compression memory handler */
+static const ZSTD_customMem zstd_malloc = {
+ zstd_alloc,
+ zstd_free,
+ NULL,
+};
+
+/* Decompression memory handler */
+static const ZSTD_customMem zstd_dctx_malloc = {
+ zstd_dctx_alloc,
+ zstd_free,
+ NULL,
+};
+
+/* Level map for converting ZFS internal levels to ZSTD levels and vice versa */
+static struct zstd_levelmap zstd_levels[] = {
+ {ZIO_ZSTD_LEVEL_1, ZIO_ZSTD_LEVEL_1},
+ {ZIO_ZSTD_LEVEL_2, ZIO_ZSTD_LEVEL_2},
+ {ZIO_ZSTD_LEVEL_3, ZIO_ZSTD_LEVEL_3},
+ {ZIO_ZSTD_LEVEL_4, ZIO_ZSTD_LEVEL_4},
+ {ZIO_ZSTD_LEVEL_5, ZIO_ZSTD_LEVEL_5},
+ {ZIO_ZSTD_LEVEL_6, ZIO_ZSTD_LEVEL_6},
+ {ZIO_ZSTD_LEVEL_7, ZIO_ZSTD_LEVEL_7},
+ {ZIO_ZSTD_LEVEL_8, ZIO_ZSTD_LEVEL_8},
+ {ZIO_ZSTD_LEVEL_9, ZIO_ZSTD_LEVEL_9},
+ {ZIO_ZSTD_LEVEL_10, ZIO_ZSTD_LEVEL_10},
+ {ZIO_ZSTD_LEVEL_11, ZIO_ZSTD_LEVEL_11},
+ {ZIO_ZSTD_LEVEL_12, ZIO_ZSTD_LEVEL_12},
+ {ZIO_ZSTD_LEVEL_13, ZIO_ZSTD_LEVEL_13},
+ {ZIO_ZSTD_LEVEL_14, ZIO_ZSTD_LEVEL_14},
+ {ZIO_ZSTD_LEVEL_15, ZIO_ZSTD_LEVEL_15},
+ {ZIO_ZSTD_LEVEL_16, ZIO_ZSTD_LEVEL_16},
+ {ZIO_ZSTD_LEVEL_17, ZIO_ZSTD_LEVEL_17},
+ {ZIO_ZSTD_LEVEL_18, ZIO_ZSTD_LEVEL_18},
+ {ZIO_ZSTD_LEVEL_19, ZIO_ZSTD_LEVEL_19},
+ {-1, ZIO_ZSTD_LEVEL_FAST_1},
+ {-2, ZIO_ZSTD_LEVEL_FAST_2},
+ {-3, ZIO_ZSTD_LEVEL_FAST_3},
+ {-4, ZIO_ZSTD_LEVEL_FAST_4},
+ {-5, ZIO_ZSTD_LEVEL_FAST_5},
+ {-6, ZIO_ZSTD_LEVEL_FAST_6},
+ {-7, ZIO_ZSTD_LEVEL_FAST_7},
+ {-8, ZIO_ZSTD_LEVEL_FAST_8},
+ {-9, ZIO_ZSTD_LEVEL_FAST_9},
+ {-10, ZIO_ZSTD_LEVEL_FAST_10},
+ {-20, ZIO_ZSTD_LEVEL_FAST_20},
+ {-30, ZIO_ZSTD_LEVEL_FAST_30},
+ {-40, ZIO_ZSTD_LEVEL_FAST_40},
+ {-50, ZIO_ZSTD_LEVEL_FAST_50},
+ {-60, ZIO_ZSTD_LEVEL_FAST_60},
+ {-70, ZIO_ZSTD_LEVEL_FAST_70},
+ {-80, ZIO_ZSTD_LEVEL_FAST_80},
+ {-90, ZIO_ZSTD_LEVEL_FAST_90},
+ {-100, ZIO_ZSTD_LEVEL_FAST_100},
+ {-500, ZIO_ZSTD_LEVEL_FAST_500},
+ {-1000, ZIO_ZSTD_LEVEL_FAST_1000},
+};
+
+/*
+ * This variable represents the maximum number of pool slots, based on the
+ * number of CPUs plus some headroom. We default to cpu count * 4; see zstd_init().
+ */
+static int pool_count = 16;
+
+#define ZSTD_POOL_MAX pool_count
+#define ZSTD_POOL_TIMEOUT 60 * 2
+
+static struct zstd_fallback_mem zstd_dctx_fallback;
+static struct zstd_pool *zstd_mempool_cctx;
+static struct zstd_pool *zstd_mempool_dctx;
+
+
+static void
+zstd_mempool_reap(struct zstd_pool *zstd_mempool)
+{
+ struct zstd_pool *pool;
+
+ if (!zstd_mempool || !ZSTDSTAT(zstd_stat_buffers)) {
+ return;
+ }
+
+ /* free obsolete slots */
+ for (int i = 0; i < ZSTD_POOL_MAX; i++) {
+ pool = &zstd_mempool[i];
+ if (pool->mem && mutex_tryenter(&pool->barrier)) {
+ /* Free memory if unused object older than 2 minutes */
+ if (pool->mem && gethrestime_sec() > pool->timeout) {
+ vmem_free(pool->mem, pool->size);
+ ZSTDSTAT_SUB(zstd_stat_buffers, 1);
+ ZSTDSTAT_SUB(zstd_stat_size, pool->size);
+ pool->mem = NULL;
+ pool->size = 0;
+ pool->timeout = 0;
+ }
+ mutex_exit(&pool->barrier);
+ }
+ }
+}
+
+/*
+ * Try to get a cached allocated buffer from memory pool or allocate a new one
+ * if necessary. If an object is older than 2 minutes and does not fit the
+ * requested size, it will be released and a new cached entry will be allocated.
+ * If other pooled objects are detected without being used for 2 minutes, they
+ * will be released, too.
+ *
+ * The concept is that high frequency memory allocations of bigger objects are
+ * expensive. So if a lot of work is going on, allocations will be kept for a
+ * while and can be reused in that time frame.
+ *
+ * The scheduled release will be updated every time an object is reused.
+ */
+
+static void *
+zstd_mempool_alloc(struct zstd_pool *zstd_mempool, size_t size)
+{
+ struct zstd_pool *pool;
+ struct zstd_kmem *mem = NULL;
+
+ if (!zstd_mempool) {
+ return (NULL);
+ }
+
+ /* Seek for preallocated memory slot and free obsolete slots */
+ for (int i = 0; i < ZSTD_POOL_MAX; i++) {
+ pool = &zstd_mempool[i];
+ /*
+		 * This lock is simply a marker for a pool object being in use.
+		 * If it's already held, it will be skipped.
+ *
+ * We need to create it before checking it to avoid race
+ * conditions caused by running in a threaded context.
+ *
+ * The lock is later released by zstd_mempool_free.
+ */
+ if (mutex_tryenter(&pool->barrier)) {
+ /*
+			 * Check if the object fits the size; if so, we take it and
+ * update the timestamp.
+ */
+ if (pool->mem && size <= pool->size) {
+ pool->timeout = gethrestime_sec() +
+ ZSTD_POOL_TIMEOUT;
+ mem = pool->mem;
+ return (mem);
+ }
+ mutex_exit(&pool->barrier);
+ }
+ }
+
+ /*
+ * If no preallocated slot was found, try to fill in a new one.
+ *
+ * We run a similar algorithm twice here to avoid pool fragmentation.
+ * The first one may generate holes in the list if objects get released.
+ * We always make sure that these holes get filled instead of adding new
+ * allocations constantly at the end.
+ */
+ for (int i = 0; i < ZSTD_POOL_MAX; i++) {
+ pool = &zstd_mempool[i];
+ if (mutex_tryenter(&pool->barrier)) {
+ /* Object is free, try to allocate new one */
+ if (!pool->mem) {
+ mem = vmem_alloc(size, KM_SLEEP);
+ if (mem) {
+ ZSTDSTAT_ADD(zstd_stat_buffers, 1);
+ ZSTDSTAT_ADD(zstd_stat_size, size);
+ pool->mem = mem;
+ pool->size = size;
+ /* Keep track for later release */
+ mem->pool = pool;
+ mem->kmem_type = ZSTD_KMEM_POOL;
+ mem->kmem_size = size;
+ }
+ }
+
+ if (size <= pool->size) {
+ /* Update timestamp */
+ pool->timeout = gethrestime_sec() +
+ ZSTD_POOL_TIMEOUT;
+
+ return (pool->mem);
+ }
+
+ mutex_exit(&pool->barrier);
+ }
+ }
+
+ /*
+ * If the pool is full or the allocation failed, try lazy allocation
+ * instead.
+ */
+ if (!mem) {
+ mem = vmem_alloc(size, KM_NOSLEEP);
+ if (mem) {
+ mem->pool = NULL;
+ mem->kmem_type = ZSTD_KMEM_DEFAULT;
+ mem->kmem_size = size;
+ }
+ }
+
+ return (mem);
+}
+
+/* Mark object as released by releasing the barrier mutex */
+static void
+zstd_mempool_free(struct zstd_kmem *z)
+{
+ mutex_exit(&z->pool->barrier);
+}
+
+/* Convert ZFS internal enum to ZSTD level */
+static int
+zstd_enum_to_level(enum zio_zstd_levels level, int16_t *zstd_level)
+{
+ if (level > 0 && level <= ZIO_ZSTD_LEVEL_19) {
+ *zstd_level = zstd_levels[level - 1].zstd_level;
+ return (0);
+ }
+ if (level >= ZIO_ZSTD_LEVEL_FAST_1 &&
+ level <= ZIO_ZSTD_LEVEL_FAST_1000) {
+ *zstd_level = zstd_levels[level - ZIO_ZSTD_LEVEL_FAST_1
+ + ZIO_ZSTD_LEVEL_19].zstd_level;
+ return (0);
+ }
+
+ /* Invalid/unknown zfs compression enum - this should never happen. */
+ return (1);
+}
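+
+/*
+ * Example (illustration only, assuming ZIO_ZSTD_LEVEL_1..ZIO_ZSTD_LEVEL_19
+ * are numbered 1..19, which the `level - 1` indexing above relies on):
+ * zstd_enum_to_level(ZIO_ZSTD_LEVEL_7, &l) sets l = 7 and returns 0,
+ * zstd_enum_to_level(ZIO_ZSTD_LEVEL_FAST_100, &l) sets l = -100 and returns 0,
+ * and any other enum value returns 1 and leaves *zstd_level untouched.
+ */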
+
+/* Compress block using zstd */
+size_t
+zfs_zstd_compress(void *s_start, void *d_start, size_t s_len, size_t d_len,
+ int level)
+{
+ size_t c_len;
+ int16_t zstd_level;
+ zfs_zstdhdr_t *hdr;
+ ZSTD_CCtx *cctx;
+
+ hdr = (zfs_zstdhdr_t *)d_start;
+
+ /* Skip compression if the specified level is invalid */
+ if (zstd_enum_to_level(level, &zstd_level)) {
+ ZSTDSTAT_BUMP(zstd_stat_com_inval);
+ return (s_len);
+ }
+
+ ASSERT3U(d_len, >=, sizeof (*hdr));
+ ASSERT3U(d_len, <=, s_len);
+ ASSERT3U(zstd_level, !=, 0);
+
+ cctx = ZSTD_createCCtx_advanced(zstd_malloc);
+
+ /*
+ * Out of kernel memory, gently fall through - this will disable
+ * compression in zio_compress_data
+ */
+ if (!cctx) {
+ ZSTDSTAT_BUMP(zstd_stat_com_alloc_fail);
+ return (s_len);
+ }
+
+ /* Set the compression level */
+ ZSTD_CCtx_setParameter(cctx, ZSTD_c_compressionLevel, zstd_level);
+
+ /* Use the "magicless" zstd header which saves us 4 header bytes */
+ ZSTD_CCtx_setParameter(cctx, ZSTD_c_format, ZSTD_f_zstd1_magicless);
+
+ /*
+ * Disable redundant checksum calculation and content size storage since
+ * this is already done by ZFS itself.
+ */
+ ZSTD_CCtx_setParameter(cctx, ZSTD_c_checksumFlag, 0);
+ ZSTD_CCtx_setParameter(cctx, ZSTD_c_contentSizeFlag, 0);
+
+ c_len = ZSTD_compress2(cctx,
+ hdr->data,
+ d_len - sizeof (*hdr),
+ s_start, s_len);
+
+ ZSTD_freeCCtx(cctx);
+
+ /* Error in the compression routine, disable compression. */
+ if (ZSTD_isError(c_len)) {
+ /*
+		 * If we are aborting the compression because the savings are
+ * too small, that is not a failure. Everything else is a
+ * failure, so increment the compression failure counter.
+ */
+ if (ZSTD_getErrorCode(c_len) != ZSTD_error_dstSize_tooSmall) {
+ ZSTDSTAT_BUMP(zstd_stat_com_fail);
+ }
+ return (s_len);
+ }
+
+ /*
+ * Encode the compressed buffer size at the start. We'll need this in
+ * decompression to counter the effects of padding which might be added
+ * to the compressed buffer and which, if unhandled, would confuse the
+ * hell out of our decompression function.
+ */
+ hdr->c_len = BE_32(c_len);
+
+ /*
+ * Check version for overflow.
+ * The limit of 24 bits must not be exceeded. This allows a maximum
+	 * version 1677.72.15, which we don't expect to ever be reached.
+ */
+ ASSERT3U(ZSTD_VERSION_NUMBER, <=, 0xFFFFFF);
+
+ /*
+ * Encode the compression level as well. We may need to know the
+ * original compression level if compressed_arc is disabled, to match
+ * the compression settings to write this block to the L2ARC.
+ *
+ * Encode the actual level, so if the enum changes in the future, we
+ * will be compatible.
+ *
+ * The upper 24 bits store the ZSTD version to be able to provide
+ * future compatibility, since new versions might enhance the
+ * compression algorithm in a way, where the compressed data will
+ * change.
+ *
+ * As soon as such incompatibility occurs, handling code needs to be
+ * added, differentiating between the versions.
+ */
+ hdr->version = ZSTD_VERSION_NUMBER;
+ hdr->level = level;
+ hdr->raw_version_level = BE_32(hdr->raw_version_level);
+
+ return (c_len + sizeof (*hdr));
+}
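+
+/*
+ * Illustrative sketch (hypothetical caller, not part of this file): a return
+ * value equal to s_len means compression was skipped or failed and the block
+ * should be stored uncompressed; anything smaller is header plus payload.
+ *
+ *	size_t psize = zfs_zstd_compress(src, dst, size, size, ZIO_ZSTD_LEVEL_3);
+ *	boolean_t stored_compressed = (psize < size);
+ */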
+
+/* Decompress block using zstd and return its stored level */
+int
+zfs_zstd_decompress_level(void *s_start, void *d_start, size_t s_len,
+ size_t d_len, uint8_t *level)
+{
+ ZSTD_DCtx *dctx;
+ size_t result;
+ int16_t zstd_level;
+ uint32_t c_len;
+ const zfs_zstdhdr_t *hdr;
+ zfs_zstdhdr_t hdr_copy;
+
+ hdr = (const zfs_zstdhdr_t *)s_start;
+ c_len = BE_32(hdr->c_len);
+
+ /*
+ * Make a copy instead of directly converting the header, since we must
+ * not modify the original data that may be used again later.
+ */
+ hdr_copy.raw_version_level = BE_32(hdr->raw_version_level);
+
+ /*
+ * NOTE: We ignore the ZSTD version for now. As soon as any
+	 * incompatibility occurs, it has to be handled accordingly.
+ * The version can be accessed via `hdr_copy.version`.
+ */
+
+ /*
+ * Convert and check the level
+ * An invalid level is a strong indicator for data corruption! In such
+ * case return an error so the upper layers can try to fix it.
+ */
+ if (zstd_enum_to_level(hdr_copy.level, &zstd_level)) {
+ ZSTDSTAT_BUMP(zstd_stat_dec_inval);
+ return (1);
+ }
+
+ ASSERT3U(d_len, >=, s_len);
+ ASSERT3U(hdr_copy.level, !=, ZIO_COMPLEVEL_INHERIT);
+
+ /* Invalid compressed buffer size encoded at start */
+ if (c_len + sizeof (*hdr) > s_len) {
+ ZSTDSTAT_BUMP(zstd_stat_dec_header_inval);
+ return (1);
+ }
+
+ dctx = ZSTD_createDCtx_advanced(zstd_dctx_malloc);
+ if (!dctx) {
+ ZSTDSTAT_BUMP(zstd_stat_dec_alloc_fail);
+ return (1);
+ }
+
+ /* Set header type to "magicless" */
+ ZSTD_DCtx_setParameter(dctx, ZSTD_d_format, ZSTD_f_zstd1_magicless);
+
+ /* Decompress the data and release the context */
+ result = ZSTD_decompressDCtx(dctx, d_start, d_len, hdr->data, c_len);
+ ZSTD_freeDCtx(dctx);
+
+ /*
+	 * Return 0 on success (the decompression succeeded) and non-zero
+	 * on failure (ZSTD_isError() reported an error).
+ */
+ if (ZSTD_isError(result)) {
+ ZSTDSTAT_BUMP(zstd_stat_dec_fail);
+ return (1);
+ }
+
+ if (level) {
+ *level = hdr_copy.level;
+ }
+
+ return (0);
+}
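+
+/*
+ * Illustrative sketch (hypothetical caller, not part of this file): recovering
+ * the level a block was compressed with, e.g. to match its settings when
+ * rewriting it:
+ *
+ *	uint8_t level;
+ *	if (zfs_zstd_decompress_level(cbuf, dbuf, c_size, d_size, &level) == 0)
+ *		; /* `level` now holds the original zio_zstd_levels value */
+ */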
+
+/* Decompress datablock using zstd */
+int
+zfs_zstd_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len,
+ int level __maybe_unused)
+{
+
+ return (zfs_zstd_decompress_level(s_start, d_start, s_len, d_len,
+ NULL));
+}
+
+/* Allocator for zstd compression context using mempool_allocator */
+static void *
+zstd_alloc(void *opaque __maybe_unused, size_t size)
+{
+ size_t nbytes = sizeof (struct zstd_kmem) + size;
+ struct zstd_kmem *z = NULL;
+
+ z = (struct zstd_kmem *)zstd_mempool_alloc(zstd_mempool_cctx, nbytes);
+
+ if (!z) {
+ ZSTDSTAT_BUMP(zstd_stat_alloc_fail);
+ return (NULL);
+ }
+
+ return ((void*)z + (sizeof (struct zstd_kmem)));
+}
+
+/*
+ * Allocator for zstd decompression context using mempool_allocator with
+ * fallback to reserved memory if allocation fails
+ */
+static void *
+zstd_dctx_alloc(void *opaque __maybe_unused, size_t size)
+{
+ size_t nbytes = sizeof (struct zstd_kmem) + size;
+ struct zstd_kmem *z = NULL;
+ enum zstd_kmem_type type = ZSTD_KMEM_DEFAULT;
+
+ z = (struct zstd_kmem *)zstd_mempool_alloc(zstd_mempool_dctx, nbytes);
+ if (!z) {
+ /* Try harder, decompression shall not fail */
+ z = vmem_alloc(nbytes, KM_SLEEP);
+ if (z) {
+ z->pool = NULL;
+ }
+ ZSTDSTAT_BUMP(zstd_stat_alloc_fail);
+ } else {
+ return ((void*)z + (sizeof (struct zstd_kmem)));
+ }
+
+ /* Fallback if everything fails */
+ if (!z) {
+ /*
+		 * Barrier since we can only handle it in a single thread. All
+ * other following threads need to wait here until decompression
+ * is completed. zstd_free will release this barrier later.
+ */
+ mutex_enter(&zstd_dctx_fallback.barrier);
+
+ z = zstd_dctx_fallback.mem;
+ type = ZSTD_KMEM_DCTX;
+ ZSTDSTAT_BUMP(zstd_stat_alloc_fallback);
+ }
+
+ /* Allocation should always be successful */
+ if (!z) {
+ return (NULL);
+ }
+
+ z->kmem_type = type;
+ z->kmem_size = nbytes;
+
+ return ((void*)z + (sizeof (struct zstd_kmem)));
+}
+
+/* Free allocated memory by its specific type */
+static void
+zstd_free(void *opaque __maybe_unused, void *ptr)
+{
+ struct zstd_kmem *z = (ptr - sizeof (struct zstd_kmem));
+ enum zstd_kmem_type type;
+
+ ASSERT3U(z->kmem_type, <, ZSTD_KMEM_COUNT);
+ ASSERT3U(z->kmem_type, >, ZSTD_KMEM_UNKNOWN);
+
+ type = z->kmem_type;
+ switch (type) {
+ case ZSTD_KMEM_DEFAULT:
+ vmem_free(z, z->kmem_size);
+ break;
+ case ZSTD_KMEM_POOL:
+ zstd_mempool_free(z);
+ break;
+ case ZSTD_KMEM_DCTX:
+ mutex_exit(&zstd_dctx_fallback.barrier);
+ break;
+ default:
+ break;
+ }
+}
+
+/* Allocate fallback memory to ensure safe decompression */
+static void __init
+create_fallback_mem(struct zstd_fallback_mem *mem, size_t size)
+{
+ mem->mem_size = size;
+ mem->mem = vmem_zalloc(mem->mem_size, KM_SLEEP);
+ mutex_init(&mem->barrier, NULL, MUTEX_DEFAULT, NULL);
+}
+
+/* Initialize memory pool barrier mutexes */
+static void __init
+zstd_mempool_init(void)
+{
+ zstd_mempool_cctx = (struct zstd_pool *)
+ kmem_zalloc(ZSTD_POOL_MAX * sizeof (struct zstd_pool), KM_SLEEP);
+ zstd_mempool_dctx = (struct zstd_pool *)
+ kmem_zalloc(ZSTD_POOL_MAX * sizeof (struct zstd_pool), KM_SLEEP);
+
+ for (int i = 0; i < ZSTD_POOL_MAX; i++) {
+ mutex_init(&zstd_mempool_cctx[i].barrier, NULL,
+ MUTEX_DEFAULT, NULL);
+ mutex_init(&zstd_mempool_dctx[i].barrier, NULL,
+ MUTEX_DEFAULT, NULL);
+ }
+}
+
+/* Initialize zstd-related memory handling */
+static int __init
+zstd_meminit(void)
+{
+ zstd_mempool_init();
+
+ /*
+ * Estimate the size of the fallback decompression context.
+ * The expected size on x64 with current ZSTD should be about 160 KB.
+ */
+ create_fallback_mem(&zstd_dctx_fallback,
+ P2ROUNDUP(ZSTD_estimateDCtxSize() + sizeof (struct zstd_kmem),
+ PAGESIZE));
+
+ return (0);
+}
+
+/* Release object from pool and free memory */
+static void __exit
+release_pool(struct zstd_pool *pool)
+{
+ mutex_destroy(&pool->barrier);
+ vmem_free(pool->mem, pool->size);
+ pool->mem = NULL;
+ pool->size = 0;
+}
+
+/* Release memory pool objects */
+static void __exit
+zstd_mempool_deinit(void)
+{
+ for (int i = 0; i < ZSTD_POOL_MAX; i++) {
+ release_pool(&zstd_mempool_cctx[i]);
+ release_pool(&zstd_mempool_dctx[i]);
+ }
+
+ kmem_free(zstd_mempool_dctx, ZSTD_POOL_MAX * sizeof (struct zstd_pool));
+ kmem_free(zstd_mempool_cctx, ZSTD_POOL_MAX * sizeof (struct zstd_pool));
+ zstd_mempool_dctx = NULL;
+ zstd_mempool_cctx = NULL;
+}
+
+/* Release unused memory from the pools */
+
+void
+zfs_zstd_cache_reap_now(void)
+{
+
+ /*
+ * Short-circuit if there are no buffers to begin with.
+ */
+ if (ZSTDSTAT(zstd_stat_buffers) == 0)
+ return;
+
+ /*
+	 * zstd_mempool_reap() scans each pool and releases objects
+	 * that have sat unused past their timeout
+ */
+ zstd_mempool_reap(zstd_mempool_cctx);
+ zstd_mempool_reap(zstd_mempool_dctx);
+}
+
+extern int __init
+zstd_init(void)
+{
+ /* Set pool size by using maximum sane thread count * 4 */
+ pool_count = (boot_ncpus * 4);
+ zstd_meminit();
+
+ /* Initialize kstat */
+ zstd_ksp = kstat_create("zfs", 0, "zstd", "misc",
+ KSTAT_TYPE_NAMED, sizeof (zstd_stats) / sizeof (kstat_named_t),
+ KSTAT_FLAG_VIRTUAL);
+ if (zstd_ksp != NULL) {
+ zstd_ksp->ks_data = &zstd_stats;
+ kstat_install(zstd_ksp);
+ }
+
+ return (0);
+}
+
+extern void __exit
+zstd_fini(void)
+{
+ /* Deinitialize kstat */
+ if (zstd_ksp != NULL) {
+ kstat_delete(zstd_ksp);
+ zstd_ksp = NULL;
+ }
+
+ /* Release fallback memory */
+ vmem_free(zstd_dctx_fallback.mem, zstd_dctx_fallback.mem_size);
+ mutex_destroy(&zstd_dctx_fallback.barrier);
+
+ /* Deinit memory pool */
+ zstd_mempool_deinit();
+}
+
+#if defined(_KERNEL)
+module_init(zstd_init);
+module_exit(zstd_fini);
+
+ZFS_MODULE_DESCRIPTION("ZSTD Compression for ZFS");
+ZFS_MODULE_LICENSE("Dual BSD/GPL");
+ZFS_MODULE_VERSION(ZSTD_VERSION_STRING);
+
+EXPORT_SYMBOL(zfs_zstd_compress);
+EXPORT_SYMBOL(zfs_zstd_decompress_level);
+EXPORT_SYMBOL(zfs_zstd_decompress);
+EXPORT_SYMBOL(zfs_zstd_cache_reap_now);
+#endif
diff --git a/sys/contrib/openzfs/module/zstd/zstd-in.c b/sys/contrib/openzfs/module/zstd/zstd-in.c
new file mode 100644
index 000000000000..121f375e5515
--- /dev/null
+++ b/sys/contrib/openzfs/module/zstd/zstd-in.c
@@ -0,0 +1,68 @@
+/*
+ * BSD 3-Clause Clear License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
+ * Copyright (c) 2019-2020, Michael Niewöhner
+ */
+
+#define MEM_MODULE
+#define XXH_NAMESPACE ZSTD_
+#define XXH_PRIVATE_API
+#define XXH_INLINE_ALL
+#define ZSTD_LEGACY_SUPPORT 0
+#define ZSTD_LIB_DICTBUILDER 0
+#define ZSTD_LIB_DEPRECATED 0
+#define ZSTD_NOBENCH
+
+#include "common/debug.c"
+#include "common/entropy_common.c"
+#include "common/error_private.c"
+#include "common/fse_decompress.c"
+#include "common/pool.c"
+#include "common/zstd_common.c"
+
+#include "compress/fse_compress.c"
+#include "compress/hist.c"
+#include "compress/huf_compress.c"
+#include "compress/zstd_compress_literals.c"
+#include "compress/zstd_compress_sequences.c"
+#include "compress/zstd_compress_superblock.c"
+#include "compress/zstd_compress.c"
+#include "compress/zstd_double_fast.c"
+#include "compress/zstd_fast.c"
+#include "compress/zstd_lazy.c"
+#include "compress/zstd_ldm.c"
+#include "compress/zstd_opt.c"
+
+#include "decompress/huf_decompress.c"
+#include "decompress/zstd_ddict.c"
+#include "decompress/zstd_decompress.c"
+#include "decompress/zstd_decompress_block.c"